Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/.gitignore6
-rw-r--r--arch/x86/Kbuild30
-rw-r--r--arch/x86/Kconfig3257
-rw-r--r--arch/x86/Kconfig.assembler7
-rw-r--r--arch/x86/Kconfig.cpu376
-rw-r--r--arch/x86/Kconfig.cpufeatures201
-rw-r--r--arch/x86/Kconfig.debug368
-rw-r--r--arch/x86/Makefile358
-rw-r--r--arch/x86/Makefile.um63
-rw-r--r--arch/x86/Makefile_32.cpu25
-rw-r--r--arch/x86/boot/.gitignore7
-rw-r--r--arch/x86/boot/Makefile167
-rw-r--r--arch/x86/boot/a20.c14
-rw-r--r--arch/x86/boot/apm.c6
-rw-r--r--arch/x86/boot/bioscall.S15
-rw-r--r--arch/x86/boot/bitops.h19
-rw-r--r--arch/x86/boot/boot.h173
-rw-r--r--arch/x86/boot/cmdline.c22
-rw-r--r--arch/x86/boot/code16gcc.h15
-rw-r--r--arch/x86/boot/compressed/.gitignore1
-rw-r--r--arch/x86/boot/compressed/Makefile144
-rw-r--r--arch/x86/boot/compressed/acpi.c317
-rw-r--r--arch/x86/boot/compressed/cmdline.c32
-rw-r--r--arch/x86/boot/compressed/cpuflags.c9
-rw-r--r--arch/x86/boot/compressed/early_serial_console.c6
-rw-r--r--arch/x86/boot/compressed/efi.c236
-rw-r--r--arch/x86/boot/compressed/efi.h127
-rw-r--r--arch/x86/boot/compressed/error.c43
-rw-r--r--arch/x86/boot/compressed/error.h11
-rw-r--r--arch/x86/boot/compressed/head_32.S155
-rw-r--r--arch/x86/boot/compressed/head_64.S401
-rw-r--r--arch/x86/boot/compressed/ident_map_64.c393
-rw-r--r--arch/x86/boot/compressed/idt_64.c92
-rw-r--r--arch/x86/boot/compressed/idt_handlers_64.S78
-rw-r--r--arch/x86/boot/compressed/kaslr.c908
-rw-r--r--arch/x86/boot/compressed/kernel_info.S22
-rw-r--r--arch/x86/boot/compressed/mem.c86
-rw-r--r--arch/x86/boot/compressed/mem_encrypt.S324
-rw-r--r--arch/x86/boot/compressed/misc.c568
-rw-r--r--arch/x86/boot/compressed/misc.h258
-rw-r--r--arch/x86/boot/compressed/mkpiggy.c77
-rw-r--r--arch/x86/boot/compressed/pgtable_64.c200
-rw-r--r--arch/x86/boot/compressed/relocs.c651
-rw-r--r--arch/x86/boot/compressed/sbat.S7
-rw-r--r--arch/x86/boot/compressed/sev-handle-vc.c136
-rw-r--r--arch/x86/boot/compressed/sev.c512
-rw-r--r--arch/x86/boot/compressed/sev.h44
-rw-r--r--arch/x86/boot/compressed/string.c81
-rw-r--r--arch/x86/boot/compressed/tdcall.S3
-rw-r--r--arch/x86/boot/compressed/tdx-shared.c2
-rw-r--r--arch/x86/boot/compressed/tdx.c77
-rw-r--r--arch/x86/boot/compressed/tdx.h13
-rw-r--r--arch/x86/boot/compressed/vmlinux.lds.S77
-rw-r--r--arch/x86/boot/copy.S62
-rw-r--r--arch/x86/boot/cpu.c61
-rw-r--r--arch/x86/boot/cpucheck.c196
-rw-r--r--arch/x86/boot/cpuflags.c110
-rw-r--r--arch/x86/boot/cpuflags.h27
-rw-r--r--arch/x86/boot/ctype.h21
-rw-r--r--arch/x86/boot/early_serial_console.c154
-rw-r--r--arch/x86/boot/edd.c11
-rw-r--r--arch/x86/boot/genimage.sh275
-rw-r--r--arch/x86/boot/header.S445
-rwxr-xr-x[-rw-r--r--]arch/x86/boot/install.sh22
-rw-r--r--arch/x86/boot/io.h41
-rw-r--r--arch/x86/boot/main.c86
-rw-r--r--arch/x86/boot/mca.c38
-rw-r--r--arch/x86/boot/memory.c45
-rw-r--r--arch/x86/boot/mkcpustr.c10
-rw-r--r--arch/x86/boot/mtools.conf.in6
-rw-r--r--arch/x86/boot/pm.c11
-rw-r--r--arch/x86/boot/pmjump.S14
-rw-r--r--arch/x86/boot/printf.c13
-rw-r--r--arch/x86/boot/regs.c8
-rw-r--r--arch/x86/boot/setup.ld27
-rw-r--r--arch/x86/boot/startup/Makefile52
-rw-r--r--arch/x86/boot/startup/efi-mixed.S253
-rw-r--r--arch/x86/boot/startup/exports.h14
-rw-r--r--arch/x86/boot/startup/gdt_idt.c71
-rw-r--r--arch/x86/boot/startup/la57toggle.S111
-rw-r--r--arch/x86/boot/startup/map_kernel.c217
-rw-r--r--arch/x86/boot/startup/sev-shared.c762
-rw-r--r--arch/x86/boot/startup/sev-startup.c220
-rw-r--r--arch/x86/boot/startup/sme.c575
-rw-r--r--arch/x86/boot/string.c342
-rw-r--r--arch/x86/boot/string.h33
-rw-r--r--arch/x86/boot/tools/.gitignore1
-rw-r--r--arch/x86/boot/tools/build.c247
-rw-r--r--arch/x86/boot/tty.c43
-rw-r--r--arch/x86/boot/version.c9
-rw-r--r--arch/x86/boot/vesa.h7
-rw-r--r--arch/x86/boot/video-bios.c4
-rw-r--r--arch/x86/boot/video-mode.c10
-rw-r--r--arch/x86/boot/video-vesa.c22
-rw-r--r--arch/x86/boot/video-vga.c32
-rw-r--r--arch/x86/boot/video.c30
-rw-r--r--arch/x86/boot/video.h27
-rw-r--r--arch/x86/coco/Makefile9
-rw-r--r--arch/x86/coco/core.c249
-rw-r--r--arch/x86/coco/sev/Makefile10
-rw-r--r--arch/x86/coco/sev/core.c2431
-rw-r--r--arch/x86/coco/sev/noinstr.c182
-rw-r--r--arch/x86/coco/sev/vc-handle.c1080
-rw-r--r--arch/x86/coco/sev/vc-shared.c656
-rw-r--r--arch/x86/coco/tdx/Makefile3
-rw-r--r--arch/x86/coco/tdx/debug.c69
-rw-r--r--arch/x86/coco/tdx/tdcall.S63
-rw-r--r--arch/x86/coco/tdx/tdx-shared.c91
-rw-r--r--arch/x86/coco/tdx/tdx.c1196
-rw-r--r--arch/x86/configs/hardening.config17
-rw-r--r--arch/x86/configs/i386_defconfig2462
-rw-r--r--arch/x86/configs/tiny.config2
-rw-r--r--arch/x86/configs/x86_64_defconfig2431
-rw-r--r--arch/x86/configs/xen.config24
-rw-r--r--arch/x86/crypto/.gitignore2
-rw-r--r--arch/x86/crypto/Kconfig379
-rw-r--r--arch/x86/crypto/Makefile85
-rw-r--r--arch/x86/crypto/aegis128-aesni-asm.S602
-rw-r--r--arch/x86/crypto/aegis128-aesni-glue.c287
-rw-r--r--arch/x86/crypto/aes-ctr-avx-x86_64.S571
-rw-r--r--arch/x86/crypto/aes-gcm-aesni-x86_64.S1128
-rw-r--r--arch/x86/crypto/aes-gcm-vaes-avx2.S1146
-rw-r--r--arch/x86/crypto/aes-gcm-vaes-avx512.S1163
-rw-r--r--arch/x86/crypto/aes-i586-asm_32.S367
-rw-r--r--arch/x86/crypto/aes-x86_64-asm_64.S188
-rw-r--r--arch/x86/crypto/aes-xts-avx-x86_64.S905
-rw-r--r--arch/x86/crypto/aes_glue.c69
-rw-r--r--arch/x86/crypto/aesni-intel_asm.S1296
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c2071
-rw-r--r--arch/x86/crypto/aria-aesni-avx-asm_64.S1352
-rw-r--r--arch/x86/crypto/aria-aesni-avx2-asm_64.S1433
-rw-r--r--arch/x86/crypto/aria-avx.h62
-rw-r--r--arch/x86/crypto/aria-gfni-avx512-asm_64.S971
-rw-r--r--arch/x86/crypto/aria_aesni_avx2_glue.c245
-rw-r--r--arch/x86/crypto/aria_aesni_avx_glue.c225
-rw-r--r--arch/x86/crypto/aria_gfni_avx512_glue.c242
-rw-r--r--arch/x86/crypto/blowfish-x86_64-asm_64.S354
-rw-r--r--arch/x86/crypto/blowfish_glue.c196
-rw-r--r--arch/x86/crypto/camellia-aesni-avx-asm_64.S990
-rw-r--r--arch/x86/crypto/camellia-aesni-avx2-asm_64.S1048
-rw-r--r--arch/x86/crypto/camellia-x86_64-asm_64.S502
-rw-r--r--arch/x86/crypto/camellia.h67
-rw-r--r--arch/x86/crypto/camellia_aesni_avx2_glue.c131
-rw-r--r--arch/x86/crypto/camellia_aesni_avx_glue.c131
-rw-r--r--arch/x86/crypto/camellia_glue.c1417
-rw-r--r--arch/x86/crypto/cast5-avx-x86_64-asm_64.S489
-rw-r--r--arch/x86/crypto/cast5_avx_glue.c117
-rw-r--r--arch/x86/crypto/cast6-avx-x86_64-asm_64.S416
-rw-r--r--arch/x86/crypto/cast6_avx_glue.c116
-rw-r--r--arch/x86/crypto/crc32c-intel.c198
-rw-r--r--arch/x86/crypto/des3_ede-asm_64.S831
-rw-r--r--arch/x86/crypto/des3_ede_glue.c391
-rw-r--r--arch/x86/crypto/ecb_cbc_helpers.h87
-rw-r--r--arch/x86/crypto/fpu.c166
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_asm.S133
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_glue.c163
-rw-r--r--arch/x86/crypto/glue_helper-asm-avx.S36
-rw-r--r--arch/x86/crypto/glue_helper-asm-avx2.S39
-rw-r--r--arch/x86/crypto/nh-avx2-x86_64.S159
-rw-r--r--arch/x86/crypto/nh-sse2-x86_64.S124
-rw-r--r--arch/x86/crypto/nhpoly1305-avx2-glue.c81
-rw-r--r--arch/x86/crypto/nhpoly1305-sse2-glue.c80
-rw-r--r--arch/x86/crypto/salsa20-i586-asm_32.S1114
-rw-r--r--arch/x86/crypto/salsa20-x86_64-asm_64.S920
-rw-r--r--arch/x86/crypto/salsa20_glue.c129
-rw-r--r--arch/x86/crypto/serpent-avx-x86_64-asm_64.S712
-rw-r--r--arch/x86/crypto/serpent-avx.h21
-rw-r--r--arch/x86/crypto/serpent-avx2-asm_64.S724
-rw-r--r--arch/x86/crypto/serpent-sse2-i586-asm_32.S616
-rw-r--r--arch/x86/crypto/serpent-sse2-x86_64-asm_64.S739
-rw-r--r--arch/x86/crypto/serpent-sse2.h60
-rw-r--r--arch/x86/crypto/serpent_avx2_glue.c123
-rw-r--r--arch/x86/crypto/serpent_avx_glue.c125
-rw-r--r--arch/x86/crypto/serpent_sse2_glue.c124
-rw-r--r--arch/x86/crypto/sm3-avx-asm_64.S517
-rw-r--r--arch/x86/crypto/sm3_avx_glue.c100
-rw-r--r--arch/x86/crypto/sm4-aesni-avx-asm_64.S536
-rw-r--r--arch/x86/crypto/sm4-aesni-avx2-asm_64.S441
-rw-r--r--arch/x86/crypto/sm4-avx.h20
-rw-r--r--arch/x86/crypto/sm4_aesni_avx2_glue.c134
-rw-r--r--arch/x86/crypto/sm4_aesni_avx_glue.c349
-rw-r--r--arch/x86/crypto/twofish-avx-x86_64-asm_64.S374
-rw-r--r--arch/x86/crypto/twofish-i586-asm_32.S46
-rw-r--r--arch/x86/crypto/twofish-x86_64-asm_64-3way.S306
-rw-r--r--arch/x86/crypto/twofish-x86_64-asm_64.S53
-rw-r--r--arch/x86/crypto/twofish.h21
-rw-r--r--arch/x86/crypto/twofish_avx_glue.c126
-rw-r--r--arch/x86/crypto/twofish_glue.c29
-rw-r--r--arch/x86/crypto/twofish_glue_3way.c170
-rw-r--r--arch/x86/entry/Makefile26
-rw-r--r--arch/x86/entry/calling.h489
-rw-r--r--arch/x86/entry/entry.S77
-rw-r--r--arch/x86/entry/entry_32.S1226
-rw-r--r--arch/x86/entry/entry_64.S1571
-rw-r--r--arch/x86/entry/entry_64_compat.S299
-rw-r--r--arch/x86/entry/entry_64_fred.S151
-rw-r--r--arch/x86/entry/entry_fred.c296
-rw-r--r--arch/x86/entry/syscall_32.c371
-rw-r--r--arch/x86/entry/syscall_64.c141
-rw-r--r--arch/x86/entry/syscalls/Makefile78
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl478
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl442
-rw-r--r--arch/x86/entry/thunk.S15
-rw-r--r--arch/x86/entry/vdso/.gitignore (renamed from arch/x86/vdso/.gitignore)6
-rw-r--r--arch/x86/entry/vdso/Makefile162
-rw-r--r--arch/x86/entry/vdso/extable.c46
-rw-r--r--arch/x86/entry/vdso/extable.h28
-rw-r--r--arch/x86/entry/vdso/vclock_gettime.c77
-rw-r--r--arch/x86/entry/vdso/vdso-layout.lds.S101
-rw-r--r--arch/x86/entry/vdso/vdso-note.S15
-rw-r--r--arch/x86/entry/vdso/vdso.lds.S37
-rw-r--r--arch/x86/entry/vdso/vdso2c.c233
-rw-r--r--arch/x86/entry/vdso/vdso2c.h208
-rw-r--r--arch/x86/entry/vdso/vdso32-setup.c86
-rw-r--r--arch/x86/entry/vdso/vdso32/.gitignore2
-rw-r--r--arch/x86/entry/vdso/vdso32/fake_32bit_build.h25
-rw-r--r--arch/x86/entry/vdso/vdso32/note.S18
-rw-r--r--arch/x86/entry/vdso/vdso32/sigreturn.S (renamed from arch/x86/vdso/vdso32/sigreturn.S)12
-rw-r--r--arch/x86/entry/vdso/vdso32/system_call.S85
-rw-r--r--arch/x86/entry/vdso/vdso32/vclock_gettime.c4
-rw-r--r--arch/x86/entry/vdso/vdso32/vdso32.lds.S (renamed from arch/x86/vdso/vdso32/vdso32.lds.S)28
-rw-r--r--arch/x86/entry/vdso/vdso32/vgetcpu.c3
-rw-r--r--arch/x86/entry/vdso/vdsox32.lds.S27
-rw-r--r--arch/x86/entry/vdso/vgetcpu.c22
-rw-r--r--arch/x86/entry/vdso/vgetrandom-chacha.S178
-rw-r--r--arch/x86/entry/vdso/vgetrandom.c15
-rw-r--r--arch/x86/entry/vdso/vma.c285
-rw-r--r--arch/x86/entry/vdso/vsgx.S150
-rw-r--r--arch/x86/entry/vsyscall/Makefile6
-rw-r--r--arch/x86/entry/vsyscall/vsyscall_64.c383
-rw-r--r--arch/x86/entry/vsyscall/vsyscall_emu_64.S39
-rw-r--r--arch/x86/entry/vsyscall/vsyscall_trace.h30
-rw-r--r--arch/x86/events/Kconfig55
-rw-r--r--arch/x86/events/Makefile8
-rw-r--r--arch/x86/events/amd/Makefile10
-rw-r--r--arch/x86/events/amd/brs.c432
-rw-r--r--arch/x86/events/amd/core.c1595
-rw-r--r--arch/x86/events/amd/ibs.c1790
-rw-r--r--arch/x86/events/amd/iommu.c491
-rw-r--r--arch/x86/events/amd/iommu.h24
-rw-r--r--arch/x86/events/amd/lbr.c436
-rw-r--r--arch/x86/events/amd/power.c306
-rw-r--r--arch/x86/events/amd/uncore.c1227
-rw-r--r--arch/x86/events/core.c3093
-rw-r--r--arch/x86/events/intel/Makefile8
-rw-r--r--arch/x86/events/intel/bts.c646
-rw-r--r--arch/x86/events/intel/core.c8223
-rw-r--r--arch/x86/events/intel/cstate.c768
-rw-r--r--arch/x86/events/intel/ds.c3189
-rw-r--r--arch/x86/events/intel/knc.c324
-rw-r--r--arch/x86/events/intel/lbr.c1713
-rw-r--r--arch/x86/events/intel/p4.c1405
-rw-r--r--arch/x86/events/intel/p6.c261
-rw-r--r--arch/x86/events/intel/pt.c1894
-rw-r--r--arch/x86/events/intel/pt.h135
-rw-r--r--arch/x86/events/intel/uncore.c1985
-rw-r--r--arch/x86/events/intel/uncore.h650
-rw-r--r--arch/x86/events/intel/uncore_discovery.c793
-rw-r--r--arch/x86/events/intel/uncore_discovery.h177
-rw-r--r--arch/x86/events/intel/uncore_nhmex.c1230
-rw-r--r--arch/x86/events/intel/uncore_snb.c1936
-rw-r--r--arch/x86/events/intel/uncore_snbep.c6711
-rw-r--r--arch/x86/events/msr.c318
-rw-r--r--arch/x86/events/perf_event.h1871
-rw-r--r--arch/x86/events/perf_event_flags.h25
-rw-r--r--arch/x86/events/probe.c65
-rw-r--r--arch/x86/events/probe.h30
-rw-r--r--arch/x86/events/rapl.c965
-rw-r--r--arch/x86/events/utils.c253
-rw-r--r--arch/x86/events/zhaoxin/Makefile2
-rw-r--r--arch/x86/events/zhaoxin/core.c619
-rw-r--r--arch/x86/hyperv/Makefile22
-rw-r--r--arch/x86/hyperv/hv_apic.c341
-rw-r--r--arch/x86/hyperv/hv_crash.c642
-rw-r--r--arch/x86/hyperv/hv_init.c741
-rw-r--r--arch/x86/hyperv/hv_spinlock.c94
-rw-r--r--arch/x86/hyperv/hv_trampoline.S101
-rw-r--r--arch/x86/hyperv/hv_vtl.c281
-rw-r--r--arch/x86/hyperv/irqdomain.c418
-rw-r--r--arch/x86/hyperv/ivm.c945
-rw-r--r--arch/x86/hyperv/mmu.c246
-rw-r--r--arch/x86/hyperv/mshv-asm-offsets.c37
-rw-r--r--arch/x86/hyperv/mshv_vtl_asm.S116
-rw-r--r--arch/x86/hyperv/nested.c130
-rw-r--r--arch/x86/ia32/Makefile8
-rw-r--r--arch/x86/ia32/audit.c16
-rw-r--r--arch/x86/ia32/ia32_aout.c544
-rw-r--r--arch/x86/ia32/ia32_signal.c567
-rw-r--r--arch/x86/ia32/ia32entry.S835
-rw-r--r--arch/x86/ia32/ipc32.c54
-rw-r--r--arch/x86/ia32/sys_ia32.c728
-rw-r--r--arch/x86/include/asm/GEN-for-each-reg.h31
-rw-r--r--arch/x86/include/asm/Kbuild36
-rw-r--r--arch/x86/include/asm/a.out-core.h71
-rw-r--r--arch/x86/include/asm/acenv.h54
-rw-r--r--arch/x86/include/asm/acpi.h245
-rw-r--r--arch/x86/include/asm/acrn.h92
-rw-r--r--arch/x86/include/asm/aes.h11
-rw-r--r--arch/x86/include/asm/agp.h15
-rw-r--r--arch/x86/include/asm/alternative-asm.h22
-rw-r--r--arch/x86/include/asm/alternative.h383
-rw-r--r--arch/x86/include/asm/amd/hsmp.h16
-rw-r--r--arch/x86/include/asm/amd/ibs.h158
-rw-r--r--arch/x86/include/asm/amd/nb.h77
-rw-r--r--arch/x86/include/asm/amd/node.h59
-rw-r--r--arch/x86/include/asm/amd_iommu.h39
-rw-r--r--arch/x86/include/asm/amd_iommu_types.h460
-rw-r--r--arch/x86/include/asm/apic.h787
-rw-r--r--arch/x86/include/asm/apicdef.h305
-rw-r--r--arch/x86/include/asm/apicnum.h12
-rw-r--r--arch/x86/include/asm/apm.h7
-rw-r--r--arch/x86/include/asm/arch_hweight.h57
-rw-r--r--arch/x86/include/asm/archrandom.h60
-rw-r--r--arch/x86/include/asm/asm-offsets.h1
-rw-r--r--arch/x86/include/asm/asm-prototypes.h25
-rw-r--r--arch/x86/include/asm/asm.h240
-rw-r--r--arch/x86/include/asm/atomic.h176
-rw-r--r--arch/x86/include/asm/atomic64_32.h312
-rw-r--r--arch/x86/include/asm/atomic64_64.h165
-rw-r--r--arch/x86/include/asm/atomic_32.h415
-rw-r--r--arch/x86/include/asm/atomic_64.h485
-rw-r--r--arch/x86/include/asm/audit.h14
-rw-r--r--arch/x86/include/asm/barrier.h83
-rw-r--r--arch/x86/include/asm/bios_ebda.h10
-rw-r--r--arch/x86/include/asm/bitops.h513
-rw-r--r--arch/x86/include/asm/boot.h105
-rw-r--r--arch/x86/include/asm/bootparam.h112
-rw-r--r--arch/x86/include/asm/bootparam_utils.h91
-rw-r--r--arch/x86/include/asm/bug.h196
-rw-r--r--arch/x86/include/asm/bugs.h5
-rw-r--r--arch/x86/include/asm/cache.h14
-rw-r--r--arch/x86/include/asm/cacheflush.h138
-rw-r--r--arch/x86/include/asm/cacheinfo.h18
-rw-r--r--arch/x86/include/asm/calgary.h72
-rw-r--r--arch/x86/include/asm/calling.h220
-rw-r--r--arch/x86/include/asm/ce4100.h13
-rw-r--r--arch/x86/include/asm/cfi.h164
-rw-r--r--arch/x86/include/asm/checksum.h14
-rw-r--r--arch/x86/include/asm/checksum_32.h59
-rw-r--r--arch/x86/include/asm/checksum_64.h43
-rw-r--r--arch/x86/include/asm/clocksource.h21
-rw-r--r--arch/x86/include/asm/cmdline.h13
-rw-r--r--arch/x86/include/asm/cmpxchg.h243
-rw-r--r--arch/x86/include/asm/cmpxchg_32.h409
-rw-r--r--arch/x86/include/asm/cmpxchg_64.h236
-rw-r--r--arch/x86/include/asm/coco.h50
-rw-r--r--arch/x86/include/asm/compat.h206
-rw-r--r--arch/x86/include/asm/cpu.h65
-rw-r--r--arch/x86/include/asm/cpu_debug.h127
-rw-r--r--arch/x86/include/asm/cpu_device_id.h207
-rw-r--r--arch/x86/include/asm/cpu_entry_area.h153
-rw-r--r--arch/x86/include/asm/cpufeature.h376
-rw-r--r--arch/x86/include/asm/cpufeatures.h573
-rw-r--r--arch/x86/include/asm/cpuid/api.h292
-rw-r--r--arch/x86/include/asm/cpuid/types.h127
-rw-r--r--arch/x86/include/asm/cpuidle_haltpoll.h8
-rw-r--r--arch/x86/include/asm/cpumask.h40
-rw-r--r--arch/x86/include/asm/cputime.h1
-rw-r--r--arch/x86/include/asm/crash.h12
-rw-r--r--arch/x86/include/asm/crash_reserve.h44
-rw-r--r--arch/x86/include/asm/current.h19
-rw-r--r--arch/x86/include/asm/debugreg.h245
-rw-r--r--arch/x86/include/asm/delay.h31
-rw-r--r--arch/x86/include/asm/desc.h491
-rw-r--r--arch/x86/include/asm/desc_defs.h206
-rw-r--r--arch/x86/include/asm/device.h13
-rw-r--r--arch/x86/include/asm/div64.h86
-rw-r--r--arch/x86/include/asm/dma-mapping.h136
-rw-r--r--arch/x86/include/asm/dma.h30
-rw-r--r--arch/x86/include/asm/dmi.h9
-rw-r--r--arch/x86/include/asm/do_timer.h16
-rw-r--r--arch/x86/include/asm/doublefault.h17
-rw-r--r--arch/x86/include/asm/ds.h302
-rw-r--r--arch/x86/include/asm/dwarf2.h91
-rw-r--r--arch/x86/include/asm/e820.h147
-rw-r--r--arch/x86/include/asm/e820/api.h55
-rw-r--r--arch/x86/include/asm/e820/types.h104
-rw-r--r--arch/x86/include/asm/edac.h5
-rw-r--r--arch/x86/include/asm/efi.h498
-rw-r--r--arch/x86/include/asm/elf.h221
-rw-r--r--arch/x86/include/asm/elfcore-compat.h31
-rw-r--r--arch/x86/include/asm/emergency-restart.h15
-rw-r--r--arch/x86/include/asm/emulate_prefix.h14
-rw-r--r--arch/x86/include/asm/enclu.h9
-rw-r--r--arch/x86/include/asm/entry-common.h112
-rw-r--r--arch/x86/include/asm/entry_arch.h68
-rw-r--r--arch/x86/include/asm/errno.h1
-rw-r--r--arch/x86/include/asm/espfix.h18
-rw-r--r--arch/x86/include/asm/exec.h1
-rw-r--r--arch/x86/include/asm/extable.h60
-rw-r--r--arch/x86/include/asm/extable_fixup_types.h71
-rw-r--r--arch/x86/include/asm/fb.h21
-rw-r--r--arch/x86/include/asm/fcntl.h1
-rw-r--r--arch/x86/include/asm/fixmap.h175
-rw-r--r--arch/x86/include/asm/floppy.h54
-rw-r--r--arch/x86/include/asm/fpu.h13
-rw-r--r--arch/x86/include/asm/fpu/api.h180
-rw-r--r--arch/x86/include/asm/fpu/regset.h23
-rw-r--r--arch/x86/include/asm/fpu/sched.h55
-rw-r--r--arch/x86/include/asm/fpu/signal.h37
-rw-r--r--arch/x86/include/asm/fpu/types.h647
-rw-r--r--arch/x86/include/asm/fpu/xcr.h35
-rw-r--r--arch/x86/include/asm/fpu/xstate.h134
-rw-r--r--arch/x86/include/asm/frame.h130
-rw-r--r--arch/x86/include/asm/fred.h119
-rw-r--r--arch/x86/include/asm/fsgsbase.h85
-rw-r--r--arch/x86/include/asm/ftrace.h177
-rw-r--r--arch/x86/include/asm/futex.h168
-rw-r--r--arch/x86/include/asm/gart.h53
-rw-r--r--arch/x86/include/asm/geode.h224
-rw-r--r--arch/x86/include/asm/gpio.h56
-rw-r--r--arch/x86/include/asm/gsseg.h66
-rw-r--r--arch/x86/include/asm/hardirq.h79
-rw-r--r--arch/x86/include/asm/highmem.h54
-rw-r--r--arch/x86/include/asm/hpet.h26
-rw-r--r--arch/x86/include/asm/hugetlb.h89
-rw-r--r--arch/x86/include/asm/hw_breakpoint.h77
-rw-r--r--arch/x86/include/asm/hw_irq.h182
-rw-r--r--arch/x86/include/asm/hypertransport.h45
-rw-r--r--arch/x86/include/asm/hyperv_timer.h9
-rw-r--r--arch/x86/include/asm/hypervisor.h69
-rw-r--r--arch/x86/include/asm/i387.h405
-rw-r--r--arch/x86/include/asm/i8253.h18
-rw-r--r--arch/x86/include/asm/i8259.h40
-rw-r--r--arch/x86/include/asm/ia32.h119
-rw-r--r--arch/x86/include/asm/ia32_unistd.h18
-rw-r--r--arch/x86/include/asm/ibt.h117
-rw-r--r--arch/x86/include/asm/idle.h21
-rw-r--r--arch/x86/include/asm/idtentry.h775
-rw-r--r--arch/x86/include/asm/imr.h56
-rw-r--r--arch/x86/include/asm/inat.h266
-rw-r--r--arch/x86/include/asm/inat_types.h15
-rw-r--r--arch/x86/include/asm/init.h30
-rw-r--r--arch/x86/include/asm/insn-eval.h49
-rw-r--r--arch/x86/include/asm/insn.h344
-rw-r--r--arch/x86/include/asm/inst.h148
-rw-r--r--arch/x86/include/asm/intel-family.h227
-rw-r--r--arch/x86/include/asm/intel-mid.h23
-rw-r--r--arch/x86/include/asm/intel_ds.h47
-rw-r--r--arch/x86/include/asm/intel_pt.h41
-rw-r--r--arch/x86/include/asm/intel_punit_ipc.h95
-rw-r--r--arch/x86/include/asm/intel_telemetry.h102
-rw-r--r--arch/x86/include/asm/invpcid.h50
-rw-r--r--arch/x86/include/asm/io.h337
-rw-r--r--arch/x86/include/asm/io_32.h196
-rw-r--r--arch/x86/include/asm/io_64.h181
-rw-r--r--arch/x86/include/asm/io_apic.h219
-rw-r--r--arch/x86/include/asm/io_bitmap.h52
-rw-r--r--arch/x86/include/asm/ioctl.h1
-rw-r--r--arch/x86/include/asm/ioctls.h94
-rw-r--r--arch/x86/include/asm/iomap.h26
-rw-r--r--arch/x86/include/asm/iommu.h34
-rw-r--r--arch/x86/include/asm/iosf_mbi.h246
-rw-r--r--arch/x86/include/asm/ipcbuf.h28
-rw-r--r--arch/x86/include/asm/ipi.h162
-rw-r--r--arch/x86/include/asm/irq.h46
-rw-r--r--arch/x86/include/asm/irq_regs.h31
-rw-r--r--arch/x86/include/asm/irq_remapping.h92
-rw-r--r--arch/x86/include/asm/irq_stack.h241
-rw-r--r--arch/x86/include/asm/irq_vectors.h135
-rw-r--r--arch/x86/include/asm/irq_work.h19
-rw-r--r--arch/x86/include/asm/irqdomain.h64
-rw-r--r--arch/x86/include/asm/irqflags.h204
-rw-r--r--arch/x86/include/asm/ist.h28
-rw-r--r--arch/x86/include/asm/jailhouse_para.h26
-rw-r--r--arch/x86/include/asm/jump_label.h61
-rw-r--r--arch/x86/include/asm/k8.h28
-rw-r--r--arch/x86/include/asm/kasan.h41
-rw-r--r--arch/x86/include/asm/kaslr.h15
-rw-r--r--arch/x86/include/asm/kbdleds.h18
-rw-r--r--arch/x86/include/asm/kdebug.h24
-rw-r--r--arch/x86/include/asm/kexec-bzimage64.h7
-rw-r--r--arch/x86/include/asm/kexec.h221
-rw-r--r--arch/x86/include/asm/kfence.h73
-rw-r--r--arch/x86/include/asm/kgdb.h31
-rw-r--r--arch/x86/include/asm/kmap_types.h12
-rw-r--r--arch/x86/include/asm/kmemcheck.h42
-rw-r--r--arch/x86/include/asm/kmsan.h102
-rw-r--r--arch/x86/include/asm/kprobes.h101
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h154
-rw-r--r--arch/x86/include/asm/kvm-x86-pmu-ops.h27
-rw-r--r--arch/x86/include/asm/kvm.h243
-rw-r--r--arch/x86/include/asm/kvm_host.h2714
-rw-r--r--arch/x86/include/asm/kvm_page_track.h62
-rw-r--r--arch/x86/include/asm/kvm_para.h158
-rw-r--r--arch/x86/include/asm/kvm_types.h22
-rw-r--r--arch/x86/include/asm/kvm_vcpu_regs.h25
-rw-r--r--arch/x86/include/asm/kvm_x86_emulate.h187
-rw-r--r--arch/x86/include/asm/kvmclock.h19
-rw-r--r--arch/x86/include/asm/lguest.h98
-rw-r--r--arch/x86/include/asm/lguest_hcall.h58
-rw-r--r--arch/x86/include/asm/linkage.h177
-rw-r--r--arch/x86/include/asm/local.h157
-rw-r--r--arch/x86/include/asm/mach_timer.h3
-rw-r--r--arch/x86/include/asm/mach_traps.h15
-rw-r--r--arch/x86/include/asm/math_emu.h7
-rw-r--r--arch/x86/include/asm/mc146818rtc.h13
-rw-r--r--arch/x86/include/asm/mca.h43
-rw-r--r--arch/x86/include/asm/mca_dma.h201
-rw-r--r--arch/x86/include/asm/mce.h448
-rw-r--r--arch/x86/include/asm/mem_encrypt.h119
-rw-r--r--arch/x86/include/asm/memtype.h29
-rw-r--r--arch/x86/include/asm/microcode.h103
-rw-r--r--arch/x86/include/asm/misc.h7
-rw-r--r--arch/x86/include/asm/mman.h29
-rw-r--r--arch/x86/include/asm/mmconfig.h5
-rw-r--r--arch/x86/include/asm/mmu.h93
-rw-r--r--arch/x86/include/asm/mmu_context.h298
-rw-r--r--arch/x86/include/asm/mmx.h14
-rw-r--r--arch/x86/include/asm/mmzone.h5
-rw-r--r--arch/x86/include/asm/mmzone_32.h99
-rw-r--r--arch/x86/include/asm/mmzone_64.h49
-rw-r--r--arch/x86/include/asm/module.h86
-rw-r--r--arch/x86/include/asm/mpspec.h150
-rw-r--r--arch/x86/include/asm/mpspec_def.h27
-rw-r--r--arch/x86/include/asm/msgbuf.h39
-rw-r--r--arch/x86/include/asm/mshyperv.h319
-rw-r--r--arch/x86/include/asm/msi.h71
-rw-r--r--arch/x86/include/asm/msidef.h56
-rw-r--r--arch/x86/include/asm/msr-index.h1008
-rw-r--r--arch/x86/include/asm/msr-trace.h58
-rw-r--r--arch/x86/include/asm/msr.h367
-rw-r--r--arch/x86/include/asm/mtrr.h146
-rw-r--r--arch/x86/include/asm/mutex.h5
-rw-r--r--arch/x86/include/asm/mutex_32.h125
-rw-r--r--arch/x86/include/asm/mutex_64.h100
-rw-r--r--arch/x86/include/asm/mwait.h150
-rw-r--r--arch/x86/include/asm/nmi.h150
-rw-r--r--arch/x86/include/asm/nops.h183
-rw-r--r--arch/x86/include/asm/nospec-branch.h626
-rw-r--r--arch/x86/include/asm/numa.h67
-rw-r--r--arch/x86/include/asm/numa_32.h15
-rw-r--r--arch/x86/include/asm/numa_64.h47
-rw-r--r--arch/x86/include/asm/numachip/numachip.h20
-rw-r--r--arch/x86/include/asm/numachip/numachip_csr.h98
-rw-r--r--arch/x86/include/asm/numaq.h171
-rw-r--r--arch/x86/include/asm/olpc.h66
-rw-r--r--arch/x86/include/asm/olpc_ofw.h38
-rw-r--r--arch/x86/include/asm/orc_header.h19
-rw-r--r--arch/x86/include/asm/orc_lookup.h34
-rw-r--r--arch/x86/include/asm/orc_types.h78
-rw-r--r--arch/x86/include/asm/page.h47
-rw-r--r--arch/x86/include/asm/page_32.h28
-rw-r--r--arch/x86/include/asm/page_32_types.h64
-rw-r--r--arch/x86/include/asm/page_64.h107
-rw-r--r--arch/x86/include/asm/page_64_types.h115
-rw-r--r--arch/x86/include/asm/page_types.h50
-rw-r--r--arch/x86/include/asm/param.h22
-rw-r--r--arch/x86/include/asm/paravirt.h1651
-rw-r--r--arch/x86/include/asm/paravirt_api_clock.h1
-rw-r--r--arch/x86/include/asm/paravirt_types.h530
-rw-r--r--arch/x86/include/asm/parport.h5
-rw-r--r--arch/x86/include/asm/pat.h22
-rw-r--r--arch/x86/include/asm/pc-conf-reg.h33
-rw-r--r--arch/x86/include/asm/pci-direct.h5
-rw-r--r--arch/x86/include/asm/pci-functions.h1
-rw-r--r--arch/x86/include/asm/pci.h129
-rw-r--r--arch/x86/include/asm/pci_64.h29
-rw-r--r--arch/x86/include/asm/pci_x86.h132
-rw-r--r--arch/x86/include/asm/percpu.h757
-rw-r--r--arch/x86/include/asm/perf_counter.h98
-rw-r--r--arch/x86/include/asm/perf_event.h813
-rw-r--r--arch/x86/include/asm/perf_event_p4.h877
-rw-r--r--arch/x86/include/asm/pgalloc.h117
-rw-r--r--arch/x86/include/asm/pgtable-2level.h115
-rw-r--r--arch/x86/include/asm/pgtable-2level_types.h20
-rw-r--r--arch/x86/include/asm/pgtable-3level.h171
-rw-r--r--arch/x86/include/asm/pgtable-3level_types.h21
-rw-r--r--arch/x86/include/asm/pgtable-invert.h41
-rw-r--r--arch/x86/include/asm/pgtable.h1496
-rw-r--r--arch/x86/include/asm/pgtable_32.h76
-rw-r--r--arch/x86/include/asm/pgtable_32_areas.h53
-rw-r--r--arch/x86/include/asm/pgtable_32_types.h44
-rw-r--r--arch/x86/include/asm/pgtable_64.h238
-rw-r--r--arch/x86/include/asm/pgtable_64_types.h161
-rw-r--r--arch/x86/include/asm/pgtable_areas.h22
-rw-r--r--arch/x86/include/asm/pgtable_types.h534
-rw-r--r--arch/x86/include/asm/pkeys.h126
-rw-r--r--arch/x86/include/asm/pkru.h62
-rw-r--r--arch/x86/include/asm/platform_sst_audio.h136
-rw-r--r--arch/x86/include/asm/pm-trace.h (renamed from arch/x86/include/asm/resume-trace.h)11
-rw-r--r--arch/x86/include/asm/poll.h1
-rw-r--r--arch/x86/include/asm/posix_types.h13
-rw-r--r--arch/x86/include/asm/posix_types_32.h85
-rw-r--r--arch/x86/include/asm/posix_types_64.h119
-rw-r--r--arch/x86/include/asm/posted_intr.h187
-rw-r--r--arch/x86/include/asm/prctl.h9
-rw-r--r--arch/x86/include/asm/preempt.h151
-rw-r--r--arch/x86/include/asm/probe_roms.h9
-rw-r--r--arch/x86/include/asm/processor-cyrix.h30
-rw-r--r--arch/x86/include/asm/processor-flags.h130
-rw-r--r--arch/x86/include/asm/processor.h1172
-rw-r--r--arch/x86/include/asm/prom.h37
-rw-r--r--arch/x86/include/asm/proto.h56
-rw-r--r--arch/x86/include/asm/pti.h15
-rw-r--r--arch/x86/include/asm/ptrace-abi.h142
-rw-r--r--arch/x86/include/asm/ptrace.h482
-rw-r--r--arch/x86/include/asm/purgatory.h11
-rw-r--r--arch/x86/include/asm/pvclock-abi.h12
-rw-r--r--arch/x86/include/asm/pvclock.h100
-rw-r--r--arch/x86/include/asm/qrwlock.h8
-rw-r--r--arch/x86/include/asm/qspinlock.h116
-rw-r--r--arch/x86/include/asm/qspinlock_paravirt.h67
-rw-r--r--arch/x86/include/asm/rdc321x_defs.h12
-rw-r--r--arch/x86/include/asm/realmode.h100
-rw-r--r--arch/x86/include/asm/reboot.h21
-rw-r--r--arch/x86/include/asm/reboot_fixups.h1
-rw-r--r--arch/x86/include/asm/required-features.h88
-rw-r--r--arch/x86/include/asm/resctrl.h203
-rw-r--r--arch/x86/include/asm/resource.h1
-rw-r--r--arch/x86/include/asm/rio.h63
-rw-r--r--arch/x86/include/asm/rmwcc.h43
-rw-r--r--arch/x86/include/asm/rqspinlock.h33
-rw-r--r--arch/x86/include/asm/rtc.h1
-rw-r--r--arch/x86/include/asm/runtime-const.h78
-rw-r--r--arch/x86/include/asm/rwlock.h8
-rw-r--r--arch/x86/include/asm/rwsem.h265
-rw-r--r--arch/x86/include/asm/scatterlist.h33
-rw-r--r--arch/x86/include/asm/seccomp.h42
-rw-r--r--arch/x86/include/asm/seccomp_32.h11
-rw-r--r--arch/x86/include/asm/seccomp_64.h17
-rw-r--r--arch/x86/include/asm/sections.h12
-rw-r--r--arch/x86/include/asm/segment.h392
-rw-r--r--arch/x86/include/asm/sembuf.h24
-rw-r--r--arch/x86/include/asm/serial.h27
-rw-r--r--arch/x86/include/asm/set_memory.h97
-rw-r--r--arch/x86/include/asm/setup.h169
-rw-r--r--arch/x86/include/asm/setup_data.h32
-rw-r--r--arch/x86/include/asm/sev-common.h261
-rw-r--r--arch/x86/include/asm/sev-internal.h87
-rw-r--r--arch/x86/include/asm/sev.h682
-rw-r--r--arch/x86/include/asm/sgx.h430
-rw-r--r--arch/x86/include/asm/shared/io.h34
-rw-r--r--arch/x86/include/asm/shared/msr.h30
-rw-r--r--arch/x86/include/asm/shared/tdx.h191
-rw-r--r--arch/x86/include/asm/shmbuf.h51
-rw-r--r--arch/x86/include/asm/shmparam.h1
-rw-r--r--arch/x86/include/asm/shstk.h46
-rw-r--r--arch/x86/include/asm/sigcontext.h287
-rw-r--r--arch/x86/include/asm/sigcontext32.h77
-rw-r--r--arch/x86/include/asm/sigframe.h38
-rw-r--r--arch/x86/include/asm/sighandling.h49
-rw-r--r--arch/x86/include/asm/siginfo.h10
-rw-r--r--arch/x86/include/asm/signal.h188
-rw-r--r--arch/x86/include/asm/simd.h18
-rw-r--r--arch/x86/include/asm/smap.h110
-rw-r--r--arch/x86/include/asm/smp.h188
-rw-r--r--arch/x86/include/asm/smpboot_hooks.h61
-rw-r--r--arch/x86/include/asm/socket.h60
-rw-r--r--arch/x86/include/asm/sockios.h13
-rw-r--r--arch/x86/include/asm/softirq_stack.h11
-rw-r--r--arch/x86/include/asm/sparsemem.h12
-rw-r--r--arch/x86/include/asm/spec-ctrl.h101
-rw-r--r--arch/x86/include/asm/special_insns.h308
-rw-r--r--arch/x86/include/asm/spinlock.h288
-rw-r--r--arch/x86/include/asm/spinlock_types.h19
-rw-r--r--arch/x86/include/asm/srat.h39
-rw-r--r--arch/x86/include/asm/stackprotector.h109
-rw-r--r--arch/x86/include/asm/stacktrace.h117
-rw-r--r--arch/x86/include/asm/static_call.h83
-rw-r--r--arch/x86/include/asm/string.h31
-rw-r--r--arch/x86/include/asm/string_32.h183
-rw-r--r--arch/x86/include/asm/string_64.h117
-rw-r--r--arch/x86/include/asm/suspend.h13
-rw-r--r--arch/x86/include/asm/suspend_32.h21
-rw-r--r--arch/x86/include/asm/suspend_64.h53
-rw-r--r--arch/x86/include/asm/svm.h638
-rw-r--r--arch/x86/include/asm/swiotlb.h22
-rw-r--r--arch/x86/include/asm/switch_to.h92
-rw-r--r--arch/x86/include/asm/sync_bitops.h44
-rw-r--r--arch/x86/include/asm/sync_core.h111
-rw-r--r--arch/x86/include/asm/sys_ia32.h99
-rw-r--r--arch/x86/include/asm/syscall.h219
-rw-r--r--arch/x86/include/asm/syscall_wrapper.h264
-rw-r--r--arch/x86/include/asm/syscalls.h86
-rw-r--r--arch/x86/include/asm/system.h485
-rw-r--r--arch/x86/include/asm/system_64.h22
-rw-r--r--arch/x86/include/asm/tce.h48
-rw-r--r--arch/x86/include/asm/tdx.h240
-rw-r--r--arch/x86/include/asm/tdx_global_metadata.h44
-rw-r--r--arch/x86/include/asm/termbits.h198
-rw-r--r--arch/x86/include/asm/termios.h114
-rw-r--r--arch/x86/include/asm/text-patching.h218
-rw-r--r--arch/x86/include/asm/thermal.h15
-rw-r--r--arch/x86/include/asm/thread_info.h352
-rw-r--r--arch/x86/include/asm/time.h61
-rw-r--r--arch/x86/include/asm/timer.h74
-rw-r--r--arch/x86/include/asm/timex.h10
-rw-r--r--arch/x86/include/asm/tlb.h163
-rw-r--r--arch/x86/include/asm/tlbbatch.h20
-rw-r--r--arch/x86/include/asm/tlbflush.h512
-rw-r--r--arch/x86/include/asm/topology.h340
-rw-r--r--arch/x86/include/asm/trace/fpu.h79
-rw-r--r--arch/x86/include/asm/trace/hyperv.h98
-rw-r--r--arch/x86/include/asm/trace/irq_vectors.h382
-rw-r--r--arch/x86/include/asm/trace_clock.h21
-rw-r--r--arch/x86/include/asm/trampoline.h29
-rw-r--r--arch/x86/include/asm/trap_pf.h32
-rw-r--r--arch/x86/include/asm/trapnr.h44
-rw-r--r--arch/x86/include/asm/traps.h101
-rw-r--r--arch/x86/include/asm/tsc.h111
-rw-r--r--arch/x86/include/asm/types.h30
-rw-r--r--arch/x86/include/asm/uaccess.h895
-rw-r--r--arch/x86/include/asm/uaccess_32.h193
-rw-r--r--arch/x86/include/asm/uaccess_64.h330
-rw-r--r--arch/x86/include/asm/ucontext.h18
-rw-r--r--arch/x86/include/asm/umip.h12
-rw-r--r--arch/x86/include/asm/unaccepted_memory.h27
-rw-r--r--arch/x86/include/asm/unaligned.h14
-rw-r--r--arch/x86/include/asm/unistd.h67
-rw-r--r--arch/x86/include/asm/unistd_32.h383
-rw-r--r--arch/x86/include/asm/unistd_64.h700
-rw-r--r--arch/x86/include/asm/unwind.h154
-rw-r--r--arch/x86/include/asm/unwind_hints.h93
-rw-r--r--arch/x86/include/asm/unwind_user.h41
-rw-r--r--arch/x86/include/asm/uprobes.h74
-rw-r--r--arch/x86/include/asm/user.h63
-rw-r--r--arch/x86/include/asm/user32.h1
-rw-r--r--arch/x86/include/asm/user_32.h5
-rw-r--r--arch/x86/include/asm/user_64.h5
-rw-r--r--arch/x86/include/asm/uv/bios.h175
-rw-r--r--arch/x86/include/asm/uv/uv.h37
-rw-r--r--arch/x86/include/asm/uv/uv_bau.h331
-rw-r--r--arch/x86/include/asm/uv/uv_geo.h103
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h626
-rw-r--r--arch/x86/include/asm/uv/uv_irq.h13
-rw-r--r--arch/x86/include/asm/uv/uv_mmrs.h5650
-rw-r--r--arch/x86/include/asm/vdso.h80
-rw-r--r--arch/x86/include/asm/vdso/clocksource.h12
-rw-r--r--arch/x86/include/asm/vdso/getrandom.h32
-rw-r--r--arch/x86/include/asm/vdso/gettimeofday.h336
-rw-r--r--arch/x86/include/asm/vdso/processor.h27
-rw-r--r--arch/x86/include/asm/vdso/vsyscall.h22
-rw-r--r--arch/x86/include/asm/vermagic.h64
-rw-r--r--arch/x86/include/asm/vga.h15
-rw-r--r--arch/x86/include/asm/vgtod.h34
-rw-r--r--arch/x86/include/asm/video.h23
-rw-r--r--arch/x86/include/asm/virtext.h132
-rw-r--r--arch/x86/include/asm/visws/cobalt.h125
-rw-r--r--arch/x86/include/asm/visws/lithium.h53
-rw-r--r--arch/x86/include/asm/visws/piix4.h107
-rw-r--r--arch/x86/include/asm/visws/sgivw.h5
-rw-r--r--arch/x86/include/asm/vm86.h187
-rw-r--r--arch/x86/include/asm/vmalloc.h26
-rw-r--r--arch/x86/include/asm/vmi.h269
-rw-r--r--arch/x86/include/asm/vmi_time.h98
-rw-r--r--arch/x86/include/asm/vmware.h336
-rw-r--r--arch/x86/include/asm/vmx.h553
-rw-r--r--arch/x86/include/asm/vmxfeatures.h93
-rw-r--r--arch/x86/include/asm/vsyscall.h65
-rw-r--r--arch/x86/include/asm/word-at-a-time.h84
-rw-r--r--arch/x86/include/asm/x86_init.h355
-rw-r--r--arch/x86/include/asm/xcr.h49
-rw-r--r--arch/x86/include/asm/xen/cpuid.h139
-rw-r--r--arch/x86/include/asm/xen/events.h20
-rw-r--r--arch/x86/include/asm/xen/grant_table.h7
-rw-r--r--arch/x86/include/asm/xen/hypercall.h434
-rw-r--r--arch/x86/include/asm/xen/hypervisor.h76
-rw-r--r--arch/x86/include/asm/xen/interface.h273
-rw-r--r--arch/x86/include/asm/xen/interface_32.h10
-rw-r--r--arch/x86/include/asm/xen/interface_64.h20
-rw-r--r--arch/x86/include/asm/xen/page.h260
-rw-r--r--arch/x86/include/asm/xen/pci.h67
-rw-r--r--arch/x86/include/asm/xen/swiotlb-xen.h11
-rw-r--r--arch/x86/include/asm/xen/trace_types.h19
-rw-r--r--arch/x86/include/asm/xor.h504
-rw-r--r--arch/x86/include/asm/xor_32.h407
-rw-r--r--arch/x86/include/asm/xor_64.h353
-rw-r--r--arch/x86/include/asm/xor_avx.h178
-rw-r--r--arch/x86/include/asm/xsave.h119
-rw-r--r--arch/x86/include/uapi/asm/Kbuild4
-rw-r--r--arch/x86/include/uapi/asm/a.out.h (renamed from arch/x86/include/asm/a.out.h)1
-rw-r--r--arch/x86/include/uapi/asm/amd_hsmp.h477
-rw-r--r--arch/x86/include/uapi/asm/auxvec.h (renamed from arch/x86/include/asm/auxvec.h)8
-rw-r--r--arch/x86/include/uapi/asm/bitsperlong.h (renamed from arch/x86/include/asm/bitsperlong.h)3
-rw-r--r--arch/x86/include/uapi/asm/boot.h11
-rw-r--r--arch/x86/include/uapi/asm/bootparam.h215
-rw-r--r--arch/x86/include/uapi/asm/byteorder.h (renamed from arch/x86/include/asm/byteorder.h)1
-rw-r--r--arch/x86/include/uapi/asm/debugreg.h101
-rw-r--r--arch/x86/include/uapi/asm/e820.h82
-rw-r--r--arch/x86/include/uapi/asm/elf.h16
-rw-r--r--arch/x86/include/uapi/asm/hw_breakpoint.h2
-rw-r--r--arch/x86/include/uapi/asm/hwcap2.h13
-rw-r--r--arch/x86/include/uapi/asm/ist.h30
-rw-r--r--arch/x86/include/uapi/asm/kvm.h1046
-rw-r--r--arch/x86/include/uapi/asm/kvm_para.h152
-rw-r--r--arch/x86/include/uapi/asm/kvm_perf.h17
-rw-r--r--arch/x86/include/uapi/asm/ldt.h (renamed from arch/x86/include/asm/ldt.h)12
-rw-r--r--arch/x86/include/uapi/asm/mce.h46
-rw-r--r--arch/x86/include/uapi/asm/mman.h10
-rw-r--r--arch/x86/include/uapi/asm/msgbuf.h35
-rw-r--r--arch/x86/include/uapi/asm/msr.h14
-rw-r--r--arch/x86/include/uapi/asm/mtrr.h116
-rw-r--r--arch/x86/include/uapi/asm/perf_regs.h58
-rw-r--r--arch/x86/include/uapi/asm/posix_types.h10
-rw-r--r--arch/x86/include/uapi/asm/posix_types_32.h26
-rw-r--r--arch/x86/include/uapi/asm/posix_types_64.h20
-rw-r--r--arch/x86/include/uapi/asm/posix_types_x32.h20
-rw-r--r--arch/x86/include/uapi/asm/prctl.h43
-rw-r--r--arch/x86/include/uapi/asm/processor-flags.h183
-rw-r--r--arch/x86/include/uapi/asm/ptrace-abi.h94
-rw-r--r--arch/x86/include/uapi/asm/ptrace.h86
-rw-r--r--arch/x86/include/uapi/asm/sembuf.h36
-rw-r--r--arch/x86/include/uapi/asm/setup.h1
-rw-r--r--arch/x86/include/uapi/asm/setup_data.h94
-rw-r--r--arch/x86/include/uapi/asm/sgx.h238
-rw-r--r--arch/x86/include/uapi/asm/shmbuf.h47
-rw-r--r--arch/x86/include/uapi/asm/sigcontext.h389
-rw-r--r--arch/x86/include/uapi/asm/sigcontext32.h9
-rw-r--r--arch/x86/include/uapi/asm/siginfo.h15
-rw-r--r--arch/x86/include/uapi/asm/signal.h111
-rw-r--r--arch/x86/include/uapi/asm/stat.h (renamed from arch/x86/include/asm/stat.h)62
-rw-r--r--arch/x86/include/uapi/asm/statfs.h (renamed from arch/x86/include/asm/statfs.h)1
-rw-r--r--arch/x86/include/uapi/asm/svm.h252
-rw-r--r--arch/x86/include/uapi/asm/swab.h (renamed from arch/x86/include/asm/swab.h)30
-rw-r--r--arch/x86/include/uapi/asm/ucontext.h56
-rw-r--r--arch/x86/include/uapi/asm/unistd.h25
-rw-r--r--arch/x86/include/uapi/asm/vm86.h130
-rw-r--r--arch/x86/include/uapi/asm/vmx.h175
-rw-r--r--arch/x86/include/uapi/asm/vsyscall.h13
-rw-r--r--arch/x86/kernel/.gitignore1
-rw-r--r--arch/x86/kernel/Makefile170
-rw-r--r--arch/x86/kernel/acpi/Makefile14
-rw-r--r--arch/x86/kernel/acpi/apei.c52
-rw-r--r--arch/x86/kernel/acpi/boot.c1573
-rw-r--r--arch/x86/kernel/acpi/cppc.c298
-rw-r--r--arch/x86/kernel/acpi/cstate.c124
-rw-r--r--arch/x86/kernel/acpi/madt_playdead.S29
-rw-r--r--arch/x86/kernel/acpi/madt_wakeup.c249
-rw-r--r--arch/x86/kernel/acpi/processor.c100
-rw-r--r--arch/x86/kernel/acpi/realmode/.gitignore3
-rw-r--r--arch/x86/kernel/acpi/realmode/Makefile59
-rw-r--r--arch/x86/kernel/acpi/realmode/bioscall.S1
-rw-r--r--arch/x86/kernel/acpi/realmode/copy.S1
-rw-r--r--arch/x86/kernel/acpi/realmode/regs.c1
-rw-r--r--arch/x86/kernel/acpi/realmode/video-bios.c1
-rw-r--r--arch/x86/kernel/acpi/realmode/video-mode.c1
-rw-r--r--arch/x86/kernel/acpi/realmode/video-vesa.c1
-rw-r--r--arch/x86/kernel/acpi/realmode/video-vga.c1
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.S149
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.lds.S61
-rw-r--r--arch/x86/kernel/acpi/sleep.c224
-rw-r--r--arch/x86/kernel/acpi/sleep.h15
-rw-r--r--arch/x86/kernel/acpi/wakeup_32.S33
-rw-r--r--arch/x86/kernel/acpi/wakeup_64.S85
-rw-r--r--arch/x86/kernel/acpi/wakeup_rm.S10
-rw-r--r--arch/x86/kernel/alternative.c3293
-rw-r--r--arch/x86/kernel/amd_gart_64.c (renamed from arch/x86/kernel/pci-gart_64.c)408
-rw-r--r--arch/x86/kernel/amd_iommu.c2193
-rw-r--r--arch/x86/kernel/amd_iommu_init.c1349
-rw-r--r--arch/x86/kernel/amd_nb.c331
-rw-r--r--arch/x86/kernel/amd_node.c316
-rw-r--r--arch/x86/kernel/aperture_64.c328
-rw-r--r--arch/x86/kernel/apic/Makefile25
-rw-r--r--arch/x86/kernel/apic/apic.c2333
-rw-r--r--arch/x86/kernel/apic/apic_common.c43
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c363
-rw-r--r--arch/x86/kernel/apic/apic_noop.c80
-rw-r--r--arch/x86/kernel/apic/apic_numachip.c272
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c267
-rw-r--r--arch/x86/kernel/apic/es7000_32.c782
-rw-r--r--arch/x86/kernel/apic/hw_nmi.c61
-rw-r--r--arch/x86/kernel/apic/init.c110
-rw-r--r--arch/x86/kernel/apic/io_apic.c4894
-rw-r--r--arch/x86/kernel/apic/ipi.c331
-rw-r--r--arch/x86/kernel/apic/local.h68
-rw-r--r--arch/x86/kernel/apic/msi.c391
-rw-r--r--arch/x86/kernel/apic/nmi.c566
-rw-r--r--arch/x86/kernel/apic/numaq_32.c559
-rw-r--r--arch/x86/kernel/apic/probe_32.c230
-rw-r--r--arch/x86/kernel/apic/probe_64.c92
-rw-r--r--arch/x86/kernel/apic/summit_32.c568
-rw-r--r--arch/x86/kernel/apic/vector.c1387
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c336
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c231
-rw-r--r--arch/x86/kernel/apic/x2apic_savic.c428
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c1975
-rw-r--r--arch/x86/kernel/apm_32.c345
-rw-r--r--arch/x86/kernel/asm-offsets.c125
-rw-r--r--arch/x86/kernel/asm-offsets_32.c136
-rw-r--r--arch/x86/kernel/asm-offsets_64.c119
-rw-r--r--arch/x86/kernel/audit_64.c19
-rw-r--r--arch/x86/kernel/bios_uv.c192
-rw-r--r--arch/x86/kernel/bootflag.c33
-rw-r--r--arch/x86/kernel/callthunks.c383
-rw-r--r--arch/x86/kernel/cet.c162
-rw-r--r--arch/x86/kernel/cfi.c100
-rw-r--r--arch/x86/kernel/check.c108
-rw-r--r--arch/x86/kernel/cpu/.gitignore1
-rw-r--r--arch/x86/kernel/cpu/Makefile65
-rw-r--r--arch/x86/kernel/cpu/acrn.c81
-rw-r--r--arch/x86/kernel/cpu/addon_cpuid_features.c138
-rw-r--r--arch/x86/kernel/cpu/amd.c1369
-rw-r--r--arch/x86/kernel/cpu/amd_cache_disable.c301
-rw-r--r--arch/x86/kernel/cpu/aperfmperf.c552
-rw-r--r--arch/x86/kernel/cpu/bhyve.c66
-rw-r--r--arch/x86/kernel/cpu/bugs.c3767
-rw-r--r--arch/x86/kernel/cpu/bugs_64.c33
-rw-r--r--arch/x86/kernel/cpu/bus_lock.c433
-rw-r--r--arch/x86/kernel/cpu/cacheinfo.c820
-rw-r--r--arch/x86/kernel/cpu/centaur.c367
-rw-r--r--arch/x86/kernel/cpu/cmpxchg.c72
-rw-r--r--arch/x86/kernel/cpu/common.c2589
-rw-r--r--arch/x86/kernel/cpu/cpu.h79
-rw-r--r--arch/x86/kernel/cpu/cpu_debug.c688
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Kconfig252
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Makefile20
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c851
-rw-r--r--arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c446
-rw-r--r--arch/x86/kernel/cpu/cpufreq/e_powersaver.c367
-rw-r--r--arch/x86/kernel/cpu/cpufreq/elanfreq.c310
-rw-r--r--arch/x86/kernel/cpu/cpufreq/gx-suspmod.c519
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longhaul.c1029
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longhaul.h353
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longrun.c328
-rw-r--r--arch/x86/kernel/cpu/cpufreq/p4-clockmod.c337
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k6.c262
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k7.c749
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k7.h43
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c1463
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.h226
-rw-r--r--arch/x86/kernel/cpu/cpufreq/sc520_freq.c194
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c635
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-ich.c458
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-lib.c482
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-lib.h49
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-smi.c468
-rw-r--r--arch/x86/kernel/cpu/cpuid-deps.c192
-rw-r--r--arch/x86/kernel/cpu/cpuid_0x2_table.c128
-rw-r--r--arch/x86/kernel/cpu/cyrix.c105
-rw-r--r--arch/x86/kernel/cpu/debugfs.c101
-rw-r--r--arch/x86/kernel/cpu/feat_ctl.c215
-rw-r--r--arch/x86/kernel/cpu/hygon.c279
-rw-r--r--arch/x86/kernel/cpu/hypervisor.c94
-rw-r--r--arch/x86/kernel/cpu/intel.c759
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c1006
-rw-r--r--arch/x86/kernel/cpu/intel_epb.c244
-rw-r--r--arch/x86/kernel/cpu/match.c98
-rw-r--r--arch/x86/kernel/cpu/mce/Makefile14
-rw-r--r--arch/x86/kernel/cpu/mce/amd.c1270
-rw-r--r--arch/x86/kernel/cpu/mce/apei.c265
-rw-r--r--arch/x86/kernel/cpu/mce/core.c2970
-rw-r--r--arch/x86/kernel/cpu/mce/dev-mcelog.c365
-rw-r--r--arch/x86/kernel/cpu/mce/genpool.c156
-rw-r--r--arch/x86/kernel/cpu/mce/inject.c805
-rw-r--r--arch/x86/kernel/cpu/mce/intel.c537
-rw-r--r--arch/x86/kernel/cpu/mce/internal.h352
-rw-r--r--arch/x86/kernel/cpu/mce/p5.c (renamed from arch/x86/kernel/cpu/mcheck/p5.c)38
-rw-r--r--arch/x86/kernel/cpu/mce/severity.c489
-rw-r--r--arch/x86/kernel/cpu/mce/threshold.c163
-rw-r--r--arch/x86/kernel/cpu/mce/winchip.c (renamed from arch/x86/kernel/cpu/mcheck/winchip.c)25
-rw-r--r--arch/x86/kernel/cpu/mcheck/Makefile12
-rw-r--r--arch/x86/kernel/cpu/mcheck/k7.c116
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-inject.c127
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-internal.h15
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-severity.c218
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c2047
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c703
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c226
-rw-r--r--arch/x86/kernel/cpu/mcheck/non-fatal.c94
-rw-r--r--arch/x86/kernel/cpu/mcheck/p4.c163
-rw-r--r--arch/x86/kernel/cpu/mcheck/p6.c127
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c293
-rw-r--r--arch/x86/kernel/cpu/mcheck/threshold.c29
-rw-r--r--arch/x86/kernel/cpu/microcode/Makefile5
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c1306
-rw-r--r--arch/x86/kernel/cpu/microcode/amd_shas.c556
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c926
-rw-r--r--arch/x86/kernel/cpu/microcode/intel-ucode-defs.h160
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c1016
-rw-r--r--arch/x86/kernel/cpu/microcode/internal.h136
-rw-r--r--arch/x86/kernel/cpu/mkcapflags.pl32
-rw-r--r--arch/x86/kernel/cpu/mkcapflags.sh73
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c771
-rw-r--r--arch/x86/kernel/cpu/mtrr/Makefile5
-rw-r--r--arch/x86/kernel/cpu/mtrr/amd.c108
-rw-r--r--arch/x86/kernel/cpu/mtrr/centaur.c180
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c634
-rw-r--r--arch/x86/kernel/cpu/mtrr/cyrix.c134
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c976
-rw-r--r--arch/x86/kernel/cpu/mtrr/if.c236
-rw-r--r--arch/x86/kernel/cpu/mtrr/legacy.c94
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c750
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.c633
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h73
-rw-r--r--arch/x86/kernel/cpu/mtrr/state.c82
-rw-r--r--arch/x86/kernel/cpu/perf_counter.c1964
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c705
-rw-r--r--arch/x86/kernel/cpu/powerflags.c14
-rw-r--r--arch/x86/kernel/cpu/proc.c128
-rw-r--r--arch/x86/kernel/cpu/rdrand.c50
-rw-r--r--arch/x86/kernel/cpu/resctrl/Makefile7
-rw-r--r--arch/x86/kernel/cpu/resctrl/core.c1079
-rw-r--r--arch/x86/kernel/cpu/resctrl/ctrlmondata.c133
-rw-r--r--arch/x86/kernel/cpu/resctrl/internal.h225
-rw-r--r--arch/x86/kernel/cpu/resctrl/monitor.c583
-rw-r--r--arch/x86/kernel/cpu/resctrl/pseudo_lock.c517
-rw-r--r--arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h45
-rw-r--r--arch/x86/kernel/cpu/resctrl/rdtgroup.c262
-rw-r--r--arch/x86/kernel/cpu/scattered.c93
-rw-r--r--arch/x86/kernel/cpu/sgx/Makefile6
-rw-r--r--arch/x86/kernel/cpu/sgx/driver.c201
-rw-r--r--arch/x86/kernel/cpu/sgx/driver.h28
-rw-r--r--arch/x86/kernel/cpu/sgx/encl.c1326
-rw-r--r--arch/x86/kernel/cpu/sgx/encl.h129
-rw-r--r--arch/x86/kernel/cpu/sgx/encls.h241
-rw-r--r--arch/x86/kernel/cpu/sgx/ioctl.c1244
-rw-r--r--arch/x86/kernel/cpu/sgx/main.c1072
-rw-r--r--arch/x86/kernel/cpu/sgx/sgx.h110
-rw-r--r--arch/x86/kernel/cpu/sgx/virt.c454
-rw-r--r--arch/x86/kernel/cpu/topology.c581
-rw-r--r--arch/x86/kernel/cpu/topology.h67
-rw-r--r--arch/x86/kernel/cpu/topology_amd.c227
-rw-r--r--arch/x86/kernel/cpu/topology_common.c258
-rw-r--r--arch/x86/kernel/cpu/topology_ext.c145
-rw-r--r--arch/x86/kernel/cpu/transmeta.c26
-rw-r--r--arch/x86/kernel/cpu/tsx.c267
-rw-r--r--arch/x86/kernel/cpu/umc.c8
-rw-r--r--arch/x86/kernel/cpu/umwait.c246
-rw-r--r--arch/x86/kernel/cpu/vmware.c583
-rw-r--r--arch/x86/kernel/cpu/vortex.c39
-rw-r--r--arch/x86/kernel/cpu/zhaoxin.c116
-rw-r--r--arch/x86/kernel/cpuid.c186
-rw-r--r--arch/x86/kernel/crash.c534
-rw-r--r--arch/x86/kernel/crash_dump_32.c81
-rw-r--r--arch/x86/kernel/crash_dump_64.c69
-rw-r--r--arch/x86/kernel/devicetree.c319
-rw-r--r--arch/x86/kernel/doublefault_32.c140
-rw-r--r--arch/x86/kernel/ds.c1437
-rw-r--r--arch/x86/kernel/ds_selftest.c408
-rw-r--r--arch/x86/kernel/ds_selftest.h15
-rw-r--r--arch/x86/kernel/dumpstack.c573
-rw-r--r--arch/x86/kernel/dumpstack.h38
-rw-r--r--arch/x86/kernel/dumpstack_32.c233
-rw-r--r--arch/x86/kernel/dumpstack_64.c416
-rw-r--r--arch/x86/kernel/e820.c1718
-rw-r--r--arch/x86/kernel/early-quirks.c571
-rw-r--r--arch/x86/kernel/early_printk.c992
-rw-r--r--arch/x86/kernel/ebda.c98
-rw-r--r--arch/x86/kernel/efi.c606
-rw-r--r--arch/x86/kernel/efi_32.c112
-rw-r--r--arch/x86/kernel/efi_64.c114
-rw-r--r--arch/x86/kernel/efi_stub_32.S123
-rw-r--r--arch/x86/kernel/efi_stub_64.S116
-rw-r--r--arch/x86/kernel/eisa.c25
-rw-r--r--arch/x86/kernel/entry_32.S1425
-rw-r--r--arch/x86/kernel/entry_64.S1594
-rw-r--r--arch/x86/kernel/espfix_64.c205
-rw-r--r--arch/x86/kernel/fpu/Makefile6
-rw-r--r--arch/x86/kernel/fpu/bugs.c62
-rw-r--r--arch/x86/kernel/fpu/context.h82
-rw-r--r--arch/x86/kernel/fpu/core.c989
-rw-r--r--arch/x86/kernel/fpu/init.c229
-rw-r--r--arch/x86/kernel/fpu/internal.h28
-rw-r--r--arch/x86/kernel/fpu/legacy.h111
-rw-r--r--arch/x86/kernel/fpu/regset.c468
-rw-r--r--arch/x86/kernel/fpu/signal.c526
-rw-r--r--arch/x86/kernel/fpu/xstate.c2012
-rw-r--r--arch/x86/kernel/fpu/xstate.h368
-rw-r--r--arch/x86/kernel/fred.c93
-rw-r--r--arch/x86/kernel/ftrace.c881
-rw-r--r--arch/x86/kernel/ftrace_32.S201
-rw-r--r--arch/x86/kernel/ftrace_64.S402
-rw-r--r--arch/x86/kernel/geode_32.c196
-rw-r--r--arch/x86/kernel/head.c55
-rw-r--r--arch/x86/kernel/head32.c165
-rw-r--r--arch/x86/kernel/head64.c305
-rw-r--r--arch/x86/kernel/head_32.S702
-rw-r--r--arch/x86/kernel/head_64.S895
-rw-r--r--arch/x86/kernel/hpet.c1448
-rw-r--r--arch/x86/kernel/hw_breakpoint.c593
-rw-r--r--arch/x86/kernel/i386_ksyms_32.c28
-rw-r--r--arch/x86/kernel/i387.c669
-rw-r--r--arch/x86/kernel/i8237.c61
-rw-r--r--arch/x86/kernel/i8253.c224
-rw-r--r--arch/x86/kernel/i8259.c252
-rw-r--r--arch/x86/kernel/ibt_selftest.S17
-rw-r--r--arch/x86/kernel/idt.c353
-rw-r--r--arch/x86/kernel/init_task.c43
-rw-r--r--arch/x86/kernel/io_delay.c43
-rw-r--r--arch/x86/kernel/ioport.c229
-rw-r--r--arch/x86/kernel/irq.c510
-rw-r--r--arch/x86/kernel/irq_32.c248
-rw-r--r--arch/x86/kernel/irq_64.c147
-rw-r--r--arch/x86/kernel/irq_work.c34
-rw-r--r--arch/x86/kernel/irqflags.S18
-rw-r--r--arch/x86/kernel/irqinit.c253
-rw-r--r--arch/x86/kernel/itmt.c190
-rw-r--r--arch/x86/kernel/jailhouse.c296
-rw-r--r--arch/x86/kernel/jump_label.c148
-rw-r--r--arch/x86/kernel/k8.c123
-rw-r--r--arch/x86/kernel/kdebugfs.c158
-rw-r--r--arch/x86/kernel/kexec-bzimage64.c716
-rw-r--r--arch/x86/kernel/kgdb.c688
-rw-r--r--arch/x86/kernel/kprobes.c1068
-rw-r--r--arch/x86/kernel/kprobes/Makefile8
-rw-r--r--arch/x86/kernel/kprobes/common.h109
-rw-r--r--arch/x86/kernel/kprobes/core.c1081
-rw-r--r--arch/x86/kernel/kprobes/ftrace.c76
-rw-r--r--arch/x86/kernel/kprobes/opt.c551
-rw-r--r--arch/x86/kernel/ksysfs.c401
-rw-r--r--arch/x86/kernel/kvm.c1208
-rw-r--r--arch/x86/kernel/kvmclock.c369
-rw-r--r--arch/x86/kernel/ldt.c687
-rw-r--r--arch/x86/kernel/machine_kexec_32.c92
-rw-r--r--arch/x86/kernel/machine_kexec_64.c739
-rw-r--r--arch/x86/kernel/mca_32.c476
-rw-r--r--arch/x86/kernel/mfgpt_32.c410
-rw-r--r--arch/x86/kernel/microcode_amd.c354
-rw-r--r--arch/x86/kernel/microcode_core.c570
-rw-r--r--arch/x86/kernel/microcode_intel.c475
-rw-r--r--arch/x86/kernel/mmconf-fam10h_64.c100
-rw-r--r--arch/x86/kernel/module.c296
-rw-r--r--arch/x86/kernel/mpparse.c575
-rw-r--r--arch/x86/kernel/msr.c319
-rw-r--r--arch/x86/kernel/nmi.c752
-rw-r--r--arch/x86/kernel/nmi_selftest.c164
-rw-r--r--arch/x86/kernel/paravirt-spinlocks.c45
-rw-r--r--arch/x86/kernel/paravirt.c579
-rw-r--r--arch/x86/kernel/paravirt_patch_32.c61
-rw-r--r--arch/x86/kernel/paravirt_patch_64.c75
-rw-r--r--arch/x86/kernel/pci-calgary_64.c1607
-rw-r--r--arch/x86/kernel/pci-dma.c295
-rw-r--r--arch/x86/kernel/pci-nommu.c97
-rw-r--r--arch/x86/kernel/pci-swiotlb.c85
-rw-r--r--arch/x86/kernel/pcspeaker.c3
-rw-r--r--arch/x86/kernel/perf_regs.c202
-rw-r--r--arch/x86/kernel/platform-quirks.c47
-rw-r--r--arch/x86/kernel/pmem.c37
-rw-r--r--arch/x86/kernel/pmtimer_64.c69
-rw-r--r--arch/x86/kernel/probe_roms.c (renamed from arch/x86/kernel/probe_roms_32.c)122
-rw-r--r--arch/x86/kernel/process.c1248
-rw-r--r--arch/x86/kernel/process.h39
-rw-r--r--arch/x86/kernel/process_32.c399
-rw-r--r--arch/x86/kernel/process_64.c1224
-rw-r--r--arch/x86/kernel/ptrace.c1419
-rw-r--r--arch/x86/kernel/pvclock.c213
-rw-r--r--arch/x86/kernel/quirks.c185
-rw-r--r--arch/x86/kernel/reboot.c1054
-rw-r--r--arch/x86/kernel/reboot_fixups_32.c22
-rw-r--r--arch/x86/kernel/relocate_kernel_32.S52
-rw-r--r--arch/x86/kernel/relocate_kernel_64.S615
-rw-r--r--arch/x86/kernel/resource.c72
-rw-r--r--arch/x86/kernel/rethook.c127
-rw-r--r--arch/x86/kernel/rtc.c177
-rw-r--r--arch/x86/kernel/setup.c1549
-rw-r--r--arch/x86/kernel/setup_percpu.c505
-rw-r--r--arch/x86/kernel/sev_verify_cbit.S89
-rw-r--r--arch/x86/kernel/shstk.c635
-rw-r--r--arch/x86/kernel/signal.c997
-rw-r--r--arch/x86/kernel/signal_32.c535
-rw-r--r--arch/x86/kernel/signal_64.c526
-rw-r--r--arch/x86/kernel/smp.c243
-rw-r--r--arch/x86/kernel/smpboot.c1848
-rw-r--r--arch/x86/kernel/stacktrace.c168
-rw-r--r--arch/x86/kernel/static_call.c231
-rw-r--r--arch/x86/kernel/step.c99
-rw-r--r--arch/x86/kernel/sys_i386_32.c248
-rw-r--r--arch/x86/kernel/sys_ia32.c256
-rw-r--r--arch/x86/kernel/sys_x86_64.c288
-rw-r--r--arch/x86/kernel/syscall_64.c29
-rw-r--r--arch/x86/kernel/syscall_table_32.S338
-rw-r--r--arch/x86/kernel/tboot.c516
-rw-r--r--arch/x86/kernel/tce_64.c189
-rw-r--r--arch/x86/kernel/test_nx.c175
-rw-r--r--arch/x86/kernel/test_rodata.c86
-rw-r--r--arch/x86/kernel/time.c111
-rw-r--r--arch/x86/kernel/time_32.c137
-rw-r--r--arch/x86/kernel/time_64.c135
-rw-r--r--arch/x86/kernel/tlb_uv.c868
-rw-r--r--arch/x86/kernel/tls.c162
-rw-r--r--arch/x86/kernel/tls.h7
-rw-r--r--arch/x86/kernel/topology.c82
-rw-r--r--arch/x86/kernel/trace.c234
-rw-r--r--arch/x86/kernel/trace_clock.c17
-rw-r--r--arch/x86/kernel/trampoline.c33
-rw-r--r--arch/x86/kernel/trampoline_32.S80
-rw-r--r--arch/x86/kernel/trampoline_64.S162
-rw-r--r--arch/x86/kernel/traps.c2086
-rw-r--r--arch/x86/kernel/tsc.c1216
-rw-r--r--arch/x86/kernel/tsc_msr.c236
-rw-r--r--arch/x86/kernel/tsc_sync.c446
-rw-r--r--arch/x86/kernel/umip.c422
-rw-r--r--arch/x86/kernel/unwind_frame.c419
-rw-r--r--arch/x86/kernel/unwind_guess.c72
-rw-r--r--arch/x86/kernel/unwind_orc.c767
-rw-r--r--arch/x86/kernel/uprobes.c1825
-rw-r--r--arch/x86/kernel/uv_irq.c79
-rw-r--r--arch/x86/kernel/uv_sysfs.c76
-rw-r--r--arch/x86/kernel/verify_cpu.S144
-rw-r--r--arch/x86/kernel/verify_cpu_64.S105
-rw-r--r--arch/x86/kernel/visws_quirks.c687
-rw-r--r--arch/x86/kernel/vm86_32.c496
-rw-r--r--arch/x86/kernel/vmcore_info_32.c17
-rw-r--r--arch/x86/kernel/vmcore_info_64.c24
-rw-r--r--arch/x86/kernel/vmi_32.c913
-rw-r--r--arch/x86/kernel/vmiclock_32.c321
-rw-r--r--arch/x86/kernel/vmlinux.lds.S628
-rw-r--r--arch/x86/kernel/vsmp_64.c148
-rw-r--r--arch/x86/kernel/vsyscall_64.c314
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c60
-rw-r--r--arch/x86/kernel/x86_init.c177
-rw-r--r--arch/x86/kernel/xsave.c347
-rw-r--r--arch/x86/kvm/.gitignore2
-rw-r--r--arch/x86/kvm/Kconfig234
-rw-r--r--arch/x86/kvm/Makefile65
-rw-r--r--arch/x86/kvm/cpuid.c2107
-rw-r--r--arch/x86/kvm/cpuid.h293
-rw-r--r--arch/x86/kvm/debugfs.c196
-rw-r--r--arch/x86/kvm/emulate.c5645
-rw-r--r--arch/x86/kvm/fpu.h206
-rw-r--r--arch/x86/kvm/hyperv.c2925
-rw-r--r--arch/x86/kvm/hyperv.h327
-rw-r--r--arch/x86/kvm/i8254.c658
-rw-r--r--arch/x86/kvm/i8254.h39
-rw-r--r--arch/x86/kvm/i8259.c398
-rw-r--r--arch/x86/kvm/ioapic.c789
-rw-r--r--arch/x86/kvm/ioapic.h141
-rw-r--r--arch/x86/kvm/irq.c628
-rw-r--r--arch/x86/kvm/irq.h80
-rw-r--r--arch/x86/kvm/kvm-asm-offsets.c29
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h239
-rw-r--r--arch/x86/kvm/kvm_emulate.h575
-rw-r--r--arch/x86/kvm/kvm_onhyperv.c124
-rw-r--r--arch/x86/kvm/kvm_onhyperv.h44
-rw-r--r--arch/x86/kvm/kvm_svm.h51
-rw-r--r--arch/x86/kvm/kvm_timer.h18
-rw-r--r--arch/x86/kvm/lapic.c3378
-rw-r--r--arch/x86/kvm/lapic.h253
-rw-r--r--arch/x86/kvm/mmu.c3205
-rw-r--r--arch/x86/kvm/mmu.h298
-rw-r--r--arch/x86/kvm/mmu/mmu.c8095
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h414
-rw-r--r--arch/x86/kvm/mmu/mmutrace.h455
-rw-r--r--arch/x86/kvm/mmu/page_track.c374
-rw-r--r--arch/x86/kvm/mmu/page_track.h58
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h983
-rw-r--r--arch/x86/kvm/mmu/spte.c576
-rw-r--r--arch/x86/kvm/mmu/spte.h572
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.c179
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.h143
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c1992
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.h122
-rw-r--r--arch/x86/kvm/mtrr.c133
-rw-r--r--arch/x86/kvm/paging_tmpl.h611
-rw-r--r--arch/x86/kvm/pmu.c1150
-rw-r--r--arch/x86/kvm/pmu.h235
-rw-r--r--arch/x86/kvm/reverse_cpuid.h243
-rw-r--r--arch/x86/kvm/smm.c663
-rw-r--r--arch/x86/kvm/smm.h171
-rw-r--r--arch/x86/kvm/svm.c2750
-rw-r--r--arch/x86/kvm/svm/avic.c1305
-rw-r--r--arch/x86/kvm/svm/hyperv.c18
-rw-r--r--arch/x86/kvm/svm/hyperv.h54
-rw-r--r--arch/x86/kvm/svm/nested.c1928
-rw-r--r--arch/x86/kvm/svm/pmu.c242
-rw-r--r--arch/x86/kvm/svm/sev.c5169
-rw-r--r--arch/x86/kvm/svm/svm.c5510
-rw-r--r--arch/x86/kvm/svm/svm.h947
-rw-r--r--arch/x86/kvm/svm/svm_onhyperv.c63
-rw-r--r--arch/x86/kvm/svm/svm_onhyperv.h87
-rw-r--r--arch/x86/kvm/svm/svm_ops.h64
-rw-r--r--arch/x86/kvm/svm/vmenter.S404
-rw-r--r--arch/x86/kvm/timer.c46
-rw-r--r--arch/x86/kvm/trace.h1976
-rw-r--r--arch/x86/kvm/tss.h1
-rw-r--r--arch/x86/kvm/vmx.c3853
-rw-r--r--arch/x86/kvm/vmx/capabilities.h407
-rw-r--r--arch/x86/kvm/vmx/common.h180
-rw-r--r--arch/x86/kvm/vmx/hyperv.c230
-rw-r--r--arch/x86/kvm/vmx/hyperv.h90
-rw-r--r--arch/x86/kvm/vmx/hyperv_evmcs.c315
-rw-r--r--arch/x86/kvm/vmx/hyperv_evmcs.h166
-rw-r--r--arch/x86/kvm/vmx/main.c1082
-rw-r--r--arch/x86/kvm/vmx/nested.c7459
-rw-r--r--arch/x86/kvm/vmx/nested.h323
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c784
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.h28
-rw-r--r--arch/x86/kvm/vmx/posted_intr.c319
-rw-r--r--arch/x86/kvm/vmx/posted_intr.h30
-rw-r--r--arch/x86/kvm/vmx/run_flags.h9
-rw-r--r--arch/x86/kvm/vmx/sgx.c510
-rw-r--r--arch/x86/kvm/vmx/sgx.h34
-rw-r--r--arch/x86/kvm/vmx/tdx.c3613
-rw-r--r--arch/x86/kvm/vmx/tdx.h208
-rw-r--r--arch/x86/kvm/vmx/tdx_arch.h167
-rw-r--r--arch/x86/kvm/vmx/tdx_errno.h40
-rw-r--r--arch/x86/kvm/vmx/vmcs.h198
-rw-r--r--arch/x86/kvm/vmx/vmcs12.c161
-rw-r--r--arch/x86/kvm/vmx/vmcs12.h443
-rw-r--r--arch/x86/kvm/vmx/vmcs_shadow_fields.h79
-rw-r--r--arch/x86/kvm/vmx/vmenter.S387
-rw-r--r--arch/x86/kvm/vmx/vmx.c8774
-rw-r--r--arch/x86/kvm/vmx/vmx.h743
-rw-r--r--arch/x86/kvm/vmx/vmx_onhyperv.c36
-rw-r--r--arch/x86/kvm/vmx/vmx_onhyperv.h133
-rw-r--r--arch/x86/kvm/vmx/vmx_ops.h371
-rw-r--r--arch/x86/kvm/vmx/x86_ops.h159
-rw-r--r--arch/x86/kvm/x86.c15548
-rw-r--r--arch/x86/kvm/x86.h724
-rw-r--r--arch/x86/kvm/x86_emulate.c2141
-rw-r--r--arch/x86/kvm/xen.c2349
-rw-r--r--arch/x86/kvm/xen.h264
-rw-r--r--arch/x86/lguest/Kconfig14
-rw-r--r--arch/x86/lguest/Makefile2
-rw-r--r--arch/x86/lguest/boot.c1429
-rw-r--r--arch/x86/lguest/i386_head.S191
-rw-r--r--arch/x86/lib/.gitignore6
-rw-r--r--arch/x86/lib/Makefile59
-rw-r--r--arch/x86/lib/atomic64_32.c232
-rw-r--r--arch/x86/lib/atomic64_386_32.S195
-rw-r--r--arch/x86/lib/atomic64_cx8_32.S185
-rw-r--r--arch/x86/lib/bhi.S147
-rw-r--r--arch/x86/lib/cache-smp.c45
-rw-r--r--arch/x86/lib/checksum_32.S230
-rw-r--r--arch/x86/lib/clear_page_64.S154
-rw-r--r--arch/x86/lib/cmdline.c235
-rw-r--r--arch/x86/lib/cmpxchg16b_emu.S54
-rw-r--r--arch/x86/lib/cmpxchg8b_emu.S97
-rw-r--r--arch/x86/lib/copy_mc.c105
-rw-r--r--arch/x86/lib/copy_mc_64.S149
-rw-r--r--arch/x86/lib/copy_page_64.S177
-rw-r--r--arch/x86/lib/copy_user_64.S345
-rw-r--r--arch/x86/lib/copy_user_nocache_64.S137
-rw-r--r--arch/x86/lib/copy_user_uncached_64.S244
-rw-r--r--arch/x86/lib/cpu.c38
-rw-r--r--arch/x86/lib/csum-copy_64.S341
-rw-r--r--arch/x86/lib/csum-partial_64.c199
-rw-r--r--arch/x86/lib/csum-wrappers_64.c106
-rw-r--r--arch/x86/lib/delay.c143
-rw-r--r--arch/x86/lib/error-inject.c25
-rw-r--r--arch/x86/lib/getuser.S188
-rw-r--r--arch/x86/lib/hweight.S78
-rw-r--r--arch/x86/lib/inat.c96
-rw-r--r--arch/x86/lib/insn-eval.c1821
-rw-r--r--arch/x86/lib/insn.c792
-rw-r--r--arch/x86/lib/io_64.c25
-rw-r--r--arch/x86/lib/iomap_copy_64.S30
-rw-r--r--arch/x86/lib/iomem.c126
-rw-r--r--arch/x86/lib/kaslr.c98
-rw-r--r--arch/x86/lib/memcpy_32.c34
-rw-r--r--arch/x86/lib/memcpy_64.S247
-rw-r--r--arch/x86/lib/memmove_32.S200
-rw-r--r--arch/x86/lib/memmove_64.S217
-rw-r--r--arch/x86/lib/memmove_64.c21
-rw-r--r--arch/x86/lib/memset_64.S110
-rw-r--r--arch/x86/lib/misc.c24
-rw-r--r--arch/x86/lib/mmx_32.c377
-rw-r--r--arch/x86/lib/msr-reg-export.c6
-rw-r--r--arch/x86/lib/msr-reg.S94
-rw-r--r--arch/x86/lib/msr-smp.c277
-rw-r--r--arch/x86/lib/msr.c234
-rw-r--r--arch/x86/lib/pc-conf-reg.c13
-rw-r--r--arch/x86/lib/putuser.S173
-rw-r--r--arch/x86/lib/retpoline.S454
-rw-r--r--arch/x86/lib/rwlock_64.S38
-rw-r--r--arch/x86/lib/semaphore_32.S136
-rw-r--r--arch/x86/lib/string_32.c27
-rw-r--r--arch/x86/lib/strstr_32.c10
-rw-r--r--arch/x86/lib/thunk_32.S47
-rw-r--r--arch/x86/lib/thunk_64.S81
-rw-r--r--arch/x86/lib/usercopy.c55
-rw-r--r--arch/x86/lib/usercopy_32.c725
-rw-r--r--arch/x86/lib/usercopy_64.c267
-rw-r--r--arch/x86/lib/x86-opcode-map.txt1431
-rw-r--r--arch/x86/math-emu/Makefile5
-rw-r--r--arch/x86/math-emu/control_w.h3
-rw-r--r--arch/x86/math-emu/div_Xsig.S6
-rw-r--r--arch/x86/math-emu/div_small.S7
-rw-r--r--arch/x86/math-emu/errors.c29
-rw-r--r--arch/x86/math-emu/exception.h7
-rw-r--r--arch/x86/math-emu/fpu_arith.c1
-rw-r--r--arch/x86/math-emu/fpu_asm.h1
-rw-r--r--arch/x86/math-emu/fpu_aux.c77
-rw-r--r--arch/x86/math-emu/fpu_emu.h13
-rw-r--r--arch/x86/math-emu/fpu_entry.c155
-rw-r--r--arch/x86/math-emu/fpu_etc.c10
-rw-r--r--arch/x86/math-emu/fpu_proto.h15
-rw-r--r--arch/x86/math-emu/fpu_system.h84
-rw-r--r--arch/x86/math-emu/fpu_tags.c1
-rw-r--r--arch/x86/math-emu/fpu_trig.c20
-rw-r--r--arch/x86/math-emu/get_address.c26
-rw-r--r--arch/x86/math-emu/load_store.c74
-rw-r--r--arch/x86/math-emu/mul_Xsig.S17
-rw-r--r--arch/x86/math-emu/poly.h3
-rw-r--r--arch/x86/math-emu/poly_2xm1.c1
-rw-r--r--arch/x86/math-emu/poly_atan.c1
-rw-r--r--arch/x86/math-emu/poly_l2.c1
-rw-r--r--arch/x86/math-emu/poly_sin.c1
-rw-r--r--arch/x86/math-emu/poly_tan.c1
-rw-r--r--arch/x86/math-emu/polynom_Xsig.S6
-rw-r--r--arch/x86/math-emu/reg_add_sub.c1
-rw-r--r--arch/x86/math-emu/reg_compare.c141
-rw-r--r--arch/x86/math-emu/reg_constant.c10
-rw-r--r--arch/x86/math-emu/reg_constant.h1
-rw-r--r--arch/x86/math-emu/reg_convert.c1
-rw-r--r--arch/x86/math-emu/reg_divide.c1
-rw-r--r--arch/x86/math-emu/reg_ld_str.c61
-rw-r--r--arch/x86/math-emu/reg_mul.c1
-rw-r--r--arch/x86/math-emu/reg_norm.S13
-rw-r--r--arch/x86/math-emu/reg_round.S9
-rw-r--r--arch/x86/math-emu/reg_u_add.S6
-rw-r--r--arch/x86/math-emu/reg_u_div.S7
-rw-r--r--arch/x86/math-emu/reg_u_mul.S6
-rw-r--r--arch/x86/math-emu/reg_u_sub.S6
-rw-r--r--arch/x86/math-emu/round_Xsig.S13
-rw-r--r--arch/x86/math-emu/shr_Xsig.S12
-rw-r--r--arch/x86/math-emu/status_w.h7
-rw-r--r--arch/x86/math-emu/wm_shrx.S23
-rw-r--r--arch/x86/math-emu/wm_sqrt.S6
-rw-r--r--arch/x86/mm/Makefile56
-rw-r--r--arch/x86/mm/amdtopology.c174
-rw-r--r--arch/x86/mm/cpu_entry_area.c279
-rw-r--r--arch/x86/mm/debug_pagetables.c75
-rw-r--r--arch/x86/mm/dump_pagetables.c612
-rw-r--r--arch/x86/mm/extable.c459
-rw-r--r--arch/x86/mm/fault.c1489
-rw-r--r--arch/x86/mm/gup.c375
-rw-r--r--arch/x86/mm/highmem_32.c130
-rw-r--r--arch/x86/mm/hugetlbpage.c438
-rw-r--r--arch/x86/mm/ident_map.c225
-rw-r--r--arch/x86/mm/init.c1196
-rw-r--r--arch/x86/mm/init_32.c654
-rw-r--r--arch/x86/mm/init_64.c1637
-rw-r--r--arch/x86/mm/iomap_32.c92
-rw-r--r--arch/x86/mm/ioremap.c1069
-rw-r--r--arch/x86/mm/k8topology_64.c222
-rw-r--r--arch/x86/mm/kasan_init_64.c455
-rw-r--r--arch/x86/mm/kaslr.c211
-rw-r--r--arch/x86/mm/kmemcheck/Makefile1
-rw-r--r--arch/x86/mm/kmemcheck/error.c228
-rw-r--r--arch/x86/mm/kmemcheck/error.h15
-rw-r--r--arch/x86/mm/kmemcheck/kmemcheck.c640
-rw-r--r--arch/x86/mm/kmemcheck/opcode.c106
-rw-r--r--arch/x86/mm/kmemcheck/opcode.h9
-rw-r--r--arch/x86/mm/kmemcheck/pte.c22
-rw-r--r--arch/x86/mm/kmemcheck/pte.h10
-rw-r--r--arch/x86/mm/kmemcheck/selftest.c69
-rw-r--r--arch/x86/mm/kmemcheck/selftest.h6
-rw-r--r--arch/x86/mm/kmemcheck/shadow.c162
-rw-r--r--arch/x86/mm/kmemcheck/shadow.h16
-rw-r--r--arch/x86/mm/kmmio.c250
-rw-r--r--arch/x86/mm/kmsan_shadow.c20
-rw-r--r--arch/x86/mm/maccess.c43
-rw-r--r--arch/x86/mm/mem_encrypt.c140
-rw-r--r--arch/x86/mm/mem_encrypt_amd.c569
-rw-r--r--arch/x86/mm/mem_encrypt_boot.S163
-rw-r--r--arch/x86/mm/memtest.c130
-rw-r--r--arch/x86/mm/mm_internal.h32
-rw-r--r--arch/x86/mm/mmap.c265
-rw-r--r--arch/x86/mm/mmio-mod.c118
-rw-r--r--arch/x86/mm/numa.c431
-rw-r--r--arch/x86/mm/numa_32.c454
-rw-r--r--arch/x86/mm/numa_64.c754
-rw-r--r--arch/x86/mm/pageattr.c1312
-rw-r--r--arch/x86/mm/pat.c857
-rw-r--r--arch/x86/mm/pat/Makefile5
-rw-r--r--arch/x86/mm/pat/cpa-test.c (renamed from arch/x86/mm/pageattr-test.c)49
-rw-r--r--arch/x86/mm/pat/memtype.c1049
-rw-r--r--arch/x86/mm/pat/memtype.h49
-rw-r--r--arch/x86/mm/pat/memtype_interval.c149
-rw-r--r--arch/x86/mm/pat/set_memory.c2777
-rw-r--r--arch/x86/mm/pf_in.c64
-rw-r--r--arch/x86/mm/pf_in.h17
-rw-r--r--arch/x86/mm/pgprot.c63
-rw-r--r--arch/x86/mm/pgtable.c719
-rw-r--r--arch/x86/mm/pgtable_32.c60
-rw-r--r--arch/x86/mm/physaddr.c89
-rw-r--r--arch/x86/mm/physaddr.h11
-rw-r--r--arch/x86/mm/pkeys.c197
-rw-r--r--arch/x86/mm/pti.c686
-rw-r--r--arch/x86/mm/srat.c111
-rw-r--r--arch/x86/mm/srat_32.c283
-rw-r--r--arch/x86/mm/srat_64.c496
-rw-r--r--arch/x86/mm/testmmiotrace.c62
-rw-r--r--arch/x86/mm/tlb.c1916
-rw-r--r--arch/x86/net/Makefile10
-rw-r--r--arch/x86/net/bpf_jit_comp.c4069
-rw-r--r--arch/x86/net/bpf_jit_comp32.c2623
-rw-r--r--arch/x86/net/bpf_timed_may_goto.S55
-rw-r--r--arch/x86/oprofile/Makefile12
-rw-r--r--arch/x86/oprofile/backtrace.c90
-rw-r--r--arch/x86/oprofile/init.c49
-rw-r--r--arch/x86/oprofile/nmi_int.c553
-rw-r--r--arch/x86/oprofile/nmi_timer_int.c69
-rw-r--r--arch/x86/oprofile/op_counter.h29
-rw-r--r--arch/x86/oprofile/op_model_amd.c504
-rw-r--r--arch/x86/oprofile/op_model_p4.c721
-rw-r--r--arch/x86/oprofile/op_model_ppro.c275
-rw-r--r--arch/x86/oprofile/op_x86_model.h56
-rw-r--r--arch/x86/pci/Makefile14
-rw-r--r--arch/x86/pci/acpi.c738
-rw-r--r--arch/x86/pci/amd_bus.c568
-rw-r--r--arch/x86/pci/broadcom_bus.c112
-rw-r--r--arch/x86/pci/bus_numa.c146
-rw-r--r--arch/x86/pci/bus_numa.h27
-rw-r--r--arch/x86/pci/ce4100.c318
-rw-r--r--arch/x86/pci/common.c316
-rw-r--r--arch/x86/pci/direct.c48
-rw-r--r--arch/x86/pci/early.c61
-rw-r--r--arch/x86/pci/fixup.c674
-rw-r--r--arch/x86/pci/i386.c394
-rw-r--r--arch/x86/pci/init.c29
-rw-r--r--arch/x86/pci/intel_mid.c406
-rw-r--r--arch/x86/pci/irq.c753
-rw-r--r--arch/x86/pci/legacy.c87
-rw-r--r--arch/x86/pci/mmconfig-shared.c775
-rw-r--r--arch/x86/pci/mmconfig_32.c62
-rw-r--r--arch/x86/pci/mmconfig_64.c129
-rw-r--r--arch/x86/pci/numachip.c127
-rw-r--r--arch/x86/pci/numaq_32.c175
-rw-r--r--arch/x86/pci/olpc.c20
-rw-r--r--arch/x86/pci/pcbios.c192
-rw-r--r--arch/x86/pci/visws.c94
-rw-r--r--arch/x86/pci/xen.c586
-rw-r--r--arch/x86/platform/Makefile14
-rw-r--r--arch/x86/platform/atom/Makefile2
-rw-r--r--arch/x86/platform/atom/punit_atom_debug.c207
-rw-r--r--arch/x86/platform/ce4100/Makefile2
-rw-r--r--arch/x86/platform/ce4100/ce4100.c60
-rw-r--r--arch/x86/platform/ce4100/falconfalls.dts430
-rw-r--r--arch/x86/platform/efi/Makefile8
-rw-r--r--arch/x86/platform/efi/efi.c927
-rw-r--r--arch/x86/platform/efi/efi_32.c154
-rw-r--r--arch/x86/platform/efi/efi_64.c840
-rw-r--r--arch/x86/platform/efi/efi_stub_32.S60
-rw-r--r--arch/x86/platform/efi/efi_stub_64.S31
-rw-r--r--arch/x86/platform/efi/efi_thunk_64.S98
-rw-r--r--arch/x86/platform/efi/memmap.c250
-rw-r--r--arch/x86/platform/efi/quirks.c785
-rw-r--r--arch/x86/platform/efi/runtime-map.c194
-rw-r--r--arch/x86/platform/geode/Makefile5
-rw-r--r--arch/x86/platform/geode/alix.c136
-rw-r--r--arch/x86/platform/geode/geode-common.c178
-rw-r--r--arch/x86/platform/geode/geode-common.h21
-rw-r--r--arch/x86/platform/geode/geos.c59
-rw-r--r--arch/x86/platform/geode/net5501.c97
-rw-r--r--arch/x86/platform/intel-mid/Makefile2
-rw-r--r--arch/x86/platform/intel-mid/intel-mid.c126
-rw-r--r--arch/x86/platform/intel-mid/pwr.c485
-rw-r--r--arch/x86/platform/intel-quark/Makefile3
-rw-r--r--arch/x86/platform/intel-quark/imr.c597
-rw-r--r--arch/x86/platform/intel-quark/imr_selftest.c129
-rw-r--r--arch/x86/platform/intel/Makefile2
-rw-r--r--arch/x86/platform/intel/iosf_mbi.c558
-rw-r--r--arch/x86/platform/iris/Makefile2
-rw-r--r--arch/x86/platform/iris/iris.c121
-rw-r--r--arch/x86/platform/olpc/Makefile6
-rw-r--r--arch/x86/platform/olpc/olpc-xo1-pm.c187
-rw-r--r--arch/x86/platform/olpc/olpc-xo1-rtc.c80
-rw-r--r--arch/x86/platform/olpc/olpc-xo1-sci.c627
-rw-r--r--arch/x86/platform/olpc/olpc-xo15-sci.c230
-rw-r--r--arch/x86/platform/olpc/olpc.c (renamed from arch/x86/kernel/olpc.c)209
-rw-r--r--arch/x86/platform/olpc/olpc_dt.c321
-rw-r--r--arch/x86/platform/olpc/olpc_ofw.c121
-rw-r--r--arch/x86/platform/olpc/xo1-wakeup.S126
-rw-r--r--arch/x86/platform/pvh/Makefile6
-rw-r--r--arch/x86/platform/pvh/enlighten.c142
-rw-r--r--arch/x86/platform/pvh/head.S311
-rw-r--r--arch/x86/platform/scx200/Makefile3
-rw-r--r--arch/x86/platform/scx200/scx200_32.c (renamed from arch/x86/kernel/scx200_32.c)31
-rw-r--r--arch/x86/platform/ts5500/Makefile2
-rw-r--r--arch/x86/platform/ts5500/ts5500.c341
-rw-r--r--arch/x86/platform/uv/Makefile2
-rw-r--r--arch/x86/platform/uv/bios_uv.c269
-rw-r--r--arch/x86/platform/uv/uv_irq.c213
-rw-r--r--arch/x86/platform/uv/uv_nmi.c1102
-rw-r--r--arch/x86/platform/uv/uv_time.c (renamed from arch/x86/kernel/uv_time.c)186
-rw-r--r--arch/x86/power/Makefile11
-rw-r--r--arch/x86/power/cpu.c532
-rw-r--r--arch/x86/power/hibernate.c216
-rw-r--r--arch/x86/power/hibernate_32.c78
-rw-r--r--arch/x86/power/hibernate_64.c226
-rw-r--r--arch/x86/power/hibernate_asm_32.S70
-rw-r--r--arch/x86/power/hibernate_asm_64.S168
-rw-r--r--arch/x86/purgatory/.gitignore1
-rw-r--r--arch/x86/purgatory/Makefile83
-rw-r--r--arch/x86/purgatory/entry64.S103
-rw-r--r--arch/x86/purgatory/kexec-purgatory.S14
-rw-r--r--arch/x86/purgatory/purgatory.c60
-rw-r--r--arch/x86/purgatory/setup-x86_64.S59
-rw-r--r--arch/x86/purgatory/stack.S18
-rw-r--r--arch/x86/ras/Kconfig23
-rw-r--r--arch/x86/realmode/Makefile22
-rw-r--r--arch/x86/realmode/init.c223
-rw-r--r--arch/x86/realmode/rm/.gitignore4
-rw-r--r--arch/x86/realmode/rm/Makefile69
-rw-r--r--arch/x86/realmode/rm/bioscall.S1
-rw-r--r--arch/x86/realmode/rm/copy.S1
-rw-r--r--arch/x86/realmode/rm/header.S45
-rw-r--r--arch/x86/realmode/rm/realmode.h22
-rw-r--r--arch/x86/realmode/rm/realmode.lds.S77
-rw-r--r--arch/x86/realmode/rm/reboot.S158
-rw-r--r--arch/x86/realmode/rm/regs.c1
-rw-r--r--arch/x86/realmode/rm/stack.S18
-rw-r--r--arch/x86/realmode/rm/trampoline_32.S73
-rw-r--r--arch/x86/realmode/rm/trampoline_64.S294
-rw-r--r--arch/x86/realmode/rm/trampoline_common.S14
-rw-r--r--arch/x86/realmode/rm/video-bios.c1
-rw-r--r--arch/x86/realmode/rm/video-mode.c1
-rw-r--r--arch/x86/realmode/rm/video-vesa.c1
-rw-r--r--arch/x86/realmode/rm/video-vga.c1
-rw-r--r--arch/x86/realmode/rm/wakemain.c (renamed from arch/x86/kernel/acpi/realmode/wakemain.c)8
-rw-r--r--arch/x86/realmode/rm/wakeup.h (renamed from arch/x86/kernel/acpi/realmode/wakeup.h)24
-rw-r--r--arch/x86/realmode/rm/wakeup_asm.S179
-rw-r--r--arch/x86/realmode/rmpiggy.S19
-rw-r--r--arch/x86/tools/.gitignore2
-rw-r--r--arch/x86/tools/Makefile46
-rwxr-xr-xarch/x86/tools/cpufeaturemasks.awk88
-rw-r--r--arch/x86/tools/gen-insn-attr-x86.awk505
-rw-r--r--arch/x86/tools/insn_decoder_test.c173
-rw-r--r--arch/x86/tools/insn_sanity.c265
-rw-r--r--arch/x86/tools/objdump_reformat.awk48
-rw-r--r--arch/x86/tools/relocs.c1079
-rw-r--r--arch/x86/tools/relocs.h39
-rw-r--r--arch/x86/tools/relocs_32.c18
-rw-r--r--arch/x86/tools/relocs_64.c18
-rw-r--r--arch/x86/tools/relocs_common.c85
-rw-r--r--arch/x86/um/Kconfig38
-rw-r--r--arch/x86/um/Makefile50
-rw-r--r--arch/x86/um/asm/apic.h4
-rw-r--r--arch/x86/um/asm/arch_hweight.h7
-rw-r--r--arch/x86/um/asm/barrier.h29
-rw-r--r--arch/x86/um/asm/checksum.h122
-rw-r--r--arch/x86/um/asm/checksum_32.h38
-rw-r--r--arch/x86/um/asm/checksum_64.h19
-rw-r--r--arch/x86/um/asm/desc.h17
-rw-r--r--arch/x86/um/asm/elf.h186
-rw-r--r--arch/x86/um/asm/irq_vectors.h10
-rw-r--r--arch/x86/um/asm/processor.h43
-rw-r--r--arch/x86/um/asm/processor_32.h53
-rw-r--r--arch/x86/um/asm/processor_64.h34
-rw-r--r--arch/x86/um/asm/ptrace.h103
-rw-r--r--arch/x86/um/asm/required-features.h9
-rw-r--r--arch/x86/um/asm/segment.h11
-rw-r--r--arch/x86/um/asm/spinlock.h8
-rw-r--r--arch/x86/um/asm/syscall.h23
-rw-r--r--arch/x86/um/asm/vm-flags.h19
-rw-r--r--arch/x86/um/bugs_32.c75
-rw-r--r--arch/x86/um/bugs_64.c16
-rw-r--r--arch/x86/um/delay.c57
-rw-r--r--arch/x86/um/fault.c29
-rw-r--r--arch/x86/um/mem_64.c11
-rw-r--r--arch/x86/um/os-Linux/Makefile12
-rw-r--r--arch/x86/um/os-Linux/mcontext.c260
-rw-r--r--arch/x86/um/os-Linux/registers.c109
-rw-r--r--arch/x86/um/os-Linux/tls.c69
-rw-r--r--arch/x86/um/ptrace.c305
-rw-r--r--arch/x86/um/ptrace_32.c202
-rw-r--r--arch/x86/um/ptrace_64.c216
-rw-r--r--arch/x86/um/ptrace_user.c21
-rw-r--r--arch/x86/um/setjmp_32.S59
-rw-r--r--arch/x86/um/setjmp_64.S55
-rw-r--r--arch/x86/um/shared/sysdep/archsetjmp.h13
-rw-r--r--arch/x86/um/shared/sysdep/archsetjmp_32.h23
-rw-r--r--arch/x86/um/shared/sysdep/archsetjmp_64.h25
-rw-r--r--arch/x86/um/shared/sysdep/faultinfo.h6
-rw-r--r--arch/x86/um/shared/sysdep/faultinfo_32.h44
-rw-r--r--arch/x86/um/shared/sysdep/faultinfo_64.h44
-rw-r--r--arch/x86/um/shared/sysdep/mcontext.h40
-rw-r--r--arch/x86/um/shared/sysdep/ptrace.h67
-rw-r--r--arch/x86/um/shared/sysdep/ptrace_32.h16
-rw-r--r--arch/x86/um/shared/sysdep/ptrace_64.h58
-rw-r--r--arch/x86/um/shared/sysdep/ptrace_user.h20
-rw-r--r--arch/x86/um/shared/sysdep/stub-data.h23
-rw-r--r--arch/x86/um/shared/sysdep/stub.h17
-rw-r--r--arch/x86/um/shared/sysdep/stub_32.h147
-rw-r--r--arch/x86/um/shared/sysdep/stub_64.h154
-rw-r--r--arch/x86/um/shared/sysdep/tls.h40
-rw-r--r--arch/x86/um/signal.c470
-rw-r--r--arch/x86/um/stub_segv.c20
-rw-r--r--arch/x86/um/sys_call_table_32.c40
-rw-r--r--arch/x86/um/sys_call_table_64.c36
-rw-r--r--arch/x86/um/syscalls_32.c8
-rw-r--r--arch/x86/um/syscalls_64.c64
-rw-r--r--arch/x86/um/sysrq_32.c33
-rw-r--r--arch/x86/um/sysrq_64.c36
-rw-r--r--arch/x86/um/tls_32.c386
-rw-r--r--arch/x86/um/tls_64.c18
-rw-r--r--arch/x86/um/user-offsets.c79
-rw-r--r--arch/x86/um/vdso/.gitignore2
-rw-r--r--arch/x86/um/vdso/Makefile58
-rw-r--r--arch/x86/um/vdso/um_vdso.c54
-rw-r--r--arch/x86/um/vdso/vdso-layout.lds.S (renamed from arch/x86/vdso/vdso-layout.lds.S)1
-rw-r--r--arch/x86/um/vdso/vdso-note.S (renamed from arch/x86/vdso/vdso-note.S)0
-rw-r--r--arch/x86/um/vdso/vdso.S11
-rw-r--r--arch/x86/um/vdso/vdso.lds.S (renamed from arch/x86/vdso/vdso.lds.S)12
-rw-r--r--arch/x86/um/vdso/vma.c55
-rw-r--r--arch/x86/vdso/Makefile140
-rw-r--r--arch/x86/vdso/vclock_gettime.c126
-rw-r--r--arch/x86/vdso/vdso.S10
-rw-r--r--arch/x86/vdso/vdso32-setup.c443
-rw-r--r--arch/x86/vdso/vdso32.S22
-rw-r--r--arch/x86/vdso/vdso32/.gitignore1
-rw-r--r--arch/x86/vdso/vdso32/int80.S56
-rw-r--r--arch/x86/vdso/vdso32/note.S44
-rw-r--r--arch/x86/vdso/vdso32/syscall.S77
-rw-r--r--arch/x86/vdso/vdso32/sysenter.S116
-rw-r--r--arch/x86/vdso/vextern.h16
-rw-r--r--arch/x86/vdso/vgetcpu.c36
-rw-r--r--arch/x86/vdso/vma.c141
-rw-r--r--arch/x86/vdso/vvar.c12
-rw-r--r--arch/x86/video/Makefile4
-rw-r--r--arch/x86/video/fbdev.c31
-rw-r--r--arch/x86/video/video-common.c64
-rw-r--r--arch/x86/virt/Makefile2
-rw-r--r--arch/x86/virt/svm/Makefile4
-rw-r--r--arch/x86/virt/svm/cmdline.c45
-rw-r--r--arch/x86/virt/svm/sev.c1073
-rw-r--r--arch/x86/virt/vmx/Makefile2
-rw-r--r--arch/x86/virt/vmx/tdx/Makefile2
-rw-r--r--arch/x86/virt/vmx/tdx/seamcall.S64
-rw-r--r--arch/x86/virt/vmx/tdx/tdx.c1889
-rw-r--r--arch/x86/virt/vmx/tdx/tdx.h121
-rw-r--r--arch/x86/virt/vmx/tdx/tdx_global_metadata.c98
-rw-r--r--arch/x86/virt/vmx/tdx/tdxcall.S220
-rw-r--r--arch/x86/xen/Kconfig98
-rw-r--r--arch/x86/xen/Makefile42
-rw-r--r--arch/x86/xen/apic.c148
-rw-r--r--arch/x86/xen/debugfs.c115
-rw-r--r--arch/x86/xen/debugfs.h10
-rw-r--r--arch/x86/xen/efi.c151
-rw-r--r--arch/x86/xen/enlighten.c1250
-rw-r--r--arch/x86/xen/enlighten_hvm.c334
-rw-r--r--arch/x86/xen/enlighten_pv.c1640
-rw-r--r--arch/x86/xen/enlighten_pvh.c185
-rw-r--r--arch/x86/xen/grant-table.c184
-rw-r--r--arch/x86/xen/irq.c115
-rw-r--r--arch/x86/xen/mmu.c1995
-rw-r--r--arch/x86/xen/mmu.h63
-rw-r--r--arch/x86/xen/mmu_hvm.c69
-rw-r--r--arch/x86/xen/mmu_pv.c2580
-rw-r--r--arch/x86/xen/multicalls.c294
-rw-r--r--arch/x86/xen/multicalls.h62
-rw-r--r--arch/x86/xen/p2m.c930
-rw-r--r--arch/x86/xen/platform-pci-unplug.c210
-rw-r--r--arch/x86/xen/pmu.c552
-rw-r--r--arch/x86/xen/setup.c1055
-rw-r--r--arch/x86/xen/smp.c505
-rw-r--r--arch/x86/xen/smp_hvm.c88
-rw-r--r--arch/x86/xen/smp_pv.c455
-rw-r--r--arch/x86/xen/spinlock.c440
-rw-r--r--arch/x86/xen/suspend.c85
-rw-r--r--arch/x86/xen/suspend_hvm.c27
-rw-r--r--arch/x86/xen/suspend_pv.c48
-rw-r--r--arch/x86/xen/time.c706
-rw-r--r--arch/x86/xen/trace.c21
-rw-r--r--arch/x86/xen/vdso.h4
-rw-r--r--arch/x86/xen/vga.c77
-rw-r--r--arch/x86/xen/xen-asm.S390
-rw-r--r--arch/x86/xen/xen-asm.h12
-rw-r--r--arch/x86/xen/xen-asm_32.S228
-rw-r--r--arch/x86/xen/xen-asm_64.S159
-rw-r--r--arch/x86/xen/xen-head.S169
-rw-r--r--arch/x86/xen/xen-ops.h296
1755 files changed, 413108 insertions, 154373 deletions
diff --git a/arch/x86/.gitignore b/arch/x86/.gitignore
new file mode 100644
index 000000000000..f2e1d6c347fb
--- /dev/null
+++ b/arch/x86/.gitignore
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0-only
+boot/compressed/vmlinux
+tools/test_get_len
+tools/insn_sanity
+tools/insn_decoder_test
+purgatory/purgatory.ro
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
index ad8ec356fb36..36b985d0e7bf 100644
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -1,16 +1,40 @@
+# SPDX-License-Identifier: GPL-2.0
+
+# Branch profiling isn't noinstr-safe. Disable it for arch/x86/*
+subdir-ccflags-$(CONFIG_TRACE_BRANCH_PROFILING) += -DDISABLE_BRANCH_PROFILING
+
+obj-y += boot/startup/
+
+obj-$(CONFIG_ARCH_HAS_CC_PLATFORM) += coco/
+
+obj-y += entry/
+
+obj-$(CONFIG_PERF_EVENTS) += events/
obj-$(CONFIG_KVM) += kvm/
# Xen paravirtualization support
obj-$(CONFIG_XEN) += xen/
-# lguest paravirtualization support
-obj-$(CONFIG_LGUEST_GUEST) += lguest/
+obj-$(CONFIG_PVH) += platform/pvh/
+# Hyper-V paravirtualization support
+obj-$(subst m,y,$(CONFIG_HYPERV)) += hyperv/
+
+obj-y += realmode/
obj-y += kernel/
obj-y += mm/
obj-y += crypto/
-obj-y += vdso/
+
obj-$(CONFIG_IA32_EMULATION) += ia32/
+obj-y += platform/
+obj-y += net/
+
+obj-$(CONFIG_KEXEC_FILE) += purgatory/
+
+obj-y += virt/
+
+# for cleaning
+subdir- += boot tools
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 13ffa5df37d7..80527299f859 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1,247 +1,449 @@
-# x86 configuration
-mainmenu "Linux Kernel Configuration for x86"
-
+# SPDX-License-Identifier: GPL-2.0
# Select 32 or 64 bit
config 64BIT
- bool "64-bit kernel" if ARCH = "x86"
- default ARCH = "x86_64"
- ---help---
+ bool "64-bit kernel" if "$(ARCH)" = "x86"
+ default "$(ARCH)" != "i386"
+ help
Say yes to build a 64-bit kernel - formerly known as x86_64
Say no to build a 32-bit kernel - formerly known as i386
config X86_32
- def_bool !64BIT
+ def_bool y
+ depends on !64BIT
+ # Options that are inherently 32-bit kernel only:
+ select ARCH_WANT_IPC_PARSE_VERSION
+ select CLKSRC_I8253
+ select CLONE_BACKWARDS
+ select HAVE_DEBUG_STACKOVERFLOW
+ select KMAP_LOCAL
+ select MODULES_USE_ELF_REL
+ select OLD_SIGACTION
+ select ARCH_SPLIT_ARG64
config X86_64
- def_bool 64BIT
+ def_bool y
+ depends on 64BIT
+ # Options that are inherently 64-bit kernel only:
+ select ARCH_HAS_GIGANTIC_PAGE
+ select ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS
+ select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
+ select ARCH_SUPPORTS_PER_VMA_LOCK
+ select ARCH_SUPPORTS_HUGE_PFNMAP if TRANSPARENT_HUGEPAGE
+ select HAVE_ARCH_SOFT_DIRTY
+ select MODULES_USE_ELF_RELA
+ select NEED_DMA_MAP_STATE
+ select SWIOTLB
+ select ARCH_HAS_ELFCORE_COMPAT
+ select ZONE_DMA32
+ select EXECMEM if DYNAMIC_FTRACE
+ select ACPI_MRRM if ACPI
-### Arch settings
+config FORCE_DYNAMIC_FTRACE
+ def_bool y
+ depends on X86_32
+ depends on FUNCTION_TRACER
+ select DYNAMIC_FTRACE
+ help
+ We keep the static function tracing (!DYNAMIC_FTRACE) around
+ in order to test the non static function tracing in the
+ generic code, as other architectures still use it. But we
+ only need to keep it around for x86_64. No need to keep it
+ for x86_32. For x86_32, force DYNAMIC_FTRACE.
+#
+# Arch settings
+#
+# ( Note that options that are marked 'if X86_64' could in principle be
+# ported to 32-bit as well. )
+#
config X86
def_bool y
- select HAVE_AOUT if X86_32
- select HAVE_READQ
- select HAVE_WRITEQ
- select HAVE_UNSTABLE_SCHED_CLOCK
- select HAVE_IDE
- select HAVE_OPROFILE
- select HAVE_PERF_COUNTERS if (!M386 && !M486)
- select HAVE_IOREMAP_PROT
- select HAVE_KPROBES
- select ARCH_WANT_OPTIONAL_GPIOLIB
- select ARCH_WANT_FRAME_POINTERS
- select HAVE_DMA_ATTRS
- select HAVE_KRETPROBES
- select HAVE_FTRACE_MCOUNT_RECORD
- select HAVE_DYNAMIC_FTRACE
- select HAVE_FUNCTION_TRACER
- select HAVE_FUNCTION_GRAPH_TRACER
- select HAVE_FUNCTION_GRAPH_FP_TEST
- select HAVE_FUNCTION_TRACE_MCOUNT_TEST
- select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE
- select HAVE_FTRACE_SYSCALLS
- select HAVE_KVM
+ #
+ # Note: keep this list sorted alphabetically
+ #
+ select ACPI_LEGACY_TABLES_LOOKUP if ACPI
+ select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
+ select ACPI_HOTPLUG_CPU if ACPI_PROCESSOR && HOTPLUG_CPU
+ select ARCH_32BIT_OFF_T if X86_32
+ select ARCH_CLOCKSOURCE_INIT
+ select ARCH_CONFIGURES_CPU_MITIGATIONS
+ select ARCH_CORRECT_STACKTRACE_ON_KRETPROBE
+ select ARCH_ENABLE_HUGEPAGE_MIGRATION if X86_64 && HUGETLB_PAGE && MIGRATION
+ select ARCH_ENABLE_MEMORY_HOTPLUG if X86_64
+ select ARCH_ENABLE_MEMORY_HOTREMOVE if MEMORY_HOTPLUG
+ select ARCH_ENABLE_SPLIT_PMD_PTLOCK if (PGTABLE_LEVELS > 2) && (X86_64 || X86_PAE)
+ select ARCH_ENABLE_THP_MIGRATION if X86_64 && TRANSPARENT_HUGEPAGE
+ select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI
+ select ARCH_HAS_CPU_ATTACK_VECTORS if CPU_MITIGATIONS
+ select ARCH_HAS_CACHE_LINE_SIZE
+ select ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION
+ select ARCH_HAS_CPU_FINALIZE_INIT
+ select ARCH_HAS_CPU_PASID if IOMMU_SVA
+ select ARCH_HAS_CURRENT_STACK_POINTER
+ select ARCH_HAS_DEBUG_VIRTUAL
+ select ARCH_HAS_DEBUG_VM_PGTABLE if !X86_PAE
+ select ARCH_HAS_DEVMEM_IS_ALLOWED
+ select ARCH_HAS_DMA_OPS if GART_IOMMU || XEN
+ select ARCH_HAS_EARLY_DEBUG if KGDB
+ select ARCH_HAS_ELF_RANDOMIZE
+ select ARCH_HAS_EXECMEM_ROX if X86_64 && STRICT_MODULE_RWX
+ select ARCH_HAS_FAST_MULTIPLIER
+ select ARCH_HAS_FORTIFY_SOURCE
+ select ARCH_HAS_GCOV_PROFILE_ALL
+ select ARCH_HAS_KCOV if X86_64
+ select ARCH_HAS_KERNEL_FPU_SUPPORT
+ select ARCH_HAS_MEM_ENCRYPT
+ select ARCH_HAS_MEMBARRIER_SYNC_CORE
+ select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS
+ select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ select ARCH_HAS_PMEM_API if X86_64
+ select ARCH_HAS_PREEMPT_LAZY
+ select ARCH_HAS_PTDUMP
+ select ARCH_HAS_PTE_SPECIAL
+ select ARCH_HAS_HW_PTE_YOUNG
+ select ARCH_HAS_NONLEAF_PMD_YOUNG if PGTABLE_LEVELS > 2
+ select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64
+ select ARCH_HAS_COPY_MC if X86_64
+ select ARCH_HAS_SET_MEMORY
+ select ARCH_HAS_SET_DIRECT_MAP
+ select ARCH_HAS_STRICT_KERNEL_RWX
+ select ARCH_HAS_STRICT_MODULE_RWX
+ select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
+ select ARCH_HAS_SYSCALL_WRAPPER
+ select ARCH_HAS_UBSAN
+ select ARCH_HAS_DEBUG_WX
+ select ARCH_HAS_ZONE_DMA_SET if EXPERT
+ select ARCH_HAVE_NMI_SAFE_CMPXCHG
+ select ARCH_HAVE_EXTRA_ELF_NOTES
+ select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
+ select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
+ select ARCH_MIGHT_HAVE_PC_PARPORT
+ select ARCH_MIGHT_HAVE_PC_SERIO
+ select ARCH_STACKWALK
+ select ARCH_SUPPORTS_ACPI
+ select ARCH_SUPPORTS_ATOMIC_RMW
+ select ARCH_SUPPORTS_DEBUG_PAGEALLOC
+ select ARCH_SUPPORTS_HUGETLBFS
+ select ARCH_SUPPORTS_PAGE_TABLE_CHECK if X86_64
+ select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
+ select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096
+ select ARCH_SUPPORTS_CFI if X86_64
+ select ARCH_USES_CFI_TRAPS if X86_64 && CFI
+ select ARCH_SUPPORTS_LTO_CLANG
+ select ARCH_SUPPORTS_LTO_CLANG_THIN
+ select ARCH_SUPPORTS_RT
+ select ARCH_SUPPORTS_AUTOFDO_CLANG
+ select ARCH_SUPPORTS_PROPELLER_CLANG if X86_64
+ select ARCH_USE_BUILTIN_BSWAP
+ select ARCH_USE_CMPXCHG_LOCKREF if X86_CX8
+ select ARCH_USE_MEMTEST
+ select ARCH_USE_QUEUED_RWLOCKS
+ select ARCH_USE_QUEUED_SPINLOCKS
+ select ARCH_USE_SYM_ANNOTATIONS
+ select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+ select ARCH_WANT_DEFAULT_BPF_JIT if X86_64
+ select ARCH_WANTS_DYNAMIC_TASK_STRUCT
+ select ARCH_WANTS_NO_INSTR
+ select ARCH_WANT_GENERAL_HUGETLB
+ select ARCH_WANT_HUGE_PMD_SHARE if X86_64
+ select ARCH_WANT_LD_ORPHAN_WARN
+ select ARCH_WANT_OPTIMIZE_DAX_VMEMMAP if X86_64
+ select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP if X86_64
+ select ARCH_WANT_HUGETLB_VMEMMAP_PREINIT if X86_64
+ select ARCH_WANTS_THP_SWAP if X86_64
+ select ARCH_HAS_PARANOID_L1D_FLUSH
+ select ARCH_WANT_IRQS_OFF_ACTIVATE_MM
+ select BUILDTIME_TABLE_SORT
+ select CLKEVT_I8253
+ select CLOCKSOURCE_WATCHDOG
+ # Word-size accesses may read uninitialized data past the trailing \0
+ # in strings and cause false KMSAN reports.
+ select DCACHE_WORD_ACCESS if !KMSAN
+ select DYNAMIC_SIGFRAME
+ select EDAC_ATOMIC_SCRUB
+ select EDAC_SUPPORT
+ select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC)
+ select GENERIC_CLOCKEVENTS_BROADCAST_IDLE if GENERIC_CLOCKEVENTS_BROADCAST
+ select GENERIC_CLOCKEVENTS_MIN_ADJUST
+ select GENERIC_CMOS_UPDATE
+ select GENERIC_CPU_AUTOPROBE
+ select GENERIC_CPU_DEVICES
+ select GENERIC_CPU_VULNERABILITIES
+ select GENERIC_EARLY_IOREMAP
+ select GENERIC_ENTRY
+ select GENERIC_IOMAP
+ select GENERIC_IRQ_EFFECTIVE_AFF_MASK if SMP
+ select GENERIC_IRQ_MATRIX_ALLOCATOR if X86_LOCAL_APIC
+ select GENERIC_IRQ_MIGRATION if SMP
+ select GENERIC_IRQ_PROBE
+ select GENERIC_IRQ_RESERVATION_MODE
+ select GENERIC_IRQ_SHOW
+ select GENERIC_PENDING_IRQ if SMP
+ select GENERIC_SMP_IDLE_THREAD
+ select GENERIC_TIME_VSYSCALL
+ select GENERIC_GETTIMEOFDAY
+ select GENERIC_VDSO_OVERFLOW_PROTECT
+ select GUP_GET_PXX_LOW_HIGH if X86_PAE
+ select HARDIRQS_SW_RESEND
+ select HARDLOCKUP_CHECK_TIMESTAMP if X86_64
+ select HAS_IOPORT
+ select HAVE_ACPI_APEI if ACPI
+ select HAVE_ACPI_APEI_NMI if ACPI
+ select HAVE_ALIGNED_STRUCT_PAGE
+ select HAVE_ARCH_AUDITSYSCALL
+ select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE
+ select HAVE_ARCH_HUGE_VMALLOC if X86_64
+ select HAVE_ARCH_JUMP_LABEL
+ select HAVE_ARCH_JUMP_LABEL_RELATIVE
+ select HAVE_ARCH_KASAN if X86_64
+ select HAVE_ARCH_KASAN_VMALLOC if X86_64
+ select HAVE_ARCH_KFENCE
+ select HAVE_ARCH_KMSAN if X86_64
select HAVE_ARCH_KGDB
+ select HAVE_ARCH_KSTACK_ERASE
+ select HAVE_ARCH_MMAP_RND_BITS if MMU
+ select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT
+ select HAVE_ARCH_COMPAT_MMAP_BASES if MMU && COMPAT
+ select HAVE_ARCH_PREL32_RELOCATIONS
+ select HAVE_ARCH_SECCOMP_FILTER
+ select HAVE_ARCH_THREAD_STRUCT_WHITELIST
select HAVE_ARCH_TRACEHOOK
- select HAVE_GENERIC_DMA_COHERENT if X86_32
+ select HAVE_ARCH_TRANSPARENT_HUGEPAGE
+ select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64
+ select HAVE_ARCH_USERFAULTFD_WP if X86_64 && USERFAULTFD
+ select HAVE_ARCH_USERFAULTFD_MINOR if X86_64 && USERFAULTFD
+ select HAVE_ARCH_VMAP_STACK if X86_64
+ select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
+ select HAVE_ARCH_WITHIN_STACK_FRAMES
+ select HAVE_ASM_MODVERSIONS
+ select HAVE_CMPXCHG_DOUBLE
+ select HAVE_CMPXCHG_LOCAL
+ select HAVE_CONTEXT_TRACKING_USER if X86_64
+ select HAVE_CONTEXT_TRACKING_USER_OFFSTACK if HAVE_CONTEXT_TRACKING_USER
+ select HAVE_C_RECORDMCOUNT
+ select HAVE_OBJTOOL_MCOUNT if HAVE_OBJTOOL
+ select HAVE_OBJTOOL_NOP_MCOUNT if HAVE_OBJTOOL_MCOUNT
+ select HAVE_BUILDTIME_MCOUNT_SORT
+ select HAVE_DEBUG_KMEMLEAK
+ select HAVE_DMA_CONTIGUOUS
+ select HAVE_DYNAMIC_FTRACE
+ select HAVE_DYNAMIC_FTRACE_WITH_REGS
+ select HAVE_DYNAMIC_FTRACE_WITH_ARGS if X86_64
+ select HAVE_FTRACE_REGS_HAVING_PT_REGS if X86_64
+ select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+ select HAVE_DYNAMIC_FTRACE_WITH_JMP if X86_64
+ select HAVE_SAMPLE_FTRACE_DIRECT if X86_64
+ select HAVE_SAMPLE_FTRACE_DIRECT_MULTI if X86_64
+ select HAVE_EBPF_JIT
select HAVE_EFFICIENT_UNALIGNED_ACCESS
- select USER_STACKTRACE_SUPPORT
- select HAVE_DMA_API_DEBUG
- select HAVE_KERNEL_GZIP
+ select HAVE_EISA if X86_32
+ select HAVE_EXIT_THREAD
+ select HAVE_GENERIC_TIF_BITS
+ select HAVE_GUP_FAST
+ select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE
+ select HAVE_FTRACE_GRAPH_FUNC if HAVE_FUNCTION_GRAPH_TRACER
+ select HAVE_FUNCTION_GRAPH_FREGS if HAVE_FUNCTION_GRAPH_TRACER
+ select HAVE_FUNCTION_GRAPH_TRACER if X86_32 || (X86_64 && DYNAMIC_FTRACE)
+ select HAVE_FUNCTION_TRACER
+ select HAVE_GCC_PLUGINS
+ select HAVE_HW_BREAKPOINT
+ select HAVE_IOREMAP_PROT
+ select HAVE_IRQ_EXIT_ON_IRQ_STACK if X86_64
+ select HAVE_IRQ_TIME_ACCOUNTING
+ select HAVE_JUMP_LABEL_HACK if HAVE_OBJTOOL
select HAVE_KERNEL_BZIP2
+ select HAVE_KERNEL_GZIP
+ select HAVE_KERNEL_LZ4
select HAVE_KERNEL_LZMA
- select HAVE_ARCH_KMEMCHECK
+ select HAVE_KERNEL_LZO
+ select HAVE_KERNEL_XZ
+ select HAVE_KERNEL_ZSTD
+ select HAVE_KPROBES
+ select HAVE_KPROBES_ON_FTRACE
+ select HAVE_FUNCTION_ERROR_INJECTION
+ select HAVE_KRETPROBES
+ select HAVE_RETHOOK
+ select HAVE_KLP_BUILD if X86_64
+ select HAVE_LIVEPATCH if X86_64
+ select HAVE_MIXED_BREAKPOINTS_REGS
+ select HAVE_MOD_ARCH_SPECIFIC
+ select HAVE_MOVE_PMD
+ select HAVE_MOVE_PUD
+ select HAVE_NOINSTR_HACK if HAVE_OBJTOOL
+ select HAVE_NMI
+ select HAVE_NOINSTR_VALIDATION if HAVE_OBJTOOL
+ select HAVE_OBJTOOL if X86_64
+ select HAVE_OPTPROBES
+ select HAVE_PAGE_SIZE_4KB
+ select HAVE_PCSPKR_PLATFORM
+ select HAVE_PERF_EVENTS
+ select HAVE_PERF_EVENTS_NMI
+ select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI
+ select HAVE_PCI
+ select HAVE_PERF_REGS
+ select HAVE_PERF_USER_STACK_DUMP
+ select ASYNC_KERNEL_PGTABLE_FREE if IOMMU_SVA
+ select MMU_GATHER_RCU_TABLE_FREE
+ select MMU_GATHER_MERGE_VMAS
+ select HAVE_POSIX_CPU_TIMERS_TASK_WORK
+ select HAVE_REGS_AND_STACK_ACCESS_API
+ select HAVE_RELIABLE_STACKTRACE if UNWINDER_ORC || STACK_VALIDATION
+ select HAVE_FUNCTION_ARG_ACCESS_API
+ select HAVE_SETUP_PER_CPU_AREA
+ select HAVE_SOFTIRQ_ON_OWN_STACK
+ select HAVE_STACKPROTECTOR
+ select HAVE_STACK_VALIDATION if HAVE_OBJTOOL
+ select HAVE_STATIC_CALL
+ select HAVE_STATIC_CALL_INLINE if HAVE_OBJTOOL
+ select HAVE_PREEMPT_DYNAMIC_CALL
+ select HAVE_RSEQ
+ select HAVE_RUST if X86_64
+ select HAVE_SYSCALL_TRACEPOINTS
+ select HAVE_UACCESS_VALIDATION if HAVE_OBJTOOL
+ select HAVE_UNSTABLE_SCHED_CLOCK
+ select HAVE_UNWIND_USER_FP if X86_64
+ select HAVE_USER_RETURN_NOTIFIER
+ select HAVE_GENERIC_VDSO
+ select VDSO_GETRANDOM if X86_64
+ select HOTPLUG_PARALLEL if SMP && X86_64
+ select HOTPLUG_SMT if SMP
+ select HOTPLUG_SPLIT_STARTUP if SMP && X86_32
+ select IRQ_FORCED_THREADING
+ select LOCK_MM_AND_FIND_VMA
+ select NEED_PER_CPU_EMBED_FIRST_CHUNK
+ select NEED_PER_CPU_PAGE_FIRST_CHUNK
+ select NEED_SG_DMA_LENGTH
+ select NUMA_MEMBLKS if NUMA
+ select PCI_DOMAINS if PCI
+ select PCI_LOCKLESS_CONFIG if PCI
+ select PERF_EVENTS
+ select RTC_LIB
+ select RTC_MC146818_LIB
+ select SPARSE_IRQ
+ select SYSCTL_EXCEPTION_TRACE
+ select THREAD_INFO_IN_TASK
+ select TRACE_IRQFLAGS_SUPPORT
+ select TRACE_IRQFLAGS_NMI_SUPPORT
+ select USER_STACKTRACE_SUPPORT
+ select HAVE_ARCH_KCSAN if X86_64
+ select PROC_PID_ARCH_STATUS if PROC_FS
+ select HAVE_ARCH_NODE_DEV_GROUP if X86_SGX
+ select FUNCTION_ALIGNMENT_16B if X86_64 || X86_ALIGNMENT_16
+ select FUNCTION_ALIGNMENT_4B
+ imply IMA_SECURE_AND_OR_TRUSTED_BOOT if EFI
+ select HAVE_DYNAMIC_FTRACE_NO_PATCHABLE
+ select ARCH_SUPPORTS_PT_RECLAIM if X86_64
+ select ARCH_SUPPORTS_SCHED_SMT if SMP
+ select SCHED_SMT if SMP
+ select ARCH_SUPPORTS_SCHED_CLUSTER if SMP
+ select ARCH_SUPPORTS_SCHED_MC if SMP
+
+config INSTRUCTION_DECODER
+ def_bool y
+ depends on KPROBES || PERF_EVENTS || UPROBES
config OUTPUT_FORMAT
string
default "elf32-i386" if X86_32
default "elf64-x86-64" if X86_64
-config ARCH_DEFCONFIG
- string
- default "arch/x86/configs/i386_defconfig" if X86_32
- default "arch/x86/configs/x86_64_defconfig" if X86_64
-
-config GENERIC_TIME
- def_bool y
-
-config GENERIC_CMOS_UPDATE
- def_bool y
-
-config CLOCKSOURCE_WATCHDOG
- def_bool y
-
-config GENERIC_CLOCKEVENTS
- def_bool y
-
-config GENERIC_CLOCKEVENTS_BROADCAST
- def_bool y
- depends on X86_64 || (X86_32 && X86_LOCAL_APIC)
-
config LOCKDEP_SUPPORT
def_bool y
config STACKTRACE_SUPPORT
def_bool y
-config HAVE_LATENCYTOP_SUPPORT
+config MMU
def_bool y
-config FAST_CMPXCHG_LOCAL
- bool
- default y
+config ARCH_MMAP_RND_BITS_MIN
+ default 28 if 64BIT
+ default 8
-config MMU
- def_bool y
+config ARCH_MMAP_RND_BITS_MAX
+ default 32 if 64BIT
+ default 16
-config ZONE_DMA
- def_bool y
+config ARCH_MMAP_RND_COMPAT_BITS_MIN
+ default 8
+
+config ARCH_MMAP_RND_COMPAT_BITS_MAX
+ default 16
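
The MIN/MAX pairs above bound the mmap_rnd_bits knob, i.e. how many bits
of entropy feed mmap base randomization. As a rough sketch of how such an
entropy value is typically applied (simplified from the usual
arch_mmap_rnd() pattern, not code from this patch):

    #include <linux/random.h>

    static unsigned long mmap_base_offset(unsigned int mmap_rnd_bits)
    {
        unsigned long rnd;

        /* mask a random value down to the configured entropy... */
        rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);

        /* ...and convert pages to bytes (PAGE_SHIFT is 12 for 4K pages) */
        return rnd << PAGE_SHIFT;
    }

With the 64-bit minimum of 28 bits and 4K pages this spreads the mmap
base across a 1 TiB window (2^28 pages * 2^12 bytes per page).
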
config SBUS
bool
config GENERIC_ISA_DMA
def_bool y
+ depends on ISA_DMA_API
-config GENERIC_IOMAP
- def_bool y
+config GENERIC_CSUM
+ bool
+ default y if KMSAN || KASAN
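
Defaulting GENERIC_CSUM on under KMSAN/KASAN swaps the hand-written
assembly checksum routines for the portable C ones, which the sanitizers
can instrument. A minimal sketch of the one's-complement Internet
checksum that family of routines computes (illustrative; the in-kernel
version is more careful about alignment and carry folding):

    #include <stddef.h>
    #include <stdint.h>

    static uint16_t csum16(const uint8_t *buf, size_t len)
    {
        uint32_t sum = 0;
        size_t i;

        for (i = 0; i + 1 < len; i += 2)
            sum += (uint32_t)buf[i] << 8 | buf[i + 1];
        if (len & 1)                      /* pad an odd trailing byte */
            sum += (uint32_t)buf[len - 1] << 8;
        while (sum >> 16)                 /* fold carries back in */
            sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
    }
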
config GENERIC_BUG
def_bool y
depends on BUG
- select GENERIC_BUG_RELATIVE_POINTERS if X86_64
+ select GENERIC_BUG_RELATIVE_POINTERS
config GENERIC_BUG_RELATIVE_POINTERS
bool
-config GENERIC_HWEIGHT
- def_bool y
-
-config GENERIC_GPIO
- bool
-
config ARCH_MAY_HAVE_PC_FDC
def_bool y
-
-config RWSEM_GENERIC_SPINLOCK
- def_bool !X86_XADD
-
-config RWSEM_XCHGADD_ALGORITHM
- def_bool X86_XADD
-
-config ARCH_HAS_CPU_IDLE_WAIT
- def_bool y
+ depends on ISA_DMA_API
config GENERIC_CALIBRATE_DELAY
def_bool y
-config GENERIC_TIME_VSYSCALL
- bool
- default X86_64
-
config ARCH_HAS_CPU_RELAX
def_bool y
-config ARCH_HAS_DEFAULT_IDLE
- def_bool y
-
-config ARCH_HAS_CACHE_LINE_SIZE
- def_bool y
-
-config HAVE_SETUP_PER_CPU_AREA
- def_bool y
-
-config HAVE_DYNAMIC_PER_CPU_AREA
- def_bool y
-
-config HAVE_CPUMASK_OF_CPU_MAP
- def_bool X86_64_SMP
-
config ARCH_HIBERNATION_POSSIBLE
def_bool y
config ARCH_SUSPEND_POSSIBLE
def_bool y
-config ZONE_DMA32
- bool
- default X86_64
-
-config ARCH_POPULATES_NODE_MAP
- def_bool y
-
config AUDIT_ARCH
- bool
- default X86_64
-
-config ARCH_SUPPORTS_OPTIMIZED_INLINING
- def_bool y
-
-config ARCH_SUPPORTS_DEBUG_PAGEALLOC
- def_bool y
-
-# Use the generic interrupt handling code in kernel/irq/:
-config GENERIC_HARDIRQS
- bool
- default y
-
-config GENERIC_HARDIRQS_NO__DO_IRQ
- def_bool y
-
-config GENERIC_IRQ_PROBE
- bool
- default y
+ def_bool y if X86_64
-config GENERIC_PENDING_IRQ
- bool
- depends on GENERIC_HARDIRQS && SMP
- default y
+config KASAN_SHADOW_OFFSET
+ hex
+ depends on KASAN
+ default 0xdffffc0000000000
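
The offset anchors the generic KASAN shadow mapping, in which one shadow
byte tracks an 8-byte granule of kernel memory. A sketch of the address
translation (the standard generic-KASAN formula, not code from this
patch):

    #define KASAN_SHADOW_SCALE_SHIFT  3
    #define KASAN_SHADOW_OFFSET       0xdffffc0000000000UL

    /* one shadow byte covers 2^3 = 8 bytes of kernel address space */
    static inline unsigned long kasan_mem_to_shadow(unsigned long addr)
    {
        return (addr >> KASAN_SHADOW_SCALE_SHIFT) + KASAN_SHADOW_OFFSET;
    }
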
-config USE_GENERIC_SMP_HELPERS
+config HAVE_INTEL_TXT
def_bool y
- depends on SMP
+ depends on INTEL_IOMMU && ACPI
-config X86_32_SMP
+config ARCH_SUPPORTS_UPROBES
def_bool y
- depends on X86_32 && SMP
-config X86_64_SMP
+config FIX_EARLYCON_MEM
def_bool y
- depends on X86_64 && SMP
-config X86_HT
+config DYNAMIC_PHYSICAL_MASK
bool
- depends on SMP
- default y
-config X86_TRAMPOLINE
- bool
- depends on SMP || (64BIT && ACPI_SLEEP)
- default y
-
-config X86_32_LAZY_GS
- def_bool y
- depends on X86_32 && !CC_STACKPROTECTOR
-
-config KTIME_SCALAR
- def_bool X86_32
-source "init/Kconfig"
-source "kernel/Kconfig.freezer"
+config PGTABLE_LEVELS
+ int
+ default 5 if X86_64
+ default 3 if X86_PAE
+ default 2
menu "Processor type and features"
-source "kernel/time/Kconfig"
-
config SMP
bool "Symmetric multi-processing support"
- ---help---
+ help
This enables support for systems with more than one CPU. If you have
- a system with only one CPU, like most personal computers, say N. If
- you have a system with more than one CPU, say Y.
+ a system with only one CPU, say N. If you have a system with more
+ than one CPU, say Y.
- If you say N here, the kernel will run on single and multiprocessor
+ If you say N here, the kernel will run on uni- and multiprocessor
machines, but will use only one CPU of a multiprocessor machine. If
you say Y here, the kernel will run on many, but not all,
- singleprocessor machines. On a singleprocessor machine, the kernel
+ uniprocessor machines. On a uniprocessor machine, the kernel
will run faster if you say N here.
Note that if you say Y here and choose architecture "586" or
@@ -253,102 +455,151 @@ config SMP
Y to "Enhanced Real Time Clock Support", below. The "Advanced Power
Management" code will be disabled if you say Y here.
- See also <file:Documentation/i386/IO-APIC.txt>,
- <file:Documentation/nmi_watchdog.txt> and the SMP-HOWTO available at
+ See also <file:Documentation/arch/x86/i386/IO-APIC.rst>,
+ <file:Documentation/admin-guide/lockup-watchdogs.rst> and the SMP-HOWTO available at
<http://www.tldp.org/docs.html#howto>.
If you don't know what to do here, say N.
config X86_X2APIC
- bool "Support x2apic"
- depends on X86_LOCAL_APIC && X86_64 && INTR_REMAP
- ---help---
- This enables x2apic support on CPUs that have this feature.
+ bool "x2APIC interrupt controller architecture support"
+ depends on X86_LOCAL_APIC && X86_64 && (IRQ_REMAP || HYPERVISOR_GUEST)
+ default y
+ help
+ x2APIC is an interrupt controller architecture, a component of which
+ (the local APIC) is present in the CPU. It allows faster access to
+ the local APIC and supports a larger number of CPUs in the system
+ than its predecessors.
+
+ x2APIC was introduced in Intel CPUs around 2008 and in AMD EPYC CPUs
+ in 2019, but it can be disabled by the BIOS. It is also frequently
+ emulated in virtual machines, even when the host CPU does not support
+ it. Support in the CPU can be checked by executing
+ grep x2apic /proc/cpuinfo
+
+ If this configuration option is disabled, the kernel will boot with
+ very reduced functionality and performance on some platforms that
+ have x2APIC enabled. On the other hand, on hardware that does not
+ support x2APIC, a kernel with this option enabled will just fall back
+ to older APIC implementations.
+
+ If in doubt, say Y.
+
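
Besides grepping /proc/cpuinfo as the help text suggests, x2APIC support
can be probed directly: it is reported in CPUID leaf 1, ECX bit 21. A
small userspace sketch (illustrative, not part of this change):

    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;

        /* CPUID.01H:ECX[21] is the x2APIC feature flag */
        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
            return 1;
        printf("x2APIC %ssupported\n", (ecx & (1u << 21)) ? "" : "not ");
        return 0;
    }
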
+config AMD_SECURE_AVIC
+ bool "AMD Secure AVIC"
+ depends on AMD_MEM_ENCRYPT && X86_X2APIC
+ help
+ Enable this to get AMD Secure AVIC support on guests that have this feature.
- This allows 32-bit apic IDs (so it can support very large systems),
- and accesses the local apic via MSRs not via mmio.
+ AMD Secure AVIC provides hardware acceleration for performance-sensitive
+ APIC accesses and support for managing guest-owned APIC state for SEV-SNP
+ guests. Secure AVIC does not support xAPIC mode. It has a functional
+ dependency on x2APIC being enabled in the guest.
If you don't know what to do here, say N.
-config SPARSE_IRQ
- bool "Support sparse irq numbering"
- depends on PCI_MSI || HT_IRQ
- ---help---
- This enables support for sparse irqs. This is useful for distro
- kernels that want to define a high CONFIG_NR_CPUS value but still
- want to have low kernel memory footprint on smaller machines.
-
- ( Sparse IRQs can also be beneficial on NUMA boxes, as they spread
- out the irq_desc[] array in a more NUMA-friendly way. )
+config X86_POSTED_MSI
+ bool "Enable MSI and MSI-x delivery by posted interrupts"
+ depends on X86_64 && IRQ_REMAP
+ help
+ This enables MSIs that are under interrupt remapping to be delivered as
+ posted interrupts to the host kernel. Interrupt throughput can
+ potentially be improved by coalescing CPU notifications during
+ high-frequency bursts.
If you don't know what to do here, say N.
-config NUMA_IRQ_DESC
- def_bool y
- depends on SPARSE_IRQ && NUMA
-
config X86_MPPARSE
bool "Enable MPS table" if ACPI
default y
depends on X86_LOCAL_APIC
- ---help---
+ help
For old SMP systems that do not have proper ACPI support. On newer systems
(especially with 64-bit CPUs) that have ACPI support, the MADT and DSDT will override it
-config X86_BIGSMP
- bool "Support for big SMP systems with more than 8 CPUs"
- depends on X86_32 && SMP
- ---help---
- This option is needed for the systems that have more than 8 CPUs
+config X86_CPU_RESCTRL
+ bool "x86 CPU resource control support"
+ depends on X86 && (CPU_SUP_INTEL || CPU_SUP_AMD)
+ depends on MISC_FILESYSTEMS
+ select ARCH_HAS_CPU_RESCTRL
+ select RESCTRL_FS
+ select RESCTRL_FS_PSEUDO_LOCK
+ help
+ Enable x86 CPU resource control support.
-if X86_32
-config X86_EXTENDED_PLATFORM
- bool "Support for extended (non-PC) x86 platforms"
- default y
- ---help---
- If you disable this option then the kernel will only support
- standard PC platforms. (which covers the vast majority of
- systems out there.)
+ Provide support for the allocation and monitoring of system resource
+ usage by the CPU.
- If you enable this option then you'll be able to select support
- for the following (non-PC) 32 bit x86 platforms:
- AMD Elan
- NUMAQ (IBM/Sequent)
- RDC R-321x SoC
- SGI 320/540 (Visual Workstation)
- Summit/EXA (IBM x440)
- Unisys ES7000 IA32 series
+ Intel calls this Intel Resource Director Technology
+ (Intel(R) RDT). More information about RDT can be found in the
+ Intel x86 Architecture Software Developer Manual.
- If you have one of these systems, or if you want to build a
- generic distribution kernel, say Y here - otherwise say N.
-endif
+ AMD calls this AMD Platform Quality of Service (AMD QoS).
+ More information about AMD QoS can be found in the AMD64 Technology
+ Platform Quality of Service Extensions manual.
+
+ Say N if unsure.
+
+config X86_FRED
+ bool "Flexible Return and Event Delivery"
+ depends on X86_64
+ help
+ When enabled, try to use Flexible Return and Event Delivery
+ instead of the legacy SYSCALL/SYSENTER/IDT architecture for
+ ring transitions and exception/interrupt handling if the
+ system supports it.
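
Hardware support for FRED is likewise enumerated via CPUID; to the best
of current enumeration it sits in leaf 7, subleaf 1, EAX bit 17 (verify
against the Intel SDM). A sketch mirroring the x2APIC check above:

    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;

        /* CPUID.(EAX=7,ECX=1):EAX[17] is assumed here to be FRED */
        if (__get_cpuid_count(7, 1, &eax, &ebx, &ecx, &edx))
            printf("FRED %senumerated\n", (eax & (1u << 17)) ? "" : "not ");
        return 0;
    }
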
-if X86_64
config X86_EXTENDED_PLATFORM
bool "Support for extended (non-PC) x86 platforms"
default y
- ---help---
+ help
If you disable this option then the kernel will only support
standard PC platforms. (which covers the vast majority of
systems out there.)
If you enable this option then you'll be able to select support
- for the following (non-PC) 64 bit x86 platforms:
+ for the following non-PC x86 platforms, depending on the value of
+ CONFIG_64BIT.
+
+ 32-bit platforms (CONFIG_64BIT=n):
+ Goldfish (mostly Android emulator)
+ Intel CE media processor (CE4100) SoC
+ Intel Quark
+ RDC R-321x SoC
+
+ 64-bit platforms (CONFIG_64BIT=y):
+ Numascale NumaChip
ScaleMP vSMP
SGI Ultraviolet
+ Merrifield/Moorefield MID devices
+ Goldfish (mostly Android emulator)
If you have one of these systems, or if you want to build a
generic distribution kernel, say Y here - otherwise say N.
-endif
+
# This is an alphabetically sorted list of 64 bit extended platforms
# Please maintain the alphabetic order if and when there are additions
+config X86_NUMACHIP
+ bool "Numascale NumaChip"
+ depends on X86_64
+ depends on X86_EXTENDED_PLATFORM
+ depends on NUMA
+ depends on SMP
+ depends on X86_X2APIC
+ depends on PCI_MMCONFIG
+ help
+ Adds support for Numascale NumaChip large-SMP systems. Needed to
+ enable more than ~168 cores.
+ If you don't have one of these, you should say N here.
config X86_VSMP
bool "ScaleMP vSMP"
+ select HYPERVISOR_GUEST
select PARAVIRT
depends on X86_64 && PCI
depends on X86_EXTENDED_PLATFORM
- ---help---
+ depends on SMP
+ help
Support for ScaleMP vSMP systems. Say 'Y' here if this kernel is
supposed to run on these EM64T-based machines. Only choose this option
if you have one of these machines.
@@ -358,24 +609,82 @@ config X86_UV
depends on X86_64
depends on X86_EXTENDED_PLATFORM
depends on NUMA
+ depends on EFI
+ depends on KEXEC_CORE
depends on X86_X2APIC
- ---help---
+ depends on PCI
+ help
This option is needed in order to support SGI Ultraviolet systems.
If you don't have one of these, you should say N here.
+config X86_INTEL_MID
+ bool "Intel Z34xx/Z35xx MID platform support"
+ depends on X86_EXTENDED_PLATFORM
+ depends on X86_PLATFORM_DEVICES
+ depends on PCI
+ depends on X86_64 || (EXPERT && PCI_GOANY)
+ depends on X86_IO_APIC
+ select I2C
+ select DW_APB_TIMER
+ select INTEL_SCU_PCI
+ help
+ Select to build a kernel capable of supporting 64-bit Intel MID
+ (Mobile Internet Device) platform systems which do not have
+ the PCI legacy interfaces.
+
+ The only supported devices are the 22nm Merrifield (Z34xx)
+ and Moorefield (Z35xx) SoC used in the Intel Edison board and
+ a small number of Android devices such as the Asus Zenfone 2,
+ Asus FonePad 8 and Dell Venue 7.
+
+ If you are building for a PC class system or non-MID tablet
+ SoCs like Bay Trail (Z36xx/Z37xx), say N here.
+
+ Intel MID platforms are based on an Intel processor and chipset which
+ consume less power than most of the x86 derivatives.
+
+config X86_GOLDFISH
+ bool "Goldfish (Virtual Platform)"
+ depends on X86_EXTENDED_PLATFORM
+ help
+ Enable support for the Goldfish virtual platform used primarily
+ for Android development. Unless you are building for the Android
+ Goldfish emulator, say N here.
+
# Following is an alphabetically sorted list of 32 bit extended platforms
# Please maintain the alphabetic order if and when there are additions
-config X86_ELAN
- bool "AMD Elan"
+config X86_INTEL_CE
+ bool "CE4100 TV platform"
+ depends on PCI
+ depends on PCI_GODIRECT
+ depends on X86_IO_APIC
depends on X86_32
depends on X86_EXTENDED_PLATFORM
- ---help---
- Select this for an AMD Elan processor.
-
- Do not use this option for K6/Athlon/Opteron processors!
+ select X86_REBOOTFIXUPS
+ select OF
+ select OF_EARLY_FLATTREE
+ help
+ Select for the Intel CE media processor (CE4100) SoC.
+ This option compiles in support for the CE4100 SoC for set-top
+ boxes and media devices.
- If unsure, choose "PC-compatible" instead.
+config X86_INTEL_QUARK
+ bool "Intel Quark platform support"
+ depends on X86_32
+ depends on X86_EXTENDED_PLATFORM
+ depends on X86_PLATFORM_DEVICES
+ depends on X86_TSC
+ depends on PCI
+ depends on PCI_GOANY
+ depends on X86_IO_APIC
+ select IOSF_MBI
+ select INTEL_IMR
+ select COMMON_CLK
+ help
+ Select to include support for the Quark X1000 SoC.
+ Say Y here if you have a Quark-based system such as the
+ Arduino-compatible Intel Galileo.
config X86_RDC321X
bool "RDC R-321x SoC"
@@ -383,67 +692,91 @@ config X86_RDC321X
depends on X86_EXTENDED_PLATFORM
select M486
select X86_REBOOTFIXUPS
- ---help---
+ help
This option is needed for RDC R-321x system-on-chip, also known
as R-8610-(G).
If you don't have one of these chips, you should say N here.
-config X86_32_NON_STANDARD
- bool "Support non-standard 32-bit SMP architectures"
- depends on X86_32 && SMP
- depends on X86_EXTENDED_PLATFORM
- ---help---
- This option compiles in the NUMAQ, Summit, bigsmp, ES7000, default
- subarchitectures. It is intended for a generic binary kernel.
- if you select them all, kernel will probe it one by one. and will
- fallback to default.
-
-# Alphabetically sorted list of Non standard 32 bit platforms
-
-config X86_NUMAQ
- bool "NUMAQ (IBM/Sequent)"
- depends on X86_32_NON_STANDARD
- select NUMA
- select X86_MPPARSE
- ---help---
- This option is used for getting Linux to run on a NUMAQ (IBM/Sequent)
- NUMA multiquad box. This changes the way that processors are
- bootstrapped, and uses Clustered Logical APIC addressing mode instead
- of Flat Logical. You will need a new lynxer.elf file to flash your
- firmware with - send email to <Martin.Bligh@us.ibm.com>.
-
-config X86_VISWS
- bool "SGI 320/540 (Visual Workstation)"
- depends on X86_32 && PCI && X86_MPPARSE && PCI_GODIRECT
- depends on X86_32_NON_STANDARD
- ---help---
- The SGI Visual Workstation series is an IA32-based workstation
- based on SGI systems chips with some legacy PC hardware attached.
-
- Say Y here to create a kernel to run on the SGI 320 or 540.
-
- A kernel compiled for the Visual Workstation will run on general
- PCs as well. See <file:Documentation/sgi-visws.txt> for details.
-
-config X86_SUMMIT
- bool "Summit/EXA (IBM x440)"
- depends on X86_32_NON_STANDARD
- ---help---
- This option is needed for IBM systems that use the Summit/EXA chipset.
- In particular, it is needed for the x440.
-
-config X86_ES7000
- bool "Unisys ES7000 IA32 series"
- depends on X86_32_NON_STANDARD && X86_BIGSMP
- ---help---
- Support for Unisys ES7000 systems. Say 'Y' here if this kernel is
- supposed to run on an IA32-based Unisys ES7000 system.
+config X86_INTEL_LPSS
+ bool "Intel Low Power Subsystem Support"
+ depends on X86 && ACPI && PCI
+ select COMMON_CLK
+ select PINCTRL
+ select IOSF_MBI
+ help
+ Select to build support for the Intel Low Power Subsystem as
+ found on the Intel Lynxpoint PCH. Selecting this option enables
+ things like the clock tree (common clock framework) and pin control,
+ which are needed by the LPSS peripheral drivers.
+
+config X86_AMD_PLATFORM_DEVICE
+ bool "AMD ACPI2Platform devices support"
+ depends on ACPI
+ select COMMON_CLK
+ select PINCTRL
+ help
+ Select to interpret AMD-specific ACPI devices as platform devices,
+ such as the I2C, UART and GPIO controllers found on AMD Carrizo and
+ later chipsets. I2C and UART depend on COMMON_CLK to set their
+ clocks. The GPIO driver is implemented under the PINCTRL subsystem.
+
+config IOSF_MBI
+ tristate "Intel SoC IOSF Sideband support for SoC platforms"
+ depends on PCI
+ help
+ This option enables sideband register access support for Intel SoC
+ platforms. On these platforms the IOSF sideband is used in lieu of
+ MSR's for some register accesses, mostly but not limited to thermal
+ and power. Drivers may query the availability of this device to
+ determine if they need the sideband in order to work on these
+ platforms. The sideband is available on the following SoC products.
+ This list is not meant to be exhaustive.
+ - BayTrail
+ - Braswell
+ - Quark
+
+ You should say Y if you are running a kernel on one of these SoC's.
+
+config IOSF_MBI_DEBUG
+ bool "Enable IOSF sideband access through debugfs"
+ depends on IOSF_MBI && DEBUG_FS
+ help
+ Select this option to expose the IOSF sideband access registers (MCR,
+ MDR, MCRX) through debugfs to write and read register information from
+ different units on the SoC. This is most useful for obtaining device
+ state information for debug and analysis. As this is a general access
+ mechanism, users of this option would have specific knowledge of the
+ device they want to access.
+
+ If you don't require the option or are in doubt, say N.
+
+config X86_SUPPORTS_MEMORY_FAILURE
+ def_bool y
+ # MCE code calls memory_failure():
+ depends on X86_MCE
+ # On 32-bit this adds too big of NODES_SHIFT and we run out of page flags:
+ # On 32-bit SPARSEMEM adds too big of SECTIONS_WIDTH:
+ depends on X86_64 || !SPARSEMEM
+ select ARCH_SUPPORTS_MEMORY_FAILURE
+
+config X86_32_IRIS
+ tristate "Eurobraille/Iris poweroff module"
+ depends on X86_32
+ help
+ The Iris machines from EuroBraille do not have APM or ACPI support
+ to shut themselves down properly. A special I/O sequence is
+ needed to do so, which is what this module does at
+ kernel shutdown.
+
+ This is only for Iris machines from EuroBraille.
+
+ If unused, say N.
config SCHED_OMIT_FRAME_POINTER
def_bool y
prompt "Single-depth WCHAN output"
depends on X86
- ---help---
+ help
Calculate simpler /proc/<PID>/wchan values. If this option
is disabled then wchan values will recurse back to the
caller function. This provides more accurate wchan values,
@@ -451,115 +784,159 @@ config SCHED_OMIT_FRAME_POINTER
If in doubt, say "Y".
-menuconfig PARAVIRT_GUEST
- bool "Paravirtualized guest support"
- ---help---
- Say Y here to get to see options related to running Linux under
- various hypervisors. This option alone does not add any kernel code.
-
- If you say N, all options in this submenu will be skipped and disabled.
-
-if PARAVIRT_GUEST
-
-source "arch/x86/xen/Kconfig"
-
-config VMI
- bool "VMI Guest support"
- select PARAVIRT
- depends on X86_32
- ---help---
- VMI provides a paravirtualized interface to the VMware ESX server
- (it could be used by other hypervisors in theory too, but is not
- at the moment), by linking the kernel to a GPL-ed ROM module
- provided by the hypervisor.
-
-config KVM_CLOCK
- bool "KVM paravirtualized clock"
- select PARAVIRT
- select PARAVIRT_CLOCK
- ---help---
- Turning on this option will allow you to run a paravirtualized clock
- when running over the KVM hypervisor. Instead of relying on a PIT
- (or probably other) emulation by the underlying device model, the host
- provides the guest with timing infrastructure such as time of day, and
- system time
+menuconfig HYPERVISOR_GUEST
+ bool "Linux guest support"
+ help
+ Say Y here to enable options for running Linux under various hyper-
+ visors. This option enables basic hypervisor detection and platform
+ setup.
-config KVM_GUEST
- bool "KVM Guest support"
- select PARAVIRT
- ---help---
- This option enables various optimizations for running under the KVM
- hypervisor.
+ If you say N, all options in this submenu will be skipped and
+ disabled, and Linux guest support won't be built in.
-source "arch/x86/lguest/Kconfig"
+if HYPERVISOR_GUEST
config PARAVIRT
bool "Enable paravirtualization code"
- ---help---
+ depends on HAVE_STATIC_CALL
+ help
This changes the kernel so it can modify itself when it is run
under a hypervisor, potentially improving performance significantly
over full virtualization. However, when run without a hypervisor
the kernel is theoretically slower and slightly larger.
+config PARAVIRT_XXL
+ bool
+ depends on X86_64
+
+config PARAVIRT_DEBUG
+ bool "paravirt-ops debugging"
+ depends on PARAVIRT && DEBUG_KERNEL
+ help
+ Enable to debug paravirt_ops internals. Specifically, BUG if
+ a paravirt_op is missing when it is called.
+
config PARAVIRT_SPINLOCKS
bool "Paravirtualization layer for spinlocks"
- depends on PARAVIRT && SMP && EXPERIMENTAL
- ---help---
+ depends on PARAVIRT && SMP
+ help
Paravirtualized spinlocks allow a pvops backend to replace the
spinlock implementation with something virtualization-friendly
(for example, block the virtual CPU rather than spinning).
- Unfortunately the downside is an up to 5% performance hit on
- native kernels, with various workloads.
+ It has a minimal impact on native kernels and gives a nice performance
+ benefit on paravirtualized KVM / Xen kernels.
- If you are unsure how to answer this question, answer N.
+ If you are unsure how to answer this question, answer Y.
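
The mechanism behind this is a pair of patchable hooks: a "wait" op the
lock slow path calls instead of spinning, and a "kick" op the unlocker
calls to wake the waiter. A loose sketch of the shape of that
indirection (names illustrative, not the exact pv_ops layout; the real
kernel patches in native, KVM or Xen backends at boot):

    #include <linux/types.h>

    struct pv_lock_ops_sketch {
        /* block until *ptr != val; a guest backend can halt or
         * hypercall here instead of burning cycles in a spin loop */
        void (*wait)(u8 *ptr, u8 val);
        /* unlock side: wake the vCPU blocked in wait() */
        void (*kick)(int cpu);
    };
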
-config PARAVIRT_CLOCK
- bool
- default n
+config X86_HV_CALLBACK_VECTOR
+ def_bool n
-endif
+source "arch/x86/xen/Kconfig"
-config PARAVIRT_DEBUG
- bool "paravirt-ops debugging"
- depends on PARAVIRT && DEBUG_KERNEL
- ---help---
- Enable to debug paravirt_ops internals. Specifically, BUG if
- a paravirt_op is missing when it is called.
+config KVM_GUEST
+ bool "KVM Guest support (including kvmclock)"
+ depends on PARAVIRT
+ select PARAVIRT_CLOCK
+ select ARCH_CPUIDLE_HALTPOLL
+ select X86_HV_CALLBACK_VECTOR
+ default y
+ help
+ This option enables various optimizations for running under the KVM
+ hypervisor. It includes a paravirtualized clock, so that instead
+ of relying on a PIT (or probably other) emulation by the
+ underlying device model, the host provides the guest with
+	timing infrastructure such as time of day, and system time.
-config MEMTEST
- bool "Memtest"
- ---help---
- This option adds a kernel parameter 'memtest', which allows memtest
- to be set.
- memtest=0, mean disabled; -- default
- memtest=1, mean do 1 test pattern;
- ...
- memtest=4, mean do 4 test patterns.
- If you are unsure how to answer this question, answer N.
+config ARCH_CPUIDLE_HALTPOLL
+ def_bool n
+ prompt "Disable host haltpoll when loading haltpoll driver"
+ help
+ If virtualized under KVM, disable host haltpoll.
-config X86_SUMMIT_NUMA
- def_bool y
- depends on X86_32 && NUMA && X86_32_NON_STANDARD
+config PVH
+ bool "Support for running PVH guests"
+ help
+ This option enables the PVH entry point for guest virtual machines
+ as specified in the x86/HVM direct boot ABI.
-config X86_CYCLONE_TIMER
- def_bool y
- depends on X86_32_NON_STANDARD
+config PARAVIRT_TIME_ACCOUNTING
+ bool "Paravirtual steal time accounting"
+ depends on PARAVIRT
+ help
+ Select this option to enable fine granularity task steal time
+ accounting. Time spent executing other tasks in parallel with
+ the current vCPU is discounted from the vCPU power. To account for
+ that, there can be a small performance impact.
+
+ If in doubt, say N here.
+
+config PARAVIRT_CLOCK
+ bool
+
+config JAILHOUSE_GUEST
+ bool "Jailhouse non-root cell support"
+ depends on X86_64 && PCI
+ select X86_PM_TIMER
+ help
+	This option allows running Linux as a guest in a Jailhouse non-root
+	cell. You can leave this option disabled if you only want to start
+ Jailhouse and run Linux afterwards in the root cell.
+
+config ACRN_GUEST
+ bool "ACRN Guest support"
+ depends on X86_64
+ select X86_HV_CALLBACK_VECTOR
+ help
+	This option allows running Linux as a guest in the ACRN hypervisor. ACRN
+	is a flexible, lightweight, open-source reference hypervisor, built with
+	real-time and safety-criticality in mind. It is aimed at embedded
+	IoT use with a small footprint and real-time features. More details can
+	be found at https://projectacrn.org/.
+
+config BHYVE_GUEST
+ bool "Bhyve (BSD Hypervisor) Guest support"
+ depends on X86_64
+ help
+	This option allows Linux to recognise when it is running as a
+	guest in the Bhyve hypervisor, and to support more than 255 vCPUs
+	when doing so. More details about Bhyve can be found at https://bhyve.org
+ and https://wiki.freebsd.org/bhyve/.
+
+config INTEL_TDX_GUEST
+ bool "Intel TDX (Trust Domain Extensions) - Guest Support"
+ depends on X86_64 && CPU_SUP_INTEL
+ depends on X86_X2APIC
+ depends on EFI_STUB
+ depends on PARAVIRT
+ select ARCH_HAS_CC_PLATFORM
+ select X86_MEM_ENCRYPT
+ select X86_MCE
+ select UNACCEPTED_MEMORY
+ help
+ Support running as a guest under Intel TDX. Without this support,
+	the guest kernel cannot boot or run under TDX.
+ TDX includes memory encryption and integrity capabilities
+ which protect the confidentiality and integrity of guest
+ memory contents and CPU state. TDX guests are protected from
+ some attacks from the VMM.
+
+endif # HYPERVISOR_GUEST
source "arch/x86/Kconfig.cpu"
config HPET_TIMER
def_bool X86_64
prompt "HPET Timer Support" if X86_32
- ---help---
+ help
Use the IA-PC HPET (High Precision Event Timer) to manage
time in preference to the PIT and RTC, if a HPET is
present.
HPET is the next generation timer replacing legacy 8254s.
The HPET provides a stable time base on SMP
systems, unlike the TSC, but it is more expensive to access,
- as it is off-chip. You can find the HPET spec at
- <http://www.intel.com/hardwaredesign/hpetspec_1.pdf>.
+ as it is off-chip. The interface used is documented
+ in the HPET spec, revision 1.
You can safely choose Y here. However, HPET will only be
activated if the platform and the BIOS support this feature.
@@ -569,156 +946,146 @@ config HPET_TIMER
config HPET_EMULATE_RTC
def_bool y
- depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y)
+ depends on HPET_TIMER && (RTC_DRV_CMOS=m || RTC_DRV_CMOS=y)
-# Mark as embedded because too many people got it wrong.
+# Mark as expert because too many people got it wrong.
# The code disables itself when not needed.
config DMI
default y
- bool "Enable DMI scanning" if EMBEDDED
- ---help---
+ select DMI_SCAN_MACHINE_NON_EFI_FALLBACK
+ bool "Enable DMI scanning" if EXPERT
+ help
Enable scanning of DMI to identify machine quirks. Say Y
here unless you have verified that your setup is not
affected by entries in the DMI blacklist. Required by PNP
BIOS code.
config GART_IOMMU
- bool "GART IOMMU support" if EMBEDDED
- default y
+ bool "Old AMD GART IOMMU support"
+ select IOMMU_HELPER
select SWIOTLB
- select AGP
- depends on X86_64 && PCI
- ---help---
- Support for full DMA access of devices with 32bit memory access only
- on systems with more than 3GB. This is usually needed for USB,
- sound, many IDE/SATA chipsets and some other devices.
- Provides a driver for the AMD Athlon64/Opteron/Turion/Sempron GART
- based hardware IOMMU and a software bounce buffer based IOMMU used
- on Intel systems and as fallback.
- The code is only active when needed (enough memory and limited
- device) unless CONFIG_IOMMU_DEBUG or iommu=force is specified
- too.
-
-config CALGARY_IOMMU
- bool "IBM Calgary IOMMU support"
- select SWIOTLB
- depends on X86_64 && PCI && EXPERIMENTAL
- ---help---
- Support for hardware IOMMUs in IBM's xSeries x366 and x460
- systems. Needed to run systems with more than 3GB of memory
- properly with 32-bit PCI devices that do not support DAC
- (Double Address Cycle). Calgary also supports bus level
- isolation, where all DMAs pass through the IOMMU. This
- prevents them from going anywhere except their intended
- destination. This catches hard-to-find kernel bugs and
- mis-behaving drivers and devices that do not use the DMA-API
- properly to set up their DMA buffers. The IOMMU can be
- turned off at boot time with the iommu=off parameter.
- Normally the kernel will make the right choice by itself.
- If unsure, say Y.
+ depends on X86_64 && PCI && AMD_NB
+ help
+ Provides a driver for older AMD Athlon64/Opteron/Turion/Sempron
+ GART based hardware IOMMUs.
-config CALGARY_IOMMU_ENABLED_BY_DEFAULT
- def_bool y
- prompt "Should Calgary be enabled by default?"
- depends on CALGARY_IOMMU
- ---help---
- Should Calgary be enabled by default? if you choose 'y', Calgary
- will be used (if it exists). If you choose 'n', Calgary will not be
- used even if it exists. If you choose 'n' and would like to use
- Calgary anyway, pass 'iommu=calgary' on the kernel command line.
- If unsure, say Y.
+ The GART supports full DMA access for devices with 32-bit access
+ limitations, on systems with more than 3 GB. This is usually needed
+ for USB, sound, many IDE/SATA chipsets and some other devices.
-config AMD_IOMMU
- bool "AMD IOMMU support"
- select SWIOTLB
- select PCI_MSI
- depends on X86_64 && PCI && ACPI
- ---help---
- With this option you can enable support for AMD IOMMU hardware in
- your system. An IOMMU is a hardware component which provides
- remapping of DMA memory accesses from devices. With an AMD IOMMU you
- can isolate the the DMA memory of different devices and protect the
- system from misbehaving device drivers or hardware.
-
- You can find out if your system has an AMD IOMMU if you look into
- your BIOS for an option to enable it or if you have an IVRS ACPI
- table.
-
-config AMD_IOMMU_STATS
- bool "Export AMD IOMMU statistics to debugfs"
- depends on AMD_IOMMU
- select DEBUG_FS
- ---help---
- This option enables code in the AMD IOMMU driver to collect various
- statistics about whats happening in the driver and exports that
- information to userspace via debugfs.
- If unsure, say N.
+ Newer systems typically have a modern AMD IOMMU, supported via
+ the CONFIG_AMD_IOMMU=y config option.
-# need this always selected by IOMMU for the VIA workaround
-config SWIOTLB
- def_bool y if X86_64
- ---help---
- Support for software bounce buffers used on x86-64 systems
- which don't have a hardware IOMMU (e.g. the current generation
- of Intel's x86-64 CPUs). Using this PCI devices which can only
- access 32-bits of memory can be used on systems with more than
- 3 GB of memory. If unsure, say Y.
+ In normal configurations this driver is only active when needed:
+ there's more than 3 GB of memory and the system contains a
+ 32-bit limited device.
-config IOMMU_HELPER
- def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU)
+ If unsure, say Y.
-config IOMMU_API
- def_bool (AMD_IOMMU || DMAR)
+config BOOT_VESA_SUPPORT
+ bool
+ help
+ If true, at least one selected framebuffer driver can take advantage
+ of VESA video modes set at an early boot stage via the vga= parameter.
config MAXSMP
- bool "Configure Maximum number of SMP Processors and NUMA Nodes"
- depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL
+ bool "Enable Maximum number of SMP Processors and NUMA Nodes"
+ depends on X86_64 && SMP && DEBUG_KERNEL
select CPUMASK_OFFSTACK
- default n
- ---help---
- Configure maximum number of CPUS and NUMA Nodes for this architecture.
+ help
+ Enable maximum number of CPUS and NUMA Nodes for this architecture.
If unsure, say N.
+#
+# The maximum number of CPUs supported:
+#
+# The main config value is NR_CPUS, which defaults to NR_CPUS_DEFAULT,
+# and which can be configured interactively in the
+# [NR_CPUS_RANGE_BEGIN ... NR_CPUS_RANGE_END] range.
+#
+# The ranges are different on 32-bit and 64-bit kernels, depending on
+# hardware capabilities and scalability features of the kernel.
+#
+# ( If MAXSMP is enabled we just use the highest possible value and disable
+# interactive configuration. )
+#
+
+config NR_CPUS_RANGE_BEGIN
+ int
+ default NR_CPUS_RANGE_END if MAXSMP
+ default 1 if !SMP
+ default 2
+
+config NR_CPUS_RANGE_END
+ int
+ depends on X86_32
+ default 8 if SMP
+ default 1 if !SMP
+
+config NR_CPUS_RANGE_END
+ int
+ depends on X86_64
+ default 8192 if SMP && CPUMASK_OFFSTACK
+ default 512 if SMP && !CPUMASK_OFFSTACK
+ default 1 if !SMP
+
+config NR_CPUS_DEFAULT
+ int
+ depends on X86_32
+ default 8 if SMP
+ default 1 if !SMP
+
+config NR_CPUS_DEFAULT
+ int
+ depends on X86_64
+ default 8192 if MAXSMP
+ default 64 if SMP
+ default 1 if !SMP
+
config NR_CPUS
int "Maximum number of CPUs" if SMP && !MAXSMP
- range 2 8 if SMP && X86_32 && !X86_BIGSMP
- range 2 512 if SMP && !MAXSMP
- default "1" if !SMP
- default "4096" if MAXSMP
- default "32" if SMP && (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000)
- default "8" if SMP
- ---help---
+ range NR_CPUS_RANGE_BEGIN NR_CPUS_RANGE_END
+ default NR_CPUS_DEFAULT
+ help
This allows you to specify the maximum number of CPUs which this
- kernel will support. The maximum supported value is 512 and the
+ kernel will support. If CPUMASK_OFFSTACK is enabled, the maximum
+ supported value is 8192, otherwise the maximum value is 512. The
minimum value which makes sense is 2.
- This is purely to save memory - each supported CPU adds
- approximately eight kilobytes to the kernel image.
+ This is purely to save memory: each supported CPU adds about 8KB
+ to the kernel image.
+
+config SCHED_MC_PRIO
+ bool "CPU core priorities scheduler support"
+ depends on SCHED_MC
+ select X86_INTEL_PSTATE if CPU_SUP_INTEL
+ select X86_AMD_PSTATE if CPU_SUP_AMD && ACPI
+ select CPU_FREQ
+ default y
+ help
+ Intel Turbo Boost Max Technology 3.0 enabled CPUs have a
+ core ordering determined at manufacturing time, which allows
+ certain cores to reach higher turbo frequencies (when running
+ single threaded workloads) than others.
-config SCHED_SMT
- bool "SMT (Hyperthreading) scheduler support"
- depends on X86_HT
- ---help---
- SMT scheduler support improves the CPU scheduler's decision making
- when dealing with Intel Pentium 4 chips with HyperThreading at a
- cost of slightly increased overhead in some places. If unsure say
- N here.
+ Enabling this kernel feature teaches the scheduler about
+ the TBM3 (aka ITMT) priority order of the CPU cores and adjusts the
+ scheduler's CPU selection logic accordingly, so that higher
+ overall system performance can be achieved.
-config SCHED_MC
- def_bool y
- prompt "Multi-core scheduler support"
- depends on X86_HT
- ---help---
- Multi-core scheduler support improves the CPU scheduler's decision
- making when dealing with multi-core CPU chips at a cost of slightly
- increased overhead in some places. If unsure say N here.
+	This option will have no effect on CPUs that lack this feature.
-source "kernel/Kconfig.preempt"
+ If unsure say Y here.
+
+config UP_LATE_INIT
+ def_bool y
+ depends on !SMP && X86_LOCAL_APIC
config X86_UP_APIC
- bool "Local APIC support on uniprocessors"
- depends on X86_32 && !SMP && !X86_32_NON_STANDARD
- ---help---
+ bool "Local APIC support on uniprocessors" if !PCI_MSI
+ default PCI_MSI
+ depends on X86_32 && !SMP
+ help
A local APIC (Advanced Programmable Interrupt Controller) is an
integrated interrupt controller in the CPU. If you have a single-CPU
system which has a processor with a local APIC, you can say Y here to
@@ -731,7 +1098,7 @@ config X86_UP_APIC
config X86_UP_IOAPIC
bool "IO-APIC support on uniprocessors"
depends on X86_UP_APIC
- ---help---
+ help
An IO-APIC (I/O Advanced Programmable Interrupt Controller) is an
SMP-capable replacement for PC-style interrupt controllers. Most
SMP systems and many recent uniprocessor systems have one.
@@ -742,21 +1109,24 @@ config X86_UP_IOAPIC
config X86_LOCAL_APIC
def_bool y
- depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC
+ depends on X86_64 || SMP || X86_UP_APIC || PCI_MSI
+ select IRQ_DOMAIN_HIERARCHY
-config X86_IO_APIC
+config ACPI_MADT_WAKEUP
def_bool y
- depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC
+ depends on X86_64
+ depends on ACPI
+ depends on SMP
+ depends on X86_LOCAL_APIC
-config X86_VISWS_APIC
+config X86_IO_APIC
def_bool y
- depends on X86_32 && X86_VISWS
+ depends on X86_LOCAL_APIC || X86_UP_IOAPIC
config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
bool "Reroute for broken boot IRQs"
- default n
depends on X86_IO_APIC
- ---help---
+ help
This option enables a workaround that fixes a source of
spurious interrupts. This is recommended when threaded
interrupt handling is used on systems where the generation of
@@ -777,113 +1147,150 @@ config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
increased on these systems.
config X86_MCE
- bool "Machine Check Exception"
- ---help---
- Machine Check Exception support allows the processor to notify the
- kernel if it detects a problem (e.g. overheating, component failure).
+ bool "Machine Check / overheating reporting"
+ select GENERIC_ALLOCATOR
+ default y
+ help
+ Machine Check support allows the processor to notify the
+ kernel if it detects a problem (e.g. overheating, data corruption).
The action the kernel takes depends on the severity of the problem,
- ranging from a warning message on the console, to halting the machine.
- Your processor must be a Pentium or newer to support this - check the
- flags in /proc/cpuinfo for mce. Note that some older Pentium systems
- have a design flaw which leads to false MCE events - hence MCE is
- disabled on all P5 processors, unless explicitly enabled with "mce"
- as a boot argument. Similarly, if MCE is built in and creates a
- problem on some new non-standard machine, you can boot with "nomce"
- to disable it. MCE support simply ignores non-MCE processors like
- the 386 and 486, so nearly everyone can say Y here.
-
-config X86_OLD_MCE
- depends on X86_32 && X86_MCE
- bool "Use legacy machine check code (will go away)"
- default n
- select X86_ANCIENT_MCE
- ---help---
- Use the old i386 machine check code. This is merely intended for
- testing in a transition period. Try this if you run into any machine
- check related software problems, but report the problem to
- linux-kernel. When in doubt say no.
-
-config X86_NEW_MCE
+ ranging from warning messages to halting the machine.
+
+config X86_MCELOG_LEGACY
+ bool "Support for deprecated /dev/mcelog character device"
depends on X86_MCE
- bool
- default y if (!X86_OLD_MCE && X86_32) || X86_64
+ help
+ Enable support for /dev/mcelog which is needed by the old mcelog
+ userspace logging daemon. Consider switching to the new generation
+ rasdaemon solution.
config X86_MCE_INTEL
def_bool y
prompt "Intel MCE features"
- depends on X86_NEW_MCE && X86_LOCAL_APIC
- ---help---
- Additional support for intel specific MCE features such as
- the thermal monitor.
+ depends on X86_MCE && X86_LOCAL_APIC
+ help
+	  Additional support for Intel-specific MCE features such as
+ the thermal monitor.
config X86_MCE_AMD
def_bool y
prompt "AMD MCE features"
- depends on X86_NEW_MCE && X86_LOCAL_APIC
- ---help---
- Additional support for AMD specific MCE features such as
- the DRAM Error Threshold.
+ depends on X86_MCE && X86_LOCAL_APIC
+ help
+	  Additional support for AMD-specific MCE features such as
+ the DRAM Error Threshold.
config X86_ANCIENT_MCE
- def_bool n
- depends on X86_32
- prompt "Support for old Pentium 5 / WinChip machine checks"
- ---help---
+ bool "Support for old Pentium 5 / WinChip machine checks"
+ depends on X86_32 && X86_MCE
+ help
Include support for machine check handling on old Pentium 5 or WinChip
- systems. These typically need to be enabled explicitely on the command
+ systems. These typically need to be enabled explicitly on the command
line.
config X86_MCE_THRESHOLD
depends on X86_MCE_AMD || X86_MCE_INTEL
- bool
- default y
+ def_bool y
config X86_MCE_INJECT
- depends on X86_NEW_MCE
+ depends on X86_MCE && X86_LOCAL_APIC && DEBUG_FS
tristate "Machine check injector support"
- ---help---
+ help
Provide support for injecting machine checks for testing purposes.
If you don't know what a machine check is and you don't do kernel
QA it is safe to say n.
-config X86_MCE_NONFATAL
- tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
- depends on X86_OLD_MCE
- ---help---
- Enabling this feature starts a timer that triggers every 5 seconds which
- will look at the machine check registers to see if anything happened.
- Non-fatal problems automatically get corrected (but still logged).
- Disable this if you don't want to see these messages.
- Seeing the messages this option prints out may be indicative of dying
- or out-of-spec (ie, overclocked) hardware.
- This option only does something on certain CPUs.
- (AMD Athlon/Duron and Intel Pentium 4)
-
-config X86_MCE_P4THERMAL
- bool "check for P4 thermal throttling interrupt."
- depends on X86_OLD_MCE && X86_MCE && (X86_UP_APIC || SMP)
- ---help---
- Enabling this feature will cause a message to be printed when the P4
- enters thermal throttling.
-
-config X86_THERMAL_VECTOR
- def_bool y
- depends on X86_MCE_P4THERMAL || X86_MCE_INTEL
+source "arch/x86/events/Kconfig"
+
+config X86_LEGACY_VM86
+ bool "Legacy VM86 support"
+ depends on X86_32
+ help
+ This option allows user programs to put the CPU into V8086
+ mode, which is an 80286-era approximation of 16-bit real mode.
+
+ Some very old versions of X and/or vbetool require this option
+ for user mode setting. Similarly, DOSEMU will use it if
+ available to accelerate real mode DOS programs. However, any
+ recent version of DOSEMU, X, or vbetool should be fully
+ functional even without kernel VM86 support, as they will all
+ fall back to software emulation. Nevertheless, if you are using
+ a 16-bit DOS program where 16-bit performance matters, vm86
+ mode might be faster than emulation and you might want to
+ enable this option.
+
+ Note that any app that works on a 64-bit kernel is unlikely to
+ need this option, as 64-bit kernels don't, and can't, support
+ V8086 mode. This option is also unrelated to 16-bit protected
+ mode and is not needed to run most 16-bit programs under Wine.
+
+ Enabling this option increases the complexity of the kernel
+ and slows down exception handling a tiny bit.
+
+ If unsure, say N here.
config VM86
- bool "Enable VM86 support" if EMBEDDED
+ bool
+ default X86_LEGACY_VM86
+
+config X86_16BIT
+ bool "Enable support for 16-bit segments" if EXPERT
default y
- depends on X86_32
- ---help---
- This option is required by programs like DOSEMU to run 16-bit legacy
- code on X86 processors. It also may be needed by software like
- XFree86 to initialize some video cards via BIOS. Disabling this
- option saves about 6k.
+ depends on MODIFY_LDT_SYSCALL
+ help
+ This option is required by programs like Wine to run 16-bit
+ protected mode legacy code on x86 processors. Disabling
+ this option saves about 300 bytes on i386, or around 6K text
+	plus 16K runtime memory on x86-64.
+
+config X86_ESPFIX32
+ def_bool y
+ depends on X86_16BIT && X86_32
+
+config X86_ESPFIX64
+ def_bool y
+ depends on X86_16BIT && X86_64
+
+config X86_VSYSCALL_EMULATION
+ bool "Enable vsyscall emulation" if EXPERT
+ default y
+ depends on X86_64
+ help
+ This enables emulation of the legacy vsyscall page. Disabling
+ it is roughly equivalent to booting with vsyscall=none, except
+ that it will also disable the helpful warning if a program
+ tries to use a vsyscall. With this option set to N, offending
+ programs will just segfault, citing addresses of the form
+ 0xffffffffff600?00.
+
+ This option is required by many programs built before 2013, and
+ care should be used even with newer programs if set to N.
+
+ Disabling this option saves about 7K of kernel size and
+ possibly 4K of additional runtime pagetable memory.
+
+config X86_IOPL_IOPERM
+ bool "IOPERM and IOPL Emulation"
+ default y
+ help
+ This enables the ioperm() and iopl() syscalls which are necessary
+ for legacy applications.
+
+	Legacy IOPL support is an overbroad mechanism which, aside from
+	granting access to all 65536 I/O ports, also allows user space to
+	disable interrupts. To gain this access the caller needs CAP_SYS_RAWIO
+	capabilities and permission from potentially active security
+	modules.
+
+	The emulation restricts the functionality of the syscall to
+	allowing only full-range I/O port access, but prevents user space
+	from disabling interrupts, an ability the hardware IOPL mechanism
+	would otherwise grant.
config TOSHIBA
tristate "Toshiba Laptop support"
depends on X86_32
- ---help---
+ help
This adds a driver to safely access the System Management Mode of
the CPU on Toshiba portables with a genuine Toshiba BIOS. It does
not work on models with a Phoenix BIOS. The System Management Mode
@@ -896,30 +1303,10 @@ config TOSHIBA
Say Y if you intend to run this kernel on a Toshiba portable.
Say N otherwise.
-config I8K
- tristate "Dell laptop support"
- ---help---
- This adds a driver to safely access the System Management Mode
- of the CPU on the Dell Inspiron 8000. The System Management Mode
- is used to read cpu temperature and cooling fan status and to
- control the fans on the I8K portables.
-
- This driver has been tested only on the Inspiron 8000 but it may
- also work with other Dell laptops. You can force loading on other
- models by passing the parameter `force=1' to the module. Use at
- your own risk.
-
- For information on utilities to make use of this driver see the
- I8K Linux utilities web site at:
- <http://people.debian.org/~dz/i8k/>
-
- Say Y if you intend to run this kernel on a Dell Inspiron 8000.
- Say N otherwise.
-
config X86_REBOOTFIXUPS
bool "Enable X86 board specific fixups for reboot"
depends on X86_32
- ---help---
+ help
This enables chipset and/or board specific fixups to be done
in order to get reboot to work correctly. This is only needed on
some combinations of hardware and BIOS. The symptom, for which
@@ -934,51 +1321,61 @@ config X86_REBOOTFIXUPS
Say N otherwise.
config MICROCODE
- tristate "/dev/cpu/microcode - microcode support"
- select FW_LOADER
- ---help---
- If you say Y here, you will be able to update the microcode on
- certain Intel and AMD processors. The Intel support is for the
- IA32 family, e.g. Pentium Pro, Pentium II, Pentium III,
- Pentium 4, Xeon etc. The AMD support is for family 0x10 and
- 0x11 processors, e.g. Opteron, Phenom and Turion 64 Ultra.
- You will obviously need the actual microcode binary data itself
- which is not shipped with the Linux kernel.
-
- This option selects the general module only, you need to select
- at least one vendor specific module as well.
+ def_bool y
+ depends on CPU_SUP_AMD || CPU_SUP_INTEL
+ select CRYPTO_LIB_SHA256 if CPU_SUP_AMD
- To compile this driver as a module, choose M here: the
- module will be called microcode.
+config MICROCODE_INITRD32
+ def_bool y
+ depends on MICROCODE && X86_32 && BLK_DEV_INITRD
-config MICROCODE_INTEL
- bool "Intel microcode patch loading support"
- depends on MICROCODE
- default MICROCODE
- select FW_LOADER
- ---help---
- This options enables microcode patch loading support for Intel
- processors.
-
- For latest news and information on obtaining all the required
- Intel ingredients for this driver, check:
- <http://www.urbanmyth.org/microcode/>.
-
-config MICROCODE_AMD
- bool "AMD microcode patch loading support"
- depends on MICROCODE
- select FW_LOADER
- ---help---
- If you select this option, microcode patch loading support for AMD
- processors will be enabled.
+config MICROCODE_LATE_LOADING
+ bool "Late microcode loading (DANGEROUS)"
+ default n
+ depends on MICROCODE && SMP
+ help
+	  Loading microcode late, when the system is up and executing instructions,
+	  is a tricky business and should be avoided if possible. Just the sequence
+	  of synchronizing all cores and SMT threads is one fragile dance which does
+	  not guarantee that cores won't softlock after the loading. Therefore,
+ use this at your own risk. Late loading taints the kernel unless the
+ microcode header indicates that it is safe for late loading via the
+ minimal revision check. This minimal revision check can be enforced on
+ the kernel command line with "microcode=force_minrev".
+
+config MICROCODE_LATE_FORCE_MINREV
+ bool "Enforce late microcode loading minimal revision check"
+ default n
+ depends on MICROCODE_LATE_LOADING
+ help
+	  To prevent users from late-loading microcode that modifies features
+	  already in use, newer microcode patches have a minimum revision field
+	  in the microcode header, which tells the kernel the minimum
+	  revision that must be active in the CPU before that new microcode
+	  can safely be loaded late into the running system. If disabled, the
+	  check will not be enforced, but the kernel will be tainted when the
+	  minimal revision check fails.
-config MICROCODE_OLD_INTERFACE
- def_bool y
+ This minimal revision check can also be controlled via the
+ "microcode=force_minrev" parameter on the kernel command line.
+
+ If unsure say Y.
+
+config MICROCODE_DBG
+ bool "Enable microcode loader debugging"
+ default n
depends on MICROCODE
+ help
+	  Enable code which allows debugging the microcode loader in
+	  a guest: patch loading is simulated, but everything else
+	  related to patch parsing and handling is done as on bare metal,
+	  with the purpose of debugging solely the software side of things.
+
+ You almost certainly want to say n here.
config X86_MSR
tristate "/dev/cpu/*/msr - Model-specific register support"
- ---help---
+ help
This device gives privileged processes access to the x86
Model-Specific Registers (MSRs). It is a character device with
major 202 and minors 0 to 31 for /dev/cpu/0/msr to /dev/cpu/31/msr.
@@ -987,29 +1384,17 @@ config X86_MSR
config X86_CPUID
tristate "/dev/cpu/*/cpuid - CPU information support"
- ---help---
+ help
This device gives processes access to the x86 CPUID instruction to
be executed on a specific processor. It is a character device
with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to
/dev/cpu/31/cpuid.
-config X86_CPU_DEBUG
- tristate "/sys/kernel/debug/x86/cpu/* - CPU Debug support"
- ---help---
- If you select this option, this will provide various x86 CPUs
- information through debugfs.
-
-choice
- prompt "High Memory Support"
- default HIGHMEM4G if !X86_NUMAQ
- default HIGHMEM64G if X86_NUMAQ
+config HIGHMEM4G
+ bool "High Memory Support"
depends on X86_32
-
-config NOHIGHMEM
- bool "off"
- depends on !X86_NUMAQ
- ---help---
- Linux can use up to 64 Gigabytes of physical memory on x86 systems.
+ help
+ Linux can use up to 4 Gigabytes of physical memory on x86 systems.
However, the address space of 32-bit x86 processors is only 4
Gigabytes large. That means that, if you have a large amount of
physical memory, not all of it can be "permanently mapped" by the
@@ -1025,46 +1410,15 @@ config NOHIGHMEM
possible.
If the machine has between 1 and 4 Gigabytes physical RAM, then
- answer "4GB" here.
-
- If more than 4 Gigabytes is used then answer "64GB" here. This
- selection turns Intel PAE (Physical Address Extension) mode on.
- PAE implements 3-level paging on IA32 processors. PAE is fully
- supported by Linux, PAE mode is implemented on all recent Intel
- processors (Pentium Pro and better). NOTE: If you say "64GB" here,
- then the kernel will not boot on CPUs that don't support PAE!
-
- The actual amount of total physical memory will either be
- auto detected or can be forced by using a kernel command line option
- such as "mem=256M". (Try "man bootparam" or see the documentation of
- your boot loader (lilo or loadlin) about how to pass options to the
- kernel at boot time.)
+ answer "Y" here.
- If unsure, say "off".
-
-config HIGHMEM4G
- bool "4GB"
- depends on !X86_NUMAQ
- ---help---
- Select this if you have a 32-bit processor and between 1 and 4
- gigabytes of physical RAM.
-
-config HIGHMEM64G
- bool "64GB"
- depends on !M386 && !M486
- select X86_PAE
- ---help---
- Select this if you have a 32-bit processor and more than 4
- gigabytes of physical RAM.
-
-endchoice
+ If unsure, say N.
choice
- depends on EXPERIMENTAL
- prompt "Memory split" if EMBEDDED
+ prompt "Memory split" if EXPERT
default VMSPLIT_3G
depends on X86_32
- ---help---
+ help
Select the desired split between kernel and user memory.
If the address range available to the kernel is less than the
@@ -1104,38 +1458,65 @@ config PAGE_OFFSET
depends on X86_32
config HIGHMEM
- def_bool y
- depends on X86_32 && (HIGHMEM64G || HIGHMEM4G)
+ def_bool HIGHMEM4G
config X86_PAE
bool "PAE (Physical Address Extension) Support"
- depends on X86_32 && !HIGHMEM4G
- ---help---
+ depends on X86_32 && X86_HAVE_PAE
+ select PHYS_ADDR_T_64BIT
+ help
PAE is required for NX support, and furthermore enables
larger swapspace support for non-overcommit purposes. It
has the cost of more pagetable lookup overhead, and also
consumes more pagetable space per process.
-config ARCH_PHYS_ADDR_T_64BIT
- def_bool X86_64 || X86_PAE
-
-config DIRECT_GBPAGES
- bool "Enable 1GB pages for kernel pagetables" if EMBEDDED
- default y
+config X86_DIRECT_GBPAGES
+ def_bool y
depends on X86_64
- ---help---
- Allow the kernel linear mapping to use 1GB pages on CPUs that
- support it. This can improve the kernel's performance a tiny bit by
- reducing TLB pressure. If in doubt, say "Y".
+ help
+ Certain kernel features effectively disable kernel
+ linear 1 GB mappings (even if the CPU otherwise
+ supports them), so don't confuse the user by printing
+ that we have them enabled.
+
+config X86_CPA_STATISTICS
+ bool "Enable statistic for Change Page Attribute"
+ depends on DEBUG_FS
+ help
+ Expose statistics about the Change Page Attribute mechanism, which
+ helps to determine the effectiveness of preserving large and huge
+ page mappings when mapping protections are changed.
+
+config X86_MEM_ENCRYPT
+ select ARCH_HAS_FORCE_DMA_UNENCRYPTED
+ select DYNAMIC_PHYSICAL_MASK
+ def_bool n
+
+config AMD_MEM_ENCRYPT
+ bool "AMD Secure Memory Encryption (SME) support"
+ depends on X86_64 && CPU_SUP_AMD
+ depends on EFI_STUB
+ select DMA_COHERENT_POOL
+ select ARCH_USE_MEMREMAP_PROT
+ select INSTRUCTION_DECODER
+ select ARCH_HAS_CC_PLATFORM
+ select X86_MEM_ENCRYPT
+ select UNACCEPTED_MEMORY
+ select CRYPTO_LIB_AESGCM
+ help
+ Say yes to enable support for the encryption of system memory.
+ This requires an AMD processor that supports Secure Memory
+ Encryption (SME).
# Common NUMA Features
config NUMA
- bool "Numa Memory Allocation and Scheduler Support"
+ bool "NUMA Memory Allocation and Scheduler Support"
depends on SMP
- depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL)
- default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP)
- ---help---
- Enable NUMA (Non Uniform Memory Access) support.
+ depends on X86_64
+ select USE_PERCPU_NUMA_NODE_ID
+ select OF_NUMA if OF
+ help
+ Enable NUMA (Non-Uniform Memory Access) support.
The kernel will try to allocate memory used by a CPU on the
local memory controller of the CPU and add some more
@@ -1144,123 +1525,92 @@ config NUMA
For 64-bit this is recommended if the system is Intel Core i7
(or later), AMD Opteron, or EM64T NUMA.
- For 32-bit this is only needed on (rare) 32-bit-only platforms
- that support NUMA topologies, such as NUMAQ / Summit, or if you
- boot a 32-bit kernel on a 64-bit NUMA platform.
-
Otherwise, you should say N.
-comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI"
- depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI)
-
-config K8_NUMA
+config AMD_NUMA
def_bool y
prompt "Old style AMD Opteron NUMA detection"
depends on X86_64 && NUMA && PCI
- ---help---
- Enable K8 NUMA node topology detection. You should say Y here if
- you have a multi processor AMD K8 system. This uses an old
- method to read the NUMA configuration directly from the builtin
- Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA
- instead, which also takes priority if both are compiled in.
+ help
+ Enable AMD NUMA node topology detection. You should say Y here if
+	  you have a multiprocessor AMD system. This uses an old method to
+ read the NUMA configuration directly from the builtin Northbridge
+ of Opteron. It is recommended to use X86_64_ACPI_NUMA instead,
+ which also takes priority if both are compiled in.
config X86_64_ACPI_NUMA
def_bool y
prompt "ACPI NUMA detection"
depends on X86_64 && NUMA && ACPI && PCI
select ACPI_NUMA
- ---help---
+ help
Enable ACPI SRAT based node topology detection.
-# Some NUMA nodes have memory ranges that span
-# other nodes. Even though a pfn is valid and
-# between a node's start and end pfns, it may not
-# reside on that node. See memmap_init_zone()
-# for details.
-config NODES_SPAN_OTHER_NODES
- def_bool y
- depends on X86_64_ACPI_NUMA
-
-config NUMA_EMU
- bool "NUMA emulation"
- depends on X86_64 && NUMA
- ---help---
- Enable NUMA emulation. A flat machine will be split
- into virtual nodes when booted with "numa=fake=N", where N is the
- number of nodes. This is only useful for debugging.
-
config NODES_SHIFT
int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP
- range 1 9
- default "9" if MAXSMP
+ range 1 10
+ default "10" if MAXSMP
default "6" if X86_64
- default "4" if X86_NUMAQ
default "3"
- depends on NEED_MULTIPLE_NODES
- ---help---
+ depends on NUMA
+ help
Specify the maximum number of NUMA Nodes available on the target
system. Increases memory reserved to accommodate various tables.
-config HAVE_ARCH_BOOTMEM
- def_bool y
- depends on X86_32 && NUMA
-
-config ARCH_HAVE_MEMORY_PRESENT
- def_bool y
- depends on X86_32 && DISCONTIGMEM
-
-config NEED_NODE_MEMMAP_SIZE
- def_bool y
- depends on X86_32 && (DISCONTIGMEM || SPARSEMEM)
-
-config HAVE_ARCH_ALLOC_REMAP
- def_bool y
- depends on X86_32 && NUMA
-
config ARCH_FLATMEM_ENABLE
def_bool y
- depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && !NUMA
-
-config ARCH_DISCONTIGMEM_ENABLE
- def_bool y
- depends on NUMA && X86_32
-
-config ARCH_DISCONTIGMEM_DEFAULT
- def_bool y
- depends on NUMA && X86_32
-
-config ARCH_SPARSEMEM_DEFAULT
- def_bool y
- depends on X86_64
+ depends on X86_32 && !NUMA
config ARCH_SPARSEMEM_ENABLE
def_bool y
- depends on X86_64 || NUMA || (EXPERIMENTAL && X86_32) || X86_32_NON_STANDARD
select SPARSEMEM_STATIC if X86_32
select SPARSEMEM_VMEMMAP_ENABLE if X86_64
+config ARCH_SPARSEMEM_DEFAULT
+ def_bool X86_64 || (NUMA && X86_32)
+
config ARCH_SELECT_MEMORY_MODEL
def_bool y
- depends on ARCH_SPARSEMEM_ENABLE
+ depends on ARCH_SPARSEMEM_ENABLE && ARCH_FLATMEM_ENABLE
config ARCH_MEMORY_PROBE
- def_bool X86_64
+ bool "Enable sysfs memory/probe interface"
depends on MEMORY_HOTPLUG
+ help
+ This option enables a sysfs memory/probe interface for testing.
+ See Documentation/admin-guide/mm/memory-hotplug.rst for more information.
+ If you are unsure how to answer this question, answer N.
+
+config ARCH_PROC_KCORE_TEXT
+ def_bool y
+ depends on X86_64 && PROC_KCORE
-source "mm/Kconfig"
+config ILLEGAL_POINTER_VALUE
+ hex
+ default 0 if X86_32
+ default 0xdead000000000000 if X86_64
-config HIGHPTE
- bool "Allocate 3rd-level pagetables from highmem"
- depends on X86_32 && (HIGHMEM4G || HIGHMEM64G)
- ---help---
- The VM uses one page table entry for each page of physical memory.
- For systems with a lot of RAM, this can be wasteful of precious
- low memory. Setting this option will put user-space page table
- entries in high memory.
+config X86_PMEM_LEGACY_DEVICE
+ bool
+
+config X86_PMEM_LEGACY
+ tristate "Support non-standard NVDIMMs and ADR protected memory"
+ depends on PHYS_ADDR_T_64BIT
+ depends on BLK_DEV
+ select X86_PMEM_LEGACY_DEVICE
+ select NUMA_KEEP_MEMINFO if NUMA
+ select LIBNVDIMM
+ help
+	  Treat memory marked with the non-standard e820 type of 12 (as used
+	  by the Intel Sandy Bridge-EP reference BIOS) as protected memory.
+ The kernel will offer these regions to the 'pmem' driver so
+ they can be used for persistent storage.
+
+ Say Y if unsure.
config X86_CHECK_BIOS_CORRUPTION
bool "Check for low memory corruption"
- ---help---
+ help
Periodically check for memory corruption in low memory, which
is suspected to be caused by BIOS. Even when enabled in the
configuration, it is disabled at runtime. Enable it by
@@ -1268,7 +1618,7 @@ config X86_CHECK_BIOS_CORRUPTION
line. By default it scans the low 64k of memory every 60
seconds; see the memory_corruption_check_size and
memory_corruption_check_period parameters in
- Documentation/kernel-parameters.txt to adjust this.
+ Documentation/admin-guide/kernel-parameters.rst to adjust this.
When enabled with the default parameters, this option has
almost no overhead, as it reserves a relatively small amount
@@ -1284,34 +1634,15 @@ config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
bool "Set the default setting of memory_corruption_check"
depends on X86_CHECK_BIOS_CORRUPTION
default y
- ---help---
+ help
Set whether the default state of memory_corruption_check is
on or off.
-config X86_RESERVE_LOW_64K
- bool "Reserve low 64K of RAM on AMI/Phoenix BIOSen"
- default y
- ---help---
- Reserve the first 64K of physical RAM on BIOSes that are known
- to potentially corrupt that memory range. A numbers of BIOSes are
- known to utilize this area during suspend/resume, so it must not
- be used by the kernel.
-
- Set this to N if you are absolutely sure that you trust the BIOS
- to get all its memory reservations and usages right.
-
- If you have doubts about the BIOS (e.g. suspend/resume does not
- work or there's kernel crashes after certain hardware hotplug
- events) and it's not AMI or Phoenix, then you might want to enable
- X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check typical
- corruption patterns.
-
- Say Y if unsure.
-
config MATH_EMULATION
bool
- prompt "Math emulation" if X86_32
- ---help---
+ depends on MODIFY_LDT_SYSCALL
+ prompt "Math emulation" if X86_32 && (M486SX || MELAN)
+ help
Linux can emulate a math coprocessor (used for floating point
operations) if you don't have one. 486DX and Pentium processors have
a math coprocessor built in, 486SX and 386 do not, unless you added
@@ -1335,8 +1666,9 @@ config MATH_EMULATION
kernel, it won't hurt.
config MTRR
- bool "MTRR (Memory Type Range Register) support"
- ---help---
+ def_bool y
+ prompt "MTRR (Memory Type Range Register) support" if EXPERT
+ help
On Intel P6 family processors (Pentium Pro, Pentium II and later)
the Memory Type Range Registers (MTRRs) may be used to control
processor access to memory ranges. This is most useful if you have
@@ -1366,13 +1698,13 @@ config MTRR
You can safely say Y even if your machine doesn't have MTRRs, you'll
just add about 9 KB to your kernel.
- See <file:Documentation/x86/mtrr.txt> for more information.
+ See <file:Documentation/arch/x86/mtrr.rst> for more information.
config MTRR_SANITIZER
def_bool y
prompt "MTRR cleanup support"
depends on MTRR
- ---help---
+ help
Convert MTRR layout from continuous to discrete, so X drivers can
add writeback entries.
@@ -1387,7 +1719,7 @@ config MTRR_SANITIZER_ENABLE_DEFAULT
range 0 1
default "0"
depends on MTRR_SANITIZER
- ---help---
+ help
Default value for the MTRR cleanup enable flag.
config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT
@@ -1395,15 +1727,16 @@ config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT
range 0 7
default "1"
depends on MTRR_SANITIZER
- ---help---
+ help
+	  Default number of spare MTRR entries set aside for cleanup; it can
+	  be changed via mtrr_spare_reg_nr=N on the kernel command line.
config X86_PAT
- bool
- prompt "x86 PAT support"
+ def_bool y
+ prompt "x86 PAT support" if EXPERT
depends on MTRR
- ---help---
+ select ARCH_USES_PG_ARCH_2
+ help
Use PAT attributes to setup page level cache control.
PATs are the modern equivalents of MTRRs and are much more
@@ -1414,10 +1747,177 @@ config X86_PAT
If unsure, say Y.
+config X86_UMIP
+ def_bool y
+ prompt "User Mode Instruction Prevention" if EXPERT
+ help
+ User Mode Instruction Prevention (UMIP) is a security feature in
+ some x86 processors. If enabled, a general protection fault is
+ issued if the SGDT, SLDT, SIDT, SMSW or STR instructions are
+ executed in user mode. These instructions unnecessarily expose
+ information about the hardware state.
+
+ The vast majority of applications do not use these instructions.
+ For the very few that do, software emulation is provided in
+ specific cases in protected and virtual-8086 modes. Emulated
+ results are dummy.
+
+config CC_HAS_IBT
+ # GCC >= 9 and binutils >= 2.29
+ # Retpoline check to work around https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93654
+ def_bool ((CC_IS_GCC && $(cc-option, -fcf-protection=branch -mindirect-branch-register)) || CC_IS_CLANG) && \
+ $(as-instr,endbr64)
+
+config X86_CET
+ def_bool n
+ help
+ CET features configured (Shadow stack or IBT)
+
+config X86_KERNEL_IBT
+ prompt "Indirect Branch Tracking"
+ def_bool y
+ depends on X86_64 && CC_HAS_IBT && HAVE_OBJTOOL
+ select OBJTOOL
+ select X86_CET
+ help
+	  Build the kernel with support for Indirect Branch Tracking, a
+	  hardware-supported coarse-grain forward-edge Control Flow Integrity
+	  protection. It enforces that all indirect calls must land on
+	  an ENDBR instruction; as such, the compiler will instrument the
+	  code with them to make this happen.
+
+ In addition to building the kernel with IBT, seal all functions that
+ are not indirect call targets, avoiding them ever becoming one.
+
+	  This requires LTO-like objtool runs and will slow down the build. It
+ does significantly reduce the number of ENDBR instructions in the
+ kernel image.
+
+config X86_INTEL_MEMORY_PROTECTION_KEYS
+ prompt "Memory Protection Keys"
+ def_bool y
+ # Note: only available in 64-bit mode
+ depends on X86_64 && (CPU_SUP_INTEL || CPU_SUP_AMD)
+ select ARCH_USES_HIGH_VMA_FLAGS
+ select ARCH_HAS_PKEYS
+ help
+ Memory Protection Keys provides a mechanism for enforcing
+ page-based protections, but without requiring modification of the
+ page tables when an application changes protection domains.
+
+ For details, see Documentation/core-api/protection-keys.rst
+
+ If unsure, say y.
+
+config ARCH_PKEY_BITS
+ int
+ default 4
+
+choice
+ prompt "TSX enable mode"
+ depends on CPU_SUP_INTEL
+ default X86_INTEL_TSX_MODE_OFF
+ help
+ Intel's TSX (Transactional Synchronization Extensions) feature
+	  allows optimizing locking protocols through lock elision, which
+	  can lead to a noticeable performance boost.
+
+ On the other hand it has been shown that TSX can be exploited
+ to form side channel attacks (e.g. TAA) and chances are there
+ will be more of those attacks discovered in the future.
+
+ Therefore TSX is not enabled by default (aka tsx=off). An admin
+	  might override this decision with the tsx=on command line parameter.
+ Even with TSX enabled, the kernel will attempt to enable the best
+ possible TAA mitigation setting depending on the microcode available
+ for the particular machine.
+
+	  This option allows setting the default tsx mode between tsx=on, =off
+ and =auto. See Documentation/admin-guide/kernel-parameters.txt for more
+ details.
+
+	  Say off if not sure, auto if TSX is in use but should only be used
+	  on safe platforms, or on if TSX is in use and the security aspect of
+	  tsx is not relevant.
+
+config X86_INTEL_TSX_MODE_OFF
+ bool "off"
+ help
+ TSX is disabled if possible - equals to tsx=off command line parameter.
+
+config X86_INTEL_TSX_MODE_ON
+ bool "on"
+ help
+ TSX is always enabled on TSX capable HW - equals the tsx=on command
+ line parameter.
+
+config X86_INTEL_TSX_MODE_AUTO
+ bool "auto"
+ help
+ TSX is enabled on TSX capable HW that is believed to be safe against
+	  side channel attacks - equals the tsx=auto command line parameter.
+endchoice
+
+config X86_SGX
+ bool "Software Guard eXtensions (SGX)"
+ depends on X86_64 && CPU_SUP_INTEL && X86_X2APIC
+ select CRYPTO_LIB_SHA256
+ select MMU_NOTIFIER
+ select NUMA_KEEP_MEMINFO if NUMA
+ select XARRAY_MULTI
+ help
+ Intel(R) Software Guard eXtensions (SGX) is a set of CPU instructions
+ that can be used by applications to set aside private regions of code
+ and data, referred to as enclaves. An enclave's private memory can
+ only be accessed by code running within the enclave. Accesses from
+ outside the enclave, including other enclaves, are disallowed by
+ hardware.
+
+ If unsure, say N.
+
+config X86_USER_SHADOW_STACK
+ bool "X86 userspace shadow stack"
+ depends on AS_WRUSS
+ depends on X86_64
+ select ARCH_USES_HIGH_VMA_FLAGS
+ select ARCH_HAS_USER_SHADOW_STACK
+ select X86_CET
+ help
+ Shadow stack protection is a hardware feature that detects function
+ return address corruption. This helps mitigate ROP attacks.
+ Applications must be enabled to use it, and old userspace does not
+ get protection "for free".
+
+ CPUs supporting shadow stacks were first released in 2020.
+
+ See Documentation/arch/x86/shstk.rst for more information.
+
+ If unsure, say N.
+
+config INTEL_TDX_HOST
+ bool "Intel Trust Domain Extensions (TDX) host support"
+ depends on CPU_SUP_INTEL
+ depends on X86_64
+ depends on KVM_INTEL
+ depends on X86_X2APIC
+ select ARCH_KEEP_MEMBLOCK
+ depends on CONTIG_ALLOC
+ depends on X86_MCE
+ help
+ Intel Trust Domain Extensions (TDX) protects guest VMs from malicious
+ host and certain physical attacks. This option enables necessary TDX
+ support in the host kernel to run confidential VMs.
+
+ If unsure, say N.
+
config EFI
bool "EFI runtime service support"
depends on ACPI
- ---help---
+ select UCS2_STRING
+ select EFI_RUNTIME_WRAPPERS
+ select ARCH_USE_MEMREMAP_PROT
+ select EFI_RUNTIME_MAP if KEXEC_CORE
+ help
This enables the kernel to use EFI runtime services that are
available (such as the EFI variable services).
@@ -1428,93 +1928,111 @@ config EFI
resultant kernel should continue to boot on existing non-EFI
platforms.
-config SECCOMP
+config EFI_STUB
+ bool "EFI stub support"
+ depends on EFI
+ select RELOCATABLE
+ help
+ This kernel feature allows a bzImage to be loaded directly
+ by EFI firmware without the use of a bootloader.
+
+ See Documentation/admin-guide/efi-stub.rst for more information.
+
+config EFI_HANDOVER_PROTOCOL
+ bool "EFI handover protocol (DEPRECATED)"
+ depends on EFI_STUB
+ default y
+ help
+ Select this in order to include support for the deprecated EFI
+ handover protocol, which defines alternative entry points into the
+ EFI stub. This is a practice that has no basis in the UEFI
+ specification, and requires a priori knowledge on the part of the
+ bootloader about Linux/x86 specific ways of passing the command line
+ and initrd, and where in memory those assets may be loaded.
+
+ If in doubt, say Y. Even though the corresponding support is not
+ present in upstream GRUB or other bootloaders, most distros build
+ GRUB with numerous downstream patches applied, and may rely on the
+	  handover protocol as a result.
+
+config EFI_MIXED
+ bool "EFI mixed-mode support"
+ depends on EFI_STUB && X86_64
+ help
+ Enabling this feature allows a 64-bit kernel to be booted
+ on a 32-bit firmware, provided that your CPU supports 64-bit
+ mode.
+
+ Note that it is not possible to boot a mixed-mode enabled
+ kernel via the EFI boot stub - a bootloader that supports
+ the EFI handover protocol must be used.
+
+ If unsure, say N.
+
+config EFI_RUNTIME_MAP
+ bool "Export EFI runtime maps to sysfs" if EXPERT
+ depends on EFI
+ help
+ Export EFI runtime memory regions to /sys/firmware/efi/runtime-map.
+ That memory map is required by the 2nd kernel to set up EFI virtual
+ mappings after kexec, but can also be used for debugging purposes.
+
+ See also Documentation/ABI/testing/sysfs-firmware-efi-runtime-map.
+
+source "kernel/Kconfig.hz"
+
+config ARCH_SUPPORTS_KEXEC
def_bool y
- prompt "Enable seccomp to safely compute untrusted bytecode"
- ---help---
- This kernel feature is useful for number crunching applications
- that may need to compute untrusted bytecode during their
- execution. By using pipes or other transports made available to
- the process as file descriptors supporting the read/write
- syscalls, it's possible to isolate those applications in
- their own address space using seccomp. Once seccomp is
- enabled via prctl(PR_SET_SECCOMP), it cannot be disabled
- and the task is only allowed to execute a few safe syscalls
- defined by each seccomp mode.
- If unsure, say Y. Only embedded should say N here.
+config ARCH_SUPPORTS_KEXEC_FILE
+ def_bool X86_64
-config CC_STACKPROTECTOR_ALL
- bool
+config ARCH_SELECTS_KEXEC_FILE
+ def_bool y
+ depends on KEXEC_FILE
+ select HAVE_IMA_KEXEC if IMA
+
+config ARCH_SUPPORTS_KEXEC_PURGATORY
+ def_bool y
+
+config ARCH_SUPPORTS_KEXEC_SIG
+ def_bool y
-config CC_STACKPROTECTOR
- bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)"
- select CC_STACKPROTECTOR_ALL
- ---help---
- This option turns on the -fstack-protector GCC feature. This
- feature puts, at the beginning of functions, a canary value on
- the stack just before the return address, and validates
- the value just before actually returning. Stack based buffer
- overflows (that need to overwrite this return address) now also
- overwrite the canary, which gets detected and the attack is then
- neutralized via a kernel panic.
-
- This feature requires gcc version 4.2 or above, or a distribution
- gcc with the feature backported. Older versions are automatically
- detected and for those versions, this configuration option is
- ignored. (and a warning is printed during bootup)
-
-source kernel/Kconfig.hz
-
-config KEXEC
- bool "kexec system call"
- ---help---
- kexec is a system call that implements the ability to shutdown your
- current kernel, and to start another kernel. It is like a reboot
- but it is independent of the system firmware. And like a reboot
- you can start any kernel with it, not just Linux.
-
- The name comes from the similarity to the exec system call.
-
- It is an ongoing process to be certain the hardware in a machine
- is properly shutdown, so do not be surprised if this code does not
- initially work for you. It may help to enable device hotplugging
- support. As of this writing the exact hardware interface is
- strongly in flux, so no good recommendation can be made.
-
-config CRASH_DUMP
- bool "kernel crash dumps"
- depends on X86_64 || (X86_32 && HIGHMEM)
- ---help---
- Generate crash dump after being started by kexec.
- This should be normally only set in special crash dump kernels
- which are loaded in the main kernel with kexec-tools into
- a specially reserved region and then later executed after
- a crash by kdump/kexec. The crash dump kernel must be compiled
- to a memory address not used by the main kernel or BIOS using
- PHYSICAL_START, or it must be built as a relocatable image
- (CONFIG_RELOCATABLE=y).
- For more details see Documentation/kdump/kdump.txt
-
-config KEXEC_JUMP
- bool "kexec jump (EXPERIMENTAL)"
- depends on EXPERIMENTAL
- depends on KEXEC && HIBERNATION
- ---help---
- Jump between original kernel and kexeced kernel and invoke
- code in physical address mode via KEXEC
+config ARCH_SUPPORTS_KEXEC_SIG_FORCE
+ def_bool y
+
+config ARCH_SUPPORTS_KEXEC_BZIMAGE_VERIFY_SIG
+ def_bool y
+
+config ARCH_SUPPORTS_KEXEC_JUMP
+ def_bool y
+
+config ARCH_SUPPORTS_KEXEC_HANDOVER
+ def_bool X86_64
+
+config ARCH_SUPPORTS_CRASH_DUMP
+ def_bool X86_64 || (X86_32 && HIGHMEM)
+
+config ARCH_DEFAULT_CRASH_DUMP
+ def_bool y
+
+config ARCH_SUPPORTS_CRASH_HOTPLUG
+ def_bool y
+
+config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+ def_bool CRASH_RESERVE
config PHYSICAL_START
- hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
+ hex "Physical address where the kernel is loaded" if (EXPERT || CRASH_DUMP)
default "0x1000000"
- ---help---
+ help
This gives the physical address where the kernel is loaded.
- If kernel is a not relocatable (CONFIG_RELOCATABLE=n) then
- bzImage will decompress itself to above physical address and
- run from there. Otherwise, bzImage will run from the address where
- it has been loaded by the boot loader and will ignore above physical
- address.
+ If the kernel is not relocatable (CONFIG_RELOCATABLE=n) then bzImage
+ will decompress itself to above physical address and run from there.
+ Otherwise, bzImage will run from the address where it has been loaded
+ by the boot loader. The only exception is if it is loaded below the
+ above physical address, in which case it will relocate itself there.
In normal kdump cases one does not have to set/change this option
as now bzImage can be compiled as a completely relocatable image
@@ -1532,7 +2050,7 @@ config PHYSICAL_START
the reserved region. In other words, it can be set based on
the "X" value as specified in the "crashkernel=YM@XM"
command line boot parameter passed to the panicked
- kernel. Please take a look at Documentation/kdump/kdump.txt
+ kernel. Please take a look at Documentation/admin-guide/kdump/kdump.rst
for more details about crash dumps.
Usage of bzImage for capturing the crash dump is recommended as
@@ -1548,7 +2066,7 @@ config PHYSICAL_START
config RELOCATABLE
bool "Build a relocatable kernel"
default y
- ---help---
+ help
This builds a kernel image that retains relocation information
so it can be loaded someplace besides the default 1MB.
The relocations tend to make the kernel binary about 10% larger,
@@ -1560,19 +2078,55 @@ config RELOCATABLE
Note: If CONFIG_RELOCATABLE=y, then the kernel runs from the address
it has been loaded at and the compile time physical address
- (CONFIG_PHYSICAL_START) is ignored.
+ (CONFIG_PHYSICAL_START) is used as the minimum location.
+
+config RANDOMIZE_BASE
+ bool "Randomize the address of the kernel image (KASLR)"
+ depends on RELOCATABLE
+ default y
+ help
+ In support of Kernel Address Space Layout Randomization (KASLR),
+ this randomizes the physical address at which the kernel image
+ is decompressed and the virtual address where the kernel
+ image is mapped, as a security feature that deters exploit
+ attempts relying on knowledge of the location of kernel
+ code internals.
+
+ On 64-bit, the kernel physical and virtual addresses are
+ randomized separately. The physical address will be anywhere
+ between 16MB and the top of physical memory (up to 64TB). The
+ virtual address will be randomized from 16MB up to 1GB (9 bits
+ of entropy). Note that this also reduces the memory space
+ available to kernel modules from 1.5GB to 1GB.
+
+ On 32-bit, the kernel physical and virtual addresses are
+ randomized together. They will be randomized from 16MB up to
+ 512MB (8 bits of entropy).
+
+ Entropy is generated using the RDRAND instruction if it is
+ supported. If RDTSC is supported, its value is mixed into
+ the entropy pool as well. If neither RDRAND nor RDTSC are
+ supported, then entropy is read from the i8254 timer. The
+ usable entropy is limited by the kernel being built using
+ 2GB addressing, and by PHYSICAL_ALIGN having a minimum of
+ 2MB. As a result, only 10 bits of entropy are
+ theoretically possible, but the implementations are further
+ limited due to memory layouts.
+
+ If unsure, say Y.
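# (Usage sketch, not upstream text: even with RANDOMIZE_BASE=y the
# randomization can be disabled for a single boot with the documented
# "nokaslr" kernel parameter, e.g.
#	linux /boot/vmlinuz root=/dev/sda1 nokaslr
# which is convenient when comparing kernel addresses across boots.)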
-# Relocation on x86-32 needs some additional build support
+# Relocation on x86 needs some additional build support
config X86_NEED_RELOCS
def_bool y
- depends on X86_32 && RELOCATABLE
+ depends on RANDOMIZE_BASE || (X86_32 && RELOCATABLE)
+ select ARCH_VMLINUX_NEEDS_RELOCS
config PHYSICAL_ALIGN
- hex
- prompt "Alignment value to which kernel should be aligned" if X86_32
- default "0x1000000"
- range 0x2000 0x1000000
- ---help---
+ hex "Alignment value to which kernel should be aligned"
+ default "0x200000"
+ range 0x2000 0x1000000 if X86_32
+ range 0x200000 0x1000000 if X86_64
+ help
This value puts the alignment restrictions on physical address
where kernel is loaded and run from. Kernel is compiled for an
address which meets above alignment restriction.
@@ -1589,35 +2143,130 @@ config PHYSICAL_ALIGN
end result is that kernel runs from a physical address meeting
above alignment restrictions.
+ On 32-bit this value must be a multiple of 0x2000. On 64-bit
+ this value must be a multiple of 0x200000.
+
Don't change this unless you know what you are doing.
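# (Worked example: with the 64-bit default of 0x200000 (2MB), a load
# address is acceptable iff addr % 0x200000 == 0, so 0x1000000 (16MB)
# qualifies while 0x1100000 does not and gets aligned upward by the
# relocation logic described above.)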
+config RANDOMIZE_MEMORY
+ bool "Randomize the kernel memory sections"
+ depends on X86_64
+ depends on RANDOMIZE_BASE
+ default RANDOMIZE_BASE
+ help
+ Randomizes the base virtual address of kernel memory sections
+ (physical memory mapping, vmalloc & vmemmap). This security feature
+ makes exploits relying on predictable memory locations less reliable.
+
+ The order of allocations remains unchanged. Entropy is generated in
+ the same way as RANDOMIZE_BASE. The current implementation, in the
+ optimal configuration, has on average 30,000 possible virtual
+ addresses for each memory section.
+
+ If unsure, say Y.
+
+config RANDOMIZE_MEMORY_PHYSICAL_PADDING
+ hex "Physical memory mapping padding" if EXPERT
+ depends on RANDOMIZE_MEMORY
+ default "0xa" if MEMORY_HOTPLUG
+ default "0x0"
+ range 0x1 0x40 if MEMORY_HOTPLUG
+ range 0x0 0x40
+ help
+ Define the padding in terabytes added to the existing physical
+ memory size during kernel memory randomization. It is useful
+ for memory hotplug support but reduces the entropy available for
+ address randomization.
+
+ If unsure, leave at the default value.
+
+config ADDRESS_MASKING
+ bool "Linear Address Masking support"
+ depends on X86_64
+ depends on COMPILE_TEST || !CPU_MITIGATIONS # wait for LASS
+ help
+ Linear Address Masking (LAM) modifies the checking that is applied
+ to 64-bit linear addresses, allowing software to use the
+ untranslated address bits for metadata.
+
+ The capability can be used for an efficient address sanitizer
+ (ASAN) implementation and for optimizations in JITs.
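# (A minimal userspace sketch, assuming the ARCH_ENABLE_TAGGED_ADDR
# arch_prctl interface from <asm/prctl.h>: a process opts in to LAM
# with e.g.
#	syscall(SYS_arch_prctl, ARCH_ENABLE_TAGGED_ADDR, 6);
# after which the granted upper pointer bits are ignored during
# address translation and are free to carry metadata.)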
+
config HOTPLUG_CPU
- bool "Support for hot-pluggable CPUs"
- depends on SMP && HOTPLUG
- ---help---
- Say Y here to allow turning CPUs off and on. CPUs can be
- controlled through /sys/devices/system/cpu.
- ( Note: power management support will enable this option
- automatically on SMP systems. )
- Say N if you want to disable CPU hotplug.
+ def_bool y
+ depends on SMP
config COMPAT_VDSO
- def_bool y
- prompt "Compat VDSO support"
- depends on X86_32 || IA32_EMULATION
- ---help---
- Map the 32-bit VDSO to the predictable old-style address too.
-
- Say N here if you are running a sufficiently recent glibc
- version (2.3.3 or later), to remove the high-mapped
- VDSO mapping and to exclusively use the randomized VDSO.
+ def_bool n
+ prompt "Workaround for glibc 2.3.2 / 2.3.3 (released in year 2003/2004)"
+ depends on COMPAT_32
+ help
+ Certain buggy versions of glibc will crash if they are
+ presented with a 32-bit vDSO that is not mapped at the address
+ indicated in its segment table.
- If unsure, say Y.
+ The bug was introduced by f866314b89d56845f55e6f365e18b31ec978ec3a
+ and fixed by 3b3ddb4f7db98ec9e912ccdf54d35df4aa30e04a and
+ 49ad572a70b8aeb91e57483a11dd1b77e31c4468. Glibc 2.3.3 is
+ the only released version with the bug, but OpenSUSE 9
+ contains a buggy "glibc 2.3.2".
+
+ The symptom of the bug is that everything crashes on startup, saying:
+ dl_main: Assertion `(void *) ph->p_vaddr == _rtld_local._dl_sysinfo_dso' failed!
+
+ Saying Y here changes the default value of the vdso32 boot
+ option from 1 to 0, which turns off the 32-bit vDSO entirely.
+ This works around the glibc bug but hurts performance.
+
+ If unsure, say N: if you are compiling your own kernel, you
+ are unlikely to be using a buggy version of glibc.
+
+choice
+ prompt "vsyscall table for legacy applications"
+ depends on X86_64
+ default LEGACY_VSYSCALL_XONLY
+ help
+ Legacy user code that does not know how to find the vDSO expects
+ to be able to issue three syscalls by calling fixed addresses in
+ kernel space. Since this location is not randomized with ASLR,
+ it can be used to assist security vulnerability exploitation.
+
+ This setting can be changed at boot time via the kernel command
+ line parameter vsyscall=[emulate|xonly|none]. Emulate mode
+ is deprecated and can only be enabled using the kernel command
+ line.
+
+ On a system with recent enough glibc (2.14 or newer) and no
+ static binaries, you can say None without a performance penalty
+ to improve security.
+
+ If unsure, select "Emulate execution only".
+
+ config LEGACY_VSYSCALL_XONLY
+ bool "Emulate execution only"
+ help
+ The kernel traps and emulates calls into the fixed vsyscall
+ address mapping and does not allow reads. This
+ configuration is recommended when userspace might use the
+ legacy vsyscall area but binary instrumentation of legacy
+ code is not needed. It mitigates
+ certain uses of the vsyscall area as an ASLR-bypassing
+ buffer.
+
+ config LEGACY_VSYSCALL_NONE
+ bool "None"
+ help
+ There will be no vsyscall mapping at all. This will
+ eliminate any risk of ASLR bypass due to the vsyscall
+ fixed address mapping. Attempts to use the vsyscalls
+ will be reported to dmesg, so that either old or
+ malicious userspace programs can be identified.
+
+endchoice
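# (For illustration, from the help text above: the build-time choice
# only sets the default, so a LEGACY_VSYSCALL_XONLY kernel can still
# be booted with "vsyscall=none" to drop the mapping, or with the
# deprecated "vsyscall=emulate" for legacy binary instrumentation.)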
config CMDLINE_BOOL
bool "Built-in kernel command line"
- default n
- ---help---
+ help
Allow for specifying boot arguments to the kernel at
build time. On some systems (e.g. embedded ones), it is
necessary or convenient to provide some or all of the
@@ -1626,7 +2275,7 @@ config CMDLINE_BOOL
To compile command line arguments into the kernel,
set this option to 'Y', then fill in the
- the boot arguments in CONFIG_CMDLINE.
+ boot arguments in CONFIG_CMDLINE.
Systems with fully functional boot loaders (i.e. non-embedded)
should leave this option set to 'N'.
@@ -1635,7 +2284,7 @@ config CMDLINE
string "Built-in kernel command string"
depends on CMDLINE_BOOL
default ""
- ---help---
+ help
Enter arguments here that should be compiled into the kernel
image and used at boot time. If the boot loader provides a
command line at boot time, it is appended to this string to
@@ -1650,48 +2299,446 @@ config CMDLINE
config CMDLINE_OVERRIDE
bool "Built-in command line overrides boot loader arguments"
- default n
- depends on CMDLINE_BOOL
- ---help---
+ depends on CMDLINE_BOOL && CMDLINE != ""
+ help
Set this option to 'Y' to have the kernel ignore the boot loader
command line, and use ONLY the built-in command line.
This is used to work around broken boot loaders. This should
be set to 'N' under normal conditions.
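# (A minimal .config sketch of the three options above; values are
# illustrative:
#	CONFIG_CMDLINE_BOOL=y
#	CONFIG_CMDLINE="console=ttyS0,115200 root=/dev/mmcblk0p2 ro"
#	# CONFIG_CMDLINE_OVERRIDE is not set
# With OVERRIDE unset, a bootloader-provided command line is appended
# to the built-in string; with it set, the bootloader's is ignored.)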
+config MODIFY_LDT_SYSCALL
+ bool "Enable the LDT (local descriptor table)" if EXPERT
+ default y
+ help
+ Linux can allow user programs to install a per-process x86
+ Local Descriptor Table (LDT) using the modify_ldt(2) system
+ call. This is required to run 16-bit or segmented code such as
+ DOSEMU or some Wine programs. It is also used by some very old
+ threading libraries.
+
+ Enabling this feature adds a small amount of overhead to
+ context switches and increases the low-level kernel attack
+ surface. Disabling it removes the modify_ldt(2) system call.
+
+ Saying 'N' here may make sense for embedded or server kernels.
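# (A hedged sketch of the call this option gates; struct user_desc is
# from <asm/ldt.h>, func 1 means "write entry", and base is an
# illustrative variable:
#	struct user_desc d = {
#		.entry_number = 0, .base_addr = base, .limit = 0xfffff,
#		.seg_32bit = 1, .limit_in_pages = 1,
#	};
#	syscall(SYS_modify_ldt, 1, &d, sizeof(d));
# With MODIFY_LDT_SYSCALL=n the syscall is absent and fails -ENOSYS.)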
+
+config STRICT_SIGALTSTACK_SIZE
+ bool "Enforce strict size checking for sigaltstack"
+ depends on DYNAMIC_SIGFRAME
+ help
+ For historical reasons MINSIGSTKSZ is a constant which has
+ already become too small with AVX512 support. Add a mechanism to
+ enforce strict checking of the sigaltstack size against the
+ real size of the FPU frame. This option enables the check
+ by default. It can also be controlled via the kernel command
+ line option 'strict_sas_size' independent of this config
+ switch. Enabling it might break existing applications which
+ allocate a too-small sigaltstack but 'work' because they
+ never get a signal delivered.
+
+ Say 'N' unless you want to really enforce this check.
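# (Illustration of the failure mode, as an assumption rather than
# kernel text: with strict checking active,
#	stack_t ss = { .ss_sp = buf, .ss_size = MINSIGSTKSZ };
#	sigaltstack(&ss, NULL);
# is refused with ENOMEM when MINSIGSTKSZ is smaller than the real
# FPU frame, instead of the stack being overrun at signal delivery.)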
+
+config CFI_AUTO_DEFAULT
+ bool "Attempt to use FineIBT by default at boot time"
+ depends on FINEIBT
+ depends on !RUST || RUSTC_VERSION >= 108800
+ default y
+ help
+ Attempt to use FineIBT by default at boot time. If enabled,
+ this is the same as booting with "cfi=auto". If disabled,
+ this is the same as booting with "cfi=kcfi".
+
+source "kernel/livepatch/Kconfig"
+
+config X86_BUS_LOCK_DETECT
+ bool "Split Lock Detect and Bus Lock Detect support"
+ depends on CPU_SUP_INTEL || CPU_SUP_AMD
+ default y
+ help
+ Enable Split Lock Detect and Bus Lock Detect functionalities.
+ See <file:Documentation/arch/x86/buslock.rst> for more information.
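# (What a split lock looks like, sketched in C: a locked operation
# whose operand straddles a cache line, e.g.
#	char buf[128] __attribute__((aligned(64)));
#	int *p = (int *)(buf + 62);	/* crosses the 64-byte boundary */
#	__atomic_fetch_add(p, 1, __ATOMIC_SEQ_CST);
# This option lets the kernel detect and police such accesses.)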
+
endmenu
-config ARCH_ENABLE_MEMORY_HOTPLUG
+config CC_HAS_NAMED_AS
+ def_bool $(success,echo 'int __seg_fs fs; int __seg_gs gs;' | $(CC) -x c - -S -o /dev/null)
+ depends on CC_IS_GCC
+
+#
+# -fsanitize=kernel-address (KASAN) and -fsanitize=thread (KCSAN)
+# are incompatible with named address spaces with GCC < 13.3
+# (see GCC PR sanitizer/111736 and also PR sanitizer/115172).
+#
+
+config CC_HAS_NAMED_AS_FIXED_SANITIZERS
def_bool y
- depends on X86_64 || (X86_32 && HIGHMEM)
+ depends on !(KASAN || KCSAN) || GCC_VERSION >= 130300
+ depends on !(UBSAN_BOOL && KASAN) || GCC_VERSION >= 140200
+
+config USE_X86_SEG_SUPPORT
+ def_bool CC_HAS_NAMED_AS
+ depends on CC_HAS_NAMED_AS_FIXED_SANITIZERS
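# (What the $(success,...) probe above checks, sketched: GCC named
# address spaces let a %gs-relative per-CPU access be written as a
# plain C object reference, e.g.
#	extern int __seg_gs my_counter;	/* my_counter is illustrative */
#	void bump(void) { my_counter++; }	/* emits a gs-prefixed access */
# instead of hand-written inline assembly.)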
-config ARCH_ENABLE_MEMORY_HOTREMOVE
+config CC_HAS_SLS
+ def_bool $(cc-option,-mharden-sls=all)
+
+config CC_HAS_RETURN_THUNK
+ def_bool $(cc-option,-mfunction-return=thunk-extern)
+
+config CC_HAS_ENTRY_PADDING
+ def_bool $(cc-option,-fpatchable-function-entry=16,16)
+
+config CC_HAS_KCFI_ARITY
+ def_bool $(cc-option,-fsanitize=kcfi -fsanitize-kcfi-arity)
+ depends on CC_IS_CLANG && !RUST
+
+config FUNCTION_PADDING_CFI
+ int
+ default 59 if FUNCTION_ALIGNMENT_64B
+ default 27 if FUNCTION_ALIGNMENT_32B
+ default 11 if FUNCTION_ALIGNMENT_16B
+ default 3 if FUNCTION_ALIGNMENT_8B
+ default 0
+
+# Basically: FUNCTION_ALIGNMENT - 5*CFI
+# except Kconfig can't do arithmetic :/
+config FUNCTION_PADDING_BYTES
+ int
+ default FUNCTION_PADDING_CFI if CFI
+ default FUNCTION_ALIGNMENT
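# (The arithmetic spelled out: the CFI hash preamble occupies 5 bytes,
# so the FUNCTION_PADDING_CFI table above is 64-5=59, 32-5=27,
# 16-5=11 and 8-5=3 for the respective function alignments.)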
+
+config CALL_PADDING
+ def_bool n
+ depends on CC_HAS_ENTRY_PADDING && OBJTOOL
+ select FUNCTION_ALIGNMENT_16B
+
+config FINEIBT
def_bool y
- depends on MEMORY_HOTPLUG
+ depends on X86_KERNEL_IBT && CFI && MITIGATION_RETPOLINE
+ select CALL_PADDING
-config HAVE_ARCH_EARLY_PFN_TO_NID
- def_bool X86_64
- depends on NUMA
+config FINEIBT_BHI
+ def_bool y
+ depends on FINEIBT && CC_HAS_KCFI_ARITY
+
+config HAVE_CALL_THUNKS
+ def_bool y
+ depends on CC_HAS_ENTRY_PADDING && MITIGATION_RETHUNK && OBJTOOL
+
+config CALL_THUNKS
+ def_bool n
+ select CALL_PADDING
+
+config PREFIX_SYMBOLS
+ def_bool y
+ depends on CALL_PADDING && !CFI
+
+menuconfig CPU_MITIGATIONS
+ bool "Mitigations for CPU vulnerabilities"
+ default y
+ help
+ Say Y here to enable options which enable mitigations for hardware
+ vulnerabilities (usually related to speculative execution).
+ Mitigations can be disabled or restricted to SMT systems at runtime
+ via the "mitigations" kernel parameter.
+
+ If you say N, all mitigations will be disabled. This CANNOT be
+ overridden at runtime.
+
+ Say 'Y', unless you really know what you are doing.
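# (Runtime examples using the documented "mitigations" parameter:
#	mitigations=off		# disable all optional mitigations
#	mitigations=auto,nosmt	# mitigate, disabling SMT where needed
# A CPU_MITIGATIONS=n kernel ignores attempts to re-enable them.)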
+
+if CPU_MITIGATIONS
+
+config MITIGATION_PAGE_TABLE_ISOLATION
+ bool "Remove the kernel mapping in user mode"
+ default y
+ depends on (X86_64 || X86_PAE)
+ help
+ This feature reduces the number of hardware side channels by
+ ensuring that the majority of kernel addresses are not mapped
+ into userspace.
+
+ See Documentation/arch/x86/pti.rst for more details.
+
+config MITIGATION_RETPOLINE
+ bool "Avoid speculative indirect branches in kernel"
+ select OBJTOOL if HAVE_OBJTOOL
+ default y
+ help
+ Compile kernel with the retpoline compiler options to guard against
+ kernel-to-user data leaks by avoiding speculative indirect
+ branches. Requires a compiler with -mindirect-branch=thunk-extern
+ support for full protection. The kernel may run slower.
+
+config MITIGATION_RETHUNK
+ bool "Enable return-thunks"
+ depends on MITIGATION_RETPOLINE && CC_HAS_RETURN_THUNK
+ select OBJTOOL if HAVE_OBJTOOL
+ default y if X86_64
+ help
+ Compile the kernel with the return-thunks compiler option to guard
+ against kernel-to-user data leaks by avoiding return speculation.
+ Requires a compiler with -mfunction-return=thunk-extern
+ support for full protection. The kernel may run slower.
+
+config MITIGATION_UNRET_ENTRY
+ bool "Enable UNRET on kernel entry"
+ depends on CPU_SUP_AMD && MITIGATION_RETHUNK && X86_64
+ default y
+ help
+ Compile the kernel with support for the retbleed=unret mitigation.
+
+config MITIGATION_CALL_DEPTH_TRACKING
+ bool "Mitigate RSB underflow with call depth tracking"
+ depends on CPU_SUP_INTEL && HAVE_CALL_THUNKS
+ select HAVE_DYNAMIC_FTRACE_NO_PATCHABLE
+ select CALL_THUNKS
+ default y
+ help
+ Compile the kernel with call depth tracking to mitigate the Intel
+ SKL Return-Stack-Buffer (RSB) underflow issue. The mitigation is off
+ by default and needs to be enabled on the kernel command line via the
+ retbleed=stuff option. For non-affected systems the overhead of this
+ option is marginal as the call depth tracking uses run-time
+ generated call thunks in a compiler-generated padding area and call
+ patching. This increases text size by ~5%. For non-affected systems
+ this space is unused. On affected SKL systems this results in a
+ significant performance gain over the IBRS mitigation.
+
+config CALL_THUNKS_DEBUG
+ bool "Enable call thunks and call depth tracking debugging"
+ depends on MITIGATION_CALL_DEPTH_TRACKING
+ select FUNCTION_ALIGNMENT_32B
+ default n
+ help
+ Enable call/ret counters for imbalance detection and build in
+ noisy dmesg output about call thunk generation and call patching
+ for troubleshooting. The debug prints need to be enabled on the
+ kernel command line with 'debug-callthunks'.
+ Only enable this when you are debugging call thunks as this
+ creates a noticeable runtime overhead. If unsure say N.
+
+config MITIGATION_IBPB_ENTRY
+ bool "Enable IBPB on kernel entry"
+ depends on CPU_SUP_AMD && X86_64
+ default y
+ help
+ Compile the kernel with support for the retbleed=ibpb and
+ spec_rstack_overflow={ibpb,ibpb-vmexit} mitigations.
+
+config MITIGATION_IBRS_ENTRY
+ bool "Enable IBRS on kernel entry"
+ depends on CPU_SUP_INTEL && X86_64
+ default y
+ help
+ Compile the kernel with support for the spectre_v2=ibrs mitigation.
+ This mitigates both spectre_v2 and retbleed at great cost to
+ performance.
+
+config MITIGATION_SRSO
+ bool "Mitigate speculative RAS overflow on AMD"
+ depends on CPU_SUP_AMD && X86_64 && MITIGATION_RETHUNK
+ default y
+ help
+ Enable the SRSO mitigation needed on AMD Zen1-4 machines.
+
+config MITIGATION_SLS
+ bool "Mitigate Straight-Line-Speculation"
+ depends on CC_HAS_SLS && X86_64
+ select OBJTOOL if HAVE_OBJTOOL
+ default n
+ help
+ Compile the kernel with straight-line-speculation options to guard
+ against straight line speculation. The kernel image might be slightly
+ larger.
+
+config MITIGATION_GDS
+ bool "Mitigate Gather Data Sampling"
+ depends on CPU_SUP_INTEL
+ default y
+ help
+ Enable mitigation for Gather Data Sampling (GDS). GDS is a hardware
+ vulnerability which allows unprivileged speculative access to data
+ which was previously stored in vector registers. The attacker uses gather
+ instructions to infer the stale vector register data.
+
+config MITIGATION_RFDS
+ bool "RFDS Mitigation"
+ depends on CPU_SUP_INTEL
+ default y
+ help
+ Enable mitigation for Register File Data Sampling (RFDS) by default.
+ RFDS is a hardware vulnerability which affects Intel Atom CPUs. It
+ allows unprivileged speculative access to stale data previously
+ stored in floating point, vector and integer registers.
+ See also <file:Documentation/admin-guide/hw-vuln/reg-file-data-sampling.rst>
+
+config MITIGATION_SPECTRE_BHI
+ bool "Mitigate Spectre-BHB (Branch History Injection)"
+ depends on CPU_SUP_INTEL
+ default y
+ help
+ Enable BHI mitigations. BHI attacks are a form of Spectre V2 attacks
+ where the branch history buffer is poisoned to speculatively steer
+ indirect branches.
+ See <file:Documentation/admin-guide/hw-vuln/spectre.rst>
+
+config MITIGATION_MDS
+ bool "Mitigate Microarchitectural Data Sampling (MDS) hardware bug"
+ depends on CPU_SUP_INTEL
+ default y
+ help
+ Enable mitigation for Microarchitectural Data Sampling (MDS). MDS is
+ a hardware vulnerability which allows unprivileged speculative access
+ to data which is available in various CPU internal buffers.
+ See also <file:Documentation/admin-guide/hw-vuln/mds.rst>
+
+config MITIGATION_TAA
+ bool "Mitigate TSX Asynchronous Abort (TAA) hardware bug"
+ depends on CPU_SUP_INTEL
+ default y
+ help
+ Enable mitigation for TSX Asynchronous Abort (TAA). TAA is a hardware
+ vulnerability that allows unprivileged speculative access to data
+ which is available in various CPU internal buffers by using
+ asynchronous aborts within an Intel TSX transactional region.
+ See also <file:Documentation/admin-guide/hw-vuln/tsx_async_abort.rst>
+
+config MITIGATION_MMIO_STALE_DATA
+ bool "Mitigate MMIO Stale Data hardware bug"
+ depends on CPU_SUP_INTEL
+ default y
+ help
+ Enable mitigation for MMIO Stale Data hardware bugs. Processor MMIO
+ Stale Data Vulnerabilities are a class of memory-mapped I/O (MMIO)
+ vulnerabilities that can expose data. The vulnerabilities require the
+ attacker to have access to MMIO.
+ See also
+ <file:Documentation/admin-guide/hw-vuln/processor_mmio_stale_data.rst>
+
+config MITIGATION_L1TF
+ bool "Mitigate L1 Terminal Fault (L1TF) hardware bug"
+ depends on CPU_SUP_INTEL
+ default y
+ help
+ Mitigate L1 Terminal Fault (L1TF) hardware bug. L1 Terminal Fault is a
+ hardware vulnerability which allows unprivileged speculative access to data
+ available in the Level 1 Data Cache.
+ See <file:Documentation/admin-guide/hw-vuln/l1tf.rst>
+
+config MITIGATION_RETBLEED
+ bool "Mitigate RETBleed hardware bug"
+ depends on (CPU_SUP_INTEL && MITIGATION_SPECTRE_V2) || MITIGATION_UNRET_ENTRY || MITIGATION_IBPB_ENTRY
+ default y
+ help
+ Enable mitigation for RETBleed (Arbitrary Speculative Code Execution
+ with Return Instructions) vulnerability. RETBleed is a speculative
+ execution attack which takes advantage of microarchitectural behavior
+ in many modern microprocessors, similar to Spectre v2. An
+ unprivileged attacker can use these flaws to bypass conventional
+ memory security restrictions to gain read access to privileged memory
+ that would otherwise be inaccessible.
+
+config MITIGATION_SPECTRE_V1
+ bool "Mitigate SPECTRE V1 hardware bug"
+ default y
+ help
+ Enable mitigation for Spectre V1 (Bounds Check Bypass). Spectre V1 is a
+ class of side channel attacks that takes advantage of speculative
+ execution that bypasses conditional branch instructions used for
+ memory access bounds checks.
+ See also <file:Documentation/admin-guide/hw-vuln/spectre.rst>
+
+config MITIGATION_SPECTRE_V2
+ bool "Mitigate SPECTRE V2 hardware bug"
+ default y
+ help
+ Enable mitigation for Spectre V2 (Branch Target Injection). Spectre
+ V2 is a class of side channel attacks that takes advantage of
+ indirect branch predictors inside the processor. In Spectre variant 2
+ attacks, the attacker can steer speculative indirect branches in the
+ victim to gadget code by poisoning the branch target buffer of a CPU
+ used for predicting indirect branch addresses.
+ See also <file:Documentation/admin-guide/hw-vuln/spectre.rst>
+
+config MITIGATION_SRBDS
+ bool "Mitigate Special Register Buffer Data Sampling (SRBDS) hardware bug"
+ depends on CPU_SUP_INTEL
+ default y
+ help
+ Enable mitigation for Special Register Buffer Data Sampling (SRBDS).
+ SRBDS is a hardware vulnerability that allows Microarchitectural Data
+ Sampling (MDS) techniques to infer values returned from special
+ register accesses. An unprivileged user can extract values returned
+ from RDRAND and RDSEED executed on another core or sibling thread
+ using MDS techniques.
+ See also
+ <file:Documentation/admin-guide/hw-vuln/special-register-buffer-data-sampling.rst>
+
+config MITIGATION_SSB
+ bool "Mitigate Speculative Store Bypass (SSB) hardware bug"
+ default y
+ help
+ Enable mitigation for Speculative Store Bypass (SSB). SSB is a
+ hardware security vulnerability and its exploitation takes advantage
+ of speculative execution in a similar way to the Meltdown and Spectre
+ security vulnerabilities.
+
+config MITIGATION_ITS
+ bool "Enable Indirect Target Selection mitigation"
+ depends on CPU_SUP_INTEL && X86_64
+ depends on MITIGATION_RETPOLINE && MITIGATION_RETHUNK
+ select EXECMEM
+ default y
+ help
+ Enable Indirect Target Selection (ITS) mitigation. ITS is a bug in
+ the branch prediction unit (BPU) of some Intel CPUs that may allow
+ Spectre V2 style attacks. If this option is disabled, the mitigation
+ cannot be enabled via the command line.
+ See <file:Documentation/admin-guide/hw-vuln/indirect-target-selection.rst>
+
+config MITIGATION_TSA
+ bool "Mitigate Transient Scheduler Attacks"
+ depends on CPU_SUP_AMD
+ default y
+ help
+ Enable mitigation for Transient Scheduler Attacks. TSA is a hardware
+ security vulnerability on AMD CPUs which can lead to forwarding of
+ invalid data to subsequent instructions, affecting their timing and
+ thereby causing a leak.
+
+config MITIGATION_VMSCAPE
+ bool "Mitigate VMSCAPE"
+ depends on KVM
+ default y
+ help
+ Enable mitigation for VMSCAPE attacks. VMSCAPE is a hardware security
+ vulnerability on Intel and AMD CPUs that may allow a guest to do
+ Spectre v2 style attacks on the userspace hypervisor.
+endif
+
+config ARCH_HAS_ADD_PAGES
+ def_bool y
+ depends on ARCH_ENABLE_MEMORY_HOTPLUG
menu "Power management and ACPI options"
config ARCH_HIBERNATION_HEADER
def_bool y
- depends on X86_64 && HIBERNATION
+ depends on HIBERNATION
source "kernel/power/Kconfig"
source "drivers/acpi/Kconfig"
config X86_APM_BOOT
- bool
- default y
- depends on APM || APM_MODULE
+ def_bool y
+ depends on APM
menuconfig APM
tristate "APM (Advanced Power Management) BIOS support"
depends on X86_32 && PM_SLEEP
- ---help---
+ help
APM is a BIOS specification for saving power using several different
techniques. This is mostly useful for battery powered laptops with
APM compliant BIOSes. If you say Y here, the system time will be
@@ -1706,8 +2753,8 @@ menuconfig APM
machines with more than one CPU.
In order to use APM, you will need supporting software. For location
- and more information, read <file:Documentation/power/pm.txt> and the
- Battery Powered Linux mini-HOWTO, available from
+ and more information, read <file:Documentation/power/apm-acpi.rst>
+ and the Battery Powered Linux mini-HOWTO, available from
<http://www.tldp.org/docs.html#howto>.
This driver does not spin down disk drives (see the hdparm(8)
@@ -1730,7 +2777,7 @@ menuconfig APM
1) make sure that you have enough swap space and that it is
enabled.
- 2) pass the "no-hlt" option to the kernel
+ 2) pass the "idle=poll" option to the kernel
3) switch on floating point emulation in the kernel and pass
the "no387" option to the kernel
4) pass the "floppy=nodma" option to the kernel
@@ -1751,14 +2798,14 @@ if APM
config APM_IGNORE_USER_SUSPEND
bool "Ignore USER SUSPEND"
- ---help---
+ help
This option will ignore USER SUSPEND requests. On machines with a
compliant APM BIOS, you want to say N. However, on the NEC Versa M
series notebooks, it is necessary to say Y because of a BIOS bug.
config APM_DO_ENABLE
bool "Enable PM at boot time"
- ---help---
+ help
Enable APM features at boot time. From page 36 of the APM BIOS
specification: "When disabled, the APM BIOS does not automatically
power manage devices, enter the Standby State, enter the Suspend
@@ -1774,8 +2821,9 @@ config APM_DO_ENABLE
this feature.
config APM_CPU_IDLE
+ depends on CPU_IDLE
bool "Make CPU Idle calls when idle"
- ---help---
+ help
Enable calls to APM CPU Idle/CPU Busy inside the kernel's idle loop.
On some machines, this can activate improved power savings, such as
a slowed CPU clock rate, when the machine is idle. These idle calls
@@ -1786,7 +2834,7 @@ config APM_CPU_IDLE
config APM_DISPLAY_BLANK
bool "Enable console blanking using APM"
- ---help---
+ help
Enable console blanking using the APM. Some laptops can use this to
turn off the LCD backlight when the screen blanker of the Linux
virtual console blanks the screen. Note that this is only used by
@@ -1799,7 +2847,7 @@ config APM_DISPLAY_BLANK
config APM_ALLOW_INTS
bool "Allow interrupts during APM BIOS calls"
- ---help---
+ help
Normally we disable external interrupts while we are making calls to
the APM BIOS as a measure to lessen the effects of a badly behaving
BIOS implementation. The BIOS should reenable interrupts if it
@@ -1809,7 +2857,7 @@ config APM_ALLOW_INTS
endif # APM
-source "arch/x86/kernel/cpu/cpufreq/Kconfig"
+source "drivers/cpufreq/Kconfig"
source "drivers/cpuidle/Kconfig"
@@ -1817,24 +2865,13 @@ source "drivers/idle/Kconfig"
endmenu
-
menu "Bus options (PCI etc.)"
-config PCI
- bool "PCI support"
- default y
- select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC)
- ---help---
- Find out whether you have a PCI motherboard. PCI is the name of a
- bus system, i.e. the way the CPU talks to the other stuff inside
- your box. Other bus systems are ISA, EISA, MicroChannel (MCA) or
- VESA. If you have PCI, say Y, otherwise N.
-
choice
prompt "PCI access mode"
depends on X86_32 && PCI
default PCI_GOANY
- ---help---
+ help
On PCI systems, the BIOS can be used to detect the PCI devices and
determine their configuration. However, some old PCI motherboards
have BIOS bugs and may crash if this is done. Also, some embedded
@@ -1859,7 +2896,7 @@ config PCI_GODIRECT
bool "Direct"
config PCI_GOOLPC
- bool "OLPC"
+ bool "OLPC XO-1"
depends on OLPC
config PCI_GOANY
@@ -1874,124 +2911,93 @@ config PCI_BIOS
# x86-64 doesn't support PCI BIOS access from long mode so always go direct.
config PCI_DIRECT
def_bool y
- depends on PCI && (X86_64 || (PCI_GODIRECT || PCI_GOANY || PCI_GOOLPC))
+ depends on PCI && (X86_64 || (PCI_GODIRECT || PCI_GOANY || PCI_GOOLPC || PCI_GOMMCONFIG))
config PCI_MMCONFIG
- def_bool y
- depends on X86_32 && PCI && ACPI && (PCI_GOMMCONFIG || PCI_GOANY)
+ bool "Support mmconfig PCI config space access" if X86_64
+ default y
+ depends on PCI && (ACPI || JAILHOUSE_GUEST)
+ depends on X86_64 || (PCI_GOANY || PCI_GOMMCONFIG)
+ help
+ Add support for accessing the PCI configuration space as a memory
+ mapped area. It is the recommended method if the system supports
+ this (it must have PCI Express and ACPI for it to be available).
+
+ In the unlikely case that enabling this configuration option causes
+ problems, the mechanism can be switched off with the 'pci=nommconf'
+ command line parameter.
+
+ Say N only if you are sure that your platform does not support this
+ access method or you have problems caused by it.
+
+ Say Y otherwise.
config PCI_OLPC
def_bool y
depends on PCI && OLPC && (PCI_GOOLPC || PCI_GOANY)
-config PCI_DOMAINS
+config PCI_XEN
def_bool y
- depends on PCI
+ depends on PCI && XEN
-config PCI_MMCONFIG
- bool "Support mmconfig PCI config space access"
- depends on X86_64 && PCI && ACPI
+config MMCONF_FAM10H
+ def_bool y
+ depends on X86_64 && PCI_MMCONFIG && ACPI
-config DMAR
- bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
- depends on PCI_MSI && ACPI && EXPERIMENTAL
+config PCI_CNB20LE_QUIRK
+ bool "Read PCI host bridge windows from the CNB20LE chipset" if EXPERT
+ depends on X86_32 && PCI
help
- DMA remapping (DMAR) devices support enables independent address
- translations for Direct Memory Access (DMA) from devices.
- These DMA remapping devices are reported via ACPI tables
- and include PCI device scope covered by these DMA
- remapping devices.
+ Read the PCI windows out of the CNB20LE host bridge. This allows
+ PCI hotplug to work on systems with the CNB20LE chipset which do
+ not have ACPI.
-config DMAR_DEFAULT_ON
- def_bool y
- prompt "Enable DMA Remapping Devices by default"
- depends on DMAR
+ The ServerWorks (later Broadcom) CNB20LE was a chipset most
+ probably designed only for the Pentium III.
+
+ To find out if you have such a chipset, search for a PCI device
+ with the PCI ID 1166:0009, for example by executing
+ lspci -nn | grep '1166:0009'
+ The code is inactive if there is none.
+
+ There's no public spec for this chipset, and this functionality
+ is known to be incomplete.
+
+ You should say N unless you know you need this.
+
+config ISA_BUS
+ bool "ISA bus support on modern systems" if EXPERT
help
- Selecting this option will enable a DMAR device at boot time if
- one is found. If this option is not selected, DMAR support can
- be enabled by passing intel_iommu=on to the kernel. It is
- recommended you say N here while the DMAR code remains
- experimental.
+ Expose ISA bus device drivers and options available for selection and
+ configuration. Enable this option if your target machine has an ISA
+ bus. ISA is an older system, displaced by PCI and newer bus
+ architectures -- if your target machine is modern, it probably does
+ not have an ISA bus.
-config DMAR_BROKEN_GFX_WA
- def_bool n
- prompt "Workaround broken graphics drivers (going away soon)"
- depends on DMAR
- ---help---
- Current Graphics drivers tend to use physical address
- for DMA and avoid using DMA APIs. Setting this config
- option permits the IOMMU driver to set a unity map for
- all the OS-visible memory. Hence the driver can continue
- to use physical addresses for DMA, at least until this
- option is removed in the 2.6.32 kernel.
-
-config DMAR_FLOPPY_WA
- def_bool y
- depends on DMAR
- ---help---
- Floppy disk drivers are known to bypass DMA API calls
- thereby failing to work when IOMMU is enabled. This
- workaround will setup a 1:1 mapping for the first
- 16MiB to make floppy (an ISA device) work.
-
-config INTR_REMAP
- bool "Support for Interrupt Remapping (EXPERIMENTAL)"
- depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI && EXPERIMENTAL
- ---help---
- Supports Interrupt remapping for IO-APIC and MSI devices.
- To use x2apic mode in the CPU's which support x2APIC enhancements or
- to support platforms with CPU's having > 8 bit APIC ID, say Y.
-
-source "drivers/pci/pcie/Kconfig"
-
-source "drivers/pci/Kconfig"
-
-# x86_64 have no ISA slots, but do have ISA-style DMA.
+ If unsure, say N.
+
+# x86_64 have no ISA slots, but can have ISA-style DMA.
config ISA_DMA_API
- def_bool y
+ bool "ISA-style DMA support" if (X86_64 && EXPERT)
+ default y
+ help
+ Enables ISA-style DMA support for devices requiring such controllers.
+ If unsure, say Y.
if X86_32
config ISA
bool "ISA support"
- ---help---
+ help
Find out whether you have ISA slots on your motherboard. ISA is the
name of a bus system, i.e. the way the CPU talks to the other stuff
inside your box. Other bus systems are PCI, EISA, MicroChannel
(MCA) or VESA. ISA is an older system, now being displaced by PCI;
newer boards don't support it. If you have ISA, say Y, otherwise N.
-config EISA
- bool "EISA support"
- depends on ISA
- ---help---
- The Extended Industry Standard Architecture (EISA) bus was
- developed as an open alternative to the IBM MicroChannel bus.
-
- The EISA bus provided some of the features of the IBM MicroChannel
- bus while maintaining backward compatibility with cards made for
- the older ISA bus. The EISA bus saw limited use between 1988 and
- 1995 when it was made obsolete by the PCI bus.
-
- Say Y here if you are building a kernel for an EISA-based machine.
-
- Otherwise, say N.
-
-source "drivers/eisa/Kconfig"
-
-config MCA
- bool "MCA support"
- ---help---
- MicroChannel Architecture is found in some IBM PS/2 machines and
- laptops. It is a bus system similar to PCI or ISA. See
- <file:Documentation/mca.txt> (and especially the web page given
- there) before attempting to build an MCA bus kernel.
-
-source "drivers/mca/Kconfig"
-
config SCx200
tristate "NatSemi SCx200 support"
- ---help---
+ help
This provides basic support for National Semiconductor's
(now AMD's) Geode processors. The driver probes for the
PCI-IDs of several on-chip devices, so it's a good dependency
@@ -2001,97 +3007,176 @@ config SCx200
config SCx200HR_TIMER
tristate "NatSemi SCx200 27MHz High-Resolution Timer Support"
- depends on SCx200 && GENERIC_TIME
+ depends on SCx200
default y
- ---help---
+ help
This driver provides a clocksource built upon the on-chip
27MHz high-resolution timer. It's also a workaround for
NSC Geode SC-1100's buggy TSC, which loses time when the
processor goes idle (as is done by the scheduler). The
other workaround is idle=poll boot option.
-config GEODE_MFGPT_TIMER
- def_bool y
- prompt "Geode Multi-Function General Purpose Timer (MFGPT) events"
- depends on MGEODE_LX && GENERIC_TIME && GENERIC_CLOCKEVENTS
- ---help---
- This driver provides a clock event source based on the MFGPT
- timer(s) in the CS5535 and CS5536 companion chip for the geode.
- MFGPTs have a better resolution and max interval than the
- generic PIT, and are suitable for use as high-res timers.
-
config OLPC
bool "One Laptop Per Child support"
- default n
- ---help---
+ depends on !X86_PAE
+ select GPIOLIB
+ select OF
+ select OF_PROMTREE
+ select IRQ_DOMAIN
+ select OLPC_EC
+ help
Add support for detecting the unique features of the OLPC
XO hardware.
-endif # X86_32
+config OLPC_XO1_PM
+ bool "OLPC XO-1 Power Management"
+ depends on OLPC && MFD_CS5535=y && PM_SLEEP
+ help
+ Add support for poweroff and suspend of the OLPC XO-1 laptop.
-config K8_NB
- def_bool y
- depends on AGP_AMD64 || (X86_64 && (GART_IOMMU || (PCI && NUMA)))
+config OLPC_XO1_RTC
+ bool "OLPC XO-1 Real Time Clock"
+ depends on OLPC_XO1_PM && RTC_DRV_CMOS
+ help
+ Add support for the XO-1 real time clock, which can be used as a
+ programmable wakeup source.
+
+config OLPC_XO1_SCI
+ bool "OLPC XO-1 SCI extras"
+ depends on OLPC && OLPC_XO1_PM && GPIO_CS5535=y
+ depends on INPUT=y
+ select POWER_SUPPLY
+ help
+ Add support for SCI-based features of the OLPC XO-1 laptop:
+ - EC-driven system wakeups
+ - Power button
+ - Ebook switch
+ - Lid switch
+ - AC adapter status updates
+ - Battery status updates
+
+config OLPC_XO15_SCI
+ bool "OLPC XO-1.5 SCI extras"
+ depends on OLPC && ACPI
+ select POWER_SUPPLY
+ help
+ Add support for SCI-based features of the OLPC XO-1.5 laptop:
+ - EC-driven system wakeups
+ - AC adapter status updates
+ - Battery status updates
-source "drivers/pcmcia/Kconfig"
+config GEODE_COMMON
+ bool
-source "drivers/pci/hotplug/Kconfig"
+config ALIX
+ bool "PCEngines ALIX System Support (LED setup)"
+ select GPIOLIB
+ select GEODE_COMMON
+ help
+ This option enables system support for the PCEngines ALIX.
+ At present this just sets up LEDs for GPIO control on
+ ALIX2/3/6 boards. However, other system-specific setup should
+ get added here.
-endmenu
+ Note: You must still enable the drivers for GPIO and LED support
+ (GPIO_CS5535 & LEDS_GPIO) to actually use the LEDs.
+
+ Note: You have to set alix.force=1 for boards with Award BIOS.
+
+config NET5501
+ bool "Soekris Engineering net5501 System Support (LEDS, GPIO, etc)"
+ select GPIOLIB
+ select GEODE_COMMON
+ help
+ This option enables system support for the Soekris Engineering net5501.
+
+config GEOS
+ bool "Traverse Technologies GEOS System Support (LEDS, GPIO, etc)"
+ select GPIOLIB
+ select GEODE_COMMON
+ depends on DMI
+ help
+ This option enables system support for the Traverse Technologies GEOS.
+
+config TS5500
+ bool "Technologic Systems TS-5500 platform support"
+ depends on MELAN
+ select CHECK_SIGNATURE
+ select NEW_LEDS
+ select LEDS_CLASS
+ help
+ This option enables system support for the Technologic Systems TS-5500.
+
+endif # X86_32
+
+config AMD_NB
+ def_bool y
+ depends on AMD_NODE
+config AMD_NODE
+ def_bool y
+ depends on CPU_SUP_AMD && PCI
-menu "Executable file formats / Emulations"
+endmenu
-source "fs/Kconfig.binfmt"
+menu "Binary Emulations"
config IA32_EMULATION
bool "IA32 Emulation"
depends on X86_64
- select COMPAT_BINFMT_ELF
- ---help---
- Include code to run 32-bit programs under a 64-bit kernel. You should
- likely turn this on, unless you're 100% sure that you don't have any
- 32-bit programs left.
-
-config IA32_AOUT
- tristate "IA32 a.out support"
+ select ARCH_WANT_OLD_COMPAT_IPC
+ select BINFMT_ELF
+ select COMPAT_OLD_SIGACTION
+ help
+ Include code to run legacy 32-bit programs under a
+ 64-bit kernel. You should likely turn this on, unless you're
+ 100% sure that you don't have any 32-bit programs left.
+
+config IA32_EMULATION_DEFAULT_DISABLED
+ bool "IA32 emulation disabled by default"
+ default n
depends on IA32_EMULATION
- ---help---
- Support old a.out binaries in the 32bit emulation.
+ help
+ Disable IA32 emulation by default. This prevents loading 32-bit
+ processes and access to 32-bit syscalls. If unsure, leave it at
+ its default value.
+
+config X86_X32_ABI
+ bool "x32 ABI for 64-bit mode"
+ depends on X86_64
+ # llvm-objcopy does not convert x86_64 .note.gnu.property or
+ # compressed debug sections to x86_x32 properly:
+ # https://github.com/ClangBuiltLinux/linux/issues/514
+ # https://github.com/ClangBuiltLinux/linux/issues/1141
+ depends on $(success,$(OBJCOPY) --version | head -n1 | grep -qv llvm)
+ help
+ Include code to run binaries for the x32 native 32-bit ABI
+ for 64-bit processors. An x32 process gets access to the
+ full 64-bit register file and wide data path while leaving
+ pointers at 32 bits for smaller memory footprint.
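# (For illustration: an x32 binary is typically built with
#	gcc -mx32 hello.c -o hello
# given x32 multilib support, and runs only on kernels built with
# this option enabled.)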
+
+config COMPAT_32
+ def_bool y
+ depends on IA32_EMULATION || X86_32
+ select HAVE_UID16
+ select OLD_SIGSUSPEND3
config COMPAT
def_bool y
- depends on IA32_EMULATION
+ depends on IA32_EMULATION || X86_X32_ABI
config COMPAT_FOR_U64_ALIGNMENT
- def_bool COMPAT
- depends on X86_64
-
-config SYSVIPC_COMPAT
def_bool y
- depends on COMPAT && SYSVIPC
+ depends on COMPAT
endmenu
-
config HAVE_ATOMIC_IOMAP
def_bool y
depends on X86_32
-source "net/Kconfig"
-
-source "drivers/Kconfig"
-
-source "drivers/firmware/Kconfig"
-
-source "fs/Kconfig"
-
-source "arch/x86/Kconfig.debug"
-
-source "security/Kconfig"
-
-source "crypto/Kconfig"
-
source "arch/x86/kvm/Kconfig"
-source "lib/Kconfig"
+source "arch/x86/Kconfig.cpufeatures"
+
+source "arch/x86/Kconfig.assembler"
diff --git a/arch/x86/Kconfig.assembler b/arch/x86/Kconfig.assembler
new file mode 100644
index 000000000000..b1c59fb0a4c9
--- /dev/null
+++ b/arch/x86/Kconfig.assembler
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2020 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+
+config AS_WRUSS
+ def_bool $(as-instr64,wrussq %rax$(comma)(%rbx))
+ help
+ Supported by binutils >= 2.31 and LLVM integrated assembler
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 8130334329c0..f928cf6e3252 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -1,32 +1,28 @@
+# SPDX-License-Identifier: GPL-2.0
# Put CPU selection options and their dependent optimizations here
-if !X86_ELAN
-
choice
- prompt "Processor family"
- default M686 if X86_32
- default GENERIC_CPU if X86_64
-
-config M386
- bool "386"
- depends on X86_32 && !UML
- ---help---
- This is the processor type of your CPU. This information is used for
- optimizing purposes. In order to compile a kernel that can run on
- all x86 CPU types (albeit not optimally fast), you can specify
- "386" here.
+ prompt "x86-32 Processor family"
+ depends on X86_32
+ default M686
+ help
+ This is the processor type of your CPU. This information is
+ used for optimizing purposes. In order to compile a kernel
+ that can run on all supported x86 CPU types (albeit not
+ optimally fast), you can specify "486" here.
+
+ Note that the 386 is no longer supported; this includes
+ AMD/Cyrix/Intel 386DX/DXL/SL/SLC/SX, Cyrix/TI 486DLC/DLC2,
+ UMC 486SX-S and the NexGen Nx586.
The kernel will not necessarily run on earlier architectures than
the one you have chosen, e.g. a Pentium optimized kernel will run on
a PPro, but not necessarily on a i486.
Here are the settings recommended for greatest speed:
- - "386" for the AMD/Cyrix/Intel 386DX/DXL/SL/SLC/SX, Cyrix/TI
- 486DLC/DLC2, and UMC 486SX-S. Only "386" kernels will run on a 386
- class machine.
- "486" for the AMD/Cyrix/IBM/Intel 486DX/DX2/DX4 or
- SL/SLC/SLC2/SLC3/SX/SX2 and UMC U5D or U5S.
+ SL/SLC/SLC2/SLC3/SX/SX2 and UMC U5D or U5S.
- "586" for generic Pentium CPUs lacking the TSC
- (time stamp counter) register.
+ (time stamp counter) register.
- "Pentium-Classic" for the Intel Pentium.
- "Pentium-MMX" for the Intel Pentium MMX.
- "Pentium-Pro" for the Intel Pentium Pro.
@@ -39,27 +35,35 @@ config M386
- "Efficeon" for the Transmeta Efficeon series.
- "Winchip-C6" for original IDT Winchip.
- "Winchip-2" for IDT Winchips with 3dNow! capabilities.
+ - "AMD Elan" for the 32-bit AMD Elan embedded CPU.
- "GeodeGX1" for Geode GX1 (Cyrix MediaGX).
- "Geode GX/LX" For AMD Geode GX and LX processors.
- "CyrixIII/VIA C3" for VIA Cyrix III or VIA C3.
- "VIA C3-2" for VIA C3-2 "Nehemiah" (model 9 and above).
- "VIA C7" for VIA C7.
+ - "Intel Atom" for the Atom-microarchitecture CPUs.
- If you don't know what to do, choose "386".
+ See each option's help text for additional details. If you don't know
+ what to do, choose "Pentium-Pro".
+
+config M486SX
+ bool "486SX"
+ depends on X86_32
+ help
+ Select this for a 486-class CPU without an FPU such as
+ AMD/Cyrix/IBM/Intel SL/SLC/SLC2/SLC3/SX/SX2 and UMC U5S.
config M486
- bool "486"
+ bool "486DX"
depends on X86_32
- ---help---
- Select this for a 486 series processor, either Intel or one of the
- compatible processors from AMD, Cyrix, IBM, or Intel. Includes DX,
- DX2, and DX4 variants; also SL/SLC/SLC2/SLC3/SX/SX2 and UMC U5D or
- U5S.
+ help
+ Select this for a 486-class CPU such as AMD/Cyrix/IBM/Intel
+ 486DX/DX2/DX4 and UMC U5D.
config M586
bool "586/K5/5x86/6x86/6x86MX"
depends on X86_32
- ---help---
+ help
Select this for a 586 or 686 series processor such as the AMD K5,
the Cyrix 5x86, 6x86 and 6x86MX. This choice does not
assume the RDTSC (Read Time Stamp Counter) instruction.
@@ -67,21 +71,21 @@ config M586
config M586TSC
bool "Pentium-Classic"
depends on X86_32
- ---help---
+ help
Select this for a Pentium Classic processor with the RDTSC (Read
Time Stamp Counter) instruction for benchmarking.
config M586MMX
bool "Pentium-MMX"
depends on X86_32
- ---help---
+ help
Select this for a Pentium with the MMX graphics/multimedia
extended instructions.
config M686
bool "Pentium-Pro"
depends on X86_32
- ---help---
+ help
Select this for Intel Pentium Pro chips. This enables the use of
Pentium Pro extended instructions, and disables the init-time guard
against the f00f bug found in earlier Pentiums.
@@ -89,7 +93,7 @@ config M686
config MPENTIUMII
bool "Pentium-II/Celeron(pre-Coppermine)"
depends on X86_32
- ---help---
+ help
Select this for Intel chips based on the Pentium-II and
pre-Coppermine Celeron core. This option enables an unaligned
copy optimization, compiles the kernel with optimization flags
@@ -99,23 +103,23 @@ config MPENTIUMII
config MPENTIUMIII
bool "Pentium-III/Celeron(Coppermine)/Pentium-III Xeon"
depends on X86_32
- ---help---
+ help
Select this for Intel chips based on the Pentium-III and
Celeron-Coppermine core. This option enables use of some
extended prefetch instructions in addition to the Pentium II
extensions.
config MPENTIUMM
- bool "Pentium M"
+ bool "Pentium M/Pentium Dual Core/Core Solo/Core Duo"
depends on X86_32
- ---help---
+ help
Select this for Intel Pentium M (not Pentium-4 M)
- notebook chips.
+ "Merom" Core Solo/Duo notebook chips
config MPENTIUM4
bool "Pentium-4/Celeron(P4-based)/Pentium-4 M/older Xeon"
depends on X86_32
- ---help---
+ help
Select this for Intel Pentium 4 chips. This includes the
Pentium 4, Pentium D, P4-based Celeron and Xeon, and
Pentium-4 M (not Pentium M) chips. This option enables compile
@@ -131,27 +135,15 @@ config MPENTIUM4
-Mobile Pentium 4
-Mobile Pentium 4 M
-Extreme Edition (Gallatin)
- -Prescott
- -Prescott 2M
- -Cedar Mill
- -Presler
- -Smithfield
Xeons (Intel Xeon, Xeon MP, Xeon LV, Xeon MV) corename:
-Foster
-Prestonia
-Gallatin
- -Nocona
- -Irwindale
- -Cranford
- -Potomac
- -Paxville
- -Dempsey
-
config MK6
bool "K6/K6-II/K6-III"
depends on X86_32
- ---help---
+ help
Select this for an AMD K6-family processor. Enables use of
some extended instructions, and passes appropriate optimization
flags to GCC.
@@ -159,22 +151,15 @@ config MK6
config MK7
bool "Athlon/Duron/K7"
depends on X86_32
- ---help---
+ help
Select this for an AMD Athlon K7-family processor. Enables use of
some extended instructions, and passes appropriate optimization
flags to GCC.
-config MK8
- bool "Opteron/Athlon64/Hammer/K8"
- ---help---
- Select this for an AMD Opteron or Athlon64 Hammer-family processor.
- Enables use of some extended instructions, and passes appropriate
- optimization flags to GCC.
-
config MCRUSOE
bool "Crusoe"
depends on X86_32
- ---help---
+ help
Select this for a Transmeta Crusoe processor. Treats the processor
like a 586 with TSC, and sets some GCC optimization flags (like a
Pentium Pro with no alignment requirements).
@@ -182,13 +167,13 @@ config MCRUSOE
config MEFFICEON
bool "Efficeon"
depends on X86_32
- ---help---
+ help
Select this for a Transmeta Efficeon processor.
config MWINCHIPC6
bool "Winchip-C6"
depends on X86_32
- ---help---
+ help
Select this for an IDT Winchip C6 chip. Linux and GCC
treat this chip as a 586TSC with some extended instructions
and alignment requirements.
@@ -196,29 +181,37 @@ config MWINCHIPC6
config MWINCHIP3D
bool "Winchip-2/Winchip-2A/Winchip-3"
depends on X86_32
- ---help---
+ help
Select this for an IDT Winchip-2, 2A or 3. Linux and GCC
treat this chip as a 586TSC with some extended instructions
and alignment requirements. Also enables out-of-order memory
stores for this CPU, which can increase performance of some
operations.
+config MELAN
+ bool "AMD Elan"
+ depends on X86_32
+ help
+ Select this for an AMD Elan processor.
+
+ Do not use this option for K6/Athlon/Opteron processors!
+
config MGEODEGX1
bool "GeodeGX1"
depends on X86_32
- ---help---
+ help
Select this for a Geode GX1 (Cyrix MediaGX) chip.
config MGEODE_LX
bool "Geode GX/LX"
depends on X86_32
- ---help---
+ help
Select this for AMD Geode GX and LX processors.
config MCYRIXIII
bool "CyrixIII/VIA-C3"
depends on X86_32
- ---help---
+ help
Select this for a Cyrix III or C3 chip. Presently Linux and GCC
treat this chip as a generic 586. Whilst the CPU is 686 class,
it lacks the cmov extension which gcc assumes is present when
@@ -230,7 +223,7 @@ config MCYRIXIII
config MVIAC3_2
bool "VIA C3-2 (Nehemiah)"
depends on X86_32
- ---help---
+ help
Select this for a VIA C3 "Nehemiah". Selecting this enables usage
of SSE and tells gcc to treat the CPU as a 686.
Note, this kernel will not boot on older (pre model 9) C3s.
@@ -238,43 +231,48 @@ config MVIAC3_2
config MVIAC7
bool "VIA C7"
depends on X86_32
- ---help---
+ help
Select this for a VIA C7. Selecting this uses the correct cache
shift and tells gcc to treat the CPU as a 686.
-config MPSC
- bool "Intel P4 / older Netburst based Xeon"
- depends on X86_64
- ---help---
- Optimize for Intel Pentium 4, Pentium D and older Nocona/Dempsey
- Xeon CPUs with Intel 64bit which is compatible with x86-64.
- Note that the latest Xeons (Xeon 51xx and 53xx) are not based on the
- Netburst core and shouldn't use this option. You can distinguish them
- using the cpu family field
- in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one.
-
-config MCORE2
- bool "Core 2/newer Xeon"
- ---help---
-
- Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and
- 53xx) CPUs. You can distinguish newer from older Xeons by the CPU
- family in /proc/cpuinfo. Newer ones have 6 and older ones 15
- (not a typo)
-
-config GENERIC_CPU
- bool "Generic-x86-64"
- depends on X86_64
- ---help---
- Generic x86-64 CPU.
- Run equally well on all x86-64 CPUs.
+config MATOM
+ bool "Intel Atom"
+ help
+ Select this for the Intel Atom platform. Intel Atom CPUs have an
+ in-order pipelining architecture and thus can benefit from
+ accordingly optimized code. Use a recent GCC with specific Atom
+ support in order to fully benefit from selecting this option.
endchoice
+config CC_HAS_MARCH_NATIVE
+ # This flag might not be available in cross-compilers:
+ def_bool $(cc-option, -march=native)
+ # LLVM 18 has an easily triggered internal compiler error in core
+ # networking code with '-march=native' on certain systems:
+ # https://github.com/llvm/llvm-project/issues/72026
+ # LLVM 19 introduces an optimization that resolves some high stack
+ # usage warnings that only appear with '-march=native'.
+ depends on CC_IS_GCC || CLANG_VERSION >= 190100
+
+config X86_NATIVE_CPU
+ bool "Build and optimize for local/native CPU"
+ depends on X86_64
+ depends on CC_HAS_MARCH_NATIVE
+ help
+ Optimize for the current CPU used to compile the kernel.
+ Use this option if you intend to build the kernel for your
+ local machine.
+
+ Note that such a kernel might not work optimally on a
+ different x86 machine.
+
+ If unsure, say N.
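# (Sketch of the effect, inferred from the general meaning of
# -march=native rather than from this file: the build host's full
# feature set, e.g. AVX-512, may be used in generated code, so the
# resulting kernel can fault on CPUs lacking those instructions.)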
+
config X86_GENERIC
bool "Generic x86 support"
depends on X86_32
- ---help---
+ help
Instead of just including optimizations for the selected
x86 variant (e.g. PII, Crusoe or Athlon), include some more
generic optimizations as well. This will make the kernel
@@ -283,143 +281,91 @@ config X86_GENERIC
This is really intended for distributors who need more
generic optimizations.
-endif
-
-config X86_CPU
- def_bool y
- select GENERIC_FIND_FIRST_BIT
- select GENERIC_FIND_NEXT_BIT
-
#
# Define implied options from the CPU selection here
-config X86_L1_CACHE_BYTES
- int
- default "128" if MPSC
- default "64" if GENERIC_CPU || MK8 || MCORE2 || X86_32
-
-config X86_INTERNODE_CACHE_BYTES
+config X86_INTERNODE_CACHE_SHIFT
int
- default "4096" if X86_VSMP
- default X86_L1_CACHE_BYTES if !X86_VSMP
-
-config X86_CMPXCHG
- def_bool X86_64 || (X86_32 && !M386)
+ default "12" if X86_VSMP
+ default X86_L1_CACHE_SHIFT
config X86_L1_CACHE_SHIFT
int
- default "7" if MPENTIUM4 || MPSC
- default "4" if X86_ELAN || M486 || M386 || MGEODEGX1
+ default "7" if MPENTIUM4
+ default "6" if MK7 || MPENTIUMM || MATOM || MVIAC7 || X86_GENERIC || X86_64
+ default "4" if MELAN || M486SX || M486 || MGEODEGX1
default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
- default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7 || X86_GENERIC || GENERIC_CPU
-
-config X86_XADD
- def_bool y
- depends on X86_32 && !M386
-
-config X86_PPRO_FENCE
- bool "PentiumPro memory ordering errata workaround"
- depends on M686 || M586MMX || M586TSC || M586 || M486 || M386 || MGEODEGX1
- ---help---
- Old PentiumPro multiprocessor systems had errata that could cause
- memory operations to violate the x86 ordering standard in rare cases.
- Enabling this option will attempt to work around some (but not all)
- occurances of this problem, at the cost of much heavier spinlock and
- memory barrier operations.
-
- If unsure, say n here. Even distro kernels should think twice before
- enabling this: there are few systems, and an unlikely bug.
config X86_F00F_BUG
def_bool y
- depends on M586MMX || M586TSC || M586 || M486 || M386
-
-config X86_WP_WORKS_OK
- def_bool y
- depends on !M386
-
-config X86_INVLPG
- def_bool y
- depends on X86_32 && !M386
-
-config X86_BSWAP
- def_bool y
- depends on X86_32 && !M386
+ depends on M586MMX || M586TSC || M586 || M486SX || M486
-config X86_POPAD_OK
+config X86_INVD_BUG
def_bool y
- depends on X86_32 && !M386
+ depends on M486SX || M486
config X86_ALIGNMENT_16
def_bool y
- depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1
+ depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC || M586 || M486SX || M486 || MVIAC3_2 || MGEODEGX1
config X86_INTEL_USERCOPY
def_bool y
- depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2
+ depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK7 || MEFFICEON
config X86_USE_PPRO_CHECKSUM
def_bool y
- depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2
+ depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MATOM
-config X86_USE_3DNOW
- def_bool y
- depends on (MCYRIXIII || MK7 || MGEODE_LX) && !UML
-
-config X86_OOSTORE
- def_bool y
- depends on (MWINCHIP3D || MWINCHIPC6) && MTRR
-
-#
-# P6_NOPs are a relatively minor optimization that require a family >=
-# 6 processor, except that it is broken on certain VIA chips.
-# Furthermore, AMD chips prefer a totally different sequence of NOPs
-# (which work on all CPUs). In addition, it looks like Virtual PC
-# does not understand them.
-#
-# As a result, disallow these if we're not compiling for X86_64 (these
-# NOPs do work on all x86-64 capable chips); the list of processors in
-# the right-hand clause are the cores that benefit from this optimization.
-#
-config X86_P6_NOP
+config X86_TSC
def_bool y
- depends on X86_64
- depends on (MCORE2 || MPENTIUM4 || MPSC)
+ depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MATOM) || X86_64
-config X86_TSC
+config X86_HAVE_PAE
def_bool y
- depends on ((MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64
+ depends on MCRUSOE || MEFFICEON || MCYRIXIII || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC7 || MATOM || X86_64
-config X86_CMPXCHG64
+config X86_CX8
def_bool y
- depends on X86_PAE || X86_64
+ depends on X86_HAVE_PAE || M586TSC || M586MMX || MK6 || MK7 || MGEODEGX1 || MGEODE_LX
# this should be set for all -march=.. options where the compiler
# generates cmov.
config X86_CMOV
def_bool y
- depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64)
+ depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || MATOM || MGEODE_LX || X86_64)
config X86_MINIMUM_CPU_FAMILY
int
default "64" if X86_64
- default "6" if X86_32 && X86_P6_NOP
- default "4" if X86_32 && (X86_XADD || X86_CMPXCHG || X86_BSWAP || X86_WP_WORKS_OK)
- default "3"
+ default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MK7)
+ default "5" if X86_32 && X86_CX8
+ default "4"
config X86_DEBUGCTLMSR
def_bool y
- depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386) && !UML
+ depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486SX || M486) && !UML
+
+config IA32_FEAT_CTL
+ def_bool y
+ depends on CPU_SUP_INTEL || CPU_SUP_CENTAUR || CPU_SUP_ZHAOXIN
+
+config X86_VMX_FEATURE_NAMES
+ def_bool y
+ depends on IA32_FEAT_CTL
menuconfig PROCESSOR_SELECT
- bool "Supported processor vendors" if EMBEDDED
- ---help---
+ bool "Supported processor vendors" if EXPERT
+ help
This lets you choose what x86 vendor support code your kernel
will include.
+config BROADCAST_TLB_FLUSH
+ def_bool y
+ depends on CPU_SUP_AMD && 64BIT
+
config CPU_SUP_INTEL
default y
bool "Support Intel processors" if PROCESSOR_SELECT
- ---help---
+ help
This enables detection, tunings and quirks for Intel processors
You need this enabled if you want your kernel to run on an
@@ -432,8 +378,8 @@ config CPU_SUP_INTEL
config CPU_SUP_CYRIX_32
default y
bool "Support Cyrix processors" if PROCESSOR_SELECT
- depends on !64BIT
- ---help---
+ depends on M486SX || M486 || M586 || M586TSC || M586MMX || (EXPERT && !64BIT)
+ help
This enables detection, tunings and quirks for Cyrix processors
You need this enabled if you want your kernel to run on a
@@ -446,7 +392,7 @@ config CPU_SUP_CYRIX_32
config CPU_SUP_AMD
default y
bool "Support AMD processors" if PROCESSOR_SELECT
- ---help---
+ help
This enables detection, tunings and quirks for AMD processors
You need this enabled if you want your kernel to run on an
@@ -456,10 +402,24 @@ config CPU_SUP_AMD
If unsure, say N.
+config CPU_SUP_HYGON
+ default y
+ bool "Support Hygon processors" if PROCESSOR_SELECT
+ select CPU_SUP_AMD
+ help
+ This enables detection, tunings and quirks for Hygon processors
+
+ You need this enabled if you want your kernel to run on a
+ Hygon CPU. Disabling this option on other types of CPUs
+ makes the kernel a tiny bit smaller. Disabling it on a Hygon
+ CPU might render the kernel unbootable.
+
+ If unsure, say N.
+
config CPU_SUP_CENTAUR
default y
bool "Support Centaur processors" if PROCESSOR_SELECT
- ---help---
+ help
This enables detection, tunings and quirks for Centaur processors
You need this enabled if you want your kernel to run on a
@@ -473,7 +433,7 @@ config CPU_SUP_TRANSMETA_32
default y
bool "Support Transmeta processors" if PROCESSOR_SELECT
depends on !64BIT
- ---help---
+ help
This enables detection, tunings and quirks for Transmeta processors
You need this enabled if you want your kernel to run on a
@@ -486,8 +446,8 @@ config CPU_SUP_TRANSMETA_32
config CPU_SUP_UMC_32
default y
bool "Support UMC processors" if PROCESSOR_SELECT
- depends on !64BIT
- ---help---
+ depends on M486SX || M486 || (EXPERT && !64BIT)
+ help
This enables detection, tunings and quirks for UMC processors
You need this enabled if you want your kernel to run on a
@@ -497,22 +457,28 @@ config CPU_SUP_UMC_32
If unsure, say N.
-config X86_DS
- def_bool X86_PTRACE_BTS
- depends on X86_DEBUGCTLMSR
- select HAVE_HW_BRANCH_TRACER
+config CPU_SUP_ZHAOXIN
+ default y
+ bool "Support Zhaoxin processors" if PROCESSOR_SELECT
+ help
+ This enables detection, tunings and quirks for Zhaoxin processors
+
+ You need this enabled if you want your kernel to run on a
+ Zhaoxin CPU. Disabling this option on other types of CPUs
+ makes the kernel a tiny bit smaller. Disabling it on a Zhaoxin
+ CPU might render the kernel unbootable.
+
+ If unsure, say N.
-config X86_PTRACE_BTS
- bool "Branch Trace Store"
+config CPU_SUP_VORTEX_32
default y
- depends on X86_DEBUGCTLMSR
- depends on BROKEN
- ---help---
- This adds a ptrace interface to the hardware's branch trace store.
+ bool "Support Vortex processors" if PROCESSOR_SELECT
+ depends on X86_32
+ help
+ This enables detection, tunings and quirks for Vortex processors
- Debuggers may use it to collect an execution trace of the debugged
- application in order to answer the question 'how did I get here?'.
- Debuggers may trace user mode as well as kernel mode.
+ You need this enabled if you want your kernel to run on a
+ Vortex CPU. Disabling this option on other types of CPUs
+ makes the kernel a tiny bit smaller.
- Say Y unless there is no application development on this machine
- and you want to save a small amount of code size.
+ If unsure, say N.
diff --git a/arch/x86/Kconfig.cpufeatures b/arch/x86/Kconfig.cpufeatures
new file mode 100644
index 000000000000..733d5aff2456
--- /dev/null
+++ b/arch/x86/Kconfig.cpufeatures
@@ -0,0 +1,201 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# x86 feature bits (see arch/x86/include/asm/cpufeatures.h) that are
+# either REQUIRED to be enabled, or DISABLED (always ignored) for this
+# particular compile-time configuration. The tests for these features
+# are turned into compile-time constants via the generated
+# <asm/cpufeaturemasks.h>.
+#
+# The naming of these variables *must* match asm/cpufeatures.h, e.g.,
+# X86_FEATURE_ALWAYS <==> X86_REQUIRED_FEATURE_ALWAYS
+# X86_FEATURE_FRED <==> X86_DISABLED_FEATURE_FRED
+#
+# And these REQUIRED and DISABLED config options are manipulated in an
+# AWK script as the following example:
+#
+# +----------------------+
+# | X86_FRED = y ? |
+# +----------------------+
+# / \
+# Y / \ N
+# +-------------------------------------+ +-------------------------------+
+# | X86_DISABLED_FEATURE_FRED undefined | | X86_DISABLED_FEATURE_FRED = y |
+# +-------------------------------------+ +-------------------------------+
+# |
+# |
+# +-------------------------------------------+ |
+# | X86_FEATURE_FRED: feature word 12, bit 17 | ---->|
+# +-------------------------------------------+ |
+# |
+# |
+# +-------------------------------+
+# | set bit 17 of DISABLED_MASK12 |
+# +-------------------------------+
+#
+
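+# As a concrete illustration (a sketch of the idea, not the verbatim
+# generated output): with CONFIG_X86_FRED=n, X86_DISABLED_FEATURE_FRED=y
+# below makes the AWK script set bit 17 in the word-12 disabled mask of
+# <asm/cpufeaturemasks.h>, roughly:
+#
+# #define DISABLED_MASK12 (1U << 17) /* FRED */
+#
+# so cpu_feature_enabled(X86_FEATURE_FRED) folds to a compile-time
+# 'false' and the code guarded by it is optimized away.
+#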
+config X86_REQUIRED_FEATURE_ALWAYS
+ def_bool y
+
+config X86_REQUIRED_FEATURE_NOPL
+ def_bool y
+ depends on X86_64 || X86_P6_NOP
+
+config X86_REQUIRED_FEATURE_CX8
+ def_bool y
+ depends on X86_CX8
+
+# this should be set for all -march=.. options where the compiler
+# generates cmov.
+config X86_REQUIRED_FEATURE_CMOV
+ def_bool y
+ depends on X86_CMOV
+
+# this should be set for all -march= options where the compiler
+# generates movbe.
+config X86_REQUIRED_FEATURE_MOVBE
+ def_bool y
+ depends on MATOM
+
+config X86_REQUIRED_FEATURE_CPUID
+ def_bool y
+ depends on X86_64
+
+config X86_REQUIRED_FEATURE_UP
+ def_bool y
+ depends on !SMP
+
+config X86_REQUIRED_FEATURE_FPU
+ def_bool y
+ depends on !MATH_EMULATION
+
+config X86_REQUIRED_FEATURE_PAE
+ def_bool y
+ depends on X86_64 || X86_PAE
+
+config X86_REQUIRED_FEATURE_PSE
+ def_bool y
+ depends on X86_64 && !PARAVIRT_XXL
+
+config X86_REQUIRED_FEATURE_PGE
+ def_bool y
+ depends on X86_64 && !PARAVIRT_XXL
+
+config X86_REQUIRED_FEATURE_MSR
+ def_bool y
+ depends on X86_64
+
+config X86_REQUIRED_FEATURE_FXSR
+ def_bool y
+ depends on X86_64
+
+config X86_REQUIRED_FEATURE_XMM
+ def_bool y
+ depends on X86_64
+
+config X86_REQUIRED_FEATURE_XMM2
+ def_bool y
+ depends on X86_64
+
+config X86_REQUIRED_FEATURE_LM
+ def_bool y
+ depends on X86_64
+
+config X86_DISABLED_FEATURE_UMIP
+ def_bool y
+ depends on !X86_UMIP
+
+config X86_DISABLED_FEATURE_VME
+ def_bool y
+ depends on X86_64
+
+config X86_DISABLED_FEATURE_K6_MTRR
+ def_bool y
+ depends on X86_64
+
+config X86_DISABLED_FEATURE_CYRIX_ARR
+ def_bool y
+ depends on X86_64
+
+config X86_DISABLED_FEATURE_CENTAUR_MCR
+ def_bool y
+ depends on X86_64
+
+config X86_DISABLED_FEATURE_PCID
+ def_bool y
+ depends on !X86_64
+
+config X86_DISABLED_FEATURE_LASS
+ def_bool y
+ depends on X86_32
+
+config X86_DISABLED_FEATURE_PKU
+ def_bool y
+ depends on !X86_INTEL_MEMORY_PROTECTION_KEYS
+
+config X86_DISABLED_FEATURE_OSPKE
+ def_bool y
+ depends on !X86_INTEL_MEMORY_PROTECTION_KEYS
+
+config X86_DISABLED_FEATURE_PTI
+ def_bool y
+ depends on !MITIGATION_PAGE_TABLE_ISOLATION
+
+config X86_DISABLED_FEATURE_RETPOLINE
+ def_bool y
+ depends on !MITIGATION_RETPOLINE
+
+config X86_DISABLED_FEATURE_RETPOLINE_LFENCE
+ def_bool y
+ depends on !MITIGATION_RETPOLINE
+
+config X86_DISABLED_FEATURE_RETHUNK
+ def_bool y
+ depends on !MITIGATION_RETHUNK
+
+config X86_DISABLED_FEATURE_UNRET
+ def_bool y
+ depends on !MITIGATION_UNRET_ENTRY
+
+config X86_DISABLED_FEATURE_CALL_DEPTH
+ def_bool y
+ depends on !MITIGATION_CALL_DEPTH_TRACKING
+
+config X86_DISABLED_FEATURE_LAM
+ def_bool y
+ depends on !ADDRESS_MASKING
+
+config X86_DISABLED_FEATURE_ENQCMD
+ def_bool y
+ depends on !INTEL_IOMMU_SVM
+
+config X86_DISABLED_FEATURE_SGX
+ def_bool y
+ depends on !X86_SGX
+
+config X86_DISABLED_FEATURE_XENPV
+ def_bool y
+ depends on !XEN_PV
+
+config X86_DISABLED_FEATURE_TDX_GUEST
+ def_bool y
+ depends on !INTEL_TDX_GUEST
+
+config X86_DISABLED_FEATURE_USER_SHSTK
+ def_bool y
+ depends on !X86_USER_SHADOW_STACK
+
+config X86_DISABLED_FEATURE_IBT
+ def_bool y
+ depends on !X86_KERNEL_IBT
+
+config X86_DISABLED_FEATURE_FRED
+ def_bool y
+ depends on !X86_FRED
+
+config X86_DISABLED_FEATURE_SEV_SNP
+ def_bool y
+ depends on !KVM_AMD_SEV
+
+config X86_DISABLED_FEATURE_INVLPGB
+ def_bool y
+ depends on !BROADCAST_TLB_FLUSH
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index d105f29bb6bb..c95c3aaadf97 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -1,153 +1,94 @@
-menu "Kernel hacking"
+# SPDX-License-Identifier: GPL-2.0
-config TRACE_IRQFLAGS_SUPPORT
- def_bool y
-
-source "lib/Kconfig.debug"
-
-config STRICT_DEVMEM
- bool "Filter access to /dev/mem"
- ---help---
- If this option is disabled, you allow userspace (root) access to all
- of memory, including kernel and userspace memory. Accidental
- access to this is obviously disastrous, but specific access can
- be used by people debugging the kernel. Note that with PAT support
- enabled, even in this case there are restrictions on /dev/mem
- use due to the cache aliasing requirements.
-
- If this option is switched on, the /dev/mem file only allows
- userspace access to PCI space and the BIOS code and data regions.
- This is sufficient for dosemu and X and all common users of
- /dev/mem.
-
- If in doubt, say Y.
+config EARLY_PRINTK_USB
+ bool
config X86_VERBOSE_BOOTUP
bool "Enable verbose x86 bootup info messages"
default y
- ---help---
+ help
Enables the informational output from the decompression stage
(e.g. bzImage) of the boot. If you disable this you will still
see errors. Disable this if you want silent bootup.
config EARLY_PRINTK
- bool "Early printk" if EMBEDDED
+ bool "Early printk" if EXPERT
default y
- ---help---
+ help
Write kernel log output directly into the VGA buffer or to a serial
port.
This is useful for kernel debugging when your machine crashes very
early before the console code is initialized. For normal operation
it is not recommended because it looks ugly and doesn't cooperate
- with klogd/syslogd or the X server. You should normally N here,
+ with klogd/syslogd or the X server. You should normally say N here,
unless you want to debug such a crash.
config EARLY_PRINTK_DBGP
bool "Early printk via EHCI debug port"
- default n
depends on EARLY_PRINTK && PCI
- ---help---
+ select EARLY_PRINTK_USB
+ help
Write kernel log output directly into the EHCI debug port.
This is useful for kernel debugging when your machine crashes very
early before the console code is initialized. For normal operation
it is not recommended because it looks ugly and doesn't cooperate
- with klogd/syslogd or the X server. You should normally N here,
+ with klogd/syslogd or the X server. You should normally say N here,
unless you want to debug such a crash. You need a USB debug device.
-config DEBUG_STACKOVERFLOW
- bool "Check for stack overflows"
- depends on DEBUG_KERNEL
- ---help---
- This option will cause messages to be printed if free stack space
- drops below a certain limit.
-
-config DEBUG_STACK_USAGE
- bool "Stack utilization instrumentation"
+config EARLY_PRINTK_USB_XDBC
+ bool "Early printk via the xHCI debug port"
+ depends on EARLY_PRINTK && PCI
+ select EARLY_PRINTK_USB
+ help
+ Write kernel log output directly into the xHCI debug port.
+
+ One use for this feature is kernel debugging, for example when your
+ machine crashes very early before the regular console code is
+ initialized. Other uses include simpler, lockless logging instead of
+ a full-blown printk console driver + klogd.
+
+ For normal production environments this is not recommended,
+ because it doesn't feed events into klogd/syslogd and doesn't try to
+ print anything on the screen.
+
+ You should normally say N here, unless you want to debug early
+ crashes or need a very simple printk logging facility.
+
+config EFI_PGT_DUMP
+ bool "Dump the EFI pagetable"
+ depends on EFI
+ select PTDUMP
+ help
+ Enable this if you want to dump the EFI page table before
+ enabling virtual mode. This can be used to debug miscellaneous
+ issues with the mapping of the EFI runtime regions into that
+ table.
+
+config DEBUG_TLBFLUSH
+ bool "Set upper limit of TLB entries to flush one-by-one"
depends on DEBUG_KERNEL
- ---help---
- Enables the display of the minimum amount of free stack which each
- task has ever had available in the sysrq-T and sysrq-P debug output.
+ help
+ X86-only for now.
- This option will slow down process creation somewhat.
+ This option allows the user to tune the amount of TLB entries the
+ kernel flushes one-by-one instead of doing a full TLB flush. In
+ certain situations, the former is cheaper. This is controlled by the
+ tlb_flushall_shift knob under /sys/kernel/debug/x86. If you set it
+ to -1, the code flushes the whole TLB unconditionally. Otherwise,
+ for positive values of it, the kernel will use single TLB entry
+ invalidating instructions according to the following formula:
-config DEBUG_PER_CPU_MAPS
- bool "Debug access to per_cpu maps"
- depends on DEBUG_KERNEL
- depends on SMP
- default n
- ---help---
- Say Y to verify that the per_cpu map being accessed has
- been setup. Adds a fair amount of code to kernel memory
- and decreases performance.
+ flush_entries <= active_tlb_entries / 2^tlb_flushall_shift
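+
+ For example, with (hypothetical) 512 active TLB entries and a
+ tlb_flushall_shift of 6, at most 512 / 2^6 = 8 entries are
+ invalidated one-by-one; anything beyond that uses a full flush.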
- Say N if unsure.
-
-config X86_PTDUMP
- bool "Export kernel pagetable layout to userspace via debugfs"
- depends on DEBUG_KERNEL
- select DEBUG_FS
- ---help---
- Say Y here if you want to show the kernel pagetable layout in a
- debugfs file. This information is only useful for kernel developers
- who are working in architecture specific areas of the kernel.
- It is probably not a good idea to enable this feature in a production
- kernel.
- If in doubt, say "N"
-
-config DEBUG_RODATA
- bool "Write protect kernel read-only data structures"
- default y
- depends on DEBUG_KERNEL
- ---help---
- Mark the kernel read-only data as write-protected in the pagetables,
- in order to catch accidental (and incorrect) writes to such const
- data. This is recommended so that we can catch kernel bugs sooner.
- If in doubt, say "Y".
-
-config DEBUG_RODATA_TEST
- bool "Testcase for the DEBUG_RODATA feature"
- depends on DEBUG_RODATA
- default y
- ---help---
- This option enables a testcase for the DEBUG_RODATA
- feature as well as for the change_page_attr() infrastructure.
- If in doubt, say "N"
-
-config DEBUG_NX_TEST
- tristate "Testcase for the NX non-executable stack feature"
- depends on DEBUG_KERNEL && m
- ---help---
- This option enables a testcase for the CPU NX capability
- and the software setup of this feature.
- If in doubt, say "N"
-
-config 4KSTACKS
- bool "Use 4Kb for kernel stacks instead of 8Kb"
- depends on X86_32
- ---help---
- If you say Y here the kernel will use a 4Kb stacksize for the
- kernel stack attached to each process/thread. This facilitates
- running more threads on a system and also reduces the pressure
- on the VM subsystem for higher order allocations. This option
- will also use IRQ stacks to compensate for the reduced stackspace.
-
-config DOUBLEFAULT
- default y
- bool "Enable doublefault exception handler" if EMBEDDED
- depends on X86_32
- ---help---
- This option allows trapping of rare doublefault exceptions that
- would otherwise cause a system to silently reboot. Disabling this
- option saves about 4k and might cause you much additional grey
- hair.
+ If in doubt, say "N".
config IOMMU_DEBUG
bool "Enable IOMMU debugging"
depends on GART_IOMMU && DEBUG_KERNEL
depends on X86_64
- ---help---
+ help
Force the IOMMU on even when you have less than 4GB of
memory and add debugging code. On overflow, always panic, and
allow enabling IOMMU leak tracing. Can be disabled at boot
@@ -156,55 +97,28 @@ config IOMMU_DEBUG
code. When you use it make sure you have a big enough
IOMMU/AGP aperture. Most of the options enabled by this can
be set more finegrained using the iommu= command line
- options. See Documentation/x86_64/boot-options.txt for more
+ options. See Documentation/admin-guide/kernel-parameters.txt for more
details.
-config IOMMU_STRESS
- bool "Enable IOMMU stress-test mode"
- ---help---
- This option disables various optimizations in IOMMU related
- code to do real stress testing of the IOMMU code. This option
- will cause a performance drop and should only be enabled for
- testing.
-
config IOMMU_LEAK
bool "IOMMU leak tracing"
depends on IOMMU_DEBUG && DMA_API_DEBUG
- ---help---
+ help
Add a simple leak tracer to the IOMMU code. This is useful when you
are debugging a buggy device driver that leaks IOMMU mappings.
-config X86_DS_SELFTEST
- bool "DS selftest"
- default y
- depends on DEBUG_KERNEL
- depends on X86_DS
- ---help---
- Perform Debug Store selftests at boot time.
- If in doubt, say "N".
-
config HAVE_MMIOTRACE_SUPPORT
def_bool y
-#
-# IO delay types:
-#
-
-config IO_DELAY_TYPE_0X80
- int
- default "0"
-
-config IO_DELAY_TYPE_0XED
- int
- default "1"
-
-config IO_DELAY_TYPE_UDELAY
- int
- default "2"
-
-config IO_DELAY_TYPE_NONE
- int
- default "3"
+config X86_DECODER_SELFTEST
+ bool "x86 instruction decoder selftest"
+ depends on DEBUG_KERNEL && INSTRUCTION_DECODER
+ depends on !COMPILE_TEST
+ help
+ Perform x86 instruction decoder selftests at build time.
+ This option is useful for checking the sanity of x86 instruction
+ decoder code.
+ If unsure, say "N".
choice
prompt "IO delay type"
@@ -212,79 +126,149 @@ choice
config IO_DELAY_0X80
bool "port 0x80 based port-IO delay [recommended]"
- ---help---
+ help
This is the traditional Linux IO delay used for in/out_p.
It is the most tested hence safest selection here.
config IO_DELAY_0XED
bool "port 0xed based port-IO delay"
- ---help---
+ help
Use port 0xed as the IO delay. This frees up port 0x80 which is
often used as a hardware-debug port.
config IO_DELAY_UDELAY
bool "udelay based port-IO delay"
- ---help---
+ help
Use udelay(2) as the IO delay method. This provides the delay
while not having any side-effect on the IO port space.
config IO_DELAY_NONE
bool "no port-IO delay"
- ---help---
+ help
No port-IO delay. Will break on old boxes that require port-IO
delay for certain operations. Should work on most new machines.
endchoice
-if IO_DELAY_0X80
-config DEFAULT_IO_DELAY_TYPE
- int
- default IO_DELAY_TYPE_0X80
-endif
-
-if IO_DELAY_0XED
-config DEFAULT_IO_DELAY_TYPE
- int
- default IO_DELAY_TYPE_0XED
-endif
-
-if IO_DELAY_UDELAY
-config DEFAULT_IO_DELAY_TYPE
- int
- default IO_DELAY_TYPE_UDELAY
-endif
-
-if IO_DELAY_NONE
-config DEFAULT_IO_DELAY_TYPE
- int
- default IO_DELAY_TYPE_NONE
-endif
-
config DEBUG_BOOT_PARAMS
bool "Debug boot parameters"
depends on DEBUG_KERNEL
depends on DEBUG_FS
- ---help---
+ help
This option will cause struct boot_params to be exported via debugfs.
config CPA_DEBUG
bool "CPA self-test code"
depends on DEBUG_KERNEL
- ---help---
+ help
Do change_page_attr() self-tests every 30 seconds.
-config OPTIMIZE_INLINING
- bool "Allow gcc to uninline functions marked 'inline'"
- ---help---
- This option determines if the kernel forces gcc to inline the functions
- developers have marked 'inline'. Doing so takes away freedom from gcc to
- do what it thinks is best, which is desirable for the gcc 3.x series of
- compilers. The gcc 4.x series have a rewritten inlining algorithm and
- enabling this option will generate a smaller kernel there. Hopefully
- this algorithm is so good that allowing gcc 4.x and above to make the
- decision will become the default in the future. Until then this option
- is there to test gcc for this.
+config DEBUG_ENTRY
+ bool "Debug low-level entry code"
+ depends on DEBUG_KERNEL
+ help
+ This option enables sanity checks in x86's low-level entry code.
+ Some of these sanity checks may slow down kernel entries and
+ exits or otherwise impact performance.
If unsure, say N.
-endmenu
+config DEBUG_NMI_SELFTEST
+ bool "NMI Selftest"
+ depends on DEBUG_KERNEL && X86_LOCAL_APIC
+ help
+ Enabling this option turns on a quick NMI selftest to verify
+ that the NMI behaves correctly.
+
+ This might help diagnose strange hangs that rely on NMI to
+ function properly.
+
+ If unsure, say N.
+
+config DEBUG_IMR_SELFTEST
+ bool "Isolated Memory Region self test"
+ depends on INTEL_IMR
+ help
+ This option enables automated sanity testing of the IMR code.
+ Some simple tests are run to verify IMR bounds checking, alignment
+ and overlapping. This option is really only useful if you are
+ debugging an IMR memory map or are modifying the IMR code and want to
+ test your changes.
+
+ If unsure, say N here.
+
+config X86_DEBUG_FPU
+ bool "Debug the x86 FPU code"
+ depends on DEBUG_KERNEL
+ default y
+ help
+ If this option is enabled then there will be extra sanity
+ checks and (boot time) debug printouts added to the kernel.
+ This debugging adds some small amount of runtime overhead
+ to the kernel.
+
+ If unsure, say N.
+
+config PUNIT_ATOM_DEBUG
+ tristate "ATOM Punit debug driver"
+ depends on PCI
+ select DEBUG_FS
+ select IOSF_MBI
+ help
+ This is a debug driver which reads the power states
+ of all Punit North Complex devices. The power state of
+ each device is exposed as part of the debugfs interface.
+ The current power state can be read from
+ /sys/kernel/debug/punit_atom/dev_power_state
+
+choice
+ prompt "Choose kernel unwinder"
+ default UNWINDER_ORC if X86_64
+ default UNWINDER_FRAME_POINTER if X86_32
+ help
+ This determines which method will be used for unwinding kernel stack
+ traces for panics, oopses, bugs, warnings, perf, /proc/<pid>/stack,
+ livepatch, lockdep, and more.
+
+config UNWINDER_ORC
+ bool "ORC unwinder"
+ depends on X86_64
+ select OBJTOOL
+ help
+ This option enables the ORC (Oops Rewind Capability) unwinder for
+ unwinding kernel stack traces. It uses a custom data format which is
+ a simplified version of the DWARF Call Frame Information standard.
+
+ This unwinder is more accurate across interrupt entry frames than the
+ frame pointer unwinder. It also enables a 5-10% performance
+ improvement across the entire kernel compared to frame pointers.
+
+ Enabling this option will increase the kernel's runtime memory usage
+ by roughly 2-4MB, depending on your kernel config.
+
+config UNWINDER_FRAME_POINTER
+ bool "Frame pointer unwinder"
+ select ARCH_WANT_FRAME_POINTERS
+ select FRAME_POINTER
+ help
+ This option enables the frame pointer unwinder for unwinding kernel
+ stack traces.
+
+ The unwinder itself is fast and it uses less RAM than the ORC
+ unwinder, but the kernel text size will grow by ~3% and the kernel's
+ overall performance will degrade by roughly 5-10%.
+
+config UNWINDER_GUESS
+ bool "Guess unwinder"
+ depends on EXPERT
+ depends on !STACKDEPOT
+ help
+ This option enables the "guess" unwinder for unwinding kernel stack
+ traces. It scans the stack and reports every kernel text address it
+ finds. Some of the addresses it reports may be incorrect.
+
+ While this option often produces false positives, it can still be
+ useful in many cases. Unlike the other unwinders, it has no runtime
+ overhead.
+
+endchoice
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 1b68659c41b4..1d403a3612ea 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -1,149 +1,299 @@
+# SPDX-License-Identifier: GPL-2.0
# Unified Makefile for i386 and x86_64
# select defconfig based on actual architecture
ifeq ($(ARCH),x86)
+ ifeq ($(shell uname -m | sed -e 's/i.86/i386/'),i386)
KBUILD_DEFCONFIG := i386_defconfig
+ else
+ KBUILD_DEFCONFIG := x86_64_defconfig
+ endif
else
KBUILD_DEFCONFIG := $(ARCH)_defconfig
endif
+ifdef CONFIG_CC_IS_GCC
+RETPOLINE_CFLAGS := -mindirect-branch=thunk-extern -mindirect-branch-register
+RETPOLINE_VDSO_CFLAGS := -mindirect-branch=thunk-inline -mindirect-branch-register
+endif
+ifdef CONFIG_CC_IS_CLANG
+RETPOLINE_CFLAGS := -mretpoline-external-thunk
+RETPOLINE_VDSO_CFLAGS := -mretpoline
+endif
+RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch-cs-prefix)
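+# (With thunk-extern, indirect branches become calls to external
+# __x86_indirect_thunk_<reg> symbols that the kernel provides itself;
+# the vDSO uses thunk-inline because it cannot reference kernel text.)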
+
+ifdef CONFIG_MITIGATION_RETHUNK
+RETHUNK_CFLAGS := -mfunction-return=thunk-extern
+RETHUNK_RUSTFLAGS := -Zfunction-return=thunk-extern
+RETPOLINE_CFLAGS += $(RETHUNK_CFLAGS)
+RETPOLINE_RUSTFLAGS += $(RETHUNK_RUSTFLAGS)
+endif
+
+export RETHUNK_CFLAGS
+export RETHUNK_RUSTFLAGS
+export RETPOLINE_CFLAGS
+export RETPOLINE_RUSTFLAGS
+export RETPOLINE_VDSO_CFLAGS
+
+# For gcc stack alignment is specified with -mpreferred-stack-boundary,
+# clang has the option -mstack-alignment for that purpose.
+ifdef CONFIG_CC_IS_GCC
+ cc_stack_align4 := -mpreferred-stack-boundary=2
+ cc_stack_align8 := -mpreferred-stack-boundary=3
+endif
+ifdef CONFIG_CC_IS_CLANG
+ cc_stack_align4 := -mstack-alignment=4
+ cc_stack_align8 := -mstack-alignment=8
+endif
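+# (Note the differing units: gcc takes log2 of the alignment, so
+# -mpreferred-stack-boundary=2 means 2^2 = 4 bytes, whereas clang's
+# -mstack-alignment takes the byte count directly.)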
+
+# How to compile the 16-bit code. Note we always compile for -march=i386;
+# that way we can complain to the user if the CPU is insufficient.
+REALMODE_CFLAGS := -std=gnu11 -fms-extensions -m16 -g -Os \
+ -DDISABLE_BRANCH_PROFILING -D__DISABLE_EXPORTS \
+ -Wall -Wstrict-prototypes -march=i386 -mregparm=3 \
+ -fno-strict-aliasing -fomit-frame-pointer -fno-pic \
+ -mno-mmx -mno-sse $(call cc-option,-fcf-protection=none)
+
+REALMODE_CFLAGS += -ffreestanding
+REALMODE_CFLAGS += -fno-stack-protector
+REALMODE_CFLAGS += -Wno-address-of-packed-member
+REALMODE_CFLAGS += $(cc_stack_align4)
+REALMODE_CFLAGS += $(CLANG_FLAGS)
+ifdef CONFIG_CC_IS_CLANG
+REALMODE_CFLAGS += -Wno-gnu
+REALMODE_CFLAGS += -Wno-microsoft-anon-tag
+endif
+export REALMODE_CFLAGS
+
# BITS is used as extension for files which are available in a 32 bit
# and a 64 bit version to simplify shared Makefiles.
# e.g.: obj-y += foo_$(BITS).o
export BITS
+#
+# Prevent GCC from generating any FP code by mistake.
+#
+# This must happen before we try the -mpreferred-stack-boundary, see:
+#
+# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383
+#
+KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -mno-sse4a
+KBUILD_RUSTFLAGS += --target=$(objtree)/scripts/target.json
+KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2
+
+#
+# CFLAGS for compiling floating point code inside the kernel.
+#
+CC_FLAGS_FPU := -msse -msse2
+ifdef CONFIG_CC_IS_GCC
+CC_FLAGS_FPU += -mhard-float
+endif
+
+ifeq ($(CONFIG_X86_KERNEL_IBT),y)
+#
+# Kernel IBT has S_CET.NOTRACK_EN=0, as such the compilers must not generate
+# NOTRACK prefixes. Current generation compilers unconditionally employ NOTRACK
+# for jump-tables, as such, disable jump-tables for now.
+#
+# (jump-tables are implicitly disabled by RETPOLINE)
+#
+# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104816
+#
+KBUILD_CFLAGS += $(call cc-option,-fcf-protection=branch -fno-jump-tables)
+KBUILD_RUSTFLAGS += -Zcf-protection=branch $(if $(call rustc-min-version,109300),-Cjump-tables=n,-Zno-jump-tables)
+else
+KBUILD_CFLAGS += $(call cc-option,-fcf-protection=none)
+endif
+
ifeq ($(CONFIG_X86_32),y)
BITS := 32
UTS_MACHINE := i386
CHECKFLAGS += -D__i386__
- biarch := $(call cc-option,-m32)
- KBUILD_AFLAGS += $(biarch)
- KBUILD_CFLAGS += $(biarch)
-
- ifdef CONFIG_RELOCATABLE
- LDFLAGS_vmlinux := --emit-relocs
- endif
+ KBUILD_AFLAGS += -m32
+ KBUILD_CFLAGS += -m32
KBUILD_CFLAGS += -msoft-float -mregparm=3 -freg-struct-return
- # prevent gcc from keeping the stack 16 byte aligned
- KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2)
+ # Never want PIC in a 32-bit kernel, prevent breakage with GCC built
+ # with nonstandard options
+ KBUILD_CFLAGS += -fno-pic
- # Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use
- # a lot more stack due to the lack of sharing of stacklots:
- KBUILD_CFLAGS += $(shell if [ $(call cc-version) -lt 0400 ] ; then \
- echo $(call cc-option,-fno-unit-at-a-time); fi ;)
+ # Align the stack to the register width instead of using the default
+ # alignment of 16 bytes. This reduces stack usage and the number of
+ # alignment instructions.
+ KBUILD_CFLAGS += $(cc_stack_align4)
# CPU-specific tuning. Anything which can be shared with UML should go here.
include $(srctree)/arch/x86/Makefile_32.cpu
KBUILD_CFLAGS += $(cflags-y)
- # temporary until string.h is fixed
+ ifneq ($(call clang-min-version, 160000),y)
+ # https://github.com/llvm/llvm-project/issues/53645
KBUILD_CFLAGS += -ffreestanding
+ endif
+
+ percpu_seg := fs
else
BITS := 64
UTS_MACHINE := x86_64
- CHECKFLAGS += -D__x86_64__ -m64
+ CHECKFLAGS += -D__x86_64__
KBUILD_AFLAGS += -m64
KBUILD_CFLAGS += -m64
- # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu)
- cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8)
- cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)
+ # Align jump targets to 1 byte, not the default 16 bytes:
+ KBUILD_CFLAGS += $(call cc-option,-falign-jumps=1)
- cflags-$(CONFIG_MCORE2) += \
- $(call cc-option,-march=core2,$(call cc-option,-mtune=generic))
- cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic)
- KBUILD_CFLAGS += $(cflags-y)
+ # Pack loops tightly as well:
+ KBUILD_CFLAGS += $(call cc-option,-falign-loops=1)
- KBUILD_CFLAGS += -mno-red-zone
- KBUILD_CFLAGS += -mcmodel=kernel
+ # Don't autogenerate traditional x87 instructions
+ KBUILD_CFLAGS += -mno-80387
+ KBUILD_CFLAGS += -mno-fp-ret-in-387
- # -funit-at-a-time shrinks the kernel .text considerably
- # unfortunately it makes reading oopses harder.
- KBUILD_CFLAGS += $(call cc-option,-funit-at-a-time)
+ # By default gcc and clang use a stack alignment of 16 bytes for x86.
+ # However the standard kernel entry on x86-64 leaves the stack on an
+ # 8-byte boundary. If the compiler isn't informed about the actual
+ # alignment it will generate extra alignment instructions for the
+ # default alignment which keep the stack *mis*aligned.
+ # Furthermore an alignment to the register width reduces stack usage
+ # and the number of alignment instructions.
+ KBUILD_CFLAGS += $(cc_stack_align8)
- # this works around some issues with generating unwind tables in older gccs
- # newer gccs do it by default
- KBUILD_CFLAGS += -maccumulate-outgoing-args
-endif
+ # Use -mskip-rax-setup if supported.
+ KBUILD_CFLAGS += -mskip-rax-setup
-ifdef CONFIG_CC_STACKPROTECTOR
- cc_has_sp := $(srctree)/scripts/gcc-x86_$(BITS)-has-stack-protector.sh
- ifeq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC)),y)
- stackp-y := -fstack-protector
- stackp-$(CONFIG_CC_STACKPROTECTOR_ALL) += -fstack-protector-all
- KBUILD_CFLAGS += $(stackp-y)
- else
- $(warning stack protector enabled but no compiler support)
- endif
+ifdef CONFIG_X86_NATIVE_CPU
+ KBUILD_CFLAGS += -march=native
+ KBUILD_RUSTFLAGS += -Ctarget-cpu=native
+else
+ KBUILD_CFLAGS += -march=x86-64 -mtune=generic
+ KBUILD_RUSTFLAGS += -Ctarget-cpu=x86-64 -Ztune-cpu=generic
endif
-# Don't unroll struct assignments with kmemcheck enabled
-ifeq ($(CONFIG_KMEMCHECK),y)
- KBUILD_CFLAGS += $(call cc-option,-fno-builtin-memcpy)
+ KBUILD_CFLAGS += -mno-red-zone
+ KBUILD_CFLAGS += -mcmodel=kernel
+ KBUILD_RUSTFLAGS += -Cno-redzone=y
+ KBUILD_RUSTFLAGS += -Ccode-model=kernel
+
+ percpu_seg := gs
endif
-# Stackpointer is addressed different for 32 bit and 64 bit x86
-sp-$(CONFIG_X86_32) := esp
-sp-$(CONFIG_X86_64) := rsp
+ifeq ($(CONFIG_STACKPROTECTOR),y)
+ ifeq ($(CONFIG_SMP),y)
+ KBUILD_CFLAGS += -mstack-protector-guard-reg=$(percpu_seg)
+ KBUILD_CFLAGS += -mstack-protector-guard-symbol=__ref_stack_chk_guard
+ else
+ KBUILD_CFLAGS += -mstack-protector-guard=global
+ endif
+endif
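+# (With SMP the canary is read through the per-CPU segment register,
+# %fs on 32-bit or %gs on 64-bit, at __ref_stack_chk_guard; on UP a
+# single global guard variable suffices.)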
-# do binutils support CFI?
-cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_endproc,-DCONFIG_AS_CFI=1)
-# is .cfi_signal_frame supported too?
-cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1)
-KBUILD_AFLAGS += $(cfi) $(cfi-sigframe)
-KBUILD_CFLAGS += $(cfi) $(cfi-sigframe)
+#
+# If the function graph tracer is used with mcount instead of fentry,
+# '-maccumulate-outgoing-args' is needed to prevent a GCC bug
+# (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=42109)
+#
+ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ ifndef CONFIG_HAVE_FENTRY
+ ACCUMULATE_OUTGOING_ARGS := 1
+ endif
+endif
-LDFLAGS := -m elf_$(UTS_MACHINE)
+ifeq ($(ACCUMULATE_OUTGOING_ARGS), 1)
+ # This compiler flag is not supported by Clang:
+ KBUILD_CFLAGS += $(call cc-option,-maccumulate-outgoing-args,)
+endif
-# Speed up the build
-KBUILD_CFLAGS += -pipe
# Workaround for a GCC prerelease that unfortunately shipped in a SUSE release
KBUILD_CFLAGS += -Wno-sign-compare
#
KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
-# prevent gcc from generating any FP code by mistake
-KBUILD_CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,)
-KBUILD_CFLAGS += $(mflags-y)
-KBUILD_AFLAGS += $(mflags-y)
+# Avoid indirect branches in kernel to deal with Spectre
+ifdef CONFIG_MITIGATION_RETPOLINE
+ KBUILD_CFLAGS += $(RETPOLINE_CFLAGS)
+ KBUILD_RUSTFLAGS += $(RETPOLINE_RUSTFLAGS)
+ # Additionally, avoid generating expensive indirect jumps which
+ # are subject to retpolines for small number of switch cases.
+ # LLVM turns off jump table generation by default when under
+ # retpoline builds, however, gcc does not for x86. This has
+ # only been fixed starting with gcc stable version 8.4.0;
+ # older versions still need the workaround. See gcc bug #86952.
+ ifndef CONFIG_CC_IS_CLANG
+ KBUILD_CFLAGS += -fno-jump-tables
+ endif
+endif
+
+ifdef CONFIG_MITIGATION_SLS
+ KBUILD_CFLAGS += -mharden-sls=all
+endif
+
+ifdef CONFIG_CALL_PADDING
+PADDING_CFLAGS := -fpatchable-function-entry=$(CONFIG_FUNCTION_PADDING_BYTES),$(CONFIG_FUNCTION_PADDING_BYTES)
+KBUILD_CFLAGS += $(PADDING_CFLAGS)
+export PADDING_CFLAGS
+
+PADDING_RUSTFLAGS := -Zpatchable-function-entry=$(CONFIG_FUNCTION_PADDING_BYTES),$(CONFIG_FUNCTION_PADDING_BYTES)
+KBUILD_RUSTFLAGS += $(PADDING_RUSTFLAGS)
+export PADDING_RUSTFLAGS
+endif
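+# (-fpatchable-function-entry=N,M emits N padding NOPs with M of them
+# placed before the function entry point; N equals M here, so all of
+# the padding lands ahead of the symbol, where it can be patched.)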
+
+KBUILD_LDFLAGS += -m elf_$(UTS_MACHINE)
+
+#
+# The 64-bit kernel must be aligned to 2MB. Pass -z max-page-size=0x200000 to
+# the linker to force 2MB page size regardless of the default page size used
+# by the linker.
+#
+ifdef CONFIG_X86_64
+LDFLAGS_vmlinux += -z max-page-size=0x200000
+endif
+
+
+archscripts: scripts_basic
+ $(Q)$(MAKE) $(build)=arch/x86/tools relocs
###
-# Kernel objects
+# Syscall table generation
-head-y := arch/x86/kernel/head_$(BITS).o
-head-y += arch/x86/kernel/head$(BITS).o
-head-y += arch/x86/kernel/head.o
-head-y += arch/x86/kernel/init_task.o
+archheaders:
+ $(Q)$(MAKE) $(build)=arch/x86/entry/syscalls all
-libs-y += arch/x86/lib/
+###
+# <asm/cpufeaturemasks.h> header generation
+
+cpufeaturemasks.hdr := arch/x86/include/generated/asm/cpufeaturemasks.h
+cpufeaturemasks.awk := $(srctree)/arch/x86/tools/cpufeaturemasks.awk
+cpufeatures_hdr := $(srctree)/arch/x86/include/asm/cpufeatures.h
+targets += $(cpufeaturemasks.hdr)
+ filechk_gen_featuremasks = $(AWK) -f $(cpufeaturemasks.awk) $(cpufeatures_hdr) $(KCONFIG_CONFIG)
+
+$(cpufeaturemasks.hdr): $(cpufeaturemasks.awk) $(cpufeatures_hdr) $(KCONFIG_CONFIG) FORCE
+ $(shell mkdir -p $(dir $@))
+ $(call filechk,gen_featuremasks)
+archprepare: $(cpufeaturemasks.hdr)
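+
+# (filechk compares the freshly generated content against the existing
+# header and rewrites it only on change, so dependent objects are not
+# rebuilt after every .config touch unless the masks actually differ.)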
-# See arch/x86/Kbuild for content of core part of the kernel
-core-y += arch/x86/
+###
+# Kernel objects
+
+libs-y += arch/x86/lib/
# drivers-y are linked after core-y
drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/
drivers-$(CONFIG_PCI) += arch/x86/pci/
-# must be linked after kernel/
-drivers-$(CONFIG_OPROFILE) += arch/x86/oprofile/
-
# suspend and hibernation support
drivers-$(CONFIG_PM) += arch/x86/power/
-ifeq ($(CONFIG_X86_32),y)
-drivers-$(CONFIG_FB) += arch/x86/video/
-endif
+drivers-$(CONFIG_VIDEO) += arch/x86/video/
####
# boot loader support. Several targets are kept for legacy purposes
boot := arch/x86/boot
-BOOT_TARGETS = bzlilo bzdisk fdimage fdimage144 fdimage288 isoimage
+BOOT_TARGETS = bzdisk fdimage fdimage144 fdimage288 hdimage isoimage
PHONY += bzImage $(BOOT_TARGETS)
@@ -154,6 +304,9 @@ all: bzImage
KBUILD_IMAGE := $(boot)/bzImage
bzImage: vmlinux
+ifeq ($(CONFIG_X86_DECODER_SELFTEST),y)
+ $(Q)$(MAKE) $(build)=arch/x86/tools posttest
+endif
$(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
$(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot
$(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@
@@ -163,28 +316,51 @@ $(BOOT_TARGETS): vmlinux
PHONY += install
install:
- $(Q)$(MAKE) $(build)=$(boot) $@
+ $(call cmd,install)
+
+vdso-install-$(CONFIG_X86_64) += arch/x86/entry/vdso/vdso64.so.dbg
+vdso-install-$(CONFIG_X86_X32_ABI) += arch/x86/entry/vdso/vdsox32.so.dbg
+vdso-install-$(CONFIG_COMPAT_32) += arch/x86/entry/vdso/vdso32.so.dbg
+
+archprepare: checkbin
+checkbin:
+ifdef CONFIG_MITIGATION_RETPOLINE
+ifeq ($(RETPOLINE_CFLAGS),)
+ @echo "You are building kernel with non-retpoline compiler." >&2
+ @echo "Please update your compiler." >&2
+ @false
+endif
+endif
-PHONY += vdso_install
-vdso_install:
- $(Q)$(MAKE) $(build)=arch/x86/vdso $@
+ifdef CONFIG_UNWINDER_ORC
+orc_hash_h := arch/$(SRCARCH)/include/generated/asm/orc_hash.h
+orc_hash_sh := $(srctree)/scripts/orc_hash.sh
+targets += $(orc_hash_h)
+quiet_cmd_orc_hash = GEN $@
+ cmd_orc_hash = mkdir -p $(dir $@); \
+ $(CONFIG_SHELL) $(orc_hash_sh) < $< > $@
+$(orc_hash_h): $(srctree)/arch/x86/include/asm/orc_types.h $(orc_hash_sh) FORCE
+ $(call if_changed,orc_hash)
+archprepare: $(orc_hash_h)
+endif
archclean:
$(Q)rm -rf $(objtree)/arch/i386
$(Q)rm -rf $(objtree)/arch/x86_64
- $(Q)$(MAKE) $(clean)=$(boot)
define archhelp
- echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)'
- echo ' install - Install kernel using'
- echo ' (your) ~/bin/installkernel or'
- echo ' (distribution) /sbin/installkernel or'
- echo ' install to $$(INSTALL_PATH) and run lilo'
- echo ' fdimage - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
- echo ' fdimage144 - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
- echo ' fdimage288 - Create 2.8MB boot floppy image (arch/x86/boot/fdimage)'
- echo ' isoimage - Create a boot CD-ROM image (arch/x86/boot/image.iso)'
- echo ' bzdisk/fdimage*/isoimage also accept:'
- echo ' FDARGS="..." arguments for the booted kernel'
- echo ' FDINITRD=file initrd for the booted kernel'
+ echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)'
+ echo ' install - Install kernel using (your) ~/bin/$(INSTALLKERNEL) or'
+ echo ' (distribution) /sbin/$(INSTALLKERNEL) or install to '
+ echo ' $$(INSTALL_PATH) and run lilo'
+ echo ''
+ echo ' fdimage - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
+ echo ' fdimage144 - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
+ echo ' fdimage288 - Create 2.8MB boot floppy image (arch/x86/boot/fdimage)'
+ echo ' hdimage - Create a BIOS/EFI hard disk image (arch/x86/boot/hdimage)'
+ echo ' isoimage - Create a boot CD-ROM image (arch/x86/boot/image.iso)'
+ echo ' bzdisk/fdimage*/hdimage/isoimage also accept:'
+ echo ' FDARGS="..." arguments for the booted kernel'
+ echo ' FDINITRD=file initrd for the booted kernel'
+
endef
diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um
new file mode 100644
index 000000000000..c86cbd9cbba3
--- /dev/null
+++ b/arch/x86/Makefile.um
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: GPL-2.0
+core-y += arch/x86/crypto/
+
+#
+# Disable SSE and other FP/SIMD instructions to match normal x86
+# This is required to work around issues in older LLVM versions, but breaks
+# GCC versions < 11. See:
+# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99652
+#
+ifeq ($(call gcc-min-version, 110000)$(CONFIG_CC_IS_CLANG),y)
+KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx
+KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2
+endif
+
+KBUILD_RUSTFLAGS += --target=$(objtree)/scripts/target.json
+
+ifeq ($(CONFIG_X86_32),y)
+START := 0x8048000
+
+KBUILD_LDFLAGS += -m elf_i386
+ELF_ARCH := i386
+ELF_FORMAT := elf32-i386
+CHECKFLAGS += -D__i386__
+
+KBUILD_CFLAGS += $(call cc-option,-m32)
+KBUILD_AFLAGS += $(call cc-option,-m32)
+LINK-y += $(call cc-option,-m32)
+
+LDS_EXTRA := -Ui386
+export LDS_EXTRA
+
+# First of all, tune CFLAGS for the specific CPU. This actually sets cflags-y.
+include $(srctree)/arch/x86/Makefile_32.cpu
+
+# Prevent gcc from keeping the stack 16-byte aligned. Taken from i386.
+cflags-y += $(call cc-option,-mpreferred-stack-boundary=2)
+
+# Prevent sprintf in nfsd from being converted to strcpy and resulting in
+# an unresolved reference.
+cflags-y += -ffreestanding
+
+KBUILD_CFLAGS += $(cflags-y)
+
+else
+
+START := 0x60000000
+
+KBUILD_CFLAGS += -fno-builtin -m64
+
+CHECKFLAGS += -m64 -D__x86_64__
+KBUILD_AFLAGS += -m64
+KBUILD_LDFLAGS += -m elf_x86_64
+KBUILD_CPPFLAGS += -m64
+
+ELF_ARCH := i386:x86-64
+ELF_FORMAT := elf64-x86-64
+
+# Not all 64-bit distros have /lib as a symlink to /lib64; PLD is an example.
+
+LINK-$(CONFIG_LD_SCRIPT_DYN_RPATH) += -Wl,-rpath,/lib64
+LINK-y += -m64
+
+endif
diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu
index 80177ec052f0..af7de9a42752 100644
--- a/arch/x86/Makefile_32.cpu
+++ b/arch/x86/Makefile_32.cpu
@@ -1,16 +1,16 @@
+# SPDX-License-Identifier: GPL-2.0
# CPU tuning section - shared with UML.
# Must change only cflags-y (or [yn]), not CFLAGS! That makes a difference for UML.
-#-mtune exists since gcc 3.4
-HAS_MTUNE := $(call cc-option-yn, -mtune=i386)
-ifeq ($(HAS_MTUNE),y)
tune = $(call cc-option,-mtune=$(1),$(2))
+
+ifdef CONFIG_CC_IS_CLANG
+align := -falign-functions=0 $(call cc-option,-falign-jumps=0) $(call cc-option,-falign-loops=0)
else
-tune = $(call cc-option,-mcpu=$(1),$(2))
+align := -falign-functions=0 -falign-jumps=0 -falign-loops=0
endif
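+
+# e.g. $(call tune,generic,$(call tune,i686)) expands to -mtune=generic
+# when the compiler supports it, falling back to -mtune=i686 otherwise.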
-align := $(cc-option-align)
-cflags-$(CONFIG_M386) += -march=i386
+cflags-$(CONFIG_M486SX) += -march=i486
cflags-$(CONFIG_M486) += -march=i486
cflags-$(CONFIG_M586) += -march=i586
cflags-$(CONFIG_M586TSC) += -march=i586
@@ -24,22 +24,21 @@ cflags-$(CONFIG_MK6) += -march=k6
# Please note that patches that add -march=athlon-xp and friends are pointless.
# They make zero difference whatsoever to performance at this time.
cflags-$(CONFIG_MK7) += -march=athlon
-cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,-march=athlon)
-cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
-cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
+cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align)
+cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align)
cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586)
cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586)
-cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
+cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)
cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686)
cflags-$(CONFIG_MVIAC7) += -march=i686
-cflags-$(CONFIG_MCORE2) += -march=i686 $(call tune,core2)
+cflags-$(CONFIG_MATOM) += -march=atom
# AMD Elan support
-cflags-$(CONFIG_X86_ELAN) += -march=i486
+cflags-$(CONFIG_MELAN) += -march=i486
# Geode GX1 support
cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx
-
+cflags-$(CONFIG_MGEODE_LX) += $(call cc-option,-march=geode,-march=pentium-mmx)
# add at the end to overwrite eventual tuning options from earlier
# cpu entries
cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686))
diff --git a/arch/x86/boot/.gitignore b/arch/x86/boot/.gitignore
index 851fe936d242..070ef534c915 100644
--- a/arch/x86/boot/.gitignore
+++ b/arch/x86/boot/.gitignore
@@ -1,10 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0-only
bootsect
bzImage
cpustr.h
mkcpustr
-offsets.h
voffset.h
zoffset.h
setup
setup.bin
setup.elf
+fdimage
+mtools.conf
+image.iso
+hdimage
+tools/
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index ec749c2bfdd7..3f9fb3698d66 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -9,12 +9,6 @@
# Changed by many, many contributors over the years.
#
-# ROOT_DEV specifies the default root-device when making the image.
-# This can be either FLOPPY, CURRENT, /dev/xxxx or empty, in which case
-# the default of FLOPPY is used by 'build'.
-
-ROOT_DEV := CURRENT
-
# If you want to preset the SVGA mode, uncomment the next line and
# set SVGA_MODE to whatever number you want.
# Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode.
@@ -23,13 +17,13 @@ ROOT_DEV := CURRENT
SVGA_MODE := -DSVGA_MODE=NORMAL_VGA
targets := vmlinux.bin setup.bin setup.elf bzImage
-targets += fdimage fdimage144 fdimage288 image.iso mtools.conf
+targets += fdimage fdimage144 fdimage288 image.iso hdimage
subdir- := compressed
-setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpucheck.o edd.o
-setup-y += header.o main.o mca.o memory.o pm.o pmjump.o
-setup-y += printf.o regs.o string.o tty.o video.o video-mode.o
-setup-y += version.o
+setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpuflags.o cpucheck.o
+setup-y += early_serial_console.o edd.o header.o main.o memory.o
+setup-y += pm.o pmjump.o printf.o regs.o string.o tty.o video.o
+setup-y += video-mode.o version.o
setup-$(CONFIG_X86_APM_BOOT) += apm.o
# The link order of the video-*.o modules can matter. In particular,
@@ -41,46 +35,35 @@ setup-y += video-vesa.o
setup-y += video-bios.o
targets += $(setup-y)
-hostprogs-y := mkcpustr tools/build
+hostprogs += mkcpustr
-HOST_EXTRACFLAGS += $(LINUXINCLUDE)
+HOST_EXTRACFLAGS += -I$(srctree)/tools/include \
+ -include include/generated/autoconf.h \
+ -D__EXPORTED_HEADERS__
$(obj)/cpu.o: $(obj)/cpustr.h
quiet_cmd_cpustr = CPUSTR $@
cmd_cpustr = $(obj)/mkcpustr > $@
-targets += cpustr.h
$(obj)/cpustr.h: $(obj)/mkcpustr FORCE
$(call if_changed,cpustr)
+targets += cpustr.h
# ---------------------------------------------------------------------------
-# How to compile the 16-bit code. Note we always compile for -march=i386,
-# that way we can complain to the user if the CPU is insufficient.
-KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \
- -DDISABLE_BRANCH_PROFILING \
- -Wall -Wstrict-prototypes \
- -march=i386 -mregparm=3 \
- -include $(srctree)/$(src)/code16gcc.h \
- -fno-strict-aliasing -fomit-frame-pointer \
- $(call cc-option, -ffreestanding) \
- $(call cc-option, -fno-toplevel-reorder,\
- $(call cc-option, -fno-unit-at-a-time)) \
- $(call cc-option, -fno-stack-protector) \
- $(call cc-option, -mpreferred-stack-boundary=2)
-KBUILD_CFLAGS += $(call cc-option, -m32)
+KBUILD_CFLAGS := $(REALMODE_CFLAGS) -D_SETUP
KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
-GCOV_PROFILE := n
+KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
+KBUILD_CFLAGS += $(CONFIG_CC_IMPLICIT_FALLTHROUGH)
$(obj)/bzImage: asflags-y := $(SVGA_MODE)
quiet_cmd_image = BUILD $@
-cmd_image = $(obj)/tools/build $(obj)/setup.bin $(obj)/vmlinux.bin \
- $(ROOT_DEV) > $@
+ cmd_image = (dd if=$< bs=4k conv=sync status=none; cat $(filter-out $<,$(real-prereqs))) >$@
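+# (dd's conv=sync pads setup.bin out to a whole 4k block, so the
+# compressed kernel concatenated behind it starts on a 4k boundary.)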
-$(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin $(obj)/tools/build FORCE
+$(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin FORCE
$(call if_changed,image)
- @echo 'Kernel: $@ is ready' ' (#'`cat .version`')'
+ @$(kecho) 'Kernel: $@ is ready' ' (#'$(or $(KBUILD_BUILD_VERSION),`cat .version`)')'
OBJCOPYFLAGS_vmlinux.bin := -O binary -R .note -R .comment -S
$(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE
@@ -88,16 +71,7 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE
SETUP_OBJS = $(addprefix $(obj)/,$(setup-y))
-sed-voffset := -e 's/^\([0-9a-fA-F]*\) . \(_text\|_end\)$$/\#define VO_\2 0x\1/p'
-
-quiet_cmd_voffset = VOFFSET $@
- cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@
-
-targets += voffset.h
-$(obj)/voffset.h: vmlinux FORCE
- $(call if_changed,voffset)
-
-sed-zoffset := -e 's/^\([0-9a-fA-F]*\) . \(startup_32\|input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p'
+sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [a-zA-Z] \(startup_32\|efi.._stub_entry\|efi\(32\)\?_pe_entry\|input_data\|kernel_info\|_end\|_ehead\|_text\|_e\?data\|_e\?sbat\|z_.*\)$$/\#define ZO_\2 0x\1/p'
quiet_cmd_zoffset = ZOFFSET $@
cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@
@@ -107,10 +81,10 @@ $(obj)/zoffset.h: $(obj)/compressed/vmlinux FORCE
$(call if_changed,zoffset)
-AFLAGS_header.o += -I$(obj)
-$(obj)/header.o: $(obj)/voffset.h $(obj)/zoffset.h
+AFLAGS_header.o += -I$(objtree)/$(obj)
+$(obj)/header.o: $(obj)/zoffset.h
-LDFLAGS_setup.elf := -T
+LDFLAGS_setup.elf := -m elf_i386 -z noexecstack -T
$(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE
$(call if_changed,ld)
@@ -122,79 +96,44 @@ $(obj)/compressed/vmlinux: FORCE
$(Q)$(MAKE) $(build)=$(obj)/compressed $@
# Set this if you want to pass append arguments to the
-# bzdisk/fdimage/isoimage kernel
+# bzdisk/fdimage/hdimage/isoimage kernel
FDARGS =
-# Set this if you want an initrd included with the
-# bzdisk/fdimage/isoimage kernel
+# Set this if you want one or more initrds included in the image
FDINITRD =
-image_cmdline = default linux $(FDARGS) $(if $(FDINITRD),initrd=initrd.img,)
+imgdeps = $(obj)/bzImage $(obj)/mtools.conf $(src)/genimage.sh
$(obj)/mtools.conf: $(src)/mtools.conf.in
sed -e 's|@OBJ@|$(obj)|g' < $< > $@
+targets += mtools.conf
+
+# genimage.sh requires bash, but it also has a bunch of other
+# external dependencies.
+quiet_cmd_genimage = GENIMAGE $3
+ cmd_genimage = $(BASH) $(src)/genimage.sh $2 $3 $(obj)/bzImage \
+ $(obj)/mtools.conf '$(FDARGS)' $(FDINITRD)
+
+PHONY += bzdisk fdimage fdimage144 fdimage288 hdimage isoimage
+
# This requires write access to /dev/fd0
-bzdisk: $(obj)/bzImage $(obj)/mtools.conf
- MTOOLSRC=$(obj)/mtools.conf mformat a: ; sync
- syslinux /dev/fd0 ; sync
- echo '$(image_cmdline)' | \
- MTOOLSRC=$(src)/mtools.conf mcopy - a:syslinux.cfg
- if [ -f '$(FDINITRD)' ] ; then \
- MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' a:initrd.img ; \
- fi
- MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage a:linux ; sync
-
-# These require being root or having syslinux 2.02 or higher installed
-fdimage fdimage144: $(obj)/bzImage $(obj)/mtools.conf
- dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=1440
- MTOOLSRC=$(obj)/mtools.conf mformat v: ; sync
- syslinux $(obj)/fdimage ; sync
- echo '$(image_cmdline)' | \
- MTOOLSRC=$(obj)/mtools.conf mcopy - v:syslinux.cfg
- if [ -f '$(FDINITRD)' ] ; then \
- MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' v:initrd.img ; \
- fi
- MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage v:linux ; sync
-
-fdimage288: $(obj)/bzImage $(obj)/mtools.conf
- dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=2880
- MTOOLSRC=$(obj)/mtools.conf mformat w: ; sync
- syslinux $(obj)/fdimage ; sync
- echo '$(image_cmdline)' | \
- MTOOLSRC=$(obj)/mtools.conf mcopy - w:syslinux.cfg
- if [ -f '$(FDINITRD)' ] ; then \
- MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' w:initrd.img ; \
- fi
- MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage w:linux ; sync
-
-isoimage: $(obj)/bzImage
- -rm -rf $(obj)/isoimage
- mkdir $(obj)/isoimage
- for i in lib lib64 share end ; do \
- if [ -f /usr/$$i/syslinux/isolinux.bin ] ; then \
- cp /usr/$$i/syslinux/isolinux.bin $(obj)/isoimage ; \
- break ; \
- fi ; \
- if [ $$i = end ] ; then exit 1 ; fi ; \
- done
- cp $(obj)/bzImage $(obj)/isoimage/linux
- echo '$(image_cmdline)' > $(obj)/isoimage/isolinux.cfg
- if [ -f '$(FDINITRD)' ] ; then \
- cp '$(FDINITRD)' $(obj)/isoimage/initrd.img ; \
- fi
- mkisofs -J -r -o $(obj)/image.iso -b isolinux.bin -c boot.cat \
- -no-emul-boot -boot-load-size 4 -boot-info-table \
- $(obj)/isoimage
- isohybrid $(obj)/image.iso 2>/dev/null || true
- rm -rf $(obj)/isoimage
-
-bzlilo: $(obj)/bzImage
- if [ -f $(INSTALL_PATH)/vmlinuz ]; then mv $(INSTALL_PATH)/vmlinuz $(INSTALL_PATH)/vmlinuz.old; fi
- if [ -f $(INSTALL_PATH)/System.map ]; then mv $(INSTALL_PATH)/System.map $(INSTALL_PATH)/System.old; fi
- cat $(obj)/bzImage > $(INSTALL_PATH)/vmlinuz
- cp System.map $(INSTALL_PATH)/
- if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi
-
-install:
- sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(obj)/bzImage \
- System.map "$(INSTALL_PATH)"
+# All images require syslinux to be installed; hdimage also requires
+# EDK2/OVMF if the kernel is compiled with the EFI stub.
+bzdisk: $(imgdeps)
+ $(call cmd,genimage,bzdisk,/dev/fd0)
+
+fdimage fdimage144: $(imgdeps)
+ $(call cmd,genimage,fdimage144,$(obj)/fdimage)
+ @$(kecho) 'Kernel: $(obj)/fdimage is ready'
+
+fdimage288: $(imgdeps)
+ $(call cmd,genimage,fdimage288,$(obj)/fdimage)
+ @$(kecho) 'Kernel: $(obj)/fdimage is ready'
+
+hdimage: $(imgdeps)
+ $(call cmd,genimage,hdimage,$(obj)/hdimage)
+ @$(kecho) 'Kernel: $(obj)/hdimage is ready'
+
+isoimage: $(imgdeps)
+ $(call cmd,genimage,isoimage,$(obj)/image.iso)
+ @$(kecho) 'Kernel: $(obj)/image.iso is ready'
diff --git a/arch/x86/boot/a20.c b/arch/x86/boot/a20.c
index 64a31a6d751a..bda042933a05 100644
--- a/arch/x86/boot/a20.c
+++ b/arch/x86/boot/a20.c
@@ -1,12 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* -*- linux-c -*- ------------------------------------------------------- *
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007-2008 rPath, Inc. - All Rights Reserved
* Copyright 2009 Intel Corporation; author H. Peter Anvin
*
- * This file is part of the Linux kernel, and is made available under
- * the terms of the GNU General Public License version 2.
- *
* ----------------------------------------------------------------------- */
/*
@@ -137,29 +135,29 @@ int enable_a20(void)
(legacy free, etc.) */
if (a20_test_short())
return 0;
-
+
/* Next, try the BIOS (INT 0x15, AX=0x2401) */
enable_a20_bios();
if (a20_test_short())
return 0;
-
+
/* Try enabling A20 through the keyboard controller */
kbc_err = empty_8042();
if (a20_test_short())
return 0; /* BIOS worked, but with delayed reaction */
-
+
if (!kbc_err) {
enable_a20_kbc();
if (a20_test_long())
return 0;
}
-
+
/* Finally, try enabling the "fast A20 gate" */
enable_a20_fast();
if (a20_test_long())
return 0;
}
-
+
return -1;
}
diff --git a/arch/x86/boot/apm.c b/arch/x86/boot/apm.c
index ee274834ea8b..bda15f9673d5 100644
--- a/arch/x86/boot/apm.c
+++ b/arch/x86/boot/apm.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* -*- linux-c -*- ------------------------------------------------------- *
*
* Copyright (C) 1991, 1992 Linus Torvalds
@@ -7,9 +8,6 @@
* Original APM BIOS checking by Stephen Rothwell, May 1994
* (sfr@canb.auug.org.au)
*
- * This file is part of the Linux kernel, and is made available under
- * the terms of the GNU General Public License version 2.
- *
* ----------------------------------------------------------------------- */
/*
@@ -62,7 +60,7 @@ int query_apm_bios(void)
intcall(0x15, &ireg, &oreg);
if ((oreg.eflags & X86_EFLAGS_CF) || oreg.bx != 0x504d) {
- /* Failure with 32-bit connect, try to disconect and ignore */
+ /* Failure with 32-bit connect, try to disconnect and ignore */
ireg.al = 0x04;
intcall(0x15, &ireg, NULL);
return -1;
diff --git a/arch/x86/boot/bioscall.S b/arch/x86/boot/bioscall.S
index 1dfbf64e52a2..cf4a6155714e 100644
--- a/arch/x86/boot/bioscall.S
+++ b/arch/x86/boot/bioscall.S
@@ -1,10 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/* -----------------------------------------------------------------------
*
- * Copyright 2009 Intel Corporation; author H. Peter Anvin
- *
- * This file is part of the Linux kernel, and is made available under
- * the terms of the GNU General Public License version 2 or (at your
- * option) any later version; incorporated herein by reference.
+ * Copyright 2009-2014 Intel Corporation; author H. Peter Anvin
*
* ----------------------------------------------------------------------- */
@@ -13,8 +10,8 @@
* touching registers they shouldn't be.
*/
- .code16gcc
- .text
+ .code16
+ .section ".inittext","ax"
.globl intcall
.type intcall, @function
intcall:
@@ -35,7 +32,7 @@ intcall:
movw %dx, %si
movw %sp, %di
movw $11, %cx
- rep; movsd
+ rep movsl
/* Pop full state from the stack */
popal
@@ -70,7 +67,7 @@ intcall:
jz 4f
movw %sp, %si
movw $11, %cx
- rep; movsd
+ rep movsl
4: addw $44, %sp
/* Restore state and return */
diff --git a/arch/x86/boot/bitops.h b/arch/x86/boot/bitops.h
index 878e4b9940d9..79e15971529d 100644
--- a/arch/x86/boot/bitops.h
+++ b/arch/x86/boot/bitops.h
@@ -1,11 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/* -*- linux-c -*- ------------------------------------------------------- *
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007 rPath, Inc. - All Rights Reserved
*
- * This file is part of the Linux kernel, and is made available under
- * the terms of the GNU General Public License version 2.
- *
* ----------------------------------------------------------------------- */
/*
@@ -16,17 +14,20 @@
#define BOOT_BITOPS_H
#define _LINUX_BITOPS_H /* Inhibit inclusion of <linux/bitops.h> */
-static inline int constant_test_bit(int nr, const void *addr)
+#include <linux/types.h>
+#include <asm/asm.h>
+
+static inline bool constant_test_bit(int nr, const void *addr)
{
- const u32 *p = (const u32 *)addr;
+ const u32 *p = addr;
return ((1UL << (nr & 31)) & (p[nr >> 5])) != 0;
}
-static inline int variable_test_bit(int nr, const void *addr)
+static inline bool variable_test_bit(int nr, const void *addr)
{
- u8 v;
- const u32 *p = (const u32 *)addr;
+ bool v;
+ const u32 *p = addr;
- asm("btl %2,%1; setc %0" : "=qm" (v) : "m" (*p), "Ir" (nr));
+ asm("btl %2,%1" : "=@ccc" (v) : "m" (*p), "Ir" (nr));
return v;
}
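
The new "=@ccc" constraint is a GCC 6+ flag output operand: the compiler reads
the carry flag left behind by btl directly into a bool, instead of needing the
explicit setc that the old "=qm" version emitted. A standalone sketch of the
same pattern (x86-only; the helper name is hypothetical):

    #include <stdbool.h>

    /* Test bit 'nr' of *addr with BT; "=@ccc" means "the output is the
     * condition code c", i.e. the carry flag that BT sets. */
    static inline bool test_bit_cf(int nr, const unsigned int *addr)
    {
        bool carry;

        asm("btl %2, %1" : "=@ccc" (carry) : "m" (*addr), "Ir" (nr));
        return carry;
    }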
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index 98239d2658f2..8e3eab34dff4 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -1,12 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/* -*- linux-c -*- ------------------------------------------------------- *
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007 rPath, Inc. - All Rights Reserved
* Copyright 2009 Intel Corporation; author H. Peter Anvin
*
- * This file is part of the Linux kernel, and is made available under
- * the terms of the GNU General Public License version 2.
- *
* ----------------------------------------------------------------------- */
/*
@@ -16,65 +14,32 @@
#ifndef BOOT_BOOT_H
#define BOOT_BOOT_H
-#define STACK_SIZE 512 /* Minimum number of bytes for stack */
+#define STACK_SIZE 1024 /* Minimum number of bytes for stack */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
-#include <stdarg.h>
+#include <linux/stdarg.h>
#include <linux/types.h>
#include <linux/edd.h>
-#include <asm/boot.h>
#include <asm/setup.h>
+#include <asm/asm.h>
#include "bitops.h"
-#include <asm/cpufeature.h>
-#include <asm/processor-flags.h>
+#include "ctype.h"
+#include "cpuflags.h"
+#include "io.h"
/* Useful macros */
-#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
-
#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
extern struct setup_header hdr;
extern struct boot_params boot_params;
-/* Basic port I/O */
-static inline void outb(u8 v, u16 port)
-{
- asm volatile("outb %0,%1" : : "a" (v), "dN" (port));
-}
-static inline u8 inb(u16 port)
-{
- u8 v;
- asm volatile("inb %1,%0" : "=a" (v) : "dN" (port));
- return v;
-}
-
-static inline void outw(u16 v, u16 port)
-{
- asm volatile("outw %0,%1" : : "a" (v), "dN" (port));
-}
-static inline u16 inw(u16 port)
-{
- u16 v;
- asm volatile("inw %1,%0" : "=a" (v) : "dN" (port));
- return v;
-}
-
-static inline void outl(u32 v, u16 port)
-{
- asm volatile("outl %0,%1" : : "a" (v), "dN" (port));
-}
-static inline u32 inl(u32 port)
-{
- u32 v;
- asm volatile("inl %1,%0" : "=a" (v) : "dN" (port));
- return v;
-}
+#define cpu_relax() asm volatile("pause")
static inline void io_delay(void)
{
const u16 DELAY_PORT = 0x80;
- asm volatile("outb %%al,%0" : : "dN" (DELAY_PORT));
+ outb(0, DELAY_PORT);
}
/* These functions are used to reference data in other segments. */
@@ -112,97 +77,96 @@ typedef unsigned int addr_t;
static inline u8 rdfs8(addr_t addr)
{
+ u8 *ptr = (u8 *)absolute_pointer(addr);
u8 v;
- asm volatile("movb %%fs:%1,%0" : "=q" (v) : "m" (*(u8 *)addr));
+ asm volatile("movb %%fs:%1,%0" : "=q" (v) : "m" (*ptr));
return v;
}
static inline u16 rdfs16(addr_t addr)
{
+ u16 *ptr = (u16 *)absolute_pointer(addr);
u16 v;
- asm volatile("movw %%fs:%1,%0" : "=r" (v) : "m" (*(u16 *)addr));
+ asm volatile("movw %%fs:%1,%0" : "=r" (v) : "m" (*ptr));
return v;
}
static inline u32 rdfs32(addr_t addr)
{
+ u32 *ptr = (u32 *)absolute_pointer(addr);
u32 v;
- asm volatile("movl %%fs:%1,%0" : "=r" (v) : "m" (*(u32 *)addr));
+ asm volatile("movl %%fs:%1,%0" : "=r" (v) : "m" (*ptr));
return v;
}
static inline void wrfs8(u8 v, addr_t addr)
{
- asm volatile("movb %1,%%fs:%0" : "+m" (*(u8 *)addr) : "qi" (v));
+ u8 *ptr = (u8 *)absolute_pointer(addr);
+ asm volatile("movb %1,%%fs:%0" : "+m" (*ptr) : "qi" (v));
}
static inline void wrfs16(u16 v, addr_t addr)
{
- asm volatile("movw %1,%%fs:%0" : "+m" (*(u16 *)addr) : "ri" (v));
+ u16 *ptr = (u16 *)absolute_pointer(addr);
+ asm volatile("movw %1,%%fs:%0" : "+m" (*ptr) : "ri" (v));
}
static inline void wrfs32(u32 v, addr_t addr)
{
- asm volatile("movl %1,%%fs:%0" : "+m" (*(u32 *)addr) : "ri" (v));
+ u32 *ptr = (u32 *)absolute_pointer(addr);
+ asm volatile("movl %1,%%fs:%0" : "+m" (*ptr) : "ri" (v));
}
static inline u8 rdgs8(addr_t addr)
{
+ u8 *ptr = (u8 *)absolute_pointer(addr);
u8 v;
- asm volatile("movb %%gs:%1,%0" : "=q" (v) : "m" (*(u8 *)addr));
+ asm volatile("movb %%gs:%1,%0" : "=q" (v) : "m" (*ptr));
return v;
}
static inline u16 rdgs16(addr_t addr)
{
+ u16 *ptr = (u16 *)absolute_pointer(addr);
u16 v;
- asm volatile("movw %%gs:%1,%0" : "=r" (v) : "m" (*(u16 *)addr));
+ asm volatile("movw %%gs:%1,%0" : "=r" (v) : "m" (*ptr));
return v;
}
static inline u32 rdgs32(addr_t addr)
{
+ u32 *ptr = (u32 *)absolute_pointer(addr);
u32 v;
- asm volatile("movl %%gs:%1,%0" : "=r" (v) : "m" (*(u32 *)addr));
+ asm volatile("movl %%gs:%1,%0" : "=r" (v) : "m" (*ptr));
return v;
}
static inline void wrgs8(u8 v, addr_t addr)
{
- asm volatile("movb %1,%%gs:%0" : "+m" (*(u8 *)addr) : "qi" (v));
+ u8 *ptr = (u8 *)absolute_pointer(addr);
+ asm volatile("movb %1,%%gs:%0" : "+m" (*ptr) : "qi" (v));
}
static inline void wrgs16(u16 v, addr_t addr)
{
- asm volatile("movw %1,%%gs:%0" : "+m" (*(u16 *)addr) : "ri" (v));
+ u16 *ptr = (u16 *)absolute_pointer(addr);
+ asm volatile("movw %1,%%gs:%0" : "+m" (*ptr) : "ri" (v));
}
static inline void wrgs32(u32 v, addr_t addr)
{
- asm volatile("movl %1,%%gs:%0" : "+m" (*(u32 *)addr) : "ri" (v));
+ u32 *ptr = (u32 *)absolute_pointer(addr);
+ asm volatile("movl %1,%%gs:%0" : "+m" (*ptr) : "ri" (v));
}
/* Note: these only return true/false, not a signed return value! */
-static inline int memcmp(const void *s1, const void *s2, size_t len)
-{
- u8 diff;
- asm("repe; cmpsb; setnz %0"
- : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len));
- return diff;
-}
-
-static inline int memcmp_fs(const void *s1, addr_t s2, size_t len)
+static inline bool memcmp_fs(const void *s1, addr_t s2, size_t len)
{
- u8 diff;
- asm volatile("fs; repe; cmpsb; setnz %0"
- : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len));
+ bool diff;
+ asm volatile("fs repe cmpsb"
+ : "=@ccnz" (diff), "+D" (s1), "+S" (s2), "+c" (len));
return diff;
}
-static inline int memcmp_gs(const void *s1, addr_t s2, size_t len)
+static inline bool memcmp_gs(const void *s1, addr_t s2, size_t len)
{
- u8 diff;
- asm volatile("gs; repe; cmpsb; setnz %0"
- : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len));
+ bool diff;
+ asm volatile("gs repe cmpsb"
+ : "=@ccnz" (diff), "+D" (s1), "+S" (s2), "+c" (len));
return diff;
}
-static inline int isdigit(int ch)
-{
- return (ch >= '0') && (ch <= '9');
-}
-
/* Heap -- available for dynamic lists. */
extern char _end[];
extern char *HEAP;
@@ -229,13 +193,6 @@ static inline bool heap_free(size_t n)
void copy_to_fs(addr_t dst, void *src, size_t len);
void *copy_from_fs(void *dst, addr_t src, size_t len);
-void copy_to_gs(addr_t dst, void *src, size_t len);
-void *copy_from_gs(void *dst, addr_t src, size_t len);
-void *memcpy(void *dst, void *src, size_t len);
-void *memset(void *dst, int c, size_t len);
-
-#define memcpy(d,s,l) __builtin_memcpy(d,s,l)
-#define memset(d,c,l) __builtin_memset(d,c,l)
/* a20.c */
int enable_a20(void);
@@ -287,30 +244,45 @@ struct biosregs {
void intcall(u8 int_no, const struct biosregs *ireg, struct biosregs *oreg);
/* cmdline.c */
-int cmdline_find_option(const char *option, char *buffer, int bufsize);
-int cmdline_find_option_bool(const char *option);
+int __cmdline_find_option(unsigned long cmdline_ptr, const char *option, char *buffer, int bufsize);
+int __cmdline_find_option_bool(unsigned long cmdline_ptr, const char *option);
+static inline int cmdline_find_option(const char *option, char *buffer, int bufsize)
+{
+ unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr;
+
+ if (cmd_line_ptr >= 0x100000)
+ return -1; /* inaccessible */
+
+ return __cmdline_find_option(cmd_line_ptr, option, buffer, bufsize);
+}
+
+static inline int cmdline_find_option_bool(const char *option)
+{
+ unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr;
+
+ if (cmd_line_ptr >= 0x100000)
+ return -1; /* inaccessible */
+
+ return __cmdline_find_option_bool(cmd_line_ptr, option);
+}
/* cpu.c, cpucheck.c */
-struct cpu_features {
- int level; /* Family, or 64 for x86-64 */
- int model;
- u32 flags[NCAPINTS];
-};
-extern struct cpu_features cpu;
int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr);
+int check_knl_erratum(void);
int validate_cpu(void);
+/* early_serial_console.c */
+extern int early_serial_base;
+void console_init(void);
+
/* edd.c */
void query_edd(void);
/* header.S */
void __attribute__((noreturn)) die(void);
-/* mca.c */
-int query_mca(void);
-
/* memory.c */
-int detect_memory(void);
+void detect_memory(void);
/* pm.c */
void __attribute__((noreturn)) go_to_protected_mode(void);
@@ -329,8 +301,11 @@ void initregs(struct biosregs *regs);
/* string.c */
int strcmp(const char *str1, const char *str2);
+int strncmp(const char *cs, const char *ct, size_t count);
size_t strnlen(const char *s, size_t maxlen);
-unsigned int atou(const char *s);
+unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base);
+size_t strlen(const char *s);
+char *strchr(const char *s, int c);
/* tty.c */
void puts(const char *);
@@ -350,6 +325,6 @@ void probe_cards(int unsafe);
/* video-vesa.c */
void vesa_store_edid(void);
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* BOOT_BOOT_H */
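
The 0x100000 check in the new cmdline wrappers follows from real-mode
segmentation: the parser loads a 16-bit segment with set_fs(cmd_line_ptr >> 4),
so the pointer must be expressible in segment:offset form. A worked
restatement of the arithmetic (illustrative helper, not part of the patch):

    /* Real-mode linear address: (segment << 4) + offset. With
     * segment = ptr >> 4 and offset = ptr & 0xf, a 16-bit segment caps
     * ptr below 1 MiB (0x100000) -- anything higher cannot be reached
     * from this code, hence the -1 "inaccessible" return. */
    static unsigned long seg_off_to_linear(unsigned short seg, unsigned short off)
    {
        return ((unsigned long)seg << 4) + off;
    }

    /* Example: ptr = 0x9f00c -> segment 0x9f00, offset 0xc,
     * seg_off_to_linear(0x9f00, 0xc) == 0x9f00c. */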
diff --git a/arch/x86/boot/cmdline.c b/arch/x86/boot/cmdline.c
index a1d35634bce0..21d56ae83cdf 100644
--- a/arch/x86/boot/cmdline.c
+++ b/arch/x86/boot/cmdline.c
@@ -1,11 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* -*- linux-c -*- ------------------------------------------------------- *
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007 rPath, Inc. - All Rights Reserved
*
- * This file is part of the Linux kernel, and is made available under
- * the terms of the GNU General Public License version 2.
- *
* ----------------------------------------------------------------------- */
/*
@@ -27,9 +25,8 @@ static inline int myisspace(u8 c)
 * Returns the length of the argument (regardless of whether it was
 * truncated to fit in the buffer), or -1 if not found.
*/
-int cmdline_find_option(const char *option, char *buffer, int bufsize)
+int __cmdline_find_option(unsigned long cmdline_ptr, const char *option, char *buffer, int bufsize)
{
- u32 cmdline_ptr = boot_params.hdr.cmd_line_ptr;
addr_t cptr;
char c;
int len = -1;
@@ -42,8 +39,8 @@ int cmdline_find_option(const char *option, char *buffer, int bufsize)
st_bufcpy /* Copying this to buffer */
} state = st_wordstart;
- if (!cmdline_ptr || cmdline_ptr >= 0x100000)
- return -1; /* No command line, or inaccessible */
+ if (!cmdline_ptr)
+ return -1; /* No command line */
cptr = cmdline_ptr & 0xf;
set_fs(cmdline_ptr >> 4);
@@ -57,7 +54,7 @@ int cmdline_find_option(const char *option, char *buffer, int bufsize)
/* else */
state = st_wordcmp;
opptr = option;
- /* fall through */
+ fallthrough;
case st_wordcmp:
if (c == '=' && !*opptr) {
@@ -100,9 +97,8 @@ int cmdline_find_option(const char *option, char *buffer, int bufsize)
 * Returns the position of that option (counting from 1),
 * or 0 if not found.
*/
-int cmdline_find_option_bool(const char *option)
+int __cmdline_find_option_bool(unsigned long cmdline_ptr, const char *option)
{
- u32 cmdline_ptr = boot_params.hdr.cmd_line_ptr;
addr_t cptr;
char c;
int pos = 0, wstart = 0;
@@ -113,8 +109,8 @@ int cmdline_find_option_bool(const char *option)
st_wordskip, /* Miscompare, skip */
} state = st_wordstart;
- if (!cmdline_ptr || cmdline_ptr >= 0x100000)
- return -1; /* No command line, or inaccessible */
+ if (!cmdline_ptr)
+ return -1; /* No command line */
cptr = cmdline_ptr & 0xf;
set_fs(cmdline_ptr >> 4);
@@ -133,7 +129,7 @@ int cmdline_find_option_bool(const char *option)
state = st_wordcmp;
opptr = option;
wstart = pos;
- /* fall through */
+ fallthrough;
case st_wordcmp:
if (!*opptr)
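
As a usage sketch of the two parsers (the command line shown is hypothetical):

    static void cmdline_example(void)
    {
        /* Suppose the command line is "quiet console=ttyS0,115200 debug". */
        char val[32];
        int len;

        len = cmdline_find_option("console", val, sizeof(val));
        /* len == 12, val == "ttyS0,115200" */

        if (cmdline_find_option_bool("debug") > 0) {
            /* found; the return value is its word position, here 3 */
        }
    }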
diff --git a/arch/x86/boot/code16gcc.h b/arch/x86/boot/code16gcc.h
deleted file mode 100644
index d93e48010b61..000000000000
--- a/arch/x86/boot/code16gcc.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/*
- * code16gcc.h
- *
- * This file is -include'd when compiling 16-bit C code.
- * Note: this asm() needs to be emitted before gcc emits any code.
- * Depending on gcc version, this requires -fno-unit-at-a-time or
- * -fno-toplevel-reorder.
- *
- * Hopefully gcc will eventually have a real -m16 option so we can
- * drop this hack long term.
- */
-
-#ifndef __ASSEMBLY__
-asm(".code16gcc");
-#endif
diff --git a/arch/x86/boot/compressed/.gitignore b/arch/x86/boot/compressed/.gitignore
index 4a46fab7162e..25805199a506 100644
--- a/arch/x86/boot/compressed/.gitignore
+++ b/arch/x86/boot/compressed/.gitignore
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
relocs
vmlinux.bin.all
vmlinux.relocs
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index e2ff504b4ddc..68f9d7a1683b 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -1,42 +1,134 @@
+# SPDX-License-Identifier: GPL-2.0
#
# linux/arch/x86/boot/compressed/Makefile
#
# create a compressed vmlinux image from the original vmlinux
#
+# vmlinuz is:
+# decompression code (*.o)
+# asm globals (piggy.S), including:
+# vmlinux.bin.(gz|bz2|lzma|...)
+#
+# vmlinux.bin is:
+# vmlinux stripped of debugging and comments
+# vmlinux.bin.all is:
+# vmlinux.bin + vmlinux.relocs
+# vmlinux.bin.(gz|bz2|lzma|...) is:
+# (see scripts/Makefile.lib size_append)
+# compressed vmlinux.bin.all + u32 size of vmlinux.bin.all
-targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma head_$(BITS).o misc.o piggy.o
+targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \
+ vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4 vmlinux.bin.zst
-KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2
-KBUILD_CFLAGS += -fno-strict-aliasing -fPIC
+# CLANG_FLAGS must come before any cc-disable-warning or cc-option calls in
+# case of cross compiling, as it has the '--target=' flag, which is needed to
+# avoid errors with '-march=i386', and future flags may depend on the target to
+# be valid.
+KBUILD_CFLAGS := -m$(BITS) -O2 $(CLANG_FLAGS)
+KBUILD_CFLAGS += -std=gnu11 -fms-extensions
+KBUILD_CFLAGS += -fno-strict-aliasing -fPIE
+KBUILD_CFLAGS += -Wundef
KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
-cflags-$(CONFIG_X86_64) := -mcmodel=small
+cflags-$(CONFIG_X86_32) := -march=i386
+cflags-$(CONFIG_X86_64) := -mcmodel=small -mno-red-zone
KBUILD_CFLAGS += $(cflags-y)
-KBUILD_CFLAGS += $(call cc-option,-ffreestanding)
-KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector)
+KBUILD_CFLAGS += -mno-mmx -mno-sse
+KBUILD_CFLAGS += -ffreestanding -fshort-wchar
+KBUILD_CFLAGS += -fno-stack-protector
+KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member)
+ifdef CONFIG_CC_IS_CLANG
+KBUILD_CFLAGS += -Wno-gnu
+KBUILD_CFLAGS += -Wno-microsoft-anon-tag
+endif
+KBUILD_CFLAGS += -Wno-pointer-sign
+KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
+KBUILD_CFLAGS += -D__DISABLE_EXPORTS
+# Disable relocation relaxation in case the link is not PIE.
+KBUILD_CFLAGS += $(call cc-option,-Wa$(comma)-mrelax-relocations=no)
+KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h
+
+# sev-decode-insn.c indirectly includes inat-table.c which is generated during
+# compilation and stored in $(objtree). Add the directory to the includes so
+# that the compiler finds it even with out-of-tree builds (make O=/some/path).
+CFLAGS_sev-handle-vc.o += -I$(objtree)/arch/x86/lib/
KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
-GCOV_PROFILE := n
-LDFLAGS := -m elf_$(UTS_MACHINE)
-LDFLAGS_vmlinux := -T
+KBUILD_LDFLAGS := -m elf_$(UTS_MACHINE)
+KBUILD_LDFLAGS += $(call ld-option,--no-ld-generated-unwind-info)
+# Compressed kernel should be built as PIE since it may be loaded at any
+# address by the bootloader.
+LDFLAGS_vmlinux := -pie $(call ld-option, --no-dynamic-linker)
+ifdef CONFIG_LD_ORPHAN_WARN
+LDFLAGS_vmlinux += --orphan-handling=$(CONFIG_LD_ORPHAN_WARN_LEVEL)
+endif
+LDFLAGS_vmlinux += -z noexecstack
+ifeq ($(CONFIG_LD_IS_BFD),y)
+LDFLAGS_vmlinux += $(call ld-option,--no-warn-rwx-segments)
+endif
+ifeq ($(CONFIG_EFI_STUB),y)
+# ensure that the static EFI stub library will be pulled in, even if it is
+# never referenced explicitly from the startup code
+LDFLAGS_vmlinux += -u efi_pe_entry
+endif
+LDFLAGS_vmlinux += -T
+
+hostprogs := mkpiggy
+HOST_EXTRACFLAGS += -I$(srctree)/tools/include
+
+sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABbCDGRSTtVW] \(_text\|__start_rodata\|_sinittext\|__inittext_end\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p'
+
+quiet_cmd_voffset = VOFFSET $@
+ cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@
+
+targets += ../voffset.h
+
+$(obj)/../voffset.h: vmlinux FORCE
+ $(call if_changed,voffset)
-hostprogs-y := mkpiggy
+$(obj)/misc.o: $(obj)/../voffset.h
-$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE
+vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/kernel_info.o $(obj)/head_$(BITS).o \
+ $(obj)/misc.o $(obj)/string.o $(obj)/cmdline.o $(obj)/error.o \
+ $(obj)/piggy.o $(obj)/cpuflags.o
+
+vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o
+vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o
+ifdef CONFIG_X86_64
+ vmlinux-objs-y += $(obj)/ident_map_64.o
+ vmlinux-objs-y += $(obj)/idt_64.o $(obj)/idt_handlers_64.o
+ vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/mem_encrypt.o
+ vmlinux-objs-y += $(obj)/pgtable_64.o
+ vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/sev.o $(obj)/sev-handle-vc.o
+endif
+
+vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o
+vmlinux-objs-$(CONFIG_INTEL_TDX_GUEST) += $(obj)/tdx.o $(obj)/tdcall.o $(obj)/tdx-shared.o
+vmlinux-objs-$(CONFIG_UNACCEPTED_MEMORY) += $(obj)/mem.o
+
+vmlinux-objs-$(CONFIG_EFI) += $(obj)/efi.o
+vmlinux-libs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a
+vmlinux-libs-$(CONFIG_X86_64) += $(objtree)/arch/x86/boot/startup/lib.a
+vmlinux-objs-$(CONFIG_EFI_SBAT) += $(obj)/sbat.o
+
+ifdef CONFIG_EFI_SBAT
+$(obj)/sbat.o: $(CONFIG_EFI_SBAT_FILE)
+endif
+
+$(obj)/vmlinux: $(vmlinux-objs-y) $(vmlinux-libs-y) FORCE
$(call if_changed,ld)
- @:
OBJCOPYFLAGS_vmlinux.bin := -R .comment -S
$(obj)/vmlinux.bin: vmlinux FORCE
$(call if_changed,objcopy)
+targets += $(patsubst $(obj)/%,%,$(vmlinux-objs-y)) vmlinux.bin.all vmlinux.relocs
-targets += vmlinux.bin.all vmlinux.relocs relocs
-hostprogs-$(CONFIG_X86_NEED_RELOCS) += relocs
-
+CMD_RELOCS = arch/x86/tools/relocs
quiet_cmd_relocs = RELOCS $@
- cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $<
-$(obj)/vmlinux.relocs: vmlinux $(obj)/relocs FORCE
+ cmd_relocs = $(CMD_RELOCS) $< > $@;$(CMD_RELOCS) --abs-relocs $<
+
+$(obj)/vmlinux.relocs: vmlinux.unstripped FORCE
$(call if_changed,relocs)
vmlinux.bin.all-y := $(obj)/vmlinux.bin
@@ -45,16 +137,28 @@ vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs
$(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE
$(call if_changed,gzip)
$(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE
- $(call if_changed,bzip2)
+ $(call if_changed,bzip2_with_size)
$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE
- $(call if_changed,lzma)
+ $(call if_changed,lzma_with_size)
+$(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y) FORCE
+ $(call if_changed,xzkern_with_size)
+$(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE
+ $(call if_changed,lzo_with_size)
+$(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) FORCE
+ $(call if_changed,lz4_with_size)
+$(obj)/vmlinux.bin.zst: $(vmlinux.bin.all-y) FORCE
+ $(call if_changed,zstd22_with_size)
suffix-$(CONFIG_KERNEL_GZIP) := gz
suffix-$(CONFIG_KERNEL_BZIP2) := bz2
suffix-$(CONFIG_KERNEL_LZMA) := lzma
+suffix-$(CONFIG_KERNEL_XZ) := xz
+suffix-$(CONFIG_KERNEL_LZO) := lzo
+suffix-$(CONFIG_KERNEL_LZ4) := lz4
+suffix-$(CONFIG_KERNEL_ZSTD) := zst
quiet_cmd_mkpiggy = MKPIGGY $@
- cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false )
+ cmd_mkpiggy = $(obj)/mkpiggy $< > $@
targets += piggy.S
$(obj)/piggy.S: $(obj)/vmlinux.bin.$(suffix-y) $(obj)/mkpiggy FORCE
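
mkpiggy itself is a small host program: it reads the 32-bit little-endian
uncompressed size that size_append glued onto the payload, then prints
piggy.S, which mostly just .incbin's the compressed image. A rough sketch of
that core (error handling elided; symbol names follow the real tool):

    #include <stdio.h>
    #include <stdint.h>

    /* Emit an assembly stub that embeds the compressed kernel image and
     * exports its input/output sizes to the decompressor. */
    static void emit_piggy(const char *file, long input_len, uint32_t output_len)
    {
        printf(".section \".rodata..compressed\",\"a\",@progbits\n");
        printf(".globl z_input_len\nz_input_len = %ld\n", input_len);
        printf(".globl z_output_len\nz_output_len = %u\n", output_len);
        printf(".globl input_data, input_data_end\n");
        printf("input_data:\n.incbin \"%s\"\ninput_data_end:\n", file);
    }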
diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c
new file mode 100644
index 000000000000..f196b1d1ddf8
--- /dev/null
+++ b/arch/x86/boot/compressed/acpi.c
@@ -0,0 +1,317 @@
+// SPDX-License-Identifier: GPL-2.0
+#define BOOT_CTYPE_H
+#include "misc.h"
+#include "error.h"
+#include "../string.h"
+#include "efi.h"
+
+#include <asm/bootparam.h>
+
+#include <linux/numa.h>
+
+/*
+ * Longest parameter of 'acpi=' is 'copy_dsdt', plus an extra '\0'
+ * for termination.
+ */
+#define MAX_ACPI_ARG_LENGTH 10
+
+/*
+ * Immovable memory regions representation. The maximum number of regions is
+ * MAX_NUMNODES*2.
+ */
+struct mem_vector immovable_mem[MAX_NUMNODES*2];
+
+static acpi_physical_address
+__efi_get_rsdp_addr(unsigned long cfg_tbl_pa, unsigned int cfg_tbl_len)
+{
+#ifdef CONFIG_EFI
+ unsigned long rsdp_addr;
+ int ret;
+
+ /*
+	 * Search EFI system tables for the RSDP. ACPI_20_TABLE_GUID is
+	 * preferred over ACPI_TABLE_GUID because it has more features.
+ */
+ rsdp_addr = efi_find_vendor_table(boot_params_ptr, cfg_tbl_pa, cfg_tbl_len,
+ ACPI_20_TABLE_GUID);
+ if (rsdp_addr)
+ return (acpi_physical_address)rsdp_addr;
+
+	/* No ACPI_20_TABLE_GUID found, fall back to ACPI_TABLE_GUID. */
+ rsdp_addr = efi_find_vendor_table(boot_params_ptr, cfg_tbl_pa, cfg_tbl_len,
+ ACPI_TABLE_GUID);
+ if (rsdp_addr)
+ return (acpi_physical_address)rsdp_addr;
+
+ debug_putstr("Error getting RSDP address.\n");
+#endif
+ return 0;
+}
+
+static acpi_physical_address efi_get_rsdp_addr(void)
+{
+#ifdef CONFIG_EFI
+ unsigned long cfg_tbl_pa = 0;
+ unsigned int cfg_tbl_len;
+ unsigned long systab_pa;
+ unsigned int nr_tables;
+ enum efi_type et;
+ int ret;
+
+ et = efi_get_type(boot_params_ptr);
+ if (et == EFI_TYPE_NONE)
+ return 0;
+
+ systab_pa = efi_get_system_table(boot_params_ptr);
+ if (!systab_pa)
+ error("EFI support advertised, but unable to locate system table.");
+
+ ret = efi_get_conf_table(boot_params_ptr, &cfg_tbl_pa, &cfg_tbl_len);
+ if (ret || !cfg_tbl_pa)
+ error("EFI config table not found.");
+
+ return __efi_get_rsdp_addr(cfg_tbl_pa, cfg_tbl_len);
+#else
+ return 0;
+#endif
+}
+
+static u8 compute_checksum(u8 *buffer, u32 length)
+{
+ u8 *end = buffer + length;
+ u8 sum = 0;
+
+ while (buffer < end)
+ sum += *(buffer++);
+
+ return sum;
+}
+
+/* Search a block of memory for the RSDP signature. */
+static u8 *scan_mem_for_rsdp(u8 *start, u32 length)
+{
+ struct acpi_table_rsdp *rsdp;
+ u8 *address, *end;
+
+ end = start + length;
+
+ /* Search from given start address for the requested length */
+ for (address = start; address < end; address += ACPI_RSDP_SCAN_STEP) {
+ /*
+ * Both RSDP signature and checksum must be correct.
+ * Note: Sometimes there exists more than one RSDP in memory;
+ * the valid RSDP has a valid checksum, all others have an
+ * invalid checksum.
+ */
+ rsdp = (struct acpi_table_rsdp *)address;
+
+ /* BAD Signature */
+ if (!ACPI_VALIDATE_RSDP_SIG(rsdp->signature))
+ continue;
+
+ /* Check the standard checksum */
+ if (compute_checksum((u8 *)rsdp, ACPI_RSDP_CHECKSUM_LENGTH))
+ continue;
+
+ /* Check extended checksum if table version >= 2 */
+ if ((rsdp->revision >= 2) &&
+ (compute_checksum((u8 *)rsdp, ACPI_RSDP_XCHECKSUM_LENGTH)))
+ continue;
+
+ /* Signature and checksum valid, we have found a real RSDP */
+ return address;
+ }
+ return NULL;
+}
+
+/* Search RSDP address in EBDA. */
+static acpi_physical_address bios_get_rsdp_addr(void)
+{
+ unsigned long address;
+ u8 *rsdp;
+
+ /* Get the location of the Extended BIOS Data Area (EBDA) */
+ address = *(u16 *)ACPI_EBDA_PTR_LOCATION;
+ address <<= 4;
+
+ /*
+ * Search EBDA paragraphs (EBDA is required to be a minimum of
+ * 1K length)
+ */
+ if (address > 0x400) {
+ rsdp = scan_mem_for_rsdp((u8 *)address, ACPI_EBDA_WINDOW_SIZE);
+ if (rsdp)
+ return (acpi_physical_address)(unsigned long)rsdp;
+ }
+
+ /* Search upper memory: 16-byte boundaries in E0000h-FFFFFh */
+ rsdp = scan_mem_for_rsdp((u8 *) ACPI_HI_RSDP_WINDOW_BASE,
+ ACPI_HI_RSDP_WINDOW_SIZE);
+ if (rsdp)
+ return (acpi_physical_address)(unsigned long)rsdp;
+
+ return 0;
+}
+
+/* Return RSDP address on success, otherwise 0. */
+acpi_physical_address get_rsdp_addr(void)
+{
+ acpi_physical_address pa;
+
+ pa = boot_params_ptr->acpi_rsdp_addr;
+
+ if (!pa)
+ pa = efi_get_rsdp_addr();
+
+ if (!pa)
+ pa = bios_get_rsdp_addr();
+
+ return pa;
+}
+
+#if defined(CONFIG_RANDOMIZE_BASE) && defined(CONFIG_MEMORY_HOTREMOVE)
+/*
+ * Max length of 64-bit hex address string is 19, prefix "0x" + 16 hex
+ * digits, and '\0' for termination.
+ */
+#define MAX_ADDR_LEN 19
+
+static unsigned long get_cmdline_acpi_rsdp(void)
+{
+ unsigned long addr = 0;
+
+#ifdef CONFIG_KEXEC_CORE
+ char val[MAX_ADDR_LEN] = { };
+ int ret;
+
+ ret = cmdline_find_option("acpi_rsdp", val, MAX_ADDR_LEN);
+ if (ret < 0)
+ return 0;
+
+ if (boot_kstrtoul(val, 16, &addr))
+ return 0;
+#endif
+ return addr;
+}
+
+/* Compute SRAT address from RSDP. */
+static unsigned long get_acpi_srat_table(void)
+{
+ unsigned long root_table, acpi_table;
+ struct acpi_table_header *header;
+ struct acpi_table_rsdp *rsdp;
+ u32 num_entries, size, len;
+ char arg[10];
+ u8 *entry;
+
+ /*
+ * Check whether we were given an RSDP on the command line. We don't
+ * stash this in boot params because the kernel itself may have
+ * different ideas about whether to trust a command-line parameter.
+ */
+ rsdp = (struct acpi_table_rsdp *)get_cmdline_acpi_rsdp();
+ if (!rsdp)
+ rsdp = (struct acpi_table_rsdp *)(long)
+ boot_params_ptr->acpi_rsdp_addr;
+
+ if (!rsdp)
+ return 0;
+
+	/* Get ACPI root table from RSDP. */
+ if (!(cmdline_find_option("acpi", arg, sizeof(arg)) == 4 &&
+ !strncmp(arg, "rsdt", 4)) &&
+ rsdp->xsdt_physical_address &&
+ rsdp->revision > 1) {
+ root_table = rsdp->xsdt_physical_address;
+ size = ACPI_XSDT_ENTRY_SIZE;
+ } else {
+ root_table = rsdp->rsdt_physical_address;
+ size = ACPI_RSDT_ENTRY_SIZE;
+ }
+
+ if (!root_table)
+ return 0;
+
+ header = (struct acpi_table_header *)root_table;
+ len = header->length;
+ if (len < sizeof(struct acpi_table_header) + size)
+ return 0;
+
+ num_entries = (len - sizeof(struct acpi_table_header)) / size;
+ entry = (u8 *)(root_table + sizeof(struct acpi_table_header));
+
+ while (num_entries--) {
+ if (size == ACPI_RSDT_ENTRY_SIZE)
+ acpi_table = *(u32 *)entry;
+ else
+ acpi_table = *(u64 *)entry;
+
+ if (acpi_table) {
+ header = (struct acpi_table_header *)acpi_table;
+
+ if (ACPI_COMPARE_NAMESEG(header->signature, ACPI_SIG_SRAT))
+ return acpi_table;
+ }
+ entry += size;
+ }
+ return 0;
+}
+
+/**
+ * count_immovable_mem_regions - Parse SRAT and cache the immovable
+ * memory regions into the immovable_mem array.
+ *
+ * Return the number of immovable memory regions on success, 0 on failure:
+ *
+ * - Too many immovable memory regions
+ * - ACPI off or no SRAT found
+ * - No immovable memory region found.
+ */
+int count_immovable_mem_regions(void)
+{
+ unsigned long table_addr, table_end, table;
+ struct acpi_subtable_header *sub_table;
+ struct acpi_table_header *table_header;
+ char arg[MAX_ACPI_ARG_LENGTH];
+ int num = 0;
+
+ if (cmdline_find_option("acpi", arg, sizeof(arg)) == 3 &&
+ !strncmp(arg, "off", 3))
+ return 0;
+
+ table_addr = get_acpi_srat_table();
+ if (!table_addr)
+ return 0;
+
+ table_header = (struct acpi_table_header *)table_addr;
+ table_end = table_addr + table_header->length;
+ table = table_addr + sizeof(struct acpi_table_srat);
+
+ while (table + sizeof(struct acpi_subtable_header) < table_end) {
+
+ sub_table = (struct acpi_subtable_header *)table;
+ if (!sub_table->length) {
+ debug_putstr("Invalid zero length SRAT subtable.\n");
+ return 0;
+ }
+
+ if (sub_table->type == ACPI_SRAT_TYPE_MEMORY_AFFINITY) {
+ struct acpi_srat_mem_affinity *ma;
+
+ ma = (struct acpi_srat_mem_affinity *)sub_table;
+ if (!(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && ma->length) {
+ immovable_mem[num].start = ma->base_address;
+ immovable_mem[num].size = ma->length;
+ num++;
+ }
+
+ if (num >= MAX_NUMNODES*2) {
+ debug_putstr("Too many immovable memory regions, aborting.\n");
+ return 0;
+ }
+ }
+ table += sub_table->length;
+ }
+ return num;
+}
+#endif /* CONFIG_RANDOMIZE_BASE && CONFIG_MEMORY_HOTREMOVE */
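
The scan above leans on the ACPI rule that a valid RSDP checksums to zero
byte-wise: over the first 20 bytes (ACPI_RSDP_CHECKSUM_LENGTH) for every
revision, and over the first 36 (ACPI_RSDP_XCHECKSUM_LENGTH) when
revision >= 2. Restated as a helper that could sit next to compute_checksum()
(illustrative, not part of the patch):

    /* A candidate is a real RSDP iff the byte-wise sums are 0 mod 256;
     * compute_checksum() above returns exactly that sum. */
    static bool rsdp_checksum_ok(struct acpi_table_rsdp *rsdp)
    {
        if (compute_checksum((u8 *)rsdp, ACPI_RSDP_CHECKSUM_LENGTH))
            return false;
        if (rsdp->revision >= 2 &&
            compute_checksum((u8 *)rsdp, ACPI_RSDP_XCHECKSUM_LENGTH))
            return false;
        return true;
    }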
diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c
new file mode 100644
index 000000000000..e162d7f59cc5
--- /dev/null
+++ b/arch/x86/boot/compressed/cmdline.c
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "misc.h"
+
+#include <asm/bootparam.h>
+
+static unsigned long fs;
+static inline void set_fs(unsigned long seg)
+{
+ fs = seg << 4; /* shift it back */
+}
+typedef unsigned long addr_t;
+static inline char rdfs8(addr_t addr)
+{
+ return *((char *)(fs + addr));
+}
+#include "../cmdline.c"
+unsigned long get_cmd_line_ptr(void)
+{
+ unsigned long cmd_line_ptr = boot_params_ptr->hdr.cmd_line_ptr;
+
+ cmd_line_ptr |= (u64)boot_params_ptr->ext_cmd_line_ptr << 32;
+
+ return cmd_line_ptr;
+}
+int cmdline_find_option(const char *option, char *buffer, int bufsize)
+{
+ return __cmdline_find_option(get_cmd_line_ptr(), option, buffer, bufsize);
+}
+int cmdline_find_option_bool(const char *option)
+{
+ return __cmdline_find_option_bool(get_cmd_line_ptr(), option);
+}
diff --git a/arch/x86/boot/compressed/cpuflags.c b/arch/x86/boot/compressed/cpuflags.c
new file mode 100644
index 000000000000..0cc1323896d1
--- /dev/null
+++ b/arch/x86/boot/compressed/cpuflags.c
@@ -0,0 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "../cpuflags.c"
+
+bool has_cpuflag(int flag)
+{
+ get_cpuflags();
+
+ return test_bit(flag, cpu.flags);
+}
diff --git a/arch/x86/boot/compressed/early_serial_console.c b/arch/x86/boot/compressed/early_serial_console.c
new file mode 100644
index 000000000000..70a8d1706d0f
--- /dev/null
+++ b/arch/x86/boot/compressed/early_serial_console.c
@@ -0,0 +1,6 @@
+#include "misc.h"
+
+/* This might be accessed before .bss is cleared, so use .data instead. */
+int early_serial_base __section(".data");
+
+#include "../early_serial_console.c"
diff --git a/arch/x86/boot/compressed/efi.c b/arch/x86/boot/compressed/efi.c
new file mode 100644
index 000000000000..f2e50f9758e6
--- /dev/null
+++ b/arch/x86/boot/compressed/efi.c
@@ -0,0 +1,236 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Helpers for early access to EFI configuration table.
+ *
+ * Originally derived from arch/x86/boot/compressed/acpi.c
+ */
+
+#include "misc.h"
+
+#include <asm/bootparam.h>
+
+/**
+ * efi_get_type - Given a pointer to boot_params, determine the type of EFI environment.
+ *
+ * @bp: pointer to boot_params
+ *
+ * Return: EFI_TYPE_{32,64} for valid EFI environments, EFI_TYPE_NONE otherwise.
+ */
+enum efi_type efi_get_type(struct boot_params *bp)
+{
+ struct efi_info *ei;
+ enum efi_type et;
+ const char *sig;
+
+ ei = &bp->efi_info;
+ sig = (char *)&ei->efi_loader_signature;
+
+ if (!strncmp(sig, EFI64_LOADER_SIGNATURE, 4)) {
+ et = EFI_TYPE_64;
+ } else if (!strncmp(sig, EFI32_LOADER_SIGNATURE, 4)) {
+ et = EFI_TYPE_32;
+ } else {
+ debug_putstr("No EFI environment detected.\n");
+ et = EFI_TYPE_NONE;
+ }
+
+#ifndef CONFIG_X86_64
+ /*
+	 * Existing callers like acpi.c treat this case as an indicator to
+	 * fall through to non-EFI rather than as an error, so maintain that
+	 * behavior here as well.
+ */
+ if (ei->efi_systab_hi || ei->efi_memmap_hi) {
+ debug_putstr("EFI system table is located above 4GB and cannot be accessed.\n");
+ et = EFI_TYPE_NONE;
+ }
+#endif
+
+ return et;
+}
+
+/**
+ * efi_get_system_table - Given a pointer to boot_params, retrieve the physical address
+ * of the EFI system table.
+ *
+ * @bp: pointer to boot_params
+ *
+ * Return: EFI system table address on success. On error, return 0.
+ */
+unsigned long efi_get_system_table(struct boot_params *bp)
+{
+ unsigned long sys_tbl_pa;
+ struct efi_info *ei;
+ enum efi_type et;
+
+ /* Get systab from boot params. */
+ ei = &bp->efi_info;
+#ifdef CONFIG_X86_64
+ sys_tbl_pa = ei->efi_systab | ((__u64)ei->efi_systab_hi << 32);
+#else
+ sys_tbl_pa = ei->efi_systab;
+#endif
+ if (!sys_tbl_pa) {
+ debug_putstr("EFI system table not found.");
+ return 0;
+ }
+
+ return sys_tbl_pa;
+}
+
+/*
+ * EFI config table address changes to virtual address after boot, which may
+ * not be accessible for the kexec'd kernel. To address this, kexec provides
+ * the initial physical address via a struct setup_data entry, which is
+ * checked for here, along with some sanity checks.
+ */
+static struct efi_setup_data *get_kexec_setup_data(struct boot_params *bp,
+ enum efi_type et)
+{
+#ifdef CONFIG_X86_64
+ struct efi_setup_data *esd = NULL;
+ struct setup_data *data;
+ u64 pa_data;
+
+ pa_data = bp->hdr.setup_data;
+ while (pa_data) {
+ data = (struct setup_data *)pa_data;
+ if (data->type == SETUP_EFI) {
+ esd = (struct efi_setup_data *)(pa_data + sizeof(struct setup_data));
+ break;
+ }
+
+ pa_data = data->next;
+ }
+
+ /*
+ * Original ACPI code falls back to attempting normal EFI boot in these
+ * cases, so maintain existing behavior by indicating non-kexec
+ * environment to the caller, but print them for debugging.
+ */
+ if (esd && !esd->tables) {
+ debug_putstr("kexec EFI environment missing valid configuration table.\n");
+ return NULL;
+ }
+
+ return esd;
+#endif
+ return NULL;
+}
+
+/**
+ * efi_get_conf_table - Given a pointer to boot_params, locate and return the physical
+ * address of EFI configuration table.
+ *
+ * @bp: pointer to boot_params
+ * @cfg_tbl_pa: location to store physical address of config table
+ * @cfg_tbl_len: location to store number of config table entries
+ *
+ * Return: 0 on success. On error, return params are left unchanged.
+ */
+int efi_get_conf_table(struct boot_params *bp, unsigned long *cfg_tbl_pa,
+ unsigned int *cfg_tbl_len)
+{
+ unsigned long sys_tbl_pa;
+ enum efi_type et;
+ int ret;
+
+ if (!cfg_tbl_pa || !cfg_tbl_len)
+ return -EINVAL;
+
+ sys_tbl_pa = efi_get_system_table(bp);
+ if (!sys_tbl_pa)
+ return -EINVAL;
+
+ /* Handle EFI bitness properly */
+ et = efi_get_type(bp);
+ if (et == EFI_TYPE_64) {
+ efi_system_table_64_t *stbl = (efi_system_table_64_t *)sys_tbl_pa;
+ struct efi_setup_data *esd;
+
+ /* kexec provides an alternative EFI conf table, check for it. */
+ esd = get_kexec_setup_data(bp, et);
+
+ *cfg_tbl_pa = esd ? esd->tables : stbl->tables;
+ *cfg_tbl_len = stbl->nr_tables;
+ } else if (et == EFI_TYPE_32) {
+ efi_system_table_32_t *stbl = (efi_system_table_32_t *)sys_tbl_pa;
+
+ *cfg_tbl_pa = stbl->tables;
+ *cfg_tbl_len = stbl->nr_tables;
+ } else {
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/* Get vendor table address/guid from EFI config table at the given index */
+static int get_vendor_table(void *cfg_tbl, unsigned int idx,
+ unsigned long *vendor_tbl_pa,
+ efi_guid_t *vendor_tbl_guid,
+ enum efi_type et)
+{
+ if (et == EFI_TYPE_64) {
+ efi_config_table_64_t *tbl_entry = (efi_config_table_64_t *)cfg_tbl + idx;
+
+ if (!IS_ENABLED(CONFIG_X86_64) && tbl_entry->table >> 32) {
+ debug_putstr("Error: EFI config table entry located above 4GB.\n");
+ return -EINVAL;
+ }
+
+ *vendor_tbl_pa = tbl_entry->table;
+ *vendor_tbl_guid = tbl_entry->guid;
+
+ } else if (et == EFI_TYPE_32) {
+ efi_config_table_32_t *tbl_entry = (efi_config_table_32_t *)cfg_tbl + idx;
+
+ *vendor_tbl_pa = tbl_entry->table;
+ *vendor_tbl_guid = tbl_entry->guid;
+ } else {
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/**
+ * efi_find_vendor_table - Given EFI config table, search it for the physical
+ * address of the vendor table associated with GUID.
+ *
+ * @bp: pointer to boot_params
+ * @cfg_tbl_pa: pointer to EFI configuration table
+ * @cfg_tbl_len: number of entries in EFI configuration table
+ * @guid: GUID of vendor table
+ *
+ * Return: vendor table address on success. On error, return 0.
+ */
+unsigned long efi_find_vendor_table(struct boot_params *bp,
+ unsigned long cfg_tbl_pa,
+ unsigned int cfg_tbl_len,
+ efi_guid_t guid)
+{
+ enum efi_type et;
+ unsigned int i;
+
+ et = efi_get_type(bp);
+ if (et == EFI_TYPE_NONE)
+ return 0;
+
+ for (i = 0; i < cfg_tbl_len; i++) {
+ unsigned long vendor_tbl_pa;
+ efi_guid_t vendor_tbl_guid;
+ int ret;
+
+ ret = get_vendor_table((void *)cfg_tbl_pa, i,
+ &vendor_tbl_pa,
+ &vendor_tbl_guid, et);
+ if (ret)
+ return 0;
+
+ if (!efi_guidcmp(guid, vendor_tbl_guid))
+ return vendor_tbl_pa;
+ }
+
+ return 0;
+}
diff --git a/arch/x86/boot/compressed/efi.h b/arch/x86/boot/compressed/efi.h
new file mode 100644
index 000000000000..b22300970f97
--- /dev/null
+++ b/arch/x86/boot/compressed/efi.h
@@ -0,0 +1,127 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef BOOT_COMPRESSED_EFI_H
+#define BOOT_COMPRESSED_EFI_H
+
+#if defined(_LINUX_EFI_H) || defined(_ASM_X86_EFI_H)
+#error Please do not include kernel proper namespace headers
+#endif
+
+typedef guid_t efi_guid_t __aligned(__alignof__(u32));
+
+#define EFI_GUID(a, b, c, d...) (efi_guid_t){ { \
+ (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \
+ (b) & 0xff, ((b) >> 8) & 0xff, \
+ (c) & 0xff, ((c) >> 8) & 0xff, d } }
+
+#define ACPI_TABLE_GUID EFI_GUID(0xeb9d2d30, 0x2d88, 0x11d3, 0x9a, 0x16, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d)
+#define ACPI_20_TABLE_GUID EFI_GUID(0x8868e871, 0xe4f1, 0x11d3, 0xbc, 0x22, 0x00, 0x80, 0xc7, 0x3c, 0x88, 0x81)
+#define EFI_CC_BLOB_GUID EFI_GUID(0x067b1f5f, 0xcf26, 0x44c5, 0x85, 0x54, 0x93, 0xd7, 0x77, 0x91, 0x2d, 0x42)
+#define LINUX_EFI_UNACCEPTED_MEM_TABLE_GUID EFI_GUID(0xd5d1de3c, 0x105c, 0x44f9, 0x9e, 0xa9, 0xbc, 0xef, 0x98, 0x12, 0x00, 0x31)
+
+#define EFI32_LOADER_SIGNATURE "EL32"
+#define EFI64_LOADER_SIGNATURE "EL64"
+
+/*
+ * Generic EFI table header
+ */
+typedef struct {
+ u64 signature;
+ u32 revision;
+ u32 headersize;
+ u32 crc32;
+ u32 reserved;
+} efi_table_hdr_t;
+
+#define EFI_CONVENTIONAL_MEMORY 7
+#define EFI_UNACCEPTED_MEMORY 15
+
+#define EFI_MEMORY_MORE_RELIABLE \
+ ((u64)0x0000000000010000ULL) /* higher reliability */
+#define EFI_MEMORY_SP ((u64)0x0000000000040000ULL) /* soft reserved */
+
+#define EFI_PAGE_SHIFT 12
+
+typedef struct {
+ u32 type;
+ u32 pad;
+ u64 phys_addr;
+ u64 virt_addr;
+ u64 num_pages;
+ u64 attribute;
+} efi_memory_desc_t;
+
+#define efi_early_memdesc_ptr(map, desc_size, n) \
+ (efi_memory_desc_t *)((void *)(map) + ((n) * (desc_size)))
+
+typedef struct {
+ efi_guid_t guid;
+ u64 table;
+} efi_config_table_64_t;
+
+typedef struct {
+ efi_guid_t guid;
+ u32 table;
+} efi_config_table_32_t;
+
+typedef struct {
+ efi_table_hdr_t hdr;
+ u64 fw_vendor; /* physical addr of CHAR16 vendor string */
+ u32 fw_revision;
+ u32 __pad1;
+ u64 con_in_handle;
+ u64 con_in;
+ u64 con_out_handle;
+ u64 con_out;
+ u64 stderr_handle;
+ u64 stderr;
+ u64 runtime;
+ u64 boottime;
+ u32 nr_tables;
+ u32 __pad2;
+ u64 tables;
+} efi_system_table_64_t;
+
+typedef struct {
+ efi_table_hdr_t hdr;
+ u32 fw_vendor; /* physical addr of CHAR16 vendor string */
+ u32 fw_revision;
+ u32 con_in_handle;
+ u32 con_in;
+ u32 con_out_handle;
+ u32 con_out;
+ u32 stderr_handle;
+ u32 stderr;
+ u32 runtime;
+ u32 boottime;
+ u32 nr_tables;
+ u32 tables;
+} efi_system_table_32_t;
+
+struct efi_unaccepted_memory {
+ u32 version;
+ u32 unit_size;
+ u64 phys_base;
+ u64 size;
+ unsigned long bitmap[];
+};
+
+static inline int efi_guidcmp (efi_guid_t left, efi_guid_t right)
+{
+ return memcmp(&left, &right, sizeof (efi_guid_t));
+}
+
+#ifdef CONFIG_EFI
+bool __pure __efi_soft_reserve_enabled(void);
+
+static inline bool __pure efi_soft_reserve_enabled(void)
+{
+ return IS_ENABLED(CONFIG_EFI_SOFT_RESERVE)
+ && __efi_soft_reserve_enabled();
+}
+#else
+static inline bool efi_soft_reserve_enabled(void)
+{
+ return false;
+}
+#endif /* CONFIG_EFI */
+#endif /* BOOT_COMPRESSED_EFI_H */
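
The byte-swizzling in EFI_GUID() exists because UEFI stores the first three
GUID fields little-endian. As a worked example, ACPI_TABLE_GUID
(eb9d2d30-2d88-11d3-9a16-0090273fc14d) should lay out in memory as:

    /* Expected in-memory bytes of ACPI_TABLE_GUID: the u32 and two u16
     * fields little-endian, the final eight bytes in textual order. */
    static const u8 acpi_guid_bytes[16] = {
        0x30, 0x2d, 0x9d, 0xeb,    /* 0xeb9d2d30, little-endian */
        0x88, 0x2d,                /* 0x2d88, little-endian */
        0xd3, 0x11,                /* 0x11d3, little-endian */
        0x9a, 0x16, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d,
    };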
diff --git a/arch/x86/boot/compressed/error.c b/arch/x86/boot/compressed/error.c
new file mode 100644
index 000000000000..19a8251de506
--- /dev/null
+++ b/arch/x86/boot/compressed/error.c
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Callers outside of misc.c need access to the error reporting routines,
+ * but the *_putstr() functions need to stay in misc.c because of how
+ * memcpy() and memmove() are defined for the compressed boot environment.
+ */
+#include "misc.h"
+#include "error.h"
+
+void warn(const char *m)
+{
+ error_putstr("\n\n");
+ error_putstr(m);
+ error_putstr("\n\n");
+}
+
+void error(char *m)
+{
+ warn(m);
+ error_putstr(" -- System halted");
+
+ while (1)
+ asm("hlt");
+}
+
+/* EFI libstub provides vsnprintf() */
+#ifdef CONFIG_EFI_STUB
+void panic(const char *fmt, ...)
+{
+ static char buf[1024];
+ va_list args;
+ int len;
+
+ va_start(args, fmt);
+ len = vsnprintf(buf, sizeof(buf), fmt, args);
+ va_end(args);
+
+ if (len && buf[len - 1] == '\n')
+ buf[len - 1] = '\0';
+
+ error(buf);
+}
+#endif
diff --git a/arch/x86/boot/compressed/error.h b/arch/x86/boot/compressed/error.h
new file mode 100644
index 000000000000..31f9e080d61a
--- /dev/null
+++ b/arch/x86/boot/compressed/error.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef BOOT_COMPRESSED_ERROR_H
+#define BOOT_COMPRESSED_ERROR_H
+
+#include <linux/compiler.h>
+
+void warn(const char *m);
+void error(char *m) __noreturn;
+void panic(const char *fmt, ...) __noreturn __cold;
+
+#endif /* BOOT_COMPRESSED_ERROR_H */
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 75e4f001e706..1cfe9802a42f 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* linux/boot/head.S
*
@@ -23,30 +24,28 @@
*/
.text
+#include <linux/init.h>
#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/page_types.h>
#include <asm/boot.h>
#include <asm/asm-offsets.h>
+#include <asm/bootparam.h>
- .section ".text.head","ax",@progbits
-ENTRY(startup_32)
- cld
- /*
- * Test KEEP_SEGMENTS flag to see if the bootloader is asking
- * us to not reload segments
- */
- testb $(1<<6), BP_loadflags(%esi)
- jnz 1f
+/*
+ * These symbols needed to be marked as .hidden to prevent the BFD linker from
+ * generating R_386_32 (rather than R_386_RELATIVE) relocations for them when
+ * the 32-bit compressed kernel is linked as PIE. This is no longer necessary,
+ * but it doesn't hurt to keep them .hidden.
+ */
+ .hidden _bss
+ .hidden _ebss
+ .hidden _end
+ __HEAD
+SYM_FUNC_START(startup_32)
+ cld
cli
- movl $__BOOT_DS, %eax
- movl %eax, %ds
- movl %eax, %es
- movl %eax, %fs
- movl %eax, %gs
- movl %eax, %ss
-1:
/*
* Calculate the delta between where we were compiled to run
@@ -58,31 +57,51 @@ ENTRY(startup_32)
*/
leal (BP_scratch+4)(%esi), %esp
call 1f
-1: popl %ebp
- subl $1b, %ebp
+1: popl %edx
+ addl $_GLOBAL_OFFSET_TABLE_+(.-1b), %edx
+
+ /* Load new GDT */
+ leal gdt@GOTOFF(%edx), %eax
+ movl %eax, 2(%eax)
+ lgdt (%eax)
+
+ /* Load segment registers with our descriptors */
+ movl $__BOOT_DS, %eax
+ movl %eax, %ds
+ movl %eax, %es
+ movl %eax, %fs
+ movl %eax, %gs
+ movl %eax, %ss
/*
- * %ebp contains the address we are loaded at by the boot loader and %ebx
- * contains the address where we should move the kernel image temporarily
- * for safe in-place decompression.
+ * %edx contains the address we are loaded at by the boot loader (plus the
+ * offset to the GOT). The below code calculates %ebx to be the address where
+ * we should move the kernel image temporarily for safe in-place decompression
+ * (again, plus the offset to the GOT).
+ *
+ * %ebp is calculated to be the address that the kernel will be decompressed to.
*/
#ifdef CONFIG_RELOCATABLE
- movl %ebp, %ebx
+ leal startup_32@GOTOFF(%edx), %ebx
movl BP_kernel_alignment(%esi), %eax
decl %eax
addl %eax, %ebx
notl %eax
andl %eax, %ebx
-#else
- movl $LOAD_PHYSICAL_ADDR, %ebx
+ cmpl $LOAD_PHYSICAL_ADDR, %ebx
+ jae 1f
#endif
+ movl $LOAD_PHYSICAL_ADDR, %ebx
+1:
+ movl %ebx, %ebp // Save the output address for later
/* Target address to relocate to for decompression */
- addl $z_extract_offset, %ebx
+ addl BP_init_size(%esi), %ebx
+ subl $_end@GOTOFF, %ebx
/* Set up the stack */
- leal boot_stack_end(%ebx), %esp
+ leal boot_stack_end@GOTOFF(%ebx), %esp
/* Zero EFLAGS */
pushl $0
@@ -93,8 +112,8 @@ ENTRY(startup_32)
* where decompression in place becomes safe.
*/
pushl %esi
- leal (_bss-4)(%ebp), %esi
- leal (_bss-4)(%ebx), %edi
+ leal (_bss@GOTOFF-4)(%edx), %esi
+ leal (_bss@GOTOFF-4)(%ebx), %edi
movl $(_bss - startup_32), %ecx
shrl $2, %ecx
std
@@ -102,80 +121,68 @@ ENTRY(startup_32)
cld
popl %esi
+ /*
+ * The GDT may get overwritten either during the copy we just did or
+ * during extract_kernel below. To avoid any issues, repoint the GDTR
+ * to the new copy of the GDT.
+ */
+ leal gdt@GOTOFF(%ebx), %eax
+ movl %eax, 2(%eax)
+ lgdt (%eax)
+
/*
* Jump to the relocated address.
*/
- leal relocated(%ebx), %eax
+ leal .Lrelocated@GOTOFF(%ebx), %eax
jmp *%eax
-ENDPROC(startup_32)
+SYM_FUNC_END(startup_32)
.text
-relocated:
+SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated)
/*
* Clear BSS (stack is currently empty)
*/
xorl %eax, %eax
- leal _bss(%ebx), %edi
- leal _ebss(%ebx), %ecx
+ leal _bss@GOTOFF(%ebx), %edi
+ leal _ebss@GOTOFF(%ebx), %ecx
subl %edi, %ecx
shrl $2, %ecx
rep stosl
/*
- * Do the decompression, and jump to the new kernel..
+	 * Do the extraction, and jump to the new kernel.
*/
- leal z_extract_offset_negative(%ebx), %ebp
- /* push arguments for decompress_kernel: */
- pushl %ebp /* output address */
- pushl $z_input_len /* input_len */
- leal input_data(%ebx), %eax
- pushl %eax /* input_data */
- leal boot_heap(%ebx), %eax
- pushl %eax /* heap area */
- pushl %esi /* real mode pointer */
- call decompress_kernel
- addl $20, %esp
-
-#if CONFIG_RELOCATABLE
-/*
- * Find the address of the relocations.
- */
- leal z_output_len(%ebp), %edi
+ /* push arguments for extract_kernel: */
-/*
- * Calculate the delta between where vmlinux was compiled to run
- * and where it was actually loaded.
- */
- movl %ebp, %ebx
- subl $LOAD_PHYSICAL_ADDR, %ebx
- jz 2f /* Nothing to be done if loaded at compiled addr. */
-/*
- * Process relocations.
- */
-
-1: subl $4, %edi
- movl (%edi), %ecx
- testl %ecx, %ecx
- jz 2f
- addl %ebx, -__PAGE_OFFSET(%ebx, %ecx)
- jmp 1b
-2:
-#endif
+ pushl %ebp /* output address */
+ pushl %esi /* real mode pointer */
+ call extract_kernel /* returns kernel entry point in %eax */
+ addl $24, %esp
/*
- * Jump to the decompressed kernel.
+ * Jump to the extracted kernel.
*/
xorl %ebx, %ebx
- jmp *%ebp
+ jmp *%eax
+SYM_FUNC_END(.Lrelocated)
+
+ .data
+ .balign 8
+SYM_DATA_START_LOCAL(gdt)
+ .word gdt_end - gdt - 1
+ .long 0
+ .word 0
+ .quad 0x0000000000000000 /* Reserved */
+ .quad 0x00cf9a000000ffff /* __KERNEL_CS */
+ .quad 0x00cf92000000ffff /* __KERNEL_DS */
+SYM_DATA_END_LABEL(gdt, SYM_L_LOCAL, gdt_end)
/*
* Stack and heap for uncompression
*/
.bss
.balign 4
-boot_heap:
- .fill BOOT_HEAP_SIZE, 1, 0
boot_stack:
.fill BOOT_STACK_SIZE, 1, 0
boot_stack_end:
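
The CONFIG_RELOCATABLE path in startup_32 above (decl/addl/notl/andl against
kernel_alignment) is the standard round-up-to-a-power-of-two idiom. In C
terms:

    /* Round addr up to the next multiple of align (a power of two) --
     * the same computation the assembly performs:
     *   eax = align - 1; ebx += eax; eax = ~eax; ebx &= eax; */
    static unsigned long round_up_pow2(unsigned long addr, unsigned long align)
    {
        return (addr + align - 1) & ~(align - 1);
    }

    /* Example: round_up_pow2(0x123456, 0x200000) == 0x200000. */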
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index f62c284db9eb..d9dab940ff62 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* linux/boot/head.S
*
@@ -24,32 +25,69 @@
.code32
.text
+#include <linux/init.h>
#include <linux/linkage.h>
#include <asm/segment.h>
-#include <asm/pgtable_types.h>
-#include <asm/page_types.h>
#include <asm/boot.h>
#include <asm/msr.h>
#include <asm/processor-flags.h>
#include <asm/asm-offsets.h>
+#include <asm/bootparam.h>
+#include <asm/desc_defs.h>
+#include <asm/trapnr.h>
+
+/*
+ * Fix alignment at 16 bytes. Following CONFIG_FUNCTION_ALIGNMENT will result
+ * in assembly errors due to trying to move .org backward due to the excessive
+ * alignment.
+ */
+#undef __ALIGN
+#define __ALIGN .balign 16, 0x90
+
+/*
+ * Locally defined symbols should be marked hidden:
+ */
+ .hidden _bss
+ .hidden _ebss
+ .hidden _end
+
+ __HEAD
+
+/*
+ * This macro gives the relative virtual address of X, i.e. the offset of X
+ * from startup_32. This is the same as the link-time virtual address of X,
+ * since startup_32 is at 0, but defining it this way tells the
+ * assembler/linker that we do not want the actual run-time address of X. This
+ * prevents the linker from trying to create unwanted run-time relocation
+ * entries for the reference when the compressed kernel is linked as PIE.
+ *
+ * A reference X(%reg) will result in the link-time VA of X being stored with
+ * the instruction, and a run-time R_X86_64_RELATIVE relocation entry that
+ * adds the 64-bit base address where the kernel is loaded.
+ *
+ * Replacing it with (X-startup_32)(%reg) results in the offset being stored,
+ * and no run-time relocation.
+ *
+ * The macro should be used as a displacement with a base register containing
+ * the run-time address of startup_32 [i.e. rva(X)(%reg)], or as an immediate
+ * [$ rva(X)].
+ *
+ * This macro can only be used from within the .head.text section, since the
+ * expression requires startup_32 to be in the same section as the code being
+ * assembled.
+ */
+#define rva(X) ((X) - startup_32)
- .section ".text.head"
.code32
-ENTRY(startup_32)
- cld
+SYM_FUNC_START(startup_32)
/*
- * Test KEEP_SEGMENTS flag to see if the bootloader is asking
- * us to not reload segments
+ * 32bit entry is 0 and it is ABI so immutable!
+ * If we come here directly from a bootloader,
+ * kernel(text+data+bss+brk) ramdisk, zero_page, command line
+ * all need to be under the 4G limit.
*/
- testb $(1<<6), BP_loadflags(%esi)
- jnz 1f
-
+ cld
cli
- movl $(__KERNEL_DS), %eax
- movl %eax, %ds
- movl %eax, %es
- movl %eax, %ss
-1:
/*
* Calculate the delta between where we were compiled to run
@@ -62,16 +100,39 @@ ENTRY(startup_32)
leal (BP_scratch+4)(%esi), %esp
call 1f
1: popl %ebp
- subl $1b, %ebp
+ subl $ rva(1b), %ebp
+
+ /* Load new GDT with the 64bit segments using 32bit descriptor */
+ leal rva(gdt)(%ebp), %eax
+ movl %eax, 2(%eax)
+ lgdt (%eax)
+
+ /* Load segment registers with our descriptors */
+ movl $__BOOT_DS, %eax
+ movl %eax, %ds
+ movl %eax, %es
+ movl %eax, %fs
+ movl %eax, %gs
+ movl %eax, %ss
-/* setup a stack and make sure cpu supports long mode. */
- movl $boot_stack_end, %eax
- addl %ebp, %eax
- movl %eax, %esp
+ /* Setup a stack and load CS from current GDT */
+ leal rva(boot_stack_end)(%ebp), %esp
+
+ pushl $__KERNEL32_CS
+ leal rva(1f)(%ebp), %eax
+ pushl %eax
+ lretl
+1:
+
+ /* Setup Exception handling for SEV-ES */
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ call startup32_load_idt
+#endif
+ /* Make sure cpu supports long mode. */
call verify_cpu
testl %eax, %eax
- jnz no_longmode
+ jnz .Lno_longmode
/*
* Compute the delta between where we were compiled to run at
@@ -89,63 +150,88 @@ ENTRY(startup_32)
addl %eax, %ebx
notl %eax
andl %eax, %ebx
-#else
- movl $LOAD_PHYSICAL_ADDR, %ebx
+ cmpl $LOAD_PHYSICAL_ADDR, %ebx
+ jae 1f
#endif
+ movl $LOAD_PHYSICAL_ADDR, %ebx
+1:
/* Target address to relocate to for decompression */
- addl $z_extract_offset, %ebx
+ addl BP_init_size(%esi), %ebx
+ subl $ rva(_end), %ebx
/*
* Prepare for entering 64 bit mode
*/
- /* Load new GDT with the 64bit segments using 32bit descriptor */
- leal gdt(%ebp), %eax
- movl %eax, gdt+2(%ebp)
- lgdt gdt(%ebp)
-
/* Enable PAE mode */
- xorl %eax, %eax
- orl $(X86_CR4_PAE), %eax
+ movl %cr4, %eax
+ orl $X86_CR4_PAE, %eax
movl %eax, %cr4
/*
* Build early 4G boot pagetable
*/
+ /*
+ * If SEV is active then set the encryption mask in the page tables.
+ * This will ensure that when the kernel is copied and decompressed
+ * it will be done so encrypted.
+ */
+ xorl %edx, %edx
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ call get_sev_encryption_bit
+ xorl %edx, %edx
+ testl %eax, %eax
+ jz 1f
+ subl $32, %eax /* Encryption bit is always above bit 31 */
+ bts %eax, %edx /* Set encryption mask for page tables */
+ /*
+ * Set MSR_AMD64_SEV_ENABLED_BIT in sev_status so that
+ * startup32_check_sev_cbit() will do a check. sev_enable() will
+ * initialize sev_status with all the bits reported by
+ * MSR_AMD_SEV_STATUS later, but only MSR_AMD64_SEV_ENABLED_BIT
+ * needs to be set for now.
+ */
+ movl $1, rva(sev_status)(%ebp)
+1:
+#endif
+
/* Initialize Page tables to 0 */
- leal pgtable(%ebx), %edi
+ leal rva(pgtable)(%ebx), %edi
xorl %eax, %eax
- movl $((4096*6)/4), %ecx
+ movl $(BOOT_INIT_PGT_SIZE/4), %ecx
rep stosl
/* Build Level 4 */
- leal pgtable + 0(%ebx), %edi
+ leal rva(pgtable + 0)(%ebx), %edi
leal 0x1007 (%edi), %eax
movl %eax, 0(%edi)
+ addl %edx, 4(%edi)
/* Build Level 3 */
- leal pgtable + 0x1000(%ebx), %edi
+ leal rva(pgtable + 0x1000)(%ebx), %edi
leal 0x1007(%edi), %eax
movl $4, %ecx
1: movl %eax, 0x00(%edi)
+ addl %edx, 0x04(%edi)
addl $0x00001000, %eax
addl $8, %edi
decl %ecx
jnz 1b
/* Build Level 2 */
- leal pgtable + 0x2000(%ebx), %edi
+ leal rva(pgtable + 0x2000)(%ebx), %edi
movl $0x00000183, %eax
movl $2048, %ecx
1: movl %eax, 0(%edi)
+ addl %edx, 4(%edi)
addl $0x00200000, %eax
addl $8, %edi
decl %ecx
jnz 1b
/* Enable the boot page tables */
- leal pgtable(%ebx), %eax
+ leal rva(pgtable)(%ebx), %eax
movl %eax, %cr3
/* Enable Long mode in EFER (Extended Feature Enable Register) */
@@ -154,52 +240,56 @@ ENTRY(startup_32)
btsl $_EFER_LME, %eax
wrmsr
+ /* After gdt is loaded */
+ xorl %eax, %eax
+ lldt %ax
+ movl $__BOOT_TSS, %eax
+ ltr %ax
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ /* Check if the C-bit position is correct when SEV is active */
+ call startup32_check_sev_cbit
+#endif
+
/*
* Setup for the jump to 64bit mode
*
- * When the jump is performend we will be in long mode but
+ * When the jump is performed we will be in long mode but
* in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1
* (and in turn EFER.LMA = 1). To jump into 64bit mode we use
* the new gdt/idt that has __KERNEL_CS with CS.L = 1.
	 * We place all of the values on our mini stack so lret can be
	 * used to perform that far jump.
*/
+ leal rva(startup_64)(%ebp), %eax
pushl $__KERNEL_CS
- leal startup_64(%ebp), %eax
pushl %eax
/* Enter paged protected Mode, activating Long Mode */
- movl $(X86_CR0_PG | X86_CR0_PE), %eax /* Enable Paging and Protected mode */
+ movl $CR0_STATE, %eax
movl %eax, %cr0
/* Jump from 32bit compatibility mode into 64bit mode. */
lret
-ENDPROC(startup_32)
+SYM_FUNC_END(startup_32)
-no_longmode:
- /* This isn't an x86-64 CPU so hang */
-1:
- hlt
- jmp 1b
-
-#include "../../kernel/verify_cpu_64.S"
-
- /*
- * Be careful here startup_64 needs to be at a predictable
- * address so I can export it in an ELF header. Bootloaders
- * should look at the ELF header to find this address, as
- * it may change in the future.
- */
.code64
.org 0x200
-ENTRY(startup_64)
+SYM_CODE_START(startup_64)
/*
+ * 64bit entry is 0x200 and it is ABI so immutable!
* We come here either from startup_32 or directly from a
- * 64bit bootloader. If we come here from a bootloader we depend on
- * an identity mapped page table being provied that maps our
- * entire text+data+bss and hopefully all of memory.
+ * 64bit bootloader.
+ * If we come here from a bootloader, kernel(text+data+bss+brk),
+ * ramdisk, zero_page, command line could be above 4G.
+ * We depend on an identity mapped page table being provided
+ * that maps our entire kernel(text+data+bss+brk), zero page
+ * and command line.
*/
+ cld
+ cli
+
/* Setup data segments. */
xorl %eax, %eax
movl %eax, %ds
@@ -207,9 +297,6 @@ ENTRY(startup_64)
movl %eax, %ss
movl %eax, %fs
movl %eax, %gs
- lldt %ax
- movl $0x20, %eax
- ltr %ax
/*
* Compute the decompressed kernel start address. It is where
@@ -232,15 +319,94 @@ ENTRY(startup_64)
addq %rax, %rbp
notq %rax
andq %rax, %rbp
-#else
- movq $LOAD_PHYSICAL_ADDR, %rbp
+ cmpq $LOAD_PHYSICAL_ADDR, %rbp
+ jae 1f
#endif
+ movq $LOAD_PHYSICAL_ADDR, %rbp
+1:
/* Target address to relocate to for decompression */
- leaq z_extract_offset(%rbp), %rbx
+ movl BP_init_size(%rsi), %ebx
+ subl $ rva(_end), %ebx
+ addq %rbp, %rbx
/* Set up the stack */
- leaq boot_stack_end(%rbx), %rsp
+ leaq rva(boot_stack_end)(%rbx), %rsp
+
+ /*
+ * At this point we are in long mode with 4-level paging enabled,
+ * but we might want to enable 5-level paging or vice versa.
+ *
+ * The problem is that we cannot do it directly. Setting or clearing
+ * CR4.LA57 in long mode would trigger #GP. So we need to switch off
+ * long mode and paging first.
+ *
+ * We also need a trampoline in lower memory to switch over from
+ * 4- to 5-level paging for cases when the bootloader puts the kernel
+ * above 4G, but didn't enable 5-level paging for us.
+ *
+ * The same trampoline can be used to switch from 5- to 4-level paging
+ * mode, like when starting 4-level paging kernel via kexec() when
+ * original kernel worked in 5-level paging mode.
+ *
+ * For the trampoline, we need the top page table to reside in lower
+ * memory as we don't have a way to load 64-bit values into CR3 in
+ * 32-bit mode.
+ */
+
+ /* Make sure we have GDT with 32-bit code segment */
+ leaq gdt64(%rip), %rax
+ addq %rax, 2(%rax)
+ lgdt (%rax)
+
+ /* Reload CS so IRET returns to a CS actually in the GDT */
+ pushq $__KERNEL_CS
+ leaq .Lon_kernel_cs(%rip), %rax
+ pushq %rax
+ lretq
+
+.Lon_kernel_cs:
+ /*
+ * RSI holds a pointer to a boot_params structure provided by the
+ * loader, and this needs to be preserved across C function calls. So
+ * move it into a callee saved register.
+ */
+ movq %rsi, %r15
+
+ call load_stage1_idt
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ /*
+ * Now that the stage1 interrupt handlers are set up, #VC exceptions from
+ * CPUID instructions can be properly handled for SEV-ES guests.
+ *
+ * For SEV-SNP, the CPUID table also needs to be set up in advance of any
+ * CPUID instructions being issued, so go ahead and do that now via
+ * sev_enable(), which will also handle the rest of the SEV-related
+ * detection/setup to ensure that has been done in advance of any dependent
+ * code. Pass the boot_params pointer as the first argument.
+ */
+ movq %r15, %rdi
+ call sev_enable
+#endif
+
+ /* Preserve only the CR4 bits that must be preserved, and clear the rest */
+ movq %cr4, %rax
+ andl $(X86_CR4_PAE | X86_CR4_MCE | X86_CR4_LA57), %eax
+ movq %rax, %cr4
+
+ /*
+ * configure_5level_paging() updates the number of paging levels using
+ * a trampoline in 32-bit addressable memory if the current number does
+ * not match the desired number.
+ *
+ * Pass the boot_params pointer as the first argument. The second
+ * argument is the relocated address of the page table to use instead
+ * of the page table in trampoline memory (if required).
+ */
+ movq %r15, %rdi
+ leaq rva(top_pgtable)(%rbx), %rsi
+ call configure_5level_paging
/* Zero EFLAGS */
pushq $0
@@ -250,24 +416,33 @@ ENTRY(startup_64)
* Copy the compressed kernel to the end of our buffer
* where decompression in place becomes safe.
*/
- pushq %rsi
leaq (_bss-8)(%rip), %rsi
- leaq (_bss-8)(%rbx), %rdi
- movq $_bss /* - $startup_32 */, %rcx
- shrq $3, %rcx
+ leaq rva(_bss-8)(%rbx), %rdi
+ movl $(_bss - startup_32), %ecx
+ shrl $3, %ecx
std
rep movsq
cld
- popq %rsi
+
+ /*
+ * The GDT may get overwritten either during the copy we just did or
+ * during extract_kernel below. To avoid any issues, repoint the GDTR
+ * to the new copy of the GDT.
+ */
+ leaq rva(gdt64)(%rbx), %rax
+ leaq rva(gdt)(%rbx), %rdx
+ movq %rdx, 2(%rax)
+ lgdt (%rax)
/*
* Jump to the relocated address.
*/
- leaq relocated(%rbx), %rax
+ leaq rva(.Lrelocated)(%rbx), %rax
jmp *%rax
+SYM_CODE_END(startup_64)
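
The trampoline comment above compresses a lot of mechanism. A minimal C sketch of the decision configure_5level_paging() has to make — all helper names here are hypothetical stand-ins, not kernel APIs:

#include <stdbool.h>

/* Hypothetical probes standing in for the CR4/CPUID checks. */
extern bool la57_enabled(void);  /* is CR4.LA57 currently set?  */
extern bool la57_wanted(void);   /* CPU + kernel want 5 levels? */
extern void trampoline_toggle_la57(void *pgtable_below_4g);

static void sketch_configure_paging_levels(void *pgtable_below_4g)
{
	/*
	 * CR4.LA57 cannot be flipped while long mode is active, so a
	 * mismatch forces a round trip through the 32-bit trampoline.
	 * The replacement top-level page table must live below 4G,
	 * since 32-bit code cannot load a 64-bit value into CR3.
	 */
	if (la57_enabled() != la57_wanted())
		trampoline_toggle_la57(pgtable_below_4g);
}
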
.text
-relocated:
+SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated)
/*
* Clear BSS (stack is currently empty)
@@ -279,50 +454,86 @@ relocated:
shrq $3, %rcx
rep stosq
+ call load_stage2_idt
+
+ /* Pass boot_params to initialize_identity_maps() */
+ movq %r15, %rdi
+ call initialize_identity_maps
+
/*
- * Do the decompression, and jump to the new kernel..
+ * Do the extraction, and jump to the new kernel..
*/
- pushq %rsi /* Save the real mode argument */
- movq %rsi, %rdi /* real mode address */
- leaq boot_heap(%rip), %rsi /* malloc area for uncompression */
- leaq input_data(%rip), %rdx /* input_data */
- movl $z_input_len, %ecx /* input_len */
- movq %rbp, %r8 /* output target address */
- call decompress_kernel
- popq %rsi
+ /* pass struct boot_params pointer and output target address */
+ movq %r15, %rdi
+ movq %rbp, %rsi
+ call extract_kernel /* returns kernel entry point in %rax */
/*
* Jump to the decompressed kernel.
*/
- jmp *%rbp
+ movq %r15, %rsi
+ jmp *%rax
+SYM_FUNC_END(.Lrelocated)
+
+ .code32
+SYM_FUNC_START_LOCAL_NOALIGN(.Lno_longmode)
+ /* This isn't an x86-64 CPU, so hang intentionally; we cannot continue */
+1:
+ hlt
+ jmp 1b
+SYM_FUNC_END(.Lno_longmode)
+
+ .globl verify_cpu
+#include "../../kernel/verify_cpu.S"
.data
-gdt:
- .word gdt_end - gdt
- .long gdt
+SYM_DATA_START_LOCAL(gdt64)
+ .word gdt_end - gdt - 1
+ .quad gdt - gdt64
+SYM_DATA_END(gdt64)
+ .balign 8
+SYM_DATA_START_LOCAL(gdt)
+ .word gdt_end - gdt - 1
+ .long 0
.word 0
- .quad 0x0000000000000000 /* NULL descriptor */
+ .quad 0x00cf9a000000ffff /* __KERNEL32_CS */
.quad 0x00af9a000000ffff /* __KERNEL_CS */
.quad 0x00cf92000000ffff /* __KERNEL_DS */
.quad 0x0080890000000000 /* TS descriptor */
.quad 0x0000000000000000 /* TS continued */
-gdt_end:
+SYM_DATA_END_LABEL(gdt, SYM_L_LOCAL, gdt_end)
+
+SYM_DATA_START(boot_idt_desc)
+ .word boot_idt_end - boot_idt - 1
+ .quad 0
+SYM_DATA_END(boot_idt_desc)
+ .balign 8
+SYM_DATA_START(boot_idt)
+ .rept BOOT_IDT_ENTRIES
+ .quad 0
+ .quad 0
+ .endr
+SYM_DATA_END_LABEL(boot_idt, SYM_L_GLOBAL, boot_idt_end)
/*
* Stack and heap for uncompression
*/
.bss
.balign 4
-boot_heap:
- .fill BOOT_HEAP_SIZE, 1, 0
-boot_stack:
+SYM_DATA_START_LOCAL(boot_stack)
.fill BOOT_STACK_SIZE, 1, 0
-boot_stack_end:
+ .balign 16
+SYM_DATA_END_LABEL(boot_stack, SYM_L_LOCAL, boot_stack_end)
/*
* Space for page tables (not in .bss so not zeroed)
*/
- .section ".pgtable","a",@nobits
+ .section ".pgtable","aw",@nobits
.balign 4096
-pgtable:
- .fill 6*4096, 1, 0
+SYM_DATA_LOCAL(pgtable, .fill BOOT_PGT_SIZE, 1, 0)
+
+/*
+ * This page table is used instead of the page table in trampoline
+ * memory.
+ */
+SYM_DATA_LOCAL(top_pgtable, .fill PAGE_SIZE, 1, 0)
diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c
new file mode 100644
index 000000000000..dfb9c2deb77c
--- /dev/null
+++ b/arch/x86/boot/compressed/ident_map_64.c
@@ -0,0 +1,393 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This code is used on x86_64 to create page table identity mappings on
+ * demand by building up a new set of page tables (or appending to the
+ * existing ones), and then switching over to them when ready.
+ *
+ * Copyright (C) 2015-2016 Yinghai Lu
+ * Copyright (C) 2016 Kees Cook
+ */
+
+/* No MITIGATION_PAGE_TABLE_ISOLATION support needed either: */
+#undef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
+
+#include "error.h"
+#include "misc.h"
+
+/* These actually do the work of building the kernel identity maps. */
+#include <linux/pgtable.h>
+#include <asm/cmpxchg.h>
+#include <asm/trap_pf.h>
+#include <asm/trapnr.h>
+#include <asm/init.h>
+/* Use the static base for this part of the boot process */
+#undef __PAGE_OFFSET
+#define __PAGE_OFFSET __PAGE_OFFSET_BASE
+#include "../../mm/ident_map.c"
+
+#define _SETUP
+#include <asm/setup.h> /* For COMMAND_LINE_SIZE */
+#undef _SETUP
+
+extern unsigned long get_cmd_line_ptr(void);
+
+/* Used by PAGE_KERN* macros: */
+pteval_t __default_kernel_pte_mask __read_mostly = ~0;
+
+/* Used to track our page table allocation area. */
+struct alloc_pgt_data {
+ unsigned char *pgt_buf;
+ unsigned long pgt_buf_size;
+ unsigned long pgt_buf_offset;
+};
+
+/*
+ * Allocates space for a page table entry, using struct alloc_pgt_data
+ * above. Besides the local callers, this is used as the allocation
+ * callback in mapping_info below.
+ */
+static void *alloc_pgt_page(void *context)
+{
+ struct alloc_pgt_data *pages = (struct alloc_pgt_data *)context;
+ unsigned char *entry;
+
+ /* Validate there is space available for a new page. */
+ if (pages->pgt_buf_offset >= pages->pgt_buf_size) {
+ debug_putstr("out of pgt_buf in " __FILE__ "!?\n");
+ debug_putaddr(pages->pgt_buf_offset);
+ debug_putaddr(pages->pgt_buf_size);
+ return NULL;
+ }
+
+ /* Consumed more tables than expected? */
+ if (pages->pgt_buf_offset == BOOT_PGT_SIZE_WARN) {
+ debug_putstr("pgt_buf running low in " __FILE__ "\n");
+ debug_putstr("Need to raise BOOT_PGT_SIZE?\n");
+ debug_putaddr(pages->pgt_buf_offset);
+ debug_putaddr(pages->pgt_buf_size);
+ }
+
+ entry = pages->pgt_buf + pages->pgt_buf_offset;
+ pages->pgt_buf_offset += PAGE_SIZE;
+
+ return entry;
+}
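
For orientation, a minimal sketch of how this bump allocator gets wired into the generic mapping code — it mirrors what initialize_identity_maps() does further down in this file, using the struct x86_mapping_info consumed by kernel_ident_mapping_init():

/*
 * Illustrative wiring only: the generic ident-mapping code calls back
 * into alloc_pgt_page() whenever it needs a fresh page-table page.
 */
static void sketch_wire_allocator(struct alloc_pgt_data *pgt,
				  struct x86_mapping_info *info)
{
	info->alloc_pgt_page = alloc_pgt_page;	/* the bump allocator above */
	info->context	     = pgt;		/* its backing buffer       */
}
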
+
+/* Used to track our allocated page tables. */
+static struct alloc_pgt_data pgt_data;
+
+/* The top level page table entry pointer. */
+static unsigned long top_level_pgt;
+
+phys_addr_t physical_mask = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
+
+/*
+ * Mapping information structure passed to kernel_ident_mapping_init().
+ * Due to relocation, pointers must be assigned at run time not build time.
+ */
+static struct x86_mapping_info mapping_info;
+
+/*
+ * Adds the specified range to the identity mappings.
+ */
+void kernel_add_identity_map(unsigned long start, unsigned long end)
+{
+ int ret;
+
+ /* Align boundary to 2M. */
+ start = round_down(start, PMD_SIZE);
+ end = round_up(end, PMD_SIZE);
+ if (start >= end)
+ return;
+
+ /* Build the mapping. */
+ ret = kernel_ident_mapping_init(&mapping_info, (pgd_t *)top_level_pgt, start, end);
+ if (ret)
+ error("Error: kernel_ident_mapping_init() failed\n");
+}
+
+/* Locates and clears a region for a new top level page table. */
+void initialize_identity_maps(void *rmode)
+{
+ unsigned long cmdline;
+ struct setup_data *sd;
+
+ /* Exclude the encryption mask from __PHYSICAL_MASK */
+ physical_mask &= ~sme_me_mask;
+
+ /* Init mapping_info with run-time function/buffer pointers. */
+ mapping_info.alloc_pgt_page = alloc_pgt_page;
+ mapping_info.context = &pgt_data;
+ mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sme_me_mask;
+ mapping_info.kernpg_flag = _KERNPG_TABLE;
+
+ /*
+ * It should be impossible for this not to already be true,
+ * but since calling this a second time would rewind the other
+ * counters, let's just make sure this is reset too.
+ */
+ pgt_data.pgt_buf_offset = 0;
+
+ /*
+ * If we came here via startup_32(), cr3 will be _pgtable already
+ * and we must append to the existing area instead of entirely
+ * overwriting it.
+ *
+ * With 5-level paging, we use '_pgtable' to allocate the p4d page table,
+ * the top-level page table is allocated separately.
+ *
+ * p4d_offset(top_level_pgt, 0) would cover both the 4- and 5-level
+ * cases. On 4-level paging it's equal to 'top_level_pgt'.
+ */
+ top_level_pgt = read_cr3_pa();
+ if (p4d_offset((pgd_t *)top_level_pgt, 0) == (p4d_t *)_pgtable) {
+ pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE;
+ pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE;
+ memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
+ } else {
+ pgt_data.pgt_buf = _pgtable;
+ pgt_data.pgt_buf_size = BOOT_PGT_SIZE;
+ memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
+ top_level_pgt = (unsigned long)alloc_pgt_page(&pgt_data);
+ }
+
+ /*
+ * New page-table is set up - map the kernel image, boot_params and the
+ * command line. The uncompressed kernel requires boot_params and the
+ * command line to be mapped in the identity mapping. Map them
+ * explicitly here in case the compressed kernel does not touch them,
+ * or does not touch all the pages covering them.
+ */
+ kernel_add_identity_map((unsigned long)_head, (unsigned long)_end);
+ boot_params_ptr = rmode;
+ kernel_add_identity_map((unsigned long)boot_params_ptr,
+ (unsigned long)(boot_params_ptr + 1));
+ cmdline = get_cmd_line_ptr();
+ kernel_add_identity_map(cmdline, cmdline + COMMAND_LINE_SIZE);
+
+ /*
+ * Also map the setup_data entries passed via boot_params in case they
+ * need to be accessed by uncompressed kernel via the identity mapping.
+ */
+ sd = (struct setup_data *)boot_params_ptr->hdr.setup_data;
+ while (sd) {
+ unsigned long sd_addr = (unsigned long)sd;
+
+ kernel_add_identity_map(sd_addr, sd_addr + sizeof(*sd) + sd->len);
+ sd = (struct setup_data *)sd->next;
+ }
+
+ sev_prep_identity_maps(top_level_pgt);
+
+ /* Load the new page-table. */
+ write_cr3(top_level_pgt);
+
+ /*
+ * Now that the required page table mappings are established and a
+ * GHCB can be used, check for SNP guest/HV feature compatibility.
+ */
+ snp_check_features();
+}
+
+static pte_t *split_large_pmd(struct x86_mapping_info *info,
+ pmd_t *pmdp, unsigned long __address)
+{
+ unsigned long page_flags;
+ unsigned long address;
+ pte_t *pte;
+ pmd_t pmd;
+ int i;
+
+ pte = (pte_t *)info->alloc_pgt_page(info->context);
+ if (!pte)
+ return NULL;
+
+ address = __address & PMD_MASK;
+ /* No large page - clear PSE flag */
+ page_flags = info->page_flag & ~_PAGE_PSE;
+
+ /* Populate the PTEs */
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ set_pte(&pte[i], __pte(address | page_flags));
+ address += PAGE_SIZE;
+ }
+
+ /*
+ * Ideally we need to clear the large PMD first and do a TLB
+ * flush before we write the new PMD. But the 2M range of the
+ * PMD might contain the code we execute and/or the stack
+ * we are on, so we can't do that. But that should be safe here
+ * because we are going from large to small mappings and we are
+ * also the only user of the page-table, so there is no chance
+ * of a TLB multihit.
+ */
+ pmd = __pmd((unsigned long)pte | info->kernpg_flag);
+ set_pmd(pmdp, pmd);
+ /* Flush TLB to establish the new PMD */
+ write_cr3(top_level_pgt);
+
+ return pte + pte_index(__address);
+}
+
+static void clflush_page(unsigned long address)
+{
+ unsigned int flush_size;
+ char *cl, *start, *end;
+
+ /*
+ * Hardcode cl-size to 64 - CPUID can't be used here because that might
+ * cause another #VC exception and the GHCB is not ready to use yet.
+ */
+ flush_size = 64;
+ start = (char *)(address & PAGE_MASK);
+ end = start + PAGE_SIZE;
+
+ /*
+ * First make sure there are no pending writes on the cache-lines to
+ * flush.
+ */
+ asm volatile("mfence" : : : "memory");
+
+ for (cl = start; cl != end; cl += flush_size)
+ clflush(cl);
+}
+
+static int set_clr_page_flags(struct x86_mapping_info *info,
+ unsigned long address,
+ pteval_t set, pteval_t clr)
+{
+ pgd_t *pgdp = (pgd_t *)top_level_pgt;
+ p4d_t *p4dp;
+ pud_t *pudp;
+ pmd_t *pmdp;
+ pte_t *ptep, pte;
+
+ /*
+ * First make sure there is a PMD mapping for 'address'.
+ * It should already exist, but keep things generic.
+ *
+ * To map the page just read from it and fault it in if there is no
+ * mapping yet. kernel_add_identity_map() can't be called here because
+ * that would unconditionally map the address on PMD level, destroying
+ * any PTE-level mappings that might already exist. Use assembly here
+ * so the access won't be optimized away.
+ */
+ asm volatile("mov %[address], %%r9"
+ :: [address] "g" (*(unsigned long *)address)
+ : "r9", "memory");
+
+ /*
+ * The page is mapped at least with PMD size - so skip checks and walk
+ * directly to the PMD.
+ */
+ p4dp = p4d_offset(pgdp, address);
+ pudp = pud_offset(p4dp, address);
+ pmdp = pmd_offset(pudp, address);
+
+ if (pmd_leaf(*pmdp))
+ ptep = split_large_pmd(info, pmdp, address);
+ else
+ ptep = pte_offset_kernel(pmdp, address);
+
+ if (!ptep)
+ return -ENOMEM;
+
+ /*
+ * Changing encryption attributes of a page requires to flush it from
+ * the caches.
+ */
+ if ((set | clr) & _PAGE_ENC) {
+ clflush_page(address);
+
+ /*
+ * If the encryption attribute is being cleared, change the page state
+ * to shared in the RMP table.
+ */
+ if (clr)
+ snp_set_page_shared(__pa(address & PAGE_MASK));
+ }
+
+ /* Update PTE */
+ pte = *ptep;
+ pte = pte_set_flags(pte, set);
+ pte = pte_clear_flags(pte, clr);
+ set_pte(ptep, pte);
+
+ /*
+ * If the encryption attribute is being set, then change the page state to
+ * private in the RMP entry. The page state change must be done after the PTE
+ * is updated.
+ */
+ if (set & _PAGE_ENC)
+ snp_set_page_private(__pa(address & PAGE_MASK));
+
+ /* Flush TLB after changing encryption attribute */
+ write_cr3(top_level_pgt);
+
+ return 0;
+}
+
+int set_page_decrypted(unsigned long address)
+{
+ return set_clr_page_flags(&mapping_info, address, 0, _PAGE_ENC);
+}
+
+int set_page_encrypted(unsigned long address)
+{
+ return set_clr_page_flags(&mapping_info, address, _PAGE_ENC, 0);
+}
+
+int set_page_non_present(unsigned long address)
+{
+ return set_clr_page_flags(&mapping_info, address, 0, _PAGE_PRESENT);
+}
+
+static void do_pf_error(const char *msg, unsigned long error_code,
+ unsigned long address, unsigned long ip)
+{
+ error_putstr(msg);
+
+ error_putstr("\nError Code: ");
+ error_puthex(error_code);
+ error_putstr("\nCR2: 0x");
+ error_puthex(address);
+ error_putstr("\nRIP relative to _head: 0x");
+ error_puthex(ip - (unsigned long)_head);
+ error_putstr("\n");
+
+ error("Stopping.\n");
+}
+
+void do_boot_page_fault(struct pt_regs *regs, unsigned long error_code)
+{
+ unsigned long address = native_read_cr2();
+ unsigned long end;
+ bool ghcb_fault;
+
+ ghcb_fault = sev_es_check_ghcb_fault(address);
+
+ address &= PMD_MASK;
+ end = address + PMD_SIZE;
+
+ /*
+ * Check for unexpected error codes. Unexpected are:
+ * - Faults on present pages
+ * - User faults
+ * - Reserved bits set
+ */
+ if (error_code & (X86_PF_PROT | X86_PF_USER | X86_PF_RSVD))
+ do_pf_error("Unexpected page-fault:", error_code, address, regs->ip);
+ else if (ghcb_fault)
+ do_pf_error("Page-fault on GHCB page:", error_code, address, regs->ip);
+
+ /*
+ * Error code is sane - now identity map the 2M region around
+ * the faulting address.
+ */
+ kernel_add_identity_map(address, end);
+}
+
+void do_boot_nmi_trap(struct pt_regs *regs, unsigned long error_code)
+{
+ spurious_nmi_count++;
+}
diff --git a/arch/x86/boot/compressed/idt_64.c b/arch/x86/boot/compressed/idt_64.c
new file mode 100644
index 000000000000..d100284bbef4
--- /dev/null
+++ b/arch/x86/boot/compressed/idt_64.c
@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <asm/trap_pf.h>
+#include <asm/segment.h>
+#include <asm/trapnr.h>
+#include "misc.h"
+
+static void set_idt_entry(int vector, void (*handler)(void))
+{
+ unsigned long address = (unsigned long)handler;
+ gate_desc entry;
+
+ memset(&entry, 0, sizeof(entry));
+
+ entry.offset_low = (u16)(address & 0xffff);
+ entry.segment = __KERNEL_CS;
+ entry.bits.type = GATE_TRAP;
+ entry.bits.p = 1;
+ entry.offset_middle = (u16)((address >> 16) & 0xffff);
+ entry.offset_high = (u32)(address >> 32);
+
+ memcpy(&boot_idt[vector], &entry, sizeof(entry));
+}
+
+/* Have this here so we don't need to include <asm/desc.h> */
+static void load_boot_idt(const struct desc_ptr *dtr)
+{
+ asm volatile("lidt %0"::"m" (*dtr));
+}
+
+/* Setup IDT before kernel jumping to .Lrelocated */
+void load_stage1_idt(void)
+{
+ boot_idt_desc.address = (unsigned long)boot_idt;
+
+ if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT))
+ set_idt_entry(X86_TRAP_VC, boot_stage1_vc);
+
+ load_boot_idt(&boot_idt_desc);
+}
+
+/*
+ * Setup IDT after kernel jumping to .Lrelocated.
+ *
+ * initialize_identity_maps() needs a #PF handler to be set up
+ * in order to be able to fault-in identity mapping ranges; see
+ * do_boot_page_fault().
+ *
+ * This #PF handler setup needs to happen in load_stage2_idt() where the
+ * IDT is loaded, and the #VC IDT entry gets set up there too.
+ *
+ * In order to be able to handle #VCs, one needs a GHCB which
+ * gets setup with an already set up pagetable, which is done in
+ * initialize_identity_maps(). And there's the catch-22: the boot #VC
+ * handler do_boot_stage2_vc() needs to call early_setup_ghcb() itself
+ * (and, especially, set_page_decrypted()) because the SEV-ES setup code
+ * cannot initialize a GHCB as there's no #PF handler yet...
+ */
+void load_stage2_idt(void)
+{
+ boot_idt_desc.address = (unsigned long)boot_idt;
+
+ set_idt_entry(X86_TRAP_PF, boot_page_fault);
+ set_idt_entry(X86_TRAP_NMI, boot_nmi_trap);
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ /*
+ * Clear the second stage #VC handler in case guest types
+ * needing #VC have not been detected.
+ */
+ if (sev_status & BIT(1))
+ set_idt_entry(X86_TRAP_VC, boot_stage2_vc);
+ else
+ set_idt_entry(X86_TRAP_VC, NULL);
+#endif
+
+ load_boot_idt(&boot_idt_desc);
+}
+
+void cleanup_exception_handling(void)
+{
+ /*
+ * Flush GHCB from cache and map it encrypted again when running as
+ * SEV-ES guest.
+ */
+ sev_es_shutdown_ghcb();
+
+ /* Set a null-idt, disabling #PF and #VC handling */
+ boot_idt_desc.size = 0;
+ boot_idt_desc.address = 0;
+ load_boot_idt(&boot_idt_desc);
+}
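
Taken together with head_64.S and misc.c, the intended life cycle of these helpers is roughly the following — a sketch of the call order, not literal kernel code:

static void sketch_idt_lifecycle(void)
{
	load_stage1_idt();	/* #VC only; before any CPUID-induced #VC */
	/* ... sev_enable() ...                                          */
	load_stage2_idt();	/* adds #PF + NMI so maps can fault in    */
	/* ... initialize_identity_maps(), extract_kernel() ...          */
	cleanup_exception_handling();	/* null IDT before kernel jump   */
}
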
diff --git a/arch/x86/boot/compressed/idt_handlers_64.S b/arch/x86/boot/compressed/idt_handlers_64.S
new file mode 100644
index 000000000000..4d03c8562f63
--- /dev/null
+++ b/arch/x86/boot/compressed/idt_handlers_64.S
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Early IDT handler entry points
+ *
+ * Copyright (C) 2019 SUSE
+ *
+ * Author: Joerg Roedel <jroedel@suse.de>
+ */
+
+#include <asm/segment.h>
+
+/* For ORIG_RAX */
+#include "../../entry/calling.h"
+
+.macro EXCEPTION_HANDLER name function error_code=0
+SYM_FUNC_START(\name)
+
+ /* Build pt_regs */
+ .if \error_code == 0
+ pushq $0
+ .endif
+
+ pushq %rdi
+ pushq %rsi
+ pushq %rdx
+ pushq %rcx
+ pushq %rax
+ pushq %r8
+ pushq %r9
+ pushq %r10
+ pushq %r11
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ /* Call handler with pt_regs */
+ movq %rsp, %rdi
+ /* Error code is second parameter */
+ movq ORIG_RAX(%rsp), %rsi
+ call \function
+
+ /* Restore regs */
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbp
+ popq %rbx
+ popq %r11
+ popq %r10
+ popq %r9
+ popq %r8
+ popq %rax
+ popq %rcx
+ popq %rdx
+ popq %rsi
+ popq %rdi
+
+ /* Remove error code and return */
+ addq $8, %rsp
+
+ iretq
+SYM_FUNC_END(\name)
+ .endm
+
+ .text
+ .code64
+
+EXCEPTION_HANDLER boot_page_fault do_boot_page_fault error_code=1
+EXCEPTION_HANDLER boot_nmi_trap do_boot_nmi_trap error_code=0
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+EXCEPTION_HANDLER boot_stage1_vc do_vc_no_ghcb error_code=1
+EXCEPTION_HANDLER boot_stage2_vc do_boot_stage2_vc error_code=1
+#endif
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
new file mode 100644
index 000000000000..3b0948ad449f
--- /dev/null
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -0,0 +1,908 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * kaslr.c
+ *
+ * This contains the routines needed to generate a reasonable level of
+ * entropy to choose a randomized kernel base address offset in support
+ * of Kernel Address Space Layout Randomization (KASLR). Additionally
+ * handles walking the physical memory maps (and tracking memory regions
+ * to avoid) in order to select a physical memory location that can
+ * contain the entire properly aligned running kernel image.
+ *
+ */
+
+/*
+ * next_arg() expects isspace() from linux/ctype.h to filter out
+ * space/LF/tab characters. boot/ctype.h conflicts with linux/ctype.h,
+ * since isdigit() is implemented in both of them, so disable
+ * boot/ctype.h here.
+ */
+#define BOOT_CTYPE_H
+
+#include "misc.h"
+#include "error.h"
+#include "../string.h"
+#include "efi.h"
+
+#include <generated/compile.h>
+#include <generated/utsversion.h>
+#include <generated/utsrelease.h>
+
+#define _SETUP
+#include <asm/setup.h> /* For COMMAND_LINE_SIZE */
+#undef _SETUP
+
+extern unsigned long get_cmd_line_ptr(void);
+
+/* Simplified build-specific string for starting entropy. */
+static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@"
+ LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION;
+
+static unsigned long rotate_xor(unsigned long hash, const void *area,
+ size_t size)
+{
+ size_t i;
+ unsigned long *ptr = (unsigned long *)area;
+
+ for (i = 0; i < size / sizeof(hash); i++) {
+ /* Rotate by odd number of bits and XOR. */
+ hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7);
+ hash ^= ptr[i];
+ }
+
+ return hash;
+}
+
+/* Attempt to create a simple but unpredictable starting entropy. */
+static unsigned long get_boot_seed(void)
+{
+ unsigned long hash = 0;
+
+ hash = rotate_xor(hash, build_str, sizeof(build_str));
+ hash = rotate_xor(hash, boot_params_ptr, sizeof(*boot_params_ptr));
+
+ return hash;
+}
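
The mixing step is just a rotate-left by (word bits - 7) followed by an XOR. A standalone rendering of the same loop (userspace demo, 64-bit host assumed):

#include <stdio.h>
#include <stddef.h>

static unsigned long demo_rotate_xor(unsigned long hash,
				     const void *area, size_t size)
{
	const unsigned long *ptr = area;
	size_t i;

	for (i = 0; i < size / sizeof(hash); i++) {
		/* Rotate left by (bits - 7), i.e. right by 7, then XOR. */
		hash = (hash << (sizeof(hash) * 8 - 7)) | (hash >> 7);
		hash ^= ptr[i];
	}
	return hash;
}

int main(void)
{
	unsigned long words[2] = { 0x1234, 0x5678 };

	printf("%#lx\n", demo_rotate_xor(0, words, sizeof(words)));
	return 0;
}
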
+
+#define KASLR_COMPRESSED_BOOT
+#include "../../lib/kaslr.c"
+
+/* Only supporting at most 4 unusable memmap regions with kaslr */
+#define MAX_MEMMAP_REGIONS 4
+
+static bool memmap_too_large;
+
+/*
+ * Store memory limit: MAXMEM on 64-bit and KERNEL_IMAGE_SIZE on 32-bit.
+ * It may be reduced by "mem=nn[KMG]" or "memmap=nn[KMG]" command line options.
+ */
+static u64 mem_limit;
+
+/* Number of immovable memory regions */
+static int num_immovable_mem;
+
+enum mem_avoid_index {
+ MEM_AVOID_ZO_RANGE = 0,
+ MEM_AVOID_INITRD,
+ MEM_AVOID_CMDLINE,
+ MEM_AVOID_BOOTPARAMS,
+ MEM_AVOID_MEMMAP_BEGIN,
+ MEM_AVOID_MEMMAP_END = MEM_AVOID_MEMMAP_BEGIN + MAX_MEMMAP_REGIONS - 1,
+ MEM_AVOID_MAX,
+};
+
+static struct mem_vector mem_avoid[MEM_AVOID_MAX];
+
+static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two)
+{
+ /* Item one is entirely before item two. */
+ if (one->start + one->size <= two->start)
+ return false;
+ /* Item one is entirely after item two. */
+ if (one->start >= two->start + two->size)
+ return false;
+ return true;
+}
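
The half-open-interval logic is easy to sanity-check in isolation — the same predicate, restated as a standalone demo with made-up ranges:

#include <assert.h>
#include <stdbool.h>

struct range { unsigned long start, size; };

static bool ranges_overlap(struct range a, struct range b)
{
	return a.start + a.size > b.start && a.start < b.start + b.size;
}

int main(void)
{
	/* [0x100, 0x300) vs [0x2c0, 0x400): overlap */
	assert(ranges_overlap((struct range){ 0x100, 0x200 },
			      (struct range){ 0x2c0, 0x140 }));
	/* [0x100, 0x300) vs [0x300, 0x400): adjacent, no overlap */
	assert(!ranges_overlap((struct range){ 0x100, 0x200 },
			       (struct range){ 0x300, 0x100 }));
	return 0;
}
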
+
+char *skip_spaces(const char *str)
+{
+ while (isspace(*str))
+ ++str;
+ return (char *)str;
+}
+#include "../../../../lib/ctype.c"
+#include "../../../../lib/cmdline.c"
+
+static int
+parse_memmap(char *p, u64 *start, u64 *size)
+{
+ char *oldp;
+
+ if (!p)
+ return -EINVAL;
+
+ /* We don't care about this option here */
+ if (!strncmp(p, "exactmap", 8))
+ return -EINVAL;
+
+ oldp = p;
+ *size = memparse(p, &p);
+ if (p == oldp)
+ return -EINVAL;
+
+ switch (*p) {
+ case '#':
+ case '$':
+ case '!':
+ *start = memparse(p + 1, &p);
+ return 0;
+ case '@':
+ /*
+ * memmap=nn@ss specifies a usable region and
+ * should be skipped
+ */
+ *size = 0;
+ fallthrough;
+ default:
+ /*
+ * If no offset is given and only a size is specified,
+ * memmap=nn[KMG] behaves the same as mem=nn[KMG]: it limits
+ * the max address the system can use, and the region above
+ * the limit should be avoided.
+ */
+ *start = 0;
+ return 0;
+ }
+
+ return -EINVAL;
+}
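
To make the suffix handling concrete: '#', '$' and '!' mark ACPI, reserved and protected ranges. A standalone sketch of how one "memmap=512M$0x1000000" token decomposes, using strtoull() in place of the kernel's memparse():

#include <stdio.h>
#include <stdlib.h>

/* Minimal stand-in for memparse(): a size with optional K/M/G suffix. */
static unsigned long long parse_size(const char *s, char **end)
{
	unsigned long long v = strtoull(s, end, 0);

	switch (**end) {
	case 'G': v <<= 10; /* fall through */
	case 'M': v <<= 10; /* fall through */
	case 'K': v <<= 10; (*end)++; break;
	}
	return v;
}

int main(void)
{
	const char *arg = "512M$0x1000000";	/* reserve 512M at 16M */
	char *p;
	unsigned long long size = parse_size(arg, &p);
	unsigned long long start = (*p == '$' || *p == '#' || *p == '!')
				   ? parse_size(p + 1, &p) : 0;

	printf("start=%#llx size=%#llx\n", start, size);
	return 0;
}
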
+
+static void mem_avoid_memmap(char *str)
+{
+ static int i;
+
+ if (i >= MAX_MEMMAP_REGIONS)
+ return;
+
+ while (str && (i < MAX_MEMMAP_REGIONS)) {
+ int rc;
+ u64 start, size;
+ char *k = strchr(str, ',');
+
+ if (k)
+ *k++ = 0;
+
+ rc = parse_memmap(str, &start, &size);
+ if (rc < 0)
+ break;
+ str = k;
+
+ if (start == 0) {
+ /* Store the specified memory limit if size > 0 */
+ if (size > 0 && size < mem_limit)
+ mem_limit = size;
+
+ continue;
+ }
+
+ mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].start = start;
+ mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].size = size;
+ i++;
+ }
+
+ /* More than 4 memmaps, fail kaslr */
+ if ((i >= MAX_MEMMAP_REGIONS) && str)
+ memmap_too_large = true;
+}
+
+/* Store the number of 1GB huge pages which users specified: */
+static unsigned long max_gb_huge_pages;
+
+static void parse_gb_huge_pages(char *param, char *val)
+{
+ static bool gbpage_sz;
+ char *p;
+
+ if (!strcmp(param, "hugepagesz")) {
+ p = val;
+ if (memparse(p, &p) != PUD_SIZE) {
+ gbpage_sz = false;
+ return;
+ }
+
+ if (gbpage_sz)
+ warn("Repeatedly set hugeTLB page size of 1G!\n");
+ gbpage_sz = true;
+ return;
+ }
+
+ if (!strcmp(param, "hugepages") && gbpage_sz) {
+ p = val;
+ max_gb_huge_pages = simple_strtoull(p, &p, 0);
+ return;
+ }
+}
+
+static void handle_mem_options(void)
+{
+ char *args = (char *)get_cmd_line_ptr();
+ size_t len;
+ char *tmp_cmdline;
+ char *param, *val;
+ u64 mem_size;
+
+ if (!args)
+ return;
+
+ len = strnlen(args, COMMAND_LINE_SIZE-1);
+ tmp_cmdline = malloc(len + 1);
+ if (!tmp_cmdline)
+ error("Failed to allocate space for tmp_cmdline");
+
+ memcpy(tmp_cmdline, args, len);
+ tmp_cmdline[len] = 0;
+ args = tmp_cmdline;
+
+ /* Chew leading spaces */
+ args = skip_spaces(args);
+
+ while (*args) {
+ args = next_arg(args, &param, &val);
+ /* Stop at -- */
+ if (!val && strcmp(param, "--") == 0)
+ break;
+
+ if (!strcmp(param, "memmap")) {
+ mem_avoid_memmap(val);
+ } else if (IS_ENABLED(CONFIG_X86_64) && strstr(param, "hugepages")) {
+ parse_gb_huge_pages(param, val);
+ } else if (!strcmp(param, "mem")) {
+ char *p = val;
+
+ if (!strcmp(p, "nopentium"))
+ continue;
+ mem_size = memparse(p, &p);
+ if (mem_size == 0)
+ break;
+
+ if (mem_size < mem_limit)
+ mem_limit = mem_size;
+ }
+ }
+
+ free(tmp_cmdline);
+ return;
+}
+
+/*
+ * In theory, KASLR can put the kernel anywhere in the range of [16M, MAXMEM)
+ * on 64-bit, and [16M, KERNEL_IMAGE_SIZE) on 32-bit.
+ *
+ * The mem_avoid array is used to store the ranges that need to be avoided
+ * when KASLR searches for an appropriate random address. We must avoid any
+ * regions that are unsafe to overlap with during decompression, and other
+ * things like the initrd, cmdline and boot_params. This comment seeks to
+ * explain mem_avoid as clearly as possible since incorrect mem_avoid
+ * memory ranges lead to really hard to debug boot failures.
+ *
+ * The initrd, cmdline, and boot_params are trivial to identify for
+ * avoiding. They are MEM_AVOID_INITRD, MEM_AVOID_CMDLINE, and
+ * MEM_AVOID_BOOTPARAMS respectively below.
+ *
+ * What is less obvious is how to avoid the range of memory that is used
+ * during decompression (MEM_AVOID_ZO_RANGE below). This range must cover
+ * the compressed kernel (ZO) and its run space, which is used to extract
+ * the uncompressed kernel (VO) and relocs.
+ *
+ * ZO's full run size sits against the end of the decompression buffer, so
+ * we can calculate where text, data, bss, etc of ZO are positioned more
+ * easily.
+ *
+ * For additional background, the decompression calculations can be found
+ * in header.S, and the memory diagram is based on the one found in misc.c.
+ *
+ * The following conditions are already enforced by the image layouts and
+ * associated code:
+ * - input + input_size >= output + output_size
+ * - kernel_total_size <= init_size
+ * - kernel_total_size <= output_size (see Note below)
+ * - output + init_size >= output + output_size
+ *
+ * (Note that kernel_total_size and output_size have no fundamental
+ * relationship, but output_size is passed to choose_random_location
+ * as a maximum of the two. The diagram is showing a case where
+ * kernel_total_size is larger than output_size, but this case is
+ * handled by bumping output_size.)
+ *
+ * The above conditions can be illustrated by a diagram:
+ *
+ * 0 output input input+input_size output+init_size
+ * | | | | |
+ * | | | | |
+ * |-----|--------|--------|--------------|-----------|--|-------------|
+ * | | |
+ * | | |
+ * output+init_size-ZO_INIT_SIZE output+output_size output+kernel_total_size
+ *
+ * [output, output+init_size) is the entire memory range used for
+ * extracting the compressed image.
+ *
+ * [output, output+kernel_total_size) is the range needed for the
+ * uncompressed kernel (VO) and its run size (bss, brk, etc).
+ *
+ * [output, output+output_size) is VO plus relocs (i.e. the entire
+ * uncompressed payload contained by ZO). This is the area of the buffer
+ * written to during decompression.
+ *
+ * [output+init_size-ZO_INIT_SIZE, output+init_size) is the worst-case
+ * range of the copied ZO and decompression code. (i.e. the range
+ * covered backwards of size ZO_INIT_SIZE, starting from output+init_size.)
+ *
+ * [input, input+input_size) is the original copied compressed image (ZO)
+ * (i.e. it does not include its run size). This range must be avoided
+ * because it contains the data used for decompression.
+ *
+ * [input+input_size, output+init_size) is [_text, _end) for ZO. This
+ * range includes ZO's heap and stack, and must be avoided since it
+ * performs the decompression.
+ *
+ * Since the above two ranges need to be avoided and they are adjacent,
+ * they can be merged, resulting in: [input, output+init_size) which
+ * becomes the MEM_AVOID_ZO_RANGE below.
+ */
+static void mem_avoid_init(unsigned long input, unsigned long input_size,
+ unsigned long output)
+{
+ unsigned long init_size = boot_params_ptr->hdr.init_size;
+ u64 initrd_start, initrd_size;
+ unsigned long cmd_line, cmd_line_size;
+
+ /*
+ * Avoid the region that is unsafe to overlap during
+ * decompression.
+ */
+ mem_avoid[MEM_AVOID_ZO_RANGE].start = input;
+ mem_avoid[MEM_AVOID_ZO_RANGE].size = (output + init_size) - input;
+
+ /* Avoid initrd. */
+ initrd_start = (u64)boot_params_ptr->ext_ramdisk_image << 32;
+ initrd_start |= boot_params_ptr->hdr.ramdisk_image;
+ initrd_size = (u64)boot_params_ptr->ext_ramdisk_size << 32;
+ initrd_size |= boot_params_ptr->hdr.ramdisk_size;
+ mem_avoid[MEM_AVOID_INITRD].start = initrd_start;
+ mem_avoid[MEM_AVOID_INITRD].size = initrd_size;
+ /* No need to set mapping for initrd, it will be handled in VO. */
+
+ /* Avoid kernel command line. */
+ cmd_line = get_cmd_line_ptr();
+ /* Calculate size of cmd_line. */
+ if (cmd_line) {
+ cmd_line_size = strnlen((char *)cmd_line, COMMAND_LINE_SIZE-1) + 1;
+ mem_avoid[MEM_AVOID_CMDLINE].start = cmd_line;
+ mem_avoid[MEM_AVOID_CMDLINE].size = cmd_line_size;
+ }
+
+ /* Avoid boot parameters. */
+ mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params_ptr;
+ mem_avoid[MEM_AVOID_BOOTPARAMS].size = sizeof(*boot_params_ptr);
+
+ /* We don't need to set a mapping for setup_data. */
+
+ /* Mark the memmap regions we need to avoid */
+ handle_mem_options();
+
+ /* Enumerate the immovable memory regions */
+ num_immovable_mem = count_immovable_mem_regions();
+}
+
+/*
+ * Does this memory vector overlap a known avoided area? If so, record the
+ * overlap region with the lowest address.
+ */
+static bool mem_avoid_overlap(struct mem_vector *img,
+ struct mem_vector *overlap)
+{
+ int i;
+ struct setup_data *ptr;
+ u64 earliest = img->start + img->size;
+ bool is_overlapping = false;
+
+ for (i = 0; i < MEM_AVOID_MAX; i++) {
+ if (mem_overlaps(img, &mem_avoid[i]) &&
+ mem_avoid[i].start < earliest) {
+ *overlap = mem_avoid[i];
+ earliest = overlap->start;
+ is_overlapping = true;
+ }
+ }
+
+ /* Avoid all entries in the setup_data linked list. */
+ ptr = (struct setup_data *)(unsigned long)boot_params_ptr->hdr.setup_data;
+ while (ptr) {
+ struct mem_vector avoid;
+
+ avoid.start = (unsigned long)ptr;
+ avoid.size = sizeof(*ptr) + ptr->len;
+
+ if (mem_overlaps(img, &avoid) && (avoid.start < earliest)) {
+ *overlap = avoid;
+ earliest = overlap->start;
+ is_overlapping = true;
+ }
+
+ if (ptr->type == SETUP_INDIRECT &&
+ ((struct setup_indirect *)ptr->data)->type != SETUP_INDIRECT) {
+ avoid.start = ((struct setup_indirect *)ptr->data)->addr;
+ avoid.size = ((struct setup_indirect *)ptr->data)->len;
+
+ if (mem_overlaps(img, &avoid) && (avoid.start < earliest)) {
+ *overlap = avoid;
+ earliest = overlap->start;
+ is_overlapping = true;
+ }
+ }
+
+ ptr = (struct setup_data *)(unsigned long)ptr->next;
+ }
+
+ return is_overlapping;
+}
+
+struct slot_area {
+ u64 addr;
+ unsigned long num;
+};
+
+#define MAX_SLOT_AREA 100
+
+static struct slot_area slot_areas[MAX_SLOT_AREA];
+static unsigned int slot_area_index;
+static unsigned long slot_max;
+
+static void store_slot_info(struct mem_vector *region, unsigned long image_size)
+{
+ struct slot_area slot_area;
+
+ if (slot_area_index == MAX_SLOT_AREA)
+ return;
+
+ slot_area.addr = region->start;
+ slot_area.num = 1 + (region->size - image_size) / CONFIG_PHYSICAL_ALIGN;
+
+ slot_areas[slot_area_index++] = slot_area;
+ slot_max += slot_area.num;
+}
+
+/*
+ * Skip as many 1GB huge pages as possible in the passed region
+ * according to the number which users specified:
+ */
+static void
+process_gb_huge_pages(struct mem_vector *region, unsigned long image_size)
+{
+ u64 pud_start, pud_end;
+ unsigned long gb_huge_pages;
+ struct mem_vector tmp;
+
+ if (!IS_ENABLED(CONFIG_X86_64) || !max_gb_huge_pages) {
+ store_slot_info(region, image_size);
+ return;
+ }
+
+ /* Are there any 1GB pages in the region? */
+ pud_start = ALIGN(region->start, PUD_SIZE);
+ pud_end = ALIGN_DOWN(region->start + region->size, PUD_SIZE);
+
+ /* No good 1GB huge pages found: */
+ if (pud_start >= pud_end) {
+ store_slot_info(region, image_size);
+ return;
+ }
+
+ /* Check if the head part of the region is usable. */
+ if (pud_start >= region->start + image_size) {
+ tmp.start = region->start;
+ tmp.size = pud_start - region->start;
+ store_slot_info(&tmp, image_size);
+ }
+
+ /* Skip the good 1GB pages. */
+ gb_huge_pages = (pud_end - pud_start) >> PUD_SHIFT;
+ if (gb_huge_pages > max_gb_huge_pages) {
+ pud_end = pud_start + (max_gb_huge_pages << PUD_SHIFT);
+ max_gb_huge_pages = 0;
+ } else {
+ max_gb_huge_pages -= gb_huge_pages;
+ }
+
+ /* Check if the tail part of the region is usable. */
+ if (region->start + region->size >= pud_end + image_size) {
+ tmp.start = pud_end;
+ tmp.size = region->start + region->size - pud_end;
+ store_slot_info(&tmp, image_size);
+ }
+}
+
+static u64 slots_fetch_random(void)
+{
+ unsigned long slot;
+ unsigned int i;
+
+ /* Handle case of no slots stored. */
+ if (slot_max == 0)
+ return 0;
+
+ slot = kaslr_get_random_long("Physical") % slot_max;
+
+ for (i = 0; i < slot_area_index; i++) {
+ if (slot >= slot_areas[i].num) {
+ slot -= slot_areas[i].num;
+ continue;
+ }
+ return slot_areas[i].addr + ((u64)slot * CONFIG_PHYSICAL_ALIGN);
+ }
+
+ if (i == slot_area_index)
+ debug_putstr("slots_fetch_random() failed!?\n");
+ return 0;
+}
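
The selection is one uniform draw over slot_max followed by a walk to find which area owns that index. A standalone sketch with fixed example areas — rand() here is purely a stand-in for kaslr_get_random_long():

#include <stdio.h>
#include <stdlib.h>

#define ALIGN_STEP 0x200000UL	/* stand-in for CONFIG_PHYSICAL_ALIGN */

struct area { unsigned long addr, num; };

static unsigned long pick_slot(struct area *areas, int n,
			       unsigned long slot_max)
{
	unsigned long slot = (unsigned long)rand() % slot_max;

	for (int i = 0; i < n; i++) {
		if (slot >= areas[i].num) {
			slot -= areas[i].num;	/* skip this whole area */
			continue;
		}
		return areas[i].addr + slot * ALIGN_STEP;
	}
	return 0;
}

int main(void)
{
	struct area areas[] = { { 0x1000000, 8 }, { 0x40000000, 24 } };

	printf("%#lx\n", pick_slot(areas, 2, 8 + 24));
	return 0;
}
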
+
+static void __process_mem_region(struct mem_vector *entry,
+ unsigned long minimum,
+ unsigned long image_size)
+{
+ struct mem_vector region, overlap;
+ u64 region_end;
+
+ /* Enforce minimum and memory limit. */
+ region.start = max_t(u64, entry->start, minimum);
+ region_end = min(entry->start + entry->size, mem_limit);
+
+ /* Give up if slot area array is full. */
+ while (slot_area_index < MAX_SLOT_AREA) {
+ /* Potentially raise address to meet alignment needs. */
+ region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN);
+
+ /* Did we raise the address above the passed in memory entry? */
+ if (region.start > region_end)
+ return;
+
+ /* Reduce size by any delta from the original address. */
+ region.size = region_end - region.start;
+
+ /* Return if region can't contain decompressed kernel */
+ if (region.size < image_size)
+ return;
+
+ /* If nothing overlaps, store the region and return. */
+ if (!mem_avoid_overlap(&region, &overlap)) {
+ process_gb_huge_pages(&region, image_size);
+ return;
+ }
+
+ /* Store beginning of region if holds at least image_size. */
+ if (overlap.start >= region.start + image_size) {
+ region.size = overlap.start - region.start;
+ process_gb_huge_pages(&region, image_size);
+ }
+
+ /* Clip off the overlapping region and start over. */
+ region.start = overlap.start + overlap.size;
+ }
+}
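
One concrete pass through this loop, with made-up numbers, shows how an avoided range splits a region into two slot areas:

/*
 * Example (hypothetical values): region = [256M, 1280M),
 * image_size = 64M, one avoid entry = [512M, 640M).
 *
 * Iteration 1: overlap found at 512M; the head [256M, 512M) holds at
 *              least 64M, so it is stored via process_gb_huge_pages().
 *              The region is then clipped: region.start = 640M.
 * Iteration 2: no overlap remains; [640M, 1280M) is stored and the
 *              function returns.
 */
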
+
+static bool process_mem_region(struct mem_vector *region,
+ unsigned long minimum,
+ unsigned long image_size)
+{
+ int i;
+ /*
+ * If no immovable memory found, or MEMORY_HOTREMOVE disabled,
+ * use @region directly.
+ */
+ if (!num_immovable_mem) {
+ __process_mem_region(region, minimum, image_size);
+
+ if (slot_area_index == MAX_SLOT_AREA) {
+ debug_putstr("Aborted e820/efi memmap scan (slot_areas full)!\n");
+ return true;
+ }
+ return false;
+ }
+
+#if defined(CONFIG_MEMORY_HOTREMOVE) && defined(CONFIG_ACPI)
+ /*
+ * If immovable memory found, filter the intersection between
+ * immovable memory and @region.
+ */
+ for (i = 0; i < num_immovable_mem; i++) {
+ u64 start, end, entry_end, region_end;
+ struct mem_vector entry;
+
+ if (!mem_overlaps(region, &immovable_mem[i]))
+ continue;
+
+ start = immovable_mem[i].start;
+ end = start + immovable_mem[i].size;
+ region_end = region->start + region->size;
+
+ entry.start = clamp(region->start, start, end);
+ entry_end = clamp(region_end, start, end);
+ entry.size = entry_end - entry.start;
+
+ __process_mem_region(&entry, minimum, image_size);
+
+ if (slot_area_index == MAX_SLOT_AREA) {
+ debug_putstr("Aborted e820/efi memmap scan when walking immovable regions(slot_areas full)!\n");
+ return true;
+ }
+ }
+#endif
+ return false;
+}
+
+#ifdef CONFIG_EFI
+
+/*
+ * Only EFI_CONVENTIONAL_MEMORY and EFI_UNACCEPTED_MEMORY (if supported) are
+ * guaranteed to be free.
+ *
+ * Pick free memory more conservatively than the EFI spec allows: according to
+ * the spec, EFI_BOOT_SERVICES_{CODE|DATA} are also free memory and thus
+ * available to place the kernel image into, but in practice there's firmware
+ * where using that memory leads to crashes. Buggy vendor EFI code registers
+ * for an event that triggers on SetVirtualAddressMap(). The handler assumes
+ * that EFI_BOOT_SERVICES_DATA memory has not been touched by the loader, which
+ * is probably true for Windows.
+ *
+ * Preserve EFI_BOOT_SERVICES_* regions until after SetVirtualAddressMap().
+ */
+static inline bool memory_type_is_free(efi_memory_desc_t *md)
+{
+ if (md->type == EFI_CONVENTIONAL_MEMORY)
+ return true;
+
+ if (IS_ENABLED(CONFIG_UNACCEPTED_MEMORY) &&
+ md->type == EFI_UNACCEPTED_MEMORY)
+ return true;
+
+ return false;
+}
+
+/*
+ * Returns true if we processed the EFI memmap, which we prefer over the E820
+ * table if it is available.
+ */
+static bool
+process_efi_entries(unsigned long minimum, unsigned long image_size)
+{
+ struct efi_info *e = &boot_params_ptr->efi_info;
+ bool efi_mirror_found = false;
+ struct mem_vector region;
+ efi_memory_desc_t *md;
+ unsigned long pmap;
+ char *signature;
+ u32 nr_desc;
+ int i;
+
+ signature = (char *)&e->efi_loader_signature;
+ if (strncmp(signature, EFI32_LOADER_SIGNATURE, 4) &&
+ strncmp(signature, EFI64_LOADER_SIGNATURE, 4))
+ return false;
+
+#ifdef CONFIG_X86_32
+ /* Can't handle data above 4GB at this time */
+ if (e->efi_memmap_hi) {
+ warn("EFI memmap is above 4GB, can't be handled now on x86_32. EFI should be disabled.\n");
+ return false;
+ }
+ pmap = e->efi_memmap;
+#else
+ pmap = (e->efi_memmap | ((__u64)e->efi_memmap_hi << 32));
+#endif
+
+ nr_desc = e->efi_memmap_size / e->efi_memdesc_size;
+ for (i = 0; i < nr_desc; i++) {
+ md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i);
+ if (md->attribute & EFI_MEMORY_MORE_RELIABLE) {
+ efi_mirror_found = true;
+ break;
+ }
+ }
+
+ for (i = 0; i < nr_desc; i++) {
+ md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i);
+
+ if (!memory_type_is_free(md))
+ continue;
+
+ if (efi_soft_reserve_enabled() &&
+ (md->attribute & EFI_MEMORY_SP))
+ continue;
+
+ if (efi_mirror_found &&
+ !(md->attribute & EFI_MEMORY_MORE_RELIABLE))
+ continue;
+
+ region.start = md->phys_addr;
+ region.size = md->num_pages << EFI_PAGE_SHIFT;
+ if (process_mem_region(&region, minimum, image_size))
+ break;
+ }
+ return true;
+}
+#else
+static inline bool
+process_efi_entries(unsigned long minimum, unsigned long image_size)
+{
+ return false;
+}
+#endif
+
+static void process_e820_entries(unsigned long minimum,
+ unsigned long image_size)
+{
+ int i;
+ struct mem_vector region;
+ struct boot_e820_entry *entry;
+
+ /* Verify potential e820 positions, appending to slots list. */
+ for (i = 0; i < boot_params_ptr->e820_entries; i++) {
+ entry = &boot_params_ptr->e820_table[i];
+ /* Skip non-RAM entries. */
+ if (entry->type != E820_TYPE_RAM)
+ continue;
+ region.start = entry->addr;
+ region.size = entry->size;
+ if (process_mem_region(&region, minimum, image_size))
+ break;
+ }
+}
+
+/*
+ * If KHO is active, only process its scratch areas to ensure we are not
+ * stepping onto preserved memory.
+ */
+static bool process_kho_entries(unsigned long minimum, unsigned long image_size)
+{
+ struct kho_scratch *kho_scratch;
+ struct setup_data *ptr;
+ struct kho_data *kho;
+ int i, nr_areas = 0;
+
+ if (!IS_ENABLED(CONFIG_KEXEC_HANDOVER))
+ return false;
+
+ ptr = (struct setup_data *)(unsigned long)boot_params_ptr->hdr.setup_data;
+ while (ptr) {
+ if (ptr->type == SETUP_KEXEC_KHO) {
+ kho = (struct kho_data *)(unsigned long)ptr->data;
+ kho_scratch = (void *)(unsigned long)kho->scratch_addr;
+ nr_areas = kho->scratch_size / sizeof(*kho_scratch);
+ break;
+ }
+
+ ptr = (struct setup_data *)(unsigned long)ptr->next;
+ }
+
+ if (!nr_areas)
+ return false;
+
+ for (i = 0; i < nr_areas; i++) {
+ struct kho_scratch *area = &kho_scratch[i];
+ struct mem_vector region = {
+ .start = area->addr,
+ .size = area->size,
+ };
+
+ if (process_mem_region(&region, minimum, image_size))
+ break;
+ }
+
+ return true;
+}
+
+static unsigned long find_random_phys_addr(unsigned long minimum,
+ unsigned long image_size)
+{
+ u64 phys_addr;
+
+ /* Bail out early if it's impossible to succeed. */
+ if (minimum + image_size > mem_limit)
+ return 0;
+
+ /* Check if we had too many memmaps. */
+ if (memmap_too_large) {
+ debug_putstr("Aborted memory entries scan (more than 4 memmap= args)!\n");
+ return 0;
+ }
+
+ /*
+ * During kexec handover only process KHO scratch areas that are known
+ * not to contain any data that must be preserved.
+ */
+ if (!process_kho_entries(minimum, image_size) &&
+ !process_efi_entries(minimum, image_size))
+ process_e820_entries(minimum, image_size);
+
+ phys_addr = slots_fetch_random();
+
+ /* Perform a final check to make sure the address is in range. */
+ if (phys_addr < minimum || phys_addr + image_size > mem_limit) {
+ warn("Invalid physical address chosen!\n");
+ return 0;
+ }
+
+ return (unsigned long)phys_addr;
+}
+
+static unsigned long find_random_virt_addr(unsigned long minimum,
+ unsigned long image_size)
+{
+ unsigned long slots, random_addr;
+
+ /*
+ * How many CONFIG_PHYSICAL_ALIGN-sized slots can hold image_size
+ * within the range from minimum to KERNEL_IMAGE_SIZE?
+ */
+ slots = 1 + (KERNEL_IMAGE_SIZE - minimum - image_size) / CONFIG_PHYSICAL_ALIGN;
+
+ random_addr = kaslr_get_random_long("Virtual") % slots;
+
+ return random_addr * CONFIG_PHYSICAL_ALIGN + minimum;
+}
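
With the usual x86_64 defaults (assumed here: KERNEL_IMAGE_SIZE = 1G, CONFIG_PHYSICAL_ALIGN = 2M), the slot arithmetic works out as:

/*
 * For minimum = 16M (LOAD_PHYSICAL_ADDR) and image_size = 32M:
 *
 *   slots = 1 + (1024M - 16M - 32M) / 2M = 1 + 488 = 489
 *
 * i.e. 489 candidate 2M-aligned virtual offsets to draw from.
 */
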
+
+/*
+ * Since this function operates on addresses numerically,
+ * it takes the input and output pointers as 'unsigned long'.
+ */
+void choose_random_location(unsigned long input,
+ unsigned long input_size,
+ unsigned long *output,
+ unsigned long output_size,
+ unsigned long *virt_addr)
+{
+ unsigned long random_addr, min_addr;
+
+ if (cmdline_find_option_bool("nokaslr")) {
+ warn("KASLR disabled: 'nokaslr' on cmdline.");
+ return;
+ }
+
+ boot_params_ptr->hdr.loadflags |= KASLR_FLAG;
+
+ if (IS_ENABLED(CONFIG_X86_32))
+ mem_limit = KERNEL_IMAGE_SIZE;
+ else
+ mem_limit = MAXMEM;
+
+ /* Record the various known unsafe memory ranges. */
+ mem_avoid_init(input, input_size, *output);
+
+ /*
+ * Low end of the randomization range should be the
+ * smaller of 512M or the initial kernel image
+ * location:
+ */
+ min_addr = min(*output, 512UL << 20);
+ /* Make sure minimum is aligned. */
+ min_addr = ALIGN(min_addr, CONFIG_PHYSICAL_ALIGN);
+
+ /* Walk available memory entries to find a random address. */
+ random_addr = find_random_phys_addr(min_addr, output_size);
+ if (!random_addr) {
+ warn("Physical KASLR disabled: no suitable memory region!");
+ } else {
+ /* Update the new physical address location. */
+ if (*output != random_addr)
+ *output = random_addr;
+ }
+
+ /* Pick random virtual address starting from LOAD_PHYSICAL_ADDR. */
+ if (IS_ENABLED(CONFIG_X86_64))
+ random_addr = find_random_virt_addr(LOAD_PHYSICAL_ADDR, output_size);
+ *virt_addr = random_addr;
+}
diff --git a/arch/x86/boot/compressed/kernel_info.S b/arch/x86/boot/compressed/kernel_info.S
new file mode 100644
index 000000000000..f818ee8fba38
--- /dev/null
+++ b/arch/x86/boot/compressed/kernel_info.S
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <asm/bootparam.h>
+
+ .section ".rodata.kernel_info", "a"
+
+ .global kernel_info
+
+kernel_info:
+ /* Header, Linux top (structure). */
+ .ascii "LToP"
+ /* Size. */
+ .long kernel_info_var_len_data - kernel_info
+ /* Size total. */
+ .long kernel_info_end - kernel_info
+
+ /* Maximal allowed type for setup_data and setup_indirect structs. */
+ .long SETUP_TYPE_MAX
+
+kernel_info_var_len_data:
+ /* Empty for time being... */
+kernel_info_end:
diff --git a/arch/x86/boot/compressed/mem.c b/arch/x86/boot/compressed/mem.c
new file mode 100644
index 000000000000..0e9f84ab4bdc
--- /dev/null
+++ b/arch/x86/boot/compressed/mem.c
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "error.h"
+#include "misc.h"
+#include "tdx.h"
+#include "sev.h"
+#include <asm/shared/tdx.h>
+
+/*
+ * accept_memory() and process_unaccepted_memory() are called from the EFI
+ * stub, which runs before the decompressor and its early_tdx_detect().
+ *
+ * Enumerate TDX directly from the early users.
+ */
+static bool early_is_tdx_guest(void)
+{
+ static bool once;
+ static bool is_tdx;
+
+ if (!IS_ENABLED(CONFIG_INTEL_TDX_GUEST))
+ return false;
+
+ if (!once) {
+ u32 eax, sig[3];
+
+ cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax,
+ &sig[0], &sig[2], &sig[1]);
+ is_tdx = !memcmp(TDX_IDENT, sig, sizeof(sig));
+ once = true;
+ }
+
+ return is_tdx;
+}
+
+void arch_accept_memory(phys_addr_t start, phys_addr_t end)
+{
+ /* Platform-specific memory-acceptance call goes here */
+ if (early_is_tdx_guest()) {
+ if (!tdx_accept_memory(start, end))
+ panic("TDX: Failed to accept memory\n");
+ } else if (early_is_sevsnp_guest()) {
+ snp_accept_memory(start, end);
+ } else {
+ error("Cannot accept memory: unknown platform\n");
+ }
+}
+
+bool init_unaccepted_memory(void)
+{
+ guid_t guid = LINUX_EFI_UNACCEPTED_MEM_TABLE_GUID;
+ struct efi_unaccepted_memory *table;
+ unsigned long cfg_table_pa;
+ unsigned int cfg_table_len;
+ enum efi_type et;
+ int ret;
+
+ et = efi_get_type(boot_params_ptr);
+ if (et == EFI_TYPE_NONE)
+ return false;
+
+ ret = efi_get_conf_table(boot_params_ptr, &cfg_table_pa, &cfg_table_len);
+ if (ret) {
+ warn("EFI config table not found.");
+ return false;
+ }
+
+ table = (void *)efi_find_vendor_table(boot_params_ptr, cfg_table_pa,
+ cfg_table_len, guid);
+ if (!table)
+ return false;
+
+ if (table->version != 1)
+ error("Unknown version of unaccepted memory table\n");
+
+ /*
+ * In many cases unaccepted_table is already set by EFI stub, but it
+ * has to be initialized again to cover cases when the table is not
+ * allocated by EFI stub or EFI stub copied the kernel image with
+ * efi_relocate_kernel() before the variable is set.
+ *
+ * It must be initialized before the first usage of accept_memory().
+ */
+ unaccepted_table = table;
+
+ return true;
+}
diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S
new file mode 100644
index 000000000000..32f7cc8a8625
--- /dev/null
+++ b/arch/x86/boot/compressed/mem_encrypt.S
@@ -0,0 +1,324 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2017 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ */
+
+#include <linux/linkage.h>
+
+#include <asm/processor-flags.h>
+#include <asm/msr.h>
+#include <asm/asm-offsets.h>
+#include <asm/segment.h>
+#include <asm/trapnr.h>
+
+ .text
+ .code32
+SYM_FUNC_START(get_sev_encryption_bit)
+ push %ebx
+
+ movl $0x80000000, %eax /* CPUID to check the highest leaf */
+ cpuid
+ cmpl $0x8000001f, %eax /* See if 0x8000001f is available */
+ jb .Lno_sev
+
+ /*
+ * Check for the SEV feature:
+ * CPUID Fn8000_001F[EAX] - Bit 1
+ * CPUID Fn8000_001F[EBX] - Bits 5:0
+ * Pagetable bit position used to indicate encryption
+ */
+ movl $0x8000001f, %eax
+ cpuid
+ bt $1, %eax /* Check if SEV is available */
+ jnc .Lno_sev
+
+ movl $MSR_AMD64_SEV, %ecx /* Read the SEV MSR */
+ rdmsr
+ bt $MSR_AMD64_SEV_ENABLED_BIT, %eax /* Check if SEV is active */
+ jnc .Lno_sev
+
+ movl %ebx, %eax
+ andl $0x3f, %eax /* Return the encryption bit location */
+ jmp .Lsev_exit
+
+.Lno_sev:
+ xor %eax, %eax
+
+.Lsev_exit:
+ pop %ebx
+ RET
+SYM_FUNC_END(get_sev_encryption_bit)
+
+/**
+ * sev_es_req_cpuid - Request a CPUID value from the Hypervisor using
+ * the GHCB MSR protocol
+ *
+ * @%eax: Register to request (0=EAX, 1=EBX, 2=ECX, 3=EDX)
+ * @%edx: CPUID Function
+ *
+ * Returns 0 in %eax on success, non-zero on failure
+ * %edx returns CPUID value on success
+ */
+SYM_CODE_START_LOCAL(sev_es_req_cpuid)
+ shll $30, %eax
+ orl $0x00000004, %eax
+ movl $MSR_AMD64_SEV_ES_GHCB, %ecx
+ wrmsr
+ rep; vmmcall # VMGEXIT
+ rdmsr
+
+ /* Check response */
+ movl %eax, %ecx
+ andl $0x3ffff000, %ecx # Bits [12-29] MBZ
+ jnz 2f
+
+ /* Check return code */
+ andl $0xfff, %eax
+ cmpl $5, %eax
+ jne 2f
+
+ /* All good - return success */
+ xorl %eax, %eax
+1:
+ RET
+2:
+ movl $-1, %eax
+ jmp 1b
+SYM_CODE_END(sev_es_req_cpuid)
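
The MSR value built by the shll/orl pair encodes the GHCB MSR protocol's CPUID request. A C rendering of the same bit layout as used by this code (request code 4, response code 5):

#include <stdint.h>

#define GHCB_CPUID_REQ  0x004ULL	/* request code in bits 11:0 */
#define GHCB_CPUID_RESP 0x005ULL	/* response code             */

/* reg: 0=EAX 1=EBX 2=ECX 3=EDX, fn: CPUID function */
static uint64_t ghcb_cpuid_req(uint32_t reg, uint32_t fn)
{
	/*
	 * Low half (EAX): register index in bits 31:30, code in 11:0.
	 * High half (EDX): the CPUID function.
	 * e.g. ghcb_cpuid_req(0, 0x8000001f) requests CPUID[0x8000001f].EAX.
	 */
	return ((uint64_t)fn << 32) | ((uint64_t)reg << 30) | GHCB_CPUID_REQ;
}

static int ghcb_cpuid_resp_ok(uint64_t msr)
{
	if (msr & 0x3ffff000ULL)	/* bits 12-29 must be zero */
		return 0;
	return (msr & 0xfffULL) == GHCB_CPUID_RESP;
}
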
+
+SYM_CODE_START_LOCAL(startup32_vc_handler)
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+
+ /* Keep CPUID function in %ebx */
+ movl %eax, %ebx
+
+ /* Check if error-code == SVM_EXIT_CPUID */
+ cmpl $0x72, 16(%esp)
+ jne .Lfail
+
+ movl $0, %eax # Request CPUID[fn].EAX
+ movl %ebx, %edx # CPUID fn
+ call sev_es_req_cpuid # Call helper
+ testl %eax, %eax # Check return code
+ jnz .Lfail
+ movl %edx, 12(%esp) # Store result
+
+ movl $1, %eax # Request CPUID[fn].EBX
+ movl %ebx, %edx # CPUID fn
+ call sev_es_req_cpuid # Call helper
+ testl %eax, %eax # Check return code
+ jnz .Lfail
+ movl %edx, 8(%esp) # Store result
+
+ movl $2, %eax # Request CPUID[fn].ECX
+ movl %ebx, %edx # CPUID fn
+ call sev_es_req_cpuid # Call helper
+ testl %eax, %eax # Check return code
+ jnz .Lfail
+ movl %edx, 4(%esp) # Store result
+
+ movl $3, %eax # Request CPUID[fn].EDX
+ movl %ebx, %edx # CPUID fn
+ call sev_es_req_cpuid # Call helper
+ testl %eax, %eax # Check return code
+ jnz .Lfail
+ movl %edx, 0(%esp) # Store result
+
+ /*
+ * Sanity check CPUID results from the Hypervisor. See comment in
+ * do_vc_no_ghcb() for more details on why this is necessary.
+ */
+
+ /* Fail if SEV leaf not available in CPUID[0x80000000].EAX */
+ cmpl $0x80000000, %ebx
+ jne .Lcheck_sev
+ cmpl $0x8000001f, 12(%esp)
+ jb .Lfail
+ jmp .Ldone
+
+.Lcheck_sev:
+ /* Fail if SEV bit not set in CPUID[0x8000001f].EAX[1] */
+ cmpl $0x8000001f, %ebx
+ jne .Ldone
+ btl $1, 12(%esp)
+ jnc .Lfail
+
+.Ldone:
+ popl %edx
+ popl %ecx
+ popl %ebx
+ popl %eax
+
+ /* Remove error code */
+ addl $4, %esp
+
+ /* Jump over CPUID instruction */
+ addl $2, (%esp)
+
+ iret
+.Lfail:
+ /* Send terminate request to Hypervisor */
+ movl $0x100, %eax
+ xorl %edx, %edx
+ movl $MSR_AMD64_SEV_ES_GHCB, %ecx
+ wrmsr
+ rep; vmmcall
+
+ /* If request fails, go to hlt loop */
+ hlt
+ jmp .Lfail
+SYM_CODE_END(startup32_vc_handler)
+
+/*
+ * Write an IDT entry into boot32_idt
+ *
+ * Parameters:
+ *
+ * %eax: Handler address
+ * %edx: Vector number
+ * %ecx: IDT address
+ */
+SYM_FUNC_START_LOCAL(startup32_set_idt_entry)
+ /* IDT entry address to %ecx */
+ leal (%ecx, %edx, 8), %ecx
+
+ /* Build IDT entry, lower 4 bytes */
+ movl %eax, %edx
+ andl $0x0000ffff, %edx # Target code segment offset [15:0]
+ orl $(__KERNEL32_CS << 16), %edx # Target code segment selector
+
+ /* Store lower 4 bytes to IDT */
+ movl %edx, (%ecx)
+
+ /* Build IDT entry, upper 4 bytes */
+ movl %eax, %edx
+ andl $0xffff0000, %edx # Target code segment offset [31:16]
+ orl $0x00008e00, %edx # Present, Type 32-bit Interrupt Gate
+
+ /* Store upper 4 bytes to IDT */
+ movl %edx, 4(%ecx)
+
+ RET
+SYM_FUNC_END(startup32_set_idt_entry)
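
The two dwords written above form a classic 32-bit interrupt-gate descriptor. The same packing in C — a sketch, where 0x8e00 means present, DPL 0, 32-bit interrupt gate:

#include <stdint.h>

struct idt_gate32 {
	uint32_t lo;	/* selector[31:16] | offset[15:0] */
	uint32_t hi;	/* offset[31:16]   | flags | zero */
};

static struct idt_gate32 make_gate32(uint32_t handler, uint16_t cs)
{
	struct idt_gate32 g;

	g.lo = ((uint32_t)cs << 16) | (handler & 0xffff);
	g.hi = (handler & 0xffff0000) | 0x8e00;
	return g;
}
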
+
+SYM_FUNC_START(startup32_load_idt)
+ push %ebp
+ push %ebx
+
+ call 1f
+1: pop %ebp
+
+ leal (boot32_idt - 1b)(%ebp), %ebx
+
+ /* #VC handler */
+ leal (startup32_vc_handler - 1b)(%ebp), %eax
+ movl $X86_TRAP_VC, %edx
+ movl %ebx, %ecx
+ call startup32_set_idt_entry
+
+ /* Load IDT */
+ leal (boot32_idt_desc - 1b)(%ebp), %ecx
+ movl %ebx, 2(%ecx)
+ lidt (%ecx)
+
+ pop %ebx
+ pop %ebp
+ RET
+SYM_FUNC_END(startup32_load_idt)
+
+/*
+ * Check for the correct C-bit position when the startup_32 boot-path is used.
+ *
+ * The check makes use of the fact that all memory is encrypted when paging is
+ * disabled. The function creates 64 bits of random data using the RDRAND
+ * instruction. RDRAND is mandatory for SEV guests, so it is always available.
+ * If the hypervisor violates that, the kernel will crash right here.
+ *
+ * The 64 bits of random data are stored to a memory location and at the same
+ * time kept in the %eax and %ebx registers. Since encryption is always active
+ * when paging is off the random data will be stored encrypted in main memory.
+ *
+ * Then paging is enabled. When the C-bit position is correct all memory is
+ * still mapped encrypted and comparing the register values with memory will
+ * succeed. An incorrect C-bit position will map all memory unencrypted, so that
+ * the compare will use the encrypted random data and fail.
+ */
+SYM_FUNC_START(startup32_check_sev_cbit)
+ pushl %ebx
+ pushl %ebp
+
+ call 0f
+0: popl %ebp
+
+ /* Check for non-zero sev_status */
+ movl (sev_status - 0b)(%ebp), %eax
+ testl %eax, %eax
+ jz 4f
+
+ /*
+ * Get two 32-bit random values - Don't bail out if RDRAND fails
+ * because it is better to prevent forward progress if no random value
+ * can be gathered.
+ */
+1: rdrand %eax
+ jnc 1b
+2: rdrand %ebx
+ jnc 2b
+
+ /* Store to memory and keep it in the registers */
+ leal (sev_check_data - 0b)(%ebp), %ebp
+ movl %eax, 0(%ebp)
+ movl %ebx, 4(%ebp)
+
+ /* Enable paging to see if encryption is active */
+ movl %cr0, %edx /* Backup %cr0 in %edx */
+ movl $(X86_CR0_PG | X86_CR0_PE), %ecx /* Enable Paging and Protected mode */
+ movl %ecx, %cr0
+
+ cmpl %eax, 0(%ebp)
+ jne 3f
+ cmpl %ebx, 4(%ebp)
+ jne 3f
+
+ movl %edx, %cr0 /* Restore previous %cr0 */
+
+ jmp 4f
+
+3: /* Check failed - hlt the machine */
+ hlt
+ jmp 3b
+
+4:
+ popl %ebp
+ popl %ebx
+ RET
+SYM_FUNC_END(startup32_check_sev_cbit)
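
Reduced to C-like pseudocode, the C-bit check does the following — a sketch in which rdrand64(), paging_on()/paging_off() and the halt loop stand in for the instruction sequences above:

#include <stdint.h>
#include <stdbool.h>

extern uint64_t rdrand64(void);    /* loops until RDRAND succeeds  */
extern void paging_on(void);       /* sets CR0.PG with current CR3 */
extern void paging_off(void);      /* restores the saved CR0       */
extern volatile uint64_t sev_check_data;

static void sketch_check_sev_cbit(bool sev_active)
{
	uint64_t random;

	if (!sev_active)
		return;

	/* Paging is off: the store below lands encrypted in RAM. */
	random = rdrand64();
	sev_check_data = random;

	paging_on();
	/*
	 * Correct C-bit: the page is mapped encrypted and the read
	 * returns 'random'. Wrong C-bit: we read ciphertext instead.
	 */
	if (sev_check_data != random)
		for (;;) ;	/* halt: refuse to continue */
	paging_off();
}
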
+
+ .code64
+
+#include "../../kernel/sev_verify_cbit.S"
+
+ .data
+
+ .balign 8
+SYM_DATA(sme_me_mask, .quad 0)
+SYM_DATA(sev_status, .quad 0)
+SYM_DATA(sev_check_data, .quad 0)
+
+SYM_DATA_START_LOCAL(boot32_idt)
+ .rept 32
+ .quad 0
+ .endr
+SYM_DATA_END(boot32_idt)
+
+SYM_DATA_START_LOCAL(boot32_idt_desc)
+ .word . - boot32_idt - 1
+ .long 0
+SYM_DATA_END(boot32_idt_desc)
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 842b2a36174a..0f41ca0e52c0 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -1,154 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* misc.c
*
- * This is a collection of several routines from gzip-1.0.3
- * adapted for Linux.
+ * This is a collection of several routines used to extract the kernel
+ * which includes KASLR relocation, decompression, ELF parsing, and
+ * relocation processing. Additionally included are the screen and serial
+ * output functions and related debugging support functions.
*
* malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994
* puts by Nick Holloway 1993, better puts by Martin Mares 1995
* High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
*/
-/*
- * we have to be careful, because no indirections are allowed here, and
- * paravirt_ops is a kind of one. As it will only run in baremetal anyway,
- * we just keep it from happening
- */
-#undef CONFIG_PARAVIRT
-#ifdef CONFIG_X86_32
-#define _ASM_X86_DESC_H 1
-#endif
-
-#ifdef CONFIG_X86_64
-#define _LINUX_STRING_H_ 1
-#define __LINUX_BITMAP_H 1
-#endif
-
-#include <linux/linkage.h>
-#include <linux/screen_info.h>
-#include <linux/elf.h>
-#include <linux/io.h>
-#include <asm/page.h>
-#include <asm/boot.h>
-#include <asm/bootparam.h>
-
-/* WARNING!!
- * This code is compiled with -fPIC and it is relocated dynamically
- * at run time, but no relocation processing is performed.
- * This means that it is not safe to place pointers in static structures.
- */
+#include "misc.h"
+#include "error.h"
+#include "../string.h"
+#include "../voffset.h"
+#include <asm/bootparam_utils.h>
/*
- * Getting to provable safe in place decompression is hard.
- * Worst case behaviours need to be analyzed.
- * Background information:
- *
- * The file layout is:
- * magic[2]
- * method[1]
- * flags[1]
- * timestamp[4]
- * extraflags[1]
- * os[1]
- * compressed data blocks[N]
- * crc[4] orig_len[4]
- *
- * resulting in 18 bytes of non compressed data overhead.
- *
- * Files divided into blocks
- * 1 bit (last block flag)
- * 2 bits (block type)
- *
- * 1 block occurs every 32K -1 bytes or when there 50% compression
- * has been achieved. The smallest block type encoding is always used.
- *
- * stored:
- * 32 bits length in bytes.
- *
- * fixed:
- * magic fixed tree.
- * symbols.
- *
- * dynamic:
- * dynamic tree encoding.
- * symbols.
- *
- *
- * The buffer for decompression in place is the length of the
- * uncompressed data, plus a small amount extra to keep the algorithm safe.
- * The compressed data is placed at the end of the buffer. The output
- * pointer is placed at the start of the buffer and the input pointer
- * is placed where the compressed data starts. Problems will occur
- * when the output pointer overruns the input pointer.
- *
- * The output pointer can only overrun the input pointer if the input
- * pointer is moving faster than the output pointer. A condition only
- * triggered by data whose compressed form is larger than the uncompressed
- * form.
- *
- * The worst case at the block level is a growth of the compressed data
- * of 5 bytes per 32767 bytes.
- *
- * The worst case internal to a compressed block is very hard to figure.
- * The worst case can at least be boundined by having one bit that represents
- * 32764 bytes and then all of the rest of the bytes representing the very
- * very last byte.
- *
- * All of which is enough to compute an amount of extra data that is required
- * to be safe. To avoid problems at the block level allocating 5 extra bytes
- * per 32767 bytes of data is sufficient. To avoind problems internal to a
- * block adding an extra 32767 bytes (the worst case uncompressed block size)
- * is sufficient, to ensure that in the worst case the decompressed data for
- * block will stop the byte before the compressed data for a block begins.
- * To avoid problems with the compressed data's meta information an extra 18
- * bytes are needed. Leading to the formula:
- *
- * extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size.
- *
- * Adding 8 bytes per 32K is a bit excessive but much easier to calculate.
- * Adding 32768 instead of 32767 just makes for round numbers.
- * Adding the decompressor_size is necessary as it musht live after all
- * of the data as well. Last I measured the decompressor is about 14K.
- * 10K of actual data and 4K of bss.
- *
+ * WARNING!!
+ * This code is compiled with -fPIC and it is relocated dynamically at
+ * run time, but no relocation processing is performed. This means that
+ * it is not safe to place pointers in static structures.
*/
-/*
- * gzip declarations
- */
+/* Macros used by the included decompressor code below. */
#define STATIC static
+/* Define an externally visible malloc()/free(). */
+#define MALLOC_VISIBLE
+#include <linux/decompress/mm.h>
-#undef memset
-#undef memcpy
+/*
+ * Provide definitions of memzero and memmove as some of the decompressors will
+ * try to define their own functions if these are not defined as macros.
+ */
#define memzero(s, n) memset((s), 0, (n))
-
-
-static void error(char *m);
+#ifndef memmove
+#define memmove memmove
+/* Functions used by the included decompressor code below. */
+void *memmove(void *dest, const void *src, size_t n);
+#endif
/*
* This is set up by the setup-routine at boot-time
*/
-static struct boot_params *real_mode; /* Pointer to real-mode data */
-static int quiet;
-
-static void *memset(void *s, int c, unsigned n);
-void *memcpy(void *dest, const void *src, unsigned n);
-
-static void __putstr(int, const char *);
-#define putstr(__x) __putstr(0, __x)
+struct boot_params *boot_params_ptr;
-#ifdef CONFIG_X86_64
-#define memptr long
-#else
-#define memptr unsigned
-#endif
+struct port_io_ops pio_ops;
-static memptr free_mem_ptr;
-static memptr free_mem_end_ptr;
+memptr free_mem_ptr;
+memptr free_mem_end_ptr;
+int spurious_nmi_count;
static char *vidmem;
static int vidport;
-static int lines, cols;
+
+/* These might be accessed before .bss is cleared, so use .data instead. */
+static int lines __section(".data");
+static int cols __section(".data");
#ifdef CONFIG_KERNEL_GZIP
#include "../../../../lib/decompress_inflate.c"
@@ -162,33 +72,68 @@ static int lines, cols;
#include "../../../../lib/decompress_unlzma.c"
#endif
+#ifdef CONFIG_KERNEL_XZ
+#include "../../../../lib/decompress_unxz.c"
+#endif
+
+#ifdef CONFIG_KERNEL_LZO
+#include "../../../../lib/decompress_unlzo.c"
+#endif
+
+#ifdef CONFIG_KERNEL_LZ4
+#include "../../../../lib/decompress_unlz4.c"
+#endif
+
+#ifdef CONFIG_KERNEL_ZSTD
+#include "../../../../lib/decompress_unzstd.c"
+#endif
+/*
+ * NOTE: When adding a new decompressor, please update the analysis in
+ * ../header.S.
+ */
+
static void scroll(void)
{
int i;
- memcpy(vidmem, vidmem + cols * 2, (lines - 1) * cols * 2);
+ memmove(vidmem, vidmem + cols * 2, (lines - 1) * cols * 2);
for (i = (lines - 1) * cols * 2; i < lines * cols * 2; i += 2)
vidmem[i] = ' ';
}
-static void __putstr(int error, const char *s)
+#define XMTRDY 0x20
+
+#define TXR 0 /* Transmit register (WRITE) */
+#define LSR 5 /* Line Status */
+static void serial_putchar(int ch)
+{
+ unsigned timeout = 0xffff;
+
+ while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
+ cpu_relax();
+
+ outb(ch, early_serial_base + TXR);
+}
+
+void __putstr(const char *s)
{
int x, y, pos;
char c;
-#ifndef CONFIG_X86_VERBOSE_BOOTUP
- if (!error)
- return;
-#endif
+ if (early_serial_base) {
+ const char *str = s;
+ while (*str) {
+ if (*str == '\n')
+ serial_putchar('\r');
+ serial_putchar(*str++);
+ }
+ }
-#ifdef CONFIG_X86_32
- if (real_mode->screen_info.orig_video_mode == 0 &&
- lines == 0 && cols == 0)
+ if (lines == 0 || cols == 0)
return;
-#endif
- x = real_mode->screen_info.orig_x;
- y = real_mode->screen_info.orig_y;
+ x = boot_params_ptr->screen_info.orig_x;
+ y = boot_params_ptr->screen_info.orig_y;
while ((c = *s++) != '\0') {
if (c == '\n') {
@@ -209,8 +154,8 @@ static void __putstr(int error, const char *s)
}
}
- real_mode->screen_info.orig_x = x;
- real_mode->screen_info.orig_y = y;
+ boot_params_ptr->screen_info.orig_x = x;
+ boot_params_ptr->screen_info.orig_y = y;
pos = (x + cols * y) * 2; /* Update cursor position */
outb(14, vidport);
@@ -219,39 +164,121 @@ static void __putstr(int error, const char *s)
outb(0xff & (pos >> 1), vidport+1);
}
-static void *memset(void *s, int c, unsigned n)
+static noinline void __putnum(unsigned long value, unsigned int base,
+ int mindig)
{
- int i;
- char *ss = s;
+ char buf[8*sizeof(value)+1];
+ char *p;
+
+ p = buf + sizeof(buf);
+ *--p = '\0';
+
+ while (mindig-- > 0 || value) {
+ unsigned char digit = value % base;
+ digit += (digit >= 10) ? ('a'-10) : '0';
+ *--p = digit;
+
+ value /= base;
+ }
- for (i = 0; i < n; i++)
- ss[i] = c;
- return s;
+ __putstr(p);
}
-void *memcpy(void *dest, const void *src, unsigned n)
+void __puthex(unsigned long value)
{
- int i;
- const char *s = src;
- char *d = dest;
-
- for (i = 0; i < n; i++)
- d[i] = s[i];
- return dest;
+ __putnum(value, 16, sizeof(value)*2);
}
+void __putdec(unsigned long value)
+{
+ __putnum(value, 10, 1);
+}
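
As a usage note for the helpers above (expected output, assuming a 64-bit unsigned long):

__puthex(0x1f);	/* prints "000000000000001f": mindig of 16 zero-pads */
__putdec(0);	/* prints "0": mindig of 1 emits at least one digit  */
__putdec(42);	/* prints "42" */
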
-static void error(char *x)
+#ifdef CONFIG_X86_NEED_RELOCS
+static void handle_relocations(void *output, unsigned long output_len,
+ unsigned long virt_addr)
{
- __putstr(1, "\n\n");
- __putstr(1, x);
- __putstr(1, "\n\n -- System halted");
+ int *reloc;
+ unsigned long delta, map, ptr;
+ unsigned long min_addr = (unsigned long)output;
+ unsigned long max_addr = min_addr + (VO___bss_start - VO__text);
+
+ /*
+ * Calculate the delta between where vmlinux was linked to load
+ * and where it was actually loaded.
+ */
+ delta = min_addr - LOAD_PHYSICAL_ADDR;
+
+ /*
+ * The kernel contains a table of relocation addresses. Those
+ * addresses have the final load address of the kernel in virtual
+ * memory. We are currently working in the self map. So we need to
+ * create an adjustment for kernel memory addresses to the self map.
+ * This will involve subtracting out the base address of the kernel.
+ */
+ map = delta - __START_KERNEL_map;
+
+ /*
+ * 32-bit always performs relocations. 64-bit relocations are only
+ * needed if KASLR has chosen a different starting address offset
+ * from __START_KERNEL_map.
+ */
+ if (IS_ENABLED(CONFIG_X86_64))
+ delta = virt_addr - LOAD_PHYSICAL_ADDR;
+
+ if (!delta) {
+ debug_putstr("No relocation needed... ");
+ return;
+ }
+ debug_putstr("Performing relocations... ");
+
+ /*
+ * Process relocations: 32 bit relocations first then 64 bit after.
+ * Two sets of binary relocations are added to the end of the kernel
+	 * before compression. Each relocation table entry is the kernel
+	 * address of the location which needs to be updated, stored as a
+	 * 32-bit value that is sign-extended to 64 bits.
+ *
+ * Format is:
+ *
+ * kernel bits...
+ * 0 - zero terminator for 64 bit relocations
+ * 64 bit relocation repeated
+ * 0 - zero terminator for 32 bit relocations
+ * 32 bit relocation repeated
+ *
+ * So we work backwards from the end of the decompressed image.
+ */
+ for (reloc = output + output_len - sizeof(*reloc); *reloc; reloc--) {
+ long extended = *reloc;
+ extended += map;
+
+ ptr = (unsigned long)extended;
+ if (ptr < min_addr || ptr > max_addr)
+ error("32-bit relocation outside of kernel!\n");
+
+ *(uint32_t *)ptr += delta;
+ }
+#ifdef CONFIG_X86_64
+ for (reloc--; *reloc; reloc--) {
+ long extended = *reloc;
+ extended += map;
- while (1)
- asm("hlt");
+ ptr = (unsigned long)extended;
+ if (ptr < min_addr || ptr > max_addr)
+ error("64-bit relocation outside of kernel!\n");
+
+ *(uint64_t *)ptr += delta;
+ }
+#endif
}
+#else
+static inline void handle_relocations(void *output, unsigned long output_len,
+ unsigned long virt_addr)
+{ }
+#endif
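
A worked example of the sign-extension step, with hypothetical addresses (the real values depend on the link address and on where KASLR placed the kernel):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Hypothetical values, for illustration only. */
	unsigned long start_kernel_map = 0xffffffff80000000UL;
	unsigned long load_phys = 0x1000000UL;	/* link-time physical address */
	unsigned long output = 0x2000000UL;	/* where the kernel actually landed */
	unsigned long map = (output - load_phys) - start_kernel_map;

	int32_t entry = (int32_t)0x81000000;	/* one 32-bit relocation entry */
	long extended = entry;			/* sign-extends to 0xffffffff81000000 */

	/* Yields 0x2000000: the identity-mapped location to patch. */
	printf("patch location: %#lx\n", (unsigned long)(extended + map));
	return 0;
}
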
-static void parse_elf(void *output)
+static size_t parse_elf(void *output)
{
#ifdef CONFIG_X86_64
Elf64_Ehdr ehdr;
@@ -267,13 +294,10 @@ static void parse_elf(void *output)
if (ehdr.e_ident[EI_MAG0] != ELFMAG0 ||
ehdr.e_ident[EI_MAG1] != ELFMAG1 ||
ehdr.e_ident[EI_MAG2] != ELFMAG2 ||
- ehdr.e_ident[EI_MAG3] != ELFMAG3) {
+ ehdr.e_ident[EI_MAG3] != ELFMAG3)
error("Kernel is not a valid ELF file");
- return;
- }
- if (!quiet)
- putstr("Parsing ELF... ");
+ debug_putstr("Parsing ELF... ");
phdrs = malloc(sizeof(*phdrs) * ehdr.e_phnum);
if (!phdrs)
@@ -286,32 +310,118 @@ static void parse_elf(void *output)
switch (phdr->p_type) {
case PT_LOAD:
+#ifdef CONFIG_X86_64
+ if ((phdr->p_align % 0x200000) != 0)
+ error("Alignment of LOAD segment isn't multiple of 2MB");
+#endif
#ifdef CONFIG_RELOCATABLE
dest = output;
dest += (phdr->p_paddr - LOAD_PHYSICAL_ADDR);
#else
dest = (void *)(phdr->p_paddr);
#endif
- memcpy(dest,
- output + phdr->p_offset,
- phdr->p_filesz);
+ memmove(dest, output + phdr->p_offset, phdr->p_filesz);
break;
default: /* Ignore other PT_* */ break;
}
}
+
+ free(phdrs);
+
+ return ehdr.e_entry - LOAD_PHYSICAL_ADDR;
}
-asmlinkage void decompress_kernel(void *rmode, memptr heap,
- unsigned char *input_data,
- unsigned long input_len,
- unsigned char *output)
+const unsigned long kernel_text_size = VO___start_rodata - VO__text;
+const unsigned long kernel_inittext_offset = VO__sinittext - VO__text;
+const unsigned long kernel_inittext_size = VO___inittext_end - VO__sinittext;
+const unsigned long kernel_total_size = VO__end - VO__text;
+
+static u8 boot_heap[BOOT_HEAP_SIZE] __aligned(4);
+
+extern unsigned char input_data[];
+extern unsigned int input_len, output_len;
+
+unsigned long decompress_kernel(unsigned char *outbuf, unsigned long virt_addr,
+ void (*error)(char *x))
+{
+ unsigned long entry;
+
+ if (!free_mem_ptr) {
+ free_mem_ptr = (unsigned long)boot_heap;
+ free_mem_end_ptr = (unsigned long)boot_heap + sizeof(boot_heap);
+ }
+
+ if (__decompress(input_data, input_len, NULL, NULL, outbuf, output_len,
+ NULL, error) < 0)
+ return ULONG_MAX;
+
+ entry = parse_elf(outbuf);
+ handle_relocations(outbuf, output_len, virt_addr);
+
+ return entry;
+}
+
+/*
+ * Set the memory encryption xloadflag based on the mem_encrypt= command line
+ * parameter, if provided.
+ */
+static void parse_mem_encrypt(struct setup_header *hdr)
{
- real_mode = rmode;
+ int on = cmdline_find_option_bool("mem_encrypt=on");
+ int off = cmdline_find_option_bool("mem_encrypt=off");
- if (real_mode->hdr.loadflags & QUIET_FLAG)
- quiet = 1;
+ if (on > off)
+ hdr->xloadflags |= XLF_MEM_ENCRYPTION;
+}
+
+static void early_sev_detect(void)
+{
+ /*
+ * Accessing video memory causes guest termination because
+ * the boot stage2 #VC handler of SEV-ES/SNP guests does not
+ * support MMIO handling and kexec -c adds screen_info to the
+	 * support MMIO handling, and kexec -c adds screen_info to the
+ * console output to be dumped to both video and serial.
+ */
+ if (sev_status & MSR_AMD64_SEV_ES_ENABLED)
+ lines = cols = 0;
+}
- if (real_mode->screen_info.orig_video_mode == 7) {
+/*
+ * The compressed kernel image (ZO) has been moved so that its position
+ * is against the end of the buffer used to hold the uncompressed kernel
+ * image (VO) and the execution environment (.bss, .brk), which makes sure
+ * there is room to do the in-place decompression. (See header.S for the
+ * calculations.)
+ *
+ * |-----compressed kernel image------|
+ * V V
+ * 0 extract_offset +INIT_SIZE
+ * |-----------|---------------|-------------------------|--------|
+ * | | | |
+ * VO__text startup_32 of ZO VO__end ZO__end
+ * ^ ^
+ * |-------uncompressed kernel image---------|
+ *
+ */
+asmlinkage __visible void *extract_kernel(void *rmode, unsigned char *output)
+{
+ unsigned long virt_addr = LOAD_PHYSICAL_ADDR;
+ memptr heap = (memptr)boot_heap;
+ unsigned long needed_size;
+ size_t entry_offset;
+
+ /* Retain x86 boot parameters pointer passed from startup_32/64. */
+ boot_params_ptr = rmode;
+
+	/* Clear flags intended solely for in-kernel use. */
+ boot_params_ptr->hdr.loadflags &= ~KASLR_FLAG;
+
+ parse_mem_encrypt(&boot_params_ptr->hdr);
+
+ sanitize_boot_params(boot_params_ptr);
+
+ if (boot_params_ptr->screen_info.orig_video_mode == 7) {
vidmem = (char *) 0xb0000;
vidport = 0x3b4;
} else {
@@ -319,31 +429,109 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
vidport = 0x3d4;
}
- lines = real_mode->screen_info.orig_video_lines;
- cols = real_mode->screen_info.orig_video_cols;
+ lines = boot_params_ptr->screen_info.orig_video_lines;
+ cols = boot_params_ptr->screen_info.orig_video_cols;
+
+ init_default_io_ops();
+
+ /*
+ * Detect TDX guest environment.
+ *
+ * It has to be done before console_init() in order to use
+ * paravirtualized port I/O operations if needed.
+ */
+ early_tdx_detect();
+
+ early_sev_detect();
+
+ console_init();
+
+ /*
+ * Save RSDP address for later use. Have this after console_init()
+ * so that early debugging output from the RSDP parsing code can be
+ * collected.
+ */
+ boot_params_ptr->acpi_rsdp_addr = get_rsdp_addr();
+
+ debug_putstr("early console in extract_kernel\n");
free_mem_ptr = heap; /* Heap */
free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
+ /*
+ * The memory hole needed for the kernel is the larger of either
+ * the entire decompressed kernel plus relocation table, or the
+ * entire decompressed kernel plus .bss and .brk sections.
+ *
+ * On X86_64, the memory is mapped with PMD pages. Round the
+ * size up so that the full extent of PMD pages mapped is
+ * included in the check against the valid memory table
+ * entries. This ensures the full mapped area is usable RAM
+ * and doesn't include any reserved areas.
+ */
+ needed_size = max_t(unsigned long, output_len, kernel_total_size);
+#ifdef CONFIG_X86_64
+ needed_size = ALIGN(needed_size, MIN_KERNEL_ALIGN);
+#endif
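
A worked example of the sizing, with hypothetical values:

/*
 * Hypothetical example: output_len = 30 MiB (decompressed image plus
 * relocation table), kernel_total_size = 60 MiB (image plus .bss/.brk).
 * needed_size = max(30 MiB, 60 MiB) = 60 MiB, then rounded up to
 * MIN_KERNEL_ALIGN (the 2 MiB PMD size on x86-64) so that the check
 * against the memory table covers whole PMD mappings.
 */
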
+
+ /* Report initial kernel position details. */
+ debug_putaddr(input_data);
+ debug_putaddr(input_len);
+ debug_putaddr(output);
+ debug_putaddr(output_len);
+ debug_putaddr(kernel_total_size);
+ debug_putaddr(needed_size);
+
+#ifdef CONFIG_X86_64
+ /* Report address of 32-bit trampoline */
+ debug_putaddr(trampoline_32bit);
+#endif
+
+ choose_random_location((unsigned long)input_data, input_len,
+ (unsigned long *)&output,
+ needed_size,
+ &virt_addr);
+
+ /* Validate memory location choices. */
if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1))
- error("Destination address inappropriately aligned");
+ error("Destination physical address inappropriately aligned");
+ if (virt_addr & (MIN_KERNEL_ALIGN - 1))
+ error("Destination virtual address inappropriately aligned");
#ifdef CONFIG_X86_64
if (heap > 0x3fffffffffffUL)
error("Destination address too large");
+ if (virt_addr + needed_size > KERNEL_IMAGE_SIZE)
+ error("Destination virtual address is beyond the kernel mapping area");
#else
- if (heap > ((-__PAGE_OFFSET-(512<<20)-1) & 0x7fffffff))
+ if (heap > ((-__PAGE_OFFSET-(128<<20)-1) & 0x7fffffff))
error("Destination address too large");
#endif
#ifndef CONFIG_RELOCATABLE
- if ((unsigned long)output != LOAD_PHYSICAL_ADDR)
- error("Wrong destination address");
+ if (virt_addr != LOAD_PHYSICAL_ADDR)
+ error("Destination virtual address changed when not relocatable");
#endif
- if (!quiet)
- putstr("\nDecompressing Linux... ");
- decompress(input_data, input_len, NULL, NULL, output, NULL, error);
- parse_elf(output);
- if (!quiet)
- putstr("done.\nBooting the kernel.\n");
- return;
+ debug_putstr("\nDecompressing Linux... ");
+
+ if (init_unaccepted_memory()) {
+ debug_putstr("Accepting memory... ");
+ accept_memory(__pa(output), needed_size);
+ }
+
+ entry_offset = decompress_kernel(output, virt_addr, error);
+
+ debug_putstr("done.\nBooting the kernel (entry_offset: 0x");
+ debug_puthex(entry_offset);
+ debug_putstr(").\n");
+
+ /* Disable exception handling before booting the kernel */
+ cleanup_exception_handling();
+
+ if (spurious_nmi_count) {
+ error_putstr("Spurious early NMIs ignored: ");
+ error_putdec(spurious_nmi_count);
+ error_putstr("\n");
+ }
+
+ return output + entry_offset;
}
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
new file mode 100644
index 000000000000..fd855e32c9b9
--- /dev/null
+++ b/arch/x86/boot/compressed/misc.h
@@ -0,0 +1,258 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef BOOT_COMPRESSED_MISC_H
+#define BOOT_COMPRESSED_MISC_H
+
+/*
+ * Special hack: we have to be careful, because no indirections are allowed here,
+ * and paravirt_ops is a kind of one. As this code only runs on bare metal
+ * anyway, we just keep it from happening. (This list needs to be extended
+ * when new paravirt and debugging variants are added.)
+ */
+#undef CONFIG_PARAVIRT
+#undef CONFIG_PARAVIRT_XXL
+#undef CONFIG_PARAVIRT_SPINLOCKS
+#undef CONFIG_KASAN
+#undef CONFIG_KASAN_GENERIC
+
+#define __NO_FORTIFY
+
+/* cpu_feature_enabled() cannot be used this early */
+#define USE_EARLY_PGTABLE_L5
+
+/*
+ * The boot stub deals with identity mappings, where physical and virtual
+ * addresses are the same, so override these defines.
+ *
+ * <asm/page.h> will not define them if they are already defined.
+ */
+#define __pa(x) ((unsigned long)(x))
+#define __va(x) ((void *)((unsigned long)(x)))
+
+#include <linux/linkage.h>
+#include <linux/screen_info.h>
+#include <linux/elf.h>
+#include <asm/page.h>
+#include <asm/boot.h>
+#include <asm/bootparam.h>
+#include <asm/desc_defs.h>
+
+#include "tdx.h"
+
+#define BOOT_CTYPE_H
+#include <linux/acpi.h>
+
+#define BOOT_BOOT_H
+#include "../ctype.h"
+#include "../io.h"
+
+#include "efi.h"
+
+#ifdef CONFIG_X86_64
+#define memptr long
+#else
+#define memptr unsigned
+#endif
+
+/* boot/compressed/vmlinux start and end markers */
+extern char _head[], _end[];
+
+/* misc.c */
+extern memptr free_mem_ptr;
+extern memptr free_mem_end_ptr;
+extern int spurious_nmi_count;
+void *malloc(int size);
+void free(void *where);
+void __putstr(const char *s);
+void __puthex(unsigned long value);
+void __putdec(unsigned long value);
+#define error_putstr(__x) __putstr(__x)
+#define error_puthex(__x) __puthex(__x)
+#define error_putdec(__x) __putdec(__x)
+
+#ifdef CONFIG_X86_VERBOSE_BOOTUP
+
+#define debug_putstr(__x) __putstr(__x)
+#define debug_puthex(__x) __puthex(__x)
+#define debug_putaddr(__x) { \
+ debug_putstr(#__x ": 0x"); \
+ debug_puthex((unsigned long)(__x)); \
+ debug_putstr("\n"); \
+ }
+
+#else
+
+static inline void debug_putstr(const char *s)
+{ }
+static inline void debug_puthex(unsigned long value)
+{ }
+#define debug_putaddr(x) /* */
+
+#endif
+
+/* cmdline.c */
+int cmdline_find_option(const char *option, char *buffer, int bufsize);
+int cmdline_find_option_bool(const char *option);
+
+struct mem_vector {
+ u64 start;
+ u64 size;
+};
+
+#ifdef CONFIG_RANDOMIZE_BASE
+/* kaslr.c */
+void choose_random_location(unsigned long input,
+ unsigned long input_size,
+ unsigned long *output,
+ unsigned long output_size,
+ unsigned long *virt_addr);
+#else
+static inline void choose_random_location(unsigned long input,
+ unsigned long input_size,
+ unsigned long *output,
+ unsigned long output_size,
+ unsigned long *virt_addr)
+{
+}
+#endif
+
+/* cpuflags.c */
+bool has_cpuflag(int flag);
+
+#ifdef CONFIG_X86_64
+extern int set_page_decrypted(unsigned long address);
+extern int set_page_encrypted(unsigned long address);
+extern int set_page_non_present(unsigned long address);
+extern unsigned char _pgtable[];
+#endif
+
+#ifdef CONFIG_EARLY_PRINTK
+/* early_serial_console.c */
+extern int early_serial_base;
+void console_init(void);
+#else
+static const int early_serial_base;
+static inline void console_init(void)
+{ }
+#endif
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+struct es_em_ctxt;
+struct insn;
+
+void sev_enable(struct boot_params *bp);
+void snp_check_features(void);
+void sev_es_shutdown_ghcb(void);
+extern bool sev_es_check_ghcb_fault(unsigned long address);
+void snp_set_page_private(unsigned long paddr);
+void snp_set_page_shared(unsigned long paddr);
+void sev_prep_identity_maps(unsigned long top_level_pgt);
+
+enum es_result vc_decode_insn(struct es_em_ctxt *ctxt);
+bool insn_has_rep_prefix(struct insn *insn);
+void sev_insn_decode_init(void);
+bool early_setup_ghcb(void);
+#else
+static inline void snp_check_features(void) { }
+static inline void sev_es_shutdown_ghcb(void) { }
+static inline bool sev_es_check_ghcb_fault(unsigned long address)
+{
+ return false;
+}
+static inline void snp_set_page_private(unsigned long paddr) { }
+static inline void snp_set_page_shared(unsigned long paddr) { }
+static inline void sev_prep_identity_maps(unsigned long top_level_pgt) { }
+#endif
+
+/* acpi.c */
+#ifdef CONFIG_ACPI
+acpi_physical_address get_rsdp_addr(void);
+#else
+static inline acpi_physical_address get_rsdp_addr(void) { return 0; }
+#endif
+
+#if defined(CONFIG_RANDOMIZE_BASE) && defined(CONFIG_MEMORY_HOTREMOVE) && defined(CONFIG_ACPI)
+extern struct mem_vector immovable_mem[MAX_NUMNODES*2];
+int count_immovable_mem_regions(void);
+#else
+static inline int count_immovable_mem_regions(void) { return 0; }
+#endif
+
+/* ident_map_64.c */
+extern unsigned int __pgtable_l5_enabled, pgdir_shift, ptrs_per_p4d;
+extern void kernel_add_identity_map(unsigned long start, unsigned long end);
+
+/* Used by PAGE_KERN* macros: */
+extern pteval_t __default_kernel_pte_mask;
+
+/* idt_64.c */
+extern gate_desc boot_idt[BOOT_IDT_ENTRIES];
+extern struct desc_ptr boot_idt_desc;
+
+#ifdef CONFIG_X86_64
+void cleanup_exception_handling(void);
+#else
+static inline void cleanup_exception_handling(void) { }
+#endif
+
+/* IDT Entry Points */
+void boot_page_fault(void);
+void boot_nmi_trap(void);
+void boot_stage1_vc(void);
+void boot_stage2_vc(void);
+
+unsigned long sev_verify_cbit(unsigned long cr3);
+
+enum efi_type {
+ EFI_TYPE_64,
+ EFI_TYPE_32,
+ EFI_TYPE_NONE,
+};
+
+#ifdef CONFIG_EFI
+/* helpers for early EFI config table access */
+enum efi_type efi_get_type(struct boot_params *bp);
+unsigned long efi_get_system_table(struct boot_params *bp);
+int efi_get_conf_table(struct boot_params *bp, unsigned long *cfg_tbl_pa,
+ unsigned int *cfg_tbl_len);
+unsigned long efi_find_vendor_table(struct boot_params *bp,
+ unsigned long cfg_tbl_pa,
+ unsigned int cfg_tbl_len,
+ efi_guid_t guid);
+#else
+static inline enum efi_type efi_get_type(struct boot_params *bp)
+{
+ return EFI_TYPE_NONE;
+}
+
+static inline unsigned long efi_get_system_table(struct boot_params *bp)
+{
+ return 0;
+}
+
+static inline int efi_get_conf_table(struct boot_params *bp,
+ unsigned long *cfg_tbl_pa,
+ unsigned int *cfg_tbl_len)
+{
+ return -ENOENT;
+}
+
+static inline unsigned long efi_find_vendor_table(struct boot_params *bp,
+ unsigned long cfg_tbl_pa,
+ unsigned int cfg_tbl_len,
+ efi_guid_t guid)
+{
+ return 0;
+}
+#endif /* CONFIG_EFI */
+
+#ifdef CONFIG_UNACCEPTED_MEMORY
+bool init_unaccepted_memory(void);
+#else
+static inline bool init_unaccepted_memory(void) { return false; }
+#endif
+
+/* Defined in EFI stub */
+extern struct efi_unaccepted_memory *unaccepted_table;
+void accept_memory(phys_addr_t start, unsigned long size);
+
+#endif /* BOOT_COMPRESSED_MISC_H */
diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c
index bcbd36c41432..52aa56cdbacc 100644
--- a/arch/x86/boot/compressed/mkpiggy.c
+++ b/arch/x86/boot/compressed/mkpiggy.c
@@ -1,53 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* ----------------------------------------------------------------------- *
*
* Copyright (C) 2009 Intel Corporation. All rights reserved.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version
- * 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- *
* H. Peter Anvin <hpa@linux.intel.com>
*
- * ----------------------------------------------------------------------- */
-
-/*
- * Compute the desired load offset from a compressed program; outputs
- * a small assembly wrapper with the appropriate symbols defined.
+ * -----------------------------------------------------------------------
+ *
+ * Outputs a small assembly wrapper with the appropriate symbols defined.
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <inttypes.h>
-
-static uint32_t getle32(const void *p)
-{
- const uint8_t *cp = p;
-
- return (uint32_t)cp[0] + ((uint32_t)cp[1] << 8) +
- ((uint32_t)cp[2] << 16) + ((uint32_t)cp[3] << 24);
-}
+#include <tools/le_byteshift.h>
int main(int argc, char *argv[])
{
uint32_t olen;
long ilen;
- unsigned long offs;
- FILE *f;
+ FILE *f = NULL;
+ int retval = 1;
if (argc < 2) {
fprintf(stderr, "Usage: %s compressed_file\n", argv[0]);
- return 1;
+ goto bail;
}
/* Get the information for the compressed kernel image first */
@@ -55,43 +33,42 @@ int main(int argc, char *argv[])
f = fopen(argv[1], "r");
if (!f) {
perror(argv[1]);
- return 1;
+ goto bail;
}
if (fseek(f, -4L, SEEK_END)) {
perror(argv[1]);
}
- fread(&olen, sizeof olen, 1, f);
- ilen = ftell(f);
- olen = getle32(&olen);
- fclose(f);
- /*
- * Now we have the input (compressed) and output (uncompressed)
- * sizes, compute the necessary decompression offset...
- */
+ if (fread(&olen, sizeof(olen), 1, f) != 1) {
+ perror(argv[1]);
+ goto bail;
+ }
- offs = (olen > ilen) ? olen - ilen : 0;
- offs += olen >> 12; /* Add 8 bytes for each 32K block */
- offs += 32*1024 + 18; /* Add 32K + 18 bytes slack */
- offs = (offs+4095) & ~4095; /* Round to a 4K boundary */
+ ilen = ftell(f);
+ olen = get_unaligned_le32(&olen);
- printf(".section \".rodata.compressed\",\"a\",@progbits\n");
+ printf(".section \".rodata..compressed\",\"a\",@progbits\n");
printf(".globl z_input_len\n");
printf("z_input_len = %lu\n", ilen);
printf(".globl z_output_len\n");
printf("z_output_len = %lu\n", (unsigned long)olen);
- printf(".globl z_extract_offset\n");
- printf("z_extract_offset = 0x%lx\n", offs);
- /* z_extract_offset_negative allows simplification of head_32.S */
- printf(".globl z_extract_offset_negative\n");
- printf("z_extract_offset_negative = -0x%lx\n", offs);
printf(".globl input_data, input_data_end\n");
printf("input_data:\n");
printf(".incbin \"%s\"\n", argv[1]);
printf("input_data_end:\n");
- return 0;
+ printf(".section \".rodata\",\"a\",@progbits\n");
+ printf(".globl input_len\n");
+ printf("input_len:\n\t.long %lu\n", ilen);
+ printf(".globl output_len\n");
+ printf("output_len:\n\t.long %lu\n", (unsigned long)olen);
+
+ retval = 0;
+bail:
+ if (f)
+ fclose(f);
+ return retval;
}
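
With hypothetical sizes (a 4 MiB compressed payload expanding to 16 MiB), the wrapper that the rewritten mkpiggy emits on stdout would look roughly like:

.section ".rodata..compressed","a",@progbits
.globl z_input_len
z_input_len = 4194304
.globl z_output_len
z_output_len = 16777216
.globl input_data, input_data_end
input_data:
.incbin "vmlinux.bin.gz"
input_data_end:
.section ".rodata","a",@progbits
.globl input_len
input_len:
	.long 4194304
.globl output_len
output_len:
	.long 16777216
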
diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c
new file mode 100644
index 000000000000..0e89e197e112
--- /dev/null
+++ b/arch/x86/boot/compressed/pgtable_64.c
@@ -0,0 +1,200 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "misc.h"
+#include <asm/bootparam.h>
+#include <asm/bootparam_utils.h>
+#include <asm/e820/types.h>
+#include <asm/pgtable.h>
+#include <asm/processor.h>
+#include "../string.h"
+#include "efi.h"
+
+#define BIOS_START_MIN 0x20000U /* 128K, less than this is insane */
+#define BIOS_START_MAX 0x9f000U /* 640K, absolute maximum */
+
+/* __pgtable_l5_enabled needs to be in .data to avoid being cleared along with .bss */
+unsigned int __section(".data") __pgtable_l5_enabled;
+unsigned int __section(".data") pgdir_shift = 39;
+unsigned int __section(".data") ptrs_per_p4d = 1;
+
+/* Buffer to preserve trampoline memory */
+static char trampoline_save[TRAMPOLINE_32BIT_SIZE];
+
+/*
+ * Trampoline address will be printed by extract_kernel() for debugging
+ * purposes.
+ *
+ * Avoid putting the pointer into .bss as it will be cleared between
+ * configure_5level_paging() and extract_kernel().
+ */
+unsigned long *trampoline_32bit __section(".data");
+
+int cmdline_find_option_bool(const char *option);
+
+static unsigned long find_trampoline_placement(void)
+{
+ unsigned long bios_start = 0, ebda_start = 0;
+ struct boot_e820_entry *entry;
+ char *signature;
+ int i;
+
+ /*
+ * Find a suitable spot for the trampoline.
+ * This code is based on reserve_bios_regions().
+ */
+
+ /*
+ * EFI systems may not provide legacy ROM. The memory may not be mapped
+ * at all.
+ *
+	 * Only look for values in the legacy ROM on non-EFI systems.
+ */
+ signature = (char *)&boot_params_ptr->efi_info.efi_loader_signature;
+ if (strncmp(signature, EFI32_LOADER_SIGNATURE, 4) &&
+ strncmp(signature, EFI64_LOADER_SIGNATURE, 4)) {
+ ebda_start = *(unsigned short *)0x40e << 4;
+ bios_start = *(unsigned short *)0x413 << 10;
+ }
+
+ if (bios_start < BIOS_START_MIN || bios_start > BIOS_START_MAX)
+ bios_start = BIOS_START_MAX;
+
+ if (ebda_start > BIOS_START_MIN && ebda_start < bios_start)
+ bios_start = ebda_start;
+
+ bios_start = round_down(bios_start, PAGE_SIZE);
+
+ /* Find the first usable memory region under bios_start. */
+ for (i = boot_params_ptr->e820_entries - 1; i >= 0; i--) {
+ unsigned long new = bios_start;
+
+ entry = &boot_params_ptr->e820_table[i];
+
+ /* Skip all entries above bios_start. */
+ if (bios_start <= entry->addr)
+ continue;
+
+ /* Skip non-RAM entries. */
+ if (entry->type != E820_TYPE_RAM)
+ continue;
+
+ /* Adjust bios_start to the end of the entry if needed. */
+ if (bios_start > entry->addr + entry->size)
+ new = entry->addr + entry->size;
+
+ /* Keep bios_start page-aligned. */
+ new = round_down(new, PAGE_SIZE);
+
+ /* Skip the entry if it's too small. */
+ if (new - TRAMPOLINE_32BIT_SIZE < entry->addr)
+ continue;
+
+ /* Protect against underflow. */
+ if (new - TRAMPOLINE_32BIT_SIZE > bios_start)
+ break;
+
+ bios_start = new;
+ break;
+ }
+
+ /* Place the trampoline just below the end of low memory */
+ return bios_start - TRAMPOLINE_32BIT_SIZE;
+}
+
+asmlinkage void configure_5level_paging(struct boot_params *bp, void *pgtable)
+{
+ void (*toggle_la57)(void *cr3);
+ bool l5_required = false;
+
+ /* Initialize boot_params. Required for cmdline_find_option_bool(). */
+ sanitize_boot_params(bp);
+ boot_params_ptr = bp;
+
+ /*
+ * Check if LA57 is desired and supported.
+ *
+ * There are several parts to the check:
+	 * - whether the user asked to disable 5-level paging: no5lvl on the cmdline
+	 * - whether the machine supports 5-level paging:
+ * + CPUID leaf 7 is supported
+ * + the leaf has the feature bit set
+ */
+ if (!cmdline_find_option_bool("no5lvl") &&
+ native_cpuid_eax(0) >= 7 && (native_cpuid_ecx(7) & BIT(16))) {
+ l5_required = true;
+
+ /* Initialize variables for 5-level paging */
+ __pgtable_l5_enabled = 1;
+ pgdir_shift = 48;
+ ptrs_per_p4d = 512;
+ }
+
+ /*
+ * The trampoline will not be used if the paging mode is already set to
+ * the desired one.
+ */
+ if (l5_required == !!(native_read_cr4() & X86_CR4_LA57))
+ return;
+
+ trampoline_32bit = (unsigned long *)find_trampoline_placement();
+
+ /* Preserve trampoline memory */
+ memcpy(trampoline_save, trampoline_32bit, TRAMPOLINE_32BIT_SIZE);
+
+ /* Clear trampoline memory first */
+ memset(trampoline_32bit, 0, TRAMPOLINE_32BIT_SIZE);
+
+ /* Copy trampoline code in place */
+ toggle_la57 = memcpy(trampoline_32bit +
+ TRAMPOLINE_32BIT_CODE_OFFSET / sizeof(unsigned long),
+ &trampoline_32bit_src, TRAMPOLINE_32BIT_CODE_SIZE);
+
+ /*
+ * Avoid the need for a stack in the 32-bit trampoline code, by using
+ * LJMP rather than LRET to return back to long mode. LJMP takes an
+ * immediate absolute address, which needs to be adjusted based on the
+ * placement of the trampoline.
+ */
+ *(u32 *)((u8 *)toggle_la57 + trampoline_ljmp_imm_offset) +=
+ (unsigned long)toggle_la57;
+
+ /*
+	 * The code below prepares a page table in trampoline memory.
+ *
+ * The new page table will be used by trampoline code for switching
+ * from 4- to 5-level paging or vice versa.
+ */
+
+ if (l5_required) {
+ /*
+		 * For a 4- to 5-level paging transition, set up the current CR3
+		 * as the first and only entry in a new top-level page table.
+ */
+ *trampoline_32bit = native_read_cr3_pa() | _PAGE_TABLE_NOENC;
+ } else {
+ u64 *new_cr3;
+ pgd_t *pgdp;
+
+ /*
+		 * For a 5- to 4-level paging transition, copy the page table
+		 * pointed to by the first entry in the current top-level page
+		 * table and use it as the new top-level page table.
+		 *
+		 * We cannot just point at that page table from the trampoline,
+		 * as it may be above 4G.
+ */
+ pgdp = (pgd_t *)native_read_cr3_pa();
+ new_cr3 = (u64 *)(native_pgd_val(pgdp[0]) & PTE_PFN_MASK);
+ memcpy(trampoline_32bit, new_cr3, PAGE_SIZE);
+ }
+
+ toggle_la57(trampoline_32bit);
+
+ /*
+ * Move the top level page table out of trampoline memory.
+ */
+ memcpy(pgtable, trampoline_32bit, PAGE_SIZE);
+ native_write_cr3((unsigned long)pgtable);
+
+ /* Restore trampoline memory */
+ memcpy(trampoline_32bit, trampoline_save, TRAMPOLINE_32BIT_SIZE);
+}
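
The LA57 probe above relies on CPUID.(EAX=7,ECX=0):ECX[16]. A minimal user-space sketch of the same capability check (GCC/Clang on x86-64; not part of the patch):

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* Leaf 7, subleaf 0: ECX bit 16 advertises LA57. */
	if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) &&
	    (ecx & (1u << 16)))
		puts("LA57 (5-level paging) supported");
	else
		puts("LA57 not supported");
	return 0;
}
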
diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c
deleted file mode 100644
index bbeb0c3fbd90..000000000000
--- a/arch/x86/boot/compressed/relocs.c
+++ /dev/null
@@ -1,651 +0,0 @@
-#include <stdio.h>
-#include <stdarg.h>
-#include <stdlib.h>
-#include <stdint.h>
-#include <string.h>
-#include <errno.h>
-#include <unistd.h>
-#include <elf.h>
-#include <byteswap.h>
-#define USE_BSD
-#include <endian.h>
-
-#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-static Elf32_Ehdr ehdr;
-static unsigned long reloc_count, reloc_idx;
-static unsigned long *relocs;
-
-struct section {
- Elf32_Shdr shdr;
- struct section *link;
- Elf32_Sym *symtab;
- Elf32_Rel *reltab;
- char *strtab;
-};
-static struct section *secs;
-
-/*
- * Following symbols have been audited. There values are constant and do
- * not change if bzImage is loaded at a different physical address than
- * the address for which it has been compiled. Don't warn user about
- * absolute relocations present w.r.t these symbols.
- */
-static const char* safe_abs_relocs[] = {
- "xen_irq_disable_direct_reloc",
- "xen_save_fl_direct_reloc",
-};
-
-static int is_safe_abs_reloc(const char* sym_name)
-{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(safe_abs_relocs); i++) {
- if (!strcmp(sym_name, safe_abs_relocs[i]))
- /* Match found */
- return 1;
- }
- if (strncmp(sym_name, "VDSO", 4) == 0)
- return 1;
- if (strncmp(sym_name, "__crc_", 6) == 0)
- return 1;
- return 0;
-}
-
-static void die(char *fmt, ...)
-{
- va_list ap;
- va_start(ap, fmt);
- vfprintf(stderr, fmt, ap);
- va_end(ap);
- exit(1);
-}
-
-static const char *sym_type(unsigned type)
-{
- static const char *type_name[] = {
-#define SYM_TYPE(X) [X] = #X
- SYM_TYPE(STT_NOTYPE),
- SYM_TYPE(STT_OBJECT),
- SYM_TYPE(STT_FUNC),
- SYM_TYPE(STT_SECTION),
- SYM_TYPE(STT_FILE),
- SYM_TYPE(STT_COMMON),
- SYM_TYPE(STT_TLS),
-#undef SYM_TYPE
- };
- const char *name = "unknown sym type name";
- if (type < ARRAY_SIZE(type_name)) {
- name = type_name[type];
- }
- return name;
-}
-
-static const char *sym_bind(unsigned bind)
-{
- static const char *bind_name[] = {
-#define SYM_BIND(X) [X] = #X
- SYM_BIND(STB_LOCAL),
- SYM_BIND(STB_GLOBAL),
- SYM_BIND(STB_WEAK),
-#undef SYM_BIND
- };
- const char *name = "unknown sym bind name";
- if (bind < ARRAY_SIZE(bind_name)) {
- name = bind_name[bind];
- }
- return name;
-}
-
-static const char *sym_visibility(unsigned visibility)
-{
- static const char *visibility_name[] = {
-#define SYM_VISIBILITY(X) [X] = #X
- SYM_VISIBILITY(STV_DEFAULT),
- SYM_VISIBILITY(STV_INTERNAL),
- SYM_VISIBILITY(STV_HIDDEN),
- SYM_VISIBILITY(STV_PROTECTED),
-#undef SYM_VISIBILITY
- };
- const char *name = "unknown sym visibility name";
- if (visibility < ARRAY_SIZE(visibility_name)) {
- name = visibility_name[visibility];
- }
- return name;
-}
-
-static const char *rel_type(unsigned type)
-{
- static const char *type_name[] = {
-#define REL_TYPE(X) [X] = #X
- REL_TYPE(R_386_NONE),
- REL_TYPE(R_386_32),
- REL_TYPE(R_386_PC32),
- REL_TYPE(R_386_GOT32),
- REL_TYPE(R_386_PLT32),
- REL_TYPE(R_386_COPY),
- REL_TYPE(R_386_GLOB_DAT),
- REL_TYPE(R_386_JMP_SLOT),
- REL_TYPE(R_386_RELATIVE),
- REL_TYPE(R_386_GOTOFF),
- REL_TYPE(R_386_GOTPC),
-#undef REL_TYPE
- };
- const char *name = "unknown type rel type name";
- if (type < ARRAY_SIZE(type_name)) {
- name = type_name[type];
- }
- return name;
-}
-
-static const char *sec_name(unsigned shndx)
-{
- const char *sec_strtab;
- const char *name;
- sec_strtab = secs[ehdr.e_shstrndx].strtab;
- name = "<noname>";
- if (shndx < ehdr.e_shnum) {
- name = sec_strtab + secs[shndx].shdr.sh_name;
- }
- else if (shndx == SHN_ABS) {
- name = "ABSOLUTE";
- }
- else if (shndx == SHN_COMMON) {
- name = "COMMON";
- }
- return name;
-}
-
-static const char *sym_name(const char *sym_strtab, Elf32_Sym *sym)
-{
- const char *name;
- name = "<noname>";
- if (sym->st_name) {
- name = sym_strtab + sym->st_name;
- }
- else {
- name = sec_name(secs[sym->st_shndx].shdr.sh_name);
- }
- return name;
-}
-
-
-
-#if BYTE_ORDER == LITTLE_ENDIAN
-#define le16_to_cpu(val) (val)
-#define le32_to_cpu(val) (val)
-#endif
-#if BYTE_ORDER == BIG_ENDIAN
-#define le16_to_cpu(val) bswap_16(val)
-#define le32_to_cpu(val) bswap_32(val)
-#endif
-
-static uint16_t elf16_to_cpu(uint16_t val)
-{
- return le16_to_cpu(val);
-}
-
-static uint32_t elf32_to_cpu(uint32_t val)
-{
- return le32_to_cpu(val);
-}
-
-static void read_ehdr(FILE *fp)
-{
- if (fread(&ehdr, sizeof(ehdr), 1, fp) != 1) {
- die("Cannot read ELF header: %s\n",
- strerror(errno));
- }
- if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0) {
- die("No ELF magic\n");
- }
- if (ehdr.e_ident[EI_CLASS] != ELFCLASS32) {
- die("Not a 32 bit executable\n");
- }
- if (ehdr.e_ident[EI_DATA] != ELFDATA2LSB) {
- die("Not a LSB ELF executable\n");
- }
- if (ehdr.e_ident[EI_VERSION] != EV_CURRENT) {
- die("Unknown ELF version\n");
- }
- /* Convert the fields to native endian */
- ehdr.e_type = elf16_to_cpu(ehdr.e_type);
- ehdr.e_machine = elf16_to_cpu(ehdr.e_machine);
- ehdr.e_version = elf32_to_cpu(ehdr.e_version);
- ehdr.e_entry = elf32_to_cpu(ehdr.e_entry);
- ehdr.e_phoff = elf32_to_cpu(ehdr.e_phoff);
- ehdr.e_shoff = elf32_to_cpu(ehdr.e_shoff);
- ehdr.e_flags = elf32_to_cpu(ehdr.e_flags);
- ehdr.e_ehsize = elf16_to_cpu(ehdr.e_ehsize);
- ehdr.e_phentsize = elf16_to_cpu(ehdr.e_phentsize);
- ehdr.e_phnum = elf16_to_cpu(ehdr.e_phnum);
- ehdr.e_shentsize = elf16_to_cpu(ehdr.e_shentsize);
- ehdr.e_shnum = elf16_to_cpu(ehdr.e_shnum);
- ehdr.e_shstrndx = elf16_to_cpu(ehdr.e_shstrndx);
-
- if ((ehdr.e_type != ET_EXEC) && (ehdr.e_type != ET_DYN)) {
- die("Unsupported ELF header type\n");
- }
- if (ehdr.e_machine != EM_386) {
- die("Not for x86\n");
- }
- if (ehdr.e_version != EV_CURRENT) {
- die("Unknown ELF version\n");
- }
- if (ehdr.e_ehsize != sizeof(Elf32_Ehdr)) {
- die("Bad Elf header size\n");
- }
- if (ehdr.e_phentsize != sizeof(Elf32_Phdr)) {
- die("Bad program header entry\n");
- }
- if (ehdr.e_shentsize != sizeof(Elf32_Shdr)) {
- die("Bad section header entry\n");
- }
- if (ehdr.e_shstrndx >= ehdr.e_shnum) {
- die("String table index out of bounds\n");
- }
-}
-
-static void read_shdrs(FILE *fp)
-{
- int i;
- Elf32_Shdr shdr;
-
- secs = calloc(ehdr.e_shnum, sizeof(struct section));
- if (!secs) {
- die("Unable to allocate %d section headers\n",
- ehdr.e_shnum);
- }
- if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0) {
- die("Seek to %d failed: %s\n",
- ehdr.e_shoff, strerror(errno));
- }
- for (i = 0; i < ehdr.e_shnum; i++) {
- struct section *sec = &secs[i];
- if (fread(&shdr, sizeof shdr, 1, fp) != 1)
- die("Cannot read ELF section headers %d/%d: %s\n",
- i, ehdr.e_shnum, strerror(errno));
- sec->shdr.sh_name = elf32_to_cpu(shdr.sh_name);
- sec->shdr.sh_type = elf32_to_cpu(shdr.sh_type);
- sec->shdr.sh_flags = elf32_to_cpu(shdr.sh_flags);
- sec->shdr.sh_addr = elf32_to_cpu(shdr.sh_addr);
- sec->shdr.sh_offset = elf32_to_cpu(shdr.sh_offset);
- sec->shdr.sh_size = elf32_to_cpu(shdr.sh_size);
- sec->shdr.sh_link = elf32_to_cpu(shdr.sh_link);
- sec->shdr.sh_info = elf32_to_cpu(shdr.sh_info);
- sec->shdr.sh_addralign = elf32_to_cpu(shdr.sh_addralign);
- sec->shdr.sh_entsize = elf32_to_cpu(shdr.sh_entsize);
- if (sec->shdr.sh_link < ehdr.e_shnum)
- sec->link = &secs[sec->shdr.sh_link];
- }
-
-}
-
-static void read_strtabs(FILE *fp)
-{
- int i;
- for (i = 0; i < ehdr.e_shnum; i++) {
- struct section *sec = &secs[i];
- if (sec->shdr.sh_type != SHT_STRTAB) {
- continue;
- }
- sec->strtab = malloc(sec->shdr.sh_size);
- if (!sec->strtab) {
- die("malloc of %d bytes for strtab failed\n",
- sec->shdr.sh_size);
- }
- if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
- die("Seek to %d failed: %s\n",
- sec->shdr.sh_offset, strerror(errno));
- }
- if (fread(sec->strtab, 1, sec->shdr.sh_size, fp)
- != sec->shdr.sh_size) {
- die("Cannot read symbol table: %s\n",
- strerror(errno));
- }
- }
-}
-
-static void read_symtabs(FILE *fp)
-{
- int i,j;
- for (i = 0; i < ehdr.e_shnum; i++) {
- struct section *sec = &secs[i];
- if (sec->shdr.sh_type != SHT_SYMTAB) {
- continue;
- }
- sec->symtab = malloc(sec->shdr.sh_size);
- if (!sec->symtab) {
- die("malloc of %d bytes for symtab failed\n",
- sec->shdr.sh_size);
- }
- if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
- die("Seek to %d failed: %s\n",
- sec->shdr.sh_offset, strerror(errno));
- }
- if (fread(sec->symtab, 1, sec->shdr.sh_size, fp)
- != sec->shdr.sh_size) {
- die("Cannot read symbol table: %s\n",
- strerror(errno));
- }
- for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Sym); j++) {
- Elf32_Sym *sym = &sec->symtab[j];
- sym->st_name = elf32_to_cpu(sym->st_name);
- sym->st_value = elf32_to_cpu(sym->st_value);
- sym->st_size = elf32_to_cpu(sym->st_size);
- sym->st_shndx = elf16_to_cpu(sym->st_shndx);
- }
- }
-}
-
-
-static void read_relocs(FILE *fp)
-{
- int i,j;
- for (i = 0; i < ehdr.e_shnum; i++) {
- struct section *sec = &secs[i];
- if (sec->shdr.sh_type != SHT_REL) {
- continue;
- }
- sec->reltab = malloc(sec->shdr.sh_size);
- if (!sec->reltab) {
- die("malloc of %d bytes for relocs failed\n",
- sec->shdr.sh_size);
- }
- if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
- die("Seek to %d failed: %s\n",
- sec->shdr.sh_offset, strerror(errno));
- }
- if (fread(sec->reltab, 1, sec->shdr.sh_size, fp)
- != sec->shdr.sh_size) {
- die("Cannot read symbol table: %s\n",
- strerror(errno));
- }
- for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) {
- Elf32_Rel *rel = &sec->reltab[j];
- rel->r_offset = elf32_to_cpu(rel->r_offset);
- rel->r_info = elf32_to_cpu(rel->r_info);
- }
- }
-}
-
-
-static void print_absolute_symbols(void)
-{
- int i;
- printf("Absolute symbols\n");
- printf(" Num: Value Size Type Bind Visibility Name\n");
- for (i = 0; i < ehdr.e_shnum; i++) {
- struct section *sec = &secs[i];
- char *sym_strtab;
- Elf32_Sym *sh_symtab;
- int j;
-
- if (sec->shdr.sh_type != SHT_SYMTAB) {
- continue;
- }
- sh_symtab = sec->symtab;
- sym_strtab = sec->link->strtab;
- for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Sym); j++) {
- Elf32_Sym *sym;
- const char *name;
- sym = &sec->symtab[j];
- name = sym_name(sym_strtab, sym);
- if (sym->st_shndx != SHN_ABS) {
- continue;
- }
- printf("%5d %08x %5d %10s %10s %12s %s\n",
- j, sym->st_value, sym->st_size,
- sym_type(ELF32_ST_TYPE(sym->st_info)),
- sym_bind(ELF32_ST_BIND(sym->st_info)),
- sym_visibility(ELF32_ST_VISIBILITY(sym->st_other)),
- name);
- }
- }
- printf("\n");
-}
-
-static void print_absolute_relocs(void)
-{
- int i, printed = 0;
-
- for (i = 0; i < ehdr.e_shnum; i++) {
- struct section *sec = &secs[i];
- struct section *sec_applies, *sec_symtab;
- char *sym_strtab;
- Elf32_Sym *sh_symtab;
- int j;
- if (sec->shdr.sh_type != SHT_REL) {
- continue;
- }
- sec_symtab = sec->link;
- sec_applies = &secs[sec->shdr.sh_info];
- if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) {
- continue;
- }
- sh_symtab = sec_symtab->symtab;
- sym_strtab = sec_symtab->link->strtab;
- for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) {
- Elf32_Rel *rel;
- Elf32_Sym *sym;
- const char *name;
- rel = &sec->reltab[j];
- sym = &sh_symtab[ELF32_R_SYM(rel->r_info)];
- name = sym_name(sym_strtab, sym);
- if (sym->st_shndx != SHN_ABS) {
- continue;
- }
-
- /* Absolute symbols are not relocated if bzImage is
- * loaded at a non-compiled address. Display a warning
- * to user at compile time about the absolute
- * relocations present.
- *
- * User need to audit the code to make sure
- * some symbols which should have been section
- * relative have not become absolute because of some
- * linker optimization or wrong programming usage.
- *
- * Before warning check if this absolute symbol
- * relocation is harmless.
- */
- if (is_safe_abs_reloc(name))
- continue;
-
- if (!printed) {
- printf("WARNING: Absolute relocations"
- " present\n");
- printf("Offset Info Type Sym.Value "
- "Sym.Name\n");
- printed = 1;
- }
-
- printf("%08x %08x %10s %08x %s\n",
- rel->r_offset,
- rel->r_info,
- rel_type(ELF32_R_TYPE(rel->r_info)),
- sym->st_value,
- name);
- }
- }
-
- if (printed)
- printf("\n");
-}
-
-static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym))
-{
- int i;
- /* Walk through the relocations */
- for (i = 0; i < ehdr.e_shnum; i++) {
- char *sym_strtab;
- Elf32_Sym *sh_symtab;
- struct section *sec_applies, *sec_symtab;
- int j;
- struct section *sec = &secs[i];
-
- if (sec->shdr.sh_type != SHT_REL) {
- continue;
- }
- sec_symtab = sec->link;
- sec_applies = &secs[sec->shdr.sh_info];
- if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) {
- continue;
- }
- sh_symtab = sec_symtab->symtab;
- sym_strtab = sec_symtab->link->strtab;
- for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) {
- Elf32_Rel *rel;
- Elf32_Sym *sym;
- unsigned r_type;
- rel = &sec->reltab[j];
- sym = &sh_symtab[ELF32_R_SYM(rel->r_info)];
- r_type = ELF32_R_TYPE(rel->r_info);
- /* Don't visit relocations to absolute symbols */
- if (sym->st_shndx == SHN_ABS) {
- continue;
- }
- if (r_type == R_386_NONE || r_type == R_386_PC32) {
- /*
- * NONE can be ignored and and PC relative
- * relocations don't need to be adjusted.
- */
- }
- else if (r_type == R_386_32) {
- /* Visit relocations that need to be adjusted */
- visit(rel, sym);
- }
- else {
- die("Unsupported relocation type: %d\n", r_type);
- }
- }
- }
-}
-
-static void count_reloc(Elf32_Rel *rel, Elf32_Sym *sym)
-{
- reloc_count += 1;
-}
-
-static void collect_reloc(Elf32_Rel *rel, Elf32_Sym *sym)
-{
- /* Remember the address that needs to be adjusted. */
- relocs[reloc_idx++] = rel->r_offset;
-}
-
-static int cmp_relocs(const void *va, const void *vb)
-{
- const unsigned long *a, *b;
- a = va; b = vb;
- return (*a == *b)? 0 : (*a > *b)? 1 : -1;
-}
-
-static void emit_relocs(int as_text)
-{
- int i;
- /* Count how many relocations I have and allocate space for them. */
- reloc_count = 0;
- walk_relocs(count_reloc);
- relocs = malloc(reloc_count * sizeof(relocs[0]));
- if (!relocs) {
- die("malloc of %d entries for relocs failed\n",
- reloc_count);
- }
- /* Collect up the relocations */
- reloc_idx = 0;
- walk_relocs(collect_reloc);
-
- /* Order the relocations for more efficient processing */
- qsort(relocs, reloc_count, sizeof(relocs[0]), cmp_relocs);
-
- /* Print the relocations */
- if (as_text) {
- /* Print the relocations in a form suitable that
- * gas will like.
- */
- printf(".section \".data.reloc\",\"a\"\n");
- printf(".balign 4\n");
- for (i = 0; i < reloc_count; i++) {
- printf("\t .long 0x%08lx\n", relocs[i]);
- }
- printf("\n");
- }
- else {
- unsigned char buf[4];
- buf[0] = buf[1] = buf[2] = buf[3] = 0;
- /* Print a stop */
- printf("%c%c%c%c", buf[0], buf[1], buf[2], buf[3]);
- /* Now print each relocation */
- for (i = 0; i < reloc_count; i++) {
- buf[0] = (relocs[i] >> 0) & 0xff;
- buf[1] = (relocs[i] >> 8) & 0xff;
- buf[2] = (relocs[i] >> 16) & 0xff;
- buf[3] = (relocs[i] >> 24) & 0xff;
- printf("%c%c%c%c", buf[0], buf[1], buf[2], buf[3]);
- }
- }
-}
-
-static void usage(void)
-{
- die("relocs [--abs-syms |--abs-relocs | --text] vmlinux\n");
-}
-
-int main(int argc, char **argv)
-{
- int show_absolute_syms, show_absolute_relocs;
- int as_text;
- const char *fname;
- FILE *fp;
- int i;
-
- show_absolute_syms = 0;
- show_absolute_relocs = 0;
- as_text = 0;
- fname = NULL;
- for (i = 1; i < argc; i++) {
- char *arg = argv[i];
- if (*arg == '-') {
- if (strcmp(argv[1], "--abs-syms") == 0) {
- show_absolute_syms = 1;
- continue;
- }
-
- if (strcmp(argv[1], "--abs-relocs") == 0) {
- show_absolute_relocs = 1;
- continue;
- }
- else if (strcmp(argv[1], "--text") == 0) {
- as_text = 1;
- continue;
- }
- }
- else if (!fname) {
- fname = arg;
- continue;
- }
- usage();
- }
- if (!fname) {
- usage();
- }
- fp = fopen(fname, "r");
- if (!fp) {
- die("Cannot open %s: %s\n",
- fname, strerror(errno));
- }
- read_ehdr(fp);
- read_shdrs(fp);
- read_strtabs(fp);
- read_symtabs(fp);
- read_relocs(fp);
- if (show_absolute_syms) {
- print_absolute_symbols();
- return 0;
- }
- if (show_absolute_relocs) {
- print_absolute_relocs();
- return 0;
- }
- emit_relocs(as_text);
- return 0;
-}
diff --git a/arch/x86/boot/compressed/sbat.S b/arch/x86/boot/compressed/sbat.S
new file mode 100644
index 000000000000..838f70a997dd
--- /dev/null
+++ b/arch/x86/boot/compressed/sbat.S
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Embed SBAT data in the kernel.
+ */
+ .pushsection ".sbat", "a", @progbits
+ .incbin CONFIG_EFI_SBAT_FILE
+ .popsection
diff --git a/arch/x86/boot/compressed/sev-handle-vc.c b/arch/x86/boot/compressed/sev-handle-vc.c
new file mode 100644
index 000000000000..030001b46554
--- /dev/null
+++ b/arch/x86/boot/compressed/sev-handle-vc.c
@@ -0,0 +1,136 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "misc.h"
+#include "error.h"
+#include "sev.h"
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <asm/insn.h>
+#include <asm/pgtable_types.h>
+#include <asm/ptrace.h>
+#include <asm/sev.h>
+#include <asm/trapnr.h>
+#include <asm/trap_pf.h>
+#include <asm/fpu/xcr.h>
+
+#define __BOOT_COMPRESSED
+#undef __init
+#define __init
+
+/* Basic instruction decoding support needed */
+#include "../../lib/inat.c"
+#include "../../lib/insn.c"
+
+/*
+ * Copy a version of this function here - insn-eval.c can't be used in
+ * pre-decompression code.
+ */
+bool insn_has_rep_prefix(struct insn *insn)
+{
+ insn_byte_t p;
+
+ insn_get_prefixes(insn);
+
+ for_each_insn_prefix(insn, p) {
+ if (p == 0xf2 || p == 0xf3)
+ return true;
+ }
+
+ return false;
+}
+
+enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
+{
+ char buffer[MAX_INSN_SIZE];
+ int ret;
+
+ memcpy(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE);
+
+ ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64);
+ if (ret < 0)
+ return ES_DECODE_FAILED;
+
+ return ES_OK;
+}
+
+extern void sev_insn_decode_init(void) __alias(inat_init_tables);
+
+/*
+ * Only a dummy for insn_get_seg_base() - early boot code is 64-bit only and
+ * doesn't use segments.
+ */
+static unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx)
+{
+ return 0UL;
+}
+
+static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
+ void *dst, char *buf, size_t size)
+{
+ memcpy(dst, buf, size);
+
+ return ES_OK;
+}
+
+static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
+ void *src, char *buf, size_t size)
+{
+ memcpy(buf, src, size);
+
+ return ES_OK;
+}
+
+static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size)
+{
+ return ES_OK;
+}
+
+static bool fault_in_kernel_space(unsigned long address)
+{
+ return false;
+}
+
+#define sev_printk(fmt, ...)
+
+#include "../../coco/sev/vc-shared.c"
+
+void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code)
+{
+ struct es_em_ctxt ctxt;
+ enum es_result result;
+
+ if (!boot_ghcb && !early_setup_ghcb())
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
+
+ vc_ghcb_invalidate(boot_ghcb);
+ result = vc_init_em_ctxt(&ctxt, regs, exit_code);
+ if (result != ES_OK)
+ goto finish;
+
+ result = vc_check_opcode_bytes(&ctxt, exit_code);
+ if (result != ES_OK)
+ goto finish;
+
+ switch (exit_code) {
+ case SVM_EXIT_RDTSC:
+ case SVM_EXIT_RDTSCP:
+ result = vc_handle_rdtsc(boot_ghcb, &ctxt, exit_code);
+ break;
+ case SVM_EXIT_IOIO:
+ result = vc_handle_ioio(boot_ghcb, &ctxt);
+ break;
+ case SVM_EXIT_CPUID:
+ result = vc_handle_cpuid(boot_ghcb, &ctxt);
+ break;
+ default:
+ result = ES_UNSUPPORTED;
+ break;
+ }
+
+finish:
+ if (result == ES_OK)
+ vc_finish_insn(&ctxt);
+ else if (result != ES_RETRY)
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
+}
diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
new file mode 100644
index 000000000000..c8c1464b3a56
--- /dev/null
+++ b/arch/x86/boot/compressed/sev.c
@@ -0,0 +1,512 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * AMD Encrypted Register State Support
+ *
+ * Author: Joerg Roedel <jroedel@suse.de>
+ */
+
+/*
+ * misc.h needs to be first because it knows how to include the other kernel
+ * headers in the pre-decompression code in a way that does not break
+ * compilation.
+ */
+#include "misc.h"
+
+#include <asm/bootparam.h>
+#include <asm/pgtable_types.h>
+#include <asm/shared/msr.h>
+#include <asm/sev.h>
+#include <asm/trapnr.h>
+#include <asm/trap_pf.h>
+#include <asm/msr-index.h>
+#include <asm/fpu/xcr.h>
+#include <asm/ptrace.h>
+#include <asm/svm.h>
+#include <asm/cpuid/api.h>
+
+#include "error.h"
+#include "sev.h"
+
+static struct ghcb boot_ghcb_page __aligned(PAGE_SIZE);
+struct ghcb *boot_ghcb;
+
+#undef __init
+#define __init
+
+#define __BOOT_COMPRESSED
+
+u8 snp_vmpl;
+u16 ghcb_version;
+
+u64 boot_svsm_caa_pa;
+
+/* Include code for early handlers */
+#include "../../boot/startup/sev-shared.c"
+
+static bool sev_snp_enabled(void)
+{
+ return sev_status & MSR_AMD64_SEV_SNP_ENABLED;
+}
+
+void snp_set_page_private(unsigned long paddr)
+{
+ struct psc_desc d = {
+ SNP_PAGE_STATE_PRIVATE,
+ (struct svsm_ca *)boot_svsm_caa_pa,
+ boot_svsm_caa_pa
+ };
+
+ if (!sev_snp_enabled())
+ return;
+
+ __page_state_change(paddr, paddr, &d);
+}
+
+void snp_set_page_shared(unsigned long paddr)
+{
+ struct psc_desc d = {
+ SNP_PAGE_STATE_SHARED,
+ (struct svsm_ca *)boot_svsm_caa_pa,
+ boot_svsm_caa_pa
+ };
+
+ if (!sev_snp_enabled())
+ return;
+
+ __page_state_change(paddr, paddr, &d);
+}
+
+bool early_setup_ghcb(void)
+{
+ if (set_page_decrypted((unsigned long)&boot_ghcb_page))
+ return false;
+
+ /* Page is now mapped decrypted, clear it */
+ memset(&boot_ghcb_page, 0, sizeof(boot_ghcb_page));
+
+ boot_ghcb = &boot_ghcb_page;
+
+ /* Initialize lookup tables for the instruction decoder */
+ sev_insn_decode_init();
+
+	/* An SNP guest requires the GHCB GPA to be registered */
+ if (sev_snp_enabled())
+ snp_register_ghcb_early(__pa(&boot_ghcb_page));
+
+ return true;
+}
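+
+/*
+ * Note that set_page_decrypted() clears the C-bit in the page-table entry of
+ * the GHCB page, making it a shared page that both the guest and the
+ * hypervisor can read and write when emulating #VC exits.
+ */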
+
+void snp_accept_memory(phys_addr_t start, phys_addr_t end)
+{
+ struct psc_desc d = {
+ SNP_PAGE_STATE_PRIVATE,
+ (struct svsm_ca *)boot_svsm_caa_pa,
+ boot_svsm_caa_pa
+ };
+
+ for (phys_addr_t pa = start; pa < end; pa += PAGE_SIZE)
+ __page_state_change(pa, pa, &d);
+}
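+
+/*
+ * Acceptance happens at 4K granularity: accepting a 2 MiB range, for example,
+ * issues 512 individual page state changes via __page_state_change().
+ */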
+
+void sev_es_shutdown_ghcb(void)
+{
+ if (!boot_ghcb)
+ return;
+
+ if (!sev_es_check_cpu_features())
+ error("SEV-ES CPU Features missing.");
+
+ /*
+ * This denotes whether to use the GHCB MSR protocol or the GHCB
+ * shared page to perform a GHCB request. Since the GHCB page is
+ * being changed to encrypted, it can't be used to perform GHCB
+ * requests. Clear the boot_ghcb variable so that the GHCB MSR
+ * protocol is used to change the GHCB page over to an encrypted
+ * page.
+ */
+ boot_ghcb = NULL;
+
+ /*
+ * GHCB Page must be flushed from the cache and mapped encrypted again.
+ * Otherwise the running kernel will see strange cache effects when
+ * trying to use that page.
+ */
+ if (set_page_encrypted((unsigned long)&boot_ghcb_page))
+ error("Can't map GHCB page encrypted");
+
+ /*
+ * GHCB page is mapped encrypted again and flushed from the cache.
+ * Mark it non-present now to catch bugs when #VC exceptions trigger
+ * after this point.
+ */
+ if (set_page_non_present((unsigned long)&boot_ghcb_page))
+ error("Can't unmap GHCB page");
+}
+
+static void __noreturn sev_es_ghcb_terminate(struct ghcb *ghcb, unsigned int set,
+ unsigned int reason, u64 exit_info_2)
+{
+ u64 exit_info_1 = SVM_VMGEXIT_TERM_REASON(set, reason);
+
+ vc_ghcb_invalidate(ghcb);
+ ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_TERM_REQUEST);
+ ghcb_set_sw_exit_info_1(ghcb, exit_info_1);
+ ghcb_set_sw_exit_info_2(ghcb, exit_info_2);
+
+ sev_es_wr_ghcb_msr(__pa(ghcb));
+ VMGEXIT();
+
+ while (true)
+ asm volatile("hlt\n" : : : "memory");
+}
+
+bool sev_es_check_ghcb_fault(unsigned long address)
+{
+ /* Check whether the fault was on the GHCB page */
+ return ((address & PAGE_MASK) == (unsigned long)&boot_ghcb_page);
+}
+
+/*
+ * SNP_FEATURES_IMPL_REQ is the mask of SNP features that will need
+ * guest side implementation for proper functioning of the guest. If any
+ * of these features are enabled in the hypervisor but are lacking guest
+ * side implementation, the behavior of the guest will be undefined. The
+ * guest could fail in non-obvious way making it difficult to debug.
+ * guest could fail in a non-obvious way, making it difficult to debug.
+ *
+ * As the behavior of reserved feature bits is unknown, add them to the
+ * required features mask to be on the safe side.
+ */
+#define SNP_FEATURES_IMPL_REQ (MSR_AMD64_SNP_VTOM | \
+ MSR_AMD64_SNP_REFLECT_VC | \
+ MSR_AMD64_SNP_RESTRICTED_INJ | \
+ MSR_AMD64_SNP_ALT_INJ | \
+ MSR_AMD64_SNP_DEBUG_SWAP | \
+ MSR_AMD64_SNP_VMPL_SSS | \
+ MSR_AMD64_SNP_SECURE_TSC | \
+ MSR_AMD64_SNP_VMGEXIT_PARAM | \
+ MSR_AMD64_SNP_VMSA_REG_PROT | \
+ MSR_AMD64_SNP_RESERVED_BIT13 | \
+ MSR_AMD64_SNP_RESERVED_BIT15 | \
+ MSR_AMD64_SNP_SECURE_AVIC | \
+ MSR_AMD64_SNP_RESERVED_MASK)
+
+#ifdef CONFIG_AMD_SECURE_AVIC
+#define SNP_FEATURE_SECURE_AVIC MSR_AMD64_SNP_SECURE_AVIC
+#else
+#define SNP_FEATURE_SECURE_AVIC 0
+#endif
+
+/*
+ * SNP_FEATURES_PRESENT is the mask of SNP features that are implemented
+ * by the guest kernel. As and when a new feature is implemented in the
+ * guest kernel, a corresponding bit should be added to the mask.
+ */
+#define SNP_FEATURES_PRESENT (MSR_AMD64_SNP_DEBUG_SWAP | \
+ MSR_AMD64_SNP_SECURE_TSC | \
+ SNP_FEATURE_SECURE_AVIC)
+
+u64 snp_get_unsupported_features(u64 status)
+{
+ if (!(status & MSR_AMD64_SEV_SNP_ENABLED))
+ return 0;
+
+ return status & SNP_FEATURES_IMPL_REQ & ~SNP_FEATURES_PRESENT;
+}
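+
+/*
+ * Example: if the hypervisor enables a feature from SNP_FEATURES_IMPL_REQ,
+ * say MSR_AMD64_SNP_VTOM, without a matching bit in SNP_FEATURES_PRESENT,
+ * the returned mask is non-zero and snp_check_features() below terminates
+ * the boot with GHCB_SNP_UNSUPPORTED.
+ */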
+
+void snp_check_features(void)
+{
+ u64 unsupported;
+
+ /*
+ * Terminate the boot if hypervisor has enabled any feature lacking
+ * guest side implementation. Pass on the unsupported features mask through
+ * EXIT_INFO_2 of the GHCB protocol so that those features can be reported
+ * as part of the guest boot failure.
+ */
+ unsupported = snp_get_unsupported_features(sev_status);
+ if (unsupported) {
+ if (ghcb_version < 2 || (!boot_ghcb && !early_setup_ghcb()))
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
+
+ sev_es_ghcb_terminate(boot_ghcb, SEV_TERM_SET_GEN,
+ GHCB_SNP_UNSUPPORTED, unsupported);
+ }
+}
+
+/* Search for Confidential Computing blob in the EFI config table. */
+static struct cc_blob_sev_info *find_cc_blob_efi(struct boot_params *bp)
+{
+ unsigned long cfg_table_pa;
+ unsigned int cfg_table_len;
+ int ret;
+
+ ret = efi_get_conf_table(bp, &cfg_table_pa, &cfg_table_len);
+ if (ret)
+ return NULL;
+
+ return (struct cc_blob_sev_info *)efi_find_vendor_table(bp, cfg_table_pa,
+ cfg_table_len,
+ EFI_CC_BLOB_GUID);
+}
+
+/*
+ * Initial set up of SNP relies on information provided by the
+ * Confidential Computing blob, which can be passed to the boot kernel
+ * by firmware/bootloader in the following ways:
+ *
+ * - via an entry in the EFI config table
+ * - via a setup_data structure, as defined by the Linux Boot Protocol
+ *
+ * Scan for the blob in that order.
+ */
+static struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp)
+{
+ struct cc_blob_sev_info *cc_info;
+
+ cc_info = find_cc_blob_efi(bp);
+ if (cc_info)
+ goto found_cc_info;
+
+ cc_info = find_cc_blob_setup_data(bp);
+ if (!cc_info)
+ return NULL;
+
+found_cc_info:
+ if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC)
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
+
+ return cc_info;
+}
+
+/*
+ * Indicate SNP based on presence of SNP-specific CC blob. Subsequent checks
+ * will verify the SNP CPUID/MSR bits.
+ */
+static bool early_snp_init(struct boot_params *bp)
+{
+ struct cc_blob_sev_info *cc_info;
+
+ if (!bp)
+ return false;
+
+ cc_info = find_cc_blob(bp);
+ if (!cc_info)
+ return false;
+
+ /*
+ * If a SNP-specific Confidential Computing blob is present, then
+ * firmware/bootloader have indicated SNP support. Verifying this
+ * involves CPUID checks which will be more reliable if the SNP
+	 * CPUID table is used. See comments over setup_cpuid_table() for
+ * more details.
+ */
+ setup_cpuid_table(cc_info);
+
+ /*
+ * Record the SVSM Calling Area (CA) address if the guest is not
+ * running at VMPL0. The CA will be used to communicate with the
+ * SVSM and request its services.
+ */
+ svsm_setup_ca(cc_info, rip_rel_ptr(&boot_ghcb_page));
+
+ /*
+ * Pass run-time kernel a pointer to CC info via boot_params so EFI
+ * config table doesn't need to be searched again during early startup
+ * phase.
+ */
+ bp->cc_blob_address = (u32)(unsigned long)cc_info;
+
+ return true;
+}
+
+/*
+ * sev_check_cpu_support - Check for SEV support in the CPU capabilities
+ *
+ * Returns < 0 if SEV is not supported, otherwise the position of the
+ * encryption bit in the page table descriptors.
+ */
+static int sev_check_cpu_support(void)
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ /* Check for the SME/SEV support leaf */
+ eax = 0x80000000;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ if (eax < 0x8000001f)
+ return -ENODEV;
+
+ /*
+ * Check for the SME/SEV feature:
+ * CPUID Fn8000_001F[EAX]
+ * - Bit 0 - Secure Memory Encryption support
+ * - Bit 1 - Secure Encrypted Virtualization support
+ * CPUID Fn8000_001F[EBX]
+ * - Bits 5:0 - Pagetable bit position used to indicate encryption
+ */
+ eax = 0x8000001f;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ /* Check whether SEV is supported */
+ if (!(eax & BIT(1)))
+ return -ENODEV;
+
+ sev_snp_needs_sfw = !(ebx & BIT(31));
+
+ return ebx & 0x3f;
+}
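+
+/*
+ * Example: on a part where CPUID 0x8000001F EBX[5:0] reports 47, the
+ * encryption bit is page-table bit 47 and sev_enable() below ends up setting
+ * sme_me_mask = 1ULL << 47.
+ */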
+
+void sev_enable(struct boot_params *bp)
+{
+ struct msr m;
+ int bitpos;
+ bool snp;
+
+ /*
+ * bp->cc_blob_address should only be set by boot/compressed kernel.
+ * Initialize it to 0 to ensure that uninitialized values from
+ * buggy bootloaders aren't propagated.
+ */
+ if (bp)
+ bp->cc_blob_address = 0;
+
+ /*
+	 * Do an initial SEV capability check before early_snp_init(), which
+	 * loads the SNP CPUID page; the same checks are then repeated with
+	 * that page in place, where they no longer depend on the hypervisor
+	 * and are therefore trustworthy.
+ *
+ * If the HV fakes SEV support, the guest will crash'n'burn
+ * which is good enough.
+ */
+
+ if (sev_check_cpu_support() < 0)
+ return;
+
+ /*
+ * Setup/preliminary detection of SNP. This will be sanity-checked
+ * against CPUID/MSR values later.
+ */
+ snp = early_snp_init(bp);
+
+ /* Now repeat the checks with the SNP CPUID table. */
+
+ bitpos = sev_check_cpu_support();
+ if (bitpos < 0) {
+ if (snp)
+ error("SEV-SNP support indicated by CC blob, but not CPUID.");
+ return;
+ }
+
+ /* Set the SME mask if this is an SEV guest. */
+ raw_rdmsr(MSR_AMD64_SEV, &m);
+ sev_status = m.q;
+ if (!(sev_status & MSR_AMD64_SEV_ENABLED))
+ return;
+
+ /* Negotiate the GHCB protocol version. */
+ if (sev_status & MSR_AMD64_SEV_ES_ENABLED) {
+ if (!sev_es_negotiate_protocol())
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_PROT_UNSUPPORTED);
+ }
+
+ /*
+ * SNP is supported in v2 of the GHCB spec which mandates support for HV
+ * features.
+ */
+ if (sev_status & MSR_AMD64_SEV_SNP_ENABLED) {
+ u64 hv_features;
+
+ hv_features = get_hv_features();
+ if (!(hv_features & GHCB_HV_FT_SNP))
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
+
+ /*
+ * Running at VMPL0 is required unless an SVSM is present and
+ * the hypervisor supports the required SVSM GHCB events.
+ */
+ if (snp_vmpl && !(hv_features & GHCB_HV_FT_SNP_MULTI_VMPL))
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_NOT_VMPL0);
+ }
+
+ if (snp && !(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
+		error("SEV-SNP support indicated by CC blob, but not SEV status MSR.");
+
+ sme_me_mask = BIT_ULL(bitpos);
+}
+
+/*
+ * sev_get_status - Retrieve the SEV status mask
+ *
+ * Returns 0 if the CPU is not SEV capable, otherwise the value of the
+ * AMD64_SEV MSR.
+ */
+u64 sev_get_status(void)
+{
+ struct msr m;
+
+ if (sev_check_cpu_support() < 0)
+ return 0;
+
+ raw_rdmsr(MSR_AMD64_SEV, &m);
+ return m.q;
+}
+
+void sev_prep_identity_maps(unsigned long top_level_pgt)
+{
+ /*
+	 * The Confidential Computing blob is used very early in the
+	 * uncompressed kernel to find the in-memory CPUID table to handle CPUID
+ * instructions. Make sure an identity-mapping exists so it can be
+ * accessed after switchover.
+ */
+ if (sev_snp_enabled()) {
+ unsigned long cc_info_pa = boot_params_ptr->cc_blob_address;
+ struct cc_blob_sev_info *cc_info;
+
+ kernel_add_identity_map(cc_info_pa, cc_info_pa + sizeof(*cc_info));
+
+ cc_info = (struct cc_blob_sev_info *)cc_info_pa;
+ kernel_add_identity_map(cc_info->cpuid_phys, cc_info->cpuid_phys + cc_info->cpuid_len);
+ }
+
+ sev_verify_cbit(top_level_pgt);
+}
+
+bool early_is_sevsnp_guest(void)
+{
+ static bool sevsnp;
+
+ if (sevsnp)
+ return true;
+
+ if (!(sev_get_status() & MSR_AMD64_SEV_SNP_ENABLED))
+ return false;
+
+ sevsnp = true;
+
+ if (!snp_vmpl) {
+ unsigned int eax, ebx, ecx, edx;
+
+ /*
+ * CPUID Fn8000_001F_EAX[28] - SVSM support
+ */
+ eax = 0x8000001f;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ if (eax & BIT(28)) {
+ struct msr m;
+
+ /* Obtain the address of the calling area to use */
+ raw_rdmsr(MSR_SVSM_CAA, &m);
+ boot_svsm_caa_pa = m.q;
+
+ /*
+ * The real VMPL level cannot be discovered, but the
+ * memory acceptance routines make no use of that so
+ * any non-zero value suffices here.
+ */
+ snp_vmpl = U8_MAX;
+ }
+ }
+ return true;
+}
diff --git a/arch/x86/boot/compressed/sev.h b/arch/x86/boot/compressed/sev.h
new file mode 100644
index 000000000000..22637b416b46
--- /dev/null
+++ b/arch/x86/boot/compressed/sev.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * AMD SEV header for early boot related functions.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ */
+
+#ifndef BOOT_COMPRESSED_SEV_H
+#define BOOT_COMPRESSED_SEV_H
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+
+#include <asm/shared/msr.h>
+
+void snp_accept_memory(phys_addr_t start, phys_addr_t end);
+u64 sev_get_status(void);
+bool early_is_sevsnp_guest(void);
+
+static inline u64 sev_es_rd_ghcb_msr(void)
+{
+ struct msr m;
+
+ raw_rdmsr(MSR_AMD64_SEV_ES_GHCB, &m);
+
+ return m.q;
+}
+
+static inline void sev_es_wr_ghcb_msr(u64 val)
+{
+ struct msr m;
+
+ m.q = val;
+ raw_wrmsr(MSR_AMD64_SEV_ES_GHCB, &m);
+}
+
+#else
+
+static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { }
+static inline u64 sev_get_status(void) { return 0; }
+static inline bool early_is_sevsnp_guest(void) { return false; }
+
+#endif
+
+#endif
diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c
new file mode 100644
index 000000000000..9af19d9614cb
--- /dev/null
+++ b/arch/x86/boot/compressed/string.c
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This provides an optimized implementation of memcpy, and a simplified
+ * implementation of memset and memmove. These are used here because the
+ * standard kernel runtime versions are not yet available and we don't
+ * trust the gcc built-in implementations as they may do unexpected things
+ * (e.g. FPU ops) in the minimal decompression stub execution environment.
+ */
+#include "error.h"
+
+#include "../string.c"
+
+#ifdef CONFIG_X86_32
+static void *____memcpy(void *dest, const void *src, size_t n)
+{
+ int d0, d1, d2;
+ asm volatile(
+ "rep movsl\n\t"
+ "movl %4,%%ecx\n\t"
+ "rep movsb"
+ : "=&c" (d0), "=&D" (d1), "=&S" (d2)
+ : "0" (n >> 2), "g" (n & 3), "1" (dest), "2" (src)
+ : "memory");
+
+ return dest;
+}
+#else
+static void *____memcpy(void *dest, const void *src, size_t n)
+{
+ long d0, d1, d2;
+ asm volatile(
+ "rep movsq\n\t"
+ "movq %4,%%rcx\n\t"
+ "rep movsb"
+ : "=&c" (d0), "=&D" (d1), "=&S" (d2)
+ : "0" (n >> 3), "g" (n & 7), "1" (dest), "2" (src)
+ : "memory");
+
+ return dest;
+}
+#endif
+
+void *memset(void *s, int c, size_t n)
+{
+ int i;
+ char *ss = s;
+
+ for (i = 0; i < n; i++)
+ ss[i] = c;
+ return s;
+}
+
+void *memmove(void *dest, const void *src, size_t n)
+{
+ unsigned char *d = dest;
+ const unsigned char *s = src;
+
+ if (d <= s || d - s >= n)
+ return ____memcpy(dest, src, n);
+
+ while (n-- > 0)
+ d[n] = s[n];
+
+ return dest;
+}
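+
+/*
+ * The forward copy is safe when the regions do not overlap or the destination
+ * lies below the source; e.g. dest == src + 1 with n == 4 overlaps in the
+ * unsafe direction, so the bytes are copied backwards instead.
+ */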
+
+/* Detect and warn about potential overlaps, but handle them with memmove. */
+void *memcpy(void *dest, const void *src, size_t n)
+{
+ if (dest > src && dest - src < n) {
+ warn("Avoiding potentially unsafe overlapping memcpy()!");
+ return memmove(dest, src, n);
+ }
+ return ____memcpy(dest, src, n);
+}
+
+#ifdef CONFIG_KASAN
+extern void *__memset(void *s, int c, size_t n) __alias(memset);
+extern void *__memmove(void *dest, const void *src, size_t n) __alias(memmove);
+extern void *__memcpy(void *dest, const void *src, size_t n) __alias(memcpy);
+#endif
diff --git a/arch/x86/boot/compressed/tdcall.S b/arch/x86/boot/compressed/tdcall.S
new file mode 100644
index 000000000000..46d0495e0d3a
--- /dev/null
+++ b/arch/x86/boot/compressed/tdcall.S
@@ -0,0 +1,3 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include "../../coco/tdx/tdcall.S"
diff --git a/arch/x86/boot/compressed/tdx-shared.c b/arch/x86/boot/compressed/tdx-shared.c
new file mode 100644
index 000000000000..5ac43762fe13
--- /dev/null
+++ b/arch/x86/boot/compressed/tdx-shared.c
@@ -0,0 +1,2 @@
+#include "error.h"
+#include "../../coco/tdx/tdx-shared.c"
diff --git a/arch/x86/boot/compressed/tdx.c b/arch/x86/boot/compressed/tdx.c
new file mode 100644
index 000000000000..8451d6a1030c
--- /dev/null
+++ b/arch/x86/boot/compressed/tdx.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "../cpuflags.h"
+#include "../string.h"
+#include "../io.h"
+#include "error.h"
+
+#include <vdso/limits.h>
+#include <uapi/asm/vmx.h>
+
+#include <asm/shared/tdx.h>
+
+/* Called from __tdx_hypercall() for unrecoverable failure */
+void __tdx_hypercall_failed(void)
+{
+ error("TDVMCALL failed. TDX module bug?");
+}
+
+static inline unsigned int tdx_io_in(int size, u16 port)
+{
+ struct tdx_module_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = hcall_func(EXIT_REASON_IO_INSTRUCTION),
+ .r12 = size,
+ .r13 = 0,
+ .r14 = port,
+ };
+
+ if (__tdx_hypercall(&args))
+ return UINT_MAX;
+
+ return args.r11;
+}
+
+static inline void tdx_io_out(int size, u16 port, u32 value)
+{
+ struct tdx_module_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = hcall_func(EXIT_REASON_IO_INSTRUCTION),
+ .r12 = size,
+ .r13 = 1,
+ .r14 = port,
+ .r15 = value,
+ };
+
+ __tdx_hypercall(&args);
+}
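+
+/*
+ * In both hypercalls %r12 carries the access size in bytes, %r13 the
+ * direction (0 = in, 1 = out), %r14 the port and %r15 the value to write;
+ * for an input the result comes back in %r11. A failed TDVMCALL makes
+ * tdx_io_in() return UINT_MAX, i.e. all bits set, much as a read from a
+ * missing device would.
+ */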
+
+static inline u8 tdx_inb(u16 port)
+{
+ return tdx_io_in(1, port);
+}
+
+static inline void tdx_outb(u8 value, u16 port)
+{
+ tdx_io_out(1, port, value);
+}
+
+static inline void tdx_outw(u16 value, u16 port)
+{
+ tdx_io_out(2, port, value);
+}
+
+void early_tdx_detect(void)
+{
+ u32 eax, sig[3];
+
+ cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax, &sig[0], &sig[2], &sig[1]);
+
+ if (memcmp(TDX_IDENT, sig, sizeof(sig)))
+ return;
+
+ /* Use hypercalls instead of I/O instructions */
+ pio_ops.f_inb = tdx_inb;
+ pio_ops.f_outb = tdx_outb;
+ pio_ops.f_outw = tdx_outw;
+}
diff --git a/arch/x86/boot/compressed/tdx.h b/arch/x86/boot/compressed/tdx.h
new file mode 100644
index 000000000000..9055482cd35c
--- /dev/null
+++ b/arch/x86/boot/compressed/tdx.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef BOOT_COMPRESSED_TDX_H
+#define BOOT_COMPRESSED_TDX_H
+
+#include <linux/types.h>
+
+#ifdef CONFIG_INTEL_TDX_GUEST
+void early_tdx_detect(void);
+#else
+static inline void early_tdx_detect(void) { };
+#endif
+
+#endif /* BOOT_COMPRESSED_TDX_H */
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
index cc353e1b3ffd..587ce3e7c504 100644
--- a/arch/x86/boot/compressed/vmlinux.lds.S
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -1,7 +1,11 @@
-OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <asm-generic/vmlinux.lds.h>
+
+OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT)
#undef i386
+#include <asm/cache.h>
#include <asm/page_types.h>
#ifdef CONFIG_X86_64
@@ -18,18 +22,19 @@ SECTIONS
* address 0.
*/
. = 0;
- .text.head : {
+ .head.text : {
_head = . ;
- *(.text.head)
+ HEAD_TEXT
_ehead = . ;
}
- .rodata.compressed : {
- *(.rodata.compressed)
+ .rodata..compressed : {
+ *(.rodata..compressed)
}
.text : {
_text = .; /* Text */
*(.text)
*(.text.*)
+ *(.noinstr.text)
_etext = . ;
}
.rodata : {
@@ -38,13 +43,24 @@ SECTIONS
*(.rodata.*)
_erodata = . ;
}
- .data : {
+#ifdef CONFIG_EFI_SBAT
+ .sbat : ALIGN(0x1000) {
+ _sbat = . ;
+ *(.sbat)
+ _esbat = ALIGN(0x1000);
+ . = _esbat;
+ }
+#endif
+ .data : ALIGN(0x1000) {
_data = . ;
*(.data)
*(.data.*)
+
+ /* Add 4 bytes of extra space for the obsolete CRC-32 checksum */
+ . = ALIGN(. + 4, 0x200);
_edata = . ;
}
- . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+ . = ALIGN(L1_CACHE_BYTES);
.bss : {
_bss = . ;
*(.bss)
@@ -61,5 +77,52 @@ SECTIONS
_epgtable = . ;
}
#endif
+ . = ALIGN(PAGE_SIZE); /* keep ZO size page aligned */
_end = .;
+
+ STABS_DEBUG
+ DWARF_DEBUG
+ ELF_DETAILS
+
+ DISCARDS
+ /DISCARD/ : {
+ *(.dynamic) *(.dynsym) *(.dynstr) *(.dynbss)
+ *(.hash) *(.gnu.hash)
+ *(.note.*)
+ }
+
+ .got.plt (INFO) : {
+ *(.got.plt)
+ }
+ ASSERT(SIZEOF(.got.plt) == 0 ||
+#ifdef CONFIG_X86_64
+ SIZEOF(.got.plt) == 0x18,
+#else
+ SIZEOF(.got.plt) == 0xc,
+#endif
+ "Unexpected GOT/PLT entries detected!")
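+
+	/*
+	 * 0x18 and 0xc correspond to the three GOT/PLT header slots the
+	 * toolchain always reserves: 3 * 8 bytes on 64-bit and 3 * 4 bytes on
+	 * 32-bit. Anything beyond that indicates unexpected run-time
+	 * relocations.
+	 */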
+
+ /*
+ * Sections that should stay zero sized, which is safer to
+ * explicitly check instead of blindly discarding.
+ */
+ .got : {
+ *(.got)
+ }
+ ASSERT(SIZEOF(.got) == 0, "Unexpected GOT entries detected!")
+
+ .plt : {
+ *(.plt) *(.plt.*)
+ }
+ ASSERT(SIZEOF(.plt) == 0, "Unexpected run-time procedure linkages detected!")
+
+ .rel.dyn : {
+ *(.rel.*) *(.rel_*)
+ }
+ ASSERT(SIZEOF(.rel.dyn) == 0, "Unexpected run-time relocations (.rel) detected!")
+
+ .rela.dyn : {
+ *(.rela.*) *(.rela_*)
+ }
+ ASSERT(SIZEOF(.rela.dyn) == 0, "Unexpected run-time relocations (.rela) detected!")
}
diff --git a/arch/x86/boot/copy.S b/arch/x86/boot/copy.S
index 11f272c6f5e9..3973a67cd04e 100644
--- a/arch/x86/boot/copy.S
+++ b/arch/x86/boot/copy.S
@@ -1,11 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/* ----------------------------------------------------------------------- *
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007 rPath, Inc. - All Rights Reserved
*
- * This file is part of the Linux kernel, and is made available under
- * the terms of the GNU General Public License version 2.
- *
* ----------------------------------------------------------------------- */
#include <linux/linkage.h>
@@ -14,74 +12,54 @@
* Memory copy routines
*/
- .code16gcc
+ .code16
.text
-GLOBAL(memcpy)
+SYM_FUNC_START_NOALIGN(memcpy)
pushw %si
pushw %di
movw %ax, %di
movw %dx, %si
pushw %cx
shrw $2, %cx
- rep; movsl
+ rep movsl
popw %cx
andw $3, %cx
- rep; movsb
+ rep movsb
popw %di
popw %si
- ret
-ENDPROC(memcpy)
+ retl
+SYM_FUNC_END(memcpy)
-GLOBAL(memset)
+SYM_FUNC_START_NOALIGN(memset)
pushw %di
movw %ax, %di
movzbl %dl, %eax
imull $0x01010101,%eax
pushw %cx
shrw $2, %cx
- rep; stosl
+ rep stosl
popw %cx
andw $3, %cx
- rep; stosb
+ rep stosb
popw %di
- ret
-ENDPROC(memset)
+ retl
+SYM_FUNC_END(memset)
-GLOBAL(copy_from_fs)
+SYM_FUNC_START_NOALIGN(copy_from_fs)
pushw %ds
pushw %fs
popw %ds
- call memcpy
+ calll memcpy
popw %ds
- ret
-ENDPROC(copy_from_fs)
+ retl
+SYM_FUNC_END(copy_from_fs)
-GLOBAL(copy_to_fs)
+SYM_FUNC_START_NOALIGN(copy_to_fs)
pushw %es
pushw %fs
popw %es
- call memcpy
- popw %es
- ret
-ENDPROC(copy_to_fs)
-
-#if 0 /* Not currently used, but can be enabled as needed */
-GLOBAL(copy_from_gs)
- pushw %ds
- pushw %gs
- popw %ds
- call memcpy
- popw %ds
- ret
-ENDPROC(copy_from_gs)
-
-GLOBAL(copy_to_gs)
- pushw %es
- pushw %gs
- popw %es
- call memcpy
+ calll memcpy
popw %es
- ret
-ENDPROC(copy_to_gs)
-#endif
+ retl
+SYM_FUNC_END(copy_to_fs)
diff --git a/arch/x86/boot/cpu.c b/arch/x86/boot/cpu.c
index 6ec6bb6e9957..feb6dbd7ca86 100644
--- a/arch/x86/boot/cpu.c
+++ b/arch/x86/boot/cpu.c
@@ -1,11 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* -*- linux-c -*- ------------------------------------------------------- *
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007-2008 rPath, Inc. - All Rights Reserved
*
- * This file is part of the Linux kernel, and is made available under
- * the terms of the GNU General Public License version 2.
- *
* ----------------------------------------------------------------------- */
/*
@@ -32,11 +30,37 @@ static char *cpu_name(int level)
}
}
+static void show_cap_strs(u32 *err_flags)
+{
+ int i, j;
+ const unsigned char *msg_strs = (const unsigned char *)x86_cap_strs;
+ for (i = 0; i < NCAPINTS; i++) {
+ u32 e = err_flags[i];
+ for (j = 0; j < 32; j++) {
+ if (msg_strs[0] < i ||
+ (msg_strs[0] == i && msg_strs[1] < j)) {
+ /* Skip to the next string */
+ msg_strs += 2;
+ while (*msg_strs++)
+ ;
+ }
+ if (e & 1) {
+ if (msg_strs[0] == i &&
+ msg_strs[1] == j &&
+ msg_strs[2])
+ printf("%s ", msg_strs+2);
+ else
+ printf("%d:%d ", i, j);
+ }
+ e >>= 1;
+ }
+ }
+}
+
int validate_cpu(void)
{
u32 *err_flags;
int cpu_level, req_level;
- const unsigned char *msg_strs;
check_cpu(&cpu_level, &req_level, &err_flags);
@@ -49,36 +73,13 @@ int validate_cpu(void)
}
if (err_flags) {
- int i, j;
puts("This kernel requires the following features "
"not present on the CPU:\n");
-
- msg_strs = (const unsigned char *)x86_cap_strs;
-
- for (i = 0; i < NCAPINTS; i++) {
- u32 e = err_flags[i];
-
- for (j = 0; j < 32; j++) {
- if (msg_strs[0] < i ||
- (msg_strs[0] == i && msg_strs[1] < j)) {
- /* Skip to the next string */
- msg_strs += 2;
- while (*msg_strs++)
- ;
- }
- if (e & 1) {
- if (msg_strs[0] == i &&
- msg_strs[1] == j &&
- msg_strs[2])
- printf("%s ", msg_strs+2);
- else
- printf("%d:%d ", i, j);
- }
- e >>= 1;
- }
- }
+ show_cap_strs(err_flags);
putchar('\n');
return -1;
+ } else if (check_knl_erratum()) {
+ return -1;
} else {
return 0;
}
diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c
index 4d3ff037201f..2e1bb936cba2 100644
--- a/arch/x86/boot/cpucheck.c
+++ b/arch/x86/boot/cpucheck.c
@@ -1,11 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* -*- linux-c -*- ------------------------------------------------------- *
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007 rPath, Inc. - All Rights Reserved
*
- * This file is part of the Linux kernel, and is made available under
- * the terms of the GNU General Public License version 2.
- *
* ----------------------------------------------------------------------- */
/*
@@ -24,12 +22,14 @@
# include "boot.h"
#endif
#include <linux/types.h>
+#include <asm/cpufeaturemasks.h>
+#include <asm/intel-family.h>
#include <asm/processor-flags.h>
-#include <asm/required-features.h>
#include <asm/msr-index.h>
+#include <asm/shared/msr.h>
+
+#include "string.h"
-struct cpu_features cpu;
-static u32 cpu_vendor[3];
static u32 err_flags[NCAPINTS];
static const int req_level = CONFIG_X86_MINIMUM_CPU_FAMILY;
@@ -44,6 +44,15 @@ static const u32 req_flags[NCAPINTS] =
0, /* REQUIRED_MASK5 not implemented in this file */
REQUIRED_MASK6,
0, /* REQUIRED_MASK7 not implemented in this file */
+ 0, /* REQUIRED_MASK8 not implemented in this file */
+ 0, /* REQUIRED_MASK9 not implemented in this file */
+ 0, /* REQUIRED_MASK10 not implemented in this file */
+ 0, /* REQUIRED_MASK11 not implemented in this file */
+ 0, /* REQUIRED_MASK12 not implemented in this file */
+ 0, /* REQUIRED_MASK13 not implemented in this file */
+ 0, /* REQUIRED_MASK14 not implemented in this file */
+ 0, /* REQUIRED_MASK15 not implemented in this file */
+ REQUIRED_MASK16,
};
#define A32(a, b, c, d) (((d) << 24)+((c) << 16)+((b) << 8)+(a))
@@ -69,92 +78,15 @@ static int is_transmeta(void)
cpu_vendor[2] == A32('M', 'x', '8', '6');
}
-static int has_fpu(void)
+static int is_intel(void)
{
- u16 fcw = -1, fsw = -1;
- u32 cr0;
-
- asm("movl %%cr0,%0" : "=r" (cr0));
- if (cr0 & (X86_CR0_EM|X86_CR0_TS)) {
- cr0 &= ~(X86_CR0_EM|X86_CR0_TS);
- asm volatile("movl %0,%%cr0" : : "r" (cr0));
- }
-
- asm volatile("fninit ; fnstsw %0 ; fnstcw %1"
- : "+m" (fsw), "+m" (fcw));
-
- return fsw == 0 && (fcw & 0x103f) == 0x003f;
-}
-
-static int has_eflag(u32 mask)
-{
- u32 f0, f1;
-
- asm("pushfl ; "
- "pushfl ; "
- "popl %0 ; "
- "movl %0,%1 ; "
- "xorl %2,%1 ; "
- "pushl %1 ; "
- "popfl ; "
- "pushfl ; "
- "popl %1 ; "
- "popfl"
- : "=&r" (f0), "=&r" (f1)
- : "ri" (mask));
-
- return !!((f0^f1) & mask);
-}
-
-static void get_flags(void)
-{
- u32 max_intel_level, max_amd_level;
- u32 tfms;
-
- if (has_fpu())
- set_bit(X86_FEATURE_FPU, cpu.flags);
-
- if (has_eflag(X86_EFLAGS_ID)) {
- asm("cpuid"
- : "=a" (max_intel_level),
- "=b" (cpu_vendor[0]),
- "=d" (cpu_vendor[1]),
- "=c" (cpu_vendor[2])
- : "a" (0));
-
- if (max_intel_level >= 0x00000001 &&
- max_intel_level <= 0x0000ffff) {
- asm("cpuid"
- : "=a" (tfms),
- "=c" (cpu.flags[4]),
- "=d" (cpu.flags[0])
- : "a" (0x00000001)
- : "ebx");
- cpu.level = (tfms >> 8) & 15;
- cpu.model = (tfms >> 4) & 15;
- if (cpu.level >= 6)
- cpu.model += ((tfms >> 16) & 0xf) << 4;
- }
-
- asm("cpuid"
- : "=a" (max_amd_level)
- : "a" (0x80000000)
- : "ebx", "ecx", "edx");
-
- if (max_amd_level >= 0x80000001 &&
- max_amd_level <= 0x8000ffff) {
- u32 eax = 0x80000001;
- asm("cpuid"
- : "+a" (eax),
- "=c" (cpu.flags[6]),
- "=d" (cpu.flags[1])
- : : "ebx");
- }
- }
+ return cpu_vendor[0] == A32('G', 'e', 'n', 'u') &&
+ cpu_vendor[1] == A32('i', 'n', 'e', 'I') &&
+ cpu_vendor[2] == A32('n', 't', 'e', 'l');
}
/* Returns a bitmask of which words we have error bits in */
-static int check_flags(void)
+static int check_cpuflags(void)
{
u32 err;
int i;
@@ -181,14 +113,14 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr)
{
int err;
- memset(&cpu.flags, 0, sizeof cpu.flags);
+ memset(&cpu.flags, 0, sizeof(cpu.flags));
cpu.level = 3;
if (has_eflag(X86_EFLAGS_AC))
cpu.level = 4;
- get_flags();
- err = check_flags();
+ get_cpuflags();
+ err = check_cpuflags();
if (test_bit(X86_FEATURE_LM, cpu.flags))
cpu.level = 64;
@@ -200,46 +132,60 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr)
/* If this is an AMD and we're only missing SSE+SSE2, try to
turn them on */
- u32 ecx = MSR_K7_HWCR;
- u32 eax, edx;
+ struct msr m;
- asm("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx));
- eax &= ~(1 << 15);
- asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx));
+ raw_rdmsr(MSR_K7_HWCR, &m);
+ m.l &= ~(1 << 15);
+ raw_wrmsr(MSR_K7_HWCR, &m);
- get_flags(); /* Make sure it really did something */
- err = check_flags();
+ get_cpuflags(); /* Make sure it really did something */
+ err = check_cpuflags();
} else if (err == 0x01 &&
!(err_flags[0] & ~(1 << X86_FEATURE_CX8)) &&
is_centaur() && cpu.model >= 6) {
/* If this is a VIA C3, we might have to enable CX8
explicitly */
- u32 ecx = MSR_VIA_FCR;
- u32 eax, edx;
+ struct msr m;
- asm("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx));
- eax |= (1<<1)|(1<<7);
- asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx));
+ raw_rdmsr(MSR_VIA_FCR, &m);
+ m.l |= (1 << 1) | (1 << 7);
+ raw_wrmsr(MSR_VIA_FCR, &m);
set_bit(X86_FEATURE_CX8, cpu.flags);
- err = check_flags();
+ err = check_cpuflags();
} else if (err == 0x01 && is_transmeta()) {
/* Transmeta might have masked feature bits in word 0 */
- u32 ecx = 0x80860004;
- u32 eax, edx;
+ struct msr m, m_tmp;
u32 level = 1;
- asm("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx));
- asm("wrmsr" : : "a" (~0), "d" (edx), "c" (ecx));
+ raw_rdmsr(0x80860004, &m);
+ m_tmp = m;
+ m_tmp.l = ~0;
+ raw_wrmsr(0x80860004, &m_tmp);
asm("cpuid"
: "+a" (level), "=d" (cpu.flags[0])
: : "ecx", "ebx");
- asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx));
+ raw_wrmsr(0x80860004, &m);
- err = check_flags();
+ err = check_cpuflags();
+ } else if (err == 0x01 &&
+ !(err_flags[0] & ~(1 << X86_FEATURE_PAE)) &&
+ is_intel() && cpu.level == 6 &&
+ (cpu.model == 9 || cpu.model == 13)) {
+ /* PAE is disabled on this Pentium M but can be forced */
+ if (cmdline_find_option_bool("forcepae")) {
+ puts("WARNING: Forcing PAE in CPU flags\n");
+ set_bit(X86_FEATURE_PAE, cpu.flags);
+ err = check_cpuflags();
+		} else {
+ puts("WARNING: PAE disabled. Use parameter 'forcepae' to enable at your own risk!\n");
+ }
}
+ if (!err)
+ err = check_knl_erratum();
if (err_flags_ptr)
*err_flags_ptr = err ? err_flags : NULL;
@@ -250,3 +196,33 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr)
return (cpu.level < req_level || err) ? -1 : 0;
}
+
+int check_knl_erratum(void)
+{
+ /*
+ * First check for the affected model/family:
+ */
+ if (!is_intel() ||
+ cpu.family != 6 ||
+ cpu.model != 0x57 /*INTEL_XEON_PHI_KNL*/)
+ return 0;
+
+ /*
+ * This erratum affects the Accessed/Dirty bits, and can
+ * cause stray bits to be set in !Present PTEs. We have
+ * enough bits in our 64-bit PTEs (which we have on real
+ * 64-bit mode or PAE) to avoid using these troublesome
+ * bits. But, we do not have enough space in our 32-bit
+ * PTEs. So, refuse to run on 32-bit non-PAE kernels.
+ */
+ if (IS_ENABLED(CONFIG_X86_64) || IS_ENABLED(CONFIG_X86_PAE))
+ return 0;
+
+ puts("This 32-bit kernel can not run on this Xeon Phi x200\n"
+ "processor due to a processor erratum. Use a 64-bit\n"
+ "kernel, or enable PAE in this 32-bit kernel.\n\n");
+
+ return -1;
+}
diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c
new file mode 100644
index 000000000000..916bac09b464
--- /dev/null
+++ b/arch/x86/boot/cpuflags.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/types.h>
+#include "bitops.h"
+
+#include <asm/processor-flags.h>
+#include <asm/msr-index.h>
+#include "cpuflags.h"
+
+struct cpu_features cpu;
+u32 cpu_vendor[3];
+
+static bool loaded_flags;
+
+static int has_fpu(void)
+{
+ u16 fcw = -1, fsw = -1;
+ unsigned long cr0;
+
+ asm volatile("mov %%cr0,%0" : "=r" (cr0));
+ if (cr0 & (X86_CR0_EM|X86_CR0_TS)) {
+ cr0 &= ~(X86_CR0_EM|X86_CR0_TS);
+ asm volatile("mov %0,%%cr0" : : "r" (cr0));
+ }
+
+ asm volatile("fninit ; fnstsw %0 ; fnstcw %1"
+ : "+m" (fsw), "+m" (fcw));
+
+ return fsw == 0 && (fcw & 0x103f) == 0x003f;
+}
+
+#ifdef CONFIG_X86_32
+/*
+ * For building the 16-bit code we want to explicitly specify 32-bit
+ * push/pop operations, rather than just saying 'pushf' or 'popf' and
+ * letting the assembler choose the operand size.
+ */
+bool has_eflag(unsigned long mask)
+{
+ unsigned long f0, f1;
+
+ asm volatile("pushfl \n\t"
+ "pushfl \n\t"
+ "pop %0 \n\t"
+ "mov %0,%1 \n\t"
+ "xor %2,%1 \n\t"
+ "push %1 \n\t"
+ "popfl \n\t"
+ "pushfl \n\t"
+ "pop %1 \n\t"
+ "popfl"
+ : "=&r" (f0), "=&r" (f1)
+ : "ri" (mask));
+
+ return !!((f0^f1) & mask);
+}
+#endif
+
+void cpuid_count(u32 id, u32 count, u32 *a, u32 *b, u32 *c, u32 *d)
+{
+ asm volatile("cpuid"
+ : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d)
+ : "0" (id), "2" (count)
+ );
+}
+
+#define cpuid(id, a, b, c, d) cpuid_count(id, 0, a, b, c, d)
+
+void get_cpuflags(void)
+{
+ u32 max_intel_level, max_amd_level;
+ u32 tfms;
+ u32 ignored;
+
+ if (loaded_flags)
+ return;
+ loaded_flags = true;
+
+ if (has_fpu())
+ set_bit(X86_FEATURE_FPU, cpu.flags);
+
+ if (has_eflag(X86_EFLAGS_ID)) {
+ cpuid(0x0, &max_intel_level, &cpu_vendor[0], &cpu_vendor[2],
+ &cpu_vendor[1]);
+
+ if (max_intel_level >= 0x00000001 &&
+ max_intel_level <= 0x0000ffff) {
+ cpuid(0x1, &tfms, &ignored, &cpu.flags[4],
+ &cpu.flags[0]);
+ cpu.level = (tfms >> 8) & 15;
+ cpu.family = cpu.level;
+ cpu.model = (tfms >> 4) & 15;
+ if (cpu.level >= 6)
+ cpu.model += ((tfms >> 16) & 0xf) << 4;
+ }
+
+ if (max_intel_level >= 0x00000007) {
+ cpuid_count(0x00000007, 0, &ignored, &ignored,
+ &cpu.flags[16], &ignored);
+ }
+
+ cpuid(0x80000000, &max_amd_level, &ignored, &ignored,
+ &ignored);
+
+ if (max_amd_level >= 0x80000001 &&
+ max_amd_level <= 0x8000ffff) {
+ cpuid(0x80000001, &ignored, &ignored, &cpu.flags[6],
+ &cpu.flags[1]);
+ }
+ }
+}
diff --git a/arch/x86/boot/cpuflags.h b/arch/x86/boot/cpuflags.h
new file mode 100644
index 000000000000..a398d9204ad0
--- /dev/null
+++ b/arch/x86/boot/cpuflags.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef BOOT_CPUFLAGS_H
+#define BOOT_CPUFLAGS_H
+
+#include <asm/cpufeatures.h>
+#include <asm/processor-flags.h>
+
+struct cpu_features {
+ int level; /* Family, or 64 for x86-64 */
+ int family; /* Family, always */
+ int model;
+ u32 flags[NCAPINTS];
+};
+
+extern struct cpu_features cpu;
+extern u32 cpu_vendor[3];
+
+#ifdef CONFIG_X86_32
+bool has_eflag(unsigned long mask);
+#else
+static inline bool has_eflag(unsigned long mask) { return true; }
+#endif
+void get_cpuflags(void);
+void cpuid_count(u32 id, u32 count, u32 *a, u32 *b, u32 *c, u32 *d);
+bool has_cpuflag(int flag);
+
+#endif
diff --git a/arch/x86/boot/ctype.h b/arch/x86/boot/ctype.h
new file mode 100644
index 000000000000..8f5ef2994b5e
--- /dev/null
+++ b/arch/x86/boot/ctype.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef BOOT_CTYPE_H
+#define BOOT_CTYPE_H
+
+static inline int isdigit(int ch)
+{
+ return (ch >= '0') && (ch <= '9');
+}
+
+static inline int isxdigit(int ch)
+{
+ if (isdigit(ch))
+ return true;
+
+ if ((ch >= 'a') && (ch <= 'f'))
+ return true;
+
+ return (ch >= 'A') && (ch <= 'F');
+}
+
+#endif
diff --git a/arch/x86/boot/early_serial_console.c b/arch/x86/boot/early_serial_console.c
new file mode 100644
index 000000000000..023bf1c3de8b
--- /dev/null
+++ b/arch/x86/boot/early_serial_console.c
@@ -0,0 +1,154 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Serial port routines for use during early boot reporting. This code is
+ * included from both the compressed kernel and the regular kernel.
+ */
+#include "boot.h"
+
+#define DEFAULT_SERIAL_PORT 0x3f8 /* ttyS0 */
+
+#define DLAB 0x80
+
+#define TXR 0 /* Transmit register (WRITE) */
+#define RXR 0 /* Receive register (READ) */
+#define IER 1 /* Interrupt Enable */
+#define IIR 2 /* Interrupt ID */
+#define FCR 2 /* FIFO control */
+#define LCR 3 /* Line control */
+#define MCR 4 /* Modem control */
+#define LSR 5 /* Line Status */
+#define MSR 6 /* Modem Status */
+#define DLL 0 /* Divisor Latch Low */
+#define DLH 1 /* Divisor latch High */
+
+#define DEFAULT_BAUD 9600
+
+static void early_serial_init(int port, int baud)
+{
+ unsigned char c;
+ unsigned divisor;
+
+ outb(0x3, port + LCR); /* 8n1 */
+ outb(0, port + IER); /* no interrupt */
+ outb(0, port + FCR); /* no fifo */
+ outb(0x3, port + MCR); /* DTR + RTS */
+
+ divisor = 115200 / baud;
+ c = inb(port + LCR);
+ outb(c | DLAB, port + LCR);
+ outb(divisor & 0xff, port + DLL);
+ outb((divisor >> 8) & 0xff, port + DLH);
+ outb(c & ~DLAB, port + LCR);
+
+ early_serial_base = port;
+}
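+
+/*
+ * The 16550 divisor is 115200 / baud: e.g. 12 for the default 9600 baud and
+ * 1 for 115200 baud.
+ */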
+
+static void parse_earlyprintk(void)
+{
+ int baud = DEFAULT_BAUD;
+ char arg[32];
+ int pos = 0;
+ int port = 0;
+
+ if (cmdline_find_option("earlyprintk", arg, sizeof(arg)) > 0) {
+ char *e;
+
+ if (!strncmp(arg, "serial", 6)) {
+ port = DEFAULT_SERIAL_PORT;
+ pos += 6;
+ }
+
+ if (arg[pos] == ',')
+ pos++;
+
+ /*
+ * make sure we have
+ * "serial,0x3f8,115200"
+ * "serial,ttyS0,115200"
+ * "ttyS0,115200"
+ */
+ if (pos == 7 && !strncmp(arg + pos, "0x", 2)) {
+ port = simple_strtoull(arg + pos, &e, 16);
+ if (port == 0 || arg + pos == e)
+ port = DEFAULT_SERIAL_PORT;
+ else
+ pos = e - arg;
+ } else if (!strncmp(arg + pos, "ttyS", 4)) {
+ static const int bases[] = { 0x3f8, 0x2f8 };
+ int idx = 0;
+
+ /* += strlen("ttyS"); */
+ pos += 4;
+
+ if (arg[pos++] == '1')
+ idx = 1;
+
+ port = bases[idx];
+ }
+
+ if (arg[pos] == ',')
+ pos++;
+
+ baud = simple_strtoull(arg + pos, &e, 0);
+ if (baud == 0 || arg + pos == e)
+ baud = DEFAULT_BAUD;
+ }
+
+ if (port)
+ early_serial_init(port, baud);
+}
+
+#define BASE_BAUD (1843200/16)
+static unsigned int probe_baud(int port)
+{
+ unsigned char lcr, dll, dlh;
+ unsigned int quot;
+
+ lcr = inb(port + LCR);
+ outb(lcr | DLAB, port + LCR);
+ dll = inb(port + DLL);
+ dlh = inb(port + DLH);
+ outb(lcr, port + LCR);
+ quot = (dlh << 8) | dll;
+
+ return BASE_BAUD / quot;
+}
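+
+/*
+ * BASE_BAUD is 1843200 / 16 = 115200, so a divisor latch of 12 read back from
+ * DLL/DLH corresponds to a probed rate of 9600 baud.
+ */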
+
+static void parse_console_uart8250(void)
+{
+ char optstr[64], *options;
+ int baud = DEFAULT_BAUD;
+ int port = 0;
+
+ /*
+ * console=uart8250,io,0x3f8,115200n8
+	 * Make sure it is the last console= option on the command line!
+ */
+ if (cmdline_find_option("console", optstr, sizeof(optstr)) <= 0)
+ return;
+
+ options = optstr;
+
+ if (!strncmp(options, "uart8250,io,", 12))
+ port = simple_strtoull(options + 12, &options, 0);
+ else if (!strncmp(options, "uart,io,", 8))
+ port = simple_strtoull(options + 8, &options, 0);
+ else
+ return;
+
+ if (options && (options[0] == ','))
+ baud = simple_strtoull(options + 1, &options, 0);
+ else
+ baud = probe_baud(port);
+
+ if (port)
+ early_serial_init(port, baud);
+}
+
+void console_init(void)
+{
+ parse_earlyprintk();
+
+ if (!early_serial_base)
+ parse_console_uart8250();
+}
diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c
index c501a5b466f8..1fb4bc70cee9 100644
--- a/arch/x86/boot/edd.c
+++ b/arch/x86/boot/edd.c
@@ -1,12 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* -*- linux-c -*- ------------------------------------------------------- *
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007 rPath, Inc. - All Rights Reserved
* Copyright 2009 Intel Corporation; author H. Peter Anvin
*
- * This file is part of the Linux kernel, and is made available under
- * the terms of the GNU General Public License version 2.
- *
* ----------------------------------------------------------------------- */
/*
@@ -15,6 +13,7 @@
#include "boot.h"
#include <linux/edd.h>
+#include "string.h"
#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
@@ -75,7 +74,7 @@ static int get_edd_info(u8 devno, struct edd_info *ei)
{
struct biosregs ireg, oreg;
- memset(ei, 0, sizeof *ei);
+ memset(ei, 0, sizeof(*ei));
/* Check Extensions Present */
@@ -132,7 +131,7 @@ void query_edd(void)
struct edd_info ei, *edp;
u32 *mbrptr;
- if (cmdline_find_option("edd", eddarg, sizeof eddarg) > 0) {
+ if (cmdline_find_option("edd", eddarg, sizeof(eddarg)) > 0) {
if (!strcmp(eddarg, "skipmbr") || !strcmp(eddarg, "skip")) {
do_edd = 1;
do_mbr = 0;
@@ -165,7 +164,7 @@ void query_edd(void)
*/
if (!get_edd_info(devno, &ei)
&& boot_params.eddbuf_entries < EDDMAXNR) {
- memcpy(edp, &ei, sizeof ei);
+ memcpy(edp, &ei, sizeof(ei));
edp++;
boot_params.eddbuf_entries++;
}
diff --git a/arch/x86/boot/genimage.sh b/arch/x86/boot/genimage.sh
new file mode 100644
index 000000000000..3882ead513f7
--- /dev/null
+++ b/arch/x86/boot/genimage.sh
@@ -0,0 +1,275 @@
+#!/bin/bash
+#
+# This file is subject to the terms and conditions of the GNU General Public
+# License. See the file "COPYING" in the main directory of this archive
+# for more details.
+#
+# Copyright (C) 2017 by Changbin Du <changbin.du@intel.com>
+#
+# Adapted from code in arch/x86/boot/Makefile by H. Peter Anvin and others
+#
+# "make fdimage/fdimage144/fdimage288/hdimage/isoimage"
+# script for x86 architecture
+#
+# Arguments:
+# $1 - fdimage format
+# $2 - target image file
+# $3 - kernel bzImage file
+# $4 - mtools configuration file
+# $5 - kernel cmdline
+# $6+ - initrd image file(s)
+#
+# This script requires:
+# bash
+# syslinux
+# genisoimage
+# mtools (for fdimage* and hdimage)
+# edk2/OVMF (for hdimage)
+#
+# Otherwise try to stick to POSIX shell commands...
+#
+
+# Use "make V=1" to debug this script
+case "${KBUILD_VERBOSE}" in
+*1*)
+ set -x
+ ;;
+esac
+
+# Exit the top-level shell with an error
+topshell=$$
+trap 'exit 1' USR1
+die() {
+ echo "" 1>&2
+ echo " *** $*" 1>&2
+ echo "" 1>&2
+ kill -USR1 $topshell
+}
+
+# Verify the existence and readability of a file
+verify() {
+ if [ ! -f "$1" -o ! -r "$1" ]; then
+ die "Missing file: $1"
+ fi
+}
+
+diskfmt="$1"
+FIMAGE="$2"
+FBZIMAGE="$3"
+MTOOLSRC="$4"
+KCMDLINE="$5"
+shift 5 # Remaining arguments = initrd files
+
+export MTOOLSRC
+
+# common options for dd
+dd='dd iflag=fullblock'
+
+# Make sure the files actually exist
+verify "$FBZIMAGE"
+
+declare -a FDINITRDS
+irdpfx=' initrd='
+initrdopts_syslinux=''
+initrdopts_efi=''
+for f in "$@"; do
+ if [ -f "$f" -a -r "$f" ]; then
+ FDINITRDS=("${FDINITRDS[@]}" "$f")
+ fname="$(basename "$f")"
+ initrdopts_syslinux="${initrdopts_syslinux}${irdpfx}${fname}"
+ irdpfx=,
+ initrdopts_efi="${initrdopts_efi} initrd=${fname}"
+ fi
+done
+
+# Read a $3-byte little-endian unsigned value at offset $2 from file $1
+le() {
+ local n=0
+ local m=1
+ for b in $(od -A n -v -j $2 -N $3 -t u1 "$1"); do
+ n=$((n + b*m))
+ m=$((m * 256))
+ done
+ echo $n
+}
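+
+# For example, 'le "$file" 0 2' reads the two-byte DOS "MZ" magic, which
+# efiarch() below compares against 23117 (0x5a4d).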
+
+# Get the EFI architecture name such that boot{name}.efi is the default
+# boot file name. Returns false with no output if the file is not an
+# EFI image or otherwise unknown.
+efiarch() {
+ [ -f "$1" ] || return
+ [ $(le "$1" 0 2) -eq 23117 ] || return # MZ magic
+ peoffs=$(le "$1" 60 4) # PE header offset
+ [ $peoffs -ge 64 ] || return
+ [ $(le "$1" $peoffs 4) -eq 17744 ] || return # PE magic
+ case $(le "$1" $((peoffs+4+20)) 2) in # PE type
+ 267) ;; # PE32
+ 523) ;; # PE32+
+ *) return 1 ;; # Invalid
+ esac
+ [ $(le "$1" $((peoffs+4+20+68)) 2) -eq 10 ] || return # EFI app
+ case $(le "$1" $((peoffs+4)) 2) in # Machine type
+ 332) echo i386 ;;
+ 450) echo arm ;;
+ 512) echo ia64 ;;
+ 20530) echo riscv32 ;;
+ 20580) echo riscv64 ;;
+ 20776) echo riscv128 ;;
+ 34404) echo x64 ;;
+ 43620) echo aa64 ;;
+ esac
+}
+
+# Get the combined sizes in bytes of the files given, counting sparse
+# files as full length, and padding each file to cluster size
+cluster=16384
+filesizes() {
+ local t=0
+ local s
+ for s in $(ls -lnL "$@" 2>/dev/null | awk '/^-/{ print $5; }'); do
+ t=$((t + ((s+cluster-1)/cluster)*cluster))
+ done
+ echo $t
+}
+
+# Expand directory names which should be in /usr/share into a list
+# of possible alternatives
+sharedirs() {
+ local dir file
+ for dir in /usr/share /usr/lib64 /usr/lib; do
+ for file; do
+ echo "$dir/$file"
+ echo "$dir/${file^^}"
+ done
+ done
+}
+efidirs() {
+ local dir file
+ for dir in /usr/share /boot /usr/lib64 /usr/lib; do
+ for file; do
+ echo "$dir/$file"
+ echo "$dir/${file^^}"
+ done
+ done
+}
+
+findsyslinux() {
+ local f="$(find -L $(sharedirs syslinux isolinux) \
+ -name "$1" -readable -type f -print -quit 2>/dev/null)"
+ if [ ! -f "$f" ]; then
+ die "Need a $1 file, please install syslinux/isolinux."
+ fi
+ echo "$f"
+ return 0
+}
+
+findovmf() {
+ local arch="$1"
+ shift
+ local -a names=(-false)
+ local name f
+ for name; do
+ names=("${names[@]}" -or -iname "$name")
+ done
+ for f in $(find -L $(efidirs edk2 ovmf) \
+ \( "${names[@]}" \) -readable -type f \
+ -print 2>/dev/null); do
+ if [ "$(efiarch "$f")" = "$arch" ]; then
+ echo "$f"
+ return 0
+ fi
+ done
+ die "Need a $1 file for $arch, please install EDK2/OVMF."
+}
+
+do_mcopy() {
+ if [ ${#FDINITRDS[@]} -gt 0 ]; then
+ mcopy "${FDINITRDS[@]}" "$1"
+ fi
+ if [ -n "$efishell" ]; then
+ mmd "$1"EFI "$1"EFI/Boot
+ mcopy "$efishell" "$1"EFI/Boot/boot${kefiarch}.efi
+ fi
+ if [ -n "$kefiarch" ]; then
+ echo linux "$KCMDLINE$initrdopts_efi" | \
+ mcopy - "$1"startup.nsh
+ fi
+ echo default linux "$KCMDLINE$initrdopts_syslinux" | \
+ mcopy - "$1"syslinux.cfg
+ mcopy "$FBZIMAGE" "$1"linux
+}
+
+genbzdisk() {
+ verify "$MTOOLSRC"
+ mformat -v 'LINUX_BOOT' a:
+ syslinux "$FIMAGE"
+ do_mcopy a:
+}
+
+genfdimage144() {
+ verify "$MTOOLSRC"
+ $dd if=/dev/zero of="$FIMAGE" bs=1024 count=1440 2>/dev/null
+ mformat -v 'LINUX_BOOT' v:
+ syslinux "$FIMAGE"
+ do_mcopy v:
+}
+
+genfdimage288() {
+ verify "$MTOOLSRC"
+ $dd if=/dev/zero of="$FIMAGE" bs=1024 count=2880 2>/dev/null
+ mformat -v 'LINUX_BOOT' w:
+ syslinux "$FIMAGE"
+ do_mcopy w:
+}
+
+genhdimage() {
+ verify "$MTOOLSRC"
+ mbr="$(findsyslinux mbr.bin)"
+ kefiarch="$(efiarch "$FBZIMAGE")"
+ if [ -n "$kefiarch" ]; then
+ # The efishell provides command line handling
+ efishell="$(findovmf $kefiarch shell.efi shell${kefiarch}.efi)"
+ ptype='-T 0xef' # EFI system partition, no GPT
+ fi
+ sizes=$(filesizes "$FBZIMAGE" "${FDINITRDS[@]}" "$efishell")
+ # Allow 1% + 2 MiB for filesystem and partition table overhead,
+ # syslinux, and config files; this is probably excessive...
+ megs=$(((sizes + sizes/100 + 2*1024*1024 - 1)/(1024*1024)))
+ $dd if=/dev/zero of="$FIMAGE" bs=$((1024*1024)) count=$megs 2>/dev/null
+ mpartition -I -c -s 32 -h 64 $ptype -b 64 -a p:
+ $dd if="$mbr" of="$FIMAGE" bs=440 count=1 conv=notrunc 2>/dev/null
+ mformat -v 'LINUX_BOOT' -s 32 -h 64 -c $((cluster/512)) -t $megs h:
+ syslinux --offset $((64*512)) "$FIMAGE"
+ do_mcopy h:
+}
+
+geniso() {
+ tmp_dir="$(dirname "$FIMAGE")/isoimage"
+ rm -rf "$tmp_dir"
+ mkdir "$tmp_dir"
+ isolinux=$(findsyslinux isolinux.bin)
+ ldlinux=$(findsyslinux ldlinux.c32)
+ cp "$isolinux" "$ldlinux" "$tmp_dir"
+ cp "$FBZIMAGE" "$tmp_dir"/linux
+ echo default linux "$KCMDLINE" > "$tmp_dir"/isolinux.cfg
+ if [ ${#FDINITRDS[@]} -gt 0 ]; then
+ cp "${FDINITRDS[@]}" "$tmp_dir"/
+ fi
+ genisoimage -J -r -appid 'LINUX_BOOT' -input-charset=utf-8 \
+ -quiet -o "$FIMAGE" -b isolinux.bin \
+ -c boot.cat -no-emul-boot -boot-load-size 4 \
+ -boot-info-table "$tmp_dir"
+ isohybrid "$FIMAGE" 2>/dev/null || true
+ rm -rf "$tmp_dir"
+}
+
+rm -f "$FIMAGE"
+
+case "$diskfmt" in
+ bzdisk) genbzdisk;;
+ fdimage144) genfdimage144;;
+ fdimage288) genfdimage288;;
+ hdimage) genhdimage;;
+ isoimage) geniso;;
+ *) die "Unknown image format: $diskfmt";;
+esac
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index b31cc54b4641..9bea5a1e2c52 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* header.S
*
@@ -14,13 +15,12 @@
* hex while segment addresses are written as segment:offset.
*
*/
-
+#include <linux/pe.h>
#include <asm/segment.h>
-#include <linux/utsrelease.h>
#include <asm/boot.h>
-#include <asm/e820.h>
#include <asm/page_types.h>
#include <asm/setup.h>
+#include <asm/bootparam.h>
#include "boot.h"
#include "voffset.h"
#include "zoffset.h"
@@ -32,74 +32,212 @@ SYSSEG = 0x1000 /* historical load address >> 4 */
#define SVGA_MODE ASK_VGA
#endif
-#ifndef RAMDISK
-#define RAMDISK 0
-#endif
-
#ifndef ROOT_RDONLY
#define ROOT_RDONLY 1
#endif
+ .set salign, 0x1000
+ .set falign, 0x200
+
.code16
.section ".bstext", "ax"
+#ifdef CONFIG_EFI_STUB
+ # "MZ", MS-DOS header
+ .word IMAGE_DOS_SIGNATURE
+ .org 0x38
+ #
+ # Offset to the PE header.
+ #
+ .long LINUX_PE_MAGIC
+ .long pe_header
+pe_header:
+ .long IMAGE_NT_SIGNATURE
+
+coff_header:
+#ifdef CONFIG_X86_32
+ .set image_file_add_flags, IMAGE_FILE_32BIT_MACHINE
+ .set pe_opt_magic, IMAGE_NT_OPTIONAL_HDR32_MAGIC
+ .word IMAGE_FILE_MACHINE_I386
+#else
+ .set image_file_add_flags, 0
+ .set pe_opt_magic, IMAGE_NT_OPTIONAL_HDR64_MAGIC
+ .word IMAGE_FILE_MACHINE_AMD64
+#endif
+ .word section_count # nr_sections
+ .long 0 # TimeDateStamp
+ .long 0 # PointerToSymbolTable
+ .long 1 # NumberOfSymbols
+ .word section_table - optional_header # SizeOfOptionalHeader
+ .word IMAGE_FILE_EXECUTABLE_IMAGE | \
+ image_file_add_flags | \
+ IMAGE_FILE_DEBUG_STRIPPED | \
+ IMAGE_FILE_LINE_NUMS_STRIPPED # Characteristics
+
+optional_header:
+ .word pe_opt_magic
+ .byte 0x02 # MajorLinkerVersion
+ .byte 0x14 # MinorLinkerVersion
+
+ .long ZO__data # SizeOfCode
+
+ .long ZO__end - ZO__data # SizeOfInitializedData
+ .long 0 # SizeOfUninitializedData
+
+ .long setup_size + ZO_efi_pe_entry # AddressOfEntryPoint
+
+ .long setup_size # BaseOfCode
+#ifdef CONFIG_X86_32
+ .long 0 # data
+#endif
- .global bootsect_start
-bootsect_start:
-
- # Normalize the start address
- ljmp $BOOTSEG, $start2
+extra_header_fields:
+#ifdef CONFIG_X86_32
+ .long 0 # ImageBase
+#else
+ .quad 0 # ImageBase
+#endif
+ .long salign # SectionAlignment
+ .long falign # FileAlignment
+ .word 0 # MajorOperatingSystemVersion
+ .word 0 # MinorOperatingSystemVersion
+ .word LINUX_EFISTUB_MAJOR_VERSION # MajorImageVersion
+ .word LINUX_EFISTUB_MINOR_VERSION # MinorImageVersion
+ .word 0 # MajorSubsystemVersion
+ .word 0 # MinorSubsystemVersion
+ .long 0 # Win32VersionValue
+
+ .long setup_size + ZO__end # SizeOfImage
+
+ .long salign # SizeOfHeaders
+ .long 0 # CheckSum
+ .word IMAGE_SUBSYSTEM_EFI_APPLICATION # Subsystem (EFI application)
+ .word IMAGE_DLLCHARACTERISTICS_NX_COMPAT # DllCharacteristics
+#ifdef CONFIG_X86_32
+ .long 0 # SizeOfStackReserve
+ .long 0 # SizeOfStackCommit
+ .long 0 # SizeOfHeapReserve
+ .long 0 # SizeOfHeapCommit
+#else
+ .quad 0 # SizeOfStackReserve
+ .quad 0 # SizeOfStackCommit
+ .quad 0 # SizeOfHeapReserve
+ .quad 0 # SizeOfHeapCommit
+#endif
+ .long 0 # LoaderFlags
+ .long (section_table - .) / 8 # NumberOfRvaAndSizes
+
+ .quad 0 # ExportTable
+ .quad 0 # ImportTable
+ .quad 0 # ResourceTable
+ .quad 0 # ExceptionTable
+ .quad 0 # CertificationTable
+ .quad 0 # BaseRelocationTable
+
+ # Section table
+section_table:
+ .ascii ".setup"
+ .byte 0
+ .byte 0
+ .long pecompat_fstart - salign # VirtualSize
+ .long salign # VirtualAddress
+ .long pecompat_fstart - salign # SizeOfRawData
+ .long salign # PointerToRawData
+
+ .long 0, 0, 0
+ .long IMAGE_SCN_CNT_INITIALIZED_DATA | \
+ IMAGE_SCN_MEM_READ | \
+ IMAGE_SCN_MEM_DISCARDABLE # Characteristics
+
+#ifdef CONFIG_EFI_MIXED
+ .asciz ".compat"
+
+ .long pecompat_fsize # VirtualSize
+ .long pecompat_fstart # VirtualAddress
+ .long pecompat_fsize # SizeOfRawData
+ .long pecompat_fstart # PointerToRawData
+
+ .long 0, 0, 0
+ .long IMAGE_SCN_CNT_INITIALIZED_DATA | \
+ IMAGE_SCN_MEM_READ | \
+ IMAGE_SCN_MEM_DISCARDABLE # Characteristics
+
+ /*
+ * Put the IA-32 machine type and the associated entry point address in
+ * the .compat section, so loaders can figure out which other execution
+ * modes this image supports.
+ */
+ .pushsection ".pecompat", "a", @progbits
+ .balign salign
+ .globl pecompat_fstart
+pecompat_fstart:
+ .byte 0x1 # Version
+ .byte 8 # Size
+ .word IMAGE_FILE_MACHINE_I386 # PE machine type
+ .long setup_size + ZO_efi32_pe_entry # Entrypoint
+ .byte 0x0 # Sentinel
+ .popsection
+#else
+ .set pecompat_fstart, setup_size
+#endif
+ .ascii ".text\0\0\0"
+ .long textsize # VirtualSize
+ .long setup_size # VirtualAddress
+ .long textsize # SizeOfRawData
+ .long setup_size # PointerToRawData
+ .long 0 # PointerToRelocations
+ .long 0 # PointerToLineNumbers
+ .word 0 # NumberOfRelocations
+ .word 0 # NumberOfLineNumbers
+ .long IMAGE_SCN_CNT_CODE | \
+ IMAGE_SCN_MEM_READ | \
+ IMAGE_SCN_MEM_EXECUTE # Characteristics
+
+#ifdef CONFIG_EFI_SBAT
+ .ascii ".sbat\0\0\0"
+ .long ZO__esbat - ZO__sbat # VirtualSize
+ .long setup_size + ZO__sbat # VirtualAddress
+ .long ZO__esbat - ZO__sbat # SizeOfRawData
+ .long setup_size + ZO__sbat # PointerToRawData
+
+ .long 0, 0, 0
+ .long IMAGE_SCN_CNT_INITIALIZED_DATA | \
+ IMAGE_SCN_MEM_READ | \
+ IMAGE_SCN_MEM_DISCARDABLE # Characteristics
+
+ .set textsize, ZO__sbat
+#else
+ .set textsize, ZO__data
+#endif
-start2:
- movw %cs, %ax
- movw %ax, %ds
- movw %ax, %es
- movw %ax, %ss
- xorw %sp, %sp
- sti
- cld
+ .ascii ".data\0\0\0"
+ .long ZO__end - ZO__data # VirtualSize
+ .long setup_size + ZO__data # VirtualAddress
+ .long ZO__edata - ZO__data # SizeOfRawData
+ .long setup_size + ZO__data # PointerToRawData
- movw $bugger_off_msg, %si
-
-msg_loop:
- lodsb
- andb %al, %al
- jz bs_die
- movb $0xe, %ah
- movw $7, %bx
- int $0x10
- jmp msg_loop
-
-bs_die:
- # Allow the user to press a key, then reboot
- xorw %ax, %ax
- int $0x16
- int $0x19
-
- # int 0x19 should never return. In case it does anyway,
- # invoke the BIOS reset code...
- ljmp $0xf000,$0xfff0
-
- .section ".bsdata", "a"
-bugger_off_msg:
- .ascii "Direct booting from floppy is no longer supported.\r\n"
- .ascii "Please use a boot loader program instead.\r\n"
- .ascii "\n"
- .ascii "Remove disk and press any key to reboot . . .\r\n"
- .byte 0
+ .long 0, 0, 0
+ .long IMAGE_SCN_CNT_INITIALIZED_DATA | \
+ IMAGE_SCN_MEM_READ | \
+ IMAGE_SCN_MEM_WRITE # Characteristics
+ .set section_count, (. - section_table) / 40
+#endif /* CONFIG_EFI_STUB */
# Kernel attributes; used by setup. This is part 1 of the
# header, from the old boot sector.
.section ".header", "a"
+ .globl sentinel
+sentinel: .byte 0xff, 0xff /* Used to detect broken loaders */
+
.globl hdr
hdr:
-setup_sects: .byte 0 /* Filled in by build.c */
+ .byte setup_sects - 1
root_flags: .word ROOT_RDONLY
-syssize: .long 0 /* Filled in by build.c */
+syssize: .long ZO__edata / 16
ram_size: .word 0 /* Obsolete */
vid_mode: .word SVGA_MODE
-root_dev: .word 0 /* Filled in by build.c */
+root_dev: .word 0 /* Default to major/minor 0/0 */
boot_flag: .word 0xAA55
# offset 512, entry point
@@ -116,7 +254,7 @@ _start:
# Part 2 of the header, from the old setup.S
.ascii "HdrS" # header signature
- .word 0x020a # header version number (>= 0x0105)
+ .word 0x020f # header version number (>= 0x0105)
# or else old loadlin-1.5 will fail)
.globl realmode_swtch
realmode_swtch: .word 0, 0 # default_switch, SETUPSEG
@@ -129,18 +267,12 @@ start_sys_seg: .word SYSSEG # obsolete and meaningless, but just
type_of_loader: .byte 0 # 0 means ancient bootloader, newer
# bootloaders know to change this.
- # See Documentation/i386/boot.txt for
+ # See Documentation/arch/x86/boot.rst for
# assigned ids
# flags, unused bits must be zero (RFU) bit within loadflags
loadflags:
-LOADED_HIGH = 1 # If set, the kernel is loaded high
-CAN_USE_HEAP = 0x80 # If set, the loader also has set
- # heap_end_ptr to tell how much
- # space behind setup.S can be used for
- # heap purposes.
- # Only the loader knows what is free
- .byte LOADED_HIGH
+ .byte LOADED_HIGH # The kernel is to be loaded high
setup_move_size: .word 0x8000 # size to move, when setup is not
# loaded at 0x90000. We will move setup
@@ -189,7 +321,7 @@ cmd_line_ptr: .long 0 # (Header version 0x0202 or later)
# can be located anywhere in
# low memory 0x10000 or higher.
-ramdisk_max: .long 0x7fffffff
+initrd_addr_max: .long 0x7fffffff
# (Header version 0x0203 or later)
# The highest safe address for
# the contents of an initrd
@@ -206,7 +338,48 @@ relocatable_kernel: .byte 1
relocatable_kernel: .byte 0
#endif
min_alignment: .byte MIN_KERNEL_ALIGN_LG2 # minimum alignment
-pad3: .word 0
+
+xloadflags:
+#ifdef CONFIG_X86_64
+# define XLF0 XLF_KERNEL_64 /* 64-bit kernel */
+#else
+# define XLF0 0
+#endif
+
+#if defined(CONFIG_RELOCATABLE) && defined(CONFIG_X86_64)
+ /* kernel/boot_param/ramdisk could be loaded above 4g */
+# define XLF1 XLF_CAN_BE_LOADED_ABOVE_4G
+#else
+# define XLF1 0
+#endif
+
+#ifdef CONFIG_EFI_HANDOVER_PROTOCOL
+# ifdef CONFIG_EFI_MIXED
+# define XLF23 (XLF_EFI_HANDOVER_32|XLF_EFI_HANDOVER_64)
+# else
+# ifdef CONFIG_X86_64
+# define XLF23 XLF_EFI_HANDOVER_64 /* 64-bit EFI handover ok */
+# else
+# define XLF23 XLF_EFI_HANDOVER_32 /* 32-bit EFI handover ok */
+# endif
+# endif
+#else
+# define XLF23 0
+#endif
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_EFI) && defined(CONFIG_KEXEC_CORE)
+# define XLF4 XLF_EFI_KEXEC
+#else
+# define XLF4 0
+#endif
+
+#ifdef CONFIG_X86_64
+#define XLF56 (XLF_5LEVEL|XLF_5LEVEL_ENABLED)
+#else
+#define XLF56 0
+#endif
+
+ .word XLF0 | XLF1 | XLF23 | XLF4 | XLF56
cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line,
#added with boot protocol
@@ -226,26 +399,152 @@ setup_data: .quad 0 # 64-bit physical pointer to
pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr
-#define ZO_INIT_SIZE (ZO__end - ZO_startup_32 + ZO_z_extract_offset)
+#
+# Getting to provably safe in-place decompression is hard. Worst case
+# behaviours need to be analyzed. Here let's take the decompression of
+# a gzip-compressed kernel as an example to illustrate it:
+#
+# The file layout of gzip compressed kernel is:
+#
+# magic[2]
+# method[1]
+# flags[1]
+# timestamp[4]
+# extraflags[1]
+# os[1]
+# compressed data blocks[N]
+# crc[4] orig_len[4]
+#
+# ... resulting in +18 bytes overhead of uncompressed data.
+#
+# (For more information, please refer to RFC 1951 and RFC 1952.)
+#
+# Files divided into blocks
+# 1 bit (last block flag)
+# 2 bits (block type)
+#
+# 1 block occurs every 32K - 1 bytes, or when 50% compression
+# has been achieved. The smallest block type encoding is always used.
+#
+# stored:
+# 32 bits length in bytes.
+#
+# fixed:
+# magic fixed tree.
+# symbols.
+#
+# dynamic:
+# dynamic tree encoding.
+# symbols.
+#
+#
+# The buffer for decompression in place is the length of the uncompressed
+# data, plus a small amount extra to keep the algorithm safe. The
+# compressed data is placed at the end of the buffer. The output pointer
+# is placed at the start of the buffer and the input pointer is placed
+# where the compressed data starts. Problems will occur when the output
+# pointer overruns the input pointer.
+#
+# The output pointer can only overrun the input pointer if the input
+# pointer is moving faster than the output pointer. A condition only
+# triggered by data whose compressed form is larger than the uncompressed
+# form.
+#
+# The worst case at the block level is a growth of the compressed data
+# of 5 bytes per 32767 bytes.
+#
+# The worst case internal to a compressed block is very hard to figure.
+# The worst case can at least be bounded by having one bit that represents
+# 32764 bytes and then all of the rest of the bytes representing the very
+# very last byte.
+#
+# All of which is enough to compute an amount of extra data that is required
+# to be safe. To avoid problems at the block level allocating 5 extra bytes
+# per 32767 bytes of data is sufficient. To avoid problems internal to a
+# block adding an extra 32767 bytes (the worst case uncompressed block size)
+# is sufficient to ensure that in the worst case the decompressed data for
+# a block will stop the byte before the compressed data for a block begins.
+# To avoid problems with the compressed data's meta information an extra 18
+# bytes are needed. Leading to the formula:
+#
+# extra_bytes = (uncompressed_size >> 12) + 32768 + 18
+#
+# Adding 8 bytes per 32K is a bit excessive but much easier to calculate.
+# Adding 32768 instead of 32767 just makes for round numbers.
+#
+# The above analysis is for decompressing a gzip-compressed kernel only. Up
+# to now, 6 different decompressors are supported altogether. Among them,
+# xz stores data in chunks and has a maximum chunk size of 64K. Hence the
+# safety margin should be updated to cover all decompressors so that we don't
+# need to deal with each of them separately. Please check
+# the description in lib/decompressor_xxx.c for specific information.
+#
+# extra_bytes = (uncompressed_size >> 12) + 65536 + 128
+#
+# LZ4 is even worse: data that cannot be further compressed grows by 0.4%,
+# or one byte per 256 bytes. OTOH, we can safely get rid of the +128 as
+# the size-dependent part now grows so fast.
+#
+# extra_bytes = (uncompressed_size >> 8) + 65536
+#
+# ZSTD compressed data grows by at most 3 bytes per 128K, and only has a 22
+# byte fixed overhead but has a maximum block size of 128K, so it needs a
+# larger margin.
+#
+# extra_bytes = (uncompressed_size >> 8) + 131072
+
+#define ZO_z_extra_bytes ((ZO_z_output_len >> 8) + 131072)
+#if ZO_z_output_len > ZO_z_input_len
+# define ZO_z_extract_offset (ZO_z_output_len + ZO_z_extra_bytes - \
+ ZO_z_input_len)
+#else
+# define ZO_z_extract_offset ZO_z_extra_bytes
+#endif
+
+/*
+ * The extract_offset has to be bigger than the ZO head section. Otherwise when
+ * the head code is running to move ZO to the end of the buffer, it will
+ * overwrite the head code itself.
+ */
+#if (ZO__ehead - ZO_startup_32) > ZO_z_extract_offset
+# define ZO_z_min_extract_offset ((ZO__ehead - ZO_startup_32 + 4095) & ~4095)
+#else
+# define ZO_z_min_extract_offset ((ZO_z_extract_offset + 4095) & ~4095)
+#endif
+
+#define ZO_INIT_SIZE (ZO__end - ZO_startup_32 + ZO_z_min_extract_offset)
+
#define VO_INIT_SIZE (VO__end - VO__text)
#if ZO_INIT_SIZE > VO_INIT_SIZE
-#define INIT_SIZE ZO_INIT_SIZE
+# define INIT_SIZE ZO_INIT_SIZE
#else
-#define INIT_SIZE VO_INIT_SIZE
+# define INIT_SIZE VO_INIT_SIZE
#endif
+
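As a concrete illustration of the formulas above (a standalone sketch with made-up sizes, not part of the boot code), the zstd-based margin can be evaluated for a hypothetical 64 MiB image that compresses to 24 MiB:

#include <stdio.h>

int main(void)
{
	unsigned long output_len = 64UL << 20;	/* hypothetical ZO_z_output_len */
	unsigned long input_len  = 24UL << 20;	/* hypothetical ZO_z_input_len  */
	unsigned long extra_bytes, extract_offset;

	/* extra_bytes = (uncompressed_size >> 8) + 131072 (zstd worst case) */
	extra_bytes = (output_len >> 8) + 131072;

	/*
	 * The compressed data sits at the end of an (output_len + extra_bytes)
	 * buffer, so this is how far past the buffer start it is placed.
	 */
	if (output_len > input_len)
		extract_offset = output_len + extra_bytes - input_len;
	else
		extract_offset = extra_bytes;

	printf("extra_bytes=%lu extract_offset=%lu\n", extra_bytes, extract_offset);
	return 0;
}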
+ .macro __handover_offset
+#ifndef CONFIG_EFI_HANDOVER_PROTOCOL
+ .long 0
+#elif !defined(CONFIG_X86_64)
+ .long ZO_efi32_stub_entry
+#else
+ /* Yes, this is really how we defined it :( */
+ .long ZO_efi64_stub_entry - 0x200
+#ifdef CONFIG_EFI_MIXED
+ .if ZO_efi32_stub_entry != ZO_efi64_stub_entry - 0x200
+ .error "32-bit and 64-bit EFI entry points do not match"
+ .endif
+#endif
+#endif
+ .endm
+
init_size: .long INIT_SIZE # kernel initialization size
+handover_offset: __handover_offset
+kernel_info_offset: .long ZO_kernel_info
# End of setup header #####################################################
.section ".entrytext", "ax"
start_of_setup:
-#ifdef SAFE_RESET_DISK_CONTROLLER
-# Reset the disk controller.
- movw $0x0000, %ax # Reset disk controller
- movb $0x80, %dl # All disks
- int $0x13
-#endif
-
# Force %es = %ds
movw %ds, %ax
movw %ax, %es
@@ -295,7 +594,7 @@ start_of_setup:
xorl %eax, %eax
subw %di, %cx
shrw $2, %cx
- rep; stosl
+ rep stosl
# Jump to C code (should not return)
calll main
diff --git a/arch/x86/boot/install.sh b/arch/x86/boot/install.sh
index 8d60ee15dfd9..93784abcd66d 100644..100755
--- a/arch/x86/boot/install.sh
+++ b/arch/x86/boot/install.sh
@@ -15,28 +15,8 @@
# $2 - kernel image file
# $3 - kernel map file
# $4 - default install path (blank if root directory)
-#
-
-verify () {
- if [ ! -f "$1" ]; then
- echo "" 1>&2
- echo " *** Missing file: $1" 1>&2
- echo ' *** You need to run "make" before "make install".' 1>&2
- echo "" 1>&2
- exit 1
- fi
-}
-
-# Make sure the files actually exist
-verify "$2"
-verify "$3"
-
-# User may have a custom install script
-
-if [ -x ~/bin/${CROSS_COMPILE}installkernel ]; then exec ~/bin/${CROSS_COMPILE}installkernel "$@"; fi
-if [ -x /sbin/${CROSS_COMPILE}installkernel ]; then exec /sbin/${CROSS_COMPILE}installkernel "$@"; fi
-# Default install - same as make zlilo
+set -e
if [ -f $4/vmlinuz ]; then
mv $4/vmlinuz $4/vmlinuz.old
diff --git a/arch/x86/boot/io.h b/arch/x86/boot/io.h
new file mode 100644
index 000000000000..110880907f87
--- /dev/null
+++ b/arch/x86/boot/io.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef BOOT_IO_H
+#define BOOT_IO_H
+
+#include <asm/shared/io.h>
+
+#undef inb
+#undef inw
+#undef inl
+#undef outb
+#undef outw
+#undef outl
+
+struct port_io_ops {
+ u8 (*f_inb)(u16 port);
+ void (*f_outb)(u8 v, u16 port);
+ void (*f_outw)(u16 v, u16 port);
+};
+
+extern struct port_io_ops pio_ops;
+
+/*
+ * Use the normal I/O instructions by default.
+ * TDX guests override these to use hypercalls.
+ */
+static inline void init_default_io_ops(void)
+{
+ pio_ops.f_inb = __inb;
+ pio_ops.f_outb = __outb;
+ pio_ops.f_outw = __outw;
+}
+
+/*
+ * Redirect port I/O operations via pio_ops callbacks.
+ * TDX guests override these callbacks with TDX-specific helpers.
+ */
+#define inb pio_ops.f_inb
+#define outb pio_ops.f_outb
+#define outw pio_ops.f_outw
+
+#endif
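The indirection above is meant to be consumed as follows; a minimal sketch, assuming a hypothetical paravirtualized guest that must avoid raw IN/OUT instructions (only pio_ops and its fields come from the header, everything else is illustrative):

#include "io.h"

/* Hypothetical replacements that would trap to the hypervisor instead of
 * executing IN/OUT directly; the hypercall details are elided. */
static u8 paravirt_inb(u16 port)
{
	return 0;	/* value obtained from the hypervisor would go here */
}

static void paravirt_outb(u8 v, u16 port)
{
	/* forward v and port to the hypervisor */
}

static void paravirt_outw(u16 v, u16 port)
{
	/* forward v and port to the hypervisor */
}

/* Install the overrides once, before any inb()/outb()/outw() user runs */
static void init_paravirt_io_ops(void)
{
	pio_ops.f_inb  = paravirt_inb;
	pio_ops.f_outb = paravirt_outb;
	pio_ops.f_outw = paravirt_outw;
}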
diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c
index 140172b895bd..9d0fea18d3c8 100644
--- a/arch/x86/boot/main.c
+++ b/arch/x86/boot/main.c
@@ -1,22 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* -*- linux-c -*- ------------------------------------------------------- *
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007 rPath, Inc. - All Rights Reserved
* Copyright 2009 Intel Corporation; author H. Peter Anvin
*
- * This file is part of the Linux kernel, and is made available under
- * the terms of the GNU General Public License version 2.
- *
* ----------------------------------------------------------------------- */
/*
* Main module for the real-mode kernel code
*/
+#include <linux/build_bug.h>
#include "boot.h"
+#include "string.h"
struct boot_params boot_params __attribute__((aligned(16)));
+struct port_io_ops pio_ops;
+
char *HEAP = _end;
char *heap_end = _end; /* Default end of heap = no heap */
@@ -25,46 +27,51 @@ char *heap_end = _end; /* Default end of heap = no heap */
* screws up the old-style command line protocol, adjust by
* filling in the new-style command line pointer instead.
*/
-
static void copy_boot_params(void)
{
struct old_cmdline {
u16 cl_magic;
u16 cl_offset;
};
- const struct old_cmdline * const oldcmd =
- (const struct old_cmdline *)OLD_CL_ADDRESS;
+ const struct old_cmdline * const oldcmd = absolute_pointer(OLD_CL_ADDRESS);
- BUILD_BUG_ON(sizeof boot_params != 4096);
- memcpy(&boot_params.hdr, &hdr, sizeof hdr);
+ BUILD_BUG_ON(sizeof(boot_params) != 4096);
+ memcpy(&boot_params.hdr, &hdr, sizeof(hdr));
- if (!boot_params.hdr.cmd_line_ptr &&
- oldcmd->cl_magic == OLD_CL_MAGIC) {
- /* Old-style command line protocol. */
+ if (!boot_params.hdr.cmd_line_ptr && oldcmd->cl_magic == OLD_CL_MAGIC) {
+ /* Old-style command line protocol */
u16 cmdline_seg;
- /* Figure out if the command line falls in the region
- of memory that an old kernel would have copied up
- to 0x90000... */
+ /*
+ * Figure out if the command line falls in the region
+ * of memory that an old kernel would have copied up
+ * to 0x90000...
+ */
if (oldcmd->cl_offset < boot_params.hdr.setup_move_size)
cmdline_seg = ds();
else
cmdline_seg = 0x9000;
- boot_params.hdr.cmd_line_ptr =
- (cmdline_seg << 4) + oldcmd->cl_offset;
+ boot_params.hdr.cmd_line_ptr = (cmdline_seg << 4) + oldcmd->cl_offset;
}
}
/*
- * Set the keyboard repeat rate to maximum. Unclear why this
+ * Query the keyboard lock status as given by the BIOS, and
+ * set the keyboard repeat rate to maximum. Unclear why the latter
* is done here; this might be possible to kill off as stale code.
*/
-static void keyboard_set_repeat(void)
+static void keyboard_init(void)
{
- struct biosregs ireg;
+ struct biosregs ireg, oreg;
+
initregs(&ireg);
- ireg.ax = 0x0305;
+
+ ireg.ah = 0x02; /* Get keyboard status */
+ intcall(0x16, &ireg, &oreg);
+ boot_params.kbd_status = oreg.al;
+
+ ireg.ax = 0x0305; /* Set keyboard repeat rate */
intcall(0x16, &ireg, NULL);
}
@@ -75,8 +82,10 @@ static void query_ist(void)
{
struct biosregs ireg, oreg;
- /* Some older BIOSes apparently crash on this call, so filter
- it from machines too old to have SpeedStep at all. */
+ /*
+ * Some older BIOSes apparently crash on this call, so filter
+ * it from machines too old to have SpeedStep at all.
+ */
if (cpu.level < 6)
return;
@@ -111,46 +120,45 @@ static void init_heap(void)
char *stack_end;
if (boot_params.hdr.loadflags & CAN_USE_HEAP) {
- asm("leal %P1(%%esp),%0"
- : "=r" (stack_end) : "i" (-STACK_SIZE));
-
- heap_end = (char *)
- ((size_t)boot_params.hdr.heap_end_ptr + 0x200);
+ stack_end = (char *) (current_stack_pointer - STACK_SIZE);
+ heap_end = (char *) ((size_t)boot_params.hdr.heap_end_ptr + 0x200);
if (heap_end > stack_end)
heap_end = stack_end;
} else {
/* Boot protocol 2.00 only, no heap available */
- puts("WARNING: Ancient bootloader, some functionality "
- "may be limited!\n");
+ puts("WARNING: Ancient bootloader, some functionality may be limited!\n");
}
}
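For context on the arithmetic above: per the boot protocol (Documentation/arch/x86/boot.rst), heap_end_ptr holds the offset of the end of the setup heap relative to the start of the real-mode code minus 0x200, so adding 0x200 back recovers the real offset. For example, a loader reporting heap_end_ptr = 0xde00 yields a heap ending at offset 0xe000, and that value is then clamped so the heap never reaches into the STACK_SIZE bytes below the current stack pointer.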
void main(void)
{
+ init_default_io_ops();
+
/* First, copy the boot header into the "zeropage" */
copy_boot_params();
+ /* Initialize the early-boot console */
+ console_init();
+ if (cmdline_find_option_bool("debug"))
+ puts("early console in setup code\n");
+
/* End of heap check */
init_heap();
/* Make sure we have all the proper CPU support */
if (validate_cpu()) {
- puts("Unable to boot - please use a kernel appropriate "
- "for your CPU.\n");
+ puts("Unable to boot - please use a kernel appropriate for your CPU.\n");
die();
}
- /* Tell the BIOS what CPU mode we intend to run in. */
+ /* Tell the BIOS what CPU mode we intend to run in */
set_bios_mode();
/* Detect memory layout */
detect_memory();
- /* Set keyboard repeat rate (why?) */
- keyboard_set_repeat();
-
- /* Query MCA information */
- query_mca();
+ /* Set keyboard repeat rate (why?) and query the lock flags */
+ keyboard_init();
/* Query Intel SpeedStep (IST) information */
query_ist();
@@ -168,10 +176,6 @@ void main(void)
/* Set the video mode */
set_video();
- /* Parse command line for 'quiet' and pass it to decompressor. */
- if (cmdline_find_option_bool("quiet"))
- boot_params.hdr.loadflags |= QUIET_FLAG;
-
/* Do the last things and invoke protected mode */
go_to_protected_mode();
}
diff --git a/arch/x86/boot/mca.c b/arch/x86/boot/mca.c
deleted file mode 100644
index a95a531148ef..000000000000
--- a/arch/x86/boot/mca.c
+++ /dev/null
@@ -1,38 +0,0 @@
-/* -*- linux-c -*- ------------------------------------------------------- *
- *
- * Copyright (C) 1991, 1992 Linus Torvalds
- * Copyright 2007 rPath, Inc. - All Rights Reserved
- * Copyright 2009 Intel Corporation; author H. Peter Anvin
- *
- * This file is part of the Linux kernel, and is made available under
- * the terms of the GNU General Public License version 2.
- *
- * ----------------------------------------------------------------------- */
-
-/*
- * Get the MCA system description table
- */
-
-#include "boot.h"
-
-int query_mca(void)
-{
- struct biosregs ireg, oreg;
- u16 len;
-
- initregs(&ireg);
- ireg.ah = 0xc0;
- intcall(0x15, &ireg, &oreg);
-
- if (oreg.eflags & X86_EFLAGS_CF)
- return -1; /* No MCA present */
-
- set_fs(oreg.es);
- len = rdfs16(oreg.bx);
-
- if (len > sizeof(boot_params.sys_desc_table))
- len = sizeof(boot_params.sys_desc_table);
-
- copy_from_fs(&boot_params.sys_desc_table, oreg.bx, len);
- return 0;
-}
diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c
index cae3feb1035e..b0422b79debc 100644
--- a/arch/x86/boot/memory.c
+++ b/arch/x86/boot/memory.c
@@ -1,12 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* -*- linux-c -*- ------------------------------------------------------- *
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007 rPath, Inc. - All Rights Reserved
* Copyright 2009 Intel Corporation; author H. Peter Anvin
*
- * This file is part of the Linux kernel, and is made available under
- * the terms of the GNU General Public License version 2.
- *
* ----------------------------------------------------------------------- */
/*
@@ -17,16 +15,16 @@
#define SMAP 0x534d4150 /* ASCII "SMAP" */
-static int detect_memory_e820(void)
+static void detect_memory_e820(void)
{
int count = 0;
struct biosregs ireg, oreg;
- struct e820entry *desc = boot_params.e820_map;
- static struct e820entry buf; /* static so it is zeroed */
+ struct boot_e820_entry *desc = boot_params.e820_table;
+ static struct boot_e820_entry buf; /* static so it is zeroed */
initregs(&ireg);
ireg.ax = 0xe820;
- ireg.cx = sizeof buf;
+ ireg.cx = sizeof(buf);
ireg.edx = SMAP;
ireg.di = (size_t)&buf;
@@ -66,12 +64,12 @@ static int detect_memory_e820(void)
*desc++ = buf;
count++;
- } while (ireg.ebx && count < ARRAY_SIZE(boot_params.e820_map));
+ } while (ireg.ebx && count < ARRAY_SIZE(boot_params.e820_table));
- return boot_params.e820_entries = count;
+ boot_params.e820_entries = count;
}
-static int detect_memory_e801(void)
+static void detect_memory_e801(void)
{
struct biosregs ireg, oreg;
@@ -80,7 +78,7 @@ static int detect_memory_e801(void)
intcall(0x15, &ireg, &oreg);
if (oreg.eflags & X86_EFLAGS_CF)
- return -1;
+ return;
/* Do we really need to do this? */
if (oreg.cx || oreg.dx) {
@@ -89,9 +87,9 @@ static int detect_memory_e801(void)
}
if (oreg.ax > 15*1024) {
- return -1; /* Bogus! */
+ return; /* Bogus! */
} else if (oreg.ax == 15*1024) {
- boot_params.alt_mem_k = (oreg.dx << 6) + oreg.ax;
+ boot_params.alt_mem_k = (oreg.bx << 6) + oreg.ax;
} else {
/*
* This ignores memory above 16MB if we have a memory
@@ -102,11 +100,9 @@ static int detect_memory_e801(void)
*/
boot_params.alt_mem_k = oreg.ax;
}
-
- return 0;
}
-static int detect_memory_88(void)
+static void detect_memory_88(void)
{
struct biosregs ireg, oreg;
@@ -115,22 +111,13 @@ static int detect_memory_88(void)
intcall(0x15, &ireg, &oreg);
boot_params.screen_info.ext_mem_k = oreg.ax;
-
- return -(oreg.eflags & X86_EFLAGS_CF); /* 0 or -1 */
}
-int detect_memory(void)
+void detect_memory(void)
{
- int err = -1;
-
- if (detect_memory_e820() > 0)
- err = 0;
-
- if (!detect_memory_e801())
- err = 0;
+ detect_memory_e820();
- if (!detect_memory_88())
- err = 0;
+ detect_memory_e801();
- return err;
+ detect_memory_88();
}
diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c
index 8ef60f20b371..22d730b227e3 100644
--- a/arch/x86/boot/mkcpustr.c
+++ b/arch/x86/boot/mkcpustr.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* ----------------------------------------------------------------------- *
*
* Copyright 2008 rPath, Inc. - All Rights Reserved
*
- * This file is part of the Linux kernel, and is made available under
- * the terms of the GNU General Public License version 2 or (at your
- * option) any later version; incorporated herein by reference.
- *
* ----------------------------------------------------------------------- */
/*
@@ -15,6 +12,8 @@
#include <stdio.h>
+#include "../include/asm/cpufeatures.h"
+#include "../include/asm/vmxfeatures.h"
#include "../kernel/cpu/capflags.c"
int main(void)
@@ -22,7 +21,8 @@ int main(void)
int i, j;
const char *str;
- printf("static const char x86_cap_strs[] = \n");
+ printf("#include <asm/cpufeaturemasks.h>\n\n");
+ printf("static const char x86_cap_strs[] =\n");
for (i = 0; i < NCAPINTS; i++) {
for (j = 0; j < 32; j++) {
diff --git a/arch/x86/boot/mtools.conf.in b/arch/x86/boot/mtools.conf.in
index efd6d2490c1d..174c60508766 100644
--- a/arch/x86/boot/mtools.conf.in
+++ b/arch/x86/boot/mtools.conf.in
@@ -14,4 +14,8 @@ drive v:
drive w:
file="@OBJ@/fdimage" cylinders=80 heads=2 sectors=36 filter
-
+# Hard disk (h: for the filesystem, p: for format - old mtools bug?)
+drive h:
+ file="@OBJ@/hdimage" offset=32768 mformat_only
+drive p:
+ file="@OBJ@/hdimage" partition=1 mformat_only
diff --git a/arch/x86/boot/pm.c b/arch/x86/boot/pm.c
index 8062f8915250..5941f930f6c5 100644
--- a/arch/x86/boot/pm.c
+++ b/arch/x86/boot/pm.c
@@ -1,11 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* -*- linux-c -*- ------------------------------------------------------- *
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007 rPath, Inc. - All Rights Reserved
*
- * This file is part of the Linux kernel, and is made available under
- * the terms of the GNU General Public License version 2.
- *
* ----------------------------------------------------------------------- */
/*
@@ -13,6 +11,7 @@
*/
#include "boot.h"
+#include <asm/desc_defs.h>
#include <asm/segment.h>
/*
@@ -69,13 +68,13 @@ static void setup_gdt(void)
being 8-byte unaligned. Intel recommends 16 byte alignment. */
static const u64 boot_gdt[] __attribute__((aligned(16))) = {
/* CS: code, read/execute, 4 GB, base 0 */
- [GDT_ENTRY_BOOT_CS] = GDT_ENTRY(0xc09b, 0, 0xfffff),
+ [GDT_ENTRY_BOOT_CS] = GDT_ENTRY(DESC_CODE32, 0, 0xfffff),
/* DS: data, read/write, 4 GB, base 0 */
- [GDT_ENTRY_BOOT_DS] = GDT_ENTRY(0xc093, 0, 0xfffff),
+ [GDT_ENTRY_BOOT_DS] = GDT_ENTRY(DESC_DATA32, 0, 0xfffff),
/* TSS: 32-bit tss, 104 bytes, base 4096 */
/* We only have a TSS here to keep Intel VT happy;
we don't actually use it for anything. */
- [GDT_ENTRY_BOOT_TSS] = GDT_ENTRY(0x0089, 4096, 103),
+ [GDT_ENTRY_BOOT_TSS] = GDT_ENTRY(DESC_TSS32, 4096, 103),
};
/* Xen HVM incorrectly stores a pointer to the gdt_ptr, instead
of the gdt_ptr contents. Thus, make it static so it will
diff --git a/arch/x86/boot/pmjump.S b/arch/x86/boot/pmjump.S
index 3e0edc6d2a20..cbec8bd0841f 100644
--- a/arch/x86/boot/pmjump.S
+++ b/arch/x86/boot/pmjump.S
@@ -1,11 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/* ----------------------------------------------------------------------- *
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007 rPath, Inc. - All Rights Reserved
*
- * This file is part of the Linux kernel, and is made available under
- * the terms of the GNU General Public License version 2.
- *
* ----------------------------------------------------------------------- */
/*
@@ -23,7 +21,7 @@
/*
* void protected_mode_jump(u32 entrypoint, u32 bootparams);
*/
-GLOBAL(protected_mode_jump)
+SYM_FUNC_START_NOALIGN(protected_mode_jump)
movl %edx, %esi # Pointer to boot_params table
xorl %ebx, %ebx
@@ -42,13 +40,13 @@ GLOBAL(protected_mode_jump)
# Transition to 32-bit mode
.byte 0x66, 0xea # ljmpl opcode
-2: .long in_pm32 # offset
+2: .long .Lin_pm32 # offset
.word __BOOT_CS # segment
-ENDPROC(protected_mode_jump)
+SYM_FUNC_END(protected_mode_jump)
.code32
.section ".text32","ax"
-GLOBAL(in_pm32)
+SYM_FUNC_START_LOCAL_NOALIGN(.Lin_pm32)
# Set up data segments for flat 32-bit mode
movl %ecx, %ds
movl %ecx, %es
@@ -74,4 +72,4 @@ GLOBAL(in_pm32)
lldt %cx
jmpl *%eax # Jump to the 32-bit entrypoint
-ENDPROC(in_pm32)
+SYM_FUNC_END(.Lin_pm32)
diff --git a/arch/x86/boot/printf.c b/arch/x86/boot/printf.c
index 50e47cdbdddd..51dc14b714f6 100644
--- a/arch/x86/boot/printf.c
+++ b/arch/x86/boot/printf.c
@@ -1,11 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* -*- linux-c -*- ------------------------------------------------------- *
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007 rPath, Inc. - All Rights Reserved
*
- * This file is part of the Linux kernel, and is made available under
- * the terms of the GNU General Public License version 2.
- *
* ----------------------------------------------------------------------- */
/*
@@ -34,7 +32,7 @@ static int skip_atoi(const char **s)
#define SMALL 32 /* Must be 32 == 0x20 */
#define SPECIAL 64 /* 0x */
-#define do_div(n,base) ({ \
+#define __do_div(n, base) ({ \
int __res; \
__res = ((unsigned long) n) % (unsigned) base; \
n = ((unsigned long) n) / (unsigned) base; \
@@ -55,7 +53,7 @@ static char *number(char *str, long num, int base, int size, int precision,
locase = (type & SMALL);
if (type & LEFT)
type &= ~ZEROPAD;
- if (base < 2 || base > 36)
+ if (base < 2 || base > 16)
return NULL;
c = (type & ZEROPAD) ? '0' : ' ';
sign = 0;
@@ -83,7 +81,7 @@ static char *number(char *str, long num, int base, int size, int precision,
tmp[i++] = '0';
else
while (num != 0)
- tmp[i++] = (digits[do_div(num, base)] | locase);
+ tmp[i++] = (digits[__do_div(num, base)] | locase);
if (i > precision)
precision = i;
size -= precision;
@@ -248,6 +246,7 @@ int vsprintf(char *buf, const char *fmt, va_list args)
case 'x':
flags |= SMALL;
+ fallthrough;
case 'X':
base = 16;
break;
@@ -255,6 +254,8 @@ int vsprintf(char *buf, const char *fmt, va_list args)
case 'd':
case 'i':
flags |= SIGN;
+ break;
+
case 'u':
break;
diff --git a/arch/x86/boot/regs.c b/arch/x86/boot/regs.c
index 958019b1cfa5..55de6b3092b8 100644
--- a/arch/x86/boot/regs.c
+++ b/arch/x86/boot/regs.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* -----------------------------------------------------------------------
*
* Copyright 2009 Intel Corporation; author H. Peter Anvin
*
- * This file is part of the Linux kernel, and is made available under
- * the terms of the GNU General Public License version 2 or (at your
- * option) any later version; incorporated herein by reference.
- *
* ----------------------------------------------------------------------- */
/*
@@ -17,10 +14,11 @@
*/
#include "boot.h"
+#include "string.h"
void initregs(struct biosregs *reg)
{
- memset(reg, 0, sizeof *reg);
+ memset(reg, 0, sizeof(*reg));
reg->eflags |= X86_EFLAGS_CF;
reg->ds = ds();
reg->es = ds();
diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld
index 0f6ec455a2b1..e1d594a60204 100644
--- a/arch/x86/boot/setup.ld
+++ b/arch/x86/boot/setup.ld
@@ -3,26 +3,30 @@
*
* Linker script for the i386 setup code
*/
-OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
+OUTPUT_FORMAT("elf32-i386")
OUTPUT_ARCH(i386)
ENTRY(_start)
SECTIONS
{
. = 0;
- .bstext : { *(.bstext) }
- .bsdata : { *(.bsdata) }
+ .bstext : {
+ *(.bstext)
+ . = 495;
+ } =0xffffffff
- . = 497;
.header : { *(.header) }
.entrytext : { *(.entrytext) }
.inittext : { *(.inittext) }
.initdata : { *(.initdata) }
__end_init = .;
- .text : { *(.text) }
+ .text : { *(.text .text.*) }
.text32 : { *(.text32) }
+ .pecompat : { *(.pecompat) }
+ PROVIDE(pecompat_fsize = setup_size - pecompat_fstart);
+
. = ALIGN(16);
.rodata : { *(.rodata*) }
@@ -38,8 +42,12 @@ SECTIONS
.signature : {
setup_sig = .;
LONG(0x5a5aaa55)
- }
+ setup_size = ALIGN(ABSOLUTE(.), 4096);
+ setup_sects = ABSOLUTE(setup_size / 512);
+ ASSERT(setup_sects >= 5, "The setup must be at least 5 sectors in size");
+ ASSERT(setup_sects <= 64, "The setup must be at most 64 sectors in size");
+ }
. = ALIGN(16);
.bss :
@@ -51,8 +59,13 @@ SECTIONS
. = ALIGN(16);
_end = .;
- /DISCARD/ : { *(.note*) }
+ /DISCARD/ : {
+ *(.note*)
+ }
+ /*
+ * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility:
+ */
. = ASSERT(_end <= 0x8000, "Setup too big!");
. = ASSERT(hdr == 0x1f1, "The setup header has the wrong offset!");
/* Necessary for the very-old-loader check to work... */
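To make the new size bookkeeping concrete (made-up numbers for illustration): if the image ends at, say, 0x3f05 when the .signature section closes, ALIGN(ABSOLUTE(.), 4096) rounds setup_size up to 0x4000, setup_sects becomes 0x4000 / 512 = 32 (within the asserted 5..64 range), and the setup_sects - 1 byte emitted at the start of the header is 31.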
diff --git a/arch/x86/boot/startup/Makefile b/arch/x86/boot/startup/Makefile
new file mode 100644
index 000000000000..5e499cfb29b5
--- /dev/null
+++ b/arch/x86/boot/startup/Makefile
@@ -0,0 +1,52 @@
+# SPDX-License-Identifier: GPL-2.0
+
+KBUILD_AFLAGS += -D__DISABLE_EXPORTS
+KBUILD_CFLAGS += -D__DISABLE_EXPORTS -mcmodel=small -fPIC \
+ -Os -DDISABLE_BRANCH_PROFILING \
+ $(DISABLE_STACKLEAK_PLUGIN) \
+ $(DISABLE_LATENT_ENTROPY_PLUGIN) \
+ -fno-stack-protector -D__NO_FORTIFY \
+ -fno-jump-tables \
+ -include $(srctree)/include/linux/hidden.h
+
+# disable ftrace hooks and LTO
+KBUILD_CFLAGS := $(subst $(CC_FLAGS_FTRACE),,$(KBUILD_CFLAGS))
+KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO),$(KBUILD_CFLAGS))
+KASAN_SANITIZE := n
+KCSAN_SANITIZE := n
+KMSAN_SANITIZE := n
+UBSAN_SANITIZE := n
+KCOV_INSTRUMENT := n
+
+obj-$(CONFIG_X86_64) += gdt_idt.o map_kernel.o
+obj-$(CONFIG_AMD_MEM_ENCRYPT) += sme.o sev-startup.o
+pi-objs := $(patsubst %.o,$(obj)/%.o,$(obj-y))
+
+lib-$(CONFIG_X86_64) += la57toggle.o
+lib-$(CONFIG_EFI_MIXED) += efi-mixed.o
+
+#
+# Disable objtool validation for all library code, which is intended
+# to be linked into the decompressor or the EFI stub but not vmlinux
+#
+$(patsubst %.o,$(obj)/%.o,$(lib-y)): OBJECT_FILES_NON_STANDARD := y
+
+#
+# Invoke objtool for each object individually to check for absolute
+# relocations, even if other objtool actions are being deferred.
+#
+$(pi-objs): objtool-enabled = 1
+$(pi-objs): objtool-args = $(if $(delay-objtool),--dry-run,$(objtool-args-y)) --noabs
+
+#
+# Confine the startup code by prefixing all symbols with __pi_ (for position
+# independent). This ensures that startup code can only call other startup
+# code, or code that has explicitly been made accessible to it via a symbol
+# alias.
+#
+$(obj)/%.pi.o: OBJCOPYFLAGS := --prefix-symbols=__pi_
+$(obj)/%.pi.o: $(obj)/%.o FORCE
+ $(call if_changed,objcopy)
+
+targets += $(obj-y)
+obj-y := $(patsubst %.o,%.pi.o,$(obj-y))
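A small illustration of the prefixing scheme (the file and symbol names below are hypothetical): after the objcopy rule above, every global symbol defined in a startup object is renamed with a __pi_ prefix, so the rest of the kernel can only reach it through an explicit alias such as the PROVIDE() statements in exports.h further below.

/* arch/x86/boot/startup/example.c (hypothetical), built as example.o and
 * then copied to example.pi.o with --prefix-symbols=__pi_. */
int example_helper(int x)		/* appears as __pi_example_helper */
{
	return x + 1;
}

/* Without a linker alias like PROVIDE(example_helper = __pi_example_helper),
 * a call to example_helper() from ordinary kernel code fails to link. */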
diff --git a/arch/x86/boot/startup/efi-mixed.S b/arch/x86/boot/startup/efi-mixed.S
new file mode 100644
index 000000000000..e04ed99bc449
--- /dev/null
+++ b/arch/x86/boot/startup/efi-mixed.S
@@ -0,0 +1,253 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2014, 2015 Intel Corporation; author Matt Fleming
+ *
+ * Early support for invoking 32-bit EFI services from a 64-bit kernel.
+ *
+ * Because this thunking occurs before ExitBootServices() we have to
+ * restore the firmware's 32-bit GDT and IDT before we make EFI service
+ * calls.
+ *
+ * On the plus side, we don't have to worry about mangling 64-bit
+ * addresses into 32-bits because we're executing with an identity
+ * mapped pagetable and haven't transitioned to 64-bit virtual addresses
+ * yet.
+ */
+
+#include <linux/linkage.h>
+#include <asm/desc_defs.h>
+#include <asm/msr.h>
+#include <asm/page_types.h>
+#include <asm/pgtable_types.h>
+#include <asm/processor-flags.h>
+#include <asm/segment.h>
+
+ .text
+ .code32
+#ifdef CONFIG_EFI_HANDOVER_PROTOCOL
+SYM_FUNC_START(efi32_stub_entry)
+ call 1f
+1: popl %ecx
+
+ /* Clear BSS */
+ xorl %eax, %eax
+ leal (_bss - 1b)(%ecx), %edi
+ leal (_ebss - 1b)(%ecx), %ecx
+ subl %edi, %ecx
+ shrl $2, %ecx
+ cld
+ rep stosl
+
+ add $0x4, %esp /* Discard return address */
+ movl 8(%esp), %ebx /* struct boot_params pointer */
+ jmp efi32_startup
+SYM_FUNC_END(efi32_stub_entry)
+#endif
+
+/*
+ * Called using a far call from __efi64_thunk() below, using the x86_64 SysV
+ * ABI (except for R8/R9 which are inaccessible to 32-bit code - EAX/EBX are
+ * used instead). EBP+16 points to the arguments passed via the stack.
+ *
+ * The first argument (EDI) is a pointer to the boot service or protocol, to
+ * which the remaining arguments are passed, each truncated to 32 bits.
+ */
+SYM_FUNC_START_LOCAL(efi_enter32)
+ /*
+ * Convert x86-64 SysV ABI params to i386 ABI
+ */
+ pushl 32(%ebp) /* Up to 3 args passed via the stack */
+ pushl 24(%ebp)
+ pushl 16(%ebp)
+ pushl %ebx /* R9 */
+ pushl %eax /* R8 */
+ pushl %ecx
+ pushl %edx
+ pushl %esi
+
+ /* Disable paging */
+ movl %cr0, %eax
+ btrl $X86_CR0_PG_BIT, %eax
+ movl %eax, %cr0
+
+ /* Disable long mode via EFER */
+ movl $MSR_EFER, %ecx
+ rdmsr
+ btrl $_EFER_LME, %eax
+ wrmsr
+
+ call *%edi
+
+ /* We must preserve return value */
+ movl %eax, %edi
+
+ call efi32_enable_long_mode
+
+ addl $32, %esp
+ movl %edi, %eax
+ lret
+SYM_FUNC_END(efi_enter32)
+
+ .code64
+SYM_FUNC_START(__efi64_thunk)
+ push %rbp
+ movl %esp, %ebp
+ push %rbx
+
+ /* Move args #5 and #6 into 32-bit accessible registers */
+ movl %r8d, %eax
+ movl %r9d, %ebx
+
+ lcalll *efi32_call(%rip)
+
+ pop %rbx
+ pop %rbp
+ RET
+SYM_FUNC_END(__efi64_thunk)
+
+ .code32
+SYM_FUNC_START_LOCAL(efi32_enable_long_mode)
+ movl %cr4, %eax
+ btsl $(X86_CR4_PAE_BIT), %eax
+ movl %eax, %cr4
+
+ movl $MSR_EFER, %ecx
+ rdmsr
+ btsl $_EFER_LME, %eax
+ wrmsr
+
+ /* Disable interrupts - the firmware's IDT does not work in long mode */
+ cli
+
+ /* Enable paging */
+ movl %cr0, %eax
+ btsl $X86_CR0_PG_BIT, %eax
+ movl %eax, %cr0
+ ret
+SYM_FUNC_END(efi32_enable_long_mode)
+
+/*
+ * This is the common EFI stub entry point for mixed mode. It sets up the GDT
+ * and page tables needed for 64-bit execution, after which it calls the
+ * common 64-bit EFI entrypoint efi_stub_entry().
+ *
+ * Arguments: 0(%esp) image handle
+ * 4(%esp) EFI system table pointer
+ * %ebx struct boot_params pointer (or NULL)
+ *
+ * Since this is the point of no return for ordinary execution, no registers
+ * are considered live except for the function parameters. [Note that the EFI
+ * stub may still exit and return to the firmware using the Exit() EFI boot
+ * service.]
+ */
+SYM_FUNC_START_LOCAL(efi32_startup)
+ movl %esp, %ebp
+
+ subl $8, %esp
+ sgdtl (%esp) /* Save GDT descriptor to the stack */
+ movl 2(%esp), %esi /* Existing GDT pointer */
+ movzwl (%esp), %ecx /* Existing GDT limit */
+ inc %ecx /* Existing GDT size */
+ andl $~7, %ecx /* Ensure size is multiple of 8 */
+
+ subl %ecx, %esp /* Allocate new GDT */
+ andl $~15, %esp /* Realign the stack */
+ movl %esp, %edi /* New GDT address */
+ leal 7(%ecx), %eax /* New GDT limit */
+ pushw %cx /* Push 64-bit CS (for LJMP below) */
+ pushl %edi /* Push new GDT address */
+ pushw %ax /* Push new GDT limit */
+
+ /* Copy GDT to the stack and add a 64-bit code segment at the end */
+ movl $GDT_ENTRY(DESC_CODE64, 0, 0xfffff) & 0xffffffff, (%edi,%ecx)
+ movl $GDT_ENTRY(DESC_CODE64, 0, 0xfffff) >> 32, 4(%edi,%ecx)
+ shrl $2, %ecx
+ cld
+ rep movsl /* Copy the firmware GDT */
+ lgdtl (%esp) /* Switch to the new GDT */
+
+ call 1f
+1: pop %edi
+
+ /* Record mixed mode entry */
+ movb $0x0, (efi_is64 - 1b)(%edi)
+
+ /* Set up indirect far call to re-enter 32-bit mode */
+ leal (efi32_call - 1b)(%edi), %eax
+ addl %eax, (%eax)
+ movw %cs, 4(%eax)
+
+ /* Disable paging */
+ movl %cr0, %eax
+ btrl $X86_CR0_PG_BIT, %eax
+ movl %eax, %cr0
+
+ /* Set up 1:1 mapping */
+ leal (pte - 1b)(%edi), %eax
+ movl $_PAGE_PRESENT | _PAGE_RW | _PAGE_PSE, %ecx
+ leal (_PAGE_PRESENT | _PAGE_RW)(%eax), %edx
+2: movl %ecx, (%eax)
+ addl $8, %eax
+ addl $PMD_SIZE, %ecx
+ jnc 2b
+
+ movl $PAGE_SIZE, %ecx
+ .irpc l, 0123
+ movl %edx, \l * 8(%eax)
+ addl %ecx, %edx
+ .endr
+ addl %ecx, %eax
+ movl %edx, (%eax)
+ movl %eax, %cr3
+
+ call efi32_enable_long_mode
+
+ /* Set up far jump to 64-bit mode (CS is already on the stack) */
+ leal (efi_stub_entry - 1b)(%edi), %eax
+ movl %eax, 2(%esp)
+
+ movl 0(%ebp), %edi
+ movl 4(%ebp), %esi
+ movl %ebx, %edx
+ ljmpl *2(%esp)
+SYM_FUNC_END(efi32_startup)
+
+/*
+ * efi_status_t efi32_pe_entry(efi_handle_t image_handle,
+ * efi_system_table_32_t *sys_table)
+ */
+SYM_FUNC_START(efi32_pe_entry)
+ pushl %ebx // save callee-save registers
+
+ /* Check whether the CPU supports long mode */
+ movl $0x80000001, %eax // assume extended info support
+ cpuid
+ btl $29, %edx // check long mode bit
+ jnc 1f
+ leal 8(%esp), %esp // preserve stack alignment
+ xor %ebx, %ebx // no struct boot_params pointer
+ jmp efi32_startup // only ESP and EBX remain live
+1: movl $0x80000003, %eax // EFI_UNSUPPORTED
+ popl %ebx
+ RET
+SYM_FUNC_END(efi32_pe_entry)
+
+#ifdef CONFIG_EFI_HANDOVER_PROTOCOL
+ .org efi32_stub_entry + 0x200
+ .code64
+SYM_FUNC_START_NOALIGN(efi64_stub_entry)
+ jmp efi_handover_entry
+SYM_FUNC_END(efi64_stub_entry)
+#endif
+
+ .data
+ .balign 8
+SYM_DATA_START_LOCAL(efi32_call)
+ .long efi_enter32 - .
+ .word 0x0
+SYM_DATA_END(efi32_call)
+SYM_DATA(efi_is64, .byte 1)
+
+ .bss
+ .balign PAGE_SIZE
+SYM_DATA_LOCAL(pte, .fill 6 * PAGE_SIZE, 1, 0)
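A hedged C-side view of the thunk (the prototype and wrapper below are illustrative sketches, not copied from the kernel sources): the first argument is the 32-bit pointer to the boot service or protocol that efi_enter32() ends up calling, and every further argument is truncated to 32 bits on the way down, so callers may only pass 32-bit addressable values.

typedef unsigned int u32;
typedef unsigned long long u64;

/* Assembly routine defined above; it far-calls into efi_enter32() */
extern u64 __efi64_thunk(u32 func32, ...);

/* Hypothetical wrapper: invoke a 32-bit EFI service with two arguments.
 * Both the service pointer and any buffers must live below 4 GiB. */
static u64 efi32_call_service(u32 service_fn, u32 handle, u32 buffer)
{
	return __efi64_thunk(service_fn, handle, buffer);
}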
diff --git a/arch/x86/boot/startup/exports.h b/arch/x86/boot/startup/exports.h
new file mode 100644
index 000000000000..01d2363dc445
--- /dev/null
+++ b/arch/x86/boot/startup/exports.h
@@ -0,0 +1,14 @@
+
+/*
+ * The symbols below are functions that are implemented by the startup code,
+ * but called at runtime by the SEV code residing in the core kernel.
+ */
+PROVIDE(early_set_pages_state = __pi_early_set_pages_state);
+PROVIDE(early_snp_set_memory_private = __pi_early_snp_set_memory_private);
+PROVIDE(early_snp_set_memory_shared = __pi_early_snp_set_memory_shared);
+PROVIDE(get_hv_features = __pi_get_hv_features);
+PROVIDE(sev_es_terminate = __pi_sev_es_terminate);
+PROVIDE(snp_cpuid = __pi_snp_cpuid);
+PROVIDE(snp_cpuid_get_table = __pi_snp_cpuid_get_table);
+PROVIDE(svsm_issue_call = __pi_svsm_issue_call);
+PROVIDE(svsm_process_result_codes = __pi_svsm_process_result_codes);
diff --git a/arch/x86/boot/startup/gdt_idt.c b/arch/x86/boot/startup/gdt_idt.c
new file mode 100644
index 000000000000..d16102abdaec
--- /dev/null
+++ b/arch/x86/boot/startup/gdt_idt.c
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/linkage.h>
+#include <linux/types.h>
+
+#include <asm/desc.h>
+#include <asm/init.h>
+#include <asm/setup.h>
+#include <asm/sev.h>
+#include <asm/trapnr.h>
+
+/*
+ * Data structures and code used for IDT setup in head_64.S. The bringup-IDT is
+ * used until the idt_table takes over. On the boot CPU this happens in
+ * x86_64_start_kernel(), on secondary CPUs in start_secondary(). In both cases
+ * this happens in the functions called from head_64.S.
+ *
+ * The idt_table can't be used that early because all the code modifying it is
+ * in idt.c and can be instrumented by tracing or KASAN, which both don't work
+ * during early CPU bringup. Also the idt_table has the runtime vectors
+ * configured which require certain CPU state to be setup already (like TSS),
+ * which also hasn't happened yet in early CPU bringup.
+ */
+static gate_desc bringup_idt_table[NUM_EXCEPTION_VECTORS] __page_aligned_data;
+
+/* This may run while still in the direct mapping */
+void startup_64_load_idt(void *vc_handler)
+{
+ struct desc_ptr desc = {
+ .address = (unsigned long)rip_rel_ptr(bringup_idt_table),
+ .size = sizeof(bringup_idt_table) - 1,
+ };
+ struct idt_data data;
+ gate_desc idt_desc;
+
+ /* @vc_handler is set only for a VMM Communication Exception */
+ if (vc_handler) {
+ init_idt_data(&data, X86_TRAP_VC, vc_handler);
+ idt_init_desc(&idt_desc, &data);
+ native_write_idt_entry((gate_desc *)desc.address, X86_TRAP_VC, &idt_desc);
+ }
+
+ native_load_idt(&desc);
+}
+
+/*
+ * Setup boot CPU state needed before kernel switches to virtual addresses.
+ */
+void __init startup_64_setup_gdt_idt(void)
+{
+ struct gdt_page *gp = rip_rel_ptr((void *)(__force unsigned long)&gdt_page);
+ void *handler = NULL;
+
+ struct desc_ptr startup_gdt_descr = {
+ .address = (unsigned long)gp->gdt,
+ .size = GDT_SIZE - 1,
+ };
+
+ /* Load GDT */
+ native_load_gdt(&startup_gdt_descr);
+
+ /* New GDT is live - reload data segment registers */
+ asm volatile("movl %%eax, %%ds\n"
+ "movl %%eax, %%ss\n"
+ "movl %%eax, %%es\n" : : "a"(__KERNEL_DS) : "memory");
+
+ if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT))
+ handler = rip_rel_ptr(vc_no_ghcb);
+
+ startup_64_load_idt(handler);
+}
diff --git a/arch/x86/boot/startup/la57toggle.S b/arch/x86/boot/startup/la57toggle.S
new file mode 100644
index 000000000000..370075b4d95b
--- /dev/null
+++ b/arch/x86/boot/startup/la57toggle.S
@@ -0,0 +1,111 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/boot.h>
+#include <asm/msr.h>
+#include <asm/processor-flags.h>
+
+/*
+ * This is the 32-bit trampoline that will be copied over to low memory. It
+ * will be called using the ordinary 64-bit calling convention from code
+ * running in 64-bit mode.
+ *
+ * Return address is at the top of the stack (might be above 4G).
+ * The first argument (EDI) contains the address of the temporary PGD level
+ * page table in 32-bit addressable memory which will be programmed into
+ * register CR3.
+ */
+
+ .section ".rodata", "a", @progbits
+SYM_CODE_START(trampoline_32bit_src)
+ /*
+ * Preserve callee save 64-bit registers on the stack: this is
+ * necessary because the architecture does not guarantee that GPRs will
+ * retain their full 64-bit values across a 32-bit mode switch.
+ */
+ pushq %r15
+ pushq %r14
+ pushq %r13
+ pushq %r12
+ pushq %rbp
+ pushq %rbx
+
+ /* Preserve top half of RSP in a legacy mode GPR to avoid truncation */
+ movq %rsp, %rbx
+ shrq $32, %rbx
+
+ /* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */
+ pushq $__KERNEL32_CS
+ leaq 0f(%rip), %rax
+ pushq %rax
+ lretq
+
+ /*
+ * The 32-bit code below will do a far jump back to long mode and end
+ * up here after reconfiguring the number of paging levels. First, the
+ * stack pointer needs to be restored to its full 64-bit value before
+ * the callee save register contents can be popped from the stack.
+ */
+.Lret:
+ shlq $32, %rbx
+ orq %rbx, %rsp
+
+ /* Restore the preserved 64-bit registers */
+ popq %rbx
+ popq %rbp
+ popq %r12
+ popq %r13
+ popq %r14
+ popq %r15
+ retq
+
+ .code32
+0:
+ /* Disable paging */
+ movl %cr0, %eax
+ btrl $X86_CR0_PG_BIT, %eax
+ movl %eax, %cr0
+
+ /* Point CR3 to the trampoline's new top level page table */
+ movl %edi, %cr3
+
+	/* Set EFER.LME=1 as a precaution in case the hypervisor pulls the rug */
+ movl $MSR_EFER, %ecx
+ rdmsr
+ btsl $_EFER_LME, %eax
+ /* Avoid writing EFER if no change was made (for TDX guest) */
+ jc 1f
+ wrmsr
+1:
+ /* Toggle CR4.LA57 */
+ movl %cr4, %eax
+ btcl $X86_CR4_LA57_BIT, %eax
+ movl %eax, %cr4
+
+ /* Enable paging again. */
+ movl %cr0, %eax
+ btsl $X86_CR0_PG_BIT, %eax
+ movl %eax, %cr0
+
+ /*
+ * Return to the 64-bit calling code using LJMP rather than LRET, to
+ * avoid the need for a 32-bit addressable stack. The destination
+ * address will be adjusted after the template code is copied into a
+ * 32-bit addressable buffer.
+ */
+.Ljmp: ljmpl $__KERNEL_CS, $(.Lret - trampoline_32bit_src)
+SYM_CODE_END(trampoline_32bit_src)
+
+/*
+ * This symbol is placed right after trampoline_32bit_src() so its address can
+ * be used to infer the size of the trampoline code.
+ */
+SYM_DATA(trampoline_ljmp_imm_offset, .word .Ljmp + 1 - trampoline_32bit_src)
+
+ /*
+ * The trampoline code has a size limit.
+ * Make sure we fail to compile if the trampoline code grows
+ * beyond TRAMPOLINE_32BIT_CODE_SIZE bytes.
+ */
+ .org trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_SIZE
diff --git a/arch/x86/boot/startup/map_kernel.c b/arch/x86/boot/startup/map_kernel.c
new file mode 100644
index 000000000000..83ba98d61572
--- /dev/null
+++ b/arch/x86/boot/startup/map_kernel.c
@@ -0,0 +1,217 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/init.h>
+#include <linux/linkage.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/pgtable.h>
+
+#include <asm/init.h>
+#include <asm/sections.h>
+#include <asm/setup.h>
+#include <asm/sev.h>
+
+extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
+extern unsigned int next_early_pgt;
+
+static inline bool check_la57_support(void)
+{
+ /*
+ * 5-level paging is detected and enabled at kernel decompression
+ * stage. Only check if it has been enabled there.
+ */
+ if (!(native_read_cr4() & X86_CR4_LA57))
+ return false;
+
+ __pgtable_l5_enabled = 1;
+ pgdir_shift = 48;
+ ptrs_per_p4d = 512;
+
+ return true;
+}
+
+static unsigned long __init sme_postprocess_startup(struct boot_params *bp,
+ pmdval_t *pmd,
+ unsigned long p2v_offset)
+{
+ unsigned long paddr, paddr_end;
+ int i;
+
+ /* Encrypt the kernel and related (if SME is active) */
+ sme_encrypt_kernel(bp);
+
+ /*
+ * Clear the memory encryption mask from the .bss..decrypted section.
+ * The bss section will be memset to zero later in the initialization so
+ * there is no need to zero it after changing the memory encryption
+ * attribute.
+ */
+ if (sme_get_me_mask()) {
+ paddr = (unsigned long)rip_rel_ptr(__start_bss_decrypted);
+ paddr_end = (unsigned long)rip_rel_ptr(__end_bss_decrypted);
+
+ for (; paddr < paddr_end; paddr += PMD_SIZE) {
+ /*
+ * On SNP, transition the page to shared in the RMP table so that
+ * it is consistent with the page table attribute change.
+ *
+ * __start_bss_decrypted has a virtual address in the high range
+ * mapping (kernel .text). PVALIDATE, by way of
+ * early_snp_set_memory_shared(), requires a valid virtual
+ * address but the kernel is currently running off of the identity
+ * mapping so use the PA to get a *currently* valid virtual address.
+ */
+ early_snp_set_memory_shared(paddr, paddr, PTRS_PER_PMD);
+
+ i = pmd_index(paddr - p2v_offset);
+ pmd[i] -= sme_get_me_mask();
+ }
+ }
+
+ /*
+ * Return the SME encryption mask (if SME is active) to be used as a
+ * modifier for the initial pgdir entry programmed into CR3.
+ */
+ return sme_get_me_mask();
+}
+
+/*
+ * This code is compiled using PIC codegen because it will execute from the
+ * early 1:1 mapping of memory, which deviates from the mapping expected by the
+ * linker. Due to this deviation, taking the address of a global variable will
+ * produce an ambiguous result when using the plain & operator. Instead,
+ * rip_rel_ptr() must be used, which will return the RIP-relative address in
+ * the 1:1 mapping of memory. Kernel virtual addresses can be determined by
+ * subtracting p2v_offset from the RIP-relative address.
+ */
+unsigned long __init __startup_64(unsigned long p2v_offset,
+ struct boot_params *bp)
+{
+ pmd_t (*early_pgts)[PTRS_PER_PMD] = rip_rel_ptr(early_dynamic_pgts);
+ unsigned long physaddr = (unsigned long)rip_rel_ptr(_text);
+ unsigned long va_text, va_end;
+ unsigned long pgtable_flags;
+ unsigned long load_delta;
+ pgdval_t *pgd;
+ p4dval_t *p4d;
+ pudval_t *pud;
+ pmdval_t *pmd, pmd_entry;
+ bool la57;
+ int i;
+
+ la57 = check_la57_support();
+
+ /* Is the address too large? */
+ if (physaddr >> MAX_PHYSMEM_BITS)
+ for (;;);
+
+ /*
+ * Compute the delta between the address I am compiled to run at
+ * and the address I am actually running at.
+ */
+ phys_base = load_delta = __START_KERNEL_map + p2v_offset;
+
+ /* Is the address not 2M aligned? */
+ if (load_delta & ~PMD_MASK)
+ for (;;);
+
+ va_text = physaddr - p2v_offset;
+ va_end = (unsigned long)rip_rel_ptr(_end) - p2v_offset;
+
+ /* Include the SME encryption mask in the fixup value */
+ load_delta += sme_get_me_mask();
+
+ /* Fixup the physical addresses in the page table */
+
+ pgd = rip_rel_ptr(early_top_pgt);
+ pgd[pgd_index(__START_KERNEL_map)] += load_delta;
+
+ if (la57) {
+ p4d = (p4dval_t *)rip_rel_ptr(level4_kernel_pgt);
+ p4d[MAX_PTRS_PER_P4D - 1] += load_delta;
+
+ pgd[pgd_index(__START_KERNEL_map)] = (pgdval_t)p4d | _PAGE_TABLE;
+ }
+
+ level3_kernel_pgt[PTRS_PER_PUD - 2].pud += load_delta;
+ level3_kernel_pgt[PTRS_PER_PUD - 1].pud += load_delta;
+
+ for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--)
+ level2_fixmap_pgt[i].pmd += load_delta;
+
+ /*
+ * Set up the identity mapping for the switchover. These
+ * entries should *NOT* have the global bit set! This also
+ * creates a bunch of nonsense entries but that is fine --
+ * it avoids problems around wraparound.
+ */
+
+ pud = &early_pgts[0]->pmd;
+ pmd = &early_pgts[1]->pmd;
+ next_early_pgt = 2;
+
+ pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask();
+
+ if (la57) {
+ p4d = &early_pgts[next_early_pgt++]->pmd;
+
+ i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
+ pgd[i + 0] = (pgdval_t)p4d + pgtable_flags;
+ pgd[i + 1] = (pgdval_t)p4d + pgtable_flags;
+
+ i = physaddr >> P4D_SHIFT;
+ p4d[(i + 0) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags;
+ p4d[(i + 1) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags;
+ } else {
+ i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
+ pgd[i + 0] = (pgdval_t)pud + pgtable_flags;
+ pgd[i + 1] = (pgdval_t)pud + pgtable_flags;
+ }
+
+ i = physaddr >> PUD_SHIFT;
+ pud[(i + 0) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags;
+ pud[(i + 1) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags;
+
+ pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
+ pmd_entry += sme_get_me_mask();
+ pmd_entry += physaddr;
+
+ for (i = 0; i < DIV_ROUND_UP(va_end - va_text, PMD_SIZE); i++) {
+ int idx = i + (physaddr >> PMD_SHIFT);
+
+ pmd[idx % PTRS_PER_PMD] = pmd_entry + i * PMD_SIZE;
+ }
+
+ /*
+ * Fixup the kernel text+data virtual addresses. Note that
+ * we might write invalid pmds, when the kernel is relocated
+ * cleanup_highmap() fixes this up along with the mappings
+ * beyond _end.
+ *
+ * Only the region occupied by the kernel image has so far
+ * been checked against the table of usable memory regions
+ * provided by the firmware, so invalidate pages outside that
+ * region. A page table entry that maps to a reserved area of
+ * memory would allow processor speculation into that area,
+ * and on some hardware (particularly the UV platform) even
+ * speculative access to some reserved areas is caught as an
+ * error, causing the BIOS to halt the system.
+ */
+
+ pmd = rip_rel_ptr(level2_kernel_pgt);
+
+ /* invalidate pages before the kernel image */
+ for (i = 0; i < pmd_index(va_text); i++)
+ pmd[i] &= ~_PAGE_PRESENT;
+
+ /* fixup pages that are part of the kernel image */
+ for (; i <= pmd_index(va_end); i++)
+ if (pmd[i] & _PAGE_PRESENT)
+ pmd[i] += load_delta;
+
+ /* invalidate pages after the kernel image */
+ for (; i < PTRS_PER_PMD; i++)
+ pmd[i] &= ~_PAGE_PRESENT;
+
+ return sme_postprocess_startup(bp, pmd, p2v_offset);
+}
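To make the addressing rule described above concrete (the variable and helper below are hypothetical; only rip_rel_ptr() and p2v_offset come from the surrounding code): while running from the 1:1 mapping, pointers are derived RIP-relatively and converted to their eventual kernel virtual addresses by subtracting p2v_offset.

extern unsigned long some_table[];	/* hypothetical global in the kernel image */

static unsigned long example_addresses(unsigned long p2v_offset)
{
	/* Physical (identity-mapped) address, valid while still on the 1:1 map */
	unsigned long pa = (unsigned long)rip_rel_ptr(some_table);

	/* Kernel virtual address the same object will have after the switch-over */
	unsigned long va = pa - p2v_offset;

	return va;
}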
diff --git a/arch/x86/boot/startup/sev-shared.c b/arch/x86/boot/startup/sev-shared.c
new file mode 100644
index 000000000000..a0fa8bb2b945
--- /dev/null
+++ b/arch/x86/boot/startup/sev-shared.c
@@ -0,0 +1,762 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * AMD Encrypted Register State Support
+ *
+ * Author: Joerg Roedel <jroedel@suse.de>
+ *
+ * This file is not compiled stand-alone. It contains code shared
+ * between the pre-decompression boot code and the running Linux kernel
+ * and is included directly into both code-bases.
+ */
+
+#include <asm/setup_data.h>
+
+#ifndef __BOOT_COMPRESSED
+#define has_cpuflag(f) cpu_feature_enabled(f)
+#else
+#undef WARN
+#define WARN(condition, format...) (!!(condition))
+#endif
+
+/* Copy of the SNP firmware's CPUID page. */
+static struct snp_cpuid_table cpuid_table_copy __ro_after_init;
+
+/*
+ * These will be initialized based on CPUID table so that non-present
+ * all-zero leaves (for sparse tables) can be differentiated from
+ * invalid/out-of-range leaves. This is needed since all-zero leaves
+ * still need to be post-processed.
+ */
+static u32 cpuid_std_range_max __ro_after_init;
+static u32 cpuid_hyp_range_max __ro_after_init;
+static u32 cpuid_ext_range_max __ro_after_init;
+
+bool sev_snp_needs_sfw;
+
+void __noreturn
+sev_es_terminate(unsigned int set, unsigned int reason)
+{
+ u64 val = GHCB_MSR_TERM_REQ;
+
+ /* Tell the hypervisor what went wrong. */
+ val |= GHCB_SEV_TERM_REASON(set, reason);
+
+ /* Request Guest Termination from Hypervisor */
+ sev_es_wr_ghcb_msr(val);
+ VMGEXIT();
+
+ while (true)
+ asm volatile("hlt\n" : : : "memory");
+}
+
+/*
+ * The hypervisor features are available from GHCB version 2 onward.
+ */
+u64 __init get_hv_features(void)
+{
+ u64 val;
+
+ if (ghcb_version < 2)
+ return 0;
+
+ sev_es_wr_ghcb_msr(GHCB_MSR_HV_FT_REQ);
+ VMGEXIT();
+
+ val = sev_es_rd_ghcb_msr();
+ if (GHCB_RESP_CODE(val) != GHCB_MSR_HV_FT_RESP)
+ return 0;
+
+ return GHCB_MSR_HV_FT_RESP_VAL(val);
+}
+
+int svsm_process_result_codes(struct svsm_call *call)
+{
+ switch (call->rax_out) {
+ case SVSM_SUCCESS:
+ return 0;
+ case SVSM_ERR_INCOMPLETE:
+ case SVSM_ERR_BUSY:
+ return -EAGAIN;
+ default:
+ return -EINVAL;
+ }
+}
+
+/*
+ * Issue a VMGEXIT to call the SVSM:
+ * - Load the SVSM register state (RAX, RCX, RDX, R8 and R9)
+ * - Set the CA call pending field to 1
+ * - Issue VMGEXIT
+ * - Save the SVSM return register state (RAX, RCX, RDX, R8 and R9)
+ * - Perform atomic exchange of the CA call pending field
+ *
+ * - See the "Secure VM Service Module for SEV-SNP Guests" specification for
+ * details on the calling convention.
+ * - The calling convention loosely follows the Microsoft X64 calling
+ * convention by putting arguments in RCX, RDX, R8 and R9.
+ * - RAX specifies the SVSM protocol/callid as input and the return code
+ * as output.
+ */
+void svsm_issue_call(struct svsm_call *call, u8 *pending)
+{
+ register unsigned long rax asm("rax") = call->rax;
+ register unsigned long rcx asm("rcx") = call->rcx;
+ register unsigned long rdx asm("rdx") = call->rdx;
+ register unsigned long r8 asm("r8") = call->r8;
+ register unsigned long r9 asm("r9") = call->r9;
+
+ call->caa->call_pending = 1;
+
+ asm volatile("rep; vmmcall\n\t"
+ : "+r" (rax), "+r" (rcx), "+r" (rdx), "+r" (r8), "+r" (r9)
+ : : "memory");
+
+ *pending = xchg(&call->caa->call_pending, *pending);
+
+ call->rax_out = rax;
+ call->rcx_out = rcx;
+ call->rdx_out = rdx;
+ call->r8_out = r8;
+ call->r9_out = r9;
+}
+
+int svsm_perform_msr_protocol(struct svsm_call *call)
+{
+ u8 pending = 0;
+ u64 val, resp;
+
+ /*
+ * When using the MSR protocol, be sure to save and restore
+ * the current MSR value.
+ */
+ val = sev_es_rd_ghcb_msr();
+
+ sev_es_wr_ghcb_msr(GHCB_MSR_VMPL_REQ_LEVEL(0));
+
+ svsm_issue_call(call, &pending);
+
+ resp = sev_es_rd_ghcb_msr();
+
+ sev_es_wr_ghcb_msr(val);
+
+ if (pending)
+ return -EINVAL;
+
+ if (GHCB_RESP_CODE(resp) != GHCB_MSR_VMPL_RESP)
+ return -EINVAL;
+
+ if (GHCB_MSR_VMPL_RESP_VAL(resp))
+ return -EINVAL;
+
+ return svsm_process_result_codes(call);
+}
+
+static int __sev_cpuid_hv(u32 fn, int reg_idx, u32 *reg)
+{
+ u64 val;
+
+ sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, reg_idx));
+ VMGEXIT();
+ val = sev_es_rd_ghcb_msr();
+ if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP)
+ return -EIO;
+
+ *reg = (val >> 32);
+
+ return 0;
+}
+
+static int __sev_cpuid_hv_msr(struct cpuid_leaf *leaf)
+{
+ int ret;
+
+ /*
+ * MSR protocol does not support fetching non-zero subfunctions, but is
+ * sufficient to handle current early-boot cases. Should that change,
+ * make sure to report an error rather than ignoring the index and
+ * grabbing random values. If this issue arises in the future, handling
+ * can be added here to use GHCB-page protocol for cases that occur late
+	 * enough in boot that the GHCB page is available.
+ */
+ if (cpuid_function_is_indexed(leaf->fn) && leaf->subfn)
+ return -EINVAL;
+
+ ret = __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EAX, &leaf->eax);
+ ret = ret ? : __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EBX, &leaf->ebx);
+ ret = ret ? : __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_ECX, &leaf->ecx);
+ ret = ret ? : __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EDX, &leaf->edx);
+
+ return ret;
+}
+
+/*
+ * This may be called early while still running on the initial identity
+ * mapping. Use RIP-relative addressing to obtain the correct address
+ * while running with the initial identity mapping as well as the
+ * switch-over to kernel virtual addresses later.
+ */
+const struct snp_cpuid_table *snp_cpuid_get_table(void)
+{
+ return rip_rel_ptr(&cpuid_table_copy);
+}
+
+/*
+ * The SNP Firmware ABI, Revision 0.9, Section 7.1, details the use of
+ * XCR0_IN and XSS_IN to encode multiple versions of 0xD subfunctions 0
+ * and 1 based on the corresponding features enabled by a particular
+ * combination of XCR0 and XSS registers so that a guest can look up the
+ * version corresponding to the features currently enabled in its XCR0/XSS
+ * registers. The only value that differs between these versions/table
+ * entries is the enabled XSAVE area size advertised via EBX.
+ *
+ * While hypervisors may choose to make use of this support, it is more
+ * robust/secure for a guest to simply find the entry corresponding to the
+ * base/legacy XSAVE area size (XCR0=1 or XCR0=3), and then calculate the
+ * XSAVE area size using subfunctions 2 through 64, as documented in APM
+ * Volume 3, Rev 3.31, Appendix E.3.8, which is what is done here.
+ *
+ * Since base/legacy XSAVE area size is documented as 0x240, use that value
+ * directly rather than relying on the base size in the CPUID table.
+ *
+ * Return: XSAVE area size on success, 0 otherwise.
+ */
+static u32 snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted)
+{
+ const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
+ u64 xfeatures_found = 0;
+ u32 xsave_size = 0x240;
+ int i;
+
+ for (i = 0; i < cpuid_table->count; i++) {
+ const struct snp_cpuid_fn *e = &cpuid_table->fn[i];
+
+ if (!(e->eax_in == 0xD && e->ecx_in > 1 && e->ecx_in < 64))
+ continue;
+ if (!(xfeatures_en & (BIT_ULL(e->ecx_in))))
+ continue;
+ if (xfeatures_found & (BIT_ULL(e->ecx_in)))
+ continue;
+
+ xfeatures_found |= (BIT_ULL(e->ecx_in));
+
+ if (compacted)
+ xsave_size += e->eax;
+ else
+ xsave_size = max(xsave_size, e->eax + e->ebx);
+ }
+
+ /*
+ * Either the guest set unsupported XCR0/XSS bits, or the corresponding
+ * entries in the CPUID table were not present. This is not a valid
+ * state to be in.
+ */
+ if (xfeatures_found != (xfeatures_en & GENMASK_ULL(63, 2)))
+ return 0;
+
+ return xsave_size;
+}
+
+static bool
+snp_cpuid_get_validated_func(struct cpuid_leaf *leaf)
+{
+ const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
+ int i;
+
+ for (i = 0; i < cpuid_table->count; i++) {
+ const struct snp_cpuid_fn *e = &cpuid_table->fn[i];
+
+ if (e->eax_in != leaf->fn)
+ continue;
+
+ if (cpuid_function_is_indexed(leaf->fn) && e->ecx_in != leaf->subfn)
+ continue;
+
+ /*
+ * For 0xD subfunctions 0 and 1, only use the entry corresponding
+ * to the base/legacy XSAVE area size (XCR0=1 or XCR0=3, XSS=0).
+ * See the comments above snp_cpuid_calc_xsave_size() for more
+ * details.
+ */
+ if (e->eax_in == 0xD && (e->ecx_in == 0 || e->ecx_in == 1))
+ if (!(e->xcr0_in == 1 || e->xcr0_in == 3) || e->xss_in)
+ continue;
+
+ leaf->eax = e->eax;
+ leaf->ebx = e->ebx;
+ leaf->ecx = e->ecx;
+ leaf->edx = e->edx;
+
+ return true;
+ }
+
+ return false;
+}
+
+static void snp_cpuid_hv_msr(void *ctx, struct cpuid_leaf *leaf)
+{
+ if (__sev_cpuid_hv_msr(leaf))
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV);
+}
+
+static int
+snp_cpuid_postprocess(void (*cpuid_fn)(void *ctx, struct cpuid_leaf *leaf),
+ void *ctx, struct cpuid_leaf *leaf)
+{
+ struct cpuid_leaf leaf_hv = *leaf;
+
+ switch (leaf->fn) {
+ case 0x1:
+ cpuid_fn(ctx, &leaf_hv);
+
+ /* initial APIC ID */
+ leaf->ebx = (leaf_hv.ebx & GENMASK(31, 24)) | (leaf->ebx & GENMASK(23, 0));
+ /* APIC enabled bit */
+ leaf->edx = (leaf_hv.edx & BIT(9)) | (leaf->edx & ~BIT(9));
+
+ /* OSXSAVE enabled bit */
+ if (native_read_cr4() & X86_CR4_OSXSAVE)
+ leaf->ecx |= BIT(27);
+ break;
+ case 0x7:
+ /* OSPKE enabled bit */
+ leaf->ecx &= ~BIT(4);
+ if (native_read_cr4() & X86_CR4_PKE)
+ leaf->ecx |= BIT(4);
+ break;
+ case 0xB:
+ leaf_hv.subfn = 0;
+ cpuid_fn(ctx, &leaf_hv);
+
+ /* extended APIC ID */
+ leaf->edx = leaf_hv.edx;
+ break;
+ case 0xD: {
+ bool compacted = false;
+ u64 xcr0 = 1, xss = 0;
+ u32 xsave_size;
+
+ if (leaf->subfn != 0 && leaf->subfn != 1)
+ return 0;
+
+ if (native_read_cr4() & X86_CR4_OSXSAVE)
+ xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+ if (leaf->subfn == 1) {
+ /* Get XSS value if XSAVES is enabled. */
+ if (leaf->eax & BIT(3)) {
+ unsigned long lo, hi;
+
+ asm volatile("rdmsr" : "=a" (lo), "=d" (hi)
+ : "c" (MSR_IA32_XSS));
+ xss = (hi << 32) | lo;
+ }
+
+ /*
+ * The PPR and APM aren't clear on what size should be
+ * encoded in 0xD:0x1:EBX when compaction is not enabled
+ * by either XSAVEC (feature bit 1) or XSAVES (feature
+ * bit 3) since SNP-capable hardware has these feature
+ * bits fixed as 1. KVM sets it to 0 in this case, but
+ * to avoid this becoming an issue it's safer to simply
+ * treat this as unsupported for SNP guests.
+ */
+ if (!(leaf->eax & (BIT(1) | BIT(3))))
+ return -EINVAL;
+
+ compacted = true;
+ }
+
+ xsave_size = snp_cpuid_calc_xsave_size(xcr0 | xss, compacted);
+ if (!xsave_size)
+ return -EINVAL;
+
+ leaf->ebx = xsave_size;
+ }
+ break;
+ case 0x8000001E:
+ cpuid_fn(ctx, &leaf_hv);
+
+ /* extended APIC ID */
+ leaf->eax = leaf_hv.eax;
+ /* compute ID */
+ leaf->ebx = (leaf->ebx & GENMASK(31, 8)) | (leaf_hv.ebx & GENMASK(7, 0));
+ /* node ID */
+ leaf->ecx = (leaf->ecx & GENMASK(31, 8)) | (leaf_hv.ecx & GENMASK(7, 0));
+ break;
+ default:
+ /* No fix-ups needed, use values as-is. */
+ break;
+ }
+
+ return 0;
+}
+
+/*
+ * Returns -EOPNOTSUPP if the feature is not enabled. Any other non-zero
+ * return value should be treated as fatal by the caller.
+ */
+int snp_cpuid(void (*cpuid_fn)(void *ctx, struct cpuid_leaf *leaf),
+ void *ctx, struct cpuid_leaf *leaf)
+{
+ const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
+
+ if (!cpuid_table->count)
+ return -EOPNOTSUPP;
+
+ if (!snp_cpuid_get_validated_func(leaf)) {
+ /*
+ * Some hypervisors will avoid keeping track of CPUID entries
+ * where all values are zero, since they can be handled the
+		 * same as out-of-range values (all-zero). This is useful here
+		 * as well, since it allows virtually all guest configurations
+		 * to work using a single SNP CPUID table.
+ *
+ * To allow for this, there is a need to distinguish between
+ * out-of-range entries and in-range zero entries, since the
+ * CPUID table entries are only a template that may need to be
+ * augmented with additional values for things like
+ * CPU-specific information during post-processing. So if it's
+ * not in the table, set the values to zero. Then, if they are
+ * within a valid CPUID range, proceed with post-processing
+ * using zeros as the initial values. Otherwise, skip
+ * post-processing and just return zeros immediately.
+ */
+ leaf->eax = leaf->ebx = leaf->ecx = leaf->edx = 0;
+
+		/* Skip post-processing for out-of-range zero leaves. */
+ if (!(leaf->fn <= cpuid_std_range_max ||
+ (leaf->fn >= 0x40000000 && leaf->fn <= cpuid_hyp_range_max) ||
+ (leaf->fn >= 0x80000000 && leaf->fn <= cpuid_ext_range_max)))
+ return 0;
+ }
+
+ return snp_cpuid_postprocess(cpuid_fn, ctx, leaf);
+}
+
+/*
+ * Boot VC Handler - This is the first VC handler during boot. There is no
+ * GHCB page yet, so it only supports MSR-based communication with the
+ * hypervisor, and only for the CPUID exit-code.
+ */
+void do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
+{
+ unsigned int subfn = lower_bits(regs->cx, 32);
+ unsigned int fn = lower_bits(regs->ax, 32);
+ u16 opcode = *(unsigned short *)regs->ip;
+ struct cpuid_leaf leaf;
+ int ret;
+
+ /* Only CPUID is supported via MSR protocol */
+ if (exit_code != SVM_EXIT_CPUID)
+ goto fail;
+
+ /* Is it really a CPUID insn? */
+ if (opcode != 0xa20f)
+ goto fail;
+
+ leaf.fn = fn;
+ leaf.subfn = subfn;
+
+ /*
+ * If SNP is active, then snp_cpuid() uses the CPUID table to obtain the
+ * CPUID values (with possible HV interaction during post-processing of
+ * the values). But if SNP is not active (no CPUID table present), then
+ * snp_cpuid() returns -EOPNOTSUPP so that an SEV-ES guest can call the
+ * HV to obtain the CPUID information.
+ */
+ ret = snp_cpuid(snp_cpuid_hv_msr, NULL, &leaf);
+ if (!ret)
+ goto cpuid_done;
+
+ if (ret != -EOPNOTSUPP)
+ goto fail;
+
+ /*
+ * This is reached by a SEV-ES guest and needs to invoke the HV for
+ * the CPUID data.
+ */
+ if (__sev_cpuid_hv_msr(&leaf))
+ goto fail;
+
+cpuid_done:
+ regs->ax = leaf.eax;
+ regs->bx = leaf.ebx;
+ regs->cx = leaf.ecx;
+ regs->dx = leaf.edx;
+
+ /*
+ * This is a VC handler and the #VC is only raised when SEV-ES is
+ * active, which means SEV must be active too. Do sanity checks on the
+ * CPUID results to make sure the hypervisor does not trick the kernel
+ * into the no-sev path. This could map sensitive data unencrypted and
+ * make it accessible to the hypervisor.
+ *
+ * In particular, check for:
+ * - Availability of CPUID leaf 0x8000001f
+ * - SEV CPUID bit.
+ *
+ * The hypervisor might still report the wrong C-bit position, but this
+ * can't be checked here.
+ */
+
+ if (fn == 0x80000000 && (regs->ax < 0x8000001f))
+ /* SEV leaf check */
+ goto fail;
+ else if ((fn == 0x8000001f && !(regs->ax & BIT(1))))
+ /* SEV bit */
+ goto fail;
+
+ /* Skip over the CPUID two-byte opcode */
+ regs->ip += 2;
+
+ return;
+
+fail:
+ /* Terminate the guest */
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
+}
+
+struct cc_setup_data {
+ struct setup_data header;
+ u32 cc_blob_address;
+};
+
+/*
+ * Search for a Confidential Computing blob passed in as a setup_data entry
+ * via the Linux Boot Protocol.
+ */
+static __init
+struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp)
+{
+ struct cc_setup_data *sd = NULL;
+ struct setup_data *hdr;
+
+ hdr = (struct setup_data *)bp->hdr.setup_data;
+
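+	/* Walk the setup_data list looking for a SETUP_CC_BLOB entry. */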
+ while (hdr) {
+ if (hdr->type == SETUP_CC_BLOB) {
+ sd = (struct cc_setup_data *)hdr;
+ return (struct cc_blob_sev_info *)(unsigned long)sd->cc_blob_address;
+ }
+ hdr = (struct setup_data *)hdr->next;
+ }
+
+ return NULL;
+}
+
+/*
+ * Initialize the kernel's copy of the SNP CPUID table, and set up the
+ * pointer that will be used to access it.
+ *
+ * Maintaining a direct mapping of the SNP CPUID table used by firmware would
+ * be possible as an alternative, but the approach is brittle since the
+ * mapping needs to be updated in sync with all the changes to virtual memory
+ * layout and related mapping facilities throughout the boot process.
+ */
+static void __init setup_cpuid_table(const struct cc_blob_sev_info *cc_info)
+{
+ const struct snp_cpuid_table *cpuid_table_fw, *cpuid_table;
+ int i;
+
+ if (!cc_info || !cc_info->cpuid_phys || cc_info->cpuid_len < PAGE_SIZE)
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID);
+
+ cpuid_table_fw = (const struct snp_cpuid_table *)cc_info->cpuid_phys;
+ if (!cpuid_table_fw->count || cpuid_table_fw->count > SNP_CPUID_COUNT_MAX)
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID);
+
+ cpuid_table = snp_cpuid_get_table();
+ memcpy((void *)cpuid_table, cpuid_table_fw, sizeof(*cpuid_table));
+
+ /* Initialize CPUID ranges for range-checking. */
+ for (i = 0; i < cpuid_table->count; i++) {
+ const struct snp_cpuid_fn *fn = &cpuid_table->fn[i];
+
+ if (fn->eax_in == 0x0)
+ cpuid_std_range_max = fn->eax;
+ else if (fn->eax_in == 0x40000000)
+ cpuid_hyp_range_max = fn->eax;
+ else if (fn->eax_in == 0x80000000)
+ cpuid_ext_range_max = fn->eax;
+ }
+}
+
+static int svsm_call_msr_protocol(struct svsm_call *call)
+{
+ int ret;
+
+ do {
+ ret = svsm_perform_msr_protocol(call);
+ } while (ret == -EAGAIN);
+
+ return ret;
+}
+
+static void svsm_pval_4k_page(unsigned long paddr, bool validate,
+ struct svsm_ca *caa, u64 caa_pa)
+{
+ struct svsm_pvalidate_call *pc;
+ struct svsm_call call = {};
+ unsigned long flags;
+ u64 pc_pa;
+
+ /*
+	 * This can be called very early in boot, so use native functions in
+	 * order to avoid paravirt issues.
+ */
+ flags = native_local_irq_save();
+
+ call.caa = caa;
+
+ pc = (struct svsm_pvalidate_call *)call.caa->svsm_buffer;
+ pc_pa = caa_pa + offsetof(struct svsm_ca, svsm_buffer);
+
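+	/* Describe a single 4K page to be handled by the SVSM PVALIDATE call. */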
+ pc->num_entries = 1;
+ pc->cur_index = 0;
+ pc->entry[0].page_size = RMP_PG_SIZE_4K;
+ pc->entry[0].action = validate;
+ pc->entry[0].ignore_cf = 0;
+ pc->entry[0].rsvd = 0;
+ pc->entry[0].pfn = paddr >> PAGE_SHIFT;
+
+ /* Protocol 0, Call ID 1 */
+ call.rax = SVSM_CORE_CALL(SVSM_CORE_PVALIDATE);
+ call.rcx = pc_pa;
+
+ /*
+ * Use the MSR protocol exclusively, so that this code is usable in
+ * startup code where VA/PA translations of the GHCB page's address may
+ * be problematic.
+ */
+ if (svsm_call_msr_protocol(&call))
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE);
+
+ native_local_irq_restore(flags);
+}
+
+static void pvalidate_4k_page(unsigned long vaddr, unsigned long paddr,
+ bool validate, struct svsm_ca *caa, u64 caa_pa)
+{
+ int ret;
+
+ if (snp_vmpl) {
+ svsm_pval_4k_page(paddr, validate, caa, caa_pa);
+ } else {
+ ret = pvalidate(vaddr, RMP_PG_SIZE_4K, validate);
+ if (ret)
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE);
+ }
+
+ /*
+ * If validating memory (making it private) and affected by the
+ * cache-coherency vulnerability, perform the cache eviction mitigation.
+ */
+ if (validate && sev_snp_needs_sfw)
+ sev_evict_cache((void *)vaddr, 1);
+}
+
+static void __page_state_change(unsigned long vaddr, unsigned long paddr,
+ const struct psc_desc *desc)
+{
+ u64 val, msr;
+
+ /*
+ * If private -> shared then invalidate the page before requesting the
+ * state change in the RMP table.
+ */
+ if (desc->op == SNP_PAGE_STATE_SHARED)
+ pvalidate_4k_page(vaddr, paddr, false, desc->ca, desc->caa_pa);
+
+ /* Save the current GHCB MSR value */
+ msr = sev_es_rd_ghcb_msr();
+
+ /* Issue VMGEXIT to change the page state in RMP table. */
+ sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, desc->op));
+ VMGEXIT();
+
+ /* Read the response of the VMGEXIT. */
+ val = sev_es_rd_ghcb_msr();
+ if ((GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP) || GHCB_MSR_PSC_RESP_VAL(val))
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC);
+
+ /* Restore the GHCB MSR value */
+ sev_es_wr_ghcb_msr(msr);
+
+ /*
+	 * Now that the page state has been changed in the RMP table, validate the
+	 * page so that it is consistent with the RMP entry.
+ */
+ if (desc->op == SNP_PAGE_STATE_PRIVATE)
+ pvalidate_4k_page(vaddr, paddr, true, desc->ca, desc->caa_pa);
+}
+
+/*
+ * Maintain the GPA of the SVSM Calling Area (CA) in order to utilize the SVSM
+ * services needed when not running in VMPL0.
+ */
+static bool __init svsm_setup_ca(const struct cc_blob_sev_info *cc_info,
+ void *page)
+{
+ struct snp_secrets_page *secrets_page;
+ struct snp_cpuid_table *cpuid_table;
+ unsigned int i;
+ u64 caa;
+
+ BUILD_BUG_ON(sizeof(*secrets_page) != PAGE_SIZE);
+
+ /*
+ * Check if running at VMPL0.
+ *
+ * Use RMPADJUST (see the rmpadjust() function for a description of what
+ * the instruction does) to update the VMPL1 permissions of a page. If
+ * the guest is running at VMPL0, this will succeed and implies there is
+ * no SVSM. If the guest is running at any other VMPL, this will fail.
+ * Linux SNP guests only ever run at a single VMPL level, so permission-mask
+ * changes for a lesser-privileged VMPL are a don't-care.
+ *
+ * Use a rip-relative reference to obtain the proper address, since this
+ * routine is running identity mapped when called, both by the decompressor
+ * code and the early kernel code.
+ */
+ if (!rmpadjust((unsigned long)page, RMP_PG_SIZE_4K, 1))
+ return false;
+
+ /*
+ * Not running at VMPL0, ensure everything has been properly supplied
+ * for running under an SVSM.
+ */
+ if (!cc_info || !cc_info->secrets_phys || cc_info->secrets_len != PAGE_SIZE)
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SECRETS_PAGE);
+
+ secrets_page = (struct snp_secrets_page *)cc_info->secrets_phys;
+ if (!secrets_page->svsm_size)
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_NO_SVSM);
+
+ if (!secrets_page->svsm_guest_vmpl)
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SVSM_VMPL0);
+
+ snp_vmpl = secrets_page->svsm_guest_vmpl;
+
+ caa = secrets_page->svsm_caa;
+
+ /*
+ * An open-coded PAGE_ALIGNED() in order to avoid including
+ * kernel-proper headers into the decompressor.
+ */
+ if (caa & (PAGE_SIZE - 1))
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SVSM_CAA);
+
+ boot_svsm_caa_pa = caa;
+
+ /* Advertise the SVSM presence via CPUID. */
+ cpuid_table = (struct snp_cpuid_table *)snp_cpuid_get_table();
+ for (i = 0; i < cpuid_table->count; i++) {
+ struct snp_cpuid_fn *fn = &cpuid_table->fn[i];
+
+ if (fn->eax_in == 0x8000001f)
+ fn->eax |= BIT(28);
+ }
+
+ return true;
+}
diff --git a/arch/x86/boot/startup/sev-startup.c b/arch/x86/boot/startup/sev-startup.c
new file mode 100644
index 000000000000..09725428d3e6
--- /dev/null
+++ b/arch/x86/boot/startup/sev-startup.c
@@ -0,0 +1,220 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2019 SUSE
+ *
+ * Author: Joerg Roedel <jroedel@suse.de>
+ */
+
+#define pr_fmt(fmt) "SEV: " fmt
+
+#include <linux/percpu-defs.h>
+#include <linux/cc_platform.h>
+#include <linux/printk.h>
+#include <linux/mm_types.h>
+#include <linux/set_memory.h>
+#include <linux/memblock.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/cpumask.h>
+#include <linux/efi.h>
+#include <linux/io.h>
+#include <linux/psp-sev.h>
+#include <uapi/linux/sev-guest.h>
+
+#include <asm/init.h>
+#include <asm/cpu_entry_area.h>
+#include <asm/stacktrace.h>
+#include <asm/sev.h>
+#include <asm/sev-internal.h>
+#include <asm/insn-eval.h>
+#include <asm/fpu/xcr.h>
+#include <asm/processor.h>
+#include <asm/realmode.h>
+#include <asm/setup.h>
+#include <asm/traps.h>
+#include <asm/svm.h>
+#include <asm/smp.h>
+#include <asm/cpu.h>
+#include <asm/apic.h>
+#include <asm/cpuid/api.h>
+#include <asm/cmdline.h>
+
+/* Include code shared with pre-decompression boot stage */
+#include "sev-shared.c"
+
+void
+early_set_pages_state(unsigned long vaddr, unsigned long paddr,
+ unsigned long npages, const struct psc_desc *desc)
+{
+ unsigned long paddr_end;
+
+ vaddr = vaddr & PAGE_MASK;
+
+ paddr = paddr & PAGE_MASK;
+ paddr_end = paddr + (npages << PAGE_SHIFT);
+
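+	/* Issue one page state change request per 4K page in the range. */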
+ while (paddr < paddr_end) {
+ __page_state_change(vaddr, paddr, desc);
+
+ vaddr += PAGE_SIZE;
+ paddr += PAGE_SIZE;
+ }
+}
+
+void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
+ unsigned long npages)
+{
+ struct psc_desc d = {
+ SNP_PAGE_STATE_PRIVATE,
+ rip_rel_ptr(&boot_svsm_ca_page),
+ boot_svsm_caa_pa
+ };
+
+ /*
+ * This can be invoked in early boot while running identity mapped, so
+ * use an open coded check for SNP instead of using cc_platform_has().
+ * This eliminates worries about jump tables or checking boot_cpu_data
+ * in the cc_platform_has() function.
+ */
+ if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
+ return;
+
+ /*
+ * Ask the hypervisor to mark the memory pages as private in the RMP
+ * table.
+ */
+ early_set_pages_state(vaddr, paddr, npages, &d);
+}
+
+void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
+ unsigned long npages)
+{
+ struct psc_desc d = {
+ SNP_PAGE_STATE_SHARED,
+ rip_rel_ptr(&boot_svsm_ca_page),
+ boot_svsm_caa_pa
+ };
+
+ /*
+ * This can be invoked in early boot while running identity mapped, so
+ * use an open coded check for SNP instead of using cc_platform_has().
+ * This eliminates worries about jump tables or checking boot_cpu_data
+ * in the cc_platform_has() function.
+ */
+ if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
+ return;
+
+	/* Ask the hypervisor to mark the memory pages as shared in the RMP table. */
+ early_set_pages_state(vaddr, paddr, npages, &d);
+}
+
+/*
+ * Initial setup of SNP relies on information provided by the
+ * Confidential Computing blob, which can be passed to the kernel
+ * in the following ways, depending on how it is booted:
+ *
+ * - when booted via the boot/decompress kernel:
+ * - via boot_params
+ *
+ * - when booted directly by firmware/bootloader (e.g. CONFIG_PVH):
+ * - via a setup_data entry, as defined by the Linux Boot Protocol
+ *
+ * Scan for the blob in that order.
+ */
+static struct cc_blob_sev_info *__init find_cc_blob(struct boot_params *bp)
+{
+ struct cc_blob_sev_info *cc_info;
+
+ /* Boot kernel would have passed the CC blob via boot_params. */
+ if (bp->cc_blob_address) {
+ cc_info = (struct cc_blob_sev_info *)(unsigned long)bp->cc_blob_address;
+ goto found_cc_info;
+ }
+
+ /*
+	 * If the kernel was booted directly, without the use of the
+ * boot/decompression kernel, the CC blob may have been passed via
+ * setup_data instead.
+ */
+ cc_info = find_cc_blob_setup_data(bp);
+ if (!cc_info)
+ return NULL;
+
+found_cc_info:
+ if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC)
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
+
+ return cc_info;
+}
+
+static void __init svsm_setup(struct cc_blob_sev_info *cc_info)
+{
+ struct snp_secrets_page *secrets = (void *)cc_info->secrets_phys;
+ struct svsm_call call = {};
+ u64 pa;
+
+ /*
+ * Record the SVSM Calling Area address (CAA) if the guest is not
+	 * running at VMPL0. The CA will be used to communicate with the
+	 * SVSM to request SVSM services.
+ */
+ if (!svsm_setup_ca(cc_info, rip_rel_ptr(&boot_svsm_ca_page)))
+ return;
+
+ /*
+ * It is very early in the boot and the kernel is running identity
+ * mapped but without having adjusted the pagetables to where the
+	 * kernel was loaded (physbase), so get the CA address using
+	 * RIP-relative addressing.
+ */
+ pa = (u64)rip_rel_ptr(&boot_svsm_ca_page);
+
+ /*
+ * Switch over to the boot SVSM CA while the current CA is still 1:1
+ * mapped and thus addressable with VA == PA. There is no GHCB at this
+ * point so use the MSR protocol.
+ *
+ * SVSM_CORE_REMAP_CA call:
+ * RAX = 0 (Protocol=0, CallID=0)
+ * RCX = New CA GPA
+ */
+ call.caa = (struct svsm_ca *)secrets->svsm_caa;
+ call.rax = SVSM_CORE_CALL(SVSM_CORE_REMAP_CA);
+ call.rcx = pa;
+
+ if (svsm_call_msr_protocol(&call))
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SVSM_CA_REMAP_FAIL);
+
+ boot_svsm_caa_pa = pa;
+}
+
+bool __init snp_init(struct boot_params *bp)
+{
+ struct cc_blob_sev_info *cc_info;
+
+ if (!bp)
+ return false;
+
+ cc_info = find_cc_blob(bp);
+ if (!cc_info)
+ return false;
+
+ if (cc_info->secrets_phys && cc_info->secrets_len == PAGE_SIZE)
+ sev_secrets_pa = cc_info->secrets_phys;
+ else
+ return false;
+
+ setup_cpuid_table(cc_info);
+
+ svsm_setup(cc_info);
+
+ /*
+ * The CC blob will be used later to access the secrets page. Cache
+ * it here like the boot kernel does.
+ */
+ bp->cc_blob_address = (u32)(unsigned long)cc_info;
+
+ return true;
+}
diff --git a/arch/x86/boot/startup/sme.c b/arch/x86/boot/startup/sme.c
new file mode 100644
index 000000000000..e7ea65f3f1d6
--- /dev/null
+++ b/arch/x86/boot/startup/sme.c
@@ -0,0 +1,575 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ */
+
+/*
+ * Since we're dealing with identity mappings, physical and virtual
+ * addresses are the same, so override these defines which are ultimately
+ * used by the headers in misc.h.
+ */
+#define __pa(x) ((unsigned long)(x))
+#define __va(x) ((void *)((unsigned long)(x)))
+
+/*
+ * Special hack: we have to be careful, because no indirections are
+ * allowed here, and paravirt_ops is a kind of one. As it will only run in
+ * baremetal anyway, we just keep it from happening. (This list needs to
+ * be extended when new paravirt and debugging variants are added.)
+ */
+#undef CONFIG_PARAVIRT
+#undef CONFIG_PARAVIRT_XXL
+#undef CONFIG_PARAVIRT_SPINLOCKS
+
+/*
+ * This code runs before CPU feature bits are set. By default, the
+ * pgtable_l5_enabled() function uses bit X86_FEATURE_LA57 to determine if
+ * 5-level paging is active, so that won't work here. USE_EARLY_PGTABLE_L5
+ * is provided to handle this situation and, instead, use a variable that
+ * has been set by the early boot code.
+ */
+#define USE_EARLY_PGTABLE_L5
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/mem_encrypt.h>
+#include <linux/cc_platform.h>
+
+#include <asm/init.h>
+#include <asm/setup.h>
+#include <asm/sections.h>
+#include <asm/coco.h>
+#include <asm/sev.h>
+
+#define PGD_FLAGS _KERNPG_TABLE_NOENC
+#define P4D_FLAGS _KERNPG_TABLE_NOENC
+#define PUD_FLAGS _KERNPG_TABLE_NOENC
+#define PMD_FLAGS _KERNPG_TABLE_NOENC
+
+#define PMD_FLAGS_LARGE (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL)
+
+#define PMD_FLAGS_DEC PMD_FLAGS_LARGE
+#define PMD_FLAGS_DEC_WP ((PMD_FLAGS_DEC & ~_PAGE_LARGE_CACHE_MASK) | \
+ (_PAGE_PAT_LARGE | _PAGE_PWT))
+
+#define PMD_FLAGS_ENC (PMD_FLAGS_LARGE | _PAGE_ENC)
+
+#define PTE_FLAGS (__PAGE_KERNEL_EXEC & ~_PAGE_GLOBAL)
+
+#define PTE_FLAGS_DEC PTE_FLAGS
+#define PTE_FLAGS_DEC_WP ((PTE_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \
+ (_PAGE_PAT | _PAGE_PWT))
+
+#define PTE_FLAGS_ENC (PTE_FLAGS | _PAGE_ENC)
+
+struct sme_populate_pgd_data {
+ void *pgtable_area;
+ pgd_t *pgd;
+
+ pmdval_t pmd_flags;
+ pteval_t pte_flags;
+ unsigned long paddr;
+
+ unsigned long vaddr;
+ unsigned long vaddr_end;
+};
+
+/*
+ * This work area lives in the .init.scratch section, which lives outside of
+ * the kernel proper. It is sized to hold the intermediate copy buffer and
+ * more than enough pagetable pages.
+ *
+ * By using this section, the kernel can be encrypted in place and it
+ * avoids any possibility of boot parameters or initramfs images being
+ * placed such that the in-place encryption logic overwrites them. This
+ * section is 2MB aligned to allow for simple pagetable setup using only
+ * PMD entries (see vmlinux.lds.S).
+ */
+static char sme_workarea[2 * PMD_SIZE] __section(".init.scratch");
+
+static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd)
+{
+ unsigned long pgd_start, pgd_end, pgd_size;
+ pgd_t *pgd_p;
+
+ pgd_start = ppd->vaddr & PGDIR_MASK;
+ pgd_end = ppd->vaddr_end & PGDIR_MASK;
+
+ pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1) * sizeof(pgd_t);
+
+ pgd_p = ppd->pgd + pgd_index(ppd->vaddr);
+
+ memset(pgd_p, 0, pgd_size);
+}
+
+static pud_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+
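+	/* Allocate any missing intermediate page-table levels from ppd->pgtable_area. */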
+ pgd = ppd->pgd + pgd_index(ppd->vaddr);
+ if (pgd_none(*pgd)) {
+ p4d = ppd->pgtable_area;
+ memset(p4d, 0, sizeof(*p4d) * PTRS_PER_P4D);
+ ppd->pgtable_area += sizeof(*p4d) * PTRS_PER_P4D;
+ set_pgd(pgd, __pgd(PGD_FLAGS | __pa(p4d)));
+ }
+
+ p4d = p4d_offset(pgd, ppd->vaddr);
+ if (p4d_none(*p4d)) {
+ pud = ppd->pgtable_area;
+ memset(pud, 0, sizeof(*pud) * PTRS_PER_PUD);
+ ppd->pgtable_area += sizeof(*pud) * PTRS_PER_PUD;
+ set_p4d(p4d, __p4d(P4D_FLAGS | __pa(pud)));
+ }
+
+ pud = pud_offset(p4d, ppd->vaddr);
+ if (pud_none(*pud)) {
+ pmd = ppd->pgtable_area;
+ memset(pmd, 0, sizeof(*pmd) * PTRS_PER_PMD);
+ ppd->pgtable_area += sizeof(*pmd) * PTRS_PER_PMD;
+ set_pud(pud, __pud(PUD_FLAGS | __pa(pmd)));
+ }
+
+ if (pud_leaf(*pud))
+ return NULL;
+
+ return pud;
+}
+
+static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd)
+{
+ pud_t *pud;
+ pmd_t *pmd;
+
+ pud = sme_prepare_pgd(ppd);
+ if (!pud)
+ return;
+
+ pmd = pmd_offset(pud, ppd->vaddr);
+ if (pmd_leaf(*pmd))
+ return;
+
+ set_pmd(pmd, __pmd(ppd->paddr | ppd->pmd_flags));
+}
+
+static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd)
+{
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pud = sme_prepare_pgd(ppd);
+ if (!pud)
+ return;
+
+ pmd = pmd_offset(pud, ppd->vaddr);
+ if (pmd_none(*pmd)) {
+ pte = ppd->pgtable_area;
+ memset(pte, 0, sizeof(*pte) * PTRS_PER_PTE);
+ ppd->pgtable_area += sizeof(*pte) * PTRS_PER_PTE;
+ set_pmd(pmd, __pmd(PMD_FLAGS | __pa(pte)));
+ }
+
+ if (pmd_leaf(*pmd))
+ return;
+
+ pte = pte_offset_kernel(pmd, ppd->vaddr);
+ if (pte_none(*pte))
+ set_pte(pte, __pte(ppd->paddr | ppd->pte_flags));
+}
+
+static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd)
+{
+ while (ppd->vaddr < ppd->vaddr_end) {
+ sme_populate_pgd_large(ppd);
+
+ ppd->vaddr += PMD_SIZE;
+ ppd->paddr += PMD_SIZE;
+ }
+}
+
+static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd)
+{
+ while (ppd->vaddr < ppd->vaddr_end) {
+ sme_populate_pgd(ppd);
+
+ ppd->vaddr += PAGE_SIZE;
+ ppd->paddr += PAGE_SIZE;
+ }
+}
+
+static void __init __sme_map_range(struct sme_populate_pgd_data *ppd,
+ pmdval_t pmd_flags, pteval_t pte_flags)
+{
+ unsigned long vaddr_end;
+
+ ppd->pmd_flags = pmd_flags;
+ ppd->pte_flags = pte_flags;
+
+ /* Save original end value since we modify the struct value */
+ vaddr_end = ppd->vaddr_end;
+
+ /* If start is not 2MB aligned, create PTE entries */
+ ppd->vaddr_end = ALIGN(ppd->vaddr, PMD_SIZE);
+ __sme_map_range_pte(ppd);
+
+ /* Create PMD entries */
+ ppd->vaddr_end = vaddr_end & PMD_MASK;
+ __sme_map_range_pmd(ppd);
+
+ /* If end is not 2MB aligned, create PTE entries */
+ ppd->vaddr_end = vaddr_end;
+ __sme_map_range_pte(ppd);
+}
+
+static void __init sme_map_range_encrypted(struct sme_populate_pgd_data *ppd)
+{
+ __sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC);
+}
+
+static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd)
+{
+ __sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC);
+}
+
+static void __init sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd)
+{
+ __sme_map_range(ppd, PMD_FLAGS_DEC_WP, PTE_FLAGS_DEC_WP);
+}
+
+static unsigned long __init sme_pgtable_calc(unsigned long len)
+{
+ unsigned long entries = 0, tables = 0;
+
+ /*
+ * Perform a relatively simplistic calculation of the pagetable
+ * entries that are needed. Those mappings will be covered mostly
+ * by 2MB PMD entries so we can conservatively calculate the required
+ * number of P4D, PUD and PMD structures needed to perform the
+ * mappings. For mappings that are not 2MB aligned, PTE mappings
+ * would be needed for the start and end portion of the address range
+ * that fall outside of the 2MB alignment. This results in, at most,
+ * two extra pages to hold PTE entries for each range that is mapped.
+ * Incrementing the count for each covers the case where the addresses
+ * cross entries.
+ */
+
+	/* PGDIR_SIZE is equal to P4D_SIZE when 4-level paging is in use. */
+ if (PTRS_PER_P4D > 1)
+ entries += (DIV_ROUND_UP(len, PGDIR_SIZE) + 1) * sizeof(p4d_t) * PTRS_PER_P4D;
+ entries += (DIV_ROUND_UP(len, P4D_SIZE) + 1) * sizeof(pud_t) * PTRS_PER_PUD;
+ entries += (DIV_ROUND_UP(len, PUD_SIZE) + 1) * sizeof(pmd_t) * PTRS_PER_PMD;
+ entries += 2 * sizeof(pte_t) * PTRS_PER_PTE;
+
+ /*
+ * Now calculate the added pagetable structures needed to populate
+ * the new pagetables.
+ */
+
+ if (PTRS_PER_P4D > 1)
+ tables += DIV_ROUND_UP(entries, PGDIR_SIZE) * sizeof(p4d_t) * PTRS_PER_P4D;
+ tables += DIV_ROUND_UP(entries, P4D_SIZE) * sizeof(pud_t) * PTRS_PER_PUD;
+ tables += DIV_ROUND_UP(entries, PUD_SIZE) * sizeof(pmd_t) * PTRS_PER_PMD;
+
+ return entries + tables;
+}
+
+void __init sme_encrypt_kernel(struct boot_params *bp)
+{
+ unsigned long workarea_start, workarea_end, workarea_len;
+ unsigned long execute_start, execute_end, execute_len;
+ unsigned long kernel_start, kernel_end, kernel_len;
+ unsigned long initrd_start, initrd_end, initrd_len;
+ struct sme_populate_pgd_data ppd;
+ unsigned long pgtable_area_len;
+ unsigned long decrypted_base;
+
+ /*
+	 * This is early code, so use an open-coded check for SME instead of
+ * using cc_platform_has(). This eliminates worries about removing
+ * instrumentation or checking boot_cpu_data in the cc_platform_has()
+ * function.
+ */
+ if (!sme_get_me_mask() || sev_status & MSR_AMD64_SEV_ENABLED)
+ return;
+
+ /*
+ * Prepare for encrypting the kernel and initrd by building new
+ * pagetables with the necessary attributes needed to encrypt the
+ * kernel in place.
+ *
+ * One range of virtual addresses will map the memory occupied
+ * by the kernel and initrd as encrypted.
+ *
+ * Another range of virtual addresses will map the memory occupied
+ * by the kernel and initrd as decrypted and write-protected.
+ *
+ * The use of write-protect attribute will prevent any of the
+ * memory from being cached.
+ */
+
+ kernel_start = (unsigned long)rip_rel_ptr(_text);
+ kernel_end = ALIGN((unsigned long)rip_rel_ptr(_end), PMD_SIZE);
+ kernel_len = kernel_end - kernel_start;
+
+ initrd_start = 0;
+ initrd_end = 0;
+ initrd_len = 0;
+#ifdef CONFIG_BLK_DEV_INITRD
+ initrd_len = (unsigned long)bp->hdr.ramdisk_size |
+ ((unsigned long)bp->ext_ramdisk_size << 32);
+ if (initrd_len) {
+ initrd_start = (unsigned long)bp->hdr.ramdisk_image |
+ ((unsigned long)bp->ext_ramdisk_image << 32);
+ initrd_end = PAGE_ALIGN(initrd_start + initrd_len);
+ initrd_len = initrd_end - initrd_start;
+ }
+#endif
+
+ /*
+	 * Calculate the number of workarea bytes needed:
+ * executable encryption area size:
+ * stack page (PAGE_SIZE)
+ * encryption routine page (PAGE_SIZE)
+ * intermediate copy buffer (PMD_SIZE)
+ * pagetable structures for the encryption of the kernel
+ * pagetable structures for workarea (in case not currently mapped)
+ */
+ execute_start = workarea_start = (unsigned long)rip_rel_ptr(sme_workarea);
+ execute_end = execute_start + (PAGE_SIZE * 2) + PMD_SIZE;
+ execute_len = execute_end - execute_start;
+
+ /*
+ * One PGD for both encrypted and decrypted mappings and a set of
+ * PUDs and PMDs for each of the encrypted and decrypted mappings.
+ */
+ pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD;
+ pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2;
+ if (initrd_len)
+ pgtable_area_len += sme_pgtable_calc(initrd_len) * 2;
+
+ /* PUDs and PMDs needed in the current pagetables for the workarea */
+ pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len);
+
+ /*
+ * The total workarea includes the executable encryption area and
+	 * the pagetable area. The start of the workarea is already 2MB
+	 * aligned; align the end of the workarea on a 2MB boundary so that
+ * we don't try to create/allocate PTE entries from the workarea
+ * before it is mapped.
+ */
+ workarea_len = execute_len + pgtable_area_len;
+ workarea_end = ALIGN(workarea_start + workarea_len, PMD_SIZE);
+
+ /*
+ * Set the address to the start of where newly created pagetable
+ * structures (PGDs, PUDs and PMDs) will be allocated. New pagetable
+ * structures are created when the workarea is added to the current
+ * pagetables and when the new encrypted and decrypted kernel
+ * mappings are populated.
+ */
+ ppd.pgtable_area = (void *)execute_end;
+
+ /*
+ * Make sure the current pagetable structure has entries for
+ * addressing the workarea.
+ */
+ ppd.pgd = (pgd_t *)native_read_cr3_pa();
+ ppd.paddr = workarea_start;
+ ppd.vaddr = workarea_start;
+ ppd.vaddr_end = workarea_end;
+ sme_map_range_decrypted(&ppd);
+
+ /* Flush the TLB - no globals so cr3 is enough */
+ native_write_cr3(__native_read_cr3());
+
+ /*
+ * A new pagetable structure is being built to allow for the kernel
+ * and initrd to be encrypted. It starts with an empty PGD that will
+ * then be populated with new PUDs and PMDs as the encrypted and
+ * decrypted kernel mappings are created.
+ */
+ ppd.pgd = ppd.pgtable_area;
+ memset(ppd.pgd, 0, sizeof(pgd_t) * PTRS_PER_PGD);
+ ppd.pgtable_area += sizeof(pgd_t) * PTRS_PER_PGD;
+
+ /*
+ * A different PGD index/entry must be used to get different
+ * pagetable entries for the decrypted mapping. Choose the next
+ * PGD index and convert it to a virtual address to be used as
+ * the base of the mapping.
+ */
+ decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1);
+ if (initrd_len) {
+ unsigned long check_base;
+
+ check_base = (pgd_index(initrd_end) + 1) & (PTRS_PER_PGD - 1);
+ decrypted_base = max(decrypted_base, check_base);
+ }
+ decrypted_base <<= PGDIR_SHIFT;
+
+ /* Add encrypted kernel (identity) mappings */
+ ppd.paddr = kernel_start;
+ ppd.vaddr = kernel_start;
+ ppd.vaddr_end = kernel_end;
+ sme_map_range_encrypted(&ppd);
+
+ /* Add decrypted, write-protected kernel (non-identity) mappings */
+ ppd.paddr = kernel_start;
+ ppd.vaddr = kernel_start + decrypted_base;
+ ppd.vaddr_end = kernel_end + decrypted_base;
+ sme_map_range_decrypted_wp(&ppd);
+
+ if (initrd_len) {
+ /* Add encrypted initrd (identity) mappings */
+ ppd.paddr = initrd_start;
+ ppd.vaddr = initrd_start;
+ ppd.vaddr_end = initrd_end;
+ sme_map_range_encrypted(&ppd);
+ /*
+ * Add decrypted, write-protected initrd (non-identity) mappings
+ */
+ ppd.paddr = initrd_start;
+ ppd.vaddr = initrd_start + decrypted_base;
+ ppd.vaddr_end = initrd_end + decrypted_base;
+ sme_map_range_decrypted_wp(&ppd);
+ }
+
+ /* Add decrypted workarea mappings to both kernel mappings */
+ ppd.paddr = workarea_start;
+ ppd.vaddr = workarea_start;
+ ppd.vaddr_end = workarea_end;
+ sme_map_range_decrypted(&ppd);
+
+ ppd.paddr = workarea_start;
+ ppd.vaddr = workarea_start + decrypted_base;
+ ppd.vaddr_end = workarea_end + decrypted_base;
+ sme_map_range_decrypted(&ppd);
+
+ /* Perform the encryption */
+ sme_encrypt_execute(kernel_start, kernel_start + decrypted_base,
+ kernel_len, workarea_start, (unsigned long)ppd.pgd);
+
+ if (initrd_len)
+ sme_encrypt_execute(initrd_start, initrd_start + decrypted_base,
+ initrd_len, workarea_start,
+ (unsigned long)ppd.pgd);
+
+ /*
+ * At this point we are running encrypted. Remove the mappings for
+ * the decrypted areas - all that is needed for this is to remove
+ * the PGD entry/entries.
+ */
+ ppd.vaddr = kernel_start + decrypted_base;
+ ppd.vaddr_end = kernel_end + decrypted_base;
+ sme_clear_pgd(&ppd);
+
+ if (initrd_len) {
+ ppd.vaddr = initrd_start + decrypted_base;
+ ppd.vaddr_end = initrd_end + decrypted_base;
+ sme_clear_pgd(&ppd);
+ }
+
+ ppd.vaddr = workarea_start + decrypted_base;
+ ppd.vaddr_end = workarea_end + decrypted_base;
+ sme_clear_pgd(&ppd);
+
+ /* Flush the TLB - no globals so cr3 is enough */
+ native_write_cr3(__native_read_cr3());
+}
+
+void __init sme_enable(struct boot_params *bp)
+{
+ unsigned int eax, ebx, ecx, edx;
+ unsigned long feature_mask;
+ unsigned long me_mask;
+ bool snp_en;
+ u64 msr;
+
+ snp_en = snp_init(bp);
+
+ /* Check for the SME/SEV support leaf */
+ eax = 0x80000000;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ if (eax < 0x8000001f)
+ return;
+
+#define AMD_SME_BIT BIT(0)
+#define AMD_SEV_BIT BIT(1)
+
+ /*
+ * Check for the SME/SEV feature:
+ * CPUID Fn8000_001F[EAX]
+ * - Bit 0 - Secure Memory Encryption support
+ * - Bit 1 - Secure Encrypted Virtualization support
+ * CPUID Fn8000_001F[EBX]
+ * - Bits 5:0 - Pagetable bit position used to indicate encryption
+ */
+ eax = 0x8000001f;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ /* Check whether SEV or SME is supported */
+ if (!(eax & (AMD_SEV_BIT | AMD_SME_BIT)))
+ return;
+
+ me_mask = 1UL << (ebx & 0x3f);
+ sev_snp_needs_sfw = !(ebx & BIT(31));
+
+ /* Check the SEV MSR whether SEV or SME is enabled */
+ sev_status = msr = native_rdmsrq(MSR_AMD64_SEV);
+ feature_mask = (msr & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT;
+
+ /*
+	 * Any discrepancy between the presence of a CC blob and SNP
+	 * enablement aborts the guest.
+ */
+ if (snp_en ^ !!(msr & MSR_AMD64_SEV_SNP_ENABLED))
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
+
+ /* Check if memory encryption is enabled */
+ if (feature_mask == AMD_SME_BIT) {
+ if (!(bp->hdr.xloadflags & XLF_MEM_ENCRYPTION))
+ return;
+
+ /*
+ * No SME if Hypervisor bit is set. This check is here to
+		 * prevent a guest from trying to enable SME. When running as a
+		 * KVM guest, checking MSR_AMD64_SYSCFG would be sufficient, but
+		 * there might be other hypervisors which emulate that MSR as
+		 * non-zero or even pass it through to the guest.
+ * A malicious hypervisor can still trick a guest into this
+ * path, but there is no way to protect against that.
+ */
+ eax = 1;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ if (ecx & BIT(31))
+ return;
+
+ /* For SME, check the SYSCFG MSR */
+ msr = native_rdmsrq(MSR_AMD64_SYSCFG);
+ if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
+ return;
+ }
+
+ sme_me_mask = me_mask;
+ physical_mask &= ~me_mask;
+ cc_vendor = CC_VENDOR_AMD;
+ cc_set_mask(me_mask);
+}
+
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
+/* Local version for startup code, which never operates on user page tables */
+pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd)
+{
+ return pgd;
+}
+#endif
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
index f94b7a0c2abf..b25c6a9303b7 100644
--- a/arch/x86/boot/string.c
+++ b/arch/x86/boot/string.c
@@ -1,27 +1,58 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* -*- linux-c -*- ------------------------------------------------------- *
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007 rPath, Inc. - All Rights Reserved
*
- * This file is part of the Linux kernel, and is made available under
- * the terms of the GNU General Public License version 2.
- *
* ----------------------------------------------------------------------- */
/*
* Very basic string functions
*/
-#include "boot.h"
+#include <linux/types.h>
+#include <linux/compiler.h>
+#include <linux/errno.h>
+#include <linux/limits.h>
+#include <asm/asm.h>
+#include "ctype.h"
+#include "string.h"
+
+#define KSTRTOX_OVERFLOW (1U << 31)
+
+/*
+ * Undef these macros so that the functions that we provide
+ * here will have the correct names regardless of how string.h
+ * may have chosen to #define them.
+ */
+#undef memcpy
+#undef memset
+#undef memcmp
+
+int memcmp(const void *s1, const void *s2, size_t len)
+{
+ bool diff;
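+	/* 'repe cmpsb' clears ZF on the first mismatch; capture !ZF as the result. */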
+ asm("repe cmpsb"
+ : "=@ccnz" (diff), "+D" (s1), "+S" (s2), "+c" (len));
+ return diff;
+}
+
+/*
+ * Clang may lower `memcmp == 0` to `bcmp == 0`.
+ */
+int bcmp(const void *s1, const void *s2, size_t len)
+{
+ return memcmp(s1, s2, len);
+}
int strcmp(const char *str1, const char *str2)
{
const unsigned char *s1 = (const unsigned char *)str1;
const unsigned char *s2 = (const unsigned char *)str2;
- int delta = 0;
+ int delta;
while (*s1 || *s2) {
- delta = *s2 - *s1;
+ delta = *s1 - *s2;
if (delta)
return delta;
s1++;
@@ -30,6 +61,22 @@ int strcmp(const char *str1, const char *str2)
return 0;
}
+int strncmp(const char *cs, const char *ct, size_t count)
+{
+ unsigned char c1, c2;
+
+ while (count) {
+ c1 = *cs++;
+ c2 = *ct++;
+ if (c1 != c2)
+ return c1 < c2 ? -1 : 1;
+ if (!c1)
+ break;
+ count--;
+ }
+ return 0;
+}
+
size_t strnlen(const char *s, size_t maxlen)
{
const char *es = s;
@@ -41,10 +88,283 @@ size_t strnlen(const char *s, size_t maxlen)
return (es - s);
}
-unsigned int atou(const char *s)
+/* Works only for digits and letters, but small and fast */
+#define TOLOWER(x) ((x) | 0x20)
+
+static unsigned int simple_guess_base(const char *cp)
+{
+ if (cp[0] == '0') {
+ if (TOLOWER(cp[1]) == 'x' && isxdigit(cp[2]))
+ return 16;
+ else
+ return 8;
+ } else {
+ return 10;
+ }
+}
+
+/**
+ * simple_strtoull - convert a string to an unsigned long long
+ * @cp: The start of the string
+ * @endp: A pointer to the end of the parsed string will be placed here
+ * @base: The number base to use
+ */
+unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base)
+{
+ unsigned long long result = 0;
+
+ if (!base)
+ base = simple_guess_base(cp);
+
+ if (base == 16 && cp[0] == '0' && TOLOWER(cp[1]) == 'x')
+ cp += 2;
+
+ while (isxdigit(*cp)) {
+ unsigned int value;
+
+ value = isdigit(*cp) ? *cp - '0' : TOLOWER(*cp) - 'a' + 10;
+ if (value >= base)
+ break;
+ result = result * base + value;
+ cp++;
+ }
+ if (endp)
+ *endp = (char *)cp;
+
+ return result;
+}
+
+long simple_strtol(const char *cp, char **endp, unsigned int base)
+{
+ if (*cp == '-')
+ return -simple_strtoull(cp + 1, endp, base);
+
+ return simple_strtoull(cp, endp, base);
+}
+
+/**
+ * strlen - Find the length of a string
+ * @s: The string to be sized
+ */
+size_t strlen(const char *s)
+{
+ const char *sc;
+
+ for (sc = s; *sc != '\0'; ++sc)
+ /* nothing */;
+ return sc - s;
+}
+
+/**
+ * strstr - Find the first substring in a %NUL terminated string
+ * @s1: The string to be searched
+ * @s2: The string to search for
+ */
+char *strstr(const char *s1, const char *s2)
+{
+ size_t l1, l2;
+
+ l2 = strlen(s2);
+ if (!l2)
+ return (char *)s1;
+ l1 = strlen(s1);
+ while (l1 >= l2) {
+ l1--;
+ if (!memcmp(s1, s2, l2))
+ return (char *)s1;
+ s1++;
+ }
+ return NULL;
+}
+
+/**
+ * strchr - Find the first occurrence of the character c in the string s.
+ * @s: the string to be searched
+ * @c: the character to search for
+ */
+char *strchr(const char *s, int c)
+{
+ while (*s != (char)c)
+ if (*s++ == '\0')
+ return NULL;
+ return (char *)s;
+}
+
+static inline u64 __div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
+{
+ union {
+ u64 v64;
+ u32 v32[2];
+ } d = { dividend };
+ u32 upper;
+
+ upper = d.v32[1];
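+	/* Reduce the upper 32 bits first so that the 64/32 'divl' cannot overflow. */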
+ d.v32[1] = 0;
+ if (upper >= divisor) {
+ d.v32[1] = upper / divisor;
+ upper %= divisor;
+ }
+ asm ("divl %2" : "=a" (d.v32[0]), "=d" (*remainder) :
+ "rm" (divisor), "0" (d.v32[0]), "1" (upper));
+ return d.v64;
+}
+
+static inline u64 __div_u64(u64 dividend, u32 divisor)
+{
+ u32 remainder;
+
+ return __div_u64_rem(dividend, divisor, &remainder);
+}
+
+static inline char _tolower(const char c)
+{
+ return c | 0x20;
+}
+
+static const char *_parse_integer_fixup_radix(const char *s, unsigned int *base)
+{
+ if (*base == 0) {
+ if (s[0] == '0') {
+ if (_tolower(s[1]) == 'x' && isxdigit(s[2]))
+ *base = 16;
+ else
+ *base = 8;
+ } else
+ *base = 10;
+ }
+ if (*base == 16 && s[0] == '0' && _tolower(s[1]) == 'x')
+ s += 2;
+ return s;
+}
+
+/*
+ * Convert a non-negative integer string representation in an explicitly
+ * given radix to an integer.
+ * Return the number of characters consumed, possibly OR-ed with the
+ * overflow bit (KSTRTOX_OVERFLOW).
+ * If overflow occurs, the (incorrect) result integer is still returned.
+ *
+ * Don't you dare use this function.
+ */
+static unsigned int _parse_integer(const char *s,
+ unsigned int base,
+ unsigned long long *p)
+{
+ unsigned long long res;
+ unsigned int rv;
+
+ res = 0;
+ rv = 0;
+ while (1) {
+ unsigned int c = *s;
+ unsigned int lc = c | 0x20; /* don't tolower() this line */
+ unsigned int val;
+
+ if ('0' <= c && c <= '9')
+ val = c - '0';
+ else if ('a' <= lc && lc <= 'f')
+ val = lc - 'a' + 10;
+ else
+ break;
+
+ if (val >= base)
+ break;
+ /*
+ * Check for overflow only if we are within range of
+ * it in the max base we support (16)
+ */
+ if (unlikely(res & (~0ull << 60))) {
+ if (res > __div_u64(ULLONG_MAX - val, base))
+ rv |= KSTRTOX_OVERFLOW;
+ }
+ res = res * base + val;
+ rv++;
+ s++;
+ }
+ *p = res;
+ return rv;
+}
+
+static int _kstrtoull(const char *s, unsigned int base, unsigned long long *res)
+{
+ unsigned long long _res;
+ unsigned int rv;
+
+ s = _parse_integer_fixup_radix(s, &base);
+ rv = _parse_integer(s, base, &_res);
+ if (rv & KSTRTOX_OVERFLOW)
+ return -ERANGE;
+ if (rv == 0)
+ return -EINVAL;
+ s += rv;
+ if (*s == '\n')
+ s++;
+ if (*s)
+ return -EINVAL;
+ *res = _res;
+ return 0;
+}
+
+/**
+ * kstrtoull - convert a string to an unsigned long long
+ * @s: The start of the string. The string must be null-terminated, and may also
+ * include a single newline before its terminating null. The first character
+ * may also be a plus sign, but not a minus sign.
+ * @base: The number base to use. The maximum supported base is 16. If base is
+ * given as 0, then the base of the string is automatically detected with the
+ * conventional semantics - If it begins with 0x the number will be parsed as a
+ * hexadecimal (case insensitive), if it otherwise begins with 0, it will be
+ * parsed as an octal number. Otherwise it will be parsed as a decimal.
+ * @res: Where to write the result of the conversion on success.
+ *
+ * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
+ * Used as a replacement for the obsolete simple_strtoull. Return code must
+ * be checked.
+ */
+int kstrtoull(const char *s, unsigned int base, unsigned long long *res)
+{
+ if (s[0] == '+')
+ s++;
+ return _kstrtoull(s, base, res);
+}
+
+static int _kstrtoul(const char *s, unsigned int base, unsigned long *res)
+{
+ unsigned long long tmp;
+ int rv;
+
+ rv = kstrtoull(s, base, &tmp);
+ if (rv < 0)
+ return rv;
+ if (tmp != (unsigned long)tmp)
+ return -ERANGE;
+ *res = tmp;
+ return 0;
+}
+
+/**
+ * boot_kstrtoul - convert a string to an unsigned long
+ * @s: The start of the string. The string must be null-terminated, and may also
+ * include a single newline before its terminating null. The first character
+ * may also be a plus sign, but not a minus sign.
+ * @base: The number base to use. The maximum supported base is 16. If base is
+ * given as 0, then the base of the string is automatically detected with the
+ * conventional semantics - If it begins with 0x the number will be parsed as a
+ * hexadecimal (case insensitive), if it otherwise begins with 0, it will be
+ * parsed as an octal number. Otherwise it will be parsed as a decimal.
+ * @res: Where to write the result of the conversion on success.
+ *
+ * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
+ * Used as a replacement for the obsolete simple_strtoull.
+ */
+int boot_kstrtoul(const char *s, unsigned int base, unsigned long *res)
{
- unsigned int i = 0;
- while (isdigit(*s))
- i = i * 10 + (*s++ - '0');
- return i;
+ /*
+	 * We want to shortcut the function call, but
+ * __builtin_types_compatible_p(unsigned long, unsigned long long) = 0.
+ */
+ if (sizeof(unsigned long) == sizeof(unsigned long long) &&
+ __alignof__(unsigned long) == __alignof__(unsigned long long))
+ return kstrtoull(s, base, (unsigned long long *)res);
+ else
+ return _kstrtoul(s, base, res);
}
diff --git a/arch/x86/boot/string.h b/arch/x86/boot/string.h
new file mode 100644
index 000000000000..a5b05ebc037d
--- /dev/null
+++ b/arch/x86/boot/string.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef BOOT_STRING_H
+#define BOOT_STRING_H
+
+/* Undef any of these macros coming from string_32.h. */
+#undef memcpy
+#undef memset
+#undef memcmp
+
+void *memcpy(void *dst, const void *src, size_t len);
+void *memmove(void *dst, const void *src, size_t len);
+void *memset(void *dst, int c, size_t len);
+int memcmp(const void *s1, const void *s2, size_t len);
+int bcmp(const void *s1, const void *s2, size_t len);
+
+/* Access builtin version by default. */
+#define memcpy(d,s,l) __builtin_memcpy(d,s,l)
+#define memset(d,c,l) __builtin_memset(d,c,l)
+#define memcmp __builtin_memcmp
+
+extern int strcmp(const char *str1, const char *str2);
+extern int strncmp(const char *cs, const char *ct, size_t count);
+extern size_t strlen(const char *s);
+extern char *strstr(const char *s1, const char *s2);
+extern char *strchr(const char *s, int c);
+extern size_t strnlen(const char *s, size_t maxlen);
+extern unsigned long long simple_strtoull(const char *cp, char **endp,
+ unsigned int base);
+long simple_strtol(const char *cp, char **endp, unsigned int base);
+
+int kstrtoull(const char *s, unsigned int base, unsigned long long *res);
+int boot_kstrtoul(const char *s, unsigned int base, unsigned long *res);
+#endif /* BOOT_STRING_H */
diff --git a/arch/x86/boot/tools/.gitignore b/arch/x86/boot/tools/.gitignore
deleted file mode 100644
index 378eac25d311..000000000000
--- a/arch/x86/boot/tools/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-build
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
deleted file mode 100644
index ee3a4ea923ac..000000000000
--- a/arch/x86/boot/tools/build.c
+++ /dev/null
@@ -1,247 +0,0 @@
-/*
- * Copyright (C) 1991, 1992 Linus Torvalds
- * Copyright (C) 1997 Martin Mares
- * Copyright (C) 2007 H. Peter Anvin
- */
-
-/*
- * This file builds a disk-image from two different files:
- *
- * - setup: 8086 machine code, sets up system parm
- * - system: 80386 code for actual system
- *
- * It does some checking that all files are of the correct type, and
- * just writes the result to stdout, removing headers and padding to
- * the right amount. It also writes some system data to stderr.
- */
-
-/*
- * Changes by tytso to allow root device specification
- * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
- * Cross compiling fixes by Gertjan van Wingerde, July 1996
- * Rewritten by Martin Mares, April 1997
- * Substantially overhauled by H. Peter Anvin, April 2007
- */
-
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#include <stdarg.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/sysmacros.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/mman.h>
-#include <asm/boot.h>
-
-typedef unsigned char u8;
-typedef unsigned short u16;
-typedef unsigned long u32;
-
-#define DEFAULT_MAJOR_ROOT 0
-#define DEFAULT_MINOR_ROOT 0
-
-/* Minimal number of setup sectors */
-#define SETUP_SECT_MIN 5
-#define SETUP_SECT_MAX 64
-
-/* This must be large enough to hold the entire setup */
-u8 buf[SETUP_SECT_MAX*512];
-int is_big_kernel;
-
-/*----------------------------------------------------------------------*/
-
-static const u32 crctab32[] = {
- 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419,
- 0x706af48f, 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4,
- 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07,
- 0x90bf1d91, 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de,
- 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, 0x136c9856,
- 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9,
- 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4,
- 0xa2677172, 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b,
- 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3,
- 0x45df5c75, 0xdcd60dcf, 0xabd13d59, 0x26d930ac, 0x51de003a,
- 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, 0xcfba9599,
- 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
- 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190,
- 0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f,
- 0x9fbfe4a5, 0xe8b8d433, 0x7807c9a2, 0x0f00f934, 0x9609a88e,
- 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01,
- 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed,
- 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950,
- 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3,
- 0xfbd44c65, 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2,
- 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a,
- 0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5,
- 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa, 0xbe0b1010,
- 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
- 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17,
- 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6,
- 0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af, 0x04db2615,
- 0x73dc1683, 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8,
- 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, 0xf00f9344,
- 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb,
- 0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a,
- 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5,
- 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1,
- 0xa6bc5767, 0x3fb506dd, 0x48b2364b, 0xd80d2bda, 0xaf0a1b4c,
- 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, 0x316e8eef,
- 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
- 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe,
- 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31,
- 0x2cd99e8b, 0x5bdeae1d, 0x9b64c2b0, 0xec63f226, 0x756aa39c,
- 0x026d930a, 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713,
- 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b,
- 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242,
- 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1,
- 0x18b74777, 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c,
- 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45, 0xa00ae278,
- 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7,
- 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc, 0x40df0b66,
- 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
- 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605,
- 0xcdd70693, 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8,
- 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b,
- 0x2d02ef8d
-};
-
-static u32 partial_crc32_one(u8 c, u32 crc)
-{
- return crctab32[(crc ^ c) & 0xff] ^ (crc >> 8);
-}
-
-static u32 partial_crc32(const u8 *s, int len, u32 crc)
-{
- while (len--)
- crc = partial_crc32_one(*s++, crc);
- return crc;
-}
-
-static void die(const char * str, ...)
-{
- va_list args;
- va_start(args, str);
- vfprintf(stderr, str, args);
- fputc('\n', stderr);
- exit(1);
-}
-
-static void usage(void)
-{
- die("Usage: build setup system [rootdev] [> image]");
-}
-
-int main(int argc, char ** argv)
-{
- unsigned int i, sz, setup_sectors;
- int c;
- u32 sys_size;
- u8 major_root, minor_root;
- struct stat sb;
- FILE *file;
- int fd;
- void *kernel;
- u32 crc = 0xffffffffUL;
-
- if ((argc < 3) || (argc > 4))
- usage();
- if (argc > 3) {
- if (!strcmp(argv[3], "CURRENT")) {
- if (stat("/", &sb)) {
- perror("/");
- die("Couldn't stat /");
- }
- major_root = major(sb.st_dev);
- minor_root = minor(sb.st_dev);
- } else if (strcmp(argv[3], "FLOPPY")) {
- if (stat(argv[3], &sb)) {
- perror(argv[3]);
- die("Couldn't stat root device.");
- }
- major_root = major(sb.st_rdev);
- minor_root = minor(sb.st_rdev);
- } else {
- major_root = 0;
- minor_root = 0;
- }
- } else {
- major_root = DEFAULT_MAJOR_ROOT;
- minor_root = DEFAULT_MINOR_ROOT;
- }
- fprintf(stderr, "Root device is (%d, %d)\n", major_root, minor_root);
-
- /* Copy the setup code */
- file = fopen(argv[1], "r");
- if (!file)
- die("Unable to open `%s': %m", argv[1]);
- c = fread(buf, 1, sizeof(buf), file);
- if (ferror(file))
- die("read-error on `setup'");
- if (c < 1024)
- die("The setup must be at least 1024 bytes");
- if (buf[510] != 0x55 || buf[511] != 0xaa)
- die("Boot block hasn't got boot flag (0xAA55)");
- fclose(file);
-
- /* Pad unused space with zeros */
- setup_sectors = (c + 511) / 512;
- if (setup_sectors < SETUP_SECT_MIN)
- setup_sectors = SETUP_SECT_MIN;
- i = setup_sectors*512;
- memset(buf+c, 0, i-c);
-
- /* Set the default root device */
- buf[508] = minor_root;
- buf[509] = major_root;
-
- fprintf(stderr, "Setup is %d bytes (padded to %d bytes).\n", c, i);
-
- /* Open and stat the kernel file */
- fd = open(argv[2], O_RDONLY);
- if (fd < 0)
- die("Unable to open `%s': %m", argv[2]);
- if (fstat(fd, &sb))
- die("Unable to stat `%s': %m", argv[2]);
- sz = sb.st_size;
- fprintf (stderr, "System is %d kB\n", (sz+1023)/1024);
- kernel = mmap(NULL, sz, PROT_READ, MAP_SHARED, fd, 0);
- if (kernel == MAP_FAILED)
- die("Unable to mmap '%s': %m", argv[2]);
- /* Number of 16-byte paragraphs, including space for a 4-byte CRC */
- sys_size = (sz + 15 + 4) / 16;
-
- /* Patch the setup code with the appropriate size parameters */
- buf[0x1f1] = setup_sectors-1;
- buf[0x1f4] = sys_size;
- buf[0x1f5] = sys_size >> 8;
- buf[0x1f6] = sys_size >> 16;
- buf[0x1f7] = sys_size >> 24;
-
- crc = partial_crc32(buf, i, crc);
- if (fwrite(buf, 1, i, stdout) != i)
- die("Writing setup failed");
-
- /* Copy the kernel code */
- crc = partial_crc32(kernel, sz, crc);
- if (fwrite(kernel, 1, sz, stdout) != sz)
- die("Writing kernel failed");
-
- /* Add padding leaving 4 bytes for the checksum */
- while (sz++ < (sys_size*16) - 4) {
- crc = partial_crc32_one('\0', crc);
- if (fwrite("\0", 1, 1, stdout) != 1)
- die("Writing padding failed");
- }
-
- /* Write the CRC */
- fprintf(stderr, "CRC %lx\n", crc);
- if (fwrite(&crc, 1, 4, stdout) != 4)
- die("Writing CRC failed");
-
- close(fd);
-
- /* Everything is OK */
- return 0;
-}
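
The removed build tool computes a standard reflected CRC-32 (polynomial 0xedb88320) over the padded setup image and the kernel image, seeds it with 0xffffffff, and appends the raw 32-bit value with no final XOR. A minimal stand-alone sketch of the same table-driven computation, generating the table at run time instead of hard-coding it as above:

#include <stdint.h>
#include <stddef.h>

static uint32_t crctab32[256];

static void crc32_init(void)
{
	for (uint32_t n = 0; n < 256; n++) {
		uint32_t c = n;

		for (int k = 0; k < 8; k++)
			c = (c & 1) ? 0xedb88320 ^ (c >> 1) : c >> 1;
		crctab32[n] = c;
	}
}

static uint32_t partial_crc32(const uint8_t *s, size_t len, uint32_t crc)
{
	while (len--)
		crc = crctab32[(crc ^ *s++) & 0xff] ^ (crc >> 8);
	return crc;
}

/* Usage: crc32_init(); crc = partial_crc32(buf, len, 0xffffffff); */
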
diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c
index 01ec69c901c7..f7eb976b0a4b 100644
--- a/arch/x86/boot/tty.c
+++ b/arch/x86/boot/tty.c
@@ -1,32 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* -*- linux-c -*- ------------------------------------------------------- *
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007 rPath, Inc. - All Rights Reserved
* Copyright 2009 Intel Corporation; author H. Peter Anvin
*
- * This file is part of the Linux kernel, and is made available under
- * the terms of the GNU General Public License version 2.
- *
* ----------------------------------------------------------------------- */
/*
- * Very simple screen I/O
- * XXX: Probably should add very simple serial I/O?
+ * Very simple screen and serial I/O
*/
#include "boot.h"
+int early_serial_base;
+
+#define XMTRDY 0x20
+
+#define TXR 0 /* Transmit register (WRITE) */
+#define LSR 5 /* Line Status */
+
/*
* These functions are in .inittext so they can be used to signal
* error during initialization.
*/
-void __attribute__((section(".inittext"))) putchar(int ch)
+static void __section(".inittext") serial_putchar(int ch)
{
- struct biosregs ireg;
+ unsigned timeout = 0xffff;
- if (ch == '\n')
- putchar('\r'); /* \n -> \r\n */
+ while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
+ cpu_relax();
+
+ outb(ch, early_serial_base + TXR);
+}
+
+static void __section(".inittext") bios_putchar(int ch)
+{
+ struct biosregs ireg;
initregs(&ireg);
ireg.bx = 0x0007;
@@ -36,7 +47,18 @@ void __attribute__((section(".inittext"))) putchar(int ch)
intcall(0x10, &ireg, NULL);
}
-void __attribute__((section(".inittext"))) puts(const char *str)
+void __section(".inittext") putchar(int ch)
+{
+ if (ch == '\n')
+ putchar('\r'); /* \n -> \r\n */
+
+ bios_putchar(ch);
+
+ if (early_serial_base != 0)
+ serial_putchar(ch);
+}
+
+void __section(".inittext") puts(const char *str)
{
while (*str)
putchar(*str++);
@@ -112,3 +134,4 @@ int getchar_timeout(void)
return 0; /* Timeout! */
}
+
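
The rewritten putchar() now mirrors every character to a serial port whenever early_serial_base is nonzero, and serial_putchar() bounds its busy-wait on the UART line status register so a dead or absent UART cannot hang early boot. A freestanding sketch of the same 16550 transmit pattern, assuming inb()/outb() port accessors like the ones boot.h provides (the in-tree loop additionally calls cpu_relax() while spinning):

#define TXR	0	/* Transmit register (WRITE) */
#define LSR	5	/* Line Status */
#define XMTRDY	0x20	/* Transmit holding register empty */

static void serial_putchar(int port, int ch)
{
	unsigned int timeout = 0xffff;

	/* Spin until the transmitter is ready, or give up after ~64K polls */
	while ((inb(port + LSR) & XMTRDY) == 0 && --timeout)
		;

	outb(ch, port + TXR);	/* Written even on timeout; output is best effort */
}
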
diff --git a/arch/x86/boot/version.c b/arch/x86/boot/version.c
index 2723d9b5ce43..945383f0f606 100644
--- a/arch/x86/boot/version.c
+++ b/arch/x86/boot/version.c
@@ -1,11 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* -*- linux-c -*- ------------------------------------------------------- *
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007 rPath, Inc. - All Rights Reserved
*
- * This file is part of the Linux kernel, and is made available under
- * the terms of the GNU General Public License version 2.
- *
* ----------------------------------------------------------------------- */
/*
@@ -13,8 +11,9 @@
*/
#include "boot.h"
-#include <linux/utsrelease.h>
-#include <linux/compile.h>
+#include <generated/utsversion.h>
+#include <generated/utsrelease.h>
+#include <generated/compile.h>
const char kernel_version[] =
UTS_RELEASE " (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ") "
diff --git a/arch/x86/boot/vesa.h b/arch/x86/boot/vesa.h
index 468e444622c5..9e23fdffbb88 100644
--- a/arch/x86/boot/vesa.h
+++ b/arch/x86/boot/vesa.h
@@ -1,13 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/* ----------------------------------------------------------------------- *
*
* Copyright 1999-2007 H. Peter Anvin - All Rights Reserved
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- * Boston MA 02111-1307, USA; either version 2 of the License, or
- * (at your option) any later version; incorporated herein by reference.
- *
* ----------------------------------------------------------------------- */
#ifndef BOOT_VESA_H
diff --git a/arch/x86/boot/video-bios.c b/arch/x86/boot/video-bios.c
index 49e0c18833e0..6eb8c06bc287 100644
--- a/arch/x86/boot/video-bios.c
+++ b/arch/x86/boot/video-bios.c
@@ -1,12 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* -*- linux-c -*- ------------------------------------------------------- *
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007 rPath, Inc. - All Rights Reserved
* Copyright 2009 Intel Corporation; author H. Peter Anvin
*
- * This file is part of the Linux kernel, and is made available under
- * the terms of the GNU General Public License version 2.
- *
* ----------------------------------------------------------------------- */
/*
diff --git a/arch/x86/boot/video-mode.c b/arch/x86/boot/video-mode.c
index 748e8d06290a..9ada55dc1ab7 100644
--- a/arch/x86/boot/video-mode.c
+++ b/arch/x86/boot/video-mode.c
@@ -1,11 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* -*- linux-c -*- ------------------------------------------------------- *
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007-2008 rPath, Inc. - All Rights Reserved
*
- * This file is part of the Linux kernel, and is made available under
- * the terms of the GNU General Public License version 2.
- *
* ----------------------------------------------------------------------- */
/*
@@ -19,13 +17,13 @@
#include "video.h"
#include "vesa.h"
+#include <uapi/asm/boot.h>
+
/*
* Common variables
*/
-int adapter; /* 0=CGA/MDA/HGC, 1=EGA, 2=VGA+ */
-u16 video_segment;
+int adapter; /* 0=CGA/MDA/HGC, 1=EGA, 2=VGA+ */
int force_x, force_y; /* Don't query the BIOS for cols/rows */
-
int do_restore; /* Screen contents changed during mode flip */
int graphic_mode; /* Graphic mode with linear frame buffer */
diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c
index 275dd177f198..c2c6d35e3a43 100644
--- a/arch/x86/boot/video-vesa.c
+++ b/arch/x86/boot/video-vesa.c
@@ -1,12 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* -*- linux-c -*- ------------------------------------------------------- *
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007 rPath, Inc. - All Rights Reserved
* Copyright 2009 Intel Corporation; author H. Peter Anvin
*
- * This file is part of the Linux kernel, and is made available under
- * the terms of the GNU General Public License version 2.
- *
* ----------------------------------------------------------------------- */
/*
@@ -16,6 +14,7 @@
#include "boot.h"
#include "video.h"
#include "vesa.h"
+#include "string.h"
/* VESA information */
static struct vesa_general_info vginfo;
@@ -31,7 +30,6 @@ static inline void vesa_store_mode_params_graphics(void) {}
static int vesa_probe(void)
{
-#if defined(CONFIG_VIDEO_VESA) || defined(CONFIG_FIRMWARE_EDID)
struct biosregs ireg, oreg;
u16 mode;
addr_t mode_ptr;
@@ -49,8 +47,7 @@ static int vesa_probe(void)
vginfo.signature != VESA_MAGIC ||
vginfo.version < 0x0102)
return 0; /* Not present */
-#endif /* CONFIG_VIDEO_VESA || CONFIG_FIRMWARE_EDID */
-#ifdef CONFIG_VIDEO_VESA
+
set_fs(vginfo.video_mode_ptr.seg);
mode_ptr = vginfo.video_mode_ptr.off;
@@ -63,7 +60,7 @@ static int vesa_probe(void)
if (mode & ~0x1ff)
continue;
- memset(&vminfo, 0, sizeof vminfo); /* Just in case... */
+ memset(&vminfo, 0, sizeof(vminfo)); /* Just in case... */
ireg.ax = 0x4f01;
ireg.cx = mode;
@@ -86,7 +83,7 @@ static int vesa_probe(void)
(vminfo.memory_layout == 4 ||
vminfo.memory_layout == 6) &&
vminfo.memory_planes == 1) {
-#ifdef CONFIG_FB_BOOT_VESA_SUPPORT
+#ifdef CONFIG_BOOT_VESA_SUPPORT
/* Graphics mode, color, linear frame buffer
	   supported. Only register the mode if
	   the framebuffer is configured, however,
@@ -102,9 +99,6 @@ static int vesa_probe(void)
}
return nmodes;
-#else
- return 0;
-#endif /* CONFIG_VIDEO_VESA */
}
static int vesa_set_mode(struct mode_info *mode)
@@ -113,7 +107,7 @@ static int vesa_set_mode(struct mode_info *mode)
int is_graphic;
u16 vesa_mode = mode->mode - VIDEO_FIRST_VESA;
- memset(&vminfo, 0, sizeof vminfo); /* Just in case... */
+ memset(&vminfo, 0, sizeof(vminfo)); /* Just in case... */
initregs(&ireg);
ireg.ax = 0x4f01;
@@ -127,7 +121,7 @@ static int vesa_set_mode(struct mode_info *mode)
if ((vminfo.mode_attr & 0x15) == 0x05) {
/* It's a supported text mode */
is_graphic = 0;
-#ifdef CONFIG_FB_BOOT_VESA_SUPPORT
+#ifdef CONFIG_BOOT_VESA_SUPPORT
} else if ((vminfo.mode_attr & 0x99) == 0x99) {
/* It's a graphics mode with linear frame buffer */
is_graphic = 1;
@@ -245,7 +239,7 @@ void vesa_store_edid(void)
struct biosregs ireg, oreg;
/* Apparently used as a nonsense token... */
- memset(&boot_params.edid_info, 0x13, sizeof boot_params.edid_info);
+ memset(&boot_params.edid_info, 0x13, sizeof(boot_params.edid_info));
if (vginfo.version < 0x0200)
return; /* EDID requires VBE 2.0+ */
diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c
index 8f8d827e254d..4816cb9cf996 100644
--- a/arch/x86/boot/video-vga.c
+++ b/arch/x86/boot/video-vga.c
@@ -1,12 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* -*- linux-c -*- ------------------------------------------------------- *
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007 rPath, Inc. - All Rights Reserved
* Copyright 2009 Intel Corporation; author H. Peter Anvin
*
- * This file is part of the Linux kernel, and is made available under
- * the terms of the GNU General Public License version 2.
- *
* ----------------------------------------------------------------------- */
/*
@@ -41,33 +39,15 @@ static __videocard video_vga;
static u8 vga_set_basic_mode(void)
{
struct biosregs ireg, oreg;
- u16 ax;
- u8 rows;
u8 mode;
initregs(&ireg);
-#ifdef CONFIG_VIDEO_400_HACK
- if (adapter >= ADAPTER_VGA) {
- ireg.ax = 0x1202;
- ireg.bx = 0x0030;
- intcall(0x10, &ireg, NULL);
- }
-#endif
-
- ax = 0x0f00;
+ /* Query current mode */
+ ireg.ax = 0x0f00;
intcall(0x10, &ireg, &oreg);
mode = oreg.al;
- set_fs(0);
- rows = rdfs8(0x484); /* rows minus one */
-
-#ifndef CONFIG_VIDEO_400_HACK
- if ((oreg.ax == 0x5003 || oreg.ax == 0x5007) &&
- (rows == 0 || rows == 24))
- return mode;
-#endif
-
if (mode != 3 && mode != 7)
mode = 3;
@@ -259,9 +239,9 @@ static int vga_probe(void)
vga_modes,
};
static int mode_count[] = {
- sizeof(cga_modes)/sizeof(struct mode_info),
- sizeof(ega_modes)/sizeof(struct mode_info),
- sizeof(vga_modes)/sizeof(struct mode_info),
+ ARRAY_SIZE(cga_modes),
+ ARRAY_SIZE(ega_modes),
+ ARRAY_SIZE(vga_modes),
};
struct biosregs ireg, oreg;
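
The mode-count table above switches from open-coded sizeof division to ARRAY_SIZE(). The kernel's full macro also adds a __must_be_array() check so that passing a pointer fails at compile time; the core identity it wraps is simply:

#define ARRAY_SIZE(arr)	(sizeof(arr) / sizeof((arr)[0]))

struct mode_info { unsigned int mode; };
static struct mode_info cga_modes[3];

static int mode_count = ARRAY_SIZE(cga_modes);	/* 3, whatever the element type */
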
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c
index bad728b76fc2..0641c8c46aee 100644
--- a/arch/x86/boot/video.c
+++ b/arch/x86/boot/video.c
@@ -1,22 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* -*- linux-c -*- ------------------------------------------------------- *
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007 rPath, Inc. - All Rights Reserved
* Copyright 2009 Intel Corporation; author H. Peter Anvin
*
- * This file is part of the Linux kernel, and is made available under
- * the terms of the GNU General Public License version 2.
- *
* ----------------------------------------------------------------------- */
/*
* Select video mode
*/
+#include <uapi/asm/boot.h>
+
#include "boot.h"
#include "video.h"
#include "vesa.h"
+static u16 video_segment;
+
static void store_cursor_position(void)
{
struct biosregs ireg, oreg;
@@ -27,6 +29,12 @@ static void store_cursor_position(void)
boot_params.screen_info.orig_x = oreg.dl;
boot_params.screen_info.orig_y = oreg.dh;
+
+ if (oreg.ch & 0x20)
+ boot_params.screen_info.flags |= VIDEO_FLAGS_NOCURSOR;
+
+ if ((oreg.ch & 0x1f) > (oreg.cl & 0x1f))
+ boot_params.screen_info.flags |= VIDEO_FLAGS_NOCURSOR;
}
static void store_video_mode(void)
@@ -105,7 +113,7 @@ static unsigned int get_entry(void)
} else if ((key >= '0' && key <= '9') ||
(key >= 'A' && key <= 'Z') ||
(key >= 'a' && key <= 'z')) {
- if (len < sizeof entry_buf) {
+ if (len < sizeof(entry_buf)) {
entry_buf[len++] = key;
putchar(key);
}
@@ -221,7 +229,6 @@ static unsigned int mode_menu(void)
}
}
-#ifdef CONFIG_VIDEO_RETAIN
/* Save screen content to the heap */
static struct saved_screen {
int x, y;
@@ -285,7 +292,7 @@ static void restore_screen(void)
"shrw %%cx ; "
"jnc 1f ; "
"stosw \n\t"
- "1: rep;stosl ; "
+ "1: rep stosl ; "
"popw %%es"
: "+D" (dst), "+c" (npad)
: "bdS" (video_segment),
@@ -293,16 +300,19 @@ static void restore_screen(void)
}
/* Restore cursor position */
+ if (saved.curx >= xs)
+ saved.curx = xs-1;
+ if (saved.cury >= ys)
+ saved.cury = ys-1;
+
initregs(&ireg);
ireg.ah = 0x02; /* Set cursor position */
ireg.dh = saved.cury;
ireg.dl = saved.curx;
intcall(0x10, &ireg, NULL);
+
+ store_cursor_position();
}
-#else
-#define save_screen() ((void)0)
-#define restore_screen() ((void)0)
-#endif
void set_video(void)
{
diff --git a/arch/x86/boot/video.h b/arch/x86/boot/video.h
index 5bb174a997fc..04bde0bb2003 100644
--- a/arch/x86/boot/video.h
+++ b/arch/x86/boot/video.h
@@ -1,11 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/* -*- linux-c -*- ------------------------------------------------------- *
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007 rPath, Inc. - All Rights Reserved
*
- * This file is part of the Linux kernel, and is made available under
- * the terms of the GNU General Public License version 2.
- *
* ----------------------------------------------------------------------- */
/*
@@ -17,19 +15,8 @@
#include <linux/types.h>
-/* Enable autodetection of SVGA adapters and modes. */
-#undef CONFIG_VIDEO_SVGA
-
-/* Enable autodetection of VESA modes */
-#define CONFIG_VIDEO_VESA
-
-/* Retain screen contents when switching modes */
-#define CONFIG_VIDEO_RETAIN
-
-/* Force 400 scan lines for standard modes (hack to fix bad BIOS behaviour) */
-#undef CONFIG_VIDEO_400_HACK
-
-/* This code uses an extended set of video mode numbers. These include:
+/*
+ * This code uses an extended set of video mode numbers. These include:
* Aliases for standard modes
* NORMAL_VGA (-1)
* EXTENDED_VGA (-2)
@@ -67,13 +54,8 @@
/* The "recalculate timings" flag */
#define VIDEO_RECALC 0x8000
-/* Define DO_STORE according to CONFIG_VIDEO_RETAIN */
-#ifdef CONFIG_VIDEO_RETAIN
void store_screen(void);
#define DO_STORE() store_screen()
-#else
-#define DO_STORE() ((void)0)
-#endif /* CONFIG_VIDEO_RETAIN */
/*
* Mode table structures
@@ -96,7 +78,7 @@ struct card_info {
u16 xmode_n; /* Size of unprobed mode range */
};
-#define __videocard struct card_info __attribute__((section(".videocards")))
+#define __videocard struct card_info __section(".videocards") __attribute__((used))
extern struct card_info video_cards[], video_cards_end[];
int mode_defined(u16 mode); /* video.c */
@@ -107,7 +89,6 @@ int mode_defined(u16 mode); /* video.c */
#define ADAPTER_VGA 2
extern int adapter;
-extern u16 video_segment;
extern int force_x, force_y; /* Don't query the BIOS for cols/rows */
extern int do_restore; /* Restore screen contents */
extern int graphic_mode; /* Graphics mode with linear frame buffer */
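
The updated __videocard macro adds __attribute__((used)) so that card_info objects referenced only through the linker-built table are not discarded by the compiler. Each instance lands in the .videocards section, and the setup linker script supplies video_cards[]/video_cards_end[] as its bounds. A trimmed sketch of the pattern (the demo card and the probe_cards() walker are hypothetical):

struct card_info {
	const char *card_name;
	int (*probe)(void);
};

#define __videocard struct card_info __attribute__((used, section(".videocards")))

static __videocard video_demo = {	/* hypothetical example card */
	.card_name = "demo",
};

extern struct card_info video_cards[], video_cards_end[];	/* linker script bounds */

static void probe_cards(void)
{
	struct card_info *card;

	for (card = video_cards; card < video_cards_end; card++)
		if (card->probe)
			card->probe();
}
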
diff --git a/arch/x86/coco/Makefile b/arch/x86/coco/Makefile
new file mode 100644
index 000000000000..eabdc7486538
--- /dev/null
+++ b/arch/x86/coco/Makefile
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0
+CFLAGS_REMOVE_core.o = -pg
+KASAN_SANITIZE_core.o := n
+CFLAGS_core.o += -fno-stack-protector
+
+obj-y += core.o
+
+obj-$(CONFIG_INTEL_TDX_GUEST) += tdx/
+obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev/
diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c
new file mode 100644
index 000000000000..989ca9f72ba3
--- /dev/null
+++ b/arch/x86/coco/core.c
@@ -0,0 +1,249 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Confidential Computing Platform Capability checks
+ *
+ * Copyright (C) 2021 Advanced Micro Devices, Inc.
+ * Copyright (C) 2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ */
+
+#include <linux/export.h>
+#include <linux/cc_platform.h>
+#include <linux/string.h>
+#include <linux/random.h>
+
+#include <asm/archrandom.h>
+#include <asm/coco.h>
+#include <asm/processor.h>
+
+enum cc_vendor cc_vendor __ro_after_init = CC_VENDOR_NONE;
+SYM_PIC_ALIAS(cc_vendor);
+u64 cc_mask __ro_after_init;
+SYM_PIC_ALIAS(cc_mask);
+
+static struct cc_attr_flags {
+ __u64 host_sev_snp : 1,
+ __resv : 63;
+} cc_flags;
+
+static bool noinstr intel_cc_platform_has(enum cc_attr attr)
+{
+ switch (attr) {
+ case CC_ATTR_GUEST_UNROLL_STRING_IO:
+ case CC_ATTR_GUEST_MEM_ENCRYPT:
+ case CC_ATTR_MEM_ENCRYPT:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/*
+ * Handle the SEV-SNP vTOM case where sme_me_mask is zero, and
+ * the other levels of SME/SEV functionality, including C-bit
+ * based SEV-SNP, are not enabled.
+ */
+static __maybe_unused __always_inline bool amd_cc_platform_vtom(enum cc_attr attr)
+{
+ switch (attr) {
+ case CC_ATTR_GUEST_MEM_ENCRYPT:
+ case CC_ATTR_MEM_ENCRYPT:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/*
+ * SME and SEV are very similar but they are not the same, so there are
+ * times that the kernel will need to distinguish between SME and SEV. The
+ * cc_platform_has() function is used for this. When a distinction isn't
+ * needed, the CC_ATTR_MEM_ENCRYPT attribute can be used.
+ *
+ * The trampoline code is a good example for this requirement. Before
+ * paging is activated, SME will access all memory as decrypted, but SEV
+ * will access all memory as encrypted. So, when APs are being brought
+ * up under SME the trampoline area cannot be encrypted, whereas under SEV
+ * the trampoline area must be encrypted.
+ */
+static bool noinstr amd_cc_platform_has(enum cc_attr attr)
+{
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+
+ if (sev_status & MSR_AMD64_SNP_VTOM)
+ return amd_cc_platform_vtom(attr);
+
+ switch (attr) {
+ case CC_ATTR_MEM_ENCRYPT:
+ return sme_me_mask;
+
+ case CC_ATTR_HOST_MEM_ENCRYPT:
+ return sme_me_mask && !(sev_status & MSR_AMD64_SEV_ENABLED);
+
+ case CC_ATTR_GUEST_MEM_ENCRYPT:
+ return sev_status & MSR_AMD64_SEV_ENABLED;
+
+ case CC_ATTR_GUEST_STATE_ENCRYPT:
+ return sev_status & MSR_AMD64_SEV_ES_ENABLED;
+
+ /*
+ * With SEV, the rep string I/O instructions need to be unrolled
+ * but SEV-ES supports them through the #VC handler.
+ */
+ case CC_ATTR_GUEST_UNROLL_STRING_IO:
+ return (sev_status & MSR_AMD64_SEV_ENABLED) &&
+ !(sev_status & MSR_AMD64_SEV_ES_ENABLED);
+
+ case CC_ATTR_GUEST_SEV_SNP:
+ return sev_status & MSR_AMD64_SEV_SNP_ENABLED;
+
+ case CC_ATTR_GUEST_SNP_SECURE_TSC:
+ return sev_status & MSR_AMD64_SNP_SECURE_TSC;
+
+ case CC_ATTR_HOST_SEV_SNP:
+ return cc_flags.host_sev_snp;
+
+ case CC_ATTR_SNP_SECURE_AVIC:
+ return sev_status & MSR_AMD64_SNP_SECURE_AVIC;
+
+ default:
+ return false;
+ }
+#else
+ return false;
+#endif
+}
+
+bool noinstr cc_platform_has(enum cc_attr attr)
+{
+ switch (cc_vendor) {
+ case CC_VENDOR_AMD:
+ return amd_cc_platform_has(attr);
+ case CC_VENDOR_INTEL:
+ return intel_cc_platform_has(attr);
+ default:
+ return false;
+ }
+}
+EXPORT_SYMBOL_GPL(cc_platform_has);
+
+u64 cc_mkenc(u64 val)
+{
+ /*
+ * Both AMD and Intel use a bit in the page table to indicate
+ * encryption status of the page.
+ *
+ * - for AMD, bit *set* means the page is encrypted
+ * - for AMD with vTOM and for Intel, *clear* means encrypted
+ */
+ switch (cc_vendor) {
+ case CC_VENDOR_AMD:
+ if (sev_status & MSR_AMD64_SNP_VTOM)
+ return val & ~cc_mask;
+ else
+ return val | cc_mask;
+ case CC_VENDOR_INTEL:
+ return val & ~cc_mask;
+ default:
+ return val;
+ }
+}
+
+u64 cc_mkdec(u64 val)
+{
+ /* See comment in cc_mkenc() */
+ switch (cc_vendor) {
+ case CC_VENDOR_AMD:
+ if (sev_status & MSR_AMD64_SNP_VTOM)
+ return val | cc_mask;
+ else
+ return val & ~cc_mask;
+ case CC_VENDOR_INTEL:
+ return val | cc_mask;
+ default:
+ return val;
+ }
+}
+EXPORT_SYMBOL_GPL(cc_mkdec);
+
+static void amd_cc_platform_clear(enum cc_attr attr)
+{
+ switch (attr) {
+ case CC_ATTR_HOST_SEV_SNP:
+ cc_flags.host_sev_snp = 0;
+ break;
+ default:
+ break;
+ }
+}
+
+void cc_platform_clear(enum cc_attr attr)
+{
+ switch (cc_vendor) {
+ case CC_VENDOR_AMD:
+ amd_cc_platform_clear(attr);
+ break;
+ default:
+ break;
+ }
+}
+
+static void amd_cc_platform_set(enum cc_attr attr)
+{
+ switch (attr) {
+ case CC_ATTR_HOST_SEV_SNP:
+ cc_flags.host_sev_snp = 1;
+ break;
+ default:
+ break;
+ }
+}
+
+void cc_platform_set(enum cc_attr attr)
+{
+ switch (cc_vendor) {
+ case CC_VENDOR_AMD:
+ amd_cc_platform_set(attr);
+ break;
+ default:
+ break;
+ }
+}
+
+__init void cc_random_init(void)
+{
+ /*
+ * The seed is 32 bytes (in units of longs), which is 256 bits, which
+ * is the security level that the RNG is targeting.
+ */
+ unsigned long rng_seed[32 / sizeof(long)];
+ size_t i, longs;
+
+ if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
+ return;
+
+ /*
+ * Since the CoCo threat model includes the host, the only reliable
+ * source of entropy that can be neither observed nor manipulated is
+ * RDRAND. Usually, RDRAND failure is considered tolerable, but since
+ * CoCo guests have no other unobservable source of entropy, it's
+ * important to at least ensure the RNG gets some initial random seeds.
+ */
+ for (i = 0; i < ARRAY_SIZE(rng_seed); i += longs) {
+ longs = arch_get_random_longs(&rng_seed[i], ARRAY_SIZE(rng_seed) - i);
+
+ /*
+ * A zero return value means that the guest doesn't have RDRAND
+ * or the CPU is physically broken, and in both cases that
+ * means most crypto inside of the CoCo instance will be
+ * broken, defeating the purpose of CoCo in the first place. So
+ * just panic here because it's absolutely unsafe to continue
+ * executing.
+ */
+ if (longs == 0)
+ panic("RDRAND is defective.");
+ }
+ add_device_randomness(rng_seed, sizeof(rng_seed));
+ memzero_explicit(rng_seed, sizeof(rng_seed));
+}
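
cc_mkenc() and cc_mkdec() reduce to setting or clearing a single vendor-defined page-table bit, with the polarity inverted between the AMD C-bit scheme and the vTOM/Intel shared-bit scheme. A compact, host-testable sketch, assuming (purely for illustration) a mask at bit 51:

#include <stdint.h>
#include <assert.h>

#define CC_MASK (1ULL << 51)	/* bit position assumed for illustration only */

/* AMD C-bit: set means encrypted */
static uint64_t mkenc_amd(uint64_t pte)  { return pte | CC_MASK; }
/* AMD vTOM and Intel TDX: clear means encrypted */
static uint64_t mkenc_vtom(uint64_t pte) { return pte & ~CC_MASK; }

int main(void)
{
	uint64_t pte = 0x1000 | 0x3;	/* page frame plus Present/RW bits */

	assert(mkenc_amd(pte) == (pte | CC_MASK));
	assert(mkenc_vtom(pte | CC_MASK) == pte);
	return 0;
}
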
diff --git a/arch/x86/coco/sev/Makefile b/arch/x86/coco/sev/Makefile
new file mode 100644
index 000000000000..3b8ae214a6a6
--- /dev/null
+++ b/arch/x86/coco/sev/Makefile
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-y += core.o noinstr.o vc-handle.o
+
+# Clang 14 and older may fail to respect __no_sanitize_undefined when inlining
+UBSAN_SANITIZE_noinstr.o := n
+
+# GCC may fail to respect __no_sanitize_address or __no_kcsan when inlining
+KASAN_SANITIZE_noinstr.o := n
+KCSAN_SANITIZE_noinstr.o := n
diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c
new file mode 100644
index 000000000000..9ae3b11754e6
--- /dev/null
+++ b/arch/x86/coco/sev/core.c
@@ -0,0 +1,2431 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2019 SUSE
+ *
+ * Author: Joerg Roedel <jroedel@suse.de>
+ */
+
+#define pr_fmt(fmt) "SEV: " fmt
+
+#include <linux/sched/debug.h> /* For show_regs() */
+#include <linux/percpu-defs.h>
+#include <linux/cc_platform.h>
+#include <linux/printk.h>
+#include <linux/mm_types.h>
+#include <linux/set_memory.h>
+#include <linux/memblock.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/cpumask.h>
+#include <linux/efi.h>
+#include <linux/platform_device.h>
+#include <linux/io.h>
+#include <linux/psp-sev.h>
+#include <linux/dmi.h>
+#include <uapi/linux/sev-guest.h>
+#include <crypto/gcm.h>
+
+#include <asm/init.h>
+#include <asm/cpu_entry_area.h>
+#include <asm/stacktrace.h>
+#include <asm/sev.h>
+#include <asm/sev-internal.h>
+#include <asm/insn-eval.h>
+#include <asm/fpu/xcr.h>
+#include <asm/processor.h>
+#include <asm/realmode.h>
+#include <asm/setup.h>
+#include <asm/traps.h>
+#include <asm/svm.h>
+#include <asm/smp.h>
+#include <asm/cpu.h>
+#include <asm/apic.h>
+#include <asm/cpuid/api.h>
+#include <asm/cmdline.h>
+#include <asm/msr.h>
+
+/* Bitmap of SEV features supported by the hypervisor */
+u64 sev_hv_features __ro_after_init;
+SYM_PIC_ALIAS(sev_hv_features);
+
+/* Secrets page physical address from the CC blob */
+u64 sev_secrets_pa __ro_after_init;
+SYM_PIC_ALIAS(sev_secrets_pa);
+
+/* For early boot SVSM communication */
+struct svsm_ca boot_svsm_ca_page __aligned(PAGE_SIZE);
+SYM_PIC_ALIAS(boot_svsm_ca_page);
+
+/*
+ * SVSM related information:
+ * During boot, the page tables are set up as identity mapped and later
+ * changed to use kernel virtual addresses. Maintain separate virtual and
+ * physical addresses for the CAA to allow SVSM functions to be used during
+ * early boot, both with identity mapped virtual addresses and proper kernel
+ * virtual addresses.
+ */
+u64 boot_svsm_caa_pa __ro_after_init;
+SYM_PIC_ALIAS(boot_svsm_caa_pa);
+
+DEFINE_PER_CPU(struct svsm_ca *, svsm_caa);
+DEFINE_PER_CPU(u64, svsm_caa_pa);
+
+static inline struct svsm_ca *svsm_get_caa(void)
+{
+ if (sev_cfg.use_cas)
+ return this_cpu_read(svsm_caa);
+ else
+ return rip_rel_ptr(&boot_svsm_ca_page);
+}
+
+static inline u64 svsm_get_caa_pa(void)
+{
+ if (sev_cfg.use_cas)
+ return this_cpu_read(svsm_caa_pa);
+ else
+ return boot_svsm_caa_pa;
+}
+
+/* AP INIT values as documented in the APM2 section "Processor Initialization State" */
+#define AP_INIT_CS_LIMIT 0xffff
+#define AP_INIT_DS_LIMIT 0xffff
+#define AP_INIT_LDTR_LIMIT 0xffff
+#define AP_INIT_GDTR_LIMIT 0xffff
+#define AP_INIT_IDTR_LIMIT 0xffff
+#define AP_INIT_TR_LIMIT 0xffff
+#define AP_INIT_RFLAGS_DEFAULT 0x2
+#define AP_INIT_DR6_DEFAULT 0xffff0ff0
+#define AP_INIT_GPAT_DEFAULT 0x0007040600070406ULL
+#define AP_INIT_XCR0_DEFAULT 0x1
+#define AP_INIT_X87_FTW_DEFAULT 0x5555
+#define AP_INIT_X87_FCW_DEFAULT 0x0040
+#define AP_INIT_CR0_DEFAULT 0x60000010
+#define AP_INIT_MXCSR_DEFAULT 0x1f80
+
+static const char * const sev_status_feat_names[] = {
+ [MSR_AMD64_SEV_ENABLED_BIT] = "SEV",
+ [MSR_AMD64_SEV_ES_ENABLED_BIT] = "SEV-ES",
+ [MSR_AMD64_SEV_SNP_ENABLED_BIT] = "SEV-SNP",
+ [MSR_AMD64_SNP_VTOM_BIT] = "vTom",
+ [MSR_AMD64_SNP_REFLECT_VC_BIT] = "ReflectVC",
+ [MSR_AMD64_SNP_RESTRICTED_INJ_BIT] = "RI",
+ [MSR_AMD64_SNP_ALT_INJ_BIT] = "AI",
+ [MSR_AMD64_SNP_DEBUG_SWAP_BIT] = "DebugSwap",
+ [MSR_AMD64_SNP_PREVENT_HOST_IBS_BIT] = "NoHostIBS",
+ [MSR_AMD64_SNP_BTB_ISOLATION_BIT] = "BTBIsol",
+ [MSR_AMD64_SNP_VMPL_SSS_BIT] = "VmplSSS",
+ [MSR_AMD64_SNP_SECURE_TSC_BIT] = "SecureTSC",
+ [MSR_AMD64_SNP_VMGEXIT_PARAM_BIT] = "VMGExitParam",
+ [MSR_AMD64_SNP_IBS_VIRT_BIT] = "IBSVirt",
+ [MSR_AMD64_SNP_VMSA_REG_PROT_BIT] = "VMSARegProt",
+ [MSR_AMD64_SNP_SMT_PROT_BIT] = "SMTProt",
+ [MSR_AMD64_SNP_SECURE_AVIC_BIT] = "SecureAVIC",
+};
+
+/*
+ * For Secure TSC guests, the BSP fetches TSC_INFO using SNP guest messaging and
+ * initializes snp_tsc_scale and snp_tsc_offset. These values are replicated
+ * across the APs VMSA fields (TSC_SCALE and TSC_OFFSET).
+ */
+static u64 snp_tsc_scale __ro_after_init;
+static u64 snp_tsc_offset __ro_after_init;
+static unsigned long snp_tsc_freq_khz __ro_after_init;
+
+DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data);
+DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa);
+
+/*
+ * SVSM related information:
+ * When running under an SVSM, the VMPL that Linux is executing at must be
+ * non-zero. The VMPL is therefore used to indicate the presence of an SVSM.
+ */
+u8 snp_vmpl __ro_after_init;
+EXPORT_SYMBOL_GPL(snp_vmpl);
+SYM_PIC_ALIAS(snp_vmpl);
+
+/*
+ * Since feature negotiation related variables are set early in the boot
+ * process they must reside in the .data section so as not to be zeroed
+ * out when the .bss section is later cleared.
+ *
+ * GHCB protocol version negotiated with the hypervisor.
+ */
+u16 ghcb_version __ro_after_init;
+SYM_PIC_ALIAS(ghcb_version);
+
+/* For early boot hypervisor communication in SEV-ES enabled guests */
+static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE);
+
+/*
+ * Needs to be in the .data section because we need it NULL before bss is
+ * cleared
+ */
+struct ghcb *boot_ghcb __section(".data");
+
+static u64 __init get_snp_jump_table_addr(void)
+{
+ struct snp_secrets_page *secrets;
+ void __iomem *mem;
+ u64 addr;
+
+ mem = ioremap_encrypted(sev_secrets_pa, PAGE_SIZE);
+ if (!mem) {
+ pr_err("Unable to locate AP jump table address: failed to map the SNP secrets page.\n");
+ return 0;
+ }
+
+ secrets = (__force struct snp_secrets_page *)mem;
+
+ addr = secrets->os_area.ap_jump_table_pa;
+ iounmap(mem);
+
+ return addr;
+}
+
+static u64 __init get_jump_table_addr(void)
+{
+ struct ghcb_state state;
+ unsigned long flags;
+ struct ghcb *ghcb;
+ u64 ret = 0;
+
+ if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ return get_snp_jump_table_addr();
+
+ local_irq_save(flags);
+
+ ghcb = __sev_get_ghcb(&state);
+
+ vc_ghcb_invalidate(ghcb);
+ ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_JUMP_TABLE);
+ ghcb_set_sw_exit_info_1(ghcb, SVM_VMGEXIT_GET_AP_JUMP_TABLE);
+ ghcb_set_sw_exit_info_2(ghcb, 0);
+
+ sev_es_wr_ghcb_msr(__pa(ghcb));
+ VMGEXIT();
+
+ if (ghcb_sw_exit_info_1_is_valid(ghcb) &&
+ ghcb_sw_exit_info_2_is_valid(ghcb))
+ ret = ghcb->save.sw_exit_info_2;
+
+ __sev_put_ghcb(&state);
+
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+static int svsm_perform_ghcb_protocol(struct ghcb *ghcb, struct svsm_call *call)
+{
+ struct es_em_ctxt ctxt;
+ u8 pending = 0;
+
+ vc_ghcb_invalidate(ghcb);
+
+ /*
+ * Fill in protocol and format specifiers. This can be called very early
+ * in the boot, so use rip-relative references as needed.
+ */
+ ghcb->protocol_version = ghcb_version;
+ ghcb->ghcb_usage = GHCB_DEFAULT_USAGE;
+
+ ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_SNP_RUN_VMPL);
+ ghcb_set_sw_exit_info_1(ghcb, 0);
+ ghcb_set_sw_exit_info_2(ghcb, 0);
+
+ sev_es_wr_ghcb_msr(__pa(ghcb));
+
+ svsm_issue_call(call, &pending);
+
+ if (pending)
+ return -EINVAL;
+
+ switch (verify_exception_info(ghcb, &ctxt)) {
+ case ES_OK:
+ break;
+ case ES_EXCEPTION:
+ vc_forward_exception(&ctxt);
+ fallthrough;
+ default:
+ return -EINVAL;
+ }
+
+ return svsm_process_result_codes(call);
+}
+
+static int svsm_perform_call_protocol(struct svsm_call *call)
+{
+ struct ghcb_state state;
+ unsigned long flags;
+ struct ghcb *ghcb;
+ int ret;
+
+ flags = native_local_irq_save();
+
+ if (sev_cfg.ghcbs_initialized)
+ ghcb = __sev_get_ghcb(&state);
+ else if (boot_ghcb)
+ ghcb = boot_ghcb;
+ else
+ ghcb = NULL;
+
+ do {
+ ret = ghcb ? svsm_perform_ghcb_protocol(ghcb, call)
+ : __pi_svsm_perform_msr_protocol(call);
+ } while (ret == -EAGAIN);
+
+ if (sev_cfg.ghcbs_initialized)
+ __sev_put_ghcb(&state);
+
+ native_local_irq_restore(flags);
+
+ return ret;
+}
+
+static inline void __pval_terminate(u64 pfn, bool action, unsigned int page_size,
+ int ret, u64 svsm_ret)
+{
+ WARN(1, "PVALIDATE failure: pfn: 0x%llx, action: %u, size: %u, ret: %d, svsm_ret: 0x%llx\n",
+ pfn, action, page_size, ret, svsm_ret);
+
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE);
+}
+
+static void svsm_pval_terminate(struct svsm_pvalidate_call *pc, int ret, u64 svsm_ret)
+{
+ unsigned int page_size;
+ bool action;
+ u64 pfn;
+
+ pfn = pc->entry[pc->cur_index].pfn;
+ action = pc->entry[pc->cur_index].action;
+ page_size = pc->entry[pc->cur_index].page_size;
+
+ __pval_terminate(pfn, action, page_size, ret, svsm_ret);
+}
+
+static void pval_pages(struct snp_psc_desc *desc)
+{
+ struct psc_entry *e;
+ unsigned long vaddr;
+ unsigned int size;
+ unsigned int i;
+ bool validate;
+ u64 pfn;
+ int rc;
+
+ for (i = 0; i <= desc->hdr.end_entry; i++) {
+ e = &desc->entries[i];
+
+ pfn = e->gfn;
+ vaddr = (unsigned long)pfn_to_kaddr(pfn);
+ size = e->pagesize ? RMP_PG_SIZE_2M : RMP_PG_SIZE_4K;
+ validate = e->operation == SNP_PAGE_STATE_PRIVATE;
+
+ rc = pvalidate(vaddr, size, validate);
+ if (!rc)
+ continue;
+
+ if (rc == PVALIDATE_FAIL_SIZEMISMATCH && size == RMP_PG_SIZE_2M) {
+ unsigned long vaddr_end = vaddr + PMD_SIZE;
+
+ for (; vaddr < vaddr_end; vaddr += PAGE_SIZE, pfn++) {
+ rc = pvalidate(vaddr, RMP_PG_SIZE_4K, validate);
+ if (rc)
+ __pval_terminate(pfn, validate, RMP_PG_SIZE_4K, rc, 0);
+ }
+ } else {
+ __pval_terminate(pfn, validate, size, rc, 0);
+ }
+ }
+}
+
+static u64 svsm_build_ca_from_pfn_range(u64 pfn, u64 pfn_end, bool action,
+ struct svsm_pvalidate_call *pc)
+{
+ struct svsm_pvalidate_entry *pe;
+
+ /* Nothing in the CA yet */
+ pc->num_entries = 0;
+ pc->cur_index = 0;
+
+ pe = &pc->entry[0];
+
+ while (pfn < pfn_end) {
+ pe->page_size = RMP_PG_SIZE_4K;
+ pe->action = action;
+ pe->ignore_cf = 0;
+ pe->rsvd = 0;
+ pe->pfn = pfn;
+
+ pe++;
+ pfn++;
+
+ pc->num_entries++;
+ if (pc->num_entries == SVSM_PVALIDATE_MAX_COUNT)
+ break;
+ }
+
+ return pfn;
+}
+
+static int svsm_build_ca_from_psc_desc(struct snp_psc_desc *desc, unsigned int desc_entry,
+ struct svsm_pvalidate_call *pc)
+{
+ struct svsm_pvalidate_entry *pe;
+ struct psc_entry *e;
+
+ /* Nothing in the CA yet */
+ pc->num_entries = 0;
+ pc->cur_index = 0;
+
+ pe = &pc->entry[0];
+ e = &desc->entries[desc_entry];
+
+ while (desc_entry <= desc->hdr.end_entry) {
+ pe->page_size = e->pagesize ? RMP_PG_SIZE_2M : RMP_PG_SIZE_4K;
+ pe->action = e->operation == SNP_PAGE_STATE_PRIVATE;
+ pe->ignore_cf = 0;
+ pe->rsvd = 0;
+ pe->pfn = e->gfn;
+
+ pe++;
+ e++;
+
+ desc_entry++;
+ pc->num_entries++;
+ if (pc->num_entries == SVSM_PVALIDATE_MAX_COUNT)
+ break;
+ }
+
+ return desc_entry;
+}
+
+static void svsm_pval_pages(struct snp_psc_desc *desc)
+{
+ struct svsm_pvalidate_entry pv_4k[VMGEXIT_PSC_MAX_ENTRY];
+ unsigned int i, pv_4k_count = 0;
+ struct svsm_pvalidate_call *pc;
+ struct svsm_call call = {};
+ unsigned long flags;
+ bool action;
+ u64 pc_pa;
+ int ret;
+
+ /*
+ * This can be called very early in the boot, use native functions in
+ * order to avoid paravirt issues.
+ */
+ flags = native_local_irq_save();
+
+ /*
+ * The SVSM calling area (CA) can support processing 510 entries at a
+ * time. Loop through the Page State Change descriptor until the CA is
+ * full or the last entry in the descriptor is reached, at which time
+ * the SVSM is invoked. This repeats until all entries in the descriptor
+ * are processed.
+ */
+ call.caa = svsm_get_caa();
+
+ pc = (struct svsm_pvalidate_call *)call.caa->svsm_buffer;
+ pc_pa = svsm_get_caa_pa() + offsetof(struct svsm_ca, svsm_buffer);
+
+ /* Protocol 0, Call ID 1 */
+ call.rax = SVSM_CORE_CALL(SVSM_CORE_PVALIDATE);
+ call.rcx = pc_pa;
+
+ for (i = 0; i <= desc->hdr.end_entry;) {
+ i = svsm_build_ca_from_psc_desc(desc, i, pc);
+
+ do {
+ ret = svsm_perform_call_protocol(&call);
+ if (!ret)
+ continue;
+
+ /*
+ * Check if the entry failed because of an RMP mismatch (a
+ * PVALIDATE at 2M was requested, but the page is mapped in
+ * the RMP as 4K).
+ */
+
+ if (call.rax_out == SVSM_PVALIDATE_FAIL_SIZEMISMATCH &&
+ pc->entry[pc->cur_index].page_size == RMP_PG_SIZE_2M) {
+ /* Save this entry for post-processing at 4K */
+ pv_4k[pv_4k_count++] = pc->entry[pc->cur_index];
+
+ /* Skip to the next one unless at the end of the list */
+ pc->cur_index++;
+ if (pc->cur_index < pc->num_entries)
+ ret = -EAGAIN;
+ else
+ ret = 0;
+ }
+ } while (ret == -EAGAIN);
+
+ if (ret)
+ svsm_pval_terminate(pc, ret, call.rax_out);
+ }
+
+ /* Process any entries that failed to be validated at 2M and validate them at 4K */
+ for (i = 0; i < pv_4k_count; i++) {
+ u64 pfn, pfn_end;
+
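+		/* A failed 2M entry spans 512 4K pfns (2M / 4K) to revalidate */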
+ action = pv_4k[i].action;
+ pfn = pv_4k[i].pfn;
+ pfn_end = pfn + 512;
+
+ while (pfn < pfn_end) {
+ pfn = svsm_build_ca_from_pfn_range(pfn, pfn_end, action, pc);
+
+ ret = svsm_perform_call_protocol(&call);
+ if (ret)
+ svsm_pval_terminate(pc, ret, call.rax_out);
+ }
+ }
+
+ native_local_irq_restore(flags);
+}
+
+static void pvalidate_pages(struct snp_psc_desc *desc)
+{
+ struct psc_entry *e;
+ unsigned int i;
+
+ if (snp_vmpl)
+ svsm_pval_pages(desc);
+ else
+ pval_pages(desc);
+
+ /*
+ * If not affected by the cache-coherency vulnerability there is no need
+ * to perform the cache eviction mitigation.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_COHERENCY_SFW_NO))
+ return;
+
+ for (i = 0; i <= desc->hdr.end_entry; i++) {
+ e = &desc->entries[i];
+
+ /*
+ * If validating memory (making it private) perform the cache
+ * eviction mitigation.
+ */
+ if (e->operation == SNP_PAGE_STATE_PRIVATE)
+ sev_evict_cache(pfn_to_kaddr(e->gfn), e->pagesize ? 512 : 1);
+ }
+}
+
+static int vmgexit_psc(struct ghcb *ghcb, struct snp_psc_desc *desc)
+{
+ int cur_entry, end_entry, ret = 0;
+ struct snp_psc_desc *data;
+ struct es_em_ctxt ctxt;
+
+ vc_ghcb_invalidate(ghcb);
+
+ /* Copy the input desc into GHCB shared buffer */
+ data = (struct snp_psc_desc *)ghcb->shared_buffer;
+ memcpy(ghcb->shared_buffer, desc, min_t(int, GHCB_SHARED_BUF_SIZE, sizeof(*desc)));
+
+ /*
+ * As per the GHCB specification, the hypervisor can resume the guest
+ * before processing all the entries. Check whether all the entries
+ * are processed. If not, then keep retrying. Note, the hypervisor
+ * will update the data memory directly to indicate the status, so
+ * reference the data->hdr everywhere.
+ *
+ * The strategy here is to wait for the hypervisor to change the page
+	 * state in the RMP table before the guest accesses the memory pages. If the
+ * page state change was not successful, then later memory access will
+ * result in a crash.
+ */
+ cur_entry = data->hdr.cur_entry;
+ end_entry = data->hdr.end_entry;
+
+ while (data->hdr.cur_entry <= data->hdr.end_entry) {
+ ghcb_set_sw_scratch(ghcb, (u64)__pa(data));
+
+		/* This call advances data->hdr.cur_entry in the shared buffer. */
+ ret = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_PSC, 0, 0);
+
+ /*
+ * Page State Change VMGEXIT can pass error code through
+ * exit_info_2.
+ */
+ if (WARN(ret || ghcb->save.sw_exit_info_2,
+ "SNP: PSC failed ret=%d exit_info_2=%llx\n",
+ ret, ghcb->save.sw_exit_info_2)) {
+ ret = 1;
+ goto out;
+ }
+
+ /* Verify that reserved bit is not set */
+ if (WARN(data->hdr.reserved, "Reserved bit is set in the PSC header\n")) {
+ ret = 1;
+ goto out;
+ }
+
+ /*
+ * Sanity check that entry processing is not going backwards.
+		 * This will happen only if the hypervisor is tricking us.
+ */
+ if (WARN(data->hdr.end_entry > end_entry || cur_entry > data->hdr.cur_entry,
+"SNP: PSC processing going backward, end_entry %d (got %d) cur_entry %d (got %d)\n",
+ end_entry, data->hdr.end_entry, cur_entry, data->hdr.cur_entry)) {
+ ret = 1;
+ goto out;
+ }
+ }
+
+out:
+ return ret;
+}
+
+static unsigned long __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr,
+ unsigned long vaddr_end, int op)
+{
+ struct ghcb_state state;
+ bool use_large_entry;
+ struct psc_hdr *hdr;
+ struct psc_entry *e;
+ unsigned long flags;
+ unsigned long pfn;
+ struct ghcb *ghcb;
+ int i;
+
+ hdr = &data->hdr;
+ e = data->entries;
+
+ memset(data, 0, sizeof(*data));
+ i = 0;
+
+ while (vaddr < vaddr_end && i < ARRAY_SIZE(data->entries)) {
+ hdr->end_entry = i;
+
+ if (is_vmalloc_addr((void *)vaddr)) {
+ pfn = vmalloc_to_pfn((void *)vaddr);
+ use_large_entry = false;
+ } else {
+ pfn = __pa(vaddr) >> PAGE_SHIFT;
+ use_large_entry = true;
+ }
+
+ e->gfn = pfn;
+ e->operation = op;
+
+ if (use_large_entry && IS_ALIGNED(vaddr, PMD_SIZE) &&
+ (vaddr_end - vaddr) >= PMD_SIZE) {
+ e->pagesize = RMP_PG_SIZE_2M;
+ vaddr += PMD_SIZE;
+ } else {
+ e->pagesize = RMP_PG_SIZE_4K;
+ vaddr += PAGE_SIZE;
+ }
+
+ e++;
+ i++;
+ }
+
+ /* Page validation must be rescinded before changing to shared */
+ if (op == SNP_PAGE_STATE_SHARED)
+ pvalidate_pages(data);
+
+ local_irq_save(flags);
+
+ if (sev_cfg.ghcbs_initialized)
+ ghcb = __sev_get_ghcb(&state);
+ else
+ ghcb = boot_ghcb;
+
+ /* Invoke the hypervisor to perform the page state changes */
+ if (!ghcb || vmgexit_psc(ghcb, data))
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC);
+
+ if (sev_cfg.ghcbs_initialized)
+ __sev_put_ghcb(&state);
+
+ local_irq_restore(flags);
+
+ /* Page validation must be performed after changing to private */
+ if (op == SNP_PAGE_STATE_PRIVATE)
+ pvalidate_pages(data);
+
+ return vaddr;
+}
+
+static void set_pages_state(unsigned long vaddr, unsigned long npages, int op)
+{
+ struct snp_psc_desc desc;
+ unsigned long vaddr_end;
+
+ /* Use the MSR protocol when a GHCB is not available. */
+ if (!boot_ghcb) {
+ struct psc_desc d = { op, svsm_get_caa(), svsm_get_caa_pa() };
+
+ return early_set_pages_state(vaddr, __pa(vaddr), npages, &d);
+ }
+
+ vaddr = vaddr & PAGE_MASK;
+ vaddr_end = vaddr + (npages << PAGE_SHIFT);
+
+ while (vaddr < vaddr_end)
+ vaddr = __set_pages_state(&desc, vaddr, vaddr_end, op);
+}
+
+void snp_set_memory_shared(unsigned long vaddr, unsigned long npages)
+{
+ if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ return;
+
+ set_pages_state(vaddr, npages, SNP_PAGE_STATE_SHARED);
+}
+
+void snp_set_memory_private(unsigned long vaddr, unsigned long npages)
+{
+ if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ return;
+
+ set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE);
+}
+
+void snp_accept_memory(phys_addr_t start, phys_addr_t end)
+{
+ unsigned long vaddr, npages;
+
+ if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ return;
+
+ vaddr = (unsigned long)__va(start);
+ npages = (end - start) >> PAGE_SHIFT;
+
+ set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE);
+}
+
+static int vmgexit_ap_control(u64 event, struct sev_es_save_area *vmsa, u32 apic_id)
+{
+ bool create = event != SVM_VMGEXIT_AP_DESTROY;
+ struct ghcb_state state;
+ unsigned long flags;
+ struct ghcb *ghcb;
+ int ret = 0;
+
+ local_irq_save(flags);
+
+ ghcb = __sev_get_ghcb(&state);
+
+ vc_ghcb_invalidate(ghcb);
+
+ if (create)
+ ghcb_set_rax(ghcb, vmsa->sev_features);
+
+ ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_CREATION);
+ ghcb_set_sw_exit_info_1(ghcb,
+ ((u64)apic_id << 32) |
+ ((u64)snp_vmpl << 16) |
+ event);
+ ghcb_set_sw_exit_info_2(ghcb, __pa(vmsa));
+
+ sev_es_wr_ghcb_msr(__pa(ghcb));
+ VMGEXIT();
+
+ if (!ghcb_sw_exit_info_1_is_valid(ghcb) ||
+ lower_32_bits(ghcb->save.sw_exit_info_1)) {
+ pr_err("SNP AP %s error\n", (create ? "CREATE" : "DESTROY"));
+ ret = -EINVAL;
+ }
+
+ __sev_put_ghcb(&state);
+
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+static int snp_set_vmsa(void *va, void *caa, int apic_id, bool make_vmsa)
+{
+ int ret;
+
+ if (snp_vmpl) {
+ struct svsm_call call = {};
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ call.caa = this_cpu_read(svsm_caa);
+ call.rcx = __pa(va);
+
+ if (make_vmsa) {
+ /* Protocol 0, Call ID 2 */
+ call.rax = SVSM_CORE_CALL(SVSM_CORE_CREATE_VCPU);
+ call.rdx = __pa(caa);
+ call.r8 = apic_id;
+ } else {
+ /* Protocol 0, Call ID 3 */
+ call.rax = SVSM_CORE_CALL(SVSM_CORE_DELETE_VCPU);
+ }
+
+ ret = svsm_perform_call_protocol(&call);
+
+ local_irq_restore(flags);
+ } else {
+ /*
+ * If the kernel runs at VMPL0, it can change the VMSA
+ * bit for a page using the RMPADJUST instruction.
+ * However, for the instruction to succeed it must
+ * target the permissions of a lesser privileged (higher
+ * numbered) VMPL level, so use VMPL1.
+ */
+ u64 attrs = 1;
+
+ if (make_vmsa)
+ attrs |= RMPADJUST_VMSA_PAGE_BIT;
+
+ ret = rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs);
+ }
+
+ return ret;
+}
+
+static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa, int apic_id)
+{
+ int err;
+
+ err = snp_set_vmsa(vmsa, NULL, apic_id, false);
+ if (err)
+ pr_err("clear VMSA page failed (%u), leaking page\n", err);
+ else
+ free_page((unsigned long)vmsa);
+}
+
+static void set_pte_enc(pte_t *kpte, int level, void *va)
+{
+ struct pte_enc_desc d = {
+ .kpte = kpte,
+ .pte_level = level,
+ .va = va,
+ .encrypt = true
+ };
+
+ prepare_pte_enc(&d);
+ set_pte_enc_mask(kpte, d.pfn, d.new_pgprot);
+}
+
+static void unshare_all_memory(void)
+{
+ unsigned long addr, end, size, ghcb;
+ struct sev_es_runtime_data *data;
+ unsigned int npages, level;
+ bool skipped_addr;
+ pte_t *pte;
+ int cpu;
+
+ /* Unshare the direct mapping. */
+ addr = PAGE_OFFSET;
+ end = PAGE_OFFSET + get_max_mapped();
+
+ while (addr < end) {
+ pte = lookup_address(addr, &level);
+ size = page_level_size(level);
+ npages = size / PAGE_SIZE;
+ skipped_addr = false;
+
+ if (!pte || !pte_decrypted(*pte) || pte_none(*pte)) {
+ addr += size;
+ continue;
+ }
+
+ /*
+ * Ensure that all the per-CPU GHCBs are made private at the
+ * end of the unsharing loop so that the switch to the slower
+ * MSR protocol happens last.
+ */
+ for_each_possible_cpu(cpu) {
+ data = per_cpu(runtime_data, cpu);
+ ghcb = (unsigned long)&data->ghcb_page;
+
+ /* Handle the case of a huge page containing the GHCB page */
+ if (addr <= ghcb && ghcb < addr + size) {
+ skipped_addr = true;
+ break;
+ }
+ }
+
+ if (!skipped_addr) {
+ set_pte_enc(pte, level, (void *)addr);
+ snp_set_memory_private(addr, npages);
+ }
+ addr += size;
+ }
+
+ /* Unshare all bss decrypted memory. */
+ addr = (unsigned long)__start_bss_decrypted;
+ end = (unsigned long)__start_bss_decrypted_unused;
+ npages = (end - addr) >> PAGE_SHIFT;
+
+ for (; addr < end; addr += PAGE_SIZE) {
+ pte = lookup_address(addr, &level);
+ if (!pte || !pte_decrypted(*pte) || pte_none(*pte))
+ continue;
+
+ set_pte_enc(pte, level, (void *)addr);
+ }
+ addr = (unsigned long)__start_bss_decrypted;
+ snp_set_memory_private(addr, npages);
+
+ __flush_tlb_all();
+}
+
+/* Stop new private<->shared conversions */
+void snp_kexec_begin(void)
+{
+ if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ return;
+
+ if (!IS_ENABLED(CONFIG_KEXEC_CORE))
+ return;
+
+ /*
+ * Crash kernel ends up here with interrupts disabled: can't wait for
+ * conversions to finish.
+ *
+ * If race happened, just report and proceed.
+ */
+ if (!set_memory_enc_stop_conversion())
+ pr_warn("Failed to stop shared<->private conversions\n");
+}
+
+/*
+ * Shut down all APs except the one handling kexec/kdump, clearing the
+ * VMSA tag on each AP's VMSA page since those pages are no longer used
+ * as VMSA pages.
+ */
+static void shutdown_all_aps(void)
+{
+ struct sev_es_save_area *vmsa;
+ int apic_id, this_cpu, cpu;
+
+ this_cpu = get_cpu();
+
+ /*
+ * APs are already in HLT loop when enc_kexec_finish() callback
+ * is invoked.
+ */
+ for_each_present_cpu(cpu) {
+ vmsa = per_cpu(sev_vmsa, cpu);
+
+ /*
+		 * The BSP or offlined APs do not have a guest-allocated VMSA,
+		 * so there is no need to clear the VMSA tag for this page.
+ */
+ if (!vmsa)
+ continue;
+
+ /*
+ * Cannot clear the VMSA tag for the currently running vCPU.
+ */
+ if (this_cpu == cpu) {
+ unsigned long pa;
+ struct page *p;
+
+ pa = __pa(vmsa);
+ /*
+ * Mark the VMSA page of the running vCPU as offline
+ * so that is excluded and not touched by makedumpfile
+ * while generating vmcore during kdump.
+ */
+ p = pfn_to_online_page(pa >> PAGE_SHIFT);
+ if (p)
+ __SetPageOffline(p);
+ continue;
+ }
+
+ apic_id = cpuid_to_apicid[cpu];
+
+ /*
+ * Issue AP destroy to ensure AP gets kicked out of guest mode
+		 * to allow using RMPADJUST to remove the VMSA tag on its
+ * VMSA page.
+ */
+ vmgexit_ap_control(SVM_VMGEXIT_AP_DESTROY, vmsa, apic_id);
+ snp_cleanup_vmsa(vmsa, apic_id);
+ }
+
+ put_cpu();
+}
+
+void snp_kexec_finish(void)
+{
+ struct sev_es_runtime_data *data;
+ unsigned long size, addr;
+ unsigned int level, cpu;
+ struct ghcb *ghcb;
+ pte_t *pte;
+
+ if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ return;
+
+ if (!IS_ENABLED(CONFIG_KEXEC_CORE))
+ return;
+
+ shutdown_all_aps();
+
+ unshare_all_memory();
+
+ /*
+ * Switch to using the MSR protocol to change per-CPU GHCBs to
+ * private. All the per-CPU GHCBs have been switched back to private,
+	 * so no further GHCB calls to the hypervisor are possible beyond this point
+ * until the kexec'ed kernel starts running.
+ */
+ boot_ghcb = NULL;
+ sev_cfg.ghcbs_initialized = false;
+
+ for_each_possible_cpu(cpu) {
+ data = per_cpu(runtime_data, cpu);
+ ghcb = &data->ghcb_page;
+ pte = lookup_address((unsigned long)ghcb, &level);
+ size = page_level_size(level);
+ /* Handle the case of a huge page containing the GHCB page */
+ addr = (unsigned long)ghcb & page_level_mask(level);
+ set_pte_enc(pte, level, (void *)addr);
+ snp_set_memory_private(addr, (size / PAGE_SIZE));
+ }
+}
+
+#define __ATTR_BASE (SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK)
+#define INIT_CS_ATTRIBS (__ATTR_BASE | SVM_SELECTOR_READ_MASK | SVM_SELECTOR_CODE_MASK)
+#define INIT_DS_ATTRIBS (__ATTR_BASE | SVM_SELECTOR_WRITE_MASK)
+
+#define INIT_LDTR_ATTRIBS (SVM_SELECTOR_P_MASK | 2)
+#define INIT_TR_ATTRIBS (SVM_SELECTOR_P_MASK | 3)
+
+static void *snp_alloc_vmsa_page(int cpu)
+{
+ struct page *p;
+
+ /*
+	 * Allocate a VMSA page to work around the SNP erratum where the CPU will
+	 * incorrectly signal an RMP violation #PF if a large page (2MB or 1GB)
+	 * collides with the RMP entry of a VMSA page. The recommended workaround
+ * is to not use a large page.
+ *
+ * Allocate an 8k page which is also 8k-aligned.
+ */
+ p = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL_ACCOUNT | __GFP_ZERO, 1);
+ if (!p)
+ return NULL;
+
+ split_page(p, 1);
+
+ /* Free the first 4k. This page may be 2M/1G aligned and cannot be used. */
+ __free_page(p);
+
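+	/*
+	 * An order-1 allocation is 8k-aligned, so while the first 4k page may
+	 * sit on a 2M/1G boundary, the second one never can.
+	 */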
+ return page_address(p + 1);
+}
+
+static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip, unsigned int cpu)
+{
+ struct sev_es_save_area *cur_vmsa, *vmsa;
+ struct svsm_ca *caa;
+ u8 sipi_vector;
+ int ret;
+ u64 cr4;
+
+ /*
+ * The hypervisor SNP feature support check has happened earlier, just check
+ * the AP_CREATION one here.
+ */
+ if (!(sev_hv_features & GHCB_HV_FT_SNP_AP_CREATION))
+ return -EOPNOTSUPP;
+
+ /*
+ * Verify the desired start IP against the known trampoline start IP
+ * to catch any future new trampolines that may be introduced that
+ * would require a new protected guest entry point.
+ */
+ if (WARN_ONCE(start_ip != real_mode_header->trampoline_start,
+ "Unsupported SNP start_ip: %lx\n", start_ip))
+ return -EINVAL;
+
+ /* Override start_ip with known protected guest start IP */
+ start_ip = real_mode_header->sev_es_trampoline_start;
+ cur_vmsa = per_cpu(sev_vmsa, cpu);
+
+ /*
+ * A new VMSA is created each time because there is no guarantee that
+	 * the current VMSA is the kernel's or that the vCPU is not running. If
+	 * an attempt were made to use the current VMSA with a running vCPU, a
+ * #VMEXIT of that vCPU would wipe out all of the settings being done
+ * here.
+ */
+ vmsa = (struct sev_es_save_area *)snp_alloc_vmsa_page(cpu);
+ if (!vmsa)
+ return -ENOMEM;
+
+ /* If an SVSM is present, the SVSM per-CPU CAA will be !NULL */
+ caa = per_cpu(svsm_caa, cpu);
+
+ /* CR4 should maintain the MCE value */
+ cr4 = native_read_cr4() & X86_CR4_MCE;
+
+ /* Set the CS value based on the start_ip converted to a SIPI vector */
+ sipi_vector = (start_ip >> 12);
+ vmsa->cs.base = sipi_vector << 12;
+ vmsa->cs.limit = AP_INIT_CS_LIMIT;
+ vmsa->cs.attrib = INIT_CS_ATTRIBS;
+ vmsa->cs.selector = sipi_vector << 8;
+
+ /* Set the RIP value based on start_ip */
+ vmsa->rip = start_ip & 0xfff;
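+	/*
+	 * Worked example: start_ip = 0x9c000 gives SIPI vector 0x9c, hence
+	 * CS.base = 0x9c000, CS.selector = 0x9c00 (real-mode base is
+	 * selector << 4) and RIP = 0x000.
+	 */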
+
+ /* Set AP INIT defaults as documented in the APM */
+ vmsa->ds.limit = AP_INIT_DS_LIMIT;
+ vmsa->ds.attrib = INIT_DS_ATTRIBS;
+ vmsa->es = vmsa->ds;
+ vmsa->fs = vmsa->ds;
+ vmsa->gs = vmsa->ds;
+ vmsa->ss = vmsa->ds;
+
+ vmsa->gdtr.limit = AP_INIT_GDTR_LIMIT;
+ vmsa->ldtr.limit = AP_INIT_LDTR_LIMIT;
+ vmsa->ldtr.attrib = INIT_LDTR_ATTRIBS;
+ vmsa->idtr.limit = AP_INIT_IDTR_LIMIT;
+ vmsa->tr.limit = AP_INIT_TR_LIMIT;
+ vmsa->tr.attrib = INIT_TR_ATTRIBS;
+
+ vmsa->cr4 = cr4;
+ vmsa->cr0 = AP_INIT_CR0_DEFAULT;
+ vmsa->dr7 = DR7_RESET_VALUE;
+ vmsa->dr6 = AP_INIT_DR6_DEFAULT;
+ vmsa->rflags = AP_INIT_RFLAGS_DEFAULT;
+ vmsa->g_pat = AP_INIT_GPAT_DEFAULT;
+ vmsa->xcr0 = AP_INIT_XCR0_DEFAULT;
+ vmsa->mxcsr = AP_INIT_MXCSR_DEFAULT;
+ vmsa->x87_ftw = AP_INIT_X87_FTW_DEFAULT;
+ vmsa->x87_fcw = AP_INIT_X87_FCW_DEFAULT;
+
+ if (cc_platform_has(CC_ATTR_SNP_SECURE_AVIC))
+ vmsa->vintr_ctrl |= V_GIF_MASK | V_NMI_ENABLE_MASK;
+
+ /* SVME must be set. */
+ vmsa->efer = EFER_SVME;
+
+ /*
+ * Set the SNP-specific fields for this VMSA:
+ * VMPL level
+ * SEV_FEATURES (matches the SEV STATUS MSR right shifted 2 bits)
+ */
+ vmsa->vmpl = snp_vmpl;
+ vmsa->sev_features = sev_status >> 2;
+
+ /* Populate AP's TSC scale/offset to get accurate TSC values. */
+ if (cc_platform_has(CC_ATTR_GUEST_SNP_SECURE_TSC)) {
+ vmsa->tsc_scale = snp_tsc_scale;
+ vmsa->tsc_offset = snp_tsc_offset;
+ }
+
+ /* Switch the page over to a VMSA page now that it is initialized */
+ ret = snp_set_vmsa(vmsa, caa, apic_id, true);
+ if (ret) {
+ pr_err("set VMSA page failed (%u)\n", ret);
+ free_page((unsigned long)vmsa);
+
+ return -EINVAL;
+ }
+
+ /* Issue VMGEXIT AP Creation NAE event */
+ ret = vmgexit_ap_control(SVM_VMGEXIT_AP_CREATE, vmsa, apic_id);
+ if (ret) {
+ snp_cleanup_vmsa(vmsa, apic_id);
+ vmsa = NULL;
+ }
+
+ /* Free up any previous VMSA page */
+ if (cur_vmsa)
+ snp_cleanup_vmsa(cur_vmsa, apic_id);
+
+ /* Record the current VMSA page */
+ per_cpu(sev_vmsa, cpu) = vmsa;
+
+ return ret;
+}
+
+void __init snp_set_wakeup_secondary_cpu(void)
+{
+ if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ return;
+
+ /*
+ * Always set this override if SNP is enabled. This makes it the
+ * required method to start APs under SNP. If the hypervisor does
+ * not support AP creation, then no APs will be started.
+ */
+ apic_update_callback(wakeup_secondary_cpu, wakeup_cpu_via_vmgexit);
+}
+
+int __init sev_es_setup_ap_jump_table(struct real_mode_header *rmh)
+{
+ u16 startup_cs, startup_ip;
+ phys_addr_t jump_table_pa;
+ u64 jump_table_addr;
+ u16 __iomem *jump_table;
+
+ jump_table_addr = get_jump_table_addr();
+
+ /* On UP guests there is no jump table so this is not a failure */
+ if (!jump_table_addr)
+ return 0;
+
+ /* Check if AP Jump Table is page-aligned */
+ if (jump_table_addr & ~PAGE_MASK)
+ return -EINVAL;
+
+ jump_table_pa = jump_table_addr & PAGE_MASK;
+
+ startup_cs = (u16)(rmh->trampoline_start >> 4);
+ startup_ip = (u16)(rmh->sev_es_trampoline_start -
+ rmh->trampoline_start);
+
+ jump_table = ioremap_encrypted(jump_table_pa, PAGE_SIZE);
+ if (!jump_table)
+ return -EIO;
+
+ writew(startup_ip, &jump_table[0]);
+ writew(startup_cs, &jump_table[1]);
+
+ iounmap(jump_table);
+
+ return 0;
+}
+
+/*
+ * This is needed by the OVMF UEFI firmware which will use whatever it finds in
+ * the GHCB MSR as its GHCB to talk to the hypervisor. So make sure the per-cpu
+ * runtime GHCBs used by the kernel are also mapped in the EFI page-table.
+ *
+ * When running under SVSM the CA page is needed too, so map it as well.
+ */
+int __init sev_es_efi_map_ghcbs_cas(pgd_t *pgd)
+{
+ unsigned long address, pflags, pflags_enc;
+ struct sev_es_runtime_data *data;
+ int cpu;
+ u64 pfn;
+
+ if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
+ return 0;
+
+ pflags = _PAGE_NX | _PAGE_RW;
+ pflags_enc = cc_mkenc(pflags);
+
+ for_each_possible_cpu(cpu) {
+ data = per_cpu(runtime_data, cpu);
+
+ address = __pa(&data->ghcb_page);
+ pfn = address >> PAGE_SHIFT;
+
+ if (kernel_map_pages_in_pgd(pgd, pfn, address, 1, pflags))
+ return 1;
+
+ if (snp_vmpl) {
+ address = per_cpu(svsm_caa_pa, cpu);
+ if (!address)
+ return 1;
+
+ pfn = address >> PAGE_SHIFT;
+ if (kernel_map_pages_in_pgd(pgd, pfn, address, 1, pflags_enc))
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+u64 savic_ghcb_msr_read(u32 reg)
+{
+ u64 msr = APIC_BASE_MSR + (reg >> 4);
+ struct pt_regs regs = { .cx = msr };
+ struct es_em_ctxt ctxt = { .regs = &regs };
+ struct ghcb_state state;
+ enum es_result res;
+ struct ghcb *ghcb;
+
+ guard(irqsave)();
+
+ ghcb = __sev_get_ghcb(&state);
+ vc_ghcb_invalidate(ghcb);
+
+ res = sev_es_ghcb_handle_msr(ghcb, &ctxt, false);
+ if (res != ES_OK) {
+ pr_err("Secure AVIC MSR (0x%llx) read returned error (%d)\n", msr, res);
+ /* MSR read failures are treated as fatal errors */
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL);
+ }
+
+ __sev_put_ghcb(&state);
+
+ return regs.ax | regs.dx << 32;
+}
+
+void savic_ghcb_msr_write(u32 reg, u64 value)
+{
+ u64 msr = APIC_BASE_MSR + (reg >> 4);
+ struct pt_regs regs = {
+ .cx = msr,
+ .ax = lower_32_bits(value),
+ .dx = upper_32_bits(value)
+ };
+ struct es_em_ctxt ctxt = { .regs = &regs };
+ struct ghcb_state state;
+ enum es_result res;
+ struct ghcb *ghcb;
+
+ guard(irqsave)();
+
+ ghcb = __sev_get_ghcb(&state);
+ vc_ghcb_invalidate(ghcb);
+
+ res = sev_es_ghcb_handle_msr(ghcb, &ctxt, true);
+ if (res != ES_OK) {
+ pr_err("Secure AVIC MSR (0x%llx) write returned error (%d)\n", msr, res);
+		/* MSR writes should never fail. Any failure is a fatal error for an SNP guest */
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL);
+ }
+
+ __sev_put_ghcb(&state);
+}
+
+enum es_result savic_register_gpa(u64 gpa)
+{
+ struct ghcb_state state;
+ struct es_em_ctxt ctxt;
+ enum es_result res;
+ struct ghcb *ghcb;
+
+ guard(irqsave)();
+
+ ghcb = __sev_get_ghcb(&state);
+ vc_ghcb_invalidate(ghcb);
+
+ ghcb_set_rax(ghcb, SVM_VMGEXIT_SAVIC_SELF_GPA);
+ ghcb_set_rbx(ghcb, gpa);
+ res = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_SAVIC,
+ SVM_VMGEXIT_SAVIC_REGISTER_GPA, 0);
+
+ __sev_put_ghcb(&state);
+
+ return res;
+}
+
+enum es_result savic_unregister_gpa(u64 *gpa)
+{
+ struct ghcb_state state;
+ struct es_em_ctxt ctxt;
+ enum es_result res;
+ struct ghcb *ghcb;
+
+ guard(irqsave)();
+
+ ghcb = __sev_get_ghcb(&state);
+ vc_ghcb_invalidate(ghcb);
+
+ ghcb_set_rax(ghcb, SVM_VMGEXIT_SAVIC_SELF_GPA);
+ res = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_SAVIC,
+ SVM_VMGEXIT_SAVIC_UNREGISTER_GPA, 0);
+ if (gpa && res == ES_OK)
+ *gpa = ghcb->save.rbx;
+
+ __sev_put_ghcb(&state);
+
+ return res;
+}
+
+static void snp_register_per_cpu_ghcb(void)
+{
+ struct sev_es_runtime_data *data;
+ struct ghcb *ghcb;
+
+ data = this_cpu_read(runtime_data);
+ ghcb = &data->ghcb_page;
+
+ snp_register_ghcb_early(__pa(ghcb));
+}
+
+void setup_ghcb(void)
+{
+ if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
+ return;
+
+ /*
+ * Check whether the runtime #VC exception handler is active. It uses
+ * the per-CPU GHCB page which is set up by sev_es_init_vc_handling().
+ *
+ * If SNP is active, register the per-CPU GHCB page so that the runtime
+ * exception handler can use it.
+ */
+ if (initial_vc_handler == (unsigned long)kernel_exc_vmm_communication) {
+ if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ snp_register_per_cpu_ghcb();
+
+ sev_cfg.ghcbs_initialized = true;
+
+ return;
+ }
+
+ /*
+ * Make sure the hypervisor talks a supported protocol.
+ * This gets called only in the BSP boot phase.
+ */
+ if (!sev_es_negotiate_protocol())
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
+
+ /*
+ * Clear the boot_ghcb. The first exception comes in before the bss
+ * section is cleared.
+ */
+ memset(&boot_ghcb_page, 0, PAGE_SIZE);
+
+ /* Alright - Make the boot-ghcb public */
+ boot_ghcb = &boot_ghcb_page;
+
+ /* SNP guests require the GHCB GPA to be registered. */
+ if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ snp_register_ghcb_early(__pa(&boot_ghcb_page));
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void sev_es_ap_hlt_loop(void)
+{
+ struct ghcb_state state;
+ struct ghcb *ghcb;
+
+ ghcb = __sev_get_ghcb(&state);
+
+ while (true) {
+ vc_ghcb_invalidate(ghcb);
+ ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_HLT_LOOP);
+ ghcb_set_sw_exit_info_1(ghcb, 0);
+ ghcb_set_sw_exit_info_2(ghcb, 0);
+
+ sev_es_wr_ghcb_msr(__pa(ghcb));
+ VMGEXIT();
+
+ /* Wakeup signal? */
+ if (ghcb_sw_exit_info_2_is_valid(ghcb) &&
+ ghcb->save.sw_exit_info_2)
+ break;
+ }
+
+ __sev_put_ghcb(&state);
+}
+
+/*
+ * Play_dead handler when running under SEV-ES. This is needed because
+ * the hypervisor can't deliver an SIPI request to restart the AP.
+ * Instead the kernel has to issue a VMGEXIT to halt the VCPU until the
+ * hypervisor wakes it up again.
+ */
+static void sev_es_play_dead(void)
+{
+ play_dead_common();
+
+ /* IRQs now disabled */
+
+ sev_es_ap_hlt_loop();
+
+ /*
+ * If we get here, the VCPU was woken up again. Jump to CPU
+ * startup code to get it back online.
+ */
+ soft_restart_cpu();
+}
+#else /* CONFIG_HOTPLUG_CPU */
+#define sev_es_play_dead native_play_dead
+#endif /* CONFIG_HOTPLUG_CPU */
+
+#ifdef CONFIG_SMP
+static void __init sev_es_setup_play_dead(void)
+{
+ smp_ops.play_dead = sev_es_play_dead;
+}
+#else
+static inline void sev_es_setup_play_dead(void) { }
+#endif
+
+static void __init alloc_runtime_data(int cpu)
+{
+ struct sev_es_runtime_data *data;
+
+ data = memblock_alloc_node(sizeof(*data), PAGE_SIZE, cpu_to_node(cpu));
+ if (!data)
+ panic("Can't allocate SEV-ES runtime data");
+
+ per_cpu(runtime_data, cpu) = data;
+
+ if (snp_vmpl) {
+ struct svsm_ca *caa;
+
+ /* Allocate the SVSM CA page if an SVSM is present */
+ caa = cpu ? memblock_alloc_or_panic(sizeof(*caa), PAGE_SIZE)
+ : &boot_svsm_ca_page;
+
+ per_cpu(svsm_caa, cpu) = caa;
+ per_cpu(svsm_caa_pa, cpu) = __pa(caa);
+ }
+}
+
+static void __init init_ghcb(int cpu)
+{
+ struct sev_es_runtime_data *data;
+ int err;
+
+ data = per_cpu(runtime_data, cpu);
+
+ err = early_set_memory_decrypted((unsigned long)&data->ghcb_page,
+ sizeof(data->ghcb_page));
+ if (err)
+ panic("Can't map GHCBs unencrypted");
+
+ memset(&data->ghcb_page, 0, sizeof(data->ghcb_page));
+
+ data->ghcb_active = false;
+ data->backup_ghcb_active = false;
+}
+
+void __init sev_es_init_vc_handling(void)
+{
+ int cpu;
+
+ BUILD_BUG_ON(offsetof(struct sev_es_runtime_data, ghcb_page) % PAGE_SIZE);
+
+ if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
+ return;
+
+ if (!sev_es_check_cpu_features())
+ panic("SEV-ES CPU Features missing");
+
+ /*
+ * SNP is supported in v2 of the GHCB spec which mandates support for HV
+ * features.
+ */
+ if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) {
+ sev_hv_features = get_hv_features();
+
+ if (!(sev_hv_features & GHCB_HV_FT_SNP))
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
+ }
+
+ /* Initialize per-cpu GHCB pages */
+ for_each_possible_cpu(cpu) {
+ alloc_runtime_data(cpu);
+ init_ghcb(cpu);
+ }
+
+ if (snp_vmpl)
+ sev_cfg.use_cas = true;
+
+ sev_es_setup_play_dead();
+
+ /* Secondary CPUs use the runtime #VC handler */
+ initial_vc_handler = (unsigned long)kernel_exc_vmm_communication;
+}
+
+/*
+ * SEV-SNP guests should only execute dmi_setup() if EFI_CONFIG_TABLES is
+ * enabled, as the alternative (fallback) logic for DMI probing in the legacy
+ * ROM region can cause a crash since this region is not pre-validated.
+ */
+void __init snp_dmi_setup(void)
+{
+ if (efi_enabled(EFI_CONFIG_TABLES))
+ dmi_setup();
+}
+
+static void dump_cpuid_table(void)
+{
+ const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
+ int i = 0;
+
+ pr_info("count=%d reserved=0x%x reserved2=0x%llx\n",
+ cpuid_table->count, cpuid_table->__reserved1, cpuid_table->__reserved2);
+
+ for (i = 0; i < SNP_CPUID_COUNT_MAX; i++) {
+ const struct snp_cpuid_fn *fn = &cpuid_table->fn[i];
+
+ pr_info("index=%3d fn=0x%08x subfn=0x%08x: eax=0x%08x ebx=0x%08x ecx=0x%08x edx=0x%08x xcr0_in=0x%016llx xss_in=0x%016llx reserved=0x%016llx\n",
+ i, fn->eax_in, fn->ecx_in, fn->eax, fn->ebx, fn->ecx,
+ fn->edx, fn->xcr0_in, fn->xss_in, fn->__reserved);
+ }
+}
+
+/*
+ * It is useful from an auditing/testing perspective to provide an easy way
+ * for the guest owner to know that the CPUID table has been initialized as
+ * expected. That initialization happens too early in boot to print any
+ * sort of indicator, and there is no other good place to do it, so do
+ * it here.
+ *
+ * If running as an SNP guest, report the current VM privilege level (VMPL).
+ */
+static int __init report_snp_info(void)
+{
+ const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
+
+ if (cpuid_table->count) {
+ pr_info("Using SNP CPUID table, %d entries present.\n",
+ cpuid_table->count);
+
+ if (sev_cfg.debug)
+ dump_cpuid_table();
+ }
+
+ if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ pr_info("SNP running at VMPL%u.\n", snp_vmpl);
+
+ return 0;
+}
+arch_initcall(report_snp_info);
+
+static void update_attest_input(struct svsm_call *call, struct svsm_attest_call *input)
+{
+ /* If (new) lengths have been returned, propagate them up */
+ if (call->rcx_out != call->rcx)
+ input->manifest_buf.len = call->rcx_out;
+
+ if (call->rdx_out != call->rdx)
+ input->certificates_buf.len = call->rdx_out;
+
+ if (call->r8_out != call->r8)
+ input->report_buf.len = call->r8_out;
+}
+
+int snp_issue_svsm_attest_req(u64 call_id, struct svsm_call *call,
+ struct svsm_attest_call *input)
+{
+ struct svsm_attest_call *ac;
+ unsigned long flags;
+ u64 attest_call_pa;
+ int ret;
+
+ if (!snp_vmpl)
+ return -EINVAL;
+
+ local_irq_save(flags);
+
+ call->caa = svsm_get_caa();
+
+ ac = (struct svsm_attest_call *)call->caa->svsm_buffer;
+ attest_call_pa = svsm_get_caa_pa() + offsetof(struct svsm_ca, svsm_buffer);
+
+ *ac = *input;
+
+ /*
+ * Set input registers for the request and set RDX and R8 to known
+ * values in order to detect length values being returned in them.
+ */
+ call->rax = call_id;
+ call->rcx = attest_call_pa;
+ call->rdx = -1;
+ call->r8 = -1;
+ ret = svsm_perform_call_protocol(call);
+ update_attest_input(call, input);
+
+ local_irq_restore(flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(snp_issue_svsm_attest_req);
+
+static int snp_issue_guest_request(struct snp_guest_req *req)
+{
+ struct snp_req_data *input = &req->input;
+ struct ghcb_state state;
+ struct es_em_ctxt ctxt;
+ unsigned long flags;
+ struct ghcb *ghcb;
+ int ret;
+
+ req->exitinfo2 = SEV_RET_NO_FW_CALL;
+
+ /*
+ * __sev_get_ghcb() needs to run with IRQs disabled because it is using
+ * a per-CPU GHCB.
+ */
+ local_irq_save(flags);
+
+ ghcb = __sev_get_ghcb(&state);
+ if (!ghcb) {
+ ret = -EIO;
+ goto e_restore_irq;
+ }
+
+ vc_ghcb_invalidate(ghcb);
+
+ if (req->exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) {
+ ghcb_set_rax(ghcb, input->data_gpa);
+ ghcb_set_rbx(ghcb, input->data_npages);
+ }
+
+ ret = sev_es_ghcb_hv_call(ghcb, &ctxt, req->exit_code, input->req_gpa, input->resp_gpa);
+ if (ret)
+ goto e_put;
+
+ req->exitinfo2 = ghcb->save.sw_exit_info_2;
+ switch (req->exitinfo2) {
+ case 0:
+ break;
+
+ case SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_BUSY):
+ ret = -EAGAIN;
+ break;
+
+ case SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN):
+ /* The number of expected pages is returned in RBX */
+ if (req->exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) {
+ input->data_npages = ghcb_get_rbx(ghcb);
+ ret = -ENOSPC;
+ break;
+ }
+ fallthrough;
+ default:
+ ret = -EIO;
+ break;
+ }
+
+e_put:
+ __sev_put_ghcb(&state);
+e_restore_irq:
+ local_irq_restore(flags);
+
+ return ret;
+}
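+
+/*
+ * Note on the exitinfo2 encoding checked above: per the GHCB spec, the
+ * upper 32 bits of SW_EXITINFO2 carry the hypervisor error code and the
+ * lower 32 bits the firmware status, which is why the SNP_GUEST_VMM_ERR()
+ * values are compared against the full 64-bit value.
+ */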
+
+/**
+ * snp_svsm_vtpm_probe() - Probe if SVSM provides a vTPM device
+ *
+ * Check that there is SVSM and that it supports at least TPM_SEND_COMMAND
+ * which is the only request used so far.
+ *
+ * Return: true if the platform provides a vTPM SVSM device, false otherwise.
+ */
+static bool snp_svsm_vtpm_probe(void)
+{
+ struct svsm_call call = {};
+
+ /* The vTPM device is available only if a SVSM is present */
+ if (!snp_vmpl)
+ return false;
+
+ call.caa = svsm_get_caa();
+ call.rax = SVSM_VTPM_CALL(SVSM_VTPM_QUERY);
+
+ if (svsm_perform_call_protocol(&call))
+ return false;
+
+ /* Check that the platform commands contain TPM_SEND_COMMAND (platform command 8) */
+ return call.rcx_out & BIT_ULL(8);
+}
+
+/**
+ * snp_svsm_vtpm_send_command() - Execute a vTPM operation on SVSM
+ * @buffer: A buffer used to both send the command and receive the response.
+ *
+ * Execute a SVSM_VTPM_CMD call as defined by
+ * "Secure VM Service Module for SEV-SNP Guests" Publication # 58019 Revision: 1.00
+ *
+ * All command request/response buffers have a common structure as specified by
+ * the following table:
+ * Byte      Size       In/Out    Description
+ * Offset    (Bytes)
+ *
+ * 0x000     4          In        Platform command
+ *                      Out       Platform command response size
+ *
+ * Each command can build upon this common request/response structure to create
+ * a structure specific to the command. See include/linux/tpm_svsm.h for more
+ * details.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int snp_svsm_vtpm_send_command(u8 *buffer)
+{
+ struct svsm_call call = {};
+
+ call.caa = svsm_get_caa();
+ call.rax = SVSM_VTPM_CALL(SVSM_VTPM_CMD);
+ call.rcx = __pa(buffer);
+
+ return svsm_perform_call_protocol(&call);
+}
+EXPORT_SYMBOL_GPL(snp_svsm_vtpm_send_command);
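+
+/*
+ * Minimal usage sketch (illustrative; the concrete request/response
+ * structures live in include/linux/tpm_svsm.h). Following the common
+ * layout above, the caller places the platform command in the first four
+ * bytes, appends the marshalled TPM command, and on return reads the
+ * response size from the same location:
+ *
+ *     *(u32 *)buffer = 8;                        TPM_SEND_COMMAND
+ *     ret = snp_svsm_vtpm_send_command(buffer);
+ *     resp_size = *(u32 *)buffer;                valid when ret == 0
+ */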
+
+static struct platform_device sev_guest_device = {
+ .name = "sev-guest",
+ .id = -1,
+};
+
+static struct platform_device tpm_svsm_device = {
+ .name = "tpm-svsm",
+ .id = -1,
+};
+
+static int __init snp_init_platform_device(void)
+{
+ if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ return -ENODEV;
+
+ if (platform_device_register(&sev_guest_device))
+ return -ENODEV;
+
+ if (snp_svsm_vtpm_probe() &&
+ platform_device_register(&tpm_svsm_device))
+ return -ENODEV;
+
+ pr_info("SNP guest platform devices initialized.\n");
+ return 0;
+}
+device_initcall(snp_init_platform_device);
+
+void sev_show_status(void)
+{
+ int i;
+
+ pr_info("Status: ");
+ for (i = 0; i < MSR_AMD64_SNP_RESV_BIT; i++) {
+ if (sev_status & BIT_ULL(i)) {
+ if (!sev_status_feat_names[i])
+ continue;
+
+ pr_cont("%s ", sev_status_feat_names[i]);
+ }
+ }
+ pr_cont("\n");
+}
+
+#ifdef CONFIG_SYSFS
+static ssize_t vmpl_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", snp_vmpl);
+}
+
+static struct kobj_attribute vmpl_attr = __ATTR_RO(vmpl);
+
+static struct attribute *vmpl_attrs[] = {
+ &vmpl_attr.attr,
+ NULL
+};
+
+static struct attribute_group sev_attr_group = {
+ .attrs = vmpl_attrs,
+};
+
+static int __init sev_sysfs_init(void)
+{
+ struct kobject *sev_kobj;
+ struct device *dev_root;
+ int ret;
+
+ if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ return -ENODEV;
+
+ dev_root = bus_get_dev_root(&cpu_subsys);
+ if (!dev_root)
+ return -ENODEV;
+
+ sev_kobj = kobject_create_and_add("sev", &dev_root->kobj);
+ put_device(dev_root);
+
+ if (!sev_kobj)
+ return -ENOMEM;
+
+ ret = sysfs_create_group(sev_kobj, &sev_attr_group);
+ if (ret)
+ kobject_put(sev_kobj);
+
+ return ret;
+}
+arch_initcall(sev_sysfs_init);
+#endif // CONFIG_SYSFS
+
+static void free_shared_pages(void *buf, size_t sz)
+{
+ unsigned int npages = PAGE_ALIGN(sz) >> PAGE_SHIFT;
+ int ret;
+
+ if (!buf)
+ return;
+
+ ret = set_memory_encrypted((unsigned long)buf, npages);
+ if (ret) {
+ WARN_ONCE(ret, "failed to restore encryption mask (leak it)\n");
+ return;
+ }
+
+ __free_pages(virt_to_page(buf), get_order(sz));
+}
+
+static void *alloc_shared_pages(size_t sz)
+{
+ unsigned int npages = PAGE_ALIGN(sz) >> PAGE_SHIFT;
+ struct page *page;
+ int ret;
+
+ page = alloc_pages(GFP_KERNEL_ACCOUNT, get_order(sz));
+ if (!page)
+ return NULL;
+
+ ret = set_memory_decrypted((unsigned long)page_address(page), npages);
+ if (ret) {
+ pr_err("failed to mark page shared, ret=%d\n", ret);
+ __free_pages(page, get_order(sz));
+ return NULL;
+ }
+
+ return page_address(page);
+}
+
+static u8 *get_vmpck(int id, struct snp_secrets_page *secrets, u32 **seqno)
+{
+ u8 *key = NULL;
+
+ switch (id) {
+ case 0:
+ *seqno = &secrets->os_area.msg_seqno_0;
+ key = secrets->vmpck0;
+ break;
+ case 1:
+ *seqno = &secrets->os_area.msg_seqno_1;
+ key = secrets->vmpck1;
+ break;
+ case 2:
+ *seqno = &secrets->os_area.msg_seqno_2;
+ key = secrets->vmpck2;
+ break;
+ case 3:
+ *seqno = &secrets->os_area.msg_seqno_3;
+ key = secrets->vmpck3;
+ break;
+ default:
+ break;
+ }
+
+ return key;
+}
+
+static struct aesgcm_ctx *snp_init_crypto(u8 *key, size_t keylen)
+{
+ struct aesgcm_ctx *ctx;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return NULL;
+
+ if (aesgcm_expandkey(ctx, key, keylen, AUTHTAG_LEN)) {
+ pr_err("Crypto context initialization failed\n");
+ kfree(ctx);
+ return NULL;
+ }
+
+ return ctx;
+}
+
+int snp_msg_init(struct snp_msg_desc *mdesc, int vmpck_id)
+{
+ /* Adjust the default VMPCK key based on the executing VMPL level */
+ if (vmpck_id == -1)
+ vmpck_id = snp_vmpl;
+
+ mdesc->vmpck = get_vmpck(vmpck_id, mdesc->secrets, &mdesc->os_area_msg_seqno);
+ if (!mdesc->vmpck) {
+ pr_err("Invalid VMPCK%d communication key\n", vmpck_id);
+ return -EINVAL;
+ }
+
+ /* Verify that VMPCK is not zero. */
+ if (!memchr_inv(mdesc->vmpck, 0, VMPCK_KEY_LEN)) {
+ pr_err("Empty VMPCK%d communication key\n", vmpck_id);
+ return -EINVAL;
+ }
+
+ mdesc->vmpck_id = vmpck_id;
+
+ mdesc->ctx = snp_init_crypto(mdesc->vmpck, VMPCK_KEY_LEN);
+ if (!mdesc->ctx)
+ return -ENOMEM;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(snp_msg_init);
+
+struct snp_msg_desc *snp_msg_alloc(void)
+{
+ struct snp_msg_desc *mdesc;
+ void __iomem *mem;
+
+ BUILD_BUG_ON(sizeof(struct snp_guest_msg) > PAGE_SIZE);
+
+ mdesc = kzalloc(sizeof(struct snp_msg_desc), GFP_KERNEL);
+ if (!mdesc)
+ return ERR_PTR(-ENOMEM);
+
+ mem = ioremap_encrypted(sev_secrets_pa, PAGE_SIZE);
+ if (!mem)
+ goto e_free_mdesc;
+
+ mdesc->secrets = (__force struct snp_secrets_page *)mem;
+
+ /* Allocate the shared page used for the request and response message. */
+ mdesc->request = alloc_shared_pages(sizeof(struct snp_guest_msg));
+ if (!mdesc->request)
+ goto e_unmap;
+
+ mdesc->response = alloc_shared_pages(sizeof(struct snp_guest_msg));
+ if (!mdesc->response)
+ goto e_free_request;
+
+ return mdesc;
+
+e_free_request:
+ free_shared_pages(mdesc->request, sizeof(struct snp_guest_msg));
+e_unmap:
+ iounmap(mem);
+e_free_mdesc:
+ kfree(mdesc);
+
+ return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL_GPL(snp_msg_alloc);
+
+void snp_msg_free(struct snp_msg_desc *mdesc)
+{
+ if (!mdesc)
+ return;
+
+ kfree(mdesc->ctx);
+ free_shared_pages(mdesc->response, sizeof(struct snp_guest_msg));
+ free_shared_pages(mdesc->request, sizeof(struct snp_guest_msg));
+ iounmap((__force void __iomem *)mdesc->secrets);
+
+ memset(mdesc, 0, sizeof(*mdesc));
+ kfree(mdesc);
+}
+EXPORT_SYMBOL_GPL(snp_msg_free);
+
+/* Mutex to serialize the shared buffer access and command handling. */
+static DEFINE_MUTEX(snp_cmd_mutex);
+
+/*
+ * If an error is received from the host or AMD Secure Processor (ASP) there
+ * are two options. Either retry the exact same encrypted request or discontinue
+ * using the VMPCK.
+ *
+ * This is because in the current encryption scheme GHCB v2 uses AES-GCM to
+ * encrypt the requests. The IV for this scheme is the sequence number. GCM
+ * cannot tolerate IV reuse.
+ *
+ * The ASP FW v1.51 only increments the sequence numbers on a successful
+ * guest<->ASP round trip and only accepts messages at its exact expected
+ * sequence number.
+ *
+ * So if the sequence number were reused, the encryption scheme would be
+ * vulnerable. If the sequence number were incremented for a fresh IV, the
+ * ASP would reject the request.
+ */
+static void snp_disable_vmpck(struct snp_msg_desc *mdesc)
+{
+ pr_alert("Disabling VMPCK%d communication key to prevent IV reuse.\n",
+ mdesc->vmpck_id);
+ memzero_explicit(mdesc->vmpck, VMPCK_KEY_LEN);
+ mdesc->vmpck = NULL;
+}
+
+static inline u64 __snp_get_msg_seqno(struct snp_msg_desc *mdesc)
+{
+ u64 count;
+
+ lockdep_assert_held(&snp_cmd_mutex);
+
+ /* Read the current message sequence counter from the secrets page */
+ count = *mdesc->os_area_msg_seqno;
+
+ return count + 1;
+}
+
+/* Return a non-zero sequence number on success, zero on counter overflow */
+static u64 snp_get_msg_seqno(struct snp_msg_desc *mdesc)
+{
+ u64 count = __snp_get_msg_seqno(mdesc);
+
+ /*
+ * The message sequence counter for the SNP guest request is a 64-bit
+ * value, but version 2 of the GHCB specification defines only 32-bit
+ * storage for it. If the counter would exceed the 32-bit range, return
+ * zero. The caller should check the return value; even if it does not,
+ * the firmware treats zero as an invalid sequence number and will fail
+ * the message request.
+ */
+ if (count >= UINT_MAX) {
+ pr_err("request message sequence counter overflow\n");
+ return 0;
+ }
+
+ return count;
+}
+
+static void snp_inc_msg_seqno(struct snp_msg_desc *mdesc)
+{
+ /*
+ * The counter is also incremented by the PSP, so increment it by 2
+ * and save in secrets page.
+ */
+ *mdesc->os_area_msg_seqno += 2;
+}
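+
+/*
+ * Worked example (assuming a fresh VMPCK): the counter in the secrets
+ * page starts at 0, so the first request goes out with seqno 1 and the
+ * PSP replies with seqno 2. snp_inc_msg_seqno() then stores 2, making
+ * the next request seqno 3. Guest messages thus use odd numbers and PSP
+ * responses the following even numbers.
+ */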
+
+static int verify_and_dec_payload(struct snp_msg_desc *mdesc, struct snp_guest_req *req)
+{
+ struct snp_guest_msg *resp_msg = &mdesc->secret_response;
+ struct snp_guest_msg *req_msg = &mdesc->secret_request;
+ struct snp_guest_msg_hdr *req_msg_hdr = &req_msg->hdr;
+ struct snp_guest_msg_hdr *resp_msg_hdr = &resp_msg->hdr;
+ struct aesgcm_ctx *ctx = mdesc->ctx;
+ u8 iv[GCM_AES_IV_SIZE] = {};
+
+ pr_debug("response [seqno %lld type %d version %d sz %d]\n",
+ resp_msg_hdr->msg_seqno, resp_msg_hdr->msg_type, resp_msg_hdr->msg_version,
+ resp_msg_hdr->msg_sz);
+
+ /* Copy response from shared memory to encrypted memory. */
+ memcpy(resp_msg, mdesc->response, sizeof(*resp_msg));
+
+ /* Verify that the sequence counter is incremented by 1 */
+ if (unlikely(resp_msg_hdr->msg_seqno != (req_msg_hdr->msg_seqno + 1)))
+ return -EBADMSG;
+
+ /* Verify response message type and version number. */
+ if (resp_msg_hdr->msg_type != (req_msg_hdr->msg_type + 1) ||
+ resp_msg_hdr->msg_version != req_msg_hdr->msg_version)
+ return -EBADMSG;
+
+ /*
+ * If the message size is greater than our buffer length then return
+ * an error.
+ */
+ if (unlikely((resp_msg_hdr->msg_sz + ctx->authsize) > req->resp_sz))
+ return -EBADMSG;
+
+ /* Decrypt the payload */
+ memcpy(iv, &resp_msg_hdr->msg_seqno, min(sizeof(iv), sizeof(resp_msg_hdr->msg_seqno)));
+ if (!aesgcm_decrypt(ctx, req->resp_buf, resp_msg->payload, resp_msg_hdr->msg_sz,
+ &resp_msg_hdr->algo, AAD_LEN, iv, resp_msg_hdr->authtag))
+ return -EBADMSG;
+
+ return 0;
+}
+
+static int enc_payload(struct snp_msg_desc *mdesc, u64 seqno, struct snp_guest_req *req)
+{
+ struct snp_guest_msg *msg = &mdesc->secret_request;
+ struct snp_guest_msg_hdr *hdr = &msg->hdr;
+ struct aesgcm_ctx *ctx = mdesc->ctx;
+ u8 iv[GCM_AES_IV_SIZE] = {};
+
+ memset(msg, 0, sizeof(*msg));
+
+ hdr->algo = SNP_AEAD_AES_256_GCM;
+ hdr->hdr_version = MSG_HDR_VER;
+ hdr->hdr_sz = sizeof(*hdr);
+ hdr->msg_type = req->msg_type;
+ hdr->msg_version = req->msg_version;
+ hdr->msg_seqno = seqno;
+ hdr->msg_vmpck = req->vmpck_id;
+ hdr->msg_sz = req->req_sz;
+
+ /* Verify the sequence number is non-zero */
+ if (!hdr->msg_seqno)
+ return -ENOSR;
+
+ pr_debug("request [seqno %lld type %d version %d sz %d]\n",
+ hdr->msg_seqno, hdr->msg_type, hdr->msg_version, hdr->msg_sz);
+
+ if (WARN_ON((req->req_sz + ctx->authsize) > sizeof(msg->payload)))
+ return -EBADMSG;
+
+ memcpy(iv, &hdr->msg_seqno, min(sizeof(iv), sizeof(hdr->msg_seqno)));
+ aesgcm_encrypt(ctx, msg->payload, req->req_buf, req->req_sz, &hdr->algo,
+ AAD_LEN, iv, hdr->authtag);
+
+ return 0;
+}
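+
+/*
+ * IV sketch: GCM_AES_IV_SIZE is 12 bytes, of which only the low eight
+ * carry the little-endian sequence number; the rest stay zero. For
+ * seqno 3 the IV is therefore (illustrative):
+ *
+ *     03 00 00 00 00 00 00 00 | 00 00 00 00
+ *
+ * Since a sequence number is never reused with the same VMPCK, no IV
+ * repeats under a given key.
+ */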
+
+static int __handle_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req)
+{
+ unsigned long req_start = jiffies;
+ unsigned int override_npages = 0;
+ u64 override_err = 0;
+ int rc;
+
+retry_request:
+ /*
+ * Call firmware to process the request. In this function the encrypted
+ * message enters shared memory with the host. So after this call the
+ * sequence number must be incremented or the VMPCK must be deleted to
+ * prevent reuse of the IV.
+ */
+ rc = snp_issue_guest_request(req);
+ switch (rc) {
+ case -ENOSPC:
+ /*
+ * If the extended guest request fails due to having too
+ * small of a certificate data buffer, retry the same
+ * guest request without the extended data request in
+ * order to increment the sequence number and thus avoid
+ * IV reuse.
+ */
+ override_npages = req->input.data_npages;
+ req->exit_code = SVM_VMGEXIT_GUEST_REQUEST;
+
+ /*
+ * Override the error to inform callers the given extended
+ * request buffer size was too small and give the caller the
+ * required buffer size.
+ */
+ override_err = SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN);
+
+ /*
+ * If this call to the firmware succeeds, the sequence number can
+ * be incremented allowing for continued use of the VMPCK. If
+ * there is an error reflected in the return value, this value
+ * is checked further down and the result will be the deletion
+ * of the VMPCK and the error code being propagated back to the
+ * user as an ioctl() return code.
+ */
+ goto retry_request;
+
+ /*
+ * The host may return SNP_GUEST_VMM_ERR_BUSY if the request has been
+ * throttled. Retry in the driver to avoid returning and reusing the
+ * message sequence number on a different message.
+ */
+ case -EAGAIN:
+ if (jiffies - req_start > SNP_REQ_MAX_RETRY_DURATION) {
+ rc = -ETIMEDOUT;
+ break;
+ }
+ schedule_timeout_killable(SNP_REQ_RETRY_DELAY);
+ goto retry_request;
+ }
+
+ /*
+ * Increment the message sequence number. There is no harm in doing
+ * this now because decryption uses the value stored in the response
+ * structure and any failure will wipe the VMPCK, preventing further
+ * use anyway.
+ */
+ snp_inc_msg_seqno(mdesc);
+
+ if (override_err) {
+ req->exitinfo2 = override_err;
+
+ /*
+ * If an extended guest request was issued and the supplied certificate
+ * buffer was not large enough, a standard guest request was issued to
+ * prevent IV reuse. If the standard request was successful, return -EIO
+ * back to the caller as would have originally been returned.
+ */
+ if (!rc && override_err == SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN))
+ rc = -EIO;
+ }
+
+ if (override_npages)
+ req->input.data_npages = override_npages;
+
+ return rc;
+}
+
+int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req)
+{
+ u64 seqno;
+ int rc;
+
+ /*
+ * enc_payload() calls aesgcm_encrypt(), which can potentially offload to HW.
+ * The offload's DMA SG list of data to encrypt has to be in linear mapping.
+ */
+ if (!virt_addr_valid(req->req_buf) || !virt_addr_valid(req->resp_buf)) {
+ pr_warn("AES-GSM buffers must be in linear mapping");
+ return -EINVAL;
+ }
+
+ guard(mutex)(&snp_cmd_mutex);
+
+ /* Check that the VMPCK is not empty */
+ if (!mdesc->vmpck || !memchr_inv(mdesc->vmpck, 0, VMPCK_KEY_LEN)) {
+ pr_err_ratelimited("VMPCK is disabled\n");
+ return -ENOTTY;
+ }
+
+ /* Get the message sequence number and verify that it is non-zero */
+ seqno = snp_get_msg_seqno(mdesc);
+ if (!seqno)
+ return -EIO;
+
+ /* Clear shared memory's response for the host to populate. */
+ memset(mdesc->response, 0, sizeof(struct snp_guest_msg));
+
+ /* Encrypt the userspace-provided payload into mdesc->secret_request. */
+ rc = enc_payload(mdesc, seqno, req);
+ if (rc)
+ return rc;
+
+ /*
+ * Write the fully encrypted request to the shared unencrypted
+ * request page.
+ */
+ memcpy(mdesc->request, &mdesc->secret_request, sizeof(mdesc->secret_request));
+
+ /* Initialize the input address for guest request */
+ req->input.req_gpa = __pa(mdesc->request);
+ req->input.resp_gpa = __pa(mdesc->response);
+ req->input.data_gpa = req->certs_data ? __pa(req->certs_data) : 0;
+
+ rc = __handle_guest_request(mdesc, req);
+ if (rc) {
+ if (rc == -EIO &&
+ req->exitinfo2 == SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN))
+ return rc;
+
+ pr_alert("Detected error from ASP request. rc: %d, exitinfo2: 0x%llx\n",
+ rc, req->exitinfo2);
+
+ snp_disable_vmpck(mdesc);
+ return rc;
+ }
+
+ rc = verify_and_dec_payload(mdesc, req);
+ if (rc) {
+ pr_alert("Detected unexpected decode failure from ASP. rc: %d\n", rc);
+ snp_disable_vmpck(mdesc);
+ return rc;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(snp_send_guest_request);
+
+static int __init snp_get_tsc_info(void)
+{
+ struct snp_tsc_info_resp *tsc_resp;
+ struct snp_tsc_info_req *tsc_req;
+ struct snp_msg_desc *mdesc;
+ struct snp_guest_req req = {};
+ int rc = -ENOMEM;
+
+ tsc_req = kzalloc(sizeof(*tsc_req), GFP_KERNEL);
+ if (!tsc_req)
+ return rc;
+
+ /*
+ * The intermediate response buffer is used while decrypting the
+ * response payload. Make sure that it has enough space to cover
+ * the authtag.
+ */
+ tsc_resp = kzalloc(sizeof(*tsc_resp) + AUTHTAG_LEN, GFP_KERNEL);
+ if (!tsc_resp)
+ goto e_free_tsc_req;
+
+ mdesc = snp_msg_alloc();
+ if (IS_ERR_OR_NULL(mdesc))
+ goto e_free_tsc_resp;
+
+ rc = snp_msg_init(mdesc, snp_vmpl);
+ if (rc)
+ goto e_free_mdesc;
+
+ req.msg_version = MSG_HDR_VER;
+ req.msg_type = SNP_MSG_TSC_INFO_REQ;
+ req.vmpck_id = snp_vmpl;
+ req.req_buf = tsc_req;
+ req.req_sz = sizeof(*tsc_req);
+ req.resp_buf = (void *)tsc_resp;
+ req.resp_sz = sizeof(*tsc_resp) + AUTHTAG_LEN;
+ req.exit_code = SVM_VMGEXIT_GUEST_REQUEST;
+
+ rc = snp_send_guest_request(mdesc, &req);
+ if (rc)
+ goto e_request;
+
+ pr_debug("%s: response status 0x%x scale 0x%llx offset 0x%llx factor 0x%x\n",
+ __func__, tsc_resp->status, tsc_resp->tsc_scale, tsc_resp->tsc_offset,
+ tsc_resp->tsc_factor);
+
+ if (!tsc_resp->status) {
+ snp_tsc_scale = tsc_resp->tsc_scale;
+ snp_tsc_offset = tsc_resp->tsc_offset;
+ } else {
+ pr_err("Failed to get TSC info, response status 0x%x\n", tsc_resp->status);
+ rc = -EIO;
+ }
+
+e_request:
+ /* The response buffer contains sensitive data, explicitly clear it. */
+ memzero_explicit(tsc_resp, sizeof(*tsc_resp) + AUTHTAG_LEN);
+e_free_mdesc:
+ snp_msg_free(mdesc);
+e_free_tsc_resp:
+ kfree(tsc_resp);
+e_free_tsc_req:
+ kfree(tsc_req);
+
+ return rc;
+}
+
+void __init snp_secure_tsc_prepare(void)
+{
+ if (!cc_platform_has(CC_ATTR_GUEST_SNP_SECURE_TSC))
+ return;
+
+ if (snp_get_tsc_info()) {
+ pr_alert("Unable to retrieve Secure TSC info from ASP\n");
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SECURE_TSC);
+ }
+
+ pr_debug("SecureTSC enabled");
+}
+
+static unsigned long securetsc_get_tsc_khz(void)
+{
+ return snp_tsc_freq_khz;
+}
+
+void __init snp_secure_tsc_init(void)
+{
+ struct snp_secrets_page *secrets;
+ unsigned long tsc_freq_mhz;
+ void *mem;
+
+ if (!cc_platform_has(CC_ATTR_GUEST_SNP_SECURE_TSC))
+ return;
+
+ mem = early_memremap_encrypted(sev_secrets_pa, PAGE_SIZE);
+ if (!mem) {
+ pr_err("Unable to get TSC_FACTOR: failed to map the SNP secrets page.\n");
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SECURE_TSC);
+ }
+
+ secrets = (__force struct snp_secrets_page *)mem;
+
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+ rdmsrq(MSR_AMD64_GUEST_TSC_FREQ, tsc_freq_mhz);
+
+ /* Extract the guest TSC frequency in MHz from bits [17:0]; the rest is reserved */
+ tsc_freq_mhz &= GENMASK_ULL(17, 0);
+
+ snp_tsc_freq_khz = SNP_SCALE_TSC_FREQ(tsc_freq_mhz * 1000, secrets->tsc_factor);
+
+ x86_platform.calibrate_cpu = securetsc_get_tsc_khz;
+ x86_platform.calibrate_tsc = securetsc_get_tsc_khz;
+
+ early_memunmap(mem, PAGE_SIZE);
+}
diff --git a/arch/x86/coco/sev/noinstr.c b/arch/x86/coco/sev/noinstr.c
new file mode 100644
index 000000000000..b527eafb6312
--- /dev/null
+++ b/arch/x86/coco/sev/noinstr.c
@@ -0,0 +1,182 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2019 SUSE
+ *
+ * Author: Joerg Roedel <jroedel@suse.de>
+ */
+
+#define pr_fmt(fmt) "SEV: " fmt
+
+#include <linux/bug.h>
+#include <linux/kernel.h>
+
+#include <asm/cpu_entry_area.h>
+#include <asm/msr.h>
+#include <asm/ptrace.h>
+#include <asm/sev.h>
+#include <asm/sev-internal.h>
+
+static __always_inline bool on_vc_stack(struct pt_regs *regs)
+{
+ unsigned long sp = regs->sp;
+
+ /* User-mode RSP is not trusted */
+ if (user_mode(regs))
+ return false;
+
+ /* SYSCALL gap still has user-mode RSP */
+ if (ip_within_syscall_gap(regs))
+ return false;
+
+ return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC)));
+}
+
+/*
+ * This function handles the case when an NMI is raised in the #VC
+ * exception handler entry code, before the #VC handler has switched off
+ * its IST stack. In this case, the IST entry for #VC must be adjusted,
+ * so that any nested #VC exception will not overwrite the stack
+ * contents of the interrupted #VC handler.
+ *
+ * The IST entry is adjusted unconditionally so that it can also be
+ * unconditionally adjusted back in __sev_es_ist_exit(). Otherwise a
+ * nested sev_es_ist_exit() call may adjust back the IST entry too
+ * early.
+ *
+ * The __sev_es_ist_enter() and __sev_es_ist_exit() functions always run
+ * on the NMI IST stack, as they are only called from NMI handling code
+ * right now.
+ */
+void noinstr __sev_es_ist_enter(struct pt_regs *regs)
+{
+ unsigned long old_ist, new_ist;
+
+ /* Read old IST entry */
+ new_ist = old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
+
+ /*
+ * If NMI happened while on the #VC IST stack, set the new IST
+ * value below regs->sp, so that the interrupted stack frame is
+ * not overwritten by subsequent #VC exceptions.
+ */
+ if (on_vc_stack(regs))
+ new_ist = regs->sp;
+
+ /*
+ * Reserve additional 8 bytes and store old IST value so this
+ * adjustment can be unrolled in __sev_es_ist_exit().
+ */
+ new_ist -= sizeof(old_ist);
+ *(unsigned long *)new_ist = old_ist;
+
+ /* Set new IST entry */
+ this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist);
+}
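+
+/*
+ * Resulting stack picture when the NMI interrupted the #VC handler
+ * (sketch, higher addresses on top):
+ *
+ *     regs->sp ->  interrupted #VC stack frame
+ *     new_ist  ->  saved old_ist (8 bytes)   <- IST[VC] points here now
+ *
+ * __sev_es_ist_exit() restores IST[VC] from the value stored at new_ist.
+ */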
+
+void noinstr __sev_es_ist_exit(void)
+{
+ unsigned long ist;
+
+ /* Read IST entry */
+ ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
+
+ if (WARN_ON(ist == __this_cpu_ist_top_va(VC)))
+ return;
+
+ /* Read back old IST entry and write it to the TSS */
+ this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist);
+}
+
+void noinstr __sev_es_nmi_complete(void)
+{
+ struct ghcb_state state;
+ struct ghcb *ghcb;
+
+ ghcb = __sev_get_ghcb(&state);
+
+ vc_ghcb_invalidate(ghcb);
+ ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_NMI_COMPLETE);
+ ghcb_set_sw_exit_info_1(ghcb, 0);
+ ghcb_set_sw_exit_info_2(ghcb, 0);
+
+ sev_es_wr_ghcb_msr(__pa_nodebug(ghcb));
+ VMGEXIT();
+
+ __sev_put_ghcb(&state);
+}
+
+/*
+ * Nothing shall interrupt this code path while holding the per-CPU
+ * GHCB. The backup GHCB is only for NMIs interrupting this path.
+ *
+ * Callers must disable local interrupts around it.
+ */
+noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state)
+{
+ struct sev_es_runtime_data *data;
+ struct ghcb *ghcb;
+
+ WARN_ON(!irqs_disabled());
+
+ data = this_cpu_read(runtime_data);
+ ghcb = &data->ghcb_page;
+
+ if (unlikely(data->ghcb_active)) {
+ /* GHCB is already in use - save its contents */
+
+ if (unlikely(data->backup_ghcb_active)) {
+ /*
+ * Backup-GHCB is also already in use. There is no way
+ * to continue here so just kill the machine. To make
+ * panic() work, mark GHCBs inactive so that messages
+ * can be printed out.
+ */
+ data->ghcb_active = false;
+ data->backup_ghcb_active = false;
+
+ instrumentation_begin();
+ panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use");
+ instrumentation_end();
+ }
+
+ /* Mark backup_ghcb active before writing to it */
+ data->backup_ghcb_active = true;
+
+ state->ghcb = &data->backup_ghcb;
+
+ /* Backup GHCB content */
+ *state->ghcb = *ghcb;
+ } else {
+ state->ghcb = NULL;
+ data->ghcb_active = true;
+ }
+
+ return ghcb;
+}
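+
+/*
+ * Nesting scenario (illustrative): a #VC handler holds the GHCB when an
+ * NMI arrives whose handling also needs a GHCB. The NMI path then gets
+ * the per-CPU GHCB page back after its contents were saved to the backup
+ * GHCB, and __sev_put_ghcb() restores them on the way out. A third
+ * concurrent user cannot be handled, hence the panic above.
+ */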
+
+noinstr void __sev_put_ghcb(struct ghcb_state *state)
+{
+ struct sev_es_runtime_data *data;
+ struct ghcb *ghcb;
+
+ WARN_ON(!irqs_disabled());
+
+ data = this_cpu_read(runtime_data);
+ ghcb = &data->ghcb_page;
+
+ if (state->ghcb) {
+ /* Restore GHCB from Backup */
+ *ghcb = *state->ghcb;
+ data->backup_ghcb_active = false;
+ state->ghcb = NULL;
+ } else {
+ /*
+ * Invalidate the GHCB so a VMGEXIT instruction issued
+ * from userspace won't appear to be valid.
+ */
+ vc_ghcb_invalidate(ghcb);
+ data->ghcb_active = false;
+ }
+}
diff --git a/arch/x86/coco/sev/vc-handle.c b/arch/x86/coco/sev/vc-handle.c
new file mode 100644
index 000000000000..f08c7505ed82
--- /dev/null
+++ b/arch/x86/coco/sev/vc-handle.c
@@ -0,0 +1,1080 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2019 SUSE
+ *
+ * Author: Joerg Roedel <jroedel@suse.de>
+ */
+
+#define pr_fmt(fmt) "SEV: " fmt
+
+#include <linux/sched/debug.h> /* For show_regs() */
+#include <linux/cc_platform.h>
+#include <linux/printk.h>
+#include <linux/mm_types.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/io.h>
+#include <linux/psp-sev.h>
+#include <linux/efi.h>
+#include <uapi/linux/sev-guest.h>
+
+#include <asm/init.h>
+#include <asm/stacktrace.h>
+#include <asm/sev.h>
+#include <asm/sev-internal.h>
+#include <asm/insn-eval.h>
+#include <asm/fpu/xcr.h>
+#include <asm/processor.h>
+#include <asm/setup.h>
+#include <asm/traps.h>
+#include <asm/svm.h>
+#include <asm/smp.h>
+#include <asm/cpu.h>
+#include <asm/apic.h>
+#include <asm/cpuid/api.h>
+
+static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
+ unsigned long vaddr, phys_addr_t *paddr)
+{
+ unsigned long va = (unsigned long)vaddr;
+ unsigned int level;
+ phys_addr_t pa;
+ pgd_t *pgd;
+ pte_t *pte;
+
+ pgd = __va(read_cr3_pa());
+ pgd = &pgd[pgd_index(va)];
+ pte = lookup_address_in_pgd(pgd, va, &level);
+ if (!pte) {
+ ctxt->fi.vector = X86_TRAP_PF;
+ ctxt->fi.cr2 = vaddr;
+ ctxt->fi.error_code = 0;
+
+ if (user_mode(ctxt->regs))
+ ctxt->fi.error_code |= X86_PF_USER;
+
+ return ES_EXCEPTION;
+ }
+
+ if (WARN_ON_ONCE(pte_val(*pte) & _PAGE_ENC))
+ /* Emulated MMIO to/from encrypted memory not supported */
+ return ES_UNSUPPORTED;
+
+ pa = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
+ pa |= va & ~page_level_mask(level);
+
+ *paddr = pa;
+
+ return ES_OK;
+}
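+
+/*
+ * Example (illustrative): for a vaddr backed by a 2M mapping, level is
+ * PG_LEVEL_2M, pte_pfn() supplies the 2M-aligned frame address, and
+ * va & ~page_level_mask(level) contributes the low 21 offset bits.
+ */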
+
+static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size)
+{
+ BUG_ON(size > 4);
+
+ if (user_mode(ctxt->regs)) {
+ struct thread_struct *t = &current->thread;
+ struct io_bitmap *iobm = t->io_bitmap;
+ size_t idx;
+
+ if (!iobm)
+ goto fault;
+
+ for (idx = port; idx < port + size; ++idx) {
+ if (test_bit(idx, iobm->bitmap))
+ goto fault;
+ }
+ }
+
+ return ES_OK;
+
+fault:
+ ctxt->fi.vector = X86_TRAP_GP;
+ ctxt->fi.error_code = 0;
+
+ return ES_EXCEPTION;
+}
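+
+/*
+ * For illustration: as with the hardware TSS permission bitmap, a set
+ * bit denies access. A user-mode single-byte IN/OUT on port 0x3f8 is
+ * allowed only if bit 0x3f8 is clear; a multi-byte access must have all
+ * covered bits clear, otherwise the emulation raises #GP just as
+ * hardware would.
+ */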
+
+void vc_forward_exception(struct es_em_ctxt *ctxt)
+{
+ long error_code = ctxt->fi.error_code;
+ int trapnr = ctxt->fi.vector;
+
+ ctxt->regs->orig_ax = ctxt->fi.error_code;
+
+ switch (trapnr) {
+ case X86_TRAP_GP:
+ exc_general_protection(ctxt->regs, error_code);
+ break;
+ case X86_TRAP_UD:
+ exc_invalid_op(ctxt->regs);
+ break;
+ case X86_TRAP_PF:
+ write_cr2(ctxt->fi.cr2);
+ exc_page_fault(ctxt->regs, error_code);
+ break;
+ case X86_TRAP_AC:
+ exc_alignment_check(ctxt->regs, error_code);
+ break;
+ default:
+ pr_emerg("Unsupported exception in #VC instruction emulation - can't continue\n");
+ BUG();
+ }
+}
+
+static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt,
+ unsigned char *buffer)
+{
+ return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE);
+}
+
+static enum es_result __vc_decode_user_insn(struct es_em_ctxt *ctxt)
+{
+ char buffer[MAX_INSN_SIZE];
+ int insn_bytes;
+
+ insn_bytes = insn_fetch_from_user_inatomic(ctxt->regs, buffer);
+ if (insn_bytes == 0) {
+ /* Nothing could be copied */
+ ctxt->fi.vector = X86_TRAP_PF;
+ ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER;
+ ctxt->fi.cr2 = ctxt->regs->ip;
+ return ES_EXCEPTION;
+ } else if (insn_bytes == -EINVAL) {
+ /* Effective RIP could not be calculated */
+ ctxt->fi.vector = X86_TRAP_GP;
+ ctxt->fi.error_code = 0;
+ ctxt->fi.cr2 = 0;
+ return ES_EXCEPTION;
+ }
+
+ if (!insn_decode_from_regs(&ctxt->insn, ctxt->regs, buffer, insn_bytes))
+ return ES_DECODE_FAILED;
+
+ if (ctxt->insn.immediate.got)
+ return ES_OK;
+ else
+ return ES_DECODE_FAILED;
+}
+
+static enum es_result __vc_decode_kern_insn(struct es_em_ctxt *ctxt)
+{
+ char buffer[MAX_INSN_SIZE];
+ int res, ret;
+
+ res = vc_fetch_insn_kernel(ctxt, buffer);
+ if (res) {
+ ctxt->fi.vector = X86_TRAP_PF;
+ ctxt->fi.error_code = X86_PF_INSTR;
+ ctxt->fi.cr2 = ctxt->regs->ip;
+ return ES_EXCEPTION;
+ }
+
+ ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64);
+ if (ret < 0)
+ return ES_DECODE_FAILED;
+ else
+ return ES_OK;
+}
+
+/*
+ * User instruction decoding is also required for the EFI runtime. Even though
+ * the EFI runtime is running in kernel mode, it uses special EFI virtual
+ * address mappings that require the use of efi_mm to properly address and
+ * decode.
+ */
+static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
+{
+ if (user_mode(ctxt->regs) || mm_is_efi(current->active_mm))
+ return __vc_decode_user_insn(ctxt);
+ else
+ return __vc_decode_kern_insn(ctxt);
+}
+
+static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
+ char *dst, char *buf, size_t size)
+{
+ unsigned long error_code = X86_PF_PROT | X86_PF_WRITE;
+
+ /*
+ * This function uses __put_user() independent of whether kernel or user
+ * memory is accessed. This works fine because __put_user() does no
+ * sanity checks of the pointer being accessed. All that it does is
+ * to report when the access failed.
+ *
+ * Also, this function runs in atomic context, so __put_user() is not
+ * allowed to sleep. The page-fault handler detects that it is running
+ * in atomic context and will not try to take mmap_sem and handle the
+ * fault, so additional pagefault_enable()/disable() calls are not
+ * needed.
+ *
+ * The access can't be done via copy_to_user() here because
+ * vc_write_mem() must not use string instructions to access unsafe
+ * memory. The reason is that MOVS is emulated by the #VC handler by
+ * splitting the move up into a read and a write and taking a nested #VC
+ * exception on whatever of them is the MMIO access. Using string
+ * instructions here would cause infinite nesting.
+ */
+ switch (size) {
+ case 1: {
+ u8 d1;
+ u8 __user *target = (u8 __user *)dst;
+
+ memcpy(&d1, buf, 1);
+ if (__put_user(d1, target))
+ goto fault;
+ break;
+ }
+ case 2: {
+ u16 d2;
+ u16 __user *target = (u16 __user *)dst;
+
+ memcpy(&d2, buf, 2);
+ if (__put_user(d2, target))
+ goto fault;
+ break;
+ }
+ case 4: {
+ u32 d4;
+ u32 __user *target = (u32 __user *)dst;
+
+ memcpy(&d4, buf, 4);
+ if (__put_user(d4, target))
+ goto fault;
+ break;
+ }
+ case 8: {
+ u64 d8;
+ u64 __user *target = (u64 __user *)dst;
+
+ memcpy(&d8, buf, 8);
+ if (__put_user(d8, target))
+ goto fault;
+ break;
+ }
+ default:
+ WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
+ return ES_UNSUPPORTED;
+ }
+
+ return ES_OK;
+
+fault:
+ if (user_mode(ctxt->regs))
+ error_code |= X86_PF_USER;
+
+ ctxt->fi.vector = X86_TRAP_PF;
+ ctxt->fi.error_code = error_code;
+ ctxt->fi.cr2 = (unsigned long)dst;
+
+ return ES_EXCEPTION;
+}
+
+static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
+ char *src, char *buf, size_t size)
+{
+ unsigned long error_code = X86_PF_PROT;
+
+ /*
+ * This function uses __get_user() independent of whether kernel or user
+ * memory is accessed. This works fine because __get_user() does no
+ * sanity checks of the pointer being accessed. All that it does is
+ * to report when the access failed.
+ *
+ * Also, this function runs in atomic context, so __get_user() is not
+ * allowed to sleep. The page-fault handler detects that it is running
+ * in atomic context and will not try to take mmap_sem and handle the
+ * fault, so additional pagefault_enable()/disable() calls are not
+ * needed.
+ *
+ * The access can't be done via copy_from_user() here because
+ * vc_read_mem() must not use string instructions to access unsafe
+ * memory. The reason is that MOVS is emulated by the #VC handler by
+ * splitting the move up into a read and a write and taking a nested #VC
+ * exception on whatever of them is the MMIO access. Using string
+ * instructions here would cause infinite nesting.
+ */
+ switch (size) {
+ case 1: {
+ u8 d1;
+ u8 __user *s = (u8 __user *)src;
+
+ if (__get_user(d1, s))
+ goto fault;
+ memcpy(buf, &d1, 1);
+ break;
+ }
+ case 2: {
+ u16 d2;
+ u16 __user *s = (u16 __user *)src;
+
+ if (__get_user(d2, s))
+ goto fault;
+ memcpy(buf, &d2, 2);
+ break;
+ }
+ case 4: {
+ u32 d4;
+ u32 __user *s = (u32 __user *)src;
+
+ if (__get_user(d4, s))
+ goto fault;
+ memcpy(buf, &d4, 4);
+ break;
+ }
+ case 8: {
+ u64 d8;
+ u64 __user *s = (u64 __user *)src;
+
+ if (__get_user(d8, s))
+ goto fault;
+ memcpy(buf, &d8, 8);
+ break;
+ }
+ default:
+ WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
+ return ES_UNSUPPORTED;
+ }
+
+ return ES_OK;
+
+fault:
+ if (user_mode(ctxt->regs))
+ error_code |= X86_PF_USER;
+
+ ctxt->fi.vector = X86_TRAP_PF;
+ ctxt->fi.error_code = error_code;
+ ctxt->fi.cr2 = (unsigned long)src;
+
+ return ES_EXCEPTION;
+}
+
+#define sev_printk(fmt, ...) printk(fmt, ##__VA_ARGS__)
+#define error(v)
+
+#include "vc-shared.c"
+
+/* Writes to the SVSM CAA MSR are ignored */
+static enum es_result __vc_handle_msr_caa(struct pt_regs *regs, bool write)
+{
+ if (write)
+ return ES_OK;
+
+ regs->ax = lower_32_bits(this_cpu_read(svsm_caa_pa));
+ regs->dx = upper_32_bits(this_cpu_read(svsm_caa_pa));
+
+ return ES_OK;
+}
+
+/*
+ * TSC related accesses should not exit to the hypervisor when a guest is
+ * executing with Secure TSC enabled, so special handling is required for
+ * accesses of MSR_IA32_TSC and MSR_AMD64_GUEST_TSC_FREQ.
+ */
+static enum es_result __vc_handle_secure_tsc_msrs(struct es_em_ctxt *ctxt, bool write)
+{
+ struct pt_regs *regs = ctxt->regs;
+ u64 tsc;
+
+ /*
+ * Writing to MSR_IA32_TSC can cause subsequent reads of the TSC to
+ * return undefined values, and GUEST_TSC_FREQ is read-only. Generate
+ * a #GP on all writes.
+ */
+ if (write) {
+ ctxt->fi.vector = X86_TRAP_GP;
+ ctxt->fi.error_code = 0;
+ return ES_EXCEPTION;
+ }
+
+ /*
+ * GUEST_TSC_FREQ read should not be intercepted when Secure TSC is
+ * enabled. Terminate the guest if a read is attempted.
+ */
+ if (regs->cx == MSR_AMD64_GUEST_TSC_FREQ)
+ return ES_VMM_ERROR;
+
+ /* Reads of MSR_IA32_TSC should return the current TSC value. */
+ tsc = rdtsc_ordered();
+ regs->ax = lower_32_bits(tsc);
+ regs->dx = upper_32_bits(tsc);
+
+ return ES_OK;
+}
+
+enum es_result sev_es_ghcb_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt, bool write)
+{
+ struct pt_regs *regs = ctxt->regs;
+ enum es_result ret;
+
+ switch (regs->cx) {
+ case MSR_SVSM_CAA:
+ return __vc_handle_msr_caa(regs, write);
+ case MSR_IA32_TSC:
+ case MSR_AMD64_GUEST_TSC_FREQ:
+ if (sev_status & MSR_AMD64_SNP_SECURE_TSC)
+ return __vc_handle_secure_tsc_msrs(ctxt, write);
+ break;
+ case MSR_AMD64_SAVIC_CONTROL:
+ /*
+ * AMD64_SAVIC_CONTROL should not be intercepted when
+ * Secure AVIC is enabled. Terminate the Secure AVIC guest
+ * if the interception is enabled.
+ */
+ if (cc_platform_has(CC_ATTR_SNP_SECURE_AVIC))
+ return ES_VMM_ERROR;
+ break;
+ default:
+ break;
+ }
+
+ ghcb_set_rcx(ghcb, regs->cx);
+ if (write) {
+ ghcb_set_rax(ghcb, regs->ax);
+ ghcb_set_rdx(ghcb, regs->dx);
+ }
+
+ ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_MSR, write, 0);
+
+ if ((ret == ES_OK) && !write) {
+ regs->ax = ghcb->save.rax;
+ regs->dx = ghcb->save.rdx;
+ }
+
+ return ret;
+}
+
+static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
+{
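+ /* WRMSR is opcode 0f 30, RDMSR is 0f 32 */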
+ return sev_es_ghcb_handle_msr(ghcb, ctxt, ctxt->insn.opcode.bytes[1] == 0x30);
+}
+
+static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt)
+{
+ int trapnr = ctxt->fi.vector;
+
+ if (trapnr == X86_TRAP_PF)
+ native_write_cr2(ctxt->fi.cr2);
+
+ ctxt->regs->orig_ax = ctxt->fi.error_code;
+ do_early_exception(ctxt->regs, trapnr);
+}
+
+static long *vc_insn_get_rm(struct es_em_ctxt *ctxt)
+{
+ long *reg_array;
+ int offset;
+
+ reg_array = (long *)ctxt->regs;
+ offset = insn_get_modrm_rm_off(&ctxt->insn, ctxt->regs);
+
+ if (offset < 0)
+ return NULL;
+
+ offset /= sizeof(long);
+
+ return reg_array + offset;
+}
+
+static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
+ unsigned int bytes, bool read)
+{
+ u64 exit_code, exit_info_1, exit_info_2;
+ unsigned long ghcb_pa = __pa(ghcb);
+ enum es_result res;
+ phys_addr_t paddr;
+ void __user *ref;
+
+ ref = insn_get_addr_ref(&ctxt->insn, ctxt->regs);
+ if (ref == (void __user *)-1L)
+ return ES_UNSUPPORTED;
+
+ exit_code = read ? SVM_VMGEXIT_MMIO_READ : SVM_VMGEXIT_MMIO_WRITE;
+
+ res = vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr);
+ if (res != ES_OK) {
+ if (res == ES_EXCEPTION && !read)
+ ctxt->fi.error_code |= X86_PF_WRITE;
+
+ return res;
+ }
+
+ exit_info_1 = paddr;
+ /* Can never be greater than 8 */
+ exit_info_2 = bytes;
+
+ ghcb_set_sw_scratch(ghcb, ghcb_pa + offsetof(struct ghcb, shared_buffer));
+
+ return sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, exit_info_1, exit_info_2);
+}
+
+/*
+ * The MOVS instruction has two memory operands, which raises the
+ * problem that it is not known whether the access to the source or the
+ * destination caused the #VC exception (and hence whether an MMIO read
+ * or write operation needs to be emulated).
+ *
+ * Instead of playing games with walking page-tables and trying to guess
+ * whether the source or destination is an MMIO range, split the move
+ * into two operations, a read and a write with only one memory operand.
+ * This will cause a nested #VC exception on the MMIO address which can
+ * then be handled.
+ *
+ * This implementation has the benefit that it also supports MOVS where
+ * source _and_ destination are MMIO regions.
+ *
+ * It will slow MOVS on MMIO down a lot, but in SEV-ES guests it is a
+ * rare operation. If it turns out to be a performance problem the split
+ * operations can be moved to memcpy_fromio() and memcpy_toio().
+ */
+static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt,
+ unsigned int bytes)
+{
+ unsigned long ds_base, es_base;
+ unsigned char *src, *dst;
+ unsigned char buffer[8];
+ enum es_result ret;
+ bool rep;
+ int off;
+
+ ds_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_DS);
+ es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES);
+
+ if (ds_base == -1L || es_base == -1L) {
+ ctxt->fi.vector = X86_TRAP_GP;
+ ctxt->fi.error_code = 0;
+ return ES_EXCEPTION;
+ }
+
+ src = ds_base + (unsigned char *)ctxt->regs->si;
+ dst = es_base + (unsigned char *)ctxt->regs->di;
+
+ ret = vc_read_mem(ctxt, src, buffer, bytes);
+ if (ret != ES_OK)
+ return ret;
+
+ ret = vc_write_mem(ctxt, dst, buffer, bytes);
+ if (ret != ES_OK)
+ return ret;
+
+ if (ctxt->regs->flags & X86_EFLAGS_DF)
+ off = -bytes;
+ else
+ off = bytes;
+
+ ctxt->regs->si += off;
+ ctxt->regs->di += off;
+
+ rep = insn_has_rep_prefix(&ctxt->insn);
+ if (rep)
+ ctxt->regs->cx -= 1;
+
+ if (!rep || ctxt->regs->cx == 0)
+ return ES_OK;
+ else
+ return ES_RETRY;
+}
+
+static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
+{
+ struct insn *insn = &ctxt->insn;
+ enum insn_mmio_type mmio;
+ unsigned int bytes = 0;
+ enum es_result ret;
+ u8 sign_byte;
+ long *reg_data;
+
+ mmio = insn_decode_mmio(insn, &bytes);
+ if (mmio == INSN_MMIO_DECODE_FAILED)
+ return ES_DECODE_FAILED;
+
+ if (mmio != INSN_MMIO_WRITE_IMM && mmio != INSN_MMIO_MOVS) {
+ reg_data = insn_get_modrm_reg_ptr(insn, ctxt->regs);
+ if (!reg_data)
+ return ES_DECODE_FAILED;
+ }
+
+ if (user_mode(ctxt->regs))
+ return ES_UNSUPPORTED;
+
+ switch (mmio) {
+ case INSN_MMIO_WRITE:
+ memcpy(ghcb->shared_buffer, reg_data, bytes);
+ ret = vc_do_mmio(ghcb, ctxt, bytes, false);
+ break;
+ case INSN_MMIO_WRITE_IMM:
+ memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes);
+ ret = vc_do_mmio(ghcb, ctxt, bytes, false);
+ break;
+ case INSN_MMIO_READ:
+ ret = vc_do_mmio(ghcb, ctxt, bytes, true);
+ if (ret)
+ break;
+
+ /* Zero-extend for 32-bit operation */
+ if (bytes == 4)
+ *reg_data = 0;
+
+ memcpy(reg_data, ghcb->shared_buffer, bytes);
+ break;
+ case INSN_MMIO_READ_ZERO_EXTEND:
+ ret = vc_do_mmio(ghcb, ctxt, bytes, true);
+ if (ret)
+ break;
+
+ /* Zero extend based on operand size */
+ memset(reg_data, 0, insn->opnd_bytes);
+ memcpy(reg_data, ghcb->shared_buffer, bytes);
+ break;
+ case INSN_MMIO_READ_SIGN_EXTEND:
+ ret = vc_do_mmio(ghcb, ctxt, bytes, true);
+ if (ret)
+ break;
+
+ if (bytes == 1) {
+ u8 *val = (u8 *)ghcb->shared_buffer;
+
+ sign_byte = (*val & 0x80) ? 0xff : 0x00;
+ } else {
+ u16 *val = (u16 *)ghcb->shared_buffer;
+
+ sign_byte = (*val & 0x8000) ? 0xff : 0x00;
+ }
+
+ /* Sign extend based on operand size */
+ memset(reg_data, sign_byte, insn->opnd_bytes);
+ memcpy(reg_data, ghcb->shared_buffer, bytes);
+ break;
+ case INSN_MMIO_MOVS:
+ ret = vc_handle_mmio_movs(ctxt, bytes);
+ break;
+ default:
+ ret = ES_UNSUPPORTED;
+ break;
+ }
+
+ return ret;
+}
+
+static enum es_result vc_handle_dr7_write(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt)
+{
+ struct sev_es_runtime_data *data = this_cpu_read(runtime_data);
+ long val, *reg = vc_insn_get_rm(ctxt);
+ enum es_result ret;
+
+ if (sev_status & MSR_AMD64_SNP_DEBUG_SWAP)
+ return ES_VMM_ERROR;
+
+ if (!reg)
+ return ES_DECODE_FAILED;
+
+ val = *reg;
+
+ /* Upper 32 bits must be written as zeroes */
+ if (val >> 32) {
+ ctxt->fi.vector = X86_TRAP_GP;
+ ctxt->fi.error_code = 0;
+ return ES_EXCEPTION;
+ }
+
+ /* Clear out other reserved bits and set bit 10 */
+ val = (val & 0xffff23ffL) | BIT(10);
+
+ /* Early non-zero writes to DR7 are not supported */
+ if (!data && (val & ~DR7_RESET_VALUE))
+ return ES_UNSUPPORTED;
+
+ /* Using a value of 0 for ExitInfo1 means RAX holds the value */
+ ghcb_set_rax(ghcb, val);
+ ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WRITE_DR7, 0, 0);
+ if (ret != ES_OK)
+ return ret;
+
+ if (data)
+ data->dr7 = val;
+
+ return ES_OK;
+}
+
+static enum es_result vc_handle_dr7_read(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt)
+{
+ struct sev_es_runtime_data *data = this_cpu_read(runtime_data);
+ long *reg = vc_insn_get_rm(ctxt);
+
+ if (sev_status & MSR_AMD64_SNP_DEBUG_SWAP)
+ return ES_VMM_ERROR;
+
+ if (!reg)
+ return ES_DECODE_FAILED;
+
+ if (data)
+ *reg = data->dr7;
+ else
+ *reg = DR7_RESET_VALUE;
+
+ return ES_OK;
+}
+
+static enum es_result vc_handle_wbinvd(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt)
+{
+ return sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WBINVD, 0, 0);
+}
+
+static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
+{
+ enum es_result ret;
+
+ ghcb_set_rcx(ghcb, ctxt->regs->cx);
+
+ ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_RDPMC, 0, 0);
+ if (ret != ES_OK)
+ return ret;
+
+ if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb)))
+ return ES_VMM_ERROR;
+
+ ctxt->regs->ax = ghcb->save.rax;
+ ctxt->regs->dx = ghcb->save.rdx;
+
+ return ES_OK;
+}
+
+static enum es_result vc_handle_monitor(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt)
+{
+ /*
+ * Treat it as a NOP and do not leak a physical address to the
+ * hypervisor.
+ */
+ return ES_OK;
+}
+
+static enum es_result vc_handle_mwait(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt)
+{
+ /* Treat the same as MONITOR/MONITORX */
+ return ES_OK;
+}
+
+static enum es_result vc_handle_vmmcall(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt)
+{
+ enum es_result ret;
+
+ ghcb_set_rax(ghcb, ctxt->regs->ax);
+ ghcb_set_cpl(ghcb, user_mode(ctxt->regs) ? 3 : 0);
+
+ if (x86_platform.hyper.sev_es_hcall_prepare)
+ x86_platform.hyper.sev_es_hcall_prepare(ghcb, ctxt->regs);
+
+ ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_VMMCALL, 0, 0);
+ if (ret != ES_OK)
+ return ret;
+
+ if (!ghcb_rax_is_valid(ghcb))
+ return ES_VMM_ERROR;
+
+ ctxt->regs->ax = ghcb->save.rax;
+
+ /*
+ * Call sev_es_hcall_finish() after regs->ax is already set.
+ * This allows the hypervisor handler to overwrite it again if
+ * necessary.
+ */
+ if (x86_platform.hyper.sev_es_hcall_finish &&
+ !x86_platform.hyper.sev_es_hcall_finish(ghcb, ctxt->regs))
+ return ES_VMM_ERROR;
+
+ return ES_OK;
+}
+
+static enum es_result vc_handle_trap_ac(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt)
+{
+ /*
+ * Calling exc_alignment_check() directly does not work, because it
+ * enables IRQs and the GHCB is active. Forward the exception and call
+ * it later from vc_forward_exception().
+ */
+ ctxt->fi.vector = X86_TRAP_AC;
+ ctxt->fi.error_code = 0;
+ return ES_EXCEPTION;
+}
+
+static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt,
+ struct ghcb *ghcb,
+ unsigned long exit_code)
+{
+ enum es_result result = vc_check_opcode_bytes(ctxt, exit_code);
+
+ if (result != ES_OK)
+ return result;
+
+ switch (exit_code) {
+ case SVM_EXIT_READ_DR7:
+ result = vc_handle_dr7_read(ghcb, ctxt);
+ break;
+ case SVM_EXIT_WRITE_DR7:
+ result = vc_handle_dr7_write(ghcb, ctxt);
+ break;
+ case SVM_EXIT_EXCP_BASE + X86_TRAP_AC:
+ result = vc_handle_trap_ac(ghcb, ctxt);
+ break;
+ case SVM_EXIT_RDTSC:
+ case SVM_EXIT_RDTSCP:
+ result = vc_handle_rdtsc(ghcb, ctxt, exit_code);
+ break;
+ case SVM_EXIT_RDPMC:
+ result = vc_handle_rdpmc(ghcb, ctxt);
+ break;
+ case SVM_EXIT_INVD:
+ pr_err_ratelimited("#VC exception for INVD??? Seriously???\n");
+ result = ES_UNSUPPORTED;
+ break;
+ case SVM_EXIT_CPUID:
+ result = vc_handle_cpuid(ghcb, ctxt);
+ break;
+ case SVM_EXIT_IOIO:
+ result = vc_handle_ioio(ghcb, ctxt);
+ break;
+ case SVM_EXIT_MSR:
+ result = vc_handle_msr(ghcb, ctxt);
+ break;
+ case SVM_EXIT_VMMCALL:
+ result = vc_handle_vmmcall(ghcb, ctxt);
+ break;
+ case SVM_EXIT_WBINVD:
+ result = vc_handle_wbinvd(ghcb, ctxt);
+ break;
+ case SVM_EXIT_MONITOR:
+ result = vc_handle_monitor(ghcb, ctxt);
+ break;
+ case SVM_EXIT_MWAIT:
+ result = vc_handle_mwait(ghcb, ctxt);
+ break;
+ case SVM_EXIT_NPF:
+ result = vc_handle_mmio(ghcb, ctxt);
+ break;
+ default:
+ /*
+ * Unexpected #VC exception
+ */
+ result = ES_UNSUPPORTED;
+ }
+
+ return result;
+}
+
+static __always_inline bool is_vc2_stack(unsigned long sp)
+{
+ return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2));
+}
+
+static __always_inline bool vc_from_invalid_context(struct pt_regs *regs)
+{
+ unsigned long sp, prev_sp;
+
+ sp = (unsigned long)regs;
+ prev_sp = regs->sp;
+
+ /*
+ * If the code was already executing on the VC2 stack when the #VC
+ * happened, let it proceed to the normal handling routine. This way the
+ * code executing on the VC2 stack can cause #VC exceptions to get handled.
+ */
+ return is_vc2_stack(sp) && !is_vc2_stack(prev_sp);
+}
+
+static bool vc_raw_handle_exception(struct pt_regs *regs, unsigned long error_code)
+{
+ struct ghcb_state state;
+ struct es_em_ctxt ctxt;
+ enum es_result result;
+ struct ghcb *ghcb;
+ bool ret = true;
+
+ ghcb = __sev_get_ghcb(&state);
+
+ vc_ghcb_invalidate(ghcb);
+ result = vc_init_em_ctxt(&ctxt, regs, error_code);
+
+ if (result == ES_OK)
+ result = vc_handle_exitcode(&ctxt, ghcb, error_code);
+
+ __sev_put_ghcb(&state);
+
+ /* Done - now check the result */
+ switch (result) {
+ case ES_OK:
+ vc_finish_insn(&ctxt);
+ break;
+ case ES_UNSUPPORTED:
+ pr_err_ratelimited("Unsupported exit-code 0x%02lx in #VC exception (IP: 0x%lx)\n",
+ error_code, regs->ip);
+ ret = false;
+ break;
+ case ES_VMM_ERROR:
+ pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n",
+ error_code, regs->ip);
+ ret = false;
+ break;
+ case ES_DECODE_FAILED:
+ pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n",
+ error_code, regs->ip);
+ ret = false;
+ break;
+ case ES_EXCEPTION:
+ vc_forward_exception(&ctxt);
+ break;
+ case ES_RETRY:
+ /* Nothing to do */
+ break;
+ default:
+ pr_emerg("Unknown result in %s():%d\n", __func__, result);
+ /*
+ * Emulating the instruction which caused the #VC exception
+ * failed - can't continue so print debug information
+ */
+ BUG();
+ }
+
+ return ret;
+}
+
+static __always_inline bool vc_is_db(unsigned long error_code)
+{
+ return error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB;
+}
+
+/*
+ * Runtime #VC exception handler when raised from kernel mode. Runs in NMI mode
+ * and will panic when an error happens.
+ */
+DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication)
+{
+ irqentry_state_t irq_state;
+
+ /*
+ * With the current implementation it is always possible to switch to a
+ * safe stack because #VC exceptions only happen at known places, like
+ * intercepted instructions or accesses to MMIO areas/IO ports. They can
+ * also happen with code instrumentation when the hypervisor intercepts
+ * #DB, but the critical paths are forbidden to be instrumented, so #DB
+ * exceptions currently also only happen in safe places.
+ *
+ * But keep this here in case the noinstr annotations are violated due
+	 * to a bug elsewhere.
+ */
+ if (unlikely(vc_from_invalid_context(regs))) {
+ instrumentation_begin();
+ panic("Can't handle #VC exception from unsupported context\n");
+ instrumentation_end();
+ }
+
+ /*
+ * Handle #DB before calling into !noinstr code to avoid recursive #DB.
+ */
+ if (vc_is_db(error_code)) {
+ exc_debug(regs);
+ return;
+ }
+
+ irq_state = irqentry_nmi_enter(regs);
+
+ instrumentation_begin();
+
+ if (!vc_raw_handle_exception(regs, error_code)) {
+ /* Show some debug info */
+ show_regs(regs);
+
+ /* Ask hypervisor to sev_es_terminate */
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
+
+ /* If that fails and we get here - just panic */
+ panic("Returned from Terminate-Request to Hypervisor\n");
+ }
+
+ instrumentation_end();
+ irqentry_nmi_exit(regs, irq_state);
+}
+
+/*
+ * Runtime #VC exception handler when raised from user mode. Runs in IRQ mode
+ * and will kill the current task with SIGBUS when an error happens.
+ */
+DEFINE_IDTENTRY_VC_USER(exc_vmm_communication)
+{
+ /*
+ * Handle #DB before calling into !noinstr code to avoid recursive #DB.
+ */
+ if (vc_is_db(error_code)) {
+ noist_exc_debug(regs);
+ return;
+ }
+
+ irqentry_enter_from_user_mode(regs);
+ instrumentation_begin();
+
+ if (!vc_raw_handle_exception(regs, error_code)) {
+ /*
+ * Do not kill the machine if user-space triggered the
+ * exception. Send SIGBUS instead and let user-space deal with
+ * it.
+ */
+ force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0);
+ }
+
+ instrumentation_end();
+ irqentry_exit_to_user_mode(regs);
+}
+
+bool __init handle_vc_boot_ghcb(struct pt_regs *regs)
+{
+ unsigned long exit_code = regs->orig_ax;
+ struct es_em_ctxt ctxt;
+ enum es_result result;
+
+ vc_ghcb_invalidate(boot_ghcb);
+
+ result = vc_init_em_ctxt(&ctxt, regs, exit_code);
+ if (result == ES_OK)
+ result = vc_handle_exitcode(&ctxt, boot_ghcb, exit_code);
+
+ /* Done - now check the result */
+ switch (result) {
+ case ES_OK:
+ vc_finish_insn(&ctxt);
+ break;
+ case ES_UNSUPPORTED:
+ early_printk("PANIC: Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n",
+ exit_code, regs->ip);
+ goto fail;
+ case ES_VMM_ERROR:
+ early_printk("PANIC: Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n",
+ exit_code, regs->ip);
+ goto fail;
+ case ES_DECODE_FAILED:
+ early_printk("PANIC: Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n",
+ exit_code, regs->ip);
+ goto fail;
+ case ES_EXCEPTION:
+ vc_early_forward_exception(&ctxt);
+ break;
+ case ES_RETRY:
+ /* Nothing to do */
+ break;
+ default:
+ BUG();
+ }
+
+ return true;
+
+fail:
+ show_regs(regs);
+
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
+}
+
diff --git a/arch/x86/coco/sev/vc-shared.c b/arch/x86/coco/sev/vc-shared.c
new file mode 100644
index 000000000000..58b2f985d546
--- /dev/null
+++ b/arch/x86/coco/sev/vc-shared.c
@@ -0,0 +1,656 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#ifndef __BOOT_COMPRESSED
+#define has_cpuflag(f) cpu_feature_enabled(f)
+#endif
+
+static enum es_result vc_check_opcode_bytes(struct es_em_ctxt *ctxt,
+ unsigned long exit_code)
+{
+ unsigned int opcode = (unsigned int)ctxt->insn.opcode.value;
+ u8 modrm = ctxt->insn.modrm.value;
+
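+	/*
+	 * Note: ctxt->insn.opcode.value overlays the raw opcode bytes, so on
+	 * little-endian x86 a two-byte opcode such as CPUID (0F A2) is compared
+	 * as 0xa20f in the cases below.
+	 */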
+ switch (exit_code) {
+
+ case SVM_EXIT_IOIO:
+ case SVM_EXIT_NPF:
+ /* handled separately */
+ return ES_OK;
+
+ case SVM_EXIT_CPUID:
+ if (opcode == 0xa20f)
+ return ES_OK;
+ break;
+
+ case SVM_EXIT_INVD:
+ if (opcode == 0x080f)
+ return ES_OK;
+ break;
+
+ case SVM_EXIT_MONITOR:
+ /* MONITOR and MONITORX instructions generate the same error code */
+ if (opcode == 0x010f && (modrm == 0xc8 || modrm == 0xfa))
+ return ES_OK;
+ break;
+
+ case SVM_EXIT_MWAIT:
+ /* MWAIT and MWAITX instructions generate the same error code */
+ if (opcode == 0x010f && (modrm == 0xc9 || modrm == 0xfb))
+ return ES_OK;
+ break;
+
+ case SVM_EXIT_MSR:
+ /* RDMSR */
+ if (opcode == 0x320f ||
+ /* WRMSR */
+ opcode == 0x300f)
+ return ES_OK;
+ break;
+
+ case SVM_EXIT_RDPMC:
+ if (opcode == 0x330f)
+ return ES_OK;
+ break;
+
+ case SVM_EXIT_RDTSC:
+ if (opcode == 0x310f)
+ return ES_OK;
+ break;
+
+ case SVM_EXIT_RDTSCP:
+ if (opcode == 0x010f && modrm == 0xf9)
+ return ES_OK;
+ break;
+
+ case SVM_EXIT_READ_DR7:
+ if (opcode == 0x210f &&
+ X86_MODRM_REG(ctxt->insn.modrm.value) == 7)
+ return ES_OK;
+ break;
+
+ case SVM_EXIT_VMMCALL:
+ if (opcode == 0x010f && modrm == 0xd9)
+ return ES_OK;
+
+ break;
+
+ case SVM_EXIT_WRITE_DR7:
+ if (opcode == 0x230f &&
+ X86_MODRM_REG(ctxt->insn.modrm.value) == 7)
+ return ES_OK;
+ break;
+
+ case SVM_EXIT_WBINVD:
+ if (opcode == 0x90f)
+ return ES_OK;
+ break;
+
+ default:
+ break;
+ }
+
+ sev_printk(KERN_ERR "Wrong/unhandled opcode bytes: 0x%x, exit_code: 0x%lx, rIP: 0x%lx\n",
+ opcode, exit_code, ctxt->regs->ip);
+
+ return ES_UNSUPPORTED;
+}
+
+static bool vc_decoding_needed(unsigned long exit_code)
+{
+	/* Exceptions don't require decoding the instruction */
+ return !(exit_code >= SVM_EXIT_EXCP_BASE &&
+ exit_code <= SVM_EXIT_LAST_EXCP);
+}
+
+static enum es_result vc_init_em_ctxt(struct es_em_ctxt *ctxt,
+ struct pt_regs *regs,
+ unsigned long exit_code)
+{
+ enum es_result ret = ES_OK;
+
+ memset(ctxt, 0, sizeof(*ctxt));
+ ctxt->regs = regs;
+
+ if (vc_decoding_needed(exit_code))
+ ret = vc_decode_insn(ctxt);
+
+ return ret;
+}
+
+static void vc_finish_insn(struct es_em_ctxt *ctxt)
+{
+ ctxt->regs->ip += ctxt->insn.length;
+}
+
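+/*
+ * Reject string-I/O emulation when user space hands in a kernel address:
+ * inject a #PF with the USER (and, for writes, WRITE) error-code bits set
+ * instead of accessing kernel memory on the task's behalf.
+ */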
+static enum es_result vc_insn_string_check(struct es_em_ctxt *ctxt,
+ unsigned long address,
+ bool write)
+{
+ if (user_mode(ctxt->regs) && fault_in_kernel_space(address)) {
+ ctxt->fi.vector = X86_TRAP_PF;
+ ctxt->fi.error_code = X86_PF_USER;
+ ctxt->fi.cr2 = address;
+ if (write)
+ ctxt->fi.error_code |= X86_PF_WRITE;
+
+ return ES_EXCEPTION;
+ }
+
+ return ES_OK;
+}
+
+static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt,
+ void *src, char *buf,
+ unsigned int data_size,
+ unsigned int count,
+ bool backwards)
+{
+ int i, b = backwards ? -1 : 1;
+ unsigned long address = (unsigned long)src;
+ enum es_result ret;
+
+ ret = vc_insn_string_check(ctxt, address, false);
+ if (ret != ES_OK)
+ return ret;
+
+ for (i = 0; i < count; i++) {
+ void *s = src + (i * data_size * b);
+ char *d = buf + (i * data_size);
+
+ ret = vc_read_mem(ctxt, s, d, data_size);
+ if (ret != ES_OK)
+ break;
+ }
+
+ return ret;
+}
+
+static enum es_result vc_insn_string_write(struct es_em_ctxt *ctxt,
+ void *dst, char *buf,
+ unsigned int data_size,
+ unsigned int count,
+ bool backwards)
+{
+ int i, s = backwards ? -1 : 1;
+ unsigned long address = (unsigned long)dst;
+ enum es_result ret;
+
+ ret = vc_insn_string_check(ctxt, address, true);
+ if (ret != ES_OK)
+ return ret;
+
+ for (i = 0; i < count; i++) {
+ void *d = dst + (i * data_size * s);
+ char *b = buf + (i * data_size);
+
+ ret = vc_write_mem(ctxt, d, b, data_size);
+ if (ret != ES_OK)
+ break;
+ }
+
+ return ret;
+}
+
+#define IOIO_TYPE_STR BIT(2)
+#define IOIO_TYPE_IN 1
+#define IOIO_TYPE_INS (IOIO_TYPE_IN | IOIO_TYPE_STR)
+#define IOIO_TYPE_OUT 0
+#define IOIO_TYPE_OUTS (IOIO_TYPE_OUT | IOIO_TYPE_STR)
+
+#define IOIO_REP BIT(3)
+
+#define IOIO_ADDR_64 BIT(9)
+#define IOIO_ADDR_32 BIT(8)
+#define IOIO_ADDR_16 BIT(7)
+
+#define IOIO_DATA_32 BIT(6)
+#define IOIO_DATA_16 BIT(5)
+#define IOIO_DATA_8 BIT(4)
+
+#define IOIO_SEG_ES (0 << 10)
+#define IOIO_SEG_DS (3 << 10)
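+/*
+ * The macros above assemble the IOIO exit information: bit 0 selects IN vs.
+ * OUT, bit 2 marks string instructions, bit 3 a REP prefix, bits 4-6 the data
+ * size, bits 7-9 the address size, the segment is encoded from bit 10 and
+ * vc_ioio_exitinfo() places the port number in bits 16-31.
+ */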
+
+static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo)
+{
+ struct insn *insn = &ctxt->insn;
+ size_t size;
+ u64 port;
+
+ *exitinfo = 0;
+
+ switch (insn->opcode.bytes[0]) {
+ /* INS opcodes */
+ case 0x6c:
+ case 0x6d:
+ *exitinfo |= IOIO_TYPE_INS;
+ *exitinfo |= IOIO_SEG_ES;
+ port = ctxt->regs->dx & 0xffff;
+ break;
+
+ /* OUTS opcodes */
+ case 0x6e:
+ case 0x6f:
+ *exitinfo |= IOIO_TYPE_OUTS;
+ *exitinfo |= IOIO_SEG_DS;
+ port = ctxt->regs->dx & 0xffff;
+ break;
+
+ /* IN immediate opcodes */
+ case 0xe4:
+ case 0xe5:
+ *exitinfo |= IOIO_TYPE_IN;
+ port = (u8)insn->immediate.value & 0xffff;
+ break;
+
+ /* OUT immediate opcodes */
+ case 0xe6:
+ case 0xe7:
+ *exitinfo |= IOIO_TYPE_OUT;
+ port = (u8)insn->immediate.value & 0xffff;
+ break;
+
+ /* IN register opcodes */
+ case 0xec:
+ case 0xed:
+ *exitinfo |= IOIO_TYPE_IN;
+ port = ctxt->regs->dx & 0xffff;
+ break;
+
+ /* OUT register opcodes */
+ case 0xee:
+ case 0xef:
+ *exitinfo |= IOIO_TYPE_OUT;
+ port = ctxt->regs->dx & 0xffff;
+ break;
+
+ default:
+ return ES_DECODE_FAILED;
+ }
+
+ *exitinfo |= port << 16;
+
+ switch (insn->opcode.bytes[0]) {
+ case 0x6c:
+ case 0x6e:
+ case 0xe4:
+ case 0xe6:
+ case 0xec:
+ case 0xee:
+ /* Single byte opcodes */
+ *exitinfo |= IOIO_DATA_8;
+ size = 1;
+ break;
+ default:
+ /* Length determined by instruction parsing */
+ *exitinfo |= (insn->opnd_bytes == 2) ? IOIO_DATA_16
+ : IOIO_DATA_32;
+ size = (insn->opnd_bytes == 2) ? 2 : 4;
+ }
+
+ switch (insn->addr_bytes) {
+ case 2:
+ *exitinfo |= IOIO_ADDR_16;
+ break;
+ case 4:
+ *exitinfo |= IOIO_ADDR_32;
+ break;
+ case 8:
+ *exitinfo |= IOIO_ADDR_64;
+ break;
+ }
+
+ if (insn_has_rep_prefix(insn))
+ *exitinfo |= IOIO_REP;
+
+ return vc_ioio_check(ctxt, (u16)port, size);
+}
+
+static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
+{
+ struct pt_regs *regs = ctxt->regs;
+ u64 exit_info_1, exit_info_2;
+ enum es_result ret;
+
+ ret = vc_ioio_exitinfo(ctxt, &exit_info_1);
+ if (ret != ES_OK)
+ return ret;
+
+ if (exit_info_1 & IOIO_TYPE_STR) {
+
+ /* (REP) INS/OUTS */
+
+ bool df = ((regs->flags & X86_EFLAGS_DF) == X86_EFLAGS_DF);
+ unsigned int io_bytes, exit_bytes;
+ unsigned int ghcb_count, op_count;
+ unsigned long es_base;
+ u64 sw_scratch;
+
+ /*
+		 * For the string variants with rep prefix the number of in/out
+ * operations per #VC exception is limited so that the kernel
+ * has a chance to take interrupts and re-schedule while the
+ * instruction is emulated.
+ */
+ io_bytes = (exit_info_1 >> 4) & 0x7;
+ ghcb_count = sizeof(ghcb->shared_buffer) / io_bytes;
+
+ op_count = (exit_info_1 & IOIO_REP) ? regs->cx : 1;
+ exit_info_2 = min(op_count, ghcb_count);
+ exit_bytes = exit_info_2 * io_bytes;
+
+ es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES);
+
+ /* Read bytes of OUTS into the shared buffer */
+ if (!(exit_info_1 & IOIO_TYPE_IN)) {
+ ret = vc_insn_string_read(ctxt,
+ (void *)(es_base + regs->si),
+ ghcb->shared_buffer, io_bytes,
+ exit_info_2, df);
+ if (ret)
+ return ret;
+ }
+
+ /*
+		 * Issue a VMGEXIT to the HV to consume the bytes from the
+ * shared buffer or to have it write them into the shared buffer
+ * depending on the instruction: OUTS or INS.
+ */
+ sw_scratch = __pa(ghcb) + offsetof(struct ghcb, shared_buffer);
+ ghcb_set_sw_scratch(ghcb, sw_scratch);
+ ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO,
+ exit_info_1, exit_info_2);
+ if (ret != ES_OK)
+ return ret;
+
+ /* Read bytes from shared buffer into the guest's destination. */
+ if (exit_info_1 & IOIO_TYPE_IN) {
+ ret = vc_insn_string_write(ctxt,
+ (void *)(es_base + regs->di),
+ ghcb->shared_buffer, io_bytes,
+ exit_info_2, df);
+ if (ret)
+ return ret;
+
+ if (df)
+ regs->di -= exit_bytes;
+ else
+ regs->di += exit_bytes;
+ } else {
+ if (df)
+ regs->si -= exit_bytes;
+ else
+ regs->si += exit_bytes;
+ }
+
+ if (exit_info_1 & IOIO_REP)
+ regs->cx -= exit_info_2;
+
+ ret = regs->cx ? ES_RETRY : ES_OK;
+
+ } else {
+
+ /* IN/OUT into/from rAX */
+
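+		/*
+		 * IOIO_DATA_8/16/32 occupy bits 4-6 of exit_info_1, so shifting
+		 * the masked value right by one yields the access width in bits
+		 * (8, 16 or 32).
+		 */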
+ int bits = (exit_info_1 & 0x70) >> 1;
+ u64 rax = 0;
+
+ if (!(exit_info_1 & IOIO_TYPE_IN))
+ rax = lower_bits(regs->ax, bits);
+
+ ghcb_set_rax(ghcb, rax);
+
+ ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, exit_info_1, 0);
+ if (ret != ES_OK)
+ return ret;
+
+ if (exit_info_1 & IOIO_TYPE_IN) {
+ if (!ghcb_rax_is_valid(ghcb))
+ return ES_VMM_ERROR;
+ regs->ax = lower_bits(ghcb->save.rax, bits);
+ }
+ }
+
+ return ret;
+}
+
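+/*
+ * Check the outcome of a VMGEXIT: the hypervisor reports the result in the
+ * lower 32 bits of SW_EXITINFO1. Zero means success; a value of 1 means
+ * SW_EXITINFO2 carries event-injection information describing an exception
+ * to be forwarded to the guest.
+ */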
+enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
+{
+ u32 ret;
+
+ ret = ghcb->save.sw_exit_info_1 & GENMASK_ULL(31, 0);
+ if (!ret)
+ return ES_OK;
+
+ if (ret == 1) {
+ u64 info = ghcb->save.sw_exit_info_2;
+ unsigned long v = info & SVM_EVTINJ_VEC_MASK;
+
+ /* Check if exception information from hypervisor is sane. */
+ if ((info & SVM_EVTINJ_VALID) &&
+ ((v == X86_TRAP_GP) || (v == X86_TRAP_UD)) &&
+ ((info & SVM_EVTINJ_TYPE_MASK) == SVM_EVTINJ_TYPE_EXEPT)) {
+ ctxt->fi.vector = v;
+
+ if (info & SVM_EVTINJ_VALID_ERR)
+ ctxt->fi.error_code = info >> 32;
+
+ return ES_EXCEPTION;
+ }
+ }
+
+ return ES_VMM_ERROR;
+}
+
+enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt,
+ u64 exit_code, u64 exit_info_1,
+ u64 exit_info_2)
+{
+ /* Fill in protocol and format specifiers */
+ ghcb->protocol_version = ghcb_version;
+ ghcb->ghcb_usage = GHCB_DEFAULT_USAGE;
+
+ ghcb_set_sw_exit_code(ghcb, exit_code);
+ ghcb_set_sw_exit_info_1(ghcb, exit_info_1);
+ ghcb_set_sw_exit_info_2(ghcb, exit_info_2);
+
+ sev_es_wr_ghcb_msr(__pa(ghcb));
+ VMGEXIT();
+
+ return verify_exception_info(ghcb, ctxt);
+}
+
+static int __sev_cpuid_hv_ghcb(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf)
+{
+ u32 cr4 = native_read_cr4();
+ int ret;
+
+ ghcb_set_rax(ghcb, leaf->fn);
+ ghcb_set_rcx(ghcb, leaf->subfn);
+
+ if (cr4 & X86_CR4_OSXSAVE)
+ /* Safe to read xcr0 */
+ ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK));
+ else
+ /* xgetbv will cause #UD - use reset value for xcr0 */
+ ghcb_set_xcr0(ghcb, 1);
+
+ ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0);
+ if (ret != ES_OK)
+ return ret;
+
+ if (!(ghcb_rax_is_valid(ghcb) &&
+ ghcb_rbx_is_valid(ghcb) &&
+ ghcb_rcx_is_valid(ghcb) &&
+ ghcb_rdx_is_valid(ghcb)))
+ return ES_VMM_ERROR;
+
+ leaf->eax = ghcb->save.rax;
+ leaf->ebx = ghcb->save.rbx;
+ leaf->ecx = ghcb->save.rcx;
+ leaf->edx = ghcb->save.rdx;
+
+ return ES_OK;
+}
+
+struct cpuid_ctx {
+ struct ghcb *ghcb;
+ struct es_em_ctxt *ctxt;
+};
+
+static void snp_cpuid_hv_ghcb(void *p, struct cpuid_leaf *leaf)
+{
+ struct cpuid_ctx *ctx = p;
+
+ if (__sev_cpuid_hv_ghcb(ctx->ghcb, ctx->ctxt, leaf))
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV);
+}
+
+static int vc_handle_cpuid_snp(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
+{
+ struct cpuid_ctx ctx = { ghcb, ctxt };
+ struct pt_regs *regs = ctxt->regs;
+ struct cpuid_leaf leaf;
+ int ret;
+
+ leaf.fn = regs->ax;
+ leaf.subfn = regs->cx;
+ ret = snp_cpuid(snp_cpuid_hv_ghcb, &ctx, &leaf);
+ if (!ret) {
+ regs->ax = leaf.eax;
+ regs->bx = leaf.ebx;
+ regs->cx = leaf.ecx;
+ regs->dx = leaf.edx;
+ }
+
+ return ret;
+}
+
+static enum es_result vc_handle_cpuid(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt)
+{
+ struct pt_regs *regs = ctxt->regs;
+ u32 cr4 = native_read_cr4();
+ enum es_result ret;
+ int snp_cpuid_ret;
+
+ snp_cpuid_ret = vc_handle_cpuid_snp(ghcb, ctxt);
+ if (!snp_cpuid_ret)
+ return ES_OK;
+ if (snp_cpuid_ret != -EOPNOTSUPP)
+ return ES_VMM_ERROR;
+
+ ghcb_set_rax(ghcb, regs->ax);
+ ghcb_set_rcx(ghcb, regs->cx);
+
+ if (cr4 & X86_CR4_OSXSAVE)
+ /* Safe to read xcr0 */
+ ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK));
+ else
+		/* xgetbv will cause #UD - use reset value for xcr0 */
+ ghcb_set_xcr0(ghcb, 1);
+
+ if (has_cpuflag(X86_FEATURE_SHSTK) && regs->ax == 0xd && regs->cx == 1) {
+ struct msr m;
+
+ raw_rdmsr(MSR_IA32_XSS, &m);
+ ghcb_set_xss(ghcb, m.q);
+ }
+
+ ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0);
+ if (ret != ES_OK)
+ return ret;
+
+ if (!(ghcb_rax_is_valid(ghcb) &&
+ ghcb_rbx_is_valid(ghcb) &&
+ ghcb_rcx_is_valid(ghcb) &&
+ ghcb_rdx_is_valid(ghcb)))
+ return ES_VMM_ERROR;
+
+ regs->ax = ghcb->save.rax;
+ regs->bx = ghcb->save.rbx;
+ regs->cx = ghcb->save.rcx;
+ regs->dx = ghcb->save.rdx;
+
+ return ES_OK;
+}
+
+static enum es_result vc_handle_rdtsc(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt,
+ unsigned long exit_code)
+{
+ bool rdtscp = (exit_code == SVM_EXIT_RDTSCP);
+ enum es_result ret;
+
+ /*
+ * The hypervisor should not be intercepting RDTSC/RDTSCP when Secure
+ * TSC is enabled. A #VC exception will be generated if the RDTSC/RDTSCP
+ * instructions are being intercepted. If this should occur and Secure
+ * TSC is enabled, guest execution should be terminated as the guest
+ * cannot rely on the TSC value provided by the hypervisor.
+ */
+ if (sev_status & MSR_AMD64_SNP_SECURE_TSC)
+ return ES_VMM_ERROR;
+
+ ret = sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, 0, 0);
+ if (ret != ES_OK)
+ return ret;
+
+ if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb) &&
+ (!rdtscp || ghcb_rcx_is_valid(ghcb))))
+ return ES_VMM_ERROR;
+
+ ctxt->regs->ax = ghcb->save.rax;
+ ctxt->regs->dx = ghcb->save.rdx;
+ if (rdtscp)
+ ctxt->regs->cx = ghcb->save.rcx;
+
+ return ES_OK;
+}
+
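+/*
+ * Register the GHCB GPA with the hypervisor via the MSR protocol. The request
+ * carries the GHCB PFN; the response must echo the same PFN back, otherwise
+ * the guest is terminated.
+ */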
+void snp_register_ghcb_early(unsigned long paddr)
+{
+ unsigned long pfn = paddr >> PAGE_SHIFT;
+ u64 val;
+
+ sev_es_wr_ghcb_msr(GHCB_MSR_REG_GPA_REQ_VAL(pfn));
+ VMGEXIT();
+
+ val = sev_es_rd_ghcb_msr();
+
+ /* If the response GPA is not ours then abort the guest */
+ if ((GHCB_RESP_CODE(val) != GHCB_MSR_REG_GPA_RESP) ||
+ (GHCB_MSR_REG_GPA_RESP_VAL(val) != pfn))
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_REGISTER);
+}
+
+bool __init sev_es_check_cpu_features(void)
+{
+ if (!has_cpuflag(X86_FEATURE_RDRAND)) {
+ error("RDRAND instruction not supported - no trusted source of randomness available\n");
+ return false;
+ }
+
+ return true;
+}
+
+bool sev_es_negotiate_protocol(void)
+{
+ u64 val;
+
+ /* Do the GHCB protocol version negotiation */
+ sev_es_wr_ghcb_msr(GHCB_MSR_SEV_INFO_REQ);
+ VMGEXIT();
+ val = sev_es_rd_ghcb_msr();
+
+ if (GHCB_MSR_INFO(val) != GHCB_MSR_SEV_INFO_RESP)
+ return false;
+
+ if (GHCB_MSR_PROTO_MAX(val) < GHCB_PROTOCOL_MIN ||
+ GHCB_MSR_PROTO_MIN(val) > GHCB_PROTOCOL_MAX)
+ return false;
+
+ ghcb_version = min_t(size_t, GHCB_MSR_PROTO_MAX(val), GHCB_PROTOCOL_MAX);
+
+ return true;
+}
diff --git a/arch/x86/coco/tdx/Makefile b/arch/x86/coco/tdx/Makefile
new file mode 100644
index 000000000000..b3c47d3700e2
--- /dev/null
+++ b/arch/x86/coco/tdx/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-y += debug.o tdcall.o tdx.o tdx-shared.o
diff --git a/arch/x86/coco/tdx/debug.c b/arch/x86/coco/tdx/debug.c
new file mode 100644
index 000000000000..cef847c8bb67
--- /dev/null
+++ b/arch/x86/coco/tdx/debug.c
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#undef pr_fmt
+#define pr_fmt(fmt) "tdx: " fmt
+
+#include <linux/array_size.h>
+#include <linux/printk.h>
+#include <asm/tdx.h>
+
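+/* Map a bit position to its printable name for the dump helpers below. */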
+#define DEF_TDX_ATTR_NAME(_name) [TDX_ATTR_##_name##_BIT] = __stringify(_name)
+
+static __initdata const char *tdx_attributes[] = {
+ DEF_TDX_ATTR_NAME(DEBUG),
+ DEF_TDX_ATTR_NAME(HGS_PLUS_PROF),
+ DEF_TDX_ATTR_NAME(PERF_PROF),
+ DEF_TDX_ATTR_NAME(PMT_PROF),
+ DEF_TDX_ATTR_NAME(ICSSD),
+ DEF_TDX_ATTR_NAME(LASS),
+ DEF_TDX_ATTR_NAME(SEPT_VE_DISABLE),
+ DEF_TDX_ATTR_NAME(MIGRTABLE),
+ DEF_TDX_ATTR_NAME(PKS),
+ DEF_TDX_ATTR_NAME(KL),
+ DEF_TDX_ATTR_NAME(TPA),
+ DEF_TDX_ATTR_NAME(PERFMON),
+};
+
+#define DEF_TD_CTLS_NAME(_name) [TD_CTLS_##_name##_BIT] = __stringify(_name)
+
+static __initdata const char *tdcs_td_ctls[] = {
+ DEF_TD_CTLS_NAME(PENDING_VE_DISABLE),
+ DEF_TD_CTLS_NAME(ENUM_TOPOLOGY),
+ DEF_TD_CTLS_NAME(VIRT_CPUID2),
+ DEF_TD_CTLS_NAME(REDUCE_VE),
+ DEF_TD_CTLS_NAME(LOCK),
+};
+
+void __init tdx_dump_attributes(u64 td_attr)
+{
+ pr_info("Attributes:");
+
+ for (int i = 0; i < ARRAY_SIZE(tdx_attributes); i++) {
+ if (!tdx_attributes[i])
+ continue;
+ if (td_attr & BIT(i))
+ pr_cont(" %s", tdx_attributes[i]);
+ td_attr &= ~BIT(i);
+ }
+
+ if (td_attr)
+ pr_cont(" unknown:%#llx", td_attr);
+ pr_cont("\n");
+}
+
+void __init tdx_dump_td_ctls(u64 td_ctls)
+{
+ pr_info("TD_CTLS:");
+
+ for (int i = 0; i < ARRAY_SIZE(tdcs_td_ctls); i++) {
+ if (!tdcs_td_ctls[i])
+ continue;
+ if (td_ctls & BIT(i))
+ pr_cont(" %s", tdcs_td_ctls[i]);
+ td_ctls &= ~BIT(i);
+ }
+ if (td_ctls)
+ pr_cont(" unknown:%#llx", td_ctls);
+ pr_cont("\n");
+}
diff --git a/arch/x86/coco/tdx/tdcall.S b/arch/x86/coco/tdx/tdcall.S
new file mode 100644
index 000000000000..52d9786da308
--- /dev/null
+++ b/arch/x86/coco/tdx/tdcall.S
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <asm/asm-offsets.h>
+#include <asm/asm.h>
+
+#include <linux/linkage.h>
+#include <linux/errno.h>
+
+#include "../../virt/vmx/tdx/tdxcall.S"
+
+.section .noinstr.text, "ax"
+
+/*
+ * __tdcall() - Used by TDX guests to request services from the TDX
+ * module (does not include VMM services) using TDCALL instruction.
+ *
+ * __tdcall() function ABI:
+ *
+ * @fn (RDI) - TDCALL Leaf ID, moved to RAX
+ * @args (RSI) - struct tdx_module_args for input
+ *
+ * Only RCX/RDX/R8-R11 are used as input registers.
+ *
+ * Return status of TDCALL via RAX.
+ */
+SYM_FUNC_START(__tdcall)
+ TDX_MODULE_CALL host=0
+SYM_FUNC_END(__tdcall)
+
+/*
+ * __tdcall_ret() - Used by TDX guests to request services from the TDX
+ * module (does not include VMM services) using TDCALL instruction, with
+ * saving output registers to the 'struct tdx_module_args' used as input.
+ *
+ * __tdcall_ret() function ABI:
+ *
+ * @fn (RDI) - TDCALL Leaf ID, moved to RAX
+ * @args (RSI) - struct tdx_module_args for input and output
+ *
+ * Only RCX/RDX/R8-R11 are used as input/output registers.
+ *
+ * Return status of TDCALL via RAX.
+ */
+SYM_FUNC_START(__tdcall_ret)
+ TDX_MODULE_CALL host=0 ret=1
+SYM_FUNC_END(__tdcall_ret)
+
+/*
+ * __tdcall_saved_ret() - Used by TDX guests to request services from the
+ * TDX module (including VMM services) using TDCALL instruction, with
+ * saving output registers to the 'struct tdx_module_args' used as input.
+ *
+ * __tdcall_saved_ret() function ABI:
+ *
+ * @fn (RDI) - TDCALL leaf ID, moved to RAX
+ * @args (RSI) - struct tdx_module_args for input/output
+ *
+ * All registers in @args are used as input/output registers.
+ *
+ * On successful completion, return the hypercall error code.
+ */
+SYM_FUNC_START(__tdcall_saved_ret)
+ TDX_MODULE_CALL host=0 ret=1 saved=1
+SYM_FUNC_END(__tdcall_saved_ret)
diff --git a/arch/x86/coco/tdx/tdx-shared.c b/arch/x86/coco/tdx/tdx-shared.c
new file mode 100644
index 000000000000..1655aa56a0a5
--- /dev/null
+++ b/arch/x86/coco/tdx/tdx-shared.c
@@ -0,0 +1,91 @@
+#include <asm/tdx.h>
+#include <asm/pgtable.h>
+
+static unsigned long try_accept_one(phys_addr_t start, unsigned long len,
+ enum pg_level pg_level)
+{
+ unsigned long accept_size = page_level_size(pg_level);
+ struct tdx_module_args args = {};
+ u8 page_size;
+
+ if (!IS_ALIGNED(start, accept_size))
+ return 0;
+
+ if (len < accept_size)
+ return 0;
+
+ /*
+ * Pass the page physical address to the TDX module to accept the
+ * pending, private page.
+ *
+ * Bits 2:0 of RCX encode page size: 0 - 4K, 1 - 2M, 2 - 1G.
+ */
+ switch (pg_level) {
+ case PG_LEVEL_4K:
+ page_size = TDX_PS_4K;
+ break;
+ case PG_LEVEL_2M:
+ page_size = TDX_PS_2M;
+ break;
+ case PG_LEVEL_1G:
+ page_size = TDX_PS_1G;
+ break;
+ default:
+ return 0;
+ }
+
+ args.rcx = start | page_size;
+ if (__tdcall(TDG_MEM_PAGE_ACCEPT, &args))
+ return 0;
+
+ return accept_size;
+}
+
+bool tdx_accept_memory(phys_addr_t start, phys_addr_t end)
+{
+ /*
+ * For shared->private conversion, accept the page using
+ * TDG_MEM_PAGE_ACCEPT TDX module call.
+ */
+ while (start < end) {
+ unsigned long len = end - start;
+ unsigned long accept_size;
+
+ /*
+ * Try larger accepts first. It gives chance to VMM to keep
+ * 1G/2M Secure EPT entries where possible and speeds up
+ * process by cutting number of hypercalls (if successful).
+ */
+
+ accept_size = try_accept_one(start, len, PG_LEVEL_1G);
+ if (!accept_size)
+ accept_size = try_accept_one(start, len, PG_LEVEL_2M);
+ if (!accept_size)
+ accept_size = try_accept_one(start, len, PG_LEVEL_4K);
+ if (!accept_size)
+ return false;
+ start += accept_size;
+ }
+
+ return true;
+}
+
+noinstr u64 __tdx_hypercall(struct tdx_module_args *args)
+{
+ /*
+ * For TDVMCALL explicitly set RCX to the bitmap of shared registers.
+ * The caller isn't expected to set @args->rcx anyway.
+ */
+ args->rcx = TDVMCALL_EXPOSE_REGS_MASK;
+
+ /*
+ * Failure of __tdcall_saved_ret() indicates a failure of the TDVMCALL
+ * mechanism itself and that something has gone horribly wrong with
+ * the TDX module. __tdx_hypercall_failed() never returns.
+ */
+ if (__tdcall_saved_ret(TDG_VP_VMCALL, args))
+ __tdx_hypercall_failed();
+
+ /* TDVMCALL leaf return code is in R10 */
+ return args->r10;
+}
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
new file mode 100644
index 000000000000..7b2833705d47
--- /dev/null
+++ b/arch/x86/coco/tdx/tdx.c
@@ -0,0 +1,1196 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2021-2022 Intel Corporation */
+
+#undef pr_fmt
+#define pr_fmt(fmt) "tdx: " fmt
+
+#include <linux/cpufeature.h>
+#include <linux/export.h>
+#include <linux/io.h>
+#include <linux/kexec.h>
+#include <asm/coco.h>
+#include <asm/tdx.h>
+#include <asm/vmx.h>
+#include <asm/ia32.h>
+#include <asm/insn.h>
+#include <asm/insn-eval.h>
+#include <asm/paravirt_types.h>
+#include <asm/pgtable.h>
+#include <asm/set_memory.h>
+#include <asm/traps.h>
+
+/* MMIO direction */
+#define EPT_READ 0
+#define EPT_WRITE 1
+
+/* Port I/O direction */
+#define PORT_READ 0
+#define PORT_WRITE 1
+
+/* See Exit Qualification for I/O Instructions in VMX documentation */
+#define VE_IS_IO_IN(e) ((e) & BIT(3))
+#define VE_GET_IO_SIZE(e) (((e) & GENMASK(2, 0)) + 1)
+#define VE_GET_PORT_NUM(e) ((e) >> 16)
+#define VE_IS_IO_STRING(e) ((e) & BIT(4))
+
+/* TDX Module call error codes */
+#define TDCALL_RETURN_CODE(a) ((a) >> 32)
+#define TDCALL_INVALID_OPERAND 0xc0000100
+#define TDCALL_OPERAND_BUSY 0x80000200
+
+#define TDREPORT_SUBTYPE_0 0
+
+static atomic_long_t nr_shared;
+
+/* Called from __tdx_hypercall() for unrecoverable failure */
+noinstr void __noreturn __tdx_hypercall_failed(void)
+{
+ instrumentation_begin();
+ panic("TDVMCALL failed. TDX module bug?");
+}
+
+#ifdef CONFIG_KVM_GUEST
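+/*
+ * Forward a KVM hypercall to the host: the hypercall number goes in R10 and
+ * the four arguments in R11-R14 of the TDVMCALL issued via __tdx_hypercall().
+ */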
+long tdx_kvm_hypercall(unsigned int nr, unsigned long p1, unsigned long p2,
+ unsigned long p3, unsigned long p4)
+{
+ struct tdx_module_args args = {
+ .r10 = nr,
+ .r11 = p1,
+ .r12 = p2,
+ .r13 = p3,
+ .r14 = p4,
+ };
+
+ return __tdx_hypercall(&args);
+}
+EXPORT_SYMBOL_GPL(tdx_kvm_hypercall);
+#endif
+
+/*
+ * Used for TDX guests to make calls directly to the TD module. This
+ * should only be used for calls that have no legitimate reason to fail
+ * or where the kernel can not survive the call failing.
+ */
+static inline void tdcall(u64 fn, struct tdx_module_args *args)
+{
+ if (__tdcall_ret(fn, args))
+ panic("TDCALL %lld failed (Buggy TDX module!)\n", fn);
+}
+
+/* Read TD-scoped metadata */
+static inline u64 tdg_vm_rd(u64 field, u64 *value)
+{
+ struct tdx_module_args args = {
+ .rdx = field,
+ };
+ u64 ret;
+
+ ret = __tdcall_ret(TDG_VM_RD, &args);
+ *value = args.r8;
+
+ return ret;
+}
+
+/* Write TD-scoped metadata */
+static inline u64 tdg_vm_wr(u64 field, u64 value, u64 mask)
+{
+ struct tdx_module_args args = {
+ .rdx = field,
+ .r8 = value,
+ .r9 = mask,
+ };
+
+ return __tdcall(TDG_VM_WR, &args);
+}
+
+/**
+ * tdx_mcall_get_report0() - Wrapper to get TDREPORT0 (a.k.a. TDREPORT
+ * subtype 0) using TDG.MR.REPORT TDCALL.
+ * @reportdata: Address of the input buffer which contains user-defined
+ * REPORTDATA to be included into TDREPORT.
+ * @tdreport: Address of the output buffer to store TDREPORT.
+ *
+ * Refer to section titled "TDG.MR.REPORT leaf" in the TDX Module v1.0
+ * specification for more information on TDG.MR.REPORT TDCALL.
+ *
+ * It is used in the TDX guest driver module to get the TDREPORT0.
+ *
+ * Return 0 on success, -ENXIO for invalid operands, -EBUSY for busy operation,
+ * or -EIO on other TDCALL failures.
+ */
+int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport)
+{
+ struct tdx_module_args args = {
+ .rcx = virt_to_phys(tdreport),
+ .rdx = virt_to_phys(reportdata),
+ .r8 = TDREPORT_SUBTYPE_0,
+ };
+ u64 ret;
+
+ ret = __tdcall(TDG_MR_REPORT, &args);
+ if (ret) {
+ if (TDCALL_RETURN_CODE(ret) == TDCALL_INVALID_OPERAND)
+ return -ENXIO;
+ else if (TDCALL_RETURN_CODE(ret) == TDCALL_OPERAND_BUSY)
+ return -EBUSY;
+ return -EIO;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(tdx_mcall_get_report0);
+
+/**
+ * tdx_mcall_extend_rtmr() - Wrapper to extend RTMR registers using
+ * TDG.MR.RTMR.EXTEND TDCALL.
+ * @index: Index of RTMR register to be extended.
+ * @data: Address of the input buffer with RTMR register extend data.
+ *
+ * Refer to section titled "TDG.MR.RTMR.EXTEND leaf" in the TDX Module v1.0
+ * specification for more information on TDG.MR.RTMR.EXTEND TDCALL.
+ *
+ * It is used in the TDX guest driver module to allow user to extend the RTMR
+ * registers.
+ *
+ * Return 0 on success, -ENXIO for invalid operands, -EBUSY for busy operation,
+ * or -EIO on other TDCALL failures.
+ */
+int tdx_mcall_extend_rtmr(u8 index, u8 *data)
+{
+ struct tdx_module_args args = {
+ .rcx = virt_to_phys(data),
+ .rdx = index,
+ };
+ u64 ret;
+
+ ret = __tdcall(TDG_MR_RTMR_EXTEND, &args);
+ if (ret) {
+ if (TDCALL_RETURN_CODE(ret) == TDCALL_INVALID_OPERAND)
+ return -ENXIO;
+ if (TDCALL_RETURN_CODE(ret) == TDCALL_OPERAND_BUSY)
+ return -EBUSY;
+ return -EIO;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(tdx_mcall_extend_rtmr);
+
+/**
+ * tdx_hcall_get_quote() - Wrapper to request TD Quote using GetQuote
+ * hypercall.
+ * @buf: Address of the directly mapped shared kernel buffer which
+ * contains TDREPORT. The same buffer will be used by VMM to
+ * store the generated TD Quote output.
+ * @size: size of the tdquote buffer (4KB-aligned).
+ *
+ * Refer to section titled "TDG.VP.VMCALL<GetQuote>" in the TDX GHCI
+ * v1.0 specification for more information on GetQuote hypercall.
+ * It is used in the TDX guest driver module to get the TD Quote.
+ *
+ * Return 0 on success or error code on failure.
+ */
+u64 tdx_hcall_get_quote(u8 *buf, size_t size)
+{
+ /* Since buf is a shared memory, set the shared (decrypted) bits */
+ return _tdx_hypercall(TDVMCALL_GET_QUOTE, cc_mkdec(virt_to_phys(buf)), size, 0, 0);
+}
+EXPORT_SYMBOL_GPL(tdx_hcall_get_quote);
+
+static void __noreturn tdx_panic(const char *msg)
+{
+ struct tdx_module_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = TDVMCALL_REPORT_FATAL_ERROR,
+ .r12 = 0, /* Error code: 0 is Panic */
+ };
+ union {
+ /* Define register order according to the GHCI */
+ struct { u64 r14, r15, rbx, rdi, rsi, r8, r9, rdx; };
+
+ char bytes[64] __nonstring;
+ } message;
+
+ /* VMM assumes '\0' in byte 65, if the message took all 64 bytes */
+ strtomem_pad(message.bytes, msg, '\0');
+
+ args.r8 = message.r8;
+ args.r9 = message.r9;
+ args.r14 = message.r14;
+ args.r15 = message.r15;
+ args.rdi = message.rdi;
+ args.rsi = message.rsi;
+ args.rbx = message.rbx;
+ args.rdx = message.rdx;
+
+ /*
+ * This hypercall should never return and it is not safe
+ * to keep the guest running. Call it forever if it
+ * happens to return.
+ */
+ while (1)
+ __tdx_hypercall(&args);
+}
+
+/*
+ * The kernel cannot handle #VEs when accessing normal kernel memory. Ensure
+ * that no #VE will be delivered for accesses to TD-private memory.
+ *
+ * TDX 1.0 does not allow the guest to disable SEPT #VE on its own. The VMM
+ * controls if the guest will receive such #VE with TD attribute
+ * TDX_ATTR_SEPT_VE_DISABLE.
+ *
+ * Newer TDX modules allow the guest to control if it wants to receive SEPT
+ * violation #VEs.
+ *
+ * Check if the feature is available and disable SEPT #VE if possible.
+ *
+ * If the TD is allowed to disable/enable SEPT #VEs, the TDX_ATTR_SEPT_VE_DISABLE
+ * attribute is no longer reliable. It reflects the initial state of the
+ * control for the TD, but it will not be updated if someone (e.g. bootloader)
+ * changes it before the kernel starts. Kernel must check TDCS_TD_CTLS bit to
+ * determine if SEPT #VEs are enabled or disabled.
+ */
+static void disable_sept_ve(u64 td_attr)
+{
+ const char *msg = "TD misconfiguration: SEPT #VE has to be disabled";
+ bool debug = td_attr & TDX_ATTR_DEBUG;
+ u64 config, controls;
+
+ /* Is this TD allowed to disable SEPT #VE */
+ tdg_vm_rd(TDCS_CONFIG_FLAGS, &config);
+ if (!(config & TDCS_CONFIG_FLEXIBLE_PENDING_VE)) {
+ /* No SEPT #VE controls for the guest: check the attribute */
+ if (td_attr & TDX_ATTR_SEPT_VE_DISABLE)
+ return;
+
+ /* Relax SEPT_VE_DISABLE check for debug TD for backtraces */
+ if (debug)
+ pr_warn("%s\n", msg);
+ else
+ tdx_panic(msg);
+ return;
+ }
+
+ /* Check if SEPT #VE has been disabled before us */
+ tdg_vm_rd(TDCS_TD_CTLS, &controls);
+ if (controls & TD_CTLS_PENDING_VE_DISABLE)
+ return;
+
+ /* Keep #VEs enabled for splats in debugging environments */
+ if (debug)
+ return;
+
+ /* Disable SEPT #VEs */
+ tdg_vm_wr(TDCS_TD_CTLS, TD_CTLS_PENDING_VE_DISABLE,
+ TD_CTLS_PENDING_VE_DISABLE);
+}
+
+/*
+ * TDX 1.0 generates a #VE when accessing topology-related CPUID leafs (0xB and
+ * 0x1F) and the X2APIC_APICID MSR. The kernel returns all zeros on CPUID #VEs.
+ * In practice, this means that the kernel can only boot with a plain topology.
+ * Any complications will cause problems.
+ *
+ * The ENUM_TOPOLOGY feature allows the VMM to provide topology information.
+ * Enabling the feature eliminates topology-related #VEs: the TDX module
+ * virtualizes accesses to the CPUID leafs and the MSR.
+ *
+ * Enable ENUM_TOPOLOGY if it is available.
+ */
+static void enable_cpu_topology_enumeration(void)
+{
+ u64 configured;
+
+ /* Has the VMM provided a valid topology configuration? */
+ tdg_vm_rd(TDCS_TOPOLOGY_ENUM_CONFIGURED, &configured);
+ if (!configured) {
+ pr_err("VMM did not configure X2APIC_IDs properly\n");
+ return;
+ }
+
+ tdg_vm_wr(TDCS_TD_CTLS, TD_CTLS_ENUM_TOPOLOGY, TD_CTLS_ENUM_TOPOLOGY);
+}
+
+static void reduce_unnecessary_ve(void)
+{
+ u64 err = tdg_vm_wr(TDCS_TD_CTLS, TD_CTLS_REDUCE_VE, TD_CTLS_REDUCE_VE);
+
+ if (err == TDX_SUCCESS)
+ return;
+
+ /*
+ * Enabling REDUCE_VE includes ENUM_TOPOLOGY. Only try to
+ * enable ENUM_TOPOLOGY if REDUCE_VE was not successful.
+ */
+ enable_cpu_topology_enumeration();
+}
+
+static void tdx_setup(u64 *cc_mask)
+{
+ struct tdx_module_args args = {};
+ unsigned int gpa_width;
+ u64 td_attr;
+
+ /*
+ * TDINFO TDX module call is used to get the TD execution environment
+ * information like GPA width, number of available vcpus, debug mode
+ * information, etc. More details about the ABI can be found in TDX
+ * Guest-Host-Communication Interface (GHCI), section 2.4.2 TDCALL
+ * [TDG.VP.INFO].
+ */
+ tdcall(TDG_VP_INFO, &args);
+
+ /*
+ * The highest bit of a guest physical address is the "sharing" bit.
+ * Set it for shared pages and clear it for private pages.
+ *
+ * The GPA width that comes out of this call is critical. TDX guests
+ * can not meaningfully run without it.
+ */
+ gpa_width = args.rcx & GENMASK(5, 0);
+ *cc_mask = BIT_ULL(gpa_width - 1);
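+	/* E.g. a 52-bit GPA width puts the shared bit at bit 51, i.e. cc_mask = BIT_ULL(51). */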
+
+ td_attr = args.rdx;
+
+ /* Kernel does not use NOTIFY_ENABLES and does not need random #VEs */
+ tdg_vm_wr(TDCS_NOTIFY_ENABLES, 0, -1ULL);
+
+ disable_sept_ve(td_attr);
+
+ reduce_unnecessary_ve();
+}
+
+/*
+ * The TDX module spec states that #VE may be injected for a limited set of
+ * reasons:
+ *
+ * - Emulation of the architectural #VE injection on EPT violation;
+ *
+ * - As a result of guest TD execution of a disallowed instruction,
+ * a disallowed MSR access, or CPUID virtualization;
+ *
+ * - A notification to the guest TD about anomalous behavior;
+ *
+ * The last one is opt-in and is not used by the kernel.
+ *
+ * The Intel Software Developer's Manual describes cases when instruction
+ * length field can be used in section "Information for VM Exits Due to
+ * Instruction Execution".
+ *
+ * For TDX, it ultimately means GET_VEINFO provides reliable instruction length
+ * information if #VE occurred due to instruction execution, but not for EPT
+ * violations.
+ */
+static int ve_instr_len(struct ve_info *ve)
+{
+ switch (ve->exit_reason) {
+ case EXIT_REASON_HLT:
+ case EXIT_REASON_MSR_READ:
+ case EXIT_REASON_MSR_WRITE:
+ case EXIT_REASON_CPUID:
+ case EXIT_REASON_IO_INSTRUCTION:
+		/* It is safe to use ve->instr_len for #VEs due to instruction execution */
+ return ve->instr_len;
+ case EXIT_REASON_EPT_VIOLATION:
+ /*
+		 * For EPT violations, ve->instr_len is not defined. For those,
+ * the kernel must decode instructions manually and should not
+ * be using this function.
+ */
+ WARN_ONCE(1, "ve->instr_len is not defined for EPT violations");
+ return 0;
+ default:
+ WARN_ONCE(1, "Unexpected #VE-type: %lld\n", ve->exit_reason);
+ return ve->instr_len;
+ }
+}
+
+static u64 __cpuidle __halt(const bool irq_disabled)
+{
+ struct tdx_module_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = hcall_func(EXIT_REASON_HLT),
+ .r12 = irq_disabled,
+ };
+
+ /*
+ * Emulate HLT operation via hypercall. More info about ABI
+ * can be found in TDX Guest-Host-Communication Interface
+ * (GHCI), section 3.8 TDG.VP.VMCALL<Instruction.HLT>.
+ *
+ * The VMM uses the "IRQ disabled" param to understand IRQ
+ * enabled status (RFLAGS.IF) of the TD guest and to determine
+ * whether or not it should schedule the halted vCPU if an
+ * IRQ becomes pending. E.g. if IRQs are disabled, the VMM
+ * can keep the vCPU in virtual HLT, even if an IRQ is
+ * pending, without hanging/breaking the guest.
+ */
+ return __tdx_hypercall(&args);
+}
+
+static int handle_halt(struct ve_info *ve)
+{
+ const bool irq_disabled = irqs_disabled();
+
+ /*
+ * HLT with IRQs enabled is unsafe, as an IRQ that is intended to be a
+ * wake event may be consumed before requesting HLT emulation, leaving
+ * the vCPU blocking indefinitely.
+ */
+ if (WARN_ONCE(!irq_disabled, "HLT emulation with IRQs enabled"))
+ return -EIO;
+
+ if (__halt(irq_disabled))
+ return -EIO;
+
+ return ve_instr_len(ve);
+}
+
+void __cpuidle tdx_halt(void)
+{
+ const bool irq_disabled = false;
+
+ /*
+ * Use WARN_ONCE() to report the failure.
+ */
+ if (__halt(irq_disabled))
+ WARN_ONCE(1, "HLT instruction emulation failed\n");
+}
+
+static void __cpuidle tdx_safe_halt(void)
+{
+ tdx_halt();
+ /*
+ * "__cpuidle" section doesn't support instrumentation, so stick
+ * with raw_* variant that avoids tracing hooks.
+ */
+ raw_local_irq_enable();
+}
+
+static int read_msr(struct pt_regs *regs, struct ve_info *ve)
+{
+ struct tdx_module_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = hcall_func(EXIT_REASON_MSR_READ),
+ .r12 = regs->cx,
+ };
+
+ /*
+ * Emulate the MSR read via hypercall. More info about ABI
+ * can be found in TDX Guest-Host-Communication Interface
+ * (GHCI), section titled "TDG.VP.VMCALL<Instruction.RDMSR>".
+ */
+ if (__tdx_hypercall(&args))
+ return -EIO;
+
+ regs->ax = lower_32_bits(args.r11);
+ regs->dx = upper_32_bits(args.r11);
+ return ve_instr_len(ve);
+}
+
+static int write_msr(struct pt_regs *regs, struct ve_info *ve)
+{
+ struct tdx_module_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = hcall_func(EXIT_REASON_MSR_WRITE),
+ .r12 = regs->cx,
+ .r13 = (u64)regs->dx << 32 | regs->ax,
+ };
+
+ /*
+ * Emulate the MSR write via hypercall. More info about ABI
+ * can be found in TDX Guest-Host-Communication Interface
+ * (GHCI) section titled "TDG.VP.VMCALL<Instruction.WRMSR>".
+ */
+ if (__tdx_hypercall(&args))
+ return -EIO;
+
+ return ve_instr_len(ve);
+}
+
+static int handle_cpuid(struct pt_regs *regs, struct ve_info *ve)
+{
+ struct tdx_module_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = hcall_func(EXIT_REASON_CPUID),
+ .r12 = regs->ax,
+ .r13 = regs->cx,
+ };
+
+ /*
+ * Only allow VMM to control range reserved for hypervisor
+ * communication.
+ *
+ * Return all-zeros for any CPUID outside the range. It matches CPU
+ * behaviour for non-supported leaf.
+ */
+ if (regs->ax < 0x40000000 || regs->ax > 0x4FFFFFFF) {
+ regs->ax = regs->bx = regs->cx = regs->dx = 0;
+ return ve_instr_len(ve);
+ }
+
+ /*
+ * Emulate the CPUID instruction via a hypercall. More info about
+ * ABI can be found in TDX Guest-Host-Communication Interface
+ * (GHCI), section titled "VP.VMCALL<Instruction.CPUID>".
+ */
+ if (__tdx_hypercall(&args))
+ return -EIO;
+
+ /*
+ * As per TDX GHCI CPUID ABI, r12-r15 registers contain contents of
+ * EAX, EBX, ECX, EDX registers after the CPUID instruction execution.
+ * So copy the register contents back to pt_regs.
+ */
+ regs->ax = args.r12;
+ regs->bx = args.r13;
+ regs->cx = args.r14;
+ regs->dx = args.r15;
+
+ return ve_instr_len(ve);
+}
+
+static bool mmio_read(int size, unsigned long addr, unsigned long *val)
+{
+ struct tdx_module_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = hcall_func(EXIT_REASON_EPT_VIOLATION),
+ .r12 = size,
+ .r13 = EPT_READ,
+ .r14 = addr,
+ };
+
+ if (__tdx_hypercall(&args))
+ return false;
+
+ *val = args.r11;
+ return true;
+}
+
+static bool mmio_write(int size, unsigned long addr, unsigned long val)
+{
+ return !_tdx_hypercall(hcall_func(EXIT_REASON_EPT_VIOLATION), size,
+ EPT_WRITE, addr, val);
+}
+
+static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
+{
+ unsigned long *reg, val, vaddr;
+ char buffer[MAX_INSN_SIZE];
+ enum insn_mmio_type mmio;
+ struct insn insn = {};
+ int size, extend_size;
+ u8 extend_val = 0;
+
+ /* Only in-kernel MMIO is supported */
+ if (WARN_ON_ONCE(user_mode(regs)))
+ return -EFAULT;
+
+ if (copy_from_kernel_nofault(buffer, (void *)regs->ip, MAX_INSN_SIZE))
+ return -EFAULT;
+
+ if (insn_decode(&insn, buffer, MAX_INSN_SIZE, INSN_MODE_64))
+ return -EINVAL;
+
+ mmio = insn_decode_mmio(&insn, &size);
+ if (WARN_ON_ONCE(mmio == INSN_MMIO_DECODE_FAILED))
+ return -EINVAL;
+
+ if (mmio != INSN_MMIO_WRITE_IMM && mmio != INSN_MMIO_MOVS) {
+ reg = insn_get_modrm_reg_ptr(&insn, regs);
+ if (!reg)
+ return -EINVAL;
+ }
+
+ if (!fault_in_kernel_space(ve->gla)) {
+ WARN_ONCE(1, "Access to userspace address is not supported");
+ return -EINVAL;
+ }
+
+ /*
+ * Reject EPT violation #VEs that split pages.
+ *
+ * MMIO accesses are supposed to be naturally aligned and therefore
+ * never cross page boundaries. Seeing split page accesses indicates
+ * a bug or a load_unaligned_zeropad() that stepped into an MMIO page.
+ *
+ * load_unaligned_zeropad() will recover using exception fixups.
+ */
+ vaddr = (unsigned long)insn_get_addr_ref(&insn, regs);
+ if (vaddr / PAGE_SIZE != (vaddr + size - 1) / PAGE_SIZE)
+ return -EFAULT;
+
+ /* Handle writes first */
+ switch (mmio) {
+ case INSN_MMIO_WRITE:
+ memcpy(&val, reg, size);
+ if (!mmio_write(size, ve->gpa, val))
+ return -EIO;
+ return insn.length;
+ case INSN_MMIO_WRITE_IMM:
+ val = insn.immediate.value;
+ if (!mmio_write(size, ve->gpa, val))
+ return -EIO;
+ return insn.length;
+ case INSN_MMIO_READ:
+ case INSN_MMIO_READ_ZERO_EXTEND:
+ case INSN_MMIO_READ_SIGN_EXTEND:
+ /* Reads are handled below */
+ break;
+ case INSN_MMIO_MOVS:
+ case INSN_MMIO_DECODE_FAILED:
+ /*
+ * MMIO was accessed with an instruction that could not be
+ * decoded or handled properly. It was likely not using io.h
+ * helpers or accessed MMIO accidentally.
+ */
+ return -EINVAL;
+ default:
+ WARN_ONCE(1, "Unknown insn_decode_mmio() decode value?");
+ return -EINVAL;
+ }
+
+ /* Handle reads */
+ if (!mmio_read(size, ve->gpa, &val))
+ return -EIO;
+
+ switch (mmio) {
+ case INSN_MMIO_READ:
+ /* Zero-extend for 32-bit operation */
+ extend_size = size == 4 ? sizeof(*reg) : 0;
+ break;
+ case INSN_MMIO_READ_ZERO_EXTEND:
+ /* Zero extend based on operand size */
+ extend_size = insn.opnd_bytes;
+ break;
+ case INSN_MMIO_READ_SIGN_EXTEND:
+ /* Sign extend based on operand size */
+ extend_size = insn.opnd_bytes;
+ if (size == 1 && val & BIT(7))
+ extend_val = 0xFF;
+ else if (size > 1 && val & BIT(15))
+ extend_val = 0xFF;
+ break;
+ default:
+		/* All other cases have to be covered by the first switch() */
+ WARN_ON_ONCE(1);
+ return -EINVAL;
+ }
+
+ if (extend_size)
+ memset(reg, extend_val, extend_size);
+ memcpy(reg, &val, size);
+ return insn.length;
+}
+
+static bool handle_in(struct pt_regs *regs, int size, int port)
+{
+ struct tdx_module_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = hcall_func(EXIT_REASON_IO_INSTRUCTION),
+ .r12 = size,
+ .r13 = PORT_READ,
+ .r14 = port,
+ };
+ u64 mask = GENMASK(BITS_PER_BYTE * size, 0);
+ bool success;
+
+ /*
+ * Emulate the I/O read via hypercall. More info about ABI can be found
+ * in TDX Guest-Host-Communication Interface (GHCI) section titled
+ * "TDG.VP.VMCALL<Instruction.IO>".
+ */
+ success = !__tdx_hypercall(&args);
+
+ /* Update part of the register affected by the emulated instruction */
+ regs->ax &= ~mask;
+ if (success)
+ regs->ax |= args.r11 & mask;
+
+ return success;
+}
+
+static bool handle_out(struct pt_regs *regs, int size, int port)
+{
+ u64 mask = GENMASK(BITS_PER_BYTE * size, 0);
+
+ /*
+ * Emulate the I/O write via hypercall. More info about ABI can be found
+ * in TDX Guest-Host-Communication Interface (GHCI) section titled
+ * "TDG.VP.VMCALL<Instruction.IO>".
+ */
+ return !_tdx_hypercall(hcall_func(EXIT_REASON_IO_INSTRUCTION), size,
+ PORT_WRITE, port, regs->ax & mask);
+}
+
+/*
+ * Emulate I/O using hypercall.
+ *
+ * Assumes the IO instruction was using ax, which is enforced
+ * by the standard io.h macros.
+ *
+ * Return the instruction length on success, or -errno on failure.
+ */
+static int handle_io(struct pt_regs *regs, struct ve_info *ve)
+{
+ u32 exit_qual = ve->exit_qual;
+ int size, port;
+ bool in, ret;
+
+ if (VE_IS_IO_STRING(exit_qual))
+ return -EIO;
+
+ in = VE_IS_IO_IN(exit_qual);
+ size = VE_GET_IO_SIZE(exit_qual);
+ port = VE_GET_PORT_NUM(exit_qual);
+
+ if (in)
+ ret = handle_in(regs, size, port);
+ else
+ ret = handle_out(regs, size, port);
+ if (!ret)
+ return -EIO;
+
+ return ve_instr_len(ve);
+}
+
+/*
+ * Early #VE exception handler. Only handles a subset of port I/O.
+ * Intended only for earlyprintk. If handling fails, return false.
+ */
+__init bool tdx_early_handle_ve(struct pt_regs *regs)
+{
+ struct ve_info ve;
+ int insn_len;
+
+ tdx_get_ve_info(&ve);
+
+ if (ve.exit_reason != EXIT_REASON_IO_INSTRUCTION)
+ return false;
+
+ insn_len = handle_io(regs, &ve);
+ if (insn_len < 0)
+ return false;
+
+ regs->ip += insn_len;
+ return true;
+}
+
+void tdx_get_ve_info(struct ve_info *ve)
+{
+ struct tdx_module_args args = {};
+
+ /*
+ * Called during #VE handling to retrieve the #VE info from the
+ * TDX module.
+ *
+ * This has to be called early in #VE handling. A "nested" #VE which
+ * occurs before this will raise a #DF and is not recoverable.
+ *
+ * The call retrieves the #VE info from the TDX module, which also
+ * clears the "#VE valid" flag. This must be done before anything else
+ * because any #VE that occurs while the valid flag is set will lead to
+ * #DF.
+ *
+ * Note, the TDX module treats virtual NMIs as inhibited if the #VE
+ * valid flag is set. It means that NMI=>#VE will not result in a #DF.
+ */
+ tdcall(TDG_VP_VEINFO_GET, &args);
+
+ /* Transfer the output parameters */
+ ve->exit_reason = args.rcx;
+ ve->exit_qual = args.rdx;
+ ve->gla = args.r8;
+ ve->gpa = args.r9;
+ ve->instr_len = lower_32_bits(args.r10);
+ ve->instr_info = upper_32_bits(args.r10);
+}
+
+/*
+ * Handle the user initiated #VE.
+ *
+ * On success, returns the number of bytes RIP should be incremented (>=0)
+ * or -errno on error.
+ */
+static int virt_exception_user(struct pt_regs *regs, struct ve_info *ve)
+{
+ switch (ve->exit_reason) {
+ case EXIT_REASON_CPUID:
+ return handle_cpuid(regs, ve);
+ default:
+ pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
+ return -EIO;
+ }
+}
+
+static inline bool is_private_gpa(u64 gpa)
+{
+ return gpa == cc_mkenc(gpa);
+}
+
+/*
+ * Handle the kernel #VE.
+ *
+ * On success, returns the number of bytes RIP should be incremented (>=0)
+ * or -errno on error.
+ */
+static int virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve)
+{
+ switch (ve->exit_reason) {
+ case EXIT_REASON_HLT:
+ return handle_halt(ve);
+ case EXIT_REASON_MSR_READ:
+ return read_msr(regs, ve);
+ case EXIT_REASON_MSR_WRITE:
+ return write_msr(regs, ve);
+ case EXIT_REASON_CPUID:
+ return handle_cpuid(regs, ve);
+ case EXIT_REASON_EPT_VIOLATION:
+ if (is_private_gpa(ve->gpa))
+ panic("Unexpected EPT-violation on private memory.");
+ return handle_mmio(regs, ve);
+ case EXIT_REASON_IO_INSTRUCTION:
+ return handle_io(regs, ve);
+ default:
+ pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
+ return -EIO;
+ }
+}
+
+bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve)
+{
+ int insn_len;
+
+ if (user_mode(regs))
+ insn_len = virt_exception_user(regs, ve);
+ else
+ insn_len = virt_exception_kernel(regs, ve);
+ if (insn_len < 0)
+ return false;
+
+ /* After successful #VE handling, move the IP */
+ regs->ip += insn_len;
+
+ return true;
+}
+
+static bool tdx_tlb_flush_required(bool private)
+{
+ /*
+ * TDX guest is responsible for flushing TLB on private->shared
+ * transition. VMM is responsible for flushing on shared->private.
+ *
+ * The VMM _can't_ flush private addresses as it can't generate PAs
+ * with the guest's HKID. Shared memory isn't subject to integrity
+ * checking, i.e. the VMM doesn't need to flush for its own protection.
+ *
+ * There's no need to flush when converting from shared to private,
+ * as flushing is the VMM's responsibility in this case, e.g. it must
+ * flush to avoid integrity failures in the face of a buggy or
+ * malicious guest.
+ */
+ return !private;
+}
+
+static bool tdx_cache_flush_required(void)
+{
+ /*
+ * AMD SME/SEV can avoid cache flushing if HW enforces cache coherence.
+ * TDX doesn't have such capability.
+ *
+ * Flush cache unconditionally.
+ */
+ return true;
+}
+
+/*
+ * Notify the VMM about page mapping conversion. More info about ABI
+ * can be found in TDX Guest-Host-Communication Interface (GHCI),
+ * section "TDG.VP.VMCALL<MapGPA>".
+ */
+static bool tdx_map_gpa(phys_addr_t start, phys_addr_t end, bool enc)
+{
+ /* Retrying the hypercall a second time should succeed; use 3 just in case */
+ const int max_retries_per_page = 3;
+ int retry_count = 0;
+
+ if (!enc) {
+ /* Set the shared (decrypted) bits: */
+ start |= cc_mkdec(0);
+ end |= cc_mkdec(0);
+ }
+
+ while (retry_count < max_retries_per_page) {
+ struct tdx_module_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = TDVMCALL_MAP_GPA,
+ .r12 = start,
+ .r13 = end - start };
+
+ u64 map_fail_paddr;
+ u64 ret = __tdx_hypercall(&args);
+
+ if (ret != TDVMCALL_STATUS_RETRY)
+ return !ret;
+ /*
+ * The guest must retry the operation for the pages in the
+ * region starting at the GPA specified in R11. R11 comes
+ * from the untrusted VMM. Sanity check it.
+ */
+ map_fail_paddr = args.r11;
+ if (map_fail_paddr < start || map_fail_paddr >= end)
+ return false;
+
+ /* "Consume" a retry without forward progress */
+ if (map_fail_paddr == start) {
+ retry_count++;
+ continue;
+ }
+
+ start = map_fail_paddr;
+ retry_count = 0;
+ }
+
+ return false;
+}
+
+/*
+ * Inform the VMM of the guest's intent for this physical page: shared with
+ * the VMM or private to the guest. The VMM is expected to change its mapping
+ * of the page in response.
+ */
+static bool tdx_enc_status_changed(unsigned long vaddr, int numpages, bool enc)
+{
+ phys_addr_t start = __pa(vaddr);
+ phys_addr_t end = __pa(vaddr + numpages * PAGE_SIZE);
+
+ if (!tdx_map_gpa(start, end, enc))
+ return false;
+
+ /* shared->private conversion requires memory to be accepted before use */
+ if (enc)
+ return tdx_accept_memory(start, end);
+
+ return true;
+}
+
+static int tdx_enc_status_change_prepare(unsigned long vaddr, int numpages,
+ bool enc)
+{
+ /*
+ * Only handle shared->private conversion here.
+ * See the comment in tdx_early_init().
+ */
+ if (enc && !tdx_enc_status_changed(vaddr, numpages, enc))
+ return -EIO;
+
+ return 0;
+}
+
+static int tdx_enc_status_change_finish(unsigned long vaddr, int numpages,
+ bool enc)
+{
+ /*
+ * Only handle private->shared conversion here.
+ * See the comment in tdx_early_init().
+ */
+ if (!enc && !tdx_enc_status_changed(vaddr, numpages, enc))
+ return -EIO;
+
+ if (enc)
+ atomic_long_sub(numpages, &nr_shared);
+ else
+ atomic_long_add(numpages, &nr_shared);
+
+ return 0;
+}
+
+/* Stop new private<->shared conversions */
+static void tdx_kexec_begin(void)
+{
+ if (!IS_ENABLED(CONFIG_KEXEC_CORE))
+ return;
+
+ /*
+ * Crash kernel reaches here with interrupts disabled: can't wait for
+ * conversions to finish.
+ *
+	 * If a race happened, just report it and proceed.
+ */
+ if (!set_memory_enc_stop_conversion())
+ pr_warn("Failed to stop shared<->private conversions\n");
+}
+
+/* Walk direct mapping and convert all shared memory back to private */
+static void tdx_kexec_finish(void)
+{
+ unsigned long addr, end;
+ long found = 0, shared;
+
+ if (!IS_ENABLED(CONFIG_KEXEC_CORE))
+ return;
+
+ lockdep_assert_irqs_disabled();
+
+ addr = PAGE_OFFSET;
+ end = PAGE_OFFSET + get_max_mapped();
+
+ while (addr < end) {
+ unsigned long size;
+ unsigned int level;
+ pte_t *pte;
+
+ pte = lookup_address(addr, &level);
+ size = page_level_size(level);
+
+ if (pte && pte_decrypted(*pte)) {
+ int pages = size / PAGE_SIZE;
+
+ /*
+ * Touching memory with shared bit set triggers implicit
+ * conversion to shared.
+ *
+ * Make sure nobody touches the shared range from
+ * now on.
+ */
+ set_pte(pte, __pte(0));
+
+ /*
+ * Memory encryption state persists across kexec.
+ * If tdx_enc_status_changed() fails in the first
+ * kernel, it leaves memory in an unknown state.
+ *
+ * If that memory remains shared, accessing it in the
+ * *next* kernel through a private mapping will result
+ * in an unrecoverable guest shutdown.
+ *
+ * The kdump kernel boot is not impacted as it uses
+ * a pre-reserved memory range that is always private.
+ * However, gathering crash information could lead to
+ * a crash if it accesses unconverted memory through
+ * a private mapping which is possible when accessing
+ * that memory through /proc/vmcore, for example.
+ *
+ * In all cases, print error info in order to leave
+ * enough bread crumbs for debugging.
+ */
+ if (!tdx_enc_status_changed(addr, pages, true)) {
+ pr_err("Failed to unshare range %#lx-%#lx\n",
+ addr, addr + size);
+ }
+
+ found += pages;
+ }
+
+ addr += size;
+ }
+
+ __flush_tlb_all();
+
+ shared = atomic_long_read(&nr_shared);
+ if (shared != found) {
+ pr_err("shared page accounting is off\n");
+ pr_err("nr_shared = %ld, nr_found = %ld\n", shared, found);
+ }
+}
+
+static __init void tdx_announce(void)
+{
+ struct tdx_module_args args = {};
+ u64 controls;
+
+ pr_info("Guest detected\n");
+
+ tdcall(TDG_VP_INFO, &args);
+ tdx_dump_attributes(args.rdx);
+
+ tdg_vm_rd(TDCS_TD_CTLS, &controls);
+ tdx_dump_td_ctls(controls);
+}
+
+void __init tdx_early_init(void)
+{
+ u64 cc_mask;
+ u32 eax, sig[3];
+
+ cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax, &sig[0], &sig[2], &sig[1]);
+
+ if (memcmp(TDX_IDENT, sig, sizeof(sig)))
+ return;
+
+ setup_force_cpu_cap(X86_FEATURE_TDX_GUEST);
+
+ /* TSC is the only reliable clock in TDX guest */
+ setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+
+ cc_vendor = CC_VENDOR_INTEL;
+
+ /* Configure the TD */
+ tdx_setup(&cc_mask);
+
+ cc_set_mask(cc_mask);
+
+ /*
+	 * All bits above the GPA width are reserved and the kernel treats the
+	 * shared bit as a flag, not as part of the physical address.
+ *
+ * Adjust physical mask to only cover valid GPA bits.
+ */
+ physical_mask &= cc_mask - 1;
+
+ /*
+ * The kernel mapping should match the TDX metadata for the page.
+ * load_unaligned_zeropad() can touch memory *adjacent* to that which is
+ * owned by the caller and can catch even _momentary_ mismatches. Bad
+ * things happen on mismatch:
+ *
+ * - Private mapping => Shared Page == Guest shutdown
+ * - Shared mapping => Private Page == Recoverable #VE
+ *
+ * guest.enc_status_change_prepare() converts the page from
+ * shared=>private before the mapping becomes private.
+ *
+ * guest.enc_status_change_finish() converts the page from
+ * private=>shared after the mapping becomes shared.
+ *
+ * In both cases there is a temporary shared mapping to a private page,
+ * which can result in a #VE. But, there is never a private mapping to
+ * a shared page.
+ */
+ x86_platform.guest.enc_status_change_prepare = tdx_enc_status_change_prepare;
+ x86_platform.guest.enc_status_change_finish = tdx_enc_status_change_finish;
+
+ x86_platform.guest.enc_cache_flush_required = tdx_cache_flush_required;
+ x86_platform.guest.enc_tlb_flush_required = tdx_tlb_flush_required;
+
+ x86_platform.guest.enc_kexec_begin = tdx_kexec_begin;
+ x86_platform.guest.enc_kexec_finish = tdx_kexec_finish;
+
+ /*
+ * Avoid "sti;hlt" execution in TDX guests: HLT induces a #VE, and if
+ * HLT is executed in the STI shadow the #VE handler enables interrupts
+ * before the HLT TDCALL is invoked, possibly resulting in missed
+ * wakeup events.
+ *
+ * Modify all possible HLT execution paths to use TDX-specific routines
+ * that execute the TDCALL directly and toggle the interrupt state as
+ * needed after TDCALL completion. This also reduces HLT-related #VEs,
+ * in addition to providing reliable halt execution.
+ */
+ pv_ops.irq.safe_halt = tdx_safe_halt;
+ pv_ops.irq.halt = tdx_halt;
+
+ /*
+ * TDX intercepts the RDMSR that reads the X2APIC ID in the parallel
+ * bringup low-level code. That raises a #VE which cannot be handled
+ * there.
+ *
+ * Intel-TDX has a secure RDMSR hypercall, but that needs to be
+ * implemented separately in the low level startup ASM code.
+ * Until that is in place, disable parallel bringup for TDX.
+ */
+ x86_cpuinit.parallel_bringup = false;
+
+ tdx_announce();
+}
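
The physical_mask adjustment in tdx_early_init() above is easiest to see with concrete numbers. Below is a minimal, self-contained sketch (not part of the patch) of what "physical_mask &= cc_mask - 1" does for a hypothetical guest whose shared bit sits at GPA bit 47; the bit positions are illustrative only.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Hypothetical shared-bit position (GPA bit 47), for illustration only */
	uint64_t cc_mask = 1ULL << 47;
	/* Default 52-bit physical address mask: bits 0..51 set */
	uint64_t physical_mask = (1ULL << 52) - 1;

	/*
	 * Keep only the bits below the shared bit, as tdx_early_init() does:
	 * the shared bit and everything above it are not address bits.
	 */
	physical_mask &= cc_mask - 1;

	printf("cc_mask       = %#018llx\n", (unsigned long long)cc_mask);
	printf("physical_mask = %#018llx\n", (unsigned long long)physical_mask);
	return 0;
}

With these example values the resulting mask covers bits 0..46 only, i.e. the shared bit is treated purely as a flag rather than as part of the address.
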
diff --git a/arch/x86/configs/hardening.config b/arch/x86/configs/hardening.config
new file mode 100644
index 000000000000..de319852a1e9
--- /dev/null
+++ b/arch/x86/configs/hardening.config
@@ -0,0 +1,17 @@
+# Basic kernel hardening options (specific to x86)
+
+# Modern libc no longer needs a fixed-position mapping in userspace; remove
+# it as a possible target.
+CONFIG_LEGACY_VSYSCALL_NONE=y
+
+# Enable chip-specific IOMMU support.
+CONFIG_INTEL_IOMMU=y
+CONFIG_INTEL_IOMMU_DEFAULT_ON=y
+CONFIG_INTEL_IOMMU_SVM=y
+CONFIG_AMD_IOMMU=y
+
+# Enforce CET Indirect Branch Tracking in the kernel.
+CONFIG_X86_KERNEL_IBT=y
+
+# Enable CET Shadow Stack for userspace.
+CONFIG_X86_USER_SHADOW_STACK=y
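
As a usage note: config fragments like the one above are normally merged on top of an existing configuration rather than used standalone, for example via something like "make defconfig hardening.config" or scripts/kconfig/merge_config.sh; the exact invocation is an assumption here and depends on the tree and architecture being built.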
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index edb992ebef92..79fa38ca954d 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -1,524 +1,69 @@
-#
-# Automatically generated make config: don't edit
-# Linux kernel version: 2.6.30-rc2
-# Mon May 11 16:21:55 2009
-#
-# CONFIG_64BIT is not set
-CONFIG_X86_32=y
-# CONFIG_X86_64 is not set
-CONFIG_X86=y
-CONFIG_OUTPUT_FORMAT="elf32-i386"
-CONFIG_ARCH_DEFCONFIG="arch/x86/configs/i386_defconfig"
-CONFIG_GENERIC_TIME=y
-CONFIG_GENERIC_CMOS_UPDATE=y
-CONFIG_CLOCKSOURCE_WATCHDOG=y
-CONFIG_GENERIC_CLOCKEVENTS=y
-CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y
-CONFIG_LOCKDEP_SUPPORT=y
-CONFIG_STACKTRACE_SUPPORT=y
-CONFIG_HAVE_LATENCYTOP_SUPPORT=y
-CONFIG_FAST_CMPXCHG_LOCAL=y
-CONFIG_MMU=y
-CONFIG_ZONE_DMA=y
-CONFIG_GENERIC_ISA_DMA=y
-CONFIG_GENERIC_IOMAP=y
-CONFIG_GENERIC_BUG=y
-CONFIG_GENERIC_HWEIGHT=y
-CONFIG_ARCH_MAY_HAVE_PC_FDC=y
-# CONFIG_RWSEM_GENERIC_SPINLOCK is not set
-CONFIG_RWSEM_XCHGADD_ALGORITHM=y
-CONFIG_ARCH_HAS_CPU_IDLE_WAIT=y
-CONFIG_GENERIC_CALIBRATE_DELAY=y
-# CONFIG_GENERIC_TIME_VSYSCALL is not set
-CONFIG_ARCH_HAS_CPU_RELAX=y
-CONFIG_ARCH_HAS_DEFAULT_IDLE=y
-CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y
-CONFIG_HAVE_SETUP_PER_CPU_AREA=y
-CONFIG_HAVE_DYNAMIC_PER_CPU_AREA=y
-# CONFIG_HAVE_CPUMASK_OF_CPU_MAP is not set
-CONFIG_ARCH_HIBERNATION_POSSIBLE=y
-CONFIG_ARCH_SUSPEND_POSSIBLE=y
-# CONFIG_ZONE_DMA32 is not set
-CONFIG_ARCH_POPULATES_NODE_MAP=y
-# CONFIG_AUDIT_ARCH is not set
-CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y
-CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y
-CONFIG_GENERIC_HARDIRQS=y
-CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y
-CONFIG_GENERIC_IRQ_PROBE=y
-CONFIG_GENERIC_PENDING_IRQ=y
-CONFIG_USE_GENERIC_SMP_HELPERS=y
-CONFIG_X86_32_SMP=y
-CONFIG_X86_HT=y
-CONFIG_X86_TRAMPOLINE=y
-CONFIG_X86_32_LAZY_GS=y
-CONFIG_KTIME_SCALAR=y
-CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
-
-#
-# General setup
-#
-CONFIG_EXPERIMENTAL=y
-CONFIG_LOCK_KERNEL=y
-CONFIG_INIT_ENV_ARG_LIMIT=32
-CONFIG_LOCALVERSION=""
-# CONFIG_LOCALVERSION_AUTO is not set
-CONFIG_HAVE_KERNEL_GZIP=y
-CONFIG_HAVE_KERNEL_BZIP2=y
-CONFIG_HAVE_KERNEL_LZMA=y
-CONFIG_KERNEL_GZIP=y
-# CONFIG_KERNEL_BZIP2 is not set
-# CONFIG_KERNEL_LZMA is not set
-CONFIG_SWAP=y
+CONFIG_WERROR=y
CONFIG_SYSVIPC=y
-CONFIG_SYSVIPC_SYSCTL=y
CONFIG_POSIX_MQUEUE=y
-CONFIG_POSIX_MQUEUE_SYSCTL=y
+CONFIG_AUDIT=y
+CONFIG_NO_HZ=y
+CONFIG_HIGH_RES_TIMERS=y
+CONFIG_PREEMPT_VOLUNTARY=y
CONFIG_BSD_PROCESS_ACCT=y
-# CONFIG_BSD_PROCESS_ACCT_V3 is not set
CONFIG_TASKSTATS=y
CONFIG_TASK_DELAY_ACCT=y
CONFIG_TASK_XACCT=y
CONFIG_TASK_IO_ACCOUNTING=y
-CONFIG_AUDIT=y
-CONFIG_AUDITSYSCALL=y
-CONFIG_AUDIT_TREE=y
-
-#
-# RCU Subsystem
-#
-# CONFIG_CLASSIC_RCU is not set
-CONFIG_TREE_RCU=y
-# CONFIG_PREEMPT_RCU is not set
-# CONFIG_RCU_TRACE is not set
-CONFIG_RCU_FANOUT=32
-# CONFIG_RCU_FANOUT_EXACT is not set
-# CONFIG_TREE_RCU_TRACE is not set
-# CONFIG_PREEMPT_RCU_TRACE is not set
-# CONFIG_IKCONFIG is not set
CONFIG_LOG_BUF_SHIFT=18
-CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y
-CONFIG_GROUP_SCHED=y
-CONFIG_FAIR_GROUP_SCHED=y
-# CONFIG_RT_GROUP_SCHED is not set
-# CONFIG_USER_SCHED is not set
-CONFIG_CGROUP_SCHED=y
CONFIG_CGROUPS=y
-# CONFIG_CGROUP_DEBUG is not set
-CONFIG_CGROUP_NS=y
+CONFIG_BLK_CGROUP=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_CGROUP_PIDS=y
+CONFIG_CGROUP_RDMA=y
CONFIG_CGROUP_FREEZER=y
-# CONFIG_CGROUP_DEVICE is not set
+CONFIG_CGROUP_HUGETLB=y
CONFIG_CPUSETS=y
-CONFIG_PROC_PID_CPUSET=y
+CONFIG_CGROUP_DEVICE=y
CONFIG_CGROUP_CPUACCT=y
-CONFIG_RESOURCE_COUNTERS=y
-# CONFIG_CGROUP_MEM_RES_CTLR is not set
-# CONFIG_SYSFS_DEPRECATED_V2 is not set
-CONFIG_RELAY=y
-CONFIG_NAMESPACES=y
-CONFIG_UTS_NS=y
-CONFIG_IPC_NS=y
-CONFIG_USER_NS=y
-CONFIG_PID_NS=y
-CONFIG_NET_NS=y
+CONFIG_CGROUP_PERF=y
+CONFIG_CGROUP_MISC=y
+CONFIG_CGROUP_DEBUG=y
CONFIG_BLK_DEV_INITRD=y
-CONFIG_INITRAMFS_SOURCE=""
-CONFIG_RD_GZIP=y
-CONFIG_RD_BZIP2=y
-CONFIG_RD_LZMA=y
-CONFIG_CC_OPTIMIZE_FOR_SIZE=y
-CONFIG_SYSCTL=y
-CONFIG_ANON_INODES=y
-# CONFIG_EMBEDDED is not set
-CONFIG_UID16=y
-CONFIG_SYSCTL_SYSCALL=y
-CONFIG_KALLSYMS=y
CONFIG_KALLSYMS_ALL=y
-CONFIG_KALLSYMS_EXTRA_PASS=y
-# CONFIG_STRIP_ASM_SYMS is not set
-CONFIG_HOTPLUG=y
-CONFIG_PRINTK=y
-CONFIG_BUG=y
-CONFIG_ELF_CORE=y
-CONFIG_PCSPKR_PLATFORM=y
-CONFIG_BASE_FULL=y
-CONFIG_FUTEX=y
-CONFIG_EPOLL=y
-CONFIG_SIGNALFD=y
-CONFIG_TIMERFD=y
-CONFIG_EVENTFD=y
-CONFIG_SHMEM=y
-CONFIG_AIO=y
-CONFIG_VM_EVENT_COUNTERS=y
-CONFIG_PCI_QUIRKS=y
-CONFIG_SLUB_DEBUG=y
-# CONFIG_COMPAT_BRK is not set
-# CONFIG_SLAB is not set
-CONFIG_SLUB=y
-# CONFIG_SLOB is not set
CONFIG_PROFILING=y
-CONFIG_TRACEPOINTS=y
-CONFIG_MARKERS=y
-# CONFIG_OPROFILE is not set
-CONFIG_HAVE_OPROFILE=y
-CONFIG_KPROBES=y
-CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y
-CONFIG_KRETPROBES=y
-CONFIG_HAVE_IOREMAP_PROT=y
-CONFIG_HAVE_KPROBES=y
-CONFIG_HAVE_KRETPROBES=y
-CONFIG_HAVE_ARCH_TRACEHOOK=y
-CONFIG_HAVE_DMA_API_DEBUG=y
-# CONFIG_SLOW_WORK is not set
-CONFIG_HAVE_GENERIC_DMA_COHERENT=y
-CONFIG_SLABINFO=y
-CONFIG_RT_MUTEXES=y
-CONFIG_BASE_SMALL=0
-CONFIG_MODULES=y
-# CONFIG_MODULE_FORCE_LOAD is not set
-CONFIG_MODULE_UNLOAD=y
-CONFIG_MODULE_FORCE_UNLOAD=y
-# CONFIG_MODVERSIONS is not set
-# CONFIG_MODULE_SRCVERSION_ALL is not set
-CONFIG_STOP_MACHINE=y
-CONFIG_BLOCK=y
-# CONFIG_LBD is not set
-CONFIG_BLK_DEV_BSG=y
-# CONFIG_BLK_DEV_INTEGRITY is not set
-
-#
-# IO Schedulers
-#
-CONFIG_IOSCHED_NOOP=y
-CONFIG_IOSCHED_AS=y
-CONFIG_IOSCHED_DEADLINE=y
-CONFIG_IOSCHED_CFQ=y
-# CONFIG_DEFAULT_AS is not set
-# CONFIG_DEFAULT_DEADLINE is not set
-CONFIG_DEFAULT_CFQ=y
-# CONFIG_DEFAULT_NOOP is not set
-CONFIG_DEFAULT_IOSCHED="cfq"
-CONFIG_FREEZER=y
-
-#
-# Processor type and features
-#
-CONFIG_TICK_ONESHOT=y
-CONFIG_NO_HZ=y
-CONFIG_HIGH_RES_TIMERS=y
-CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
+CONFIG_KEXEC=y
+# Do not remove this as it results in non-bootable kernels
+# CONFIG_64BIT is not set
CONFIG_SMP=y
-CONFIG_SPARSE_IRQ=y
-CONFIG_X86_MPPARSE=y
-# CONFIG_X86_BIGSMP is not set
-CONFIG_X86_EXTENDED_PLATFORM=y
-# CONFIG_X86_ELAN is not set
-# CONFIG_X86_RDC321X is not set
-# CONFIG_X86_32_NON_STANDARD is not set
-CONFIG_SCHED_OMIT_FRAME_POINTER=y
-# CONFIG_PARAVIRT_GUEST is not set
-# CONFIG_MEMTEST is not set
-# CONFIG_M386 is not set
-# CONFIG_M486 is not set
-# CONFIG_M586 is not set
-# CONFIG_M586TSC is not set
-# CONFIG_M586MMX is not set
-CONFIG_M686=y
-# CONFIG_MPENTIUMII is not set
-# CONFIG_MPENTIUMIII is not set
-# CONFIG_MPENTIUMM is not set
-# CONFIG_MPENTIUM4 is not set
-# CONFIG_MK6 is not set
-# CONFIG_MK7 is not set
-# CONFIG_MK8 is not set
-# CONFIG_MCRUSOE is not set
-# CONFIG_MEFFICEON is not set
-# CONFIG_MWINCHIPC6 is not set
-# CONFIG_MWINCHIP3D is not set
-# CONFIG_MGEODEGX1 is not set
-# CONFIG_MGEODE_LX is not set
-# CONFIG_MCYRIXIII is not set
-# CONFIG_MVIAC3_2 is not set
-# CONFIG_MVIAC7 is not set
-# CONFIG_MPSC is not set
-# CONFIG_MCORE2 is not set
-# CONFIG_GENERIC_CPU is not set
-CONFIG_X86_GENERIC=y
-CONFIG_X86_CPU=y
-CONFIG_X86_L1_CACHE_BYTES=64
-CONFIG_X86_INTERNODE_CACHE_BYTES=64
-CONFIG_X86_CMPXCHG=y
-CONFIG_X86_L1_CACHE_SHIFT=5
-CONFIG_X86_XADD=y
-# CONFIG_X86_PPRO_FENCE is not set
-CONFIG_X86_WP_WORKS_OK=y
-CONFIG_X86_INVLPG=y
-CONFIG_X86_BSWAP=y
-CONFIG_X86_POPAD_OK=y
-CONFIG_X86_INTEL_USERCOPY=y
-CONFIG_X86_USE_PPRO_CHECKSUM=y
-CONFIG_X86_TSC=y
-CONFIG_X86_CMOV=y
-CONFIG_X86_MINIMUM_CPU_FAMILY=4
-CONFIG_X86_DEBUGCTLMSR=y
-CONFIG_CPU_SUP_INTEL=y
-CONFIG_CPU_SUP_CYRIX_32=y
-CONFIG_CPU_SUP_AMD=y
-CONFIG_CPU_SUP_CENTAUR=y
-CONFIG_CPU_SUP_TRANSMETA_32=y
-CONFIG_CPU_SUP_UMC_32=y
-CONFIG_X86_DS=y
-CONFIG_X86_PTRACE_BTS=y
-CONFIG_HPET_TIMER=y
-CONFIG_HPET_EMULATE_RTC=y
-CONFIG_DMI=y
-# CONFIG_IOMMU_HELPER is not set
-# CONFIG_IOMMU_API is not set
-CONFIG_NR_CPUS=64
-CONFIG_SCHED_SMT=y
-CONFIG_SCHED_MC=y
-# CONFIG_PREEMPT_NONE is not set
-CONFIG_PREEMPT_VOLUNTARY=y
-# CONFIG_PREEMPT is not set
-CONFIG_X86_LOCAL_APIC=y
-CONFIG_X86_IO_APIC=y
+CONFIG_HYPERVISOR_GUEST=y
+CONFIG_PARAVIRT=y
CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y
-CONFIG_X86_MCE=y
-CONFIG_X86_MCE_NONFATAL=y
-CONFIG_X86_MCE_P4THERMAL=y
-CONFIG_VM86=y
-# CONFIG_TOSHIBA is not set
-# CONFIG_I8K is not set
-CONFIG_X86_REBOOTFIXUPS=y
-CONFIG_MICROCODE=y
-CONFIG_MICROCODE_INTEL=y
-CONFIG_MICROCODE_AMD=y
-CONFIG_MICROCODE_OLD_INTERFACE=y
CONFIG_X86_MSR=y
CONFIG_X86_CPUID=y
-# CONFIG_X86_CPU_DEBUG is not set
-# CONFIG_NOHIGHMEM is not set
-CONFIG_HIGHMEM4G=y
-# CONFIG_HIGHMEM64G is not set
-CONFIG_PAGE_OFFSET=0xC0000000
-CONFIG_HIGHMEM=y
-# CONFIG_ARCH_PHYS_ADDR_T_64BIT is not set
-CONFIG_ARCH_FLATMEM_ENABLE=y
-CONFIG_ARCH_SPARSEMEM_ENABLE=y
-CONFIG_ARCH_SELECT_MEMORY_MODEL=y
-CONFIG_SELECT_MEMORY_MODEL=y
-CONFIG_FLATMEM_MANUAL=y
-# CONFIG_DISCONTIGMEM_MANUAL is not set
-# CONFIG_SPARSEMEM_MANUAL is not set
-CONFIG_FLATMEM=y
-CONFIG_FLAT_NODE_MEM_MAP=y
-CONFIG_SPARSEMEM_STATIC=y
-CONFIG_PAGEFLAGS_EXTENDED=y
-CONFIG_SPLIT_PTLOCK_CPUS=4
-# CONFIG_PHYS_ADDR_T_64BIT is not set
-CONFIG_ZONE_DMA_FLAG=1
-CONFIG_BOUNCE=y
-CONFIG_VIRT_TO_BUS=y
-CONFIG_UNEVICTABLE_LRU=y
-CONFIG_HAVE_MLOCK=y
-CONFIG_HAVE_MLOCKED_PAGE_BIT=y
-CONFIG_HIGHPTE=y
CONFIG_X86_CHECK_BIOS_CORRUPTION=y
-CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y
-CONFIG_X86_RESERVE_LOW_64K=y
-# CONFIG_MATH_EMULATION is not set
-CONFIG_MTRR=y
# CONFIG_MTRR_SANITIZER is not set
-CONFIG_X86_PAT=y
CONFIG_EFI=y
-CONFIG_SECCOMP=y
-# CONFIG_CC_STACKPROTECTOR is not set
-# CONFIG_HZ_100 is not set
-# CONFIG_HZ_250 is not set
-# CONFIG_HZ_300 is not set
+CONFIG_EFI_STUB=y
CONFIG_HZ_1000=y
-CONFIG_HZ=1000
-CONFIG_SCHED_HRTICK=y
-CONFIG_KEXEC=y
-CONFIG_CRASH_DUMP=y
-# CONFIG_KEXEC_JUMP is not set
-CONFIG_PHYSICAL_START=0x1000000
-CONFIG_RELOCATABLE=y
-CONFIG_X86_NEED_RELOCS=y
-CONFIG_PHYSICAL_ALIGN=0x1000000
-CONFIG_HOTPLUG_CPU=y
-# CONFIG_COMPAT_VDSO is not set
-# CONFIG_CMDLINE_BOOL is not set
-CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
-
-#
-# Power management and ACPI options
-#
-CONFIG_PM=y
+CONFIG_HIBERNATION=y
CONFIG_PM_DEBUG=y
-# CONFIG_PM_VERBOSE is not set
-CONFIG_CAN_PM_TRACE=y
-CONFIG_PM_TRACE=y
CONFIG_PM_TRACE_RTC=y
-CONFIG_PM_SLEEP_SMP=y
-CONFIG_PM_SLEEP=y
-CONFIG_SUSPEND=y
-# CONFIG_PM_TEST_SUSPEND is not set
-CONFIG_SUSPEND_FREEZER=y
-CONFIG_HIBERNATION=y
-CONFIG_PM_STD_PARTITION=""
-CONFIG_ACPI=y
-CONFIG_ACPI_SLEEP=y
-CONFIG_ACPI_PROCFS=y
-CONFIG_ACPI_PROCFS_POWER=y
-CONFIG_ACPI_SYSFS_POWER=y
-CONFIG_ACPI_PROC_EVENT=y
-CONFIG_ACPI_AC=y
-CONFIG_ACPI_BATTERY=y
-CONFIG_ACPI_BUTTON=y
-CONFIG_ACPI_FAN=y
CONFIG_ACPI_DOCK=y
-CONFIG_ACPI_PROCESSOR=y
-CONFIG_ACPI_HOTPLUG_CPU=y
-CONFIG_ACPI_THERMAL=y
-# CONFIG_ACPI_CUSTOM_DSDT is not set
-CONFIG_ACPI_BLACKLIST_YEAR=0
-# CONFIG_ACPI_DEBUG is not set
-# CONFIG_ACPI_PCI_SLOT is not set
-CONFIG_X86_PM_TIMER=y
-CONFIG_ACPI_CONTAINER=y
-# CONFIG_ACPI_SBS is not set
-# CONFIG_APM is not set
-
-#
-# CPU Frequency scaling
-#
-CONFIG_CPU_FREQ=y
-CONFIG_CPU_FREQ_TABLE=y
-CONFIG_CPU_FREQ_DEBUG=y
-# CONFIG_CPU_FREQ_STAT is not set
-# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set
-# CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set
+CONFIG_ACPI_BGRT=y
CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE=y
-# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set
-# CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set
-CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
-# CONFIG_CPU_FREQ_GOV_POWERSAVE is not set
-CONFIG_CPU_FREQ_GOV_USERSPACE=y
CONFIG_CPU_FREQ_GOV_ONDEMAND=y
-# CONFIG_CPU_FREQ_GOV_CONSERVATIVE is not set
-
-#
-# CPUFreq processor drivers
-#
CONFIG_X86_ACPI_CPUFREQ=y
-# CONFIG_X86_POWERNOW_K6 is not set
-# CONFIG_X86_POWERNOW_K7 is not set
-# CONFIG_X86_POWERNOW_K8 is not set
-# CONFIG_X86_GX_SUSPMOD is not set
-# CONFIG_X86_SPEEDSTEP_CENTRINO is not set
-# CONFIG_X86_SPEEDSTEP_ICH is not set
-# CONFIG_X86_SPEEDSTEP_SMI is not set
-# CONFIG_X86_P4_CLOCKMOD is not set
-# CONFIG_X86_CPUFREQ_NFORCE2 is not set
-# CONFIG_X86_LONGRUN is not set
-# CONFIG_X86_LONGHAUL is not set
-# CONFIG_X86_E_POWERSAVER is not set
-
-#
-# shared options
-#
-# CONFIG_X86_SPEEDSTEP_LIB is not set
-CONFIG_CPU_IDLE=y
-CONFIG_CPU_IDLE_GOV_LADDER=y
-CONFIG_CPU_IDLE_GOV_MENU=y
-
-#
-# Bus options (PCI etc.)
-#
-CONFIG_PCI=y
-# CONFIG_PCI_GOBIOS is not set
-# CONFIG_PCI_GOMMCONFIG is not set
-# CONFIG_PCI_GODIRECT is not set
-# CONFIG_PCI_GOOLPC is not set
-CONFIG_PCI_GOANY=y
-CONFIG_PCI_BIOS=y
-CONFIG_PCI_DIRECT=y
-CONFIG_PCI_MMCONFIG=y
-CONFIG_PCI_DOMAINS=y
-# CONFIG_DMAR is not set
-CONFIG_PCIEPORTBUS=y
-# CONFIG_HOTPLUG_PCI_PCIE is not set
-CONFIG_PCIEAER=y
-# CONFIG_PCIEASPM is not set
-CONFIG_ARCH_SUPPORTS_MSI=y
-CONFIG_PCI_MSI=y
-# CONFIG_PCI_LEGACY is not set
-# CONFIG_PCI_DEBUG is not set
-# CONFIG_PCI_STUB is not set
-CONFIG_HT_IRQ=y
-# CONFIG_PCI_IOV is not set
-CONFIG_ISA_DMA_API=y
-# CONFIG_ISA is not set
-# CONFIG_MCA is not set
-# CONFIG_SCx200 is not set
-# CONFIG_OLPC is not set
-CONFIG_K8_NB=y
-CONFIG_PCCARD=y
-# CONFIG_PCMCIA_DEBUG is not set
-CONFIG_PCMCIA=y
-CONFIG_PCMCIA_LOAD_CIS=y
-CONFIG_PCMCIA_IOCTL=y
-CONFIG_CARDBUS=y
-
-#
-# PC-card bridges
-#
-CONFIG_YENTA=y
-CONFIG_YENTA_O2=y
-CONFIG_YENTA_RICOH=y
-CONFIG_YENTA_TI=y
-CONFIG_YENTA_ENE_TUNE=y
-CONFIG_YENTA_TOSHIBA=y
-# CONFIG_PD6729 is not set
-# CONFIG_I82092 is not set
-CONFIG_PCCARD_NONSTATIC=y
-CONFIG_HOTPLUG_PCI=y
-# CONFIG_HOTPLUG_PCI_FAKE is not set
-# CONFIG_HOTPLUG_PCI_IBM is not set
-# CONFIG_HOTPLUG_PCI_ACPI is not set
-# CONFIG_HOTPLUG_PCI_CPCI is not set
-# CONFIG_HOTPLUG_PCI_SHPC is not set
-
-#
-# Executable file formats / Emulations
-#
-CONFIG_BINFMT_ELF=y
-CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
-CONFIG_HAVE_AOUT=y
-# CONFIG_BINFMT_AOUT is not set
+CONFIG_KPROBES=y
+CONFIG_JUMP_LABEL=y
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
+CONFIG_MODULE_FORCE_UNLOAD=y
+CONFIG_BLK_CGROUP_IOLATENCY=y
+CONFIG_BLK_CGROUP_IOCOST=y
+CONFIG_BLK_CGROUP_IOPRIO=y
CONFIG_BINFMT_MISC=y
-CONFIG_HAVE_ATOMIC_IOMAP=y
+# CONFIG_COMPAT_BRK is not set
CONFIG_NET=y
-
-#
-# Networking options
-#
CONFIG_PACKET=y
-CONFIG_PACKET_MMAP=y
-CONFIG_UNIX=y
-CONFIG_XFRM=y
CONFIG_XFRM_USER=y
-# CONFIG_XFRM_SUB_POLICY is not set
-# CONFIG_XFRM_MIGRATE is not set
-# CONFIG_XFRM_STATISTICS is not set
-# CONFIG_NET_KEY is not set
-CONFIG_INET=y
CONFIG_IP_MULTICAST=y
CONFIG_IP_ADVANCED_ROUTER=y
-CONFIG_ASK_IP_FIB_HASH=y
-# CONFIG_IP_FIB_TRIE is not set
-CONFIG_IP_FIB_HASH=y
CONFIG_IP_MULTIPLE_TABLES=y
CONFIG_IP_ROUTE_MULTIPATH=y
CONFIG_IP_ROUTE_VERBOSE=y
@@ -526,1347 +71,140 @@ CONFIG_IP_PNP=y
CONFIG_IP_PNP_DHCP=y
CONFIG_IP_PNP_BOOTP=y
CONFIG_IP_PNP_RARP=y
-# CONFIG_NET_IPIP is not set
-# CONFIG_NET_IPGRE is not set
CONFIG_IP_MROUTE=y
CONFIG_IP_PIMSM_V1=y
CONFIG_IP_PIMSM_V2=y
-# CONFIG_ARPD is not set
CONFIG_SYN_COOKIES=y
-# CONFIG_INET_AH is not set
-# CONFIG_INET_ESP is not set
-# CONFIG_INET_IPCOMP is not set
-# CONFIG_INET_XFRM_TUNNEL is not set
-CONFIG_INET_TUNNEL=y
-# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
-# CONFIG_INET_XFRM_MODE_TUNNEL is not set
-# CONFIG_INET_XFRM_MODE_BEET is not set
-CONFIG_INET_LRO=y
# CONFIG_INET_DIAG is not set
CONFIG_TCP_CONG_ADVANCED=y
# CONFIG_TCP_CONG_BIC is not set
-CONFIG_TCP_CONG_CUBIC=y
# CONFIG_TCP_CONG_WESTWOOD is not set
# CONFIG_TCP_CONG_HTCP is not set
-# CONFIG_TCP_CONG_HSTCP is not set
-# CONFIG_TCP_CONG_HYBLA is not set
-# CONFIG_TCP_CONG_VEGAS is not set
-# CONFIG_TCP_CONG_SCALABLE is not set
-# CONFIG_TCP_CONG_LP is not set
-# CONFIG_TCP_CONG_VENO is not set
-# CONFIG_TCP_CONG_YEAH is not set
-# CONFIG_TCP_CONG_ILLINOIS is not set
-# CONFIG_DEFAULT_BIC is not set
-CONFIG_DEFAULT_CUBIC=y
-# CONFIG_DEFAULT_HTCP is not set
-# CONFIG_DEFAULT_VEGAS is not set
-# CONFIG_DEFAULT_WESTWOOD is not set
-# CONFIG_DEFAULT_RENO is not set
-CONFIG_DEFAULT_TCP_CONG="cubic"
CONFIG_TCP_MD5SIG=y
-CONFIG_IPV6=y
-# CONFIG_IPV6_PRIVACY is not set
-# CONFIG_IPV6_ROUTER_PREF is not set
-# CONFIG_IPV6_OPTIMISTIC_DAD is not set
CONFIG_INET6_AH=y
CONFIG_INET6_ESP=y
-# CONFIG_INET6_IPCOMP is not set
-# CONFIG_IPV6_MIP6 is not set
-# CONFIG_INET6_XFRM_TUNNEL is not set
-# CONFIG_INET6_TUNNEL is not set
-CONFIG_INET6_XFRM_MODE_TRANSPORT=y
-CONFIG_INET6_XFRM_MODE_TUNNEL=y
-CONFIG_INET6_XFRM_MODE_BEET=y
-# CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION is not set
-CONFIG_IPV6_SIT=y
-CONFIG_IPV6_NDISC_NODETYPE=y
-# CONFIG_IPV6_TUNNEL is not set
-# CONFIG_IPV6_MULTIPLE_TABLES is not set
-# CONFIG_IPV6_MROUTE is not set
CONFIG_NETLABEL=y
-CONFIG_NETWORK_SECMARK=y
CONFIG_NETFILTER=y
-# CONFIG_NETFILTER_DEBUG is not set
# CONFIG_NETFILTER_ADVANCED is not set
-
-#
-# Core Netfilter Configuration
-#
-CONFIG_NETFILTER_NETLINK=y
-CONFIG_NETFILTER_NETLINK_LOG=y
CONFIG_NF_CONNTRACK=y
-CONFIG_NF_CONNTRACK_SECMARK=y
CONFIG_NF_CONNTRACK_FTP=y
CONFIG_NF_CONNTRACK_IRC=y
CONFIG_NF_CONNTRACK_SIP=y
CONFIG_NF_CT_NETLINK=y
-CONFIG_NETFILTER_XTABLES=y
+CONFIG_NF_NAT=y
CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
-CONFIG_NETFILTER_XT_TARGET_MARK=y
CONFIG_NETFILTER_XT_TARGET_NFLOG=y
CONFIG_NETFILTER_XT_TARGET_SECMARK=y
CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
-CONFIG_NETFILTER_XT_MATCH_MARK=y
CONFIG_NETFILTER_XT_MATCH_POLICY=y
CONFIG_NETFILTER_XT_MATCH_STATE=y
-# CONFIG_IP_VS is not set
-
-#
-# IP: Netfilter Configuration
-#
-CONFIG_NF_DEFRAG_IPV4=y
-CONFIG_NF_CONNTRACK_IPV4=y
-CONFIG_NF_CONNTRACK_PROC_COMPAT=y
CONFIG_IP_NF_IPTABLES=y
CONFIG_IP_NF_FILTER=y
CONFIG_IP_NF_TARGET_REJECT=y
-CONFIG_IP_NF_TARGET_LOG=y
-CONFIG_IP_NF_TARGET_ULOG=y
-CONFIG_NF_NAT=y
-CONFIG_NF_NAT_NEEDED=y
-CONFIG_IP_NF_TARGET_MASQUERADE=y
-CONFIG_NF_NAT_FTP=y
-CONFIG_NF_NAT_IRC=y
-# CONFIG_NF_NAT_TFTP is not set
-# CONFIG_NF_NAT_AMANDA is not set
-# CONFIG_NF_NAT_PPTP is not set
-# CONFIG_NF_NAT_H323 is not set
-CONFIG_NF_NAT_SIP=y
+CONFIG_IP_NF_TARGET_MASQUERADE=m
CONFIG_IP_NF_MANGLE=y
-
-#
-# IPv6: Netfilter Configuration
-#
-CONFIG_NF_CONNTRACK_IPV6=y
CONFIG_IP6_NF_IPTABLES=y
CONFIG_IP6_NF_MATCH_IPV6HEADER=y
-CONFIG_IP6_NF_TARGET_LOG=y
CONFIG_IP6_NF_FILTER=y
CONFIG_IP6_NF_TARGET_REJECT=y
CONFIG_IP6_NF_MANGLE=y
-# CONFIG_IP_DCCP is not set
-# CONFIG_IP_SCTP is not set
-# CONFIG_TIPC is not set
-# CONFIG_ATM is not set
-# CONFIG_BRIDGE is not set
-# CONFIG_NET_DSA is not set
-# CONFIG_VLAN_8021Q is not set
-# CONFIG_DECNET is not set
-CONFIG_LLC=y
-# CONFIG_LLC2 is not set
-# CONFIG_IPX is not set
-# CONFIG_ATALK is not set
-# CONFIG_X25 is not set
-# CONFIG_LAPB is not set
-# CONFIG_ECONET is not set
-# CONFIG_WAN_ROUTER is not set
-# CONFIG_PHONET is not set
CONFIG_NET_SCHED=y
-
-#
-# Queueing/Scheduling
-#
-# CONFIG_NET_SCH_CBQ is not set
-# CONFIG_NET_SCH_HTB is not set
-# CONFIG_NET_SCH_HFSC is not set
-# CONFIG_NET_SCH_PRIO is not set
-# CONFIG_NET_SCH_MULTIQ is not set
-# CONFIG_NET_SCH_RED is not set
-# CONFIG_NET_SCH_SFQ is not set
-# CONFIG_NET_SCH_TEQL is not set
-# CONFIG_NET_SCH_TBF is not set
-# CONFIG_NET_SCH_GRED is not set
-# CONFIG_NET_SCH_DSMARK is not set
-# CONFIG_NET_SCH_NETEM is not set
-# CONFIG_NET_SCH_DRR is not set
-# CONFIG_NET_SCH_INGRESS is not set
-
-#
-# Classification
-#
-CONFIG_NET_CLS=y
-# CONFIG_NET_CLS_BASIC is not set
-# CONFIG_NET_CLS_TCINDEX is not set
-# CONFIG_NET_CLS_ROUTE4 is not set
-# CONFIG_NET_CLS_FW is not set
-# CONFIG_NET_CLS_U32 is not set
-# CONFIG_NET_CLS_RSVP is not set
-# CONFIG_NET_CLS_RSVP6 is not set
-# CONFIG_NET_CLS_FLOW is not set
-# CONFIG_NET_CLS_CGROUP is not set
+CONFIG_NET_CLS_CGROUP=y
CONFIG_NET_EMATCH=y
-CONFIG_NET_EMATCH_STACK=32
-# CONFIG_NET_EMATCH_CMP is not set
-# CONFIG_NET_EMATCH_NBYTE is not set
-# CONFIG_NET_EMATCH_U32 is not set
-# CONFIG_NET_EMATCH_META is not set
-# CONFIG_NET_EMATCH_TEXT is not set
CONFIG_NET_CLS_ACT=y
-# CONFIG_NET_ACT_POLICE is not set
-# CONFIG_NET_ACT_GACT is not set
-# CONFIG_NET_ACT_MIRRED is not set
-# CONFIG_NET_ACT_IPT is not set
-# CONFIG_NET_ACT_NAT is not set
-# CONFIG_NET_ACT_PEDIT is not set
-# CONFIG_NET_ACT_SIMP is not set
-# CONFIG_NET_ACT_SKBEDIT is not set
-CONFIG_NET_SCH_FIFO=y
-# CONFIG_DCB is not set
-
-#
-# Network testing
-#
-# CONFIG_NET_PKTGEN is not set
-# CONFIG_NET_TCPPROBE is not set
-# CONFIG_NET_DROP_MONITOR is not set
-CONFIG_HAMRADIO=y
-
-#
-# Packet Radio protocols
-#
-# CONFIG_AX25 is not set
-# CONFIG_CAN is not set
-# CONFIG_IRDA is not set
-# CONFIG_BT is not set
-# CONFIG_AF_RXRPC is not set
-CONFIG_FIB_RULES=y
-CONFIG_WIRELESS=y
+CONFIG_CGROUP_NET_PRIO=y
CONFIG_CFG80211=y
-# CONFIG_CFG80211_REG_DEBUG is not set
-CONFIG_WIRELESS_OLD_REGULATORY=y
-CONFIG_WIRELESS_EXT=y
-CONFIG_WIRELESS_EXT_SYSFS=y
-# CONFIG_LIB80211 is not set
CONFIG_MAC80211=y
-
-#
-# Rate control algorithm selection
-#
-CONFIG_MAC80211_RC_MINSTREL=y
-# CONFIG_MAC80211_RC_DEFAULT_PID is not set
-CONFIG_MAC80211_RC_DEFAULT_MINSTREL=y
-CONFIG_MAC80211_RC_DEFAULT="minstrel"
-# CONFIG_MAC80211_MESH is not set
CONFIG_MAC80211_LEDS=y
-# CONFIG_MAC80211_DEBUGFS is not set
-# CONFIG_MAC80211_DEBUG_MENU is not set
-# CONFIG_WIMAX is not set
CONFIG_RFKILL=y
-# CONFIG_RFKILL_INPUT is not set
-CONFIG_RFKILL_LEDS=y
-# CONFIG_NET_9P is not set
-
-#
-# Device Drivers
-#
-
-#
-# Generic Driver Options
-#
-CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
-CONFIG_STANDALONE=y
-CONFIG_PREVENT_FIRMWARE_BUILD=y
-CONFIG_FW_LOADER=y
-CONFIG_FIRMWARE_IN_KERNEL=y
-CONFIG_EXTRA_FIRMWARE=""
-# CONFIG_DEBUG_DRIVER is not set
+CONFIG_NET_9P=y
+CONFIG_NET_9P_VIRTIO=y
+CONFIG_PCI=y
+CONFIG_PCIEPORTBUS=y
+CONFIG_PCI_MSI=y
+CONFIG_HOTPLUG_PCI=y
+CONFIG_PCCARD=y
+CONFIG_YENTA=y
+CONFIG_DEVTMPFS=y
+CONFIG_DEVTMPFS_MOUNT=y
CONFIG_DEBUG_DEVRES=y
-# CONFIG_SYS_HYPERVISOR is not set
CONFIG_CONNECTOR=y
-CONFIG_PROC_EVENTS=y
-# CONFIG_MTD is not set
-# CONFIG_PARPORT is not set
-CONFIG_PNP=y
-CONFIG_PNP_DEBUG_MESSAGES=y
-
-#
-# Protocols
-#
-CONFIG_PNPACPI=y
-CONFIG_BLK_DEV=y
-# CONFIG_BLK_DEV_FD is not set
-# CONFIG_BLK_CPQ_DA is not set
-# CONFIG_BLK_CPQ_CISS_DA is not set
-# CONFIG_BLK_DEV_DAC960 is not set
-# CONFIG_BLK_DEV_UMEM is not set
-# CONFIG_BLK_DEV_COW_COMMON is not set
CONFIG_BLK_DEV_LOOP=y
-# CONFIG_BLK_DEV_CRYPTOLOOP is not set
-# CONFIG_BLK_DEV_NBD is not set
-# CONFIG_BLK_DEV_SX8 is not set
-# CONFIG_BLK_DEV_UB is not set
-CONFIG_BLK_DEV_RAM=y
-CONFIG_BLK_DEV_RAM_COUNT=16
-CONFIG_BLK_DEV_RAM_SIZE=16384
-# CONFIG_BLK_DEV_XIP is not set
-# CONFIG_CDROM_PKTCDVD is not set
-# CONFIG_ATA_OVER_ETH is not set
-# CONFIG_BLK_DEV_HD is not set
-CONFIG_MISC_DEVICES=y
-# CONFIG_IBM_ASM is not set
-# CONFIG_PHANTOM is not set
-# CONFIG_SGI_IOC4 is not set
-# CONFIG_TIFM_CORE is not set
-# CONFIG_ICS932S401 is not set
-# CONFIG_ENCLOSURE_SERVICES is not set
-# CONFIG_HP_ILO is not set
-# CONFIG_ISL29003 is not set
-# CONFIG_C2PORT is not set
-
-#
-# EEPROM support
-#
-# CONFIG_EEPROM_AT24 is not set
-# CONFIG_EEPROM_LEGACY is not set
-# CONFIG_EEPROM_93CX6 is not set
-CONFIG_HAVE_IDE=y
-# CONFIG_IDE is not set
-
-#
-# SCSI device support
-#
-# CONFIG_RAID_ATTRS is not set
-CONFIG_SCSI=y
-CONFIG_SCSI_DMA=y
-# CONFIG_SCSI_TGT is not set
-# CONFIG_SCSI_NETLINK is not set
-CONFIG_SCSI_PROC_FS=y
-
-#
-# SCSI support type (disk, tape, CD-ROM)
-#
+CONFIG_VIRTIO_BLK=y
CONFIG_BLK_DEV_SD=y
-# CONFIG_CHR_DEV_ST is not set
-# CONFIG_CHR_DEV_OSST is not set
CONFIG_BLK_DEV_SR=y
-CONFIG_BLK_DEV_SR_VENDOR=y
CONFIG_CHR_DEV_SG=y
-# CONFIG_CHR_DEV_SCH is not set
-
-#
-# Some SCSI devices (e.g. CD jukebox) support multiple LUNs
-#
-# CONFIG_SCSI_MULTI_LUN is not set
CONFIG_SCSI_CONSTANTS=y
-# CONFIG_SCSI_LOGGING is not set
-# CONFIG_SCSI_SCAN_ASYNC is not set
-CONFIG_SCSI_WAIT_SCAN=m
-
-#
-# SCSI Transports
-#
CONFIG_SCSI_SPI_ATTRS=y
-# CONFIG_SCSI_FC_ATTRS is not set
-# CONFIG_SCSI_ISCSI_ATTRS is not set
-# CONFIG_SCSI_SAS_ATTRS is not set
-# CONFIG_SCSI_SAS_LIBSAS is not set
-# CONFIG_SCSI_SRP_ATTRS is not set
-# CONFIG_SCSI_LOWLEVEL is not set
-# CONFIG_SCSI_LOWLEVEL_PCMCIA is not set
-# CONFIG_SCSI_DH is not set
-# CONFIG_SCSI_OSD_INITIATOR is not set
+CONFIG_SCSI_VIRTIO=y
CONFIG_ATA=y
-# CONFIG_ATA_NONSTANDARD is not set
-CONFIG_ATA_ACPI=y
-CONFIG_SATA_PMP=y
CONFIG_SATA_AHCI=y
-# CONFIG_SATA_SIL24 is not set
-CONFIG_ATA_SFF=y
-# CONFIG_SATA_SVW is not set
CONFIG_ATA_PIIX=y
-# CONFIG_SATA_MV is not set
-# CONFIG_SATA_NV is not set
-# CONFIG_PDC_ADMA is not set
-# CONFIG_SATA_QSTOR is not set
-# CONFIG_SATA_PROMISE is not set
-# CONFIG_SATA_SX4 is not set
-# CONFIG_SATA_SIL is not set
-# CONFIG_SATA_SIS is not set
-# CONFIG_SATA_ULI is not set
-# CONFIG_SATA_VIA is not set
-# CONFIG_SATA_VITESSE is not set
-# CONFIG_SATA_INIC162X is not set
-# CONFIG_PATA_ACPI is not set
-# CONFIG_PATA_ALI is not set
CONFIG_PATA_AMD=y
-# CONFIG_PATA_ARTOP is not set
-# CONFIG_PATA_ATIIXP is not set
-# CONFIG_PATA_CMD640_PCI is not set
-# CONFIG_PATA_CMD64X is not set
-# CONFIG_PATA_CS5520 is not set
-# CONFIG_PATA_CS5530 is not set
-# CONFIG_PATA_CS5535 is not set
-# CONFIG_PATA_CS5536 is not set
-# CONFIG_PATA_CYPRESS is not set
-# CONFIG_PATA_EFAR is not set
-CONFIG_ATA_GENERIC=y
-# CONFIG_PATA_HPT366 is not set
-# CONFIG_PATA_HPT37X is not set
-# CONFIG_PATA_HPT3X2N is not set
-# CONFIG_PATA_HPT3X3 is not set
-# CONFIG_PATA_IT821X is not set
-# CONFIG_PATA_IT8213 is not set
-# CONFIG_PATA_JMICRON is not set
-# CONFIG_PATA_TRIFLEX is not set
-# CONFIG_PATA_MARVELL is not set
-CONFIG_PATA_MPIIX=y
CONFIG_PATA_OLDPIIX=y
-# CONFIG_PATA_NETCELL is not set
-# CONFIG_PATA_NINJA32 is not set
-# CONFIG_PATA_NS87410 is not set
-# CONFIG_PATA_NS87415 is not set
-# CONFIG_PATA_OPTI is not set
-# CONFIG_PATA_OPTIDMA is not set
-# CONFIG_PATA_PCMCIA is not set
-# CONFIG_PATA_PDC_OLD is not set
-# CONFIG_PATA_RADISYS is not set
-# CONFIG_PATA_RZ1000 is not set
-# CONFIG_PATA_SC1200 is not set
-# CONFIG_PATA_SERVERWORKS is not set
-# CONFIG_PATA_PDC2027X is not set
-# CONFIG_PATA_SIL680 is not set
-# CONFIG_PATA_SIS is not set
-# CONFIG_PATA_VIA is not set
-# CONFIG_PATA_WINBOND is not set
CONFIG_PATA_SCH=y
+CONFIG_PATA_MPIIX=y
+CONFIG_ATA_GENERIC=y
CONFIG_MD=y
CONFIG_BLK_DEV_MD=y
-CONFIG_MD_AUTODETECT=y
-# CONFIG_MD_LINEAR is not set
-# CONFIG_MD_RAID0 is not set
-# CONFIG_MD_RAID1 is not set
-# CONFIG_MD_RAID10 is not set
-# CONFIG_MD_RAID456 is not set
-# CONFIG_MD_MULTIPATH is not set
-# CONFIG_MD_FAULTY is not set
CONFIG_BLK_DEV_DM=y
-# CONFIG_DM_DEBUG is not set
-# CONFIG_DM_CRYPT is not set
-# CONFIG_DM_SNAPSHOT is not set
CONFIG_DM_MIRROR=y
CONFIG_DM_ZERO=y
-# CONFIG_DM_MULTIPATH is not set
-# CONFIG_DM_DELAY is not set
-# CONFIG_DM_UEVENT is not set
-# CONFIG_FUSION is not set
-
-#
-# IEEE 1394 (FireWire) support
-#
-
-#
-# Enable only one of the two stacks, unless you know what you are doing
-#
-# CONFIG_FIREWIRE is not set
-# CONFIG_IEEE1394 is not set
-# CONFIG_I2O is not set
CONFIG_MACINTOSH_DRIVERS=y
CONFIG_MAC_EMUMOUSEBTN=y
CONFIG_NETDEVICES=y
-CONFIG_COMPAT_NET_DEV_OPS=y
-# CONFIG_IFB is not set
-# CONFIG_DUMMY is not set
-# CONFIG_BONDING is not set
-# CONFIG_MACVLAN is not set
-# CONFIG_EQUALIZER is not set
-# CONFIG_TUN is not set
-# CONFIG_VETH is not set
-# CONFIG_NET_SB1000 is not set
-# CONFIG_ARCNET is not set
-CONFIG_PHYLIB=y
-
-#
-# MII PHY device drivers
-#
-# CONFIG_MARVELL_PHY is not set
-# CONFIG_DAVICOM_PHY is not set
-# CONFIG_QSEMI_PHY is not set
-# CONFIG_LXT_PHY is not set
-# CONFIG_CICADA_PHY is not set
-# CONFIG_VITESSE_PHY is not set
-# CONFIG_SMSC_PHY is not set
-# CONFIG_BROADCOM_PHY is not set
-# CONFIG_ICPLUS_PHY is not set
-# CONFIG_REALTEK_PHY is not set
-# CONFIG_NATIONAL_PHY is not set
-# CONFIG_STE10XP is not set
-# CONFIG_LSI_ET1011C_PHY is not set
-# CONFIG_FIXED_PHY is not set
-# CONFIG_MDIO_BITBANG is not set
-CONFIG_NET_ETHERNET=y
-CONFIG_MII=y
-# CONFIG_HAPPYMEAL is not set
-# CONFIG_SUNGEM is not set
-# CONFIG_CASSINI is not set
-CONFIG_NET_VENDOR_3COM=y
-# CONFIG_VORTEX is not set
-# CONFIG_TYPHOON is not set
-# CONFIG_ETHOC is not set
-# CONFIG_DNET is not set
+CONFIG_NETCONSOLE=y
+CONFIG_VIRTIO_NET=y
+CONFIG_BNX2=y
+CONFIG_TIGON3=y
CONFIG_NET_TULIP=y
-# CONFIG_DE2104X is not set
-# CONFIG_TULIP is not set
-# CONFIG_DE4X5 is not set
-# CONFIG_WINBOND_840 is not set
-# CONFIG_DM9102 is not set
-# CONFIG_ULI526X is not set
-# CONFIG_PCMCIA_XIRCOM is not set
-# CONFIG_HP100 is not set
-# CONFIG_IBM_NEW_EMAC_ZMII is not set
-# CONFIG_IBM_NEW_EMAC_RGMII is not set
-# CONFIG_IBM_NEW_EMAC_TAH is not set
-# CONFIG_IBM_NEW_EMAC_EMAC4 is not set
-# CONFIG_IBM_NEW_EMAC_NO_FLOW_CTRL is not set
-# CONFIG_IBM_NEW_EMAC_MAL_CLR_ICINTSTAT is not set
-# CONFIG_IBM_NEW_EMAC_MAL_COMMON_ERR is not set
-CONFIG_NET_PCI=y
-# CONFIG_PCNET32 is not set
-# CONFIG_AMD8111_ETH is not set
-# CONFIG_ADAPTEC_STARFIRE is not set
-# CONFIG_B44 is not set
-CONFIG_FORCEDETH=y
-# CONFIG_FORCEDETH_NAPI is not set
CONFIG_E100=y
-# CONFIG_FEALNX is not set
-# CONFIG_NATSEMI is not set
+CONFIG_E1000=y
+CONFIG_E1000E=y
+CONFIG_SKY2=y
CONFIG_NE2K_PCI=y
-# CONFIG_8139CP is not set
+CONFIG_FORCEDETH=y
CONFIG_8139TOO=y
# CONFIG_8139TOO_PIO is not set
-# CONFIG_8139TOO_TUNE_TWISTER is not set
-# CONFIG_8139TOO_8129 is not set
-# CONFIG_8139_OLD_RX_RESET is not set
-# CONFIG_R6040 is not set
-# CONFIG_SIS900 is not set
-# CONFIG_EPIC100 is not set
-# CONFIG_SMSC9420 is not set
-# CONFIG_SUNDANCE is not set
-# CONFIG_TLAN is not set
-# CONFIG_VIA_RHINE is not set
-# CONFIG_SC92031 is not set
-# CONFIG_ATL2 is not set
-CONFIG_NETDEV_1000=y
-# CONFIG_ACENIC is not set
-# CONFIG_DL2K is not set
-CONFIG_E1000=y
-CONFIG_E1000E=y
-# CONFIG_IP1000 is not set
-# CONFIG_IGB is not set
-# CONFIG_IGBVF is not set
-# CONFIG_NS83820 is not set
-# CONFIG_HAMACHI is not set
-# CONFIG_YELLOWFIN is not set
CONFIG_R8169=y
-# CONFIG_SIS190 is not set
-# CONFIG_SKGE is not set
-CONFIG_SKY2=y
-# CONFIG_SKY2_DEBUG is not set
-# CONFIG_VIA_VELOCITY is not set
-CONFIG_TIGON3=y
-CONFIG_BNX2=y
-# CONFIG_QLA3XXX is not set
-# CONFIG_ATL1 is not set
-# CONFIG_ATL1E is not set
-# CONFIG_ATL1C is not set
-# CONFIG_JME is not set
-CONFIG_NETDEV_10000=y
-# CONFIG_CHELSIO_T1 is not set
-CONFIG_CHELSIO_T3_DEPENDS=y
-# CONFIG_CHELSIO_T3 is not set
-# CONFIG_ENIC is not set
-# CONFIG_IXGBE is not set
-# CONFIG_IXGB is not set
-# CONFIG_S2IO is not set
-# CONFIG_VXGE is not set
-# CONFIG_MYRI10GE is not set
-# CONFIG_NETXEN_NIC is not set
-# CONFIG_NIU is not set
-# CONFIG_MLX4_EN is not set
-# CONFIG_MLX4_CORE is not set
-# CONFIG_TEHUTI is not set
-# CONFIG_BNX2X is not set
-# CONFIG_QLGE is not set
-# CONFIG_SFC is not set
-# CONFIG_BE2NET is not set
-CONFIG_TR=y
-# CONFIG_IBMOL is not set
-# CONFIG_IBMLS is not set
-# CONFIG_3C359 is not set
-# CONFIG_TMS380TR is not set
-
-#
-# Wireless LAN
-#
-# CONFIG_WLAN_PRE80211 is not set
-CONFIG_WLAN_80211=y
-# CONFIG_PCMCIA_RAYCS is not set
-# CONFIG_LIBERTAS is not set
-# CONFIG_LIBERTAS_THINFIRM is not set
-# CONFIG_AIRO is not set
-# CONFIG_ATMEL is not set
-# CONFIG_AT76C50X_USB is not set
-# CONFIG_AIRO_CS is not set
-# CONFIG_PCMCIA_WL3501 is not set
-# CONFIG_PRISM54 is not set
-# CONFIG_USB_ZD1201 is not set
-# CONFIG_USB_NET_RNDIS_WLAN is not set
-# CONFIG_RTL8180 is not set
-# CONFIG_RTL8187 is not set
-# CONFIG_ADM8211 is not set
-# CONFIG_MAC80211_HWSIM is not set
-# CONFIG_MWL8K is not set
-# CONFIG_P54_COMMON is not set
-CONFIG_ATH5K=y
-# CONFIG_ATH5K_DEBUG is not set
-# CONFIG_ATH9K is not set
-# CONFIG_AR9170_USB is not set
-# CONFIG_IPW2100 is not set
-# CONFIG_IPW2200 is not set
-# CONFIG_IWLWIFI is not set
-# CONFIG_HOSTAP is not set
-# CONFIG_B43 is not set
-# CONFIG_B43LEGACY is not set
-# CONFIG_ZD1211RW is not set
-# CONFIG_RT2X00 is not set
-# CONFIG_HERMES is not set
-
-#
-# Enable WiMAX (Networking options) to see the WiMAX drivers
-#
-
-#
-# USB Network Adapters
-#
-# CONFIG_USB_CATC is not set
-# CONFIG_USB_KAWETH is not set
-# CONFIG_USB_PEGASUS is not set
-# CONFIG_USB_RTL8150 is not set
-# CONFIG_USB_USBNET is not set
-# CONFIG_USB_HSO is not set
-CONFIG_NET_PCMCIA=y
-# CONFIG_PCMCIA_3C589 is not set
-# CONFIG_PCMCIA_3C574 is not set
-# CONFIG_PCMCIA_FMVJ18X is not set
-# CONFIG_PCMCIA_PCNET is not set
-# CONFIG_PCMCIA_NMCLAN is not set
-# CONFIG_PCMCIA_SMC91C92 is not set
-# CONFIG_PCMCIA_XIRC2PS is not set
-# CONFIG_PCMCIA_AXNET is not set
-# CONFIG_PCMCIA_IBMTR is not set
-# CONFIG_WAN is not set
-CONFIG_FDDI=y
-# CONFIG_DEFXX is not set
-# CONFIG_SKFP is not set
-# CONFIG_HIPPI is not set
-# CONFIG_PPP is not set
-# CONFIG_SLIP is not set
-# CONFIG_NET_FC is not set
-CONFIG_NETCONSOLE=y
-# CONFIG_NETCONSOLE_DYNAMIC is not set
-CONFIG_NETPOLL=y
-# CONFIG_NETPOLL_TRAP is not set
-CONFIG_NET_POLL_CONTROLLER=y
-# CONFIG_ISDN is not set
-# CONFIG_PHONE is not set
-
-#
-# Input device support
-#
-CONFIG_INPUT=y
-CONFIG_INPUT_FF_MEMLESS=y
-CONFIG_INPUT_POLLDEV=y
-
-#
-# Userland interfaces
-#
-CONFIG_INPUT_MOUSEDEV=y
-# CONFIG_INPUT_MOUSEDEV_PSAUX is not set
-CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024
-CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768
-# CONFIG_INPUT_JOYDEV is not set
CONFIG_INPUT_EVDEV=y
-# CONFIG_INPUT_EVBUG is not set
-
-#
-# Input Device Drivers
-#
-CONFIG_INPUT_KEYBOARD=y
-CONFIG_KEYBOARD_ATKBD=y
-# CONFIG_KEYBOARD_SUNKBD is not set
-# CONFIG_KEYBOARD_LKKBD is not set
-# CONFIG_KEYBOARD_XTKBD is not set
-# CONFIG_KEYBOARD_NEWTON is not set
-# CONFIG_KEYBOARD_STOWAWAY is not set
-CONFIG_INPUT_MOUSE=y
-CONFIG_MOUSE_PS2=y
-CONFIG_MOUSE_PS2_ALPS=y
-CONFIG_MOUSE_PS2_LOGIPS2PP=y
-CONFIG_MOUSE_PS2_SYNAPTICS=y
-CONFIG_MOUSE_PS2_LIFEBOOK=y
-CONFIG_MOUSE_PS2_TRACKPOINT=y
-# CONFIG_MOUSE_PS2_ELANTECH is not set
-# CONFIG_MOUSE_PS2_TOUCHKIT is not set
-# CONFIG_MOUSE_SERIAL is not set
-# CONFIG_MOUSE_APPLETOUCH is not set
-# CONFIG_MOUSE_BCM5974 is not set
-# CONFIG_MOUSE_VSXXXAA is not set
CONFIG_INPUT_JOYSTICK=y
-# CONFIG_JOYSTICK_ANALOG is not set
-# CONFIG_JOYSTICK_A3D is not set
-# CONFIG_JOYSTICK_ADI is not set
-# CONFIG_JOYSTICK_COBRA is not set
-# CONFIG_JOYSTICK_GF2K is not set
-# CONFIG_JOYSTICK_GRIP is not set
-# CONFIG_JOYSTICK_GRIP_MP is not set
-# CONFIG_JOYSTICK_GUILLEMOT is not set
-# CONFIG_JOYSTICK_INTERACT is not set
-# CONFIG_JOYSTICK_SIDEWINDER is not set
-# CONFIG_JOYSTICK_TMDC is not set
-# CONFIG_JOYSTICK_IFORCE is not set
-# CONFIG_JOYSTICK_WARRIOR is not set
-# CONFIG_JOYSTICK_MAGELLAN is not set
-# CONFIG_JOYSTICK_SPACEORB is not set
-# CONFIG_JOYSTICK_SPACEBALL is not set
-# CONFIG_JOYSTICK_STINGER is not set
-# CONFIG_JOYSTICK_TWIDJOY is not set
-# CONFIG_JOYSTICK_ZHENHUA is not set
-# CONFIG_JOYSTICK_JOYDUMP is not set
-# CONFIG_JOYSTICK_XPAD is not set
CONFIG_INPUT_TABLET=y
-# CONFIG_TABLET_USB_ACECAD is not set
-# CONFIG_TABLET_USB_AIPTEK is not set
-# CONFIG_TABLET_USB_GTCO is not set
-# CONFIG_TABLET_USB_KBTAB is not set
-# CONFIG_TABLET_USB_WACOM is not set
CONFIG_INPUT_TOUCHSCREEN=y
-# CONFIG_TOUCHSCREEN_AD7879_I2C is not set
-# CONFIG_TOUCHSCREEN_AD7879 is not set
-# CONFIG_TOUCHSCREEN_FUJITSU is not set
-# CONFIG_TOUCHSCREEN_GUNZE is not set
-# CONFIG_TOUCHSCREEN_ELO is not set
-# CONFIG_TOUCHSCREEN_WACOM_W8001 is not set
-# CONFIG_TOUCHSCREEN_MTOUCH is not set
-# CONFIG_TOUCHSCREEN_INEXIO is not set
-# CONFIG_TOUCHSCREEN_MK712 is not set
-# CONFIG_TOUCHSCREEN_PENMOUNT is not set
-# CONFIG_TOUCHSCREEN_TOUCHRIGHT is not set
-# CONFIG_TOUCHSCREEN_TOUCHWIN is not set
-# CONFIG_TOUCHSCREEN_USB_COMPOSITE is not set
-# CONFIG_TOUCHSCREEN_TOUCHIT213 is not set
-# CONFIG_TOUCHSCREEN_TSC2007 is not set
CONFIG_INPUT_MISC=y
-# CONFIG_INPUT_PCSPKR is not set
-# CONFIG_INPUT_APANEL is not set
-# CONFIG_INPUT_WISTRON_BTNS is not set
-# CONFIG_INPUT_ATLAS_BTNS is not set
-# CONFIG_INPUT_ATI_REMOTE is not set
-# CONFIG_INPUT_ATI_REMOTE2 is not set
-# CONFIG_INPUT_KEYSPAN_REMOTE is not set
-# CONFIG_INPUT_POWERMATE is not set
-# CONFIG_INPUT_YEALINK is not set
-# CONFIG_INPUT_CM109 is not set
-# CONFIG_INPUT_UINPUT is not set
-
-#
-# Hardware I/O ports
-#
-CONFIG_SERIO=y
-CONFIG_SERIO_I8042=y
-CONFIG_SERIO_SERPORT=y
-# CONFIG_SERIO_CT82C710 is not set
-# CONFIG_SERIO_PCIPS2 is not set
-CONFIG_SERIO_LIBPS2=y
-# CONFIG_SERIO_RAW is not set
-# CONFIG_GAMEPORT is not set
-
-#
-# Character devices
-#
-CONFIG_VT=y
-CONFIG_CONSOLE_TRANSLATIONS=y
-CONFIG_VT_CONSOLE=y
-CONFIG_HW_CONSOLE=y
-CONFIG_VT_HW_CONSOLE_BINDING=y
-CONFIG_DEVKMEM=y
-CONFIG_SERIAL_NONSTANDARD=y
-# CONFIG_COMPUTONE is not set
-# CONFIG_ROCKETPORT is not set
-# CONFIG_CYCLADES is not set
-# CONFIG_DIGIEPCA is not set
-# CONFIG_MOXA_INTELLIO is not set
-# CONFIG_MOXA_SMARTIO is not set
-# CONFIG_ISI is not set
-# CONFIG_SYNCLINK is not set
-# CONFIG_SYNCLINKMP is not set
-# CONFIG_SYNCLINK_GT is not set
-# CONFIG_N_HDLC is not set
-# CONFIG_RISCOM8 is not set
-# CONFIG_SPECIALIX is not set
-# CONFIG_SX is not set
-# CONFIG_RIO is not set
-# CONFIG_STALDRV is not set
-# CONFIG_NOZOMI is not set
-
-#
-# Serial drivers
-#
+# CONFIG_LEGACY_PTYS is not set
CONFIG_SERIAL_8250=y
CONFIG_SERIAL_8250_CONSOLE=y
-CONFIG_FIX_EARLYCON_MEM=y
-CONFIG_SERIAL_8250_PCI=y
-CONFIG_SERIAL_8250_PNP=y
-# CONFIG_SERIAL_8250_CS is not set
CONFIG_SERIAL_8250_NR_UARTS=32
-CONFIG_SERIAL_8250_RUNTIME_UARTS=4
CONFIG_SERIAL_8250_EXTENDED=y
CONFIG_SERIAL_8250_MANY_PORTS=y
CONFIG_SERIAL_8250_SHARE_IRQ=y
CONFIG_SERIAL_8250_DETECT_IRQ=y
CONFIG_SERIAL_8250_RSA=y
-
-#
-# Non-8250 serial port support
-#
-CONFIG_SERIAL_CORE=y
-CONFIG_SERIAL_CORE_CONSOLE=y
-# CONFIG_SERIAL_JSM is not set
-CONFIG_UNIX98_PTYS=y
-# CONFIG_DEVPTS_MULTIPLE_INSTANCES is not set
-# CONFIG_LEGACY_PTYS is not set
-# CONFIG_IPMI_HANDLER is not set
+CONFIG_SERIAL_NONSTANDARD=y
+CONFIG_VIRTIO_CONSOLE=y
CONFIG_HW_RANDOM=y
-# CONFIG_HW_RANDOM_TIMERIOMEM is not set
-CONFIG_HW_RANDOM_INTEL=y
-CONFIG_HW_RANDOM_AMD=y
-CONFIG_HW_RANDOM_GEODE=y
-CONFIG_HW_RANDOM_VIA=y
CONFIG_NVRAM=y
-# CONFIG_R3964 is not set
-# CONFIG_APPLICOM is not set
-# CONFIG_SONYPI is not set
-
-#
-# PCMCIA character devices
-#
-# CONFIG_SYNCLINK_CS is not set
-# CONFIG_CARDMAN_4000 is not set
-# CONFIG_CARDMAN_4040 is not set
-# CONFIG_IPWIRELESS is not set
-# CONFIG_MWAVE is not set
-# CONFIG_PC8736x_GPIO is not set
-# CONFIG_NSC_GPIO is not set
-# CONFIG_CS5535_GPIO is not set
-# CONFIG_RAW_DRIVER is not set
CONFIG_HPET=y
# CONFIG_HPET_MMAP is not set
-# CONFIG_HANGCHECK_TIMER is not set
-# CONFIG_TCG_TPM is not set
-# CONFIG_TELCLOCK is not set
-CONFIG_DEVPORT=y
-CONFIG_I2C=y
-CONFIG_I2C_BOARDINFO=y
-# CONFIG_I2C_CHARDEV is not set
-CONFIG_I2C_HELPER_AUTO=y
-CONFIG_I2C_ALGOBIT=y
-
-#
-# I2C Hardware Bus support
-#
-
-#
-# PC SMBus host controller drivers
-#
-# CONFIG_I2C_ALI1535 is not set
-# CONFIG_I2C_ALI1563 is not set
-# CONFIG_I2C_ALI15X3 is not set
-# CONFIG_I2C_AMD756 is not set
-# CONFIG_I2C_AMD8111 is not set
CONFIG_I2C_I801=y
-# CONFIG_I2C_ISCH is not set
-# CONFIG_I2C_PIIX4 is not set
-# CONFIG_I2C_NFORCE2 is not set
-# CONFIG_I2C_SIS5595 is not set
-# CONFIG_I2C_SIS630 is not set
-# CONFIG_I2C_SIS96X is not set
-# CONFIG_I2C_VIA is not set
-# CONFIG_I2C_VIAPRO is not set
-
-#
-# I2C system bus drivers (mostly embedded / system-on-chip)
-#
-# CONFIG_I2C_OCORES is not set
-# CONFIG_I2C_SIMTEC is not set
-
-#
-# External I2C/SMBus adapter drivers
-#
-# CONFIG_I2C_PARPORT_LIGHT is not set
-# CONFIG_I2C_TAOS_EVM is not set
-# CONFIG_I2C_TINY_USB is not set
-
-#
-# Graphics adapter I2C/DDC channel drivers
-#
-# CONFIG_I2C_VOODOO3 is not set
-
-#
-# Other I2C/SMBus bus drivers
-#
-# CONFIG_I2C_PCA_PLATFORM is not set
-# CONFIG_I2C_STUB is not set
-# CONFIG_SCx200_ACB is not set
-
-#
-# Miscellaneous I2C Chip support
-#
-# CONFIG_DS1682 is not set
-# CONFIG_SENSORS_PCF8574 is not set
-# CONFIG_PCF8575 is not set
-# CONFIG_SENSORS_PCA9539 is not set
-# CONFIG_SENSORS_MAX6875 is not set
-# CONFIG_SENSORS_TSL2550 is not set
-# CONFIG_I2C_DEBUG_CORE is not set
-# CONFIG_I2C_DEBUG_ALGO is not set
-# CONFIG_I2C_DEBUG_BUS is not set
-# CONFIG_I2C_DEBUG_CHIP is not set
-# CONFIG_SPI is not set
-CONFIG_ARCH_WANT_OPTIONAL_GPIOLIB=y
-# CONFIG_GPIOLIB is not set
-# CONFIG_W1 is not set
-CONFIG_POWER_SUPPLY=y
-# CONFIG_POWER_SUPPLY_DEBUG is not set
-# CONFIG_PDA_POWER is not set
-# CONFIG_BATTERY_DS2760 is not set
-# CONFIG_BATTERY_BQ27x00 is not set
-CONFIG_HWMON=y
-# CONFIG_HWMON_VID is not set
-# CONFIG_SENSORS_ABITUGURU is not set
-# CONFIG_SENSORS_ABITUGURU3 is not set
-# CONFIG_SENSORS_AD7414 is not set
-# CONFIG_SENSORS_AD7418 is not set
-# CONFIG_SENSORS_ADM1021 is not set
-# CONFIG_SENSORS_ADM1025 is not set
-# CONFIG_SENSORS_ADM1026 is not set
-# CONFIG_SENSORS_ADM1029 is not set
-# CONFIG_SENSORS_ADM1031 is not set
-# CONFIG_SENSORS_ADM9240 is not set
-# CONFIG_SENSORS_ADT7462 is not set
-# CONFIG_SENSORS_ADT7470 is not set
-# CONFIG_SENSORS_ADT7473 is not set
-# CONFIG_SENSORS_ADT7475 is not set
-# CONFIG_SENSORS_K8TEMP is not set
-# CONFIG_SENSORS_ASB100 is not set
-# CONFIG_SENSORS_ATK0110 is not set
-# CONFIG_SENSORS_ATXP1 is not set
-# CONFIG_SENSORS_DS1621 is not set
-# CONFIG_SENSORS_I5K_AMB is not set
-# CONFIG_SENSORS_F71805F is not set
-# CONFIG_SENSORS_F71882FG is not set
-# CONFIG_SENSORS_F75375S is not set
-# CONFIG_SENSORS_FSCHER is not set
-# CONFIG_SENSORS_FSCPOS is not set
-# CONFIG_SENSORS_FSCHMD is not set
-# CONFIG_SENSORS_G760A is not set
-# CONFIG_SENSORS_GL518SM is not set
-# CONFIG_SENSORS_GL520SM is not set
-# CONFIG_SENSORS_CORETEMP is not set
-# CONFIG_SENSORS_IT87 is not set
-# CONFIG_SENSORS_LM63 is not set
-# CONFIG_SENSORS_LM75 is not set
-# CONFIG_SENSORS_LM77 is not set
-# CONFIG_SENSORS_LM78 is not set
-# CONFIG_SENSORS_LM80 is not set
-# CONFIG_SENSORS_LM83 is not set
-# CONFIG_SENSORS_LM85 is not set
-# CONFIG_SENSORS_LM87 is not set
-# CONFIG_SENSORS_LM90 is not set
-# CONFIG_SENSORS_LM92 is not set
-# CONFIG_SENSORS_LM93 is not set
-# CONFIG_SENSORS_LTC4215 is not set
-# CONFIG_SENSORS_LTC4245 is not set
-# CONFIG_SENSORS_LM95241 is not set
-# CONFIG_SENSORS_MAX1619 is not set
-# CONFIG_SENSORS_MAX6650 is not set
-# CONFIG_SENSORS_PC87360 is not set
-# CONFIG_SENSORS_PC87427 is not set
-# CONFIG_SENSORS_PCF8591 is not set
-# CONFIG_SENSORS_SIS5595 is not set
-# CONFIG_SENSORS_DME1737 is not set
-# CONFIG_SENSORS_SMSC47M1 is not set
-# CONFIG_SENSORS_SMSC47M192 is not set
-# CONFIG_SENSORS_SMSC47B397 is not set
-# CONFIG_SENSORS_ADS7828 is not set
-# CONFIG_SENSORS_THMC50 is not set
-# CONFIG_SENSORS_VIA686A is not set
-# CONFIG_SENSORS_VT1211 is not set
-# CONFIG_SENSORS_VT8231 is not set
-# CONFIG_SENSORS_W83781D is not set
-# CONFIG_SENSORS_W83791D is not set
-# CONFIG_SENSORS_W83792D is not set
-# CONFIG_SENSORS_W83793 is not set
-# CONFIG_SENSORS_W83L785TS is not set
-# CONFIG_SENSORS_W83L786NG is not set
-# CONFIG_SENSORS_W83627HF is not set
-# CONFIG_SENSORS_W83627EHF is not set
-# CONFIG_SENSORS_HDAPS is not set
-# CONFIG_SENSORS_LIS3LV02D is not set
-# CONFIG_SENSORS_APPLESMC is not set
-# CONFIG_HWMON_DEBUG_CHIP is not set
-CONFIG_THERMAL=y
-# CONFIG_THERMAL_HWMON is not set
CONFIG_WATCHDOG=y
-# CONFIG_WATCHDOG_NOWAYOUT is not set
-
-#
-# Watchdog Device Drivers
-#
-# CONFIG_SOFT_WATCHDOG is not set
-# CONFIG_ACQUIRE_WDT is not set
-# CONFIG_ADVANTECH_WDT is not set
-# CONFIG_ALIM1535_WDT is not set
-# CONFIG_ALIM7101_WDT is not set
-# CONFIG_SC520_WDT is not set
-# CONFIG_EUROTECH_WDT is not set
-# CONFIG_IB700_WDT is not set
-# CONFIG_IBMASR is not set
-# CONFIG_WAFER_WDT is not set
-# CONFIG_I6300ESB_WDT is not set
-# CONFIG_ITCO_WDT is not set
-# CONFIG_IT8712F_WDT is not set
-# CONFIG_IT87_WDT is not set
-# CONFIG_HP_WATCHDOG is not set
-# CONFIG_SC1200_WDT is not set
-# CONFIG_PC87413_WDT is not set
-# CONFIG_60XX_WDT is not set
-# CONFIG_SBC8360_WDT is not set
-# CONFIG_SBC7240_WDT is not set
-# CONFIG_CPU5_WDT is not set
-# CONFIG_SMSC_SCH311X_WDT is not set
-# CONFIG_SMSC37B787_WDT is not set
-# CONFIG_W83627HF_WDT is not set
-# CONFIG_W83697HF_WDT is not set
-# CONFIG_W83697UG_WDT is not set
-# CONFIG_W83877F_WDT is not set
-# CONFIG_W83977F_WDT is not set
-# CONFIG_MACHZ_WDT is not set
-# CONFIG_SBC_EPX_C3_WATCHDOG is not set
-
-#
-# PCI-based Watchdog Cards
-#
-# CONFIG_PCIPCWATCHDOG is not set
-# CONFIG_WDTPCI is not set
-
-#
-# USB-based Watchdog Cards
-#
-# CONFIG_USBPCWATCHDOG is not set
-CONFIG_SSB_POSSIBLE=y
-
-#
-# Sonics Silicon Backplane
-#
-# CONFIG_SSB is not set
-
-#
-# Multifunction device drivers
-#
-# CONFIG_MFD_CORE is not set
-# CONFIG_MFD_SM501 is not set
-# CONFIG_HTC_PASIC3 is not set
-# CONFIG_TWL4030_CORE is not set
-# CONFIG_MFD_TMIO is not set
-# CONFIG_PMIC_DA903X is not set
-# CONFIG_MFD_WM8400 is not set
-# CONFIG_MFD_WM8350_I2C is not set
-# CONFIG_MFD_PCF50633 is not set
-# CONFIG_REGULATOR is not set
-
-#
-# Multimedia devices
-#
-
-#
-# Multimedia core support
-#
-# CONFIG_VIDEO_DEV is not set
-# CONFIG_DVB_CORE is not set
-# CONFIG_VIDEO_MEDIA is not set
-
-#
-# Multimedia drivers
-#
-CONFIG_DAB=y
-# CONFIG_USB_DABUSB is not set
-
-#
-# Graphics support
-#
CONFIG_AGP=y
-# CONFIG_AGP_ALI is not set
-# CONFIG_AGP_ATI is not set
-# CONFIG_AGP_AMD is not set
CONFIG_AGP_AMD64=y
CONFIG_AGP_INTEL=y
-# CONFIG_AGP_NVIDIA is not set
-# CONFIG_AGP_SIS is not set
-# CONFIG_AGP_SWORKS is not set
-# CONFIG_AGP_VIA is not set
-# CONFIG_AGP_EFFICEON is not set
CONFIG_DRM=y
-# CONFIG_DRM_TDFX is not set
-# CONFIG_DRM_R128 is not set
-# CONFIG_DRM_RADEON is not set
-# CONFIG_DRM_I810 is not set
-# CONFIG_DRM_I830 is not set
CONFIG_DRM_I915=y
-# CONFIG_DRM_I915_KMS is not set
-# CONFIG_DRM_MGA is not set
-# CONFIG_DRM_SIS is not set
-# CONFIG_DRM_VIA is not set
-# CONFIG_DRM_SAVAGE is not set
-# CONFIG_VGASTATE is not set
-# CONFIG_VIDEO_OUTPUT_CONTROL is not set
-CONFIG_FB=y
-# CONFIG_FIRMWARE_EDID is not set
-# CONFIG_FB_DDC is not set
-# CONFIG_FB_BOOT_VESA_SUPPORT is not set
-CONFIG_FB_CFB_FILLRECT=y
-CONFIG_FB_CFB_COPYAREA=y
-CONFIG_FB_CFB_IMAGEBLIT=y
-# CONFIG_FB_CFB_REV_PIXELS_IN_BYTE is not set
-# CONFIG_FB_SYS_FILLRECT is not set
-# CONFIG_FB_SYS_COPYAREA is not set
-# CONFIG_FB_SYS_IMAGEBLIT is not set
-# CONFIG_FB_FOREIGN_ENDIAN is not set
-# CONFIG_FB_SYS_FOPS is not set
-# CONFIG_FB_SVGALIB is not set
-# CONFIG_FB_MACMODES is not set
-# CONFIG_FB_BACKLIGHT is not set
-CONFIG_FB_MODE_HELPERS=y
-CONFIG_FB_TILEBLITTING=y
-
-#
-# Frame buffer hardware drivers
-#
-# CONFIG_FB_CIRRUS is not set
-# CONFIG_FB_PM2 is not set
-# CONFIG_FB_CYBER2000 is not set
-# CONFIG_FB_ARC is not set
-# CONFIG_FB_ASILIANT is not set
-# CONFIG_FB_IMSTT is not set
-# CONFIG_FB_VGA16 is not set
-# CONFIG_FB_UVESA is not set
-# CONFIG_FB_VESA is not set
-CONFIG_FB_EFI=y
-# CONFIG_FB_N411 is not set
-# CONFIG_FB_HGA is not set
-# CONFIG_FB_S1D13XXX is not set
-# CONFIG_FB_NVIDIA is not set
-# CONFIG_FB_RIVA is not set
-# CONFIG_FB_I810 is not set
-# CONFIG_FB_LE80578 is not set
-# CONFIG_FB_INTEL is not set
-# CONFIG_FB_MATROX is not set
-# CONFIG_FB_RADEON is not set
-# CONFIG_FB_ATY128 is not set
-# CONFIG_FB_ATY is not set
-# CONFIG_FB_S3 is not set
-# CONFIG_FB_SAVAGE is not set
-# CONFIG_FB_SIS is not set
-# CONFIG_FB_VIA is not set
-# CONFIG_FB_NEOMAGIC is not set
-# CONFIG_FB_KYRO is not set
-# CONFIG_FB_3DFX is not set
-# CONFIG_FB_VOODOO1 is not set
-# CONFIG_FB_VT8623 is not set
-# CONFIG_FB_TRIDENT is not set
-# CONFIG_FB_ARK is not set
-# CONFIG_FB_PM3 is not set
-# CONFIG_FB_CARMINE is not set
-# CONFIG_FB_GEODE is not set
-# CONFIG_FB_VIRTUAL is not set
-# CONFIG_FB_METRONOME is not set
-# CONFIG_FB_MB862XX is not set
-# CONFIG_FB_BROADSHEET is not set
-CONFIG_BACKLIGHT_LCD_SUPPORT=y
-# CONFIG_LCD_CLASS_DEVICE is not set
-CONFIG_BACKLIGHT_CLASS_DEVICE=y
-CONFIG_BACKLIGHT_GENERIC=y
-# CONFIG_BACKLIGHT_PROGEAR is not set
-# CONFIG_BACKLIGHT_MBP_NVIDIA is not set
-# CONFIG_BACKLIGHT_SAHARA is not set
-
-#
-# Display device support
-#
-# CONFIG_DISPLAY_SUPPORT is not set
-
-#
-# Console display driver support
-#
-CONFIG_VGA_CONSOLE=y
-CONFIG_VGACON_SOFT_SCROLLBACK=y
-CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64
-CONFIG_DUMMY_CONSOLE=y
-# CONFIG_FRAMEBUFFER_CONSOLE is not set
-CONFIG_LOGO=y
-# CONFIG_LOGO_LINUX_MONO is not set
-# CONFIG_LOGO_LINUX_VGA16 is not set
-CONFIG_LOGO_LINUX_CLUT224=y
+CONFIG_DRM_VIRTIO_GPU=y
CONFIG_SOUND=y
-CONFIG_SOUND_OSS_CORE=y
CONFIG_SND=y
-CONFIG_SND_TIMER=y
-CONFIG_SND_PCM=y
-CONFIG_SND_HWDEP=y
-CONFIG_SND_JACK=y
+CONFIG_SND_HRTIMER=y
CONFIG_SND_SEQUENCER=y
CONFIG_SND_SEQ_DUMMY=y
-CONFIG_SND_OSSEMUL=y
-CONFIG_SND_MIXER_OSS=y
-CONFIG_SND_PCM_OSS=y
-CONFIG_SND_PCM_OSS_PLUGINS=y
-CONFIG_SND_SEQUENCER_OSS=y
-CONFIG_SND_HRTIMER=y
-CONFIG_SND_SEQ_HRTIMER_DEFAULT=y
-CONFIG_SND_DYNAMIC_MINORS=y
-CONFIG_SND_SUPPORT_OLD_API=y
-CONFIG_SND_VERBOSE_PROCFS=y
-# CONFIG_SND_VERBOSE_PRINTK is not set
-# CONFIG_SND_DEBUG is not set
-CONFIG_SND_VMASTER=y
-CONFIG_SND_DRIVERS=y
-# CONFIG_SND_PCSP is not set
-# CONFIG_SND_DUMMY is not set
-# CONFIG_SND_VIRMIDI is not set
-# CONFIG_SND_MTPAV is not set
-# CONFIG_SND_SERIAL_U16550 is not set
-# CONFIG_SND_MPU401 is not set
-CONFIG_SND_PCI=y
-# CONFIG_SND_AD1889 is not set
-# CONFIG_SND_ALS300 is not set
-# CONFIG_SND_ALS4000 is not set
-# CONFIG_SND_ALI5451 is not set
-# CONFIG_SND_ATIIXP is not set
-# CONFIG_SND_ATIIXP_MODEM is not set
-# CONFIG_SND_AU8810 is not set
-# CONFIG_SND_AU8820 is not set
-# CONFIG_SND_AU8830 is not set
-# CONFIG_SND_AW2 is not set
-# CONFIG_SND_AZT3328 is not set
-# CONFIG_SND_BT87X is not set
-# CONFIG_SND_CA0106 is not set
-# CONFIG_SND_CMIPCI is not set
-# CONFIG_SND_OXYGEN is not set
-# CONFIG_SND_CS4281 is not set
-# CONFIG_SND_CS46XX is not set
-# CONFIG_SND_CS5530 is not set
-# CONFIG_SND_CS5535AUDIO is not set
-# CONFIG_SND_DARLA20 is not set
-# CONFIG_SND_GINA20 is not set
-# CONFIG_SND_LAYLA20 is not set
-# CONFIG_SND_DARLA24 is not set
-# CONFIG_SND_GINA24 is not set
-# CONFIG_SND_LAYLA24 is not set
-# CONFIG_SND_MONA is not set
-# CONFIG_SND_MIA is not set
-# CONFIG_SND_ECHO3G is not set
-# CONFIG_SND_INDIGO is not set
-# CONFIG_SND_INDIGOIO is not set
-# CONFIG_SND_INDIGODJ is not set
-# CONFIG_SND_INDIGOIOX is not set
-# CONFIG_SND_INDIGODJX is not set
-# CONFIG_SND_EMU10K1 is not set
-# CONFIG_SND_EMU10K1X is not set
-# CONFIG_SND_ENS1370 is not set
-# CONFIG_SND_ENS1371 is not set
-# CONFIG_SND_ES1938 is not set
-# CONFIG_SND_ES1968 is not set
-# CONFIG_SND_FM801 is not set
CONFIG_SND_HDA_INTEL=y
CONFIG_SND_HDA_HWDEP=y
-# CONFIG_SND_HDA_RECONFIG is not set
-# CONFIG_SND_HDA_INPUT_BEEP is not set
-CONFIG_SND_HDA_CODEC_REALTEK=y
-CONFIG_SND_HDA_CODEC_ANALOG=y
-CONFIG_SND_HDA_CODEC_SIGMATEL=y
-CONFIG_SND_HDA_CODEC_VIA=y
-CONFIG_SND_HDA_CODEC_ATIHDMI=y
-CONFIG_SND_HDA_CODEC_NVHDMI=y
-CONFIG_SND_HDA_CODEC_INTELHDMI=y
-CONFIG_SND_HDA_ELD=y
-CONFIG_SND_HDA_CODEC_CONEXANT=y
-CONFIG_SND_HDA_CODEC_CMEDIA=y
-CONFIG_SND_HDA_CODEC_SI3054=y
-CONFIG_SND_HDA_GENERIC=y
-# CONFIG_SND_HDA_POWER_SAVE is not set
-# CONFIG_SND_HDSP is not set
-# CONFIG_SND_HDSPM is not set
-# CONFIG_SND_HIFIER is not set
-# CONFIG_SND_ICE1712 is not set
-# CONFIG_SND_ICE1724 is not set
-# CONFIG_SND_INTEL8X0 is not set
-# CONFIG_SND_INTEL8X0M is not set
-# CONFIG_SND_KORG1212 is not set
-# CONFIG_SND_MAESTRO3 is not set
-# CONFIG_SND_MIXART is not set
-# CONFIG_SND_NM256 is not set
-# CONFIG_SND_PCXHR is not set
-# CONFIG_SND_RIPTIDE is not set
-# CONFIG_SND_RME32 is not set
-# CONFIG_SND_RME96 is not set
-# CONFIG_SND_RME9652 is not set
-# CONFIG_SND_SIS7019 is not set
-# CONFIG_SND_SONICVIBES is not set
-# CONFIG_SND_TRIDENT is not set
-# CONFIG_SND_VIA82XX is not set
-# CONFIG_SND_VIA82XX_MODEM is not set
-# CONFIG_SND_VIRTUOSO is not set
-# CONFIG_SND_VX222 is not set
-# CONFIG_SND_YMFPCI is not set
-CONFIG_SND_USB=y
-# CONFIG_SND_USB_AUDIO is not set
-# CONFIG_SND_USB_USX2Y is not set
-# CONFIG_SND_USB_CAIAQ is not set
-# CONFIG_SND_USB_US122L is not set
-CONFIG_SND_PCMCIA=y
-# CONFIG_SND_VXPOCKET is not set
-# CONFIG_SND_PDAUDIOCF is not set
-# CONFIG_SND_SOC is not set
-# CONFIG_SOUND_PRIME is not set
-CONFIG_HID_SUPPORT=y
-CONFIG_HID=y
-CONFIG_HID_DEBUG=y
CONFIG_HIDRAW=y
-
-#
-# USB Input Devices
-#
-CONFIG_USB_HID=y
-CONFIG_HID_PID=y
-CONFIG_USB_HIDDEV=y
-
-#
-# Special HID drivers
-#
-CONFIG_HID_A4TECH=y
-CONFIG_HID_APPLE=y
-CONFIG_HID_BELKIN=y
-CONFIG_HID_CHERRY=y
-CONFIG_HID_CHICONY=y
-CONFIG_HID_CYPRESS=y
-# CONFIG_DRAGONRISE_FF is not set
-CONFIG_HID_EZKEY=y
-CONFIG_HID_KYE=y
CONFIG_HID_GYRATION=y
-CONFIG_HID_KENSINGTON=y
-CONFIG_HID_LOGITECH=y
-CONFIG_LOGITECH_FF=y
-# CONFIG_LOGIRUMBLEPAD2_FF is not set
-CONFIG_HID_MICROSOFT=y
-CONFIG_HID_MONTEREY=y
CONFIG_HID_NTRIG=y
CONFIG_HID_PANTHERLORD=y
CONFIG_PANTHERLORD_FF=y
@@ -1874,702 +212,60 @@ CONFIG_HID_PETALYNX=y
CONFIG_HID_SAMSUNG=y
CONFIG_HID_SONY=y
CONFIG_HID_SUNPLUS=y
-# CONFIG_GREENASIA_FF is not set
CONFIG_HID_TOPSEED=y
-CONFIG_THRUSTMASTER_FF=y
-CONFIG_ZEROPLUS_FF=y
-CONFIG_USB_SUPPORT=y
-CONFIG_USB_ARCH_HAS_HCD=y
-CONFIG_USB_ARCH_HAS_OHCI=y
-CONFIG_USB_ARCH_HAS_EHCI=y
+CONFIG_HID_PID=y
+CONFIG_USB_HIDDEV=y
CONFIG_USB=y
-CONFIG_USB_DEBUG=y
CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
-
-#
-# Miscellaneous USB options
-#
-CONFIG_USB_DEVICEFS=y
-# CONFIG_USB_DEVICE_CLASS is not set
-# CONFIG_USB_DYNAMIC_MINORS is not set
-CONFIG_USB_SUSPEND=y
-# CONFIG_USB_OTG is not set
CONFIG_USB_MON=y
-# CONFIG_USB_WUSB is not set
-# CONFIG_USB_WUSB_CBAF is not set
-
-#
-# USB Host Controller Drivers
-#
-# CONFIG_USB_C67X00_HCD is not set
+CONFIG_USB_XHCI_HCD=y
CONFIG_USB_EHCI_HCD=y
-# CONFIG_USB_EHCI_ROOT_HUB_TT is not set
-# CONFIG_USB_EHCI_TT_NEWSCHED is not set
-# CONFIG_USB_OXU210HP_HCD is not set
-# CONFIG_USB_ISP116X_HCD is not set
-# CONFIG_USB_ISP1760_HCD is not set
CONFIG_USB_OHCI_HCD=y
-# CONFIG_USB_OHCI_BIG_ENDIAN_DESC is not set
-# CONFIG_USB_OHCI_BIG_ENDIAN_MMIO is not set
-CONFIG_USB_OHCI_LITTLE_ENDIAN=y
CONFIG_USB_UHCI_HCD=y
-# CONFIG_USB_SL811_HCD is not set
-# CONFIG_USB_R8A66597_HCD is not set
-# CONFIG_USB_WHCI_HCD is not set
-# CONFIG_USB_HWA_HCD is not set
-
-#
-# USB Device Class drivers
-#
-# CONFIG_USB_ACM is not set
CONFIG_USB_PRINTER=y
-# CONFIG_USB_WDM is not set
-# CONFIG_USB_TMC is not set
-
-#
-# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may
-#
-
-#
-# also be needed; see USB_STORAGE Help for more info
-#
CONFIG_USB_STORAGE=y
-# CONFIG_USB_STORAGE_DEBUG is not set
-# CONFIG_USB_STORAGE_DATAFAB is not set
-# CONFIG_USB_STORAGE_FREECOM is not set
-# CONFIG_USB_STORAGE_ISD200 is not set
-# CONFIG_USB_STORAGE_USBAT is not set
-# CONFIG_USB_STORAGE_SDDR09 is not set
-# CONFIG_USB_STORAGE_SDDR55 is not set
-# CONFIG_USB_STORAGE_JUMPSHOT is not set
-# CONFIG_USB_STORAGE_ALAUDA is not set
-# CONFIG_USB_STORAGE_ONETOUCH is not set
-# CONFIG_USB_STORAGE_KARMA is not set
-# CONFIG_USB_STORAGE_CYPRESS_ATACB is not set
-CONFIG_USB_LIBUSUAL=y
-
-#
-# USB Imaging devices
-#
-# CONFIG_USB_MDC800 is not set
-# CONFIG_USB_MICROTEK is not set
-
-#
-# USB port drivers
-#
-# CONFIG_USB_SERIAL is not set
-
-#
-# USB Miscellaneous drivers
-#
-# CONFIG_USB_EMI62 is not set
-# CONFIG_USB_EMI26 is not set
-# CONFIG_USB_ADUTUX is not set
-# CONFIG_USB_SEVSEG is not set
-# CONFIG_USB_RIO500 is not set
-# CONFIG_USB_LEGOTOWER is not set
-# CONFIG_USB_LCD is not set
-# CONFIG_USB_BERRY_CHARGE is not set
-# CONFIG_USB_LED is not set
-# CONFIG_USB_CYPRESS_CY7C63 is not set
-# CONFIG_USB_CYTHERM is not set
-# CONFIG_USB_IDMOUSE is not set
-# CONFIG_USB_FTDI_ELAN is not set
-# CONFIG_USB_APPLEDISPLAY is not set
-# CONFIG_USB_SISUSBVGA is not set
-# CONFIG_USB_LD is not set
-# CONFIG_USB_TRANCEVIBRATOR is not set
-# CONFIG_USB_IOWARRIOR is not set
-# CONFIG_USB_TEST is not set
-# CONFIG_USB_ISIGHTFW is not set
-# CONFIG_USB_VST is not set
-# CONFIG_USB_GADGET is not set
-
-#
-# OTG and related infrastructure
-#
-# CONFIG_NOP_USB_XCEIV is not set
-# CONFIG_UWB is not set
-# CONFIG_MMC is not set
-# CONFIG_MEMSTICK is not set
-CONFIG_NEW_LEDS=y
-CONFIG_LEDS_CLASS=y
-
-#
-# LED drivers
-#
-# CONFIG_LEDS_ALIX2 is not set
-# CONFIG_LEDS_PCA9532 is not set
-# CONFIG_LEDS_LP5521 is not set
-# CONFIG_LEDS_CLEVO_MAIL is not set
-# CONFIG_LEDS_PCA955X is not set
-# CONFIG_LEDS_BD2802 is not set
-
-#
-# LED Triggers
-#
-CONFIG_LEDS_TRIGGERS=y
-# CONFIG_LEDS_TRIGGER_TIMER is not set
-# CONFIG_LEDS_TRIGGER_HEARTBEAT is not set
-# CONFIG_LEDS_TRIGGER_BACKLIGHT is not set
-# CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set
-
-#
-# iptables trigger is under Netfilter config (LED target)
-#
-# CONFIG_ACCESSIBILITY is not set
-# CONFIG_INFINIBAND is not set
-CONFIG_EDAC=y
-
-#
-# Reporting subsystems
-#
-# CONFIG_EDAC_DEBUG is not set
-# CONFIG_EDAC_MM_EDAC is not set
-CONFIG_RTC_LIB=y
CONFIG_RTC_CLASS=y
# CONFIG_RTC_HCTOSYS is not set
-# CONFIG_RTC_DEBUG is not set
-
-#
-# RTC interfaces
-#
-CONFIG_RTC_INTF_SYSFS=y
-CONFIG_RTC_INTF_PROC=y
-CONFIG_RTC_INTF_DEV=y
-# CONFIG_RTC_INTF_DEV_UIE_EMUL is not set
-# CONFIG_RTC_DRV_TEST is not set
-
-#
-# I2C RTC drivers
-#
-# CONFIG_RTC_DRV_DS1307 is not set
-# CONFIG_RTC_DRV_DS1374 is not set
-# CONFIG_RTC_DRV_DS1672 is not set
-# CONFIG_RTC_DRV_MAX6900 is not set
-# CONFIG_RTC_DRV_RS5C372 is not set
-# CONFIG_RTC_DRV_ISL1208 is not set
-# CONFIG_RTC_DRV_X1205 is not set
-# CONFIG_RTC_DRV_PCF8563 is not set
-# CONFIG_RTC_DRV_PCF8583 is not set
-# CONFIG_RTC_DRV_M41T80 is not set
-# CONFIG_RTC_DRV_S35390A is not set
-# CONFIG_RTC_DRV_FM3130 is not set
-# CONFIG_RTC_DRV_RX8581 is not set
-
-#
-# SPI RTC drivers
-#
-
-#
-# Platform RTC drivers
-#
-CONFIG_RTC_DRV_CMOS=y
-# CONFIG_RTC_DRV_DS1286 is not set
-# CONFIG_RTC_DRV_DS1511 is not set
-# CONFIG_RTC_DRV_DS1553 is not set
-# CONFIG_RTC_DRV_DS1742 is not set
-# CONFIG_RTC_DRV_STK17TA8 is not set
-# CONFIG_RTC_DRV_M48T86 is not set
-# CONFIG_RTC_DRV_M48T35 is not set
-# CONFIG_RTC_DRV_M48T59 is not set
-# CONFIG_RTC_DRV_BQ4802 is not set
-# CONFIG_RTC_DRV_V3020 is not set
-
-#
-# on-CPU RTC drivers
-#
CONFIG_DMADEVICES=y
-
-#
-# DMA Devices
-#
-# CONFIG_INTEL_IOATDMA is not set
-# CONFIG_AUXDISPLAY is not set
-# CONFIG_UIO is not set
-# CONFIG_STAGING is not set
-CONFIG_X86_PLATFORM_DEVICES=y
-# CONFIG_ACER_WMI is not set
-# CONFIG_ASUS_LAPTOP is not set
-# CONFIG_FUJITSU_LAPTOP is not set
-# CONFIG_TC1100_WMI is not set
-# CONFIG_MSI_LAPTOP is not set
-# CONFIG_PANASONIC_LAPTOP is not set
-# CONFIG_COMPAL_LAPTOP is not set
-# CONFIG_SONY_LAPTOP is not set
-# CONFIG_THINKPAD_ACPI is not set
-# CONFIG_INTEL_MENLOW is not set
+CONFIG_VIRTIO_PCI=y
+CONFIG_VIRTIO_INPUT=y
CONFIG_EEEPC_LAPTOP=y
-# CONFIG_ACPI_WMI is not set
-# CONFIG_ACPI_ASUS is not set
-# CONFIG_ACPI_TOSHIBA is not set
-
-#
-# Firmware Drivers
-#
-# CONFIG_EDD is not set
-CONFIG_FIRMWARE_MEMMAP=y
-CONFIG_EFI_VARS=y
-# CONFIG_DELL_RBU is not set
-# CONFIG_DCDBAS is not set
-CONFIG_DMIID=y
-# CONFIG_ISCSI_IBFT_FIND is not set
-
-#
-# File systems
-#
-# CONFIG_EXT2_FS is not set
-CONFIG_EXT3_FS=y
-# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
-CONFIG_EXT3_FS_XATTR=y
-CONFIG_EXT3_FS_POSIX_ACL=y
-CONFIG_EXT3_FS_SECURITY=y
-# CONFIG_EXT4_FS is not set
-CONFIG_JBD=y
-# CONFIG_JBD_DEBUG is not set
-CONFIG_FS_MBCACHE=y
-# CONFIG_REISERFS_FS is not set
-# CONFIG_JFS_FS is not set
-CONFIG_FS_POSIX_ACL=y
-CONFIG_FILE_LOCKING=y
-# CONFIG_XFS_FS is not set
-# CONFIG_OCFS2_FS is not set
-# CONFIG_BTRFS_FS is not set
-CONFIG_DNOTIFY=y
-CONFIG_INOTIFY=y
-CONFIG_INOTIFY_USER=y
+CONFIG_EXT4_FS=y
+CONFIG_EXT4_FS_POSIX_ACL=y
+CONFIG_EXT4_FS_SECURITY=y
CONFIG_QUOTA=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
-# CONFIG_PRINT_QUOTA_WARNING is not set
-CONFIG_QUOTA_TREE=y
-# CONFIG_QFMT_V1 is not set
CONFIG_QFMT_V2=y
-CONFIG_QUOTACTL=y
-# CONFIG_AUTOFS_FS is not set
-CONFIG_AUTOFS4_FS=y
-# CONFIG_FUSE_FS is not set
-CONFIG_GENERIC_ACL=y
-
-#
-# Caches
-#
-# CONFIG_FSCACHE is not set
-
-#
-# CD-ROM/DVD Filesystems
-#
+CONFIG_AUTOFS_FS=y
CONFIG_ISO9660_FS=y
CONFIG_JOLIET=y
CONFIG_ZISOFS=y
-# CONFIG_UDF_FS is not set
-
-#
-# DOS/FAT/NT Filesystems
-#
-CONFIG_FAT_FS=y
CONFIG_MSDOS_FS=y
CONFIG_VFAT_FS=y
-CONFIG_FAT_DEFAULT_CODEPAGE=437
-CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1"
-# CONFIG_NTFS_FS is not set
-
-#
-# Pseudo filesystems
-#
-CONFIG_PROC_FS=y
CONFIG_PROC_KCORE=y
-CONFIG_PROC_VMCORE=y
-CONFIG_PROC_SYSCTL=y
-CONFIG_PROC_PAGE_MONITOR=y
-CONFIG_SYSFS=y
-CONFIG_TMPFS=y
CONFIG_TMPFS_POSIX_ACL=y
CONFIG_HUGETLBFS=y
-CONFIG_HUGETLB_PAGE=y
-# CONFIG_CONFIGFS_FS is not set
-CONFIG_MISC_FILESYSTEMS=y
-# CONFIG_ADFS_FS is not set
-# CONFIG_AFFS_FS is not set
-# CONFIG_ECRYPT_FS is not set
-# CONFIG_HFS_FS is not set
-# CONFIG_HFSPLUS_FS is not set
-# CONFIG_BEFS_FS is not set
-# CONFIG_BFS_FS is not set
-# CONFIG_EFS_FS is not set
-# CONFIG_CRAMFS is not set
-# CONFIG_SQUASHFS is not set
-# CONFIG_VXFS_FS is not set
-# CONFIG_MINIX_FS is not set
-# CONFIG_OMFS_FS is not set
-# CONFIG_HPFS_FS is not set
-# CONFIG_QNX4FS_FS is not set
-# CONFIG_ROMFS_FS is not set
-# CONFIG_SYSV_FS is not set
-# CONFIG_UFS_FS is not set
-# CONFIG_NILFS2_FS is not set
-CONFIG_NETWORK_FILESYSTEMS=y
CONFIG_NFS_FS=y
-CONFIG_NFS_V3=y
CONFIG_NFS_V3_ACL=y
CONFIG_NFS_V4=y
CONFIG_ROOT_NFS=y
-# CONFIG_NFSD is not set
-CONFIG_LOCKD=y
-CONFIG_LOCKD_V4=y
-CONFIG_NFS_ACL_SUPPORT=y
-CONFIG_NFS_COMMON=y
-CONFIG_SUNRPC=y
-CONFIG_SUNRPC_GSS=y
-CONFIG_RPCSEC_GSS_KRB5=y
-# CONFIG_RPCSEC_GSS_SPKM3 is not set
-# CONFIG_SMB_FS is not set
-# CONFIG_CIFS is not set
-# CONFIG_NCP_FS is not set
-# CONFIG_CODA_FS is not set
-# CONFIG_AFS_FS is not set
-
-#
-# Partition Types
-#
-CONFIG_PARTITION_ADVANCED=y
-# CONFIG_ACORN_PARTITION is not set
-CONFIG_OSF_PARTITION=y
-CONFIG_AMIGA_PARTITION=y
-# CONFIG_ATARI_PARTITION is not set
-CONFIG_MAC_PARTITION=y
-CONFIG_MSDOS_PARTITION=y
-CONFIG_BSD_DISKLABEL=y
-CONFIG_MINIX_SUBPARTITION=y
-CONFIG_SOLARIS_X86_PARTITION=y
-CONFIG_UNIXWARE_DISKLABEL=y
-# CONFIG_LDM_PARTITION is not set
-CONFIG_SGI_PARTITION=y
-# CONFIG_ULTRIX_PARTITION is not set
-CONFIG_SUN_PARTITION=y
-CONFIG_KARMA_PARTITION=y
-CONFIG_EFI_PARTITION=y
-# CONFIG_SYSV68_PARTITION is not set
-CONFIG_NLS=y
+CONFIG_9P_FS=y
CONFIG_NLS_DEFAULT="utf8"
CONFIG_NLS_CODEPAGE_437=y
-# CONFIG_NLS_CODEPAGE_737 is not set
-# CONFIG_NLS_CODEPAGE_775 is not set
-# CONFIG_NLS_CODEPAGE_850 is not set
-# CONFIG_NLS_CODEPAGE_852 is not set
-# CONFIG_NLS_CODEPAGE_855 is not set
-# CONFIG_NLS_CODEPAGE_857 is not set
-# CONFIG_NLS_CODEPAGE_860 is not set
-# CONFIG_NLS_CODEPAGE_861 is not set
-# CONFIG_NLS_CODEPAGE_862 is not set
-# CONFIG_NLS_CODEPAGE_863 is not set
-# CONFIG_NLS_CODEPAGE_864 is not set
-# CONFIG_NLS_CODEPAGE_865 is not set
-# CONFIG_NLS_CODEPAGE_866 is not set
-# CONFIG_NLS_CODEPAGE_869 is not set
-# CONFIG_NLS_CODEPAGE_936 is not set
-# CONFIG_NLS_CODEPAGE_950 is not set
-# CONFIG_NLS_CODEPAGE_932 is not set
-# CONFIG_NLS_CODEPAGE_949 is not set
-# CONFIG_NLS_CODEPAGE_874 is not set
-# CONFIG_NLS_ISO8859_8 is not set
-# CONFIG_NLS_CODEPAGE_1250 is not set
-# CONFIG_NLS_CODEPAGE_1251 is not set
CONFIG_NLS_ASCII=y
CONFIG_NLS_ISO8859_1=y
-# CONFIG_NLS_ISO8859_2 is not set
-# CONFIG_NLS_ISO8859_3 is not set
-# CONFIG_NLS_ISO8859_4 is not set
-# CONFIG_NLS_ISO8859_5 is not set
-# CONFIG_NLS_ISO8859_6 is not set
-# CONFIG_NLS_ISO8859_7 is not set
-# CONFIG_NLS_ISO8859_9 is not set
-# CONFIG_NLS_ISO8859_13 is not set
-# CONFIG_NLS_ISO8859_14 is not set
-# CONFIG_NLS_ISO8859_15 is not set
-# CONFIG_NLS_KOI8_R is not set
-# CONFIG_NLS_KOI8_U is not set
CONFIG_NLS_UTF8=y
-# CONFIG_DLM is not set
-
-#
-# Kernel hacking
-#
-CONFIG_TRACE_IRQFLAGS_SUPPORT=y
+CONFIG_SECURITY=y
+CONFIG_SECURITY_NETWORK=y
+CONFIG_SECURITY_SELINUX=y
+CONFIG_SECURITY_SELINUX_BOOTPARAM=y
CONFIG_PRINTK_TIME=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
-CONFIG_ENABLE_MUST_CHECK=y
-CONFIG_FRAME_WARN=2048
-CONFIG_MAGIC_SYSRQ=y
-# CONFIG_UNUSED_SYMBOLS is not set
-CONFIG_DEBUG_FS=y
-# CONFIG_HEADERS_CHECK is not set
CONFIG_DEBUG_KERNEL=y
-# CONFIG_DEBUG_SHIRQ is not set
-# CONFIG_DETECT_SOFTLOCKUP is not set
-# CONFIG_DETECT_HUNG_TASK is not set
-# CONFIG_SCHED_DEBUG is not set
+CONFIG_MAGIC_SYSRQ=y
+CONFIG_DEBUG_STACK_USAGE=y
CONFIG_SCHEDSTATS=y
-CONFIG_TIMER_STATS=y
-# CONFIG_DEBUG_OBJECTS is not set
-# CONFIG_SLUB_DEBUG_ON is not set
-# CONFIG_SLUB_STATS is not set
-# CONFIG_DEBUG_RT_MUTEXES is not set
-# CONFIG_RT_MUTEX_TESTER is not set
-# CONFIG_DEBUG_SPINLOCK is not set
-# CONFIG_DEBUG_MUTEXES is not set
-# CONFIG_DEBUG_LOCK_ALLOC is not set
-# CONFIG_PROVE_LOCKING is not set
-# CONFIG_LOCK_STAT is not set
-# CONFIG_DEBUG_SPINLOCK_SLEEP is not set
-# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
-CONFIG_STACKTRACE=y
-# CONFIG_DEBUG_KOBJECT is not set
-# CONFIG_DEBUG_HIGHMEM is not set
-CONFIG_DEBUG_BUGVERBOSE=y
-# CONFIG_DEBUG_INFO is not set
-# CONFIG_DEBUG_VM is not set
-# CONFIG_DEBUG_VIRTUAL is not set
-# CONFIG_DEBUG_WRITECOUNT is not set
-CONFIG_DEBUG_MEMORY_INIT=y
-# CONFIG_DEBUG_LIST is not set
-# CONFIG_DEBUG_SG is not set
-# CONFIG_DEBUG_NOTIFIERS is not set
-CONFIG_ARCH_WANT_FRAME_POINTERS=y
-CONFIG_FRAME_POINTER=y
-# CONFIG_BOOT_PRINTK_DELAY is not set
-# CONFIG_RCU_TORTURE_TEST is not set
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
-# CONFIG_KPROBES_SANITY_TEST is not set
-# CONFIG_BACKTRACE_SELF_TEST is not set
-# CONFIG_DEBUG_BLOCK_EXT_DEVT is not set
-# CONFIG_LKDTM is not set
-# CONFIG_FAULT_INJECTION is not set
-# CONFIG_LATENCYTOP is not set
-CONFIG_SYSCTL_SYSCALL_CHECK=y
-# CONFIG_DEBUG_PAGEALLOC is not set
-CONFIG_USER_STACKTRACE_SUPPORT=y
-CONFIG_NOP_TRACER=y
-CONFIG_HAVE_FUNCTION_TRACER=y
-CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y
-CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
-CONFIG_HAVE_DYNAMIC_FTRACE=y
-CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
-CONFIG_HAVE_HW_BRANCH_TRACER=y
-CONFIG_HAVE_FTRACE_SYSCALLS=y
-CONFIG_RING_BUFFER=y
-CONFIG_TRACING=y
-CONFIG_TRACING_SUPPORT=y
-
-#
-# Tracers
-#
-# CONFIG_FUNCTION_TRACER is not set
-# CONFIG_IRQSOFF_TRACER is not set
-# CONFIG_SYSPROF_TRACER is not set
-# CONFIG_SCHED_TRACER is not set
-# CONFIG_CONTEXT_SWITCH_TRACER is not set
-# CONFIG_EVENT_TRACER is not set
-# CONFIG_FTRACE_SYSCALLS is not set
-# CONFIG_BOOT_TRACER is not set
-# CONFIG_TRACE_BRANCH_PROFILING is not set
-# CONFIG_POWER_TRACER is not set
-# CONFIG_STACK_TRACER is not set
-# CONFIG_HW_BRANCH_TRACER is not set
-# CONFIG_KMEMTRACE is not set
-# CONFIG_WORKQUEUE_TRACER is not set
CONFIG_BLK_DEV_IO_TRACE=y
-# CONFIG_FTRACE_STARTUP_TEST is not set
-# CONFIG_MMIOTRACE is not set
CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
-# CONFIG_DYNAMIC_DEBUG is not set
-# CONFIG_DMA_API_DEBUG is not set
-# CONFIG_SAMPLES is not set
-CONFIG_HAVE_ARCH_KGDB=y
-# CONFIG_KGDB is not set
-# CONFIG_STRICT_DEVMEM is not set
-CONFIG_X86_VERBOSE_BOOTUP=y
-CONFIG_EARLY_PRINTK=y
CONFIG_EARLY_PRINTK_DBGP=y
-CONFIG_DEBUG_STACKOVERFLOW=y
-CONFIG_DEBUG_STACK_USAGE=y
-# CONFIG_DEBUG_PER_CPU_MAPS is not set
-# CONFIG_X86_PTDUMP is not set
-CONFIG_DEBUG_RODATA=y
-# CONFIG_DEBUG_RODATA_TEST is not set
-CONFIG_DEBUG_NX_TEST=m
-# CONFIG_4KSTACKS is not set
-CONFIG_DOUBLEFAULT=y
-CONFIG_HAVE_MMIOTRACE_SUPPORT=y
-CONFIG_IO_DELAY_TYPE_0X80=0
-CONFIG_IO_DELAY_TYPE_0XED=1
-CONFIG_IO_DELAY_TYPE_UDELAY=2
-CONFIG_IO_DELAY_TYPE_NONE=3
-CONFIG_IO_DELAY_0X80=y
-# CONFIG_IO_DELAY_0XED is not set
-# CONFIG_IO_DELAY_UDELAY is not set
-# CONFIG_IO_DELAY_NONE is not set
-CONFIG_DEFAULT_IO_DELAY_TYPE=0
CONFIG_DEBUG_BOOT_PARAMS=y
-# CONFIG_CPA_DEBUG is not set
-CONFIG_OPTIMIZE_INLINING=y
-
-#
-# Security options
-#
-CONFIG_KEYS=y
-CONFIG_KEYS_DEBUG_PROC_KEYS=y
-CONFIG_SECURITY=y
-# CONFIG_SECURITYFS is not set
-CONFIG_SECURITY_NETWORK=y
-# CONFIG_SECURITY_NETWORK_XFRM is not set
-# CONFIG_SECURITY_PATH is not set
-CONFIG_SECURITY_FILE_CAPABILITIES=y
-# CONFIG_SECURITY_ROOTPLUG is not set
-CONFIG_SECURITY_DEFAULT_MMAP_MIN_ADDR=65536
-CONFIG_SECURITY_SELINUX=y
-CONFIG_SECURITY_SELINUX_BOOTPARAM=y
-CONFIG_SECURITY_SELINUX_BOOTPARAM_VALUE=1
-CONFIG_SECURITY_SELINUX_DISABLE=y
-CONFIG_SECURITY_SELINUX_DEVELOP=y
-CONFIG_SECURITY_SELINUX_AVC_STATS=y
-CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1
-# CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set
-# CONFIG_SECURITY_SMACK is not set
-# CONFIG_SECURITY_TOMOYO is not set
-# CONFIG_IMA is not set
-CONFIG_CRYPTO=y
-
-#
-# Crypto core or helper
-#
-# CONFIG_CRYPTO_FIPS is not set
-CONFIG_CRYPTO_ALGAPI=y
-CONFIG_CRYPTO_ALGAPI2=y
-CONFIG_CRYPTO_AEAD=y
-CONFIG_CRYPTO_AEAD2=y
-CONFIG_CRYPTO_BLKCIPHER=y
-CONFIG_CRYPTO_BLKCIPHER2=y
-CONFIG_CRYPTO_HASH=y
-CONFIG_CRYPTO_HASH2=y
-CONFIG_CRYPTO_RNG2=y
-CONFIG_CRYPTO_PCOMP=y
-CONFIG_CRYPTO_MANAGER=y
-CONFIG_CRYPTO_MANAGER2=y
-# CONFIG_CRYPTO_GF128MUL is not set
-# CONFIG_CRYPTO_NULL is not set
-CONFIG_CRYPTO_WORKQUEUE=y
-# CONFIG_CRYPTO_CRYPTD is not set
-CONFIG_CRYPTO_AUTHENC=y
-# CONFIG_CRYPTO_TEST is not set
-
-#
-# Authenticated Encryption with Associated Data
-#
-# CONFIG_CRYPTO_CCM is not set
-# CONFIG_CRYPTO_GCM is not set
-# CONFIG_CRYPTO_SEQIV is not set
-
-#
-# Block modes
-#
-CONFIG_CRYPTO_CBC=y
-# CONFIG_CRYPTO_CTR is not set
-# CONFIG_CRYPTO_CTS is not set
-CONFIG_CRYPTO_ECB=y
-# CONFIG_CRYPTO_LRW is not set
-# CONFIG_CRYPTO_PCBC is not set
-# CONFIG_CRYPTO_XTS is not set
-
-#
-# Hash modes
-#
-CONFIG_CRYPTO_HMAC=y
-# CONFIG_CRYPTO_XCBC is not set
-
-#
-# Digest
-#
-# CONFIG_CRYPTO_CRC32C is not set
-# CONFIG_CRYPTO_CRC32C_INTEL is not set
-# CONFIG_CRYPTO_MD4 is not set
-CONFIG_CRYPTO_MD5=y
-# CONFIG_CRYPTO_MICHAEL_MIC is not set
-# CONFIG_CRYPTO_RMD128 is not set
-# CONFIG_CRYPTO_RMD160 is not set
-# CONFIG_CRYPTO_RMD256 is not set
-# CONFIG_CRYPTO_RMD320 is not set
-CONFIG_CRYPTO_SHA1=y
-# CONFIG_CRYPTO_SHA256 is not set
-# CONFIG_CRYPTO_SHA512 is not set
-# CONFIG_CRYPTO_TGR192 is not set
-# CONFIG_CRYPTO_WP512 is not set
-
-#
-# Ciphers
-#
-CONFIG_CRYPTO_AES=y
-CONFIG_CRYPTO_AES_586=y
-# CONFIG_CRYPTO_ANUBIS is not set
-CONFIG_CRYPTO_ARC4=y
-# CONFIG_CRYPTO_BLOWFISH is not set
-# CONFIG_CRYPTO_CAMELLIA is not set
-# CONFIG_CRYPTO_CAST5 is not set
-# CONFIG_CRYPTO_CAST6 is not set
-CONFIG_CRYPTO_DES=y
-# CONFIG_CRYPTO_FCRYPT is not set
-# CONFIG_CRYPTO_KHAZAD is not set
-# CONFIG_CRYPTO_SALSA20 is not set
-# CONFIG_CRYPTO_SALSA20_586 is not set
-# CONFIG_CRYPTO_SEED is not set
-# CONFIG_CRYPTO_SERPENT is not set
-# CONFIG_CRYPTO_TEA is not set
-# CONFIG_CRYPTO_TWOFISH is not set
-# CONFIG_CRYPTO_TWOFISH_586 is not set
-
-#
-# Compression
-#
-# CONFIG_CRYPTO_DEFLATE is not set
-# CONFIG_CRYPTO_ZLIB is not set
-# CONFIG_CRYPTO_LZO is not set
-
-#
-# Random Number Generation
-#
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
-CONFIG_CRYPTO_HW=y
-# CONFIG_CRYPTO_DEV_PADLOCK is not set
-# CONFIG_CRYPTO_DEV_GEODE is not set
-# CONFIG_CRYPTO_DEV_HIFN_795X is not set
-CONFIG_HAVE_KVM=y
-CONFIG_HAVE_KVM_IRQCHIP=y
-CONFIG_VIRTUALIZATION=y
-# CONFIG_KVM is not set
-# CONFIG_LGUEST is not set
-# CONFIG_VIRTIO_PCI is not set
-# CONFIG_VIRTIO_BALLOON is not set
-CONFIG_BINARY_PRINTF=y
-
-#
-# Library routines
-#
-CONFIG_BITREVERSE=y
-CONFIG_GENERIC_FIND_FIRST_BIT=y
-CONFIG_GENERIC_FIND_NEXT_BIT=y
-CONFIG_GENERIC_FIND_LAST_BIT=y
-# CONFIG_CRC_CCITT is not set
-# CONFIG_CRC16 is not set
-CONFIG_CRC_T10DIF=y
-# CONFIG_CRC_ITU_T is not set
-CONFIG_CRC32=y
-# CONFIG_CRC7 is not set
-# CONFIG_LIBCRC32C is not set
-CONFIG_AUDIT_GENERIC=y
-CONFIG_ZLIB_INFLATE=y
-CONFIG_DECOMPRESS_GZIP=y
-CONFIG_DECOMPRESS_BZIP2=y
-CONFIG_DECOMPRESS_LZMA=y
-CONFIG_HAS_IOMEM=y
-CONFIG_HAS_IOPORT=y
-CONFIG_HAS_DMA=y
-CONFIG_NLATTR=y
+CONFIG_DEBUG_ENTRY=y
diff --git a/arch/x86/configs/tiny.config b/arch/x86/configs/tiny.config
new file mode 100644
index 000000000000..aabafa3faa6d
--- /dev/null
+++ b/arch/x86/configs/tiny.config
@@ -0,0 +1,2 @@
+CONFIG_NOHIGHMEM=y
+CONFIG_UNWINDER_GUESS=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index cee1dd2e69b2..7d7310cdf8b0 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -1,519 +1,70 @@
-#
-# Automatically generated make config: don't edit
-# Linux kernel version: 2.6.30-rc2
-# Mon May 11 16:22:00 2009
-#
-CONFIG_64BIT=y
-# CONFIG_X86_32 is not set
-CONFIG_X86_64=y
-CONFIG_X86=y
-CONFIG_OUTPUT_FORMAT="elf64-x86-64"
-CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig"
-CONFIG_GENERIC_TIME=y
-CONFIG_GENERIC_CMOS_UPDATE=y
-CONFIG_CLOCKSOURCE_WATCHDOG=y
-CONFIG_GENERIC_CLOCKEVENTS=y
-CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y
-CONFIG_LOCKDEP_SUPPORT=y
-CONFIG_STACKTRACE_SUPPORT=y
-CONFIG_HAVE_LATENCYTOP_SUPPORT=y
-CONFIG_FAST_CMPXCHG_LOCAL=y
-CONFIG_MMU=y
-CONFIG_ZONE_DMA=y
-CONFIG_GENERIC_ISA_DMA=y
-CONFIG_GENERIC_IOMAP=y
-CONFIG_GENERIC_BUG=y
-CONFIG_GENERIC_BUG_RELATIVE_POINTERS=y
-CONFIG_GENERIC_HWEIGHT=y
-CONFIG_ARCH_MAY_HAVE_PC_FDC=y
-CONFIG_RWSEM_GENERIC_SPINLOCK=y
-# CONFIG_RWSEM_XCHGADD_ALGORITHM is not set
-CONFIG_ARCH_HAS_CPU_IDLE_WAIT=y
-CONFIG_GENERIC_CALIBRATE_DELAY=y
-CONFIG_GENERIC_TIME_VSYSCALL=y
-CONFIG_ARCH_HAS_CPU_RELAX=y
-CONFIG_ARCH_HAS_DEFAULT_IDLE=y
-CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y
-CONFIG_HAVE_SETUP_PER_CPU_AREA=y
-CONFIG_HAVE_DYNAMIC_PER_CPU_AREA=y
-CONFIG_HAVE_CPUMASK_OF_CPU_MAP=y
-CONFIG_ARCH_HIBERNATION_POSSIBLE=y
-CONFIG_ARCH_SUSPEND_POSSIBLE=y
-CONFIG_ZONE_DMA32=y
-CONFIG_ARCH_POPULATES_NODE_MAP=y
-CONFIG_AUDIT_ARCH=y
-CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y
-CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y
-CONFIG_GENERIC_HARDIRQS=y
-CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y
-CONFIG_GENERIC_IRQ_PROBE=y
-CONFIG_GENERIC_PENDING_IRQ=y
-CONFIG_USE_GENERIC_SMP_HELPERS=y
-CONFIG_X86_64_SMP=y
-CONFIG_X86_HT=y
-CONFIG_X86_TRAMPOLINE=y
-# CONFIG_KTIME_SCALAR is not set
-CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
-
-#
-# General setup
-#
-CONFIG_EXPERIMENTAL=y
-CONFIG_LOCK_KERNEL=y
-CONFIG_INIT_ENV_ARG_LIMIT=32
-CONFIG_LOCALVERSION=""
-# CONFIG_LOCALVERSION_AUTO is not set
-CONFIG_HAVE_KERNEL_GZIP=y
-CONFIG_HAVE_KERNEL_BZIP2=y
-CONFIG_HAVE_KERNEL_LZMA=y
-CONFIG_KERNEL_GZIP=y
-# CONFIG_KERNEL_BZIP2 is not set
-# CONFIG_KERNEL_LZMA is not set
-CONFIG_SWAP=y
+CONFIG_WERROR=y
CONFIG_SYSVIPC=y
-CONFIG_SYSVIPC_SYSCTL=y
CONFIG_POSIX_MQUEUE=y
-CONFIG_POSIX_MQUEUE_SYSCTL=y
+CONFIG_AUDIT=y
+CONFIG_NO_HZ=y
+CONFIG_HIGH_RES_TIMERS=y
+CONFIG_PREEMPT_VOLUNTARY=y
CONFIG_BSD_PROCESS_ACCT=y
-# CONFIG_BSD_PROCESS_ACCT_V3 is not set
CONFIG_TASKSTATS=y
CONFIG_TASK_DELAY_ACCT=y
CONFIG_TASK_XACCT=y
CONFIG_TASK_IO_ACCOUNTING=y
-CONFIG_AUDIT=y
-CONFIG_AUDITSYSCALL=y
-CONFIG_AUDIT_TREE=y
-
-#
-# RCU Subsystem
-#
-# CONFIG_CLASSIC_RCU is not set
-CONFIG_TREE_RCU=y
-# CONFIG_PREEMPT_RCU is not set
-# CONFIG_RCU_TRACE is not set
-CONFIG_RCU_FANOUT=64
-# CONFIG_RCU_FANOUT_EXACT is not set
-# CONFIG_TREE_RCU_TRACE is not set
-# CONFIG_PREEMPT_RCU_TRACE is not set
-# CONFIG_IKCONFIG is not set
CONFIG_LOG_BUF_SHIFT=18
-CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y
-CONFIG_GROUP_SCHED=y
-CONFIG_FAIR_GROUP_SCHED=y
-# CONFIG_RT_GROUP_SCHED is not set
-# CONFIG_USER_SCHED is not set
-CONFIG_CGROUP_SCHED=y
CONFIG_CGROUPS=y
-# CONFIG_CGROUP_DEBUG is not set
-CONFIG_CGROUP_NS=y
+CONFIG_BLK_CGROUP=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_CGROUP_PIDS=y
+CONFIG_CGROUP_RDMA=y
CONFIG_CGROUP_FREEZER=y
-# CONFIG_CGROUP_DEVICE is not set
+CONFIG_CGROUP_HUGETLB=y
CONFIG_CPUSETS=y
-CONFIG_PROC_PID_CPUSET=y
+CONFIG_CGROUP_DEVICE=y
CONFIG_CGROUP_CPUACCT=y
-CONFIG_RESOURCE_COUNTERS=y
-# CONFIG_CGROUP_MEM_RES_CTLR is not set
-# CONFIG_SYSFS_DEPRECATED_V2 is not set
-CONFIG_RELAY=y
-CONFIG_NAMESPACES=y
-CONFIG_UTS_NS=y
-CONFIG_IPC_NS=y
-CONFIG_USER_NS=y
-CONFIG_PID_NS=y
-CONFIG_NET_NS=y
+CONFIG_CGROUP_PERF=y
+CONFIG_CGROUP_MISC=y
+CONFIG_CGROUP_DEBUG=y
CONFIG_BLK_DEV_INITRD=y
-CONFIG_INITRAMFS_SOURCE=""
-CONFIG_RD_GZIP=y
-CONFIG_RD_BZIP2=y
-CONFIG_RD_LZMA=y
-CONFIG_CC_OPTIMIZE_FOR_SIZE=y
-CONFIG_SYSCTL=y
-CONFIG_ANON_INODES=y
-# CONFIG_EMBEDDED is not set
-CONFIG_UID16=y
-CONFIG_SYSCTL_SYSCALL=y
-CONFIG_KALLSYMS=y
CONFIG_KALLSYMS_ALL=y
-CONFIG_KALLSYMS_EXTRA_PASS=y
-# CONFIG_STRIP_ASM_SYMS is not set
-CONFIG_HOTPLUG=y
-CONFIG_PRINTK=y
-CONFIG_BUG=y
-CONFIG_ELF_CORE=y
-CONFIG_PCSPKR_PLATFORM=y
-CONFIG_BASE_FULL=y
-CONFIG_FUTEX=y
-CONFIG_EPOLL=y
-CONFIG_SIGNALFD=y
-CONFIG_TIMERFD=y
-CONFIG_EVENTFD=y
-CONFIG_SHMEM=y
-CONFIG_AIO=y
-CONFIG_VM_EVENT_COUNTERS=y
-CONFIG_PCI_QUIRKS=y
-CONFIG_SLUB_DEBUG=y
-# CONFIG_COMPAT_BRK is not set
-# CONFIG_SLAB is not set
-CONFIG_SLUB=y
-# CONFIG_SLOB is not set
CONFIG_PROFILING=y
-CONFIG_TRACEPOINTS=y
-CONFIG_MARKERS=y
-# CONFIG_OPROFILE is not set
-CONFIG_HAVE_OPROFILE=y
-CONFIG_KPROBES=y
-CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y
-CONFIG_KRETPROBES=y
-CONFIG_HAVE_IOREMAP_PROT=y
-CONFIG_HAVE_KPROBES=y
-CONFIG_HAVE_KRETPROBES=y
-CONFIG_HAVE_ARCH_TRACEHOOK=y
-CONFIG_HAVE_DMA_API_DEBUG=y
-# CONFIG_SLOW_WORK is not set
-# CONFIG_HAVE_GENERIC_DMA_COHERENT is not set
-CONFIG_SLABINFO=y
-CONFIG_RT_MUTEXES=y
-CONFIG_BASE_SMALL=0
-CONFIG_MODULES=y
-# CONFIG_MODULE_FORCE_LOAD is not set
-CONFIG_MODULE_UNLOAD=y
-CONFIG_MODULE_FORCE_UNLOAD=y
-# CONFIG_MODVERSIONS is not set
-# CONFIG_MODULE_SRCVERSION_ALL is not set
-CONFIG_STOP_MACHINE=y
-CONFIG_BLOCK=y
-CONFIG_BLK_DEV_BSG=y
-# CONFIG_BLK_DEV_INTEGRITY is not set
-CONFIG_BLOCK_COMPAT=y
-
-#
-# IO Schedulers
-#
-CONFIG_IOSCHED_NOOP=y
-CONFIG_IOSCHED_AS=y
-CONFIG_IOSCHED_DEADLINE=y
-CONFIG_IOSCHED_CFQ=y
-# CONFIG_DEFAULT_AS is not set
-# CONFIG_DEFAULT_DEADLINE is not set
-CONFIG_DEFAULT_CFQ=y
-# CONFIG_DEFAULT_NOOP is not set
-CONFIG_DEFAULT_IOSCHED="cfq"
-CONFIG_FREEZER=y
-
-#
-# Processor type and features
-#
-CONFIG_TICK_ONESHOT=y
-CONFIG_NO_HZ=y
-CONFIG_HIGH_RES_TIMERS=y
-CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
+CONFIG_KEXEC=y
CONFIG_SMP=y
-CONFIG_SPARSE_IRQ=y
-CONFIG_X86_MPPARSE=y
-CONFIG_X86_EXTENDED_PLATFORM=y
-# CONFIG_X86_VSMP is not set
-# CONFIG_X86_UV is not set
-CONFIG_SCHED_OMIT_FRAME_POINTER=y
-# CONFIG_PARAVIRT_GUEST is not set
-# CONFIG_MEMTEST is not set
-# CONFIG_M386 is not set
-# CONFIG_M486 is not set
-# CONFIG_M586 is not set
-# CONFIG_M586TSC is not set
-# CONFIG_M586MMX is not set
-# CONFIG_M686 is not set
-# CONFIG_MPENTIUMII is not set
-# CONFIG_MPENTIUMIII is not set
-# CONFIG_MPENTIUMM is not set
-# CONFIG_MPENTIUM4 is not set
-# CONFIG_MK6 is not set
-# CONFIG_MK7 is not set
-# CONFIG_MK8 is not set
-# CONFIG_MCRUSOE is not set
-# CONFIG_MEFFICEON is not set
-# CONFIG_MWINCHIPC6 is not set
-# CONFIG_MWINCHIP3D is not set
-# CONFIG_MGEODEGX1 is not set
-# CONFIG_MGEODE_LX is not set
-# CONFIG_MCYRIXIII is not set
-# CONFIG_MVIAC3_2 is not set
-# CONFIG_MVIAC7 is not set
-# CONFIG_MPSC is not set
-# CONFIG_MCORE2 is not set
-CONFIG_GENERIC_CPU=y
-CONFIG_X86_CPU=y
-CONFIG_X86_L1_CACHE_BYTES=64
-CONFIG_X86_INTERNODE_CACHE_BYTES=64
-CONFIG_X86_CMPXCHG=y
-CONFIG_X86_L1_CACHE_SHIFT=6
-CONFIG_X86_WP_WORKS_OK=y
-CONFIG_X86_TSC=y
-CONFIG_X86_CMPXCHG64=y
-CONFIG_X86_CMOV=y
-CONFIG_X86_MINIMUM_CPU_FAMILY=64
-CONFIG_X86_DEBUGCTLMSR=y
-CONFIG_CPU_SUP_INTEL=y
-CONFIG_CPU_SUP_AMD=y
-CONFIG_CPU_SUP_CENTAUR=y
-CONFIG_X86_DS=y
-CONFIG_X86_PTRACE_BTS=y
-CONFIG_HPET_TIMER=y
-CONFIG_HPET_EMULATE_RTC=y
-CONFIG_DMI=y
-CONFIG_GART_IOMMU=y
-CONFIG_CALGARY_IOMMU=y
-CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT=y
-CONFIG_AMD_IOMMU=y
-CONFIG_AMD_IOMMU_STATS=y
-CONFIG_SWIOTLB=y
-CONFIG_IOMMU_HELPER=y
-CONFIG_IOMMU_API=y
-# CONFIG_MAXSMP is not set
-CONFIG_NR_CPUS=64
-CONFIG_SCHED_SMT=y
-CONFIG_SCHED_MC=y
-# CONFIG_PREEMPT_NONE is not set
-CONFIG_PREEMPT_VOLUNTARY=y
-# CONFIG_PREEMPT is not set
-CONFIG_X86_LOCAL_APIC=y
-CONFIG_X86_IO_APIC=y
+CONFIG_HYPERVISOR_GUEST=y
+CONFIG_PARAVIRT=y
CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y
-CONFIG_X86_MCE=y
-CONFIG_X86_MCE_INTEL=y
-CONFIG_X86_MCE_AMD=y
-CONFIG_X86_MCE_THRESHOLD=y
-# CONFIG_I8K is not set
-CONFIG_MICROCODE=y
-CONFIG_MICROCODE_INTEL=y
-CONFIG_MICROCODE_AMD=y
-CONFIG_MICROCODE_OLD_INTERFACE=y
CONFIG_X86_MSR=y
CONFIG_X86_CPUID=y
-# CONFIG_X86_CPU_DEBUG is not set
-CONFIG_ARCH_PHYS_ADDR_T_64BIT=y
-CONFIG_DIRECT_GBPAGES=y
CONFIG_NUMA=y
-CONFIG_K8_NUMA=y
-CONFIG_X86_64_ACPI_NUMA=y
-CONFIG_NODES_SPAN_OTHER_NODES=y
-# CONFIG_NUMA_EMU is not set
-CONFIG_NODES_SHIFT=6
-CONFIG_ARCH_SPARSEMEM_DEFAULT=y
-CONFIG_ARCH_SPARSEMEM_ENABLE=y
-CONFIG_ARCH_SELECT_MEMORY_MODEL=y
-CONFIG_SELECT_MEMORY_MODEL=y
-# CONFIG_FLATMEM_MANUAL is not set
-# CONFIG_DISCONTIGMEM_MANUAL is not set
-CONFIG_SPARSEMEM_MANUAL=y
-CONFIG_SPARSEMEM=y
-CONFIG_NEED_MULTIPLE_NODES=y
-CONFIG_HAVE_MEMORY_PRESENT=y
-CONFIG_SPARSEMEM_EXTREME=y
-CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y
-CONFIG_SPARSEMEM_VMEMMAP=y
-
-#
-# Memory hotplug is currently incompatible with Software Suspend
-#
-CONFIG_PAGEFLAGS_EXTENDED=y
-CONFIG_SPLIT_PTLOCK_CPUS=4
-CONFIG_MIGRATION=y
-CONFIG_PHYS_ADDR_T_64BIT=y
-CONFIG_ZONE_DMA_FLAG=1
-CONFIG_BOUNCE=y
-CONFIG_VIRT_TO_BUS=y
-CONFIG_UNEVICTABLE_LRU=y
-CONFIG_HAVE_MLOCK=y
-CONFIG_HAVE_MLOCKED_PAGE_BIT=y
CONFIG_X86_CHECK_BIOS_CORRUPTION=y
-CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y
-CONFIG_X86_RESERVE_LOW_64K=y
-CONFIG_MTRR=y
# CONFIG_MTRR_SANITIZER is not set
-CONFIG_X86_PAT=y
CONFIG_EFI=y
-CONFIG_SECCOMP=y
-# CONFIG_CC_STACKPROTECTOR is not set
-# CONFIG_HZ_100 is not set
-# CONFIG_HZ_250 is not set
-# CONFIG_HZ_300 is not set
+CONFIG_EFI_STUB=y
+CONFIG_EFI_MIXED=y
CONFIG_HZ_1000=y
-CONFIG_HZ=1000
-CONFIG_SCHED_HRTICK=y
-CONFIG_KEXEC=y
-CONFIG_CRASH_DUMP=y
-# CONFIG_KEXEC_JUMP is not set
-CONFIG_PHYSICAL_START=0x1000000
-CONFIG_RELOCATABLE=y
-CONFIG_PHYSICAL_ALIGN=0x1000000
-CONFIG_HOTPLUG_CPU=y
-# CONFIG_COMPAT_VDSO is not set
-# CONFIG_CMDLINE_BOOL is not set
-CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
-CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y
-
-#
-# Power management and ACPI options
-#
-CONFIG_ARCH_HIBERNATION_HEADER=y
-CONFIG_PM=y
+CONFIG_HIBERNATION=y
CONFIG_PM_DEBUG=y
-# CONFIG_PM_VERBOSE is not set
-CONFIG_CAN_PM_TRACE=y
-CONFIG_PM_TRACE=y
CONFIG_PM_TRACE_RTC=y
-CONFIG_PM_SLEEP_SMP=y
-CONFIG_PM_SLEEP=y
-CONFIG_SUSPEND=y
-# CONFIG_PM_TEST_SUSPEND is not set
-CONFIG_SUSPEND_FREEZER=y
-CONFIG_HIBERNATION=y
-CONFIG_PM_STD_PARTITION=""
-CONFIG_ACPI=y
-CONFIG_ACPI_SLEEP=y
-CONFIG_ACPI_PROCFS=y
-CONFIG_ACPI_PROCFS_POWER=y
-CONFIG_ACPI_SYSFS_POWER=y
-CONFIG_ACPI_PROC_EVENT=y
-CONFIG_ACPI_AC=y
-CONFIG_ACPI_BATTERY=y
-CONFIG_ACPI_BUTTON=y
-CONFIG_ACPI_FAN=y
CONFIG_ACPI_DOCK=y
-CONFIG_ACPI_PROCESSOR=y
-CONFIG_ACPI_HOTPLUG_CPU=y
-CONFIG_ACPI_THERMAL=y
-CONFIG_ACPI_NUMA=y
-# CONFIG_ACPI_CUSTOM_DSDT is not set
-CONFIG_ACPI_BLACKLIST_YEAR=0
-# CONFIG_ACPI_DEBUG is not set
-# CONFIG_ACPI_PCI_SLOT is not set
-CONFIG_X86_PM_TIMER=y
-CONFIG_ACPI_CONTAINER=y
-# CONFIG_ACPI_SBS is not set
-
-#
-# CPU Frequency scaling
-#
-CONFIG_CPU_FREQ=y
-CONFIG_CPU_FREQ_TABLE=y
-CONFIG_CPU_FREQ_DEBUG=y
-# CONFIG_CPU_FREQ_STAT is not set
-# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set
-# CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set
+CONFIG_ACPI_BGRT=y
CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE=y
-# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set
-# CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set
-CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
-# CONFIG_CPU_FREQ_GOV_POWERSAVE is not set
-CONFIG_CPU_FREQ_GOV_USERSPACE=y
CONFIG_CPU_FREQ_GOV_ONDEMAND=y
-# CONFIG_CPU_FREQ_GOV_CONSERVATIVE is not set
-
-#
-# CPUFreq processor drivers
-#
CONFIG_X86_ACPI_CPUFREQ=y
-# CONFIG_X86_POWERNOW_K8 is not set
-# CONFIG_X86_SPEEDSTEP_CENTRINO is not set
-# CONFIG_X86_P4_CLOCKMOD is not set
-
-#
-# shared options
-#
-# CONFIG_X86_SPEEDSTEP_LIB is not set
-CONFIG_CPU_IDLE=y
-CONFIG_CPU_IDLE_GOV_LADDER=y
-CONFIG_CPU_IDLE_GOV_MENU=y
-
-#
-# Memory power savings
-#
-# CONFIG_I7300_IDLE is not set
-
-#
-# Bus options (PCI etc.)
-#
-CONFIG_PCI=y
-CONFIG_PCI_DIRECT=y
-CONFIG_PCI_MMCONFIG=y
-CONFIG_PCI_DOMAINS=y
-CONFIG_DMAR=y
-# CONFIG_DMAR_DEFAULT_ON is not set
-CONFIG_DMAR_GFX_WA=y
-CONFIG_DMAR_FLOPPY_WA=y
-# CONFIG_INTR_REMAP is not set
-CONFIG_PCIEPORTBUS=y
-# CONFIG_HOTPLUG_PCI_PCIE is not set
-CONFIG_PCIEAER=y
-# CONFIG_PCIEASPM is not set
-CONFIG_ARCH_SUPPORTS_MSI=y
-CONFIG_PCI_MSI=y
-# CONFIG_PCI_LEGACY is not set
-# CONFIG_PCI_DEBUG is not set
-# CONFIG_PCI_STUB is not set
-CONFIG_HT_IRQ=y
-# CONFIG_PCI_IOV is not set
-CONFIG_ISA_DMA_API=y
-CONFIG_K8_NB=y
-CONFIG_PCCARD=y
-# CONFIG_PCMCIA_DEBUG is not set
-CONFIG_PCMCIA=y
-CONFIG_PCMCIA_LOAD_CIS=y
-CONFIG_PCMCIA_IOCTL=y
-CONFIG_CARDBUS=y
-
-#
-# PC-card bridges
-#
-CONFIG_YENTA=y
-CONFIG_YENTA_O2=y
-CONFIG_YENTA_RICOH=y
-CONFIG_YENTA_TI=y
-CONFIG_YENTA_ENE_TUNE=y
-CONFIG_YENTA_TOSHIBA=y
-# CONFIG_PD6729 is not set
-# CONFIG_I82092 is not set
-CONFIG_PCCARD_NONSTATIC=y
-CONFIG_HOTPLUG_PCI=y
-# CONFIG_HOTPLUG_PCI_FAKE is not set
-# CONFIG_HOTPLUG_PCI_ACPI is not set
-# CONFIG_HOTPLUG_PCI_CPCI is not set
-# CONFIG_HOTPLUG_PCI_SHPC is not set
-
-#
-# Executable file formats / Emulations
-#
-CONFIG_BINFMT_ELF=y
-CONFIG_COMPAT_BINFMT_ELF=y
-CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
-# CONFIG_HAVE_AOUT is not set
-CONFIG_BINFMT_MISC=y
CONFIG_IA32_EMULATION=y
-# CONFIG_IA32_AOUT is not set
-CONFIG_COMPAT=y
-CONFIG_COMPAT_FOR_U64_ALIGNMENT=y
-CONFIG_SYSVIPC_COMPAT=y
+CONFIG_KPROBES=y
+CONFIG_JUMP_LABEL=y
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
+CONFIG_MODULE_FORCE_UNLOAD=y
+CONFIG_BLK_CGROUP_IOLATENCY=y
+CONFIG_BLK_CGROUP_IOCOST=y
+CONFIG_BLK_CGROUP_IOPRIO=y
+CONFIG_BINFMT_MISC=y
+# CONFIG_COMPAT_BRK is not set
CONFIG_NET=y
-
-#
-# Networking options
-#
CONFIG_PACKET=y
-CONFIG_PACKET_MMAP=y
-CONFIG_UNIX=y
-CONFIG_XFRM=y
CONFIG_XFRM_USER=y
-# CONFIG_XFRM_SUB_POLICY is not set
-# CONFIG_XFRM_MIGRATE is not set
-# CONFIG_XFRM_STATISTICS is not set
-# CONFIG_NET_KEY is not set
-CONFIG_INET=y
CONFIG_IP_MULTICAST=y
CONFIG_IP_ADVANCED_ROUTER=y
-CONFIG_ASK_IP_FIB_HASH=y
-# CONFIG_IP_FIB_TRIE is not set
-CONFIG_IP_FIB_HASH=y
CONFIG_IP_MULTIPLE_TABLES=y
CONFIG_IP_ROUTE_MULTIPATH=y
CONFIG_IP_ROUTE_VERBOSE=y
@@ -521,1327 +72,136 @@ CONFIG_IP_PNP=y
CONFIG_IP_PNP_DHCP=y
CONFIG_IP_PNP_BOOTP=y
CONFIG_IP_PNP_RARP=y
-# CONFIG_NET_IPIP is not set
-# CONFIG_NET_IPGRE is not set
CONFIG_IP_MROUTE=y
CONFIG_IP_PIMSM_V1=y
CONFIG_IP_PIMSM_V2=y
-# CONFIG_ARPD is not set
CONFIG_SYN_COOKIES=y
-# CONFIG_INET_AH is not set
-# CONFIG_INET_ESP is not set
-# CONFIG_INET_IPCOMP is not set
-# CONFIG_INET_XFRM_TUNNEL is not set
-CONFIG_INET_TUNNEL=y
-# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
-# CONFIG_INET_XFRM_MODE_TUNNEL is not set
-# CONFIG_INET_XFRM_MODE_BEET is not set
-CONFIG_INET_LRO=y
# CONFIG_INET_DIAG is not set
CONFIG_TCP_CONG_ADVANCED=y
# CONFIG_TCP_CONG_BIC is not set
-CONFIG_TCP_CONG_CUBIC=y
# CONFIG_TCP_CONG_WESTWOOD is not set
# CONFIG_TCP_CONG_HTCP is not set
-# CONFIG_TCP_CONG_HSTCP is not set
-# CONFIG_TCP_CONG_HYBLA is not set
-# CONFIG_TCP_CONG_VEGAS is not set
-# CONFIG_TCP_CONG_SCALABLE is not set
-# CONFIG_TCP_CONG_LP is not set
-# CONFIG_TCP_CONG_VENO is not set
-# CONFIG_TCP_CONG_YEAH is not set
-# CONFIG_TCP_CONG_ILLINOIS is not set
-# CONFIG_DEFAULT_BIC is not set
-CONFIG_DEFAULT_CUBIC=y
-# CONFIG_DEFAULT_HTCP is not set
-# CONFIG_DEFAULT_VEGAS is not set
-# CONFIG_DEFAULT_WESTWOOD is not set
-# CONFIG_DEFAULT_RENO is not set
-CONFIG_DEFAULT_TCP_CONG="cubic"
CONFIG_TCP_MD5SIG=y
-CONFIG_IPV6=y
-# CONFIG_IPV6_PRIVACY is not set
-# CONFIG_IPV6_ROUTER_PREF is not set
-# CONFIG_IPV6_OPTIMISTIC_DAD is not set
CONFIG_INET6_AH=y
CONFIG_INET6_ESP=y
-# CONFIG_INET6_IPCOMP is not set
-# CONFIG_IPV6_MIP6 is not set
-# CONFIG_INET6_XFRM_TUNNEL is not set
-# CONFIG_INET6_TUNNEL is not set
-CONFIG_INET6_XFRM_MODE_TRANSPORT=y
-CONFIG_INET6_XFRM_MODE_TUNNEL=y
-CONFIG_INET6_XFRM_MODE_BEET=y
-# CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION is not set
-CONFIG_IPV6_SIT=y
-CONFIG_IPV6_NDISC_NODETYPE=y
-# CONFIG_IPV6_TUNNEL is not set
-# CONFIG_IPV6_MULTIPLE_TABLES is not set
-# CONFIG_IPV6_MROUTE is not set
CONFIG_NETLABEL=y
-CONFIG_NETWORK_SECMARK=y
CONFIG_NETFILTER=y
-# CONFIG_NETFILTER_DEBUG is not set
# CONFIG_NETFILTER_ADVANCED is not set
-
-#
-# Core Netfilter Configuration
-#
-CONFIG_NETFILTER_NETLINK=y
-CONFIG_NETFILTER_NETLINK_LOG=y
CONFIG_NF_CONNTRACK=y
-CONFIG_NF_CONNTRACK_SECMARK=y
CONFIG_NF_CONNTRACK_FTP=y
CONFIG_NF_CONNTRACK_IRC=y
CONFIG_NF_CONNTRACK_SIP=y
CONFIG_NF_CT_NETLINK=y
-CONFIG_NETFILTER_XTABLES=y
+CONFIG_NF_NAT=y
CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
-CONFIG_NETFILTER_XT_TARGET_MARK=y
CONFIG_NETFILTER_XT_TARGET_NFLOG=y
CONFIG_NETFILTER_XT_TARGET_SECMARK=y
CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
-CONFIG_NETFILTER_XT_MATCH_MARK=y
CONFIG_NETFILTER_XT_MATCH_POLICY=y
CONFIG_NETFILTER_XT_MATCH_STATE=y
-# CONFIG_IP_VS is not set
-
-#
-# IP: Netfilter Configuration
-#
-CONFIG_NF_DEFRAG_IPV4=y
-CONFIG_NF_CONNTRACK_IPV4=y
-CONFIG_NF_CONNTRACK_PROC_COMPAT=y
CONFIG_IP_NF_IPTABLES=y
CONFIG_IP_NF_FILTER=y
CONFIG_IP_NF_TARGET_REJECT=y
-CONFIG_IP_NF_TARGET_LOG=y
-CONFIG_IP_NF_TARGET_ULOG=y
-CONFIG_NF_NAT=y
-CONFIG_NF_NAT_NEEDED=y
-CONFIG_IP_NF_TARGET_MASQUERADE=y
-CONFIG_NF_NAT_FTP=y
-CONFIG_NF_NAT_IRC=y
-# CONFIG_NF_NAT_TFTP is not set
-# CONFIG_NF_NAT_AMANDA is not set
-# CONFIG_NF_NAT_PPTP is not set
-# CONFIG_NF_NAT_H323 is not set
-CONFIG_NF_NAT_SIP=y
+CONFIG_IP_NF_TARGET_MASQUERADE=m
CONFIG_IP_NF_MANGLE=y
-
-#
-# IPv6: Netfilter Configuration
-#
-CONFIG_NF_CONNTRACK_IPV6=y
CONFIG_IP6_NF_IPTABLES=y
CONFIG_IP6_NF_MATCH_IPV6HEADER=y
-CONFIG_IP6_NF_TARGET_LOG=y
CONFIG_IP6_NF_FILTER=y
CONFIG_IP6_NF_TARGET_REJECT=y
CONFIG_IP6_NF_MANGLE=y
-# CONFIG_IP_DCCP is not set
-# CONFIG_IP_SCTP is not set
-# CONFIG_TIPC is not set
-# CONFIG_ATM is not set
-# CONFIG_BRIDGE is not set
-# CONFIG_NET_DSA is not set
-# CONFIG_VLAN_8021Q is not set
-# CONFIG_DECNET is not set
-CONFIG_LLC=y
-# CONFIG_LLC2 is not set
-# CONFIG_IPX is not set
-# CONFIG_ATALK is not set
-# CONFIG_X25 is not set
-# CONFIG_LAPB is not set
-# CONFIG_ECONET is not set
-# CONFIG_WAN_ROUTER is not set
-# CONFIG_PHONET is not set
CONFIG_NET_SCHED=y
-
-#
-# Queueing/Scheduling
-#
-# CONFIG_NET_SCH_CBQ is not set
-# CONFIG_NET_SCH_HTB is not set
-# CONFIG_NET_SCH_HFSC is not set
-# CONFIG_NET_SCH_PRIO is not set
-# CONFIG_NET_SCH_MULTIQ is not set
-# CONFIG_NET_SCH_RED is not set
-# CONFIG_NET_SCH_SFQ is not set
-# CONFIG_NET_SCH_TEQL is not set
-# CONFIG_NET_SCH_TBF is not set
-# CONFIG_NET_SCH_GRED is not set
-# CONFIG_NET_SCH_DSMARK is not set
-# CONFIG_NET_SCH_NETEM is not set
-# CONFIG_NET_SCH_DRR is not set
-# CONFIG_NET_SCH_INGRESS is not set
-
-#
-# Classification
-#
-CONFIG_NET_CLS=y
-# CONFIG_NET_CLS_BASIC is not set
-# CONFIG_NET_CLS_TCINDEX is not set
-# CONFIG_NET_CLS_ROUTE4 is not set
-# CONFIG_NET_CLS_FW is not set
-# CONFIG_NET_CLS_U32 is not set
-# CONFIG_NET_CLS_RSVP is not set
-# CONFIG_NET_CLS_RSVP6 is not set
-# CONFIG_NET_CLS_FLOW is not set
-# CONFIG_NET_CLS_CGROUP is not set
+CONFIG_NET_CLS_CGROUP=y
CONFIG_NET_EMATCH=y
-CONFIG_NET_EMATCH_STACK=32
-# CONFIG_NET_EMATCH_CMP is not set
-# CONFIG_NET_EMATCH_NBYTE is not set
-# CONFIG_NET_EMATCH_U32 is not set
-# CONFIG_NET_EMATCH_META is not set
-# CONFIG_NET_EMATCH_TEXT is not set
CONFIG_NET_CLS_ACT=y
-# CONFIG_NET_ACT_POLICE is not set
-# CONFIG_NET_ACT_GACT is not set
-# CONFIG_NET_ACT_MIRRED is not set
-# CONFIG_NET_ACT_IPT is not set
-# CONFIG_NET_ACT_NAT is not set
-# CONFIG_NET_ACT_PEDIT is not set
-# CONFIG_NET_ACT_SIMP is not set
-# CONFIG_NET_ACT_SKBEDIT is not set
-CONFIG_NET_SCH_FIFO=y
-# CONFIG_DCB is not set
-
-#
-# Network testing
-#
-# CONFIG_NET_PKTGEN is not set
-# CONFIG_NET_TCPPROBE is not set
-# CONFIG_NET_DROP_MONITOR is not set
-CONFIG_HAMRADIO=y
-
-#
-# Packet Radio protocols
-#
-# CONFIG_AX25 is not set
-# CONFIG_CAN is not set
-# CONFIG_IRDA is not set
-# CONFIG_BT is not set
-# CONFIG_AF_RXRPC is not set
-CONFIG_FIB_RULES=y
-CONFIG_WIRELESS=y
+CONFIG_CGROUP_NET_PRIO=y
CONFIG_CFG80211=y
-# CONFIG_CFG80211_REG_DEBUG is not set
-CONFIG_WIRELESS_OLD_REGULATORY=y
-CONFIG_WIRELESS_EXT=y
-CONFIG_WIRELESS_EXT_SYSFS=y
-# CONFIG_LIB80211 is not set
CONFIG_MAC80211=y
-
-#
-# Rate control algorithm selection
-#
-CONFIG_MAC80211_RC_MINSTREL=y
-# CONFIG_MAC80211_RC_DEFAULT_PID is not set
-CONFIG_MAC80211_RC_DEFAULT_MINSTREL=y
-CONFIG_MAC80211_RC_DEFAULT="minstrel"
-# CONFIG_MAC80211_MESH is not set
CONFIG_MAC80211_LEDS=y
-# CONFIG_MAC80211_DEBUGFS is not set
-# CONFIG_MAC80211_DEBUG_MENU is not set
-# CONFIG_WIMAX is not set
CONFIG_RFKILL=y
-# CONFIG_RFKILL_INPUT is not set
-CONFIG_RFKILL_LEDS=y
-# CONFIG_NET_9P is not set
-
-#
-# Device Drivers
-#
-
-#
-# Generic Driver Options
-#
-CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
-CONFIG_STANDALONE=y
-CONFIG_PREVENT_FIRMWARE_BUILD=y
-CONFIG_FW_LOADER=y
-CONFIG_FIRMWARE_IN_KERNEL=y
-CONFIG_EXTRA_FIRMWARE=""
-# CONFIG_DEBUG_DRIVER is not set
+CONFIG_NET_9P=y
+CONFIG_NET_9P_VIRTIO=y
+CONFIG_PCI=y
+CONFIG_PCIEPORTBUS=y
+CONFIG_HOTPLUG_PCI=y
+CONFIG_PCCARD=y
+CONFIG_YENTA=y
+CONFIG_DEVTMPFS=y
+CONFIG_DEVTMPFS_MOUNT=y
CONFIG_DEBUG_DEVRES=y
-# CONFIG_SYS_HYPERVISOR is not set
CONFIG_CONNECTOR=y
-CONFIG_PROC_EVENTS=y
-# CONFIG_MTD is not set
-# CONFIG_PARPORT is not set
-CONFIG_PNP=y
-CONFIG_PNP_DEBUG_MESSAGES=y
-
-#
-# Protocols
-#
-CONFIG_PNPACPI=y
-CONFIG_BLK_DEV=y
-# CONFIG_BLK_DEV_FD is not set
-# CONFIG_BLK_CPQ_DA is not set
-# CONFIG_BLK_CPQ_CISS_DA is not set
-# CONFIG_BLK_DEV_DAC960 is not set
-# CONFIG_BLK_DEV_UMEM is not set
-# CONFIG_BLK_DEV_COW_COMMON is not set
CONFIG_BLK_DEV_LOOP=y
-# CONFIG_BLK_DEV_CRYPTOLOOP is not set
-# CONFIG_BLK_DEV_NBD is not set
-# CONFIG_BLK_DEV_SX8 is not set
-# CONFIG_BLK_DEV_UB is not set
-CONFIG_BLK_DEV_RAM=y
-CONFIG_BLK_DEV_RAM_COUNT=16
-CONFIG_BLK_DEV_RAM_SIZE=16384
-# CONFIG_BLK_DEV_XIP is not set
-# CONFIG_CDROM_PKTCDVD is not set
-# CONFIG_ATA_OVER_ETH is not set
-# CONFIG_BLK_DEV_HD is not set
-CONFIG_MISC_DEVICES=y
-# CONFIG_IBM_ASM is not set
-# CONFIG_PHANTOM is not set
-# CONFIG_SGI_IOC4 is not set
-# CONFIG_TIFM_CORE is not set
-# CONFIG_ICS932S401 is not set
-# CONFIG_ENCLOSURE_SERVICES is not set
-# CONFIG_HP_ILO is not set
-# CONFIG_ISL29003 is not set
-# CONFIG_C2PORT is not set
-
-#
-# EEPROM support
-#
-# CONFIG_EEPROM_AT24 is not set
-# CONFIG_EEPROM_LEGACY is not set
-# CONFIG_EEPROM_93CX6 is not set
-CONFIG_HAVE_IDE=y
-# CONFIG_IDE is not set
-
-#
-# SCSI device support
-#
-# CONFIG_RAID_ATTRS is not set
-CONFIG_SCSI=y
-CONFIG_SCSI_DMA=y
-# CONFIG_SCSI_TGT is not set
-# CONFIG_SCSI_NETLINK is not set
-CONFIG_SCSI_PROC_FS=y
-
-#
-# SCSI support type (disk, tape, CD-ROM)
-#
+CONFIG_VIRTIO_BLK=y
CONFIG_BLK_DEV_SD=y
-# CONFIG_CHR_DEV_ST is not set
-# CONFIG_CHR_DEV_OSST is not set
CONFIG_BLK_DEV_SR=y
-CONFIG_BLK_DEV_SR_VENDOR=y
CONFIG_CHR_DEV_SG=y
-# CONFIG_CHR_DEV_SCH is not set
-
-#
-# Some SCSI devices (e.g. CD jukebox) support multiple LUNs
-#
-# CONFIG_SCSI_MULTI_LUN is not set
CONFIG_SCSI_CONSTANTS=y
-# CONFIG_SCSI_LOGGING is not set
-# CONFIG_SCSI_SCAN_ASYNC is not set
-CONFIG_SCSI_WAIT_SCAN=m
-
-#
-# SCSI Transports
-#
CONFIG_SCSI_SPI_ATTRS=y
-# CONFIG_SCSI_FC_ATTRS is not set
-# CONFIG_SCSI_ISCSI_ATTRS is not set
-# CONFIG_SCSI_SAS_ATTRS is not set
-# CONFIG_SCSI_SAS_LIBSAS is not set
-# CONFIG_SCSI_SRP_ATTRS is not set
-# CONFIG_SCSI_LOWLEVEL is not set
-# CONFIG_SCSI_LOWLEVEL_PCMCIA is not set
-# CONFIG_SCSI_DH is not set
-# CONFIG_SCSI_OSD_INITIATOR is not set
+CONFIG_SCSI_VIRTIO=y
CONFIG_ATA=y
-# CONFIG_ATA_NONSTANDARD is not set
-CONFIG_ATA_ACPI=y
-CONFIG_SATA_PMP=y
CONFIG_SATA_AHCI=y
-# CONFIG_SATA_SIL24 is not set
-CONFIG_ATA_SFF=y
-# CONFIG_SATA_SVW is not set
CONFIG_ATA_PIIX=y
-# CONFIG_SATA_MV is not set
-# CONFIG_SATA_NV is not set
-# CONFIG_PDC_ADMA is not set
-# CONFIG_SATA_QSTOR is not set
-# CONFIG_SATA_PROMISE is not set
-# CONFIG_SATA_SX4 is not set
-# CONFIG_SATA_SIL is not set
-# CONFIG_SATA_SIS is not set
-# CONFIG_SATA_ULI is not set
-# CONFIG_SATA_VIA is not set
-# CONFIG_SATA_VITESSE is not set
-# CONFIG_SATA_INIC162X is not set
-# CONFIG_PATA_ACPI is not set
-# CONFIG_PATA_ALI is not set
CONFIG_PATA_AMD=y
-# CONFIG_PATA_ARTOP is not set
-# CONFIG_PATA_ATIIXP is not set
-# CONFIG_PATA_CMD640_PCI is not set
-# CONFIG_PATA_CMD64X is not set
-# CONFIG_PATA_CS5520 is not set
-# CONFIG_PATA_CS5530 is not set
-# CONFIG_PATA_CYPRESS is not set
-# CONFIG_PATA_EFAR is not set
-# CONFIG_ATA_GENERIC is not set
-# CONFIG_PATA_HPT366 is not set
-# CONFIG_PATA_HPT37X is not set
-# CONFIG_PATA_HPT3X2N is not set
-# CONFIG_PATA_HPT3X3 is not set
-# CONFIG_PATA_IT821X is not set
-# CONFIG_PATA_IT8213 is not set
-# CONFIG_PATA_JMICRON is not set
-# CONFIG_PATA_TRIFLEX is not set
-# CONFIG_PATA_MARVELL is not set
-# CONFIG_PATA_MPIIX is not set
CONFIG_PATA_OLDPIIX=y
-# CONFIG_PATA_NETCELL is not set
-# CONFIG_PATA_NINJA32 is not set
-# CONFIG_PATA_NS87410 is not set
-# CONFIG_PATA_NS87415 is not set
-# CONFIG_PATA_OPTI is not set
-# CONFIG_PATA_OPTIDMA is not set
-# CONFIG_PATA_PCMCIA is not set
-# CONFIG_PATA_PDC_OLD is not set
-# CONFIG_PATA_RADISYS is not set
-# CONFIG_PATA_RZ1000 is not set
-# CONFIG_PATA_SC1200 is not set
-# CONFIG_PATA_SERVERWORKS is not set
-# CONFIG_PATA_PDC2027X is not set
-# CONFIG_PATA_SIL680 is not set
-# CONFIG_PATA_SIS is not set
-# CONFIG_PATA_VIA is not set
-# CONFIG_PATA_WINBOND is not set
CONFIG_PATA_SCH=y
CONFIG_MD=y
CONFIG_BLK_DEV_MD=y
-CONFIG_MD_AUTODETECT=y
-# CONFIG_MD_LINEAR is not set
-# CONFIG_MD_RAID0 is not set
-# CONFIG_MD_RAID1 is not set
-# CONFIG_MD_RAID10 is not set
-# CONFIG_MD_RAID456 is not set
-# CONFIG_MD_MULTIPATH is not set
-# CONFIG_MD_FAULTY is not set
CONFIG_BLK_DEV_DM=y
-# CONFIG_DM_DEBUG is not set
-# CONFIG_DM_CRYPT is not set
-# CONFIG_DM_SNAPSHOT is not set
CONFIG_DM_MIRROR=y
CONFIG_DM_ZERO=y
-# CONFIG_DM_MULTIPATH is not set
-# CONFIG_DM_DELAY is not set
-# CONFIG_DM_UEVENT is not set
-# CONFIG_FUSION is not set
-
-#
-# IEEE 1394 (FireWire) support
-#
-
-#
-# Enable only one of the two stacks, unless you know what you are doing
-#
-# CONFIG_FIREWIRE is not set
-# CONFIG_IEEE1394 is not set
-# CONFIG_I2O is not set
CONFIG_MACINTOSH_DRIVERS=y
CONFIG_MAC_EMUMOUSEBTN=y
CONFIG_NETDEVICES=y
-CONFIG_COMPAT_NET_DEV_OPS=y
-# CONFIG_IFB is not set
-# CONFIG_DUMMY is not set
-# CONFIG_BONDING is not set
-# CONFIG_MACVLAN is not set
-# CONFIG_EQUALIZER is not set
-# CONFIG_TUN is not set
-# CONFIG_VETH is not set
-# CONFIG_NET_SB1000 is not set
-# CONFIG_ARCNET is not set
-CONFIG_PHYLIB=y
-
-#
-# MII PHY device drivers
-#
-# CONFIG_MARVELL_PHY is not set
-# CONFIG_DAVICOM_PHY is not set
-# CONFIG_QSEMI_PHY is not set
-# CONFIG_LXT_PHY is not set
-# CONFIG_CICADA_PHY is not set
-# CONFIG_VITESSE_PHY is not set
-# CONFIG_SMSC_PHY is not set
-# CONFIG_BROADCOM_PHY is not set
-# CONFIG_ICPLUS_PHY is not set
-# CONFIG_REALTEK_PHY is not set
-# CONFIG_NATIONAL_PHY is not set
-# CONFIG_STE10XP is not set
-# CONFIG_LSI_ET1011C_PHY is not set
-# CONFIG_FIXED_PHY is not set
-# CONFIG_MDIO_BITBANG is not set
-CONFIG_NET_ETHERNET=y
-CONFIG_MII=y
-# CONFIG_HAPPYMEAL is not set
-# CONFIG_SUNGEM is not set
-# CONFIG_CASSINI is not set
-CONFIG_NET_VENDOR_3COM=y
-# CONFIG_VORTEX is not set
-# CONFIG_TYPHOON is not set
-# CONFIG_ETHOC is not set
-# CONFIG_DNET is not set
+CONFIG_NETCONSOLE=y
+CONFIG_VIRTIO_NET=y
+CONFIG_TIGON3=y
CONFIG_NET_TULIP=y
-# CONFIG_DE2104X is not set
-# CONFIG_TULIP is not set
-# CONFIG_DE4X5 is not set
-# CONFIG_WINBOND_840 is not set
-# CONFIG_DM9102 is not set
-# CONFIG_ULI526X is not set
-# CONFIG_PCMCIA_XIRCOM is not set
-# CONFIG_HP100 is not set
-# CONFIG_IBM_NEW_EMAC_ZMII is not set
-# CONFIG_IBM_NEW_EMAC_RGMII is not set
-# CONFIG_IBM_NEW_EMAC_TAH is not set
-# CONFIG_IBM_NEW_EMAC_EMAC4 is not set
-# CONFIG_IBM_NEW_EMAC_NO_FLOW_CTRL is not set
-# CONFIG_IBM_NEW_EMAC_MAL_CLR_ICINTSTAT is not set
-# CONFIG_IBM_NEW_EMAC_MAL_COMMON_ERR is not set
-CONFIG_NET_PCI=y
-# CONFIG_PCNET32 is not set
-# CONFIG_AMD8111_ETH is not set
-# CONFIG_ADAPTEC_STARFIRE is not set
-# CONFIG_B44 is not set
-CONFIG_FORCEDETH=y
-# CONFIG_FORCEDETH_NAPI is not set
CONFIG_E100=y
-# CONFIG_FEALNX is not set
-# CONFIG_NATSEMI is not set
-# CONFIG_NE2K_PCI is not set
-# CONFIG_8139CP is not set
-CONFIG_8139TOO=y
-CONFIG_8139TOO_PIO=y
-# CONFIG_8139TOO_TUNE_TWISTER is not set
-# CONFIG_8139TOO_8129 is not set
-# CONFIG_8139_OLD_RX_RESET is not set
-# CONFIG_R6040 is not set
-# CONFIG_SIS900 is not set
-# CONFIG_EPIC100 is not set
-# CONFIG_SMSC9420 is not set
-# CONFIG_SUNDANCE is not set
-# CONFIG_TLAN is not set
-# CONFIG_VIA_RHINE is not set
-# CONFIG_SC92031 is not set
-# CONFIG_ATL2 is not set
-CONFIG_NETDEV_1000=y
-# CONFIG_ACENIC is not set
-# CONFIG_DL2K is not set
CONFIG_E1000=y
-# CONFIG_E1000E is not set
-# CONFIG_IP1000 is not set
-# CONFIG_IGB is not set
-# CONFIG_IGBVF is not set
-# CONFIG_NS83820 is not set
-# CONFIG_HAMACHI is not set
-# CONFIG_YELLOWFIN is not set
-# CONFIG_R8169 is not set
-# CONFIG_SIS190 is not set
-# CONFIG_SKGE is not set
+CONFIG_E1000E=y
CONFIG_SKY2=y
-# CONFIG_SKY2_DEBUG is not set
-# CONFIG_VIA_VELOCITY is not set
-CONFIG_TIGON3=y
-# CONFIG_BNX2 is not set
-# CONFIG_QLA3XXX is not set
-# CONFIG_ATL1 is not set
-# CONFIG_ATL1E is not set
-# CONFIG_ATL1C is not set
-# CONFIG_JME is not set
-CONFIG_NETDEV_10000=y
-# CONFIG_CHELSIO_T1 is not set
-CONFIG_CHELSIO_T3_DEPENDS=y
-# CONFIG_CHELSIO_T3 is not set
-# CONFIG_ENIC is not set
-# CONFIG_IXGBE is not set
-# CONFIG_IXGB is not set
-# CONFIG_S2IO is not set
-# CONFIG_VXGE is not set
-# CONFIG_MYRI10GE is not set
-# CONFIG_NETXEN_NIC is not set
-# CONFIG_NIU is not set
-# CONFIG_MLX4_EN is not set
-# CONFIG_MLX4_CORE is not set
-# CONFIG_TEHUTI is not set
-# CONFIG_BNX2X is not set
-# CONFIG_QLGE is not set
-# CONFIG_SFC is not set
-# CONFIG_BE2NET is not set
-CONFIG_TR=y
-# CONFIG_IBMOL is not set
-# CONFIG_3C359 is not set
-# CONFIG_TMS380TR is not set
-
-#
-# Wireless LAN
-#
-# CONFIG_WLAN_PRE80211 is not set
-CONFIG_WLAN_80211=y
-# CONFIG_PCMCIA_RAYCS is not set
-# CONFIG_LIBERTAS is not set
-# CONFIG_LIBERTAS_THINFIRM is not set
-# CONFIG_AIRO is not set
-# CONFIG_ATMEL is not set
-# CONFIG_AT76C50X_USB is not set
-# CONFIG_AIRO_CS is not set
-# CONFIG_PCMCIA_WL3501 is not set
-# CONFIG_PRISM54 is not set
-# CONFIG_USB_ZD1201 is not set
-# CONFIG_USB_NET_RNDIS_WLAN is not set
-# CONFIG_RTL8180 is not set
-# CONFIG_RTL8187 is not set
-# CONFIG_ADM8211 is not set
-# CONFIG_MAC80211_HWSIM is not set
-# CONFIG_MWL8K is not set
-# CONFIG_P54_COMMON is not set
-CONFIG_ATH5K=y
-# CONFIG_ATH5K_DEBUG is not set
-# CONFIG_ATH9K is not set
-# CONFIG_AR9170_USB is not set
-# CONFIG_IPW2100 is not set
-# CONFIG_IPW2200 is not set
-# CONFIG_IWLWIFI is not set
-# CONFIG_HOSTAP is not set
-# CONFIG_B43 is not set
-# CONFIG_B43LEGACY is not set
-# CONFIG_ZD1211RW is not set
-# CONFIG_RT2X00 is not set
-# CONFIG_HERMES is not set
-
-#
-# Enable WiMAX (Networking options) to see the WiMAX drivers
-#
-
-#
-# USB Network Adapters
-#
-# CONFIG_USB_CATC is not set
-# CONFIG_USB_KAWETH is not set
-# CONFIG_USB_PEGASUS is not set
-# CONFIG_USB_RTL8150 is not set
-# CONFIG_USB_USBNET is not set
-# CONFIG_USB_HSO is not set
-CONFIG_NET_PCMCIA=y
-# CONFIG_PCMCIA_3C589 is not set
-# CONFIG_PCMCIA_3C574 is not set
-# CONFIG_PCMCIA_FMVJ18X is not set
-# CONFIG_PCMCIA_PCNET is not set
-# CONFIG_PCMCIA_NMCLAN is not set
-# CONFIG_PCMCIA_SMC91C92 is not set
-# CONFIG_PCMCIA_XIRC2PS is not set
-# CONFIG_PCMCIA_AXNET is not set
-# CONFIG_PCMCIA_IBMTR is not set
-# CONFIG_WAN is not set
-CONFIG_FDDI=y
-# CONFIG_DEFXX is not set
-# CONFIG_SKFP is not set
-# CONFIG_HIPPI is not set
-# CONFIG_PPP is not set
-# CONFIG_SLIP is not set
-# CONFIG_NET_FC is not set
-CONFIG_NETCONSOLE=y
-# CONFIG_NETCONSOLE_DYNAMIC is not set
-CONFIG_NETPOLL=y
-# CONFIG_NETPOLL_TRAP is not set
-CONFIG_NET_POLL_CONTROLLER=y
-# CONFIG_ISDN is not set
-# CONFIG_PHONE is not set
-
-#
-# Input device support
-#
-CONFIG_INPUT=y
-CONFIG_INPUT_FF_MEMLESS=y
-CONFIG_INPUT_POLLDEV=y
-
-#
-# Userland interfaces
-#
-CONFIG_INPUT_MOUSEDEV=y
-# CONFIG_INPUT_MOUSEDEV_PSAUX is not set
-CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024
-CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768
-# CONFIG_INPUT_JOYDEV is not set
+CONFIG_FORCEDETH=y
+CONFIG_8139TOO=y
+CONFIG_R8169=y
CONFIG_INPUT_EVDEV=y
-# CONFIG_INPUT_EVBUG is not set
-
-#
-# Input Device Drivers
-#
-CONFIG_INPUT_KEYBOARD=y
-CONFIG_KEYBOARD_ATKBD=y
-# CONFIG_KEYBOARD_SUNKBD is not set
-# CONFIG_KEYBOARD_LKKBD is not set
-# CONFIG_KEYBOARD_XTKBD is not set
-# CONFIG_KEYBOARD_NEWTON is not set
-# CONFIG_KEYBOARD_STOWAWAY is not set
-CONFIG_INPUT_MOUSE=y
-CONFIG_MOUSE_PS2=y
-CONFIG_MOUSE_PS2_ALPS=y
-CONFIG_MOUSE_PS2_LOGIPS2PP=y
-CONFIG_MOUSE_PS2_SYNAPTICS=y
-CONFIG_MOUSE_PS2_LIFEBOOK=y
-CONFIG_MOUSE_PS2_TRACKPOINT=y
-# CONFIG_MOUSE_PS2_ELANTECH is not set
-# CONFIG_MOUSE_PS2_TOUCHKIT is not set
-# CONFIG_MOUSE_SERIAL is not set
-# CONFIG_MOUSE_APPLETOUCH is not set
-# CONFIG_MOUSE_BCM5974 is not set
-# CONFIG_MOUSE_VSXXXAA is not set
CONFIG_INPUT_JOYSTICK=y
-# CONFIG_JOYSTICK_ANALOG is not set
-# CONFIG_JOYSTICK_A3D is not set
-# CONFIG_JOYSTICK_ADI is not set
-# CONFIG_JOYSTICK_COBRA is not set
-# CONFIG_JOYSTICK_GF2K is not set
-# CONFIG_JOYSTICK_GRIP is not set
-# CONFIG_JOYSTICK_GRIP_MP is not set
-# CONFIG_JOYSTICK_GUILLEMOT is not set
-# CONFIG_JOYSTICK_INTERACT is not set
-# CONFIG_JOYSTICK_SIDEWINDER is not set
-# CONFIG_JOYSTICK_TMDC is not set
-# CONFIG_JOYSTICK_IFORCE is not set
-# CONFIG_JOYSTICK_WARRIOR is not set
-# CONFIG_JOYSTICK_MAGELLAN is not set
-# CONFIG_JOYSTICK_SPACEORB is not set
-# CONFIG_JOYSTICK_SPACEBALL is not set
-# CONFIG_JOYSTICK_STINGER is not set
-# CONFIG_JOYSTICK_TWIDJOY is not set
-# CONFIG_JOYSTICK_ZHENHUA is not set
-# CONFIG_JOYSTICK_JOYDUMP is not set
-# CONFIG_JOYSTICK_XPAD is not set
CONFIG_INPUT_TABLET=y
-# CONFIG_TABLET_USB_ACECAD is not set
-# CONFIG_TABLET_USB_AIPTEK is not set
-# CONFIG_TABLET_USB_GTCO is not set
-# CONFIG_TABLET_USB_KBTAB is not set
-# CONFIG_TABLET_USB_WACOM is not set
CONFIG_INPUT_TOUCHSCREEN=y
-# CONFIG_TOUCHSCREEN_AD7879_I2C is not set
-# CONFIG_TOUCHSCREEN_AD7879 is not set
-# CONFIG_TOUCHSCREEN_FUJITSU is not set
-# CONFIG_TOUCHSCREEN_GUNZE is not set
-# CONFIG_TOUCHSCREEN_ELO is not set
-# CONFIG_TOUCHSCREEN_WACOM_W8001 is not set
-# CONFIG_TOUCHSCREEN_MTOUCH is not set
-# CONFIG_TOUCHSCREEN_INEXIO is not set
-# CONFIG_TOUCHSCREEN_MK712 is not set
-# CONFIG_TOUCHSCREEN_PENMOUNT is not set
-# CONFIG_TOUCHSCREEN_TOUCHRIGHT is not set
-# CONFIG_TOUCHSCREEN_TOUCHWIN is not set
-# CONFIG_TOUCHSCREEN_USB_COMPOSITE is not set
-# CONFIG_TOUCHSCREEN_TOUCHIT213 is not set
-# CONFIG_TOUCHSCREEN_TSC2007 is not set
CONFIG_INPUT_MISC=y
-# CONFIG_INPUT_PCSPKR is not set
-# CONFIG_INPUT_APANEL is not set
-# CONFIG_INPUT_ATLAS_BTNS is not set
-# CONFIG_INPUT_ATI_REMOTE is not set
-# CONFIG_INPUT_ATI_REMOTE2 is not set
-# CONFIG_INPUT_KEYSPAN_REMOTE is not set
-# CONFIG_INPUT_POWERMATE is not set
-# CONFIG_INPUT_YEALINK is not set
-# CONFIG_INPUT_CM109 is not set
-# CONFIG_INPUT_UINPUT is not set
-
-#
-# Hardware I/O ports
-#
-CONFIG_SERIO=y
-CONFIG_SERIO_I8042=y
-CONFIG_SERIO_SERPORT=y
-# CONFIG_SERIO_CT82C710 is not set
-# CONFIG_SERIO_PCIPS2 is not set
-CONFIG_SERIO_LIBPS2=y
-# CONFIG_SERIO_RAW is not set
-# CONFIG_GAMEPORT is not set
-
-#
-# Character devices
-#
-CONFIG_VT=y
-CONFIG_CONSOLE_TRANSLATIONS=y
-CONFIG_VT_CONSOLE=y
-CONFIG_HW_CONSOLE=y
-CONFIG_VT_HW_CONSOLE_BINDING=y
-CONFIG_DEVKMEM=y
-CONFIG_SERIAL_NONSTANDARD=y
-# CONFIG_COMPUTONE is not set
-# CONFIG_ROCKETPORT is not set
-# CONFIG_CYCLADES is not set
-# CONFIG_DIGIEPCA is not set
-# CONFIG_MOXA_INTELLIO is not set
-# CONFIG_MOXA_SMARTIO is not set
-# CONFIG_ISI is not set
-# CONFIG_SYNCLINK is not set
-# CONFIG_SYNCLINKMP is not set
-# CONFIG_SYNCLINK_GT is not set
-# CONFIG_N_HDLC is not set
-# CONFIG_RISCOM8 is not set
-# CONFIG_SPECIALIX is not set
-# CONFIG_SX is not set
-# CONFIG_RIO is not set
-# CONFIG_STALDRV is not set
-# CONFIG_NOZOMI is not set
-
-#
-# Serial drivers
-#
+# CONFIG_LEGACY_PTYS is not set
CONFIG_SERIAL_8250=y
CONFIG_SERIAL_8250_CONSOLE=y
-CONFIG_FIX_EARLYCON_MEM=y
-CONFIG_SERIAL_8250_PCI=y
-CONFIG_SERIAL_8250_PNP=y
-# CONFIG_SERIAL_8250_CS is not set
CONFIG_SERIAL_8250_NR_UARTS=32
-CONFIG_SERIAL_8250_RUNTIME_UARTS=4
CONFIG_SERIAL_8250_EXTENDED=y
CONFIG_SERIAL_8250_MANY_PORTS=y
CONFIG_SERIAL_8250_SHARE_IRQ=y
CONFIG_SERIAL_8250_DETECT_IRQ=y
CONFIG_SERIAL_8250_RSA=y
-
-#
-# Non-8250 serial port support
-#
-CONFIG_SERIAL_CORE=y
-CONFIG_SERIAL_CORE_CONSOLE=y
-# CONFIG_SERIAL_JSM is not set
-CONFIG_UNIX98_PTYS=y
-# CONFIG_DEVPTS_MULTIPLE_INSTANCES is not set
-# CONFIG_LEGACY_PTYS is not set
-# CONFIG_IPMI_HANDLER is not set
+CONFIG_SERIAL_NONSTANDARD=y
+CONFIG_VIRTIO_CONSOLE=y
CONFIG_HW_RANDOM=y
-# CONFIG_HW_RANDOM_TIMERIOMEM is not set
# CONFIG_HW_RANDOM_INTEL is not set
# CONFIG_HW_RANDOM_AMD is not set
CONFIG_NVRAM=y
-# CONFIG_R3964 is not set
-# CONFIG_APPLICOM is not set
-
-#
-# PCMCIA character devices
-#
-# CONFIG_SYNCLINK_CS is not set
-# CONFIG_CARDMAN_4000 is not set
-# CONFIG_CARDMAN_4040 is not set
-# CONFIG_IPWIRELESS is not set
-# CONFIG_MWAVE is not set
-# CONFIG_PC8736x_GPIO is not set
-# CONFIG_RAW_DRIVER is not set
CONFIG_HPET=y
# CONFIG_HPET_MMAP is not set
-# CONFIG_HANGCHECK_TIMER is not set
-# CONFIG_TCG_TPM is not set
-# CONFIG_TELCLOCK is not set
-CONFIG_DEVPORT=y
-CONFIG_I2C=y
-CONFIG_I2C_BOARDINFO=y
-# CONFIG_I2C_CHARDEV is not set
-CONFIG_I2C_HELPER_AUTO=y
-CONFIG_I2C_ALGOBIT=y
-
-#
-# I2C Hardware Bus support
-#
-
-#
-# PC SMBus host controller drivers
-#
-# CONFIG_I2C_ALI1535 is not set
-# CONFIG_I2C_ALI1563 is not set
-# CONFIG_I2C_ALI15X3 is not set
-# CONFIG_I2C_AMD756 is not set
-# CONFIG_I2C_AMD8111 is not set
CONFIG_I2C_I801=y
-# CONFIG_I2C_ISCH is not set
-# CONFIG_I2C_PIIX4 is not set
-# CONFIG_I2C_NFORCE2 is not set
-# CONFIG_I2C_SIS5595 is not set
-# CONFIG_I2C_SIS630 is not set
-# CONFIG_I2C_SIS96X is not set
-# CONFIG_I2C_VIA is not set
-# CONFIG_I2C_VIAPRO is not set
-
-#
-# I2C system bus drivers (mostly embedded / system-on-chip)
-#
-# CONFIG_I2C_OCORES is not set
-# CONFIG_I2C_SIMTEC is not set
-
-#
-# External I2C/SMBus adapter drivers
-#
-# CONFIG_I2C_PARPORT_LIGHT is not set
-# CONFIG_I2C_TAOS_EVM is not set
-# CONFIG_I2C_TINY_USB is not set
-
-#
-# Graphics adapter I2C/DDC channel drivers
-#
-# CONFIG_I2C_VOODOO3 is not set
-
-#
-# Other I2C/SMBus bus drivers
-#
-# CONFIG_I2C_PCA_PLATFORM is not set
-# CONFIG_I2C_STUB is not set
-
-#
-# Miscellaneous I2C Chip support
-#
-# CONFIG_DS1682 is not set
-# CONFIG_SENSORS_PCF8574 is not set
-# CONFIG_PCF8575 is not set
-# CONFIG_SENSORS_PCA9539 is not set
-# CONFIG_SENSORS_MAX6875 is not set
-# CONFIG_SENSORS_TSL2550 is not set
-# CONFIG_I2C_DEBUG_CORE is not set
-# CONFIG_I2C_DEBUG_ALGO is not set
-# CONFIG_I2C_DEBUG_BUS is not set
-# CONFIG_I2C_DEBUG_CHIP is not set
-# CONFIG_SPI is not set
-CONFIG_ARCH_WANT_OPTIONAL_GPIOLIB=y
-# CONFIG_GPIOLIB is not set
-# CONFIG_W1 is not set
-CONFIG_POWER_SUPPLY=y
-# CONFIG_POWER_SUPPLY_DEBUG is not set
-# CONFIG_PDA_POWER is not set
-# CONFIG_BATTERY_DS2760 is not set
-# CONFIG_BATTERY_BQ27x00 is not set
-CONFIG_HWMON=y
-# CONFIG_HWMON_VID is not set
-# CONFIG_SENSORS_ABITUGURU is not set
-# CONFIG_SENSORS_ABITUGURU3 is not set
-# CONFIG_SENSORS_AD7414 is not set
-# CONFIG_SENSORS_AD7418 is not set
-# CONFIG_SENSORS_ADM1021 is not set
-# CONFIG_SENSORS_ADM1025 is not set
-# CONFIG_SENSORS_ADM1026 is not set
-# CONFIG_SENSORS_ADM1029 is not set
-# CONFIG_SENSORS_ADM1031 is not set
-# CONFIG_SENSORS_ADM9240 is not set
-# CONFIG_SENSORS_ADT7462 is not set
-# CONFIG_SENSORS_ADT7470 is not set
-# CONFIG_SENSORS_ADT7473 is not set
-# CONFIG_SENSORS_ADT7475 is not set
-# CONFIG_SENSORS_K8TEMP is not set
-# CONFIG_SENSORS_ASB100 is not set
-# CONFIG_SENSORS_ATK0110 is not set
-# CONFIG_SENSORS_ATXP1 is not set
-# CONFIG_SENSORS_DS1621 is not set
-# CONFIG_SENSORS_I5K_AMB is not set
-# CONFIG_SENSORS_F71805F is not set
-# CONFIG_SENSORS_F71882FG is not set
-# CONFIG_SENSORS_F75375S is not set
-# CONFIG_SENSORS_FSCHER is not set
-# CONFIG_SENSORS_FSCPOS is not set
-# CONFIG_SENSORS_FSCHMD is not set
-# CONFIG_SENSORS_G760A is not set
-# CONFIG_SENSORS_GL518SM is not set
-# CONFIG_SENSORS_GL520SM is not set
-# CONFIG_SENSORS_CORETEMP is not set
-# CONFIG_SENSORS_IT87 is not set
-# CONFIG_SENSORS_LM63 is not set
-# CONFIG_SENSORS_LM75 is not set
-# CONFIG_SENSORS_LM77 is not set
-# CONFIG_SENSORS_LM78 is not set
-# CONFIG_SENSORS_LM80 is not set
-# CONFIG_SENSORS_LM83 is not set
-# CONFIG_SENSORS_LM85 is not set
-# CONFIG_SENSORS_LM87 is not set
-# CONFIG_SENSORS_LM90 is not set
-# CONFIG_SENSORS_LM92 is not set
-# CONFIG_SENSORS_LM93 is not set
-# CONFIG_SENSORS_LTC4215 is not set
-# CONFIG_SENSORS_LTC4245 is not set
-# CONFIG_SENSORS_LM95241 is not set
-# CONFIG_SENSORS_MAX1619 is not set
-# CONFIG_SENSORS_MAX6650 is not set
-# CONFIG_SENSORS_PC87360 is not set
-# CONFIG_SENSORS_PC87427 is not set
-# CONFIG_SENSORS_PCF8591 is not set
-# CONFIG_SENSORS_SIS5595 is not set
-# CONFIG_SENSORS_DME1737 is not set
-# CONFIG_SENSORS_SMSC47M1 is not set
-# CONFIG_SENSORS_SMSC47M192 is not set
-# CONFIG_SENSORS_SMSC47B397 is not set
-# CONFIG_SENSORS_ADS7828 is not set
-# CONFIG_SENSORS_THMC50 is not set
-# CONFIG_SENSORS_VIA686A is not set
-# CONFIG_SENSORS_VT1211 is not set
-# CONFIG_SENSORS_VT8231 is not set
-# CONFIG_SENSORS_W83781D is not set
-# CONFIG_SENSORS_W83791D is not set
-# CONFIG_SENSORS_W83792D is not set
-# CONFIG_SENSORS_W83793 is not set
-# CONFIG_SENSORS_W83L785TS is not set
-# CONFIG_SENSORS_W83L786NG is not set
-# CONFIG_SENSORS_W83627HF is not set
-# CONFIG_SENSORS_W83627EHF is not set
-# CONFIG_SENSORS_HDAPS is not set
-# CONFIG_SENSORS_LIS3LV02D is not set
-# CONFIG_SENSORS_APPLESMC is not set
-# CONFIG_HWMON_DEBUG_CHIP is not set
-CONFIG_THERMAL=y
-# CONFIG_THERMAL_HWMON is not set
CONFIG_WATCHDOG=y
-# CONFIG_WATCHDOG_NOWAYOUT is not set
-
-#
-# Watchdog Device Drivers
-#
-# CONFIG_SOFT_WATCHDOG is not set
-# CONFIG_ACQUIRE_WDT is not set
-# CONFIG_ADVANTECH_WDT is not set
-# CONFIG_ALIM1535_WDT is not set
-# CONFIG_ALIM7101_WDT is not set
-# CONFIG_SC520_WDT is not set
-# CONFIG_EUROTECH_WDT is not set
-# CONFIG_IB700_WDT is not set
-# CONFIG_IBMASR is not set
-# CONFIG_WAFER_WDT is not set
-# CONFIG_I6300ESB_WDT is not set
-# CONFIG_ITCO_WDT is not set
-# CONFIG_IT8712F_WDT is not set
-# CONFIG_IT87_WDT is not set
-# CONFIG_HP_WATCHDOG is not set
-# CONFIG_SC1200_WDT is not set
-# CONFIG_PC87413_WDT is not set
-# CONFIG_60XX_WDT is not set
-# CONFIG_SBC8360_WDT is not set
-# CONFIG_CPU5_WDT is not set
-# CONFIG_SMSC_SCH311X_WDT is not set
-# CONFIG_SMSC37B787_WDT is not set
-# CONFIG_W83627HF_WDT is not set
-# CONFIG_W83697HF_WDT is not set
-# CONFIG_W83697UG_WDT is not set
-# CONFIG_W83877F_WDT is not set
-# CONFIG_W83977F_WDT is not set
-# CONFIG_MACHZ_WDT is not set
-# CONFIG_SBC_EPX_C3_WATCHDOG is not set
-
-#
-# PCI-based Watchdog Cards
-#
-# CONFIG_PCIPCWATCHDOG is not set
-# CONFIG_WDTPCI is not set
-
-#
-# USB-based Watchdog Cards
-#
-# CONFIG_USBPCWATCHDOG is not set
-CONFIG_SSB_POSSIBLE=y
-
-#
-# Sonics Silicon Backplane
-#
-# CONFIG_SSB is not set
-
-#
-# Multifunction device drivers
-#
-# CONFIG_MFD_CORE is not set
-# CONFIG_MFD_SM501 is not set
-# CONFIG_HTC_PASIC3 is not set
-# CONFIG_TWL4030_CORE is not set
-# CONFIG_MFD_TMIO is not set
-# CONFIG_PMIC_DA903X is not set
-# CONFIG_MFD_WM8400 is not set
-# CONFIG_MFD_WM8350_I2C is not set
-# CONFIG_MFD_PCF50633 is not set
-# CONFIG_REGULATOR is not set
-
-#
-# Multimedia devices
-#
-
-#
-# Multimedia core support
-#
-# CONFIG_VIDEO_DEV is not set
-# CONFIG_DVB_CORE is not set
-# CONFIG_VIDEO_MEDIA is not set
-
-#
-# Multimedia drivers
-#
-CONFIG_DAB=y
-# CONFIG_USB_DABUSB is not set
-
-#
-# Graphics support
-#
CONFIG_AGP=y
CONFIG_AGP_AMD64=y
CONFIG_AGP_INTEL=y
-# CONFIG_AGP_SIS is not set
-# CONFIG_AGP_VIA is not set
CONFIG_DRM=y
-# CONFIG_DRM_TDFX is not set
-# CONFIG_DRM_R128 is not set
-# CONFIG_DRM_RADEON is not set
-# CONFIG_DRM_I810 is not set
-# CONFIG_DRM_I830 is not set
CONFIG_DRM_I915=y
-CONFIG_DRM_I915_KMS=y
-# CONFIG_DRM_MGA is not set
-# CONFIG_DRM_SIS is not set
-# CONFIG_DRM_VIA is not set
-# CONFIG_DRM_SAVAGE is not set
-# CONFIG_VGASTATE is not set
-# CONFIG_VIDEO_OUTPUT_CONTROL is not set
-CONFIG_FB=y
-# CONFIG_FIRMWARE_EDID is not set
-# CONFIG_FB_DDC is not set
-# CONFIG_FB_BOOT_VESA_SUPPORT is not set
-CONFIG_FB_CFB_FILLRECT=y
-CONFIG_FB_CFB_COPYAREA=y
-CONFIG_FB_CFB_IMAGEBLIT=y
-# CONFIG_FB_CFB_REV_PIXELS_IN_BYTE is not set
-# CONFIG_FB_SYS_FILLRECT is not set
-# CONFIG_FB_SYS_COPYAREA is not set
-# CONFIG_FB_SYS_IMAGEBLIT is not set
-# CONFIG_FB_FOREIGN_ENDIAN is not set
-# CONFIG_FB_SYS_FOPS is not set
-# CONFIG_FB_SVGALIB is not set
-# CONFIG_FB_MACMODES is not set
-# CONFIG_FB_BACKLIGHT is not set
-CONFIG_FB_MODE_HELPERS=y
-CONFIG_FB_TILEBLITTING=y
-
-#
-# Frame buffer hardware drivers
-#
-# CONFIG_FB_CIRRUS is not set
-# CONFIG_FB_PM2 is not set
-# CONFIG_FB_CYBER2000 is not set
-# CONFIG_FB_ARC is not set
-# CONFIG_FB_ASILIANT is not set
-# CONFIG_FB_IMSTT is not set
-# CONFIG_FB_VGA16 is not set
-# CONFIG_FB_UVESA is not set
-# CONFIG_FB_VESA is not set
-CONFIG_FB_EFI=y
-# CONFIG_FB_N411 is not set
-# CONFIG_FB_HGA is not set
-# CONFIG_FB_S1D13XXX is not set
-# CONFIG_FB_NVIDIA is not set
-# CONFIG_FB_RIVA is not set
-# CONFIG_FB_LE80578 is not set
-# CONFIG_FB_INTEL is not set
-# CONFIG_FB_MATROX is not set
-# CONFIG_FB_RADEON is not set
-# CONFIG_FB_ATY128 is not set
-# CONFIG_FB_ATY is not set
-# CONFIG_FB_S3 is not set
-# CONFIG_FB_SAVAGE is not set
-# CONFIG_FB_SIS is not set
-# CONFIG_FB_VIA is not set
-# CONFIG_FB_NEOMAGIC is not set
-# CONFIG_FB_KYRO is not set
-# CONFIG_FB_3DFX is not set
-# CONFIG_FB_VOODOO1 is not set
-# CONFIG_FB_VT8623 is not set
-# CONFIG_FB_TRIDENT is not set
-# CONFIG_FB_ARK is not set
-# CONFIG_FB_PM3 is not set
-# CONFIG_FB_CARMINE is not set
-# CONFIG_FB_GEODE is not set
-# CONFIG_FB_VIRTUAL is not set
-# CONFIG_FB_METRONOME is not set
-# CONFIG_FB_MB862XX is not set
-# CONFIG_FB_BROADSHEET is not set
-CONFIG_BACKLIGHT_LCD_SUPPORT=y
-# CONFIG_LCD_CLASS_DEVICE is not set
-CONFIG_BACKLIGHT_CLASS_DEVICE=y
-CONFIG_BACKLIGHT_GENERIC=y
-# CONFIG_BACKLIGHT_PROGEAR is not set
-# CONFIG_BACKLIGHT_MBP_NVIDIA is not set
-# CONFIG_BACKLIGHT_SAHARA is not set
-
-#
-# Display device support
-#
-# CONFIG_DISPLAY_SUPPORT is not set
-
-#
-# Console display driver support
-#
-CONFIG_VGA_CONSOLE=y
-CONFIG_VGACON_SOFT_SCROLLBACK=y
-CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64
-CONFIG_DUMMY_CONSOLE=y
-# CONFIG_FRAMEBUFFER_CONSOLE is not set
-CONFIG_LOGO=y
-# CONFIG_LOGO_LINUX_MONO is not set
-# CONFIG_LOGO_LINUX_VGA16 is not set
-CONFIG_LOGO_LINUX_CLUT224=y
+CONFIG_DRM_VIRTIO_GPU=y
CONFIG_SOUND=y
-CONFIG_SOUND_OSS_CORE=y
CONFIG_SND=y
-CONFIG_SND_TIMER=y
-CONFIG_SND_PCM=y
-CONFIG_SND_HWDEP=y
-CONFIG_SND_JACK=y
+CONFIG_SND_HRTIMER=y
CONFIG_SND_SEQUENCER=y
CONFIG_SND_SEQ_DUMMY=y
-CONFIG_SND_OSSEMUL=y
-CONFIG_SND_MIXER_OSS=y
-CONFIG_SND_PCM_OSS=y
-CONFIG_SND_PCM_OSS_PLUGINS=y
-CONFIG_SND_SEQUENCER_OSS=y
-CONFIG_SND_HRTIMER=y
-CONFIG_SND_SEQ_HRTIMER_DEFAULT=y
-CONFIG_SND_DYNAMIC_MINORS=y
-CONFIG_SND_SUPPORT_OLD_API=y
-CONFIG_SND_VERBOSE_PROCFS=y
-# CONFIG_SND_VERBOSE_PRINTK is not set
-# CONFIG_SND_DEBUG is not set
-CONFIG_SND_VMASTER=y
-CONFIG_SND_DRIVERS=y
-# CONFIG_SND_PCSP is not set
-# CONFIG_SND_DUMMY is not set
-# CONFIG_SND_VIRMIDI is not set
-# CONFIG_SND_MTPAV is not set
-# CONFIG_SND_SERIAL_U16550 is not set
-# CONFIG_SND_MPU401 is not set
-CONFIG_SND_PCI=y
-# CONFIG_SND_AD1889 is not set
-# CONFIG_SND_ALS300 is not set
-# CONFIG_SND_ALS4000 is not set
-# CONFIG_SND_ALI5451 is not set
-# CONFIG_SND_ATIIXP is not set
-# CONFIG_SND_ATIIXP_MODEM is not set
-# CONFIG_SND_AU8810 is not set
-# CONFIG_SND_AU8820 is not set
-# CONFIG_SND_AU8830 is not set
-# CONFIG_SND_AW2 is not set
-# CONFIG_SND_AZT3328 is not set
-# CONFIG_SND_BT87X is not set
-# CONFIG_SND_CA0106 is not set
-# CONFIG_SND_CMIPCI is not set
-# CONFIG_SND_OXYGEN is not set
-# CONFIG_SND_CS4281 is not set
-# CONFIG_SND_CS46XX is not set
-# CONFIG_SND_CS5530 is not set
-# CONFIG_SND_DARLA20 is not set
-# CONFIG_SND_GINA20 is not set
-# CONFIG_SND_LAYLA20 is not set
-# CONFIG_SND_DARLA24 is not set
-# CONFIG_SND_GINA24 is not set
-# CONFIG_SND_LAYLA24 is not set
-# CONFIG_SND_MONA is not set
-# CONFIG_SND_MIA is not set
-# CONFIG_SND_ECHO3G is not set
-# CONFIG_SND_INDIGO is not set
-# CONFIG_SND_INDIGOIO is not set
-# CONFIG_SND_INDIGODJ is not set
-# CONFIG_SND_INDIGOIOX is not set
-# CONFIG_SND_INDIGODJX is not set
-# CONFIG_SND_EMU10K1 is not set
-# CONFIG_SND_EMU10K1X is not set
-# CONFIG_SND_ENS1370 is not set
-# CONFIG_SND_ENS1371 is not set
-# CONFIG_SND_ES1938 is not set
-# CONFIG_SND_ES1968 is not set
-# CONFIG_SND_FM801 is not set
CONFIG_SND_HDA_INTEL=y
CONFIG_SND_HDA_HWDEP=y
-# CONFIG_SND_HDA_RECONFIG is not set
-# CONFIG_SND_HDA_INPUT_BEEP is not set
-CONFIG_SND_HDA_CODEC_REALTEK=y
-CONFIG_SND_HDA_CODEC_ANALOG=y
-CONFIG_SND_HDA_CODEC_SIGMATEL=y
-CONFIG_SND_HDA_CODEC_VIA=y
-CONFIG_SND_HDA_CODEC_ATIHDMI=y
-CONFIG_SND_HDA_CODEC_NVHDMI=y
-CONFIG_SND_HDA_CODEC_INTELHDMI=y
-CONFIG_SND_HDA_ELD=y
-CONFIG_SND_HDA_CODEC_CONEXANT=y
-CONFIG_SND_HDA_CODEC_CMEDIA=y
-CONFIG_SND_HDA_CODEC_SI3054=y
-CONFIG_SND_HDA_GENERIC=y
-# CONFIG_SND_HDA_POWER_SAVE is not set
-# CONFIG_SND_HDSP is not set
-# CONFIG_SND_HDSPM is not set
-# CONFIG_SND_HIFIER is not set
-# CONFIG_SND_ICE1712 is not set
-# CONFIG_SND_ICE1724 is not set
-# CONFIG_SND_INTEL8X0 is not set
-# CONFIG_SND_INTEL8X0M is not set
-# CONFIG_SND_KORG1212 is not set
-# CONFIG_SND_MAESTRO3 is not set
-# CONFIG_SND_MIXART is not set
-# CONFIG_SND_NM256 is not set
-# CONFIG_SND_PCXHR is not set
-# CONFIG_SND_RIPTIDE is not set
-# CONFIG_SND_RME32 is not set
-# CONFIG_SND_RME96 is not set
-# CONFIG_SND_RME9652 is not set
-# CONFIG_SND_SONICVIBES is not set
-# CONFIG_SND_TRIDENT is not set
-# CONFIG_SND_VIA82XX is not set
-# CONFIG_SND_VIA82XX_MODEM is not set
-# CONFIG_SND_VIRTUOSO is not set
-# CONFIG_SND_VX222 is not set
-# CONFIG_SND_YMFPCI is not set
-CONFIG_SND_USB=y
-# CONFIG_SND_USB_AUDIO is not set
-# CONFIG_SND_USB_USX2Y is not set
-# CONFIG_SND_USB_CAIAQ is not set
-# CONFIG_SND_USB_US122L is not set
-CONFIG_SND_PCMCIA=y
-# CONFIG_SND_VXPOCKET is not set
-# CONFIG_SND_PDAUDIOCF is not set
-# CONFIG_SND_SOC is not set
-# CONFIG_SOUND_PRIME is not set
-CONFIG_HID_SUPPORT=y
-CONFIG_HID=y
-CONFIG_HID_DEBUG=y
CONFIG_HIDRAW=y
-
-#
-# USB Input Devices
-#
-CONFIG_USB_HID=y
-CONFIG_HID_PID=y
-CONFIG_USB_HIDDEV=y
-
-#
-# Special HID drivers
-#
-CONFIG_HID_A4TECH=y
-CONFIG_HID_APPLE=y
-CONFIG_HID_BELKIN=y
-CONFIG_HID_CHERRY=y
-CONFIG_HID_CHICONY=y
-CONFIG_HID_CYPRESS=y
-# CONFIG_DRAGONRISE_FF is not set
-CONFIG_HID_EZKEY=y
-CONFIG_HID_KYE=y
CONFIG_HID_GYRATION=y
-CONFIG_HID_KENSINGTON=y
-CONFIG_HID_LOGITECH=y
-CONFIG_LOGITECH_FF=y
-# CONFIG_LOGIRUMBLEPAD2_FF is not set
-CONFIG_HID_MICROSOFT=y
-CONFIG_HID_MONTEREY=y
CONFIG_HID_NTRIG=y
CONFIG_HID_PANTHERLORD=y
CONFIG_PANTHERLORD_FF=y
@@ -1849,697 +209,64 @@ CONFIG_HID_PETALYNX=y
CONFIG_HID_SAMSUNG=y
CONFIG_HID_SONY=y
CONFIG_HID_SUNPLUS=y
-# CONFIG_GREENASIA_FF is not set
CONFIG_HID_TOPSEED=y
-CONFIG_THRUSTMASTER_FF=y
-CONFIG_ZEROPLUS_FF=y
-CONFIG_USB_SUPPORT=y
-CONFIG_USB_ARCH_HAS_HCD=y
-CONFIG_USB_ARCH_HAS_OHCI=y
-CONFIG_USB_ARCH_HAS_EHCI=y
+CONFIG_HID_PID=y
+CONFIG_USB_HIDDEV=y
CONFIG_USB=y
-CONFIG_USB_DEBUG=y
CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
-
-#
-# Miscellaneous USB options
-#
-CONFIG_USB_DEVICEFS=y
-# CONFIG_USB_DEVICE_CLASS is not set
-# CONFIG_USB_DYNAMIC_MINORS is not set
-CONFIG_USB_SUSPEND=y
-# CONFIG_USB_OTG is not set
CONFIG_USB_MON=y
-# CONFIG_USB_WUSB is not set
-# CONFIG_USB_WUSB_CBAF is not set
-
-#
-# USB Host Controller Drivers
-#
-# CONFIG_USB_C67X00_HCD is not set
+CONFIG_USB_XHCI_HCD=y
CONFIG_USB_EHCI_HCD=y
-# CONFIG_USB_EHCI_ROOT_HUB_TT is not set
-# CONFIG_USB_EHCI_TT_NEWSCHED is not set
-# CONFIG_USB_OXU210HP_HCD is not set
-# CONFIG_USB_ISP116X_HCD is not set
-# CONFIG_USB_ISP1760_HCD is not set
CONFIG_USB_OHCI_HCD=y
-# CONFIG_USB_OHCI_BIG_ENDIAN_DESC is not set
-# CONFIG_USB_OHCI_BIG_ENDIAN_MMIO is not set
-CONFIG_USB_OHCI_LITTLE_ENDIAN=y
CONFIG_USB_UHCI_HCD=y
-# CONFIG_USB_SL811_HCD is not set
-# CONFIG_USB_R8A66597_HCD is not set
-# CONFIG_USB_WHCI_HCD is not set
-# CONFIG_USB_HWA_HCD is not set
-
-#
-# USB Device Class drivers
-#
-# CONFIG_USB_ACM is not set
CONFIG_USB_PRINTER=y
-# CONFIG_USB_WDM is not set
-# CONFIG_USB_TMC is not set
-
-#
-# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may
-#
-
-#
-# also be needed; see USB_STORAGE Help for more info
-#
CONFIG_USB_STORAGE=y
-# CONFIG_USB_STORAGE_DEBUG is not set
-# CONFIG_USB_STORAGE_DATAFAB is not set
-# CONFIG_USB_STORAGE_FREECOM is not set
-# CONFIG_USB_STORAGE_ISD200 is not set
-# CONFIG_USB_STORAGE_USBAT is not set
-# CONFIG_USB_STORAGE_SDDR09 is not set
-# CONFIG_USB_STORAGE_SDDR55 is not set
-# CONFIG_USB_STORAGE_JUMPSHOT is not set
-# CONFIG_USB_STORAGE_ALAUDA is not set
-# CONFIG_USB_STORAGE_ONETOUCH is not set
-# CONFIG_USB_STORAGE_KARMA is not set
-# CONFIG_USB_STORAGE_CYPRESS_ATACB is not set
-CONFIG_USB_LIBUSUAL=y
-
-#
-# USB Imaging devices
-#
-# CONFIG_USB_MDC800 is not set
-# CONFIG_USB_MICROTEK is not set
-
-#
-# USB port drivers
-#
-# CONFIG_USB_SERIAL is not set
-
-#
-# USB Miscellaneous drivers
-#
-# CONFIG_USB_EMI62 is not set
-# CONFIG_USB_EMI26 is not set
-# CONFIG_USB_ADUTUX is not set
-# CONFIG_USB_SEVSEG is not set
-# CONFIG_USB_RIO500 is not set
-# CONFIG_USB_LEGOTOWER is not set
-# CONFIG_USB_LCD is not set
-# CONFIG_USB_BERRY_CHARGE is not set
-# CONFIG_USB_LED is not set
-# CONFIG_USB_CYPRESS_CY7C63 is not set
-# CONFIG_USB_CYTHERM is not set
-# CONFIG_USB_IDMOUSE is not set
-# CONFIG_USB_FTDI_ELAN is not set
-# CONFIG_USB_APPLEDISPLAY is not set
-# CONFIG_USB_SISUSBVGA is not set
-# CONFIG_USB_LD is not set
-# CONFIG_USB_TRANCEVIBRATOR is not set
-# CONFIG_USB_IOWARRIOR is not set
-# CONFIG_USB_TEST is not set
-# CONFIG_USB_ISIGHTFW is not set
-# CONFIG_USB_VST is not set
-# CONFIG_USB_GADGET is not set
-
-#
-# OTG and related infrastructure
-#
-# CONFIG_NOP_USB_XCEIV is not set
-# CONFIG_UWB is not set
-# CONFIG_MMC is not set
-# CONFIG_MEMSTICK is not set
-CONFIG_NEW_LEDS=y
-CONFIG_LEDS_CLASS=y
-
-#
-# LED drivers
-#
-# CONFIG_LEDS_ALIX2 is not set
-# CONFIG_LEDS_PCA9532 is not set
-# CONFIG_LEDS_LP5521 is not set
-# CONFIG_LEDS_CLEVO_MAIL is not set
-# CONFIG_LEDS_PCA955X is not set
-# CONFIG_LEDS_BD2802 is not set
-
-#
-# LED Triggers
-#
-CONFIG_LEDS_TRIGGERS=y
-# CONFIG_LEDS_TRIGGER_TIMER is not set
-# CONFIG_LEDS_TRIGGER_HEARTBEAT is not set
-# CONFIG_LEDS_TRIGGER_BACKLIGHT is not set
-# CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set
-
-#
-# iptables trigger is under Netfilter config (LED target)
-#
-# CONFIG_ACCESSIBILITY is not set
-# CONFIG_INFINIBAND is not set
-CONFIG_EDAC=y
-
-#
-# Reporting subsystems
-#
-# CONFIG_EDAC_DEBUG is not set
-# CONFIG_EDAC_MM_EDAC is not set
-CONFIG_RTC_LIB=y
CONFIG_RTC_CLASS=y
# CONFIG_RTC_HCTOSYS is not set
-# CONFIG_RTC_DEBUG is not set
-
-#
-# RTC interfaces
-#
-CONFIG_RTC_INTF_SYSFS=y
-CONFIG_RTC_INTF_PROC=y
-CONFIG_RTC_INTF_DEV=y
-# CONFIG_RTC_INTF_DEV_UIE_EMUL is not set
-# CONFIG_RTC_DRV_TEST is not set
-
-#
-# I2C RTC drivers
-#
-# CONFIG_RTC_DRV_DS1307 is not set
-# CONFIG_RTC_DRV_DS1374 is not set
-# CONFIG_RTC_DRV_DS1672 is not set
-# CONFIG_RTC_DRV_MAX6900 is not set
-# CONFIG_RTC_DRV_RS5C372 is not set
-# CONFIG_RTC_DRV_ISL1208 is not set
-# CONFIG_RTC_DRV_X1205 is not set
-# CONFIG_RTC_DRV_PCF8563 is not set
-# CONFIG_RTC_DRV_PCF8583 is not set
-# CONFIG_RTC_DRV_M41T80 is not set
-# CONFIG_RTC_DRV_S35390A is not set
-# CONFIG_RTC_DRV_FM3130 is not set
-# CONFIG_RTC_DRV_RX8581 is not set
-
-#
-# SPI RTC drivers
-#
-
-#
-# Platform RTC drivers
-#
-CONFIG_RTC_DRV_CMOS=y
-# CONFIG_RTC_DRV_DS1286 is not set
-# CONFIG_RTC_DRV_DS1511 is not set
-# CONFIG_RTC_DRV_DS1553 is not set
-# CONFIG_RTC_DRV_DS1742 is not set
-# CONFIG_RTC_DRV_STK17TA8 is not set
-# CONFIG_RTC_DRV_M48T86 is not set
-# CONFIG_RTC_DRV_M48T35 is not set
-# CONFIG_RTC_DRV_M48T59 is not set
-# CONFIG_RTC_DRV_BQ4802 is not set
-# CONFIG_RTC_DRV_V3020 is not set
-
-#
-# on-CPU RTC drivers
-#
CONFIG_DMADEVICES=y
-
-#
-# DMA Devices
-#
-# CONFIG_INTEL_IOATDMA is not set
-# CONFIG_AUXDISPLAY is not set
-# CONFIG_UIO is not set
-# CONFIG_STAGING is not set
-CONFIG_X86_PLATFORM_DEVICES=y
-# CONFIG_ACER_WMI is not set
-# CONFIG_ASUS_LAPTOP is not set
-# CONFIG_FUJITSU_LAPTOP is not set
-# CONFIG_MSI_LAPTOP is not set
-# CONFIG_PANASONIC_LAPTOP is not set
-# CONFIG_COMPAL_LAPTOP is not set
-# CONFIG_SONY_LAPTOP is not set
-# CONFIG_THINKPAD_ACPI is not set
-# CONFIG_INTEL_MENLOW is not set
+CONFIG_VIRTIO_PCI=y
+CONFIG_VIRTIO_INPUT=y
CONFIG_EEEPC_LAPTOP=y
-# CONFIG_ACPI_WMI is not set
-# CONFIG_ACPI_ASUS is not set
-# CONFIG_ACPI_TOSHIBA is not set
-
-#
-# Firmware Drivers
-#
-# CONFIG_EDD is not set
-CONFIG_FIRMWARE_MEMMAP=y
-CONFIG_EFI_VARS=y
-# CONFIG_DELL_RBU is not set
-# CONFIG_DCDBAS is not set
-CONFIG_DMIID=y
-# CONFIG_ISCSI_IBFT_FIND is not set
-
-#
-# File systems
-#
-# CONFIG_EXT2_FS is not set
-CONFIG_EXT3_FS=y
-# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
-CONFIG_EXT3_FS_XATTR=y
-CONFIG_EXT3_FS_POSIX_ACL=y
-CONFIG_EXT3_FS_SECURITY=y
-# CONFIG_EXT4_FS is not set
-CONFIG_JBD=y
-# CONFIG_JBD_DEBUG is not set
-CONFIG_FS_MBCACHE=y
-# CONFIG_REISERFS_FS is not set
-# CONFIG_JFS_FS is not set
-CONFIG_FS_POSIX_ACL=y
-CONFIG_FILE_LOCKING=y
-# CONFIG_XFS_FS is not set
-# CONFIG_GFS2_FS is not set
-# CONFIG_OCFS2_FS is not set
-# CONFIG_BTRFS_FS is not set
-CONFIG_DNOTIFY=y
-CONFIG_INOTIFY=y
-CONFIG_INOTIFY_USER=y
+CONFIG_AMD_IOMMU=y
+CONFIG_INTEL_IOMMU=y
+# CONFIG_INTEL_IOMMU_DEFAULT_ON is not set
+CONFIG_EXT4_FS=y
+CONFIG_EXT4_FS_POSIX_ACL=y
+CONFIG_EXT4_FS_SECURITY=y
CONFIG_QUOTA=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
-# CONFIG_PRINT_QUOTA_WARNING is not set
-CONFIG_QUOTA_TREE=y
-# CONFIG_QFMT_V1 is not set
CONFIG_QFMT_V2=y
-CONFIG_QUOTACTL=y
-# CONFIG_AUTOFS_FS is not set
-CONFIG_AUTOFS4_FS=y
-# CONFIG_FUSE_FS is not set
-CONFIG_GENERIC_ACL=y
-
-#
-# Caches
-#
-# CONFIG_FSCACHE is not set
-
-#
-# CD-ROM/DVD Filesystems
-#
+CONFIG_AUTOFS_FS=y
CONFIG_ISO9660_FS=y
CONFIG_JOLIET=y
CONFIG_ZISOFS=y
-# CONFIG_UDF_FS is not set
-
-#
-# DOS/FAT/NT Filesystems
-#
-CONFIG_FAT_FS=y
CONFIG_MSDOS_FS=y
CONFIG_VFAT_FS=y
-CONFIG_FAT_DEFAULT_CODEPAGE=437
-CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1"
-# CONFIG_NTFS_FS is not set
-
-#
-# Pseudo filesystems
-#
-CONFIG_PROC_FS=y
CONFIG_PROC_KCORE=y
-CONFIG_PROC_VMCORE=y
-CONFIG_PROC_SYSCTL=y
-CONFIG_PROC_PAGE_MONITOR=y
-CONFIG_SYSFS=y
-CONFIG_TMPFS=y
CONFIG_TMPFS_POSIX_ACL=y
CONFIG_HUGETLBFS=y
-CONFIG_HUGETLB_PAGE=y
-# CONFIG_CONFIGFS_FS is not set
-CONFIG_MISC_FILESYSTEMS=y
-# CONFIG_ADFS_FS is not set
-# CONFIG_AFFS_FS is not set
-# CONFIG_ECRYPT_FS is not set
-# CONFIG_HFS_FS is not set
-# CONFIG_HFSPLUS_FS is not set
-# CONFIG_BEFS_FS is not set
-# CONFIG_BFS_FS is not set
-# CONFIG_EFS_FS is not set
-# CONFIG_CRAMFS is not set
-# CONFIG_SQUASHFS is not set
-# CONFIG_VXFS_FS is not set
-# CONFIG_MINIX_FS is not set
-# CONFIG_OMFS_FS is not set
-# CONFIG_HPFS_FS is not set
-# CONFIG_QNX4FS_FS is not set
-# CONFIG_ROMFS_FS is not set
-# CONFIG_SYSV_FS is not set
-# CONFIG_UFS_FS is not set
-# CONFIG_NILFS2_FS is not set
-CONFIG_NETWORK_FILESYSTEMS=y
CONFIG_NFS_FS=y
-CONFIG_NFS_V3=y
CONFIG_NFS_V3_ACL=y
CONFIG_NFS_V4=y
CONFIG_ROOT_NFS=y
-# CONFIG_NFSD is not set
-CONFIG_LOCKD=y
-CONFIG_LOCKD_V4=y
-CONFIG_NFS_ACL_SUPPORT=y
-CONFIG_NFS_COMMON=y
-CONFIG_SUNRPC=y
-CONFIG_SUNRPC_GSS=y
-CONFIG_RPCSEC_GSS_KRB5=y
-# CONFIG_RPCSEC_GSS_SPKM3 is not set
-# CONFIG_SMB_FS is not set
-# CONFIG_CIFS is not set
-# CONFIG_NCP_FS is not set
-# CONFIG_CODA_FS is not set
-# CONFIG_AFS_FS is not set
-
-#
-# Partition Types
-#
-CONFIG_PARTITION_ADVANCED=y
-# CONFIG_ACORN_PARTITION is not set
-CONFIG_OSF_PARTITION=y
-CONFIG_AMIGA_PARTITION=y
-# CONFIG_ATARI_PARTITION is not set
-CONFIG_MAC_PARTITION=y
-CONFIG_MSDOS_PARTITION=y
-CONFIG_BSD_DISKLABEL=y
-CONFIG_MINIX_SUBPARTITION=y
-CONFIG_SOLARIS_X86_PARTITION=y
-CONFIG_UNIXWARE_DISKLABEL=y
-# CONFIG_LDM_PARTITION is not set
-CONFIG_SGI_PARTITION=y
-# CONFIG_ULTRIX_PARTITION is not set
-CONFIG_SUN_PARTITION=y
-CONFIG_KARMA_PARTITION=y
-CONFIG_EFI_PARTITION=y
-# CONFIG_SYSV68_PARTITION is not set
-CONFIG_NLS=y
+CONFIG_9P_FS=y
CONFIG_NLS_DEFAULT="utf8"
CONFIG_NLS_CODEPAGE_437=y
-# CONFIG_NLS_CODEPAGE_737 is not set
-# CONFIG_NLS_CODEPAGE_775 is not set
-# CONFIG_NLS_CODEPAGE_850 is not set
-# CONFIG_NLS_CODEPAGE_852 is not set
-# CONFIG_NLS_CODEPAGE_855 is not set
-# CONFIG_NLS_CODEPAGE_857 is not set
-# CONFIG_NLS_CODEPAGE_860 is not set
-# CONFIG_NLS_CODEPAGE_861 is not set
-# CONFIG_NLS_CODEPAGE_862 is not set
-# CONFIG_NLS_CODEPAGE_863 is not set
-# CONFIG_NLS_CODEPAGE_864 is not set
-# CONFIG_NLS_CODEPAGE_865 is not set
-# CONFIG_NLS_CODEPAGE_866 is not set
-# CONFIG_NLS_CODEPAGE_869 is not set
-# CONFIG_NLS_CODEPAGE_936 is not set
-# CONFIG_NLS_CODEPAGE_950 is not set
-# CONFIG_NLS_CODEPAGE_932 is not set
-# CONFIG_NLS_CODEPAGE_949 is not set
-# CONFIG_NLS_CODEPAGE_874 is not set
-# CONFIG_NLS_ISO8859_8 is not set
-# CONFIG_NLS_CODEPAGE_1250 is not set
-# CONFIG_NLS_CODEPAGE_1251 is not set
CONFIG_NLS_ASCII=y
CONFIG_NLS_ISO8859_1=y
-# CONFIG_NLS_ISO8859_2 is not set
-# CONFIG_NLS_ISO8859_3 is not set
-# CONFIG_NLS_ISO8859_4 is not set
-# CONFIG_NLS_ISO8859_5 is not set
-# CONFIG_NLS_ISO8859_6 is not set
-# CONFIG_NLS_ISO8859_7 is not set
-# CONFIG_NLS_ISO8859_9 is not set
-# CONFIG_NLS_ISO8859_13 is not set
-# CONFIG_NLS_ISO8859_14 is not set
-# CONFIG_NLS_ISO8859_15 is not set
-# CONFIG_NLS_KOI8_R is not set
-# CONFIG_NLS_KOI8_U is not set
CONFIG_NLS_UTF8=y
-# CONFIG_DLM is not set
-
-#
-# Kernel hacking
-#
-CONFIG_TRACE_IRQFLAGS_SUPPORT=y
+CONFIG_SECURITY=y
+CONFIG_SECURITY_NETWORK=y
+CONFIG_SECURITY_SELINUX=y
+CONFIG_SECURITY_SELINUX_BOOTPARAM=y
CONFIG_PRINTK_TIME=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
-CONFIG_ENABLE_MUST_CHECK=y
-CONFIG_FRAME_WARN=2048
-CONFIG_MAGIC_SYSRQ=y
-# CONFIG_UNUSED_SYMBOLS is not set
-CONFIG_DEBUG_FS=y
-# CONFIG_HEADERS_CHECK is not set
CONFIG_DEBUG_KERNEL=y
-# CONFIG_DEBUG_SHIRQ is not set
-# CONFIG_DETECT_SOFTLOCKUP is not set
-# CONFIG_DETECT_HUNG_TASK is not set
-# CONFIG_SCHED_DEBUG is not set
+CONFIG_MAGIC_SYSRQ=y
+CONFIG_DEBUG_WX=y
+CONFIG_DEBUG_STACK_USAGE=y
CONFIG_SCHEDSTATS=y
-CONFIG_TIMER_STATS=y
-# CONFIG_DEBUG_OBJECTS is not set
-# CONFIG_SLUB_DEBUG_ON is not set
-# CONFIG_SLUB_STATS is not set
-# CONFIG_DEBUG_RT_MUTEXES is not set
-# CONFIG_RT_MUTEX_TESTER is not set
-# CONFIG_DEBUG_SPINLOCK is not set
-# CONFIG_DEBUG_MUTEXES is not set
-# CONFIG_DEBUG_LOCK_ALLOC is not set
-# CONFIG_PROVE_LOCKING is not set
-# CONFIG_LOCK_STAT is not set
-# CONFIG_DEBUG_SPINLOCK_SLEEP is not set
-# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
-CONFIG_STACKTRACE=y
-# CONFIG_DEBUG_KOBJECT is not set
-CONFIG_DEBUG_BUGVERBOSE=y
-# CONFIG_DEBUG_INFO is not set
-# CONFIG_DEBUG_VM is not set
-# CONFIG_DEBUG_VIRTUAL is not set
-# CONFIG_DEBUG_WRITECOUNT is not set
-CONFIG_DEBUG_MEMORY_INIT=y
-# CONFIG_DEBUG_LIST is not set
-# CONFIG_DEBUG_SG is not set
-# CONFIG_DEBUG_NOTIFIERS is not set
-CONFIG_ARCH_WANT_FRAME_POINTERS=y
-CONFIG_FRAME_POINTER=y
-# CONFIG_BOOT_PRINTK_DELAY is not set
-# CONFIG_RCU_TORTURE_TEST is not set
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
-# CONFIG_KPROBES_SANITY_TEST is not set
-# CONFIG_BACKTRACE_SELF_TEST is not set
-# CONFIG_DEBUG_BLOCK_EXT_DEVT is not set
-# CONFIG_LKDTM is not set
-# CONFIG_FAULT_INJECTION is not set
-# CONFIG_LATENCYTOP is not set
-CONFIG_SYSCTL_SYSCALL_CHECK=y
-# CONFIG_DEBUG_PAGEALLOC is not set
-CONFIG_USER_STACKTRACE_SUPPORT=y
-CONFIG_NOP_TRACER=y
-CONFIG_HAVE_FUNCTION_TRACER=y
-CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y
-CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
-CONFIG_HAVE_DYNAMIC_FTRACE=y
-CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
-CONFIG_HAVE_HW_BRANCH_TRACER=y
-CONFIG_HAVE_FTRACE_SYSCALLS=y
-CONFIG_RING_BUFFER=y
-CONFIG_TRACING=y
-CONFIG_TRACING_SUPPORT=y
-
-#
-# Tracers
-#
-# CONFIG_FUNCTION_TRACER is not set
-# CONFIG_IRQSOFF_TRACER is not set
-# CONFIG_SYSPROF_TRACER is not set
-# CONFIG_SCHED_TRACER is not set
-# CONFIG_CONTEXT_SWITCH_TRACER is not set
-# CONFIG_EVENT_TRACER is not set
-# CONFIG_FTRACE_SYSCALLS is not set
-# CONFIG_BOOT_TRACER is not set
-# CONFIG_TRACE_BRANCH_PROFILING is not set
-# CONFIG_POWER_TRACER is not set
-# CONFIG_STACK_TRACER is not set
-# CONFIG_HW_BRANCH_TRACER is not set
-# CONFIG_KMEMTRACE is not set
-# CONFIG_WORKQUEUE_TRACER is not set
CONFIG_BLK_DEV_IO_TRACE=y
-# CONFIG_FTRACE_STARTUP_TEST is not set
-# CONFIG_MMIOTRACE is not set
CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
-# CONFIG_DYNAMIC_DEBUG is not set
-# CONFIG_DMA_API_DEBUG is not set
-# CONFIG_SAMPLES is not set
-CONFIG_HAVE_ARCH_KGDB=y
-# CONFIG_KGDB is not set
-# CONFIG_STRICT_DEVMEM is not set
-CONFIG_X86_VERBOSE_BOOTUP=y
-CONFIG_EARLY_PRINTK=y
CONFIG_EARLY_PRINTK_DBGP=y
-CONFIG_DEBUG_STACKOVERFLOW=y
-CONFIG_DEBUG_STACK_USAGE=y
-# CONFIG_DEBUG_PER_CPU_MAPS is not set
-# CONFIG_X86_PTDUMP is not set
-CONFIG_DEBUG_RODATA=y
-# CONFIG_DEBUG_RODATA_TEST is not set
-CONFIG_DEBUG_NX_TEST=m
-# CONFIG_IOMMU_DEBUG is not set
-CONFIG_HAVE_MMIOTRACE_SUPPORT=y
-CONFIG_IO_DELAY_TYPE_0X80=0
-CONFIG_IO_DELAY_TYPE_0XED=1
-CONFIG_IO_DELAY_TYPE_UDELAY=2
-CONFIG_IO_DELAY_TYPE_NONE=3
-CONFIG_IO_DELAY_0X80=y
-# CONFIG_IO_DELAY_0XED is not set
-# CONFIG_IO_DELAY_UDELAY is not set
-# CONFIG_IO_DELAY_NONE is not set
-CONFIG_DEFAULT_IO_DELAY_TYPE=0
CONFIG_DEBUG_BOOT_PARAMS=y
-# CONFIG_CPA_DEBUG is not set
-CONFIG_OPTIMIZE_INLINING=y
-
-#
-# Security options
-#
-CONFIG_KEYS=y
-CONFIG_KEYS_DEBUG_PROC_KEYS=y
-CONFIG_SECURITY=y
-# CONFIG_SECURITYFS is not set
-CONFIG_SECURITY_NETWORK=y
-# CONFIG_SECURITY_NETWORK_XFRM is not set
-# CONFIG_SECURITY_PATH is not set
-CONFIG_SECURITY_FILE_CAPABILITIES=y
-# CONFIG_SECURITY_ROOTPLUG is not set
-CONFIG_SECURITY_DEFAULT_MMAP_MIN_ADDR=65536
-CONFIG_SECURITY_SELINUX=y
-CONFIG_SECURITY_SELINUX_BOOTPARAM=y
-CONFIG_SECURITY_SELINUX_BOOTPARAM_VALUE=1
-CONFIG_SECURITY_SELINUX_DISABLE=y
-CONFIG_SECURITY_SELINUX_DEVELOP=y
-CONFIG_SECURITY_SELINUX_AVC_STATS=y
-CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1
-# CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set
-# CONFIG_SECURITY_SMACK is not set
-# CONFIG_SECURITY_TOMOYO is not set
-# CONFIG_IMA is not set
-CONFIG_CRYPTO=y
-
-#
-# Crypto core or helper
-#
-# CONFIG_CRYPTO_FIPS is not set
-CONFIG_CRYPTO_ALGAPI=y
-CONFIG_CRYPTO_ALGAPI2=y
-CONFIG_CRYPTO_AEAD=y
-CONFIG_CRYPTO_AEAD2=y
-CONFIG_CRYPTO_BLKCIPHER=y
-CONFIG_CRYPTO_BLKCIPHER2=y
-CONFIG_CRYPTO_HASH=y
-CONFIG_CRYPTO_HASH2=y
-CONFIG_CRYPTO_RNG2=y
-CONFIG_CRYPTO_PCOMP=y
-CONFIG_CRYPTO_MANAGER=y
-CONFIG_CRYPTO_MANAGER2=y
-# CONFIG_CRYPTO_GF128MUL is not set
-# CONFIG_CRYPTO_NULL is not set
-CONFIG_CRYPTO_WORKQUEUE=y
-# CONFIG_CRYPTO_CRYPTD is not set
-CONFIG_CRYPTO_AUTHENC=y
-# CONFIG_CRYPTO_TEST is not set
-
-#
-# Authenticated Encryption with Associated Data
-#
-# CONFIG_CRYPTO_CCM is not set
-# CONFIG_CRYPTO_GCM is not set
-# CONFIG_CRYPTO_SEQIV is not set
-
-#
-# Block modes
-#
-CONFIG_CRYPTO_CBC=y
-# CONFIG_CRYPTO_CTR is not set
-# CONFIG_CRYPTO_CTS is not set
-CONFIG_CRYPTO_ECB=y
-# CONFIG_CRYPTO_LRW is not set
-# CONFIG_CRYPTO_PCBC is not set
-# CONFIG_CRYPTO_XTS is not set
-
-#
-# Hash modes
-#
-CONFIG_CRYPTO_HMAC=y
-# CONFIG_CRYPTO_XCBC is not set
-
-#
-# Digest
-#
-# CONFIG_CRYPTO_CRC32C is not set
-# CONFIG_CRYPTO_CRC32C_INTEL is not set
-# CONFIG_CRYPTO_MD4 is not set
-CONFIG_CRYPTO_MD5=y
-# CONFIG_CRYPTO_MICHAEL_MIC is not set
-# CONFIG_CRYPTO_RMD128 is not set
-# CONFIG_CRYPTO_RMD160 is not set
-# CONFIG_CRYPTO_RMD256 is not set
-# CONFIG_CRYPTO_RMD320 is not set
-CONFIG_CRYPTO_SHA1=y
-# CONFIG_CRYPTO_SHA256 is not set
-# CONFIG_CRYPTO_SHA512 is not set
-# CONFIG_CRYPTO_TGR192 is not set
-# CONFIG_CRYPTO_WP512 is not set
-
-#
-# Ciphers
-#
-CONFIG_CRYPTO_AES=y
-# CONFIG_CRYPTO_AES_X86_64 is not set
-# CONFIG_CRYPTO_AES_NI_INTEL is not set
-# CONFIG_CRYPTO_ANUBIS is not set
-CONFIG_CRYPTO_ARC4=y
-# CONFIG_CRYPTO_BLOWFISH is not set
-# CONFIG_CRYPTO_CAMELLIA is not set
-# CONFIG_CRYPTO_CAST5 is not set
-# CONFIG_CRYPTO_CAST6 is not set
-CONFIG_CRYPTO_DES=y
-# CONFIG_CRYPTO_FCRYPT is not set
-# CONFIG_CRYPTO_KHAZAD is not set
-# CONFIG_CRYPTO_SALSA20 is not set
-# CONFIG_CRYPTO_SALSA20_X86_64 is not set
-# CONFIG_CRYPTO_SEED is not set
-# CONFIG_CRYPTO_SERPENT is not set
-# CONFIG_CRYPTO_TEA is not set
-# CONFIG_CRYPTO_TWOFISH is not set
-# CONFIG_CRYPTO_TWOFISH_X86_64 is not set
-
-#
-# Compression
-#
-# CONFIG_CRYPTO_DEFLATE is not set
-# CONFIG_CRYPTO_ZLIB is not set
-# CONFIG_CRYPTO_LZO is not set
-
-#
-# Random Number Generation
-#
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
-CONFIG_CRYPTO_HW=y
-# CONFIG_CRYPTO_DEV_HIFN_795X is not set
-CONFIG_HAVE_KVM=y
-CONFIG_HAVE_KVM_IRQCHIP=y
-CONFIG_VIRTUALIZATION=y
-# CONFIG_KVM is not set
-# CONFIG_VIRTIO_PCI is not set
-# CONFIG_VIRTIO_BALLOON is not set
-CONFIG_BINARY_PRINTF=y
-
-#
-# Library routines
-#
-CONFIG_BITREVERSE=y
-CONFIG_GENERIC_FIND_FIRST_BIT=y
-CONFIG_GENERIC_FIND_NEXT_BIT=y
-CONFIG_GENERIC_FIND_LAST_BIT=y
-# CONFIG_CRC_CCITT is not set
-# CONFIG_CRC16 is not set
-CONFIG_CRC_T10DIF=y
-# CONFIG_CRC_ITU_T is not set
-CONFIG_CRC32=y
-# CONFIG_CRC7 is not set
-# CONFIG_LIBCRC32C is not set
-CONFIG_ZLIB_INFLATE=y
-CONFIG_DECOMPRESS_GZIP=y
-CONFIG_DECOMPRESS_BZIP2=y
-CONFIG_DECOMPRESS_LZMA=y
-CONFIG_HAS_IOMEM=y
-CONFIG_HAS_IOPORT=y
-CONFIG_HAS_DMA=y
-CONFIG_NLATTR=y
+CONFIG_DEBUG_ENTRY=y
diff --git a/arch/x86/configs/xen.config b/arch/x86/configs/xen.config
new file mode 100644
index 000000000000..98b6952ba9d2
--- /dev/null
+++ b/arch/x86/configs/xen.config
@@ -0,0 +1,24 @@
+# global x86-specific requirements
+CONFIG_64BIT=y
+
+# These let us enable some of the
+# not-so-generic options below
+CONFIG_HYPERVISOR_GUEST=y
+CONFIG_PCI=y
+CONFIG_PCI_MSI=y
+CONFIG_X86_MCE=y
+CONFIG_ACPI_PROCESSOR=y
+CONFIG_CPU_FREQ=y
+
+# x86 Xen-specific config options
+CONFIG_XEN_PVH=y
+# CONFIG_XEN_DEBUG_FS is not set
+CONFIG_XEN_MCE_LOG=y
+CONFIG_XEN_ACPI_PROCESSOR=m
+# x86-specific backend drivers
+CONFIG_XEN_PCIDEV_BACKEND=m
+# x86-specific frontend drivers
+CONFIG_XEN_PCIDEV_FRONTEND=m
+# Depends on MEMORY_HOTPLUG; arm64 doesn't enable this yet.
+# Move to the generic config if it ever does.
+CONFIG_XEN_BALLOON_MEMORY_HOTPLUG=y
diff --git a/arch/x86/crypto/.gitignore b/arch/x86/crypto/.gitignore
new file mode 100644
index 000000000000..580c839bb177
--- /dev/null
+++ b/arch/x86/crypto/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+poly1305-x86_64-cryptogams.S
diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig
new file mode 100644
index 000000000000..3fd2423d3cf8
--- /dev/null
+++ b/arch/x86/crypto/Kconfig
@@ -0,0 +1,379 @@
+# SPDX-License-Identifier: GPL-2.0
+
+menu "Accelerated Cryptographic Algorithms for CPU (x86)"
+
+config CRYPTO_AES_NI_INTEL
+ tristate "Ciphers: AES, modes: ECB, CBC, CTS, CTR, XCTR, XTS, GCM (AES-NI/VAES)"
+ select CRYPTO_AEAD
+ select CRYPTO_LIB_AES
+ select CRYPTO_LIB_GF128MUL
+ select CRYPTO_ALGAPI
+ select CRYPTO_SKCIPHER
+ help
+ Block cipher: AES cipher algorithms
+ AEAD cipher: AES with GCM
+ Length-preserving ciphers: AES with ECB, CBC, CTS, CTR, XCTR, XTS
+
+ Architecture: x86 (32-bit and 64-bit) using:
+ - AES-NI (AES new instructions)
+ - VAES (Vector AES)
+
+ Some algorithm implementations are supported only in 64-bit builds,
+ and some have additional prerequisites such as AVX2 or AVX512.
+
+config CRYPTO_BLOWFISH_X86_64
+ tristate "Ciphers: Blowfish, modes: ECB, CBC"
+ depends on 64BIT
+ select CRYPTO_SKCIPHER
+ select CRYPTO_BLOWFISH_COMMON
+ imply CRYPTO_CTR
+ help
+ Block cipher: Blowfish cipher algorithm
+ Length-preserving ciphers: Blowfish with ECB and CBC modes
+
+ Architecture: x86_64
+
+config CRYPTO_CAMELLIA_X86_64
+ tristate "Ciphers: Camellia with modes: ECB, CBC"
+ depends on 64BIT
+ select CRYPTO_SKCIPHER
+ imply CRYPTO_CTR
+ help
+ Block cipher: Camellia cipher algorithms
+ Length-preserving ciphers: Camellia with ECB and CBC modes
+
+ Architecture: x86_64
+
+config CRYPTO_CAMELLIA_AESNI_AVX_X86_64
+ tristate "Ciphers: Camellia with modes: ECB, CBC (AES-NI/AVX)"
+ depends on 64BIT
+ select CRYPTO_SKCIPHER
+ select CRYPTO_CAMELLIA_X86_64
+ imply CRYPTO_XTS
+ help
+ Length-preserving ciphers: Camellia with ECB and CBC modes
+
+ Architecture: x86_64 using:
+ - AES-NI (AES New Instructions)
+ - AVX (Advanced Vector Extensions)
+
+config CRYPTO_CAMELLIA_AESNI_AVX2_X86_64
+ tristate "Ciphers: Camellia with modes: ECB, CBC (AES-NI/AVX2)"
+ depends on 64BIT
+ select CRYPTO_CAMELLIA_AESNI_AVX_X86_64
+ help
+ Length-preserving ciphers: Camellia with ECB and CBC modes
+
+ Architecture: x86_64 using:
+ - AES-NI (AES New Instructions)
+ - AVX2 (Advanced Vector Extensions 2)
+
+config CRYPTO_CAST5_AVX_X86_64
+ tristate "Ciphers: CAST5 with modes: ECB, CBC (AVX)"
+ depends on 64BIT
+ select CRYPTO_SKCIPHER
+ select CRYPTO_CAST5
+ select CRYPTO_CAST_COMMON
+ imply CRYPTO_CTR
+ help
+ Length-preserving ciphers: CAST5 (CAST-128) cipher algorithm
+ (RFC2144) with ECB and CBC modes
+
+ Architecture: x86_64 using:
+ - AVX (Advanced Vector Extensions)
+
+ Processes 16 blocks in parallel.
+
+config CRYPTO_CAST6_AVX_X86_64
+ tristate "Ciphers: CAST6 with modes: ECB, CBC (AVX)"
+ depends on 64BIT
+ select CRYPTO_SKCIPHER
+ select CRYPTO_CAST6
+ select CRYPTO_CAST_COMMON
+ imply CRYPTO_XTS
+ imply CRYPTO_CTR
+ help
+ Length-preserving ciphers: CAST6 (CAST-256) cipher algorithm
+ (RFC2612) with ECB and CBC modes
+
+ Architecture: x86_64 using:
+ - AVX (Advanced Vector Extensions)
+
+ Processes eight blocks in parallel.
+
+config CRYPTO_DES3_EDE_X86_64
+ tristate "Ciphers: Triple DES EDE with modes: ECB, CBC"
+ depends on 64BIT
+ select CRYPTO_SKCIPHER
+ select CRYPTO_LIB_DES
+ imply CRYPTO_CTR
+ help
+ Block cipher: Triple DES EDE (FIPS 46-3) cipher algorithm
+ Length-preserving ciphers: Triple DES EDE with ECB and CBC modes
+
+ Architecture: x86_64
+
+ Processes one or three blocks in parallel.
+
+config CRYPTO_SERPENT_SSE2_X86_64
+ tristate "Ciphers: Serpent with modes: ECB, CBC (SSE2)"
+ depends on 64BIT
+ select CRYPTO_SKCIPHER
+ select CRYPTO_SERPENT
+ imply CRYPTO_CTR
+ help
+ Length-preserving ciphers: Serpent cipher algorithm
+ with ECB and CBC modes
+
+ Architecture: x86_64 using:
+ - SSE2 (Streaming SIMD Extensions 2)
+
+ Processes eight blocks in parallel.
+
+config CRYPTO_SERPENT_SSE2_586
+ tristate "Ciphers: Serpent with modes: ECB, CBC (32-bit with SSE2)"
+ depends on !64BIT
+ select CRYPTO_SKCIPHER
+ select CRYPTO_SERPENT
+ imply CRYPTO_CTR
+ help
+ Length-preserving ciphers: Serpent cipher algorithm
+ with ECB and CBC modes
+
+ Architecture: x86 (32-bit) using:
+ - SSE2 (Streaming SIMD Extensions 2)
+
+ Processes four blocks in parallel.
+
+config CRYPTO_SERPENT_AVX_X86_64
+ tristate "Ciphers: Serpent with modes: ECB, CBC (AVX)"
+ depends on 64BIT
+ select CRYPTO_SKCIPHER
+ select CRYPTO_SERPENT
+ imply CRYPTO_XTS
+ imply CRYPTO_CTR
+ help
+ Length-preserving ciphers: Serpent cipher algorithm
+ with ECB and CBC modes
+
+ Architecture: x86_64 using:
+ - AVX (Advanced Vector Extensions)
+
+ Processes eight blocks in parallel.
+
+config CRYPTO_SERPENT_AVX2_X86_64
+ tristate "Ciphers: Serpent with modes: ECB, CBC (AVX2)"
+ depends on 64BIT
+ select CRYPTO_SERPENT_AVX_X86_64
+ help
+ Length-preserving ciphers: Serpent cipher algorithm
+ with ECB and CBC modes
+
+ Architecture: x86_64 using:
+ - AVX2 (Advanced Vector Extensions 2)
+
+ Processes 16 blocks in parallel.
+
+config CRYPTO_SM4_AESNI_AVX_X86_64
+ tristate "Ciphers: SM4 with modes: ECB, CBC, CTR (AES-NI/AVX)"
+ depends on 64BIT
+ select CRYPTO_SKCIPHER
+ select CRYPTO_ALGAPI
+ select CRYPTO_SM4
+ help
+ Length-preserving ciphers: SM4 cipher algorithms
+ (OSCCA GB/T 32907-2016) with ECB, CBC, and CTR modes
+
+ Architecture: x86_64 using:
+ - AES-NI (AES New Instructions)
+ - AVX (Advanced Vector Extensions)
+
+ By applying two affine transforms, the AES S-Box can be used to
+ emulate the SM4 S-Box, letting the AES instructions accelerate SM4.
+
+ If unsure, say N.
+
+config CRYPTO_SM4_AESNI_AVX2_X86_64
+ tristate "Ciphers: SM4 with modes: ECB, CBC, CTR (AES-NI/AVX2)"
+ depends on 64BIT
+ select CRYPTO_SKCIPHER
+ select CRYPTO_ALGAPI
+ select CRYPTO_SM4
+ select CRYPTO_SM4_AESNI_AVX_X86_64
+ help
+ Length-preserving ciphers: SM4 cipher algorithms
+ (OSCCA GB/T 32907-2016) with ECB, CBC, and CTR modes
+
+ Architecture: x86_64 using:
+ - AES-NI (AES New Instructions)
+ - AVX2 (Advanced Vector Extensions 2)
+
+ By applying two affine transforms, the AES S-Box can be used to
+ emulate the SM4 S-Box, letting the AES instructions accelerate SM4.
+
+ If unsure, say N.
+
+config CRYPTO_TWOFISH_586
+ tristate "Ciphers: Twofish (32-bit)"
+ depends on !64BIT
+ select CRYPTO_ALGAPI
+ select CRYPTO_TWOFISH_COMMON
+ imply CRYPTO_CTR
+ help
+ Block cipher: Twofish cipher algorithm
+
+ Architecture: x86 (32-bit)
+
+config CRYPTO_TWOFISH_X86_64
+ tristate "Ciphers: Twofish"
+ depends on 64BIT
+ select CRYPTO_ALGAPI
+ select CRYPTO_TWOFISH_COMMON
+ imply CRYPTO_CTR
+ help
+ Block cipher: Twofish cipher algorithm
+
+ Architecture: x86_64
+
+config CRYPTO_TWOFISH_X86_64_3WAY
+ tristate "Ciphers: Twofish with modes: ECB, CBC (3-way parallel)"
+ depends on 64BIT
+ select CRYPTO_SKCIPHER
+ select CRYPTO_TWOFISH_COMMON
+ select CRYPTO_TWOFISH_X86_64
+ help
+ Length-preserving cipher: Twofish cipher algorithm
+ with ECB and CBC modes
+
+ Architecture: x86_64
+
+ Processes three blocks in parallel, better utilizing resources of
+ out-of-order CPUs.
+
+config CRYPTO_TWOFISH_AVX_X86_64
+ tristate "Ciphers: Twofish with modes: ECB, CBC (AVX)"
+ depends on 64BIT
+ select CRYPTO_SKCIPHER
+ select CRYPTO_TWOFISH_COMMON
+ select CRYPTO_TWOFISH_X86_64
+ select CRYPTO_TWOFISH_X86_64_3WAY
+ imply CRYPTO_XTS
+ help
+ Length-preserving cipher: Twofish cipher algorithm
+ with ECB and CBC modes
+
+ Architecture: x86_64 using:
+ - AVX (Advanced Vector Extensions)
+
+ Processes eight blocks in parallel.
+
+config CRYPTO_ARIA_AESNI_AVX_X86_64
+ tristate "Ciphers: ARIA with modes: ECB, CTR (AES-NI/AVX/GFNI)"
+ depends on 64BIT
+ select CRYPTO_SKCIPHER
+ select CRYPTO_ALGAPI
+ select CRYPTO_ARIA
+ help
+ Length-preserving cipher: ARIA cipher algorithms
+ (RFC 5794) with ECB and CTR modes
+
+ Architecture: x86_64 using:
+ - AES-NI (AES New Instructions)
+ - AVX (Advanced Vector Extensions)
+ - GFNI (Galois Field New Instructions)
+
+ Processes 16 blocks in parallel.
+
+config CRYPTO_ARIA_AESNI_AVX2_X86_64
+ tristate "Ciphers: ARIA with modes: ECB, CTR (AES-NI/AVX2/GFNI)"
+ depends on 64BIT
+ select CRYPTO_SKCIPHER
+ select CRYPTO_ALGAPI
+ select CRYPTO_ARIA
+ select CRYPTO_ARIA_AESNI_AVX_X86_64
+ help
+ Length-preserving cipher: ARIA cipher algorithms
+ (RFC 5794) with ECB and CTR modes
+
+ Architecture: x86_64 using:
+ - AES-NI (AES New Instructions)
+ - AVX2 (Advanced Vector Extensions 2)
+ - GFNI (Galois Field New Instructions)
+
+ Processes 32 blocks in parallel.
+
+config CRYPTO_ARIA_GFNI_AVX512_X86_64
+ tristate "Ciphers: ARIA with modes: ECB, CTR (AVX512/GFNI)"
+ depends on 64BIT
+ select CRYPTO_SKCIPHER
+ select CRYPTO_ALGAPI
+ select CRYPTO_ARIA
+ select CRYPTO_ARIA_AESNI_AVX_X86_64
+ select CRYPTO_ARIA_AESNI_AVX2_X86_64
+ help
+ Length-preserving cipher: ARIA cipher algorithms
+ (RFC 5794) with ECB and CTR modes
+
+ Architecture: x86_64 using:
+ - AVX512 (Advanced Vector Extensions 512)
+ - GFNI (Galois Field New Instructions)
+
+ Processes 64 blocks in parallel.
+
+config CRYPTO_AEGIS128_AESNI_SSE2
+ tristate "AEAD ciphers: AEGIS-128 (AES-NI/SSE4.1)"
+ depends on 64BIT
+ select CRYPTO_AEAD
+ help
+ AEGIS-128 AEAD algorithm
+
+ Architecture: x86_64 using:
+ - AES-NI (AES New Instructions)
+ - SSE4.1 (Streaming SIMD Extensions 4.1)
+
+config CRYPTO_NHPOLY1305_SSE2
+ tristate "Hash functions: NHPoly1305 (SSE2)"
+ depends on 64BIT
+ select CRYPTO_NHPOLY1305
+ help
+ NHPoly1305 hash function for Adiantum
+
+ Architecture: x86_64 using:
+ - SSE2 (Streaming SIMD Extensions 2)
+
+config CRYPTO_NHPOLY1305_AVX2
+ tristate "Hash functions: NHPoly1305 (AVX2)"
+ depends on 64BIT
+ select CRYPTO_NHPOLY1305
+ help
+ NHPoly1305 hash function for Adiantum
+
+ Architecture: x86_64 using:
+ - AVX2 (Advanced Vector Extensions 2)
+
+config CRYPTO_SM3_AVX_X86_64
+ tristate "Hash functions: SM3 (AVX)"
+ depends on 64BIT
+ select CRYPTO_HASH
+ select CRYPTO_LIB_SM3
+ help
+ SM3 secure hash function as defined by OSCCA GM/T 0004-2012 SM3
+
+ Architecture: x86_64 using:
+ - AVX (Advanced Vector Extensions)
+
+ If unsure, say N.
+
+config CRYPTO_GHASH_CLMUL_NI_INTEL
+ tristate "Hash functions: GHASH (CLMUL-NI)"
+ depends on 64BIT
+ select CRYPTO_CRYPTD
+ help
+ GCM GHASH hash function (NIST SP800-38D)
+
+ Architecture: x86_64 using:
+ - CLMUL-NI (carry-less multiplication new instructions)
+
+endmenu
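
Each entry above registers its accelerated cipher with the kernel crypto API under the same algorithm name as the generic C implementation, only at a higher priority, so existing callers pick up the fast version automatically when the CPU supports it. Below is a minimal sketch of such a caller, assuming CRYPTO_AES_NI_INTEL=y; the function name demo_cbc_aes and its single in-place buffer are illustrative, not part of this patch:

#include <crypto/skcipher.h>
#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>

/* Encrypt len bytes (a multiple of AES_BLOCK_SIZE) in place with CBC-AES. */
static int demo_cbc_aes(const u8 *key, unsigned int keylen,
			u8 *buf, unsigned int len, u8 iv[16])
{
	struct crypto_skcipher *tfm;
	struct skcipher_request *req;
	struct scatterlist sg;
	DECLARE_CRYPTO_WAIT(wait);
	int err;

	/* The crypto core resolves "cbc(aes)" to the highest-priority
	 * registered implementation, e.g. the AES-NI/VAES one here. */
	tfm = crypto_alloc_skcipher("cbc(aes)", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_skcipher_setkey(tfm, key, keylen);
	if (err)
		goto out_free_tfm;

	req = skcipher_request_alloc(tfm, GFP_KERNEL);
	if (!req) {
		err = -ENOMEM;
		goto out_free_tfm;
	}

	sg_init_one(&sg, buf, len);
	skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP,
				      crypto_req_done, &wait);
	skcipher_request_set_crypt(req, &sg, &sg, len, iv);
	err = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);

	skcipher_request_free(req);
out_free_tfm:
	crypto_free_skcipher(tfm);
	return err;
}

On a CPU without AES-NI the same request string transparently falls back to the generic "cbc(aes)" implementation; nothing in the caller changes.
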
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index cfb0010fa940..5f2fb4f148fe 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -1,26 +1,77 @@
+# SPDX-License-Identifier: GPL-2.0
#
-# Arch-specific CryptoAPI modules.
-#
-
-obj-$(CONFIG_CRYPTO_FPU) += fpu.o
+# x86 crypto algorithms
-obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
-obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
-
-obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
+twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
-obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
-obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
+twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
+obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
+twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
+obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
+twofish-avx-x86_64-y := twofish-avx-x86_64-asm_64.o twofish_avx_glue.o
-obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
+obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
+serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
+obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
+serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
+obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
+serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o serpent_avx_glue.o
+obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o
+serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
-aes-i586-y := aes-i586-asm_32.o aes_glue.o
-twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
-salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
+obj-$(CONFIG_CRYPTO_DES3_EDE_X86_64) += des3_ede-x86_64.o
+des3_ede-x86_64-y := des3_ede-asm_64.o des3_ede_glue.o
-aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
-twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
-salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
+obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
+camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
+obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += camellia-aesni-avx-x86_64.o
+camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o camellia_aesni_avx_glue.o
+obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o
+camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
+
+obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
+blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
+
+obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o
+cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o
+
+obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
+cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o
+obj-$(CONFIG_CRYPTO_AEGIS128_AESNI_SSE2) += aegis128-aesni.o
+aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
+
+obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
+aesni-intel-$(CONFIG_64BIT) += aes-ctr-avx-x86_64.o \
+ aes-gcm-aesni-x86_64.o \
+ aes-gcm-vaes-avx2.o \
+ aes-gcm-vaes-avx512.o \
+ aes-xts-avx-x86_64.o
+
+obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
+ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
+
+obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o
+nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
+obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o
+nhpoly1305-avx2-y := nh-avx2-x86_64.o nhpoly1305-avx2-glue.o
+
+obj-$(CONFIG_CRYPTO_SM3_AVX_X86_64) += sm3-avx-x86_64.o
+sm3-avx-x86_64-y := sm3-avx-asm_64.o sm3_avx_glue.o
+
+obj-$(CONFIG_CRYPTO_SM4_AESNI_AVX_X86_64) += sm4-aesni-avx-x86_64.o
+sm4-aesni-avx-x86_64-y := sm4-aesni-avx-asm_64.o sm4_aesni_avx_glue.o
+
+obj-$(CONFIG_CRYPTO_SM4_AESNI_AVX2_X86_64) += sm4-aesni-avx2-x86_64.o
+sm4-aesni-avx2-x86_64-y := sm4-aesni-avx2-asm_64.o sm4_aesni_avx2_glue.o
+
+obj-$(CONFIG_CRYPTO_ARIA_AESNI_AVX_X86_64) += aria-aesni-avx-x86_64.o
+aria-aesni-avx-x86_64-y := aria-aesni-avx-asm_64.o aria_aesni_avx_glue.o
+
+obj-$(CONFIG_CRYPTO_ARIA_AESNI_AVX2_X86_64) += aria-aesni-avx2-x86_64.o
+aria-aesni-avx2-x86_64-y := aria-aesni-avx2-asm_64.o aria_aesni_avx2_glue.o
+
+obj-$(CONFIG_CRYPTO_ARIA_GFNI_AVX512_X86_64) += aria-gfni-avx512-x86_64.o
+aria-gfni-avx512-x86_64-y := aria-gfni-avx512-asm_64.o aria_gfni_avx512_glue.o
diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S
new file mode 100644
index 000000000000..7294dc0ee7ba
--- /dev/null
+++ b/arch/x86/crypto/aegis128-aesni-asm.S
@@ -0,0 +1,602 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * AES-NI + SSE4.1 implementation of AEGIS-128
+ *
+ * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ * Copyright 2024 Google LLC
+ */
+
+#include <linux/linkage.h>
+
+#define STATE0 %xmm0
+#define STATE1 %xmm1
+#define STATE2 %xmm2
+#define STATE3 %xmm3
+#define STATE4 %xmm4
+#define KEY %xmm5
+#define MSG %xmm5
+#define T0 %xmm6
+#define T1 %xmm7
+
+.section .rodata.cst16.aegis128_const, "aM", @progbits, 32
+.align 16
+.Laegis128_const_0:
+ .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
+ .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
+.Laegis128_const_1:
+ .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
+ .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
+
+.section .rodata.cst32.zeropad_mask, "aM", @progbits, 32
+.align 32
+.Lzeropad_mask:
+ .octa 0xffffffffffffffffffffffffffffffff
+ .octa 0
+
+.text
+
+/*
+ * aegis128_update
+ * input:
+ * STATE[0-4] - input state
+ * output:
+ * STATE[0-4] - output state (shifted positions)
+ * changed:
+ * T0
+ */
+.macro aegis128_update
+ movdqa STATE4, T0
+ aesenc STATE0, STATE4
+ aesenc STATE1, STATE0
+ aesenc STATE2, STATE1
+ aesenc STATE3, STATE2
+ aesenc T0, STATE3
+.endm
+
+/*
+ * Load 1 <= LEN (%ecx) <= 15 bytes from the pointer SRC into the xmm register
+ * MSG and zeroize any remaining bytes. Clobbers %rax, %rcx, and %r8.
+ */
+.macro load_partial
+ sub $8, %ecx /* LEN - 8 */
+ jle .Lle8\@
+
+ /* Load 9 <= LEN <= 15 bytes: */
+ movq (SRC), MSG /* Load first 8 bytes */
+ mov (SRC, %rcx), %rax /* Load last 8 bytes */
+ neg %ecx
+ shl $3, %ecx
+ shr %cl, %rax /* Discard overlapping bytes */
+ pinsrq $1, %rax, MSG
+ jmp .Ldone\@
+
+.Lle8\@:
+ add $4, %ecx /* LEN - 4 */
+ jl .Llt4\@
+
+ /* Load 4 <= LEN <= 8 bytes: */
+ mov (SRC), %eax /* Load first 4 bytes */
+ mov (SRC, %rcx), %r8d /* Load last 4 bytes */
+ jmp .Lcombine\@
+
+.Llt4\@:
+ /* Load 1 <= LEN <= 3 bytes: */
+ add $2, %ecx /* LEN - 2 */
+ movzbl (SRC), %eax /* Load first byte */
+ jl .Lmovq\@
+ movzwl (SRC, %rcx), %r8d /* Load last 2 bytes */
+.Lcombine\@:
+ shl $3, %ecx
+ shl %cl, %r8
+ or %r8, %rax /* Combine the two parts */
+.Lmovq\@:
+ movq %rax, MSG
+.Ldone\@:
+.endm
+
+/*
+ * Store 1 <= LEN (%ecx) <= 15 bytes from the xmm register \msg to the pointer
+ * DST. Clobbers %rax, %rcx, and %r8.
+ */
+.macro store_partial msg
+ sub $8, %ecx /* LEN - 8 */
+ jl .Llt8\@
+
+ /* Store 8 <= LEN <= 15 bytes: */
+ pextrq $1, \msg, %rax
+ mov %ecx, %r8d
+ shl $3, %ecx
+ ror %cl, %rax
+ mov %rax, (DST, %r8) /* Store last LEN - 8 bytes */
+ movq \msg, (DST) /* Store first 8 bytes */
+ jmp .Ldone\@
+
+.Llt8\@:
+ add $4, %ecx /* LEN - 4 */
+ jl .Llt4\@
+
+ /* Store 4 <= LEN <= 7 bytes: */
+ pextrd $1, \msg, %eax
+ mov %ecx, %r8d
+ shl $3, %ecx
+ ror %cl, %eax
+ mov %eax, (DST, %r8) /* Store last LEN - 4 bytes */
+ movd \msg, (DST) /* Store first 4 bytes */
+ jmp .Ldone\@
+
+.Llt4\@:
+ /* Store 1 <= LEN <= 3 bytes: */
+ pextrb $0, \msg, 0(DST)
+ cmp $-2, %ecx /* LEN - 4 == -2, i.e. LEN == 2? */
+ jl .Ldone\@
+ pextrb $1, \msg, 1(DST)
+ je .Ldone\@
+ pextrb $2, \msg, 2(DST)
+.Ldone\@:
+.endm
+
+/*
+ * void aegis128_aesni_init(struct aegis_state *state,
+ * const struct aegis_block *key,
+ * const u8 iv[AEGIS128_NONCE_SIZE]);
+ */
+SYM_FUNC_START(aegis128_aesni_init)
+ .set STATEP, %rdi
+ .set KEYP, %rsi
+ .set IVP, %rdx
+
+ /* load IV: */
+ movdqu (IVP), T1
+
+ /* load key: */
+ movdqa (KEYP), KEY
+ pxor KEY, T1
+ movdqa T1, STATE0
+ movdqa KEY, STATE3
+ movdqa KEY, STATE4
+
+ /* load the constants: */
+ movdqa .Laegis128_const_0(%rip), STATE2
+ movdqa .Laegis128_const_1(%rip), STATE1
+ pxor STATE2, STATE3
+ pxor STATE1, STATE4
+
+ /* update 10 times with KEY / KEY xor IV: */
+ aegis128_update; pxor KEY, STATE4
+ aegis128_update; pxor T1, STATE3
+ aegis128_update; pxor KEY, STATE2
+ aegis128_update; pxor T1, STATE1
+ aegis128_update; pxor KEY, STATE0
+ aegis128_update; pxor T1, STATE4
+ aegis128_update; pxor KEY, STATE3
+ aegis128_update; pxor T1, STATE2
+ aegis128_update; pxor KEY, STATE1
+ aegis128_update; pxor T1, STATE0
+
+ /* store the state: */
+ movdqu STATE0, 0x00(STATEP)
+ movdqu STATE1, 0x10(STATEP)
+ movdqu STATE2, 0x20(STATEP)
+ movdqu STATE3, 0x30(STATEP)
+ movdqu STATE4, 0x40(STATEP)
+ RET
+SYM_FUNC_END(aegis128_aesni_init)
+
+/*
+ * void aegis128_aesni_ad(struct aegis_state *state, const u8 *data,
+ * unsigned int len);
+ *
+ * len must be a multiple of 16.
+ */
+SYM_FUNC_START(aegis128_aesni_ad)
+ .set STATEP, %rdi
+ .set SRC, %rsi
+ .set LEN, %edx
+
+ test LEN, LEN
+ jz .Lad_out
+
+ /* load the state: */
+ movdqu 0x00(STATEP), STATE0
+ movdqu 0x10(STATEP), STATE1
+ movdqu 0x20(STATEP), STATE2
+ movdqu 0x30(STATEP), STATE3
+ movdqu 0x40(STATEP), STATE4
+
+.align 8
+.Lad_loop:
+ movdqu 0x00(SRC), MSG
+ aegis128_update
+ pxor MSG, STATE4
+ sub $0x10, LEN
+ jz .Lad_out_1
+
+ movdqu 0x10(SRC), MSG
+ aegis128_update
+ pxor MSG, STATE3
+ sub $0x10, LEN
+ jz .Lad_out_2
+
+ movdqu 0x20(SRC), MSG
+ aegis128_update
+ pxor MSG, STATE2
+ sub $0x10, LEN
+ jz .Lad_out_3
+
+ movdqu 0x30(SRC), MSG
+ aegis128_update
+ pxor MSG, STATE1
+ sub $0x10, LEN
+ jz .Lad_out_4
+
+ movdqu 0x40(SRC), MSG
+ aegis128_update
+ pxor MSG, STATE0
+ sub $0x10, LEN
+ jz .Lad_out_0
+
+ add $0x50, SRC
+ jmp .Lad_loop
+
+ /* store the state: */
+.Lad_out_0:
+ movdqu STATE0, 0x00(STATEP)
+ movdqu STATE1, 0x10(STATEP)
+ movdqu STATE2, 0x20(STATEP)
+ movdqu STATE3, 0x30(STATEP)
+ movdqu STATE4, 0x40(STATEP)
+ RET
+
+.Lad_out_1:
+ movdqu STATE4, 0x00(STATEP)
+ movdqu STATE0, 0x10(STATEP)
+ movdqu STATE1, 0x20(STATEP)
+ movdqu STATE2, 0x30(STATEP)
+ movdqu STATE3, 0x40(STATEP)
+ RET
+
+.Lad_out_2:
+ movdqu STATE3, 0x00(STATEP)
+ movdqu STATE4, 0x10(STATEP)
+ movdqu STATE0, 0x20(STATEP)
+ movdqu STATE1, 0x30(STATEP)
+ movdqu STATE2, 0x40(STATEP)
+ RET
+
+.Lad_out_3:
+ movdqu STATE2, 0x00(STATEP)
+ movdqu STATE3, 0x10(STATEP)
+ movdqu STATE4, 0x20(STATEP)
+ movdqu STATE0, 0x30(STATEP)
+ movdqu STATE1, 0x40(STATEP)
+ RET
+
+.Lad_out_4:
+ movdqu STATE1, 0x00(STATEP)
+ movdqu STATE2, 0x10(STATEP)
+ movdqu STATE3, 0x20(STATEP)
+ movdqu STATE4, 0x30(STATEP)
+ movdqu STATE0, 0x40(STATEP)
+.Lad_out:
+ RET
+SYM_FUNC_END(aegis128_aesni_ad)
+
+.macro encrypt_block s0 s1 s2 s3 s4 i
+ movdqu (\i * 0x10)(SRC), MSG
+ movdqa MSG, T0
+ pxor \s1, T0
+ pxor \s4, T0
+ movdqa \s2, T1
+ pand \s3, T1
+ pxor T1, T0
+ movdqu T0, (\i * 0x10)(DST)
+
+ aegis128_update
+ pxor MSG, \s4
+
+ sub $0x10, LEN
+ jz .Lenc_out_\i
+.endm
+
+/*
+ * void aegis128_aesni_enc(struct aegis_state *state, const u8 *src, u8 *dst,
+ * unsigned int len);
+ *
+ * len must be nonzero and a multiple of 16.
+ */
+SYM_FUNC_START(aegis128_aesni_enc)
+ .set STATEP, %rdi
+ .set SRC, %rsi
+ .set DST, %rdx
+ .set LEN, %ecx
+
+ /* load the state: */
+ movdqu 0x00(STATEP), STATE0
+ movdqu 0x10(STATEP), STATE1
+ movdqu 0x20(STATEP), STATE2
+ movdqu 0x30(STATEP), STATE3
+ movdqu 0x40(STATEP), STATE4
+
+.align 8
+.Lenc_loop:
+ encrypt_block STATE0 STATE1 STATE2 STATE3 STATE4 0
+ encrypt_block STATE4 STATE0 STATE1 STATE2 STATE3 1
+ encrypt_block STATE3 STATE4 STATE0 STATE1 STATE2 2
+ encrypt_block STATE2 STATE3 STATE4 STATE0 STATE1 3
+ encrypt_block STATE1 STATE2 STATE3 STATE4 STATE0 4
+
+ add $0x50, SRC
+ add $0x50, DST
+ jmp .Lenc_loop
+
+ /* store the state: */
+.Lenc_out_0:
+ movdqu STATE4, 0x00(STATEP)
+ movdqu STATE0, 0x10(STATEP)
+ movdqu STATE1, 0x20(STATEP)
+ movdqu STATE2, 0x30(STATEP)
+ movdqu STATE3, 0x40(STATEP)
+ RET
+
+.Lenc_out_1:
+ movdqu STATE3, 0x00(STATEP)
+ movdqu STATE4, 0x10(STATEP)
+ movdqu STATE0, 0x20(STATEP)
+ movdqu STATE1, 0x30(STATEP)
+ movdqu STATE2, 0x40(STATEP)
+ RET
+
+.Lenc_out_2:
+ movdqu STATE2, 0x00(STATEP)
+ movdqu STATE3, 0x10(STATEP)
+ movdqu STATE4, 0x20(STATEP)
+ movdqu STATE0, 0x30(STATEP)
+ movdqu STATE1, 0x40(STATEP)
+ RET
+
+.Lenc_out_3:
+ movdqu STATE1, 0x00(STATEP)
+ movdqu STATE2, 0x10(STATEP)
+ movdqu STATE3, 0x20(STATEP)
+ movdqu STATE4, 0x30(STATEP)
+ movdqu STATE0, 0x40(STATEP)
+ RET
+
+.Lenc_out_4:
+ movdqu STATE0, 0x00(STATEP)
+ movdqu STATE1, 0x10(STATEP)
+ movdqu STATE2, 0x20(STATEP)
+ movdqu STATE3, 0x30(STATEP)
+ movdqu STATE4, 0x40(STATEP)
+.Lenc_out:
+ RET
+SYM_FUNC_END(aegis128_aesni_enc)
+
+/*
+ * void aegis128_aesni_enc_tail(struct aegis_state *state, const u8 *src,
+ * u8 *dst, unsigned int len);
+ */
+SYM_FUNC_START(aegis128_aesni_enc_tail)
+ .set STATEP, %rdi
+ .set SRC, %rsi
+ .set DST, %rdx
+ .set LEN, %ecx /* {load,store}_partial rely on this being %ecx */
+
+ /* load the state: */
+ movdqu 0x00(STATEP), STATE0
+ movdqu 0x10(STATEP), STATE1
+ movdqu 0x20(STATEP), STATE2
+ movdqu 0x30(STATEP), STATE3
+ movdqu 0x40(STATEP), STATE4
+
+ /* encrypt message: */
+ mov LEN, %r9d
+ load_partial
+
+ movdqa MSG, T0
+ pxor STATE1, T0
+ pxor STATE4, T0
+ movdqa STATE2, T1
+ pand STATE3, T1
+ pxor T1, T0
+
+ mov %r9d, LEN
+ store_partial T0
+
+ aegis128_update
+ pxor MSG, STATE4
+
+ /* store the state: */
+ movdqu STATE4, 0x00(STATEP)
+ movdqu STATE0, 0x10(STATEP)
+ movdqu STATE1, 0x20(STATEP)
+ movdqu STATE2, 0x30(STATEP)
+ movdqu STATE3, 0x40(STATEP)
+ RET
+SYM_FUNC_END(aegis128_aesni_enc_tail)
+
+.macro decrypt_block s0 s1 s2 s3 s4 i
+ movdqu (\i * 0x10)(SRC), MSG
+ pxor \s1, MSG
+ pxor \s4, MSG
+ movdqa \s2, T1
+ pand \s3, T1
+ pxor T1, MSG
+ movdqu MSG, (\i * 0x10)(DST)
+
+ aegis128_update
+ pxor MSG, \s4
+
+ sub $0x10, LEN
+ jz .Ldec_out_\i
+.endm
+
+/*
+ * void aegis128_aesni_dec(struct aegis_state *state, const u8 *src, u8 *dst,
+ * unsigned int len);
+ *
+ * len must be nonzero and a multiple of 16.
+ */
+SYM_FUNC_START(aegis128_aesni_dec)
+ .set STATEP, %rdi
+ .set SRC, %rsi
+ .set DST, %rdx
+ .set LEN, %ecx
+
+ /* load the state: */
+ movdqu 0x00(STATEP), STATE0
+ movdqu 0x10(STATEP), STATE1
+ movdqu 0x20(STATEP), STATE2
+ movdqu 0x30(STATEP), STATE3
+ movdqu 0x40(STATEP), STATE4
+
+.align 8
+.Ldec_loop:
+ decrypt_block STATE0 STATE1 STATE2 STATE3 STATE4 0
+ decrypt_block STATE4 STATE0 STATE1 STATE2 STATE3 1
+ decrypt_block STATE3 STATE4 STATE0 STATE1 STATE2 2
+ decrypt_block STATE2 STATE3 STATE4 STATE0 STATE1 3
+ decrypt_block STATE1 STATE2 STATE3 STATE4 STATE0 4
+
+ add $0x50, SRC
+ add $0x50, DST
+ jmp .Ldec_loop
+
+ /* store the state: */
+.Ldec_out_0:
+ movdqu STATE4, 0x00(STATEP)
+ movdqu STATE0, 0x10(STATEP)
+ movdqu STATE1, 0x20(STATEP)
+ movdqu STATE2, 0x30(STATEP)
+ movdqu STATE3, 0x40(STATEP)
+ RET
+
+.Ldec_out_1:
+ movdqu STATE3, 0x00(STATEP)
+ movdqu STATE4, 0x10(STATEP)
+ movdqu STATE0, 0x20(STATEP)
+ movdqu STATE1, 0x30(STATEP)
+ movdqu STATE2, 0x40(STATEP)
+ RET
+
+.Ldec_out_2:
+ movdqu STATE2, 0x00(STATEP)
+ movdqu STATE3, 0x10(STATEP)
+ movdqu STATE4, 0x20(STATEP)
+ movdqu STATE0, 0x30(STATEP)
+ movdqu STATE1, 0x40(STATEP)
+ RET
+
+.Ldec_out_3:
+ movdqu STATE1, 0x00(STATEP)
+ movdqu STATE2, 0x10(STATEP)
+ movdqu STATE3, 0x20(STATEP)
+ movdqu STATE4, 0x30(STATEP)
+ movdqu STATE0, 0x40(STATEP)
+ RET
+
+.Ldec_out_4:
+ movdqu STATE0, 0x00(STATEP)
+ movdqu STATE1, 0x10(STATEP)
+ movdqu STATE2, 0x20(STATEP)
+ movdqu STATE3, 0x30(STATEP)
+ movdqu STATE4, 0x40(STATEP)
+.Ldec_out:
+ RET
+SYM_FUNC_END(aegis128_aesni_dec)
+
+/*
+ * void aegis128_aesni_dec_tail(struct aegis_state *state, const u8 *src,
+ * u8 *dst, unsigned int len);
+ */
+SYM_FUNC_START(aegis128_aesni_dec_tail)
+ .set STATEP, %rdi
+ .set SRC, %rsi
+ .set DST, %rdx
+ .set LEN, %ecx /* {load,store}_partial rely on this being %ecx */
+
+ /* load the state: */
+ movdqu 0x00(STATEP), STATE0
+ movdqu 0x10(STATEP), STATE1
+ movdqu 0x20(STATEP), STATE2
+ movdqu 0x30(STATEP), STATE3
+ movdqu 0x40(STATEP), STATE4
+
+ /* decrypt message: */
+ mov LEN, %r9d
+ load_partial
+
+ pxor STATE1, MSG
+ pxor STATE4, MSG
+ movdqa STATE2, T1
+ pand STATE3, T1
+ pxor T1, MSG
+
+ mov %r9d, LEN
+ store_partial MSG
+
+ /* mask with byte count: */
+ lea .Lzeropad_mask+16(%rip), %rax
+ sub %r9, %rax
+ movdqu (%rax), T0
+ pand T0, MSG
+
+ aegis128_update
+ pxor MSG, STATE4
+
+ /* store the state: */
+ movdqu STATE4, 0x00(STATEP)
+ movdqu STATE0, 0x10(STATEP)
+ movdqu STATE1, 0x20(STATEP)
+ movdqu STATE2, 0x30(STATEP)
+ movdqu STATE3, 0x40(STATEP)
+ RET
+SYM_FUNC_END(aegis128_aesni_dec_tail)
+
+/*
+ * void aegis128_aesni_final(struct aegis_state *state,
+ * struct aegis_block *tag_xor,
+ * unsigned int assoclen, unsigned int cryptlen);
+ */
+SYM_FUNC_START(aegis128_aesni_final)
+ .set STATEP, %rdi
+ .set TAG_XOR, %rsi
+ .set ASSOCLEN, %edx
+ .set CRYPTLEN, %ecx
+
+ /* load the state: */
+ movdqu 0x00(STATEP), STATE0
+ movdqu 0x10(STATEP), STATE1
+ movdqu 0x20(STATEP), STATE2
+ movdqu 0x30(STATEP), STATE3
+ movdqu 0x40(STATEP), STATE4
+
+ /* prepare length block: */
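+	/* (assoclen ends up in the low 64-bit lane, cryptlen in the high one) */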
+ movd ASSOCLEN, MSG
+ pinsrd $2, CRYPTLEN, MSG
+ psllq $3, MSG /* multiply by 8 (to get bit count) */
+
+ pxor STATE3, MSG
+
+ /* update state: */
+ aegis128_update; pxor MSG, STATE4
+ aegis128_update; pxor MSG, STATE3
+ aegis128_update; pxor MSG, STATE2
+ aegis128_update; pxor MSG, STATE1
+ aegis128_update; pxor MSG, STATE0
+ aegis128_update; pxor MSG, STATE4
+ aegis128_update; pxor MSG, STATE3
+
+ /* xor tag: */
+ movdqu (TAG_XOR), MSG
+
+ pxor STATE0, MSG
+ pxor STATE1, MSG
+ pxor STATE2, MSG
+ pxor STATE3, MSG
+ pxor STATE4, MSG
+
+ movdqu MSG, (TAG_XOR)
+ RET
+SYM_FUNC_END(aegis128_aesni_final)
diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c
new file mode 100644
index 000000000000..f1adfba1a76e
--- /dev/null
+++ b/arch/x86/crypto/aegis128-aesni-glue.c
@@ -0,0 +1,287 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * The AEGIS-128 Authenticated-Encryption Algorithm
+ * Glue for AES-NI + SSE4.1 implementation
+ *
+ * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ */
+
+#include <crypto/internal/aead.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/scatterwalk.h>
+#include <linux/module.h>
+#include <asm/fpu/api.h>
+#include <asm/cpu_device_id.h>
+
+#define AEGIS128_BLOCK_ALIGN 16
+#define AEGIS128_BLOCK_SIZE 16
+#define AEGIS128_NONCE_SIZE 16
+#define AEGIS128_STATE_BLOCKS 5
+#define AEGIS128_KEY_SIZE 16
+#define AEGIS128_MIN_AUTH_SIZE 8
+#define AEGIS128_MAX_AUTH_SIZE 16
+
+struct aegis_block {
+ u8 bytes[AEGIS128_BLOCK_SIZE] __aligned(AEGIS128_BLOCK_ALIGN);
+};
+
+struct aegis_state {
+ struct aegis_block blocks[AEGIS128_STATE_BLOCKS];
+};
+
+struct aegis_ctx {
+ struct aegis_block key;
+};
+
+asmlinkage void aegis128_aesni_init(struct aegis_state *state,
+ const struct aegis_block *key,
+ const u8 iv[AEGIS128_NONCE_SIZE]);
+
+asmlinkage void aegis128_aesni_ad(struct aegis_state *state, const u8 *data,
+ unsigned int len);
+
+asmlinkage void aegis128_aesni_enc(struct aegis_state *state, const u8 *src,
+ u8 *dst, unsigned int len);
+
+asmlinkage void aegis128_aesni_dec(struct aegis_state *state, const u8 *src,
+ u8 *dst, unsigned int len);
+
+asmlinkage void aegis128_aesni_enc_tail(struct aegis_state *state,
+ const u8 *src, u8 *dst,
+ unsigned int len);
+
+asmlinkage void aegis128_aesni_dec_tail(struct aegis_state *state,
+ const u8 *src, u8 *dst,
+ unsigned int len);
+
+asmlinkage void aegis128_aesni_final(struct aegis_state *state,
+ struct aegis_block *tag_xor,
+ unsigned int assoclen,
+ unsigned int cryptlen);
+
+static void crypto_aegis128_aesni_process_ad(
+ struct aegis_state *state, struct scatterlist *sg_src,
+ unsigned int assoclen)
+{
+ struct scatter_walk walk;
+ struct aegis_block buf;
+ unsigned int pos = 0;
+
+ scatterwalk_start(&walk, sg_src);
+ while (assoclen != 0) {
+ unsigned int size = scatterwalk_next(&walk, assoclen);
+ const u8 *src = walk.addr;
+ unsigned int left = size;
+
+ if (pos + size >= AEGIS128_BLOCK_SIZE) {
+ if (pos > 0) {
+ unsigned int fill = AEGIS128_BLOCK_SIZE - pos;
+ memcpy(buf.bytes + pos, src, fill);
+ aegis128_aesni_ad(state, buf.bytes,
+ AEGIS128_BLOCK_SIZE);
+ pos = 0;
+ left -= fill;
+ src += fill;
+ }
+
+ aegis128_aesni_ad(state, src,
+ left & ~(AEGIS128_BLOCK_SIZE - 1));
+ src += left & ~(AEGIS128_BLOCK_SIZE - 1);
+ left &= AEGIS128_BLOCK_SIZE - 1;
+ }
+
+ memcpy(buf.bytes + pos, src, left);
+ pos += left;
+ assoclen -= size;
+
+ scatterwalk_done_src(&walk, size);
+ }
+
+ if (pos > 0) {
+ memset(buf.bytes + pos, 0, AEGIS128_BLOCK_SIZE - pos);
+ aegis128_aesni_ad(state, buf.bytes, AEGIS128_BLOCK_SIZE);
+ }
+}
+
+static __always_inline int
+crypto_aegis128_aesni_process_crypt(struct aegis_state *state,
+ struct skcipher_walk *walk, bool enc)
+{
+ int err = 0;
+
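+	/*
+	 * kernel_fpu_begin() leaves preemption disabled, so the FPU section is
+	 * dropped around each skcipher_walk_done() call, which may sleep since
+	 * the walk was started with atomic=false, and re-entered afterwards.
+	 */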
+ while (walk->nbytes >= AEGIS128_BLOCK_SIZE) {
+ if (enc)
+ aegis128_aesni_enc(state, walk->src.virt.addr,
+ walk->dst.virt.addr,
+ round_down(walk->nbytes,
+ AEGIS128_BLOCK_SIZE));
+ else
+ aegis128_aesni_dec(state, walk->src.virt.addr,
+ walk->dst.virt.addr,
+ round_down(walk->nbytes,
+ AEGIS128_BLOCK_SIZE));
+ kernel_fpu_end();
+ err = skcipher_walk_done(walk,
+ walk->nbytes % AEGIS128_BLOCK_SIZE);
+ kernel_fpu_begin();
+ }
+
+ if (walk->nbytes) {
+ if (enc)
+ aegis128_aesni_enc_tail(state, walk->src.virt.addr,
+ walk->dst.virt.addr,
+ walk->nbytes);
+ else
+ aegis128_aesni_dec_tail(state, walk->src.virt.addr,
+ walk->dst.virt.addr,
+ walk->nbytes);
+ kernel_fpu_end();
+ err = skcipher_walk_done(walk, 0);
+ kernel_fpu_begin();
+ }
+ return err;
+}
+
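+/*
+ * The crypto API only guarantees minimal alignment of the tfm context, so
+ * cra_ctxsize below reserves __alignof__(struct aegis_ctx) extra bytes and the
+ * context pointer is aligned up by hand here.
+ */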
+static struct aegis_ctx *crypto_aegis128_aesni_ctx(struct crypto_aead *aead)
+{
+ u8 *ctx = crypto_aead_ctx(aead);
+ ctx = PTR_ALIGN(ctx, __alignof__(struct aegis_ctx));
+ return (void *)ctx;
+}
+
+static int crypto_aegis128_aesni_setkey(struct crypto_aead *aead, const u8 *key,
+ unsigned int keylen)
+{
+ struct aegis_ctx *ctx = crypto_aegis128_aesni_ctx(aead);
+
+ if (keylen != AEGIS128_KEY_SIZE)
+ return -EINVAL;
+
+ memcpy(ctx->key.bytes, key, AEGIS128_KEY_SIZE);
+
+ return 0;
+}
+
+static int crypto_aegis128_aesni_setauthsize(struct crypto_aead *tfm,
+ unsigned int authsize)
+{
+ if (authsize > AEGIS128_MAX_AUTH_SIZE)
+ return -EINVAL;
+ if (authsize < AEGIS128_MIN_AUTH_SIZE)
+ return -EINVAL;
+ return 0;
+}
+
+static __always_inline int
+crypto_aegis128_aesni_crypt(struct aead_request *req,
+ struct aegis_block *tag_xor,
+ unsigned int cryptlen, bool enc)
+{
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct aegis_ctx *ctx = crypto_aegis128_aesni_ctx(tfm);
+ struct skcipher_walk walk;
+ struct aegis_state state;
+ int err;
+
+ if (enc)
+ err = skcipher_walk_aead_encrypt(&walk, req, false);
+ else
+ err = skcipher_walk_aead_decrypt(&walk, req, false);
+ if (err)
+ return err;
+
+ kernel_fpu_begin();
+
+ aegis128_aesni_init(&state, &ctx->key, req->iv);
+ crypto_aegis128_aesni_process_ad(&state, req->src, req->assoclen);
+ err = crypto_aegis128_aesni_process_crypt(&state, &walk, enc);
+ if (err == 0)
+ aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen);
+ kernel_fpu_end();
+ return err;
+}
+
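+/*
+ * aegis128_aesni_final() XORs the computed tag into *tag_xor.  Encryption
+ * passes an all-zeroes block so the buffer ends up holding the tag itself;
+ * decryption passes the received tag so that a correct tag leaves all zeroes
+ * behind, which is what crypto_memneq() checks for below.
+ */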
+static int crypto_aegis128_aesni_encrypt(struct aead_request *req)
+{
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct aegis_block tag = {};
+ unsigned int authsize = crypto_aead_authsize(tfm);
+ unsigned int cryptlen = req->cryptlen;
+ int err;
+
+ err = crypto_aegis128_aesni_crypt(req, &tag, cryptlen, true);
+ if (err)
+ return err;
+
+ scatterwalk_map_and_copy(tag.bytes, req->dst,
+ req->assoclen + cryptlen, authsize, 1);
+ return 0;
+}
+
+static int crypto_aegis128_aesni_decrypt(struct aead_request *req)
+{
+ static const struct aegis_block zeros = {};
+
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct aegis_block tag;
+ unsigned int authsize = crypto_aead_authsize(tfm);
+ unsigned int cryptlen = req->cryptlen - authsize;
+ int err;
+
+ scatterwalk_map_and_copy(tag.bytes, req->src,
+ req->assoclen + cryptlen, authsize, 0);
+
+ err = crypto_aegis128_aesni_crypt(req, &tag, cryptlen, false);
+ if (err)
+ return err;
+
+ return crypto_memneq(tag.bytes, zeros.bytes, authsize) ? -EBADMSG : 0;
+}
+
+static struct aead_alg crypto_aegis128_aesni_alg = {
+ .setkey = crypto_aegis128_aesni_setkey,
+ .setauthsize = crypto_aegis128_aesni_setauthsize,
+ .encrypt = crypto_aegis128_aesni_encrypt,
+ .decrypt = crypto_aegis128_aesni_decrypt,
+
+ .ivsize = AEGIS128_NONCE_SIZE,
+ .maxauthsize = AEGIS128_MAX_AUTH_SIZE,
+ .chunksize = AEGIS128_BLOCK_SIZE,
+
+ .base = {
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct aegis_ctx) +
+ __alignof__(struct aegis_ctx),
+ .cra_priority = 400,
+
+ .cra_name = "aegis128",
+ .cra_driver_name = "aegis128-aesni",
+
+ .cra_module = THIS_MODULE,
+ }
+};
+
+static int __init crypto_aegis128_aesni_module_init(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_XMM4_1) ||
+ !boot_cpu_has(X86_FEATURE_AES) ||
+ !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
+ return -ENODEV;
+
+ return crypto_register_aead(&crypto_aegis128_aesni_alg);
+}
+
+static void __exit crypto_aegis128_aesni_module_exit(void)
+{
+ crypto_unregister_aead(&crypto_aegis128_aesni_alg);
+}
+
+module_init(crypto_aegis128_aesni_module_init);
+module_exit(crypto_aegis128_aesni_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
+MODULE_DESCRIPTION("AEGIS-128 AEAD algorithm -- AESNI+SSE4.1 implementation");
+MODULE_ALIAS_CRYPTO("aegis128");
+MODULE_ALIAS_CRYPTO("aegis128-aesni");
diff --git a/arch/x86/crypto/aes-ctr-avx-x86_64.S b/arch/x86/crypto/aes-ctr-avx-x86_64.S
new file mode 100644
index 000000000000..2745918f68ee
--- /dev/null
+++ b/arch/x86/crypto/aes-ctr-avx-x86_64.S
@@ -0,0 +1,571 @@
+/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
+//
+// Copyright 2025 Google LLC
+//
+// Author: Eric Biggers <ebiggers@google.com>
+//
+// This file is dual-licensed, meaning that you can use it under your choice of
+// either of the following two licenses:
+//
+// Licensed under the Apache License 2.0 (the "License"). You may obtain a copy
+// of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// or
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+//
+// This file contains x86_64 assembly implementations of AES-CTR and AES-XCTR
+// using the following sets of CPU features:
+// - AES-NI && AVX
+// - VAES && AVX2
+// - VAES && AVX512BW && AVX512VL && BMI2
+//
+// See the function definitions at the bottom of the file for more information.
+
+#include <linux/linkage.h>
+#include <linux/cfi_types.h>
+
+.section .rodata
+.p2align 4
+
+.Lbswap_mask:
+ .octa 0x000102030405060708090a0b0c0d0e0f
+
+.Lctr_pattern:
+ .quad 0, 0
+.Lone:
+ .quad 1, 0
+.Ltwo:
+ .quad 2, 0
+ .quad 3, 0
+
+.Lfour:
+ .quad 4, 0
+
+.text
+
+// Move a vector between memory and a register.
+.macro _vmovdqu src, dst
+.if VL < 64
+ vmovdqu \src, \dst
+.else
+ vmovdqu8 \src, \dst
+.endif
+.endm
+
+// Move a vector between registers.
+.macro _vmovdqa src, dst
+.if VL < 64
+ vmovdqa \src, \dst
+.else
+ vmovdqa64 \src, \dst
+.endif
+.endm
+
+// Broadcast a 128-bit value from memory to all 128-bit lanes of a vector
+// register.
+.macro _vbroadcast128 src, dst
+.if VL == 16
+ vmovdqu \src, \dst
+.elseif VL == 32
+ vbroadcasti128 \src, \dst
+.else
+ vbroadcasti32x4 \src, \dst
+.endif
+.endm
+
+// XOR two vectors together.
+.macro _vpxor src1, src2, dst
+.if VL < 64
+ vpxor \src1, \src2, \dst
+.else
+ vpxord \src1, \src2, \dst
+.endif
+.endm
+
+// Load 1 <= %ecx <= 15 bytes from the pointer \src into the xmm register \dst
+// and zeroize any remaining bytes. Clobbers %rax, %rcx, and \tmp{64,32}.
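+// For 9 <= LEN <= 15 this uses two overlapping 8-byte loads: e.g. with LEN=12,
+// the second load covers bytes 4-11, and the shift below discards the four
+// overlapping bytes (4-7) before the remainder is inserted into the upper half
+// of \dst.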
+.macro _load_partial_block src, dst, tmp64, tmp32
+ sub $8, %ecx // LEN - 8
+ jle .Lle8\@
+
+ // Load 9 <= LEN <= 15 bytes.
+ vmovq (\src), \dst // Load first 8 bytes
+ mov (\src, %rcx), %rax // Load last 8 bytes
+ neg %ecx
+ shl $3, %ecx
+ shr %cl, %rax // Discard overlapping bytes
+ vpinsrq $1, %rax, \dst, \dst
+ jmp .Ldone\@
+
+.Lle8\@:
+ add $4, %ecx // LEN - 4
+ jl .Llt4\@
+
+ // Load 4 <= LEN <= 8 bytes.
+ mov (\src), %eax // Load first 4 bytes
+ mov (\src, %rcx), \tmp32 // Load last 4 bytes
+ jmp .Lcombine\@
+
+.Llt4\@:
+ // Load 1 <= LEN <= 3 bytes.
+ add $2, %ecx // LEN - 2
+ movzbl (\src), %eax // Load first byte
+ jl .Lmovq\@
+ movzwl (\src, %rcx), \tmp32 // Load last 2 bytes
+.Lcombine\@:
+ shl $3, %ecx
+ shl %cl, \tmp64
+ or \tmp64, %rax // Combine the two parts
+.Lmovq\@:
+ vmovq %rax, \dst
+.Ldone\@:
+.endm
+
+// Store 1 <= %ecx <= 15 bytes from the xmm register \src to the pointer \dst.
+// Clobbers %rax, %rcx, and \tmp{64,32}.
+.macro _store_partial_block src, dst, tmp64, tmp32
+ sub $8, %ecx // LEN - 8
+ jl .Llt8\@
+
+ // Store 8 <= LEN <= 15 bytes.
+ vpextrq $1, \src, %rax
+ mov %ecx, \tmp32
+ shl $3, %ecx
+ ror %cl, %rax
+ mov %rax, (\dst, \tmp64) // Store last LEN - 8 bytes
+ vmovq \src, (\dst) // Store first 8 bytes
+ jmp .Ldone\@
+
+.Llt8\@:
+ add $4, %ecx // LEN - 4
+ jl .Llt4\@
+
+ // Store 4 <= LEN <= 7 bytes.
+ vpextrd $1, \src, %eax
+ mov %ecx, \tmp32
+ shl $3, %ecx
+ ror %cl, %eax
+ mov %eax, (\dst, \tmp64) // Store last LEN - 4 bytes
+ vmovd \src, (\dst) // Store first 4 bytes
+ jmp .Ldone\@
+
+.Llt4\@:
+ // Store 1 <= LEN <= 3 bytes.
+ vpextrb $0, \src, 0(\dst)
+ cmp $-2, %ecx // LEN - 4 == -2, i.e. LEN == 2?
+ jl .Ldone\@
+ vpextrb $1, \src, 1(\dst)
+ je .Ldone\@
+ vpextrb $2, \src, 2(\dst)
+.Ldone\@:
+.endm
+
+// Prepare the next two vectors of AES inputs in AESDATA\i0 and AESDATA\i1, and
+// XOR each with the zero-th round key. Also update LE_CTR if !\final.
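+// (With AVX512, vpternlogd with immediate 0x96 computes the three-way XOR of
+// its operands into the destination, doing both pxor's in one instruction.)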
+.macro _prepare_2_ctr_vecs is_xctr, i0, i1, final=0
+.if \is_xctr
+ .if USE_AVX512
+ vmovdqa64 LE_CTR, AESDATA\i0
+ vpternlogd $0x96, XCTR_IV, RNDKEY0, AESDATA\i0
+ .else
+ vpxor XCTR_IV, LE_CTR, AESDATA\i0
+ vpxor RNDKEY0, AESDATA\i0, AESDATA\i0
+ .endif
+ vpaddq LE_CTR_INC1, LE_CTR, AESDATA\i1
+
+ .if USE_AVX512
+ vpternlogd $0x96, XCTR_IV, RNDKEY0, AESDATA\i1
+ .else
+ vpxor XCTR_IV, AESDATA\i1, AESDATA\i1
+ vpxor RNDKEY0, AESDATA\i1, AESDATA\i1
+ .endif
+.else
+ vpshufb BSWAP_MASK, LE_CTR, AESDATA\i0
+ _vpxor RNDKEY0, AESDATA\i0, AESDATA\i0
+ vpaddq LE_CTR_INC1, LE_CTR, AESDATA\i1
+ vpshufb BSWAP_MASK, AESDATA\i1, AESDATA\i1
+ _vpxor RNDKEY0, AESDATA\i1, AESDATA\i1
+.endif
+.if !\final
+ vpaddq LE_CTR_INC2, LE_CTR, LE_CTR
+.endif
+.endm
+
+// Do all AES rounds on the data in the given AESDATA vectors, excluding the
+// zero-th and last rounds.
+.macro _aesenc_loop vecs:vararg
+ mov KEY, %rax
+1:
+ _vbroadcast128 (%rax), RNDKEY
+.irp i, \vecs
+ vaesenc RNDKEY, AESDATA\i, AESDATA\i
+.endr
+ add $16, %rax
+ cmp %rax, RNDKEYLAST_PTR
+ jne 1b
+.endm
+
+// Finalize the keystream blocks in the given AESDATA vectors by doing the last
+// AES round, then XOR those keystream blocks with the corresponding data.
+// Reduce latency by doing the XOR before the vaesenclast, utilizing the
+// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).
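+// (This property holds because the last AES round ends with AddRoundKey, i.e.
+// the round key is simply XOR'ed in last.)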
+.macro _aesenclast_and_xor vecs:vararg
+.irp i, \vecs
+ _vpxor \i*VL(SRC), RNDKEYLAST, RNDKEY
+ vaesenclast RNDKEY, AESDATA\i, AESDATA\i
+.endr
+.irp i, \vecs
+ _vmovdqu AESDATA\i, \i*VL(DST)
+.endr
+.endm
+
+// XOR the keystream blocks in the specified AESDATA vectors with the
+// corresponding data.
+.macro _xor_data vecs:vararg
+.irp i, \vecs
+ _vpxor \i*VL(SRC), AESDATA\i, AESDATA\i
+.endr
+.irp i, \vecs
+ _vmovdqu AESDATA\i, \i*VL(DST)
+.endr
+.endm
+
+.macro _aes_ctr_crypt is_xctr
+
+ // Define register aliases V0-V15 that map to the xmm, ymm, or zmm
+ // registers according to the selected Vector Length (VL).
+.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+ .if VL == 16
+ .set V\i, %xmm\i
+ .elseif VL == 32
+ .set V\i, %ymm\i
+ .elseif VL == 64
+ .set V\i, %zmm\i
+ .else
+ .error "Unsupported Vector Length (VL)"
+ .endif
+.endr
+
+ // Function arguments
+ .set KEY, %rdi // Initially points to the start of the
+ // crypto_aes_ctx, then is advanced to
+ // point to the index 1 round key
+ .set KEY32, %edi // Available as temp register after all
+ // keystream blocks have been generated
+ .set SRC, %rsi // Pointer to next source data
+ .set DST, %rdx // Pointer to next destination data
+ .set LEN, %ecx // Remaining length in bytes.
+ // Note: _load_partial_block relies on
+ // this being in %ecx.
+ .set LEN64, %rcx // Zero-extend LEN before using!
+ .set LEN8, %cl
+.if \is_xctr
+ .set XCTR_IV_PTR, %r8 // const u8 iv[AES_BLOCK_SIZE];
+ .set XCTR_CTR, %r9 // u64 ctr;
+.else
+ .set LE_CTR_PTR, %r8 // const u64 le_ctr[2];
+.endif
+
+ // Additional local variables
+ .set RNDKEYLAST_PTR, %r10
+ .set AESDATA0, V0
+ .set AESDATA0_XMM, %xmm0
+ .set AESDATA1, V1
+ .set AESDATA1_XMM, %xmm1
+ .set AESDATA2, V2
+ .set AESDATA3, V3
+ .set AESDATA4, V4
+ .set AESDATA5, V5
+ .set AESDATA6, V6
+ .set AESDATA7, V7
+.if \is_xctr
+ .set XCTR_IV, V8
+.else
+ .set BSWAP_MASK, V8
+.endif
+ .set LE_CTR, V9
+ .set LE_CTR_XMM, %xmm9
+ .set LE_CTR_INC1, V10
+ .set LE_CTR_INC2, V11
+ .set RNDKEY0, V12
+ .set RNDKEYLAST, V13
+ .set RNDKEY, V14
+
+ // Create the first vector of counters.
+.if \is_xctr
+ .if VL == 16
+ vmovq XCTR_CTR, LE_CTR
+ .elseif VL == 32
+ vmovq XCTR_CTR, LE_CTR_XMM
+ inc XCTR_CTR
+ vmovq XCTR_CTR, AESDATA0_XMM
+ vinserti128 $1, AESDATA0_XMM, LE_CTR, LE_CTR
+ .else
+ vpbroadcastq XCTR_CTR, LE_CTR
+ vpsrldq $8, LE_CTR, LE_CTR
+ vpaddq .Lctr_pattern(%rip), LE_CTR, LE_CTR
+ .endif
+ _vbroadcast128 (XCTR_IV_PTR), XCTR_IV
+.else
+ _vbroadcast128 (LE_CTR_PTR), LE_CTR
+ .if VL > 16
+ vpaddq .Lctr_pattern(%rip), LE_CTR, LE_CTR
+ .endif
+ _vbroadcast128 .Lbswap_mask(%rip), BSWAP_MASK
+.endif
+
+.if VL == 16
+ _vbroadcast128 .Lone(%rip), LE_CTR_INC1
+.elseif VL == 32
+ _vbroadcast128 .Ltwo(%rip), LE_CTR_INC1
+.else
+ _vbroadcast128 .Lfour(%rip), LE_CTR_INC1
+.endif
+ vpsllq $1, LE_CTR_INC1, LE_CTR_INC2
+
+ // Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
+ movl 480(KEY), %eax
+
+ // Compute the pointer to the last round key.
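+	// It is at byte offset 16*nrounds = 16*(keylen/4 + 6) = keylen*4 + 6*16
+	// from the start of the key schedule, hence the scale-by-4 here.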
+ lea 6*16(KEY, %rax, 4), RNDKEYLAST_PTR
+
+ // Load the zero-th and last round keys.
+ _vbroadcast128 (KEY), RNDKEY0
+ _vbroadcast128 (RNDKEYLAST_PTR), RNDKEYLAST
+
+ // Make KEY point to the first round key.
+ add $16, KEY
+
+ // This is the main loop, which encrypts 8 vectors of data at a time.
+ add $-8*VL, LEN
+ jl .Lloop_8x_done\@
+.Lloop_8x\@:
+ _prepare_2_ctr_vecs \is_xctr, 0, 1
+ _prepare_2_ctr_vecs \is_xctr, 2, 3
+ _prepare_2_ctr_vecs \is_xctr, 4, 5
+ _prepare_2_ctr_vecs \is_xctr, 6, 7
+ _aesenc_loop 0,1,2,3,4,5,6,7
+ _aesenclast_and_xor 0,1,2,3,4,5,6,7
+ sub $-8*VL, SRC
+ sub $-8*VL, DST
+ add $-8*VL, LEN
+ jge .Lloop_8x\@
+.Lloop_8x_done\@:
+ sub $-8*VL, LEN
+ jz .Ldone\@
+
+ // 1 <= LEN < 8*VL. Generate 2, 4, or 8 more vectors of keystream
+ // blocks, depending on the remaining LEN.
+
+ _prepare_2_ctr_vecs \is_xctr, 0, 1
+ _prepare_2_ctr_vecs \is_xctr, 2, 3
+ cmp $4*VL, LEN
+ jle .Lenc_tail_atmost4vecs\@
+
+ // 4*VL < LEN < 8*VL. Generate 8 vectors of keystream blocks. Use the
+ // first 4 to XOR 4 full vectors of data. Then XOR the remaining data.
+ _prepare_2_ctr_vecs \is_xctr, 4, 5
+ _prepare_2_ctr_vecs \is_xctr, 6, 7, final=1
+ _aesenc_loop 0,1,2,3,4,5,6,7
+ _aesenclast_and_xor 0,1,2,3
+ vaesenclast RNDKEYLAST, AESDATA4, AESDATA0
+ vaesenclast RNDKEYLAST, AESDATA5, AESDATA1
+ vaesenclast RNDKEYLAST, AESDATA6, AESDATA2
+ vaesenclast RNDKEYLAST, AESDATA7, AESDATA3
+ sub $-4*VL, SRC
+ sub $-4*VL, DST
+ add $-4*VL, LEN
+ cmp $1*VL-1, LEN
+ jle .Lxor_tail_partial_vec_0\@
+ _xor_data 0
+ cmp $2*VL-1, LEN
+ jle .Lxor_tail_partial_vec_1\@
+ _xor_data 1
+ cmp $3*VL-1, LEN
+ jle .Lxor_tail_partial_vec_2\@
+ _xor_data 2
+ cmp $4*VL-1, LEN
+ jle .Lxor_tail_partial_vec_3\@
+ _xor_data 3
+ jmp .Ldone\@
+
+.Lenc_tail_atmost4vecs\@:
+ cmp $2*VL, LEN
+ jle .Lenc_tail_atmost2vecs\@
+
+ // 2*VL < LEN <= 4*VL. Generate 4 vectors of keystream blocks. Use the
+ // first 2 to XOR 2 full vectors of data. Then XOR the remaining data.
+ _aesenc_loop 0,1,2,3
+ _aesenclast_and_xor 0,1
+ vaesenclast RNDKEYLAST, AESDATA2, AESDATA0
+ vaesenclast RNDKEYLAST, AESDATA3, AESDATA1
+ sub $-2*VL, SRC
+ sub $-2*VL, DST
+ add $-2*VL, LEN
+ jmp .Lxor_tail_upto2vecs\@
+
+.Lenc_tail_atmost2vecs\@:
+ // 1 <= LEN <= 2*VL. Generate 2 vectors of keystream blocks. Then XOR
+ // the remaining data.
+ _aesenc_loop 0,1
+ vaesenclast RNDKEYLAST, AESDATA0, AESDATA0
+ vaesenclast RNDKEYLAST, AESDATA1, AESDATA1
+
+.Lxor_tail_upto2vecs\@:
+ cmp $1*VL-1, LEN
+ jle .Lxor_tail_partial_vec_0\@
+ _xor_data 0
+ cmp $2*VL-1, LEN
+ jle .Lxor_tail_partial_vec_1\@
+ _xor_data 1
+ jmp .Ldone\@
+
+.Lxor_tail_partial_vec_1\@:
+ add $-1*VL, LEN
+ jz .Ldone\@
+ sub $-1*VL, SRC
+ sub $-1*VL, DST
+ _vmovdqa AESDATA1, AESDATA0
+ jmp .Lxor_tail_partial_vec_0\@
+
+.Lxor_tail_partial_vec_2\@:
+ add $-2*VL, LEN
+ jz .Ldone\@
+ sub $-2*VL, SRC
+ sub $-2*VL, DST
+ _vmovdqa AESDATA2, AESDATA0
+ jmp .Lxor_tail_partial_vec_0\@
+
+.Lxor_tail_partial_vec_3\@:
+ add $-3*VL, LEN
+ jz .Ldone\@
+ sub $-3*VL, SRC
+ sub $-3*VL, DST
+ _vmovdqa AESDATA3, AESDATA0
+
+.Lxor_tail_partial_vec_0\@:
+ // XOR the remaining 1 <= LEN < VL bytes. It's easy if masked
+ // loads/stores are available; otherwise it's a bit harder...
+.if USE_AVX512
+ mov $-1, %rax
+ bzhi LEN64, %rax, %rax
+ kmovq %rax, %k1
+ vmovdqu8 (SRC), AESDATA1{%k1}{z}
+ vpxord AESDATA1, AESDATA0, AESDATA0
+ vmovdqu8 AESDATA0, (DST){%k1}
+.else
+ .if VL == 32
+ cmp $16, LEN
+ jl 1f
+ vpxor (SRC), AESDATA0_XMM, AESDATA1_XMM
+ vmovdqu AESDATA1_XMM, (DST)
+ add $16, SRC
+ add $16, DST
+ sub $16, LEN
+ jz .Ldone\@
+ vextracti128 $1, AESDATA0, AESDATA0_XMM
+1:
+ .endif
+ mov LEN, %r10d
+ _load_partial_block SRC, AESDATA1_XMM, KEY, KEY32
+ vpxor AESDATA1_XMM, AESDATA0_XMM, AESDATA0_XMM
+ mov %r10d, %ecx
+ _store_partial_block AESDATA0_XMM, DST, KEY, KEY32
+.endif
+
+.Ldone\@:
+.if VL > 16
+ vzeroupper
+.endif
+ RET
+.endm
+
+// Below are the definitions of the functions generated by the above macro.
+// They have the following prototypes:
+//
+// void aes_ctr64_crypt_##suffix(const struct crypto_aes_ctx *key,
+// const u8 *src, u8 *dst, int len,
+// const u64 le_ctr[2]);
+//
+// void aes_xctr_crypt_##suffix(const struct crypto_aes_ctx *key,
+// const u8 *src, u8 *dst, int len,
+// const u8 iv[AES_BLOCK_SIZE], u64 ctr);
+//
+// Both functions generate |len| bytes of keystream, XOR it with the data from
+// |src|, and write the result to |dst|. On non-final calls, |len| must be a
+// multiple of 16. On the final call, |len| can be any value.
+//
+// aes_ctr64_crypt_* implement "regular" CTR, where the keystream is generated
+// from a 128-bit big endian counter that increments by 1 for each AES block.
+// HOWEVER, to keep the assembly code simple, some of the counter management is
+// left to the caller. aes_ctr64_crypt_* take the counter in little endian
+// form, only increment the low 64 bits internally, do the conversion to big
+// endian internally, and don't write the updated counter back to memory. The
+// caller is responsible for converting the starting IV to the little endian
+// le_ctr, detecting the (very rare) case of a carry out of the low 64 bits
+// being needed and splitting at that point with a carry done in between, and
+// updating le_ctr after each part if the message is multi-part.
+//
+// aes_xctr_crypt_* implement XCTR as specified in "Length-preserving encryption
+// with HCTR2" (https://eprint.iacr.org/2021/1441.pdf). XCTR is an
+// easier-to-implement variant of CTR that uses little endian byte order and
+// eliminates carries. |ctr| is the per-message block counter starting at 1.
+
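+// As a rough illustration of the caller's side of this contract (a sketch, not
+// part of this file; get_unaligned_be64() is the kernel's unaligned-access
+// helper), the starting big endian counter block 'iv' would be converted with
+// something like:
+//
+//	le_ctr[0] = get_unaligned_be64(iv + 8);	// low 64 counter bits
+//	le_ctr[1] = get_unaligned_be64(iv);	// high 64 counter bits
+//
+// and if le_ctr[0] would wrap within the message, the caller splits the call
+// at that block boundary, does le_ctr[1]++ and le_ctr[0] = 0 in between, and
+// then processes the remainder with a second call.
+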
+.set VL, 16
+.set USE_AVX512, 0
+SYM_TYPED_FUNC_START(aes_ctr64_crypt_aesni_avx)
+ _aes_ctr_crypt 0
+SYM_FUNC_END(aes_ctr64_crypt_aesni_avx)
+SYM_TYPED_FUNC_START(aes_xctr_crypt_aesni_avx)
+ _aes_ctr_crypt 1
+SYM_FUNC_END(aes_xctr_crypt_aesni_avx)
+
+.set VL, 32
+.set USE_AVX512, 0
+SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx2)
+ _aes_ctr_crypt 0
+SYM_FUNC_END(aes_ctr64_crypt_vaes_avx2)
+SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx2)
+ _aes_ctr_crypt 1
+SYM_FUNC_END(aes_xctr_crypt_vaes_avx2)
+
+.set VL, 64
+.set USE_AVX512, 1
+SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx512)
+ _aes_ctr_crypt 0
+SYM_FUNC_END(aes_ctr64_crypt_vaes_avx512)
+SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx512)
+ _aes_ctr_crypt 1
+SYM_FUNC_END(aes_xctr_crypt_vaes_avx512)
diff --git a/arch/x86/crypto/aes-gcm-aesni-x86_64.S b/arch/x86/crypto/aes-gcm-aesni-x86_64.S
new file mode 100644
index 000000000000..7c8a8a32bd3c
--- /dev/null
+++ b/arch/x86/crypto/aes-gcm-aesni-x86_64.S
@@ -0,0 +1,1128 @@
+/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
+//
+// AES-NI optimized AES-GCM for x86_64
+//
+// Copyright 2024 Google LLC
+//
+// Author: Eric Biggers <ebiggers@google.com>
+//
+//------------------------------------------------------------------------------
+//
+// This file is dual-licensed, meaning that you can use it under your choice of
+// either of the following two licenses:
+//
+// Licensed under the Apache License 2.0 (the "License"). You may obtain a copy
+// of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// or
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+//
+// This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that
+// support the original set of AES instructions, i.e. AES-NI. Two
+// implementations are provided, one that uses AVX and one that doesn't. They
+// are very similar, being generated by the same macros. The only difference is
+// that the AVX implementation takes advantage of VEX-coded instructions in some
+// places to avoid some 'movdqu' and 'movdqa' instructions. The AVX
+// implementation does *not* use 256-bit vectors, as AES is not supported on
+// 256-bit vectors until the VAES feature (which this file doesn't target).
+//
+// The specific CPU feature prerequisites are AES-NI and PCLMULQDQ, plus SSE4.1
+// for the *_aesni functions or AVX for the *_aesni_avx ones. (But it seems
+// there are no CPUs that support AES-NI without also PCLMULQDQ and SSE4.1.)
+//
+// The design generally follows that of aes-gcm-vaes-avx512.S, and that file is
+// more thoroughly commented. This file has the following notable changes:
+//
+// - The vector length is fixed at 128-bit, i.e. xmm registers. This means
+// there is only one AES block (and GHASH block) per register.
+//
+// - Without AVX512, only 16 SIMD registers are available instead of 32. We
+// work around this by being much more careful about using registers,
+//   relying heavily on reloading values from memory as they are needed.
+//
+// - Masking is not available either. We work around this by implementing
+// partial block loads and stores using overlapping scalar loads and stores
+// combined with shifts and SSE4.1 insertion and extraction instructions.
+//
+// - The main loop is organized differently due to the different design
+// constraints. First, with just one AES block per SIMD register, on some
+// CPUs 4 registers don't saturate the 'aesenc' throughput. We therefore
+// do an 8-register wide loop. Considering that and the fact that we have
+// just 16 SIMD registers to work with, it's not feasible to cache AES
+// round keys and GHASH key powers in registers across loop iterations.
+// That's not ideal, but also not actually that bad, since loads can run in
+// parallel with other instructions. Significantly, this also makes it
+// possible to roll up the inner loops, relying on hardware loop unrolling
+// instead of software loop unrolling, greatly reducing code size.
+//
+// - We implement the GHASH multiplications in the main loop using Karatsuba
+// multiplication instead of schoolbook multiplication. This saves one
+// pclmulqdq instruction per block, at the cost of one 64-bit load, one
+// pshufd, and 0.25 pxors per block. (This is without the three-argument
+// XOR support that would be provided by AVX512, which would be more
+// beneficial to schoolbook than Karatsuba.)
+//
+// As a rough approximation, we can assume that Karatsuba multiplication is
+// faster than schoolbook multiplication in this context if one pshufd and
+// 0.25 pxors are cheaper than a pclmulqdq. (We assume that the 64-bit
+// load is "free" due to running in parallel with arithmetic instructions.)
+// This is true on AMD CPUs, including all that support pclmulqdq up to at
+// least Zen 3. It's also true on older Intel CPUs: Westmere through
+// Haswell on the Core side, and Silvermont through Goldmont Plus on the
+// low-power side. On some of these CPUs, pclmulqdq is quite slow, and the
+// benefit of Karatsuba should be substantial. On newer Intel CPUs,
+// schoolbook multiplication should be faster, but only marginally.
+//
+// Not all these CPUs were available to be tested. However, benchmarks on
+// available CPUs suggest that this approximation is plausible. Switching
+// to Karatsuba showed negligible change (< 1%) on Intel Broadwell,
+// Skylake, and Cascade Lake, but it improved AMD Zen 1-3 by 6-7%.
+// Considering that and the fact that Karatsuba should be even more
+// beneficial on older Intel CPUs, it seems like the right choice here.
+//
+// An additional 0.25 pclmulqdq per block (2 per 8 blocks) could be
+// saved by using a multiplication-less reduction method. We don't do that
+// because it would require a large number of shift and xor instructions,
+// making it less worthwhile and likely harmful on newer CPUs.
+//
+// It does make sense to sometimes use a different reduction optimization
+// that saves a pclmulqdq, though: precompute the hash key times x^64, and
+// multiply the low half of the data block by the hash key with the extra
+// factor of x^64. This eliminates one step of the reduction. However,
+// this is incompatible with Karatsuba multiplication. Therefore, for
+// multi-block processing we use Karatsuba multiplication with a regular
+// reduction. For single-block processing, we use the x^64 optimization.
+
+#include <linux/linkage.h>
+
+.section .rodata
+.p2align 4
+.Lbswap_mask:
+ .octa 0x000102030405060708090a0b0c0d0e0f
+.Lgfpoly:
+ .quad 0xc200000000000000
+.Lone:
+ .quad 1
+.Lgfpoly_and_internal_carrybit:
+ .octa 0xc2000000000000010000000000000001
+ // Loading 16 bytes from '.Lzeropad_mask + 16 - len' produces a mask of
+ // 'len' 0xff bytes and the rest zeroes.
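+	// For example, len = 3 loads the last three 0xff bytes of the first
+	// .octa followed by thirteen zero bytes, i.e. a mask that keeps only
+	// the first 3 bytes of a block.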
+.Lzeropad_mask:
+ .octa 0xffffffffffffffffffffffffffffffff
+ .octa 0
+
+// Offsets in struct aes_gcm_key_aesni
+#define OFFSETOF_AESKEYLEN 480
+#define OFFSETOF_H_POWERS 496
+#define OFFSETOF_H_POWERS_XORED 624
+#define OFFSETOF_H_TIMES_X64 688
+
+.text
+
+// Do a vpclmulqdq, or fall back to a movdqa and a pclmulqdq. The fallback
+// assumes that all operands are distinct and that any mem operand is aligned.
+.macro _vpclmulqdq imm, src1, src2, dst
+.if USE_AVX
+ vpclmulqdq \imm, \src1, \src2, \dst
+.else
+ movdqa \src2, \dst
+ pclmulqdq \imm, \src1, \dst
+.endif
+.endm
+
+// Do a vpshufb, or fall back to a movdqa and a pshufb. The fallback assumes
+// that all operands are distinct and that any mem operand is aligned.
+.macro _vpshufb src1, src2, dst
+.if USE_AVX
+ vpshufb \src1, \src2, \dst
+.else
+ movdqa \src2, \dst
+ pshufb \src1, \dst
+.endif
+.endm
+
+// Do a vpand, or fall back to a movdqu and a pand. The fallback assumes that
+// all operands are distinct.
+.macro _vpand src1, src2, dst
+.if USE_AVX
+ vpand \src1, \src2, \dst
+.else
+ movdqu \src1, \dst
+ pand \src2, \dst
+.endif
+.endm
+
+// XOR the unaligned memory operand \mem into the xmm register \reg. \tmp must
+// be a temporary xmm register.
+.macro _xor_mem_to_reg mem, reg, tmp
+.if USE_AVX
+ vpxor \mem, \reg, \reg
+.else
+ movdqu \mem, \tmp
+ pxor \tmp, \reg
+.endif
+.endm
+
+// Test the unaligned memory operand \mem against the xmm register \reg. \tmp
+// must be a temporary xmm register.
+.macro _test_mem mem, reg, tmp
+.if USE_AVX
+ vptest \mem, \reg
+.else
+ movdqu \mem, \tmp
+ ptest \tmp, \reg
+.endif
+.endm
+
+// Load 1 <= %ecx <= 15 bytes from the pointer \src into the xmm register \dst
+// and zeroize any remaining bytes. Clobbers %rax, %rcx, and \tmp{64,32}.
+.macro _load_partial_block src, dst, tmp64, tmp32
+ sub $8, %ecx // LEN - 8
+ jle .Lle8\@
+
+ // Load 9 <= LEN <= 15 bytes.
+ movq (\src), \dst // Load first 8 bytes
+ mov (\src, %rcx), %rax // Load last 8 bytes
+ neg %ecx
+ shl $3, %ecx
+ shr %cl, %rax // Discard overlapping bytes
+ pinsrq $1, %rax, \dst
+ jmp .Ldone\@
+
+.Lle8\@:
+ add $4, %ecx // LEN - 4
+ jl .Llt4\@
+
+ // Load 4 <= LEN <= 8 bytes.
+ mov (\src), %eax // Load first 4 bytes
+ mov (\src, %rcx), \tmp32 // Load last 4 bytes
+ jmp .Lcombine\@
+
+.Llt4\@:
+ // Load 1 <= LEN <= 3 bytes.
+ add $2, %ecx // LEN - 2
+ movzbl (\src), %eax // Load first byte
+ jl .Lmovq\@
+ movzwl (\src, %rcx), \tmp32 // Load last 2 bytes
+.Lcombine\@:
+ shl $3, %ecx
+ shl %cl, \tmp64
+ or \tmp64, %rax // Combine the two parts
+.Lmovq\@:
+ movq %rax, \dst
+.Ldone\@:
+.endm
+
+// Store 1 <= %ecx <= 15 bytes from the xmm register \src to the pointer \dst.
+// Clobbers %rax, %rcx, and %rsi.
+.macro _store_partial_block src, dst
+ sub $8, %ecx // LEN - 8
+ jl .Llt8\@
+
+ // Store 8 <= LEN <= 15 bytes.
+ pextrq $1, \src, %rax
+ mov %ecx, %esi
+ shl $3, %ecx
+ ror %cl, %rax
+ mov %rax, (\dst, %rsi) // Store last LEN - 8 bytes
+ movq \src, (\dst) // Store first 8 bytes
+ jmp .Ldone\@
+
+.Llt8\@:
+ add $4, %ecx // LEN - 4
+ jl .Llt4\@
+
+ // Store 4 <= LEN <= 7 bytes.
+ pextrd $1, \src, %eax
+ mov %ecx, %esi
+ shl $3, %ecx
+ ror %cl, %eax
+ mov %eax, (\dst, %rsi) // Store last LEN - 4 bytes
+ movd \src, (\dst) // Store first 4 bytes
+ jmp .Ldone\@
+
+.Llt4\@:
+ // Store 1 <= LEN <= 3 bytes.
+ pextrb $0, \src, 0(\dst)
+ cmp $-2, %ecx // LEN - 4 == -2, i.e. LEN == 2?
+ jl .Ldone\@
+ pextrb $1, \src, 1(\dst)
+ je .Ldone\@
+ pextrb $2, \src, 2(\dst)
+.Ldone\@:
+.endm
+
+// Do one step of GHASH-multiplying \a by \b and storing the reduced product in
+// \b. To complete all steps, this must be invoked with \i=0 through \i=9.
+// \a_times_x64 must contain \a * x^64 in reduced form, \gfpoly must contain the
+// .Lgfpoly constant, and \t0-\t1 must be temporary registers.
+.macro _ghash_mul_step i, a, a_times_x64, b, gfpoly, t0, t1
+
+ // MI = (a_L * b_H) + ((a*x^64)_L * b_L)
+.if \i == 0
+ _vpclmulqdq $0x01, \a, \b, \t0
+.elseif \i == 1
+ _vpclmulqdq $0x00, \a_times_x64, \b, \t1
+.elseif \i == 2
+ pxor \t1, \t0
+
+ // HI = (a_H * b_H) + ((a*x^64)_H * b_L)
+.elseif \i == 3
+ _vpclmulqdq $0x11, \a, \b, \t1
+.elseif \i == 4
+ pclmulqdq $0x10, \a_times_x64, \b
+.elseif \i == 5
+ pxor \t1, \b
+.elseif \i == 6
+
+ // Fold MI into HI.
+ pshufd $0x4e, \t0, \t1 // Swap halves of MI
+.elseif \i == 7
+ pclmulqdq $0x00, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57)
+.elseif \i == 8
+ pxor \t1, \b
+.elseif \i == 9
+ pxor \t0, \b
+.endif
+.endm
+
+// GHASH-multiply \a by \b and store the reduced product in \b.
+// See _ghash_mul_step for details.
+.macro _ghash_mul a, a_times_x64, b, gfpoly, t0, t1
+.irp i, 0,1,2,3,4,5,6,7,8,9
+ _ghash_mul_step \i, \a, \a_times_x64, \b, \gfpoly, \t0, \t1
+.endr
+.endm
+
+// GHASH-multiply \a by \b and add the unreduced product to \lo, \mi, and \hi.
+// This does Karatsuba multiplication and must be paired with _ghash_reduce. On
+// the first call, \lo, \mi, and \hi must be zero. \a_xored must contain the
+// two halves of \a XOR'd together, i.e. a_L + a_H. \b is clobbered.
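+// (Karatsuba over GF(2): a_L*b_H + a_H*b_L = (a_L + a_H)*(b_L + b_H) +
+// a_L*b_L + a_H*b_H, which is why _ghash_reduce later folds LO and HI into MI.)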
+.macro _ghash_mul_noreduce a, a_xored, b, lo, mi, hi, t0
+
+ // LO += a_L * b_L
+ _vpclmulqdq $0x00, \a, \b, \t0
+ pxor \t0, \lo
+
+ // b_L + b_H
+ pshufd $0x4e, \b, \t0
+ pxor \b, \t0
+
+ // HI += a_H * b_H
+ pclmulqdq $0x11, \a, \b
+ pxor \b, \hi
+
+ // MI += (a_L + a_H) * (b_L + b_H)
+ pclmulqdq $0x00, \a_xored, \t0
+ pxor \t0, \mi
+.endm
+
+// Reduce the product from \lo, \mi, and \hi, and store the result in \dst.
+// This assumes that _ghash_mul_noreduce was used.
+.macro _ghash_reduce lo, mi, hi, dst, t0
+
+ movq .Lgfpoly(%rip), \t0
+
+ // MI += LO + HI (needed because we used Karatsuba multiplication)
+ pxor \lo, \mi
+ pxor \hi, \mi
+
+ // Fold LO into MI.
+ pshufd $0x4e, \lo, \dst
+ pclmulqdq $0x00, \t0, \lo
+ pxor \dst, \mi
+ pxor \lo, \mi
+
+ // Fold MI into HI.
+ pshufd $0x4e, \mi, \dst
+ pclmulqdq $0x00, \t0, \mi
+ pxor \hi, \dst
+ pxor \mi, \dst
+.endm
+
+// Do the first step of the GHASH update of a set of 8 ciphertext blocks.
+//
+// The whole GHASH update does:
+//
+// GHASH_ACC = (blk0+GHASH_ACC)*H^8 + blk1*H^7 + blk2*H^6 + blk3*H^5 +
+// blk4*H^4 + blk5*H^3 + blk6*H^2 + blk7*H^1
+//
+// This macro just does the first step: it does the unreduced multiplication
+// (blk0+GHASH_ACC)*H^8 and starts gathering the unreduced product in the xmm
+// registers LO, MI, and GHASH_ACC a.k.a. HI. It also zero-initializes the
+// inner block counter in %rax, which is a value that counts up by 8 for each
+// block in the set of 8 and is used later to index by 8*blknum and 16*blknum.
+//
+// To reduce the number of pclmulqdq instructions required, both this macro and
+// _ghash_update_continue_8x use Karatsuba multiplication instead of schoolbook
+// multiplication. See the file comment for more details about this choice.
+//
+// Both macros expect the ciphertext blocks blk[0-7] to be available at DST if
+// encrypting, or SRC if decrypting. They also expect the precomputed hash key
+// powers H^i and their XOR'd-together halves to be available in the struct
+// pointed to by KEY. Both macros clobber TMP[0-2].
+.macro _ghash_update_begin_8x enc
+
+ // Initialize the inner block counter.
+ xor %eax, %eax
+
+ // Load the highest hash key power, H^8.
+ movdqa OFFSETOF_H_POWERS(KEY), TMP0
+
+ // Load the first ciphertext block and byte-reflect it.
+.if \enc
+ movdqu (DST), TMP1
+.else
+ movdqu (SRC), TMP1
+.endif
+ pshufb BSWAP_MASK, TMP1
+
+ // Add the GHASH accumulator to the ciphertext block to get the block
+ // 'b' that needs to be multiplied with the hash key power 'a'.
+ pxor TMP1, GHASH_ACC
+
+ // b_L + b_H
+ pshufd $0x4e, GHASH_ACC, MI
+ pxor GHASH_ACC, MI
+
+ // LO = a_L * b_L
+ _vpclmulqdq $0x00, TMP0, GHASH_ACC, LO
+
+ // HI = a_H * b_H
+ pclmulqdq $0x11, TMP0, GHASH_ACC
+
+ // MI = (a_L + a_H) * (b_L + b_H)
+ pclmulqdq $0x00, OFFSETOF_H_POWERS_XORED(KEY), MI
+.endm
+
+// Continue the GHASH update of 8 ciphertext blocks as described above by doing
+// an unreduced multiplication of the next ciphertext block by the next lowest
+// key power and accumulating the result into LO, MI, and GHASH_ACC a.k.a. HI.
+.macro _ghash_update_continue_8x enc
+ add $8, %eax
+
+ // Load the next lowest key power.
+ movdqa OFFSETOF_H_POWERS(KEY,%rax,2), TMP0
+
+ // Load the next ciphertext block and byte-reflect it.
+.if \enc
+ movdqu (DST,%rax,2), TMP1
+.else
+ movdqu (SRC,%rax,2), TMP1
+.endif
+ pshufb BSWAP_MASK, TMP1
+
+ // LO += a_L * b_L
+ _vpclmulqdq $0x00, TMP0, TMP1, TMP2
+ pxor TMP2, LO
+
+ // b_L + b_H
+ pshufd $0x4e, TMP1, TMP2
+ pxor TMP1, TMP2
+
+ // HI += a_H * b_H
+ pclmulqdq $0x11, TMP0, TMP1
+ pxor TMP1, GHASH_ACC
+
+ // MI += (a_L + a_H) * (b_L + b_H)
+ movq OFFSETOF_H_POWERS_XORED(KEY,%rax), TMP1
+ pclmulqdq $0x00, TMP1, TMP2
+ pxor TMP2, MI
+.endm
+
+// Reduce LO, MI, and GHASH_ACC a.k.a. HI into GHASH_ACC. This is similar to
+// _ghash_reduce, but it's hardcoded to use the registers of the main loop and
+// it uses the same register for HI and the destination. It's also divided into
+// two steps. TMP1 must be preserved across steps.
+//
+// One pshufd could be saved by shuffling MI and XOR'ing LO into it, instead of
+// shuffling LO, XOR'ing LO into MI, and shuffling MI. However, this would
+// increase the critical path length, and it seems to slightly hurt performance.
+.macro _ghash_update_end_8x_step i
+.if \i == 0
+ movq .Lgfpoly(%rip), TMP1
+ pxor LO, MI
+ pxor GHASH_ACC, MI
+ pshufd $0x4e, LO, TMP2
+ pclmulqdq $0x00, TMP1, LO
+ pxor TMP2, MI
+ pxor LO, MI
+.elseif \i == 1
+ pshufd $0x4e, MI, TMP2
+ pclmulqdq $0x00, TMP1, MI
+ pxor TMP2, GHASH_ACC
+ pxor MI, GHASH_ACC
+.endif
+.endm
+
+// void aes_gcm_precompute_##suffix(struct aes_gcm_key_aesni *key);
+//
+// Given the expanded AES key, derive the GHASH subkey and initialize the GHASH
+// related fields in the key struct.
+.macro _aes_gcm_precompute
+
+ // Function arguments
+ .set KEY, %rdi
+
+ // Additional local variables.
+ // %xmm0-%xmm1 and %rax are used as temporaries.
+ .set RNDKEYLAST_PTR, %rsi
+ .set H_CUR, %xmm2
+ .set H_POW1, %xmm3 // H^1
+ .set H_POW1_X64, %xmm4 // H^1 * x^64
+ .set GFPOLY, %xmm5
+
+ // Encrypt an all-zeroes block to get the raw hash subkey.
+ movl OFFSETOF_AESKEYLEN(KEY), %eax
+ lea 6*16(KEY,%rax,4), RNDKEYLAST_PTR
+ movdqa (KEY), H_POW1 // Zero-th round key XOR all-zeroes block
+ lea 16(KEY), %rax
+1:
+ aesenc (%rax), H_POW1
+ add $16, %rax
+ cmp %rax, RNDKEYLAST_PTR
+ jne 1b
+ aesenclast (RNDKEYLAST_PTR), H_POW1
+
+ // Preprocess the raw hash subkey as needed to operate on GHASH's
+ // bit-reflected values directly: reflect its bytes, then multiply it by
+ // x^-1 (using the backwards interpretation of polynomial coefficients
+ // from the GCM spec) or equivalently x^1 (using the alternative,
+ // natural interpretation of polynomial coefficients).
+ pshufb .Lbswap_mask(%rip), H_POW1
+ movdqa H_POW1, %xmm0
+ pshufd $0xd3, %xmm0, %xmm0
+ psrad $31, %xmm0
+ paddq H_POW1, H_POW1
+ pand .Lgfpoly_and_internal_carrybit(%rip), %xmm0
+ pxor %xmm0, H_POW1
+
+ // Store H^1.
+ movdqa H_POW1, OFFSETOF_H_POWERS+7*16(KEY)
+
+ // Compute and store H^1 * x^64.
+ movq .Lgfpoly(%rip), GFPOLY
+ pshufd $0x4e, H_POW1, %xmm0
+ _vpclmulqdq $0x00, H_POW1, GFPOLY, H_POW1_X64
+ pxor %xmm0, H_POW1_X64
+ movdqa H_POW1_X64, OFFSETOF_H_TIMES_X64(KEY)
+
+ // Compute and store the halves of H^1 XOR'd together.
+ pxor H_POW1, %xmm0
+ movq %xmm0, OFFSETOF_H_POWERS_XORED+7*8(KEY)
+
+ // Compute and store the remaining key powers H^2 through H^8.
+ movdqa H_POW1, H_CUR
+ mov $6*8, %eax
+.Lprecompute_next\@:
+ // Compute H^i = H^{i-1} * H^1.
+ _ghash_mul H_POW1, H_POW1_X64, H_CUR, GFPOLY, %xmm0, %xmm1
+ // Store H^i.
+ movdqa H_CUR, OFFSETOF_H_POWERS(KEY,%rax,2)
+ // Compute and store the halves of H^i XOR'd together.
+ pshufd $0x4e, H_CUR, %xmm0
+ pxor H_CUR, %xmm0
+ movq %xmm0, OFFSETOF_H_POWERS_XORED(KEY,%rax)
+ sub $8, %eax
+ jge .Lprecompute_next\@
+
+ RET
+.endm
+
+// void aes_gcm_aad_update_aesni(const struct aes_gcm_key_aesni *key,
+// u8 ghash_acc[16], const u8 *aad, int aadlen);
+//
+// This function processes the AAD (Additional Authenticated Data) in GCM.
+// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the
+// data given by |aad| and |aadlen|. On the first call, |ghash_acc| must be all
+// zeroes. |aadlen| must be a multiple of 16, except on the last call where it
+// can be any length. The caller must do any buffering needed to ensure this.
+.macro _aes_gcm_aad_update
+
+ // Function arguments
+ .set KEY, %rdi
+ .set GHASH_ACC_PTR, %rsi
+ .set AAD, %rdx
+ .set AADLEN, %ecx
+ // Note: _load_partial_block relies on AADLEN being in %ecx.
+
+ // Additional local variables.
+ // %rax, %r10, and %xmm0-%xmm1 are used as temporary registers.
+ .set BSWAP_MASK, %xmm2
+ .set GHASH_ACC, %xmm3
+ .set H_POW1, %xmm4 // H^1
+ .set H_POW1_X64, %xmm5 // H^1 * x^64
+ .set GFPOLY, %xmm6
+
+ movdqa .Lbswap_mask(%rip), BSWAP_MASK
+ movdqu (GHASH_ACC_PTR), GHASH_ACC
+ movdqa OFFSETOF_H_POWERS+7*16(KEY), H_POW1
+ movdqa OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64
+ movq .Lgfpoly(%rip), GFPOLY
+
+ // Process the AAD one full block at a time.
+ sub $16, AADLEN
+ jl .Laad_loop_1x_done\@
+.Laad_loop_1x\@:
+ movdqu (AAD), %xmm0
+ pshufb BSWAP_MASK, %xmm0
+ pxor %xmm0, GHASH_ACC
+ _ghash_mul H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1
+ add $16, AAD
+ sub $16, AADLEN
+ jge .Laad_loop_1x\@
+.Laad_loop_1x_done\@:
+ // Check whether there is a partial block at the end.
+ add $16, AADLEN
+ jz .Laad_done\@
+
+ // Process a partial block of length 1 <= AADLEN <= 15.
+ // _load_partial_block assumes that %ecx contains AADLEN.
+ _load_partial_block AAD, %xmm0, %r10, %r10d
+ pshufb BSWAP_MASK, %xmm0
+ pxor %xmm0, GHASH_ACC
+ _ghash_mul H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1
+
+.Laad_done\@:
+ movdqu GHASH_ACC, (GHASH_ACC_PTR)
+ RET
+.endm
+
+// Increment LE_CTR eight times to generate eight little-endian counter blocks,
+// swap each to big-endian, and store them in AESDATA[0-7]. Also XOR them with
+// the zero-th AES round key. Clobbers TMP0 and TMP1.
+.macro _ctr_begin_8x
+ movq .Lone(%rip), TMP0
+ movdqa (KEY), TMP1 // zero-th round key
+.irp i, 0,1,2,3,4,5,6,7
+ _vpshufb BSWAP_MASK, LE_CTR, AESDATA\i
+ pxor TMP1, AESDATA\i
+ paddd TMP0, LE_CTR
+.endr
+.endm
+
+// Do a non-last round of AES on AESDATA[0-7] using \round_key.
+.macro _aesenc_8x round_key
+.irp i, 0,1,2,3,4,5,6,7
+ aesenc \round_key, AESDATA\i
+.endr
+.endm
+
+// Do the last round of AES on AESDATA[0-7] using \round_key.
+.macro _aesenclast_8x round_key
+.irp i, 0,1,2,3,4,5,6,7
+ aesenclast \round_key, AESDATA\i
+.endr
+.endm
+
+// XOR eight blocks from SRC with the keystream blocks in AESDATA[0-7], and
+// store the result to DST. Clobbers TMP0.
+.macro _xor_data_8x
+.irp i, 0,1,2,3,4,5,6,7
+ _xor_mem_to_reg \i*16(SRC), AESDATA\i, tmp=TMP0
+.endr
+.irp i, 0,1,2,3,4,5,6,7
+ movdqu AESDATA\i, \i*16(DST)
+.endr
+.endm
+
+// void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_aesni *key,
+// const u32 le_ctr[4], u8 ghash_acc[16],
+// const u8 *src, u8 *dst, int datalen);
+//
+// This macro generates a GCM encryption or decryption update function with the
+// above prototype (with \enc selecting which one).
+//
+// This function computes the next portion of the CTR keystream, XOR's it with
+// |datalen| bytes from |src|, and writes the resulting encrypted or decrypted
+// data to |dst|. It also updates the GHASH accumulator |ghash_acc| using the
+// next |datalen| ciphertext bytes.
+//
+// |datalen| must be a multiple of 16, except on the last call where it can be
+// any length. The caller must do any buffering needed to ensure this. Both
+// in-place and out-of-place en/decryption are supported.
+//
+// |le_ctr| must give the current counter in little-endian format. For a new
+// message, the low word of the counter must be 2. This function loads the
+// counter from |le_ctr| and increments the loaded counter as needed, but it
+// does *not* store the updated counter back to |le_ctr|. The caller must
+// update |le_ctr| if any more data segments follow. Internally, only the low
+// 32-bit word of the counter is incremented, following the GCM standard.
+.macro _aes_gcm_update enc
+
+ // Function arguments
+ .set KEY, %rdi
+ .set LE_CTR_PTR, %rsi // Note: overlaps with usage as temp reg
+ .set GHASH_ACC_PTR, %rdx
+ .set SRC, %rcx
+ .set DST, %r8
+ .set DATALEN, %r9d
+ .set DATALEN64, %r9 // Zero-extend DATALEN before using!
+ // Note: the code setting up for _load_partial_block assumes that SRC is
+ // in %rcx (and that DATALEN is *not* in %rcx).
+
+ // Additional local variables
+
+ // %rax and %rsi are used as temporary registers. Note: %rsi overlaps
+ // with LE_CTR_PTR, which is used only at the beginning.
+
+ .set AESKEYLEN, %r10d // AES key length in bytes
+ .set AESKEYLEN64, %r10
+ .set RNDKEYLAST_PTR, %r11 // Pointer to last AES round key
+
+ // Put the most frequently used values in %xmm0-%xmm7 to reduce code
+ // size. (%xmm0-%xmm7 take fewer bytes to encode than %xmm8-%xmm15.)
+ .set TMP0, %xmm0
+ .set TMP1, %xmm1
+ .set TMP2, %xmm2
+ .set LO, %xmm3 // Low part of unreduced product
+ .set MI, %xmm4 // Middle part of unreduced product
+ .set GHASH_ACC, %xmm5 // GHASH accumulator; in main loop also
+ // the high part of unreduced product
+ .set BSWAP_MASK, %xmm6 // Shuffle mask for reflecting bytes
+ .set LE_CTR, %xmm7 // Little-endian counter value
+ .set AESDATA0, %xmm8
+ .set AESDATA1, %xmm9
+ .set AESDATA2, %xmm10
+ .set AESDATA3, %xmm11
+ .set AESDATA4, %xmm12
+ .set AESDATA5, %xmm13
+ .set AESDATA6, %xmm14
+ .set AESDATA7, %xmm15
+
+ movdqa .Lbswap_mask(%rip), BSWAP_MASK
+ movdqu (GHASH_ACC_PTR), GHASH_ACC
+ movdqu (LE_CTR_PTR), LE_CTR
+
+ movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN
+ lea 6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR
+
+ // If there are at least 8*16 bytes of data, then continue into the main
+ // loop, which processes 8*16 bytes of data per iteration.
+ //
+ // The main loop interleaves AES and GHASH to improve performance on
+ // CPUs that can execute these instructions in parallel. When
+ // decrypting, the GHASH input (the ciphertext) is immediately
+ // available. When encrypting, we instead encrypt a set of 8 blocks
+ // first and then GHASH those blocks while encrypting the next set of 8,
+ // repeat that as needed, and finally GHASH the last set of 8 blocks.
+ //
+ // Code size optimization: Prefer adding or subtracting -8*16 over 8*16,
+ // as this makes the immediate fit in a signed byte, saving 3 bytes.
+ add $-8*16, DATALEN
+ jl .Lcrypt_loop_8x_done\@
+.if \enc
+ // Encrypt the first 8 plaintext blocks.
+ _ctr_begin_8x
+ lea 16(KEY), %rsi
+ .p2align 4
+1:
+ movdqa (%rsi), TMP0
+ _aesenc_8x TMP0
+ add $16, %rsi
+ cmp %rsi, RNDKEYLAST_PTR
+ jne 1b
+ movdqa (%rsi), TMP0
+ _aesenclast_8x TMP0
+ _xor_data_8x
+ // Don't increment DST until the ciphertext blocks have been hashed.
+ sub $-8*16, SRC
+ add $-8*16, DATALEN
+ jl .Lghash_last_ciphertext_8x\@
+.endif
+
+ .p2align 4
+.Lcrypt_loop_8x\@:
+
+ // Generate the next set of 8 counter blocks and start encrypting them.
+ _ctr_begin_8x
+ lea 16(KEY), %rsi
+
+ // Do a round of AES, and start the GHASH update of 8 ciphertext blocks
+ // by doing the unreduced multiplication for the first ciphertext block.
+ movdqa (%rsi), TMP0
+ add $16, %rsi
+ _aesenc_8x TMP0
+ _ghash_update_begin_8x \enc
+
+ // Do 7 more rounds of AES, and continue the GHASH update by doing the
+ // unreduced multiplication for the remaining ciphertext blocks.
+ .p2align 4
+1:
+ movdqa (%rsi), TMP0
+ add $16, %rsi
+ _aesenc_8x TMP0
+ _ghash_update_continue_8x \enc
+ cmp $7*8, %eax
+ jne 1b
+
+ // Do the remaining AES rounds.
+ .p2align 4
+1:
+ movdqa (%rsi), TMP0
+ add $16, %rsi
+ _aesenc_8x TMP0
+ cmp %rsi, RNDKEYLAST_PTR
+ jne 1b
+
+ // Do the GHASH reduction and the last round of AES.
+ movdqa (RNDKEYLAST_PTR), TMP0
+ _ghash_update_end_8x_step 0
+ _aesenclast_8x TMP0
+ _ghash_update_end_8x_step 1
+
+ // XOR the data with the AES-CTR keystream blocks.
+.if \enc
+ sub $-8*16, DST
+.endif
+ _xor_data_8x
+ sub $-8*16, SRC
+.if !\enc
+ sub $-8*16, DST
+.endif
+ add $-8*16, DATALEN
+ jge .Lcrypt_loop_8x\@
+
+.if \enc
+.Lghash_last_ciphertext_8x\@:
+ // Update GHASH with the last set of 8 ciphertext blocks.
+ _ghash_update_begin_8x \enc
+ .p2align 4
+1:
+ _ghash_update_continue_8x \enc
+ cmp $7*8, %eax
+ jne 1b
+ _ghash_update_end_8x_step 0
+ _ghash_update_end_8x_step 1
+ sub $-8*16, DST
+.endif
+
+.Lcrypt_loop_8x_done\@:
+
+ sub $-8*16, DATALEN
+ jz .Ldone\@
+
+ // Handle the remainder of length 1 <= DATALEN < 8*16 bytes. We keep
+ // things simple and keep the code size down by just going one block at
+ // a time, again taking advantage of hardware loop unrolling. Since
+ // there are enough key powers available for all remaining data, we do
+ // the GHASH multiplications unreduced, and only reduce at the very end.
+
+ .set HI, TMP2
+ .set H_POW, AESDATA0
+ .set H_POW_XORED, AESDATA1
+ .set ONE, AESDATA2
+
+ movq .Lone(%rip), ONE
+
+ // Start collecting the unreduced GHASH intermediate value LO, MI, HI.
+ pxor LO, LO
+ pxor MI, MI
+ pxor HI, HI
+
+ // Set up a block counter %rax to contain 8*(8-n), where n is the number
+ // of blocks that remain, counting any partial block. This will be used
+ // to access the key powers H^n through H^1.
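+	// (The 'neg' plus 'and $~15' rounds -DATALEN down to
+	// -16*ceil(DATALEN/16); the arithmetic shift right by 1 then gives
+	// -8*n, and adding 64 gives 8*(8-n).)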
+ mov DATALEN, %eax
+ neg %eax
+ and $~15, %eax
+ sar $1, %eax
+ add $64, %eax
+
+ sub $16, DATALEN
+ jl .Lcrypt_loop_1x_done\@
+
+ // Process the data one full block at a time.
+.Lcrypt_loop_1x\@:
+
+ // Encrypt the next counter block.
+ _vpshufb BSWAP_MASK, LE_CTR, TMP0
+ paddd ONE, LE_CTR
+ pxor (KEY), TMP0
+ lea -6*16(RNDKEYLAST_PTR), %rsi // Reduce code size
+ cmp $24, AESKEYLEN
+ jl 128f // AES-128?
+ je 192f // AES-192?
+ // AES-256
+ aesenc -7*16(%rsi), TMP0
+ aesenc -6*16(%rsi), TMP0
+192:
+ aesenc -5*16(%rsi), TMP0
+ aesenc -4*16(%rsi), TMP0
+128:
+.irp i, -3,-2,-1,0,1,2,3,4,5
+ aesenc \i*16(%rsi), TMP0
+.endr
+ aesenclast (RNDKEYLAST_PTR), TMP0
+
+ // Load the next key power H^i.
+ movdqa OFFSETOF_H_POWERS(KEY,%rax,2), H_POW
+ movq OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED
+
+ // XOR the keystream block that was just generated in TMP0 with the next
+ // source data block and store the resulting en/decrypted data to DST.
+.if \enc
+ _xor_mem_to_reg (SRC), TMP0, tmp=TMP1
+ movdqu TMP0, (DST)
+.else
+ movdqu (SRC), TMP1
+ pxor TMP1, TMP0
+ movdqu TMP0, (DST)
+.endif
+
+ // Update GHASH with the ciphertext block.
+.if \enc
+ pshufb BSWAP_MASK, TMP0
+ pxor TMP0, GHASH_ACC
+.else
+ pshufb BSWAP_MASK, TMP1
+ pxor TMP1, GHASH_ACC
+.endif
+ _ghash_mul_noreduce H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0
+ pxor GHASH_ACC, GHASH_ACC
+
+ add $8, %eax
+ add $16, SRC
+ add $16, DST
+ sub $16, DATALEN
+ jge .Lcrypt_loop_1x\@
+.Lcrypt_loop_1x_done\@:
+ // Check whether there is a partial block at the end.
+ add $16, DATALEN
+ jz .Lghash_reduce\@
+
+ // Process a partial block of length 1 <= DATALEN <= 15.
+
+ // Encrypt a counter block for the last time.
+ pshufb BSWAP_MASK, LE_CTR
+ pxor (KEY), LE_CTR
+ lea 16(KEY), %rsi
+1:
+ aesenc (%rsi), LE_CTR
+ add $16, %rsi
+ cmp %rsi, RNDKEYLAST_PTR
+ jne 1b
+ aesenclast (RNDKEYLAST_PTR), LE_CTR
+
+ // Load the lowest key power, H^1.
+ movdqa OFFSETOF_H_POWERS(KEY,%rax,2), H_POW
+ movq OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED
+
+ // Load and zero-pad 1 <= DATALEN <= 15 bytes of data from SRC. SRC is
+ // in %rcx, but _load_partial_block needs DATALEN in %rcx instead.
+ // RNDKEYLAST_PTR is no longer needed, so reuse it for SRC.
+ mov SRC, RNDKEYLAST_PTR
+ mov DATALEN, %ecx
+ _load_partial_block RNDKEYLAST_PTR, TMP0, %rsi, %esi
+
+ // XOR the keystream block that was just generated in LE_CTR with the
+ // source data block and store the resulting en/decrypted data to DST.
+ pxor TMP0, LE_CTR
+ mov DATALEN, %ecx
+ _store_partial_block LE_CTR, DST
+
+ // If encrypting, zero-pad the final ciphertext block for GHASH. (If
+ // decrypting, this was already done by _load_partial_block.)
+.if \enc
+ lea .Lzeropad_mask+16(%rip), %rax
+ sub DATALEN64, %rax
+ _vpand (%rax), LE_CTR, TMP0
+.endif
+
+ // Update GHASH with the final ciphertext block.
+ pshufb BSWAP_MASK, TMP0
+ pxor TMP0, GHASH_ACC
+ _ghash_mul_noreduce H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0
+
+.Lghash_reduce\@:
+ // Finally, do the GHASH reduction.
+ _ghash_reduce LO, MI, HI, GHASH_ACC, TMP0
+
+.Ldone\@:
+ // Store the updated GHASH accumulator back to memory.
+ movdqu GHASH_ACC, (GHASH_ACC_PTR)
+
+ RET
+.endm
+
+// void aes_gcm_enc_final_##suffix(const struct aes_gcm_key_aesni *key,
+// const u32 le_ctr[4], u8 ghash_acc[16],
+// u64 total_aadlen, u64 total_datalen);
+// bool aes_gcm_dec_final_##suffix(const struct aes_gcm_key_aesni *key,
+// const u32 le_ctr[4], const u8 ghash_acc[16],
+// u64 total_aadlen, u64 total_datalen,
+// const u8 tag[16], int taglen);
+//
+// This macro generates one of the above two functions (with \enc selecting
+// which one). Both functions finish computing the GCM authentication tag by
+// updating GHASH with the lengths block and encrypting the GHASH accumulator.
+// |total_aadlen| and |total_datalen| must be the total length of the additional
+// authenticated data and the en/decrypted data in bytes, respectively.
+//
+// The encryption function then stores the full-length (16-byte) computed
+// authentication tag to |ghash_acc|. The decryption function instead loads the
+// expected authentication tag (the one that was transmitted) from the 16-byte
+// buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the
+// computed tag in constant time, and returns true if and only if they match.
+.macro _aes_gcm_final enc
+
+ // Function arguments
+ .set KEY, %rdi
+ .set LE_CTR_PTR, %rsi
+ .set GHASH_ACC_PTR, %rdx
+ .set TOTAL_AADLEN, %rcx
+ .set TOTAL_DATALEN, %r8
+ .set TAG, %r9
+ .set TAGLEN, %r10d // Originally at 8(%rsp)
+ .set TAGLEN64, %r10
+
+ // Additional local variables.
+ // %rax and %xmm0-%xmm2 are used as temporary registers.
+ .set AESKEYLEN, %r11d
+ .set AESKEYLEN64, %r11
+ .set BSWAP_MASK, %xmm3
+ .set GHASH_ACC, %xmm4
+ .set H_POW1, %xmm5 // H^1
+ .set H_POW1_X64, %xmm6 // H^1 * x^64
+ .set GFPOLY, %xmm7
+
+ movdqa .Lbswap_mask(%rip), BSWAP_MASK
+ movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN
+
+ // Set up a counter block with 1 in the low 32-bit word. This is the
+ // counter that produces the ciphertext needed to encrypt the auth tag.
+ movdqu (LE_CTR_PTR), %xmm0
+ mov $1, %eax
+ pinsrd $0, %eax, %xmm0
+
+ // Build the lengths block and XOR it into the GHASH accumulator.
+ movq TOTAL_DATALEN, GHASH_ACC
+ pinsrq $1, TOTAL_AADLEN, GHASH_ACC
+ psllq $3, GHASH_ACC // Bytes to bits
+ _xor_mem_to_reg (GHASH_ACC_PTR), GHASH_ACC, %xmm1
+
+ movdqa OFFSETOF_H_POWERS+7*16(KEY), H_POW1
+ movdqa OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64
+ movq .Lgfpoly(%rip), GFPOLY
+
+ // Make %rax point to the 6th from last AES round key. (Using signed
+ // byte offsets -7*16 through 6*16 decreases code size.)
+ lea (KEY,AESKEYLEN64,4), %rax
+
+ // AES-encrypt the counter block and also multiply GHASH_ACC by H^1.
+ // Interleave the AES and GHASH instructions to improve performance.
+ pshufb BSWAP_MASK, %xmm0
+ pxor (KEY), %xmm0
+ cmp $24, AESKEYLEN
+ jl 128f // AES-128?
+ je 192f // AES-192?
+ // AES-256
+ aesenc -7*16(%rax), %xmm0
+ aesenc -6*16(%rax), %xmm0
+192:
+ aesenc -5*16(%rax), %xmm0
+ aesenc -4*16(%rax), %xmm0
+128:
+.irp i, 0,1,2,3,4,5,6,7,8
+ aesenc (\i-3)*16(%rax), %xmm0
+ _ghash_mul_step \i, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2
+.endr
+ aesenclast 6*16(%rax), %xmm0
+ _ghash_mul_step 9, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2
+
+ // Undo the byte reflection of the GHASH accumulator.
+ pshufb BSWAP_MASK, GHASH_ACC
+
+ // Encrypt the GHASH accumulator.
+ pxor %xmm0, GHASH_ACC
+
+.if \enc
+ // Return the computed auth tag.
+ movdqu GHASH_ACC, (GHASH_ACC_PTR)
+.else
+ .set ZEROPAD_MASK_PTR, TOTAL_AADLEN // Reusing TOTAL_AADLEN!
+
+ // Verify the auth tag in constant time by XOR'ing the transmitted and
+ // computed auth tags together and using the ptest instruction to check
+ // whether the first TAGLEN bytes of the result are zero.
+ _xor_mem_to_reg (TAG), GHASH_ACC, tmp=%xmm0
+ movl 8(%rsp), TAGLEN
+ lea .Lzeropad_mask+16(%rip), ZEROPAD_MASK_PTR
+ sub TAGLEN64, ZEROPAD_MASK_PTR
+ xor %eax, %eax
+ _test_mem (ZEROPAD_MASK_PTR), GHASH_ACC, tmp=%xmm0
+ sete %al
+.endif
+ RET
+.endm
+
+.set USE_AVX, 0
+SYM_FUNC_START(aes_gcm_precompute_aesni)
+ _aes_gcm_precompute
+SYM_FUNC_END(aes_gcm_precompute_aesni)
+SYM_FUNC_START(aes_gcm_aad_update_aesni)
+ _aes_gcm_aad_update
+SYM_FUNC_END(aes_gcm_aad_update_aesni)
+SYM_FUNC_START(aes_gcm_enc_update_aesni)
+ _aes_gcm_update 1
+SYM_FUNC_END(aes_gcm_enc_update_aesni)
+SYM_FUNC_START(aes_gcm_dec_update_aesni)
+ _aes_gcm_update 0
+SYM_FUNC_END(aes_gcm_dec_update_aesni)
+SYM_FUNC_START(aes_gcm_enc_final_aesni)
+ _aes_gcm_final 1
+SYM_FUNC_END(aes_gcm_enc_final_aesni)
+SYM_FUNC_START(aes_gcm_dec_final_aesni)
+ _aes_gcm_final 0
+SYM_FUNC_END(aes_gcm_dec_final_aesni)
+
+.set USE_AVX, 1
+SYM_FUNC_START(aes_gcm_precompute_aesni_avx)
+ _aes_gcm_precompute
+SYM_FUNC_END(aes_gcm_precompute_aesni_avx)
+SYM_FUNC_START(aes_gcm_aad_update_aesni_avx)
+ _aes_gcm_aad_update
+SYM_FUNC_END(aes_gcm_aad_update_aesni_avx)
+SYM_FUNC_START(aes_gcm_enc_update_aesni_avx)
+ _aes_gcm_update 1
+SYM_FUNC_END(aes_gcm_enc_update_aesni_avx)
+SYM_FUNC_START(aes_gcm_dec_update_aesni_avx)
+ _aes_gcm_update 0
+SYM_FUNC_END(aes_gcm_dec_update_aesni_avx)
+SYM_FUNC_START(aes_gcm_enc_final_aesni_avx)
+ _aes_gcm_final 1
+SYM_FUNC_END(aes_gcm_enc_final_aesni_avx)
+SYM_FUNC_START(aes_gcm_dec_final_aesni_avx)
+ _aes_gcm_final 0
+SYM_FUNC_END(aes_gcm_dec_final_aesni_avx)
diff --git a/arch/x86/crypto/aes-gcm-vaes-avx2.S b/arch/x86/crypto/aes-gcm-vaes-avx2.S
new file mode 100644
index 000000000000..93c9504a488f
--- /dev/null
+++ b/arch/x86/crypto/aes-gcm-vaes-avx2.S
@@ -0,0 +1,1146 @@
+/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
+//
+// AES-GCM implementation for x86_64 CPUs that support the following CPU
+// features: VAES && VPCLMULQDQ && AVX2
+//
+// Copyright 2025 Google LLC
+//
+// Author: Eric Biggers <ebiggers@google.com>
+//
+//------------------------------------------------------------------------------
+//
+// This file is dual-licensed, meaning that you can use it under your choice of
+// either of the following two licenses:
+//
+// Licensed under the Apache License 2.0 (the "License"). You may obtain a copy
+// of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// or
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// -----------------------------------------------------------------------------
+//
+// This is similar to aes-gcm-vaes-avx512.S, but it uses AVX2 instead of AVX512.
+// This means it can only use 16 vector registers instead of 32, the maximum
+// vector length is 32 bytes, and some instructions such as vpternlogd and
+// masked loads/stores are unavailable. However, it is able to run on CPUs that
+// have VAES without AVX512, namely AMD Zen 3 (including "Milan" server CPUs),
+// various Intel client CPUs such as Alder Lake, and Intel Sierra Forest.
+//
+// This implementation also uses Karatsuba multiplication instead of schoolbook
+// multiplication for GHASH in its main loop. This does not help much on Intel,
+// but it improves performance by ~5% on AMD Zen 3. Other factors weighing
+// slightly in favor of Karatsuba multiplication in this implementation are the
+// lower maximum vector length (which means there are fewer key powers, so we
+// can cache the halves of each key power XOR'd together and still use less
+// memory than the AVX512 implementation), and the unavailability of the
+// vpternlogd instruction (which helped schoolbook a bit more than Karatsuba).
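+//
+// As a C sketch of the Karatsuba identity used in the main loop, with
+// clmul64() denoting a hypothetical 64x64 => 128-bit carryless multiply, and
+// with 'a' being a key power and 'b' a data block:
+//
+//	lo = clmul64(a_lo, b_lo);
+//	hi = clmul64(a_hi, b_hi);
+//	mi = clmul64(a_lo ^ a_hi, b_lo ^ b_hi) ^ lo ^ hi;
+//
+// This computes the full 256-bit product with three carryless multiplications
+// instead of schoolbook's four, at the cost of extra XORs. Caching the values
+// 'a_lo ^ a_hi' in h_powers_xored keeps the per-block cost down.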
+
+#include <linux/linkage.h>
+
+.section .rodata
+.p2align 4
+
+ // The below three 16-byte values must be in the order that they are, as
+ // they are really two 32-byte tables and a 16-byte value that overlap:
+ //
+ // - The first 32-byte table begins at .Lselect_high_bytes_table.
+ // For 0 <= len <= 16, the 16-byte value at
+ // '.Lselect_high_bytes_table + len' selects the high 'len' bytes of
+ // another 16-byte value when AND'ed with it.
+ //
+ // - The second 32-byte table begins at .Lrshift_and_bswap_table.
+ // For 0 <= len <= 16, the 16-byte value at
+ // '.Lrshift_and_bswap_table + len' is a vpshufb mask that does the
+ // following operation: right-shift by '16 - len' bytes (shifting in
+ // zeroes), then reflect all 16 bytes.
+ //
+ // - The 16-byte value at .Lbswap_mask is a vpshufb mask that reflects
+ // all 16 bytes.
+.Lselect_high_bytes_table:
+ .octa 0
+.Lrshift_and_bswap_table:
+ .octa 0xffffffffffffffffffffffffffffffff
+.Lbswap_mask:
+ .octa 0x000102030405060708090a0b0c0d0e0f
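+
+ // In C terms, the two tables behave as follows (a sketch; 'len' is the
+ // byte count used to index them, and x[] is any 16-byte input):
+ //
+ //	/* AND with the entry at .Lselect_high_bytes_table + len: */
+ //	out[i] = (i >= 16 - len) ? x[i] : 0;
+ //
+ //	/* vpshufb with the entry at .Lrshift_and_bswap_table + len: */
+ //	out[i] = (i >= 16 - len) ? x[31 - len - i] : 0;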
+
+ // Sixteen 0x0f bytes. By XOR'ing an entry of .Lrshift_and_bswap_table
+ // with this, we get a mask that left-shifts by '16 - len' bytes.
+.Lfifteens:
+ .octa 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
+
+ // This is the GHASH reducing polynomial without its constant term, i.e.
+ // x^128 + x^7 + x^2 + x, represented using the backwards mapping
+ // between bits and polynomial coefficients.
+ //
+ // Alternatively, it can be interpreted as the naturally-ordered
+ // representation of the polynomial x^127 + x^126 + x^121 + 1, i.e. the
+ // "reversed" GHASH reducing polynomial without its x^128 term.
+.Lgfpoly:
+ .octa 0xc2000000000000000000000000000001
+
+ // Same as above, but with the (1 << 64) bit set.
+.Lgfpoly_and_internal_carrybit:
+ .octa 0xc2000000000000010000000000000001
+
+ // Values needed to prepare the initial vector of counter blocks.
+.Lctr_pattern:
+ .octa 0
+ .octa 1
+
+ // The number of AES blocks per vector, as a 128-bit value.
+.Linc_2blocks:
+ .octa 2
+
+// Offsets in struct aes_gcm_key_vaes_avx2
+#define OFFSETOF_AESKEYLEN 480
+#define OFFSETOF_H_POWERS 512
+#define NUM_H_POWERS 8
+#define OFFSETOFEND_H_POWERS (OFFSETOF_H_POWERS + (NUM_H_POWERS * 16))
+#define OFFSETOF_H_POWERS_XORED OFFSETOFEND_H_POWERS
+
+.text
+
+// Do one step of GHASH-multiplying the 128-bit lanes of \a by the 128-bit lanes
+// of \b and storing the reduced products in \dst. Uses schoolbook
+// multiplication.
+.macro _ghash_mul_step i, a, b, dst, gfpoly, t0, t1, t2
+.if \i == 0
+ vpclmulqdq $0x00, \a, \b, \t0 // LO = a_L * b_L
+ vpclmulqdq $0x01, \a, \b, \t1 // MI_0 = a_L * b_H
+.elseif \i == 1
+ vpclmulqdq $0x10, \a, \b, \t2 // MI_1 = a_H * b_L
+.elseif \i == 2
+ vpxor \t2, \t1, \t1 // MI = MI_0 + MI_1
+.elseif \i == 3
+ vpclmulqdq $0x01, \t0, \gfpoly, \t2 // LO_L*(x^63 + x^62 + x^57)
+.elseif \i == 4
+ vpshufd $0x4e, \t0, \t0 // Swap halves of LO
+.elseif \i == 5
+ vpxor \t0, \t1, \t1 // Fold LO into MI (part 1)
+ vpxor \t2, \t1, \t1 // Fold LO into MI (part 2)
+.elseif \i == 6
+ vpclmulqdq $0x11, \a, \b, \dst // HI = a_H * b_H
+.elseif \i == 7
+ vpclmulqdq $0x01, \t1, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57)
+.elseif \i == 8
+ vpshufd $0x4e, \t1, \t1 // Swap halves of MI
+.elseif \i == 9
+ vpxor \t1, \dst, \dst // Fold MI into HI (part 1)
+ vpxor \t0, \dst, \dst // Fold MI into HI (part 2)
+.endif
+.endm
+
+// GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and store
+// the reduced products in \dst. See _ghash_mul_step for full explanation.
+.macro _ghash_mul a, b, dst, gfpoly, t0, t1, t2
+.irp i, 0,1,2,3,4,5,6,7,8,9
+ _ghash_mul_step \i, \a, \b, \dst, \gfpoly, \t0, \t1, \t2
+.endr
+.endm
+
+// GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and add the
+// *unreduced* products to \lo, \mi, and \hi.
+.macro _ghash_mul_noreduce a, b, lo, mi, hi, t0
+ vpclmulqdq $0x00, \a, \b, \t0 // a_L * b_L
+ vpxor \t0, \lo, \lo
+ vpclmulqdq $0x01, \a, \b, \t0 // a_L * b_H
+ vpxor \t0, \mi, \mi
+ vpclmulqdq $0x10, \a, \b, \t0 // a_H * b_L
+ vpxor \t0, \mi, \mi
+ vpclmulqdq $0x11, \a, \b, \t0 // a_H * b_H
+ vpxor \t0, \hi, \hi
+.endm
+
+// Reduce the unreduced products from \lo, \mi, and \hi and store the 128-bit
+// reduced products in \hi. See _ghash_mul_step for explanation of reduction.
+.macro _ghash_reduce lo, mi, hi, gfpoly, t0
+ vpclmulqdq $0x01, \lo, \gfpoly, \t0
+ vpshufd $0x4e, \lo, \lo
+ vpxor \lo, \mi, \mi
+ vpxor \t0, \mi, \mi
+ vpclmulqdq $0x01, \mi, \gfpoly, \t0
+ vpshufd $0x4e, \mi, \mi
+ vpxor \mi, \hi, \hi
+ vpxor \t0, \hi, \hi
+.endm
+
+// This is a specialized version of _ghash_mul that computes \a * \a, i.e. it
+// squares \a. It skips computing MI = (a_L * a_H) + (a_H * a_L) = 0.
+.macro _ghash_square a, dst, gfpoly, t0, t1
+ vpclmulqdq $0x00, \a, \a, \t0 // LO = a_L * a_L
+ vpclmulqdq $0x11, \a, \a, \dst // HI = a_H * a_H
+ vpclmulqdq $0x01, \t0, \gfpoly, \t1 // LO_L*(x^63 + x^62 + x^57)
+ vpshufd $0x4e, \t0, \t0 // Swap halves of LO
+ vpxor \t0, \t1, \t1 // Fold LO into MI
+ vpclmulqdq $0x01, \t1, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57)
+ vpshufd $0x4e, \t1, \t1 // Swap halves of MI
+ vpxor \t1, \dst, \dst // Fold MI into HI (part 1)
+ vpxor \t0, \dst, \dst // Fold MI into HI (part 2)
+.endm
+
+// void aes_gcm_precompute_vaes_avx2(struct aes_gcm_key_vaes_avx2 *key);
+//
+// Given the expanded AES key |key->base.aes_key|, derive the GHASH subkey and
+// initialize |key->h_powers| and |key->h_powers_xored|.
+//
+// We use h_powers[0..7] to store H^8 through H^1, and h_powers_xored[0..7] to
+// store the 64-bit halves of the key powers XOR'd together (for Karatsuba
+// multiplication) in the order 8,6,7,5,4,2,3,1.
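+//
+// In C terms, the resulting layout is (a sketch, with xored(H) denoting the
+// two 64-bit halves of a key power XOR'd together):
+//
+//	key->h_powers[8]       = { H^8, H^7, H^6, H^5, H^4, H^3, H^2, H^1 };
+//	key->h_powers_xored[8] = { xored(H^8), xored(H^6), xored(H^7),
+//				   xored(H^5), xored(H^4), xored(H^2),
+//				   xored(H^3), xored(H^1) };
+//
+// The interleaved order of h_powers_xored falls out of vpunpck{l,h}qdq
+// operating independently on the two 128-bit lanes of a 256-bit vector.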
+SYM_FUNC_START(aes_gcm_precompute_vaes_avx2)
+
+ // Function arguments
+ .set KEY, %rdi
+
+ // Additional local variables
+ .set POWERS_PTR, %rsi
+ .set RNDKEYLAST_PTR, %rdx
+ .set TMP0, %ymm0
+ .set TMP0_XMM, %xmm0
+ .set TMP1, %ymm1
+ .set TMP1_XMM, %xmm1
+ .set TMP2, %ymm2
+ .set TMP2_XMM, %xmm2
+ .set H_CUR, %ymm3
+ .set H_CUR_XMM, %xmm3
+ .set H_CUR2, %ymm4
+ .set H_INC, %ymm5
+ .set H_INC_XMM, %xmm5
+ .set GFPOLY, %ymm6
+ .set GFPOLY_XMM, %xmm6
+
+ // Encrypt an all-zeroes block to get the raw hash subkey.
+ movl OFFSETOF_AESKEYLEN(KEY), %eax
+ lea 6*16(KEY,%rax,4), RNDKEYLAST_PTR
+ vmovdqu (KEY), H_CUR_XMM // Zero-th round key XOR all-zeroes block
+ lea 16(KEY), %rax
+1:
+ vaesenc (%rax), H_CUR_XMM, H_CUR_XMM
+ add $16, %rax
+ cmp %rax, RNDKEYLAST_PTR
+ jne 1b
+ vaesenclast (RNDKEYLAST_PTR), H_CUR_XMM, H_CUR_XMM
+
+ // Reflect the bytes of the raw hash subkey.
+ vpshufb .Lbswap_mask(%rip), H_CUR_XMM, H_CUR_XMM
+
+ // Finish preprocessing the byte-reflected hash subkey by multiplying it
+ // by x^-1 ("standard" interpretation of polynomial coefficients) or
+ // equivalently x^1 (natural interpretation). This gets the key into a
+ // format that avoids having to bit-reflect the data blocks later.
+ vpshufd $0xd3, H_CUR_XMM, TMP0_XMM
+ vpsrad $31, TMP0_XMM, TMP0_XMM
+ vpaddq H_CUR_XMM, H_CUR_XMM, H_CUR_XMM
+ vpand .Lgfpoly_and_internal_carrybit(%rip), TMP0_XMM, TMP0_XMM
+ vpxor TMP0_XMM, H_CUR_XMM, H_CUR_XMM
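+
+ // In C terms, on the two 64-bit halves of the byte-reflected value h,
+ // the above computes (a sketch; the asm is branchless, using
+ // sign-extension masks in place of the if-statements):
+ //
+ //	lo = h.lo << 1;
+ //	hi = h.hi << 1;
+ //	if (h.lo & (1ULL << 63))
+ //		hi ^= 1;		/* carry between the halves */
+ //	if (h.hi & (1ULL << 63)) {
+ //		lo ^= 1;		/* fold the x^128 overflow ... */
+ //		hi ^= 0xc2ULL << 56;	/* ... back in, per .Lgfpoly */
+ //	}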
+
+ // Load the gfpoly constant.
+ vbroadcasti128 .Lgfpoly(%rip), GFPOLY
+
+ // Square H^1 to get H^2.
+ _ghash_square H_CUR_XMM, H_INC_XMM, GFPOLY_XMM, TMP0_XMM, TMP1_XMM
+
+ // Create H_CUR = [H^2, H^1] and H_INC = [H^2, H^2].
+ vinserti128 $1, H_CUR_XMM, H_INC, H_CUR
+ vinserti128 $1, H_INC_XMM, H_INC, H_INC
+
+ // Compute H_CUR2 = [H^4, H^3].
+ _ghash_mul H_INC, H_CUR, H_CUR2, GFPOLY, TMP0, TMP1, TMP2
+
+ // Store [H^2, H^1] and [H^4, H^3].
+ vmovdqu H_CUR, OFFSETOF_H_POWERS+3*32(KEY)
+ vmovdqu H_CUR2, OFFSETOF_H_POWERS+2*32(KEY)
+
+ // For Karatsuba multiplication: compute and store the two 64-bit halves
+ // of each key power XOR'd together. Order is 4,2,3,1.
+ vpunpcklqdq H_CUR, H_CUR2, TMP0
+ vpunpckhqdq H_CUR, H_CUR2, TMP1
+ vpxor TMP1, TMP0, TMP0
+ vmovdqu TMP0, OFFSETOF_H_POWERS_XORED+32(KEY)
+
+ // Compute and store H_CUR = [H^6, H^5] and H_CUR2 = [H^8, H^7].
+ _ghash_mul H_INC, H_CUR2, H_CUR, GFPOLY, TMP0, TMP1, TMP2
+ _ghash_mul H_INC, H_CUR, H_CUR2, GFPOLY, TMP0, TMP1, TMP2
+ vmovdqu H_CUR, OFFSETOF_H_POWERS+1*32(KEY)
+ vmovdqu H_CUR2, OFFSETOF_H_POWERS+0*32(KEY)
+
+ // Again, compute and store the two 64-bit halves of each key power
+ // XOR'd together. Order is 8,6,7,5.
+ vpunpcklqdq H_CUR, H_CUR2, TMP0
+ vpunpckhqdq H_CUR, H_CUR2, TMP1
+ vpxor TMP1, TMP0, TMP0
+ vmovdqu TMP0, OFFSETOF_H_POWERS_XORED(KEY)
+
+ vzeroupper
+ RET
+SYM_FUNC_END(aes_gcm_precompute_vaes_avx2)
+
+// Do one step of the GHASH update of four vectors of data blocks.
+// \i: the step to do, 0 through 9
+// \ghashdata_ptr: pointer to the data blocks (ciphertext or AAD)
+// KEY: pointer to struct aes_gcm_key_vaes_avx2
+// BSWAP_MASK: mask for reflecting the bytes of blocks
+// H_POW[2-1]_XORED: cached values from KEY->h_powers_xored
+// TMP[0-2]: temporary registers. TMP[1-2] must be preserved across steps.
+// LO, MI: working state for this macro that must be preserved across steps
+// GHASH_ACC: the GHASH accumulator (input/output)
+.macro _ghash_step_4x i, ghashdata_ptr
+ .set HI, GHASH_ACC // alias
+ .set HI_XMM, GHASH_ACC_XMM
+.if \i == 0
+ // First vector
+ vmovdqu 0*32(\ghashdata_ptr), TMP1
+ vpshufb BSWAP_MASK, TMP1, TMP1
+ vmovdqu OFFSETOF_H_POWERS+0*32(KEY), TMP2
+ vpxor GHASH_ACC, TMP1, TMP1
+ vpclmulqdq $0x00, TMP2, TMP1, LO
+ vpclmulqdq $0x11, TMP2, TMP1, HI
+ vpunpckhqdq TMP1, TMP1, TMP0
+ vpxor TMP1, TMP0, TMP0
+ vpclmulqdq $0x00, H_POW2_XORED, TMP0, MI
+.elseif \i == 1
+.elseif \i == 2
+ // Second vector
+ vmovdqu 1*32(\ghashdata_ptr), TMP1
+ vpshufb BSWAP_MASK, TMP1, TMP1
+ vmovdqu OFFSETOF_H_POWERS+1*32(KEY), TMP2
+ vpclmulqdq $0x00, TMP2, TMP1, TMP0
+ vpxor TMP0, LO, LO
+ vpclmulqdq $0x11, TMP2, TMP1, TMP0
+ vpxor TMP0, HI, HI
+ vpunpckhqdq TMP1, TMP1, TMP0
+ vpxor TMP1, TMP0, TMP0
+ vpclmulqdq $0x10, H_POW2_XORED, TMP0, TMP0
+ vpxor TMP0, MI, MI
+.elseif \i == 3
+ // Third vector
+ vmovdqu 2*32(\ghashdata_ptr), TMP1
+ vpshufb BSWAP_MASK, TMP1, TMP1
+ vmovdqu OFFSETOF_H_POWERS+2*32(KEY), TMP2
+.elseif \i == 4
+ vpclmulqdq $0x00, TMP2, TMP1, TMP0
+ vpxor TMP0, LO, LO
+ vpclmulqdq $0x11, TMP2, TMP1, TMP0
+ vpxor TMP0, HI, HI
+.elseif \i == 5
+ vpunpckhqdq TMP1, TMP1, TMP0
+ vpxor TMP1, TMP0, TMP0
+ vpclmulqdq $0x00, H_POW1_XORED, TMP0, TMP0
+ vpxor TMP0, MI, MI
+
+ // Fourth vector
+ vmovdqu 3*32(\ghashdata_ptr), TMP1
+ vpshufb BSWAP_MASK, TMP1, TMP1
+.elseif \i == 6
+ vmovdqu OFFSETOF_H_POWERS+3*32(KEY), TMP2
+ vpclmulqdq $0x00, TMP2, TMP1, TMP0
+ vpxor TMP0, LO, LO
+ vpclmulqdq $0x11, TMP2, TMP1, TMP0
+ vpxor TMP0, HI, HI
+ vpunpckhqdq TMP1, TMP1, TMP0
+ vpxor TMP1, TMP0, TMP0
+ vpclmulqdq $0x10, H_POW1_XORED, TMP0, TMP0
+ vpxor TMP0, MI, MI
+.elseif \i == 7
+ // Finalize 'mi' following Karatsuba multiplication.
+ vpxor LO, MI, MI
+ vpxor HI, MI, MI
+
+ // Fold lo into mi.
+ vbroadcasti128 .Lgfpoly(%rip), TMP2
+ vpclmulqdq $0x01, LO, TMP2, TMP0
+ vpshufd $0x4e, LO, LO
+ vpxor LO, MI, MI
+ vpxor TMP0, MI, MI
+.elseif \i == 8
+ // Fold mi into hi.
+ vpclmulqdq $0x01, MI, TMP2, TMP0
+ vpshufd $0x4e, MI, MI
+ vpxor MI, HI, HI
+ vpxor TMP0, HI, HI
+.elseif \i == 9
+ vextracti128 $1, HI, TMP0_XMM
+ vpxor TMP0_XMM, HI_XMM, GHASH_ACC_XMM
+.endif
+.endm
+
+// Update GHASH with four vectors of data blocks. See _ghash_step_4x for full
+// explanation.
+.macro _ghash_4x ghashdata_ptr
+.irp i, 0,1,2,3,4,5,6,7,8,9
+ _ghash_step_4x \i, \ghashdata_ptr
+.endr
+.endm
+
+// Load 1 <= %ecx <= 16 bytes from the pointer \src into the xmm register \dst
+// and zeroize any remaining bytes. Clobbers %rax, %rcx, and \tmp{64,32}.
+.macro _load_partial_block src, dst, tmp64, tmp32
+ sub $8, %ecx // LEN - 8
+ jle .Lle8\@
+
+ // Load 9 <= LEN <= 16 bytes.
+ vmovq (\src), \dst // Load first 8 bytes
+ mov (\src, %rcx), %rax // Load last 8 bytes
+ neg %ecx
+ shl $3, %ecx
+ shr %cl, %rax // Discard overlapping bytes
+ vpinsrq $1, %rax, \dst, \dst
+ jmp .Ldone\@
+
+.Lle8\@:
+ add $4, %ecx // LEN - 4
+ jl .Llt4\@
+
+ // Load 4 <= LEN <= 8 bytes.
+ mov (\src), %eax // Load first 4 bytes
+ mov (\src, %rcx), \tmp32 // Load last 4 bytes
+ jmp .Lcombine\@
+
+.Llt4\@:
+ // Load 1 <= LEN <= 3 bytes.
+ add $2, %ecx // LEN - 2
+ movzbl (\src), %eax // Load first byte
+ jl .Lmovq\@
+ movzwl (\src, %rcx), \tmp32 // Load last 2 bytes
+.Lcombine\@:
+ shl $3, %ecx
+ shl %cl, \tmp64
+ or \tmp64, %rax // Combine the two parts
+.Lmovq\@:
+ vmovq %rax, \dst
+.Ldone\@:
+.endm
+
+// Store 1 <= %ecx <= 16 bytes from the xmm register \src to the pointer \dst.
+// Clobbers %rax, %rcx, and \tmp{64,32}.
+.macro _store_partial_block src, dst, tmp64, tmp32
+ sub $8, %ecx // LEN - 8
+ jl .Llt8\@
+
+ // Store 8 <= LEN <= 16 bytes.
+ vpextrq $1, \src, %rax
+ mov %ecx, \tmp32
+ shl $3, %ecx
+ ror %cl, %rax
+ mov %rax, (\dst, \tmp64) // Store last LEN - 8 bytes
+ vmovq \src, (\dst) // Store first 8 bytes
+ jmp .Ldone\@
+
+.Llt8\@:
+ add $4, %ecx // LEN - 4
+ jl .Llt4\@
+
+ // Store 4 <= LEN <= 7 bytes.
+ vpextrd $1, \src, %eax
+ mov %ecx, \tmp32
+ shl $3, %ecx
+ ror %cl, %eax
+ mov %eax, (\dst, \tmp64) // Store last LEN - 4 bytes
+ vmovd \src, (\dst) // Store first 4 bytes
+ jmp .Ldone\@
+
+.Llt4\@:
+ // Store 1 <= LEN <= 3 bytes.
+ vpextrb $0, \src, 0(\dst)
+ cmp $-2, %ecx // LEN - 4 == -2, i.e. LEN == 2?
+ jl .Ldone\@
+ vpextrb $1, \src, 1(\dst)
+ je .Ldone\@
+ vpextrb $2, \src, 2(\dst)
+.Ldone\@:
+.endm
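+
+// In C terms, the 9 to 16 byte case of _load_partial_block is roughly (a
+// sketch using the kernel's get_unaligned_le64(); 'len' is the byte count):
+//
+//	u64 lo = get_unaligned_le64(src);
+//	u64 hi = get_unaligned_le64(src + len - 8) >> (8 * (16 - len));
+//
+// i.e. two overlapping 8-byte loads, with the doubly-loaded middle bytes
+// shifted out of the second value. The shorter cases, and the stores in
+// _store_partial_block, use the same overlapping trick at 4-byte and smaller
+// granularity. This avoids byte-at-a-time loops and out-of-bounds accesses.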
+
+// void aes_gcm_aad_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key,
+// u8 ghash_acc[16],
+// const u8 *aad, int aadlen);
+//
+// This function processes the AAD (Additional Authenticated Data) in GCM.
+// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the
+// data given by |aad| and |aadlen|. On the first call, |ghash_acc| must be all
+// zeroes. |aadlen| must be a multiple of 16, except on the last call where it
+// can be any length. The caller must do any buffering needed to ensure this.
+//
+// This handles large amounts of AAD efficiently, while also keeping overhead
+// low for small amounts which is the common case. TLS and IPsec use less than
+// one block of AAD, but (uncommonly) other use cases may use much more.
+SYM_FUNC_START(aes_gcm_aad_update_vaes_avx2)
+
+ // Function arguments
+ .set KEY, %rdi
+ .set GHASH_ACC_PTR, %rsi
+ .set AAD, %rdx
+ .set AADLEN, %ecx // Must be %ecx for _load_partial_block
+ .set AADLEN64, %rcx // Zero-extend AADLEN before using!
+
+ // Additional local variables.
+ // %rax and %r8 are used as temporary registers.
+ .set TMP0, %ymm0
+ .set TMP0_XMM, %xmm0
+ .set TMP1, %ymm1
+ .set TMP1_XMM, %xmm1
+ .set TMP2, %ymm2
+ .set TMP2_XMM, %xmm2
+ .set LO, %ymm3
+ .set LO_XMM, %xmm3
+ .set MI, %ymm4
+ .set MI_XMM, %xmm4
+ .set GHASH_ACC, %ymm5
+ .set GHASH_ACC_XMM, %xmm5
+ .set BSWAP_MASK, %ymm6
+ .set BSWAP_MASK_XMM, %xmm6
+ .set GFPOLY, %ymm7
+ .set GFPOLY_XMM, %xmm7
+ .set H_POW2_XORED, %ymm8
+ .set H_POW1_XORED, %ymm9
+
+ // Load the bswap_mask and gfpoly constants. Since AADLEN is usually
+ // small, only 128-bit vectors will typically be used. So as an
+ // optimization, don't broadcast these constants to both 128-bit lanes
+ // quite yet.
+ vmovdqu .Lbswap_mask(%rip), BSWAP_MASK_XMM
+ vmovdqu .Lgfpoly(%rip), GFPOLY_XMM
+
+ // Load the GHASH accumulator.
+ vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM
+
+ // Check for the common case of AADLEN <= 16, as well as AADLEN == 0.
+ test AADLEN, AADLEN
+ jz .Laad_done
+ cmp $16, AADLEN
+ jle .Laad_lastblock
+
+ // AADLEN > 16, so we'll operate on full vectors. Broadcast bswap_mask
+ // and gfpoly to both 128-bit lanes.
+ vinserti128 $1, BSWAP_MASK_XMM, BSWAP_MASK, BSWAP_MASK
+ vinserti128 $1, GFPOLY_XMM, GFPOLY, GFPOLY
+
+ // If AADLEN >= 128, update GHASH with 128 bytes of AAD at a time.
+ add $-128, AADLEN // 128 is 4 bytes, -128 is 1 byte
+ jl .Laad_loop_4x_done
+ vmovdqu OFFSETOF_H_POWERS_XORED(KEY), H_POW2_XORED
+ vmovdqu OFFSETOF_H_POWERS_XORED+32(KEY), H_POW1_XORED
+.Laad_loop_4x:
+ _ghash_4x AAD
+ sub $-128, AAD
+ add $-128, AADLEN
+ jge .Laad_loop_4x
+.Laad_loop_4x_done:
+
+ // If AADLEN >= 32, update GHASH with 32 bytes of AAD at a time.
+ add $96, AADLEN
+ jl .Laad_loop_1x_done
+.Laad_loop_1x:
+ vmovdqu (AAD), TMP0
+ vpshufb BSWAP_MASK, TMP0, TMP0
+ vpxor TMP0, GHASH_ACC, GHASH_ACC
+ vmovdqu OFFSETOFEND_H_POWERS-32(KEY), TMP0
+ _ghash_mul TMP0, GHASH_ACC, GHASH_ACC, GFPOLY, TMP1, TMP2, LO
+ vextracti128 $1, GHASH_ACC, TMP0_XMM
+ vpxor TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM
+ add $32, AAD
+ sub $32, AADLEN
+ jge .Laad_loop_1x
+.Laad_loop_1x_done:
+ add $32, AADLEN
+ // Now 0 <= AADLEN < 32.
+
+ jz .Laad_done
+ cmp $16, AADLEN
+ jle .Laad_lastblock
+
+ // Update GHASH with the remaining 17 <= AADLEN <= 31 bytes of AAD.
+ mov AADLEN, AADLEN // Zero-extend AADLEN to AADLEN64.
+ vmovdqu (AAD), TMP0_XMM
+ vmovdqu -16(AAD, AADLEN64), TMP1_XMM
+ vpshufb BSWAP_MASK_XMM, TMP0_XMM, TMP0_XMM
+ vpxor TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM
+ lea .Lrshift_and_bswap_table(%rip), %rax
+ vpshufb -16(%rax, AADLEN64), TMP1_XMM, TMP1_XMM
+ vinserti128 $1, TMP1_XMM, GHASH_ACC, GHASH_ACC
+ vmovdqu OFFSETOFEND_H_POWERS-32(KEY), TMP0
+ _ghash_mul TMP0, GHASH_ACC, GHASH_ACC, GFPOLY, TMP1, TMP2, LO
+ vextracti128 $1, GHASH_ACC, TMP0_XMM
+ vpxor TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM
+ jmp .Laad_done
+
+.Laad_lastblock:
+ // Update GHASH with the remaining 1 <= AADLEN <= 16 bytes of AAD.
+ _load_partial_block AAD, TMP0_XMM, %r8, %r8d
+ vpshufb BSWAP_MASK_XMM, TMP0_XMM, TMP0_XMM
+ vpxor TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM
+ vmovdqu OFFSETOFEND_H_POWERS-16(KEY), TMP0_XMM
+ _ghash_mul TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM, GFPOLY_XMM, \
+ TMP1_XMM, TMP2_XMM, LO_XMM
+
+.Laad_done:
+ // Store the updated GHASH accumulator back to memory.
+ vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR)
+
+ vzeroupper
+ RET
+SYM_FUNC_END(aes_gcm_aad_update_vaes_avx2)
+
+// Do one non-last round of AES encryption on the blocks in the given AESDATA
+// vectors using the round key that has been broadcast to all 128-bit lanes of
+// \round_key.
+.macro _vaesenc round_key, vecs:vararg
+.irp i, \vecs
+ vaesenc \round_key, AESDATA\i, AESDATA\i
+.endr
+.endm
+
+// Generate counter blocks in the given AESDATA vectors, then do the zero-th AES
+// round on them. Clobbers TMP0.
+.macro _ctr_begin vecs:vararg
+ vbroadcasti128 .Linc_2blocks(%rip), TMP0
+.irp i, \vecs
+ vpshufb BSWAP_MASK, LE_CTR, AESDATA\i
+ vpaddd TMP0, LE_CTR, LE_CTR
+.endr
+.irp i, \vecs
+ vpxor RNDKEY0, AESDATA\i, AESDATA\i
+.endr
+.endm
+
+// Generate and encrypt counter blocks in the given AESDATA vectors, excluding
+// the last AES round. Clobbers %rax and TMP0.
+.macro _aesenc_loop vecs:vararg
+ _ctr_begin \vecs
+ lea 16(KEY), %rax
+.Laesenc_loop\@:
+ vbroadcasti128 (%rax), TMP0
+ _vaesenc TMP0, \vecs
+ add $16, %rax
+ cmp %rax, RNDKEYLAST_PTR
+ jne .Laesenc_loop\@
+.endm
+
+// Finalize the keystream blocks in the given AESDATA vectors by doing the last
+// AES round, then XOR those keystream blocks with the corresponding data.
+// Reduce latency by doing the XOR before the vaesenclast, utilizing the
+// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a). Clobbers TMP0.
+.macro _aesenclast_and_xor vecs:vararg
+.irp i, \vecs
+ vpxor \i*32(SRC), RNDKEYLAST, TMP0
+ vaesenclast TMP0, AESDATA\i, AESDATA\i
+.endr
+.irp i, \vecs
+ vmovdqu AESDATA\i, \i*32(DST)
+.endr
+.endm
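+
+// The identity relied on above holds because the last AES round ends with
+// AddRoundKey, i.e. an XOR with the round key. E.g., with the AES-NI C
+// intrinsics (a sketch, not kernel code):
+//
+//	__m128i x = _mm_xor_si128(_mm_aesenclast_si128(a, key), b);
+//	__m128i y = _mm_aesenclast_si128(a, _mm_xor_si128(key, b));
+//	/* x and y are always equal */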
+
+// void aes_gcm_{enc,dec}_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key,
+// const u32 le_ctr[4], u8 ghash_acc[16],
+// const u8 *src, u8 *dst, int datalen);
+//
+// This macro generates a GCM encryption or decryption update function with the
+// above prototype (with \enc selecting which one). The function computes the
+// next portion of the CTR keystream, XOR's it with |datalen| bytes from |src|,
+// and writes the resulting encrypted or decrypted data to |dst|. It also
+// updates the GHASH accumulator |ghash_acc| using the next |datalen| ciphertext
+// bytes.
+//
+// |datalen| must be a multiple of 16, except on the last call where it can be
+// any length. The caller must do any buffering needed to ensure this. Both
+// in-place and out-of-place en/decryption are supported.
+//
+// |le_ctr| must give the current counter in little-endian format. This
+// function loads the counter from |le_ctr| and increments the loaded counter as
+// needed, but it does *not* store the updated counter back to |le_ctr|. The
+// caller must update |le_ctr| if any more data segments follow. Internally,
+// only the low 32-bit word of the counter is incremented, following the GCM
+// standard.
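+//
+// As a sketch of how a caller might drive this function (hypothetical names;
+// the kernel's glue code does the real buffering and counter management, and
+// counter value 1 is reserved for encrypting the tag):
+//
+//	u32 le_ctr[4] = { 2, /* rest of the per-message IV */ };
+//	for (each 16-byte-multiple segment, then the final partial one) {
+//		aes_gcm_enc_update_vaes_avx2(key, le_ctr, ghash_acc,
+//					     src, dst, seglen);
+//		le_ctr[0] += seglen / 16;	/* advance before the next call */
+//	}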
+.macro _aes_gcm_update enc
+
+ // Function arguments
+ .set KEY, %rdi
+ .set LE_CTR_PTR, %rsi
+ .set LE_CTR_PTR32, %esi
+ .set GHASH_ACC_PTR, %rdx
+ .set SRC, %rcx // Assumed to be %rcx.
+ // See .Ltail_xor_and_ghash_1to16bytes
+ .set DST, %r8
+ .set DATALEN, %r9d
+ .set DATALEN64, %r9 // Zero-extend DATALEN before using!
+
+ // Additional local variables
+
+ // %rax is used as a temporary register. LE_CTR_PTR is also available
+ // as a temporary register after the counter is loaded.
+
+ // AES key length in bytes
+ .set AESKEYLEN, %r10d
+ .set AESKEYLEN64, %r10
+
+ // Pointer to the last AES round key for the chosen AES variant
+ .set RNDKEYLAST_PTR, %r11
+
+ // BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values
+ // using vpshufb, copied to all 128-bit lanes.
+ .set BSWAP_MASK, %ymm0
+ .set BSWAP_MASK_XMM, %xmm0
+
+ // GHASH_ACC is the accumulator variable for GHASH. When fully reduced,
+ // only the lowest 128-bit lane can be nonzero. When not fully reduced,
+ // more than one lane may be used, and they need to be XOR'd together.
+ .set GHASH_ACC, %ymm1
+ .set GHASH_ACC_XMM, %xmm1
+
+ // TMP[0-2] are temporary registers.
+ .set TMP0, %ymm2
+ .set TMP0_XMM, %xmm2
+ .set TMP1, %ymm3
+ .set TMP1_XMM, %xmm3
+ .set TMP2, %ymm4
+ .set TMP2_XMM, %xmm4
+
+ // LO and MI are used to accumulate unreduced GHASH products.
+ .set LO, %ymm5
+ .set LO_XMM, %xmm5
+ .set MI, %ymm6
+ .set MI_XMM, %xmm6
+
+ // H_POW[2-1]_XORED contain cached values from KEY->h_powers_xored. The
+ // descending numbering reflects the order of the key powers.
+ .set H_POW2_XORED, %ymm7
+ .set H_POW2_XORED_XMM, %xmm7
+ .set H_POW1_XORED, %ymm8
+
+ // RNDKEY0 caches the zero-th round key, and RNDKEYLAST the last one.
+ .set RNDKEY0, %ymm9
+ .set RNDKEYLAST, %ymm10
+
+ // LE_CTR contains the next set of little-endian counter blocks.
+ .set LE_CTR, %ymm11
+
+ // AESDATA[0-3] hold the counter blocks that are being encrypted by AES.
+ .set AESDATA0, %ymm12
+ .set AESDATA0_XMM, %xmm12
+ .set AESDATA1, %ymm13
+ .set AESDATA1_XMM, %xmm13
+ .set AESDATA2, %ymm14
+ .set AESDATA3, %ymm15
+
+.if \enc
+ .set GHASHDATA_PTR, DST
+.else
+ .set GHASHDATA_PTR, SRC
+.endif
+
+ vbroadcasti128 .Lbswap_mask(%rip), BSWAP_MASK
+
+ // Load the GHASH accumulator and the starting counter.
+ vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM
+ vbroadcasti128 (LE_CTR_PTR), LE_CTR
+
+ // Load the AES key length in bytes.
+ movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN
+
+ // Make RNDKEYLAST_PTR point to the last AES round key. This is the
+ // round key with index 10, 12, or 14 for AES-128, AES-192, or AES-256
+ // respectively. Then load the zero-th and last round keys.
+ lea 6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR
+ vbroadcasti128 (KEY), RNDKEY0
+ vbroadcasti128 (RNDKEYLAST_PTR), RNDKEYLAST
+
+ // Finish initializing LE_CTR by adding 1 to the second block.
+ vpaddd .Lctr_pattern(%rip), LE_CTR, LE_CTR
+
+ // If there are at least 128 bytes of data, then continue into the loop
+ // that processes 128 bytes of data at a time. Otherwise skip it.
+ add $-128, DATALEN // 128 is 4 bytes, -128 is 1 byte
+ jl .Lcrypt_loop_4x_done\@
+
+ vmovdqu OFFSETOF_H_POWERS_XORED(KEY), H_POW2_XORED
+ vmovdqu OFFSETOF_H_POWERS_XORED+32(KEY), H_POW1_XORED
+
+ // Main loop: en/decrypt and hash 4 vectors (128 bytes) at a time.
+
+.if \enc
+ // Encrypt the first 4 vectors of plaintext blocks.
+ _aesenc_loop 0,1,2,3
+ _aesenclast_and_xor 0,1,2,3
+ sub $-128, SRC // 128 is 4 bytes, -128 is 1 byte
+ add $-128, DATALEN
+ jl .Lghash_last_ciphertext_4x\@
+.endif
+
+.align 16
+.Lcrypt_loop_4x\@:
+
+ // Start the AES encryption of the counter blocks.
+ _ctr_begin 0,1,2,3
+ cmp $24, AESKEYLEN
+ jl 128f // AES-128?
+ je 192f // AES-192?
+ // AES-256
+ vbroadcasti128 -13*16(RNDKEYLAST_PTR), TMP0
+ _vaesenc TMP0, 0,1,2,3
+ vbroadcasti128 -12*16(RNDKEYLAST_PTR), TMP0
+ _vaesenc TMP0, 0,1,2,3
+192:
+ vbroadcasti128 -11*16(RNDKEYLAST_PTR), TMP0
+ _vaesenc TMP0, 0,1,2,3
+ vbroadcasti128 -10*16(RNDKEYLAST_PTR), TMP0
+ _vaesenc TMP0, 0,1,2,3
+128:
+
+ // Finish the AES encryption of the counter blocks in AESDATA[0-3],
+ // interleaved with the GHASH update of the ciphertext blocks.
+.irp i, 9,8,7,6,5,4,3,2,1
+ _ghash_step_4x (9 - \i), GHASHDATA_PTR
+ vbroadcasti128 -\i*16(RNDKEYLAST_PTR), TMP0
+ _vaesenc TMP0, 0,1,2,3
+.endr
+ _ghash_step_4x 9, GHASHDATA_PTR
+.if \enc
+ sub $-128, DST // 128 is 4 bytes, -128 is 1 byte
+.endif
+ _aesenclast_and_xor 0,1,2,3
+ sub $-128, SRC
+.if !\enc
+ sub $-128, DST
+.endif
+ add $-128, DATALEN
+ jge .Lcrypt_loop_4x\@
+
+.if \enc
+.Lghash_last_ciphertext_4x\@:
+ // Update GHASH with the last set of ciphertext blocks.
+ _ghash_4x DST
+ sub $-128, DST
+.endif
+
+.Lcrypt_loop_4x_done\@:
+
+ // Undo the extra subtraction by 128 and check whether data remains.
+ sub $-128, DATALEN // 128 is 4 bytes, -128 is 1 byte
+ jz .Ldone\@
+
+ // The data length isn't a multiple of 128 bytes. Process the remaining
+ // data of length 1 <= DATALEN < 128.
+ //
+ // Since there are enough key powers available for all remaining data,
+ // there is no need to do a GHASH reduction after each iteration.
+ // Instead, multiply each remaining block by its own key power, and only
+ // do a GHASH reduction at the very end.
+
+ // Make POWERS_PTR point to the key powers [H^N, H^(N-1), ...] where N
+ // is the number of blocks that remain.
+ .set POWERS_PTR, LE_CTR_PTR // LE_CTR_PTR is free to be reused.
+ .set POWERS_PTR32, LE_CTR_PTR32
+ mov DATALEN, %eax
+ neg %rax
+ and $~15, %rax // -round_up(DATALEN, 16)
+ lea OFFSETOFEND_H_POWERS(KEY,%rax), POWERS_PTR
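+
+ // In C terms (a sketch): with n = DIV_ROUND_UP(datalen, 16) blocks
+ // remaining, this computes
+ //
+ //	powers_ptr = (const u8 *)key + OFFSETOFEND_H_POWERS - 16 * n;
+ //
+ // i.e. a pointer to H^n, from which H^n down to H^1 are read sequentially
+ // as the remaining blocks are consumed.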
+
+ // Start collecting the unreduced GHASH intermediate value LO, MI, HI.
+ .set HI, H_POW2_XORED // H_POW2_XORED is free to be reused.
+ .set HI_XMM, H_POW2_XORED_XMM
+ vpxor LO_XMM, LO_XMM, LO_XMM
+ vpxor MI_XMM, MI_XMM, MI_XMM
+ vpxor HI_XMM, HI_XMM, HI_XMM
+
+ // 1 <= DATALEN < 128. Generate 2 or 4 more vectors of keystream blocks
+ // excluding the last AES round, depending on the remaining DATALEN.
+ cmp $64, DATALEN
+ jg .Ltail_gen_4_keystream_vecs\@
+ _aesenc_loop 0,1
+ cmp $32, DATALEN
+ jge .Ltail_xor_and_ghash_full_vec_loop\@
+ jmp .Ltail_xor_and_ghash_partial_vec\@
+.Ltail_gen_4_keystream_vecs\@:
+ _aesenc_loop 0,1,2,3
+
+ // XOR the remaining data and accumulate the unreduced GHASH products
+ // for DATALEN >= 32, starting with one full 32-byte vector at a time.
+.Ltail_xor_and_ghash_full_vec_loop\@:
+.if \enc
+ _aesenclast_and_xor 0
+ vpshufb BSWAP_MASK, AESDATA0, AESDATA0
+.else
+ vmovdqu (SRC), TMP1
+ vpxor TMP1, RNDKEYLAST, TMP0
+ vaesenclast TMP0, AESDATA0, AESDATA0
+ vmovdqu AESDATA0, (DST)
+ vpshufb BSWAP_MASK, TMP1, AESDATA0
+.endif
+ // The ciphertext blocks (i.e. GHASH input data) are now in AESDATA0.
+ vpxor GHASH_ACC, AESDATA0, AESDATA0
+ vmovdqu (POWERS_PTR), TMP2
+ _ghash_mul_noreduce TMP2, AESDATA0, LO, MI, HI, TMP0
+ vmovdqa AESDATA1, AESDATA0
+ vmovdqa AESDATA2, AESDATA1
+ vmovdqa AESDATA3, AESDATA2
+ vpxor GHASH_ACC_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM
+ add $32, SRC
+ add $32, DST
+ add $32, POWERS_PTR
+ sub $32, DATALEN
+ cmp $32, DATALEN
+ jge .Ltail_xor_and_ghash_full_vec_loop\@
+ test DATALEN, DATALEN
+ jz .Ltail_ghash_reduce\@
+
+.Ltail_xor_and_ghash_partial_vec\@:
+ // XOR the remaining data and accumulate the unreduced GHASH products,
+ // for 1 <= DATALEN < 32.
+ vaesenclast RNDKEYLAST, AESDATA0, AESDATA0
+ cmp $16, DATALEN
+ jle .Ltail_xor_and_ghash_1to16bytes\@
+
+ // Handle 17 <= DATALEN < 32.
+
+ // Load a vpshufb mask that will right-shift by '32 - DATALEN' bytes
+ // (shifting in zeroes), then reflect all 16 bytes.
+ lea .Lrshift_and_bswap_table(%rip), %rax
+ vmovdqu -16(%rax, DATALEN64), TMP2_XMM
+
+ // Move the second keystream block to its own register and left-align it
+ vextracti128 $1, AESDATA0, AESDATA1_XMM
+ vpxor .Lfifteens(%rip), TMP2_XMM, TMP0_XMM
+ vpshufb TMP0_XMM, AESDATA1_XMM, AESDATA1_XMM
+
+ // Using overlapping loads and stores, XOR the source data with the
+ // keystream and write the destination data. Then prepare the GHASH
+ // input data: the full ciphertext block and the zero-padded partial
+ // ciphertext block, both byte-reflected, in AESDATA0.
+.if \enc
+ vpxor -16(SRC, DATALEN64), AESDATA1_XMM, AESDATA1_XMM
+ vpxor (SRC), AESDATA0_XMM, AESDATA0_XMM
+ vmovdqu AESDATA1_XMM, -16(DST, DATALEN64)
+ vmovdqu AESDATA0_XMM, (DST)
+ vpshufb TMP2_XMM, AESDATA1_XMM, AESDATA1_XMM
+ vpshufb BSWAP_MASK_XMM, AESDATA0_XMM, AESDATA0_XMM
+.else
+ vmovdqu -16(SRC, DATALEN64), TMP1_XMM
+ vmovdqu (SRC), TMP0_XMM
+ vpxor TMP1_XMM, AESDATA1_XMM, AESDATA1_XMM
+ vpxor TMP0_XMM, AESDATA0_XMM, AESDATA0_XMM
+ vmovdqu AESDATA1_XMM, -16(DST, DATALEN64)
+ vmovdqu AESDATA0_XMM, (DST)
+ vpshufb TMP2_XMM, TMP1_XMM, AESDATA1_XMM
+ vpshufb BSWAP_MASK_XMM, TMP0_XMM, AESDATA0_XMM
+.endif
+ vpxor GHASH_ACC_XMM, AESDATA0_XMM, AESDATA0_XMM
+ vinserti128 $1, AESDATA1_XMM, AESDATA0, AESDATA0
+ vmovdqu (POWERS_PTR), TMP2
+ jmp .Ltail_ghash_last_vec\@
+
+.Ltail_xor_and_ghash_1to16bytes\@:
+ // Handle 1 <= DATALEN <= 16. Carefully load and store the
+ // possibly-partial block, which we mustn't access out of bounds.
+ vmovdqu (POWERS_PTR), TMP2_XMM
+ mov SRC, KEY // Free up %rcx, assuming SRC == %rcx
+ mov DATALEN, %ecx
+ _load_partial_block KEY, TMP0_XMM, POWERS_PTR, POWERS_PTR32
+ vpxor TMP0_XMM, AESDATA0_XMM, AESDATA0_XMM
+ mov DATALEN, %ecx
+ _store_partial_block AESDATA0_XMM, DST, POWERS_PTR, POWERS_PTR32
+.if \enc
+ lea .Lselect_high_bytes_table(%rip), %rax
+ vpshufb BSWAP_MASK_XMM, AESDATA0_XMM, AESDATA0_XMM
+ vpand (%rax, DATALEN64), AESDATA0_XMM, AESDATA0_XMM
+.else
+ vpshufb BSWAP_MASK_XMM, TMP0_XMM, AESDATA0_XMM
+.endif
+ vpxor GHASH_ACC_XMM, AESDATA0_XMM, AESDATA0_XMM
+
+.Ltail_ghash_last_vec\@:
+ // Accumulate the unreduced GHASH products for the last 1-2 blocks. The
+ // GHASH input data is in AESDATA0. If only one block remains, then the
+ // second block in AESDATA0 is zero and does not affect the result.
+ _ghash_mul_noreduce TMP2, AESDATA0, LO, MI, HI, TMP0
+
+.Ltail_ghash_reduce\@:
+ // Finally, do the GHASH reduction.
+ vbroadcasti128 .Lgfpoly(%rip), TMP0
+ _ghash_reduce LO, MI, HI, TMP0, TMP1
+ vextracti128 $1, HI, GHASH_ACC_XMM
+ vpxor HI_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM
+
+.Ldone\@:
+ // Store the updated GHASH accumulator back to memory.
+ vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR)
+
+ vzeroupper
+ RET
+.endm
+
+// void aes_gcm_enc_final_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key,
+// const u32 le_ctr[4], u8 ghash_acc[16],
+// u64 total_aadlen, u64 total_datalen);
+// bool aes_gcm_dec_final_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key,
+// const u32 le_ctr[4], const u8 ghash_acc[16],
+// u64 total_aadlen, u64 total_datalen,
+// const u8 tag[16], int taglen);
+//
+// This macro generates one of the above two functions (with \enc selecting
+// which one). Both functions finish computing the GCM authentication tag by
+// updating GHASH with the lengths block and encrypting the GHASH accumulator.
+// |total_aadlen| and |total_datalen| must be the total length of the additional
+// authenticated data and the en/decrypted data in bytes, respectively.
+//
+// The encryption function then stores the full-length (16-byte) computed
+// authentication tag to |ghash_acc|. The decryption function instead loads the
+// expected authentication tag (the one that was transmitted) from the 16-byte
+// buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the
+// computed tag in constant time, and returns true if and only if they match.
+.macro _aes_gcm_final enc
+
+ // Function arguments
+ .set KEY, %rdi
+ .set LE_CTR_PTR, %rsi
+ .set GHASH_ACC_PTR, %rdx
+ .set TOTAL_AADLEN, %rcx
+ .set TOTAL_DATALEN, %r8
+ .set TAG, %r9
+ .set TAGLEN, %r10d // Originally at 8(%rsp)
+ .set TAGLEN64, %r10
+
+ // Additional local variables.
+ // %rax and %xmm0-%xmm3 are used as temporary registers.
+ .set AESKEYLEN, %r11d
+ .set AESKEYLEN64, %r11
+ .set GFPOLY, %xmm4
+ .set BSWAP_MASK, %xmm5
+ .set LE_CTR, %xmm6
+ .set GHASH_ACC, %xmm7
+ .set H_POW1, %xmm8
+
+ // Load some constants.
+ vmovdqa .Lgfpoly(%rip), GFPOLY
+ vmovdqa .Lbswap_mask(%rip), BSWAP_MASK
+
+ // Load the AES key length in bytes.
+ movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN
+
+ // Set up a counter block with 1 in the low 32-bit word. This is the
+ // counter that produces the ciphertext needed to encrypt the auth tag.
+ // GFPOLY has 1 in the low word, so grab the 1 from there using a blend.
+ vpblendd $0xe, (LE_CTR_PTR), GFPOLY, LE_CTR
+
+ // Build the lengths block and XOR it with the GHASH accumulator.
+ // Although the lengths block is defined as the AAD length followed by
+ // the en/decrypted data length, both in big-endian byte order, a byte
+ // reflection of the full block is needed because of the way we compute
+ // GHASH (see _ghash_mul_step). By using little-endian values in the
+ // opposite order, we avoid having to reflect any bytes here.
+ vmovq TOTAL_DATALEN, %xmm0
+ vpinsrq $1, TOTAL_AADLEN, %xmm0, %xmm0
+ vpsllq $3, %xmm0, %xmm0 // Bytes to bits
+ vpxor (GHASH_ACC_PTR), %xmm0, GHASH_ACC
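+
+ // In C terms (a sketch), the value XOR'd in here is:
+ //
+ //	lengths.lo = total_datalen * 8;	/* data length in bits */
+ //	lengths.hi = total_aadlen * 8;	/* AAD length in bits */
+ //	ghash_acc ^= lengths;
+ //
+ // which is exactly the byte-reflection of the big-endian lengths block
+ // that the GCM specification defines.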
+
+ // Load the first hash key power (H^1), which is stored last.
+ vmovdqu OFFSETOFEND_H_POWERS-16(KEY), H_POW1
+
+ // Load TAGLEN if decrypting.
+.if !\enc
+ movl 8(%rsp), TAGLEN
+.endif
+
+ // Make %rax point to the last AES round key for the chosen AES variant.
+ lea 6*16(KEY,AESKEYLEN64,4), %rax
+
+ // Start the AES encryption of the counter block by swapping the counter
+ // block to big-endian and XOR-ing it with the zero-th AES round key.
+ vpshufb BSWAP_MASK, LE_CTR, %xmm0
+ vpxor (KEY), %xmm0, %xmm0
+
+ // Complete the AES encryption and multiply GHASH_ACC by H^1.
+ // Interleave the AES and GHASH instructions to improve performance.
+ cmp $24, AESKEYLEN
+ jl 128f // AES-128?
+ je 192f // AES-192?
+ // AES-256
+ vaesenc -13*16(%rax), %xmm0, %xmm0
+ vaesenc -12*16(%rax), %xmm0, %xmm0
+192:
+ vaesenc -11*16(%rax), %xmm0, %xmm0
+ vaesenc -10*16(%rax), %xmm0, %xmm0
+128:
+.irp i, 0,1,2,3,4,5,6,7,8
+ _ghash_mul_step \i, H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
+ %xmm1, %xmm2, %xmm3
+ vaesenc (\i-9)*16(%rax), %xmm0, %xmm0
+.endr
+ _ghash_mul_step 9, H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
+ %xmm1, %xmm2, %xmm3
+
+ // Undo the byte reflection of the GHASH accumulator.
+ vpshufb BSWAP_MASK, GHASH_ACC, GHASH_ACC
+
+ // Do the last AES round and XOR the resulting keystream block with the
+ // GHASH accumulator to produce the full computed authentication tag.
+ //
+ // Reduce latency by taking advantage of the property vaesenclast(key,
+ // a) ^ b == vaesenclast(key ^ b, a). I.e., XOR GHASH_ACC into the last
+ // round key, instead of XOR'ing the final AES output with GHASH_ACC.
+ //
+ // enc_final then returns the computed auth tag, while dec_final
+ // compares it with the transmitted one and returns a bool. To compare
+ // the tags, dec_final XORs them together and uses vptest to check
+ // whether the result is all-zeroes. This should be constant-time.
+ // dec_final folds this additional XOR'd value into the last round key
+ // too, via the same vaesenclast optimization.
+.if \enc
+ vpxor (%rax), GHASH_ACC, %xmm1
+ vaesenclast %xmm1, %xmm0, GHASH_ACC
+ vmovdqu GHASH_ACC, (GHASH_ACC_PTR)
+.else
+ vpxor (TAG), GHASH_ACC, GHASH_ACC
+ vpxor (%rax), GHASH_ACC, GHASH_ACC
+ vaesenclast GHASH_ACC, %xmm0, %xmm0
+ lea .Lselect_high_bytes_table(%rip), %rax
+ vmovdqu (%rax, TAGLEN64), %xmm1
+ vpshufb BSWAP_MASK, %xmm1, %xmm1 // select low bytes, not high
+ xor %eax, %eax
+ vptest %xmm1, %xmm0
+ sete %al
+.endif
+ // No need for vzeroupper here, since only xmm registers were used.
+ RET
+.endm
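+
+// In C terms, the tag comparison in dec_final is roughly (a sketch; 'diff'
+// and 'mask' denote 16-byte values):
+//
+//	diff = computed_tag ^ transmitted_tag;
+//	mask = /* first taglen bytes 0xff, remaining bytes 0x00 */;
+//	return (diff & mask) == 0;	/* vptest + sete */
+//
+// The mask comes from byte-reflecting an entry of .Lselect_high_bytes_table,
+// which turns a select-high-bytes mask into a select-low-bytes one.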
+
+SYM_FUNC_START(aes_gcm_enc_update_vaes_avx2)
+ _aes_gcm_update 1
+SYM_FUNC_END(aes_gcm_enc_update_vaes_avx2)
+SYM_FUNC_START(aes_gcm_dec_update_vaes_avx2)
+ _aes_gcm_update 0
+SYM_FUNC_END(aes_gcm_dec_update_vaes_avx2)
+
+SYM_FUNC_START(aes_gcm_enc_final_vaes_avx2)
+ _aes_gcm_final 1
+SYM_FUNC_END(aes_gcm_enc_final_vaes_avx2)
+SYM_FUNC_START(aes_gcm_dec_final_vaes_avx2)
+ _aes_gcm_final 0
+SYM_FUNC_END(aes_gcm_dec_final_vaes_avx2)
diff --git a/arch/x86/crypto/aes-gcm-vaes-avx512.S b/arch/x86/crypto/aes-gcm-vaes-avx512.S
new file mode 100644
index 000000000000..06b71314d65c
--- /dev/null
+++ b/arch/x86/crypto/aes-gcm-vaes-avx512.S
@@ -0,0 +1,1163 @@
+/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
+//
+// AES-GCM implementation for x86_64 CPUs that support the following CPU
+// features: VAES && VPCLMULQDQ && AVX512BW && AVX512VL && BMI2
+//
+// Copyright 2024 Google LLC
+//
+// Author: Eric Biggers <ebiggers@google.com>
+//
+//------------------------------------------------------------------------------
+//
+// This file is dual-licensed, meaning that you can use it under your choice of
+// either of the following two licenses:
+//
+// Licensed under the Apache License 2.0 (the "License"). You may obtain a copy
+// of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// or
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
+#include <linux/linkage.h>
+
+.section .rodata
+.p2align 6
+
+ // A shuffle mask that reflects the bytes of 16-byte blocks
+.Lbswap_mask:
+ .octa 0x000102030405060708090a0b0c0d0e0f
+
+ // This is the GHASH reducing polynomial without its constant term, i.e.
+ // x^128 + x^7 + x^2 + x, represented using the backwards mapping
+ // between bits and polynomial coefficients.
+ //
+ // Alternatively, it can be interpreted as the naturally-ordered
+ // representation of the polynomial x^127 + x^126 + x^121 + 1, i.e. the
+ // "reversed" GHASH reducing polynomial without its x^128 term.
+.Lgfpoly:
+ .octa 0xc2000000000000000000000000000001
+
+ // Same as above, but with the (1 << 64) bit set.
+.Lgfpoly_and_internal_carrybit:
+ .octa 0xc2000000000000010000000000000001
+
+ // Values needed to prepare the initial vector of counter blocks.
+.Lctr_pattern:
+ .octa 0
+ .octa 1
+ .octa 2
+ .octa 3
+
+ // The number of AES blocks per vector, as a 128-bit value.
+.Linc_4blocks:
+ .octa 4
+
+// Number of powers of the hash key stored in the key struct. The powers are
+// stored from highest (H^NUM_H_POWERS) to lowest (H^1).
+#define NUM_H_POWERS 16
+
+// Offset to AES key length (in bytes) in the key struct
+#define OFFSETOF_AESKEYLEN 480
+
+// Offset to start of hash key powers array in the key struct
+#define OFFSETOF_H_POWERS 512
+
+// Offset to end of hash key powers array in the key struct.
+//
+// This is immediately followed by three zeroized padding blocks, which are
+// included so that partial vectors can be handled more easily. E.g. if two
+// blocks remain, we load the 4 values [H^2, H^1, 0, 0]. At most 3 padding
+// blocks are needed, which occurs when [H^1, 0, 0, 0] is loaded.
+#define OFFSETOFEND_H_POWERS (OFFSETOF_H_POWERS + (NUM_H_POWERS * 16))
+
+.text
+
+// The _ghash_mul_step macro does one step of GHASH multiplication of the
+// 128-bit lanes of \a by the corresponding 128-bit lanes of \b, storing the
+// reduced products in \dst. \t0, \t1, and \t2 are temporary registers of the
+// same size as \a and \b. To complete all steps, this must be invoked with \i=0
+// through \i=9. The division into steps allows users of this macro to
+// optionally interleave the computation with other instructions. Users of this
+// macro must preserve the parameter registers across steps.
+//
+// The multiplications are done in GHASH's representation of the finite field
+// GF(2^128). Elements of GF(2^128) are represented as binary polynomials
+// (i.e. polynomials whose coefficients are bits) modulo a reducing polynomial
+// G. The GCM specification uses G = x^128 + x^7 + x^2 + x + 1. Addition is
+// just XOR, while multiplication is more complex and has two parts: (a) do
+// carryless multiplication of two 128-bit input polynomials to get a 256-bit
+// intermediate product polynomial, and (b) reduce the intermediate product to
+// 128 bits by adding multiples of G that cancel out terms in it. (Adding
+// multiples of G doesn't change which field element the polynomial represents.)
+//
+// Unfortunately, the GCM specification maps bits to/from polynomial
+// coefficients backwards from the natural order. In each byte it specifies the
+// highest bit to be the lowest order polynomial coefficient, *not* the highest!
+// This makes it nontrivial to work with the GHASH polynomials. We could
+// reflect the bits, but x86 doesn't have an instruction that does that.
+//
+// Instead, we operate on the values without bit-reflecting them. This *mostly*
+// just works, since XOR and carryless multiplication are symmetric with respect
+// to bit order, but it has some consequences. First, due to GHASH's byte
+// order, by skipping bit reflection, *byte* reflection becomes necessary to
+// give the polynomial terms a consistent order. E.g., considering an N-bit
+// value interpreted using the G = x^128 + x^7 + x^2 + x + 1 convention, bits 0
+// through N-1 of the byte-reflected value represent the coefficients of x^(N-1)
+// through x^0, whereas bits 0 through N-1 of the non-byte-reflected value
+// represent x^7...x^0, x^15...x^8, ..., x^(N-1)...x^(N-8) which can't be worked
+// with. Fortunately, x86's vpshufb instruction can do byte reflection.
+//
+// Second, forgoing the bit reflection causes an extra multiple of x (still
+// using the G = x^128 + x^7 + x^2 + x + 1 convention) to be introduced by each
+// multiplication. This is because an M-bit by N-bit carryless multiplication
+// really produces a (M+N-1)-bit product, but in practice it's zero-extended to
+// M+N bits. In the G = x^128 + x^7 + x^2 + x + 1 convention, which maps bits
+// to polynomial coefficients backwards, this zero-extension actually changes
+// the product by introducing an extra factor of x. Therefore, users of this
+// macro must ensure that one of the inputs has an extra factor of x^-1, i.e.
+// the multiplicative inverse of x, to cancel out the extra x.
+//
+// Third, the backwards coefficients convention is just confusing to work with,
+// since it makes "low" and "high" in the polynomial math mean the opposite of
+// their normal meaning in computer programming. This can be solved by using an
+// alternative interpretation: the polynomial coefficients are understood to be
+// in the natural order, and the multiplication is actually \a * \b * x^-128 mod
+// x^128 + x^127 + x^126 + x^121 + 1. This doesn't change the inputs, outputs,
+// or the implementation at all; it just changes the mathematical interpretation
+// of what each instruction is doing. Starting from here, we'll use this
+// alternative interpretation, as it's easier to understand the code that way.
+//
+// Moving onto the implementation, the vpclmulqdq instruction does 64 x 64 =>
+// 128-bit carryless multiplication, so we break the 128 x 128 multiplication
+// into parts as follows (the _L and _H suffixes denote low and high 64 bits):
+//
+// LO = a_L * b_L
+// MI = (a_L * b_H) + (a_H * b_L)
+// HI = a_H * b_H
+//
+// The 256-bit product is x^128*HI + x^64*MI + LO. LO, MI, and HI are 128-bit.
+// Note that MI "overlaps" with LO and HI. We don't consolidate MI into LO and
+// HI right away, since the way the reduction works makes that unnecessary.
+//
+// For the reduction, we cancel out the low 128 bits by adding multiples of G =
+// x^128 + x^127 + x^126 + x^121 + 1. This is done by two iterations, each of
+// which cancels out the next lowest 64 bits. Consider a value x^64*A + B,
+// where A and B are 128-bit. Adding B_L*G to that value gives:
+//
+// x^64*A + B + B_L*G
+// = x^64*A + x^64*B_H + B_L + B_L*(x^128 + x^127 + x^126 + x^121 + 1)
+// = x^64*A + x^64*B_H + B_L + x^128*B_L + x^64*B_L*(x^63 + x^62 + x^57) + B_L
+// = x^64*A + x^64*B_H + x^128*B_L + x^64*B_L*(x^63 + x^62 + x^57) + B_L + B_L
+// = x^64*(A + B_H + x^64*B_L + B_L*(x^63 + x^62 + x^57))
+//
+// So: if we sum A, B with its halves swapped, and the low half of B times x^63
+// + x^62 + x^57, we get a 128-bit value C where x^64*C is congruent to the
+// original value x^64*A + B. I.e., the low 64 bits got canceled out.
+//
+// We just need to apply this twice: first to fold LO into MI, and second to
+// fold the updated MI into HI.
+//
+// The needed three-argument XORs are done using the vpternlogd instruction with
+// immediate 0x96, since this is faster than two vpxord instructions.
+//
+// A potential optimization, assuming that b is fixed per-key (if a is fixed
+// per-key it would work the other way around), is to use one iteration of the
+// reduction described above to precompute a value c such that x^64*c = b mod G,
+// and then multiply a_L by c (and implicitly by x^64) instead of by b:
+//
+// MI = (a_L * c_L) + (a_H * b_L)
+// HI = (a_L * c_H) + (a_H * b_H)
+//
+// This would eliminate the LO part of the intermediate product, which would
+// eliminate the need to fold LO into MI. This would save two instructions,
+// including a vpclmulqdq. However, we currently don't use this optimization
+// because it would require twice as many per-key precomputed values.
+//
+// Using Karatsuba multiplication instead of "schoolbook" multiplication
+// similarly would save a vpclmulqdq but does not seem to be worth it.
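+//
+// As an illustrative C model of the full multiplication and reduction, with
+// clmul64() denoting a hypothetical 64x64 => 128-bit carryless multiply that
+// returns a struct u128 { u64 lo, hi; }, and with ^ acting on both halves
+// (a sketch for understanding, not part of this file's interface):
+//
+//	struct u128 lo = clmul64(a.lo, b.lo);
+//	struct u128 mi = clmul64(a.lo, b.hi) ^ clmul64(a.hi, b.lo);
+//	struct u128 hi = clmul64(a.hi, b.hi);
+//
+//	/* Fold LO into MI: LO with halves swapped, plus LO_L * gfpoly_H */
+//	mi = mi ^ (struct u128){ lo.hi, lo.lo }
+//		^ clmul64(lo.lo, 0xc200000000000000);
+//
+//	/* Fold MI into HI the same way */
+//	hi = hi ^ (struct u128){ mi.hi, mi.lo }
+//		^ clmul64(mi.lo, 0xc200000000000000);
+//	return hi;	/* the reduced 128-bit product */
+//
+// Each vpternlogd $0x96 below performs one such pair of XORs in a single
+// instruction: immediate 0x96 is the truth table of a ^ b ^ c.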
+.macro _ghash_mul_step i, a, b, dst, gfpoly, t0, t1, t2
+.if \i == 0
+ vpclmulqdq $0x00, \a, \b, \t0 // LO = a_L * b_L
+ vpclmulqdq $0x01, \a, \b, \t1 // MI_0 = a_L * b_H
+.elseif \i == 1
+ vpclmulqdq $0x10, \a, \b, \t2 // MI_1 = a_H * b_L
+.elseif \i == 2
+ vpxord \t2, \t1, \t1 // MI = MI_0 + MI_1
+.elseif \i == 3
+ vpclmulqdq $0x01, \t0, \gfpoly, \t2 // LO_L*(x^63 + x^62 + x^57)
+.elseif \i == 4
+ vpshufd $0x4e, \t0, \t0 // Swap halves of LO
+.elseif \i == 5
+ vpternlogd $0x96, \t2, \t0, \t1 // Fold LO into MI
+.elseif \i == 6
+ vpclmulqdq $0x11, \a, \b, \dst // HI = a_H * b_H
+.elseif \i == 7
+ vpclmulqdq $0x01, \t1, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57)
+.elseif \i == 8
+ vpshufd $0x4e, \t1, \t1 // Swap halves of MI
+.elseif \i == 9
+ vpternlogd $0x96, \t0, \t1, \dst // Fold MI into HI
+.endif
+.endm
+
+// GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and store
+// the reduced products in \dst. See _ghash_mul_step for full explanation.
+.macro _ghash_mul a, b, dst, gfpoly, t0, t1, t2
+.irp i, 0,1,2,3,4,5,6,7,8,9
+ _ghash_mul_step \i, \a, \b, \dst, \gfpoly, \t0, \t1, \t2
+.endr
+.endm
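+
+// For intuition only, here is a minimal Python sketch (illustrative, not part
+// of this file; the names clmul, reduce_step, and ghash_mul are made up) of
+// the multiplication and two folding steps described above _ghash_mul_step.
+// Polynomials are Python ints with bit i holding the coefficient of x^i:
+//
+//   G = (1 << 128) | (1 << 127) | (1 << 126) | (1 << 121) | 1
+//
+//   def clmul(a, b):
+//       # Carryless multiplication, as one vpclmulqdq lane computes.
+//       p = 0
+//       while b:
+//           if b & 1:
+//               p ^= a
+//           a <<= 1
+//           b >>= 1
+//       return p
+//
+//   def reduce_step(p):
+//       # Cancel the low 64 bits of p, then divide by x^64.
+//       b_lo = p & ((1 << 64) - 1)
+//       g_hi = (1 << 63) | (1 << 62) | (1 << 57)   # x^63 + x^62 + x^57
+//       return (p >> 64) ^ (b_lo << 64) ^ clmul(b_lo, g_hi)
+//
+//   def ghash_mul(a, b):
+//       # a * b * x^-128 mod G, i.e. what _ghash_mul computes per lane.
+//       return reduce_step(reduce_step(clmul(a, b)))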
+
+// GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and add the
+// *unreduced* products to \lo, \mi, and \hi.
+.macro _ghash_mul_noreduce a, b, lo, mi, hi, t0, t1, t2, t3
+ vpclmulqdq $0x00, \a, \b, \t0 // a_L * b_L
+ vpclmulqdq $0x01, \a, \b, \t1 // a_L * b_H
+ vpclmulqdq $0x10, \a, \b, \t2 // a_H * b_L
+ vpclmulqdq $0x11, \a, \b, \t3 // a_H * b_H
+ vpxord \t0, \lo, \lo
+ vpternlogd $0x96, \t2, \t1, \mi
+ vpxord \t3, \hi, \hi
+.endm
+
+// Reduce the unreduced products from \lo, \mi, and \hi and store the 128-bit
+// reduced products in \hi. See _ghash_mul_step for explanation of reduction.
+.macro _ghash_reduce lo, mi, hi, gfpoly, t0
+ vpclmulqdq $0x01, \lo, \gfpoly, \t0
+ vpshufd $0x4e, \lo, \lo
+ vpternlogd $0x96, \t0, \lo, \mi
+ vpclmulqdq $0x01, \mi, \gfpoly, \t0
+ vpshufd $0x4e, \mi, \mi
+ vpternlogd $0x96, \t0, \mi, \hi
+.endm
+
+// This is a specialized version of _ghash_mul that computes \a * \a, i.e. it
+// squares \a. It skips computing MI = (a_L * a_H) + (a_H * a_L) = 0.
+.macro _ghash_square a, dst, gfpoly, t0, t1
+ vpclmulqdq $0x00, \a, \a, \t0 // LO = a_L * a_L
+ vpclmulqdq $0x11, \a, \a, \dst // HI = a_H * a_H
+ vpclmulqdq $0x01, \t0, \gfpoly, \t1 // LO_L*(x^63 + x^62 + x^57)
+ vpshufd $0x4e, \t0, \t0 // Swap halves of LO
+ vpxord \t0, \t1, \t1 // Fold LO into MI
+ vpclmulqdq $0x01, \t1, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57)
+ vpshufd $0x4e, \t1, \t1 // Swap halves of MI
+ vpternlogd $0x96, \t0, \t1, \dst // Fold MI into HI
+.endm
+
+// void aes_gcm_precompute_vaes_avx512(struct aes_gcm_key_vaes_avx512 *key);
+//
+// Given the expanded AES key |key->base.aes_key|, derive the GHASH subkey and
+// initialize |key->h_powers| and |key->padding|.
+SYM_FUNC_START(aes_gcm_precompute_vaes_avx512)
+
+ // Function arguments
+ .set KEY, %rdi
+
+ // Additional local variables.
+ // %zmm[0-2] and %rax are used as temporaries.
+ .set POWERS_PTR, %rsi
+ .set RNDKEYLAST_PTR, %rdx
+ .set H_CUR, %zmm3
+ .set H_CUR_YMM, %ymm3
+ .set H_CUR_XMM, %xmm3
+ .set H_INC, %zmm4
+ .set H_INC_YMM, %ymm4
+ .set H_INC_XMM, %xmm4
+ .set GFPOLY, %zmm5
+ .set GFPOLY_YMM, %ymm5
+ .set GFPOLY_XMM, %xmm5
+
+ // Get pointer to lowest set of key powers (located at end of array).
+ lea OFFSETOFEND_H_POWERS-64(KEY), POWERS_PTR
+
+ // Encrypt an all-zeroes block to get the raw hash subkey.
+ movl OFFSETOF_AESKEYLEN(KEY), %eax
+ lea 6*16(KEY,%rax,4), RNDKEYLAST_PTR
+ vmovdqu (KEY), %xmm0 // Zero-th round key XOR all-zeroes block
+ add $16, KEY
+1:
+ vaesenc (KEY), %xmm0, %xmm0
+ add $16, KEY
+ cmp KEY, RNDKEYLAST_PTR
+ jne 1b
+ vaesenclast (RNDKEYLAST_PTR), %xmm0, %xmm0
+
+ // Reflect the bytes of the raw hash subkey.
+ vpshufb .Lbswap_mask(%rip), %xmm0, H_CUR_XMM
+
+ // Zeroize the padding blocks.
+ vpxor %xmm0, %xmm0, %xmm0
+ vmovdqu %ymm0, 64(POWERS_PTR)
+ vmovdqu %xmm0, 64+2*16(POWERS_PTR)
+
+ // Finish preprocessing the first key power, H^1. Since this GHASH
+ // implementation operates directly on values with the backwards bit
+ // order specified by the GCM standard, it's necessary to preprocess the
+ // raw key as follows. First, reflect its bytes. Second, multiply it
+ // by x^-1 mod x^128 + x^7 + x^2 + x + 1 (if using the backwards
+ // interpretation of polynomial coefficients), which can also be
+ // interpreted as multiplication by x mod x^128 + x^127 + x^126 + x^121
+ // + 1 using the alternative, natural interpretation of polynomial
+ // coefficients. For details, see the comment above _ghash_mul_step.
+ //
+ // Either way, for the multiplication the concrete operation performed
+ // is a left shift of the 128-bit value by 1 bit, then an XOR with (0xc2
+ // << 120) | 1 if a 1 bit was carried out. However, there's no 128-bit
+ // wide shift instruction, so instead double each of the two 64-bit
+ // halves and incorporate the internal carry bit into the value XOR'd.
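+	//
+	// As an illustrative scalar reference (Python, not part of this
+	// file), the five instructions below compute the equivalent of:
+	//
+	//   def h_times_x(h):
+	//       carry = h >> 127
+	//       h = (h << 1) & ((1 << 128) - 1)
+	//       return h ^ (((0xc2 << 120) | 1) if carry else 0)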
+ vpshufd $0xd3, H_CUR_XMM, %xmm0
+ vpsrad $31, %xmm0, %xmm0
+ vpaddq H_CUR_XMM, H_CUR_XMM, H_CUR_XMM
+ // H_CUR_XMM ^= xmm0 & gfpoly_and_internal_carrybit
+ vpternlogd $0x78, .Lgfpoly_and_internal_carrybit(%rip), %xmm0, H_CUR_XMM
+
+ // Load the gfpoly constant.
+ vbroadcasti32x4 .Lgfpoly(%rip), GFPOLY
+
+ // Square H^1 to get H^2.
+ //
+ // Note that as with H^1, all higher key powers also need an extra
+ // factor of x^-1 (or x using the natural interpretation). Nothing
+ // special needs to be done to make this happen, though: H^1 * H^1 would
+ // end up with two factors of x^-1, but the multiplication consumes one.
+ // So the product H^2 ends up with the desired one factor of x^-1.
+ _ghash_square H_CUR_XMM, H_INC_XMM, GFPOLY_XMM, %xmm0, %xmm1
+
+ // Create H_CUR_YMM = [H^2, H^1] and H_INC_YMM = [H^2, H^2].
+ vinserti128 $1, H_CUR_XMM, H_INC_YMM, H_CUR_YMM
+ vinserti128 $1, H_INC_XMM, H_INC_YMM, H_INC_YMM
+
+ // Create H_CUR = [H^4, H^3, H^2, H^1] and H_INC = [H^4, H^4, H^4, H^4].
+ _ghash_mul H_INC_YMM, H_CUR_YMM, H_INC_YMM, GFPOLY_YMM, \
+ %ymm0, %ymm1, %ymm2
+ vinserti64x4 $1, H_CUR_YMM, H_INC, H_CUR
+ vshufi64x2 $0, H_INC, H_INC, H_INC
+
+ // Store the lowest set of key powers.
+ vmovdqu8 H_CUR, (POWERS_PTR)
+
+ // Compute and store the remaining key powers.
+ // Repeatedly multiply [H^(i+3), H^(i+2), H^(i+1), H^i] by
+ // [H^4, H^4, H^4, H^4] to get [H^(i+7), H^(i+6), H^(i+5), H^(i+4)].
+ mov $3, %eax
+.Lprecompute_next:
+ sub $64, POWERS_PTR
+ _ghash_mul H_INC, H_CUR, H_CUR, GFPOLY, %zmm0, %zmm1, %zmm2
+ vmovdqu8 H_CUR, (POWERS_PTR)
+ dec %eax
+ jnz .Lprecompute_next
+
+ vzeroupper // This is needed after using ymm or zmm registers.
+ RET
+SYM_FUNC_END(aes_gcm_precompute_vaes_avx512)
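+
+// For reference, an illustrative Python sketch (reusing the hypothetical
+// ghash_mul model given after _ghash_mul) of the key power layout produced
+// above: |key->h_powers| ends up as [H^16, H^15, ..., H^1], with H^1 stored
+// in the last 16 bytes.
+//
+//   def precompute_h_powers(h1):
+//       powers = []
+//       cur = h1
+//       for _ in range(16):
+//           powers.insert(0, cur)      # higher powers end up first
+//           cur = ghash_mul(cur, h1)
+//       return powers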
+
+// XOR together the 128-bit lanes of \src (whose low lane is \src_xmm) and store
+// the result in \dst_xmm. This implicitly zeroizes the other lanes of dst.
+.macro _horizontal_xor src, src_xmm, dst_xmm, t0_xmm, t1_xmm, t2_xmm
+ vextracti32x4 $1, \src, \t0_xmm
+ vextracti32x4 $2, \src, \t1_xmm
+ vextracti32x4 $3, \src, \t2_xmm
+ vpxord \t0_xmm, \src_xmm, \dst_xmm
+ vpternlogd $0x96, \t1_xmm, \t2_xmm, \dst_xmm
+.endm
+
+// Do one step of the GHASH update of the data blocks given in the vector
+// registers GHASHDATA[0-3]. \i specifies the step to do, 0 through 9. The
+// division into steps allows users of this macro to optionally interleave the
+// computation with other instructions. This macro uses the vector register
+// GHASH_ACC as input/output; GHASHDATA[0-3] as inputs that are clobbered;
+// H_POW[4-1], GFPOLY, and BSWAP_MASK as inputs that aren't clobbered; and
+// GHASHTMP[0-2] as temporaries. This macro handles the byte-reflection of the
+// data blocks. The parameter registers must be preserved across steps.
+//
+// The GHASH update does: GHASH_ACC = H_POW4*(GHASHDATA0 + GHASH_ACC) +
+// H_POW3*GHASHDATA1 + H_POW2*GHASHDATA2 + H_POW1*GHASHDATA3, where the
+// operations are vectorized operations on 512-bit vectors of 128-bit blocks.
+// The vectorized terms correspond to the following non-vectorized terms:
+//
+// H_POW4*(GHASHDATA0 + GHASH_ACC) => H^16*(blk0 + GHASH_ACC_XMM),
+// H^15*(blk1 + 0), H^14*(blk2 + 0), and H^13*(blk3 + 0)
+// H_POW3*GHASHDATA1 => H^12*blk4, H^11*blk5, H^10*blk6, and H^9*blk7
+// H_POW2*GHASHDATA2 => H^8*blk8, H^7*blk9, H^6*blk10, and H^5*blk11
+// H_POW1*GHASHDATA3 => H^4*blk12, H^3*blk13, H^2*blk14, and H^1*blk15
+//
+// More concretely, this code does:
+// - Do vectorized "schoolbook" multiplications to compute the intermediate
+// 256-bit product of each block and its corresponding hash key power.
+// - Sum (XOR) the intermediate 256-bit products across vectors.
+// - Do a vectorized reduction of these 256-bit intermediate values to
+// 128-bits each.
+// - Sum (XOR) these values and store the 128-bit result in GHASH_ACC_XMM.
+//
+// See _ghash_mul_step for the full explanation of the operations performed for
+// each individual finite field multiplication and reduction.
+.macro _ghash_step_4x i
+.if \i == 0
+ vpshufb BSWAP_MASK, GHASHDATA0, GHASHDATA0
+ vpxord GHASH_ACC, GHASHDATA0, GHASHDATA0
+ vpshufb BSWAP_MASK, GHASHDATA1, GHASHDATA1
+ vpshufb BSWAP_MASK, GHASHDATA2, GHASHDATA2
+.elseif \i == 1
+ vpshufb BSWAP_MASK, GHASHDATA3, GHASHDATA3
+ vpclmulqdq $0x00, H_POW4, GHASHDATA0, GHASH_ACC // LO_0
+ vpclmulqdq $0x00, H_POW3, GHASHDATA1, GHASHTMP0 // LO_1
+ vpclmulqdq $0x00, H_POW2, GHASHDATA2, GHASHTMP1 // LO_2
+.elseif \i == 2
+ vpxord GHASHTMP0, GHASH_ACC, GHASH_ACC // sum(LO_{1,0})
+ vpclmulqdq $0x00, H_POW1, GHASHDATA3, GHASHTMP2 // LO_3
+ vpternlogd $0x96, GHASHTMP2, GHASHTMP1, GHASH_ACC // LO = sum(LO_{3,2,1,0})
+ vpclmulqdq $0x01, H_POW4, GHASHDATA0, GHASHTMP0 // MI_0
+.elseif \i == 3
+ vpclmulqdq $0x01, H_POW3, GHASHDATA1, GHASHTMP1 // MI_1
+ vpclmulqdq $0x01, H_POW2, GHASHDATA2, GHASHTMP2 // MI_2
+ vpternlogd $0x96, GHASHTMP2, GHASHTMP1, GHASHTMP0 // sum(MI_{2,1,0})
+ vpclmulqdq $0x01, H_POW1, GHASHDATA3, GHASHTMP1 // MI_3
+.elseif \i == 4
+ vpclmulqdq $0x10, H_POW4, GHASHDATA0, GHASHTMP2 // MI_4
+ vpternlogd $0x96, GHASHTMP2, GHASHTMP1, GHASHTMP0 // sum(MI_{4,3,2,1,0})
+ vpclmulqdq $0x10, H_POW3, GHASHDATA1, GHASHTMP1 // MI_5
+ vpclmulqdq $0x10, H_POW2, GHASHDATA2, GHASHTMP2 // MI_6
+.elseif \i == 5
+ vpternlogd $0x96, GHASHTMP2, GHASHTMP1, GHASHTMP0 // sum(MI_{6,5,4,3,2,1,0})
+ vpclmulqdq $0x01, GHASH_ACC, GFPOLY, GHASHTMP2 // LO_L*(x^63 + x^62 + x^57)
+ vpclmulqdq $0x10, H_POW1, GHASHDATA3, GHASHTMP1 // MI_7
+ vpxord GHASHTMP1, GHASHTMP0, GHASHTMP0 // MI = sum(MI_{7,6,5,4,3,2,1,0})
+.elseif \i == 6
+ vpshufd $0x4e, GHASH_ACC, GHASH_ACC // Swap halves of LO
+ vpclmulqdq $0x11, H_POW4, GHASHDATA0, GHASHDATA0 // HI_0
+ vpclmulqdq $0x11, H_POW3, GHASHDATA1, GHASHDATA1 // HI_1
+ vpclmulqdq $0x11, H_POW2, GHASHDATA2, GHASHDATA2 // HI_2
+.elseif \i == 7
+ vpternlogd $0x96, GHASHTMP2, GHASH_ACC, GHASHTMP0 // Fold LO into MI
+ vpclmulqdq $0x11, H_POW1, GHASHDATA3, GHASHDATA3 // HI_3
+ vpternlogd $0x96, GHASHDATA2, GHASHDATA1, GHASHDATA0 // sum(HI_{2,1,0})
+ vpclmulqdq $0x01, GHASHTMP0, GFPOLY, GHASHTMP1 // MI_L*(x^63 + x^62 + x^57)
+.elseif \i == 8
+ vpxord GHASHDATA3, GHASHDATA0, GHASH_ACC // HI = sum(HI_{3,2,1,0})
+ vpshufd $0x4e, GHASHTMP0, GHASHTMP0 // Swap halves of MI
+ vpternlogd $0x96, GHASHTMP1, GHASHTMP0, GHASH_ACC // Fold MI into HI
+.elseif \i == 9
+ _horizontal_xor GHASH_ACC, GHASH_ACC_XMM, GHASH_ACC_XMM, \
+ GHASHDATA0_XMM, GHASHDATA1_XMM, GHASHDATA2_XMM
+.endif
+.endm
+
+// Update GHASH with four vectors of data blocks. See _ghash_step_4x for full
+// explanation.
+.macro _ghash_4x
+.irp i, 0,1,2,3,4,5,6,7,8,9
+ _ghash_step_4x \i
+.endr
+.endm
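+
+// As a serial reference (illustrative Python, reusing the hypothetical
+// ghash_mul model given after _ghash_mul): absorbing N blocks into the
+// accumulator is just acc = (acc ^ blk) * H repeated, which is what the
+// vectorized H^N..H^1 form above computes.
+//
+//   def ghash_update(acc, blocks, h):
+//       for blk in blocks:
+//           acc = ghash_mul(acc ^ blk, h)
+//       return acc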
+
+// void aes_gcm_aad_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
+// u8 ghash_acc[16],
+// const u8 *aad, int aadlen);
+//
+// This function processes the AAD (Additional Authenticated Data) in GCM.
+// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the
+// data given by |aad| and |aadlen|. On the first call, |ghash_acc| must be all
+// zeroes. |aadlen| must be a multiple of 16, except on the last call where it
+// can be any length. The caller must do any buffering needed to ensure this.
+//
+// This handles large amounts of AAD efficiently, while also keeping overhead
+// low for small amounts, which is the common case. TLS and IPsec use less than
+// one block of AAD, but (uncommonly) other use cases may use much more.
+SYM_FUNC_START(aes_gcm_aad_update_vaes_avx512)
+
+ // Function arguments
+ .set KEY, %rdi
+ .set GHASH_ACC_PTR, %rsi
+ .set AAD, %rdx
+ .set AADLEN, %ecx
+ .set AADLEN64, %rcx // Zero-extend AADLEN before using!
+
+ // Additional local variables.
+ // %rax and %k1 are used as temporary registers.
+ .set GHASHDATA0, %zmm0
+ .set GHASHDATA0_XMM, %xmm0
+ .set GHASHDATA1, %zmm1
+ .set GHASHDATA1_XMM, %xmm1
+ .set GHASHDATA2, %zmm2
+ .set GHASHDATA2_XMM, %xmm2
+ .set GHASHDATA3, %zmm3
+ .set BSWAP_MASK, %zmm4
+ .set BSWAP_MASK_XMM, %xmm4
+ .set GHASH_ACC, %zmm5
+ .set GHASH_ACC_XMM, %xmm5
+ .set H_POW4, %zmm6
+ .set H_POW3, %zmm7
+ .set H_POW2, %zmm8
+ .set H_POW1, %zmm9
+ .set H_POW1_XMM, %xmm9
+ .set GFPOLY, %zmm10
+ .set GFPOLY_XMM, %xmm10
+ .set GHASHTMP0, %zmm11
+ .set GHASHTMP1, %zmm12
+ .set GHASHTMP2, %zmm13
+
+ // Load the GHASH accumulator.
+ vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM
+
+ // Check for the common case of AADLEN <= 16, as well as AADLEN == 0.
+ cmp $16, AADLEN
+ jg .Laad_more_than_16bytes
+ test AADLEN, AADLEN
+ jz .Laad_done
+
+ // Fast path: update GHASH with 1 <= AADLEN <= 16 bytes of AAD.
+ vmovdqu .Lbswap_mask(%rip), BSWAP_MASK_XMM
+ vmovdqu .Lgfpoly(%rip), GFPOLY_XMM
+ mov $-1, %eax
+ bzhi AADLEN, %eax, %eax
+ kmovd %eax, %k1
+ vmovdqu8 (AAD), GHASHDATA0_XMM{%k1}{z}
+ vmovdqu OFFSETOFEND_H_POWERS-16(KEY), H_POW1_XMM
+ vpshufb BSWAP_MASK_XMM, GHASHDATA0_XMM, GHASHDATA0_XMM
+ vpxor GHASHDATA0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM
+ _ghash_mul H_POW1_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM, GFPOLY_XMM, \
+ GHASHDATA0_XMM, GHASHDATA1_XMM, GHASHDATA2_XMM
+ jmp .Laad_done
+
+.Laad_more_than_16bytes:
+ vbroadcasti32x4 .Lbswap_mask(%rip), BSWAP_MASK
+ vbroadcasti32x4 .Lgfpoly(%rip), GFPOLY
+
+ // If AADLEN >= 256, update GHASH with 256 bytes of AAD at a time.
+ sub $256, AADLEN
+ jl .Laad_loop_4x_done
+ vmovdqu8 OFFSETOFEND_H_POWERS-4*64(KEY), H_POW4
+ vmovdqu8 OFFSETOFEND_H_POWERS-3*64(KEY), H_POW3
+ vmovdqu8 OFFSETOFEND_H_POWERS-2*64(KEY), H_POW2
+ vmovdqu8 OFFSETOFEND_H_POWERS-1*64(KEY), H_POW1
+.Laad_loop_4x:
+ vmovdqu8 0*64(AAD), GHASHDATA0
+ vmovdqu8 1*64(AAD), GHASHDATA1
+ vmovdqu8 2*64(AAD), GHASHDATA2
+ vmovdqu8 3*64(AAD), GHASHDATA3
+ _ghash_4x
+ add $256, AAD
+ sub $256, AADLEN
+ jge .Laad_loop_4x
+.Laad_loop_4x_done:
+
+ // If AADLEN >= 64, update GHASH with 64 bytes of AAD at a time.
+ add $192, AADLEN
+ jl .Laad_loop_1x_done
+ vmovdqu8 OFFSETOFEND_H_POWERS-1*64(KEY), H_POW1
+.Laad_loop_1x:
+ vmovdqu8 (AAD), GHASHDATA0
+ vpshufb BSWAP_MASK, GHASHDATA0, GHASHDATA0
+ vpxord GHASHDATA0, GHASH_ACC, GHASH_ACC
+ _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
+ GHASHDATA0, GHASHDATA1, GHASHDATA2
+ _horizontal_xor GHASH_ACC, GHASH_ACC_XMM, GHASH_ACC_XMM, \
+ GHASHDATA0_XMM, GHASHDATA1_XMM, GHASHDATA2_XMM
+ add $64, AAD
+ sub $64, AADLEN
+ jge .Laad_loop_1x
+.Laad_loop_1x_done:
+
+ // Update GHASH with the remaining 0 <= AADLEN < 64 bytes of AAD.
+ add $64, AADLEN
+ jz .Laad_done
+ mov $-1, %rax
+ bzhi AADLEN64, %rax, %rax
+ kmovq %rax, %k1
+ vmovdqu8 (AAD), GHASHDATA0{%k1}{z}
+ neg AADLEN64
+ and $~15, AADLEN64 // -round_up(AADLEN, 16)
+ vmovdqu8 OFFSETOFEND_H_POWERS(KEY,AADLEN64), H_POW1
+ vpshufb BSWAP_MASK, GHASHDATA0, GHASHDATA0
+ vpxord GHASHDATA0, GHASH_ACC, GHASH_ACC
+ _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
+ GHASHDATA0, GHASHDATA1, GHASHDATA2
+ _horizontal_xor GHASH_ACC, GHASH_ACC_XMM, GHASH_ACC_XMM, \
+ GHASHDATA0_XMM, GHASHDATA1_XMM, GHASHDATA2_XMM
+
+.Laad_done:
+ // Store the updated GHASH accumulator back to memory.
+ vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR)
+
+ vzeroupper // This is needed after using ymm or zmm registers.
+ RET
+SYM_FUNC_END(aes_gcm_aad_update_vaes_avx512)
+
+// Do one non-last round of AES encryption on the blocks in %zmm[0-3] using the
+// round key that has been broadcast to all 128-bit lanes of \round_key.
+.macro _vaesenc_4x round_key
+ vaesenc \round_key, %zmm0, %zmm0
+ vaesenc \round_key, %zmm1, %zmm1
+ vaesenc \round_key, %zmm2, %zmm2
+ vaesenc \round_key, %zmm3, %zmm3
+.endm
+
+// Start the AES encryption of four vectors of counter blocks.
+.macro _ctr_begin_4x
+
+ // Increment LE_CTR four times to generate four vectors of little-endian
+ // counter blocks, swap each to big-endian, and store them in %zmm[0-3].
+ vpshufb BSWAP_MASK, LE_CTR, %zmm0
+ vpaddd LE_CTR_INC, LE_CTR, LE_CTR
+ vpshufb BSWAP_MASK, LE_CTR, %zmm1
+ vpaddd LE_CTR_INC, LE_CTR, LE_CTR
+ vpshufb BSWAP_MASK, LE_CTR, %zmm2
+ vpaddd LE_CTR_INC, LE_CTR, LE_CTR
+ vpshufb BSWAP_MASK, LE_CTR, %zmm3
+ vpaddd LE_CTR_INC, LE_CTR, LE_CTR
+
+ // AES "round zero": XOR in the zero-th round key.
+ vpxord RNDKEY0, %zmm0, %zmm0
+ vpxord RNDKEY0, %zmm1, %zmm1
+ vpxord RNDKEY0, %zmm2, %zmm2
+ vpxord RNDKEY0, %zmm3, %zmm3
+.endm
+
+// Do the last AES round for four vectors of counter blocks %zmm[0-3], XOR
+// source data with the resulting keystream, and write the result to DST and
+// GHASHDATA[0-3]. (Implementation differs slightly, but has the same effect.)
+.macro _aesenclast_and_xor_4x
+ // XOR the source data with the last round key, saving the result in
+ // GHASHDATA[0-3]. This reduces latency by taking advantage of the
+ // property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).
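+	// To see why: vaesenclast(key, a) = SubBytes(ShiftRows(a)) ^ key, so
+	// vaesenclast(key, a) ^ b = SubBytes(ShiftRows(a)) ^ (key ^ b)
+	//                         = vaesenclast(key ^ b, a).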
+ vpxord 0*64(SRC), RNDKEYLAST, GHASHDATA0
+ vpxord 1*64(SRC), RNDKEYLAST, GHASHDATA1
+ vpxord 2*64(SRC), RNDKEYLAST, GHASHDATA2
+ vpxord 3*64(SRC), RNDKEYLAST, GHASHDATA3
+
+ // Do the last AES round. This handles the XOR with the source data
+ // too, as per the optimization described above.
+ vaesenclast GHASHDATA0, %zmm0, GHASHDATA0
+ vaesenclast GHASHDATA1, %zmm1, GHASHDATA1
+ vaesenclast GHASHDATA2, %zmm2, GHASHDATA2
+ vaesenclast GHASHDATA3, %zmm3, GHASHDATA3
+
+ // Store the en/decrypted data to DST.
+ vmovdqu8 GHASHDATA0, 0*64(DST)
+ vmovdqu8 GHASHDATA1, 1*64(DST)
+ vmovdqu8 GHASHDATA2, 2*64(DST)
+ vmovdqu8 GHASHDATA3, 3*64(DST)
+.endm
+
+// void aes_gcm_{enc,dec}_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
+// const u32 le_ctr[4], u8 ghash_acc[16],
+// const u8 *src, u8 *dst, int datalen);
+//
+// This macro generates a GCM encryption or decryption update function with the
+// above prototype (with \enc selecting which one). The function computes the
+// next portion of the CTR keystream, XOR's it with |datalen| bytes from |src|,
+// and writes the resulting encrypted or decrypted data to |dst|. It also
+// updates the GHASH accumulator |ghash_acc| using the next |datalen| ciphertext
+// bytes.
+//
+// |datalen| must be a multiple of 16, except on the last call where it can be
+// any length. The caller must do any buffering needed to ensure this. Both
+// in-place and out-of-place en/decryption are supported.
+//
+// |le_ctr| must give the current counter in little-endian format. This
+// function loads the counter from |le_ctr| and increments the loaded counter as
+// needed, but it does *not* store the updated counter back to |le_ctr|. The
+// caller must update |le_ctr| if any more data segments follow. Internally,
+// only the low 32-bit word of the counter is incremented, following the GCM
+// standard.
+.macro _aes_gcm_update enc
+
+ // Function arguments
+ .set KEY, %rdi
+ .set LE_CTR_PTR, %rsi
+ .set GHASH_ACC_PTR, %rdx
+ .set SRC, %rcx
+ .set DST, %r8
+ .set DATALEN, %r9d
+ .set DATALEN64, %r9 // Zero-extend DATALEN before using!
+
+ // Additional local variables
+
+ // %rax and %k1 are used as temporary registers. LE_CTR_PTR is also
+ // available as a temporary register after the counter is loaded.
+
+ // AES key length in bytes
+ .set AESKEYLEN, %r10d
+ .set AESKEYLEN64, %r10
+
+ // Pointer to the last AES round key for the chosen AES variant
+ .set RNDKEYLAST_PTR, %r11
+
+ // In the main loop, %zmm[0-3] are used as AES input and output.
+ // Elsewhere they are used as temporary registers.
+
+ // GHASHDATA[0-3] hold the ciphertext blocks and GHASH input data.
+ .set GHASHDATA0, %zmm4
+ .set GHASHDATA0_XMM, %xmm4
+ .set GHASHDATA1, %zmm5
+ .set GHASHDATA1_XMM, %xmm5
+ .set GHASHDATA2, %zmm6
+ .set GHASHDATA2_XMM, %xmm6
+ .set GHASHDATA3, %zmm7
+
+ // BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values
+ // using vpshufb, copied to all 128-bit lanes.
+ .set BSWAP_MASK, %zmm8
+
+ // RNDKEY temporarily holds the next AES round key.
+ .set RNDKEY, %zmm9
+
+ // GHASH_ACC is the accumulator variable for GHASH. When fully reduced,
+ // only the lowest 128-bit lane can be nonzero. When not fully reduced,
+ // more than one lane may be used, and they need to be XOR'd together.
+ .set GHASH_ACC, %zmm10
+ .set GHASH_ACC_XMM, %xmm10
+
+ // LE_CTR_INC is the vector of 32-bit words that need to be added to a
+ // vector of little-endian counter blocks to advance it forwards.
+ .set LE_CTR_INC, %zmm11
+
+ // LE_CTR contains the next set of little-endian counter blocks.
+ .set LE_CTR, %zmm12
+
+ // RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-1] contain cached AES round keys,
+ // copied to all 128-bit lanes. RNDKEY0 is the zero-th round key,
+ // RNDKEYLAST the last, and RNDKEY_M\i the one \i-th from the last.
+ .set RNDKEY0, %zmm13
+ .set RNDKEYLAST, %zmm14
+ .set RNDKEY_M9, %zmm15
+ .set RNDKEY_M8, %zmm16
+ .set RNDKEY_M7, %zmm17
+ .set RNDKEY_M6, %zmm18
+ .set RNDKEY_M5, %zmm19
+ .set RNDKEY_M4, %zmm20
+ .set RNDKEY_M3, %zmm21
+ .set RNDKEY_M2, %zmm22
+ .set RNDKEY_M1, %zmm23
+
+ // GHASHTMP[0-2] are temporary variables used by _ghash_step_4x. These
+ // cannot coincide with anything used for AES encryption, since for
+ // performance reasons GHASH and AES encryption are interleaved.
+ .set GHASHTMP0, %zmm24
+ .set GHASHTMP1, %zmm25
+ .set GHASHTMP2, %zmm26
+
+ // H_POW[4-1] contain the powers of the hash key H^16...H^1. The
+ // descending numbering reflects the order of the key powers.
+ .set H_POW4, %zmm27
+ .set H_POW3, %zmm28
+ .set H_POW2, %zmm29
+ .set H_POW1, %zmm30
+
+ // GFPOLY contains the .Lgfpoly constant, copied to all 128-bit lanes.
+ .set GFPOLY, %zmm31
+
+ // Load some constants.
+ vbroadcasti32x4 .Lbswap_mask(%rip), BSWAP_MASK
+ vbroadcasti32x4 .Lgfpoly(%rip), GFPOLY
+
+ // Load the GHASH accumulator and the starting counter.
+ vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM
+ vbroadcasti32x4 (LE_CTR_PTR), LE_CTR
+
+ // Load the AES key length in bytes.
+ movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN
+
+ // Make RNDKEYLAST_PTR point to the last AES round key. This is the
+ // round key with index 10, 12, or 14 for AES-128, AES-192, or AES-256
+ // respectively. Then load the zero-th and last round keys.
+ lea 6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR
+ vbroadcasti32x4 (KEY), RNDKEY0
+ vbroadcasti32x4 (RNDKEYLAST_PTR), RNDKEYLAST
+
+ // Finish initializing LE_CTR by adding [0, 1, ...] to its low words.
+ vpaddd .Lctr_pattern(%rip), LE_CTR, LE_CTR
+
+ // Load 4 into all 128-bit lanes of LE_CTR_INC.
+ vbroadcasti32x4 .Linc_4blocks(%rip), LE_CTR_INC
+
+ // If there are at least 256 bytes of data, then continue into the loop
+ // that processes 256 bytes of data at a time. Otherwise skip it.
+ //
+ // Pre-subtracting 256 from DATALEN saves an instruction from the main
+ // loop and also ensures that at least one write always occurs to
+ // DATALEN, zero-extending it and allowing DATALEN64 to be used later.
+ sub $256, DATALEN
+ jl .Lcrypt_loop_4x_done\@
+
+ // Load powers of the hash key.
+ vmovdqu8 OFFSETOFEND_H_POWERS-4*64(KEY), H_POW4
+ vmovdqu8 OFFSETOFEND_H_POWERS-3*64(KEY), H_POW3
+ vmovdqu8 OFFSETOFEND_H_POWERS-2*64(KEY), H_POW2
+ vmovdqu8 OFFSETOFEND_H_POWERS-1*64(KEY), H_POW1
+
+ // Main loop: en/decrypt and hash 4 vectors at a time.
+ //
+ // When possible, interleave the AES encryption of the counter blocks
+ // with the GHASH update of the ciphertext blocks. This improves
+ // performance on many CPUs because the execution ports used by the VAES
+ // instructions often differ from those used by vpclmulqdq and other
+ // instructions used in GHASH. For example, many Intel CPUs dispatch
+ // vaesenc to ports 0 and 1 and vpclmulqdq to port 5.
+ //
+ // The interleaving is easiest to do during decryption, since during
+ // decryption the ciphertext blocks are immediately available. For
+ // encryption, instead encrypt the first set of blocks, then hash those
+ // blocks while encrypting the next set of blocks, repeat that as
+ // needed, and finally hash the last set of blocks.
+
+.if \enc
+ // Encrypt the first 4 vectors of plaintext blocks. Leave the resulting
+ // ciphertext in GHASHDATA[0-3] for GHASH.
+ _ctr_begin_4x
+ lea 16(KEY), %rax
+1:
+ vbroadcasti32x4 (%rax), RNDKEY
+ _vaesenc_4x RNDKEY
+ add $16, %rax
+ cmp %rax, RNDKEYLAST_PTR
+ jne 1b
+ _aesenclast_and_xor_4x
+ add $256, SRC
+ add $256, DST
+ sub $256, DATALEN
+ jl .Lghash_last_ciphertext_4x\@
+.endif
+
+ // Cache as many additional AES round keys as possible.
+.irp i, 9,8,7,6,5,4,3,2,1
+ vbroadcasti32x4 -\i*16(RNDKEYLAST_PTR), RNDKEY_M\i
+.endr
+
+.Lcrypt_loop_4x\@:
+
+ // If decrypting, load more ciphertext blocks into GHASHDATA[0-3]. If
+ // encrypting, GHASHDATA[0-3] already contain the previous ciphertext.
+.if !\enc
+ vmovdqu8 0*64(SRC), GHASHDATA0
+ vmovdqu8 1*64(SRC), GHASHDATA1
+ vmovdqu8 2*64(SRC), GHASHDATA2
+ vmovdqu8 3*64(SRC), GHASHDATA3
+.endif
+
+ // Start the AES encryption of the counter blocks.
+ _ctr_begin_4x
+ cmp $24, AESKEYLEN
+ jl 128f // AES-128?
+ je 192f // AES-192?
+ // AES-256
+ vbroadcasti32x4 -13*16(RNDKEYLAST_PTR), RNDKEY
+ _vaesenc_4x RNDKEY
+ vbroadcasti32x4 -12*16(RNDKEYLAST_PTR), RNDKEY
+ _vaesenc_4x RNDKEY
+192:
+ vbroadcasti32x4 -11*16(RNDKEYLAST_PTR), RNDKEY
+ _vaesenc_4x RNDKEY
+ vbroadcasti32x4 -10*16(RNDKEYLAST_PTR), RNDKEY
+ _vaesenc_4x RNDKEY
+128:
+
+ // Finish the AES encryption of the counter blocks in %zmm[0-3],
+ // interleaved with the GHASH update of the ciphertext blocks in
+ // GHASHDATA[0-3].
+.irp i, 9,8,7,6,5,4,3,2,1
+ _ghash_step_4x (9 - \i)
+ _vaesenc_4x RNDKEY_M\i
+.endr
+ _ghash_step_4x 9
+ _aesenclast_and_xor_4x
+ add $256, SRC
+ add $256, DST
+ sub $256, DATALEN
+ jge .Lcrypt_loop_4x\@
+
+.if \enc
+.Lghash_last_ciphertext_4x\@:
+ // Update GHASH with the last set of ciphertext blocks.
+ _ghash_4x
+.endif
+
+.Lcrypt_loop_4x_done\@:
+
+ // Undo the extra subtraction by 256 and check whether data remains.
+ add $256, DATALEN
+ jz .Ldone\@
+
+ // The data length isn't a multiple of 256 bytes. Process the remaining
+ // data of length 1 <= DATALEN < 256, up to one 64-byte vector at a
+ // time. Going one vector at a time may seem inefficient compared to
+ // having separate code paths for each possible number of vectors
+ // remaining. However, using a loop keeps the code size down, and it
+	// performs surprisingly well; modern CPUs will start executing the next
+ // iteration before the previous one finishes and also predict the
+ // number of loop iterations. For a similar reason, we roll up the AES
+ // rounds.
+ //
+ // On the last iteration, the remaining length may be less than 64
+ // bytes. Handle this using masking.
+ //
+ // Since there are enough key powers available for all remaining data,
+ // there is no need to do a GHASH reduction after each iteration.
+ // Instead, multiply each remaining block by its own key power, and only
+ // do a GHASH reduction at the very end.
+
+ // Make POWERS_PTR point to the key powers [H^N, H^(N-1), ...] where N
+ // is the number of blocks that remain.
+ .set POWERS_PTR, LE_CTR_PTR // LE_CTR_PTR is free to be reused.
+ mov DATALEN, %eax
+ neg %rax
+ and $~15, %rax // -round_up(DATALEN, 16)
+ lea OFFSETOFEND_H_POWERS(KEY,%rax), POWERS_PTR
+
+ // Start collecting the unreduced GHASH intermediate value LO, MI, HI.
+ .set LO, GHASHDATA0
+ .set LO_XMM, GHASHDATA0_XMM
+ .set MI, GHASHDATA1
+ .set MI_XMM, GHASHDATA1_XMM
+ .set HI, GHASHDATA2
+ .set HI_XMM, GHASHDATA2_XMM
+ vpxor LO_XMM, LO_XMM, LO_XMM
+ vpxor MI_XMM, MI_XMM, MI_XMM
+ vpxor HI_XMM, HI_XMM, HI_XMM
+
+.Lcrypt_loop_1x\@:
+
+ // Select the appropriate mask for this iteration: all 1's if
+ // DATALEN >= 64, otherwise DATALEN 1's. Do this branchlessly using the
+ // bzhi instruction from BMI2. (This relies on DATALEN <= 255.)
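+	// Equivalently, in illustrative Python (not part of this file):
+	//
+	//   def remaining_byte_mask(datalen):
+	//       # bzhi uses only the low 8 bits of the index and leaves the
+	//       # source unchanged when the index >= 64.
+	//       n = datalen & 0xff
+	//       return (1 << 64) - 1 if n >= 64 else (1 << n) - 1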
+ mov $-1, %rax
+ bzhi DATALEN64, %rax, %rax
+ kmovq %rax, %k1
+
+ // Encrypt a vector of counter blocks. This does not need to be masked.
+ vpshufb BSWAP_MASK, LE_CTR, %zmm0
+ vpaddd LE_CTR_INC, LE_CTR, LE_CTR
+ vpxord RNDKEY0, %zmm0, %zmm0
+ lea 16(KEY), %rax
+1:
+ vbroadcasti32x4 (%rax), RNDKEY
+ vaesenc RNDKEY, %zmm0, %zmm0
+ add $16, %rax
+ cmp %rax, RNDKEYLAST_PTR
+ jne 1b
+ vaesenclast RNDKEYLAST, %zmm0, %zmm0
+
+ // XOR the data with the appropriate number of keystream bytes.
+ vmovdqu8 (SRC), %zmm1{%k1}{z}
+ vpxord %zmm1, %zmm0, %zmm0
+ vmovdqu8 %zmm0, (DST){%k1}
+
+ // Update GHASH with the ciphertext block(s), without reducing.
+ //
+ // In the case of DATALEN < 64, the ciphertext is zero-padded to 64
+ // bytes. (If decrypting, it's done by the above masked load. If
+ // encrypting, it's done by the below masked register-to-register move.)
+ // Note that if DATALEN <= 48, there will be additional padding beyond
+ // the padding of the last block specified by GHASH itself; i.e., there
+ // may be whole block(s) that get processed by the GHASH multiplication
+ // and reduction instructions but should not actually be included in the
+ // GHASH. However, any such blocks are all-zeroes, and the values that
+ // they're multiplied with are also all-zeroes. Therefore they just add
+ // 0 * 0 = 0 to the final GHASH result, which makes no difference.
+ vmovdqu8 (POWERS_PTR), H_POW1
+.if \enc
+ vmovdqu8 %zmm0, %zmm1{%k1}{z}
+.endif
+ vpshufb BSWAP_MASK, %zmm1, %zmm0
+ vpxord GHASH_ACC, %zmm0, %zmm0
+ _ghash_mul_noreduce H_POW1, %zmm0, LO, MI, HI, \
+ GHASHDATA3, %zmm1, %zmm2, %zmm3
+ vpxor GHASH_ACC_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM
+
+ add $64, POWERS_PTR
+ add $64, SRC
+ add $64, DST
+ sub $64, DATALEN
+ jg .Lcrypt_loop_1x\@
+
+ // Finally, do the GHASH reduction.
+ _ghash_reduce LO, MI, HI, GFPOLY, %zmm0
+ _horizontal_xor HI, HI_XMM, GHASH_ACC_XMM, %xmm0, %xmm1, %xmm2
+
+.Ldone\@:
+ // Store the updated GHASH accumulator back to memory.
+ vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR)
+
+ vzeroupper // This is needed after using ymm or zmm registers.
+ RET
+.endm
+
+// void aes_gcm_enc_final_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
+// const u32 le_ctr[4], u8 ghash_acc[16],
+// u64 total_aadlen, u64 total_datalen);
+// bool aes_gcm_dec_final_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
+// const u32 le_ctr[4],
+// const u8 ghash_acc[16],
+// u64 total_aadlen, u64 total_datalen,
+// const u8 tag[16], int taglen);
+//
+// This macro generates one of the above two functions (with \enc selecting
+// which one). Both functions finish computing the GCM authentication tag by
+// updating GHASH with the lengths block and encrypting the GHASH accumulator.
+// |total_aadlen| and |total_datalen| must be the total length of the additional
+// authenticated data and the en/decrypted data in bytes, respectively.
+//
+// The encryption function then stores the full-length (16-byte) computed
+// authentication tag to |ghash_acc|. The decryption function instead loads the
+// expected authentication tag (the one that was transmitted) from the 16-byte
+// buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the
+// computed tag in constant time, and returns true if and only if they match.
+.macro _aes_gcm_final enc
+
+ // Function arguments
+ .set KEY, %rdi
+ .set LE_CTR_PTR, %rsi
+ .set GHASH_ACC_PTR, %rdx
+ .set TOTAL_AADLEN, %rcx
+ .set TOTAL_DATALEN, %r8
+ .set TAG, %r9
+ .set TAGLEN, %r10d // Originally at 8(%rsp)
+
+ // Additional local variables.
+ // %rax, %xmm0-%xmm3, and %k1 are used as temporary registers.
+ .set AESKEYLEN, %r11d
+ .set AESKEYLEN64, %r11
+ .set GFPOLY, %xmm4
+ .set BSWAP_MASK, %xmm5
+ .set LE_CTR, %xmm6
+ .set GHASH_ACC, %xmm7
+ .set H_POW1, %xmm8
+
+ // Load some constants.
+ vmovdqa .Lgfpoly(%rip), GFPOLY
+ vmovdqa .Lbswap_mask(%rip), BSWAP_MASK
+
+ // Load the AES key length in bytes.
+ movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN
+
+ // Set up a counter block with 1 in the low 32-bit word. This is the
+ // counter that produces the ciphertext needed to encrypt the auth tag.
+ // GFPOLY has 1 in the low word, so grab the 1 from there using a blend.
+ vpblendd $0xe, (LE_CTR_PTR), GFPOLY, LE_CTR
+
+ // Build the lengths block and XOR it with the GHASH accumulator.
+ // Although the lengths block is defined as the AAD length followed by
+ // the en/decrypted data length, both in big-endian byte order, a byte
+ // reflection of the full block is needed because of the way we compute
+ // GHASH (see _ghash_mul_step). By using little-endian values in the
+ // opposite order, we avoid having to reflect any bytes here.
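+	//
+	// As an illustrative Python sketch (not part of this file), the 16
+	// bytes built below are:
+	//
+	//   import struct
+	//   def lengths_block(total_aadlen, total_datalen):
+	//       # Byte reflection of the big-endian (AAD || data) block.
+	//       return struct.pack("<QQ", 8 * total_datalen, 8 * total_aadlen)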
+ vmovq TOTAL_DATALEN, %xmm0
+ vpinsrq $1, TOTAL_AADLEN, %xmm0, %xmm0
+ vpsllq $3, %xmm0, %xmm0 // Bytes to bits
+ vpxor (GHASH_ACC_PTR), %xmm0, GHASH_ACC
+
+ // Load the first hash key power (H^1), which is stored last.
+ vmovdqu8 OFFSETOFEND_H_POWERS-16(KEY), H_POW1
+
+.if !\enc
+ // Prepare a mask of TAGLEN one bits.
+ movl 8(%rsp), TAGLEN
+ mov $-1, %eax
+ bzhi TAGLEN, %eax, %eax
+ kmovd %eax, %k1
+.endif
+
+ // Make %rax point to the last AES round key for the chosen AES variant.
+ lea 6*16(KEY,AESKEYLEN64,4), %rax
+
+ // Start the AES encryption of the counter block by swapping the counter
+ // block to big-endian and XOR-ing it with the zero-th AES round key.
+ vpshufb BSWAP_MASK, LE_CTR, %xmm0
+ vpxor (KEY), %xmm0, %xmm0
+
+ // Complete the AES encryption and multiply GHASH_ACC by H^1.
+ // Interleave the AES and GHASH instructions to improve performance.
+ cmp $24, AESKEYLEN
+ jl 128f // AES-128?
+ je 192f // AES-192?
+ // AES-256
+ vaesenc -13*16(%rax), %xmm0, %xmm0
+ vaesenc -12*16(%rax), %xmm0, %xmm0
+192:
+ vaesenc -11*16(%rax), %xmm0, %xmm0
+ vaesenc -10*16(%rax), %xmm0, %xmm0
+128:
+.irp i, 0,1,2,3,4,5,6,7,8
+ _ghash_mul_step \i, H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
+ %xmm1, %xmm2, %xmm3
+ vaesenc (\i-9)*16(%rax), %xmm0, %xmm0
+.endr
+ _ghash_mul_step 9, H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
+ %xmm1, %xmm2, %xmm3
+
+ // Undo the byte reflection of the GHASH accumulator.
+ vpshufb BSWAP_MASK, GHASH_ACC, GHASH_ACC
+
+ // Do the last AES round and XOR the resulting keystream block with the
+ // GHASH accumulator to produce the full computed authentication tag.
+ //
+ // Reduce latency by taking advantage of the property vaesenclast(key,
+ // a) ^ b == vaesenclast(key ^ b, a). I.e., XOR GHASH_ACC into the last
+ // round key, instead of XOR'ing the final AES output with GHASH_ACC.
+ //
+ // enc_final then returns the computed auth tag, while dec_final
+ // compares it with the transmitted one and returns a bool. To compare
+ // the tags, dec_final XORs them together and uses vptest to check
+ // whether the result is all-zeroes. This should be constant-time.
+ // dec_final applies the vaesenclast optimization to this additional
+ // value XOR'd too, using vpternlogd to XOR the last round key, GHASH
+ // accumulator, and transmitted auth tag together in one instruction.
+.if \enc
+ vpxor (%rax), GHASH_ACC, %xmm1
+ vaesenclast %xmm1, %xmm0, GHASH_ACC
+ vmovdqu GHASH_ACC, (GHASH_ACC_PTR)
+.else
+ vmovdqu (TAG), %xmm1
+ vpternlogd $0x96, (%rax), GHASH_ACC, %xmm1
+ vaesenclast %xmm1, %xmm0, %xmm0
+ xor %eax, %eax
+ vmovdqu8 %xmm0, %xmm0{%k1}{z} // Truncate to TAGLEN bytes
+ vptest %xmm0, %xmm0
+ sete %al
+.endif
+	// No need for vzeroupper here, since only xmm registers were used.
+ RET
+.endm
+
+SYM_FUNC_START(aes_gcm_enc_update_vaes_avx512)
+ _aes_gcm_update 1
+SYM_FUNC_END(aes_gcm_enc_update_vaes_avx512)
+SYM_FUNC_START(aes_gcm_dec_update_vaes_avx512)
+ _aes_gcm_update 0
+SYM_FUNC_END(aes_gcm_dec_update_vaes_avx512)
+
+SYM_FUNC_START(aes_gcm_enc_final_vaes_avx512)
+ _aes_gcm_final 1
+SYM_FUNC_END(aes_gcm_enc_final_vaes_avx512)
+SYM_FUNC_START(aes_gcm_dec_final_vaes_avx512)
+ _aes_gcm_final 0
+SYM_FUNC_END(aes_gcm_dec_final_vaes_avx512)
diff --git a/arch/x86/crypto/aes-i586-asm_32.S b/arch/x86/crypto/aes-i586-asm_32.S
deleted file mode 100644
index b949ec2f9af4..000000000000
--- a/arch/x86/crypto/aes-i586-asm_32.S
+++ /dev/null
@@ -1,367 +0,0 @@
-// -------------------------------------------------------------------------
-// Copyright (c) 2001, Dr Brian Gladman < >, Worcester, UK.
-// All rights reserved.
-//
-// LICENSE TERMS
-//
-// The free distribution and use of this software in both source and binary
-// form is allowed (with or without changes) provided that:
-//
-// 1. distributions of this source code include the above copyright
-// notice, this list of conditions and the following disclaimer//
-//
-// 2. distributions in binary form include the above copyright
-// notice, this list of conditions and the following disclaimer
-// in the documentation and/or other associated materials//
-//
-// 3. the copyright holder's name is not used to endorse products
-// built using this software without specific written permission.
-//
-//
-// ALTERNATIVELY, provided that this notice is retained in full, this product
-// may be distributed under the terms of the GNU General Public License (GPL),
-// in which case the provisions of the GPL apply INSTEAD OF those given above.
-//
-// Copyright (c) 2004 Linus Torvalds <torvalds@osdl.org>
-// Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
-
-// DISCLAIMER
-//
-// This software is provided 'as is' with no explicit or implied warranties
-// in respect of its properties including, but not limited to, correctness
-// and fitness for purpose.
-// -------------------------------------------------------------------------
-// Issue Date: 29/07/2002
-
-.file "aes-i586-asm.S"
-.text
-
-#include <asm/asm-offsets.h>
-
-#define tlen 1024 // length of each of 4 'xor' arrays (256 32-bit words)
-
-/* offsets to parameters with one register pushed onto stack */
-#define ctx 8
-#define out_blk 12
-#define in_blk 16
-
-/* offsets in crypto_aes_ctx structure */
-#define klen (480)
-#define ekey (0)
-#define dkey (240)
-
-// register mapping for encrypt and decrypt subroutines
-
-#define r0 eax
-#define r1 ebx
-#define r2 ecx
-#define r3 edx
-#define r4 esi
-#define r5 edi
-
-#define eaxl al
-#define eaxh ah
-#define ebxl bl
-#define ebxh bh
-#define ecxl cl
-#define ecxh ch
-#define edxl dl
-#define edxh dh
-
-#define _h(reg) reg##h
-#define h(reg) _h(reg)
-
-#define _l(reg) reg##l
-#define l(reg) _l(reg)
-
-// This macro takes a 32-bit word representing a column and uses
-// each of its four bytes to index into four tables of 256 32-bit
-// words to obtain values that are then xored into the appropriate
-// output registers r0, r1, r4 or r5.
-
-// Parameters:
-// table table base address
-// %1 out_state[0]
-// %2 out_state[1]
-// %3 out_state[2]
-// %4 out_state[3]
-// idx input register for the round (destroyed)
-// tmp scratch register for the round
-// sched key schedule
-
-#define do_col(table, a1,a2,a3,a4, idx, tmp) \
- movzx %l(idx),%tmp; \
- xor table(,%tmp,4),%a1; \
- movzx %h(idx),%tmp; \
- shr $16,%idx; \
- xor table+tlen(,%tmp,4),%a2; \
- movzx %l(idx),%tmp; \
- movzx %h(idx),%idx; \
- xor table+2*tlen(,%tmp,4),%a3; \
- xor table+3*tlen(,%idx,4),%a4;
-
-// initialise output registers from the key schedule
-// NB1: original value of a3 is in idx on exit
-// NB2: original values of a1,a2,a4 aren't used
-#define do_fcol(table, a1,a2,a3,a4, idx, tmp, sched) \
- mov 0 sched,%a1; \
- movzx %l(idx),%tmp; \
- mov 12 sched,%a2; \
- xor table(,%tmp,4),%a1; \
- mov 4 sched,%a4; \
- movzx %h(idx),%tmp; \
- shr $16,%idx; \
- xor table+tlen(,%tmp,4),%a2; \
- movzx %l(idx),%tmp; \
- movzx %h(idx),%idx; \
- xor table+3*tlen(,%idx,4),%a4; \
- mov %a3,%idx; \
- mov 8 sched,%a3; \
- xor table+2*tlen(,%tmp,4),%a3;
-
-// initialise output registers from the key schedule
-// NB1: original value of a3 is in idx on exit
-// NB2: original values of a1,a2,a4 aren't used
-#define do_icol(table, a1,a2,a3,a4, idx, tmp, sched) \
- mov 0 sched,%a1; \
- movzx %l(idx),%tmp; \
- mov 4 sched,%a2; \
- xor table(,%tmp,4),%a1; \
- mov 12 sched,%a4; \
- movzx %h(idx),%tmp; \
- shr $16,%idx; \
- xor table+tlen(,%tmp,4),%a2; \
- movzx %l(idx),%tmp; \
- movzx %h(idx),%idx; \
- xor table+3*tlen(,%idx,4),%a4; \
- mov %a3,%idx; \
- mov 8 sched,%a3; \
- xor table+2*tlen(,%tmp,4),%a3;
-
-
-// original Gladman had conditional saves to MMX regs.
-#define save(a1, a2) \
- mov %a2,4*a1(%esp)
-
-#define restore(a1, a2) \
- mov 4*a2(%esp),%a1
-
-// These macros perform a forward encryption cycle. They are entered with
-// the first previous round column values in r0,r1,r4,r5 and
-// exit with the final values in the same registers, using stack
-// for temporary storage.
-
-// round column values
-// on entry: r0,r1,r4,r5
-// on exit: r2,r1,r4,r5
-#define fwd_rnd1(arg, table) \
- save (0,r1); \
- save (1,r5); \
- \
- /* compute new column values */ \
- do_fcol(table, r2,r5,r4,r1, r0,r3, arg); /* idx=r0 */ \
- do_col (table, r4,r1,r2,r5, r0,r3); /* idx=r4 */ \
- restore(r0,0); \
- do_col (table, r1,r2,r5,r4, r0,r3); /* idx=r1 */ \
- restore(r0,1); \
- do_col (table, r5,r4,r1,r2, r0,r3); /* idx=r5 */
-
-// round column values
-// on entry: r2,r1,r4,r5
-// on exit: r0,r1,r4,r5
-#define fwd_rnd2(arg, table) \
- save (0,r1); \
- save (1,r5); \
- \
- /* compute new column values */ \
- do_fcol(table, r0,r5,r4,r1, r2,r3, arg); /* idx=r2 */ \
- do_col (table, r4,r1,r0,r5, r2,r3); /* idx=r4 */ \
- restore(r2,0); \
- do_col (table, r1,r0,r5,r4, r2,r3); /* idx=r1 */ \
- restore(r2,1); \
- do_col (table, r5,r4,r1,r0, r2,r3); /* idx=r5 */
-
-// These macros performs an inverse encryption cycle. They are entered with
-// the first previous round column values in r0,r1,r4,r5 and
-// exit with the final values in the same registers, using stack
-// for temporary storage
-
-// round column values
-// on entry: r0,r1,r4,r5
-// on exit: r2,r1,r4,r5
-#define inv_rnd1(arg, table) \
- save (0,r1); \
- save (1,r5); \
- \
- /* compute new column values */ \
- do_icol(table, r2,r1,r4,r5, r0,r3, arg); /* idx=r0 */ \
- do_col (table, r4,r5,r2,r1, r0,r3); /* idx=r4 */ \
- restore(r0,0); \
- do_col (table, r1,r4,r5,r2, r0,r3); /* idx=r1 */ \
- restore(r0,1); \
- do_col (table, r5,r2,r1,r4, r0,r3); /* idx=r5 */
-
-// round column values
-// on entry: r2,r1,r4,r5
-// on exit: r0,r1,r4,r5
-#define inv_rnd2(arg, table) \
- save (0,r1); \
- save (1,r5); \
- \
- /* compute new column values */ \
- do_icol(table, r0,r1,r4,r5, r2,r3, arg); /* idx=r2 */ \
- do_col (table, r4,r5,r0,r1, r2,r3); /* idx=r4 */ \
- restore(r2,0); \
- do_col (table, r1,r4,r5,r0, r2,r3); /* idx=r1 */ \
- restore(r2,1); \
- do_col (table, r5,r0,r1,r4, r2,r3); /* idx=r5 */
-
-// AES (Rijndael) Encryption Subroutine
-/* void aes_enc_blk(struct crypto_aes_ctx *ctx, u8 *out_blk, const u8 *in_blk) */
-
-.global aes_enc_blk
-
-.extern crypto_ft_tab
-.extern crypto_fl_tab
-
-.align 4
-
-aes_enc_blk:
- push %ebp
- mov ctx(%esp),%ebp
-
-// CAUTION: the order and the values used in these assigns
-// rely on the register mappings
-
-1: push %ebx
- mov in_blk+4(%esp),%r2
- push %esi
- mov klen(%ebp),%r3 // key size
- push %edi
-#if ekey != 0
- lea ekey(%ebp),%ebp // key pointer
-#endif
-
-// input four columns and xor in first round key
-
- mov (%r2),%r0
- mov 4(%r2),%r1
- mov 8(%r2),%r4
- mov 12(%r2),%r5
- xor (%ebp),%r0
- xor 4(%ebp),%r1
- xor 8(%ebp),%r4
- xor 12(%ebp),%r5
-
- sub $8,%esp // space for register saves on stack
- add $16,%ebp // increment to next round key
- cmp $24,%r3
- jb 4f // 10 rounds for 128-bit key
- lea 32(%ebp),%ebp
- je 3f // 12 rounds for 192-bit key
- lea 32(%ebp),%ebp
-
-2: fwd_rnd1( -64(%ebp), crypto_ft_tab) // 14 rounds for 256-bit key
- fwd_rnd2( -48(%ebp), crypto_ft_tab)
-3: fwd_rnd1( -32(%ebp), crypto_ft_tab) // 12 rounds for 192-bit key
- fwd_rnd2( -16(%ebp), crypto_ft_tab)
-4: fwd_rnd1( (%ebp), crypto_ft_tab) // 10 rounds for 128-bit key
- fwd_rnd2( +16(%ebp), crypto_ft_tab)
- fwd_rnd1( +32(%ebp), crypto_ft_tab)
- fwd_rnd2( +48(%ebp), crypto_ft_tab)
- fwd_rnd1( +64(%ebp), crypto_ft_tab)
- fwd_rnd2( +80(%ebp), crypto_ft_tab)
- fwd_rnd1( +96(%ebp), crypto_ft_tab)
- fwd_rnd2(+112(%ebp), crypto_ft_tab)
- fwd_rnd1(+128(%ebp), crypto_ft_tab)
- fwd_rnd2(+144(%ebp), crypto_fl_tab) // last round uses a different table
-
-// move final values to the output array. CAUTION: the
-// order of these assigns rely on the register mappings
-
- add $8,%esp
- mov out_blk+12(%esp),%ebp
- mov %r5,12(%ebp)
- pop %edi
- mov %r4,8(%ebp)
- pop %esi
- mov %r1,4(%ebp)
- pop %ebx
- mov %r0,(%ebp)
- pop %ebp
- ret
-
-// AES (Rijndael) Decryption Subroutine
-/* void aes_dec_blk(struct crypto_aes_ctx *ctx, u8 *out_blk, const u8 *in_blk) */
-
-.global aes_dec_blk
-
-.extern crypto_it_tab
-.extern crypto_il_tab
-
-.align 4
-
-aes_dec_blk:
- push %ebp
- mov ctx(%esp),%ebp
-
-// CAUTION: the order and the values used in these assigns
-// rely on the register mappings
-
-1: push %ebx
- mov in_blk+4(%esp),%r2
- push %esi
- mov klen(%ebp),%r3 // key size
- push %edi
-#if dkey != 0
- lea dkey(%ebp),%ebp // key pointer
-#endif
-
-// input four columns and xor in first round key
-
- mov (%r2),%r0
- mov 4(%r2),%r1
- mov 8(%r2),%r4
- mov 12(%r2),%r5
- xor (%ebp),%r0
- xor 4(%ebp),%r1
- xor 8(%ebp),%r4
- xor 12(%ebp),%r5
-
- sub $8,%esp // space for register saves on stack
- add $16,%ebp // increment to next round key
- cmp $24,%r3
- jb 4f // 10 rounds for 128-bit key
- lea 32(%ebp),%ebp
- je 3f // 12 rounds for 192-bit key
- lea 32(%ebp),%ebp
-
-2: inv_rnd1( -64(%ebp), crypto_it_tab) // 14 rounds for 256-bit key
- inv_rnd2( -48(%ebp), crypto_it_tab)
-3: inv_rnd1( -32(%ebp), crypto_it_tab) // 12 rounds for 192-bit key
- inv_rnd2( -16(%ebp), crypto_it_tab)
-4: inv_rnd1( (%ebp), crypto_it_tab) // 10 rounds for 128-bit key
- inv_rnd2( +16(%ebp), crypto_it_tab)
- inv_rnd1( +32(%ebp), crypto_it_tab)
- inv_rnd2( +48(%ebp), crypto_it_tab)
- inv_rnd1( +64(%ebp), crypto_it_tab)
- inv_rnd2( +80(%ebp), crypto_it_tab)
- inv_rnd1( +96(%ebp), crypto_it_tab)
- inv_rnd2(+112(%ebp), crypto_it_tab)
- inv_rnd1(+128(%ebp), crypto_it_tab)
- inv_rnd2(+144(%ebp), crypto_il_tab) // last round uses a different table
-
-// move final values to the output array. CAUTION: the
-// order of these assigns rely on the register mappings
-
- add $8,%esp
- mov out_blk+12(%esp),%ebp
- mov %r5,12(%ebp)
- pop %edi
- mov %r4,8(%ebp)
- pop %esi
- mov %r1,4(%ebp)
- pop %ebx
- mov %r0,(%ebp)
- pop %ebp
- ret
diff --git a/arch/x86/crypto/aes-x86_64-asm_64.S b/arch/x86/crypto/aes-x86_64-asm_64.S
deleted file mode 100644
index 5b577d5a059b..000000000000
--- a/arch/x86/crypto/aes-x86_64-asm_64.S
+++ /dev/null
@@ -1,188 +0,0 @@
-/* AES (Rijndael) implementation (FIPS PUB 197) for x86_64
- *
- * Copyright (C) 2005 Andreas Steinmetz, <ast@domdv.de>
- *
- * License:
- * This code can be distributed under the terms of the GNU General Public
- * License (GPL) Version 2 provided that the above header down to and
- * including this sentence is retained in full.
- */
-
-.extern crypto_ft_tab
-.extern crypto_it_tab
-.extern crypto_fl_tab
-.extern crypto_il_tab
-
-.text
-
-#include <asm/asm-offsets.h>
-
-#define R1 %rax
-#define R1E %eax
-#define R1X %ax
-#define R1H %ah
-#define R1L %al
-#define R2 %rbx
-#define R2E %ebx
-#define R2X %bx
-#define R2H %bh
-#define R2L %bl
-#define R3 %rcx
-#define R3E %ecx
-#define R3X %cx
-#define R3H %ch
-#define R3L %cl
-#define R4 %rdx
-#define R4E %edx
-#define R4X %dx
-#define R4H %dh
-#define R4L %dl
-#define R5 %rsi
-#define R5E %esi
-#define R6 %rdi
-#define R6E %edi
-#define R7 %rbp
-#define R7E %ebp
-#define R8 %r8
-#define R9 %r9
-#define R10 %r10
-#define R11 %r11
-
-#define prologue(FUNC,KEY,B128,B192,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11) \
- .global FUNC; \
- .type FUNC,@function; \
- .align 8; \
-FUNC: movq r1,r2; \
- movq r3,r4; \
- leaq KEY+48(r8),r9; \
- movq r10,r11; \
- movl (r7),r5 ## E; \
- movl 4(r7),r1 ## E; \
- movl 8(r7),r6 ## E; \
- movl 12(r7),r7 ## E; \
- movl 480(r8),r10 ## E; \
- xorl -48(r9),r5 ## E; \
- xorl -44(r9),r1 ## E; \
- xorl -40(r9),r6 ## E; \
- xorl -36(r9),r7 ## E; \
- cmpl $24,r10 ## E; \
- jb B128; \
- leaq 32(r9),r9; \
- je B192; \
- leaq 32(r9),r9;
-
-#define epilogue(r1,r2,r3,r4,r5,r6,r7,r8,r9) \
- movq r1,r2; \
- movq r3,r4; \
- movl r5 ## E,(r9); \
- movl r6 ## E,4(r9); \
- movl r7 ## E,8(r9); \
- movl r8 ## E,12(r9); \
- ret;
-
-#define round(TAB,OFFSET,r1,r2,r3,r4,r5,r6,r7,r8,ra,rb,rc,rd) \
- movzbl r2 ## H,r5 ## E; \
- movzbl r2 ## L,r6 ## E; \
- movl TAB+1024(,r5,4),r5 ## E;\
- movw r4 ## X,r2 ## X; \
- movl TAB(,r6,4),r6 ## E; \
- roll $16,r2 ## E; \
- shrl $16,r4 ## E; \
- movzbl r4 ## H,r7 ## E; \
- movzbl r4 ## L,r4 ## E; \
- xorl OFFSET(r8),ra ## E; \
- xorl OFFSET+4(r8),rb ## E; \
- xorl TAB+3072(,r7,4),r5 ## E;\
- xorl TAB+2048(,r4,4),r6 ## E;\
- movzbl r1 ## L,r7 ## E; \
- movzbl r1 ## H,r4 ## E; \
- movl TAB+1024(,r4,4),r4 ## E;\
- movw r3 ## X,r1 ## X; \
- roll $16,r1 ## E; \
- shrl $16,r3 ## E; \
- xorl TAB(,r7,4),r5 ## E; \
- movzbl r3 ## H,r7 ## E; \
- movzbl r3 ## L,r3 ## E; \
- xorl TAB+3072(,r7,4),r4 ## E;\
- xorl TAB+2048(,r3,4),r5 ## E;\
- movzbl r1 ## H,r7 ## E; \
- movzbl r1 ## L,r3 ## E; \
- shrl $16,r1 ## E; \
- xorl TAB+3072(,r7,4),r6 ## E;\
- movl TAB+2048(,r3,4),r3 ## E;\
- movzbl r1 ## H,r7 ## E; \
- movzbl r1 ## L,r1 ## E; \
- xorl TAB+1024(,r7,4),r6 ## E;\
- xorl TAB(,r1,4),r3 ## E; \
- movzbl r2 ## H,r1 ## E; \
- movzbl r2 ## L,r7 ## E; \
- shrl $16,r2 ## E; \
- xorl TAB+3072(,r1,4),r3 ## E;\
- xorl TAB+2048(,r7,4),r4 ## E;\
- movzbl r2 ## H,r1 ## E; \
- movzbl r2 ## L,r2 ## E; \
- xorl OFFSET+8(r8),rc ## E; \
- xorl OFFSET+12(r8),rd ## E; \
- xorl TAB+1024(,r1,4),r3 ## E;\
- xorl TAB(,r2,4),r4 ## E;
-
-#define move_regs(r1,r2,r3,r4) \
- movl r3 ## E,r1 ## E; \
- movl r4 ## E,r2 ## E;
-
-#define entry(FUNC,KEY,B128,B192) \
- prologue(FUNC,KEY,B128,B192,R2,R8,R7,R9,R1,R3,R4,R6,R10,R5,R11)
-
-#define return epilogue(R8,R2,R9,R7,R5,R6,R3,R4,R11)
-
-#define encrypt_round(TAB,OFFSET) \
- round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4) \
- move_regs(R1,R2,R5,R6)
-
-#define encrypt_final(TAB,OFFSET) \
- round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4)
-
-#define decrypt_round(TAB,OFFSET) \
- round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4) \
- move_regs(R1,R2,R5,R6)
-
-#define decrypt_final(TAB,OFFSET) \
- round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4)
-
-/* void aes_enc_blk(stuct crypto_tfm *tfm, u8 *out, const u8 *in) */
-
- entry(aes_enc_blk,0,enc128,enc192)
- encrypt_round(crypto_ft_tab,-96)
- encrypt_round(crypto_ft_tab,-80)
-enc192: encrypt_round(crypto_ft_tab,-64)
- encrypt_round(crypto_ft_tab,-48)
-enc128: encrypt_round(crypto_ft_tab,-32)
- encrypt_round(crypto_ft_tab,-16)
- encrypt_round(crypto_ft_tab, 0)
- encrypt_round(crypto_ft_tab, 16)
- encrypt_round(crypto_ft_tab, 32)
- encrypt_round(crypto_ft_tab, 48)
- encrypt_round(crypto_ft_tab, 64)
- encrypt_round(crypto_ft_tab, 80)
- encrypt_round(crypto_ft_tab, 96)
- encrypt_final(crypto_fl_tab,112)
- return
-
-/* void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in) */
-
- entry(aes_dec_blk,240,dec128,dec192)
- decrypt_round(crypto_it_tab,-96)
- decrypt_round(crypto_it_tab,-80)
-dec192: decrypt_round(crypto_it_tab,-64)
- decrypt_round(crypto_it_tab,-48)
-dec128: decrypt_round(crypto_it_tab,-32)
- decrypt_round(crypto_it_tab,-16)
- decrypt_round(crypto_it_tab, 0)
- decrypt_round(crypto_it_tab, 16)
- decrypt_round(crypto_it_tab, 32)
- decrypt_round(crypto_it_tab, 48)
- decrypt_round(crypto_it_tab, 64)
- decrypt_round(crypto_it_tab, 80)
- decrypt_round(crypto_it_tab, 96)
- decrypt_final(crypto_il_tab,112)
- return
diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S
new file mode 100644
index 000000000000..a30753a3e207
--- /dev/null
+++ b/arch/x86/crypto/aes-xts-avx-x86_64.S
@@ -0,0 +1,905 @@
+/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
+//
+// AES-XTS for modern x86_64 CPUs
+//
+// Copyright 2024 Google LLC
+//
+// Author: Eric Biggers <ebiggers@google.com>
+//
+//------------------------------------------------------------------------------
+//
+// This file is dual-licensed, meaning that you can use it under your choice of
+// either of the following two licenses:
+//
+// Licensed under the Apache License 2.0 (the "License"). You may obtain a copy
+// of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// or
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
+/*
+ * This file implements AES-XTS for modern x86_64 CPUs. To handle the
+ * complexities of coding for x86 SIMD, e.g. where every vector length needs
+ * different code, it uses a macro to generate several implementations that
+ * share similar source code but are targeted at different CPUs, listed below:
+ *
+ * AES-NI && AVX
+ * - 128-bit vectors (1 AES block per vector)
+ * - VEX-coded instructions
+ * - xmm0-xmm15
+ * - This is for older CPUs that lack VAES but do have AVX.
+ *
+ * VAES && VPCLMULQDQ && AVX2
+ * - 256-bit vectors (2 AES blocks per vector)
+ * - VEX-coded instructions
+ * - ymm0-ymm15
+ * - This is for CPUs that have VAES but either lack AVX512 (e.g. Intel's
+ * Alder Lake and AMD's Zen 3) or downclock too eagerly when using zmm
+ * registers (e.g. Intel's Ice Lake).
+ *
+ * VAES && VPCLMULQDQ && AVX512BW && AVX512VL && BMI2
+ * - 512-bit vectors (4 AES blocks per vector)
+ * - EVEX-coded instructions
+ * - zmm0-zmm31
+ * - This is for CPUs that have good AVX512 support.
+ *
+ * This file doesn't have an implementation for AES-NI alone (without AVX), as
+ * the lack of VEX would make all the assembly code different.
+ *
+ * When we use VAES, we also use VPCLMULQDQ to parallelize the computation of
+ * the XTS tweaks. This avoids a bottleneck. Currently there don't seem to be
+ * any CPUs that support VAES but not VPCLMULQDQ. If that changes, we might
+ * need to start also providing an implementation using VAES alone.
+ *
+ * The AES-XTS implementations in this file support everything required by the
+ * crypto API, including support for arbitrary input lengths and multi-part
+ * processing. However, they are most heavily optimized for the common case of
+ * power-of-2 length inputs that are processed in a single part (disk sectors).
+ */
+
+#include <linux/linkage.h>
+#include <linux/cfi_types.h>
+
+.section .rodata
+.p2align 4
+.Lgf_poly:
+ // The low 64 bits of this value represent the polynomial x^7 + x^2 + x
+ // + 1. It is the value that must be XOR'd into the low 64 bits of the
+ // tweak each time a 1 is carried out of the high 64 bits.
+ //
+ // The high 64 bits of this value is just the internal carry bit that
+ // exists when there's a carry out of the low 64 bits of the tweak.
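+	//
+	// As an illustrative Python sketch (not part of this file), advancing
+	// a tweak by one block using this constant, with lo/hi being its low
+	// and high 64-bit halves:
+	//
+	//   def next_tweak(lo, hi):
+	//       carry = hi >> 63
+	//       hi = ((hi << 1) | (lo >> 63)) & ((1 << 64) - 1)
+	//       lo = ((lo << 1) & ((1 << 64) - 1)) ^ (0x87 if carry else 0)
+	//       return lo, hi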
+ .quad 0x87, 1
+
+ // These are the shift amounts that are needed when multiplying by [x^0,
+ // x^1, x^2, x^3] to compute the first vector of tweaks when VL=64.
+ //
+ // The right shifts by 64 are expected to zeroize the destination.
+ // 'vpsrlvq' is indeed defined to do that; i.e. it doesn't truncate the
+ // amount to 64 & 63 = 0 like the 'shr' scalar shift instruction would.
+.Lrshift_amounts:
+ .byte 64, 64, 63, 63, 62, 62, 61, 61
+.Llshift_amounts:
+ .byte 0, 0, 1, 1, 2, 2, 3, 3
+
+ // This table contains constants for vpshufb and vpblendvb, used to
+ // handle variable byte shifts and blending during ciphertext stealing
+ // on CPUs that don't support AVX512-style masking.
+.Lcts_permute_table:
+ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+.text
+
+.macro _define_Vi i
+.if VL == 16
+ .set V\i, %xmm\i
+.elseif VL == 32
+ .set V\i, %ymm\i
+.elseif VL == 64
+ .set V\i, %zmm\i
+.else
+ .error "Unsupported Vector Length (VL)"
+.endif
+.endm
+
+.macro _define_aliases
+ // Define register aliases V0-V15, or V0-V31 if all 32 SIMD registers
+ // are available, that map to the xmm, ymm, or zmm registers according
+ // to the selected Vector Length (VL).
+.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+ _define_Vi \i
+.endr
+.if USE_AVX512
+.irp i, 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
+ _define_Vi \i
+.endr
+.endif
+
+ // Function parameters
+ .set KEY, %rdi // Initially points to crypto_aes_ctx, then is
+ // advanced to point to 7th-from-last round key
+ .set SRC, %rsi // Pointer to next source data
+ .set DST, %rdx // Pointer to next destination data
+ .set LEN, %ecx // Remaining length in bytes
+ .set LEN8, %cl
+ .set LEN64, %rcx
+ .set TWEAK, %r8 // Pointer to next tweak
+
+ // %rax holds the AES key length in bytes.
+ .set KEYLEN, %eax
+ .set KEYLEN64, %rax
+
+ // %r9-r11 are available as temporaries.
+
+ // V0-V3 hold the data blocks during the main loop, or temporary values
+ // otherwise. V4-V5 hold temporary values.
+
+ // V6-V9 hold XTS tweaks. Each 128-bit lane holds one tweak.
+ .set TWEAK0_XMM, %xmm6
+ .set TWEAK0, V6
+ .set TWEAK1_XMM, %xmm7
+ .set TWEAK1, V7
+ .set TWEAK2, V8
+ .set TWEAK3, V9
+
+ // V10-V13 are used for computing the next values of TWEAK[0-3].
+ .set NEXT_TWEAK0, V10
+ .set NEXT_TWEAK1, V11
+ .set NEXT_TWEAK2, V12
+ .set NEXT_TWEAK3, V13
+
+ // V14 holds the constant from .Lgf_poly, copied to all 128-bit lanes.
+ .set GF_POLY_XMM, %xmm14
+ .set GF_POLY, V14
+
+ // V15 holds the key for AES "round 0", copied to all 128-bit lanes.
+ .set KEY0_XMM, %xmm15
+ .set KEY0, V15
+
+ // If 32 SIMD registers are available, then V16-V29 hold the remaining
+ // AES round keys, copied to all 128-bit lanes.
+ //
+ // AES-128, AES-192, and AES-256 use different numbers of round keys.
+ // To allow handling all three variants efficiently, we align the round
+ // keys to the *end* of this register range. I.e., AES-128 uses
+ // KEY5-KEY14, AES-192 uses KEY3-KEY14, and AES-256 uses KEY1-KEY14.
+ // (All also use KEY0 for the XOR-only "round" at the beginning.)
+.if USE_AVX512
+ .set KEY1_XMM, %xmm16
+ .set KEY1, V16
+ .set KEY2_XMM, %xmm17
+ .set KEY2, V17
+ .set KEY3_XMM, %xmm18
+ .set KEY3, V18
+ .set KEY4_XMM, %xmm19
+ .set KEY4, V19
+ .set KEY5_XMM, %xmm20
+ .set KEY5, V20
+ .set KEY6_XMM, %xmm21
+ .set KEY6, V21
+ .set KEY7_XMM, %xmm22
+ .set KEY7, V22
+ .set KEY8_XMM, %xmm23
+ .set KEY8, V23
+ .set KEY9_XMM, %xmm24
+ .set KEY9, V24
+ .set KEY10_XMM, %xmm25
+ .set KEY10, V25
+ .set KEY11_XMM, %xmm26
+ .set KEY11, V26
+ .set KEY12_XMM, %xmm27
+ .set KEY12, V27
+ .set KEY13_XMM, %xmm28
+ .set KEY13, V28
+ .set KEY14_XMM, %xmm29
+ .set KEY14, V29
+.endif
+ // V30-V31 are currently unused.
+.endm
+
+// Move a vector between memory and a register.
+.macro _vmovdqu src, dst
+.if VL < 64
+ vmovdqu \src, \dst
+.else
+ vmovdqu8 \src, \dst
+.endif
+.endm
+
+// Broadcast a 128-bit value into a vector.
+.macro _vbroadcast128 src, dst
+.if VL == 16
+ vmovdqu \src, \dst
+.elseif VL == 32
+ vbroadcasti128 \src, \dst
+.else
+ vbroadcasti32x4 \src, \dst
+.endif
+.endm
+
+// XOR two vectors together.
+.macro _vpxor src1, src2, dst
+.if VL < 64
+ vpxor \src1, \src2, \dst
+.else
+ vpxord \src1, \src2, \dst
+.endif
+.endm
+
+// XOR three vectors together.
+.macro _xor3 src1, src2, src3_and_dst
+.if USE_AVX512
+ // vpternlogd with immediate 0x96 is a three-argument XOR.
+ vpternlogd $0x96, \src1, \src2, \src3_and_dst
+.else
+ vpxor \src1, \src3_and_dst, \src3_and_dst
+ vpxor \src2, \src3_and_dst, \src3_and_dst
+.endif
+.endm
+
+// Given a 128-bit XTS tweak in the xmm register \src, compute the next tweak
+// (by multiplying by the polynomial 'x') and write it to \dst.
+.macro _next_tweak src, tmp, dst
+ vpshufd $0x13, \src, \tmp
+ vpaddq \src, \src, \dst
+ vpsrad $31, \tmp, \tmp
+.if USE_AVX512
+ vpternlogd $0x78, GF_POLY_XMM, \tmp, \dst
+.else
+ vpand GF_POLY_XMM, \tmp, \tmp
+ vpxor \tmp, \dst, \dst
+.endif
+.endm
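
In scalar terms, the macro above is the usual XTS tweak update, multiplication
by x modulo x^128 + x^7 + x^2 + x + 1. A C sketch, with lo/hi being the low and
high 64-bit halves of the 16-byte tweak (names illustrative only):

  #include <stdint.h>

  static void next_tweak(uint64_t *lo, uint64_t *hi)
  {
          uint64_t carry = *hi >> 63;             /* bit shifted out past x^127 */

          *hi = (*hi << 1) | (*lo >> 63);         /* low-to-high carry */
          *lo = (*lo << 1) ^ (carry ? 0x87 : 0);  /* fold the carry back in */
  }

The vpshufd/vpsrad/vpand sequence builds the "carry ? 0x87 : 0" mask without a
branch, and the 1 in the high quadword of .Lgf_poly supplies the low-to-high
carry bit, which may be XORed in because bit 0 of hi << 1 is always zero.
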
+
+// Given the XTS tweak(s) in the vector \src, compute the next vector of
+// tweak(s) (by multiplying by the polynomial 'x^(VL/16)') and write it to \dst.
+//
+// If VL > 16, then there are multiple tweaks, and we use vpclmulqdq to compute
+// all tweaks in the vector in parallel. If VL=16, we just do the regular
+// computation without vpclmulqdq, as it's the faster method for a single tweak.
+.macro _next_tweakvec src, tmp1, tmp2, dst
+.if VL == 16
+ _next_tweak \src, \tmp1, \dst
+.else
+ vpsrlq $64 - VL/16, \src, \tmp1
+ vpclmulqdq $0x01, GF_POLY, \tmp1, \tmp2
+ vpslldq $8, \tmp1, \tmp1
+ vpsllq $VL/16, \src, \dst
+ _xor3 \tmp1, \tmp2, \dst
+.endif
+.endm
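
Per 128-bit lane, the VL > 16 path above performs the following multiplication
by x^k with k = VL/16, where vpclmulqdq provides the clmul(carry, 0x87)
reduction. A scalar sketch under the same small-k assumption (names illustrative
only):

  #include <stdint.h>

  /* Carry-less multiply by 0x87 = x^7 + x^2 + x + 1; exact for small values. */
  static uint64_t clmul_0x87(uint64_t v)
  {
          return v ^ (v << 1) ^ (v << 2) ^ (v << 7);
  }

  static void next_tweak_by_xk(uint64_t *lo, uint64_t *hi, unsigned int k)
  {
          uint64_t carry = *hi >> (64 - k);       /* bits carried out past x^127 */

          *hi = (*hi << k) | (*lo >> (64 - k));
          *lo = (*lo << k) ^ clmul_0x87(carry);
  }
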
+
+// Given the first XTS tweak at (TWEAK), compute the first set of tweaks and
+// store them in the vector registers TWEAK0-TWEAK3. Clobbers V0-V5.
+.macro _compute_first_set_of_tweaks
+.if VL == 16
+ vmovdqu (TWEAK), TWEAK0_XMM
+ vmovdqu .Lgf_poly(%rip), GF_POLY
+ _next_tweak TWEAK0, %xmm0, TWEAK1
+ _next_tweak TWEAK1, %xmm0, TWEAK2
+ _next_tweak TWEAK2, %xmm0, TWEAK3
+.elseif VL == 32
+ vmovdqu (TWEAK), TWEAK0_XMM
+ vbroadcasti128 .Lgf_poly(%rip), GF_POLY
+
+ // Compute the first vector of tweaks.
+ _next_tweak TWEAK0_XMM, %xmm0, %xmm1
+ vinserti128 $1, %xmm1, TWEAK0, TWEAK0
+
+ // Compute the next three vectors of tweaks:
+ // TWEAK1 = TWEAK0 * [x^2, x^2]
+ // TWEAK2 = TWEAK0 * [x^4, x^4]
+ // TWEAK3 = TWEAK0 * [x^6, x^6]
+ vpsrlq $64 - 2, TWEAK0, V0
+ vpsrlq $64 - 4, TWEAK0, V2
+ vpsrlq $64 - 6, TWEAK0, V4
+ vpclmulqdq $0x01, GF_POLY, V0, V1
+ vpclmulqdq $0x01, GF_POLY, V2, V3
+ vpclmulqdq $0x01, GF_POLY, V4, V5
+ vpslldq $8, V0, V0
+ vpslldq $8, V2, V2
+ vpslldq $8, V4, V4
+ vpsllq $2, TWEAK0, TWEAK1
+ vpsllq $4, TWEAK0, TWEAK2
+ vpsllq $6, TWEAK0, TWEAK3
+ vpxor V0, TWEAK1, TWEAK1
+ vpxor V2, TWEAK2, TWEAK2
+ vpxor V4, TWEAK3, TWEAK3
+ vpxor V1, TWEAK1, TWEAK1
+ vpxor V3, TWEAK2, TWEAK2
+ vpxor V5, TWEAK3, TWEAK3
+.else
+ vbroadcasti32x4 (TWEAK), TWEAK0
+ vbroadcasti32x4 .Lgf_poly(%rip), GF_POLY
+
+ // Compute the first vector of tweaks:
+ // TWEAK0 = broadcast128(TWEAK) * [x^0, x^1, x^2, x^3]
+ vpmovzxbq .Lrshift_amounts(%rip), V4
+ vpsrlvq V4, TWEAK0, V0
+ vpclmulqdq $0x01, GF_POLY, V0, V1
+ vpmovzxbq .Llshift_amounts(%rip), V4
+ vpslldq $8, V0, V0
+ vpsllvq V4, TWEAK0, TWEAK0
+ vpternlogd $0x96, V0, V1, TWEAK0
+
+ // Compute the next three vectors of tweaks:
+ // TWEAK1 = TWEAK0 * [x^4, x^4, x^4, x^4]
+ // TWEAK2 = TWEAK0 * [x^8, x^8, x^8, x^8]
+ // TWEAK3 = TWEAK0 * [x^12, x^12, x^12, x^12]
+ // x^8 only needs byte-aligned shifts, so optimize accordingly.
+ vpsrlq $64 - 4, TWEAK0, V0
+ vpsrldq $(64 - 8) / 8, TWEAK0, V2
+ vpsrlq $64 - 12, TWEAK0, V4
+ vpclmulqdq $0x01, GF_POLY, V0, V1
+ vpclmulqdq $0x01, GF_POLY, V2, V3
+ vpclmulqdq $0x01, GF_POLY, V4, V5
+ vpslldq $8, V0, V0
+ vpslldq $8, V4, V4
+ vpsllq $4, TWEAK0, TWEAK1
+ vpslldq $8 / 8, TWEAK0, TWEAK2
+ vpsllq $12, TWEAK0, TWEAK3
+ vpternlogd $0x96, V0, V1, TWEAK1
+ vpxord V3, TWEAK2, TWEAK2
+ vpternlogd $0x96, V4, V5, TWEAK3
+.endif
+.endm
+
+// Do one step in computing the next set of tweaks using the method of just
+// multiplying by x repeatedly (the same method _next_tweak uses).
+.macro _tweak_step_mulx i
+.if \i == 0
+ .set PREV_TWEAK, TWEAK3
+ .set NEXT_TWEAK, NEXT_TWEAK0
+.elseif \i == 5
+ .set PREV_TWEAK, NEXT_TWEAK0
+ .set NEXT_TWEAK, NEXT_TWEAK1
+.elseif \i == 10
+ .set PREV_TWEAK, NEXT_TWEAK1
+ .set NEXT_TWEAK, NEXT_TWEAK2
+.elseif \i == 15
+ .set PREV_TWEAK, NEXT_TWEAK2
+ .set NEXT_TWEAK, NEXT_TWEAK3
+.endif
+.if \i >= 0 && \i < 20 && \i % 5 == 0
+ vpshufd $0x13, PREV_TWEAK, V5
+.elseif \i >= 0 && \i < 20 && \i % 5 == 1
+ vpaddq PREV_TWEAK, PREV_TWEAK, NEXT_TWEAK
+.elseif \i >= 0 && \i < 20 && \i % 5 == 2
+ vpsrad $31, V5, V5
+.elseif \i >= 0 && \i < 20 && \i % 5 == 3
+ vpand GF_POLY, V5, V5
+.elseif \i >= 0 && \i < 20 && \i % 5 == 4
+ vpxor V5, NEXT_TWEAK, NEXT_TWEAK
+.elseif \i == 1000
+ vmovdqa NEXT_TWEAK0, TWEAK0
+ vmovdqa NEXT_TWEAK1, TWEAK1
+ vmovdqa NEXT_TWEAK2, TWEAK2
+ vmovdqa NEXT_TWEAK3, TWEAK3
+.endif
+.endm
+
+// Do one step in computing the next set of tweaks using the VPCLMULQDQ method
+// (the same method _next_tweakvec uses for VL > 16). This means multiplying
+// each tweak by x^(4*VL/16) independently.
+//
+// Since 4*VL/16 is a multiple of 8 when VL > 16 (which it is here), the needed
+// shift amounts are byte-aligned, which allows the use of vpsrldq and vpslldq
+// to do 128-bit wide shifts. The 128-bit left shift (vpslldq) saves
+// instructions directly. The 128-bit right shift (vpsrldq) performs better
+// than a 64-bit right shift on Intel CPUs in the context where it is used here,
+// because it runs on a different execution port from the AES instructions.
+.macro _tweak_step_pclmul i
+.if \i == 0
+ vpsrldq $(128 - 4*VL/16) / 8, TWEAK0, NEXT_TWEAK0
+.elseif \i == 2
+ vpsrldq $(128 - 4*VL/16) / 8, TWEAK1, NEXT_TWEAK1
+.elseif \i == 4
+ vpsrldq $(128 - 4*VL/16) / 8, TWEAK2, NEXT_TWEAK2
+.elseif \i == 6
+ vpsrldq $(128 - 4*VL/16) / 8, TWEAK3, NEXT_TWEAK3
+.elseif \i == 8
+ vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK0, NEXT_TWEAK0
+.elseif \i == 10
+ vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK1, NEXT_TWEAK1
+.elseif \i == 12
+ vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK2, NEXT_TWEAK2
+.elseif \i == 14
+ vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK3, NEXT_TWEAK3
+.elseif \i == 1000
+ vpslldq $(4*VL/16) / 8, TWEAK0, TWEAK0
+ vpslldq $(4*VL/16) / 8, TWEAK1, TWEAK1
+ vpslldq $(4*VL/16) / 8, TWEAK2, TWEAK2
+ vpslldq $(4*VL/16) / 8, TWEAK3, TWEAK3
+ _vpxor NEXT_TWEAK0, TWEAK0, TWEAK0
+ _vpxor NEXT_TWEAK1, TWEAK1, TWEAK1
+ _vpxor NEXT_TWEAK2, TWEAK2, TWEAK2
+ _vpxor NEXT_TWEAK3, TWEAK3, TWEAK3
+.endif
+.endm
+
+// _tweak_step does one step of the computation of the next set of tweaks from
+// TWEAK[0-3]. To complete all steps, this is invoked with increasing values of
+// \i that include at least 0 through 19, then 1000 which signals the last step.
+//
+// This is used to interleave the computation of the next set of tweaks with the
+// AES en/decryptions, which increases performance in some cases. Clobbers V5.
+.macro _tweak_step i
+.if VL == 16
+ _tweak_step_mulx \i
+.else
+ _tweak_step_pclmul \i
+.endif
+.endm
+
+.macro _setup_round_keys enc
+
+ // Select either the encryption round keys or the decryption round keys.
+.if \enc
+ .set OFFS, 0
+.else
+ .set OFFS, 240
+.endif
+
+ // Load the round key for "round 0".
+ _vbroadcast128 OFFS(KEY), KEY0
+
+ // Increment KEY to make it so that 7*16(KEY) is the last round key.
+ // For AES-128, increment by 3*16, resulting in the 10 round keys (not
+ // counting the zero-th round key which was just loaded into KEY0) being
+ // -2*16(KEY) through 7*16(KEY). For AES-192, increment by 5*16 and use
+ // 12 round keys -4*16(KEY) through 7*16(KEY). For AES-256, increment
+ // by 7*16 and use 14 round keys -6*16(KEY) through 7*16(KEY).
+ //
+ // This rebasing provides two benefits. First, it makes the offset to
+ // any round key be in the range [-96, 112], fitting in a signed byte.
+ // This shortens VEX-encoded instructions that access the later round
+ // keys which otherwise would need 4-byte offsets. Second, it makes it
+ // easy to do AES-128 and AES-192 by skipping irrelevant rounds at the
+ // beginning. Skipping rounds at the end doesn't work as well because
+ // the last round needs different instructions.
+ //
+ // An alternative approach would be to roll up all the round loops. We
+ // don't do that because (a) it isn't compatible with caching the round
+ // keys in registers which we do when possible (see below), (b) we
+ // interleave the AES rounds with the XTS tweak computation, and (c) it
+ // seems unwise to rely *too* heavily on the CPU's branch predictor.
+ lea OFFS-16(KEY, KEYLEN64, 4), KEY
+
+ // If all 32 SIMD registers are available, cache all the round keys.
+.if USE_AVX512
+ cmp $24, KEYLEN
+ jl .Laes128\@
+ je .Laes192\@
+ vbroadcasti32x4 -6*16(KEY), KEY1
+ vbroadcasti32x4 -5*16(KEY), KEY2
+.Laes192\@:
+ vbroadcasti32x4 -4*16(KEY), KEY3
+ vbroadcasti32x4 -3*16(KEY), KEY4
+.Laes128\@:
+ vbroadcasti32x4 -2*16(KEY), KEY5
+ vbroadcasti32x4 -1*16(KEY), KEY6
+ vbroadcasti32x4 0*16(KEY), KEY7
+ vbroadcasti32x4 1*16(KEY), KEY8
+ vbroadcasti32x4 2*16(KEY), KEY9
+ vbroadcasti32x4 3*16(KEY), KEY10
+ vbroadcasti32x4 4*16(KEY), KEY11
+ vbroadcasti32x4 5*16(KEY), KEY12
+ vbroadcasti32x4 6*16(KEY), KEY13
+ vbroadcasti32x4 7*16(KEY), KEY14
+.endif
+.endm
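
As a concrete check of the rebasing arithmetic, the following illustrative C
mirrors the lea above, assuming the crypto_aes_ctx layout implied by the
480(KEY) load: key_enc at offset 0, key_dec at offset 240, and round key i at
byte offset 16*i within each.

  /* After 'lea OFFS-16(KEY, KEYLEN64, 4), KEY', round key i sits at
   * (i - nrounds + 7) * 16 from the rebased pointer, so the last round key is
   * always at +7*16 and every offset fits in a signed byte:
   *
   *   AES-256: keylen 32, 14 rounds -> round key 1 at -6*16, 14 at +7*16
   *   AES-192: keylen 24, 12 rounds -> round key 1 at -4*16, 12 at +7*16
   *   AES-128: keylen 16, 10 rounds -> round key 1 at -2*16, 10 at +7*16
   */
  static const unsigned char *rebase(const unsigned char *ctx, int keylen, int offs)
  {
          return ctx + offs + 4 * keylen - 16;
  }
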
+
+// Do a single non-last round of AES encryption (if \enc==1) or decryption (if
+// \enc==0) on the block(s) in \data using the round key(s) in \key. The
+// register length determines the number of AES blocks en/decrypted.
+.macro _vaes enc, key, data
+.if \enc
+ vaesenc \key, \data, \data
+.else
+ vaesdec \key, \data, \data
+.endif
+.endm
+
+// Same as _vaes, but does the last round.
+.macro _vaeslast enc, key, data
+.if \enc
+ vaesenclast \key, \data, \data
+.else
+ vaesdeclast \key, \data, \data
+.endif
+.endm
+
+// Do a single non-last round of AES en/decryption on the block(s) in \data,
+// using the same key for all block(s). The round key is loaded from the
+// appropriate register or memory location for round \i. May clobber \tmp.
+.macro _vaes_1x enc, i, xmm_suffix, data, tmp
+.if USE_AVX512
+ _vaes \enc, KEY\i\xmm_suffix, \data
+.else
+.ifnb \xmm_suffix
+ _vaes \enc, (\i-7)*16(KEY), \data
+.else
+ _vbroadcast128 (\i-7)*16(KEY), \tmp
+ _vaes \enc, \tmp, \data
+.endif
+.endif
+.endm
+
+// Do a single non-last round of AES en/decryption on the blocks in registers
+// V0-V3, using the same key for all blocks. The round key is loaded from the
+// appropriate register or memory location for round \i. In addition, does two
+// steps of the computation of the next set of tweaks. May clobber V4 and V5.
+.macro _vaes_4x enc, i
+.if USE_AVX512
+ _tweak_step (2*(\i-5))
+ _vaes \enc, KEY\i, V0
+ _vaes \enc, KEY\i, V1
+ _tweak_step (2*(\i-5) + 1)
+ _vaes \enc, KEY\i, V2
+ _vaes \enc, KEY\i, V3
+.else
+ _vbroadcast128 (\i-7)*16(KEY), V4
+ _tweak_step (2*(\i-5))
+ _vaes \enc, V4, V0
+ _vaes \enc, V4, V1
+ _tweak_step (2*(\i-5) + 1)
+ _vaes \enc, V4, V2
+ _vaes \enc, V4, V3
+.endif
+.endm
+
+// Do tweaked AES en/decryption (i.e., XOR with \tweak, then AES en/decrypt,
+// then XOR with \tweak again) of the block(s) in \data. To process a single
+// block, use xmm registers and set \xmm_suffix=_XMM. To process a vector of
+// length VL, use V* registers and leave \xmm_suffix empty. Clobbers \tmp.
+.macro _aes_crypt enc, xmm_suffix, tweak, data, tmp
+ _xor3 KEY0\xmm_suffix, \tweak, \data
+ cmp $24, KEYLEN
+ jl .Laes128\@
+ je .Laes192\@
+ _vaes_1x \enc, 1, \xmm_suffix, \data, tmp=\tmp
+ _vaes_1x \enc, 2, \xmm_suffix, \data, tmp=\tmp
+.Laes192\@:
+ _vaes_1x \enc, 3, \xmm_suffix, \data, tmp=\tmp
+ _vaes_1x \enc, 4, \xmm_suffix, \data, tmp=\tmp
+.Laes128\@:
+.irp i, 5,6,7,8,9,10,11,12,13
+ _vaes_1x \enc, \i, \xmm_suffix, \data, tmp=\tmp
+.endr
+.if USE_AVX512
+ vpxord KEY14\xmm_suffix, \tweak, \tmp
+.else
+.ifnb \xmm_suffix
+ vpxor 7*16(KEY), \tweak, \tmp
+.else
+ _vbroadcast128 7*16(KEY), \tmp
+ vpxor \tweak, \tmp, \tmp
+.endif
+.endif
+ _vaeslast \enc, \tmp, \data
+.endm
+
+.macro _aes_xts_crypt enc
+ _define_aliases
+
+.if !\enc
+ // When decrypting a message whose length isn't a multiple of the AES
+ // block length, exclude the last full block from the main loop by
+ // subtracting 16 from LEN. This is needed because ciphertext stealing
+ // decryption uses the last two tweaks in reverse order. We'll handle
+ // the last full block and the partial block specially at the end.
+ lea -16(LEN), %eax
+ test $15, LEN8
+ cmovnz %eax, LEN
+.endif
+
+ // Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
+ movl 480(KEY), KEYLEN
+
+	// Set up the pointer to the round keys and cache as many as possible.
+ _setup_round_keys \enc
+
+ // Compute the first set of tweaks TWEAK[0-3].
+ _compute_first_set_of_tweaks
+
+ add $-4*VL, LEN // shorter than 'sub 4*VL' when VL=32
+ jl .Lhandle_remainder\@
+
+.Lmain_loop\@:
+ // This is the main loop, en/decrypting 4*VL bytes per iteration.
+
+ // XOR each source block with its tweak and the zero-th round key.
+.if USE_AVX512
+ vmovdqu8 0*VL(SRC), V0
+ vmovdqu8 1*VL(SRC), V1
+ vmovdqu8 2*VL(SRC), V2
+ vmovdqu8 3*VL(SRC), V3
+ vpternlogd $0x96, TWEAK0, KEY0, V0
+ vpternlogd $0x96, TWEAK1, KEY0, V1
+ vpternlogd $0x96, TWEAK2, KEY0, V2
+ vpternlogd $0x96, TWEAK3, KEY0, V3
+.else
+ vpxor 0*VL(SRC), KEY0, V0
+ vpxor 1*VL(SRC), KEY0, V1
+ vpxor 2*VL(SRC), KEY0, V2
+ vpxor 3*VL(SRC), KEY0, V3
+ vpxor TWEAK0, V0, V0
+ vpxor TWEAK1, V1, V1
+ vpxor TWEAK2, V2, V2
+ vpxor TWEAK3, V3, V3
+.endif
+ cmp $24, KEYLEN
+ jl .Laes128\@
+ je .Laes192\@
+ // Do all the AES rounds on the data blocks, interleaved with
+ // the computation of the next set of tweaks.
+ _vaes_4x \enc, 1
+ _vaes_4x \enc, 2
+.Laes192\@:
+ _vaes_4x \enc, 3
+ _vaes_4x \enc, 4
+.Laes128\@:
+.irp i, 5,6,7,8,9,10,11,12,13
+ _vaes_4x \enc, \i
+.endr
+ // Do the last AES round, then XOR the results with the tweaks again.
+ // Reduce latency by doing the XOR before the vaesenclast, utilizing the
+ // property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a)
+ // (and likewise for vaesdeclast).
+.if USE_AVX512
+ _tweak_step 18
+ _tweak_step 19
+ vpxord TWEAK0, KEY14, V4
+ vpxord TWEAK1, KEY14, V5
+ _vaeslast \enc, V4, V0
+ _vaeslast \enc, V5, V1
+ vpxord TWEAK2, KEY14, V4
+ vpxord TWEAK3, KEY14, V5
+ _vaeslast \enc, V4, V2
+ _vaeslast \enc, V5, V3
+.else
+ _vbroadcast128 7*16(KEY), V4
+ _tweak_step 18 // uses V5
+ _tweak_step 19 // uses V5
+ vpxor TWEAK0, V4, V5
+ _vaeslast \enc, V5, V0
+ vpxor TWEAK1, V4, V5
+ _vaeslast \enc, V5, V1
+ vpxor TWEAK2, V4, V5
+ vpxor TWEAK3, V4, V4
+ _vaeslast \enc, V5, V2
+ _vaeslast \enc, V4, V3
+.endif
+
+ // Store the destination blocks.
+ _vmovdqu V0, 0*VL(DST)
+ _vmovdqu V1, 1*VL(DST)
+ _vmovdqu V2, 2*VL(DST)
+ _vmovdqu V3, 3*VL(DST)
+
+ // Finish computing the next set of tweaks.
+ _tweak_step 1000
+
+ sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32
+ sub $-4*VL, DST
+ add $-4*VL, LEN
+ jge .Lmain_loop\@
+
+ // Check for the uncommon case where the data length isn't a multiple of
+ // 4*VL. Handle it out-of-line in order to optimize for the common
+ // case. In the common case, just fall through to the ret.
+ test $4*VL-1, LEN8
+ jnz .Lhandle_remainder\@
+.Ldone\@:
+ // Store the next tweak back to *TWEAK to support continuation calls.
+ vmovdqu TWEAK0_XMM, (TWEAK)
+.if VL > 16
+ vzeroupper
+.endif
+ RET
+
+.Lhandle_remainder\@:
+
+ // En/decrypt any remaining full blocks, one vector at a time.
+.if VL > 16
+ add $3*VL, LEN // Undo extra sub of 4*VL, then sub VL.
+ jl .Lvec_at_a_time_done\@
+.Lvec_at_a_time\@:
+ _vmovdqu (SRC), V0
+ _aes_crypt \enc, , TWEAK0, V0, tmp=V1
+ _vmovdqu V0, (DST)
+ _next_tweakvec TWEAK0, V0, V1, TWEAK0
+ add $VL, SRC
+ add $VL, DST
+ sub $VL, LEN
+ jge .Lvec_at_a_time\@
+.Lvec_at_a_time_done\@:
+ add $VL-16, LEN // Undo extra sub of VL, then sub 16.
+.else
+ add $4*VL-16, LEN // Undo extra sub of 4*VL, then sub 16.
+.endif
+
+ // En/decrypt any remaining full blocks, one at a time.
+ jl .Lblock_at_a_time_done\@
+.Lblock_at_a_time\@:
+ vmovdqu (SRC), %xmm0
+ _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0, tmp=%xmm1
+ vmovdqu %xmm0, (DST)
+ _next_tweak TWEAK0_XMM, %xmm0, TWEAK0_XMM
+ add $16, SRC
+ add $16, DST
+ sub $16, LEN
+ jge .Lblock_at_a_time\@
+.Lblock_at_a_time_done\@:
+ add $16, LEN // Undo the extra sub of 16.
+ // Now 0 <= LEN <= 15. If LEN is zero, we're done.
+ jz .Ldone\@
+
+ // Otherwise 1 <= LEN <= 15, but the real remaining length is 16 + LEN.
+ // Do ciphertext stealing to process the last 16 + LEN bytes.
+
+.if \enc
+ // If encrypting, the main loop already encrypted the last full block to
+ // create the CTS intermediate ciphertext. Prepare for the rest of CTS
+ // by rewinding the pointers and loading the intermediate ciphertext.
+ sub $16, SRC
+ sub $16, DST
+ vmovdqu (DST), %xmm0
+.else
+ // If decrypting, the main loop didn't decrypt the last full block
+ // because CTS decryption uses the last two tweaks in reverse order.
+ // Do it now by advancing the tweak and decrypting the last full block.
+ _next_tweak TWEAK0_XMM, %xmm0, TWEAK1_XMM
+ vmovdqu (SRC), %xmm0
+ _aes_crypt \enc, _XMM, TWEAK1_XMM, %xmm0, tmp=%xmm1
+.endif
+
+.if USE_AVX512
+ // Create a mask that has the first LEN bits set.
+ mov $-1, %r9d
+ bzhi LEN, %r9d, %r9d
+ kmovd %r9d, %k1
+
+ // Swap the first LEN bytes of the en/decryption of the last full block
+ // with the partial block. Note that to support in-place en/decryption,
+ // the load from the src partial block must happen before the store to
+ // the dst partial block.
+ vmovdqa %xmm0, %xmm1
+ vmovdqu8 16(SRC), %xmm0{%k1}
+ vmovdqu8 %xmm1, 16(DST){%k1}
+.else
+ lea .Lcts_permute_table(%rip), %r9
+
+ // Load the src partial block, left-aligned. Note that to support
+ // in-place en/decryption, this must happen before the store to the dst
+ // partial block.
+ vmovdqu (SRC, LEN64, 1), %xmm1
+
+ // Shift the first LEN bytes of the en/decryption of the last full block
+ // to the end of a register, then store it to DST+LEN. This stores the
+ // dst partial block. It also writes to the second part of the dst last
+ // full block, but that part is overwritten later.
+ vpshufb (%r9, LEN64, 1), %xmm0, %xmm2
+ vmovdqu %xmm2, (DST, LEN64, 1)
+
+ // Make xmm3 contain [16-LEN,16-LEN+1,...,14,15,0x80,0x80,...].
+ sub LEN64, %r9
+ vmovdqu 32(%r9), %xmm3
+
+ // Shift the src partial block to the beginning of its register.
+ vpshufb %xmm3, %xmm1, %xmm1
+
+ // Do a blend to generate the src partial block followed by the second
+ // part of the en/decryption of the last full block.
+ vpblendvb %xmm3, %xmm0, %xmm1, %xmm0
+.endif
+ // En/decrypt again and store the last full block.
+ _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0, tmp=%xmm1
+ vmovdqu %xmm0, (DST)
+ jmp .Ldone\@
+.endm
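
In the encryption direction, the ciphertext-stealing tail at the end of the
macro corresponds to the following scalar model. xts_encrypt_block() is a
hypothetical helper meaning "AES-encrypt(in ^ tweak) ^ tweak"; t_last is the
tweak of the last full block and t_next the one after it, matching the order in
which the assembly consumes them:

  #include <stddef.h>
  #include <string.h>

  void xts_encrypt_block(const unsigned char in[16], unsigned char out[16],
                         const unsigned char tweak[16]);

  /* src/dst point at the last full block; the partial block follows at +16. */
  static void xts_cts_encrypt_tail(const unsigned char *src, unsigned char *dst,
                                   size_t tail_len, /* 1..15 */
                                   const unsigned char t_last[16],
                                   const unsigned char t_next[16])
  {
          unsigned char cc[16], pp[16];

          /* Intermediate ciphertext of the last full block; in the assembly
           * the main loop has already produced this. */
          xts_encrypt_block(src, cc, t_last);

          /* The final partial ciphertext is the first tail_len bytes of CC. */
          memcpy(dst + 16, cc, tail_len);

          /* Steal the remaining bytes of CC to pad the partial plaintext... */
          memcpy(pp, src + 16, tail_len);
          memcpy(pp + tail_len, cc + tail_len, 16 - tail_len);

          /* ...and encrypt the padded block with the next tweak. */
          xts_encrypt_block(pp, dst, t_next);
  }

Decryption performs the same swap-and-blend but applies the two tweaks in the
opposite order, which is why the macro keeps the last full block out of the main
loop in that case.
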
+
+// void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
+// u8 iv[AES_BLOCK_SIZE]);
+//
+// Encrypt |iv| using the AES key |tweak_key| to get the first tweak. Assumes
+// that the CPU supports AES-NI and AVX, but not necessarily VAES or AVX512.
+SYM_TYPED_FUNC_START(aes_xts_encrypt_iv)
+ .set TWEAK_KEY, %rdi
+ .set IV, %rsi
+ .set KEYLEN, %eax
+ .set KEYLEN64, %rax
+
+ vmovdqu (IV), %xmm0
+ vpxor (TWEAK_KEY), %xmm0, %xmm0
+ movl 480(TWEAK_KEY), KEYLEN
+ lea -16(TWEAK_KEY, KEYLEN64, 4), TWEAK_KEY
+ cmp $24, KEYLEN
+ jl .Lencrypt_iv_aes128
+ je .Lencrypt_iv_aes192
+ vaesenc -6*16(TWEAK_KEY), %xmm0, %xmm0
+ vaesenc -5*16(TWEAK_KEY), %xmm0, %xmm0
+.Lencrypt_iv_aes192:
+ vaesenc -4*16(TWEAK_KEY), %xmm0, %xmm0
+ vaesenc -3*16(TWEAK_KEY), %xmm0, %xmm0
+.Lencrypt_iv_aes128:
+.irp i, -2,-1,0,1,2,3,4,5,6
+ vaesenc \i*16(TWEAK_KEY), %xmm0, %xmm0
+.endr
+ vaesenclast 7*16(TWEAK_KEY), %xmm0, %xmm0
+ vmovdqu %xmm0, (IV)
+ RET
+SYM_FUNC_END(aes_xts_encrypt_iv)
+
+// Below are the actual AES-XTS encryption and decryption functions,
+// instantiated from the above macro. They all have the following prototype:
+//
+// void (*xts_crypt_func)(const struct crypto_aes_ctx *key,
+// const u8 *src, u8 *dst, int len,
+// u8 tweak[AES_BLOCK_SIZE]);
+//
+// |key| is the data key. |tweak| contains the next tweak; the encryption of
+// the original IV with the tweak key was already done. This function supports
+// incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and
+// |len| must be a multiple of 16 except on the last call. If |len| is a
+// multiple of 16, then this function updates |tweak| to contain the next tweak.
+
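
A hedged sketch of how glue code might select one of the variants instantiated
below and call it through this prototype. The feature flags and the helper are
illustrative only; <crypto/aes.h> and asmlinkage declarations for the three
encrypt symbols are assumed, and the kernel's real dispatch logic is not
reproduced here:

  #include <crypto/aes.h>

  typedef void (*xts_crypt_func)(const struct crypto_aes_ctx *key,
                                 const u8 *src, u8 *dst, int len,
                                 u8 tweak[AES_BLOCK_SIZE]);

  /* Illustrative only: pick the widest variant the CPU supports. */
  static xts_crypt_func pick_xts_encrypt(int has_vaes_avx2, int has_avx512)
  {
          if (has_vaes_avx2 && has_avx512)
                  return aes_xts_encrypt_vaes_avx512;
          if (has_vaes_avx2)
                  return aes_xts_encrypt_vaes_avx2;
          return aes_xts_encrypt_aesni_avx;
  }
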
+.set VL, 16
+.set USE_AVX512, 0
+SYM_TYPED_FUNC_START(aes_xts_encrypt_aesni_avx)
+ _aes_xts_crypt 1
+SYM_FUNC_END(aes_xts_encrypt_aesni_avx)
+SYM_TYPED_FUNC_START(aes_xts_decrypt_aesni_avx)
+ _aes_xts_crypt 0
+SYM_FUNC_END(aes_xts_decrypt_aesni_avx)
+
+.set VL, 32
+.set USE_AVX512, 0
+SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx2)
+ _aes_xts_crypt 1
+SYM_FUNC_END(aes_xts_encrypt_vaes_avx2)
+SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx2)
+ _aes_xts_crypt 0
+SYM_FUNC_END(aes_xts_decrypt_vaes_avx2)
+
+.set VL, 64
+.set USE_AVX512, 1
+SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx512)
+ _aes_xts_crypt 1
+SYM_FUNC_END(aes_xts_encrypt_vaes_avx512)
+SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx512)
+ _aes_xts_crypt 0
+SYM_FUNC_END(aes_xts_decrypt_vaes_avx512)
diff --git a/arch/x86/crypto/aes_glue.c b/arch/x86/crypto/aes_glue.c
deleted file mode 100644
index 49ae9fe32b22..000000000000
--- a/arch/x86/crypto/aes_glue.c
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Glue Code for the asm optimized version of the AES Cipher Algorithm
- *
- */
-
-#include <crypto/aes.h>
-
-asmlinkage void aes_enc_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in);
-asmlinkage void aes_dec_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in);
-
-void crypto_aes_encrypt_x86(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
-{
- aes_enc_blk(ctx, dst, src);
-}
-EXPORT_SYMBOL_GPL(crypto_aes_encrypt_x86);
-
-void crypto_aes_decrypt_x86(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
-{
- aes_dec_blk(ctx, dst, src);
-}
-EXPORT_SYMBOL_GPL(crypto_aes_decrypt_x86);
-
-static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
-{
- aes_enc_blk(crypto_tfm_ctx(tfm), dst, src);
-}
-
-static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
-{
- aes_dec_blk(crypto_tfm_ctx(tfm), dst, src);
-}
-
-static struct crypto_alg aes_alg = {
- .cra_name = "aes",
- .cra_driver_name = "aes-asm",
- .cra_priority = 200,
- .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
- .cra_blocksize = AES_BLOCK_SIZE,
- .cra_ctxsize = sizeof(struct crypto_aes_ctx),
- .cra_module = THIS_MODULE,
- .cra_list = LIST_HEAD_INIT(aes_alg.cra_list),
- .cra_u = {
- .cipher = {
- .cia_min_keysize = AES_MIN_KEY_SIZE,
- .cia_max_keysize = AES_MAX_KEY_SIZE,
- .cia_setkey = crypto_aes_set_key,
- .cia_encrypt = aes_encrypt,
- .cia_decrypt = aes_decrypt
- }
- }
-};
-
-static int __init aes_init(void)
-{
- return crypto_register_alg(&aes_alg);
-}
-
-static void __exit aes_fini(void)
-{
- crypto_unregister_alg(&aes_alg);
-}
-
-module_init(aes_init);
-module_exit(aes_fini);
-
-MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, asm optimized");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("aes");
-MODULE_ALIAS("aes-asm");
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index eb0566e83319..b37881bb9f15 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Implement AES algorithm in Intel AES-NI instructions.
*
@@ -9,15 +10,15 @@
* Vinodh Gopal <vinodh.gopal@intel.com>
* Kahraman Akdemir
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
+ * Copyright (c) 2010, Intel Corporation.
+ *
+ * Ported x86_64 version to x86:
+ * Author: Mathias Krause <minipli@googlemail.com>
*/
#include <linux/linkage.h>
-
-.text
+#include <linux/objtool.h>
+#include <asm/frame.h>
#define STATE1 %xmm0
#define STATE2 %xmm4
@@ -32,8 +33,17 @@
#define KEY %xmm2
#define IV %xmm3
+#define BSWAP_MASK %xmm10
+#define CTR %xmm11
+#define INC %xmm12
+
+#define GF128MUL_MASK %xmm7
+
+#ifdef __x86_64__
+#define AREG %rax
#define KEYP %rdi
#define OUTP %rsi
+#define UKEYP OUTP
#define INP %rdx
#define LEN %rcx
#define IVP %r8
@@ -41,20 +51,34 @@
#define T1 %r10
#define TKEYP T1
#define T2 %r11
+#define TCTR_LOW T2
+#else
+#define AREG %eax
+#define KEYP %edi
+#define OUTP AREG
+#define UKEYP OUTP
+#define INP %edx
+#define LEN %esi
+#define IVP %ebp
+#define KLEN %ebx
+#define T1 %ecx
+#define TKEYP T1
+#endif
-_key_expansion_128:
-_key_expansion_256a:
+SYM_FUNC_START_LOCAL(_key_expansion_256a)
pshufd $0b11111111, %xmm1, %xmm1
shufps $0b00010000, %xmm0, %xmm4
pxor %xmm4, %xmm0
shufps $0b10001100, %xmm0, %xmm4
pxor %xmm4, %xmm0
pxor %xmm1, %xmm0
- movaps %xmm0, (%rcx)
- add $0x10, %rcx
- ret
+ movaps %xmm0, (TKEYP)
+ add $0x10, TKEYP
+ RET
+SYM_FUNC_END(_key_expansion_256a)
+SYM_FUNC_ALIAS_LOCAL(_key_expansion_128, _key_expansion_256a)
-_key_expansion_192a:
+SYM_FUNC_START_LOCAL(_key_expansion_192a)
pshufd $0b01010101, %xmm1, %xmm1
shufps $0b00010000, %xmm0, %xmm4
pxor %xmm4, %xmm0
@@ -71,13 +95,14 @@ _key_expansion_192a:
movaps %xmm0, %xmm1
shufps $0b01000100, %xmm0, %xmm6
- movaps %xmm6, (%rcx)
+ movaps %xmm6, (TKEYP)
shufps $0b01001110, %xmm2, %xmm1
- movaps %xmm1, 16(%rcx)
- add $0x20, %rcx
- ret
+ movaps %xmm1, 0x10(TKEYP)
+ add $0x20, TKEYP
+ RET
+SYM_FUNC_END(_key_expansion_192a)
-_key_expansion_192b:
+SYM_FUNC_START_LOCAL(_key_expansion_192b)
pshufd $0b01010101, %xmm1, %xmm1
shufps $0b00010000, %xmm0, %xmm4
pxor %xmm4, %xmm0
@@ -91,165 +116,160 @@ _key_expansion_192b:
pxor %xmm3, %xmm2
pxor %xmm5, %xmm2
- movaps %xmm0, (%rcx)
- add $0x10, %rcx
- ret
+ movaps %xmm0, (TKEYP)
+ add $0x10, TKEYP
+ RET
+SYM_FUNC_END(_key_expansion_192b)
-_key_expansion_256b:
+SYM_FUNC_START_LOCAL(_key_expansion_256b)
pshufd $0b10101010, %xmm1, %xmm1
shufps $0b00010000, %xmm2, %xmm4
pxor %xmm4, %xmm2
shufps $0b10001100, %xmm2, %xmm4
pxor %xmm4, %xmm2
pxor %xmm1, %xmm2
- movaps %xmm2, (%rcx)
- add $0x10, %rcx
- ret
+ movaps %xmm2, (TKEYP)
+ add $0x10, TKEYP
+ RET
+SYM_FUNC_END(_key_expansion_256b)
/*
- * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
- * unsigned int key_len)
+ * void aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
+ * unsigned int key_len)
*/
-ENTRY(aesni_set_key)
- movups (%rsi), %xmm0 # user key (first 16 bytes)
- movaps %xmm0, (%rdi)
- lea 0x10(%rdi), %rcx # key addr
- movl %edx, 480(%rdi)
+SYM_FUNC_START(aesni_set_key)
+ FRAME_BEGIN
+#ifndef __x86_64__
+ pushl KEYP
+ movl (FRAME_OFFSET+8)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key
+ movl (FRAME_OFFSET+16)(%esp), %edx # key_len
+#endif
+ movups (UKEYP), %xmm0 # user key (first 16 bytes)
+ movaps %xmm0, (KEYP)
+ lea 0x10(KEYP), TKEYP # key addr
+ movl %edx, 480(KEYP)
pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
cmp $24, %dl
jb .Lenc_key128
je .Lenc_key192
- movups 0x10(%rsi), %xmm2 # other user key
- movaps %xmm2, (%rcx)
- add $0x10, %rcx
- # aeskeygenassist $0x1, %xmm2, %xmm1 # round 1
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x01
+ movups 0x10(UKEYP), %xmm2 # other user key
+ movaps %xmm2, (TKEYP)
+ add $0x10, TKEYP
+ aeskeygenassist $0x1, %xmm2, %xmm1 # round 1
call _key_expansion_256a
- # aeskeygenassist $0x1, %xmm0, %xmm1
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x01
+ aeskeygenassist $0x1, %xmm0, %xmm1
call _key_expansion_256b
- # aeskeygenassist $0x2, %xmm2, %xmm1 # round 2
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x02
+ aeskeygenassist $0x2, %xmm2, %xmm1 # round 2
call _key_expansion_256a
- # aeskeygenassist $0x2, %xmm0, %xmm1
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x02
+ aeskeygenassist $0x2, %xmm0, %xmm1
call _key_expansion_256b
- # aeskeygenassist $0x4, %xmm2, %xmm1 # round 3
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x04
+ aeskeygenassist $0x4, %xmm2, %xmm1 # round 3
call _key_expansion_256a
- # aeskeygenassist $0x4, %xmm0, %xmm1
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x04
+ aeskeygenassist $0x4, %xmm0, %xmm1
call _key_expansion_256b
- # aeskeygenassist $0x8, %xmm2, %xmm1 # round 4
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x08
+ aeskeygenassist $0x8, %xmm2, %xmm1 # round 4
call _key_expansion_256a
- # aeskeygenassist $0x8, %xmm0, %xmm1
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x08
+ aeskeygenassist $0x8, %xmm0, %xmm1
call _key_expansion_256b
- # aeskeygenassist $0x10, %xmm2, %xmm1 # round 5
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x10
+ aeskeygenassist $0x10, %xmm2, %xmm1 # round 5
call _key_expansion_256a
- # aeskeygenassist $0x10, %xmm0, %xmm1
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x10
+ aeskeygenassist $0x10, %xmm0, %xmm1
call _key_expansion_256b
- # aeskeygenassist $0x20, %xmm2, %xmm1 # round 6
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x20
+ aeskeygenassist $0x20, %xmm2, %xmm1 # round 6
call _key_expansion_256a
- # aeskeygenassist $0x20, %xmm0, %xmm1
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x20
+ aeskeygenassist $0x20, %xmm0, %xmm1
call _key_expansion_256b
- # aeskeygenassist $0x40, %xmm2, %xmm1 # round 7
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x40
+ aeskeygenassist $0x40, %xmm2, %xmm1 # round 7
call _key_expansion_256a
jmp .Ldec_key
.Lenc_key192:
- movq 0x10(%rsi), %xmm2 # other user key
- # aeskeygenassist $0x1, %xmm2, %xmm1 # round 1
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x01
+ movq 0x10(UKEYP), %xmm2 # other user key
+ aeskeygenassist $0x1, %xmm2, %xmm1 # round 1
call _key_expansion_192a
- # aeskeygenassist $0x2, %xmm2, %xmm1 # round 2
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x02
+ aeskeygenassist $0x2, %xmm2, %xmm1 # round 2
call _key_expansion_192b
- # aeskeygenassist $0x4, %xmm2, %xmm1 # round 3
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x04
+ aeskeygenassist $0x4, %xmm2, %xmm1 # round 3
call _key_expansion_192a
- # aeskeygenassist $0x8, %xmm2, %xmm1 # round 4
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x08
+ aeskeygenassist $0x8, %xmm2, %xmm1 # round 4
call _key_expansion_192b
- # aeskeygenassist $0x10, %xmm2, %xmm1 # round 5
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x10
+ aeskeygenassist $0x10, %xmm2, %xmm1 # round 5
call _key_expansion_192a
- # aeskeygenassist $0x20, %xmm2, %xmm1 # round 6
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x20
+ aeskeygenassist $0x20, %xmm2, %xmm1 # round 6
call _key_expansion_192b
- # aeskeygenassist $0x40, %xmm2, %xmm1 # round 7
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x40
+ aeskeygenassist $0x40, %xmm2, %xmm1 # round 7
call _key_expansion_192a
- # aeskeygenassist $0x80, %xmm2, %xmm1 # round 8
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x80
+ aeskeygenassist $0x80, %xmm2, %xmm1 # round 8
call _key_expansion_192b
jmp .Ldec_key
.Lenc_key128:
- # aeskeygenassist $0x1, %xmm0, %xmm1 # round 1
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x01
+ aeskeygenassist $0x1, %xmm0, %xmm1 # round 1
call _key_expansion_128
- # aeskeygenassist $0x2, %xmm0, %xmm1 # round 2
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x02
+ aeskeygenassist $0x2, %xmm0, %xmm1 # round 2
call _key_expansion_128
- # aeskeygenassist $0x4, %xmm0, %xmm1 # round 3
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x04
+ aeskeygenassist $0x4, %xmm0, %xmm1 # round 3
call _key_expansion_128
- # aeskeygenassist $0x8, %xmm0, %xmm1 # round 4
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x08
+ aeskeygenassist $0x8, %xmm0, %xmm1 # round 4
call _key_expansion_128
- # aeskeygenassist $0x10, %xmm0, %xmm1 # round 5
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x10
+ aeskeygenassist $0x10, %xmm0, %xmm1 # round 5
call _key_expansion_128
- # aeskeygenassist $0x20, %xmm0, %xmm1 # round 6
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x20
+ aeskeygenassist $0x20, %xmm0, %xmm1 # round 6
call _key_expansion_128
- # aeskeygenassist $0x40, %xmm0, %xmm1 # round 7
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x40
+ aeskeygenassist $0x40, %xmm0, %xmm1 # round 7
call _key_expansion_128
- # aeskeygenassist $0x80, %xmm0, %xmm1 # round 8
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x80
+ aeskeygenassist $0x80, %xmm0, %xmm1 # round 8
call _key_expansion_128
- # aeskeygenassist $0x1b, %xmm0, %xmm1 # round 9
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x1b
+ aeskeygenassist $0x1b, %xmm0, %xmm1 # round 9
call _key_expansion_128
- # aeskeygenassist $0x36, %xmm0, %xmm1 # round 10
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x36
+ aeskeygenassist $0x36, %xmm0, %xmm1 # round 10
call _key_expansion_128
.Ldec_key:
- sub $0x10, %rcx
- movaps (%rdi), %xmm0
- movaps (%rcx), %xmm1
- movaps %xmm0, 240(%rcx)
- movaps %xmm1, 240(%rdi)
- add $0x10, %rdi
- lea 240-16(%rcx), %rsi
+ sub $0x10, TKEYP
+ movaps (KEYP), %xmm0
+ movaps (TKEYP), %xmm1
+ movaps %xmm0, 240(TKEYP)
+ movaps %xmm1, 240(KEYP)
+ add $0x10, KEYP
+ lea 240-16(TKEYP), UKEYP
.align 4
.Ldec_key_loop:
- movaps (%rdi), %xmm0
- # aesimc %xmm0, %xmm1
- .byte 0x66, 0x0f, 0x38, 0xdb, 0xc8
- movaps %xmm1, (%rsi)
- add $0x10, %rdi
- sub $0x10, %rsi
- cmp %rcx, %rdi
+ movaps (KEYP), %xmm0
+ aesimc %xmm0, %xmm1
+ movaps %xmm1, (UKEYP)
+ add $0x10, KEYP
+ sub $0x10, UKEYP
+ cmp TKEYP, KEYP
jb .Ldec_key_loop
- xor %rax, %rax
- ret
+#ifndef __x86_64__
+ popl KEYP
+#endif
+ FRAME_END
+ RET
+SYM_FUNC_END(aesni_set_key)
/*
- * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
+ * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
*/
-ENTRY(aesni_enc)
+SYM_FUNC_START(aesni_enc)
+ FRAME_BEGIN
+#ifndef __x86_64__
+ pushl KEYP
+ pushl KLEN
+ movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+16)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+20)(%esp), INP # src
+#endif
movl 480(KEYP), KLEN # key length
movups (INP), STATE # input
call _aesni_enc1
movups STATE, (OUTP) # output
- ret
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+#endif
+ FRAME_END
+ RET
+SYM_FUNC_END(aesni_enc)
/*
* _aesni_enc1: internal ABI
@@ -263,7 +283,7 @@ ENTRY(aesni_enc)
* KEY
* TKEYP (T1)
*/
-_aesni_enc1:
+SYM_FUNC_START_LOCAL(_aesni_enc1)
movaps (KEYP), KEY # key
mov KEYP, TKEYP
pxor KEY, STATE # round 0
@@ -274,52 +294,39 @@ _aesni_enc1:
je .Lenc192
add $0x20, TKEYP
movaps -0x60(TKEYP), KEY
- # aesenc KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ aesenc KEY, STATE
movaps -0x50(TKEYP), KEY
- # aesenc KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ aesenc KEY, STATE
.align 4
.Lenc192:
movaps -0x40(TKEYP), KEY
- # aesenc KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ aesenc KEY, STATE
movaps -0x30(TKEYP), KEY
- # aesenc KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ aesenc KEY, STATE
.align 4
.Lenc128:
movaps -0x20(TKEYP), KEY
- # aesenc KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ aesenc KEY, STATE
movaps -0x10(TKEYP), KEY
- # aesenc KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ aesenc KEY, STATE
movaps (TKEYP), KEY
- # aesenc KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ aesenc KEY, STATE
movaps 0x10(TKEYP), KEY
- # aesenc KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ aesenc KEY, STATE
movaps 0x20(TKEYP), KEY
- # aesenc KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ aesenc KEY, STATE
movaps 0x30(TKEYP), KEY
- # aesenc KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ aesenc KEY, STATE
movaps 0x40(TKEYP), KEY
- # aesenc KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ aesenc KEY, STATE
movaps 0x50(TKEYP), KEY
- # aesenc KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ aesenc KEY, STATE
movaps 0x60(TKEYP), KEY
- # aesenc KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ aesenc KEY, STATE
movaps 0x70(TKEYP), KEY
- # aesenclast KEY, STATE # last round
- .byte 0x66, 0x0f, 0x38, 0xdd, 0xc2
- ret
+ aesenclast KEY, STATE
+ RET
+SYM_FUNC_END(_aesni_enc1)
/*
* _aesni_enc4: internal ABI
@@ -339,7 +346,7 @@ _aesni_enc1:
* KEY
* TKEYP (T1)
*/
-_aesni_enc4:
+SYM_FUNC_START_LOCAL(_aesni_enc4)
movaps (KEYP), KEY # key
mov KEYP, TKEYP
pxor KEY, STATE1 # round 0
@@ -353,147 +360,106 @@ _aesni_enc4:
je .L4enc192
add $0x20, TKEYP
movaps -0x60(TKEYP), KEY
- # aesenc KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
- # aesenc KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
- # aesenc KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
- # aesenc KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+ aesenc KEY, STATE1
+ aesenc KEY, STATE2
+ aesenc KEY, STATE3
+ aesenc KEY, STATE4
movaps -0x50(TKEYP), KEY
- # aesenc KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
- # aesenc KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
- # aesenc KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
- # aesenc KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+ aesenc KEY, STATE1
+ aesenc KEY, STATE2
+ aesenc KEY, STATE3
+ aesenc KEY, STATE4
#.align 4
.L4enc192:
movaps -0x40(TKEYP), KEY
- # aesenc KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
- # aesenc KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
- # aesenc KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
- # aesenc KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+ aesenc KEY, STATE1
+ aesenc KEY, STATE2
+ aesenc KEY, STATE3
+ aesenc KEY, STATE4
movaps -0x30(TKEYP), KEY
- # aesenc KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
- # aesenc KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
- # aesenc KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
- # aesenc KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+ aesenc KEY, STATE1
+ aesenc KEY, STATE2
+ aesenc KEY, STATE3
+ aesenc KEY, STATE4
#.align 4
.L4enc128:
movaps -0x20(TKEYP), KEY
- # aesenc KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
- # aesenc KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
- # aesenc KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
- # aesenc KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+ aesenc KEY, STATE1
+ aesenc KEY, STATE2
+ aesenc KEY, STATE3
+ aesenc KEY, STATE4
movaps -0x10(TKEYP), KEY
- # aesenc KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
- # aesenc KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
- # aesenc KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
- # aesenc KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+ aesenc KEY, STATE1
+ aesenc KEY, STATE2
+ aesenc KEY, STATE3
+ aesenc KEY, STATE4
movaps (TKEYP), KEY
- # aesenc KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
- # aesenc KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
- # aesenc KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
- # aesenc KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+ aesenc KEY, STATE1
+ aesenc KEY, STATE2
+ aesenc KEY, STATE3
+ aesenc KEY, STATE4
movaps 0x10(TKEYP), KEY
- # aesenc KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
- # aesenc KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
- # aesenc KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
- # aesenc KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+ aesenc KEY, STATE1
+ aesenc KEY, STATE2
+ aesenc KEY, STATE3
+ aesenc KEY, STATE4
movaps 0x20(TKEYP), KEY
- # aesenc KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
- # aesenc KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
- # aesenc KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
- # aesenc KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+ aesenc KEY, STATE1
+ aesenc KEY, STATE2
+ aesenc KEY, STATE3
+ aesenc KEY, STATE4
movaps 0x30(TKEYP), KEY
- # aesenc KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
- # aesenc KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
- # aesenc KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
- # aesenc KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+ aesenc KEY, STATE1
+ aesenc KEY, STATE2
+ aesenc KEY, STATE3
+ aesenc KEY, STATE4
movaps 0x40(TKEYP), KEY
- # aesenc KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
- # aesenc KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
- # aesenc KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
- # aesenc KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+ aesenc KEY, STATE1
+ aesenc KEY, STATE2
+ aesenc KEY, STATE3
+ aesenc KEY, STATE4
movaps 0x50(TKEYP), KEY
- # aesenc KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
- # aesenc KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
- # aesenc KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
- # aesenc KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+ aesenc KEY, STATE1
+ aesenc KEY, STATE2
+ aesenc KEY, STATE3
+ aesenc KEY, STATE4
movaps 0x60(TKEYP), KEY
- # aesenc KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
- # aesenc KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
- # aesenc KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
- # aesenc KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+ aesenc KEY, STATE1
+ aesenc KEY, STATE2
+ aesenc KEY, STATE3
+ aesenc KEY, STATE4
movaps 0x70(TKEYP), KEY
- # aesenclast KEY, STATE1 # last round
- .byte 0x66, 0x0f, 0x38, 0xdd, 0xc2
- # aesenclast KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xdd, 0xe2
- # aesenclast KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xdd, 0xea
- # aesenclast KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xdd, 0xf2
- ret
+ aesenclast KEY, STATE1 # last round
+ aesenclast KEY, STATE2
+ aesenclast KEY, STATE3
+ aesenclast KEY, STATE4
+ RET
+SYM_FUNC_END(_aesni_enc4)
/*
- * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
+ * void aesni_dec (const void *ctx, u8 *dst, const u8 *src)
*/
-ENTRY(aesni_dec)
+SYM_FUNC_START(aesni_dec)
+ FRAME_BEGIN
+#ifndef __x86_64__
+ pushl KEYP
+ pushl KLEN
+ movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+16)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+20)(%esp), INP # src
+#endif
mov 480(KEYP), KLEN # key length
add $240, KEYP
movups (INP), STATE # input
call _aesni_dec1
movups STATE, (OUTP) #output
- ret
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+#endif
+ FRAME_END
+ RET
+SYM_FUNC_END(aesni_dec)
/*
* _aesni_dec1: internal ABI
@@ -507,7 +473,7 @@ ENTRY(aesni_dec)
* KEY
* TKEYP (T1)
*/
-_aesni_dec1:
+SYM_FUNC_START_LOCAL(_aesni_dec1)
movaps (KEYP), KEY # key
mov KEYP, TKEYP
pxor KEY, STATE # round 0
@@ -518,52 +484,39 @@ _aesni_dec1:
je .Ldec192
add $0x20, TKEYP
movaps -0x60(TKEYP), KEY
- # aesdec KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ aesdec KEY, STATE
movaps -0x50(TKEYP), KEY
- # aesdec KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ aesdec KEY, STATE
.align 4
.Ldec192:
movaps -0x40(TKEYP), KEY
- # aesdec KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ aesdec KEY, STATE
movaps -0x30(TKEYP), KEY
- # aesdec KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ aesdec KEY, STATE
.align 4
.Ldec128:
movaps -0x20(TKEYP), KEY
- # aesdec KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ aesdec KEY, STATE
movaps -0x10(TKEYP), KEY
- # aesdec KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ aesdec KEY, STATE
movaps (TKEYP), KEY
- # aesdec KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ aesdec KEY, STATE
movaps 0x10(TKEYP), KEY
- # aesdec KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ aesdec KEY, STATE
movaps 0x20(TKEYP), KEY
- # aesdec KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ aesdec KEY, STATE
movaps 0x30(TKEYP), KEY
- # aesdec KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ aesdec KEY, STATE
movaps 0x40(TKEYP), KEY
- # aesdec KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ aesdec KEY, STATE
movaps 0x50(TKEYP), KEY
- # aesdec KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ aesdec KEY, STATE
movaps 0x60(TKEYP), KEY
- # aesdec KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ aesdec KEY, STATE
movaps 0x70(TKEYP), KEY
- # aesdeclast KEY, STATE # last round
- .byte 0x66, 0x0f, 0x38, 0xdf, 0xc2
- ret
+ aesdeclast KEY, STATE
+ RET
+SYM_FUNC_END(_aesni_dec1)
/*
* _aesni_dec4: internal ABI
@@ -583,7 +536,7 @@ _aesni_dec1:
* KEY
* TKEYP (T1)
*/
-_aesni_dec4:
+SYM_FUNC_START_LOCAL(_aesni_dec4)
movaps (KEYP), KEY # key
mov KEYP, TKEYP
pxor KEY, STATE1 # round 0
@@ -597,142 +550,97 @@ _aesni_dec4:
je .L4dec192
add $0x20, TKEYP
movaps -0x60(TKEYP), KEY
- # aesdec KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
- # aesdec KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
- # aesdec KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xde, 0xea
- # aesdec KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+ aesdec KEY, STATE1
+ aesdec KEY, STATE2
+ aesdec KEY, STATE3
+ aesdec KEY, STATE4
movaps -0x50(TKEYP), KEY
- # aesdec KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
- # aesdec KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
- # aesdec KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xde, 0xea
- # aesdec KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+ aesdec KEY, STATE1
+ aesdec KEY, STATE2
+ aesdec KEY, STATE3
+ aesdec KEY, STATE4
.align 4
.L4dec192:
movaps -0x40(TKEYP), KEY
- # aesdec KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
- # aesdec KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
- # aesdec KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xde, 0xea
- # aesdec KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+ aesdec KEY, STATE1
+ aesdec KEY, STATE2
+ aesdec KEY, STATE3
+ aesdec KEY, STATE4
movaps -0x30(TKEYP), KEY
- # aesdec KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
- # aesdec KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
- # aesdec KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xde, 0xea
- # aesdec KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+ aesdec KEY, STATE1
+ aesdec KEY, STATE2
+ aesdec KEY, STATE3
+ aesdec KEY, STATE4
.align 4
.L4dec128:
movaps -0x20(TKEYP), KEY
- # aesdec KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
- # aesdec KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
- # aesdec KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xde, 0xea
- # aesdec KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+ aesdec KEY, STATE1
+ aesdec KEY, STATE2
+ aesdec KEY, STATE3
+ aesdec KEY, STATE4
movaps -0x10(TKEYP), KEY
- # aesdec KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
- # aesdec KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
- # aesdec KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xde, 0xea
- # aesdec KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+ aesdec KEY, STATE1
+ aesdec KEY, STATE2
+ aesdec KEY, STATE3
+ aesdec KEY, STATE4
movaps (TKEYP), KEY
- # aesdec KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
- # aesdec KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
- # aesdec KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xde, 0xea
- # aesdec KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+ aesdec KEY, STATE1
+ aesdec KEY, STATE2
+ aesdec KEY, STATE3
+ aesdec KEY, STATE4
movaps 0x10(TKEYP), KEY
- # aesdec KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
- # aesdec KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
- # aesdec KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xde, 0xea
- # aesdec KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+ aesdec KEY, STATE1
+ aesdec KEY, STATE2
+ aesdec KEY, STATE3
+ aesdec KEY, STATE4
movaps 0x20(TKEYP), KEY
- # aesdec KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
- # aesdec KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
- # aesdec KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xde, 0xea
- # aesdec KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+ aesdec KEY, STATE1
+ aesdec KEY, STATE2
+ aesdec KEY, STATE3
+ aesdec KEY, STATE4
movaps 0x30(TKEYP), KEY
- # aesdec KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
- # aesdec KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
- # aesdec KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xde, 0xea
- # aesdec KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+ aesdec KEY, STATE1
+ aesdec KEY, STATE2
+ aesdec KEY, STATE3
+ aesdec KEY, STATE4
movaps 0x40(TKEYP), KEY
- # aesdec KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
- # aesdec KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
- # aesdec KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xde, 0xea
- # aesdec KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+ aesdec KEY, STATE1
+ aesdec KEY, STATE2
+ aesdec KEY, STATE3
+ aesdec KEY, STATE4
movaps 0x50(TKEYP), KEY
- # aesdec KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
- # aesdec KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
- # aesdec KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xde, 0xea
- # aesdec KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+ aesdec KEY, STATE1
+ aesdec KEY, STATE2
+ aesdec KEY, STATE3
+ aesdec KEY, STATE4
movaps 0x60(TKEYP), KEY
- # aesdec KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
- # aesdec KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
- # aesdec KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xde, 0xea
- # aesdec KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+ aesdec KEY, STATE1
+ aesdec KEY, STATE2
+ aesdec KEY, STATE3
+ aesdec KEY, STATE4
movaps 0x70(TKEYP), KEY
- # aesdeclast KEY, STATE1 # last round
- .byte 0x66, 0x0f, 0x38, 0xdf, 0xc2
- # aesdeclast KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xdf, 0xe2
- # aesdeclast KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xdf, 0xea
- # aesdeclast KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xdf, 0xf2
- ret
+ aesdeclast KEY, STATE1 # last round
+ aesdeclast KEY, STATE2
+ aesdeclast KEY, STATE3
+ aesdeclast KEY, STATE4
+ RET
+SYM_FUNC_END(_aesni_dec4)
/*
* void aesni_ecb_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
* size_t len)
*/
-ENTRY(aesni_ecb_enc)
+SYM_FUNC_START(aesni_ecb_enc)
+ FRAME_BEGIN
+#ifndef __x86_64__
+ pushl LEN
+ pushl KEYP
+ pushl KLEN
+ movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+20)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+24)(%esp), INP # src
+ movl (FRAME_OFFSET+28)(%esp), LEN # len
+#endif
test LEN, LEN # check length
jz .Lecb_enc_ret
mov 480(KEYP), KLEN
@@ -769,13 +677,30 @@ ENTRY(aesni_ecb_enc)
cmp $16, LEN
jge .Lecb_enc_loop1
.Lecb_enc_ret:
- ret
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+ popl LEN
+#endif
+ FRAME_END
+ RET
+SYM_FUNC_END(aesni_ecb_enc)
/*
* void aesni_ecb_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
* size_t len);
*/
-ENTRY(aesni_ecb_dec)
+SYM_FUNC_START(aesni_ecb_dec)
+ FRAME_BEGIN
+#ifndef __x86_64__
+ pushl LEN
+ pushl KEYP
+ pushl KLEN
+ movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+20)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+24)(%esp), INP # src
+ movl (FRAME_OFFSET+28)(%esp), LEN # len
+#endif
test LEN, LEN
jz .Lecb_dec_ret
mov 480(KEYP), KLEN
@@ -813,13 +738,32 @@ ENTRY(aesni_ecb_dec)
cmp $16, LEN
jge .Lecb_dec_loop1
.Lecb_dec_ret:
- ret
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+ popl LEN
+#endif
+ FRAME_END
+ RET
+SYM_FUNC_END(aesni_ecb_dec)
/*
* void aesni_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
* size_t len, u8 *iv)
*/
-ENTRY(aesni_cbc_enc)
+SYM_FUNC_START(aesni_cbc_enc)
+ FRAME_BEGIN
+#ifndef __x86_64__
+ pushl IVP
+ pushl LEN
+ pushl KEYP
+ pushl KLEN
+ movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+24)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+28)(%esp), INP # src
+ movl (FRAME_OFFSET+32)(%esp), LEN # len
+ movl (FRAME_OFFSET+36)(%esp), IVP # iv
+#endif
cmp $16, LEN
jb .Lcbc_enc_ret
mov 480(KEYP), KLEN
@@ -837,13 +781,33 @@ ENTRY(aesni_cbc_enc)
jge .Lcbc_enc_loop
movups STATE, (IVP)
.Lcbc_enc_ret:
- ret
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+ popl LEN
+ popl IVP
+#endif
+ FRAME_END
+ RET
+SYM_FUNC_END(aesni_cbc_enc)
/*
* void aesni_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
* size_t len, u8 *iv)
*/
-ENTRY(aesni_cbc_dec)
+SYM_FUNC_START(aesni_cbc_dec)
+ FRAME_BEGIN
+#ifndef __x86_64__
+ pushl IVP
+ pushl LEN
+ pushl KEYP
+ pushl KLEN
+ movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+24)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+28)(%esp), INP # src
+ movl (FRAME_OFFSET+32)(%esp), LEN # len
+ movl (FRAME_OFFSET+36)(%esp), IVP # iv
+#endif
cmp $16, LEN
jb .Lcbc_dec_just_ret
mov 480(KEYP), KLEN
@@ -857,16 +821,32 @@ ENTRY(aesni_cbc_dec)
movaps IN1, STATE1
movups 0x10(INP), IN2
movaps IN2, STATE2
+#ifdef __x86_64__
movups 0x20(INP), IN3
movaps IN3, STATE3
movups 0x30(INP), IN4
movaps IN4, STATE4
+#else
+ movups 0x20(INP), IN1
+ movaps IN1, STATE3
+ movups 0x30(INP), IN2
+ movaps IN2, STATE4
+#endif
call _aesni_dec4
pxor IV, STATE1
+#ifdef __x86_64__
pxor IN1, STATE2
pxor IN2, STATE3
pxor IN3, STATE4
movaps IN4, IV
+#else
+ pxor IN1, STATE4
+ movaps IN2, IV
+ movups (INP), IN1
+ pxor IN1, STATE2
+ movups 0x10(INP), IN2
+ pxor IN2, STATE3
+#endif
movups STATE1, (OUTP)
movups STATE2, 0x10(OUTP)
movups STATE3, 0x20(OUTP)
@@ -894,4 +874,490 @@ ENTRY(aesni_cbc_dec)
.Lcbc_dec_ret:
movups IV, (IVP)
.Lcbc_dec_just_ret:
- ret
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+ popl LEN
+ popl IVP
+#endif
+ FRAME_END
+ RET
+SYM_FUNC_END(aesni_cbc_dec)
+
+/*
+ * void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ * size_t len, u8 *iv)
+ */
+SYM_FUNC_START(aesni_cts_cbc_enc)
+ FRAME_BEGIN
+#ifndef __x86_64__
+ pushl IVP
+ pushl LEN
+ pushl KEYP
+ pushl KLEN
+ movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+24)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+28)(%esp), INP # src
+ movl (FRAME_OFFSET+32)(%esp), LEN # len
+ movl (FRAME_OFFSET+36)(%esp), IVP # iv
+ lea .Lcts_permute_table, T1
+#else
+ lea .Lcts_permute_table(%rip), T1
+#endif
+ mov 480(KEYP), KLEN
+ movups (IVP), STATE
+ sub $16, LEN
+ mov T1, IVP
+ add $32, IVP
+ add LEN, T1
+ sub LEN, IVP
+ movups (T1), %xmm4
+ movups (IVP), %xmm5
+
+ movups (INP), IN1
+ add LEN, INP
+ movups (INP), IN2
+
+ pxor IN1, STATE
+ call _aesni_enc1
+
+ pshufb %xmm5, IN2
+ pxor STATE, IN2
+ pshufb %xmm4, STATE
+ add OUTP, LEN
+ movups STATE, (LEN)
+
+ movaps IN2, STATE
+ call _aesni_enc1
+ movups STATE, (OUTP)
+
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+ popl LEN
+ popl IVP
+#endif
+ FRAME_END
+ RET
+SYM_FUNC_END(aesni_cts_cbc_enc)
+
+/*
+ * void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ * size_t len, u8 *iv)
+ */
+SYM_FUNC_START(aesni_cts_cbc_dec)
+ FRAME_BEGIN
+#ifndef __x86_64__
+ pushl IVP
+ pushl LEN
+ pushl KEYP
+ pushl KLEN
+ movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+24)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+28)(%esp), INP # src
+ movl (FRAME_OFFSET+32)(%esp), LEN # len
+ movl (FRAME_OFFSET+36)(%esp), IVP # iv
+ lea .Lcts_permute_table, T1
+#else
+ lea .Lcts_permute_table(%rip), T1
+#endif
+ mov 480(KEYP), KLEN
+ add $240, KEYP
+ movups (IVP), IV
+ sub $16, LEN
+ mov T1, IVP
+ add $32, IVP
+ add LEN, T1
+ sub LEN, IVP
+ movups (T1), %xmm4
+
+ movups (INP), STATE
+ add LEN, INP
+ movups (INP), IN1
+
+ call _aesni_dec1
+ movaps STATE, IN2
+ pshufb %xmm4, STATE
+ pxor IN1, STATE
+
+ add OUTP, LEN
+ movups STATE, (LEN)
+
+ movups (IVP), %xmm0
+ pshufb %xmm0, IN1
+ pblendvb IN2, IN1
+ movaps IN1, STATE
+ call _aesni_dec1
+
+ pxor IV, STATE
+ movups STATE, (OUTP)
+
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+ popl LEN
+ popl IVP
+#endif
+ FRAME_END
+ RET
+SYM_FUNC_END(aesni_cts_cbc_dec)
+
+.pushsection .rodata
+.align 16
+.Lcts_permute_table:
+ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+#ifdef __x86_64__
+.Lbswap_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+#endif
+.popsection
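
The permute table above drives the pshufb-based byte moves in the two CTS routines: selector bytes 0x00-0x0f pick source bytes, while 0x80 (high bit set) zeroes the destination byte, so 16-byte loads at table+LEN and table+32-LEN yield the shift and zero-pad masks for a LEN-byte tail. For reference, a hedged C sketch of the CS3 ciphertext-stealing step the encrypt routine implements, with aes_encrypt_block() as a hypothetical single-block encrypt:

/*
 * Illustrative CBC-CS3 tail handling for the last 16+tail plaintext
 * bytes (tail in 1..15); iv is the current CBC chain value.
 */
#include <stddef.h>
#include <string.h>

void aes_encrypt_block(const void *key, unsigned char out[16],
		       const unsigned char in[16]);

static void cts_cbc_enc_tail(const void *key, unsigned char *dst,
			     const unsigned char *src, size_t tail,
			     unsigned char iv[16])
{
	unsigned char c_full[16], last[16] = { 0 };

	/* CBC-encrypt the last full block. */
	for (int i = 0; i < 16; i++)
		c_full[i] = src[i] ^ iv[i];
	aes_encrypt_block(key, c_full, c_full);

	/* Zero-pad the tail, XOR with that ciphertext, encrypt again. */
	memcpy(last, src + 16, tail);
	for (int i = 0; i < 16; i++)
		last[i] ^= c_full[i];
	aes_encrypt_block(key, last, last);

	/* CS3 swap: full final block first, truncated block last. */
	memcpy(dst, last, 16);
	memcpy(dst + 16, c_full, tail);
}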
+
+#ifdef __x86_64__
+/*
+ * _aesni_inc_init: internal ABI
+ * set up the registers used by _aesni_inc
+ * input:
+ * IV
+ * output:
+ * CTR: == IV, in little endian
+ * TCTR_LOW: == lower qword of CTR
+ * INC: == 1, in little endian
+ * BSWAP_MASK == endian swapping mask
+ */
+SYM_FUNC_START_LOCAL(_aesni_inc_init)
+ movaps .Lbswap_mask(%rip), BSWAP_MASK
+ movaps IV, CTR
+ pshufb BSWAP_MASK, CTR
+ mov $1, TCTR_LOW
+ movq TCTR_LOW, INC
+ movq CTR, TCTR_LOW
+ RET
+SYM_FUNC_END(_aesni_inc_init)
+
+/*
+ * _aesni_inc: internal ABI
+ * Increment IV by 1; IV is in big endian
+ * input:
+ * IV
+ * CTR: == IV, in little endian
+ * TCTR_LOW: == lower qword of CTR
+ * INC: == 1, in little endian
+ * BSWAP_MASK == endian swapping mask
+ * output:
+ * IV: incremented by 1
+ * changed:
+ * CTR: == output IV, in little endian
+ * TCTR_LOW: == lower qword of CTR
+ */
+SYM_FUNC_START_LOCAL(_aesni_inc)
+ paddq INC, CTR
+ add $1, TCTR_LOW
+ jnc .Linc_low
+ pslldq $8, INC
+ paddq INC, CTR
+ psrldq $8, INC
+.Linc_low:
+ movaps CTR, IV
+ pshufb BSWAP_MASK, IV
+ RET
+SYM_FUNC_END(_aesni_inc)
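
_aesni_inc keeps the counter in little-endian form (CTR) and only byte-swaps when producing IV, handling the low-qword carry with the pslldq/paddq/psrldq sequence. A hedged C equivalent of the increment, operating directly on the big-endian IV:

/* Treat the 16-byte IV as a 128-bit big-endian counter and add 1. */
#include <stdint.h>

static void ctr128_inc_be(uint8_t iv[16])
{
	uint64_t lo = 0, hi = 0;
	int i;

	for (i = 0; i < 8; i++) {		/* big-endian load */
		hi = (hi << 8) | iv[i];
		lo = (lo << 8) | iv[8 + i];
	}
	if (++lo == 0)				/* carry into the high qword */
		hi++;
	for (i = 7; i >= 0; i--) {		/* big-endian store */
		iv[8 + i] = (uint8_t)lo; lo >>= 8;
		iv[i] = (uint8_t)hi;     hi >>= 8;
	}
}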
+
+/*
+ * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
+ * size_t len, u8 *iv)
+ */
+SYM_FUNC_START(aesni_ctr_enc)
+ ANNOTATE_NOENDBR
+ FRAME_BEGIN
+ cmp $16, LEN
+ jb .Lctr_enc_just_ret
+ mov 480(KEYP), KLEN
+ movups (IVP), IV
+ call _aesni_inc_init
+ cmp $64, LEN
+ jb .Lctr_enc_loop1
+.align 4
+.Lctr_enc_loop4:
+ movaps IV, STATE1
+ call _aesni_inc
+ movups (INP), IN1
+ movaps IV, STATE2
+ call _aesni_inc
+ movups 0x10(INP), IN2
+ movaps IV, STATE3
+ call _aesni_inc
+ movups 0x20(INP), IN3
+ movaps IV, STATE4
+ call _aesni_inc
+ movups 0x30(INP), IN4
+ call _aesni_enc4
+ pxor IN1, STATE1
+ movups STATE1, (OUTP)
+ pxor IN2, STATE2
+ movups STATE2, 0x10(OUTP)
+ pxor IN3, STATE3
+ movups STATE3, 0x20(OUTP)
+ pxor IN4, STATE4
+ movups STATE4, 0x30(OUTP)
+ sub $64, LEN
+ add $64, INP
+ add $64, OUTP
+ cmp $64, LEN
+ jge .Lctr_enc_loop4
+ cmp $16, LEN
+ jb .Lctr_enc_ret
+.align 4
+.Lctr_enc_loop1:
+ movaps IV, STATE
+ call _aesni_inc
+ movups (INP), IN
+ call _aesni_enc1
+ pxor IN, STATE
+ movups STATE, (OUTP)
+ sub $16, LEN
+ add $16, INP
+ add $16, OUTP
+ cmp $16, LEN
+ jge .Lctr_enc_loop1
+.Lctr_enc_ret:
+ movups IV, (IVP)
+.Lctr_enc_just_ret:
+ FRAME_END
+ RET
+SYM_FUNC_END(aesni_ctr_enc)
+
+#endif
+
+.section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
+.align 16
+.Lgf128mul_x_ble_mask:
+ .octa 0x00000000000000010000000000000087
+.previous
+
+/*
+ * _aesni_gf128mul_x_ble: Multiply in GF(2^128) for XTS IVs
+ * input:
+ * IV: current IV
+ * GF128MUL_MASK == mask with 0x87 and 0x01
+ * output:
+ * IV: next IV
+ * changed:
+ * KEY: == temporary value
+ */
+.macro _aesni_gf128mul_x_ble
+ pshufd $0x13, IV, KEY
+ paddq IV, IV
+ psrad $31, KEY
+ pand GF128MUL_MASK, KEY
+ pxor KEY, IV
+.endm
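
A hedged C model of this macro may help: the tweak is treated as a 128-bit little-endian value, doubled in GF(2^128), and reduced with the polynomial x^128 + x^7 + x^2 + x + 1 (the 0x87 in GF128MUL_MASK) when a bit shifts out of the top:

#include <stdint.h>
#include <string.h>

static void gf128mul_x_ble_model(uint8_t t[16])
{
	uint64_t lo, hi;

	memcpy(&lo, t, 8);		/* assumes a little-endian host */
	memcpy(&hi, t + 8, 8);

	uint64_t carry = hi >> 63;	/* bit that shifts out of bit 127 */
	hi = (hi << 1) | (lo >> 63);
	lo = (lo << 1) ^ (carry * 0x87);	/* conditional reduction */

	memcpy(t, &lo, 8);
	memcpy(t + 8, &hi, 8);
}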
+
+.macro _aesni_xts_crypt enc
+ FRAME_BEGIN
+#ifndef __x86_64__
+ pushl IVP
+ pushl LEN
+ pushl KEYP
+ pushl KLEN
+ movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+24)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+28)(%esp), INP # src
+ movl (FRAME_OFFSET+32)(%esp), LEN # len
+ movl (FRAME_OFFSET+36)(%esp), IVP # iv
+ movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
+#else
+ movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
+#endif
+ movups (IVP), IV
+
+ mov 480(KEYP), KLEN
+.if !\enc
+ add $240, KEYP
+
+ test $15, LEN
+ jz .Lxts_loop4\@
+ sub $16, LEN
+.endif
+
+.Lxts_loop4\@:
+ sub $64, LEN
+ jl .Lxts_1x\@
+
+ movdqa IV, STATE1
+ movdqu 0x00(INP), IN
+ pxor IN, STATE1
+ movdqu IV, 0x00(OUTP)
+
+ _aesni_gf128mul_x_ble
+ movdqa IV, STATE2
+ movdqu 0x10(INP), IN
+ pxor IN, STATE2
+ movdqu IV, 0x10(OUTP)
+
+ _aesni_gf128mul_x_ble
+ movdqa IV, STATE3
+ movdqu 0x20(INP), IN
+ pxor IN, STATE3
+ movdqu IV, 0x20(OUTP)
+
+ _aesni_gf128mul_x_ble
+ movdqa IV, STATE4
+ movdqu 0x30(INP), IN
+ pxor IN, STATE4
+ movdqu IV, 0x30(OUTP)
+
+.if \enc
+ call _aesni_enc4
+.else
+ call _aesni_dec4
+.endif
+
+ movdqu 0x00(OUTP), IN
+ pxor IN, STATE1
+ movdqu STATE1, 0x00(OUTP)
+
+ movdqu 0x10(OUTP), IN
+ pxor IN, STATE2
+ movdqu STATE2, 0x10(OUTP)
+
+ movdqu 0x20(OUTP), IN
+ pxor IN, STATE3
+ movdqu STATE3, 0x20(OUTP)
+
+ movdqu 0x30(OUTP), IN
+ pxor IN, STATE4
+ movdqu STATE4, 0x30(OUTP)
+
+ _aesni_gf128mul_x_ble
+
+ add $64, INP
+ add $64, OUTP
+ test LEN, LEN
+ jnz .Lxts_loop4\@
+
+.Lxts_ret_iv\@:
+ movups IV, (IVP)
+
+.Lxts_ret\@:
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+ popl LEN
+ popl IVP
+#endif
+ FRAME_END
+ RET
+
+.Lxts_1x\@:
+ add $64, LEN
+ jz .Lxts_ret_iv\@
+.if \enc
+ sub $16, LEN
+ jl .Lxts_cts4\@
+.endif
+
+.Lxts_loop1\@:
+ movdqu (INP), STATE
+.if \enc
+ pxor IV, STATE
+ call _aesni_enc1
+.else
+ add $16, INP
+ sub $16, LEN
+ jl .Lxts_cts1\@
+ pxor IV, STATE
+ call _aesni_dec1
+.endif
+ pxor IV, STATE
+ _aesni_gf128mul_x_ble
+
+ test LEN, LEN
+ jz .Lxts_out\@
+
+.if \enc
+ add $16, INP
+ sub $16, LEN
+ jl .Lxts_cts1\@
+.endif
+
+ movdqu STATE, (OUTP)
+ add $16, OUTP
+ jmp .Lxts_loop1\@
+
+.Lxts_out\@:
+ movdqu STATE, (OUTP)
+ jmp .Lxts_ret_iv\@
+
+.if \enc
+.Lxts_cts4\@:
+ movdqa STATE4, STATE
+ sub $16, OUTP
+.Lxts_cts1\@:
+.else
+.Lxts_cts1\@:
+ movdqa IV, STATE4
+ _aesni_gf128mul_x_ble
+
+ pxor IV, STATE
+ call _aesni_dec1
+ pxor IV, STATE
+.endif
+#ifndef __x86_64__
+ lea .Lcts_permute_table, T1
+#else
+ lea .Lcts_permute_table(%rip), T1
+#endif
+ add LEN, INP /* rewind input pointer */
+ add $16, LEN /* # bytes in final block */
+ movups (INP), IN1
+
+ mov T1, IVP
+ add $32, IVP
+ add LEN, T1
+ sub LEN, IVP
+ add OUTP, LEN
+
+ movups (T1), %xmm4
+ movaps STATE, IN2
+ pshufb %xmm4, STATE
+ movups STATE, (LEN)
+
+ movups (IVP), %xmm0
+ pshufb %xmm0, IN1
+ pblendvb IN2, IN1
+ movaps IN1, STATE
+
+.if \enc
+ pxor IV, STATE
+ call _aesni_enc1
+ pxor IV, STATE
+.else
+ pxor STATE4, STATE
+ call _aesni_dec1
+ pxor STATE4, STATE
+.endif
+
+ movups STATE, (OUTP)
+ jmp .Lxts_ret\@
+.endm
+
+/*
+ * void aesni_xts_enc(const struct crypto_aes_ctx *ctx, u8 *dst,
+ * const u8 *src, unsigned int len, le128 *iv)
+ */
+SYM_FUNC_START(aesni_xts_enc)
+ _aesni_xts_crypt 1
+SYM_FUNC_END(aesni_xts_enc)
+
+/*
+ * void aesni_xts_dec(const struct crypto_aes_ctx *ctx, u8 *dst,
+ * const u8 *src, unsigned int len, le128 *iv)
+ */
+SYM_FUNC_START(aesni_xts_dec)
+ _aesni_xts_crypt 0
+SYM_FUNC_END(aesni_xts_dec)
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index c580c5ec1cad..48405e02d6e4 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -1,55 +1,66 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Support for Intel AES-NI instructions. This file contains glue
- * code, the real AES implementation is in intel-aes_asm.S.
+ * Support for AES-NI and VAES instructions. This file contains glue code.
+ * The real AES implementations are in aesni-intel_asm.S and other .S files.
*
* Copyright (C) 2008, Intel Corp.
* Author: Huang Ying <ying.huang@intel.com>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
+ * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
+ * interface for 64-bit kernels.
+ * Authors: Adrian Hoban <adrian.hoban@intel.com>
+ * Gabriele Paoloni <gabriele.paoloni@intel.com>
+ * Tadeusz Struk (tadeusz.struk@intel.com)
+ * Aidan O'Mahony (aidan.o.mahony@intel.com)
+ * Copyright (c) 2010, Intel Corporation.
+ *
+ * Copyright 2024 Google LLC
*/
#include <linux/hardirq.h>
#include <linux/types.h>
-#include <linux/crypto.h>
+#include <linux/module.h>
#include <linux/err.h>
#include <crypto/algapi.h>
#include <crypto/aes.h>
-#include <crypto/cryptd.h>
-#include <asm/i387.h>
-#include <asm/aes.h>
-
-#if defined(CONFIG_CRYPTO_CTR) || defined(CONFIG_CRYPTO_CTR_MODULE)
-#define HAS_CTR
-#endif
+#include <crypto/b128ops.h>
+#include <crypto/gcm.h>
+#include <crypto/xts.h>
+#include <asm/cpu_device_id.h>
+#include <asm/simd.h>
+#include <crypto/scatterwalk.h>
+#include <crypto/internal/aead.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/jump_label.h>
+#include <linux/workqueue.h>
+#include <linux/spinlock.h>
+#include <linux/static_call.h>
-#if defined(CONFIG_CRYPTO_LRW) || defined(CONFIG_CRYPTO_LRW_MODULE)
-#define HAS_LRW
-#endif
-
-#if defined(CONFIG_CRYPTO_PCBC) || defined(CONFIG_CRYPTO_PCBC_MODULE)
-#define HAS_PCBC
-#endif
-#if defined(CONFIG_CRYPTO_XTS) || defined(CONFIG_CRYPTO_XTS_MODULE)
-#define HAS_XTS
-#endif
-
-struct async_aes_ctx {
- struct cryptd_ablkcipher *cryptd_tfm;
+#define AESNI_ALIGN 16
+#define AESNI_ALIGN_ATTR __attribute__ ((__aligned__(AESNI_ALIGN)))
+#define AES_BLOCK_MASK (~(AES_BLOCK_SIZE - 1))
+#define AESNI_ALIGN_EXTRA ((AESNI_ALIGN - 1) & ~(CRYPTO_MINALIGN - 1))
+#define CRYPTO_AES_CTX_SIZE (sizeof(struct crypto_aes_ctx) + AESNI_ALIGN_EXTRA)
+#define XTS_AES_CTX_SIZE (sizeof(struct aesni_xts_ctx) + AESNI_ALIGN_EXTRA)
+
+struct aesni_xts_ctx {
+ struct crypto_aes_ctx tweak_ctx AESNI_ALIGN_ATTR;
+ struct crypto_aes_ctx crypt_ctx AESNI_ALIGN_ATTR;
};
-#define AESNI_ALIGN 16
-#define AES_BLOCK_MASK (~(AES_BLOCK_SIZE-1))
-
-asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
- unsigned int key_len);
-asmlinkage void aesni_enc(struct crypto_aes_ctx *ctx, u8 *out,
- const u8 *in);
-asmlinkage void aesni_dec(struct crypto_aes_ctx *ctx, u8 *out,
- const u8 *in);
+static inline void *aes_align_addr(void *addr)
+{
+ if (crypto_tfm_ctx_alignment() >= AESNI_ALIGN)
+ return addr;
+ return PTR_ALIGN(addr, AESNI_ALIGN);
+}
+
+asmlinkage void aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
+ unsigned int key_len);
+asmlinkage void aesni_enc(const void *ctx, u8 *out, const u8 *in);
+asmlinkage void aesni_dec(const void *ctx, u8 *out, const u8 *in);
asmlinkage void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len);
asmlinkage void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *out,
@@ -58,673 +69,1671 @@ asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
+asmlinkage void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv);
+asmlinkage void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv);
-static inline int kernel_fpu_using(void)
-{
- if (in_interrupt() && !(read_cr0() & X86_CR0_TS))
- return 1;
- return 0;
-}
+asmlinkage void aesni_xts_enc(const struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv);
+
+asmlinkage void aesni_xts_dec(const struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv);
+
+#ifdef CONFIG_X86_64
+asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv);
+#endif
static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
{
- unsigned long addr = (unsigned long)raw_ctx;
- unsigned long align = AESNI_ALIGN;
+ return aes_align_addr(raw_ctx);
+}
- if (align <= crypto_tfm_ctx_alignment())
- align = 1;
- return (struct crypto_aes_ctx *)ALIGN(addr, align);
+static inline struct aesni_xts_ctx *aes_xts_ctx(struct crypto_skcipher *tfm)
+{
+ return aes_align_addr(crypto_skcipher_ctx(tfm));
}
-static int aes_set_key_common(struct crypto_tfm *tfm, void *raw_ctx,
+static int aes_set_key_common(struct crypto_aes_ctx *ctx,
const u8 *in_key, unsigned int key_len)
{
- struct crypto_aes_ctx *ctx = aes_ctx(raw_ctx);
- u32 *flags = &tfm->crt_flags;
int err;
- if (key_len != AES_KEYSIZE_128 && key_len != AES_KEYSIZE_192 &&
- key_len != AES_KEYSIZE_256) {
- *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
- return -EINVAL;
- }
+ if (!crypto_simd_usable())
+ return aes_expandkey(ctx, in_key, key_len);
- if (kernel_fpu_using())
- err = crypto_aes_expand_key(ctx, in_key, key_len);
- else {
- kernel_fpu_begin();
- err = aesni_set_key(ctx, in_key, key_len);
- kernel_fpu_end();
- }
+ err = aes_check_keylen(key_len);
+ if (err)
+ return err;
- return err;
+ kernel_fpu_begin();
+ aesni_set_key(ctx, in_key, key_len);
+ kernel_fpu_end();
+ return 0;
}
static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
unsigned int key_len)
{
- return aes_set_key_common(tfm, crypto_tfm_ctx(tfm), in_key, key_len);
+ return aes_set_key_common(aes_ctx(crypto_tfm_ctx(tfm)), in_key,
+ key_len);
}
-static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+static void aesni_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
- if (kernel_fpu_using())
- crypto_aes_encrypt_x86(ctx, dst, src);
- else {
+ if (!crypto_simd_usable()) {
+ aes_encrypt(ctx, dst, src);
+ } else {
kernel_fpu_begin();
aesni_enc(ctx, dst, src);
kernel_fpu_end();
}
}
-static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+static void aesni_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
- if (kernel_fpu_using())
- crypto_aes_decrypt_x86(ctx, dst, src);
- else {
+ if (!crypto_simd_usable()) {
+ aes_decrypt(ctx, dst, src);
+ } else {
kernel_fpu_begin();
aesni_dec(ctx, dst, src);
kernel_fpu_end();
}
}
-static struct crypto_alg aesni_alg = {
- .cra_name = "aes",
- .cra_driver_name = "aes-aesni",
- .cra_priority = 300,
- .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
- .cra_blocksize = AES_BLOCK_SIZE,
- .cra_ctxsize = sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1,
- .cra_alignmask = 0,
- .cra_module = THIS_MODULE,
- .cra_list = LIST_HEAD_INIT(aesni_alg.cra_list),
- .cra_u = {
- .cipher = {
- .cia_min_keysize = AES_MIN_KEY_SIZE,
- .cia_max_keysize = AES_MAX_KEY_SIZE,
- .cia_setkey = aes_set_key,
- .cia_encrypt = aes_encrypt,
- .cia_decrypt = aes_decrypt
- }
- }
-};
-
-static void __aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
-{
- struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
-
- aesni_enc(ctx, dst, src);
-}
-
-static void __aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+static int aesni_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int len)
{
- struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
-
- aesni_dec(ctx, dst, src);
+ return aes_set_key_common(aes_ctx(crypto_skcipher_ctx(tfm)), key, len);
}
-static struct crypto_alg __aesni_alg = {
- .cra_name = "__aes-aesni",
- .cra_driver_name = "__driver-aes-aesni",
- .cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
- .cra_blocksize = AES_BLOCK_SIZE,
- .cra_ctxsize = sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1,
- .cra_alignmask = 0,
- .cra_module = THIS_MODULE,
- .cra_list = LIST_HEAD_INIT(__aesni_alg.cra_list),
- .cra_u = {
- .cipher = {
- .cia_min_keysize = AES_MIN_KEY_SIZE,
- .cia_max_keysize = AES_MAX_KEY_SIZE,
- .cia_setkey = aes_set_key,
- .cia_encrypt = __aes_encrypt,
- .cia_decrypt = __aes_decrypt
- }
- }
-};
-
-static int ecb_encrypt(struct blkcipher_desc *desc,
- struct scatterlist *dst, struct scatterlist *src,
- unsigned int nbytes)
+static int ecb_encrypt(struct skcipher_request *req)
{
- struct crypto_aes_ctx *ctx = aes_ctx(crypto_blkcipher_ctx(desc->tfm));
- struct blkcipher_walk walk;
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
+ struct skcipher_walk walk;
+ unsigned int nbytes;
int err;
- blkcipher_walk_init(&walk, dst, src, nbytes);
- err = blkcipher_walk_virt(desc, &walk);
- desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+ err = skcipher_walk_virt(&walk, req, false);
- kernel_fpu_begin();
while ((nbytes = walk.nbytes)) {
+ kernel_fpu_begin();
aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
nbytes & AES_BLOCK_MASK);
+ kernel_fpu_end();
nbytes &= AES_BLOCK_SIZE - 1;
- err = blkcipher_walk_done(desc, &walk, nbytes);
+ err = skcipher_walk_done(&walk, nbytes);
}
- kernel_fpu_end();
return err;
}
-static int ecb_decrypt(struct blkcipher_desc *desc,
- struct scatterlist *dst, struct scatterlist *src,
- unsigned int nbytes)
+static int ecb_decrypt(struct skcipher_request *req)
{
- struct crypto_aes_ctx *ctx = aes_ctx(crypto_blkcipher_ctx(desc->tfm));
- struct blkcipher_walk walk;
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
+ struct skcipher_walk walk;
+ unsigned int nbytes;
int err;
- blkcipher_walk_init(&walk, dst, src, nbytes);
- err = blkcipher_walk_virt(desc, &walk);
- desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+ err = skcipher_walk_virt(&walk, req, false);
- kernel_fpu_begin();
while ((nbytes = walk.nbytes)) {
+ kernel_fpu_begin();
aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
nbytes & AES_BLOCK_MASK);
+ kernel_fpu_end();
nbytes &= AES_BLOCK_SIZE - 1;
- err = blkcipher_walk_done(desc, &walk, nbytes);
+ err = skcipher_walk_done(&walk, nbytes);
}
- kernel_fpu_end();
return err;
}
-static struct crypto_alg blk_ecb_alg = {
- .cra_name = "__ecb-aes-aesni",
- .cra_driver_name = "__driver-ecb-aes-aesni",
- .cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
- .cra_blocksize = AES_BLOCK_SIZE,
- .cra_ctxsize = sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1,
- .cra_alignmask = 0,
- .cra_type = &crypto_blkcipher_type,
- .cra_module = THIS_MODULE,
- .cra_list = LIST_HEAD_INIT(blk_ecb_alg.cra_list),
- .cra_u = {
- .blkcipher = {
- .min_keysize = AES_MIN_KEY_SIZE,
- .max_keysize = AES_MAX_KEY_SIZE,
- .setkey = aes_set_key,
- .encrypt = ecb_encrypt,
- .decrypt = ecb_decrypt,
- },
- },
-};
-
-static int cbc_encrypt(struct blkcipher_desc *desc,
- struct scatterlist *dst, struct scatterlist *src,
- unsigned int nbytes)
+static int cbc_encrypt(struct skcipher_request *req)
{
- struct crypto_aes_ctx *ctx = aes_ctx(crypto_blkcipher_ctx(desc->tfm));
- struct blkcipher_walk walk;
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
+ struct skcipher_walk walk;
+ unsigned int nbytes;
int err;
- blkcipher_walk_init(&walk, dst, src, nbytes);
- err = blkcipher_walk_virt(desc, &walk);
- desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+ err = skcipher_walk_virt(&walk, req, false);
- kernel_fpu_begin();
while ((nbytes = walk.nbytes)) {
+ kernel_fpu_begin();
aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
nbytes & AES_BLOCK_MASK, walk.iv);
+ kernel_fpu_end();
nbytes &= AES_BLOCK_SIZE - 1;
- err = blkcipher_walk_done(desc, &walk, nbytes);
+ err = skcipher_walk_done(&walk, nbytes);
}
- kernel_fpu_end();
return err;
}
-static int cbc_decrypt(struct blkcipher_desc *desc,
- struct scatterlist *dst, struct scatterlist *src,
- unsigned int nbytes)
+static int cbc_decrypt(struct skcipher_request *req)
{
- struct crypto_aes_ctx *ctx = aes_ctx(crypto_blkcipher_ctx(desc->tfm));
- struct blkcipher_walk walk;
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
+ struct skcipher_walk walk;
+ unsigned int nbytes;
int err;
- blkcipher_walk_init(&walk, dst, src, nbytes);
- err = blkcipher_walk_virt(desc, &walk);
- desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+ err = skcipher_walk_virt(&walk, req, false);
- kernel_fpu_begin();
while ((nbytes = walk.nbytes)) {
+ kernel_fpu_begin();
aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
nbytes & AES_BLOCK_MASK, walk.iv);
+ kernel_fpu_end();
nbytes &= AES_BLOCK_SIZE - 1;
- err = blkcipher_walk_done(desc, &walk, nbytes);
+ err = skcipher_walk_done(&walk, nbytes);
}
- kernel_fpu_end();
return err;
}
-static struct crypto_alg blk_cbc_alg = {
- .cra_name = "__cbc-aes-aesni",
- .cra_driver_name = "__driver-cbc-aes-aesni",
- .cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
- .cra_blocksize = AES_BLOCK_SIZE,
- .cra_ctxsize = sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1,
- .cra_alignmask = 0,
- .cra_type = &crypto_blkcipher_type,
- .cra_module = THIS_MODULE,
- .cra_list = LIST_HEAD_INIT(blk_cbc_alg.cra_list),
- .cra_u = {
- .blkcipher = {
- .min_keysize = AES_MIN_KEY_SIZE,
- .max_keysize = AES_MAX_KEY_SIZE,
- .setkey = aes_set_key,
- .encrypt = cbc_encrypt,
- .decrypt = cbc_decrypt,
- },
- },
-};
+static int cts_cbc_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
+ int cbc_blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2;
+ struct scatterlist *src = req->src, *dst = req->dst;
+ struct scatterlist sg_src[2], sg_dst[2];
+ struct skcipher_request subreq;
+ struct skcipher_walk walk;
+ int err;
+
+ skcipher_request_set_tfm(&subreq, tfm);
+ skcipher_request_set_callback(&subreq, skcipher_request_flags(req),
+ NULL, NULL);
-static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
- unsigned int key_len)
+ if (req->cryptlen <= AES_BLOCK_SIZE) {
+ if (req->cryptlen < AES_BLOCK_SIZE)
+ return -EINVAL;
+ cbc_blocks = 1;
+ }
+
+ if (cbc_blocks > 0) {
+ skcipher_request_set_crypt(&subreq, req->src, req->dst,
+ cbc_blocks * AES_BLOCK_SIZE,
+ req->iv);
+
+ err = cbc_encrypt(&subreq);
+ if (err)
+ return err;
+
+ if (req->cryptlen == AES_BLOCK_SIZE)
+ return 0;
+
+ dst = src = scatterwalk_ffwd(sg_src, req->src, subreq.cryptlen);
+ if (req->dst != req->src)
+ dst = scatterwalk_ffwd(sg_dst, req->dst,
+ subreq.cryptlen);
+ }
+
+ /* handle ciphertext stealing */
+ skcipher_request_set_crypt(&subreq, src, dst,
+ req->cryptlen - cbc_blocks * AES_BLOCK_SIZE,
+ req->iv);
+
+ err = skcipher_walk_virt(&walk, &subreq, false);
+ if (err)
+ return err;
+
+ kernel_fpu_begin();
+ aesni_cts_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
+ walk.nbytes, walk.iv);
+ kernel_fpu_end();
+
+ return skcipher_walk_done(&walk, 0);
+}
+
+static int cts_cbc_decrypt(struct skcipher_request *req)
{
- struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
- struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base;
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
+ int cbc_blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2;
+ struct scatterlist *src = req->src, *dst = req->dst;
+ struct scatterlist sg_src[2], sg_dst[2];
+ struct skcipher_request subreq;
+ struct skcipher_walk walk;
int err;
- crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
- crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm)
- & CRYPTO_TFM_REQ_MASK);
- err = crypto_ablkcipher_setkey(child, key, key_len);
- crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child)
- & CRYPTO_TFM_RES_MASK);
- return err;
+ skcipher_request_set_tfm(&subreq, tfm);
+ skcipher_request_set_callback(&subreq, skcipher_request_flags(req),
+ NULL, NULL);
+
+ if (req->cryptlen <= AES_BLOCK_SIZE) {
+ if (req->cryptlen < AES_BLOCK_SIZE)
+ return -EINVAL;
+ cbc_blocks = 1;
+ }
+
+ if (cbc_blocks > 0) {
+ skcipher_request_set_crypt(&subreq, req->src, req->dst,
+ cbc_blocks * AES_BLOCK_SIZE,
+ req->iv);
+
+ err = cbc_decrypt(&subreq);
+ if (err)
+ return err;
+
+ if (req->cryptlen == AES_BLOCK_SIZE)
+ return 0;
+
+ dst = src = scatterwalk_ffwd(sg_src, req->src, subreq.cryptlen);
+ if (req->dst != req->src)
+ dst = scatterwalk_ffwd(sg_dst, req->dst,
+ subreq.cryptlen);
+ }
+
+ /* handle ciphertext stealing */
+ skcipher_request_set_crypt(&subreq, src, dst,
+ req->cryptlen - cbc_blocks * AES_BLOCK_SIZE,
+ req->iv);
+
+ err = skcipher_walk_virt(&walk, &subreq, false);
+ if (err)
+ return err;
+
+ kernel_fpu_begin();
+ aesni_cts_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
+ walk.nbytes, walk.iv);
+ kernel_fpu_end();
+
+ return skcipher_walk_done(&walk, 0);
}
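
A worked example of the split that both CTS routines perform: for req->cryptlen = 100 bytes, cbc_blocks = DIV_ROUND_UP(100, 16) - 2 = 5, so the first 80 bytes go through the plain CBC path and the remaining 20 bytes (one full block plus a 4-byte tail) are handed to the aesni_cts_cbc_* assembly in a single call, as ciphertext stealing requires.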
-static int ablk_encrypt(struct ablkcipher_request *req)
+#ifdef CONFIG_X86_64
+/* This is the non-AVX version. */
+static int ctr_crypt_aesni(struct skcipher_request *req)
{
- struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
- struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
+ u8 keystream[AES_BLOCK_SIZE];
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
- if (kernel_fpu_using()) {
- struct ablkcipher_request *cryptd_req =
- ablkcipher_request_ctx(req);
- memcpy(cryptd_req, req, sizeof(*req));
- ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
- return crypto_ablkcipher_encrypt(cryptd_req);
- } else {
- struct blkcipher_desc desc;
- desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
- desc.info = req->info;
- desc.flags = 0;
- return crypto_blkcipher_crt(desc.tfm)->encrypt(
- &desc, req->dst, req->src, req->nbytes);
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ kernel_fpu_begin();
+ if (nbytes & AES_BLOCK_MASK)
+ aesni_ctr_enc(ctx, walk.dst.virt.addr,
+ walk.src.virt.addr,
+ nbytes & AES_BLOCK_MASK, walk.iv);
+ nbytes &= ~AES_BLOCK_MASK;
+
+ if (walk.nbytes == walk.total && nbytes > 0) {
+ aesni_enc(ctx, keystream, walk.iv);
+ crypto_xor_cpy(walk.dst.virt.addr + walk.nbytes - nbytes,
+ walk.src.virt.addr + walk.nbytes - nbytes,
+ keystream, nbytes);
+ crypto_inc(walk.iv, AES_BLOCK_SIZE);
+ nbytes = 0;
+ }
+ kernel_fpu_end();
+ err = skcipher_walk_done(&walk, nbytes);
}
+ return err;
}
+#endif
-static int ablk_decrypt(struct ablkcipher_request *req)
+static int xts_setkey_aesni(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keylen)
{
- struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
- struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+ struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm);
+ int err;
- if (kernel_fpu_using()) {
- struct ablkcipher_request *cryptd_req =
- ablkcipher_request_ctx(req);
- memcpy(cryptd_req, req, sizeof(*req));
- ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
- return crypto_ablkcipher_decrypt(cryptd_req);
- } else {
- struct blkcipher_desc desc;
- desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
- desc.info = req->info;
- desc.flags = 0;
- return crypto_blkcipher_crt(desc.tfm)->decrypt(
- &desc, req->dst, req->src, req->nbytes);
+ err = xts_verify_key(tfm, key, keylen);
+ if (err)
+ return err;
+
+ keylen /= 2;
+
+ /* first half of xts-key is for crypt */
+ err = aes_set_key_common(&ctx->crypt_ctx, key, keylen);
+ if (err)
+ return err;
+
+ /* second half of xts-key is for tweak */
+ return aes_set_key_common(&ctx->tweak_ctx, key + keylen, keylen);
+}
+
+typedef void (*xts_encrypt_iv_func)(const struct crypto_aes_ctx *tweak_key,
+ u8 iv[AES_BLOCK_SIZE]);
+typedef void (*xts_crypt_func)(const struct crypto_aes_ctx *key,
+ const u8 *src, u8 *dst, int len,
+ u8 tweak[AES_BLOCK_SIZE]);
+
+/* This handles cases where the source and/or destination span pages. */
+static noinline int
+xts_crypt_slowpath(struct skcipher_request *req, xts_crypt_func crypt_func)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ const struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm);
+ int tail = req->cryptlen % AES_BLOCK_SIZE;
+ struct scatterlist sg_src[2], sg_dst[2];
+ struct skcipher_request subreq;
+ struct skcipher_walk walk;
+ struct scatterlist *src, *dst;
+ int err;
+
+ /*
+ * If the message length isn't divisible by the AES block size, then
+ * separate off the last full block and the partial block. This ensures
+ * that they are processed in the same call to the assembly function,
+ * which is required for ciphertext stealing.
+ */
+ if (tail) {
+ skcipher_request_set_tfm(&subreq, tfm);
+ skcipher_request_set_callback(&subreq,
+ skcipher_request_flags(req),
+ NULL, NULL);
+ skcipher_request_set_crypt(&subreq, req->src, req->dst,
+ req->cryptlen - tail - AES_BLOCK_SIZE,
+ req->iv);
+ req = &subreq;
+ }
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while (walk.nbytes) {
+ kernel_fpu_begin();
+ (*crypt_func)(&ctx->crypt_ctx,
+ walk.src.virt.addr, walk.dst.virt.addr,
+ walk.nbytes & ~(AES_BLOCK_SIZE - 1), req->iv);
+ kernel_fpu_end();
+ err = skcipher_walk_done(&walk,
+ walk.nbytes & (AES_BLOCK_SIZE - 1));
}
+
+ if (err || !tail)
+ return err;
+
+ /* Do ciphertext stealing with the last full block and partial block. */
+
+ dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen);
+ if (req->dst != req->src)
+ dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen);
+
+ skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail,
+ req->iv);
+
+ err = skcipher_walk_virt(&walk, req, false);
+ if (err)
+ return err;
+
+ kernel_fpu_begin();
+ (*crypt_func)(&ctx->crypt_ctx, walk.src.virt.addr, walk.dst.virt.addr,
+ walk.nbytes, req->iv);
+ kernel_fpu_end();
+
+ return skcipher_walk_done(&walk, 0);
}
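
For example, with req->cryptlen = 70 the tail is 70 % 16 = 6, so the subrequest walks the first 70 - 6 - 16 = 48 bytes block by block, and the final skcipher_walk processes the remaining 22 bytes (the last full block plus the 6-byte partial block) in one call to the assembly, which performs the ciphertext stealing.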
-static void ablk_exit(struct crypto_tfm *tfm)
+/* __always_inline to avoid indirect call in fastpath */
+static __always_inline int
+xts_crypt(struct skcipher_request *req, xts_encrypt_iv_func encrypt_iv,
+ xts_crypt_func crypt_func)
{
- struct async_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ const struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm);
- cryptd_free_ablkcipher(ctx->cryptd_tfm);
+ if (unlikely(req->cryptlen < AES_BLOCK_SIZE))
+ return -EINVAL;
+
+ kernel_fpu_begin();
+ (*encrypt_iv)(&ctx->tweak_ctx, req->iv);
+
+ /*
+ * In practice, virtually all XTS plaintexts and ciphertexts are either
+ * 512 or 4096 bytes and do not use multiple scatterlist elements. To
+ * optimize the performance of these cases, the below fast-path handles
+ * single-scatterlist-element messages as efficiently as possible. The
+ * code is 64-bit specific, as it assumes no page mapping is needed.
+ */
+ if (IS_ENABLED(CONFIG_X86_64) &&
+ likely(req->src->length >= req->cryptlen &&
+ req->dst->length >= req->cryptlen)) {
+ (*crypt_func)(&ctx->crypt_ctx, sg_virt(req->src),
+ sg_virt(req->dst), req->cryptlen, req->iv);
+ kernel_fpu_end();
+ return 0;
+ }
+ kernel_fpu_end();
+ return xts_crypt_slowpath(req, crypt_func);
}
-static void ablk_init_common(struct crypto_tfm *tfm,
- struct cryptd_ablkcipher *cryptd_tfm)
+static void aesni_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
+ u8 iv[AES_BLOCK_SIZE])
{
- struct async_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+ aesni_enc(tweak_key, iv, iv);
+}
- ctx->cryptd_tfm = cryptd_tfm;
- tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) +
- crypto_ablkcipher_reqsize(&cryptd_tfm->base);
+static void aesni_xts_encrypt(const struct crypto_aes_ctx *key,
+ const u8 *src, u8 *dst, int len,
+ u8 tweak[AES_BLOCK_SIZE])
+{
+ aesni_xts_enc(key, dst, src, len, tweak);
}
-static int ablk_ecb_init(struct crypto_tfm *tfm)
+static void aesni_xts_decrypt(const struct crypto_aes_ctx *key,
+ const u8 *src, u8 *dst, int len,
+ u8 tweak[AES_BLOCK_SIZE])
{
- struct cryptd_ablkcipher *cryptd_tfm;
+ aesni_xts_dec(key, dst, src, len, tweak);
+}
- cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ecb-aes-aesni", 0, 0);
- if (IS_ERR(cryptd_tfm))
- return PTR_ERR(cryptd_tfm);
- ablk_init_common(tfm, cryptd_tfm);
- return 0;
+static int xts_encrypt_aesni(struct skcipher_request *req)
+{
+ return xts_crypt(req, aesni_xts_encrypt_iv, aesni_xts_encrypt);
}
-static struct crypto_alg ablk_ecb_alg = {
- .cra_name = "ecb(aes)",
- .cra_driver_name = "ecb-aes-aesni",
- .cra_priority = 400,
- .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+static int xts_decrypt_aesni(struct skcipher_request *req)
+{
+ return xts_crypt(req, aesni_xts_encrypt_iv, aesni_xts_decrypt);
+}
+
+static struct crypto_alg aesni_cipher_alg = {
+ .cra_name = "aes",
+ .cra_driver_name = "aes-aesni",
+ .cra_priority = 300,
+ .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
.cra_blocksize = AES_BLOCK_SIZE,
- .cra_ctxsize = sizeof(struct async_aes_ctx),
- .cra_alignmask = 0,
- .cra_type = &crypto_ablkcipher_type,
+ .cra_ctxsize = CRYPTO_AES_CTX_SIZE,
.cra_module = THIS_MODULE,
- .cra_list = LIST_HEAD_INIT(ablk_ecb_alg.cra_list),
- .cra_init = ablk_ecb_init,
- .cra_exit = ablk_exit,
- .cra_u = {
- .ablkcipher = {
- .min_keysize = AES_MIN_KEY_SIZE,
- .max_keysize = AES_MAX_KEY_SIZE,
- .setkey = ablk_set_key,
- .encrypt = ablk_encrypt,
- .decrypt = ablk_decrypt,
+ .cra_u = {
+ .cipher = {
+ .cia_min_keysize = AES_MIN_KEY_SIZE,
+ .cia_max_keysize = AES_MAX_KEY_SIZE,
+ .cia_setkey = aes_set_key,
+ .cia_encrypt = aesni_encrypt,
+ .cia_decrypt = aesni_decrypt
+ }
+ }
+};
+
+static struct skcipher_alg aesni_skciphers[] = {
+ {
+ .base = {
+ .cra_name = "ecb(aes)",
+ .cra_driver_name = "ecb-aes-aesni",
+ .cra_priority = 400,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = CRYPTO_AES_CTX_SIZE,
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .setkey = aesni_skcipher_setkey,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ }, {
+ .base = {
+ .cra_name = "cbc(aes)",
+ .cra_driver_name = "cbc-aes-aesni",
+ .cra_priority = 400,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = CRYPTO_AES_CTX_SIZE,
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = aesni_skcipher_setkey,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ }, {
+ .base = {
+ .cra_name = "cts(cbc(aes))",
+ .cra_driver_name = "cts-cbc-aes-aesni",
+ .cra_priority = 400,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = CRYPTO_AES_CTX_SIZE,
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .walksize = 2 * AES_BLOCK_SIZE,
+ .setkey = aesni_skcipher_setkey,
+ .encrypt = cts_cbc_encrypt,
+ .decrypt = cts_cbc_decrypt,
+#ifdef CONFIG_X86_64
+ }, {
+ .base = {
+ .cra_name = "ctr(aes)",
+ .cra_driver_name = "ctr-aes-aesni",
+ .cra_priority = 400,
+ .cra_blocksize = 1,
+ .cra_ctxsize = CRYPTO_AES_CTX_SIZE,
+ .cra_module = THIS_MODULE,
},
- },
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .chunksize = AES_BLOCK_SIZE,
+ .setkey = aesni_skcipher_setkey,
+ .encrypt = ctr_crypt_aesni,
+ .decrypt = ctr_crypt_aesni,
+#endif
+ }, {
+ .base = {
+ .cra_name = "xts(aes)",
+ .cra_driver_name = "xts-aes-aesni",
+ .cra_priority = 401,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = XTS_AES_CTX_SIZE,
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = 2 * AES_MIN_KEY_SIZE,
+ .max_keysize = 2 * AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .walksize = 2 * AES_BLOCK_SIZE,
+ .setkey = xts_setkey_aesni,
+ .encrypt = xts_encrypt_aesni,
+ .decrypt = xts_decrypt_aesni,
+ }
};
-static int ablk_cbc_init(struct crypto_tfm *tfm)
+#ifdef CONFIG_X86_64
+asmlinkage void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
+ u8 iv[AES_BLOCK_SIZE]);
+
+/* __always_inline to avoid indirect call */
+static __always_inline int
+ctr_crypt(struct skcipher_request *req,
+ void (*ctr64_func)(const struct crypto_aes_ctx *key,
+ const u8 *src, u8 *dst, int len,
+ const u64 le_ctr[2]))
{
- struct cryptd_ablkcipher *cryptd_tfm;
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ const struct crypto_aes_ctx *key = aes_ctx(crypto_skcipher_ctx(tfm));
+ unsigned int nbytes, p1_nbytes, nblocks;
+ struct skcipher_walk walk;
+ u64 le_ctr[2];
+ u64 ctr64;
+ int err;
- cryptd_tfm = cryptd_alloc_ablkcipher("__driver-cbc-aes-aesni", 0, 0);
- if (IS_ERR(cryptd_tfm))
- return PTR_ERR(cryptd_tfm);
- ablk_init_common(tfm, cryptd_tfm);
- return 0;
+ ctr64 = le_ctr[0] = get_unaligned_be64(&req->iv[8]);
+ le_ctr[1] = get_unaligned_be64(&req->iv[0]);
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) != 0) {
+ if (nbytes < walk.total) {
+ /* Not the end yet, so keep the length block-aligned. */
+ nbytes = round_down(nbytes, AES_BLOCK_SIZE);
+ nblocks = nbytes / AES_BLOCK_SIZE;
+ } else {
+ /* It's the end, so include any final partial block. */
+ nblocks = DIV_ROUND_UP(nbytes, AES_BLOCK_SIZE);
+ }
+ ctr64 += nblocks;
+
+ kernel_fpu_begin();
+ if (likely(ctr64 >= nblocks)) {
+ /* The low 64 bits of the counter won't overflow. */
+ (*ctr64_func)(key, walk.src.virt.addr,
+ walk.dst.virt.addr, nbytes, le_ctr);
+ } else {
+ /*
+ * The low 64 bits of the counter will overflow. The
+ * assembly doesn't handle this case, so split the
+ * operation into two at the point where the overflow
+ * will occur. After the first part, add the carry bit.
+ */
+ p1_nbytes = min(nbytes, (nblocks - ctr64) * AES_BLOCK_SIZE);
+ (*ctr64_func)(key, walk.src.virt.addr,
+ walk.dst.virt.addr, p1_nbytes, le_ctr);
+ le_ctr[0] = 0;
+ le_ctr[1]++;
+ (*ctr64_func)(key, walk.src.virt.addr + p1_nbytes,
+ walk.dst.virt.addr + p1_nbytes,
+ nbytes - p1_nbytes, le_ctr);
+ }
+ kernel_fpu_end();
+ le_ctr[0] = ctr64;
+
+ err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+ }
+
+ put_unaligned_be64(ctr64, &req->iv[8]);
+ put_unaligned_be64(le_ctr[1], &req->iv[0]);
+
+ return err;
}
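
As a concrete example of the overflow split: if the IV's low qword is 0xfffffffffffffffe and nblocks = 5, then ctr64 wraps to 3 after the addition, so the first call processes (5 - 3) * 16 = 32 bytes up to the wrap point; le_ctr[0] is then reset to 0, the carry is added into le_ctr[1], and the second call processes the remaining three blocks.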
-static struct crypto_alg ablk_cbc_alg = {
- .cra_name = "cbc(aes)",
- .cra_driver_name = "cbc-aes-aesni",
- .cra_priority = 400,
- .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
- .cra_blocksize = AES_BLOCK_SIZE,
- .cra_ctxsize = sizeof(struct async_aes_ctx),
- .cra_alignmask = 0,
- .cra_type = &crypto_ablkcipher_type,
- .cra_module = THIS_MODULE,
- .cra_list = LIST_HEAD_INIT(ablk_cbc_alg.cra_list),
- .cra_init = ablk_cbc_init,
- .cra_exit = ablk_exit,
- .cra_u = {
- .ablkcipher = {
- .min_keysize = AES_MIN_KEY_SIZE,
- .max_keysize = AES_MAX_KEY_SIZE,
- .ivsize = AES_BLOCK_SIZE,
- .setkey = ablk_set_key,
- .encrypt = ablk_encrypt,
- .decrypt = ablk_decrypt,
- },
- },
+/* __always_inline to avoid indirect call */
+static __always_inline int
+xctr_crypt(struct skcipher_request *req,
+ void (*xctr_func)(const struct crypto_aes_ctx *key,
+ const u8 *src, u8 *dst, int len,
+ const u8 iv[AES_BLOCK_SIZE], u64 ctr))
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ const struct crypto_aes_ctx *key = aes_ctx(crypto_skcipher_ctx(tfm));
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ u64 ctr = 1;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+ while ((nbytes = walk.nbytes) != 0) {
+ if (nbytes < walk.total)
+ nbytes = round_down(nbytes, AES_BLOCK_SIZE);
+
+ kernel_fpu_begin();
+ (*xctr_func)(key, walk.src.virt.addr, walk.dst.virt.addr,
+ nbytes, req->iv, ctr);
+ kernel_fpu_end();
+
+ ctr += DIV_ROUND_UP(nbytes, AES_BLOCK_SIZE);
+ err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+ }
+ return err;
+}
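
XCTR (used by HCTR2) avoids CTR's big-endian carry handling: the per-block tweak is the IV XORed with a little-endian block counter starting at 1, which is why no overflow split is needed above. A hedged sketch of one keystream block, again using a hypothetical aes_encrypt_block():

#include <stdint.h>
#include <string.h>

void aes_encrypt_block(const void *key, unsigned char out[16],
		       const unsigned char in[16]);

static void xctr_keystream_block(const void *key, unsigned char ks[16],
				 const unsigned char iv[16], uint64_t ctr)
{
	unsigned char blk[16];

	memcpy(blk, iv, 16);
	for (int i = 0; i < 8; i++)
		blk[i] ^= (unsigned char)(ctr >> (8 * i)); /* LE counter XOR */
	aes_encrypt_block(key, ks, blk);
}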
+
+#define DEFINE_AVX_SKCIPHER_ALGS(suffix, driver_name_suffix, priority) \
+ \
+asmlinkage void \
+aes_xts_encrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src, \
+ u8 *dst, int len, u8 tweak[AES_BLOCK_SIZE]); \
+asmlinkage void \
+aes_xts_decrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src, \
+ u8 *dst, int len, u8 tweak[AES_BLOCK_SIZE]); \
+ \
+static int xts_encrypt_##suffix(struct skcipher_request *req) \
+{ \
+ return xts_crypt(req, aes_xts_encrypt_iv, aes_xts_encrypt_##suffix); \
+} \
+ \
+static int xts_decrypt_##suffix(struct skcipher_request *req) \
+{ \
+ return xts_crypt(req, aes_xts_encrypt_iv, aes_xts_decrypt_##suffix); \
+} \
+ \
+asmlinkage void \
+aes_ctr64_crypt_##suffix(const struct crypto_aes_ctx *key, \
+ const u8 *src, u8 *dst, int len, const u64 le_ctr[2]);\
+ \
+static int ctr_crypt_##suffix(struct skcipher_request *req) \
+{ \
+ return ctr_crypt(req, aes_ctr64_crypt_##suffix); \
+} \
+ \
+asmlinkage void \
+aes_xctr_crypt_##suffix(const struct crypto_aes_ctx *key, \
+ const u8 *src, u8 *dst, int len, \
+ const u8 iv[AES_BLOCK_SIZE], u64 ctr); \
+ \
+static int xctr_crypt_##suffix(struct skcipher_request *req) \
+{ \
+ return xctr_crypt(req, aes_xctr_crypt_##suffix); \
+} \
+ \
+static struct skcipher_alg skcipher_algs_##suffix[] = {{ \
+ .base.cra_name = "xts(aes)", \
+ .base.cra_driver_name = "xts-aes-" driver_name_suffix, \
+ .base.cra_priority = priority, \
+ .base.cra_blocksize = AES_BLOCK_SIZE, \
+ .base.cra_ctxsize = XTS_AES_CTX_SIZE, \
+ .base.cra_module = THIS_MODULE, \
+ .min_keysize = 2 * AES_MIN_KEY_SIZE, \
+ .max_keysize = 2 * AES_MAX_KEY_SIZE, \
+ .ivsize = AES_BLOCK_SIZE, \
+ .walksize = 2 * AES_BLOCK_SIZE, \
+ .setkey = xts_setkey_aesni, \
+ .encrypt = xts_encrypt_##suffix, \
+ .decrypt = xts_decrypt_##suffix, \
+}, { \
+ .base.cra_name = "ctr(aes)", \
+ .base.cra_driver_name = "ctr-aes-" driver_name_suffix, \
+ .base.cra_priority = priority, \
+ .base.cra_blocksize = 1, \
+ .base.cra_ctxsize = CRYPTO_AES_CTX_SIZE, \
+ .base.cra_module = THIS_MODULE, \
+ .min_keysize = AES_MIN_KEY_SIZE, \
+ .max_keysize = AES_MAX_KEY_SIZE, \
+ .ivsize = AES_BLOCK_SIZE, \
+ .chunksize = AES_BLOCK_SIZE, \
+ .setkey = aesni_skcipher_setkey, \
+ .encrypt = ctr_crypt_##suffix, \
+ .decrypt = ctr_crypt_##suffix, \
+}, { \
+ .base.cra_name = "xctr(aes)", \
+ .base.cra_driver_name = "xctr-aes-" driver_name_suffix, \
+ .base.cra_priority = priority, \
+ .base.cra_blocksize = 1, \
+ .base.cra_ctxsize = CRYPTO_AES_CTX_SIZE, \
+ .base.cra_module = THIS_MODULE, \
+ .min_keysize = AES_MIN_KEY_SIZE, \
+ .max_keysize = AES_MAX_KEY_SIZE, \
+ .ivsize = AES_BLOCK_SIZE, \
+ .chunksize = AES_BLOCK_SIZE, \
+ .setkey = aesni_skcipher_setkey, \
+ .encrypt = xctr_crypt_##suffix, \
+ .decrypt = xctr_crypt_##suffix, \
+}}
+
+DEFINE_AVX_SKCIPHER_ALGS(aesni_avx, "aesni-avx", 500);
+DEFINE_AVX_SKCIPHER_ALGS(vaes_avx2, "vaes-avx2", 600);
+DEFINE_AVX_SKCIPHER_ALGS(vaes_avx512, "vaes-avx512", 800);
+
+/* The common part of the x86_64 AES-GCM key struct */
+struct aes_gcm_key {
+ /* Expanded AES key and the AES key length in bytes */
+ struct crypto_aes_ctx aes_key;
+
+ /* RFC4106 nonce (used only by the rfc4106 algorithms) */
+ u32 rfc4106_nonce;
+};
+
+/* Key struct used by the AES-NI implementations of AES-GCM */
+struct aes_gcm_key_aesni {
+ /*
+ * Common part of the key. The assembly code requires 16-byte alignment
+ * for the round keys; we get this by them being located at the start of
+ * the struct and the whole struct being 16-byte aligned.
+ */
+ struct aes_gcm_key base;
+
+ /*
+ * Powers of the hash key H^8 through H^1. These are 128-bit values.
+ * They all have an extra factor of x^-1 and are byte-reversed. 16-byte
+ * alignment is required by the assembly code.
+ */
+ u64 h_powers[8][2] __aligned(16);
+
+ /*
+ * h_powers_xored[i] contains the two 64-bit halves of h_powers[i] XOR'd
+ * together. It's used for Karatsuba multiplication. 16-byte alignment
+ * is required by the assembly code.
+ */
+ u64 h_powers_xored[8] __aligned(16);
+
+ /*
+ * H^1 times x^64 (and also the usual extra factor of x^-1). 16-byte
+ * alignment is required by the assembly code.
+ */
+ u64 h_times_x64[2] __aligned(16);
+};
+#define AES_GCM_KEY_AESNI(key) \
+ container_of((key), struct aes_gcm_key_aesni, base)
+#define AES_GCM_KEY_AESNI_SIZE \
+ (sizeof(struct aes_gcm_key_aesni) + (15 & ~(CRYPTO_MINALIGN - 1)))
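
The h_powers_xored values exist to feed Karatsuba multiplication: a 128x128-bit carryless product naively takes four 64x64 PCLMULQDQ operations, but precomputing the XOR of each hash power's halves cuts that to three. A hedged sketch, with clmul64() standing in for PCLMULQDQ (not a real kernel helper):

#include <stdint.h>

typedef struct { uint64_t lo, hi; } u128;

u128 clmul64(uint64_t a, uint64_t b);	/* placeholder for PCLMULQDQ */

/*
 * (aH*x^64 + aL)(bH*x^64 + bL) = hi*x^128 + mid*x^64 + lo, where
 * mid = (aH^aL)(bH^bL) ^ hi ^ lo. The key struct precomputes bH ^ bL
 * for each hash power, saving one multiply per block.
 */
static void karatsuba_clmul(uint64_t a_hi, uint64_t a_lo,
			    uint64_t b_hi, uint64_t b_lo,
			    u128 *hi, u128 *mid, u128 *lo)
{
	*lo  = clmul64(a_lo, b_lo);
	*hi  = clmul64(a_hi, b_hi);
	*mid = clmul64(a_hi ^ a_lo, b_hi ^ b_lo);
	mid->lo ^= hi->lo ^ lo->lo;
	mid->hi ^= hi->hi ^ lo->hi;
}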
+
+/* Key struct used by the VAES + AVX2 implementation of AES-GCM */
+struct aes_gcm_key_vaes_avx2 {
+ /*
+ * Common part of the key. The assembly code prefers 16-byte alignment
+ * for the round keys; we get this by them being located at the start of
+ * the struct and the whole struct being 32-byte aligned.
+ */
+ struct aes_gcm_key base;
+
+ /*
+ * Powers of the hash key H^8 through H^1. These are 128-bit values.
+ * They all have an extra factor of x^-1 and are byte-reversed.
+ * The assembly code prefers 32-byte alignment for this.
+ */
+ u64 h_powers[8][2] __aligned(32);
+
+ /*
+ * Each entry in this array contains the two halves of an entry of
+ * h_powers XOR'd together, in the following order:
+ * H^8,H^6,H^7,H^5,H^4,H^2,H^3,H^1 i.e. indices 0,2,1,3,4,6,5,7.
+ * This is used for Karatsuba multiplication.
+ */
+ u64 h_powers_xored[8];
+};
+
+#define AES_GCM_KEY_VAES_AVX2(key) \
+ container_of((key), struct aes_gcm_key_vaes_avx2, base)
+#define AES_GCM_KEY_VAES_AVX2_SIZE \
+ (sizeof(struct aes_gcm_key_vaes_avx2) + (31 & ~(CRYPTO_MINALIGN - 1)))
+
+/* Key struct used by the VAES + AVX512 implementation of AES-GCM */
+struct aes_gcm_key_vaes_avx512 {
+ /*
+ * Common part of the key. The assembly code prefers 16-byte alignment
+ * for the round keys; we get this by them being located at the start of
+ * the struct and the whole struct being 64-byte aligned.
+ */
+ struct aes_gcm_key base;
+
+ /*
+ * Powers of the hash key H^16 through H^1. These are 128-bit values.
+ * They all have an extra factor of x^-1 and are byte-reversed. This
+ * array is aligned to a 64-byte boundary to make it naturally aligned
+ * for 512-bit loads, which can improve performance. (The assembly code
+ * doesn't *need* the alignment; this is just an optimization.)
+ */
+ u64 h_powers[16][2] __aligned(64);
+
+ /* Three padding blocks required by the assembly code */
+ u64 padding[3][2];
};
+#define AES_GCM_KEY_VAES_AVX512(key) \
+ container_of((key), struct aes_gcm_key_vaes_avx512, base)
+#define AES_GCM_KEY_VAES_AVX512_SIZE \
+ (sizeof(struct aes_gcm_key_vaes_avx512) + (63 & ~(CRYPTO_MINALIGN - 1)))
+
+/*
+ * These flags are passed to the AES-GCM helper functions to select the
+ * variant of AES-GCM (RFC4106 or not), whether it's encryption or
+ * decryption, and which assembly functions should be called. Assembly
+ * functions are selected using flags instead of function pointers to avoid
+ * indirect calls (which are very expensive on x86) regardless of inlining.
+ */
+#define FLAG_RFC4106 BIT(0)
+#define FLAG_ENC BIT(1)
+#define FLAG_AVX BIT(2)
+#define FLAG_VAES_AVX2 BIT(3)
+#define FLAG_VAES_AVX512 BIT(4)
+
+static inline struct aes_gcm_key *
+aes_gcm_key_get(struct crypto_aead *tfm, int flags)
+{
+ if (flags & FLAG_VAES_AVX512)
+ return PTR_ALIGN(crypto_aead_ctx(tfm), 64);
+ else if (flags & FLAG_VAES_AVX2)
+ return PTR_ALIGN(crypto_aead_ctx(tfm), 32);
+ else
+ return PTR_ALIGN(crypto_aead_ctx(tfm), 16);
+}
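
Because every caller of these helpers passes a compile-time-constant flags value and the helpers are __always_inline, the compiler folds each if/else chain into a single direct call, which is the point of dispatching on flags rather than function pointers. A minimal sketch of the pattern, assuming the kernel's __always_inline from <linux/compiler_types.h>:

/* Minimal sketch of flag-folded dispatch; illustrative, not kernel code. */
#define FLAG_FAST 0x1

void impl_fast(void);
void impl_generic(void);

static __always_inline void do_op(int flags)
{
	if (flags & FLAG_FAST)
		impl_fast();	/* flags is constant, so the branch folds away */
	else
		impl_generic();
}

static void caller_fast(void)
{
	do_op(FLAG_FAST);	/* compiles to a direct call to impl_fast() */
}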
+
+asmlinkage void
+aes_gcm_precompute_aesni(struct aes_gcm_key_aesni *key);
+asmlinkage void
+aes_gcm_precompute_aesni_avx(struct aes_gcm_key_aesni *key);
+asmlinkage void
+aes_gcm_precompute_vaes_avx2(struct aes_gcm_key_vaes_avx2 *key);
+asmlinkage void
+aes_gcm_precompute_vaes_avx512(struct aes_gcm_key_vaes_avx512 *key);
+
+static void aes_gcm_precompute(struct aes_gcm_key *key, int flags)
+{
+ if (flags & FLAG_VAES_AVX512)
+ aes_gcm_precompute_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key));
+ else if (flags & FLAG_VAES_AVX2)
+ aes_gcm_precompute_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key));
+ else if (flags & FLAG_AVX)
+ aes_gcm_precompute_aesni_avx(AES_GCM_KEY_AESNI(key));
+ else
+ aes_gcm_precompute_aesni(AES_GCM_KEY_AESNI(key));
+}
+
+asmlinkage void
+aes_gcm_aad_update_aesni(const struct aes_gcm_key_aesni *key,
+ u8 ghash_acc[16], const u8 *aad, int aadlen);
+asmlinkage void
+aes_gcm_aad_update_aesni_avx(const struct aes_gcm_key_aesni *key,
+ u8 ghash_acc[16], const u8 *aad, int aadlen);
+asmlinkage void
+aes_gcm_aad_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key,
+ u8 ghash_acc[16], const u8 *aad, int aadlen);
+asmlinkage void
+aes_gcm_aad_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
+ u8 ghash_acc[16], const u8 *aad, int aadlen);
+
+static void aes_gcm_aad_update(const struct aes_gcm_key *key, u8 ghash_acc[16],
+ const u8 *aad, int aadlen, int flags)
+{
+ if (flags & FLAG_VAES_AVX512)
+ aes_gcm_aad_update_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key),
+ ghash_acc, aad, aadlen);
+ else if (flags & FLAG_VAES_AVX2)
+ aes_gcm_aad_update_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key),
+ ghash_acc, aad, aadlen);
+ else if (flags & FLAG_AVX)
+ aes_gcm_aad_update_aesni_avx(AES_GCM_KEY_AESNI(key), ghash_acc,
+ aad, aadlen);
+ else
+ aes_gcm_aad_update_aesni(AES_GCM_KEY_AESNI(key), ghash_acc,
+ aad, aadlen);
+}
-#ifdef HAS_CTR
-static int ablk_ctr_init(struct crypto_tfm *tfm)
+asmlinkage void
+aes_gcm_enc_update_aesni(const struct aes_gcm_key_aesni *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ const u8 *src, u8 *dst, int datalen);
+asmlinkage void
+aes_gcm_enc_update_aesni_avx(const struct aes_gcm_key_aesni *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ const u8 *src, u8 *dst, int datalen);
+asmlinkage void
+aes_gcm_enc_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ const u8 *src, u8 *dst, int datalen);
+asmlinkage void
+aes_gcm_enc_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ const u8 *src, u8 *dst, int datalen);
+
+asmlinkage void
+aes_gcm_dec_update_aesni(const struct aes_gcm_key_aesni *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ const u8 *src, u8 *dst, int datalen);
+asmlinkage void
+aes_gcm_dec_update_aesni_avx(const struct aes_gcm_key_aesni *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ const u8 *src, u8 *dst, int datalen);
+asmlinkage void
+aes_gcm_dec_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ const u8 *src, u8 *dst, int datalen);
+asmlinkage void
+aes_gcm_dec_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ const u8 *src, u8 *dst, int datalen);
+
+/* __always_inline to optimize out the branches based on @flags */
+static __always_inline void
+aes_gcm_update(const struct aes_gcm_key *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ const u8 *src, u8 *dst, int datalen, int flags)
{
- struct cryptd_ablkcipher *cryptd_tfm;
+ if (flags & FLAG_ENC) {
+ if (flags & FLAG_VAES_AVX512)
+ aes_gcm_enc_update_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key),
+ le_ctr, ghash_acc,
+ src, dst, datalen);
+ else if (flags & FLAG_VAES_AVX2)
+ aes_gcm_enc_update_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key),
+ le_ctr, ghash_acc,
+ src, dst, datalen);
+ else if (flags & FLAG_AVX)
+ aes_gcm_enc_update_aesni_avx(AES_GCM_KEY_AESNI(key),
+ le_ctr, ghash_acc,
+ src, dst, datalen);
+ else
+ aes_gcm_enc_update_aesni(AES_GCM_KEY_AESNI(key), le_ctr,
+ ghash_acc, src, dst, datalen);
+ } else {
+ if (flags & FLAG_VAES_AVX512)
+ aes_gcm_dec_update_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key),
+ le_ctr, ghash_acc,
+ src, dst, datalen);
+ else if (flags & FLAG_VAES_AVX2)
+ aes_gcm_dec_update_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key),
+ le_ctr, ghash_acc,
+ src, dst, datalen);
+ else if (flags & FLAG_AVX)
+ aes_gcm_dec_update_aesni_avx(AES_GCM_KEY_AESNI(key),
+ le_ctr, ghash_acc,
+ src, dst, datalen);
+ else
+ aes_gcm_dec_update_aesni(AES_GCM_KEY_AESNI(key),
+ le_ctr, ghash_acc,
+ src, dst, datalen);
+ }
+}
+
+asmlinkage void
+aes_gcm_enc_final_aesni(const struct aes_gcm_key_aesni *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ u64 total_aadlen, u64 total_datalen);
+asmlinkage void
+aes_gcm_enc_final_aesni_avx(const struct aes_gcm_key_aesni *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ u64 total_aadlen, u64 total_datalen);
+asmlinkage void
+aes_gcm_enc_final_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ u64 total_aadlen, u64 total_datalen);
+asmlinkage void
+aes_gcm_enc_final_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ u64 total_aadlen, u64 total_datalen);
+
+/* __always_inline to optimize out the branches based on @flags */
+static __always_inline void
+aes_gcm_enc_final(const struct aes_gcm_key *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ u64 total_aadlen, u64 total_datalen, int flags)
+{
+ if (flags & FLAG_VAES_AVX512)
+ aes_gcm_enc_final_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key),
+ le_ctr, ghash_acc,
+ total_aadlen, total_datalen);
+ else if (flags & FLAG_VAES_AVX2)
+ aes_gcm_enc_final_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key),
+ le_ctr, ghash_acc,
+ total_aadlen, total_datalen);
+ else if (flags & FLAG_AVX)
+ aes_gcm_enc_final_aesni_avx(AES_GCM_KEY_AESNI(key),
+ le_ctr, ghash_acc,
+ total_aadlen, total_datalen);
+ else
+ aes_gcm_enc_final_aesni(AES_GCM_KEY_AESNI(key),
+ le_ctr, ghash_acc,
+ total_aadlen, total_datalen);
+}
+
+asmlinkage bool __must_check
+aes_gcm_dec_final_aesni(const struct aes_gcm_key_aesni *key,
+ const u32 le_ctr[4], const u8 ghash_acc[16],
+ u64 total_aadlen, u64 total_datalen,
+ const u8 tag[16], int taglen);
+asmlinkage bool __must_check
+aes_gcm_dec_final_aesni_avx(const struct aes_gcm_key_aesni *key,
+ const u32 le_ctr[4], const u8 ghash_acc[16],
+ u64 total_aadlen, u64 total_datalen,
+ const u8 tag[16], int taglen);
+asmlinkage bool __must_check
+aes_gcm_dec_final_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key,
+ const u32 le_ctr[4], const u8 ghash_acc[16],
+ u64 total_aadlen, u64 total_datalen,
+ const u8 tag[16], int taglen);
+asmlinkage bool __must_check
+aes_gcm_dec_final_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
+ const u32 le_ctr[4], const u8 ghash_acc[16],
+ u64 total_aadlen, u64 total_datalen,
+ const u8 tag[16], int taglen);
+
+/* __always_inline to optimize out the branches based on @flags */
+static __always_inline bool __must_check
+aes_gcm_dec_final(const struct aes_gcm_key *key, const u32 le_ctr[4],
+ u8 ghash_acc[16], u64 total_aadlen, u64 total_datalen,
+ u8 tag[16], int taglen, int flags)
+{
+ if (flags & FLAG_VAES_AVX512)
+ return aes_gcm_dec_final_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key),
+ le_ctr, ghash_acc,
+ total_aadlen, total_datalen,
+ tag, taglen);
+ else if (flags & FLAG_VAES_AVX2)
+ return aes_gcm_dec_final_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key),
+ le_ctr, ghash_acc,
+ total_aadlen, total_datalen,
+ tag, taglen);
+ else if (flags & FLAG_AVX)
+ return aes_gcm_dec_final_aesni_avx(AES_GCM_KEY_AESNI(key),
+ le_ctr, ghash_acc,
+ total_aadlen, total_datalen,
+ tag, taglen);
+ else
+ return aes_gcm_dec_final_aesni(AES_GCM_KEY_AESNI(key),
+ le_ctr, ghash_acc,
+ total_aadlen, total_datalen,
+ tag, taglen);
+}
+
+/*
+ * Set the Integrity Check Value (i.e. the authentication tag) length, which
+ * for RFC4106 may be 8, 12, or 16 bytes.
+ */
+static int common_rfc4106_set_authsize(struct crypto_aead *aead,
+ unsigned int authsize)
+{
+ switch (authsize) {
+ case 8:
+ case 12:
+ case 16:
+ break;
+ default:
+ return -EINVAL;
+ }
- cryptd_tfm = cryptd_alloc_ablkcipher("fpu(ctr(__driver-aes-aesni))",
- 0, 0);
- if (IS_ERR(cryptd_tfm))
- return PTR_ERR(cryptd_tfm);
- ablk_init_common(tfm, cryptd_tfm);
return 0;
}
-static struct crypto_alg ablk_ctr_alg = {
- .cra_name = "ctr(aes)",
- .cra_driver_name = "ctr-aes-aesni",
- .cra_priority = 400,
- .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
- .cra_blocksize = 1,
- .cra_ctxsize = sizeof(struct async_aes_ctx),
- .cra_alignmask = 0,
- .cra_type = &crypto_ablkcipher_type,
- .cra_module = THIS_MODULE,
- .cra_list = LIST_HEAD_INIT(ablk_ctr_alg.cra_list),
- .cra_init = ablk_ctr_init,
- .cra_exit = ablk_exit,
- .cra_u = {
- .ablkcipher = {
- .min_keysize = AES_MIN_KEY_SIZE,
- .max_keysize = AES_MAX_KEY_SIZE,
- .ivsize = AES_BLOCK_SIZE,
- .setkey = ablk_set_key,
- .encrypt = ablk_encrypt,
- .decrypt = ablk_decrypt,
- .geniv = "chainiv",
- },
- },
-};
-#endif
+static int generic_gcmaes_set_authsize(struct crypto_aead *tfm,
+ unsigned int authsize)
+{
+ switch (authsize) {
+ case 4:
+ case 8:
+ case 12:
+ case 13:
+ case 14:
+ case 15:
+ case 16:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
-#ifdef HAS_LRW
-static int ablk_lrw_init(struct crypto_tfm *tfm)
+/*
+ * This is the setkey function for the x86_64 implementations of AES-GCM. It
+ * saves the RFC4106 nonce if applicable, expands the AES key, and precomputes
+ * powers of the hash key.
+ *
+ * To comply with the crypto_aead API, this has to be usable in no-SIMD context.
+ * For that reason, this function includes a portable C implementation of the
+ * needed logic. However, the portable C implementation is very slow, taking
+ * about the same time as encrypting 37 KB of data. To be ready for users that
+ * may set a key even somewhat frequently, we therefore also include a SIMD
+ * assembly implementation, expanding the AES key using AES-NI and precomputing
+ * the hash key powers using PCLMULQDQ or VPCLMULQDQ.
+ */
+static int gcm_setkey(struct crypto_aead *tfm, const u8 *raw_key,
+ unsigned int keylen, int flags)
{
- struct cryptd_ablkcipher *cryptd_tfm;
+ struct aes_gcm_key *key = aes_gcm_key_get(tfm, flags);
+ int err;
+
+ if (flags & FLAG_RFC4106) {
+ if (keylen < 4)
+ return -EINVAL;
+ keylen -= 4;
+ key->rfc4106_nonce = get_unaligned_be32(raw_key + keylen);
+ }
- cryptd_tfm = cryptd_alloc_ablkcipher("fpu(lrw(__driver-aes-aesni))",
- 0, 0);
- if (IS_ERR(cryptd_tfm))
- return PTR_ERR(cryptd_tfm);
- ablk_init_common(tfm, cryptd_tfm);
+ /* The assembly code assumes the following offsets. */
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, base.aes_key.key_enc) != 0);
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, base.aes_key.key_length) != 480);
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_powers) != 496);
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_powers_xored) != 624);
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_times_x64) != 688);
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx2, base.aes_key.key_enc) != 0);
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx2, base.aes_key.key_length) != 480);
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx2, h_powers) != 512);
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx2, h_powers_xored) != 640);
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx512, base.aes_key.key_enc) != 0);
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx512, base.aes_key.key_length) != 480);
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx512, h_powers) != 512);
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx512, padding) != 768);
+
+ if (likely(crypto_simd_usable())) {
+ err = aes_check_keylen(keylen);
+ if (err)
+ return err;
+ kernel_fpu_begin();
+ aesni_set_key(&key->aes_key, raw_key, keylen);
+ aes_gcm_precompute(key, flags);
+ kernel_fpu_end();
+ } else {
+ static const u8 x_to_the_minus1[16] __aligned(__alignof__(be128)) = {
+ [0] = 0xc2, [15] = 1
+ };
+ static const u8 x_to_the_63[16] __aligned(__alignof__(be128)) = {
+ [7] = 1,
+ };
+ be128 h1 = {};
+ be128 h;
+ int i;
+
+ err = aes_expandkey(&key->aes_key, raw_key, keylen);
+ if (err)
+ return err;
+
+ /* Encrypt the all-zeroes block to get the hash key H^1 */
+ aes_encrypt(&key->aes_key, (u8 *)&h1, (u8 *)&h1);
+
+ /* Compute H^1 * x^-1 */
+ h = h1;
+ gf128mul_lle(&h, (const be128 *)x_to_the_minus1);
+
+ /* Compute the needed key powers */
+ if (flags & FLAG_VAES_AVX512) {
+ struct aes_gcm_key_vaes_avx512 *k =
+ AES_GCM_KEY_VAES_AVX512(key);
+
+ for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) {
+ k->h_powers[i][0] = be64_to_cpu(h.b);
+ k->h_powers[i][1] = be64_to_cpu(h.a);
+ gf128mul_lle(&h, &h1);
+ }
+ memset(k->padding, 0, sizeof(k->padding));
+ } else if (flags & FLAG_VAES_AVX2) {
+ struct aes_gcm_key_vaes_avx2 *k =
+ AES_GCM_KEY_VAES_AVX2(key);
+ static const u8 indices[8] = { 0, 2, 1, 3, 4, 6, 5, 7 };
+
+ for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) {
+ k->h_powers[i][0] = be64_to_cpu(h.b);
+ k->h_powers[i][1] = be64_to_cpu(h.a);
+ gf128mul_lle(&h, &h1);
+ }
+ for (i = 0; i < ARRAY_SIZE(k->h_powers_xored); i++) {
+ int j = indices[i];
+
+ k->h_powers_xored[i] = k->h_powers[j][0] ^
+ k->h_powers[j][1];
+ }
+ } else {
+ struct aes_gcm_key_aesni *k = AES_GCM_KEY_AESNI(key);
+
+ for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) {
+ k->h_powers[i][0] = be64_to_cpu(h.b);
+ k->h_powers[i][1] = be64_to_cpu(h.a);
+ k->h_powers_xored[i] = k->h_powers[i][0] ^
+ k->h_powers[i][1];
+ gf128mul_lle(&h, &h1);
+ }
+ gf128mul_lle(&h1, (const be128 *)x_to_the_63);
+ k->h_times_x64[0] = be64_to_cpu(h1.b);
+ k->h_times_x64[1] = be64_to_cpu(h1.a);
+ }
+ }
return 0;
}
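+
+/*
+ * Example key layout for the RFC4106 variant (illustrative only): the caller
+ * passes the AES key with the 4-byte nonce (salt) appended, e.g. a 20-byte
+ * buffer for AES-128. gcm_setkey() above strips the trailing 4 bytes into
+ * ->rfc4106_nonce and expands the remaining 16, 24, or 32 bytes as the AES
+ * key.
+ */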
-static struct crypto_alg ablk_lrw_alg = {
- .cra_name = "lrw(aes)",
- .cra_driver_name = "lrw-aes-aesni",
- .cra_priority = 400,
- .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
- .cra_blocksize = AES_BLOCK_SIZE,
- .cra_ctxsize = sizeof(struct async_aes_ctx),
- .cra_alignmask = 0,
- .cra_type = &crypto_ablkcipher_type,
- .cra_module = THIS_MODULE,
- .cra_list = LIST_HEAD_INIT(ablk_lrw_alg.cra_list),
- .cra_init = ablk_lrw_init,
- .cra_exit = ablk_exit,
- .cra_u = {
- .ablkcipher = {
- .min_keysize = AES_MIN_KEY_SIZE + AES_BLOCK_SIZE,
- .max_keysize = AES_MAX_KEY_SIZE + AES_BLOCK_SIZE,
- .ivsize = AES_BLOCK_SIZE,
- .setkey = ablk_set_key,
- .encrypt = ablk_encrypt,
- .decrypt = ablk_decrypt,
- },
- },
-};
-#endif
+/*
+ * Initialize @ghash_acc, then pass all @assoclen bytes of associated data
+ * (a.k.a. additional authenticated data) from @sg_src through the GHASH update
+ * assembly function. kernel_fpu_begin() must have already been called.
+ */
+static void gcm_process_assoc(const struct aes_gcm_key *key, u8 ghash_acc[16],
+ struct scatterlist *sg_src, unsigned int assoclen,
+ int flags)
+{
+ struct scatter_walk walk;
+ /*
+ * The assembly function requires that the length of any non-last
+ * segment of associated data be a multiple of 16 bytes, so this
+ * function does the buffering needed to achieve that.
+ */
+ unsigned int pos = 0;
+ u8 buf[16];
+
+ memset(ghash_acc, 0, 16);
+ scatterwalk_start(&walk, sg_src);
+
+ while (assoclen) {
+ unsigned int orig_len_this_step = scatterwalk_next(
+ &walk, assoclen);
+ unsigned int len_this_step = orig_len_this_step;
+ unsigned int len;
+ const u8 *src = walk.addr;
+
+ if (unlikely(pos)) {
+ len = min(len_this_step, 16 - pos);
+ memcpy(&buf[pos], src, len);
+ pos += len;
+ src += len;
+ len_this_step -= len;
+ if (pos < 16)
+ goto next;
+ aes_gcm_aad_update(key, ghash_acc, buf, 16, flags);
+ pos = 0;
+ }
+ len = len_this_step;
+ if (unlikely(assoclen != orig_len_this_step)) /* Not the last segment yet? */
+ len = round_down(len, 16);
+ aes_gcm_aad_update(key, ghash_acc, src, len, flags);
+ src += len;
+ len_this_step -= len;
+ if (unlikely(len_this_step)) {
+ memcpy(buf, src, len_this_step);
+ pos = len_this_step;
+ }
+next:
+ scatterwalk_done_src(&walk, orig_len_this_step);
+ if (need_resched()) {
+ kernel_fpu_end();
+ kernel_fpu_begin();
+ }
+ assoclen -= orig_len_this_step;
+ }
+ if (unlikely(pos))
+ aes_gcm_aad_update(key, ghash_acc, buf, pos, flags);
+}
+
-#ifdef HAS_PCBC
-static int ablk_pcbc_init(struct crypto_tfm *tfm)
+/* __always_inline to optimize out the branches based on @flags */
+static __always_inline int
+gcm_crypt(struct aead_request *req, int flags)
{
- struct cryptd_ablkcipher *cryptd_tfm;
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ const struct aes_gcm_key *key = aes_gcm_key_get(tfm, flags);
+ unsigned int assoclen = req->assoclen;
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ u8 ghash_acc[16]; /* GHASH accumulator */
+ u32 le_ctr[4]; /* Counter in little-endian format */
+ int taglen;
+ int err;
+
+ /* Initialize the counter and determine the associated data length. */
+ le_ctr[0] = 2;
+ if (flags & FLAG_RFC4106) {
+ if (unlikely(assoclen != 16 && assoclen != 20))
+ return -EINVAL;
+ assoclen -= 8;
+ le_ctr[1] = get_unaligned_be32(req->iv + 4);
+ le_ctr[2] = get_unaligned_be32(req->iv + 0);
+ le_ctr[3] = key->rfc4106_nonce; /* already byte-swapped */
+ } else {
+ le_ctr[1] = get_unaligned_be32(req->iv + 8);
+ le_ctr[2] = get_unaligned_be32(req->iv + 4);
+ le_ctr[3] = get_unaligned_be32(req->iv + 0);
+ }
+
+ /* Begin walking through the plaintext or ciphertext. */
+ if (flags & FLAG_ENC)
+ err = skcipher_walk_aead_encrypt(&walk, req, false);
+ else
+ err = skcipher_walk_aead_decrypt(&walk, req, false);
+ if (err)
+ return err;
+
+ /*
+ * Since the AES-GCM assembly code requires that at least three assembly
+ * functions be called to process any message (this is needed to support
+ * incremental updates cleanly), to reduce overhead we try to do all
+ * three calls in the same kernel FPU section if possible. We close the
+ * section and start a new one if there are multiple data segments or if
+ * rescheduling is needed while processing the associated data.
+ */
+ kernel_fpu_begin();
+
+ /* Pass the associated data through GHASH. */
+ gcm_process_assoc(key, ghash_acc, req->src, assoclen, flags);
+
+ /* En/decrypt the data and pass the ciphertext through GHASH. */
+ while (unlikely((nbytes = walk.nbytes) < walk.total)) {
+ /*
+ * Non-last segment. In this case, the assembly function
+ * requires that the length be a multiple of 16 (AES_BLOCK_SIZE)
+ * bytes. The needed buffering of up to 16 bytes is handled by
+ * the skcipher_walk. Here we just need to round down to a
+ * multiple of 16.
+ */
+ nbytes = round_down(nbytes, AES_BLOCK_SIZE);
+ aes_gcm_update(key, le_ctr, ghash_acc, walk.src.virt.addr,
+ walk.dst.virt.addr, nbytes, flags);
+ le_ctr[0] += nbytes / AES_BLOCK_SIZE;
+ kernel_fpu_end();
+ err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+ if (err)
+ return err;
+ kernel_fpu_begin();
+ }
+ /* Last segment: process all remaining data. */
+ aes_gcm_update(key, le_ctr, ghash_acc, walk.src.virt.addr,
+ walk.dst.virt.addr, nbytes, flags);
+ /*
+ * The low word of the counter isn't used by the finalize, so there's no
+ * need to increment it here.
+ */
+
+ /* Finalize */
+ taglen = crypto_aead_authsize(tfm);
+ if (flags & FLAG_ENC) {
+ /* Finish computing the auth tag. */
+ aes_gcm_enc_final(key, le_ctr, ghash_acc, assoclen,
+ req->cryptlen, flags);
+
+ /* Store the computed auth tag in the dst scatterlist. */
+ scatterwalk_map_and_copy(ghash_acc, req->dst, req->assoclen +
+ req->cryptlen, taglen, 1);
+ } else {
+ unsigned int datalen = req->cryptlen - taglen;
+ u8 tag[16];
+
+ /* Get the transmitted auth tag from the src scatterlist. */
+ scatterwalk_map_and_copy(tag, req->src, req->assoclen + datalen,
+ taglen, 0);
+ /*
+ * Finish computing the auth tag and compare it to the
+ * transmitted one. The assembly function does the actual tag
+ * comparison. Here, just check the boolean result.
+ */
+ if (!aes_gcm_dec_final(key, le_ctr, ghash_acc, assoclen,
+ datalen, tag, taglen, flags))
+ err = -EBADMSG;
+ }
+ kernel_fpu_end();
+ if (nbytes)
+ skcipher_walk_done(&walk, 0);
+ return err;
+}
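+
+/*
+ * Illustrative use of these algorithms through the generic AEAD API (a
+ * hypothetical caller sketch; error handling omitted, and the scatterlists
+ * must cover the associated data followed by the en/decrypted data):
+ *
+ *	struct crypto_aead *tfm = crypto_alloc_aead("gcm(aes)", 0, 0);
+ *	struct aead_request *req;
+ *	DECLARE_CRYPTO_WAIT(wait);
+ *
+ *	crypto_aead_setkey(tfm, key, 16);
+ *	crypto_aead_setauthsize(tfm, 16);
+ *	req = aead_request_alloc(tfm, GFP_KERNEL);
+ *	aead_request_set_callback(req, 0, crypto_req_done, &wait);
+ *	aead_request_set_ad(req, assoclen);
+ *	aead_request_set_crypt(req, sg, sg, plaintext_len, iv);
+ *	err = crypto_wait_req(crypto_aead_encrypt(req), &wait);
+ *
+ * The crypto core selects the highest-priority registered "gcm(aes)"
+ * implementation, so a request like this ends up in gcm_crypt() above via one
+ * of the gcm_encrypt_*()/gcm_decrypt_*() entry points defined below.
+ */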
+
+#define DEFINE_GCM_ALGS(suffix, flags, generic_driver_name, rfc_driver_name, \
+ ctxsize, priority) \
+ \
+static int gcm_setkey_##suffix(struct crypto_aead *tfm, const u8 *raw_key, \
+ unsigned int keylen) \
+{ \
+ return gcm_setkey(tfm, raw_key, keylen, (flags)); \
+} \
+ \
+static int gcm_encrypt_##suffix(struct aead_request *req) \
+{ \
+ return gcm_crypt(req, (flags) | FLAG_ENC); \
+} \
+ \
+static int gcm_decrypt_##suffix(struct aead_request *req) \
+{ \
+ return gcm_crypt(req, (flags)); \
+} \
+ \
+static int rfc4106_setkey_##suffix(struct crypto_aead *tfm, const u8 *raw_key, \
+ unsigned int keylen) \
+{ \
+ return gcm_setkey(tfm, raw_key, keylen, (flags) | FLAG_RFC4106); \
+} \
+ \
+static int rfc4106_encrypt_##suffix(struct aead_request *req) \
+{ \
+ return gcm_crypt(req, (flags) | FLAG_RFC4106 | FLAG_ENC); \
+} \
+ \
+static int rfc4106_decrypt_##suffix(struct aead_request *req) \
+{ \
+ return gcm_crypt(req, (flags) | FLAG_RFC4106); \
+} \
+ \
+static struct aead_alg aes_gcm_algs_##suffix[] = { { \
+ .setkey = gcm_setkey_##suffix, \
+ .setauthsize = generic_gcmaes_set_authsize, \
+ .encrypt = gcm_encrypt_##suffix, \
+ .decrypt = gcm_decrypt_##suffix, \
+ .ivsize = GCM_AES_IV_SIZE, \
+ .chunksize = AES_BLOCK_SIZE, \
+ .maxauthsize = 16, \
+ .base = { \
+ .cra_name = "gcm(aes)", \
+ .cra_driver_name = generic_driver_name, \
+ .cra_priority = (priority), \
+ .cra_blocksize = 1, \
+ .cra_ctxsize = (ctxsize), \
+ .cra_module = THIS_MODULE, \
+ }, \
+}, { \
+ .setkey = rfc4106_setkey_##suffix, \
+ .setauthsize = common_rfc4106_set_authsize, \
+ .encrypt = rfc4106_encrypt_##suffix, \
+ .decrypt = rfc4106_decrypt_##suffix, \
+ .ivsize = GCM_RFC4106_IV_SIZE, \
+ .chunksize = AES_BLOCK_SIZE, \
+ .maxauthsize = 16, \
+ .base = { \
+ .cra_name = "rfc4106(gcm(aes))", \
+ .cra_driver_name = rfc_driver_name, \
+ .cra_priority = (priority), \
+ .cra_blocksize = 1, \
+ .cra_ctxsize = (ctxsize), \
+ .cra_module = THIS_MODULE, \
+ }, \
+} }
+
+/* aes_gcm_algs_aesni */
+DEFINE_GCM_ALGS(aesni, /* no flags */ 0,
+ "generic-gcm-aesni", "rfc4106-gcm-aesni",
+ AES_GCM_KEY_AESNI_SIZE, 400);
+
+/* aes_gcm_algs_aesni_avx */
+DEFINE_GCM_ALGS(aesni_avx, FLAG_AVX,
+ "generic-gcm-aesni-avx", "rfc4106-gcm-aesni-avx",
+ AES_GCM_KEY_AESNI_SIZE, 500);
+
+/* aes_gcm_algs_vaes_avx2 */
+DEFINE_GCM_ALGS(vaes_avx2, FLAG_VAES_AVX2,
+ "generic-gcm-vaes-avx2", "rfc4106-gcm-vaes-avx2",
+ AES_GCM_KEY_VAES_AVX2_SIZE, 600);
+
+/* aes_gcm_algs_vaes_avx512 */
+DEFINE_GCM_ALGS(vaes_avx512, FLAG_VAES_AVX512,
+ "generic-gcm-vaes-avx512", "rfc4106-gcm-vaes-avx512",
+ AES_GCM_KEY_VAES_AVX512_SIZE, 800);
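+
+/*
+ * The cra_priority values above (400, 500, 600, 800) decide which
+ * implementation the crypto API picks when several provide "gcm(aes)": the
+ * highest-priority one that successfully registered wins, so e.g. the
+ * VAES/AVX512 version is preferred over the plain AES-NI one when available.
+ */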
+
+static int __init register_avx_algs(void)
+{
+ int err;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX))
+ return 0;
+ err = crypto_register_skciphers(skcipher_algs_aesni_avx,
+ ARRAY_SIZE(skcipher_algs_aesni_avx));
+ if (err)
+ return err;
+ err = crypto_register_aeads(aes_gcm_algs_aesni_avx,
+ ARRAY_SIZE(aes_gcm_algs_aesni_avx));
+ if (err)
+ return err;
+ /*
+ * Note: not all the algorithms registered below actually require
+ * VPCLMULQDQ. But in practice every CPU with VAES also has VPCLMULQDQ.
+ * Similarly, the assembler support was added at about the same time.
+ * For simplicity, just always check for VAES and VPCLMULQDQ together.
+ */
+ if (!boot_cpu_has(X86_FEATURE_AVX2) ||
+ !boot_cpu_has(X86_FEATURE_VAES) ||
+ !boot_cpu_has(X86_FEATURE_VPCLMULQDQ) ||
+ !boot_cpu_has(X86_FEATURE_PCLMULQDQ) ||
+ !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
+ return 0;
+ err = crypto_register_skciphers(skcipher_algs_vaes_avx2,
+ ARRAY_SIZE(skcipher_algs_vaes_avx2));
+ if (err)
+ return err;
+ err = crypto_register_aeads(aes_gcm_algs_vaes_avx2,
+ ARRAY_SIZE(aes_gcm_algs_vaes_avx2));
+ if (err)
+ return err;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX512BW) ||
+ !boot_cpu_has(X86_FEATURE_AVX512VL) ||
+ !boot_cpu_has(X86_FEATURE_BMI2) ||
+ !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
+ XFEATURE_MASK_AVX512, NULL))
+ return 0;
+
+ if (boot_cpu_has(X86_FEATURE_PREFER_YMM)) {
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(skcipher_algs_vaes_avx512); i++)
+ skcipher_algs_vaes_avx512[i].base.cra_priority = 1;
+ for (i = 0; i < ARRAY_SIZE(aes_gcm_algs_vaes_avx512); i++)
+ aes_gcm_algs_vaes_avx512[i].base.cra_priority = 1;
+ }
+
+ err = crypto_register_skciphers(skcipher_algs_vaes_avx512,
+ ARRAY_SIZE(skcipher_algs_vaes_avx512));
+ if (err)
+ return err;
+ err = crypto_register_aeads(aes_gcm_algs_vaes_avx512,
+ ARRAY_SIZE(aes_gcm_algs_vaes_avx512));
+ if (err)
+ return err;
- cryptd_tfm = cryptd_alloc_ablkcipher("fpu(pcbc(__driver-aes-aesni))",
- 0, 0);
- if (IS_ERR(cryptd_tfm))
- return PTR_ERR(cryptd_tfm);
- ablk_init_common(tfm, cryptd_tfm);
return 0;
}
-static struct crypto_alg ablk_pcbc_alg = {
- .cra_name = "pcbc(aes)",
- .cra_driver_name = "pcbc-aes-aesni",
- .cra_priority = 400,
- .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
- .cra_blocksize = AES_BLOCK_SIZE,
- .cra_ctxsize = sizeof(struct async_aes_ctx),
- .cra_alignmask = 0,
- .cra_type = &crypto_ablkcipher_type,
- .cra_module = THIS_MODULE,
- .cra_list = LIST_HEAD_INIT(ablk_pcbc_alg.cra_list),
- .cra_init = ablk_pcbc_init,
- .cra_exit = ablk_exit,
- .cra_u = {
- .ablkcipher = {
- .min_keysize = AES_MIN_KEY_SIZE,
- .max_keysize = AES_MAX_KEY_SIZE,
- .ivsize = AES_BLOCK_SIZE,
- .setkey = ablk_set_key,
- .encrypt = ablk_encrypt,
- .decrypt = ablk_decrypt,
- },
- },
-};
-#endif
+#define unregister_skciphers(A) \
+ if (refcount_read(&(A)[0].base.cra_refcnt) != 0) \
+ crypto_unregister_skciphers((A), ARRAY_SIZE(A))
+#define unregister_aeads(A) \
+ if (refcount_read(&(A)[0].base.cra_refcnt) != 0) \
+ crypto_unregister_aeads((A), ARRAY_SIZE(A))
-#ifdef HAS_XTS
-static int ablk_xts_init(struct crypto_tfm *tfm)
+static void unregister_avx_algs(void)
{
- struct cryptd_ablkcipher *cryptd_tfm;
+ unregister_skciphers(skcipher_algs_aesni_avx);
+ unregister_aeads(aes_gcm_algs_aesni_avx);
+ unregister_skciphers(skcipher_algs_vaes_avx2);
+ unregister_skciphers(skcipher_algs_vaes_avx512);
+ unregister_aeads(aes_gcm_algs_vaes_avx2);
+ unregister_aeads(aes_gcm_algs_vaes_avx512);
+}
+#else /* CONFIG_X86_64 */
+static struct aead_alg aes_gcm_algs_aesni[0];
- cryptd_tfm = cryptd_alloc_ablkcipher("fpu(xts(__driver-aes-aesni))",
- 0, 0);
- if (IS_ERR(cryptd_tfm))
- return PTR_ERR(cryptd_tfm);
- ablk_init_common(tfm, cryptd_tfm);
+static int __init register_avx_algs(void)
+{
return 0;
}
-static struct crypto_alg ablk_xts_alg = {
- .cra_name = "xts(aes)",
- .cra_driver_name = "xts-aes-aesni",
- .cra_priority = 400,
- .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
- .cra_blocksize = AES_BLOCK_SIZE,
- .cra_ctxsize = sizeof(struct async_aes_ctx),
- .cra_alignmask = 0,
- .cra_type = &crypto_ablkcipher_type,
- .cra_module = THIS_MODULE,
- .cra_list = LIST_HEAD_INIT(ablk_xts_alg.cra_list),
- .cra_init = ablk_xts_init,
- .cra_exit = ablk_exit,
- .cra_u = {
- .ablkcipher = {
- .min_keysize = 2 * AES_MIN_KEY_SIZE,
- .max_keysize = 2 * AES_MAX_KEY_SIZE,
- .ivsize = AES_BLOCK_SIZE,
- .setkey = ablk_set_key,
- .encrypt = ablk_encrypt,
- .decrypt = ablk_decrypt,
- },
- },
+static void unregister_avx_algs(void)
+{
+}
+#endif /* !CONFIG_X86_64 */
+
+static const struct x86_cpu_id aesni_cpu_id[] = {
+ X86_MATCH_FEATURE(X86_FEATURE_AES, NULL),
+ {}
};
-#endif
+MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id);
static int __init aesni_init(void)
{
int err;
- if (!cpu_has_aes) {
- printk(KERN_ERR "Intel AES-NI instructions are not detected.\n");
+ if (!x86_match_cpu(aesni_cpu_id))
return -ENODEV;
- }
- if ((err = crypto_register_alg(&aesni_alg)))
- goto aes_err;
- if ((err = crypto_register_alg(&__aesni_alg)))
- goto __aes_err;
- if ((err = crypto_register_alg(&blk_ecb_alg)))
- goto blk_ecb_err;
- if ((err = crypto_register_alg(&blk_cbc_alg)))
- goto blk_cbc_err;
- if ((err = crypto_register_alg(&ablk_ecb_alg)))
- goto ablk_ecb_err;
- if ((err = crypto_register_alg(&ablk_cbc_alg)))
- goto ablk_cbc_err;
-#ifdef HAS_CTR
- if ((err = crypto_register_alg(&ablk_ctr_alg)))
- goto ablk_ctr_err;
-#endif
-#ifdef HAS_LRW
- if ((err = crypto_register_alg(&ablk_lrw_alg)))
- goto ablk_lrw_err;
-#endif
-#ifdef HAS_PCBC
- if ((err = crypto_register_alg(&ablk_pcbc_alg)))
- goto ablk_pcbc_err;
-#endif
-#ifdef HAS_XTS
- if ((err = crypto_register_alg(&ablk_xts_alg)))
- goto ablk_xts_err;
-#endif
- return err;
+ err = crypto_register_alg(&aesni_cipher_alg);
+ if (err)
+ return err;
-#ifdef HAS_XTS
-ablk_xts_err:
-#endif
-#ifdef HAS_PCBC
- crypto_unregister_alg(&ablk_pcbc_alg);
-ablk_pcbc_err:
-#endif
-#ifdef HAS_LRW
- crypto_unregister_alg(&ablk_lrw_alg);
-ablk_lrw_err:
-#endif
-#ifdef HAS_CTR
- crypto_unregister_alg(&ablk_ctr_alg);
-ablk_ctr_err:
-#endif
- crypto_unregister_alg(&ablk_cbc_alg);
-ablk_cbc_err:
- crypto_unregister_alg(&ablk_ecb_alg);
-ablk_ecb_err:
- crypto_unregister_alg(&blk_cbc_alg);
-blk_cbc_err:
- crypto_unregister_alg(&blk_ecb_alg);
-blk_ecb_err:
- crypto_unregister_alg(&__aesni_alg);
-__aes_err:
- crypto_unregister_alg(&aesni_alg);
-aes_err:
+ err = crypto_register_skciphers(aesni_skciphers,
+ ARRAY_SIZE(aesni_skciphers));
+ if (err)
+ goto unregister_cipher;
+
+ err = crypto_register_aeads(aes_gcm_algs_aesni,
+ ARRAY_SIZE(aes_gcm_algs_aesni));
+ if (err)
+ goto unregister_skciphers;
+
+ err = register_avx_algs();
+ if (err)
+ goto unregister_avx;
+
+ return 0;
+
+unregister_avx:
+ unregister_avx_algs();
+ crypto_unregister_aeads(aes_gcm_algs_aesni,
+ ARRAY_SIZE(aes_gcm_algs_aesni));
+unregister_skciphers:
+ crypto_unregister_skciphers(aesni_skciphers,
+ ARRAY_SIZE(aesni_skciphers));
+unregister_cipher:
+ crypto_unregister_alg(&aesni_cipher_alg);
return err;
}
static void __exit aesni_exit(void)
{
-#ifdef HAS_XTS
- crypto_unregister_alg(&ablk_xts_alg);
-#endif
-#ifdef HAS_PCBC
- crypto_unregister_alg(&ablk_pcbc_alg);
-#endif
-#ifdef HAS_LRW
- crypto_unregister_alg(&ablk_lrw_alg);
-#endif
-#ifdef HAS_CTR
- crypto_unregister_alg(&ablk_ctr_alg);
-#endif
- crypto_unregister_alg(&ablk_cbc_alg);
- crypto_unregister_alg(&ablk_ecb_alg);
- crypto_unregister_alg(&blk_cbc_alg);
- crypto_unregister_alg(&blk_ecb_alg);
- crypto_unregister_alg(&__aesni_alg);
- crypto_unregister_alg(&aesni_alg);
+ crypto_unregister_aeads(aes_gcm_algs_aesni,
+ ARRAY_SIZE(aes_gcm_algs_aesni));
+ crypto_unregister_skciphers(aesni_skciphers,
+ ARRAY_SIZE(aesni_skciphers));
+ crypto_unregister_alg(&aesni_cipher_alg);
+ unregister_avx_algs();
}
module_init(aesni_init);
module_exit(aesni_exit);
-MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, Intel AES-NI instructions optimized");
+MODULE_DESCRIPTION("AES cipher and modes, optimized with AES-NI or VAES instructions");
MODULE_LICENSE("GPL");
-MODULE_ALIAS("aes");
+MODULE_ALIAS_CRYPTO("aes");
diff --git a/arch/x86/crypto/aria-aesni-avx-asm_64.S b/arch/x86/crypto/aria-aesni-avx-asm_64.S
new file mode 100644
index 000000000000..932fb17308e7
--- /dev/null
+++ b/arch/x86/crypto/aria-aesni-avx-asm_64.S
@@ -0,0 +1,1352 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * ARIA Cipher 16-way parallel algorithm (AVX)
+ *
+ * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
+ *
+ */
+
+#include <linux/linkage.h>
+#include <linux/cfi_types.h>
+#include <asm/asm-offsets.h>
+#include <asm/frame.h>
+
+/* register macros */
+#define CTX %rdi
+
+
+#define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \
+ ( (((a0) & 1) << 0) | \
+ (((a1) & 1) << 1) | \
+ (((a2) & 1) << 2) | \
+ (((a3) & 1) << 3) | \
+ (((a4) & 1) << 4) | \
+ (((a5) & 1) << 5) | \
+ (((a6) & 1) << 6) | \
+ (((a7) & 1) << 7) )
+
+#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \
+ ( ((l7) << (0 * 8)) | \
+ ((l6) << (1 * 8)) | \
+ ((l5) << (2 * 8)) | \
+ ((l4) << (3 * 8)) | \
+ ((l3) << (4 * 8)) | \
+ ((l2) << (5 * 8)) | \
+ ((l1) << (6 * 8)) | \
+ ((l0) << (7 * 8)) )
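+
+/*
+ * Example: BV8(1, 1, 0, 0, 0, 1, 1, 0) packs its arguments LSB-first into
+ * 0x63, the constant of the AES forward affine transform (tf_aff_const
+ * below). BM8X8() then packs eight such rows into the 64-bit bit-matrix
+ * operand expected by the GF2P8AFFINE* instructions.
+ */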
+
+#define inc_le128(x, minus_one, tmp) \
+ vpcmpeqq minus_one, x, tmp; \
+ vpsubq minus_one, x, x; \
+ vpslldq $8, tmp, tmp; \
+ vpsubq tmp, x, x;
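+
+/*
+ * inc_le128() adds 1 to a 128-bit little-endian value in an XMM register:
+ * vpsubq subtracts @minus_one (low qword -1, high qword 0), which increments
+ * the low 64 bits, while the vpcmpeqq/vpslldq pair detects that the low qword
+ * was all-ones and subtracts -1 from (i.e. carries into) the high 64 bits.
+ */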
+
+#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
+ vpand x, mask4bit, tmp0; \
+ vpandn x, mask4bit, x; \
+ vpsrld $4, x, x; \
+ \
+ vpshufb tmp0, lo_t, tmp0; \
+ vpshufb x, hi_t, x; \
+ vpxor tmp0, x, x;
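+
+/*
+ * filter_8bit() evaluates a GF(2)-affine byte mapping as two 4-bit table
+ * lookups: the low and high nibbles of each byte index @lo_t and @hi_t via
+ * vpshufb, and the two partial results are XORed. This is what implements the
+ * combined inverse-affine/S2 and X2/forward-affine transforms further below.
+ */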
+
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x1, x0, x0; \
+ \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x2; \
+ \
+ vpunpckhqdq t1, x0, x1; \
+ vpunpcklqdq t1, x0, x0; \
+ \
+ vpunpckhqdq x2, t2, x3; \
+ vpunpcklqdq x2, t2, x2;
+
+#define byteslice_16x16b(a0, b0, c0, d0, \
+ a1, b1, c1, d1, \
+ a2, b2, c2, d2, \
+ a3, b3, c3, d3, \
+ st0, st1) \
+ vmovdqu d2, st0; \
+ vmovdqu d3, st1; \
+ transpose_4x4(a0, a1, a2, a3, d2, d3); \
+ transpose_4x4(b0, b1, b2, b3, d2, d3); \
+ vmovdqu st0, d2; \
+ vmovdqu st1, d3; \
+ \
+ vmovdqu a0, st0; \
+ vmovdqu a1, st1; \
+ transpose_4x4(c0, c1, c2, c3, a0, a1); \
+ transpose_4x4(d0, d1, d2, d3, a0, a1); \
+ \
+ vmovdqu .Lshufb_16x16b(%rip), a0; \
+ vmovdqu st1, a1; \
+ vpshufb a0, a2, a2; \
+ vpshufb a0, a3, a3; \
+ vpshufb a0, b0, b0; \
+ vpshufb a0, b1, b1; \
+ vpshufb a0, b2, b2; \
+ vpshufb a0, b3, b3; \
+ vpshufb a0, a1, a1; \
+ vpshufb a0, c0, c0; \
+ vpshufb a0, c1, c1; \
+ vpshufb a0, c2, c2; \
+ vpshufb a0, c3, c3; \
+ vpshufb a0, d0, d0; \
+ vpshufb a0, d1, d1; \
+ vpshufb a0, d2, d2; \
+ vpshufb a0, d3, d3; \
+ vmovdqu d3, st1; \
+ vmovdqu st0, d3; \
+ vpshufb a0, d3, a0; \
+ vmovdqu d2, st0; \
+ \
+ transpose_4x4(a0, b0, c0, d0, d2, d3); \
+ transpose_4x4(a1, b1, c1, d1, d2, d3); \
+ vmovdqu st0, d2; \
+ vmovdqu st1, d3; \
+ \
+ vmovdqu b0, st0; \
+ vmovdqu b1, st1; \
+ transpose_4x4(a2, b2, c2, d2, b0, b1); \
+ transpose_4x4(a3, b3, c3, d3, b0, b1); \
+ vmovdqu st0, b0; \
+ vmovdqu st1, b1; \
+ /* does not adjust output bytes inside vectors */
+
+#define debyteslice_16x16b(a0, b0, c0, d0, \
+ a1, b1, c1, d1, \
+ a2, b2, c2, d2, \
+ a3, b3, c3, d3, \
+ st0, st1) \
+ vmovdqu d2, st0; \
+ vmovdqu d3, st1; \
+ transpose_4x4(a0, a1, a2, a3, d2, d3); \
+ transpose_4x4(b0, b1, b2, b3, d2, d3); \
+ vmovdqu st0, d2; \
+ vmovdqu st1, d3; \
+ \
+ vmovdqu a0, st0; \
+ vmovdqu a1, st1; \
+ transpose_4x4(c0, c1, c2, c3, a0, a1); \
+ transpose_4x4(d0, d1, d2, d3, a0, a1); \
+ \
+ vmovdqu .Lshufb_16x16b(%rip), a0; \
+ vmovdqu st1, a1; \
+ vpshufb a0, a2, a2; \
+ vpshufb a0, a3, a3; \
+ vpshufb a0, b0, b0; \
+ vpshufb a0, b1, b1; \
+ vpshufb a0, b2, b2; \
+ vpshufb a0, b3, b3; \
+ vpshufb a0, a1, a1; \
+ vpshufb a0, c0, c0; \
+ vpshufb a0, c1, c1; \
+ vpshufb a0, c2, c2; \
+ vpshufb a0, c3, c3; \
+ vpshufb a0, d0, d0; \
+ vpshufb a0, d1, d1; \
+ vpshufb a0, d2, d2; \
+ vpshufb a0, d3, d3; \
+ vmovdqu d3, st1; \
+ vmovdqu st0, d3; \
+ vpshufb a0, d3, a0; \
+ vmovdqu d2, st0; \
+ \
+ transpose_4x4(c0, d0, a0, b0, d2, d3); \
+ transpose_4x4(c1, d1, a1, b1, d2, d3); \
+ vmovdqu st0, d2; \
+ vmovdqu st1, d3; \
+ \
+ vmovdqu b0, st0; \
+ vmovdqu b1, st1; \
+ transpose_4x4(c2, d2, a2, b2, b0, b1); \
+ transpose_4x4(c3, d3, a3, b3, b0, b1); \
+ vmovdqu st0, b0; \
+ vmovdqu st1, b1; \
+ /* does not adjust output bytes inside vectors */
+
+/* load blocks to registers and apply pre-whitening */
+#define inpack16_pre(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ rio) \
+ vmovdqu (0 * 16)(rio), x0; \
+ vmovdqu (1 * 16)(rio), x1; \
+ vmovdqu (2 * 16)(rio), x2; \
+ vmovdqu (3 * 16)(rio), x3; \
+ vmovdqu (4 * 16)(rio), x4; \
+ vmovdqu (5 * 16)(rio), x5; \
+ vmovdqu (6 * 16)(rio), x6; \
+ vmovdqu (7 * 16)(rio), x7; \
+ vmovdqu (8 * 16)(rio), y0; \
+ vmovdqu (9 * 16)(rio), y1; \
+ vmovdqu (10 * 16)(rio), y2; \
+ vmovdqu (11 * 16)(rio), y3; \
+ vmovdqu (12 * 16)(rio), y4; \
+ vmovdqu (13 * 16)(rio), y5; \
+ vmovdqu (14 * 16)(rio), y6; \
+ vmovdqu (15 * 16)(rio), y7;
+
+/* byteslice pre-whitened blocks and store to temporary memory */
+#define inpack16_post(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_ab, mem_cd) \
+ byteslice_16x16b(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ (mem_ab), (mem_cd)); \
+ \
+ vmovdqu x0, 0 * 16(mem_ab); \
+ vmovdqu x1, 1 * 16(mem_ab); \
+ vmovdqu x2, 2 * 16(mem_ab); \
+ vmovdqu x3, 3 * 16(mem_ab); \
+ vmovdqu x4, 4 * 16(mem_ab); \
+ vmovdqu x5, 5 * 16(mem_ab); \
+ vmovdqu x6, 6 * 16(mem_ab); \
+ vmovdqu x7, 7 * 16(mem_ab); \
+ vmovdqu y0, 0 * 16(mem_cd); \
+ vmovdqu y1, 1 * 16(mem_cd); \
+ vmovdqu y2, 2 * 16(mem_cd); \
+ vmovdqu y3, 3 * 16(mem_cd); \
+ vmovdqu y4, 4 * 16(mem_cd); \
+ vmovdqu y5, 5 * 16(mem_cd); \
+ vmovdqu y6, 6 * 16(mem_cd); \
+ vmovdqu y7, 7 * 16(mem_cd);
+
+#define write_output(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem) \
+ vmovdqu x0, 0 * 16(mem); \
+ vmovdqu x1, 1 * 16(mem); \
+ vmovdqu x2, 2 * 16(mem); \
+ vmovdqu x3, 3 * 16(mem); \
+ vmovdqu x4, 4 * 16(mem); \
+ vmovdqu x5, 5 * 16(mem); \
+ vmovdqu x6, 6 * 16(mem); \
+ vmovdqu x7, 7 * 16(mem); \
+ vmovdqu y0, 8 * 16(mem); \
+ vmovdqu y1, 9 * 16(mem); \
+ vmovdqu y2, 10 * 16(mem); \
+ vmovdqu y3, 11 * 16(mem); \
+ vmovdqu y4, 12 * 16(mem); \
+ vmovdqu y5, 13 * 16(mem); \
+ vmovdqu y6, 14 * 16(mem); \
+ vmovdqu y7, 15 * 16(mem); \
+
+#define aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, idx) \
+ vmovdqu x0, ((idx + 0) * 16)(mem_tmp); \
+ vmovdqu x1, ((idx + 1) * 16)(mem_tmp); \
+ vmovdqu x2, ((idx + 2) * 16)(mem_tmp); \
+ vmovdqu x3, ((idx + 3) * 16)(mem_tmp); \
+ vmovdqu x4, ((idx + 4) * 16)(mem_tmp); \
+ vmovdqu x5, ((idx + 5) * 16)(mem_tmp); \
+ vmovdqu x6, ((idx + 6) * 16)(mem_tmp); \
+ vmovdqu x7, ((idx + 7) * 16)(mem_tmp);
+
+#define aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, idx) \
+ vmovdqu ((idx + 0) * 16)(mem_tmp), x0; \
+ vmovdqu ((idx + 1) * 16)(mem_tmp), x1; \
+ vmovdqu ((idx + 2) * 16)(mem_tmp), x2; \
+ vmovdqu ((idx + 3) * 16)(mem_tmp), x3; \
+ vmovdqu ((idx + 4) * 16)(mem_tmp), x4; \
+ vmovdqu ((idx + 5) * 16)(mem_tmp), x5; \
+ vmovdqu ((idx + 6) * 16)(mem_tmp), x6; \
+ vmovdqu ((idx + 7) * 16)(mem_tmp), x7;
+
+#define aria_ark_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ t0, t1, t2, rk, \
+ idx, round) \
+ /* AddRoundKey */ \
+ vbroadcastss ((round * 16) + idx + 0)(rk), t0; \
+ vpsrld $24, t0, t2; \
+ vpshufb t1, t2, t2; \
+ vpxor t2, x0, x0; \
+ vpsrld $16, t0, t2; \
+ vpshufb t1, t2, t2; \
+ vpxor t2, x1, x1; \
+ vpsrld $8, t0, t2; \
+ vpshufb t1, t2, t2; \
+ vpxor t2, x2, x2; \
+ vpshufb t1, t0, t2; \
+ vpxor t2, x3, x3; \
+ vbroadcastss ((round * 16) + idx + 4)(rk), t0; \
+ vpsrld $24, t0, t2; \
+ vpshufb t1, t2, t2; \
+ vpxor t2, x4, x4; \
+ vpsrld $16, t0, t2; \
+ vpshufb t1, t2, t2; \
+ vpxor t2, x5, x5; \
+ vpsrld $8, t0, t2; \
+ vpshufb t1, t2, t2; \
+ vpxor t2, x6, x6; \
+ vpshufb t1, t0, t2; \
+ vpxor t2, x7, x7;
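+
+/*
+ * aria_ark_8way() is AddRoundKey on byte-sliced state: vbroadcastss loads one
+ * 32-bit round-key word, vpsrld isolates each of its bytes, and vpshufb with
+ * the all-zero index register @t1 broadcasts that byte across all 16 lanes
+ * before XORing it into the corresponding state register.
+ */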
+
+#define aria_sbox_8way_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ t0, t1, t2, t3, \
+ t4, t5, t6, t7) \
+ vmovdqa .Ltf_s2_bitmatrix(%rip), t0; \
+ vmovdqa .Ltf_inv_bitmatrix(%rip), t1; \
+ vmovdqa .Ltf_id_bitmatrix(%rip), t2; \
+ vmovdqa .Ltf_aff_bitmatrix(%rip), t3; \
+ vmovdqa .Ltf_x2_bitmatrix(%rip), t4; \
+ vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
+ vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
+ vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
+ vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \
+ vgf2p8affineinvqb $0, t2, x2, x2; \
+ vgf2p8affineinvqb $0, t2, x6, x6; \
+ vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \
+ vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \
+ vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \
+ vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \
+ vgf2p8affineinvqb $0, t2, x3, x3; \
+ vgf2p8affineinvqb $0, t2, x7, x7
+
+#define aria_sbox_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ t0, t1, t2, t3, \
+ t4, t5, t6, t7) \
+ vmovdqa .Linv_shift_row(%rip), t0; \
+ vmovdqa .Lshift_row(%rip), t1; \
+ vbroadcastss .L0f0f0f0f(%rip), t6; \
+ vmovdqa .Ltf_lo__inv_aff__and__s2(%rip), t2; \
+ vmovdqa .Ltf_hi__inv_aff__and__s2(%rip), t3; \
+ vmovdqa .Ltf_lo__x2__and__fwd_aff(%rip), t4; \
+ vmovdqa .Ltf_hi__x2__and__fwd_aff(%rip), t5; \
+ \
+ vaesenclast t7, x0, x0; \
+ vaesenclast t7, x4, x4; \
+ vaesenclast t7, x1, x1; \
+ vaesenclast t7, x5, x5; \
+ vaesdeclast t7, x2, x2; \
+ vaesdeclast t7, x6, x6; \
+ \
+ /* AES inverse shift rows */ \
+ vpshufb t0, x0, x0; \
+ vpshufb t0, x4, x4; \
+ vpshufb t0, x1, x1; \
+ vpshufb t0, x5, x5; \
+ vpshufb t1, x3, x3; \
+ vpshufb t1, x7, x7; \
+ vpshufb t1, x2, x2; \
+ vpshufb t1, x6, x6; \
+ \
+ /* affine transformation for S2 */ \
+ filter_8bit(x1, t2, t3, t6, t0); \
+ /* affine transformation for S2 */ \
+ filter_8bit(x5, t2, t3, t6, t0); \
+ \
+ /* affine transformation for X2 */ \
+ filter_8bit(x3, t4, t5, t6, t0); \
+ /* affine transformation for X2 */ \
+ filter_8bit(x7, t4, t5, t6, t0); \
+ vaesdeclast t7, x3, x3; \
+ vaesdeclast t7, x7, x7;
+
+#define aria_diff_m(x0, x1, x2, x3, \
+ t0, t1, t2, t3) \
+ /* T = rotr32(X, 8); */ \
+ /* X ^= T */ \
+ vpxor x0, x3, t0; \
+ vpxor x1, x0, t1; \
+ vpxor x2, x1, t2; \
+ vpxor x3, x2, t3; \
+ /* X = T ^ rotr(X, 16); */ \
+ vpxor t2, x0, x0; \
+ vpxor x1, t3, t3; \
+ vpxor t0, x2, x2; \
+ vpxor t1, x3, x1; \
+ vmovdqu t3, x3;
+
+#define aria_diff_word(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7) \
+ /* t1 ^= t2; */ \
+ vpxor y0, x4, x4; \
+ vpxor y1, x5, x5; \
+ vpxor y2, x6, x6; \
+ vpxor y3, x7, x7; \
+ \
+ /* t2 ^= t3; */ \
+ vpxor y4, y0, y0; \
+ vpxor y5, y1, y1; \
+ vpxor y6, y2, y2; \
+ vpxor y7, y3, y3; \
+ \
+ /* t0 ^= t1; */ \
+ vpxor x4, x0, x0; \
+ vpxor x5, x1, x1; \
+ vpxor x6, x2, x2; \
+ vpxor x7, x3, x3; \
+ \
+ /* t3 ^= t1; */ \
+ vpxor x4, y4, y4; \
+ vpxor x5, y5, y5; \
+ vpxor x6, y6, y6; \
+ vpxor x7, y7, y7; \
+ \
+ /* t2 ^= t0; */ \
+ vpxor x0, y0, y0; \
+ vpxor x1, y1, y1; \
+ vpxor x2, y2, y2; \
+ vpxor x3, y3, y3; \
+ \
+ /* t1 ^= t2; */ \
+ vpxor y0, x4, x4; \
+ vpxor y1, x5, x5; \
+ vpxor y2, x6, x6; \
+ vpxor y3, x7, x7;
+
+#define aria_fe(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, rk, round) \
+ vpxor y7, y7, y7; \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, y7, y2, rk, 8, round); \
+ \
+ aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
+ y0, y1, y2, y3, y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 8); \
+ \
+ aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, y7, y2, rk, 0, round); \
+ \
+ aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
+ y0, y1, y2, y3, y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_load_state_8way(y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, 8); \
+ aria_diff_word(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ /* aria_diff_byte() \
+ * T3 = ABCD -> BADC \
+ * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
+ * T0 = ABCD -> CDAB \
+ * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
+ * T1 = ABCD -> DCBA \
+ * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
+ */ \
+ aria_diff_word(x2, x3, x0, x1, \
+ x7, x6, x5, x4, \
+ y0, y1, y2, y3, \
+ y5, y4, y7, y6); \
+ aria_store_state_8way(x3, x2, x1, x0, \
+ x6, x7, x4, x5, \
+ mem_tmp, 0);
+
+#define aria_fo(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, rk, round) \
+ vpxor y7, y7, y7; \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, y7, y2, rk, 8, round); \
+ \
+ aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, y1, y2, y3, y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 8); \
+ \
+ aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, y7, y2, rk, 0, round); \
+ \
+ aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, y1, y2, y3, y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_load_state_8way(y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, 8); \
+ aria_diff_word(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ /* aria_diff_byte() \
+ * T1 = ABCD -> BADC \
+ * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
+ * T2 = ABCD -> CDAB \
+ * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
+ * T3 = ABCD -> DCBA \
+ * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
+ */ \
+ aria_diff_word(x0, x1, x2, x3, \
+ x5, x4, x7, x6, \
+ y2, y3, y0, y1, \
+ y7, y6, y5, y4); \
+ aria_store_state_8way(x3, x2, x1, x0, \
+ x6, x7, x4, x5, \
+ mem_tmp, 0);
+
+#define aria_ff(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, rk, round, last_round) \
+ vpxor y7, y7, y7; \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, y7, y2, rk, 8, round); \
+ \
+ aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
+ y0, y1, y2, y3, y4, y5, y6, y7); \
+ \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, y7, y2, rk, 8, last_round); \
+ \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 8); \
+ \
+ aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, y7, y2, rk, 0, round); \
+ \
+ aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
+ y0, y1, y2, y3, y4, y5, y6, y7); \
+ \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, y7, y2, rk, 0, last_round); \
+ \
+ aria_load_state_8way(y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, 8);
+
+#define aria_fe_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, rk, round) \
+ vpxor y7, y7, y7; \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, y7, y2, rk, 8, round); \
+ \
+ aria_sbox_8way_gfni(x2, x3, x0, x1, \
+ x6, x7, x4, x5, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 8); \
+ \
+ aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, y7, y2, rk, 0, round); \
+ \
+ aria_sbox_8way_gfni(x2, x3, x0, x1, \
+ x6, x7, x4, x5, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_load_state_8way(y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, 8); \
+ aria_diff_word(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ /* aria_diff_byte() \
+ * T3 = ABCD -> BADC \
+ * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
+ * T0 = ABCD -> CDAB \
+ * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
+ * T1 = ABCD -> DCBA \
+ * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
+ */ \
+ aria_diff_word(x2, x3, x0, x1, \
+ x7, x6, x5, x4, \
+ y0, y1, y2, y3, \
+ y5, y4, y7, y6); \
+ aria_store_state_8way(x3, x2, x1, x0, \
+ x6, x7, x4, x5, \
+ mem_tmp, 0);
+
+#define aria_fo_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, rk, round) \
+ vpxor y7, y7, y7; \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, y7, y2, rk, 8, round); \
+ \
+ aria_sbox_8way_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 8); \
+ \
+ aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, y7, y2, rk, 0, round); \
+ \
+ aria_sbox_8way_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_load_state_8way(y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, 8); \
+ aria_diff_word(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ /* aria_diff_byte() \
+ * T1 = ABCD -> BADC \
+ * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
+ * T2 = ABCD -> CDAB \
+ * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
+ * T3 = ABCD -> DCBA \
+ * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
+ */ \
+ aria_diff_word(x0, x1, x2, x3, \
+ x5, x4, x7, x6, \
+ y2, y3, y0, y1, \
+ y7, y6, y5, y4); \
+ aria_store_state_8way(x3, x2, x1, x0, \
+ x6, x7, x4, x5, \
+ mem_tmp, 0);
+
+#define aria_ff_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, rk, round, last_round) \
+ vpxor y7, y7, y7; \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, y7, y2, rk, 8, round); \
+ \
+ aria_sbox_8way_gfni(x2, x3, x0, x1, \
+ x6, x7, x4, x5, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, y7, y2, rk, 8, last_round); \
+ \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 8); \
+ \
+ aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, y7, y2, rk, 0, round); \
+ \
+ aria_sbox_8way_gfni(x2, x3, x0, x1, \
+ x6, x7, x4, x5, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, y7, y2, rk, 0, last_round); \
+ \
+ aria_load_state_8way(y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, 8);
+
+/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
+.section .rodata.cst16, "aM", @progbits, 16
+.align 16
+
+#define SHUFB_BYTES(idx) \
+ 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
+
+.Lshufb_16x16b:
+ .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
+/* For isolating SubBytes from AESENCLAST, inverse shift row */
+.Linv_shift_row:
+ .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
+ .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
+.Lshift_row:
+ .byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
+ .byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+ .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
+ .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
+
+/* AES inverse affine and S2 combined:
+ * 1 1 0 0 0 0 0 1 x0 0
+ * 0 1 0 0 1 0 0 0 x1 0
+ * 1 1 0 0 1 1 1 1 x2 0
+ * 0 1 1 0 1 0 0 1 x3 1
+ * 0 1 0 0 1 1 0 0 * x4 + 0
+ * 0 1 0 1 1 0 0 0 x5 0
+ * 0 0 0 0 0 1 0 1 x6 0
+ * 1 1 1 0 0 1 1 1 x7 1
+ */
+.Ltf_lo__inv_aff__and__s2:
+ .octa 0x92172DA81A9FA520B2370D883ABF8500
+.Ltf_hi__inv_aff__and__s2:
+ .octa 0x2B15FFC1AF917B45E6D8320C625CB688
+
+/* X2 and AES forward affine combined:
+ * 1 0 1 1 0 0 0 1 x0 0
+ * 0 1 1 1 1 0 1 1 x1 0
+ * 0 0 0 1 1 0 1 0 x2 1
+ * 0 1 0 0 0 1 0 0 x3 0
+ * 0 0 1 1 1 0 1 1 * x4 + 0
+ * 0 1 0 0 1 0 0 0 x5 0
+ * 1 1 0 1 0 0 1 1 x6 0
+ * 0 1 0 0 1 0 1 0 x7 0
+ */
+.Ltf_lo__x2__and__fwd_aff:
+ .octa 0xEFAE0544FCBD1657B8F95213ABEA4100
+.Ltf_hi__x2__and__fwd_aff:
+ .octa 0x3F893781E95FE1576CDA64D2BA0CB204
+
+/* AES affine: */
+#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
+.Ltf_aff_bitmatrix:
+ .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
+ BV8(1, 1, 0, 0, 0, 1, 1, 1),
+ BV8(1, 1, 1, 0, 0, 0, 1, 1),
+ BV8(1, 1, 1, 1, 0, 0, 0, 1),
+ BV8(1, 1, 1, 1, 1, 0, 0, 0),
+ BV8(0, 1, 1, 1, 1, 1, 0, 0),
+ BV8(0, 0, 1, 1, 1, 1, 1, 0),
+ BV8(0, 0, 0, 1, 1, 1, 1, 1))
+ .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
+ BV8(1, 1, 0, 0, 0, 1, 1, 1),
+ BV8(1, 1, 1, 0, 0, 0, 1, 1),
+ BV8(1, 1, 1, 1, 0, 0, 0, 1),
+ BV8(1, 1, 1, 1, 1, 0, 0, 0),
+ BV8(0, 1, 1, 1, 1, 1, 0, 0),
+ BV8(0, 0, 1, 1, 1, 1, 1, 0),
+ BV8(0, 0, 0, 1, 1, 1, 1, 1))
+
+/* AES inverse affine: */
+#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
+.Ltf_inv_bitmatrix:
+ .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
+ BV8(1, 0, 0, 1, 0, 0, 1, 0),
+ BV8(0, 1, 0, 0, 1, 0, 0, 1),
+ BV8(1, 0, 1, 0, 0, 1, 0, 0),
+ BV8(0, 1, 0, 1, 0, 0, 1, 0),
+ BV8(0, 0, 1, 0, 1, 0, 0, 1),
+ BV8(1, 0, 0, 1, 0, 1, 0, 0),
+ BV8(0, 1, 0, 0, 1, 0, 1, 0))
+ .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
+ BV8(1, 0, 0, 1, 0, 0, 1, 0),
+ BV8(0, 1, 0, 0, 1, 0, 0, 1),
+ BV8(1, 0, 1, 0, 0, 1, 0, 0),
+ BV8(0, 1, 0, 1, 0, 0, 1, 0),
+ BV8(0, 0, 1, 0, 1, 0, 0, 1),
+ BV8(1, 0, 0, 1, 0, 1, 0, 0),
+ BV8(0, 1, 0, 0, 1, 0, 1, 0))
+
+/* S2: */
+#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
+.Ltf_s2_bitmatrix:
+ .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
+ BV8(0, 0, 1, 1, 1, 1, 1, 1),
+ BV8(1, 1, 1, 0, 1, 1, 0, 1),
+ BV8(1, 1, 0, 0, 0, 0, 1, 1),
+ BV8(0, 1, 0, 0, 0, 0, 1, 1),
+ BV8(1, 1, 0, 0, 1, 1, 1, 0),
+ BV8(0, 1, 1, 0, 0, 0, 1, 1),
+ BV8(1, 1, 1, 1, 0, 1, 1, 0))
+ .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
+ BV8(0, 0, 1, 1, 1, 1, 1, 1),
+ BV8(1, 1, 1, 0, 1, 1, 0, 1),
+ BV8(1, 1, 0, 0, 0, 0, 1, 1),
+ BV8(0, 1, 0, 0, 0, 0, 1, 1),
+ BV8(1, 1, 0, 0, 1, 1, 1, 0),
+ BV8(0, 1, 1, 0, 0, 0, 1, 1),
+ BV8(1, 1, 1, 1, 0, 1, 1, 0))
+
+/* X2: */
+#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
+.Ltf_x2_bitmatrix:
+ .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
+ BV8(0, 0, 1, 0, 0, 1, 1, 0),
+ BV8(0, 0, 0, 0, 1, 0, 1, 0),
+ BV8(1, 1, 1, 0, 0, 0, 1, 1),
+ BV8(1, 1, 1, 0, 1, 1, 0, 0),
+ BV8(0, 1, 1, 0, 1, 0, 1, 1),
+ BV8(1, 0, 1, 1, 1, 1, 0, 1),
+ BV8(1, 0, 0, 1, 0, 0, 1, 1))
+ .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
+ BV8(0, 0, 1, 0, 0, 1, 1, 0),
+ BV8(0, 0, 0, 0, 1, 0, 1, 0),
+ BV8(1, 1, 1, 0, 0, 0, 1, 1),
+ BV8(1, 1, 1, 0, 1, 1, 0, 0),
+ BV8(0, 1, 1, 0, 1, 0, 1, 1),
+ BV8(1, 0, 1, 1, 1, 1, 0, 1),
+ BV8(1, 0, 0, 1, 0, 0, 1, 1))
+
+/* Identity matrix: */
+.Ltf_id_bitmatrix:
+ .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
+ BV8(0, 1, 0, 0, 0, 0, 0, 0),
+ BV8(0, 0, 1, 0, 0, 0, 0, 0),
+ BV8(0, 0, 0, 1, 0, 0, 0, 0),
+ BV8(0, 0, 0, 0, 1, 0, 0, 0),
+ BV8(0, 0, 0, 0, 0, 1, 0, 0),
+ BV8(0, 0, 0, 0, 0, 0, 1, 0),
+ BV8(0, 0, 0, 0, 0, 0, 0, 1))
+ .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
+ BV8(0, 1, 0, 0, 0, 0, 0, 0),
+ BV8(0, 0, 1, 0, 0, 0, 0, 0),
+ BV8(0, 0, 0, 1, 0, 0, 0, 0),
+ BV8(0, 0, 0, 0, 1, 0, 0, 0),
+ BV8(0, 0, 0, 0, 0, 1, 0, 0),
+ BV8(0, 0, 0, 0, 0, 0, 1, 0),
+ BV8(0, 0, 0, 0, 0, 0, 0, 1))
+
+/* 4-bit mask */
+.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
+.align 4
+.L0f0f0f0f:
+ .long 0x0f0f0f0f
+
+.text
+
+SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way)
+ /* input:
+ * %r9: rk
+ * %rsi: dst
+ * %rdx: src
+ * %xmm0..%xmm15: 16 byte-sliced blocks
+ */
+
+ FRAME_BEGIN
+
+ movq %rsi, %rax;
+ leaq 8 * 16(%rax), %r8;
+
+ inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r8);
+ aria_fo(%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
+ %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rax, %r9, 0);
+ aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r9, 1);
+ aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
+ %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rax, %r9, 2);
+ aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r9, 3);
+ aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
+ %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rax, %r9, 4);
+ aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r9, 5);
+ aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
+ %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rax, %r9, 6);
+ aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r9, 7);
+ aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
+ %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rax, %r9, 8);
+ aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r9, 9);
+ aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
+ %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rax, %r9, 10);
+ cmpl $12, ARIA_CTX_rounds(CTX);
+ jne .Laria_192;
+ aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r9, 11, 12);
+ jmp .Laria_end;
+.Laria_192:
+ aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r9, 11);
+ aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
+ %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rax, %r9, 12);
+ cmpl $14, ARIA_CTX_rounds(CTX);
+ jne .Laria_256;
+ aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r9, 13, 14);
+ jmp .Laria_end;
+.Laria_256:
+ aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r9, 13);
+ aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
+ %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rax, %r9, 14);
+ aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r9, 15, 16);
+.Laria_end:
+ debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
+ %xmm9, %xmm13, %xmm0, %xmm5,
+ %xmm10, %xmm14, %xmm3, %xmm6,
+ %xmm11, %xmm15, %xmm2, %xmm7,
+ (%rax), (%r8));
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(__aria_aesni_avx_crypt_16way)
+
+SYM_TYPED_FUNC_START(aria_aesni_avx_encrypt_16way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ FRAME_BEGIN
+
+ leaq ARIA_CTX_enc_key(CTX), %r9;
+
+ inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rdx);
+
+ call __aria_aesni_avx_crypt_16way;
+
+ write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(aria_aesni_avx_encrypt_16way)
+
+SYM_TYPED_FUNC_START(aria_aesni_avx_decrypt_16way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ FRAME_BEGIN
+
+ leaq ARIA_CTX_dec_key(CTX), %r9;
+
+ inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rdx);
+
+ call __aria_aesni_avx_crypt_16way;
+
+ write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(aria_aesni_avx_decrypt_16way)
+
+SYM_FUNC_START_LOCAL(__aria_aesni_avx_ctr_gen_keystream_16way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: keystream
+ * %r8: iv (big endian, 128bit)
+ */
+
+ FRAME_BEGIN
+ /* load IV and byteswap */
+ vmovdqu (%r8), %xmm8;
+
+ vmovdqa .Lbswap128_mask (%rip), %xmm1;
+ vpshufb %xmm1, %xmm8, %xmm3; /* be => le */
+
+ vpcmpeqd %xmm0, %xmm0, %xmm0;
+ vpsrldq $8, %xmm0, %xmm0; /* low: -1, high: 0 */
+
+ /* construct IVs */
+ inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+ vpshufb %xmm1, %xmm3, %xmm9;
+ inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+ vpshufb %xmm1, %xmm3, %xmm10;
+ inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+ vpshufb %xmm1, %xmm3, %xmm11;
+ inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+ vpshufb %xmm1, %xmm3, %xmm12;
+ inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+ vpshufb %xmm1, %xmm3, %xmm13;
+ inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+ vpshufb %xmm1, %xmm3, %xmm14;
+ inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+ vpshufb %xmm1, %xmm3, %xmm15;
+ vmovdqu %xmm8, (0 * 16)(%rcx);
+ vmovdqu %xmm9, (1 * 16)(%rcx);
+ vmovdqu %xmm10, (2 * 16)(%rcx);
+ vmovdqu %xmm11, (3 * 16)(%rcx);
+ vmovdqu %xmm12, (4 * 16)(%rcx);
+ vmovdqu %xmm13, (5 * 16)(%rcx);
+ vmovdqu %xmm14, (6 * 16)(%rcx);
+ vmovdqu %xmm15, (7 * 16)(%rcx);
+
+ inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+ vpshufb %xmm1, %xmm3, %xmm8;
+ inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+ vpshufb %xmm1, %xmm3, %xmm9;
+ inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+ vpshufb %xmm1, %xmm3, %xmm10;
+ inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+ vpshufb %xmm1, %xmm3, %xmm11;
+ inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+ vpshufb %xmm1, %xmm3, %xmm12;
+ inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+ vpshufb %xmm1, %xmm3, %xmm13;
+ inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+ vpshufb %xmm1, %xmm3, %xmm14;
+ inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+ vpshufb %xmm1, %xmm3, %xmm15;
+ inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+ vpshufb %xmm1, %xmm3, %xmm4;
+ vmovdqu %xmm4, (%r8);
+
+ vmovdqu (0 * 16)(%rcx), %xmm0;
+ vmovdqu (1 * 16)(%rcx), %xmm1;
+ vmovdqu (2 * 16)(%rcx), %xmm2;
+ vmovdqu (3 * 16)(%rcx), %xmm3;
+ vmovdqu (4 * 16)(%rcx), %xmm4;
+ vmovdqu (5 * 16)(%rcx), %xmm5;
+ vmovdqu (6 * 16)(%rcx), %xmm6;
+ vmovdqu (7 * 16)(%rcx), %xmm7;
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(__aria_aesni_avx_ctr_gen_keystream_16way)
+
+SYM_TYPED_FUNC_START(aria_aesni_avx_ctr_crypt_16way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: keystream
+ * %r8: iv (big endian, 128bit)
+ */
+ FRAME_BEGIN
+
+ call __aria_aesni_avx_ctr_gen_keystream_16way;
+
+ leaq (%rsi), %r10;
+ leaq (%rdx), %r11;
+ leaq (%rcx), %rsi;
+ leaq (%rcx), %rdx;
+ leaq ARIA_CTX_enc_key(CTX), %r9;
+
+ call __aria_aesni_avx_crypt_16way;
+
+ vpxor (0 * 16)(%r11), %xmm1, %xmm1;
+ vpxor (1 * 16)(%r11), %xmm0, %xmm0;
+ vpxor (2 * 16)(%r11), %xmm3, %xmm3;
+ vpxor (3 * 16)(%r11), %xmm2, %xmm2;
+ vpxor (4 * 16)(%r11), %xmm4, %xmm4;
+ vpxor (5 * 16)(%r11), %xmm5, %xmm5;
+ vpxor (6 * 16)(%r11), %xmm6, %xmm6;
+ vpxor (7 * 16)(%r11), %xmm7, %xmm7;
+ vpxor (8 * 16)(%r11), %xmm8, %xmm8;
+ vpxor (9 * 16)(%r11), %xmm9, %xmm9;
+ vpxor (10 * 16)(%r11), %xmm10, %xmm10;
+ vpxor (11 * 16)(%r11), %xmm11, %xmm11;
+ vpxor (12 * 16)(%r11), %xmm12, %xmm12;
+ vpxor (13 * 16)(%r11), %xmm13, %xmm13;
+ vpxor (14 * 16)(%r11), %xmm14, %xmm14;
+ vpxor (15 * 16)(%r11), %xmm15, %xmm15;
+ write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %r10);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(aria_aesni_avx_ctr_crypt_16way)
+
+SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way)
+ /* input:
+ * %r9: rk
+ * %rsi: dst
+ * %rdx: src
+ * %xmm0..%xmm15: 16 byte-sliced blocks
+ */
+
+ FRAME_BEGIN
+
+ movq %rsi, %rax;
+ leaq 8 * 16(%rax), %r8;
+
+ inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3,
+ %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11,
+ %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r8);
+ aria_fo_gfni(%xmm8, %xmm9, %xmm10, %xmm11,
+ %xmm12, %xmm13, %xmm14, %xmm15,
+ %xmm0, %xmm1, %xmm2, %xmm3,
+ %xmm4, %xmm5, %xmm6, %xmm7,
+ %rax, %r9, 0);
+ aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
+ %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11,
+ %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r9, 1);
+ aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
+ %xmm12, %xmm13, %xmm14, %xmm15,
+ %xmm0, %xmm1, %xmm2, %xmm3,
+ %xmm4, %xmm5, %xmm6, %xmm7,
+ %rax, %r9, 2);
+ aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
+ %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11,
+ %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r9, 3);
+ aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
+ %xmm12, %xmm13, %xmm14, %xmm15,
+ %xmm0, %xmm1, %xmm2, %xmm3,
+ %xmm4, %xmm5, %xmm6, %xmm7,
+ %rax, %r9, 4);
+ aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
+ %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11,
+ %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r9, 5);
+ aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
+ %xmm12, %xmm13, %xmm14, %xmm15,
+ %xmm0, %xmm1, %xmm2, %xmm3,
+ %xmm4, %xmm5, %xmm6, %xmm7,
+ %rax, %r9, 6);
+ aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
+ %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11,
+ %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r9, 7);
+ aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
+ %xmm12, %xmm13, %xmm14, %xmm15,
+ %xmm0, %xmm1, %xmm2, %xmm3,
+ %xmm4, %xmm5, %xmm6, %xmm7,
+ %rax, %r9, 8);
+ aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
+ %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11,
+ %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r9, 9);
+ aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
+ %xmm12, %xmm13, %xmm14, %xmm15,
+ %xmm0, %xmm1, %xmm2, %xmm3,
+ %xmm4, %xmm5, %xmm6, %xmm7,
+ %rax, %r9, 10);
+ cmpl $12, ARIA_CTX_rounds(CTX);
+ jne .Laria_gfni_192;
+ aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r9, 11, 12);
+ jmp .Laria_gfni_end;
+.Laria_gfni_192:
+ aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
+ %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11,
+ %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r9, 11);
+ aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
+ %xmm12, %xmm13, %xmm14, %xmm15,
+ %xmm0, %xmm1, %xmm2, %xmm3,
+ %xmm4, %xmm5, %xmm6, %xmm7,
+ %rax, %r9, 12);
+ cmpl $14, ARIA_CTX_rounds(CTX);
+ jne .Laria_gfni_256;
+ aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
+ %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11,
+ %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r9, 13, 14);
+ jmp .Laria_gfni_end;
+.Laria_gfni_256:
+ aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
+ %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11,
+ %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r9, 13);
+ aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
+ %xmm12, %xmm13, %xmm14, %xmm15,
+ %xmm0, %xmm1, %xmm2, %xmm3,
+ %xmm4, %xmm5, %xmm6, %xmm7,
+ %rax, %r9, 14);
+ aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
+ %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11,
+ %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r9, 15, 16);
+.Laria_gfni_end:
+ debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
+ %xmm9, %xmm13, %xmm0, %xmm5,
+ %xmm10, %xmm14, %xmm3, %xmm6,
+ %xmm11, %xmm15, %xmm2, %xmm7,
+ (%rax), (%r8));
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(__aria_aesni_avx_gfni_crypt_16way)
+
+SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_encrypt_16way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ FRAME_BEGIN
+
+ leaq ARIA_CTX_enc_key(CTX), %r9;
+
+ inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rdx);
+
+ call __aria_aesni_avx_gfni_crypt_16way;
+
+ write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(aria_aesni_avx_gfni_encrypt_16way)
+
+SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_decrypt_16way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ FRAME_BEGIN
+
+ leaq ARIA_CTX_dec_key(CTX), %r9;
+
+ inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rdx);
+
+ call __aria_aesni_avx_gfni_crypt_16way;
+
+ write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(aria_aesni_avx_gfni_decrypt_16way)
+
+SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: keystream
+ * %r8: iv (big endian, 128bit)
+ */
+ FRAME_BEGIN
+
+ call __aria_aesni_avx_ctr_gen_keystream_16way
+
+ leaq (%rsi), %r10;
+ leaq (%rdx), %r11;
+ leaq (%rcx), %rsi;
+ leaq (%rcx), %rdx;
+ leaq ARIA_CTX_enc_key(CTX), %r9;
+
+ call __aria_aesni_avx_gfni_crypt_16way;
+
+ vpxor (0 * 16)(%r11), %xmm1, %xmm1;
+ vpxor (1 * 16)(%r11), %xmm0, %xmm0;
+ vpxor (2 * 16)(%r11), %xmm3, %xmm3;
+ vpxor (3 * 16)(%r11), %xmm2, %xmm2;
+ vpxor (4 * 16)(%r11), %xmm4, %xmm4;
+ vpxor (5 * 16)(%r11), %xmm5, %xmm5;
+ vpxor (6 * 16)(%r11), %xmm6, %xmm6;
+ vpxor (7 * 16)(%r11), %xmm7, %xmm7;
+ vpxor (8 * 16)(%r11), %xmm8, %xmm8;
+ vpxor (9 * 16)(%r11), %xmm9, %xmm9;
+ vpxor (10 * 16)(%r11), %xmm10, %xmm10;
+ vpxor (11 * 16)(%r11), %xmm11, %xmm11;
+ vpxor (12 * 16)(%r11), %xmm12, %xmm12;
+ vpxor (13 * 16)(%r11), %xmm13, %xmm13;
+ vpxor (14 * 16)(%r11), %xmm14, %xmm14;
+ vpxor (15 * 16)(%r11), %xmm15, %xmm15;
+ write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %r10);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(aria_aesni_avx_gfni_ctr_crypt_16way)
diff --git a/arch/x86/crypto/aria-aesni-avx2-asm_64.S b/arch/x86/crypto/aria-aesni-avx2-asm_64.S
new file mode 100644
index 000000000000..ed53d4f46bd7
--- /dev/null
+++ b/arch/x86/crypto/aria-aesni-avx2-asm_64.S
@@ -0,0 +1,1433 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * ARIA Cipher 32-way parallel algorithm (AVX2)
+ *
+ * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
+ *
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include <asm/asm-offsets.h>
+#include <linux/cfi_types.h>
+
+/* register macros */
+#define CTX %rdi
+
+#define ymm0_x xmm0
+#define ymm1_x xmm1
+#define ymm2_x xmm2
+#define ymm3_x xmm3
+#define ymm4_x xmm4
+#define ymm5_x xmm5
+#define ymm6_x xmm6
+#define ymm7_x xmm7
+#define ymm8_x xmm8
+#define ymm9_x xmm9
+#define ymm10_x xmm10
+#define ymm11_x xmm11
+#define ymm12_x xmm12
+#define ymm13_x xmm13
+#define ymm14_x xmm14
+#define ymm15_x xmm15
+
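+/* BV8() packs eight bits (a0 into bit 0) into one byte; it is used for
+ * the imm8 constants of the GFNI instructions (e.g. tf_aff_const =
+ * BV8(1, 1, 0, 0, 0, 1, 1, 0) = 0x63, the AES S-box affine constant)
+ * and, row by row via BM8X8() (row l0 in the most significant byte),
+ * for the 64-bit bit-matrices in the .Ltf_*_bitmatrix tables.
+ */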
+#define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \
+ ( (((a0) & 1) << 0) | \
+ (((a1) & 1) << 1) | \
+ (((a2) & 1) << 2) | \
+ (((a3) & 1) << 3) | \
+ (((a4) & 1) << 4) | \
+ (((a5) & 1) << 5) | \
+ (((a6) & 1) << 6) | \
+ (((a7) & 1) << 7) )
+
+#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \
+ ( ((l7) << (0 * 8)) | \
+ ((l6) << (1 * 8)) | \
+ ((l5) << (2 * 8)) | \
+ ((l4) << (3 * 8)) | \
+ ((l3) << (4 * 8)) | \
+ ((l2) << (5 * 8)) | \
+ ((l1) << (6 * 8)) | \
+ ((l0) << (7 * 8)) )
+
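+/* Increment a 128-bit little-endian counter in each 128-bit lane.
+ * minus_one must hold { -1, 0 } per lane: vpsubq then adds 1 to the
+ * low qword, and the vpcmpeqq/vpslldq pair turns a low-qword wrap
+ * (old value == ~0) into a +1 carry on the high qword.
+ */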
+#define inc_le128(x, minus_one, tmp) \
+ vpcmpeqq minus_one, x, tmp; \
+ vpsubq minus_one, x, x; \
+ vpslldq $8, tmp, tmp; \
+ vpsubq tmp, x, x;
+
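+/* Nibble-wise table lookup: tmp0 gets the low nibble of each byte and
+ * x the high nibble (shifted down); both index the 16-entry tables
+ * lo_t/hi_t via vpshufb and the results are XORed, evaluating the
+ * byte transforms stored in the .Ltf_* table pairs as two 4-bit lookups.
+ */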
+#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
+ vpand x, mask4bit, tmp0; \
+ vpandn x, mask4bit, x; \
+ vpsrld $4, x, x; \
+ \
+ vpshufb tmp0, lo_t, tmp0; \
+ vpshufb x, hi_t, x; \
+ vpxor tmp0, x, x;
+
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x1, x0, x0; \
+ \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x2; \
+ \
+ vpunpckhqdq t1, x0, x1; \
+ vpunpcklqdq t1, x0, x0; \
+ \
+ vpunpckhqdq x2, t2, x3; \
+ vpunpcklqdq x2, t2, x2;
+
+#define byteslice_16x16b(a0, b0, c0, d0, \
+ a1, b1, c1, d1, \
+ a2, b2, c2, d2, \
+ a3, b3, c3, d3, \
+ st0, st1) \
+ vmovdqu d2, st0; \
+ vmovdqu d3, st1; \
+ transpose_4x4(a0, a1, a2, a3, d2, d3); \
+ transpose_4x4(b0, b1, b2, b3, d2, d3); \
+ vmovdqu st0, d2; \
+ vmovdqu st1, d3; \
+ \
+ vmovdqu a0, st0; \
+ vmovdqu a1, st1; \
+ transpose_4x4(c0, c1, c2, c3, a0, a1); \
+ transpose_4x4(d0, d1, d2, d3, a0, a1); \
+ \
+ vbroadcasti128 .Lshufb_16x16b(%rip), a0; \
+ vmovdqu st1, a1; \
+ vpshufb a0, a2, a2; \
+ vpshufb a0, a3, a3; \
+ vpshufb a0, b0, b0; \
+ vpshufb a0, b1, b1; \
+ vpshufb a0, b2, b2; \
+ vpshufb a0, b3, b3; \
+ vpshufb a0, a1, a1; \
+ vpshufb a0, c0, c0; \
+ vpshufb a0, c1, c1; \
+ vpshufb a0, c2, c2; \
+ vpshufb a0, c3, c3; \
+ vpshufb a0, d0, d0; \
+ vpshufb a0, d1, d1; \
+ vpshufb a0, d2, d2; \
+ vpshufb a0, d3, d3; \
+ vmovdqu d3, st1; \
+ vmovdqu st0, d3; \
+ vpshufb a0, d3, a0; \
+ vmovdqu d2, st0; \
+ \
+ transpose_4x4(a0, b0, c0, d0, d2, d3); \
+ transpose_4x4(a1, b1, c1, d1, d2, d3); \
+ vmovdqu st0, d2; \
+ vmovdqu st1, d3; \
+ \
+ vmovdqu b0, st0; \
+ vmovdqu b1, st1; \
+ transpose_4x4(a2, b2, c2, d2, b0, b1); \
+ transpose_4x4(a3, b3, c3, d3, b0, b1); \
+ vmovdqu st0, b0; \
+ vmovdqu st1, b1; \
+ /* does not adjust output bytes inside vectors */
+
+#define debyteslice_16x16b(a0, b0, c0, d0, \
+ a1, b1, c1, d1, \
+ a2, b2, c2, d2, \
+ a3, b3, c3, d3, \
+ st0, st1) \
+ vmovdqu d2, st0; \
+ vmovdqu d3, st1; \
+ transpose_4x4(a0, a1, a2, a3, d2, d3); \
+ transpose_4x4(b0, b1, b2, b3, d2, d3); \
+ vmovdqu st0, d2; \
+ vmovdqu st1, d3; \
+ \
+ vmovdqu a0, st0; \
+ vmovdqu a1, st1; \
+ transpose_4x4(c0, c1, c2, c3, a0, a1); \
+ transpose_4x4(d0, d1, d2, d3, a0, a1); \
+ \
+ vbroadcasti128 .Lshufb_16x16b(%rip), a0; \
+ vmovdqu st1, a1; \
+ vpshufb a0, a2, a2; \
+ vpshufb a0, a3, a3; \
+ vpshufb a0, b0, b0; \
+ vpshufb a0, b1, b1; \
+ vpshufb a0, b2, b2; \
+ vpshufb a0, b3, b3; \
+ vpshufb a0, a1, a1; \
+ vpshufb a0, c0, c0; \
+ vpshufb a0, c1, c1; \
+ vpshufb a0, c2, c2; \
+ vpshufb a0, c3, c3; \
+ vpshufb a0, d0, d0; \
+ vpshufb a0, d1, d1; \
+ vpshufb a0, d2, d2; \
+ vpshufb a0, d3, d3; \
+ vmovdqu d3, st1; \
+ vmovdqu st0, d3; \
+ vpshufb a0, d3, a0; \
+ vmovdqu d2, st0; \
+ \
+ transpose_4x4(c0, d0, a0, b0, d2, d3); \
+ transpose_4x4(c1, d1, a1, b1, d2, d3); \
+ vmovdqu st0, d2; \
+ vmovdqu st1, d3; \
+ \
+ vmovdqu b0, st0; \
+ vmovdqu b1, st1; \
+ transpose_4x4(c2, d2, a2, b2, b0, b1); \
+ transpose_4x4(c3, d3, a3, b3, b0, b1); \
+ vmovdqu st0, b0; \
+ vmovdqu st1, b1; \
+ /* does not adjust output bytes inside vectors */
+
+/* load blocks to registers and apply pre-whitening */
+#define inpack16_pre(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ rio) \
+ vmovdqu (0 * 32)(rio), x0; \
+ vmovdqu (1 * 32)(rio), x1; \
+ vmovdqu (2 * 32)(rio), x2; \
+ vmovdqu (3 * 32)(rio), x3; \
+ vmovdqu (4 * 32)(rio), x4; \
+ vmovdqu (5 * 32)(rio), x5; \
+ vmovdqu (6 * 32)(rio), x6; \
+ vmovdqu (7 * 32)(rio), x7; \
+ vmovdqu (8 * 32)(rio), y0; \
+ vmovdqu (9 * 32)(rio), y1; \
+ vmovdqu (10 * 32)(rio), y2; \
+ vmovdqu (11 * 32)(rio), y3; \
+ vmovdqu (12 * 32)(rio), y4; \
+ vmovdqu (13 * 32)(rio), y5; \
+ vmovdqu (14 * 32)(rio), y6; \
+ vmovdqu (15 * 32)(rio), y7;
+
+/* byteslice pre-whitened blocks and store to temporary memory */
+#define inpack16_post(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_ab, mem_cd) \
+ byteslice_16x16b(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ (mem_ab), (mem_cd)); \
+ \
+ vmovdqu x0, 0 * 32(mem_ab); \
+ vmovdqu x1, 1 * 32(mem_ab); \
+ vmovdqu x2, 2 * 32(mem_ab); \
+ vmovdqu x3, 3 * 32(mem_ab); \
+ vmovdqu x4, 4 * 32(mem_ab); \
+ vmovdqu x5, 5 * 32(mem_ab); \
+ vmovdqu x6, 6 * 32(mem_ab); \
+ vmovdqu x7, 7 * 32(mem_ab); \
+ vmovdqu y0, 0 * 32(mem_cd); \
+ vmovdqu y1, 1 * 32(mem_cd); \
+ vmovdqu y2, 2 * 32(mem_cd); \
+ vmovdqu y3, 3 * 32(mem_cd); \
+ vmovdqu y4, 4 * 32(mem_cd); \
+ vmovdqu y5, 5 * 32(mem_cd); \
+ vmovdqu y6, 6 * 32(mem_cd); \
+ vmovdqu y7, 7 * 32(mem_cd);
+
+#define write_output(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem) \
+ vmovdqu x0, 0 * 32(mem); \
+ vmovdqu x1, 1 * 32(mem); \
+ vmovdqu x2, 2 * 32(mem); \
+ vmovdqu x3, 3 * 32(mem); \
+ vmovdqu x4, 4 * 32(mem); \
+ vmovdqu x5, 5 * 32(mem); \
+ vmovdqu x6, 6 * 32(mem); \
+ vmovdqu x7, 7 * 32(mem); \
+ vmovdqu y0, 8 * 32(mem); \
+ vmovdqu y1, 9 * 32(mem); \
+ vmovdqu y2, 10 * 32(mem); \
+ vmovdqu y3, 11 * 32(mem); \
+ vmovdqu y4, 12 * 32(mem); \
+ vmovdqu y5, 13 * 32(mem); \
+ vmovdqu y6, 14 * 32(mem); \
+ vmovdqu y7, 15 * 32(mem); \
+
+#define aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, idx) \
+ vmovdqu x0, ((idx + 0) * 32)(mem_tmp); \
+ vmovdqu x1, ((idx + 1) * 32)(mem_tmp); \
+ vmovdqu x2, ((idx + 2) * 32)(mem_tmp); \
+ vmovdqu x3, ((idx + 3) * 32)(mem_tmp); \
+ vmovdqu x4, ((idx + 4) * 32)(mem_tmp); \
+ vmovdqu x5, ((idx + 5) * 32)(mem_tmp); \
+ vmovdqu x6, ((idx + 6) * 32)(mem_tmp); \
+ vmovdqu x7, ((idx + 7) * 32)(mem_tmp);
+
+#define aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, idx) \
+ vmovdqu ((idx + 0) * 32)(mem_tmp), x0; \
+ vmovdqu ((idx + 1) * 32)(mem_tmp), x1; \
+ vmovdqu ((idx + 2) * 32)(mem_tmp), x2; \
+ vmovdqu ((idx + 3) * 32)(mem_tmp), x3; \
+ vmovdqu ((idx + 4) * 32)(mem_tmp), x4; \
+ vmovdqu ((idx + 5) * 32)(mem_tmp), x5; \
+ vmovdqu ((idx + 6) * 32)(mem_tmp), x6; \
+ vmovdqu ((idx + 7) * 32)(mem_tmp), x7;
+
+#define aria_ark_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ t0, rk, idx, round) \
+ /* AddRoundKey */ \
+ vpbroadcastb ((round * 16) + idx + 3)(rk), t0; \
+ vpxor t0, x0, x0; \
+ vpbroadcastb ((round * 16) + idx + 2)(rk), t0; \
+ vpxor t0, x1, x1; \
+ vpbroadcastb ((round * 16) + idx + 1)(rk), t0; \
+ vpxor t0, x2, x2; \
+ vpbroadcastb ((round * 16) + idx + 0)(rk), t0; \
+ vpxor t0, x3, x3; \
+ vpbroadcastb ((round * 16) + idx + 7)(rk), t0; \
+ vpxor t0, x4, x4; \
+ vpbroadcastb ((round * 16) + idx + 6)(rk), t0; \
+ vpxor t0, x5, x5; \
+ vpbroadcastb ((round * 16) + idx + 5)(rk), t0; \
+ vpxor t0, x6, x6; \
+ vpbroadcastb ((round * 16) + idx + 4)(rk), t0; \
+ vpxor t0, x7, x7;
+
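+/* ARIA's four S-boxes evaluated with GFNI: S1 and S2 as a GF(2^8)
+ * inversion followed by an affine bit-matrix (vgf2p8affineinvqb with
+ * the .Ltf_aff/.Ltf_s2 matrices), and their inverses as an affine
+ * bit-matrix followed by an inversion (vgf2p8affineqb with the
+ * .Ltf_inv/.Ltf_x2 matrices, then vgf2p8affineinvqb with the identity
+ * matrix).
+ */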
+#define aria_sbox_8way_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ t0, t1, t2, t3, \
+ t4, t5, t6, t7) \
+ vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0; \
+ vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1; \
+ vpbroadcastq .Ltf_id_bitmatrix(%rip), t2; \
+ vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3; \
+ vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4; \
+ vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
+ vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
+ vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
+ vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \
+ vgf2p8affineinvqb $0, t2, x2, x2; \
+ vgf2p8affineinvqb $0, t2, x6, x6; \
+ vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \
+ vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \
+ vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \
+ vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \
+ vgf2p8affineinvqb $0, t2, x3, x3; \
+ vgf2p8affineinvqb $0, t2, x7, x7
+
+#define aria_sbox_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ t0, t1, t2, t3, \
+ t4, t5, t6, t7) \
+ vpxor t7, t7, t7; \
+ vpxor t6, t6, t6; \
+ vbroadcasti128 .Linv_shift_row(%rip), t0; \
+ vbroadcasti128 .Lshift_row(%rip), t1; \
+ vbroadcasti128 .Ltf_lo__inv_aff__and__s2(%rip), t2; \
+ vbroadcasti128 .Ltf_hi__inv_aff__and__s2(%rip), t3; \
+ vbroadcasti128 .Ltf_lo__x2__and__fwd_aff(%rip), t4; \
+ vbroadcasti128 .Ltf_hi__x2__and__fwd_aff(%rip), t5; \
+ \
+ vextracti128 $1, x0, t6##_x; \
+ vaesenclast t7##_x, x0##_x, x0##_x; \
+ vaesenclast t7##_x, t6##_x, t6##_x; \
+ vinserti128 $1, t6##_x, x0, x0; \
+ \
+ vextracti128 $1, x4, t6##_x; \
+ vaesenclast t7##_x, x4##_x, x4##_x; \
+ vaesenclast t7##_x, t6##_x, t6##_x; \
+ vinserti128 $1, t6##_x, x4, x4; \
+ \
+ vextracti128 $1, x1, t6##_x; \
+ vaesenclast t7##_x, x1##_x, x1##_x; \
+ vaesenclast t7##_x, t6##_x, t6##_x; \
+ vinserti128 $1, t6##_x, x1, x1; \
+ \
+ vextracti128 $1, x5, t6##_x; \
+ vaesenclast t7##_x, x5##_x, x5##_x; \
+ vaesenclast t7##_x, t6##_x, t6##_x; \
+ vinserti128 $1, t6##_x, x5, x5; \
+ \
+ vextracti128 $1, x2, t6##_x; \
+ vaesdeclast t7##_x, x2##_x, x2##_x; \
+ vaesdeclast t7##_x, t6##_x, t6##_x; \
+ vinserti128 $1, t6##_x, x2, x2; \
+ \
+ vextracti128 $1, x6, t6##_x; \
+ vaesdeclast t7##_x, x6##_x, x6##_x; \
+ vaesdeclast t7##_x, t6##_x, t6##_x; \
+ vinserti128 $1, t6##_x, x6, x6; \
+ \
+ vpbroadcastd .L0f0f0f0f(%rip), t6; \
+ \
+ /* AES inverse shift rows */ \
+ vpshufb t0, x0, x0; \
+ vpshufb t0, x4, x4; \
+ vpshufb t0, x1, x1; \
+ vpshufb t0, x5, x5; \
+ vpshufb t1, x3, x3; \
+ vpshufb t1, x7, x7; \
+ vpshufb t1, x2, x2; \
+ vpshufb t1, x6, x6; \
+ \
+ /* affine transformation for S2 */ \
+ filter_8bit(x1, t2, t3, t6, t0); \
+ /* affine transformation for S2 */ \
+ filter_8bit(x5, t2, t3, t6, t0); \
+ \
+ /* affine transformation for X2 */ \
+ filter_8bit(x3, t4, t5, t6, t0); \
+ /* affine transformation for X2 */ \
+ filter_8bit(x7, t4, t5, t6, t0); \
+ \
+ vpxor t6, t6, t6; \
+ vextracti128 $1, x3, t6##_x; \
+ vaesdeclast t7##_x, x3##_x, x3##_x; \
+ vaesdeclast t7##_x, t6##_x, t6##_x; \
+ vinserti128 $1, t6##_x, x3, x3; \
+ \
+ vextracti128 $1, x7, t6##_x; \
+ vaesdeclast t7##_x, x7##_x, x7##_x; \
+ vaesdeclast t7##_x, t6##_x, t6##_x; \
+ vinserti128 $1, t6##_x, x7, x7; \
+
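+/* The state is byte-sliced (x0..x3 hold bytes 0..3 of each 32-bit
+ * word), so the rotations below cost nothing: rotr32 by 8 or 16 only
+ * changes which slice register pairs up with which in the XORs.
+ */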
+#define aria_diff_m(x0, x1, x2, x3, \
+ t0, t1, t2, t3) \
+ /* T = rotr32(X, 8); */ \
+ /* X ^= T */ \
+ vpxor x0, x3, t0; \
+ vpxor x1, x0, t1; \
+ vpxor x2, x1, t2; \
+ vpxor x3, x2, t3; \
+ /* X = T ^ rotr(X, 16); */ \
+ vpxor t2, x0, x0; \
+ vpxor x1, t3, t3; \
+ vpxor t0, x2, x2; \
+ vpxor t1, x3, x1; \
+ vmovdqu t3, x3;
+
+#define aria_diff_word(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7) \
+ /* t1 ^= t2; */ \
+ vpxor y0, x4, x4; \
+ vpxor y1, x5, x5; \
+ vpxor y2, x6, x6; \
+ vpxor y3, x7, x7; \
+ \
+ /* t2 ^= t3; */ \
+ vpxor y4, y0, y0; \
+ vpxor y5, y1, y1; \
+ vpxor y6, y2, y2; \
+ vpxor y7, y3, y3; \
+ \
+ /* t0 ^= t1; */ \
+ vpxor x4, x0, x0; \
+ vpxor x5, x1, x1; \
+ vpxor x6, x2, x2; \
+ vpxor x7, x3, x3; \
+ \
+ /* t3 ^= t1; */ \
+ vpxor x4, y4, y4; \
+ vpxor x5, y5, y5; \
+ vpxor x6, y6, y6; \
+ vpxor x7, y7, y7; \
+ \
+ /* t2 ^= t0; */ \
+ vpxor x0, y0, y0; \
+ vpxor x1, y1, y1; \
+ vpxor x2, y2, y2; \
+ vpxor x3, y3, y3; \
+ \
+ /* t1 ^= t2; */ \
+ vpxor y0, x4, x4; \
+ vpxor y1, x5, x5; \
+ vpxor y2, x6, x6; \
+ vpxor y3, x7, x7;
+
+#define aria_fe(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, rk, round) \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 8, round); \
+ \
+ aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
+ y0, y1, y2, y3, y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 8); \
+ \
+ aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 0, round); \
+ \
+ aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
+ y0, y1, y2, y3, y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_load_state_8way(y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, 8); \
+ aria_diff_word(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ /* aria_diff_byte() \
+ * T3 = ABCD -> BADC \
+ * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
+ * T0 = ABCD -> CDAB \
+ * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
+ * T1 = ABCD -> DCBA \
+ * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
+ */ \
+ aria_diff_word(x2, x3, x0, x1, \
+ x7, x6, x5, x4, \
+ y0, y1, y2, y3, \
+ y5, y4, y7, y6); \
+ aria_store_state_8way(x3, x2, x1, x0, \
+ x6, x7, x4, x5, \
+ mem_tmp, 0);
+
+#define aria_fo(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, rk, round) \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 8, round); \
+ \
+ aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, y1, y2, y3, y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 8); \
+ \
+ aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 0, round); \
+ \
+ aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, y1, y2, y3, y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_load_state_8way(y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, 8); \
+ aria_diff_word(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ /* aria_diff_byte() \
+ * T1 = ABCD -> BADC \
+ * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
+ * T2 = ABCD -> CDAB \
+ * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
+ * T3 = ABCD -> DCBA \
+ * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
+ */ \
+ aria_diff_word(x0, x1, x2, x3, \
+ x5, x4, x7, x6, \
+ y2, y3, y0, y1, \
+ y7, y6, y5, y4); \
+ aria_store_state_8way(x3, x2, x1, x0, \
+ x6, x7, x4, x5, \
+ mem_tmp, 0);
+
+#define aria_ff(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, rk, round, last_round) \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 8, round); \
+ \
+ aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
+ y0, y1, y2, y3, y4, y5, y6, y7); \
+ \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 8, last_round); \
+ \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 8); \
+ \
+ aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 0, round); \
+ \
+ aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
+ y0, y1, y2, y3, y4, y5, y6, y7); \
+ \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 0, last_round); \
+ \
+ aria_load_state_8way(y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, 8);
+
+#define aria_fe_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, rk, round) \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 8, round); \
+ \
+ aria_sbox_8way_gfni(x2, x3, x0, x1, \
+ x6, x7, x4, x5, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 8); \
+ \
+ aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 0, round); \
+ \
+ aria_sbox_8way_gfni(x2, x3, x0, x1, \
+ x6, x7, x4, x5, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_load_state_8way(y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, 8); \
+ aria_diff_word(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ /* aria_diff_byte() \
+ * T3 = ABCD -> BADC \
+ * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
+ * T0 = ABCD -> CDAB \
+ * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
+ * T1 = ABCD -> DCBA \
+ * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
+ */ \
+ aria_diff_word(x2, x3, x0, x1, \
+ x7, x6, x5, x4, \
+ y0, y1, y2, y3, \
+ y5, y4, y7, y6); \
+ aria_store_state_8way(x3, x2, x1, x0, \
+ x6, x7, x4, x5, \
+ mem_tmp, 0);
+
+#define aria_fo_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, rk, round) \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 8, round); \
+ \
+ aria_sbox_8way_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 8); \
+ \
+ aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 0, round); \
+ \
+ aria_sbox_8way_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_load_state_8way(y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, 8); \
+ aria_diff_word(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ /* aria_diff_byte() \
+ * T1 = ABCD -> BADC \
+ * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
+ * T2 = ABCD -> CDAB \
+ * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
+ * T3 = ABCD -> DCBA \
+ * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
+ */ \
+ aria_diff_word(x0, x1, x2, x3, \
+ x5, x4, x7, x6, \
+ y2, y3, y0, y1, \
+ y7, y6, y5, y4); \
+ aria_store_state_8way(x3, x2, x1, x0, \
+ x6, x7, x4, x5, \
+ mem_tmp, 0);
+
+#define aria_ff_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, rk, round, last_round) \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 8, round); \
+ \
+ aria_sbox_8way_gfni(x2, x3, x0, x1, \
+ x6, x7, x4, x5, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 8, last_round); \
+ \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 8); \
+ \
+ aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 0, round); \
+ \
+ aria_sbox_8way_gfni(x2, x3, x0, x1, \
+ x6, x7, x4, x5, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 0, last_round); \
+ \
+ aria_load_state_8way(y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, 8);
+
+.section .rodata.cst32.shufb_16x16b, "aM", @progbits, 32
+.align 32
+#define SHUFB_BYTES(idx) \
+ 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
+.Lshufb_16x16b:
+ .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+ .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+
+.section .rodata.cst16, "aM", @progbits, 16
+.align 16
+/* For isolating SubBytes from AESENCLAST, inverse shift row */
+.Linv_shift_row:
+ .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
+ .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
+.Lshift_row:
+ .byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
+ .byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+ .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
+ .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
+
+/* AES inverse affine and S2 combined:
+ * 1 1 0 0 0 0 0 1 x0 0
+ * 0 1 0 0 1 0 0 0 x1 0
+ * 1 1 0 0 1 1 1 1 x2 0
+ * 0 1 1 0 1 0 0 1 x3 1
+ * 0 1 0 0 1 1 0 0 * x4 + 0
+ * 0 1 0 1 1 0 0 0 x5 0
+ * 0 0 0 0 0 1 0 1 x6 0
+ * 1 1 1 0 0 1 1 1 x7 1
+ */
+.Ltf_lo__inv_aff__and__s2:
+ .octa 0x92172DA81A9FA520B2370D883ABF8500
+.Ltf_hi__inv_aff__and__s2:
+ .octa 0x2B15FFC1AF917B45E6D8320C625CB688
+
+/* X2 and AES forward affine combined:
+ * 1 0 1 1 0 0 0 1 x0 0
+ * 0 1 1 1 1 0 1 1 x1 0
+ * 0 0 0 1 1 0 1 0 x2 1
+ * 0 1 0 0 0 1 0 0 x3 0
+ * 0 0 1 1 1 0 1 1 * x4 + 0
+ * 0 1 0 0 1 0 0 0 x5 0
+ * 1 1 0 1 0 0 1 1 x6 0
+ * 0 1 0 0 1 0 1 0 x7 0
+ */
+.Ltf_lo__x2__and__fwd_aff:
+ .octa 0xEFAE0544FCBD1657B8F95213ABEA4100
+.Ltf_hi__x2__and__fwd_aff:
+ .octa 0x3F893781E95FE1576CDA64D2BA0CB204
+
+.section .rodata.cst8, "aM", @progbits, 8
+.align 8
+/* AES affine: */
+#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
+.Ltf_aff_bitmatrix:
+ .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
+ BV8(1, 1, 0, 0, 0, 1, 1, 1),
+ BV8(1, 1, 1, 0, 0, 0, 1, 1),
+ BV8(1, 1, 1, 1, 0, 0, 0, 1),
+ BV8(1, 1, 1, 1, 1, 0, 0, 0),
+ BV8(0, 1, 1, 1, 1, 1, 0, 0),
+ BV8(0, 0, 1, 1, 1, 1, 1, 0),
+ BV8(0, 0, 0, 1, 1, 1, 1, 1))
+
+/* AES inverse affine: */
+#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
+.Ltf_inv_bitmatrix:
+ .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
+ BV8(1, 0, 0, 1, 0, 0, 1, 0),
+ BV8(0, 1, 0, 0, 1, 0, 0, 1),
+ BV8(1, 0, 1, 0, 0, 1, 0, 0),
+ BV8(0, 1, 0, 1, 0, 0, 1, 0),
+ BV8(0, 0, 1, 0, 1, 0, 0, 1),
+ BV8(1, 0, 0, 1, 0, 1, 0, 0),
+ BV8(0, 1, 0, 0, 1, 0, 1, 0))
+
+/* S2: */
+#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
+.Ltf_s2_bitmatrix:
+ .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
+ BV8(0, 0, 1, 1, 1, 1, 1, 1),
+ BV8(1, 1, 1, 0, 1, 1, 0, 1),
+ BV8(1, 1, 0, 0, 0, 0, 1, 1),
+ BV8(0, 1, 0, 0, 0, 0, 1, 1),
+ BV8(1, 1, 0, 0, 1, 1, 1, 0),
+ BV8(0, 1, 1, 0, 0, 0, 1, 1),
+ BV8(1, 1, 1, 1, 0, 1, 1, 0))
+
+/* X2: */
+#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
+.Ltf_x2_bitmatrix:
+ .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
+ BV8(0, 0, 1, 0, 0, 1, 1, 0),
+ BV8(0, 0, 0, 0, 1, 0, 1, 0),
+ BV8(1, 1, 1, 0, 0, 0, 1, 1),
+ BV8(1, 1, 1, 0, 1, 1, 0, 0),
+ BV8(0, 1, 1, 0, 1, 0, 1, 1),
+ BV8(1, 0, 1, 1, 1, 1, 0, 1),
+ BV8(1, 0, 0, 1, 0, 0, 1, 1))
+
+/* Identity matrix: */
+.Ltf_id_bitmatrix:
+ .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
+ BV8(0, 1, 0, 0, 0, 0, 0, 0),
+ BV8(0, 0, 1, 0, 0, 0, 0, 0),
+ BV8(0, 0, 0, 1, 0, 0, 0, 0),
+ BV8(0, 0, 0, 0, 1, 0, 0, 0),
+ BV8(0, 0, 0, 0, 0, 1, 0, 0),
+ BV8(0, 0, 0, 0, 0, 0, 1, 0),
+ BV8(0, 0, 0, 0, 0, 0, 0, 1))
+
+/* 4-bit mask */
+.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
+.align 4
+.L0f0f0f0f:
+ .long 0x0f0f0f0f
+
+.text
+
+SYM_FUNC_START_LOCAL(__aria_aesni_avx2_crypt_32way)
+ /* input:
+ * %r9: rk
+ * %rsi: dst
+ * %rdx: src
+ * %ymm0..%ymm15: byte-sliced blocks
+ */
+
+ FRAME_BEGIN
+
+ movq %rsi, %rax;
+ leaq 8 * 32(%rax), %r8;
+
+ inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r8);
+ aria_fo(%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 0);
+ aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 1);
+ aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 2);
+ aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 3);
+ aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 4);
+ aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 5);
+ aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 6);
+ aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 7);
+ aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 8);
+ aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 9);
+ aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 10);
+ cmpl $12, ARIA_CTX_rounds(CTX);
+ jne .Laria_192;
+ aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 11, 12);
+ jmp .Laria_end;
+.Laria_192:
+ aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 11);
+ aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 12);
+ cmpl $14, ARIA_CTX_rounds(CTX);
+ jne .Laria_256;
+ aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 13, 14);
+ jmp .Laria_end;
+.Laria_256:
+ aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 13);
+ aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 14);
+ aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 15, 16);
+.Laria_end:
+ debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
+ %ymm9, %ymm13, %ymm0, %ymm5,
+ %ymm10, %ymm14, %ymm3, %ymm6,
+ %ymm11, %ymm15, %ymm2, %ymm7,
+ (%rax), (%r8));
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(__aria_aesni_avx2_crypt_32way)
+
+SYM_TYPED_FUNC_START(aria_aesni_avx2_encrypt_32way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ FRAME_BEGIN
+
+ leaq ARIA_CTX_enc_key(CTX), %r9;
+
+ inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rdx);
+
+ call __aria_aesni_avx2_crypt_32way;
+
+ write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(aria_aesni_avx2_encrypt_32way)
+
+SYM_TYPED_FUNC_START(aria_aesni_avx2_decrypt_32way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ FRAME_BEGIN
+
+ leaq ARIA_CTX_dec_key(CTX), %r9;
+
+ inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rdx);
+
+ call __aria_aesni_avx2_crypt_32way;
+
+ write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(aria_aesni_avx2_decrypt_32way)
+
+SYM_FUNC_START_LOCAL(__aria_aesni_avx2_ctr_gen_keystream_32way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: keystream
+ * %r8: iv (big endian, 128bit)
+ */
+
+ FRAME_BEGIN
+ movq 8(%r8), %r11;
+ bswapq %r11;
+
+ vbroadcasti128 .Lbswap128_mask (%rip), %ymm6;
+ vpcmpeqd %ymm0, %ymm0, %ymm0;
+ vpsrldq $8, %ymm0, %ymm0; /* ab: -1:0 ; cd: -1:0 */
+ vpaddq %ymm0, %ymm0, %ymm5; /* ab: -2:0 ; cd: -2:0 */
+
+ /* load IV and byteswap */
+ vmovdqu (%r8), %xmm7;
+ vpshufb %xmm6, %xmm7, %xmm7;
+ vmovdqa %xmm7, %xmm3;
+ inc_le128(%xmm7, %xmm0, %xmm4);
+ vinserti128 $1, %xmm7, %ymm3, %ymm3;
+ vpshufb %ymm6, %ymm3, %ymm8; /* +1 ; +0 */
+
+ /* check need for handling 64-bit overflow and carry */
+ cmpq $(0xffffffffffffffff - 32), %r11;
+ ja .Lhandle_ctr_carry;
+
+ /* construct IVs */
+ vpsubq %ymm5, %ymm3, %ymm3; /* +3 ; +2 */
+ vpshufb %ymm6, %ymm3, %ymm9;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +5 ; +4 */
+ vpshufb %ymm6, %ymm3, %ymm10;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +7 ; +6 */
+ vpshufb %ymm6, %ymm3, %ymm11;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +9 ; +8 */
+ vpshufb %ymm6, %ymm3, %ymm12;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +11 ; +10 */
+ vpshufb %ymm6, %ymm3, %ymm13;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +13 ; +12 */
+ vpshufb %ymm6, %ymm3, %ymm14;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +15 ; +14 */
+ vpshufb %ymm6, %ymm3, %ymm15;
+ vmovdqu %ymm8, (0 * 32)(%rcx);
+ vmovdqu %ymm9, (1 * 32)(%rcx);
+ vmovdqu %ymm10, (2 * 32)(%rcx);
+ vmovdqu %ymm11, (3 * 32)(%rcx);
+ vmovdqu %ymm12, (4 * 32)(%rcx);
+ vmovdqu %ymm13, (5 * 32)(%rcx);
+ vmovdqu %ymm14, (6 * 32)(%rcx);
+ vmovdqu %ymm15, (7 * 32)(%rcx);
+
+ vpsubq %ymm5, %ymm3, %ymm3; /* +17 ; +16 */
+ vpshufb %ymm6, %ymm3, %ymm8;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +19 ; +18 */
+ vpshufb %ymm6, %ymm3, %ymm9;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +21 ; +20 */
+ vpshufb %ymm6, %ymm3, %ymm10;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +23 ; +22 */
+ vpshufb %ymm6, %ymm3, %ymm11;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +25 ; +24 */
+ vpshufb %ymm6, %ymm3, %ymm12;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +27 ; +26 */
+ vpshufb %ymm6, %ymm3, %ymm13;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +29 ; +28 */
+ vpshufb %ymm6, %ymm3, %ymm14;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +31 ; +30 */
+ vpshufb %ymm6, %ymm3, %ymm15;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +32 */
+ vpshufb %xmm6, %xmm3, %xmm3;
+ vmovdqu %xmm3, (%r8);
+ vmovdqu (0 * 32)(%rcx), %ymm0;
+ vmovdqu (1 * 32)(%rcx), %ymm1;
+ vmovdqu (2 * 32)(%rcx), %ymm2;
+ vmovdqu (3 * 32)(%rcx), %ymm3;
+ vmovdqu (4 * 32)(%rcx), %ymm4;
+ vmovdqu (5 * 32)(%rcx), %ymm5;
+ vmovdqu (6 * 32)(%rcx), %ymm6;
+ vmovdqu (7 * 32)(%rcx), %ymm7;
+ jmp .Lctr_carry_done;
+
+.Lhandle_ctr_carry:
+ /* construct IVs */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm9; /* +3 ; +2 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm10; /* +5 ; +4 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm11; /* +7 ; +6 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm12; /* +9 ; +8 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm13; /* +11 ; +10 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm14; /* +13 ; +12 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm15; /* +15 ; +14 */
+ vmovdqu %ymm8, (0 * 32)(%rcx);
+ vmovdqu %ymm9, (1 * 32)(%rcx);
+ vmovdqu %ymm10, (2 * 32)(%rcx);
+ vmovdqu %ymm11, (3 * 32)(%rcx);
+ vmovdqu %ymm12, (4 * 32)(%rcx);
+ vmovdqu %ymm13, (5 * 32)(%rcx);
+ vmovdqu %ymm14, (6 * 32)(%rcx);
+ vmovdqu %ymm15, (7 * 32)(%rcx);
+
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm8; /* +17 ; +16 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm9; /* +19 ; +18 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm10; /* +21 ; +20 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm11; /* +23 ; +22 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm12; /* +25 ; +24 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm13; /* +27 ; +26 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm14; /* +29 ; +28 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm15; /* +31 ; +30 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vextracti128 $1, %ymm3, %xmm3;
+ vpshufb %xmm6, %xmm3, %xmm3; /* +32 */
+ vmovdqu %xmm3, (%r8);
+ vmovdqu (0 * 32)(%rcx), %ymm0;
+ vmovdqu (1 * 32)(%rcx), %ymm1;
+ vmovdqu (2 * 32)(%rcx), %ymm2;
+ vmovdqu (3 * 32)(%rcx), %ymm3;
+ vmovdqu (4 * 32)(%rcx), %ymm4;
+ vmovdqu (5 * 32)(%rcx), %ymm5;
+ vmovdqu (6 * 32)(%rcx), %ymm6;
+ vmovdqu (7 * 32)(%rcx), %ymm7;
+
+.Lctr_carry_done:
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(__aria_aesni_avx2_ctr_gen_keystream_32way)
+
+SYM_TYPED_FUNC_START(aria_aesni_avx2_ctr_crypt_32way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: keystream
+ * %r8: iv (big endian, 128bit)
+ */
+ FRAME_BEGIN
+
+ call __aria_aesni_avx2_ctr_gen_keystream_32way;
+
+ leaq (%rsi), %r10;
+ leaq (%rdx), %r11;
+ leaq (%rcx), %rsi;
+ leaq (%rcx), %rdx;
+ leaq ARIA_CTX_enc_key(CTX), %r9;
+
+ call __aria_aesni_avx2_crypt_32way;
+
+ vpxor (0 * 32)(%r11), %ymm1, %ymm1;
+ vpxor (1 * 32)(%r11), %ymm0, %ymm0;
+ vpxor (2 * 32)(%r11), %ymm3, %ymm3;
+ vpxor (3 * 32)(%r11), %ymm2, %ymm2;
+ vpxor (4 * 32)(%r11), %ymm4, %ymm4;
+ vpxor (5 * 32)(%r11), %ymm5, %ymm5;
+ vpxor (6 * 32)(%r11), %ymm6, %ymm6;
+ vpxor (7 * 32)(%r11), %ymm7, %ymm7;
+ vpxor (8 * 32)(%r11), %ymm8, %ymm8;
+ vpxor (9 * 32)(%r11), %ymm9, %ymm9;
+ vpxor (10 * 32)(%r11), %ymm10, %ymm10;
+ vpxor (11 * 32)(%r11), %ymm11, %ymm11;
+ vpxor (12 * 32)(%r11), %ymm12, %ymm12;
+ vpxor (13 * 32)(%r11), %ymm13, %ymm13;
+ vpxor (14 * 32)(%r11), %ymm14, %ymm14;
+ vpxor (15 * 32)(%r11), %ymm15, %ymm15;
+ write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %r10);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(aria_aesni_avx2_ctr_crypt_32way)
+
+SYM_FUNC_START_LOCAL(__aria_aesni_avx2_gfni_crypt_32way)
+ /* input:
+ * %r9: rk
+ * %rsi: dst
+ * %rdx: src
+ * %ymm0..%ymm15: 16 byte-sliced blocks
+ */
+
+ FRAME_BEGIN
+
+ movq %rsi, %rax;
+ leaq 8 * 32(%rax), %r8;
+
+ inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11,
+ %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r8);
+ aria_fo_gfni(%ymm8, %ymm9, %ymm10, %ymm11,
+ %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 0);
+ aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11,
+ %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 1);
+ aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
+ %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 2);
+ aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11,
+ %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 3);
+ aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
+ %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 4);
+ aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11,
+ %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 5);
+ aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
+ %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 6);
+ aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11,
+ %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 7);
+ aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
+ %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 8);
+ aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11,
+ %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 9);
+ aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
+ %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 10);
+ cmpl $12, ARIA_CTX_rounds(CTX);
+ jne .Laria_gfni_192;
+ aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 11, 12);
+ jmp .Laria_gfni_end;
+.Laria_gfni_192:
+ aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11,
+ %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 11);
+ aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
+ %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 12);
+ cmpl $14, ARIA_CTX_rounds(CTX);
+ jne .Laria_gfni_256;
+ aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11,
+ %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 13, 14);
+ jmp .Laria_gfni_end;
+.Laria_gfni_256:
+ aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11,
+ %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 13);
+ aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
+ %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 14);
+ aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11,
+ %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 15, 16);
+.Laria_gfni_end:
+ debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
+ %ymm9, %ymm13, %ymm0, %ymm5,
+ %ymm10, %ymm14, %ymm3, %ymm6,
+ %ymm11, %ymm15, %ymm2, %ymm7,
+ (%rax), (%r8));
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(__aria_aesni_avx2_gfni_crypt_32way)
+
+SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_encrypt_32way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ FRAME_BEGIN
+
+ leaq ARIA_CTX_enc_key(CTX), %r9;
+
+ inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rdx);
+
+ call __aria_aesni_avx2_gfni_crypt_32way;
+
+ write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(aria_aesni_avx2_gfni_encrypt_32way)
+
+SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_decrypt_32way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ FRAME_BEGIN
+
+ leaq ARIA_CTX_dec_key(CTX), %r9;
+
+ inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rdx);
+
+ call __aria_aesni_avx2_gfni_crypt_32way;
+
+ write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(aria_aesni_avx2_gfni_decrypt_32way)
+
+SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_ctr_crypt_32way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: keystream
+ * %r8: iv (big endian, 128bit)
+ */
+ FRAME_BEGIN
+
+ call __aria_aesni_avx2_ctr_gen_keystream_32way
+
+ leaq (%rsi), %r10;
+ leaq (%rdx), %r11;
+ leaq (%rcx), %rsi;
+ leaq (%rcx), %rdx;
+ leaq ARIA_CTX_enc_key(CTX), %r9;
+
+ call __aria_aesni_avx2_gfni_crypt_32way;
+
+ vpxor (0 * 32)(%r11), %ymm1, %ymm1;
+ vpxor (1 * 32)(%r11), %ymm0, %ymm0;
+ vpxor (2 * 32)(%r11), %ymm3, %ymm3;
+ vpxor (3 * 32)(%r11), %ymm2, %ymm2;
+ vpxor (4 * 32)(%r11), %ymm4, %ymm4;
+ vpxor (5 * 32)(%r11), %ymm5, %ymm5;
+ vpxor (6 * 32)(%r11), %ymm6, %ymm6;
+ vpxor (7 * 32)(%r11), %ymm7, %ymm7;
+ vpxor (8 * 32)(%r11), %ymm8, %ymm8;
+ vpxor (9 * 32)(%r11), %ymm9, %ymm9;
+ vpxor (10 * 32)(%r11), %ymm10, %ymm10;
+ vpxor (11 * 32)(%r11), %ymm11, %ymm11;
+ vpxor (12 * 32)(%r11), %ymm12, %ymm12;
+ vpxor (13 * 32)(%r11), %ymm13, %ymm13;
+ vpxor (14 * 32)(%r11), %ymm14, %ymm14;
+ vpxor (15 * 32)(%r11), %ymm15, %ymm15;
+ write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %r10);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(aria_aesni_avx2_gfni_ctr_crypt_32way)
diff --git a/arch/x86/crypto/aria-avx.h b/arch/x86/crypto/aria-avx.h
new file mode 100644
index 000000000000..6e1b2d8a31ed
--- /dev/null
+++ b/arch/x86/crypto/aria-avx.h
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef ASM_X86_ARIA_AVX_H
+#define ASM_X86_ARIA_AVX_H
+
+#include <linux/types.h>
+
+#define ARIA_AESNI_PARALLEL_BLOCKS 16
+#define ARIA_AESNI_PARALLEL_BLOCK_SIZE (ARIA_BLOCK_SIZE * ARIA_AESNI_PARALLEL_BLOCKS)
+
+#define ARIA_AESNI_AVX2_PARALLEL_BLOCKS 32
+#define ARIA_AESNI_AVX2_PARALLEL_BLOCK_SIZE (ARIA_BLOCK_SIZE * ARIA_AESNI_AVX2_PARALLEL_BLOCKS)
+
+#define ARIA_GFNI_AVX512_PARALLEL_BLOCKS 64
+#define ARIA_GFNI_AVX512_PARALLEL_BLOCK_SIZE (ARIA_BLOCK_SIZE * ARIA_GFNI_AVX512_PARALLEL_BLOCKS)
+
+asmlinkage void aria_aesni_avx_encrypt_16way(const void *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void aria_aesni_avx_decrypt_16way(const void *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void aria_aesni_avx_ctr_crypt_16way(const void *ctx, u8 *dst,
+ const u8 *src,
+ u8 *keystream, u8 *iv);
+asmlinkage void aria_aesni_avx_gfni_encrypt_16way(const void *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void aria_aesni_avx_gfni_decrypt_16way(const void *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void aria_aesni_avx_gfni_ctr_crypt_16way(const void *ctx, u8 *dst,
+ const u8 *src,
+ u8 *keystream, u8 *iv);
+
+asmlinkage void aria_aesni_avx2_encrypt_32way(const void *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void aria_aesni_avx2_decrypt_32way(const void *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void aria_aesni_avx2_ctr_crypt_32way(const void *ctx, u8 *dst,
+ const u8 *src,
+ u8 *keystream, u8 *iv);
+asmlinkage void aria_aesni_avx2_gfni_encrypt_32way(const void *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void aria_aesni_avx2_gfni_decrypt_32way(const void *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void aria_aesni_avx2_gfni_ctr_crypt_32way(const void *ctx, u8 *dst,
+ const u8 *src,
+ u8 *keystream, u8 *iv);
+
+struct aria_avx_ops {
+ void (*aria_encrypt_16way)(const void *ctx, u8 *dst, const u8 *src);
+ void (*aria_decrypt_16way)(const void *ctx, u8 *dst, const u8 *src);
+ void (*aria_ctr_crypt_16way)(const void *ctx, u8 *dst, const u8 *src,
+ u8 *keystream, u8 *iv);
+ void (*aria_encrypt_32way)(const void *ctx, u8 *dst, const u8 *src);
+ void (*aria_decrypt_32way)(const void *ctx, u8 *dst, const u8 *src);
+ void (*aria_ctr_crypt_32way)(const void *ctx, u8 *dst, const u8 *src,
+ u8 *keystream, u8 *iv);
+ void (*aria_encrypt_64way)(const void *ctx, u8 *dst, const u8 *src);
+ void (*aria_decrypt_64way)(const void *ctx, u8 *dst, const u8 *src);
+ void (*aria_ctr_crypt_64way)(const void *ctx, u8 *dst, const u8 *src,
+ u8 *keystream, u8 *iv);
+
+
+};
+#endif
diff --git a/arch/x86/crypto/aria-gfni-avx512-asm_64.S b/arch/x86/crypto/aria-gfni-avx512-asm_64.S
new file mode 100644
index 000000000000..860887e5d02e
--- /dev/null
+++ b/arch/x86/crypto/aria-gfni-avx512-asm_64.S
@@ -0,0 +1,971 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * ARIA Cipher 64-way parallel algorithm (AVX512)
+ *
+ * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
+ *
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include <asm/asm-offsets.h>
+#include <linux/cfi_types.h>
+
+/* register macros */
+#define CTX %rdi
+
+
+#define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \
+ ( (((a0) & 1) << 0) | \
+ (((a1) & 1) << 1) | \
+ (((a2) & 1) << 2) | \
+ (((a3) & 1) << 3) | \
+ (((a4) & 1) << 4) | \
+ (((a5) & 1) << 5) | \
+ (((a6) & 1) << 6) | \
+ (((a7) & 1) << 7) )
+
+#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \
+ ( ((l7) << (0 * 8)) | \
+ ((l6) << (1 * 8)) | \
+ ((l5) << (2 * 8)) | \
+ ((l4) << (3 * 8)) | \
+ ((l3) << (4 * 8)) | \
+ ((l2) << (5 * 8)) | \
+ ((l1) << (6 * 8)) | \
+ ((l0) << (7 * 8)) )
+
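+/* 128-bit little-endian add per 128-bit lane using mask registers:
+ * vpaddq adds the low and high qwords independently, vpcmpuq ($1 =
+ * unsigned less-than) flags each qword whose sum wrapped, kaddb
+ * doubles the mask to move that flag from the low-qword bit to the
+ * high-qword bit of its lane, and the masked vpaddq adds hi_counter1
+ * only where a carry occurred.
+ */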
+#define add_le128(out, in, lo_counter, hi_counter1) \
+ vpaddq lo_counter, in, out; \
+ vpcmpuq $1, lo_counter, out, %k1; \
+ kaddb %k1, %k1, %k1; \
+ vpaddq hi_counter1, out, out{%k1};
+
+#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
+ vpandq x, mask4bit, tmp0; \
+	vpandnq x, mask4bit, x;				\
+ vpsrld $4, x, x; \
+ \
+ vpshufb tmp0, lo_t, tmp0; \
+ vpshufb x, hi_t, x; \
+ vpxorq tmp0, x, x;
+
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x1, x0, x0; \
+ \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x2; \
+ \
+ vpunpckhqdq t1, x0, x1; \
+ vpunpcklqdq t1, x0, x0; \
+ \
+ vpunpckhqdq x2, t2, x3; \
+ vpunpcklqdq x2, t2, x2;
+
+#define byteslice_16x16b(a0, b0, c0, d0, \
+ a1, b1, c1, d1, \
+ a2, b2, c2, d2, \
+ a3, b3, c3, d3, \
+ st0, st1) \
+ vmovdqu64 d2, st0; \
+ vmovdqu64 d3, st1; \
+ transpose_4x4(a0, a1, a2, a3, d2, d3); \
+ transpose_4x4(b0, b1, b2, b3, d2, d3); \
+ vmovdqu64 st0, d2; \
+ vmovdqu64 st1, d3; \
+ \
+ vmovdqu64 a0, st0; \
+ vmovdqu64 a1, st1; \
+ transpose_4x4(c0, c1, c2, c3, a0, a1); \
+ transpose_4x4(d0, d1, d2, d3, a0, a1); \
+ \
+ vbroadcasti64x2 .Lshufb_16x16b(%rip), a0; \
+ vmovdqu64 st1, a1; \
+ vpshufb a0, a2, a2; \
+ vpshufb a0, a3, a3; \
+ vpshufb a0, b0, b0; \
+ vpshufb a0, b1, b1; \
+ vpshufb a0, b2, b2; \
+ vpshufb a0, b3, b3; \
+ vpshufb a0, a1, a1; \
+ vpshufb a0, c0, c0; \
+ vpshufb a0, c1, c1; \
+ vpshufb a0, c2, c2; \
+ vpshufb a0, c3, c3; \
+ vpshufb a0, d0, d0; \
+ vpshufb a0, d1, d1; \
+ vpshufb a0, d2, d2; \
+ vpshufb a0, d3, d3; \
+ vmovdqu64 d3, st1; \
+ vmovdqu64 st0, d3; \
+ vpshufb a0, d3, a0; \
+ vmovdqu64 d2, st0; \
+ \
+ transpose_4x4(a0, b0, c0, d0, d2, d3); \
+ transpose_4x4(a1, b1, c1, d1, d2, d3); \
+ vmovdqu64 st0, d2; \
+ vmovdqu64 st1, d3; \
+ \
+ vmovdqu64 b0, st0; \
+ vmovdqu64 b1, st1; \
+ transpose_4x4(a2, b2, c2, d2, b0, b1); \
+ transpose_4x4(a3, b3, c3, d3, b0, b1); \
+ vmovdqu64 st0, b0; \
+ vmovdqu64 st1, b1; \
+ /* does not adjust output bytes inside vectors */
+
+#define debyteslice_16x16b(a0, b0, c0, d0, \
+ a1, b1, c1, d1, \
+ a2, b2, c2, d2, \
+ a3, b3, c3, d3, \
+ st0, st1) \
+ vmovdqu64 d2, st0; \
+ vmovdqu64 d3, st1; \
+ transpose_4x4(a0, a1, a2, a3, d2, d3); \
+ transpose_4x4(b0, b1, b2, b3, d2, d3); \
+ vmovdqu64 st0, d2; \
+ vmovdqu64 st1, d3; \
+ \
+ vmovdqu64 a0, st0; \
+ vmovdqu64 a1, st1; \
+ transpose_4x4(c0, c1, c2, c3, a0, a1); \
+ transpose_4x4(d0, d1, d2, d3, a0, a1); \
+ \
+ vbroadcasti64x2 .Lshufb_16x16b(%rip), a0; \
+ vmovdqu64 st1, a1; \
+ vpshufb a0, a2, a2; \
+ vpshufb a0, a3, a3; \
+ vpshufb a0, b0, b0; \
+ vpshufb a0, b1, b1; \
+ vpshufb a0, b2, b2; \
+ vpshufb a0, b3, b3; \
+ vpshufb a0, a1, a1; \
+ vpshufb a0, c0, c0; \
+ vpshufb a0, c1, c1; \
+ vpshufb a0, c2, c2; \
+ vpshufb a0, c3, c3; \
+ vpshufb a0, d0, d0; \
+ vpshufb a0, d1, d1; \
+ vpshufb a0, d2, d2; \
+ vpshufb a0, d3, d3; \
+ vmovdqu64 d3, st1; \
+ vmovdqu64 st0, d3; \
+ vpshufb a0, d3, a0; \
+ vmovdqu64 d2, st0; \
+ \
+ transpose_4x4(c0, d0, a0, b0, d2, d3); \
+ transpose_4x4(c1, d1, a1, b1, d2, d3); \
+ vmovdqu64 st0, d2; \
+ vmovdqu64 st1, d3; \
+ \
+ vmovdqu64 b0, st0; \
+ vmovdqu64 b1, st1; \
+ transpose_4x4(c2, d2, a2, b2, b0, b1); \
+ transpose_4x4(c3, d3, a3, b3, b0, b1); \
+ vmovdqu64 st0, b0; \
+ vmovdqu64 st1, b1; \
+ /* does not adjust output bytes inside vectors */
+
+/* load blocks to registers and apply pre-whitening */
+#define inpack16_pre(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ rio) \
+ vmovdqu64 (0 * 64)(rio), x0; \
+ vmovdqu64 (1 * 64)(rio), x1; \
+ vmovdqu64 (2 * 64)(rio), x2; \
+ vmovdqu64 (3 * 64)(rio), x3; \
+ vmovdqu64 (4 * 64)(rio), x4; \
+ vmovdqu64 (5 * 64)(rio), x5; \
+ vmovdqu64 (6 * 64)(rio), x6; \
+ vmovdqu64 (7 * 64)(rio), x7; \
+ vmovdqu64 (8 * 64)(rio), y0; \
+ vmovdqu64 (9 * 64)(rio), y1; \
+ vmovdqu64 (10 * 64)(rio), y2; \
+ vmovdqu64 (11 * 64)(rio), y3; \
+ vmovdqu64 (12 * 64)(rio), y4; \
+ vmovdqu64 (13 * 64)(rio), y5; \
+ vmovdqu64 (14 * 64)(rio), y6; \
+ vmovdqu64 (15 * 64)(rio), y7;
+
+/* byteslice pre-whitened blocks and store to temporary memory */
+#define inpack16_post(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_ab, mem_cd) \
+ byteslice_16x16b(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ (mem_ab), (mem_cd)); \
+ \
+ vmovdqu64 x0, 0 * 64(mem_ab); \
+ vmovdqu64 x1, 1 * 64(mem_ab); \
+ vmovdqu64 x2, 2 * 64(mem_ab); \
+ vmovdqu64 x3, 3 * 64(mem_ab); \
+ vmovdqu64 x4, 4 * 64(mem_ab); \
+ vmovdqu64 x5, 5 * 64(mem_ab); \
+ vmovdqu64 x6, 6 * 64(mem_ab); \
+ vmovdqu64 x7, 7 * 64(mem_ab); \
+ vmovdqu64 y0, 0 * 64(mem_cd); \
+ vmovdqu64 y1, 1 * 64(mem_cd); \
+ vmovdqu64 y2, 2 * 64(mem_cd); \
+ vmovdqu64 y3, 3 * 64(mem_cd); \
+ vmovdqu64 y4, 4 * 64(mem_cd); \
+ vmovdqu64 y5, 5 * 64(mem_cd); \
+ vmovdqu64 y6, 6 * 64(mem_cd); \
+ vmovdqu64 y7, 7 * 64(mem_cd);
+
+#define write_output(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem) \
+ vmovdqu64 x0, 0 * 64(mem); \
+ vmovdqu64 x1, 1 * 64(mem); \
+ vmovdqu64 x2, 2 * 64(mem); \
+ vmovdqu64 x3, 3 * 64(mem); \
+ vmovdqu64 x4, 4 * 64(mem); \
+ vmovdqu64 x5, 5 * 64(mem); \
+ vmovdqu64 x6, 6 * 64(mem); \
+ vmovdqu64 x7, 7 * 64(mem); \
+ vmovdqu64 y0, 8 * 64(mem); \
+ vmovdqu64 y1, 9 * 64(mem); \
+ vmovdqu64 y2, 10 * 64(mem); \
+ vmovdqu64 y3, 11 * 64(mem); \
+ vmovdqu64 y4, 12 * 64(mem); \
+ vmovdqu64 y5, 13 * 64(mem); \
+ vmovdqu64 y6, 14 * 64(mem); \
+ vmovdqu64 y7, 15 * 64(mem); \
+
+#define aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, idx) \
+ vmovdqu64 x0, ((idx + 0) * 64)(mem_tmp); \
+ vmovdqu64 x1, ((idx + 1) * 64)(mem_tmp); \
+ vmovdqu64 x2, ((idx + 2) * 64)(mem_tmp); \
+ vmovdqu64 x3, ((idx + 3) * 64)(mem_tmp); \
+ vmovdqu64 x4, ((idx + 4) * 64)(mem_tmp); \
+ vmovdqu64 x5, ((idx + 5) * 64)(mem_tmp); \
+ vmovdqu64 x6, ((idx + 6) * 64)(mem_tmp); \
+ vmovdqu64 x7, ((idx + 7) * 64)(mem_tmp);
+
+#define aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, idx) \
+ vmovdqu64 ((idx + 0) * 64)(mem_tmp), x0; \
+ vmovdqu64 ((idx + 1) * 64)(mem_tmp), x1; \
+ vmovdqu64 ((idx + 2) * 64)(mem_tmp), x2; \
+ vmovdqu64 ((idx + 3) * 64)(mem_tmp), x3; \
+ vmovdqu64 ((idx + 4) * 64)(mem_tmp), x4; \
+ vmovdqu64 ((idx + 5) * 64)(mem_tmp), x5; \
+ vmovdqu64 ((idx + 6) * 64)(mem_tmp), x6; \
+ vmovdqu64 ((idx + 7) * 64)(mem_tmp), x7;
+
+#define aria_ark_16way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ t0, rk, round) \
+ /* AddRoundKey */ \
+ vpbroadcastb ((round * 16) + 3)(rk), t0; \
+ vpxorq t0, x0, x0; \
+ vpbroadcastb ((round * 16) + 2)(rk), t0; \
+ vpxorq t0, x1, x1; \
+ vpbroadcastb ((round * 16) + 1)(rk), t0; \
+ vpxorq t0, x2, x2; \
+ vpbroadcastb ((round * 16) + 0)(rk), t0; \
+ vpxorq t0, x3, x3; \
+ vpbroadcastb ((round * 16) + 7)(rk), t0; \
+ vpxorq t0, x4, x4; \
+ vpbroadcastb ((round * 16) + 6)(rk), t0; \
+ vpxorq t0, x5, x5; \
+ vpbroadcastb ((round * 16) + 5)(rk), t0; \
+ vpxorq t0, x6, x6; \
+ vpbroadcastb ((round * 16) + 4)(rk), t0; \
+ vpxorq t0, x7, x7; \
+ vpbroadcastb ((round * 16) + 11)(rk), t0; \
+ vpxorq t0, y0, y0; \
+ vpbroadcastb ((round * 16) + 10)(rk), t0; \
+ vpxorq t0, y1, y1; \
+ vpbroadcastb ((round * 16) + 9)(rk), t0; \
+ vpxorq t0, y2, y2; \
+ vpbroadcastb ((round * 16) + 8)(rk), t0; \
+ vpxorq t0, y3, y3; \
+ vpbroadcastb ((round * 16) + 15)(rk), t0; \
+ vpxorq t0, y4, y4; \
+ vpbroadcastb ((round * 16) + 14)(rk), t0; \
+ vpxorq t0, y5, y5; \
+ vpbroadcastb ((round * 16) + 13)(rk), t0; \
+ vpxorq t0, y6, y6; \
+ vpbroadcastb ((round * 16) + 12)(rk), t0; \
+ vpxorq t0, y7, y7;
+
+#define aria_sbox_8way_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ t0, t1, t2, t3, \
+ t4, t5, t6, t7) \
+ vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0; \
+ vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1; \
+ vpbroadcastq .Ltf_id_bitmatrix(%rip), t2; \
+ vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3; \
+ vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4; \
+ vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
+ vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
+ vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
+ vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \
+ vgf2p8affineinvqb $0, t2, x2, x2; \
+ vgf2p8affineinvqb $0, t2, x6, x6; \
+ vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \
+ vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \
+ vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \
+ vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \
+ vgf2p8affineinvqb $0, t2, x3, x3; \
+ vgf2p8affineinvqb $0, t2, x7, x7;
+
+#define aria_sbox_16way_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ t0, t1, t2, t3, \
+ t4, t5, t6, t7) \
+ vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0; \
+ vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1; \
+ vpbroadcastq .Ltf_id_bitmatrix(%rip), t2; \
+ vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3; \
+ vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4; \
+ vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
+ vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
+ vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
+ vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \
+ vgf2p8affineinvqb $0, t2, x2, x2; \
+ vgf2p8affineinvqb $0, t2, x6, x6; \
+ vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \
+ vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \
+ vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \
+ vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \
+ vgf2p8affineinvqb $0, t2, x3, x3; \
+ vgf2p8affineinvqb $0, t2, x7, x7; \
+ vgf2p8affineinvqb $(tf_s2_const), t0, y1, y1; \
+ vgf2p8affineinvqb $(tf_s2_const), t0, y5, y5; \
+ vgf2p8affineqb $(tf_inv_const), t1, y2, y2; \
+ vgf2p8affineqb $(tf_inv_const), t1, y6, y6; \
+ vgf2p8affineinvqb $0, t2, y2, y2; \
+ vgf2p8affineinvqb $0, t2, y6, y6; \
+ vgf2p8affineinvqb $(tf_aff_const), t3, y0, y0; \
+ vgf2p8affineinvqb $(tf_aff_const), t3, y4, y4; \
+ vgf2p8affineqb $(tf_x2_const), t4, y3, y3; \
+ vgf2p8affineqb $(tf_x2_const), t4, y7, y7; \
+ vgf2p8affineinvqb $0, t2, y3, y3; \
+ vgf2p8affineinvqb $0, t2, y7, y7;
+
+
+#define aria_diff_m(x0, x1, x2, x3, \
+ t0, t1, t2, t3) \
+ /* T = rotr32(X, 8); */ \
+ /* X ^= T */ \
+ vpxorq x0, x3, t0; \
+ vpxorq x1, x0, t1; \
+ vpxorq x2, x1, t2; \
+ vpxorq x3, x2, t3; \
+ /* X = T ^ rotr(X, 16); */ \
+ vpxorq t2, x0, x0; \
+ vpxorq x1, t3, t3; \
+ vpxorq t0, x2, x2; \
+ vpxorq t1, x3, x1; \
+ vmovdqu64 t3, x3;
+
+#define aria_diff_word(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7) \
+ /* t1 ^= t2; */ \
+ vpxorq y0, x4, x4; \
+ vpxorq y1, x5, x5; \
+ vpxorq y2, x6, x6; \
+ vpxorq y3, x7, x7; \
+ \
+ /* t2 ^= t3; */ \
+ vpxorq y4, y0, y0; \
+ vpxorq y5, y1, y1; \
+ vpxorq y6, y2, y2; \
+ vpxorq y7, y3, y3; \
+ \
+ /* t0 ^= t1; */ \
+ vpxorq x4, x0, x0; \
+ vpxorq x5, x1, x1; \
+ vpxorq x6, x2, x2; \
+ vpxorq x7, x3, x3; \
+ \
+ /* t3 ^= t1; */ \
+ vpxorq x4, y4, y4; \
+ vpxorq x5, y5, y5; \
+ vpxorq x6, y6, y6; \
+ vpxorq x7, y7, y7; \
+ \
+ /* t2 ^= t0; */ \
+ vpxorq x0, y0, y0; \
+ vpxorq x1, y1, y1; \
+ vpxorq x2, y2, y2; \
+ vpxorq x3, y3, y3; \
+ \
+ /* t1 ^= t2; */ \
+ vpxorq y0, x4, x4; \
+ vpxorq y1, x5, x5; \
+ vpxorq y2, x6, x6; \
+ vpxorq y3, x7, x7;
+
+#define aria_fe_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ z0, z1, z2, z3, \
+ z4, z5, z6, z7, \
+ mem_tmp, rk, round) \
+ aria_ark_16way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, y1, y2, y3, y4, y5, y6, y7, \
+ z0, rk, round); \
+ \
+ aria_sbox_16way_gfni(x2, x3, x0, x1, \
+ x6, x7, x4, x5, \
+ y2, y3, y0, y1, \
+ y6, y7, y4, y5, \
+ z0, z1, z2, z3, \
+ z4, z5, z6, z7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, z0, z1, z2, z3); \
+ aria_diff_m(x4, x5, x6, x7, z0, z1, z2, z3); \
+ aria_diff_m(y0, y1, y2, y3, z0, z1, z2, z3); \
+ aria_diff_m(y4, y5, y6, y7, z0, z1, z2, z3); \
+ aria_diff_word(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ /* aria_diff_byte() \
+ * T3 = ABCD -> BADC \
+ * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
+ * T0 = ABCD -> CDAB \
+ * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
+ * T1 = ABCD -> DCBA \
+ * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
+ */ \
+ aria_diff_word(x2, x3, x0, x1, \
+ x7, x6, x5, x4, \
+ y0, y1, y2, y3, \
+ y5, y4, y7, y6); \
+
+
+#define aria_fo_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ z0, z1, z2, z3, \
+ z4, z5, z6, z7, \
+ mem_tmp, rk, round) \
+ aria_ark_16way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, y1, y2, y3, y4, y5, y6, y7, \
+ z0, rk, round); \
+ \
+ aria_sbox_16way_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ z0, z1, z2, z3, \
+ z4, z5, z6, z7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, z0, z1, z2, z3); \
+ aria_diff_m(x4, x5, x6, x7, z0, z1, z2, z3); \
+ aria_diff_m(y0, y1, y2, y3, z0, z1, z2, z3); \
+ aria_diff_m(y4, y5, y6, y7, z0, z1, z2, z3); \
+ aria_diff_word(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ /* aria_diff_byte() \
+ * T1 = ABCD -> BADC \
+ * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
+ * T2 = ABCD -> CDAB \
+ * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
+ * T3 = ABCD -> DCBA \
+ * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
+ */ \
+ aria_diff_word(x0, x1, x2, x3, \
+ x5, x4, x7, x6, \
+ y2, y3, y0, y1, \
+ y7, y6, y5, y4);
+
+#define aria_ff_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ z0, z1, z2, z3, \
+ z4, z5, z6, z7, \
+ mem_tmp, rk, round, last_round) \
+ aria_ark_16way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ z0, rk, round); \
+ aria_sbox_16way_gfni(x2, x3, x0, x1, \
+ x6, x7, x4, x5, \
+ y2, y3, y0, y1, \
+ y6, y7, y4, y5, \
+ z0, z1, z2, z3, \
+ z4, z5, z6, z7); \
+ aria_ark_16way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ z0, rk, last_round);
+
+
+.section .rodata.cst64, "aM", @progbits, 64
+.align 64
+.Lcounter0123_lo:
+ .quad 0, 0
+ .quad 1, 0
+ .quad 2, 0
+ .quad 3, 0
+
+.section .rodata.cst32.shufb_16x16b, "aM", @progbits, 32
+.align 32
+#define SHUFB_BYTES(idx) \
+ 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
+.Lshufb_16x16b:
+ .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+ .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+
+.section .rodata.cst16, "aM", @progbits, 16
+.align 16
+
+.Lcounter4444_lo:
+ .quad 4, 0
+.Lcounter8888_lo:
+ .quad 8, 0
+.Lcounter16161616_lo:
+ .quad 16, 0
+.Lcounter1111_hi:
+ .quad 0, 1
+
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+ .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
+ .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
+
+.section .rodata.cst8, "aM", @progbits, 8
+.align 8
+/* AES affine: */
+#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
+.Ltf_aff_bitmatrix:
+ .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
+ BV8(1, 1, 0, 0, 0, 1, 1, 1),
+ BV8(1, 1, 1, 0, 0, 0, 1, 1),
+ BV8(1, 1, 1, 1, 0, 0, 0, 1),
+ BV8(1, 1, 1, 1, 1, 0, 0, 0),
+ BV8(0, 1, 1, 1, 1, 1, 0, 0),
+ BV8(0, 0, 1, 1, 1, 1, 1, 0),
+ BV8(0, 0, 0, 1, 1, 1, 1, 1))
+
+/* AES inverse affine: */
+#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
+.Ltf_inv_bitmatrix:
+ .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
+ BV8(1, 0, 0, 1, 0, 0, 1, 0),
+ BV8(0, 1, 0, 0, 1, 0, 0, 1),
+ BV8(1, 0, 1, 0, 0, 1, 0, 0),
+ BV8(0, 1, 0, 1, 0, 0, 1, 0),
+ BV8(0, 0, 1, 0, 1, 0, 0, 1),
+ BV8(1, 0, 0, 1, 0, 1, 0, 0),
+ BV8(0, 1, 0, 0, 1, 0, 1, 0))
+
+/* S2: */
+#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
+.Ltf_s2_bitmatrix:
+ .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
+ BV8(0, 0, 1, 1, 1, 1, 1, 1),
+ BV8(1, 1, 1, 0, 1, 1, 0, 1),
+ BV8(1, 1, 0, 0, 0, 0, 1, 1),
+ BV8(0, 1, 0, 0, 0, 0, 1, 1),
+ BV8(1, 1, 0, 0, 1, 1, 1, 0),
+ BV8(0, 1, 1, 0, 0, 0, 1, 1),
+ BV8(1, 1, 1, 1, 0, 1, 1, 0))
+
+/* X2: */
+#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
+.Ltf_x2_bitmatrix:
+ .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
+ BV8(0, 0, 1, 0, 0, 1, 1, 0),
+ BV8(0, 0, 0, 0, 1, 0, 1, 0),
+ BV8(1, 1, 1, 0, 0, 0, 1, 1),
+ BV8(1, 1, 1, 0, 1, 1, 0, 0),
+ BV8(0, 1, 1, 0, 1, 0, 1, 1),
+ BV8(1, 0, 1, 1, 1, 1, 0, 1),
+ BV8(1, 0, 0, 1, 0, 0, 1, 1))
+
+/* Identity matrix: */
+.Ltf_id_bitmatrix:
+ .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
+ BV8(0, 1, 0, 0, 0, 0, 0, 0),
+ BV8(0, 0, 1, 0, 0, 0, 0, 0),
+ BV8(0, 0, 0, 1, 0, 0, 0, 0),
+ BV8(0, 0, 0, 0, 1, 0, 0, 0),
+ BV8(0, 0, 0, 0, 0, 1, 0, 0),
+ BV8(0, 0, 0, 0, 0, 0, 1, 0),
+ BV8(0, 0, 0, 0, 0, 0, 0, 1))
+
+.text
+SYM_FUNC_START_LOCAL(__aria_gfni_avx512_crypt_64way)
+ /* input:
+ * %r9: rk
+ * %rsi: dst
+ * %rdx: src
+ * %zmm0..%zmm15: byte-sliced blocks
+ */
+
+ FRAME_BEGIN
+
+ movq %rsi, %rax;
+ leaq 8 * 64(%rax), %r8;
+
+ inpack16_post(%zmm0, %zmm1, %zmm2, %zmm3,
+ %zmm4, %zmm5, %zmm6, %zmm7,
+ %zmm8, %zmm9, %zmm10, %zmm11,
+ %zmm12, %zmm13, %zmm14,
+ %zmm15, %rax, %r8);
+ aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
+ %zmm4, %zmm5, %zmm6, %zmm7,
+ %zmm8, %zmm9, %zmm10, %zmm11,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 0);
+ aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
+ %zmm6, %zmm7, %zmm4, %zmm5,
+ %zmm9, %zmm8, %zmm11, %zmm10,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 1);
+ aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
+ %zmm4, %zmm5, %zmm6, %zmm7,
+ %zmm8, %zmm9, %zmm10, %zmm11,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 2);
+ aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
+ %zmm6, %zmm7, %zmm4, %zmm5,
+ %zmm9, %zmm8, %zmm11, %zmm10,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 3);
+ aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
+ %zmm4, %zmm5, %zmm6, %zmm7,
+ %zmm8, %zmm9, %zmm10, %zmm11,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 4);
+ aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
+ %zmm6, %zmm7, %zmm4, %zmm5,
+ %zmm9, %zmm8, %zmm11, %zmm10,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 5);
+ aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
+ %zmm4, %zmm5, %zmm6, %zmm7,
+ %zmm8, %zmm9, %zmm10, %zmm11,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 6);
+ aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
+ %zmm6, %zmm7, %zmm4, %zmm5,
+ %zmm9, %zmm8, %zmm11, %zmm10,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 7);
+ aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
+ %zmm4, %zmm5, %zmm6, %zmm7,
+ %zmm8, %zmm9, %zmm10, %zmm11,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 8);
+ aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
+ %zmm6, %zmm7, %zmm4, %zmm5,
+ %zmm9, %zmm8, %zmm11, %zmm10,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 9);
+ aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
+ %zmm4, %zmm5, %zmm6, %zmm7,
+ %zmm8, %zmm9, %zmm10, %zmm11,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 10);
+ cmpl $12, ARIA_CTX_rounds(CTX);
+ jne .Laria_gfni_192;
+ aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
+ %zmm6, %zmm7, %zmm4, %zmm5,
+ %zmm9, %zmm8, %zmm11, %zmm10,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 11, 12);
+ jmp .Laria_gfni_end;
+.Laria_gfni_192:
+ aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
+ %zmm6, %zmm7, %zmm4, %zmm5,
+ %zmm9, %zmm8, %zmm11, %zmm10,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 11);
+ aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
+ %zmm4, %zmm5, %zmm6, %zmm7,
+ %zmm8, %zmm9, %zmm10, %zmm11,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 12);
+ cmpl $14, ARIA_CTX_rounds(CTX);
+ jne .Laria_gfni_256;
+ aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
+ %zmm6, %zmm7, %zmm4, %zmm5,
+ %zmm9, %zmm8, %zmm11, %zmm10,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 13, 14);
+ jmp .Laria_gfni_end;
+.Laria_gfni_256:
+ aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
+ %zmm6, %zmm7, %zmm4, %zmm5,
+ %zmm9, %zmm8, %zmm11, %zmm10,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 13);
+ aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
+ %zmm4, %zmm5, %zmm6, %zmm7,
+ %zmm8, %zmm9, %zmm10, %zmm11,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 14);
+ aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
+ %zmm6, %zmm7, %zmm4, %zmm5,
+ %zmm9, %zmm8, %zmm11, %zmm10,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 15, 16);
+.Laria_gfni_end:
+ debyteslice_16x16b(%zmm9, %zmm12, %zmm3, %zmm6,
+ %zmm8, %zmm13, %zmm2, %zmm7,
+ %zmm11, %zmm14, %zmm1, %zmm4,
+ %zmm10, %zmm15, %zmm0, %zmm5,
+ (%rax), (%r8));
+ FRAME_END
+ RET;
+SYM_FUNC_END(__aria_gfni_avx512_crypt_64way)
+
+SYM_TYPED_FUNC_START(aria_gfni_avx512_encrypt_64way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ FRAME_BEGIN
+
+ leaq ARIA_CTX_enc_key(CTX), %r9;
+
+ inpack16_pre(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
+ %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,
+ %zmm15, %rdx);
+
+ call __aria_gfni_avx512_crypt_64way;
+
+ write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5,
+ %zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14,
+ %zmm15, %rax);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(aria_gfni_avx512_encrypt_64way)
+
+SYM_TYPED_FUNC_START(aria_gfni_avx512_decrypt_64way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ FRAME_BEGIN
+
+ leaq ARIA_CTX_dec_key(CTX), %r9;
+
+ inpack16_pre(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
+ %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,
+ %zmm15, %rdx);
+
+ call __aria_gfni_avx512_crypt_64way;
+
+ write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5,
+ %zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14,
+ %zmm15, %rax);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(aria_gfni_avx512_decrypt_64way)
+
+SYM_FUNC_START_LOCAL(__aria_gfni_avx512_ctr_gen_keystream_64way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: keystream
+ * %r8: iv (big endian, 128bit)
+ */
+
+ FRAME_BEGIN
+
+ vbroadcasti64x2 .Lbswap128_mask (%rip), %zmm19;
+ vmovdqa64 .Lcounter0123_lo (%rip), %zmm21;
+ vbroadcasti64x2 .Lcounter4444_lo (%rip), %zmm22;
+ vbroadcasti64x2 .Lcounter8888_lo (%rip), %zmm23;
+ vbroadcasti64x2 .Lcounter16161616_lo (%rip), %zmm24;
+ vbroadcasti64x2 .Lcounter1111_hi (%rip), %zmm25;
+
+ /* load IV and byteswap */
+ movq 8(%r8), %r11;
+ movq (%r8), %r10;
+ bswapq %r11;
+ bswapq %r10;
+ vbroadcasti64x2 (%r8), %zmm20;
+ vpshufb %zmm19, %zmm20, %zmm20;
+
+ /* check need for handling 64-bit overflow and carry */
+ cmpq $(0xffffffffffffffff - 64), %r11;
+ ja .Lload_ctr_carry;
+
+ /* construct IVs */
+ vpaddq %zmm21, %zmm20, %zmm0; /* +0:+1:+2:+3 */
+ vpaddq %zmm22, %zmm0, %zmm1; /* +4:+5:+6:+7 */
+ vpaddq %zmm23, %zmm0, %zmm2; /* +8:+9:+10:+11 */
+ vpaddq %zmm23, %zmm1, %zmm3; /* +12:+13:+14:+15 */
+ vpaddq %zmm24, %zmm0, %zmm4; /* +16... */
+ vpaddq %zmm24, %zmm1, %zmm5; /* +20... */
+ vpaddq %zmm24, %zmm2, %zmm6; /* +24... */
+ vpaddq %zmm24, %zmm3, %zmm7; /* +28... */
+ vpaddq %zmm24, %zmm4, %zmm8; /* +32... */
+ vpaddq %zmm24, %zmm5, %zmm9; /* +36... */
+ vpaddq %zmm24, %zmm6, %zmm10; /* +40... */
+ vpaddq %zmm24, %zmm7, %zmm11; /* +44... */
+ vpaddq %zmm24, %zmm8, %zmm12; /* +48... */
+ vpaddq %zmm24, %zmm9, %zmm13; /* +52... */
+ vpaddq %zmm24, %zmm10, %zmm14; /* +56... */
+ vpaddq %zmm24, %zmm11, %zmm15; /* +60... */
+ jmp .Lload_ctr_done;
+
+.Lload_ctr_carry:
+ /* construct IVs */
+ add_le128(%zmm0, %zmm20, %zmm21, %zmm25); /* +0:+1:+2:+3 */
+ add_le128(%zmm1, %zmm0, %zmm22, %zmm25); /* +4:+5:+6:+7 */
+ add_le128(%zmm2, %zmm0, %zmm23, %zmm25); /* +8:+9:+10:+11 */
+ add_le128(%zmm3, %zmm1, %zmm23, %zmm25); /* +12:+13:+14:+15 */
+ add_le128(%zmm4, %zmm0, %zmm24, %zmm25); /* +16... */
+ add_le128(%zmm5, %zmm1, %zmm24, %zmm25); /* +20... */
+ add_le128(%zmm6, %zmm2, %zmm24, %zmm25); /* +24... */
+ add_le128(%zmm7, %zmm3, %zmm24, %zmm25); /* +28... */
+ add_le128(%zmm8, %zmm4, %zmm24, %zmm25); /* +32... */
+ add_le128(%zmm9, %zmm5, %zmm24, %zmm25); /* +36... */
+ add_le128(%zmm10, %zmm6, %zmm24, %zmm25); /* +40... */
+ add_le128(%zmm11, %zmm7, %zmm24, %zmm25); /* +44... */
+ add_le128(%zmm12, %zmm8, %zmm24, %zmm25); /* +48... */
+ add_le128(%zmm13, %zmm9, %zmm24, %zmm25); /* +52... */
+ add_le128(%zmm14, %zmm10, %zmm24, %zmm25); /* +56... */
+ add_le128(%zmm15, %zmm11, %zmm24, %zmm25); /* +60... */
+
+.Lload_ctr_done:
+ /* Byte-swap IVs and update counter. */
+ addq $64, %r11;
+ adcq $0, %r10;
+ vpshufb %zmm19, %zmm15, %zmm15;
+ vpshufb %zmm19, %zmm14, %zmm14;
+ vpshufb %zmm19, %zmm13, %zmm13;
+ vpshufb %zmm19, %zmm12, %zmm12;
+ vpshufb %zmm19, %zmm11, %zmm11;
+ vpshufb %zmm19, %zmm10, %zmm10;
+ vpshufb %zmm19, %zmm9, %zmm9;
+ vpshufb %zmm19, %zmm8, %zmm8;
+ bswapq %r11;
+ bswapq %r10;
+ vpshufb %zmm19, %zmm7, %zmm7;
+ vpshufb %zmm19, %zmm6, %zmm6;
+ vpshufb %zmm19, %zmm5, %zmm5;
+ vpshufb %zmm19, %zmm4, %zmm4;
+ vpshufb %zmm19, %zmm3, %zmm3;
+ vpshufb %zmm19, %zmm2, %zmm2;
+ vpshufb %zmm19, %zmm1, %zmm1;
+ vpshufb %zmm19, %zmm0, %zmm0;
+ movq %r11, 8(%r8);
+ movq %r10, (%r8);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(__aria_gfni_avx512_ctr_gen_keystream_64way)
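For reference, the counter construction above can be modelled in plain C roughly as follows (an illustrative sketch, not part of the patch; get_unaligned_be64()/put_unaligned_be64() are the kernel's unaligned big-endian accessors). The IV is treated as a 128-bit big-endian counter, the 64 per-block values IV+0..IV+63 become the keystream inputs, and IV+64 is written back; the assembly's fast path adds only the low 64 bits per block and takes the carry-propagating path when the low half could wrap within the next 64 increments.

	/* Illustrative model only -- not part of the patch. */
	#include <linux/types.h>
	#include <asm/unaligned.h>

	static void ctr64_blocks_model(u8 iv[16], u8 blocks[64][16])
	{
		u64 hi = get_unaligned_be64(iv);
		u64 lo = get_unaligned_be64(iv + 8);
		int i;

		for (i = 0; i < 64; i++) {
			u64 blk_lo = lo + i;
			u64 blk_hi = hi + (blk_lo < lo);  /* carry into the high half */

			put_unaligned_be64(blk_hi, blocks[i]);
			put_unaligned_be64(blk_lo, blocks[i] + 8);
		}

		lo += 64;
		hi += (lo < 64);	/* carry out of the low half */
		put_unaligned_be64(hi, iv);
		put_unaligned_be64(lo, iv + 8);
	}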
+
+SYM_TYPED_FUNC_START(aria_gfni_avx512_ctr_crypt_64way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: keystream
+ * %r8: iv (big endian, 128bit)
+ */
+ FRAME_BEGIN
+
+ call __aria_gfni_avx512_ctr_gen_keystream_64way
+
+ leaq (%rsi), %r10;
+ leaq (%rdx), %r11;
+ leaq (%rcx), %rsi;
+ leaq (%rcx), %rdx;
+ leaq ARIA_CTX_enc_key(CTX), %r9;
+
+ call __aria_gfni_avx512_crypt_64way;
+
+ vpxorq (0 * 64)(%r11), %zmm3, %zmm3;
+ vpxorq (1 * 64)(%r11), %zmm2, %zmm2;
+ vpxorq (2 * 64)(%r11), %zmm1, %zmm1;
+ vpxorq (3 * 64)(%r11), %zmm0, %zmm0;
+ vpxorq (4 * 64)(%r11), %zmm6, %zmm6;
+ vpxorq (5 * 64)(%r11), %zmm7, %zmm7;
+ vpxorq (6 * 64)(%r11), %zmm4, %zmm4;
+ vpxorq (7 * 64)(%r11), %zmm5, %zmm5;
+ vpxorq (8 * 64)(%r11), %zmm9, %zmm9;
+ vpxorq (9 * 64)(%r11), %zmm8, %zmm8;
+ vpxorq (10 * 64)(%r11), %zmm11, %zmm11;
+ vpxorq (11 * 64)(%r11), %zmm10, %zmm10;
+ vpxorq (12 * 64)(%r11), %zmm12, %zmm12;
+ vpxorq (13 * 64)(%r11), %zmm13, %zmm13;
+ vpxorq (14 * 64)(%r11), %zmm14, %zmm14;
+ vpxorq (15 * 64)(%r11), %zmm15, %zmm15;
+ write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5,
+ %zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14,
+ %zmm15, %r10);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(aria_gfni_avx512_ctr_crypt_64way)
diff --git a/arch/x86/crypto/aria_aesni_avx2_glue.c b/arch/x86/crypto/aria_aesni_avx2_glue.c
new file mode 100644
index 000000000000..1487a49bfbac
--- /dev/null
+++ b/arch/x86/crypto/aria_aesni_avx2_glue.c
@@ -0,0 +1,245 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Glue Code for the AVX2/AES-NI/GFNI assembler implementation of the ARIA Cipher
+ *
+ * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/aria.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <linux/export.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+#include "ecb_cbc_helpers.h"
+#include "aria-avx.h"
+
+asmlinkage void aria_aesni_avx2_encrypt_32way(const void *ctx, u8 *dst,
+ const u8 *src);
+EXPORT_SYMBOL_GPL(aria_aesni_avx2_encrypt_32way);
+asmlinkage void aria_aesni_avx2_decrypt_32way(const void *ctx, u8 *dst,
+ const u8 *src);
+EXPORT_SYMBOL_GPL(aria_aesni_avx2_decrypt_32way);
+asmlinkage void aria_aesni_avx2_ctr_crypt_32way(const void *ctx, u8 *dst,
+ const u8 *src,
+ u8 *keystream, u8 *iv);
+EXPORT_SYMBOL_GPL(aria_aesni_avx2_ctr_crypt_32way);
+asmlinkage void aria_aesni_avx2_gfni_encrypt_32way(const void *ctx, u8 *dst,
+ const u8 *src);
+EXPORT_SYMBOL_GPL(aria_aesni_avx2_gfni_encrypt_32way);
+asmlinkage void aria_aesni_avx2_gfni_decrypt_32way(const void *ctx, u8 *dst,
+ const u8 *src);
+EXPORT_SYMBOL_GPL(aria_aesni_avx2_gfni_decrypt_32way);
+asmlinkage void aria_aesni_avx2_gfni_ctr_crypt_32way(const void *ctx, u8 *dst,
+ const u8 *src,
+ u8 *keystream, u8 *iv);
+EXPORT_SYMBOL_GPL(aria_aesni_avx2_gfni_ctr_crypt_32way);
+
+static struct aria_avx_ops aria_ops;
+
+struct aria_avx2_request_ctx {
+ u8 keystream[ARIA_AESNI_AVX2_PARALLEL_BLOCK_SIZE];
+};
+
+static int ecb_do_encrypt(struct skcipher_request *req, const u32 *rkey)
+{
+ ECB_WALK_START(req, ARIA_BLOCK_SIZE, ARIA_AESNI_PARALLEL_BLOCKS);
+ ECB_BLOCK(ARIA_AESNI_AVX2_PARALLEL_BLOCKS, aria_ops.aria_encrypt_32way);
+ ECB_BLOCK(ARIA_AESNI_PARALLEL_BLOCKS, aria_ops.aria_encrypt_16way);
+ ECB_BLOCK(1, aria_encrypt);
+ ECB_WALK_END();
+}
+
+static int ecb_do_decrypt(struct skcipher_request *req, const u32 *rkey)
+{
+ ECB_WALK_START(req, ARIA_BLOCK_SIZE, ARIA_AESNI_PARALLEL_BLOCKS);
+ ECB_BLOCK(ARIA_AESNI_AVX2_PARALLEL_BLOCKS, aria_ops.aria_decrypt_32way);
+ ECB_BLOCK(ARIA_AESNI_PARALLEL_BLOCKS, aria_ops.aria_decrypt_16way);
+ ECB_BLOCK(1, aria_decrypt);
+ ECB_WALK_END();
+}
+
+static int aria_avx2_ecb_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct aria_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return ecb_do_encrypt(req, ctx->enc_key[0]);
+}
+
+static int aria_avx2_ecb_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct aria_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return ecb_do_decrypt(req, ctx->dec_key[0]);
+}
+
+static int aria_avx2_set_key(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ return aria_set_key(&tfm->base, key, keylen);
+}
+
+static int aria_avx2_ctr_encrypt(struct skcipher_request *req)
+{
+ struct aria_avx2_request_ctx *req_ctx = skcipher_request_ctx(req);
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct aria_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+
+ while (nbytes >= ARIA_AESNI_AVX2_PARALLEL_BLOCK_SIZE) {
+ kernel_fpu_begin();
+ aria_ops.aria_ctr_crypt_32way(ctx, dst, src,
+ &req_ctx->keystream[0],
+ walk.iv);
+ kernel_fpu_end();
+ dst += ARIA_AESNI_AVX2_PARALLEL_BLOCK_SIZE;
+ src += ARIA_AESNI_AVX2_PARALLEL_BLOCK_SIZE;
+ nbytes -= ARIA_AESNI_AVX2_PARALLEL_BLOCK_SIZE;
+ }
+
+ while (nbytes >= ARIA_AESNI_PARALLEL_BLOCK_SIZE) {
+ kernel_fpu_begin();
+ aria_ops.aria_ctr_crypt_16way(ctx, dst, src,
+ &req_ctx->keystream[0],
+ walk.iv);
+ kernel_fpu_end();
+ dst += ARIA_AESNI_PARALLEL_BLOCK_SIZE;
+ src += ARIA_AESNI_PARALLEL_BLOCK_SIZE;
+ nbytes -= ARIA_AESNI_PARALLEL_BLOCK_SIZE;
+ }
+
+ while (nbytes >= ARIA_BLOCK_SIZE) {
+ memcpy(&req_ctx->keystream[0], walk.iv, ARIA_BLOCK_SIZE);
+ crypto_inc(walk.iv, ARIA_BLOCK_SIZE);
+
+ aria_encrypt(ctx, &req_ctx->keystream[0],
+ &req_ctx->keystream[0]);
+
+ crypto_xor_cpy(dst, src, &req_ctx->keystream[0],
+ ARIA_BLOCK_SIZE);
+ dst += ARIA_BLOCK_SIZE;
+ src += ARIA_BLOCK_SIZE;
+ nbytes -= ARIA_BLOCK_SIZE;
+ }
+
+ if (walk.nbytes == walk.total && nbytes > 0) {
+ memcpy(&req_ctx->keystream[0], walk.iv,
+ ARIA_BLOCK_SIZE);
+ crypto_inc(walk.iv, ARIA_BLOCK_SIZE);
+
+ aria_encrypt(ctx, &req_ctx->keystream[0],
+ &req_ctx->keystream[0]);
+
+ crypto_xor_cpy(dst, src, &req_ctx->keystream[0],
+ nbytes);
+ dst += nbytes;
+ src += nbytes;
+ nbytes = 0;
+ }
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static int aria_avx2_init_tfm(struct crypto_skcipher *tfm)
+{
+ crypto_skcipher_set_reqsize(tfm, sizeof(struct aria_avx2_request_ctx));
+
+ return 0;
+}
+
+static struct skcipher_alg aria_algs[] = {
+ {
+ .base.cra_name = "ecb(aria)",
+ .base.cra_driver_name = "ecb-aria-avx2",
+ .base.cra_priority = 500,
+ .base.cra_blocksize = ARIA_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct aria_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = ARIA_MIN_KEY_SIZE,
+ .max_keysize = ARIA_MAX_KEY_SIZE,
+ .setkey = aria_avx2_set_key,
+ .encrypt = aria_avx2_ecb_encrypt,
+ .decrypt = aria_avx2_ecb_decrypt,
+ }, {
+ .base.cra_name = "ctr(aria)",
+ .base.cra_driver_name = "ctr-aria-avx2",
+ .base.cra_priority = 500,
+ .base.cra_flags = CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct aria_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = ARIA_MIN_KEY_SIZE,
+ .max_keysize = ARIA_MAX_KEY_SIZE,
+ .ivsize = ARIA_BLOCK_SIZE,
+ .chunksize = ARIA_BLOCK_SIZE,
+ .setkey = aria_avx2_set_key,
+ .encrypt = aria_avx2_ctr_encrypt,
+ .decrypt = aria_avx2_ctr_encrypt,
+ .init = aria_avx2_init_tfm,
+ }
+};
+
+static int __init aria_avx2_init(void)
+{
+ const char *feature_name;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX) ||
+ !boot_cpu_has(X86_FEATURE_AVX2) ||
+ !boot_cpu_has(X86_FEATURE_AES) ||
+ !boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+ pr_info("AVX2 or AES-NI instructions are not detected.\n");
+ return -ENODEV;
+ }
+
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ &feature_name)) {
+ pr_info("CPU feature '%s' is not supported.\n", feature_name);
+ return -ENODEV;
+ }
+
+ if (boot_cpu_has(X86_FEATURE_GFNI)) {
+ aria_ops.aria_encrypt_16way = aria_aesni_avx_gfni_encrypt_16way;
+ aria_ops.aria_decrypt_16way = aria_aesni_avx_gfni_decrypt_16way;
+ aria_ops.aria_ctr_crypt_16way = aria_aesni_avx_gfni_ctr_crypt_16way;
+ aria_ops.aria_encrypt_32way = aria_aesni_avx2_gfni_encrypt_32way;
+ aria_ops.aria_decrypt_32way = aria_aesni_avx2_gfni_decrypt_32way;
+ aria_ops.aria_ctr_crypt_32way = aria_aesni_avx2_gfni_ctr_crypt_32way;
+ } else {
+ aria_ops.aria_encrypt_16way = aria_aesni_avx_encrypt_16way;
+ aria_ops.aria_decrypt_16way = aria_aesni_avx_decrypt_16way;
+ aria_ops.aria_ctr_crypt_16way = aria_aesni_avx_ctr_crypt_16way;
+ aria_ops.aria_encrypt_32way = aria_aesni_avx2_encrypt_32way;
+ aria_ops.aria_decrypt_32way = aria_aesni_avx2_decrypt_32way;
+ aria_ops.aria_ctr_crypt_32way = aria_aesni_avx2_ctr_crypt_32way;
+ }
+
+ return crypto_register_skciphers(aria_algs, ARRAY_SIZE(aria_algs));
+}
+
+static void __exit aria_avx2_exit(void)
+{
+ crypto_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs));
+}
+
+module_init(aria_avx2_init);
+module_exit(aria_avx2_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Taehee Yoo <ap420073@gmail.com>");
+MODULE_DESCRIPTION("ARIA Cipher Algorithm, AVX2/AES-NI/GFNI optimized");
+MODULE_ALIAS_CRYPTO("aria");
+MODULE_ALIAS_CRYPTO("aria-aesni-avx2");
diff --git a/arch/x86/crypto/aria_aesni_avx_glue.c b/arch/x86/crypto/aria_aesni_avx_glue.c
new file mode 100644
index 000000000000..e4e3d78915a5
--- /dev/null
+++ b/arch/x86/crypto/aria_aesni_avx_glue.c
@@ -0,0 +1,225 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Glue Code for the AVX/AES-NI/GFNI assembler implementation of the ARIA Cipher
+ *
+ * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/aria.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <linux/export.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+#include "ecb_cbc_helpers.h"
+#include "aria-avx.h"
+
+asmlinkage void aria_aesni_avx_encrypt_16way(const void *ctx, u8 *dst,
+ const u8 *src);
+EXPORT_SYMBOL_GPL(aria_aesni_avx_encrypt_16way);
+asmlinkage void aria_aesni_avx_decrypt_16way(const void *ctx, u8 *dst,
+ const u8 *src);
+EXPORT_SYMBOL_GPL(aria_aesni_avx_decrypt_16way);
+asmlinkage void aria_aesni_avx_ctr_crypt_16way(const void *ctx, u8 *dst,
+ const u8 *src,
+ u8 *keystream, u8 *iv);
+EXPORT_SYMBOL_GPL(aria_aesni_avx_ctr_crypt_16way);
+asmlinkage void aria_aesni_avx_gfni_encrypt_16way(const void *ctx, u8 *dst,
+ const u8 *src);
+EXPORT_SYMBOL_GPL(aria_aesni_avx_gfni_encrypt_16way);
+asmlinkage void aria_aesni_avx_gfni_decrypt_16way(const void *ctx, u8 *dst,
+ const u8 *src);
+EXPORT_SYMBOL_GPL(aria_aesni_avx_gfni_decrypt_16way);
+asmlinkage void aria_aesni_avx_gfni_ctr_crypt_16way(const void *ctx, u8 *dst,
+ const u8 *src,
+ u8 *keystream, u8 *iv);
+EXPORT_SYMBOL_GPL(aria_aesni_avx_gfni_ctr_crypt_16way);
+
+static struct aria_avx_ops aria_ops;
+
+struct aria_avx_request_ctx {
+ u8 keystream[ARIA_AESNI_PARALLEL_BLOCK_SIZE];
+};
+
+static int ecb_do_encrypt(struct skcipher_request *req, const u32 *rkey)
+{
+ ECB_WALK_START(req, ARIA_BLOCK_SIZE, ARIA_AESNI_PARALLEL_BLOCKS);
+ ECB_BLOCK(ARIA_AESNI_PARALLEL_BLOCKS, aria_ops.aria_encrypt_16way);
+ ECB_BLOCK(1, aria_encrypt);
+ ECB_WALK_END();
+}
+
+static int ecb_do_decrypt(struct skcipher_request *req, const u32 *rkey)
+{
+ ECB_WALK_START(req, ARIA_BLOCK_SIZE, ARIA_AESNI_PARALLEL_BLOCKS);
+ ECB_BLOCK(ARIA_AESNI_PARALLEL_BLOCKS, aria_ops.aria_decrypt_16way);
+ ECB_BLOCK(1, aria_decrypt);
+ ECB_WALK_END();
+}
+
+static int aria_avx_ecb_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct aria_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return ecb_do_encrypt(req, ctx->enc_key[0]);
+}
+
+static int aria_avx_ecb_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct aria_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return ecb_do_decrypt(req, ctx->dec_key[0]);
+}
+
+static int aria_avx_set_key(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ return aria_set_key(&tfm->base, key, keylen);
+}
+
+static int aria_avx_ctr_encrypt(struct skcipher_request *req)
+{
+ struct aria_avx_request_ctx *req_ctx = skcipher_request_ctx(req);
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct aria_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+
+ while (nbytes >= ARIA_AESNI_PARALLEL_BLOCK_SIZE) {
+ kernel_fpu_begin();
+ aria_ops.aria_ctr_crypt_16way(ctx, dst, src,
+ &req_ctx->keystream[0],
+ walk.iv);
+ kernel_fpu_end();
+ dst += ARIA_AESNI_PARALLEL_BLOCK_SIZE;
+ src += ARIA_AESNI_PARALLEL_BLOCK_SIZE;
+ nbytes -= ARIA_AESNI_PARALLEL_BLOCK_SIZE;
+ }
+
+ while (nbytes >= ARIA_BLOCK_SIZE) {
+ memcpy(&req_ctx->keystream[0], walk.iv, ARIA_BLOCK_SIZE);
+ crypto_inc(walk.iv, ARIA_BLOCK_SIZE);
+
+ aria_encrypt(ctx, &req_ctx->keystream[0],
+ &req_ctx->keystream[0]);
+
+ crypto_xor_cpy(dst, src, &req_ctx->keystream[0],
+ ARIA_BLOCK_SIZE);
+ dst += ARIA_BLOCK_SIZE;
+ src += ARIA_BLOCK_SIZE;
+ nbytes -= ARIA_BLOCK_SIZE;
+ }
+
+ if (walk.nbytes == walk.total && nbytes > 0) {
+ memcpy(&req_ctx->keystream[0], walk.iv,
+ ARIA_BLOCK_SIZE);
+ crypto_inc(walk.iv, ARIA_BLOCK_SIZE);
+
+ aria_encrypt(ctx, &req_ctx->keystream[0],
+ &req_ctx->keystream[0]);
+
+ crypto_xor_cpy(dst, src, &req_ctx->keystream[0],
+ nbytes);
+ dst += nbytes;
+ src += nbytes;
+ nbytes = 0;
+ }
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static int aria_avx_init_tfm(struct crypto_skcipher *tfm)
+{
+ crypto_skcipher_set_reqsize(tfm, sizeof(struct aria_avx_request_ctx));
+
+ return 0;
+}
+
+static struct skcipher_alg aria_algs[] = {
+ {
+ .base.cra_name = "ecb(aria)",
+ .base.cra_driver_name = "ecb-aria-avx",
+ .base.cra_priority = 400,
+ .base.cra_blocksize = ARIA_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct aria_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = ARIA_MIN_KEY_SIZE,
+ .max_keysize = ARIA_MAX_KEY_SIZE,
+ .setkey = aria_avx_set_key,
+ .encrypt = aria_avx_ecb_encrypt,
+ .decrypt = aria_avx_ecb_decrypt,
+ }, {
+ .base.cra_name = "ctr(aria)",
+ .base.cra_driver_name = "ctr-aria-avx",
+ .base.cra_priority = 400,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct aria_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = ARIA_MIN_KEY_SIZE,
+ .max_keysize = ARIA_MAX_KEY_SIZE,
+ .ivsize = ARIA_BLOCK_SIZE,
+ .chunksize = ARIA_BLOCK_SIZE,
+ .walksize = 16 * ARIA_BLOCK_SIZE,
+ .setkey = aria_avx_set_key,
+ .encrypt = aria_avx_ctr_encrypt,
+ .decrypt = aria_avx_ctr_encrypt,
+ .init = aria_avx_init_tfm,
+ }
+};
+
+static int __init aria_avx_init(void)
+{
+ const char *feature_name;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX) ||
+ !boot_cpu_has(X86_FEATURE_AES) ||
+ !boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+ pr_info("AVX or AES-NI instructions are not detected.\n");
+ return -ENODEV;
+ }
+
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ &feature_name)) {
+ pr_info("CPU feature '%s' is not supported.\n", feature_name);
+ return -ENODEV;
+ }
+
+ if (boot_cpu_has(X86_FEATURE_GFNI)) {
+ aria_ops.aria_encrypt_16way = aria_aesni_avx_gfni_encrypt_16way;
+ aria_ops.aria_decrypt_16way = aria_aesni_avx_gfni_decrypt_16way;
+ aria_ops.aria_ctr_crypt_16way = aria_aesni_avx_gfni_ctr_crypt_16way;
+ } else {
+ aria_ops.aria_encrypt_16way = aria_aesni_avx_encrypt_16way;
+ aria_ops.aria_decrypt_16way = aria_aesni_avx_decrypt_16way;
+ aria_ops.aria_ctr_crypt_16way = aria_aesni_avx_ctr_crypt_16way;
+ }
+
+ return crypto_register_skciphers(aria_algs, ARRAY_SIZE(aria_algs));
+}
+
+static void __exit aria_avx_exit(void)
+{
+ crypto_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs));
+}
+
+module_init(aria_avx_init);
+module_exit(aria_avx_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Taehee Yoo <ap420073@gmail.com>");
+MODULE_DESCRIPTION("ARIA Cipher Algorithm, AVX/AES-NI/GFNI optimized");
+MODULE_ALIAS_CRYPTO("aria");
+MODULE_ALIAS_CRYPTO("aria-aesni-avx");
diff --git a/arch/x86/crypto/aria_gfni_avx512_glue.c b/arch/x86/crypto/aria_gfni_avx512_glue.c
new file mode 100644
index 000000000000..363cbf4399cc
--- /dev/null
+++ b/arch/x86/crypto/aria_gfni_avx512_glue.c
@@ -0,0 +1,242 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Glue Code for the AVX512/GFNI assembler implementation of the ARIA Cipher
+ *
+ * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/aria.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+#include "ecb_cbc_helpers.h"
+#include "aria-avx.h"
+
+asmlinkage void aria_gfni_avx512_encrypt_64way(const void *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void aria_gfni_avx512_decrypt_64way(const void *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void aria_gfni_avx512_ctr_crypt_64way(const void *ctx, u8 *dst,
+ const u8 *src,
+ u8 *keystream, u8 *iv);
+
+static struct aria_avx_ops aria_ops;
+
+struct aria_avx512_request_ctx {
+ u8 keystream[ARIA_GFNI_AVX512_PARALLEL_BLOCK_SIZE];
+};
+
+static int ecb_do_encrypt(struct skcipher_request *req, const u32 *rkey)
+{
+ ECB_WALK_START(req, ARIA_BLOCK_SIZE, ARIA_AESNI_PARALLEL_BLOCKS);
+ ECB_BLOCK(ARIA_GFNI_AVX512_PARALLEL_BLOCKS, aria_ops.aria_encrypt_64way);
+ ECB_BLOCK(ARIA_AESNI_AVX2_PARALLEL_BLOCKS, aria_ops.aria_encrypt_32way);
+ ECB_BLOCK(ARIA_AESNI_PARALLEL_BLOCKS, aria_ops.aria_encrypt_16way);
+ ECB_BLOCK(1, aria_encrypt);
+ ECB_WALK_END();
+}
+
+static int ecb_do_decrypt(struct skcipher_request *req, const u32 *rkey)
+{
+ ECB_WALK_START(req, ARIA_BLOCK_SIZE, ARIA_AESNI_PARALLEL_BLOCKS);
+ ECB_BLOCK(ARIA_GFNI_AVX512_PARALLEL_BLOCKS, aria_ops.aria_decrypt_64way);
+ ECB_BLOCK(ARIA_AESNI_AVX2_PARALLEL_BLOCKS, aria_ops.aria_decrypt_32way);
+ ECB_BLOCK(ARIA_AESNI_PARALLEL_BLOCKS, aria_ops.aria_decrypt_16way);
+ ECB_BLOCK(1, aria_decrypt);
+ ECB_WALK_END();
+}
+
+static int aria_avx512_ecb_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct aria_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return ecb_do_encrypt(req, ctx->enc_key[0]);
+}
+
+static int aria_avx512_ecb_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct aria_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return ecb_do_decrypt(req, ctx->dec_key[0]);
+}
+
+static int aria_avx512_set_key(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ return aria_set_key(&tfm->base, key, keylen);
+}
+
+static int aria_avx512_ctr_encrypt(struct skcipher_request *req)
+{
+ struct aria_avx512_request_ctx *req_ctx = skcipher_request_ctx(req);
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct aria_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+
+ while (nbytes >= ARIA_GFNI_AVX512_PARALLEL_BLOCK_SIZE) {
+ kernel_fpu_begin();
+ aria_ops.aria_ctr_crypt_64way(ctx, dst, src,
+ &req_ctx->keystream[0],
+ walk.iv);
+ kernel_fpu_end();
+ dst += ARIA_GFNI_AVX512_PARALLEL_BLOCK_SIZE;
+ src += ARIA_GFNI_AVX512_PARALLEL_BLOCK_SIZE;
+ nbytes -= ARIA_GFNI_AVX512_PARALLEL_BLOCK_SIZE;
+ }
+
+ while (nbytes >= ARIA_AESNI_AVX2_PARALLEL_BLOCK_SIZE) {
+ kernel_fpu_begin();
+ aria_ops.aria_ctr_crypt_32way(ctx, dst, src,
+ &req_ctx->keystream[0],
+ walk.iv);
+ kernel_fpu_end();
+ dst += ARIA_AESNI_AVX2_PARALLEL_BLOCK_SIZE;
+ src += ARIA_AESNI_AVX2_PARALLEL_BLOCK_SIZE;
+ nbytes -= ARIA_AESNI_AVX2_PARALLEL_BLOCK_SIZE;
+ }
+
+ while (nbytes >= ARIA_AESNI_PARALLEL_BLOCK_SIZE) {
+ kernel_fpu_begin();
+ aria_ops.aria_ctr_crypt_16way(ctx, dst, src,
+ &req_ctx->keystream[0],
+ walk.iv);
+ kernel_fpu_end();
+ dst += ARIA_AESNI_PARALLEL_BLOCK_SIZE;
+ src += ARIA_AESNI_PARALLEL_BLOCK_SIZE;
+ nbytes -= ARIA_AESNI_PARALLEL_BLOCK_SIZE;
+ }
+
+ while (nbytes >= ARIA_BLOCK_SIZE) {
+ memcpy(&req_ctx->keystream[0], walk.iv,
+ ARIA_BLOCK_SIZE);
+ crypto_inc(walk.iv, ARIA_BLOCK_SIZE);
+
+ aria_encrypt(ctx, &req_ctx->keystream[0],
+ &req_ctx->keystream[0]);
+
+ crypto_xor_cpy(dst, src, &req_ctx->keystream[0],
+ ARIA_BLOCK_SIZE);
+ dst += ARIA_BLOCK_SIZE;
+ src += ARIA_BLOCK_SIZE;
+ nbytes -= ARIA_BLOCK_SIZE;
+ }
+
+ if (walk.nbytes == walk.total && nbytes > 0) {
+ memcpy(&req_ctx->keystream[0], walk.iv,
+ ARIA_BLOCK_SIZE);
+ crypto_inc(walk.iv, ARIA_BLOCK_SIZE);
+
+ aria_encrypt(ctx, &req_ctx->keystream[0],
+ &req_ctx->keystream[0]);
+
+ crypto_xor_cpy(dst, src, &req_ctx->keystream[0],
+ nbytes);
+ dst += nbytes;
+ src += nbytes;
+ nbytes = 0;
+ }
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static int aria_avx512_init_tfm(struct crypto_skcipher *tfm)
+{
+ crypto_skcipher_set_reqsize(tfm,
+ sizeof(struct aria_avx512_request_ctx));
+
+ return 0;
+}
+
+static struct skcipher_alg aria_algs[] = {
+ {
+ .base.cra_name = "ecb(aria)",
+ .base.cra_driver_name = "ecb-aria-avx512",
+ .base.cra_priority = 600,
+ .base.cra_blocksize = ARIA_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct aria_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = ARIA_MIN_KEY_SIZE,
+ .max_keysize = ARIA_MAX_KEY_SIZE,
+ .setkey = aria_avx512_set_key,
+ .encrypt = aria_avx512_ecb_encrypt,
+ .decrypt = aria_avx512_ecb_decrypt,
+ }, {
+ .base.cra_name = "ctr(aria)",
+ .base.cra_driver_name = "ctr-aria-avx512",
+ .base.cra_priority = 600,
+ .base.cra_flags = CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct aria_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = ARIA_MIN_KEY_SIZE,
+ .max_keysize = ARIA_MAX_KEY_SIZE,
+ .ivsize = ARIA_BLOCK_SIZE,
+ .chunksize = ARIA_BLOCK_SIZE,
+ .setkey = aria_avx512_set_key,
+ .encrypt = aria_avx512_ctr_encrypt,
+ .decrypt = aria_avx512_ctr_encrypt,
+ .init = aria_avx512_init_tfm,
+ }
+};
+
+static int __init aria_avx512_init(void)
+{
+ const char *feature_name;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX) ||
+ !boot_cpu_has(X86_FEATURE_AVX2) ||
+ !boot_cpu_has(X86_FEATURE_AVX512F) ||
+ !boot_cpu_has(X86_FEATURE_AVX512VL) ||
+ !boot_cpu_has(X86_FEATURE_GFNI) ||
+ !boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+ pr_info("AVX512/GFNI instructions are not detected.\n");
+ return -ENODEV;
+ }
+
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
+ XFEATURE_MASK_AVX512, &feature_name)) {
+ pr_info("CPU feature '%s' is not supported.\n", feature_name);
+ return -ENODEV;
+ }
+
+ aria_ops.aria_encrypt_16way = aria_aesni_avx_gfni_encrypt_16way;
+ aria_ops.aria_decrypt_16way = aria_aesni_avx_gfni_decrypt_16way;
+ aria_ops.aria_ctr_crypt_16way = aria_aesni_avx_gfni_ctr_crypt_16way;
+ aria_ops.aria_encrypt_32way = aria_aesni_avx2_gfni_encrypt_32way;
+ aria_ops.aria_decrypt_32way = aria_aesni_avx2_gfni_decrypt_32way;
+ aria_ops.aria_ctr_crypt_32way = aria_aesni_avx2_gfni_ctr_crypt_32way;
+ aria_ops.aria_encrypt_64way = aria_gfni_avx512_encrypt_64way;
+ aria_ops.aria_decrypt_64way = aria_gfni_avx512_decrypt_64way;
+ aria_ops.aria_ctr_crypt_64way = aria_gfni_avx512_ctr_crypt_64way;
+
+ return crypto_register_skciphers(aria_algs, ARRAY_SIZE(aria_algs));
+}
+
+static void __exit aria_avx512_exit(void)
+{
+ crypto_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs));
+}
+
+module_init(aria_avx512_init);
+module_exit(aria_avx512_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Taehee Yoo <ap420073@gmail.com>");
+MODULE_DESCRIPTION("ARIA Cipher Algorithm, AVX512/GFNI optimized");
+MODULE_ALIAS_CRYPTO("aria");
+MODULE_ALIAS_CRYPTO("aria-gfni-avx512");
diff --git a/arch/x86/crypto/blowfish-x86_64-asm_64.S b/arch/x86/crypto/blowfish-x86_64-asm_64.S
new file mode 100644
index 000000000000..e88c8e4f013c
--- /dev/null
+++ b/arch/x86/crypto/blowfish-x86_64-asm_64.S
@@ -0,0 +1,354 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Blowfish Cipher Algorithm (x86_64)
+ *
+ * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ */
+
+#include <linux/linkage.h>
+
+.file "blowfish-x86_64-asm.S"
+.text
+
+/* structure of crypto context */
+#define p 0
+#define s0 ((16 + 2) * 4)
+#define s1 ((16 + 2 + (1 * 256)) * 4)
+#define s2 ((16 + 2 + (2 * 256)) * 4)
+#define s3 ((16 + 2 + (3 * 256)) * 4)
+
+/* register macros */
+#define CTX %r12
+#define RIO %rsi
+
+#define RX0 %rax
+#define RX1 %rbx
+#define RX2 %rcx
+#define RX3 %rdx
+
+#define RX0d %eax
+#define RX1d %ebx
+#define RX2d %ecx
+#define RX3d %edx
+
+#define RX0bl %al
+#define RX1bl %bl
+#define RX2bl %cl
+#define RX3bl %dl
+
+#define RX0bh %ah
+#define RX1bh %bh
+#define RX2bh %ch
+#define RX3bh %dh
+
+#define RT0 %rdi
+#define RT1 %rsi
+#define RT2 %r8
+#define RT3 %r9
+
+#define RT0d %edi
+#define RT1d %esi
+#define RT2d %r8d
+#define RT3d %r9d
+
+#define RKEY %r10
+
+/***********************************************************************
+ * 1-way blowfish
+ ***********************************************************************/
+#define F() \
+ rorq $16, RX0; \
+ movzbl RX0bh, RT0d; \
+ movzbl RX0bl, RT1d; \
+ rolq $16, RX0; \
+ movl s0(CTX,RT0,4), RT0d; \
+ addl s1(CTX,RT1,4), RT0d; \
+ movzbl RX0bh, RT1d; \
+ movzbl RX0bl, RT2d; \
+ rolq $32, RX0; \
+ xorl s2(CTX,RT1,4), RT0d; \
+ addl s3(CTX,RT2,4), RT0d; \
+ xorq RT0, RX0;
+
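The F() macro above (and F4() further below) computes the standard Blowfish round function with byte extraction into the four 256-entry S-boxes. As a scalar reference (an illustrative sketch, not part of the patch; additions are modulo 2^32):

	/* Illustrative reference only -- not part of the patch. */
	#include <linux/types.h>

	static inline u32 blowfish_f(const u32 S[4][256], u32 x)
	{
		u32 a = (x >> 24) & 0xff;
		u32 b = (x >> 16) & 0xff;
		u32 c = (x >>  8) & 0xff;
		u32 d = x & 0xff;

		return ((S[0][a] + S[1][b]) ^ S[2][c]) + S[3][d];
	}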
+#define add_roundkey_enc(n) \
+ xorq p+4*(n)(CTX), RX0;
+
+#define round_enc(n) \
+ add_roundkey_enc(n); \
+ \
+ F(); \
+ F();
+
+#define add_roundkey_dec(n) \
+ movq p+4*(n-1)(CTX), RT0; \
+ rorq $32, RT0; \
+ xorq RT0, RX0;
+
+#define round_dec(n) \
+ add_roundkey_dec(n); \
+ \
+ F(); \
+	F();
+
+#define read_block() \
+ movq (RIO), RX0; \
+ rorq $32, RX0; \
+ bswapq RX0;
+
+#define write_block() \
+ bswapq RX0; \
+ movq RX0, (RIO);
+
+SYM_FUNC_START(blowfish_enc_blk)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ */
+ movq %r12, %r11;
+
+ movq %rdi, CTX;
+ movq %rsi, %r10;
+ movq %rdx, RIO;
+
+ read_block();
+
+ round_enc(0);
+ round_enc(2);
+ round_enc(4);
+ round_enc(6);
+ round_enc(8);
+ round_enc(10);
+ round_enc(12);
+ round_enc(14);
+ add_roundkey_enc(16);
+
+ movq %r11, %r12;
+ movq %r10, RIO;
+
+ write_block();
+ RET;
+SYM_FUNC_END(blowfish_enc_blk)
+
+SYM_FUNC_START(blowfish_dec_blk)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ */
+ movq %r12, %r11;
+
+ movq %rdi, CTX;
+ movq %rsi, %r10;
+ movq %rdx, RIO;
+
+ read_block();
+
+ round_dec(17);
+ round_dec(15);
+ round_dec(13);
+ round_dec(11);
+ round_dec(9);
+ round_dec(7);
+ round_dec(5);
+ round_dec(3);
+ add_roundkey_dec(1);
+
+ movq %r10, RIO;
+ write_block();
+
+ movq %r11, %r12;
+
+ RET;
+SYM_FUNC_END(blowfish_dec_blk)
+
+/**********************************************************************
+ 4-way blowfish, four blocks parallel
+ **********************************************************************/
+
+/* F() for 4-way. Slower when used alone/1-way, but faster when used
+ * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
+ */
+#define F4(x) \
+ movzbl x ## bh, RT1d; \
+ movzbl x ## bl, RT3d; \
+ rorq $16, x; \
+ movzbl x ## bh, RT0d; \
+ movzbl x ## bl, RT2d; \
+ rorq $16, x; \
+ movl s0(CTX,RT0,4), RT0d; \
+ addl s1(CTX,RT2,4), RT0d; \
+ xorl s2(CTX,RT1,4), RT0d; \
+ addl s3(CTX,RT3,4), RT0d; \
+ xorq RT0, x;
+
+#define add_preloaded_roundkey4() \
+ xorq RKEY, RX0; \
+ xorq RKEY, RX1; \
+ xorq RKEY, RX2; \
+ xorq RKEY, RX3;
+
+#define preload_roundkey_enc(n) \
+ movq p+4*(n)(CTX), RKEY;
+
+#define add_roundkey_enc4(n) \
+ add_preloaded_roundkey4(); \
+ preload_roundkey_enc(n + 2);
+
+#define round_enc4(n) \
+ add_roundkey_enc4(n); \
+ \
+ F4(RX0); \
+ F4(RX1); \
+ F4(RX2); \
+ F4(RX3); \
+ \
+ F4(RX0); \
+ F4(RX1); \
+ F4(RX2); \
+ F4(RX3);
+
+#define preload_roundkey_dec(n) \
+ movq p+4*((n)-1)(CTX), RKEY; \
+ rorq $32, RKEY;
+
+#define add_roundkey_dec4(n) \
+ add_preloaded_roundkey4(); \
+ preload_roundkey_dec(n - 2);
+
+#define round_dec4(n) \
+ add_roundkey_dec4(n); \
+ \
+ F4(RX0); \
+ F4(RX1); \
+ F4(RX2); \
+ F4(RX3); \
+ \
+ F4(RX0); \
+ F4(RX1); \
+ F4(RX2); \
+ F4(RX3);
+
+#define read_block4() \
+ movq (RIO), RX0; \
+ rorq $32, RX0; \
+ bswapq RX0; \
+ \
+ movq 8(RIO), RX1; \
+ rorq $32, RX1; \
+ bswapq RX1; \
+ \
+ movq 16(RIO), RX2; \
+ rorq $32, RX2; \
+ bswapq RX2; \
+ \
+ movq 24(RIO), RX3; \
+ rorq $32, RX3; \
+ bswapq RX3;
+
+#define write_block4() \
+ bswapq RX0; \
+ movq RX0, (RIO); \
+ \
+ bswapq RX1; \
+ movq RX1, 8(RIO); \
+ \
+ bswapq RX2; \
+ movq RX2, 16(RIO); \
+ \
+ bswapq RX3; \
+ movq RX3, 24(RIO);
+
+#define xor_block4() \
+ movq (RIO), RT0; \
+ bswapq RT0; \
+ xorq RT0, RX1; \
+ \
+ movq 8(RIO), RT2; \
+ bswapq RT2; \
+ xorq RT2, RX2; \
+ \
+ movq 16(RIO), RT3; \
+ bswapq RT3; \
+ xorq RT3, RX3;
+
+SYM_FUNC_START(blowfish_enc_blk_4way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ */
+ pushq %r12;
+ pushq %rbx;
+
+	movq %rdi, CTX;
+ movq %rsi, %r11;
+ movq %rdx, RIO;
+
+ preload_roundkey_enc(0);
+
+ read_block4();
+
+ round_enc4(0);
+ round_enc4(2);
+ round_enc4(4);
+ round_enc4(6);
+ round_enc4(8);
+ round_enc4(10);
+ round_enc4(12);
+ round_enc4(14);
+ add_preloaded_roundkey4();
+
+ movq %r11, RIO;
+ write_block4();
+
+ popq %rbx;
+ popq %r12;
+ RET;
+SYM_FUNC_END(blowfish_enc_blk_4way)
+
+SYM_FUNC_START(__blowfish_dec_blk_4way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: cbc (bool)
+ */
+ pushq %r12;
+ pushq %rbx;
+ pushq %rcx;
+ pushq %rdx;
+
+ movq %rdi, CTX;
+ movq %rsi, %r11;
+ movq %rdx, RIO;
+
+ preload_roundkey_dec(17);
+ read_block4();
+
+ round_dec4(17);
+ round_dec4(15);
+ round_dec4(13);
+ round_dec4(11);
+ round_dec4(9);
+ round_dec4(7);
+ round_dec4(5);
+ round_dec4(3);
+ add_preloaded_roundkey4();
+
+ popq RIO;
+ popq %r12;
+ testq %r12, %r12;
+ jz .L_no_cbc_xor;
+
+ xor_block4();
+
+.L_no_cbc_xor:
+ movq %r11, RIO;
+ write_block4();
+
+ popq %rbx;
+ popq %r12;
+
+ RET;
+SYM_FUNC_END(__blowfish_dec_blk_4way)
diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c
new file mode 100644
index 000000000000..26c5f2ee5d10
--- /dev/null
+++ b/arch/x86/crypto/blowfish_glue.c
@@ -0,0 +1,196 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Glue Code for the assembler-optimized version of Blowfish
+ *
+ * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
+ * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/blowfish.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+#include "ecb_cbc_helpers.h"
+
+/* regular block cipher functions */
+asmlinkage void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src);
+asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src);
+
+/* 4-way parallel cipher functions */
+asmlinkage void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void __blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst,
+ const u8 *src, bool cbc);
+
+static inline void blowfish_dec_ecb_4way(struct bf_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ return __blowfish_dec_blk_4way(ctx, dst, src, false);
+}
+
+static inline void blowfish_dec_cbc_4way(struct bf_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ return __blowfish_dec_blk_4way(ctx, dst, src, true);
+}
+
+static void blowfish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ blowfish_enc_blk(crypto_tfm_ctx(tfm), dst, src);
+}
+
+static void blowfish_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ blowfish_dec_blk(crypto_tfm_ctx(tfm), dst, src);
+}
+
+static int blowfish_setkey_skcipher(struct crypto_skcipher *tfm,
+ const u8 *key, unsigned int keylen)
+{
+ return blowfish_setkey(&tfm->base, key, keylen);
+}
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+ ECB_WALK_START(req, BF_BLOCK_SIZE, -1);
+ ECB_BLOCK(4, blowfish_enc_blk_4way);
+ ECB_BLOCK(1, blowfish_enc_blk);
+ ECB_WALK_END();
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+ ECB_WALK_START(req, BF_BLOCK_SIZE, -1);
+ ECB_BLOCK(4, blowfish_dec_ecb_4way);
+ ECB_BLOCK(1, blowfish_dec_blk);
+ ECB_WALK_END();
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+ CBC_WALK_START(req, BF_BLOCK_SIZE, -1);
+ CBC_ENC_BLOCK(blowfish_enc_blk);
+ CBC_WALK_END();
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ CBC_WALK_START(req, BF_BLOCK_SIZE, -1);
+ CBC_DEC_BLOCK(4, blowfish_dec_cbc_4way);
+ CBC_DEC_BLOCK(1, blowfish_dec_blk);
+ CBC_WALK_END();
+}
+
+static struct crypto_alg bf_cipher_alg = {
+ .cra_name = "blowfish",
+ .cra_driver_name = "blowfish-asm",
+ .cra_priority = 200,
+ .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
+ .cra_blocksize = BF_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct bf_ctx),
+ .cra_module = THIS_MODULE,
+ .cra_u = {
+ .cipher = {
+ .cia_min_keysize = BF_MIN_KEY_SIZE,
+ .cia_max_keysize = BF_MAX_KEY_SIZE,
+ .cia_setkey = blowfish_setkey,
+ .cia_encrypt = blowfish_encrypt,
+ .cia_decrypt = blowfish_decrypt,
+ }
+ }
+};
+
+static struct skcipher_alg bf_skcipher_algs[] = {
+ {
+ .base.cra_name = "ecb(blowfish)",
+ .base.cra_driver_name = "ecb-blowfish-asm",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = BF_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct bf_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = BF_MIN_KEY_SIZE,
+ .max_keysize = BF_MAX_KEY_SIZE,
+ .setkey = blowfish_setkey_skcipher,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ }, {
+ .base.cra_name = "cbc(blowfish)",
+ .base.cra_driver_name = "cbc-blowfish-asm",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = BF_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct bf_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = BF_MIN_KEY_SIZE,
+ .max_keysize = BF_MAX_KEY_SIZE,
+ .ivsize = BF_BLOCK_SIZE,
+ .setkey = blowfish_setkey_skcipher,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ },
+};
+
+static bool is_blacklisted_cpu(void)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return false;
+
+ if (boot_cpu_data.x86 == 0x0f) {
+ /*
+	 * On Pentium 4, blowfish-x86_64 is slower than the generic C
+	 * implementation because of its use of 64-bit rotates (which are
+	 * really slow on P4). Therefore blacklist P4s.
+ */
+ return true;
+ }
+
+ return false;
+}
+
+static int force;
+module_param(force, int, 0);
+MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist");
+
+static int __init blowfish_init(void)
+{
+ int err;
+
+ if (!force && is_blacklisted_cpu()) {
+ printk(KERN_INFO
+ "blowfish-x86_64: performance on this CPU "
+ "would be suboptimal: disabling "
+ "blowfish-x86_64.\n");
+ return -ENODEV;
+ }
+
+ err = crypto_register_alg(&bf_cipher_alg);
+ if (err)
+ return err;
+
+ err = crypto_register_skciphers(bf_skcipher_algs,
+ ARRAY_SIZE(bf_skcipher_algs));
+ if (err)
+ crypto_unregister_alg(&bf_cipher_alg);
+
+ return err;
+}
+
+static void __exit blowfish_fini(void)
+{
+ crypto_unregister_alg(&bf_cipher_alg);
+ crypto_unregister_skciphers(bf_skcipher_algs,
+ ARRAY_SIZE(bf_skcipher_algs));
+}
+
+module_init(blowfish_init);
+module_exit(blowfish_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Blowfish Cipher Algorithm, asm optimized");
+MODULE_ALIAS_CRYPTO("blowfish");
+MODULE_ALIAS_CRYPTO("blowfish-asm");
diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
new file mode 100644
index 000000000000..1dfef28c1266
--- /dev/null
+++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
@@ -0,0 +1,990 @@
+/*
+ * x86_64/AVX/AES-NI assembler implementation of Camellia
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+/*
+ * A version licensed under the 2-clause BSD License is available at:
+ * http://koti.mbnet.fi/axh/crypto/camellia-BSD-1.2.0-aesni1.tar.xz
+ */
+
+#include <linux/linkage.h>
+#include <linux/cfi_types.h>
+#include <asm/frame.h>
+
+#define CAMELLIA_TABLE_BYTE_LEN 272
+
+/* struct camellia_ctx: */
+#define key_table 0
+#define key_length CAMELLIA_TABLE_BYTE_LEN
+
+/* register macros */
+#define CTX %rdi
+
+/**********************************************************************
+ 16-way camellia
+ **********************************************************************/
+#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
+ vpand x, mask4bit, tmp0; \
+ vpandn x, mask4bit, x; \
+ vpsrld $4, x, x; \
+ \
+ vpshufb tmp0, lo_t, tmp0; \
+ vpshufb x, hi_t, x; \
+ vpxor tmp0, x, x;
+
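filter_8bit() realises a byte-wise transform as two 16-entry vpshufb table lookups: the low nibble indexes lo_t, the high nibble indexes hi_t, and the two results are XORed, which suffices for any mapping whose nibble contributions combine by XOR (such as the GF(2)-linear pre/post filters used here). Per byte this is equivalent to the following sketch (illustrative only, not part of the patch):

	static inline u8 filter_8bit_model(const u8 lo_t[16], const u8 hi_t[16],
					   u8 x)
	{
		/* Low nibble selects from lo_t, high nibble from hi_t. */
		return lo_t[x & 0x0f] ^ hi_t[x >> 4];
	}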
+/*
+ * IN:
+ * x0..x7: byte-sliced AB state
+ * mem_cd: register pointer storing CD state
+ * key: index for key material
+ * OUT:
+ * x0..x7: new byte-sliced CD state
+ */
+#define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
+ t7, mem_cd, key) \
+ /* \
+ * S-function with AES subbytes \
+ */ \
+ vmovdqa .Linv_shift_row(%rip), t4; \
+ vbroadcastss .L0f0f0f0f(%rip), t7; \
+ vmovdqa .Lpre_tf_lo_s1(%rip), t0; \
+ vmovdqa .Lpre_tf_hi_s1(%rip), t1; \
+ \
+ /* AES inverse shift rows */ \
+ vpshufb t4, x0, x0; \
+ vpshufb t4, x7, x7; \
+ vpshufb t4, x1, x1; \
+ vpshufb t4, x4, x4; \
+ vpshufb t4, x2, x2; \
+ vpshufb t4, x5, x5; \
+ vpshufb t4, x3, x3; \
+ vpshufb t4, x6, x6; \
+ \
+ /* prefilter sboxes 1, 2 and 3 */ \
+ vmovdqa .Lpre_tf_lo_s4(%rip), t2; \
+ vmovdqa .Lpre_tf_hi_s4(%rip), t3; \
+ filter_8bit(x0, t0, t1, t7, t6); \
+ filter_8bit(x7, t0, t1, t7, t6); \
+ filter_8bit(x1, t0, t1, t7, t6); \
+ filter_8bit(x4, t0, t1, t7, t6); \
+ filter_8bit(x2, t0, t1, t7, t6); \
+ filter_8bit(x5, t0, t1, t7, t6); \
+ \
+ /* prefilter sbox 4 */ \
+ vpxor t4, t4, t4; \
+ filter_8bit(x3, t2, t3, t7, t6); \
+ filter_8bit(x6, t2, t3, t7, t6); \
+ \
+ /* AES subbytes + AES shift rows */ \
+ vmovdqa .Lpost_tf_lo_s1(%rip), t0; \
+ vmovdqa .Lpost_tf_hi_s1(%rip), t1; \
+ vaesenclast t4, x0, x0; \
+ vaesenclast t4, x7, x7; \
+ vaesenclast t4, x1, x1; \
+ vaesenclast t4, x4, x4; \
+ vaesenclast t4, x2, x2; \
+ vaesenclast t4, x5, x5; \
+ vaesenclast t4, x3, x3; \
+ vaesenclast t4, x6, x6; \
+ \
+ /* postfilter sboxes 1 and 4 */ \
+ vmovdqa .Lpost_tf_lo_s3(%rip), t2; \
+ vmovdqa .Lpost_tf_hi_s3(%rip), t3; \
+ filter_8bit(x0, t0, t1, t7, t6); \
+ filter_8bit(x7, t0, t1, t7, t6); \
+ filter_8bit(x3, t0, t1, t7, t6); \
+ filter_8bit(x6, t0, t1, t7, t6); \
+ \
+ /* postfilter sbox 3 */ \
+ vmovdqa .Lpost_tf_lo_s2(%rip), t4; \
+ vmovdqa .Lpost_tf_hi_s2(%rip), t5; \
+ filter_8bit(x2, t2, t3, t7, t6); \
+ filter_8bit(x5, t2, t3, t7, t6); \
+ \
+ vpxor t6, t6, t6; \
+ vmovq key, t0; \
+ \
+ /* postfilter sbox 2 */ \
+ filter_8bit(x1, t4, t5, t7, t2); \
+ filter_8bit(x4, t4, t5, t7, t2); \
+ \
+ vpsrldq $5, t0, t5; \
+ vpsrldq $1, t0, t1; \
+ vpsrldq $2, t0, t2; \
+ vpsrldq $3, t0, t3; \
+ vpsrldq $4, t0, t4; \
+ vpshufb t6, t0, t0; \
+ vpshufb t6, t1, t1; \
+ vpshufb t6, t2, t2; \
+ vpshufb t6, t3, t3; \
+ vpshufb t6, t4, t4; \
+ vpsrldq $2, t5, t7; \
+ vpshufb t6, t7, t7; \
+ \
+ /* \
+ * P-function \
+ */ \
+ vpxor x5, x0, x0; \
+ vpxor x6, x1, x1; \
+ vpxor x7, x2, x2; \
+ vpxor x4, x3, x3; \
+ \
+ vpxor x2, x4, x4; \
+ vpxor x3, x5, x5; \
+ vpxor x0, x6, x6; \
+ vpxor x1, x7, x7; \
+ \
+ vpxor x7, x0, x0; \
+ vpxor x4, x1, x1; \
+ vpxor x5, x2, x2; \
+ vpxor x6, x3, x3; \
+ \
+ vpxor x3, x4, x4; \
+ vpxor x0, x5, x5; \
+ vpxor x1, x6, x6; \
+ vpxor x2, x7, x7; /* note: high and low parts swapped */ \
+ \
+ /* \
+ * Add key material and result to CD (x becomes new CD) \
+ */ \
+ \
+ vpxor t3, x4, x4; \
+ vpxor 0 * 16(mem_cd), x4, x4; \
+ \
+ vpxor t2, x5, x5; \
+ vpxor 1 * 16(mem_cd), x5, x5; \
+ \
+ vpsrldq $1, t5, t3; \
+ vpshufb t6, t5, t5; \
+ vpshufb t6, t3, t6; \
+ \
+ vpxor t1, x6, x6; \
+ vpxor 2 * 16(mem_cd), x6, x6; \
+ \
+ vpxor t0, x7, x7; \
+ vpxor 3 * 16(mem_cd), x7, x7; \
+ \
+ vpxor t7, x0, x0; \
+ vpxor 4 * 16(mem_cd), x0, x0; \
+ \
+ vpxor t6, x1, x1; \
+ vpxor 5 * 16(mem_cd), x1, x1; \
+ \
+ vpxor t5, x2, x2; \
+ vpxor 6 * 16(mem_cd), x2, x2; \
+ \
+ vpxor t4, x3, x3; \
+ vpxor 7 * 16(mem_cd), x3, x3;
+
+/*
+ * Size optimization... with roundsm16 inlined, the binary would be over 5
+ * times larger and only 0.5% faster (on Sandy Bridge).
+ */
+.align 8
+SYM_FUNC_START_LOCAL(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
+ roundsm16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
+ %rcx, (%r9));
+ RET;
+SYM_FUNC_END(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
+
+.align 8
+SYM_FUNC_START_LOCAL(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
+ roundsm16(%xmm4, %xmm5, %xmm6, %xmm7, %xmm0, %xmm1, %xmm2, %xmm3,
+ %xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11,
+ %rax, (%r9));
+ RET;
+SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
+
+/*
+ * IN/OUT:
+ * x0..x7: byte-sliced AB state preloaded
+ * mem_ab: byte-sliced AB state in memory
+ *  mem_cd: byte-sliced CD state in memory
+ */
+#define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
+ leaq (key_table + (i) * 8)(CTX), %r9; \
+ call roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
+ \
+ vmovdqu x4, 0 * 16(mem_cd); \
+ vmovdqu x5, 1 * 16(mem_cd); \
+ vmovdqu x6, 2 * 16(mem_cd); \
+ vmovdqu x7, 3 * 16(mem_cd); \
+ vmovdqu x0, 4 * 16(mem_cd); \
+ vmovdqu x1, 5 * 16(mem_cd); \
+ vmovdqu x2, 6 * 16(mem_cd); \
+ vmovdqu x3, 7 * 16(mem_cd); \
+ \
+ leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
+ call roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
+ \
+ store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);
+
+#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */
+
+#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
+ /* Store new AB state */ \
+ vmovdqu x0, 0 * 16(mem_ab); \
+ vmovdqu x1, 1 * 16(mem_ab); \
+ vmovdqu x2, 2 * 16(mem_ab); \
+ vmovdqu x3, 3 * 16(mem_ab); \
+ vmovdqu x4, 4 * 16(mem_ab); \
+ vmovdqu x5, 5 * 16(mem_ab); \
+ vmovdqu x6, 6 * 16(mem_ab); \
+ vmovdqu x7, 7 * 16(mem_ab);
+
+#define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, i) \
+ two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
+ two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
+ two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);
+
+#define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, i) \
+ two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
+ two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
+ two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
+
+/*
+ * IN:
+ * v0..3: byte-sliced 32-bit integers
+ * OUT:
+ * v0..3: (IN <<< 1)
+ */
+#define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \
+ vpcmpgtb v0, zero, t0; \
+ vpaddb v0, v0, v0; \
+ vpabsb t0, t0; \
+ \
+ vpcmpgtb v1, zero, t1; \
+ vpaddb v1, v1, v1; \
+ vpabsb t1, t1; \
+ \
+ vpcmpgtb v2, zero, t2; \
+ vpaddb v2, v2, v2; \
+ vpabsb t2, t2; \
+ \
+ vpor t0, v1, v1; \
+ \
+ vpcmpgtb v3, zero, t0; \
+ vpaddb v3, v3, v3; \
+ vpabsb t0, t0; \
+ \
+ vpor t1, v2, v2; \
+ vpor t2, v3, v3; \
+ vpor t0, v0, v0;
+
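+/*
+ * Conceptually, rol32_1_16 above does the following per byte lane, assuming
+ * v0 is the least significant byte plane and v3 the most significant (a
+ * scalar C sketch, not part of the implementation):
+ *
+ *	uint8_t c0 = v0 >> 7, c1 = v1 >> 7, c2 = v2 >> 7, c3 = v3 >> 7;
+ *	v0 = (uint8_t)(v0 << 1) | c3;
+ *	v1 = (uint8_t)(v1 << 1) | c0;
+ *	v2 = (uint8_t)(v2 << 1) | c1;
+ *	v3 = (uint8_t)(v3 << 1) | c2;
+ *
+ * vpcmpgtb/vpabsb extract each byte's carry bit as 0x01 and vpaddb doubles
+ * the byte, since SSE/AVX have no per-byte shift instruction.
+ */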
+/*
+ * IN/OUT:
+ *  l0..l7: byte-sliced AB state preloaded in registers
+ *  l: byte-sliced AB state in memory
+ *  r: byte-sliced CD state in memory
+ * (the updated AB and CD state is written back to l and r)
+ */
+#define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
+ tt1, tt2, tt3, kll, klr, krl, krr) \
+ /* \
+ * t0 = kll; \
+ * t0 &= ll; \
+ * lr ^= rol32(t0, 1); \
+ */ \
+ vpxor tt0, tt0, tt0; \
+ vmovd kll, t0; \
+ vpshufb tt0, t0, t3; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t2; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t1; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t0; \
+ \
+ vpand l0, t0, t0; \
+ vpand l1, t1, t1; \
+ vpand l2, t2, t2; \
+ vpand l3, t3, t3; \
+ \
+ rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
+ \
+ vpxor l4, t0, l4; \
+ vmovdqu l4, 4 * 16(l); \
+ vpxor l5, t1, l5; \
+ vmovdqu l5, 5 * 16(l); \
+ vpxor l6, t2, l6; \
+ vmovdqu l6, 6 * 16(l); \
+ vpxor l7, t3, l7; \
+ vmovdqu l7, 7 * 16(l); \
+ \
+ /* \
+ * t2 = krr; \
+ * t2 |= rr; \
+ * rl ^= t2; \
+ */ \
+ \
+ vmovd krr, t0; \
+ vpshufb tt0, t0, t3; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t2; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t1; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t0; \
+ \
+ vpor 4 * 16(r), t0, t0; \
+ vpor 5 * 16(r), t1, t1; \
+ vpor 6 * 16(r), t2, t2; \
+ vpor 7 * 16(r), t3, t3; \
+ \
+ vpxor 0 * 16(r), t0, t0; \
+ vpxor 1 * 16(r), t1, t1; \
+ vpxor 2 * 16(r), t2, t2; \
+ vpxor 3 * 16(r), t3, t3; \
+ vmovdqu t0, 0 * 16(r); \
+ vmovdqu t1, 1 * 16(r); \
+ vmovdqu t2, 2 * 16(r); \
+ vmovdqu t3, 3 * 16(r); \
+ \
+ /* \
+ * t2 = krl; \
+ * t2 &= rl; \
+ * rr ^= rol32(t2, 1); \
+ */ \
+ vmovd krl, t0; \
+ vpshufb tt0, t0, t3; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t2; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t1; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t0; \
+ \
+ vpand 0 * 16(r), t0, t0; \
+ vpand 1 * 16(r), t1, t1; \
+ vpand 2 * 16(r), t2, t2; \
+ vpand 3 * 16(r), t3, t3; \
+ \
+ rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
+ \
+ vpxor 4 * 16(r), t0, t0; \
+ vpxor 5 * 16(r), t1, t1; \
+ vpxor 6 * 16(r), t2, t2; \
+ vpxor 7 * 16(r), t3, t3; \
+ vmovdqu t0, 4 * 16(r); \
+ vmovdqu t1, 5 * 16(r); \
+ vmovdqu t2, 6 * 16(r); \
+ vmovdqu t3, 7 * 16(r); \
+ \
+ /* \
+ * t0 = klr; \
+ * t0 |= lr; \
+ * ll ^= t0; \
+ */ \
+ \
+ vmovd klr, t0; \
+ vpshufb tt0, t0, t3; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t2; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t1; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t0; \
+ \
+ vpor l4, t0, t0; \
+ vpor l5, t1, t1; \
+ vpor l6, t2, t2; \
+ vpor l7, t3, t3; \
+ \
+ vpxor l0, t0, l0; \
+ vmovdqu l0, 0 * 16(l); \
+ vpxor l1, t1, l1; \
+ vmovdqu l1, 1 * 16(l); \
+ vpxor l2, t2, l2; \
+ vmovdqu l2, 2 * 16(l); \
+ vpxor l3, t3, l3; \
+ vmovdqu l3, 3 * 16(l);
+
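+/*
+ * The pseudocode fragments inside fls16 above, collected into one scalar
+ * sketch (per 32-bit word; ll/lr denote the words sliced into l0..l3 and
+ * l4..l7, rl/rr those held in r[0..3] and r[4..7]):
+ *
+ *	lr ^= rol32(ll & kll, 1);
+ *	rl ^= (rr | krr);
+ *	rr ^= rol32(rl & krl, 1);
+ *	ll ^= (lr | klr);
+ *
+ * i.e. Camellia's FL on one half and FL^-1 on the other, performed for all
+ * 16 blocks at once in the byte-sliced domain.
+ */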
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x1, x0, x0; \
+ \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x2; \
+ \
+ vpunpckhqdq t1, x0, x1; \
+ vpunpcklqdq t1, x0, x0; \
+ \
+ vpunpckhqdq x2, t2, x3; \
+ vpunpcklqdq x2, t2, x2;
+
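+/*
+ * transpose_4x4 above is the usual unpack-based transpose of a 4x4 matrix
+ * of 32-bit lanes; a rough scalar sketch, with x0..x3 as the matrix rows:
+ *
+ *	for (int i = 0; i < 4; i++)
+ *		for (int j = 0; j < 4; j++)
+ *			out[i][j] = in[j][i];
+ */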
+#define byteslice_16x16b(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, \
+ b3, c3, d3, st0, st1) \
+ vmovdqu d2, st0; \
+ vmovdqu d3, st1; \
+ transpose_4x4(a0, a1, a2, a3, d2, d3); \
+ transpose_4x4(b0, b1, b2, b3, d2, d3); \
+ vmovdqu st0, d2; \
+ vmovdqu st1, d3; \
+ \
+ vmovdqu a0, st0; \
+ vmovdqu a1, st1; \
+ transpose_4x4(c0, c1, c2, c3, a0, a1); \
+ transpose_4x4(d0, d1, d2, d3, a0, a1); \
+ \
+ vmovdqu .Lshufb_16x16b(%rip), a0; \
+ vmovdqu st1, a1; \
+ vpshufb a0, a2, a2; \
+ vpshufb a0, a3, a3; \
+ vpshufb a0, b0, b0; \
+ vpshufb a0, b1, b1; \
+ vpshufb a0, b2, b2; \
+ vpshufb a0, b3, b3; \
+ vpshufb a0, a1, a1; \
+ vpshufb a0, c0, c0; \
+ vpshufb a0, c1, c1; \
+ vpshufb a0, c2, c2; \
+ vpshufb a0, c3, c3; \
+ vpshufb a0, d0, d0; \
+ vpshufb a0, d1, d1; \
+ vpshufb a0, d2, d2; \
+ vpshufb a0, d3, d3; \
+ vmovdqu d3, st1; \
+ vmovdqu st0, d3; \
+ vpshufb a0, d3, a0; \
+ vmovdqu d2, st0; \
+ \
+ transpose_4x4(a0, b0, c0, d0, d2, d3); \
+ transpose_4x4(a1, b1, c1, d1, d2, d3); \
+ vmovdqu st0, d2; \
+ vmovdqu st1, d3; \
+ \
+ vmovdqu b0, st0; \
+ vmovdqu b1, st1; \
+ transpose_4x4(a2, b2, c2, d2, b0, b1); \
+ transpose_4x4(a3, b3, c3, d3, b0, b1); \
+ vmovdqu st0, b0; \
+ vmovdqu st1, b1; \
+ /* does not adjust output bytes inside vectors */
+
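+/*
+ * byteslice_16x16b above rearranges 16 blocks of 16 bytes so that each
+ * register collects the same byte position from every block, roughly
+ * sliced[b][i] = blocks[i][b] for 0 <= b, i < 16; per the trailing note,
+ * the 16 bytes inside each register are not reordered into block order.
+ * Byte-slicing is what allows one vpshufb or vaesenclast to process that
+ * byte position of all 16 blocks at once.
+ */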
+/* load blocks to registers and apply pre-whitening */
+#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, rio, key) \
+ vmovq key, x0; \
+ vpshufb .Lpack_bswap(%rip), x0, x0; \
+ \
+ vpxor 0 * 16(rio), x0, y7; \
+ vpxor 1 * 16(rio), x0, y6; \
+ vpxor 2 * 16(rio), x0, y5; \
+ vpxor 3 * 16(rio), x0, y4; \
+ vpxor 4 * 16(rio), x0, y3; \
+ vpxor 5 * 16(rio), x0, y2; \
+ vpxor 6 * 16(rio), x0, y1; \
+ vpxor 7 * 16(rio), x0, y0; \
+ vpxor 8 * 16(rio), x0, x7; \
+ vpxor 9 * 16(rio), x0, x6; \
+ vpxor 10 * 16(rio), x0, x5; \
+ vpxor 11 * 16(rio), x0, x4; \
+ vpxor 12 * 16(rio), x0, x3; \
+ vpxor 13 * 16(rio), x0, x2; \
+ vpxor 14 * 16(rio), x0, x1; \
+ vpxor 15 * 16(rio), x0, x0;
+
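+/*
+ * inpack16_pre above applies the pre-whitening: the 64-bit value at `key'
+ * is loaded, its two 32-bit halves are byte-swapped into the low eight
+ * bytes of a register and the high eight bytes are zeroed (the 0x80
+ * entries of .Lpack_bswap select zero in vpshufb), and the result is
+ * XORed into each of the 16 blocks as they are read from rio.
+ */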
+/* byteslice pre-whitened blocks and store to temporary memory */
+#define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd) \
+ byteslice_16x16b(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
+ y5, y6, y7, (mem_ab), (mem_cd)); \
+ \
+ vmovdqu x0, 0 * 16(mem_ab); \
+ vmovdqu x1, 1 * 16(mem_ab); \
+ vmovdqu x2, 2 * 16(mem_ab); \
+ vmovdqu x3, 3 * 16(mem_ab); \
+ vmovdqu x4, 4 * 16(mem_ab); \
+ vmovdqu x5, 5 * 16(mem_ab); \
+ vmovdqu x6, 6 * 16(mem_ab); \
+ vmovdqu x7, 7 * 16(mem_ab); \
+ vmovdqu y0, 0 * 16(mem_cd); \
+ vmovdqu y1, 1 * 16(mem_cd); \
+ vmovdqu y2, 2 * 16(mem_cd); \
+ vmovdqu y3, 3 * 16(mem_cd); \
+ vmovdqu y4, 4 * 16(mem_cd); \
+ vmovdqu y5, 5 * 16(mem_cd); \
+ vmovdqu y6, 6 * 16(mem_cd); \
+ vmovdqu y7, 7 * 16(mem_cd);
+
+/* de-byteslice, apply post-whitening and store blocks */
+#define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
+ y5, y6, y7, key, stack_tmp0, stack_tmp1) \
+ byteslice_16x16b(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, y3, \
+ y7, x3, x7, stack_tmp0, stack_tmp1); \
+ \
+ vmovdqu x0, stack_tmp0; \
+ \
+ vmovq key, x0; \
+ vpshufb .Lpack_bswap(%rip), x0, x0; \
+ \
+ vpxor x0, y7, y7; \
+ vpxor x0, y6, y6; \
+ vpxor x0, y5, y5; \
+ vpxor x0, y4, y4; \
+ vpxor x0, y3, y3; \
+ vpxor x0, y2, y2; \
+ vpxor x0, y1, y1; \
+ vpxor x0, y0, y0; \
+ vpxor x0, x7, x7; \
+ vpxor x0, x6, x6; \
+ vpxor x0, x5, x5; \
+ vpxor x0, x4, x4; \
+ vpxor x0, x3, x3; \
+ vpxor x0, x2, x2; \
+ vpxor x0, x1, x1; \
+ vpxor stack_tmp0, x0, x0;
+
+#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, rio) \
+ vmovdqu x0, 0 * 16(rio); \
+ vmovdqu x1, 1 * 16(rio); \
+ vmovdqu x2, 2 * 16(rio); \
+ vmovdqu x3, 3 * 16(rio); \
+ vmovdqu x4, 4 * 16(rio); \
+ vmovdqu x5, 5 * 16(rio); \
+ vmovdqu x6, 6 * 16(rio); \
+ vmovdqu x7, 7 * 16(rio); \
+ vmovdqu y0, 8 * 16(rio); \
+ vmovdqu y1, 9 * 16(rio); \
+ vmovdqu y2, 10 * 16(rio); \
+ vmovdqu y3, 11 * 16(rio); \
+ vmovdqu y4, 12 * 16(rio); \
+ vmovdqu y5, 13 * 16(rio); \
+ vmovdqu y6, 14 * 16(rio); \
+ vmovdqu y7, 15 * 16(rio);
+
+
+/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
+.section .rodata.cst16, "aM", @progbits, 16
+.align 16
+
+#define SHUFB_BYTES(idx) \
+ 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
+
+.Lshufb_16x16b:
+ .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
+
+.Lpack_bswap:
+ .long 0x00010203
+ .long 0x04050607
+ .long 0x80808080
+ .long 0x80808080
+
+/*
+ * pre-SubByte transform
+ *
+ * pre-lookup for sbox1, sbox2, sbox3:
+ * swap_bitendianness(
+ * isom_map_camellia_to_aes(
+ * camellia_f(
+ *       swap_bitendianness(in)
+ * )
+ * )
+ * )
+ *
+ * (note: '⊕ 0xc5' inside camellia_f())
+ */
+.Lpre_tf_lo_s1:
+ .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
+ .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
+.Lpre_tf_hi_s1:
+ .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
+ .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23
+
+/*
+ * pre-SubByte transform
+ *
+ * pre-lookup for sbox4:
+ * swap_bitendianness(
+ * isom_map_camellia_to_aes(
+ * camellia_f(
+ *       swap_bitendianness(in <<< 1)
+ * )
+ * )
+ * )
+ *
+ * (note: '⊕ 0xc5' inside camellia_f())
+ */
+.Lpre_tf_lo_s4:
+ .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
+ .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
+.Lpre_tf_hi_s4:
+ .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
+ .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox1, sbox4:
+ * swap_bitendianness(
+ * camellia_h(
+ * isom_map_aes_to_camellia(
+ * swap_bitendianness(
+ * aes_inverse_affine_transform(in)
+ * )
+ * )
+ * )
+ * )
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s1:
+ .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
+ .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
+.Lpost_tf_hi_s1:
+ .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
+ .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox2:
+ * swap_bitendianness(
+ * camellia_h(
+ * isom_map_aes_to_camellia(
+ * swap_bitendianness(
+ * aes_inverse_affine_transform(in)
+ * )
+ * )
+ * )
+ * ) <<< 1
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s2:
+ .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
+ .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
+.Lpost_tf_hi_s2:
+ .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
+ .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox3:
+ * swap_bitendianness(
+ * camellia_h(
+ * isom_map_aes_to_camellia(
+ * swap_bitendianness(
+ * aes_inverse_affine_transform(in)
+ * )
+ * )
+ * )
+ * ) >>> 1
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s3:
+ .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
+ .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
+.Lpost_tf_hi_s3:
+ .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
+ .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06
+
+/* For isolating SubBytes from AESENCLAST, inverse shift row */
+.Linv_shift_row:
+ .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
+ .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
+
+/* 4-bit mask */
+.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
+.align 4
+.L0f0f0f0f:
+ .long 0x0f0f0f0f
+
+.text
+
+SYM_FUNC_START_LOCAL(__camellia_enc_blk16)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rax: temporary storage, 256 bytes
+ * %xmm0..%xmm15: 16 plaintext blocks
+ * output:
+ * %xmm0..%xmm15: 16 encrypted blocks, order swapped:
+	 *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
+ */
+ FRAME_BEGIN
+
+ leaq 8 * 16(%rax), %rcx;
+
+ inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rcx);
+
+ enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rcx, 0);
+
+ fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15,
+ ((key_table + (8) * 8) + 0)(CTX),
+ ((key_table + (8) * 8) + 4)(CTX),
+ ((key_table + (8) * 8) + 8)(CTX),
+ ((key_table + (8) * 8) + 12)(CTX));
+
+ enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rcx, 8);
+
+ fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15,
+ ((key_table + (16) * 8) + 0)(CTX),
+ ((key_table + (16) * 8) + 4)(CTX),
+ ((key_table + (16) * 8) + 8)(CTX),
+ ((key_table + (16) * 8) + 12)(CTX));
+
+ enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rcx, 16);
+
+ movl $24, %r8d;
+ cmpl $16, key_length(CTX);
+ jne .Lenc_max32;
+
+.Lenc_done:
+ /* load CD for output */
+ vmovdqu 0 * 16(%rcx), %xmm8;
+ vmovdqu 1 * 16(%rcx), %xmm9;
+ vmovdqu 2 * 16(%rcx), %xmm10;
+ vmovdqu 3 * 16(%rcx), %xmm11;
+ vmovdqu 4 * 16(%rcx), %xmm12;
+ vmovdqu 5 * 16(%rcx), %xmm13;
+ vmovdqu 6 * 16(%rcx), %xmm14;
+ vmovdqu 7 * 16(%rcx), %xmm15;
+
+ outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax));
+
+ FRAME_END
+ RET;
+
+.align 8
+.Lenc_max32:
+ movl $32, %r8d;
+
+ fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15,
+ ((key_table + (24) * 8) + 0)(CTX),
+ ((key_table + (24) * 8) + 4)(CTX),
+ ((key_table + (24) * 8) + 8)(CTX),
+ ((key_table + (24) * 8) + 12)(CTX));
+
+ enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rcx, 24);
+
+ jmp .Lenc_done;
+SYM_FUNC_END(__camellia_enc_blk16)
+
+SYM_FUNC_START_LOCAL(__camellia_dec_blk16)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rax: temporary storage, 256 bytes
+ * %r8d: 24 for 16 byte key, 32 for larger
+ * %xmm0..%xmm15: 16 encrypted blocks
+ * output:
+ * %xmm0..%xmm15: 16 plaintext blocks, order swapped:
+	 *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
+ */
+ FRAME_BEGIN
+
+ leaq 8 * 16(%rax), %rcx;
+
+ inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rcx);
+
+ cmpl $32, %r8d;
+ je .Ldec_max32;
+
+.Ldec_max24:
+ dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rcx, 16);
+
+ fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15,
+ ((key_table + (16) * 8) + 8)(CTX),
+ ((key_table + (16) * 8) + 12)(CTX),
+ ((key_table + (16) * 8) + 0)(CTX),
+ ((key_table + (16) * 8) + 4)(CTX));
+
+ dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rcx, 8);
+
+ fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15,
+ ((key_table + (8) * 8) + 8)(CTX),
+ ((key_table + (8) * 8) + 12)(CTX),
+ ((key_table + (8) * 8) + 0)(CTX),
+ ((key_table + (8) * 8) + 4)(CTX));
+
+ dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rcx, 0);
+
+ /* load CD for output */
+ vmovdqu 0 * 16(%rcx), %xmm8;
+ vmovdqu 1 * 16(%rcx), %xmm9;
+ vmovdqu 2 * 16(%rcx), %xmm10;
+ vmovdqu 3 * 16(%rcx), %xmm11;
+ vmovdqu 4 * 16(%rcx), %xmm12;
+ vmovdqu 5 * 16(%rcx), %xmm13;
+ vmovdqu 6 * 16(%rcx), %xmm14;
+ vmovdqu 7 * 16(%rcx), %xmm15;
+
+ outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax));
+
+ FRAME_END
+ RET;
+
+.align 8
+.Ldec_max32:
+ dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rcx, 24);
+
+ fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15,
+ ((key_table + (24) * 8) + 8)(CTX),
+ ((key_table + (24) * 8) + 12)(CTX),
+ ((key_table + (24) * 8) + 0)(CTX),
+ ((key_table + (24) * 8) + 4)(CTX));
+
+ jmp .Ldec_max24;
+SYM_FUNC_END(__camellia_dec_blk16)
+
+SYM_TYPED_FUNC_START(camellia_ecb_enc_16way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ */
+ FRAME_BEGIN
+
+ inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rdx, (key_table)(CTX));
+
+ /* now dst can be used as temporary buffer (even in src == dst case) */
+ movq %rsi, %rax;
+
+ call __camellia_enc_blk16;
+
+ write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+ %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+ %xmm8, %rsi);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(camellia_ecb_enc_16way)
+
+SYM_TYPED_FUNC_START(camellia_ecb_dec_16way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ */
+ FRAME_BEGIN
+
+ cmpl $16, key_length(CTX);
+ movl $32, %r8d;
+ movl $24, %eax;
+ cmovel %eax, %r8d; /* max */
+
+ inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rdx, (key_table)(CTX, %r8, 8));
+
+ /* now dst can be used as temporary buffer (even in src == dst case) */
+ movq %rsi, %rax;
+
+ call __camellia_dec_blk16;
+
+ write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+ %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+ %xmm8, %rsi);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(camellia_ecb_dec_16way)
+
+SYM_TYPED_FUNC_START(camellia_cbc_dec_16way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ */
+ FRAME_BEGIN
+
+ cmpl $16, key_length(CTX);
+ movl $32, %r8d;
+ movl $24, %eax;
+ cmovel %eax, %r8d; /* max */
+
+ inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rdx, (key_table)(CTX, %r8, 8));
+
+ /*
+	 * dst might still be in use (in case dst == src), so use the stack for
+ * temporary storage.
+ */
+ subq $(16 * 16), %rsp;
+ movq %rsp, %rax;
+
+ call __camellia_dec_blk16;
+
+ addq $(16 * 16), %rsp;
+
+ vpxor (0 * 16)(%rdx), %xmm6, %xmm6;
+ vpxor (1 * 16)(%rdx), %xmm5, %xmm5;
+ vpxor (2 * 16)(%rdx), %xmm4, %xmm4;
+ vpxor (3 * 16)(%rdx), %xmm3, %xmm3;
+ vpxor (4 * 16)(%rdx), %xmm2, %xmm2;
+ vpxor (5 * 16)(%rdx), %xmm1, %xmm1;
+ vpxor (6 * 16)(%rdx), %xmm0, %xmm0;
+ vpxor (7 * 16)(%rdx), %xmm15, %xmm15;
+ vpxor (8 * 16)(%rdx), %xmm14, %xmm14;
+ vpxor (9 * 16)(%rdx), %xmm13, %xmm13;
+ vpxor (10 * 16)(%rdx), %xmm12, %xmm12;
+ vpxor (11 * 16)(%rdx), %xmm11, %xmm11;
+ vpxor (12 * 16)(%rdx), %xmm10, %xmm10;
+ vpxor (13 * 16)(%rdx), %xmm9, %xmm9;
+ vpxor (14 * 16)(%rdx), %xmm8, %xmm8;
+ write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+ %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+ %xmm8, %rsi);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(camellia_cbc_dec_16way)
diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
new file mode 100644
index 000000000000..b1c9b9450555
--- /dev/null
+++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
@@ -0,0 +1,1048 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * x86_64/AVX2/AES-NI assembler implementation of Camellia
+ *
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ */
+
+#include <linux/linkage.h>
+#include <linux/cfi_types.h>
+#include <asm/frame.h>
+
+#define CAMELLIA_TABLE_BYTE_LEN 272
+
+/* struct camellia_ctx: */
+#define key_table 0
+#define key_length CAMELLIA_TABLE_BYTE_LEN
+
+/* register macros */
+#define CTX %rdi
+#define RIO %r8
+
+/**********************************************************************
+ helper macros
+ **********************************************************************/
+#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
+ vpand x, mask4bit, tmp0; \
+ vpandn x, mask4bit, x; \
+ vpsrld $4, x, x; \
+ \
+ vpshufb tmp0, lo_t, tmp0; \
+ vpshufb x, hi_t, x; \
+ vpxor tmp0, x, x;
+
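+/*
+ * filter_8bit above evaluates a byte-wise transform that is expressed as
+ * two 16-entry vpshufb tables; roughly, per byte b of x:
+ *
+ *	x = lo_t[b & 0x0f] ^ hi_t[b >> 4];
+ *
+ * Clearing the low nibbles before the vpsrld-by-4 keeps bytes independent:
+ * the bits shifted in from the neighbouring byte are already zero, so the
+ * dword shift acts as a per-byte nibble shift.
+ */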
+#define ymm0_x xmm0
+#define ymm1_x xmm1
+#define ymm2_x xmm2
+#define ymm3_x xmm3
+#define ymm4_x xmm4
+#define ymm5_x xmm5
+#define ymm6_x xmm6
+#define ymm7_x xmm7
+#define ymm8_x xmm8
+#define ymm9_x xmm9
+#define ymm10_x xmm10
+#define ymm11_x xmm11
+#define ymm12_x xmm12
+#define ymm13_x xmm13
+#define ymm14_x xmm14
+#define ymm15_x xmm15
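+
+/*
+ * The defines above back the `##_x' token pasting used below: appending
+ * `_x' to a %ymmN macro argument yields %ymmN_x, which the preprocessor
+ * rewrites to %xmmN, the low 128-bit half of the register.  For example,
+ * with x0 = %ymm0, `x0##_x' becomes %xmm0.  This appears to be needed
+ * because vaesenclast (without VAES) and vextracti128/vinserti128 work on
+ * 128-bit xmm operands only.
+ */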
+
+/**********************************************************************
+ 32-way camellia
+ **********************************************************************/
+
+/*
+ * IN:
+ * x0..x7: byte-sliced AB state
+ * mem_cd: register pointer storing CD state
+ * key: index for key material
+ * OUT:
+ * x0..x7: new byte-sliced CD state
+ */
+#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
+ t7, mem_cd, key) \
+ /* \
+ * S-function with AES subbytes \
+ */ \
+ vbroadcasti128 .Linv_shift_row(%rip), t4; \
+ vpbroadcastd .L0f0f0f0f(%rip), t7; \
+ vbroadcasti128 .Lpre_tf_lo_s1(%rip), t5; \
+ vbroadcasti128 .Lpre_tf_hi_s1(%rip), t6; \
+ vbroadcasti128 .Lpre_tf_lo_s4(%rip), t2; \
+ vbroadcasti128 .Lpre_tf_hi_s4(%rip), t3; \
+ \
+ /* AES inverse shift rows */ \
+ vpshufb t4, x0, x0; \
+ vpshufb t4, x7, x7; \
+ vpshufb t4, x3, x3; \
+ vpshufb t4, x6, x6; \
+ vpshufb t4, x2, x2; \
+ vpshufb t4, x5, x5; \
+ vpshufb t4, x1, x1; \
+ vpshufb t4, x4, x4; \
+ \
+ /* prefilter sboxes 1, 2 and 3 */ \
+ /* prefilter sbox 4 */ \
+ filter_8bit(x0, t5, t6, t7, t4); \
+ filter_8bit(x7, t5, t6, t7, t4); \
+ vextracti128 $1, x0, t0##_x; \
+ vextracti128 $1, x7, t1##_x; \
+ filter_8bit(x3, t2, t3, t7, t4); \
+ filter_8bit(x6, t2, t3, t7, t4); \
+ vextracti128 $1, x3, t3##_x; \
+ vextracti128 $1, x6, t2##_x; \
+ filter_8bit(x2, t5, t6, t7, t4); \
+ filter_8bit(x5, t5, t6, t7, t4); \
+ filter_8bit(x1, t5, t6, t7, t4); \
+ filter_8bit(x4, t5, t6, t7, t4); \
+ \
+ vpxor t4##_x, t4##_x, t4##_x; \
+ \
+ /* AES subbytes + AES shift rows */ \
+ vextracti128 $1, x2, t6##_x; \
+ vextracti128 $1, x5, t5##_x; \
+ vaesenclast t4##_x, x0##_x, x0##_x; \
+ vaesenclast t4##_x, t0##_x, t0##_x; \
+ vinserti128 $1, t0##_x, x0, x0; \
+ vaesenclast t4##_x, x7##_x, x7##_x; \
+ vaesenclast t4##_x, t1##_x, t1##_x; \
+ vinserti128 $1, t1##_x, x7, x7; \
+ vaesenclast t4##_x, x3##_x, x3##_x; \
+ vaesenclast t4##_x, t3##_x, t3##_x; \
+ vinserti128 $1, t3##_x, x3, x3; \
+ vaesenclast t4##_x, x6##_x, x6##_x; \
+ vaesenclast t4##_x, t2##_x, t2##_x; \
+ vinserti128 $1, t2##_x, x6, x6; \
+ vextracti128 $1, x1, t3##_x; \
+ vextracti128 $1, x4, t2##_x; \
+ vbroadcasti128 .Lpost_tf_lo_s1(%rip), t0; \
+ vbroadcasti128 .Lpost_tf_hi_s1(%rip), t1; \
+ vaesenclast t4##_x, x2##_x, x2##_x; \
+ vaesenclast t4##_x, t6##_x, t6##_x; \
+ vinserti128 $1, t6##_x, x2, x2; \
+ vaesenclast t4##_x, x5##_x, x5##_x; \
+ vaesenclast t4##_x, t5##_x, t5##_x; \
+ vinserti128 $1, t5##_x, x5, x5; \
+ vaesenclast t4##_x, x1##_x, x1##_x; \
+ vaesenclast t4##_x, t3##_x, t3##_x; \
+ vinserti128 $1, t3##_x, x1, x1; \
+ vaesenclast t4##_x, x4##_x, x4##_x; \
+ vaesenclast t4##_x, t2##_x, t2##_x; \
+ vinserti128 $1, t2##_x, x4, x4; \
+ \
+ /* postfilter sboxes 1 and 4 */ \
+ vbroadcasti128 .Lpost_tf_lo_s3(%rip), t2; \
+ vbroadcasti128 .Lpost_tf_hi_s3(%rip), t3; \
+ filter_8bit(x0, t0, t1, t7, t6); \
+ filter_8bit(x7, t0, t1, t7, t6); \
+ filter_8bit(x3, t0, t1, t7, t6); \
+ filter_8bit(x6, t0, t1, t7, t6); \
+ \
+ /* postfilter sbox 3 */ \
+ vbroadcasti128 .Lpost_tf_lo_s2(%rip), t4; \
+ vbroadcasti128 .Lpost_tf_hi_s2(%rip), t5; \
+ filter_8bit(x2, t2, t3, t7, t6); \
+ filter_8bit(x5, t2, t3, t7, t6); \
+ \
+ vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \
+ \
+ /* postfilter sbox 2 */ \
+ filter_8bit(x1, t4, t5, t7, t2); \
+ filter_8bit(x4, t4, t5, t7, t2); \
+ vpxor t7, t7, t7; \
+ \
+ vpsrldq $1, t0, t1; \
+ vpsrldq $2, t0, t2; \
+ vpshufb t7, t1, t1; \
+ vpsrldq $3, t0, t3; \
+ \
+ /* P-function */ \
+ vpxor x5, x0, x0; \
+ vpxor x6, x1, x1; \
+ vpxor x7, x2, x2; \
+ vpxor x4, x3, x3; \
+ \
+ vpshufb t7, t2, t2; \
+ vpsrldq $4, t0, t4; \
+ vpshufb t7, t3, t3; \
+ vpsrldq $5, t0, t5; \
+ vpshufb t7, t4, t4; \
+ \
+ vpxor x2, x4, x4; \
+ vpxor x3, x5, x5; \
+ vpxor x0, x6, x6; \
+ vpxor x1, x7, x7; \
+ \
+ vpsrldq $6, t0, t6; \
+ vpshufb t7, t5, t5; \
+ vpshufb t7, t6, t6; \
+ \
+ vpxor x7, x0, x0; \
+ vpxor x4, x1, x1; \
+ vpxor x5, x2, x2; \
+ vpxor x6, x3, x3; \
+ \
+ vpxor x3, x4, x4; \
+ vpxor x0, x5, x5; \
+ vpxor x1, x6, x6; \
+ vpxor x2, x7, x7; /* note: high and low parts swapped */ \
+ \
+ /* Add key material and result to CD (x becomes new CD) */ \
+ \
+ vpxor t6, x1, x1; \
+ vpxor 5 * 32(mem_cd), x1, x1; \
+ \
+ vpsrldq $7, t0, t6; \
+ vpshufb t7, t0, t0; \
+ vpshufb t7, t6, t7; \
+ \
+ vpxor t7, x0, x0; \
+ vpxor 4 * 32(mem_cd), x0, x0; \
+ \
+ vpxor t5, x2, x2; \
+ vpxor 6 * 32(mem_cd), x2, x2; \
+ \
+ vpxor t4, x3, x3; \
+ vpxor 7 * 32(mem_cd), x3, x3; \
+ \
+ vpxor t3, x4, x4; \
+ vpxor 0 * 32(mem_cd), x4, x4; \
+ \
+ vpxor t2, x5, x5; \
+ vpxor 1 * 32(mem_cd), x5, x5; \
+ \
+ vpxor t1, x6, x6; \
+ vpxor 2 * 32(mem_cd), x6, x6; \
+ \
+ vpxor t0, x7, x7; \
+ vpxor 3 * 32(mem_cd), x7, x7;
+
+/*
+ * Size optimization... with inlined roundsm32, the binary would be over 5
+ * times larger and only marginally faster.
+ */
+SYM_FUNC_START_LOCAL(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
+ roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
+ %rcx, (%r9));
+ RET;
+SYM_FUNC_END(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
+
+SYM_FUNC_START_LOCAL(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
+ roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3,
+ %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11,
+ %rax, (%r9));
+ RET;
+SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
+
+/*
+ * IN/OUT:
+ * x0..x7: byte-sliced AB state preloaded
+ * mem_ab: byte-sliced AB state in memory
+ *  mem_cd: byte-sliced CD state in memory
+ */
+#define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
+ leaq (key_table + (i) * 8)(CTX), %r9; \
+ call roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
+ \
+ vmovdqu x0, 4 * 32(mem_cd); \
+ vmovdqu x1, 5 * 32(mem_cd); \
+ vmovdqu x2, 6 * 32(mem_cd); \
+ vmovdqu x3, 7 * 32(mem_cd); \
+ vmovdqu x4, 0 * 32(mem_cd); \
+ vmovdqu x5, 1 * 32(mem_cd); \
+ vmovdqu x6, 2 * 32(mem_cd); \
+ vmovdqu x7, 3 * 32(mem_cd); \
+ \
+ leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
+ call roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
+ \
+ store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);
+
+#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */
+
+#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
+ /* Store new AB state */ \
+ vmovdqu x4, 4 * 32(mem_ab); \
+ vmovdqu x5, 5 * 32(mem_ab); \
+ vmovdqu x6, 6 * 32(mem_ab); \
+ vmovdqu x7, 7 * 32(mem_ab); \
+ vmovdqu x0, 0 * 32(mem_ab); \
+ vmovdqu x1, 1 * 32(mem_ab); \
+ vmovdqu x2, 2 * 32(mem_ab); \
+ vmovdqu x3, 3 * 32(mem_ab);
+
+#define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, i) \
+ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
+ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
+ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);
+
+#define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, i) \
+ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
+ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
+ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
+
+/*
+ * IN:
+ * v0..3: byte-sliced 32-bit integers
+ * OUT:
+ * v0..3: (IN <<< 1)
+ */
+#define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \
+ vpcmpgtb v0, zero, t0; \
+ vpaddb v0, v0, v0; \
+ vpabsb t0, t0; \
+ \
+ vpcmpgtb v1, zero, t1; \
+ vpaddb v1, v1, v1; \
+ vpabsb t1, t1; \
+ \
+ vpcmpgtb v2, zero, t2; \
+ vpaddb v2, v2, v2; \
+ vpabsb t2, t2; \
+ \
+ vpor t0, v1, v1; \
+ \
+ vpcmpgtb v3, zero, t0; \
+ vpaddb v3, v3, v3; \
+ vpabsb t0, t0; \
+ \
+ vpor t1, v2, v2; \
+ vpor t2, v3, v3; \
+ vpor t0, v0, v0;
+
+/*
+ * IN/OUT:
+ *  l0..l7: byte-sliced AB state preloaded in registers
+ *  l: byte-sliced AB state in memory
+ *  r: byte-sliced CD state in memory
+ * (the updated AB and CD state is written back to l and r)
+ */
+#define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
+ tt1, tt2, tt3, kll, klr, krl, krr) \
+ /* \
+ * t0 = kll; \
+ * t0 &= ll; \
+ * lr ^= rol32(t0, 1); \
+ */ \
+ vpbroadcastd kll, t0; /* only lowest 32-bit used */ \
+ vpxor tt0, tt0, tt0; \
+ vpshufb tt0, t0, t3; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t2; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t1; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t0; \
+ \
+ vpand l0, t0, t0; \
+ vpand l1, t1, t1; \
+ vpand l2, t2, t2; \
+ vpand l3, t3, t3; \
+ \
+ rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
+ \
+ vpxor l4, t0, l4; \
+ vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
+ vmovdqu l4, 4 * 32(l); \
+ vpxor l5, t1, l5; \
+ vmovdqu l5, 5 * 32(l); \
+ vpxor l6, t2, l6; \
+ vmovdqu l6, 6 * 32(l); \
+ vpxor l7, t3, l7; \
+ vmovdqu l7, 7 * 32(l); \
+ \
+ /* \
+ * t2 = krr; \
+ * t2 |= rr; \
+ * rl ^= t2; \
+ */ \
+ \
+ vpshufb tt0, t0, t3; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t2; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t1; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t0; \
+ \
+ vpor 4 * 32(r), t0, t0; \
+ vpor 5 * 32(r), t1, t1; \
+ vpor 6 * 32(r), t2, t2; \
+ vpor 7 * 32(r), t3, t3; \
+ \
+ vpxor 0 * 32(r), t0, t0; \
+ vpxor 1 * 32(r), t1, t1; \
+ vpxor 2 * 32(r), t2, t2; \
+ vpxor 3 * 32(r), t3, t3; \
+ vmovdqu t0, 0 * 32(r); \
+ vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
+ vmovdqu t1, 1 * 32(r); \
+ vmovdqu t2, 2 * 32(r); \
+ vmovdqu t3, 3 * 32(r); \
+ \
+ /* \
+ * t2 = krl; \
+ * t2 &= rl; \
+ * rr ^= rol32(t2, 1); \
+ */ \
+ vpshufb tt0, t0, t3; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t2; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t1; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t0; \
+ \
+ vpand 0 * 32(r), t0, t0; \
+ vpand 1 * 32(r), t1, t1; \
+ vpand 2 * 32(r), t2, t2; \
+ vpand 3 * 32(r), t3, t3; \
+ \
+ rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
+ \
+ vpxor 4 * 32(r), t0, t0; \
+ vpxor 5 * 32(r), t1, t1; \
+ vpxor 6 * 32(r), t2, t2; \
+ vpxor 7 * 32(r), t3, t3; \
+ vmovdqu t0, 4 * 32(r); \
+ vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
+ vmovdqu t1, 5 * 32(r); \
+ vmovdqu t2, 6 * 32(r); \
+ vmovdqu t3, 7 * 32(r); \
+ \
+ /* \
+ * t0 = klr; \
+ * t0 |= lr; \
+ * ll ^= t0; \
+ */ \
+ \
+ vpshufb tt0, t0, t3; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t2; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t1; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t0; \
+ \
+ vpor l4, t0, t0; \
+ vpor l5, t1, t1; \
+ vpor l6, t2, t2; \
+ vpor l7, t3, t3; \
+ \
+ vpxor l0, t0, l0; \
+ vmovdqu l0, 0 * 32(l); \
+ vpxor l1, t1, l1; \
+ vmovdqu l1, 1 * 32(l); \
+ vpxor l2, t2, l2; \
+ vmovdqu l2, 2 * 32(l); \
+ vpxor l3, t3, l3; \
+ vmovdqu l3, 3 * 32(l);
+
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x1, x0, x0; \
+ \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x2; \
+ \
+ vpunpckhqdq t1, x0, x1; \
+ vpunpcklqdq t1, x0, x0; \
+ \
+ vpunpckhqdq x2, t2, x3; \
+ vpunpcklqdq x2, t2, x2;
+
+#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \
+ a3, b3, c3, d3, st0, st1) \
+ vmovdqu d2, st0; \
+ vmovdqu d3, st1; \
+ transpose_4x4(a0, a1, a2, a3, d2, d3); \
+ transpose_4x4(b0, b1, b2, b3, d2, d3); \
+ vmovdqu st0, d2; \
+ vmovdqu st1, d3; \
+ \
+ vmovdqu a0, st0; \
+ vmovdqu a1, st1; \
+ transpose_4x4(c0, c1, c2, c3, a0, a1); \
+ transpose_4x4(d0, d1, d2, d3, a0, a1); \
+ \
+ vbroadcasti128 .Lshufb_16x16b(%rip), a0; \
+ vmovdqu st1, a1; \
+ vpshufb a0, a2, a2; \
+ vpshufb a0, a3, a3; \
+ vpshufb a0, b0, b0; \
+ vpshufb a0, b1, b1; \
+ vpshufb a0, b2, b2; \
+ vpshufb a0, b3, b3; \
+ vpshufb a0, a1, a1; \
+ vpshufb a0, c0, c0; \
+ vpshufb a0, c1, c1; \
+ vpshufb a0, c2, c2; \
+ vpshufb a0, c3, c3; \
+ vpshufb a0, d0, d0; \
+ vpshufb a0, d1, d1; \
+ vpshufb a0, d2, d2; \
+ vpshufb a0, d3, d3; \
+ vmovdqu d3, st1; \
+ vmovdqu st0, d3; \
+ vpshufb a0, d3, a0; \
+ vmovdqu d2, st0; \
+ \
+ transpose_4x4(a0, b0, c0, d0, d2, d3); \
+ transpose_4x4(a1, b1, c1, d1, d2, d3); \
+ vmovdqu st0, d2; \
+ vmovdqu st1, d3; \
+ \
+ vmovdqu b0, st0; \
+ vmovdqu b1, st1; \
+ transpose_4x4(a2, b2, c2, d2, b0, b1); \
+ transpose_4x4(a3, b3, c3, d3, b0, b1); \
+ vmovdqu st0, b0; \
+ vmovdqu st1, b1; \
+ /* does not adjust output bytes inside vectors */
+
+/* load blocks to registers and apply pre-whitening */
+#define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, rio, key) \
+ vpbroadcastq key, x0; \
+ vpshufb .Lpack_bswap(%rip), x0, x0; \
+ \
+ vpxor 0 * 32(rio), x0, y7; \
+ vpxor 1 * 32(rio), x0, y6; \
+ vpxor 2 * 32(rio), x0, y5; \
+ vpxor 3 * 32(rio), x0, y4; \
+ vpxor 4 * 32(rio), x0, y3; \
+ vpxor 5 * 32(rio), x0, y2; \
+ vpxor 6 * 32(rio), x0, y1; \
+ vpxor 7 * 32(rio), x0, y0; \
+ vpxor 8 * 32(rio), x0, x7; \
+ vpxor 9 * 32(rio), x0, x6; \
+ vpxor 10 * 32(rio), x0, x5; \
+ vpxor 11 * 32(rio), x0, x4; \
+ vpxor 12 * 32(rio), x0, x3; \
+ vpxor 13 * 32(rio), x0, x2; \
+ vpxor 14 * 32(rio), x0, x1; \
+ vpxor 15 * 32(rio), x0, x0;
+
+/* byteslice pre-whitened blocks and store to temporary memory */
+#define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd) \
+ byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \
+ y4, y5, y6, y7, (mem_ab), (mem_cd)); \
+ \
+ vmovdqu x0, 0 * 32(mem_ab); \
+ vmovdqu x1, 1 * 32(mem_ab); \
+ vmovdqu x2, 2 * 32(mem_ab); \
+ vmovdqu x3, 3 * 32(mem_ab); \
+ vmovdqu x4, 4 * 32(mem_ab); \
+ vmovdqu x5, 5 * 32(mem_ab); \
+ vmovdqu x6, 6 * 32(mem_ab); \
+ vmovdqu x7, 7 * 32(mem_ab); \
+ vmovdqu y0, 0 * 32(mem_cd); \
+ vmovdqu y1, 1 * 32(mem_cd); \
+ vmovdqu y2, 2 * 32(mem_cd); \
+ vmovdqu y3, 3 * 32(mem_cd); \
+ vmovdqu y4, 4 * 32(mem_cd); \
+ vmovdqu y5, 5 * 32(mem_cd); \
+ vmovdqu y6, 6 * 32(mem_cd); \
+ vmovdqu y7, 7 * 32(mem_cd);
+
+/* de-byteslice, apply post-whitening and store blocks */
+#define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
+ y5, y6, y7, key, stack_tmp0, stack_tmp1) \
+ byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \
+ y3, y7, x3, x7, stack_tmp0, stack_tmp1); \
+ \
+ vmovdqu x0, stack_tmp0; \
+ \
+ vpbroadcastq key, x0; \
+ vpshufb .Lpack_bswap(%rip), x0, x0; \
+ \
+ vpxor x0, y7, y7; \
+ vpxor x0, y6, y6; \
+ vpxor x0, y5, y5; \
+ vpxor x0, y4, y4; \
+ vpxor x0, y3, y3; \
+ vpxor x0, y2, y2; \
+ vpxor x0, y1, y1; \
+ vpxor x0, y0, y0; \
+ vpxor x0, x7, x7; \
+ vpxor x0, x6, x6; \
+ vpxor x0, x5, x5; \
+ vpxor x0, x4, x4; \
+ vpxor x0, x3, x3; \
+ vpxor x0, x2, x2; \
+ vpxor x0, x1, x1; \
+ vpxor stack_tmp0, x0, x0;
+
+#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, rio) \
+ vmovdqu x0, 0 * 32(rio); \
+ vmovdqu x1, 1 * 32(rio); \
+ vmovdqu x2, 2 * 32(rio); \
+ vmovdqu x3, 3 * 32(rio); \
+ vmovdqu x4, 4 * 32(rio); \
+ vmovdqu x5, 5 * 32(rio); \
+ vmovdqu x6, 6 * 32(rio); \
+ vmovdqu x7, 7 * 32(rio); \
+ vmovdqu y0, 8 * 32(rio); \
+ vmovdqu y1, 9 * 32(rio); \
+ vmovdqu y2, 10 * 32(rio); \
+ vmovdqu y3, 11 * 32(rio); \
+ vmovdqu y4, 12 * 32(rio); \
+ vmovdqu y5, 13 * 32(rio); \
+ vmovdqu y6, 14 * 32(rio); \
+ vmovdqu y7, 15 * 32(rio);
+
+
+.section .rodata.cst32.shufb_16x16b, "aM", @progbits, 32
+.align 32
+#define SHUFB_BYTES(idx) \
+ 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
+.Lshufb_16x16b:
+ .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+ .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+
+.section .rodata.cst32.pack_bswap, "aM", @progbits, 32
+.align 32
+.Lpack_bswap:
+ .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
+ .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
+
+/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
+.section .rodata.cst16, "aM", @progbits, 16
+.align 16
+
+/*
+ * pre-SubByte transform
+ *
+ * pre-lookup for sbox1, sbox2, sbox3:
+ * swap_bitendianness(
+ * isom_map_camellia_to_aes(
+ * camellia_f(
+ *       swap_bitendianness(in)
+ * )
+ * )
+ * )
+ *
+ * (note: '⊕ 0xc5' inside camellia_f())
+ */
+.Lpre_tf_lo_s1:
+ .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
+ .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
+.Lpre_tf_hi_s1:
+ .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
+ .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23
+
+/*
+ * pre-SubByte transform
+ *
+ * pre-lookup for sbox4:
+ * swap_bitendianness(
+ * isom_map_camellia_to_aes(
+ * camellia_f(
+ *       swap_bitendianness(in <<< 1)
+ * )
+ * )
+ * )
+ *
+ * (note: '⊕ 0xc5' inside camellia_f())
+ */
+.Lpre_tf_lo_s4:
+ .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
+ .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
+.Lpre_tf_hi_s4:
+ .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
+ .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox1, sbox4:
+ * swap_bitendianness(
+ * camellia_h(
+ * isom_map_aes_to_camellia(
+ * swap_bitendianness(
+ * aes_inverse_affine_transform(in)
+ * )
+ * )
+ * )
+ * )
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s1:
+ .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
+ .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
+.Lpost_tf_hi_s1:
+ .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
+ .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox2:
+ * swap_bitendianness(
+ * camellia_h(
+ * isom_map_aes_to_camellia(
+ * swap_bitendianness(
+ * aes_inverse_affine_transform(in)
+ * )
+ * )
+ * )
+ * ) <<< 1
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s2:
+ .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
+ .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
+.Lpost_tf_hi_s2:
+ .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
+ .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox3:
+ * swap_bitendianness(
+ * camellia_h(
+ * isom_map_aes_to_camellia(
+ * swap_bitendianness(
+ * aes_inverse_affine_transform(in)
+ * )
+ * )
+ * )
+ * ) >>> 1
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s3:
+ .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
+ .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
+.Lpost_tf_hi_s3:
+ .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
+ .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06
+
+/* For isolating SubBytes from AESENCLAST, inverse shift row */
+.Linv_shift_row:
+ .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
+ .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
+
+.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
+.align 4
+/* 4-bit mask */
+.L0f0f0f0f:
+ .long 0x0f0f0f0f
+
+.text
+
+SYM_FUNC_START_LOCAL(__camellia_enc_blk32)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rax: temporary storage, 512 bytes
+ * %ymm0..%ymm15: 32 plaintext blocks
+ * output:
+ * %ymm0..%ymm15: 32 encrypted blocks, order swapped:
+	 *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
+ */
+ FRAME_BEGIN
+
+ leaq 8 * 32(%rax), %rcx;
+
+ inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %rcx);
+
+ enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %rcx, 0);
+
+ fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15,
+ ((key_table + (8) * 8) + 0)(CTX),
+ ((key_table + (8) * 8) + 4)(CTX),
+ ((key_table + (8) * 8) + 8)(CTX),
+ ((key_table + (8) * 8) + 12)(CTX));
+
+ enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %rcx, 8);
+
+ fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15,
+ ((key_table + (16) * 8) + 0)(CTX),
+ ((key_table + (16) * 8) + 4)(CTX),
+ ((key_table + (16) * 8) + 8)(CTX),
+ ((key_table + (16) * 8) + 12)(CTX));
+
+ enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %rcx, 16);
+
+ movl $24, %r8d;
+ cmpl $16, key_length(CTX);
+ jne .Lenc_max32;
+
+.Lenc_done:
+ /* load CD for output */
+ vmovdqu 0 * 32(%rcx), %ymm8;
+ vmovdqu 1 * 32(%rcx), %ymm9;
+ vmovdqu 2 * 32(%rcx), %ymm10;
+ vmovdqu 3 * 32(%rcx), %ymm11;
+ vmovdqu 4 * 32(%rcx), %ymm12;
+ vmovdqu 5 * 32(%rcx), %ymm13;
+ vmovdqu 6 * 32(%rcx), %ymm14;
+ vmovdqu 7 * 32(%rcx), %ymm15;
+
+ outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax));
+
+ FRAME_END
+ RET;
+
+.align 8
+.Lenc_max32:
+ movl $32, %r8d;
+
+ fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15,
+ ((key_table + (24) * 8) + 0)(CTX),
+ ((key_table + (24) * 8) + 4)(CTX),
+ ((key_table + (24) * 8) + 8)(CTX),
+ ((key_table + (24) * 8) + 12)(CTX));
+
+ enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %rcx, 24);
+
+ jmp .Lenc_done;
+SYM_FUNC_END(__camellia_enc_blk32)
+
+SYM_FUNC_START_LOCAL(__camellia_dec_blk32)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rax: temporary storage, 512 bytes
+ * %r8d: 24 for 16 byte key, 32 for larger
+	 * %ymm0..%ymm15: 32 encrypted blocks
+ * output:
+	 * %ymm0..%ymm15: 32 plaintext blocks, order swapped:
+	 *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
+ */
+ FRAME_BEGIN
+
+ leaq 8 * 32(%rax), %rcx;
+
+ inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %rcx);
+
+ cmpl $32, %r8d;
+ je .Ldec_max32;
+
+.Ldec_max24:
+ dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %rcx, 16);
+
+ fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15,
+ ((key_table + (16) * 8) + 8)(CTX),
+ ((key_table + (16) * 8) + 12)(CTX),
+ ((key_table + (16) * 8) + 0)(CTX),
+ ((key_table + (16) * 8) + 4)(CTX));
+
+ dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %rcx, 8);
+
+ fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15,
+ ((key_table + (8) * 8) + 8)(CTX),
+ ((key_table + (8) * 8) + 12)(CTX),
+ ((key_table + (8) * 8) + 0)(CTX),
+ ((key_table + (8) * 8) + 4)(CTX));
+
+ dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %rcx, 0);
+
+ /* load CD for output */
+ vmovdqu 0 * 32(%rcx), %ymm8;
+ vmovdqu 1 * 32(%rcx), %ymm9;
+ vmovdqu 2 * 32(%rcx), %ymm10;
+ vmovdqu 3 * 32(%rcx), %ymm11;
+ vmovdqu 4 * 32(%rcx), %ymm12;
+ vmovdqu 5 * 32(%rcx), %ymm13;
+ vmovdqu 6 * 32(%rcx), %ymm14;
+ vmovdqu 7 * 32(%rcx), %ymm15;
+
+ outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax));
+
+ FRAME_END
+ RET;
+
+.align 8
+.Ldec_max32:
+ dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %rcx, 24);
+
+ fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15,
+ ((key_table + (24) * 8) + 8)(CTX),
+ ((key_table + (24) * 8) + 12)(CTX),
+ ((key_table + (24) * 8) + 0)(CTX),
+ ((key_table + (24) * 8) + 4)(CTX));
+
+ jmp .Ldec_max24;
+SYM_FUNC_END(__camellia_dec_blk32)
+
+SYM_FUNC_START(camellia_ecb_enc_32way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (32 blocks)
+ * %rdx: src (32 blocks)
+ */
+ FRAME_BEGIN
+
+ vzeroupper;
+
+ inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rdx, (key_table)(CTX));
+
+ /* now dst can be used as temporary buffer (even in src == dst case) */
+ movq %rsi, %rax;
+
+ call __camellia_enc_blk32;
+
+ write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+ %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+ %ymm8, %rsi);
+
+ vzeroupper;
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(camellia_ecb_enc_32way)
+
+SYM_FUNC_START(camellia_ecb_dec_32way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (32 blocks)
+ * %rdx: src (32 blocks)
+ */
+ FRAME_BEGIN
+
+ vzeroupper;
+
+ cmpl $16, key_length(CTX);
+ movl $32, %r8d;
+ movl $24, %eax;
+ cmovel %eax, %r8d; /* max */
+
+ inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rdx, (key_table)(CTX, %r8, 8));
+
+ /* now dst can be used as temporary buffer (even in src == dst case) */
+ movq %rsi, %rax;
+
+ call __camellia_dec_blk32;
+
+ write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+ %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+ %ymm8, %rsi);
+
+ vzeroupper;
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(camellia_ecb_dec_32way)
+
+SYM_FUNC_START(camellia_cbc_dec_32way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (32 blocks)
+ * %rdx: src (32 blocks)
+ */
+ FRAME_BEGIN
+ subq $(16 * 32), %rsp;
+
+ vzeroupper;
+
+ cmpl $16, key_length(CTX);
+ movl $32, %r8d;
+ movl $24, %eax;
+ cmovel %eax, %r8d; /* max */
+
+ inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rdx, (key_table)(CTX, %r8, 8));
+
+ cmpq %rsi, %rdx;
+ je .Lcbc_dec_use_stack;
+
+ /* dst can be used as temporary storage, src is not overwritten. */
+ movq %rsi, %rax;
+ jmp .Lcbc_dec_continue;
+
+.Lcbc_dec_use_stack:
+ /*
+	 * dst is still in use (because dst == src), so use the stack for
+	 * temporary storage.
+ */
+ movq %rsp, %rax;
+
+.Lcbc_dec_continue:
+ call __camellia_dec_blk32;
+
+ vmovdqu %ymm7, (%rax);
+ vpxor %ymm7, %ymm7, %ymm7;
+ vinserti128 $1, (%rdx), %ymm7, %ymm7;
+ vpxor (%rax), %ymm7, %ymm7;
+ vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6;
+ vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5;
+ vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4;
+ vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3;
+ vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2;
+ vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1;
+ vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0;
+ vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15;
+ vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14;
+ vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13;
+ vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12;
+ vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11;
+ vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10;
+ vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9;
+ vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8;
+ write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+ %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+ %ymm8, %rsi);
+
+ vzeroupper;
+
+ addq $(16 * 32), %rsp;
+ FRAME_END
+ RET;
+SYM_FUNC_END(camellia_cbc_dec_32way)
diff --git a/arch/x86/crypto/camellia-x86_64-asm_64.S b/arch/x86/crypto/camellia-x86_64-asm_64.S
new file mode 100644
index 000000000000..824cb94de6c2
--- /dev/null
+++ b/arch/x86/crypto/camellia-x86_64-asm_64.S
@@ -0,0 +1,502 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Camellia Cipher Algorithm (x86_64)
+ *
+ * Copyright (C) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ */
+
+#include <linux/linkage.h>
+#include <linux/cfi_types.h>
+
+.file "camellia-x86_64-asm_64.S"
+.text
+
+.extern camellia_sp10011110;
+.extern camellia_sp22000222;
+.extern camellia_sp03303033;
+.extern camellia_sp00444404;
+.extern camellia_sp02220222;
+.extern camellia_sp30333033;
+.extern camellia_sp44044404;
+.extern camellia_sp11101110;
+
+#define sp10011110 camellia_sp10011110
+#define sp22000222 camellia_sp22000222
+#define sp03303033 camellia_sp03303033
+#define sp00444404 camellia_sp00444404
+#define sp02220222 camellia_sp02220222
+#define sp30333033 camellia_sp30333033
+#define sp44044404 camellia_sp44044404
+#define sp11101110 camellia_sp11101110
+
+#define CAMELLIA_TABLE_BYTE_LEN 272
+
+/* struct camellia_ctx: */
+#define key_table 0
+#define key_length CAMELLIA_TABLE_BYTE_LEN
+
+/* register macros */
+#define CTX %rdi
+#define RIO %rsi
+#define RIOd %esi
+
+#define RAB0 %rax
+#define RCD0 %rcx
+#define RAB1 %rbx
+#define RCD1 %rdx
+
+#define RAB0d %eax
+#define RCD0d %ecx
+#define RAB1d %ebx
+#define RCD1d %edx
+
+#define RAB0bl %al
+#define RCD0bl %cl
+#define RAB1bl %bl
+#define RCD1bl %dl
+
+#define RAB0bh %ah
+#define RCD0bh %ch
+#define RAB1bh %bh
+#define RCD1bh %dh
+
+#define RT0 %rsi
+#define RT1 %r12
+#define RT2 %r8
+
+#define RT0d %esi
+#define RT1d %r12d
+#define RT2d %r8d
+
+#define RT2bl %r8b
+
+#define RXOR %r9
+#define RR12 %r10
+#define RDST %r11
+
+#define RXORd %r9d
+#define RXORbl %r9b
+
+#define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \
+ leaq T0(%rip), tmp1; \
+ movzbl ab ## bl, tmp2 ## d; \
+ xorq (tmp1, tmp2, 8), dst; \
+ leaq T1(%rip), tmp2; \
+ movzbl ab ## bh, tmp1 ## d; \
+ rorq $16, ab; \
+ xorq (tmp2, tmp1, 8), dst;
+
+/**********************************************************************
+ 1-way camellia
+ **********************************************************************/
+#define roundsm(ab, subkey, cd) \
+ movq (key_table + ((subkey) * 2) * 4)(CTX), RT2; \
+ \
+ xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
+ xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
+ xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
+ xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
+ \
+ xorq RT2, cd ## 0;
+
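+/*
+ * A sketch of one roundsm invocation: each xor2ror16 looks up two bytes of
+ * `ab' in two of the sp* tables, XORs the 64-bit results into its
+ * destination and rotates `ab' by 16 bits to expose the next byte pair, so
+ * the round as coded amounts to
+ *
+ *	cd ^= subkey ^ T0[ab.b0] ^ T1[ab.b1] ^ ... ^ T7[ab.b7];
+ *
+ * where each sp* table presumably folds one Camellia S-box together with
+ * its share of the P byte-diffusion, so the eight lookups cover the whole
+ * substitution/permutation layer (T0..T7 and ab.bN are informal names for
+ * this sketch only).
+ */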
+#define fls(l, r, kl, kr) \
+ movl (key_table + ((kl) * 2) * 4)(CTX), RT0d; \
+ andl l ## 0d, RT0d; \
+ roll $1, RT0d; \
+ shlq $32, RT0; \
+ xorq RT0, l ## 0; \
+ movq (key_table + ((kr) * 2) * 4)(CTX), RT1; \
+ orq r ## 0, RT1; \
+ shrq $32, RT1; \
+ xorq RT1, r ## 0; \
+ \
+ movq (key_table + ((kl) * 2) * 4)(CTX), RT2; \
+ orq l ## 0, RT2; \
+ shrq $32, RT2; \
+ xorq RT2, l ## 0; \
+ movl (key_table + ((kr) * 2) * 4)(CTX), RT0d; \
+ andl r ## 0d, RT0d; \
+ roll $1, RT0d; \
+ shlq $32, RT0; \
+ xorq RT0, r ## 0;
+
+#define enc_rounds(i) \
+ roundsm(RAB, i + 2, RCD); \
+ roundsm(RCD, i + 3, RAB); \
+ roundsm(RAB, i + 4, RCD); \
+ roundsm(RCD, i + 5, RAB); \
+ roundsm(RAB, i + 6, RCD); \
+ roundsm(RCD, i + 7, RAB);
+
+#define enc_fls(i) \
+ fls(RAB, RCD, i + 0, i + 1);
+
+#define enc_inpack() \
+ movq (RIO), RAB0; \
+ bswapq RAB0; \
+ rolq $32, RAB0; \
+ movq 4*2(RIO), RCD0; \
+ bswapq RCD0; \
+ rorq $32, RCD0; \
+ xorq key_table(CTX), RAB0;
+
+#define enc_outunpack(op, max) \
+ xorq key_table(CTX, max, 8), RCD0; \
+ rorq $32, RCD0; \
+ bswapq RCD0; \
+ op ## q RCD0, (RIO); \
+ rolq $32, RAB0; \
+ bswapq RAB0; \
+ op ## q RAB0, 4*2(RIO);
+
+#define dec_rounds(i) \
+ roundsm(RAB, i + 7, RCD); \
+ roundsm(RCD, i + 6, RAB); \
+ roundsm(RAB, i + 5, RCD); \
+ roundsm(RCD, i + 4, RAB); \
+ roundsm(RAB, i + 3, RCD); \
+ roundsm(RCD, i + 2, RAB);
+
+#define dec_fls(i) \
+ fls(RAB, RCD, i + 1, i + 0);
+
+#define dec_inpack(max) \
+ movq (RIO), RAB0; \
+ bswapq RAB0; \
+ rolq $32, RAB0; \
+ movq 4*2(RIO), RCD0; \
+ bswapq RCD0; \
+ rorq $32, RCD0; \
+ xorq key_table(CTX, max, 8), RAB0;
+
+#define dec_outunpack() \
+ xorq key_table(CTX), RCD0; \
+ rorq $32, RCD0; \
+ bswapq RCD0; \
+ movq RCD0, (RIO); \
+ rolq $32, RAB0; \
+ bswapq RAB0; \
+ movq RAB0, 4*2(RIO);
+
+SYM_TYPED_FUNC_START(__camellia_enc_blk)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: bool xor
+ */
+ movq %r12, RR12;
+
+ movq %rcx, RXOR;
+ movq %rsi, RDST;
+ movq %rdx, RIO;
+
+ enc_inpack();
+
+ enc_rounds(0);
+ enc_fls(8);
+ enc_rounds(8);
+ enc_fls(16);
+ enc_rounds(16);
+ movl $24, RT1d; /* max */
+
+ cmpb $16, key_length(CTX);
+ je .L__enc_done;
+
+ enc_fls(24);
+ enc_rounds(24);
+ movl $32, RT1d; /* max */
+
+.L__enc_done:
+ testb RXORbl, RXORbl;
+ movq RDST, RIO;
+
+ jnz .L__enc_xor;
+
+ enc_outunpack(mov, RT1);
+
+ movq RR12, %r12;
+ RET;
+
+.L__enc_xor:
+ enc_outunpack(xor, RT1);
+
+ movq RR12, %r12;
+ RET;
+SYM_FUNC_END(__camellia_enc_blk)
+
+SYM_TYPED_FUNC_START(camellia_dec_blk)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ cmpl $16, key_length(CTX);
+ movl $32, RT2d;
+ movl $24, RXORd;
+ cmovel RXORd, RT2d; /* max */
+
+ movq %r12, RR12;
+ movq %rsi, RDST;
+ movq %rdx, RIO;
+
+ dec_inpack(RT2);
+
+ cmpb $24, RT2bl;
+ je .L__dec_rounds16;
+
+ dec_rounds(24);
+ dec_fls(24);
+
+.L__dec_rounds16:
+ dec_rounds(16);
+ dec_fls(16);
+ dec_rounds(8);
+ dec_fls(8);
+ dec_rounds(0);
+
+ movq RDST, RIO;
+
+ dec_outunpack();
+
+ movq RR12, %r12;
+ RET;
+SYM_FUNC_END(camellia_dec_blk)
+
+/**********************************************************************
+ 2-way camellia
+ **********************************************************************/
+#define roundsm2(ab, subkey, cd) \
+ movq (key_table + ((subkey) * 2) * 4)(CTX), RT2; \
+ xorq RT2, cd ## 1; \
+ \
+ xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
+ xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
+ xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
+ xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
+ \
+ xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 1, cd ## 1); \
+ xorq RT2, cd ## 0; \
+ xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 1, cd ## 1); \
+ xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 1, cd ## 1); \
+ xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 1, cd ## 1);
+
+#define fls2(l, r, kl, kr) \
+ movl (key_table + ((kl) * 2) * 4)(CTX), RT0d; \
+ andl l ## 0d, RT0d; \
+ roll $1, RT0d; \
+ shlq $32, RT0; \
+ xorq RT0, l ## 0; \
+ movq (key_table + ((kr) * 2) * 4)(CTX), RT1; \
+ orq r ## 0, RT1; \
+ shrq $32, RT1; \
+ xorq RT1, r ## 0; \
+ \
+ movl (key_table + ((kl) * 2) * 4)(CTX), RT2d; \
+ andl l ## 1d, RT2d; \
+ roll $1, RT2d; \
+ shlq $32, RT2; \
+ xorq RT2, l ## 1; \
+ movq (key_table + ((kr) * 2) * 4)(CTX), RT0; \
+ orq r ## 1, RT0; \
+ shrq $32, RT0; \
+ xorq RT0, r ## 1; \
+ \
+ movq (key_table + ((kl) * 2) * 4)(CTX), RT1; \
+ orq l ## 0, RT1; \
+ shrq $32, RT1; \
+ xorq RT1, l ## 0; \
+ movl (key_table + ((kr) * 2) * 4)(CTX), RT2d; \
+ andl r ## 0d, RT2d; \
+ roll $1, RT2d; \
+ shlq $32, RT2; \
+ xorq RT2, r ## 0; \
+ \
+ movq (key_table + ((kl) * 2) * 4)(CTX), RT0; \
+ orq l ## 1, RT0; \
+ shrq $32, RT0; \
+ xorq RT0, l ## 1; \
+ movl (key_table + ((kr) * 2) * 4)(CTX), RT1d; \
+ andl r ## 1d, RT1d; \
+ roll $1, RT1d; \
+ shlq $32, RT1; \
+ xorq RT1, r ## 1;
+
+#define enc_rounds2(i) \
+ roundsm2(RAB, i + 2, RCD); \
+ roundsm2(RCD, i + 3, RAB); \
+ roundsm2(RAB, i + 4, RCD); \
+ roundsm2(RCD, i + 5, RAB); \
+ roundsm2(RAB, i + 6, RCD); \
+ roundsm2(RCD, i + 7, RAB);
+
+#define enc_fls2(i) \
+ fls2(RAB, RCD, i + 0, i + 1);
+
+#define enc_inpack2() \
+ movq (RIO), RAB0; \
+ bswapq RAB0; \
+ rorq $32, RAB0; \
+ movq 4*2(RIO), RCD0; \
+ bswapq RCD0; \
+ rolq $32, RCD0; \
+ xorq key_table(CTX), RAB0; \
+ \
+ movq 8*2(RIO), RAB1; \
+ bswapq RAB1; \
+ rorq $32, RAB1; \
+ movq 12*2(RIO), RCD1; \
+ bswapq RCD1; \
+ rolq $32, RCD1; \
+ xorq key_table(CTX), RAB1;
+
+#define enc_outunpack2(op, max) \
+ xorq key_table(CTX, max, 8), RCD0; \
+ rolq $32, RCD0; \
+ bswapq RCD0; \
+ op ## q RCD0, (RIO); \
+ rorq $32, RAB0; \
+ bswapq RAB0; \
+ op ## q RAB0, 4*2(RIO); \
+ \
+ xorq key_table(CTX, max, 8), RCD1; \
+ rolq $32, RCD1; \
+ bswapq RCD1; \
+ op ## q RCD1, 8*2(RIO); \
+ rorq $32, RAB1; \
+ bswapq RAB1; \
+ op ## q RAB1, 12*2(RIO);
+
+#define dec_rounds2(i) \
+ roundsm2(RAB, i + 7, RCD); \
+ roundsm2(RCD, i + 6, RAB); \
+ roundsm2(RAB, i + 5, RCD); \
+ roundsm2(RCD, i + 4, RAB); \
+ roundsm2(RAB, i + 3, RCD); \
+ roundsm2(RCD, i + 2, RAB);
+
+#define dec_fls2(i) \
+ fls2(RAB, RCD, i + 1, i + 0);
+
+#define dec_inpack2(max) \
+ movq (RIO), RAB0; \
+ bswapq RAB0; \
+ rorq $32, RAB0; \
+ movq 4*2(RIO), RCD0; \
+ bswapq RCD0; \
+ rolq $32, RCD0; \
+ xorq key_table(CTX, max, 8), RAB0; \
+ \
+ movq 8*2(RIO), RAB1; \
+ bswapq RAB1; \
+ rorq $32, RAB1; \
+ movq 12*2(RIO), RCD1; \
+ bswapq RCD1; \
+ rolq $32, RCD1; \
+ xorq key_table(CTX, max, 8), RAB1;
+
+#define dec_outunpack2() \
+ xorq key_table(CTX), RCD0; \
+ rolq $32, RCD0; \
+ bswapq RCD0; \
+ movq RCD0, (RIO); \
+ rorq $32, RAB0; \
+ bswapq RAB0; \
+ movq RAB0, 4*2(RIO); \
+ \
+ xorq key_table(CTX), RCD1; \
+ rolq $32, RCD1; \
+ bswapq RCD1; \
+ movq RCD1, 8*2(RIO); \
+ rorq $32, RAB1; \
+ bswapq RAB1; \
+ movq RAB1, 12*2(RIO);
+
+SYM_TYPED_FUNC_START(__camellia_enc_blk_2way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: bool xor
+ */
+ pushq %rbx;
+
+ movq %r12, RR12;
+ movq %rcx, RXOR;
+ movq %rsi, RDST;
+ movq %rdx, RIO;
+
+ enc_inpack2();
+
+ enc_rounds2(0);
+ enc_fls2(8);
+ enc_rounds2(8);
+ enc_fls2(16);
+ enc_rounds2(16);
+ movl $24, RT2d; /* max */
+
+ cmpb $16, key_length(CTX);
+ je .L__enc2_done;
+
+ enc_fls2(24);
+ enc_rounds2(24);
+ movl $32, RT2d; /* max */
+
+.L__enc2_done:
+	testb RXORbl, RXORbl;

+ movq RDST, RIO;
+ jnz .L__enc2_xor;
+
+ enc_outunpack2(mov, RT2);
+
+ movq RR12, %r12;
+ popq %rbx;
+ RET;
+
+.L__enc2_xor:
+ enc_outunpack2(xor, RT2);
+
+ movq RR12, %r12;
+ popq %rbx;
+ RET;
+SYM_FUNC_END(__camellia_enc_blk_2way)
+
+SYM_TYPED_FUNC_START(camellia_dec_blk_2way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ cmpl $16, key_length(CTX);
+ movl $32, RT2d;
+ movl $24, RXORd;
+ cmovel RXORd, RT2d; /* max */
+
+ movq %rbx, RXOR;
+ movq %r12, RR12;
+ movq %rsi, RDST;
+ movq %rdx, RIO;
+
+ dec_inpack2(RT2);
+
+ cmpb $24, RT2bl;
+ je .L__dec2_rounds16;
+
+ dec_rounds2(24);
+ dec_fls2(24);
+
+.L__dec2_rounds16:
+ dec_rounds2(16);
+ dec_fls2(16);
+ dec_rounds2(8);
+ dec_fls2(8);
+ dec_rounds2(0);
+
+ movq RDST, RIO;
+
+ dec_outunpack2();
+
+ movq RR12, %r12;
+ movq RXOR, %rbx;
+ RET;
+SYM_FUNC_END(camellia_dec_blk_2way)
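
Both *_2way entry points operate on two consecutive 16-byte blocks: enc_inpack2()/enc_outunpack2() address block 0's halves at byte offsets 0 and 8 of RIO and block 1's at offsets 16 and 24, and roundsm2() interleaves the two blocks' computations (note the block 0 RT2 xor sandwiched between block 1's table lookups) to hide instruction latency. A hedged caller sketch (encrypt_two_blocks() is a hypothetical wrapper, not kernel code):

    #include "camellia.h"   /* added below in this patch */

    static void encrypt_two_blocks(const struct camellia_ctx *ctx,
                                   u8 dst[32], const u8 src[32])
    {
            /* dst and src must each hold two consecutive blocks. */
            __camellia_enc_blk_2way(ctx, dst, src, false);
    }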
diff --git a/arch/x86/crypto/camellia.h b/arch/x86/crypto/camellia.h
new file mode 100644
index 000000000000..1dcea79e8f8e
--- /dev/null
+++ b/arch/x86/crypto/camellia.h
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ASM_X86_CAMELLIA_H
+#define ASM_X86_CAMELLIA_H
+
+#include <crypto/b128ops.h>
+#include <linux/crypto.h>
+#include <linux/kernel.h>
+
+#define CAMELLIA_MIN_KEY_SIZE 16
+#define CAMELLIA_MAX_KEY_SIZE 32
+#define CAMELLIA_BLOCK_SIZE 16
+#define CAMELLIA_TABLE_BYTE_LEN 272
+#define CAMELLIA_PARALLEL_BLOCKS 2
+
+struct crypto_skcipher;
+
+struct camellia_ctx {
+ u64 key_table[CAMELLIA_TABLE_BYTE_LEN / sizeof(u64)];
+ u32 key_length;
+};
+
+extern int __camellia_setkey(struct camellia_ctx *cctx,
+ const unsigned char *key,
+ unsigned int key_len);
+
+/* regular block cipher functions */
+asmlinkage void __camellia_enc_blk(const void *ctx, u8 *dst, const u8 *src,
+ bool xor);
+asmlinkage void camellia_dec_blk(const void *ctx, u8 *dst, const u8 *src);
+
+/* 2-way parallel cipher functions */
+asmlinkage void __camellia_enc_blk_2way(const void *ctx, u8 *dst, const u8 *src,
+ bool xor);
+asmlinkage void camellia_dec_blk_2way(const void *ctx, u8 *dst, const u8 *src);
+
+/* 16-way parallel cipher functions (avx/aes-ni) */
+asmlinkage void camellia_ecb_enc_16way(const void *ctx, u8 *dst, const u8 *src);
+asmlinkage void camellia_ecb_dec_16way(const void *ctx, u8 *dst, const u8 *src);
+
+asmlinkage void camellia_cbc_dec_16way(const void *ctx, u8 *dst, const u8 *src);
+
+static inline void camellia_enc_blk(const void *ctx, u8 *dst, const u8 *src)
+{
+ __camellia_enc_blk(ctx, dst, src, false);
+}
+
+static inline void camellia_enc_blk_xor(const void *ctx, u8 *dst, const u8 *src)
+{
+ __camellia_enc_blk(ctx, dst, src, true);
+}
+
+static inline void camellia_enc_blk_2way(const void *ctx, u8 *dst,
+ const u8 *src)
+{
+ __camellia_enc_blk_2way(ctx, dst, src, false);
+}
+
+static inline void camellia_enc_blk_xor_2way(const void *ctx, u8 *dst,
+ const u8 *src)
+{
+ __camellia_enc_blk_2way(ctx, dst, src, true);
+}
+
+/* glue helpers */
+extern void camellia_decrypt_cbc_2way(const void *ctx, u8 *dst, const u8 *src);
+
+#endif /* ASM_X86_CAMELLIA_H */
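
The camellia_decrypt_cbc_2way() glue helper declared above is defined further down in this patch, in camellia_glue.c. A sketch of the shape it takes (not a quote of the patch), using the u128/u128_xor helpers from <crypto/b128ops.h> that the header already pulls in:

    void camellia_decrypt_cbc_2way(const void *ctx, u8 *d, const u8 *s)
    {
            u128 *dst = (u128 *)d;
            const u128 *src = (const u128 *)s;
            u128 iv = *src;     /* save block 0's ciphertext: dst may alias src */

            camellia_dec_blk_2way(ctx, d, s);

            /* Chain block 1 against block 0's ciphertext; chaining
             * block 0 against the IV is the generic walk code's job. */
            u128_xor(&dst[1], &dst[1], &iv);
    }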
diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c
new file mode 100644
index 000000000000..2d2f4e16537c
--- /dev/null
+++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Glue Code for x86_64/AVX2/AES-NI assembler optimized version of Camellia
+ *
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ */
+
+#include <crypto/algapi.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+#include "camellia.h"
+#include "ecb_cbc_helpers.h"
+
+#define CAMELLIA_AESNI_PARALLEL_BLOCKS 16
+#define CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS 32
+
+/* 32-way AVX2/AES-NI parallel cipher functions */
+asmlinkage void camellia_ecb_enc_32way(const void *ctx, u8 *dst, const u8 *src);
+asmlinkage void camellia_ecb_dec_32way(const void *ctx, u8 *dst, const u8 *src);
+
+asmlinkage void camellia_cbc_dec_32way(const void *ctx, u8 *dst, const u8 *src);
+
+static int camellia_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ return __camellia_setkey(crypto_skcipher_ctx(tfm), key, keylen);
+}
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+ ECB_WALK_START(req, CAMELLIA_BLOCK_SIZE, CAMELLIA_AESNI_PARALLEL_BLOCKS);
+ ECB_BLOCK(CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, camellia_ecb_enc_32way);
+ ECB_BLOCK(CAMELLIA_AESNI_PARALLEL_BLOCKS, camellia_ecb_enc_16way);
+ ECB_BLOCK(2, camellia_enc_blk_2way);
+ ECB_BLOCK(1, camellia_enc_blk);
+ ECB_WALK_END();
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+ ECB_WALK_START(req, CAMELLIA_BLOCK_SIZE, CAMELLIA_AESNI_PARALLEL_BLOCKS);
+ ECB_BLOCK(CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, camellia_ecb_dec_32way);
+ ECB_BLOCK(CAMELLIA_AESNI_PARALLEL_BLOCKS, camellia_ecb_dec_16way);
+ ECB_BLOCK(2, camellia_dec_blk_2way);
+ ECB_BLOCK(1, camellia_dec_blk);
+ ECB_WALK_END();
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+ CBC_WALK_START(req, CAMELLIA_BLOCK_SIZE, -1);
+ CBC_ENC_BLOCK(camellia_enc_blk);
+ CBC_WALK_END();
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ CBC_WALK_START(req, CAMELLIA_BLOCK_SIZE, CAMELLIA_AESNI_PARALLEL_BLOCKS);
+ CBC_DEC_BLOCK(CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, camellia_cbc_dec_32way);
+ CBC_DEC_BLOCK(CAMELLIA_AESNI_PARALLEL_BLOCKS, camellia_cbc_dec_16way);
+ CBC_DEC_BLOCK(2, camellia_decrypt_cbc_2way);
+ CBC_DEC_BLOCK(1, camellia_dec_blk);
+ CBC_WALK_END();
+}
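
The ECB_WALK_START()/ECB_BLOCK()/CBC_DEC_BLOCK() macros come from ecb_cbc_helpers.h, added elsewhere in this series; each walk step greedily consumes the widest batch that still fits, falling through 32 -> 16 -> 2 -> 1 blocks. Conceptually, for the encrypt direction (a hypothetical ecb_do_one_step(), not the macros' actual expansion):

    #include "camellia.h"

    static void ecb_do_one_step(const void *ctx, u8 *dst, const u8 *src,
                                unsigned int nblocks)
    {
            while (nblocks >= 32) {
                    camellia_ecb_enc_32way(ctx, dst, src);
                    dst += 32 * CAMELLIA_BLOCK_SIZE;
                    src += 32 * CAMELLIA_BLOCK_SIZE;
                    nblocks -= 32;
            }
            while (nblocks >= 16) {
                    camellia_ecb_enc_16way(ctx, dst, src);
                    dst += 16 * CAMELLIA_BLOCK_SIZE;
                    src += 16 * CAMELLIA_BLOCK_SIZE;
                    nblocks -= 16;
            }
            while (nblocks >= 2) {
                    camellia_enc_blk_2way(ctx, dst, src);
                    dst += 2 * CAMELLIA_BLOCK_SIZE;
                    src += 2 * CAMELLIA_BLOCK_SIZE;
                    nblocks -= 2;
            }
            while (nblocks--) {
                    camellia_enc_blk(ctx, dst, src);
                    dst += CAMELLIA_BLOCK_SIZE;
                    src += CAMELLIA_BLOCK_SIZE;
            }
    }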
+
+static struct skcipher_alg camellia_algs[] = {
+ {
+ .base.cra_name = "ecb(camellia)",
+ .base.cra_driver_name = "ecb-camellia-aesni-avx2",
+ .base.cra_priority = 500,
+ .base.cra_blocksize = CAMELLIA_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct camellia_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = CAMELLIA_MIN_KEY_SIZE,
+ .max_keysize = CAMELLIA_MAX_KEY_SIZE,
+ .setkey = camellia_setkey,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ }, {
+ .base.cra_name = "cbc(camellia)",
+ .base.cra_driver_name = "cbc-camellia-aesni-avx2",
+ .base.cra_priority = 500,
+ .base.cra_blocksize = CAMELLIA_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct camellia_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = CAMELLIA_MIN_KEY_SIZE,
+ .max_keysize = CAMELLIA_MAX_KEY_SIZE,
+ .ivsize = CAMELLIA_BLOCK_SIZE,
+ .setkey = camellia_setkey,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ },
+};
+
+static int __init camellia_aesni_init(void)
+{
+ const char *feature_name;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX) ||
+ !boot_cpu_has(X86_FEATURE_AVX2) ||
+ !boot_cpu_has(X86_FEATURE_AES) ||
+ !boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+ pr_info("AVX2 or AES-NI instructions are not detected.\n");
+ return -ENODEV;
+ }
+
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ &feature_name)) {
+ pr_info("CPU feature '%s' is not supported.\n", feature_name);
+ return -ENODEV;
+ }
+
+ return crypto_register_skciphers(camellia_algs,
+ ARRAY_SIZE(camellia_algs));
+}
+
+static void __exit camellia_aesni_fini(void)
+{
+ crypto_unregister_skciphers(camellia_algs, ARRAY_SIZE(camellia_algs));
+}
+
+module_init(camellia_aesni_init);
+module_exit(camellia_aesni_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Camellia Cipher Algorithm, AES-NI/AVX2 optimized");
+MODULE_ALIAS_CRYPTO("camellia");
+MODULE_ALIAS_CRYPTO("camellia-aesni-avx2");
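
This module and the plain AVX one that follows register the same algorithm names and differ only in cra_priority (500 here versus 400 below), so the crypto core transparently prefers the AVX2 implementation whenever both are loaded. Callers never name the driver directly; a hedged sketch (camellia_cbc_alloc() is a hypothetical wrapper):

    #include <crypto/skcipher.h>

    static struct crypto_skcipher *camellia_cbc_alloc(void)
    {
            /* Resolves to the highest-cra_priority "cbc(camellia)"
             * implementation, e.g. cbc-camellia-aesni-avx2 when this
             * module is loaded, else a lower-priority fallback. */
            return crypto_alloc_skcipher("cbc(camellia)", 0, 0);
    }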
diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c
new file mode 100644
index 000000000000..5c321f255eb7
--- /dev/null
+++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Glue Code for x86_64/AVX/AES-NI assembler optimized version of Camellia
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ */
+
+#include <crypto/algapi.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <linux/export.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+#include "camellia.h"
+#include "ecb_cbc_helpers.h"
+
+#define CAMELLIA_AESNI_PARALLEL_BLOCKS 16
+
+/* 16-way parallel cipher functions (avx/aes-ni) */
+asmlinkage void camellia_ecb_enc_16way(const void *ctx, u8 *dst, const u8 *src);
+EXPORT_SYMBOL_GPL(camellia_ecb_enc_16way);
+
+asmlinkage void camellia_ecb_dec_16way(const void *ctx, u8 *dst, const u8 *src);
+EXPORT_SYMBOL_GPL(camellia_ecb_dec_16way);
+
+asmlinkage void camellia_cbc_dec_16way(const void *ctx, u8 *dst, const u8 *src);
+EXPORT_SYMBOL_GPL(camellia_cbc_dec_16way);
+
+static int camellia_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ return __camellia_setkey(crypto_skcipher_ctx(tfm), key, keylen);
+}
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+ ECB_WALK_START(req, CAMELLIA_BLOCK_SIZE, CAMELLIA_AESNI_PARALLEL_BLOCKS);
+ ECB_BLOCK(CAMELLIA_AESNI_PARALLEL_BLOCKS, camellia_ecb_enc_16way);
+ ECB_BLOCK(2, camellia_enc_blk_2way);
+ ECB_BLOCK(1, camellia_enc_blk);
+ ECB_WALK_END();
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+ ECB_WALK_START(req, CAMELLIA_BLOCK_SIZE, CAMELLIA_AESNI_PARALLEL_BLOCKS);
+ ECB_BLOCK(CAMELLIA_AESNI_PARALLEL_BLOCKS, camellia_ecb_dec_16way);
+ ECB_BLOCK(2, camellia_dec_blk_2way);
+ ECB_BLOCK(1, camellia_dec_blk);
+ ECB_WALK_END();
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+ CBC_WALK_START(req, CAMELLIA_BLOCK_SIZE, -1);
+ CBC_ENC_BLOCK(camellia_enc_blk);
+ CBC_WALK_END();
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ CBC_WALK_START(req, CAMELLIA_BLOCK_SIZE, CAMELLIA_AESNI_PARALLEL_BLOCKS);
+ CBC_DEC_BLOCK(CAMELLIA_AESNI_PARALLEL_BLOCKS, camellia_cbc_dec_16way);
+ CBC_DEC_BLOCK(2, camellia_decrypt_cbc_2way);
+ CBC_DEC_BLOCK(1, camellia_dec_blk);
+ CBC_WALK_END();
+}
+
+static struct skcipher_alg camellia_algs[] = {
+ {
+ .base.cra_name = "ecb(camellia)",
+ .base.cra_driver_name = "ecb-camellia-aesni",
+ .base.cra_priority = 400,
+ .base.cra_blocksize = CAMELLIA_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct camellia_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = CAMELLIA_MIN_KEY_SIZE,
+ .max_keysize = CAMELLIA_MAX_KEY_SIZE,
+ .setkey = camellia_setkey,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ }, {
+ .base.cra_name = "cbc(camellia)",
+ .base.cra_driver_name = "cbc-camellia-aesni",
+ .base.cra_priority = 400,
+ .base.cra_blocksize = CAMELLIA_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct camellia_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = CAMELLIA_MIN_KEY_SIZE,
+ .max_keysize = CAMELLIA_MAX_KEY_SIZE,
+ .ivsize = CAMELLIA_BLOCK_SIZE,
+ .setkey = camellia_setkey,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ }
+};
+
+static int __init camellia_aesni_init(void)
+{
+ const char *feature_name;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX) ||
+ !boot_cpu_has(X86_FEATURE_AES) ||
+ !boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+ pr_info("AVX or AES-NI instructions are not detected.\n");
+ return -ENODEV;
+ }
+
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ &feature_name)) {
+ pr_info("CPU feature '%s' is not supported.\n", feature_name);
+ return -ENODEV;
+ }
+
+ return crypto_register_skciphers(camellia_algs,
+ ARRAY_SIZE(camellia_algs));
+}
+
+static void __exit camellia_aesni_fini(void)
+{
+ crypto_unregister_skciphers(camellia_algs, ARRAY_SIZE(camellia_algs));
+}
+
+module_init(camellia_aesni_init);
+module_exit(camellia_aesni_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Camellia Cipher Algorithm, AES-NI/AVX optimized");
+MODULE_ALIAS_CRYPTO("camellia");
+MODULE_ALIAS_CRYPTO("camellia-aesni");
diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c
new file mode 100644
index 000000000000..cbede120e5f2
--- /dev/null
+++ b/arch/x86/crypto/camellia_glue.c
@@ -0,0 +1,1417 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Glue Code for assembler optimized version of Camellia
+ *
+ * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * Camellia parts based on code by:
+ * Copyright (C) 2006 NTT (Nippon Telegraph and Telephone Corporation)
+ */
+
+#include <linux/unaligned.h>
+#include <linux/crypto.h>
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <crypto/algapi.h>
+
+#include "camellia.h"
+#include "ecb_cbc_helpers.h"
+
+/* regular block cipher functions */
+asmlinkage void __camellia_enc_blk(const void *ctx, u8 *dst, const u8 *src,
+ bool xor);
+EXPORT_SYMBOL_GPL(__camellia_enc_blk);
+asmlinkage void camellia_dec_blk(const void *ctx, u8 *dst, const u8 *src);
+EXPORT_SYMBOL_GPL(camellia_dec_blk);
+
+/* 2-way parallel cipher functions */
+asmlinkage void __camellia_enc_blk_2way(const void *ctx, u8 *dst, const u8 *src,
+ bool xor);
+EXPORT_SYMBOL_GPL(__camellia_enc_blk_2way);
+asmlinkage void camellia_dec_blk_2way(const void *ctx, u8 *dst, const u8 *src);
+EXPORT_SYMBOL_GPL(camellia_dec_blk_2way);
+
+static void camellia_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ camellia_enc_blk(crypto_tfm_ctx(tfm), dst, src);
+}
+
+static void camellia_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ camellia_dec_blk(crypto_tfm_ctx(tfm), dst, src);
+}
+
+/* camellia sboxes */
+__visible const u64 camellia_sp10011110[256] = {
+ 0x7000007070707000ULL, 0x8200008282828200ULL, 0x2c00002c2c2c2c00ULL,
+ 0xec0000ecececec00ULL, 0xb30000b3b3b3b300ULL, 0x2700002727272700ULL,
+ 0xc00000c0c0c0c000ULL, 0xe50000e5e5e5e500ULL, 0xe40000e4e4e4e400ULL,
+ 0x8500008585858500ULL, 0x5700005757575700ULL, 0x3500003535353500ULL,
+ 0xea0000eaeaeaea00ULL, 0x0c00000c0c0c0c00ULL, 0xae0000aeaeaeae00ULL,
+ 0x4100004141414100ULL, 0x2300002323232300ULL, 0xef0000efefefef00ULL,
+ 0x6b00006b6b6b6b00ULL, 0x9300009393939300ULL, 0x4500004545454500ULL,
+ 0x1900001919191900ULL, 0xa50000a5a5a5a500ULL, 0x2100002121212100ULL,
+ 0xed0000edededed00ULL, 0x0e00000e0e0e0e00ULL, 0x4f00004f4f4f4f00ULL,
+ 0x4e00004e4e4e4e00ULL, 0x1d00001d1d1d1d00ULL, 0x6500006565656500ULL,
+ 0x9200009292929200ULL, 0xbd0000bdbdbdbd00ULL, 0x8600008686868600ULL,
+ 0xb80000b8b8b8b800ULL, 0xaf0000afafafaf00ULL, 0x8f00008f8f8f8f00ULL,
+ 0x7c00007c7c7c7c00ULL, 0xeb0000ebebebeb00ULL, 0x1f00001f1f1f1f00ULL,
+ 0xce0000cececece00ULL, 0x3e00003e3e3e3e00ULL, 0x3000003030303000ULL,
+ 0xdc0000dcdcdcdc00ULL, 0x5f00005f5f5f5f00ULL, 0x5e00005e5e5e5e00ULL,
+ 0xc50000c5c5c5c500ULL, 0x0b00000b0b0b0b00ULL, 0x1a00001a1a1a1a00ULL,
+ 0xa60000a6a6a6a600ULL, 0xe10000e1e1e1e100ULL, 0x3900003939393900ULL,
+ 0xca0000cacacaca00ULL, 0xd50000d5d5d5d500ULL, 0x4700004747474700ULL,
+ 0x5d00005d5d5d5d00ULL, 0x3d00003d3d3d3d00ULL, 0xd90000d9d9d9d900ULL,
+ 0x0100000101010100ULL, 0x5a00005a5a5a5a00ULL, 0xd60000d6d6d6d600ULL,
+ 0x5100005151515100ULL, 0x5600005656565600ULL, 0x6c00006c6c6c6c00ULL,
+ 0x4d00004d4d4d4d00ULL, 0x8b00008b8b8b8b00ULL, 0x0d00000d0d0d0d00ULL,
+ 0x9a00009a9a9a9a00ULL, 0x6600006666666600ULL, 0xfb0000fbfbfbfb00ULL,
+ 0xcc0000cccccccc00ULL, 0xb00000b0b0b0b000ULL, 0x2d00002d2d2d2d00ULL,
+ 0x7400007474747400ULL, 0x1200001212121200ULL, 0x2b00002b2b2b2b00ULL,
+ 0x2000002020202000ULL, 0xf00000f0f0f0f000ULL, 0xb10000b1b1b1b100ULL,
+ 0x8400008484848400ULL, 0x9900009999999900ULL, 0xdf0000dfdfdfdf00ULL,
+ 0x4c00004c4c4c4c00ULL, 0xcb0000cbcbcbcb00ULL, 0xc20000c2c2c2c200ULL,
+ 0x3400003434343400ULL, 0x7e00007e7e7e7e00ULL, 0x7600007676767600ULL,
+ 0x0500000505050500ULL, 0x6d00006d6d6d6d00ULL, 0xb70000b7b7b7b700ULL,
+ 0xa90000a9a9a9a900ULL, 0x3100003131313100ULL, 0xd10000d1d1d1d100ULL,
+ 0x1700001717171700ULL, 0x0400000404040400ULL, 0xd70000d7d7d7d700ULL,
+ 0x1400001414141400ULL, 0x5800005858585800ULL, 0x3a00003a3a3a3a00ULL,
+ 0x6100006161616100ULL, 0xde0000dededede00ULL, 0x1b00001b1b1b1b00ULL,
+ 0x1100001111111100ULL, 0x1c00001c1c1c1c00ULL, 0x3200003232323200ULL,
+ 0x0f00000f0f0f0f00ULL, 0x9c00009c9c9c9c00ULL, 0x1600001616161600ULL,
+ 0x5300005353535300ULL, 0x1800001818181800ULL, 0xf20000f2f2f2f200ULL,
+ 0x2200002222222200ULL, 0xfe0000fefefefe00ULL, 0x4400004444444400ULL,
+ 0xcf0000cfcfcfcf00ULL, 0xb20000b2b2b2b200ULL, 0xc30000c3c3c3c300ULL,
+ 0xb50000b5b5b5b500ULL, 0x7a00007a7a7a7a00ULL, 0x9100009191919100ULL,
+ 0x2400002424242400ULL, 0x0800000808080800ULL, 0xe80000e8e8e8e800ULL,
+ 0xa80000a8a8a8a800ULL, 0x6000006060606000ULL, 0xfc0000fcfcfcfc00ULL,
+ 0x6900006969696900ULL, 0x5000005050505000ULL, 0xaa0000aaaaaaaa00ULL,
+ 0xd00000d0d0d0d000ULL, 0xa00000a0a0a0a000ULL, 0x7d00007d7d7d7d00ULL,
+ 0xa10000a1a1a1a100ULL, 0x8900008989898900ULL, 0x6200006262626200ULL,
+ 0x9700009797979700ULL, 0x5400005454545400ULL, 0x5b00005b5b5b5b00ULL,
+ 0x1e00001e1e1e1e00ULL, 0x9500009595959500ULL, 0xe00000e0e0e0e000ULL,
+ 0xff0000ffffffff00ULL, 0x6400006464646400ULL, 0xd20000d2d2d2d200ULL,
+ 0x1000001010101000ULL, 0xc40000c4c4c4c400ULL, 0x0000000000000000ULL,
+ 0x4800004848484800ULL, 0xa30000a3a3a3a300ULL, 0xf70000f7f7f7f700ULL,
+ 0x7500007575757500ULL, 0xdb0000dbdbdbdb00ULL, 0x8a00008a8a8a8a00ULL,
+ 0x0300000303030300ULL, 0xe60000e6e6e6e600ULL, 0xda0000dadadada00ULL,
+ 0x0900000909090900ULL, 0x3f00003f3f3f3f00ULL, 0xdd0000dddddddd00ULL,
+ 0x9400009494949400ULL, 0x8700008787878700ULL, 0x5c00005c5c5c5c00ULL,
+ 0x8300008383838300ULL, 0x0200000202020200ULL, 0xcd0000cdcdcdcd00ULL,
+ 0x4a00004a4a4a4a00ULL, 0x9000009090909000ULL, 0x3300003333333300ULL,
+ 0x7300007373737300ULL, 0x6700006767676700ULL, 0xf60000f6f6f6f600ULL,
+ 0xf30000f3f3f3f300ULL, 0x9d00009d9d9d9d00ULL, 0x7f00007f7f7f7f00ULL,
+ 0xbf0000bfbfbfbf00ULL, 0xe20000e2e2e2e200ULL, 0x5200005252525200ULL,
+ 0x9b00009b9b9b9b00ULL, 0xd80000d8d8d8d800ULL, 0x2600002626262600ULL,
+ 0xc80000c8c8c8c800ULL, 0x3700003737373700ULL, 0xc60000c6c6c6c600ULL,
+ 0x3b00003b3b3b3b00ULL, 0x8100008181818100ULL, 0x9600009696969600ULL,
+ 0x6f00006f6f6f6f00ULL, 0x4b00004b4b4b4b00ULL, 0x1300001313131300ULL,
+ 0xbe0000bebebebe00ULL, 0x6300006363636300ULL, 0x2e00002e2e2e2e00ULL,
+ 0xe90000e9e9e9e900ULL, 0x7900007979797900ULL, 0xa70000a7a7a7a700ULL,
+ 0x8c00008c8c8c8c00ULL, 0x9f00009f9f9f9f00ULL, 0x6e00006e6e6e6e00ULL,
+ 0xbc0000bcbcbcbc00ULL, 0x8e00008e8e8e8e00ULL, 0x2900002929292900ULL,
+ 0xf50000f5f5f5f500ULL, 0xf90000f9f9f9f900ULL, 0xb60000b6b6b6b600ULL,
+ 0x2f00002f2f2f2f00ULL, 0xfd0000fdfdfdfd00ULL, 0xb40000b4b4b4b400ULL,
+ 0x5900005959595900ULL, 0x7800007878787800ULL, 0x9800009898989800ULL,
+ 0x0600000606060600ULL, 0x6a00006a6a6a6a00ULL, 0xe70000e7e7e7e700ULL,
+ 0x4600004646464600ULL, 0x7100007171717100ULL, 0xba0000babababa00ULL,
+ 0xd40000d4d4d4d400ULL, 0x2500002525252500ULL, 0xab0000abababab00ULL,
+ 0x4200004242424200ULL, 0x8800008888888800ULL, 0xa20000a2a2a2a200ULL,
+ 0x8d00008d8d8d8d00ULL, 0xfa0000fafafafa00ULL, 0x7200007272727200ULL,
+ 0x0700000707070700ULL, 0xb90000b9b9b9b900ULL, 0x5500005555555500ULL,
+ 0xf80000f8f8f8f800ULL, 0xee0000eeeeeeee00ULL, 0xac0000acacacac00ULL,
+ 0x0a00000a0a0a0a00ULL, 0x3600003636363600ULL, 0x4900004949494900ULL,
+ 0x2a00002a2a2a2a00ULL, 0x6800006868686800ULL, 0x3c00003c3c3c3c00ULL,
+ 0x3800003838383800ULL, 0xf10000f1f1f1f100ULL, 0xa40000a4a4a4a400ULL,
+ 0x4000004040404000ULL, 0x2800002828282800ULL, 0xd30000d3d3d3d300ULL,
+ 0x7b00007b7b7b7b00ULL, 0xbb0000bbbbbbbb00ULL, 0xc90000c9c9c9c900ULL,
+ 0x4300004343434300ULL, 0xc10000c1c1c1c100ULL, 0x1500001515151500ULL,
+ 0xe30000e3e3e3e300ULL, 0xad0000adadadad00ULL, 0xf40000f4f4f4f400ULL,
+ 0x7700007777777700ULL, 0xc70000c7c7c7c700ULL, 0x8000008080808000ULL,
+ 0x9e00009e9e9e9e00ULL,
+};
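
Each sp-table folds one s-box lookup plus part of Camellia's byte-diffusion P-layer into a single 64-bit fetch. The digits in a table's name give, most-significant byte first, which s-box output fills each byte lane (0 means a zero byte): camellia_sp10011110[x] above places s1(x) in lanes 7, 4, 3, 2 and 1. The remaining boxes derive from s1 as s2(x) = rol1(s1(x)), s3(x) = ror1(s1(x)) and s4(x) = s1(rol1(x)), which the entries bear out: s1(0) = 0x70, so camellia_sp22000222[0] starts 0xe0e0 and camellia_sp03303033[0] contains 0x38. A hedged generator sketch (assumes a full camellia_s1[256] array; rol8(), sbox() and sp_entry() are hypothetical names, not kernel code):

    #include <stdint.h>

    extern const uint8_t camellia_s1[256];      /* assumed: Camellia's s1 box */

    static uint8_t rol8(uint8_t v, int n)       /* n in 1..7 */
    {
            return (uint8_t)((v << n) | (v >> (8 - n)));
    }

    static uint8_t sbox(int d, uint8_t x)
    {
            switch (d) {
            case 1: return camellia_s1[x];
            case 2: return rol8(camellia_s1[x], 1);  /* s2 */
            case 3: return rol8(camellia_s1[x], 7);  /* s3 = ror1 */
            case 4: return camellia_s1[rol8(x, 1)];  /* s4 */
            default: return 0;                       /* zero lane */
            }
    }

    /* sp_entry("10011110", x) reproduces camellia_sp10011110[x]. */
    static uint64_t sp_entry(const char *mask, uint8_t x)
    {
            uint64_t e = 0;
            int k;

            for (k = 0; k < 8; k++)                  /* digits MSB first */
                    e = (e << 8) | sbox(mask[k] - '0', x);
            return e;
    }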
+
+__visible const u64 camellia_sp22000222[256] = {
+ 0xe0e0000000e0e0e0ULL, 0x0505000000050505ULL, 0x5858000000585858ULL,
+ 0xd9d9000000d9d9d9ULL, 0x6767000000676767ULL, 0x4e4e0000004e4e4eULL,
+ 0x8181000000818181ULL, 0xcbcb000000cbcbcbULL, 0xc9c9000000c9c9c9ULL,
+ 0x0b0b0000000b0b0bULL, 0xaeae000000aeaeaeULL, 0x6a6a0000006a6a6aULL,
+ 0xd5d5000000d5d5d5ULL, 0x1818000000181818ULL, 0x5d5d0000005d5d5dULL,
+ 0x8282000000828282ULL, 0x4646000000464646ULL, 0xdfdf000000dfdfdfULL,
+ 0xd6d6000000d6d6d6ULL, 0x2727000000272727ULL, 0x8a8a0000008a8a8aULL,
+ 0x3232000000323232ULL, 0x4b4b0000004b4b4bULL, 0x4242000000424242ULL,
+ 0xdbdb000000dbdbdbULL, 0x1c1c0000001c1c1cULL, 0x9e9e0000009e9e9eULL,
+ 0x9c9c0000009c9c9cULL, 0x3a3a0000003a3a3aULL, 0xcaca000000cacacaULL,
+ 0x2525000000252525ULL, 0x7b7b0000007b7b7bULL, 0x0d0d0000000d0d0dULL,
+ 0x7171000000717171ULL, 0x5f5f0000005f5f5fULL, 0x1f1f0000001f1f1fULL,
+ 0xf8f8000000f8f8f8ULL, 0xd7d7000000d7d7d7ULL, 0x3e3e0000003e3e3eULL,
+ 0x9d9d0000009d9d9dULL, 0x7c7c0000007c7c7cULL, 0x6060000000606060ULL,
+ 0xb9b9000000b9b9b9ULL, 0xbebe000000bebebeULL, 0xbcbc000000bcbcbcULL,
+ 0x8b8b0000008b8b8bULL, 0x1616000000161616ULL, 0x3434000000343434ULL,
+ 0x4d4d0000004d4d4dULL, 0xc3c3000000c3c3c3ULL, 0x7272000000727272ULL,
+ 0x9595000000959595ULL, 0xabab000000abababULL, 0x8e8e0000008e8e8eULL,
+ 0xbaba000000bababaULL, 0x7a7a0000007a7a7aULL, 0xb3b3000000b3b3b3ULL,
+ 0x0202000000020202ULL, 0xb4b4000000b4b4b4ULL, 0xadad000000adadadULL,
+ 0xa2a2000000a2a2a2ULL, 0xacac000000acacacULL, 0xd8d8000000d8d8d8ULL,
+ 0x9a9a0000009a9a9aULL, 0x1717000000171717ULL, 0x1a1a0000001a1a1aULL,
+ 0x3535000000353535ULL, 0xcccc000000ccccccULL, 0xf7f7000000f7f7f7ULL,
+ 0x9999000000999999ULL, 0x6161000000616161ULL, 0x5a5a0000005a5a5aULL,
+ 0xe8e8000000e8e8e8ULL, 0x2424000000242424ULL, 0x5656000000565656ULL,
+ 0x4040000000404040ULL, 0xe1e1000000e1e1e1ULL, 0x6363000000636363ULL,
+ 0x0909000000090909ULL, 0x3333000000333333ULL, 0xbfbf000000bfbfbfULL,
+ 0x9898000000989898ULL, 0x9797000000979797ULL, 0x8585000000858585ULL,
+ 0x6868000000686868ULL, 0xfcfc000000fcfcfcULL, 0xecec000000ecececULL,
+ 0x0a0a0000000a0a0aULL, 0xdada000000dadadaULL, 0x6f6f0000006f6f6fULL,
+ 0x5353000000535353ULL, 0x6262000000626262ULL, 0xa3a3000000a3a3a3ULL,
+ 0x2e2e0000002e2e2eULL, 0x0808000000080808ULL, 0xafaf000000afafafULL,
+ 0x2828000000282828ULL, 0xb0b0000000b0b0b0ULL, 0x7474000000747474ULL,
+ 0xc2c2000000c2c2c2ULL, 0xbdbd000000bdbdbdULL, 0x3636000000363636ULL,
+ 0x2222000000222222ULL, 0x3838000000383838ULL, 0x6464000000646464ULL,
+ 0x1e1e0000001e1e1eULL, 0x3939000000393939ULL, 0x2c2c0000002c2c2cULL,
+ 0xa6a6000000a6a6a6ULL, 0x3030000000303030ULL, 0xe5e5000000e5e5e5ULL,
+ 0x4444000000444444ULL, 0xfdfd000000fdfdfdULL, 0x8888000000888888ULL,
+ 0x9f9f0000009f9f9fULL, 0x6565000000656565ULL, 0x8787000000878787ULL,
+ 0x6b6b0000006b6b6bULL, 0xf4f4000000f4f4f4ULL, 0x2323000000232323ULL,
+ 0x4848000000484848ULL, 0x1010000000101010ULL, 0xd1d1000000d1d1d1ULL,
+ 0x5151000000515151ULL, 0xc0c0000000c0c0c0ULL, 0xf9f9000000f9f9f9ULL,
+ 0xd2d2000000d2d2d2ULL, 0xa0a0000000a0a0a0ULL, 0x5555000000555555ULL,
+ 0xa1a1000000a1a1a1ULL, 0x4141000000414141ULL, 0xfafa000000fafafaULL,
+ 0x4343000000434343ULL, 0x1313000000131313ULL, 0xc4c4000000c4c4c4ULL,
+ 0x2f2f0000002f2f2fULL, 0xa8a8000000a8a8a8ULL, 0xb6b6000000b6b6b6ULL,
+ 0x3c3c0000003c3c3cULL, 0x2b2b0000002b2b2bULL, 0xc1c1000000c1c1c1ULL,
+ 0xffff000000ffffffULL, 0xc8c8000000c8c8c8ULL, 0xa5a5000000a5a5a5ULL,
+ 0x2020000000202020ULL, 0x8989000000898989ULL, 0x0000000000000000ULL,
+ 0x9090000000909090ULL, 0x4747000000474747ULL, 0xefef000000efefefULL,
+ 0xeaea000000eaeaeaULL, 0xb7b7000000b7b7b7ULL, 0x1515000000151515ULL,
+ 0x0606000000060606ULL, 0xcdcd000000cdcdcdULL, 0xb5b5000000b5b5b5ULL,
+ 0x1212000000121212ULL, 0x7e7e0000007e7e7eULL, 0xbbbb000000bbbbbbULL,
+ 0x2929000000292929ULL, 0x0f0f0000000f0f0fULL, 0xb8b8000000b8b8b8ULL,
+ 0x0707000000070707ULL, 0x0404000000040404ULL, 0x9b9b0000009b9b9bULL,
+ 0x9494000000949494ULL, 0x2121000000212121ULL, 0x6666000000666666ULL,
+ 0xe6e6000000e6e6e6ULL, 0xcece000000cececeULL, 0xeded000000edededULL,
+ 0xe7e7000000e7e7e7ULL, 0x3b3b0000003b3b3bULL, 0xfefe000000fefefeULL,
+ 0x7f7f0000007f7f7fULL, 0xc5c5000000c5c5c5ULL, 0xa4a4000000a4a4a4ULL,
+ 0x3737000000373737ULL, 0xb1b1000000b1b1b1ULL, 0x4c4c0000004c4c4cULL,
+ 0x9191000000919191ULL, 0x6e6e0000006e6e6eULL, 0x8d8d0000008d8d8dULL,
+ 0x7676000000767676ULL, 0x0303000000030303ULL, 0x2d2d0000002d2d2dULL,
+ 0xdede000000dededeULL, 0x9696000000969696ULL, 0x2626000000262626ULL,
+ 0x7d7d0000007d7d7dULL, 0xc6c6000000c6c6c6ULL, 0x5c5c0000005c5c5cULL,
+ 0xd3d3000000d3d3d3ULL, 0xf2f2000000f2f2f2ULL, 0x4f4f0000004f4f4fULL,
+ 0x1919000000191919ULL, 0x3f3f0000003f3f3fULL, 0xdcdc000000dcdcdcULL,
+ 0x7979000000797979ULL, 0x1d1d0000001d1d1dULL, 0x5252000000525252ULL,
+ 0xebeb000000ebebebULL, 0xf3f3000000f3f3f3ULL, 0x6d6d0000006d6d6dULL,
+ 0x5e5e0000005e5e5eULL, 0xfbfb000000fbfbfbULL, 0x6969000000696969ULL,
+ 0xb2b2000000b2b2b2ULL, 0xf0f0000000f0f0f0ULL, 0x3131000000313131ULL,
+ 0x0c0c0000000c0c0cULL, 0xd4d4000000d4d4d4ULL, 0xcfcf000000cfcfcfULL,
+ 0x8c8c0000008c8c8cULL, 0xe2e2000000e2e2e2ULL, 0x7575000000757575ULL,
+ 0xa9a9000000a9a9a9ULL, 0x4a4a0000004a4a4aULL, 0x5757000000575757ULL,
+ 0x8484000000848484ULL, 0x1111000000111111ULL, 0x4545000000454545ULL,
+ 0x1b1b0000001b1b1bULL, 0xf5f5000000f5f5f5ULL, 0xe4e4000000e4e4e4ULL,
+ 0x0e0e0000000e0e0eULL, 0x7373000000737373ULL, 0xaaaa000000aaaaaaULL,
+ 0xf1f1000000f1f1f1ULL, 0xdddd000000ddddddULL, 0x5959000000595959ULL,
+ 0x1414000000141414ULL, 0x6c6c0000006c6c6cULL, 0x9292000000929292ULL,
+ 0x5454000000545454ULL, 0xd0d0000000d0d0d0ULL, 0x7878000000787878ULL,
+ 0x7070000000707070ULL, 0xe3e3000000e3e3e3ULL, 0x4949000000494949ULL,
+ 0x8080000000808080ULL, 0x5050000000505050ULL, 0xa7a7000000a7a7a7ULL,
+ 0xf6f6000000f6f6f6ULL, 0x7777000000777777ULL, 0x9393000000939393ULL,
+ 0x8686000000868686ULL, 0x8383000000838383ULL, 0x2a2a0000002a2a2aULL,
+ 0xc7c7000000c7c7c7ULL, 0x5b5b0000005b5b5bULL, 0xe9e9000000e9e9e9ULL,
+ 0xeeee000000eeeeeeULL, 0x8f8f0000008f8f8fULL, 0x0101000000010101ULL,
+ 0x3d3d0000003d3d3dULL,
+};
+
+__visible const u64 camellia_sp03303033[256] = {
+ 0x0038380038003838ULL, 0x0041410041004141ULL, 0x0016160016001616ULL,
+ 0x0076760076007676ULL, 0x00d9d900d900d9d9ULL, 0x0093930093009393ULL,
+ 0x0060600060006060ULL, 0x00f2f200f200f2f2ULL, 0x0072720072007272ULL,
+ 0x00c2c200c200c2c2ULL, 0x00abab00ab00ababULL, 0x009a9a009a009a9aULL,
+ 0x0075750075007575ULL, 0x0006060006000606ULL, 0x0057570057005757ULL,
+ 0x00a0a000a000a0a0ULL, 0x0091910091009191ULL, 0x00f7f700f700f7f7ULL,
+ 0x00b5b500b500b5b5ULL, 0x00c9c900c900c9c9ULL, 0x00a2a200a200a2a2ULL,
+ 0x008c8c008c008c8cULL, 0x00d2d200d200d2d2ULL, 0x0090900090009090ULL,
+ 0x00f6f600f600f6f6ULL, 0x0007070007000707ULL, 0x00a7a700a700a7a7ULL,
+ 0x0027270027002727ULL, 0x008e8e008e008e8eULL, 0x00b2b200b200b2b2ULL,
+ 0x0049490049004949ULL, 0x00dede00de00dedeULL, 0x0043430043004343ULL,
+ 0x005c5c005c005c5cULL, 0x00d7d700d700d7d7ULL, 0x00c7c700c700c7c7ULL,
+ 0x003e3e003e003e3eULL, 0x00f5f500f500f5f5ULL, 0x008f8f008f008f8fULL,
+ 0x0067670067006767ULL, 0x001f1f001f001f1fULL, 0x0018180018001818ULL,
+ 0x006e6e006e006e6eULL, 0x00afaf00af00afafULL, 0x002f2f002f002f2fULL,
+ 0x00e2e200e200e2e2ULL, 0x0085850085008585ULL, 0x000d0d000d000d0dULL,
+ 0x0053530053005353ULL, 0x00f0f000f000f0f0ULL, 0x009c9c009c009c9cULL,
+ 0x0065650065006565ULL, 0x00eaea00ea00eaeaULL, 0x00a3a300a300a3a3ULL,
+ 0x00aeae00ae00aeaeULL, 0x009e9e009e009e9eULL, 0x00ecec00ec00ececULL,
+ 0x0080800080008080ULL, 0x002d2d002d002d2dULL, 0x006b6b006b006b6bULL,
+ 0x00a8a800a800a8a8ULL, 0x002b2b002b002b2bULL, 0x0036360036003636ULL,
+ 0x00a6a600a600a6a6ULL, 0x00c5c500c500c5c5ULL, 0x0086860086008686ULL,
+ 0x004d4d004d004d4dULL, 0x0033330033003333ULL, 0x00fdfd00fd00fdfdULL,
+ 0x0066660066006666ULL, 0x0058580058005858ULL, 0x0096960096009696ULL,
+ 0x003a3a003a003a3aULL, 0x0009090009000909ULL, 0x0095950095009595ULL,
+ 0x0010100010001010ULL, 0x0078780078007878ULL, 0x00d8d800d800d8d8ULL,
+ 0x0042420042004242ULL, 0x00cccc00cc00ccccULL, 0x00efef00ef00efefULL,
+ 0x0026260026002626ULL, 0x00e5e500e500e5e5ULL, 0x0061610061006161ULL,
+ 0x001a1a001a001a1aULL, 0x003f3f003f003f3fULL, 0x003b3b003b003b3bULL,
+ 0x0082820082008282ULL, 0x00b6b600b600b6b6ULL, 0x00dbdb00db00dbdbULL,
+ 0x00d4d400d400d4d4ULL, 0x0098980098009898ULL, 0x00e8e800e800e8e8ULL,
+ 0x008b8b008b008b8bULL, 0x0002020002000202ULL, 0x00ebeb00eb00ebebULL,
+ 0x000a0a000a000a0aULL, 0x002c2c002c002c2cULL, 0x001d1d001d001d1dULL,
+ 0x00b0b000b000b0b0ULL, 0x006f6f006f006f6fULL, 0x008d8d008d008d8dULL,
+ 0x0088880088008888ULL, 0x000e0e000e000e0eULL, 0x0019190019001919ULL,
+ 0x0087870087008787ULL, 0x004e4e004e004e4eULL, 0x000b0b000b000b0bULL,
+ 0x00a9a900a900a9a9ULL, 0x000c0c000c000c0cULL, 0x0079790079007979ULL,
+ 0x0011110011001111ULL, 0x007f7f007f007f7fULL, 0x0022220022002222ULL,
+ 0x00e7e700e700e7e7ULL, 0x0059590059005959ULL, 0x00e1e100e100e1e1ULL,
+ 0x00dada00da00dadaULL, 0x003d3d003d003d3dULL, 0x00c8c800c800c8c8ULL,
+ 0x0012120012001212ULL, 0x0004040004000404ULL, 0x0074740074007474ULL,
+ 0x0054540054005454ULL, 0x0030300030003030ULL, 0x007e7e007e007e7eULL,
+ 0x00b4b400b400b4b4ULL, 0x0028280028002828ULL, 0x0055550055005555ULL,
+ 0x0068680068006868ULL, 0x0050500050005050ULL, 0x00bebe00be00bebeULL,
+ 0x00d0d000d000d0d0ULL, 0x00c4c400c400c4c4ULL, 0x0031310031003131ULL,
+ 0x00cbcb00cb00cbcbULL, 0x002a2a002a002a2aULL, 0x00adad00ad00adadULL,
+ 0x000f0f000f000f0fULL, 0x00caca00ca00cacaULL, 0x0070700070007070ULL,
+ 0x00ffff00ff00ffffULL, 0x0032320032003232ULL, 0x0069690069006969ULL,
+ 0x0008080008000808ULL, 0x0062620062006262ULL, 0x0000000000000000ULL,
+ 0x0024240024002424ULL, 0x00d1d100d100d1d1ULL, 0x00fbfb00fb00fbfbULL,
+ 0x00baba00ba00babaULL, 0x00eded00ed00ededULL, 0x0045450045004545ULL,
+ 0x0081810081008181ULL, 0x0073730073007373ULL, 0x006d6d006d006d6dULL,
+ 0x0084840084008484ULL, 0x009f9f009f009f9fULL, 0x00eeee00ee00eeeeULL,
+ 0x004a4a004a004a4aULL, 0x00c3c300c300c3c3ULL, 0x002e2e002e002e2eULL,
+ 0x00c1c100c100c1c1ULL, 0x0001010001000101ULL, 0x00e6e600e600e6e6ULL,
+ 0x0025250025002525ULL, 0x0048480048004848ULL, 0x0099990099009999ULL,
+ 0x00b9b900b900b9b9ULL, 0x00b3b300b300b3b3ULL, 0x007b7b007b007b7bULL,
+ 0x00f9f900f900f9f9ULL, 0x00cece00ce00ceceULL, 0x00bfbf00bf00bfbfULL,
+ 0x00dfdf00df00dfdfULL, 0x0071710071007171ULL, 0x0029290029002929ULL,
+ 0x00cdcd00cd00cdcdULL, 0x006c6c006c006c6cULL, 0x0013130013001313ULL,
+ 0x0064640064006464ULL, 0x009b9b009b009b9bULL, 0x0063630063006363ULL,
+ 0x009d9d009d009d9dULL, 0x00c0c000c000c0c0ULL, 0x004b4b004b004b4bULL,
+ 0x00b7b700b700b7b7ULL, 0x00a5a500a500a5a5ULL, 0x0089890089008989ULL,
+ 0x005f5f005f005f5fULL, 0x00b1b100b100b1b1ULL, 0x0017170017001717ULL,
+ 0x00f4f400f400f4f4ULL, 0x00bcbc00bc00bcbcULL, 0x00d3d300d300d3d3ULL,
+ 0x0046460046004646ULL, 0x00cfcf00cf00cfcfULL, 0x0037370037003737ULL,
+ 0x005e5e005e005e5eULL, 0x0047470047004747ULL, 0x0094940094009494ULL,
+ 0x00fafa00fa00fafaULL, 0x00fcfc00fc00fcfcULL, 0x005b5b005b005b5bULL,
+ 0x0097970097009797ULL, 0x00fefe00fe00fefeULL, 0x005a5a005a005a5aULL,
+ 0x00acac00ac00acacULL, 0x003c3c003c003c3cULL, 0x004c4c004c004c4cULL,
+ 0x0003030003000303ULL, 0x0035350035003535ULL, 0x00f3f300f300f3f3ULL,
+ 0x0023230023002323ULL, 0x00b8b800b800b8b8ULL, 0x005d5d005d005d5dULL,
+ 0x006a6a006a006a6aULL, 0x0092920092009292ULL, 0x00d5d500d500d5d5ULL,
+ 0x0021210021002121ULL, 0x0044440044004444ULL, 0x0051510051005151ULL,
+ 0x00c6c600c600c6c6ULL, 0x007d7d007d007d7dULL, 0x0039390039003939ULL,
+ 0x0083830083008383ULL, 0x00dcdc00dc00dcdcULL, 0x00aaaa00aa00aaaaULL,
+ 0x007c7c007c007c7cULL, 0x0077770077007777ULL, 0x0056560056005656ULL,
+ 0x0005050005000505ULL, 0x001b1b001b001b1bULL, 0x00a4a400a400a4a4ULL,
+ 0x0015150015001515ULL, 0x0034340034003434ULL, 0x001e1e001e001e1eULL,
+ 0x001c1c001c001c1cULL, 0x00f8f800f800f8f8ULL, 0x0052520052005252ULL,
+ 0x0020200020002020ULL, 0x0014140014001414ULL, 0x00e9e900e900e9e9ULL,
+ 0x00bdbd00bd00bdbdULL, 0x00dddd00dd00ddddULL, 0x00e4e400e400e4e4ULL,
+ 0x00a1a100a100a1a1ULL, 0x00e0e000e000e0e0ULL, 0x008a8a008a008a8aULL,
+ 0x00f1f100f100f1f1ULL, 0x00d6d600d600d6d6ULL, 0x007a7a007a007a7aULL,
+ 0x00bbbb00bb00bbbbULL, 0x00e3e300e300e3e3ULL, 0x0040400040004040ULL,
+ 0x004f4f004f004f4fULL,
+};
+
+__visible const u64 camellia_sp00444404[256] = {
+ 0x0000707070700070ULL, 0x00002c2c2c2c002cULL, 0x0000b3b3b3b300b3ULL,
+ 0x0000c0c0c0c000c0ULL, 0x0000e4e4e4e400e4ULL, 0x0000575757570057ULL,
+ 0x0000eaeaeaea00eaULL, 0x0000aeaeaeae00aeULL, 0x0000232323230023ULL,
+ 0x00006b6b6b6b006bULL, 0x0000454545450045ULL, 0x0000a5a5a5a500a5ULL,
+ 0x0000edededed00edULL, 0x00004f4f4f4f004fULL, 0x00001d1d1d1d001dULL,
+ 0x0000929292920092ULL, 0x0000868686860086ULL, 0x0000afafafaf00afULL,
+ 0x00007c7c7c7c007cULL, 0x00001f1f1f1f001fULL, 0x00003e3e3e3e003eULL,
+ 0x0000dcdcdcdc00dcULL, 0x00005e5e5e5e005eULL, 0x00000b0b0b0b000bULL,
+ 0x0000a6a6a6a600a6ULL, 0x0000393939390039ULL, 0x0000d5d5d5d500d5ULL,
+ 0x00005d5d5d5d005dULL, 0x0000d9d9d9d900d9ULL, 0x00005a5a5a5a005aULL,
+ 0x0000515151510051ULL, 0x00006c6c6c6c006cULL, 0x00008b8b8b8b008bULL,
+ 0x00009a9a9a9a009aULL, 0x0000fbfbfbfb00fbULL, 0x0000b0b0b0b000b0ULL,
+ 0x0000747474740074ULL, 0x00002b2b2b2b002bULL, 0x0000f0f0f0f000f0ULL,
+ 0x0000848484840084ULL, 0x0000dfdfdfdf00dfULL, 0x0000cbcbcbcb00cbULL,
+ 0x0000343434340034ULL, 0x0000767676760076ULL, 0x00006d6d6d6d006dULL,
+ 0x0000a9a9a9a900a9ULL, 0x0000d1d1d1d100d1ULL, 0x0000040404040004ULL,
+ 0x0000141414140014ULL, 0x00003a3a3a3a003aULL, 0x0000dededede00deULL,
+ 0x0000111111110011ULL, 0x0000323232320032ULL, 0x00009c9c9c9c009cULL,
+ 0x0000535353530053ULL, 0x0000f2f2f2f200f2ULL, 0x0000fefefefe00feULL,
+ 0x0000cfcfcfcf00cfULL, 0x0000c3c3c3c300c3ULL, 0x00007a7a7a7a007aULL,
+ 0x0000242424240024ULL, 0x0000e8e8e8e800e8ULL, 0x0000606060600060ULL,
+ 0x0000696969690069ULL, 0x0000aaaaaaaa00aaULL, 0x0000a0a0a0a000a0ULL,
+ 0x0000a1a1a1a100a1ULL, 0x0000626262620062ULL, 0x0000545454540054ULL,
+ 0x00001e1e1e1e001eULL, 0x0000e0e0e0e000e0ULL, 0x0000646464640064ULL,
+ 0x0000101010100010ULL, 0x0000000000000000ULL, 0x0000a3a3a3a300a3ULL,
+ 0x0000757575750075ULL, 0x00008a8a8a8a008aULL, 0x0000e6e6e6e600e6ULL,
+ 0x0000090909090009ULL, 0x0000dddddddd00ddULL, 0x0000878787870087ULL,
+ 0x0000838383830083ULL, 0x0000cdcdcdcd00cdULL, 0x0000909090900090ULL,
+ 0x0000737373730073ULL, 0x0000f6f6f6f600f6ULL, 0x00009d9d9d9d009dULL,
+ 0x0000bfbfbfbf00bfULL, 0x0000525252520052ULL, 0x0000d8d8d8d800d8ULL,
+ 0x0000c8c8c8c800c8ULL, 0x0000c6c6c6c600c6ULL, 0x0000818181810081ULL,
+ 0x00006f6f6f6f006fULL, 0x0000131313130013ULL, 0x0000636363630063ULL,
+ 0x0000e9e9e9e900e9ULL, 0x0000a7a7a7a700a7ULL, 0x00009f9f9f9f009fULL,
+ 0x0000bcbcbcbc00bcULL, 0x0000292929290029ULL, 0x0000f9f9f9f900f9ULL,
+ 0x00002f2f2f2f002fULL, 0x0000b4b4b4b400b4ULL, 0x0000787878780078ULL,
+ 0x0000060606060006ULL, 0x0000e7e7e7e700e7ULL, 0x0000717171710071ULL,
+ 0x0000d4d4d4d400d4ULL, 0x0000abababab00abULL, 0x0000888888880088ULL,
+ 0x00008d8d8d8d008dULL, 0x0000727272720072ULL, 0x0000b9b9b9b900b9ULL,
+ 0x0000f8f8f8f800f8ULL, 0x0000acacacac00acULL, 0x0000363636360036ULL,
+ 0x00002a2a2a2a002aULL, 0x00003c3c3c3c003cULL, 0x0000f1f1f1f100f1ULL,
+ 0x0000404040400040ULL, 0x0000d3d3d3d300d3ULL, 0x0000bbbbbbbb00bbULL,
+ 0x0000434343430043ULL, 0x0000151515150015ULL, 0x0000adadadad00adULL,
+ 0x0000777777770077ULL, 0x0000808080800080ULL, 0x0000828282820082ULL,
+ 0x0000ecececec00ecULL, 0x0000272727270027ULL, 0x0000e5e5e5e500e5ULL,
+ 0x0000858585850085ULL, 0x0000353535350035ULL, 0x00000c0c0c0c000cULL,
+ 0x0000414141410041ULL, 0x0000efefefef00efULL, 0x0000939393930093ULL,
+ 0x0000191919190019ULL, 0x0000212121210021ULL, 0x00000e0e0e0e000eULL,
+ 0x00004e4e4e4e004eULL, 0x0000656565650065ULL, 0x0000bdbdbdbd00bdULL,
+ 0x0000b8b8b8b800b8ULL, 0x00008f8f8f8f008fULL, 0x0000ebebebeb00ebULL,
+ 0x0000cececece00ceULL, 0x0000303030300030ULL, 0x00005f5f5f5f005fULL,
+ 0x0000c5c5c5c500c5ULL, 0x00001a1a1a1a001aULL, 0x0000e1e1e1e100e1ULL,
+ 0x0000cacacaca00caULL, 0x0000474747470047ULL, 0x00003d3d3d3d003dULL,
+ 0x0000010101010001ULL, 0x0000d6d6d6d600d6ULL, 0x0000565656560056ULL,
+ 0x00004d4d4d4d004dULL, 0x00000d0d0d0d000dULL, 0x0000666666660066ULL,
+ 0x0000cccccccc00ccULL, 0x00002d2d2d2d002dULL, 0x0000121212120012ULL,
+ 0x0000202020200020ULL, 0x0000b1b1b1b100b1ULL, 0x0000999999990099ULL,
+ 0x00004c4c4c4c004cULL, 0x0000c2c2c2c200c2ULL, 0x00007e7e7e7e007eULL,
+ 0x0000050505050005ULL, 0x0000b7b7b7b700b7ULL, 0x0000313131310031ULL,
+ 0x0000171717170017ULL, 0x0000d7d7d7d700d7ULL, 0x0000585858580058ULL,
+ 0x0000616161610061ULL, 0x00001b1b1b1b001bULL, 0x00001c1c1c1c001cULL,
+ 0x00000f0f0f0f000fULL, 0x0000161616160016ULL, 0x0000181818180018ULL,
+ 0x0000222222220022ULL, 0x0000444444440044ULL, 0x0000b2b2b2b200b2ULL,
+ 0x0000b5b5b5b500b5ULL, 0x0000919191910091ULL, 0x0000080808080008ULL,
+ 0x0000a8a8a8a800a8ULL, 0x0000fcfcfcfc00fcULL, 0x0000505050500050ULL,
+ 0x0000d0d0d0d000d0ULL, 0x00007d7d7d7d007dULL, 0x0000898989890089ULL,
+ 0x0000979797970097ULL, 0x00005b5b5b5b005bULL, 0x0000959595950095ULL,
+ 0x0000ffffffff00ffULL, 0x0000d2d2d2d200d2ULL, 0x0000c4c4c4c400c4ULL,
+ 0x0000484848480048ULL, 0x0000f7f7f7f700f7ULL, 0x0000dbdbdbdb00dbULL,
+ 0x0000030303030003ULL, 0x0000dadadada00daULL, 0x00003f3f3f3f003fULL,
+ 0x0000949494940094ULL, 0x00005c5c5c5c005cULL, 0x0000020202020002ULL,
+ 0x00004a4a4a4a004aULL, 0x0000333333330033ULL, 0x0000676767670067ULL,
+ 0x0000f3f3f3f300f3ULL, 0x00007f7f7f7f007fULL, 0x0000e2e2e2e200e2ULL,
+ 0x00009b9b9b9b009bULL, 0x0000262626260026ULL, 0x0000373737370037ULL,
+ 0x00003b3b3b3b003bULL, 0x0000969696960096ULL, 0x00004b4b4b4b004bULL,
+ 0x0000bebebebe00beULL, 0x00002e2e2e2e002eULL, 0x0000797979790079ULL,
+ 0x00008c8c8c8c008cULL, 0x00006e6e6e6e006eULL, 0x00008e8e8e8e008eULL,
+ 0x0000f5f5f5f500f5ULL, 0x0000b6b6b6b600b6ULL, 0x0000fdfdfdfd00fdULL,
+ 0x0000595959590059ULL, 0x0000989898980098ULL, 0x00006a6a6a6a006aULL,
+ 0x0000464646460046ULL, 0x0000babababa00baULL, 0x0000252525250025ULL,
+ 0x0000424242420042ULL, 0x0000a2a2a2a200a2ULL, 0x0000fafafafa00faULL,
+ 0x0000070707070007ULL, 0x0000555555550055ULL, 0x0000eeeeeeee00eeULL,
+ 0x00000a0a0a0a000aULL, 0x0000494949490049ULL, 0x0000686868680068ULL,
+ 0x0000383838380038ULL, 0x0000a4a4a4a400a4ULL, 0x0000282828280028ULL,
+ 0x00007b7b7b7b007bULL, 0x0000c9c9c9c900c9ULL, 0x0000c1c1c1c100c1ULL,
+ 0x0000e3e3e3e300e3ULL, 0x0000f4f4f4f400f4ULL, 0x0000c7c7c7c700c7ULL,
+ 0x00009e9e9e9e009eULL,
+};
+
+__visible const u64 camellia_sp02220222[256] = {
+ 0x00e0e0e000e0e0e0ULL, 0x0005050500050505ULL, 0x0058585800585858ULL,
+ 0x00d9d9d900d9d9d9ULL, 0x0067676700676767ULL, 0x004e4e4e004e4e4eULL,
+ 0x0081818100818181ULL, 0x00cbcbcb00cbcbcbULL, 0x00c9c9c900c9c9c9ULL,
+ 0x000b0b0b000b0b0bULL, 0x00aeaeae00aeaeaeULL, 0x006a6a6a006a6a6aULL,
+ 0x00d5d5d500d5d5d5ULL, 0x0018181800181818ULL, 0x005d5d5d005d5d5dULL,
+ 0x0082828200828282ULL, 0x0046464600464646ULL, 0x00dfdfdf00dfdfdfULL,
+ 0x00d6d6d600d6d6d6ULL, 0x0027272700272727ULL, 0x008a8a8a008a8a8aULL,
+ 0x0032323200323232ULL, 0x004b4b4b004b4b4bULL, 0x0042424200424242ULL,
+ 0x00dbdbdb00dbdbdbULL, 0x001c1c1c001c1c1cULL, 0x009e9e9e009e9e9eULL,
+ 0x009c9c9c009c9c9cULL, 0x003a3a3a003a3a3aULL, 0x00cacaca00cacacaULL,
+ 0x0025252500252525ULL, 0x007b7b7b007b7b7bULL, 0x000d0d0d000d0d0dULL,
+ 0x0071717100717171ULL, 0x005f5f5f005f5f5fULL, 0x001f1f1f001f1f1fULL,
+ 0x00f8f8f800f8f8f8ULL, 0x00d7d7d700d7d7d7ULL, 0x003e3e3e003e3e3eULL,
+ 0x009d9d9d009d9d9dULL, 0x007c7c7c007c7c7cULL, 0x0060606000606060ULL,
+ 0x00b9b9b900b9b9b9ULL, 0x00bebebe00bebebeULL, 0x00bcbcbc00bcbcbcULL,
+ 0x008b8b8b008b8b8bULL, 0x0016161600161616ULL, 0x0034343400343434ULL,
+ 0x004d4d4d004d4d4dULL, 0x00c3c3c300c3c3c3ULL, 0x0072727200727272ULL,
+ 0x0095959500959595ULL, 0x00ababab00abababULL, 0x008e8e8e008e8e8eULL,
+ 0x00bababa00bababaULL, 0x007a7a7a007a7a7aULL, 0x00b3b3b300b3b3b3ULL,
+ 0x0002020200020202ULL, 0x00b4b4b400b4b4b4ULL, 0x00adadad00adadadULL,
+ 0x00a2a2a200a2a2a2ULL, 0x00acacac00acacacULL, 0x00d8d8d800d8d8d8ULL,
+ 0x009a9a9a009a9a9aULL, 0x0017171700171717ULL, 0x001a1a1a001a1a1aULL,
+ 0x0035353500353535ULL, 0x00cccccc00ccccccULL, 0x00f7f7f700f7f7f7ULL,
+ 0x0099999900999999ULL, 0x0061616100616161ULL, 0x005a5a5a005a5a5aULL,
+ 0x00e8e8e800e8e8e8ULL, 0x0024242400242424ULL, 0x0056565600565656ULL,
+ 0x0040404000404040ULL, 0x00e1e1e100e1e1e1ULL, 0x0063636300636363ULL,
+ 0x0009090900090909ULL, 0x0033333300333333ULL, 0x00bfbfbf00bfbfbfULL,
+ 0x0098989800989898ULL, 0x0097979700979797ULL, 0x0085858500858585ULL,
+ 0x0068686800686868ULL, 0x00fcfcfc00fcfcfcULL, 0x00ececec00ecececULL,
+ 0x000a0a0a000a0a0aULL, 0x00dadada00dadadaULL, 0x006f6f6f006f6f6fULL,
+ 0x0053535300535353ULL, 0x0062626200626262ULL, 0x00a3a3a300a3a3a3ULL,
+ 0x002e2e2e002e2e2eULL, 0x0008080800080808ULL, 0x00afafaf00afafafULL,
+ 0x0028282800282828ULL, 0x00b0b0b000b0b0b0ULL, 0x0074747400747474ULL,
+ 0x00c2c2c200c2c2c2ULL, 0x00bdbdbd00bdbdbdULL, 0x0036363600363636ULL,
+ 0x0022222200222222ULL, 0x0038383800383838ULL, 0x0064646400646464ULL,
+ 0x001e1e1e001e1e1eULL, 0x0039393900393939ULL, 0x002c2c2c002c2c2cULL,
+ 0x00a6a6a600a6a6a6ULL, 0x0030303000303030ULL, 0x00e5e5e500e5e5e5ULL,
+ 0x0044444400444444ULL, 0x00fdfdfd00fdfdfdULL, 0x0088888800888888ULL,
+ 0x009f9f9f009f9f9fULL, 0x0065656500656565ULL, 0x0087878700878787ULL,
+ 0x006b6b6b006b6b6bULL, 0x00f4f4f400f4f4f4ULL, 0x0023232300232323ULL,
+ 0x0048484800484848ULL, 0x0010101000101010ULL, 0x00d1d1d100d1d1d1ULL,
+ 0x0051515100515151ULL, 0x00c0c0c000c0c0c0ULL, 0x00f9f9f900f9f9f9ULL,
+ 0x00d2d2d200d2d2d2ULL, 0x00a0a0a000a0a0a0ULL, 0x0055555500555555ULL,
+ 0x00a1a1a100a1a1a1ULL, 0x0041414100414141ULL, 0x00fafafa00fafafaULL,
+ 0x0043434300434343ULL, 0x0013131300131313ULL, 0x00c4c4c400c4c4c4ULL,
+ 0x002f2f2f002f2f2fULL, 0x00a8a8a800a8a8a8ULL, 0x00b6b6b600b6b6b6ULL,
+ 0x003c3c3c003c3c3cULL, 0x002b2b2b002b2b2bULL, 0x00c1c1c100c1c1c1ULL,
+ 0x00ffffff00ffffffULL, 0x00c8c8c800c8c8c8ULL, 0x00a5a5a500a5a5a5ULL,
+ 0x0020202000202020ULL, 0x0089898900898989ULL, 0x0000000000000000ULL,
+ 0x0090909000909090ULL, 0x0047474700474747ULL, 0x00efefef00efefefULL,
+ 0x00eaeaea00eaeaeaULL, 0x00b7b7b700b7b7b7ULL, 0x0015151500151515ULL,
+ 0x0006060600060606ULL, 0x00cdcdcd00cdcdcdULL, 0x00b5b5b500b5b5b5ULL,
+ 0x0012121200121212ULL, 0x007e7e7e007e7e7eULL, 0x00bbbbbb00bbbbbbULL,
+ 0x0029292900292929ULL, 0x000f0f0f000f0f0fULL, 0x00b8b8b800b8b8b8ULL,
+ 0x0007070700070707ULL, 0x0004040400040404ULL, 0x009b9b9b009b9b9bULL,
+ 0x0094949400949494ULL, 0x0021212100212121ULL, 0x0066666600666666ULL,
+ 0x00e6e6e600e6e6e6ULL, 0x00cecece00cececeULL, 0x00ededed00edededULL,
+ 0x00e7e7e700e7e7e7ULL, 0x003b3b3b003b3b3bULL, 0x00fefefe00fefefeULL,
+ 0x007f7f7f007f7f7fULL, 0x00c5c5c500c5c5c5ULL, 0x00a4a4a400a4a4a4ULL,
+ 0x0037373700373737ULL, 0x00b1b1b100b1b1b1ULL, 0x004c4c4c004c4c4cULL,
+ 0x0091919100919191ULL, 0x006e6e6e006e6e6eULL, 0x008d8d8d008d8d8dULL,
+ 0x0076767600767676ULL, 0x0003030300030303ULL, 0x002d2d2d002d2d2dULL,
+ 0x00dedede00dededeULL, 0x0096969600969696ULL, 0x0026262600262626ULL,
+ 0x007d7d7d007d7d7dULL, 0x00c6c6c600c6c6c6ULL, 0x005c5c5c005c5c5cULL,
+ 0x00d3d3d300d3d3d3ULL, 0x00f2f2f200f2f2f2ULL, 0x004f4f4f004f4f4fULL,
+ 0x0019191900191919ULL, 0x003f3f3f003f3f3fULL, 0x00dcdcdc00dcdcdcULL,
+ 0x0079797900797979ULL, 0x001d1d1d001d1d1dULL, 0x0052525200525252ULL,
+ 0x00ebebeb00ebebebULL, 0x00f3f3f300f3f3f3ULL, 0x006d6d6d006d6d6dULL,
+ 0x005e5e5e005e5e5eULL, 0x00fbfbfb00fbfbfbULL, 0x0069696900696969ULL,
+ 0x00b2b2b200b2b2b2ULL, 0x00f0f0f000f0f0f0ULL, 0x0031313100313131ULL,
+ 0x000c0c0c000c0c0cULL, 0x00d4d4d400d4d4d4ULL, 0x00cfcfcf00cfcfcfULL,
+ 0x008c8c8c008c8c8cULL, 0x00e2e2e200e2e2e2ULL, 0x0075757500757575ULL,
+ 0x00a9a9a900a9a9a9ULL, 0x004a4a4a004a4a4aULL, 0x0057575700575757ULL,
+ 0x0084848400848484ULL, 0x0011111100111111ULL, 0x0045454500454545ULL,
+ 0x001b1b1b001b1b1bULL, 0x00f5f5f500f5f5f5ULL, 0x00e4e4e400e4e4e4ULL,
+ 0x000e0e0e000e0e0eULL, 0x0073737300737373ULL, 0x00aaaaaa00aaaaaaULL,
+ 0x00f1f1f100f1f1f1ULL, 0x00dddddd00ddddddULL, 0x0059595900595959ULL,
+ 0x0014141400141414ULL, 0x006c6c6c006c6c6cULL, 0x0092929200929292ULL,
+ 0x0054545400545454ULL, 0x00d0d0d000d0d0d0ULL, 0x0078787800787878ULL,
+ 0x0070707000707070ULL, 0x00e3e3e300e3e3e3ULL, 0x0049494900494949ULL,
+ 0x0080808000808080ULL, 0x0050505000505050ULL, 0x00a7a7a700a7a7a7ULL,
+ 0x00f6f6f600f6f6f6ULL, 0x0077777700777777ULL, 0x0093939300939393ULL,
+ 0x0086868600868686ULL, 0x0083838300838383ULL, 0x002a2a2a002a2a2aULL,
+ 0x00c7c7c700c7c7c7ULL, 0x005b5b5b005b5b5bULL, 0x00e9e9e900e9e9e9ULL,
+ 0x00eeeeee00eeeeeeULL, 0x008f8f8f008f8f8fULL, 0x0001010100010101ULL,
+ 0x003d3d3d003d3d3dULL,
+};
+
+__visible const u64 camellia_sp30333033[256] = {
+ 0x3800383838003838ULL, 0x4100414141004141ULL, 0x1600161616001616ULL,
+ 0x7600767676007676ULL, 0xd900d9d9d900d9d9ULL, 0x9300939393009393ULL,
+ 0x6000606060006060ULL, 0xf200f2f2f200f2f2ULL, 0x7200727272007272ULL,
+ 0xc200c2c2c200c2c2ULL, 0xab00ababab00ababULL, 0x9a009a9a9a009a9aULL,
+ 0x7500757575007575ULL, 0x0600060606000606ULL, 0x5700575757005757ULL,
+ 0xa000a0a0a000a0a0ULL, 0x9100919191009191ULL, 0xf700f7f7f700f7f7ULL,
+ 0xb500b5b5b500b5b5ULL, 0xc900c9c9c900c9c9ULL, 0xa200a2a2a200a2a2ULL,
+ 0x8c008c8c8c008c8cULL, 0xd200d2d2d200d2d2ULL, 0x9000909090009090ULL,
+ 0xf600f6f6f600f6f6ULL, 0x0700070707000707ULL, 0xa700a7a7a700a7a7ULL,
+ 0x2700272727002727ULL, 0x8e008e8e8e008e8eULL, 0xb200b2b2b200b2b2ULL,
+ 0x4900494949004949ULL, 0xde00dedede00dedeULL, 0x4300434343004343ULL,
+ 0x5c005c5c5c005c5cULL, 0xd700d7d7d700d7d7ULL, 0xc700c7c7c700c7c7ULL,
+ 0x3e003e3e3e003e3eULL, 0xf500f5f5f500f5f5ULL, 0x8f008f8f8f008f8fULL,
+ 0x6700676767006767ULL, 0x1f001f1f1f001f1fULL, 0x1800181818001818ULL,
+ 0x6e006e6e6e006e6eULL, 0xaf00afafaf00afafULL, 0x2f002f2f2f002f2fULL,
+ 0xe200e2e2e200e2e2ULL, 0x8500858585008585ULL, 0x0d000d0d0d000d0dULL,
+ 0x5300535353005353ULL, 0xf000f0f0f000f0f0ULL, 0x9c009c9c9c009c9cULL,
+ 0x6500656565006565ULL, 0xea00eaeaea00eaeaULL, 0xa300a3a3a300a3a3ULL,
+ 0xae00aeaeae00aeaeULL, 0x9e009e9e9e009e9eULL, 0xec00ececec00ececULL,
+ 0x8000808080008080ULL, 0x2d002d2d2d002d2dULL, 0x6b006b6b6b006b6bULL,
+ 0xa800a8a8a800a8a8ULL, 0x2b002b2b2b002b2bULL, 0x3600363636003636ULL,
+ 0xa600a6a6a600a6a6ULL, 0xc500c5c5c500c5c5ULL, 0x8600868686008686ULL,
+ 0x4d004d4d4d004d4dULL, 0x3300333333003333ULL, 0xfd00fdfdfd00fdfdULL,
+ 0x6600666666006666ULL, 0x5800585858005858ULL, 0x9600969696009696ULL,
+ 0x3a003a3a3a003a3aULL, 0x0900090909000909ULL, 0x9500959595009595ULL,
+ 0x1000101010001010ULL, 0x7800787878007878ULL, 0xd800d8d8d800d8d8ULL,
+ 0x4200424242004242ULL, 0xcc00cccccc00ccccULL, 0xef00efefef00efefULL,
+ 0x2600262626002626ULL, 0xe500e5e5e500e5e5ULL, 0x6100616161006161ULL,
+ 0x1a001a1a1a001a1aULL, 0x3f003f3f3f003f3fULL, 0x3b003b3b3b003b3bULL,
+ 0x8200828282008282ULL, 0xb600b6b6b600b6b6ULL, 0xdb00dbdbdb00dbdbULL,
+ 0xd400d4d4d400d4d4ULL, 0x9800989898009898ULL, 0xe800e8e8e800e8e8ULL,
+ 0x8b008b8b8b008b8bULL, 0x0200020202000202ULL, 0xeb00ebebeb00ebebULL,
+ 0x0a000a0a0a000a0aULL, 0x2c002c2c2c002c2cULL, 0x1d001d1d1d001d1dULL,
+ 0xb000b0b0b000b0b0ULL, 0x6f006f6f6f006f6fULL, 0x8d008d8d8d008d8dULL,
+ 0x8800888888008888ULL, 0x0e000e0e0e000e0eULL, 0x1900191919001919ULL,
+ 0x8700878787008787ULL, 0x4e004e4e4e004e4eULL, 0x0b000b0b0b000b0bULL,
+ 0xa900a9a9a900a9a9ULL, 0x0c000c0c0c000c0cULL, 0x7900797979007979ULL,
+ 0x1100111111001111ULL, 0x7f007f7f7f007f7fULL, 0x2200222222002222ULL,
+ 0xe700e7e7e700e7e7ULL, 0x5900595959005959ULL, 0xe100e1e1e100e1e1ULL,
+ 0xda00dadada00dadaULL, 0x3d003d3d3d003d3dULL, 0xc800c8c8c800c8c8ULL,
+ 0x1200121212001212ULL, 0x0400040404000404ULL, 0x7400747474007474ULL,
+ 0x5400545454005454ULL, 0x3000303030003030ULL, 0x7e007e7e7e007e7eULL,
+ 0xb400b4b4b400b4b4ULL, 0x2800282828002828ULL, 0x5500555555005555ULL,
+ 0x6800686868006868ULL, 0x5000505050005050ULL, 0xbe00bebebe00bebeULL,
+ 0xd000d0d0d000d0d0ULL, 0xc400c4c4c400c4c4ULL, 0x3100313131003131ULL,
+ 0xcb00cbcbcb00cbcbULL, 0x2a002a2a2a002a2aULL, 0xad00adadad00adadULL,
+ 0x0f000f0f0f000f0fULL, 0xca00cacaca00cacaULL, 0x7000707070007070ULL,
+ 0xff00ffffff00ffffULL, 0x3200323232003232ULL, 0x6900696969006969ULL,
+ 0x0800080808000808ULL, 0x6200626262006262ULL, 0x0000000000000000ULL,
+ 0x2400242424002424ULL, 0xd100d1d1d100d1d1ULL, 0xfb00fbfbfb00fbfbULL,
+ 0xba00bababa00babaULL, 0xed00ededed00ededULL, 0x4500454545004545ULL,
+ 0x8100818181008181ULL, 0x7300737373007373ULL, 0x6d006d6d6d006d6dULL,
+ 0x8400848484008484ULL, 0x9f009f9f9f009f9fULL, 0xee00eeeeee00eeeeULL,
+ 0x4a004a4a4a004a4aULL, 0xc300c3c3c300c3c3ULL, 0x2e002e2e2e002e2eULL,
+ 0xc100c1c1c100c1c1ULL, 0x0100010101000101ULL, 0xe600e6e6e600e6e6ULL,
+ 0x2500252525002525ULL, 0x4800484848004848ULL, 0x9900999999009999ULL,
+ 0xb900b9b9b900b9b9ULL, 0xb300b3b3b300b3b3ULL, 0x7b007b7b7b007b7bULL,
+ 0xf900f9f9f900f9f9ULL, 0xce00cecece00ceceULL, 0xbf00bfbfbf00bfbfULL,
+ 0xdf00dfdfdf00dfdfULL, 0x7100717171007171ULL, 0x2900292929002929ULL,
+ 0xcd00cdcdcd00cdcdULL, 0x6c006c6c6c006c6cULL, 0x1300131313001313ULL,
+ 0x6400646464006464ULL, 0x9b009b9b9b009b9bULL, 0x6300636363006363ULL,
+ 0x9d009d9d9d009d9dULL, 0xc000c0c0c000c0c0ULL, 0x4b004b4b4b004b4bULL,
+ 0xb700b7b7b700b7b7ULL, 0xa500a5a5a500a5a5ULL, 0x8900898989008989ULL,
+ 0x5f005f5f5f005f5fULL, 0xb100b1b1b100b1b1ULL, 0x1700171717001717ULL,
+ 0xf400f4f4f400f4f4ULL, 0xbc00bcbcbc00bcbcULL, 0xd300d3d3d300d3d3ULL,
+ 0x4600464646004646ULL, 0xcf00cfcfcf00cfcfULL, 0x3700373737003737ULL,
+ 0x5e005e5e5e005e5eULL, 0x4700474747004747ULL, 0x9400949494009494ULL,
+ 0xfa00fafafa00fafaULL, 0xfc00fcfcfc00fcfcULL, 0x5b005b5b5b005b5bULL,
+ 0x9700979797009797ULL, 0xfe00fefefe00fefeULL, 0x5a005a5a5a005a5aULL,
+ 0xac00acacac00acacULL, 0x3c003c3c3c003c3cULL, 0x4c004c4c4c004c4cULL,
+ 0x0300030303000303ULL, 0x3500353535003535ULL, 0xf300f3f3f300f3f3ULL,
+ 0x2300232323002323ULL, 0xb800b8b8b800b8b8ULL, 0x5d005d5d5d005d5dULL,
+ 0x6a006a6a6a006a6aULL, 0x9200929292009292ULL, 0xd500d5d5d500d5d5ULL,
+ 0x2100212121002121ULL, 0x4400444444004444ULL, 0x5100515151005151ULL,
+ 0xc600c6c6c600c6c6ULL, 0x7d007d7d7d007d7dULL, 0x3900393939003939ULL,
+ 0x8300838383008383ULL, 0xdc00dcdcdc00dcdcULL, 0xaa00aaaaaa00aaaaULL,
+ 0x7c007c7c7c007c7cULL, 0x7700777777007777ULL, 0x5600565656005656ULL,
+ 0x0500050505000505ULL, 0x1b001b1b1b001b1bULL, 0xa400a4a4a400a4a4ULL,
+ 0x1500151515001515ULL, 0x3400343434003434ULL, 0x1e001e1e1e001e1eULL,
+ 0x1c001c1c1c001c1cULL, 0xf800f8f8f800f8f8ULL, 0x5200525252005252ULL,
+ 0x2000202020002020ULL, 0x1400141414001414ULL, 0xe900e9e9e900e9e9ULL,
+ 0xbd00bdbdbd00bdbdULL, 0xdd00dddddd00ddddULL, 0xe400e4e4e400e4e4ULL,
+ 0xa100a1a1a100a1a1ULL, 0xe000e0e0e000e0e0ULL, 0x8a008a8a8a008a8aULL,
+ 0xf100f1f1f100f1f1ULL, 0xd600d6d6d600d6d6ULL, 0x7a007a7a7a007a7aULL,
+ 0xbb00bbbbbb00bbbbULL, 0xe300e3e3e300e3e3ULL, 0x4000404040004040ULL,
+ 0x4f004f4f4f004f4fULL,
+};
+
+__visible const u64 camellia_sp44044404[256] = {
+ 0x7070007070700070ULL, 0x2c2c002c2c2c002cULL, 0xb3b300b3b3b300b3ULL,
+ 0xc0c000c0c0c000c0ULL, 0xe4e400e4e4e400e4ULL, 0x5757005757570057ULL,
+ 0xeaea00eaeaea00eaULL, 0xaeae00aeaeae00aeULL, 0x2323002323230023ULL,
+ 0x6b6b006b6b6b006bULL, 0x4545004545450045ULL, 0xa5a500a5a5a500a5ULL,
+ 0xeded00ededed00edULL, 0x4f4f004f4f4f004fULL, 0x1d1d001d1d1d001dULL,
+ 0x9292009292920092ULL, 0x8686008686860086ULL, 0xafaf00afafaf00afULL,
+ 0x7c7c007c7c7c007cULL, 0x1f1f001f1f1f001fULL, 0x3e3e003e3e3e003eULL,
+ 0xdcdc00dcdcdc00dcULL, 0x5e5e005e5e5e005eULL, 0x0b0b000b0b0b000bULL,
+ 0xa6a600a6a6a600a6ULL, 0x3939003939390039ULL, 0xd5d500d5d5d500d5ULL,
+ 0x5d5d005d5d5d005dULL, 0xd9d900d9d9d900d9ULL, 0x5a5a005a5a5a005aULL,
+ 0x5151005151510051ULL, 0x6c6c006c6c6c006cULL, 0x8b8b008b8b8b008bULL,
+ 0x9a9a009a9a9a009aULL, 0xfbfb00fbfbfb00fbULL, 0xb0b000b0b0b000b0ULL,
+ 0x7474007474740074ULL, 0x2b2b002b2b2b002bULL, 0xf0f000f0f0f000f0ULL,
+ 0x8484008484840084ULL, 0xdfdf00dfdfdf00dfULL, 0xcbcb00cbcbcb00cbULL,
+ 0x3434003434340034ULL, 0x7676007676760076ULL, 0x6d6d006d6d6d006dULL,
+ 0xa9a900a9a9a900a9ULL, 0xd1d100d1d1d100d1ULL, 0x0404000404040004ULL,
+ 0x1414001414140014ULL, 0x3a3a003a3a3a003aULL, 0xdede00dedede00deULL,
+ 0x1111001111110011ULL, 0x3232003232320032ULL, 0x9c9c009c9c9c009cULL,
+ 0x5353005353530053ULL, 0xf2f200f2f2f200f2ULL, 0xfefe00fefefe00feULL,
+ 0xcfcf00cfcfcf00cfULL, 0xc3c300c3c3c300c3ULL, 0x7a7a007a7a7a007aULL,
+ 0x2424002424240024ULL, 0xe8e800e8e8e800e8ULL, 0x6060006060600060ULL,
+ 0x6969006969690069ULL, 0xaaaa00aaaaaa00aaULL, 0xa0a000a0a0a000a0ULL,
+ 0xa1a100a1a1a100a1ULL, 0x6262006262620062ULL, 0x5454005454540054ULL,
+ 0x1e1e001e1e1e001eULL, 0xe0e000e0e0e000e0ULL, 0x6464006464640064ULL,
+ 0x1010001010100010ULL, 0x0000000000000000ULL, 0xa3a300a3a3a300a3ULL,
+ 0x7575007575750075ULL, 0x8a8a008a8a8a008aULL, 0xe6e600e6e6e600e6ULL,
+ 0x0909000909090009ULL, 0xdddd00dddddd00ddULL, 0x8787008787870087ULL,
+ 0x8383008383830083ULL, 0xcdcd00cdcdcd00cdULL, 0x9090009090900090ULL,
+ 0x7373007373730073ULL, 0xf6f600f6f6f600f6ULL, 0x9d9d009d9d9d009dULL,
+ 0xbfbf00bfbfbf00bfULL, 0x5252005252520052ULL, 0xd8d800d8d8d800d8ULL,
+ 0xc8c800c8c8c800c8ULL, 0xc6c600c6c6c600c6ULL, 0x8181008181810081ULL,
+ 0x6f6f006f6f6f006fULL, 0x1313001313130013ULL, 0x6363006363630063ULL,
+ 0xe9e900e9e9e900e9ULL, 0xa7a700a7a7a700a7ULL, 0x9f9f009f9f9f009fULL,
+ 0xbcbc00bcbcbc00bcULL, 0x2929002929290029ULL, 0xf9f900f9f9f900f9ULL,
+ 0x2f2f002f2f2f002fULL, 0xb4b400b4b4b400b4ULL, 0x7878007878780078ULL,
+ 0x0606000606060006ULL, 0xe7e700e7e7e700e7ULL, 0x7171007171710071ULL,
+ 0xd4d400d4d4d400d4ULL, 0xabab00ababab00abULL, 0x8888008888880088ULL,
+ 0x8d8d008d8d8d008dULL, 0x7272007272720072ULL, 0xb9b900b9b9b900b9ULL,
+ 0xf8f800f8f8f800f8ULL, 0xacac00acacac00acULL, 0x3636003636360036ULL,
+ 0x2a2a002a2a2a002aULL, 0x3c3c003c3c3c003cULL, 0xf1f100f1f1f100f1ULL,
+ 0x4040004040400040ULL, 0xd3d300d3d3d300d3ULL, 0xbbbb00bbbbbb00bbULL,
+ 0x4343004343430043ULL, 0x1515001515150015ULL, 0xadad00adadad00adULL,
+ 0x7777007777770077ULL, 0x8080008080800080ULL, 0x8282008282820082ULL,
+ 0xecec00ececec00ecULL, 0x2727002727270027ULL, 0xe5e500e5e5e500e5ULL,
+ 0x8585008585850085ULL, 0x3535003535350035ULL, 0x0c0c000c0c0c000cULL,
+ 0x4141004141410041ULL, 0xefef00efefef00efULL, 0x9393009393930093ULL,
+ 0x1919001919190019ULL, 0x2121002121210021ULL, 0x0e0e000e0e0e000eULL,
+ 0x4e4e004e4e4e004eULL, 0x6565006565650065ULL, 0xbdbd00bdbdbd00bdULL,
+ 0xb8b800b8b8b800b8ULL, 0x8f8f008f8f8f008fULL, 0xebeb00ebebeb00ebULL,
+ 0xcece00cecece00ceULL, 0x3030003030300030ULL, 0x5f5f005f5f5f005fULL,
+ 0xc5c500c5c5c500c5ULL, 0x1a1a001a1a1a001aULL, 0xe1e100e1e1e100e1ULL,
+ 0xcaca00cacaca00caULL, 0x4747004747470047ULL, 0x3d3d003d3d3d003dULL,
+ 0x0101000101010001ULL, 0xd6d600d6d6d600d6ULL, 0x5656005656560056ULL,
+ 0x4d4d004d4d4d004dULL, 0x0d0d000d0d0d000dULL, 0x6666006666660066ULL,
+ 0xcccc00cccccc00ccULL, 0x2d2d002d2d2d002dULL, 0x1212001212120012ULL,
+ 0x2020002020200020ULL, 0xb1b100b1b1b100b1ULL, 0x9999009999990099ULL,
+ 0x4c4c004c4c4c004cULL, 0xc2c200c2c2c200c2ULL, 0x7e7e007e7e7e007eULL,
+ 0x0505000505050005ULL, 0xb7b700b7b7b700b7ULL, 0x3131003131310031ULL,
+ 0x1717001717170017ULL, 0xd7d700d7d7d700d7ULL, 0x5858005858580058ULL,
+ 0x6161006161610061ULL, 0x1b1b001b1b1b001bULL, 0x1c1c001c1c1c001cULL,
+ 0x0f0f000f0f0f000fULL, 0x1616001616160016ULL, 0x1818001818180018ULL,
+ 0x2222002222220022ULL, 0x4444004444440044ULL, 0xb2b200b2b2b200b2ULL,
+ 0xb5b500b5b5b500b5ULL, 0x9191009191910091ULL, 0x0808000808080008ULL,
+ 0xa8a800a8a8a800a8ULL, 0xfcfc00fcfcfc00fcULL, 0x5050005050500050ULL,
+ 0xd0d000d0d0d000d0ULL, 0x7d7d007d7d7d007dULL, 0x8989008989890089ULL,
+ 0x9797009797970097ULL, 0x5b5b005b5b5b005bULL, 0x9595009595950095ULL,
+ 0xffff00ffffff00ffULL, 0xd2d200d2d2d200d2ULL, 0xc4c400c4c4c400c4ULL,
+ 0x4848004848480048ULL, 0xf7f700f7f7f700f7ULL, 0xdbdb00dbdbdb00dbULL,
+ 0x0303000303030003ULL, 0xdada00dadada00daULL, 0x3f3f003f3f3f003fULL,
+ 0x9494009494940094ULL, 0x5c5c005c5c5c005cULL, 0x0202000202020002ULL,
+ 0x4a4a004a4a4a004aULL, 0x3333003333330033ULL, 0x6767006767670067ULL,
+ 0xf3f300f3f3f300f3ULL, 0x7f7f007f7f7f007fULL, 0xe2e200e2e2e200e2ULL,
+ 0x9b9b009b9b9b009bULL, 0x2626002626260026ULL, 0x3737003737370037ULL,
+ 0x3b3b003b3b3b003bULL, 0x9696009696960096ULL, 0x4b4b004b4b4b004bULL,
+ 0xbebe00bebebe00beULL, 0x2e2e002e2e2e002eULL, 0x7979007979790079ULL,
+ 0x8c8c008c8c8c008cULL, 0x6e6e006e6e6e006eULL, 0x8e8e008e8e8e008eULL,
+ 0xf5f500f5f5f500f5ULL, 0xb6b600b6b6b600b6ULL, 0xfdfd00fdfdfd00fdULL,
+ 0x5959005959590059ULL, 0x9898009898980098ULL, 0x6a6a006a6a6a006aULL,
+ 0x4646004646460046ULL, 0xbaba00bababa00baULL, 0x2525002525250025ULL,
+ 0x4242004242420042ULL, 0xa2a200a2a2a200a2ULL, 0xfafa00fafafa00faULL,
+ 0x0707000707070007ULL, 0x5555005555550055ULL, 0xeeee00eeeeee00eeULL,
+ 0x0a0a000a0a0a000aULL, 0x4949004949490049ULL, 0x6868006868680068ULL,
+ 0x3838003838380038ULL, 0xa4a400a4a4a400a4ULL, 0x2828002828280028ULL,
+ 0x7b7b007b7b7b007bULL, 0xc9c900c9c9c900c9ULL, 0xc1c100c1c1c100c1ULL,
+ 0xe3e300e3e3e300e3ULL, 0xf4f400f4f4f400f4ULL, 0xc7c700c7c7c700c7ULL,
+ 0x9e9e009e9e9e009eULL,
+};
+
+__visible const u64 camellia_sp11101110[256] = {
+ 0x7070700070707000ULL, 0x8282820082828200ULL, 0x2c2c2c002c2c2c00ULL,
+ 0xececec00ececec00ULL, 0xb3b3b300b3b3b300ULL, 0x2727270027272700ULL,
+ 0xc0c0c000c0c0c000ULL, 0xe5e5e500e5e5e500ULL, 0xe4e4e400e4e4e400ULL,
+ 0x8585850085858500ULL, 0x5757570057575700ULL, 0x3535350035353500ULL,
+ 0xeaeaea00eaeaea00ULL, 0x0c0c0c000c0c0c00ULL, 0xaeaeae00aeaeae00ULL,
+ 0x4141410041414100ULL, 0x2323230023232300ULL, 0xefefef00efefef00ULL,
+ 0x6b6b6b006b6b6b00ULL, 0x9393930093939300ULL, 0x4545450045454500ULL,
+ 0x1919190019191900ULL, 0xa5a5a500a5a5a500ULL, 0x2121210021212100ULL,
+ 0xededed00ededed00ULL, 0x0e0e0e000e0e0e00ULL, 0x4f4f4f004f4f4f00ULL,
+ 0x4e4e4e004e4e4e00ULL, 0x1d1d1d001d1d1d00ULL, 0x6565650065656500ULL,
+ 0x9292920092929200ULL, 0xbdbdbd00bdbdbd00ULL, 0x8686860086868600ULL,
+ 0xb8b8b800b8b8b800ULL, 0xafafaf00afafaf00ULL, 0x8f8f8f008f8f8f00ULL,
+ 0x7c7c7c007c7c7c00ULL, 0xebebeb00ebebeb00ULL, 0x1f1f1f001f1f1f00ULL,
+ 0xcecece00cecece00ULL, 0x3e3e3e003e3e3e00ULL, 0x3030300030303000ULL,
+ 0xdcdcdc00dcdcdc00ULL, 0x5f5f5f005f5f5f00ULL, 0x5e5e5e005e5e5e00ULL,
+ 0xc5c5c500c5c5c500ULL, 0x0b0b0b000b0b0b00ULL, 0x1a1a1a001a1a1a00ULL,
+ 0xa6a6a600a6a6a600ULL, 0xe1e1e100e1e1e100ULL, 0x3939390039393900ULL,
+ 0xcacaca00cacaca00ULL, 0xd5d5d500d5d5d500ULL, 0x4747470047474700ULL,
+ 0x5d5d5d005d5d5d00ULL, 0x3d3d3d003d3d3d00ULL, 0xd9d9d900d9d9d900ULL,
+ 0x0101010001010100ULL, 0x5a5a5a005a5a5a00ULL, 0xd6d6d600d6d6d600ULL,
+ 0x5151510051515100ULL, 0x5656560056565600ULL, 0x6c6c6c006c6c6c00ULL,
+ 0x4d4d4d004d4d4d00ULL, 0x8b8b8b008b8b8b00ULL, 0x0d0d0d000d0d0d00ULL,
+ 0x9a9a9a009a9a9a00ULL, 0x6666660066666600ULL, 0xfbfbfb00fbfbfb00ULL,
+ 0xcccccc00cccccc00ULL, 0xb0b0b000b0b0b000ULL, 0x2d2d2d002d2d2d00ULL,
+ 0x7474740074747400ULL, 0x1212120012121200ULL, 0x2b2b2b002b2b2b00ULL,
+ 0x2020200020202000ULL, 0xf0f0f000f0f0f000ULL, 0xb1b1b100b1b1b100ULL,
+ 0x8484840084848400ULL, 0x9999990099999900ULL, 0xdfdfdf00dfdfdf00ULL,
+ 0x4c4c4c004c4c4c00ULL, 0xcbcbcb00cbcbcb00ULL, 0xc2c2c200c2c2c200ULL,
+ 0x3434340034343400ULL, 0x7e7e7e007e7e7e00ULL, 0x7676760076767600ULL,
+ 0x0505050005050500ULL, 0x6d6d6d006d6d6d00ULL, 0xb7b7b700b7b7b700ULL,
+ 0xa9a9a900a9a9a900ULL, 0x3131310031313100ULL, 0xd1d1d100d1d1d100ULL,
+ 0x1717170017171700ULL, 0x0404040004040400ULL, 0xd7d7d700d7d7d700ULL,
+ 0x1414140014141400ULL, 0x5858580058585800ULL, 0x3a3a3a003a3a3a00ULL,
+ 0x6161610061616100ULL, 0xdedede00dedede00ULL, 0x1b1b1b001b1b1b00ULL,
+ 0x1111110011111100ULL, 0x1c1c1c001c1c1c00ULL, 0x3232320032323200ULL,
+ 0x0f0f0f000f0f0f00ULL, 0x9c9c9c009c9c9c00ULL, 0x1616160016161600ULL,
+ 0x5353530053535300ULL, 0x1818180018181800ULL, 0xf2f2f200f2f2f200ULL,
+ 0x2222220022222200ULL, 0xfefefe00fefefe00ULL, 0x4444440044444400ULL,
+ 0xcfcfcf00cfcfcf00ULL, 0xb2b2b200b2b2b200ULL, 0xc3c3c300c3c3c300ULL,
+ 0xb5b5b500b5b5b500ULL, 0x7a7a7a007a7a7a00ULL, 0x9191910091919100ULL,
+ 0x2424240024242400ULL, 0x0808080008080800ULL, 0xe8e8e800e8e8e800ULL,
+ 0xa8a8a800a8a8a800ULL, 0x6060600060606000ULL, 0xfcfcfc00fcfcfc00ULL,
+ 0x6969690069696900ULL, 0x5050500050505000ULL, 0xaaaaaa00aaaaaa00ULL,
+ 0xd0d0d000d0d0d000ULL, 0xa0a0a000a0a0a000ULL, 0x7d7d7d007d7d7d00ULL,
+ 0xa1a1a100a1a1a100ULL, 0x8989890089898900ULL, 0x6262620062626200ULL,
+ 0x9797970097979700ULL, 0x5454540054545400ULL, 0x5b5b5b005b5b5b00ULL,
+ 0x1e1e1e001e1e1e00ULL, 0x9595950095959500ULL, 0xe0e0e000e0e0e000ULL,
+ 0xffffff00ffffff00ULL, 0x6464640064646400ULL, 0xd2d2d200d2d2d200ULL,
+ 0x1010100010101000ULL, 0xc4c4c400c4c4c400ULL, 0x0000000000000000ULL,
+ 0x4848480048484800ULL, 0xa3a3a300a3a3a300ULL, 0xf7f7f700f7f7f700ULL,
+ 0x7575750075757500ULL, 0xdbdbdb00dbdbdb00ULL, 0x8a8a8a008a8a8a00ULL,
+ 0x0303030003030300ULL, 0xe6e6e600e6e6e600ULL, 0xdadada00dadada00ULL,
+ 0x0909090009090900ULL, 0x3f3f3f003f3f3f00ULL, 0xdddddd00dddddd00ULL,
+ 0x9494940094949400ULL, 0x8787870087878700ULL, 0x5c5c5c005c5c5c00ULL,
+ 0x8383830083838300ULL, 0x0202020002020200ULL, 0xcdcdcd00cdcdcd00ULL,
+ 0x4a4a4a004a4a4a00ULL, 0x9090900090909000ULL, 0x3333330033333300ULL,
+ 0x7373730073737300ULL, 0x6767670067676700ULL, 0xf6f6f600f6f6f600ULL,
+ 0xf3f3f300f3f3f300ULL, 0x9d9d9d009d9d9d00ULL, 0x7f7f7f007f7f7f00ULL,
+ 0xbfbfbf00bfbfbf00ULL, 0xe2e2e200e2e2e200ULL, 0x5252520052525200ULL,
+ 0x9b9b9b009b9b9b00ULL, 0xd8d8d800d8d8d800ULL, 0x2626260026262600ULL,
+ 0xc8c8c800c8c8c800ULL, 0x3737370037373700ULL, 0xc6c6c600c6c6c600ULL,
+ 0x3b3b3b003b3b3b00ULL, 0x8181810081818100ULL, 0x9696960096969600ULL,
+ 0x6f6f6f006f6f6f00ULL, 0x4b4b4b004b4b4b00ULL, 0x1313130013131300ULL,
+ 0xbebebe00bebebe00ULL, 0x6363630063636300ULL, 0x2e2e2e002e2e2e00ULL,
+ 0xe9e9e900e9e9e900ULL, 0x7979790079797900ULL, 0xa7a7a700a7a7a700ULL,
+ 0x8c8c8c008c8c8c00ULL, 0x9f9f9f009f9f9f00ULL, 0x6e6e6e006e6e6e00ULL,
+ 0xbcbcbc00bcbcbc00ULL, 0x8e8e8e008e8e8e00ULL, 0x2929290029292900ULL,
+ 0xf5f5f500f5f5f500ULL, 0xf9f9f900f9f9f900ULL, 0xb6b6b600b6b6b600ULL,
+ 0x2f2f2f002f2f2f00ULL, 0xfdfdfd00fdfdfd00ULL, 0xb4b4b400b4b4b400ULL,
+ 0x5959590059595900ULL, 0x7878780078787800ULL, 0x9898980098989800ULL,
+ 0x0606060006060600ULL, 0x6a6a6a006a6a6a00ULL, 0xe7e7e700e7e7e700ULL,
+ 0x4646460046464600ULL, 0x7171710071717100ULL, 0xbababa00bababa00ULL,
+ 0xd4d4d400d4d4d400ULL, 0x2525250025252500ULL, 0xababab00ababab00ULL,
+ 0x4242420042424200ULL, 0x8888880088888800ULL, 0xa2a2a200a2a2a200ULL,
+ 0x8d8d8d008d8d8d00ULL, 0xfafafa00fafafa00ULL, 0x7272720072727200ULL,
+ 0x0707070007070700ULL, 0xb9b9b900b9b9b900ULL, 0x5555550055555500ULL,
+ 0xf8f8f800f8f8f800ULL, 0xeeeeee00eeeeee00ULL, 0xacacac00acacac00ULL,
+ 0x0a0a0a000a0a0a00ULL, 0x3636360036363600ULL, 0x4949490049494900ULL,
+ 0x2a2a2a002a2a2a00ULL, 0x6868680068686800ULL, 0x3c3c3c003c3c3c00ULL,
+ 0x3838380038383800ULL, 0xf1f1f100f1f1f100ULL, 0xa4a4a400a4a4a400ULL,
+ 0x4040400040404000ULL, 0x2828280028282800ULL, 0xd3d3d300d3d3d300ULL,
+ 0x7b7b7b007b7b7b00ULL, 0xbbbbbb00bbbbbb00ULL, 0xc9c9c900c9c9c900ULL,
+ 0x4343430043434300ULL, 0xc1c1c100c1c1c100ULL, 0x1515150015151500ULL,
+ 0xe3e3e300e3e3e300ULL, 0xadadad00adadad00ULL, 0xf4f4f400f4f4f400ULL,
+ 0x7777770077777700ULL, 0xc7c7c700c7c7c700ULL, 0x8080800080808000ULL,
+ 0x9e9e9e009e9e9e00ULL,
+};
+
+/* key constants */
+#define CAMELLIA_SIGMA1L (0xA09E667FL)
+#define CAMELLIA_SIGMA1R (0x3BCC908BL)
+#define CAMELLIA_SIGMA2L (0xB67AE858L)
+#define CAMELLIA_SIGMA2R (0x4CAA73B2L)
+#define CAMELLIA_SIGMA3L (0xC6EF372FL)
+#define CAMELLIA_SIGMA3R (0xE94F82BEL)
+#define CAMELLIA_SIGMA4L (0x54FF53A5L)
+#define CAMELLIA_SIGMA4R (0xF1D36F1CL)
+#define CAMELLIA_SIGMA5L (0x10E527FAL)
+#define CAMELLIA_SIGMA5R (0xDE682D1DL)
+#define CAMELLIA_SIGMA6L (0xB05688C2L)
+#define CAMELLIA_SIGMA6R (0xB3E6C1FDL)
+
+/* macros */
+#define ROLDQ(l, r, bits) ({ \
+ u64 t = l; \
+ l = (l << bits) | (r >> (64 - bits)); \
+ r = (r << bits) | (t >> (64 - bits)); \
+})
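
ROLDQ rotates the 128-bit quantity (l || r) left by `bits` using only 64-bit
operations. A minimal host-side sketch checking that identity (not kernel
code; it assumes unsigned __int128 support and 1 <= bits <= 63, which covers
every use below):

    #include <assert.h>
    #include <stdint.h>

    typedef unsigned __int128 u128;

    int main(void)
    {
        uint64_t l = 0x0123456789abcdefULL, r = 0xfedcba9876543210ULL;
        const int bits = 15;

        /* reference: rotate the concatenated value (l || r) as one word */
        u128 v = ((u128)l << 64) | r;
        u128 rot = (v << bits) | (v >> (128 - bits));

        /* ROLDQ, expressed on the two 64-bit halves */
        uint64_t t = l;
        l = (l << bits) | (r >> (64 - bits));
        r = (r << bits) | (t >> (64 - bits));

        assert(l == (uint64_t)(rot >> 64) && r == (uint64_t)rot);
        return 0;
    }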
+
+#define CAMELLIA_F(x, kl, kr, y) ({ \
+ u64 ii = x ^ (((u64)kl << 32) | kr); \
+ y = camellia_sp11101110[(uint8_t)ii]; \
+ y ^= camellia_sp44044404[(uint8_t)(ii >> 8)]; \
+ ii >>= 16; \
+ y ^= camellia_sp30333033[(uint8_t)ii]; \
+ y ^= camellia_sp02220222[(uint8_t)(ii >> 8)]; \
+ ii >>= 16; \
+ y ^= camellia_sp00444404[(uint8_t)ii]; \
+ y ^= camellia_sp03303033[(uint8_t)(ii >> 8)]; \
+ ii >>= 16; \
+ y ^= camellia_sp22000222[(uint8_t)ii]; \
+ y ^= camellia_sp10011110[(uint8_t)(ii >> 8)]; \
+ y = ror64(y, 32); \
+})
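
CAMELLIA_F computes the Camellia F-function with the s-box layer and the
P-layer byte permutation folded into the eight camellia_sp* lookup tables:
the digits in a table's name say which output byte lanes (most significant
first) receive that s-box value, a 0 meaning the lane stays zero, so each
entry is one s-box byte replicated by a constant multiplier (for example,
camellia_sp11101110[i] fills lanes 7, 6, 5, 3, 2 and 1). A hypothetical
generator for that table; SBOX1 stands in for Camellia's plain s1 table,
which is an assumption and not part of this file:

    /* sketch: building one combined s-box/P-layer table.
     * SBOX1 is assumed to be Camellia's plain s1 table (not shown). */
    static u64 sp11101110_gen[256];

    static void build_sp11101110(const u8 SBOX1[256])
    {
        int i;

        /* 0x0101010001010100 copies one byte into the lanes named
         * by the pattern 1110 1110 (MSB first) */
        for (i = 0; i < 256; i++)
            sp11101110_gen[i] = (u64)SBOX1[i] * 0x0101010001010100ULL;
    }

The final ror64(y, 32) swaps the two 32-bit halves of the result, converting
between the two word orders this file uses (SET_SUBKEY_LR below performs the
same conversion when storing subkeys).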
+
+#define SET_SUBKEY_LR(INDEX, sRL) (subkey[(INDEX)] = ror64((sRL), 32))
+
+static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max)
+{
+ u64 kw4, tt;
+ u32 dw, tl, tr;
+
+ /* absorb kw2 to other subkeys */
+ /* round 2 */
+ subRL[3] ^= subRL[1];
+ /* round 4 */
+ subRL[5] ^= subRL[1];
+ /* round 6 */
+ subRL[7] ^= subRL[1];
+
+ subRL[1] ^= (subRL[1] & ~subRL[9]) << 32;
+ /* modified for FLinv(kl2) */
+ dw = (subRL[1] & subRL[9]) >> 32;
+ subRL[1] ^= rol32(dw, 1);
+
+ /* round 8 */
+ subRL[11] ^= subRL[1];
+ /* round 10 */
+ subRL[13] ^= subRL[1];
+ /* round 12 */
+ subRL[15] ^= subRL[1];
+
+ subRL[1] ^= (subRL[1] & ~subRL[17]) << 32;
+ /* modified for FLinv(kl4) */
+ dw = (subRL[1] & subRL[17]) >> 32;
+ subRL[1] ^= rol32(dw, 1);
+
+ /* round 14 */
+ subRL[19] ^= subRL[1];
+ /* round 16 */
+ subRL[21] ^= subRL[1];
+ /* round 18 */
+ subRL[23] ^= subRL[1];
+
+ if (max == 24) {
+ /* kw3 */
+ subRL[24] ^= subRL[1];
+
+ /* absorb kw4 to other subkeys */
+ kw4 = subRL[25];
+ } else {
+ subRL[1] ^= (subRL[1] & ~subRL[25]) << 32;
+ /* modified for FLinv(kl6) */
+ dw = (subRL[1] & subRL[25]) >> 32;
+ subRL[1] ^= rol32(dw, 1);
+
+ /* round 20 */
+ subRL[27] ^= subRL[1];
+ /* round 22 */
+ subRL[29] ^= subRL[1];
+ /* round 24 */
+ subRL[31] ^= subRL[1];
+ /* kw3 */
+ subRL[32] ^= subRL[1];
+
+ /* absorb kw4 to other subkeys */
+ kw4 = subRL[33];
+ /* round 23 */
+ subRL[30] ^= kw4;
+ /* round 21 */
+ subRL[28] ^= kw4;
+ /* round 19 */
+ subRL[26] ^= kw4;
+
+ kw4 ^= (kw4 & ~subRL[24]) << 32;
+ /* modified for FL(kl5) */
+ dw = (kw4 & subRL[24]) >> 32;
+ kw4 ^= rol32(dw, 1);
+ }
+
+ /* round 17 */
+ subRL[22] ^= kw4;
+ /* round 15 */
+ subRL[20] ^= kw4;
+ /* round 13 */
+ subRL[18] ^= kw4;
+
+ kw4 ^= (kw4 & ~subRL[16]) << 32;
+ /* modified for FL(kl3) */
+ dw = (kw4 & subRL[16]) >> 32;
+ kw4 ^= rol32(dw, 1);
+
+ /* round 11 */
+ subRL[14] ^= kw4;
+ /* round 9 */
+ subRL[12] ^= kw4;
+ /* round 7 */
+ subRL[10] ^= kw4;
+
+ kw4 ^= (kw4 & ~subRL[8]) << 32;
+ /* modified for FL(kl1) */
+ dw = (kw4 & subRL[8]) >> 32;
+ kw4 ^= rol32(dw, 1);
+
+ /* round 5 */
+ subRL[6] ^= kw4;
+ /* round 3 */
+ subRL[4] ^= kw4;
+ /* round 1 */
+ subRL[2] ^= kw4;
+ /* kw1 */
+ subRL[0] ^= kw4;
+
+	/* the key XOR is done at the end of the F-function */
+ SET_SUBKEY_LR(0, subRL[0] ^ subRL[2]); /* kw1 */
+ SET_SUBKEY_LR(2, subRL[3]); /* round 1 */
+ SET_SUBKEY_LR(3, subRL[2] ^ subRL[4]); /* round 2 */
+ SET_SUBKEY_LR(4, subRL[3] ^ subRL[5]); /* round 3 */
+ SET_SUBKEY_LR(5, subRL[4] ^ subRL[6]); /* round 4 */
+ SET_SUBKEY_LR(6, subRL[5] ^ subRL[7]); /* round 5 */
+
+ tl = (subRL[10] >> 32) ^ (subRL[10] & ~subRL[8]);
+ dw = tl & (subRL[8] >> 32); /* FL(kl1) */
+ tr = subRL[10] ^ rol32(dw, 1);
+ tt = (tr | ((u64)tl << 32));
+
+ SET_SUBKEY_LR(7, subRL[6] ^ tt); /* round 6 */
+ SET_SUBKEY_LR(8, subRL[8]); /* FL(kl1) */
+ SET_SUBKEY_LR(9, subRL[9]); /* FLinv(kl2) */
+
+ tl = (subRL[7] >> 32) ^ (subRL[7] & ~subRL[9]);
+ dw = tl & (subRL[9] >> 32); /* FLinv(kl2) */
+ tr = subRL[7] ^ rol32(dw, 1);
+ tt = (tr | ((u64)tl << 32));
+
+ SET_SUBKEY_LR(10, subRL[11] ^ tt); /* round 7 */
+ SET_SUBKEY_LR(11, subRL[10] ^ subRL[12]); /* round 8 */
+ SET_SUBKEY_LR(12, subRL[11] ^ subRL[13]); /* round 9 */
+ SET_SUBKEY_LR(13, subRL[12] ^ subRL[14]); /* round 10 */
+ SET_SUBKEY_LR(14, subRL[13] ^ subRL[15]); /* round 11 */
+
+ tl = (subRL[18] >> 32) ^ (subRL[18] & ~subRL[16]);
+ dw = tl & (subRL[16] >> 32); /* FL(kl3) */
+ tr = subRL[18] ^ rol32(dw, 1);
+ tt = (tr | ((u64)tl << 32));
+
+ SET_SUBKEY_LR(15, subRL[14] ^ tt); /* round 12 */
+ SET_SUBKEY_LR(16, subRL[16]); /* FL(kl3) */
+ SET_SUBKEY_LR(17, subRL[17]); /* FLinv(kl4) */
+
+ tl = (subRL[15] >> 32) ^ (subRL[15] & ~subRL[17]);
+ dw = tl & (subRL[17] >> 32); /* FLinv(kl4) */
+ tr = subRL[15] ^ rol32(dw, 1);
+ tt = (tr | ((u64)tl << 32));
+
+ SET_SUBKEY_LR(18, subRL[19] ^ tt); /* round 13 */
+ SET_SUBKEY_LR(19, subRL[18] ^ subRL[20]); /* round 14 */
+ SET_SUBKEY_LR(20, subRL[19] ^ subRL[21]); /* round 15 */
+ SET_SUBKEY_LR(21, subRL[20] ^ subRL[22]); /* round 16 */
+ SET_SUBKEY_LR(22, subRL[21] ^ subRL[23]); /* round 17 */
+
+ if (max == 24) {
+ SET_SUBKEY_LR(23, subRL[22]); /* round 18 */
+ SET_SUBKEY_LR(24, subRL[24] ^ subRL[23]); /* kw3 */
+ } else {
+ tl = (subRL[26] >> 32) ^ (subRL[26] & ~subRL[24]);
+ dw = tl & (subRL[24] >> 32); /* FL(kl5) */
+ tr = subRL[26] ^ rol32(dw, 1);
+ tt = (tr | ((u64)tl << 32));
+
+ SET_SUBKEY_LR(23, subRL[22] ^ tt); /* round 18 */
+ SET_SUBKEY_LR(24, subRL[24]); /* FL(kl5) */
+ SET_SUBKEY_LR(25, subRL[25]); /* FLinv(kl6) */
+
+ tl = (subRL[23] >> 32) ^ (subRL[23] & ~subRL[25]);
+ dw = tl & (subRL[25] >> 32); /* FLinv(kl6) */
+ tr = subRL[23] ^ rol32(dw, 1);
+ tt = (tr | ((u64)tl << 32));
+
+ SET_SUBKEY_LR(26, subRL[27] ^ tt); /* round 19 */
+ SET_SUBKEY_LR(27, subRL[26] ^ subRL[28]); /* round 20 */
+ SET_SUBKEY_LR(28, subRL[27] ^ subRL[29]); /* round 21 */
+ SET_SUBKEY_LR(29, subRL[28] ^ subRL[30]); /* round 22 */
+ SET_SUBKEY_LR(30, subRL[29] ^ subRL[31]); /* round 23 */
+ SET_SUBKEY_LR(31, subRL[30]); /* round 24 */
+ SET_SUBKEY_LR(32, subRL[32] ^ subRL[31]); /* kw3 */
+ }
+}
+
+static void camellia_setup128(const unsigned char *key, u64 *subkey)
+{
+ u64 kl, kr, ww;
+ u64 subRL[26];
+
+	/* k = (kl || kr), where || denotes concatenation */
+ kl = get_unaligned_be64(key);
+ kr = get_unaligned_be64(key + 8);
+
+ /* generate KL dependent subkeys */
+ /* kw1 */
+ subRL[0] = kl;
+ /* kw2 */
+ subRL[1] = kr;
+
+	/* rotate left by 15 bits */
+ ROLDQ(kl, kr, 15);
+
+ /* k3 */
+ subRL[4] = kl;
+ /* k4 */
+ subRL[5] = kr;
+
+	/* rotate left by 30 more bits (15+30 = 45 total) */
+ ROLDQ(kl, kr, 30);
+
+ /* k7 */
+ subRL[10] = kl;
+ /* k8 */
+ subRL[11] = kr;
+
+	/* rotate left by 15 more bits (15+30+15 = 60 total) */
+ ROLDQ(kl, kr, 15);
+
+ /* k10 */
+ subRL[13] = kr;
+	/* rotate left by 17 more bits (15+30+15+17 = 77 total) */
+ ROLDQ(kl, kr, 17);
+
+ /* kl3 */
+ subRL[16] = kl;
+ /* kl4 */
+ subRL[17] = kr;
+
+	/* rotate left by 17 more bits (15+30+15+17+17 = 94 total) */
+ ROLDQ(kl, kr, 17);
+
+ /* k13 */
+ subRL[18] = kl;
+ /* k14 */
+ subRL[19] = kr;
+
+	/* rotate left by 17 more bits (15+30+15+17+17+17 = 111 total) */
+ ROLDQ(kl, kr, 17);
+
+ /* k17 */
+ subRL[22] = kl;
+ /* k18 */
+ subRL[23] = kr;
+
+ /* generate KA */
+ kl = subRL[0];
+ kr = subRL[1];
+ CAMELLIA_F(kl, CAMELLIA_SIGMA1L, CAMELLIA_SIGMA1R, ww);
+ kr ^= ww;
+ CAMELLIA_F(kr, CAMELLIA_SIGMA2L, CAMELLIA_SIGMA2R, kl);
+
+ /* current status == (kll, klr, w0, w1) */
+ CAMELLIA_F(kl, CAMELLIA_SIGMA3L, CAMELLIA_SIGMA3R, kr);
+ kr ^= ww;
+ CAMELLIA_F(kr, CAMELLIA_SIGMA4L, CAMELLIA_SIGMA4R, ww);
+ kl ^= ww;
+
+ /* generate KA dependent subkeys */
+ /* k1, k2 */
+ subRL[2] = kl;
+ subRL[3] = kr;
+ ROLDQ(kl, kr, 15);
+ /* k5,k6 */
+ subRL[6] = kl;
+ subRL[7] = kr;
+ ROLDQ(kl, kr, 15);
+ /* kl1, kl2 */
+ subRL[8] = kl;
+ subRL[9] = kr;
+ ROLDQ(kl, kr, 15);
+ /* k9 */
+ subRL[12] = kl;
+ ROLDQ(kl, kr, 15);
+ /* k11, k12 */
+ subRL[14] = kl;
+ subRL[15] = kr;
+ ROLDQ(kl, kr, 34);
+ /* k15, k16 */
+ subRL[20] = kl;
+ subRL[21] = kr;
+ ROLDQ(kl, kr, 17);
+ /* kw3, kw4 */
+ subRL[24] = kl;
+ subRL[25] = kr;
+
+ camellia_setup_tail(subkey, subRL, 24);
+}
+
+static void camellia_setup256(const unsigned char *key, u64 *subkey)
+{
+ u64 kl, kr; /* left half of key */
+ u64 krl, krr; /* right half of key */
+	u64 ww; /* temporary variable */
+ u64 subRL[34];
+
+	/* key = (kl || kr || krl || krr), where || denotes concatenation */
+ kl = get_unaligned_be64(key);
+ kr = get_unaligned_be64(key + 8);
+ krl = get_unaligned_be64(key + 16);
+ krr = get_unaligned_be64(key + 24);
+
+ /* generate KL dependent subkeys */
+ /* kw1 */
+ subRL[0] = kl;
+ /* kw2 */
+ subRL[1] = kr;
+ ROLDQ(kl, kr, 45);
+ /* k9 */
+ subRL[12] = kl;
+ /* k10 */
+ subRL[13] = kr;
+ ROLDQ(kl, kr, 15);
+ /* kl3 */
+ subRL[16] = kl;
+ /* kl4 */
+ subRL[17] = kr;
+ ROLDQ(kl, kr, 17);
+ /* k17 */
+ subRL[22] = kl;
+ /* k18 */
+ subRL[23] = kr;
+ ROLDQ(kl, kr, 34);
+ /* k23 */
+ subRL[30] = kl;
+ /* k24 */
+ subRL[31] = kr;
+
+ /* generate KR dependent subkeys */
+ ROLDQ(krl, krr, 15);
+ /* k3 */
+ subRL[4] = krl;
+ /* k4 */
+ subRL[5] = krr;
+ ROLDQ(krl, krr, 15);
+ /* kl1 */
+ subRL[8] = krl;
+ /* kl2 */
+ subRL[9] = krr;
+ ROLDQ(krl, krr, 30);
+ /* k13 */
+ subRL[18] = krl;
+ /* k14 */
+ subRL[19] = krr;
+ ROLDQ(krl, krr, 34);
+ /* k19 */
+ subRL[26] = krl;
+ /* k20 */
+ subRL[27] = krr;
+ ROLDQ(krl, krr, 34);
+
+ /* generate KA */
+ kl = subRL[0] ^ krl;
+ kr = subRL[1] ^ krr;
+
+ CAMELLIA_F(kl, CAMELLIA_SIGMA1L, CAMELLIA_SIGMA1R, ww);
+ kr ^= ww;
+ CAMELLIA_F(kr, CAMELLIA_SIGMA2L, CAMELLIA_SIGMA2R, kl);
+ kl ^= krl;
+ CAMELLIA_F(kl, CAMELLIA_SIGMA3L, CAMELLIA_SIGMA3R, kr);
+ kr ^= ww ^ krr;
+ CAMELLIA_F(kr, CAMELLIA_SIGMA4L, CAMELLIA_SIGMA4R, ww);
+ kl ^= ww;
+
+ /* generate KB */
+ krl ^= kl;
+ krr ^= kr;
+ CAMELLIA_F(krl, CAMELLIA_SIGMA5L, CAMELLIA_SIGMA5R, ww);
+ krr ^= ww;
+ CAMELLIA_F(krr, CAMELLIA_SIGMA6L, CAMELLIA_SIGMA6R, ww);
+ krl ^= ww;
+
+ /* generate KA dependent subkeys */
+ ROLDQ(kl, kr, 15);
+ /* k5 */
+ subRL[6] = kl;
+ /* k6 */
+ subRL[7] = kr;
+ ROLDQ(kl, kr, 30);
+ /* k11 */
+ subRL[14] = kl;
+ /* k12 */
+ subRL[15] = kr;
+	/* rotate left by 32 bits */
+ ROLDQ(kl, kr, 32);
+ /* kl5 */
+ subRL[24] = kl;
+ /* kl6 */
+ subRL[25] = kr;
+	/* rotate left by 17 more bits: k11,k12 -> k21,k22 */
+ ROLDQ(kl, kr, 17);
+ /* k21 */
+ subRL[28] = kl;
+ /* k22 */
+ subRL[29] = kr;
+
+ /* generate KB dependent subkeys */
+ /* k1 */
+ subRL[2] = krl;
+ /* k2 */
+ subRL[3] = krr;
+ ROLDQ(krl, krr, 30);
+ /* k7 */
+ subRL[10] = krl;
+ /* k8 */
+ subRL[11] = krr;
+ ROLDQ(krl, krr, 30);
+ /* k15 */
+ subRL[20] = krl;
+ /* k16 */
+ subRL[21] = krr;
+ ROLDQ(krl, krr, 51);
+ /* kw3 */
+ subRL[32] = krl;
+ /* kw4 */
+ subRL[33] = krr;
+
+ camellia_setup_tail(subkey, subRL, 32);
+}
+
+static void camellia_setup192(const unsigned char *key, u64 *subkey)
+{
+ unsigned char kk[32];
+ u64 krl, krr;
+
+ memcpy(kk, key, 24);
+ memcpy((unsigned char *)&krl, key+16, 8);
+ krr = ~krl;
+ memcpy(kk+24, (unsigned char *)&krr, 8);
+ camellia_setup256(kk, subkey);
+}
+
+int __camellia_setkey(struct camellia_ctx *cctx, const unsigned char *key,
+ unsigned int key_len)
+{
+ if (key_len != 16 && key_len != 24 && key_len != 32)
+ return -EINVAL;
+
+ cctx->key_length = key_len;
+
+ switch (key_len) {
+ case 16:
+ camellia_setup128(key, cctx->key_table);
+ break;
+ case 24:
+ camellia_setup192(key, cctx->key_table);
+ break;
+ case 32:
+ camellia_setup256(key, cctx->key_table);
+ break;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__camellia_setkey);
+
+static int camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
+ unsigned int key_len)
+{
+ return __camellia_setkey(crypto_tfm_ctx(tfm), key, key_len);
+}
+
+static int camellia_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int key_len)
+{
+ return camellia_setkey(&tfm->base, key, key_len);
+}
+
+void camellia_decrypt_cbc_2way(const void *ctx, u8 *dst, const u8 *src)
+{
+ u8 buf[CAMELLIA_BLOCK_SIZE];
+ const u8 *iv = src;
+
+ if (dst == src)
+ iv = memcpy(buf, iv, sizeof(buf));
+ camellia_dec_blk_2way(ctx, dst, src);
+ crypto_xor(dst + CAMELLIA_BLOCK_SIZE, iv, CAMELLIA_BLOCK_SIZE);
+}
+EXPORT_SYMBOL_GPL(camellia_decrypt_cbc_2way);
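
The 2-way helper decrypts two adjacent CBC blocks per call, but only the
second block's chaining XOR (with the first ciphertext block) happens here;
the surrounding CBC walk code is expected to XOR the first output block with
the preceding ciphertext block or the request IV. The memcpy covers in-place
operation, where decryption would otherwise clobber src[0] while it is still
needed as the chaining value. Schematically, with D() the raw block decrypt:

    dst[0] = D(src[0]);           /* caller XORs in the previous block/IV */
    dst[1] = D(src[1]) ^ src[0];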
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+ ECB_WALK_START(req, CAMELLIA_BLOCK_SIZE, -1);
+ ECB_BLOCK(2, camellia_enc_blk_2way);
+ ECB_BLOCK(1, camellia_enc_blk);
+ ECB_WALK_END();
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+ ECB_WALK_START(req, CAMELLIA_BLOCK_SIZE, -1);
+ ECB_BLOCK(2, camellia_dec_blk_2way);
+ ECB_BLOCK(1, camellia_dec_blk);
+ ECB_WALK_END();
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+ CBC_WALK_START(req, CAMELLIA_BLOCK_SIZE, -1);
+ CBC_ENC_BLOCK(camellia_enc_blk);
+ CBC_WALK_END();
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ CBC_WALK_START(req, CAMELLIA_BLOCK_SIZE, -1);
+ CBC_DEC_BLOCK(2, camellia_decrypt_cbc_2way);
+ CBC_DEC_BLOCK(1, camellia_dec_blk);
+ CBC_WALK_END();
+}
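
The ECB_*/CBC_* macros come from ecb_cbc_helpers.h, which is not part of this
hunk, so the following is only a rough sketch of the walk loop an ECB helper
plausibly expands to, built on the standard skcipher_walk API; the -1 in the
third argument presumably means "no FPU save/restore needed", since these
routines use general-purpose registers only:

    static int ecb_encrypt_sketch(struct skcipher_request *req)
    {
        struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
        struct camellia_ctx *ctx = crypto_skcipher_ctx(tfm);
        struct skcipher_walk walk;
        unsigned int nbytes;
        int err;

        err = skcipher_walk_virt(&walk, req, false);
        while ((nbytes = walk.nbytes) != 0) {
            const u8 *src = walk.src.virt.addr;
            u8 *dst = walk.dst.virt.addr;

            /* widest stride first, then single blocks */
            while (nbytes >= 2 * CAMELLIA_BLOCK_SIZE) {
                camellia_enc_blk_2way(ctx, dst, src);
                src += 2 * CAMELLIA_BLOCK_SIZE;
                dst += 2 * CAMELLIA_BLOCK_SIZE;
                nbytes -= 2 * CAMELLIA_BLOCK_SIZE;
            }
            while (nbytes >= CAMELLIA_BLOCK_SIZE) {
                camellia_enc_blk(ctx, dst, src);
                src += CAMELLIA_BLOCK_SIZE;
                dst += CAMELLIA_BLOCK_SIZE;
                nbytes -= CAMELLIA_BLOCK_SIZE;
            }
            err = skcipher_walk_done(&walk, nbytes);
        }
        return err;
    }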
+
+static struct crypto_alg camellia_cipher_alg = {
+ .cra_name = "camellia",
+ .cra_driver_name = "camellia-asm",
+ .cra_priority = 200,
+ .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
+ .cra_blocksize = CAMELLIA_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct camellia_ctx),
+ .cra_module = THIS_MODULE,
+ .cra_u = {
+ .cipher = {
+ .cia_min_keysize = CAMELLIA_MIN_KEY_SIZE,
+ .cia_max_keysize = CAMELLIA_MAX_KEY_SIZE,
+ .cia_setkey = camellia_setkey,
+ .cia_encrypt = camellia_encrypt,
+ .cia_decrypt = camellia_decrypt
+ }
+ }
+};
+
+static struct skcipher_alg camellia_skcipher_algs[] = {
+ {
+ .base.cra_name = "ecb(camellia)",
+ .base.cra_driver_name = "ecb-camellia-asm",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = CAMELLIA_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct camellia_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = CAMELLIA_MIN_KEY_SIZE,
+ .max_keysize = CAMELLIA_MAX_KEY_SIZE,
+ .setkey = camellia_setkey_skcipher,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ }, {
+ .base.cra_name = "cbc(camellia)",
+ .base.cra_driver_name = "cbc-camellia-asm",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = CAMELLIA_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct camellia_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = CAMELLIA_MIN_KEY_SIZE,
+ .max_keysize = CAMELLIA_MAX_KEY_SIZE,
+ .ivsize = CAMELLIA_BLOCK_SIZE,
+ .setkey = camellia_setkey_skcipher,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ }
+};
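
For reference, a hedged sketch of how another kernel user would exercise the
"cbc(camellia)" instance registered above through the generic skcipher API
(the demo_* name is made up; error handling is abbreviated and len is assumed
to be a multiple of the block size):

    #include <crypto/skcipher.h>
    #include <linux/crypto.h>
    #include <linux/scatterlist.h>

    static int demo_cbc_camellia(const u8 *key, unsigned int keylen,
                                 u8 *buf, unsigned int len, u8 *iv)
    {
        struct crypto_skcipher *tfm;
        struct skcipher_request *req;
        struct scatterlist sg;
        DECLARE_CRYPTO_WAIT(wait);
        int err;

        tfm = crypto_alloc_skcipher("cbc(camellia)", 0, 0);
        if (IS_ERR(tfm))
            return PTR_ERR(tfm);

        err = crypto_skcipher_setkey(tfm, key, keylen);
        if (err)
            goto out_free_tfm;

        req = skcipher_request_alloc(tfm, GFP_KERNEL);
        if (!req) {
            err = -ENOMEM;
            goto out_free_tfm;
        }

        sg_init_one(&sg, buf, len);
        skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
                                      crypto_req_done, &wait);
        skcipher_request_set_crypt(req, &sg, &sg, len, iv);
        err = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);

        skcipher_request_free(req);
    out_free_tfm:
        crypto_free_skcipher(tfm);
        return err;
    }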
+
+static bool is_blacklisted_cpu(void)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return false;
+
+ if (boot_cpu_data.x86 == 0x0f) {
+ /*
+		 * On Pentium 4, camellia-asm is slower than the original
+		 * implementation because of the excessive use of 64-bit
+		 * rotates and left-shifts (which are very slow on P4)
+		 * needed to store and handle a 128-bit block in two
+		 * 64-bit registers.
+ */
+ return true;
+ }
+
+ return false;
+}
+
+static int force;
+module_param(force, int, 0);
+MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist");
+
+static int __init camellia_init(void)
+{
+ int err;
+
+ if (!force && is_blacklisted_cpu()) {
+		printk(KERN_INFO
+		       "camellia-x86_64: performance on this CPU would be suboptimal: disabling camellia-x86_64.\n");
+ return -ENODEV;
+ }
+
+ err = crypto_register_alg(&camellia_cipher_alg);
+ if (err)
+ return err;
+
+ err = crypto_register_skciphers(camellia_skcipher_algs,
+ ARRAY_SIZE(camellia_skcipher_algs));
+ if (err)
+ crypto_unregister_alg(&camellia_cipher_alg);
+
+ return err;
+}
+
+static void __exit camellia_fini(void)
+{
+ crypto_unregister_alg(&camellia_cipher_alg);
+ crypto_unregister_skciphers(camellia_skcipher_algs,
+ ARRAY_SIZE(camellia_skcipher_algs));
+}
+
+module_init(camellia_init);
+module_exit(camellia_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Camellia Cipher Algorithm, asm optimized");
+MODULE_ALIAS_CRYPTO("camellia");
+MODULE_ALIAS_CRYPTO("camellia-asm");
diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
new file mode 100644
index 000000000000..fb95a614249d
--- /dev/null
+++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
@@ -0,0 +1,489 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Cast5 Cipher 16-way parallel algorithm (AVX/x86_64)
+ *
+ * Copyright (C) 2012 Johannes Goetzfried
+ * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
+ *
+ * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+.file "cast5-avx-x86_64-asm_64.S"
+
+.extern cast_s1
+.extern cast_s2
+.extern cast_s3
+.extern cast_s4
+
+/* structure of crypto context */
+#define km 0
+#define kr (16*4)
+#define rr ((16*4)+16)
+
+/* s-boxes */
+#define s1 cast_s1
+#define s2 cast_s2
+#define s3 cast_s3
+#define s4 cast_s4
+
+/**********************************************************************
+ 16-way AVX cast5
+ **********************************************************************/
+#define CTX %r15
+
+#define RL1 %xmm0
+#define RR1 %xmm1
+#define RL2 %xmm2
+#define RR2 %xmm3
+#define RL3 %xmm4
+#define RR3 %xmm5
+#define RL4 %xmm6
+#define RR4 %xmm7
+
+#define RX %xmm8
+
+#define RKM %xmm9
+#define RKR %xmm10
+#define RKRF %xmm11
+#define RKRR %xmm12
+
+#define R32 %xmm13
+#define R1ST %xmm14
+
+#define RTMP %xmm15
+
+#define RID1 %rdi
+#define RID1d %edi
+#define RID2 %rsi
+#define RID2d %esi
+
+#define RGI1 %rdx
+#define RGI1bl %dl
+#define RGI1bh %dh
+#define RGI2 %rcx
+#define RGI2bl %cl
+#define RGI2bh %ch
+
+#define RGI3 %rax
+#define RGI3bl %al
+#define RGI3bh %ah
+#define RGI4 %rbx
+#define RGI4bl %bl
+#define RGI4bh %bh
+
+#define RFS1 %r8
+#define RFS1d %r8d
+#define RFS2 %r9
+#define RFS2d %r9d
+#define RFS3 %r10
+#define RFS3d %r10d
+
+
+#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
+ movzbl src ## bh, RID1d; \
+ leaq s1(%rip), RID2; \
+ movl (RID2,RID1,4), dst ## d; \
+ movzbl src ## bl, RID2d; \
+ leaq s2(%rip), RID1; \
+ op1 (RID1,RID2,4), dst ## d; \
+ shrq $16, src; \
+ movzbl src ## bh, RID1d; \
+ leaq s3(%rip), RID2; \
+ op2 (RID2,RID1,4), dst ## d; \
+ movzbl src ## bl, RID2d; \
+ interleave_op(il_reg); \
+ leaq s4(%rip), RID1; \
+ op3 (RID1,RID2,4), dst ## d;
+
+#define dummy(d) /* do nothing */
+
+#define shr_next(reg) \
+ shrq $16, reg;
+
+#define F_head(a, x, gi1, gi2, op0) \
+ op0 a, RKM, x; \
+ vpslld RKRF, x, RTMP; \
+ vpsrld RKRR, x, x; \
+ vpor RTMP, x, x; \
+ \
+ vmovq x, gi1; \
+ vpextrq $1, x, gi2;
+
+#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
+ lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
+ lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
+ \
+ lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none); \
+ shlq $32, RFS2; \
+ orq RFS1, RFS2; \
+ lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none); \
+ shlq $32, RFS1; \
+ orq RFS1, RFS3; \
+ \
+ vmovq RFS2, x; \
+ vpinsrq $1, RFS3, x, x;
+
+#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
+ F_head(b1, RX, RGI1, RGI2, op0); \
+ F_head(b2, RX, RGI3, RGI4, op0); \
+ \
+ F_tail(b1, RX, RGI1, RGI2, op1, op2, op3); \
+ F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3); \
+ \
+ vpxor a1, RX, a1; \
+ vpxor a2, RTMP, a2;
+
+#define F1_2(a1, b1, a2, b2) \
+ F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
+#define F2_2(a1, b1, a2, b2) \
+ F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
+#define F3_2(a1, b1, a2, b2) \
+ F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)
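
F1_2/F2_2/F3_2 vectorize CAST-128's three round-function types from RFC 2144:
op0 combines the masking key Km with the data (add, xor or subtract), the
result is rotated left by the rotation key Kr, and its four bytes index the
shared cast_s1..cast_s4 tables, merged with the remaining three operations.
The scalar reference functions, for comparison (rol32() as in
<linux/bitops.h>; the most significant byte indexes cast_s1):

    static u32 cast5_f1(u32 d, u32 km, u8 kr)
    {
        u32 i = rol32(km + d, kr);

        return ((cast_s1[i >> 24] ^ cast_s2[(i >> 16) & 0xff]) -
                cast_s3[(i >> 8) & 0xff]) + cast_s4[i & 0xff];
    }

    static u32 cast5_f2(u32 d, u32 km, u8 kr)
    {
        u32 i = rol32(km ^ d, kr);

        return ((cast_s1[i >> 24] - cast_s2[(i >> 16) & 0xff]) +
                cast_s3[(i >> 8) & 0xff]) ^ cast_s4[i & 0xff];
    }

    static u32 cast5_f3(u32 d, u32 km, u8 kr)
    {
        u32 i = rol32(km - d, kr);

        return ((cast_s1[i >> 24] + cast_s2[(i >> 16) & 0xff]) ^
                cast_s3[(i >> 8) & 0xff]) - cast_s4[i & 0xff];
    }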
+
+#define subround(a1, b1, a2, b2, f) \
+ F ## f ## _2(a1, b1, a2, b2);
+
+#define round(l, r, n, f) \
+ vbroadcastss (km+(4*n))(CTX), RKM; \
+ vpand R1ST, RKR, RKRF; \
+ vpsubq RKRF, R32, RKRR; \
+ vpsrldq $1, RKR, RKR; \
+ subround(l ## 1, r ## 1, l ## 2, r ## 2, f); \
+ subround(l ## 3, r ## 3, l ## 4, r ## 4, f);
+
+#define enc_preload_rkr() \
+ vbroadcastss .L16_mask(%rip), RKR; \
+ /* add 16-bit rotation to key rotations (mod 32) */ \
+ vpxor kr(CTX), RKR, RKR;
+
+#define dec_preload_rkr() \
+ vbroadcastss .L16_mask(%rip), RKR; \
+ /* add 16-bit rotation to key rotations (mod 32) */ \
+ vpxor kr(CTX), RKR, RKR; \
+ vpshufb .Lbswap128_mask(%rip), RKR, RKR;
+
+#define transpose_2x4(x0, x1, t0, t1) \
+ vpunpckldq x1, x0, t0; \
+ vpunpckhdq x1, x0, t1; \
+ \
+ vpunpcklqdq t1, t0, x0; \
+ vpunpckhqdq t1, t0, x1;
+
+#define inpack_blocks(x0, x1, t0, t1, rmask) \
+ vpshufb rmask, x0, x0; \
+ vpshufb rmask, x1, x1; \
+ \
+ transpose_2x4(x0, x1, t0, t1)
+
+#define outunpack_blocks(x0, x1, t0, t1, rmask) \
+ transpose_2x4(x0, x1, t0, t1) \
+ \
+ vpshufb rmask, x0, x0; \
+ vpshufb rmask, x1, x1;
+
+.section .rodata.cst16.bswap_mask, "aM", @progbits, 16
+.align 16
+.Lbswap_mask:
+ .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+.section .rodata.cst16.bswap128_mask, "aM", @progbits, 16
+.align 16
+.Lbswap128_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+.section .rodata.cst16.bswap_iv_mask, "aM", @progbits, 16
+.align 16
+.Lbswap_iv_mask:
+ .byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0
+
+.section .rodata.cst4.16_mask, "aM", @progbits, 4
+.align 4
+.L16_mask:
+ .byte 16, 16, 16, 16
+.section .rodata.cst4.32_mask, "aM", @progbits, 4
+.align 4
+.L32_mask:
+ .byte 32, 0, 0, 0
+.section .rodata.cst4.first_mask, "aM", @progbits, 4
+.align 4
+.Lfirst_mask:
+ .byte 0x1f, 0, 0, 0
+
+.text
+
+SYM_FUNC_START_LOCAL(__cast5_enc_blk16)
+ /* input:
+ * %rdi: ctx
+ * RL1: blocks 1 and 2
+ * RR1: blocks 3 and 4
+ * RL2: blocks 5 and 6
+ * RR2: blocks 7 and 8
+ * RL3: blocks 9 and 10
+ * RR3: blocks 11 and 12
+ * RL4: blocks 13 and 14
+ * RR4: blocks 15 and 16
+ * output:
+ * RL1: encrypted blocks 1 and 2
+ * RR1: encrypted blocks 3 and 4
+ * RL2: encrypted blocks 5 and 6
+ * RR2: encrypted blocks 7 and 8
+ * RL3: encrypted blocks 9 and 10
+ * RR3: encrypted blocks 11 and 12
+ * RL4: encrypted blocks 13 and 14
+ * RR4: encrypted blocks 15 and 16
+ */
+
+ pushq %r15;
+ pushq %rbx;
+
+ movq %rdi, CTX;
+
+ vmovdqa .Lbswap_mask(%rip), RKM;
+ vmovd .Lfirst_mask(%rip), R1ST;
+ vmovd .L32_mask(%rip), R32;
+ enc_preload_rkr();
+
+ inpack_blocks(RL1, RR1, RTMP, RX, RKM);
+ inpack_blocks(RL2, RR2, RTMP, RX, RKM);
+ inpack_blocks(RL3, RR3, RTMP, RX, RKM);
+ inpack_blocks(RL4, RR4, RTMP, RX, RKM);
+
+ round(RL, RR, 0, 1);
+ round(RR, RL, 1, 2);
+ round(RL, RR, 2, 3);
+ round(RR, RL, 3, 1);
+ round(RL, RR, 4, 2);
+ round(RR, RL, 5, 3);
+ round(RL, RR, 6, 1);
+ round(RR, RL, 7, 2);
+ round(RL, RR, 8, 3);
+ round(RR, RL, 9, 1);
+ round(RL, RR, 10, 2);
+ round(RR, RL, 11, 3);
+
+ movzbl rr(CTX), %eax;
+ testl %eax, %eax;
+ jnz .L__skip_enc;
+
+ round(RL, RR, 12, 1);
+ round(RR, RL, 13, 2);
+ round(RL, RR, 14, 3);
+ round(RR, RL, 15, 1);
+
+.L__skip_enc:
+ popq %rbx;
+ popq %r15;
+
+ vmovdqa .Lbswap_mask(%rip), RKM;
+
+ outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
+ outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
+ outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
+ outunpack_blocks(RR4, RL4, RTMP, RX, RKM);
+
+ RET;
+SYM_FUNC_END(__cast5_enc_blk16)
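
The movzbl rr(CTX)/jnz pair implements RFC 2144's reduced-round variant:
CAST-128 runs 12 rounds instead of 16 when the key is 80 bits or shorter, so
rounds 12-15 are skipped whenever the C setkey path has flagged a short key
in the context's rr byte (an assumption about cast5.h, which is not shown
here). __cast5_dec_blk16 below mirrors this by testing rr before the first
four rounds of the reverse schedule.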
+
+SYM_FUNC_START_LOCAL(__cast5_dec_blk16)
+ /* input:
+ * %rdi: ctx
+ * RL1: encrypted blocks 1 and 2
+ * RR1: encrypted blocks 3 and 4
+ * RL2: encrypted blocks 5 and 6
+ * RR2: encrypted blocks 7 and 8
+ * RL3: encrypted blocks 9 and 10
+ * RR3: encrypted blocks 11 and 12
+ * RL4: encrypted blocks 13 and 14
+ * RR4: encrypted blocks 15 and 16
+ * output:
+ * RL1: decrypted blocks 1 and 2
+ * RR1: decrypted blocks 3 and 4
+ * RL2: decrypted blocks 5 and 6
+ * RR2: decrypted blocks 7 and 8
+ * RL3: decrypted blocks 9 and 10
+ * RR3: decrypted blocks 11 and 12
+ * RL4: decrypted blocks 13 and 14
+ * RR4: decrypted blocks 15 and 16
+ */
+
+ pushq %r15;
+ pushq %rbx;
+
+ movq %rdi, CTX;
+
+ vmovdqa .Lbswap_mask(%rip), RKM;
+ vmovd .Lfirst_mask(%rip), R1ST;
+ vmovd .L32_mask(%rip), R32;
+ dec_preload_rkr();
+
+ inpack_blocks(RL1, RR1, RTMP, RX, RKM);
+ inpack_blocks(RL2, RR2, RTMP, RX, RKM);
+ inpack_blocks(RL3, RR3, RTMP, RX, RKM);
+ inpack_blocks(RL4, RR4, RTMP, RX, RKM);
+
+ movzbl rr(CTX), %eax;
+ testl %eax, %eax;
+ jnz .L__skip_dec;
+
+ round(RL, RR, 15, 1);
+ round(RR, RL, 14, 3);
+ round(RL, RR, 13, 2);
+ round(RR, RL, 12, 1);
+
+.L__dec_tail:
+ round(RL, RR, 11, 3);
+ round(RR, RL, 10, 2);
+ round(RL, RR, 9, 1);
+ round(RR, RL, 8, 3);
+ round(RL, RR, 7, 2);
+ round(RR, RL, 6, 1);
+ round(RL, RR, 5, 3);
+ round(RR, RL, 4, 2);
+ round(RL, RR, 3, 1);
+ round(RR, RL, 2, 3);
+ round(RL, RR, 1, 2);
+ round(RR, RL, 0, 1);
+
+ vmovdqa .Lbswap_mask(%rip), RKM;
+ popq %rbx;
+ popq %r15;
+
+ outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
+ outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
+ outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
+ outunpack_blocks(RR4, RL4, RTMP, RX, RKM);
+
+ RET;
+
+.L__skip_dec:
+ vpsrldq $4, RKR, RKR;
+ jmp .L__dec_tail;
+SYM_FUNC_END(__cast5_dec_blk16)
+
+SYM_FUNC_START(cast5_ecb_enc_16way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+ pushq %r15;
+
+ movq %rdi, CTX;
+ movq %rsi, %r11;
+
+ vmovdqu (0*4*4)(%rdx), RL1;
+ vmovdqu (1*4*4)(%rdx), RR1;
+ vmovdqu (2*4*4)(%rdx), RL2;
+ vmovdqu (3*4*4)(%rdx), RR2;
+ vmovdqu (4*4*4)(%rdx), RL3;
+ vmovdqu (5*4*4)(%rdx), RR3;
+ vmovdqu (6*4*4)(%rdx), RL4;
+ vmovdqu (7*4*4)(%rdx), RR4;
+
+ call __cast5_enc_blk16;
+
+ vmovdqu RR1, (0*4*4)(%r11);
+ vmovdqu RL1, (1*4*4)(%r11);
+ vmovdqu RR2, (2*4*4)(%r11);
+ vmovdqu RL2, (3*4*4)(%r11);
+ vmovdqu RR3, (4*4*4)(%r11);
+ vmovdqu RL3, (5*4*4)(%r11);
+ vmovdqu RR4, (6*4*4)(%r11);
+ vmovdqu RL4, (7*4*4)(%r11);
+
+ popq %r15;
+ FRAME_END
+ RET;
+SYM_FUNC_END(cast5_ecb_enc_16way)
+
+SYM_FUNC_START(cast5_ecb_dec_16way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ FRAME_BEGIN
+ pushq %r15;
+
+ movq %rdi, CTX;
+ movq %rsi, %r11;
+
+ vmovdqu (0*4*4)(%rdx), RL1;
+ vmovdqu (1*4*4)(%rdx), RR1;
+ vmovdqu (2*4*4)(%rdx), RL2;
+ vmovdqu (3*4*4)(%rdx), RR2;
+ vmovdqu (4*4*4)(%rdx), RL3;
+ vmovdqu (5*4*4)(%rdx), RR3;
+ vmovdqu (6*4*4)(%rdx), RL4;
+ vmovdqu (7*4*4)(%rdx), RR4;
+
+ call __cast5_dec_blk16;
+
+ vmovdqu RR1, (0*4*4)(%r11);
+ vmovdqu RL1, (1*4*4)(%r11);
+ vmovdqu RR2, (2*4*4)(%r11);
+ vmovdqu RL2, (3*4*4)(%r11);
+ vmovdqu RR3, (4*4*4)(%r11);
+ vmovdqu RL3, (5*4*4)(%r11);
+ vmovdqu RR4, (6*4*4)(%r11);
+ vmovdqu RL4, (7*4*4)(%r11);
+
+ popq %r15;
+ FRAME_END
+ RET;
+SYM_FUNC_END(cast5_ecb_dec_16way)
+
+SYM_FUNC_START(cast5_cbc_dec_16way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+ pushq %r12;
+ pushq %r15;
+
+ movq %rdi, CTX;
+ movq %rsi, %r11;
+ movq %rdx, %r12;
+
+ vmovdqu (0*16)(%rdx), RL1;
+ vmovdqu (1*16)(%rdx), RR1;
+ vmovdqu (2*16)(%rdx), RL2;
+ vmovdqu (3*16)(%rdx), RR2;
+ vmovdqu (4*16)(%rdx), RL3;
+ vmovdqu (5*16)(%rdx), RR3;
+ vmovdqu (6*16)(%rdx), RL4;
+ vmovdqu (7*16)(%rdx), RR4;
+
+ call __cast5_dec_blk16;
+
+ /* xor with src */
+ vmovq (%r12), RX;
+ vpshufd $0x4f, RX, RX;
+ vpxor RX, RR1, RR1;
+ vpxor 0*16+8(%r12), RL1, RL1;
+ vpxor 1*16+8(%r12), RR2, RR2;
+ vpxor 2*16+8(%r12), RL2, RL2;
+ vpxor 3*16+8(%r12), RR3, RR3;
+ vpxor 4*16+8(%r12), RL3, RL3;
+ vpxor 5*16+8(%r12), RR4, RR4;
+ vpxor 6*16+8(%r12), RL4, RL4;
+
+ vmovdqu RR1, (0*16)(%r11);
+ vmovdqu RL1, (1*16)(%r11);
+ vmovdqu RR2, (2*16)(%r11);
+ vmovdqu RL2, (3*16)(%r11);
+ vmovdqu RR3, (4*16)(%r11);
+ vmovdqu RL3, (5*16)(%r11);
+ vmovdqu RR4, (6*16)(%r11);
+ vmovdqu RL4, (7*16)(%r11);
+
+ popq %r15;
+ popq %r12;
+ FRAME_END
+ RET;
+SYM_FUNC_END(cast5_cbc_dec_16way)
diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
new file mode 100644
index 000000000000..3aca04d43b34
--- /dev/null
+++ b/arch/x86/crypto/cast5_avx_glue.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Glue Code for the AVX assembler implementation of the Cast5 Cipher
+ *
+ * Copyright (C) 2012 Johannes Goetzfried
+ * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/cast5.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+#include "ecb_cbc_helpers.h"
+
+#define CAST5_PARALLEL_BLOCKS 16
+
+asmlinkage void cast5_ecb_enc_16way(struct cast5_ctx *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void cast5_ecb_dec_16way(struct cast5_ctx *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void cast5_cbc_dec_16way(struct cast5_ctx *ctx, u8 *dst,
+ const u8 *src);
+
+static int cast5_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ return cast5_setkey(&tfm->base, key, keylen);
+}
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+ ECB_WALK_START(req, CAST5_BLOCK_SIZE, CAST5_PARALLEL_BLOCKS);
+ ECB_BLOCK(CAST5_PARALLEL_BLOCKS, cast5_ecb_enc_16way);
+ ECB_BLOCK(1, __cast5_encrypt);
+ ECB_WALK_END();
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+ ECB_WALK_START(req, CAST5_BLOCK_SIZE, CAST5_PARALLEL_BLOCKS);
+ ECB_BLOCK(CAST5_PARALLEL_BLOCKS, cast5_ecb_dec_16way);
+ ECB_BLOCK(1, __cast5_decrypt);
+ ECB_WALK_END();
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+ CBC_WALK_START(req, CAST5_BLOCK_SIZE, -1);
+ CBC_ENC_BLOCK(__cast5_encrypt);
+ CBC_WALK_END();
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ CBC_WALK_START(req, CAST5_BLOCK_SIZE, CAST5_PARALLEL_BLOCKS);
+ CBC_DEC_BLOCK(CAST5_PARALLEL_BLOCKS, cast5_cbc_dec_16way);
+ CBC_DEC_BLOCK(1, __cast5_decrypt);
+ CBC_WALK_END();
+}
+
+static struct skcipher_alg cast5_algs[] = {
+ {
+ .base.cra_name = "ecb(cast5)",
+ .base.cra_driver_name = "ecb-cast5-avx",
+ .base.cra_priority = 200,
+ .base.cra_blocksize = CAST5_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct cast5_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = CAST5_MIN_KEY_SIZE,
+ .max_keysize = CAST5_MAX_KEY_SIZE,
+ .setkey = cast5_setkey_skcipher,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ }, {
+ .base.cra_name = "cbc(cast5)",
+ .base.cra_driver_name = "cbc-cast5-avx",
+ .base.cra_priority = 200,
+ .base.cra_blocksize = CAST5_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct cast5_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = CAST5_MIN_KEY_SIZE,
+ .max_keysize = CAST5_MAX_KEY_SIZE,
+ .ivsize = CAST5_BLOCK_SIZE,
+ .setkey = cast5_setkey_skcipher,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ }
+};
+
+static int __init cast5_init(void)
+{
+ const char *feature_name;
+
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ &feature_name)) {
+ pr_info("CPU feature '%s' is not supported.\n", feature_name);
+ return -ENODEV;
+ }
+
+ return crypto_register_skciphers(cast5_algs,
+ ARRAY_SIZE(cast5_algs));
+}
+
+static void __exit cast5_exit(void)
+{
+ crypto_unregister_skciphers(cast5_algs, ARRAY_SIZE(cast5_algs));
+}
+
+module_init(cast5_init);
+module_exit(cast5_exit);
+
+MODULE_DESCRIPTION("Cast5 Cipher Algorithm, AVX optimized");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_CRYPTO("cast5");
diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
new file mode 100644
index 000000000000..9e86d460b409
--- /dev/null
+++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
@@ -0,0 +1,416 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Cast6 Cipher 8-way parallel algorithm (AVX/x86_64)
+ *
+ * Copyright (C) 2012 Johannes Goetzfried
+ * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include "glue_helper-asm-avx.S"
+
+.file "cast6-avx-x86_64-asm_64.S"
+
+.extern cast_s1
+.extern cast_s2
+.extern cast_s3
+.extern cast_s4
+
+/* structure of crypto context */
+#define km 0
+#define kr (12*4*4)
+
+/* s-boxes */
+#define s1 cast_s1
+#define s2 cast_s2
+#define s3 cast_s3
+#define s4 cast_s4
+
+/**********************************************************************
+ 8-way AVX cast6
+ **********************************************************************/
+#define CTX %r15
+
+#define RA1 %xmm0
+#define RB1 %xmm1
+#define RC1 %xmm2
+#define RD1 %xmm3
+
+#define RA2 %xmm4
+#define RB2 %xmm5
+#define RC2 %xmm6
+#define RD2 %xmm7
+
+#define RX %xmm8
+
+#define RKM %xmm9
+#define RKR %xmm10
+#define RKRF %xmm11
+#define RKRR %xmm12
+#define R32 %xmm13
+#define R1ST %xmm14
+
+#define RTMP %xmm15
+
+#define RID1 %rdi
+#define RID1d %edi
+#define RID2 %rsi
+#define RID2d %esi
+
+#define RGI1 %rdx
+#define RGI1bl %dl
+#define RGI1bh %dh
+#define RGI2 %rcx
+#define RGI2bl %cl
+#define RGI2bh %ch
+
+#define RGI3 %rax
+#define RGI3bl %al
+#define RGI3bh %ah
+#define RGI4 %rbx
+#define RGI4bl %bl
+#define RGI4bh %bh
+
+#define RFS1 %r8
+#define RFS1d %r8d
+#define RFS2 %r9
+#define RFS2d %r9d
+#define RFS3 %r10
+#define RFS3d %r10d
+
+
+#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
+ movzbl src ## bh, RID1d; \
+ leaq s1(%rip), RID2; \
+ movl (RID2,RID1,4), dst ## d; \
+ movzbl src ## bl, RID2d; \
+ leaq s2(%rip), RID1; \
+ op1 (RID1,RID2,4), dst ## d; \
+ shrq $16, src; \
+ movzbl src ## bh, RID1d; \
+ leaq s3(%rip), RID2; \
+ op2 (RID2,RID1,4), dst ## d; \
+ movzbl src ## bl, RID2d; \
+ interleave_op(il_reg); \
+ leaq s4(%rip), RID1; \
+ op3 (RID1,RID2,4), dst ## d;
+
+#define dummy(d) /* do nothing */
+
+#define shr_next(reg) \
+ shrq $16, reg;
+
+#define F_head(a, x, gi1, gi2, op0) \
+ op0 a, RKM, x; \
+ vpslld RKRF, x, RTMP; \
+ vpsrld RKRR, x, x; \
+ vpor RTMP, x, x; \
+ \
+ vmovq x, gi1; \
+ vpextrq $1, x, gi2;
+
+#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
+ lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
+ lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
+ \
+ lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none); \
+ shlq $32, RFS2; \
+ orq RFS1, RFS2; \
+ lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none); \
+ shlq $32, RFS1; \
+ orq RFS1, RFS3; \
+ \
+ vmovq RFS2, x; \
+ vpinsrq $1, RFS3, x, x;
+
+#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
+ F_head(b1, RX, RGI1, RGI2, op0); \
+ F_head(b2, RX, RGI3, RGI4, op0); \
+ \
+ F_tail(b1, RX, RGI1, RGI2, op1, op2, op3); \
+ F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3); \
+ \
+ vpxor a1, RX, a1; \
+ vpxor a2, RTMP, a2;
+
+#define F1_2(a1, b1, a2, b2) \
+ F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
+#define F2_2(a1, b1, a2, b2) \
+ F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
+#define F3_2(a1, b1, a2, b2) \
+ F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)
+
+#define qop(in, out, f) \
+ F ## f ## _2(out ## 1, in ## 1, out ## 2, in ## 2);
+
+#define get_round_keys(nn) \
+ vbroadcastss (km+(4*(nn)))(CTX), RKM; \
+ vpand R1ST, RKR, RKRF; \
+ vpsubq RKRF, R32, RKRR; \
+ vpsrldq $1, RKR, RKR;
+
+#define Q(n) \
+ get_round_keys(4*n+0); \
+ qop(RD, RC, 1); \
+ \
+ get_round_keys(4*n+1); \
+ qop(RC, RB, 2); \
+ \
+ get_round_keys(4*n+2); \
+ qop(RB, RA, 3); \
+ \
+ get_round_keys(4*n+3); \
+ qop(RA, RD, 1);
+
+#define QBAR(n) \
+ get_round_keys(4*n+3); \
+ qop(RA, RD, 1); \
+ \
+ get_round_keys(4*n+2); \
+ qop(RB, RA, 3); \
+ \
+ get_round_keys(4*n+1); \
+ qop(RC, RB, 2); \
+ \
+ get_round_keys(4*n+0); \
+ qop(RD, RC, 1);
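
Q and QBAR are CAST-256's forward and inverse quadruple rounds from RFC 2612,
each applied here to two four-word blocks in parallel. In scalar form, with
(a, b, c, d) the block words and f1/f2/f3 the CAST round functions:

    /* forward quad-round Q(i), per RFC 2612 */
    c ^= f1(d, km[4*i + 0], kr[4*i + 0]);
    b ^= f2(c, km[4*i + 1], kr[4*i + 1]);
    a ^= f3(b, km[4*i + 2], kr[4*i + 2]);
    d ^= f1(a, km[4*i + 3], kr[4*i + 3]);

    /* inverse quad-round QBAR(i): the same four steps, reversed */
    d ^= f1(a, km[4*i + 3], kr[4*i + 3]);
    a ^= f3(b, km[4*i + 2], kr[4*i + 2]);
    b ^= f2(c, km[4*i + 1], kr[4*i + 1]);
    c ^= f1(d, km[4*i + 0], kr[4*i + 0]);

Encryption applies six Q rounds followed by six QBAR rounds; decryption
(__cast6_dec_blk8 below) runs the inverse sequence, with the per-group key
rotations reshuffled by the .Lrkr_dec_* masks.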
+
+#define shuffle(mask) \
+ vpshufb mask(%rip), RKR, RKR;
+
+#define preload_rkr(n, do_mask, mask) \
+ vbroadcastss .L16_mask(%rip), RKR; \
+ /* add 16-bit rotation to key rotations (mod 32) */ \
+ vpxor (kr+n*16)(CTX), RKR, RKR; \
+ do_mask(mask);
+
+#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ vpunpckldq x1, x0, t0; \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x3; \
+ \
+ vpunpcklqdq t1, t0, x0; \
+ vpunpckhqdq t1, t0, x1; \
+ vpunpcklqdq x3, t2, x2; \
+ vpunpckhqdq x3, t2, x3;
+
+#define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
+ vpshufb rmask, x0, x0; \
+ vpshufb rmask, x1, x1; \
+ vpshufb rmask, x2, x2; \
+ vpshufb rmask, x3, x3; \
+ \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+#define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ \
+ vpshufb rmask, x0, x0; \
+ vpshufb rmask, x1, x1; \
+ vpshufb rmask, x2, x2; \
+ vpshufb rmask, x3, x3;
+
+.section .rodata.cst16, "aM", @progbits, 16
+.align 16
+.Lbswap_mask:
+ .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+.Lbswap128_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+.Lrkr_enc_Q_Q_QBAR_QBAR:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
+.Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
+ .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+.Lrkr_dec_Q_Q_Q_Q:
+ .byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
+.Lrkr_dec_Q_Q_QBAR_QBAR:
+ .byte 12, 13, 14, 15, 8, 9, 10, 11, 7, 6, 5, 4, 3, 2, 1, 0
+.Lrkr_dec_QBAR_QBAR_QBAR_QBAR:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+.section .rodata.cst4.L16_mask, "aM", @progbits, 4
+.align 4
+.L16_mask:
+ .byte 16, 16, 16, 16
+
+.section .rodata.cst4.L32_mask, "aM", @progbits, 4
+.align 4
+.L32_mask:
+ .byte 32, 0, 0, 0
+
+.section .rodata.cst4.first_mask, "aM", @progbits, 4
+.align 4
+.Lfirst_mask:
+ .byte 0x1f, 0, 0, 0
+
+.text
+
+.align 8
+SYM_FUNC_START_LOCAL(__cast6_enc_blk8)
+ /* input:
+ * %rdi: ctx
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
+ * output:
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
+ */
+
+ pushq %r15;
+ pushq %rbx;
+
+ movq %rdi, CTX;
+
+ vmovdqa .Lbswap_mask(%rip), RKM;
+ vmovd .Lfirst_mask(%rip), R1ST;
+ vmovd .L32_mask(%rip), R32;
+
+ inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+ inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
+
+ preload_rkr(0, dummy, none);
+ Q(0);
+ Q(1);
+ Q(2);
+ Q(3);
+ preload_rkr(1, shuffle, .Lrkr_enc_Q_Q_QBAR_QBAR);
+ Q(4);
+ Q(5);
+ QBAR(6);
+ QBAR(7);
+ preload_rkr(2, shuffle, .Lrkr_enc_QBAR_QBAR_QBAR_QBAR);
+ QBAR(8);
+ QBAR(9);
+ QBAR(10);
+ QBAR(11);
+
+ popq %rbx;
+ popq %r15;
+
+ vmovdqa .Lbswap_mask(%rip), RKM;
+
+ outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+ outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
+
+ RET;
+SYM_FUNC_END(__cast6_enc_blk8)
+
+.align 8
+SYM_FUNC_START_LOCAL(__cast6_dec_blk8)
+ /* input:
+ * %rdi: ctx
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
+ * output:
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
+ */
+
+ pushq %r15;
+ pushq %rbx;
+
+ movq %rdi, CTX;
+
+ vmovdqa .Lbswap_mask(%rip), RKM;
+ vmovd .Lfirst_mask(%rip), R1ST;
+ vmovd .L32_mask(%rip), R32;
+
+ inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+ inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
+
+ preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
+ Q(11);
+ Q(10);
+ Q(9);
+ Q(8);
+ preload_rkr(1, shuffle, .Lrkr_dec_Q_Q_QBAR_QBAR);
+ Q(7);
+ Q(6);
+ QBAR(5);
+ QBAR(4);
+ preload_rkr(0, shuffle, .Lrkr_dec_QBAR_QBAR_QBAR_QBAR);
+ QBAR(3);
+ QBAR(2);
+ QBAR(1);
+ QBAR(0);
+
+ popq %rbx;
+ popq %r15;
+
+ vmovdqa .Lbswap_mask(%rip), RKM;
+ outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+ outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
+
+ RET;
+SYM_FUNC_END(__cast6_dec_blk8)
+
+SYM_FUNC_START(cast6_ecb_enc_8way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+ pushq %r15;
+
+ movq %rdi, CTX;
+ movq %rsi, %r11;
+
+ load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ call __cast6_enc_blk8;
+
+ store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ popq %r15;
+ FRAME_END
+ RET;
+SYM_FUNC_END(cast6_ecb_enc_8way)
+
+SYM_FUNC_START(cast6_ecb_dec_8way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+ pushq %r15;
+
+ movq %rdi, CTX;
+ movq %rsi, %r11;
+
+ load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ call __cast6_dec_blk8;
+
+ store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ popq %r15;
+ FRAME_END
+ RET;
+SYM_FUNC_END(cast6_ecb_dec_8way)
+
+SYM_FUNC_START(cast6_cbc_dec_8way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+ pushq %r12;
+ pushq %r15;
+
+ movq %rdi, CTX;
+ movq %rsi, %r11;
+ movq %rdx, %r12;
+
+ load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ call __cast6_dec_blk8;
+
+ store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ popq %r15;
+ popq %r12;
+ FRAME_END
+ RET;
+SYM_FUNC_END(cast6_cbc_dec_8way)
diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c
new file mode 100644
index 000000000000..c4dd28c30303
--- /dev/null
+++ b/arch/x86/crypto/cast6_avx_glue.c
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Glue Code for the AVX assembler implementation of the Cast6 Cipher
+ *
+ * Copyright (C) 2012 Johannes Goetzfried
+ * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
+ *
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/cast6.h>
+
+#include "ecb_cbc_helpers.h"
+
+#define CAST6_PARALLEL_BLOCKS 8
+
+asmlinkage void cast6_ecb_enc_8way(const void *ctx, u8 *dst, const u8 *src);
+asmlinkage void cast6_ecb_dec_8way(const void *ctx, u8 *dst, const u8 *src);
+
+asmlinkage void cast6_cbc_dec_8way(const void *ctx, u8 *dst, const u8 *src);
+
+static int cast6_setkey_skcipher(struct crypto_skcipher *tfm,
+ const u8 *key, unsigned int keylen)
+{
+ return cast6_setkey(&tfm->base, key, keylen);
+}
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+ ECB_WALK_START(req, CAST6_BLOCK_SIZE, CAST6_PARALLEL_BLOCKS);
+ ECB_BLOCK(CAST6_PARALLEL_BLOCKS, cast6_ecb_enc_8way);
+ ECB_BLOCK(1, __cast6_encrypt);
+ ECB_WALK_END();
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+ ECB_WALK_START(req, CAST6_BLOCK_SIZE, CAST6_PARALLEL_BLOCKS);
+ ECB_BLOCK(CAST6_PARALLEL_BLOCKS, cast6_ecb_dec_8way);
+ ECB_BLOCK(1, __cast6_decrypt);
+ ECB_WALK_END();
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+ CBC_WALK_START(req, CAST6_BLOCK_SIZE, -1);
+ CBC_ENC_BLOCK(__cast6_encrypt);
+ CBC_WALK_END();
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ CBC_WALK_START(req, CAST6_BLOCK_SIZE, CAST6_PARALLEL_BLOCKS);
+ CBC_DEC_BLOCK(CAST6_PARALLEL_BLOCKS, cast6_cbc_dec_8way);
+ CBC_DEC_BLOCK(1, __cast6_decrypt);
+ CBC_WALK_END();
+}
+
+static struct skcipher_alg cast6_algs[] = {
+ {
+ .base.cra_name = "ecb(cast6)",
+ .base.cra_driver_name = "ecb-cast6-avx",
+ .base.cra_priority = 200,
+ .base.cra_blocksize = CAST6_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct cast6_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = CAST6_MIN_KEY_SIZE,
+ .max_keysize = CAST6_MAX_KEY_SIZE,
+ .setkey = cast6_setkey_skcipher,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ }, {
+ .base.cra_name = "cbc(cast6)",
+ .base.cra_driver_name = "cbc-cast6-avx",
+ .base.cra_priority = 200,
+ .base.cra_blocksize = CAST6_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct cast6_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = CAST6_MIN_KEY_SIZE,
+ .max_keysize = CAST6_MAX_KEY_SIZE,
+ .ivsize = CAST6_BLOCK_SIZE,
+ .setkey = cast6_setkey_skcipher,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ },
+};
+
+static int __init cast6_init(void)
+{
+ const char *feature_name;
+
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ &feature_name)) {
+ pr_info("CPU feature '%s' is not supported.\n", feature_name);
+ return -ENODEV;
+ }
+
+ return crypto_register_skciphers(cast6_algs, ARRAY_SIZE(cast6_algs));
+}
+
+static void __exit cast6_exit(void)
+{
+ crypto_unregister_skciphers(cast6_algs, ARRAY_SIZE(cast6_algs));
+}
+
+module_init(cast6_init);
+module_exit(cast6_exit);
+
+MODULE_DESCRIPTION("Cast6 Cipher Algorithm, AVX optimized");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_CRYPTO("cast6");
diff --git a/arch/x86/crypto/crc32c-intel.c b/arch/x86/crypto/crc32c-intel.c
deleted file mode 100644
index b9d00261703c..000000000000
--- a/arch/x86/crypto/crc32c-intel.c
+++ /dev/null
@@ -1,198 +0,0 @@
-/*
- * Using hardware provided CRC32 instruction to accelerate the CRC32 disposal.
- * CRC32C polynomial:0x1EDC6F41(BE)/0x82F63B78(LE)
- * CRC32 is a new instruction in Intel SSE4.2, the reference can be found at:
- * http://www.intel.com/products/processor/manuals/
- * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
- * Volume 2A: Instruction Set Reference, A-M
- *
- * Copyright (C) 2008 Intel Corporation
- * Authors: Austin Zhang <austin_zhang@linux.intel.com>
- * Kent Liu <kent.liu@intel.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
- *
- */
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <crypto/internal/hash.h>
-
-#include <asm/cpufeature.h>
-
-#define CHKSUM_BLOCK_SIZE 1
-#define CHKSUM_DIGEST_SIZE 4
-
-#define SCALE_F sizeof(unsigned long)
-
-#ifdef CONFIG_X86_64
-#define REX_PRE "0x48, "
-#else
-#define REX_PRE
-#endif
-
-static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length)
-{
- while (length--) {
- __asm__ __volatile__(
- ".byte 0xf2, 0xf, 0x38, 0xf0, 0xf1"
- :"=S"(crc)
- :"0"(crc), "c"(*data)
- );
- data++;
- }
-
- return crc;
-}
-
-static u32 __pure crc32c_intel_le_hw(u32 crc, unsigned char const *p, size_t len)
-{
- unsigned int iquotient = len / SCALE_F;
- unsigned int iremainder = len % SCALE_F;
- unsigned long *ptmp = (unsigned long *)p;
-
- while (iquotient--) {
- __asm__ __volatile__(
- ".byte 0xf2, " REX_PRE "0xf, 0x38, 0xf1, 0xf1;"
- :"=S"(crc)
- :"0"(crc), "c"(*ptmp)
- );
- ptmp++;
- }
-
- if (iremainder)
- crc = crc32c_intel_le_hw_byte(crc, (unsigned char *)ptmp,
- iremainder);
-
- return crc;
-}
-
-/*
- * Setting the seed allows arbitrary accumulators and flexible XOR policy
- * If your algorithm starts with ~0, then XOR with ~0 before you set
- * the seed.
- */
-static int crc32c_intel_setkey(struct crypto_shash *hash, const u8 *key,
- unsigned int keylen)
-{
- u32 *mctx = crypto_shash_ctx(hash);
-
- if (keylen != sizeof(u32)) {
- crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
- return -EINVAL;
- }
- *mctx = le32_to_cpup((__le32 *)key);
- return 0;
-}
-
-static int crc32c_intel_init(struct shash_desc *desc)
-{
- u32 *mctx = crypto_shash_ctx(desc->tfm);
- u32 *crcp = shash_desc_ctx(desc);
-
- *crcp = *mctx;
-
- return 0;
-}
-
-static int crc32c_intel_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- u32 *crcp = shash_desc_ctx(desc);
-
- *crcp = crc32c_intel_le_hw(*crcp, data, len);
- return 0;
-}
-
-static int __crc32c_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
- u8 *out)
-{
- *(__le32 *)out = ~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len));
- return 0;
-}
-
-static int crc32c_intel_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return __crc32c_intel_finup(shash_desc_ctx(desc), data, len, out);
-}
-
-static int crc32c_intel_final(struct shash_desc *desc, u8 *out)
-{
- u32 *crcp = shash_desc_ctx(desc);
-
- *(__le32 *)out = ~cpu_to_le32p(crcp);
- return 0;
-}
-
-static int crc32c_intel_digest(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return __crc32c_intel_finup(crypto_shash_ctx(desc->tfm), data, len,
- out);
-}
-
-static int crc32c_intel_cra_init(struct crypto_tfm *tfm)
-{
- u32 *key = crypto_tfm_ctx(tfm);
-
- *key = ~0;
-
- return 0;
-}
-
-static struct shash_alg alg = {
- .setkey = crc32c_intel_setkey,
- .init = crc32c_intel_init,
- .update = crc32c_intel_update,
- .final = crc32c_intel_final,
- .finup = crc32c_intel_finup,
- .digest = crc32c_intel_digest,
- .descsize = sizeof(u32),
- .digestsize = CHKSUM_DIGEST_SIZE,
- .base = {
- .cra_name = "crc32c",
- .cra_driver_name = "crc32c-intel",
- .cra_priority = 200,
- .cra_blocksize = CHKSUM_BLOCK_SIZE,
- .cra_ctxsize = sizeof(u32),
- .cra_module = THIS_MODULE,
- .cra_init = crc32c_intel_cra_init,
- }
-};
-
-
-static int __init crc32c_intel_mod_init(void)
-{
- if (cpu_has_xmm4_2)
- return crypto_register_shash(&alg);
- else
- return -ENODEV;
-}
-
-static void __exit crc32c_intel_mod_fini(void)
-{
- crypto_unregister_shash(&alg);
-}
-
-module_init(crc32c_intel_mod_init);
-module_exit(crc32c_intel_mod_fini);
-
-MODULE_AUTHOR("Austin Zhang <austin.zhang@intel.com>, Kent Liu <kent.liu@intel.com>");
-MODULE_DESCRIPTION("CRC32c (Castagnoli) optimization using Intel Hardware.");
-MODULE_LICENSE("GPL");
-
-MODULE_ALIAS("crc32c");
-MODULE_ALIAS("crc32c-intel");
diff --git a/arch/x86/crypto/des3_ede-asm_64.S b/arch/x86/crypto/des3_ede-asm_64.S
new file mode 100644
index 000000000000..cf21b998e77c
--- /dev/null
+++ b/arch/x86/crypto/des3_ede-asm_64.S
@@ -0,0 +1,831 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * des3_ede-asm_64.S - x86-64 assembly implementation of 3DES cipher
+ *
+ * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ */
+
+#include <linux/linkage.h>
+
+.file "des3_ede-asm_64.S"
+.text
+
+#define s1 .L_s1
+#define s2 ((s1) + (64*8))
+#define s3 ((s2) + (64*8))
+#define s4 ((s3) + (64*8))
+#define s5 ((s4) + (64*8))
+#define s6 ((s5) + (64*8))
+#define s7 ((s6) + (64*8))
+#define s8 ((s7) + (64*8))
+
+/* register macros */
+#define CTX %rdi
+
+#define RL0 %r8
+#define RL1 %r9
+#define RL2 %r10
+
+#define RL0d %r8d
+#define RL1d %r9d
+#define RL2d %r10d
+
+#define RR0 %r11
+#define RR1 %r12
+#define RR2 %r13
+
+#define RR0d %r11d
+#define RR1d %r12d
+#define RR2d %r13d
+
+#define RW0 %rax
+#define RW1 %rbx
+#define RW2 %rcx
+
+#define RW0d %eax
+#define RW1d %ebx
+#define RW2d %ecx
+
+#define RW0bl %al
+#define RW1bl %bl
+#define RW2bl %cl
+
+#define RW0bh %ah
+#define RW1bh %bh
+#define RW2bh %ch
+
+#define RT0 %r15
+#define RT1 %rsi
+#define RT2 %r14
+#define RT3 %rdx
+
+#define RT0d %r15d
+#define RT1d %esi
+#define RT2d %r14d
+#define RT3d %edx
+
+/***********************************************************************
+ * 1-way 3DES
+ ***********************************************************************/
+#define do_permutation(a, b, offset, mask) \
+ movl a, RT0d; \
+ shrl $(offset), RT0d; \
+ xorl b, RT0d; \
+ andl $(mask), RT0d; \
+ xorl RT0d, b; \
+ shll $(offset), RT0d; \
+ xorl RT0d, a;
+
+#define expand_to_64bits(val, mask) \
+ movl val##d, RT0d; \
+ rorl $4, RT0d; \
+ shlq $32, RT0; \
+ orq RT0, val; \
+ andq mask, val;
+
+#define compress_to_64bits(val) \
+ movq val, RT0; \
+ shrq $32, RT0; \
+ roll $4, RT0d; \
+ orl RT0d, val##d;
+
+#define initial_permutation(left, right) \
+ do_permutation(left##d, right##d, 4, 0x0f0f0f0f); \
+ do_permutation(left##d, right##d, 16, 0x0000ffff); \
+ do_permutation(right##d, left##d, 2, 0x33333333); \
+ do_permutation(right##d, left##d, 8, 0x00ff00ff); \
+ movabs $0x3f3f3f3f3f3f3f3f, RT3; \
+ movl left##d, RW0d; \
+ roll $1, right##d; \
+ xorl right##d, RW0d; \
+ andl $0xaaaaaaaa, RW0d; \
+ xorl RW0d, left##d; \
+ xorl RW0d, right##d; \
+ roll $1, left##d; \
+ expand_to_64bits(right, RT3); \
+ expand_to_64bits(left, RT3);
+
+#define final_permutation(left, right) \
+ compress_to_64bits(right); \
+ compress_to_64bits(left); \
+ movl right##d, RW0d; \
+ rorl $1, left##d; \
+ xorl left##d, RW0d; \
+ andl $0xaaaaaaaa, RW0d; \
+ xorl RW0d, right##d; \
+ xorl RW0d, left##d; \
+ rorl $1, right##d; \
+ do_permutation(right##d, left##d, 8, 0x00ff00ff); \
+ do_permutation(right##d, left##d, 2, 0x33333333); \
+ do_permutation(left##d, right##d, 16, 0x0000ffff); \
+ do_permutation(left##d, right##d, 4, 0x0f0f0f0f);
+
+#define round1(n, from, to, load_next_key) \
+ xorq from, RW0; \
+ \
+ movzbl RW0bl, RT0d; \
+ movzbl RW0bh, RT1d; \
+ shrq $16, RW0; \
+ movzbl RW0bl, RT2d; \
+ movzbl RW0bh, RT3d; \
+ shrq $16, RW0; \
+ leaq s8(%rip), RW1; \
+ movq (RW1, RT0, 8), RT0; \
+ leaq s6(%rip), RW1; \
+ xorq (RW1, RT1, 8), to; \
+ movzbl RW0bl, RL1d; \
+ movzbl RW0bh, RT1d; \
+ shrl $16, RW0d; \
+ leaq s4(%rip), RW1; \
+ xorq (RW1, RT2, 8), RT0; \
+ leaq s2(%rip), RW1; \
+ xorq (RW1, RT3, 8), to; \
+ movzbl RW0bl, RT2d; \
+ movzbl RW0bh, RT3d; \
+ leaq s7(%rip), RW1; \
+ xorq (RW1, RL1, 8), RT0; \
+ leaq s5(%rip), RW1; \
+ xorq (RW1, RT1, 8), to; \
+ leaq s3(%rip), RW1; \
+ xorq (RW1, RT2, 8), RT0; \
+ load_next_key(n, RW0); \
+ xorq RT0, to; \
+ leaq s1(%rip), RW1; \
+ xorq (RW1, RT3, 8), to; \
+
+#define load_next_key(n, RWx) \
+ movq (((n) + 1) * 8)(CTX), RWx;
+
+#define dummy2(a, b) /*_*/
+
+#define read_block(io, left, right) \
+ movl (io), left##d; \
+ movl 4(io), right##d; \
+ bswapl left##d; \
+ bswapl right##d;
+
+#define write_block(io, left, right) \
+ bswapl left##d; \
+ bswapl right##d; \
+ movl left##d, (io); \
+ movl right##d, 4(io);
+
+SYM_FUNC_START(des3_ede_x86_64_crypt_blk)
+ /* input:
+ * %rdi: round keys, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ pushq %rbx;
+ pushq %r12;
+ pushq %r13;
+ pushq %r14;
+ pushq %r15;
+
+ pushq %rsi; /* dst */
+
+ read_block(%rdx, RL0, RR0);
+ initial_permutation(RL0, RR0);
+
+ movq (CTX), RW0;
+
+ round1(0, RR0, RL0, load_next_key);
+ round1(1, RL0, RR0, load_next_key);
+ round1(2, RR0, RL0, load_next_key);
+ round1(3, RL0, RR0, load_next_key);
+ round1(4, RR0, RL0, load_next_key);
+ round1(5, RL0, RR0, load_next_key);
+ round1(6, RR0, RL0, load_next_key);
+ round1(7, RL0, RR0, load_next_key);
+ round1(8, RR0, RL0, load_next_key);
+ round1(9, RL0, RR0, load_next_key);
+ round1(10, RR0, RL0, load_next_key);
+ round1(11, RL0, RR0, load_next_key);
+ round1(12, RR0, RL0, load_next_key);
+ round1(13, RL0, RR0, load_next_key);
+ round1(14, RR0, RL0, load_next_key);
+ round1(15, RL0, RR0, load_next_key);
+
+ round1(16+0, RL0, RR0, load_next_key);
+ round1(16+1, RR0, RL0, load_next_key);
+ round1(16+2, RL0, RR0, load_next_key);
+ round1(16+3, RR0, RL0, load_next_key);
+ round1(16+4, RL0, RR0, load_next_key);
+ round1(16+5, RR0, RL0, load_next_key);
+ round1(16+6, RL0, RR0, load_next_key);
+ round1(16+7, RR0, RL0, load_next_key);
+ round1(16+8, RL0, RR0, load_next_key);
+ round1(16+9, RR0, RL0, load_next_key);
+ round1(16+10, RL0, RR0, load_next_key);
+ round1(16+11, RR0, RL0, load_next_key);
+ round1(16+12, RL0, RR0, load_next_key);
+ round1(16+13, RR0, RL0, load_next_key);
+ round1(16+14, RL0, RR0, load_next_key);
+ round1(16+15, RR0, RL0, load_next_key);
+
+ round1(32+0, RR0, RL0, load_next_key);
+ round1(32+1, RL0, RR0, load_next_key);
+ round1(32+2, RR0, RL0, load_next_key);
+ round1(32+3, RL0, RR0, load_next_key);
+ round1(32+4, RR0, RL0, load_next_key);
+ round1(32+5, RL0, RR0, load_next_key);
+ round1(32+6, RR0, RL0, load_next_key);
+ round1(32+7, RL0, RR0, load_next_key);
+ round1(32+8, RR0, RL0, load_next_key);
+ round1(32+9, RL0, RR0, load_next_key);
+ round1(32+10, RR0, RL0, load_next_key);
+ round1(32+11, RL0, RR0, load_next_key);
+ round1(32+12, RR0, RL0, load_next_key);
+ round1(32+13, RL0, RR0, load_next_key);
+ round1(32+14, RR0, RL0, load_next_key);
+ round1(32+15, RL0, RR0, dummy2);
+
+ final_permutation(RR0, RL0);
+
+ popq %rsi /* dst */
+ write_block(%rsi, RR0, RL0);
+
+ popq %r15;
+ popq %r14;
+ popq %r13;
+ popq %r12;
+ popq %rbx;
+
+ RET;
+SYM_FUNC_END(des3_ede_x86_64_crypt_blk)
+
+/***********************************************************************
+ * 3-way 3DES
+ ***********************************************************************/
+#define expand_to_64bits(val, mask) \
+ movl val##d, RT0d; \
+ rorl $4, RT0d; \
+ shlq $32, RT0; \
+ orq RT0, val; \
+ andq mask, val;
+
+#define compress_to_64bits(val) \
+ movq val, RT0; \
+ shrq $32, RT0; \
+ roll $4, RT0d; \
+ orl RT0d, val##d;
+
+#define initial_permutation3(left, right) \
+ do_permutation(left##0d, right##0d, 4, 0x0f0f0f0f); \
+ do_permutation(left##0d, right##0d, 16, 0x0000ffff); \
+ do_permutation(left##1d, right##1d, 4, 0x0f0f0f0f); \
+ do_permutation(left##1d, right##1d, 16, 0x0000ffff); \
+ do_permutation(left##2d, right##2d, 4, 0x0f0f0f0f); \
+ do_permutation(left##2d, right##2d, 16, 0x0000ffff); \
+ \
+ do_permutation(right##0d, left##0d, 2, 0x33333333); \
+ do_permutation(right##0d, left##0d, 8, 0x00ff00ff); \
+ do_permutation(right##1d, left##1d, 2, 0x33333333); \
+ do_permutation(right##1d, left##1d, 8, 0x00ff00ff); \
+ do_permutation(right##2d, left##2d, 2, 0x33333333); \
+ do_permutation(right##2d, left##2d, 8, 0x00ff00ff); \
+ \
+ movabs $0x3f3f3f3f3f3f3f3f, RT3; \
+ \
+ movl left##0d, RW0d; \
+ roll $1, right##0d; \
+ xorl right##0d, RW0d; \
+ andl $0xaaaaaaaa, RW0d; \
+ xorl RW0d, left##0d; \
+ xorl RW0d, right##0d; \
+ roll $1, left##0d; \
+ expand_to_64bits(right##0, RT3); \
+ expand_to_64bits(left##0, RT3); \
+ movl left##1d, RW1d; \
+ roll $1, right##1d; \
+ xorl right##1d, RW1d; \
+ andl $0xaaaaaaaa, RW1d; \
+ xorl RW1d, left##1d; \
+ xorl RW1d, right##1d; \
+ roll $1, left##1d; \
+ expand_to_64bits(right##1, RT3); \
+ expand_to_64bits(left##1, RT3); \
+ movl left##2d, RW2d; \
+ roll $1, right##2d; \
+ xorl right##2d, RW2d; \
+ andl $0xaaaaaaaa, RW2d; \
+ xorl RW2d, left##2d; \
+ xorl RW2d, right##2d; \
+ roll $1, left##2d; \
+ expand_to_64bits(right##2, RT3); \
+ expand_to_64bits(left##2, RT3);
+
+#define final_permutation3(left, right) \
+ compress_to_64bits(right##0); \
+ compress_to_64bits(left##0); \
+ movl right##0d, RW0d; \
+ rorl $1, left##0d; \
+ xorl left##0d, RW0d; \
+ andl $0xaaaaaaaa, RW0d; \
+ xorl RW0d, right##0d; \
+ xorl RW0d, left##0d; \
+ rorl $1, right##0d; \
+ compress_to_64bits(right##1); \
+ compress_to_64bits(left##1); \
+ movl right##1d, RW1d; \
+ rorl $1, left##1d; \
+ xorl left##1d, RW1d; \
+ andl $0xaaaaaaaa, RW1d; \
+ xorl RW1d, right##1d; \
+ xorl RW1d, left##1d; \
+ rorl $1, right##1d; \
+ compress_to_64bits(right##2); \
+ compress_to_64bits(left##2); \
+ movl right##2d, RW2d; \
+ rorl $1, left##2d; \
+ xorl left##2d, RW2d; \
+ andl $0xaaaaaaaa, RW2d; \
+ xorl RW2d, right##2d; \
+ xorl RW2d, left##2d; \
+ rorl $1, right##2d; \
+ \
+ do_permutation(right##0d, left##0d, 8, 0x00ff00ff); \
+ do_permutation(right##0d, left##0d, 2, 0x33333333); \
+ do_permutation(right##1d, left##1d, 8, 0x00ff00ff); \
+ do_permutation(right##1d, left##1d, 2, 0x33333333); \
+ do_permutation(right##2d, left##2d, 8, 0x00ff00ff); \
+ do_permutation(right##2d, left##2d, 2, 0x33333333); \
+ \
+ do_permutation(left##0d, right##0d, 16, 0x0000ffff); \
+ do_permutation(left##0d, right##0d, 4, 0x0f0f0f0f); \
+ do_permutation(left##1d, right##1d, 16, 0x0000ffff); \
+ do_permutation(left##1d, right##1d, 4, 0x0f0f0f0f); \
+ do_permutation(left##2d, right##2d, 16, 0x0000ffff); \
+ do_permutation(left##2d, right##2d, 4, 0x0f0f0f0f);
+
+#define round3(n, from, to, load_next_key, do_movq) \
+ xorq from##0, RW0; \
+ movzbl RW0bl, RT3d; \
+ movzbl RW0bh, RT1d; \
+ shrq $16, RW0; \
+ leaq s8(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##0; \
+ leaq s6(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##0; \
+ movzbl RW0bl, RT3d; \
+ movzbl RW0bh, RT1d; \
+ shrq $16, RW0; \
+ leaq s4(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##0; \
+ leaq s2(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##0; \
+ movzbl RW0bl, RT3d; \
+ movzbl RW0bh, RT1d; \
+ shrl $16, RW0d; \
+ leaq s7(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##0; \
+ leaq s5(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##0; \
+ movzbl RW0bl, RT3d; \
+ movzbl RW0bh, RT1d; \
+ load_next_key(n, RW0); \
+ leaq s3(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##0; \
+ leaq s1(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##0; \
+ xorq from##1, RW1; \
+ movzbl RW1bl, RT3d; \
+ movzbl RW1bh, RT1d; \
+ shrq $16, RW1; \
+ leaq s8(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##1; \
+ leaq s6(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##1; \
+ movzbl RW1bl, RT3d; \
+ movzbl RW1bh, RT1d; \
+ shrq $16, RW1; \
+ leaq s4(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##1; \
+ leaq s2(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##1; \
+ movzbl RW1bl, RT3d; \
+ movzbl RW1bh, RT1d; \
+ shrl $16, RW1d; \
+ leaq s7(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##1; \
+ leaq s5(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##1; \
+ movzbl RW1bl, RT3d; \
+ movzbl RW1bh, RT1d; \
+ do_movq(RW0, RW1); \
+ leaq s3(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##1; \
+ leaq s1(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##1; \
+ xorq from##2, RW2; \
+ movzbl RW2bl, RT3d; \
+ movzbl RW2bh, RT1d; \
+ shrq $16, RW2; \
+ leaq s8(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##2; \
+ leaq s6(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##2; \
+ movzbl RW2bl, RT3d; \
+ movzbl RW2bh, RT1d; \
+ shrq $16, RW2; \
+ leaq s4(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##2; \
+ leaq s2(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##2; \
+ movzbl RW2bl, RT3d; \
+ movzbl RW2bh, RT1d; \
+ shrl $16, RW2d; \
+ leaq s7(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##2; \
+ leaq s5(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##2; \
+ movzbl RW2bl, RT3d; \
+ movzbl RW2bh, RT1d; \
+ do_movq(RW0, RW2); \
+ leaq s3(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##2; \
+ leaq s1(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##2;
+
+#define __movq(src, dst) \
+ movq src, dst;
+
+SYM_FUNC_START(des3_ede_x86_64_crypt_blk_3way)
+ /* input:
+ * %rdi: ctx, round keys
+ * %rsi: dst (3 blocks)
+ * %rdx: src (3 blocks)
+ */
+
+ pushq %rbx;
+ pushq %r12;
+ pushq %r13;
+ pushq %r14;
+ pushq %r15;
+
+ pushq %rsi /* dst */
+
+ /* load input */
+ movl 0 * 4(%rdx), RL0d;
+ movl 1 * 4(%rdx), RR0d;
+ movl 2 * 4(%rdx), RL1d;
+ movl 3 * 4(%rdx), RR1d;
+ movl 4 * 4(%rdx), RL2d;
+ movl 5 * 4(%rdx), RR2d;
+
+ bswapl RL0d;
+ bswapl RR0d;
+ bswapl RL1d;
+ bswapl RR1d;
+ bswapl RL2d;
+ bswapl RR2d;
+
+ initial_permutation3(RL, RR);
+
+ movq 0(CTX), RW0;
+ movq RW0, RW1;
+ movq RW0, RW2;
+
+ round3(0, RR, RL, load_next_key, __movq);
+ round3(1, RL, RR, load_next_key, __movq);
+ round3(2, RR, RL, load_next_key, __movq);
+ round3(3, RL, RR, load_next_key, __movq);
+ round3(4, RR, RL, load_next_key, __movq);
+ round3(5, RL, RR, load_next_key, __movq);
+ round3(6, RR, RL, load_next_key, __movq);
+ round3(7, RL, RR, load_next_key, __movq);
+ round3(8, RR, RL, load_next_key, __movq);
+ round3(9, RL, RR, load_next_key, __movq);
+ round3(10, RR, RL, load_next_key, __movq);
+ round3(11, RL, RR, load_next_key, __movq);
+ round3(12, RR, RL, load_next_key, __movq);
+ round3(13, RL, RR, load_next_key, __movq);
+ round3(14, RR, RL, load_next_key, __movq);
+ round3(15, RL, RR, load_next_key, __movq);
+
+ round3(16+0, RL, RR, load_next_key, __movq);
+ round3(16+1, RR, RL, load_next_key, __movq);
+ round3(16+2, RL, RR, load_next_key, __movq);
+ round3(16+3, RR, RL, load_next_key, __movq);
+ round3(16+4, RL, RR, load_next_key, __movq);
+ round3(16+5, RR, RL, load_next_key, __movq);
+ round3(16+6, RL, RR, load_next_key, __movq);
+ round3(16+7, RR, RL, load_next_key, __movq);
+ round3(16+8, RL, RR, load_next_key, __movq);
+ round3(16+9, RR, RL, load_next_key, __movq);
+ round3(16+10, RL, RR, load_next_key, __movq);
+ round3(16+11, RR, RL, load_next_key, __movq);
+ round3(16+12, RL, RR, load_next_key, __movq);
+ round3(16+13, RR, RL, load_next_key, __movq);
+ round3(16+14, RL, RR, load_next_key, __movq);
+ round3(16+15, RR, RL, load_next_key, __movq);
+
+ round3(32+0, RR, RL, load_next_key, __movq);
+ round3(32+1, RL, RR, load_next_key, __movq);
+ round3(32+2, RR, RL, load_next_key, __movq);
+ round3(32+3, RL, RR, load_next_key, __movq);
+ round3(32+4, RR, RL, load_next_key, __movq);
+ round3(32+5, RL, RR, load_next_key, __movq);
+ round3(32+6, RR, RL, load_next_key, __movq);
+ round3(32+7, RL, RR, load_next_key, __movq);
+ round3(32+8, RR, RL, load_next_key, __movq);
+ round3(32+9, RL, RR, load_next_key, __movq);
+ round3(32+10, RR, RL, load_next_key, __movq);
+ round3(32+11, RL, RR, load_next_key, __movq);
+ round3(32+12, RR, RL, load_next_key, __movq);
+ round3(32+13, RL, RR, load_next_key, __movq);
+ round3(32+14, RR, RL, load_next_key, __movq);
+ round3(32+15, RL, RR, dummy2, dummy2);
+
+ final_permutation3(RR, RL);
+
+ bswapl RR0d;
+ bswapl RL0d;
+ bswapl RR1d;
+ bswapl RL1d;
+ bswapl RR2d;
+ bswapl RL2d;
+
+ popq %rsi /* dst */
+ movl RR0d, 0 * 4(%rsi);
+ movl RL0d, 1 * 4(%rsi);
+ movl RR1d, 2 * 4(%rsi);
+ movl RL1d, 3 * 4(%rsi);
+ movl RR2d, 4 * 4(%rsi);
+ movl RL2d, 5 * 4(%rsi);
+
+ popq %r15;
+ popq %r14;
+ popq %r13;
+ popq %r12;
+ popq %rbx;
+
+ RET;
+SYM_FUNC_END(des3_ede_x86_64_crypt_blk_3way)
+
+.section .rodata, "a", @progbits
+.align 16
+.L_s1:
+ .quad 0x0010100001010400, 0x0000000000000000
+ .quad 0x0000100000010000, 0x0010100001010404
+ .quad 0x0010100001010004, 0x0000100000010404
+ .quad 0x0000000000000004, 0x0000100000010000
+ .quad 0x0000000000000400, 0x0010100001010400
+ .quad 0x0010100001010404, 0x0000000000000400
+ .quad 0x0010000001000404, 0x0010100001010004
+ .quad 0x0010000001000000, 0x0000000000000004
+ .quad 0x0000000000000404, 0x0010000001000400
+ .quad 0x0010000001000400, 0x0000100000010400
+ .quad 0x0000100000010400, 0x0010100001010000
+ .quad 0x0010100001010000, 0x0010000001000404
+ .quad 0x0000100000010004, 0x0010000001000004
+ .quad 0x0010000001000004, 0x0000100000010004
+ .quad 0x0000000000000000, 0x0000000000000404
+ .quad 0x0000100000010404, 0x0010000001000000
+ .quad 0x0000100000010000, 0x0010100001010404
+ .quad 0x0000000000000004, 0x0010100001010000
+ .quad 0x0010100001010400, 0x0010000001000000
+ .quad 0x0010000001000000, 0x0000000000000400
+ .quad 0x0010100001010004, 0x0000100000010000
+ .quad 0x0000100000010400, 0x0010000001000004
+ .quad 0x0000000000000400, 0x0000000000000004
+ .quad 0x0010000001000404, 0x0000100000010404
+ .quad 0x0010100001010404, 0x0000100000010004
+ .quad 0x0010100001010000, 0x0010000001000404
+ .quad 0x0010000001000004, 0x0000000000000404
+ .quad 0x0000100000010404, 0x0010100001010400
+ .quad 0x0000000000000404, 0x0010000001000400
+ .quad 0x0010000001000400, 0x0000000000000000
+ .quad 0x0000100000010004, 0x0000100000010400
+ .quad 0x0000000000000000, 0x0010100001010004
+.L_s2:
+ .quad 0x0801080200100020, 0x0800080000000000
+ .quad 0x0000080000000000, 0x0001080200100020
+ .quad 0x0001000000100000, 0x0000000200000020
+ .quad 0x0801000200100020, 0x0800080200000020
+ .quad 0x0800000200000020, 0x0801080200100020
+ .quad 0x0801080000100000, 0x0800000000000000
+ .quad 0x0800080000000000, 0x0001000000100000
+ .quad 0x0000000200000020, 0x0801000200100020
+ .quad 0x0001080000100000, 0x0001000200100020
+ .quad 0x0800080200000020, 0x0000000000000000
+ .quad 0x0800000000000000, 0x0000080000000000
+ .quad 0x0001080200100020, 0x0801000000100000
+ .quad 0x0001000200100020, 0x0800000200000020
+ .quad 0x0000000000000000, 0x0001080000100000
+ .quad 0x0000080200000020, 0x0801080000100000
+ .quad 0x0801000000100000, 0x0000080200000020
+ .quad 0x0000000000000000, 0x0001080200100020
+ .quad 0x0801000200100020, 0x0001000000100000
+ .quad 0x0800080200000020, 0x0801000000100000
+ .quad 0x0801080000100000, 0x0000080000000000
+ .quad 0x0801000000100000, 0x0800080000000000
+ .quad 0x0000000200000020, 0x0801080200100020
+ .quad 0x0001080200100020, 0x0000000200000020
+ .quad 0x0000080000000000, 0x0800000000000000
+ .quad 0x0000080200000020, 0x0801080000100000
+ .quad 0x0001000000100000, 0x0800000200000020
+ .quad 0x0001000200100020, 0x0800080200000020
+ .quad 0x0800000200000020, 0x0001000200100020
+ .quad 0x0001080000100000, 0x0000000000000000
+ .quad 0x0800080000000000, 0x0000080200000020
+ .quad 0x0800000000000000, 0x0801000200100020
+ .quad 0x0801080200100020, 0x0001080000100000
+.L_s3:
+ .quad 0x0000002000000208, 0x0000202008020200
+ .quad 0x0000000000000000, 0x0000200008020008
+ .quad 0x0000002008000200, 0x0000000000000000
+ .quad 0x0000202000020208, 0x0000002008000200
+ .quad 0x0000200000020008, 0x0000000008000008
+ .quad 0x0000000008000008, 0x0000200000020000
+ .quad 0x0000202008020208, 0x0000200000020008
+ .quad 0x0000200008020000, 0x0000002000000208
+ .quad 0x0000000008000000, 0x0000000000000008
+ .quad 0x0000202008020200, 0x0000002000000200
+ .quad 0x0000202000020200, 0x0000200008020000
+ .quad 0x0000200008020008, 0x0000202000020208
+ .quad 0x0000002008000208, 0x0000202000020200
+ .quad 0x0000200000020000, 0x0000002008000208
+ .quad 0x0000000000000008, 0x0000202008020208
+ .quad 0x0000002000000200, 0x0000000008000000
+ .quad 0x0000202008020200, 0x0000000008000000
+ .quad 0x0000200000020008, 0x0000002000000208
+ .quad 0x0000200000020000, 0x0000202008020200
+ .quad 0x0000002008000200, 0x0000000000000000
+ .quad 0x0000002000000200, 0x0000200000020008
+ .quad 0x0000202008020208, 0x0000002008000200
+ .quad 0x0000000008000008, 0x0000002000000200
+ .quad 0x0000000000000000, 0x0000200008020008
+ .quad 0x0000002008000208, 0x0000200000020000
+ .quad 0x0000000008000000, 0x0000202008020208
+ .quad 0x0000000000000008, 0x0000202000020208
+ .quad 0x0000202000020200, 0x0000000008000008
+ .quad 0x0000200008020000, 0x0000002008000208
+ .quad 0x0000002000000208, 0x0000200008020000
+ .quad 0x0000202000020208, 0x0000000000000008
+ .quad 0x0000200008020008, 0x0000202000020200
+.L_s4:
+ .quad 0x1008020000002001, 0x1000020800002001
+ .quad 0x1000020800002001, 0x0000000800000000
+ .quad 0x0008020800002000, 0x1008000800000001
+ .quad 0x1008000000000001, 0x1000020000002001
+ .quad 0x0000000000000000, 0x0008020000002000
+ .quad 0x0008020000002000, 0x1008020800002001
+ .quad 0x1000000800000001, 0x0000000000000000
+ .quad 0x0008000800000000, 0x1008000000000001
+ .quad 0x1000000000000001, 0x0000020000002000
+ .quad 0x0008000000000000, 0x1008020000002001
+ .quad 0x0000000800000000, 0x0008000000000000
+ .quad 0x1000020000002001, 0x0000020800002000
+ .quad 0x1008000800000001, 0x1000000000000001
+ .quad 0x0000020800002000, 0x0008000800000000
+ .quad 0x0000020000002000, 0x0008020800002000
+ .quad 0x1008020800002001, 0x1000000800000001
+ .quad 0x0008000800000000, 0x1008000000000001
+ .quad 0x0008020000002000, 0x1008020800002001
+ .quad 0x1000000800000001, 0x0000000000000000
+ .quad 0x0000000000000000, 0x0008020000002000
+ .quad 0x0000020800002000, 0x0008000800000000
+ .quad 0x1008000800000001, 0x1000000000000001
+ .quad 0x1008020000002001, 0x1000020800002001
+ .quad 0x1000020800002001, 0x0000000800000000
+ .quad 0x1008020800002001, 0x1000000800000001
+ .quad 0x1000000000000001, 0x0000020000002000
+ .quad 0x1008000000000001, 0x1000020000002001
+ .quad 0x0008020800002000, 0x1008000800000001
+ .quad 0x1000020000002001, 0x0000020800002000
+ .quad 0x0008000000000000, 0x1008020000002001
+ .quad 0x0000000800000000, 0x0008000000000000
+ .quad 0x0000020000002000, 0x0008020800002000
+.L_s5:
+ .quad 0x0000001000000100, 0x0020001002080100
+ .quad 0x0020000002080000, 0x0420001002000100
+ .quad 0x0000000000080000, 0x0000001000000100
+ .quad 0x0400000000000000, 0x0020000002080000
+ .quad 0x0400001000080100, 0x0000000000080000
+ .quad 0x0020001002000100, 0x0400001000080100
+ .quad 0x0420001002000100, 0x0420000002080000
+ .quad 0x0000001000080100, 0x0400000000000000
+ .quad 0x0020000002000000, 0x0400000000080000
+ .quad 0x0400000000080000, 0x0000000000000000
+ .quad 0x0400001000000100, 0x0420001002080100
+ .quad 0x0420001002080100, 0x0020001002000100
+ .quad 0x0420000002080000, 0x0400001000000100
+ .quad 0x0000000000000000, 0x0420000002000000
+ .quad 0x0020001002080100, 0x0020000002000000
+ .quad 0x0420000002000000, 0x0000001000080100
+ .quad 0x0000000000080000, 0x0420001002000100
+ .quad 0x0000001000000100, 0x0020000002000000
+ .quad 0x0400000000000000, 0x0020000002080000
+ .quad 0x0420001002000100, 0x0400001000080100
+ .quad 0x0020001002000100, 0x0400000000000000
+ .quad 0x0420000002080000, 0x0020001002080100
+ .quad 0x0400001000080100, 0x0000001000000100
+ .quad 0x0020000002000000, 0x0420000002080000
+ .quad 0x0420001002080100, 0x0000001000080100
+ .quad 0x0420000002000000, 0x0420001002080100
+ .quad 0x0020000002080000, 0x0000000000000000
+ .quad 0x0400000000080000, 0x0420000002000000
+ .quad 0x0000001000080100, 0x0020001002000100
+ .quad 0x0400001000000100, 0x0000000000080000
+ .quad 0x0000000000000000, 0x0400000000080000
+ .quad 0x0020001002080100, 0x0400001000000100
+.L_s6:
+ .quad 0x0200000120000010, 0x0204000020000000
+ .quad 0x0000040000000000, 0x0204040120000010
+ .quad 0x0204000020000000, 0x0000000100000010
+ .quad 0x0204040120000010, 0x0004000000000000
+ .quad 0x0200040020000000, 0x0004040100000010
+ .quad 0x0004000000000000, 0x0200000120000010
+ .quad 0x0004000100000010, 0x0200040020000000
+ .quad 0x0200000020000000, 0x0000040100000010
+ .quad 0x0000000000000000, 0x0004000100000010
+ .quad 0x0200040120000010, 0x0000040000000000
+ .quad 0x0004040000000000, 0x0200040120000010
+ .quad 0x0000000100000010, 0x0204000120000010
+ .quad 0x0204000120000010, 0x0000000000000000
+ .quad 0x0004040100000010, 0x0204040020000000
+ .quad 0x0000040100000010, 0x0004040000000000
+ .quad 0x0204040020000000, 0x0200000020000000
+ .quad 0x0200040020000000, 0x0000000100000010
+ .quad 0x0204000120000010, 0x0004040000000000
+ .quad 0x0204040120000010, 0x0004000000000000
+ .quad 0x0000040100000010, 0x0200000120000010
+ .quad 0x0004000000000000, 0x0200040020000000
+ .quad 0x0200000020000000, 0x0000040100000010
+ .quad 0x0200000120000010, 0x0204040120000010
+ .quad 0x0004040000000000, 0x0204000020000000
+ .quad 0x0004040100000010, 0x0204040020000000
+ .quad 0x0000000000000000, 0x0204000120000010
+ .quad 0x0000000100000010, 0x0000040000000000
+ .quad 0x0204000020000000, 0x0004040100000010
+ .quad 0x0000040000000000, 0x0004000100000010
+ .quad 0x0200040120000010, 0x0000000000000000
+ .quad 0x0204040020000000, 0x0200000020000000
+ .quad 0x0004000100000010, 0x0200040120000010
+.L_s7:
+ .quad 0x0002000000200000, 0x2002000004200002
+ .quad 0x2000000004000802, 0x0000000000000000
+ .quad 0x0000000000000800, 0x2000000004000802
+ .quad 0x2002000000200802, 0x0002000004200800
+ .quad 0x2002000004200802, 0x0002000000200000
+ .quad 0x0000000000000000, 0x2000000004000002
+ .quad 0x2000000000000002, 0x0000000004000000
+ .quad 0x2002000004200002, 0x2000000000000802
+ .quad 0x0000000004000800, 0x2002000000200802
+ .quad 0x2002000000200002, 0x0000000004000800
+ .quad 0x2000000004000002, 0x0002000004200000
+ .quad 0x0002000004200800, 0x2002000000200002
+ .quad 0x0002000004200000, 0x0000000000000800
+ .quad 0x2000000000000802, 0x2002000004200802
+ .quad 0x0002000000200800, 0x2000000000000002
+ .quad 0x0000000004000000, 0x0002000000200800
+ .quad 0x0000000004000000, 0x0002000000200800
+ .quad 0x0002000000200000, 0x2000000004000802
+ .quad 0x2000000004000802, 0x2002000004200002
+ .quad 0x2002000004200002, 0x2000000000000002
+ .quad 0x2002000000200002, 0x0000000004000000
+ .quad 0x0000000004000800, 0x0002000000200000
+ .quad 0x0002000004200800, 0x2000000000000802
+ .quad 0x2002000000200802, 0x0002000004200800
+ .quad 0x2000000000000802, 0x2000000004000002
+ .quad 0x2002000004200802, 0x0002000004200000
+ .quad 0x0002000000200800, 0x0000000000000000
+ .quad 0x2000000000000002, 0x2002000004200802
+ .quad 0x0000000000000000, 0x2002000000200802
+ .quad 0x0002000004200000, 0x0000000000000800
+ .quad 0x2000000004000002, 0x0000000004000800
+ .quad 0x0000000000000800, 0x2002000000200002
+.L_s8:
+ .quad 0x0100010410001000, 0x0000010000001000
+ .quad 0x0000000000040000, 0x0100010410041000
+ .quad 0x0100000010000000, 0x0100010410001000
+ .quad 0x0000000400000000, 0x0100000010000000
+ .quad 0x0000000400040000, 0x0100000010040000
+ .quad 0x0100010410041000, 0x0000010000041000
+ .quad 0x0100010010041000, 0x0000010400041000
+ .quad 0x0000010000001000, 0x0000000400000000
+ .quad 0x0100000010040000, 0x0100000410000000
+ .quad 0x0100010010001000, 0x0000010400001000
+ .quad 0x0000010000041000, 0x0000000400040000
+ .quad 0x0100000410040000, 0x0100010010041000
+ .quad 0x0000010400001000, 0x0000000000000000
+ .quad 0x0000000000000000, 0x0100000410040000
+ .quad 0x0100000410000000, 0x0100010010001000
+ .quad 0x0000010400041000, 0x0000000000040000
+ .quad 0x0000010400041000, 0x0000000000040000
+ .quad 0x0100010010041000, 0x0000010000001000
+ .quad 0x0000000400000000, 0x0100000410040000
+ .quad 0x0000010000001000, 0x0000010400041000
+ .quad 0x0100010010001000, 0x0000000400000000
+ .quad 0x0100000410000000, 0x0100000010040000
+ .quad 0x0100000410040000, 0x0100000010000000
+ .quad 0x0000000000040000, 0x0100010410001000
+ .quad 0x0000000000000000, 0x0100010410041000
+ .quad 0x0000000400040000, 0x0100000410000000
+ .quad 0x0100000010040000, 0x0100010010001000
+ .quad 0x0100010410001000, 0x0000000000000000
+ .quad 0x0100010410041000, 0x0000010000041000
+ .quad 0x0000010000041000, 0x0000010400001000
+ .quad 0x0000010400001000, 0x0000000400040000
+ .quad 0x0100000010000000, 0x0100010010041000
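The do_permutation macro near the top of this file is the standard "delta swap" used to compose DES's initial and final bit permutations from a handful of masked exchanges. A C restatement (illustrative only, not part of the patch) may help when following those steps:

#include <stdint.h>

/* Exchange the bits of a and b selected by mask after shifting a right by
 * "shift" -- exactly what do_permutation() does, with RT0 as the temporary. */
static inline void delta_swap(uint32_t *a, uint32_t *b, int shift, uint32_t mask)
{
	uint32_t t = ((*a >> shift) ^ *b) & mask;

	*b ^= t;
	*a ^= t << shift;
}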
diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c
new file mode 100644
index 000000000000..34600f90d8a6
--- /dev/null
+++ b/arch/x86/crypto/des3_ede_glue.c
@@ -0,0 +1,391 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Glue Code for assembler optimized version of 3DES
+ *
+ * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
+ * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/des.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+struct des3_ede_x86_ctx {
+ struct des3_ede_ctx enc;
+ struct des3_ede_ctx dec;
+};
+
+/* regular block cipher functions */
+asmlinkage void des3_ede_x86_64_crypt_blk(const u32 *expkey, u8 *dst,
+ const u8 *src);
+
+/* 3-way parallel cipher functions */
+asmlinkage void des3_ede_x86_64_crypt_blk_3way(const u32 *expkey, u8 *dst,
+ const u8 *src);
+
+static inline void des3_ede_enc_blk(struct des3_ede_x86_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ u32 *enc_ctx = ctx->enc.expkey;
+
+ des3_ede_x86_64_crypt_blk(enc_ctx, dst, src);
+}
+
+static inline void des3_ede_dec_blk(struct des3_ede_x86_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ u32 *dec_ctx = ctx->dec.expkey;
+
+ des3_ede_x86_64_crypt_blk(dec_ctx, dst, src);
+}
+
+static inline void des3_ede_dec_blk_3way(struct des3_ede_x86_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ u32 *dec_ctx = ctx->dec.expkey;
+
+ des3_ede_x86_64_crypt_blk_3way(dec_ctx, dst, src);
+}
+
+static void des3_ede_x86_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ des3_ede_enc_blk(crypto_tfm_ctx(tfm), dst, src);
+}
+
+static void des3_ede_x86_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ des3_ede_dec_blk(crypto_tfm_ctx(tfm), dst, src);
+}
+
+static int ecb_crypt(struct skcipher_request *req, const u32 *expkey)
+{
+ const unsigned int bsize = DES3_EDE_BLOCK_SIZE;
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes)) {
+ const u8 *wsrc = walk.src.virt.addr;
+ u8 *wdst = walk.dst.virt.addr;
+
+		/* Process three-block batches */
+ if (nbytes >= bsize * 3) {
+ do {
+ des3_ede_x86_64_crypt_blk_3way(expkey, wdst,
+ wsrc);
+
+ wsrc += bsize * 3;
+ wdst += bsize * 3;
+ nbytes -= bsize * 3;
+ } while (nbytes >= bsize * 3);
+
+ if (nbytes < bsize)
+ goto done;
+ }
+
+ /* Handle leftovers */
+ do {
+ des3_ede_x86_64_crypt_blk(expkey, wdst, wsrc);
+
+ wsrc += bsize;
+ wdst += bsize;
+ nbytes -= bsize;
+ } while (nbytes >= bsize);
+
+done:
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return ecb_crypt(req, ctx->enc.expkey);
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return ecb_crypt(req, ctx->dec.expkey);
+}
+
+static unsigned int __cbc_encrypt(struct des3_ede_x86_ctx *ctx,
+ struct skcipher_walk *walk)
+{
+ unsigned int bsize = DES3_EDE_BLOCK_SIZE;
+ unsigned int nbytes = walk->nbytes;
+ u64 *src = (u64 *)walk->src.virt.addr;
+ u64 *dst = (u64 *)walk->dst.virt.addr;
+ u64 *iv = (u64 *)walk->iv;
+
+ do {
+ *dst = *src ^ *iv;
+ des3_ede_enc_blk(ctx, (u8 *)dst, (u8 *)dst);
+ iv = dst;
+
+ src += 1;
+ dst += 1;
+ nbytes -= bsize;
+ } while (nbytes >= bsize);
+
+ *(u64 *)walk->iv = *iv;
+ return nbytes;
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while (walk.nbytes) {
+ nbytes = __cbc_encrypt(ctx, &walk);
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static unsigned int __cbc_decrypt(struct des3_ede_x86_ctx *ctx,
+ struct skcipher_walk *walk)
+{
+ unsigned int bsize = DES3_EDE_BLOCK_SIZE;
+ unsigned int nbytes = walk->nbytes;
+ u64 *src = (u64 *)walk->src.virt.addr;
+ u64 *dst = (u64 *)walk->dst.virt.addr;
+ u64 ivs[3 - 1];
+ u64 last_iv;
+
+ /* Start of the last block. */
+ src += nbytes / bsize - 1;
+ dst += nbytes / bsize - 1;
+
+ last_iv = *src;
+
+	/* Process three-block batches */
+ if (nbytes >= bsize * 3) {
+ do {
+ nbytes -= bsize * 3 - bsize;
+ src -= 3 - 1;
+ dst -= 3 - 1;
+
+ ivs[0] = src[0];
+ ivs[1] = src[1];
+
+ des3_ede_dec_blk_3way(ctx, (u8 *)dst, (u8 *)src);
+
+ dst[1] ^= ivs[0];
+ dst[2] ^= ivs[1];
+
+ nbytes -= bsize;
+ if (nbytes < bsize)
+ goto done;
+
+ *dst ^= *(src - 1);
+ src -= 1;
+ dst -= 1;
+ } while (nbytes >= bsize * 3);
+ }
+
+ /* Handle leftovers */
+ for (;;) {
+ des3_ede_dec_blk(ctx, (u8 *)dst, (u8 *)src);
+
+ nbytes -= bsize;
+ if (nbytes < bsize)
+ break;
+
+ *dst ^= *(src - 1);
+ src -= 1;
+ dst -= 1;
+ }
+
+done:
+ *dst ^= *(u64 *)walk->iv;
+ *(u64 *)walk->iv = last_iv;
+
+ return nbytes;
+}
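__cbc_decrypt above walks from the last block toward the first so that, when decrypting in place, each block's predecessor is still intact ciphertext at the moment it is needed as the XOR value, and the 3-way batch only has to stash two saved IVs. For comparison, a plain forward, one-block-at-a-time equivalent is sketched below; the function, its nblocks parameter and the temporary copies are assumptions of this illustration, not part of the patch.

static void cbc_decrypt_forward_sketch(struct des3_ede_x86_ctx *ctx,
				       u8 *buf, unsigned int nblocks, u8 *iv)
{
	u8 prev[DES3_EDE_BLOCK_SIZE], save[DES3_EDE_BLOCK_SIZE];
	unsigned int i;

	memcpy(prev, iv, DES3_EDE_BLOCK_SIZE);
	for (i = 0; i < nblocks; i++) {
		u8 *blk = buf + i * DES3_EDE_BLOCK_SIZE;

		memcpy(save, blk, DES3_EDE_BLOCK_SIZE);	/* next block's IV */
		des3_ede_dec_blk(ctx, blk, blk);
		crypto_xor(blk, prev, DES3_EDE_BLOCK_SIZE);
		memcpy(prev, save, DES3_EDE_BLOCK_SIZE);
	}
	memcpy(iv, prev, DES3_EDE_BLOCK_SIZE);		/* chain for next call */
}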
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while (walk.nbytes) {
+ nbytes = __cbc_decrypt(ctx, &walk);
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static int des3_ede_x86_setkey(struct crypto_tfm *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ struct des3_ede_x86_ctx *ctx = crypto_tfm_ctx(tfm);
+ u32 i, j, tmp;
+ int err;
+
+ err = des3_ede_expand_key(&ctx->enc, key, keylen);
+ if (err == -ENOKEY) {
+ if (crypto_tfm_get_flags(tfm) & CRYPTO_TFM_REQ_FORBID_WEAK_KEYS)
+ err = -EINVAL;
+ else
+ err = 0;
+ }
+
+ if (err) {
+ memset(ctx, 0, sizeof(*ctx));
+ return err;
+ }
+
+ /* Fix encryption context for this implementation and form decryption
+ * context. */
+ j = DES3_EDE_EXPKEY_WORDS - 2;
+ for (i = 0; i < DES3_EDE_EXPKEY_WORDS; i += 2, j -= 2) {
+ tmp = ror32(ctx->enc.expkey[i + 1], 4);
+ ctx->enc.expkey[i + 1] = tmp;
+
+ ctx->dec.expkey[j + 0] = ctx->enc.expkey[i + 0];
+ ctx->dec.expkey[j + 1] = tmp;
+ }
+
+ return 0;
+}
+
+static int des3_ede_x86_setkey_skcipher(struct crypto_skcipher *tfm,
+ const u8 *key,
+ unsigned int keylen)
+{
+ return des3_ede_x86_setkey(&tfm->base, key, keylen);
+}
+
+static struct crypto_alg des3_ede_cipher = {
+ .cra_name = "des3_ede",
+ .cra_driver_name = "des3_ede-asm",
+ .cra_priority = 200,
+ .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
+ .cra_blocksize = DES3_EDE_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct des3_ede_x86_ctx),
+ .cra_module = THIS_MODULE,
+ .cra_u = {
+ .cipher = {
+ .cia_min_keysize = DES3_EDE_KEY_SIZE,
+ .cia_max_keysize = DES3_EDE_KEY_SIZE,
+ .cia_setkey = des3_ede_x86_setkey,
+ .cia_encrypt = des3_ede_x86_encrypt,
+ .cia_decrypt = des3_ede_x86_decrypt,
+ }
+ }
+};
+
+static struct skcipher_alg des3_ede_skciphers[] = {
+ {
+ .base.cra_name = "ecb(des3_ede)",
+ .base.cra_driver_name = "ecb-des3_ede-asm",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = DES3_EDE_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct des3_ede_x86_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = DES3_EDE_KEY_SIZE,
+ .max_keysize = DES3_EDE_KEY_SIZE,
+ .setkey = des3_ede_x86_setkey_skcipher,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ }, {
+ .base.cra_name = "cbc(des3_ede)",
+ .base.cra_driver_name = "cbc-des3_ede-asm",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = DES3_EDE_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct des3_ede_x86_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = DES3_EDE_KEY_SIZE,
+ .max_keysize = DES3_EDE_KEY_SIZE,
+ .ivsize = DES3_EDE_BLOCK_SIZE,
+ .setkey = des3_ede_x86_setkey_skcipher,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ }
+};
+
+static bool is_blacklisted_cpu(void)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return false;
+
+ if (boot_cpu_data.x86 == 0x0f) {
+ /*
+		 * On Pentium 4, des3_ede-x86_64 is slower than the generic C
+		 * implementation because it uses 64-bit rotates (which are
+		 * really slow on P4). Therefore blacklist P4s.
+ */
+ return true;
+ }
+
+ return false;
+}
+
+static int force;
+module_param(force, int, 0);
+MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist");
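As a usage note (an assumption of this write-up rather than something stated in the patch): on a blacklisted CPU the init routine below returns -ENODEV, and loading with the parameter set, e.g. modprobe des3_ede-x86_64 force=1 (assuming the module is built under that name), skips the check.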
+
+static int __init des3_ede_x86_init(void)
+{
+ int err;
+
+ if (!force && is_blacklisted_cpu()) {
+ pr_info("des3_ede-x86_64: performance on this CPU would be suboptimal: disabling des3_ede-x86_64.\n");
+ return -ENODEV;
+ }
+
+ err = crypto_register_alg(&des3_ede_cipher);
+ if (err)
+ return err;
+
+ err = crypto_register_skciphers(des3_ede_skciphers,
+ ARRAY_SIZE(des3_ede_skciphers));
+ if (err)
+ crypto_unregister_alg(&des3_ede_cipher);
+
+ return err;
+}
+
+static void __exit des3_ede_x86_fini(void)
+{
+ crypto_unregister_alg(&des3_ede_cipher);
+ crypto_unregister_skciphers(des3_ede_skciphers,
+ ARRAY_SIZE(des3_ede_skciphers));
+}
+
+module_init(des3_ede_x86_init);
+module_exit(des3_ede_x86_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Triple DES EDE Cipher Algorithm, asm optimized");
+MODULE_ALIAS_CRYPTO("des3_ede");
+MODULE_ALIAS_CRYPTO("des3_ede-asm");
+MODULE_AUTHOR("Jussi Kivilinna <jussi.kivilinna@iki.fi>");
diff --git a/arch/x86/crypto/ecb_cbc_helpers.h b/arch/x86/crypto/ecb_cbc_helpers.h
new file mode 100644
index 000000000000..11955bd01af1
--- /dev/null
+++ b/arch/x86/crypto/ecb_cbc_helpers.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _CRYPTO_ECB_CBC_HELPER_H
+#define _CRYPTO_ECB_CBC_HELPER_H
+
+#include <crypto/internal/skcipher.h>
+#include <asm/fpu/api.h>
+
+/*
+ * Mode helpers to instantiate parameterized skcipher ECB/CBC modes without
+ * having to rely on indirect calls and retpolines.
+ */
+
+#define ECB_WALK_START(req, bsize, fpu_blocks) do { \
+ void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req)); \
+ const int __fpu_blocks = (fpu_blocks); \
+ const int __bsize = (bsize); \
+ struct skcipher_walk walk; \
+ int err = skcipher_walk_virt(&walk, (req), false); \
+ while (walk.nbytes > 0) { \
+ unsigned int nbytes = walk.nbytes; \
+ bool do_fpu = __fpu_blocks != -1 && \
+ nbytes >= __fpu_blocks * __bsize; \
+ const u8 *src = walk.src.virt.addr; \
+ u8 *dst = walk.dst.virt.addr; \
+ u8 __maybe_unused buf[(bsize)]; \
+ if (do_fpu) kernel_fpu_begin()
+
+#define CBC_WALK_START(req, bsize, fpu_blocks) \
+ ECB_WALK_START(req, bsize, fpu_blocks)
+
+#define ECB_WALK_ADVANCE(blocks) do { \
+ dst += (blocks) * __bsize; \
+ src += (blocks) * __bsize; \
+ nbytes -= (blocks) * __bsize; \
+} while (0)
+
+#define ECB_BLOCK(blocks, func) do { \
+ const int __blocks = (blocks); \
+ if (do_fpu && __blocks < __fpu_blocks) { \
+ kernel_fpu_end(); \
+ do_fpu = false; \
+ } \
+ while (nbytes >= __blocks * __bsize) { \
+ (func)(ctx, dst, src); \
+ ECB_WALK_ADVANCE(blocks); \
+ } \
+} while (0)
+
+#define CBC_ENC_BLOCK(func) do { \
+ const u8 *__iv = walk.iv; \
+ while (nbytes >= __bsize) { \
+ crypto_xor_cpy(dst, src, __iv, __bsize); \
+ (func)(ctx, dst, dst); \
+ __iv = dst; \
+ ECB_WALK_ADVANCE(1); \
+ } \
+ memcpy(walk.iv, __iv, __bsize); \
+} while (0)
+
+#define CBC_DEC_BLOCK(blocks, func) do { \
+ const int __blocks = (blocks); \
+ if (do_fpu && __blocks < __fpu_blocks) { \
+ kernel_fpu_end(); \
+ do_fpu = false; \
+ } \
+ while (nbytes >= __blocks * __bsize) { \
+ const u8 *__iv = src + ((blocks) - 1) * __bsize; \
+ if (dst == src) \
+ __iv = memcpy(buf, __iv, __bsize); \
+ (func)(ctx, dst, src); \
+ crypto_xor(dst, walk.iv, __bsize); \
+ memcpy(walk.iv, __iv, __bsize); \
+ ECB_WALK_ADVANCE(blocks); \
+ } \
+} while (0)
+
+#define ECB_WALK_END() \
+ if (do_fpu) kernel_fpu_end(); \
+ err = skcipher_walk_done(&walk, nbytes); \
+ } \
+ return err; \
+} while (0)
+
+#define CBC_WALK_END() ECB_WALK_END()
+
+#endif
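To make the control flow of these open-coded macros concrete, a glue routine built on them typically looks like the sketch below. The cipher name and the two primitives are placeholders invented for illustration; only the macro usage pattern comes from the header above, and the usual skcipher glue includes are assumed.

/* Hypothetical 16-byte block cipher with an 8-block SIMD path. */
asmlinkage void some_cipher_ecb_enc_8way(const void *ctx, u8 *dst, const u8 *src);
asmlinkage void some_cipher_ecb_enc_1way(const void *ctx, u8 *dst, const u8 *src);

static int some_cipher_ecb_encrypt(struct skcipher_request *req)
{
	ECB_WALK_START(req, 16, 8);		/* kernel_fpu_begin() if >= 8 blocks */
	ECB_BLOCK(8, some_cipher_ecb_enc_8way);	/* drain 8-block batches */
	ECB_BLOCK(1, some_cipher_ecb_enc_1way);	/* leftovers, FPU dropped first */
	ECB_WALK_END();
}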
diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c
deleted file mode 100644
index daef6cd2b45d..000000000000
--- a/arch/x86/crypto/fpu.c
+++ /dev/null
@@ -1,166 +0,0 @@
-/*
- * FPU: Wrapper for blkcipher touching fpu
- *
- * Copyright (c) Intel Corp.
- * Author: Huang Ying <ying.huang@intel.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- */
-
-#include <crypto/algapi.h>
-#include <linux/err.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <asm/i387.h>
-
-struct crypto_fpu_ctx {
- struct crypto_blkcipher *child;
-};
-
-static int crypto_fpu_setkey(struct crypto_tfm *parent, const u8 *key,
- unsigned int keylen)
-{
- struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(parent);
- struct crypto_blkcipher *child = ctx->child;
- int err;
-
- crypto_blkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
- crypto_blkcipher_set_flags(child, crypto_tfm_get_flags(parent) &
- CRYPTO_TFM_REQ_MASK);
- err = crypto_blkcipher_setkey(child, key, keylen);
- crypto_tfm_set_flags(parent, crypto_blkcipher_get_flags(child) &
- CRYPTO_TFM_RES_MASK);
- return err;
-}
-
-static int crypto_fpu_encrypt(struct blkcipher_desc *desc_in,
- struct scatterlist *dst, struct scatterlist *src,
- unsigned int nbytes)
-{
- int err;
- struct crypto_fpu_ctx *ctx = crypto_blkcipher_ctx(desc_in->tfm);
- struct crypto_blkcipher *child = ctx->child;
- struct blkcipher_desc desc = {
- .tfm = child,
- .info = desc_in->info,
- .flags = desc_in->flags & ~CRYPTO_TFM_REQ_MAY_SLEEP,
- };
-
- kernel_fpu_begin();
- err = crypto_blkcipher_crt(desc.tfm)->encrypt(&desc, dst, src, nbytes);
- kernel_fpu_end();
- return err;
-}
-
-static int crypto_fpu_decrypt(struct blkcipher_desc *desc_in,
- struct scatterlist *dst, struct scatterlist *src,
- unsigned int nbytes)
-{
- int err;
- struct crypto_fpu_ctx *ctx = crypto_blkcipher_ctx(desc_in->tfm);
- struct crypto_blkcipher *child = ctx->child;
- struct blkcipher_desc desc = {
- .tfm = child,
- .info = desc_in->info,
- .flags = desc_in->flags & ~CRYPTO_TFM_REQ_MAY_SLEEP,
- };
-
- kernel_fpu_begin();
- err = crypto_blkcipher_crt(desc.tfm)->decrypt(&desc, dst, src, nbytes);
- kernel_fpu_end();
- return err;
-}
-
-static int crypto_fpu_init_tfm(struct crypto_tfm *tfm)
-{
- struct crypto_instance *inst = crypto_tfm_alg_instance(tfm);
- struct crypto_spawn *spawn = crypto_instance_ctx(inst);
- struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(tfm);
- struct crypto_blkcipher *cipher;
-
- cipher = crypto_spawn_blkcipher(spawn);
- if (IS_ERR(cipher))
- return PTR_ERR(cipher);
-
- ctx->child = cipher;
- return 0;
-}
-
-static void crypto_fpu_exit_tfm(struct crypto_tfm *tfm)
-{
- struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(tfm);
- crypto_free_blkcipher(ctx->child);
-}
-
-static struct crypto_instance *crypto_fpu_alloc(struct rtattr **tb)
-{
- struct crypto_instance *inst;
- struct crypto_alg *alg;
- int err;
-
- err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_BLKCIPHER);
- if (err)
- return ERR_PTR(err);
-
- alg = crypto_get_attr_alg(tb, CRYPTO_ALG_TYPE_BLKCIPHER,
- CRYPTO_ALG_TYPE_MASK);
- if (IS_ERR(alg))
- return ERR_CAST(alg);
-
- inst = crypto_alloc_instance("fpu", alg);
- if (IS_ERR(inst))
- goto out_put_alg;
-
- inst->alg.cra_flags = alg->cra_flags;
- inst->alg.cra_priority = alg->cra_priority;
- inst->alg.cra_blocksize = alg->cra_blocksize;
- inst->alg.cra_alignmask = alg->cra_alignmask;
- inst->alg.cra_type = alg->cra_type;
- inst->alg.cra_blkcipher.ivsize = alg->cra_blkcipher.ivsize;
- inst->alg.cra_blkcipher.min_keysize = alg->cra_blkcipher.min_keysize;
- inst->alg.cra_blkcipher.max_keysize = alg->cra_blkcipher.max_keysize;
- inst->alg.cra_ctxsize = sizeof(struct crypto_fpu_ctx);
- inst->alg.cra_init = crypto_fpu_init_tfm;
- inst->alg.cra_exit = crypto_fpu_exit_tfm;
- inst->alg.cra_blkcipher.setkey = crypto_fpu_setkey;
- inst->alg.cra_blkcipher.encrypt = crypto_fpu_encrypt;
- inst->alg.cra_blkcipher.decrypt = crypto_fpu_decrypt;
-
-out_put_alg:
- crypto_mod_put(alg);
- return inst;
-}
-
-static void crypto_fpu_free(struct crypto_instance *inst)
-{
- crypto_drop_spawn(crypto_instance_ctx(inst));
- kfree(inst);
-}
-
-static struct crypto_template crypto_fpu_tmpl = {
- .name = "fpu",
- .alloc = crypto_fpu_alloc,
- .free = crypto_fpu_free,
- .module = THIS_MODULE,
-};
-
-static int __init crypto_fpu_module_init(void)
-{
- return crypto_register_template(&crypto_fpu_tmpl);
-}
-
-static void __exit crypto_fpu_module_exit(void)
-{
- crypto_unregister_template(&crypto_fpu_tmpl);
-}
-
-module_init(crypto_fpu_module_init);
-module_exit(crypto_fpu_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("FPU block cipher wrapper");
diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S
new file mode 100644
index 000000000000..c4fbaa82ed7a
--- /dev/null
+++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S
@@ -0,0 +1,133 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
+ * instructions. This file contains accelerated part of ghash
+ * implementation. More information about PCLMULQDQ can be found at:
+ *
+ * https://www.intel.com/content/dam/develop/external/us/en/documents/clmul-wp-rev-2-02-2014-04-20.pdf
+ *
+ * Copyright (c) 2009 Intel Corp.
+ * Author: Huang Ying <ying.huang@intel.com>
+ * Vinodh Gopal
+ * Erdinc Ozturk
+ * Deniz Karakoyunlu
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+.section .rodata.cst16.bswap_mask, "aM", @progbits, 16
+.align 16
+.Lbswap_mask:
+ .octa 0x000102030405060708090a0b0c0d0e0f
+
+#define DATA %xmm0
+#define SHASH %xmm1
+#define T1 %xmm2
+#define T2 %xmm3
+#define T3 %xmm4
+#define BSWAP %xmm5
+#define IN1 %xmm6
+
+.text
+
+/*
+ * __clmul_gf128mul_ble: internal ABI
+ * input:
+ * DATA: operand1
+ * SHASH: operand2, hash_key << 1 mod poly
+ * output:
+ * DATA: operand1 * operand2 mod poly
+ * changed:
+ * T1
+ * T2
+ * T3
+ */
+SYM_FUNC_START_LOCAL(__clmul_gf128mul_ble)
+ movaps DATA, T1
+ pshufd $0b01001110, DATA, T2
+ pshufd $0b01001110, SHASH, T3
+ pxor DATA, T2
+ pxor SHASH, T3
+
+ pclmulqdq $0x00, SHASH, DATA # DATA = a0 * b0
+ pclmulqdq $0x11, SHASH, T1 # T1 = a1 * b1
+ pclmulqdq $0x00, T3, T2 # T2 = (a1 + a0) * (b1 + b0)
+ pxor DATA, T2
+ pxor T1, T2 # T2 = a0 * b1 + a1 * b0
+
+ movaps T2, T3
+ pslldq $8, T3
+ psrldq $8, T2
+ pxor T3, DATA
+ pxor T2, T1 # <T1:DATA> is result of
+ # carry-less multiplication
+
+ # first phase of the reduction
+ movaps DATA, T3
+ psllq $1, T3
+ pxor DATA, T3
+ psllq $5, T3
+ pxor DATA, T3
+ psllq $57, T3
+ movaps T3, T2
+ pslldq $8, T2
+ psrldq $8, T3
+ pxor T2, DATA
+ pxor T3, T1
+
+ # second phase of the reduction
+ movaps DATA, T2
+ psrlq $5, T2
+ pxor DATA, T2
+ psrlq $1, T2
+ pxor DATA, T2
+ psrlq $1, T2
+ pxor T2, T1
+ pxor T1, DATA
+ RET
+SYM_FUNC_END(__clmul_gf128mul_ble)
+
+/* void clmul_ghash_mul(char *dst, const le128 *shash) */
+SYM_FUNC_START(clmul_ghash_mul)
+ FRAME_BEGIN
+ movups (%rdi), DATA
+ movups (%rsi), SHASH
+ movaps .Lbswap_mask(%rip), BSWAP
+ pshufb BSWAP, DATA
+ call __clmul_gf128mul_ble
+ pshufb BSWAP, DATA
+ movups DATA, (%rdi)
+ FRAME_END
+ RET
+SYM_FUNC_END(clmul_ghash_mul)
+
+/*
+ * int clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
+ * const le128 *shash);
+ */
+SYM_FUNC_START(clmul_ghash_update)
+ FRAME_BEGIN
+ cmp $16, %rdx
+ jb .Lupdate_just_ret # check length
+ movaps .Lbswap_mask(%rip), BSWAP
+ movups (%rdi), DATA
+ movups (%rcx), SHASH
+ pshufb BSWAP, DATA
+.align 4
+.Lupdate_loop:
+ movups (%rsi), IN1
+ pshufb BSWAP, IN1
+ pxor IN1, DATA
+ call __clmul_gf128mul_ble
+ sub $16, %rdx
+ add $16, %rsi
+ cmp $16, %rdx
+ jge .Lupdate_loop
+ pshufb BSWAP, DATA
+ movups DATA, (%rdi)
+.Lupdate_just_ret:
+ mov %rdx, %rax
+ FRAME_END
+ RET
+SYM_FUNC_END(clmul_ghash_update)
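The pclmulqdq instructions above compute carry-less products of 64-bit halves, and __clmul_gf128mul_ble then folds the 256-bit result back modulo the GHASH polynomial in the two "phases of the reduction". As a reference point (illustrative only, not part of the patch), carry-less multiplication is simply polynomial multiplication over GF(2), shown here for 8-bit operands:

#include <stdint.h>

/* Scalar analogue of one PCLMULQDQ step: multiply two polynomials over GF(2),
 * i.e. XOR the shifted copies with no carries between bit positions. */
static uint16_t clmul8(uint8_t a, uint8_t b)
{
	uint16_t r = 0;

	for (int i = 0; i < 8; i++)
		if (b & (1u << i))
			r ^= (uint16_t)a << i;
	return r;
}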
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
new file mode 100644
index 000000000000..aea5d4d06be7
--- /dev/null
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -0,0 +1,163 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
+ * instructions. This file contains glue code.
+ *
+ * Copyright (c) 2009 Intel Corp.
+ * Author: Huang Ying <ying.huang@intel.com>
+ */
+
+#include <asm/cpu_device_id.h>
+#include <asm/simd.h>
+#include <crypto/b128ops.h>
+#include <crypto/ghash.h>
+#include <crypto/internal/hash.h>
+#include <crypto/utils.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/unaligned.h>
+
+asmlinkage void clmul_ghash_mul(char *dst, const le128 *shash);
+
+asmlinkage int clmul_ghash_update(char *dst, const char *src,
+ unsigned int srclen, const le128 *shash);
+
+struct x86_ghash_ctx {
+ le128 shash;
+};
+
+static int ghash_init(struct shash_desc *desc)
+{
+ struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ memset(dctx, 0, sizeof(*dctx));
+
+ return 0;
+}
+
+static int ghash_setkey(struct crypto_shash *tfm,
+ const u8 *key, unsigned int keylen)
+{
+ struct x86_ghash_ctx *ctx = crypto_shash_ctx(tfm);
+ u64 a, b;
+
+ if (keylen != GHASH_BLOCK_SIZE)
+ return -EINVAL;
+
+ /*
+ * GHASH maps bits to polynomial coefficients backwards, which makes it
+ * hard to implement. But it can be shown that the GHASH multiplication
+ *
+ * D * K (mod x^128 + x^7 + x^2 + x + 1)
+ *
+ * (where D is a data block and K is the key) is equivalent to:
+ *
+ * bitreflect(D) * bitreflect(K) * x^(-127)
+ * (mod x^128 + x^127 + x^126 + x^121 + 1)
+ *
+ * So, the code below precomputes:
+ *
+ * bitreflect(K) * x^(-127) (mod x^128 + x^127 + x^126 + x^121 + 1)
+ *
+ * ... but in Montgomery form (so that Montgomery multiplication can be
+ * used), i.e. with an extra x^128 factor, which means actually:
+ *
+ * bitreflect(K) * x (mod x^128 + x^127 + x^126 + x^121 + 1)
+ *
+ * The within-a-byte part of bitreflect() cancels out GHASH's built-in
+ * reflection, and thus bitreflect() is actually a byteswap.
+ */
+ a = get_unaligned_be64(key);
+ b = get_unaligned_be64(key + 8);
+ ctx->shash.a = cpu_to_le64((a << 1) | (b >> 63));
+ ctx->shash.b = cpu_to_le64((b << 1) | (a >> 63));
+ if (a >> 63)
+ ctx->shash.a ^= cpu_to_le64((u64)0xc2 << 56);
+ return 0;
+}
+
+static int ghash_update(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen)
+{
+ struct x86_ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
+ struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+ u8 *dst = dctx->buffer;
+ int remain;
+
+ kernel_fpu_begin();
+ remain = clmul_ghash_update(dst, src, srclen, &ctx->shash);
+ kernel_fpu_end();
+ return remain;
+}
+
+static void ghash_flush(struct x86_ghash_ctx *ctx, struct ghash_desc_ctx *dctx,
+ const u8 *src, unsigned int len)
+{
+ u8 *dst = dctx->buffer;
+
+ kernel_fpu_begin();
+ if (len) {
+ crypto_xor(dst, src, len);
+ clmul_ghash_mul(dst, &ctx->shash);
+ }
+ kernel_fpu_end();
+}
+
+static int ghash_finup(struct shash_desc *desc, const u8 *src,
+ unsigned int len, u8 *dst)
+{
+ struct x86_ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
+ struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+ u8 *buf = dctx->buffer;
+
+ ghash_flush(ctx, dctx, src, len);
+ memcpy(dst, buf, GHASH_BLOCK_SIZE);
+
+ return 0;
+}
+
+static struct shash_alg ghash_alg = {
+ .digestsize = GHASH_DIGEST_SIZE,
+ .init = ghash_init,
+ .update = ghash_update,
+ .finup = ghash_finup,
+ .setkey = ghash_setkey,
+ .descsize = sizeof(struct ghash_desc_ctx),
+ .base = {
+ .cra_name = "ghash",
+ .cra_driver_name = "ghash-pclmulqdqni",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
+ .cra_blocksize = GHASH_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct x86_ghash_ctx),
+ .cra_module = THIS_MODULE,
+ },
+};
+
+static const struct x86_cpu_id pcmul_cpu_id[] = {
+ X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL), /* Pickle-Mickle-Duck */
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, pcmul_cpu_id);
+
+static int __init ghash_pclmulqdqni_mod_init(void)
+{
+ if (!x86_match_cpu(pcmul_cpu_id))
+ return -ENODEV;
+
+ return crypto_register_shash(&ghash_alg);
+}
+
+static void __exit ghash_pclmulqdqni_mod_exit(void)
+{
+ crypto_unregister_shash(&ghash_alg);
+}
+
+module_init(ghash_pclmulqdqni_mod_init);
+module_exit(ghash_pclmulqdqni_mod_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("GHASH hash function, accelerated by PCLMULQDQ-NI");
+MODULE_ALIAS_CRYPTO("ghash");
diff --git a/arch/x86/crypto/glue_helper-asm-avx.S b/arch/x86/crypto/glue_helper-asm-avx.S
new file mode 100644
index 000000000000..3da385271227
--- /dev/null
+++ b/arch/x86/crypto/glue_helper-asm-avx.S
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Shared glue code for 128-bit block ciphers, AVX assembler macros
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ */
+
+#define load_8way(src, x0, x1, x2, x3, x4, x5, x6, x7) \
+ vmovdqu (0*16)(src), x0; \
+ vmovdqu (1*16)(src), x1; \
+ vmovdqu (2*16)(src), x2; \
+ vmovdqu (3*16)(src), x3; \
+ vmovdqu (4*16)(src), x4; \
+ vmovdqu (5*16)(src), x5; \
+ vmovdqu (6*16)(src), x6; \
+ vmovdqu (7*16)(src), x7;
+
+#define store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
+ vmovdqu x0, (0*16)(dst); \
+ vmovdqu x1, (1*16)(dst); \
+ vmovdqu x2, (2*16)(dst); \
+ vmovdqu x3, (3*16)(dst); \
+ vmovdqu x4, (4*16)(dst); \
+ vmovdqu x5, (5*16)(dst); \
+ vmovdqu x6, (6*16)(dst); \
+ vmovdqu x7, (7*16)(dst);
+
+#define store_cbc_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
+ vpxor (0*16)(src), x1, x1; \
+ vpxor (1*16)(src), x2, x2; \
+ vpxor (2*16)(src), x3, x3; \
+ vpxor (3*16)(src), x4, x4; \
+ vpxor (4*16)(src), x5, x5; \
+ vpxor (5*16)(src), x6, x6; \
+ vpxor (6*16)(src), x7, x7; \
+ store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
diff --git a/arch/x86/crypto/glue_helper-asm-avx2.S b/arch/x86/crypto/glue_helper-asm-avx2.S
new file mode 100644
index 000000000000..c77e9049431f
--- /dev/null
+++ b/arch/x86/crypto/glue_helper-asm-avx2.S
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Shared glue code for 128-bit block ciphers, AVX2 assembler macros
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ */
+
+#define load_16way(src, x0, x1, x2, x3, x4, x5, x6, x7) \
+ vmovdqu (0*32)(src), x0; \
+ vmovdqu (1*32)(src), x1; \
+ vmovdqu (2*32)(src), x2; \
+ vmovdqu (3*32)(src), x3; \
+ vmovdqu (4*32)(src), x4; \
+ vmovdqu (5*32)(src), x5; \
+ vmovdqu (6*32)(src), x6; \
+ vmovdqu (7*32)(src), x7;
+
+#define store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
+ vmovdqu x0, (0*32)(dst); \
+ vmovdqu x1, (1*32)(dst); \
+ vmovdqu x2, (2*32)(dst); \
+ vmovdqu x3, (3*32)(dst); \
+ vmovdqu x4, (4*32)(dst); \
+ vmovdqu x5, (5*32)(dst); \
+ vmovdqu x6, (6*32)(dst); \
+ vmovdqu x7, (7*32)(dst);
+
+#define store_cbc_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7, t0) \
+ vpxor t0, t0, t0; \
+ vinserti128 $1, (src), t0, t0; \
+ vpxor t0, x0, x0; \
+ vpxor (0*32+16)(src), x1, x1; \
+ vpxor (1*32+16)(src), x2, x2; \
+ vpxor (2*32+16)(src), x3, x3; \
+ vpxor (3*32+16)(src), x4, x4; \
+ vpxor (4*32+16)(src), x5, x5; \
+ vpxor (5*32+16)(src), x6, x6; \
+ vpxor (6*32+16)(src), x7, x7; \
+ store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
diff --git a/arch/x86/crypto/nh-avx2-x86_64.S b/arch/x86/crypto/nh-avx2-x86_64.S
new file mode 100644
index 000000000000..791386d9a83a
--- /dev/null
+++ b/arch/x86/crypto/nh-avx2-x86_64.S
@@ -0,0 +1,159 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * NH - ε-almost-universal hash function, x86_64 AVX2 accelerated
+ *
+ * Copyright 2018 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+#include <linux/linkage.h>
+#include <linux/cfi_types.h>
+
+#define PASS0_SUMS %ymm0
+#define PASS1_SUMS %ymm1
+#define PASS2_SUMS %ymm2
+#define PASS3_SUMS %ymm3
+#define K0 %ymm4
+#define K0_XMM %xmm4
+#define K1 %ymm5
+#define K1_XMM %xmm5
+#define K2 %ymm6
+#define K2_XMM %xmm6
+#define K3 %ymm7
+#define K3_XMM %xmm7
+#define T0 %ymm8
+#define T1 %ymm9
+#define T2 %ymm10
+#define T2_XMM %xmm10
+#define T3 %ymm11
+#define T3_XMM %xmm11
+#define T4 %ymm12
+#define T5 %ymm13
+#define T6 %ymm14
+#define T7 %ymm15
+#define KEY %rdi
+#define MESSAGE %rsi
+#define MESSAGE_LEN %rdx
+#define HASH %rcx
+
+.macro _nh_2xstride k0, k1, k2, k3
+
+ // Add message words to key words
+ vpaddd \k0, T3, T0
+ vpaddd \k1, T3, T1
+ vpaddd \k2, T3, T2
+ vpaddd \k3, T3, T3
+
+ // Multiply 32x32 => 64 and accumulate
+ vpshufd $0x10, T0, T4
+ vpshufd $0x32, T0, T0
+ vpshufd $0x10, T1, T5
+ vpshufd $0x32, T1, T1
+ vpshufd $0x10, T2, T6
+ vpshufd $0x32, T2, T2
+ vpshufd $0x10, T3, T7
+ vpshufd $0x32, T3, T3
+ vpmuludq T4, T0, T0
+ vpmuludq T5, T1, T1
+ vpmuludq T6, T2, T2
+ vpmuludq T7, T3, T3
+ vpaddq T0, PASS0_SUMS, PASS0_SUMS
+ vpaddq T1, PASS1_SUMS, PASS1_SUMS
+ vpaddq T2, PASS2_SUMS, PASS2_SUMS
+ vpaddq T3, PASS3_SUMS, PASS3_SUMS
+.endm
+
+/*
+ * void nh_avx2(const u32 *key, const u8 *message, size_t message_len,
+ * __le64 hash[NH_NUM_PASSES])
+ *
+ * It's guaranteed that message_len % 16 == 0.
+ */
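+/*
+ * Rough scalar sketch of the per-stride work (m0..m3 are the four 32-bit
+ * words of a 16-byte message stride, k the key words at that stride's key
+ * offset; the key window advances 16 bytes per stride):
+ *
+ *	for (j = 0; j < 4; j++)
+ *		sums[j] += (u64)(u32)(m0 + k[4*j + 0]) * (u32)(m2 + k[4*j + 2]) +
+ *			   (u64)(u32)(m1 + k[4*j + 1]) * (u32)(m3 + k[4*j + 3]);
+ *
+ * See the generic C implementation in crypto/nhpoly1305.c for reference; here
+ * each PASSj_SUMS register keeps four partial 64-bit sums that are folded
+ * together at .Ldone.
+ */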
+SYM_TYPED_FUNC_START(nh_avx2)
+
+ vmovdqu 0x00(KEY), K0
+ vmovdqu 0x10(KEY), K1
+ add $0x20, KEY
+ vpxor PASS0_SUMS, PASS0_SUMS, PASS0_SUMS
+ vpxor PASS1_SUMS, PASS1_SUMS, PASS1_SUMS
+ vpxor PASS2_SUMS, PASS2_SUMS, PASS2_SUMS
+ vpxor PASS3_SUMS, PASS3_SUMS, PASS3_SUMS
+
+ sub $0x40, MESSAGE_LEN
+ jl .Lloop4_done
+.Lloop4:
+ vmovdqu (MESSAGE), T3
+ vmovdqu 0x00(KEY), K2
+ vmovdqu 0x10(KEY), K3
+ _nh_2xstride K0, K1, K2, K3
+
+ vmovdqu 0x20(MESSAGE), T3
+ vmovdqu 0x20(KEY), K0
+ vmovdqu 0x30(KEY), K1
+ _nh_2xstride K2, K3, K0, K1
+
+ add $0x40, MESSAGE
+ add $0x40, KEY
+ sub $0x40, MESSAGE_LEN
+ jge .Lloop4
+
+.Lloop4_done:
+ and $0x3f, MESSAGE_LEN
+ jz .Ldone
+
+ cmp $0x20, MESSAGE_LEN
+ jl .Llast
+
+ // 2 or 3 strides remain; do 2 more.
+ vmovdqu (MESSAGE), T3
+ vmovdqu 0x00(KEY), K2
+ vmovdqu 0x10(KEY), K3
+ _nh_2xstride K0, K1, K2, K3
+ add $0x20, MESSAGE
+ add $0x20, KEY
+ sub $0x20, MESSAGE_LEN
+ jz .Ldone
+ vmovdqa K2, K0
+ vmovdqa K3, K1
+.Llast:
+ // Last stride. Zero the high 128 bits of the message and keys so they
+ // don't affect the result when processing them like 2 strides.
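+	// (A VEX-encoded instruction with a 128-bit destination zeroes bits
+	// 255:128 of the ymm register, so the xmm-to-xmm vmovdqa moves below
+	// are enough to clear the upper lanes of K0 and K1.)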
+ vmovdqu (MESSAGE), T3_XMM
+ vmovdqa K0_XMM, K0_XMM
+ vmovdqa K1_XMM, K1_XMM
+ vmovdqu 0x00(KEY), K2_XMM
+ vmovdqu 0x10(KEY), K3_XMM
+ _nh_2xstride K0, K1, K2, K3
+
+.Ldone:
+ // Sum the accumulators for each pass, then store the sums to 'hash'
+
+ // PASS0_SUMS is (0A 0B 0C 0D)
+ // PASS1_SUMS is (1A 1B 1C 1D)
+ // PASS2_SUMS is (2A 2B 2C 2D)
+ // PASS3_SUMS is (3A 3B 3C 3D)
+ // We need the horizontal sums:
+ // (0A + 0B + 0C + 0D,
+ // 1A + 1B + 1C + 1D,
+ // 2A + 2B + 2C + 2D,
+ // 3A + 3B + 3C + 3D)
+ //
+
+ vpunpcklqdq PASS1_SUMS, PASS0_SUMS, T0 // T0 = (0A 1A 0C 1C)
+ vpunpckhqdq PASS1_SUMS, PASS0_SUMS, T1 // T1 = (0B 1B 0D 1D)
+ vpunpcklqdq PASS3_SUMS, PASS2_SUMS, T2 // T2 = (2A 3A 2C 3C)
+ vpunpckhqdq PASS3_SUMS, PASS2_SUMS, T3 // T3 = (2B 3B 2D 3D)
+
+ vinserti128 $0x1, T2_XMM, T0, T4 // T4 = (0A 1A 2A 3A)
+ vinserti128 $0x1, T3_XMM, T1, T5 // T5 = (0B 1B 2B 3B)
+ vperm2i128 $0x31, T2, T0, T0 // T0 = (0C 1C 2C 3C)
+ vperm2i128 $0x31, T3, T1, T1 // T1 = (0D 1D 2D 3D)
+
+ vpaddq T5, T4, T4
+ vpaddq T1, T0, T0
+ vpaddq T4, T0, T0
+ vmovdqu T0, (HASH)
+ vzeroupper
+ RET
+SYM_FUNC_END(nh_avx2)
diff --git a/arch/x86/crypto/nh-sse2-x86_64.S b/arch/x86/crypto/nh-sse2-x86_64.S
new file mode 100644
index 000000000000..75fb994b6d17
--- /dev/null
+++ b/arch/x86/crypto/nh-sse2-x86_64.S
@@ -0,0 +1,124 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * NH - ε-almost-universal hash function, x86_64 SSE2 accelerated
+ *
+ * Copyright 2018 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+#include <linux/linkage.h>
+#include <linux/cfi_types.h>
+
+#define PASS0_SUMS %xmm0
+#define PASS1_SUMS %xmm1
+#define PASS2_SUMS %xmm2
+#define PASS3_SUMS %xmm3
+#define K0 %xmm4
+#define K1 %xmm5
+#define K2 %xmm6
+#define K3 %xmm7
+#define T0 %xmm8
+#define T1 %xmm9
+#define T2 %xmm10
+#define T3 %xmm11
+#define T4 %xmm12
+#define T5 %xmm13
+#define T6 %xmm14
+#define T7 %xmm15
+#define KEY %rdi
+#define MESSAGE %rsi
+#define MESSAGE_LEN %rdx
+#define HASH %rcx
+
+.macro _nh_stride k0, k1, k2, k3, offset
+
+ // Load next message stride
+ movdqu \offset(MESSAGE), T1
+
+ // Load next key stride
+ movdqu \offset(KEY), \k3
+
+ // Add message words to key words
+ movdqa T1, T2
+ movdqa T1, T3
+ paddd T1, \k0 // reuse k0 to avoid a move
+ paddd \k1, T1
+ paddd \k2, T2
+ paddd \k3, T3
+
+ // Multiply 32x32 => 64 and accumulate
+ pshufd $0x10, \k0, T4
+ pshufd $0x32, \k0, \k0
+ pshufd $0x10, T1, T5
+ pshufd $0x32, T1, T1
+ pshufd $0x10, T2, T6
+ pshufd $0x32, T2, T2
+ pshufd $0x10, T3, T7
+ pshufd $0x32, T3, T3
+ pmuludq T4, \k0
+ pmuludq T5, T1
+ pmuludq T6, T2
+ pmuludq T7, T3
+ paddq \k0, PASS0_SUMS
+ paddq T1, PASS1_SUMS
+ paddq T2, PASS2_SUMS
+ paddq T3, PASS3_SUMS
+.endm
+
+/*
+ * void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
+ * __le64 hash[NH_NUM_PASSES])
+ *
+ * It's guaranteed that message_len % 16 == 0.
+ */
+SYM_TYPED_FUNC_START(nh_sse2)
+
+ movdqu 0x00(KEY), K0
+ movdqu 0x10(KEY), K1
+ movdqu 0x20(KEY), K2
+ add $0x30, KEY
+ pxor PASS0_SUMS, PASS0_SUMS
+ pxor PASS1_SUMS, PASS1_SUMS
+ pxor PASS2_SUMS, PASS2_SUMS
+ pxor PASS3_SUMS, PASS3_SUMS
+
+ sub $0x40, MESSAGE_LEN
+ jl .Lloop4_done
+.Lloop4:
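+	// The key window slides forward 16 bytes per 16-byte message stride,
+	// so pass 0 of the next stride uses the key that was pass 1's here;
+	// rotating K0..K3 through the macro arguments lets each stride load
+	// only one new key vector (into \k3) and reuse the other three.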
+ _nh_stride K0, K1, K2, K3, 0x00
+ _nh_stride K1, K2, K3, K0, 0x10
+ _nh_stride K2, K3, K0, K1, 0x20
+ _nh_stride K3, K0, K1, K2, 0x30
+ add $0x40, KEY
+ add $0x40, MESSAGE
+ sub $0x40, MESSAGE_LEN
+ jge .Lloop4
+
+.Lloop4_done:
+ and $0x3f, MESSAGE_LEN
+ jz .Ldone
+ _nh_stride K0, K1, K2, K3, 0x00
+
+ sub $0x10, MESSAGE_LEN
+ jz .Ldone
+ _nh_stride K1, K2, K3, K0, 0x10
+
+ sub $0x10, MESSAGE_LEN
+ jz .Ldone
+ _nh_stride K2, K3, K0, K1, 0x20
+
+.Ldone:
+ // Sum the accumulators for each pass, then store the sums to 'hash'
+ movdqa PASS0_SUMS, T0
+ movdqa PASS2_SUMS, T1
+ punpcklqdq PASS1_SUMS, T0 // => (PASS0_SUM_A PASS1_SUM_A)
+ punpcklqdq PASS3_SUMS, T1 // => (PASS2_SUM_A PASS3_SUM_A)
+ punpckhqdq PASS1_SUMS, PASS0_SUMS // => (PASS0_SUM_B PASS1_SUM_B)
+ punpckhqdq PASS3_SUMS, PASS2_SUMS // => (PASS2_SUM_B PASS3_SUM_B)
+ paddq PASS0_SUMS, T0
+ paddq PASS2_SUMS, T1
+ movdqu T0, 0x00(HASH)
+ movdqu T1, 0x10(HASH)
+ RET
+SYM_FUNC_END(nh_sse2)
diff --git a/arch/x86/crypto/nhpoly1305-avx2-glue.c b/arch/x86/crypto/nhpoly1305-avx2-glue.c
new file mode 100644
index 000000000000..c3a872f4d6a7
--- /dev/null
+++ b/arch/x86/crypto/nhpoly1305-avx2-glue.c
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NHPoly1305 - ε-almost-∆-universal hash function for Adiantum
+ * (AVX2 accelerated version)
+ *
+ * Copyright 2018 Google LLC
+ */
+
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+#include <crypto/nhpoly1305.h>
+#include <linux/module.h>
+#include <linux/sizes.h>
+#include <asm/simd.h>
+
+asmlinkage void nh_avx2(const u32 *key, const u8 *message, size_t message_len,
+ __le64 hash[NH_NUM_PASSES]);
+
+static int nhpoly1305_avx2_update(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen)
+{
+ if (srclen < 64 || !crypto_simd_usable())
+ return crypto_nhpoly1305_update(desc, src, srclen);
+
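+	/*
+	 * Process at most SZ_4K bytes per kernel_fpu_begin()/end() section so
+	 * that large inputs do not keep the FPU (and preemption) tied up for
+	 * an arbitrarily long stretch; the fallback above handles short
+	 * inputs and contexts where SIMD cannot be used.
+	 */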
+ do {
+ unsigned int n = min_t(unsigned int, srclen, SZ_4K);
+
+ kernel_fpu_begin();
+ crypto_nhpoly1305_update_helper(desc, src, n, nh_avx2);
+ kernel_fpu_end();
+ src += n;
+ srclen -= n;
+ } while (srclen);
+ return 0;
+}
+
+static int nhpoly1305_avx2_digest(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen, u8 *out)
+{
+ return crypto_nhpoly1305_init(desc) ?:
+ nhpoly1305_avx2_update(desc, src, srclen) ?:
+ crypto_nhpoly1305_final(desc, out);
+}
+
+static struct shash_alg nhpoly1305_alg = {
+ .base.cra_name = "nhpoly1305",
+ .base.cra_driver_name = "nhpoly1305-avx2",
+ .base.cra_priority = 300,
+ .base.cra_ctxsize = sizeof(struct nhpoly1305_key),
+ .base.cra_module = THIS_MODULE,
+ .digestsize = POLY1305_DIGEST_SIZE,
+ .init = crypto_nhpoly1305_init,
+ .update = nhpoly1305_avx2_update,
+ .final = crypto_nhpoly1305_final,
+ .digest = nhpoly1305_avx2_digest,
+ .setkey = crypto_nhpoly1305_setkey,
+ .descsize = sizeof(struct nhpoly1305_state),
+};
+
+static int __init nhpoly1305_mod_init(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_AVX2) ||
+ !boot_cpu_has(X86_FEATURE_OSXSAVE))
+ return -ENODEV;
+
+ return crypto_register_shash(&nhpoly1305_alg);
+}
+
+static void __exit nhpoly1305_mod_exit(void)
+{
+ crypto_unregister_shash(&nhpoly1305_alg);
+}
+
+module_init(nhpoly1305_mod_init);
+module_exit(nhpoly1305_mod_exit);
+
+MODULE_DESCRIPTION("NHPoly1305 ε-almost-∆-universal hash function (AVX2-accelerated)");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
+MODULE_ALIAS_CRYPTO("nhpoly1305");
+MODULE_ALIAS_CRYPTO("nhpoly1305-avx2");
diff --git a/arch/x86/crypto/nhpoly1305-sse2-glue.c b/arch/x86/crypto/nhpoly1305-sse2-glue.c
new file mode 100644
index 000000000000..a268a8439a5c
--- /dev/null
+++ b/arch/x86/crypto/nhpoly1305-sse2-glue.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NHPoly1305 - ε-almost-∆-universal hash function for Adiantum
+ * (SSE2 accelerated version)
+ *
+ * Copyright 2018 Google LLC
+ */
+
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+#include <crypto/nhpoly1305.h>
+#include <linux/module.h>
+#include <linux/sizes.h>
+#include <asm/simd.h>
+
+asmlinkage void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
+ __le64 hash[NH_NUM_PASSES]);
+
+static int nhpoly1305_sse2_update(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen)
+{
+ if (srclen < 64 || !crypto_simd_usable())
+ return crypto_nhpoly1305_update(desc, src, srclen);
+
+ do {
+ unsigned int n = min_t(unsigned int, srclen, SZ_4K);
+
+ kernel_fpu_begin();
+ crypto_nhpoly1305_update_helper(desc, src, n, nh_sse2);
+ kernel_fpu_end();
+ src += n;
+ srclen -= n;
+ } while (srclen);
+ return 0;
+}
+
+static int nhpoly1305_sse2_digest(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen, u8 *out)
+{
+ return crypto_nhpoly1305_init(desc) ?:
+ nhpoly1305_sse2_update(desc, src, srclen) ?:
+ crypto_nhpoly1305_final(desc, out);
+}
+
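+/*
+ * Registered at a lower cra_priority than the AVX2 version in
+ * nhpoly1305-avx2-glue.c (200 vs. 300), so the crypto API prefers the AVX2
+ * implementation whenever both are loaded.
+ */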
+static struct shash_alg nhpoly1305_alg = {
+ .base.cra_name = "nhpoly1305",
+ .base.cra_driver_name = "nhpoly1305-sse2",
+ .base.cra_priority = 200,
+ .base.cra_ctxsize = sizeof(struct nhpoly1305_key),
+ .base.cra_module = THIS_MODULE,
+ .digestsize = POLY1305_DIGEST_SIZE,
+ .init = crypto_nhpoly1305_init,
+ .update = nhpoly1305_sse2_update,
+ .final = crypto_nhpoly1305_final,
+ .digest = nhpoly1305_sse2_digest,
+ .setkey = crypto_nhpoly1305_setkey,
+ .descsize = sizeof(struct nhpoly1305_state),
+};
+
+static int __init nhpoly1305_mod_init(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_XMM2))
+ return -ENODEV;
+
+ return crypto_register_shash(&nhpoly1305_alg);
+}
+
+static void __exit nhpoly1305_mod_exit(void)
+{
+ crypto_unregister_shash(&nhpoly1305_alg);
+}
+
+module_init(nhpoly1305_mod_init);
+module_exit(nhpoly1305_mod_exit);
+
+MODULE_DESCRIPTION("NHPoly1305 ε-almost-∆-universal hash function (SSE2-accelerated)");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
+MODULE_ALIAS_CRYPTO("nhpoly1305");
+MODULE_ALIAS_CRYPTO("nhpoly1305-sse2");
diff --git a/arch/x86/crypto/salsa20-i586-asm_32.S b/arch/x86/crypto/salsa20-i586-asm_32.S
deleted file mode 100644
index 72eb306680b2..000000000000
--- a/arch/x86/crypto/salsa20-i586-asm_32.S
+++ /dev/null
@@ -1,1114 +0,0 @@
-# salsa20_pm.s version 20051229
-# D. J. Bernstein
-# Public domain.
-
-# enter ECRYPT_encrypt_bytes
-.text
-.p2align 5
-.globl ECRYPT_encrypt_bytes
-ECRYPT_encrypt_bytes:
- mov %esp,%eax
- and $31,%eax
- add $256,%eax
- sub %eax,%esp
- # eax_stack = eax
- movl %eax,80(%esp)
- # ebx_stack = ebx
- movl %ebx,84(%esp)
- # esi_stack = esi
- movl %esi,88(%esp)
- # edi_stack = edi
- movl %edi,92(%esp)
- # ebp_stack = ebp
- movl %ebp,96(%esp)
- # x = arg1
- movl 4(%esp,%eax),%edx
- # m = arg2
- movl 8(%esp,%eax),%esi
- # out = arg3
- movl 12(%esp,%eax),%edi
- # bytes = arg4
- movl 16(%esp,%eax),%ebx
- # bytes -= 0
- sub $0,%ebx
- # goto done if unsigned<=
- jbe ._done
-._start:
- # in0 = *(uint32 *) (x + 0)
- movl 0(%edx),%eax
- # in1 = *(uint32 *) (x + 4)
- movl 4(%edx),%ecx
- # in2 = *(uint32 *) (x + 8)
- movl 8(%edx),%ebp
- # j0 = in0
- movl %eax,164(%esp)
- # in3 = *(uint32 *) (x + 12)
- movl 12(%edx),%eax
- # j1 = in1
- movl %ecx,168(%esp)
- # in4 = *(uint32 *) (x + 16)
- movl 16(%edx),%ecx
- # j2 = in2
- movl %ebp,172(%esp)
- # in5 = *(uint32 *) (x + 20)
- movl 20(%edx),%ebp
- # j3 = in3
- movl %eax,176(%esp)
- # in6 = *(uint32 *) (x + 24)
- movl 24(%edx),%eax
- # j4 = in4
- movl %ecx,180(%esp)
- # in7 = *(uint32 *) (x + 28)
- movl 28(%edx),%ecx
- # j5 = in5
- movl %ebp,184(%esp)
- # in8 = *(uint32 *) (x + 32)
- movl 32(%edx),%ebp
- # j6 = in6
- movl %eax,188(%esp)
- # in9 = *(uint32 *) (x + 36)
- movl 36(%edx),%eax
- # j7 = in7
- movl %ecx,192(%esp)
- # in10 = *(uint32 *) (x + 40)
- movl 40(%edx),%ecx
- # j8 = in8
- movl %ebp,196(%esp)
- # in11 = *(uint32 *) (x + 44)
- movl 44(%edx),%ebp
- # j9 = in9
- movl %eax,200(%esp)
- # in12 = *(uint32 *) (x + 48)
- movl 48(%edx),%eax
- # j10 = in10
- movl %ecx,204(%esp)
- # in13 = *(uint32 *) (x + 52)
- movl 52(%edx),%ecx
- # j11 = in11
- movl %ebp,208(%esp)
- # in14 = *(uint32 *) (x + 56)
- movl 56(%edx),%ebp
- # j12 = in12
- movl %eax,212(%esp)
- # in15 = *(uint32 *) (x + 60)
- movl 60(%edx),%eax
- # j13 = in13
- movl %ecx,216(%esp)
- # j14 = in14
- movl %ebp,220(%esp)
- # j15 = in15
- movl %eax,224(%esp)
- # x_backup = x
- movl %edx,64(%esp)
-._bytesatleast1:
- # bytes - 64
- cmp $64,%ebx
- # goto nocopy if unsigned>=
- jae ._nocopy
- # ctarget = out
- movl %edi,228(%esp)
- # out = &tmp
- leal 0(%esp),%edi
- # i = bytes
- mov %ebx,%ecx
- # while (i) { *out++ = *m++; --i }
- rep movsb
- # out = &tmp
- leal 0(%esp),%edi
- # m = &tmp
- leal 0(%esp),%esi
-._nocopy:
- # out_backup = out
- movl %edi,72(%esp)
- # m_backup = m
- movl %esi,68(%esp)
- # bytes_backup = bytes
- movl %ebx,76(%esp)
- # in0 = j0
- movl 164(%esp),%eax
- # in1 = j1
- movl 168(%esp),%ecx
- # in2 = j2
- movl 172(%esp),%edx
- # in3 = j3
- movl 176(%esp),%ebx
- # x0 = in0
- movl %eax,100(%esp)
- # x1 = in1
- movl %ecx,104(%esp)
- # x2 = in2
- movl %edx,108(%esp)
- # x3 = in3
- movl %ebx,112(%esp)
- # in4 = j4
- movl 180(%esp),%eax
- # in5 = j5
- movl 184(%esp),%ecx
- # in6 = j6
- movl 188(%esp),%edx
- # in7 = j7
- movl 192(%esp),%ebx
- # x4 = in4
- movl %eax,116(%esp)
- # x5 = in5
- movl %ecx,120(%esp)
- # x6 = in6
- movl %edx,124(%esp)
- # x7 = in7
- movl %ebx,128(%esp)
- # in8 = j8
- movl 196(%esp),%eax
- # in9 = j9
- movl 200(%esp),%ecx
- # in10 = j10
- movl 204(%esp),%edx
- # in11 = j11
- movl 208(%esp),%ebx
- # x8 = in8
- movl %eax,132(%esp)
- # x9 = in9
- movl %ecx,136(%esp)
- # x10 = in10
- movl %edx,140(%esp)
- # x11 = in11
- movl %ebx,144(%esp)
- # in12 = j12
- movl 212(%esp),%eax
- # in13 = j13
- movl 216(%esp),%ecx
- # in14 = j14
- movl 220(%esp),%edx
- # in15 = j15
- movl 224(%esp),%ebx
- # x12 = in12
- movl %eax,148(%esp)
- # x13 = in13
- movl %ecx,152(%esp)
- # x14 = in14
- movl %edx,156(%esp)
- # x15 = in15
- movl %ebx,160(%esp)
- # i = 20
- mov $20,%ebp
- # p = x0
- movl 100(%esp),%eax
- # s = x5
- movl 120(%esp),%ecx
- # t = x10
- movl 140(%esp),%edx
- # w = x15
- movl 160(%esp),%ebx
-._mainloop:
- # x0 = p
- movl %eax,100(%esp)
- # x10 = t
- movl %edx,140(%esp)
- # p += x12
- addl 148(%esp),%eax
- # x5 = s
- movl %ecx,120(%esp)
- # t += x6
- addl 124(%esp),%edx
- # x15 = w
- movl %ebx,160(%esp)
- # r = x1
- movl 104(%esp),%esi
- # r += s
- add %ecx,%esi
- # v = x11
- movl 144(%esp),%edi
- # v += w
- add %ebx,%edi
- # p <<<= 7
- rol $7,%eax
- # p ^= x4
- xorl 116(%esp),%eax
- # t <<<= 7
- rol $7,%edx
- # t ^= x14
- xorl 156(%esp),%edx
- # r <<<= 7
- rol $7,%esi
- # r ^= x9
- xorl 136(%esp),%esi
- # v <<<= 7
- rol $7,%edi
- # v ^= x3
- xorl 112(%esp),%edi
- # x4 = p
- movl %eax,116(%esp)
- # x14 = t
- movl %edx,156(%esp)
- # p += x0
- addl 100(%esp),%eax
- # x9 = r
- movl %esi,136(%esp)
- # t += x10
- addl 140(%esp),%edx
- # x3 = v
- movl %edi,112(%esp)
- # p <<<= 9
- rol $9,%eax
- # p ^= x8
- xorl 132(%esp),%eax
- # t <<<= 9
- rol $9,%edx
- # t ^= x2
- xorl 108(%esp),%edx
- # s += r
- add %esi,%ecx
- # s <<<= 9
- rol $9,%ecx
- # s ^= x13
- xorl 152(%esp),%ecx
- # w += v
- add %edi,%ebx
- # w <<<= 9
- rol $9,%ebx
- # w ^= x7
- xorl 128(%esp),%ebx
- # x8 = p
- movl %eax,132(%esp)
- # x2 = t
- movl %edx,108(%esp)
- # p += x4
- addl 116(%esp),%eax
- # x13 = s
- movl %ecx,152(%esp)
- # t += x14
- addl 156(%esp),%edx
- # x7 = w
- movl %ebx,128(%esp)
- # p <<<= 13
- rol $13,%eax
- # p ^= x12
- xorl 148(%esp),%eax
- # t <<<= 13
- rol $13,%edx
- # t ^= x6
- xorl 124(%esp),%edx
- # r += s
- add %ecx,%esi
- # r <<<= 13
- rol $13,%esi
- # r ^= x1
- xorl 104(%esp),%esi
- # v += w
- add %ebx,%edi
- # v <<<= 13
- rol $13,%edi
- # v ^= x11
- xorl 144(%esp),%edi
- # x12 = p
- movl %eax,148(%esp)
- # x6 = t
- movl %edx,124(%esp)
- # p += x8
- addl 132(%esp),%eax
- # x1 = r
- movl %esi,104(%esp)
- # t += x2
- addl 108(%esp),%edx
- # x11 = v
- movl %edi,144(%esp)
- # p <<<= 18
- rol $18,%eax
- # p ^= x0
- xorl 100(%esp),%eax
- # t <<<= 18
- rol $18,%edx
- # t ^= x10
- xorl 140(%esp),%edx
- # s += r
- add %esi,%ecx
- # s <<<= 18
- rol $18,%ecx
- # s ^= x5
- xorl 120(%esp),%ecx
- # w += v
- add %edi,%ebx
- # w <<<= 18
- rol $18,%ebx
- # w ^= x15
- xorl 160(%esp),%ebx
- # x0 = p
- movl %eax,100(%esp)
- # x10 = t
- movl %edx,140(%esp)
- # p += x3
- addl 112(%esp),%eax
- # p <<<= 7
- rol $7,%eax
- # x5 = s
- movl %ecx,120(%esp)
- # t += x9
- addl 136(%esp),%edx
- # x15 = w
- movl %ebx,160(%esp)
- # r = x4
- movl 116(%esp),%esi
- # r += s
- add %ecx,%esi
- # v = x14
- movl 156(%esp),%edi
- # v += w
- add %ebx,%edi
- # p ^= x1
- xorl 104(%esp),%eax
- # t <<<= 7
- rol $7,%edx
- # t ^= x11
- xorl 144(%esp),%edx
- # r <<<= 7
- rol $7,%esi
- # r ^= x6
- xorl 124(%esp),%esi
- # v <<<= 7
- rol $7,%edi
- # v ^= x12
- xorl 148(%esp),%edi
- # x1 = p
- movl %eax,104(%esp)
- # x11 = t
- movl %edx,144(%esp)
- # p += x0
- addl 100(%esp),%eax
- # x6 = r
- movl %esi,124(%esp)
- # t += x10
- addl 140(%esp),%edx
- # x12 = v
- movl %edi,148(%esp)
- # p <<<= 9
- rol $9,%eax
- # p ^= x2
- xorl 108(%esp),%eax
- # t <<<= 9
- rol $9,%edx
- # t ^= x8
- xorl 132(%esp),%edx
- # s += r
- add %esi,%ecx
- # s <<<= 9
- rol $9,%ecx
- # s ^= x7
- xorl 128(%esp),%ecx
- # w += v
- add %edi,%ebx
- # w <<<= 9
- rol $9,%ebx
- # w ^= x13
- xorl 152(%esp),%ebx
- # x2 = p
- movl %eax,108(%esp)
- # x8 = t
- movl %edx,132(%esp)
- # p += x1
- addl 104(%esp),%eax
- # x7 = s
- movl %ecx,128(%esp)
- # t += x11
- addl 144(%esp),%edx
- # x13 = w
- movl %ebx,152(%esp)
- # p <<<= 13
- rol $13,%eax
- # p ^= x3
- xorl 112(%esp),%eax
- # t <<<= 13
- rol $13,%edx
- # t ^= x9
- xorl 136(%esp),%edx
- # r += s
- add %ecx,%esi
- # r <<<= 13
- rol $13,%esi
- # r ^= x4
- xorl 116(%esp),%esi
- # v += w
- add %ebx,%edi
- # v <<<= 13
- rol $13,%edi
- # v ^= x14
- xorl 156(%esp),%edi
- # x3 = p
- movl %eax,112(%esp)
- # x9 = t
- movl %edx,136(%esp)
- # p += x2
- addl 108(%esp),%eax
- # x4 = r
- movl %esi,116(%esp)
- # t += x8
- addl 132(%esp),%edx
- # x14 = v
- movl %edi,156(%esp)
- # p <<<= 18
- rol $18,%eax
- # p ^= x0
- xorl 100(%esp),%eax
- # t <<<= 18
- rol $18,%edx
- # t ^= x10
- xorl 140(%esp),%edx
- # s += r
- add %esi,%ecx
- # s <<<= 18
- rol $18,%ecx
- # s ^= x5
- xorl 120(%esp),%ecx
- # w += v
- add %edi,%ebx
- # w <<<= 18
- rol $18,%ebx
- # w ^= x15
- xorl 160(%esp),%ebx
- # x0 = p
- movl %eax,100(%esp)
- # x10 = t
- movl %edx,140(%esp)
- # p += x12
- addl 148(%esp),%eax
- # x5 = s
- movl %ecx,120(%esp)
- # t += x6
- addl 124(%esp),%edx
- # x15 = w
- movl %ebx,160(%esp)
- # r = x1
- movl 104(%esp),%esi
- # r += s
- add %ecx,%esi
- # v = x11
- movl 144(%esp),%edi
- # v += w
- add %ebx,%edi
- # p <<<= 7
- rol $7,%eax
- # p ^= x4
- xorl 116(%esp),%eax
- # t <<<= 7
- rol $7,%edx
- # t ^= x14
- xorl 156(%esp),%edx
- # r <<<= 7
- rol $7,%esi
- # r ^= x9
- xorl 136(%esp),%esi
- # v <<<= 7
- rol $7,%edi
- # v ^= x3
- xorl 112(%esp),%edi
- # x4 = p
- movl %eax,116(%esp)
- # x14 = t
- movl %edx,156(%esp)
- # p += x0
- addl 100(%esp),%eax
- # x9 = r
- movl %esi,136(%esp)
- # t += x10
- addl 140(%esp),%edx
- # x3 = v
- movl %edi,112(%esp)
- # p <<<= 9
- rol $9,%eax
- # p ^= x8
- xorl 132(%esp),%eax
- # t <<<= 9
- rol $9,%edx
- # t ^= x2
- xorl 108(%esp),%edx
- # s += r
- add %esi,%ecx
- # s <<<= 9
- rol $9,%ecx
- # s ^= x13
- xorl 152(%esp),%ecx
- # w += v
- add %edi,%ebx
- # w <<<= 9
- rol $9,%ebx
- # w ^= x7
- xorl 128(%esp),%ebx
- # x8 = p
- movl %eax,132(%esp)
- # x2 = t
- movl %edx,108(%esp)
- # p += x4
- addl 116(%esp),%eax
- # x13 = s
- movl %ecx,152(%esp)
- # t += x14
- addl 156(%esp),%edx
- # x7 = w
- movl %ebx,128(%esp)
- # p <<<= 13
- rol $13,%eax
- # p ^= x12
- xorl 148(%esp),%eax
- # t <<<= 13
- rol $13,%edx
- # t ^= x6
- xorl 124(%esp),%edx
- # r += s
- add %ecx,%esi
- # r <<<= 13
- rol $13,%esi
- # r ^= x1
- xorl 104(%esp),%esi
- # v += w
- add %ebx,%edi
- # v <<<= 13
- rol $13,%edi
- # v ^= x11
- xorl 144(%esp),%edi
- # x12 = p
- movl %eax,148(%esp)
- # x6 = t
- movl %edx,124(%esp)
- # p += x8
- addl 132(%esp),%eax
- # x1 = r
- movl %esi,104(%esp)
- # t += x2
- addl 108(%esp),%edx
- # x11 = v
- movl %edi,144(%esp)
- # p <<<= 18
- rol $18,%eax
- # p ^= x0
- xorl 100(%esp),%eax
- # t <<<= 18
- rol $18,%edx
- # t ^= x10
- xorl 140(%esp),%edx
- # s += r
- add %esi,%ecx
- # s <<<= 18
- rol $18,%ecx
- # s ^= x5
- xorl 120(%esp),%ecx
- # w += v
- add %edi,%ebx
- # w <<<= 18
- rol $18,%ebx
- # w ^= x15
- xorl 160(%esp),%ebx
- # x0 = p
- movl %eax,100(%esp)
- # x10 = t
- movl %edx,140(%esp)
- # p += x3
- addl 112(%esp),%eax
- # p <<<= 7
- rol $7,%eax
- # x5 = s
- movl %ecx,120(%esp)
- # t += x9
- addl 136(%esp),%edx
- # x15 = w
- movl %ebx,160(%esp)
- # r = x4
- movl 116(%esp),%esi
- # r += s
- add %ecx,%esi
- # v = x14
- movl 156(%esp),%edi
- # v += w
- add %ebx,%edi
- # p ^= x1
- xorl 104(%esp),%eax
- # t <<<= 7
- rol $7,%edx
- # t ^= x11
- xorl 144(%esp),%edx
- # r <<<= 7
- rol $7,%esi
- # r ^= x6
- xorl 124(%esp),%esi
- # v <<<= 7
- rol $7,%edi
- # v ^= x12
- xorl 148(%esp),%edi
- # x1 = p
- movl %eax,104(%esp)
- # x11 = t
- movl %edx,144(%esp)
- # p += x0
- addl 100(%esp),%eax
- # x6 = r
- movl %esi,124(%esp)
- # t += x10
- addl 140(%esp),%edx
- # x12 = v
- movl %edi,148(%esp)
- # p <<<= 9
- rol $9,%eax
- # p ^= x2
- xorl 108(%esp),%eax
- # t <<<= 9
- rol $9,%edx
- # t ^= x8
- xorl 132(%esp),%edx
- # s += r
- add %esi,%ecx
- # s <<<= 9
- rol $9,%ecx
- # s ^= x7
- xorl 128(%esp),%ecx
- # w += v
- add %edi,%ebx
- # w <<<= 9
- rol $9,%ebx
- # w ^= x13
- xorl 152(%esp),%ebx
- # x2 = p
- movl %eax,108(%esp)
- # x8 = t
- movl %edx,132(%esp)
- # p += x1
- addl 104(%esp),%eax
- # x7 = s
- movl %ecx,128(%esp)
- # t += x11
- addl 144(%esp),%edx
- # x13 = w
- movl %ebx,152(%esp)
- # p <<<= 13
- rol $13,%eax
- # p ^= x3
- xorl 112(%esp),%eax
- # t <<<= 13
- rol $13,%edx
- # t ^= x9
- xorl 136(%esp),%edx
- # r += s
- add %ecx,%esi
- # r <<<= 13
- rol $13,%esi
- # r ^= x4
- xorl 116(%esp),%esi
- # v += w
- add %ebx,%edi
- # v <<<= 13
- rol $13,%edi
- # v ^= x14
- xorl 156(%esp),%edi
- # x3 = p
- movl %eax,112(%esp)
- # x9 = t
- movl %edx,136(%esp)
- # p += x2
- addl 108(%esp),%eax
- # x4 = r
- movl %esi,116(%esp)
- # t += x8
- addl 132(%esp),%edx
- # x14 = v
- movl %edi,156(%esp)
- # p <<<= 18
- rol $18,%eax
- # p ^= x0
- xorl 100(%esp),%eax
- # t <<<= 18
- rol $18,%edx
- # t ^= x10
- xorl 140(%esp),%edx
- # s += r
- add %esi,%ecx
- # s <<<= 18
- rol $18,%ecx
- # s ^= x5
- xorl 120(%esp),%ecx
- # w += v
- add %edi,%ebx
- # w <<<= 18
- rol $18,%ebx
- # w ^= x15
- xorl 160(%esp),%ebx
- # i -= 4
- sub $4,%ebp
- # goto mainloop if unsigned >
- ja ._mainloop
- # x0 = p
- movl %eax,100(%esp)
- # x5 = s
- movl %ecx,120(%esp)
- # x10 = t
- movl %edx,140(%esp)
- # x15 = w
- movl %ebx,160(%esp)
- # out = out_backup
- movl 72(%esp),%edi
- # m = m_backup
- movl 68(%esp),%esi
- # in0 = x0
- movl 100(%esp),%eax
- # in1 = x1
- movl 104(%esp),%ecx
- # in0 += j0
- addl 164(%esp),%eax
- # in1 += j1
- addl 168(%esp),%ecx
- # in0 ^= *(uint32 *) (m + 0)
- xorl 0(%esi),%eax
- # in1 ^= *(uint32 *) (m + 4)
- xorl 4(%esi),%ecx
- # *(uint32 *) (out + 0) = in0
- movl %eax,0(%edi)
- # *(uint32 *) (out + 4) = in1
- movl %ecx,4(%edi)
- # in2 = x2
- movl 108(%esp),%eax
- # in3 = x3
- movl 112(%esp),%ecx
- # in2 += j2
- addl 172(%esp),%eax
- # in3 += j3
- addl 176(%esp),%ecx
- # in2 ^= *(uint32 *) (m + 8)
- xorl 8(%esi),%eax
- # in3 ^= *(uint32 *) (m + 12)
- xorl 12(%esi),%ecx
- # *(uint32 *) (out + 8) = in2
- movl %eax,8(%edi)
- # *(uint32 *) (out + 12) = in3
- movl %ecx,12(%edi)
- # in4 = x4
- movl 116(%esp),%eax
- # in5 = x5
- movl 120(%esp),%ecx
- # in4 += j4
- addl 180(%esp),%eax
- # in5 += j5
- addl 184(%esp),%ecx
- # in4 ^= *(uint32 *) (m + 16)
- xorl 16(%esi),%eax
- # in5 ^= *(uint32 *) (m + 20)
- xorl 20(%esi),%ecx
- # *(uint32 *) (out + 16) = in4
- movl %eax,16(%edi)
- # *(uint32 *) (out + 20) = in5
- movl %ecx,20(%edi)
- # in6 = x6
- movl 124(%esp),%eax
- # in7 = x7
- movl 128(%esp),%ecx
- # in6 += j6
- addl 188(%esp),%eax
- # in7 += j7
- addl 192(%esp),%ecx
- # in6 ^= *(uint32 *) (m + 24)
- xorl 24(%esi),%eax
- # in7 ^= *(uint32 *) (m + 28)
- xorl 28(%esi),%ecx
- # *(uint32 *) (out + 24) = in6
- movl %eax,24(%edi)
- # *(uint32 *) (out + 28) = in7
- movl %ecx,28(%edi)
- # in8 = x8
- movl 132(%esp),%eax
- # in9 = x9
- movl 136(%esp),%ecx
- # in8 += j8
- addl 196(%esp),%eax
- # in9 += j9
- addl 200(%esp),%ecx
- # in8 ^= *(uint32 *) (m + 32)
- xorl 32(%esi),%eax
- # in9 ^= *(uint32 *) (m + 36)
- xorl 36(%esi),%ecx
- # *(uint32 *) (out + 32) = in8
- movl %eax,32(%edi)
- # *(uint32 *) (out + 36) = in9
- movl %ecx,36(%edi)
- # in10 = x10
- movl 140(%esp),%eax
- # in11 = x11
- movl 144(%esp),%ecx
- # in10 += j10
- addl 204(%esp),%eax
- # in11 += j11
- addl 208(%esp),%ecx
- # in10 ^= *(uint32 *) (m + 40)
- xorl 40(%esi),%eax
- # in11 ^= *(uint32 *) (m + 44)
- xorl 44(%esi),%ecx
- # *(uint32 *) (out + 40) = in10
- movl %eax,40(%edi)
- # *(uint32 *) (out + 44) = in11
- movl %ecx,44(%edi)
- # in12 = x12
- movl 148(%esp),%eax
- # in13 = x13
- movl 152(%esp),%ecx
- # in12 += j12
- addl 212(%esp),%eax
- # in13 += j13
- addl 216(%esp),%ecx
- # in12 ^= *(uint32 *) (m + 48)
- xorl 48(%esi),%eax
- # in13 ^= *(uint32 *) (m + 52)
- xorl 52(%esi),%ecx
- # *(uint32 *) (out + 48) = in12
- movl %eax,48(%edi)
- # *(uint32 *) (out + 52) = in13
- movl %ecx,52(%edi)
- # in14 = x14
- movl 156(%esp),%eax
- # in15 = x15
- movl 160(%esp),%ecx
- # in14 += j14
- addl 220(%esp),%eax
- # in15 += j15
- addl 224(%esp),%ecx
- # in14 ^= *(uint32 *) (m + 56)
- xorl 56(%esi),%eax
- # in15 ^= *(uint32 *) (m + 60)
- xorl 60(%esi),%ecx
- # *(uint32 *) (out + 56) = in14
- movl %eax,56(%edi)
- # *(uint32 *) (out + 60) = in15
- movl %ecx,60(%edi)
- # bytes = bytes_backup
- movl 76(%esp),%ebx
- # in8 = j8
- movl 196(%esp),%eax
- # in9 = j9
- movl 200(%esp),%ecx
- # in8 += 1
- add $1,%eax
- # in9 += 0 + carry
- adc $0,%ecx
- # j8 = in8
- movl %eax,196(%esp)
- # j9 = in9
- movl %ecx,200(%esp)
- # bytes - 64
- cmp $64,%ebx
- # goto bytesatleast65 if unsigned>
- ja ._bytesatleast65
- # goto bytesatleast64 if unsigned>=
- jae ._bytesatleast64
- # m = out
- mov %edi,%esi
- # out = ctarget
- movl 228(%esp),%edi
- # i = bytes
- mov %ebx,%ecx
- # while (i) { *out++ = *m++; --i }
- rep movsb
-._bytesatleast64:
- # x = x_backup
- movl 64(%esp),%eax
- # in8 = j8
- movl 196(%esp),%ecx
- # in9 = j9
- movl 200(%esp),%edx
- # *(uint32 *) (x + 32) = in8
- movl %ecx,32(%eax)
- # *(uint32 *) (x + 36) = in9
- movl %edx,36(%eax)
-._done:
- # eax = eax_stack
- movl 80(%esp),%eax
- # ebx = ebx_stack
- movl 84(%esp),%ebx
- # esi = esi_stack
- movl 88(%esp),%esi
- # edi = edi_stack
- movl 92(%esp),%edi
- # ebp = ebp_stack
- movl 96(%esp),%ebp
- # leave
- add %eax,%esp
- ret
-._bytesatleast65:
- # bytes -= 64
- sub $64,%ebx
- # out += 64
- add $64,%edi
- # m += 64
- add $64,%esi
- # goto bytesatleast1
- jmp ._bytesatleast1
-# enter ECRYPT_keysetup
-.text
-.p2align 5
-.globl ECRYPT_keysetup
-ECRYPT_keysetup:
- mov %esp,%eax
- and $31,%eax
- add $256,%eax
- sub %eax,%esp
- # eax_stack = eax
- movl %eax,64(%esp)
- # ebx_stack = ebx
- movl %ebx,68(%esp)
- # esi_stack = esi
- movl %esi,72(%esp)
- # edi_stack = edi
- movl %edi,76(%esp)
- # ebp_stack = ebp
- movl %ebp,80(%esp)
- # k = arg2
- movl 8(%esp,%eax),%ecx
- # kbits = arg3
- movl 12(%esp,%eax),%edx
- # x = arg1
- movl 4(%esp,%eax),%eax
- # in1 = *(uint32 *) (k + 0)
- movl 0(%ecx),%ebx
- # in2 = *(uint32 *) (k + 4)
- movl 4(%ecx),%esi
- # in3 = *(uint32 *) (k + 8)
- movl 8(%ecx),%edi
- # in4 = *(uint32 *) (k + 12)
- movl 12(%ecx),%ebp
- # *(uint32 *) (x + 4) = in1
- movl %ebx,4(%eax)
- # *(uint32 *) (x + 8) = in2
- movl %esi,8(%eax)
- # *(uint32 *) (x + 12) = in3
- movl %edi,12(%eax)
- # *(uint32 *) (x + 16) = in4
- movl %ebp,16(%eax)
- # kbits - 256
- cmp $256,%edx
- # goto kbits128 if unsigned<
- jb ._kbits128
-._kbits256:
- # in11 = *(uint32 *) (k + 16)
- movl 16(%ecx),%edx
- # in12 = *(uint32 *) (k + 20)
- movl 20(%ecx),%ebx
- # in13 = *(uint32 *) (k + 24)
- movl 24(%ecx),%esi
- # in14 = *(uint32 *) (k + 28)
- movl 28(%ecx),%ecx
- # *(uint32 *) (x + 44) = in11
- movl %edx,44(%eax)
- # *(uint32 *) (x + 48) = in12
- movl %ebx,48(%eax)
- # *(uint32 *) (x + 52) = in13
- movl %esi,52(%eax)
- # *(uint32 *) (x + 56) = in14
- movl %ecx,56(%eax)
- # in0 = 1634760805
- mov $1634760805,%ecx
- # in5 = 857760878
- mov $857760878,%edx
- # in10 = 2036477234
- mov $2036477234,%ebx
- # in15 = 1797285236
- mov $1797285236,%esi
- # *(uint32 *) (x + 0) = in0
- movl %ecx,0(%eax)
- # *(uint32 *) (x + 20) = in5
- movl %edx,20(%eax)
- # *(uint32 *) (x + 40) = in10
- movl %ebx,40(%eax)
- # *(uint32 *) (x + 60) = in15
- movl %esi,60(%eax)
- # goto keysetupdone
- jmp ._keysetupdone
-._kbits128:
- # in11 = *(uint32 *) (k + 0)
- movl 0(%ecx),%edx
- # in12 = *(uint32 *) (k + 4)
- movl 4(%ecx),%ebx
- # in13 = *(uint32 *) (k + 8)
- movl 8(%ecx),%esi
- # in14 = *(uint32 *) (k + 12)
- movl 12(%ecx),%ecx
- # *(uint32 *) (x + 44) = in11
- movl %edx,44(%eax)
- # *(uint32 *) (x + 48) = in12
- movl %ebx,48(%eax)
- # *(uint32 *) (x + 52) = in13
- movl %esi,52(%eax)
- # *(uint32 *) (x + 56) = in14
- movl %ecx,56(%eax)
- # in0 = 1634760805
- mov $1634760805,%ecx
- # in5 = 824206446
- mov $824206446,%edx
- # in10 = 2036477238
- mov $2036477238,%ebx
- # in15 = 1797285236
- mov $1797285236,%esi
- # *(uint32 *) (x + 0) = in0
- movl %ecx,0(%eax)
- # *(uint32 *) (x + 20) = in5
- movl %edx,20(%eax)
- # *(uint32 *) (x + 40) = in10
- movl %ebx,40(%eax)
- # *(uint32 *) (x + 60) = in15
- movl %esi,60(%eax)
-._keysetupdone:
- # eax = eax_stack
- movl 64(%esp),%eax
- # ebx = ebx_stack
- movl 68(%esp),%ebx
- # esi = esi_stack
- movl 72(%esp),%esi
- # edi = edi_stack
- movl 76(%esp),%edi
- # ebp = ebp_stack
- movl 80(%esp),%ebp
- # leave
- add %eax,%esp
- ret
-# enter ECRYPT_ivsetup
-.text
-.p2align 5
-.globl ECRYPT_ivsetup
-ECRYPT_ivsetup:
- mov %esp,%eax
- and $31,%eax
- add $256,%eax
- sub %eax,%esp
- # eax_stack = eax
- movl %eax,64(%esp)
- # ebx_stack = ebx
- movl %ebx,68(%esp)
- # esi_stack = esi
- movl %esi,72(%esp)
- # edi_stack = edi
- movl %edi,76(%esp)
- # ebp_stack = ebp
- movl %ebp,80(%esp)
- # iv = arg2
- movl 8(%esp,%eax),%ecx
- # x = arg1
- movl 4(%esp,%eax),%eax
- # in6 = *(uint32 *) (iv + 0)
- movl 0(%ecx),%edx
- # in7 = *(uint32 *) (iv + 4)
- movl 4(%ecx),%ecx
- # in8 = 0
- mov $0,%ebx
- # in9 = 0
- mov $0,%esi
- # *(uint32 *) (x + 24) = in6
- movl %edx,24(%eax)
- # *(uint32 *) (x + 28) = in7
- movl %ecx,28(%eax)
- # *(uint32 *) (x + 32) = in8
- movl %ebx,32(%eax)
- # *(uint32 *) (x + 36) = in9
- movl %esi,36(%eax)
- # eax = eax_stack
- movl 64(%esp),%eax
- # ebx = ebx_stack
- movl 68(%esp),%ebx
- # esi = esi_stack
- movl 72(%esp),%esi
- # edi = edi_stack
- movl 76(%esp),%edi
- # ebp = ebp_stack
- movl 80(%esp),%ebp
- # leave
- add %eax,%esp
- ret
diff --git a/arch/x86/crypto/salsa20-x86_64-asm_64.S b/arch/x86/crypto/salsa20-x86_64-asm_64.S
deleted file mode 100644
index 6214a9b09706..000000000000
--- a/arch/x86/crypto/salsa20-x86_64-asm_64.S
+++ /dev/null
@@ -1,920 +0,0 @@
-# enter ECRYPT_encrypt_bytes
-.text
-.p2align 5
-.globl ECRYPT_encrypt_bytes
-ECRYPT_encrypt_bytes:
- mov %rsp,%r11
- and $31,%r11
- add $256,%r11
- sub %r11,%rsp
- # x = arg1
- mov %rdi,%r8
- # m = arg2
- mov %rsi,%rsi
- # out = arg3
- mov %rdx,%rdi
- # bytes = arg4
- mov %rcx,%rdx
- # unsigned>? bytes - 0
- cmp $0,%rdx
- # comment:fp stack unchanged by jump
- # goto done if !unsigned>
- jbe ._done
- # comment:fp stack unchanged by fallthrough
-# start:
-._start:
- # r11_stack = r11
- movq %r11,0(%rsp)
- # r12_stack = r12
- movq %r12,8(%rsp)
- # r13_stack = r13
- movq %r13,16(%rsp)
- # r14_stack = r14
- movq %r14,24(%rsp)
- # r15_stack = r15
- movq %r15,32(%rsp)
- # rbx_stack = rbx
- movq %rbx,40(%rsp)
- # rbp_stack = rbp
- movq %rbp,48(%rsp)
- # in0 = *(uint64 *) (x + 0)
- movq 0(%r8),%rcx
- # in2 = *(uint64 *) (x + 8)
- movq 8(%r8),%r9
- # in4 = *(uint64 *) (x + 16)
- movq 16(%r8),%rax
- # in6 = *(uint64 *) (x + 24)
- movq 24(%r8),%r10
- # in8 = *(uint64 *) (x + 32)
- movq 32(%r8),%r11
- # in10 = *(uint64 *) (x + 40)
- movq 40(%r8),%r12
- # in12 = *(uint64 *) (x + 48)
- movq 48(%r8),%r13
- # in14 = *(uint64 *) (x + 56)
- movq 56(%r8),%r14
- # j0 = in0
- movq %rcx,56(%rsp)
- # j2 = in2
- movq %r9,64(%rsp)
- # j4 = in4
- movq %rax,72(%rsp)
- # j6 = in6
- movq %r10,80(%rsp)
- # j8 = in8
- movq %r11,88(%rsp)
- # j10 = in10
- movq %r12,96(%rsp)
- # j12 = in12
- movq %r13,104(%rsp)
- # j14 = in14
- movq %r14,112(%rsp)
- # x_backup = x
- movq %r8,120(%rsp)
-# bytesatleast1:
-._bytesatleast1:
- # unsigned<? bytes - 64
- cmp $64,%rdx
- # comment:fp stack unchanged by jump
- # goto nocopy if !unsigned<
- jae ._nocopy
- # ctarget = out
- movq %rdi,128(%rsp)
- # out = &tmp
- leaq 192(%rsp),%rdi
- # i = bytes
- mov %rdx,%rcx
- # while (i) { *out++ = *m++; --i }
- rep movsb
- # out = &tmp
- leaq 192(%rsp),%rdi
- # m = &tmp
- leaq 192(%rsp),%rsi
- # comment:fp stack unchanged by fallthrough
-# nocopy:
-._nocopy:
- # out_backup = out
- movq %rdi,136(%rsp)
- # m_backup = m
- movq %rsi,144(%rsp)
- # bytes_backup = bytes
- movq %rdx,152(%rsp)
- # x1 = j0
- movq 56(%rsp),%rdi
- # x0 = x1
- mov %rdi,%rdx
- # (uint64) x1 >>= 32
- shr $32,%rdi
- # x3 = j2
- movq 64(%rsp),%rsi
- # x2 = x3
- mov %rsi,%rcx
- # (uint64) x3 >>= 32
- shr $32,%rsi
- # x5 = j4
- movq 72(%rsp),%r8
- # x4 = x5
- mov %r8,%r9
- # (uint64) x5 >>= 32
- shr $32,%r8
- # x5_stack = x5
- movq %r8,160(%rsp)
- # x7 = j6
- movq 80(%rsp),%r8
- # x6 = x7
- mov %r8,%rax
- # (uint64) x7 >>= 32
- shr $32,%r8
- # x9 = j8
- movq 88(%rsp),%r10
- # x8 = x9
- mov %r10,%r11
- # (uint64) x9 >>= 32
- shr $32,%r10
- # x11 = j10
- movq 96(%rsp),%r12
- # x10 = x11
- mov %r12,%r13
- # x10_stack = x10
- movq %r13,168(%rsp)
- # (uint64) x11 >>= 32
- shr $32,%r12
- # x13 = j12
- movq 104(%rsp),%r13
- # x12 = x13
- mov %r13,%r14
- # (uint64) x13 >>= 32
- shr $32,%r13
- # x15 = j14
- movq 112(%rsp),%r15
- # x14 = x15
- mov %r15,%rbx
- # (uint64) x15 >>= 32
- shr $32,%r15
- # x15_stack = x15
- movq %r15,176(%rsp)
- # i = 20
- mov $20,%r15
-# mainloop:
-._mainloop:
- # i_backup = i
- movq %r15,184(%rsp)
- # x5 = x5_stack
- movq 160(%rsp),%r15
- # a = x12 + x0
- lea (%r14,%rdx),%rbp
- # (uint32) a <<<= 7
- rol $7,%ebp
- # x4 ^= a
- xor %rbp,%r9
- # b = x1 + x5
- lea (%rdi,%r15),%rbp
- # (uint32) b <<<= 7
- rol $7,%ebp
- # x9 ^= b
- xor %rbp,%r10
- # a = x0 + x4
- lea (%rdx,%r9),%rbp
- # (uint32) a <<<= 9
- rol $9,%ebp
- # x8 ^= a
- xor %rbp,%r11
- # b = x5 + x9
- lea (%r15,%r10),%rbp
- # (uint32) b <<<= 9
- rol $9,%ebp
- # x13 ^= b
- xor %rbp,%r13
- # a = x4 + x8
- lea (%r9,%r11),%rbp
- # (uint32) a <<<= 13
- rol $13,%ebp
- # x12 ^= a
- xor %rbp,%r14
- # b = x9 + x13
- lea (%r10,%r13),%rbp
- # (uint32) b <<<= 13
- rol $13,%ebp
- # x1 ^= b
- xor %rbp,%rdi
- # a = x8 + x12
- lea (%r11,%r14),%rbp
- # (uint32) a <<<= 18
- rol $18,%ebp
- # x0 ^= a
- xor %rbp,%rdx
- # b = x13 + x1
- lea (%r13,%rdi),%rbp
- # (uint32) b <<<= 18
- rol $18,%ebp
- # x5 ^= b
- xor %rbp,%r15
- # x10 = x10_stack
- movq 168(%rsp),%rbp
- # x5_stack = x5
- movq %r15,160(%rsp)
- # c = x6 + x10
- lea (%rax,%rbp),%r15
- # (uint32) c <<<= 7
- rol $7,%r15d
- # x14 ^= c
- xor %r15,%rbx
- # c = x10 + x14
- lea (%rbp,%rbx),%r15
- # (uint32) c <<<= 9
- rol $9,%r15d
- # x2 ^= c
- xor %r15,%rcx
- # c = x14 + x2
- lea (%rbx,%rcx),%r15
- # (uint32) c <<<= 13
- rol $13,%r15d
- # x6 ^= c
- xor %r15,%rax
- # c = x2 + x6
- lea (%rcx,%rax),%r15
- # (uint32) c <<<= 18
- rol $18,%r15d
- # x10 ^= c
- xor %r15,%rbp
- # x15 = x15_stack
- movq 176(%rsp),%r15
- # x10_stack = x10
- movq %rbp,168(%rsp)
- # d = x11 + x15
- lea (%r12,%r15),%rbp
- # (uint32) d <<<= 7
- rol $7,%ebp
- # x3 ^= d
- xor %rbp,%rsi
- # d = x15 + x3
- lea (%r15,%rsi),%rbp
- # (uint32) d <<<= 9
- rol $9,%ebp
- # x7 ^= d
- xor %rbp,%r8
- # d = x3 + x7
- lea (%rsi,%r8),%rbp
- # (uint32) d <<<= 13
- rol $13,%ebp
- # x11 ^= d
- xor %rbp,%r12
- # d = x7 + x11
- lea (%r8,%r12),%rbp
- # (uint32) d <<<= 18
- rol $18,%ebp
- # x15 ^= d
- xor %rbp,%r15
- # x15_stack = x15
- movq %r15,176(%rsp)
- # x5 = x5_stack
- movq 160(%rsp),%r15
- # a = x3 + x0
- lea (%rsi,%rdx),%rbp
- # (uint32) a <<<= 7
- rol $7,%ebp
- # x1 ^= a
- xor %rbp,%rdi
- # b = x4 + x5
- lea (%r9,%r15),%rbp
- # (uint32) b <<<= 7
- rol $7,%ebp
- # x6 ^= b
- xor %rbp,%rax
- # a = x0 + x1
- lea (%rdx,%rdi),%rbp
- # (uint32) a <<<= 9
- rol $9,%ebp
- # x2 ^= a
- xor %rbp,%rcx
- # b = x5 + x6
- lea (%r15,%rax),%rbp
- # (uint32) b <<<= 9
- rol $9,%ebp
- # x7 ^= b
- xor %rbp,%r8
- # a = x1 + x2
- lea (%rdi,%rcx),%rbp
- # (uint32) a <<<= 13
- rol $13,%ebp
- # x3 ^= a
- xor %rbp,%rsi
- # b = x6 + x7
- lea (%rax,%r8),%rbp
- # (uint32) b <<<= 13
- rol $13,%ebp
- # x4 ^= b
- xor %rbp,%r9
- # a = x2 + x3
- lea (%rcx,%rsi),%rbp
- # (uint32) a <<<= 18
- rol $18,%ebp
- # x0 ^= a
- xor %rbp,%rdx
- # b = x7 + x4
- lea (%r8,%r9),%rbp
- # (uint32) b <<<= 18
- rol $18,%ebp
- # x5 ^= b
- xor %rbp,%r15
- # x10 = x10_stack
- movq 168(%rsp),%rbp
- # x5_stack = x5
- movq %r15,160(%rsp)
- # c = x9 + x10
- lea (%r10,%rbp),%r15
- # (uint32) c <<<= 7
- rol $7,%r15d
- # x11 ^= c
- xor %r15,%r12
- # c = x10 + x11
- lea (%rbp,%r12),%r15
- # (uint32) c <<<= 9
- rol $9,%r15d
- # x8 ^= c
- xor %r15,%r11
- # c = x11 + x8
- lea (%r12,%r11),%r15
- # (uint32) c <<<= 13
- rol $13,%r15d
- # x9 ^= c
- xor %r15,%r10
- # c = x8 + x9
- lea (%r11,%r10),%r15
- # (uint32) c <<<= 18
- rol $18,%r15d
- # x10 ^= c
- xor %r15,%rbp
- # x15 = x15_stack
- movq 176(%rsp),%r15
- # x10_stack = x10
- movq %rbp,168(%rsp)
- # d = x14 + x15
- lea (%rbx,%r15),%rbp
- # (uint32) d <<<= 7
- rol $7,%ebp
- # x12 ^= d
- xor %rbp,%r14
- # d = x15 + x12
- lea (%r15,%r14),%rbp
- # (uint32) d <<<= 9
- rol $9,%ebp
- # x13 ^= d
- xor %rbp,%r13
- # d = x12 + x13
- lea (%r14,%r13),%rbp
- # (uint32) d <<<= 13
- rol $13,%ebp
- # x14 ^= d
- xor %rbp,%rbx
- # d = x13 + x14
- lea (%r13,%rbx),%rbp
- # (uint32) d <<<= 18
- rol $18,%ebp
- # x15 ^= d
- xor %rbp,%r15
- # x15_stack = x15
- movq %r15,176(%rsp)
- # x5 = x5_stack
- movq 160(%rsp),%r15
- # a = x12 + x0
- lea (%r14,%rdx),%rbp
- # (uint32) a <<<= 7
- rol $7,%ebp
- # x4 ^= a
- xor %rbp,%r9
- # b = x1 + x5
- lea (%rdi,%r15),%rbp
- # (uint32) b <<<= 7
- rol $7,%ebp
- # x9 ^= b
- xor %rbp,%r10
- # a = x0 + x4
- lea (%rdx,%r9),%rbp
- # (uint32) a <<<= 9
- rol $9,%ebp
- # x8 ^= a
- xor %rbp,%r11
- # b = x5 + x9
- lea (%r15,%r10),%rbp
- # (uint32) b <<<= 9
- rol $9,%ebp
- # x13 ^= b
- xor %rbp,%r13
- # a = x4 + x8
- lea (%r9,%r11),%rbp
- # (uint32) a <<<= 13
- rol $13,%ebp
- # x12 ^= a
- xor %rbp,%r14
- # b = x9 + x13
- lea (%r10,%r13),%rbp
- # (uint32) b <<<= 13
- rol $13,%ebp
- # x1 ^= b
- xor %rbp,%rdi
- # a = x8 + x12
- lea (%r11,%r14),%rbp
- # (uint32) a <<<= 18
- rol $18,%ebp
- # x0 ^= a
- xor %rbp,%rdx
- # b = x13 + x1
- lea (%r13,%rdi),%rbp
- # (uint32) b <<<= 18
- rol $18,%ebp
- # x5 ^= b
- xor %rbp,%r15
- # x10 = x10_stack
- movq 168(%rsp),%rbp
- # x5_stack = x5
- movq %r15,160(%rsp)
- # c = x6 + x10
- lea (%rax,%rbp),%r15
- # (uint32) c <<<= 7
- rol $7,%r15d
- # x14 ^= c
- xor %r15,%rbx
- # c = x10 + x14
- lea (%rbp,%rbx),%r15
- # (uint32) c <<<= 9
- rol $9,%r15d
- # x2 ^= c
- xor %r15,%rcx
- # c = x14 + x2
- lea (%rbx,%rcx),%r15
- # (uint32) c <<<= 13
- rol $13,%r15d
- # x6 ^= c
- xor %r15,%rax
- # c = x2 + x6
- lea (%rcx,%rax),%r15
- # (uint32) c <<<= 18
- rol $18,%r15d
- # x10 ^= c
- xor %r15,%rbp
- # x15 = x15_stack
- movq 176(%rsp),%r15
- # x10_stack = x10
- movq %rbp,168(%rsp)
- # d = x11 + x15
- lea (%r12,%r15),%rbp
- # (uint32) d <<<= 7
- rol $7,%ebp
- # x3 ^= d
- xor %rbp,%rsi
- # d = x15 + x3
- lea (%r15,%rsi),%rbp
- # (uint32) d <<<= 9
- rol $9,%ebp
- # x7 ^= d
- xor %rbp,%r8
- # d = x3 + x7
- lea (%rsi,%r8),%rbp
- # (uint32) d <<<= 13
- rol $13,%ebp
- # x11 ^= d
- xor %rbp,%r12
- # d = x7 + x11
- lea (%r8,%r12),%rbp
- # (uint32) d <<<= 18
- rol $18,%ebp
- # x15 ^= d
- xor %rbp,%r15
- # x15_stack = x15
- movq %r15,176(%rsp)
- # x5 = x5_stack
- movq 160(%rsp),%r15
- # a = x3 + x0
- lea (%rsi,%rdx),%rbp
- # (uint32) a <<<= 7
- rol $7,%ebp
- # x1 ^= a
- xor %rbp,%rdi
- # b = x4 + x5
- lea (%r9,%r15),%rbp
- # (uint32) b <<<= 7
- rol $7,%ebp
- # x6 ^= b
- xor %rbp,%rax
- # a = x0 + x1
- lea (%rdx,%rdi),%rbp
- # (uint32) a <<<= 9
- rol $9,%ebp
- # x2 ^= a
- xor %rbp,%rcx
- # b = x5 + x6
- lea (%r15,%rax),%rbp
- # (uint32) b <<<= 9
- rol $9,%ebp
- # x7 ^= b
- xor %rbp,%r8
- # a = x1 + x2
- lea (%rdi,%rcx),%rbp
- # (uint32) a <<<= 13
- rol $13,%ebp
- # x3 ^= a
- xor %rbp,%rsi
- # b = x6 + x7
- lea (%rax,%r8),%rbp
- # (uint32) b <<<= 13
- rol $13,%ebp
- # x4 ^= b
- xor %rbp,%r9
- # a = x2 + x3
- lea (%rcx,%rsi),%rbp
- # (uint32) a <<<= 18
- rol $18,%ebp
- # x0 ^= a
- xor %rbp,%rdx
- # b = x7 + x4
- lea (%r8,%r9),%rbp
- # (uint32) b <<<= 18
- rol $18,%ebp
- # x5 ^= b
- xor %rbp,%r15
- # x10 = x10_stack
- movq 168(%rsp),%rbp
- # x5_stack = x5
- movq %r15,160(%rsp)
- # c = x9 + x10
- lea (%r10,%rbp),%r15
- # (uint32) c <<<= 7
- rol $7,%r15d
- # x11 ^= c
- xor %r15,%r12
- # c = x10 + x11
- lea (%rbp,%r12),%r15
- # (uint32) c <<<= 9
- rol $9,%r15d
- # x8 ^= c
- xor %r15,%r11
- # c = x11 + x8
- lea (%r12,%r11),%r15
- # (uint32) c <<<= 13
- rol $13,%r15d
- # x9 ^= c
- xor %r15,%r10
- # c = x8 + x9
- lea (%r11,%r10),%r15
- # (uint32) c <<<= 18
- rol $18,%r15d
- # x10 ^= c
- xor %r15,%rbp
- # x15 = x15_stack
- movq 176(%rsp),%r15
- # x10_stack = x10
- movq %rbp,168(%rsp)
- # d = x14 + x15
- lea (%rbx,%r15),%rbp
- # (uint32) d <<<= 7
- rol $7,%ebp
- # x12 ^= d
- xor %rbp,%r14
- # d = x15 + x12
- lea (%r15,%r14),%rbp
- # (uint32) d <<<= 9
- rol $9,%ebp
- # x13 ^= d
- xor %rbp,%r13
- # d = x12 + x13
- lea (%r14,%r13),%rbp
- # (uint32) d <<<= 13
- rol $13,%ebp
- # x14 ^= d
- xor %rbp,%rbx
- # d = x13 + x14
- lea (%r13,%rbx),%rbp
- # (uint32) d <<<= 18
- rol $18,%ebp
- # x15 ^= d
- xor %rbp,%r15
- # x15_stack = x15
- movq %r15,176(%rsp)
- # i = i_backup
- movq 184(%rsp),%r15
- # unsigned>? i -= 4
- sub $4,%r15
- # comment:fp stack unchanged by jump
- # goto mainloop if unsigned>
- ja ._mainloop
- # (uint32) x2 += j2
- addl 64(%rsp),%ecx
- # x3 <<= 32
- shl $32,%rsi
- # x3 += j2
- addq 64(%rsp),%rsi
- # (uint64) x3 >>= 32
- shr $32,%rsi
- # x3 <<= 32
- shl $32,%rsi
- # x2 += x3
- add %rsi,%rcx
- # (uint32) x6 += j6
- addl 80(%rsp),%eax
- # x7 <<= 32
- shl $32,%r8
- # x7 += j6
- addq 80(%rsp),%r8
- # (uint64) x7 >>= 32
- shr $32,%r8
- # x7 <<= 32
- shl $32,%r8
- # x6 += x7
- add %r8,%rax
- # (uint32) x8 += j8
- addl 88(%rsp),%r11d
- # x9 <<= 32
- shl $32,%r10
- # x9 += j8
- addq 88(%rsp),%r10
- # (uint64) x9 >>= 32
- shr $32,%r10
- # x9 <<= 32
- shl $32,%r10
- # x8 += x9
- add %r10,%r11
- # (uint32) x12 += j12
- addl 104(%rsp),%r14d
- # x13 <<= 32
- shl $32,%r13
- # x13 += j12
- addq 104(%rsp),%r13
- # (uint64) x13 >>= 32
- shr $32,%r13
- # x13 <<= 32
- shl $32,%r13
- # x12 += x13
- add %r13,%r14
- # (uint32) x0 += j0
- addl 56(%rsp),%edx
- # x1 <<= 32
- shl $32,%rdi
- # x1 += j0
- addq 56(%rsp),%rdi
- # (uint64) x1 >>= 32
- shr $32,%rdi
- # x1 <<= 32
- shl $32,%rdi
- # x0 += x1
- add %rdi,%rdx
- # x5 = x5_stack
- movq 160(%rsp),%rdi
- # (uint32) x4 += j4
- addl 72(%rsp),%r9d
- # x5 <<= 32
- shl $32,%rdi
- # x5 += j4
- addq 72(%rsp),%rdi
- # (uint64) x5 >>= 32
- shr $32,%rdi
- # x5 <<= 32
- shl $32,%rdi
- # x4 += x5
- add %rdi,%r9
- # x10 = x10_stack
- movq 168(%rsp),%r8
- # (uint32) x10 += j10
- addl 96(%rsp),%r8d
- # x11 <<= 32
- shl $32,%r12
- # x11 += j10
- addq 96(%rsp),%r12
- # (uint64) x11 >>= 32
- shr $32,%r12
- # x11 <<= 32
- shl $32,%r12
- # x10 += x11
- add %r12,%r8
- # x15 = x15_stack
- movq 176(%rsp),%rdi
- # (uint32) x14 += j14
- addl 112(%rsp),%ebx
- # x15 <<= 32
- shl $32,%rdi
- # x15 += j14
- addq 112(%rsp),%rdi
- # (uint64) x15 >>= 32
- shr $32,%rdi
- # x15 <<= 32
- shl $32,%rdi
- # x14 += x15
- add %rdi,%rbx
- # out = out_backup
- movq 136(%rsp),%rdi
- # m = m_backup
- movq 144(%rsp),%rsi
- # x0 ^= *(uint64 *) (m + 0)
- xorq 0(%rsi),%rdx
- # *(uint64 *) (out + 0) = x0
- movq %rdx,0(%rdi)
- # x2 ^= *(uint64 *) (m + 8)
- xorq 8(%rsi),%rcx
- # *(uint64 *) (out + 8) = x2
- movq %rcx,8(%rdi)
- # x4 ^= *(uint64 *) (m + 16)
- xorq 16(%rsi),%r9
- # *(uint64 *) (out + 16) = x4
- movq %r9,16(%rdi)
- # x6 ^= *(uint64 *) (m + 24)
- xorq 24(%rsi),%rax
- # *(uint64 *) (out + 24) = x6
- movq %rax,24(%rdi)
- # x8 ^= *(uint64 *) (m + 32)
- xorq 32(%rsi),%r11
- # *(uint64 *) (out + 32) = x8
- movq %r11,32(%rdi)
- # x10 ^= *(uint64 *) (m + 40)
- xorq 40(%rsi),%r8
- # *(uint64 *) (out + 40) = x10
- movq %r8,40(%rdi)
- # x12 ^= *(uint64 *) (m + 48)
- xorq 48(%rsi),%r14
- # *(uint64 *) (out + 48) = x12
- movq %r14,48(%rdi)
- # x14 ^= *(uint64 *) (m + 56)
- xorq 56(%rsi),%rbx
- # *(uint64 *) (out + 56) = x14
- movq %rbx,56(%rdi)
- # bytes = bytes_backup
- movq 152(%rsp),%rdx
- # in8 = j8
- movq 88(%rsp),%rcx
- # in8 += 1
- add $1,%rcx
- # j8 = in8
- movq %rcx,88(%rsp)
- # unsigned>? unsigned<? bytes - 64
- cmp $64,%rdx
- # comment:fp stack unchanged by jump
- # goto bytesatleast65 if unsigned>
- ja ._bytesatleast65
- # comment:fp stack unchanged by jump
- # goto bytesatleast64 if !unsigned<
- jae ._bytesatleast64
- # m = out
- mov %rdi,%rsi
- # out = ctarget
- movq 128(%rsp),%rdi
- # i = bytes
- mov %rdx,%rcx
- # while (i) { *out++ = *m++; --i }
- rep movsb
- # comment:fp stack unchanged by fallthrough
-# bytesatleast64:
-._bytesatleast64:
- # x = x_backup
- movq 120(%rsp),%rdi
- # in8 = j8
- movq 88(%rsp),%rsi
- # *(uint64 *) (x + 32) = in8
- movq %rsi,32(%rdi)
- # r11 = r11_stack
- movq 0(%rsp),%r11
- # r12 = r12_stack
- movq 8(%rsp),%r12
- # r13 = r13_stack
- movq 16(%rsp),%r13
- # r14 = r14_stack
- movq 24(%rsp),%r14
- # r15 = r15_stack
- movq 32(%rsp),%r15
- # rbx = rbx_stack
- movq 40(%rsp),%rbx
- # rbp = rbp_stack
- movq 48(%rsp),%rbp
- # comment:fp stack unchanged by fallthrough
-# done:
-._done:
- # leave
- add %r11,%rsp
- mov %rdi,%rax
- mov %rsi,%rdx
- ret
-# bytesatleast65:
-._bytesatleast65:
- # bytes -= 64
- sub $64,%rdx
- # out += 64
- add $64,%rdi
- # m += 64
- add $64,%rsi
- # comment:fp stack unchanged by jump
- # goto bytesatleast1
- jmp ._bytesatleast1
-# enter ECRYPT_keysetup
-.text
-.p2align 5
-.globl ECRYPT_keysetup
-ECRYPT_keysetup:
- mov %rsp,%r11
- and $31,%r11
- add $256,%r11
- sub %r11,%rsp
- # k = arg2
- mov %rsi,%rsi
- # kbits = arg3
- mov %rdx,%rdx
- # x = arg1
- mov %rdi,%rdi
- # in0 = *(uint64 *) (k + 0)
- movq 0(%rsi),%r8
- # in2 = *(uint64 *) (k + 8)
- movq 8(%rsi),%r9
- # *(uint64 *) (x + 4) = in0
- movq %r8,4(%rdi)
- # *(uint64 *) (x + 12) = in2
- movq %r9,12(%rdi)
- # unsigned<? kbits - 256
- cmp $256,%rdx
- # comment:fp stack unchanged by jump
- # goto kbits128 if unsigned<
- jb ._kbits128
-# kbits256:
-._kbits256:
- # in10 = *(uint64 *) (k + 16)
- movq 16(%rsi),%rdx
- # in12 = *(uint64 *) (k + 24)
- movq 24(%rsi),%rsi
- # *(uint64 *) (x + 44) = in10
- movq %rdx,44(%rdi)
- # *(uint64 *) (x + 52) = in12
- movq %rsi,52(%rdi)
- # in0 = 1634760805
- mov $1634760805,%rsi
- # in4 = 857760878
- mov $857760878,%rdx
- # in10 = 2036477234
- mov $2036477234,%rcx
- # in14 = 1797285236
- mov $1797285236,%r8
- # *(uint32 *) (x + 0) = in0
- movl %esi,0(%rdi)
- # *(uint32 *) (x + 20) = in4
- movl %edx,20(%rdi)
- # *(uint32 *) (x + 40) = in10
- movl %ecx,40(%rdi)
- # *(uint32 *) (x + 60) = in14
- movl %r8d,60(%rdi)
- # comment:fp stack unchanged by jump
- # goto keysetupdone
- jmp ._keysetupdone
-# kbits128:
-._kbits128:
- # in10 = *(uint64 *) (k + 0)
- movq 0(%rsi),%rdx
- # in12 = *(uint64 *) (k + 8)
- movq 8(%rsi),%rsi
- # *(uint64 *) (x + 44) = in10
- movq %rdx,44(%rdi)
- # *(uint64 *) (x + 52) = in12
- movq %rsi,52(%rdi)
- # in0 = 1634760805
- mov $1634760805,%rsi
- # in4 = 824206446
- mov $824206446,%rdx
- # in10 = 2036477238
- mov $2036477238,%rcx
- # in14 = 1797285236
- mov $1797285236,%r8
- # *(uint32 *) (x + 0) = in0
- movl %esi,0(%rdi)
- # *(uint32 *) (x + 20) = in4
- movl %edx,20(%rdi)
- # *(uint32 *) (x + 40) = in10
- movl %ecx,40(%rdi)
- # *(uint32 *) (x + 60) = in14
- movl %r8d,60(%rdi)
-# keysetupdone:
-._keysetupdone:
- # leave
- add %r11,%rsp
- mov %rdi,%rax
- mov %rsi,%rdx
- ret
-# enter ECRYPT_ivsetup
-.text
-.p2align 5
-.globl ECRYPT_ivsetup
-ECRYPT_ivsetup:
- mov %rsp,%r11
- and $31,%r11
- add $256,%r11
- sub %r11,%rsp
- # iv = arg2
- mov %rsi,%rsi
- # x = arg1
- mov %rdi,%rdi
- # in6 = *(uint64 *) (iv + 0)
- movq 0(%rsi),%rsi
- # in8 = 0
- mov $0,%r8
- # *(uint64 *) (x + 24) = in6
- movq %rsi,24(%rdi)
- # *(uint64 *) (x + 32) = in8
- movq %r8,32(%rdi)
- # leave
- add %r11,%rsp
- mov %rdi,%rax
- mov %rsi,%rdx
- ret
diff --git a/arch/x86/crypto/salsa20_glue.c b/arch/x86/crypto/salsa20_glue.c
deleted file mode 100644
index bccb76d80987..000000000000
--- a/arch/x86/crypto/salsa20_glue.c
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Glue code for optimized assembly version of Salsa20.
- *
- * Copyright (c) 2007 Tan Swee Heng <thesweeheng@gmail.com>
- *
- * The assembly codes are public domain assembly codes written by Daniel. J.
- * Bernstein <djb@cr.yp.to>. The codes are modified to include indentation
- * and to remove extraneous comments and functions that are not needed.
- * - i586 version, renamed as salsa20-i586-asm_32.S
- * available from <http://cr.yp.to/snuffle/salsa20/x86-pm/salsa20.s>
- * - x86-64 version, renamed as salsa20-x86_64-asm_64.S
- * available from <http://cr.yp.to/snuffle/salsa20/amd64-3/salsa20.s>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- */
-
-#include <crypto/algapi.h>
-#include <linux/module.h>
-#include <linux/crypto.h>
-
-#define SALSA20_IV_SIZE 8U
-#define SALSA20_MIN_KEY_SIZE 16U
-#define SALSA20_MAX_KEY_SIZE 32U
-
-// use the ECRYPT_* function names
-#define salsa20_keysetup ECRYPT_keysetup
-#define salsa20_ivsetup ECRYPT_ivsetup
-#define salsa20_encrypt_bytes ECRYPT_encrypt_bytes
-
-struct salsa20_ctx
-{
- u32 input[16];
-};
-
-asmlinkage void salsa20_keysetup(struct salsa20_ctx *ctx, const u8 *k,
- u32 keysize, u32 ivsize);
-asmlinkage void salsa20_ivsetup(struct salsa20_ctx *ctx, const u8 *iv);
-asmlinkage void salsa20_encrypt_bytes(struct salsa20_ctx *ctx,
- const u8 *src, u8 *dst, u32 bytes);
-
-static int setkey(struct crypto_tfm *tfm, const u8 *key,
- unsigned int keysize)
-{
- struct salsa20_ctx *ctx = crypto_tfm_ctx(tfm);
- salsa20_keysetup(ctx, key, keysize*8, SALSA20_IV_SIZE*8);
- return 0;
-}
-
-static int encrypt(struct blkcipher_desc *desc,
- struct scatterlist *dst, struct scatterlist *src,
- unsigned int nbytes)
-{
- struct blkcipher_walk walk;
- struct crypto_blkcipher *tfm = desc->tfm;
- struct salsa20_ctx *ctx = crypto_blkcipher_ctx(tfm);
- int err;
-
- blkcipher_walk_init(&walk, dst, src, nbytes);
- err = blkcipher_walk_virt_block(desc, &walk, 64);
-
- salsa20_ivsetup(ctx, walk.iv);
-
- if (likely(walk.nbytes == nbytes))
- {
- salsa20_encrypt_bytes(ctx, walk.src.virt.addr,
- walk.dst.virt.addr, nbytes);
- return blkcipher_walk_done(desc, &walk, 0);
- }
-
- while (walk.nbytes >= 64) {
- salsa20_encrypt_bytes(ctx, walk.src.virt.addr,
- walk.dst.virt.addr,
- walk.nbytes - (walk.nbytes % 64));
- err = blkcipher_walk_done(desc, &walk, walk.nbytes % 64);
- }
-
- if (walk.nbytes) {
- salsa20_encrypt_bytes(ctx, walk.src.virt.addr,
- walk.dst.virt.addr, walk.nbytes);
- err = blkcipher_walk_done(desc, &walk, 0);
- }
-
- return err;
-}
-
-static struct crypto_alg alg = {
- .cra_name = "salsa20",
- .cra_driver_name = "salsa20-asm",
- .cra_priority = 200,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
- .cra_type = &crypto_blkcipher_type,
- .cra_blocksize = 1,
- .cra_ctxsize = sizeof(struct salsa20_ctx),
- .cra_alignmask = 3,
- .cra_module = THIS_MODULE,
- .cra_list = LIST_HEAD_INIT(alg.cra_list),
- .cra_u = {
- .blkcipher = {
- .setkey = setkey,
- .encrypt = encrypt,
- .decrypt = encrypt,
- .min_keysize = SALSA20_MIN_KEY_SIZE,
- .max_keysize = SALSA20_MAX_KEY_SIZE,
- .ivsize = SALSA20_IV_SIZE,
- }
- }
-};
-
-static int __init init(void)
-{
- return crypto_register_alg(&alg);
-}
-
-static void __exit fini(void)
-{
- crypto_unregister_alg(&alg);
-}
-
-module_init(init);
-module_exit(fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION ("Salsa20 stream cipher algorithm (optimized assembly version)");
-MODULE_ALIAS("salsa20");
-MODULE_ALIAS("salsa20-asm");
diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
new file mode 100644
index 000000000000..84e47f7f6188
--- /dev/null
+++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
@@ -0,0 +1,712 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Serpent Cipher 8-way parallel algorithm (x86_64/AVX)
+ *
+ * Copyright (C) 2012 Johannes Goetzfried
+ * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
+ *
+ * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ */
+
+#include <linux/linkage.h>
+#include <linux/cfi_types.h>
+#include <asm/frame.h>
+#include "glue_helper-asm-avx.S"
+
+.file "serpent-avx-x86_64-asm_64.S"
+
+.section .rodata.cst16.bswap128_mask, "aM", @progbits, 16
+.align 16
+.Lbswap128_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+.text
+
+#define CTX %rdi
+
+/**********************************************************************
+ 8-way AVX serpent
+ **********************************************************************/
+#define RA1 %xmm0
+#define RB1 %xmm1
+#define RC1 %xmm2
+#define RD1 %xmm3
+#define RE1 %xmm4
+
+#define tp %xmm5
+
+#define RA2 %xmm6
+#define RB2 %xmm7
+#define RC2 %xmm8
+#define RD2 %xmm9
+#define RE2 %xmm10
+
+#define RNOT %xmm11
+
+#define RK0 %xmm12
+#define RK1 %xmm13
+#define RK2 %xmm14
+#define RK3 %xmm15
+
+
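+/*
+ * Each Sn_1/Sn_2 pair below is one Serpent S-box (SIn_* the inverse S-box)
+ * written as a short boolean circuit of vpxor/vpand/vpor operations on the
+ * four state words x0..x3, with tp and x4 as extra working registers and
+ * RNOT (all-ones) supplying the complements.  Run on RA1..RE1 and again on
+ * RA2..RE2, one S-box covers all eight blocks of the 8-way variant.
+ */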
+#define S0_1(x0, x1, x2, x3, x4) \
+ vpor x0, x3, tp; \
+ vpxor x3, x0, x0; \
+ vpxor x2, x3, x4; \
+ vpxor RNOT, x4, x4; \
+ vpxor x1, tp, x3; \
+ vpand x0, x1, x1; \
+ vpxor x4, x1, x1; \
+ vpxor x0, x2, x2;
+#define S0_2(x0, x1, x2, x3, x4) \
+ vpxor x3, x0, x0; \
+ vpor x0, x4, x4; \
+ vpxor x2, x0, x0; \
+ vpand x1, x2, x2; \
+ vpxor x2, x3, x3; \
+ vpxor RNOT, x1, x1; \
+ vpxor x4, x2, x2; \
+ vpxor x2, x1, x1;
+
+#define S1_1(x0, x1, x2, x3, x4) \
+ vpxor x0, x1, tp; \
+ vpxor x3, x0, x0; \
+ vpxor RNOT, x3, x3; \
+ vpand tp, x1, x4; \
+ vpor tp, x0, x0; \
+ vpxor x2, x3, x3; \
+ vpxor x3, x0, x0; \
+ vpxor x3, tp, x1;
+#define S1_2(x0, x1, x2, x3, x4) \
+ vpxor x4, x3, x3; \
+ vpor x4, x1, x1; \
+ vpxor x2, x4, x4; \
+ vpand x0, x2, x2; \
+ vpxor x1, x2, x2; \
+ vpor x0, x1, x1; \
+ vpxor RNOT, x0, x0; \
+ vpxor x2, x0, x0; \
+ vpxor x1, x4, x4;
+
+#define S2_1(x0, x1, x2, x3, x4) \
+ vpxor RNOT, x3, x3; \
+ vpxor x0, x1, x1; \
+ vpand x2, x0, tp; \
+ vpxor x3, tp, tp; \
+ vpor x0, x3, x3; \
+ vpxor x1, x2, x2; \
+ vpxor x1, x3, x3; \
+ vpand tp, x1, x1;
+#define S2_2(x0, x1, x2, x3, x4) \
+ vpxor x2, tp, tp; \
+ vpand x3, x2, x2; \
+ vpor x1, x3, x3; \
+ vpxor RNOT, tp, tp; \
+ vpxor tp, x3, x3; \
+ vpxor tp, x0, x4; \
+ vpxor x2, tp, x0; \
+ vpor x2, x1, x1;
+
+#define S3_1(x0, x1, x2, x3, x4) \
+ vpxor x3, x1, tp; \
+ vpor x0, x3, x3; \
+ vpand x0, x1, x4; \
+ vpxor x2, x0, x0; \
+ vpxor tp, x2, x2; \
+ vpand x3, tp, x1; \
+ vpxor x3, x2, x2; \
+ vpor x4, x0, x0; \
+ vpxor x3, x4, x4;
+#define S3_2(x0, x1, x2, x3, x4) \
+ vpxor x0, x1, x1; \
+ vpand x3, x0, x0; \
+ vpand x4, x3, x3; \
+ vpxor x2, x3, x3; \
+ vpor x1, x4, x4; \
+ vpand x1, x2, x2; \
+ vpxor x3, x4, x4; \
+ vpxor x3, x0, x0; \
+ vpxor x2, x3, x3;
+
+#define S4_1(x0, x1, x2, x3, x4) \
+ vpand x0, x3, tp; \
+ vpxor x3, x0, x0; \
+ vpxor x2, tp, tp; \
+ vpor x3, x2, x2; \
+ vpxor x1, x0, x0; \
+ vpxor tp, x3, x4; \
+ vpor x0, x2, x2; \
+ vpxor x1, x2, x2;
+#define S4_2(x0, x1, x2, x3, x4) \
+ vpand x0, x1, x1; \
+ vpxor x4, x1, x1; \
+ vpand x2, x4, x4; \
+ vpxor tp, x2, x2; \
+ vpxor x0, x4, x4; \
+ vpor x1, tp, x3; \
+ vpxor RNOT, x1, x1; \
+ vpxor x0, x3, x3;
+
+#define S5_1(x0, x1, x2, x3, x4) \
+ vpor x0, x1, tp; \
+ vpxor tp, x2, x2; \
+ vpxor RNOT, x3, x3; \
+ vpxor x0, x1, x4; \
+ vpxor x2, x0, x0; \
+ vpand x4, tp, x1; \
+ vpor x3, x4, x4; \
+ vpxor x0, x4, x4;
+#define S5_2(x0, x1, x2, x3, x4) \
+ vpand x3, x0, x0; \
+ vpxor x3, x1, x1; \
+ vpxor x2, x3, x3; \
+ vpxor x1, x0, x0; \
+ vpand x4, x2, x2; \
+ vpxor x2, x1, x1; \
+ vpand x0, x2, x2; \
+ vpxor x2, x3, x3;
+
+#define S6_1(x0, x1, x2, x3, x4) \
+ vpxor x0, x3, x3; \
+ vpxor x2, x1, tp; \
+ vpxor x0, x2, x2; \
+ vpand x3, x0, x0; \
+ vpor x3, tp, tp; \
+ vpxor RNOT, x1, x4; \
+ vpxor tp, x0, x0; \
+ vpxor x2, tp, x1;
+#define S6_2(x0, x1, x2, x3, x4) \
+ vpxor x4, x3, x3; \
+ vpxor x0, x4, x4; \
+ vpand x0, x2, x2; \
+ vpxor x1, x4, x4; \
+ vpxor x3, x2, x2; \
+ vpand x1, x3, x3; \
+ vpxor x0, x3, x3; \
+ vpxor x2, x1, x1;
+
+#define S7_1(x0, x1, x2, x3, x4) \
+ vpxor RNOT, x1, tp; \
+ vpxor RNOT, x0, x0; \
+ vpand x2, tp, x1; \
+ vpxor x3, x1, x1; \
+ vpor tp, x3, x3; \
+ vpxor x2, tp, x4; \
+ vpxor x3, x2, x2; \
+ vpxor x0, x3, x3; \
+ vpor x1, x0, x0;
+#define S7_2(x0, x1, x2, x3, x4) \
+ vpand x0, x2, x2; \
+ vpxor x4, x0, x0; \
+ vpxor x3, x4, x4; \
+ vpand x0, x3, x3; \
+ vpxor x1, x4, x4; \
+ vpxor x4, x2, x2; \
+ vpxor x1, x3, x3; \
+ vpor x0, x4, x4; \
+ vpxor x1, x4, x4;
+
+#define SI0_1(x0, x1, x2, x3, x4) \
+ vpxor x0, x1, x1; \
+ vpor x1, x3, tp; \
+ vpxor x1, x3, x4; \
+ vpxor RNOT, x0, x0; \
+ vpxor tp, x2, x2; \
+ vpxor x0, tp, x3; \
+ vpand x1, x0, x0; \
+ vpxor x2, x0, x0;
+#define SI0_2(x0, x1, x2, x3, x4) \
+ vpand x3, x2, x2; \
+ vpxor x4, x3, x3; \
+ vpxor x3, x2, x2; \
+ vpxor x3, x1, x1; \
+ vpand x0, x3, x3; \
+ vpxor x0, x1, x1; \
+ vpxor x2, x0, x0; \
+ vpxor x3, x4, x4;
+
+#define SI1_1(x0, x1, x2, x3, x4) \
+ vpxor x3, x1, x1; \
+ vpxor x2, x0, tp; \
+ vpxor RNOT, x2, x2; \
+ vpor x1, x0, x4; \
+ vpxor x3, x4, x4; \
+ vpand x1, x3, x3; \
+ vpxor x2, x1, x1; \
+ vpand x4, x2, x2;
+#define SI1_2(x0, x1, x2, x3, x4) \
+ vpxor x1, x4, x4; \
+ vpor x3, x1, x1; \
+ vpxor tp, x3, x3; \
+ vpxor tp, x2, x2; \
+ vpor x4, tp, x0; \
+ vpxor x4, x2, x2; \
+ vpxor x0, x1, x1; \
+ vpxor x1, x4, x4;
+
+#define SI2_1(x0, x1, x2, x3, x4) \
+ vpxor x1, x2, x2; \
+ vpxor RNOT, x3, tp; \
+ vpor x2, tp, tp; \
+ vpxor x3, x2, x2; \
+ vpxor x0, x3, x4; \
+ vpxor x1, tp, x3; \
+ vpor x2, x1, x1; \
+ vpxor x0, x2, x2;
+#define SI2_2(x0, x1, x2, x3, x4) \
+ vpxor x4, x1, x1; \
+ vpor x3, x4, x4; \
+ vpxor x3, x2, x2; \
+ vpxor x2, x4, x4; \
+ vpand x1, x2, x2; \
+ vpxor x3, x2, x2; \
+ vpxor x4, x3, x3; \
+ vpxor x0, x4, x4;
+
+#define SI3_1(x0, x1, x2, x3, x4) \
+ vpxor x1, x2, x2; \
+ vpand x2, x1, tp; \
+ vpxor x0, tp, tp; \
+ vpor x1, x0, x0; \
+ vpxor x3, x1, x4; \
+ vpxor x3, x0, x0; \
+ vpor tp, x3, x3; \
+ vpxor x2, tp, x1;
+#define SI3_2(x0, x1, x2, x3, x4) \
+ vpxor x3, x1, x1; \
+ vpxor x2, x0, x0; \
+ vpxor x3, x2, x2; \
+ vpand x1, x3, x3; \
+ vpxor x0, x1, x1; \
+ vpand x2, x0, x0; \
+ vpxor x3, x4, x4; \
+ vpxor x0, x3, x3; \
+ vpxor x1, x0, x0;
+
+#define SI4_1(x0, x1, x2, x3, x4) \
+ vpxor x3, x2, x2; \
+ vpand x1, x0, tp; \
+ vpxor x2, tp, tp; \
+ vpor x3, x2, x2; \
+ vpxor RNOT, x0, x4; \
+ vpxor tp, x1, x1; \
+ vpxor x2, tp, x0; \
+ vpand x4, x2, x2;
+#define SI4_2(x0, x1, x2, x3, x4) \
+ vpxor x0, x2, x2; \
+ vpor x4, x0, x0; \
+ vpxor x3, x0, x0; \
+ vpand x2, x3, x3; \
+ vpxor x3, x4, x4; \
+ vpxor x1, x3, x3; \
+ vpand x0, x1, x1; \
+ vpxor x1, x4, x4; \
+ vpxor x3, x0, x0;
+
+#define SI5_1(x0, x1, x2, x3, x4) \
+ vpor x2, x1, tp; \
+ vpxor x1, x2, x2; \
+ vpxor x3, tp, tp; \
+ vpand x1, x3, x3; \
+ vpxor x3, x2, x2; \
+ vpor x0, x3, x3; \
+ vpxor RNOT, x0, x0; \
+ vpxor x2, x3, x3; \
+ vpor x0, x2, x2;
+#define SI5_2(x0, x1, x2, x3, x4) \
+ vpxor tp, x1, x4; \
+ vpxor x4, x2, x2; \
+ vpand x0, x4, x4; \
+ vpxor tp, x0, x0; \
+ vpxor x3, tp, x1; \
+ vpand x2, x0, x0; \
+ vpxor x3, x2, x2; \
+ vpxor x2, x0, x0; \
+ vpxor x4, x2, x2; \
+ vpxor x3, x4, x4;
+
+#define SI6_1(x0, x1, x2, x3, x4) \
+ vpxor x2, x0, x0; \
+ vpand x3, x0, tp; \
+ vpxor x3, x2, x2; \
+ vpxor x2, tp, tp; \
+ vpxor x1, x3, x3; \
+ vpor x0, x2, x2; \
+ vpxor x3, x2, x2; \
+ vpand tp, x3, x3;
+#define SI6_2(x0, x1, x2, x3, x4) \
+ vpxor RNOT, tp, tp; \
+ vpxor x1, x3, x3; \
+ vpand x2, x1, x1; \
+ vpxor tp, x0, x4; \
+ vpxor x4, x3, x3; \
+ vpxor x2, x4, x4; \
+ vpxor x1, tp, x0; \
+ vpxor x0, x2, x2;
+
+#define SI7_1(x0, x1, x2, x3, x4) \
+ vpand x0, x3, tp; \
+ vpxor x2, x0, x0; \
+ vpor x3, x2, x2; \
+ vpxor x1, x3, x4; \
+ vpxor RNOT, x0, x0; \
+ vpor tp, x1, x1; \
+ vpxor x0, x4, x4; \
+ vpand x2, x0, x0; \
+ vpxor x1, x0, x0;
+#define SI7_2(x0, x1, x2, x3, x4) \
+ vpand x2, x1, x1; \
+ vpxor x2, tp, x3; \
+ vpxor x3, x4, x4; \
+ vpand x3, x2, x2; \
+ vpor x0, x3, x3; \
+ vpxor x4, x1, x1; \
+ vpxor x4, x3, x3; \
+ vpand x0, x4, x4; \
+ vpxor x2, x4, x4;
+
+#define get_key(i, j, t) \
+ vbroadcastss (4*(i)+(j))*4(CTX), t;
+
+#define K2(x0, x1, x2, x3, x4, i) \
+ get_key(i, 0, RK0); \
+ get_key(i, 1, RK1); \
+ get_key(i, 2, RK2); \
+ get_key(i, 3, RK3); \
+ vpxor RK0, x0 ## 1, x0 ## 1; \
+ vpxor RK1, x1 ## 1, x1 ## 1; \
+ vpxor RK2, x2 ## 1, x2 ## 1; \
+ vpxor RK3, x3 ## 1, x3 ## 1; \
+ vpxor RK0, x0 ## 2, x0 ## 2; \
+ vpxor RK1, x1 ## 2, x1 ## 2; \
+ vpxor RK2, x2 ## 2, x2 ## 2; \
+ vpxor RK3, x3 ## 2, x3 ## 2;
+
+#define LK2(x0, x1, x2, x3, x4, i) \
+ vpslld $13, x0 ## 1, x4 ## 1; \
+ vpsrld $(32 - 13), x0 ## 1, x0 ## 1; \
+ vpor x4 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
+ vpslld $3, x2 ## 1, x4 ## 1; \
+ vpsrld $(32 - 3), x2 ## 1, x2 ## 1; \
+ vpor x4 ## 1, x2 ## 1, x2 ## 1; \
+ vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
+ vpslld $13, x0 ## 2, x4 ## 2; \
+ vpsrld $(32 - 13), x0 ## 2, x0 ## 2; \
+ vpor x4 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
+ vpslld $3, x2 ## 2, x4 ## 2; \
+ vpsrld $(32 - 3), x2 ## 2, x2 ## 2; \
+ vpor x4 ## 2, x2 ## 2, x2 ## 2; \
+ vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
+ vpslld $1, x1 ## 1, x4 ## 1; \
+ vpsrld $(32 - 1), x1 ## 1, x1 ## 1; \
+ vpor x4 ## 1, x1 ## 1, x1 ## 1; \
+ vpslld $3, x0 ## 1, x4 ## 1; \
+ vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
+ vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
+ get_key(i, 1, RK1); \
+ vpslld $1, x1 ## 2, x4 ## 2; \
+ vpsrld $(32 - 1), x1 ## 2, x1 ## 2; \
+ vpor x4 ## 2, x1 ## 2, x1 ## 2; \
+ vpslld $3, x0 ## 2, x4 ## 2; \
+ vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
+ vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
+ get_key(i, 3, RK3); \
+ vpslld $7, x3 ## 1, x4 ## 1; \
+ vpsrld $(32 - 7), x3 ## 1, x3 ## 1; \
+ vpor x4 ## 1, x3 ## 1, x3 ## 1; \
+ vpslld $7, x1 ## 1, x4 ## 1; \
+ vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
+ vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
+ get_key(i, 0, RK0); \
+ vpslld $7, x3 ## 2, x4 ## 2; \
+ vpsrld $(32 - 7), x3 ## 2, x3 ## 2; \
+ vpor x4 ## 2, x3 ## 2, x3 ## 2; \
+ vpslld $7, x1 ## 2, x4 ## 2; \
+ vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
+ vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
+ get_key(i, 2, RK2); \
+ vpxor RK1, x1 ## 1, x1 ## 1; \
+ vpxor RK3, x3 ## 1, x3 ## 1; \
+ vpslld $5, x0 ## 1, x4 ## 1; \
+ vpsrld $(32 - 5), x0 ## 1, x0 ## 1; \
+ vpor x4 ## 1, x0 ## 1, x0 ## 1; \
+ vpslld $22, x2 ## 1, x4 ## 1; \
+ vpsrld $(32 - 22), x2 ## 1, x2 ## 1; \
+ vpor x4 ## 1, x2 ## 1, x2 ## 1; \
+ vpxor RK0, x0 ## 1, x0 ## 1; \
+ vpxor RK2, x2 ## 1, x2 ## 1; \
+ vpxor RK1, x1 ## 2, x1 ## 2; \
+ vpxor RK3, x3 ## 2, x3 ## 2; \
+ vpslld $5, x0 ## 2, x4 ## 2; \
+ vpsrld $(32 - 5), x0 ## 2, x0 ## 2; \
+ vpor x4 ## 2, x0 ## 2, x0 ## 2; \
+ vpslld $22, x2 ## 2, x4 ## 2; \
+ vpsrld $(32 - 22), x2 ## 2, x2 ## 2; \
+ vpor x4 ## 2, x2 ## 2, x2 ## 2; \
+ vpxor RK0, x0 ## 2, x0 ## 2; \
+ vpxor RK2, x2 ## 2, x2 ## 2;
+
+#define KL2(x0, x1, x2, x3, x4, i) \
+ vpxor RK0, x0 ## 1, x0 ## 1; \
+ vpxor RK2, x2 ## 1, x2 ## 1; \
+ vpsrld $5, x0 ## 1, x4 ## 1; \
+ vpslld $(32 - 5), x0 ## 1, x0 ## 1; \
+ vpor x4 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor RK3, x3 ## 1, x3 ## 1; \
+ vpxor RK1, x1 ## 1, x1 ## 1; \
+ vpsrld $22, x2 ## 1, x4 ## 1; \
+ vpslld $(32 - 22), x2 ## 1, x2 ## 1; \
+ vpor x4 ## 1, x2 ## 1, x2 ## 1; \
+ vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
+ vpxor RK0, x0 ## 2, x0 ## 2; \
+ vpxor RK2, x2 ## 2, x2 ## 2; \
+ vpsrld $5, x0 ## 2, x4 ## 2; \
+ vpslld $(32 - 5), x0 ## 2, x0 ## 2; \
+ vpor x4 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor RK3, x3 ## 2, x3 ## 2; \
+ vpxor RK1, x1 ## 2, x1 ## 2; \
+ vpsrld $22, x2 ## 2, x4 ## 2; \
+ vpslld $(32 - 22), x2 ## 2, x2 ## 2; \
+ vpor x4 ## 2, x2 ## 2, x2 ## 2; \
+ vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
+ vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
+ vpslld $7, x1 ## 1, x4 ## 1; \
+ vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
+ vpsrld $1, x1 ## 1, x4 ## 1; \
+ vpslld $(32 - 1), x1 ## 1, x1 ## 1; \
+ vpor x4 ## 1, x1 ## 1, x1 ## 1; \
+ vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
+ vpslld $7, x1 ## 2, x4 ## 2; \
+ vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
+ vpsrld $1, x1 ## 2, x4 ## 2; \
+ vpslld $(32 - 1), x1 ## 2, x1 ## 2; \
+ vpor x4 ## 2, x1 ## 2, x1 ## 2; \
+ vpsrld $7, x3 ## 1, x4 ## 1; \
+ vpslld $(32 - 7), x3 ## 1, x3 ## 1; \
+ vpor x4 ## 1, x3 ## 1, x3 ## 1; \
+ vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
+ vpslld $3, x0 ## 1, x4 ## 1; \
+ vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
+ vpsrld $7, x3 ## 2, x4 ## 2; \
+ vpslld $(32 - 7), x3 ## 2, x3 ## 2; \
+ vpor x4 ## 2, x3 ## 2, x3 ## 2; \
+ vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
+ vpslld $3, x0 ## 2, x4 ## 2; \
+ vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
+ vpsrld $13, x0 ## 1, x4 ## 1; \
+ vpslld $(32 - 13), x0 ## 1, x0 ## 1; \
+ vpor x4 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
+ vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
+ vpsrld $3, x2 ## 1, x4 ## 1; \
+ vpslld $(32 - 3), x2 ## 1, x2 ## 1; \
+ vpor x4 ## 1, x2 ## 1, x2 ## 1; \
+ vpsrld $13, x0 ## 2, x4 ## 2; \
+ vpslld $(32 - 13), x0 ## 2, x0 ## 2; \
+ vpor x4 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
+ vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
+ vpsrld $3, x2 ## 2, x4 ## 2; \
+ vpslld $(32 - 3), x2 ## 2, x2 ## 2; \
+ vpor x4 ## 2, x2 ## 2, x2 ## 2;
+
+#define S(SBOX, x0, x1, x2, x3, x4) \
+ SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+ SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
+
+#define SP(SBOX, x0, x1, x2, x3, x4, i) \
+ get_key(i, 0, RK0); \
+ SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ get_key(i, 2, RK2); \
+ SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ get_key(i, 3, RK3); \
+ SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+ get_key(i, 1, RK1); \
+ SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+
+#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ vpunpckldq x1, x0, t0; \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x3; \
+ \
+ vpunpcklqdq t1, t0, x0; \
+ vpunpckhqdq t1, t0, x1; \
+ vpunpcklqdq x3, t2, x2; \
+ vpunpckhqdq x3, t2, x3;
+
+#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+SYM_FUNC_START_LOCAL(__serpent_enc_blk8_avx)
+ /* input:
+ * %rdi: ctx, CTX
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
+ * output:
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
+ */
+
+ vpcmpeqd RNOT, RNOT, RNOT;
+
+ read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ K2(RA, RB, RC, RD, RE, 0);
+ S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1);
+ S(S1, RC, RB, RD, RA, RE); LK2(RE, RD, RA, RC, RB, 2);
+ S(S2, RE, RD, RA, RC, RB); LK2(RB, RD, RE, RC, RA, 3);
+ S(S3, RB, RD, RE, RC, RA); LK2(RC, RA, RD, RB, RE, 4);
+ S(S4, RC, RA, RD, RB, RE); LK2(RA, RD, RB, RE, RC, 5);
+ S(S5, RA, RD, RB, RE, RC); LK2(RC, RA, RD, RE, RB, 6);
+ S(S6, RC, RA, RD, RE, RB); LK2(RD, RB, RA, RE, RC, 7);
+ S(S7, RD, RB, RA, RE, RC); LK2(RC, RA, RE, RD, RB, 8);
+ S(S0, RC, RA, RE, RD, RB); LK2(RE, RA, RD, RC, RB, 9);
+ S(S1, RE, RA, RD, RC, RB); LK2(RB, RD, RC, RE, RA, 10);
+ S(S2, RB, RD, RC, RE, RA); LK2(RA, RD, RB, RE, RC, 11);
+ S(S3, RA, RD, RB, RE, RC); LK2(RE, RC, RD, RA, RB, 12);
+ S(S4, RE, RC, RD, RA, RB); LK2(RC, RD, RA, RB, RE, 13);
+ S(S5, RC, RD, RA, RB, RE); LK2(RE, RC, RD, RB, RA, 14);
+ S(S6, RE, RC, RD, RB, RA); LK2(RD, RA, RC, RB, RE, 15);
+ S(S7, RD, RA, RC, RB, RE); LK2(RE, RC, RB, RD, RA, 16);
+ S(S0, RE, RC, RB, RD, RA); LK2(RB, RC, RD, RE, RA, 17);
+ S(S1, RB, RC, RD, RE, RA); LK2(RA, RD, RE, RB, RC, 18);
+ S(S2, RA, RD, RE, RB, RC); LK2(RC, RD, RA, RB, RE, 19);
+ S(S3, RC, RD, RA, RB, RE); LK2(RB, RE, RD, RC, RA, 20);
+ S(S4, RB, RE, RD, RC, RA); LK2(RE, RD, RC, RA, RB, 21);
+ S(S5, RE, RD, RC, RA, RB); LK2(RB, RE, RD, RA, RC, 22);
+ S(S6, RB, RE, RD, RA, RC); LK2(RD, RC, RE, RA, RB, 23);
+ S(S7, RD, RC, RE, RA, RB); LK2(RB, RE, RA, RD, RC, 24);
+ S(S0, RB, RE, RA, RD, RC); LK2(RA, RE, RD, RB, RC, 25);
+ S(S1, RA, RE, RD, RB, RC); LK2(RC, RD, RB, RA, RE, 26);
+ S(S2, RC, RD, RB, RA, RE); LK2(RE, RD, RC, RA, RB, 27);
+ S(S3, RE, RD, RC, RA, RB); LK2(RA, RB, RD, RE, RC, 28);
+ S(S4, RA, RB, RD, RE, RC); LK2(RB, RD, RE, RC, RA, 29);
+ S(S5, RB, RD, RE, RC, RA); LK2(RA, RB, RD, RC, RE, 30);
+ S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31);
+ S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32);
+
+ write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ RET;
+SYM_FUNC_END(__serpent_enc_blk8_avx)
+
+SYM_FUNC_START_LOCAL(__serpent_dec_blk8_avx)
+ /* input:
+ * %rdi: ctx, CTX
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
+ * output:
+ * RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: decrypted blocks
+ */
+
+ vpcmpeqd RNOT, RNOT, RNOT;
+
+ read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ K2(RA, RB, RC, RD, RE, 32);
+ SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31);
+ SP(SI6, RB, RD, RA, RE, RC, 30); KL2(RA, RC, RE, RB, RD, 30);
+ SP(SI5, RA, RC, RE, RB, RD, 29); KL2(RC, RD, RA, RE, RB, 29);
+ SP(SI4, RC, RD, RA, RE, RB, 28); KL2(RC, RA, RB, RE, RD, 28);
+ SP(SI3, RC, RA, RB, RE, RD, 27); KL2(RB, RC, RD, RE, RA, 27);
+ SP(SI2, RB, RC, RD, RE, RA, 26); KL2(RC, RA, RE, RD, RB, 26);
+ SP(SI1, RC, RA, RE, RD, RB, 25); KL2(RB, RA, RE, RD, RC, 25);
+ SP(SI0, RB, RA, RE, RD, RC, 24); KL2(RE, RC, RA, RB, RD, 24);
+ SP(SI7, RE, RC, RA, RB, RD, 23); KL2(RC, RB, RE, RD, RA, 23);
+ SP(SI6, RC, RB, RE, RD, RA, 22); KL2(RE, RA, RD, RC, RB, 22);
+ SP(SI5, RE, RA, RD, RC, RB, 21); KL2(RA, RB, RE, RD, RC, 21);
+ SP(SI4, RA, RB, RE, RD, RC, 20); KL2(RA, RE, RC, RD, RB, 20);
+ SP(SI3, RA, RE, RC, RD, RB, 19); KL2(RC, RA, RB, RD, RE, 19);
+ SP(SI2, RC, RA, RB, RD, RE, 18); KL2(RA, RE, RD, RB, RC, 18);
+ SP(SI1, RA, RE, RD, RB, RC, 17); KL2(RC, RE, RD, RB, RA, 17);
+ SP(SI0, RC, RE, RD, RB, RA, 16); KL2(RD, RA, RE, RC, RB, 16);
+ SP(SI7, RD, RA, RE, RC, RB, 15); KL2(RA, RC, RD, RB, RE, 15);
+ SP(SI6, RA, RC, RD, RB, RE, 14); KL2(RD, RE, RB, RA, RC, 14);
+ SP(SI5, RD, RE, RB, RA, RC, 13); KL2(RE, RC, RD, RB, RA, 13);
+ SP(SI4, RE, RC, RD, RB, RA, 12); KL2(RE, RD, RA, RB, RC, 12);
+ SP(SI3, RE, RD, RA, RB, RC, 11); KL2(RA, RE, RC, RB, RD, 11);
+ SP(SI2, RA, RE, RC, RB, RD, 10); KL2(RE, RD, RB, RC, RA, 10);
+ SP(SI1, RE, RD, RB, RC, RA, 9); KL2(RA, RD, RB, RC, RE, 9);
+ SP(SI0, RA, RD, RB, RC, RE, 8); KL2(RB, RE, RD, RA, RC, 8);
+ SP(SI7, RB, RE, RD, RA, RC, 7); KL2(RE, RA, RB, RC, RD, 7);
+ SP(SI6, RE, RA, RB, RC, RD, 6); KL2(RB, RD, RC, RE, RA, 6);
+ SP(SI5, RB, RD, RC, RE, RA, 5); KL2(RD, RA, RB, RC, RE, 5);
+ SP(SI4, RD, RA, RB, RC, RE, 4); KL2(RD, RB, RE, RC, RA, 4);
+ SP(SI3, RD, RB, RE, RC, RA, 3); KL2(RE, RD, RA, RC, RB, 3);
+ SP(SI2, RE, RD, RA, RC, RB, 2); KL2(RD, RB, RC, RA, RE, 2);
+ SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1);
+ S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0);
+
+ write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
+ write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);
+
+ RET;
+SYM_FUNC_END(__serpent_dec_blk8_avx)
+
+SYM_TYPED_FUNC_START(serpent_ecb_enc_8way_avx)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+
+ load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ call __serpent_enc_blk8_avx;
+
+ store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(serpent_ecb_enc_8way_avx)
+
+SYM_TYPED_FUNC_START(serpent_ecb_dec_8way_avx)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+
+ load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ call __serpent_dec_blk8_avx;
+
+ store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(serpent_ecb_dec_8way_avx)
+
+SYM_TYPED_FUNC_START(serpent_cbc_dec_8way_avx)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+
+ load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ call __serpent_dec_blk8_avx;
+
+ store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(serpent_cbc_dec_8way_avx)
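The LK2/KL2 macros above interleave round-key mixing with Serpent's linear transformation, applied to two 4-register groups at once. As a reading aid, a minimal scalar C sketch of the forward transformation follows; each vpslld/vpsrld/vpor triple in LK2 corresponds to one rol32() here, and the helper names are illustrative only, not part of the kernel sources.

#include <stdint.h>

static inline uint32_t rol32(uint32_t v, unsigned int n)
{
	return (v << n) | (v >> (32 - n));
}

/*
 * Scalar form of the forward linear transformation that LK2 applies to
 * one 4-word block; the rotation counts (13, 3, 1, 7, 5, 22) and the two
 * shifts mirror the vector sequences above.
 */
static void serpent_linear_transform(uint32_t *x0, uint32_t *x1,
				     uint32_t *x2, uint32_t *x3)
{
	*x0 = rol32(*x0, 13);
	*x2 = rol32(*x2, 3);
	*x1 ^= *x0 ^ *x2;
	*x3 ^= *x2 ^ (*x0 << 3);
	*x1 = rol32(*x1, 1);
	*x3 = rol32(*x3, 7);
	*x0 ^= *x1 ^ *x3;
	*x2 ^= *x3 ^ (*x1 << 7);
	*x0 = rol32(*x0, 5);
	*x2 = rol32(*x2, 22);
}

The inverse network used by KL2 runs the same steps in reverse with right rotations, which is why the decrypt path swaps the vpslld/vpsrld roles.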
diff --git a/arch/x86/crypto/serpent-avx.h b/arch/x86/crypto/serpent-avx.h
new file mode 100644
index 000000000000..23f3361a0e72
--- /dev/null
+++ b/arch/x86/crypto/serpent-avx.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ASM_X86_SERPENT_AVX_H
+#define ASM_X86_SERPENT_AVX_H
+
+#include <crypto/b128ops.h>
+#include <crypto/serpent.h>
+#include <linux/types.h>
+
+struct crypto_skcipher;
+
+#define SERPENT_PARALLEL_BLOCKS 8
+
+asmlinkage void serpent_ecb_enc_8way_avx(const void *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void serpent_ecb_dec_8way_avx(const void *ctx, u8 *dst,
+ const u8 *src);
+
+asmlinkage void serpent_cbc_dec_8way_avx(const void *ctx, u8 *dst,
+ const u8 *src);
+
+#endif
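For context, the prototypes above are consumed by C glue code that feeds eight blocks per call into the assembly. A minimal sketch of that call pattern follows, assuming the generic __serpent_encrypt() helper and SERPENT_BLOCK_SIZE from <crypto/serpent.h>; the function name is illustrative, and the in-tree glue additionally handles the skcipher walk and kernel_fpu_begin()/kernel_fpu_end(), which are omitted here.

#include <crypto/serpent.h>
#include "serpent-avx.h"

/*
 * Illustrative call pattern: run full 8-block groups through the AVX
 * routine and finish any tail with the generic single-block code.
 */
static void serpent_avx_ecb_encrypt(const struct serpent_ctx *ctx, u8 *dst,
				    const u8 *src, unsigned int nblocks)
{
	while (nblocks >= SERPENT_PARALLEL_BLOCKS) {
		serpent_ecb_enc_8way_avx(ctx, dst, src);
		src += SERPENT_PARALLEL_BLOCKS * SERPENT_BLOCK_SIZE;
		dst += SERPENT_PARALLEL_BLOCKS * SERPENT_BLOCK_SIZE;
		nblocks -= SERPENT_PARALLEL_BLOCKS;
	}
	while (nblocks--) {
		__serpent_encrypt(ctx, dst, src);
		src += SERPENT_BLOCK_SIZE;
		dst += SERPENT_BLOCK_SIZE;
	}
}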
diff --git a/arch/x86/crypto/serpent-avx2-asm_64.S b/arch/x86/crypto/serpent-avx2-asm_64.S
new file mode 100644
index 000000000000..6d60c50593a9
--- /dev/null
+++ b/arch/x86/crypto/serpent-avx2-asm_64.S
@@ -0,0 +1,724 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * x86_64/AVX2 assembler optimized version of Serpent
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * Based on AVX assembler implementation of Serpent by:
+ * Copyright © 2012 Johannes Goetzfried
+ * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include "glue_helper-asm-avx2.S"
+
+.file "serpent-avx2-asm_64.S"
+
+.section .rodata.cst16.bswap128_mask, "aM", @progbits, 16
+.align 16
+.Lbswap128_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+.text
+
+#define CTX %rdi
+
+#define RNOT %ymm0
+#define tp %ymm1
+
+#define RA1 %ymm2
+#define RA2 %ymm3
+#define RB1 %ymm4
+#define RB2 %ymm5
+#define RC1 %ymm6
+#define RC2 %ymm7
+#define RD1 %ymm8
+#define RD2 %ymm9
+#define RE1 %ymm10
+#define RE2 %ymm11
+
+#define RK0 %ymm12
+#define RK1 %ymm13
+#define RK2 %ymm14
+#define RK3 %ymm15
+
+#define RK0x %xmm12
+#define RK1x %xmm13
+#define RK2x %xmm14
+#define RK3x %xmm15
+
+#define S0_1(x0, x1, x2, x3, x4) \
+ vpor x0, x3, tp; \
+ vpxor x3, x0, x0; \
+ vpxor x2, x3, x4; \
+ vpxor RNOT, x4, x4; \
+ vpxor x1, tp, x3; \
+ vpand x0, x1, x1; \
+ vpxor x4, x1, x1; \
+ vpxor x0, x2, x2;
+#define S0_2(x0, x1, x2, x3, x4) \
+ vpxor x3, x0, x0; \
+ vpor x0, x4, x4; \
+ vpxor x2, x0, x0; \
+ vpand x1, x2, x2; \
+ vpxor x2, x3, x3; \
+ vpxor RNOT, x1, x1; \
+ vpxor x4, x2, x2; \
+ vpxor x2, x1, x1;
+
+#define S1_1(x0, x1, x2, x3, x4) \
+ vpxor x0, x1, tp; \
+ vpxor x3, x0, x0; \
+ vpxor RNOT, x3, x3; \
+ vpand tp, x1, x4; \
+ vpor tp, x0, x0; \
+ vpxor x2, x3, x3; \
+ vpxor x3, x0, x0; \
+ vpxor x3, tp, x1;
+#define S1_2(x0, x1, x2, x3, x4) \
+ vpxor x4, x3, x3; \
+ vpor x4, x1, x1; \
+ vpxor x2, x4, x4; \
+ vpand x0, x2, x2; \
+ vpxor x1, x2, x2; \
+ vpor x0, x1, x1; \
+ vpxor RNOT, x0, x0; \
+ vpxor x2, x0, x0; \
+ vpxor x1, x4, x4;
+
+#define S2_1(x0, x1, x2, x3, x4) \
+ vpxor RNOT, x3, x3; \
+ vpxor x0, x1, x1; \
+ vpand x2, x0, tp; \
+ vpxor x3, tp, tp; \
+ vpor x0, x3, x3; \
+ vpxor x1, x2, x2; \
+ vpxor x1, x3, x3; \
+ vpand tp, x1, x1;
+#define S2_2(x0, x1, x2, x3, x4) \
+ vpxor x2, tp, tp; \
+ vpand x3, x2, x2; \
+ vpor x1, x3, x3; \
+ vpxor RNOT, tp, tp; \
+ vpxor tp, x3, x3; \
+ vpxor tp, x0, x4; \
+ vpxor x2, tp, x0; \
+ vpor x2, x1, x1;
+
+#define S3_1(x0, x1, x2, x3, x4) \
+ vpxor x3, x1, tp; \
+ vpor x0, x3, x3; \
+ vpand x0, x1, x4; \
+ vpxor x2, x0, x0; \
+ vpxor tp, x2, x2; \
+ vpand x3, tp, x1; \
+ vpxor x3, x2, x2; \
+ vpor x4, x0, x0; \
+ vpxor x3, x4, x4;
+#define S3_2(x0, x1, x2, x3, x4) \
+ vpxor x0, x1, x1; \
+ vpand x3, x0, x0; \
+ vpand x4, x3, x3; \
+ vpxor x2, x3, x3; \
+ vpor x1, x4, x4; \
+ vpand x1, x2, x2; \
+ vpxor x3, x4, x4; \
+ vpxor x3, x0, x0; \
+ vpxor x2, x3, x3;
+
+#define S4_1(x0, x1, x2, x3, x4) \
+ vpand x0, x3, tp; \
+ vpxor x3, x0, x0; \
+ vpxor x2, tp, tp; \
+ vpor x3, x2, x2; \
+ vpxor x1, x0, x0; \
+ vpxor tp, x3, x4; \
+ vpor x0, x2, x2; \
+ vpxor x1, x2, x2;
+#define S4_2(x0, x1, x2, x3, x4) \
+ vpand x0, x1, x1; \
+ vpxor x4, x1, x1; \
+ vpand x2, x4, x4; \
+ vpxor tp, x2, x2; \
+ vpxor x0, x4, x4; \
+ vpor x1, tp, x3; \
+ vpxor RNOT, x1, x1; \
+ vpxor x0, x3, x3;
+
+#define S5_1(x0, x1, x2, x3, x4) \
+ vpor x0, x1, tp; \
+ vpxor tp, x2, x2; \
+ vpxor RNOT, x3, x3; \
+ vpxor x0, x1, x4; \
+ vpxor x2, x0, x0; \
+ vpand x4, tp, x1; \
+ vpor x3, x4, x4; \
+ vpxor x0, x4, x4;
+#define S5_2(x0, x1, x2, x3, x4) \
+ vpand x3, x0, x0; \
+ vpxor x3, x1, x1; \
+ vpxor x2, x3, x3; \
+ vpxor x1, x0, x0; \
+ vpand x4, x2, x2; \
+ vpxor x2, x1, x1; \
+ vpand x0, x2, x2; \
+ vpxor x2, x3, x3;
+
+#define S6_1(x0, x1, x2, x3, x4) \
+ vpxor x0, x3, x3; \
+ vpxor x2, x1, tp; \
+ vpxor x0, x2, x2; \
+ vpand x3, x0, x0; \
+ vpor x3, tp, tp; \
+ vpxor RNOT, x1, x4; \
+ vpxor tp, x0, x0; \
+ vpxor x2, tp, x1;
+#define S6_2(x0, x1, x2, x3, x4) \
+ vpxor x4, x3, x3; \
+ vpxor x0, x4, x4; \
+ vpand x0, x2, x2; \
+ vpxor x1, x4, x4; \
+ vpxor x3, x2, x2; \
+ vpand x1, x3, x3; \
+ vpxor x0, x3, x3; \
+ vpxor x2, x1, x1;
+
+#define S7_1(x0, x1, x2, x3, x4) \
+ vpxor RNOT, x1, tp; \
+ vpxor RNOT, x0, x0; \
+ vpand x2, tp, x1; \
+ vpxor x3, x1, x1; \
+ vpor tp, x3, x3; \
+ vpxor x2, tp, x4; \
+ vpxor x3, x2, x2; \
+ vpxor x0, x3, x3; \
+ vpor x1, x0, x0;
+#define S7_2(x0, x1, x2, x3, x4) \
+ vpand x0, x2, x2; \
+ vpxor x4, x0, x0; \
+ vpxor x3, x4, x4; \
+ vpand x0, x3, x3; \
+ vpxor x1, x4, x4; \
+ vpxor x4, x2, x2; \
+ vpxor x1, x3, x3; \
+ vpor x0, x4, x4; \
+ vpxor x1, x4, x4;
+
+#define SI0_1(x0, x1, x2, x3, x4) \
+ vpxor x0, x1, x1; \
+ vpor x1, x3, tp; \
+ vpxor x1, x3, x4; \
+ vpxor RNOT, x0, x0; \
+ vpxor tp, x2, x2; \
+ vpxor x0, tp, x3; \
+ vpand x1, x0, x0; \
+ vpxor x2, x0, x0;
+#define SI0_2(x0, x1, x2, x3, x4) \
+ vpand x3, x2, x2; \
+ vpxor x4, x3, x3; \
+ vpxor x3, x2, x2; \
+ vpxor x3, x1, x1; \
+ vpand x0, x3, x3; \
+ vpxor x0, x1, x1; \
+ vpxor x2, x0, x0; \
+ vpxor x3, x4, x4;
+
+#define SI1_1(x0, x1, x2, x3, x4) \
+ vpxor x3, x1, x1; \
+ vpxor x2, x0, tp; \
+ vpxor RNOT, x2, x2; \
+ vpor x1, x0, x4; \
+ vpxor x3, x4, x4; \
+ vpand x1, x3, x3; \
+ vpxor x2, x1, x1; \
+ vpand x4, x2, x2;
+#define SI1_2(x0, x1, x2, x3, x4) \
+ vpxor x1, x4, x4; \
+ vpor x3, x1, x1; \
+ vpxor tp, x3, x3; \
+ vpxor tp, x2, x2; \
+ vpor x4, tp, x0; \
+ vpxor x4, x2, x2; \
+ vpxor x0, x1, x1; \
+ vpxor x1, x4, x4;
+
+#define SI2_1(x0, x1, x2, x3, x4) \
+ vpxor x1, x2, x2; \
+ vpxor RNOT, x3, tp; \
+ vpor x2, tp, tp; \
+ vpxor x3, x2, x2; \
+ vpxor x0, x3, x4; \
+ vpxor x1, tp, x3; \
+ vpor x2, x1, x1; \
+ vpxor x0, x2, x2;
+#define SI2_2(x0, x1, x2, x3, x4) \
+ vpxor x4, x1, x1; \
+ vpor x3, x4, x4; \
+ vpxor x3, x2, x2; \
+ vpxor x2, x4, x4; \
+ vpand x1, x2, x2; \
+ vpxor x3, x2, x2; \
+ vpxor x4, x3, x3; \
+ vpxor x0, x4, x4;
+
+#define SI3_1(x0, x1, x2, x3, x4) \
+ vpxor x1, x2, x2; \
+ vpand x2, x1, tp; \
+ vpxor x0, tp, tp; \
+ vpor x1, x0, x0; \
+ vpxor x3, x1, x4; \
+ vpxor x3, x0, x0; \
+ vpor tp, x3, x3; \
+ vpxor x2, tp, x1;
+#define SI3_2(x0, x1, x2, x3, x4) \
+ vpxor x3, x1, x1; \
+ vpxor x2, x0, x0; \
+ vpxor x3, x2, x2; \
+ vpand x1, x3, x3; \
+ vpxor x0, x1, x1; \
+ vpand x2, x0, x0; \
+ vpxor x3, x4, x4; \
+ vpxor x0, x3, x3; \
+ vpxor x1, x0, x0;
+
+#define SI4_1(x0, x1, x2, x3, x4) \
+ vpxor x3, x2, x2; \
+ vpand x1, x0, tp; \
+ vpxor x2, tp, tp; \
+ vpor x3, x2, x2; \
+ vpxor RNOT, x0, x4; \
+ vpxor tp, x1, x1; \
+ vpxor x2, tp, x0; \
+ vpand x4, x2, x2;
+#define SI4_2(x0, x1, x2, x3, x4) \
+ vpxor x0, x2, x2; \
+ vpor x4, x0, x0; \
+ vpxor x3, x0, x0; \
+ vpand x2, x3, x3; \
+ vpxor x3, x4, x4; \
+ vpxor x1, x3, x3; \
+ vpand x0, x1, x1; \
+ vpxor x1, x4, x4; \
+ vpxor x3, x0, x0;
+
+#define SI5_1(x0, x1, x2, x3, x4) \
+ vpor x2, x1, tp; \
+ vpxor x1, x2, x2; \
+ vpxor x3, tp, tp; \
+ vpand x1, x3, x3; \
+ vpxor x3, x2, x2; \
+ vpor x0, x3, x3; \
+ vpxor RNOT, x0, x0; \
+ vpxor x2, x3, x3; \
+ vpor x0, x2, x2;
+#define SI5_2(x0, x1, x2, x3, x4) \
+ vpxor tp, x1, x4; \
+ vpxor x4, x2, x2; \
+ vpand x0, x4, x4; \
+ vpxor tp, x0, x0; \
+ vpxor x3, tp, x1; \
+ vpand x2, x0, x0; \
+ vpxor x3, x2, x2; \
+ vpxor x2, x0, x0; \
+ vpxor x4, x2, x2; \
+ vpxor x3, x4, x4;
+
+#define SI6_1(x0, x1, x2, x3, x4) \
+ vpxor x2, x0, x0; \
+ vpand x3, x0, tp; \
+ vpxor x3, x2, x2; \
+ vpxor x2, tp, tp; \
+ vpxor x1, x3, x3; \
+ vpor x0, x2, x2; \
+ vpxor x3, x2, x2; \
+ vpand tp, x3, x3;
+#define SI6_2(x0, x1, x2, x3, x4) \
+ vpxor RNOT, tp, tp; \
+ vpxor x1, x3, x3; \
+ vpand x2, x1, x1; \
+ vpxor tp, x0, x4; \
+ vpxor x4, x3, x3; \
+ vpxor x2, x4, x4; \
+ vpxor x1, tp, x0; \
+ vpxor x0, x2, x2;
+
+#define SI7_1(x0, x1, x2, x3, x4) \
+ vpand x0, x3, tp; \
+ vpxor x2, x0, x0; \
+ vpor x3, x2, x2; \
+ vpxor x1, x3, x4; \
+ vpxor RNOT, x0, x0; \
+ vpor tp, x1, x1; \
+ vpxor x0, x4, x4; \
+ vpand x2, x0, x0; \
+ vpxor x1, x0, x0;
+#define SI7_2(x0, x1, x2, x3, x4) \
+ vpand x2, x1, x1; \
+ vpxor x2, tp, x3; \
+ vpxor x3, x4, x4; \
+ vpand x3, x2, x2; \
+ vpor x0, x3, x3; \
+ vpxor x4, x1, x1; \
+ vpxor x4, x3, x3; \
+ vpand x0, x4, x4; \
+ vpxor x2, x4, x4;
+
+#define get_key(i,j,t) \
+ vpbroadcastd (4*(i)+(j))*4(CTX), t;
+
+#define K2(x0, x1, x2, x3, x4, i) \
+ get_key(i, 0, RK0); \
+ get_key(i, 1, RK1); \
+ get_key(i, 2, RK2); \
+ get_key(i, 3, RK3); \
+ vpxor RK0, x0 ## 1, x0 ## 1; \
+ vpxor RK1, x1 ## 1, x1 ## 1; \
+ vpxor RK2, x2 ## 1, x2 ## 1; \
+ vpxor RK3, x3 ## 1, x3 ## 1; \
+ vpxor RK0, x0 ## 2, x0 ## 2; \
+ vpxor RK1, x1 ## 2, x1 ## 2; \
+ vpxor RK2, x2 ## 2, x2 ## 2; \
+ vpxor RK3, x3 ## 2, x3 ## 2;
+
+#define LK2(x0, x1, x2, x3, x4, i) \
+ vpslld $13, x0 ## 1, x4 ## 1; \
+ vpsrld $(32 - 13), x0 ## 1, x0 ## 1; \
+ vpor x4 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
+ vpslld $3, x2 ## 1, x4 ## 1; \
+ vpsrld $(32 - 3), x2 ## 1, x2 ## 1; \
+ vpor x4 ## 1, x2 ## 1, x2 ## 1; \
+ vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
+ vpslld $13, x0 ## 2, x4 ## 2; \
+ vpsrld $(32 - 13), x0 ## 2, x0 ## 2; \
+ vpor x4 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
+ vpslld $3, x2 ## 2, x4 ## 2; \
+ vpsrld $(32 - 3), x2 ## 2, x2 ## 2; \
+ vpor x4 ## 2, x2 ## 2, x2 ## 2; \
+ vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
+ vpslld $1, x1 ## 1, x4 ## 1; \
+ vpsrld $(32 - 1), x1 ## 1, x1 ## 1; \
+ vpor x4 ## 1, x1 ## 1, x1 ## 1; \
+ vpslld $3, x0 ## 1, x4 ## 1; \
+ vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
+ vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
+ get_key(i, 1, RK1); \
+ vpslld $1, x1 ## 2, x4 ## 2; \
+ vpsrld $(32 - 1), x1 ## 2, x1 ## 2; \
+ vpor x4 ## 2, x1 ## 2, x1 ## 2; \
+ vpslld $3, x0 ## 2, x4 ## 2; \
+ vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
+ vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
+ get_key(i, 3, RK3); \
+ vpslld $7, x3 ## 1, x4 ## 1; \
+ vpsrld $(32 - 7), x3 ## 1, x3 ## 1; \
+ vpor x4 ## 1, x3 ## 1, x3 ## 1; \
+ vpslld $7, x1 ## 1, x4 ## 1; \
+ vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
+ vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
+ get_key(i, 0, RK0); \
+ vpslld $7, x3 ## 2, x4 ## 2; \
+ vpsrld $(32 - 7), x3 ## 2, x3 ## 2; \
+ vpor x4 ## 2, x3 ## 2, x3 ## 2; \
+ vpslld $7, x1 ## 2, x4 ## 2; \
+ vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
+ vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
+ get_key(i, 2, RK2); \
+ vpxor RK1, x1 ## 1, x1 ## 1; \
+ vpxor RK3, x3 ## 1, x3 ## 1; \
+ vpslld $5, x0 ## 1, x4 ## 1; \
+ vpsrld $(32 - 5), x0 ## 1, x0 ## 1; \
+ vpor x4 ## 1, x0 ## 1, x0 ## 1; \
+ vpslld $22, x2 ## 1, x4 ## 1; \
+ vpsrld $(32 - 22), x2 ## 1, x2 ## 1; \
+ vpor x4 ## 1, x2 ## 1, x2 ## 1; \
+ vpxor RK0, x0 ## 1, x0 ## 1; \
+ vpxor RK2, x2 ## 1, x2 ## 1; \
+ vpxor RK1, x1 ## 2, x1 ## 2; \
+ vpxor RK3, x3 ## 2, x3 ## 2; \
+ vpslld $5, x0 ## 2, x4 ## 2; \
+ vpsrld $(32 - 5), x0 ## 2, x0 ## 2; \
+ vpor x4 ## 2, x0 ## 2, x0 ## 2; \
+ vpslld $22, x2 ## 2, x4 ## 2; \
+ vpsrld $(32 - 22), x2 ## 2, x2 ## 2; \
+ vpor x4 ## 2, x2 ## 2, x2 ## 2; \
+ vpxor RK0, x0 ## 2, x0 ## 2; \
+ vpxor RK2, x2 ## 2, x2 ## 2;
+
+#define KL2(x0, x1, x2, x3, x4, i) \
+ vpxor RK0, x0 ## 1, x0 ## 1; \
+ vpxor RK2, x2 ## 1, x2 ## 1; \
+ vpsrld $5, x0 ## 1, x4 ## 1; \
+ vpslld $(32 - 5), x0 ## 1, x0 ## 1; \
+ vpor x4 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor RK3, x3 ## 1, x3 ## 1; \
+ vpxor RK1, x1 ## 1, x1 ## 1; \
+ vpsrld $22, x2 ## 1, x4 ## 1; \
+ vpslld $(32 - 22), x2 ## 1, x2 ## 1; \
+ vpor x4 ## 1, x2 ## 1, x2 ## 1; \
+ vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
+ vpxor RK0, x0 ## 2, x0 ## 2; \
+ vpxor RK2, x2 ## 2, x2 ## 2; \
+ vpsrld $5, x0 ## 2, x4 ## 2; \
+ vpslld $(32 - 5), x0 ## 2, x0 ## 2; \
+ vpor x4 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor RK3, x3 ## 2, x3 ## 2; \
+ vpxor RK1, x1 ## 2, x1 ## 2; \
+ vpsrld $22, x2 ## 2, x4 ## 2; \
+ vpslld $(32 - 22), x2 ## 2, x2 ## 2; \
+ vpor x4 ## 2, x2 ## 2, x2 ## 2; \
+ vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
+ vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
+ vpslld $7, x1 ## 1, x4 ## 1; \
+ vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
+ vpsrld $1, x1 ## 1, x4 ## 1; \
+ vpslld $(32 - 1), x1 ## 1, x1 ## 1; \
+ vpor x4 ## 1, x1 ## 1, x1 ## 1; \
+ vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
+ vpslld $7, x1 ## 2, x4 ## 2; \
+ vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
+ vpsrld $1, x1 ## 2, x4 ## 2; \
+ vpslld $(32 - 1), x1 ## 2, x1 ## 2; \
+ vpor x4 ## 2, x1 ## 2, x1 ## 2; \
+ vpsrld $7, x3 ## 1, x4 ## 1; \
+ vpslld $(32 - 7), x3 ## 1, x3 ## 1; \
+ vpor x4 ## 1, x3 ## 1, x3 ## 1; \
+ vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
+ vpslld $3, x0 ## 1, x4 ## 1; \
+ vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
+ vpsrld $7, x3 ## 2, x4 ## 2; \
+ vpslld $(32 - 7), x3 ## 2, x3 ## 2; \
+ vpor x4 ## 2, x3 ## 2, x3 ## 2; \
+ vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
+ vpslld $3, x0 ## 2, x4 ## 2; \
+ vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
+ vpsrld $13, x0 ## 1, x4 ## 1; \
+ vpslld $(32 - 13), x0 ## 1, x0 ## 1; \
+ vpor x4 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
+ vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
+ vpsrld $3, x2 ## 1, x4 ## 1; \
+ vpslld $(32 - 3), x2 ## 1, x2 ## 1; \
+ vpor x4 ## 1, x2 ## 1, x2 ## 1; \
+ vpsrld $13, x0 ## 2, x4 ## 2; \
+ vpslld $(32 - 13), x0 ## 2, x0 ## 2; \
+ vpor x4 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
+ vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
+ vpsrld $3, x2 ## 2, x4 ## 2; \
+ vpslld $(32 - 3), x2 ## 2, x2 ## 2; \
+ vpor x4 ## 2, x2 ## 2, x2 ## 2;
+
+#define S(SBOX, x0, x1, x2, x3, x4) \
+ SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+ SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
+
+#define SP(SBOX, x0, x1, x2, x3, x4, i) \
+ get_key(i, 0, RK0); \
+ SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ get_key(i, 2, RK2); \
+ SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ get_key(i, 3, RK3); \
+ SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+ get_key(i, 1, RK1); \
+ SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+
+#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ vpunpckldq x1, x0, t0; \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x3; \
+ \
+ vpunpcklqdq t1, t0, x0; \
+ vpunpckhqdq t1, t0, x1; \
+ vpunpcklqdq x3, t2, x2; \
+ vpunpckhqdq x3, t2, x3;
+
+#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+SYM_FUNC_START_LOCAL(__serpent_enc_blk16)
+ /* input:
+ * %rdi: ctx, CTX
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: plaintext
+ * output:
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
+ */
+
+ vpcmpeqd RNOT, RNOT, RNOT;
+
+ read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ K2(RA, RB, RC, RD, RE, 0);
+ S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1);
+ S(S1, RC, RB, RD, RA, RE); LK2(RE, RD, RA, RC, RB, 2);
+ S(S2, RE, RD, RA, RC, RB); LK2(RB, RD, RE, RC, RA, 3);
+ S(S3, RB, RD, RE, RC, RA); LK2(RC, RA, RD, RB, RE, 4);
+ S(S4, RC, RA, RD, RB, RE); LK2(RA, RD, RB, RE, RC, 5);
+ S(S5, RA, RD, RB, RE, RC); LK2(RC, RA, RD, RE, RB, 6);
+ S(S6, RC, RA, RD, RE, RB); LK2(RD, RB, RA, RE, RC, 7);
+ S(S7, RD, RB, RA, RE, RC); LK2(RC, RA, RE, RD, RB, 8);
+ S(S0, RC, RA, RE, RD, RB); LK2(RE, RA, RD, RC, RB, 9);
+ S(S1, RE, RA, RD, RC, RB); LK2(RB, RD, RC, RE, RA, 10);
+ S(S2, RB, RD, RC, RE, RA); LK2(RA, RD, RB, RE, RC, 11);
+ S(S3, RA, RD, RB, RE, RC); LK2(RE, RC, RD, RA, RB, 12);
+ S(S4, RE, RC, RD, RA, RB); LK2(RC, RD, RA, RB, RE, 13);
+ S(S5, RC, RD, RA, RB, RE); LK2(RE, RC, RD, RB, RA, 14);
+ S(S6, RE, RC, RD, RB, RA); LK2(RD, RA, RC, RB, RE, 15);
+ S(S7, RD, RA, RC, RB, RE); LK2(RE, RC, RB, RD, RA, 16);
+ S(S0, RE, RC, RB, RD, RA); LK2(RB, RC, RD, RE, RA, 17);
+ S(S1, RB, RC, RD, RE, RA); LK2(RA, RD, RE, RB, RC, 18);
+ S(S2, RA, RD, RE, RB, RC); LK2(RC, RD, RA, RB, RE, 19);
+ S(S3, RC, RD, RA, RB, RE); LK2(RB, RE, RD, RC, RA, 20);
+ S(S4, RB, RE, RD, RC, RA); LK2(RE, RD, RC, RA, RB, 21);
+ S(S5, RE, RD, RC, RA, RB); LK2(RB, RE, RD, RA, RC, 22);
+ S(S6, RB, RE, RD, RA, RC); LK2(RD, RC, RE, RA, RB, 23);
+ S(S7, RD, RC, RE, RA, RB); LK2(RB, RE, RA, RD, RC, 24);
+ S(S0, RB, RE, RA, RD, RC); LK2(RA, RE, RD, RB, RC, 25);
+ S(S1, RA, RE, RD, RB, RC); LK2(RC, RD, RB, RA, RE, 26);
+ S(S2, RC, RD, RB, RA, RE); LK2(RE, RD, RC, RA, RB, 27);
+ S(S3, RE, RD, RC, RA, RB); LK2(RA, RB, RD, RE, RC, 28);
+ S(S4, RA, RB, RD, RE, RC); LK2(RB, RD, RE, RC, RA, 29);
+ S(S5, RB, RD, RE, RC, RA); LK2(RA, RB, RD, RC, RE, 30);
+ S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31);
+ S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32);
+
+ write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ RET;
+SYM_FUNC_END(__serpent_enc_blk16)
+
+SYM_FUNC_START_LOCAL(__serpent_dec_blk16)
+ /* input:
+ * %rdi: ctx, CTX
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
+ * output:
+ * RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: plaintext
+ */
+
+ vpcmpeqd RNOT, RNOT, RNOT;
+
+ read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ K2(RA, RB, RC, RD, RE, 32);
+ SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31);
+ SP(SI6, RB, RD, RA, RE, RC, 30); KL2(RA, RC, RE, RB, RD, 30);
+ SP(SI5, RA, RC, RE, RB, RD, 29); KL2(RC, RD, RA, RE, RB, 29);
+ SP(SI4, RC, RD, RA, RE, RB, 28); KL2(RC, RA, RB, RE, RD, 28);
+ SP(SI3, RC, RA, RB, RE, RD, 27); KL2(RB, RC, RD, RE, RA, 27);
+ SP(SI2, RB, RC, RD, RE, RA, 26); KL2(RC, RA, RE, RD, RB, 26);
+ SP(SI1, RC, RA, RE, RD, RB, 25); KL2(RB, RA, RE, RD, RC, 25);
+ SP(SI0, RB, RA, RE, RD, RC, 24); KL2(RE, RC, RA, RB, RD, 24);
+ SP(SI7, RE, RC, RA, RB, RD, 23); KL2(RC, RB, RE, RD, RA, 23);
+ SP(SI6, RC, RB, RE, RD, RA, 22); KL2(RE, RA, RD, RC, RB, 22);
+ SP(SI5, RE, RA, RD, RC, RB, 21); KL2(RA, RB, RE, RD, RC, 21);
+ SP(SI4, RA, RB, RE, RD, RC, 20); KL2(RA, RE, RC, RD, RB, 20);
+ SP(SI3, RA, RE, RC, RD, RB, 19); KL2(RC, RA, RB, RD, RE, 19);
+ SP(SI2, RC, RA, RB, RD, RE, 18); KL2(RA, RE, RD, RB, RC, 18);
+ SP(SI1, RA, RE, RD, RB, RC, 17); KL2(RC, RE, RD, RB, RA, 17);
+ SP(SI0, RC, RE, RD, RB, RA, 16); KL2(RD, RA, RE, RC, RB, 16);
+ SP(SI7, RD, RA, RE, RC, RB, 15); KL2(RA, RC, RD, RB, RE, 15);
+ SP(SI6, RA, RC, RD, RB, RE, 14); KL2(RD, RE, RB, RA, RC, 14);
+ SP(SI5, RD, RE, RB, RA, RC, 13); KL2(RE, RC, RD, RB, RA, 13);
+ SP(SI4, RE, RC, RD, RB, RA, 12); KL2(RE, RD, RA, RB, RC, 12);
+ SP(SI3, RE, RD, RA, RB, RC, 11); KL2(RA, RE, RC, RB, RD, 11);
+ SP(SI2, RA, RE, RC, RB, RD, 10); KL2(RE, RD, RB, RC, RA, 10);
+ SP(SI1, RE, RD, RB, RC, RA, 9); KL2(RA, RD, RB, RC, RE, 9);
+ SP(SI0, RA, RD, RB, RC, RE, 8); KL2(RB, RE, RD, RA, RC, 8);
+ SP(SI7, RB, RE, RD, RA, RC, 7); KL2(RE, RA, RB, RC, RD, 7);
+ SP(SI6, RE, RA, RB, RC, RD, 6); KL2(RB, RD, RC, RE, RA, 6);
+ SP(SI5, RB, RD, RC, RE, RA, 5); KL2(RD, RA, RB, RC, RE, 5);
+ SP(SI4, RD, RA, RB, RC, RE, 4); KL2(RD, RB, RE, RC, RA, 4);
+ SP(SI3, RD, RB, RE, RC, RA, 3); KL2(RE, RD, RA, RC, RB, 3);
+ SP(SI2, RE, RD, RA, RC, RB, 2); KL2(RD, RB, RC, RA, RE, 2);
+ SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1);
+ S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0);
+
+ write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
+ write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);
+
+ RET;
+SYM_FUNC_END(__serpent_dec_blk16)
+
+SYM_FUNC_START(serpent_ecb_enc_16way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+
+ vzeroupper;
+
+ load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ call __serpent_enc_blk16;
+
+ store_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ vzeroupper;
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(serpent_ecb_enc_16way)
+
+SYM_FUNC_START(serpent_ecb_dec_16way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+
+ vzeroupper;
+
+ load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ call __serpent_dec_blk16;
+
+ store_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
+
+ vzeroupper;
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(serpent_ecb_dec_16way)
+
+SYM_FUNC_START(serpent_cbc_dec_16way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+
+ vzeroupper;
+
+ load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ call __serpent_dec_blk16;
+
+ store_cbc_16way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2,
+ RK0);
+
+ vzeroupper;
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(serpent_cbc_dec_16way)
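store_cbc_16way() (from glue_helper-asm-avx2.S, included above) combines the just-decrypted blocks with the preceding ciphertext blocks, which is the only extra work CBC decryption needs on top of the ECB core. A scalar sketch of that chaining step, with illustrative names and the IV handling for block 0 left to the caller:

#include <stdint.h>
#include <stddef.h>

#define BLOCK_SIZE 16	/* SERPENT_BLOCK_SIZE */

/*
 * Scalar view of the chaining step: decrypted block i is XORed with
 * ciphertext block i - 1; block 0 is XORed with the IV (or the last
 * ciphertext block of the previous batch) by the caller.
 */
static void cbc_chain_blocks(uint8_t *dst, const uint8_t *src, size_t nblocks)
{
	size_t i, j;

	for (i = 1; i < nblocks; i++)
		for (j = 0; j < BLOCK_SIZE; j++)
			dst[i * BLOCK_SIZE + j] ^= src[(i - 1) * BLOCK_SIZE + j];
}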
diff --git a/arch/x86/crypto/serpent-sse2-i586-asm_32.S b/arch/x86/crypto/serpent-sse2-i586-asm_32.S
new file mode 100644
index 000000000000..8ccb03ad7cef
--- /dev/null
+++ b/arch/x86/crypto/serpent-sse2-i586-asm_32.S
@@ -0,0 +1,616 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Serpent Cipher 4-way parallel algorithm (i586/SSE2)
+ *
+ * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * Based on crypto/serpent.c by
+ * Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
+ * 2003 Herbert Valerio Riedel <hvr@gnu.org>
+ */
+
+#include <linux/linkage.h>
+
+.file "serpent-sse2-i586-asm_32.S"
+.text
+
+#define arg_ctx 4
+#define arg_dst 8
+#define arg_src 12
+#define arg_xor 16
+
+/**********************************************************************
+ 4-way SSE2 serpent
+ **********************************************************************/
+#define CTX %edx
+
+#define RA %xmm0
+#define RB %xmm1
+#define RC %xmm2
+#define RD %xmm3
+#define RE %xmm4
+
+#define RT0 %xmm5
+#define RT1 %xmm6
+
+#define RNOT %xmm7
+
+#define get_key(i, j, t) \
+ movd (4*(i)+(j))*4(CTX), t; \
+ pshufd $0, t, t;
+
+#define K(x0, x1, x2, x3, x4, i) \
+ get_key(i, 0, x4); \
+ get_key(i, 1, RT0); \
+ get_key(i, 2, RT1); \
+ pxor x4, x0; \
+ pxor RT0, x1; \
+ pxor RT1, x2; \
+ get_key(i, 3, x4); \
+ pxor x4, x3;
+
+#define LK(x0, x1, x2, x3, x4, i) \
+ movdqa x0, x4; \
+ pslld $13, x0; \
+ psrld $(32 - 13), x4; \
+ por x4, x0; \
+ pxor x0, x1; \
+ movdqa x2, x4; \
+ pslld $3, x2; \
+ psrld $(32 - 3), x4; \
+ por x4, x2; \
+ pxor x2, x1; \
+ movdqa x1, x4; \
+ pslld $1, x1; \
+ psrld $(32 - 1), x4; \
+ por x4, x1; \
+ movdqa x0, x4; \
+ pslld $3, x4; \
+ pxor x2, x3; \
+ pxor x4, x3; \
+ movdqa x3, x4; \
+ pslld $7, x3; \
+ psrld $(32 - 7), x4; \
+ por x4, x3; \
+ movdqa x1, x4; \
+ pslld $7, x4; \
+ pxor x1, x0; \
+ pxor x3, x0; \
+ pxor x3, x2; \
+ pxor x4, x2; \
+ movdqa x0, x4; \
+ get_key(i, 1, RT0); \
+ pxor RT0, x1; \
+ get_key(i, 3, RT0); \
+ pxor RT0, x3; \
+ pslld $5, x0; \
+ psrld $(32 - 5), x4; \
+ por x4, x0; \
+ movdqa x2, x4; \
+ pslld $22, x2; \
+ psrld $(32 - 22), x4; \
+ por x4, x2; \
+ get_key(i, 0, RT0); \
+ pxor RT0, x0; \
+ get_key(i, 2, RT0); \
+ pxor RT0, x2;
+
+#define KL(x0, x1, x2, x3, x4, i) \
+ K(x0, x1, x2, x3, x4, i); \
+ movdqa x0, x4; \
+ psrld $5, x0; \
+ pslld $(32 - 5), x4; \
+ por x4, x0; \
+ movdqa x2, x4; \
+ psrld $22, x2; \
+ pslld $(32 - 22), x4; \
+ por x4, x2; \
+ pxor x3, x2; \
+ pxor x3, x0; \
+ movdqa x1, x4; \
+ pslld $7, x4; \
+ pxor x1, x0; \
+ pxor x4, x2; \
+ movdqa x1, x4; \
+ psrld $1, x1; \
+ pslld $(32 - 1), x4; \
+ por x4, x1; \
+ movdqa x3, x4; \
+ psrld $7, x3; \
+ pslld $(32 - 7), x4; \
+ por x4, x3; \
+ pxor x0, x1; \
+ movdqa x0, x4; \
+ pslld $3, x4; \
+ pxor x4, x3; \
+ movdqa x0, x4; \
+ psrld $13, x0; \
+ pslld $(32 - 13), x4; \
+ por x4, x0; \
+ pxor x2, x1; \
+ pxor x2, x3; \
+ movdqa x2, x4; \
+ psrld $3, x2; \
+ pslld $(32 - 3), x4; \
+ por x4, x2;
+
+#define S0(x0, x1, x2, x3, x4) \
+ movdqa x3, x4; \
+ por x0, x3; \
+ pxor x4, x0; \
+ pxor x2, x4; \
+ pxor RNOT, x4; \
+ pxor x1, x3; \
+ pand x0, x1; \
+ pxor x4, x1; \
+ pxor x0, x2; \
+ pxor x3, x0; \
+ por x0, x4; \
+ pxor x2, x0; \
+ pand x1, x2; \
+ pxor x2, x3; \
+ pxor RNOT, x1; \
+ pxor x4, x2; \
+ pxor x2, x1;
+
+#define S1(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ pxor x0, x1; \
+ pxor x3, x0; \
+ pxor RNOT, x3; \
+ pand x1, x4; \
+ por x1, x0; \
+ pxor x2, x3; \
+ pxor x3, x0; \
+ pxor x3, x1; \
+ pxor x4, x3; \
+ por x4, x1; \
+ pxor x2, x4; \
+ pand x0, x2; \
+ pxor x1, x2; \
+ por x0, x1; \
+ pxor RNOT, x0; \
+ pxor x2, x0; \
+ pxor x1, x4;
+
+#define S2(x0, x1, x2, x3, x4) \
+ pxor RNOT, x3; \
+ pxor x0, x1; \
+ movdqa x0, x4; \
+ pand x2, x0; \
+ pxor x3, x0; \
+ por x4, x3; \
+ pxor x1, x2; \
+ pxor x1, x3; \
+ pand x0, x1; \
+ pxor x2, x0; \
+ pand x3, x2; \
+ por x1, x3; \
+ pxor RNOT, x0; \
+ pxor x0, x3; \
+ pxor x0, x4; \
+ pxor x2, x0; \
+ por x2, x1;
+
+#define S3(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ pxor x3, x1; \
+ por x0, x3; \
+ pand x0, x4; \
+ pxor x2, x0; \
+ pxor x1, x2; \
+ pand x3, x1; \
+ pxor x3, x2; \
+ por x4, x0; \
+ pxor x3, x4; \
+ pxor x0, x1; \
+ pand x3, x0; \
+ pand x4, x3; \
+ pxor x2, x3; \
+ por x1, x4; \
+ pand x1, x2; \
+ pxor x3, x4; \
+ pxor x3, x0; \
+ pxor x2, x3;
+
+#define S4(x0, x1, x2, x3, x4) \
+ movdqa x3, x4; \
+ pand x0, x3; \
+ pxor x4, x0; \
+ pxor x2, x3; \
+ por x4, x2; \
+ pxor x1, x0; \
+ pxor x3, x4; \
+ por x0, x2; \
+ pxor x1, x2; \
+ pand x0, x1; \
+ pxor x4, x1; \
+ pand x2, x4; \
+ pxor x3, x2; \
+ pxor x0, x4; \
+ por x1, x3; \
+ pxor RNOT, x1; \
+ pxor x0, x3;
+
+#define S5(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ por x0, x1; \
+ pxor x1, x2; \
+ pxor RNOT, x3; \
+ pxor x0, x4; \
+ pxor x2, x0; \
+ pand x4, x1; \
+ por x3, x4; \
+ pxor x0, x4; \
+ pand x3, x0; \
+ pxor x3, x1; \
+ pxor x2, x3; \
+ pxor x1, x0; \
+ pand x4, x2; \
+ pxor x2, x1; \
+ pand x0, x2; \
+ pxor x2, x3;
+
+#define S6(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ pxor x0, x3; \
+ pxor x2, x1; \
+ pxor x0, x2; \
+ pand x3, x0; \
+ por x3, x1; \
+ pxor RNOT, x4; \
+ pxor x1, x0; \
+ pxor x2, x1; \
+ pxor x4, x3; \
+ pxor x0, x4; \
+ pand x0, x2; \
+ pxor x1, x4; \
+ pxor x3, x2; \
+ pand x1, x3; \
+ pxor x0, x3; \
+ pxor x2, x1;
+
+#define S7(x0, x1, x2, x3, x4) \
+ pxor RNOT, x1; \
+ movdqa x1, x4; \
+ pxor RNOT, x0; \
+ pand x2, x1; \
+ pxor x3, x1; \
+ por x4, x3; \
+ pxor x2, x4; \
+ pxor x3, x2; \
+ pxor x0, x3; \
+ por x1, x0; \
+ pand x0, x2; \
+ pxor x4, x0; \
+ pxor x3, x4; \
+ pand x0, x3; \
+ pxor x1, x4; \
+ pxor x4, x2; \
+ pxor x1, x3; \
+ por x0, x4; \
+ pxor x1, x4;
+
+#define SI0(x0, x1, x2, x3, x4) \
+ movdqa x3, x4; \
+ pxor x0, x1; \
+ por x1, x3; \
+ pxor x1, x4; \
+ pxor RNOT, x0; \
+ pxor x3, x2; \
+ pxor x0, x3; \
+ pand x1, x0; \
+ pxor x2, x0; \
+ pand x3, x2; \
+ pxor x4, x3; \
+ pxor x3, x2; \
+ pxor x3, x1; \
+ pand x0, x3; \
+ pxor x0, x1; \
+ pxor x2, x0; \
+ pxor x3, x4;
+
+#define SI1(x0, x1, x2, x3, x4) \
+ pxor x3, x1; \
+ movdqa x0, x4; \
+ pxor x2, x0; \
+ pxor RNOT, x2; \
+ por x1, x4; \
+ pxor x3, x4; \
+ pand x1, x3; \
+ pxor x2, x1; \
+ pand x4, x2; \
+ pxor x1, x4; \
+ por x3, x1; \
+ pxor x0, x3; \
+ pxor x0, x2; \
+ por x4, x0; \
+ pxor x4, x2; \
+ pxor x0, x1; \
+ pxor x1, x4;
+
+#define SI2(x0, x1, x2, x3, x4) \
+ pxor x1, x2; \
+ movdqa x3, x4; \
+ pxor RNOT, x3; \
+ por x2, x3; \
+ pxor x4, x2; \
+ pxor x0, x4; \
+ pxor x1, x3; \
+ por x2, x1; \
+ pxor x0, x2; \
+ pxor x4, x1; \
+ por x3, x4; \
+ pxor x3, x2; \
+ pxor x2, x4; \
+ pand x1, x2; \
+ pxor x3, x2; \
+ pxor x4, x3; \
+ pxor x0, x4;
+
+#define SI3(x0, x1, x2, x3, x4) \
+ pxor x1, x2; \
+ movdqa x1, x4; \
+ pand x2, x1; \
+ pxor x0, x1; \
+ por x4, x0; \
+ pxor x3, x4; \
+ pxor x3, x0; \
+ por x1, x3; \
+ pxor x2, x1; \
+ pxor x3, x1; \
+ pxor x2, x0; \
+ pxor x3, x2; \
+ pand x1, x3; \
+ pxor x0, x1; \
+ pand x2, x0; \
+ pxor x3, x4; \
+ pxor x0, x3; \
+ pxor x1, x0;
+
+#define SI4(x0, x1, x2, x3, x4) \
+ pxor x3, x2; \
+ movdqa x0, x4; \
+ pand x1, x0; \
+ pxor x2, x0; \
+ por x3, x2; \
+ pxor RNOT, x4; \
+ pxor x0, x1; \
+ pxor x2, x0; \
+ pand x4, x2; \
+ pxor x0, x2; \
+ por x4, x0; \
+ pxor x3, x0; \
+ pand x2, x3; \
+ pxor x3, x4; \
+ pxor x1, x3; \
+ pand x0, x1; \
+ pxor x1, x4; \
+ pxor x3, x0;
+
+#define SI5(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ por x2, x1; \
+ pxor x4, x2; \
+ pxor x3, x1; \
+ pand x4, x3; \
+ pxor x3, x2; \
+ por x0, x3; \
+ pxor RNOT, x0; \
+ pxor x2, x3; \
+ por x0, x2; \
+ pxor x1, x4; \
+ pxor x4, x2; \
+ pand x0, x4; \
+ pxor x1, x0; \
+ pxor x3, x1; \
+ pand x2, x0; \
+ pxor x3, x2; \
+ pxor x2, x0; \
+ pxor x4, x2; \
+ pxor x3, x4;
+
+#define SI6(x0, x1, x2, x3, x4) \
+ pxor x2, x0; \
+ movdqa x0, x4; \
+ pand x3, x0; \
+ pxor x3, x2; \
+ pxor x2, x0; \
+ pxor x1, x3; \
+ por x4, x2; \
+ pxor x3, x2; \
+ pand x0, x3; \
+ pxor RNOT, x0; \
+ pxor x1, x3; \
+ pand x2, x1; \
+ pxor x0, x4; \
+ pxor x4, x3; \
+ pxor x2, x4; \
+ pxor x1, x0; \
+ pxor x0, x2;
+
+#define SI7(x0, x1, x2, x3, x4) \
+ movdqa x3, x4; \
+ pand x0, x3; \
+ pxor x2, x0; \
+ por x4, x2; \
+ pxor x1, x4; \
+ pxor RNOT, x0; \
+ por x3, x1; \
+ pxor x0, x4; \
+ pand x2, x0; \
+ pxor x1, x0; \
+ pand x2, x1; \
+ pxor x2, x3; \
+ pxor x3, x4; \
+ pand x3, x2; \
+ por x0, x3; \
+ pxor x4, x1; \
+ pxor x4, x3; \
+ pand x0, x4; \
+ pxor x2, x4;
+
+#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ movdqa x0, t2; \
+ punpckldq x1, x0; \
+ punpckhdq x1, t2; \
+ movdqa x2, t1; \
+ punpckhdq x3, x2; \
+ punpckldq x3, t1; \
+ movdqa x0, x1; \
+ punpcklqdq t1, x0; \
+ punpckhqdq t1, x1; \
+ movdqa t2, x3; \
+ punpcklqdq x2, t2; \
+ punpckhqdq x2, x3; \
+ movdqa t2, x2;
+
+#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
+ movdqu (0*4*4)(in), x0; \
+ movdqu (1*4*4)(in), x1; \
+ movdqu (2*4*4)(in), x2; \
+ movdqu (3*4*4)(in), x3; \
+ \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ \
+ movdqu x0, (0*4*4)(out); \
+ movdqu x1, (1*4*4)(out); \
+ movdqu x2, (2*4*4)(out); \
+ movdqu x3, (3*4*4)(out);
+
+#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ \
+ movdqu (0*4*4)(out), t0; \
+ pxor t0, x0; \
+ movdqu x0, (0*4*4)(out); \
+ movdqu (1*4*4)(out), t0; \
+ pxor t0, x1; \
+ movdqu x1, (1*4*4)(out); \
+ movdqu (2*4*4)(out), t0; \
+ pxor t0, x2; \
+ movdqu x2, (2*4*4)(out); \
+ movdqu (3*4*4)(out), t0; \
+ pxor t0, x3; \
+ movdqu x3, (3*4*4)(out);
+
+SYM_FUNC_START(__serpent_enc_blk_4way)
+ /* input:
+ * arg_ctx(%esp): ctx, CTX
+ * arg_dst(%esp): dst
+ * arg_src(%esp): src
+ * arg_xor(%esp): bool, if true: xor output
+ */
+
+ pcmpeqd RNOT, RNOT;
+
+ movl arg_ctx(%esp), CTX;
+
+ movl arg_src(%esp), %eax;
+ read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
+
+ K(RA, RB, RC, RD, RE, 0);
+ S0(RA, RB, RC, RD, RE); LK(RC, RB, RD, RA, RE, 1);
+ S1(RC, RB, RD, RA, RE); LK(RE, RD, RA, RC, RB, 2);
+ S2(RE, RD, RA, RC, RB); LK(RB, RD, RE, RC, RA, 3);
+ S3(RB, RD, RE, RC, RA); LK(RC, RA, RD, RB, RE, 4);
+ S4(RC, RA, RD, RB, RE); LK(RA, RD, RB, RE, RC, 5);
+ S5(RA, RD, RB, RE, RC); LK(RC, RA, RD, RE, RB, 6);
+ S6(RC, RA, RD, RE, RB); LK(RD, RB, RA, RE, RC, 7);
+ S7(RD, RB, RA, RE, RC); LK(RC, RA, RE, RD, RB, 8);
+ S0(RC, RA, RE, RD, RB); LK(RE, RA, RD, RC, RB, 9);
+ S1(RE, RA, RD, RC, RB); LK(RB, RD, RC, RE, RA, 10);
+ S2(RB, RD, RC, RE, RA); LK(RA, RD, RB, RE, RC, 11);
+ S3(RA, RD, RB, RE, RC); LK(RE, RC, RD, RA, RB, 12);
+ S4(RE, RC, RD, RA, RB); LK(RC, RD, RA, RB, RE, 13);
+ S5(RC, RD, RA, RB, RE); LK(RE, RC, RD, RB, RA, 14);
+ S6(RE, RC, RD, RB, RA); LK(RD, RA, RC, RB, RE, 15);
+ S7(RD, RA, RC, RB, RE); LK(RE, RC, RB, RD, RA, 16);
+ S0(RE, RC, RB, RD, RA); LK(RB, RC, RD, RE, RA, 17);
+ S1(RB, RC, RD, RE, RA); LK(RA, RD, RE, RB, RC, 18);
+ S2(RA, RD, RE, RB, RC); LK(RC, RD, RA, RB, RE, 19);
+ S3(RC, RD, RA, RB, RE); LK(RB, RE, RD, RC, RA, 20);
+ S4(RB, RE, RD, RC, RA); LK(RE, RD, RC, RA, RB, 21);
+ S5(RE, RD, RC, RA, RB); LK(RB, RE, RD, RA, RC, 22);
+ S6(RB, RE, RD, RA, RC); LK(RD, RC, RE, RA, RB, 23);
+ S7(RD, RC, RE, RA, RB); LK(RB, RE, RA, RD, RC, 24);
+ S0(RB, RE, RA, RD, RC); LK(RA, RE, RD, RB, RC, 25);
+ S1(RA, RE, RD, RB, RC); LK(RC, RD, RB, RA, RE, 26);
+ S2(RC, RD, RB, RA, RE); LK(RE, RD, RC, RA, RB, 27);
+ S3(RE, RD, RC, RA, RB); LK(RA, RB, RD, RE, RC, 28);
+ S4(RA, RB, RD, RE, RC); LK(RB, RD, RE, RC, RA, 29);
+ S5(RB, RD, RE, RC, RA); LK(RA, RB, RD, RC, RE, 30);
+ S6(RA, RB, RD, RC, RE); LK(RD, RE, RB, RC, RA, 31);
+ S7(RD, RE, RB, RC, RA); K(RA, RB, RC, RD, RE, 32);
+
+ movl arg_dst(%esp), %eax;
+
+ cmpb $0, arg_xor(%esp);
+ jnz .L__enc_xor4;
+
+ write_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
+
+ RET;
+
+.L__enc_xor4:
+ xor_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
+
+ RET;
+SYM_FUNC_END(__serpent_enc_blk_4way)
+
+SYM_FUNC_START(serpent_dec_blk_4way)
+ /* input:
+ * arg_ctx(%esp): ctx, CTX
+ * arg_dst(%esp): dst
+ * arg_src(%esp): src
+ */
+
+ pcmpeqd RNOT, RNOT;
+
+ movl arg_ctx(%esp), CTX;
+
+ movl arg_src(%esp), %eax;
+ read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
+
+ K(RA, RB, RC, RD, RE, 32);
+ SI7(RA, RB, RC, RD, RE); KL(RB, RD, RA, RE, RC, 31);
+ SI6(RB, RD, RA, RE, RC); KL(RA, RC, RE, RB, RD, 30);
+ SI5(RA, RC, RE, RB, RD); KL(RC, RD, RA, RE, RB, 29);
+ SI4(RC, RD, RA, RE, RB); KL(RC, RA, RB, RE, RD, 28);
+ SI3(RC, RA, RB, RE, RD); KL(RB, RC, RD, RE, RA, 27);
+ SI2(RB, RC, RD, RE, RA); KL(RC, RA, RE, RD, RB, 26);
+ SI1(RC, RA, RE, RD, RB); KL(RB, RA, RE, RD, RC, 25);
+ SI0(RB, RA, RE, RD, RC); KL(RE, RC, RA, RB, RD, 24);
+ SI7(RE, RC, RA, RB, RD); KL(RC, RB, RE, RD, RA, 23);
+ SI6(RC, RB, RE, RD, RA); KL(RE, RA, RD, RC, RB, 22);
+ SI5(RE, RA, RD, RC, RB); KL(RA, RB, RE, RD, RC, 21);
+ SI4(RA, RB, RE, RD, RC); KL(RA, RE, RC, RD, RB, 20);
+ SI3(RA, RE, RC, RD, RB); KL(RC, RA, RB, RD, RE, 19);
+ SI2(RC, RA, RB, RD, RE); KL(RA, RE, RD, RB, RC, 18);
+ SI1(RA, RE, RD, RB, RC); KL(RC, RE, RD, RB, RA, 17);
+ SI0(RC, RE, RD, RB, RA); KL(RD, RA, RE, RC, RB, 16);
+ SI7(RD, RA, RE, RC, RB); KL(RA, RC, RD, RB, RE, 15);
+ SI6(RA, RC, RD, RB, RE); KL(RD, RE, RB, RA, RC, 14);
+ SI5(RD, RE, RB, RA, RC); KL(RE, RC, RD, RB, RA, 13);
+ SI4(RE, RC, RD, RB, RA); KL(RE, RD, RA, RB, RC, 12);
+ SI3(RE, RD, RA, RB, RC); KL(RA, RE, RC, RB, RD, 11);
+ SI2(RA, RE, RC, RB, RD); KL(RE, RD, RB, RC, RA, 10);
+ SI1(RE, RD, RB, RC, RA); KL(RA, RD, RB, RC, RE, 9);
+ SI0(RA, RD, RB, RC, RE); KL(RB, RE, RD, RA, RC, 8);
+ SI7(RB, RE, RD, RA, RC); KL(RE, RA, RB, RC, RD, 7);
+ SI6(RE, RA, RB, RC, RD); KL(RB, RD, RC, RE, RA, 6);
+ SI5(RB, RD, RC, RE, RA); KL(RD, RA, RB, RC, RE, 5);
+ SI4(RD, RA, RB, RC, RE); KL(RD, RB, RE, RC, RA, 4);
+ SI3(RD, RB, RE, RC, RA); KL(RE, RD, RA, RC, RB, 3);
+ SI2(RE, RD, RA, RC, RB); KL(RD, RB, RC, RA, RE, 2);
+ SI1(RD, RB, RC, RA, RE); KL(RE, RB, RC, RA, RD, 1);
+ SI0(RE, RB, RC, RA, RD); K(RC, RD, RB, RE, RA, 0);
+
+ movl arg_dst(%esp), %eax;
+ write_blocks(%eax, RC, RD, RB, RE, RT0, RT1, RA);
+
+ RET;
+SYM_FUNC_END(serpent_dec_blk_4way)
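The get_key(i, j, t) macros in these files address the expanded key as 33 rounds of four 32-bit words, i.e. word (4*i + j), and broadcast it across every lane (movd plus pshufd here, vbroadcastss/vpbroadcastd in the AVX variants). A scalar sketch of that indexing, with illustrative type and function names (the real layout is struct serpent_ctx in <crypto/serpent.h>):

#include <stdint.h>

#define SERPENT_ROUNDS		32
#define SERPENT_EXPKEY_WORDS	(4 * (SERPENT_ROUNDS + 1))	/* 132 words */

struct serpent_expkey_sketch {
	uint32_t expkey[SERPENT_EXPKEY_WORDS];
};

/* Scalar equivalent of get_key(i, j, t): subkey word j of round i. */
static inline uint32_t
serpent_subkey_word(const struct serpent_expkey_sketch *ctx,
		    unsigned int round, unsigned int word)
{
	return ctx->expkey[4 * round + word];
}

The byte displacement (4*(i)+(j))*4(CTX) in the assembly is exactly this word index scaled to bytes.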
diff --git a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
new file mode 100644
index 000000000000..e0998a011d1d
--- /dev/null
+++ b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
@@ -0,0 +1,739 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Serpent Cipher 8-way parallel algorithm (x86_64/SSE2)
+ *
+ * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * Based on crypto/serpent.c by
+ * Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
+ * 2003 Herbert Valerio Riedel <hvr@gnu.org>
+ */
+
+#include <linux/linkage.h>
+
+.file "serpent-sse2-x86_64-asm_64.S"
+.text
+
+#define CTX %rdi
+
+/**********************************************************************
+ 8-way SSE2 serpent
+ **********************************************************************/
+#define RA1 %xmm0
+#define RB1 %xmm1
+#define RC1 %xmm2
+#define RD1 %xmm3
+#define RE1 %xmm4
+
+#define RA2 %xmm5
+#define RB2 %xmm6
+#define RC2 %xmm7
+#define RD2 %xmm8
+#define RE2 %xmm9
+
+#define RNOT %xmm10
+
+#define RK0 %xmm11
+#define RK1 %xmm12
+#define RK2 %xmm13
+#define RK3 %xmm14
+
+#define S0_1(x0, x1, x2, x3, x4) \
+ movdqa x3, x4; \
+ por x0, x3; \
+ pxor x4, x0; \
+ pxor x2, x4; \
+ pxor RNOT, x4; \
+ pxor x1, x3; \
+ pand x0, x1; \
+ pxor x4, x1; \
+ pxor x0, x2;
+#define S0_2(x0, x1, x2, x3, x4) \
+ pxor x3, x0; \
+ por x0, x4; \
+ pxor x2, x0; \
+ pand x1, x2; \
+ pxor x2, x3; \
+ pxor RNOT, x1; \
+ pxor x4, x2; \
+ pxor x2, x1;
+
+#define S1_1(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ pxor x0, x1; \
+ pxor x3, x0; \
+ pxor RNOT, x3; \
+ pand x1, x4; \
+ por x1, x0; \
+ pxor x2, x3; \
+ pxor x3, x0; \
+ pxor x3, x1;
+#define S1_2(x0, x1, x2, x3, x4) \
+ pxor x4, x3; \
+ por x4, x1; \
+ pxor x2, x4; \
+ pand x0, x2; \
+ pxor x1, x2; \
+ por x0, x1; \
+ pxor RNOT, x0; \
+ pxor x2, x0; \
+ pxor x1, x4;
+
+#define S2_1(x0, x1, x2, x3, x4) \
+ pxor RNOT, x3; \
+ pxor x0, x1; \
+ movdqa x0, x4; \
+ pand x2, x0; \
+ pxor x3, x0; \
+ por x4, x3; \
+ pxor x1, x2; \
+ pxor x1, x3; \
+ pand x0, x1;
+#define S2_2(x0, x1, x2, x3, x4) \
+ pxor x2, x0; \
+ pand x3, x2; \
+ por x1, x3; \
+ pxor RNOT, x0; \
+ pxor x0, x3; \
+ pxor x0, x4; \
+ pxor x2, x0; \
+ por x2, x1;
+
+#define S3_1(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ pxor x3, x1; \
+ por x0, x3; \
+ pand x0, x4; \
+ pxor x2, x0; \
+ pxor x1, x2; \
+ pand x3, x1; \
+ pxor x3, x2; \
+ por x4, x0; \
+ pxor x3, x4;
+#define S3_2(x0, x1, x2, x3, x4) \
+ pxor x0, x1; \
+ pand x3, x0; \
+ pand x4, x3; \
+ pxor x2, x3; \
+ por x1, x4; \
+ pand x1, x2; \
+ pxor x3, x4; \
+ pxor x3, x0; \
+ pxor x2, x3;
+
+#define S4_1(x0, x1, x2, x3, x4) \
+ movdqa x3, x4; \
+ pand x0, x3; \
+ pxor x4, x0; \
+ pxor x2, x3; \
+ por x4, x2; \
+ pxor x1, x0; \
+ pxor x3, x4; \
+ por x0, x2; \
+ pxor x1, x2;
+#define S4_2(x0, x1, x2, x3, x4) \
+ pand x0, x1; \
+ pxor x4, x1; \
+ pand x2, x4; \
+ pxor x3, x2; \
+ pxor x0, x4; \
+ por x1, x3; \
+ pxor RNOT, x1; \
+ pxor x0, x3;
+
+#define S5_1(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ por x0, x1; \
+ pxor x1, x2; \
+ pxor RNOT, x3; \
+ pxor x0, x4; \
+ pxor x2, x0; \
+ pand x4, x1; \
+ por x3, x4; \
+ pxor x0, x4;
+#define S5_2(x0, x1, x2, x3, x4) \
+ pand x3, x0; \
+ pxor x3, x1; \
+ pxor x2, x3; \
+ pxor x1, x0; \
+ pand x4, x2; \
+ pxor x2, x1; \
+ pand x0, x2; \
+ pxor x2, x3;
+
+#define S6_1(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ pxor x0, x3; \
+ pxor x2, x1; \
+ pxor x0, x2; \
+ pand x3, x0; \
+ por x3, x1; \
+ pxor RNOT, x4; \
+ pxor x1, x0; \
+ pxor x2, x1;
+#define S6_2(x0, x1, x2, x3, x4) \
+ pxor x4, x3; \
+ pxor x0, x4; \
+ pand x0, x2; \
+ pxor x1, x4; \
+ pxor x3, x2; \
+ pand x1, x3; \
+ pxor x0, x3; \
+ pxor x2, x1;
+
+#define S7_1(x0, x1, x2, x3, x4) \
+ pxor RNOT, x1; \
+ movdqa x1, x4; \
+ pxor RNOT, x0; \
+ pand x2, x1; \
+ pxor x3, x1; \
+ por x4, x3; \
+ pxor x2, x4; \
+ pxor x3, x2; \
+ pxor x0, x3; \
+ por x1, x0;
+#define S7_2(x0, x1, x2, x3, x4) \
+ pand x0, x2; \
+ pxor x4, x0; \
+ pxor x3, x4; \
+ pand x0, x3; \
+ pxor x1, x4; \
+ pxor x4, x2; \
+ pxor x1, x3; \
+ por x0, x4; \
+ pxor x1, x4;
+
+#define SI0_1(x0, x1, x2, x3, x4) \
+ movdqa x3, x4; \
+ pxor x0, x1; \
+ por x1, x3; \
+ pxor x1, x4; \
+ pxor RNOT, x0; \
+ pxor x3, x2; \
+ pxor x0, x3; \
+ pand x1, x0; \
+ pxor x2, x0;
+#define SI0_2(x0, x1, x2, x3, x4) \
+ pand x3, x2; \
+ pxor x4, x3; \
+ pxor x3, x2; \
+ pxor x3, x1; \
+ pand x0, x3; \
+ pxor x0, x1; \
+ pxor x2, x0; \
+ pxor x3, x4;
+
+#define SI1_1(x0, x1, x2, x3, x4) \
+ pxor x3, x1; \
+ movdqa x0, x4; \
+ pxor x2, x0; \
+ pxor RNOT, x2; \
+ por x1, x4; \
+ pxor x3, x4; \
+ pand x1, x3; \
+ pxor x2, x1; \
+ pand x4, x2;
+#define SI1_2(x0, x1, x2, x3, x4) \
+ pxor x1, x4; \
+ por x3, x1; \
+ pxor x0, x3; \
+ pxor x0, x2; \
+ por x4, x0; \
+ pxor x4, x2; \
+ pxor x0, x1; \
+ pxor x1, x4;
+
+#define SI2_1(x0, x1, x2, x3, x4) \
+ pxor x1, x2; \
+ movdqa x3, x4; \
+ pxor RNOT, x3; \
+ por x2, x3; \
+ pxor x4, x2; \
+ pxor x0, x4; \
+ pxor x1, x3; \
+ por x2, x1; \
+ pxor x0, x2;
+#define SI2_2(x0, x1, x2, x3, x4) \
+ pxor x4, x1; \
+ por x3, x4; \
+ pxor x3, x2; \
+ pxor x2, x4; \
+ pand x1, x2; \
+ pxor x3, x2; \
+ pxor x4, x3; \
+ pxor x0, x4;
+
+#define SI3_1(x0, x1, x2, x3, x4) \
+ pxor x1, x2; \
+ movdqa x1, x4; \
+ pand x2, x1; \
+ pxor x0, x1; \
+ por x4, x0; \
+ pxor x3, x4; \
+ pxor x3, x0; \
+ por x1, x3; \
+ pxor x2, x1;
+#define SI3_2(x0, x1, x2, x3, x4) \
+ pxor x3, x1; \
+ pxor x2, x0; \
+ pxor x3, x2; \
+ pand x1, x3; \
+ pxor x0, x1; \
+ pand x2, x0; \
+ pxor x3, x4; \
+ pxor x0, x3; \
+ pxor x1, x0;
+
+#define SI4_1(x0, x1, x2, x3, x4) \
+ pxor x3, x2; \
+ movdqa x0, x4; \
+ pand x1, x0; \
+ pxor x2, x0; \
+ por x3, x2; \
+ pxor RNOT, x4; \
+ pxor x0, x1; \
+ pxor x2, x0; \
+ pand x4, x2;
+#define SI4_2(x0, x1, x2, x3, x4) \
+ pxor x0, x2; \
+ por x4, x0; \
+ pxor x3, x0; \
+ pand x2, x3; \
+ pxor x3, x4; \
+ pxor x1, x3; \
+ pand x0, x1; \
+ pxor x1, x4; \
+ pxor x3, x0;
+
+#define SI5_1(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ por x2, x1; \
+ pxor x4, x2; \
+ pxor x3, x1; \
+ pand x4, x3; \
+ pxor x3, x2; \
+ por x0, x3; \
+ pxor RNOT, x0; \
+ pxor x2, x3; \
+ por x0, x2;
+#define SI5_2(x0, x1, x2, x3, x4) \
+ pxor x1, x4; \
+ pxor x4, x2; \
+ pand x0, x4; \
+ pxor x1, x0; \
+ pxor x3, x1; \
+ pand x2, x0; \
+ pxor x3, x2; \
+ pxor x2, x0; \
+ pxor x4, x2; \
+ pxor x3, x4;
+
+#define SI6_1(x0, x1, x2, x3, x4) \
+ pxor x2, x0; \
+ movdqa x0, x4; \
+ pand x3, x0; \
+ pxor x3, x2; \
+ pxor x2, x0; \
+ pxor x1, x3; \
+ por x4, x2; \
+ pxor x3, x2; \
+ pand x0, x3;
+#define SI6_2(x0, x1, x2, x3, x4) \
+ pxor RNOT, x0; \
+ pxor x1, x3; \
+ pand x2, x1; \
+ pxor x0, x4; \
+ pxor x4, x3; \
+ pxor x2, x4; \
+ pxor x1, x0; \
+ pxor x0, x2;
+
+#define SI7_1(x0, x1, x2, x3, x4) \
+ movdqa x3, x4; \
+ pand x0, x3; \
+ pxor x2, x0; \
+ por x4, x2; \
+ pxor x1, x4; \
+ pxor RNOT, x0; \
+ por x3, x1; \
+ pxor x0, x4; \
+ pand x2, x0; \
+ pxor x1, x0;
+#define SI7_2(x0, x1, x2, x3, x4) \
+ pand x2, x1; \
+ pxor x2, x3; \
+ pxor x3, x4; \
+ pand x3, x2; \
+ por x0, x3; \
+ pxor x4, x1; \
+ pxor x4, x3; \
+ pand x0, x4; \
+ pxor x2, x4;
+
+#define get_key(i, j, t) \
+ movd (4*(i)+(j))*4(CTX), t; \
+ pshufd $0, t, t;
+
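+/*
+ * get_key() loads one 32-bit word of the expanded key (k[4*i+j]) and
+ * broadcasts it to all four lanes of an XMM register, so one pxor applies
+ * it to four blocks at once.  K2 mixes all four subkey words of round i
+ * into both four-block register sets; LK2 fuses the Serpent linear
+ * transform with the key mixing of round i, and KL2 is the inverse
+ * (key mixing, then inverse linear transform) used for decryption.
+ */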
+#define K2(x0, x1, x2, x3, x4, i) \
+ get_key(i, 0, RK0); \
+ get_key(i, 1, RK1); \
+ get_key(i, 2, RK2); \
+ get_key(i, 3, RK3); \
+ pxor RK0, x0 ## 1; \
+ pxor RK1, x1 ## 1; \
+ pxor RK2, x2 ## 1; \
+ pxor RK3, x3 ## 1; \
+ pxor RK0, x0 ## 2; \
+ pxor RK1, x1 ## 2; \
+ pxor RK2, x2 ## 2; \
+ pxor RK3, x3 ## 2;
+
+#define LK2(x0, x1, x2, x3, x4, i) \
+ movdqa x0 ## 1, x4 ## 1; \
+ pslld $13, x0 ## 1; \
+ psrld $(32 - 13), x4 ## 1; \
+ por x4 ## 1, x0 ## 1; \
+ pxor x0 ## 1, x1 ## 1; \
+ movdqa x2 ## 1, x4 ## 1; \
+ pslld $3, x2 ## 1; \
+ psrld $(32 - 3), x4 ## 1; \
+ por x4 ## 1, x2 ## 1; \
+ pxor x2 ## 1, x1 ## 1; \
+ movdqa x0 ## 2, x4 ## 2; \
+ pslld $13, x0 ## 2; \
+ psrld $(32 - 13), x4 ## 2; \
+ por x4 ## 2, x0 ## 2; \
+ pxor x0 ## 2, x1 ## 2; \
+ movdqa x2 ## 2, x4 ## 2; \
+ pslld $3, x2 ## 2; \
+ psrld $(32 - 3), x4 ## 2; \
+ por x4 ## 2, x2 ## 2; \
+ pxor x2 ## 2, x1 ## 2; \
+ movdqa x1 ## 1, x4 ## 1; \
+ pslld $1, x1 ## 1; \
+ psrld $(32 - 1), x4 ## 1; \
+ por x4 ## 1, x1 ## 1; \
+ movdqa x0 ## 1, x4 ## 1; \
+ pslld $3, x4 ## 1; \
+ pxor x2 ## 1, x3 ## 1; \
+ pxor x4 ## 1, x3 ## 1; \
+ movdqa x3 ## 1, x4 ## 1; \
+ get_key(i, 1, RK1); \
+ movdqa x1 ## 2, x4 ## 2; \
+ pslld $1, x1 ## 2; \
+ psrld $(32 - 1), x4 ## 2; \
+ por x4 ## 2, x1 ## 2; \
+ movdqa x0 ## 2, x4 ## 2; \
+ pslld $3, x4 ## 2; \
+ pxor x2 ## 2, x3 ## 2; \
+ pxor x4 ## 2, x3 ## 2; \
+ movdqa x3 ## 2, x4 ## 2; \
+ get_key(i, 3, RK3); \
+ pslld $7, x3 ## 1; \
+ psrld $(32 - 7), x4 ## 1; \
+ por x4 ## 1, x3 ## 1; \
+ movdqa x1 ## 1, x4 ## 1; \
+ pslld $7, x4 ## 1; \
+ pxor x1 ## 1, x0 ## 1; \
+ pxor x3 ## 1, x0 ## 1; \
+ pxor x3 ## 1, x2 ## 1; \
+ pxor x4 ## 1, x2 ## 1; \
+ get_key(i, 0, RK0); \
+ pslld $7, x3 ## 2; \
+ psrld $(32 - 7), x4 ## 2; \
+ por x4 ## 2, x3 ## 2; \
+ movdqa x1 ## 2, x4 ## 2; \
+ pslld $7, x4 ## 2; \
+ pxor x1 ## 2, x0 ## 2; \
+ pxor x3 ## 2, x0 ## 2; \
+ pxor x3 ## 2, x2 ## 2; \
+ pxor x4 ## 2, x2 ## 2; \
+ get_key(i, 2, RK2); \
+ pxor RK1, x1 ## 1; \
+ pxor RK3, x3 ## 1; \
+ movdqa x0 ## 1, x4 ## 1; \
+ pslld $5, x0 ## 1; \
+ psrld $(32 - 5), x4 ## 1; \
+ por x4 ## 1, x0 ## 1; \
+ movdqa x2 ## 1, x4 ## 1; \
+ pslld $22, x2 ## 1; \
+ psrld $(32 - 22), x4 ## 1; \
+ por x4 ## 1, x2 ## 1; \
+ pxor RK0, x0 ## 1; \
+ pxor RK2, x2 ## 1; \
+ pxor RK1, x1 ## 2; \
+ pxor RK3, x3 ## 2; \
+ movdqa x0 ## 2, x4 ## 2; \
+ pslld $5, x0 ## 2; \
+ psrld $(32 - 5), x4 ## 2; \
+ por x4 ## 2, x0 ## 2; \
+ movdqa x2 ## 2, x4 ## 2; \
+ pslld $22, x2 ## 2; \
+ psrld $(32 - 22), x4 ## 2; \
+ por x4 ## 2, x2 ## 2; \
+ pxor RK0, x0 ## 2; \
+ pxor RK2, x2 ## 2;
+
+#define KL2(x0, x1, x2, x3, x4, i) \
+ pxor RK0, x0 ## 1; \
+ pxor RK2, x2 ## 1; \
+ movdqa x0 ## 1, x4 ## 1; \
+ psrld $5, x0 ## 1; \
+ pslld $(32 - 5), x4 ## 1; \
+ por x4 ## 1, x0 ## 1; \
+ pxor RK3, x3 ## 1; \
+ pxor RK1, x1 ## 1; \
+ movdqa x2 ## 1, x4 ## 1; \
+ psrld $22, x2 ## 1; \
+ pslld $(32 - 22), x4 ## 1; \
+ por x4 ## 1, x2 ## 1; \
+ pxor x3 ## 1, x2 ## 1; \
+ pxor RK0, x0 ## 2; \
+ pxor RK2, x2 ## 2; \
+ movdqa x0 ## 2, x4 ## 2; \
+ psrld $5, x0 ## 2; \
+ pslld $(32 - 5), x4 ## 2; \
+ por x4 ## 2, x0 ## 2; \
+ pxor RK3, x3 ## 2; \
+ pxor RK1, x1 ## 2; \
+ movdqa x2 ## 2, x4 ## 2; \
+ psrld $22, x2 ## 2; \
+ pslld $(32 - 22), x4 ## 2; \
+ por x4 ## 2, x2 ## 2; \
+ pxor x3 ## 2, x2 ## 2; \
+ pxor x3 ## 1, x0 ## 1; \
+ movdqa x1 ## 1, x4 ## 1; \
+ pslld $7, x4 ## 1; \
+ pxor x1 ## 1, x0 ## 1; \
+ pxor x4 ## 1, x2 ## 1; \
+ movdqa x1 ## 1, x4 ## 1; \
+ psrld $1, x1 ## 1; \
+ pslld $(32 - 1), x4 ## 1; \
+ por x4 ## 1, x1 ## 1; \
+ pxor x3 ## 2, x0 ## 2; \
+ movdqa x1 ## 2, x4 ## 2; \
+ pslld $7, x4 ## 2; \
+ pxor x1 ## 2, x0 ## 2; \
+ pxor x4 ## 2, x2 ## 2; \
+ movdqa x1 ## 2, x4 ## 2; \
+ psrld $1, x1 ## 2; \
+ pslld $(32 - 1), x4 ## 2; \
+ por x4 ## 2, x1 ## 2; \
+ movdqa x3 ## 1, x4 ## 1; \
+ psrld $7, x3 ## 1; \
+ pslld $(32 - 7), x4 ## 1; \
+ por x4 ## 1, x3 ## 1; \
+ pxor x0 ## 1, x1 ## 1; \
+ movdqa x0 ## 1, x4 ## 1; \
+ pslld $3, x4 ## 1; \
+ pxor x4 ## 1, x3 ## 1; \
+ movdqa x0 ## 1, x4 ## 1; \
+ movdqa x3 ## 2, x4 ## 2; \
+ psrld $7, x3 ## 2; \
+ pslld $(32 - 7), x4 ## 2; \
+ por x4 ## 2, x3 ## 2; \
+ pxor x0 ## 2, x1 ## 2; \
+ movdqa x0 ## 2, x4 ## 2; \
+ pslld $3, x4 ## 2; \
+ pxor x4 ## 2, x3 ## 2; \
+ movdqa x0 ## 2, x4 ## 2; \
+ psrld $13, x0 ## 1; \
+ pslld $(32 - 13), x4 ## 1; \
+ por x4 ## 1, x0 ## 1; \
+ pxor x2 ## 1, x1 ## 1; \
+ pxor x2 ## 1, x3 ## 1; \
+ movdqa x2 ## 1, x4 ## 1; \
+ psrld $3, x2 ## 1; \
+ pslld $(32 - 3), x4 ## 1; \
+ por x4 ## 1, x2 ## 1; \
+ psrld $13, x0 ## 2; \
+ pslld $(32 - 13), x4 ## 2; \
+ por x4 ## 2, x0 ## 2; \
+ pxor x2 ## 2, x1 ## 2; \
+ pxor x2 ## 2, x3 ## 2; \
+ movdqa x2 ## 2, x4 ## 2; \
+ psrld $3, x2 ## 2; \
+ pslld $(32 - 3), x4 ## 2; \
+ por x4 ## 2, x2 ## 2;
+
+#define S(SBOX, x0, x1, x2, x3, x4) \
+ SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+ SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
+
+#define SP(SBOX, x0, x1, x2, x3, x4, i) \
+ get_key(i, 0, RK0); \
+ SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ get_key(i, 2, RK2); \
+ SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+ get_key(i, 3, RK3); \
+ SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ get_key(i, 1, RK1); \
+ SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
+
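+/*
+ * 4x4 32-bit-word matrix transpose: on input each XMM register holds one
+ * 16-byte block; on output each register holds the same state word
+ * (x0..x3) of all four blocks.  This is the layout the bitsliced S-box
+ * and linear-transform macros above operate on.
+ */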
+#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ movdqa x0, t2; \
+ punpckldq x1, x0; \
+ punpckhdq x1, t2; \
+ movdqa x2, t1; \
+ punpckhdq x3, x2; \
+ punpckldq x3, t1; \
+ movdqa x0, x1; \
+ punpcklqdq t1, x0; \
+ punpckhqdq t1, x1; \
+ movdqa t2, x3; \
+ punpcklqdq x2, t2; \
+ punpckhqdq x2, x3; \
+ movdqa t2, x2;
+
+#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
+ movdqu (0*4*4)(in), x0; \
+ movdqu (1*4*4)(in), x1; \
+ movdqu (2*4*4)(in), x2; \
+ movdqu (3*4*4)(in), x3; \
+ \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ \
+ movdqu x0, (0*4*4)(out); \
+ movdqu x1, (1*4*4)(out); \
+ movdqu x2, (2*4*4)(out); \
+ movdqu x3, (3*4*4)(out);
+
+#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ \
+ movdqu (0*4*4)(out), t0; \
+ pxor t0, x0; \
+ movdqu x0, (0*4*4)(out); \
+ movdqu (1*4*4)(out), t0; \
+ pxor t0, x1; \
+ movdqu x1, (1*4*4)(out); \
+ movdqu (2*4*4)(out), t0; \
+ pxor t0, x2; \
+ movdqu x2, (2*4*4)(out); \
+ movdqu (3*4*4)(out), t0; \
+ pxor t0, x3; \
+ movdqu x3, (3*4*4)(out);
+
+SYM_FUNC_START(__serpent_enc_blk_8way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: bool, if true: xor output
+ */
+
+ pcmpeqd RNOT, RNOT;
+
+ leaq (4*4*4)(%rdx), %rax;
+ read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ K2(RA, RB, RC, RD, RE, 0);
+ S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1);
+ S(S1, RC, RB, RD, RA, RE); LK2(RE, RD, RA, RC, RB, 2);
+ S(S2, RE, RD, RA, RC, RB); LK2(RB, RD, RE, RC, RA, 3);
+ S(S3, RB, RD, RE, RC, RA); LK2(RC, RA, RD, RB, RE, 4);
+ S(S4, RC, RA, RD, RB, RE); LK2(RA, RD, RB, RE, RC, 5);
+ S(S5, RA, RD, RB, RE, RC); LK2(RC, RA, RD, RE, RB, 6);
+ S(S6, RC, RA, RD, RE, RB); LK2(RD, RB, RA, RE, RC, 7);
+ S(S7, RD, RB, RA, RE, RC); LK2(RC, RA, RE, RD, RB, 8);
+ S(S0, RC, RA, RE, RD, RB); LK2(RE, RA, RD, RC, RB, 9);
+ S(S1, RE, RA, RD, RC, RB); LK2(RB, RD, RC, RE, RA, 10);
+ S(S2, RB, RD, RC, RE, RA); LK2(RA, RD, RB, RE, RC, 11);
+ S(S3, RA, RD, RB, RE, RC); LK2(RE, RC, RD, RA, RB, 12);
+ S(S4, RE, RC, RD, RA, RB); LK2(RC, RD, RA, RB, RE, 13);
+ S(S5, RC, RD, RA, RB, RE); LK2(RE, RC, RD, RB, RA, 14);
+ S(S6, RE, RC, RD, RB, RA); LK2(RD, RA, RC, RB, RE, 15);
+ S(S7, RD, RA, RC, RB, RE); LK2(RE, RC, RB, RD, RA, 16);
+ S(S0, RE, RC, RB, RD, RA); LK2(RB, RC, RD, RE, RA, 17);
+ S(S1, RB, RC, RD, RE, RA); LK2(RA, RD, RE, RB, RC, 18);
+ S(S2, RA, RD, RE, RB, RC); LK2(RC, RD, RA, RB, RE, 19);
+ S(S3, RC, RD, RA, RB, RE); LK2(RB, RE, RD, RC, RA, 20);
+ S(S4, RB, RE, RD, RC, RA); LK2(RE, RD, RC, RA, RB, 21);
+ S(S5, RE, RD, RC, RA, RB); LK2(RB, RE, RD, RA, RC, 22);
+ S(S6, RB, RE, RD, RA, RC); LK2(RD, RC, RE, RA, RB, 23);
+ S(S7, RD, RC, RE, RA, RB); LK2(RB, RE, RA, RD, RC, 24);
+ S(S0, RB, RE, RA, RD, RC); LK2(RA, RE, RD, RB, RC, 25);
+ S(S1, RA, RE, RD, RB, RC); LK2(RC, RD, RB, RA, RE, 26);
+ S(S2, RC, RD, RB, RA, RE); LK2(RE, RD, RC, RA, RB, 27);
+ S(S3, RE, RD, RC, RA, RB); LK2(RA, RB, RD, RE, RC, 28);
+ S(S4, RA, RB, RD, RE, RC); LK2(RB, RD, RE, RC, RA, 29);
+ S(S5, RB, RD, RE, RC, RA); LK2(RA, RB, RD, RC, RE, 30);
+ S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31);
+ S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32);
+
+ leaq (4*4*4)(%rsi), %rax;
+
+ testb %cl, %cl;
+ jnz .L__enc_xor8;
+
+ write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ RET;
+
+.L__enc_xor8:
+ xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ RET;
+SYM_FUNC_END(__serpent_enc_blk_8way)
+
+SYM_FUNC_START(serpent_dec_blk_8way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ pcmpeqd RNOT, RNOT;
+
+ leaq (4*4*4)(%rdx), %rax;
+ read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ K2(RA, RB, RC, RD, RE, 32);
+ SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31);
+ SP(SI6, RB, RD, RA, RE, RC, 30); KL2(RA, RC, RE, RB, RD, 30);
+ SP(SI5, RA, RC, RE, RB, RD, 29); KL2(RC, RD, RA, RE, RB, 29);
+ SP(SI4, RC, RD, RA, RE, RB, 28); KL2(RC, RA, RB, RE, RD, 28);
+ SP(SI3, RC, RA, RB, RE, RD, 27); KL2(RB, RC, RD, RE, RA, 27);
+ SP(SI2, RB, RC, RD, RE, RA, 26); KL2(RC, RA, RE, RD, RB, 26);
+ SP(SI1, RC, RA, RE, RD, RB, 25); KL2(RB, RA, RE, RD, RC, 25);
+ SP(SI0, RB, RA, RE, RD, RC, 24); KL2(RE, RC, RA, RB, RD, 24);
+ SP(SI7, RE, RC, RA, RB, RD, 23); KL2(RC, RB, RE, RD, RA, 23);
+ SP(SI6, RC, RB, RE, RD, RA, 22); KL2(RE, RA, RD, RC, RB, 22);
+ SP(SI5, RE, RA, RD, RC, RB, 21); KL2(RA, RB, RE, RD, RC, 21);
+ SP(SI4, RA, RB, RE, RD, RC, 20); KL2(RA, RE, RC, RD, RB, 20);
+ SP(SI3, RA, RE, RC, RD, RB, 19); KL2(RC, RA, RB, RD, RE, 19);
+ SP(SI2, RC, RA, RB, RD, RE, 18); KL2(RA, RE, RD, RB, RC, 18);
+ SP(SI1, RA, RE, RD, RB, RC, 17); KL2(RC, RE, RD, RB, RA, 17);
+ SP(SI0, RC, RE, RD, RB, RA, 16); KL2(RD, RA, RE, RC, RB, 16);
+ SP(SI7, RD, RA, RE, RC, RB, 15); KL2(RA, RC, RD, RB, RE, 15);
+ SP(SI6, RA, RC, RD, RB, RE, 14); KL2(RD, RE, RB, RA, RC, 14);
+ SP(SI5, RD, RE, RB, RA, RC, 13); KL2(RE, RC, RD, RB, RA, 13);
+ SP(SI4, RE, RC, RD, RB, RA, 12); KL2(RE, RD, RA, RB, RC, 12);
+ SP(SI3, RE, RD, RA, RB, RC, 11); KL2(RA, RE, RC, RB, RD, 11);
+ SP(SI2, RA, RE, RC, RB, RD, 10); KL2(RE, RD, RB, RC, RA, 10);
+ SP(SI1, RE, RD, RB, RC, RA, 9); KL2(RA, RD, RB, RC, RE, 9);
+ SP(SI0, RA, RD, RB, RC, RE, 8); KL2(RB, RE, RD, RA, RC, 8);
+ SP(SI7, RB, RE, RD, RA, RC, 7); KL2(RE, RA, RB, RC, RD, 7);
+ SP(SI6, RE, RA, RB, RC, RD, 6); KL2(RB, RD, RC, RE, RA, 6);
+ SP(SI5, RB, RD, RC, RE, RA, 5); KL2(RD, RA, RB, RC, RE, 5);
+ SP(SI4, RD, RA, RB, RC, RE, 4); KL2(RD, RB, RE, RC, RA, 4);
+ SP(SI3, RD, RB, RE, RC, RA, 3); KL2(RE, RD, RA, RC, RB, 3);
+ SP(SI2, RE, RD, RA, RC, RB, 2); KL2(RD, RB, RC, RA, RE, 2);
+ SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1);
+ S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0);
+
+ leaq (4*4*4)(%rsi), %rax;
+ write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2);
+ write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);
+
+ RET;
+SYM_FUNC_END(serpent_dec_blk_8way)
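
For reference, each S-box macro above is a pure boolean network. Below is the
S1 network rendered in plain C on 32-bit words: Serpent's state is bitsliced
by definition, so bit i of (x0, x1, x2, x3) forms one 4-bit S-box input and a
scalar word computes 32 of them for one block, while the identical
pxor/pand/por sequence on XMM registers handles four blocks per register and
eight per call. This is an illustrative sketch (the function name is ours);
the caller must pick up the permuted outputs, exactly as the round sequences
in the functions above do.

	#include <stdint.h>

	/* S1, transcribed from the S1_1/S1_2 macros; x4 is scratch. */
	static void serpent_s1(uint32_t *x0, uint32_t *x1, uint32_t *x2,
			       uint32_t *x3, uint32_t *x4)
	{
		*x4 = *x1;	/* movdqa x1, x4 */
		*x1 ^= *x0;
		*x0 ^= *x3;
		*x3 = ~*x3;	/* pxor RNOT, x3 (RNOT is all-ones) */
		*x4 &= *x1;
		*x0 |= *x1;
		*x3 ^= *x2;
		*x0 ^= *x3;
		*x1 ^= *x3;
		*x3 ^= *x4;	/* S1_2 starts here */
		*x1 |= *x4;
		*x4 ^= *x2;
		*x2 &= *x0;
		*x2 ^= *x1;
		*x1 |= *x0;
		*x0 = ~*x0;
		*x0 ^= *x2;
		*x4 ^= *x1;
	}
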
diff --git a/arch/x86/crypto/serpent-sse2.h b/arch/x86/crypto/serpent-sse2.h
new file mode 100644
index 000000000000..860ca248914b
--- /dev/null
+++ b/arch/x86/crypto/serpent-sse2.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ASM_X86_SERPENT_SSE2_H
+#define ASM_X86_SERPENT_SSE2_H
+
+#include <linux/crypto.h>
+#include <crypto/serpent.h>
+
+#ifdef CONFIG_X86_32
+
+#define SERPENT_PARALLEL_BLOCKS 4
+
+asmlinkage void __serpent_enc_blk_4way(const struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src, bool xor);
+asmlinkage void serpent_dec_blk_4way(const struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src);
+
+static inline void serpent_enc_blk_xway(const void *ctx, u8 *dst, const u8 *src)
+{
+ __serpent_enc_blk_4way(ctx, dst, src, false);
+}
+
+static inline void serpent_enc_blk_xway_xor(const struct serpent_ctx *ctx,
+ u8 *dst, const u8 *src)
+{
+ __serpent_enc_blk_4way(ctx, dst, src, true);
+}
+
+static inline void serpent_dec_blk_xway(const void *ctx, u8 *dst, const u8 *src)
+{
+ serpent_dec_blk_4way(ctx, dst, src);
+}
+
+#else
+
+#define SERPENT_PARALLEL_BLOCKS 8
+
+asmlinkage void __serpent_enc_blk_8way(const struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src, bool xor);
+asmlinkage void serpent_dec_blk_8way(const struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src);
+
+static inline void serpent_enc_blk_xway(const void *ctx, u8 *dst, const u8 *src)
+{
+ __serpent_enc_blk_8way(ctx, dst, src, false);
+}
+
+static inline void serpent_enc_blk_xway_xor(const struct serpent_ctx *ctx,
+ u8 *dst, const u8 *src)
+{
+ __serpent_enc_blk_8way(ctx, dst, src, true);
+}
+
+static inline void serpent_dec_blk_xway(const void *ctx, u8 *dst, const u8 *src)
+{
+ serpent_dec_blk_8way(ctx, dst, src);
+}
+
+#endif
+
+#endif
diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c
new file mode 100644
index 000000000000..f5f2121b7956
--- /dev/null
+++ b/arch/x86/crypto/serpent_avx2_glue.c
@@ -0,0 +1,123 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Glue Code for x86_64/AVX2 assembler optimized version of Serpent
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/serpent.h>
+
+#include "serpent-avx.h"
+#include "ecb_cbc_helpers.h"
+
+#define SERPENT_AVX2_PARALLEL_BLOCKS 16
+
+/* 16-way AVX2 parallel cipher functions */
+asmlinkage void serpent_ecb_enc_16way(const void *ctx, u8 *dst, const u8 *src);
+asmlinkage void serpent_ecb_dec_16way(const void *ctx, u8 *dst, const u8 *src);
+asmlinkage void serpent_cbc_dec_16way(const void *ctx, u8 *dst, const u8 *src);
+
+static int serpent_setkey_skcipher(struct crypto_skcipher *tfm,
+ const u8 *key, unsigned int keylen)
+{
+ return __serpent_setkey(crypto_skcipher_ctx(tfm), key, keylen);
+}
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+ ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS);
+ ECB_BLOCK(SERPENT_AVX2_PARALLEL_BLOCKS, serpent_ecb_enc_16way);
+ ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_ecb_enc_8way_avx);
+ ECB_BLOCK(1, __serpent_encrypt);
+ ECB_WALK_END();
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+ ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS);
+ ECB_BLOCK(SERPENT_AVX2_PARALLEL_BLOCKS, serpent_ecb_dec_16way);
+ ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_ecb_dec_8way_avx);
+ ECB_BLOCK(1, __serpent_decrypt);
+ ECB_WALK_END();
+}
+
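+/*
+ * CBC encryption is inherently sequential (each plaintext block is XORed
+ * with the previous ciphertext block before encryption), so it runs one
+ * generic-C block at a time.  CBC decryption has no such dependency, so
+ * runs of ciphertext can be decrypted 16 or 8 blocks in parallel before
+ * the chaining XOR is applied.
+ */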
+static int cbc_encrypt(struct skcipher_request *req)
+{
+ CBC_WALK_START(req, SERPENT_BLOCK_SIZE, -1);
+ CBC_ENC_BLOCK(__serpent_encrypt);
+ CBC_WALK_END();
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ CBC_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS);
+ CBC_DEC_BLOCK(SERPENT_AVX2_PARALLEL_BLOCKS, serpent_cbc_dec_16way);
+ CBC_DEC_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_cbc_dec_8way_avx);
+ CBC_DEC_BLOCK(1, __serpent_decrypt);
+ CBC_WALK_END();
+}
+
+static struct skcipher_alg serpent_algs[] = {
+ {
+ .base.cra_name = "ecb(serpent)",
+ .base.cra_driver_name = "ecb-serpent-avx2",
+ .base.cra_priority = 600,
+ .base.cra_blocksize = SERPENT_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct serpent_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = SERPENT_MIN_KEY_SIZE,
+ .max_keysize = SERPENT_MAX_KEY_SIZE,
+ .setkey = serpent_setkey_skcipher,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ }, {
+ .base.cra_name = "cbc(serpent)",
+ .base.cra_driver_name = "cbc-serpent-avx2",
+ .base.cra_priority = 600,
+ .base.cra_blocksize = SERPENT_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct serpent_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = SERPENT_MIN_KEY_SIZE,
+ .max_keysize = SERPENT_MAX_KEY_SIZE,
+ .ivsize = SERPENT_BLOCK_SIZE,
+ .setkey = serpent_setkey_skcipher,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ },
+};
+
+static int __init serpent_avx2_init(void)
+{
+ const char *feature_name;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+ pr_info("AVX2 instructions are not detected.\n");
+ return -ENODEV;
+ }
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ &feature_name)) {
+ pr_info("CPU feature '%s' is not supported.\n", feature_name);
+ return -ENODEV;
+ }
+
+ return crypto_register_skciphers(serpent_algs,
+ ARRAY_SIZE(serpent_algs));
+}
+
+static void __exit serpent_avx2_fini(void)
+{
+ crypto_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs));
+}
+
+module_init(serpent_avx2_init);
+module_exit(serpent_avx2_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX2 optimized");
+MODULE_ALIAS_CRYPTO("serpent");
+MODULE_ALIAS_CRYPTO("serpent-asm");
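
The ECB_WALK_START/ECB_BLOCK/ECB_WALK_END macros (from ecb_cbc_helpers.h)
walk the request and, for each run of full blocks, use the widest
implementation that still fits. Stripped of the skcipher-walk and
kernel-FPU bookkeeping those helpers also handle, the tiered dispatch in
ecb_encrypt() above behaves roughly like the following user-space sketch
(the stand-in function names are invented for illustration):

	#include <stddef.h>
	#include <stdint.h>

	#define BLK 16	/* SERPENT_BLOCK_SIZE */

	/* Assumed stand-ins for the asm/C entry points. */
	void serpent_ecb_enc_16way(const void *ctx, uint8_t *dst, const uint8_t *src);
	void serpent_ecb_enc_8way(const void *ctx, uint8_t *dst, const uint8_t *src);
	void serpent_encrypt_one(const void *ctx, uint8_t *dst, const uint8_t *src);

	static void ecb_encrypt_tiered(const void *ctx, uint8_t *dst,
				       const uint8_t *src, size_t nbytes)
	{
		while (nbytes >= 16 * BLK) {	/* widest first: 16-way AVX2 */
			serpent_ecb_enc_16way(ctx, dst, src);
			src += 16 * BLK; dst += 16 * BLK; nbytes -= 16 * BLK;
		}
		while (nbytes >= 8 * BLK) {	/* then 8-way AVX */
			serpent_ecb_enc_8way(ctx, dst, src);
			src += 8 * BLK; dst += 8 * BLK; nbytes -= 8 * BLK;
		}
		while (nbytes >= BLK) {		/* finally one block at a time */
			serpent_encrypt_one(ctx, dst, src);
			src += BLK; dst += BLK; nbytes -= BLK;
		}
	}
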
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
new file mode 100644
index 000000000000..9c8b3a335d5c
--- /dev/null
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Glue Code for AVX assembler versions of Serpent Cipher
+ *
+ * Copyright (C) 2012 Johannes Goetzfried
+ * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
+ *
+ * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <linux/export.h>
+#include <crypto/algapi.h>
+#include <crypto/serpent.h>
+
+#include "serpent-avx.h"
+#include "ecb_cbc_helpers.h"
+
+/* 8-way parallel cipher functions */
+asmlinkage void serpent_ecb_enc_8way_avx(const void *ctx, u8 *dst,
+ const u8 *src);
+EXPORT_SYMBOL_GPL(serpent_ecb_enc_8way_avx);
+
+asmlinkage void serpent_ecb_dec_8way_avx(const void *ctx, u8 *dst,
+ const u8 *src);
+EXPORT_SYMBOL_GPL(serpent_ecb_dec_8way_avx);
+
+asmlinkage void serpent_cbc_dec_8way_avx(const void *ctx, u8 *dst,
+ const u8 *src);
+EXPORT_SYMBOL_GPL(serpent_cbc_dec_8way_avx);
+
+static int serpent_setkey_skcipher(struct crypto_skcipher *tfm,
+ const u8 *key, unsigned int keylen)
+{
+ return __serpent_setkey(crypto_skcipher_ctx(tfm), key, keylen);
+}
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+ ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS);
+ ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_ecb_enc_8way_avx);
+ ECB_BLOCK(1, __serpent_encrypt);
+ ECB_WALK_END();
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+ ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS);
+ ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_ecb_dec_8way_avx);
+ ECB_BLOCK(1, __serpent_decrypt);
+ ECB_WALK_END();
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+ CBC_WALK_START(req, SERPENT_BLOCK_SIZE, -1);
+ CBC_ENC_BLOCK(__serpent_encrypt);
+ CBC_WALK_END();
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ CBC_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS);
+ CBC_DEC_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_cbc_dec_8way_avx);
+ CBC_DEC_BLOCK(1, __serpent_decrypt);
+ CBC_WALK_END();
+}
+
+static struct skcipher_alg serpent_algs[] = {
+ {
+ .base.cra_name = "ecb(serpent)",
+ .base.cra_driver_name = "ecb-serpent-avx",
+ .base.cra_priority = 500,
+ .base.cra_blocksize = SERPENT_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct serpent_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = SERPENT_MIN_KEY_SIZE,
+ .max_keysize = SERPENT_MAX_KEY_SIZE,
+ .setkey = serpent_setkey_skcipher,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ }, {
+ .base.cra_name = "cbc(serpent)",
+ .base.cra_driver_name = "cbc-serpent-avx",
+ .base.cra_priority = 500,
+ .base.cra_blocksize = SERPENT_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct serpent_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = SERPENT_MIN_KEY_SIZE,
+ .max_keysize = SERPENT_MAX_KEY_SIZE,
+ .ivsize = SERPENT_BLOCK_SIZE,
+ .setkey = serpent_setkey_skcipher,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ },
+};
+
+static int __init serpent_init(void)
+{
+ const char *feature_name;
+
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ &feature_name)) {
+ pr_info("CPU feature '%s' is not supported.\n", feature_name);
+ return -ENODEV;
+ }
+
+ return crypto_register_skciphers(serpent_algs,
+ ARRAY_SIZE(serpent_algs));
+}
+
+static void __exit serpent_exit(void)
+{
+ crypto_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs));
+}
+
+module_init(serpent_init);
+module_exit(serpent_exit);
+
+MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX optimized");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_CRYPTO("serpent");
diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
new file mode 100644
index 000000000000..80ee17ec21b4
--- /dev/null
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -0,0 +1,124 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Glue Code for SSE2 assembler versions of Serpent Cipher
+ *
+ * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * Glue code based on aesni-intel_glue.c by:
+ * Copyright (C) 2008, Intel Corp.
+ * Author: Huang Ying <ying.huang@intel.com>
+ *
+ * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
+ * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/b128ops.h>
+#include <crypto/serpent.h>
+
+#include "serpent-sse2.h"
+#include "ecb_cbc_helpers.h"
+
+static int serpent_setkey_skcipher(struct crypto_skcipher *tfm,
+ const u8 *key, unsigned int keylen)
+{
+ return __serpent_setkey(crypto_skcipher_ctx(tfm), key, keylen);
+}
+
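+/*
+ * Unlike the AVX/AVX2 versions there is no assembler CBC helper here, so
+ * the chaining XOR for a parallel bundle is done in C.  When decrypting
+ * in place the ciphertext blocks are first copied to a stack buffer,
+ * because serpent_dec_blk_xway() overwrites them before they are needed
+ * as the previous-block input to crypto_xor().
+ */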
+static void serpent_decrypt_cbc_xway(const void *ctx, u8 *dst, const u8 *src)
+{
+ u8 buf[SERPENT_PARALLEL_BLOCKS - 1][SERPENT_BLOCK_SIZE];
+ const u8 *s = src;
+
+ if (dst == src)
+ s = memcpy(buf, src, sizeof(buf));
+ serpent_dec_blk_xway(ctx, dst, src);
+ crypto_xor(dst + SERPENT_BLOCK_SIZE, s, sizeof(buf));
+}
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+ ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS);
+ ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_enc_blk_xway);
+ ECB_BLOCK(1, __serpent_encrypt);
+ ECB_WALK_END();
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+ ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS);
+ ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_dec_blk_xway);
+ ECB_BLOCK(1, __serpent_decrypt);
+ ECB_WALK_END();
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+ CBC_WALK_START(req, SERPENT_BLOCK_SIZE, -1);
+ CBC_ENC_BLOCK(__serpent_encrypt);
+ CBC_WALK_END();
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ CBC_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS);
+ CBC_DEC_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_decrypt_cbc_xway);
+ CBC_DEC_BLOCK(1, __serpent_decrypt);
+ CBC_WALK_END();
+}
+
+static struct skcipher_alg serpent_algs[] = {
+ {
+ .base.cra_name = "ecb(serpent)",
+ .base.cra_driver_name = "ecb-serpent-sse2",
+ .base.cra_priority = 400,
+ .base.cra_blocksize = SERPENT_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct serpent_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = SERPENT_MIN_KEY_SIZE,
+ .max_keysize = SERPENT_MAX_KEY_SIZE,
+ .setkey = serpent_setkey_skcipher,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ }, {
+ .base.cra_name = "cbc(serpent)",
+ .base.cra_driver_name = "cbc-serpent-sse2",
+ .base.cra_priority = 400,
+ .base.cra_blocksize = SERPENT_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct serpent_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = SERPENT_MIN_KEY_SIZE,
+ .max_keysize = SERPENT_MAX_KEY_SIZE,
+ .ivsize = SERPENT_BLOCK_SIZE,
+ .setkey = serpent_setkey_skcipher,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ },
+};
+
+static int __init serpent_sse2_init(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_XMM2)) {
+ printk(KERN_INFO "SSE2 instructions are not detected.\n");
+ return -ENODEV;
+ }
+
+ return crypto_register_skciphers(serpent_algs,
+ ARRAY_SIZE(serpent_algs));
+}
+
+static void __exit serpent_sse2_exit(void)
+{
+ crypto_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs));
+}
+
+module_init(serpent_sse2_init);
+module_exit(serpent_sse2_exit);
+
+MODULE_DESCRIPTION("Serpent Cipher Algorithm, SSE2 optimized");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_CRYPTO("serpent");
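
serpent_decrypt_cbc_xway() above is the parallel form of the standard CBC
recurrence P[i] = D(C[i]) ^ C[i-1]: the block decryptions are independent,
and only the final XOR consumes the previous ciphertext block. The scalar
sketch below (hypothetical helper name) shows the same in-place hazard the
stack buffer in that function guards against: once D() has written into
dst == src, the old ciphertext must already have been saved for the
chaining XOR.

	#include <stdint.h>
	#include <string.h>

	#define BLK 16	/* SERPENT_BLOCK_SIZE */

	void serpent_dec_one(const void *ctx, uint8_t *dst, const uint8_t *src); /* assumed */

	static void cbc_decrypt_sketch(const void *ctx, uint8_t *dst,
				       const uint8_t *src, size_t nblocks,
				       uint8_t iv[BLK])
	{
		uint8_t prev[BLK], save[BLK];
		size_t i, j;

		memcpy(prev, iv, BLK);
		for (i = 0; i < nblocks; i++) {
			/* src may alias dst: keep the ciphertext for chaining */
			memcpy(save, src + i * BLK, BLK);
			serpent_dec_one(ctx, dst + i * BLK, src + i * BLK);
			for (j = 0; j < BLK; j++)
				dst[i * BLK + j] ^= prev[j];
			memcpy(prev, save, BLK);
		}
		memcpy(iv, prev, BLK);	/* next chunk chains from the last block */
	}
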
diff --git a/arch/x86/crypto/sm3-avx-asm_64.S b/arch/x86/crypto/sm3-avx-asm_64.S
new file mode 100644
index 000000000000..503bab450a91
--- /dev/null
+++ b/arch/x86/crypto/sm3-avx-asm_64.S
@@ -0,0 +1,517 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SM3 AVX accelerated transform.
+ * as specified in: https://datatracker.ietf.org/doc/html/draft-sca-cfrg-sm3-02
+ *
+ * Copyright (C) 2021 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+
+/* Based on SM3 AVX/BMI2 accelerated work by libgcrypt at:
+ * https://gnupg.org/software/libgcrypt/index.html
+ */
+
+#include <linux/linkage.h>
+#include <linux/cfi_types.h>
+#include <asm/frame.h>
+
+/* Context structure */
+
+#define state_h0 0
+#define state_h1 4
+#define state_h2 8
+#define state_h3 12
+#define state_h4 16
+#define state_h5 20
+#define state_h6 24
+#define state_h7 28
+
+/* Constants */
+
+/* Round constant macros */
+
+#define K0 2043430169 /* 0x79cc4519 */
+#define K1 -208106958 /* 0xf3988a32 */
+#define K2 -416213915 /* 0xe7311465 */
+#define K3 -832427829 /* 0xce6228cb */
+#define K4 -1664855657 /* 0x9cc45197 */
+#define K5 965255983 /* 0x3988a32f */
+#define K6 1930511966 /* 0x7311465e */
+#define K7 -433943364 /* 0xe6228cbc */
+#define K8 -867886727 /* 0xcc451979 */
+#define K9 -1735773453 /* 0x988a32f3 */
+#define K10 823420391 /* 0x311465e7 */
+#define K11 1646840782 /* 0x6228cbce */
+#define K12 -1001285732 /* 0xc451979c */
+#define K13 -2002571463 /* 0x88a32f39 */
+#define K14 289824371 /* 0x11465e73 */
+#define K15 579648742 /* 0x228cbce6 */
+#define K16 -1651869049 /* 0x9d8a7a87 */
+#define K17 991229199 /* 0x3b14f50f */
+#define K18 1982458398 /* 0x7629ea1e */
+#define K19 -330050500 /* 0xec53d43c */
+#define K20 -660100999 /* 0xd8a7a879 */
+#define K21 -1320201997 /* 0xb14f50f3 */
+#define K22 1654563303 /* 0x629ea1e7 */
+#define K23 -985840690 /* 0xc53d43ce */
+#define K24 -1971681379 /* 0x8a7a879d */
+#define K25 351604539 /* 0x14f50f3b */
+#define K26 703209078 /* 0x29ea1e76 */
+#define K27 1406418156 /* 0x53d43cec */
+#define K28 -1482130984 /* 0xa7a879d8 */
+#define K29 1330705329 /* 0x4f50f3b1 */
+#define K30 -1633556638 /* 0x9ea1e762 */
+#define K31 1027854021 /* 0x3d43cec5 */
+#define K32 2055708042 /* 0x7a879d8a */
+#define K33 -183551212 /* 0xf50f3b14 */
+#define K34 -367102423 /* 0xea1e7629 */
+#define K35 -734204845 /* 0xd43cec53 */
+#define K36 -1468409689 /* 0xa879d8a7 */
+#define K37 1358147919 /* 0x50f3b14f */
+#define K38 -1578671458 /* 0xa1e7629e */
+#define K39 1137624381 /* 0x43cec53d */
+#define K40 -2019718534 /* 0x879d8a7a */
+#define K41 255530229 /* 0x0f3b14f5 */
+#define K42 511060458 /* 0x1e7629ea */
+#define K43 1022120916 /* 0x3cec53d4 */
+#define K44 2044241832 /* 0x79d8a7a8 */
+#define K45 -206483632 /* 0xf3b14f50 */
+#define K46 -412967263 /* 0xe7629ea1 */
+#define K47 -825934525 /* 0xcec53d43 */
+#define K48 -1651869049 /* 0x9d8a7a87 */
+#define K49 991229199 /* 0x3b14f50f */
+#define K50 1982458398 /* 0x7629ea1e */
+#define K51 -330050500 /* 0xec53d43c */
+#define K52 -660100999 /* 0xd8a7a879 */
+#define K53 -1320201997 /* 0xb14f50f3 */
+#define K54 1654563303 /* 0x629ea1e7 */
+#define K55 -985840690 /* 0xc53d43ce */
+#define K56 -1971681379 /* 0x8a7a879d */
+#define K57 351604539 /* 0x14f50f3b */
+#define K58 703209078 /* 0x29ea1e76 */
+#define K59 1406418156 /* 0x53d43cec */
+#define K60 -1482130984 /* 0xa7a879d8 */
+#define K61 1330705329 /* 0x4f50f3b1 */
+#define K62 -1633556638 /* 0x9ea1e762 */
+#define K63 1027854021 /* 0x3d43cec5 */
+
+/* Register macros */
+
+#define RSTATE %rdi
+#define RDATA %rsi
+#define RNBLKS %rdx
+
+#define t0 %eax
+#define t1 %ebx
+#define t2 %ecx
+
+#define a %r8d
+#define b %r9d
+#define c %r10d
+#define d %r11d
+#define e %r12d
+#define f %r13d
+#define g %r14d
+#define h %r15d
+
+#define W0 %xmm0
+#define W1 %xmm1
+#define W2 %xmm2
+#define W3 %xmm3
+#define W4 %xmm4
+#define W5 %xmm5
+
+#define XTMP0 %xmm6
+#define XTMP1 %xmm7
+#define XTMP2 %xmm8
+#define XTMP3 %xmm9
+#define XTMP4 %xmm10
+#define XTMP5 %xmm11
+#define XTMP6 %xmm12
+
+#define BSWAP_REG %xmm15
+
+/* Stack structure */
+
+#define STACK_W_SIZE (32 * 2 * 3)
+#define STACK_REG_SAVE_SIZE (64)
+
+#define STACK_W (0)
+#define STACK_REG_SAVE (STACK_W + STACK_W_SIZE)
+#define STACK_SIZE (STACK_REG_SAVE + STACK_REG_SAVE_SIZE)
+
+/* Instruction helpers. */
+
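+/*
+ * roll3() uses the BMI2 rorxl instruction (a non-destructive rotate that
+ * leaves EFLAGS alone), which is one reason the glue code also requires
+ * X86_FEATURE_BMI2; addl2() uses leal so additions do not clobber EFLAGS
+ * either.
+ */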
+#define roll2(v, reg) \
+ roll $(v), reg;
+
+#define roll3mov(v, src, dst) \
+ movl src, dst; \
+ roll $(v), dst;
+
+#define roll3(v, src, dst) \
+ rorxl $(32-(v)), src, dst;
+
+#define addl2(a, out) \
+ leal (a, out), out;
+
+/* Round function macros. */
+
+#define GG1(x, y, z, o, t) \
+ movl x, o; \
+ xorl y, o; \
+ xorl z, o;
+
+#define FF1(x, y, z, o, t) GG1(x, y, z, o, t)
+
+#define GG2(x, y, z, o, t) \
+ andnl z, x, o; \
+ movl y, t; \
+ andl x, t; \
+ addl2(t, o);
+
+#define FF2(x, y, z, o, t) \
+ movl y, o; \
+ xorl x, o; \
+ movl y, t; \
+ andl x, t; \
+ andl z, o; \
+ xorl t, o;
+
+#define R(i, a, b, c, d, e, f, g, h, round, widx, wtype) \
+ /* rol(a, 12) => t0 */ \
+ roll3mov(12, a, t0); /* rorxl here would reduce perf by 6% on zen3 */ \
+ /* rol (t0 + e + t), 7) => t1 */ \
+ leal K##round(t0, e, 1), t1; \
+ roll2(7, t1); \
+ /* h + w1 => h */ \
+ addl wtype##_W1_ADDR(round, widx), h; \
+ /* h + t1 => h */ \
+ addl2(t1, h); \
+ /* t1 ^ t0 => t0 */ \
+ xorl t1, t0; \
+ /* w1w2 + d => d */ \
+ addl wtype##_W1W2_ADDR(round, widx), d; \
+ /* FF##i(a,b,c) => t1 */ \
+ FF##i(a, b, c, t1, t2); \
+ /* d + t1 => d */ \
+ addl2(t1, d); \
+ /* GG#i(e,f,g) => t2 */ \
+ GG##i(e, f, g, t2, t1); \
+ /* h + t2 => h */ \
+ addl2(t2, h); \
+ /* rol (f, 19) => f */ \
+ roll2(19, f); \
+ /* d + t0 => d */ \
+ addl2(t0, d); \
+ /* rol (b, 9) => b */ \
+ roll2(9, b); \
+ /* P0(h) => h */ \
+ roll3(9, h, t2); \
+ roll3(17, h, t1); \
+ xorl t2, h; \
+ xorl t1, h;
+
+#define R1(a, b, c, d, e, f, g, h, round, widx, wtype) \
+ R(1, a, b, c, d, e, f, g, h, round, widx, wtype)
+
+#define R2(a, b, c, d, e, f, g, h, round, widx, wtype) \
+ R(2, a, b, c, d, e, f, g, h, round, widx, wtype)
+
+/* Input expansion macros. */
+
+/* Byte-swapped input address. */
+#define IW_W_ADDR(round, widx, offs) \
+ (STACK_W + ((round) / 4) * 64 + (offs) + ((widx) * 4))(%rsp)
+
+/* Expanded input address. */
+#define XW_W_ADDR(round, widx, offs) \
+ (STACK_W + ((((round) / 3) - 4) % 2) * 64 + (offs) + ((widx) * 4))(%rsp)
+
+/* Rounds 1-12, byte-swapped input block addresses. */
+#define IW_W1_ADDR(round, widx) IW_W_ADDR(round, widx, 0)
+#define IW_W1W2_ADDR(round, widx) IW_W_ADDR(round, widx, 32)
+
+/* Rounds 1-12, expanded input block addresses. */
+#define XW_W1_ADDR(round, widx) XW_W_ADDR(round, widx, 0)
+#define XW_W1W2_ADDR(round, widx) XW_W_ADDR(round, widx, 32)
+
+/* Input block loading. */
+#define LOAD_W_XMM_1() \
+ vmovdqu 0*16(RDATA), XTMP0; /* XTMP0: w3, w2, w1, w0 */ \
+ vmovdqu 1*16(RDATA), XTMP1; /* XTMP1: w7, w6, w5, w4 */ \
+ vmovdqu 2*16(RDATA), XTMP2; /* XTMP2: w11, w10, w9, w8 */ \
+ vmovdqu 3*16(RDATA), XTMP3; /* XTMP3: w15, w14, w13, w12 */ \
+ vpshufb BSWAP_REG, XTMP0, XTMP0; \
+ vpshufb BSWAP_REG, XTMP1, XTMP1; \
+ vpshufb BSWAP_REG, XTMP2, XTMP2; \
+ vpshufb BSWAP_REG, XTMP3, XTMP3; \
+ vpxor XTMP0, XTMP1, XTMP4; \
+ vpxor XTMP1, XTMP2, XTMP5; \
+ vpxor XTMP2, XTMP3, XTMP6; \
+ leaq 64(RDATA), RDATA; \
+ vmovdqa XTMP0, IW_W1_ADDR(0, 0); \
+ vmovdqa XTMP4, IW_W1W2_ADDR(0, 0); \
+ vmovdqa XTMP1, IW_W1_ADDR(4, 0); \
+ vmovdqa XTMP5, IW_W1W2_ADDR(4, 0);
+
+#define LOAD_W_XMM_2() \
+ vmovdqa XTMP2, IW_W1_ADDR(8, 0); \
+ vmovdqa XTMP6, IW_W1W2_ADDR(8, 0);
+
+#define LOAD_W_XMM_3() \
+ vpshufd $0b00000000, XTMP0, W0; /* W0: xx, w0, xx, xx */ \
+ vpshufd $0b11111001, XTMP0, W1; /* W1: xx, w3, w2, w1 */ \
+ vmovdqa XTMP1, W2; /* W2: xx, w6, w5, w4 */ \
+ vpalignr $12, XTMP1, XTMP2, W3; /* W3: xx, w9, w8, w7 */ \
+ vpalignr $8, XTMP2, XTMP3, W4; /* W4: xx, w12, w11, w10 */ \
+ vpshufd $0b11111001, XTMP3, W5; /* W5: xx, w15, w14, w13 */
+
+/* Message scheduling. Note: 3 words per XMM register. */
+#define SCHED_W_0(round, w0, w1, w2, w3, w4, w5) \
+ /* Load (w[i - 16]) => XTMP0 */ \
+ vpshufd $0b10111111, w0, XTMP0; \
+ vpalignr $12, XTMP0, w1, XTMP0; /* XTMP0: xx, w2, w1, w0 */ \
+ /* Load (w[i - 13]) => XTMP1 */ \
+ vpshufd $0b10111111, w1, XTMP1; \
+ vpalignr $12, XTMP1, w2, XTMP1; \
+ /* w[i - 9] == w3 */ \
+ /* XMM3 ^ XTMP0 => XTMP0 */ \
+ vpxor w3, XTMP0, XTMP0;
+
+#define SCHED_W_1(round, w0, w1, w2, w3, w4, w5) \
+ /* w[i - 3] == w5 */ \
+ /* rol(XMM5, 15) ^ XTMP0 => XTMP0 */ \
+ vpslld $15, w5, XTMP2; \
+ vpsrld $(32-15), w5, XTMP3; \
+ vpxor XTMP2, XTMP3, XTMP3; \
+ vpxor XTMP3, XTMP0, XTMP0; \
+ /* rol(XTMP1, 7) => XTMP1 */ \
+ vpslld $7, XTMP1, XTMP5; \
+ vpsrld $(32-7), XTMP1, XTMP1; \
+ vpxor XTMP5, XTMP1, XTMP1; \
+ /* XMM4 ^ XTMP1 => XTMP1 */ \
+ vpxor w4, XTMP1, XTMP1; \
+ /* w[i - 6] == XMM4 */ \
+ /* P1(XTMP0) ^ XTMP1 => XMM0 */ \
+ vpslld $15, XTMP0, XTMP5; \
+ vpsrld $(32-15), XTMP0, XTMP6; \
+ vpslld $23, XTMP0, XTMP2; \
+ vpsrld $(32-23), XTMP0, XTMP3; \
+ vpxor XTMP0, XTMP1, XTMP1; \
+ vpxor XTMP6, XTMP5, XTMP5; \
+ vpxor XTMP3, XTMP2, XTMP2; \
+ vpxor XTMP2, XTMP5, XTMP5; \
+ vpxor XTMP5, XTMP1, w0;
+
+#define SCHED_W_2(round, w0, w1, w2, w3, w4, w5) \
+ /* W1 in XMM12 */ \
+ vpshufd $0b10111111, w4, XTMP4; \
+ vpalignr $12, XTMP4, w5, XTMP4; \
+ vmovdqa XTMP4, XW_W1_ADDR((round), 0); \
+ /* W1 ^ W2 => XTMP1 */ \
+ vpxor w0, XTMP4, XTMP1; \
+ vmovdqa XTMP1, XW_W1W2_ADDR((round), 0);
+
+
+.section .rodata.cst16, "aM", @progbits, 16
+.align 16
+
+.Lbe32mask:
+ .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f
+
+.text
+
+/*
+ * Transform nblocks*64 bytes (nblocks*16 32-bit words) at DATA.
+ *
+ * void sm3_transform_avx(struct sm3_state *state,
+ * const u8 *data, int nblocks);
+ */
+SYM_TYPED_FUNC_START(sm3_transform_avx)
+ /* input:
+ * %rdi: state, RSTATE
+ * %rsi: data (64*nblks bytes)
+ * %rdx: nblocks
+ */
+ vzeroupper;
+
+ pushq %rbp;
+ movq %rsp, %rbp;
+
+ movq %rdx, RNBLKS;
+
+ subq $STACK_SIZE, %rsp;
+ andq $(~63), %rsp;
+
+ movq %rbx, (STACK_REG_SAVE + 0 * 8)(%rsp);
+ movq %r15, (STACK_REG_SAVE + 1 * 8)(%rsp);
+ movq %r14, (STACK_REG_SAVE + 2 * 8)(%rsp);
+ movq %r13, (STACK_REG_SAVE + 3 * 8)(%rsp);
+ movq %r12, (STACK_REG_SAVE + 4 * 8)(%rsp);
+
+ vmovdqa .Lbe32mask (%rip), BSWAP_REG;
+
+ /* Get the values of the chaining variables. */
+ movl state_h0(RSTATE), a;
+ movl state_h1(RSTATE), b;
+ movl state_h2(RSTATE), c;
+ movl state_h3(RSTATE), d;
+ movl state_h4(RSTATE), e;
+ movl state_h5(RSTATE), f;
+ movl state_h6(RSTATE), g;
+ movl state_h7(RSTATE), h;
+
+.align 16
+.Loop:
+ /* Load data part1. */
+ LOAD_W_XMM_1();
+
+ leaq -1(RNBLKS), RNBLKS;
+
+ /* Transform 0-3 + Load data part2. */
+ R1(a, b, c, d, e, f, g, h, 0, 0, IW); LOAD_W_XMM_2();
+ R1(d, a, b, c, h, e, f, g, 1, 1, IW);
+ R1(c, d, a, b, g, h, e, f, 2, 2, IW);
+ R1(b, c, d, a, f, g, h, e, 3, 3, IW); LOAD_W_XMM_3();
+
+ /* Transform 4-7 + Precalc 12-14. */
+ R1(a, b, c, d, e, f, g, h, 4, 0, IW);
+ R1(d, a, b, c, h, e, f, g, 5, 1, IW);
+ R1(c, d, a, b, g, h, e, f, 6, 2, IW); SCHED_W_0(12, W0, W1, W2, W3, W4, W5);
+ R1(b, c, d, a, f, g, h, e, 7, 3, IW); SCHED_W_1(12, W0, W1, W2, W3, W4, W5);
+
+ /* Transform 8-11 + Precalc 12-17. */
+ R1(a, b, c, d, e, f, g, h, 8, 0, IW); SCHED_W_2(12, W0, W1, W2, W3, W4, W5);
+ R1(d, a, b, c, h, e, f, g, 9, 1, IW); SCHED_W_0(15, W1, W2, W3, W4, W5, W0);
+ R1(c, d, a, b, g, h, e, f, 10, 2, IW); SCHED_W_1(15, W1, W2, W3, W4, W5, W0);
+ R1(b, c, d, a, f, g, h, e, 11, 3, IW); SCHED_W_2(15, W1, W2, W3, W4, W5, W0);
+
+ /* Transform 12-14 + Precalc 18-20 */
+ R1(a, b, c, d, e, f, g, h, 12, 0, XW); SCHED_W_0(18, W2, W3, W4, W5, W0, W1);
+ R1(d, a, b, c, h, e, f, g, 13, 1, XW); SCHED_W_1(18, W2, W3, W4, W5, W0, W1);
+ R1(c, d, a, b, g, h, e, f, 14, 2, XW); SCHED_W_2(18, W2, W3, W4, W5, W0, W1);
+
+ /* Transform 15-17 + Precalc 21-23 */
+ R1(b, c, d, a, f, g, h, e, 15, 0, XW); SCHED_W_0(21, W3, W4, W5, W0, W1, W2);
+ R2(a, b, c, d, e, f, g, h, 16, 1, XW); SCHED_W_1(21, W3, W4, W5, W0, W1, W2);
+ R2(d, a, b, c, h, e, f, g, 17, 2, XW); SCHED_W_2(21, W3, W4, W5, W0, W1, W2);
+
+ /* Transform 18-20 + Precalc 24-26 */
+ R2(c, d, a, b, g, h, e, f, 18, 0, XW); SCHED_W_0(24, W4, W5, W0, W1, W2, W3);
+ R2(b, c, d, a, f, g, h, e, 19, 1, XW); SCHED_W_1(24, W4, W5, W0, W1, W2, W3);
+ R2(a, b, c, d, e, f, g, h, 20, 2, XW); SCHED_W_2(24, W4, W5, W0, W1, W2, W3);
+
+ /* Transform 21-23 + Precalc 27-29 */
+ R2(d, a, b, c, h, e, f, g, 21, 0, XW); SCHED_W_0(27, W5, W0, W1, W2, W3, W4);
+ R2(c, d, a, b, g, h, e, f, 22, 1, XW); SCHED_W_1(27, W5, W0, W1, W2, W3, W4);
+ R2(b, c, d, a, f, g, h, e, 23, 2, XW); SCHED_W_2(27, W5, W0, W1, W2, W3, W4);
+
+ /* Transform 24-26 + Precalc 30-32 */
+ R2(a, b, c, d, e, f, g, h, 24, 0, XW); SCHED_W_0(30, W0, W1, W2, W3, W4, W5);
+ R2(d, a, b, c, h, e, f, g, 25, 1, XW); SCHED_W_1(30, W0, W1, W2, W3, W4, W5);
+ R2(c, d, a, b, g, h, e, f, 26, 2, XW); SCHED_W_2(30, W0, W1, W2, W3, W4, W5);
+
+ /* Transform 27-29 + Precalc 33-35 */
+ R2(b, c, d, a, f, g, h, e, 27, 0, XW); SCHED_W_0(33, W1, W2, W3, W4, W5, W0);
+ R2(a, b, c, d, e, f, g, h, 28, 1, XW); SCHED_W_1(33, W1, W2, W3, W4, W5, W0);
+ R2(d, a, b, c, h, e, f, g, 29, 2, XW); SCHED_W_2(33, W1, W2, W3, W4, W5, W0);
+
+ /* Transform 30-32 + Precalc 36-38 */
+ R2(c, d, a, b, g, h, e, f, 30, 0, XW); SCHED_W_0(36, W2, W3, W4, W5, W0, W1);
+ R2(b, c, d, a, f, g, h, e, 31, 1, XW); SCHED_W_1(36, W2, W3, W4, W5, W0, W1);
+ R2(a, b, c, d, e, f, g, h, 32, 2, XW); SCHED_W_2(36, W2, W3, W4, W5, W0, W1);
+
+ /* Transform 33-35 + Precalc 39-41 */
+ R2(d, a, b, c, h, e, f, g, 33, 0, XW); SCHED_W_0(39, W3, W4, W5, W0, W1, W2);
+ R2(c, d, a, b, g, h, e, f, 34, 1, XW); SCHED_W_1(39, W3, W4, W5, W0, W1, W2);
+ R2(b, c, d, a, f, g, h, e, 35, 2, XW); SCHED_W_2(39, W3, W4, W5, W0, W1, W2);
+
+ /* Transform 36-38 + Precalc 42-44 */
+ R2(a, b, c, d, e, f, g, h, 36, 0, XW); SCHED_W_0(42, W4, W5, W0, W1, W2, W3);
+ R2(d, a, b, c, h, e, f, g, 37, 1, XW); SCHED_W_1(42, W4, W5, W0, W1, W2, W3);
+ R2(c, d, a, b, g, h, e, f, 38, 2, XW); SCHED_W_2(42, W4, W5, W0, W1, W2, W3);
+
+ /* Transform 39-41 + Precalc 45-47 */
+ R2(b, c, d, a, f, g, h, e, 39, 0, XW); SCHED_W_0(45, W5, W0, W1, W2, W3, W4);
+ R2(a, b, c, d, e, f, g, h, 40, 1, XW); SCHED_W_1(45, W5, W0, W1, W2, W3, W4);
+ R2(d, a, b, c, h, e, f, g, 41, 2, XW); SCHED_W_2(45, W5, W0, W1, W2, W3, W4);
+
+ /* Transform 42-44 + Precalc 48-50 */
+ R2(c, d, a, b, g, h, e, f, 42, 0, XW); SCHED_W_0(48, W0, W1, W2, W3, W4, W5);
+ R2(b, c, d, a, f, g, h, e, 43, 1, XW); SCHED_W_1(48, W0, W1, W2, W3, W4, W5);
+ R2(a, b, c, d, e, f, g, h, 44, 2, XW); SCHED_W_2(48, W0, W1, W2, W3, W4, W5);
+
+ /* Transform 45-47 + Precalc 51-53 */
+ R2(d, a, b, c, h, e, f, g, 45, 0, XW); SCHED_W_0(51, W1, W2, W3, W4, W5, W0);
+ R2(c, d, a, b, g, h, e, f, 46, 1, XW); SCHED_W_1(51, W1, W2, W3, W4, W5, W0);
+ R2(b, c, d, a, f, g, h, e, 47, 2, XW); SCHED_W_2(51, W1, W2, W3, W4, W5, W0);
+
+ /* Transform 48-50 + Precalc 54-56 */
+ R2(a, b, c, d, e, f, g, h, 48, 0, XW); SCHED_W_0(54, W2, W3, W4, W5, W0, W1);
+ R2(d, a, b, c, h, e, f, g, 49, 1, XW); SCHED_W_1(54, W2, W3, W4, W5, W0, W1);
+ R2(c, d, a, b, g, h, e, f, 50, 2, XW); SCHED_W_2(54, W2, W3, W4, W5, W0, W1);
+
+ /* Transform 51-53 + Precalc 57-59 */
+ R2(b, c, d, a, f, g, h, e, 51, 0, XW); SCHED_W_0(57, W3, W4, W5, W0, W1, W2);
+ R2(a, b, c, d, e, f, g, h, 52, 1, XW); SCHED_W_1(57, W3, W4, W5, W0, W1, W2);
+ R2(d, a, b, c, h, e, f, g, 53, 2, XW); SCHED_W_2(57, W3, W4, W5, W0, W1, W2);
+
+ /* Transform 54-56 + Precalc 60-62 */
+ R2(c, d, a, b, g, h, e, f, 54, 0, XW); SCHED_W_0(60, W4, W5, W0, W1, W2, W3);
+ R2(b, c, d, a, f, g, h, e, 55, 1, XW); SCHED_W_1(60, W4, W5, W0, W1, W2, W3);
+ R2(a, b, c, d, e, f, g, h, 56, 2, XW); SCHED_W_2(60, W4, W5, W0, W1, W2, W3);
+
+ /* Transform 57-59 + Precalc 63 */
+ R2(d, a, b, c, h, e, f, g, 57, 0, XW); SCHED_W_0(63, W5, W0, W1, W2, W3, W4);
+ R2(c, d, a, b, g, h, e, f, 58, 1, XW);
+ R2(b, c, d, a, f, g, h, e, 59, 2, XW); SCHED_W_1(63, W5, W0, W1, W2, W3, W4);
+
+ /* Transform 60-62 + Precalc 63 */
+ R2(a, b, c, d, e, f, g, h, 60, 0, XW);
+ R2(d, a, b, c, h, e, f, g, 61, 1, XW); SCHED_W_2(63, W5, W0, W1, W2, W3, W4);
+ R2(c, d, a, b, g, h, e, f, 62, 2, XW);
+
+ /* Transform 63 */
+ R2(b, c, d, a, f, g, h, e, 63, 0, XW);
+
+ /* Update the chaining variables. */
+ xorl state_h0(RSTATE), a;
+ xorl state_h1(RSTATE), b;
+ xorl state_h2(RSTATE), c;
+ xorl state_h3(RSTATE), d;
+ movl a, state_h0(RSTATE);
+ movl b, state_h1(RSTATE);
+ movl c, state_h2(RSTATE);
+ movl d, state_h3(RSTATE);
+ xorl state_h4(RSTATE), e;
+ xorl state_h5(RSTATE), f;
+ xorl state_h6(RSTATE), g;
+ xorl state_h7(RSTATE), h;
+ movl e, state_h4(RSTATE);
+ movl f, state_h5(RSTATE);
+ movl g, state_h6(RSTATE);
+ movl h, state_h7(RSTATE);
+
+ cmpq $0, RNBLKS;
+ jne .Loop;
+
+ vzeroall;
+
+ movq (STACK_REG_SAVE + 0 * 8)(%rsp), %rbx;
+ movq (STACK_REG_SAVE + 1 * 8)(%rsp), %r15;
+ movq (STACK_REG_SAVE + 2 * 8)(%rsp), %r14;
+ movq (STACK_REG_SAVE + 3 * 8)(%rsp), %r13;
+ movq (STACK_REG_SAVE + 4 * 8)(%rsp), %r12;
+
+ vmovdqa %xmm0, IW_W1_ADDR(0, 0);
+ vmovdqa %xmm0, IW_W1W2_ADDR(0, 0);
+ vmovdqa %xmm0, IW_W1_ADDR(4, 0);
+ vmovdqa %xmm0, IW_W1W2_ADDR(4, 0);
+ vmovdqa %xmm0, IW_W1_ADDR(8, 0);
+ vmovdqa %xmm0, IW_W1W2_ADDR(8, 0);
+
+ movq %rbp, %rsp;
+ popq %rbp;
+ RET;
+SYM_FUNC_END(sm3_transform_avx)
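
For orientation, the R1/R2 macros above are the SM3 compression round and the
SCHED_W_* macros are the message expansion, computed three words at a time
per XMM register. In plain C, one round and one expanded word look like the
reference sketch below. This restates the published SM3 algorithm, not this
file's register scheduling; rol32 and the helper names are ours. Note that
the table of K#round constants above is rol32(T_j, j % 32) precomputed, which
is why K48..K63 repeat K16..K31.

	#include <stdint.h>

	static inline uint32_t rol32(uint32_t x, int n)
	{
		return (x << n) | (x >> (32 - n));
	}

	static inline uint32_t p0(uint32_t x)	/* P0, final mix of TT2 */
	{
		return x ^ rol32(x, 9) ^ rol32(x, 17);
	}

	static inline uint32_t p1(uint32_t x)	/* P1, used in expansion */
	{
		return x ^ rol32(x, 15) ^ rol32(x, 23);
	}

	/* Message expansion: W[16..67] from the 16 input words. */
	static uint32_t sm3_w(const uint32_t *w, int i)
	{
		return p1(w[i - 16] ^ w[i - 9] ^ rol32(w[i - 3], 15)) ^
		       rol32(w[i - 13], 7) ^ w[i - 6];
	}

	/* One round j; w1 = W[j], w1w2 = W[j] ^ W[j + 4], k = rol32(T_j, j % 32).
	 * FF/GG are x^y^z for j < 16, else majority(x,y,z) and (x&y)|(~x&z).
	 * The GG2 macro above computes the latter as (x&y) + (~x&z), which is
	 * equivalent because the two terms are bitwise disjoint. */
	static void sm3_round(uint32_t v[8], int j, uint32_t w1, uint32_t w1w2,
			      uint32_t k)
	{
		uint32_t a = v[0], b = v[1], c = v[2], d = v[3];
		uint32_t e = v[4], f = v[5], g = v[6], h = v[7];
		uint32_t ss1 = rol32(rol32(a, 12) + e + k, 7);
		uint32_t ss2 = ss1 ^ rol32(a, 12);
		uint32_t ff = j < 16 ? a ^ b ^ c : (a & b) | (a & c) | (b & c);
		uint32_t gg = j < 16 ? e ^ f ^ g : (e & f) | (~e & g);
		uint32_t tt1 = ff + d + ss2 + w1w2;
		uint32_t tt2 = gg + h + ss1 + w1;

		v[0] = tt1;		v[1] = a;
		v[2] = rol32(b, 9);	v[3] = c;
		v[4] = p0(tt2);		v[5] = e;
		v[6] = rol32(f, 19);	v[7] = g;
	}
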
diff --git a/arch/x86/crypto/sm3_avx_glue.c b/arch/x86/crypto/sm3_avx_glue.c
new file mode 100644
index 000000000000..6e8c42b9dc8e
--- /dev/null
+++ b/arch/x86/crypto/sm3_avx_glue.c
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * SM3 Secure Hash Algorithm, AVX assembler accelerated.
+ * as specified in: https://datatracker.ietf.org/doc/html/draft-sca-cfrg-sm3-02
+ *
+ * Copyright (C) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+#include <crypto/sm3.h>
+#include <crypto/sm3_base.h>
+#include <linux/cpufeature.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+asmlinkage void sm3_transform_avx(struct sm3_state *state,
+ const u8 *data, int nblocks);
+
+static int sm3_avx_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ int remain;
+
+ /*
+ * Make sure struct sm3_state begins directly with the SM3
+ * 256-bit internal state, as this is what the asm functions expect.
+ */
+ BUILD_BUG_ON(offsetof(struct sm3_state, state) != 0);
+
+ kernel_fpu_begin();
+ remain = sm3_base_do_update_blocks(desc, data, len, sm3_transform_avx);
+ kernel_fpu_end();
+ return remain;
+}
+
+static int sm3_avx_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ kernel_fpu_begin();
+ sm3_base_do_finup(desc, data, len, sm3_transform_avx);
+ kernel_fpu_end();
+ return sm3_base_finish(desc, out);
+}
+
+static struct shash_alg sm3_avx_alg = {
+ .digestsize = SM3_DIGEST_SIZE,
+ .init = sm3_base_init,
+ .update = sm3_avx_update,
+ .finup = sm3_avx_finup,
+ .descsize = SM3_STATE_SIZE,
+ .base = {
+ .cra_name = "sm3",
+ .cra_driver_name = "sm3-avx",
+ .cra_priority = 300,
+ .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
+ CRYPTO_AHASH_ALG_FINUP_MAX,
+ .cra_blocksize = SM3_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+};
+
+static int __init sm3_avx_mod_init(void)
+{
+ const char *feature_name;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX)) {
+ pr_info("AVX instructions are not detected.\n");
+ return -ENODEV;
+ }
+
+ if (!boot_cpu_has(X86_FEATURE_BMI2)) {
+ pr_info("BMI2 instructions are not detected.\n");
+ return -ENODEV;
+ }
+
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ &feature_name)) {
+ pr_info("CPU feature '%s' is not supported.\n", feature_name);
+ return -ENODEV;
+ }
+
+ return crypto_register_shash(&sm3_avx_alg);
+}
+
+static void __exit sm3_avx_mod_exit(void)
+{
+ crypto_unregister_shash(&sm3_avx_alg);
+}
+
+module_init(sm3_avx_mod_init);
+module_exit(sm3_avx_mod_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Tianjia Zhang <tianjia.zhang@linux.alibaba.com>");
+MODULE_DESCRIPTION("SM3 Secure Hash Algorithm, AVX assembler accelerated");
+MODULE_ALIAS_CRYPTO("sm3");
+MODULE_ALIAS_CRYPTO("sm3-avx");
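
Once registered, the driver is reached through the normal shash API. A
minimal in-kernel caller might look like the sketch below (error handling
trimmed to the essentials; whether "sm3-avx" or another implementation ends
up backing the "sm3" name is decided by cra_priority):

	#include <crypto/hash.h>
	#include <crypto/sm3.h>
	#include <linux/err.h>

	static int sm3_digest_example(const u8 *data, unsigned int len,
				      u8 out[SM3_DIGEST_SIZE])
	{
		struct crypto_shash *tfm;
		int err;

		tfm = crypto_alloc_shash("sm3", 0, 0);
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		{
			SHASH_DESC_ON_STACK(desc, tfm);

			desc->tfm = tfm;
			err = crypto_shash_digest(desc, data, len, out);
		}

		crypto_free_shash(tfm);
		return err;
	}
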
diff --git a/arch/x86/crypto/sm4-aesni-avx-asm_64.S b/arch/x86/crypto/sm4-aesni-avx-asm_64.S
new file mode 100644
index 000000000000..2bf611eaa191
--- /dev/null
+++ b/arch/x86/crypto/sm4-aesni-avx-asm_64.S
@@ -0,0 +1,536 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SM4 Cipher Algorithm, AES-NI/AVX optimized.
+ * as specified in
+ * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
+ *
+ * Copyright (C) 2018 Markku-Juhani O. Saarinen <mjos@iki.fi>
+ * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+
+/* Based on SM4 AES-NI work by libgcrypt and Markku-Juhani O. Saarinen at:
+ * https://github.com/mjosaarinen/sm4ni
+ */
+
+#include <linux/linkage.h>
+#include <linux/cfi_types.h>
+#include <asm/frame.h>
+
+#define rRIP (%rip)
+
+#define RX0 %xmm0
+#define RX1 %xmm1
+#define MASK_4BIT %xmm2
+#define RTMP0 %xmm3
+#define RTMP1 %xmm4
+#define RTMP2 %xmm5
+#define RTMP3 %xmm6
+#define RTMP4 %xmm7
+
+#define RA0 %xmm8
+#define RA1 %xmm9
+#define RA2 %xmm10
+#define RA3 %xmm11
+
+#define RB0 %xmm12
+#define RB1 %xmm13
+#define RB2 %xmm14
+#define RB3 %xmm15
+
+#define RNOT %xmm0
+#define RBSWAP %xmm1
+
+
+/* Transpose four 32-bit words between 128-bit vectors. */
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x1, x0, x0; \
+ \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x2; \
+ \
+ vpunpckhqdq t1, x0, x1; \
+ vpunpcklqdq t1, x0, x0; \
+ \
+ vpunpckhqdq x2, t2, x3; \
+ vpunpcklqdq x2, t2, x2;
+
+/* pre-SubByte transform. */
+#define transform_pre(x, lo_t, hi_t, mask4bit, tmp0) \
+ vpand x, mask4bit, tmp0; \
+ vpandn x, mask4bit, x; \
+ vpsrld $4, x, x; \
+ \
+ vpshufb tmp0, lo_t, tmp0; \
+ vpshufb x, hi_t, x; \
+ vpxor tmp0, x, x;
+
+/* post-SubByte transform. Note: x has been XOR'ed with mask4bit by
+ * 'vaesenclast' instruction.
+ */
+#define transform_post(x, lo_t, hi_t, mask4bit, tmp0) \
+ vpandn mask4bit, x, tmp0; \
+ vpsrld $4, x, x; \
+ vpand x, mask4bit, x; \
+ \
+ vpshufb tmp0, lo_t, tmp0; \
+ vpshufb x, hi_t, x; \
+ vpxor tmp0, x, x;
+
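+/*
+ * transform_pre()/transform_post() evaluate an 8-bit affine transform as
+ * two 4-bit lookups: the byte is split into its low and high nibble, each
+ * nibble indexes a 16-entry vpshufb table (lo_t/hi_t), and the two results
+ * are XORed.  Together with vaesenclast this is what lets the AES S-box
+ * core be reused for the SM4 S-box, via the .Lpre_tf_*/.Lpost_tf_* tables
+ * below.
+ */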
+
+.section .rodata.cst16, "aM", @progbits, 16
+.align 16
+
+/*
+ * Following four affine transform look-up tables are from work by
+ * Markku-Juhani O. Saarinen, at https://github.com/mjosaarinen/sm4ni
+ *
+ * These allow exposing SM4 S-Box from AES SubByte.
+ */
+
+/* pre-SubByte affine transform, from SM4 field to AES field. */
+.Lpre_tf_lo_s:
+ .quad 0x9197E2E474720701, 0xC7C1B4B222245157
+.Lpre_tf_hi_s:
+ .quad 0xE240AB09EB49A200, 0xF052B91BF95BB012
+
+/* post-SubByte affine transform, from AES field to SM4 field. */
+.Lpost_tf_lo_s:
+ .quad 0x5B67F2CEA19D0834, 0xEDD14478172BBE82
+.Lpost_tf_hi_s:
+ .quad 0xAE7201DD73AFDC00, 0x11CDBE62CC1063BF
+
+/* For isolating SubBytes from AESENCLAST, inverse shift row */
+.Linv_shift_row:
+ .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
+ .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
+
+/* Inverse shift row + Rotate left by 8 bits on 32-bit words with vpshufb */
+.Linv_shift_row_rol_8:
+ .byte 0x07, 0x00, 0x0d, 0x0a, 0x0b, 0x04, 0x01, 0x0e
+ .byte 0x0f, 0x08, 0x05, 0x02, 0x03, 0x0c, 0x09, 0x06
+
+/* Inverse shift row + Rotate left by 16 bits on 32-bit words with vpshufb */
+.Linv_shift_row_rol_16:
+ .byte 0x0a, 0x07, 0x00, 0x0d, 0x0e, 0x0b, 0x04, 0x01
+ .byte 0x02, 0x0f, 0x08, 0x05, 0x06, 0x03, 0x0c, 0x09
+
+/* Inverse shift row + Rotate left by 24 bits on 32-bit words with vpshufb */
+.Linv_shift_row_rol_24:
+ .byte 0x0d, 0x0a, 0x07, 0x00, 0x01, 0x0e, 0x0b, 0x04
+ .byte 0x05, 0x02, 0x0f, 0x08, 0x09, 0x06, 0x03, 0x0c
+
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+/* For input word byte-swap */
+.Lbswap32_mask:
+ .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+
+.align 4
+/* 4-bit mask */
+.L0f0f0f0f:
+ .long 0x0f0f0f0f
+
+/* 12 bytes, only for padding */
+.Lpadding_deadbeef:
+ .long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef
+
+
+.text
+
+/*
+ * void sm4_aesni_avx_crypt4(const u32 *rk, u8 *dst,
+ * const u8 *src, int nblocks)
+ */
+SYM_FUNC_START(sm4_aesni_avx_crypt4)
+ /* input:
+ * %rdi: round key array, CTX
+ * %rsi: dst (1..4 blocks)
+ * %rdx: src (1..4 blocks)
+ * %rcx: num blocks (1..4)
+ */
+ FRAME_BEGIN
+
+ vmovdqu 0*16(%rdx), RA0;
+ vmovdqa RA0, RA1;
+ vmovdqa RA0, RA2;
+ vmovdqa RA0, RA3;
+ cmpq $2, %rcx;
+ jb .Lblk4_load_input_done;
+ vmovdqu 1*16(%rdx), RA1;
+ je .Lblk4_load_input_done;
+ vmovdqu 2*16(%rdx), RA2;
+ cmpq $3, %rcx;
+ je .Lblk4_load_input_done;
+ vmovdqu 3*16(%rdx), RA3;
+
+.Lblk4_load_input_done:
+
+ vmovdqa .Lbswap32_mask rRIP, RTMP2;
+ vpshufb RTMP2, RA0, RA0;
+ vpshufb RTMP2, RA1, RA1;
+ vpshufb RTMP2, RA2, RA2;
+ vpshufb RTMP2, RA3, RA3;
+
+ vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT;
+ vmovdqa .Lpre_tf_lo_s rRIP, RTMP4;
+ vmovdqa .Lpre_tf_hi_s rRIP, RB0;
+ vmovdqa .Lpost_tf_lo_s rRIP, RB1;
+ vmovdqa .Lpost_tf_hi_s rRIP, RB2;
+ vmovdqa .Linv_shift_row rRIP, RB3;
+ vmovdqa .Linv_shift_row_rol_8 rRIP, RTMP2;
+ vmovdqa .Linv_shift_row_rol_16 rRIP, RTMP3;
+ transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
+
+#define ROUND(round, s0, s1, s2, s3) \
+ vbroadcastss (4*(round))(%rdi), RX0; \
+ vpxor s1, RX0, RX0; \
+ vpxor s2, RX0, RX0; \
+ vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */ \
+ \
+ /* sbox, non-linear part */ \
+ transform_pre(RX0, RTMP4, RB0, MASK_4BIT, RTMP0); \
+ vaesenclast MASK_4BIT, RX0, RX0; \
+ transform_post(RX0, RB1, RB2, MASK_4BIT, RTMP0); \
+ \
+ /* linear part */ \
+ vpshufb RB3, RX0, RTMP0; \
+ vpxor RTMP0, s0, s0; /* s0 ^ x */ \
+ vpshufb RTMP2, RX0, RTMP1; \
+ vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */ \
+ vpshufb RTMP3, RX0, RTMP1; \
+ vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \
+ vpshufb .Linv_shift_row_rol_24 rRIP, RX0, RTMP1; \
+ vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */ \
+ vpslld $2, RTMP0, RTMP1; \
+ vpsrld $30, RTMP0, RTMP0; \
+ vpxor RTMP0, s0, s0; \
+ /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
+ vpxor RTMP1, s0, s0;
+
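+/*
+ * In the linear part above, the byte-aligned rotations rol(x,8), rol(x,16)
+ * and rol(x,24) are vpshufb byte permutations; rotating the accumulated
+ * x ^ rol(x,8) ^ rol(x,16) left by two (the vpslld/vpsrld pair) then
+ * supplies rol(x,2) ^ rol(x,10) ^ rol(x,18), completing the SM4 diffusion
+ * L(x) = x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24).
+ */
+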
+ leaq (32*4)(%rdi), %rax;
+.align 16
+.Lroundloop_blk4:
+ ROUND(0, RA0, RA1, RA2, RA3);
+ ROUND(1, RA1, RA2, RA3, RA0);
+ ROUND(2, RA2, RA3, RA0, RA1);
+ ROUND(3, RA3, RA0, RA1, RA2);
+ leaq (4*4)(%rdi), %rdi;
+ cmpq %rax, %rdi;
+ jne .Lroundloop_blk4;
+
+#undef ROUND
+
+ vmovdqa .Lbswap128_mask rRIP, RTMP2;
+
+ transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
+ vpshufb RTMP2, RA0, RA0;
+ vpshufb RTMP2, RA1, RA1;
+ vpshufb RTMP2, RA2, RA2;
+ vpshufb RTMP2, RA3, RA3;
+
+ vmovdqu RA0, 0*16(%rsi);
+ cmpq $2, %rcx;
+ jb .Lblk4_store_output_done;
+ vmovdqu RA1, 1*16(%rsi);
+ je .Lblk4_store_output_done;
+ vmovdqu RA2, 2*16(%rsi);
+ cmpq $3, %rcx;
+ je .Lblk4_store_output_done;
+ vmovdqu RA3, 3*16(%rsi);
+
+.Lblk4_store_output_done:
+ vzeroall;
+ FRAME_END
+ RET;
+SYM_FUNC_END(sm4_aesni_avx_crypt4)
+
+SYM_FUNC_START_LOCAL(__sm4_crypt_blk8)
+ /* input:
+ * %rdi: round key array, CTX
+ * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
+ * plaintext blocks
+ * output:
+ * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
+ * ciphertext blocks
+ */
+ FRAME_BEGIN
+
+ vmovdqa .Lbswap32_mask rRIP, RTMP2;
+ vpshufb RTMP2, RA0, RA0;
+ vpshufb RTMP2, RA1, RA1;
+ vpshufb RTMP2, RA2, RA2;
+ vpshufb RTMP2, RA3, RA3;
+ vpshufb RTMP2, RB0, RB0;
+ vpshufb RTMP2, RB1, RB1;
+ vpshufb RTMP2, RB2, RB2;
+ vpshufb RTMP2, RB3, RB3;
+
+ vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT;
+ transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
+ transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
+
+#define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3) \
+ vbroadcastss (4*(round))(%rdi), RX0; \
+ vmovdqa .Lpre_tf_lo_s rRIP, RTMP4; \
+ vmovdqa .Lpre_tf_hi_s rRIP, RTMP1; \
+ vmovdqa RX0, RX1; \
+ vpxor s1, RX0, RX0; \
+ vpxor s2, RX0, RX0; \
+ vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */ \
+ vmovdqa .Lpost_tf_lo_s rRIP, RTMP2; \
+ vmovdqa .Lpost_tf_hi_s rRIP, RTMP3; \
+ vpxor r1, RX1, RX1; \
+ vpxor r2, RX1, RX1; \
+ vpxor r3, RX1, RX1; /* r1 ^ r2 ^ r3 ^ rk */ \
+ \
+ /* sbox, non-linear part */ \
+ transform_pre(RX0, RTMP4, RTMP1, MASK_4BIT, RTMP0); \
+ transform_pre(RX1, RTMP4, RTMP1, MASK_4BIT, RTMP0); \
+ vmovdqa .Linv_shift_row rRIP, RTMP4; \
+ vaesenclast MASK_4BIT, RX0, RX0; \
+ vaesenclast MASK_4BIT, RX1, RX1; \
+ transform_post(RX0, RTMP2, RTMP3, MASK_4BIT, RTMP0); \
+ transform_post(RX1, RTMP2, RTMP3, MASK_4BIT, RTMP0); \
+ \
+ /* linear part */ \
+ vpshufb RTMP4, RX0, RTMP0; \
+ vpxor RTMP0, s0, s0; /* s0 ^ x */ \
+ vpshufb RTMP4, RX1, RTMP2; \
+ vmovdqa .Linv_shift_row_rol_8 rRIP, RTMP4; \
+ vpxor RTMP2, r0, r0; /* r0 ^ x */ \
+ vpshufb RTMP4, RX0, RTMP1; \
+ vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */ \
+ vpshufb RTMP4, RX1, RTMP3; \
+ vmovdqa .Linv_shift_row_rol_16 rRIP, RTMP4; \
+ vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) */ \
+ vpshufb RTMP4, RX0, RTMP1; \
+ vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \
+ vpshufb RTMP4, RX1, RTMP3; \
+ vmovdqa .Linv_shift_row_rol_24 rRIP, RTMP4; \
+ vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) ^ rol(x,16) */ \
+ vpshufb RTMP4, RX0, RTMP1; \
+ vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */ \
+ /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
+ vpslld $2, RTMP0, RTMP1; \
+ vpsrld $30, RTMP0, RTMP0; \
+ vpxor RTMP0, s0, s0; \
+ vpxor RTMP1, s0, s0; \
+ vpshufb RTMP4, RX1, RTMP3; \
+ vpxor RTMP3, r0, r0; /* r0 ^ x ^ rol(x,24) */ \
+ /* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
+ vpslld $2, RTMP2, RTMP3; \
+ vpsrld $30, RTMP2, RTMP2; \
+ vpxor RTMP2, r0, r0; \
+ vpxor RTMP3, r0, r0;
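+
+/*
+ * The "linear part" above evaluates SM4's L transform,
+ *
+ *   L(x) = x ^ rol32(x, 2) ^ rol32(x, 10) ^ rol32(x, 18) ^ rol32(x, 24),
+ *
+ * as rol32(x ^ rol32(x, 8) ^ rol32(x, 16), 2) ^ x ^ rol32(x, 24): the
+ * byte-granular rotations come for free from vpshufb, so only the final
+ * rotate-by-2 needs the vpslld/vpsrld pair.
+ */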
+
+ leaq (32*4)(%rdi), %rax;
+.align 16
+.Lroundloop_blk8:
+ ROUND(0, RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3);
+ ROUND(1, RA1, RA2, RA3, RA0, RB1, RB2, RB3, RB0);
+ ROUND(2, RA2, RA3, RA0, RA1, RB2, RB3, RB0, RB1);
+ ROUND(3, RA3, RA0, RA1, RA2, RB3, RB0, RB1, RB2);
+ leaq (4*4)(%rdi), %rdi;
+ cmpq %rax, %rdi;
+ jne .Lroundloop_blk8;
+
+#undef ROUND
+
+ vmovdqa .Lbswap128_mask rRIP, RTMP2;
+
+ transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
+ transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
+ vpshufb RTMP2, RA0, RA0;
+ vpshufb RTMP2, RA1, RA1;
+ vpshufb RTMP2, RA2, RA2;
+ vpshufb RTMP2, RA3, RA3;
+ vpshufb RTMP2, RB0, RB0;
+ vpshufb RTMP2, RB1, RB1;
+ vpshufb RTMP2, RB2, RB2;
+ vpshufb RTMP2, RB3, RB3;
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(__sm4_crypt_blk8)
+
+/*
+ * void sm4_aesni_avx_crypt8(const u32 *rk, u8 *dst,
+ * const u8 *src, int nblocks)
+ */
+SYM_FUNC_START(sm4_aesni_avx_crypt8)
+ /* input:
+ * %rdi: round key array, CTX
+ * %rsi: dst (1..8 blocks)
+ * %rdx: src (1..8 blocks)
+ * %rcx: num blocks (1..8)
+ */
+ cmpq $5, %rcx;
+ jb sm4_aesni_avx_crypt4;
+
+ FRAME_BEGIN
+
+ vmovdqu (0 * 16)(%rdx), RA0;
+ vmovdqu (1 * 16)(%rdx), RA1;
+ vmovdqu (2 * 16)(%rdx), RA2;
+ vmovdqu (3 * 16)(%rdx), RA3;
+ vmovdqu (4 * 16)(%rdx), RB0;
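+	/*
+	 * At least 5 blocks are present here; RB1..RB3 are pre-filled with
+	 * copies of block 5 so that __sm4_crypt_blk8 only ever operates on
+	 * initialized registers when fewer than 8 blocks were supplied.
+	 */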
+ vmovdqa RB0, RB1;
+ vmovdqa RB0, RB2;
+ vmovdqa RB0, RB3;
+ je .Lblk8_load_input_done;
+ vmovdqu (5 * 16)(%rdx), RB1;
+ cmpq $7, %rcx;
+ jb .Lblk8_load_input_done;
+ vmovdqu (6 * 16)(%rdx), RB2;
+ je .Lblk8_load_input_done;
+ vmovdqu (7 * 16)(%rdx), RB3;
+
+.Lblk8_load_input_done:
+ call __sm4_crypt_blk8;
+
+ cmpq $6, %rcx;
+ vmovdqu RA0, (0 * 16)(%rsi);
+ vmovdqu RA1, (1 * 16)(%rsi);
+ vmovdqu RA2, (2 * 16)(%rsi);
+ vmovdqu RA3, (3 * 16)(%rsi);
+ vmovdqu RB0, (4 * 16)(%rsi);
+ jb .Lblk8_store_output_done;
+ vmovdqu RB1, (5 * 16)(%rsi);
+ je .Lblk8_store_output_done;
+ vmovdqu RB2, (6 * 16)(%rsi);
+ cmpq $7, %rcx;
+ je .Lblk8_store_output_done;
+ vmovdqu RB3, (7 * 16)(%rsi);
+
+.Lblk8_store_output_done:
+ vzeroall;
+ FRAME_END
+ RET;
+SYM_FUNC_END(sm4_aesni_avx_crypt8)
+
+/*
+ * void sm4_aesni_avx_ctr_enc_blk8(const u32 *rk, u8 *dst,
+ * const u8 *src, u8 *iv)
+ */
+SYM_TYPED_FUNC_START(sm4_aesni_avx_ctr_enc_blk8)
+ /* input:
+ * %rdi: round key array, CTX
+ * %rsi: dst (8 blocks)
+ * %rdx: src (8 blocks)
+ * %rcx: iv (big endian, 128bit)
+ */
+ FRAME_BEGIN
+
+ /* load IV and byteswap */
+ vmovdqu (%rcx), RA0;
+
+ vmovdqa .Lbswap128_mask rRIP, RBSWAP;
+ vpshufb RBSWAP, RA0, RTMP0; /* be => le */
+
+ vpcmpeqd RNOT, RNOT, RNOT;
+ vpsrldq $8, RNOT, RNOT; /* low: -1, high: 0 */
+
+#define inc_le128(x, minus_one, tmp) \
+ vpcmpeqq minus_one, x, tmp; \
+ vpsubq minus_one, x, x; \
+ vpslldq $8, tmp, tmp; \
+ vpsubq tmp, x, x;
+
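+/*
+ * Scalar equivalent of inc_le128(): minus_one is {lo: -1, hi: 0}, so the
+ * first vpsubq adds 1 to the low qword only, and vpcmpeqq/vpslldq/vpsubq
+ * ripple the carry into the high qword:
+ *
+ *   carry = (x.lo == ~0ULL) ? 1 : 0;
+ *   x.lo += 1;
+ *   x.hi += carry;
+ */
+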
+ /* construct IVs */
+ inc_le128(RTMP0, RNOT, RTMP2); /* +1 */
+ vpshufb RBSWAP, RTMP0, RA1;
+ inc_le128(RTMP0, RNOT, RTMP2); /* +2 */
+ vpshufb RBSWAP, RTMP0, RA2;
+ inc_le128(RTMP0, RNOT, RTMP2); /* +3 */
+ vpshufb RBSWAP, RTMP0, RA3;
+ inc_le128(RTMP0, RNOT, RTMP2); /* +4 */
+ vpshufb RBSWAP, RTMP0, RB0;
+ inc_le128(RTMP0, RNOT, RTMP2); /* +5 */
+ vpshufb RBSWAP, RTMP0, RB1;
+ inc_le128(RTMP0, RNOT, RTMP2); /* +6 */
+ vpshufb RBSWAP, RTMP0, RB2;
+ inc_le128(RTMP0, RNOT, RTMP2); /* +7 */
+ vpshufb RBSWAP, RTMP0, RB3;
+ inc_le128(RTMP0, RNOT, RTMP2); /* +8 */
+ vpshufb RBSWAP, RTMP0, RTMP1;
+
+ /* store new IV */
+ vmovdqu RTMP1, (%rcx);
+
+ call __sm4_crypt_blk8;
+
+ vpxor (0 * 16)(%rdx), RA0, RA0;
+ vpxor (1 * 16)(%rdx), RA1, RA1;
+ vpxor (2 * 16)(%rdx), RA2, RA2;
+ vpxor (3 * 16)(%rdx), RA3, RA3;
+ vpxor (4 * 16)(%rdx), RB0, RB0;
+ vpxor (5 * 16)(%rdx), RB1, RB1;
+ vpxor (6 * 16)(%rdx), RB2, RB2;
+ vpxor (7 * 16)(%rdx), RB3, RB3;
+
+ vmovdqu RA0, (0 * 16)(%rsi);
+ vmovdqu RA1, (1 * 16)(%rsi);
+ vmovdqu RA2, (2 * 16)(%rsi);
+ vmovdqu RA3, (3 * 16)(%rsi);
+ vmovdqu RB0, (4 * 16)(%rsi);
+ vmovdqu RB1, (5 * 16)(%rsi);
+ vmovdqu RB2, (6 * 16)(%rsi);
+ vmovdqu RB3, (7 * 16)(%rsi);
+
+ vzeroall;
+ FRAME_END
+ RET;
+SYM_FUNC_END(sm4_aesni_avx_ctr_enc_blk8)
+
+/*
+ * void sm4_aesni_avx_cbc_dec_blk8(const u32 *rk, u8 *dst,
+ * const u8 *src, u8 *iv)
+ */
+SYM_TYPED_FUNC_START(sm4_aesni_avx_cbc_dec_blk8)
+ /* input:
+ * %rdi: round key array, CTX
+ * %rsi: dst (8 blocks)
+ * %rdx: src (8 blocks)
+ * %rcx: iv
+ */
+ FRAME_BEGIN
+
+ vmovdqu (0 * 16)(%rdx), RA0;
+ vmovdqu (1 * 16)(%rdx), RA1;
+ vmovdqu (2 * 16)(%rdx), RA2;
+ vmovdqu (3 * 16)(%rdx), RA3;
+ vmovdqu (4 * 16)(%rdx), RB0;
+ vmovdqu (5 * 16)(%rdx), RB1;
+ vmovdqu (6 * 16)(%rdx), RB2;
+ vmovdqu (7 * 16)(%rdx), RB3;
+
+ call __sm4_crypt_blk8;
+
+ vmovdqu (7 * 16)(%rdx), RNOT;
+ vpxor (%rcx), RA0, RA0;
+ vpxor (0 * 16)(%rdx), RA1, RA1;
+ vpxor (1 * 16)(%rdx), RA2, RA2;
+ vpxor (2 * 16)(%rdx), RA3, RA3;
+ vpxor (3 * 16)(%rdx), RB0, RB0;
+ vpxor (4 * 16)(%rdx), RB1, RB1;
+ vpxor (5 * 16)(%rdx), RB2, RB2;
+ vpxor (6 * 16)(%rdx), RB3, RB3;
+ vmovdqu RNOT, (%rcx); /* store new IV */
+
+ vmovdqu RA0, (0 * 16)(%rsi);
+ vmovdqu RA1, (1 * 16)(%rsi);
+ vmovdqu RA2, (2 * 16)(%rsi);
+ vmovdqu RA3, (3 * 16)(%rsi);
+ vmovdqu RB0, (4 * 16)(%rsi);
+ vmovdqu RB1, (5 * 16)(%rsi);
+ vmovdqu RB2, (6 * 16)(%rsi);
+ vmovdqu RB3, (7 * 16)(%rsi);
+
+ vzeroall;
+ FRAME_END
+ RET;
+SYM_FUNC_END(sm4_aesni_avx_cbc_dec_blk8)
diff --git a/arch/x86/crypto/sm4-aesni-avx2-asm_64.S b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S
new file mode 100644
index 000000000000..9ff5ba075591
--- /dev/null
+++ b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S
@@ -0,0 +1,441 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SM4 Cipher Algorithm, AES-NI/AVX2 optimized,
+ * as specified in
+ * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
+ *
+ * Copyright (C) 2018 Markku-Juhani O. Saarinen <mjos@iki.fi>
+ * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+
+/* Based on SM4 AES-NI work by libgcrypt and Markku-Juhani O. Saarinen at:
+ * https://github.com/mjosaarinen/sm4ni
+ */
+
+#include <linux/linkage.h>
+#include <linux/cfi_types.h>
+#include <asm/frame.h>
+
+#define rRIP (%rip)
+
+/* vector registers */
+#define RX0 %ymm0
+#define RX1 %ymm1
+#define MASK_4BIT %ymm2
+#define RTMP0 %ymm3
+#define RTMP1 %ymm4
+#define RTMP2 %ymm5
+#define RTMP3 %ymm6
+#define RTMP4 %ymm7
+
+#define RA0 %ymm8
+#define RA1 %ymm9
+#define RA2 %ymm10
+#define RA3 %ymm11
+
+#define RB0 %ymm12
+#define RB1 %ymm13
+#define RB2 %ymm14
+#define RB3 %ymm15
+
+#define RNOT %ymm0
+#define RBSWAP %ymm1
+
+#define RX0x %xmm0
+#define RX1x %xmm1
+#define MASK_4BITx %xmm2
+
+#define RNOTx %xmm0
+#define RBSWAPx %xmm1
+
+#define RTMP0x %xmm3
+#define RTMP1x %xmm4
+#define RTMP2x %xmm5
+#define RTMP3x %xmm6
+#define RTMP4x %xmm7
+
+
+/* helper macros */
+
+/* 4x4 transpose of 32-bit words across x0..x3, independently in each 128-bit lane. */
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x1, x0, x0; \
+ \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x2; \
+ \
+ vpunpckhqdq t1, x0, x1; \
+ vpunpcklqdq t1, x0, x0; \
+ \
+ vpunpckhqdq x2, t2, x3; \
+ vpunpcklqdq x2, t2, x2;
+
+/* pre-SubByte transform. */
+#define transform_pre(x, lo_t, hi_t, mask4bit, tmp0) \
+ vpand x, mask4bit, tmp0; \
+ vpandn x, mask4bit, x; \
+ vpsrld $4, x, x; \
+ \
+ vpshufb tmp0, lo_t, tmp0; \
+ vpshufb x, hi_t, x; \
+ vpxor tmp0, x, x;
+
+/* post-SubByte transform. Note: x has been XOR'ed with mask4bit by
+ * the 'vaesenclast' instruction. */
+#define transform_post(x, lo_t, hi_t, mask4bit, tmp0) \
+ vpandn mask4bit, x, tmp0; \
+ vpsrld $4, x, x; \
+ vpand x, mask4bit, x; \
+ \
+ vpshufb tmp0, lo_t, tmp0; \
+ vpshufb x, hi_t, x; \
+ vpxor tmp0, x, x;
+
+
+.section .rodata.cst16, "aM", @progbits, 16
+.align 16
+
+/*
+ * The following four affine transform look-up tables are from work by
+ * Markku-Juhani O. Saarinen, at https://github.com/mjosaarinen/sm4ni
+ *
+ * These allow exposing SM4 S-Box from AES SubByte.
+ */
+
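+/*
+ * Scalar sketch (hypothetical helper, for clarity only) of the nibble-split
+ * lookup that transform_pre()/transform_post() perform with these tables:
+ *
+ *   u8 affine(u8 x, const u8 lo_t[16], const u8 hi_t[16])
+ *   {
+ *           return lo_t[x & 0x0f] ^ hi_t[x >> 4];
+ *   }
+ *
+ * transform_pre() maps each byte into the AES field, vaesenclast applies
+ * the AES S-box (.Linv_shift_row undoes ShiftRows), and transform_post()
+ * maps the result back into the SM4 field.
+ */
+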
+/* pre-SubByte affine transform, from SM4 field to AES field. */
+.Lpre_tf_lo_s:
+ .quad 0x9197E2E474720701, 0xC7C1B4B222245157
+.Lpre_tf_hi_s:
+ .quad 0xE240AB09EB49A200, 0xF052B91BF95BB012
+
+/* post-SubByte affine transform, from AES field to SM4 field. */
+.Lpost_tf_lo_s:
+ .quad 0x5B67F2CEA19D0834, 0xEDD14478172BBE82
+.Lpost_tf_hi_s:
+ .quad 0xAE7201DD73AFDC00, 0x11CDBE62CC1063BF
+
+/* For isolating SubBytes from AESENCLAST, inverse shift row */
+.Linv_shift_row:
+ .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
+ .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
+
+/* Inverse shift row + Rotate left by 8 bits on 32-bit words with vpshufb */
+.Linv_shift_row_rol_8:
+ .byte 0x07, 0x00, 0x0d, 0x0a, 0x0b, 0x04, 0x01, 0x0e
+ .byte 0x0f, 0x08, 0x05, 0x02, 0x03, 0x0c, 0x09, 0x06
+
+/* Inverse shift row + Rotate left by 16 bits on 32-bit words with vpshufb */
+.Linv_shift_row_rol_16:
+ .byte 0x0a, 0x07, 0x00, 0x0d, 0x0e, 0x0b, 0x04, 0x01
+ .byte 0x02, 0x0f, 0x08, 0x05, 0x06, 0x03, 0x0c, 0x09
+
+/* Inverse shift row + Rotate left by 24 bits on 32-bit words with vpshufb */
+.Linv_shift_row_rol_24:
+ .byte 0x0d, 0x0a, 0x07, 0x00, 0x01, 0x0e, 0x0b, 0x04
+ .byte 0x05, 0x02, 0x0f, 0x08, 0x09, 0x06, 0x03, 0x0c
+
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+/* For input word byte-swap */
+.Lbswap32_mask:
+ .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+
+.align 4
+/* 4-bit mask */
+.L0f0f0f0f:
+ .long 0x0f0f0f0f
+
+/* 12 bytes, only for padding */
+.Lpadding_deadbeef:
+ .long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef
+
+.text
+SYM_FUNC_START_LOCAL(__sm4_crypt_blk16)
+ /* input:
+ * %rdi: round key array, CTX
+ * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
+ * plaintext blocks
+ * output:
+ * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
+ * ciphertext blocks
+ */
+ FRAME_BEGIN
+
+ vbroadcasti128 .Lbswap32_mask rRIP, RTMP2;
+ vpshufb RTMP2, RA0, RA0;
+ vpshufb RTMP2, RA1, RA1;
+ vpshufb RTMP2, RA2, RA2;
+ vpshufb RTMP2, RA3, RA3;
+ vpshufb RTMP2, RB0, RB0;
+ vpshufb RTMP2, RB1, RB1;
+ vpshufb RTMP2, RB2, RB2;
+ vpshufb RTMP2, RB3, RB3;
+
+ vpbroadcastd .L0f0f0f0f rRIP, MASK_4BIT;
+ transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
+ transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
+
+#define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3) \
+ vpbroadcastd (4*(round))(%rdi), RX0; \
+ vbroadcasti128 .Lpre_tf_lo_s rRIP, RTMP4; \
+ vbroadcasti128 .Lpre_tf_hi_s rRIP, RTMP1; \
+ vmovdqa RX0, RX1; \
+ vpxor s1, RX0, RX0; \
+ vpxor s2, RX0, RX0; \
+ vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */ \
+ vbroadcasti128 .Lpost_tf_lo_s rRIP, RTMP2; \
+ vbroadcasti128 .Lpost_tf_hi_s rRIP, RTMP3; \
+ vpxor r1, RX1, RX1; \
+ vpxor r2, RX1, RX1; \
+ vpxor r3, RX1, RX1; /* r1 ^ r2 ^ r3 ^ rk */ \
+ \
+ /* sbox, non-linear part */ \
+ transform_pre(RX0, RTMP4, RTMP1, MASK_4BIT, RTMP0); \
+ transform_pre(RX1, RTMP4, RTMP1, MASK_4BIT, RTMP0); \
+ vextracti128 $1, RX0, RTMP4x; \
+ vextracti128 $1, RX1, RTMP0x; \
+ vaesenclast MASK_4BITx, RX0x, RX0x; \
+ vaesenclast MASK_4BITx, RTMP4x, RTMP4x; \
+ vaesenclast MASK_4BITx, RX1x, RX1x; \
+ vaesenclast MASK_4BITx, RTMP0x, RTMP0x; \
+ vinserti128 $1, RTMP4x, RX0, RX0; \
+ vbroadcasti128 .Linv_shift_row rRIP, RTMP4; \
+ vinserti128 $1, RTMP0x, RX1, RX1; \
+ transform_post(RX0, RTMP2, RTMP3, MASK_4BIT, RTMP0); \
+ transform_post(RX1, RTMP2, RTMP3, MASK_4BIT, RTMP0); \
+ \
+ /* linear part */ \
+ vpshufb RTMP4, RX0, RTMP0; \
+ vpxor RTMP0, s0, s0; /* s0 ^ x */ \
+ vpshufb RTMP4, RX1, RTMP2; \
+ vbroadcasti128 .Linv_shift_row_rol_8 rRIP, RTMP4; \
+ vpxor RTMP2, r0, r0; /* r0 ^ x */ \
+ vpshufb RTMP4, RX0, RTMP1; \
+ vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */ \
+ vpshufb RTMP4, RX1, RTMP3; \
+ vbroadcasti128 .Linv_shift_row_rol_16 rRIP, RTMP4; \
+ vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) */ \
+ vpshufb RTMP4, RX0, RTMP1; \
+ vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \
+ vpshufb RTMP4, RX1, RTMP3; \
+ vbroadcasti128 .Linv_shift_row_rol_24 rRIP, RTMP4; \
+ vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) ^ rol(x,16) */ \
+ vpshufb RTMP4, RX0, RTMP1; \
+ vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */ \
+ vpslld $2, RTMP0, RTMP1; \
+ vpsrld $30, RTMP0, RTMP0; \
+ vpxor RTMP0, s0, s0; \
+ /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
+ vpxor RTMP1, s0, s0; \
+ vpshufb RTMP4, RX1, RTMP3; \
+ vpxor RTMP3, r0, r0; /* r0 ^ x ^ rol(x,24) */ \
+ vpslld $2, RTMP2, RTMP3; \
+ vpsrld $30, RTMP2, RTMP2; \
+ vpxor RTMP2, r0, r0; \
+ /* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
+ vpxor RTMP3, r0, r0;
+
+ leaq (32*4)(%rdi), %rax;
+.align 16
+.Lroundloop_blk8:
+ ROUND(0, RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3);
+ ROUND(1, RA1, RA2, RA3, RA0, RB1, RB2, RB3, RB0);
+ ROUND(2, RA2, RA3, RA0, RA1, RB2, RB3, RB0, RB1);
+ ROUND(3, RA3, RA0, RA1, RA2, RB3, RB0, RB1, RB2);
+ leaq (4*4)(%rdi), %rdi;
+ cmpq %rax, %rdi;
+ jne .Lroundloop_blk8;
+
+#undef ROUND
+
+ vbroadcasti128 .Lbswap128_mask rRIP, RTMP2;
+
+ transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
+ transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
+ vpshufb RTMP2, RA0, RA0;
+ vpshufb RTMP2, RA1, RA1;
+ vpshufb RTMP2, RA2, RA2;
+ vpshufb RTMP2, RA3, RA3;
+ vpshufb RTMP2, RB0, RB0;
+ vpshufb RTMP2, RB1, RB1;
+ vpshufb RTMP2, RB2, RB2;
+ vpshufb RTMP2, RB3, RB3;
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(__sm4_crypt_blk16)
+
+#define inc_le128(x, minus_one, tmp) \
+ vpcmpeqq minus_one, x, tmp; \
+ vpsubq minus_one, x, x; \
+ vpslldq $8, tmp, tmp; \
+ vpsubq tmp, x, x;
+
+/*
+ * void sm4_aesni_avx2_ctr_enc_blk16(const u32 *rk, u8 *dst,
+ * const u8 *src, u8 *iv)
+ */
+SYM_TYPED_FUNC_START(sm4_aesni_avx2_ctr_enc_blk16)
+ /* input:
+ * %rdi: round key array, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv (big endian, 128bit)
+ */
+ FRAME_BEGIN
+
+ movq 8(%rcx), %rax;
+ bswapq %rax;
+
+ vzeroupper;
+
+ vbroadcasti128 .Lbswap128_mask rRIP, RTMP3;
+ vpcmpeqd RNOT, RNOT, RNOT;
+ vpsrldq $8, RNOT, RNOT; /* ab: -1:0 ; cd: -1:0 */
+ vpaddq RNOT, RNOT, RTMP2; /* ab: -2:0 ; cd: -2:0 */
+
+ /* load IV and byteswap */
+ vmovdqu (%rcx), RTMP4x;
+ vpshufb RTMP3x, RTMP4x, RTMP4x;
+ vmovdqa RTMP4x, RTMP0x;
+ inc_le128(RTMP4x, RNOTx, RTMP1x);
+ vinserti128 $1, RTMP4x, RTMP0, RTMP0;
+ vpshufb RTMP3, RTMP0, RA0; /* +1 ; +0 */
+
+ /* check need for handling 64-bit overflow and carry */
+ cmpq $(0xffffffffffffffff - 16), %rax;
+ ja .Lhandle_ctr_carry;
+
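+	/*
+	 * Fast path: each vpsubq of RTMP2 == {lo: -2, hi: 0} adds 2 to the
+	 * low qword of both lanes without carry propagation, which is only
+	 * safe while the low 64 bits cannot wrap within the next 16 blocks -
+	 * hence the compare against 2^64 - 16 above.
+	 */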
+ /* construct IVs */
+ vpsubq RTMP2, RTMP0, RTMP0; /* +3 ; +2 */
+ vpshufb RTMP3, RTMP0, RA1;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +5 ; +4 */
+ vpshufb RTMP3, RTMP0, RA2;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +7 ; +6 */
+ vpshufb RTMP3, RTMP0, RA3;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +9 ; +8 */
+ vpshufb RTMP3, RTMP0, RB0;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +11 ; +10 */
+ vpshufb RTMP3, RTMP0, RB1;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +13 ; +12 */
+ vpshufb RTMP3, RTMP0, RB2;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +15 ; +14 */
+ vpshufb RTMP3, RTMP0, RB3;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +16 */
+ vpshufb RTMP3x, RTMP0x, RTMP0x;
+
+ jmp .Lctr_carry_done;
+
+.Lhandle_ctr_carry:
+ /* construct IVs */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RA1; /* +3 ; +2 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RA2; /* +5 ; +4 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RA3; /* +7 ; +6 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RB0; /* +9 ; +8 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RB1; /* +11 ; +10 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RB2; /* +13 ; +12 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RB3; /* +15 ; +14 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vextracti128 $1, RTMP0, RTMP0x;
+ vpshufb RTMP3x, RTMP0x, RTMP0x; /* +16 */
+
+.align 4
+.Lctr_carry_done:
+ /* store new IV */
+ vmovdqu RTMP0x, (%rcx);
+
+ call __sm4_crypt_blk16;
+
+ vpxor (0 * 32)(%rdx), RA0, RA0;
+ vpxor (1 * 32)(%rdx), RA1, RA1;
+ vpxor (2 * 32)(%rdx), RA2, RA2;
+ vpxor (3 * 32)(%rdx), RA3, RA3;
+ vpxor (4 * 32)(%rdx), RB0, RB0;
+ vpxor (5 * 32)(%rdx), RB1, RB1;
+ vpxor (6 * 32)(%rdx), RB2, RB2;
+ vpxor (7 * 32)(%rdx), RB3, RB3;
+
+ vmovdqu RA0, (0 * 32)(%rsi);
+ vmovdqu RA1, (1 * 32)(%rsi);
+ vmovdqu RA2, (2 * 32)(%rsi);
+ vmovdqu RA3, (3 * 32)(%rsi);
+ vmovdqu RB0, (4 * 32)(%rsi);
+ vmovdqu RB1, (5 * 32)(%rsi);
+ vmovdqu RB2, (6 * 32)(%rsi);
+ vmovdqu RB3, (7 * 32)(%rsi);
+
+ vzeroall;
+ FRAME_END
+ RET;
+SYM_FUNC_END(sm4_aesni_avx2_ctr_enc_blk16)
+
+/*
+ * void sm4_aesni_avx2_cbc_dec_blk16(const u32 *rk, u8 *dst,
+ * const u8 *src, u8 *iv)
+ */
+SYM_TYPED_FUNC_START(sm4_aesni_avx2_cbc_dec_blk16)
+ /* input:
+ * %rdi: round key array, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv
+ */
+ FRAME_BEGIN
+
+ vzeroupper;
+
+ vmovdqu (0 * 32)(%rdx), RA0;
+ vmovdqu (1 * 32)(%rdx), RA1;
+ vmovdqu (2 * 32)(%rdx), RA2;
+ vmovdqu (3 * 32)(%rdx), RA3;
+ vmovdqu (4 * 32)(%rdx), RB0;
+ vmovdqu (5 * 32)(%rdx), RB1;
+ vmovdqu (6 * 32)(%rdx), RB2;
+ vmovdqu (7 * 32)(%rdx), RB3;
+
+ call __sm4_crypt_blk16;
+
+ vmovdqu (%rcx), RNOTx;
+ vinserti128 $1, (%rdx), RNOT, RNOT;
+ vpxor RNOT, RA0, RA0;
+ vpxor (0 * 32 + 16)(%rdx), RA1, RA1;
+ vpxor (1 * 32 + 16)(%rdx), RA2, RA2;
+ vpxor (2 * 32 + 16)(%rdx), RA3, RA3;
+ vpxor (3 * 32 + 16)(%rdx), RB0, RB0;
+ vpxor (4 * 32 + 16)(%rdx), RB1, RB1;
+ vpxor (5 * 32 + 16)(%rdx), RB2, RB2;
+ vpxor (6 * 32 + 16)(%rdx), RB3, RB3;
+ vmovdqu (7 * 32 + 16)(%rdx), RNOTx;
+ vmovdqu RNOTx, (%rcx); /* store new IV */
+
+ vmovdqu RA0, (0 * 32)(%rsi);
+ vmovdqu RA1, (1 * 32)(%rsi);
+ vmovdqu RA2, (2 * 32)(%rsi);
+ vmovdqu RA3, (3 * 32)(%rsi);
+ vmovdqu RB0, (4 * 32)(%rsi);
+ vmovdqu RB1, (5 * 32)(%rsi);
+ vmovdqu RB2, (6 * 32)(%rsi);
+ vmovdqu RB3, (7 * 32)(%rsi);
+
+ vzeroall;
+ FRAME_END
+ RET;
+SYM_FUNC_END(sm4_aesni_avx2_cbc_dec_blk16)
diff --git a/arch/x86/crypto/sm4-avx.h b/arch/x86/crypto/sm4-avx.h
new file mode 100644
index 000000000000..b5b5e67e40ed
--- /dev/null
+++ b/arch/x86/crypto/sm4-avx.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef ASM_X86_SM4_AVX_H
+#define ASM_X86_SM4_AVX_H
+
+#include <linux/types.h>
+#include <crypto/sm4.h>
+
+struct skcipher_request;
+
+typedef void (*sm4_crypt_func)(const u32 *rk, u8 *dst, const u8 *src, u8 *iv);
+
+int sm4_avx_ecb_encrypt(struct skcipher_request *req);
+int sm4_avx_ecb_decrypt(struct skcipher_request *req);
+
+int sm4_cbc_encrypt(struct skcipher_request *req);
+int sm4_avx_cbc_decrypt(struct skcipher_request *req,
+ unsigned int bsize, sm4_crypt_func func);
+
+int sm4_avx_ctr_crypt(struct skcipher_request *req,
+ unsigned int bsize, sm4_crypt_func func);
+
+#endif
diff --git a/arch/x86/crypto/sm4_aesni_avx2_glue.c b/arch/x86/crypto/sm4_aesni_avx2_glue.c
new file mode 100644
index 000000000000..fec0ab7a63dd
--- /dev/null
+++ b/arch/x86/crypto/sm4_aesni_avx2_glue.c
@@ -0,0 +1,134 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * SM4 Cipher Algorithm, AES-NI/AVX2 optimized,
+ * as specified in
+ * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
+ *
+ * Copyright (c) 2021, Alibaba Group.
+ * Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+
+#include <asm/fpu/api.h>
+#include <linux/module.h>
+#include <linux/crypto.h>
+#include <linux/kernel.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/sm4.h>
+#include "sm4-avx.h"
+
+#define SM4_CRYPT16_BLOCK_SIZE (SM4_BLOCK_SIZE * 16)
+
+asmlinkage void sm4_aesni_avx2_ctr_enc_blk16(const u32 *rk, u8 *dst,
+ const u8 *src, u8 *iv);
+asmlinkage void sm4_aesni_avx2_cbc_dec_blk16(const u32 *rk, u8 *dst,
+ const u8 *src, u8 *iv);
+
+static int sm4_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int key_len)
+{
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return sm4_expandkey(ctx, key, key_len);
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ return sm4_avx_cbc_decrypt(req, SM4_CRYPT16_BLOCK_SIZE,
+ sm4_aesni_avx2_cbc_dec_blk16);
+}
+
+static int ctr_crypt(struct skcipher_request *req)
+{
+ return sm4_avx_ctr_crypt(req, SM4_CRYPT16_BLOCK_SIZE,
+ sm4_aesni_avx2_ctr_enc_blk16);
+}
+
+static struct skcipher_alg sm4_aesni_avx2_skciphers[] = {
+ {
+ .base = {
+ .cra_name = "ecb(sm4)",
+ .cra_driver_name = "ecb-sm4-aesni-avx2",
+ .cra_priority = 500,
+ .cra_blocksize = SM4_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .walksize = 16 * SM4_BLOCK_SIZE,
+ .setkey = sm4_skcipher_setkey,
+ .encrypt = sm4_avx_ecb_encrypt,
+ .decrypt = sm4_avx_ecb_decrypt,
+ }, {
+ .base = {
+ .cra_name = "cbc(sm4)",
+ .cra_driver_name = "cbc-sm4-aesni-avx2",
+ .cra_priority = 500,
+ .cra_blocksize = SM4_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .ivsize = SM4_BLOCK_SIZE,
+ .walksize = 16 * SM4_BLOCK_SIZE,
+ .setkey = sm4_skcipher_setkey,
+ .encrypt = sm4_cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ }, {
+ .base = {
+ .cra_name = "ctr(sm4)",
+ .cra_driver_name = "ctr-sm4-aesni-avx2",
+ .cra_priority = 500,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .ivsize = SM4_BLOCK_SIZE,
+ .chunksize = SM4_BLOCK_SIZE,
+ .walksize = 16 * SM4_BLOCK_SIZE,
+ .setkey = sm4_skcipher_setkey,
+ .encrypt = ctr_crypt,
+ .decrypt = ctr_crypt,
+ }
+};
+
+static int __init sm4_init(void)
+{
+ const char *feature_name;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX) ||
+ !boot_cpu_has(X86_FEATURE_AVX2) ||
+ !boot_cpu_has(X86_FEATURE_AES) ||
+ !boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+ pr_info("AVX2 or AES-NI instructions are not detected.\n");
+ return -ENODEV;
+ }
+
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ &feature_name)) {
+ pr_info("CPU feature '%s' is not supported.\n", feature_name);
+ return -ENODEV;
+ }
+
+ return crypto_register_skciphers(sm4_aesni_avx2_skciphers,
+ ARRAY_SIZE(sm4_aesni_avx2_skciphers));
+}
+
+static void __exit sm4_exit(void)
+{
+ crypto_unregister_skciphers(sm4_aesni_avx2_skciphers,
+ ARRAY_SIZE(sm4_aesni_avx2_skciphers));
+}
+
+module_init(sm4_init);
+module_exit(sm4_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Tianjia Zhang <tianjia.zhang@linux.alibaba.com>");
+MODULE_DESCRIPTION("SM4 Cipher Algorithm, AES-NI/AVX2 optimized");
+MODULE_ALIAS_CRYPTO("sm4");
+MODULE_ALIAS_CRYPTO("sm4-aesni-avx2");
diff --git a/arch/x86/crypto/sm4_aesni_avx_glue.c b/arch/x86/crypto/sm4_aesni_avx_glue.c
new file mode 100644
index 000000000000..88caf418a06f
--- /dev/null
+++ b/arch/x86/crypto/sm4_aesni_avx_glue.c
@@ -0,0 +1,349 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * SM4 Cipher Algorithm, AES-NI/AVX optimized,
+ * as specified in
+ * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
+ *
+ * Copyright (c) 2021, Alibaba Group.
+ * Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+
+#include <asm/fpu/api.h>
+#include <linux/module.h>
+#include <linux/crypto.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/sm4.h>
+#include "sm4-avx.h"
+
+#define SM4_CRYPT8_BLOCK_SIZE (SM4_BLOCK_SIZE * 8)
+
+asmlinkage void sm4_aesni_avx_crypt4(const u32 *rk, u8 *dst,
+ const u8 *src, int nblocks);
+asmlinkage void sm4_aesni_avx_crypt8(const u32 *rk, u8 *dst,
+ const u8 *src, int nblocks);
+asmlinkage void sm4_aesni_avx_ctr_enc_blk8(const u32 *rk, u8 *dst,
+ const u8 *src, u8 *iv);
+asmlinkage void sm4_aesni_avx_cbc_dec_blk8(const u32 *rk, u8 *dst,
+ const u8 *src, u8 *iv);
+
+static int sm4_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int key_len)
+{
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return sm4_expandkey(ctx, key, key_len);
+}
+
+static int ecb_do_crypt(struct skcipher_request *req, const u32 *rkey)
+{
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+
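+		/*
+		 * The AVX/AES-NI helpers touch SIMD state, which is only
+		 * valid inside a kernel_fpu_begin()/kernel_fpu_end() section.
+		 */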
+ kernel_fpu_begin();
+ while (nbytes >= SM4_CRYPT8_BLOCK_SIZE) {
+ sm4_aesni_avx_crypt8(rkey, dst, src, 8);
+ dst += SM4_CRYPT8_BLOCK_SIZE;
+ src += SM4_CRYPT8_BLOCK_SIZE;
+ nbytes -= SM4_CRYPT8_BLOCK_SIZE;
+ }
+ while (nbytes >= SM4_BLOCK_SIZE) {
+ unsigned int nblocks = min(nbytes >> 4, 4u);
+ sm4_aesni_avx_crypt4(rkey, dst, src, nblocks);
+ dst += nblocks * SM4_BLOCK_SIZE;
+ src += nblocks * SM4_BLOCK_SIZE;
+ nbytes -= nblocks * SM4_BLOCK_SIZE;
+ }
+ kernel_fpu_end();
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+int sm4_avx_ecb_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return ecb_do_crypt(req, ctx->rkey_enc);
+}
+EXPORT_SYMBOL_GPL(sm4_avx_ecb_encrypt);
+
+int sm4_avx_ecb_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return ecb_do_crypt(req, ctx->rkey_dec);
+}
+EXPORT_SYMBOL_GPL(sm4_avx_ecb_decrypt);
+
+int sm4_cbc_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *iv = walk.iv;
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+
+ while (nbytes >= SM4_BLOCK_SIZE) {
+ crypto_xor_cpy(dst, src, iv, SM4_BLOCK_SIZE);
+ sm4_crypt_block(ctx->rkey_enc, dst, dst);
+ iv = dst;
+ src += SM4_BLOCK_SIZE;
+ dst += SM4_BLOCK_SIZE;
+ nbytes -= SM4_BLOCK_SIZE;
+ }
+ if (iv != walk.iv)
+ memcpy(walk.iv, iv, SM4_BLOCK_SIZE);
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(sm4_cbc_encrypt);
+
+int sm4_avx_cbc_decrypt(struct skcipher_request *req,
+ unsigned int bsize, sm4_crypt_func func)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+
+ kernel_fpu_begin();
+
+ while (nbytes >= bsize) {
+ func(ctx->rkey_dec, dst, src, walk.iv);
+ dst += bsize;
+ src += bsize;
+ nbytes -= bsize;
+ }
+
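+		/*
+		 * Remaining blocks (up to 8 at a time): decrypt into a
+		 * scratch buffer first, then XOR with the preceding
+		 * ciphertext block walking backwards, so that in-place
+		 * (dst == src) requests never overwrite ciphertext that is
+		 * still needed.
+		 */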
+ while (nbytes >= SM4_BLOCK_SIZE) {
+ u8 keystream[SM4_BLOCK_SIZE * 8];
+ u8 iv[SM4_BLOCK_SIZE];
+ unsigned int nblocks = min(nbytes >> 4, 8u);
+ int i;
+
+ sm4_aesni_avx_crypt8(ctx->rkey_dec, keystream,
+ src, nblocks);
+
+ src += ((int)nblocks - 2) * SM4_BLOCK_SIZE;
+ dst += (nblocks - 1) * SM4_BLOCK_SIZE;
+ memcpy(iv, src + SM4_BLOCK_SIZE, SM4_BLOCK_SIZE);
+
+ for (i = nblocks - 1; i > 0; i--) {
+ crypto_xor_cpy(dst, src,
+ &keystream[i * SM4_BLOCK_SIZE],
+ SM4_BLOCK_SIZE);
+ src -= SM4_BLOCK_SIZE;
+ dst -= SM4_BLOCK_SIZE;
+ }
+ crypto_xor_cpy(dst, walk.iv, keystream, SM4_BLOCK_SIZE);
+ memcpy(walk.iv, iv, SM4_BLOCK_SIZE);
+ dst += nblocks * SM4_BLOCK_SIZE;
+ src += (nblocks + 1) * SM4_BLOCK_SIZE;
+ nbytes -= nblocks * SM4_BLOCK_SIZE;
+ }
+
+ kernel_fpu_end();
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(sm4_avx_cbc_decrypt);
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ return sm4_avx_cbc_decrypt(req, SM4_CRYPT8_BLOCK_SIZE,
+ sm4_aesni_avx_cbc_dec_blk8);
+}
+
+int sm4_avx_ctr_crypt(struct skcipher_request *req,
+ unsigned int bsize, sm4_crypt_func func)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+
+ kernel_fpu_begin();
+
+ while (nbytes >= bsize) {
+ func(ctx->rkey_enc, dst, src, walk.iv);
+ dst += bsize;
+ src += bsize;
+ nbytes -= bsize;
+ }
+
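+		/*
+		 * Remaining blocks (up to 8 at a time): materialize the
+		 * counter values in a keystream buffer, encrypt them all
+		 * with the 8-way primitive and XOR into the output.
+		 */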
+ while (nbytes >= SM4_BLOCK_SIZE) {
+ u8 keystream[SM4_BLOCK_SIZE * 8];
+ unsigned int nblocks = min(nbytes >> 4, 8u);
+ int i;
+
+ for (i = 0; i < nblocks; i++) {
+ memcpy(&keystream[i * SM4_BLOCK_SIZE],
+ walk.iv, SM4_BLOCK_SIZE);
+ crypto_inc(walk.iv, SM4_BLOCK_SIZE);
+ }
+ sm4_aesni_avx_crypt8(ctx->rkey_enc, keystream,
+ keystream, nblocks);
+
+ crypto_xor_cpy(dst, src, keystream,
+ nblocks * SM4_BLOCK_SIZE);
+ dst += nblocks * SM4_BLOCK_SIZE;
+ src += nblocks * SM4_BLOCK_SIZE;
+ nbytes -= nblocks * SM4_BLOCK_SIZE;
+ }
+
+ kernel_fpu_end();
+
+		/* tail: last partial block, if any (CTR needs no padding) */
+ if (walk.nbytes == walk.total && nbytes > 0) {
+ u8 keystream[SM4_BLOCK_SIZE];
+
+ memcpy(keystream, walk.iv, SM4_BLOCK_SIZE);
+ crypto_inc(walk.iv, SM4_BLOCK_SIZE);
+
+ sm4_crypt_block(ctx->rkey_enc, keystream, keystream);
+
+ crypto_xor_cpy(dst, src, keystream, nbytes);
+ dst += nbytes;
+ src += nbytes;
+ nbytes = 0;
+ }
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(sm4_avx_ctr_crypt);
+
+static int ctr_crypt(struct skcipher_request *req)
+{
+ return sm4_avx_ctr_crypt(req, SM4_CRYPT8_BLOCK_SIZE,
+ sm4_aesni_avx_ctr_enc_blk8);
+}
+
+static struct skcipher_alg sm4_aesni_avx_skciphers[] = {
+ {
+ .base = {
+ .cra_name = "ecb(sm4)",
+ .cra_driver_name = "ecb-sm4-aesni-avx",
+ .cra_priority = 400,
+ .cra_blocksize = SM4_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .walksize = 8 * SM4_BLOCK_SIZE,
+ .setkey = sm4_skcipher_setkey,
+ .encrypt = sm4_avx_ecb_encrypt,
+ .decrypt = sm4_avx_ecb_decrypt,
+ }, {
+ .base = {
+ .cra_name = "cbc(sm4)",
+ .cra_driver_name = "cbc-sm4-aesni-avx",
+ .cra_priority = 400,
+ .cra_blocksize = SM4_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .ivsize = SM4_BLOCK_SIZE,
+ .walksize = 8 * SM4_BLOCK_SIZE,
+ .setkey = sm4_skcipher_setkey,
+ .encrypt = sm4_cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ }, {
+ .base = {
+ .cra_name = "ctr(sm4)",
+ .cra_driver_name = "ctr-sm4-aesni-avx",
+ .cra_priority = 400,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .ivsize = SM4_BLOCK_SIZE,
+ .chunksize = SM4_BLOCK_SIZE,
+ .walksize = 8 * SM4_BLOCK_SIZE,
+ .setkey = sm4_skcipher_setkey,
+ .encrypt = ctr_crypt,
+ .decrypt = ctr_crypt,
+ }
+};
+
+static int __init sm4_init(void)
+{
+ const char *feature_name;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX) ||
+ !boot_cpu_has(X86_FEATURE_AES) ||
+ !boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+ pr_info("AVX or AES-NI instructions are not detected.\n");
+ return -ENODEV;
+ }
+
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ &feature_name)) {
+ pr_info("CPU feature '%s' is not supported.\n", feature_name);
+ return -ENODEV;
+ }
+
+ return crypto_register_skciphers(sm4_aesni_avx_skciphers,
+ ARRAY_SIZE(sm4_aesni_avx_skciphers));
+}
+
+static void __exit sm4_exit(void)
+{
+ crypto_unregister_skciphers(sm4_aesni_avx_skciphers,
+ ARRAY_SIZE(sm4_aesni_avx_skciphers));
+}
+
+module_init(sm4_init);
+module_exit(sm4_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Tianjia Zhang <tianjia.zhang@linux.alibaba.com>");
+MODULE_DESCRIPTION("SM4 Cipher Algorithm, AES-NI/AVX optimized");
+MODULE_ALIAS_CRYPTO("sm4");
+MODULE_ALIAS_CRYPTO("sm4-aesni-avx");
diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
new file mode 100644
index 000000000000..12fde271cd3f
--- /dev/null
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -0,0 +1,374 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Twofish Cipher 8-way parallel algorithm (AVX/x86_64)
+ *
+ * Copyright (C) 2012 Johannes Goetzfried
+ * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include "glue_helper-asm-avx.S"
+
+.file "twofish-avx-x86_64-asm_64.S"
+
+.section .rodata.cst16.bswap128_mask, "aM", @progbits, 16
+.align 16
+.Lbswap128_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+.text
+
+/* structure of crypto context */
+#define s0 0
+#define s1 1024
+#define s2 2048
+#define s3 3072
+#define w 4096
+#define k 4128
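+
+/*
+ * Byte offsets into struct twofish_ctx: four key-dependent 256-entry u32
+ * s-box tables (1024 bytes each), the w[8] whitening words at 4096 and
+ * the k[32] round subkeys at 4128.
+ */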
+
+/**********************************************************************
+ 8-way AVX twofish
+ **********************************************************************/
+#define CTX %rdi
+
+#define RA1 %xmm0
+#define RB1 %xmm1
+#define RC1 %xmm2
+#define RD1 %xmm3
+
+#define RA2 %xmm4
+#define RB2 %xmm5
+#define RC2 %xmm6
+#define RD2 %xmm7
+
+#define RX0 %xmm8
+#define RY0 %xmm9
+
+#define RX1 %xmm10
+#define RY1 %xmm11
+
+#define RK1 %xmm12
+#define RK2 %xmm13
+
+#define RT %xmm14
+#define RR %xmm15
+
+#define RID1 %r13
+#define RID1d %r13d
+#define RID2 %rsi
+#define RID2d %esi
+
+#define RGI1 %rdx
+#define RGI1bl %dl
+#define RGI1bh %dh
+#define RGI2 %rcx
+#define RGI2bl %cl
+#define RGI2bh %ch
+
+#define RGI3 %rax
+#define RGI3bl %al
+#define RGI3bh %ah
+#define RGI4 %rbx
+#define RGI4bl %bl
+#define RGI4bh %bh
+
+#define RGS1 %r8
+#define RGS1d %r8d
+#define RGS2 %r9
+#define RGS2d %r9d
+#define RGS3 %r10
+#define RGS3d %r10d
+
+
+#define lookup_32bit(t0, t1, t2, t3, src, dst, interleave_op, il_reg) \
+ movzbl src ## bl, RID1d; \
+ movzbl src ## bh, RID2d; \
+ shrq $16, src; \
+ movl t0(CTX, RID1, 4), dst ## d; \
+ movl t1(CTX, RID2, 4), RID2d; \
+ movzbl src ## bl, RID1d; \
+ xorl RID2d, dst ## d; \
+ movzbl src ## bh, RID2d; \
+ interleave_op(il_reg); \
+ xorl t2(CTX, RID1, 4), dst ## d; \
+ xorl t3(CTX, RID2, 4), dst ## d;
+
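+/*
+ * Scalar equivalent of lookup_32bit() for one 32-bit input with bytes
+ * b0..b3 (low to high):
+ *
+ *   dst = t0[b0] ^ t1[b1] ^ t2[b2] ^ t3[b3];
+ *
+ * i.e. Twofish's key-dependent g() over the precomputed s-box tables;
+ * interleave_op() folds in the shift that exposes the caller's next
+ * input bytes.
+ */
+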
+#define dummy(d) /* do nothing */
+
+#define shr_next(reg) \
+ shrq $16, reg;
+
+#define G(gi1, gi2, x, t0, t1, t2, t3) \
+ lookup_32bit(t0, t1, t2, t3, ##gi1, RGS1, shr_next, ##gi1); \
+ lookup_32bit(t0, t1, t2, t3, ##gi2, RGS3, shr_next, ##gi2); \
+ \
+ lookup_32bit(t0, t1, t2, t3, ##gi1, RGS2, dummy, none); \
+ shlq $32, RGS2; \
+ orq RGS1, RGS2; \
+ lookup_32bit(t0, t1, t2, t3, ##gi2, RGS1, dummy, none); \
+ shlq $32, RGS1; \
+ orq RGS1, RGS3;
+
+#define round_head_2(a, b, x1, y1, x2, y2) \
+ vmovq b ## 1, RGI3; \
+ vpextrq $1, b ## 1, RGI4; \
+ \
+ G(RGI1, RGI2, x1, s0, s1, s2, s3); \
+ vmovq a ## 2, RGI1; \
+ vpextrq $1, a ## 2, RGI2; \
+ vmovq RGS2, x1; \
+ vpinsrq $1, RGS3, x1, x1; \
+ \
+ G(RGI3, RGI4, y1, s1, s2, s3, s0); \
+ vmovq b ## 2, RGI3; \
+ vpextrq $1, b ## 2, RGI4; \
+ vmovq RGS2, y1; \
+ vpinsrq $1, RGS3, y1, y1; \
+ \
+ G(RGI1, RGI2, x2, s0, s1, s2, s3); \
+ vmovq RGS2, x2; \
+ vpinsrq $1, RGS3, x2, x2; \
+ \
+ G(RGI3, RGI4, y2, s1, s2, s3, s0); \
+ vmovq RGS2, y2; \
+ vpinsrq $1, RGS3, y2, y2;
+
+#define encround_tail(a, b, c, d, x, y, prerotate) \
+ vpaddd x, y, x; \
+ vpaddd x, RK1, RT;\
+ prerotate(b); \
+ vpxor RT, c, c; \
+ vpaddd y, x, y; \
+ vpaddd y, RK2, y; \
+ vpsrld $1, c, RT; \
+ vpslld $(32 - 1), c, c; \
+ vpor c, RT, c; \
+ vpxor d, y, d; \
+
+#define decround_tail(a, b, c, d, x, y, prerotate) \
+ vpaddd x, y, x; \
+ vpaddd x, RK1, RT;\
+ prerotate(a); \
+ vpxor RT, c, c; \
+ vpaddd y, x, y; \
+ vpaddd y, RK2, y; \
+ vpxor d, y, d; \
+ vpsrld $1, d, y; \
+ vpslld $(32 - 1), d, d; \
+ vpor d, y, d; \
+
+#define rotate_1l(x) \
+ vpslld $1, x, RR; \
+ vpsrld $(32 - 1), x, x; \
+ vpor x, RR, x;
+
+#define preload_rgi(c) \
+ vmovq c, RGI1; \
+ vpextrq $1, c, RGI2;
+
+#define encrypt_round(n, a, b, c, d, preload, prerotate) \
+ vbroadcastss (k+4*(2*(n)))(CTX), RK1; \
+ vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \
+ round_head_2(a, b, RX0, RY0, RX1, RY1); \
+ encround_tail(a ## 1, b ## 1, c ## 1, d ## 1, RX0, RY0, prerotate); \
+ preload(c ## 1); \
+ encround_tail(a ## 2, b ## 2, c ## 2, d ## 2, RX1, RY1, prerotate);
+
+#define decrypt_round(n, a, b, c, d, preload, prerotate) \
+ vbroadcastss (k+4*(2*(n)))(CTX), RK1; \
+ vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \
+ round_head_2(a, b, RX0, RY0, RX1, RY1); \
+ decround_tail(a ## 1, b ## 1, c ## 1, d ## 1, RX0, RY0, prerotate); \
+ preload(c ## 1); \
+ decround_tail(a ## 2, b ## 2, c ## 2, d ## 2, RX1, RY1, prerotate);
+
+#define encrypt_cycle(n) \
+ encrypt_round((2*n), RA, RB, RC, RD, preload_rgi, rotate_1l); \
+ encrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l);
+
+#define encrypt_cycle_last(n) \
+ encrypt_round((2*n), RA, RB, RC, RD, preload_rgi, rotate_1l); \
+ encrypt_round(((2*n) + 1), RC, RD, RA, RB, dummy, dummy);
+
+#define decrypt_cycle(n) \
+ decrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l); \
+ decrypt_round((2*n), RA, RB, RC, RD, preload_rgi, rotate_1l);
+
+#define decrypt_cycle_last(n) \
+ decrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l); \
+ decrypt_round((2*n), RA, RB, RC, RD, dummy, dummy);
+
+#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ vpunpckldq x1, x0, t0; \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x3; \
+ \
+ vpunpcklqdq t1, t0, x0; \
+ vpunpckhqdq t1, t0, x1; \
+ vpunpcklqdq x3, t2, x2; \
+ vpunpckhqdq x3, t2, x3;
+
+#define inpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
+ vpxor x0, wkey, x0; \
+ vpxor x1, wkey, x1; \
+ vpxor x2, wkey, x2; \
+ vpxor x3, wkey, x3; \
+ \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+#define outunpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ \
+ vpxor x0, wkey, x0; \
+ vpxor x1, wkey, x1; \
+ vpxor x2, wkey, x2; \
+ vpxor x3, wkey, x3;
+
+SYM_FUNC_START_LOCAL(__twofish_enc_blk8)
+ /* input:
+ * %rdi: ctx, CTX
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
+ * output:
+ * RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
+ */
+
+ vmovdqu w(CTX), RK1;
+
+ pushq %r13;
+ pushq %rbx;
+ pushq %rcx;
+
+ inpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
+ preload_rgi(RA1);
+ rotate_1l(RD1);
+ inpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
+ rotate_1l(RD2);
+
+ encrypt_cycle(0);
+ encrypt_cycle(1);
+ encrypt_cycle(2);
+ encrypt_cycle(3);
+ encrypt_cycle(4);
+ encrypt_cycle(5);
+ encrypt_cycle(6);
+ encrypt_cycle_last(7);
+
+ vmovdqu (w+4*4)(CTX), RK1;
+
+ popq %rcx;
+ popq %rbx;
+ popq %r13;
+
+ outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
+ outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
+
+ RET;
+SYM_FUNC_END(__twofish_enc_blk8)
+
+SYM_FUNC_START_LOCAL(__twofish_dec_blk8)
+ /* input:
+ * %rdi: ctx, CTX
+ * RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
+ * output:
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
+ */
+
+ vmovdqu (w+4*4)(CTX), RK1;
+
+ pushq %r13;
+ pushq %rbx;
+
+ inpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
+ preload_rgi(RC1);
+ rotate_1l(RA1);
+ inpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
+ rotate_1l(RA2);
+
+ decrypt_cycle(7);
+ decrypt_cycle(6);
+ decrypt_cycle(5);
+ decrypt_cycle(4);
+ decrypt_cycle(3);
+ decrypt_cycle(2);
+ decrypt_cycle(1);
+ decrypt_cycle_last(0);
+
+ vmovdqu (w)(CTX), RK1;
+
+ popq %rbx;
+ popq %r13;
+
+ outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
+ outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
+
+ RET;
+SYM_FUNC_END(__twofish_dec_blk8)
+
+SYM_FUNC_START(twofish_ecb_enc_8way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+
+ movq %rsi, %r11;
+
+ load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ call __twofish_enc_blk8;
+
+ store_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(twofish_ecb_enc_8way)
+
+SYM_FUNC_START(twofish_ecb_dec_8way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+
+ movq %rsi, %r11;
+
+ load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
+
+ call __twofish_dec_blk8;
+
+ store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(twofish_ecb_dec_8way)
+
+SYM_FUNC_START(twofish_cbc_dec_8way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+
+ pushq %r12;
+
+ movq %rsi, %r11;
+ movq %rdx, %r12;
+
+ load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
+
+ call __twofish_dec_blk8;
+
+ store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ popq %r12;
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(twofish_cbc_dec_8way)
diff --git a/arch/x86/crypto/twofish-i586-asm_32.S b/arch/x86/crypto/twofish-i586-asm_32.S
index 39b98ed2c1b9..3abcad661884 100644
--- a/arch/x86/crypto/twofish-i586-asm_32.S
+++ b/arch/x86/crypto/twofish-i586-asm_32.S
@@ -1,32 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/***************************************************************************
* Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de> *
* *
-* This program is free software; you can redistribute it and/or modify *
-* it under the terms of the GNU General Public License as published by *
-* the Free Software Foundation; either version 2 of the License, or *
-* (at your option) any later version. *
-* *
-* This program is distributed in the hope that it will be useful, *
-* but WITHOUT ANY WARRANTY; without even the implied warranty of *
-* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
-* GNU General Public License for more details. *
-* *
-* You should have received a copy of the GNU General Public License *
-* along with this program; if not, write to the *
-* Free Software Foundation, Inc., *
-* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
***************************************************************************/
.file "twofish-i586-asm.S"
.text
+#include <linux/linkage.h>
#include <asm/asm-offsets.h>
-/* return adress at 0 */
+/* return address at 0 */
#define in_blk 12 /* input byte array address parameter*/
#define out_blk 8 /* output byte array address parameter*/
-#define tfm 4 /* Twofish context structure */
+#define ctx 4 /* Twofish context structure */
#define a_offset 0
#define b_offset 4
@@ -219,19 +207,15 @@
xor %esi, d ## D;\
ror $1, d ## D;
-.align 4
-.global twofish_enc_blk
-.global twofish_dec_blk
-
-twofish_enc_blk:
+SYM_FUNC_START(twofish_enc_blk)
push %ebp /* save registers according to calling convention*/
push %ebx
push %esi
push %edi
- mov tfm + 16(%esp), %ebp /* abuse the base pointer: set new base bointer to the crypto tfm */
- add $crypto_tfm_ctx_offset, %ebp /* ctx adress */
- mov in_blk+16(%esp),%edi /* input adress in edi */
+ mov ctx + 16(%esp), %ebp /* abuse the base pointer: set new base
+ * pointer to the ctx address */
+ mov in_blk+16(%esp),%edi /* input address in edi */
mov (%edi), %eax
mov b_offset(%edi), %ebx
@@ -276,18 +260,19 @@ twofish_enc_blk:
pop %ebx
pop %ebp
mov $1, %eax
- ret
+ RET
+SYM_FUNC_END(twofish_enc_blk)
-twofish_dec_blk:
+SYM_FUNC_START(twofish_dec_blk)
push %ebp /* save registers according to calling convention*/
push %ebx
push %esi
push %edi
- mov tfm + 16(%esp), %ebp /* abuse the base pointer: set new base bointer to the crypto tfm */
- add $crypto_tfm_ctx_offset, %ebp /* ctx adress */
- mov in_blk+16(%esp),%edi /* input adress in edi */
+ mov ctx + 16(%esp), %ebp /* abuse the base pointer: set new base
+ * pointer to the ctx address */
+ mov in_blk+16(%esp),%edi /* input address in edi */
mov (%edi), %eax
mov b_offset(%edi), %ebx
@@ -332,4 +317,5 @@ twofish_dec_blk:
pop %ebx
pop %ebp
mov $1, %eax
- ret
+ RET
+SYM_FUNC_END(twofish_dec_blk)
diff --git a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
new file mode 100644
index 000000000000..071e90e7f0d8
--- /dev/null
+++ b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
@@ -0,0 +1,306 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Twofish Cipher 3-way parallel algorithm (x86_64)
+ *
+ * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ */
+
+#include <linux/linkage.h>
+#include <linux/cfi_types.h>
+
+.file "twofish-x86_64-asm-3way.S"
+.text
+
+/* structure of crypto context */
+#define s0 0
+#define s1 1024
+#define s2 2048
+#define s3 3072
+#define w 4096
+#define k 4128
+
+/**********************************************************************
+ 3-way twofish
+ **********************************************************************/
+#define CTX %rdi
+#define RIO %rdx
+
+#define RAB0 %rax
+#define RAB1 %rbx
+#define RAB2 %rcx
+
+#define RAB0d %eax
+#define RAB1d %ebx
+#define RAB2d %ecx
+
+#define RAB0bh %ah
+#define RAB1bh %bh
+#define RAB2bh %ch
+
+#define RAB0bl %al
+#define RAB1bl %bl
+#define RAB2bl %cl
+
+#define CD0 0x0(%rsp)
+#define CD1 0x8(%rsp)
+#define CD2 0x10(%rsp)
+
+# used only before/after all rounds
+#define RCD0 %r8
+#define RCD1 %r9
+#define RCD2 %r10
+
+# used only during rounds
+#define RX0 %r8
+#define RX1 %r9
+#define RX2 %r10
+
+#define RX0d %r8d
+#define RX1d %r9d
+#define RX2d %r10d
+
+#define RY0 %r11
+#define RY1 %r12
+#define RY2 %r13
+
+#define RY0d %r11d
+#define RY1d %r12d
+#define RY2d %r13d
+
+#define RT0 %rdx
+#define RT1 %rsi
+
+#define RT0d %edx
+#define RT1d %esi
+
+#define RT1bl %sil
+
+#define do16bit_ror(rot, op1, op2, T0, T1, tmp1, tmp2, ab, dst) \
+ movzbl ab ## bl, tmp2 ## d; \
+ movzbl ab ## bh, tmp1 ## d; \
+ rorq $(rot), ab; \
+ op1##l T0(CTX, tmp2, 4), dst ## d; \
+ op2##l T1(CTX, tmp1, 4), dst ## d;
+
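+/*
+ * Each do16bit_ror() invocation consumes two s-box indices from the low
+ * bytes of 'ab' (addressed as %al/%ah-style subregisters), rotates 'ab'
+ * so the next byte pair becomes addressable, and mov/xor-combines the
+ * two looked-up words into dst - two bytes' worth of g() per macro.
+ */
+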
+#define swap_ab_with_cd(ab, cd, tmp) \
+ movq cd, tmp; \
+ movq ab, cd; \
+ movq tmp, ab;
+
+/*
+ * Combined G1 & G2 function. Reordered with help of rotates to have moves
+ * at beginning.
+ */
+#define g1g2_3(ab, cd, Tx0, Tx1, Tx2, Tx3, Ty0, Ty1, Ty2, Ty3, x, y) \
+ /* G1,1 && G2,1 */ \
+ do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 0, ab ## 0, x ## 0); \
+ do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 0, ab ## 0, y ## 0); \
+ \
+ do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 1, ab ## 1, x ## 1); \
+ do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 1, ab ## 1, y ## 1); \
+ \
+ do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 2, ab ## 2, x ## 2); \
+ do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 2, ab ## 2, y ## 2); \
+ \
+ /* G1,2 && G2,2 */ \
+ do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 0, x ## 0); \
+ do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 0, y ## 0); \
+ swap_ab_with_cd(ab ## 0, cd ## 0, RT0); \
+ \
+ do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 1, x ## 1); \
+ do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 1, y ## 1); \
+ swap_ab_with_cd(ab ## 1, cd ## 1, RT0); \
+ \
+ do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 2, x ## 2); \
+ do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 2, y ## 2); \
+ swap_ab_with_cd(ab ## 2, cd ## 2, RT0);
+
+#define enc_round_end(ab, x, y, n) \
+ addl y ## d, x ## d; \
+ addl x ## d, y ## d; \
+ addl k+4*(2*(n))(CTX), x ## d; \
+ xorl ab ## d, x ## d; \
+ addl k+4*(2*(n)+1)(CTX), y ## d; \
+ shrq $32, ab; \
+ roll $1, ab ## d; \
+ xorl y ## d, ab ## d; \
+ shlq $32, ab; \
+ rorl $1, x ## d; \
+ orq x, ab;
+
+#define dec_round_end(ba, x, y, n) \
+ addl y ## d, x ## d; \
+ addl x ## d, y ## d; \
+ addl k+4*(2*(n))(CTX), x ## d; \
+ addl k+4*(2*(n)+1)(CTX), y ## d; \
+ xorl ba ## d, y ## d; \
+ shrq $32, ba; \
+ roll $1, ba ## d; \
+ xorl x ## d, ba ## d; \
+ shlq $32, ba; \
+ rorl $1, y ## d; \
+ orq y, ba;
+
+#define encrypt_round3(ab, cd, n) \
+ g1g2_3(ab, cd, s0, s1, s2, s3, s0, s1, s2, s3, RX, RY); \
+ \
+ enc_round_end(ab ## 0, RX0, RY0, n); \
+ enc_round_end(ab ## 1, RX1, RY1, n); \
+ enc_round_end(ab ## 2, RX2, RY2, n);
+
+#define decrypt_round3(ba, dc, n) \
+ g1g2_3(ba, dc, s1, s2, s3, s0, s3, s0, s1, s2, RY, RX); \
+ \
+ dec_round_end(ba ## 0, RX0, RY0, n); \
+ dec_round_end(ba ## 1, RX1, RY1, n); \
+ dec_round_end(ba ## 2, RX2, RY2, n);
+
+#define encrypt_cycle3(ab, cd, n) \
+ encrypt_round3(ab, cd, n*2); \
+ encrypt_round3(ab, cd, (n*2)+1);
+
+#define decrypt_cycle3(ba, dc, n) \
+ decrypt_round3(ba, dc, (n*2)+1); \
+ decrypt_round3(ba, dc, (n*2));
+
+#define push_cd() \
+ pushq RCD2; \
+ pushq RCD1; \
+ pushq RCD0;
+
+#define pop_cd() \
+ popq RCD0; \
+ popq RCD1; \
+ popq RCD2;
+
+#define inpack3(in, n, xy, m) \
+ movq 4*(n)(in), xy ## 0; \
+ xorq w+4*m(CTX), xy ## 0; \
+ \
+ movq 4*(4+(n))(in), xy ## 1; \
+ xorq w+4*m(CTX), xy ## 1; \
+ \
+ movq 4*(8+(n))(in), xy ## 2; \
+ xorq w+4*m(CTX), xy ## 2;
+
+#define outunpack3(op, out, n, xy, m) \
+ xorq w+4*m(CTX), xy ## 0; \
+ op ## q xy ## 0, 4*(n)(out); \
+ \
+ xorq w+4*m(CTX), xy ## 1; \
+ op ## q xy ## 1, 4*(4+(n))(out); \
+ \
+ xorq w+4*m(CTX), xy ## 2; \
+ op ## q xy ## 2, 4*(8+(n))(out);
+
+#define inpack_enc3() \
+ inpack3(RIO, 0, RAB, 0); \
+ inpack3(RIO, 2, RCD, 2);
+
+#define outunpack_enc3(op) \
+ outunpack3(op, RIO, 2, RAB, 6); \
+ outunpack3(op, RIO, 0, RCD, 4);
+
+#define inpack_dec3() \
+ inpack3(RIO, 0, RAB, 4); \
+ rorq $32, RAB0; \
+ rorq $32, RAB1; \
+ rorq $32, RAB2; \
+ inpack3(RIO, 2, RCD, 6); \
+ rorq $32, RCD0; \
+ rorq $32, RCD1; \
+ rorq $32, RCD2;
+
+#define outunpack_dec3() \
+ rorq $32, RCD0; \
+ rorq $32, RCD1; \
+ rorq $32, RCD2; \
+ outunpack3(mov, RIO, 0, RCD, 0); \
+ rorq $32, RAB0; \
+ rorq $32, RAB1; \
+ rorq $32, RAB2; \
+ outunpack3(mov, RIO, 2, RAB, 2);
+
+SYM_TYPED_FUNC_START(__twofish_enc_blk_3way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src, RIO
+ * %rcx: bool, if true: xor output
+ */
+ pushq %r13;
+ pushq %r12;
+ pushq %rbx;
+
+ pushq %rcx; /* bool xor */
+ pushq %rsi; /* dst */
+
+ inpack_enc3();
+
+ push_cd();
+ encrypt_cycle3(RAB, CD, 0);
+ encrypt_cycle3(RAB, CD, 1);
+ encrypt_cycle3(RAB, CD, 2);
+ encrypt_cycle3(RAB, CD, 3);
+ encrypt_cycle3(RAB, CD, 4);
+ encrypt_cycle3(RAB, CD, 5);
+ encrypt_cycle3(RAB, CD, 6);
+ encrypt_cycle3(RAB, CD, 7);
+ pop_cd();
+
+ popq RIO; /* dst */
+ popq RT1; /* bool xor */
+
+ testb RT1bl, RT1bl;
+ jnz .L__enc_xor3;
+
+ outunpack_enc3(mov);
+
+ popq %rbx;
+ popq %r12;
+ popq %r13;
+ RET;
+
+.L__enc_xor3:
+ outunpack_enc3(xor);
+
+ popq %rbx;
+ popq %r12;
+ popq %r13;
+ RET;
+SYM_FUNC_END(__twofish_enc_blk_3way)
+
+SYM_TYPED_FUNC_START(twofish_dec_blk_3way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src, RIO
+ */
+ pushq %r13;
+ pushq %r12;
+ pushq %rbx;
+
+ pushq %rsi; /* dst */
+
+ inpack_dec3();
+
+ push_cd();
+ decrypt_cycle3(RAB, CD, 7);
+ decrypt_cycle3(RAB, CD, 6);
+ decrypt_cycle3(RAB, CD, 5);
+ decrypt_cycle3(RAB, CD, 4);
+ decrypt_cycle3(RAB, CD, 3);
+ decrypt_cycle3(RAB, CD, 2);
+ decrypt_cycle3(RAB, CD, 1);
+ decrypt_cycle3(RAB, CD, 0);
+ pop_cd();
+
+ popq RIO; /* dst */
+
+ outunpack_dec3();
+
+ popq %rbx;
+ popq %r12;
+ popq %r13;
+ RET;
+SYM_FUNC_END(twofish_dec_blk_3way)
diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S
index 35974a586615..e08b4ba07b93 100644
--- a/arch/x86/crypto/twofish-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-x86_64-asm_64.S
@@ -1,25 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/***************************************************************************
* Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de> *
* *
-* This program is free software; you can redistribute it and/or modify *
-* it under the terms of the GNU General Public License as published by *
-* the Free Software Foundation; either version 2 of the License, or *
-* (at your option) any later version. *
-* *
-* This program is distributed in the hope that it will be useful, *
-* but WITHOUT ANY WARRANTY; without even the implied warranty of *
-* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
-* GNU General Public License for more details. *
-* *
-* You should have received a copy of the GNU General Public License *
-* along with this program; if not, write to the *
-* Free Software Foundation, Inc., *
-* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
***************************************************************************/
.file "twofish-x86_64-asm.S"
.text
+#include <linux/linkage.h>
+#include <linux/cfi_types.h>
#include <asm/asm-offsets.h>
#define a_offset 0
@@ -214,18 +203,13 @@
xor %r8d, d ## D;\
ror $1, d ## D;
-.align 8
-.global twofish_enc_blk
-.global twofish_dec_blk
-
-twofish_enc_blk:
+SYM_TYPED_FUNC_START(twofish_enc_blk)
pushq R1
- /* %rdi contains the crypto tfm adress */
- /* %rsi contains the output adress */
- /* %rdx contains the input adress */
- add $crypto_tfm_ctx_offset, %rdi /* set ctx adress */
- /* ctx adress is moved to free one non-rex register
+ /* %rdi contains the ctx address */
+ /* %rsi contains the output address */
+ /* %rdx contains the input address */
+ /* ctx address is moved to free one non-rex register
as target for the 8bit high operations */
mov %rdi, %r11
@@ -268,17 +252,17 @@ twofish_enc_blk:
movq R1, 8(%rsi)
popq R1
- movq $1,%rax
- ret
+ movl $1,%eax
+ RET
+SYM_FUNC_END(twofish_enc_blk)
-twofish_dec_blk:
+SYM_TYPED_FUNC_START(twofish_dec_blk)
pushq R1
- /* %rdi contains the crypto tfm adress */
- /* %rsi contains the output adress */
- /* %rdx contains the input adress */
- add $crypto_tfm_ctx_offset, %rdi /* set ctx adress */
- /* ctx adress is moved to free one non-rex register
+ /* %rdi contains the ctx address */
+ /* %rsi contains the output address */
+ /* %rdx contains the input address */
+ /* ctx address is moved to free one non-rex register
as target for the 8bit high operations */
mov %rdi, %r11
@@ -320,5 +304,6 @@ twofish_dec_blk:
movq R1, 8(%rsi)
popq R1
- movq $1,%rax
- ret
+ movl $1,%eax
+ RET
+SYM_FUNC_END(twofish_dec_blk)
diff --git a/arch/x86/crypto/twofish.h b/arch/x86/crypto/twofish.h
new file mode 100644
index 000000000000..12df400e6d53
--- /dev/null
+++ b/arch/x86/crypto/twofish.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ASM_X86_TWOFISH_H
+#define ASM_X86_TWOFISH_H
+
+#include <linux/crypto.h>
+#include <crypto/twofish.h>
+#include <crypto/b128ops.h>
+
+/* regular block cipher functions from twofish_x86_64 module */
+asmlinkage void twofish_enc_blk(const void *ctx, u8 *dst, const u8 *src);
+asmlinkage void twofish_dec_blk(const void *ctx, u8 *dst, const u8 *src);
+
+/* 3-way parallel cipher functions */
+asmlinkage void __twofish_enc_blk_3way(const void *ctx, u8 *dst, const u8 *src,
+ bool xor);
+asmlinkage void twofish_dec_blk_3way(const void *ctx, u8 *dst, const u8 *src);
+
+/* helpers from twofish_x86_64-3way module */
+extern void twofish_dec_blk_cbc_3way(const void *ctx, u8 *dst, const u8 *src);
+
+#endif /* ASM_X86_TWOFISH_H */
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
new file mode 100644
index 000000000000..9e20db013750
--- /dev/null
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Glue Code for AVX assembler version of Twofish Cipher
+ *
+ * Copyright (C) 2012 Johannes Goetzfried
+ * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
+ *
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/twofish.h>
+
+#include "twofish.h"
+#include "ecb_cbc_helpers.h"
+
+#define TWOFISH_PARALLEL_BLOCKS 8
+
+/* 8-way parallel cipher functions */
+asmlinkage void twofish_ecb_enc_8way(const void *ctx, u8 *dst, const u8 *src);
+asmlinkage void twofish_ecb_dec_8way(const void *ctx, u8 *dst, const u8 *src);
+
+asmlinkage void twofish_cbc_dec_8way(const void *ctx, u8 *dst, const u8 *src);
+
+static int twofish_setkey_skcipher(struct crypto_skcipher *tfm,
+ const u8 *key, unsigned int keylen)
+{
+ return twofish_setkey(&tfm->base, key, keylen);
+}
+
+static inline void twofish_enc_blk_3way(const void *ctx, u8 *dst, const u8 *src)
+{
+ __twofish_enc_blk_3way(ctx, dst, src, false);
+}
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+ ECB_WALK_START(req, TF_BLOCK_SIZE, TWOFISH_PARALLEL_BLOCKS);
+ ECB_BLOCK(TWOFISH_PARALLEL_BLOCKS, twofish_ecb_enc_8way);
+ ECB_BLOCK(3, twofish_enc_blk_3way);
+ ECB_BLOCK(1, twofish_enc_blk);
+ ECB_WALK_END();
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+ ECB_WALK_START(req, TF_BLOCK_SIZE, TWOFISH_PARALLEL_BLOCKS);
+ ECB_BLOCK(TWOFISH_PARALLEL_BLOCKS, twofish_ecb_dec_8way);
+ ECB_BLOCK(3, twofish_dec_blk_3way);
+ ECB_BLOCK(1, twofish_dec_blk);
+ ECB_WALK_END();
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+ CBC_WALK_START(req, TF_BLOCK_SIZE, -1);
+ CBC_ENC_BLOCK(twofish_enc_blk);
+ CBC_WALK_END();
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ CBC_WALK_START(req, TF_BLOCK_SIZE, TWOFISH_PARALLEL_BLOCKS);
+ CBC_DEC_BLOCK(TWOFISH_PARALLEL_BLOCKS, twofish_cbc_dec_8way);
+ CBC_DEC_BLOCK(3, twofish_dec_blk_cbc_3way);
+ CBC_DEC_BLOCK(1, twofish_dec_blk);
+ CBC_WALK_END();
+}
+
+static struct skcipher_alg twofish_algs[] = {
+ {
+ .base.cra_name = "ecb(twofish)",
+ .base.cra_driver_name = "ecb-twofish-avx",
+ .base.cra_priority = 400,
+ .base.cra_blocksize = TF_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct twofish_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = TF_MIN_KEY_SIZE,
+ .max_keysize = TF_MAX_KEY_SIZE,
+ .setkey = twofish_setkey_skcipher,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ }, {
+ .base.cra_name = "cbc(twofish)",
+ .base.cra_driver_name = "cbc-twofish-avx",
+ .base.cra_priority = 400,
+ .base.cra_blocksize = TF_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct twofish_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = TF_MIN_KEY_SIZE,
+ .max_keysize = TF_MAX_KEY_SIZE,
+ .ivsize = TF_BLOCK_SIZE,
+ .setkey = twofish_setkey_skcipher,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ },
+};
+
+static int __init twofish_init(void)
+{
+ const char *feature_name;
+
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, &feature_name)) {
+ pr_info("CPU feature '%s' is not supported.\n", feature_name);
+ return -ENODEV;
+ }
+
+ return crypto_register_skciphers(twofish_algs,
+ ARRAY_SIZE(twofish_algs));
+}
+
+static void __exit twofish_exit(void)
+{
+ crypto_unregister_skciphers(twofish_algs, ARRAY_SIZE(twofish_algs));
+}
+
+module_init(twofish_init);
+module_exit(twofish_exit);
+
+MODULE_DESCRIPTION("Twofish Cipher Algorithm, AVX optimized");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_CRYPTO("twofish");
diff --git a/arch/x86/crypto/twofish_glue.c b/arch/x86/crypto/twofish_glue.c
index cefaf8b9aa18..8e9906d36902 100644
--- a/arch/x86/crypto/twofish_glue.c
+++ b/arch/x86/crypto/twofish_glue.c
@@ -38,23 +38,28 @@
* Third Edition.
*/
+#include <crypto/algapi.h>
#include <crypto/twofish.h>
-#include <linux/crypto.h>
+#include <linux/export.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/types.h>
-asmlinkage void twofish_enc_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
-asmlinkage void twofish_dec_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
+asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst,
+ const u8 *src);
+EXPORT_SYMBOL_GPL(twofish_enc_blk);
+asmlinkage void twofish_dec_blk(struct twofish_ctx *ctx, u8 *dst,
+ const u8 *src);
+EXPORT_SYMBOL_GPL(twofish_dec_blk);
static void twofish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
- twofish_enc_blk(tfm, dst, src);
+ twofish_enc_blk(crypto_tfm_ctx(tfm), dst, src);
}
static void twofish_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
- twofish_dec_blk(tfm, dst, src);
+ twofish_dec_blk(crypto_tfm_ctx(tfm), dst, src);
}
static struct crypto_alg alg = {
@@ -64,9 +69,7 @@ static struct crypto_alg alg = {
.cra_flags = CRYPTO_ALG_TYPE_CIPHER,
.cra_blocksize = TF_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct twofish_ctx),
- .cra_alignmask = 3,
.cra_module = THIS_MODULE,
- .cra_list = LIST_HEAD_INIT(alg.cra_list),
.cra_u = {
.cipher = {
.cia_min_keysize = TF_MIN_KEY_SIZE,
@@ -78,20 +81,20 @@ static struct crypto_alg alg = {
}
};
-static int __init init(void)
+static int __init twofish_glue_init(void)
{
return crypto_register_alg(&alg);
}
-static void __exit fini(void)
+static void __exit twofish_glue_fini(void)
{
crypto_unregister_alg(&alg);
}
-module_init(init);
-module_exit(fini);
+module_init(twofish_glue_init);
+module_exit(twofish_glue_fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION ("Twofish Cipher Algorithm, asm optimized");
-MODULE_ALIAS("twofish");
-MODULE_ALIAS("twofish-asm");
+MODULE_ALIAS_CRYPTO("twofish");
+MODULE_ALIAS_CRYPTO("twofish-asm");
diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c
new file mode 100644
index 000000000000..8ad77725bf60
--- /dev/null
+++ b/arch/x86/crypto/twofish_glue_3way.c
@@ -0,0 +1,170 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Glue Code for 3-way parallel assembler optimized version of Twofish
+ *
+ * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ */
+
+#include <asm/cpu_device_id.h>
+#include <crypto/algapi.h>
+#include <crypto/twofish.h>
+#include <linux/crypto.h>
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+#include "twofish.h"
+#include "ecb_cbc_helpers.h"
+
+EXPORT_SYMBOL_GPL(__twofish_enc_blk_3way);
+EXPORT_SYMBOL_GPL(twofish_dec_blk_3way);
+
+static int twofish_setkey_skcipher(struct crypto_skcipher *tfm,
+ const u8 *key, unsigned int keylen)
+{
+ return twofish_setkey(&tfm->base, key, keylen);
+}
+
+static inline void twofish_enc_blk_3way(const void *ctx, u8 *dst, const u8 *src)
+{
+ __twofish_enc_blk_3way(ctx, dst, src, false);
+}
+
+void twofish_dec_blk_cbc_3way(const void *ctx, u8 *dst, const u8 *src)
+{
+ u8 buf[2][TF_BLOCK_SIZE];
+ const u8 *s = src;
+
+ if (dst == src)
+ s = memcpy(buf, src, sizeof(buf));
+ twofish_dec_blk_3way(ctx, dst, src);
+ crypto_xor(dst + TF_BLOCK_SIZE, s, sizeof(buf));
+
+}
+EXPORT_SYMBOL_GPL(twofish_dec_blk_cbc_3way);
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+ ECB_WALK_START(req, TF_BLOCK_SIZE, -1);
+ ECB_BLOCK(3, twofish_enc_blk_3way);
+ ECB_BLOCK(1, twofish_enc_blk);
+ ECB_WALK_END();
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+ ECB_WALK_START(req, TF_BLOCK_SIZE, -1);
+ ECB_BLOCK(3, twofish_dec_blk_3way);
+ ECB_BLOCK(1, twofish_dec_blk);
+ ECB_WALK_END();
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+ CBC_WALK_START(req, TF_BLOCK_SIZE, -1);
+ CBC_ENC_BLOCK(twofish_enc_blk);
+ CBC_WALK_END();
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ CBC_WALK_START(req, TF_BLOCK_SIZE, -1);
+ CBC_DEC_BLOCK(3, twofish_dec_blk_cbc_3way);
+ CBC_DEC_BLOCK(1, twofish_dec_blk);
+ CBC_WALK_END();
+}
+
+static struct skcipher_alg tf_skciphers[] = {
+ {
+ .base.cra_name = "ecb(twofish)",
+ .base.cra_driver_name = "ecb-twofish-3way",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = TF_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct twofish_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = TF_MIN_KEY_SIZE,
+ .max_keysize = TF_MAX_KEY_SIZE,
+ .setkey = twofish_setkey_skcipher,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ }, {
+ .base.cra_name = "cbc(twofish)",
+ .base.cra_driver_name = "cbc-twofish-3way",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = TF_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct twofish_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = TF_MIN_KEY_SIZE,
+ .max_keysize = TF_MAX_KEY_SIZE,
+ .ivsize = TF_BLOCK_SIZE,
+ .setkey = twofish_setkey_skcipher,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ },
+};
+
+static bool is_blacklisted_cpu(void)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return false;
+
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_ATOM_BONNELL:
+ case INTEL_ATOM_BONNELL_MID:
+ case INTEL_ATOM_SALTWELL:
+ /*
+		 * On Atom, twofish-3way is slower than the original assembler
+		 * implementation. Twofish-3way trades off some performance by
+		 * storing blocks in 64-bit registers so that three blocks can
+		 * be processed in parallel. On out-of-order CPUs that
+		 * parallelism gains back more performance than was traded
+		 * off, but Atom does not benefit from it and should be
+		 * blacklisted.
+ */
+ return true;
+ }
+
+ if (boot_cpu_data.x86 == 0x0f) {
+ /*
+		 * On Pentium 4, twofish-3way is slower than the original
+		 * assembler implementation because of the excessive use of
+		 * 64-bit rotates and left-shifts (really slow on P4) needed
+		 * to store and handle a 128-bit block in two 64-bit registers.
+ */
+ return true;
+ }
+
+ return false;
+}
+
+static int force;
+module_param(force, int, 0);
+MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist");
+
+static int __init twofish_3way_init(void)
+{
+ if (!force && is_blacklisted_cpu()) {
+ printk(KERN_INFO
+ "twofish-x86_64-3way: performance on this CPU "
+ "would be suboptimal: disabling "
+ "twofish-x86_64-3way.\n");
+ return -ENODEV;
+ }
+
+ return crypto_register_skciphers(tf_skciphers,
+ ARRAY_SIZE(tf_skciphers));
+}
+
+static void __exit twofish_3way_fini(void)
+{
+ crypto_unregister_skciphers(tf_skciphers, ARRAY_SIZE(tf_skciphers));
+}
+
+module_init(twofish_3way_init);
+module_exit(twofish_3way_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Twofish Cipher Algorithm, 3-way parallel asm optimized");
+MODULE_ALIAS_CRYPTO("twofish");
+MODULE_ALIAS_CRYPTO("twofish-asm");
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
new file mode 100644
index 000000000000..72cae8e0ce85
--- /dev/null
+++ b/arch/x86/entry/Makefile
@@ -0,0 +1,26 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for the x86 low level entry code
+#
+
+KASAN_SANITIZE := n
+UBSAN_SANITIZE := n
+KCOV_INSTRUMENT := n
+
+CFLAGS_REMOVE_syscall_32.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_syscall_64.o = $(CC_FLAGS_FTRACE)
+
+CFLAGS_syscall_32.o += -fno-stack-protector
+CFLAGS_syscall_64.o += -fno-stack-protector
+
+obj-y := entry.o entry_$(BITS).o syscall_$(BITS).o
+
+obj-y += vdso/
+obj-y += vsyscall/
+
+obj-$(CONFIG_PREEMPTION) += thunk.o
+CFLAGS_entry_fred.o += -fno-stack-protector
+CFLAGS_REMOVE_entry_fred.o += -pg $(CC_FLAGS_FTRACE)
+obj-$(CONFIG_X86_FRED) += entry_64_fred.o entry_fred.o
+
+obj-$(CONFIG_IA32_EMULATION) += entry_64_compat.o syscall_32.o
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
new file mode 100644
index 000000000000..77e2d920a640
--- /dev/null
+++ b/arch/x86/entry/calling.h
@@ -0,0 +1,489 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/jump_label.h>
+#include <asm/unwind_hints.h>
+#include <asm/cpufeatures.h>
+#include <asm/page_types.h>
+#include <asm/percpu.h>
+#include <asm/asm-offsets.h>
+#include <asm/processor-flags.h>
+#include <asm/ptrace-abi.h>
+#include <asm/msr.h>
+#include <asm/nospec-branch.h>
+
+/*
+
+ x86 function call convention, 64-bit:
+ -------------------------------------
+ arguments | callee-saved | extra caller-saved | return
+ [callee-clobbered] | | [callee-clobbered] |
+ ---------------------------------------------------------------------------
+ rdi rsi rdx rcx r8-9 | rbx rbp [*] r12-15 | r10-11 | rax, rdx [**]
+
+ ( rsp is obviously invariant across normal function calls. (gcc can 'merge'
+ functions when it sees tail-call optimization possibilities) rflags is
+ clobbered. Leftover arguments are passed over the stack frame.)
+
+ [*] In the frame-pointers case rbp is fixed to the stack frame.
+
+ [**] for struct return values wider than 64 bits the return convention is a
+ bit more complex: up to 128 bits width we return small structures
+ straight in rax, rdx. For structures larger than that (3 words or
+ larger) the caller puts a pointer to an on-stack return struct
+ [allocated in the caller's stack frame] into the first argument - i.e.
+ into rdi. All other arguments shift up by one in this case.
+ Fortunately this case is rare in the kernel.
+
+For 32-bit we have the following conventions - kernel is built with
+-mregparm=3 and -freg-struct-return:
+
+ x86 function calling convention, 32-bit:
+ ----------------------------------------
+ arguments | callee-saved | extra caller-saved | return
+ [callee-clobbered] | | [callee-clobbered] |
+ -------------------------------------------------------------------------
+ eax edx ecx | ebx edi esi ebp [*] | <none> | eax, edx [**]
+
+ ( here too esp is obviously invariant across normal function calls. eflags
+ is clobbered. Leftover arguments are passed over the stack frame. )
+
+ [*] In the frame-pointers case ebp is fixed to the stack frame.
+
+ [**] We build with -freg-struct-return, which on 32-bit means similar
+ semantics as on 64-bit: edx can be used for a second return value
+ (i.e. covering integer and structure sizes up to 64 bits) - after that
+ it gets more complex and more expensive: 3-word or larger struct returns
+ get done in the caller's frame and the pointer to the return struct goes
+ into regparm0, i.e. eax - the other arguments shift up and the
+ function's register parameters degenerate to regparm=2 in essence.
+
+*/
+
+#ifdef CONFIG_X86_64
+
+/*
+ * 64-bit system call stack frame layout defines and helpers,
+ * for assembly code:
+ */
+
+.macro PUSH_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 unwind_hint=1
+ .if \save_ret
+ pushq %rsi /* pt_regs->si */
+ movq 8(%rsp), %rsi /* temporarily store the return address in %rsi */
+ movq %rdi, 8(%rsp) /* pt_regs->di (overwriting original return address) */
+ /* We just clobbered the return address - use the IRET frame for unwinding: */
+ UNWIND_HINT_IRET_REGS offset=3*8
+ .else
+ pushq %rdi /* pt_regs->di */
+ pushq %rsi /* pt_regs->si */
+ .endif
+ pushq \rdx /* pt_regs->dx */
+ pushq \rcx /* pt_regs->cx */
+ pushq \rax /* pt_regs->ax */
+ pushq %r8 /* pt_regs->r8 */
+ pushq %r9 /* pt_regs->r9 */
+ pushq %r10 /* pt_regs->r10 */
+ pushq %r11 /* pt_regs->r11 */
+ pushq %rbx /* pt_regs->rbx */
+ pushq %rbp /* pt_regs->rbp */
+ pushq %r12 /* pt_regs->r12 */
+ pushq %r13 /* pt_regs->r13 */
+ pushq %r14 /* pt_regs->r14 */
+ pushq %r15 /* pt_regs->r15 */
+
+ .if \unwind_hint
+ UNWIND_HINT_REGS
+ .endif
+
+ .if \save_ret
+ pushq %rsi /* return address on top of stack */
+ .endif
+.endm
+
+.macro CLEAR_REGS clear_callee=1
+ /*
+ * Sanitize registers of values that a speculation attack might
+ * otherwise want to exploit. The lower registers are likely clobbered
+ * well before they could be put to use in a speculative execution
+ * gadget.
+ */
+ xorl %esi, %esi /* nospec si */
+ xorl %edx, %edx /* nospec dx */
+ xorl %ecx, %ecx /* nospec cx */
+ xorl %r8d, %r8d /* nospec r8 */
+ xorl %r9d, %r9d /* nospec r9 */
+ xorl %r10d, %r10d /* nospec r10 */
+ xorl %r11d, %r11d /* nospec r11 */
+ .if \clear_callee
+ xorl %ebx, %ebx /* nospec rbx */
+ xorl %ebp, %ebp /* nospec rbp */
+ xorl %r12d, %r12d /* nospec r12 */
+ xorl %r13d, %r13d /* nospec r13 */
+ xorl %r14d, %r14d /* nospec r14 */
+ xorl %r15d, %r15d /* nospec r15 */
+ .endif
+.endm
+
+.macro PUSH_AND_CLEAR_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 clear_callee=1 unwind_hint=1
+ PUSH_REGS rdx=\rdx, rcx=\rcx, rax=\rax, save_ret=\save_ret unwind_hint=\unwind_hint
+ CLEAR_REGS clear_callee=\clear_callee
+.endm
+
+.macro POP_REGS pop_rdi=1
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbp
+ popq %rbx
+ popq %r11
+ popq %r10
+ popq %r9
+ popq %r8
+ popq %rax
+ popq %rcx
+ popq %rdx
+ popq %rsi
+ .if \pop_rdi
+ popq %rdi
+ .endif
+.endm
+
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
+
+/*
+ * MITIGATION_PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two
+ * halves:
+ */
+#define PTI_USER_PGTABLE_BIT PAGE_SHIFT
+#define PTI_USER_PGTABLE_MASK (1 << PTI_USER_PGTABLE_BIT)
+#define PTI_USER_PCID_BIT X86_CR3_PTI_PCID_USER_BIT
+#define PTI_USER_PCID_MASK (1 << PTI_USER_PCID_BIT)
+#define PTI_USER_PGTABLE_AND_PCID_MASK (PTI_USER_PCID_MASK | PTI_USER_PGTABLE_MASK)
+
+.macro SET_NOFLUSH_BIT reg:req
+ bts $X86_CR3_PCID_NOFLUSH_BIT, \reg
+.endm
+
+.macro ADJUST_KERNEL_CR3 reg:req
+ ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID
+ /* Clear PCID and "MITIGATION_PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
+ andq $(~PTI_USER_PGTABLE_AND_PCID_MASK), \reg
+.endm
+
+.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+ mov %cr3, \scratch_reg
+ ADJUST_KERNEL_CR3 \scratch_reg
+ mov \scratch_reg, %cr3
+.Lend_\@:
+.endm
+
+#define THIS_CPU_user_pcid_flush_mask \
+ PER_CPU_VAR(cpu_tlbstate + TLB_STATE_user_pcid_flush_mask)
+
+.macro SWITCH_TO_USER_CR3 scratch_reg:req scratch_reg2:req
+ mov %cr3, \scratch_reg
+
+ ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
+
+ /*
+ * Test if the ASID needs a flush.
+ */
+ movq \scratch_reg, \scratch_reg2
+ andq $(0x7FF), \scratch_reg /* mask ASID */
+ bt \scratch_reg, THIS_CPU_user_pcid_flush_mask
+ jnc .Lnoflush_\@
+
+ /* Flush needed, clear the bit */
+ btr \scratch_reg, THIS_CPU_user_pcid_flush_mask
+ movq \scratch_reg2, \scratch_reg
+ jmp .Lwrcr3_pcid_\@
+
+.Lnoflush_\@:
+ movq \scratch_reg2, \scratch_reg
+ SET_NOFLUSH_BIT \scratch_reg
+
+.Lwrcr3_pcid_\@:
+ /* Flip the ASID to the user version */
+ orq $(PTI_USER_PCID_MASK), \scratch_reg
+
+.Lwrcr3_\@:
+ /* Flip the PGD to the user version */
+ orq $(PTI_USER_PGTABLE_MASK), \scratch_reg
+ mov \scratch_reg, %cr3
+.endm
+
+.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+ SWITCH_TO_USER_CR3 \scratch_reg \scratch_reg2
+.Lend_\@:
+.endm
+
+.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+ pushq %rax
+ SWITCH_TO_USER_CR3 scratch_reg=\scratch_reg scratch_reg2=%rax
+ popq %rax
+.Lend_\@:
+.endm
+
+.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
+ ALTERNATIVE "jmp .Ldone_\@", "", X86_FEATURE_PTI
+ movq %cr3, \scratch_reg
+ movq \scratch_reg, \save_reg
+ /*
+ * Test the user pagetable bit. If set, then the user page tables
+	 * are active. If clear, CR3 already has the kernel page table
+ * active.
+ */
+ bt $PTI_USER_PGTABLE_BIT, \scratch_reg
+ jnc .Ldone_\@
+
+ ADJUST_KERNEL_CR3 \scratch_reg
+ movq \scratch_reg, %cr3
+
+.Ldone_\@:
+.endm
+
+/* Restore CR3 from a kernel context. May restore a user CR3 value. */
+.macro PARANOID_RESTORE_CR3 scratch_reg:req save_reg:req
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+
+ /*
+ * If CR3 contained the kernel page tables at the paranoid exception
+ * entry, then there is nothing to restore as CR3 is not modified while
+ * handling the exception.
+ */
+ bt $PTI_USER_PGTABLE_BIT, \save_reg
+ jnc .Lend_\@
+
+ ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
+
+ /*
+ * Check if there's a pending flush for the user ASID we're
+ * about to set.
+ */
+ movq \save_reg, \scratch_reg
+ andq $(0x7FF), \scratch_reg
+ btr \scratch_reg, THIS_CPU_user_pcid_flush_mask
+ jc .Lwrcr3_\@
+
+ SET_NOFLUSH_BIT \save_reg
+
+.Lwrcr3_\@:
+ movq \save_reg, %cr3
+.Lend_\@:
+.endm
+
+#else /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION=n: */
+
+.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
+.endm
+.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
+.endm
+.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
+.endm
+.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
+.endm
+.macro PARANOID_RESTORE_CR3 scratch_reg:req save_reg:req
+.endm
+
+#endif
+
+/*
+ * IBRS kernel mitigation for Spectre_v2.
+ *
+ * Assumes full context is established (PUSH_REGS, CR3 and GS) and it clobbers
+ * the regs it uses (AX, CX, DX). Must be called before the first RET
+ * instruction (NOTE! UNTRAIN_RET includes a RET instruction)
+ *
+ * The optional argument is used to save/restore the current value,
+ * which is used on the paranoid paths.
+ *
+ * Assumes x86_spec_ctrl_{base,current} to have SPEC_CTRL_IBRS set.
+ */
+.macro IBRS_ENTER save_reg
+#ifdef CONFIG_MITIGATION_IBRS_ENTRY
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KERNEL_IBRS
+ movl $MSR_IA32_SPEC_CTRL, %ecx
+
+.ifnb \save_reg
+ rdmsr
+ shl $32, %rdx
+ or %rdx, %rax
+ mov %rax, \save_reg
+ test $SPEC_CTRL_IBRS, %eax
+ jz .Ldo_wrmsr_\@
+ lfence
+ jmp .Lend_\@
+.Ldo_wrmsr_\@:
+.endif
+
+ movq PER_CPU_VAR(x86_spec_ctrl_current), %rdx
+ movl %edx, %eax
+ shr $32, %rdx
+ wrmsr
+.Lend_\@:
+#endif
+.endm
+
+/*
+ * Similar to IBRS_ENTER, requires KERNEL GS,CR3 and clobbers (AX, CX, DX)
+ * regs. Must be called after the last RET.
+ */
+.macro IBRS_EXIT save_reg
+#ifdef CONFIG_MITIGATION_IBRS_ENTRY
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KERNEL_IBRS
+ movl $MSR_IA32_SPEC_CTRL, %ecx
+
+.ifnb \save_reg
+ mov \save_reg, %rdx
+.else
+ movq PER_CPU_VAR(x86_spec_ctrl_current), %rdx
+ andl $(~SPEC_CTRL_IBRS), %edx
+.endif
+
+ movl %edx, %eax
+ shr $32, %rdx
+ wrmsr
+.Lend_\@:
+#endif
+.endm
+
+/*
+ * Mitigate Spectre v1 for conditional swapgs code paths.
+ *
+ * FENCE_SWAPGS_USER_ENTRY is used in the user entry swapgs code path, to
+ * prevent a speculative swapgs when coming from kernel space.
+ *
+ * FENCE_SWAPGS_KERNEL_ENTRY is used in the kernel entry non-swapgs code path,
+ * to prevent the swapgs from getting speculatively skipped when coming from
+ * user space.
+ */
+.macro FENCE_SWAPGS_USER_ENTRY
+ ALTERNATIVE "", "lfence", X86_FEATURE_FENCE_SWAPGS_USER
+.endm
+.macro FENCE_SWAPGS_KERNEL_ENTRY
+ ALTERNATIVE "", "lfence", X86_FEATURE_FENCE_SWAPGS_KERNEL
+.endm
+
+.macro STACKLEAK_ERASE_NOCLOBBER
+#ifdef CONFIG_KSTACK_ERASE
+ PUSH_AND_CLEAR_REGS
+ call stackleak_erase
+ POP_REGS
+#endif
+.endm
+
+.macro SAVE_AND_SET_GSBASE scratch_reg:req save_reg:req
+ rdgsbase \save_reg
+ GET_PERCPU_BASE \scratch_reg
+ wrgsbase \scratch_reg
+.endm
+
+#else /* CONFIG_X86_64 */
+# undef UNWIND_HINT_IRET_REGS
+# define UNWIND_HINT_IRET_REGS
+#endif /* !CONFIG_X86_64 */
+
+.macro STACKLEAK_ERASE
+#ifdef CONFIG_KSTACK_ERASE
+ call stackleak_erase
+#endif
+.endm
+
+#ifdef CONFIG_SMP
+
+/*
+ * CPU/node NR is loaded from the limit (size) field of a special segment
+ * descriptor entry in GDT.
+ */
+.macro LOAD_CPU_AND_NODE_SEG_LIMIT reg:req
+ movq $__CPUNODE_SEG, \reg
+ lsl \reg, \reg
+.endm
+
+/*
+ * Fetch the per-CPU GSBASE value for this processor and put it in @reg.
+ * We normally use %gs for accessing per-CPU data, but we are setting up
+ * %gs here and obviously can not use %gs itself to access per-CPU data.
+ *
+ * Do not use RDPID, because KVM loads guest's TSC_AUX on vm-entry and
+ * may not restore the host's value until the CPU returns to userspace.
+ * Thus the kernel would consume a guest's TSC_AUX if an NMI arrives
+ * while running KVM's run loop.
+ */
+.macro GET_PERCPU_BASE reg:req
+ LOAD_CPU_AND_NODE_SEG_LIMIT \reg
+ andq $VDSO_CPUNODE_MASK, \reg
+ movq __per_cpu_offset(, \reg, 8), \reg
+.endm
+
+#else
+
+.macro GET_PERCPU_BASE reg:req
+ movq pcpu_unit_offsets(%rip), \reg
+.endm
+
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_X86_64
+
+/* rdi: arg1 ... normal C conventions. rax is saved/restored. */
+.macro THUNK name, func
+SYM_FUNC_START(\name)
+ ANNOTATE_NOENDBR
+ pushq %rbp
+ movq %rsp, %rbp
+
+ pushq %rdi
+ pushq %rsi
+ pushq %rdx
+ pushq %rcx
+ pushq %rax
+ pushq %r8
+ pushq %r9
+ pushq %r10
+ pushq %r11
+
+ call \func
+
+ popq %r11
+ popq %r10
+ popq %r9
+ popq %r8
+ popq %rax
+ popq %rcx
+ popq %rdx
+ popq %rsi
+ popq %rdi
+ popq %rbp
+ RET
+SYM_FUNC_END(\name)
+ _ASM_NOKPROBE(\name)
+.endm
+
+#else /* CONFIG_X86_32 */
+
+/* put return address in eax (arg1) */
+.macro THUNK name, func, put_ret_addr_in_eax=0
+SYM_CODE_START_NOALIGN(\name)
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+
+ .if \put_ret_addr_in_eax
+ /* Place EIP in the arg1 */
+ movl 3*4(%esp), %eax
+ .endif
+
+ call \func
+ popl %edx
+ popl %ecx
+ popl %eax
+ RET
+ _ASM_NOKPROBE(\name)
+SYM_CODE_END(\name)
+ .endm
+
+#endif
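The PTI macros above only ever touch two ASID-related bits and the PGD-half selector in the CR3 value: bit PAGE_SHIFT (12) picks the user half of the 8k PGD, X86_CR3_PTI_PCID_USER_BIT selects the user ASID, and the NOFLUSH bit suppresses the implicit TLB flush on PCID-capable CPUs. A plain-C restatement of what ADJUST_KERNEL_CR3 and SWITCH_TO_USER_CR3 compute; these helper functions are illustrative only and do not exist in the kernel, which does the work in the assembler macros above:

static inline unsigned long pti_kernel_cr3(unsigned long cr3)
{
	/* ADJUST_KERNEL_CR3: clear the user PGD-half and user PCID bits. */
	return cr3 & ~(unsigned long)PTI_USER_PGTABLE_AND_PCID_MASK;
}

static inline unsigned long pti_user_cr3(unsigned long cr3, bool noflush)
{
	/* SWITCH_TO_USER_CR3: point at the user PGD half and user ASID. */
	cr3 |= PTI_USER_PGTABLE_MASK | PTI_USER_PCID_MASK;
	if (noflush)	/* SET_NOFLUSH_BIT: skip the implicit TLB flush */
		cr3 |= 1UL << X86_CR3_PCID_NOFLUSH_BIT;
	return cr3;
}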
diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S
new file mode 100644
index 000000000000..6ba2b3adcef0
--- /dev/null
+++ b/arch/x86/entry/entry.S
@@ -0,0 +1,77 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Common place for both 32- and 64-bit entry routines.
+ */
+
+#include <linux/export.h>
+#include <linux/kvm_types.h>
+#include <linux/linkage.h>
+#include <linux/objtool.h>
+#include <asm/msr-index.h>
+#include <asm/unwind_hints.h>
+#include <asm/segment.h>
+#include <asm/cache.h>
+#include <asm/cpufeatures.h>
+#include <asm/nospec-branch.h>
+
+#include "calling.h"
+
+.pushsection .noinstr.text, "ax"
+
+/* Clobbers AX, CX, DX */
+SYM_FUNC_START(write_ibpb)
+ ANNOTATE_NOENDBR
+ movl $MSR_IA32_PRED_CMD, %ecx
+ movl _ASM_RIP(x86_pred_cmd), %eax
+ xorl %edx, %edx
+ wrmsr
+
+	/* Make sure IBPB clears return stack predictions too. */
+ FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_BUG_IBPB_NO_RET
+ RET
+SYM_FUNC_END(write_ibpb)
+EXPORT_SYMBOL_FOR_KVM(write_ibpb);
+
+SYM_FUNC_START(__WARN_trap)
+ ANNOTATE_NOENDBR
+ ANNOTATE_REACHABLE
+ ud1 (%edx), %_ASM_ARG1
+ RET
+SYM_FUNC_END(__WARN_trap)
+EXPORT_SYMBOL(__WARN_trap)
+
+.popsection
+
+/*
+ * Define the VERW operand that is disguised as entry code so that
+ * it can be referenced with KPTI enabled. This ensures VERW can be
+ * used late in exit-to-user path after page tables are switched.
+ */
+.pushsection .entry.text, "ax"
+
+.align L1_CACHE_BYTES, 0xcc
+SYM_CODE_START_NOALIGN(x86_verw_sel)
+ UNWIND_HINT_UNDEFINED
+ ANNOTATE_NOENDBR
+ .word __KERNEL_DS
+.align L1_CACHE_BYTES, 0xcc
+SYM_CODE_END(x86_verw_sel);
+EXPORT_SYMBOL_FOR_KVM(x86_verw_sel);
+
+.popsection
+
+THUNK warn_thunk_thunk, __warn_thunk
+
+/*
+ * Clang's implementation of TLS stack cookies requires the variable in
+ * question to be a TLS variable. If the variable happens to be defined as an
+ * ordinary variable with external linkage in the same compilation unit (which
+ * amounts to the whole of vmlinux with LTO enabled), Clang will drop the
+ * segment register prefix from the references, resulting in broken code. Work
+ * around this by avoiding the symbol used in -mstack-protector-guard-symbol=
+ * entirely in the C code, and use an alias emitted by the linker script
+ * instead.
+ */
+#if defined(CONFIG_STACKPROTECTOR) && defined(CONFIG_SMP)
+EXPORT_SYMBOL(__ref_stack_chk_guard);
+#endif
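write_ibpb() above is the barrier used for the IBPB mitigation: it writes the cached x86_pred_cmd value (PRED_CMD_IBPB when IBPB is in use) to MSR_IA32_PRED_CMD and then refills the RSB for CPUs where IBPB does not clear return-stack predictions. A one-line C sketch of that MSR write, assuming the usual wrmsrl() helper; the RSB refill has no C equivalent and stays in assembly:

static inline void issue_ibpb_sketch(void)
{
	/* Same MSR write as the movl/xorl/wrmsr sequence in write_ibpb(). */
	wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
}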
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
new file mode 100644
index 000000000000..92c0b4a94e0a
--- /dev/null
+++ b/arch/x86/entry/entry_32.S
@@ -0,0 +1,1226 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 1991,1992 Linus Torvalds
+ *
+ * entry_32.S contains the system-call and low-level fault and trap handling routines.
+ *
+ * Stack layout while running C code:
+ * ptrace needs to have all registers on the stack.
+ * If the order here is changed, it needs to be
+ * updated in fork.c:copy_process(), signal.c:do_signal(),
+ * ptrace.c and ptrace.h
+ *
+ * 0(%esp) - %ebx
+ * 4(%esp) - %ecx
+ * 8(%esp) - %edx
+ * C(%esp) - %esi
+ * 10(%esp) - %edi
+ * 14(%esp) - %ebp
+ * 18(%esp) - %eax
+ * 1C(%esp) - %ds
+ * 20(%esp) - %es
+ * 24(%esp) - %fs
+ * 28(%esp) - unused -- was %gs on old stackprotector kernels
+ * 2C(%esp) - orig_eax
+ * 30(%esp) - %eip
+ * 34(%esp) - %cs
+ * 38(%esp) - %eflags
+ * 3C(%esp) - %oldesp
+ * 40(%esp) - %oldss
+ */
+
+#include <linux/linkage.h>
+#include <linux/err.h>
+#include <asm/thread_info.h>
+#include <asm/irqflags.h>
+#include <asm/errno.h>
+#include <asm/segment.h>
+#include <asm/smp.h>
+#include <asm/percpu.h>
+#include <asm/processor-flags.h>
+#include <asm/irq_vectors.h>
+#include <asm/cpufeatures.h>
+#include <asm/alternative.h>
+#include <asm/asm.h>
+#include <asm/smap.h>
+#include <asm/frame.h>
+#include <asm/trapnr.h>
+#include <asm/nospec-branch.h>
+
+#include "calling.h"
+
+ .section .entry.text, "ax"
+
+#define PTI_SWITCH_MASK (1 << PAGE_SHIFT)
+
+/* Unconditionally switch to user cr3 */
+.macro SWITCH_TO_USER_CR3 scratch_reg:req
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+
+ movl %cr3, \scratch_reg
+ orl $PTI_SWITCH_MASK, \scratch_reg
+ movl \scratch_reg, %cr3
+.Lend_\@:
+.endm
+
+.macro BUG_IF_WRONG_CR3 no_user_check=0
+#ifdef CONFIG_DEBUG_ENTRY
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+ .if \no_user_check == 0
+ /* coming from usermode? */
+ testl $USER_SEGMENT_RPL_MASK, PT_CS(%esp)
+ jz .Lend_\@
+ .endif
+ /* On user-cr3? */
+ movl %cr3, %eax
+ testl $PTI_SWITCH_MASK, %eax
+ jnz .Lend_\@
+ /* From userspace with kernel cr3 - BUG */
+ ud2
+.Lend_\@:
+#endif
+.endm
+
+/*
+ * Switch to kernel cr3 if not already loaded and return current cr3 in
+ * \scratch_reg
+ */
+.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+ movl %cr3, \scratch_reg
+ /* Test if we are already on kernel CR3 */
+ testl $PTI_SWITCH_MASK, \scratch_reg
+ jz .Lend_\@
+ andl $(~PTI_SWITCH_MASK), \scratch_reg
+ movl \scratch_reg, %cr3
+ /* Return original CR3 in \scratch_reg */
+ orl $PTI_SWITCH_MASK, \scratch_reg
+.Lend_\@:
+.endm
+
+#define CS_FROM_ENTRY_STACK (1 << 31)
+#define CS_FROM_USER_CR3 (1 << 30)
+#define CS_FROM_KERNEL (1 << 29)
+#define CS_FROM_ESPFIX (1 << 28)
+
+.macro FIXUP_FRAME
+ /*
+ * The high bits of the CS dword (__csh) are used for CS_FROM_*.
+ * Clear them in case hardware didn't do this for us.
+ */
+ andl $0x0000ffff, 4*4(%esp)
+
+#ifdef CONFIG_VM86
+ testl $X86_EFLAGS_VM, 5*4(%esp)
+ jnz .Lfrom_usermode_no_fixup_\@
+#endif
+ testl $USER_SEGMENT_RPL_MASK, 4*4(%esp)
+ jnz .Lfrom_usermode_no_fixup_\@
+
+ orl $CS_FROM_KERNEL, 4*4(%esp)
+
+ /*
+ * When we're here from kernel mode; the (exception) stack looks like:
+ *
+ * 6*4(%esp) - <previous context>
+ * 5*4(%esp) - flags
+ * 4*4(%esp) - cs
+ * 3*4(%esp) - ip
+ * 2*4(%esp) - orig_eax
+ * 1*4(%esp) - gs / function
+ * 0*4(%esp) - fs
+ *
+ * Lets build a 5 entry IRET frame after that, such that struct pt_regs
+ * is complete and in particular regs->sp is correct. This gives us
+ * the original 6 entries as gap:
+ *
+ * 14*4(%esp) - <previous context>
+ * 13*4(%esp) - gap / flags
+ * 12*4(%esp) - gap / cs
+ * 11*4(%esp) - gap / ip
+ * 10*4(%esp) - gap / orig_eax
+ * 9*4(%esp) - gap / gs / function
+ * 8*4(%esp) - gap / fs
+ * 7*4(%esp) - ss
+ * 6*4(%esp) - sp
+ * 5*4(%esp) - flags
+ * 4*4(%esp) - cs
+ * 3*4(%esp) - ip
+ * 2*4(%esp) - orig_eax
+ * 1*4(%esp) - gs / function
+ * 0*4(%esp) - fs
+ */
+
+ pushl %ss # ss
+ pushl %esp # sp (points at ss)
+ addl $7*4, (%esp) # point sp back at the previous context
+ pushl 7*4(%esp) # flags
+ pushl 7*4(%esp) # cs
+ pushl 7*4(%esp) # ip
+ pushl 7*4(%esp) # orig_eax
+ pushl 7*4(%esp) # gs / function
+ pushl 7*4(%esp) # fs
+.Lfrom_usermode_no_fixup_\@:
+.endm
+
+.macro IRET_FRAME
+ /*
+ * We're called with %ds, %es, %fs, and %gs from the interrupted
+ * frame, so we shouldn't use them. Also, we may be in ESPFIX
+ * mode and therefore have a nonzero SS base and an offset ESP,
+ * so any attempt to access the stack needs to use SS. (except for
+ * accesses through %esp, which automatically use SS.)
+ */
+ testl $CS_FROM_KERNEL, 1*4(%esp)
+ jz .Lfinished_frame_\@
+
+ /*
+ * Reconstruct the 3 entry IRET frame right after the (modified)
+ * regs->sp without lowering %esp in between, such that an NMI in the
+ * middle doesn't scribble our stack.
+ */
+ pushl %eax
+ pushl %ecx
+ movl 5*4(%esp), %eax # (modified) regs->sp
+
+ movl 4*4(%esp), %ecx # flags
+ movl %ecx, %ss:-1*4(%eax)
+
+ movl 3*4(%esp), %ecx # cs
+ andl $0x0000ffff, %ecx
+ movl %ecx, %ss:-2*4(%eax)
+
+ movl 2*4(%esp), %ecx # ip
+ movl %ecx, %ss:-3*4(%eax)
+
+ movl 1*4(%esp), %ecx # eax
+ movl %ecx, %ss:-4*4(%eax)
+
+ popl %ecx
+ lea -4*4(%eax), %esp
+ popl %eax
+.Lfinished_frame_\@:
+.endm
+
+.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0 unwind_espfix=0
+ cld
+.if \skip_gs == 0
+ pushl $0
+.endif
+ pushl %fs
+
+ pushl %eax
+ movl $(__KERNEL_PERCPU), %eax
+ movl %eax, %fs
+.if \unwind_espfix > 0
+ UNWIND_ESPFIX_STACK
+.endif
+ popl %eax
+
+ FIXUP_FRAME
+ pushl %es
+ pushl %ds
+ pushl \pt_regs_ax
+ pushl %ebp
+ pushl %edi
+ pushl %esi
+ pushl %edx
+ pushl %ecx
+ pushl %ebx
+ movl $(__USER_DS), %edx
+ movl %edx, %ds
+ movl %edx, %es
+ /* Switch to kernel stack if necessary */
+.if \switch_stacks > 0
+ SWITCH_TO_KERNEL_STACK
+.endif
+.endm
+
+.macro SAVE_ALL_NMI cr3_reg:req unwind_espfix=0
+ SAVE_ALL unwind_espfix=\unwind_espfix
+
+ BUG_IF_WRONG_CR3
+
+ /*
+ * Now switch the CR3 when PTI is enabled.
+ *
+ * We can enter with either user or kernel cr3, the code will
+ * store the old cr3 in \cr3_reg and switches to the kernel cr3
+ * if necessary.
+ */
+ SWITCH_TO_KERNEL_CR3 scratch_reg=\cr3_reg
+
+.Lend_\@:
+.endm
+
+.macro RESTORE_INT_REGS
+ popl %ebx
+ popl %ecx
+ popl %edx
+ popl %esi
+ popl %edi
+ popl %ebp
+ popl %eax
+.endm
+
+.macro RESTORE_REGS pop=0
+ RESTORE_INT_REGS
+1: popl %ds
+2: popl %es
+3: popl %fs
+4: addl $(4 + \pop), %esp /* pop the unused "gs" slot */
+ IRET_FRAME
+
+ /*
+ * There is no _ASM_EXTABLE_TYPE_REG() for ASM, however since this is
+ * ASM the registers are known and we can trivially hard-code them.
+ */
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_POP_ZERO|EX_REG_DS)
+ _ASM_EXTABLE_TYPE(2b, 3b, EX_TYPE_POP_ZERO|EX_REG_ES)
+ _ASM_EXTABLE_TYPE(3b, 4b, EX_TYPE_POP_ZERO|EX_REG_FS)
+.endm
+
+.macro RESTORE_ALL_NMI cr3_reg:req pop=0
+ /*
+ * Now switch the CR3 when PTI is enabled.
+ *
+ * We enter with kernel cr3 and switch the cr3 to the value
+ * stored on \cr3_reg, which is either a user or a kernel cr3.
+ */
+ ALTERNATIVE "jmp .Lswitched_\@", "", X86_FEATURE_PTI
+
+ testl $PTI_SWITCH_MASK, \cr3_reg
+ jz .Lswitched_\@
+
+ /* User cr3 in \cr3_reg - write it to hardware cr3 */
+ movl \cr3_reg, %cr3
+
+.Lswitched_\@:
+
+ BUG_IF_WRONG_CR3
+
+ RESTORE_REGS pop=\pop
+.endm
+
+.macro CHECK_AND_APPLY_ESPFIX
+#ifdef CONFIG_X86_ESPFIX32
+#define GDT_ESPFIX_OFFSET (GDT_ENTRY_ESPFIX_SS * 8)
+#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page + GDT_ESPFIX_OFFSET)
+
+ ALTERNATIVE "jmp .Lend_\@", "", X86_BUG_ESPFIX
+
+ movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
+ /*
+ * Warning: PT_OLDSS(%esp) contains the wrong/random values if we
+ * are returning to the kernel.
+ * See comments in process.c:copy_thread() for details.
+ */
+ movb PT_OLDSS(%esp), %ah
+ movb PT_CS(%esp), %al
+ andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
+ cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
+ jne .Lend_\@ # returning to user-space with LDT SS
+
+ /*
+ * Setup and switch to ESPFIX stack
+ *
+ * We're returning to userspace with a 16 bit stack. The CPU will not
+ * restore the high word of ESP for us on executing iret... This is an
+ * "official" bug of all the x86-compatible CPUs, which we can work
+ * around to make dosemu and wine happy. We do this by preloading the
+ * high word of ESP with the high word of the userspace ESP while
+ * compensating for the offset by changing to the ESPFIX segment with
+ * a base address that matches for the difference.
+ */
+ mov %esp, %edx /* load kernel esp */
+ mov PT_OLDESP(%esp), %eax /* load userspace esp */
+ mov %dx, %ax /* eax: new kernel esp */
+ sub %eax, %edx /* offset (low word is 0) */
+ shr $16, %edx
+ mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
+ mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
+ pushl $__ESPFIX_SS
+ pushl %eax /* new kernel esp */
+ /*
+ * Disable interrupts, but do not irqtrace this section: we
+ * will soon execute iret and the tracer was already set to
+ * the irqstate after the IRET:
+ */
+ cli
+ lss (%esp), %esp /* switch to espfix segment */
+.Lend_\@:
+#endif /* CONFIG_X86_ESPFIX32 */
+.endm
+
+/*
+ * Called with pt_regs fully populated and kernel segments loaded,
+ * so we can access PER_CPU and use the integer registers.
+ *
+ * We need to be very careful here with the %esp switch, because an NMI
+ * can happen everywhere. If the NMI handler finds itself on the
+ * entry-stack, it will overwrite the task-stack and everything we
+ * copied there. So allocate the stack-frame on the task-stack and
+ * switch to it before we do any copying.
+ */
+
+.macro SWITCH_TO_KERNEL_STACK
+
+ BUG_IF_WRONG_CR3
+
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%eax
+
+ /*
+ * %eax now contains the entry cr3 and we carry it forward in
+ * that register for the time this macro runs
+ */
+
+ /* Are we on the entry stack? Bail out if not! */
+ movl PER_CPU_VAR(cpu_entry_area), %ecx
+ addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
+ subl %esp, %ecx /* ecx = (end of entry_stack) - esp */
+ cmpl $SIZEOF_entry_stack, %ecx
+ jae .Lend_\@
+
+ /* Load stack pointer into %esi and %edi */
+ movl %esp, %esi
+ movl %esi, %edi
+
+ /* Move %edi to the top of the entry stack */
+ andl $(MASK_entry_stack), %edi
+ addl $(SIZEOF_entry_stack), %edi
+
+ /* Load top of task-stack into %edi */
+ movl TSS_entry2task_stack(%edi), %edi
+
+ /* Special case - entry from kernel mode via entry stack */
+#ifdef CONFIG_VM86
+ movl PT_EFLAGS(%esp), %ecx # mix EFLAGS and CS
+ movb PT_CS(%esp), %cl
+ andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %ecx
+#else
+ movl PT_CS(%esp), %ecx
+ andl $SEGMENT_RPL_MASK, %ecx
+#endif
+ cmpl $USER_RPL, %ecx
+ jb .Lentry_from_kernel_\@
+
+ /* Bytes to copy */
+ movl $PTREGS_SIZE, %ecx
+
+#ifdef CONFIG_VM86
+ testl $X86_EFLAGS_VM, PT_EFLAGS(%esi)
+ jz .Lcopy_pt_regs_\@
+
+ /*
+ * Stack-frame contains 4 additional segment registers when
+ * coming from VM86 mode
+ */
+ addl $(4 * 4), %ecx
+
+#endif
+.Lcopy_pt_regs_\@:
+
+ /* Allocate frame on task-stack */
+ subl %ecx, %edi
+
+ /* Switch to task-stack */
+ movl %edi, %esp
+
+ /*
+ * We are now on the task-stack and can safely copy over the
+ * stack-frame
+ */
+ shrl $2, %ecx
+ cld
+ rep movsl
+
+ jmp .Lend_\@
+
+.Lentry_from_kernel_\@:
+
+ /*
+ * This handles the case when we enter the kernel from
+ * kernel-mode and %esp points to the entry-stack. When this
+ * happens we need to switch to the task-stack to run C code,
+ * but switch back to the entry-stack again when we approach
+ * iret and return to the interrupted code-path. This usually
+ * happens when we hit an exception while restoring user-space
+ * segment registers on the way back to user-space or when the
+ * sysenter handler runs with eflags.tf set.
+ *
+ * When we switch to the task-stack here, we can't trust the
+ * contents of the entry-stack anymore, as the exception handler
+ * might be scheduled out or moved to another CPU. Therefore we
+ * copy the complete entry-stack to the task-stack and set a
+ * marker in the iret-frame (bit 31 of the CS dword) to detect
+ * what we've done on the iret path.
+ *
+ * On the iret path we copy everything back and switch to the
+ * entry-stack, so that the interrupted kernel code-path
+ * continues on the same stack it was interrupted with.
+ *
+ * Be aware that an NMI can happen anytime in this code.
+ *
+ * %esi: Entry-Stack pointer (same as %esp)
+ * %edi: Top of the task stack
+ * %eax: CR3 on kernel entry
+ */
+
+ /* Calculate number of bytes on the entry stack in %ecx */
+ movl %esi, %ecx
+
+ /* %ecx to the top of entry-stack */
+ andl $(MASK_entry_stack), %ecx
+ addl $(SIZEOF_entry_stack), %ecx
+
+ /* Number of bytes on the entry stack to %ecx */
+ sub %esi, %ecx
+
+ /* Mark stackframe as coming from entry stack */
+ orl $CS_FROM_ENTRY_STACK, PT_CS(%esp)
+
+ /*
+ * Test the cr3 used to enter the kernel and add a marker
+ * so that we can switch back to it before iret.
+ */
+ testl $PTI_SWITCH_MASK, %eax
+ jz .Lcopy_pt_regs_\@
+ orl $CS_FROM_USER_CR3, PT_CS(%esp)
+
+ /*
+ * %esi and %edi are unchanged, %ecx contains the number of
+ * bytes to copy. The code at .Lcopy_pt_regs_\@ will allocate
+ * the stack-frame on task-stack and copy everything over
+ */
+ jmp .Lcopy_pt_regs_\@
+
+.Lend_\@:
+.endm
+
+/*
+ * Switch back from the kernel stack to the entry stack.
+ *
+ * The %esp register must point to pt_regs on the task stack. It will
+ * first calculate the size of the stack-frame to copy, depending on
+ * whether we return to VM86 mode or not. With that it uses 'rep movsl'
+ * to copy the contents of the stack over to the entry stack.
+ *
+ * We must be very careful here, as we can't trust the contents of the
+ * task-stack once we switched to the entry-stack. When an NMI happens
+ * while on the entry-stack, the NMI handler will switch back to the top
+ * of the task stack, overwriting our stack-frame we are about to copy.
+ * Therefore we switch the stack only after everything is copied over.
+ */
+.macro SWITCH_TO_ENTRY_STACK
+
+ /* Bytes to copy */
+ movl $PTREGS_SIZE, %ecx
+
+#ifdef CONFIG_VM86
+ testl $(X86_EFLAGS_VM), PT_EFLAGS(%esp)
+ jz .Lcopy_pt_regs_\@
+
+ /* Additional 4 registers to copy when returning to VM86 mode */
+ addl $(4 * 4), %ecx
+
+.Lcopy_pt_regs_\@:
+#endif
+
+ /* Initialize source and destination for movsl */
+ movl PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %edi
+ subl %ecx, %edi
+ movl %esp, %esi
+
+ /* Save future stack pointer in %ebx */
+ movl %edi, %ebx
+
+ /* Copy over the stack-frame */
+ shrl $2, %ecx
+ cld
+ rep movsl
+
+ /*
+ * Switch to entry-stack - needs to happen after everything is
+ * copied because the NMI handler will overwrite the task-stack
+ * when on entry-stack
+ */
+ movl %ebx, %esp
+
+.Lend_\@:
+.endm
+
+/*
+ * This macro handles the case when we return to kernel-mode on the iret
+ * path and have to switch back to the entry stack and/or user-cr3
+ *
+ * See the comments below the .Lentry_from_kernel_\@ label in the
+ * SWITCH_TO_KERNEL_STACK macro for more details.
+ */
+.macro PARANOID_EXIT_TO_KERNEL_MODE
+
+ /*
+ * Test if we entered the kernel with the entry-stack. Most
+ * likely we did not, because this code only runs on the
+ * return-to-kernel path.
+ */
+ testl $CS_FROM_ENTRY_STACK, PT_CS(%esp)
+ jz .Lend_\@
+
+ /* Unlikely slow-path */
+
+ /* Clear marker from stack-frame */
+ andl $(~CS_FROM_ENTRY_STACK), PT_CS(%esp)
+
+ /* Copy the remaining task-stack contents to entry-stack */
+ movl %esp, %esi
+ movl PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %edi
+
+ /* Bytes on the task-stack to ecx */
+ movl PER_CPU_VAR(cpu_tss_rw + TSS_sp1), %ecx
+ subl %esi, %ecx
+
+ /* Allocate stack-frame on entry-stack */
+ subl %ecx, %edi
+
+ /*
+ * Save future stack-pointer, we must not switch until the
+ * copy is done, otherwise the NMI handler could destroy the
+ * contents of the task-stack we are about to copy.
+ */
+ movl %edi, %ebx
+
+ /* Do the copy */
+ shrl $2, %ecx
+ cld
+ rep movsl
+
+ /* Safe to switch to entry-stack now */
+ movl %ebx, %esp
+
+ /*
+ * We came from entry-stack and need to check if we also need to
+ * switch back to user cr3.
+ */
+ testl $CS_FROM_USER_CR3, PT_CS(%esp)
+ jz .Lend_\@
+
+ /* Clear marker from stack-frame */
+ andl $(~CS_FROM_USER_CR3), PT_CS(%esp)
+
+ SWITCH_TO_USER_CR3 scratch_reg=%eax
+
+.Lend_\@:
+.endm
+
+/**
+ * idtentry - Macro to generate entry stubs for simple IDT entries
+ * @vector: Vector number
+ * @asmsym: ASM symbol for the entry point
+ * @cfunc: C function to be called
+ * @has_error_code: Hardware pushed error code on stack
+ */
+.macro idtentry vector asmsym cfunc has_error_code:req
+SYM_CODE_START(\asmsym)
+ ASM_CLAC
+ cld
+
+ .if \has_error_code == 0
+ pushl $0 /* Clear the error code */
+ .endif
+
+ /* Push the C-function address into the GS slot */
+ pushl $\cfunc
+ /* Invoke the common exception entry */
+ jmp handle_exception
+SYM_CODE_END(\asmsym)
+.endm
+
+.macro idtentry_irq vector cfunc
+ .p2align CONFIG_X86_L1_CACHE_SHIFT
+SYM_CODE_START_LOCAL(asm_\cfunc)
+ ASM_CLAC
+ SAVE_ALL switch_stacks=1
+ ENCODE_FRAME_POINTER
+ movl %esp, %eax
+ movl PT_ORIG_EAX(%esp), %edx /* get the vector from stack */
+ movl $-1, PT_ORIG_EAX(%esp) /* no syscall to restart */
+ call \cfunc
+ jmp handle_exception_return
+SYM_CODE_END(asm_\cfunc)
+.endm
+
+/*
+ * Include the defines which emit the idt entries which are shared
+ * between 32 and 64 bit and emit the __irqentry_text_* markers
+ * so the stacktrace boundary checks work.
+ */
+ .align 16
+ .globl __irqentry_text_start
+__irqentry_text_start:
+
+#include <asm/idtentry.h>
+
+ .align 16
+ .globl __irqentry_text_end
+__irqentry_text_end:
+
+/*
+ * %eax: prev task
+ * %edx: next task
+ */
+.pushsection .text, "ax"
+SYM_CODE_START(__switch_to_asm)
+ /*
+ * Save callee-saved registers
+ * This must match the order in struct inactive_task_frame
+ */
+ pushl %ebp
+ pushl %ebx
+ pushl %edi
+ pushl %esi
+ /*
+ * Flags are saved to prevent AC leakage. This could go
+ * away if objtool would have 32bit support to verify
+ * the STAC/CLAC correctness.
+ */
+ pushfl
+
+ /* switch stack */
+ movl %esp, TASK_threadsp(%eax)
+ movl TASK_threadsp(%edx), %esp
+
+#ifdef CONFIG_STACKPROTECTOR
+ movl TASK_stack_canary(%edx), %ebx
+ movl %ebx, PER_CPU_VAR(__stack_chk_guard)
+#endif
+
+ /*
+ * When switching from a shallower to a deeper call stack
+ * the RSB may either underflow or use entries populated
+ * with userspace addresses. On CPUs where those concerns
+ * exist, overwrite the RSB with entries which capture
+ * speculative execution to prevent attack.
+ */
+ FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
+
+	/* Restore flags of the incoming task to restore AC state. */
+ popfl
+ /* restore callee-saved registers */
+ popl %esi
+ popl %edi
+ popl %ebx
+ popl %ebp
+
+ jmp __switch_to
+SYM_CODE_END(__switch_to_asm)
+.popsection
+
+/*
+ * A newly forked process directly context switches into this address.
+ *
+ * eax: prev task we switched from
+ * ebx: kernel thread func (NULL for user thread)
+ * edi: kernel thread arg
+ */
+.pushsection .text, "ax"
+SYM_CODE_START(ret_from_fork_asm)
+ movl %esp, %edx /* regs */
+
+ /* return address for the stack unwinder */
+ pushl $.Lsyscall_32_done
+
+ FRAME_BEGIN
+ /* prev already in EAX */
+ movl %ebx, %ecx /* fn */
+ pushl %edi /* fn_arg */
+ call ret_from_fork
+ addl $4, %esp
+ FRAME_END
+
+ RET
+SYM_CODE_END(ret_from_fork_asm)
+.popsection
+
+SYM_ENTRY(__begin_SYSENTER_singlestep_region, SYM_L_GLOBAL, SYM_A_NONE)
+/*
+ * All code from here through __end_SYSENTER_singlestep_region is subject
+ * to being single-stepped if a user program sets TF and executes SYSENTER.
+ * There is absolutely nothing that we can do to prevent this from happening
+ * (thanks Intel!). To keep our handling of this situation as simple as
+ * possible, we handle TF just like AC and NT, except that our #DB handler
+ * will ignore all of the single-step traps generated in this range.
+ */
+
+/*
+ * 32-bit SYSENTER entry.
+ *
+ * 32-bit system calls through the vDSO's __kernel_vsyscall enter here
+ * if X86_FEATURE_SEP is available. This is the preferred system call
+ * entry on 32-bit systems.
+ *
+ * The SYSENTER instruction, in principle, should *only* occur in the
+ * vDSO. In practice, a small number of Android devices were shipped
+ * with a copy of Bionic that inlined a SYSENTER instruction. This
+ * never happened in any of Google's Bionic versions -- it only happened
+ * in a narrow range of Intel-provided versions.
+ *
+ * SYSENTER loads SS, ESP, CS, and EIP from previously programmed MSRs.
+ * IF and VM in RFLAGS are cleared (IOW: interrupts are off).
+ * SYSENTER does not save anything on the stack,
+ * and does not save old EIP (!!!), ESP, or EFLAGS.
+ *
+ * To avoid losing track of EFLAGS.VM (and thus potentially corrupting
+ * user and/or vm86 state), we explicitly disable the SYSENTER
+ * instruction in vm86 mode by reprogramming the MSRs.
+ *
+ * Arguments:
+ * eax system call number
+ * ebx arg1
+ * ecx arg2
+ * edx arg3
+ * esi arg4
+ * edi arg5
+ * ebp user stack
+ * 0(%ebp) arg6
+ */
+SYM_FUNC_START(entry_SYSENTER_32)
+ /*
+ * On entry-stack with all userspace-regs live - save and
+ * restore eflags and %eax to use it as scratch-reg for the cr3
+ * switch.
+ */
+ pushfl
+ pushl %eax
+ BUG_IF_WRONG_CR3 no_user_check=1
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%eax
+ popl %eax
+ popfl
+
+ /* Stack empty again, switch to task stack */
+ movl TSS_entry2task_stack(%esp), %esp
+
+.Lsysenter_past_esp:
+ pushl $__USER_DS /* pt_regs->ss */
+ pushl $0 /* pt_regs->sp (placeholder) */
+ pushfl /* pt_regs->flags (except IF = 0) */
+ pushl $__USER_CS /* pt_regs->cs */
+ pushl $0 /* pt_regs->ip = 0 (placeholder) */
+ pushl %eax /* pt_regs->orig_ax */
+ SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest, stack already switched */
+
+ /*
+ * SYSENTER doesn't filter flags, so we need to clear NT, AC
+ * and TF ourselves. To save a few cycles, we can check whether
+ * either was set instead of doing an unconditional popfq.
+ * This needs to happen before enabling interrupts so that
+ * we don't get preempted with NT set.
+ *
+ * If TF is set, we will single-step all the way to here -- do_debug
+ * will ignore all the traps. (Yes, this is slow, but so is
+	 * single-stepping in general. This allows us to avoid having
+	 * more complicated code to handle the case where a user program
+ * forces us to single-step through the SYSENTER entry code.)
+ *
+ * NB.: .Lsysenter_fix_flags is a label with the code under it moved
+ * out-of-line as an optimization: NT is unlikely to be set in the
+ * majority of the cases and instead of polluting the I$ unnecessarily,
+ * we're keeping that code behind a branch which will predict as
+ * not-taken and therefore its instructions won't be fetched.
+ */
+ testl $X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, PT_EFLAGS(%esp)
+ jnz .Lsysenter_fix_flags
+.Lsysenter_flags_fixed:
+
+ movl %esp, %eax
+ call do_SYSENTER_32
+ testb %al, %al
+ jz .Lsyscall_32_done
+
+ STACKLEAK_ERASE
+
+ /* Opportunistic SYSEXIT */
+
+ /*
+ * Setup entry stack - we keep the pointer in %eax and do the
+ * switch after almost all user-state is restored.
+ */
+
+ /* Load entry stack pointer and allocate frame for eflags/eax */
+ movl PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %eax
+ subl $(2*4), %eax
+
+ /* Copy eflags and eax to entry stack */
+ movl PT_EFLAGS(%esp), %edi
+ movl PT_EAX(%esp), %esi
+ movl %edi, (%eax)
+ movl %esi, 4(%eax)
+
+ /* Restore user registers and segments */
+ movl PT_EIP(%esp), %edx /* pt_regs->ip */
+ movl PT_OLDESP(%esp), %ecx /* pt_regs->sp */
+1: mov PT_FS(%esp), %fs
+
+ popl %ebx /* pt_regs->bx */
+ addl $2*4, %esp /* skip pt_regs->cx and pt_regs->dx */
+ popl %esi /* pt_regs->si */
+ popl %edi /* pt_regs->di */
+ popl %ebp /* pt_regs->bp */
+
+ /* Switch to entry stack */
+ movl %eax, %esp
+
+ /* Now ready to switch the cr3 */
+ SWITCH_TO_USER_CR3 scratch_reg=%eax
+ /* Clobbers ZF */
+ CLEAR_CPU_BUFFERS
+
+ /*
+ * Restore all flags except IF. (We restore IF separately because
+ * STI gives a one-instruction window in which we won't be interrupted,
+ * whereas POPF does not.)
+ */
+ btrl $X86_EFLAGS_IF_BIT, (%esp)
+ BUG_IF_WRONG_CR3 no_user_check=1
+ popfl
+ popl %eax
+
+ /*
+ * Return back to the vDSO, which will pop ecx and edx.
+ * Don't bother with DS and ES (they already contain __USER_DS).
+ */
+ sti
+ sysexit
+
+2: movl $0, PT_FS(%esp)
+ jmp 1b
+ _ASM_EXTABLE(1b, 2b)
+
+.Lsysenter_fix_flags:
+ pushl $X86_EFLAGS_FIXED
+ popfl
+ jmp .Lsysenter_flags_fixed
+SYM_ENTRY(__end_SYSENTER_singlestep_region, SYM_L_GLOBAL, SYM_A_NONE)
+SYM_FUNC_END(entry_SYSENTER_32)
+
+/*
+ * 32-bit legacy system call entry.
+ *
+ * 32-bit x86 Linux system calls traditionally used the INT $0x80
+ * instruction. INT $0x80 lands here.
+ *
+ * This entry point can be used by any 32-bit program to perform system calls.
+ * Instances of INT $0x80 can be found inline in various programs and
+ * libraries. It is also used by the vDSO's __kernel_vsyscall
+ * fallback for hardware that doesn't support a faster entry method.
+ * Restarted 32-bit system calls also fall back to INT $0x80
+ * regardless of what instruction was originally used to do the system
+ * call. (64-bit programs can use INT $0x80 as well, but they can
+ * only run on 64-bit kernels and therefore land in
+ * entry_INT80_compat.)
+ *
+ * This is considered a slow path. It is not used by most libc
+ * implementations on modern hardware except during process startup.
+ *
+ * Arguments:
+ * eax system call number
+ * ebx arg1
+ * ecx arg2
+ * edx arg3
+ * esi arg4
+ * edi arg5
+ * ebp arg6
+ */
+SYM_FUNC_START(entry_INT80_32)
+ ASM_CLAC
+ pushl %eax /* pt_regs->orig_ax */
+
+ SAVE_ALL pt_regs_ax=$-ENOSYS switch_stacks=1 /* save rest */
+
+ movl %esp, %eax
+ call do_int80_syscall_32
+.Lsyscall_32_done:
+ STACKLEAK_ERASE
+
+restore_all_switch_stack:
+ SWITCH_TO_ENTRY_STACK
+ CHECK_AND_APPLY_ESPFIX
+
+ /* Switch back to user CR3 */
+ SWITCH_TO_USER_CR3 scratch_reg=%eax
+
+ BUG_IF_WRONG_CR3
+
+ /* Restore user state */
+ RESTORE_REGS pop=4 # skip orig_eax/error_code
+ CLEAR_CPU_BUFFERS
+.Lirq_return:
+ /*
+	 * ARCH_HAS_MEMBARRIER_SYNC_CORE relies on IRET core serialization
+	 * when returning from an IPI handler and when returning from the
+ * scheduler to user-space.
+ */
+ iret
+
+.Lasm_iret_error:
+ pushl $0 # no error code
+ pushl $iret_error
+
+#ifdef CONFIG_DEBUG_ENTRY
+ /*
+	 * The stack-frame here is the one that iret faulted on, so it's a
+ * return-to-user frame. We are on kernel-cr3 because we come here from
+ * the fixup code. This confuses the CR3 checker, so switch to user-cr3
+ * as the checker expects it.
+ */
+ pushl %eax
+ SWITCH_TO_USER_CR3 scratch_reg=%eax
+ popl %eax
+#endif
+
+ jmp handle_exception
+
+ _ASM_EXTABLE(.Lirq_return, .Lasm_iret_error)
+SYM_FUNC_END(entry_INT80_32)
+
+.macro FIXUP_ESPFIX_STACK
+/*
+ * Switch back from the ESPFIX stack to the normal zero-based stack
+ *
+ * We can't call C functions using the ESPFIX stack. This code reads
+ * the high word of the segment base from the GDT and switches to the
+ * normal stack and adjusts ESP with the matching offset.
+ *
+ * We might be on user CR3 here, so percpu data is not mapped and we can't
+ * access the GDT through the percpu segment. Instead, use SGDT to find
+ * the cpu_entry_area alias of the GDT.
+ */
+#ifdef CONFIG_X86_ESPFIX32
+ /* fixup the stack */
+ pushl %ecx
+ subl $2*4, %esp
+ sgdt (%esp)
+ movl 2(%esp), %ecx /* GDT address */
+ /*
+ * Careful: ECX is a linear pointer, so we need to force base
+ * zero. %cs is the only known-linear segment we have right now.
+ */
+ mov %cs:GDT_ESPFIX_OFFSET + 4(%ecx), %al /* bits 16..23 */
+ mov %cs:GDT_ESPFIX_OFFSET + 7(%ecx), %ah /* bits 24..31 */
+ shl $16, %eax
+ addl $2*4, %esp
+ popl %ecx
+ addl %esp, %eax /* the adjusted stack pointer */
+ pushl $__KERNEL_DS
+ pushl %eax
+ lss (%esp), %esp /* switch to the normal stack segment */
+#endif
+.endm
+
+.macro UNWIND_ESPFIX_STACK
+ /* It's safe to clobber %eax, all other regs need to be preserved */
+#ifdef CONFIG_X86_ESPFIX32
+ movl %ss, %eax
+ /* see if on espfix stack */
+ cmpw $__ESPFIX_SS, %ax
+ jne .Lno_fixup_\@
+ /* switch to normal stack */
+ FIXUP_ESPFIX_STACK
+.Lno_fixup_\@:
+#endif
+.endm
+
+SYM_CODE_START_LOCAL_NOALIGN(handle_exception)
+ /* the function address is in %gs's slot on the stack */
+ SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1
+ ENCODE_FRAME_POINTER
+
+ movl PT_GS(%esp), %edi # get the function address
+
+ /* fixup orig %eax */
+ movl PT_ORIG_EAX(%esp), %edx # get the error code
+ movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
+
+ movl %esp, %eax # pt_regs pointer
+ CALL_NOSPEC edi
+
+handle_exception_return:
+#ifdef CONFIG_VM86
+ movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
+ movb PT_CS(%esp), %al
+ andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
+#else
+ /*
+ * We can be coming here from child spawned by kernel_thread().
+ */
+ movl PT_CS(%esp), %eax
+ andl $SEGMENT_RPL_MASK, %eax
+#endif
+ cmpl $USER_RPL, %eax # returning to v8086 or userspace ?
+ jnb ret_to_user
+
+ PARANOID_EXIT_TO_KERNEL_MODE
+ BUG_IF_WRONG_CR3
+ RESTORE_REGS 4
+ jmp .Lirq_return
+
+ret_to_user:
+ movl %esp, %eax
+ jmp restore_all_switch_stack
+SYM_CODE_END(handle_exception)
+
+SYM_CODE_START(asm_exc_double_fault)
+1:
+ /*
+ * This is a task gate handler, not an interrupt gate handler.
+ * The error code is on the stack, but the stack is otherwise
+ * empty. Interrupts are off. Our state is sane with the following
+ * exceptions:
+ *
+ * - CR0.TS is set. "TS" literally means "task switched".
+ * - EFLAGS.NT is set because we're a "nested task".
+ * - The doublefault TSS has back_link set and has been marked busy.
+ * - TR points to the doublefault TSS and the normal TSS is busy.
+ * - CR3 is the normal kernel PGD. This would be delightful, except
+ * that the CPU didn't bother to save the old CR3 anywhere. This
+ * would make it very awkward to return back to the context we came
+ * from.
+ *
+ * The rest of EFLAGS is sanitized for us, so we don't need to
+ * worry about AC or DF.
+ *
+ * Don't even bother popping the error code. It's always zero,
+ * and ignoring it makes us a bit more robust against buggy
+ * hypervisor task gate implementations.
+ *
+ * We will manually undo the task switch instead of doing a
+ * task-switching IRET.
+ */
+
+ clts /* clear CR0.TS */
+ pushl $X86_EFLAGS_FIXED
+ popfl /* clear EFLAGS.NT */
+
+ call doublefault_shim
+
+ /* We don't support returning, so we have no IRET here. */
+1:
+ hlt
+ jmp 1b
+SYM_CODE_END(asm_exc_double_fault)
+
+/*
+ * NMI is doubly nasty. It can happen on the first instruction of
+ * entry_SYSENTER_32 (just like #DB), but it can also interrupt the beginning
+ * of the #DB handler even if that #DB in turn hit before entry_SYSENTER_32
+ * switched stacks. We handle both conditions by simply checking whether we
+ * interrupted kernel code running on the SYSENTER stack.
+ */
+SYM_CODE_START(asm_exc_nmi)
+ ASM_CLAC
+
+#ifdef CONFIG_X86_ESPFIX32
+ /*
+ * ESPFIX_SS is only ever set on the return to user path
+ * after we've switched to the entry stack.
+ */
+ pushl %eax
+ movl %ss, %eax
+ cmpw $__ESPFIX_SS, %ax
+ popl %eax
+ je .Lnmi_espfix_stack
+#endif
+
+ pushl %eax # pt_regs->orig_ax
+ SAVE_ALL_NMI cr3_reg=%edi
+ ENCODE_FRAME_POINTER
+ xorl %edx, %edx # zero error code
+ movl %esp, %eax # pt_regs pointer
+
+ /* Are we currently on the SYSENTER stack? */
+ movl PER_CPU_VAR(cpu_entry_area), %ecx
+ addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
+ subl %eax, %ecx /* ecx = (end of entry_stack) - esp */
+ cmpl $SIZEOF_entry_stack, %ecx
+ jb .Lnmi_from_sysenter_stack
+
+ /* Not on SYSENTER stack. */
+ call exc_nmi
+ jmp .Lnmi_return
+
+.Lnmi_from_sysenter_stack:
+ /*
+ * We're on the SYSENTER stack. Switch off. No one (not even debug)
+ * is using the thread stack right now, so it's safe for us to use it.
+ */
+ movl %esp, %ebx
+ movl PER_CPU_VAR(cpu_current_top_of_stack), %esp
+ call exc_nmi
+ movl %ebx, %esp
+
+.Lnmi_return:
+#ifdef CONFIG_X86_ESPFIX32
+ testl $CS_FROM_ESPFIX, PT_CS(%esp)
+ jnz .Lnmi_from_espfix
+#endif
+
+ CHECK_AND_APPLY_ESPFIX
+ RESTORE_ALL_NMI cr3_reg=%edi pop=4
+ CLEAR_CPU_BUFFERS
+ jmp .Lirq_return
+
+#ifdef CONFIG_X86_ESPFIX32
+.Lnmi_espfix_stack:
+	/*
+	 * Create the far pointer that LSS uses to switch back to the
+	 * espfix stack
+	 */
+ pushl %ss
+ pushl %esp
+ addl $4, (%esp)
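+	/*
+	 * Reading aid: this leaves a far pointer { offset = %esp as it was
+	 * on entry to this label (the ADDL compensates for the PUSHL %ss),
+	 * selector = %ss } on the stack; the LSS in .Lnmi_from_espfix
+	 * consumes it to point %esp back at the original espfix frame.
+	 */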
+
+ /* Copy the (short) IRET frame */
+ pushl 4*4(%esp) # flags
+ pushl 4*4(%esp) # cs
+ pushl 4*4(%esp) # ip
+
+ pushl %eax # orig_ax
+
+ SAVE_ALL_NMI cr3_reg=%edi unwind_espfix=1
+ ENCODE_FRAME_POINTER
+
+ /* clear CS_FROM_KERNEL, set CS_FROM_ESPFIX */
+ xorl $(CS_FROM_ESPFIX | CS_FROM_KERNEL), PT_CS(%esp)
+
+ xorl %edx, %edx # zero error code
+ movl %esp, %eax # pt_regs pointer
+ jmp .Lnmi_from_sysenter_stack
+
+.Lnmi_from_espfix:
+ RESTORE_ALL_NMI cr3_reg=%edi
+ /*
+ * Because we cleared CS_FROM_KERNEL, IRET_FRAME 'forgot' to
+ * fix up the gap and long frame:
+ *
+ * 3 - original frame (exception)
+ * 2 - ESPFIX block (above)
+ * 6 - gap (FIXUP_FRAME)
+ * 5 - long frame (FIXUP_FRAME)
+ * 1 - orig_ax
+ */
+ lss (1+5+6)*4(%esp), %esp # back to espfix stack
+ CLEAR_CPU_BUFFERS
+ jmp .Lirq_return
+#endif
+SYM_CODE_END(asm_exc_nmi)
+
+.pushsection .text, "ax"
+SYM_CODE_START(rewind_stack_and_make_dead)
+ /* Prevent any naive code from trying to unwind to our caller. */
+ xorl %ebp, %ebp
+
+ movl PER_CPU_VAR(cpu_current_top_of_stack), %esi
+ leal -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%esi), %esp
+
+ call make_task_dead
+1: jmp 1b
+SYM_CODE_END(rewind_stack_and_make_dead)
+.popsection
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
new file mode 100644
index 000000000000..f9983a1907bf
--- /dev/null
+++ b/arch/x86/entry/entry_64.S
@@ -0,0 +1,1571 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * linux/arch/x86_64/entry.S
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
+ * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
+ *
+ * entry.S contains the system-call and fault low-level handling routines.
+ *
+ * Some of this is documented in Documentation/arch/x86/entry_64.rst
+ *
+ * A note on terminology:
+ * - iret frame: Architecture defined interrupt frame from SS to RIP
+ * at the top of the kernel process stack.
+ *
+ * Some macro usage:
+ * - SYM_FUNC_START/END: Define functions in the symbol table.
+ * - idtentry: Define exception entry points.
+ */
+#include <linux/export.h>
+#include <linux/kvm_types.h>
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/cache.h>
+#include <asm/errno.h>
+#include <asm/asm-offsets.h>
+#include <asm/msr.h>
+#include <asm/unistd.h>
+#include <asm/thread_info.h>
+#include <asm/hw_irq.h>
+#include <asm/page_types.h>
+#include <asm/irqflags.h>
+#include <asm/paravirt.h>
+#include <asm/percpu.h>
+#include <asm/asm.h>
+#include <asm/smap.h>
+#include <asm/pgtable_types.h>
+#include <asm/frame.h>
+#include <asm/trapnr.h>
+#include <asm/nospec-branch.h>
+#include <asm/fsgsbase.h>
+#include <linux/err.h>
+
+#include "calling.h"
+
+.code64
+.section .entry.text, "ax"
+
+/*
+ * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
+ *
+ * This is the only entry point used for 64-bit system calls. The
+ * hardware interface is reasonably well designed and the register to
+ * argument mapping Linux uses fits well with the registers that are
+ * available when SYSCALL is used.
+ *
+ * SYSCALL instructions can be found inlined in libc implementations as
+ * well as some other programs and libraries. There are also a handful
+ * of SYSCALL instructions in the vDSO used, for example, as a
+ * clock_gettimeofday fallback.
+ *
+ * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
+ * then loads new ss, cs, and rip from previously programmed MSRs.
+ * rflags gets masked by a value from another MSR (so CLD and CLAC
+ * are not needed). SYSCALL does not save anything on the stack
+ * and does not change rsp.
+ *
+ * Registers on entry:
+ * rax system call number
+ * rcx return address
+ * r11  saved rflags (note: r11 is a callee-clobbered register in C ABI)
+ * rdi arg0
+ * rsi arg1
+ * rdx arg2
+ * r10 arg3 (needs to be moved to rcx to conform to C ABI)
+ * r8 arg4
+ * r9 arg5
+ * (note: r12-r15, rbp, rbx are callee-preserved in C ABI)
+ *
+ * Only called from user space.
+ *
+ * When userspace can change pt_regs->foo, always force IRET. That is
+ * because IRET deals with non-canonical addresses better. SYSRET has
+ * trouble with them due to bugs in both AMD and Intel CPUs.
+ */
+
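+/*
+ * Illustrative only (not part of this file): a minimal userspace
+ * invocation of write(2) that lands here looks like
+ *
+ *	movl	$1, %eax		# __NR_write
+ *	movl	$1, %edi		# fd  = stdout
+ *	leaq	msg(%rip), %rsi		# buf (hypothetical symbol)
+ *	movl	$14, %edx		# count (example value)
+ *	syscall				# rcx <- rip, r11 <- rflags
+ *
+ * after which the CPU arrives at entry_SYSCALL_64 with the register
+ * state described above.
+ */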
+SYM_CODE_START(entry_SYSCALL_64)
+ UNWIND_HINT_ENTRY
+ ENDBR
+
+ swapgs
+ /* tss.sp2 is scratch space. */
+ movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+
+SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
+
+ /* Construct struct pt_regs on stack */
+ pushq $__USER_DS /* pt_regs->ss */
+ pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) /* pt_regs->sp */
+ pushq %r11 /* pt_regs->flags */
+ pushq $__USER_CS /* pt_regs->cs */
+ pushq %rcx /* pt_regs->ip */
+SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
+ pushq %rax /* pt_regs->orig_ax */
+
+ PUSH_AND_CLEAR_REGS rax=$-ENOSYS
+
+ /* IRQs are off. */
+ movq %rsp, %rdi
+	/* Sign extend the lower 32 bits as syscall numbers are treated as int */
+ movslq %eax, %rsi
+
+ /* clobbers %rax, make sure it is after saving the syscall nr */
+ IBRS_ENTER
+ UNTRAIN_RET
+ CLEAR_BRANCH_HISTORY
+
+ call do_syscall_64 /* returns with IRQs disabled */
+
+ /*
+ * Try to use SYSRET instead of IRET if we're returning to
+ * a completely clean 64-bit userspace context. If we're not,
+ * go to the slow exit path.
+ * In the Xen PV case we must use iret anyway.
+ */
+
+ ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \
+ "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
+
+ /*
+ * We win! This label is here just for ease of understanding
+ * perf profiles. Nothing jumps here.
+ */
+syscall_return_via_sysret:
+ IBRS_EXIT
+ POP_REGS pop_rdi=0
+
+ /*
+ * Now all regs are restored except RSP and RDI.
+ * Save old stack pointer and switch to trampoline stack.
+ */
+ movq %rsp, %rdi
+ movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
+ UNWIND_HINT_END_OF_STACK
+
+ pushq RSP-RDI(%rdi) /* RSP */
+ pushq (%rdi) /* RDI */
+
+ /*
+ * We are on the trampoline stack. All regs except RDI are live.
+ * We can do future final exit work right here.
+ */
+ STACKLEAK_ERASE_NOCLOBBER
+
+ SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
+
+ popq %rdi
+ popq %rsp
+SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
+ swapgs
+ CLEAR_CPU_BUFFERS
+ sysretq
+SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
+ int3
+SYM_CODE_END(entry_SYSCALL_64)
+
+/*
+ * %rdi: prev task
+ * %rsi: next task
+ */
+.pushsection .text, "ax"
+SYM_FUNC_START(__switch_to_asm)
+ ANNOTATE_NOENDBR
+ /*
+ * Save callee-saved registers
+ * This must match the order in inactive_task_frame
+ */
+ pushq %rbp
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ /* switch stack */
+ movq %rsp, TASK_threadsp(%rdi)
+ movq TASK_threadsp(%rsi), %rsp
+
+#ifdef CONFIG_STACKPROTECTOR
+ movq TASK_stack_canary(%rsi), %rbx
+ movq %rbx, PER_CPU_VAR(__stack_chk_guard)
+#endif
+
+ /*
+ * When switching from a shallower to a deeper call stack
+ * the RSB may either underflow or use entries populated
+ * with userspace addresses. On CPUs where those concerns
+ * exist, overwrite the RSB with entries which capture
+ * speculative execution to prevent attack.
+ */
+ FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
+
+ /* restore callee-saved registers */
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ popq %rbp
+
+ jmp __switch_to
+SYM_FUNC_END(__switch_to_asm)
+.popsection
+
+/*
+ * A newly forked process directly context switches into this address.
+ *
+ * rax: prev task we switched from
+ * rbx: kernel thread func (NULL for user thread)
+ * r12: kernel thread arg
+ */
+.pushsection .text, "ax"
+SYM_CODE_START(ret_from_fork_asm)
+ /*
+	 * This is the start of the kernel stack; even though there's a
+ * register set at the top, the regset isn't necessarily coherent
+ * (consider kthreads) and one cannot unwind further.
+ *
+ * This ensures stack unwinds of kernel threads terminate in a known
+ * good state.
+ */
+ UNWIND_HINT_END_OF_STACK
+ ANNOTATE_NOENDBR // copy_thread
+ CALL_DEPTH_ACCOUNT
+
+ movq %rax, %rdi /* prev */
+ movq %rsp, %rsi /* regs */
+ movq %rbx, %rdx /* fn */
+ movq %r12, %rcx /* fn_arg */
+ call ret_from_fork
+
+ /*
+ * Set the stack state to what is expected for the target function
+ * -- at this point the register set should be a valid user set
+ * and unwind should work normally.
+ */
+ UNWIND_HINT_REGS
+
+#ifdef CONFIG_X86_FRED
+ ALTERNATIVE "jmp swapgs_restore_regs_and_return_to_usermode", \
+ "jmp asm_fred_exit_user", X86_FEATURE_FRED
+#else
+ jmp swapgs_restore_regs_and_return_to_usermode
+#endif
+SYM_CODE_END(ret_from_fork_asm)
+.popsection
+
+.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
+#ifdef CONFIG_DEBUG_ENTRY
+ pushq %rax
+ SAVE_FLAGS
+ testl $X86_EFLAGS_IF, %eax
+ jz .Lokay_\@
+ ud2
+.Lokay_\@:
+ popq %rax
+#endif
+.endm
+
+SYM_CODE_START(xen_error_entry)
+ ANNOTATE_NOENDBR
+ UNWIND_HINT_FUNC
+ PUSH_AND_CLEAR_REGS save_ret=1
+ ENCODE_FRAME_POINTER 8
+ UNTRAIN_RET_FROM_CALL
+ RET
+SYM_CODE_END(xen_error_entry)
+
+/**
+ * idtentry_body - Macro to emit code calling the C function
+ * @cfunc: C function to be called
+ * @has_error_code: Hardware pushed error code on stack
+ */
+.macro idtentry_body cfunc has_error_code:req
+
+ /*
+ * Call error_entry() and switch to the task stack if from userspace.
+ *
+ * When in XENPV, it is already in the task stack, and it can't fault
+	 * When in XENPV, it is already on the task stack, and it can't fault
+	 * in native_iret() or native_load_gs_index() since XENPV uses its
+ * switch the CR3. So it can skip invoking error_entry().
+ */
+ ALTERNATIVE "call error_entry; movq %rax, %rsp", \
+ "call xen_error_entry", X86_FEATURE_XENPV
+
+ ENCODE_FRAME_POINTER
+ UNWIND_HINT_REGS
+
+ movq %rsp, %rdi /* pt_regs pointer into 1st argument*/
+
+ .if \has_error_code == 1
+ movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/
+ movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
+ .endif
+
+ /* For some configurations \cfunc ends up being a noreturn. */
+ ANNOTATE_REACHABLE
+ call \cfunc
+
+ jmp error_return
+.endm
+
+/**
+ * idtentry - Macro to generate entry stubs for simple IDT entries
+ * @vector: Vector number
+ * @asmsym: ASM symbol for the entry point
+ * @cfunc: C function to be called
+ * @has_error_code: Hardware pushed error code on stack
+ *
+ * The macro emits code to set up the kernel context for straightforward
+ * and simple IDT entries. No IST stack, no paranoid entry checks.
+ */
+.macro idtentry vector asmsym cfunc has_error_code:req
+SYM_CODE_START(\asmsym)
+
+ .if \vector == X86_TRAP_BP
+ /* #BP advances %rip to the next instruction */
+ UNWIND_HINT_IRET_ENTRY offset=\has_error_code*8 signal=0
+ .else
+ UNWIND_HINT_IRET_ENTRY offset=\has_error_code*8
+ .endif
+
+ ENDBR
+ ASM_CLAC
+ cld
+
+ .if \has_error_code == 0
+ pushq $-1 /* ORIG_RAX: no syscall to restart */
+ .endif
+
+ .if \vector == X86_TRAP_BP
+ /*
+ * If coming from kernel space, create a 6-word gap to allow the
+ * int3 handler to emulate a call instruction.
+ */
+ testb $3, CS-ORIG_RAX(%rsp)
+ jnz .Lfrom_usermode_no_gap_\@
+ .rept 6
+ pushq 5*8(%rsp)
+ .endr
+ UNWIND_HINT_IRET_REGS offset=8
+.Lfrom_usermode_no_gap_\@:
+ .endif
+
+ idtentry_body \cfunc \has_error_code
+
+_ASM_NOKPROBE(\asmsym)
+SYM_CODE_END(\asmsym)
+.endm
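+
+/*
+ * For orientation, asm/idtentry.h instantiates this macro roughly as
+ * follows (illustrative expansions, not emitted here):
+ *
+ *	idtentry X86_TRAP_DE	asm_exc_divide_error	exc_divide_error	has_error_code=0
+ *	idtentry X86_TRAP_GP	asm_exc_general_protection exc_general_protection	has_error_code=1
+ */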
+
+/*
+ * Interrupt entry/exit.
+ *
+ * The interrupt stubs push (vector) onto the stack, which is the error_code
+ * position of idtentry exceptions, and jump to one of the two idtentry points
+ * (common/spurious).
+ *
+ * common_interrupt is a hotpath, align it to a cache line
+ */
+.macro idtentry_irq vector cfunc
+ .p2align CONFIG_X86_L1_CACHE_SHIFT
+ idtentry \vector asm_\cfunc \cfunc has_error_code=1
+.endm
+
+/**
+ * idtentry_mce_db - Macro to generate entry stubs for #MC and #DB
+ * @vector: Vector number
+ * @asmsym: ASM symbol for the entry point
+ * @cfunc: C function to be called
+ *
+ * The macro emits code to set up the kernel context for #MC and #DB
+ *
+ * If the entry comes from user space it uses the normal entry path
+ * including the return to user space work and preemption checks on
+ * exit.
+ *
+ * If it hits in kernel mode then it needs to go through the paranoid
+ * entry as the exception can hit any random state. No preemption
+ * check on exit to keep the paranoid path simple.
+ */
+.macro idtentry_mce_db vector asmsym cfunc
+SYM_CODE_START(\asmsym)
+ UNWIND_HINT_IRET_ENTRY
+ ENDBR
+ ASM_CLAC
+ cld
+
+ pushq $-1 /* ORIG_RAX: no syscall to restart */
+
+ /*
+ * If the entry is from userspace, switch stacks and treat it as
+ * a normal entry.
+ */
+ testb $3, CS-ORIG_RAX(%rsp)
+ jnz .Lfrom_usermode_switch_stack_\@
+
+ /* paranoid_entry returns GS information for paranoid_exit in EBX. */
+ call paranoid_entry
+
+ UNWIND_HINT_REGS
+
+ movq %rsp, %rdi /* pt_regs pointer */
+
+ call \cfunc
+
+ jmp paranoid_exit
+
+ /* Switch to the regular task stack and use the noist entry point */
+.Lfrom_usermode_switch_stack_\@:
+ idtentry_body noist_\cfunc, has_error_code=0
+
+_ASM_NOKPROBE(\asmsym)
+SYM_CODE_END(\asmsym)
+.endm
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+/**
+ * idtentry_vc - Macro to generate entry stub for #VC
+ * @vector: Vector number
+ * @asmsym: ASM symbol for the entry point
+ * @cfunc: C function to be called
+ *
+ * The macro emits code to set up the kernel context for #VC. The #VC handler
+ * runs on an IST stack and needs to be able to cause nested #VC exceptions.
+ *
+ * To make this work the #VC entry code tries its best to pretend it doesn't use
+ * an IST stack by switching to the task stack if coming from user-space (which
+ * includes the early SYSCALL entry path) or back to the stack in the IRET frame if
+ * entered from kernel-mode.
+ *
+ * If entered from kernel-mode the return stack is validated first, and if it is
+ * not safe to use (e.g. because it points to the entry stack) the #VC handler
+ * will switch to a fall-back stack (VC2) and call a special handler function.
+ *
+ * The macro is only used for one vector, but it is planned to be extended in
+ * the future for the #HV exception.
+ */
+.macro idtentry_vc vector asmsym cfunc
+SYM_CODE_START(\asmsym)
+ UNWIND_HINT_IRET_ENTRY
+ ENDBR
+ ASM_CLAC
+ cld
+
+ /*
+ * If the entry is from userspace, switch stacks and treat it as
+ * a normal entry.
+ */
+ testb $3, CS-ORIG_RAX(%rsp)
+ jnz .Lfrom_usermode_switch_stack_\@
+
+ /*
+ * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX.
+ * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS
+ */
+ call paranoid_entry
+
+ UNWIND_HINT_REGS
+
+ /*
+ * Switch off the IST stack to make it free for nested exceptions. The
+ * vc_switch_off_ist() function will switch back to the interrupted
+ * stack if it is safe to do so. If not it switches to the VC fall-back
+ * stack.
+ */
+ movq %rsp, %rdi /* pt_regs pointer */
+ call vc_switch_off_ist
+ movq %rax, %rsp /* Switch to new stack */
+
+ ENCODE_FRAME_POINTER
+ UNWIND_HINT_REGS
+
+ /* Update pt_regs */
+ movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/
+ movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
+
+ movq %rsp, %rdi /* pt_regs pointer */
+
+ call kernel_\cfunc
+
+ /*
+ * No need to switch back to the IST stack. The current stack is either
+ * identical to the stack in the IRET frame or the VC fall-back stack,
+ * so it is definitely mapped even with PTI enabled.
+ */
+ jmp paranoid_exit
+
+ /* Switch to the regular task stack */
+.Lfrom_usermode_switch_stack_\@:
+ idtentry_body user_\cfunc, has_error_code=1
+
+_ASM_NOKPROBE(\asmsym)
+SYM_CODE_END(\asmsym)
+.endm
+#endif
+
+/*
+ * Double fault entry. Straight paranoid. No checks of which context
+ * this comes from, because for the espfix-induced #DF such a check
+ * would do the wrong thing.
+ */
+.macro idtentry_df vector asmsym cfunc
+SYM_CODE_START(\asmsym)
+ UNWIND_HINT_IRET_ENTRY offset=8
+ ENDBR
+ ASM_CLAC
+ cld
+
+ /* paranoid_entry returns GS information for paranoid_exit in EBX. */
+ call paranoid_entry
+ UNWIND_HINT_REGS
+
+ movq %rsp, %rdi /* pt_regs pointer into first argument */
+ movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/
+ movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
+
+ /* For some configurations \cfunc ends up being a noreturn. */
+ ANNOTATE_REACHABLE
+ call \cfunc
+
+ jmp paranoid_exit
+
+_ASM_NOKPROBE(\asmsym)
+SYM_CODE_END(\asmsym)
+.endm
+
+/*
+ * Include the defines which emit the idt entries which are shared
+ * between 32 and 64 bit and emit the __irqentry_text_* markers
+ * so the stacktrace boundary checks work.
+ */
+ __ALIGN
+ .globl __irqentry_text_start
+__irqentry_text_start:
+
+#include <asm/idtentry.h>
+
+ __ALIGN
+ .globl __irqentry_text_end
+__irqentry_text_end:
+ ANNOTATE_NOENDBR
+
+SYM_CODE_START_LOCAL(common_interrupt_return)
+SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
+ IBRS_EXIT
+#ifdef CONFIG_XEN_PV
+ ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
+#endif
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
+ ALTERNATIVE "", "jmp .Lpti_restore_regs_and_return_to_usermode", X86_FEATURE_PTI
+#endif
+
+ STACKLEAK_ERASE
+ POP_REGS
+ add $8, %rsp /* orig_ax */
+ UNWIND_HINT_IRET_REGS
+
+.Lswapgs_and_iret:
+ swapgs
+ CLEAR_CPU_BUFFERS
+ /* Assert that the IRET frame indicates user mode. */
+ testb $3, 8(%rsp)
+ jnz .Lnative_iret
+ ud2
+
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
+.Lpti_restore_regs_and_return_to_usermode:
+ POP_REGS pop_rdi=0
+
+ /*
+ * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS.
+ * Save old stack pointer and switch to trampoline stack.
+ */
+ movq %rsp, %rdi
+ movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
+ UNWIND_HINT_END_OF_STACK
+
+ /* Copy the IRET frame to the trampoline stack. */
+ pushq 6*8(%rdi) /* SS */
+ pushq 5*8(%rdi) /* RSP */
+ pushq 4*8(%rdi) /* EFLAGS */
+ pushq 3*8(%rdi) /* CS */
+ pushq 2*8(%rdi) /* RIP */
+
+ /* Push user RDI on the trampoline stack. */
+ pushq (%rdi)
+
+ /*
+ * We are on the trampoline stack. All regs except RDI are live.
+ * We can do future final exit work right here.
+ */
+ STACKLEAK_ERASE_NOCLOBBER
+
+ push %rax
+ SWITCH_TO_USER_CR3 scratch_reg=%rdi scratch_reg2=%rax
+ pop %rax
+
+ /* Restore RDI. */
+ popq %rdi
+ jmp .Lswapgs_and_iret
+#endif
+
+SYM_INNER_LABEL(restore_regs_and_return_to_kernel, SYM_L_GLOBAL)
+#ifdef CONFIG_DEBUG_ENTRY
+ /* Assert that pt_regs indicates kernel mode. */
+ testb $3, CS(%rsp)
+ jz 1f
+ ud2
+1:
+#endif
+ POP_REGS
+ addq $8, %rsp /* skip regs->orig_ax */
+ /*
+	 * ARCH_HAS_MEMBARRIER_SYNC_CORE relies on IRET core serialization
+	 * when returning from an IPI handler.
+ */
+#ifdef CONFIG_XEN_PV
+SYM_INNER_LABEL(early_xen_iret_patch, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
+ .byte 0xe9
+ .long .Lnative_iret - (. + 4)
+#endif
+
+.Lnative_iret:
+ UNWIND_HINT_IRET_REGS
+ /*
+ * Are we returning to a stack segment from the LDT? Note: in
+ * 64-bit mode SS:RSP on the exception stack is always valid.
+ */
+#ifdef CONFIG_X86_ESPFIX64
+ testb $4, (SS-RIP)(%rsp)
+ jnz native_irq_return_ldt
+#endif
+
+SYM_INNER_LABEL(native_irq_return_iret, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR // exc_double_fault
+ /*
+ * This may fault. Non-paranoid faults on return to userspace are
+ * handled by fixup_bad_iret. These include #SS, #GP, and #NP.
+ * Double-faults due to espfix64 are handled in exc_double_fault.
+ * Other faults here are fatal.
+ */
+ iretq
+
+#ifdef CONFIG_X86_ESPFIX64
+native_irq_return_ldt:
+ /*
+ * We are running with user GSBASE. All GPRs contain their user
+ * values. We have a percpu ESPFIX stack that is eight slots
+ * long (see ESPFIX_STACK_SIZE). espfix_waddr points to the bottom
+ * of the ESPFIX stack.
+ *
+ * We clobber RAX and RDI in this code. We stash RDI on the
+ * normal stack and RAX on the ESPFIX stack.
+ *
+ * The ESPFIX stack layout we set up looks like this:
+ *
+ * --- top of ESPFIX stack ---
+ * SS
+ * RSP
+ * RFLAGS
+ * CS
+ * RIP <-- RSP points here when we're done
+ * RAX <-- espfix_waddr points here
+ * --- bottom of ESPFIX stack ---
+ */
+
+ pushq %rdi /* Stash user RDI */
+ swapgs /* to kernel GS */
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi /* to kernel CR3 */
+
+ movq PER_CPU_VAR(espfix_waddr), %rdi
+ movq %rax, (0*8)(%rdi) /* user RAX */
+ movq (1*8)(%rsp), %rax /* user RIP */
+ movq %rax, (1*8)(%rdi)
+ movq (2*8)(%rsp), %rax /* user CS */
+ movq %rax, (2*8)(%rdi)
+ movq (3*8)(%rsp), %rax /* user RFLAGS */
+ movq %rax, (3*8)(%rdi)
+ movq (5*8)(%rsp), %rax /* user SS */
+ movq %rax, (5*8)(%rdi)
+ movq (4*8)(%rsp), %rax /* user RSP */
+ movq %rax, (4*8)(%rdi)
+ /* Now RAX == RSP. */
+
+ andl $0xffff0000, %eax /* RAX = (RSP & 0xffff0000) */
+
+ /*
+ * espfix_stack[31:16] == 0. The page tables are set up such that
+ * (espfix_stack | (X & 0xffff0000)) points to a read-only alias of
+ * espfix_waddr for any X. That is, there are 65536 RO aliases of
+ * the same page. Set up RSP so that RSP[31:16] contains the
+ * respective 16 bits of the /userspace/ RSP and RSP nonetheless
+ * still points to an RO alias of the ESPFIX stack.
+ */
+ orq PER_CPU_VAR(espfix_stack), %rax
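+	/*
+	 * Worked example, all values invented: for a user RSP of
+	 * 0x12345678 the ANDL above leaves RAX = 0x12340000; with a
+	 * per-CPU espfix_stack of 0xffffff1300002000 the ORQ produces
+	 * 0xffffff1312342000, whose bits 31:16 mirror the user RSP while
+	 * still addressing an RO alias of the same ESPFIX page.
+	 */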
+
+ SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
+ swapgs /* to user GS */
+ popq %rdi /* Restore user RDI */
+
+ movq %rax, %rsp
+ UNWIND_HINT_IRET_REGS offset=8
+
+ /*
+ * At this point, we cannot write to the stack any more, but we can
+ * still read.
+ */
+ popq %rax /* Restore user RAX */
+
+ CLEAR_CPU_BUFFERS
+
+ /*
+ * RSP now points to an ordinary IRET frame, except that the page
+ * is read-only and RSP[31:16] are preloaded with the userspace
+ * values. We can now IRET back to userspace.
+ */
+ jmp native_irq_return_iret
+#endif
+SYM_CODE_END(common_interrupt_return)
+_ASM_NOKPROBE(common_interrupt_return)
+
+/*
+ * Reload gs selector with exception handling
+ * di: new selector
+ *
+ * Is in entry.text as it shouldn't be instrumented.
+ */
+SYM_FUNC_START(asm_load_gs_index)
+ ANNOTATE_NOENDBR
+ FRAME_BEGIN
+ swapgs
+.Lgs_change:
+ ANNOTATE_NOENDBR // error_entry
+ movl %edi, %gs
+2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE
+ swapgs
+ FRAME_END
+ RET
+
+ /* running with kernelgs */
+.Lbad_gs:
+ swapgs /* switch back to user gs */
+.macro ZAP_GS
+ /* This can't be a string because the preprocessor needs to see it. */
+ movl $__USER_DS, %eax
+ movl %eax, %gs
+.endm
+ ALTERNATIVE "", "ZAP_GS", X86_BUG_NULL_SEG
+ xorl %eax, %eax
+ movl %eax, %gs
+ jmp 2b
+
+ _ASM_EXTABLE(.Lgs_change, .Lbad_gs)
+
+SYM_FUNC_END(asm_load_gs_index)
+EXPORT_SYMBOL(asm_load_gs_index)
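+
+/*
+ * Illustrative C-side usage, approximately what native_load_gs_index()
+ * in <asm/special_insns.h> does (shown as a sketch, not a definition):
+ *
+ *	local_irq_save(flags);
+ *	asm_load_gs_index(selector);
+ *	local_irq_restore(flags);
+ */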
+
+#ifdef CONFIG_XEN_PV
+/*
+ * A note on the "critical region" in our callback handler.
+ * We want to avoid stacking callback handlers due to events occurring
+ * during handling of the last event. To do this, we keep events disabled
+ * until we've done all processing. HOWEVER, we must enable events before
+ * popping the stack frame (can't be done atomically) and so it would still
+ * be possible to get enough handler activations to overflow the stack.
+ * Although unlikely, bugs of that kind are hard to track down, so we'd
+ * like to avoid the possibility.
+ * So, on entry to the handler we detect whether we interrupted an
+ * existing activation in its critical region -- if so, we pop the current
+ * activation and restart the handler using the previous one.
+ *
+ * C calling convention: exc_xen_hypervisor_callback(struct pt_regs *)
+ */
+ __FUNC_ALIGN
+SYM_CODE_START_LOCAL_NOALIGN(exc_xen_hypervisor_callback)
+
+/*
+ * Since we don't modify %rdi, evtchn_do_upcall(struct pt_regs *) will
+ * see the correct pointer to the pt_regs
+ */
+ UNWIND_HINT_FUNC
+ movq %rdi, %rsp /* we don't return, adjust the stack frame */
+ UNWIND_HINT_REGS
+
+ call xen_pv_evtchn_do_upcall
+
+ jmp error_return
+SYM_CODE_END(exc_xen_hypervisor_callback)
+
+/*
+ * Hypervisor uses this for application faults while it executes.
+ * We get here for two reasons:
+ * 1. Fault while reloading DS, ES, FS or GS
+ * 2. Fault while executing IRET
+ * Category 1 we do not need to fix up as Xen has already reloaded all segment
+ * registers that could be reloaded and zeroed the others.
+ * Category 2 we fix up by killing the current process. We cannot use the
+ * normal Linux return path in this case because if we use the IRET hypercall
+ * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
+ * We distinguish between categories by comparing each saved segment register
+ * with its current contents: any discrepancy means we are in category 1.
+ */
+ __FUNC_ALIGN
+SYM_CODE_START_NOALIGN(xen_failsafe_callback)
+ UNWIND_HINT_UNDEFINED
+ ENDBR
+ movl %ds, %ecx
+ cmpw %cx, 0x10(%rsp)
+ jne 1f
+ movl %es, %ecx
+ cmpw %cx, 0x18(%rsp)
+ jne 1f
+ movl %fs, %ecx
+ cmpw %cx, 0x20(%rsp)
+ jne 1f
+ movl %gs, %ecx
+ cmpw %cx, 0x28(%rsp)
+ jne 1f
+ /* All segments match their saved values => Category 2 (Bad IRET). */
+ movq (%rsp), %rcx
+ movq 8(%rsp), %r11
+ addq $0x30, %rsp
+ pushq $0 /* RIP */
+ UNWIND_HINT_IRET_REGS offset=8
+ jmp asm_exc_general_protection
+1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
+ movq (%rsp), %rcx
+ movq 8(%rsp), %r11
+ addq $0x30, %rsp
+ UNWIND_HINT_IRET_REGS
+ pushq $-1 /* orig_ax = -1 => not a system call */
+ PUSH_AND_CLEAR_REGS
+ ENCODE_FRAME_POINTER
+ jmp error_return
+SYM_CODE_END(xen_failsafe_callback)
+#endif /* CONFIG_XEN_PV */
+
+/*
+ * Save all registers in pt_regs. Return GSBASE related information
+ * in EBX depending on the availability of the FSGSBASE instructions:
+ *
+ * FSGSBASE R/EBX
+ * N 0 -> SWAPGS on exit
+ * 1 -> no SWAPGS on exit
+ *
+ * Y GSBASE value at entry, must be restored in paranoid_exit
+ *
+ * R14 - old CR3
+ * R15 - old SPEC_CTRL
+ */
+SYM_CODE_START(paranoid_entry)
+ ANNOTATE_NOENDBR
+ UNWIND_HINT_FUNC
+ PUSH_AND_CLEAR_REGS save_ret=1
+ ENCODE_FRAME_POINTER 8
+
+ /*
+ * Always stash CR3 in %r14. This value will be restored,
+ * verbatim, at exit. Needed if paranoid_entry interrupted
+ * another entry that already switched to the user CR3 value
+ * but has not yet returned to userspace.
+ *
+ * This is also why CS (stashed in the "iret frame" by the
+ * hardware at entry) can not be used: this may be a return
+ * to kernel code, but with a user CR3 value.
+ *
+ * Switching CR3 does not depend on kernel GSBASE so it can
+ * be done before switching to the kernel GSBASE. This is
+ * required for FSGSBASE because the kernel GSBASE has to
+ * be retrieved from a kernel internal table.
+ */
+ SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
+
+ /*
+ * Handling GSBASE depends on the availability of FSGSBASE.
+ *
+ * Without FSGSBASE the kernel enforces that negative GSBASE
+ * values indicate kernel GSBASE. With FSGSBASE no assumptions
+ * can be made about the GSBASE value when entering from user
+ * space.
+ */
+ ALTERNATIVE "jmp .Lparanoid_entry_checkgs", "", X86_FEATURE_FSGSBASE
+
+ /*
+ * Read the current GSBASE and store it in %rbx unconditionally,
+	 * retrieve and set the current CPU's kernel GSBASE. The stored value
+ * has to be restored in paranoid_exit unconditionally.
+ *
+ * The unconditional write to GS base below ensures that no subsequent
+ * loads based on a mispredicted GS base can happen, therefore no LFENCE
+ * is needed here.
+ */
+ SAVE_AND_SET_GSBASE scratch_reg=%rax save_reg=%rbx
+ jmp .Lparanoid_gsbase_done
+
+.Lparanoid_entry_checkgs:
+ /* EBX = 1 -> kernel GSBASE active, no restore required */
+ movl $1, %ebx
+
+ /*
+ * The kernel-enforced convention is a negative GSBASE indicates
+ * a kernel value. No SWAPGS needed on entry and exit.
+ */
+ movl $MSR_GS_BASE, %ecx
+ rdmsr
+ testl %edx, %edx
+ js .Lparanoid_kernel_gsbase
+
+ /* EBX = 0 -> SWAPGS required on exit */
+ xorl %ebx, %ebx
+ swapgs
+.Lparanoid_kernel_gsbase:
+ FENCE_SWAPGS_KERNEL_ENTRY
+.Lparanoid_gsbase_done:
+
+ /*
+ * Once we have CR3 and %GS setup save and set SPEC_CTRL. Just like
+ * CR3 above, keep the old value in a callee saved register.
+ */
+ IBRS_ENTER save_reg=%r15
+ UNTRAIN_RET_FROM_CALL
+
+ RET
+SYM_CODE_END(paranoid_entry)
+
+/*
+ * "Paranoid" exit path from exception stack. This is invoked
+ * only on return from non-NMI IST interrupts that came
+ * from kernel space.
+ *
+ * We may be returning to very strange contexts (e.g. very early
+ * in syscall entry), so checking for preemption here would
+ * be complicated. Fortunately, there's no good reason to try
+ * to handle preemption here.
+ *
+ * R/EBX contains the GSBASE related information depending on the
+ * availability of the FSGSBASE instructions:
+ *
+ * FSGSBASE R/EBX
+ * N 0 -> SWAPGS on exit
+ * 1 -> no SWAPGS on exit
+ *
+ * Y User space GSBASE, must be restored unconditionally
+ *
+ * R14 - old CR3
+ * R15 - old SPEC_CTRL
+ */
+SYM_CODE_START_LOCAL(paranoid_exit)
+ UNWIND_HINT_REGS
+
+ /*
+ * Must restore IBRS state before both CR3 and %GS since we need access
+ * to the per-CPU x86_spec_ctrl_shadow variable.
+ */
+ IBRS_EXIT save_reg=%r15
+
+ /*
+ * The order of operations is important. PARANOID_RESTORE_CR3 requires
+ * kernel GSBASE.
+ *
+	 * NB to anyone tempted to optimize this code: this code does
+ * not execute at all for exceptions from user mode. Those
+ * exceptions go through error_return instead.
+ */
+ PARANOID_RESTORE_CR3 scratch_reg=%rax save_reg=%r14
+
+ /* Handle the three GSBASE cases */
+ ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "", X86_FEATURE_FSGSBASE
+
+ /* With FSGSBASE enabled, unconditionally restore GSBASE */
+ wrgsbase %rbx
+ jmp restore_regs_and_return_to_kernel
+
+.Lparanoid_exit_checkgs:
+ /* On non-FSGSBASE systems, conditionally do SWAPGS */
+ testl %ebx, %ebx
+ jnz restore_regs_and_return_to_kernel
+
+ /* We are returning to a context with user GSBASE */
+ swapgs
+ jmp restore_regs_and_return_to_kernel
+SYM_CODE_END(paranoid_exit)
+
+/*
+ * Switch GS and CR3 if needed.
+ */
+SYM_CODE_START(error_entry)
+ ANNOTATE_NOENDBR
+ UNWIND_HINT_FUNC
+
+ PUSH_AND_CLEAR_REGS save_ret=1
+ ENCODE_FRAME_POINTER 8
+
+ testb $3, CS+8(%rsp)
+ jz .Lerror_kernelspace
+
+ /*
+ * We entered from user mode or we're pretending to have entered
+ * from user mode due to an IRET fault.
+ */
+ swapgs
+ FENCE_SWAPGS_USER_ENTRY
+ /* We have user CR3. Change to kernel CR3. */
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
+ IBRS_ENTER
+ UNTRAIN_RET_FROM_CALL
+
+ leaq 8(%rsp), %rdi /* arg0 = pt_regs pointer */
+ /* Put us onto the real thread stack. */
+ jmp sync_regs
+
+ /*
+ * There are two places in the kernel that can potentially fault with
+ * usergs. Handle them here. B stepping K8s sometimes report a
+ * truncated RIP for IRET exceptions returning to compat mode. Check
+ * for these here too.
+ */
+.Lerror_kernelspace:
+ leaq native_irq_return_iret(%rip), %rcx
+ cmpq %rcx, RIP+8(%rsp)
+ je .Lerror_bad_iret
+ movl %ecx, %eax /* zero extend */
+ cmpq %rax, RIP+8(%rsp)
+ je .Lbstep_iret
+ cmpq $.Lgs_change, RIP+8(%rsp)
+ jne .Lerror_entry_done_lfence
+
+ /*
+ * hack: .Lgs_change can fail with user gsbase. If this happens, fix up
+ * gsbase and proceed. We'll fix up the exception and land in
+ * .Lgs_change's error handler with kernel gsbase.
+ */
+ swapgs
+
+ /*
+ * Issue an LFENCE to prevent GS speculation, regardless of whether it is a
+ * kernel or user gsbase.
+ */
+.Lerror_entry_done_lfence:
+ FENCE_SWAPGS_KERNEL_ENTRY
+ CALL_DEPTH_ACCOUNT
+ leaq 8(%rsp), %rax /* return pt_regs pointer */
+ VALIDATE_UNRET_END
+ RET
+
+.Lbstep_iret:
+ /* Fix truncated RIP */
+ movq %rcx, RIP+8(%rsp)
+ /* fall through */
+
+.Lerror_bad_iret:
+ /*
+ * We came from an IRET to user mode, so we have user
+ * gsbase and CR3. Switch to kernel gsbase and CR3:
+ */
+ swapgs
+ FENCE_SWAPGS_USER_ENTRY
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
+ IBRS_ENTER
+ UNTRAIN_RET_FROM_CALL
+
+ /*
+ * Pretend that the exception came from user mode: set up pt_regs
+ * as if we faulted immediately after IRET.
+ */
+ leaq 8(%rsp), %rdi /* arg0 = pt_regs pointer */
+ call fixup_bad_iret
+ mov %rax, %rdi
+ jmp sync_regs
+SYM_CODE_END(error_entry)
+
+SYM_CODE_START_LOCAL(error_return)
+ UNWIND_HINT_REGS
+ DEBUG_ENTRY_ASSERT_IRQS_OFF
+ testb $3, CS(%rsp)
+ jz restore_regs_and_return_to_kernel
+ jmp swapgs_restore_regs_and_return_to_usermode
+SYM_CODE_END(error_return)
+
+/*
+ * Runs on exception stack. Xen PV does not go through this path at all,
+ * so we can use real assembly here.
+ *
+ * Registers:
+ * %r14: Used to save/restore the CR3 of the interrupted context
+ * when MITIGATION_PAGE_TABLE_ISOLATION is in use. Do not clobber.
+ */
+SYM_CODE_START(asm_exc_nmi)
+ UNWIND_HINT_IRET_ENTRY
+ ENDBR
+
+ /*
+ * We allow breakpoints in NMIs. If a breakpoint occurs, then
+ * the iretq it performs will take us out of NMI context.
+ * This means that we can have nested NMIs where the next
+ * NMI is using the top of the stack of the previous NMI. We
+ * can't let it execute because the nested NMI will corrupt the
+ * stack of the previous NMI. NMI handlers are not re-entrant
+ * anyway.
+ *
+ * To handle this case we do the following:
+ * Check a special location on the stack that contains a
+ * variable that is set when NMIs are executing.
+ * The interrupted task's stack is also checked to see if it
+ * is an NMI stack.
+ * If the variable is not set and the stack is not the NMI
+ * stack then:
+ * o Set the special variable on the stack
+ * o Copy the interrupt frame into an "outermost" location on the
+ * stack
+ * o Copy the interrupt frame into an "iret" location on the stack
+ * o Continue processing the NMI
+ * If the variable is set or the previous stack is the NMI stack:
+ * o Modify the "iret" location to jump to the repeat_nmi
+ * o return back to the first NMI
+ *
+ * Now on exit of the first NMI, we first clear the stack variable
+ * The NMI stack will tell any nested NMIs at that point that it is
+ * nested. Then we pop the stack normally with iret, and if there was
+ * a nested NMI that updated the copy interrupt stack frame, a
+ * jump will be made to the repeat_nmi code that will handle the second
+ * NMI.
+ *
+ * However, espfix prevents us from directly returning to userspace
+ * with a single IRET instruction. Similarly, IRET to user mode
+ * can fault. We therefore handle NMIs from user space like
+ * other IST entries.
+ */
+
+ ASM_CLAC
+ cld
+
+ /* Use %rdx as our temp variable throughout */
+ pushq %rdx
+
+ testb $3, CS-RIP+8(%rsp)
+ jz .Lnmi_from_kernel
+
+ /*
+ * NMI from user mode. We need to run on the thread stack, but we
+ * can't go through the normal entry paths: NMIs are masked, and
+ * we don't want to enable interrupts, because then we'll end
+ * up in an awkward situation in which IRQs are on but NMIs
+ * are off.
+ *
+ * We also must not push anything to the stack before switching
+ * stacks lest we corrupt the "NMI executing" variable.
+ */
+
+ swapgs
+ FENCE_SWAPGS_USER_ENTRY
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
+ movq %rsp, %rdx
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+ UNWIND_HINT_IRET_REGS base=%rdx offset=8
+ pushq 5*8(%rdx) /* pt_regs->ss */
+ pushq 4*8(%rdx) /* pt_regs->rsp */
+ pushq 3*8(%rdx) /* pt_regs->flags */
+ pushq 2*8(%rdx) /* pt_regs->cs */
+ pushq 1*8(%rdx) /* pt_regs->rip */
+ UNWIND_HINT_IRET_REGS
+ pushq $-1 /* pt_regs->orig_ax */
+ PUSH_AND_CLEAR_REGS rdx=(%rdx)
+ ENCODE_FRAME_POINTER
+
+ IBRS_ENTER
+ UNTRAIN_RET
+
+ /*
+ * At this point we no longer need to worry about stack damage
+ * due to nesting -- we're on the normal thread stack and we're
+ * done with the NMI stack.
+ */
+
+ movq %rsp, %rdi
+ call exc_nmi
+
+ /*
+ * Return back to user mode. We must *not* do the normal exit
+ * work, because we don't want to enable interrupts.
+ */
+ jmp swapgs_restore_regs_and_return_to_usermode
+
+.Lnmi_from_kernel:
+ /*
+ * Here's what our stack frame will look like:
+ * +---------------------------------------------------------+
+ * | original SS |
+ * | original Return RSP |
+ * | original RFLAGS |
+ * | original CS |
+ * | original RIP |
+ * +---------------------------------------------------------+
+ * | temp storage for rdx |
+ * +---------------------------------------------------------+
+ * | "NMI executing" variable |
+ * +---------------------------------------------------------+
+ * | iret SS } Copied from "outermost" frame |
+ * | iret Return RSP } on each loop iteration; overwritten |
+ * | iret RFLAGS } by a nested NMI to force another |
+ * | iret CS } iteration if needed. |
+ * | iret RIP } |
+ * +---------------------------------------------------------+
+ * | outermost SS } initialized in first_nmi; |
+ * | outermost Return RSP } will not be changed before |
+ * | outermost RFLAGS } NMI processing is done. |
+ * | outermost CS } Copied to "iret" frame on each |
+ * | outermost RIP } iteration. |
+ * +---------------------------------------------------------+
+ * | pt_regs |
+ * +---------------------------------------------------------+
+ *
+ * The "original" frame is used by hardware. Before re-enabling
+ * NMIs, we need to be done with it, and we need to leave enough
+ * space for the asm code here.
+ *
+ * We return by executing IRET while RSP points to the "iret" frame.
+ * That will either return for real or it will loop back into NMI
+ * processing.
+ *
+ * The "outermost" frame is copied to the "iret" frame on each
+ * iteration of the loop, so each iteration starts with the "iret"
+ * frame pointing to the final return target.
+ */
+
+ /*
+ * Determine whether we're a nested NMI.
+ *
+ * If we interrupted kernel code between repeat_nmi and
+ * end_repeat_nmi, then we are a nested NMI. We must not
+ * modify the "iret" frame because it's being written by
+ * the outer NMI. That's okay; the outer NMI handler is
+ * about to call exc_nmi() anyway, so we can just resume
+ * the outer NMI.
+ */
+
+ movq $repeat_nmi, %rdx
+ cmpq 8(%rsp), %rdx
+ ja 1f
+ movq $end_repeat_nmi, %rdx
+ cmpq 8(%rsp), %rdx
+ ja nested_nmi_out
+1:
+
+ /*
+ * Now check "NMI executing". If it's set, then we're nested.
+ * This will not detect if we interrupted an outer NMI just
+ * before IRET.
+ */
+ cmpl $1, -8(%rsp)
+ je nested_nmi
+
+ /*
+ * Now test if the previous stack was an NMI stack. This covers
+ * the case where we interrupt an outer NMI after it clears
+ * "NMI executing" but before IRET. We need to be careful, though:
+ * there is one case in which RSP could point to the NMI stack
+ * despite there being no NMI active: naughty userspace controls
+ * RSP at the very beginning of the SYSCALL targets. We can
+ * pull a fast one on naughty userspace, though: we program
+ * SYSCALL to mask DF, so userspace cannot cause DF to be set
+ * if it controls the kernel's RSP. We set DF before we clear
+ * "NMI executing".
+ */
+ lea 6*8(%rsp), %rdx
+ /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
+ cmpq %rdx, 4*8(%rsp)
+ /* If the stack pointer is above the NMI stack, this is a normal NMI */
+ ja first_nmi
+
+ subq $EXCEPTION_STKSZ, %rdx
+ cmpq %rdx, 4*8(%rsp)
+ /* If it is below the NMI stack, it is a normal NMI */
+ jb first_nmi
+
+ /* Ah, it is within the NMI stack. */
+
+ testb $(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp)
+ jz first_nmi /* RSP was user controlled. */
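+	/*
+	 * Reading aid for the TESTB above: the saved RFLAGS sits at
+	 * 3*8(%rsp) and DF is bit 10, so testing byte (3*8 + 1)(%rsp)
+	 * against X86_EFLAGS_DF >> 8 (= 0x04) inspects DF without
+	 * loading the whole flags word.
+	 */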
+
+ /* This is a nested NMI. */
+
+nested_nmi:
+ /*
+ * Modify the "iret" frame to point to repeat_nmi, forcing another
+ * iteration of NMI handling.
+ */
+ subq $8, %rsp
+ leaq -10*8(%rsp), %rdx
+ pushq $__KERNEL_DS
+ pushq %rdx
+ pushfq
+ pushq $__KERNEL_CS
+ pushq $repeat_nmi
+
+ /* Put stack back */
+ addq $(6*8), %rsp
+
+nested_nmi_out:
+ popq %rdx
+
+ /* We are returning to kernel mode, so this cannot result in a fault. */
+ iretq
+
+first_nmi:
+ /* Restore rdx. */
+ movq (%rsp), %rdx
+
+ /* Make room for "NMI executing". */
+ pushq $0
+
+ /* Leave room for the "iret" frame */
+ subq $(5*8), %rsp
+
+ /* Copy the "original" frame to the "outermost" frame */
+ .rept 5
+ pushq 11*8(%rsp)
+ .endr
+ UNWIND_HINT_IRET_REGS
+
+ /* Everything up to here is safe from nested NMIs */
+
+#ifdef CONFIG_DEBUG_ENTRY
+ /*
+ * For ease of testing, unmask NMIs right away. Disabled by
+ * default because IRET is very expensive.
+ */
+ pushq $0 /* SS */
+ pushq %rsp /* RSP (minus 8 because of the previous push) */
+ addq $8, (%rsp) /* Fix up RSP */
+ pushfq /* RFLAGS */
+ pushq $__KERNEL_CS /* CS */
+ pushq $1f /* RIP */
+ iretq /* continues at repeat_nmi below */
+ UNWIND_HINT_IRET_REGS
+1:
+#endif
+
+repeat_nmi:
+ ANNOTATE_NOENDBR // this code
+ /*
+ * If there was a nested NMI, the first NMI's iret will return
+ * here. But NMIs are still enabled and we can take another
+ * nested NMI. The nested NMI checks the interrupted RIP to see
+ * if it is between repeat_nmi and end_repeat_nmi, and if so
+ * it will just return, as we are about to repeat an NMI anyway.
+ * This makes it safe to copy to the stack frame that a nested
+ * NMI will update.
+ *
+ * RSP is pointing to "outermost RIP". gsbase is unknown, but, if
+ * we're repeating an NMI, gsbase has the same value that it had on
+ * the first iteration. paranoid_entry will load the kernel
+ * gsbase if needed before we call exc_nmi(). "NMI executing"
+ * is zero.
+ */
+ movq $1, 10*8(%rsp) /* Set "NMI executing". */
+
+ /*
+ * Copy the "outermost" frame to the "iret" frame. NMIs that nest
+ * here must not modify the "iret" frame while we're writing to
+ * it or it will end up containing garbage.
+ */
+ addq $(10*8), %rsp
+ .rept 5
+ pushq -6*8(%rsp)
+ .endr
+ subq $(5*8), %rsp
+end_repeat_nmi:
+ ANNOTATE_NOENDBR // this code
+
+ /*
+ * Everything below this point can be preempted by a nested NMI.
+ * If this happens, then the inner NMI will change the "iret"
+ * frame to point back to repeat_nmi.
+ */
+ pushq $-1 /* ORIG_RAX: no syscall to restart */
+
+ /*
+ * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
+	 * as we should not be calling schedule in NMI context, even with
+	 * normal interrupts enabled. An NMI should not be
+ * setting NEED_RESCHED or anything that normal interrupts and
+ * exceptions might do.
+ */
+ call paranoid_entry
+ UNWIND_HINT_REGS
+
+ movq %rsp, %rdi
+ call exc_nmi
+
+ /* Always restore stashed SPEC_CTRL value (see paranoid_entry) */
+ IBRS_EXIT save_reg=%r15
+
+ PARANOID_RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
+
+ /*
+ * The above invocation of paranoid_entry stored the GSBASE
+ * related information in R/EBX depending on the availability
+ * of FSGSBASE.
+ *
+ * If FSGSBASE is enabled, restore the saved GSBASE value
+ * unconditionally, otherwise take the conditional SWAPGS path.
+ */
+ ALTERNATIVE "jmp nmi_no_fsgsbase", "", X86_FEATURE_FSGSBASE
+
+ wrgsbase %rbx
+ jmp nmi_restore
+
+nmi_no_fsgsbase:
+ /* EBX == 0 -> invoke SWAPGS */
+ testl %ebx, %ebx
+ jnz nmi_restore
+
+nmi_swapgs:
+ swapgs
+
+nmi_restore:
+ POP_REGS
+
+ /*
+	 * Skip orig_ax and the "outermost" frame to point RSP at the
+	 * "iret" frame.
+ */
+ addq $6*8, %rsp
+
+ /*
+ * Clear "NMI executing". Set DF first so that we can easily
+ * distinguish the remaining code between here and IRET from
+ * the SYSCALL entry and exit paths.
+ *
+ * We arguably should just inspect RIP instead, but I (Andy) wrote
+ * this code when I had the misapprehension that Xen PV supported
+ * NMIs, and Xen PV would break that approach.
+ */
+ std
+ movq $0, 5*8(%rsp) /* clear "NMI executing" */
+
+ /*
+ * Skip CLEAR_CPU_BUFFERS here, since it only helps in rare cases like
+ * NMI in kernel after user state is restored. For an unprivileged user
+ * these conditions are hard to meet.
+ */
+
+ /*
+ * iretq reads the "iret" frame and exits the NMI stack in a
+ * single instruction. We are returning to kernel mode, so this
+ * cannot result in a fault. Similarly, we don't need to worry
+ * about espfix64 on the way back to kernel mode.
+ */
+ iretq
+SYM_CODE_END(asm_exc_nmi)
+
+/*
+ * This handles SYSCALL from 32-bit code. There is no way to program
+ * MSRs to fully disable 32-bit SYSCALL.
+ */
+SYM_CODE_START(entry_SYSCALL32_ignore)
+ UNWIND_HINT_END_OF_STACK
+ ENDBR
+ mov $-ENOSYS, %eax
+ CLEAR_CPU_BUFFERS
+ sysretl
+SYM_CODE_END(entry_SYSCALL32_ignore)
+
+.pushsection .text, "ax"
+ __FUNC_ALIGN
+SYM_CODE_START_NOALIGN(rewind_stack_and_make_dead)
+ UNWIND_HINT_FUNC
+ /* Prevent any naive code from trying to unwind to our caller. */
+ xorl %ebp, %ebp
+
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rax
+ leaq -PTREGS_SIZE(%rax), %rsp
+ UNWIND_HINT_REGS
+
+ call make_task_dead
+SYM_CODE_END(rewind_stack_and_make_dead)
+.popsection
+
+/*
+ * This sequence executes branches in order to remove user branch information
+ * from the branch history tracker in the Branch Predictor, therefore removing
+ * user influence on subsequent BTB lookups.
+ *
+ * It should be used on parts prior to Alder Lake. Newer parts should use the
+ * BHI_DIS_S hardware control instead. If a pre-Alder Lake part is being
+ * virtualized on newer hardware the VMM should protect against BHI attacks by
+ * setting BHI_DIS_S for the guests.
+ *
+ * CALLs/RETs are necessary to prevent Loop Stream Detector (LSD) from engaging
+ * and not clearing the branch history. The call tree looks like:
+ *
+ * call 1
+ * call 2
+ * call 2
+ * call 2
+ * call 2
+ * call 2
+ * ret
+ * ret
+ * ret
+ * ret
+ * ret
+ * ret
+ *
+ * This means that the stack is non-constant and ORC can't unwind it with %rsp
+ * alone. Therefore we unconditionally set up the frame pointer, which allows
+ * ORC to unwind properly.
+ *
+ * The alignment is for performance and not for safety, and may be safely
+ * refactored in the future if needed. The .skips are for safety, to ensure
+ * that all RETs are in the second half of a cacheline to mitigate Indirect
+ * Target Selection, rather than taking the slowpath via its_return_thunk.
+ */
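+/*
+ * Reading aid for the code below: the outer "call 1f" plus five
+ * iterations of "call 2f" (driven by %ecx) stack six return addresses,
+ * which unwind through .Lret2 and .Lret1 as the six RETs in the tree
+ * above; the %eax loop at 3:/4: adds taken branches inside each
+ * iteration.
+ */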
+SYM_FUNC_START(clear_bhb_loop)
+ ANNOTATE_NOENDBR
+ push %rbp
+ mov %rsp, %rbp
+ movl $5, %ecx
+ ANNOTATE_INTRA_FUNCTION_CALL
+ call 1f
+ jmp 5f
+ .align 64, 0xcc
+ /*
+ * Shift instructions so that the RET is in the upper half of the
+ * cacheline and don't take the slowpath to its_return_thunk.
+ */
+ .skip 32 - (.Lret1 - 1f), 0xcc
+ ANNOTATE_INTRA_FUNCTION_CALL
+1: call 2f
+.Lret1: RET
+ .align 64, 0xcc
+ /*
+ * As above shift instructions for RET at .Lret2 as well.
+ *
+	 * This should ideally be: .skip 32 - (.Lret2 - 2f), 0xcc
+ * but some Clang versions (e.g. 18) don't like this.
+ */
+ .skip 32 - 18, 0xcc
+2: movl $5, %eax
+3: jmp 4f
+ nop
+4: sub $1, %eax
+ jnz 3b
+ sub $1, %ecx
+ jnz 1b
+.Lret2: RET
+5: lfence
+ pop %rbp
+ RET
+SYM_FUNC_END(clear_bhb_loop)
+EXPORT_SYMBOL_FOR_KVM(clear_bhb_loop)
+STACK_FRAME_NON_STANDARD(clear_bhb_loop)
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
new file mode 100644
index 000000000000..a45e1125fc6c
--- /dev/null
+++ b/arch/x86/entry/entry_64_compat.S
@@ -0,0 +1,299 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Compatibility mode system call entry point for x86-64.
+ *
+ * Copyright 2000-2002 Andi Kleen, SuSE Labs.
+ */
+#include <asm/asm-offsets.h>
+#include <asm/current.h>
+#include <asm/errno.h>
+#include <asm/thread_info.h>
+#include <asm/segment.h>
+#include <asm/irqflags.h>
+#include <asm/asm.h>
+#include <asm/smap.h>
+#include <asm/nospec-branch.h>
+#include <linux/linkage.h>
+#include <linux/err.h>
+
+#include "calling.h"
+
+ .section .entry.text, "ax"
+
+/*
+ * 32-bit SYSENTER entry.
+ *
+ * 32-bit system calls through the vDSO's __kernel_vsyscall enter here
+ * on 64-bit kernels running on Intel CPUs.
+ *
+ * The SYSENTER instruction, in principle, should *only* occur in the
+ * vDSO. In practice, a small number of Android devices were shipped
+ * with a copy of Bionic that inlined a SYSENTER instruction. This
+ * never happened in any of Google's Bionic versions -- it only happened
+ * in a narrow range of Intel-provided versions.
+ *
+ * SYSENTER loads SS, RSP, CS, and RIP from previously programmed MSRs.
+ * IF and VM in RFLAGS are cleared (IOW: interrupts are off).
+ * SYSENTER does not save anything on the stack,
+ * and does not save old RIP (!!!), RSP, or RFLAGS.
+ *
+ * Arguments:
+ * eax system call number
+ * ebx arg1
+ * ecx arg2
+ * edx arg3
+ * esi arg4
+ * edi arg5
+ * ebp user stack
+ * 0(%ebp) arg6
+ * 0(%ebp) arg6
+ */
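+/*
+ * Sketch of the caller side (hypothetical labels; approximately what
+ * the vDSO's __kernel_vsyscall does before SYSENTER):
+ *
+ *	__kernel_vsyscall:
+ *		push	%ecx
+ *		push	%edx
+ *		push	%ebp
+ *		movl	%esp, %ebp	# SYSENTER destroys ESP; see "ebp" above
+ *		sysenter
+ */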
+SYM_CODE_START(entry_SYSENTER_compat)
+ UNWIND_HINT_ENTRY
+ ENDBR
+ /* Interrupts are off on entry. */
+ swapgs
+
+ pushq %rax
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
+ popq %rax
+
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+
+ /* Construct struct pt_regs on stack */
+ pushq $__USER_DS /* pt_regs->ss */
+ pushq $0 /* pt_regs->sp = 0 (placeholder) */
+
+ /*
+ * Push flags. This is nasty. First, interrupts are currently
+	 * off, but we need pt_regs->flags to have IF set. Second, if TF
+	 * was set in usermode, it's still set, and we're singlestepping
+ * through this code. do_SYSENTER_32() will fix up IF.
+ */
+ pushfq /* pt_regs->flags (except IF = 0) */
+ pushq $__USER32_CS /* pt_regs->cs */
+ pushq $0 /* pt_regs->ip = 0 (placeholder) */
+SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL)
+
+ /*
+ * User tracing code (ptrace or signal handlers) might assume that
+ * the saved RAX contains a 32-bit number when we're invoking a 32-bit
+ * syscall. Just in case the high bits are nonzero, zero-extend
+ * the syscall number. (This could almost certainly be deleted
+ * with no ill effects.)
+ */
+ movl %eax, %eax
+
+ pushq %rax /* pt_regs->orig_ax */
+ PUSH_AND_CLEAR_REGS rax=$-ENOSYS
+ UNWIND_HINT_REGS
+
+ cld
+
+ /*
+ * SYSENTER doesn't filter flags, so we need to clear NT and AC
+ * ourselves. To save a few cycles, we can check whether
+ * either was set instead of doing an unconditional popfq.
+ * This needs to happen before enabling interrupts so that
+ * we don't get preempted with NT set.
+ *
+ * If TF is set, we will single-step all the way to here -- do_debug
+ * will ignore all the traps. (Yes, this is slow, but so is
+ * single-stepping in general. This allows us to avoid having
+	 * more complicated code to handle the case where a user program
+ * forces us to single-step through the SYSENTER entry code.)
+ *
+ * NB.: .Lsysenter_fix_flags is a label with the code under it moved
+ * out-of-line as an optimization: NT is unlikely to be set in the
+ * majority of the cases and instead of polluting the I$ unnecessarily,
+ * we're keeping that code behind a branch which will predict as
+ * not-taken and therefore its instructions won't be fetched.
+ */
+ testl $X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, EFLAGS(%rsp)
+ jnz .Lsysenter_fix_flags
+.Lsysenter_flags_fixed:
+
+ /*
+ * CPU bugs mitigations mechanisms can call other functions. They
+ * should be invoked after making sure TF is cleared because
+ * single-step is ignored only for instructions inside the
+ * entry_SYSENTER_compat function.
+ */
+ IBRS_ENTER
+ UNTRAIN_RET
+ CLEAR_BRANCH_HISTORY
+
+ movq %rsp, %rdi
+ call do_SYSENTER_32
+ jmp sysret32_from_system_call
+
+.Lsysenter_fix_flags:
+ pushq $X86_EFLAGS_FIXED
+ popfq
+ jmp .Lsysenter_flags_fixed
+SYM_INNER_LABEL(__end_entry_SYSENTER_compat, SYM_L_GLOBAL)
+SYM_CODE_END(entry_SYSENTER_compat)
+
+/*
+ * 32-bit SYSCALL entry.
+ *
+ * 32-bit system calls through the vDSO's __kernel_vsyscall enter here
+ * on 64-bit kernels running on AMD CPUs.
+ *
+ * The SYSCALL instruction, in principle, should *only* occur in the
+ * vDSO. In practice, it appears that this really is the case.
+ * As evidence:
+ *
+ * - The calling convention for SYSCALL has changed several times without
+ * anyone noticing.
+ *
+ * - Prior to the in-kernel X86_BUG_SYSRET_SS_ATTRS fixup, any
+ *   user task that did SYSCALL without immediately reloading SS
+ * would randomly crash.
+ *
+ * - Most programmers do not directly target AMD CPUs, and the 32-bit
+ * SYSCALL instruction does not exist on Intel CPUs. Even on AMD
+ * CPUs, Linux disables the SYSCALL instruction on 32-bit kernels
+ * because the SYSCALL instruction in legacy/native 32-bit mode (as
+ * opposed to compat mode) is sufficiently poorly designed as to be
+ * essentially unusable.
+ *
+ * 32-bit SYSCALL saves RIP to RCX, clears RFLAGS.RF, then saves
+ * RFLAGS to R11, then loads new SS, CS, and RIP from previously
+ * programmed MSRs. RFLAGS gets masked by a value from another MSR
+ * (so CLD and CLAC are not needed). SYSCALL does not save anything on
+ * the stack and does not change RSP.
+ *
+ * Note: RFLAGS saving+masking-with-MSR happens only in Long mode
+ * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it).
+ * Don't get confused: RFLAGS saving+masking depends on Long Mode Active bit
+ * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes
+ * or target CS descriptor's L bit (SYSCALL does not read segment descriptors).
+ *
+ * Arguments:
+ * eax system call number
+ * ecx return address
+ * ebx arg1
+ * ebp arg2 (note: not saved in the stack frame, should not be touched)
+ * edx arg3
+ * esi arg4
+ * edi arg5
+ * esp user stack
+ * 0(%esp) arg6
+ */
+SYM_CODE_START(entry_SYSCALL_compat)
+ UNWIND_HINT_ENTRY
+ ENDBR
+ /* Interrupts are off on entry. */
+ swapgs
+
+ /* Stash user ESP */
+ movl %esp, %r8d
+
+ /* Use %rsp as scratch reg. User ESP is stashed in r8 */
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
+
+ /* Switch to the kernel stack */
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+
+SYM_INNER_LABEL(entry_SYSCALL_compat_safe_stack, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
+
+ /* Construct struct pt_regs on stack */
+ pushq $__USER_DS /* pt_regs->ss */
+ pushq %r8 /* pt_regs->sp */
+ pushq %r11 /* pt_regs->flags */
+ pushq $__USER32_CS /* pt_regs->cs */
+ pushq %rcx /* pt_regs->ip */
+SYM_INNER_LABEL(entry_SYSCALL_compat_after_hwframe, SYM_L_GLOBAL)
+ movl %eax, %eax /* discard orig_ax high bits */
+ pushq %rax /* pt_regs->orig_ax */
+ PUSH_AND_CLEAR_REGS rcx=%rbp rax=$-ENOSYS
+ UNWIND_HINT_REGS
+
+ IBRS_ENTER
+ UNTRAIN_RET
+ CLEAR_BRANCH_HISTORY
+
+ movq %rsp, %rdi
+ call do_fast_syscall_32
+
+sysret32_from_system_call:
+ /* XEN PV guests always use IRET path */
+ ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \
+ "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
+
+ /*
+ * Opportunistic SYSRET
+ *
+ * We are not going to return to userspace from the trampoline
+ * stack. So let's erase the thread stack right now.
+ */
+ STACKLEAK_ERASE
+
+ IBRS_EXIT
+
+ movq RBX(%rsp), %rbx /* pt_regs->rbx */
+ movq RBP(%rsp), %rbp /* pt_regs->rbp */
+ movq EFLAGS(%rsp), %r11 /* pt_regs->flags (in r11) */
+ movq RIP(%rsp), %rcx /* pt_regs->ip (in rcx) */
+ addq $RAX, %rsp /* Skip r8-r15 */
+ popq %rax /* pt_regs->rax */
+ popq %rdx /* Skip pt_regs->cx */
+ popq %rdx /* pt_regs->dx */
+ popq %rsi /* pt_regs->si */
+ popq %rdi /* pt_regs->di */
+
+ /*
+ * USERGS_SYSRET32 does:
+ * GSBASE = user's GS base
+ * EIP = ECX
+ * RFLAGS = R11
+ * CS = __USER32_CS
+ * SS = __USER_DS
+ *
+ * ECX will not match pt_regs->cx, but we're returning to a vDSO
+ * trampoline that will fix up RCX, so this is okay.
+ *
+ * R12-R15 are callee-saved, so they contain whatever was in them
+ * when the system call started, which is already known to user
+ * code. We zero R8-R10 to avoid info leaks.
+ */
+ movq RSP-ORIG_RAX(%rsp), %rsp
+SYM_INNER_LABEL(entry_SYSRETL_compat_unsafe_stack, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
+
+ /*
+ * The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored
+ * on the process stack which is not mapped to userspace and
+ * not readable after we SWITCH_TO_USER_CR3. Delay the CR3
+	 * switch until after the last reference to the process
+ * stack.
+ *
+ * %r8/%r9 are zeroed before the sysret, thus safe to clobber.
+ */
+ SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9
+
+ xorl %r8d, %r8d
+ xorl %r9d, %r9d
+ xorl %r10d, %r10d
+ swapgs
+ CLEAR_CPU_BUFFERS
+ sysretl
+SYM_INNER_LABEL(entry_SYSRETL_compat_end, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
+ int3
+SYM_CODE_END(entry_SYSCALL_compat)
+
+/*
+ * int 0x80 is used by 32-bit mode as a system call entry. Normally idt entries
+ * point to C routines, however since this is a system call interface the branch
+ * history needs to be scrubbed to protect against BHI attacks, and that
+ * scrubbing needs to take place in assembly code prior to entering any C
+ * routines.
+ */
+SYM_CODE_START(int80_emulation)
+ ANNOTATE_NOENDBR
+ UNWIND_HINT_FUNC
+ CLEAR_BRANCH_HISTORY
+ jmp do_int80_emulation
+SYM_CODE_END(int80_emulation)
diff --git a/arch/x86/entry/entry_64_fred.S b/arch/x86/entry/entry_64_fred.S
new file mode 100644
index 000000000000..894f7f16eb80
--- /dev/null
+++ b/arch/x86/entry/entry_64_fred.S
@@ -0,0 +1,151 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * The actual FRED entry points.
+ */
+
+#include <linux/export.h>
+#include <linux/kvm_types.h>
+
+#include <asm/asm.h>
+#include <asm/fred.h>
+#include <asm/segment.h>
+
+#include "calling.h"
+
+ .code64
+ .section .noinstr.text, "ax"
+
+.macro FRED_ENTER
+ UNWIND_HINT_END_OF_STACK
+ ANNOTATE_NOENDBR
+ PUSH_AND_CLEAR_REGS
+ movq %rsp, %rdi /* %rdi -> pt_regs */
+.endm
+
+.macro FRED_EXIT
+ UNWIND_HINT_REGS
+ POP_REGS
+.endm
+
+/*
+ * The new RIP value that FRED event delivery establishes is
+ * IA32_FRED_CONFIG & ~FFFH for events that occur in ring 3.
+ * Thus the FRED ring 3 entry point must be 4K page aligned.
+ */
+ .align 4096
+
+SYM_CODE_START_NOALIGN(asm_fred_entrypoint_user)
+ FRED_ENTER
+ call fred_entry_from_user
+SYM_INNER_LABEL(asm_fred_exit_user, SYM_L_GLOBAL)
+ FRED_EXIT
+1: ERETU
+
+ _ASM_EXTABLE_TYPE(1b, asm_fred_entrypoint_user, EX_TYPE_ERETU)
+SYM_CODE_END(asm_fred_entrypoint_user)
+
+/*
+ * The new RIP value that FRED event delivery establishes is
+ * (IA32_FRED_CONFIG & ~FFFH) + 256 for events that occur in
+ * ring 0, i.e., asm_fred_entrypoint_user + 256.
+ */
+ .org asm_fred_entrypoint_user + 256, 0xcc
+SYM_CODE_START_NOALIGN(asm_fred_entrypoint_kernel)
+ FRED_ENTER
+ call fred_entry_from_kernel
+ FRED_EXIT
+ ERETS
+SYM_CODE_END(asm_fred_entrypoint_kernel)
+
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+SYM_FUNC_START(asm_fred_entry_from_kvm)
+ ANNOTATE_NOENDBR
+ push %rbp
+ mov %rsp, %rbp
+
+ UNWIND_HINT_SAVE
+
+ /*
+ * Both IRQ and NMI from VMX can be handled on current task stack
+ * because there is no need to protect from reentrancy and the call
+ * stack leading to this helper is effectively constant and shallow
+ * (relatively speaking). Do the same when FRED is active, i.e., no
+ * need to check current stack level for a stack switch.
+ *
+ * Emulate the FRED-defined redzone and stack alignment.
+ */
+ sub $(FRED_CONFIG_REDZONE_AMOUNT << 6), %rsp
+ and $FRED_STACK_FRAME_RSP_MASK, %rsp
+
+ /*
+ * Start to push a FRED stack frame, which is always 64 bytes:
+ *
+ * +--------+-----------------+
+ * | Bytes | Usage |
+ * +--------+-----------------+
+ * | 63:56 | Reserved |
+ * | 55:48 | Event Data |
+ * | 47:40 | SS + Event Info |
+ * | 39:32 | RSP |
+ * | 31:24 | RFLAGS |
+ * | 23:16 | CS + Aux Info |
+ * | 15:8 | RIP |
+ * | 7:0 | Error Code |
+ * +--------+-----------------+
+ */
+ push $0 /* Reserved, must be 0 */
+ push $0 /* Event data, 0 for IRQ/NMI */
+ push %rdi /* fred_ss handed in by the caller */
+ push %rbp
+ pushf
+ push $__KERNEL_CS
+
+ /*
+ * Unlike the IDT event delivery, FRED _always_ pushes an error code
+ * after pushing the return RIP, thus the CALL instruction CANNOT be
+ * used here to push the return RIP, otherwise there is no chance to
+ * push an error code before invoking the IRQ/NMI handler.
+ *
+ * Use LEA to get the return RIP and push it, then push an error code.
+ */
+ lea 1f(%rip), %rax
+ push %rax /* Return RIP */
+ push $0 /* Error code, 0 for IRQ/NMI */
+
+ PUSH_AND_CLEAR_REGS clear_callee=0 unwind_hint=0
+
+ movq %rsp, %rdi /* %rdi -> pt_regs */
+ /*
+ * At this point: {rdi, rsi, rdx, rcx, r8, r9}, {r10, r11}, {rax, rdx}
+ * are clobbered, which corresponds to: arguments, extra caller-saved
+ * and return. All registers a C function is allowed to clobber.
+ *
+ * Notably, the callee-saved registers: {rbx, r12, r13, r14, r15}
+ * are untouched, with the exception of rbp, which carries the stack
+ * frame and will be restored before exit.
+ *
+ * Further calling another C function will not alter this state.
+ */
+ call __fred_entry_from_kvm /* Call the C entry point */
+
+ /*
+ * When FRED is enabled, use ERETS to potentially clear NMIs, otherwise simply
+ * restore the stack pointer.
+ */
+ ALTERNATIVE "nop; nop; mov %rbp, %rsp", \
+ __stringify(add $C_PTREGS_SIZE, %rsp; ERETS), \
+ X86_FEATURE_FRED
+
+1: /*
+ * Objtool doesn't understand ERETS, and the cfi register state is
+ * different from initial_func_cfi due to PUSH_REGS. Tell it the state
+ * is similar to where UNWIND_HINT_SAVE is.
+ */
+ UNWIND_HINT_RESTORE
+
+ pop %rbp
+ RET
+
+SYM_FUNC_END(asm_fred_entry_from_kvm)
+EXPORT_SYMBOL_FOR_KVM(asm_fred_entry_from_kvm);
+#endif
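The 64-byte frame that asm_fred_entry_from_kvm builds by hand, and that FRED event delivery pushes in hardware, can be pictured as a plain struct. A sketch reading the table above in ascending address order (error code at the lowest address), assuming an LP64 target where unsigned long is 8 bytes; the field names are illustrative, not the kernel's:

#include <assert.h>

struct fred_frame_sketch {
	unsigned long error_code;	/* bytes  7:0  - 0 for IRQ/NMI    */
	unsigned long rip;		/* bytes 15:8  - return RIP       */
	unsigned long cs_aux;		/* bytes 23:16 - CS + aux info    */
	unsigned long rflags;		/* bytes 31:24                    */
	unsigned long rsp;		/* bytes 39:32                    */
	unsigned long ss_event;		/* bytes 47:40 - SS + event info  */
	unsigned long event_data;	/* bytes 55:48 - 0 for IRQ/NMI    */
	unsigned long reserved;		/* bytes 63:56 - must be 0        */
};

/* The push sequence above builds this frame highest field first. */
static_assert(sizeof(struct fred_frame_sketch) == 64,
	      "a FRED stack frame is always 64 bytes");

int main(void)
{
	return 0;
}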
diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c
new file mode 100644
index 000000000000..94e626cc6a07
--- /dev/null
+++ b/arch/x86/entry/entry_fred.c
@@ -0,0 +1,296 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * The FRED specific kernel/user entry functions which are invoked from
+ * assembly code and dispatch to the associated handlers.
+ */
+#include <linux/kernel.h>
+#include <linux/kdebug.h>
+#include <linux/nospec.h>
+
+#include <asm/desc.h>
+#include <asm/fred.h>
+#include <asm/idtentry.h>
+#include <asm/syscall.h>
+#include <asm/trapnr.h>
+#include <asm/traps.h>
+
+/* FRED EVENT_TYPE_OTHER vector numbers */
+#define FRED_SYSCALL 1
+#define FRED_SYSENTER 2
+
+static noinstr void fred_bad_type(struct pt_regs *regs, unsigned long error_code)
+{
+ irqentry_state_t irq_state = irqentry_nmi_enter(regs);
+
+ instrumentation_begin();
+
+ /* Panic on events from a high stack level */
+ if (regs->fred_cs.sl > 0) {
+ pr_emerg("PANIC: invalid or fatal FRED event; event type %u "
+ "vector %u error 0x%lx aux 0x%lx at %04x:%016lx\n",
+ regs->fred_ss.type, regs->fred_ss.vector, error_code,
+ fred_event_data(regs), regs->cs, regs->ip);
+ die("invalid or fatal FRED event", regs, error_code);
+ panic("invalid or fatal FRED event");
+ } else {
+ unsigned long flags = oops_begin();
+ int sig = SIGKILL;
+
+ pr_alert("BUG: invalid or fatal FRED event; event type %u "
+ "vector %u error 0x%lx aux 0x%lx at %04x:%016lx\n",
+ regs->fred_ss.type, regs->fred_ss.vector, error_code,
+ fred_event_data(regs), regs->cs, regs->ip);
+
+ if (__die("Invalid or fatal FRED event", regs, error_code))
+ sig = 0;
+
+ oops_end(flags, regs, sig);
+ }
+
+ instrumentation_end();
+ irqentry_nmi_exit(regs, irq_state);
+}
+
+static noinstr void fred_intx(struct pt_regs *regs)
+{
+ switch (regs->fred_ss.vector) {
+ /* Opcode 0xcd, 0x3, NOT INT3 (opcode 0xcc) */
+ case X86_TRAP_BP:
+ return exc_int3(regs);
+
+ /* Opcode 0xcd, 0x4, NOT INTO (opcode 0xce) */
+ case X86_TRAP_OF:
+ return exc_overflow(regs);
+
+#ifdef CONFIG_IA32_EMULATION
+ /* INT80 */
+ case IA32_SYSCALL_VECTOR:
+ if (ia32_enabled())
+ return fred_int80_emulation(regs);
+ fallthrough;
+#endif
+
+ default:
+ return exc_general_protection(regs, 0);
+ }
+}
+
+static __always_inline void fred_other(struct pt_regs *regs)
+{
+ /* The compiler can fold these conditions into a single test */
+ if (likely(regs->fred_ss.vector == FRED_SYSCALL && regs->fred_ss.l)) {
+ regs->orig_ax = regs->ax;
+ regs->ax = -ENOSYS;
+ do_syscall_64(regs, regs->orig_ax);
+ return;
+ } else if (ia32_enabled() &&
+ likely(regs->fred_ss.vector == FRED_SYSENTER && !regs->fred_ss.l)) {
+ regs->orig_ax = regs->ax;
+ regs->ax = -ENOSYS;
+ do_fast_syscall_32(regs);
+ return;
+ } else {
+ exc_invalid_op(regs);
+ return;
+ }
+}
+
+#define SYSVEC(_vector, _function) [_vector - FIRST_SYSTEM_VECTOR] = fred_sysvec_##_function
+
+static idtentry_t sysvec_table[NR_SYSTEM_VECTORS] __ro_after_init = {
+ SYSVEC(ERROR_APIC_VECTOR, error_interrupt),
+ SYSVEC(SPURIOUS_APIC_VECTOR, spurious_apic_interrupt),
+ SYSVEC(LOCAL_TIMER_VECTOR, apic_timer_interrupt),
+ SYSVEC(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi),
+
+ SYSVEC(RESCHEDULE_VECTOR, reschedule_ipi),
+ SYSVEC(CALL_FUNCTION_SINGLE_VECTOR, call_function_single),
+ SYSVEC(CALL_FUNCTION_VECTOR, call_function),
+ SYSVEC(REBOOT_VECTOR, reboot),
+
+ SYSVEC(THRESHOLD_APIC_VECTOR, threshold),
+ SYSVEC(DEFERRED_ERROR_VECTOR, deferred_error),
+ SYSVEC(THERMAL_APIC_VECTOR, thermal),
+
+ SYSVEC(IRQ_WORK_VECTOR, irq_work),
+
+ SYSVEC(POSTED_INTR_VECTOR, kvm_posted_intr_ipi),
+ SYSVEC(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi),
+ SYSVEC(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi),
+
+ SYSVEC(POSTED_MSI_NOTIFICATION_VECTOR, posted_msi_notification),
+};
+
+static bool fred_setup_done __initdata;
+
+void __init fred_install_sysvec(unsigned int sysvec, idtentry_t handler)
+{
+ if (WARN_ON_ONCE(sysvec < FIRST_SYSTEM_VECTOR))
+ return;
+
+ if (WARN_ON_ONCE(fred_setup_done))
+ return;
+
+ if (!WARN_ON_ONCE(sysvec_table[sysvec - FIRST_SYSTEM_VECTOR]))
+ sysvec_table[sysvec - FIRST_SYSTEM_VECTOR] = handler;
+}
+
+static noinstr void fred_handle_spurious_interrupt(struct pt_regs *regs)
+{
+ spurious_interrupt(regs, regs->fred_ss.vector);
+}
+
+void __init fred_complete_exception_setup(void)
+{
+ unsigned int vector;
+
+ for (vector = 0; vector < FIRST_EXTERNAL_VECTOR; vector++)
+ set_bit(vector, system_vectors);
+
+ for (vector = 0; vector < NR_SYSTEM_VECTORS; vector++) {
+ if (sysvec_table[vector])
+ set_bit(vector + FIRST_SYSTEM_VECTOR, system_vectors);
+ else
+ sysvec_table[vector] = fred_handle_spurious_interrupt;
+ }
+ fred_setup_done = true;
+}
+
+static noinstr void fred_extint(struct pt_regs *regs)
+{
+ unsigned int vector = regs->fred_ss.vector;
+ unsigned int index = array_index_nospec(vector - FIRST_SYSTEM_VECTOR,
+ NR_SYSTEM_VECTORS);
+
+ if (WARN_ON_ONCE(vector < FIRST_EXTERNAL_VECTOR))
+ return;
+
+ if (likely(vector >= FIRST_SYSTEM_VECTOR)) {
+ irqentry_state_t state = irqentry_enter(regs);
+
+ instrumentation_begin();
+ sysvec_table[index](regs);
+ instrumentation_end();
+ irqentry_exit(regs, state);
+ } else {
+ common_interrupt(regs, vector);
+ }
+}
+
+static noinstr void fred_hwexc(struct pt_regs *regs, unsigned long error_code)
+{
+ /* Optimize for #PF. That's the only exception which matters performance wise */
+ if (likely(regs->fred_ss.vector == X86_TRAP_PF))
+ return exc_page_fault(regs, error_code);
+
+ switch (regs->fred_ss.vector) {
+ case X86_TRAP_DE: return exc_divide_error(regs);
+ case X86_TRAP_DB: return fred_exc_debug(regs);
+ case X86_TRAP_BR: return exc_bounds(regs);
+ case X86_TRAP_UD: return exc_invalid_op(regs);
+ case X86_TRAP_NM: return exc_device_not_available(regs);
+ case X86_TRAP_DF: return exc_double_fault(regs, error_code);
+ case X86_TRAP_TS: return exc_invalid_tss(regs, error_code);
+ case X86_TRAP_NP: return exc_segment_not_present(regs, error_code);
+ case X86_TRAP_SS: return exc_stack_segment(regs, error_code);
+ case X86_TRAP_GP: return exc_general_protection(regs, error_code);
+ case X86_TRAP_MF: return exc_coprocessor_error(regs);
+ case X86_TRAP_AC: return exc_alignment_check(regs, error_code);
+ case X86_TRAP_XF: return exc_simd_coprocessor_error(regs);
+
+#ifdef CONFIG_X86_MCE
+ case X86_TRAP_MC: return fred_exc_machine_check(regs);
+#endif
+#ifdef CONFIG_INTEL_TDX_GUEST
+ case X86_TRAP_VE: return exc_virtualization_exception(regs);
+#endif
+#ifdef CONFIG_X86_CET
+ case X86_TRAP_CP: return exc_control_protection(regs, error_code);
+#endif
+ default: return fred_bad_type(regs, error_code);
+ }
+
+}
+
+static noinstr void fred_swexc(struct pt_regs *regs, unsigned long error_code)
+{
+ switch (regs->fred_ss.vector) {
+ case X86_TRAP_BP: return exc_int3(regs);
+ case X86_TRAP_OF: return exc_overflow(regs);
+ default: return fred_bad_type(regs, error_code);
+ }
+}
+
+__visible noinstr void fred_entry_from_user(struct pt_regs *regs)
+{
+ unsigned long error_code = regs->orig_ax;
+
+ /* Invalidate orig_ax so that syscall_get_nr() works correctly */
+ regs->orig_ax = -1;
+
+ switch (regs->fred_ss.type) {
+ case EVENT_TYPE_EXTINT:
+ return fred_extint(regs);
+ case EVENT_TYPE_NMI:
+ if (likely(regs->fred_ss.vector == X86_TRAP_NMI))
+ return fred_exc_nmi(regs);
+ break;
+ case EVENT_TYPE_HWEXC:
+ return fred_hwexc(regs, error_code);
+ case EVENT_TYPE_SWINT:
+ return fred_intx(regs);
+ case EVENT_TYPE_PRIV_SWEXC:
+ if (likely(regs->fred_ss.vector == X86_TRAP_DB))
+ return fred_exc_debug(regs);
+ break;
+ case EVENT_TYPE_SWEXC:
+ return fred_swexc(regs, error_code);
+ case EVENT_TYPE_OTHER:
+ return fred_other(regs);
+ default: break;
+ }
+
+ return fred_bad_type(regs, error_code);
+}
+
+__visible noinstr void fred_entry_from_kernel(struct pt_regs *regs)
+{
+ unsigned long error_code = regs->orig_ax;
+
+ /* Invalidate orig_ax so that syscall_get_nr() works correctly */
+ regs->orig_ax = -1;
+
+ switch (regs->fred_ss.type) {
+ case EVENT_TYPE_EXTINT:
+ return fred_extint(regs);
+ case EVENT_TYPE_NMI:
+ if (likely(regs->fred_ss.vector == X86_TRAP_NMI))
+ return fred_exc_nmi(regs);
+ break;
+ case EVENT_TYPE_HWEXC:
+ return fred_hwexc(regs, error_code);
+ case EVENT_TYPE_PRIV_SWEXC:
+ if (likely(regs->fred_ss.vector == X86_TRAP_DB))
+ return fred_exc_debug(regs);
+ break;
+ case EVENT_TYPE_SWEXC:
+ return fred_swexc(regs, error_code);
+ default: break;
+ }
+
+ return fred_bad_type(regs, error_code);
+}
+
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+__visible noinstr void __fred_entry_from_kvm(struct pt_regs *regs)
+{
+ switch (regs->fred_ss.type) {
+ case EVENT_TYPE_EXTINT:
+ return fred_extint(regs);
+ case EVENT_TYPE_NMI:
+ return fred_exc_nmi(regs);
+ default:
+ WARN_ON_ONCE(1);
+ }
+}
+#endif
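fred_extint() together with fred_install_sysvec() and fred_complete_exception_setup() implements a small pattern: a fixed-size handler table indexed by (vector - FIRST_SYSTEM_VECTOR), pre-filled so that no slot is ever NULL. A stand-alone sketch of that pattern, with the kernel's array_index_nospec() replaced by a plain bounds check and illustrative vector numbers:

#include <stdio.h>

/* Illustrative values only; the real constants come from asm/irq_vectors.h. */
#define NR_VECTORS		256
#define FIRST_SYSTEM_VECTOR	236
#define NR_SYSTEM_VECTORS	(NR_VECTORS - FIRST_SYSTEM_VECTOR)

typedef void (*handler_t)(unsigned int vector);

static void spurious_handler(unsigned int v) { printf("spurious %u\n", v); }
static void timer_handler(unsigned int v)    { printf("timer %u\n", v); }

static handler_t sysvec_table_sketch[NR_SYSTEM_VECTORS];

static void setup(void)
{
	unsigned int i;

	/* fred_complete_exception_setup() analogue: every slot is callable. */
	for (i = 0; i < NR_SYSTEM_VECTORS; i++)
		sysvec_table_sketch[i] = spurious_handler;

	/* fred_install_sysvec() analogue for a single vector. */
	sysvec_table_sketch[250 - FIRST_SYSTEM_VECTOR] = timer_handler;
}

static void extint(unsigned int vector)
{
	if (vector >= FIRST_SYSTEM_VECTOR && vector < NR_VECTORS)
		sysvec_table_sketch[vector - FIRST_SYSTEM_VECTOR](vector);
	else
		printf("common_interrupt %u\n", vector);  /* device interrupt path */
}

int main(void)
{
	setup();
	extint(250);	/* installed handler     */
	extint(240);	/* spurious fallback     */
	extint(40);	/* common interrupt path */
	return 0;
}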
diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c
new file mode 100644
index 000000000000..a67a644d0cfe
--- /dev/null
+++ b/arch/x86/entry/syscall_32.c
@@ -0,0 +1,371 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* 32-bit system call dispatch */
+
+#include <linux/linkage.h>
+#include <linux/sys.h>
+#include <linux/cache.h>
+#include <linux/syscalls.h>
+#include <linux/entry-common.h>
+#include <linux/nospec.h>
+#include <linux/uaccess.h>
+#include <asm/apic.h>
+#include <asm/traps.h>
+#include <asm/cpufeature.h>
+#include <asm/syscall.h>
+
+#ifdef CONFIG_IA32_EMULATION
+#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, compat)
+#else
+#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, native)
+#endif
+
+#define __SYSCALL(nr, sym) extern long __ia32_##sym(const struct pt_regs *);
+#define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __ia32_##sym(const struct pt_regs *);
+#include <asm/syscalls_32.h>
+#undef __SYSCALL
+
+#undef __SYSCALL_NORETURN
+#define __SYSCALL_NORETURN __SYSCALL
+
+/*
+ * The sys_call_table[] is no longer used for system calls, but
+ * kernel/trace/trace_syscalls.c still wants to know the system
+ * call address.
+ */
+#ifdef CONFIG_X86_32
+#define __SYSCALL(nr, sym) __ia32_##sym,
+const sys_call_ptr_t sys_call_table[] = {
+#include <asm/syscalls_32.h>
+};
+#undef __SYSCALL
+#endif
+
+#define __SYSCALL(nr, sym) case nr: return __ia32_##sym(regs);
+long ia32_sys_call(const struct pt_regs *regs, unsigned int nr)
+{
+ switch (nr) {
+ #include <asm/syscalls_32.h>
+ default: return __ia32_sys_ni_syscall(regs);
+ }
+}
+
+static __always_inline int syscall_32_enter(struct pt_regs *regs)
+{
+ if (IS_ENABLED(CONFIG_IA32_EMULATION))
+ current_thread_info()->status |= TS_COMPAT;
+
+ return (int)regs->orig_ax;
+}
+
+#ifdef CONFIG_IA32_EMULATION
+bool __ia32_enabled __ro_after_init = !IS_ENABLED(CONFIG_IA32_EMULATION_DEFAULT_DISABLED);
+
+static int __init ia32_emulation_override_cmdline(char *arg)
+{
+ return kstrtobool(arg, &__ia32_enabled);
+}
+early_param("ia32_emulation", ia32_emulation_override_cmdline);
+#endif
+
+/*
+ * Invoke a 32-bit syscall. Called with IRQs on in CT_STATE_KERNEL.
+ */
+static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr)
+{
+ /*
+ * Convert negative numbers to very high and thus out of range
+ * numbers for comparisons.
+ */
+ unsigned int unr = nr;
+
+ if (likely(unr < IA32_NR_syscalls)) {
+ unr = array_index_nospec(unr, IA32_NR_syscalls);
+ regs->ax = ia32_sys_call(regs, unr);
+ } else if (nr != -1) {
+ regs->ax = __ia32_sys_ni_syscall(regs);
+ }
+}
+
+#ifdef CONFIG_IA32_EMULATION
+static __always_inline bool int80_is_external(void)
+{
+ const unsigned int offs = (0x80 / 32) * 0x10;
+ const u32 bit = BIT(0x80 % 32);
+
+ /* The local APIC on XENPV guests is fake */
+ if (cpu_feature_enabled(X86_FEATURE_XENPV))
+ return false;
+
+ /*
+ * If vector 0x80 is set in the APIC ISR then this is an external
+ * interrupt. Either from broken hardware or injected by a VMM.
+ *
+ * Note: In guest mode this is only valid for secure guests where
+ * the secure module fully controls the vAPIC exposed to the guest.
+ */
+ return apic_read(APIC_ISR + offs) & bit;
+}
+
+/**
+ * do_int80_emulation - 32-bit legacy syscall C entry from asm
+ * @regs: syscall arguments in struct pt_regs on the stack.
+ *
+ * This entry point can be used by 32-bit and 64-bit programs to perform
+ * 32-bit system calls. Instances of INT $0x80 can be found inline in
+ * various programs and libraries. It is also used by the vDSO's
+ * __kernel_vsyscall fallback for hardware that doesn't support a faster
+ * entry method. Restarted 32-bit system calls also fall back to INT
+ * $0x80 regardless of what instruction was originally used to do the
+ * system call.
+ *
+ * This is considered a slow path. It is not used by most libc
+ * implementations on modern hardware except during process startup.
+ *
+ * The arguments for the INT $0x80 based syscall are on stack in the
+ * pt_regs structure:
+ * eax: system call number
+ * ebx, ecx, edx, esi, edi, ebp: arg1 - arg 6
+ */
+__visible noinstr void do_int80_emulation(struct pt_regs *regs)
+{
+ int nr;
+
+ /* Kernel does not use INT $0x80! */
+ if (unlikely(!user_mode(regs))) {
+ irqentry_enter(regs);
+ instrumentation_begin();
+ panic("Unexpected external interrupt 0x80\n");
+ }
+
+ /*
+ * Establish kernel context for instrumentation, including for
+ * int80_is_external() below which calls into the APIC driver.
+ * Identical for soft and external interrupts.
+ */
+ enter_from_user_mode(regs);
+
+ instrumentation_begin();
+ add_random_kstack_offset();
+
+ /* Validate that this is a soft interrupt to the extent possible */
+ if (unlikely(int80_is_external()))
+ panic("Unexpected external interrupt 0x80\n");
+
+ /*
+ * The low level idtentry code pushed -1 into regs::orig_ax
+ * and regs::ax contains the syscall number.
+ *
+ * User tracing code (ptrace or signal handlers) might assume
+ * that the regs::orig_ax contains a 32-bit number on invoking
+ * a 32-bit syscall.
+ *
+ * Establish the syscall convention by saving the 32bit truncated
+ * syscall number in regs::orig_ax and by invalidating regs::ax.
+ */
+ regs->orig_ax = regs->ax & GENMASK(31, 0);
+ regs->ax = -ENOSYS;
+
+ nr = syscall_32_enter(regs);
+
+ local_irq_enable();
+ nr = syscall_enter_from_user_mode_work(regs, nr);
+ do_syscall_32_irqs_on(regs, nr);
+
+ instrumentation_end();
+ syscall_exit_to_user_mode(regs);
+}
+
+#ifdef CONFIG_X86_FRED
+/*
+ * A FRED-specific INT80 handler is warranted for the following reasons:
+ *
+ * 1) As INT instructions and hardware interrupts are separate event
+ * types, FRED does not preclude the use of vector 0x80 for external
+ * interrupts. As a result, the FRED setup code does not reserve
+ * vector 0x80 and calling int80_is_external() is not merely
+ * suboptimal but actively incorrect: it could cause a system call
+ * to be incorrectly ignored.
+ *
+ * 2) It is called only for handling vector 0x80 of event type
+ * EVENT_TYPE_SWINT and will never be called to handle any external
+ * interrupt (event type EVENT_TYPE_EXTINT).
+ *
+ * 3) FRED has separate entry flows depending on whether the event came from
+ * user space or kernel space, and because the kernel does not use
+ * INT insns, the FRED kernel entry handler fred_entry_from_kernel()
+ * falls through to fred_bad_type() if the event type is
+ * EVENT_TYPE_SWINT, i.e., INT insns. So if the kernel is handling
+ * an INT insn, it can only have come from user level.
+ *
+ * 4) int80_emulation() does a CLEAR_BRANCH_HISTORY. FRED will likely
+ * take a different approach if this is ever needed: the scrubbing
+ * probably belongs in either fred_intx()/fred_other() or
+ * asm_fred_entrypoint_user(), depending on whether it ought to be done
+ * for all entries from userspace or only for system calls.
+ *
+ * 5) INT $0x80 is the fast path for 32-bit system calls under FRED.
+ */
+DEFINE_FREDENTRY_RAW(int80_emulation)
+{
+ int nr;
+
+ enter_from_user_mode(regs);
+
+ instrumentation_begin();
+ add_random_kstack_offset();
+
+ /*
+ * FRED pushed 0 into regs::orig_ax and regs::ax contains the
+ * syscall number.
+ *
+ * User tracing code (ptrace or signal handlers) might assume
+ * that the regs::orig_ax contains a 32-bit number on invoking
+ * a 32-bit syscall.
+ *
+ * Establish the syscall convention by saving the 32bit truncated
+ * syscall number in regs::orig_ax and by invalidating regs::ax.
+ */
+ regs->orig_ax = regs->ax & GENMASK(31, 0);
+ regs->ax = -ENOSYS;
+
+ nr = syscall_32_enter(regs);
+
+ local_irq_enable();
+ nr = syscall_enter_from_user_mode_work(regs, nr);
+ do_syscall_32_irqs_on(regs, nr);
+
+ instrumentation_end();
+ syscall_exit_to_user_mode(regs);
+}
+#endif /* CONFIG_X86_FRED */
+
+#else /* CONFIG_IA32_EMULATION */
+
+/* Handles int $0x80 on a 32bit kernel */
+__visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
+{
+ int nr = syscall_32_enter(regs);
+
+ add_random_kstack_offset();
+ /*
+ * Subtlety here: if ptrace pokes something larger than 2^31-1 into
+ * orig_ax, the int return value truncates it. This matches
+ * the semantics of syscall_get_nr().
+ */
+ nr = syscall_enter_from_user_mode(regs, nr);
+ instrumentation_begin();
+
+ do_syscall_32_irqs_on(regs, nr);
+
+ instrumentation_end();
+ syscall_exit_to_user_mode(regs);
+}
+#endif /* !CONFIG_IA32_EMULATION */
+
+static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
+{
+ int nr = syscall_32_enter(regs);
+ int res;
+
+ add_random_kstack_offset();
+ /*
+ * This cannot use syscall_enter_from_user_mode() as it has to
+ * fetch EBP before invoking any of the syscall entry work
+ * functions.
+ */
+ enter_from_user_mode(regs);
+
+ instrumentation_begin();
+ local_irq_enable();
+ /* Fetch EBP from where the vDSO stashed it. */
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ /*
+ * Micro-optimization: the pointer we're following is
+ * explicitly 32 bits, so it can't be out of range.
+ */
+ res = __get_user(*(u32 *)&regs->bp,
+ (u32 __user __force *)(unsigned long)(u32)regs->sp);
+ } else {
+ res = get_user(*(u32 *)&regs->bp,
+ (u32 __user __force *)(unsigned long)(u32)regs->sp);
+ }
+
+ if (res) {
+ /* User code screwed up. */
+ regs->ax = -EFAULT;
+
+ local_irq_disable();
+ instrumentation_end();
+ irqentry_exit_to_user_mode(regs);
+ return false;
+ }
+
+ nr = syscall_enter_from_user_mode_work(regs, nr);
+
+ /* Now this is just like a normal syscall. */
+ do_syscall_32_irqs_on(regs, nr);
+
+ instrumentation_end();
+ syscall_exit_to_user_mode(regs);
+ return true;
+}
+
+/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */
+__visible noinstr bool do_fast_syscall_32(struct pt_regs *regs)
+{
+ /*
+ * Called using the internal vDSO SYSENTER/SYSCALL32 calling
+ * convention. Adjust regs so it looks like we entered using int80.
+ */
+ unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
+ vdso_image_32.sym_int80_landing_pad;
+
+ /*
+ * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
+ * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
+ * Fix it up.
+ */
+ regs->ip = landing_pad;
+
+ /* Invoke the syscall. If it failed, keep it simple: use IRET. */
+ if (!__do_fast_syscall_32(regs))
+ return false;
+
+ /*
+ * Check that the register state is valid for using SYSRETL/SYSEXIT
+ * to exit to userspace. Otherwise use the slower but fully capable
+ * IRET exit path.
+ */
+
+ /* XEN PV guests always use the IRET path */
+ if (cpu_feature_enabled(X86_FEATURE_XENPV))
+ return false;
+
+ /* EIP must point to the VDSO landing pad */
+ if (unlikely(regs->ip != landing_pad))
+ return false;
+
+ /* CS and SS must match the values set in MSR_STAR */
+ if (unlikely(regs->cs != __USER32_CS || regs->ss != __USER_DS))
+ return false;
+
+ /* If the TF, RF, or VM flags are set, use IRET */
+ if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)))
+ return false;
+
+ /* Use SYSRETL/SYSEXIT to exit to userspace */
+ return true;
+}
+
+/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */
+__visible noinstr bool do_SYSENTER_32(struct pt_regs *regs)
+{
+ /* SYSENTER loses RSP, but the vDSO saved it in RBP. */
+ regs->sp = regs->bp;
+
+ /* SYSENTER clobbers EFLAGS.IF. Assume it was set in usermode. */
+ regs->flags |= X86_EFLAGS_IF;
+
+ return do_fast_syscall_32(regs);
+}
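The range check in do_syscall_32_irqs_on() above leans on unsigned wraparound: the signed syscall number is copied into an unsigned variable, so negative values (including the -1 "skip the syscall" marker) become huge and fail the same comparison that rejects too-large numbers. A tiny sketch of the trick, with an assumed table size:

#include <stdbool.h>
#include <stdio.h>

#define NR_SYSCALLS_SKETCH 470u		/* assumed table size, illustration only */

static bool nr_in_range(int nr)
{
	/*
	 * Negative numbers convert to very large unsigned values, so one
	 * comparison covers both "nr too big" and "nr < 0".
	 */
	unsigned int unr = nr;

	return unr < NR_SYSCALLS_SKETCH;
}

int main(void)
{
	printf("%d -> %d\n",   1, nr_in_range(1));	/* 1   -> 1 */
	printf("%d -> %d\n", 469, nr_in_range(469));	/* 469 -> 1 */
	printf("%d -> %d\n", 470, nr_in_range(470));	/* 470 -> 0 */
	printf("%d -> %d\n",  -1, nr_in_range(-1));	/* -1  -> 0 */
	return 0;
}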
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
new file mode 100644
index 000000000000..b6e68ea98b83
--- /dev/null
+++ b/arch/x86/entry/syscall_64.c
@@ -0,0 +1,141 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* 64-bit system call dispatch */
+
+#include <linux/linkage.h>
+#include <linux/sys.h>
+#include <linux/cache.h>
+#include <linux/syscalls.h>
+#include <linux/entry-common.h>
+#include <linux/nospec.h>
+#include <asm/syscall.h>
+
+#define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *);
+#define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __x64_##sym(const struct pt_regs *);
+#include <asm/syscalls_64.h>
+#ifdef CONFIG_X86_X32_ABI
+#include <asm/syscalls_x32.h>
+#endif
+#undef __SYSCALL
+
+#undef __SYSCALL_NORETURN
+#define __SYSCALL_NORETURN __SYSCALL
+
+/*
+ * The sys_call_table[] is no longer used for system calls, but
+ * kernel/trace/trace_syscalls.c still wants to know the system
+ * call address.
+ */
+#define __SYSCALL(nr, sym) __x64_##sym,
+const sys_call_ptr_t sys_call_table[] = {
+#include <asm/syscalls_64.h>
+};
+#undef __SYSCALL
+
+#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs);
+long x64_sys_call(const struct pt_regs *regs, unsigned int nr)
+{
+ switch (nr) {
+ #include <asm/syscalls_64.h>
+ default: return __x64_sys_ni_syscall(regs);
+ }
+}
+
+#ifdef CONFIG_X86_X32_ABI
+long x32_sys_call(const struct pt_regs *regs, unsigned int nr)
+{
+ switch (nr) {
+ #include <asm/syscalls_x32.h>
+ default: return __x64_sys_ni_syscall(regs);
+ }
+}
+#endif
+
+static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
+{
+ /*
+ * Convert negative numbers to very high and thus out of range
+ * numbers for comparisons.
+ */
+ unsigned int unr = nr;
+
+ if (likely(unr < NR_syscalls)) {
+ unr = array_index_nospec(unr, NR_syscalls);
+ regs->ax = x64_sys_call(regs, unr);
+ return true;
+ }
+ return false;
+}
+
+static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
+{
+ /*
+ * Adjust the starting offset of the table, and convert numbers
+ * < __X32_SYSCALL_BIT to very high and thus out of range
+ * numbers for comparisons.
+ */
+ unsigned int xnr = nr - __X32_SYSCALL_BIT;
+
+ if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) {
+ xnr = array_index_nospec(xnr, X32_NR_syscalls);
+ regs->ax = x32_sys_call(regs, xnr);
+ return true;
+ }
+ return false;
+}
+
+/* Returns true to return using SYSRET, or false to use IRET */
+__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
+{
+ add_random_kstack_offset();
+ nr = syscall_enter_from_user_mode(regs, nr);
+
+ instrumentation_begin();
+
+ if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) {
+ /* Invalid system call, but still a system call. */
+ regs->ax = __x64_sys_ni_syscall(regs);
+ }
+
+ instrumentation_end();
+ syscall_exit_to_user_mode(regs);
+
+ /*
+ * Check that the register state is valid for using SYSRET to exit
+ * to userspace. Otherwise use the slower but fully capable IRET
+ * exit path.
+ */
+
+ /* XEN PV guests always use the IRET path */
+ if (cpu_feature_enabled(X86_FEATURE_XENPV))
+ return false;
+
+ /* SYSRET requires RCX == RIP and R11 == EFLAGS */
+ if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags))
+ return false;
+
+ /* CS and SS must match the values set in MSR_STAR */
+ if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS))
+ return false;
+
+ /*
+ * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
+ * in kernel space. This essentially lets the user take over
+ * the kernel, since userspace controls RSP.
+ *
+ * TASK_SIZE_MAX covers all user-accessible addresses other than
+ * the deprecated vsyscall page.
+ */
+ if (unlikely(regs->ip >= TASK_SIZE_MAX))
+ return false;
+
+ /*
+ * SYSRET cannot restore RF. It can restore TF, but unlike IRET,
+ * restoring TF results in a trap from userspace immediately after
+ * SYSRET.
+ */
+ if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)))
+ return false;
+
+ /* Use SYSRET to exit to userspace */
+ return true;
+}
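do_syscall_x32() above applies the same wraparound idea with an offset: subtracting __X32_SYSCALL_BIT (0x40000000 in the x86 uapi headers) rebases x32 numbers to a zero-based table index, while native 64-bit numbers and -1 wrap far out of range. A sketch with an assumed x32 table size:

#include <stdbool.h>
#include <stdio.h>

#define X32_SYSCALL_BIT_SKETCH	0x40000000u
#define X32_NR_SKETCH		400u	/* assumed x32 table size, illustration only */

static bool nr_is_x32(int nr)
{
	/* Only numbers in [BIT, BIT + X32_NR_SKETCH) land inside the table. */
	unsigned int xnr = nr - X32_SYSCALL_BIT_SKETCH;

	return xnr < X32_NR_SKETCH;
}

int main(void)
{
	printf("%d\n", nr_is_x32(0x40000000 + 1));	/* 1: an x32 number          */
	printf("%d\n", nr_is_x32(1));			/* 0: a native 64-bit number */
	printf("%d\n", nr_is_x32(-1));			/* 0: "no syscall" marker    */
	return 0;
}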
diff --git a/arch/x86/entry/syscalls/Makefile b/arch/x86/entry/syscalls/Makefile
new file mode 100644
index 000000000000..eca5d6eff132
--- /dev/null
+++ b/arch/x86/entry/syscalls/Makefile
@@ -0,0 +1,78 @@
+# SPDX-License-Identifier: GPL-2.0
+out := arch/$(SRCARCH)/include/generated/asm
+uapi := arch/$(SRCARCH)/include/generated/uapi/asm
+
+# Create output directory if not already present
+$(shell mkdir -p $(out) $(uapi))
+
+syscall32 := $(src)/syscall_32.tbl
+syscall64 := $(src)/syscall_64.tbl
+
+syshdr := $(srctree)/scripts/syscallhdr.sh
+systbl := $(srctree)/scripts/syscalltbl.sh
+offset :=
+prefix :=
+
+quiet_cmd_syshdr = SYSHDR $@
+ cmd_syshdr = $(CONFIG_SHELL) $(syshdr) --abis $(abis) --emit-nr \
+ $(if $(offset),--offset $(offset)) \
+ $(if $(prefix),--prefix $(prefix)) \
+ $< $@
+quiet_cmd_systbl = SYSTBL $@
+ cmd_systbl = $(CONFIG_SHELL) $(systbl) --abis $(abis) $< $@
+
+quiet_cmd_hypercalls = HYPERCALLS $@
+ cmd_hypercalls = $(CONFIG_SHELL) '$<' $@ $(filter-out $<, $(real-prereqs))
+
+$(uapi)/unistd_32.h: abis := i386
+$(uapi)/unistd_32.h: $(syscall32) $(syshdr) FORCE
+ $(call if_changed,syshdr)
+
+$(out)/unistd_32_ia32.h: abis := i386
+$(out)/unistd_32_ia32.h: prefix := ia32_
+$(out)/unistd_32_ia32.h: $(syscall32) $(syshdr) FORCE
+ $(call if_changed,syshdr)
+
+$(uapi)/unistd_x32.h: abis := common,x32
+$(uapi)/unistd_x32.h: offset := __X32_SYSCALL_BIT
+$(uapi)/unistd_x32.h: $(syscall64) $(syshdr) FORCE
+ $(call if_changed,syshdr)
+
+$(uapi)/unistd_64.h: abis := common,64
+$(uapi)/unistd_64.h: $(syscall64) $(syshdr) FORCE
+ $(call if_changed,syshdr)
+
+$(out)/unistd_64_x32.h: abis := x32
+$(out)/unistd_64_x32.h: prefix := x32_
+$(out)/unistd_64_x32.h: $(syscall64) $(syshdr) FORCE
+ $(call if_changed,syshdr)
+
+$(out)/syscalls_32.h: abis := i386
+$(out)/syscalls_32.h: $(syscall32) $(systbl) FORCE
+ $(call if_changed,systbl)
+$(out)/syscalls_64.h: abis := common,64
+$(out)/syscalls_64.h: $(syscall64) $(systbl) FORCE
+ $(call if_changed,systbl)
+$(out)/syscalls_x32.h: abis := common,x32
+$(out)/syscalls_x32.h: $(syscall64) $(systbl) FORCE
+ $(call if_changed,systbl)
+
+$(out)/xen-hypercalls.h: $(srctree)/scripts/xen-hypercalls.sh FORCE
+ $(call if_changed,hypercalls)
+
+$(out)/xen-hypercalls.h: $(srctree)/include/xen/interface/xen*.h
+
+uapisyshdr-y += unistd_32.h unistd_64.h unistd_x32.h
+syshdr-y += syscalls_32.h
+syshdr-$(CONFIG_X86_64) += unistd_32_ia32.h unistd_64_x32.h
+syshdr-$(CONFIG_X86_64) += syscalls_64.h
+syshdr-$(CONFIG_X86_X32_ABI) += syscalls_x32.h
+syshdr-$(CONFIG_XEN) += xen-hypercalls.h
+
+uapisyshdr-y := $(addprefix $(uapi)/, $(uapisyshdr-y))
+syshdr-y := $(addprefix $(out)/, $(syshdr-y))
+targets += $(addprefix ../../../../, $(uapisyshdr-y) $(syshdr-y))
+
+PHONY += all
+all: $(uapisyshdr-y) $(syshdr-y)
+ @:
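Each generated syscalls_*.h consists of one __SYSCALL(nr, sym) invocation per table row, and the dispatch files above re-#define __SYSCALL before every inclusion: once for extern declarations, once for the legacy sys_call_table[], and once for the switch bodies of ia32_sys_call()/x64_sys_call(). A self-contained miniature of that X-macro scheme, with a macro standing in for the re-included generated header and toy handlers standing in for real syscalls:

#include <stdio.h>

/* Stand-in for a generated syscalls_*.h: one line per .tbl row. */
#define SYSCALL_LIST				\
	__SYSCALL(0, toy_restart_syscall)	\
	__SYSCALL(1, toy_exit)			\
	__SYSCALL(3, toy_read)

/* Expansion 1: definitions (toy handlers, not real syscalls). */
#define __SYSCALL(nr, sym) static long sym(void) { return nr; }
SYSCALL_LIST
#undef __SYSCALL

static long toy_ni_syscall(void) { return -38; /* -ENOSYS */ }

/* Expansion 2: the dispatch switch, as in ia32_sys_call()/x64_sys_call(). */
static long toy_sys_call(unsigned int nr)
{
#define __SYSCALL(nr, sym) case nr: return sym();
	switch (nr) {
	SYSCALL_LIST
	default: return toy_ni_syscall();
	}
#undef __SYSCALL
}

int main(void)
{
	printf("%ld %ld %ld\n", toy_sys_call(0), toy_sys_call(3), toy_sys_call(2));
	return 0;
}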
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
new file mode 100644
index 000000000000..e979a3eac7a3
--- /dev/null
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -0,0 +1,478 @@
+# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
+#
+# 32-bit system call numbers and entry vectors
+#
+# The format is:
+# <number> <abi> <name> <entry point> [<compat entry point> [noreturn]]
+#
+# The __ia32_sys and __ia32_compat_sys stubs are created on-the-fly for
+# sys_*() system calls and compat_sys_*() compat system calls if
+# IA32_EMULATION is defined, and expect struct pt_regs *regs as their only
+# parameter.
+#
+# The abi is always "i386" for this file.
+#
+0 i386 restart_syscall sys_restart_syscall
+1 i386 exit sys_exit - noreturn
+2 i386 fork sys_fork
+3 i386 read sys_read
+4 i386 write sys_write
+5 i386 open sys_open compat_sys_open
+6 i386 close sys_close
+7 i386 waitpid sys_waitpid
+8 i386 creat sys_creat
+9 i386 link sys_link
+10 i386 unlink sys_unlink
+11 i386 execve sys_execve compat_sys_execve
+12 i386 chdir sys_chdir
+13 i386 time sys_time32
+14 i386 mknod sys_mknod
+15 i386 chmod sys_chmod
+16 i386 lchown sys_lchown16
+17 i386 break
+18 i386 oldstat sys_stat
+19 i386 lseek sys_lseek compat_sys_lseek
+20 i386 getpid sys_getpid
+21 i386 mount sys_mount
+22 i386 umount sys_oldumount
+23 i386 setuid sys_setuid16
+24 i386 getuid sys_getuid16
+25 i386 stime sys_stime32
+26 i386 ptrace sys_ptrace compat_sys_ptrace
+27 i386 alarm sys_alarm
+28 i386 oldfstat sys_fstat
+29 i386 pause sys_pause
+30 i386 utime sys_utime32
+31 i386 stty
+32 i386 gtty
+33 i386 access sys_access
+34 i386 nice sys_nice
+35 i386 ftime
+36 i386 sync sys_sync
+37 i386 kill sys_kill
+38 i386 rename sys_rename
+39 i386 mkdir sys_mkdir
+40 i386 rmdir sys_rmdir
+41 i386 dup sys_dup
+42 i386 pipe sys_pipe
+43 i386 times sys_times compat_sys_times
+44 i386 prof
+45 i386 brk sys_brk
+46 i386 setgid sys_setgid16
+47 i386 getgid sys_getgid16
+48 i386 signal sys_signal
+49 i386 geteuid sys_geteuid16
+50 i386 getegid sys_getegid16
+51 i386 acct sys_acct
+52 i386 umount2 sys_umount
+53 i386 lock
+54 i386 ioctl sys_ioctl compat_sys_ioctl
+55 i386 fcntl sys_fcntl compat_sys_fcntl64
+56 i386 mpx
+57 i386 setpgid sys_setpgid
+58 i386 ulimit
+59 i386 oldolduname sys_olduname
+60 i386 umask sys_umask
+61 i386 chroot sys_chroot
+62 i386 ustat sys_ustat compat_sys_ustat
+63 i386 dup2 sys_dup2
+64 i386 getppid sys_getppid
+65 i386 getpgrp sys_getpgrp
+66 i386 setsid sys_setsid
+67 i386 sigaction sys_sigaction compat_sys_sigaction
+68 i386 sgetmask sys_sgetmask
+69 i386 ssetmask sys_ssetmask
+70 i386 setreuid sys_setreuid16
+71 i386 setregid sys_setregid16
+72 i386 sigsuspend sys_sigsuspend
+73 i386 sigpending sys_sigpending compat_sys_sigpending
+74 i386 sethostname sys_sethostname
+75 i386 setrlimit sys_setrlimit compat_sys_setrlimit
+76 i386 getrlimit sys_old_getrlimit compat_sys_old_getrlimit
+77 i386 getrusage sys_getrusage compat_sys_getrusage
+78 i386 gettimeofday sys_gettimeofday compat_sys_gettimeofday
+79 i386 settimeofday sys_settimeofday compat_sys_settimeofday
+80 i386 getgroups sys_getgroups16
+81 i386 setgroups sys_setgroups16
+82 i386 select sys_old_select compat_sys_old_select
+83 i386 symlink sys_symlink
+84 i386 oldlstat sys_lstat
+85 i386 readlink sys_readlink
+86 i386 uselib sys_uselib
+87 i386 swapon sys_swapon
+88 i386 reboot sys_reboot
+89 i386 readdir sys_old_readdir compat_sys_old_readdir
+90 i386 mmap sys_old_mmap compat_sys_ia32_mmap
+91 i386 munmap sys_munmap
+92 i386 truncate sys_truncate compat_sys_truncate
+93 i386 ftruncate sys_ftruncate compat_sys_ftruncate
+94 i386 fchmod sys_fchmod
+95 i386 fchown sys_fchown16
+96 i386 getpriority sys_getpriority
+97 i386 setpriority sys_setpriority
+98 i386 profil
+99 i386 statfs sys_statfs compat_sys_statfs
+100 i386 fstatfs sys_fstatfs compat_sys_fstatfs
+101 i386 ioperm sys_ioperm
+102 i386 socketcall sys_socketcall compat_sys_socketcall
+103 i386 syslog sys_syslog
+104 i386 setitimer sys_setitimer compat_sys_setitimer
+105 i386 getitimer sys_getitimer compat_sys_getitimer
+106 i386 stat sys_newstat compat_sys_newstat
+107 i386 lstat sys_newlstat compat_sys_newlstat
+108 i386 fstat sys_newfstat compat_sys_newfstat
+109 i386 olduname sys_uname
+110 i386 iopl sys_iopl
+111 i386 vhangup sys_vhangup
+112 i386 idle
+113 i386 vm86old sys_vm86old sys_ni_syscall
+114 i386 wait4 sys_wait4 compat_sys_wait4
+115 i386 swapoff sys_swapoff
+116 i386 sysinfo sys_sysinfo compat_sys_sysinfo
+117 i386 ipc sys_ipc compat_sys_ipc
+118 i386 fsync sys_fsync
+119 i386 sigreturn sys_sigreturn compat_sys_sigreturn
+120 i386 clone sys_clone compat_sys_ia32_clone
+121 i386 setdomainname sys_setdomainname
+122 i386 uname sys_newuname
+123 i386 modify_ldt sys_modify_ldt
+124 i386 adjtimex sys_adjtimex_time32
+125 i386 mprotect sys_mprotect
+126 i386 sigprocmask sys_sigprocmask compat_sys_sigprocmask
+127 i386 create_module
+128 i386 init_module sys_init_module
+129 i386 delete_module sys_delete_module
+130 i386 get_kernel_syms
+131 i386 quotactl sys_quotactl
+132 i386 getpgid sys_getpgid
+133 i386 fchdir sys_fchdir
+134 i386 bdflush sys_ni_syscall
+135 i386 sysfs sys_sysfs
+136 i386 personality sys_personality
+137 i386 afs_syscall
+138 i386 setfsuid sys_setfsuid16
+139 i386 setfsgid sys_setfsgid16
+140 i386 _llseek sys_llseek
+141 i386 getdents sys_getdents compat_sys_getdents
+142 i386 _newselect sys_select compat_sys_select
+143 i386 flock sys_flock
+144 i386 msync sys_msync
+145 i386 readv sys_readv
+146 i386 writev sys_writev
+147 i386 getsid sys_getsid
+148 i386 fdatasync sys_fdatasync
+149 i386 _sysctl sys_ni_syscall
+150 i386 mlock sys_mlock
+151 i386 munlock sys_munlock
+152 i386 mlockall sys_mlockall
+153 i386 munlockall sys_munlockall
+154 i386 sched_setparam sys_sched_setparam
+155 i386 sched_getparam sys_sched_getparam
+156 i386 sched_setscheduler sys_sched_setscheduler
+157 i386 sched_getscheduler sys_sched_getscheduler
+158 i386 sched_yield sys_sched_yield
+159 i386 sched_get_priority_max sys_sched_get_priority_max
+160 i386 sched_get_priority_min sys_sched_get_priority_min
+161 i386 sched_rr_get_interval sys_sched_rr_get_interval_time32
+162 i386 nanosleep sys_nanosleep_time32
+163 i386 mremap sys_mremap
+164 i386 setresuid sys_setresuid16
+165 i386 getresuid sys_getresuid16
+166 i386 vm86 sys_vm86 sys_ni_syscall
+167 i386 query_module
+168 i386 poll sys_poll
+169 i386 nfsservctl
+170 i386 setresgid sys_setresgid16
+171 i386 getresgid sys_getresgid16
+172 i386 prctl sys_prctl
+173 i386 rt_sigreturn sys_rt_sigreturn compat_sys_rt_sigreturn
+174 i386 rt_sigaction sys_rt_sigaction compat_sys_rt_sigaction
+175 i386 rt_sigprocmask sys_rt_sigprocmask compat_sys_rt_sigprocmask
+176 i386 rt_sigpending sys_rt_sigpending compat_sys_rt_sigpending
+177 i386 rt_sigtimedwait sys_rt_sigtimedwait_time32 compat_sys_rt_sigtimedwait_time32
+178 i386 rt_sigqueueinfo sys_rt_sigqueueinfo compat_sys_rt_sigqueueinfo
+179 i386 rt_sigsuspend sys_rt_sigsuspend compat_sys_rt_sigsuspend
+180 i386 pread64 sys_ia32_pread64
+181 i386 pwrite64 sys_ia32_pwrite64
+182 i386 chown sys_chown16
+183 i386 getcwd sys_getcwd
+184 i386 capget sys_capget
+185 i386 capset sys_capset
+186 i386 sigaltstack sys_sigaltstack compat_sys_sigaltstack
+187 i386 sendfile sys_sendfile compat_sys_sendfile
+188 i386 getpmsg
+189 i386 putpmsg
+190 i386 vfork sys_vfork
+191 i386 ugetrlimit sys_getrlimit compat_sys_getrlimit
+192 i386 mmap2 sys_mmap_pgoff
+193 i386 truncate64 sys_ia32_truncate64
+194 i386 ftruncate64 sys_ia32_ftruncate64
+195 i386 stat64 sys_stat64 compat_sys_ia32_stat64
+196 i386 lstat64 sys_lstat64 compat_sys_ia32_lstat64
+197 i386 fstat64 sys_fstat64 compat_sys_ia32_fstat64
+198 i386 lchown32 sys_lchown
+199 i386 getuid32 sys_getuid
+200 i386 getgid32 sys_getgid
+201 i386 geteuid32 sys_geteuid
+202 i386 getegid32 sys_getegid
+203 i386 setreuid32 sys_setreuid
+204 i386 setregid32 sys_setregid
+205 i386 getgroups32 sys_getgroups
+206 i386 setgroups32 sys_setgroups
+207 i386 fchown32 sys_fchown
+208 i386 setresuid32 sys_setresuid
+209 i386 getresuid32 sys_getresuid
+210 i386 setresgid32 sys_setresgid
+211 i386 getresgid32 sys_getresgid
+212 i386 chown32 sys_chown
+213 i386 setuid32 sys_setuid
+214 i386 setgid32 sys_setgid
+215 i386 setfsuid32 sys_setfsuid
+216 i386 setfsgid32 sys_setfsgid
+217 i386 pivot_root sys_pivot_root
+218 i386 mincore sys_mincore
+219 i386 madvise sys_madvise
+220 i386 getdents64 sys_getdents64
+221 i386 fcntl64 sys_fcntl64 compat_sys_fcntl64
+# 222 is unused
+# 223 is unused
+224 i386 gettid sys_gettid
+225 i386 readahead sys_ia32_readahead
+226 i386 setxattr sys_setxattr
+227 i386 lsetxattr sys_lsetxattr
+228 i386 fsetxattr sys_fsetxattr
+229 i386 getxattr sys_getxattr
+230 i386 lgetxattr sys_lgetxattr
+231 i386 fgetxattr sys_fgetxattr
+232 i386 listxattr sys_listxattr
+233 i386 llistxattr sys_llistxattr
+234 i386 flistxattr sys_flistxattr
+235 i386 removexattr sys_removexattr
+236 i386 lremovexattr sys_lremovexattr
+237 i386 fremovexattr sys_fremovexattr
+238 i386 tkill sys_tkill
+239 i386 sendfile64 sys_sendfile64
+240 i386 futex sys_futex_time32
+241 i386 sched_setaffinity sys_sched_setaffinity compat_sys_sched_setaffinity
+242 i386 sched_getaffinity sys_sched_getaffinity compat_sys_sched_getaffinity
+243 i386 set_thread_area sys_set_thread_area
+244 i386 get_thread_area sys_get_thread_area
+245 i386 io_setup sys_io_setup compat_sys_io_setup
+246 i386 io_destroy sys_io_destroy
+247 i386 io_getevents sys_io_getevents_time32
+248 i386 io_submit sys_io_submit compat_sys_io_submit
+249 i386 io_cancel sys_io_cancel
+250 i386 fadvise64 sys_ia32_fadvise64
+# 251 is available for reuse (was briefly sys_set_zone_reclaim)
+252 i386 exit_group sys_exit_group - noreturn
+253 i386 lookup_dcookie
+254 i386 epoll_create sys_epoll_create
+255 i386 epoll_ctl sys_epoll_ctl
+256 i386 epoll_wait sys_epoll_wait
+257 i386 remap_file_pages sys_remap_file_pages
+258 i386 set_tid_address sys_set_tid_address
+259 i386 timer_create sys_timer_create compat_sys_timer_create
+260 i386 timer_settime sys_timer_settime32
+261 i386 timer_gettime sys_timer_gettime32
+262 i386 timer_getoverrun sys_timer_getoverrun
+263 i386 timer_delete sys_timer_delete
+264 i386 clock_settime sys_clock_settime32
+265 i386 clock_gettime sys_clock_gettime32
+266 i386 clock_getres sys_clock_getres_time32
+267 i386 clock_nanosleep sys_clock_nanosleep_time32
+268 i386 statfs64 sys_statfs64 compat_sys_statfs64
+269 i386 fstatfs64 sys_fstatfs64 compat_sys_fstatfs64
+270 i386 tgkill sys_tgkill
+271 i386 utimes sys_utimes_time32
+272 i386 fadvise64_64 sys_ia32_fadvise64_64
+273 i386 vserver
+274 i386 mbind sys_mbind
+275 i386 get_mempolicy sys_get_mempolicy
+276 i386 set_mempolicy sys_set_mempolicy
+277 i386 mq_open sys_mq_open compat_sys_mq_open
+278 i386 mq_unlink sys_mq_unlink
+279 i386 mq_timedsend sys_mq_timedsend_time32
+280 i386 mq_timedreceive sys_mq_timedreceive_time32
+281 i386 mq_notify sys_mq_notify compat_sys_mq_notify
+282 i386 mq_getsetattr sys_mq_getsetattr compat_sys_mq_getsetattr
+283 i386 kexec_load sys_kexec_load compat_sys_kexec_load
+284 i386 waitid sys_waitid compat_sys_waitid
+# 285 sys_setaltroot
+286 i386 add_key sys_add_key
+287 i386 request_key sys_request_key
+288 i386 keyctl sys_keyctl compat_sys_keyctl
+289 i386 ioprio_set sys_ioprio_set
+290 i386 ioprio_get sys_ioprio_get
+291 i386 inotify_init sys_inotify_init
+292 i386 inotify_add_watch sys_inotify_add_watch
+293 i386 inotify_rm_watch sys_inotify_rm_watch
+294 i386 migrate_pages sys_migrate_pages
+295 i386 openat sys_openat compat_sys_openat
+296 i386 mkdirat sys_mkdirat
+297 i386 mknodat sys_mknodat
+298 i386 fchownat sys_fchownat
+299 i386 futimesat sys_futimesat_time32
+300 i386 fstatat64 sys_fstatat64 compat_sys_ia32_fstatat64
+301 i386 unlinkat sys_unlinkat
+302 i386 renameat sys_renameat
+303 i386 linkat sys_linkat
+304 i386 symlinkat sys_symlinkat
+305 i386 readlinkat sys_readlinkat
+306 i386 fchmodat sys_fchmodat
+307 i386 faccessat sys_faccessat
+308 i386 pselect6 sys_pselect6_time32 compat_sys_pselect6_time32
+309 i386 ppoll sys_ppoll_time32 compat_sys_ppoll_time32
+310 i386 unshare sys_unshare
+311 i386 set_robust_list sys_set_robust_list compat_sys_set_robust_list
+312 i386 get_robust_list sys_get_robust_list compat_sys_get_robust_list
+313 i386 splice sys_splice
+314 i386 sync_file_range sys_ia32_sync_file_range
+315 i386 tee sys_tee
+316 i386 vmsplice sys_vmsplice
+317 i386 move_pages sys_move_pages
+318 i386 getcpu sys_getcpu
+319 i386 epoll_pwait sys_epoll_pwait
+320 i386 utimensat sys_utimensat_time32
+321 i386 signalfd sys_signalfd compat_sys_signalfd
+322 i386 timerfd_create sys_timerfd_create
+323 i386 eventfd sys_eventfd
+324 i386 fallocate sys_ia32_fallocate
+325 i386 timerfd_settime sys_timerfd_settime32
+326 i386 timerfd_gettime sys_timerfd_gettime32
+327 i386 signalfd4 sys_signalfd4 compat_sys_signalfd4
+328 i386 eventfd2 sys_eventfd2
+329 i386 epoll_create1 sys_epoll_create1
+330 i386 dup3 sys_dup3
+331 i386 pipe2 sys_pipe2
+332 i386 inotify_init1 sys_inotify_init1
+333 i386 preadv sys_preadv compat_sys_preadv
+334 i386 pwritev sys_pwritev compat_sys_pwritev
+335 i386 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo
+336 i386 perf_event_open sys_perf_event_open
+337 i386 recvmmsg sys_recvmmsg_time32 compat_sys_recvmmsg_time32
+338 i386 fanotify_init sys_fanotify_init
+339 i386 fanotify_mark sys_fanotify_mark compat_sys_fanotify_mark
+340 i386 prlimit64 sys_prlimit64
+341 i386 name_to_handle_at sys_name_to_handle_at
+342 i386 open_by_handle_at sys_open_by_handle_at compat_sys_open_by_handle_at
+343 i386 clock_adjtime sys_clock_adjtime32
+344 i386 syncfs sys_syncfs
+345 i386 sendmmsg sys_sendmmsg compat_sys_sendmmsg
+346 i386 setns sys_setns
+347 i386 process_vm_readv sys_process_vm_readv
+348 i386 process_vm_writev sys_process_vm_writev
+349 i386 kcmp sys_kcmp
+350 i386 finit_module sys_finit_module
+351 i386 sched_setattr sys_sched_setattr
+352 i386 sched_getattr sys_sched_getattr
+353 i386 renameat2 sys_renameat2
+354 i386 seccomp sys_seccomp
+355 i386 getrandom sys_getrandom
+356 i386 memfd_create sys_memfd_create
+357 i386 bpf sys_bpf
+358 i386 execveat sys_execveat compat_sys_execveat
+359 i386 socket sys_socket
+360 i386 socketpair sys_socketpair
+361 i386 bind sys_bind
+362 i386 connect sys_connect
+363 i386 listen sys_listen
+364 i386 accept4 sys_accept4
+365 i386 getsockopt sys_getsockopt sys_getsockopt
+366 i386 setsockopt sys_setsockopt sys_setsockopt
+367 i386 getsockname sys_getsockname
+368 i386 getpeername sys_getpeername
+369 i386 sendto sys_sendto
+370 i386 sendmsg sys_sendmsg compat_sys_sendmsg
+371 i386 recvfrom sys_recvfrom compat_sys_recvfrom
+372 i386 recvmsg sys_recvmsg compat_sys_recvmsg
+373 i386 shutdown sys_shutdown
+374 i386 userfaultfd sys_userfaultfd
+375 i386 membarrier sys_membarrier
+376 i386 mlock2 sys_mlock2
+377 i386 copy_file_range sys_copy_file_range
+378 i386 preadv2 sys_preadv2 compat_sys_preadv2
+379 i386 pwritev2 sys_pwritev2 compat_sys_pwritev2
+380 i386 pkey_mprotect sys_pkey_mprotect
+381 i386 pkey_alloc sys_pkey_alloc
+382 i386 pkey_free sys_pkey_free
+383 i386 statx sys_statx
+384 i386 arch_prctl sys_arch_prctl
+385 i386 io_pgetevents sys_io_pgetevents_time32 compat_sys_io_pgetevents
+386 i386 rseq sys_rseq
+393 i386 semget sys_semget
+394 i386 semctl sys_semctl compat_sys_semctl
+395 i386 shmget sys_shmget
+396 i386 shmctl sys_shmctl compat_sys_shmctl
+397 i386 shmat sys_shmat compat_sys_shmat
+398 i386 shmdt sys_shmdt
+399 i386 msgget sys_msgget
+400 i386 msgsnd sys_msgsnd compat_sys_msgsnd
+401 i386 msgrcv sys_msgrcv compat_sys_msgrcv
+402 i386 msgctl sys_msgctl compat_sys_msgctl
+403 i386 clock_gettime64 sys_clock_gettime
+404 i386 clock_settime64 sys_clock_settime
+405 i386 clock_adjtime64 sys_clock_adjtime
+406 i386 clock_getres_time64 sys_clock_getres
+407 i386 clock_nanosleep_time64 sys_clock_nanosleep
+408 i386 timer_gettime64 sys_timer_gettime
+409 i386 timer_settime64 sys_timer_settime
+410 i386 timerfd_gettime64 sys_timerfd_gettime
+411 i386 timerfd_settime64 sys_timerfd_settime
+412 i386 utimensat_time64 sys_utimensat
+413 i386 pselect6_time64 sys_pselect6 compat_sys_pselect6_time64
+414 i386 ppoll_time64 sys_ppoll compat_sys_ppoll_time64
+416 i386 io_pgetevents_time64 sys_io_pgetevents compat_sys_io_pgetevents_time64
+417 i386 recvmmsg_time64 sys_recvmmsg compat_sys_recvmmsg_time64
+418 i386 mq_timedsend_time64 sys_mq_timedsend
+419 i386 mq_timedreceive_time64 sys_mq_timedreceive
+420 i386 semtimedop_time64 sys_semtimedop
+421 i386 rt_sigtimedwait_time64 sys_rt_sigtimedwait compat_sys_rt_sigtimedwait_time64
+422 i386 futex_time64 sys_futex
+423 i386 sched_rr_get_interval_time64 sys_sched_rr_get_interval
+424 i386 pidfd_send_signal sys_pidfd_send_signal
+425 i386 io_uring_setup sys_io_uring_setup
+426 i386 io_uring_enter sys_io_uring_enter
+427 i386 io_uring_register sys_io_uring_register
+428 i386 open_tree sys_open_tree
+429 i386 move_mount sys_move_mount
+430 i386 fsopen sys_fsopen
+431 i386 fsconfig sys_fsconfig
+432 i386 fsmount sys_fsmount
+433 i386 fspick sys_fspick
+434 i386 pidfd_open sys_pidfd_open
+435 i386 clone3 sys_clone3
+436 i386 close_range sys_close_range
+437 i386 openat2 sys_openat2
+438 i386 pidfd_getfd sys_pidfd_getfd
+439 i386 faccessat2 sys_faccessat2
+440 i386 process_madvise sys_process_madvise
+441 i386 epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2
+442 i386 mount_setattr sys_mount_setattr
+443 i386 quotactl_fd sys_quotactl_fd
+444 i386 landlock_create_ruleset sys_landlock_create_ruleset
+445 i386 landlock_add_rule sys_landlock_add_rule
+446 i386 landlock_restrict_self sys_landlock_restrict_self
+447 i386 memfd_secret sys_memfd_secret
+448 i386 process_mrelease sys_process_mrelease
+449 i386 futex_waitv sys_futex_waitv
+450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node
+451 i386 cachestat sys_cachestat
+452 i386 fchmodat2 sys_fchmodat2
+453 i386 map_shadow_stack sys_map_shadow_stack
+454 i386 futex_wake sys_futex_wake
+455 i386 futex_wait sys_futex_wait
+456 i386 futex_requeue sys_futex_requeue
+457 i386 statmount sys_statmount
+458 i386 listmount sys_listmount
+459 i386 lsm_get_self_attr sys_lsm_get_self_attr
+460 i386 lsm_set_self_attr sys_lsm_set_self_attr
+461 i386 lsm_list_modules sys_lsm_list_modules
+462 i386 mseal sys_mseal
+463 i386 setxattrat sys_setxattrat
+464 i386 getxattrat sys_getxattrat
+465 i386 listxattrat sys_listxattrat
+466 i386 removexattrat sys_removexattrat
+467 i386 open_tree_attr sys_open_tree_attr
+468 i386 file_getattr sys_file_getattr
+469 i386 file_setattr sys_file_setattr
+470 i386 listns sys_listns
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
new file mode 100644
index 000000000000..8a4ac4841be6
--- /dev/null
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -0,0 +1,442 @@
+# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
+#
+# 64-bit system call numbers and entry vectors
+#
+# The format is:
+# <number> <abi> <name> <entry point> [<compat entry point> [noreturn]]
+#
+# The __x64_sys_*() stubs are created on-the-fly for sys_*() system calls
+#
+# The abi is "common", "64" or "x32" for this file.
+#
+0 common read sys_read
+1 common write sys_write
+2 common open sys_open
+3 common close sys_close
+4 common stat sys_newstat
+5 common fstat sys_newfstat
+6 common lstat sys_newlstat
+7 common poll sys_poll
+8 common lseek sys_lseek
+9 common mmap sys_mmap
+10 common mprotect sys_mprotect
+11 common munmap sys_munmap
+12 common brk sys_brk
+13 64 rt_sigaction sys_rt_sigaction
+14 common rt_sigprocmask sys_rt_sigprocmask
+15 64 rt_sigreturn sys_rt_sigreturn
+16 64 ioctl sys_ioctl
+17 common pread64 sys_pread64
+18 common pwrite64 sys_pwrite64
+19 64 readv sys_readv
+20 64 writev sys_writev
+21 common access sys_access
+22 common pipe sys_pipe
+23 common select sys_select
+24 common sched_yield sys_sched_yield
+25 common mremap sys_mremap
+26 common msync sys_msync
+27 common mincore sys_mincore
+28 common madvise sys_madvise
+29 common shmget sys_shmget
+30 common shmat sys_shmat
+31 common shmctl sys_shmctl
+32 common dup sys_dup
+33 common dup2 sys_dup2
+34 common pause sys_pause
+35 common nanosleep sys_nanosleep
+36 common getitimer sys_getitimer
+37 common alarm sys_alarm
+38 common setitimer sys_setitimer
+39 common getpid sys_getpid
+40 common sendfile sys_sendfile64
+41 common socket sys_socket
+42 common connect sys_connect
+43 common accept sys_accept
+44 common sendto sys_sendto
+45 64 recvfrom sys_recvfrom
+46 64 sendmsg sys_sendmsg
+47 64 recvmsg sys_recvmsg
+48 common shutdown sys_shutdown
+49 common bind sys_bind
+50 common listen sys_listen
+51 common getsockname sys_getsockname
+52 common getpeername sys_getpeername
+53 common socketpair sys_socketpair
+54 64 setsockopt sys_setsockopt
+55 64 getsockopt sys_getsockopt
+56 common clone sys_clone
+57 common fork sys_fork
+58 common vfork sys_vfork
+59 64 execve sys_execve
+60 common exit sys_exit - noreturn
+61 common wait4 sys_wait4
+62 common kill sys_kill
+63 common uname sys_newuname
+64 common semget sys_semget
+65 common semop sys_semop
+66 common semctl sys_semctl
+67 common shmdt sys_shmdt
+68 common msgget sys_msgget
+69 common msgsnd sys_msgsnd
+70 common msgrcv sys_msgrcv
+71 common msgctl sys_msgctl
+72 common fcntl sys_fcntl
+73 common flock sys_flock
+74 common fsync sys_fsync
+75 common fdatasync sys_fdatasync
+76 common truncate sys_truncate
+77 common ftruncate sys_ftruncate
+78 common getdents sys_getdents
+79 common getcwd sys_getcwd
+80 common chdir sys_chdir
+81 common fchdir sys_fchdir
+82 common rename sys_rename
+83 common mkdir sys_mkdir
+84 common rmdir sys_rmdir
+85 common creat sys_creat
+86 common link sys_link
+87 common unlink sys_unlink
+88 common symlink sys_symlink
+89 common readlink sys_readlink
+90 common chmod sys_chmod
+91 common fchmod sys_fchmod
+92 common chown sys_chown
+93 common fchown sys_fchown
+94 common lchown sys_lchown
+95 common umask sys_umask
+96 common gettimeofday sys_gettimeofday
+97 common getrlimit sys_getrlimit
+98 common getrusage sys_getrusage
+99 common sysinfo sys_sysinfo
+100 common times sys_times
+101 64 ptrace sys_ptrace
+102 common getuid sys_getuid
+103 common syslog sys_syslog
+104 common getgid sys_getgid
+105 common setuid sys_setuid
+106 common setgid sys_setgid
+107 common geteuid sys_geteuid
+108 common getegid sys_getegid
+109 common setpgid sys_setpgid
+110 common getppid sys_getppid
+111 common getpgrp sys_getpgrp
+112 common setsid sys_setsid
+113 common setreuid sys_setreuid
+114 common setregid sys_setregid
+115 common getgroups sys_getgroups
+116 common setgroups sys_setgroups
+117 common setresuid sys_setresuid
+118 common getresuid sys_getresuid
+119 common setresgid sys_setresgid
+120 common getresgid sys_getresgid
+121 common getpgid sys_getpgid
+122 common setfsuid sys_setfsuid
+123 common setfsgid sys_setfsgid
+124 common getsid sys_getsid
+125 common capget sys_capget
+126 common capset sys_capset
+127 64 rt_sigpending sys_rt_sigpending
+128 64 rt_sigtimedwait sys_rt_sigtimedwait
+129 64 rt_sigqueueinfo sys_rt_sigqueueinfo
+130 common rt_sigsuspend sys_rt_sigsuspend
+131 64 sigaltstack sys_sigaltstack
+132 common utime sys_utime
+133 common mknod sys_mknod
+134 64 uselib
+135 common personality sys_personality
+136 common ustat sys_ustat
+137 common statfs sys_statfs
+138 common fstatfs sys_fstatfs
+139 common sysfs sys_sysfs
+140 common getpriority sys_getpriority
+141 common setpriority sys_setpriority
+142 common sched_setparam sys_sched_setparam
+143 common sched_getparam sys_sched_getparam
+144 common sched_setscheduler sys_sched_setscheduler
+145 common sched_getscheduler sys_sched_getscheduler
+146 common sched_get_priority_max sys_sched_get_priority_max
+147 common sched_get_priority_min sys_sched_get_priority_min
+148 common sched_rr_get_interval sys_sched_rr_get_interval
+149 common mlock sys_mlock
+150 common munlock sys_munlock
+151 common mlockall sys_mlockall
+152 common munlockall sys_munlockall
+153 common vhangup sys_vhangup
+154 common modify_ldt sys_modify_ldt
+155 common pivot_root sys_pivot_root
+156 64 _sysctl sys_ni_syscall
+157 common prctl sys_prctl
+158 common arch_prctl sys_arch_prctl
+159 common adjtimex sys_adjtimex
+160 common setrlimit sys_setrlimit
+161 common chroot sys_chroot
+162 common sync sys_sync
+163 common acct sys_acct
+164 common settimeofday sys_settimeofday
+165 common mount sys_mount
+166 common umount2 sys_umount
+167 common swapon sys_swapon
+168 common swapoff sys_swapoff
+169 common reboot sys_reboot
+170 common sethostname sys_sethostname
+171 common setdomainname sys_setdomainname
+172 common iopl sys_iopl
+173 common ioperm sys_ioperm
+174 64 create_module
+175 common init_module sys_init_module
+176 common delete_module sys_delete_module
+177 64 get_kernel_syms
+178 64 query_module
+179 common quotactl sys_quotactl
+180 64 nfsservctl
+181 common getpmsg
+182 common putpmsg
+183 common afs_syscall
+184 common tuxcall
+185 common security
+186 common gettid sys_gettid
+187 common readahead sys_readahead
+188 common setxattr sys_setxattr
+189 common lsetxattr sys_lsetxattr
+190 common fsetxattr sys_fsetxattr
+191 common getxattr sys_getxattr
+192 common lgetxattr sys_lgetxattr
+193 common fgetxattr sys_fgetxattr
+194 common listxattr sys_listxattr
+195 common llistxattr sys_llistxattr
+196 common flistxattr sys_flistxattr
+197 common removexattr sys_removexattr
+198 common lremovexattr sys_lremovexattr
+199 common fremovexattr sys_fremovexattr
+200 common tkill sys_tkill
+201 common time sys_time
+202 common futex sys_futex
+203 common sched_setaffinity sys_sched_setaffinity
+204 common sched_getaffinity sys_sched_getaffinity
+205 64 set_thread_area
+206 64 io_setup sys_io_setup
+207 common io_destroy sys_io_destroy
+208 common io_getevents sys_io_getevents
+209 64 io_submit sys_io_submit
+210 common io_cancel sys_io_cancel
+211 64 get_thread_area
+212 common lookup_dcookie
+213 common epoll_create sys_epoll_create
+214 64 epoll_ctl_old
+215 64 epoll_wait_old
+216 common remap_file_pages sys_remap_file_pages
+217 common getdents64 sys_getdents64
+218 common set_tid_address sys_set_tid_address
+219 common restart_syscall sys_restart_syscall
+220 common semtimedop sys_semtimedop
+221 common fadvise64 sys_fadvise64
+222 64 timer_create sys_timer_create
+223 common timer_settime sys_timer_settime
+224 common timer_gettime sys_timer_gettime
+225 common timer_getoverrun sys_timer_getoverrun
+226 common timer_delete sys_timer_delete
+227 common clock_settime sys_clock_settime
+228 common clock_gettime sys_clock_gettime
+229 common clock_getres sys_clock_getres
+230 common clock_nanosleep sys_clock_nanosleep
+231 common exit_group sys_exit_group - noreturn
+232 common epoll_wait sys_epoll_wait
+233 common epoll_ctl sys_epoll_ctl
+234 common tgkill sys_tgkill
+235 common utimes sys_utimes
+236 64 vserver
+237 common mbind sys_mbind
+238 common set_mempolicy sys_set_mempolicy
+239 common get_mempolicy sys_get_mempolicy
+240 common mq_open sys_mq_open
+241 common mq_unlink sys_mq_unlink
+242 common mq_timedsend sys_mq_timedsend
+243 common mq_timedreceive sys_mq_timedreceive
+244 64 mq_notify sys_mq_notify
+245 common mq_getsetattr sys_mq_getsetattr
+246 64 kexec_load sys_kexec_load
+247 64 waitid sys_waitid
+248 common add_key sys_add_key
+249 common request_key sys_request_key
+250 common keyctl sys_keyctl
+251 common ioprio_set sys_ioprio_set
+252 common ioprio_get sys_ioprio_get
+253 common inotify_init sys_inotify_init
+254 common inotify_add_watch sys_inotify_add_watch
+255 common inotify_rm_watch sys_inotify_rm_watch
+256 common migrate_pages sys_migrate_pages
+257 common openat sys_openat
+258 common mkdirat sys_mkdirat
+259 common mknodat sys_mknodat
+260 common fchownat sys_fchownat
+261 common futimesat sys_futimesat
+262 common newfstatat sys_newfstatat
+263 common unlinkat sys_unlinkat
+264 common renameat sys_renameat
+265 common linkat sys_linkat
+266 common symlinkat sys_symlinkat
+267 common readlinkat sys_readlinkat
+268 common fchmodat sys_fchmodat
+269 common faccessat sys_faccessat
+270 common pselect6 sys_pselect6
+271 common ppoll sys_ppoll
+272 common unshare sys_unshare
+273 64 set_robust_list sys_set_robust_list
+274 64 get_robust_list sys_get_robust_list
+275 common splice sys_splice
+276 common tee sys_tee
+277 common sync_file_range sys_sync_file_range
+278 64 vmsplice sys_vmsplice
+279 64 move_pages sys_move_pages
+280 common utimensat sys_utimensat
+281 common epoll_pwait sys_epoll_pwait
+282 common signalfd sys_signalfd
+283 common timerfd_create sys_timerfd_create
+284 common eventfd sys_eventfd
+285 common fallocate sys_fallocate
+286 common timerfd_settime sys_timerfd_settime
+287 common timerfd_gettime sys_timerfd_gettime
+288 common accept4 sys_accept4
+289 common signalfd4 sys_signalfd4
+290 common eventfd2 sys_eventfd2
+291 common epoll_create1 sys_epoll_create1
+292 common dup3 sys_dup3
+293 common pipe2 sys_pipe2
+294 common inotify_init1 sys_inotify_init1
+295 64 preadv sys_preadv
+296 64 pwritev sys_pwritev
+297 64 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo
+298 common perf_event_open sys_perf_event_open
+299 64 recvmmsg sys_recvmmsg
+300 common fanotify_init sys_fanotify_init
+301 common fanotify_mark sys_fanotify_mark
+302 common prlimit64 sys_prlimit64
+303 common name_to_handle_at sys_name_to_handle_at
+304 common open_by_handle_at sys_open_by_handle_at
+305 common clock_adjtime sys_clock_adjtime
+306 common syncfs sys_syncfs
+307 64 sendmmsg sys_sendmmsg
+308 common setns sys_setns
+309 common getcpu sys_getcpu
+310 64 process_vm_readv sys_process_vm_readv
+311 64 process_vm_writev sys_process_vm_writev
+312 common kcmp sys_kcmp
+313 common finit_module sys_finit_module
+314 common sched_setattr sys_sched_setattr
+315 common sched_getattr sys_sched_getattr
+316 common renameat2 sys_renameat2
+317 common seccomp sys_seccomp
+318 common getrandom sys_getrandom
+319 common memfd_create sys_memfd_create
+320 common kexec_file_load sys_kexec_file_load
+321 common bpf sys_bpf
+322 64 execveat sys_execveat
+323 common userfaultfd sys_userfaultfd
+324 common membarrier sys_membarrier
+325 common mlock2 sys_mlock2
+326 common copy_file_range sys_copy_file_range
+327 64 preadv2 sys_preadv2
+328 64 pwritev2 sys_pwritev2
+329 common pkey_mprotect sys_pkey_mprotect
+330 common pkey_alloc sys_pkey_alloc
+331 common pkey_free sys_pkey_free
+332 common statx sys_statx
+333 common io_pgetevents sys_io_pgetevents
+334 common rseq sys_rseq
+335 common uretprobe sys_uretprobe
+336 common uprobe sys_uprobe
+# Don't use numbers 387 through 423; add new calls after the last
+# 'common' entry.
+424 common pidfd_send_signal sys_pidfd_send_signal
+425 common io_uring_setup sys_io_uring_setup
+426 common io_uring_enter sys_io_uring_enter
+427 common io_uring_register sys_io_uring_register
+428 common open_tree sys_open_tree
+429 common move_mount sys_move_mount
+430 common fsopen sys_fsopen
+431 common fsconfig sys_fsconfig
+432 common fsmount sys_fsmount
+433 common fspick sys_fspick
+434 common pidfd_open sys_pidfd_open
+435 common clone3 sys_clone3
+436 common close_range sys_close_range
+437 common openat2 sys_openat2
+438 common pidfd_getfd sys_pidfd_getfd
+439 common faccessat2 sys_faccessat2
+440 common process_madvise sys_process_madvise
+441 common epoll_pwait2 sys_epoll_pwait2
+442 common mount_setattr sys_mount_setattr
+443 common quotactl_fd sys_quotactl_fd
+444 common landlock_create_ruleset sys_landlock_create_ruleset
+445 common landlock_add_rule sys_landlock_add_rule
+446 common landlock_restrict_self sys_landlock_restrict_self
+447 common memfd_secret sys_memfd_secret
+448 common process_mrelease sys_process_mrelease
+449 common futex_waitv sys_futex_waitv
+450 common set_mempolicy_home_node sys_set_mempolicy_home_node
+451 common cachestat sys_cachestat
+452 common fchmodat2 sys_fchmodat2
+453 common map_shadow_stack sys_map_shadow_stack
+454 common futex_wake sys_futex_wake
+455 common futex_wait sys_futex_wait
+456 common futex_requeue sys_futex_requeue
+457 common statmount sys_statmount
+458 common listmount sys_listmount
+459 common lsm_get_self_attr sys_lsm_get_self_attr
+460 common lsm_set_self_attr sys_lsm_set_self_attr
+461 common lsm_list_modules sys_lsm_list_modules
+462 common mseal sys_mseal
+463 common setxattrat sys_setxattrat
+464 common getxattrat sys_getxattrat
+465 common listxattrat sys_listxattrat
+466 common removexattrat sys_removexattrat
+467 common open_tree_attr sys_open_tree_attr
+468 common file_getattr sys_file_getattr
+469 common file_setattr sys_file_setattr
+470 common listns sys_listns
+
+#
+# Due to a historical design error, certain syscalls are numbered differently
+# in x32 as compared to native x86_64. These syscalls have numbers 512-547.
+# Do not add new syscalls to this range. Numbers 548 and above are available
+# for non-x32 use.
+#
+512 x32 rt_sigaction compat_sys_rt_sigaction
+513 x32 rt_sigreturn compat_sys_x32_rt_sigreturn
+514 x32 ioctl compat_sys_ioctl
+515 x32 readv sys_readv
+516 x32 writev sys_writev
+517 x32 recvfrom compat_sys_recvfrom
+518 x32 sendmsg compat_sys_sendmsg
+519 x32 recvmsg compat_sys_recvmsg
+520 x32 execve compat_sys_execve
+521 x32 ptrace compat_sys_ptrace
+522 x32 rt_sigpending compat_sys_rt_sigpending
+523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait_time64
+524 x32 rt_sigqueueinfo compat_sys_rt_sigqueueinfo
+525 x32 sigaltstack compat_sys_sigaltstack
+526 x32 timer_create compat_sys_timer_create
+527 x32 mq_notify compat_sys_mq_notify
+528 x32 kexec_load compat_sys_kexec_load
+529 x32 waitid compat_sys_waitid
+530 x32 set_robust_list compat_sys_set_robust_list
+531 x32 get_robust_list compat_sys_get_robust_list
+532 x32 vmsplice sys_vmsplice
+533 x32 move_pages sys_move_pages
+534 x32 preadv compat_sys_preadv64
+535 x32 pwritev compat_sys_pwritev64
+536 x32 rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo
+537 x32 recvmmsg compat_sys_recvmmsg_time64
+538 x32 sendmmsg compat_sys_sendmmsg
+539 x32 process_vm_readv sys_process_vm_readv
+540 x32 process_vm_writev sys_process_vm_writev
+541 x32 setsockopt sys_setsockopt
+542 x32 getsockopt sys_getsockopt
+543 x32 io_setup compat_sys_io_setup
+544 x32 io_submit compat_sys_io_submit
+545 x32 execveat compat_sys_execveat
+546 x32 preadv2 compat_sys_preadv64v2
+547 x32 pwritev2 compat_sys_pwritev64v2
+# This is the end of the legacy x32 range. Numbers 548 and above are
+# not special and are not to be used for x32-specific syscalls.
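
For illustration (not part of the table): the x32 entries above are reached by setting bit 30 of the syscall number, mirroring the kernel's __X32_SYSCALL_BIT. A minimal userspace sketch, assuming a kernel built with CONFIG_X86_X32_ABI; getpid (39 in the common range) works here because 'common' entries are shared with x32:

    /* Hedged sketch: invoking a syscall through the x32 dispatch path. */
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    #define X32_SYSCALL_BIT 0x40000000UL /* mirrors the kernel's __X32_SYSCALL_BIT */

    int main(void)
    {
        /* The kernel masks off the bit and dispatches via the x32 table;
         * without CONFIG_X86_X32_ABI this returns -1 with ENOSYS. */
        long pid = syscall(X32_SYSCALL_BIT | 39 /* getpid */);
        printf("pid via x32 dispatch: %ld\n", pid);
        return 0;
    }
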
diff --git a/arch/x86/entry/thunk.S b/arch/x86/entry/thunk.S
new file mode 100644
index 000000000000..119ebdc3d362
--- /dev/null
+++ b/arch/x86/entry/thunk.S
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Save registers before calling assembly functions. This avoids
+ * disturbance of register allocation in some inline assembly constructs.
+ * Copyright 2001,2002 by Andi Kleen, SuSE Labs.
+ */
+#include <linux/export.h>
+#include <linux/linkage.h>
+#include "calling.h"
+#include <asm/asm.h>
+
+THUNK preempt_schedule_thunk, preempt_schedule
+THUNK preempt_schedule_notrace_thunk, preempt_schedule_notrace
+EXPORT_SYMBOL(preempt_schedule_thunk)
+EXPORT_SYMBOL(preempt_schedule_notrace_thunk)
diff --git a/arch/x86/vdso/.gitignore b/arch/x86/entry/vdso/.gitignore
index 60274d5746e1..37a6129d597b 100644
--- a/arch/x86/vdso/.gitignore
+++ b/arch/x86/entry/vdso/.gitignore
@@ -1,6 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0-only
vdso.lds
-vdso-syms.lds
-vdso32-syms.lds
+vdsox32.lds
vdso32-syscall-syms.lds
vdso32-sysenter-syms.lds
vdso32-int80-syms.lds
+vdso-image-*.c
+vdso2c
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
new file mode 100644
index 000000000000..f247f5f5cb44
--- /dev/null
+++ b/arch/x86/entry/vdso/Makefile
@@ -0,0 +1,162 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Building vDSO images for x86.
+#
+
+# Include the generic Makefile to check the built vDSO:
+include $(srctree)/lib/vdso/Makefile.include
+
+# Files to link into the vDSO:
+vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vgetrandom.o vgetrandom-chacha.o
+vobjs32-y := vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o
+vobjs32-y += vdso32/vclock_gettime.o vdso32/vgetcpu.o
+vobjs-$(CONFIG_X86_SGX) += vsgx.o
+
+# Files to link into the kernel:
+obj-y += vma.o extable.o
+
+# vDSO images to build:
+obj-$(CONFIG_X86_64) += vdso-image-64.o
+obj-$(CONFIG_X86_X32_ABI) += vdso-image-x32.o
+obj-$(CONFIG_COMPAT_32) += vdso-image-32.o vdso32-setup.o
+
+vobjs := $(addprefix $(obj)/, $(vobjs-y))
+vobjs32 := $(addprefix $(obj)/, $(vobjs32-y))
+
+$(obj)/vdso.o: $(obj)/vdso.so
+
+targets += vdso.lds $(vobjs-y)
+targets += vdso32/vdso32.lds $(vobjs32-y)
+
+targets += $(foreach x, 64 x32 32, vdso-image-$(x).c vdso$(x).so vdso$(x).so.dbg)
+
+CPPFLAGS_vdso.lds += -P -C
+
+VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -soname linux-vdso.so.1 \
+ -z max-page-size=4096
+
+$(obj)/vdso64.so.dbg: $(obj)/vdso.lds $(vobjs) FORCE
+ $(call if_changed,vdso_and_check)
+
+HOST_EXTRACFLAGS += -I$(srctree)/tools/include -I$(srctree)/include/uapi -I$(srctree)/arch/$(SUBARCH)/include/uapi
+hostprogs += vdso2c
+
+quiet_cmd_vdso2c = VDSO2C $@
+ cmd_vdso2c = $(obj)/vdso2c $< $(<:%.dbg=%) $@
+
+$(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE
+ $(call if_changed,vdso2c)
+
+#
+# Don't omit frame pointers for ease of userspace debugging, but do
+# optimize sibling calls.
+#
+CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \
+ $(filter -g%,$(KBUILD_CFLAGS)) -fno-stack-protector \
+ -fno-omit-frame-pointer -foptimize-sibling-calls \
+ -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO
+
+ifdef CONFIG_MITIGATION_RETPOLINE
+ifneq ($(RETPOLINE_VDSO_CFLAGS),)
+ CFL += $(RETPOLINE_VDSO_CFLAGS)
+endif
+endif
+
+$(vobjs): KBUILD_CFLAGS := $(filter-out $(PADDING_CFLAGS) $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) $(RANDSTRUCT_CFLAGS) $(KSTACK_ERASE_CFLAGS) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL)
+$(vobjs): KBUILD_AFLAGS += -DBUILD_VDSO
+
+#
+# vDSO code runs in userspace and -pg doesn't help with profiling anyway.
+#
+CFLAGS_REMOVE_vclock_gettime.o = -pg
+CFLAGS_REMOVE_vdso32/vclock_gettime.o = -pg
+CFLAGS_REMOVE_vgetcpu.o = -pg
+CFLAGS_REMOVE_vdso32/vgetcpu.o = -pg
+CFLAGS_REMOVE_vsgx.o = -pg
+CFLAGS_REMOVE_vgetrandom.o = -pg
+
+#
+# X32 processes use the x32 vDSO to access 64-bit kernel data.
+#
+# Build the x32 vDSO image:
+# 1. Compile the x32 vDSO as 64-bit.
+# 2. Convert the object files to x32.
+# 3. Build the x32 vDSO image from the x32 objects; it contains 64-bit
+#    code so that it can reach the 64-bit address space with 64-bit pointers.
+#
+
+CPPFLAGS_vdsox32.lds = $(CPPFLAGS_vdso.lds)
+VDSO_LDFLAGS_vdsox32.lds = -m elf32_x86_64 -soname linux-vdso.so.1 \
+ -z max-page-size=4096
+
+# x32-rebranded versions
+vobjx32s-y := $(vobjs-y:.o=-x32.o)
+
+# same thing, but in the output directory
+vobjx32s := $(addprefix $(obj)/, $(vobjx32s-y))
+
+# Convert 64bit object file to x32 for x32 vDSO.
+quiet_cmd_x32 = X32 $@
+ cmd_x32 = $(OBJCOPY) -O elf32-x86-64 $< $@
+
+$(obj)/%-x32.o: $(obj)/%.o FORCE
+ $(call if_changed,x32)
+
+targets += vdsox32.lds $(vobjx32s-y)
+
+$(obj)/%.so: OBJCOPYFLAGS := -S --remove-section __ex_table
+$(obj)/%.so: $(obj)/%.so.dbg FORCE
+ $(call if_changed,objcopy)
+
+$(obj)/vdsox32.so.dbg: $(obj)/vdsox32.lds $(vobjx32s) FORCE
+ $(call if_changed,vdso_and_check)
+
+CPPFLAGS_vdso32/vdso32.lds = $(CPPFLAGS_vdso.lds)
+VDSO_LDFLAGS_vdso32.lds = -m elf_i386 -soname linux-gate.so.1
+
+KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) -DBUILD_VDSO
+$(obj)/vdso32.so.dbg: KBUILD_AFLAGS = $(KBUILD_AFLAGS_32)
+$(obj)/vdso32.so.dbg: asflags-$(CONFIG_X86_64) += -m32
+
+KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS))
+KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out -fno-pic,$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out -mfentry,$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out $(RANDSTRUCT_CFLAGS),$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out $(KSTACK_ERASE_CFLAGS),$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out $(CC_FLAGS_LTO),$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out $(CC_FLAGS_CFI),$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out $(PADDING_CFLAGS),$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic
+KBUILD_CFLAGS_32 += -fno-stack-protector
+KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls)
+KBUILD_CFLAGS_32 += -fno-omit-frame-pointer
+KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING
+KBUILD_CFLAGS_32 += -DBUILD_VDSO
+
+ifdef CONFIG_MITIGATION_RETPOLINE
+ifneq ($(RETPOLINE_VDSO_CFLAGS),)
+ KBUILD_CFLAGS_32 += $(RETPOLINE_VDSO_CFLAGS)
+endif
+endif
+
+$(obj)/vdso32.so.dbg: KBUILD_CFLAGS = $(KBUILD_CFLAGS_32)
+
+$(obj)/vdso32.so.dbg: $(obj)/vdso32/vdso32.lds $(vobjs32) FORCE
+ $(call if_changed,vdso_and_check)
+
+#
+# The DSO images are built using a special linker script.
+#
+quiet_cmd_vdso = VDSO $@
+ cmd_vdso = $(LD) -o $@ \
+ $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \
+ -T $(filter %.lds,$^) $(filter %.o,$^)
+
+VDSO_LDFLAGS = -shared --hash-style=both --build-id=sha1 --no-undefined \
+ $(call ld-option, --eh-frame-hdr) -Bsymbolic -z noexecstack
+
+quiet_cmd_vdso_and_check = VDSO $@
+ cmd_vdso_and_check = $(cmd_vdso); $(cmd_vdso_check)
diff --git a/arch/x86/entry/vdso/extable.c b/arch/x86/entry/vdso/extable.c
new file mode 100644
index 000000000000..afcf5b65beef
--- /dev/null
+++ b/arch/x86/entry/vdso/extable.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/err.h>
+#include <linux/mm.h>
+#include <asm/current.h>
+#include <asm/traps.h>
+#include <asm/vdso.h>
+
+struct vdso_exception_table_entry {
+ int insn, fixup;
+};
+
+bool fixup_vdso_exception(struct pt_regs *regs, int trapnr,
+ unsigned long error_code, unsigned long fault_addr)
+{
+ const struct vdso_image *image = current->mm->context.vdso_image;
+ const struct vdso_exception_table_entry *extable;
+ unsigned int nr_entries, i;
+ unsigned long base;
+
+ /*
+ * Do not attempt to fixup #DB or #BP. It's impossible to identify
+ * whether or not a #DB/#BP originated from within an SGX enclave and
+ * SGX enclaves are currently the only use case for vDSO fixup.
+ */
+ if (trapnr == X86_TRAP_DB || trapnr == X86_TRAP_BP)
+ return false;
+
+ if (!current->mm->context.vdso)
+ return false;
+
+ base = (unsigned long)current->mm->context.vdso + image->extable_base;
+ nr_entries = image->extable_len / (sizeof(*extable));
+ extable = image->extable;
+
+ for (i = 0; i < nr_entries; i++) {
+ if (regs->ip == base + extable[i].insn) {
+ regs->ip = base + extable[i].fixup;
+ regs->di = trapnr;
+ regs->si = error_code;
+ regs->dx = fault_addr;
+ return true;
+ }
+ }
+
+ return false;
+}
diff --git a/arch/x86/entry/vdso/extable.h b/arch/x86/entry/vdso/extable.h
new file mode 100644
index 000000000000..baba612b832c
--- /dev/null
+++ b/arch/x86/entry/vdso/extable.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __VDSO_EXTABLE_H
+#define __VDSO_EXTABLE_H
+
+/*
+ * Inject exception fixup for vDSO code. Unlike normal exception fixup,
+ * vDSO uses a dedicated handler, and the addresses are relative to the overall
+ * exception table, not each individual entry.
+ */
+#ifdef __ASSEMBLER__
+#define _ASM_VDSO_EXTABLE_HANDLE(from, to) \
+ ASM_VDSO_EXTABLE_HANDLE from to
+
+.macro ASM_VDSO_EXTABLE_HANDLE from:req to:req
+ .pushsection __ex_table, "a"
+ .long (\from) - __ex_table
+ .long (\to) - __ex_table
+ .popsection
+.endm
+#else
+#define _ASM_VDSO_EXTABLE_HANDLE(from, to) \
+ ".pushsection __ex_table, \"a\"\n" \
+ ".long (" #from ") - __ex_table\n" \
+ ".long (" #to ") - __ex_table\n" \
+ ".popsection\n"
+#endif
+
+#endif /* __VDSO_EXTABLE_H */
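
A hypothetical C-level sketch of how a vDSO function might use the macro above (the in-tree consumer is the SGX entry path, which is assembly). The register contract at the fixup target, trap number in %rdi, error code in %rsi, fault address in %rdx, comes from fixup_vdso_exception() in extable.c above:

    /* Hedged sketch, assuming compilation into the 64-bit vDSO. On a fault
     * at 1:, the kernel redirects the user IP to 2: with the trap number
     * in %rdi; *out is only meaningful when 0 is returned. */
    static long vdso_probe_read(const unsigned long *ptr, unsigned long *out)
    {
        register long trapnr asm("rdi") = 0;

        asm volatile("1: movq (%[ptr]), %[out]\n"
                     "2:\n"
                     _ASM_VDSO_EXTABLE_HANDLE(1b, 2b)
                     : [out] "=r" (*out), "+r" (trapnr)
                     : [ptr] "r" (ptr)
                     : "rsi", "rdx", "memory");

        return trapnr; /* 0 on success, X86_TRAP_* after a fixed-up fault */
    }
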
diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
new file mode 100644
index 000000000000..0debc194bd78
--- /dev/null
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Fast user context implementation of clock_gettime, gettimeofday, and time.
+ *
+ * Copyright 2006 Andi Kleen, SUSE Labs.
+ * Copyright 2019 ARM Limited
+ *
+ * 32 Bit compat layer by Stefani Seibold <stefani@seibold.net>
+ * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany
+ */
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <vdso/gettime.h>
+
+#include "../../../../lib/vdso/gettimeofday.c"
+
+int __vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz)
+{
+ return __cvdso_gettimeofday(tv, tz);
+}
+
+int gettimeofday(struct __kernel_old_timeval *, struct timezone *)
+ __attribute__((weak, alias("__vdso_gettimeofday")));
+
+__kernel_old_time_t __vdso_time(__kernel_old_time_t *t)
+{
+ return __cvdso_time(t);
+}
+
+__kernel_old_time_t time(__kernel_old_time_t *t) __attribute__((weak, alias("__vdso_time")));
+
+
+#if defined(CONFIG_X86_64) && !defined(BUILD_VDSO32_64)
+/* both 64-bit and x32 use these */
+int __vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts)
+{
+ return __cvdso_clock_gettime(clock, ts);
+}
+
+int clock_gettime(clockid_t, struct __kernel_timespec *)
+ __attribute__((weak, alias("__vdso_clock_gettime")));
+
+int __vdso_clock_getres(clockid_t clock,
+ struct __kernel_timespec *res)
+{
+ return __cvdso_clock_getres(clock, res);
+}
+int clock_getres(clockid_t, struct __kernel_timespec *)
+ __attribute__((weak, alias("__vdso_clock_getres")));
+
+#else
+/* i386 only */
+int __vdso_clock_gettime(clockid_t clock, struct old_timespec32 *ts)
+{
+ return __cvdso_clock_gettime32(clock, ts);
+}
+
+int clock_gettime(clockid_t, struct old_timespec32 *)
+ __attribute__((weak, alias("__vdso_clock_gettime")));
+
+int __vdso_clock_gettime64(clockid_t clock, struct __kernel_timespec *ts)
+{
+ return __cvdso_clock_gettime(clock, ts);
+}
+
+int clock_gettime64(clockid_t, struct __kernel_timespec *)
+ __attribute__((weak, alias("__vdso_clock_gettime64")));
+
+int __vdso_clock_getres(clockid_t clock, struct old_timespec32 *res)
+{
+ return __cvdso_clock_getres_time32(clock, res);
+}
+
+int clock_getres(clockid_t, struct old_timespec32 *)
+ __attribute__((weak, alias("__vdso_clock_getres")));
+#endif
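
Nothing special is needed to consume these entry points: glibc locates __vdso_clock_gettime itself via AT_SYSINFO_EHDR, so an ordinary caller exercises the code above without entering the kernel on the fast path:

    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
        struct timespec ts;

        /* Typically serviced entirely in userspace by the vDSO; strace
         * shows no clock_gettime syscall on vDSO-enabled systems. */
        if (clock_gettime(CLOCK_MONOTONIC, &ts) == 0)
            printf("%lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);
        return 0;
    }
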
diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S
new file mode 100644
index 000000000000..ec1ac191a057
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso-layout.lds.S
@@ -0,0 +1,101 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <asm/vdso.h>
+#include <asm/vdso/vsyscall.h>
+#include <vdso/datapage.h>
+
+/*
+ * Linker script for vDSO. This is an ELF shared object prelinked to
+ * its virtual address, and with only one read-only segment.
+ * This script controls its layout.
+ */
+
+SECTIONS
+{
+ /*
+ * User/kernel shared data is before the vDSO. This may be a little
+ * uglier than putting it after the vDSO, but it avoids issues with
+ * non-allocatable things that dangle past the end of the PT_LOAD
+ * segment.
+ */
+
+ VDSO_VVAR_SYMS
+
+ vclock_pages = VDSO_VCLOCK_PAGES_START(vdso_u_data);
+ pvclock_page = vclock_pages + VDSO_PAGE_PVCLOCK_OFFSET * PAGE_SIZE;
+ hvclock_page = vclock_pages + VDSO_PAGE_HVCLOCK_OFFSET * PAGE_SIZE;
+
+ . = SIZEOF_HEADERS;
+
+ .hash : { *(.hash) } :text
+ .gnu.hash : { *(.gnu.hash) }
+ .dynsym : { *(.dynsym) }
+ .dynstr : { *(.dynstr) }
+ .gnu.version : { *(.gnu.version) }
+ .gnu.version_d : { *(.gnu.version_d) }
+ .gnu.version_r : { *(.gnu.version_r) }
+
+ .dynamic : { *(.dynamic) } :text :dynamic
+
+ .rodata : {
+ *(.rodata*)
+ *(.data*)
+ *(.sdata*)
+ *(.got.plt) *(.got)
+ *(.gnu.linkonce.d.*)
+ *(.bss*)
+ *(.dynbss*)
+ *(.gnu.linkonce.b.*)
+ } :text
+
+ /*
+ * Discard .note.gnu.property sections which are unused and have
+ * different alignment requirement from vDSO note sections.
+ */
+ /DISCARD/ : {
+ *(.note.gnu.property)
+ }
+ .note : { *(.note.*) } :text :note
+
+ .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
+ .eh_frame : { KEEP (*(.eh_frame)) } :text
+
+
+ /*
+ * Text is well-separated from actual data: there's plenty of
+ * stuff that isn't used at runtime in between.
+ */
+
+ .text : {
+ *(.text*)
+ } :text =0x90909090,
+
+
+
+ .altinstructions : { *(.altinstructions) } :text
+ .altinstr_replacement : { *(.altinstr_replacement) } :text
+
+ __ex_table : { *(__ex_table) } :text
+
+ /DISCARD/ : {
+ *(.discard)
+ *(.discard.*)
+ *(__bug_table)
+ }
+}
+
+/*
+ * Very old versions of ld do not recognize this name token; use the constant.
+ */
+#define PT_GNU_EH_FRAME 0x6474e550
+
+/*
+ * We must supply the ELF program headers explicitly to get just one
+ * PT_LOAD segment, and set the flags explicitly to make segments read-only.
+ */
+PHDRS
+{
+ text PT_LOAD FLAGS(5) FILEHDR PHDRS; /* PF_R|PF_X */
+ dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
+ note PT_NOTE FLAGS(4); /* PF_R */
+ eh_frame_hdr PT_GNU_EH_FRAME;
+}
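
The single-PT_LOAD layout that this PHDRS block enforces is observable from userspace; a small sketch using the vDSO base from the auxiliary vector:

    #include <elf.h>
    #include <stdio.h>
    #include <sys/auxv.h>

    int main(void)
    {
        Elf64_Ehdr *eh = (Elf64_Ehdr *)getauxval(AT_SYSINFO_EHDR);
        Elf64_Phdr *ph = (Elf64_Phdr *)((char *)eh + eh->e_phoff);

        /* Expect exactly one PT_LOAD with PF_R|PF_X, per the PHDRS above. */
        for (int i = 0; i < eh->e_phnum; i++)
            if (ph[i].p_type == PT_LOAD)
                printf("PT_LOAD: filesz %#llx flags %#x\n",
                       (unsigned long long)ph[i].p_filesz,
                       ph[i].p_flags);
        return 0;
    }
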
diff --git a/arch/x86/entry/vdso/vdso-note.S b/arch/x86/entry/vdso/vdso-note.S
new file mode 100644
index 000000000000..79423170118f
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso-note.S
@@ -0,0 +1,15 @@
+/*
+ * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
+ * Here we can supply some information useful to userland.
+ */
+
+#include <linux/build-salt.h>
+#include <linux/uts.h>
+#include <linux/version.h>
+#include <linux/elfnote.h>
+
+ELFNOTE_START(Linux, 0, "a")
+ .long LINUX_VERSION_CODE
+ELFNOTE_END
+
+BUILD_SALT
diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S
new file mode 100644
index 000000000000..0bab5f4af6d1
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso.lds.S
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Linker script for 64-bit vDSO.
+ * We #include the file to define the layout details.
+ *
+ * This file defines the version script giving the user-exported symbols in
+ * the DSO.
+ */
+
+#define BUILD_VDSO64
+
+#include "vdso-layout.lds.S"
+
+/*
+ * This controls what userland symbols we export from the vDSO.
+ */
+VERSION {
+ LINUX_2.6 {
+ global:
+ clock_gettime;
+ __vdso_clock_gettime;
+ gettimeofday;
+ __vdso_gettimeofday;
+ getcpu;
+ __vdso_getcpu;
+ time;
+ __vdso_time;
+ clock_getres;
+ __vdso_clock_getres;
+#ifdef CONFIG_X86_SGX
+ __vdso_sgx_enter_enclave;
+#endif
+ getrandom;
+ __vdso_getrandom;
+ local: *;
+ };
+}
diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c
new file mode 100644
index 000000000000..f84e8f8fa5fe
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso2c.c
@@ -0,0 +1,233 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * vdso2c - A vdso image preparation tool
+ * Copyright (c) 2014 Andy Lutomirski and others
+ *
+ * vdso2c requires stripped and unstripped input. It would be trivial
+ * to fully strip the input in here, but, for reasons described below,
+ * we need to write a section table. Doing this is more or less
+ * equivalent to dropping all non-allocatable sections, but it's
+ * easier to let objcopy handle that instead of doing it ourselves.
+ * If we ever need to do something fancier than what objcopy provides,
+ * it would be straightforward to add here.
+ *
+ * We keep a section table for a few reasons:
+ *
+ * The Go runtime had a couple of bugs: it would read the section
+ * table to try to figure out how many dynamic symbols there were (it
+ * shouldn't have looked at the section table at all) and, if there
+ * was no SHT_DYNSYM section table entry, it would use an
+ * uninitialized value for the number of symbols. An empty DYNSYM
+ * table would work, but I see no reason not to write a valid one (and
+ * keep full performance for old Go programs). This hack is only
+ * needed on x86_64.
+ *
+ * The bug was introduced on 2012-08-31 by:
+ * https://code.google.com/p/go/source/detail?r=56ea40aac72b
+ * and was fixed on 2014-06-13 by:
+ * https://code.google.com/p/go/source/detail?r=fc1cd5e12595
+ *
+ * Binutils has issues debugging the vDSO: it reads the section table to
+ * find SHT_NOTE; it won't look at PT_NOTE for the in-memory vDSO, which
+ * would break build-id if we removed the section table. Binutils
+ * also requires that shstrndx != 0. See:
+ * https://sourceware.org/bugzilla/show_bug.cgi?id=17064
+ *
+ * elfutils might not look for PT_NOTE if there is a section table at
+ * all. I don't know whether this matters for any practical purpose.
+ *
+ * For simplicity, rather than hacking up a partial section table, we
+ * just write a mostly complete one. We omit non-dynamic symbols,
+ * though, since they're rather large.
+ *
+ * Once binutils gets fixed, we might be able to drop this for all but
+ * the 64-bit vdso, since build-id only works in kernel RPMs, and
+ * systems that update to new enough kernel RPMs will likely update
+ * binutils in sync. build-id has never worked for home-built kernel
+ * RPMs without manual symlinking, and I suspect that no one ever does
+ * that.
+ */
+
+#include <inttypes.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <fcntl.h>
+#include <err.h>
+
+#include <sys/mman.h>
+#include <sys/types.h>
+
+#include <tools/le_byteshift.h>
+
+#include <linux/elf.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+
+const char *outfilename;
+
+struct vdso_sym {
+ const char *name;
+ bool export;
+};
+
+struct vdso_sym required_syms[] = {
+ {"VDSO32_NOTE_MASK", true},
+ {"__kernel_vsyscall", true},
+ {"__kernel_sigreturn", true},
+ {"__kernel_rt_sigreturn", true},
+ {"int80_landing_pad", true},
+ {"vdso32_rt_sigreturn_landing_pad", true},
+ {"vdso32_sigreturn_landing_pad", true},
+};
+
+__attribute__((format(printf, 1, 2))) __attribute__((noreturn))
+static void fail(const char *format, ...)
+{
+ va_list ap;
+ va_start(ap, format);
+ fprintf(stderr, "Error: ");
+ vfprintf(stderr, format, ap);
+ va_end(ap);
+ if (outfilename)
+ unlink(outfilename);
+ exit(1);
+}
+
+/*
+ * Evil macros for little-endian reads and writes
+ */
+#define GLE(x, bits, ifnot) \
+ __builtin_choose_expr( \
+ (sizeof(*(x)) == bits/8), \
+ (__typeof__(*(x)))get_unaligned_le##bits(x), ifnot)
+
+extern void bad_get_le(void);
+#define LAST_GLE(x) \
+ __builtin_choose_expr(sizeof(*(x)) == 1, *(x), bad_get_le())
+
+#define GET_LE(x) \
+ GLE(x, 64, GLE(x, 32, GLE(x, 16, LAST_GLE(x))))
+
+#define PLE(x, val, bits, ifnot) \
+ __builtin_choose_expr( \
+ (sizeof(*(x)) == bits/8), \
+ put_unaligned_le##bits((val), (x)), ifnot)
+
+extern void bad_put_le(void);
+#define LAST_PLE(x, val) \
+ __builtin_choose_expr(sizeof(*(x)) == 1, *(x) = (val), bad_put_le())
+
+#define PUT_LE(x, val) \
+ PLE(x, val, 64, PLE(x, val, 32, PLE(x, val, 16, LAST_PLE(x, val))))
+
+
+#define NSYMS ARRAY_SIZE(required_syms)
+
+#define BITSFUNC3(name, bits, suffix) name##bits##suffix
+#define BITSFUNC2(name, bits, suffix) BITSFUNC3(name, bits, suffix)
+#define BITSFUNC(name) BITSFUNC2(name, ELF_BITS, )
+
+#define INT_BITS BITSFUNC2(int, ELF_BITS, _t)
+
+#define ELF_BITS_XFORM2(bits, x) Elf##bits##_##x
+#define ELF_BITS_XFORM(bits, x) ELF_BITS_XFORM2(bits, x)
+#define ELF(x) ELF_BITS_XFORM(ELF_BITS, x)
+
+#define ELF_BITS 64
+#include "vdso2c.h"
+#undef ELF_BITS
+
+#define ELF_BITS 32
+#include "vdso2c.h"
+#undef ELF_BITS
+
+static void go(void *raw_addr, size_t raw_len,
+ void *stripped_addr, size_t stripped_len,
+ FILE *outfile, const char *name)
+{
+ Elf64_Ehdr *hdr = (Elf64_Ehdr *)raw_addr;
+
+ if (hdr->e_ident[EI_CLASS] == ELFCLASS64) {
+ go64(raw_addr, raw_len, stripped_addr, stripped_len,
+ outfile, name);
+ } else if (hdr->e_ident[EI_CLASS] == ELFCLASS32) {
+ go32(raw_addr, raw_len, stripped_addr, stripped_len,
+ outfile, name);
+ } else {
+ fail("unknown ELF class\n");
+ }
+}
+
+static void map_input(const char *name, void **addr, size_t *len, int prot)
+{
+ off_t tmp_len;
+
+ int fd = open(name, O_RDONLY);
+ if (fd == -1)
+ err(1, "open(%s)", name);
+
+ tmp_len = lseek(fd, 0, SEEK_END);
+ if (tmp_len == (off_t)-1)
+ err(1, "lseek");
+ *len = (size_t)tmp_len;
+
+ *addr = mmap(NULL, tmp_len, prot, MAP_PRIVATE, fd, 0);
+ if (*addr == MAP_FAILED)
+ err(1, "mmap");
+
+ close(fd);
+}
+
+int main(int argc, char **argv)
+{
+ size_t raw_len, stripped_len;
+ void *raw_addr, *stripped_addr;
+ FILE *outfile;
+ char *name, *tmp;
+ int namelen;
+
+ if (argc != 4) {
+ printf("Usage: vdso2c RAW_INPUT STRIPPED_INPUT OUTPUT\n");
+ return 1;
+ }
+
+ /*
+ * Figure out the struct name. If we're writing to a .so file,
+ * generate raw output instead.
+ */
+ name = strdup(argv[3]);
+ namelen = strlen(name);
+ if (namelen >= 3 && !strcmp(name + namelen - 3, ".so")) {
+ name = NULL;
+ } else {
+ tmp = strrchr(name, '/');
+ if (tmp)
+ name = tmp + 1;
+ tmp = strchr(name, '.');
+ if (tmp)
+ *tmp = '\0';
+ for (tmp = name; *tmp; tmp++)
+ if (*tmp == '-')
+ *tmp = '_';
+ }
+
+ map_input(argv[1], &raw_addr, &raw_len, PROT_READ);
+ map_input(argv[2], &stripped_addr, &stripped_len, PROT_READ);
+
+ outfilename = argv[3];
+ outfile = fopen(outfilename, "w");
+ if (!outfile)
+ err(1, "fopen(%s)", outfilename);
+
+ go(raw_addr, raw_len, stripped_addr, stripped_len, outfile, name);
+
+ munmap(raw_addr, raw_len);
+ munmap(stripped_addr, stripped_len);
+ fclose(outfile);
+
+ return 0;
+}
diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h
new file mode 100644
index 000000000000..78ed1c1f28b9
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso2c.h
@@ -0,0 +1,208 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This file is included twice from vdso2c.c. It generates code for 32-bit
+ * and 64-bit vDSOs. We need both for 64-bit builds, since 32-bit vDSOs
+ * are built for 32-bit userspace.
+ */
+
+static void BITSFUNC(copy)(FILE *outfile, const unsigned char *data, size_t len)
+{
+ size_t i;
+
+ for (i = 0; i < len; i++) {
+ if (i % 10 == 0)
+ fprintf(outfile, "\n\t");
+ fprintf(outfile, "0x%02X, ", (int)(data)[i]);
+ }
+}
+
+
+/*
+ * Extract a section from the input data into a standalone blob. Used to
+ * capture kernel-only data that needs to persist indefinitely, e.g. the
+ * exception fixup tables, but only in the kernel, i.e. the section can
+ * be stripped from the final vDSO image.
+ */
+static void BITSFUNC(extract)(const unsigned char *data, size_t data_len,
+ FILE *outfile, ELF(Shdr) *sec, const char *name)
+{
+ unsigned long offset;
+ size_t len;
+
+ offset = (unsigned long)GET_LE(&sec->sh_offset);
+ len = (size_t)GET_LE(&sec->sh_size);
+
+ if (offset + len > data_len)
+ fail("section to extract overruns input data");
+
+ fprintf(outfile, "static const unsigned char %s[%zu] = {", name, len);
+ BITSFUNC(copy)(outfile, data + offset, len);
+ fprintf(outfile, "\n};\n\n");
+}
+
+static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
+ void *stripped_addr, size_t stripped_len,
+ FILE *outfile, const char *image_name)
+{
+ int found_load = 0;
+ unsigned long load_size = -1; /* Work around bogus warning */
+ unsigned long mapping_size;
+ ELF(Ehdr) *hdr = (ELF(Ehdr) *)raw_addr;
+ unsigned long i, syms_nr;
+ ELF(Shdr) *symtab_hdr = NULL, *strtab_hdr, *secstrings_hdr,
+ *alt_sec = NULL, *extable_sec = NULL;
+ ELF(Dyn) *dyn = 0, *dyn_end = 0;
+ const char *secstrings;
+ INT_BITS syms[NSYMS] = {};
+
+ ELF(Phdr) *pt = (ELF(Phdr) *)(raw_addr + GET_LE(&hdr->e_phoff));
+
+ if (GET_LE(&hdr->e_type) != ET_DYN)
+ fail("input is not a shared object\n");
+
+ /* Walk the segment table. */
+ for (i = 0; i < GET_LE(&hdr->e_phnum); i++) {
+ if (GET_LE(&pt[i].p_type) == PT_LOAD) {
+ if (found_load)
+ fail("multiple PT_LOAD segs\n");
+
+ if (GET_LE(&pt[i].p_offset) != 0 ||
+ GET_LE(&pt[i].p_vaddr) != 0)
+ fail("PT_LOAD in wrong place\n");
+
+ if (GET_LE(&pt[i].p_memsz) != GET_LE(&pt[i].p_filesz))
+ fail("cannot handle memsz != filesz\n");
+
+ load_size = GET_LE(&pt[i].p_memsz);
+ found_load = 1;
+ } else if (GET_LE(&pt[i].p_type) == PT_DYNAMIC) {
+ dyn = raw_addr + GET_LE(&pt[i].p_offset);
+ dyn_end = raw_addr + GET_LE(&pt[i].p_offset) +
+ GET_LE(&pt[i].p_memsz);
+ }
+ }
+ if (!found_load)
+ fail("no PT_LOAD seg\n");
+
+ if (stripped_len < load_size)
+ fail("stripped input is too short\n");
+
+ if (!dyn)
+ fail("input has no PT_DYNAMIC section -- your toolchain is buggy\n");
+
+ /* Walk the dynamic table */
+ for (i = 0; dyn + i < dyn_end &&
+ GET_LE(&dyn[i].d_tag) != DT_NULL; i++) {
+ typeof(dyn[i].d_tag) tag = GET_LE(&dyn[i].d_tag);
+ if (tag == DT_REL || tag == DT_RELSZ || tag == DT_RELA ||
+ tag == DT_RELENT || tag == DT_TEXTREL)
+ fail("vdso image contains dynamic relocations\n");
+ }
+
+ /* Walk the section table */
+ secstrings_hdr = raw_addr + GET_LE(&hdr->e_shoff) +
+ GET_LE(&hdr->e_shentsize)*GET_LE(&hdr->e_shstrndx);
+ secstrings = raw_addr + GET_LE(&secstrings_hdr->sh_offset);
+ for (i = 0; i < GET_LE(&hdr->e_shnum); i++) {
+ ELF(Shdr) *sh = raw_addr + GET_LE(&hdr->e_shoff) +
+ GET_LE(&hdr->e_shentsize) * i;
+ if (GET_LE(&sh->sh_type) == SHT_SYMTAB)
+ symtab_hdr = sh;
+
+ if (!strcmp(secstrings + GET_LE(&sh->sh_name),
+ ".altinstructions"))
+ alt_sec = sh;
+ if (!strcmp(secstrings + GET_LE(&sh->sh_name), "__ex_table"))
+ extable_sec = sh;
+ }
+
+ if (!symtab_hdr)
+ fail("no symbol table\n");
+
+ strtab_hdr = raw_addr + GET_LE(&hdr->e_shoff) +
+ GET_LE(&hdr->e_shentsize) * GET_LE(&symtab_hdr->sh_link);
+
+ syms_nr = GET_LE(&symtab_hdr->sh_size) / GET_LE(&symtab_hdr->sh_entsize);
+ /* Walk the symbol table */
+ for (i = 0; i < syms_nr; i++) {
+ unsigned int k;
+ ELF(Sym) *sym = raw_addr + GET_LE(&symtab_hdr->sh_offset) +
+ GET_LE(&symtab_hdr->sh_entsize) * i;
+ const char *sym_name = raw_addr +
+ GET_LE(&strtab_hdr->sh_offset) +
+ GET_LE(&sym->st_name);
+
+ for (k = 0; k < NSYMS; k++) {
+ if (!strcmp(sym_name, required_syms[k].name)) {
+ if (syms[k]) {
+ fail("duplicate symbol %s\n",
+ required_syms[k].name);
+ }
+
+ /*
+ * Careful: we use negative addresses, but
+ * st_value is unsigned, so we rely
+ * on syms[k] being a signed type of the
+ * correct width.
+ */
+ syms[k] = GET_LE(&sym->st_value);
+ }
+ }
+ }
+
+ if (!image_name) {
+ fwrite(stripped_addr, stripped_len, 1, outfile);
+ return;
+ }
+
+ mapping_size = (stripped_len + 4095) / 4096 * 4096;
+
+ fprintf(outfile, "/* AUTOMATICALLY GENERATED -- DO NOT EDIT */\n\n");
+ fprintf(outfile, "#include <linux/linkage.h>\n");
+ fprintf(outfile, "#include <linux/init.h>\n");
+ fprintf(outfile, "#include <asm/page_types.h>\n");
+ fprintf(outfile, "#include <asm/vdso.h>\n");
+ fprintf(outfile, "\n");
+ fprintf(outfile,
+ "static unsigned char raw_data[%lu] __ro_after_init __aligned(PAGE_SIZE) = {",
+ mapping_size);
+ for (i = 0; i < stripped_len; i++) {
+ if (i % 10 == 0)
+ fprintf(outfile, "\n\t");
+ fprintf(outfile, "0x%02X, ",
+ (int)((unsigned char *)stripped_addr)[i]);
+ }
+ fprintf(outfile, "\n};\n\n");
+ if (extable_sec)
+ BITSFUNC(extract)(raw_addr, raw_len, outfile,
+ extable_sec, "extable");
+
+ fprintf(outfile, "const struct vdso_image %s = {\n", image_name);
+ fprintf(outfile, "\t.data = raw_data,\n");
+ fprintf(outfile, "\t.size = %lu,\n", mapping_size);
+ if (alt_sec) {
+ fprintf(outfile, "\t.alt = %lu,\n",
+ (unsigned long)GET_LE(&alt_sec->sh_offset));
+ fprintf(outfile, "\t.alt_len = %lu,\n",
+ (unsigned long)GET_LE(&alt_sec->sh_size));
+ }
+ if (extable_sec) {
+ fprintf(outfile, "\t.extable_base = %lu,\n",
+ (unsigned long)GET_LE(&extable_sec->sh_offset));
+ fprintf(outfile, "\t.extable_len = %lu,\n",
+ (unsigned long)GET_LE(&extable_sec->sh_size));
+ fprintf(outfile, "\t.extable = extable,\n");
+ }
+
+ for (i = 0; i < NSYMS; i++) {
+ if (required_syms[i].export && syms[i])
+ fprintf(outfile, "\t.sym_%s = %" PRIi64 ",\n",
+ required_syms[i].name, (int64_t)syms[i]);
+ }
+ fprintf(outfile, "};\n\n");
+ fprintf(outfile, "static __init int init_%s(void) {\n", image_name);
+ fprintf(outfile, "\treturn init_vdso_image(&%s);\n", image_name);
+ fprintf(outfile, "};\n");
+ fprintf(outfile, "subsys_initcall(init_%s);\n", image_name);
+
+}
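
For orientation, the shape of the generated vdso-image-*.c follows directly from the fprintf calls above; a hypothetical excerpt (byte values, sizes and offsets are illustrative only):

    /* AUTOMATICALLY GENERATED -- DO NOT EDIT */

    static unsigned char raw_data[8192] __ro_after_init __aligned(PAGE_SIZE) = {
        0x7F, 0x45, 0x4C, 0x46, /* ELF magic; the full image follows */
    };

    const struct vdso_image vdso_image_64 = {
        .data = raw_data,
        .size = 8192,
        .alt = 3173,   /* offset of .altinstructions, if present */
        .alt_len = 24,
    };
    static __init int init_vdso_image_64(void) {
        return init_vdso_image(&vdso_image_64);
    };
    subsys_initcall(init_vdso_image_64);
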
diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c
new file mode 100644
index 000000000000..8894013eea1d
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32-setup.c
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * (C) Copyright 2002 Linus Torvalds
+ * Portions based on the vdso-randomization code from exec-shield:
+ * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar
+ *
+ * This file contains the needed initializations to support sysenter.
+ */
+
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/kernel.h>
+#include <linux/mm_types.h>
+#include <linux/elf.h>
+
+#include <asm/processor.h>
+#include <asm/vdso.h>
+
+#ifdef CONFIG_COMPAT_VDSO
+#define VDSO_DEFAULT 0
+#else
+#define VDSO_DEFAULT 1
+#endif
+
+/*
+ * Should the kernel map a VDSO page into processes and pass its
+ * address down to glibc upon exec()?
+ */
+unsigned int __read_mostly vdso32_enabled = VDSO_DEFAULT;
+
+static int __init vdso32_setup(char *s)
+{
+ vdso32_enabled = simple_strtoul(s, NULL, 0);
+
+ if (vdso32_enabled > 1) {
+ pr_warn("vdso32 values other than 0 and 1 are no longer allowed; vdso disabled\n");
+ vdso32_enabled = 0;
+ }
+
+ return 1;
+}
+
+/*
+ * For consistency, the argument vdso32=[01] affects the 32-bit vDSO
+ * behavior on both 64-bit and 32-bit kernels; values above 1 are rejected.
+ * On 32-bit kernels, vdso=[01] means the same thing.
+ */
+__setup("vdso32=", vdso32_setup);
+
+#ifdef CONFIG_X86_32
+__setup_param("vdso=", vdso_setup, vdso32_setup, 0);
+#endif
+
+
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+
+static const struct ctl_table vdso_table[] = {
+ {
+#ifdef CONFIG_X86_64
+ .procname = "vsyscall32",
+#else
+ .procname = "vdso_enabled",
+#endif
+ .data = &vdso32_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+};
+
+static __init int ia32_binfmt_init(void)
+{
+#ifdef CONFIG_X86_64
+ /* Register vsyscall32 into the ABI table */
+ register_sysctl("abi", vdso_table);
+#else
+ register_sysctl_init("vm", vdso_table);
+#endif
+ return 0;
+}
+__initcall(ia32_binfmt_init);
+#endif /* CONFIG_SYSCTL */
+
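
On a 64-bit kernel, the table registered above surfaces as /proc/sys/abi/vsyscall32; a quick check from C (the path assumes CONFIG_X86_64):

    #include <stdio.h>

    int main(void)
    {
        int enabled = -1;
        FILE *f = fopen("/proc/sys/abi/vsyscall32", "r");

        if (f && fscanf(f, "%d", &enabled) == 1)
            printf("32-bit vDSO %s\n", enabled ? "enabled" : "disabled");
        if (f)
            fclose(f);
        return 0;
    }
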
diff --git a/arch/x86/entry/vdso/vdso32/.gitignore b/arch/x86/entry/vdso/vdso32/.gitignore
new file mode 100644
index 000000000000..5167384843b9
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+vdso32.lds
diff --git a/arch/x86/entry/vdso/vdso32/fake_32bit_build.h b/arch/x86/entry/vdso/vdso32/fake_32bit_build.h
new file mode 100644
index 000000000000..db1b15f686e3
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32/fake_32bit_build.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifdef CONFIG_X86_64
+
+/*
+ * In the case of a 32-bit vDSO for a 64-bit kernel, fake a 32-bit
+ * kernel configuration.
+ */
+#undef CONFIG_64BIT
+#undef CONFIG_X86_64
+#undef CONFIG_COMPAT
+#undef CONFIG_PGTABLE_LEVELS
+#undef CONFIG_ILLEGAL_POINTER_VALUE
+#undef CONFIG_SPARSEMEM_VMEMMAP
+#undef CONFIG_NR_CPUS
+#undef CONFIG_PARAVIRT_XXL
+
+#define CONFIG_X86_32 1
+#define CONFIG_PGTABLE_LEVELS 2
+#define CONFIG_PAGE_OFFSET 0
+#define CONFIG_ILLEGAL_POINTER_VALUE 0
+#define CONFIG_NR_CPUS 1
+
+#define BUILD_VDSO32_64
+
+#endif
diff --git a/arch/x86/entry/vdso/vdso32/note.S b/arch/x86/entry/vdso/vdso32/note.S
new file mode 100644
index 000000000000..2cbd39939dc6
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32/note.S
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
+ * Here we can supply some information useful to userland.
+ */
+
+#include <linux/build-salt.h>
+#include <linux/version.h>
+#include <linux/elfnote.h>
+
+/*
+ * Ideally this would use UTS_NAME, but using a quoted string here
+ * doesn't work. Remember to change this when changing the kernel's name.
+ */
+ELFNOTE_START(Linux, 0, "a")
+ .long LINUX_VERSION_CODE
+ELFNOTE_END
+
+BUILD_SALT
diff --git a/arch/x86/vdso/vdso32/sigreturn.S b/arch/x86/entry/vdso/vdso32/sigreturn.S
index 31776d0efc8c..1bd068f72d4c 100644
--- a/arch/x86/vdso/vdso32/sigreturn.S
+++ b/arch/x86/entry/vdso/vdso32/sigreturn.S
@@ -1,11 +1,4 @@
-/*
- * Common code for the sigreturn entry points in vDSO images.
- * So far this code is the same for both int80 and sysenter versions.
- * This file is #include'd by int80.S et al to define them first thing.
- * The kernel assumes that the addresses of these routines are constant
- * for all vDSO implementations.
- */
-
+/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/linkage.h>
#include <asm/unistd_32.h>
#include <asm/asm-offsets.h>
@@ -17,6 +10,7 @@
.text
.globl __kernel_sigreturn
.type __kernel_sigreturn,@function
+ nop /* this guy is needed for .LSTARTFDEDLSI1 below (watch for HACK) */
ALIGN
__kernel_sigreturn:
.LSTART_sigreturn:
@@ -24,6 +18,7 @@ __kernel_sigreturn:
movl $__NR_sigreturn, %eax
SYSCALL_ENTER_KERNEL
.LEND_sigreturn:
+SYM_INNER_LABEL(vdso32_sigreturn_landing_pad, SYM_L_GLOBAL)
nop
.size __kernel_sigreturn,.-.LSTART_sigreturn
@@ -35,6 +30,7 @@ __kernel_rt_sigreturn:
movl $__NR_rt_sigreturn, %eax
SYSCALL_ENTER_KERNEL
.LEND_rt_sigreturn:
+SYM_INNER_LABEL(vdso32_rt_sigreturn_landing_pad, SYM_L_GLOBAL)
nop
.size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn
.previous
diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S
new file mode 100644
index 000000000000..d33c6513fd2c
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32/system_call.S
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * AT_SYSINFO entry point
+*/
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/cpufeatures.h>
+#include <asm/alternative.h>
+
+ .text
+ .globl __kernel_vsyscall
+ .type __kernel_vsyscall,@function
+ ALIGN
+__kernel_vsyscall:
+ CFI_STARTPROC
+ /*
+ * Reshuffle regs so that any of the entry instructions
+ * will preserve enough state.
+ *
+ * A really nice entry sequence would be:
+ * pushl %edx
+ * pushl %ecx
+ * movl %esp, %ecx
+ *
+ * Unfortunately, naughty Android versions between July and December
+ * 2015 actually hardcode the traditional Linux SYSENTER entry
+ * sequence. That is severely broken for a number of reasons (ask
+ * anyone with an AMD CPU, for example). Nonetheless, we try to keep
+ * it working approximately as well as it ever worked.
+ *
+ * This link may elucidate some of the history:
+ * https://android-review.googlesource.com/#/q/Iac3295376d61ef83e713ac9b528f3b50aa780cd7
+ * Personally, I find it hard to understand what's going on there.
+ *
+ * Note to future user developers: DO NOT USE SYSENTER IN YOUR CODE.
+ * Execute an indirect call to the address in the AT_SYSINFO auxv
+ * entry. That is the ONLY correct way to make a fast 32-bit system
+ * call on Linux. (Open-coding int $0x80 is also fine, but it's
+ * slow.)
+ */
+ pushl %ecx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ecx, 0
+ pushl %edx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET edx, 0
+ pushl %ebp
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ebp, 0
+
+ #define SYSENTER_SEQUENCE "movl %esp, %ebp; sysenter"
+ #define SYSCALL_SEQUENCE "movl %ecx, %ebp; syscall"
+
+#ifdef CONFIG_X86_64
+ /* If SYSENTER (Intel) or SYSCALL32 (AMD) is available, use it. */
+ ALTERNATIVE_2 "", SYSENTER_SEQUENCE, X86_FEATURE_SYSENTER32, \
+ SYSCALL_SEQUENCE, X86_FEATURE_SYSCALL32
+#else
+ ALTERNATIVE "", SYSENTER_SEQUENCE, X86_FEATURE_SEP
+#endif
+
+ /* Enter using int $0x80 */
+ int $0x80
+SYM_INNER_LABEL(int80_landing_pad, SYM_L_GLOBAL)
+
+ /*
+ * Restore EDX and ECX in case they were clobbered. EBP is not
+ * clobbered (the kernel restores it), but it's cleaner and
+ * probably faster to pop it than to adjust ESP using addl.
+ */
+ popl %ebp
+ CFI_RESTORE ebp
+ CFI_ADJUST_CFA_OFFSET -4
+ popl %edx
+ CFI_RESTORE edx
+ CFI_ADJUST_CFA_OFFSET -4
+ popl %ecx
+ CFI_RESTORE ecx
+ CFI_ADJUST_CFA_OFFSET -4
+ RET
+ CFI_ENDPROC
+
+ .size __kernel_vsyscall,.-__kernel_vsyscall
+ .previous
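
The prescription in the comment above, as a 32-bit-only sketch (build with -m32; __NR_getpid is 20 on i386, and __kernel_vsyscall preserves the other registers as shown):

    #include <stdio.h>
    #include <sys/auxv.h>

    int main(void)
    {
        void *vsyscall = (void *)getauxval(AT_SYSINFO);
        long nr = 20; /* __NR_getpid on i386 */

        /* Indirect call through AT_SYSINFO: %eax carries the syscall
         * number in and the return value out. */
        asm volatile("call *%[vsys]"
                     : "+a" (nr)
                     : [vsys] "rm" (vsyscall)
                     : "memory");
        printf("pid=%ld\n", nr);
        return 0;
    }
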
diff --git a/arch/x86/entry/vdso/vdso32/vclock_gettime.c b/arch/x86/entry/vdso/vdso32/vclock_gettime.c
new file mode 100644
index 000000000000..86981decfea8
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32/vclock_gettime.c
@@ -0,0 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
+#define BUILD_VDSO32
+#include "fake_32bit_build.h"
+#include "../vclock_gettime.c"
diff --git a/arch/x86/vdso/vdso32/vdso32.lds.S b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
index 976124bb5f92..8a3be07006bb 100644
--- a/arch/x86/vdso/vdso32/vdso32.lds.S
+++ b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
@@ -1,14 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Linker script for 32-bit vDSO.
* We #include the file to define the layout details.
- * Here we only choose the prelinked virtual address.
*
* This file defines the version script giving the user-exported symbols in
- * the DSO. We can define local symbols here called VDSO* to make their
- * values visible using the asm-x86/vdso.h macros from the kernel proper.
+ * the DSO.
*/
-#define VDSO_PRELINK 0
+#include <asm/page.h>
+
+#define BUILD_VDSO32
+
#include "../vdso-layout.lds.S"
/* The ELF entry point can be used to set the AT_SYSINFO value. */
@@ -19,6 +21,16 @@ ENTRY(__kernel_vsyscall);
*/
VERSION
{
+ LINUX_2.6 {
+ global:
+ __vdso_clock_gettime;
+ __vdso_gettimeofday;
+ __vdso_time;
+ __vdso_clock_getres;
+ __vdso_clock_gettime64;
+ __vdso_getcpu;
+ };
+
LINUX_2.5 {
global:
__kernel_vsyscall;
@@ -27,11 +39,3 @@ VERSION
local: *;
};
}
-
-/*
- * Symbols we define here called VDSO* get their values into vdso32-syms.h.
- */
-VDSO32_PRELINK = VDSO_PRELINK;
-VDSO32_vsyscall = __kernel_vsyscall;
-VDSO32_sigreturn = __kernel_sigreturn;
-VDSO32_rt_sigreturn = __kernel_rt_sigreturn;
diff --git a/arch/x86/entry/vdso/vdso32/vgetcpu.c b/arch/x86/entry/vdso/vdso32/vgetcpu.c
new file mode 100644
index 000000000000..3a9791f5e998
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32/vgetcpu.c
@@ -0,0 +1,3 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "fake_32bit_build.h"
+#include "../vgetcpu.c"
diff --git a/arch/x86/entry/vdso/vdsox32.lds.S b/arch/x86/entry/vdso/vdsox32.lds.S
new file mode 100644
index 000000000000..16a8050a4fb6
--- /dev/null
+++ b/arch/x86/entry/vdso/vdsox32.lds.S
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Linker script for x32 vDSO.
+ * We #include the file to define the layout details.
+ *
+ * This file defines the version script giving the user-exported symbols in
+ * the DSO.
+ */
+
+#define BUILD_VDSOX32
+
+#include "vdso-layout.lds.S"
+
+/*
+ * This controls what userland symbols we export from the vDSO.
+ */
+VERSION {
+ LINUX_2.6 {
+ global:
+ __vdso_clock_gettime;
+ __vdso_gettimeofday;
+ __vdso_getcpu;
+ __vdso_time;
+ __vdso_clock_getres;
+ local: *;
+ };
+}
diff --git a/arch/x86/entry/vdso/vgetcpu.c b/arch/x86/entry/vdso/vgetcpu.c
new file mode 100644
index 000000000000..e4640306b2e3
--- /dev/null
+++ b/arch/x86/entry/vdso/vgetcpu.c
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2006 Andi Kleen, SUSE Labs.
+ *
+ * Fast user context implementation of getcpu()
+ */
+
+#include <linux/kernel.h>
+#include <linux/getcpu.h>
+#include <asm/segment.h>
+#include <vdso/processor.h>
+
+notrace long
+__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
+{
+ vdso_read_cpunode(cpu, node);
+
+ return 0;
+}
+
+long getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
+ __attribute__((weak, alias("__vdso_getcpu")));
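
Userspace normally reaches this through sched_getcpu(3), which glibc wires to the vDSO symbol; the simplest consumer:

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <sched.h>

    int main(void)
    {
        /* Backed by __vdso_getcpu on x86; no syscall on the fast path. */
        printf("running on CPU %d\n", sched_getcpu());
        return 0;
    }
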
diff --git a/arch/x86/entry/vdso/vgetrandom-chacha.S b/arch/x86/entry/vdso/vgetrandom-chacha.S
new file mode 100644
index 000000000000..bcba5639b8ee
--- /dev/null
+++ b/arch/x86/entry/vdso/vgetrandom-chacha.S
@@ -0,0 +1,178 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+.section .rodata, "a"
+.align 16
+CONSTANTS: .octa 0x6b20657479622d323320646e61707865
+.text
+
+/*
+ * Very basic SSE2 implementation of ChaCha20. Produces a given positive number
+ * of blocks of output with a nonce of 0, taking an input key and 8-byte
+ * counter. Importantly, it does not spill to the stack. Its arguments are:
+ *
+ * rdi: output bytes
+ * rsi: 32-byte key input
+ * rdx: 8-byte counter input/output
+ * rcx: number of 64-byte blocks to write to output
+ */
+SYM_FUNC_START(__arch_chacha20_blocks_nostack)
+
+.set output, %rdi
+.set key, %rsi
+.set counter, %rdx
+.set nblocks, %rcx
+.set i, %al
+/* xmm registers are *not* callee-save. */
+.set temp, %xmm0
+.set state0, %xmm1
+.set state1, %xmm2
+.set state2, %xmm3
+.set state3, %xmm4
+.set copy0, %xmm5
+.set copy1, %xmm6
+.set copy2, %xmm7
+.set copy3, %xmm8
+.set one, %xmm9
+
+ /* copy0 = "expand 32-byte k" */
+ movaps CONSTANTS(%rip),copy0
+ /* copy1,copy2 = key */
+ movups 0x00(key),copy1
+ movups 0x10(key),copy2
+ /* copy3 = counter || zero nonce */
+ movq 0x00(counter),copy3
+ /* one = 1 || 0 */
+ movq $1,%rax
+ movq %rax,one
+
+.Lblock:
+ /* state0,state1,state2,state3 = copy0,copy1,copy2,copy3 */
+ movdqa copy0,state0
+ movdqa copy1,state1
+ movdqa copy2,state2
+ movdqa copy3,state3
+
+ movb $10,i
+.Lpermute:
+ /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
+ paddd state1,state0
+ pxor state0,state3
+ movdqa state3,temp
+ pslld $16,temp
+ psrld $16,state3
+ por temp,state3
+
+ /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
+ paddd state3,state2
+ pxor state2,state1
+ movdqa state1,temp
+ pslld $12,temp
+ psrld $20,state1
+ por temp,state1
+
+ /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
+ paddd state1,state0
+ pxor state0,state3
+ movdqa state3,temp
+ pslld $8,temp
+ psrld $24,state3
+ por temp,state3
+
+ /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
+ paddd state3,state2
+ pxor state2,state1
+ movdqa state1,temp
+ pslld $7,temp
+ psrld $25,state1
+ por temp,state1
+
+ /* state1[0,1,2,3] = state1[1,2,3,0] */
+ pshufd $0x39,state1,state1
+ /* state2[0,1,2,3] = state2[2,3,0,1] */
+ pshufd $0x4e,state2,state2
+ /* state3[0,1,2,3] = state3[3,0,1,2] */
+ pshufd $0x93,state3,state3
+
+ /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
+ paddd state1,state0
+ pxor state0,state3
+ movdqa state3,temp
+ pslld $16,temp
+ psrld $16,state3
+ por temp,state3
+
+ /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
+ paddd state3,state2
+ pxor state2,state1
+ movdqa state1,temp
+ pslld $12,temp
+ psrld $20,state1
+ por temp,state1
+
+ /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
+ paddd state1,state0
+ pxor state0,state3
+ movdqa state3,temp
+ pslld $8,temp
+ psrld $24,state3
+ por temp,state3
+
+ /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
+ paddd state3,state2
+ pxor state2,state1
+ movdqa state1,temp
+ pslld $7,temp
+ psrld $25,state1
+ por temp,state1
+
+ /* state1[0,1,2,3] = state1[3,0,1,2] */
+ pshufd $0x93,state1,state1
+ /* state2[0,1,2,3] = state2[2,3,0,1] */
+ pshufd $0x4e,state2,state2
+ /* state3[0,1,2,3] = state3[1,2,3,0] */
+ pshufd $0x39,state3,state3
+
+ decb i
+ jnz .Lpermute
+
+ /* output0 = state0 + copy0 */
+ paddd copy0,state0
+ movups state0,0x00(output)
+ /* output1 = state1 + copy1 */
+ paddd copy1,state1
+ movups state1,0x10(output)
+ /* output2 = state2 + copy2 */
+ paddd copy2,state2
+ movups state2,0x20(output)
+ /* output3 = state3 + copy3 */
+ paddd copy3,state3
+ movups state3,0x30(output)
+
+ /* ++copy3.counter */
+ paddq one,copy3
+
+ /* output += 64, --nblocks */
+ addq $64,output
+ decq nblocks
+ jnz .Lblock
+
+ /* counter = copy3.counter */
+ movq copy3,0x00(counter)
+
+ /* Zero out the potentially sensitive regs; nothing later is guaranteed to overwrite them. */
+ pxor state0,state0
+ pxor state1,state1
+ pxor state2,state2
+ pxor state3,state3
+ pxor copy1,copy1
+ pxor copy2,copy2
+ pxor temp,temp
+
+ ret
+SYM_FUNC_END(__arch_chacha20_blocks_nostack)
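
For reference, the rotate-by-k sequences above (shift left k, shift right 32-k, OR) implement the standard ChaCha quarter-round on four lanes at once; a scalar C sketch of one quarter-round, not part of this diff:

    #include <stdint.h>

    static inline uint32_t rotl32(uint32_t v, int c)
    {
        return (v << c) | (v >> (32 - c));
    }

    /* One ChaCha quarter-round; the vector code applies this to four
     * columns (then four diagonals) at once, ten times over. */
    static void quarter_round(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
    {
        *a += *b; *d ^= *a; *d = rotl32(*d, 16);
        *c += *d; *b ^= *c; *b = rotl32(*b, 12);
        *a += *b; *d ^= *a; *d = rotl32(*d, 8);
        *c += *d; *b ^= *c; *b = rotl32(*b, 7);
    }
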
diff --git a/arch/x86/entry/vdso/vgetrandom.c b/arch/x86/entry/vdso/vgetrandom.c
new file mode 100644
index 000000000000..430862b8977c
--- /dev/null
+++ b/arch/x86/entry/vdso/vgetrandom.c
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+#include <linux/types.h>
+
+#include "../../../../lib/vdso/getrandom.c"
+
+ssize_t __vdso_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len)
+{
+ return __cvdso_getrandom(buffer, len, flags, opaque_state, opaque_len);
+}
+
+ssize_t getrandom(void *, size_t, unsigned int, void *, size_t)
+ __attribute__((weak, alias("__vdso_getrandom")));
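
Calling this entry point takes more than a symbol lookup: the caller must first query the per-thread opaque-state parameters and mmap a state buffer. A hedged sketch of that protocol, assuming struct vgetrandom_opaque_params from <linux/random.h>, with the vDSO symbol resolution (via AT_SYSINFO_EHDR) elided; vgetrandom stands in for the resolved __vdso_getrandom pointer:

    #include <stddef.h>
    #include <sys/types.h>
    #include <sys/mman.h>
    #include <linux/random.h>

    typedef ssize_t (*vgetrandom_t)(void *, size_t, unsigned int, void *, size_t);

    static ssize_t draw_bytes(vgetrandom_t vgetrandom, void *buf, size_t len)
    {
        struct vgetrandom_opaque_params params;
        void *state;

        /* opaque_len == ~0UL with a NULL buffer queries the parameters. */
        if (vgetrandom(NULL, 0, 0, &params, ~0UL))
            return -1;

        state = mmap(NULL, params.size_of_opaque_state, params.mmap_prot,
                     params.mmap_flags, -1, 0);
        if (state == MAP_FAILED)
            return -1;

        return vgetrandom(buf, len, 0, state, params.size_of_opaque_state);
    }
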
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
new file mode 100644
index 000000000000..afe105b2f907
--- /dev/null
+++ b/arch/x86/entry/vdso/vma.c
@@ -0,0 +1,285 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2007 Andi Kleen, SUSE Labs.
+ *
+ * This contains most of the x86 vDSO kernel-side code.
+ */
+#include <linux/mm.h>
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/random.h>
+#include <linux/elf.h>
+#include <linux/cpu.h>
+#include <linux/ptrace.h>
+#include <linux/vdso_datastore.h>
+
+#include <asm/pvclock.h>
+#include <asm/vgtod.h>
+#include <asm/proto.h>
+#include <asm/vdso.h>
+#include <asm/tlb.h>
+#include <asm/page.h>
+#include <asm/desc.h>
+#include <asm/cpufeature.h>
+#include <asm/vdso/vsyscall.h>
+#include <clocksource/hyperv_timer.h>
+
+static_assert(VDSO_NR_PAGES + VDSO_NR_VCLOCK_PAGES == __VDSO_PAGES);
+
+unsigned int vclocks_used __read_mostly;
+
+#if defined(CONFIG_X86_64)
+unsigned int __read_mostly vdso64_enabled = 1;
+#endif
+
+int __init init_vdso_image(const struct vdso_image *image)
+{
+ BUILD_BUG_ON(VDSO_CLOCKMODE_MAX >= 32);
+ BUG_ON(image->size % PAGE_SIZE != 0);
+
+ apply_alternatives((struct alt_instr *)(image->data + image->alt),
+ (struct alt_instr *)(image->data + image->alt +
+ image->alt_len));
+
+ return 0;
+}
+
+struct linux_binprm;
+
+static vm_fault_t vdso_fault(const struct vm_special_mapping *sm,
+ struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ const struct vdso_image *image = vma->vm_mm->context.vdso_image;
+
+ if (!image || (vmf->pgoff << PAGE_SHIFT) >= image->size)
+ return VM_FAULT_SIGBUS;
+
+ vmf->page = virt_to_page(image->data + (vmf->pgoff << PAGE_SHIFT));
+ get_page(vmf->page);
+ return 0;
+}
+
+static void vdso_fix_landing(const struct vdso_image *image,
+ struct vm_area_struct *new_vma)
+{
+ if (in_ia32_syscall() && image == &vdso_image_32) {
+ struct pt_regs *regs = current_pt_regs();
+ unsigned long vdso_land = image->sym_int80_landing_pad;
+ unsigned long old_land_addr = vdso_land +
+ (unsigned long)current->mm->context.vdso;
+
+		/* Fix up the userspace landing address; see do_fast_syscall_32(). */
+ if (regs->ip == old_land_addr)
+ regs->ip = new_vma->vm_start + vdso_land;
+ }
+}
+
+static int vdso_mremap(const struct vm_special_mapping *sm,
+ struct vm_area_struct *new_vma)
+{
+ const struct vdso_image *image = current->mm->context.vdso_image;
+
+ vdso_fix_landing(image, new_vma);
+ current->mm->context.vdso = (void __user *)new_vma->vm_start;
+
+ return 0;
+}
+
+static vm_fault_t vvar_vclock_fault(const struct vm_special_mapping *sm,
+ struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ switch (vmf->pgoff) {
+#ifdef CONFIG_PARAVIRT_CLOCK
+ case VDSO_PAGE_PVCLOCK_OFFSET:
+ {
+ struct pvclock_vsyscall_time_info *pvti =
+ pvclock_get_pvti_cpu0_va();
+
+ if (pvti && vclock_was_used(VDSO_CLOCKMODE_PVCLOCK))
+ return vmf_insert_pfn_prot(vma, vmf->address,
+ __pa(pvti) >> PAGE_SHIFT,
+ pgprot_decrypted(vma->vm_page_prot));
+ break;
+ }
+#endif /* CONFIG_PARAVIRT_CLOCK */
+#ifdef CONFIG_HYPERV_TIMER
+ case VDSO_PAGE_HVCLOCK_OFFSET:
+ {
+ unsigned long pfn = hv_get_tsc_pfn();
+ if (pfn && vclock_was_used(VDSO_CLOCKMODE_HVCLOCK))
+ return vmf_insert_pfn(vma, vmf->address, pfn);
+ break;
+ }
+#endif /* CONFIG_HYPERV_TIMER */
+ }
+
+ return VM_FAULT_SIGBUS;
+}
+
+static const struct vm_special_mapping vdso_mapping = {
+ .name = "[vdso]",
+ .fault = vdso_fault,
+ .mremap = vdso_mremap,
+};
+static const struct vm_special_mapping vvar_vclock_mapping = {
+ .name = "[vvar_vclock]",
+ .fault = vvar_vclock_fault,
+};
+
+/*
+ * Add vdso and vvar mappings to current process.
+ * @image - blob to map
+ * @addr - request a specific address (zero to map at free addr)
+ */
+static int map_vdso(const struct vdso_image *image, unsigned long addr)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ unsigned long text_start;
+ int ret = 0;
+
+ if (mmap_write_lock_killable(mm))
+ return -EINTR;
+
+ addr = get_unmapped_area(NULL, addr,
+ image->size + __VDSO_PAGES * PAGE_SIZE, 0, 0);
+ if (IS_ERR_VALUE(addr)) {
+ ret = addr;
+ goto up_fail;
+ }
+
+ text_start = addr + __VDSO_PAGES * PAGE_SIZE;
+
+ /*
+ * MAYWRITE to allow gdb to COW and set breakpoints
+ */
+ vma = _install_special_mapping(mm,
+ text_start,
+ image->size,
+ VM_READ|VM_EXEC|
+ VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
+ VM_SEALED_SYSMAP,
+ &vdso_mapping);
+
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
+ goto up_fail;
+ }
+
+ vma = vdso_install_vvar_mapping(mm, addr);
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
+ do_munmap(mm, text_start, image->size, NULL);
+ goto up_fail;
+ }
+
+ vma = _install_special_mapping(mm,
+ VDSO_VCLOCK_PAGES_START(addr),
+ VDSO_NR_VCLOCK_PAGES * PAGE_SIZE,
+ VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP|
+ VM_PFNMAP|VM_SEALED_SYSMAP,
+ &vvar_vclock_mapping);
+
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
+ do_munmap(mm, text_start, image->size, NULL);
+ do_munmap(mm, addr, image->size, NULL);
+ goto up_fail;
+ }
+
+ current->mm->context.vdso = (void __user *)text_start;
+ current->mm->context.vdso_image = image;
+
+up_fail:
+ mmap_write_unlock(mm);
+ return ret;
+}
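A sketch of the address-space layout map_vdso() establishes (derived from the code above; the exact split of the __VDSO_PAGES region is an assumption based on VDSO_VCLOCK_PAGES_START()):

	/*
	 * addr                                vvar data pages (VDSO_NR_PAGES)
	 * VDSO_VCLOCK_PAGES_START(addr)       vclock PFN pages (VDSO_NR_VCLOCK_PAGES)
	 * text_start = addr + __VDSO_PAGES * PAGE_SIZE
	 *                                     vDSO text, image->size bytes
	 */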
+
+int map_vdso_once(const struct vdso_image *image, unsigned long addr)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, 0);
+
+ mmap_write_lock(mm);
+ /*
+	 * Check if we have already mapped the vdso blob and fail if so, to
+	 * prevent userspace from abusing install_special_mapping(), which
+	 * may not get accounting and rlimits right.
+	 * We could search for a VMA near context.vdso, but it's a slowpath,
+ * so let's explicitly check all VMAs to be completely sure.
+ */
+ for_each_vma(vmi, vma) {
+ if (vma_is_special_mapping(vma, &vdso_mapping) ||
+ vma_is_special_mapping(vma, &vdso_vvar_mapping) ||
+ vma_is_special_mapping(vma, &vvar_vclock_mapping)) {
+ mmap_write_unlock(mm);
+ return -EEXIST;
+ }
+ }
+ mmap_write_unlock(mm);
+
+ return map_vdso(image, addr);
+}
+
+static int load_vdso32(void)
+{
+ if (vdso32_enabled != 1) /* Other values all mean "disabled" */
+ return 0;
+
+ return map_vdso(&vdso_image_32, 0);
+}
+
+int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
+{
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ if (!vdso64_enabled)
+ return 0;
+
+ return map_vdso(&vdso_image_64, 0);
+ }
+
+ return load_vdso32();
+}
+
+#ifdef CONFIG_COMPAT
+int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
+ int uses_interp, bool x32)
+{
+ if (IS_ENABLED(CONFIG_X86_X32_ABI) && x32) {
+ if (!vdso64_enabled)
+ return 0;
+ return map_vdso(&vdso_image_x32, 0);
+ }
+
+ if (IS_ENABLED(CONFIG_IA32_EMULATION))
+ return load_vdso32();
+
+ return 0;
+}
+#endif
+
+bool arch_syscall_is_vdso_sigreturn(struct pt_regs *regs)
+{
+ const struct vdso_image *image = current->mm->context.vdso_image;
+ unsigned long vdso = (unsigned long) current->mm->context.vdso;
+
+ if (in_ia32_syscall() && image == &vdso_image_32) {
+ if (regs->ip == vdso + image->sym_vdso32_sigreturn_landing_pad ||
+ regs->ip == vdso + image->sym_vdso32_rt_sigreturn_landing_pad)
+ return true;
+ }
+ return false;
+}
+
+#ifdef CONFIG_X86_64
+static __init int vdso_setup(char *s)
+{
+ vdso64_enabled = simple_strtoul(s, NULL, 0);
+ return 1;
+}
+__setup("vdso=", vdso_setup);
+#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/entry/vdso/vsgx.S b/arch/x86/entry/vdso/vsgx.S
new file mode 100644
index 000000000000..37a3d4c02366
--- /dev/null
+++ b/arch/x86/entry/vdso/vsgx.S
@@ -0,0 +1,150 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/linkage.h>
+#include <asm/errno.h>
+#include <asm/enclu.h>
+
+#include "extable.h"
+
+/* Relative to %rbp. */
+#define SGX_ENCLAVE_OFFSET_OF_RUN 16
+
+/* The offsets relative to struct sgx_enclave_run. */
+#define SGX_ENCLAVE_RUN_TCS 0
+#define SGX_ENCLAVE_RUN_LEAF 8
+#define SGX_ENCLAVE_RUN_EXCEPTION_VECTOR 12
+#define SGX_ENCLAVE_RUN_EXCEPTION_ERROR_CODE 14
+#define SGX_ENCLAVE_RUN_EXCEPTION_ADDR 16
+#define SGX_ENCLAVE_RUN_USER_HANDLER 24
+#define SGX_ENCLAVE_RUN_USER_DATA 32 /* not used */
+#define SGX_ENCLAVE_RUN_RESERVED_START 40
+#define SGX_ENCLAVE_RUN_RESERVED_END 256
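These offsets mirror the layout of struct sgx_enclave_run from the SGX UAPI; the sketch below infers field sizes from the offsets above (field names follow the UAPI header, but treat this as illustrative):

	struct sgx_enclave_run_sketch {		/* illustrative only */
		__u64 tcs;			/* offset  0 */
		__u32 function;			/* offset  8, the "leaf" */
		__u16 exception_vector;		/* offset 12 */
		__u16 exception_error_code;	/* offset 14 */
		__u64 exception_addr;		/* offset 16 */
		__u64 user_handler;		/* offset 24 */
		__u64 user_data;		/* offset 32, not used */
		__u8  reserved[256 - 40];	/* offsets 40..255, must be zero */
	};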
+
+.code64
+.section .text, "ax"
+
+SYM_FUNC_START(__vdso_sgx_enter_enclave)
+ /* Prolog */
+ .cfi_startproc
+ push %rbp
+ .cfi_adjust_cfa_offset 8
+ .cfi_rel_offset %rbp, 0
+ mov %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+ push %rbx
+ .cfi_rel_offset %rbx, -8
+
+ mov %ecx, %eax
+.Lenter_enclave:
+ /* EENTER <= function <= ERESUME */
+ cmp $EENTER, %eax
+ jb .Linvalid_input
+ cmp $ERESUME, %eax
+ ja .Linvalid_input
+
+ mov SGX_ENCLAVE_OFFSET_OF_RUN(%rbp), %rcx
+
+ /* Validate that the reserved area contains only zeros. */
+ mov $SGX_ENCLAVE_RUN_RESERVED_START, %rbx
+1:
+ cmpq $0, (%rcx, %rbx)
+ jne .Linvalid_input
+ add $8, %rbx
+ cmpq $SGX_ENCLAVE_RUN_RESERVED_END, %rbx
+ jne 1b
+
+ /* Load TCS and AEP */
+ mov SGX_ENCLAVE_RUN_TCS(%rcx), %rbx
+ lea .Lasync_exit_pointer(%rip), %rcx
+
+ /* Single ENCLU serving as both EENTER and AEP (ERESUME) */
+.Lasync_exit_pointer:
+.Lenclu_eenter_eresume:
+ enclu
+
+ /* EEXIT jumps here unless the enclave is doing something fancy. */
+ mov SGX_ENCLAVE_OFFSET_OF_RUN(%rbp), %rbx
+
+ /* Set exit_reason. */
+ movl $EEXIT, SGX_ENCLAVE_RUN_LEAF(%rbx)
+
+ /* Invoke userspace's exit handler if one was provided. */
+.Lhandle_exit:
+ cmpq $0, SGX_ENCLAVE_RUN_USER_HANDLER(%rbx)
+ jne .Linvoke_userspace_handler
+
+ /* Success, in the sense that ENCLU was attempted. */
+ xor %eax, %eax
+
+.Lout:
+ pop %rbx
+ leave
+ .cfi_def_cfa %rsp, 8
+ RET
+
+ /* The out-of-line code runs with the pre-leave stack frame. */
+ .cfi_def_cfa %rbp, 16
+
+.Linvalid_input:
+ mov $(-EINVAL), %eax
+ jmp .Lout
+
+.Lhandle_exception:
+ mov SGX_ENCLAVE_OFFSET_OF_RUN(%rbp), %rbx
+
+ /* Set the exception info. */
+ mov %eax, (SGX_ENCLAVE_RUN_LEAF)(%rbx)
+ mov %di, (SGX_ENCLAVE_RUN_EXCEPTION_VECTOR)(%rbx)
+ mov %si, (SGX_ENCLAVE_RUN_EXCEPTION_ERROR_CODE)(%rbx)
+ mov %rdx, (SGX_ENCLAVE_RUN_EXCEPTION_ADDR)(%rbx)
+ jmp .Lhandle_exit
+
+.Linvoke_userspace_handler:
+ /* Pass the untrusted RSP (at exit) to the callback via %rcx. */
+ mov %rsp, %rcx
+
+	/* Save the struct sgx_enclave_run pointer; %rbx is about to be clobbered. */
+ mov %rbx, %rax
+
+ /* Save the untrusted RSP offset in %rbx (non-volatile register). */
+ mov %rsp, %rbx
+ and $0xf, %rbx
+
+ /*
+ * Align stack per x86_64 ABI. Note, %rsp needs to be 16-byte aligned
+ * _after_ pushing the parameters on the stack, hence the bonus push.
+ */
+ and $-0x10, %rsp
+ push %rax
+
+	/* Push the struct sgx_enclave_run pointer as a param to the callback. */
+ push %rax
+
+ /* Clear RFLAGS.DF per x86_64 ABI */
+ cld
+
+ /*
+ * Load the callback pointer to %rax and lfence for LVI (load value
+ * injection) protection before making the call.
+ */
+ mov SGX_ENCLAVE_RUN_USER_HANDLER(%rax), %rax
+ lfence
+ call *%rax
+
+ /* Undo the post-exit %rsp adjustment. */
+ lea 0x10(%rsp, %rbx), %rsp
+
+ /*
+ * If the return from callback is zero or negative, return immediately,
+ * else re-execute ENCLU with the positive return value interpreted as
+ * the requested ENCLU function.
+ */
+ cmp $0, %eax
+ jle .Lout
+ jmp .Lenter_enclave
+
+ .cfi_endproc
+
+_ASM_VDSO_EXTABLE_HANDLE(.Lenclu_eenter_eresume, .Lhandle_exception)
+
+SYM_FUNC_END(__vdso_sgx_enter_enclave)
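From C, the entry point behaves like the prototype below, which is consistent with the assembly: the first three integer arguments pass straight through to the enclave in %rdi/%rsi/%rdx, the fourth (%rcx) selects the ENCLU function, and the seventh argument lands at 16(%rbp), i.e. SGX_ENCLAVE_OFFSET_OF_RUN. The call site is hypothetical:

	typedef int (*vdso_sgx_enter_enclave_t)(unsigned long rdi, unsigned long rsi,
						unsigned long rdx, unsigned int function,
						unsigned long r8, unsigned long r9,
						struct sgx_enclave_run *run);

	/* enter_enclave is assumed to have been resolved from the vDSO's
	 * __vdso_sgx_enter_enclave symbol; run.tcs points at the TCS page. */
	int ret = enter_enclave(0, 0, 0, EENTER, 0, 0, &run);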
diff --git a/arch/x86/entry/vsyscall/Makefile b/arch/x86/entry/vsyscall/Makefile
new file mode 100644
index 000000000000..93c1b3e949a7
--- /dev/null
+++ b/arch/x86/entry/vsyscall/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Makefile for the x86 low level vsyscall code
+#
+obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o
+
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
new file mode 100644
index 000000000000..6e6c0a740837
--- /dev/null
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -0,0 +1,383 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2012-2014 Andy Lutomirski <luto@amacapital.net>
+ *
+ * Based on the original implementation which is:
+ * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
+ * Copyright 2003 Andi Kleen, SuSE Labs.
+ *
+ * Parts of the original code have been moved to arch/x86/vdso/vma.c
+ *
+ * This file implements vsyscall emulation. vsyscalls are a legacy ABI:
+ * Userspace can request certain kernel services by calling fixed
+ * addresses. This concept is problematic:
+ *
+ * - It interferes with ASLR.
+ * - It's awkward to write code that lives in kernel addresses but is
+ * callable by userspace at fixed addresses.
+ * - The whole concept is impossible for 32-bit compat userspace.
+ * - UML cannot easily virtualize a vsyscall.
+ *
+ * As of mid-2014, I believe that there is no new userspace code that
+ * will use a vsyscall if the vDSO is present. I hope that there will
+ * soon be no new userspace code that will ever use a vsyscall.
+ *
+ * The code in this file emulates vsyscalls when notified of a page
+ * fault to a vsyscall address.
+ */
+
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/sched/signal.h>
+#include <linux/mm_types.h>
+#include <linux/syscalls.h>
+#include <linux/ratelimit.h>
+
+#include <asm/vsyscall.h>
+#include <asm/unistd.h>
+#include <asm/fixmap.h>
+#include <asm/traps.h>
+#include <asm/paravirt.h>
+
+#define CREATE_TRACE_POINTS
+#include "vsyscall_trace.h"
+
+static enum { EMULATE, XONLY, NONE } vsyscall_mode __ro_after_init =
+#ifdef CONFIG_LEGACY_VSYSCALL_NONE
+ NONE;
+#elif defined(CONFIG_LEGACY_VSYSCALL_XONLY)
+ XONLY;
+#else
+ #error VSYSCALL config is broken
+#endif
+
+static int __init vsyscall_setup(char *str)
+{
+ if (str) {
+ if (!strcmp("emulate", str))
+ vsyscall_mode = EMULATE;
+ else if (!strcmp("xonly", str))
+ vsyscall_mode = XONLY;
+ else if (!strcmp("none", str))
+ vsyscall_mode = NONE;
+ else
+ return -EINVAL;
+
+ return 0;
+ }
+
+ return -EINVAL;
+}
+early_param("vsyscall", vsyscall_setup);
+
+static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
+ const char *message)
+{
+ if (!show_unhandled_signals)
+ return;
+
+ printk_ratelimited("%s%s[%d] %s ip:%lx cs:%x sp:%lx ax:%lx si:%lx di:%lx\n",
+ level, current->comm, task_pid_nr(current),
+ message, regs->ip, regs->cs,
+ regs->sp, regs->ax, regs->si, regs->di);
+}
+
+static int addr_to_vsyscall_nr(unsigned long addr)
+{
+ int nr;
+
+ if ((addr & ~0xC00UL) != VSYSCALL_ADDR)
+ return -EINVAL;
+
+ nr = (addr & 0xC00UL) >> 10;
+ if (nr >= 3)
+ return -EINVAL;
+
+ return nr;
+}
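A worked example of the decoding (assuming the conventional VSYSCALL_ADDR of 0xffffffffff600000, with entries 1024 bytes apart as laid out in vsyscall_emu_64.S below):

	/*
	 * addr_to_vsyscall_nr(0xffffffffff600000) ==  0       (gettimeofday)
	 * addr_to_vsyscall_nr(0xffffffffff600400) ==  1       (time)
	 * addr_to_vsyscall_nr(0xffffffffff600800) ==  2       (getcpu)
	 * addr_to_vsyscall_nr(0xffffffffff600c00) == -EINVAL  (nr would be 3)
	 * addr_to_vsyscall_nr(0xffffffffff600008) == -EINVAL  (stray low bits)
	 */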
+
+static bool write_ok_or_segv(unsigned long ptr, size_t size)
+{
+ if (!access_ok((void __user *)ptr, size)) {
+ struct thread_struct *thread = &current->thread;
+
+ thread->error_code = X86_PF_USER | X86_PF_WRITE;
+ thread->cr2 = ptr;
+ thread->trap_nr = X86_TRAP_PF;
+
+ force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)ptr);
+ return false;
+ } else {
+ return true;
+ }
+}
+
+bool emulate_vsyscall(unsigned long error_code,
+ struct pt_regs *regs, unsigned long address)
+{
+ unsigned long caller;
+ int vsyscall_nr, syscall_nr, tmp;
+ long ret;
+ unsigned long orig_dx;
+
+ /* Write faults or kernel-privilege faults never get fixed up. */
+ if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER)
+ return false;
+
+ /*
+ * Assume that faults at regs->ip are because of an
+ * instruction fetch. Return early and avoid
+ * emulation for faults during data accesses:
+ */
+ if (address != regs->ip) {
+ /* Failed vsyscall read */
+ if (vsyscall_mode == EMULATE)
+ return false;
+
+ /*
+ * User code tried and failed to read the vsyscall page.
+ */
+ warn_bad_vsyscall(KERN_INFO, regs, "vsyscall read attempt denied -- look up the vsyscall kernel parameter if you need a workaround");
+ return false;
+ }
+
+ /*
+ * X86_PF_INSTR is only set when NX is supported. When
+ * available, use it to double-check that the emulation code
+ * is only being used for instruction fetches:
+ */
+ if (cpu_feature_enabled(X86_FEATURE_NX))
+ WARN_ON_ONCE(!(error_code & X86_PF_INSTR));
+
+ /*
+ * No point in checking CS -- the only way to get here is a user mode
+ * trap to a high address, which means that we're in 64-bit user code.
+ */
+
+ if (vsyscall_mode == NONE) {
+ warn_bad_vsyscall(KERN_INFO, regs,
+ "vsyscall attempted with vsyscall=none");
+ return false;
+ }
+
+ vsyscall_nr = addr_to_vsyscall_nr(address);
+
+ trace_emulate_vsyscall(vsyscall_nr);
+
+ if (vsyscall_nr < 0) {
+ warn_bad_vsyscall(KERN_WARNING, regs,
+ "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround");
+ goto sigsegv;
+ }
+
+ if (get_user(caller, (unsigned long __user *)regs->sp) != 0) {
+ warn_bad_vsyscall(KERN_WARNING, regs,
+ "vsyscall with bad stack (exploit attempt?)");
+ goto sigsegv;
+ }
+
+ /*
+ * Check for access_ok violations and find the syscall nr.
+ *
+ * NULL is a valid user pointer (in the access_ok sense) on 32-bit and
+ * 64-bit, so we don't need to special-case it here. For all the
+ * vsyscalls, NULL means "don't write anything" not "write it at
+ * address 0".
+ */
+ switch (vsyscall_nr) {
+ case 0:
+ if (!write_ok_or_segv(regs->di, sizeof(struct __kernel_old_timeval)) ||
+ !write_ok_or_segv(regs->si, sizeof(struct timezone))) {
+ ret = -EFAULT;
+ goto check_fault;
+ }
+
+ syscall_nr = __NR_gettimeofday;
+ break;
+
+ case 1:
+ if (!write_ok_or_segv(regs->di, sizeof(__kernel_old_time_t))) {
+ ret = -EFAULT;
+ goto check_fault;
+ }
+
+ syscall_nr = __NR_time;
+ break;
+
+ case 2:
+ if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
+ !write_ok_or_segv(regs->si, sizeof(unsigned))) {
+ ret = -EFAULT;
+ goto check_fault;
+ }
+
+ syscall_nr = __NR_getcpu;
+ break;
+ }
+
+ /*
+ * Handle seccomp. regs->ip must be the original value.
+ * See seccomp_send_sigsys and Documentation/userspace-api/seccomp_filter.rst.
+ *
+ * We could optimize the seccomp disabled case, but performance
+ * here doesn't matter.
+ */
+ regs->orig_ax = syscall_nr;
+ regs->ax = -ENOSYS;
+ tmp = secure_computing();
+ if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
+ warn_bad_vsyscall(KERN_DEBUG, regs,
+ "seccomp tried to change syscall nr or ip");
+ force_exit_sig(SIGSYS);
+ return true;
+ }
+ regs->orig_ax = -1;
+ if (tmp)
+ goto do_ret; /* skip requested */
+
+ /*
+ * With a real vsyscall, page faults cause SIGSEGV.
+ */
+ ret = -EFAULT;
+ switch (vsyscall_nr) {
+ case 0:
+ /* this decodes regs->di and regs->si on its own */
+ ret = __x64_sys_gettimeofday(regs);
+ break;
+
+ case 1:
+ /* this decodes regs->di on its own */
+ ret = __x64_sys_time(regs);
+ break;
+
+ case 2:
+ /* while we could clobber regs->dx, we didn't in the past... */
+ orig_dx = regs->dx;
+ regs->dx = 0;
+ /* this decodes regs->di, regs->si and regs->dx on its own */
+ ret = __x64_sys_getcpu(regs);
+ regs->dx = orig_dx;
+ break;
+ }
+
+check_fault:
+ if (ret == -EFAULT) {
+ /* Bad news -- userspace fed a bad pointer to a vsyscall. */
+ warn_bad_vsyscall(KERN_INFO, regs,
+ "vsyscall fault (exploit attempt?)");
+ goto sigsegv;
+ }
+
+ regs->ax = ret;
+
+do_ret:
+ /* Emulate a ret instruction. */
+ regs->ip = caller;
+ regs->sp += 8;
+ return true;
+
+sigsegv:
+ force_sig(SIGSEGV);
+ return true;
+}
+
+/*
+ * A pseudo VMA to allow ptrace access for the vsyscall page. This only
+ * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
+ * not need special handling anymore:
+ */
+static const char *gate_vma_name(struct vm_area_struct *vma)
+{
+ return "[vsyscall]";
+}
+static const struct vm_operations_struct gate_vma_ops = {
+ .name = gate_vma_name,
+};
+static struct vm_area_struct gate_vma __ro_after_init = {
+ .vm_start = VSYSCALL_ADDR,
+ .vm_end = VSYSCALL_ADDR + PAGE_SIZE,
+ .vm_page_prot = PAGE_READONLY_EXEC,
+ .vm_flags = VM_READ | VM_EXEC,
+ .vm_ops = &gate_vma_ops,
+};
+
+struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
+{
+#ifdef CONFIG_COMPAT
+ if (!mm || !test_bit(MM_CONTEXT_HAS_VSYSCALL, &mm->context.flags))
+ return NULL;
+#endif
+ if (vsyscall_mode == NONE)
+ return NULL;
+ return &gate_vma;
+}
+
+int in_gate_area(struct mm_struct *mm, unsigned long addr)
+{
+ struct vm_area_struct *vma = get_gate_vma(mm);
+
+ if (!vma)
+ return 0;
+
+ return (addr >= vma->vm_start) && (addr < vma->vm_end);
+}
+
+/*
+ * Use this when you have no reliable mm, typically from interrupt
+ * context. It is less reliable than using a task's mm and may give
+ * false positives.
+ */
+int in_gate_area_no_mm(unsigned long addr)
+{
+ return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR;
+}
+
+/*
+ * The VSYSCALL page is the only user-accessible page in the kernel address
+ * range. Normally, the kernel page tables can have _PAGE_USER clear, but
+ * the tables covering VSYSCALL_ADDR need _PAGE_USER set if vsyscalls
+ * are enabled.
+ *
+ * Some day we may create a "minimal" vsyscall mode in which we emulate
+ * vsyscalls but leave the page not present. If so, we skip calling
+ * this.
+ */
+void __init set_vsyscall_pgtable_user_bits(pgd_t *root)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ pgd = pgd_offset_pgd(root, VSYSCALL_ADDR);
+ set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
+ p4d = p4d_offset(pgd, VSYSCALL_ADDR);
+ set_p4d(p4d, __p4d(p4d_val(*p4d) | _PAGE_USER));
+ pud = pud_offset(p4d, VSYSCALL_ADDR);
+ set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER));
+ pmd = pmd_offset(pud, VSYSCALL_ADDR);
+ set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_USER));
+}
+
+void __init map_vsyscall(void)
+{
+ extern char __vsyscall_page;
+ unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
+
+ /*
+ * For full emulation, the page needs to exist for real. In
+ * execute-only mode, there is no PTE at all backing the vsyscall
+ * page.
+ */
+ if (vsyscall_mode == EMULATE) {
+ __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
+ PAGE_KERNEL_VVAR);
+ set_vsyscall_pgtable_user_bits(swapper_pg_dir);
+ }
+
+ if (vsyscall_mode == XONLY)
+ vm_flags_init(&gate_vma, VM_EXEC);
+
+ BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
+ (unsigned long)VSYSCALL_ADDR);
+}
diff --git a/arch/x86/entry/vsyscall/vsyscall_emu_64.S b/arch/x86/entry/vsyscall/vsyscall_emu_64.S
new file mode 100644
index 000000000000..ef2dd1827243
--- /dev/null
+++ b/arch/x86/entry/vsyscall/vsyscall_emu_64.S
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * vsyscall_emu_64.S: Vsyscall emulation page
+ *
+ * Copyright (c) 2011 Andy Lutomirski
+ */
+
+#include <linux/linkage.h>
+
+#include <asm/irq_vectors.h>
+#include <asm/page_types.h>
+#include <asm/unistd_64.h>
+
+__PAGE_ALIGNED_DATA
+ .globl __vsyscall_page
+ .balign PAGE_SIZE, 0xcc
+ .type __vsyscall_page, @object
+__vsyscall_page:
+
+ mov $__NR_gettimeofday, %rax
+ syscall
+ ret
+ int3
+
+ .balign 1024, 0xcc
+ mov $__NR_time, %rax
+ syscall
+ ret
+ int3
+
+ .balign 1024, 0xcc
+ mov $__NR_getcpu, %rax
+ syscall
+ ret
+ int3
+
+ .balign 4096, 0xcc
+
+ .size __vsyscall_page, 4096
diff --git a/arch/x86/entry/vsyscall/vsyscall_trace.h b/arch/x86/entry/vsyscall/vsyscall_trace.h
new file mode 100644
index 000000000000..3c3f9765a85c
--- /dev/null
+++ b/arch/x86/entry/vsyscall/vsyscall_trace.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM vsyscall
+
+#if !defined(__VSYSCALL_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __VSYSCALL_TRACE_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(emulate_vsyscall,
+
+ TP_PROTO(int nr),
+
+ TP_ARGS(nr),
+
+ TP_STRUCT__entry(__field(int, nr)),
+
+ TP_fast_assign(
+ __entry->nr = nr;
+ ),
+
+ TP_printk("nr = %d", __entry->nr)
+);
+
+#endif
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH ../../arch/x86/entry/vsyscall/
+#define TRACE_INCLUDE_FILE vsyscall_trace
+#include <trace/define_trace.h>
diff --git a/arch/x86/events/Kconfig b/arch/x86/events/Kconfig
new file mode 100644
index 000000000000..dabdf3d7bf84
--- /dev/null
+++ b/arch/x86/events/Kconfig
@@ -0,0 +1,55 @@
+# SPDX-License-Identifier: GPL-2.0
+menu "Performance monitoring"
+
+config PERF_EVENTS_INTEL_UNCORE
+ tristate "Intel uncore performance events"
+ depends on PERF_EVENTS && CPU_SUP_INTEL && PCI
+ default y
+ help
+ Include support for Intel uncore performance events. These are
+ available on NehalemEX and more modern processors.
+
+config PERF_EVENTS_INTEL_RAPL
+ tristate "Intel/AMD rapl performance events"
+ depends on PERF_EVENTS && (CPU_SUP_INTEL || CPU_SUP_AMD) && PCI
+ default y
+ help
+	  Include support for Intel and AMD RAPL performance events for power
+ monitoring on modern processors.
+
+config PERF_EVENTS_INTEL_CSTATE
+ tristate "Intel cstate performance events"
+ depends on PERF_EVENTS && CPU_SUP_INTEL && PCI
+ default y
+ help
+ Include support for Intel cstate performance events for power
+ monitoring on modern processors.
+
+config PERF_EVENTS_AMD_POWER
+ depends on PERF_EVENTS && CPU_SUP_AMD
+ tristate "AMD Processor Power Reporting Mechanism"
+ help
+ Provide power reporting mechanism support for AMD processors.
+	  Currently, it leverages the X86_FEATURE_ACC_POWER
+	  (CPUID Fn8000_0007_EDX[12]) interface to calculate the
+ average power consumption on Family 15h processors.
+
+config PERF_EVENTS_AMD_UNCORE
+ tristate "AMD Uncore performance events"
+ depends on PERF_EVENTS && CPU_SUP_AMD
+ default y
+ help
+ Include support for AMD uncore performance events for use with
+ e.g., perf stat -e amd_l3/.../,amd_df/.../.
+
+ To compile this driver as a module, choose M here: the
+ module will be called 'amd-uncore'.
+
+config PERF_EVENTS_AMD_BRS
+ depends on PERF_EVENTS && CPU_SUP_AMD
+ bool "AMD Zen3 Branch Sampling support"
+ help
+ Enable AMD Zen3 branch sampling support (BRS) which samples up to
+ 16 consecutive taken branches in registers.
+
+endmenu
diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile
new file mode 100644
index 000000000000..86a76efa8bb6
--- /dev/null
+++ b/arch/x86/events/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-y += core.o probe.o utils.o
+obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL) += rapl.o
+obj-y += amd/
+obj-$(CONFIG_X86_LOCAL_APIC) += msr.o
+obj-$(CONFIG_CPU_SUP_INTEL) += intel/
+obj-$(CONFIG_CPU_SUP_CENTAUR) += zhaoxin/
+obj-$(CONFIG_CPU_SUP_ZHAOXIN) += zhaoxin/
diff --git a/arch/x86/events/amd/Makefile b/arch/x86/events/amd/Makefile
new file mode 100644
index 000000000000..527d947eb76b
--- /dev/null
+++ b/arch/x86/events/amd/Makefile
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_CPU_SUP_AMD) += core.o lbr.o
+obj-$(CONFIG_PERF_EVENTS_AMD_BRS) += brs.o
+obj-$(CONFIG_PERF_EVENTS_AMD_POWER) += power.o
+obj-$(CONFIG_X86_LOCAL_APIC) += ibs.o
+obj-$(CONFIG_PERF_EVENTS_AMD_UNCORE) += amd-uncore.o
+amd-uncore-objs := uncore.o
+ifdef CONFIG_AMD_IOMMU
+obj-$(CONFIG_CPU_SUP_AMD) += iommu.o
+endif
diff --git a/arch/x86/events/amd/brs.c b/arch/x86/events/amd/brs.c
new file mode 100644
index 000000000000..06f35a6b58a5
--- /dev/null
+++ b/arch/x86/events/amd/brs.c
@@ -0,0 +1,432 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Implement support for AMD Fam19h Branch Sampling feature
+ * Based on specifications published in AMD PPR Fam19 Model 01
+ *
+ * Copyright 2021 Google LLC
+ * Contributed by Stephane Eranian <eranian@google.com>
+ */
+#include <linux/kernel.h>
+#include <linux/jump_label.h>
+#include <asm/msr.h>
+#include <asm/cpufeature.h>
+
+#include "../perf_event.h"
+
+#define BRS_POISON 0xFFFFFFFFFFFFFFFEULL /* mark limit of valid entries */
+
+/* Debug Extension Configuration register layout */
+union amd_debug_extn_cfg {
+ __u64 val;
+ struct {
+ __u64 rsvd0:2, /* reserved */
+ brsmen:1, /* branch sample enable */
+			rsvd4_3:2, /* reserved - must be 0x3 */
+ vb:1, /* valid branches recorded */
+ rsvd2:10, /* reserved */
+ msroff:4, /* index of next entry to write */
+ rsvd3:4, /* reserved */
+ pmc:3, /* #PMC holding the sampling event */
+ rsvd4:37; /* reserved */
+ };
+};
+
+static inline unsigned int brs_from(int idx)
+{
+ return MSR_AMD_SAMP_BR_FROM + 2 * idx;
+}
+
+static inline unsigned int brs_to(int idx)
+{
+ return MSR_AMD_SAMP_BR_FROM + 2 * idx + 1;
+}
+
+static __always_inline void set_debug_extn_cfg(u64 val)
+{
+ /* bits[4:3] must always be set to 11b */
+ native_wrmsrq(MSR_AMD_DBG_EXTN_CFG, val | 3ULL << 3);
+}
+
+static __always_inline u64 get_debug_extn_cfg(void)
+{
+ return native_rdmsrq(MSR_AMD_DBG_EXTN_CFG);
+}
+
+static bool __init amd_brs_detect(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_BRS))
+ return false;
+
+ switch (boot_cpu_data.x86) {
+ case 0x19: /* AMD Fam19h (Zen3) */
+ x86_pmu.lbr_nr = 16;
+
+ /* No hardware filtering supported */
+ x86_pmu.lbr_sel_map = NULL;
+ x86_pmu.lbr_sel_mask = 0;
+ break;
+ default:
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * The current BRS implementation supports neither branch type nor privilege
+ * level filtering, so this function simply enforces those limitations; no
+ * br_sel_map is needed. Software filtering is not supported either, because
+ * it would correlate poorly with the sampling period.
+ */
+static int amd_brs_setup_filter(struct perf_event *event)
+{
+ u64 type = event->attr.branch_sample_type;
+
+ /* No BRS support */
+ if (!x86_pmu.lbr_nr)
+ return -EOPNOTSUPP;
+
+ /* Can only capture all branches, i.e., no filtering */
+ if ((type & ~PERF_SAMPLE_BRANCH_PLM_ALL) != PERF_SAMPLE_BRANCH_ANY)
+ return -EINVAL;
+
+ return 0;
+}
+
+static inline int amd_is_brs_event(struct perf_event *e)
+{
+ return (e->hw.config & AMD64_RAW_EVENT_MASK) == AMD_FAM19H_BRS_EVENT;
+}
+
+int amd_brs_hw_config(struct perf_event *event)
+{
+ int ret = 0;
+
+ /*
+	 * Because BRS holds off the PMU interrupt until its buffer fills,
+	 * it is not suitable for counting mode.
+ */
+ if (!is_sampling_event(event))
+ return -EINVAL;
+
+ /*
+ * Due to the way BRS operates by holding the interrupt until
+ * lbr_nr entries have been captured, it does not make sense
+ * to allow sampling on BRS with an event that does not match
+ * what BRS is capturing, i.e., retired taken branches.
+ * Otherwise the correlation with the event's period is even
+ * more loose:
+ *
+ * With retired taken branch:
+ * Effective P = P + 16 + X
+ * With any other event:
+ * Effective P = P + Y + X
+ *
+ * Where X is the number of taken branches due to interrupt
+ * skid. Skid is large.
+ *
+ * Where Y is the occurrences of the event while BRS is
+ * capturing the lbr_nr entries.
+ *
+ * By using retired taken branches, we limit the impact on the
+ * Y variable. We know it cannot be more than the depth of
+ * BRS.
+ */
+ if (!amd_is_brs_event(event))
+ return -EINVAL;
+
+ /*
+ * BRS implementation does not work with frequency mode
+ * reprogramming of the period.
+ */
+ if (event->attr.freq)
+ return -EINVAL;
+ /*
+ * The kernel subtracts BRS depth from period, so it must
+ * be big enough.
+ */
+ if (event->attr.sample_period <= x86_pmu.lbr_nr)
+ return -EINVAL;
+
+ /*
+ * Check if we can allow PERF_SAMPLE_BRANCH_STACK
+ */
+ ret = amd_brs_setup_filter(event);
+
+ /* only set in case of success */
+ if (!ret)
+ event->hw.flags |= PERF_X86_EVENT_AMD_BRS;
+
+ return ret;
+}
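To make the period arithmetic in the comment above concrete (hypothetical numbers): with a 16-deep BRS and a requested period P of 1,000,000 retired taken branches, the interrupt is held until the buffer fills, so a sample effectively covers about 1,000,016 taken branches plus the interrupt skid X. With an unrelated event, the extra term Y is unbounded, which is why only the matching event is allowed.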
+
+/* tos = top of stack, i.e., last valid entry written */
+static inline int amd_brs_get_tos(union amd_debug_extn_cfg *cfg)
+{
+ /*
+	 * msroff is the index of the next entry to write, so the top of
+	 * stack is one below it. If BRS is full, msroff wraps back to 0.
+ */
+ return (cfg->msroff ? cfg->msroff : x86_pmu.lbr_nr) - 1;
+}
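A worked example, assuming x86_pmu.lbr_nr == 16:

	/*
	 * msroff == 5 -> entries 0..4 have been written, tos == 4
	 * msroff == 0 -> the buffer saturated and wrapped, tos == 15
	 */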
+
+/*
+ * Make sure we have a sane BRS offset to begin with,
+ * especially across kexec.
+ */
+void amd_brs_reset(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_BRS))
+ return;
+
+ /*
+ * Reset config
+ */
+ set_debug_extn_cfg(0);
+
+ /*
+ * Mark first entry as poisoned
+ */
+ wrmsrq(brs_to(0), BRS_POISON);
+}
+
+int __init amd_brs_init(void)
+{
+ if (!amd_brs_detect())
+ return -EOPNOTSUPP;
+
+ pr_cont("%d-deep BRS, ", x86_pmu.lbr_nr);
+
+ return 0;
+}
+
+void amd_brs_enable(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ union amd_debug_extn_cfg cfg;
+
+ /* Activate only on first user */
+ if (++cpuc->brs_active > 1)
+ return;
+
+ cfg.val = 0; /* reset all fields */
+ cfg.brsmen = 1; /* enable branch sampling */
+
+ /* Set enable bit */
+ set_debug_extn_cfg(cfg.val);
+}
+
+void amd_brs_enable_all(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ if (cpuc->lbr_users)
+ amd_brs_enable();
+}
+
+void amd_brs_disable(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ union amd_debug_extn_cfg cfg;
+
+ /* Check if active (could be disabled via x86_pmu_disable_all()) */
+ if (!cpuc->brs_active)
+ return;
+
+ /* Only disable for last user */
+ if (--cpuc->brs_active)
+ return;
+
+ /*
+ * Clear the brsmen bit but preserve the others as they contain
+ * useful state such as vb and msroff
+ */
+ cfg.val = get_debug_extn_cfg();
+
+ /*
+	 * If we come in on an interrupt with BRS full, the hardware will
+	 * have already stopped BRS; no need to issue the wrmsr again.
+ */
+ if (cfg.brsmen) {
+ cfg.brsmen = 0;
+ set_debug_extn_cfg(cfg.val);
+ }
+}
+
+void amd_brs_disable_all(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ if (cpuc->lbr_users)
+ amd_brs_disable();
+}
+
+static bool amd_brs_match_plm(struct perf_event *event, u64 to)
+{
+ int type = event->attr.branch_sample_type;
+ int plm_k = PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_HV;
+ int plm_u = PERF_SAMPLE_BRANCH_USER;
+
+ if (!(type & plm_k) && kernel_ip(to))
+ return 0;
+
+ if (!(type & plm_u) && !kernel_ip(to))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * Drain the BRS buffer into cpuc->lbr_entries. The caller must ensure
+ * amd_brs_inuse() is true before calling.
+ */
+void amd_brs_drain(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct perf_event *event = cpuc->events[0];
+ struct perf_branch_entry *br = cpuc->lbr_entries;
+ union amd_debug_extn_cfg cfg;
+ u32 i, nr = 0, num, tos, start;
+ u32 shift = 64 - boot_cpu_data.x86_virt_bits;
+
+ /*
+	 * The BRS event is forced onto PMC0, so check whether an event is
+	 * there. It is possible to have lbr_users > 0 with the event not
+	 * yet scheduled, due to long-latency PMU irq handling.
+ */
+ if (!event)
+ goto empty;
+
+ cfg.val = get_debug_extn_cfg();
+
+ /* Sanity check [0-x86_pmu.lbr_nr] */
+ if (WARN_ON_ONCE(cfg.msroff >= x86_pmu.lbr_nr))
+ goto empty;
+
+ /* No valid branch */
+ if (cfg.vb == 0)
+ goto empty;
+
+ /*
+	 * msroff points to the next entry to be written, so the most recent
+	 * entry has index msroff - 1. The BRS register buffer saturates, so
+	 * we know start <= tos and that we have to read from start to tos.
+ */
+ start = 0;
+ tos = amd_brs_get_tos(&cfg);
+
+ num = tos - start + 1;
+
+ /*
+ * BRS is only one pass (saturation) from MSROFF to depth-1
+ * MSROFF wraps to zero when buffer is full
+ */
+ for (i = 0; i < num; i++) {
+ u32 brs_idx = tos - i;
+ u64 from, to;
+
+ rdmsrq(brs_to(brs_idx), to);
+
+ /* Entry does not belong to us (as marked by kernel) */
+ if (to == BRS_POISON)
+ break;
+
+ /*
+ * Sign-extend SAMP_BR_TO to 64 bits, bits 61-63 are reserved.
+ * Necessary to generate proper virtual addresses suitable for
+ * symbolization
+ */
+ to = (u64)(((s64)to << shift) >> shift);
+
+ if (!amd_brs_match_plm(event, to))
+ continue;
+
+ rdmsrq(brs_from(brs_idx), from);
+
+ perf_clear_branch_entry_bitfields(br+nr);
+
+ br[nr].from = from;
+ br[nr].to = to;
+
+ nr++;
+ }
+empty:
+ /* Record number of sampled branches */
+ cpuc->lbr_stack.nr = nr;
+}
+
+/*
+ * Poison the most recent entry to prevent reuse by the next task;
+ * required because BRS entries are not tagged by PID.
+ */
+static void amd_brs_poison_buffer(void)
+{
+ union amd_debug_extn_cfg cfg;
+ unsigned int idx;
+
+ /* Get current state */
+ cfg.val = get_debug_extn_cfg();
+
+ /* idx is most recently written entry */
+ idx = amd_brs_get_tos(&cfg);
+
+ /* Poison target of entry */
+ wrmsrq(brs_to(idx), BRS_POISON);
+}
+
+/*
+ * On context switch in, we need to make sure no samples from previous user
+ * are left in the BRS.
+ *
+ * On ctxswin, sched_in = true, called after the PMU has started
+ * On ctxswout, sched_in = false, called before the PMU is stopped
+ */
+void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx,
+ struct task_struct *task, bool sched_in)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ /* no active users */
+ if (!cpuc->lbr_users)
+ return;
+
+ /*
+	 * On context switch in, we need to ensure we do not use entries
+	 * from a previous BRS user on that CPU; poisoning the buffer is
+	 * faster than resetting all entries.
+ */
+ if (sched_in)
+ amd_brs_poison_buffer();
+}
+
+/*
+ * called from ACPI processor_idle.c or acpi_pad.c
+ * with interrupts disabled
+ */
+void noinstr perf_amd_brs_lopwr_cb(bool lopwr_in)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ union amd_debug_extn_cfg cfg;
+
+ /*
+	 * On mwait in, we may end up in a non-C0 state. We must disable
+	 * branch sampling to avoid holding the NMI pending for too long.
+	 * We disable it in hardware, but keep the state in cpuc so we can
+	 * re-enable it later.
+	 *
+	 * The hardware will deliver the NMI if needed once brsmen is cleared.
+ */
+ if (cpuc->brs_active) {
+ cfg.val = get_debug_extn_cfg();
+ cfg.brsmen = !lopwr_in;
+ set_debug_extn_cfg(cfg.val);
+ }
+}
+
+DEFINE_STATIC_CALL_NULL(perf_lopwr_cb, perf_amd_brs_lopwr_cb);
+EXPORT_STATIC_CALL_TRAMP_GPL(perf_lopwr_cb);
+
+void __init amd_brs_lopwr_init(void)
+{
+ static_call_update(perf_lopwr_cb, perf_amd_brs_lopwr_cb);
+}
diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
new file mode 100644
index 000000000000..44656d2fb555
--- /dev/null
+++ b/arch/x86/events/amd/core.c
@@ -0,0 +1,1595 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/perf_event.h>
+#include <linux/jump_label.h>
+#include <linux/export.h>
+#include <linux/kvm_types.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/delay.h>
+#include <linux/jiffies.h>
+#include <asm/apicdef.h>
+#include <asm/apic.h>
+#include <asm/msr.h>
+#include <asm/nmi.h>
+
+#include "../perf_event.h"
+
+static DEFINE_PER_CPU(unsigned long, perf_nmi_tstamp);
+static unsigned long perf_nmi_window;
+
+/* AMD Event 0xFFF: Merge. Used with Large Increment per Cycle events */
+#define AMD_MERGE_EVENT ((0xFULL << 32) | 0xFFULL)
+#define AMD_MERGE_EVENT_ENABLE (AMD_MERGE_EVENT | ARCH_PERFMON_EVENTSEL_ENABLE)
+
+/* PMC Enable and Overflow bits for PerfCntrGlobal* registers */
+static u64 amd_pmu_global_cntr_mask __read_mostly;
+
+static __initconst const u64 amd_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
+ [ C(RESULT_MISS) ] = 0x0141, /* Data Cache Misses */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */
+ [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */
+ [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
+ [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
+ [ C(RESULT_MISS) ] = 0x0746, /* L1_DTLB_AND_L2_DLTB_MISS.ALL */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fetches */
+ [ C(RESULT_MISS) ] = 0x0385, /* L1_ITLB_AND_L2_ITLB_MISS.ALL */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */
+ [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0xb8e9, /* CPU Request to Memory, l+r */
+ [ C(RESULT_MISS) ] = 0x98e9, /* CPU Request to Memory, r */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+static __initconst const u64 amd_hw_cache_event_ids_f17h
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+[C(L1D)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x0040, /* Data Cache Accesses */
+ [C(RESULT_MISS)] = 0xc860, /* L2$ access from DC Miss */
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = 0,
+ [C(RESULT_MISS)] = 0,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = 0xff5a, /* h/w prefetch DC Fills */
+ [C(RESULT_MISS)] = 0,
+ },
+},
+[C(L1I)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x0080, /* Instruction cache fetches */
+ [C(RESULT_MISS)] = 0x0081, /* Instruction cache misses */
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = 0,
+ [C(RESULT_MISS)] = 0,
+ },
+},
+[C(LL)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0,
+ [C(RESULT_MISS)] = 0,
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = 0,
+ [C(RESULT_MISS)] = 0,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = 0,
+ [C(RESULT_MISS)] = 0,
+ },
+},
+[C(DTLB)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0xff45, /* All L2 DTLB accesses */
+ [C(RESULT_MISS)] = 0xf045, /* L2 DTLB misses (PT walks) */
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = 0,
+ [C(RESULT_MISS)] = 0,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = 0,
+ [C(RESULT_MISS)] = 0,
+ },
+},
+[C(ITLB)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x0084, /* L1 ITLB misses, L2 ITLB hits */
+ [C(RESULT_MISS)] = 0xff85, /* L1 ITLB misses, L2 misses */
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+},
+[C(BPU)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x00c2, /* Retired Branch Instr. */
+ [C(RESULT_MISS)] = 0x00c3, /* Retired Mispredicted BI */
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+},
+[C(NODE)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0,
+ [C(RESULT_MISS)] = 0,
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+},
+};
+
+/*
+ * AMD Performance Monitor K7 and later, up to and including Family 16h:
+ */
+static const u64 amd_perfmon_event_map[PERF_COUNT_HW_MAX] =
+{
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0x077d,
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x077e,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2,
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3,
+ [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */
+ [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */
+};
+
+/*
+ * AMD Performance Monitor Family 17h and later:
+ */
+static const u64 amd_zen1_perfmon_event_map[PERF_COUNT_HW_MAX] =
+{
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0xff60,
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x0964,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2,
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3,
+ [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x0287,
+ [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x0187,
+};
+
+static const u64 amd_zen2_perfmon_event_map[PERF_COUNT_HW_MAX] =
+{
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0xff60,
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x0964,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2,
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3,
+ [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00a9,
+};
+
+static const u64 amd_zen4_perfmon_event_map[PERF_COUNT_HW_MAX] =
+{
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0xff60,
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x0964,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2,
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3,
+ [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00a9,
+ [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x100000120,
+};
+
+static u64 amd_pmu_event_map(int hw_event)
+{
+ if (cpu_feature_enabled(X86_FEATURE_ZEN4) || boot_cpu_data.x86 >= 0x1a)
+ return amd_zen4_perfmon_event_map[hw_event];
+
+ if (cpu_feature_enabled(X86_FEATURE_ZEN2) || boot_cpu_data.x86 >= 0x19)
+ return amd_zen2_perfmon_event_map[hw_event];
+
+ if (cpu_feature_enabled(X86_FEATURE_ZEN1))
+ return amd_zen1_perfmon_event_map[hw_event];
+
+ return amd_perfmon_event_map[hw_event];
+}
+
+/*
+ * Previously calculated offsets
+ */
+static unsigned int event_offsets[X86_PMC_IDX_MAX] __read_mostly;
+static unsigned int count_offsets[X86_PMC_IDX_MAX] __read_mostly;
+
+/*
+ * Legacy CPUs:
+ * 4 counters starting at 0xc0010000 each offset by 1
+ *
+ * CPUs with core performance counter extensions:
+ * 6 counters starting at 0xc0010200 each offset by 2
+ */
+static inline int amd_pmu_addr_offset(int index, bool eventsel)
+{
+ int offset;
+
+ if (!index)
+ return index;
+
+ if (eventsel)
+ offset = event_offsets[index];
+ else
+ offset = count_offsets[index];
+
+ if (offset)
+ return offset;
+
+ if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
+ offset = index;
+ else
+ offset = index << 1;
+
+ if (eventsel)
+ event_offsets[index] = offset;
+ else
+ count_offsets[index] = offset;
+
+ return offset;
+}
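A worked example of the resulting MSR addresses (assuming the usual AMD bases: legacy event selects at 0xc0010000 and counters at 0xc0010004, core-extension event selects and counters interleaved from 0xc0010200/0xc0010201):

	/*
	 * index 3, legacy:       offset 3 -> eventsel 0xc0010003, counter 0xc0010007
	 * index 3, PERFCTR_CORE: offset 6 -> eventsel 0xc0010206, counter 0xc0010207
	 */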
+
+/*
+ * AMD64 events are detected based on their event codes.
+ */
+static inline unsigned int amd_get_event_code(struct hw_perf_event *hwc)
+{
+ return ((hwc->config >> 24) & 0x0f00) | (hwc->config & 0x00ff);
+}
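For example, a raw config of 0x100000003ULL encodes event code 0x103 (the value is purely illustrative):

	/*
	 * (0x100000003ULL >> 24) & 0x0f00 == 0x100   config bits [35:32] -> code bits [11:8]
	 *  0x100000003ULL        & 0x00ff == 0x003   code bits [7:0]
	 *
	 * amd_get_event_code() returns 0x103; a plain 0x003 would match the
	 * paired "Retired SSE/AVX FLOPs" case in amd_is_pair_event_code() below.
	 */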
+
+static inline bool amd_is_pair_event_code(struct hw_perf_event *hwc)
+{
+ if (!(x86_pmu.flags & PMU_FL_PAIR))
+ return false;
+
+ switch (amd_get_event_code(hwc)) {
+ case 0x003: return true; /* Retired SSE/AVX FLOPs */
+ default: return false;
+ }
+}
+
+DEFINE_STATIC_CALL_RET0(amd_pmu_branch_hw_config, *x86_pmu.hw_config);
+
+static int amd_core_hw_config(struct perf_event *event)
+{
+ if (event->attr.exclude_host && event->attr.exclude_guest)
+ /*
+ * When HO == GO == 1 the hardware treats that as GO == HO == 0
+ * and will count in both modes. We don't want to count in that
+ * case so we emulate no-counting by setting US = OS = 0.
+ */
+ event->hw.config &= ~(ARCH_PERFMON_EVENTSEL_USR |
+ ARCH_PERFMON_EVENTSEL_OS);
+ else if (event->attr.exclude_host)
+ event->hw.config |= AMD64_EVENTSEL_GUESTONLY;
+ else if (event->attr.exclude_guest)
+ event->hw.config |= AMD64_EVENTSEL_HOSTONLY;
+
+ if ((x86_pmu.flags & PMU_FL_PAIR) && amd_is_pair_event_code(&event->hw))
+ event->hw.flags |= PERF_X86_EVENT_PAIR;
+
+ if (has_branch_stack(event))
+ return static_call(amd_pmu_branch_hw_config)(event);
+
+ return 0;
+}
+
+static inline int amd_is_nb_event(struct hw_perf_event *hwc)
+{
+ return (hwc->config & 0xe0) == 0xe0;
+}
+
+static inline int amd_has_nb(struct cpu_hw_events *cpuc)
+{
+ struct amd_nb *nb = cpuc->amd_nb;
+
+ return nb && nb->nb_id != -1;
+}
+
+static int amd_pmu_hw_config(struct perf_event *event)
+{
+ int ret;
+
+ /* pass precise event sampling to ibs: */
+ if (event->attr.precise_ip && get_ibs_caps())
+ return forward_event_to_ibs(event);
+
+ if (has_branch_stack(event) && !x86_pmu.lbr_nr)
+ return -EOPNOTSUPP;
+
+ ret = x86_pmu_hw_config(event);
+ if (ret)
+ return ret;
+
+ if (event->attr.type == PERF_TYPE_RAW)
+ event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK;
+
+ return amd_core_hw_config(event);
+}
+
+static void __amd_put_nb_event_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+ struct amd_nb *nb = cpuc->amd_nb;
+ int i;
+
+ /*
+ * need to scan whole list because event may not have
+ * been assigned during scheduling
+ *
+ * no race condition possible because event can only
+ * be removed on one CPU at a time AND PMU is disabled
+ * when we come here
+ */
+ for_each_set_bit(i, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
+ struct perf_event *tmp = event;
+
+ if (try_cmpxchg(nb->owners + i, &tmp, NULL))
+ break;
+ }
+}
+
+/*
+ * AMD64 NorthBridge events need special treatment because
+ * counter access needs to be synchronized across all cores
+ * of a package. Refer to BKDG section 3.12.
+ *
+ * NB events are events measuring L3 cache and HyperTransport
+ * traffic. They are identified by an event code >= 0xe00.
+ * They measure events on the NorthBridge, which is shared
+ * by all cores on a package. NB events are counted on a
+ * shared set of counters. When an NB event is programmed
+ * in a counter, the data actually comes from a shared
+ * counter. Thus, access to those counters needs to be
+ * synchronized.
+ *
+ * We implement the synchronization such that no two cores
+ * can be measuring NB events using the same counters. Thus,
+ * we maintain a per-NB allocation table. The available slot
+ * is propagated using the event_constraint structure.
+ *
+ * We provide only one choice for each NB event based on
+ * the fact that only NB events have restrictions. Consequently,
+ * if a counter is available, there is a guarantee the NB event
+ * will be assigned to it. If no slot is available, an empty
+ * constraint is returned and scheduling will eventually fail
+ * for this event.
+ *
+ * Note that all cores attached to the same NB compete for the
+ * same counters to host NB events; this is why we use atomic
+ * ops. Some multi-chip CPUs may have more than one NB.
+ *
+ * Given that resources are allocated (cmpxchg), they must be
+ * eventually freed for others to use. This is accomplished by
+ * calling __amd_put_nb_event_constraints().
+ *
+ * Non-NB events are not impacted by this restriction.
+ */
+static struct event_constraint *
+__amd_get_nb_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
+ struct event_constraint *c)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct amd_nb *nb = cpuc->amd_nb;
+ struct perf_event *old;
+ int idx, new = -1;
+
+ if (!c)
+ c = &unconstrained;
+
+ if (cpuc->is_fake)
+ return c;
+
+ /*
+	 * Detect if the event is already present and, if so, reuse its slot.
+	 * This cannot be merged with the actual allocation below because of
+	 * possible holes in the constraint mask.
+	 *
+	 * An event can already be present yet not assigned (in hwc->idx)
+	 * because of successive calls to x86_schedule_events() from
+	 * hw_perf_group_sched_in() without hw_perf_enable().
+ */
+ for_each_set_bit(idx, c->idxmsk, x86_pmu_max_num_counters(NULL)) {
+ if (new == -1 || hwc->idx == idx)
+ /* assign free slot, prefer hwc->idx */
+ old = cmpxchg(nb->owners + idx, NULL, event);
+ else if (nb->owners[idx] == event)
+ /* event already present */
+ old = event;
+ else
+ continue;
+
+ if (old && old != event)
+ continue;
+
+ /* reassign to this slot */
+ if (new != -1)
+ cmpxchg(nb->owners + new, event, NULL);
+ new = idx;
+
+ /* already present, reuse */
+ if (old == event)
+ break;
+ }
+
+ if (new == -1)
+ return &emptyconstraint;
+
+ return &nb->event_constraints[new];
+}
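To illustrate the claim protocol from the comment above, a hypothetical race between two cores, A and B, on the same NB with four shared counters:

	/*
	 * A: cmpxchg(&nb->owners[0], NULL, evA) returns NULL -> A owns slot 0
	 * B: cmpxchg(&nb->owners[0], NULL, evB) returns evA  -> taken, B moves on
	 * B: cmpxchg(&nb->owners[1], NULL, evB) returns NULL -> B owns slot 1
	 *
	 * Each winner is handed &nb->event_constraints[slot], whose idxmsk
	 * contains only that slot, so the scheduler must place the event there.
	 */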
+
+static struct amd_nb *amd_alloc_nb(int cpu)
+{
+ struct amd_nb *nb;
+ int i;
+
+ nb = kzalloc_node(sizeof(struct amd_nb), GFP_KERNEL, cpu_to_node(cpu));
+ if (!nb)
+ return NULL;
+
+ nb->nb_id = -1;
+
+ /*
+ * initialize all possible NB constraints
+ */
+ for_each_set_bit(i, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
+ __set_bit(i, nb->event_constraints[i].idxmsk);
+ nb->event_constraints[i].weight = 1;
+ }
+ return nb;
+}
+
+typedef void (amd_pmu_branch_reset_t)(void);
+DEFINE_STATIC_CALL_NULL(amd_pmu_branch_reset, amd_pmu_branch_reset_t);
+
+static void amd_pmu_cpu_reset(int cpu)
+{
+ if (x86_pmu.lbr_nr)
+ static_call(amd_pmu_branch_reset)();
+
+ if (x86_pmu.version < 2)
+ return;
+
+ /* Clear enable bits i.e. PerfCntrGlobalCtl.PerfCntrEn */
+ wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 0);
+
+ /*
+	 * Clear freeze and overflow bits, i.e. PerfCntrGlobalStatus.LbrFreeze
+	 * and PerfCntrGlobalStatus.PerfCntrOvfl
+ */
+ wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR,
+ GLOBAL_STATUS_LBRS_FROZEN | amd_pmu_global_cntr_mask);
+}
+
+static int amd_pmu_cpu_prepare(int cpu)
+{
+ struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+
+ cpuc->lbr_sel = kzalloc_node(sizeof(struct er_account), GFP_KERNEL,
+ cpu_to_node(cpu));
+ if (!cpuc->lbr_sel)
+ return -ENOMEM;
+
+ WARN_ON_ONCE(cpuc->amd_nb);
+
+ if (!x86_pmu.amd_nb_constraints)
+ return 0;
+
+ cpuc->amd_nb = amd_alloc_nb(cpu);
+ if (cpuc->amd_nb)
+ return 0;
+
+ kfree(cpuc->lbr_sel);
+ cpuc->lbr_sel = NULL;
+
+ return -ENOMEM;
+}
+
+static void amd_pmu_cpu_starting(int cpu)
+{
+ struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+ void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED];
+ struct amd_nb *nb;
+ int i, nb_id;
+
+ cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;
+ amd_pmu_cpu_reset(cpu);
+
+ if (!x86_pmu.amd_nb_constraints)
+ return;
+
+ nb_id = topology_amd_node_id(cpu);
+ WARN_ON_ONCE(nb_id == BAD_APICID);
+
+ for_each_online_cpu(i) {
+ nb = per_cpu(cpu_hw_events, i).amd_nb;
+ if (WARN_ON_ONCE(!nb))
+ continue;
+
+ if (nb->nb_id == nb_id) {
+ *onln = cpuc->amd_nb;
+ cpuc->amd_nb = nb;
+ break;
+ }
+ }
+
+ cpuc->amd_nb->nb_id = nb_id;
+ cpuc->amd_nb->refcnt++;
+}
+
+static void amd_pmu_cpu_dead(int cpu)
+{
+ struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu);
+
+ kfree(cpuhw->lbr_sel);
+ cpuhw->lbr_sel = NULL;
+
+ if (!x86_pmu.amd_nb_constraints)
+ return;
+
+ if (cpuhw->amd_nb) {
+ struct amd_nb *nb = cpuhw->amd_nb;
+
+ if (nb->nb_id == -1 || --nb->refcnt == 0)
+ kfree(nb);
+
+ cpuhw->amd_nb = NULL;
+ }
+}
+
+static __always_inline void amd_pmu_set_global_ctl(u64 ctl)
+{
+ wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, ctl);
+}
+
+static inline u64 amd_pmu_get_global_status(void)
+{
+ u64 status;
+
+ /* PerfCntrGlobalStatus is read-only */
+ rdmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, status);
+
+ return status;
+}
+
+static inline void amd_pmu_ack_global_status(u64 status)
+{
+ /*
+ * PerfCntrGlobalStatus is read-only but an overflow acknowledgment
+ * mechanism exists; writing 1 to a bit in PerfCntrGlobalStatusClr
+ * clears the same bit in PerfCntrGlobalStatus
+ */
+
+ wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, status);
+}
+
+static bool amd_pmu_test_overflow_topbit(int idx)
+{
+ u64 counter;
+
+ rdmsrq(x86_pmu_event_addr(idx), counter);
+
+ return !(counter & BIT_ULL(x86_pmu.cntval_bits - 1));
+}
+
+static bool amd_pmu_test_overflow_status(int idx)
+{
+ return amd_pmu_get_global_status() & BIT_ULL(idx);
+}
+
+DEFINE_STATIC_CALL(amd_pmu_test_overflow, amd_pmu_test_overflow_topbit);
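The top-bit test works because perf arms each counter with the negated sampling period; a general property of the x86 perf core, sketched here for context:

	/*
	 * A counter armed for period P is written as (u64)(-P), truncated to
	 * cntval_bits, so its top bit is set while counting. On overflow it
	 * wraps to a small positive value with the top bit clear and stays
	 * that way until the NMI handler re-arms it -- exactly the condition
	 * amd_pmu_test_overflow_topbit() polls for.
	 */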
+
+/*
+ * When a PMC counter overflows, an NMI is used to process the event and
+ * reset the counter. NMI latency can result in the counter being updated
+ * before the NMI can run, which can result in what appear to be spurious
+ * NMIs. This function is intended to wait for the NMI to run and reset
+ * the counter to avoid possible unhandled NMI messages.
+ */
+#define OVERFLOW_WAIT_COUNT 50
+
+static void amd_pmu_wait_on_overflow(int idx)
+{
+ unsigned int i;
+
+ /*
+ * Wait for the counter to be reset if it has overflowed. This loop
+ * should exit very, very quickly, but just in case, don't wait
+ * forever...
+ */
+ for (i = 0; i < OVERFLOW_WAIT_COUNT; i++) {
+ if (!static_call(amd_pmu_test_overflow)(idx))
+ break;
+
+ /* Might be in IRQ context, so can't sleep */
+ udelay(1);
+ }
+}
+
+static void amd_pmu_check_overflow(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int idx;
+
+ /*
+ * This shouldn't be called from NMI context, but add a safeguard here
+ * to return, since if we're in NMI context we can't wait for an NMI
+ * to reset an overflowed counter value.
+ */
+ if (in_nmi())
+ return;
+
+ /*
+ * Check each counter for overflow and wait for it to be reset by the
+ * NMI if it has overflowed. This relies on the fact that all active
+ * counters are always enabled when this function is called and
+ * ARCH_PERFMON_EVENTSEL_INT is always set.
+ */
+ for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+
+ amd_pmu_wait_on_overflow(idx);
+ }
+}
+
+static void amd_pmu_enable_event(struct perf_event *event)
+{
+ x86_pmu_enable_event(event);
+}
+
+static void amd_pmu_enable_all(int added)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int idx;
+
+ amd_brs_enable_all();
+
+ for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
+ /* only activate events which are marked as active */
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+
+ /*
+ * FIXME: cpuc->events[idx] can become NULL in a subtle race
+ * condition with NMI->throttle->x86_pmu_stop().
+ */
+ if (cpuc->events[idx])
+ amd_pmu_enable_event(cpuc->events[idx]);
+ }
+}
+
+static void amd_pmu_v2_enable_event(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ /*
+ * Testing cpu_hw_events.enabled should be skipped in this case unlike
+ * in x86_pmu_enable_event().
+ *
+ * Since cpu_hw_events.enabled is set only after returning from
+ * x86_pmu_start(), the PMCs must be programmed and kept ready.
+ * Counting starts only after x86_pmu_enable_all() is called.
+ */
+ __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
+}
+
+static __always_inline void amd_pmu_core_enable_all(void)
+{
+ amd_pmu_set_global_ctl(amd_pmu_global_cntr_mask);
+}
+
+static void amd_pmu_v2_enable_all(int added)
+{
+ amd_pmu_lbr_enable_all();
+ amd_pmu_core_enable_all();
+}
+
+static void amd_pmu_disable_event(struct perf_event *event)
+{
+ x86_pmu_disable_event(event);
+
+ /*
+ * This can be called from NMI context (via x86_pmu_stop). The counter
+ * may have overflowed, but either way, we'll never see it get reset
+ * by the NMI if we're already in the NMI. And the NMI latency support
+ * below will take care of any pending NMI that might have been
+ * generated by the overflow.
+ */
+ if (in_nmi())
+ return;
+
+ amd_pmu_wait_on_overflow(event->hw.idx);
+}
+
+static void amd_pmu_disable_all(void)
+{
+ amd_brs_disable_all();
+ x86_pmu_disable_all();
+ amd_pmu_check_overflow();
+}
+
+static __always_inline void amd_pmu_core_disable_all(void)
+{
+ amd_pmu_set_global_ctl(0);
+}
+
+static void amd_pmu_v2_disable_all(void)
+{
+ amd_pmu_core_disable_all();
+ amd_pmu_lbr_disable_all();
+ amd_pmu_check_overflow();
+}
+
+DEFINE_STATIC_CALL_NULL(amd_pmu_branch_add, *x86_pmu.add);
+
+static void amd_pmu_add_event(struct perf_event *event)
+{
+ if (needs_branch_stack(event))
+ static_call(amd_pmu_branch_add)(event);
+}
+
+DEFINE_STATIC_CALL_NULL(amd_pmu_branch_del, *x86_pmu.del);
+
+static void amd_pmu_del_event(struct perf_event *event)
+{
+ if (needs_branch_stack(event))
+ static_call(amd_pmu_branch_del)(event);
+}
+
+/*
+ * Because of NMI latency, if multiple PMC counters are active or other sources
+ * of NMIs are received, the perf NMI handler can handle one or more overflowed
+ * PMC counters outside of the NMI associated with the PMC overflow. If the NMI
+ * doesn't arrive at the LAPIC in time to become a pending NMI, then the kernel
+ * back-to-back NMI support won't be active. This PMC handler needs to take into
+ * account that this can occur, otherwise this could result in unknown NMI
+ * messages being issued. Examples of this are PMC overflow while in the NMI
+ * handler when multiple PMCs are active, or PMC overflow while handling some
+ * other source of an NMI.
+ *
+ * Attempt to mitigate this by creating an NMI window in which un-handled NMIs
+ * received during this window will be claimed. This prevents extending the
+ * window past when it is possible that latent NMIs should be received. The
+ * per-CPU perf_nmi_tstamp will be set to the window end time whenever perf has
+ * handled a counter. When an un-handled NMI is received, it will be claimed
+ * only if arriving within that window.
+ */
+static inline int amd_pmu_adjust_nmi_window(int handled)
+{
+ /*
+ * If a counter was handled, record a timestamp such that un-handled
+ * NMIs will be claimed if arriving within that window.
+ */
+ if (handled) {
+ this_cpu_write(perf_nmi_tstamp, jiffies + perf_nmi_window);
+
+ return handled;
+ }
+
+ if (time_after(jiffies, this_cpu_read(perf_nmi_tstamp)))
+ return NMI_DONE;
+
+ return NMI_HANDLED;
+}
+
+static int amd_pmu_handle_irq(struct pt_regs *regs)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int handled;
+ int pmu_enabled;
+
+ /*
+ * Save the PMU state.
+ * It needs to be restored when leaving the handler.
+ */
+ pmu_enabled = cpuc->enabled;
+ cpuc->enabled = 0;
+
+ amd_brs_disable_all();
+
+	/* Drain BRS if in use (could be inactive) */
+ if (cpuc->lbr_users)
+ amd_brs_drain();
+
+ /* Process any counter overflows */
+ handled = x86_pmu_handle_irq(regs);
+
+ cpuc->enabled = pmu_enabled;
+ if (pmu_enabled)
+ amd_brs_enable_all();
+
+ return amd_pmu_adjust_nmi_window(handled);
+}
+
+/*
+ * AMD-specific callback invoked through perf_snapshot_branch_stack static
+ * call, defined in include/linux/perf_event.h. See its definition for API
+ * details. It's up to caller to provide enough space in *entries* to fit all
+ * LBR records, otherwise returned result will be truncated to *cnt* entries.
+ */
+static int amd_pmu_v2_snapshot_branch_stack(struct perf_branch_entry *entries, unsigned int cnt)
+{
+ struct cpu_hw_events *cpuc;
+ unsigned long flags;
+
+ /*
+ * The sequence of steps to freeze LBR should be completely inlined
+ * and contain no branches to minimize contamination of LBR snapshot
+ */
+ local_irq_save(flags);
+ amd_pmu_core_disable_all();
+ __amd_pmu_lbr_disable();
+
+ cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ amd_pmu_lbr_read();
+ cnt = min(cnt, x86_pmu.lbr_nr);
+ memcpy(entries, cpuc->lbr_entries, sizeof(struct perf_branch_entry) * cnt);
+
+ amd_pmu_v2_enable_all(0);
+ local_irq_restore(flags);
+
+ return cnt;
+}
+
+static int amd_pmu_v2_handle_irq(struct pt_regs *regs)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ static atomic64_t status_warned = ATOMIC64_INIT(0);
+ u64 reserved, status, mask, new_bits, prev_bits;
+ struct perf_sample_data data;
+ struct hw_perf_event *hwc;
+ struct perf_event *event;
+ int handled = 0, idx;
+ bool pmu_enabled;
+
+ /*
+ * Save the PMU state as it needs to be restored when leaving the
+ * handler
+ */
+ pmu_enabled = cpuc->enabled;
+ cpuc->enabled = 0;
+
+ /* Stop counting but do not disable LBR */
+ amd_pmu_core_disable_all();
+
+ status = amd_pmu_get_global_status();
+
+ /* Check if any overflows are pending */
+ if (!status)
+ goto done;
+
+ /* Read branch records */
+ if (x86_pmu.lbr_nr) {
+ amd_pmu_lbr_read();
+ status &= ~GLOBAL_STATUS_LBRS_FROZEN;
+ }
+
+ reserved = status & ~amd_pmu_global_cntr_mask;
+ if (reserved)
+ pr_warn_once("Reserved PerfCntrGlobalStatus bits are set (0x%llx), please consider updating microcode\n",
+ reserved);
+
+ /* Clear any reserved bits set by buggy microcode */
+ status &= amd_pmu_global_cntr_mask;
+
+ for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+
+ event = cpuc->events[idx];
+ hwc = &event->hw;
+ x86_perf_event_update(event);
+ mask = BIT_ULL(idx);
+
+ if (!(status & mask))
+ continue;
+
+ /* Event overflow */
+ handled++;
+ status &= ~mask;
+ perf_sample_data_init(&data, 0, hwc->last_period);
+
+ if (!x86_perf_event_set_period(event))
+ continue;
+
+ perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL);
+
+ perf_event_overflow(event, &data, regs);
+ }
+
+ /*
+ * It should never be the case that some overflows are not handled as
+ * the corresponding PMCs are expected to be inactive according to the
+ * active_mask
+ */
+ if (status > 0) {
+ prev_bits = atomic64_fetch_or(status, &status_warned);
+ // A new bit was set for the very first time.
+ new_bits = status & ~prev_bits;
+ WARN(new_bits, "New overflows for inactive PMCs: %llx\n", new_bits);
+ }
+
+ /* Clear overflow and freeze bits */
+ amd_pmu_ack_global_status(~status);
+
+ /*
+ * Unmasking the LVTPC is not required as the Mask (M) bit of the LVT
+ * PMI entry is not set by the local APIC when a PMC overflow occurs
+ */
+ inc_irq_stat(apic_perf_irqs);
+
+done:
+ cpuc->enabled = pmu_enabled;
+
+ /* Resume counting only if PMU is active */
+ if (pmu_enabled)
+ amd_pmu_core_enable_all();
+
+ return amd_pmu_adjust_nmi_window(handled);
+}
+
+static struct event_constraint *
+amd_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ /*
+ * if not NB event or no NB, then no constraints
+ */
+ if (!(amd_has_nb(cpuc) && amd_is_nb_event(&event->hw)))
+ return &unconstrained;
+
+ return __amd_get_nb_event_constraints(cpuc, event, NULL);
+}
+
+static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+ if (amd_has_nb(cpuc) && amd_is_nb_event(&event->hw))
+ __amd_put_nb_event_constraints(cpuc, event);
+}
+
+PMU_FORMAT_ATTR(event, "config:0-7,32-35");
+PMU_FORMAT_ATTR(umask, "config:8-15" );
+PMU_FORMAT_ATTR(edge, "config:18" );
+PMU_FORMAT_ATTR(inv, "config:23" );
+PMU_FORMAT_ATTR(cmask, "config:24-31" );
+
+static struct attribute *amd_format_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_cmask.attr,
+ NULL,
+};
+
+/* AMD Family 15h */
+
+#define AMD_EVENT_TYPE_MASK 0x000000F0ULL
+
+#define AMD_EVENT_FP 0x00000000ULL ... 0x00000010ULL
+#define AMD_EVENT_LS 0x00000020ULL ... 0x00000030ULL
+#define AMD_EVENT_DC 0x00000040ULL ... 0x00000050ULL
+#define AMD_EVENT_CU 0x00000060ULL ... 0x00000070ULL
+#define AMD_EVENT_IC_DE 0x00000080ULL ... 0x00000090ULL
+#define AMD_EVENT_EX_LS 0x000000C0ULL
+#define AMD_EVENT_DE 0x000000D0ULL
+#define AMD_EVENT_NB 0x000000E0ULL ... 0x000000F0ULL
+
+/*
+ * AMD family 15h event code/PMC mappings:
+ *
+ * type = event_code & 0x0F0:
+ *
+ * 0x000 FP PERF_CTL[5:3]
+ * 0x010 FP PERF_CTL[5:3]
+ * 0x020 LS PERF_CTL[5:0]
+ * 0x030 LS PERF_CTL[5:0]
+ * 0x040 DC PERF_CTL[5:0]
+ * 0x050 DC PERF_CTL[5:0]
+ * 0x060 CU PERF_CTL[2:0]
+ * 0x070 CU PERF_CTL[2:0]
+ * 0x080 IC/DE PERF_CTL[2:0]
+ * 0x090 IC/DE PERF_CTL[2:0]
+ * 0x0A0 ---
+ * 0x0B0 ---
+ * 0x0C0 EX/LS PERF_CTL[5:0]
+ * 0x0D0 DE PERF_CTL[2:0]
+ * 0x0E0 NB NB_PERF_CTL[3:0]
+ * 0x0F0 NB NB_PERF_CTL[3:0]
+ *
+ * Exceptions:
+ *
+ * 0x000 FP PERF_CTL[3], PERF_CTL[5:3] (*)
+ * 0x003 FP PERF_CTL[3]
+ * 0x004 FP PERF_CTL[3], PERF_CTL[5:3] (*)
+ * 0x00B FP PERF_CTL[3]
+ * 0x00D FP PERF_CTL[3]
+ * 0x023 DE PERF_CTL[2:0]
+ * 0x02D LS PERF_CTL[3]
+ * 0x02E LS PERF_CTL[3,0]
+ * 0x031 LS PERF_CTL[2:0] (**)
+ * 0x043 CU PERF_CTL[2:0]
+ * 0x045 CU PERF_CTL[2:0]
+ * 0x046 CU PERF_CTL[2:0]
+ * 0x054 CU PERF_CTL[2:0]
+ * 0x055 CU PERF_CTL[2:0]
+ * 0x08F IC PERF_CTL[0]
+ * 0x187 DE PERF_CTL[0]
+ * 0x188 DE PERF_CTL[0]
+ * 0x0DB EX PERF_CTL[5:0]
+ * 0x0DC LS PERF_CTL[5:0]
+ * 0x0DD LS PERF_CTL[5:0]
+ * 0x0DE LS PERF_CTL[5:0]
+ * 0x0DF LS PERF_CTL[5:0]
+ * 0x1C0 EX PERF_CTL[5:3]
+ * 0x1D6 EX PERF_CTL[5:0]
+ * 0x1D8 EX PERF_CTL[5:0]
+ *
+ * (*) depending on the umask all FPU counters may be used
+ * (**) only one unitmask enabled at a time
+ */
+
+static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0);
+static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0);
+static struct event_constraint amd_f15_PMC3 = EVENT_CONSTRAINT(0, 0x08, 0);
+static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0);
+static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0);
+static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0);
+
+static struct event_constraint *
+amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ unsigned int event_code = amd_get_event_code(hwc);
+
+ switch (event_code & AMD_EVENT_TYPE_MASK) {
+ case AMD_EVENT_FP:
+ switch (event_code) {
+ case 0x000:
+ if (!(hwc->config & 0x0000F000ULL))
+ break;
+ if (!(hwc->config & 0x00000F00ULL))
+ break;
+ return &amd_f15_PMC3;
+ case 0x004:
+ if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1)
+ break;
+ return &amd_f15_PMC3;
+ case 0x003:
+ case 0x00B:
+ case 0x00D:
+ return &amd_f15_PMC3;
+ }
+ return &amd_f15_PMC53;
+ case AMD_EVENT_LS:
+ case AMD_EVENT_DC:
+ case AMD_EVENT_EX_LS:
+ switch (event_code) {
+ case 0x023:
+ case 0x043:
+ case 0x045:
+ case 0x046:
+ case 0x054:
+ case 0x055:
+ return &amd_f15_PMC20;
+ case 0x02D:
+ return &amd_f15_PMC3;
+ case 0x02E:
+ return &amd_f15_PMC30;
+ case 0x031:
+ if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1)
+ return &amd_f15_PMC20;
+ return &emptyconstraint;
+ case 0x1C0:
+ return &amd_f15_PMC53;
+ default:
+ return &amd_f15_PMC50;
+ }
+ case AMD_EVENT_CU:
+ case AMD_EVENT_IC_DE:
+ case AMD_EVENT_DE:
+ switch (event_code) {
+ case 0x08F:
+ case 0x187:
+ case 0x188:
+ return &amd_f15_PMC0;
+ case 0x0DB ... 0x0DF:
+ case 0x1D6:
+ case 0x1D8:
+ return &amd_f15_PMC50;
+ default:
+ return &amd_f15_PMC20;
+ }
+ case AMD_EVENT_NB:
+ /* moved to uncore.c */
+ return &emptyconstraint;
+ default:
+ return &emptyconstraint;
+ }
+}
+
+static struct event_constraint pair_constraint;
+
+static struct event_constraint *
+amd_get_event_constraints_f17h(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (amd_is_pair_event_code(hwc))
+ return &pair_constraint;
+
+ return &unconstrained;
+}
+
+static void amd_put_event_constraints_f17h(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (is_counter_pair(hwc))
+ --cpuc->n_pair;
+}
+
+/*
+ * Because of the way BRS operates, with inactive and active phases linked
+ * to one counter, it is not possible to have two events using BRS
+ * scheduled at the same time. There would be an issue with enforcing the
+ * period of each one, and given that BRS saturates, it would not be possible
+ * to guarantee correlated content for all events. Therefore, in situations
+ * where multiple events want to use BRS, the kernel enforces mutual exclusion.
+ * Exclusion is enforced by choosing only one counter for events using BRS.
+ * The event scheduling logic will then automatically multiplex the
+ * events and ensure that at most one event is actively using BRS.
+ *
+ * The BRS counter could be any counter, but there is no constraint on Fam19h,
+ * therefore all counters are equal and thus we pick the first one: PMC0
+ */
+static struct event_constraint amd_fam19h_brs_cntr0_constraint =
+ EVENT_CONSTRAINT(0, 0x1, AMD64_RAW_EVENT_MASK);
+
+static struct event_constraint amd_fam19h_brs_pair_cntr0_constraint =
+ __EVENT_CONSTRAINT(0, 0x1, AMD64_RAW_EVENT_MASK, 1, 0, PERF_X86_EVENT_PAIR);
+
+static struct event_constraint *
+amd_get_event_constraints_f19h(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ bool has_brs = has_amd_brs(hwc);
+
+ /*
+ * In case BRS is used with an event requiring a counter pair,
+	 * the kernel allows it, but only on counters 0 & 1, to enforce
+	 * the multiplexing required to protect BRS in case of multiple
+	 * BRS users
+ */
+ if (amd_is_pair_event_code(hwc)) {
+ return has_brs ? &amd_fam19h_brs_pair_cntr0_constraint
+ : &pair_constraint;
+ }
+
+ if (has_brs)
+ return &amd_fam19h_brs_cntr0_constraint;
+
+ return &unconstrained;
+}
+
+
+static ssize_t amd_event_sysfs_show(char *page, u64 config)
+{
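+	/* Fold the extended event select bits (config 32-35) down next to bits 0-7 */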
+ u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) |
+ (config & AMD64_EVENTSEL_EVENT) >> 24;
+
+ return x86_event_sysfs_show(page, config, event);
+}
+
+static void amd_pmu_limit_period(struct perf_event *event, s64 *left)
+{
+ /*
+ * Decrease period by the depth of the BRS feature to get the last N
+ * taken branches and approximate the desired period
+ */
+ if (has_branch_stack(event) && *left > x86_pmu.lbr_nr)
+ *left -= x86_pmu.lbr_nr;
+}
+
+static __initconst const struct x86_pmu amd_pmu = {
+ .name = "AMD",
+ .handle_irq = amd_pmu_handle_irq,
+ .disable_all = amd_pmu_disable_all,
+ .enable_all = amd_pmu_enable_all,
+ .enable = amd_pmu_enable_event,
+ .disable = amd_pmu_disable_event,
+ .hw_config = amd_pmu_hw_config,
+ .schedule_events = x86_schedule_events,
+ .eventsel = MSR_K7_EVNTSEL0,
+ .perfctr = MSR_K7_PERFCTR0,
+ .addr_offset = amd_pmu_addr_offset,
+ .event_map = amd_pmu_event_map,
+ .max_events = ARRAY_SIZE(amd_perfmon_event_map),
+ .cntr_mask64 = GENMASK_ULL(AMD64_NUM_COUNTERS - 1, 0),
+ .add = amd_pmu_add_event,
+ .del = amd_pmu_del_event,
+ .cntval_bits = 48,
+ .cntval_mask = (1ULL << 48) - 1,
+ .apic = 1,
+ /* use highest bit to detect overflow */
+ .max_period = (1ULL << 47) - 1,
+ .get_event_constraints = amd_get_event_constraints,
+ .put_event_constraints = amd_put_event_constraints,
+
+ .format_attrs = amd_format_attr,
+ .events_sysfs_show = amd_event_sysfs_show,
+
+ .cpu_prepare = amd_pmu_cpu_prepare,
+ .cpu_starting = amd_pmu_cpu_starting,
+ .cpu_dead = amd_pmu_cpu_dead,
+
+ .amd_nb_constraints = 1,
+};
+
+static ssize_t branches_show(struct device *cdev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu.lbr_nr);
+}
+
+static DEVICE_ATTR_RO(branches);
+
+static struct attribute *amd_pmu_branches_attrs[] = {
+ &dev_attr_branches.attr,
+ NULL,
+};
+
+static umode_t
+amd_branches_is_visible(struct kobject *kobj, struct attribute *attr, int i)
+{
+ return x86_pmu.lbr_nr ? attr->mode : 0;
+}
+
+static struct attribute_group group_caps_amd_branches = {
+ .name = "caps",
+ .attrs = amd_pmu_branches_attrs,
+ .is_visible = amd_branches_is_visible,
+};
+
+#ifdef CONFIG_PERF_EVENTS_AMD_BRS
+
+EVENT_ATTR_STR(branch-brs, amd_branch_brs,
+ "event=" __stringify(AMD_FAM19H_BRS_EVENT)"\n");
+
+static struct attribute *amd_brs_events_attrs[] = {
+ EVENT_PTR(amd_branch_brs),
+ NULL,
+};
+
+static umode_t
+amd_brs_is_visible(struct kobject *kobj, struct attribute *attr, int i)
+{
+ return static_cpu_has(X86_FEATURE_BRS) && x86_pmu.lbr_nr ?
+ attr->mode : 0;
+}
+
+static struct attribute_group group_events_amd_brs = {
+ .name = "events",
+ .attrs = amd_brs_events_attrs,
+ .is_visible = amd_brs_is_visible,
+};
+
+#endif /* CONFIG_PERF_EVENTS_AMD_BRS */
+
+static const struct attribute_group *amd_attr_update[] = {
+ &group_caps_amd_branches,
+#ifdef CONFIG_PERF_EVENTS_AMD_BRS
+ &group_events_amd_brs,
+#endif
+ NULL,
+};
+
+static int __init amd_core_pmu_init(void)
+{
+ union cpuid_0x80000022_ebx ebx;
+ u64 even_ctr_mask = 0ULL;
+ int i;
+
+ if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
+ return 0;
+
+ /* Avoid calculating the value each time in the NMI handler */
+ perf_nmi_window = msecs_to_jiffies(100);
+
+ /*
+	 * If the core performance counter extensions exist, we must use the
+	 * MSR_F15H_PERF_CTL/MSR_F15H_PERF_CTR MSRs. See also
+ * amd_pmu_addr_offset().
+ */
+ x86_pmu.eventsel = MSR_F15H_PERF_CTL;
+ x86_pmu.perfctr = MSR_F15H_PERF_CTR;
+ x86_pmu.cntr_mask64 = GENMASK_ULL(AMD64_NUM_COUNTERS_CORE - 1, 0);
+
+ /* Check for Performance Monitoring v2 support */
+ if (boot_cpu_has(X86_FEATURE_PERFMON_V2)) {
+ ebx.full = cpuid_ebx(EXT_PERFMON_DEBUG_FEATURES);
+
+ /* Update PMU version for later usage */
+ x86_pmu.version = 2;
+
+ /* Find the number of available Core PMCs */
+ x86_pmu.cntr_mask64 = GENMASK_ULL(ebx.split.num_core_pmc - 1, 0);
+
+ amd_pmu_global_cntr_mask = x86_pmu.cntr_mask64;
+
+ /* Update PMC handling functions */
+ x86_pmu.enable_all = amd_pmu_v2_enable_all;
+ x86_pmu.disable_all = amd_pmu_v2_disable_all;
+ x86_pmu.enable = amd_pmu_v2_enable_event;
+ x86_pmu.handle_irq = amd_pmu_v2_handle_irq;
+ static_call_update(amd_pmu_test_overflow, amd_pmu_test_overflow_status);
+ }
+
+ /*
+ * AMD Core perfctr has separate MSRs for the NB events, see
+ * the amd/uncore.c driver.
+ */
+ x86_pmu.amd_nb_constraints = 0;
+
+ if (boot_cpu_data.x86 == 0x15) {
+ pr_cont("Fam15h ");
+ x86_pmu.get_event_constraints = amd_get_event_constraints_f15h;
+ }
+ if (boot_cpu_data.x86 >= 0x17) {
+ pr_cont("Fam17h+ ");
+ /*
+ * Family 17h and compatibles have constraints for Large
+ * Increment per Cycle events: they may only be assigned an
+ * even numbered counter that has a consecutive adjacent odd
+ * numbered counter following it.
+ */
+ for (i = 0; i < x86_pmu_max_num_counters(NULL) - 1; i += 2)
+ even_ctr_mask |= BIT_ULL(i);
+
+ pair_constraint = (struct event_constraint)
+ __EVENT_CONSTRAINT(0, even_ctr_mask, 0,
+ x86_pmu_max_num_counters(NULL) / 2, 0,
+ PERF_X86_EVENT_PAIR);
+
+ x86_pmu.get_event_constraints = amd_get_event_constraints_f17h;
+ x86_pmu.put_event_constraints = amd_put_event_constraints_f17h;
+ x86_pmu.perf_ctr_pair_en = AMD_MERGE_EVENT_ENABLE;
+ x86_pmu.flags |= PMU_FL_PAIR;
+ }
+
+ /* LBR and BRS are mutually exclusive features */
+ if (!amd_pmu_lbr_init()) {
+ /* LBR requires flushing on context switch */
+ x86_pmu.sched_task = amd_pmu_lbr_sched_task;
+ static_call_update(amd_pmu_branch_hw_config, amd_pmu_lbr_hw_config);
+ static_call_update(amd_pmu_branch_reset, amd_pmu_lbr_reset);
+ static_call_update(amd_pmu_branch_add, amd_pmu_lbr_add);
+ static_call_update(amd_pmu_branch_del, amd_pmu_lbr_del);
+
+ /* Only support branch_stack snapshot on perfmon v2 */
+ if (x86_pmu.handle_irq == amd_pmu_v2_handle_irq)
+ static_call_update(perf_snapshot_branch_stack, amd_pmu_v2_snapshot_branch_stack);
+ } else if (!amd_brs_init()) {
+ /*
+ * BRS requires special event constraints and flushing on ctxsw.
+ */
+ x86_pmu.get_event_constraints = amd_get_event_constraints_f19h;
+ x86_pmu.sched_task = amd_pmu_brs_sched_task;
+ x86_pmu.limit_period = amd_pmu_limit_period;
+
+ static_call_update(amd_pmu_branch_hw_config, amd_brs_hw_config);
+ static_call_update(amd_pmu_branch_reset, amd_brs_reset);
+ static_call_update(amd_pmu_branch_add, amd_pmu_brs_add);
+ static_call_update(amd_pmu_branch_del, amd_pmu_brs_del);
+
+ /*
+ * put_event_constraints callback same as Fam17h, set above
+ */
+
+ /* branch sampling must be stopped when entering low power */
+ amd_brs_lopwr_init();
+ }
+
+ x86_pmu.attr_update = amd_attr_update;
+
+ pr_cont("core perfctr, ");
+ return 0;
+}
+
+__init int amd_pmu_init(void)
+{
+ int ret;
+
+ /* Performance-monitoring supported from K7 and later: */
+ if (boot_cpu_data.x86 < 6)
+ return -ENODEV;
+
+ x86_pmu = amd_pmu;
+
+ ret = amd_core_pmu_init();
+ if (ret)
+ return ret;
+
+ if (num_possible_cpus() == 1) {
+ /*
+ * No point in allocating data structures to serialize
+ * against other CPUs, when there is only the one CPU.
+ */
+ x86_pmu.amd_nb_constraints = 0;
+ }
+
+ if (boot_cpu_data.x86 >= 0x17)
+ memcpy(hw_cache_event_ids, amd_hw_cache_event_ids_f17h, sizeof(hw_cache_event_ids));
+ else
+ memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+
+ return 0;
+}
+
+static inline void amd_pmu_reload_virt(void)
+{
+ if (x86_pmu.version >= 2) {
+ /*
+ * Clear global enable bits, reprogram the PERF_CTL
+ * registers with updated perf_ctr_virt_mask and then
+ * set global enable bits once again
+ */
+ amd_pmu_v2_disable_all();
+ amd_pmu_enable_all(0);
+ amd_pmu_v2_enable_all(0);
+ return;
+ }
+
+ amd_pmu_disable_all();
+ amd_pmu_enable_all(0);
+}
+
+void amd_pmu_enable_virt(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ cpuc->perf_ctr_virt_mask = 0;
+
+ /* Reload all events */
+ amd_pmu_reload_virt();
+}
+EXPORT_SYMBOL_FOR_KVM(amd_pmu_enable_virt);
+
+void amd_pmu_disable_virt(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ /*
+ * We only mask out the Host-only bit so that host-only counting works
+ * when SVM is disabled. If someone sets up a guest-only counter when
+ * SVM is disabled, the Guest-only bit still gets set and the counter
+ * will not count anything.
+ */
+ cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;
+
+ /* Reload all events */
+ amd_pmu_reload_virt();
+}
+EXPORT_SYMBOL_FOR_KVM(amd_pmu_disable_virt);
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
new file mode 100644
index 000000000000..aca89f23d2e0
--- /dev/null
+++ b/arch/x86/events/amd/ibs.c
@@ -0,0 +1,1790 @@
+/*
+ * Performance events - AMD IBS
+ *
+ * Copyright (C) 2011 Advanced Micro Devices, Inc., Robert Richter
+ *
+ * For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_event.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/pci.h>
+#include <linux/ptrace.h>
+#include <linux/syscore_ops.h>
+#include <linux/sched/clock.h>
+
+#include <asm/apic.h>
+#include <asm/msr.h>
+
+#include "../perf_event.h"
+
+static u32 ibs_caps;
+
+#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
+
+#include <linux/kprobes.h>
+#include <linux/hardirq.h>
+
+#include <asm/nmi.h>
+#include <asm/amd/ibs.h>
+
+/* attr.config2 */
+#define IBS_SW_FILTER_MASK 1
+
+/*
+ * IBS states:
+ *
+ * ENABLED; tracks the pmu::add(), pmu::del() state, when set the counter is taken
+ * and any further add()s must fail.
+ *
+ * STARTED/STOPPING/STOPPED; deal with pmu::start(), pmu::stop() state but are
+ * complicated by the fact that the IBS hardware can send late NMIs (ie. after
+ * we've cleared the EN bit).
+ *
+ * In order to consume these late NMIs we have the STOPPED state, any NMI that
+ * happens after we've cleared the EN state will clear this bit and report the
+ * NMI handled (this is fundamentally racy in the face of multiple NMI sources,
+ * someone else can consume our BIT and our NMI will go unhandled).
+ *
+ * And since we cannot set/clear this separate bit together with the EN bit,
+ * there are races; if we cleared STARTED early, an NMI could land in
+ * between clearing STARTED and clearing the EN bit (in fact multiple NMIs
+ * could happen if the period is small enough), and consume our STOPPED bit
+ * and trigger streams of unhandled NMIs.
+ *
+ * If, however, we clear STARTED late, an NMI can hit between clearing the
+ * EN bit and clearing STARTED, still see STARTED set and process the event.
+ * If this event will have the VALID bit clear, we bail properly, but this
+ * is not a given. With VALID set we can end up calling pmu::stop() again
+ * (the throttle logic) and trigger the WARNs in there.
+ *
+ * So what we do is set STOPPING before clearing EN to avoid the pmu::stop()
+ * nesting, and clear STARTED late, so that we have a well defined state over
+ * the clearing of the EN bit.
+ *
+ * XXX: we could probably be using !atomic bitops for all this.
+ */
+
+enum ibs_states {
+ IBS_ENABLED = 0,
+ IBS_STARTED = 1,
+ IBS_STOPPING = 2,
+ IBS_STOPPED = 3,
+
+ IBS_MAX_STATES,
+};
+
+struct cpu_perf_ibs {
+ struct perf_event *event;
+ unsigned long state[BITS_TO_LONGS(IBS_MAX_STATES)];
+};
+
+struct perf_ibs {
+ struct pmu pmu;
+ unsigned int msr;
+ u64 config_mask;
+ u64 cnt_mask;
+ u64 enable_mask;
+ u64 valid_mask;
+ u16 min_period;
+ u64 max_period;
+ unsigned long offset_mask[1];
+ int offset_max;
+ unsigned int fetch_count_reset_broken : 1;
+ unsigned int fetch_ignore_if_zero_rip : 1;
+ struct cpu_perf_ibs __percpu *pcpu;
+
+ u64 (*get_count)(u64 config);
+};
+
+static int
+perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period)
+{
+ s64 left = local64_read(&hwc->period_left);
+ s64 period = hwc->sample_period;
+ int overflow = 0;
+
+ /*
+ * If we are way outside a reasonable range then just skip forward:
+ */
+ if (unlikely(left <= -period)) {
+ left = period;
+ local64_set(&hwc->period_left, left);
+ hwc->last_period = period;
+ overflow = 1;
+ }
+
+ if (unlikely(left < (s64)min)) {
+ left += period;
+ local64_set(&hwc->period_left, left);
+ hwc->last_period = period;
+ overflow = 1;
+ }
+
+ /*
+ * If the hw period that triggers the sw overflow is too short
+ * we might hit the irq handler. This biases the results.
+ * Thus we shorten the next-to-last period and set the last
+ * period to the max period.
+ */
+ if (left > max) {
+ left -= max;
+ if (left > max)
+ left = max;
+ else if (left < min)
+ left = min;
+ }
+
+ *hw_period = (u64)left;
+
+ return overflow;
+}
+
+static int
+perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ int shift = 64 - width;
+ u64 prev_raw_count;
+ u64 delta;
+
+ /*
+ * Careful: an NMI might modify the previous event value.
+ *
+ * Our tactic to handle this is to first atomically read and
+ * exchange a new raw count - then add that new-prev delta
+ * count to the generic event atomically:
+ */
+ prev_raw_count = local64_read(&hwc->prev_count);
+ if (!local64_try_cmpxchg(&hwc->prev_count,
+ &prev_raw_count, new_raw_count))
+ return 0;
+
+ /*
+ * Now we have the new raw value and have updated the prev
+ * timestamp already. We can now calculate the elapsed delta
+ * (event-)time and add that to the generic event.
+ *
+ * Careful, not all hw sign-extends above the physical width
+ * of the count.
+ */
+ delta = (new_raw_count << shift) - (prev_raw_count << shift);
+ delta >>= shift;
+
+ local64_add(delta, &event->count);
+ local64_sub(delta, &hwc->period_left);
+
+ return 1;
+}
+
+static struct perf_ibs perf_ibs_fetch;
+static struct perf_ibs perf_ibs_op;
+
+static struct perf_ibs *get_ibs_pmu(int type)
+{
+ if (perf_ibs_fetch.pmu.type == type)
+ return &perf_ibs_fetch;
+ if (perf_ibs_op.pmu.type == type)
+ return &perf_ibs_op;
+ return NULL;
+}
+
+/*
+ * core pmu config -> IBS config
+ *
+ * perf record -a -e cpu-cycles:p ... # use ibs op counting cycle count
+ * perf record -a -e r076:p ... # same as -e cpu-cycles:p
+ * perf record -a -e r0C1:p ... # use ibs op counting micro-ops
+ *
+ * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl,
+ * MSRC001_1033) is used to select either cycle or micro-ops counting
+ * mode.
+ */
+static int core_pmu_ibs_config(struct perf_event *event, u64 *config)
+{
+ switch (event->attr.type) {
+ case PERF_TYPE_HARDWARE:
+ switch (event->attr.config) {
+ case PERF_COUNT_HW_CPU_CYCLES:
+ *config = 0;
+ return 0;
+ }
+ break;
+ case PERF_TYPE_RAW:
+ switch (event->attr.config) {
+ case 0x0076:
+ *config = 0;
+ return 0;
+ case 0x00C1:
+ *config = IBS_OP_CNT_CTL;
+ return 0;
+ }
+ break;
+ default:
+ return -ENOENT;
+ }
+
+ return -EOPNOTSUPP;
+}
+
+/*
+ * The rip of IBS samples has skid 0. Thus, IBS supports precise
+ * levels 1 and 2 and PERF_EFLAGS_EXACT is set. In rare cases the
+ * rip is invalid when IBS was not able to record the rip correctly.
+ * We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then.
+ */
+int forward_event_to_ibs(struct perf_event *event)
+{
+ u64 config = 0;
+
+ if (!event->attr.precise_ip || event->attr.precise_ip > 2)
+ return -EOPNOTSUPP;
+
+ if (!core_pmu_ibs_config(event, &config)) {
+ event->attr.type = perf_ibs_op.pmu.type;
+ event->attr.config = config;
+ }
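+	/*
+	 * Returning -ENOENT lets the perf core retry event init, now with the
+	 * IBS op PMU type set above.
+	 */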
+ return -ENOENT;
+}
+
+/*
+ * Grouping of IBS events is not possible since IBS can have only
+ * one event active at any point in time.
+ */
+static int validate_group(struct perf_event *event)
+{
+ struct perf_event *sibling;
+
+ if (event->group_leader == event)
+ return 0;
+
+ if (event->group_leader->pmu == event->pmu)
+ return -EINVAL;
+
+ for_each_sibling_event(sibling, event->group_leader) {
+ if (sibling->pmu == event->pmu)
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static bool perf_ibs_ldlat_event(struct perf_ibs *perf_ibs,
+ struct perf_event *event)
+{
+ return perf_ibs == &perf_ibs_op &&
+ (ibs_caps & IBS_CAPS_OPLDLAT) &&
+ (event->attr.config1 & 0xFFF);
+}
+
+static int perf_ibs_init(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct perf_ibs *perf_ibs;
+ u64 config;
+ int ret;
+
+ perf_ibs = get_ibs_pmu(event->attr.type);
+ if (!perf_ibs)
+ return -ENOENT;
+
+ config = event->attr.config;
+
+ if (event->pmu != &perf_ibs->pmu)
+ return -ENOENT;
+
+ if (config & ~perf_ibs->config_mask)
+ return -EINVAL;
+
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
+ /* handle exclude_{user,kernel} in the IRQ handler */
+ if (event->attr.exclude_host || event->attr.exclude_guest ||
+ event->attr.exclude_idle)
+ return -EINVAL;
+
+ if (!(event->attr.config2 & IBS_SW_FILTER_MASK) &&
+ (event->attr.exclude_kernel || event->attr.exclude_user ||
+ event->attr.exclude_hv))
+ return -EINVAL;
+
+ ret = validate_group(event);
+ if (ret)
+ return ret;
+
+ if (hwc->sample_period) {
+ if (config & perf_ibs->cnt_mask)
+ /* raw max_cnt may not be set */
+ return -EINVAL;
+
+ if (event->attr.freq) {
+ hwc->sample_period = perf_ibs->min_period;
+ } else {
+ /* Silently mask off lower nibble. IBS hw mandates it. */
+ hwc->sample_period &= ~0x0FULL;
+ if (hwc->sample_period < perf_ibs->min_period)
+ return -EINVAL;
+ }
+ } else {
+ u64 period = 0;
+
+ if (event->attr.freq)
+ return -EINVAL;
+
+ if (perf_ibs == &perf_ibs_op) {
+ period = (config & IBS_OP_MAX_CNT) << 4;
+ if (ibs_caps & IBS_CAPS_OPCNTEXT)
+ period |= config & IBS_OP_MAX_CNT_EXT_MASK;
+ } else {
+ period = (config & IBS_FETCH_MAX_CNT) << 4;
+ }
+
+ config &= ~perf_ibs->cnt_mask;
+ event->attr.sample_period = period;
+ hwc->sample_period = period;
+
+ if (hwc->sample_period < perf_ibs->min_period)
+ return -EINVAL;
+ }
+
+ if (perf_ibs_ldlat_event(perf_ibs, event)) {
+ u64 ldlat = event->attr.config1 & 0xFFF;
+
+ if (ldlat < 128 || ldlat > 2048)
+ return -EINVAL;
+ ldlat >>= 7;
+
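+		/*
+		 * ldlat is now in units of 128 cycles (1..16); the latency
+		 * threshold field set below is programmed as ldlat/128 - 1.
+		 */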
+ config |= (ldlat - 1) << 59;
+ config |= IBS_OP_L3MISSONLY | IBS_OP_LDLAT_EN;
+ }
+
+ /*
+ * If we modify hwc->sample_period, we also need to update
+ * hwc->last_period and hwc->period_left.
+ */
+ hwc->last_period = hwc->sample_period;
+ local64_set(&hwc->period_left, hwc->sample_period);
+
+ hwc->config_base = perf_ibs->msr;
+ hwc->config = config;
+
+ return 0;
+}
+
+static int perf_ibs_set_period(struct perf_ibs *perf_ibs,
+ struct hw_perf_event *hwc, u64 *period)
+{
+ int overflow;
+
+ /* ignore lower 4 bits in min count: */
+ overflow = perf_event_set_period(hwc, perf_ibs->min_period,
+ perf_ibs->max_period, period);
+ local64_set(&hwc->prev_count, 0);
+
+ return overflow;
+}
+
+static u64 get_ibs_fetch_count(u64 config)
+{
+ union ibs_fetch_ctl fetch_ctl = (union ibs_fetch_ctl)config;
+
+ return fetch_ctl.fetch_cnt << 4;
+}
+
+static u64 get_ibs_op_count(u64 config)
+{
+ union ibs_op_ctl op_ctl = (union ibs_op_ctl)config;
+ u64 count = 0;
+
+ /*
+ * If the internal 27-bit counter rolled over, the count is MaxCnt
+ * and the lower 7 bits of CurCnt are randomized.
+ * Otherwise CurCnt has the full 27-bit current counter value.
+ */
+ if (op_ctl.op_val) {
+ count = op_ctl.opmaxcnt << 4;
+ if (ibs_caps & IBS_CAPS_OPCNTEXT)
+ count += op_ctl.opmaxcnt_ext << 20;
+ } else if (ibs_caps & IBS_CAPS_RDWROPCNT) {
+ count = op_ctl.opcurcnt;
+ }
+
+ return count;
+}
+
+static void
+perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event,
+ u64 *config)
+{
+ u64 count = perf_ibs->get_count(*config);
+
+ /*
+ * Set width to 64 since we do not overflow on max width but
+ * instead on max count. In perf_ibs_set_period() we clear
+ * prev count manually on overflow.
+ */
+ while (!perf_event_try_update(event, count, 64)) {
+ rdmsrq(event->hw.config_base, *config);
+ count = perf_ibs->get_count(*config);
+ }
+}
+
+static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs,
+ struct hw_perf_event *hwc, u64 config)
+{
+ u64 tmp = hwc->config | config;
+
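+	/*
+	 * On parts with the fetch counter reset erratum, write the control
+	 * register once with the enable bit clear so that the subsequent
+	 * enable reloads the counter.
+	 */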
+ if (perf_ibs->fetch_count_reset_broken)
+ wrmsrq(hwc->config_base, tmp & ~perf_ibs->enable_mask);
+
+ wrmsrq(hwc->config_base, tmp | perf_ibs->enable_mask);
+}
+
+/*
+ * Erratum #420 Instruction-Based Sampling Engine May Generate
+ * Interrupt that Cannot Be Cleared:
+ *
+ * Must clear counter mask first, then clear the enable bit. See
+ * Revision Guide for AMD Family 10h Processors, Publication #41322.
+ */
+static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs,
+ struct hw_perf_event *hwc, u64 config)
+{
+ config &= ~perf_ibs->cnt_mask;
+ if (boot_cpu_data.x86 == 0x10)
+ wrmsrq(hwc->config_base, config);
+ config &= ~perf_ibs->enable_mask;
+ wrmsrq(hwc->config_base, config);
+}
+
+/*
+ * We cannot restore the IBS PMU state, so we always need to update
+ * the event while stopping it and then reset the state when starting
+ * again. Thus, we ignore the PERF_EF_RELOAD and PERF_EF_UPDATE flags in
+ * perf_ibs_start()/perf_ibs_stop() and always do it instead.
+ */
+static void perf_ibs_start(struct perf_event *event, int flags)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+ struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+ u64 period, config = 0;
+
+ if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
+ return;
+
+ WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
+ hwc->state = 0;
+
+ if (event->attr.freq && hwc->sample_period < perf_ibs->min_period)
+ hwc->sample_period = perf_ibs->min_period;
+
+ perf_ibs_set_period(perf_ibs, hwc, &period);
+ if (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_OPCNTEXT)) {
+ config |= period & IBS_OP_MAX_CNT_EXT_MASK;
+ period &= ~IBS_OP_MAX_CNT_EXT_MASK;
+ }
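+	/* MaxCnt fields are in units of 16, so the low 4 bits of the period are dropped */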
+ config |= period >> 4;
+
+ /*
+ * Set STARTED before enabling the hardware, such that a subsequent NMI
+ * must observe it.
+ */
+ set_bit(IBS_STARTED, pcpu->state);
+ clear_bit(IBS_STOPPING, pcpu->state);
+ perf_ibs_enable_event(perf_ibs, hwc, config);
+
+ perf_event_update_userpage(event);
+}
+
+static void perf_ibs_stop(struct perf_event *event, int flags)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+ struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+ u64 config;
+ int stopping;
+
+ if (test_and_set_bit(IBS_STOPPING, pcpu->state))
+ return;
+
+ stopping = test_bit(IBS_STARTED, pcpu->state);
+
+ if (!stopping && (hwc->state & PERF_HES_UPTODATE))
+ return;
+
+ rdmsrq(hwc->config_base, config);
+
+ if (stopping) {
+ /*
+ * Set STOPPED before disabling the hardware, such that it
+ * must be visible to NMIs the moment we clear the EN bit,
+ * at which point we can generate an !VALID sample which
+ * we need to consume.
+ */
+ set_bit(IBS_STOPPED, pcpu->state);
+ perf_ibs_disable_event(perf_ibs, hwc, config);
+ /*
+		 * Clear STARTED after disabling the hardware; if it were
+		 * cleared before, an NMI hitting after the clear but before
+ * clearing the EN bit might think it a spurious NMI and not
+ * handle it.
+ *
+ * Clearing it after, however, creates the problem of the NMI
+ * handler seeing STARTED but not having a valid sample.
+ */
+ clear_bit(IBS_STARTED, pcpu->state);
+ WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+ hwc->state |= PERF_HES_STOPPED;
+ }
+
+ if (hwc->state & PERF_HES_UPTODATE)
+ return;
+
+ /*
+ * Clear valid bit to not count rollovers on update, rollovers
+ * are only updated in the irq handler.
+ */
+ config &= ~perf_ibs->valid_mask;
+
+ perf_ibs_event_update(perf_ibs, event, &config);
+ hwc->state |= PERF_HES_UPTODATE;
+}
+
+static int perf_ibs_add(struct perf_event *event, int flags)
+{
+ struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+ struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+
+ if (test_and_set_bit(IBS_ENABLED, pcpu->state))
+ return -ENOSPC;
+
+ event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+ pcpu->event = event;
+
+ if (flags & PERF_EF_START)
+ perf_ibs_start(event, PERF_EF_RELOAD);
+
+ return 0;
+}
+
+static void perf_ibs_del(struct perf_event *event, int flags)
+{
+ struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+ struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+
+ if (!test_and_clear_bit(IBS_ENABLED, pcpu->state))
+ return;
+
+ perf_ibs_stop(event, PERF_EF_UPDATE);
+
+ pcpu->event = NULL;
+
+ perf_event_update_userpage(event);
+}
+
+static void perf_ibs_read(struct perf_event *event) { }
+
+static int perf_ibs_check_period(struct perf_event *event, u64 value)
+{
+ struct perf_ibs *perf_ibs;
+ u64 low_nibble;
+
+ if (event->attr.freq)
+ return 0;
+
+ perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+ low_nibble = value & 0xFULL;
+
+ /*
+	 * This contradicts perf_ibs_init(), which allows a sample period
+	 * with lower nibble bits set but silently masks them off, whereas
+	 * this returns an error.
+ */
+ if (low_nibble || value < perf_ibs->min_period)
+ return -EINVAL;
+
+ return 0;
+}
+
+/*
+ * We need to initialize with empty group if all attributes in the
+ * group are dynamic.
+ */
+static struct attribute *attrs_empty[] = {
+ NULL,
+};
+
+static struct attribute_group empty_caps_group = {
+ .name = "caps",
+ .attrs = attrs_empty,
+};
+
+PMU_FORMAT_ATTR(rand_en, "config:57");
+PMU_FORMAT_ATTR(cnt_ctl, "config:19");
+PMU_FORMAT_ATTR(swfilt, "config2:0");
+PMU_EVENT_ATTR_STRING(l3missonly, fetch_l3missonly, "config:59");
+PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16");
+PMU_EVENT_ATTR_STRING(ldlat, ibs_op_ldlat_format, "config1:0-11");
+PMU_EVENT_ATTR_STRING(zen4_ibs_extensions, zen4_ibs_extensions, "1");
+PMU_EVENT_ATTR_STRING(ldlat, ibs_op_ldlat_cap, "1");
+PMU_EVENT_ATTR_STRING(dtlb_pgsize, ibs_op_dtlb_pgsize_cap, "1");
+
+static umode_t
+zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int i)
+{
+ return ibs_caps & IBS_CAPS_ZEN4 ? attr->mode : 0;
+}
+
+static umode_t
+ibs_op_ldlat_is_visible(struct kobject *kobj, struct attribute *attr, int i)
+{
+ return ibs_caps & IBS_CAPS_OPLDLAT ? attr->mode : 0;
+}
+
+static umode_t
+ibs_op_dtlb_pgsize_is_visible(struct kobject *kobj, struct attribute *attr, int i)
+{
+ return ibs_caps & IBS_CAPS_OPDTLBPGSIZE ? attr->mode : 0;
+}
+
+static struct attribute *fetch_attrs[] = {
+ &format_attr_rand_en.attr,
+ &format_attr_swfilt.attr,
+ NULL,
+};
+
+static struct attribute *fetch_l3missonly_attrs[] = {
+ &fetch_l3missonly.attr.attr,
+ NULL,
+};
+
+static struct attribute *zen4_ibs_extensions_attrs[] = {
+ &zen4_ibs_extensions.attr.attr,
+ NULL,
+};
+
+static struct attribute *ibs_op_ldlat_cap_attrs[] = {
+ &ibs_op_ldlat_cap.attr.attr,
+ NULL,
+};
+
+static struct attribute *ibs_op_dtlb_pgsize_cap_attrs[] = {
+ &ibs_op_dtlb_pgsize_cap.attr.attr,
+ NULL,
+};
+
+static struct attribute_group group_fetch_formats = {
+ .name = "format",
+ .attrs = fetch_attrs,
+};
+
+static struct attribute_group group_fetch_l3missonly = {
+ .name = "format",
+ .attrs = fetch_l3missonly_attrs,
+ .is_visible = zen4_ibs_extensions_is_visible,
+};
+
+static struct attribute_group group_zen4_ibs_extensions = {
+ .name = "caps",
+ .attrs = zen4_ibs_extensions_attrs,
+ .is_visible = zen4_ibs_extensions_is_visible,
+};
+
+static struct attribute_group group_ibs_op_ldlat_cap = {
+ .name = "caps",
+ .attrs = ibs_op_ldlat_cap_attrs,
+ .is_visible = ibs_op_ldlat_is_visible,
+};
+
+static struct attribute_group group_ibs_op_dtlb_pgsize_cap = {
+ .name = "caps",
+ .attrs = ibs_op_dtlb_pgsize_cap_attrs,
+ .is_visible = ibs_op_dtlb_pgsize_is_visible,
+};
+
+static const struct attribute_group *fetch_attr_groups[] = {
+ &group_fetch_formats,
+ &empty_caps_group,
+ NULL,
+};
+
+static const struct attribute_group *fetch_attr_update[] = {
+ &group_fetch_l3missonly,
+ &group_zen4_ibs_extensions,
+ NULL,
+};
+
+static umode_t
+cnt_ctl_is_visible(struct kobject *kobj, struct attribute *attr, int i)
+{
+ return ibs_caps & IBS_CAPS_OPCNT ? attr->mode : 0;
+}
+
+static struct attribute *op_attrs[] = {
+ &format_attr_swfilt.attr,
+ NULL,
+};
+
+static struct attribute *cnt_ctl_attrs[] = {
+ &format_attr_cnt_ctl.attr,
+ NULL,
+};
+
+static struct attribute *op_l3missonly_attrs[] = {
+ &op_l3missonly.attr.attr,
+ NULL,
+};
+
+static struct attribute_group group_op_formats = {
+ .name = "format",
+ .attrs = op_attrs,
+};
+
+static struct attribute *ibs_op_ldlat_format_attrs[] = {
+ &ibs_op_ldlat_format.attr.attr,
+ NULL,
+};
+
+static struct attribute_group group_cnt_ctl = {
+ .name = "format",
+ .attrs = cnt_ctl_attrs,
+ .is_visible = cnt_ctl_is_visible,
+};
+
+static struct attribute_group group_op_l3missonly = {
+ .name = "format",
+ .attrs = op_l3missonly_attrs,
+ .is_visible = zen4_ibs_extensions_is_visible,
+};
+
+static const struct attribute_group *op_attr_groups[] = {
+ &group_op_formats,
+ &empty_caps_group,
+ NULL,
+};
+
+static struct attribute_group group_ibs_op_ldlat_format = {
+ .name = "format",
+ .attrs = ibs_op_ldlat_format_attrs,
+ .is_visible = ibs_op_ldlat_is_visible,
+};
+
+static const struct attribute_group *op_attr_update[] = {
+ &group_cnt_ctl,
+ &group_op_l3missonly,
+ &group_zen4_ibs_extensions,
+ &group_ibs_op_ldlat_cap,
+ &group_ibs_op_ldlat_format,
+ &group_ibs_op_dtlb_pgsize_cap,
+ NULL,
+};
+
+static struct perf_ibs perf_ibs_fetch = {
+ .pmu = {
+ .task_ctx_nr = perf_hw_context,
+
+ .event_init = perf_ibs_init,
+ .add = perf_ibs_add,
+ .del = perf_ibs_del,
+ .start = perf_ibs_start,
+ .stop = perf_ibs_stop,
+ .read = perf_ibs_read,
+ .check_period = perf_ibs_check_period,
+ },
+ .msr = MSR_AMD64_IBSFETCHCTL,
+ .config_mask = IBS_FETCH_MAX_CNT | IBS_FETCH_RAND_EN,
+ .cnt_mask = IBS_FETCH_MAX_CNT,
+ .enable_mask = IBS_FETCH_ENABLE,
+ .valid_mask = IBS_FETCH_VAL,
+ .min_period = 0x10,
+ .max_period = IBS_FETCH_MAX_CNT << 4,
+ .offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK },
+ .offset_max = MSR_AMD64_IBSFETCH_REG_COUNT,
+
+ .get_count = get_ibs_fetch_count,
+};
+
+static struct perf_ibs perf_ibs_op = {
+ .pmu = {
+ .task_ctx_nr = perf_hw_context,
+
+ .event_init = perf_ibs_init,
+ .add = perf_ibs_add,
+ .del = perf_ibs_del,
+ .start = perf_ibs_start,
+ .stop = perf_ibs_stop,
+ .read = perf_ibs_read,
+ .check_period = perf_ibs_check_period,
+ },
+ .msr = MSR_AMD64_IBSOPCTL,
+ .config_mask = IBS_OP_MAX_CNT,
+ .cnt_mask = IBS_OP_MAX_CNT | IBS_OP_CUR_CNT |
+ IBS_OP_CUR_CNT_RAND,
+ .enable_mask = IBS_OP_ENABLE,
+ .valid_mask = IBS_OP_VAL,
+ .min_period = 0x90,
+ .max_period = IBS_OP_MAX_CNT << 4,
+ .offset_mask = { MSR_AMD64_IBSOP_REG_MASK },
+ .offset_max = MSR_AMD64_IBSOP_REG_COUNT,
+
+ .get_count = get_ibs_op_count,
+};
+
+static void perf_ibs_get_mem_op(union ibs_op_data3 *op_data3,
+ struct perf_sample_data *data)
+{
+ union perf_mem_data_src *data_src = &data->data_src;
+
+ data_src->mem_op = PERF_MEM_OP_NA;
+
+ if (op_data3->ld_op)
+ data_src->mem_op = PERF_MEM_OP_LOAD;
+ else if (op_data3->st_op)
+ data_src->mem_op = PERF_MEM_OP_STORE;
+}
+
+/*
+ * Processors having CPUID_Fn8000001B_EAX[11] aka IBS_CAPS_ZEN4 have
+ * more fine-grained DataSrc encodings. Others have coarse ones.
+ */
+static u8 perf_ibs_data_src(union ibs_op_data2 *op_data2)
+{
+ if (ibs_caps & IBS_CAPS_ZEN4)
+ return (op_data2->data_src_hi << 3) | op_data2->data_src_lo;
+
+ return op_data2->data_src_lo;
+}
+
+#define L(x) (PERF_MEM_S(LVL, x) | PERF_MEM_S(LVL, HIT))
+#define LN(x) PERF_MEM_S(LVLNUM, x)
+#define REM PERF_MEM_S(REMOTE, REMOTE)
+#define HOPS(x) PERF_MEM_S(HOPS, x)
+
+static u64 g_data_src[8] = {
+ [IBS_DATA_SRC_LOC_CACHE] = L(L3) | L(REM_CCE1) | LN(ANY_CACHE) | HOPS(0),
+ [IBS_DATA_SRC_DRAM] = L(LOC_RAM) | LN(RAM),
+ [IBS_DATA_SRC_REM_CACHE] = L(REM_CCE2) | LN(ANY_CACHE) | REM | HOPS(1),
+ [IBS_DATA_SRC_IO] = L(IO) | LN(IO),
+};
+
+#define RMT_NODE_BITS (1 << IBS_DATA_SRC_DRAM)
+#define RMT_NODE_APPLICABLE(x) (RMT_NODE_BITS & (1 << x))
+
+static u64 g_zen4_data_src[32] = {
+ [IBS_DATA_SRC_EXT_LOC_CACHE] = L(L3) | LN(L3),
+ [IBS_DATA_SRC_EXT_NEAR_CCX_CACHE] = L(REM_CCE1) | LN(ANY_CACHE) | REM | HOPS(0),
+ [IBS_DATA_SRC_EXT_DRAM] = L(LOC_RAM) | LN(RAM),
+ [IBS_DATA_SRC_EXT_FAR_CCX_CACHE] = L(REM_CCE2) | LN(ANY_CACHE) | REM | HOPS(1),
+ [IBS_DATA_SRC_EXT_PMEM] = LN(PMEM),
+ [IBS_DATA_SRC_EXT_IO] = L(IO) | LN(IO),
+ [IBS_DATA_SRC_EXT_EXT_MEM] = LN(CXL),
+};
+
+#define ZEN4_RMT_NODE_BITS ((1 << IBS_DATA_SRC_EXT_DRAM) | \
+ (1 << IBS_DATA_SRC_EXT_PMEM) | \
+ (1 << IBS_DATA_SRC_EXT_EXT_MEM))
+#define ZEN4_RMT_NODE_APPLICABLE(x) (ZEN4_RMT_NODE_BITS & (1 << x))
+
+static __u64 perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
+ union ibs_op_data3 *op_data3,
+ struct perf_sample_data *data)
+{
+ union perf_mem_data_src *data_src = &data->data_src;
+ u8 ibs_data_src = perf_ibs_data_src(op_data2);
+
+ data_src->mem_lvl = 0;
+ data_src->mem_lvl_num = 0;
+
+ /*
+ * DcMiss, L2Miss, DataSrc, DcMissLat etc. are all invalid for Uncached
+ * memory accesses. So, check DcUcMemAcc bit early.
+ */
+ if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO)
+ return L(UNC) | LN(UNC);
+
+ /* L1 Hit */
+ if (op_data3->dc_miss == 0)
+ return L(L1) | LN(L1);
+
+ /* L2 Hit */
+ if (op_data3->l2_miss == 0) {
+ /* Erratum #1293 */
+ if (boot_cpu_data.x86 != 0x19 || boot_cpu_data.x86_model > 0xF ||
+ !(op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc))
+ return L(L2) | LN(L2);
+ }
+
+ /*
+ * OP_DATA2 is valid only for load ops. Skip all checks which
+	 * use OP_DATA2[DataSrc].
+ */
+ if (data_src->mem_op != PERF_MEM_OP_LOAD)
+ goto check_mab;
+
+ if (ibs_caps & IBS_CAPS_ZEN4) {
+ u64 val = g_zen4_data_src[ibs_data_src];
+
+ if (!val)
+ goto check_mab;
+
+ /* HOPS_1 because IBS doesn't provide remote socket detail */
+ if (op_data2->rmt_node && ZEN4_RMT_NODE_APPLICABLE(ibs_data_src)) {
+ if (ibs_data_src == IBS_DATA_SRC_EXT_DRAM)
+ val = L(REM_RAM1) | LN(RAM) | REM | HOPS(1);
+ else
+ val |= REM | HOPS(1);
+ }
+
+ return val;
+ } else {
+ u64 val = g_data_src[ibs_data_src];
+
+ if (!val)
+ goto check_mab;
+
+ /* HOPS_1 because IBS doesn't provide remote socket detail */
+ if (op_data2->rmt_node && RMT_NODE_APPLICABLE(ibs_data_src)) {
+ if (ibs_data_src == IBS_DATA_SRC_DRAM)
+ val = L(REM_RAM1) | LN(RAM) | REM | HOPS(1);
+ else
+ val |= REM | HOPS(1);
+ }
+
+ return val;
+ }
+
+check_mab:
+ /*
+ * MAB (Miss Address Buffer) Hit. MAB keeps track of outstanding
+	 * DC misses. However, such data may come from any level in the memory
+	 * hierarchy. IBS provides details about both the MAB and the actual
+ * DataSrc simultaneously. Prioritize DataSrc over MAB, i.e. set
+ * MAB only when IBS fails to provide DataSrc.
+ */
+ if (op_data3->dc_miss_no_mab_alloc)
+ return L(LFB) | LN(LFB);
+
+ /* Don't set HIT with NA */
+ return PERF_MEM_S(LVL, NA) | LN(NA);
+}
+
+static bool perf_ibs_cache_hit_st_valid(void)
+{
+ /* 0: Uninitialized, 1: Valid, -1: Invalid */
+ static int cache_hit_st_valid;
+
+ if (unlikely(!cache_hit_st_valid)) {
+ if (boot_cpu_data.x86 == 0x19 &&
+ (boot_cpu_data.x86_model <= 0xF ||
+ (boot_cpu_data.x86_model >= 0x20 &&
+ boot_cpu_data.x86_model <= 0x5F))) {
+ cache_hit_st_valid = -1;
+ } else {
+ cache_hit_st_valid = 1;
+ }
+ }
+
+ return cache_hit_st_valid == 1;
+}
+
+static void perf_ibs_get_mem_snoop(union ibs_op_data2 *op_data2,
+ struct perf_sample_data *data)
+{
+ union perf_mem_data_src *data_src = &data->data_src;
+ u8 ibs_data_src;
+
+ data_src->mem_snoop = PERF_MEM_SNOOP_NA;
+
+ if (!perf_ibs_cache_hit_st_valid() ||
+ data_src->mem_op != PERF_MEM_OP_LOAD ||
+ data_src->mem_lvl & PERF_MEM_LVL_L1 ||
+ data_src->mem_lvl & PERF_MEM_LVL_L2 ||
+ op_data2->cache_hit_st)
+ return;
+
+ ibs_data_src = perf_ibs_data_src(op_data2);
+
+ if (ibs_caps & IBS_CAPS_ZEN4) {
+ if (ibs_data_src == IBS_DATA_SRC_EXT_LOC_CACHE ||
+ ibs_data_src == IBS_DATA_SRC_EXT_NEAR_CCX_CACHE ||
+ ibs_data_src == IBS_DATA_SRC_EXT_FAR_CCX_CACHE)
+ data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
+ } else if (ibs_data_src == IBS_DATA_SRC_LOC_CACHE) {
+ data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
+ }
+}
+
+static void perf_ibs_get_tlb_lvl(union ibs_op_data3 *op_data3,
+ struct perf_sample_data *data)
+{
+ union perf_mem_data_src *data_src = &data->data_src;
+
+ data_src->mem_dtlb = PERF_MEM_TLB_NA;
+
+ if (!op_data3->dc_lin_addr_valid)
+ return;
+
+ if ((ibs_caps & IBS_CAPS_OPDTLBPGSIZE) &&
+ !op_data3->dc_phy_addr_valid)
+ return;
+
+ if (!op_data3->dc_l1tlb_miss) {
+ data_src->mem_dtlb = PERF_MEM_TLB_L1 | PERF_MEM_TLB_HIT;
+ return;
+ }
+
+ if (!op_data3->dc_l2tlb_miss) {
+ data_src->mem_dtlb = PERF_MEM_TLB_L2 | PERF_MEM_TLB_HIT;
+ return;
+ }
+
+ data_src->mem_dtlb = PERF_MEM_TLB_L2 | PERF_MEM_TLB_MISS;
+}
+
+static void perf_ibs_get_mem_lock(union ibs_op_data3 *op_data3,
+ struct perf_sample_data *data)
+{
+ union perf_mem_data_src *data_src = &data->data_src;
+
+ data_src->mem_lock = PERF_MEM_LOCK_NA;
+
+ if (op_data3->dc_locked_op)
+ data_src->mem_lock = PERF_MEM_LOCK_LOCKED;
+}
+
+/* Be careful. Works only for contiguous MSRs. */
+#define ibs_fetch_msr_idx(msr) (msr - MSR_AMD64_IBSFETCHCTL)
+#define ibs_op_msr_idx(msr) (msr - MSR_AMD64_IBSOPCTL)
+
+static void perf_ibs_get_data_src(struct perf_ibs_data *ibs_data,
+ struct perf_sample_data *data,
+ union ibs_op_data2 *op_data2,
+ union ibs_op_data3 *op_data3)
+{
+ union perf_mem_data_src *data_src = &data->data_src;
+
+ data_src->val |= perf_ibs_get_mem_lvl(op_data2, op_data3, data);
+ perf_ibs_get_mem_snoop(op_data2, data);
+ perf_ibs_get_tlb_lvl(op_data3, data);
+ perf_ibs_get_mem_lock(op_data3, data);
+}
+
+static __u64 perf_ibs_get_op_data2(struct perf_ibs_data *ibs_data,
+ union ibs_op_data3 *op_data3)
+{
+ __u64 val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA2)];
+
+ /* Erratum #1293 */
+ if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model <= 0xF &&
+ (op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc)) {
+ /*
+ * OP_DATA2 has only two fields on Zen3: DataSrc and RmtNode.
+ * DataSrc=0 is 'No valid status' and RmtNode is invalid when
+ * DataSrc=0.
+ */
+ val = 0;
+ }
+ return val;
+}
+
+static void perf_ibs_parse_ld_st_data(__u64 sample_type,
+ struct perf_ibs_data *ibs_data,
+ struct perf_sample_data *data)
+{
+ union ibs_op_data3 op_data3;
+ union ibs_op_data2 op_data2;
+ union ibs_op_data op_data;
+
+ data->data_src.val = PERF_MEM_NA;
+ op_data3.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)];
+
+ perf_ibs_get_mem_op(&op_data3, data);
+ if (data->data_src.mem_op != PERF_MEM_OP_LOAD &&
+ data->data_src.mem_op != PERF_MEM_OP_STORE)
+ return;
+
+ op_data2.val = perf_ibs_get_op_data2(ibs_data, &op_data3);
+
+ if (sample_type & PERF_SAMPLE_DATA_SRC) {
+ perf_ibs_get_data_src(ibs_data, data, &op_data2, &op_data3);
+ data->sample_flags |= PERF_SAMPLE_DATA_SRC;
+ }
+
+ if (sample_type & PERF_SAMPLE_WEIGHT_TYPE && op_data3.dc_miss &&
+ data->data_src.mem_op == PERF_MEM_OP_LOAD) {
+ op_data.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA)];
+
+ if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) {
+ data->weight.var1_dw = op_data3.dc_miss_lat;
+ data->weight.var2_w = op_data.tag_to_ret_ctr;
+ } else if (sample_type & PERF_SAMPLE_WEIGHT) {
+ data->weight.full = op_data3.dc_miss_lat;
+ }
+ data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE;
+ }
+
+ if (sample_type & PERF_SAMPLE_ADDR && op_data3.dc_lin_addr_valid) {
+ data->addr = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCLINAD)];
+ data->sample_flags |= PERF_SAMPLE_ADDR;
+ }
+
+ if (sample_type & PERF_SAMPLE_PHYS_ADDR && op_data3.dc_phy_addr_valid) {
+ data->phys_addr = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCPHYSAD)];
+ data->sample_flags |= PERF_SAMPLE_PHYS_ADDR;
+ }
+}
+
+static bool perf_ibs_is_mem_sample_type(struct perf_ibs *perf_ibs,
+ struct perf_event *event)
+{
+ u64 sample_type = event->attr.sample_type;
+
+ return perf_ibs == &perf_ibs_op &&
+ sample_type & (PERF_SAMPLE_DATA_SRC |
+ PERF_SAMPLE_WEIGHT_TYPE |
+ PERF_SAMPLE_ADDR |
+ PERF_SAMPLE_PHYS_ADDR);
+}
+
+static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs,
+ struct perf_event *event,
+ int check_rip)
+{
+ if (event->attr.sample_type & PERF_SAMPLE_RAW ||
+ perf_ibs_is_mem_sample_type(perf_ibs, event) ||
+ perf_ibs_ldlat_event(perf_ibs, event))
+ return perf_ibs->offset_max;
+ else if (check_rip)
+ return 3;
+ return 1;
+}
+
+static bool perf_ibs_is_kernel_data_addr(struct perf_event *event,
+ struct perf_ibs_data *ibs_data)
+{
+ u64 sample_type_mask = PERF_SAMPLE_ADDR | PERF_SAMPLE_RAW;
+ union ibs_op_data3 op_data3;
+ u64 dc_lin_addr;
+
+ op_data3.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)];
+ dc_lin_addr = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCLINAD)];
+
+ return unlikely((event->attr.sample_type & sample_type_mask) &&
+ op_data3.dc_lin_addr_valid && kernel_ip(dc_lin_addr));
+}
+
+static bool perf_ibs_is_kernel_br_target(struct perf_event *event,
+ struct perf_ibs_data *ibs_data,
+ int br_target_idx)
+{
+ union ibs_op_data op_data;
+ u64 br_target;
+
+ op_data.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA)];
+ br_target = ibs_data->regs[br_target_idx];
+
+ return unlikely((event->attr.sample_type & PERF_SAMPLE_RAW) &&
+ op_data.op_brn_ret && kernel_ip(br_target));
+}
+
+static bool perf_ibs_swfilt_discard(struct perf_ibs *perf_ibs, struct perf_event *event,
+ struct pt_regs *regs, struct perf_ibs_data *ibs_data,
+ int br_target_idx)
+{
+ if (perf_exclude_event(event, regs))
+ return true;
+
+ if (perf_ibs != &perf_ibs_op || !event->attr.exclude_kernel)
+ return false;
+
+ if (perf_ibs_is_kernel_data_addr(event, ibs_data))
+ return true;
+
+ if (br_target_idx != -1 &&
+ perf_ibs_is_kernel_br_target(event, ibs_data, br_target_idx))
+ return true;
+
+ return false;
+}
+
+static void perf_ibs_phyaddr_clear(struct perf_ibs *perf_ibs,
+ struct perf_ibs_data *ibs_data)
+{
+ if (perf_ibs == &perf_ibs_op) {
+ ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)] &= ~(1ULL << 18);
+ ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCPHYSAD)] = 0;
+ return;
+ }
+
+ ibs_data->regs[ibs_fetch_msr_idx(MSR_AMD64_IBSFETCHCTL)] &= ~(1ULL << 52);
+ ibs_data->regs[ibs_fetch_msr_idx(MSR_AMD64_IBSFETCHPHYSAD)] = 0;
+}
+
+static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
+{
+ struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+ struct perf_event *event = pcpu->event;
+ struct hw_perf_event *hwc;
+ struct perf_sample_data data;
+ struct perf_raw_record raw;
+ struct pt_regs regs;
+ struct perf_ibs_data ibs_data;
+ int offset, size, check_rip, offset_max, throttle = 0;
+ unsigned int msr;
+ u64 *buf, *config, period, new_config = 0;
+ int br_target_idx = -1;
+
+ if (!test_bit(IBS_STARTED, pcpu->state)) {
+fail:
+ /*
+ * Catch spurious interrupts after stopping IBS: After
+		 * disabling IBS there could still be incoming NMIs
+		 * with samples that even have the valid bit cleared.
+		 * Mark all these NMIs as handled.
+ */
+ if (test_and_clear_bit(IBS_STOPPED, pcpu->state))
+ return 1;
+
+ return 0;
+ }
+
+ if (WARN_ON_ONCE(!event))
+ goto fail;
+
+ hwc = &event->hw;
+ msr = hwc->config_base;
+ buf = ibs_data.regs;
+ rdmsrq(msr, *buf);
+ if (!(*buf++ & perf_ibs->valid_mask))
+ goto fail;
+
+ config = &ibs_data.regs[0];
+ perf_ibs_event_update(perf_ibs, event, config);
+ perf_sample_data_init(&data, 0, hwc->last_period);
+ if (!perf_ibs_set_period(perf_ibs, hwc, &period))
+ goto out; /* no sw counter overflow */
+
+ ibs_data.caps = ibs_caps;
+ size = 1;
+ offset = 1;
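+	/* regs[0] (the control register) was already read above; continue from offset 1 */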
+ check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK));
+
+ offset_max = perf_ibs_get_offset_max(perf_ibs, event, check_rip);
+
+ do {
+ rdmsrq(msr + offset, *buf++);
+ size++;
+ offset = find_next_bit(perf_ibs->offset_mask,
+ perf_ibs->offset_max,
+ offset + 1);
+ } while (offset < offset_max);
+
+ if (perf_ibs_ldlat_event(perf_ibs, event)) {
+ union ibs_op_data3 op_data3;
+
+ op_data3.val = ibs_data.regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)];
+ /*
+		 * Opening the event fails if the load latency threshold is
+		 * outside the [128, 2048] range. Since the event has reached
+		 * the interrupt handler, we can safely assume the threshold
+		 * is within that range.
+ */
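+		/*
+		 * The threshold lives in config1 bits 0-11; with the usual
+		 * ldlat format attribute this would be set by something
+		 * like: perf record -e ibs_op/ldlat=128/
+		 */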
+ if (!op_data3.ld_op || !op_data3.dc_miss ||
+ op_data3.dc_miss_lat <= (event->attr.config1 & 0xFFF))
+ goto out;
+ }
+
+ /*
+	 * Read IbsBrTarget, IbsOpData4, and IbsExtdCtl separately,
+	 * depending on their availability. They can't be folded into
+	 * offset_max as their MSR offsets are not contiguous.
+ */
+ if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+ if (perf_ibs == &perf_ibs_op) {
+ if (ibs_caps & IBS_CAPS_BRNTRGT) {
+ rdmsrq(MSR_AMD64_IBSBRTARGET, *buf++);
+ br_target_idx = size;
+ size++;
+ }
+ if (ibs_caps & IBS_CAPS_OPDATA4) {
+ rdmsrq(MSR_AMD64_IBSOPDATA4, *buf++);
+ size++;
+ }
+ }
+ if (perf_ibs == &perf_ibs_fetch && (ibs_caps & IBS_CAPS_FETCHCTLEXTD)) {
+ rdmsrq(MSR_AMD64_ICIBSEXTDCTL, *buf++);
+ size++;
+ }
+ }
+ ibs_data.size = sizeof(u64) * size;
+
+ regs = *iregs;
+ if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) {
+ regs.flags &= ~PERF_EFLAGS_EXACT;
+ } else {
+ /* Workaround for erratum #1197 */
+ if (perf_ibs->fetch_ignore_if_zero_rip && !(ibs_data.regs[1]))
+ goto out;
+
+ set_linear_ip(&regs, ibs_data.regs[1]);
+ regs.flags |= PERF_EFLAGS_EXACT;
+ }
+
+ if ((event->attr.config2 & IBS_SW_FILTER_MASK) &&
+ perf_ibs_swfilt_discard(perf_ibs, event, &regs, &ibs_data, br_target_idx)) {
+ throttle = perf_event_account_interrupt(event);
+ goto out;
+ }
+ /*
+ * Prevent leaking physical addresses to unprivileged users. Skip
+ * PERF_SAMPLE_PHYS_ADDR check since generic code prevents it for
+ * unprivileged users.
+ */
+ if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
+ perf_allow_kernel()) {
+ perf_ibs_phyaddr_clear(perf_ibs, &ibs_data);
+ }
+
+ if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+ raw = (struct perf_raw_record){
+ .frag = {
+ .size = sizeof(u32) + ibs_data.size,
+ .data = ibs_data.data,
+ },
+ };
+ perf_sample_save_raw_data(&data, event, &raw);
+ }
+
+ if (perf_ibs == &perf_ibs_op)
+ perf_ibs_parse_ld_st_data(event->attr.sample_type, &ibs_data, &data);
+
+ /*
+	 * The rip recorded by IbsOpRip will not be consistent with the rsp
+	 * and rbp recorded as part of the interrupt regs. Thus we need to
+	 * use the rip from the interrupt regs while unwinding the call stack.
+ */
+ perf_sample_save_callchain(&data, event, iregs);
+
+ throttle = perf_event_overflow(event, &data, &regs);
+
+ if (event->attr.freq && hwc->sample_period < perf_ibs->min_period)
+ hwc->sample_period = perf_ibs->min_period;
+
+out:
+ if (!throttle) {
+ if (perf_ibs == &perf_ibs_op) {
+ if (ibs_caps & IBS_CAPS_OPCNTEXT) {
+ new_config = period & IBS_OP_MAX_CNT_EXT_MASK;
+ period &= ~IBS_OP_MAX_CNT_EXT_MASK;
+ }
+ if ((ibs_caps & IBS_CAPS_RDWROPCNT) && (*config & IBS_OP_CNT_CTL))
+ new_config |= *config & IBS_OP_CUR_CNT_RAND;
+ }
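+		/* The MaxCnt fields omit the low 4 bits of the period, hence the shift */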
+ new_config |= period >> 4;
+
+ perf_ibs_enable_event(perf_ibs, hwc, new_config);
+ }
+
+ perf_event_update_userpage(event);
+
+ return 1;
+}
+
+static int
+perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
+{
+ u64 stamp = sched_clock();
+ int handled = 0;
+
+ handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs);
+ handled += perf_ibs_handle_irq(&perf_ibs_op, regs);
+
+ if (handled)
+ inc_irq_stat(apic_perf_irqs);
+
+ perf_sample_event_took(sched_clock() - stamp);
+
+ return handled;
+}
+NOKPROBE_SYMBOL(perf_ibs_nmi_handler);
+
+static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
+{
+ struct cpu_perf_ibs __percpu *pcpu;
+ int ret;
+
+ pcpu = alloc_percpu(struct cpu_perf_ibs);
+ if (!pcpu)
+ return -ENOMEM;
+
+ perf_ibs->pcpu = pcpu;
+
+ ret = perf_pmu_register(&perf_ibs->pmu, name, -1);
+ if (ret) {
+ perf_ibs->pcpu = NULL;
+ free_percpu(pcpu);
+ }
+
+ return ret;
+}
+
+static __init int perf_ibs_fetch_init(void)
+{
+ /*
+ * Some chips fail to reset the fetch count when it is written; instead
+ * they need a 0-1 transition of IbsFetchEn.
+ */
+ if (boot_cpu_data.x86 >= 0x16 && boot_cpu_data.x86 <= 0x18)
+ perf_ibs_fetch.fetch_count_reset_broken = 1;
+
+ if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model < 0x10)
+ perf_ibs_fetch.fetch_ignore_if_zero_rip = 1;
+
+ if (ibs_caps & IBS_CAPS_ZEN4)
+ perf_ibs_fetch.config_mask |= IBS_FETCH_L3MISSONLY;
+
+ perf_ibs_fetch.pmu.attr_groups = fetch_attr_groups;
+ perf_ibs_fetch.pmu.attr_update = fetch_attr_update;
+
+ return perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch");
+}
+
+static __init int perf_ibs_op_init(void)
+{
+ if (ibs_caps & IBS_CAPS_OPCNT)
+ perf_ibs_op.config_mask |= IBS_OP_CNT_CTL;
+
+ if (ibs_caps & IBS_CAPS_OPCNTEXT) {
+ perf_ibs_op.max_period |= IBS_OP_MAX_CNT_EXT_MASK;
+ perf_ibs_op.config_mask |= IBS_OP_MAX_CNT_EXT_MASK;
+ perf_ibs_op.cnt_mask |= (IBS_OP_MAX_CNT_EXT_MASK |
+ IBS_OP_CUR_CNT_EXT_MASK);
+ }
+
+ if (ibs_caps & IBS_CAPS_ZEN4)
+ perf_ibs_op.config_mask |= IBS_OP_L3MISSONLY;
+
+ perf_ibs_op.pmu.attr_groups = op_attr_groups;
+ perf_ibs_op.pmu.attr_update = op_attr_update;
+
+ return perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
+}
+
+static __init int perf_event_ibs_init(void)
+{
+ int ret;
+
+ ret = perf_ibs_fetch_init();
+ if (ret)
+ return ret;
+
+ ret = perf_ibs_op_init();
+ if (ret)
+ goto err_op;
+
+ ret = register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs");
+ if (ret)
+ goto err_nmi;
+
+ pr_info("perf: AMD IBS detected (0x%08x)\n", ibs_caps);
+ return 0;
+
+err_nmi:
+ perf_pmu_unregister(&perf_ibs_op.pmu);
+ free_percpu(perf_ibs_op.pcpu);
+ perf_ibs_op.pcpu = NULL;
+err_op:
+ perf_pmu_unregister(&perf_ibs_fetch.pmu);
+ free_percpu(perf_ibs_fetch.pcpu);
+ perf_ibs_fetch.pcpu = NULL;
+
+ return ret;
+}
+
+#else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) */
+
+static __init int perf_event_ibs_init(void)
+{
+ return 0;
+}
+
+#endif
+
+/* IBS - apic initialization, for perf and oprofile */
+
+static __init u32 __get_ibs_caps(void)
+{
+ u32 caps;
+ unsigned int max_level;
+
+ if (!boot_cpu_has(X86_FEATURE_IBS))
+ return 0;
+
+ /* check IBS cpuid feature flags */
+ max_level = cpuid_eax(0x80000000);
+ if (max_level < IBS_CPUID_FEATURES)
+ return IBS_CAPS_DEFAULT;
+
+ caps = cpuid_eax(IBS_CPUID_FEATURES);
+ if (!(caps & IBS_CAPS_AVAIL))
+ /* cpuid flags not valid */
+ return IBS_CAPS_DEFAULT;
+
+ return caps;
+}
+
+u32 get_ibs_caps(void)
+{
+ return ibs_caps;
+}
+EXPORT_SYMBOL(get_ibs_caps);
+
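+/*
+ * setup_APIC_eilvt() returns 0 on success, so invert its result to get a
+ * boolean "offset successfully (un)reserved" answer.
+ */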
+static inline int get_eilvt(int offset)
+{
+ return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1);
+}
+
+static inline int put_eilvt(int offset)
+{
+ return !setup_APIC_eilvt(offset, 0, 0, 1);
+}
+
+/*
+ * Check and reserve APIC extended interrupt LVT offset for IBS if available.
+ */
+static inline int ibs_eilvt_valid(void)
+{
+ int offset;
+ u64 val;
+ int valid = 0;
+
+ preempt_disable();
+
+ rdmsrq(MSR_AMD64_IBSCTL, val);
+ offset = val & IBSCTL_LVT_OFFSET_MASK;
+
+ if (!(val & IBSCTL_LVT_OFFSET_VALID)) {
+ pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n",
+ smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
+ goto out;
+ }
+
+ if (!get_eilvt(offset)) {
+ pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n",
+ smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
+ goto out;
+ }
+
+ valid = 1;
+out:
+ preempt_enable();
+
+ return valid;
+}
+
+static int setup_ibs_ctl(int ibs_eilvt_off)
+{
+ struct pci_dev *cpu_cfg;
+ int nodes;
+ u32 value = 0;
+
+ nodes = 0;
+ cpu_cfg = NULL;
+ do {
+ cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD,
+ PCI_DEVICE_ID_AMD_10H_NB_MISC,
+ cpu_cfg);
+ if (!cpu_cfg)
+ break;
+ ++nodes;
+ pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off
+ | IBSCTL_LVT_OFFSET_VALID);
+ pci_read_config_dword(cpu_cfg, IBSCTL, &value);
+ if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) {
+ pci_dev_put(cpu_cfg);
+ pr_debug("Failed to setup IBS LVT offset, IBSCTL = 0x%08x\n",
+ value);
+ return -EINVAL;
+ }
+ } while (1);
+
+ if (!nodes) {
+ pr_debug("No CPU node configured for IBS\n");
+ return -ENODEV;
+ }
+
+ return 0;
+}
+
+/*
+ * This runs only on the current cpu. We try to find an LVT offset and
+ * set up the local APIC. For this we must disable preemption. On
+ * success we initialize all nodes with this offset. This then updates
+ * the offset in the per-node IBS_CTL MSR. The per-core APIC setup of
+ * the IBS interrupt vector is handled by perf_ibs_cpu_notifier, which
+ * uses the new offset.
+ */
+static void force_ibs_eilvt_setup(void)
+{
+ int offset;
+ int ret;
+
+ preempt_disable();
+	/* Find the next available EILVT entry, skip offset 0 */
+ for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) {
+ if (get_eilvt(offset))
+ break;
+ }
+ preempt_enable();
+
+ if (offset == APIC_EILVT_NR_MAX) {
+ pr_debug("No EILVT entry available\n");
+ return;
+ }
+
+ ret = setup_ibs_ctl(offset);
+ if (ret)
+ goto out;
+
+ if (!ibs_eilvt_valid())
+ goto out;
+
+ pr_info("LVT offset %d assigned\n", offset);
+
+ return;
+out:
+ preempt_disable();
+ put_eilvt(offset);
+ preempt_enable();
+ return;
+}
+
+static void ibs_eilvt_setup(void)
+{
+ /*
+	 * Force LVT offset assignment for family 10h: the offsets are
+	 * not assigned by the BIOS for this family, so the OS is
+	 * responsible for doing it. If the OS assignment fails, fall
+	 * back to the BIOS settings and try to set things up with those.
+ */
+ if (boot_cpu_data.x86 == 0x10)
+ force_ibs_eilvt_setup();
+}
+
+static inline int get_ibs_lvt_offset(void)
+{
+ u64 val;
+
+ rdmsrq(MSR_AMD64_IBSCTL, val);
+ if (!(val & IBSCTL_LVT_OFFSET_VALID))
+ return -EINVAL;
+
+ return val & IBSCTL_LVT_OFFSET_MASK;
+}
+
+static void setup_APIC_ibs(void)
+{
+ int offset;
+
+ offset = get_ibs_lvt_offset();
+ if (offset < 0)
+ goto failed;
+
+ if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0))
+ return;
+failed:
+ pr_warn("perf: IBS APIC setup failed on cpu #%d\n",
+ smp_processor_id());
+}
+
+static void clear_APIC_ibs(void)
+{
+ int offset;
+
+ offset = get_ibs_lvt_offset();
+ if (offset >= 0)
+ setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1);
+}
+
+static int x86_pmu_amd_ibs_starting_cpu(unsigned int cpu)
+{
+ setup_APIC_ibs();
+ return 0;
+}
+
+#ifdef CONFIG_PM
+
+static int perf_ibs_suspend(void)
+{
+	clear_APIC_ibs();
+	return 0;
+}
+
+static void perf_ibs_resume(void)
+{
+	ibs_eilvt_setup();
+	setup_APIC_ibs();
+}
+
+static struct syscore_ops perf_ibs_syscore_ops = {
+	.resume		= perf_ibs_resume,
+	.suspend	= perf_ibs_suspend,
+};
+
+static void perf_ibs_pm_init(void)
+{
+	register_syscore_ops(&perf_ibs_syscore_ops);
+}
+
+#else
+
+static inline void perf_ibs_pm_init(void) { }
+
+#endif
+
+static int x86_pmu_amd_ibs_dying_cpu(unsigned int cpu)
+{
+ clear_APIC_ibs();
+ return 0;
+}
+
+static __init int amd_ibs_init(void)
+{
+ u32 caps;
+
+ caps = __get_ibs_caps();
+ if (!caps)
+ return -ENODEV; /* ibs not supported by the cpu */
+
+ ibs_eilvt_setup();
+
+ if (!ibs_eilvt_valid())
+ return -EINVAL;
+
+ perf_ibs_pm_init();
+
+ ibs_caps = caps;
+ /* make ibs_caps visible to other cpus: */
+ smp_mb();
+ /*
+ * x86_pmu_amd_ibs_starting_cpu will be called from core on
+ * all online cpus.
+ */
+ cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_IBS_STARTING,
+ "perf/x86/amd/ibs:starting",
+ x86_pmu_amd_ibs_starting_cpu,
+ x86_pmu_amd_ibs_dying_cpu);
+
+ return perf_event_ibs_init();
+}
+
+/* Since we need the pci subsystem to init ibs we can't do this earlier: */
+device_initcall(amd_ibs_init);
diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c
new file mode 100644
index 000000000000..a721da9987dd
--- /dev/null
+++ b/arch/x86/events/amd/iommu.c
@@ -0,0 +1,491 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2013 Advanced Micro Devices, Inc.
+ *
+ * Author: Steven Kinney <Steven.Kinney@amd.com>
+ * Author: Suravee Suthikulpanit <Suraveee.Suthikulpanit@amd.com>
+ *
+ * Perf: amd_iommu - AMD IOMMU Performance Counter PMU implementation
+ */
+
+#define pr_fmt(fmt) "perf/amd_iommu: " fmt
+
+#include <linux/perf_event.h>
+#include <linux/init.h>
+#include <linux/cpumask.h>
+#include <linux/slab.h>
+#include <linux/amd-iommu.h>
+
+#include <asm/msr.h>
+
+#include "../perf_event.h"
+#include "iommu.h"
+
+/* iommu pmu conf masks */
+#define GET_CSOURCE(x) ((x)->conf & 0xFFULL)
+#define GET_DEVID(x) (((x)->conf >> 8) & 0xFFFFULL)
+#define GET_DOMID(x) (((x)->conf >> 24) & 0xFFFFULL)
+#define GET_PASID(x) (((x)->conf >> 40) & 0xFFFFFULL)
+
+/* iommu pmu conf1 masks */
+#define GET_DEVID_MASK(x) ((x)->conf1 & 0xFFFFULL)
+#define GET_DOMID_MASK(x) (((x)->conf1 >> 16) & 0xFFFFULL)
+#define GET_PASID_MASK(x) (((x)->conf1 >> 32) & 0xFFFFFULL)
+
+#define IOMMU_NAME_SIZE 24
+
+struct perf_amd_iommu {
+ struct list_head list;
+ struct pmu pmu;
+ struct amd_iommu *iommu;
+ char name[IOMMU_NAME_SIZE];
+ u8 max_banks;
+ u8 max_counters;
+ u64 cntr_assign_mask;
+ raw_spinlock_t lock;
+};
+
+static LIST_HEAD(perf_amd_iommu_list);
+
+/*---------------------------------------------
+ * sysfs format attributes
+ *---------------------------------------------*/
+PMU_FORMAT_ATTR(csource, "config:0-7");
+PMU_FORMAT_ATTR(devid, "config:8-23");
+PMU_FORMAT_ATTR(domid, "config:24-39");
+PMU_FORMAT_ATTR(pasid, "config:40-59");
+PMU_FORMAT_ATTR(devid_mask, "config1:0-15");
+PMU_FORMAT_ATTR(domid_mask, "config1:16-31");
+PMU_FORMAT_ATTR(pasid_mask, "config1:32-51");
+
+static struct attribute *iommu_format_attrs[] = {
+ &format_attr_csource.attr,
+ &format_attr_devid.attr,
+ &format_attr_pasid.attr,
+ &format_attr_domid.attr,
+ &format_attr_devid_mask.attr,
+ &format_attr_pasid_mask.attr,
+ &format_attr_domid_mask.attr,
+ NULL,
+};
+
+static struct attribute_group amd_iommu_format_group = {
+ .name = "format",
+ .attrs = iommu_format_attrs,
+};
+
+/*---------------------------------------------
+ * sysfs events attributes
+ *---------------------------------------------*/
+static struct attribute_group amd_iommu_events_group = {
+ .name = "events",
+};
+
+struct amd_iommu_event_desc {
+ struct device_attribute attr;
+ const char *event;
+};
+
+static ssize_t _iommu_event_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct amd_iommu_event_desc *event =
+ container_of(attr, struct amd_iommu_event_desc, attr);
+ return sprintf(buf, "%s\n", event->event);
+}
+
+#define AMD_IOMMU_EVENT_DESC(_name, _event) \
+{ \
+ .attr = __ATTR(_name, 0444, _iommu_event_show, NULL), \
+ .event = _event, \
+}
+
+static struct amd_iommu_event_desc amd_iommu_v2_event_descs[] = {
+ AMD_IOMMU_EVENT_DESC(mem_pass_untrans, "csource=0x01"),
+ AMD_IOMMU_EVENT_DESC(mem_pass_pretrans, "csource=0x02"),
+ AMD_IOMMU_EVENT_DESC(mem_pass_excl, "csource=0x03"),
+ AMD_IOMMU_EVENT_DESC(mem_target_abort, "csource=0x04"),
+ AMD_IOMMU_EVENT_DESC(mem_trans_total, "csource=0x05"),
+ AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_hit, "csource=0x06"),
+ AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_mis, "csource=0x07"),
+ AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_hit, "csource=0x08"),
+ AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_mis, "csource=0x09"),
+ AMD_IOMMU_EVENT_DESC(mem_dte_hit, "csource=0x0a"),
+ AMD_IOMMU_EVENT_DESC(mem_dte_mis, "csource=0x0b"),
+ AMD_IOMMU_EVENT_DESC(page_tbl_read_tot, "csource=0x0c"),
+ AMD_IOMMU_EVENT_DESC(page_tbl_read_nst, "csource=0x0d"),
+ AMD_IOMMU_EVENT_DESC(page_tbl_read_gst, "csource=0x0e"),
+ AMD_IOMMU_EVENT_DESC(int_dte_hit, "csource=0x0f"),
+ AMD_IOMMU_EVENT_DESC(int_dte_mis, "csource=0x10"),
+ AMD_IOMMU_EVENT_DESC(cmd_processed, "csource=0x11"),
+ AMD_IOMMU_EVENT_DESC(cmd_processed_inv, "csource=0x12"),
+ AMD_IOMMU_EVENT_DESC(tlb_inv, "csource=0x13"),
+ AMD_IOMMU_EVENT_DESC(ign_rd_wr_mmio_1ff8h, "csource=0x14"),
+ AMD_IOMMU_EVENT_DESC(vapic_int_non_guest, "csource=0x15"),
+ AMD_IOMMU_EVENT_DESC(vapic_int_guest, "csource=0x16"),
+ AMD_IOMMU_EVENT_DESC(smi_recv, "csource=0x17"),
+ AMD_IOMMU_EVENT_DESC(smi_blk, "csource=0x18"),
+ { /* end: all zeroes */ },
+};
+
+/*---------------------------------------------
+ * sysfs cpumask attributes
+ *---------------------------------------------*/
+static cpumask_t iommu_cpumask;
+
+static ssize_t _iommu_cpumask_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return cpumap_print_to_pagebuf(true, buf, &iommu_cpumask);
+}
+static DEVICE_ATTR(cpumask, S_IRUGO, _iommu_cpumask_show, NULL);
+
+static struct attribute *iommu_cpumask_attrs[] = {
+ &dev_attr_cpumask.attr,
+ NULL,
+};
+
+static struct attribute_group amd_iommu_cpumask_group = {
+ .attrs = iommu_cpumask_attrs,
+};
+
+/*---------------------------------------------*/
+
+static int get_next_avail_iommu_bnk_cntr(struct perf_event *event)
+{
+ struct perf_amd_iommu *piommu = container_of(event->pmu, struct perf_amd_iommu, pmu);
+ int max_cntrs = piommu->max_counters;
+ int max_banks = piommu->max_banks;
+ u32 shift, bank, cntr;
+ unsigned long flags;
+ int retval;
+
+ raw_spin_lock_irqsave(&piommu->lock, flags);
+
+ for (bank = 0; bank < max_banks; bank++) {
+ for (cntr = 0; cntr < max_cntrs; cntr++) {
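+			/* cntr_assign_mask packs 4 counters per bank: shift = bank * 4 + cntr */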
+ shift = bank + (bank*3) + cntr;
+ if (piommu->cntr_assign_mask & BIT_ULL(shift)) {
+ continue;
+ } else {
+ piommu->cntr_assign_mask |= BIT_ULL(shift);
+ event->hw.iommu_bank = bank;
+ event->hw.iommu_cntr = cntr;
+ retval = 0;
+ goto out;
+ }
+ }
+ }
+ retval = -ENOSPC;
+out:
+ raw_spin_unlock_irqrestore(&piommu->lock, flags);
+ return retval;
+}
+
+static int clear_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu,
+ u8 bank, u8 cntr)
+{
+ unsigned long flags;
+ int max_banks, max_cntrs;
+ int shift = 0;
+
+ max_banks = perf_iommu->max_banks;
+ max_cntrs = perf_iommu->max_counters;
+
+ if ((bank > max_banks) || (cntr > max_cntrs))
+ return -EINVAL;
+
+ shift = bank + cntr + (bank*3);
+
+ raw_spin_lock_irqsave(&perf_iommu->lock, flags);
+ perf_iommu->cntr_assign_mask &= ~(1ULL<<shift);
+ raw_spin_unlock_irqrestore(&perf_iommu->lock, flags);
+
+ return 0;
+}
+
+static int perf_iommu_event_init(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+	/* Test the event attr type for PMU enumeration */
+ if (event->attr.type != event->pmu->type)
+ return -ENOENT;
+
+ /*
+ * IOMMU counters are shared across all cores.
+	 * Therefore, they do not support per-process mode.
+	 * Also, they do not support event sampling mode.
+ */
+ if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
+ return -EINVAL;
+
+ if (event->cpu < 0)
+ return -EINVAL;
+
+ /* update the hw_perf_event struct with the iommu config data */
+ hwc->conf = event->attr.config;
+ hwc->conf1 = event->attr.config1;
+
+ return 0;
+}
+
+static inline struct amd_iommu *perf_event_2_iommu(struct perf_event *ev)
+{
+ return (container_of(ev->pmu, struct perf_amd_iommu, pmu))->iommu;
+}
+
+static void perf_iommu_enable_event(struct perf_event *ev)
+{
+ struct amd_iommu *iommu = perf_event_2_iommu(ev);
+ struct hw_perf_event *hwc = &ev->hw;
+ u8 bank = hwc->iommu_bank;
+ u8 cntr = hwc->iommu_cntr;
+ u64 reg = 0ULL;
+
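+	/*
+	 * Each match register below takes the match value in its lower half
+	 * and the mask in its upper half; BIT(31) is presumably the
+	 * match-enable bit, set whenever a value or mask was requested.
+	 */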
+ reg = GET_CSOURCE(hwc);
+ amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_COUNTER_SRC_REG, &reg);
+
+ reg = GET_DEVID_MASK(hwc);
+ reg = GET_DEVID(hwc) | (reg << 32);
+ if (reg)
+ reg |= BIT(31);
+ amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_DEVID_MATCH_REG, &reg);
+
+ reg = GET_PASID_MASK(hwc);
+ reg = GET_PASID(hwc) | (reg << 32);
+ if (reg)
+ reg |= BIT(31);
+ amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_PASID_MATCH_REG, &reg);
+
+ reg = GET_DOMID_MASK(hwc);
+ reg = GET_DOMID(hwc) | (reg << 32);
+ if (reg)
+ reg |= BIT(31);
+ amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_DOMID_MATCH_REG, &reg);
+}
+
+static void perf_iommu_disable_event(struct perf_event *event)
+{
+ struct amd_iommu *iommu = perf_event_2_iommu(event);
+ struct hw_perf_event *hwc = &event->hw;
+ u64 reg = 0ULL;
+
+ amd_iommu_pc_set_reg(iommu, hwc->iommu_bank, hwc->iommu_cntr,
+ IOMMU_PC_COUNTER_SRC_REG, &reg);
+}
+
+static void perf_iommu_start(struct perf_event *event, int flags)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
+ return;
+
+ WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
+ hwc->state = 0;
+
+ /*
+	 * To account for power-gating, which prevents writes to
+	 * the counter, we need to enable the counter
+	 * before setting up the counter register.
+ */
+ perf_iommu_enable_event(event);
+
+ if (flags & PERF_EF_RELOAD) {
+ u64 count = 0;
+ struct amd_iommu *iommu = perf_event_2_iommu(event);
+
+ /*
+		 * Since the IOMMU PMU only supports counting mode,
+		 * the counter always starts with value zero.
+ */
+ amd_iommu_pc_set_reg(iommu, hwc->iommu_bank, hwc->iommu_cntr,
+ IOMMU_PC_COUNTER_REG, &count);
+ }
+
+ perf_event_update_userpage(event);
+}
+
+static void perf_iommu_read(struct perf_event *event)
+{
+ u64 count;
+ struct hw_perf_event *hwc = &event->hw;
+ struct amd_iommu *iommu = perf_event_2_iommu(event);
+
+ if (amd_iommu_pc_get_reg(iommu, hwc->iommu_bank, hwc->iommu_cntr,
+ IOMMU_PC_COUNTER_REG, &count))
+ return;
+
+ /* IOMMU pc counter register is only 48 bits */
+ count &= GENMASK_ULL(47, 0);
+
+ /*
+	 * Since the counter always starts with value zero,
+	 * simply accumulate the count for the event.
+ */
+ local64_add(count, &event->count);
+}
+
+static void perf_iommu_stop(struct perf_event *event, int flags)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (hwc->state & PERF_HES_UPTODATE)
+ return;
+
+ /*
+ * To account for power-gating, in which reading the counter would
+ * return zero, we need to read the register before disabling.
+ */
+ perf_iommu_read(event);
+ hwc->state |= PERF_HES_UPTODATE;
+
+ perf_iommu_disable_event(event);
+ WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+ hwc->state |= PERF_HES_STOPPED;
+}
+
+static int perf_iommu_add(struct perf_event *event, int flags)
+{
+ int retval;
+
+ event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+ /* request an iommu bank/counter */
+ retval = get_next_avail_iommu_bnk_cntr(event);
+ if (retval)
+ return retval;
+
+ if (flags & PERF_EF_START)
+ perf_iommu_start(event, PERF_EF_RELOAD);
+
+ return 0;
+}
+
+static void perf_iommu_del(struct perf_event *event, int flags)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct perf_amd_iommu *perf_iommu =
+ container_of(event->pmu, struct perf_amd_iommu, pmu);
+
+ perf_iommu_stop(event, PERF_EF_UPDATE);
+
+ /* clear the assigned iommu bank/counter */
+ clear_avail_iommu_bnk_cntr(perf_iommu,
+ hwc->iommu_bank, hwc->iommu_cntr);
+
+ perf_event_update_userpage(event);
+}
+
+static __init int _init_events_attrs(void)
+{
+ int i = 0, j;
+ struct attribute **attrs;
+
+ while (amd_iommu_v2_event_descs[i].attr.attr.name)
+ i++;
+
+ attrs = kcalloc(i + 1, sizeof(*attrs), GFP_KERNEL);
+ if (!attrs)
+ return -ENOMEM;
+
+ for (j = 0; j < i; j++)
+ attrs[j] = &amd_iommu_v2_event_descs[j].attr.attr;
+
+ amd_iommu_events_group.attrs = attrs;
+ return 0;
+}
+
+static const struct attribute_group *amd_iommu_attr_groups[] = {
+ &amd_iommu_format_group,
+ &amd_iommu_cpumask_group,
+ &amd_iommu_events_group,
+ NULL,
+};
+
+static const struct pmu iommu_pmu __initconst = {
+ .event_init = perf_iommu_event_init,
+ .add = perf_iommu_add,
+ .del = perf_iommu_del,
+ .start = perf_iommu_start,
+ .stop = perf_iommu_stop,
+ .read = perf_iommu_read,
+ .task_ctx_nr = perf_invalid_context,
+ .attr_groups = amd_iommu_attr_groups,
+ .capabilities = PERF_PMU_CAP_NO_EXCLUDE,
+};
+
+static __init int init_one_iommu(unsigned int idx)
+{
+ struct perf_amd_iommu *perf_iommu;
+ int ret;
+
+ perf_iommu = kzalloc(sizeof(struct perf_amd_iommu), GFP_KERNEL);
+ if (!perf_iommu)
+ return -ENOMEM;
+
+ raw_spin_lock_init(&perf_iommu->lock);
+
+ perf_iommu->pmu = iommu_pmu;
+ perf_iommu->iommu = get_amd_iommu(idx);
+ perf_iommu->max_banks = amd_iommu_pc_get_max_banks(idx);
+ perf_iommu->max_counters = amd_iommu_pc_get_max_counters(idx);
+
+ if (!perf_iommu->iommu ||
+ !perf_iommu->max_banks ||
+ !perf_iommu->max_counters) {
+ kfree(perf_iommu);
+ return -EINVAL;
+ }
+
+ snprintf(perf_iommu->name, IOMMU_NAME_SIZE, "amd_iommu_%u", idx);
+
+ ret = perf_pmu_register(&perf_iommu->pmu, perf_iommu->name, -1);
+ if (!ret) {
+ pr_info("Detected AMD IOMMU #%d (%d banks, %d counters/bank).\n",
+ idx, perf_iommu->max_banks, perf_iommu->max_counters);
+ list_add_tail(&perf_iommu->list, &perf_amd_iommu_list);
+ } else {
+ pr_warn("Error initializing IOMMU %d.\n", idx);
+ kfree(perf_iommu);
+ }
+ return ret;
+}
+
+static __init int amd_iommu_pc_init(void)
+{
+ unsigned int i, cnt = 0;
+ int ret;
+
+ /* Make sure the IOMMU PC resource is available */
+ if (!amd_iommu_pc_supported())
+ return -ENODEV;
+
+ ret = _init_events_attrs();
+ if (ret)
+ return ret;
+
+ /*
+ * An IOMMU PMU is specific to an IOMMU, and can function independently.
+	 * So we go through all IOMMUs and ignore the ones that fail init,
+	 * unless all IOMMUs fail.
+ */
+ for (i = 0; i < amd_iommu_get_num_iommus(); i++) {
+ ret = init_one_iommu(i);
+ if (!ret)
+ cnt++;
+ }
+
+ if (!cnt) {
+ kfree(amd_iommu_events_group.attrs);
+ return -ENODEV;
+ }
+
+ /* Init cpumask attributes to only core 0 */
+ cpumask_set_cpu(0, &iommu_cpumask);
+ return 0;
+}
+
+device_initcall(amd_iommu_pc_init);
diff --git a/arch/x86/events/amd/iommu.h b/arch/x86/events/amd/iommu.h
new file mode 100644
index 000000000000..e6310c635c8b
--- /dev/null
+++ b/arch/x86/events/amd/iommu.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2013 Advanced Micro Devices, Inc.
+ *
+ * Author: Steven Kinney <Steven.Kinney@amd.com>
+ * Author: Suravee Suthikulpanit <Suraveee.Suthikulpanit@amd.com>
+ */
+
+#ifndef _PERF_EVENT_AMD_IOMMU_H_
+#define _PERF_EVENT_AMD_IOMMU_H_
+
+/* iommu pc mmio region register indexes */
+#define IOMMU_PC_COUNTER_REG 0x00
+#define IOMMU_PC_COUNTER_SRC_REG 0x08
+#define IOMMU_PC_PASID_MATCH_REG 0x10
+#define IOMMU_PC_DOMID_MATCH_REG 0x18
+#define IOMMU_PC_DEVID_MATCH_REG 0x20
+#define IOMMU_PC_COUNTER_REPORT_REG 0x28
+
+/* maximum specified bank/counters */
+#define PC_MAX_SPEC_BNKS 64
+#define PC_MAX_SPEC_CNTRS 16
+
+#endif /*_PERF_EVENT_AMD_IOMMU_H_*/
diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c
new file mode 100644
index 000000000000..d24da377df77
--- /dev/null
+++ b/arch/x86/events/amd/lbr.c
@@ -0,0 +1,436 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/perf_event.h>
+#include <asm/msr.h>
+#include <asm/perf_event.h>
+
+#include "../perf_event.h"
+
+/* LBR Branch Select valid bits */
+#define LBR_SELECT_MASK 0x1ff
+
+/*
+ * LBR Branch Select filter bits which, when set, ensure that the
+ * corresponding types of branches are not recorded
+ */
+#define LBR_SELECT_KERNEL 0 /* Branches ending in CPL = 0 */
+#define LBR_SELECT_USER 1 /* Branches ending in CPL > 0 */
+#define LBR_SELECT_JCC 2 /* Conditional branches */
+#define LBR_SELECT_CALL_NEAR_REL 3 /* Near relative calls */
+#define LBR_SELECT_CALL_NEAR_IND 4 /* Indirect relative calls */
+#define LBR_SELECT_RET_NEAR 5 /* Near returns */
+#define LBR_SELECT_JMP_NEAR_IND 6 /* Near indirect jumps (excl. calls and returns) */
+#define LBR_SELECT_JMP_NEAR_REL 7 /* Near relative jumps (excl. calls) */
+#define LBR_SELECT_FAR_BRANCH 8 /* Far branches */
+
+#define LBR_KERNEL BIT(LBR_SELECT_KERNEL)
+#define LBR_USER BIT(LBR_SELECT_USER)
+#define LBR_JCC BIT(LBR_SELECT_JCC)
+#define LBR_REL_CALL BIT(LBR_SELECT_CALL_NEAR_REL)
+#define LBR_IND_CALL BIT(LBR_SELECT_CALL_NEAR_IND)
+#define LBR_RETURN BIT(LBR_SELECT_RET_NEAR)
+#define LBR_REL_JMP BIT(LBR_SELECT_JMP_NEAR_REL)
+#define LBR_IND_JMP BIT(LBR_SELECT_JMP_NEAR_IND)
+#define LBR_FAR BIT(LBR_SELECT_FAR_BRANCH)
+#define LBR_NOT_SUPP -1 /* unsupported filter */
+#define LBR_IGNORE 0
+
+#define LBR_ANY \
+ (LBR_JCC | LBR_REL_CALL | LBR_IND_CALL | LBR_RETURN | \
+ LBR_REL_JMP | LBR_IND_JMP | LBR_FAR)
+
+struct branch_entry {
+ union {
+ struct {
+ u64 ip:58;
+ u64 ip_sign_ext:5;
+ u64 mispredict:1;
+ } split;
+ u64 full;
+ } from;
+
+ union {
+ struct {
+ u64 ip:58;
+ u64 ip_sign_ext:3;
+ u64 reserved:1;
+ u64 spec:1;
+ u64 valid:1;
+ } split;
+ u64 full;
+ } to;
+};
+
+static __always_inline void amd_pmu_lbr_set_from(unsigned int idx, u64 val)
+{
+ wrmsrq(MSR_AMD_SAMP_BR_FROM + idx * 2, val);
+}
+
+static __always_inline void amd_pmu_lbr_set_to(unsigned int idx, u64 val)
+{
+ wrmsrq(MSR_AMD_SAMP_BR_FROM + idx * 2 + 1, val);
+}
+
+static __always_inline u64 amd_pmu_lbr_get_from(unsigned int idx)
+{
+ u64 val;
+
+ rdmsrq(MSR_AMD_SAMP_BR_FROM + idx * 2, val);
+
+ return val;
+}
+
+static __always_inline u64 amd_pmu_lbr_get_to(unsigned int idx)
+{
+ u64 val;
+
+ rdmsrq(MSR_AMD_SAMP_BR_FROM + idx * 2 + 1, val);
+
+ return val;
+}
+
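+/*
+ * Branch IPs are stored truncated to the CPU's virtual address width;
+ * sign-extend them back to canonical 64-bit addresses.
+ */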
+static __always_inline u64 sign_ext_branch_ip(u64 ip)
+{
+ u32 shift = 64 - boot_cpu_data.x86_virt_bits;
+
+ return (u64)(((s64)ip << shift) >> shift);
+}
+
+static void amd_pmu_lbr_filter(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int br_sel = cpuc->br_sel, offset, type, i, j;
+ bool compress = false;
+ bool fused_only = false;
+ u64 from, to;
+
+ /* If sampling all branches, there is nothing to filter */
+ if (((br_sel & X86_BR_ALL) == X86_BR_ALL) &&
+ ((br_sel & X86_BR_TYPE_SAVE) != X86_BR_TYPE_SAVE))
+ fused_only = true;
+
+ for (i = 0; i < cpuc->lbr_stack.nr; i++) {
+ from = cpuc->lbr_entries[i].from;
+ to = cpuc->lbr_entries[i].to;
+ type = branch_type_fused(from, to, 0, &offset);
+
+ /*
+		 * Adjust the branch-from address in case of instruction
+		 * fusion, where it points to an instruction preceding the
+		 * actual branch.
+ */
+ if (offset) {
+ cpuc->lbr_entries[i].from += offset;
+ if (fused_only)
+ continue;
+ }
+
+ /* If type does not correspond, then discard */
+ if (type == X86_BR_NONE || (br_sel & type) != type) {
+ cpuc->lbr_entries[i].from = 0; /* mark invalid */
+ compress = true;
+ }
+
+ if ((br_sel & X86_BR_TYPE_SAVE) == X86_BR_TYPE_SAVE)
+ cpuc->lbr_entries[i].type = common_branch_type(type);
+ }
+
+ if (!compress)
+ return;
+
+ /* Remove all invalid entries */
+ for (i = 0; i < cpuc->lbr_stack.nr; ) {
+ if (!cpuc->lbr_entries[i].from) {
+ j = i;
+ while (++j < cpuc->lbr_stack.nr)
+ cpuc->lbr_entries[j - 1] = cpuc->lbr_entries[j];
+ cpuc->lbr_stack.nr--;
+ if (!cpuc->lbr_entries[i].from)
+ continue;
+ }
+ i++;
+ }
+}
+
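+/* Indexed by (valid << 1) | spec; see the decoding in amd_pmu_lbr_read() */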
+static const int lbr_spec_map[PERF_BR_SPEC_MAX] = {
+ PERF_BR_SPEC_NA,
+ PERF_BR_SPEC_WRONG_PATH,
+ PERF_BR_NON_SPEC_CORRECT_PATH,
+ PERF_BR_SPEC_CORRECT_PATH,
+};
+
+void amd_pmu_lbr_read(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct perf_branch_entry *br = cpuc->lbr_entries;
+ struct branch_entry entry;
+ int out = 0, idx, i;
+
+ if (!cpuc->lbr_users)
+ return;
+
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ entry.from.full = amd_pmu_lbr_get_from(i);
+ entry.to.full = amd_pmu_lbr_get_to(i);
+
+ /*
+ * Check if a branch has been logged; if valid = 0, spec = 0
+ * then no branch was recorded; if reserved = 1 then an
+ * erroneous branch was recorded (see Erratum 1452)
+ */
+ if ((!entry.to.split.valid && !entry.to.split.spec) ||
+ entry.to.split.reserved)
+ continue;
+
+ perf_clear_branch_entry_bitfields(br + out);
+
+ br[out].from = sign_ext_branch_ip(entry.from.split.ip);
+ br[out].to = sign_ext_branch_ip(entry.to.split.ip);
+ br[out].mispred = entry.from.split.mispredict;
+ br[out].predicted = !br[out].mispred;
+
+ /*
+ * Set branch speculation information using the status of
+ * the valid and spec bits.
+ *
+ * When valid = 0, spec = 0, no branch was recorded and the
+ * entry is discarded as seen above.
+ *
+ * When valid = 0, spec = 1, the recorded branch was
+ * speculative but took the wrong path.
+ *
+ * When valid = 1, spec = 0, the recorded branch was
+ * non-speculative but took the correct path.
+ *
+ * When valid = 1, spec = 1, the recorded branch was
+ * speculative and took the correct path
+ */
+ idx = (entry.to.split.valid << 1) | entry.to.split.spec;
+ br[out].spec = lbr_spec_map[idx];
+ out++;
+ }
+
+ cpuc->lbr_stack.nr = out;
+
+ /*
+	 * Internal register renaming ensures that LBR From[0] and
+	 * LBR To[0] always represent the TOS
+ */
+ cpuc->lbr_stack.hw_idx = 0;
+
+ /* Perform further software filtering */
+ amd_pmu_lbr_filter();
+}
+
+static const int lbr_select_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
+ [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER,
+ [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL,
+ [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGNORE,
+
+ [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY,
+ [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL | LBR_FAR,
+ [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR,
+ [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL,
+ [PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT] = LBR_NOT_SUPP,
+ [PERF_SAMPLE_BRANCH_IN_TX_SHIFT] = LBR_NOT_SUPP,
+ [PERF_SAMPLE_BRANCH_NO_TX_SHIFT] = LBR_NOT_SUPP,
+ [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC,
+
+ [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_NOT_SUPP,
+ [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP,
+ [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL,
+
+ [PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT] = LBR_NOT_SUPP,
+ [PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT] = LBR_NOT_SUPP,
+};
+
+static int amd_pmu_lbr_setup_filter(struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg = &event->hw.branch_reg;
+ u64 br_type = event->attr.branch_sample_type;
+ u64 mask = 0, v;
+ int i;
+
+ /* No LBR support */
+ if (!x86_pmu.lbr_nr)
+ return -EOPNOTSUPP;
+
+ if (br_type & PERF_SAMPLE_BRANCH_USER)
+ mask |= X86_BR_USER;
+
+ if (br_type & PERF_SAMPLE_BRANCH_KERNEL)
+ mask |= X86_BR_KERNEL;
+
+ /* Ignore BRANCH_HV here */
+
+ if (br_type & PERF_SAMPLE_BRANCH_ANY)
+ mask |= X86_BR_ANY;
+
+ if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL)
+ mask |= X86_BR_ANY_CALL;
+
+ if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN)
+ mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET;
+
+ if (br_type & PERF_SAMPLE_BRANCH_IND_CALL)
+ mask |= X86_BR_IND_CALL;
+
+ if (br_type & PERF_SAMPLE_BRANCH_COND)
+ mask |= X86_BR_JCC;
+
+ if (br_type & PERF_SAMPLE_BRANCH_IND_JUMP)
+ mask |= X86_BR_IND_JMP;
+
+ if (br_type & PERF_SAMPLE_BRANCH_CALL)
+ mask |= X86_BR_CALL | X86_BR_ZERO_CALL;
+
+ if (br_type & PERF_SAMPLE_BRANCH_TYPE_SAVE)
+ mask |= X86_BR_TYPE_SAVE;
+
+ reg->reg = mask;
+ mask = 0;
+
+ for (i = 0; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) {
+ if (!(br_type & BIT_ULL(i)))
+ continue;
+
+ v = lbr_select_map[i];
+ if (v == LBR_NOT_SUPP)
+ return -EOPNOTSUPP;
+
+ if (v != LBR_IGNORE)
+ mask |= v;
+ }
+
+ /* Filter bits operate in suppress mode */
+ reg->config = mask ^ LBR_SELECT_MASK;
+
+ return 0;
+}
+
+int amd_pmu_lbr_hw_config(struct perf_event *event)
+{
+ int ret = 0;
+
+ ret = amd_pmu_lbr_setup_filter(event);
+ if (!ret)
+ event->attach_state |= PERF_ATTACH_SCHED_CB;
+
+ return ret;
+}
+
+void amd_pmu_lbr_reset(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int i;
+
+ if (!x86_pmu.lbr_nr)
+ return;
+
+ /* Reset all branch records individually */
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ amd_pmu_lbr_set_from(i, 0);
+ amd_pmu_lbr_set_to(i, 0);
+ }
+
+ cpuc->last_task_ctx = NULL;
+ cpuc->last_log_id = 0;
+ wrmsrq(MSR_AMD64_LBR_SELECT, 0);
+}
+
+void amd_pmu_lbr_add(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct hw_perf_event_extra *reg = &event->hw.branch_reg;
+
+ if (!x86_pmu.lbr_nr)
+ return;
+
+ if (has_branch_stack(event)) {
+ cpuc->lbr_select = 1;
+ cpuc->lbr_sel->config = reg->config;
+ cpuc->br_sel = reg->reg;
+ }
+
+ perf_sched_cb_inc(event->pmu);
+
+ if (!cpuc->lbr_users++ && !event->total_time_running)
+ amd_pmu_lbr_reset();
+}
+
+void amd_pmu_lbr_del(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ if (!x86_pmu.lbr_nr)
+ return;
+
+ if (has_branch_stack(event))
+ cpuc->lbr_select = 0;
+
+ cpuc->lbr_users--;
+ WARN_ON_ONCE(cpuc->lbr_users < 0);
+ perf_sched_cb_dec(event->pmu);
+}
+
+void amd_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx,
+ struct task_struct *task, bool sched_in)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ /*
+ * A context switch can flip the address space and LBR entries are
+ * not tagged with an identifier. Hence, branches cannot be resolved
+ * from the old address space and the LBR records should be wiped.
+ */
+ if (cpuc->lbr_users && sched_in)
+ amd_pmu_lbr_reset();
+}
+
+void amd_pmu_lbr_enable_all(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ u64 lbr_select, dbg_ctl, dbg_extn_cfg;
+
+ if (!cpuc->lbr_users || !x86_pmu.lbr_nr)
+ return;
+
+ /* Set hardware branch filter */
+ if (cpuc->lbr_select) {
+ lbr_select = cpuc->lbr_sel->config & LBR_SELECT_MASK;
+ wrmsrq(MSR_AMD64_LBR_SELECT, lbr_select);
+ }
+
+ if (cpu_feature_enabled(X86_FEATURE_AMD_LBR_PMC_FREEZE)) {
+ rdmsrq(MSR_IA32_DEBUGCTLMSR, dbg_ctl);
+ wrmsrq(MSR_IA32_DEBUGCTLMSR, dbg_ctl | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
+ }
+
+ rdmsrq(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg);
+ wrmsrq(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg | DBG_EXTN_CFG_LBRV2EN);
+}
+
+void amd_pmu_lbr_disable_all(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ if (!cpuc->lbr_users || !x86_pmu.lbr_nr)
+ return;
+
+ __amd_pmu_lbr_disable();
+}
+
+__init int amd_pmu_lbr_init(void)
+{
+ union cpuid_0x80000022_ebx ebx;
+
+ if (x86_pmu.version < 2 || !boot_cpu_has(X86_FEATURE_AMD_LBR_V2))
+ return -EOPNOTSUPP;
+
+ /* Set number of entries */
+ ebx.full = cpuid_ebx(EXT_PERFMON_DEBUG_FEATURES);
+ x86_pmu.lbr_nr = ebx.split.lbr_v2_stack_sz;
+
+ pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr);
+
+ return 0;
+}
diff --git a/arch/x86/events/amd/power.c b/arch/x86/events/amd/power.c
new file mode 100644
index 000000000000..dad42790cf7d
--- /dev/null
+++ b/arch/x86/events/amd/power.c
@@ -0,0 +1,306 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Performance events - AMD Processor Power Reporting Mechanism
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Huang Rui <ray.huang@amd.com>
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/perf_event.h>
+#include <asm/cpu_device_id.h>
+#include <asm/msr.h>
+#include "../perf_event.h"
+
+/* Event code: LSB 8 bits, passed in attr->config; any other bit is reserved. */
+#define AMD_POWER_EVENT_MASK 0xFFULL
+
+/*
+ * Accumulated power status counters.
+ */
+#define AMD_POWER_EVENTSEL_PKG 1
+
+/*
+ * The ratio of compute unit power accumulator sample period to the
+ * PTSC period.
+ */
+static unsigned int cpu_pwr_sample_ratio;
+
+/* Maximum accumulated power of a compute unit. */
+static u64 max_cu_acc_power;
+
+static struct pmu pmu_class;
+
+/*
+ * Accumulated power represents the sum of each compute unit's (CU) power
+ * consumption. On any core of each CU we read the total accumulated power from
+ * MSR_F15H_CU_PWR_ACCUMULATOR. cpu_mask represents CPU bit map of all cores
+ * which are picked to measure the power for the CUs they belong to.
+ */
+static cpumask_t cpu_mask;
+
+static void event_update(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u64 prev_pwr_acc, new_pwr_acc, prev_ptsc, new_ptsc;
+ u64 delta, tdelta;
+
+ prev_pwr_acc = hwc->pwr_acc;
+ prev_ptsc = hwc->ptsc;
+ rdmsrq(MSR_F15H_CU_PWR_ACCUMULATOR, new_pwr_acc);
+ rdmsrq(MSR_F15H_PTSC, new_ptsc);
+
+ /*
+	 * Calculate the CU power consumption over a time period; the unit of
+	 * the final value (delta) is micro-Watts. Then add it to the event count.
+ */
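+	/* The hardware accumulator wraps at max_cu_acc_power, so unwrap the delta. */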
+ if (new_pwr_acc < prev_pwr_acc) {
+ delta = max_cu_acc_power + new_pwr_acc;
+ delta -= prev_pwr_acc;
+ } else
+ delta = new_pwr_acc - prev_pwr_acc;
+
+ delta *= cpu_pwr_sample_ratio * 1000;
+ tdelta = new_ptsc - prev_ptsc;
+
+ do_div(delta, tdelta);
+ local64_add(delta, &event->count);
+}
+
+static void __pmu_event_start(struct perf_event *event)
+{
+ if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+ return;
+
+ event->hw.state = 0;
+
+ rdmsrq(MSR_F15H_PTSC, event->hw.ptsc);
+ rdmsrq(MSR_F15H_CU_PWR_ACCUMULATOR, event->hw.pwr_acc);
+}
+
+static void pmu_event_start(struct perf_event *event, int mode)
+{
+ __pmu_event_start(event);
+}
+
+static void pmu_event_stop(struct perf_event *event, int mode)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ /* Mark event as deactivated and stopped. */
+ if (!(hwc->state & PERF_HES_STOPPED))
+ hwc->state |= PERF_HES_STOPPED;
+
+ /* Check if software counter update is necessary. */
+ if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+ /*
+ * Drain the remaining delta count out of an event
+ * that we are disabling:
+ */
+ event_update(event);
+ hwc->state |= PERF_HES_UPTODATE;
+ }
+}
+
+static int pmu_event_add(struct perf_event *event, int mode)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+ if (mode & PERF_EF_START)
+ __pmu_event_start(event);
+
+ return 0;
+}
+
+static void pmu_event_del(struct perf_event *event, int flags)
+{
+ pmu_event_stop(event, PERF_EF_UPDATE);
+}
+
+static int pmu_event_init(struct perf_event *event)
+{
+ u64 cfg = event->attr.config & AMD_POWER_EVENT_MASK;
+
+ /* Only look at AMD power events. */
+ if (event->attr.type != pmu_class.type)
+ return -ENOENT;
+
+ /* Unsupported modes and filters. */
+ if (event->attr.sample_period)
+ return -EINVAL;
+
+ if (cfg != AMD_POWER_EVENTSEL_PKG)
+ return -EINVAL;
+
+ return 0;
+}
+
+static void pmu_event_read(struct perf_event *event)
+{
+ event_update(event);
+}
+
+static ssize_t
+get_attr_cpumask(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpumap_print_to_pagebuf(true, buf, &cpu_mask);
+}
+
+static DEVICE_ATTR(cpumask, S_IRUGO, get_attr_cpumask, NULL);
+
+static struct attribute *pmu_attrs[] = {
+ &dev_attr_cpumask.attr,
+ NULL,
+};
+
+static struct attribute_group pmu_attr_group = {
+ .attrs = pmu_attrs,
+};
+
+/*
+ * Currently only reporting the power of each
+ * processor/package is supported.
+ */
+EVENT_ATTR_STR(power-pkg, power_pkg, "event=0x01");
+
+EVENT_ATTR_STR(power-pkg.unit, power_pkg_unit, "mWatts");
+
+/* Convert the count from micro-Watts to milli-Watts. */
+EVENT_ATTR_STR(power-pkg.scale, power_pkg_scale, "1.000000e-3");
+
+static struct attribute *events_attr[] = {
+ EVENT_PTR(power_pkg),
+ EVENT_PTR(power_pkg_unit),
+ EVENT_PTR(power_pkg_scale),
+ NULL,
+};
+
+static struct attribute_group pmu_events_group = {
+ .name = "events",
+ .attrs = events_attr,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-7");
+
+static struct attribute *formats_attr[] = {
+ &format_attr_event.attr,
+ NULL,
+};
+
+static struct attribute_group pmu_format_group = {
+ .name = "format",
+ .attrs = formats_attr,
+};
+
+static const struct attribute_group *attr_groups[] = {
+ &pmu_attr_group,
+ &pmu_format_group,
+ &pmu_events_group,
+ NULL,
+};
+
+static struct pmu pmu_class = {
+ .attr_groups = attr_groups,
+ /* system-wide only */
+ .task_ctx_nr = perf_invalid_context,
+ .event_init = pmu_event_init,
+ .add = pmu_event_add,
+ .del = pmu_event_del,
+ .start = pmu_event_start,
+ .stop = pmu_event_stop,
+ .read = pmu_event_read,
+ .capabilities = PERF_PMU_CAP_NO_EXCLUDE,
+ .module = THIS_MODULE,
+};
+
+static int power_cpu_exit(unsigned int cpu)
+{
+ int target;
+
+ if (!cpumask_test_and_clear_cpu(cpu, &cpu_mask))
+ return 0;
+
+ /*
+	 * Find a new CPU on the same compute unit, if this CPU was set in
+	 * cpu_mask and other CPUs on the compute unit remain online. Then
+	 * migrate the events and context to the new CPU.
+ */
+ target = cpumask_any_but(topology_sibling_cpumask(cpu), cpu);
+ if (target < nr_cpumask_bits) {
+ cpumask_set_cpu(target, &cpu_mask);
+ perf_pmu_migrate_context(&pmu_class, cpu, target);
+ }
+ return 0;
+}
+
+static int power_cpu_init(unsigned int cpu)
+{
+ int target;
+
+ /*
+	 * 1) If any CPU in the same compute unit is set in cpu_mask, do
+	 *    nothing.
+	 * 2) If no CPU in the same compute unit is set in cpu_mask,
+	 *    set the current online CPU.
+	 *
+	 * Note: if there is a CPU other than the new one already in the
+	 * sibling mask, then it is also in cpu_mask.
+ */
+ target = cpumask_any_but(topology_sibling_cpumask(cpu), cpu);
+ if (target >= nr_cpumask_bits)
+ cpumask_set_cpu(cpu, &cpu_mask);
+ return 0;
+}
+
+static const struct x86_cpu_id cpu_match[] = {
+ X86_MATCH_VENDOR_FAM(AMD, 0x15, NULL),
+ {},
+};
+
+static int __init amd_power_pmu_init(void)
+{
+ int ret;
+
+ if (!x86_match_cpu(cpu_match))
+ return -ENODEV;
+
+ if (!boot_cpu_has(X86_FEATURE_ACC_POWER))
+ return -ENODEV;
+
+ cpu_pwr_sample_ratio = cpuid_ecx(0x80000007);
+
+ if (rdmsrq_safe(MSR_F15H_CU_MAX_PWR_ACCUMULATOR, &max_cu_acc_power)) {
+ pr_err("Failed to read max compute unit power accumulator MSR\n");
+ return -ENODEV;
+ }
+
+ cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_POWER_ONLINE,
+ "perf/x86/amd/power:online",
+ power_cpu_init, power_cpu_exit);
+
+ ret = perf_pmu_register(&pmu_class, "power", -1);
+ if (WARN_ON(ret)) {
+ pr_warn("AMD Power PMU registration failed\n");
+ return ret;
+ }
+
+ pr_info("AMD Power PMU detected\n");
+ return ret;
+}
+module_init(amd_power_pmu_init);
+
+static void __exit amd_power_pmu_exit(void)
+{
+ cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_AMD_POWER_ONLINE);
+ perf_pmu_unregister(&pmu_class);
+}
+module_exit(amd_power_pmu_exit);
+
+MODULE_AUTHOR("Huang Rui <ray.huang@amd.com>");
+MODULE_DESCRIPTION("AMD Processor Power Reporting Mechanism");
+MODULE_LICENSE("GPL v2");
diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
new file mode 100644
index 000000000000..e8b6af199c73
--- /dev/null
+++ b/arch/x86/events/amd/uncore.c
@@ -0,0 +1,1227 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2013 Advanced Micro Devices, Inc.
+ *
+ * Author: Jacob Shin <jacob.shin@amd.com>
+ */
+
+#include <linux/perf_event.h>
+#include <linux/percpu.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/cpufeature.h>
+#include <linux/smp.h>
+
+#include <asm/perf_event.h>
+#include <asm/msr.h>
+
+#define NUM_COUNTERS_NB 4
+#define NUM_COUNTERS_L2 4
+#define NUM_COUNTERS_L3 6
+#define NUM_COUNTERS_MAX 64
+
+#define RDPMC_BASE_NB 6
+#define RDPMC_BASE_LLC 10
+
+#define COUNTER_SHIFT 16
+#define UNCORE_NAME_LEN 16
+#define UNCORE_GROUP_MAX 256
+
+#undef pr_fmt
+#define pr_fmt(fmt) "amd_uncore: " fmt
+
+static int pmu_version;
+
+struct amd_uncore_ctx {
+ int refcnt;
+ int cpu;
+ struct perf_event **events;
+ unsigned long active_mask[BITS_TO_LONGS(NUM_COUNTERS_MAX)];
+ int nr_active;
+ struct hrtimer hrtimer;
+ u64 hrtimer_duration;
+};
+
+struct amd_uncore_pmu {
+ char name[UNCORE_NAME_LEN];
+ int num_counters;
+ int rdpmc_base;
+ u32 msr_base;
+ int group;
+ cpumask_t active_mask;
+ struct pmu pmu;
+ struct amd_uncore_ctx * __percpu *ctx;
+};
+
+enum {
+ UNCORE_TYPE_DF,
+ UNCORE_TYPE_L3,
+ UNCORE_TYPE_UMC,
+
+ UNCORE_TYPE_MAX
+};
+
+union amd_uncore_info {
+ struct {
+ u64 aux_data:32; /* auxiliary data */
+ u64 num_pmcs:8; /* number of counters */
+ u64 gid:8; /* group id */
+ u64 cid:8; /* context id */
+ } split;
+ u64 full;
+};
+
+struct amd_uncore {
+ union amd_uncore_info __percpu *info;
+ struct amd_uncore_pmu *pmus;
+ unsigned int num_pmus;
+ bool init_done;
+ void (*scan)(struct amd_uncore *uncore, unsigned int cpu);
+ int (*init)(struct amd_uncore *uncore, unsigned int cpu);
+ void (*move)(struct amd_uncore *uncore, unsigned int cpu);
+ void (*free)(struct amd_uncore *uncore, unsigned int cpu);
+};
+
+static struct amd_uncore uncores[UNCORE_TYPE_MAX];
+
+/* Interval for hrtimer, defaults to 60000 milliseconds */
+static unsigned int update_interval = 60 * MSEC_PER_SEC;
+module_param(update_interval, uint, 0444);
+
+static struct amd_uncore_pmu *event_to_amd_uncore_pmu(struct perf_event *event)
+{
+ return container_of(event->pmu, struct amd_uncore_pmu, pmu);
+}
+
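+/*
+ * Uncore counters generate no overflow interrupts, so poll the active
+ * events periodically to fold counts in before the 48-bit counters can
+ * wrap undetected.
+ */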
+static enum hrtimer_restart amd_uncore_hrtimer(struct hrtimer *hrtimer)
+{
+ struct amd_uncore_ctx *ctx;
+ struct perf_event *event;
+ int bit;
+
+ ctx = container_of(hrtimer, struct amd_uncore_ctx, hrtimer);
+
+ if (!ctx->nr_active || ctx->cpu != smp_processor_id())
+ return HRTIMER_NORESTART;
+
+ for_each_set_bit(bit, ctx->active_mask, NUM_COUNTERS_MAX) {
+ event = ctx->events[bit];
+ event->pmu->read(event);
+ }
+
+ hrtimer_forward_now(hrtimer, ns_to_ktime(ctx->hrtimer_duration));
+ return HRTIMER_RESTART;
+}
+
+static void amd_uncore_start_hrtimer(struct amd_uncore_ctx *ctx)
+{
+ hrtimer_start(&ctx->hrtimer, ns_to_ktime(ctx->hrtimer_duration),
+ HRTIMER_MODE_REL_PINNED_HARD);
+}
+
+static void amd_uncore_cancel_hrtimer(struct amd_uncore_ctx *ctx)
+{
+ hrtimer_cancel(&ctx->hrtimer);
+}
+
+static void amd_uncore_init_hrtimer(struct amd_uncore_ctx *ctx)
+{
+ hrtimer_setup(&ctx->hrtimer, amd_uncore_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
+}
+
+static void amd_uncore_read(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u64 prev, new;
+ s64 delta;
+
+ /*
+ * since we do not enable counter overflow interrupts,
+ * we do not have to worry about prev_count changing on us
+ */
+
+ prev = local64_read(&hwc->prev_count);
+
+ /*
+ * Some uncore PMUs do not have RDPMC assignments. In such cases,
+ * read counts directly from the corresponding PERF_CTR.
+ */
+ if (hwc->event_base_rdpmc < 0)
+ rdmsrq(hwc->event_base, new);
+ else
+ new = rdpmc(hwc->event_base_rdpmc);
+
+ local64_set(&hwc->prev_count, new);
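+	/* Shifting up and back down sign-extends the 48-bit count, so the delta survives a counter wrap */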
+ delta = (new << COUNTER_SHIFT) - (prev << COUNTER_SHIFT);
+ delta >>= COUNTER_SHIFT;
+ local64_add(delta, &event->count);
+}
+
+static void amd_uncore_start(struct perf_event *event, int flags)
+{
+ struct amd_uncore_pmu *pmu = event_to_amd_uncore_pmu(event);
+ struct amd_uncore_ctx *ctx = *per_cpu_ptr(pmu->ctx, event->cpu);
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (!ctx->nr_active++)
+ amd_uncore_start_hrtimer(ctx);
+
+ if (flags & PERF_EF_RELOAD)
+ wrmsrq(hwc->event_base, (u64)local64_read(&hwc->prev_count));
+
+ hwc->state = 0;
+ __set_bit(hwc->idx, ctx->active_mask);
+ wrmsrq(hwc->config_base, (hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE));
+ perf_event_update_userpage(event);
+}
+
+static void amd_uncore_stop(struct perf_event *event, int flags)
+{
+ struct amd_uncore_pmu *pmu = event_to_amd_uncore_pmu(event);
+ struct amd_uncore_ctx *ctx = *per_cpu_ptr(pmu->ctx, event->cpu);
+ struct hw_perf_event *hwc = &event->hw;
+
+ wrmsrq(hwc->config_base, hwc->config);
+ hwc->state |= PERF_HES_STOPPED;
+
+ if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+ event->pmu->read(event);
+ hwc->state |= PERF_HES_UPTODATE;
+ }
+
+ if (!--ctx->nr_active)
+ amd_uncore_cancel_hrtimer(ctx);
+
+ __clear_bit(hwc->idx, ctx->active_mask);
+}
+
+static int amd_uncore_add(struct perf_event *event, int flags)
+{
+ int i;
+ struct amd_uncore_pmu *pmu = event_to_amd_uncore_pmu(event);
+ struct amd_uncore_ctx *ctx = *per_cpu_ptr(pmu->ctx, event->cpu);
+ struct hw_perf_event *hwc = &event->hw;
+
+ /* are we already assigned? */
+ if (hwc->idx != -1 && ctx->events[hwc->idx] == event)
+ goto out;
+
+ for (i = 0; i < pmu->num_counters; i++) {
+ if (ctx->events[i] == event) {
+ hwc->idx = i;
+ goto out;
+ }
+ }
+
+ /* if not, take the first available counter */
+ hwc->idx = -1;
+ for (i = 0; i < pmu->num_counters; i++) {
+ struct perf_event *tmp = NULL;
+
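+		/* Atomically claim the counter slot if it is still free */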
+ if (try_cmpxchg(&ctx->events[i], &tmp, event)) {
+ hwc->idx = i;
+ break;
+ }
+ }
+
+out:
+ if (hwc->idx == -1)
+ return -EBUSY;
+
+ hwc->config_base = pmu->msr_base + (2 * hwc->idx);
+ hwc->event_base = pmu->msr_base + 1 + (2 * hwc->idx);
+ hwc->event_base_rdpmc = pmu->rdpmc_base + hwc->idx;
+ hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+ if (pmu->rdpmc_base < 0)
+ hwc->event_base_rdpmc = -1;
+
+ if (flags & PERF_EF_START)
+ event->pmu->start(event, PERF_EF_RELOAD);
+
+ return 0;
+}
+
+static void amd_uncore_del(struct perf_event *event, int flags)
+{
+ int i;
+ struct amd_uncore_pmu *pmu = event_to_amd_uncore_pmu(event);
+ struct amd_uncore_ctx *ctx = *per_cpu_ptr(pmu->ctx, event->cpu);
+ struct hw_perf_event *hwc = &event->hw;
+
+ event->pmu->stop(event, PERF_EF_UPDATE);
+
+ for (i = 0; i < pmu->num_counters; i++) {
+ struct perf_event *tmp = event;
+
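+		/* Release the slot only if this event still owns it */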
+ if (try_cmpxchg(&ctx->events[i], &tmp, NULL))
+ break;
+ }
+
+ hwc->idx = -1;
+}
+
+static int amd_uncore_event_init(struct perf_event *event)
+{
+ struct amd_uncore_pmu *pmu;
+ struct amd_uncore_ctx *ctx;
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (event->attr.type != event->pmu->type)
+ return -ENOENT;
+
+ if (event->cpu < 0)
+ return -EINVAL;
+
+ pmu = event_to_amd_uncore_pmu(event);
+ ctx = *per_cpu_ptr(pmu->ctx, event->cpu);
+ if (!ctx)
+ return -ENODEV;
+
+ /*
+ * NB and Last level cache counters (MSRs) are shared across all cores
+	 * that share the same NB / Last level cache. On family 16h and below,
+	 * interrupts can be directed to a single target core; however, event
+	 * counts generated by processes running on other cores cannot be masked
+ * out. So we do not support sampling and per-thread events via
+ * CAP_NO_INTERRUPT, and we do not enable counter overflow interrupts:
+ */
+ hwc->config = event->attr.config;
+ hwc->idx = -1;
+
+ /*
+	 * Since requests can come in on any of the shared cores, we remap
+	 * to a single common cpu.
+ */
+ event->cpu = ctx->cpu;
+
+ return 0;
+}
+
+static umode_t
+amd_f17h_uncore_is_visible(struct kobject *kobj, struct attribute *attr, int i)
+{
+ return boot_cpu_data.x86 >= 0x17 && boot_cpu_data.x86 < 0x19 ?
+ attr->mode : 0;
+}
+
+static umode_t
+amd_f19h_uncore_is_visible(struct kobject *kobj, struct attribute *attr, int i)
+{
+ return boot_cpu_data.x86 >= 0x19 ? attr->mode : 0;
+}
+
+static ssize_t amd_uncore_attr_show_cpumask(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct pmu *ptr = dev_get_drvdata(dev);
+ struct amd_uncore_pmu *pmu = container_of(ptr, struct amd_uncore_pmu, pmu);
+
+ return cpumap_print_to_pagebuf(true, buf, &pmu->active_mask);
+}
+static DEVICE_ATTR(cpumask, S_IRUGO, amd_uncore_attr_show_cpumask, NULL);
+
+static struct attribute *amd_uncore_attrs[] = {
+ &dev_attr_cpumask.attr,
+ NULL,
+};
+
+static struct attribute_group amd_uncore_attr_group = {
+ .attrs = amd_uncore_attrs,
+};
+
+#define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format) \
+static ssize_t __uncore_##_var##_show(struct device *dev, \
+ struct device_attribute *attr, \
+ char *page) \
+{ \
+ BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \
+ return sprintf(page, _format "\n"); \
+} \
+static struct device_attribute format_attr_##_var = \
+ __ATTR(_name, 0444, __uncore_##_var##_show, NULL)
+
+DEFINE_UNCORE_FORMAT_ATTR(event12, event, "config:0-7,32-35");
+DEFINE_UNCORE_FORMAT_ATTR(event14, event, "config:0-7,32-35,59-60"); /* F17h+ DF */
+DEFINE_UNCORE_FORMAT_ATTR(event14v2, event, "config:0-7,32-37"); /* PerfMonV2 DF */
+DEFINE_UNCORE_FORMAT_ATTR(event8, event, "config:0-7"); /* F17h+ L3, PerfMonV2 UMC */
+DEFINE_UNCORE_FORMAT_ATTR(umask8, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(umask12, umask, "config:8-15,24-27"); /* PerfMonV2 DF */
+DEFINE_UNCORE_FORMAT_ATTR(coreid, coreid, "config:42-44"); /* F19h L3 */
+DEFINE_UNCORE_FORMAT_ATTR(slicemask, slicemask, "config:48-51"); /* F17h L3 */
+DEFINE_UNCORE_FORMAT_ATTR(threadmask8, threadmask, "config:56-63"); /* F17h L3 */
+DEFINE_UNCORE_FORMAT_ATTR(threadmask2, threadmask, "config:56-57"); /* F19h L3 */
+DEFINE_UNCORE_FORMAT_ATTR(enallslices, enallslices, "config:46"); /* F19h L3 */
+DEFINE_UNCORE_FORMAT_ATTR(enallcores, enallcores, "config:47"); /* F19h L3 */
+DEFINE_UNCORE_FORMAT_ATTR(sliceid, sliceid, "config:48-50"); /* F19h L3 */
+DEFINE_UNCORE_FORMAT_ATTR(rdwrmask, rdwrmask, "config:8-9"); /* PerfMonV2 UMC */
+
+/* Common DF and NB attributes */
+static struct attribute *amd_uncore_df_format_attr[] = {
+ &format_attr_event12.attr, /* event */
+ &format_attr_umask8.attr, /* umask */
+ NULL,
+};
+
+/* Common L2 and L3 attributes */
+static struct attribute *amd_uncore_l3_format_attr[] = {
+ &format_attr_event12.attr, /* event */
+ &format_attr_umask8.attr, /* umask */
+ NULL, /* threadmask */
+ NULL,
+};
+
+/* Common UMC attributes */
+static struct attribute *amd_uncore_umc_format_attr[] = {
+ &format_attr_event8.attr, /* event */
+ &format_attr_rdwrmask.attr, /* rdwrmask */
+ NULL,
+};
+
+/* F17h unique L3 attributes */
+static struct attribute *amd_f17h_uncore_l3_format_attr[] = {
+ &format_attr_slicemask.attr, /* slicemask */
+ NULL,
+};
+
+/* F19h unique L3 attributes */
+static struct attribute *amd_f19h_uncore_l3_format_attr[] = {
+ &format_attr_coreid.attr, /* coreid */
+ &format_attr_enallslices.attr, /* enallslices */
+ &format_attr_enallcores.attr, /* enallcores */
+ &format_attr_sliceid.attr, /* sliceid */
+ NULL,
+};
+
+static struct attribute_group amd_uncore_df_format_group = {
+ .name = "format",
+ .attrs = amd_uncore_df_format_attr,
+};
+
+static struct attribute_group amd_uncore_l3_format_group = {
+ .name = "format",
+ .attrs = amd_uncore_l3_format_attr,
+};
+
+static struct attribute_group amd_f17h_uncore_l3_format_group = {
+ .name = "format",
+ .attrs = amd_f17h_uncore_l3_format_attr,
+ .is_visible = amd_f17h_uncore_is_visible,
+};
+
+static struct attribute_group amd_f19h_uncore_l3_format_group = {
+ .name = "format",
+ .attrs = amd_f19h_uncore_l3_format_attr,
+ .is_visible = amd_f19h_uncore_is_visible,
+};
+
+static struct attribute_group amd_uncore_umc_format_group = {
+ .name = "format",
+ .attrs = amd_uncore_umc_format_attr,
+};
+
+static const struct attribute_group *amd_uncore_df_attr_groups[] = {
+ &amd_uncore_attr_group,
+ &amd_uncore_df_format_group,
+ NULL,
+};
+
+static const struct attribute_group *amd_uncore_l3_attr_groups[] = {
+ &amd_uncore_attr_group,
+ &amd_uncore_l3_format_group,
+ NULL,
+};
+
+static const struct attribute_group *amd_uncore_l3_attr_update[] = {
+ &amd_f17h_uncore_l3_format_group,
+ &amd_f19h_uncore_l3_format_group,
+ NULL,
+};
+
+static const struct attribute_group *amd_uncore_umc_attr_groups[] = {
+ &amd_uncore_attr_group,
+ &amd_uncore_umc_format_group,
+ NULL,
+};
+
+static __always_inline
+int amd_uncore_ctx_cid(struct amd_uncore *uncore, unsigned int cpu)
+{
+ union amd_uncore_info *info = per_cpu_ptr(uncore->info, cpu);
+ return info->split.cid;
+}
+
+static __always_inline
+int amd_uncore_ctx_gid(struct amd_uncore *uncore, unsigned int cpu)
+{
+ union amd_uncore_info *info = per_cpu_ptr(uncore->info, cpu);
+ return info->split.gid;
+}
+
+static __always_inline
+int amd_uncore_ctx_num_pmcs(struct amd_uncore *uncore, unsigned int cpu)
+{
+ union amd_uncore_info *info = per_cpu_ptr(uncore->info, cpu);
+ return info->split.num_pmcs;
+}
+
+static void amd_uncore_ctx_free(struct amd_uncore *uncore, unsigned int cpu)
+{
+ struct amd_uncore_pmu *pmu;
+ struct amd_uncore_ctx *ctx;
+ int i;
+
+ if (!uncore->init_done)
+ return;
+
+ for (i = 0; i < uncore->num_pmus; i++) {
+ pmu = &uncore->pmus[i];
+ ctx = *per_cpu_ptr(pmu->ctx, cpu);
+ if (!ctx)
+ continue;
+
+ if (cpu == ctx->cpu)
+ cpumask_clear_cpu(cpu, &pmu->active_mask);
+
+ if (!--ctx->refcnt) {
+ kfree(ctx->events);
+ kfree(ctx);
+ }
+
+ *per_cpu_ptr(pmu->ctx, cpu) = NULL;
+ }
+}
+
+static int amd_uncore_ctx_init(struct amd_uncore *uncore, unsigned int cpu)
+{
+ struct amd_uncore_ctx *curr, *prev;
+ struct amd_uncore_pmu *pmu;
+ int node, cid, gid, i, j;
+
+ if (!uncore->init_done || !uncore->num_pmus)
+ return 0;
+
+ cid = amd_uncore_ctx_cid(uncore, cpu);
+ gid = amd_uncore_ctx_gid(uncore, cpu);
+
+ for (i = 0; i < uncore->num_pmus; i++) {
+ pmu = &uncore->pmus[i];
+ *per_cpu_ptr(pmu->ctx, cpu) = NULL;
+ curr = NULL;
+
+ /* Check for group exclusivity */
+ if (gid != pmu->group)
+ continue;
+
+ /* Find a sibling context */
+ for_each_online_cpu(j) {
+ if (cpu == j)
+ continue;
+
+ prev = *per_cpu_ptr(pmu->ctx, j);
+ if (!prev)
+ continue;
+
+ if (cid == amd_uncore_ctx_cid(uncore, j)) {
+ curr = prev;
+ break;
+ }
+ }
+
+ /* Allocate context if sibling does not exist */
+ if (!curr) {
+ node = cpu_to_node(cpu);
+ curr = kzalloc_node(sizeof(*curr), GFP_KERNEL, node);
+ if (!curr)
+ goto fail;
+
+ curr->cpu = cpu;
+ curr->events = kzalloc_node(sizeof(*curr->events) *
+ pmu->num_counters,
+ GFP_KERNEL, node);
+ if (!curr->events) {
+ kfree(curr);
+ goto fail;
+ }
+
+ amd_uncore_init_hrtimer(curr);
+ curr->hrtimer_duration = (u64)update_interval * NSEC_PER_MSEC;
+
+ cpumask_set_cpu(cpu, &pmu->active_mask);
+ }
+
+ curr->refcnt++;
+ *per_cpu_ptr(pmu->ctx, cpu) = curr;
+ }
+
+ return 0;
+
+fail:
+ amd_uncore_ctx_free(uncore, cpu);
+
+ return -ENOMEM;
+}
+
+static void amd_uncore_ctx_move(struct amd_uncore *uncore, unsigned int cpu)
+{
+ struct amd_uncore_ctx *curr, *next;
+ struct amd_uncore_pmu *pmu;
+ int i, j;
+
+ if (!uncore->init_done)
+ return;
+
+ for (i = 0; i < uncore->num_pmus; i++) {
+ pmu = &uncore->pmus[i];
+ curr = *per_cpu_ptr(pmu->ctx, cpu);
+ if (!curr)
+ continue;
+
+ /* Migrate to a shared sibling if possible */
+ for_each_online_cpu(j) {
+ next = *per_cpu_ptr(pmu->ctx, j);
+ if (!next || cpu == j)
+ continue;
+
+ if (curr == next) {
+ perf_pmu_migrate_context(&pmu->pmu, cpu, j);
+ cpumask_clear_cpu(cpu, &pmu->active_mask);
+ cpumask_set_cpu(j, &pmu->active_mask);
+ next->cpu = j;
+ break;
+ }
+ }
+ }
+}
+
+static int amd_uncore_cpu_starting(unsigned int cpu)
+{
+ struct amd_uncore *uncore;
+ int i;
+
+ for (i = 0; i < UNCORE_TYPE_MAX; i++) {
+ uncore = &uncores[i];
+ uncore->scan(uncore, cpu);
+ }
+
+ return 0;
+}
+
+static int amd_uncore_cpu_online(unsigned int cpu)
+{
+ struct amd_uncore *uncore;
+ int i;
+
+ for (i = 0; i < UNCORE_TYPE_MAX; i++) {
+ uncore = &uncores[i];
+ if (uncore->init(uncore, cpu))
+ break;
+ }
+
+ return 0;
+}
+
+static int amd_uncore_cpu_down_prepare(unsigned int cpu)
+{
+ struct amd_uncore *uncore;
+ int i;
+
+ for (i = 0; i < UNCORE_TYPE_MAX; i++) {
+ uncore = &uncores[i];
+ uncore->move(uncore, cpu);
+ }
+
+ return 0;
+}
+
+static int amd_uncore_cpu_dead(unsigned int cpu)
+{
+ struct amd_uncore *uncore;
+ int i;
+
+ for (i = 0; i < UNCORE_TYPE_MAX; i++) {
+ uncore = &uncores[i];
+ uncore->free(uncore, cpu);
+ }
+
+ return 0;
+}
+
+static int amd_uncore_df_event_init(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ int ret = amd_uncore_event_init(event);
+
+ if (ret || pmu_version < 2)
+ return ret;
+
+ hwc->config = event->attr.config &
+ (pmu_version >= 2 ? AMD64_PERFMON_V2_RAW_EVENT_MASK_NB :
+ AMD64_RAW_EVENT_MASK_NB);
+
+ return 0;
+}
+
+static int amd_uncore_df_add(struct perf_event *event, int flags)
+{
+ int ret = amd_uncore_add(event, flags & ~PERF_EF_START);
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (ret)
+ return ret;
+
+ /*
+ * The first four DF counters are accessible via RDPMC index 6 to 9
+ * followed by the L3 counters from index 10 to 15. For processors
+ * with more than four DF counters, the DF RDPMC assignments become
+ * discontiguous as the additional counters are accessible starting
+ * from index 16.
+ */
+ if (hwc->idx >= NUM_COUNTERS_NB)
+ hwc->event_base_rdpmc += NUM_COUNTERS_L3;
+
+ /* Delayed start after rdpmc base update */
+ if (flags & PERF_EF_START)
+ amd_uncore_start(event, PERF_EF_RELOAD);
+
+ return 0;
+}
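
A small sketch of the RDPMC index layout described in the comment above; the names and values are assumptions mirroring RDPMC_BASE_NB, NUM_COUNTERS_NB and NUM_COUNTERS_L3 in this driver, not an exported interface:

#define EX_RDPMC_BASE_NB	6	/* first DF counter's RDPMC index  */
#define EX_NUM_COUNTERS_NB	4	/* DF counters mapped contiguously */
#define EX_NUM_COUNTERS_L3	6	/* L3 counters occupy 10..15       */

/* DF counter 0..3 -> RDPMC 6..9, DF counter 4.. -> RDPMC 16.., because the
 * L3 block sits in between. */
static int ex_df_rdpmc_index(int idx)
{
	int rdpmc = EX_RDPMC_BASE_NB + idx;

	if (idx >= EX_NUM_COUNTERS_NB)
		rdpmc += EX_NUM_COUNTERS_L3;
	return rdpmc;
}
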
+
+static
+void amd_uncore_df_ctx_scan(struct amd_uncore *uncore, unsigned int cpu)
+{
+ union cpuid_0x80000022_ebx ebx;
+ union amd_uncore_info info;
+
+ if (!boot_cpu_has(X86_FEATURE_PERFCTR_NB))
+ return;
+
+ info.split.aux_data = 0;
+ info.split.num_pmcs = NUM_COUNTERS_NB;
+ info.split.gid = 0;
+ info.split.cid = topology_logical_package_id(cpu);
+
+ if (pmu_version >= 2) {
+ ebx.full = cpuid_ebx(EXT_PERFMON_DEBUG_FEATURES);
+ info.split.num_pmcs = ebx.split.num_df_pmc;
+ }
+
+ *per_cpu_ptr(uncore->info, cpu) = info;
+}
+
+static
+int amd_uncore_df_ctx_init(struct amd_uncore *uncore, unsigned int cpu)
+{
+ struct attribute **df_attr = amd_uncore_df_format_attr;
+ struct amd_uncore_pmu *pmu;
+ int num_counters;
+
+ /* Run just once */
+ if (uncore->init_done)
+ return amd_uncore_ctx_init(uncore, cpu);
+
+ num_counters = amd_uncore_ctx_num_pmcs(uncore, cpu);
+ if (!num_counters)
+ goto done;
+
+ /* No grouping, single instance for a system */
+ uncore->pmus = kzalloc(sizeof(*uncore->pmus), GFP_KERNEL);
+ if (!uncore->pmus)
+ goto done;
+
+ /*
+ * For Family 17h and above, the Northbridge counters are repurposed
+ * as Data Fabric counters. The PMUs are exported based on family as
+ * either NB or DF.
+ */
+ pmu = &uncore->pmus[0];
+ strscpy(pmu->name, boot_cpu_data.x86 >= 0x17 ? "amd_df" : "amd_nb",
+ sizeof(pmu->name));
+ pmu->num_counters = num_counters;
+ pmu->msr_base = MSR_F15H_NB_PERF_CTL;
+ pmu->rdpmc_base = RDPMC_BASE_NB;
+ pmu->group = amd_uncore_ctx_gid(uncore, cpu);
+
+ if (pmu_version >= 2) {
+ *df_attr++ = &format_attr_event14v2.attr;
+ *df_attr++ = &format_attr_umask12.attr;
+ } else if (boot_cpu_data.x86 >= 0x17) {
+ *df_attr = &format_attr_event14.attr;
+ }
+
+ pmu->ctx = alloc_percpu(struct amd_uncore_ctx *);
+ if (!pmu->ctx)
+ goto done;
+
+ pmu->pmu = (struct pmu) {
+ .task_ctx_nr = perf_invalid_context,
+ .attr_groups = amd_uncore_df_attr_groups,
+ .name = pmu->name,
+ .event_init = amd_uncore_df_event_init,
+ .add = amd_uncore_df_add,
+ .del = amd_uncore_del,
+ .start = amd_uncore_start,
+ .stop = amd_uncore_stop,
+ .read = amd_uncore_read,
+ .capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT,
+ .module = THIS_MODULE,
+ };
+
+ if (perf_pmu_register(&pmu->pmu, pmu->pmu.name, -1)) {
+ free_percpu(pmu->ctx);
+ pmu->ctx = NULL;
+ goto done;
+ }
+
+ pr_info("%d %s%s counters detected\n", pmu->num_counters,
+ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ? "HYGON " : "",
+ pmu->pmu.name);
+
+ uncore->num_pmus = 1;
+
+done:
+ uncore->init_done = true;
+
+ return amd_uncore_ctx_init(uncore, cpu);
+}
+
+static int amd_uncore_l3_event_init(struct perf_event *event)
+{
+ int ret = amd_uncore_event_init(event);
+ struct hw_perf_event *hwc = &event->hw;
+ u64 config = event->attr.config;
+ u64 mask;
+
+ hwc->config = config & AMD64_RAW_EVENT_MASK_NB;
+
+ /*
+ * SliceMask and ThreadMask need to be set for certain L3 events.
+ * For other events, the two fields do not affect the count.
+ */
+ if (ret || boot_cpu_data.x86 < 0x17)
+ return ret;
+
+ mask = config & (AMD64_L3_F19H_THREAD_MASK | AMD64_L3_SLICEID_MASK |
+ AMD64_L3_EN_ALL_CORES | AMD64_L3_EN_ALL_SLICES |
+ AMD64_L3_COREID_MASK);
+
+ if (boot_cpu_data.x86 <= 0x18)
+ mask = ((config & AMD64_L3_SLICE_MASK) ? : AMD64_L3_SLICE_MASK) |
+ ((config & AMD64_L3_THREAD_MASK) ? : AMD64_L3_THREAD_MASK);
+
+ /*
+ * If the user doesn't specify a ThreadMask, they're not trying to
+ * count core 0, so we enable all cores & threads.
+ * We'll also assume that they want to count slice 0 if they specify
+ * a ThreadMask and leave SliceId and EnAllSlices unpopulated.
+ */
+ else if (!(config & AMD64_L3_F19H_THREAD_MASK))
+ mask = AMD64_L3_F19H_THREAD_MASK | AMD64_L3_EN_ALL_SLICES |
+ AMD64_L3_EN_ALL_CORES;
+
+ hwc->config |= mask;
+
+ return 0;
+}
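
A hedged user-space sketch of counting on the amd_l3 PMU registered further down. The PMU type would normally be read from /sys/bus/event_source/devices/amd_l3/type; here it is passed in, and the event/umask values are placeholders rather than documented L3 events. Leaving ThreadMask and the slice fields at zero lets the defaulting logic above count all cores, threads and slices:

#define _GNU_SOURCE
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

static int open_l3_event(uint32_t pmu_type, int cpu)
{
	struct perf_event_attr attr = { 0 };

	attr.size = sizeof(attr);
	attr.type = pmu_type;			/* from .../amd_l3/type */
	attr.config = 0x04 | (0xffULL << 8);	/* placeholder event/umask */

	/* Uncore events are per-CPU rather than per-task: pid = -1, cpu >= 0. */
	return (int)syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
}

int main(void)
{
	int fd = open_l3_event(11 /* placeholder type number */, 0);

	if (fd < 0)
		perror("perf_event_open");
	else
		close(fd);
	return 0;
}
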
+
+static
+void amd_uncore_l3_ctx_scan(struct amd_uncore *uncore, unsigned int cpu)
+{
+ union amd_uncore_info info;
+
+ if (!boot_cpu_has(X86_FEATURE_PERFCTR_LLC))
+ return;
+
+ info.split.aux_data = 0;
+ info.split.num_pmcs = NUM_COUNTERS_L2;
+ info.split.gid = 0;
+ info.split.cid = per_cpu_llc_id(cpu);
+
+ if (boot_cpu_data.x86 >= 0x17)
+ info.split.num_pmcs = NUM_COUNTERS_L3;
+
+ *per_cpu_ptr(uncore->info, cpu) = info;
+}
+
+static
+int amd_uncore_l3_ctx_init(struct amd_uncore *uncore, unsigned int cpu)
+{
+ struct attribute **l3_attr = amd_uncore_l3_format_attr;
+ struct amd_uncore_pmu *pmu;
+ int num_counters;
+
+ /* Run just once */
+ if (uncore->init_done)
+ return amd_uncore_ctx_init(uncore, cpu);
+
+ num_counters = amd_uncore_ctx_num_pmcs(uncore, cpu);
+ if (!num_counters)
+ goto done;
+
+ /* No grouping, single instance for a system */
+ uncore->pmus = kzalloc(sizeof(*uncore->pmus), GFP_KERNEL);
+ if (!uncore->pmus)
+ goto done;
+
+ /*
+ * For Family 17h and above, L3 cache counters are available instead
+ * of L2 cache counters. The PMUs are exported based on family as
+ * either L2 or L3.
+ */
+ pmu = &uncore->pmus[0];
+ strscpy(pmu->name, boot_cpu_data.x86 >= 0x17 ? "amd_l3" : "amd_l2",
+ sizeof(pmu->name));
+ pmu->num_counters = num_counters;
+ pmu->msr_base = MSR_F16H_L2I_PERF_CTL;
+ pmu->rdpmc_base = RDPMC_BASE_LLC;
+ pmu->group = amd_uncore_ctx_gid(uncore, cpu);
+
+ if (boot_cpu_data.x86 >= 0x17) {
+ *l3_attr++ = &format_attr_event8.attr;
+ *l3_attr++ = &format_attr_umask8.attr;
+ *l3_attr++ = boot_cpu_data.x86 >= 0x19 ?
+ &format_attr_threadmask2.attr :
+ &format_attr_threadmask8.attr;
+ }
+
+ pmu->ctx = alloc_percpu(struct amd_uncore_ctx *);
+ if (!pmu->ctx)
+ goto done;
+
+ pmu->pmu = (struct pmu) {
+ .task_ctx_nr = perf_invalid_context,
+ .attr_groups = amd_uncore_l3_attr_groups,
+ .attr_update = amd_uncore_l3_attr_update,
+ .name = pmu->name,
+ .event_init = amd_uncore_l3_event_init,
+ .add = amd_uncore_add,
+ .del = amd_uncore_del,
+ .start = amd_uncore_start,
+ .stop = amd_uncore_stop,
+ .read = amd_uncore_read,
+ .capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT,
+ .module = THIS_MODULE,
+ };
+
+ if (perf_pmu_register(&pmu->pmu, pmu->pmu.name, -1)) {
+ free_percpu(pmu->ctx);
+ pmu->ctx = NULL;
+ goto done;
+ }
+
+ pr_info("%d %s%s counters detected\n", pmu->num_counters,
+ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ? "HYGON " : "",
+ pmu->pmu.name);
+
+ uncore->num_pmus = 1;
+
+done:
+ uncore->init_done = true;
+
+ return amd_uncore_ctx_init(uncore, cpu);
+}
+
+static int amd_uncore_umc_event_init(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ int ret = amd_uncore_event_init(event);
+
+ if (ret)
+ return ret;
+
+ hwc->config = event->attr.config & AMD64_PERFMON_V2_RAW_EVENT_MASK_UMC;
+
+ return 0;
+}
+
+static void amd_uncore_umc_start(struct perf_event *event, int flags)
+{
+ struct amd_uncore_pmu *pmu = event_to_amd_uncore_pmu(event);
+ struct amd_uncore_ctx *ctx = *per_cpu_ptr(pmu->ctx, event->cpu);
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (!ctx->nr_active++)
+ amd_uncore_start_hrtimer(ctx);
+
+ if (flags & PERF_EF_RELOAD)
+ wrmsrq(hwc->event_base, (u64)local64_read(&hwc->prev_count));
+
+ hwc->state = 0;
+ __set_bit(hwc->idx, ctx->active_mask);
+ wrmsrq(hwc->config_base, (hwc->config | AMD64_PERFMON_V2_ENABLE_UMC));
+ perf_event_update_userpage(event);
+}
+
+static void amd_uncore_umc_read(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u64 prev, new, shift;
+ s64 delta;
+
+ shift = COUNTER_SHIFT + 1;
+ prev = local64_read(&hwc->prev_count);
+
+ /*
+ * UMC counters do not have RDPMC assignments. Read counts directly
+ * from the corresponding PERF_CTR.
+ */
+	rdmsrq(hwc->event_base, new);
+
+ /*
+ * Unlike the other uncore counters, UMC counters saturate and set the
+ * Overflow bit (bit 48) on overflow. Since they do not roll over,
+ * proactively reset the corresponding PERF_CTR when bit 47 is set so
+ * that the counter never gets a chance to saturate.
+ */
+ if (new & BIT_ULL(63 - COUNTER_SHIFT)) {
+		wrmsrq(hwc->event_base, 0);
+ local64_set(&hwc->prev_count, 0);
+ } else {
+ local64_set(&hwc->prev_count, new);
+ }
+
+ delta = (new << shift) - (prev << shift);
+ delta >>= shift;
+ local64_add(delta, &event->count);
+}
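
The shift arithmetic above is the same narrow-counter trick used by x86_perf_event_update() later in this series: shifting both samples up so the top usable counter bit lands at bit 63 makes the subtraction wrap and sign-extend correctly for the 47 usable UMC bits. A standalone check with made-up sample values:

#include <stdint.h>
#include <stdio.h>

#define EX_COUNTER_SHIFT 16			/* as in this driver */

static int64_t ex_delta47(uint64_t prev, uint64_t new)
{
	int shift = EX_COUNTER_SHIFT + 1;	/* 47 usable bits */
	int64_t delta = (new << shift) - (prev << shift);

	return delta >> shift;			/* arithmetic shift restores the sign */
}

int main(void)
{
	/* A difference that crosses the top of the 47-bit range still comes
	 * out as the small positive count 32 (0x20). */
	printf("%lld\n", (long long)ex_delta47(0x7ffffffffff0ULL, 0x10ULL));
	return 0;
}
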
+
+static
+void amd_uncore_umc_ctx_scan(struct amd_uncore *uncore, unsigned int cpu)
+{
+ union cpuid_0x80000022_ebx ebx;
+ union amd_uncore_info info;
+ unsigned int eax, ecx, edx;
+
+ if (pmu_version < 2)
+ return;
+
+ cpuid(EXT_PERFMON_DEBUG_FEATURES, &eax, &ebx.full, &ecx, &edx);
+ info.split.aux_data = ecx; /* stash active mask */
+ info.split.num_pmcs = ebx.split.num_umc_pmc;
+ info.split.gid = topology_logical_package_id(cpu);
+ info.split.cid = topology_logical_package_id(cpu);
+ *per_cpu_ptr(uncore->info, cpu) = info;
+}
+
+static
+int amd_uncore_umc_ctx_init(struct amd_uncore *uncore, unsigned int cpu)
+{
+ DECLARE_BITMAP(gmask, UNCORE_GROUP_MAX) = { 0 };
+ u8 group_num_pmus[UNCORE_GROUP_MAX] = { 0 };
+ u8 group_num_pmcs[UNCORE_GROUP_MAX] = { 0 };
+ union amd_uncore_info info;
+ struct amd_uncore_pmu *pmu;
+ int gid, i;
+ u16 index = 0;
+
+ if (pmu_version < 2)
+ return 0;
+
+ /* Run just once */
+ if (uncore->init_done)
+ return amd_uncore_ctx_init(uncore, cpu);
+
+ /* Find unique groups */
+ for_each_online_cpu(i) {
+ info = *per_cpu_ptr(uncore->info, i);
+ gid = info.split.gid;
+ if (test_bit(gid, gmask))
+ continue;
+
+ __set_bit(gid, gmask);
+ group_num_pmus[gid] = hweight32(info.split.aux_data);
+ group_num_pmcs[gid] = info.split.num_pmcs;
+ uncore->num_pmus += group_num_pmus[gid];
+ }
+
+ uncore->pmus = kzalloc(sizeof(*uncore->pmus) * uncore->num_pmus,
+ GFP_KERNEL);
+ if (!uncore->pmus) {
+ uncore->num_pmus = 0;
+ goto done;
+ }
+
+ for_each_set_bit(gid, gmask, UNCORE_GROUP_MAX) {
+ for (i = 0; i < group_num_pmus[gid]; i++) {
+ pmu = &uncore->pmus[index];
+ snprintf(pmu->name, sizeof(pmu->name), "amd_umc_%hu", index);
+ pmu->num_counters = group_num_pmcs[gid] / group_num_pmus[gid];
+ pmu->msr_base = MSR_F19H_UMC_PERF_CTL + i * pmu->num_counters * 2;
+ pmu->rdpmc_base = -1;
+ pmu->group = gid;
+
+ pmu->ctx = alloc_percpu(struct amd_uncore_ctx *);
+ if (!pmu->ctx)
+ goto done;
+
+ pmu->pmu = (struct pmu) {
+ .task_ctx_nr = perf_invalid_context,
+ .attr_groups = amd_uncore_umc_attr_groups,
+ .name = pmu->name,
+ .event_init = amd_uncore_umc_event_init,
+ .add = amd_uncore_add,
+ .del = amd_uncore_del,
+ .start = amd_uncore_umc_start,
+ .stop = amd_uncore_stop,
+ .read = amd_uncore_umc_read,
+ .capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT,
+ .module = THIS_MODULE,
+ };
+
+ if (perf_pmu_register(&pmu->pmu, pmu->pmu.name, -1)) {
+ free_percpu(pmu->ctx);
+ pmu->ctx = NULL;
+ goto done;
+ }
+
+ pr_info("%d %s counters detected\n", pmu->num_counters,
+ pmu->pmu.name);
+
+ index++;
+ }
+ }
+
+done:
+ uncore->num_pmus = index;
+ uncore->init_done = true;
+
+ return amd_uncore_ctx_init(uncore, cpu);
+}
+
+static struct amd_uncore uncores[UNCORE_TYPE_MAX] = {
+ /* UNCORE_TYPE_DF */
+ {
+ .scan = amd_uncore_df_ctx_scan,
+ .init = amd_uncore_df_ctx_init,
+ .move = amd_uncore_ctx_move,
+ .free = amd_uncore_ctx_free,
+ },
+ /* UNCORE_TYPE_L3 */
+ {
+ .scan = amd_uncore_l3_ctx_scan,
+ .init = amd_uncore_l3_ctx_init,
+ .move = amd_uncore_ctx_move,
+ .free = amd_uncore_ctx_free,
+ },
+ /* UNCORE_TYPE_UMC */
+ {
+ .scan = amd_uncore_umc_ctx_scan,
+ .init = amd_uncore_umc_ctx_init,
+ .move = amd_uncore_ctx_move,
+ .free = amd_uncore_ctx_free,
+ },
+};
+
+static int __init amd_uncore_init(void)
+{
+ struct amd_uncore *uncore;
+ int ret = -ENODEV;
+ int i;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
+ return -ENODEV;
+
+ if (!boot_cpu_has(X86_FEATURE_TOPOEXT))
+ return -ENODEV;
+
+ if (boot_cpu_has(X86_FEATURE_PERFMON_V2))
+ pmu_version = 2;
+
+ for (i = 0; i < UNCORE_TYPE_MAX; i++) {
+ uncore = &uncores[i];
+
+ BUG_ON(!uncore->scan);
+ BUG_ON(!uncore->init);
+ BUG_ON(!uncore->move);
+ BUG_ON(!uncore->free);
+
+ uncore->info = alloc_percpu(union amd_uncore_info);
+ if (!uncore->info) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+	}
+
+ /*
+ * Install callbacks. Core will call them for each online cpu.
+ */
+ ret = cpuhp_setup_state(CPUHP_PERF_X86_AMD_UNCORE_PREP,
+ "perf/x86/amd/uncore:prepare",
+ NULL, amd_uncore_cpu_dead);
+ if (ret)
+ goto fail;
+
+ ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING,
+ "perf/x86/amd/uncore:starting",
+ amd_uncore_cpu_starting, NULL);
+ if (ret)
+ goto fail_prep;
+
+ ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE,
+ "perf/x86/amd/uncore:online",
+ amd_uncore_cpu_online,
+ amd_uncore_cpu_down_prepare);
+ if (ret)
+ goto fail_start;
+
+ return 0;
+
+fail_start:
+ cpuhp_remove_state(CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING);
+fail_prep:
+ cpuhp_remove_state(CPUHP_PERF_X86_AMD_UNCORE_PREP);
+fail:
+ for (i = 0; i < UNCORE_TYPE_MAX; i++) {
+ uncore = &uncores[i];
+ if (uncore->info) {
+ free_percpu(uncore->info);
+ uncore->info = NULL;
+ }
+ }
+
+ return ret;
+}
+
+static void __exit amd_uncore_exit(void)
+{
+ struct amd_uncore *uncore;
+ struct amd_uncore_pmu *pmu;
+ int i, j;
+
+ cpuhp_remove_state(CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE);
+ cpuhp_remove_state(CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING);
+ cpuhp_remove_state(CPUHP_PERF_X86_AMD_UNCORE_PREP);
+
+ for (i = 0; i < UNCORE_TYPE_MAX; i++) {
+ uncore = &uncores[i];
+ if (!uncore->info)
+ continue;
+
+ free_percpu(uncore->info);
+ uncore->info = NULL;
+
+ for (j = 0; j < uncore->num_pmus; j++) {
+ pmu = &uncore->pmus[j];
+ if (!pmu->ctx)
+ continue;
+
+ perf_pmu_unregister(&pmu->pmu);
+ free_percpu(pmu->ctx);
+ pmu->ctx = NULL;
+ }
+
+ kfree(uncore->pmus);
+ uncore->pmus = NULL;
+ }
+}
+
+module_init(amd_uncore_init);
+module_exit(amd_uncore_exit);
+
+MODULE_DESCRIPTION("AMD Uncore Driver");
+MODULE_LICENSE("GPL v2");
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
new file mode 100644
index 000000000000..0c38a31d5fc7
--- /dev/null
+++ b/arch/x86/events/core.c
@@ -0,0 +1,3093 @@
+/*
+ * Performance events x86 architecture code
+ *
+ * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
+ * Copyright (C) 2009 Jaswinder Singh Rajput
+ * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
+ * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra
+ * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
+ * Copyright (C) 2009 Google, Inc., Stephane Eranian
+ *
+ * For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_event.h>
+#include <linux/capability.h>
+#include <linux/notifier.h>
+#include <linux/hardirq.h>
+#include <linux/kprobes.h>
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/kdebug.h>
+#include <linux/kvm_types.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/clock.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include <linux/cpu.h>
+#include <linux/bitops.h>
+#include <linux/device.h>
+#include <linux/nospec.h>
+#include <linux/static_call.h>
+
+#include <asm/apic.h>
+#include <asm/stacktrace.h>
+#include <asm/msr.h>
+#include <asm/nmi.h>
+#include <asm/smp.h>
+#include <asm/alternative.h>
+#include <asm/mmu_context.h>
+#include <asm/tlbflush.h>
+#include <asm/timer.h>
+#include <asm/desc.h>
+#include <asm/ldt.h>
+#include <asm/unwind.h>
+#include <asm/uprobes.h>
+#include <asm/ibt.h>
+
+#include "perf_event.h"
+
+struct x86_pmu x86_pmu __read_mostly;
+static struct pmu pmu;
+
+DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
+ .enabled = 1,
+ .pmu = &pmu,
+};
+
+DEFINE_STATIC_KEY_FALSE(rdpmc_never_available_key);
+DEFINE_STATIC_KEY_FALSE(rdpmc_always_available_key);
+DEFINE_STATIC_KEY_FALSE(perf_is_hybrid);
+
+/*
+ * This here uses DEFINE_STATIC_CALL_NULL() to get a static_call defined
+ * from just a typename, as opposed to an actual function.
+ */
+DEFINE_STATIC_CALL_NULL(x86_pmu_handle_irq, *x86_pmu.handle_irq);
+DEFINE_STATIC_CALL_NULL(x86_pmu_disable_all, *x86_pmu.disable_all);
+DEFINE_STATIC_CALL_NULL(x86_pmu_enable_all, *x86_pmu.enable_all);
+DEFINE_STATIC_CALL_NULL(x86_pmu_enable, *x86_pmu.enable);
+DEFINE_STATIC_CALL_NULL(x86_pmu_disable, *x86_pmu.disable);
+
+DEFINE_STATIC_CALL_NULL(x86_pmu_assign, *x86_pmu.assign);
+
+DEFINE_STATIC_CALL_NULL(x86_pmu_add, *x86_pmu.add);
+DEFINE_STATIC_CALL_NULL(x86_pmu_del, *x86_pmu.del);
+DEFINE_STATIC_CALL_NULL(x86_pmu_read, *x86_pmu.read);
+
+DEFINE_STATIC_CALL_NULL(x86_pmu_set_period, *x86_pmu.set_period);
+DEFINE_STATIC_CALL_NULL(x86_pmu_update, *x86_pmu.update);
+DEFINE_STATIC_CALL_NULL(x86_pmu_limit_period, *x86_pmu.limit_period);
+
+DEFINE_STATIC_CALL_NULL(x86_pmu_schedule_events, *x86_pmu.schedule_events);
+DEFINE_STATIC_CALL_NULL(x86_pmu_get_event_constraints, *x86_pmu.get_event_constraints);
+DEFINE_STATIC_CALL_NULL(x86_pmu_put_event_constraints, *x86_pmu.put_event_constraints);
+
+DEFINE_STATIC_CALL_NULL(x86_pmu_start_scheduling, *x86_pmu.start_scheduling);
+DEFINE_STATIC_CALL_NULL(x86_pmu_commit_scheduling, *x86_pmu.commit_scheduling);
+DEFINE_STATIC_CALL_NULL(x86_pmu_stop_scheduling, *x86_pmu.stop_scheduling);
+
+DEFINE_STATIC_CALL_NULL(x86_pmu_sched_task, *x86_pmu.sched_task);
+
+DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs);
+DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases);
+
+DEFINE_STATIC_CALL_NULL(x86_pmu_filter, *x86_pmu.filter);
+
+DEFINE_STATIC_CALL_NULL(x86_pmu_late_setup, *x86_pmu.late_setup);
+
+DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_enable, *x86_pmu.pebs_enable);
+DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_disable, *x86_pmu.pebs_disable);
+DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_enable_all, *x86_pmu.pebs_enable_all);
+DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_disable_all, *x86_pmu.pebs_disable_all);
+
+/*
+ * This one is magic, it will get called even when PMU init fails (because
+ * there is no PMU), in which case it should simply return NULL.
+ */
+DEFINE_STATIC_CALL_RET0(x86_pmu_guest_get_msrs, *x86_pmu.guest_get_msrs);
+
+u64 __read_mostly hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX];
+u64 __read_mostly hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX];
+
+/*
+ * Propagate event elapsed time into the generic event.
+ * Can only be executed on the CPU where the event is active.
+ * Returns the delta events processed.
+ */
+u64 x86_perf_event_update(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ int shift = 64 - x86_pmu.cntval_bits;
+ u64 prev_raw_count, new_raw_count;
+ u64 delta;
+
+ if (unlikely(!hwc->event_base))
+ return 0;
+
+ /*
+ * Careful: an NMI might modify the previous event value.
+ *
+ * Our tactic to handle this is to first atomically read and
+ * exchange a new raw count - then add that new-prev delta
+ * count to the generic event atomically:
+ */
+ prev_raw_count = local64_read(&hwc->prev_count);
+ do {
+ new_raw_count = rdpmc(hwc->event_base_rdpmc);
+ } while (!local64_try_cmpxchg(&hwc->prev_count,
+ &prev_raw_count, new_raw_count));
+
+ /*
+ * Now we have the new raw value and have updated the prev
+ * timestamp already. We can now calculate the elapsed delta
+ * (event-)time and add that to the generic event.
+ *
+ * Careful, not all hw sign-extends above the physical width
+ * of the count.
+ */
+ delta = (new_raw_count << shift) - (prev_raw_count << shift);
+ delta >>= shift;
+
+ local64_add(delta, &event->count);
+ local64_sub(delta, &hwc->period_left);
+
+ return new_raw_count;
+}
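
The loop above must tolerate an NMI that samples and re-programs the counter between reading prev_count and storing the new value; local64_try_cmpxchg() only publishes the new raw count if prev_count still holds the value the delta was computed against, otherwise it reloads and retries. A user-space analogue of the same pattern with C11 atomics (the racing NMI is just another thread here, and read_hw() stands in for rdpmc()):

#include <stdatomic.h>
#include <stdint.h>

static void ex_event_update(_Atomic uint64_t *prev_count,
			    _Atomic uint64_t *count,
			    uint64_t (*read_hw)(void), int shift)
{
	uint64_t prev, new;

	prev = atomic_load(prev_count);
	do {
		new = read_hw();
		/* On failure, 'prev' is refreshed with the value a racing
		 * updater stored, and the read is redone against it. */
	} while (!atomic_compare_exchange_weak(prev_count, &prev, new));

	/* Same sign-extension of the narrow counter as the kernel code. */
	int64_t delta = (int64_t)((new << shift) - (prev << shift)) >> shift;

	atomic_fetch_add(count, (uint64_t)delta);
}
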
+
+/*
+ * Find and validate any extra registers to set up.
+ */
+static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
+{
+ struct extra_reg *extra_regs = hybrid(event->pmu, extra_regs);
+ struct hw_perf_event_extra *reg;
+ struct extra_reg *er;
+
+ reg = &event->hw.extra_reg;
+
+ if (!extra_regs)
+ return 0;
+
+ for (er = extra_regs; er->msr; er++) {
+ if (er->event != (config & er->config_mask))
+ continue;
+ if (event->attr.config1 & ~er->valid_mask)
+ return -EINVAL;
+		/* Check if the extra msrs can be safely accessed */
+ if (!er->extra_msr_access)
+ return -ENXIO;
+
+ reg->idx = er->idx;
+ reg->config = event->attr.config1;
+ reg->reg = er->msr;
+ break;
+ }
+ return 0;
+}
+
+static atomic_t active_events;
+static atomic_t pmc_refcount;
+static DEFINE_MUTEX(pmc_reserve_mutex);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+
+static inline u64 get_possible_counter_mask(void)
+{
+ u64 cntr_mask = x86_pmu.cntr_mask64;
+ int i;
+
+ if (!is_hybrid())
+ return cntr_mask;
+
+ for (i = 0; i < x86_pmu.num_hybrid_pmus; i++)
+ cntr_mask |= x86_pmu.hybrid_pmu[i].cntr_mask64;
+
+ return cntr_mask;
+}
+
+static bool reserve_pmc_hardware(void)
+{
+ u64 cntr_mask = get_possible_counter_mask();
+ int i, end;
+
+ for_each_set_bit(i, (unsigned long *)&cntr_mask, X86_PMC_IDX_MAX) {
+ if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
+ goto perfctr_fail;
+ }
+
+ for_each_set_bit(i, (unsigned long *)&cntr_mask, X86_PMC_IDX_MAX) {
+ if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
+ goto eventsel_fail;
+ }
+
+ return true;
+
+eventsel_fail:
+ end = i;
+ for_each_set_bit(i, (unsigned long *)&cntr_mask, end)
+ release_evntsel_nmi(x86_pmu_config_addr(i));
+ i = X86_PMC_IDX_MAX;
+
+perfctr_fail:
+ end = i;
+ for_each_set_bit(i, (unsigned long *)&cntr_mask, end)
+ release_perfctr_nmi(x86_pmu_event_addr(i));
+
+ return false;
+}
+
+static void release_pmc_hardware(void)
+{
+ u64 cntr_mask = get_possible_counter_mask();
+ int i;
+
+ for_each_set_bit(i, (unsigned long *)&cntr_mask, X86_PMC_IDX_MAX) {
+ release_perfctr_nmi(x86_pmu_event_addr(i));
+ release_evntsel_nmi(x86_pmu_config_addr(i));
+ }
+}
+
+#else
+
+static bool reserve_pmc_hardware(void) { return true; }
+static void release_pmc_hardware(void) {}
+
+#endif
+
+bool check_hw_exists(struct pmu *pmu, unsigned long *cntr_mask,
+ unsigned long *fixed_cntr_mask)
+{
+	u64 val, val_fail = -1, val_new = ~0;
+ int i, reg, reg_fail = -1, ret = 0;
+ int bios_fail = 0;
+ int reg_safe = -1;
+
+ /*
+ * Check to see if the BIOS enabled any of the counters, if so
+ * complain and bail.
+ */
+ for_each_set_bit(i, cntr_mask, X86_PMC_IDX_MAX) {
+ reg = x86_pmu_config_addr(i);
+ ret = rdmsrq_safe(reg, &val);
+ if (ret)
+ goto msr_fail;
+ if (val & ARCH_PERFMON_EVENTSEL_ENABLE) {
+ bios_fail = 1;
+ val_fail = val;
+ reg_fail = reg;
+ } else {
+ reg_safe = i;
+ }
+ }
+
+ if (*(u64 *)fixed_cntr_mask) {
+ reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
+ ret = rdmsrq_safe(reg, &val);
+ if (ret)
+ goto msr_fail;
+ for_each_set_bit(i, fixed_cntr_mask, X86_PMC_IDX_MAX) {
+ if (fixed_counter_disabled(i, pmu))
+ continue;
+ if (val & (0x03ULL << i*4)) {
+ bios_fail = 1;
+ val_fail = val;
+ reg_fail = reg;
+ }
+ }
+ }
+
+ /*
+ * If all the counters are enabled, the below test will always
+ * fail. The tools will also become useless in this scenario.
+ * Just fail and disable the hardware counters.
+ */
+
+ if (reg_safe == -1) {
+ reg = reg_safe;
+ goto msr_fail;
+ }
+
+ /*
+ * Read the current value, change it and read it back to see if it
+ * matches, this is needed to detect certain hardware emulators
+ * (qemu/kvm) that don't trap on the MSR access and always return 0s.
+ */
+ reg = x86_pmu_event_addr(reg_safe);
+ if (rdmsrq_safe(reg, &val))
+ goto msr_fail;
+ val ^= 0xffffUL;
+ ret = wrmsrq_safe(reg, val);
+ ret |= rdmsrq_safe(reg, &val_new);
+ if (ret || val != val_new)
+ goto msr_fail;
+
+ /*
+ * We still allow the PMU driver to operate:
+ */
+ if (bios_fail) {
+ pr_cont("Broken BIOS detected, complain to your hardware vendor.\n");
+ pr_err(FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n",
+ reg_fail, val_fail);
+ }
+
+ return true;
+
+msr_fail:
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+ pr_cont("PMU not available due to virtualization, using software events only.\n");
+ } else {
+ pr_cont("Broken PMU hardware detected, using software events only.\n");
+ pr_err("Failed to access perfctr msr (MSR %x is %Lx)\n",
+ reg, val_new);
+ }
+
+ return false;
+}
+
+static void hw_perf_event_destroy(struct perf_event *event)
+{
+ x86_release_hardware();
+ atomic_dec(&active_events);
+}
+
+void hw_perf_lbr_event_destroy(struct perf_event *event)
+{
+ hw_perf_event_destroy(event);
+
+ /* undo the lbr/bts event accounting */
+ x86_del_exclusive(x86_lbr_exclusive_lbr);
+}
+
+static inline int x86_pmu_initialized(void)
+{
+ return x86_pmu.handle_irq != NULL;
+}
+
+static inline int
+set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
+{
+ struct perf_event_attr *attr = &event->attr;
+ unsigned int cache_type, cache_op, cache_result;
+ u64 config, val;
+
+ config = attr->config;
+
+ cache_type = (config >> 0) & 0xff;
+ if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
+ return -EINVAL;
+ cache_type = array_index_nospec(cache_type, PERF_COUNT_HW_CACHE_MAX);
+
+ cache_op = (config >> 8) & 0xff;
+ if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
+ return -EINVAL;
+ cache_op = array_index_nospec(cache_op, PERF_COUNT_HW_CACHE_OP_MAX);
+
+ cache_result = (config >> 16) & 0xff;
+ if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
+ return -EINVAL;
+ cache_result = array_index_nospec(cache_result, PERF_COUNT_HW_CACHE_RESULT_MAX);
+
+ val = hybrid_var(event->pmu, hw_cache_event_ids)[cache_type][cache_op][cache_result];
+ if (val == 0)
+ return -ENOENT;
+
+ if (val == -1)
+ return -EINVAL;
+
+ hwc->config |= val;
+ attr->config1 = hybrid_var(event->pmu, hw_cache_extra_regs)[cache_type][cache_op][cache_result];
+ return x86_pmu_extra_regs(val, event);
+}
+
+int x86_reserve_hardware(void)
+{
+ int err = 0;
+
+ if (!atomic_inc_not_zero(&pmc_refcount)) {
+ mutex_lock(&pmc_reserve_mutex);
+ if (atomic_read(&pmc_refcount) == 0) {
+ if (!reserve_pmc_hardware()) {
+ err = -EBUSY;
+ } else {
+ reserve_ds_buffers();
+ reserve_lbr_buffers();
+ }
+ }
+ if (!err)
+ atomic_inc(&pmc_refcount);
+ mutex_unlock(&pmc_reserve_mutex);
+ }
+
+ return err;
+}
+
+void x86_release_hardware(void)
+{
+ if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) {
+ release_pmc_hardware();
+ release_ds_buffers();
+ release_lbr_buffers();
+ mutex_unlock(&pmc_reserve_mutex);
+ }
+}
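
x86_reserve_hardware()/x86_release_hardware() follow a common refcount-plus-mutex shape: the atomic count is bumped locklessly while the resource is already live, and the mutex is only taken around the 0 -> 1 and 1 -> 0 transitions that actually reserve or release it. A simplified user-space sketch of the same shape; ex_reserve()/ex_release() are stand-ins for the PMC/DS/LBR reservations, and unlike atomic_dec_and_mutex_lock() the put path here always takes the lock:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

static _Atomic int ex_refcount;
static pthread_mutex_t ex_mutex = PTHREAD_MUTEX_INITIALIZER;

static bool ex_reserve(void) { return true; }	/* stand-in for the real work */
static void ex_release(void) { }

static int ex_get(void)
{
	int err = 0, old = atomic_load(&ex_refcount);

	/* Fast path: succeed only if someone already holds a reference
	 * (the equivalent of atomic_inc_not_zero()). */
	while (old && !atomic_compare_exchange_weak(&ex_refcount, &old, old + 1))
		;
	if (old)
		return 0;

	pthread_mutex_lock(&ex_mutex);
	if (atomic_load(&ex_refcount) == 0 && !ex_reserve())
		err = -1;
	if (!err)
		atomic_fetch_add(&ex_refcount, 1);
	pthread_mutex_unlock(&ex_mutex);
	return err;
}

static void ex_put(void)
{
	pthread_mutex_lock(&ex_mutex);
	if (atomic_fetch_sub(&ex_refcount, 1) == 1)
		ex_release();
	pthread_mutex_unlock(&ex_mutex);
}
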
+
+/*
+ * Check if we can create event of a certain type (that no conflicting events
+ * are present).
+ */
+int x86_add_exclusive(unsigned int what)
+{
+ int i;
+
+ /*
+ * When lbr_pt_coexist we allow PT to coexist with either LBR or BTS.
+ * LBR and BTS are still mutually exclusive.
+ */
+ if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt)
+ goto out;
+
+ if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) {
+ mutex_lock(&pmc_reserve_mutex);
+ for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) {
+ if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
+ goto fail_unlock;
+ }
+ atomic_inc(&x86_pmu.lbr_exclusive[what]);
+ mutex_unlock(&pmc_reserve_mutex);
+ }
+
+out:
+ atomic_inc(&active_events);
+ return 0;
+
+fail_unlock:
+ mutex_unlock(&pmc_reserve_mutex);
+ return -EBUSY;
+}
+
+void x86_del_exclusive(unsigned int what)
+{
+ atomic_dec(&active_events);
+
+ /*
+ * See the comment in x86_add_exclusive().
+ */
+ if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt)
+ return;
+
+ atomic_dec(&x86_pmu.lbr_exclusive[what]);
+}
+
+int x86_setup_perfctr(struct perf_event *event)
+{
+ struct perf_event_attr *attr = &event->attr;
+ struct hw_perf_event *hwc = &event->hw;
+ u64 config;
+
+ if (!is_sampling_event(event)) {
+ hwc->sample_period = x86_pmu.max_period;
+ hwc->last_period = hwc->sample_period;
+ local64_set(&hwc->period_left, hwc->sample_period);
+ }
+
+ if (attr->type == event->pmu->type)
+ return x86_pmu_extra_regs(event->attr.config, event);
+
+ if (attr->type == PERF_TYPE_HW_CACHE)
+ return set_ext_hw_attr(hwc, event);
+
+ if (attr->config >= x86_pmu.max_events)
+ return -EINVAL;
+
+ attr->config = array_index_nospec((unsigned long)attr->config, x86_pmu.max_events);
+
+ /*
+ * The generic map:
+ */
+ config = x86_pmu.event_map(attr->config);
+
+ if (config == 0)
+ return -ENOENT;
+
+ if (config == -1LL)
+ return -EINVAL;
+
+ hwc->config |= config;
+
+ return 0;
+}
+
+/*
+ * check that branch_sample_type is compatible with
+ * settings needed for precise_ip > 1 which implies
+ * using the LBR to capture ALL taken branches at the
+ * priv levels of the measurement
+ */
+static inline int precise_br_compat(struct perf_event *event)
+{
+ u64 m = event->attr.branch_sample_type;
+ u64 b = 0;
+
+ /* must capture all branches */
+ if (!(m & PERF_SAMPLE_BRANCH_ANY))
+ return 0;
+
+ m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER;
+
+ if (!event->attr.exclude_user)
+ b |= PERF_SAMPLE_BRANCH_USER;
+
+ if (!event->attr.exclude_kernel)
+ b |= PERF_SAMPLE_BRANCH_KERNEL;
+
+ /*
+ * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86
+ */
+
+ return m == b;
+}
+
+int x86_pmu_max_precise(struct pmu *pmu)
+{
+ int precise = 0;
+
+ if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
+ /* arch PEBS */
+ if (x86_pmu.arch_pebs) {
+ precise = 2;
+ if (hybrid(pmu, arch_pebs_cap).pdists)
+ precise++;
+
+ return precise;
+ }
+
+ /* legacy PEBS - support for constant skid */
+ precise++;
+ /* Support for IP fixup */
+ if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
+ precise++;
+
+ if (x86_pmu.pebs_prec_dist)
+ precise++;
+ }
+
+ return precise;
+}
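
From user space the precise level is requested through perf_event_attr.precise_ip (this is what perf's :p/:pp/:ppp event modifiers set); the function above determines the maximum the kernel will accept, and anything larger fails the open with EOPNOTSUPP. A minimal sketch of the attribute setup, with an arbitrary example period:

#include <linux/perf_event.h>
#include <string.h>

static void ex_init_precise_attr(struct perf_event_attr *attr)
{
	memset(attr, 0, sizeof(*attr));
	attr->size = sizeof(*attr);
	attr->type = PERF_TYPE_HARDWARE;
	attr->config = PERF_COUNT_HW_CPU_CYCLES;
	attr->sample_period = 100000;	/* arbitrary example period */
	attr->precise_ip = 2;		/* request PEBS; lower it if the open fails */
}
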
+
+int x86_pmu_hw_config(struct perf_event *event)
+{
+ if (event->attr.precise_ip) {
+ int precise = x86_pmu_max_precise(event->pmu);
+
+ if (event->attr.precise_ip > precise)
+ return -EOPNOTSUPP;
+
+ /* There's no sense in having PEBS for non sampling events: */
+ if (!is_sampling_event(event))
+ return -EINVAL;
+ }
+ /*
+ * check that PEBS LBR correction does not conflict with
+ * whatever the user is asking with attr->branch_sample_type
+ */
+ if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
+ u64 *br_type = &event->attr.branch_sample_type;
+
+ if (has_branch_stack(event)) {
+ if (!precise_br_compat(event))
+ return -EOPNOTSUPP;
+
+ /* branch_sample_type is compatible */
+
+ } else {
+ /*
+ * user did not specify branch_sample_type
+ *
+ * For PEBS fixups, we capture all
+ * the branches at the priv level of the
+ * event.
+ */
+ *br_type = PERF_SAMPLE_BRANCH_ANY;
+
+ if (!event->attr.exclude_user)
+ *br_type |= PERF_SAMPLE_BRANCH_USER;
+
+ if (!event->attr.exclude_kernel)
+ *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
+ }
+ }
+
+ if (branch_sample_call_stack(event))
+ event->attach_state |= PERF_ATTACH_TASK_DATA;
+
+ /*
+ * Generate PMC IRQs:
+ * (keep 'enabled' bit clear for now)
+ */
+ event->hw.config = ARCH_PERFMON_EVENTSEL_INT;
+
+ /*
+ * Count user and OS events unless requested not to
+ */
+ if (!event->attr.exclude_user)
+ event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
+ if (!event->attr.exclude_kernel)
+ event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
+
+ if (event->attr.type == event->pmu->type)
+ event->hw.config |= x86_pmu_get_event_config(event);
+
+ if (is_sampling_event(event) && !event->attr.freq && x86_pmu.limit_period) {
+ s64 left = event->attr.sample_period;
+ x86_pmu.limit_period(event, &left);
+ if (left > event->attr.sample_period)
+ return -EINVAL;
+ }
+
+ /* sample_regs_user never support XMM registers */
+ if (unlikely(event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK))
+ return -EINVAL;
+ /*
+ * Besides the general purpose registers, XMM registers may
+ * be collected in PEBS on some platforms, e.g. Icelake
+ */
+ if (unlikely(event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK)) {
+ if (!(event->pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS))
+ return -EINVAL;
+
+ if (!event->attr.precise_ip)
+ return -EINVAL;
+ }
+
+ return x86_setup_perfctr(event);
+}
+
+/*
+ * Setup the hardware configuration for a given attr_type
+ */
+static int __x86_pmu_event_init(struct perf_event *event)
+{
+ int err;
+
+ if (!x86_pmu_initialized())
+ return -ENODEV;
+
+ err = x86_reserve_hardware();
+ if (err)
+ return err;
+
+ atomic_inc(&active_events);
+ event->destroy = hw_perf_event_destroy;
+
+ event->hw.idx = -1;
+ event->hw.last_cpu = -1;
+ event->hw.last_tag = ~0ULL;
+ event->hw.dyn_constraint = ~0ULL;
+
+ /* mark unused */
+ event->hw.extra_reg.idx = EXTRA_REG_NONE;
+ event->hw.branch_reg.idx = EXTRA_REG_NONE;
+
+ return x86_pmu.hw_config(event);
+}
+
+void x86_pmu_disable_all(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int idx;
+
+ for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
+ struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
+ u64 val;
+
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+ rdmsrq(x86_pmu_config_addr(idx), val);
+ if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
+ continue;
+ val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
+ wrmsrq(x86_pmu_config_addr(idx), val);
+ if (is_counter_pair(hwc))
+ wrmsrq(x86_pmu_config_addr(idx + 1), 0);
+ }
+}
+
+struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data)
+{
+ return static_call(x86_pmu_guest_get_msrs)(nr, data);
+}
+EXPORT_SYMBOL_FOR_KVM(perf_guest_get_msrs);
+
+/*
+ * There may be PMI landing after enabled=0. The PMI hitting could be before or
+ * after disable_all.
+ *
+ * If PMI hits before disable_all, the PMU will be disabled in the NMI handler.
+ * It will not be re-enabled in the NMI handler again, because enabled=0. After
+ * handling the NMI, disable_all will be called, which will not change the
+ * state either. If PMI hits after disable_all, the PMU is already disabled
+ * before entering NMI handler. The NMI handler will not change the state
+ * either.
+ *
+ * So either situation is harmless.
+ */
+static void x86_pmu_disable(struct pmu *pmu)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ if (!x86_pmu_initialized())
+ return;
+
+ if (!cpuc->enabled)
+ return;
+
+ cpuc->n_added = 0;
+ cpuc->enabled = 0;
+ barrier();
+
+ static_call(x86_pmu_disable_all)();
+}
+
+void x86_pmu_enable_all(int added)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int idx;
+
+ for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
+ struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
+
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+
+ __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
+ }
+}
+
+int is_x86_event(struct perf_event *event)
+{
+ /*
+	 * For non-hybrid platforms, the type of the X86 PMU is
+	 * always PERF_TYPE_RAW.
+	 * For a hybrid platform, the PERF_PMU_CAP_EXTENDED_HW_TYPE
+	 * capability is unique to the X86 PMU.
+	 * Use them to detect an X86 event.
+ */
+ if (event->pmu->type == PERF_TYPE_RAW ||
+ event->pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE)
+ return true;
+
+ return false;
+}
+
+struct pmu *x86_get_pmu(unsigned int cpu)
+{
+ struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+
+ /*
+	 * All CPUs of this hybrid type are offline;
+	 * x86_get_pmu() should not be invoked.
+ */
+ if (WARN_ON_ONCE(!cpuc->pmu))
+ return &pmu;
+
+ return cpuc->pmu;
+}
+/*
+ * Event scheduler state:
+ *
+ * Assign events iterating over all events and counters, beginning
+ * with events with least weights first. Keep the current iterator
+ * state in struct sched_state.
+ */
+struct sched_state {
+ int weight;
+ int event; /* event index */
+ int counter; /* counter index */
+ int unassigned; /* number of events to be assigned left */
+ int nr_gp; /* number of GP counters used */
+ u64 used;
+};
+
+/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
+#define SCHED_STATES_MAX 2
+
+struct perf_sched {
+ int max_weight;
+ int max_events;
+ int max_gp;
+ int saved_states;
+ struct event_constraint **constraints;
+ struct sched_state state;
+ struct sched_state saved[SCHED_STATES_MAX];
+};
+
+/*
+ * Initialize iterator that runs through all events and counters.
+ */
+static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
+ int num, int wmin, int wmax, int gpmax)
+{
+ int idx;
+
+ memset(sched, 0, sizeof(*sched));
+ sched->max_events = num;
+ sched->max_weight = wmax;
+ sched->max_gp = gpmax;
+ sched->constraints = constraints;
+
+ for (idx = 0; idx < num; idx++) {
+ if (constraints[idx]->weight == wmin)
+ break;
+ }
+
+ sched->state.event = idx; /* start with min weight */
+ sched->state.weight = wmin;
+ sched->state.unassigned = num;
+}
+
+static void perf_sched_save_state(struct perf_sched *sched)
+{
+ if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX))
+ return;
+
+ sched->saved[sched->saved_states] = sched->state;
+ sched->saved_states++;
+}
+
+static bool perf_sched_restore_state(struct perf_sched *sched)
+{
+ if (!sched->saved_states)
+ return false;
+
+ sched->saved_states--;
+ sched->state = sched->saved[sched->saved_states];
+
+ /* this assignment didn't work out */
+ /* XXX broken vs EVENT_PAIR */
+ sched->state.used &= ~BIT_ULL(sched->state.counter);
+
+ /* try the next one */
+ sched->state.counter++;
+
+ return true;
+}
+
+/*
+ * Select a counter for the current event to schedule. Return true on
+ * success.
+ */
+static bool __perf_sched_find_counter(struct perf_sched *sched)
+{
+ struct event_constraint *c;
+ int idx;
+
+ if (!sched->state.unassigned)
+ return false;
+
+ if (sched->state.event >= sched->max_events)
+ return false;
+
+ c = sched->constraints[sched->state.event];
+ /* Prefer fixed purpose counters */
+ if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
+ idx = INTEL_PMC_IDX_FIXED;
+ for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
+ u64 mask = BIT_ULL(idx);
+
+ if (sched->state.used & mask)
+ continue;
+
+ sched->state.used |= mask;
+ goto done;
+ }
+ }
+
+ /* Grab the first unused counter starting with idx */
+ idx = sched->state.counter;
+ for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
+ u64 mask = BIT_ULL(idx);
+
+ if (c->flags & PERF_X86_EVENT_PAIR)
+ mask |= mask << 1;
+
+ if (sched->state.used & mask)
+ continue;
+
+ if (sched->state.nr_gp++ >= sched->max_gp)
+ return false;
+
+ sched->state.used |= mask;
+ goto done;
+ }
+
+ return false;
+
+done:
+ sched->state.counter = idx;
+
+ if (c->overlap)
+ perf_sched_save_state(sched);
+
+ return true;
+}
+
+static bool perf_sched_find_counter(struct perf_sched *sched)
+{
+ while (!__perf_sched_find_counter(sched)) {
+ if (!perf_sched_restore_state(sched))
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Go through all unassigned events and find the next one to schedule.
+ * Take events with the least weight first. Return true on success.
+ */
+static bool perf_sched_next_event(struct perf_sched *sched)
+{
+ struct event_constraint *c;
+
+ if (!sched->state.unassigned || !--sched->state.unassigned)
+ return false;
+
+ do {
+ /* next event */
+ sched->state.event++;
+ if (sched->state.event >= sched->max_events) {
+ /* next weight */
+ sched->state.event = 0;
+ sched->state.weight++;
+ if (sched->state.weight > sched->max_weight)
+ return false;
+ }
+ c = sched->constraints[sched->state.event];
+ } while (c->weight != sched->state.weight);
+
+ sched->state.counter = 0; /* start with first counter */
+
+ return true;
+}
+
+/*
+ * Assign a counter for each event.
+ */
+int perf_assign_events(struct event_constraint **constraints, int n,
+ int wmin, int wmax, int gpmax, int *assign)
+{
+ struct perf_sched sched;
+
+ perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);
+
+ do {
+ if (!perf_sched_find_counter(&sched))
+ break; /* failed */
+ if (assign)
+ assign[sched.state.event] = sched.state.counter;
+ } while (perf_sched_next_event(&sched));
+
+ return sched.state.unassigned;
+}
+EXPORT_SYMBOL_GPL(perf_assign_events);
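
perf_assign_events() places events from the lowest constraint weight upwards so that inflexible events are not starved by flexible ones, saving and restoring scheduler state when overlapping constraints force backtracking. A heavily simplified, standalone sketch of just the least-weight-first idea; constraints here are plain bitmasks of permitted counters, and there is no backtracking or fixed-counter preference:

#include <stdint.h>

static int ex_assign_events(const uint64_t *idxmsk, int n, int *assign)
{
	uint64_t used = 0;
	int weight, i, done = 0;

	for (weight = 1; weight <= 64 && done < n; weight++) {
		for (i = 0; i < n; i++) {
			if (__builtin_popcountll(idxmsk[i]) != weight)
				continue;

			uint64_t free = idxmsk[i] & ~used;

			if (!free)
				return -1;	/* would need backtracking */

			/* Grab the first free counter this event may use. */
			int idx = __builtin_ctzll(free);

			used |= 1ULL << idx;
			assign[i] = idx;
			done++;
		}
	}
	return done == n ? 0 : -1;
}

With idxmsk = { 0x1, 0x3 }, the event that may only use counter 0 has weight 1 and is placed first, leaving counter 1 for the flexible event; visiting the events in array order instead could have parked the flexible event on counter 0 and made the other one unschedulable.
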
+
+int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
+{
+ struct event_constraint *c;
+ struct perf_event *e;
+ int n0, i, wmin, wmax, unsched = 0;
+ struct hw_perf_event *hwc;
+ u64 used_mask = 0;
+
+ /*
+ * Compute the number of events already present; see x86_pmu_add(),
+ * validate_group() and x86_pmu_commit_txn(). For the former two
+ * cpuc->n_events hasn't been updated yet, while for the latter
+ * cpuc->n_txn contains the number of events added in the current
+ * transaction.
+ */
+ n0 = cpuc->n_events;
+ if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
+ n0 -= cpuc->n_txn;
+
+ static_call_cond(x86_pmu_start_scheduling)(cpuc);
+
+ for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
+ c = cpuc->event_constraint[i];
+
+ /*
+ * Previously scheduled events should have a cached constraint,
+ * while new events should not have one.
+ */
+ WARN_ON_ONCE((c && i >= n0) || (!c && i < n0));
+
+ /*
+ * Request constraints for new events; or for those events that
+ * have a dynamic constraint -- for those the constraint can
+ * change due to external factors (sibling state, allow_tfa).
+ */
+ if (!c || (c->flags & PERF_X86_EVENT_DYNAMIC)) {
+ c = static_call(x86_pmu_get_event_constraints)(cpuc, i, cpuc->event_list[i]);
+ cpuc->event_constraint[i] = c;
+ }
+
+ wmin = min(wmin, c->weight);
+ wmax = max(wmax, c->weight);
+ }
+
+ /*
+ * fastpath, try to reuse previous register
+ */
+ for (i = 0; i < n; i++) {
+ u64 mask;
+
+ hwc = &cpuc->event_list[i]->hw;
+ c = cpuc->event_constraint[i];
+
+ /* never assigned */
+ if (hwc->idx == -1)
+ break;
+
+ /* constraint still honored */
+ if (!test_bit(hwc->idx, c->idxmsk))
+ break;
+
+ mask = BIT_ULL(hwc->idx);
+ if (is_counter_pair(hwc))
+ mask |= mask << 1;
+
+ /* not already used */
+ if (used_mask & mask)
+ break;
+
+ used_mask |= mask;
+
+ if (assign)
+ assign[i] = hwc->idx;
+ }
+
+ /* slow path */
+ if (i != n) {
+ int gpmax = x86_pmu_max_num_counters(cpuc->pmu);
+
+ /*
+ * Do not allow scheduling of more than half the available
+ * generic counters.
+ *
+ * This helps avoid counter starvation of sibling thread by
+ * ensuring at most half the counters cannot be in exclusive
+		 * mode. There are no designated counters for the limits. Any
+ * N/2 counters can be used. This helps with events with
+ * specific counter constraints.
+ */
+ if (is_ht_workaround_enabled() && !cpuc->is_fake &&
+ READ_ONCE(cpuc->excl_cntrs->exclusive_present))
+ gpmax /= 2;
+
+ /*
+ * Reduce the amount of available counters to allow fitting
+ * the extra Merge events needed by large increment events.
+ */
+ if (x86_pmu.flags & PMU_FL_PAIR) {
+ gpmax -= cpuc->n_pair;
+ WARN_ON(gpmax <= 0);
+ }
+
+ unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
+ wmax, gpmax, assign);
+ }
+
+ /*
+ * In case of success (unsched = 0), mark events as committed,
+ * so we do not put_constraint() in case new events are added
+ * and fail to be scheduled
+ *
+ * We invoke the lower level commit callback to lock the resource
+ *
+ * We do not need to do all of this in case we are called to
+ * validate an event group (assign == NULL)
+ */
+ if (!unsched && assign) {
+ for (i = 0; i < n; i++)
+ static_call_cond(x86_pmu_commit_scheduling)(cpuc, i, assign[i]);
+ } else {
+ for (i = n0; i < n; i++) {
+ e = cpuc->event_list[i];
+
+ /*
+ * release events that failed scheduling
+ */
+ static_call_cond(x86_pmu_put_event_constraints)(cpuc, e);
+
+ cpuc->event_constraint[i] = NULL;
+ }
+ }
+
+ static_call_cond(x86_pmu_stop_scheduling)(cpuc);
+
+ return unsched ? -EINVAL : 0;
+}
+
+static int add_nr_metric_event(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+ if (is_metric_event(event)) {
+ if (cpuc->n_metric == INTEL_TD_METRIC_NUM)
+ return -EINVAL;
+ cpuc->n_metric++;
+ cpuc->n_txn_metric++;
+ }
+
+ return 0;
+}
+
+static void del_nr_metric_event(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+ if (is_metric_event(event))
+ cpuc->n_metric--;
+}
+
+static int collect_event(struct cpu_hw_events *cpuc, struct perf_event *event,
+ int max_count, int n)
+{
+ union perf_capabilities intel_cap = hybrid(cpuc->pmu, intel_cap);
+
+ if (intel_cap.perf_metrics && add_nr_metric_event(cpuc, event))
+ return -EINVAL;
+
+ if (n >= max_count + cpuc->n_metric)
+ return -EINVAL;
+
+ cpuc->event_list[n] = event;
+ if (is_counter_pair(&event->hw)) {
+ cpuc->n_pair++;
+ cpuc->n_txn_pair++;
+ }
+
+ return 0;
+}
+
+/*
+ * dogrp: true if must collect siblings events (group)
+ * returns total number of events and error code
+ */
+static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
+{
+ struct perf_event *event;
+ int n, max_count;
+
+ max_count = x86_pmu_num_counters(cpuc->pmu) + x86_pmu_num_counters_fixed(cpuc->pmu);
+
+ /* current number of events already accepted */
+ n = cpuc->n_events;
+ if (!cpuc->n_events)
+ cpuc->pebs_output = 0;
+
+ if (!cpuc->is_fake && leader->attr.precise_ip) {
+ /*
+ * For PEBS->PT, if !aux_event, the group leader (PT) went
+ * away, the group was broken down and this singleton event
+ * can't schedule any more.
+ */
+ if (is_pebs_pt(leader) && !leader->aux_event)
+ return -EINVAL;
+
+ /*
+ * pebs_output: 0: no PEBS so far, 1: PT, 2: DS
+ */
+ if (cpuc->pebs_output &&
+ cpuc->pebs_output != is_pebs_pt(leader) + 1)
+ return -EINVAL;
+
+ cpuc->pebs_output = is_pebs_pt(leader) + 1;
+ }
+
+ if (is_x86_event(leader)) {
+ if (collect_event(cpuc, leader, max_count, n))
+ return -EINVAL;
+ n++;
+ }
+
+ if (!dogrp)
+ return n;
+
+ for_each_sibling_event(event, leader) {
+ if (!is_x86_event(event) || event->state <= PERF_EVENT_STATE_OFF)
+ continue;
+
+ if (collect_event(cpuc, event, max_count, n))
+ return -EINVAL;
+
+ n++;
+ }
+ return n;
+}
+
+static inline void x86_assign_hw_event(struct perf_event *event,
+ struct cpu_hw_events *cpuc, int i)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ int idx;
+
+ idx = hwc->idx = cpuc->assign[i];
+ hwc->last_cpu = smp_processor_id();
+ hwc->last_tag = ++cpuc->tags[i];
+
+ static_call_cond(x86_pmu_assign)(event, idx);
+
+ switch (hwc->idx) {
+ case INTEL_PMC_IDX_FIXED_BTS:
+ case INTEL_PMC_IDX_FIXED_VLBR:
+ hwc->config_base = 0;
+ hwc->event_base = 0;
+ break;
+
+ case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END:
+ /* All the metric events are mapped onto the fixed counter 3. */
+ idx = INTEL_PMC_IDX_FIXED_SLOTS;
+ fallthrough;
+ case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS-1:
+ hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
+ hwc->event_base = x86_pmu_fixed_ctr_addr(idx - INTEL_PMC_IDX_FIXED);
+ hwc->event_base_rdpmc = (idx - INTEL_PMC_IDX_FIXED) |
+ INTEL_PMC_FIXED_RDPMC_BASE;
+ break;
+
+ default:
+ hwc->config_base = x86_pmu_config_addr(hwc->idx);
+ hwc->event_base = x86_pmu_event_addr(hwc->idx);
+ hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx);
+ break;
+ }
+}
+
+/**
+ * x86_perf_rdpmc_index - Return PMC counter used for event
+ * @event: the perf_event to which the PMC counter was assigned
+ *
+ * The counter assigned to this performance event may change if interrupts
+ * are enabled. This counter should thus never be used while interrupts are
+ * enabled. Before this function is used to obtain the assigned counter the
+ * event should be checked for validity using, for example,
+ * perf_event_read_local(), within the same interrupt disabled section in
+ * which this counter is planned to be used.
+ *
+ * Return: The index of the performance monitoring counter assigned to
+ * @perf_event.
+ */
+int x86_perf_rdpmc_index(struct perf_event *event)
+{
+ lockdep_assert_irqs_disabled();
+
+ return event->hw.event_base_rdpmc;
+}
+
+static inline int match_prev_assignment(struct hw_perf_event *hwc,
+ struct cpu_hw_events *cpuc,
+ int i)
+{
+ return hwc->idx == cpuc->assign[i] &&
+ hwc->last_cpu == smp_processor_id() &&
+ hwc->last_tag == cpuc->tags[i];
+}
+
+static void x86_pmu_start(struct perf_event *event, int flags);
+
+static void x86_pmu_enable(struct pmu *pmu)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct perf_event *event;
+ struct hw_perf_event *hwc;
+ int i, added = cpuc->n_added;
+
+ if (!x86_pmu_initialized())
+ return;
+
+ if (cpuc->enabled)
+ return;
+
+ if (cpuc->n_added) {
+ int n_running = cpuc->n_events - cpuc->n_added;
+
+ /*
+		 * The late setup (after counters are scheduled) is
+		 * required for some cases, e.g. PEBS counter snapshotting,
+		 * because an accurate counter index is needed.
+ */
+ static_call_cond(x86_pmu_late_setup)();
+
+ /*
+ * apply assignment obtained either from
+ * hw_perf_group_sched_in() or x86_pmu_enable()
+ *
+ * step1: save events moving to new counters
+ */
+ for (i = 0; i < n_running; i++) {
+ event = cpuc->event_list[i];
+ hwc = &event->hw;
+
+ /*
+ * we can avoid reprogramming counter if:
+ * - assigned same counter as last time
+ * - running on same CPU as last time
+ * - no other event has used the counter since
+ */
+ if (hwc->idx == -1 ||
+ match_prev_assignment(hwc, cpuc, i))
+ continue;
+
+ /*
+ * Ensure we don't accidentally enable a stopped
+ * counter simply because we rescheduled.
+ */
+ if (hwc->state & PERF_HES_STOPPED)
+ hwc->state |= PERF_HES_ARCH;
+
+ x86_pmu_stop(event, PERF_EF_UPDATE);
+ cpuc->events[hwc->idx] = NULL;
+ }
+
+ /*
+ * step2: reprogram moved events into new counters
+ */
+ for (i = 0; i < cpuc->n_events; i++) {
+ event = cpuc->event_list[i];
+ hwc = &event->hw;
+
+ if (!match_prev_assignment(hwc, cpuc, i))
+ x86_assign_hw_event(event, cpuc, i);
+ else if (i < n_running)
+ continue;
+
+ if (hwc->state & PERF_HES_ARCH)
+ continue;
+
+ /*
+ * if cpuc->enabled = 0, then no wrmsr as
+ * per x86_pmu_enable_event()
+ */
+ cpuc->events[hwc->idx] = event;
+ x86_pmu_start(event, PERF_EF_RELOAD);
+ }
+ cpuc->n_added = 0;
+ perf_events_lapic_init();
+ }
+
+ cpuc->enabled = 1;
+ barrier();
+
+ static_call(x86_pmu_enable_all)(added);
+}
+
+DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
+
+/*
+ * Set the next IRQ period, based on the hwc->period_left value.
+ * To be called with the event disabled in hw:
+ */
+int x86_perf_event_set_period(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ s64 left = local64_read(&hwc->period_left);
+ s64 period = hwc->sample_period;
+ int ret = 0, idx = hwc->idx;
+
+ if (unlikely(!hwc->event_base))
+ return 0;
+
+ /*
+ * If we are way outside a reasonable range then just skip forward:
+ */
+ if (unlikely(left <= -period)) {
+ left = period;
+ local64_set(&hwc->period_left, left);
+ hwc->last_period = period;
+ ret = 1;
+ }
+
+ if (unlikely(left <= 0)) {
+ left += period;
+ local64_set(&hwc->period_left, left);
+ hwc->last_period = period;
+ ret = 1;
+ }
+ /*
+	 * Quirk: certain CPUs don't like it if just 1 hw_event is left:
+ */
+ if (unlikely(left < 2))
+ left = 2;
+
+ if (left > x86_pmu.max_period)
+ left = x86_pmu.max_period;
+
+ static_call_cond(x86_pmu_limit_period)(event, &left);
+
+ this_cpu_write(pmc_prev_left[idx], left);
+
+ /*
+ * The hw event starts counting from this event offset,
+	 * mark it to be able to extract future deltas:
+ */
+ local64_set(&hwc->prev_count, (u64)-left);
+
+ wrmsrq(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
+
+ /*
+ * Sign extend the Merge event counter's upper 16 bits since
+ * we currently declare a 48-bit counter width
+ */
+ if (is_counter_pair(hwc))
+ wrmsrq(x86_pmu_event_addr(idx + 1), 0xffff);
+
+ perf_event_update_userpage(event);
+
+ return ret;
+}
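
Programming the counter with (u64)-left, masked to the counter width, means the hardware reaches its overflow point after exactly 'left' increments, which is how the requested sample period turns into a PMI. A small arithmetic check assuming a 48-bit counter width:

#include <assert.h>
#include <stdint.h>

#define EX_CNTVAL_BITS	48
#define EX_CNTVAL_MASK	((1ULL << EX_CNTVAL_BITS) - 1)

int main(void)
{
	uint64_t left = 100000;		/* example sample period */
	uint64_t start = (uint64_t)-left & EX_CNTVAL_MASK;

	/* After 'left' increments the counter wraps past its top bit,
	 * i.e. it crosses the point that raises the overflow interrupt. */
	assert(((start + left) & EX_CNTVAL_MASK) == 0);
	return 0;
}
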
+
+void x86_pmu_enable_event(struct perf_event *event)
+{
+ if (__this_cpu_read(cpu_hw_events.enabled))
+ __x86_pmu_enable_event(&event->hw,
+ ARCH_PERFMON_EVENTSEL_ENABLE);
+}
+
+/*
+ * Add a single event to the PMU.
+ *
+ * The event is added to the group of enabled events
+ * but only if it can be scheduled with existing events.
+ */
+static int x86_pmu_add(struct perf_event *event, int flags)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct hw_perf_event *hwc;
+ int assign[X86_PMC_IDX_MAX];
+ int n, n0, ret;
+
+ hwc = &event->hw;
+
+ n0 = cpuc->n_events;
+ ret = n = collect_events(cpuc, event, false);
+ if (ret < 0)
+ goto out;
+
+ hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+ if (!(flags & PERF_EF_START))
+ hwc->state |= PERF_HES_ARCH;
+
+ /*
+ * If group events scheduling transaction was started,
+ * skip the schedulability test here, it will be performed
+ * at commit time (->commit_txn) as a whole.
+ *
+ * If commit fails, we'll call ->del() on all events
+ * for which ->add() was called.
+ */
+ if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
+ goto done_collect;
+
+ ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign);
+ if (ret)
+ goto out;
+	/*
+	 * Copy the new assignment now that we know it is possible;
+	 * it will be used by hw_perf_enable().
+	 */
+ memcpy(cpuc->assign, assign, n*sizeof(int));
+
+done_collect:
+ /*
+ * Commit the collect_events() state. See x86_pmu_del() and
+ * x86_pmu_*_txn().
+ */
+ cpuc->n_events = n;
+ cpuc->n_added += n - n0;
+ cpuc->n_txn += n - n0;
+
+ /*
+ * This is before x86_pmu_enable() will call x86_pmu_start(),
+ * so we enable LBRs before an event needs them etc..
+ */
+ static_call_cond(x86_pmu_add)(event);
+
+ ret = 0;
+out:
+ return ret;
+}
+
+static void x86_pmu_start(struct perf_event *event, int flags)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int idx = event->hw.idx;
+
+ if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+ return;
+
+ if (WARN_ON_ONCE(idx == -1))
+ return;
+
+ if (flags & PERF_EF_RELOAD) {
+ WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
+ static_call(x86_pmu_set_period)(event);
+ }
+
+ event->hw.state = 0;
+
+ __set_bit(idx, cpuc->active_mask);
+ static_call(x86_pmu_enable)(event);
+ perf_event_update_userpage(event);
+}
+
+void perf_event_print_debug(void)
+{
+ u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
+ unsigned long *cntr_mask, *fixed_cntr_mask;
+ struct event_constraint *pebs_constraints;
+ struct cpu_hw_events *cpuc;
+ u64 pebs, debugctl;
+ int cpu, idx;
+
+ guard(irqsave)();
+
+ cpu = smp_processor_id();
+ cpuc = &per_cpu(cpu_hw_events, cpu);
+ cntr_mask = hybrid(cpuc->pmu, cntr_mask);
+ fixed_cntr_mask = hybrid(cpuc->pmu, fixed_cntr_mask);
+ pebs_constraints = hybrid(cpuc->pmu, pebs_constraints);
+
+ if (!*(u64 *)cntr_mask)
+ return;
+
+ if (x86_pmu.version >= 2) {
+ rdmsrq(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
+ rdmsrq(MSR_CORE_PERF_GLOBAL_STATUS, status);
+ rdmsrq(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
+ rdmsrq(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
+
+ pr_info("\n");
+ pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
+ pr_info("CPU#%d: status: %016llx\n", cpu, status);
+ pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
+ pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
+ if (pebs_constraints) {
+ rdmsrq(MSR_IA32_PEBS_ENABLE, pebs);
+ pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs);
+ }
+ if (x86_pmu.lbr_nr) {
+ rdmsrq(MSR_IA32_DEBUGCTLMSR, debugctl);
+ pr_info("CPU#%d: debugctl: %016llx\n", cpu, debugctl);
+ }
+ }
+ pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
+
+ for_each_set_bit(idx, cntr_mask, X86_PMC_IDX_MAX) {
+ rdmsrq(x86_pmu_config_addr(idx), pmc_ctrl);
+ rdmsrq(x86_pmu_event_addr(idx), pmc_count);
+
+ prev_left = per_cpu(pmc_prev_left[idx], cpu);
+
+ pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
+ cpu, idx, pmc_ctrl);
+ pr_info("CPU#%d: gen-PMC%d count: %016llx\n",
+ cpu, idx, pmc_count);
+ pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
+ cpu, idx, prev_left);
+ }
+ for_each_set_bit(idx, fixed_cntr_mask, X86_PMC_IDX_MAX) {
+ if (fixed_counter_disabled(idx, cpuc->pmu))
+ continue;
+ rdmsrq(x86_pmu_fixed_ctr_addr(idx), pmc_count);
+
+ pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
+ cpu, idx, pmc_count);
+ }
+}
+
+void x86_pmu_stop(struct perf_event *event, int flags)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (test_bit(hwc->idx, cpuc->active_mask)) {
+ static_call(x86_pmu_disable)(event);
+ __clear_bit(hwc->idx, cpuc->active_mask);
+ WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+ hwc->state |= PERF_HES_STOPPED;
+ }
+
+ if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+ /*
+		 * Drain the remaining delta count out of an event
+ * that we are disabling:
+ */
+ static_call(x86_pmu_update)(event);
+ hwc->state |= PERF_HES_UPTODATE;
+ }
+}
+
+static void x86_pmu_del(struct perf_event *event, int flags)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ union perf_capabilities intel_cap = hybrid(cpuc->pmu, intel_cap);
+ int i;
+
+ /*
+ * If we're called during a txn, we only need to undo x86_pmu.add.
+ * The events never got scheduled and ->cancel_txn will truncate
+ * the event_list.
+ *
+ * XXX assumes any ->del() called during a TXN will only be on
+ * an event added during that same TXN.
+ */
+ if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
+ goto do_del;
+
+ __set_bit(event->hw.idx, cpuc->dirty);
+
+ /*
+ * Not a TXN, therefore cleanup properly.
+ */
+ x86_pmu_stop(event, PERF_EF_UPDATE);
+ cpuc->events[event->hw.idx] = NULL;
+
+ for (i = 0; i < cpuc->n_events; i++) {
+ if (event == cpuc->event_list[i])
+ break;
+ }
+
+ if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */
+ return;
+
+ /* If we have a newly added event; make sure to decrease n_added. */
+ if (i >= cpuc->n_events - cpuc->n_added)
+ --cpuc->n_added;
+
+ static_call_cond(x86_pmu_put_event_constraints)(cpuc, event);
+
+ /* Delete the array entry. */
+ while (++i < cpuc->n_events) {
+ cpuc->event_list[i-1] = cpuc->event_list[i];
+ cpuc->event_constraint[i-1] = cpuc->event_constraint[i];
+ cpuc->assign[i-1] = cpuc->assign[i];
+ }
+ cpuc->event_constraint[i-1] = NULL;
+ --cpuc->n_events;
+ if (intel_cap.perf_metrics)
+ del_nr_metric_event(cpuc, event);
+
+ perf_event_update_userpage(event);
+
+do_del:
+
+ /*
+ * This is after x86_pmu_stop(); so we disable LBRs after any
+ * event can need them etc..
+ */
+ static_call_cond(x86_pmu_del)(event);
+}
+
+int x86_pmu_handle_irq(struct pt_regs *regs)
+{
+ struct perf_sample_data data;
+ struct cpu_hw_events *cpuc;
+ struct perf_event *event;
+ int idx, handled = 0;
+ u64 last_period;
+ u64 val;
+
+ cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ /*
+ * Some chipsets need to unmask the LVTPC in a particular spot
+ * inside the nmi handler. As a result, the unmasking was pushed
+ * into all the nmi handlers.
+ *
+	 * This generic handler doesn't seem to have any issues where the
+	 * unmasking occurs, so it was left at the top.
+ */
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+
+ for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+
+ event = cpuc->events[idx];
+ last_period = event->hw.last_period;
+
+ val = static_call(x86_pmu_update)(event);
+ if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
+ continue;
+
+ /*
+ * event overflow
+ */
+ handled++;
+
+ if (!static_call(x86_pmu_set_period)(event))
+ continue;
+
+ perf_sample_data_init(&data, 0, last_period);
+
+ perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL);
+
+ perf_event_overflow(event, &data, regs);
+ }
+
+ if (handled)
+ inc_irq_stat(apic_perf_irqs);
+
+ return handled;
+}
+
+void perf_events_lapic_init(void)
+{
+ if (!x86_pmu.apic || !x86_pmu_initialized())
+ return;
+
+ /*
+ * Always use NMI for PMU
+ */
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+}
+
+static int
+perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
+{
+ u64 start_clock;
+ u64 finish_clock;
+ int ret;
+
+ /*
+ * All PMUs/events that share this PMI handler should make sure to
+ * increment active_events for their events.
+ */
+ if (!atomic_read(&active_events))
+ return NMI_DONE;
+
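+	/*
+	 * Time the handler so that the perf core can throttle the
+	 * sampling rate if PMIs start eating too much CPU time; see
+	 * perf_sample_event_took() and the perf_cpu_time_max_percent
+	 * sysctl.
+	 */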
+ start_clock = sched_clock();
+ ret = static_call(x86_pmu_handle_irq)(regs);
+ finish_clock = sched_clock();
+
+ perf_sample_event_took(finish_clock - start_clock);
+
+ return ret;
+}
+NOKPROBE_SYMBOL(perf_event_nmi_handler);
+
+struct event_constraint emptyconstraint;
+struct event_constraint unconstrained;
+
+static int x86_pmu_prepare_cpu(unsigned int cpu)
+{
+ struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+ int i;
+
+ for (i = 0 ; i < X86_PERF_KFREE_MAX; i++)
+ cpuc->kfree_on_online[i] = NULL;
+ if (x86_pmu.cpu_prepare)
+ return x86_pmu.cpu_prepare(cpu);
+ return 0;
+}
+
+static int x86_pmu_dead_cpu(unsigned int cpu)
+{
+ if (x86_pmu.cpu_dead)
+ x86_pmu.cpu_dead(cpu);
+ return 0;
+}
+
+static int x86_pmu_online_cpu(unsigned int cpu)
+{
+ struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+ int i;
+
+ for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) {
+ kfree(cpuc->kfree_on_online[i]);
+ cpuc->kfree_on_online[i] = NULL;
+ }
+ return 0;
+}
+
+static int x86_pmu_starting_cpu(unsigned int cpu)
+{
+ if (x86_pmu.cpu_starting)
+ x86_pmu.cpu_starting(cpu);
+ return 0;
+}
+
+static int x86_pmu_dying_cpu(unsigned int cpu)
+{
+ if (x86_pmu.cpu_dying)
+ x86_pmu.cpu_dying(cpu);
+ return 0;
+}
+
+static void __init pmu_check_apic(void)
+{
+ if (boot_cpu_has(X86_FEATURE_APIC))
+ return;
+
+ x86_pmu.apic = 0;
+ pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
+ pr_info("no hardware sampling interrupt available.\n");
+
+ /*
+ * If we have a PMU initialized but no APIC
+ * interrupts, we cannot sample hardware
+ * events (user-space has to fall back and
+ * sample via a hrtimer based software event):
+ */
+ pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
+
+}
+
+static struct attribute_group x86_pmu_format_group __ro_after_init = {
+ .name = "format",
+ .attrs = NULL,
+};
+
+ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page)
+{
+ struct perf_pmu_events_attr *pmu_attr =
+ container_of(attr, struct perf_pmu_events_attr, attr);
+ u64 config = 0;
+
+ if (pmu_attr->id < x86_pmu.max_events)
+ config = x86_pmu.event_map(pmu_attr->id);
+
+ /* string trumps id */
+ if (pmu_attr->event_str)
+ return sprintf(page, "%s\n", pmu_attr->event_str);
+
+ return x86_pmu.events_sysfs_show(page, config);
+}
+EXPORT_SYMBOL_GPL(events_sysfs_show);
+
+ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr,
+ char *page)
+{
+ struct perf_pmu_events_ht_attr *pmu_attr =
+ container_of(attr, struct perf_pmu_events_ht_attr, attr);
+
+ /*
+ * Report conditional events depending on Hyper-Threading.
+ *
+ * This is overly conservative as usually the HT special
+ * handling is not needed if the other CPU thread is idle.
+ *
+ * Note this does not (and cannot) handle the case when thread
+ * siblings are invisible, for example with virtualization
+ * if they are owned by some other guest. The user tool
+ * has to re-read when a thread sibling gets onlined later.
+ */
+ return sprintf(page, "%s",
+ topology_max_smt_threads() > 1 ?
+ pmu_attr->event_str_ht :
+ pmu_attr->event_str_noht);
+}
+
+ssize_t events_hybrid_sysfs_show(struct device *dev,
+ struct device_attribute *attr,
+ char *page)
+{
+ struct perf_pmu_events_hybrid_attr *pmu_attr =
+ container_of(attr, struct perf_pmu_events_hybrid_attr, attr);
+ struct x86_hybrid_pmu *pmu;
+ const char *str, *next_str;
+ int i;
+
+ if (hweight64(pmu_attr->pmu_type) == 1)
+ return sprintf(page, "%s", pmu_attr->event_str);
+
+ /*
+ * Hybrid PMUs may support the same event name, but with different
+ * event encoding, e.g., the mem-loads event on an Atom PMU has
+ * different event encoding from a Core PMU.
+ *
+ * The event_str includes all event encodings. Each event encoding
+ * is divided by ";". The order of the event encodings must follow
+ * the order of the hybrid PMU index.
+ */
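+	/*
+	 * Illustrative example (hypothetical encodings): on a two-PMU
+	 * system the event_str could look like
+	 * "event=0xd0,umask=0x5;event=0xcd,umask=0x1".
+	 */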
+ pmu = container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu);
+
+ str = pmu_attr->event_str;
+ for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
+ if (!(x86_pmu.hybrid_pmu[i].pmu_type & pmu_attr->pmu_type))
+ continue;
+ if (x86_pmu.hybrid_pmu[i].pmu_type & pmu->pmu_type) {
+ next_str = strchr(str, ';');
+ if (next_str)
+ return snprintf(page, next_str - str + 1, "%s", str);
+ else
+ return sprintf(page, "%s", str);
+ }
+ str = strchr(str, ';');
+ str++;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(events_hybrid_sysfs_show);
+
+EVENT_ATTR(cpu-cycles, CPU_CYCLES );
+EVENT_ATTR(instructions, INSTRUCTIONS );
+EVENT_ATTR(cache-references, CACHE_REFERENCES );
+EVENT_ATTR(cache-misses, CACHE_MISSES );
+EVENT_ATTR(branch-instructions, BRANCH_INSTRUCTIONS );
+EVENT_ATTR(branch-misses, BRANCH_MISSES );
+EVENT_ATTR(bus-cycles, BUS_CYCLES );
+EVENT_ATTR(stalled-cycles-frontend, STALLED_CYCLES_FRONTEND );
+EVENT_ATTR(stalled-cycles-backend, STALLED_CYCLES_BACKEND );
+EVENT_ATTR(ref-cycles, REF_CPU_CYCLES );
+
+static struct attribute *empty_attrs;
+
+static struct attribute *events_attr[] = {
+ EVENT_PTR(CPU_CYCLES),
+ EVENT_PTR(INSTRUCTIONS),
+ EVENT_PTR(CACHE_REFERENCES),
+ EVENT_PTR(CACHE_MISSES),
+ EVENT_PTR(BRANCH_INSTRUCTIONS),
+ EVENT_PTR(BRANCH_MISSES),
+ EVENT_PTR(BUS_CYCLES),
+ EVENT_PTR(STALLED_CYCLES_FRONTEND),
+ EVENT_PTR(STALLED_CYCLES_BACKEND),
+ EVENT_PTR(REF_CPU_CYCLES),
+ NULL,
+};
+
+/*
+ * Filter all undefined events (x86_pmu.event_map(id) == 0)
+ * out of the events_attr attributes.
+ */
+static umode_t
+is_visible(struct kobject *kobj, struct attribute *attr, int idx)
+{
+ struct perf_pmu_events_attr *pmu_attr;
+
+ if (idx >= x86_pmu.max_events)
+ return 0;
+
+ pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr.attr);
+ /* str trumps id */
+ return pmu_attr->event_str || x86_pmu.event_map(idx) ? attr->mode : 0;
+}
+
+static struct attribute_group x86_pmu_events_group __ro_after_init = {
+ .name = "events",
+ .attrs = events_attr,
+ .is_visible = is_visible,
+};
+
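+/*
+ * Render a raw config in the sysfs event syntax. Illustrative example:
+ * event 0x3c with config 0x013c (umask 0x01) is rendered as
+ * "event=0x3c,umask=0x01".
+ */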
+ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event)
+{
+ u64 umask = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
+ u64 cmask = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24;
+ bool edge = (config & ARCH_PERFMON_EVENTSEL_EDGE);
+ bool pc = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL);
+ bool any = (config & ARCH_PERFMON_EVENTSEL_ANY);
+ bool inv = (config & ARCH_PERFMON_EVENTSEL_INV);
+ ssize_t ret;
+
+ /*
+	 * We have a whole page to spend and only a little data
+	 * to write, so we can safely use sprintf.
+ */
+ ret = sprintf(page, "event=0x%02llx", event);
+
+ if (umask)
+ ret += sprintf(page + ret, ",umask=0x%02llx", umask);
+
+ if (edge)
+ ret += sprintf(page + ret, ",edge");
+
+ if (pc)
+ ret += sprintf(page + ret, ",pc");
+
+ if (any)
+ ret += sprintf(page + ret, ",any");
+
+ if (inv)
+ ret += sprintf(page + ret, ",inv");
+
+ if (cmask)
+ ret += sprintf(page + ret, ",cmask=0x%02llx", cmask);
+
+ ret += sprintf(page + ret, "\n");
+
+ return ret;
+}
+
+static struct attribute_group x86_pmu_attr_group;
+static struct attribute_group x86_pmu_caps_group;
+
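+/*
+ * Patch the x86_pmu_* static calls with the vendor driver's callbacks,
+ * so that the hot paths avoid the cost of indirect calls.
+ */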
+static void x86_pmu_static_call_update(void)
+{
+ static_call_update(x86_pmu_handle_irq, x86_pmu.handle_irq);
+ static_call_update(x86_pmu_disable_all, x86_pmu.disable_all);
+ static_call_update(x86_pmu_enable_all, x86_pmu.enable_all);
+ static_call_update(x86_pmu_enable, x86_pmu.enable);
+ static_call_update(x86_pmu_disable, x86_pmu.disable);
+
+ static_call_update(x86_pmu_assign, x86_pmu.assign);
+
+ static_call_update(x86_pmu_add, x86_pmu.add);
+ static_call_update(x86_pmu_del, x86_pmu.del);
+ static_call_update(x86_pmu_read, x86_pmu.read);
+
+ static_call_update(x86_pmu_set_period, x86_pmu.set_period);
+ static_call_update(x86_pmu_update, x86_pmu.update);
+ static_call_update(x86_pmu_limit_period, x86_pmu.limit_period);
+
+ static_call_update(x86_pmu_schedule_events, x86_pmu.schedule_events);
+ static_call_update(x86_pmu_get_event_constraints, x86_pmu.get_event_constraints);
+ static_call_update(x86_pmu_put_event_constraints, x86_pmu.put_event_constraints);
+
+ static_call_update(x86_pmu_start_scheduling, x86_pmu.start_scheduling);
+ static_call_update(x86_pmu_commit_scheduling, x86_pmu.commit_scheduling);
+ static_call_update(x86_pmu_stop_scheduling, x86_pmu.stop_scheduling);
+
+ static_call_update(x86_pmu_sched_task, x86_pmu.sched_task);
+
+ static_call_update(x86_pmu_drain_pebs, x86_pmu.drain_pebs);
+ static_call_update(x86_pmu_pebs_aliases, x86_pmu.pebs_aliases);
+
+ static_call_update(x86_pmu_guest_get_msrs, x86_pmu.guest_get_msrs);
+ static_call_update(x86_pmu_filter, x86_pmu.filter);
+
+ static_call_update(x86_pmu_late_setup, x86_pmu.late_setup);
+
+ static_call_update(x86_pmu_pebs_enable, x86_pmu.pebs_enable);
+ static_call_update(x86_pmu_pebs_disable, x86_pmu.pebs_disable);
+ static_call_update(x86_pmu_pebs_enable_all, x86_pmu.pebs_enable_all);
+ static_call_update(x86_pmu_pebs_disable_all, x86_pmu.pebs_disable_all);
+}
+
+static void _x86_pmu_read(struct perf_event *event)
+{
+ static_call(x86_pmu_update)(event);
+}
+
+void x86_pmu_show_pmu_cap(struct pmu *pmu)
+{
+ pr_info("... version: %d\n", x86_pmu.version);
+ pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
+ pr_info("... generic counters: %d\n", x86_pmu_num_counters(pmu));
+ pr_info("... generic bitmap: %016llx\n", hybrid(pmu, cntr_mask64));
+ pr_info("... fixed-purpose counters: %d\n", x86_pmu_num_counters_fixed(pmu));
+ pr_info("... fixed-purpose bitmap: %016llx\n", hybrid(pmu, fixed_cntr_mask64));
+ pr_info("... value mask: %016llx\n", x86_pmu.cntval_mask);
+ pr_info("... max period: %016llx\n", x86_pmu.max_period);
+ pr_info("... global_ctrl mask: %016llx\n", hybrid(pmu, intel_ctrl));
+}
+
+static int __init init_hw_perf_events(void)
+{
+ struct x86_pmu_quirk *quirk;
+ int err;
+
+ pr_info("Performance Events: ");
+
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_INTEL:
+ err = intel_pmu_init();
+ break;
+ case X86_VENDOR_AMD:
+ err = amd_pmu_init();
+ break;
+ case X86_VENDOR_HYGON:
+ err = amd_pmu_init();
+ x86_pmu.name = "HYGON";
+ break;
+ case X86_VENDOR_ZHAOXIN:
+ case X86_VENDOR_CENTAUR:
+ err = zhaoxin_pmu_init();
+ break;
+ default:
+ err = -ENOTSUPP;
+ }
+ if (err != 0) {
+ pr_cont("no PMU driver, software events only.\n");
+ err = 0;
+ goto out_bad_pmu;
+ }
+
+ pmu_check_apic();
+
+ /* sanity check that the hardware exists or is emulated */
+ if (!check_hw_exists(&pmu, x86_pmu.cntr_mask, x86_pmu.fixed_cntr_mask))
+ goto out_bad_pmu;
+
+ pr_cont("%s PMU driver.\n", x86_pmu.name);
+
+ x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */
+
+ for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)
+ quirk->func();
+
+ if (!x86_pmu.intel_ctrl)
+ x86_pmu.intel_ctrl = x86_pmu.cntr_mask64;
+
+ if (!x86_pmu.config_mask)
+ x86_pmu.config_mask = X86_RAW_EVENT_MASK;
+
+ perf_events_lapic_init();
+ register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI");
+
+ unconstrained = (struct event_constraint)
+ __EVENT_CONSTRAINT(0, x86_pmu.cntr_mask64,
+ 0, x86_pmu_num_counters(NULL), 0, 0);
+
+ x86_pmu_format_group.attrs = x86_pmu.format_attrs;
+
+ if (!x86_pmu.events_sysfs_show)
+ x86_pmu_events_group.attrs = &empty_attrs;
+
+ pmu.attr_update = x86_pmu.attr_update;
+
+ if (!is_hybrid())
+ x86_pmu_show_pmu_cap(NULL);
+
+ if (!x86_pmu.read)
+ x86_pmu.read = _x86_pmu_read;
+
+ if (!x86_pmu.guest_get_msrs)
+ x86_pmu.guest_get_msrs = (void *)&__static_call_return0;
+
+ if (!x86_pmu.set_period)
+ x86_pmu.set_period = x86_perf_event_set_period;
+
+ if (!x86_pmu.update)
+ x86_pmu.update = x86_perf_event_update;
+
+ x86_pmu_static_call_update();
+
+ /*
+ * Install callbacks. Core will call them for each online
+ * cpu.
+ */
+ err = cpuhp_setup_state(CPUHP_PERF_X86_PREPARE, "perf/x86:prepare",
+ x86_pmu_prepare_cpu, x86_pmu_dead_cpu);
+ if (err)
+ return err;
+
+ err = cpuhp_setup_state(CPUHP_AP_PERF_X86_STARTING,
+ "perf/x86:starting", x86_pmu_starting_cpu,
+ x86_pmu_dying_cpu);
+ if (err)
+ goto out;
+
+ err = cpuhp_setup_state(CPUHP_AP_PERF_X86_ONLINE, "perf/x86:online",
+ x86_pmu_online_cpu, NULL);
+ if (err)
+ goto out1;
+
+ if (!is_hybrid()) {
+ err = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
+ if (err)
+ goto out2;
+ } else {
+ struct x86_hybrid_pmu *hybrid_pmu;
+ int i, j;
+
+ for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
+ hybrid_pmu = &x86_pmu.hybrid_pmu[i];
+
+ hybrid_pmu->pmu = pmu;
+ hybrid_pmu->pmu.type = -1;
+ hybrid_pmu->pmu.attr_update = x86_pmu.attr_update;
+ hybrid_pmu->pmu.capabilities |= PERF_PMU_CAP_EXTENDED_HW_TYPE;
+
+ err = perf_pmu_register(&hybrid_pmu->pmu, hybrid_pmu->name,
+ (hybrid_pmu->pmu_type == hybrid_big) ? PERF_TYPE_RAW : -1);
+ if (err)
+ break;
+ }
+
+ if (i < x86_pmu.num_hybrid_pmus) {
+ for (j = 0; j < i; j++)
+ perf_pmu_unregister(&x86_pmu.hybrid_pmu[j].pmu);
+ pr_warn("Failed to register hybrid PMUs\n");
+ kfree(x86_pmu.hybrid_pmu);
+ x86_pmu.hybrid_pmu = NULL;
+ x86_pmu.num_hybrid_pmus = 0;
+ goto out2;
+ }
+ }
+
+ return 0;
+
+out2:
+ cpuhp_remove_state(CPUHP_AP_PERF_X86_ONLINE);
+out1:
+ cpuhp_remove_state(CPUHP_AP_PERF_X86_STARTING);
+out:
+ cpuhp_remove_state(CPUHP_PERF_X86_PREPARE);
+out_bad_pmu:
+ memset(&x86_pmu, 0, sizeof(x86_pmu));
+ return err;
+}
+early_initcall(init_hw_perf_events);
+
+static void x86_pmu_read(struct perf_event *event)
+{
+ static_call(x86_pmu_read)(event);
+}
+
+/*
+ * Start group events scheduling transaction
+ * Set the flag so that pmu::enable() does not perform the
+ * schedulability test; it will be performed at commit time.
+ *
+ * We only support PERF_PMU_TXN_ADD transactions. Save the
+ * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD
+ * transactions.
+ */
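+/*
+ * Typical flow (illustrative): start_txn(PERF_PMU_TXN_ADD), then ->add()
+ * for each group member, then commit_txn(); on failure the core calls
+ * cancel_txn() instead.
+ */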
+static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ WARN_ON_ONCE(cpuc->txn_flags); /* txn already in flight */
+
+ cpuc->txn_flags = txn_flags;
+ if (txn_flags & ~PERF_PMU_TXN_ADD)
+ return;
+
+ perf_pmu_disable(pmu);
+ __this_cpu_write(cpu_hw_events.n_txn, 0);
+ __this_cpu_write(cpu_hw_events.n_txn_pair, 0);
+ __this_cpu_write(cpu_hw_events.n_txn_metric, 0);
+}
+
+/*
+ * Stop group events scheduling transaction
+ * Clear the flag so that pmu::enable() will perform the
+ * schedulability test again.
+ */
+static void x86_pmu_cancel_txn(struct pmu *pmu)
+{
+ unsigned int txn_flags;
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */
+
+ txn_flags = cpuc->txn_flags;
+ cpuc->txn_flags = 0;
+ if (txn_flags & ~PERF_PMU_TXN_ADD)
+ return;
+
+ /*
+ * Truncate collected array by the number of events added in this
+ * transaction. See x86_pmu_add() and x86_pmu_*_txn().
+ */
+ __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
+ __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
+ __this_cpu_sub(cpu_hw_events.n_pair, __this_cpu_read(cpu_hw_events.n_txn_pair));
+ __this_cpu_sub(cpu_hw_events.n_metric, __this_cpu_read(cpu_hw_events.n_txn_metric));
+ perf_pmu_enable(pmu);
+}
+
+/*
+ * Commit group events scheduling transaction
+ * Perform the group schedulability test as a whole
+ * Return 0 on success.
+ *
+ * Does not cancel the transaction on failure; expects the caller to do this.
+ */
+static int x86_pmu_commit_txn(struct pmu *pmu)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int assign[X86_PMC_IDX_MAX];
+ int n, ret;
+
+ WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */
+
+ if (cpuc->txn_flags & ~PERF_PMU_TXN_ADD) {
+ cpuc->txn_flags = 0;
+ return 0;
+ }
+
+ n = cpuc->n_events;
+
+ if (!x86_pmu_initialized())
+ return -EAGAIN;
+
+ ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign);
+ if (ret)
+ return ret;
+
+ /*
+	 * Copy the new assignment now that we know it is possible;
+	 * it will be used by hw_perf_enable().
+ */
+ memcpy(cpuc->assign, assign, n*sizeof(int));
+
+ cpuc->txn_flags = 0;
+ perf_pmu_enable(pmu);
+ return 0;
+}
+/*
+ * A fake_cpuc is used to validate event groups. Due to
+ * the extra reg logic, we need to also allocate a fake
+ * per_core and per_cpu structure. Otherwise, group events
+ * using extra reg may conflict without the kernel being
+ * able to catch this when the last event gets added to
+ * the group.
+ */
+static void free_fake_cpuc(struct cpu_hw_events *cpuc)
+{
+ intel_cpuc_finish(cpuc);
+ kfree(cpuc);
+}
+
+static struct cpu_hw_events *allocate_fake_cpuc(struct pmu *event_pmu)
+{
+ struct cpu_hw_events *cpuc;
+ int cpu;
+
+ cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
+ if (!cpuc)
+ return ERR_PTR(-ENOMEM);
+ cpuc->is_fake = 1;
+
+ if (is_hybrid()) {
+ struct x86_hybrid_pmu *h_pmu;
+
+ h_pmu = hybrid_pmu(event_pmu);
+ if (cpumask_empty(&h_pmu->supported_cpus))
+ goto error;
+ cpu = cpumask_first(&h_pmu->supported_cpus);
+ } else
+ cpu = raw_smp_processor_id();
+ cpuc->pmu = event_pmu;
+
+ if (intel_cpuc_prepare(cpuc, cpu))
+ goto error;
+
+ return cpuc;
+error:
+ free_fake_cpuc(cpuc);
+ return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * validate that we can schedule this event
+ */
+static int validate_event(struct perf_event *event)
+{
+ struct cpu_hw_events *fake_cpuc;
+ struct event_constraint *c;
+ int ret = 0;
+
+ fake_cpuc = allocate_fake_cpuc(event->pmu);
+ if (IS_ERR(fake_cpuc))
+ return PTR_ERR(fake_cpuc);
+
+ c = x86_pmu.get_event_constraints(fake_cpuc, 0, event);
+
+ if (!c || !c->weight)
+ ret = -EINVAL;
+
+ if (x86_pmu.put_event_constraints)
+ x86_pmu.put_event_constraints(fake_cpuc, event);
+
+ free_fake_cpuc(fake_cpuc);
+
+ return ret;
+}
+
+/*
+ * validate a single event group
+ *
+ * validation includes:
+ * - check events are compatible with each other
+ * - events do not compete for the same counter
+ * - number of events <= number of counters
+ *
+ * validation ensures the group can be loaded onto the
+ * PMU if it was the only group available.
+ */
+static int validate_group(struct perf_event *event)
+{
+ struct perf_event *leader = event->group_leader;
+ struct cpu_hw_events *fake_cpuc;
+ int ret = -EINVAL, n;
+
+ /*
+ * Reject events from different hybrid PMUs.
+ */
+ if (is_hybrid()) {
+ struct perf_event *sibling;
+ struct pmu *pmu = NULL;
+
+ if (is_x86_event(leader))
+ pmu = leader->pmu;
+
+ for_each_sibling_event(sibling, leader) {
+ if (!is_x86_event(sibling))
+ continue;
+ if (!pmu)
+ pmu = sibling->pmu;
+ else if (pmu != sibling->pmu)
+ return ret;
+ }
+ }
+
+ fake_cpuc = allocate_fake_cpuc(event->pmu);
+ if (IS_ERR(fake_cpuc))
+ return PTR_ERR(fake_cpuc);
+ /*
+	 * The event is not yet connected with its
+	 * siblings, therefore we must first collect
+	 * existing siblings, then add the new event
+	 * before we can simulate the scheduling.
+ */
+ n = collect_events(fake_cpuc, leader, true);
+ if (n < 0)
+ goto out;
+
+ fake_cpuc->n_events = n;
+ n = collect_events(fake_cpuc, event, false);
+ if (n < 0)
+ goto out;
+
+ fake_cpuc->n_events = 0;
+ ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
+
+out:
+ free_fake_cpuc(fake_cpuc);
+ return ret;
+}
+
+static int x86_pmu_event_init(struct perf_event *event)
+{
+ struct x86_hybrid_pmu *pmu = NULL;
+ int err;
+
+ if ((event->attr.type != event->pmu->type) &&
+ (event->attr.type != PERF_TYPE_HARDWARE) &&
+ (event->attr.type != PERF_TYPE_HW_CACHE))
+ return -ENOENT;
+
+ if (is_hybrid() && (event->cpu != -1)) {
+ pmu = hybrid_pmu(event->pmu);
+ if (!cpumask_test_cpu(event->cpu, &pmu->supported_cpus))
+ return -ENOENT;
+ }
+
+ err = __x86_pmu_event_init(event);
+ if (!err) {
+ if (event->group_leader != event)
+ err = validate_group(event);
+ else
+ err = validate_event(event);
+ }
+ if (err) {
+ if (event->destroy)
+ event->destroy(event);
+ event->destroy = NULL;
+ }
+
+ if (READ_ONCE(x86_pmu.attr_rdpmc) &&
+ !(event->hw.flags & PERF_X86_EVENT_LARGE_PEBS))
+ event->hw.flags |= PERF_EVENT_FLAG_USER_READ_CNT;
+
+ return err;
+}
+
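+/*
+ * Clear counters that were left dirty by a previous RDPMC user, so that
+ * a task granted userspace RDPMC access cannot observe stale values from
+ * other contexts.
+ */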
+void perf_clear_dirty_counters(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int i;
+
+ /* Don't need to clear the assigned counter. */
+ for (i = 0; i < cpuc->n_events; i++)
+ __clear_bit(cpuc->assign[i], cpuc->dirty);
+
+ if (bitmap_empty(cpuc->dirty, X86_PMC_IDX_MAX))
+ return;
+
+ for_each_set_bit(i, cpuc->dirty, X86_PMC_IDX_MAX) {
+ if (i >= INTEL_PMC_IDX_FIXED) {
+ /* Metrics and fake events don't have corresponding HW counters. */
+ if (!test_bit(i - INTEL_PMC_IDX_FIXED, hybrid(cpuc->pmu, fixed_cntr_mask)))
+ continue;
+
+ wrmsrq(x86_pmu_fixed_ctr_addr(i - INTEL_PMC_IDX_FIXED), 0);
+ } else {
+ wrmsrq(x86_pmu_event_addr(i), 0);
+ }
+ }
+
+ bitmap_zero(cpuc->dirty, X86_PMC_IDX_MAX);
+}
+
+static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
+{
+ if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT))
+ return;
+
+ /*
+ * This function relies on not being called concurrently in two
+ * tasks in the same mm. Otherwise one task could observe
+ * perf_rdpmc_allowed > 1 and return all the way back to
+ * userspace with CR4.PCE clear while another task is still
+ * doing on_each_cpu_mask() to propagate CR4.PCE.
+ *
+ * For now, this can't happen because all callers hold mmap_lock
+ * for write. If this changes, we'll need a different solution.
+ */
+ mmap_assert_write_locked(mm);
+
+ if (atomic_inc_return(&mm->context.perf_rdpmc_allowed) == 1)
+ on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1);
+}
+
+static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm)
+{
+ if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT))
+ return;
+
+ if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed))
+ on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1);
+}
+
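+/*
+ * Index reported to userspace via the mmapped perf page. Illustrative
+ * usage: a non-zero index means the count can be read directly with
+ * rdpmc(index - 1), per the perf_event_mmap_page documentation.
+ */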
+static int x86_pmu_event_idx(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (!(hwc->flags & PERF_EVENT_FLAG_USER_READ_CNT))
+ return 0;
+
+ if (is_metric_idx(hwc->idx))
+ return INTEL_PMC_FIXED_RDPMC_METRICS + 1;
+ else
+ return hwc->event_base_rdpmc + 1;
+}
+
+static ssize_t get_attr_rdpmc(struct device *cdev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc);
+}
+
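+/*
+ * rdpmc modes: 0 == never allow userspace RDPMC, 1 == allow it only for
+ * tasks with an active mmapped perf event (the default), 2 == always
+ * allow it, bypassing perf.
+ */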
+static ssize_t set_attr_rdpmc(struct device *cdev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ static DEFINE_MUTEX(rdpmc_mutex);
+ unsigned long val;
+ ssize_t ret;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val > 2)
+ return -EINVAL;
+
+ if (x86_pmu.attr_rdpmc_broken)
+ return -ENOTSUPP;
+
+ guard(mutex)(&rdpmc_mutex);
+
+ if (val != x86_pmu.attr_rdpmc) {
+ /*
+ * Changing into or out of never available or always available,
+ * aka perf-event-bypassing mode. This path is extremely slow,
+ * but only root can trigger it, so it's okay.
+ */
+ if (val == 0)
+ static_branch_inc(&rdpmc_never_available_key);
+ else if (x86_pmu.attr_rdpmc == 0)
+ static_branch_dec(&rdpmc_never_available_key);
+
+ if (val == 2)
+ static_branch_inc(&rdpmc_always_available_key);
+ else if (x86_pmu.attr_rdpmc == 2)
+ static_branch_dec(&rdpmc_always_available_key);
+
+ on_each_cpu(cr4_update_pce, NULL, 1);
+ x86_pmu.attr_rdpmc = val;
+ }
+
+ return count;
+}
+
+static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);
+
+static struct attribute *x86_pmu_attrs[] = {
+ &dev_attr_rdpmc.attr,
+ NULL,
+};
+
+static struct attribute_group x86_pmu_attr_group __ro_after_init = {
+ .attrs = x86_pmu_attrs,
+};
+
+static ssize_t max_precise_show(struct device *cdev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct pmu *pmu = dev_get_drvdata(cdev);
+
+ return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise(pmu));
+}
+
+static DEVICE_ATTR_RO(max_precise);
+
+static struct attribute *x86_pmu_caps_attrs[] = {
+ &dev_attr_max_precise.attr,
+ NULL
+};
+
+static struct attribute_group x86_pmu_caps_group __ro_after_init = {
+ .name = "caps",
+ .attrs = x86_pmu_caps_attrs,
+};
+
+static const struct attribute_group *x86_pmu_attr_groups[] = {
+ &x86_pmu_attr_group,
+ &x86_pmu_format_group,
+ &x86_pmu_events_group,
+ &x86_pmu_caps_group,
+ NULL,
+};
+
+static void x86_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx,
+ struct task_struct *task, bool sched_in)
+{
+ static_call_cond(x86_pmu_sched_task)(pmu_ctx, task, sched_in);
+}
+
+void perf_check_microcode(void)
+{
+ if (x86_pmu.check_microcode)
+ x86_pmu.check_microcode();
+}
+
+static int x86_pmu_check_period(struct perf_event *event, u64 value)
+{
+ if (x86_pmu.check_period && x86_pmu.check_period(event, value))
+ return -EINVAL;
+
+ if (value && x86_pmu.limit_period) {
+ s64 left = value;
+ x86_pmu.limit_period(event, &left);
+ if (left > value)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int x86_pmu_aux_output_match(struct perf_event *event)
+{
+ if (!(pmu.capabilities & PERF_PMU_CAP_AUX_OUTPUT))
+ return 0;
+
+ if (x86_pmu.aux_output_match)
+ return x86_pmu.aux_output_match(event);
+
+ return 0;
+}
+
+static bool x86_pmu_filter(struct pmu *pmu, int cpu)
+{
+ bool ret = false;
+
+ static_call_cond(x86_pmu_filter)(pmu, cpu, &ret);
+
+ return ret;
+}
+
+static struct pmu pmu = {
+ .pmu_enable = x86_pmu_enable,
+ .pmu_disable = x86_pmu_disable,
+
+ .attr_groups = x86_pmu_attr_groups,
+
+ .event_init = x86_pmu_event_init,
+
+ .event_mapped = x86_pmu_event_mapped,
+ .event_unmapped = x86_pmu_event_unmapped,
+
+ .add = x86_pmu_add,
+ .del = x86_pmu_del,
+ .start = x86_pmu_start,
+ .stop = x86_pmu_stop,
+ .read = x86_pmu_read,
+
+ .start_txn = x86_pmu_start_txn,
+ .cancel_txn = x86_pmu_cancel_txn,
+ .commit_txn = x86_pmu_commit_txn,
+
+ .event_idx = x86_pmu_event_idx,
+ .sched_task = x86_pmu_sched_task,
+ .check_period = x86_pmu_check_period,
+
+ .aux_output_match = x86_pmu_aux_output_match,
+
+ .filter = x86_pmu_filter,
+};
+
+void arch_perf_update_userpage(struct perf_event *event,
+ struct perf_event_mmap_page *userpg, u64 now)
+{
+ struct cyc2ns_data data;
+ u64 offset;
+
+ userpg->cap_user_time = 0;
+ userpg->cap_user_time_zero = 0;
+ userpg->cap_user_rdpmc =
+ !!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT);
+ userpg->pmc_width = x86_pmu.cntval_bits;
+
+ if (!using_native_sched_clock() || !sched_clock_stable())
+ return;
+
+ cyc2ns_read_begin(&data);
+
+ offset = data.cyc2ns_offset + __sched_clock_offset;
+
+ /*
+ * Internal timekeeping for enabled/running/stopped times
+ * is always in the local_clock domain.
+ */
+ userpg->cap_user_time = 1;
+ userpg->time_mult = data.cyc2ns_mul;
+ userpg->time_shift = data.cyc2ns_shift;
+ userpg->time_offset = offset - now;
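+	/*
+	 * Illustrative userspace use of these fields (ignoring the
+	 * documented overflow-safe quot/rem split):
+	 * delta = time_offset + ((cyc * time_mult) >> time_shift).
+	 */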
+
+ /*
+ * cap_user_time_zero doesn't make sense when we're using a different
+ * time base for the records.
+ */
+ if (!event->attr.use_clockid) {
+ userpg->cap_user_time_zero = 1;
+ userpg->time_zero = offset;
+ }
+
+ cyc2ns_read_end();
+}
+
+/*
+ * Determine whether the regs were taken from an irq/exception handler rather
+ * than from perf_arch_fetch_caller_regs().
+ */
+static bool perf_hw_regs(struct pt_regs *regs)
+{
+ return regs->flags & X86_EFLAGS_FIXED;
+}
+
+void
+perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
+{
+ struct unwind_state state;
+ unsigned long addr;
+
+ if (perf_guest_state()) {
+ /* TODO: We don't support guest os callchain now */
+ return;
+ }
+
+ if (perf_hw_regs(regs)) {
+ if (perf_callchain_store(entry, regs->ip))
+ return;
+ unwind_start(&state, current, regs, NULL);
+ } else {
+ unwind_start(&state, current, NULL, (void *)regs->sp);
+ }
+
+ for (; !unwind_done(&state); unwind_next_frame(&state)) {
+ addr = unwind_get_return_address(&state);
+ if (!addr || perf_callchain_store(entry, addr))
+ return;
+ }
+}
+
+static inline int
+valid_user_frame(const void __user *fp, unsigned long size)
+{
+ return __access_ok(fp, size);
+}
+
+static unsigned long get_segment_base(unsigned int segment)
+{
+ struct desc_struct *desc;
+ unsigned int idx = segment >> 3;
+
+ if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ struct ldt_struct *ldt;
+
+ /*
+ * If we're not in a valid context with a real (not just lazy)
+ * user mm, then don't even try.
+ */
+ if (!nmi_uaccess_okay())
+ return 0;
+
+ /* IRQs are off, so this synchronizes with smp_store_release */
+ ldt = smp_load_acquire(&current->mm->context.ldt);
+ if (!ldt || idx >= ldt->nr_entries)
+ return 0;
+
+ desc = &ldt->entries[idx];
+#else
+ return 0;
+#endif
+ } else {
+ if (idx >= GDT_ENTRIES)
+ return 0;
+
+ desc = raw_cpu_ptr(gdt_page.gdt) + idx;
+ }
+
+ return get_desc_base(desc);
+}
+
+#ifdef CONFIG_IA32_EMULATION
+
+#include <linux/compat.h>
+
+static inline int
+perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry)
+{
+ /* 32-bit process in 64-bit kernel. */
+ unsigned long ss_base, cs_base;
+ struct stack_frame_ia32 frame;
+ const struct stack_frame_ia32 __user *fp;
+ u32 ret_addr;
+
+ if (user_64bit_mode(regs))
+ return 0;
+
+ cs_base = get_segment_base(regs->cs);
+ ss_base = get_segment_base(regs->ss);
+
+ fp = compat_ptr(ss_base + regs->bp);
+ pagefault_disable();
+
+ /* see perf_callchain_user() below for why we do this */
+ if (is_uprobe_at_func_entry(regs) &&
+ !get_user(ret_addr, (const u32 __user *)regs->sp))
+ perf_callchain_store(entry, ret_addr);
+
+ while (entry->nr < entry->max_stack) {
+ if (!valid_user_frame(fp, sizeof(frame)))
+ break;
+
+ if (__get_user(frame.next_frame, &fp->next_frame))
+ break;
+ if (__get_user(frame.return_address, &fp->return_address))
+ break;
+
+ perf_callchain_store(entry, cs_base + frame.return_address);
+ fp = compat_ptr(ss_base + frame.next_frame);
+ }
+ pagefault_enable();
+ return 1;
+}
+#else
+static inline int
+perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry)
+{
+ return 0;
+}
+#endif
+
+void
+perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
+{
+ struct stack_frame frame;
+ const struct stack_frame __user *fp;
+ unsigned long ret_addr;
+
+ if (perf_guest_state()) {
+ /* TODO: We don't support guest os callchain now */
+ return;
+ }
+
+ /*
+	 * We don't know what to do with VM86 stacks; ignore them for now.
+ */
+ if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM))
+ return;
+
+ fp = (void __user *)regs->bp;
+
+ perf_callchain_store(entry, regs->ip);
+
+ if (!nmi_uaccess_okay())
+ return;
+
+ if (perf_callchain_user32(regs, entry))
+ return;
+
+ pagefault_disable();
+
+ /*
+ * If we are called from uprobe handler, and we are indeed at the very
+ * entry to user function (which is normally a `push %rbp` instruction,
+ * under assumption of application being compiled with frame pointers),
+ * we should read return address from *regs->sp before proceeding
+ * to follow frame pointers, otherwise we'll skip immediate caller
+ * as %rbp is not yet setup.
+ */
+ if (is_uprobe_at_func_entry(regs) &&
+ !get_user(ret_addr, (const unsigned long __user *)regs->sp))
+ perf_callchain_store(entry, ret_addr);
+
+ while (entry->nr < entry->max_stack) {
+ if (!valid_user_frame(fp, sizeof(frame)))
+ break;
+
+ if (__get_user(frame.next_frame, &fp->next_frame))
+ break;
+ if (__get_user(frame.return_address, &fp->return_address))
+ break;
+
+ perf_callchain_store(entry, frame.return_address);
+ fp = (void __user *)frame.next_frame;
+ }
+ pagefault_enable();
+}
+
+/*
+ * Deal with code segment offsets for the various execution modes:
+ *
+ * VM86 - the good olde 16 bit days, where the linear address is
+ * 20 bits and we use regs->ip + 0x10 * regs->cs.
+ *
+ * IA32 - Where we need to look at GDT/LDT segment descriptor tables
+ * to figure out what the 32bit base address is.
+ *
+ * X32 - has TIF_X32 set, but is running in x86_64
+ *
+ * X86_64 - CS,DS,SS,ES are all zero based.
+ */
+static unsigned long code_segment_base(struct pt_regs *regs)
+{
+ /*
+ * For IA32 we look at the GDT/LDT segment base to convert the
+ * effective IP to a linear address.
+ */
+
+#ifdef CONFIG_X86_32
+ /*
+ * If we are in VM86 mode, add the segment offset to convert to a
+ * linear address.
+ */
+ if (regs->flags & X86_VM_MASK)
+ return 0x10 * regs->cs;
+
+ if (user_mode(regs) && regs->cs != __USER_CS)
+ return get_segment_base(regs->cs);
+#else
+ if (user_mode(regs) && !user_64bit_mode(regs) &&
+ regs->cs != __USER32_CS)
+ return get_segment_base(regs->cs);
+#endif
+ return 0;
+}
+
+unsigned long perf_arch_instruction_pointer(struct pt_regs *regs)
+{
+ return regs->ip + code_segment_base(regs);
+}
+
+static unsigned long common_misc_flags(struct pt_regs *regs)
+{
+ if (regs->flags & PERF_EFLAGS_EXACT)
+ return PERF_RECORD_MISC_EXACT_IP;
+
+ return 0;
+}
+
+static unsigned long guest_misc_flags(struct pt_regs *regs)
+{
+ unsigned long guest_state = perf_guest_state();
+
+ if (!(guest_state & PERF_GUEST_ACTIVE))
+ return 0;
+
+ if (guest_state & PERF_GUEST_USER)
+ return PERF_RECORD_MISC_GUEST_USER;
+ else
+ return PERF_RECORD_MISC_GUEST_KERNEL;
+
+}
+
+static unsigned long host_misc_flags(struct pt_regs *regs)
+{
+ if (user_mode(regs))
+ return PERF_RECORD_MISC_USER;
+ else
+ return PERF_RECORD_MISC_KERNEL;
+}
+
+unsigned long perf_arch_guest_misc_flags(struct pt_regs *regs)
+{
+ unsigned long flags = common_misc_flags(regs);
+
+ flags |= guest_misc_flags(regs);
+
+ return flags;
+}
+
+unsigned long perf_arch_misc_flags(struct pt_regs *regs)
+{
+ unsigned long flags = common_misc_flags(regs);
+
+ flags |= host_misc_flags(regs);
+
+ return flags;
+}
+
+void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
+{
+ /* This API doesn't currently support enumerating hybrid PMUs. */
+ if (WARN_ON_ONCE(cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) ||
+ !x86_pmu_initialized()) {
+ memset(cap, 0, sizeof(*cap));
+ return;
+ }
+
+ /*
+ * Note, hybrid CPU models get tracked as having hybrid PMUs even when
+ * all E-cores are disabled via BIOS. When E-cores are disabled, the
+ * base PMU holds the correct number of counters for P-cores.
+ */
+ cap->version = x86_pmu.version;
+ cap->num_counters_gp = x86_pmu_num_counters(NULL);
+ cap->num_counters_fixed = x86_pmu_num_counters_fixed(NULL);
+ cap->bit_width_gp = x86_pmu.cntval_bits;
+ cap->bit_width_fixed = x86_pmu.cntval_bits;
+ cap->events_mask = (unsigned int)x86_pmu.events_maskl;
+ cap->events_mask_len = x86_pmu.events_mask_len;
+ cap->pebs_ept = x86_pmu.pebs_ept;
+}
+EXPORT_SYMBOL_FOR_KVM(perf_get_x86_pmu_capability);
+
+u64 perf_get_hw_event_config(int hw_event)
+{
+ int max = x86_pmu.max_events;
+
+ if (hw_event < max)
+ return x86_pmu.event_map(array_index_nospec(hw_event, max));
+
+ return 0;
+}
+EXPORT_SYMBOL_FOR_KVM(perf_get_hw_event_config);
diff --git a/arch/x86/events/intel/Makefile b/arch/x86/events/intel/Makefile
new file mode 100644
index 000000000000..10bde6c5abb2
--- /dev/null
+++ b/arch/x86/events/intel/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_CPU_SUP_INTEL) += core.o bts.o
+obj-$(CONFIG_CPU_SUP_INTEL) += ds.o knc.o
+obj-$(CONFIG_CPU_SUP_INTEL) += lbr.o p4.o p6.o pt.o
+obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel-uncore.o
+intel-uncore-objs := uncore.o uncore_nhmex.o uncore_snb.o uncore_snbep.o uncore_discovery.o
+obj-$(CONFIG_PERF_EVENTS_INTEL_CSTATE) += intel-cstate.o
+intel-cstate-objs := cstate.o
diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
new file mode 100644
index 000000000000..cbac54cb3a9e
--- /dev/null
+++ b/arch/x86/events/intel/bts.c
@@ -0,0 +1,646 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * BTS PMU driver for perf
+ * Copyright (c) 2013-2014, Intel Corporation.
+ */
+
+#undef DEBUG
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/coredump.h>
+
+#include <linux/sizes.h>
+#include <asm/perf_event.h>
+#include <asm/msr.h>
+
+#include "../perf_event.h"
+
+struct bts_ctx {
+ struct perf_output_handle handle;
+ struct debug_store ds_back;
+ int state;
+};
+
+/* BTS context states: */
+enum {
+ /* no ongoing AUX transactions */
+ BTS_STATE_STOPPED = 0,
+ /* AUX transaction is on, BTS tracing is disabled */
+ BTS_STATE_INACTIVE,
+ /* AUX transaction is on, BTS tracing is running */
+ BTS_STATE_ACTIVE,
+};
+
+static struct bts_ctx __percpu *bts_ctx;
+
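+/* Each BTS record is three u64s: branch-from, branch-to and flags. */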
+#define BTS_RECORD_SIZE 24
+#define BTS_SAFETY_MARGIN 4080
+
+struct bts_phys {
+ struct page *page;
+ unsigned long size;
+ unsigned long offset;
+ unsigned long displacement;
+};
+
+struct bts_buffer {
+ size_t real_size; /* multiple of BTS_RECORD_SIZE */
+ unsigned int nr_pages;
+ unsigned int nr_bufs;
+ unsigned int cur_buf;
+ bool snapshot;
+ local_t data_size;
+ local_t head;
+ unsigned long end;
+ void **data_pages;
+ struct bts_phys buf[] __counted_by(nr_bufs);
+};
+
+static struct pmu bts_pmu;
+
+static int buf_nr_pages(struct page *page)
+{
+ if (!PagePrivate(page))
+ return 1;
+
+ return 1 << page_private(page);
+}
+
+static size_t buf_size(struct page *page)
+{
+ return buf_nr_pages(page) * PAGE_SIZE;
+}
+
+static void *
+bts_buffer_setup_aux(struct perf_event *event, void **pages,
+ int nr_pages, bool overwrite)
+{
+ struct bts_buffer *bb;
+ struct page *page;
+ int cpu = event->cpu;
+ int node = (cpu == -1) ? cpu : cpu_to_node(cpu);
+ unsigned long offset;
+ size_t size = nr_pages << PAGE_SHIFT;
+ int pg, nr_buf, pad;
+
+ /* count all the high order buffers */
+ for (pg = 0, nr_buf = 0; pg < nr_pages;) {
+ page = virt_to_page(pages[pg]);
+ pg += buf_nr_pages(page);
+ nr_buf++;
+ }
+
+ /*
+	 * To avoid interrupts in overwrite mode, only allow one physical buffer.
+ */
+ if (overwrite && nr_buf > 1)
+ return NULL;
+
+ bb = kzalloc_node(struct_size(bb, buf, nr_buf), GFP_KERNEL, node);
+ if (!bb)
+ return NULL;
+
+ bb->nr_pages = nr_pages;
+ bb->nr_bufs = nr_buf;
+ bb->snapshot = overwrite;
+ bb->data_pages = pages;
+ bb->real_size = size - size % BTS_RECORD_SIZE;
+
+ for (pg = 0, nr_buf = 0, offset = 0, pad = 0; nr_buf < bb->nr_bufs; nr_buf++) {
+ unsigned int __nr_pages;
+
+ page = virt_to_page(pages[pg]);
+ __nr_pages = buf_nr_pages(page);
+ bb->buf[nr_buf].page = page;
+ bb->buf[nr_buf].offset = offset;
+ bb->buf[nr_buf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0);
+ bb->buf[nr_buf].size = buf_size(page) - bb->buf[nr_buf].displacement;
+ pad = bb->buf[nr_buf].size % BTS_RECORD_SIZE;
+ bb->buf[nr_buf].size -= pad;
+
+ pg += __nr_pages;
+ offset += __nr_pages << PAGE_SHIFT;
+ }
+
+ return bb;
+}
+
+static void bts_buffer_free_aux(void *data)
+{
+ kfree(data);
+}
+
+static unsigned long bts_buffer_offset(struct bts_buffer *bb, unsigned int idx)
+{
+ return bb->buf[idx].offset + bb->buf[idx].displacement;
+}
+
+static void
+bts_config_buffer(struct bts_buffer *bb)
+{
+ int cpu = raw_smp_processor_id();
+ struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+ struct bts_phys *phys = &bb->buf[bb->cur_buf];
+ unsigned long index, thresh = 0, end = phys->size;
+ struct page *page = phys->page;
+
+ index = local_read(&bb->head);
+
+ if (!bb->snapshot) {
+ if (bb->end < phys->offset + buf_size(page))
+ end = bb->end - phys->offset - phys->displacement;
+
+ index -= phys->offset + phys->displacement;
+
+ if (end - index > BTS_SAFETY_MARGIN)
+ thresh = end - BTS_SAFETY_MARGIN;
+ else if (end - index > BTS_RECORD_SIZE)
+ thresh = end - BTS_RECORD_SIZE;
+ else
+ thresh = end;
+ }
+
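+	/*
+	 * The hardware raises a PMI once bts_index crosses the interrupt
+	 * threshold; in snapshot mode the threshold is parked past the
+	 * absolute maximum below, so that it never fires.
+	 */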
+ ds->bts_buffer_base = (u64)(long)page_address(page) + phys->displacement;
+ ds->bts_index = ds->bts_buffer_base + index;
+ ds->bts_absolute_maximum = ds->bts_buffer_base + end;
+ ds->bts_interrupt_threshold = !bb->snapshot
+ ? ds->bts_buffer_base + thresh
+ : ds->bts_absolute_maximum + BTS_RECORD_SIZE;
+}
+
+static void bts_buffer_pad_out(struct bts_phys *phys, unsigned long head)
+{
+ unsigned long index = head - phys->offset;
+
+ memset(page_address(phys->page) + index, 0, phys->size - index);
+}
+
+static void bts_update(struct bts_ctx *bts)
+{
+ int cpu = raw_smp_processor_id();
+ struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+ struct bts_buffer *bb = perf_get_aux(&bts->handle);
+ unsigned long index = ds->bts_index - ds->bts_buffer_base, old, head;
+
+ if (!bb)
+ return;
+
+ head = index + bts_buffer_offset(bb, bb->cur_buf);
+ old = local_xchg(&bb->head, head);
+
+ if (!bb->snapshot) {
+ if (old == head)
+ return;
+
+ if (ds->bts_index >= ds->bts_absolute_maximum)
+ perf_aux_output_flag(&bts->handle,
+ PERF_AUX_FLAG_TRUNCATED);
+
+ /*
+ * old and head are always in the same physical buffer, so we
+ * can subtract them to get the data size.
+ */
+ local_add(head - old, &bb->data_size);
+ } else {
+ local_set(&bb->data_size, head);
+ }
+
+ /*
+	 * Since BTS is coherent, just add a compiler barrier to ensure
+ * BTS updating is ordered against bts::handle::event.
+ */
+ barrier();
+}
+
+static int
+bts_buffer_reset(struct bts_buffer *bb, struct perf_output_handle *handle);
+
+/*
+ * Ordering PMU callbacks wrt themselves and the PMI is done by means
+ * of bts::state, which:
+ * - is set when bts::handle::event is valid, that is, between
+ * perf_aux_output_begin() and perf_aux_output_end();
+ * - is zero otherwise;
+ * - is ordered against bts::handle::event with a compiler barrier.
+ */
+
+static void __bts_event_start(struct perf_event *event)
+{
+ struct bts_ctx *bts = this_cpu_ptr(bts_ctx);
+ struct bts_buffer *bb = perf_get_aux(&bts->handle);
+ u64 config = 0;
+
+ if (!bb->snapshot)
+ config |= ARCH_PERFMON_EVENTSEL_INT;
+ if (!event->attr.exclude_kernel)
+ config |= ARCH_PERFMON_EVENTSEL_OS;
+ if (!event->attr.exclude_user)
+ config |= ARCH_PERFMON_EVENTSEL_USR;
+
+ bts_config_buffer(bb);
+
+ /*
+	 * Local barrier to make sure that the ds configuration made it
+	 * before we enable BTS and bts::state goes ACTIVE.
+ */
+ wmb();
+
+ /* INACTIVE/STOPPED -> ACTIVE */
+ WRITE_ONCE(bts->state, BTS_STATE_ACTIVE);
+
+ intel_pmu_enable_bts(config);
+
+}
+
+static void bts_event_start(struct perf_event *event, int flags)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct bts_ctx *bts = this_cpu_ptr(bts_ctx);
+ struct bts_buffer *bb;
+
+ bb = perf_aux_output_begin(&bts->handle, event);
+ if (!bb)
+ goto fail_stop;
+
+ if (bts_buffer_reset(bb, &bts->handle))
+ goto fail_end_stop;
+
+ bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base;
+ bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum;
+ bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold;
+
+ perf_event_itrace_started(event);
+ event->hw.state = 0;
+
+ __bts_event_start(event);
+
+ return;
+
+fail_end_stop:
+ perf_aux_output_end(&bts->handle, 0);
+
+fail_stop:
+ event->hw.state = PERF_HES_STOPPED;
+}
+
+static void __bts_event_stop(struct perf_event *event, int state)
+{
+ struct bts_ctx *bts = this_cpu_ptr(bts_ctx);
+
+ /* ACTIVE -> INACTIVE(PMI)/STOPPED(->stop()) */
+ WRITE_ONCE(bts->state, state);
+
+ /*
+ * No extra synchronization is mandated by the documentation to have
+ * BTS data stores globally visible.
+ */
+ intel_pmu_disable_bts();
+}
+
+static void bts_event_stop(struct perf_event *event, int flags)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct bts_ctx *bts = this_cpu_ptr(bts_ctx);
+ struct bts_buffer *bb = NULL;
+ int state = READ_ONCE(bts->state);
+
+ if (state == BTS_STATE_ACTIVE)
+ __bts_event_stop(event, BTS_STATE_STOPPED);
+
+ if (state != BTS_STATE_STOPPED)
+ bb = perf_get_aux(&bts->handle);
+
+ event->hw.state |= PERF_HES_STOPPED;
+
+ if (flags & PERF_EF_UPDATE) {
+ bts_update(bts);
+
+ if (bb) {
+ if (bb->snapshot)
+ bts->handle.head =
+ local_xchg(&bb->data_size,
+ bb->nr_pages << PAGE_SHIFT);
+ perf_aux_output_end(&bts->handle,
+ local_xchg(&bb->data_size, 0));
+ }
+
+ cpuc->ds->bts_index = bts->ds_back.bts_buffer_base;
+ cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base;
+ cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum;
+ cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold;
+ }
+}
+
+void intel_bts_enable_local(void)
+{
+ struct bts_ctx *bts;
+ int state;
+
+ if (!bts_ctx)
+ return;
+
+ bts = this_cpu_ptr(bts_ctx);
+ state = READ_ONCE(bts->state);
+ /*
+ * Here we transition from INACTIVE to ACTIVE;
+ * if we instead are STOPPED from the interrupt handler,
+ * stay that way. Can't be ACTIVE here though.
+ */
+ if (WARN_ON_ONCE(state == BTS_STATE_ACTIVE))
+ return;
+
+ if (state == BTS_STATE_STOPPED)
+ return;
+
+ if (bts->handle.event)
+ __bts_event_start(bts->handle.event);
+}
+
+void intel_bts_disable_local(void)
+{
+ struct bts_ctx *bts;
+
+ if (!bts_ctx)
+ return;
+
+ bts = this_cpu_ptr(bts_ctx);
+
+ /*
+ * Here we transition from ACTIVE to INACTIVE;
+ * do nothing for STOPPED or INACTIVE.
+ */
+ if (READ_ONCE(bts->state) != BTS_STATE_ACTIVE)
+ return;
+
+ if (bts->handle.event)
+ __bts_event_stop(bts->handle.event, BTS_STATE_INACTIVE);
+}
+
+static int
+bts_buffer_reset(struct bts_buffer *bb, struct perf_output_handle *handle)
+{
+ unsigned long head, space, next_space, pad, gap, skip, wakeup;
+ unsigned int next_buf;
+ struct bts_phys *phys, *next_phys;
+ int ret;
+
+ if (bb->snapshot)
+ return 0;
+
+ head = handle->head & ((bb->nr_pages << PAGE_SHIFT) - 1);
+
+ phys = &bb->buf[bb->cur_buf];
+ space = phys->offset + phys->displacement + phys->size - head;
+ pad = space;
+ if (space > handle->size) {
+ space = handle->size;
+ space -= space % BTS_RECORD_SIZE;
+ }
+ if (space <= BTS_SAFETY_MARGIN) {
+ /* See if next phys buffer has more space */
+ next_buf = bb->cur_buf + 1;
+ if (next_buf >= bb->nr_bufs)
+ next_buf = 0;
+ next_phys = &bb->buf[next_buf];
+ gap = buf_size(phys->page) - phys->displacement - phys->size +
+ next_phys->displacement;
+ skip = pad + gap;
+ if (handle->size >= skip) {
+ next_space = next_phys->size;
+ if (next_space + skip > handle->size) {
+ next_space = handle->size - skip;
+ next_space -= next_space % BTS_RECORD_SIZE;
+ }
+ if (next_space > space || !space) {
+ if (pad)
+ bts_buffer_pad_out(phys, head);
+ ret = perf_aux_output_skip(handle, skip);
+ if (ret)
+ return ret;
+ /* Advance to next phys buffer */
+ phys = next_phys;
+ space = next_space;
+ head = phys->offset + phys->displacement;
+ /*
+ * After this, cur_buf and head won't match ds
+ * anymore, so we must not be racing with
+ * bts_update().
+ */
+ bb->cur_buf = next_buf;
+ local_set(&bb->head, head);
+ }
+ }
+ }
+
+ /* Don't go far beyond wakeup watermark */
+ wakeup = BTS_SAFETY_MARGIN + BTS_RECORD_SIZE + handle->wakeup -
+ handle->head;
+ if (space > wakeup) {
+ space = wakeup;
+ space -= space % BTS_RECORD_SIZE;
+ }
+
+ bb->end = head + space;
+
+ /*
+ * If we have no space, the lost notification would have been sent when
+ * we hit absolute_maximum - see bts_update()
+ */
+ if (!space)
+ return -ENOSPC;
+
+ return 0;
+}
+
+int intel_bts_interrupt(void)
+{
+ struct debug_store *ds = this_cpu_ptr(&cpu_hw_events)->ds;
+ struct bts_ctx *bts;
+ struct perf_event *event;
+ struct bts_buffer *bb;
+ s64 old_head;
+ int err = -ENOSPC, handled = 0;
+
+ if (!bts_ctx)
+ return 0;
+
+ bts = this_cpu_ptr(bts_ctx);
+ event = bts->handle.event;
+ /*
+ * The only surefire way of knowing if this NMI is ours is by checking
+ * the write ptr against the PMI threshold.
+ */
+ if (ds && (ds->bts_index >= ds->bts_interrupt_threshold))
+ handled = 1;
+
+ /*
+ * this is wrapped in intel_bts_enable_local/intel_bts_disable_local,
+ * so we can only be INACTIVE or STOPPED
+ */
+ if (READ_ONCE(bts->state) == BTS_STATE_STOPPED)
+ return handled;
+
+ bb = perf_get_aux(&bts->handle);
+ if (!bb)
+ return handled;
+
+ /*
+ * Skip snapshot counters: they don't use the interrupt, but
+ * there's no other way of telling, because the pointer will
+ * keep moving
+ */
+ if (bb->snapshot)
+ return 0;
+
+ old_head = local_read(&bb->head);
+ bts_update(bts);
+
+ /* no new data */
+ if (old_head == local_read(&bb->head))
+ return handled;
+
+ perf_aux_output_end(&bts->handle, local_xchg(&bb->data_size, 0));
+
+ bb = perf_aux_output_begin(&bts->handle, event);
+ if (bb)
+ err = bts_buffer_reset(bb, &bts->handle);
+
+ if (err) {
+ WRITE_ONCE(bts->state, BTS_STATE_STOPPED);
+
+ if (bb) {
+ /*
+			 * BTS_STATE_STOPPED should be visible before
+			 * handle::event is cleared.
+ */
+ barrier();
+ perf_aux_output_end(&bts->handle, 0);
+ }
+ }
+
+ return 1;
+}
+
+static void bts_event_del(struct perf_event *event, int mode)
+{
+ bts_event_stop(event, PERF_EF_UPDATE);
+}
+
+static int bts_event_add(struct perf_event *event, int mode)
+{
+ struct bts_ctx *bts = this_cpu_ptr(bts_ctx);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+
+ event->hw.state = PERF_HES_STOPPED;
+
+ if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
+ return -EBUSY;
+
+ if (bts->handle.event)
+ return -EBUSY;
+
+ if (mode & PERF_EF_START) {
+ bts_event_start(event, 0);
+ if (hwc->state & PERF_HES_STOPPED)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void bts_event_destroy(struct perf_event *event)
+{
+ x86_release_hardware();
+ x86_del_exclusive(x86_lbr_exclusive_bts);
+}
+
+static int bts_event_init(struct perf_event *event)
+{
+ int ret;
+
+ if (event->attr.type != bts_pmu.type)
+ return -ENOENT;
+
+ /*
+ * BTS leaks kernel addresses even when CPL0 tracing is
+	 * disabled, so disallow the intel_bts driver for unprivileged
+ * users on paranoid systems since it provides trace data
+ * to the user in a zero-copy fashion.
+ */
+ if (event->attr.exclude_kernel) {
+ ret = perf_allow_kernel();
+ if (ret)
+ return ret;
+ }
+
+ if (x86_add_exclusive(x86_lbr_exclusive_bts))
+ return -EBUSY;
+
+ ret = x86_reserve_hardware();
+ if (ret) {
+ x86_del_exclusive(x86_lbr_exclusive_bts);
+ return ret;
+ }
+
+ event->destroy = bts_event_destroy;
+
+ return 0;
+}
+
+static void bts_event_read(struct perf_event *event)
+{
+}
+
+static __init int bts_init(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_DTES64))
+ return -ENODEV;
+
+ x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS);
+ if (!x86_pmu.bts)
+ return -ENODEV;
+
+ if (boot_cpu_has(X86_FEATURE_PTI)) {
+ /*
+		 * BTS hardware writes through a virtual memory map, so we must
+		 * either use the kernel physical map or the user mapping of
+		 * the AUX buffer.
+		 *
+		 * However, since this driver supports per-CPU and per-task inherit,
+		 * we cannot use the user mapping because it will not be available
+		 * if we're not running the owning process.
+		 *
+		 * With PTI we can't use the kernel map either, because it's not
+		 * there when we run userspace.
+ *
+ * For now, disable this driver when using PTI.
+ */
+ return -ENODEV;
+ }
+
+ bts_ctx = alloc_percpu(struct bts_ctx);
+ if (!bts_ctx)
+ return -ENOMEM;
+
+ bts_pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE |
+ PERF_PMU_CAP_EXCLUSIVE;
+ bts_pmu.task_ctx_nr = perf_sw_context;
+ bts_pmu.event_init = bts_event_init;
+ bts_pmu.add = bts_event_add;
+ bts_pmu.del = bts_event_del;
+ bts_pmu.start = bts_event_start;
+ bts_pmu.stop = bts_event_stop;
+ bts_pmu.read = bts_event_read;
+ bts_pmu.setup_aux = bts_buffer_setup_aux;
+ bts_pmu.free_aux = bts_buffer_free_aux;
+
+ return perf_pmu_register(&bts_pmu, "intel_bts", -1);
+}
+early_initcall(bts_init);
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
new file mode 100644
index 000000000000..853fe073bab3
--- /dev/null
+++ b/arch/x86/events/intel/core.c
@@ -0,0 +1,8223 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Per core/cpu state
+ *
+ * Used to coordinate shared registers between HT threads or
+ * among events on a single PMU.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/stddef.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/nmi.h>
+#include <linux/kvm_host.h>
+
+#include <asm/cpufeature.h>
+#include <asm/debugreg.h>
+#include <asm/hardirq.h>
+#include <asm/intel-family.h>
+#include <asm/intel_pt.h>
+#include <asm/apic.h>
+#include <asm/cpu_device_id.h>
+#include <asm/msr.h>
+
+#include "../perf_event.h"
+
+/*
+ * Intel PerfMon, used on Core and later.
+ */
+static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
+{
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
+ [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
+ [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */
+};
+
+static struct event_constraint intel_core_event_constraints[] __read_mostly =
+{
+ INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
+ INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
+ INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
+ INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
+ INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
+ INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FP_COMP_INSTR_RET */
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_core2_event_constraints[] __read_mostly =
+{
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
+ INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
+ INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
+ INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
+ INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
+ INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */
+ INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
+ INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */
+ INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* ITLB_MISS_RETIRED (T30-9) */
+ INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_nehalem_event_constraints[] __read_mostly =
+{
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
+ INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
+ INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
+ INTEL_EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */
+ INTEL_EVENT_CONSTRAINT(0x48, 0x3), /* L1D_PEND_MISS */
+ INTEL_EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */
+ INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
+ INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
+ EVENT_CONSTRAINT_END
+};
+
+static struct extra_reg intel_nehalem_extra_regs[] __read_mostly =
+{
+ /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
+ INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b),
+ EVENT_EXTRA_END
+};
+
+static struct event_constraint intel_westmere_event_constraints[] __read_mostly =
+{
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
+ INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */
+ INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
+ INTEL_EVENT_CONSTRAINT(0xb3, 0x1), /* SNOOPQ_REQUEST_OUTSTANDING */
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_snb_event_constraints[] __read_mostly =
+{
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */
+ INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */
+ INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
+ INTEL_UEVENT_CONSTRAINT(0x06a3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
+ INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */
+ INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
+ INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
+ INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */
+ INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
+
+ /*
+	 * When HT is off, these events can only run on the bottom 4 counters.
+	 * When HT is on, they are impacted by the HT bug and require EXCL access.
+ */
+ INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_ivb_event_constraints[] __read_mostly =
+{
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ INTEL_UEVENT_CONSTRAINT(0x0148, 0x4), /* L1D_PEND_MISS.PENDING */
+ INTEL_UEVENT_CONSTRAINT(0x0279, 0xf), /* IDQ.EMPTY */
+ INTEL_UEVENT_CONSTRAINT(0x019c, 0xf), /* IDQ_UOPS_NOT_DELIVERED.CORE */
+ INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_LDM_PENDING */
+ INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
+ INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */
+ INTEL_UEVENT_CONSTRAINT(0x06a3, 0xf), /* CYCLE_ACTIVITY.STALLS_LDM_PENDING */
+ INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
+ INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
+ INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
+
+ /*
+	 * When HT is off, these events can only run on the bottom 4 counters.
+	 * When HT is on, they are impacted by the HT bug and require EXCL access.
+ */
+ INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
+ EVENT_CONSTRAINT_END
+};
+
+static struct extra_reg intel_westmere_extra_regs[] __read_mostly =
+{
+ /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1),
+ INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b),
+ EVENT_EXTRA_END
+};
+
+static struct event_constraint intel_v1_event_constraints[] __read_mostly =
+{
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_gen_event_constraints[] __read_mostly =
+{
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_v5_gen_event_constraints[] __read_mostly =
+{
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */
+ FIXED_EVENT_CONSTRAINT(0x0500, 4),
+ FIXED_EVENT_CONSTRAINT(0x0600, 5),
+ FIXED_EVENT_CONSTRAINT(0x0700, 6),
+ FIXED_EVENT_CONSTRAINT(0x0800, 7),
+ FIXED_EVENT_CONSTRAINT(0x0900, 8),
+ FIXED_EVENT_CONSTRAINT(0x0a00, 9),
+ FIXED_EVENT_CONSTRAINT(0x0b00, 10),
+ FIXED_EVENT_CONSTRAINT(0x0c00, 11),
+ FIXED_EVENT_CONSTRAINT(0x0d00, 12),
+ FIXED_EVENT_CONSTRAINT(0x0e00, 13),
+ FIXED_EVENT_CONSTRAINT(0x0f00, 14),
+ FIXED_EVENT_CONSTRAINT(0x1000, 15),
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_slm_event_constraints[] __read_mostly =
+{
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_grt_event_constraints[] __read_mostly = {
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */
+ FIXED_EVENT_CONSTRAINT(0x013c, 2), /* CPU_CLK_UNHALTED.REF_TSC_P */
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_skt_event_constraints[] __read_mostly = {
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */
+ FIXED_EVENT_CONSTRAINT(0x013c, 2), /* CPU_CLK_UNHALTED.REF_TSC_P */
+ FIXED_EVENT_CONSTRAINT(0x0073, 4), /* TOPDOWN_BAD_SPECULATION.ALL */
+ FIXED_EVENT_CONSTRAINT(0x019c, 5), /* TOPDOWN_FE_BOUND.ALL */
+ FIXED_EVENT_CONSTRAINT(0x02c2, 6), /* TOPDOWN_RETIRING.ALL */
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_skl_event_constraints[] = {
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ INTEL_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */
+
+ /*
+ * when HT is off, these can only run on the bottom 4 counters
+ */
+ INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_INST_RETIRED.* */
+ INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_RETIRED.* */
+ INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_L3_HIT_RETIRED.* */
+ INTEL_EVENT_CONSTRAINT(0xcd, 0xf), /* MEM_TRANS_RETIRED.* */
+ INTEL_EVENT_CONSTRAINT(0xc6, 0xf), /* FRONTEND_RETIRED.* */
+
+ EVENT_CONSTRAINT_END
+};
+
+static struct extra_reg intel_knl_extra_regs[] __read_mostly = {
+ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x799ffbb6e7ull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x399ffbffe7ull, RSP_1),
+ EVENT_EXTRA_END
+};
+
+static struct extra_reg intel_snb_extra_regs[] __read_mostly = {
+ /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3f807f8fffull, RSP_1),
+ INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
+ EVENT_EXTRA_END
+};
+
+static struct extra_reg intel_snbep_extra_regs[] __read_mostly = {
+ /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1),
+ INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
+ EVENT_EXTRA_END
+};
+
+static struct extra_reg intel_skl_extra_regs[] __read_mostly = {
+ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1),
+ INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
+ /*
+	 * Note that the low 8 bits of the eventsel code do not form a contiguous
+	 * field; they contain some bits that would #GP. These are masked out.
+ */
+ INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE),
+ EVENT_EXTRA_END
+};
+
+static struct event_constraint intel_icl_event_constraints[] = {
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x01c0, 0), /* old INST_RETIRED.PREC_DIST */
+ FIXED_EVENT_CONSTRAINT(0x0100, 0), /* INST_RETIRED.PREC_DIST */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_RETIRING, 0),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BAD_SPEC, 1),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FE_BOUND, 2),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BE_BOUND, 3),
+ INTEL_EVENT_CONSTRAINT_RANGE(0x03, 0x0a, 0xf),
+ INTEL_EVENT_CONSTRAINT_RANGE(0x1f, 0x28, 0xf),
+ INTEL_EVENT_CONSTRAINT(0x32, 0xf), /* SW_PREFETCH_ACCESS.* */
+ INTEL_EVENT_CONSTRAINT_RANGE(0x48, 0x56, 0xf),
+ INTEL_EVENT_CONSTRAINT_RANGE(0x60, 0x8b, 0xf),
+ INTEL_UEVENT_CONSTRAINT(0x04a3, 0xff), /* CYCLE_ACTIVITY.STALLS_TOTAL */
+ INTEL_UEVENT_CONSTRAINT(0x10a3, 0xff), /* CYCLE_ACTIVITY.CYCLES_MEM_ANY */
+ INTEL_UEVENT_CONSTRAINT(0x14a3, 0xff), /* CYCLE_ACTIVITY.STALLS_MEM_ANY */
+ INTEL_EVENT_CONSTRAINT(0xa3, 0xf), /* CYCLE_ACTIVITY.* */
+ INTEL_EVENT_CONSTRAINT_RANGE(0xa8, 0xb0, 0xf),
+ INTEL_EVENT_CONSTRAINT_RANGE(0xb7, 0xbd, 0xf),
+ INTEL_EVENT_CONSTRAINT_RANGE(0xd0, 0xe6, 0xf),
+ INTEL_EVENT_CONSTRAINT(0xef, 0xf),
+ INTEL_EVENT_CONSTRAINT_RANGE(0xf0, 0xf4, 0xf),
+ EVENT_CONSTRAINT_END
+};
+
+static struct extra_reg intel_icl_extra_regs[] __read_mostly = {
+ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffffbfffull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffffbfffull, RSP_1),
+ INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
+ INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE),
+ EVENT_EXTRA_END
+};
+
+static struct extra_reg intel_glc_extra_regs[] __read_mostly = {
+ INTEL_UEVENT_EXTRA_REG(0x012a, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x012b, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1),
+ INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
+ INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff1f, FE),
+ INTEL_UEVENT_EXTRA_REG(0x40ad, MSR_PEBS_FRONTEND, 0x7, FE),
+ INTEL_UEVENT_EXTRA_REG(0x04c2, MSR_PEBS_FRONTEND, 0x8, FE),
+ EVENT_EXTRA_END
+};
+
+static struct event_constraint intel_glc_event_constraints[] = {
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x0100, 0), /* INST_RETIRED.PREC_DIST */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ FIXED_EVENT_CONSTRAINT(0x013c, 2), /* CPU_CLK_UNHALTED.REF_TSC_P */
+ FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_RETIRING, 0),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BAD_SPEC, 1),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FE_BOUND, 2),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BE_BOUND, 3),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_HEAVY_OPS, 4),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BR_MISPREDICT, 5),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FETCH_LAT, 6),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_MEM_BOUND, 7),
+
+ INTEL_EVENT_CONSTRAINT(0x2e, 0xff),
+ INTEL_EVENT_CONSTRAINT(0x3c, 0xff),
+ /*
+ * Generally event codes < 0x90 are restricted to counters 0-3.
+	 * The 0x2E and 0x3C events are exceptions and have no restriction.
+ */
+ INTEL_EVENT_CONSTRAINT_RANGE(0x01, 0x8f, 0xf),
+
+ INTEL_UEVENT_CONSTRAINT(0x01a3, 0xf),
+ INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf),
+ INTEL_UEVENT_CONSTRAINT(0x08a3, 0xf),
+ INTEL_UEVENT_CONSTRAINT(0x04a4, 0x1),
+ INTEL_UEVENT_CONSTRAINT(0x08a4, 0x1),
+ INTEL_UEVENT_CONSTRAINT(0x02cd, 0x1),
+ INTEL_EVENT_CONSTRAINT(0xce, 0x1),
+ INTEL_EVENT_CONSTRAINT_RANGE(0xd0, 0xdf, 0xf),
+ /*
+ * Generally event codes >= 0x90 are likely to have no restrictions.
+	 * The exceptions are defined above.
+ */
+ INTEL_EVENT_CONSTRAINT_RANGE(0x90, 0xfe, 0xff),
+
+ EVENT_CONSTRAINT_END
+};
+
+static struct extra_reg intel_rwc_extra_regs[] __read_mostly = {
+ INTEL_UEVENT_EXTRA_REG(0x012a, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x012b, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1),
+ INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
+ INTEL_UEVENT_EXTRA_REG(0x02c6, MSR_PEBS_FRONTEND, 0x9, FE),
+ INTEL_UEVENT_EXTRA_REG(0x03c6, MSR_PEBS_FRONTEND, 0x7fff1f, FE),
+ INTEL_UEVENT_EXTRA_REG(0x40ad, MSR_PEBS_FRONTEND, 0x7, FE),
+ INTEL_UEVENT_EXTRA_REG(0x04c2, MSR_PEBS_FRONTEND, 0x8, FE),
+ EVENT_EXTRA_END
+};
+
+static struct event_constraint intel_lnc_event_constraints[] = {
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x0100, 0), /* INST_RETIRED.PREC_DIST */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ FIXED_EVENT_CONSTRAINT(0x013c, 2), /* CPU_CLK_UNHALTED.REF_TSC_P */
+ FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_RETIRING, 0),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BAD_SPEC, 1),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FE_BOUND, 2),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BE_BOUND, 3),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_HEAVY_OPS, 4),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BR_MISPREDICT, 5),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FETCH_LAT, 6),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_MEM_BOUND, 7),
+
+ INTEL_EVENT_CONSTRAINT(0x20, 0xf),
+
+ INTEL_UEVENT_CONSTRAINT(0x012a, 0xf),
+ INTEL_UEVENT_CONSTRAINT(0x012b, 0xf),
+ INTEL_UEVENT_CONSTRAINT(0x0148, 0x4),
+ INTEL_UEVENT_CONSTRAINT(0x0175, 0x4),
+
+ INTEL_EVENT_CONSTRAINT(0x2e, 0x3ff),
+ INTEL_EVENT_CONSTRAINT(0x3c, 0x3ff),
+
+ INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4),
+ INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4),
+ INTEL_UEVENT_CONSTRAINT(0x04a4, 0x1),
+ INTEL_UEVENT_CONSTRAINT(0x08a4, 0x1),
+ INTEL_UEVENT_CONSTRAINT(0x10a4, 0x1),
+ INTEL_UEVENT_CONSTRAINT(0x01b1, 0x8),
+ INTEL_UEVENT_CONSTRAINT(0x01cd, 0x3fc),
+ INTEL_UEVENT_CONSTRAINT(0x02cd, 0x3),
+
+ INTEL_EVENT_CONSTRAINT_RANGE(0xd0, 0xdf, 0xf),
+
+ INTEL_UEVENT_CONSTRAINT(0x00e0, 0xf),
+
+ EVENT_CONSTRAINT_END
+};
+
+static struct extra_reg intel_lnc_extra_regs[] __read_mostly = {
+ INTEL_UEVENT_EXTRA_REG(0x012a, MSR_OFFCORE_RSP_0, 0xfffffffffffull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x012b, MSR_OFFCORE_RSP_1, 0xfffffffffffull, RSP_1),
+ INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
+ INTEL_UEVENT_EXTRA_REG(0x02c6, MSR_PEBS_FRONTEND, 0x9, FE),
+ INTEL_UEVENT_EXTRA_REG(0x03c6, MSR_PEBS_FRONTEND, 0x7fff1f, FE),
+ INTEL_UEVENT_EXTRA_REG(0x40ad, MSR_PEBS_FRONTEND, 0xf, FE),
+ INTEL_UEVENT_EXTRA_REG(0x04c2, MSR_PEBS_FRONTEND, 0x8, FE),
+ EVENT_EXTRA_END
+};
+
+EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3");
+EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3");
+EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2");
+
+static struct attribute *nhm_mem_events_attrs[] = {
+ EVENT_PTR(mem_ld_nhm),
+ NULL,
+};
+
+/*
+ * Topdown events for Intel Core CPUs.
+ *
+ * The events are all counted in slots, where a slot is a free slot in a
+ * 4-wide pipeline. Some events are already reported in slots; for cycle
+ * events we multiply by the pipeline width (4).
+ *
+ * With Hyper-Threading on, topdown metrics are either summed or averaged
+ * between the threads of a core: (count_t0 + count_t1).
+ *
+ * For the average case the metric is always scaled to the pipeline width,
+ * so we use a factor of 2: (count_t0 + count_t1) / 2 * 4.
+ */
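+/*
+ * A worked example of the scaling, with hypothetical counts: if, with HT
+ * on, thread 0 reports 300 and thread 1 reports 100, the summed value is
+ * 400 and the scale factor of 2 yields 800 slots, i.e.
+ * (300 + 100) / 2 * 4 on a 4-wide pipeline. With HT off, the per-thread
+ * count is simply scaled by the pipeline width (4).
+ */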
+
+EVENT_ATTR_STR_HT(topdown-total-slots, td_total_slots,
+ "event=0x3c,umask=0x0", /* cpu_clk_unhalted.thread */
+ "event=0x3c,umask=0x0,any=1"); /* cpu_clk_unhalted.thread_any */
+EVENT_ATTR_STR_HT(topdown-total-slots.scale, td_total_slots_scale, "4", "2");
+EVENT_ATTR_STR(topdown-slots-issued, td_slots_issued,
+ "event=0xe,umask=0x1"); /* uops_issued.any */
+EVENT_ATTR_STR(topdown-slots-retired, td_slots_retired,
+ "event=0xc2,umask=0x2"); /* uops_retired.retire_slots */
+EVENT_ATTR_STR(topdown-fetch-bubbles, td_fetch_bubbles,
+ "event=0x9c,umask=0x1"); /* idq_uops_not_delivered_core */
+EVENT_ATTR_STR_HT(topdown-recovery-bubbles, td_recovery_bubbles,
+ "event=0xd,umask=0x3,cmask=1", /* int_misc.recovery_cycles */
+ "event=0xd,umask=0x3,cmask=1,any=1"); /* int_misc.recovery_cycles_any */
+EVENT_ATTR_STR_HT(topdown-recovery-bubbles.scale, td_recovery_bubbles_scale,
+ "4", "2");
+
+EVENT_ATTR_STR(slots, slots, "event=0x00,umask=0x4");
+EVENT_ATTR_STR(topdown-retiring, td_retiring, "event=0x00,umask=0x80");
+EVENT_ATTR_STR(topdown-bad-spec, td_bad_spec, "event=0x00,umask=0x81");
+EVENT_ATTR_STR(topdown-fe-bound, td_fe_bound, "event=0x00,umask=0x82");
+EVENT_ATTR_STR(topdown-be-bound, td_be_bound, "event=0x00,umask=0x83");
+EVENT_ATTR_STR(topdown-heavy-ops, td_heavy_ops, "event=0x00,umask=0x84");
+EVENT_ATTR_STR(topdown-br-mispredict, td_br_mispredict, "event=0x00,umask=0x85");
+EVENT_ATTR_STR(topdown-fetch-lat, td_fetch_lat, "event=0x00,umask=0x86");
+EVENT_ATTR_STR(topdown-mem-bound, td_mem_bound, "event=0x00,umask=0x87");
+
+static struct attribute *snb_events_attrs[] = {
+ EVENT_PTR(td_slots_issued),
+ EVENT_PTR(td_slots_retired),
+ EVENT_PTR(td_fetch_bubbles),
+ EVENT_PTR(td_total_slots),
+ EVENT_PTR(td_total_slots_scale),
+ EVENT_PTR(td_recovery_bubbles),
+ EVENT_PTR(td_recovery_bubbles_scale),
+ NULL,
+};
+
+static struct attribute *snb_mem_events_attrs[] = {
+ EVENT_PTR(mem_ld_snb),
+ EVENT_PTR(mem_st_snb),
+ NULL,
+};
+
+static struct event_constraint intel_hsw_event_constraints[] = {
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */
+ INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
+ INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
+ /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
+ INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4),
+ /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
+ INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4),
+ /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
+ INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf),
+
+ /*
+	 * When HT is off, these events can only run on the bottom 4 counters.
+	 * When HT is on, they are impacted by the HT bug and require EXCL access.
+ */
+ INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_bdw_event_constraints[] = {
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */
+ INTEL_UBIT_EVENT_CONSTRAINT(0x8a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_MISS */
+ /*
+ * when HT is off, these can only run on the bottom 4 counters
+ */
+ INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_INST_RETIRED.* */
+ INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_RETIRED.* */
+ INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_L3_HIT_RETIRED.* */
+ INTEL_EVENT_CONSTRAINT(0xcd, 0xf), /* MEM_TRANS_RETIRED.* */
+ EVENT_CONSTRAINT_END
+};
+
+static u64 intel_pmu_event_map(int hw_event)
+{
+ return intel_perfmon_event_map[hw_event];
+}
+
+static __initconst const u64 glc_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0,
+ [ C(RESULT_MISS) ] = 0xe124,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_MISS) ] = 0xe424,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x12a,
+ [ C(RESULT_MISS) ] = 0x12a,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x12a,
+ [ C(RESULT_MISS) ] = 0x12a,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0,
+ [ C(RESULT_MISS) ] = 0xe12,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0,
+ [ C(RESULT_MISS) ] = 0xe13,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = 0xe11,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4c4,
+ [ C(RESULT_MISS) ] = 0x4c5,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x12a,
+ [ C(RESULT_MISS) ] = 0x12a,
+ },
+ },
+};
+
+static __initconst const u64 glc_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x10001,
+ [ C(RESULT_MISS) ] = 0x3fbfc00001,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x3f3ffc0002,
+ [ C(RESULT_MISS) ] = 0x3f3fc00002,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x10c000001,
+ [ C(RESULT_MISS) ] = 0x3fb3000001,
+ },
+ },
+};
+
+/*
+ * Notes on the events:
+ * - data reads do not include code reads (comparable to earlier tables)
+ * - data counts include speculative execution (except L1 write, dtlb, bpu)
+ * - remote node access includes remote memory, remote cache, remote mmio.
+ * - prefetches are not included in the counts.
+ * - icache miss does not include decoded icache
+ */
+
+#define SKL_DEMAND_DATA_RD BIT_ULL(0)
+#define SKL_DEMAND_RFO BIT_ULL(1)
+#define SKL_ANY_RESPONSE BIT_ULL(16)
+#define SKL_SUPPLIER_NONE BIT_ULL(17)
+#define SKL_L3_MISS_LOCAL_DRAM BIT_ULL(26)
+#define SKL_L3_MISS_REMOTE_HOP0_DRAM BIT_ULL(27)
+#define SKL_L3_MISS_REMOTE_HOP1_DRAM BIT_ULL(28)
+#define SKL_L3_MISS_REMOTE_HOP2P_DRAM BIT_ULL(29)
+#define SKL_L3_MISS (SKL_L3_MISS_LOCAL_DRAM| \
+ SKL_L3_MISS_REMOTE_HOP0_DRAM| \
+ SKL_L3_MISS_REMOTE_HOP1_DRAM| \
+ SKL_L3_MISS_REMOTE_HOP2P_DRAM)
+#define SKL_SPL_HIT BIT_ULL(30)
+#define SKL_SNOOP_NONE BIT_ULL(31)
+#define SKL_SNOOP_NOT_NEEDED BIT_ULL(32)
+#define SKL_SNOOP_MISS BIT_ULL(33)
+#define SKL_SNOOP_HIT_NO_FWD BIT_ULL(34)
+#define SKL_SNOOP_HIT_WITH_FWD BIT_ULL(35)
+#define SKL_SNOOP_HITM BIT_ULL(36)
+#define SKL_SNOOP_NON_DRAM BIT_ULL(37)
+#define SKL_ANY_SNOOP (SKL_SPL_HIT|SKL_SNOOP_NONE| \
+ SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \
+ SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \
+ SKL_SNOOP_HITM|SKL_SNOOP_NON_DRAM)
+#define SKL_DEMAND_READ SKL_DEMAND_DATA_RD
+#define SKL_SNOOP_DRAM (SKL_SNOOP_NONE| \
+ SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \
+ SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \
+ SKL_SNOOP_HITM|SKL_SPL_HIT)
+#define SKL_DEMAND_WRITE SKL_DEMAND_RFO
+#define SKL_LLC_ACCESS SKL_ANY_RESPONSE
+#define SKL_L3_MISS_REMOTE (SKL_L3_MISS_REMOTE_HOP0_DRAM| \
+ SKL_L3_MISS_REMOTE_HOP1_DRAM| \
+ SKL_L3_MISS_REMOTE_HOP2P_DRAM)
+
+static __initconst const u64 skl_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */
+ [ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_INST_RETIRED.ALL_STORES */
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x283, /* ICACHE_64B.MISS */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */
+ [ C(RESULT_MISS) ] = 0xe08, /* DTLB_LOAD_MISSES.WALK_COMPLETED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_INST_RETIRED.ALL_STORES */
+ [ C(RESULT_MISS) ] = 0xe49, /* DTLB_STORE_MISSES.WALK_COMPLETED */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x2085, /* ITLB_MISSES.STLB_HIT */
+ [ C(RESULT_MISS) ] = 0xe85, /* ITLB_MISSES.WALK_COMPLETED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */
+ [ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+};
+
+static __initconst const u64 skl_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ|
+ SKL_LLC_ACCESS|SKL_ANY_SNOOP,
+ [ C(RESULT_MISS) ] = SKL_DEMAND_READ|
+ SKL_L3_MISS|SKL_ANY_SNOOP|
+ SKL_SUPPLIER_NONE,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE|
+ SKL_LLC_ACCESS|SKL_ANY_SNOOP,
+ [ C(RESULT_MISS) ] = SKL_DEMAND_WRITE|
+ SKL_L3_MISS|SKL_ANY_SNOOP|
+ SKL_SUPPLIER_NONE,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ|
+ SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM,
+ [ C(RESULT_MISS) ] = SKL_DEMAND_READ|
+ SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE|
+ SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM,
+ [ C(RESULT_MISS) ] = SKL_DEMAND_WRITE|
+ SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+};
+
+#define SNB_DMND_DATA_RD (1ULL << 0)
+#define SNB_DMND_RFO (1ULL << 1)
+#define SNB_DMND_IFETCH (1ULL << 2)
+#define SNB_DMND_WB (1ULL << 3)
+#define SNB_PF_DATA_RD (1ULL << 4)
+#define SNB_PF_RFO (1ULL << 5)
+#define SNB_PF_IFETCH (1ULL << 6)
+#define SNB_LLC_DATA_RD (1ULL << 7)
+#define SNB_LLC_RFO (1ULL << 8)
+#define SNB_LLC_IFETCH (1ULL << 9)
+#define SNB_BUS_LOCKS (1ULL << 10)
+#define SNB_STRM_ST (1ULL << 11)
+#define SNB_OTHER (1ULL << 15)
+#define SNB_RESP_ANY (1ULL << 16)
+#define SNB_NO_SUPP (1ULL << 17)
+#define SNB_LLC_HITM (1ULL << 18)
+#define SNB_LLC_HITE (1ULL << 19)
+#define SNB_LLC_HITS (1ULL << 20)
+#define SNB_LLC_HITF (1ULL << 21)
+#define SNB_LOCAL (1ULL << 22)
+#define SNB_REMOTE (0xffULL << 23)
+#define SNB_SNP_NONE (1ULL << 31)
+#define SNB_SNP_NOT_NEEDED (1ULL << 32)
+#define SNB_SNP_MISS (1ULL << 33)
+#define SNB_NO_FWD (1ULL << 34)
+#define SNB_SNP_FWD (1ULL << 35)
+#define SNB_HITM (1ULL << 36)
+#define SNB_NON_DRAM (1ULL << 37)
+
+#define SNB_DMND_READ (SNB_DMND_DATA_RD|SNB_LLC_DATA_RD)
+#define SNB_DMND_WRITE (SNB_DMND_RFO|SNB_LLC_RFO)
+#define SNB_DMND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO)
+
+#define SNB_SNP_ANY (SNB_SNP_NONE|SNB_SNP_NOT_NEEDED| \
+ SNB_SNP_MISS|SNB_NO_FWD|SNB_SNP_FWD| \
+ SNB_HITM)
+
+#define SNB_DRAM_ANY (SNB_LOCAL|SNB_REMOTE|SNB_SNP_ANY)
+#define SNB_DRAM_REMOTE (SNB_REMOTE|SNB_SNP_ANY)
+
+#define SNB_L3_ACCESS SNB_RESP_ANY
+#define SNB_L3_MISS (SNB_DRAM_ANY|SNB_NON_DRAM)
+
+static __initconst const u64 snb_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_L3_ACCESS,
+ [ C(RESULT_MISS) ] = SNB_DMND_READ|SNB_L3_MISS,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_L3_ACCESS,
+ [ C(RESULT_MISS) ] = SNB_DMND_WRITE|SNB_L3_MISS,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_L3_ACCESS,
+ [ C(RESULT_MISS) ] = SNB_DMND_PREFETCH|SNB_L3_MISS,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_DRAM_ANY,
+ [ C(RESULT_MISS) ] = SNB_DMND_READ|SNB_DRAM_REMOTE,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_DRAM_ANY,
+ [ C(RESULT_MISS) ] = SNB_DMND_WRITE|SNB_DRAM_REMOTE,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_DRAM_ANY,
+ [ C(RESULT_MISS) ] = SNB_DMND_PREFETCH|SNB_DRAM_REMOTE,
+ },
+ },
+};
+
+static __initconst const u64 snb_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0xf1d0, /* MEM_UOP_RETIRED.LOADS */
+ [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPLACEMENT */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0xf2d0, /* MEM_UOP_RETIRED.STORES */
+ [ C(RESULT_MISS) ] = 0x0851, /* L1D.ALL_M_REPLACEMENT */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x024e, /* HW_PRE_REQ.DL1_MISS */
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0280, /* ICACHE.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ [ C(OP_WRITE) ] = {
+ /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ [ C(OP_PREFETCH) ] = {
+ /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOP_RETIRED.ALL_LOADS */
+ [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.CAUSES_A_WALK */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOP_RETIRED.ALL_STORES */
+ [ C(RESULT_MISS) ] = 0x0149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1085, /* ITLB_MISSES.STLB_HIT */
+ [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.CAUSES_A_WALK */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
+ [ C(RESULT_MISS) ] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ },
+
+};
+
+/*
+ * Notes on the events:
+ * - data reads do not include code reads (comparable to earlier tables)
+ * - data counts include speculative execution (except L1 write, dtlb, bpu)
+ * - remote node access includes remote memory, remote cache, remote mmio.
+ * - prefetches are not included in the counts because they are not
+ * reliably counted.
+ */
+
+#define HSW_DEMAND_DATA_RD BIT_ULL(0)
+#define HSW_DEMAND_RFO BIT_ULL(1)
+#define HSW_ANY_RESPONSE BIT_ULL(16)
+#define HSW_SUPPLIER_NONE BIT_ULL(17)
+#define HSW_L3_MISS_LOCAL_DRAM BIT_ULL(22)
+#define HSW_L3_MISS_REMOTE_HOP0 BIT_ULL(27)
+#define HSW_L3_MISS_REMOTE_HOP1 BIT_ULL(28)
+#define HSW_L3_MISS_REMOTE_HOP2P BIT_ULL(29)
+#define HSW_L3_MISS (HSW_L3_MISS_LOCAL_DRAM| \
+ HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+ HSW_L3_MISS_REMOTE_HOP2P)
+#define HSW_SNOOP_NONE BIT_ULL(31)
+#define HSW_SNOOP_NOT_NEEDED BIT_ULL(32)
+#define HSW_SNOOP_MISS BIT_ULL(33)
+#define HSW_SNOOP_HIT_NO_FWD BIT_ULL(34)
+#define HSW_SNOOP_HIT_WITH_FWD BIT_ULL(35)
+#define HSW_SNOOP_HITM BIT_ULL(36)
+#define HSW_SNOOP_NON_DRAM BIT_ULL(37)
+#define HSW_ANY_SNOOP (HSW_SNOOP_NONE| \
+ HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \
+ HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \
+ HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM)
+#define HSW_SNOOP_DRAM (HSW_ANY_SNOOP & ~HSW_SNOOP_NON_DRAM)
+#define HSW_DEMAND_READ HSW_DEMAND_DATA_RD
+#define HSW_DEMAND_WRITE HSW_DEMAND_RFO
+#define HSW_L3_MISS_REMOTE (HSW_L3_MISS_REMOTE_HOP0|\
+ HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P)
+#define HSW_LLC_ACCESS HSW_ANY_RESPONSE
+
+#define BDW_L3_MISS_LOCAL BIT(26)
+#define BDW_L3_MISS (BDW_L3_MISS_LOCAL| \
+ HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+ HSW_L3_MISS_REMOTE_HOP2P)
+
+
+static __initconst const u64 hsw_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */
+ [ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x280, /* ICACHE.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */
+ [ C(RESULT_MISS) ] = 0x108, /* DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */
+ [ C(RESULT_MISS) ] = 0x149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x6085, /* ITLB_MISSES.STLB_HIT */
+ [ C(RESULT_MISS) ] = 0x185, /* ITLB_MISSES.MISS_CAUSES_A_WALK */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */
+ [ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+};
+
+static __initconst const u64 hsw_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
+ HSW_LLC_ACCESS,
+ [ C(RESULT_MISS) ] = HSW_DEMAND_READ|
+ HSW_L3_MISS|HSW_ANY_SNOOP,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
+ HSW_LLC_ACCESS,
+ [ C(RESULT_MISS) ] = HSW_DEMAND_WRITE|
+ HSW_L3_MISS|HSW_ANY_SNOOP,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
+ HSW_L3_MISS_LOCAL_DRAM|
+ HSW_SNOOP_DRAM,
+ [ C(RESULT_MISS) ] = HSW_DEMAND_READ|
+ HSW_L3_MISS_REMOTE|
+ HSW_SNOOP_DRAM,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
+ HSW_L3_MISS_LOCAL_DRAM|
+ HSW_SNOOP_DRAM,
+ [ C(RESULT_MISS) ] = HSW_DEMAND_WRITE|
+ HSW_L3_MISS_REMOTE|
+ HSW_SNOOP_DRAM,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+};
+
+static __initconst const u64 westmere_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
+ [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */
+ },
+ [ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETIRED.STORES */
+ [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
+ [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
+ [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ /*
+ * Use RFO, not WRITEBACK, because a write miss would typically occur
+ * on RFO.
+ */
+ [ C(OP_WRITE) ] = {
+ /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ [ C(OP_PREFETCH) ] = {
+ /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
+ [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
+ },
+ [ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETIRED.STORES */
+ [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
+ [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.ANY */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
+ [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ },
+};
+
+/*
+ * Nehalem/Westmere MSR_OFFCORE_RESPONSE bits;
+ * See IA32 SDM Vol 3B 30.6.1.3
+ */
+
+#define NHM_DMND_DATA_RD (1 << 0)
+#define NHM_DMND_RFO (1 << 1)
+#define NHM_DMND_IFETCH (1 << 2)
+#define NHM_DMND_WB (1 << 3)
+#define NHM_PF_DATA_RD (1 << 4)
+#define NHM_PF_DATA_RFO (1 << 5)
+#define NHM_PF_IFETCH (1 << 6)
+#define NHM_OFFCORE_OTHER (1 << 7)
+#define NHM_UNCORE_HIT (1 << 8)
+#define NHM_OTHER_CORE_HIT_SNP (1 << 9)
+#define NHM_OTHER_CORE_HITM (1 << 10)
+ /* reserved */
+#define NHM_REMOTE_CACHE_FWD (1 << 12)
+#define NHM_REMOTE_DRAM (1 << 13)
+#define NHM_LOCAL_DRAM (1 << 14)
+#define NHM_NON_DRAM (1 << 15)
+
+#define NHM_LOCAL (NHM_LOCAL_DRAM|NHM_REMOTE_CACHE_FWD)
+#define NHM_REMOTE (NHM_REMOTE_DRAM)
+
+#define NHM_DMND_READ (NHM_DMND_DATA_RD)
+#define NHM_DMND_WRITE (NHM_DMND_RFO|NHM_DMND_WB)
+#define NHM_DMND_PREFETCH (NHM_PF_DATA_RD|NHM_PF_DATA_RFO)
+
+#define NHM_L3_HIT (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM)
+#define NHM_L3_MISS (NHM_NON_DRAM|NHM_LOCAL_DRAM|NHM_REMOTE_DRAM|NHM_REMOTE_CACHE_FWD)
+#define NHM_L3_ACCESS (NHM_L3_HIT|NHM_L3_MISS)
+
+static __initconst const u64 nehalem_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_L3_ACCESS,
+ [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_L3_MISS,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_L3_ACCESS,
+ [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_L3_MISS,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS,
+ [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_L3_MISS,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_LOCAL|NHM_REMOTE,
+ [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_REMOTE,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_LOCAL|NHM_REMOTE,
+ [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_REMOTE,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_LOCAL|NHM_REMOTE,
+ [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_REMOTE,
+ },
+ },
+};
+
+static __initconst const u64 nehalem_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
+ [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */
+ },
+ [ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETIRED.STORES */
+ [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
+ [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
+ [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ /*
+ * Use RFO, not WRITEBACK, because a write miss would typically occur
+ * on RFO.
+ */
+ [ C(OP_WRITE) ] = {
+ /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ [ C(OP_PREFETCH) ] = {
+ /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
+ [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
+ [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ },
+};
+
+static __initconst const u64 core2_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
+ [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
+ [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */
+ [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
+ [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
+ [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
+ [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
+		[ C(RESULT_MISS) ] = 0x00c5, /* BR_INST_RETIRED.MISPRED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+static __initconst const u64 atom_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
+ [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
+ [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
+ [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
+ [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
+		[ C(RESULT_MISS) ] = 0x00c5, /* BR_INST_RETIRED.MISPRED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+EVENT_ATTR_STR(topdown-total-slots, td_total_slots_slm, "event=0x3c");
+EVENT_ATTR_STR(topdown-total-slots.scale, td_total_slots_scale_slm, "2");
+/* no_alloc_cycles.not_delivered */
+EVENT_ATTR_STR(topdown-fetch-bubbles, td_fetch_bubbles_slm,
+ "event=0xca,umask=0x50");
+EVENT_ATTR_STR(topdown-fetch-bubbles.scale, td_fetch_bubbles_scale_slm, "2");
+/* uops_retired.all */
+EVENT_ATTR_STR(topdown-slots-issued, td_slots_issued_slm,
+ "event=0xc2,umask=0x10");
+/* uops_retired.all */
+EVENT_ATTR_STR(topdown-slots-retired, td_slots_retired_slm,
+ "event=0xc2,umask=0x10");
+
+static struct attribute *slm_events_attrs[] = {
+ EVENT_PTR(td_total_slots_slm),
+ EVENT_PTR(td_total_slots_scale_slm),
+ EVENT_PTR(td_fetch_bubbles_slm),
+ EVENT_PTR(td_fetch_bubbles_scale_slm),
+ EVENT_PTR(td_slots_issued_slm),
+ EVENT_PTR(td_slots_retired_slm),
+ NULL
+};
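+
+/*
+ * Illustration: from the attributes above the perf tool can derive the
+ * level-1 topdown metrics, roughly:
+ *
+ *   total_slots    = cycles * 2    (Silvermont is 2-wide, hence .scale "2")
+ *   frontend_bound = fetch_bubbles / total_slots
+ *   retiring       = slots_retired / total_slots
+ *
+ * The authoritative formulas live in the perf tool, not in this file.
+ */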
+
+static struct extra_reg intel_slm_extra_regs[] __read_mostly =
+{
+ /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x768005ffffull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x368005ffffull, RSP_1),
+ EVENT_EXTRA_END
+};
+
+#define SLM_DMND_READ SNB_DMND_DATA_RD
+#define SLM_DMND_WRITE SNB_DMND_RFO
+#define SLM_DMND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO)
+
+#define SLM_SNP_ANY (SNB_SNP_NONE|SNB_SNP_MISS|SNB_NO_FWD|SNB_HITM)
+#define SLM_LLC_ACCESS SNB_RESP_ANY
+#define SLM_LLC_MISS (SLM_SNP_ANY|SNB_NON_DRAM)
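+
+/*
+ * Example composition (illustrative): a demand-write LLC miss is encoded
+ * as SLM_DMND_WRITE|SLM_LLC_MISS, i.e. the request bits ORed with the
+ * snoop-response bits, and is programmed into MSR_OFFCORE_RSP_x via
+ * intel_slm_extra_regs above.
+ */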
+
+static __initconst const u64 slm_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = SLM_DMND_READ|SLM_LLC_ACCESS,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = SLM_DMND_WRITE|SLM_LLC_ACCESS,
+ [ C(RESULT_MISS) ] = SLM_DMND_WRITE|SLM_LLC_MISS,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = SLM_DMND_PREFETCH|SLM_LLC_ACCESS,
+ [ C(RESULT_MISS) ] = SLM_DMND_PREFETCH|SLM_LLC_MISS,
+ },
+ },
+};
+
+static __initconst const u64 slm_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0x0104, /* LD_DCU_MISS */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0380, /* ICACHE.ACCESSES */
+		[ C(RESULT_MISS)   ] = 0x0280, /* ICACHE.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_WRITE) ] = {
+ /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ [ C(OP_PREFETCH) ] = {
+ /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0x0804, /* LD_DTLB_MISS */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
+ [ C(RESULT_MISS) ] = 0x40205, /* PAGE_WALKS.I_SIDE_WALKS */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
+ [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+EVENT_ATTR_STR(topdown-total-slots, td_total_slots_glm, "event=0x3c");
+EVENT_ATTR_STR(topdown-total-slots.scale, td_total_slots_scale_glm, "3");
+/* UOPS_NOT_DELIVERED.ANY */
+EVENT_ATTR_STR(topdown-fetch-bubbles, td_fetch_bubbles_glm, "event=0x9c");
+/* ISSUE_SLOTS_NOT_CONSUMED.RECOVERY */
+EVENT_ATTR_STR(topdown-recovery-bubbles, td_recovery_bubbles_glm, "event=0xca,umask=0x02");
+/* UOPS_RETIRED.ANY */
+EVENT_ATTR_STR(topdown-slots-retired, td_slots_retired_glm, "event=0xc2");
+/* UOPS_ISSUED.ANY */
+EVENT_ATTR_STR(topdown-slots-issued, td_slots_issued_glm, "event=0x0e");
+
+static struct attribute *glm_events_attrs[] = {
+ EVENT_PTR(td_total_slots_glm),
+ EVENT_PTR(td_total_slots_scale_glm),
+ EVENT_PTR(td_fetch_bubbles_glm),
+ EVENT_PTR(td_recovery_bubbles_glm),
+ EVENT_PTR(td_slots_issued_glm),
+ EVENT_PTR(td_slots_retired_glm),
+ NULL
+};
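+
+/*
+ * Compared to Silvermont above, Goldmont is 3-wide (hence the "3" scale)
+ * and also exposes recovery bubbles, which lets the perf tool (e.g.
+ * "perf stat --topdown") estimate bad speculation roughly as:
+ *
+ *   bad_spec = (slots_issued - slots_retired + recovery_bubbles) / total_slots
+ *
+ * Illustrative formula only; the exact computation is in tools/perf.
+ */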
+
+static struct extra_reg intel_glm_extra_regs[] __read_mostly = {
+ /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x760005ffbfull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x360005ffbfull, RSP_1),
+ EVENT_EXTRA_END
+};
+
+#define GLM_DEMAND_DATA_RD BIT_ULL(0)
+#define GLM_DEMAND_RFO BIT_ULL(1)
+#define GLM_ANY_RESPONSE BIT_ULL(16)
+#define GLM_SNP_NONE_OR_MISS BIT_ULL(33)
+#define GLM_DEMAND_READ GLM_DEMAND_DATA_RD
+#define GLM_DEMAND_WRITE GLM_DEMAND_RFO
+#define GLM_DEMAND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO)
+#define GLM_LLC_ACCESS GLM_ANY_RESPONSE
+#define GLM_SNP_ANY (GLM_SNP_NONE_OR_MISS|SNB_NO_FWD|SNB_HITM)
+#define GLM_LLC_MISS (GLM_SNP_ANY|SNB_NON_DRAM)
+
+static __initconst const u64 glm_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+ [C(L1D)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */
+ [C(RESULT_MISS)] = 0x0,
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */
+ [C(RESULT_MISS)] = 0x0,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = 0x0,
+ [C(RESULT_MISS)] = 0x0,
+ },
+ },
+ [C(L1I)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x0380, /* ICACHE.ACCESSES */
+ [C(RESULT_MISS)] = 0x0280, /* ICACHE.MISSES */
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = 0x0,
+ [C(RESULT_MISS)] = 0x0,
+ },
+ },
+ [C(LL)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x1b7, /* OFFCORE_RESPONSE */
+ [C(RESULT_MISS)] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = 0x1b7, /* OFFCORE_RESPONSE */
+ [C(RESULT_MISS)] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = 0x1b7, /* OFFCORE_RESPONSE */
+ [C(RESULT_MISS)] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ },
+ [C(DTLB)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */
+ [C(RESULT_MISS)] = 0x0,
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */
+ [C(RESULT_MISS)] = 0x0,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = 0x0,
+ [C(RESULT_MISS)] = 0x0,
+ },
+ },
+ [C(ITLB)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x00c0, /* INST_RETIRED.ANY_P */
+ [C(RESULT_MISS)] = 0x0481, /* ITLB.MISS */
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ },
+ [C(BPU)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
+ [C(RESULT_MISS)] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ },
+};
+
+static __initconst const u64 glm_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+ [C(LL)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = GLM_DEMAND_READ|
+ GLM_LLC_ACCESS,
+ [C(RESULT_MISS)] = GLM_DEMAND_READ|
+ GLM_LLC_MISS,
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = GLM_DEMAND_WRITE|
+ GLM_LLC_ACCESS,
+ [C(RESULT_MISS)] = GLM_DEMAND_WRITE|
+ GLM_LLC_MISS,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = GLM_DEMAND_PREFETCH|
+ GLM_LLC_ACCESS,
+ [C(RESULT_MISS)] = GLM_DEMAND_PREFETCH|
+ GLM_LLC_MISS,
+ },
+ },
+};
+
+static __initconst const u64 glp_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+ [C(L1D)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */
+ [C(RESULT_MISS)] = 0x0,
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */
+ [C(RESULT_MISS)] = 0x0,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = 0x0,
+ [C(RESULT_MISS)] = 0x0,
+ },
+ },
+ [C(L1I)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x0380, /* ICACHE.ACCESSES */
+ [C(RESULT_MISS)] = 0x0280, /* ICACHE.MISSES */
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = 0x0,
+ [C(RESULT_MISS)] = 0x0,
+ },
+ },
+ [C(LL)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x1b7, /* OFFCORE_RESPONSE */
+ [C(RESULT_MISS)] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = 0x1b7, /* OFFCORE_RESPONSE */
+ [C(RESULT_MISS)] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = 0x0,
+ [C(RESULT_MISS)] = 0x0,
+ },
+ },
+ [C(DTLB)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */
+ [C(RESULT_MISS)] = 0xe08, /* DTLB_LOAD_MISSES.WALK_COMPLETED */
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */
+ [C(RESULT_MISS)] = 0xe49, /* DTLB_STORE_MISSES.WALK_COMPLETED */
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = 0x0,
+ [C(RESULT_MISS)] = 0x0,
+ },
+ },
+ [C(ITLB)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x00c0, /* INST_RETIRED.ANY_P */
+ [C(RESULT_MISS)] = 0x0481, /* ITLB.MISS */
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ },
+ [C(BPU)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
+ [C(RESULT_MISS)] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ },
+};
+
+static __initconst const u64 glp_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+ [C(LL)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = GLM_DEMAND_READ|
+ GLM_LLC_ACCESS,
+ [C(RESULT_MISS)] = GLM_DEMAND_READ|
+ GLM_LLC_MISS,
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = GLM_DEMAND_WRITE|
+ GLM_LLC_ACCESS,
+ [C(RESULT_MISS)] = GLM_DEMAND_WRITE|
+ GLM_LLC_MISS,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = 0x0,
+ [C(RESULT_MISS)] = 0x0,
+ },
+ },
+};
+
+#define TNT_LOCAL_DRAM BIT_ULL(26)
+#define TNT_DEMAND_READ GLM_DEMAND_DATA_RD
+#define TNT_DEMAND_WRITE GLM_DEMAND_RFO
+#define TNT_LLC_ACCESS GLM_ANY_RESPONSE
+#define TNT_SNP_ANY (SNB_SNP_NOT_NEEDED|SNB_SNP_MISS| \
+ SNB_NO_FWD|SNB_SNP_FWD|SNB_HITM)
+#define TNT_LLC_MISS (TNT_SNP_ANY|SNB_NON_DRAM|TNT_LOCAL_DRAM)
+
+static __initconst const u64 tnt_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+ [C(LL)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = TNT_DEMAND_READ|
+ TNT_LLC_ACCESS,
+ [C(RESULT_MISS)] = TNT_DEMAND_READ|
+ TNT_LLC_MISS,
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = TNT_DEMAND_WRITE|
+ TNT_LLC_ACCESS,
+ [C(RESULT_MISS)] = TNT_DEMAND_WRITE|
+ TNT_LLC_MISS,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = 0x0,
+ [C(RESULT_MISS)] = 0x0,
+ },
+ },
+};
+
+EVENT_ATTR_STR(topdown-fe-bound, td_fe_bound_tnt, "event=0x71,umask=0x0");
+EVENT_ATTR_STR(topdown-retiring, td_retiring_tnt, "event=0xc2,umask=0x0");
+EVENT_ATTR_STR(topdown-bad-spec, td_bad_spec_tnt, "event=0x73,umask=0x6");
+EVENT_ATTR_STR(topdown-be-bound, td_be_bound_tnt, "event=0x74,umask=0x0");
+
+static struct attribute *tnt_events_attrs[] = {
+ EVENT_PTR(td_fe_bound_tnt),
+ EVENT_PTR(td_retiring_tnt),
+ EVENT_PTR(td_bad_spec_tnt),
+ EVENT_PTR(td_be_bound_tnt),
+ NULL,
+};
+
+static struct extra_reg intel_tnt_extra_regs[] __read_mostly = {
+ /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x800ff0ffffff9fffull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0xff0ffffff9fffull, RSP_1),
+ EVENT_EXTRA_END
+};
+
+EVENT_ATTR_STR(mem-loads, mem_ld_grt, "event=0xd0,umask=0x5,ldlat=3");
+EVENT_ATTR_STR(mem-stores, mem_st_grt, "event=0xd0,umask=0x6");
+
+static struct attribute *grt_mem_attrs[] = {
+ EVENT_PTR(mem_ld_grt),
+ EVENT_PTR(mem_st_grt),
+ NULL
+};
+
+static struct extra_reg intel_grt_extra_regs[] __read_mostly = {
+ /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1),
+ INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x5d0),
+ EVENT_EXTRA_END
+};
+
+EVENT_ATTR_STR(topdown-retiring, td_retiring_cmt, "event=0x72,umask=0x0");
+EVENT_ATTR_STR(topdown-bad-spec, td_bad_spec_cmt, "event=0x73,umask=0x0");
+
+static struct attribute *cmt_events_attrs[] = {
+ EVENT_PTR(td_fe_bound_tnt),
+ EVENT_PTR(td_retiring_cmt),
+ EVENT_PTR(td_bad_spec_cmt),
+ EVENT_PTR(td_be_bound_tnt),
+ NULL
+};
+
+static struct extra_reg intel_cmt_extra_regs[] __read_mostly = {
+ /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x800ff3ffffffffffull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0xff3ffffffffffull, RSP_1),
+ INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x5d0),
+ INTEL_UEVENT_EXTRA_REG(0x0127, MSR_SNOOP_RSP_0, 0xffffffffffffffffull, SNOOP_0),
+ INTEL_UEVENT_EXTRA_REG(0x0227, MSR_SNOOP_RSP_1, 0xffffffffffffffffull, SNOOP_1),
+ EVENT_EXTRA_END
+};
+
+EVENT_ATTR_STR(topdown-fe-bound, td_fe_bound_skt, "event=0x9c,umask=0x01");
+EVENT_ATTR_STR(topdown-retiring, td_retiring_skt, "event=0xc2,umask=0x02");
+EVENT_ATTR_STR(topdown-be-bound, td_be_bound_skt, "event=0xa4,umask=0x02");
+
+static struct attribute *skt_events_attrs[] = {
+ EVENT_PTR(td_fe_bound_skt),
+ EVENT_PTR(td_retiring_skt),
+ EVENT_PTR(td_bad_spec_cmt),
+ EVENT_PTR(td_be_bound_skt),
+ NULL,
+};
+
+#define KNL_OT_L2_HITE BIT_ULL(19) /* Other Tile L2 Hit */
+#define KNL_OT_L2_HITF BIT_ULL(20) /* Other Tile L2 Hit */
+#define KNL_MCDRAM_LOCAL BIT_ULL(21)
+#define KNL_MCDRAM_FAR BIT_ULL(22)
+#define KNL_DDR_LOCAL BIT_ULL(23)
+#define KNL_DDR_FAR BIT_ULL(24)
+#define KNL_DRAM_ANY (KNL_MCDRAM_LOCAL | KNL_MCDRAM_FAR | \
+ KNL_DDR_LOCAL | KNL_DDR_FAR)
+#define KNL_L2_READ SLM_DMND_READ
+#define KNL_L2_WRITE SLM_DMND_WRITE
+#define KNL_L2_PREFETCH SLM_DMND_PREFETCH
+#define KNL_L2_ACCESS SLM_LLC_ACCESS
+#define KNL_L2_MISS (KNL_OT_L2_HITE | KNL_OT_L2_HITF | \
+ KNL_DRAM_ANY | SNB_SNP_ANY | \
+ SNB_NON_DRAM)
+
+static __initconst const u64 knl_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+ [C(LL)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = KNL_L2_READ | KNL_L2_ACCESS,
+ [C(RESULT_MISS)] = 0,
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = KNL_L2_WRITE | KNL_L2_ACCESS,
+ [C(RESULT_MISS)] = KNL_L2_WRITE | KNL_L2_MISS,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = KNL_L2_PREFETCH | KNL_L2_ACCESS,
+ [C(RESULT_MISS)] = KNL_L2_PREFETCH | KNL_L2_MISS,
+ },
+ },
+};
+
+/*
+ * Used from PMIs where the LBRs are already disabled.
+ *
+ * This function may be called consecutively. The PMU must remain in the
+ * disabled state across such consecutive calls.
+ *
+ * During consecutive calls, the same disable value is written to the related
+ * registers, so the PMU state remains unchanged.
+ *
+ * intel_bts events don't coexist with intel PMU's BTS events because of
+ * x86_add_exclusive(x86_lbr_exclusive_lbr); there's no need to keep them
+ * disabled around intel PMU's event batching etc, only inside the PMI handler.
+ *
+ * Avoid PEBS_ENABLE MSR access in PMIs.
+ * GLOBAL_CTRL has already been disabled, so none of the counters count
+ * anymore and it doesn't matter whether PEBS is enabled or not.
+ * Usually, the PEBS status is not changed in PMIs, so it's unnecessary to
+ * access the PEBS_ENABLE MSR in disable_all()/enable_all().
+ * However, some cases may change the PEBS status, e.g. PMI throttling;
+ * PEBS_ENABLE should be updated where the status changes.
+ */
+static __always_inline void __intel_pmu_disable_all(bool bts)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ wrmsrq(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+
+ if (bts && test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
+ intel_pmu_disable_bts();
+}
+
+static __always_inline void intel_pmu_disable_all(void)
+{
+ __intel_pmu_disable_all(true);
+ static_call_cond(x86_pmu_pebs_disable_all)();
+ intel_pmu_lbr_disable_all();
+}
+
+static void __intel_pmu_enable_all(int added, bool pmi)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl);
+
+ intel_pmu_lbr_enable_all(pmi);
+
+ if (cpuc->fixed_ctrl_val != cpuc->active_fixed_ctrl_val) {
+ wrmsrq(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, cpuc->fixed_ctrl_val);
+ cpuc->active_fixed_ctrl_val = cpuc->fixed_ctrl_val;
+ }
+
+ wrmsrq(MSR_CORE_PERF_GLOBAL_CTRL,
+ intel_ctrl & ~cpuc->intel_ctrl_guest_mask);
+
+ if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
+ struct perf_event *event =
+ cpuc->events[INTEL_PMC_IDX_FIXED_BTS];
+
+ if (WARN_ON_ONCE(!event))
+ return;
+
+ intel_pmu_enable_bts(event->hw.config);
+ }
+}
+
+static void intel_pmu_enable_all(int added)
+{
+ static_call_cond(x86_pmu_pebs_enable_all)();
+ __intel_pmu_enable_all(added, false);
+}
+
+static noinline int
+__intel_pmu_snapshot_branch_stack(struct perf_branch_entry *entries,
+ unsigned int cnt, unsigned long flags)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ intel_pmu_lbr_read();
+ cnt = min_t(unsigned int, cnt, x86_pmu.lbr_nr);
+
+ memcpy(entries, cpuc->lbr_entries, sizeof(struct perf_branch_entry) * cnt);
+ intel_pmu_enable_all(0);
+ local_irq_restore(flags);
+ return cnt;
+}
+
+static int
+intel_pmu_snapshot_branch_stack(struct perf_branch_entry *entries, unsigned int cnt)
+{
+ unsigned long flags;
+
+ /* must not have branches... */
+ local_irq_save(flags);
+ __intel_pmu_disable_all(false); /* we don't care about BTS */
+ __intel_pmu_lbr_disable();
+ /* ... until here */
+ return __intel_pmu_snapshot_branch_stack(entries, cnt, flags);
+}
+
+static int
+intel_pmu_snapshot_arch_branch_stack(struct perf_branch_entry *entries, unsigned int cnt)
+{
+ unsigned long flags;
+
+ /* must not have branches... */
+ local_irq_save(flags);
+ __intel_pmu_disable_all(false); /* we don't care about BTS */
+ __intel_pmu_arch_lbr_disable();
+ /* ... until here */
+ return __intel_pmu_snapshot_branch_stack(entries, cnt, flags);
+}
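+
+/*
+ * Both snapshot variants exist so that a caller (for example the BPF
+ * branch-snapshot helper) can capture the LBRs with as few intervening
+ * branches as possible; hence the irqs-off, PMU-off bracket around the
+ * actual LBR read above.
+ */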
+
+/*
+ * Workaround for:
+ * Intel Errata AAK100 (model 26)
+ * Intel Errata AAP53 (model 30)
+ * Intel Errata BD53 (model 44)
+ *
+ * The official story:
+ * These chips need to be 'reset' when adding counters by programming the
+ * magic three (non-counting) events 0x4300B5, 0x4300D2, and 0x4300B1 either
+ * in sequence on the same PMC or on different PMCs.
+ *
+ * In practice it appears some of these events do in fact count, and
+ * we need to program all 4 events.
+ */
+static void intel_pmu_nhm_workaround(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ static const unsigned long nhm_magic[4] = {
+ 0x4300B5,
+ 0x4300D2,
+ 0x4300B1,
+ 0x4300B1
+ };
+ struct perf_event *event;
+ int i;
+
+ /*
+	 * The Errata requires the following steps:
+	 * 1) Clear MSR_IA32_PEBS_ENABLE and MSR_CORE_PERF_GLOBAL_CTRL;
+	 * 2) Configure 4 PERFEVTSELx with the magic events and clear
+	 *    the corresponding PMCx;
+	 * 3) Set bit0~bit3 of MSR_CORE_PERF_GLOBAL_CTRL;
+	 * 4) Clear MSR_CORE_PERF_GLOBAL_CTRL;
+	 * 5) Clear 4 pairs of PERFEVTSELx and PMCx;
+ */
+
+ /*
+	 * The real steps we take are a little different from above:
+	 * A) To reduce MSR operations, we don't run step 1) as the
+	 * registers are already cleared before this function is called;
+	 * B) Call x86_perf_event_update to save PMCx before configuring
+	 * PERFEVTSELx with the magic number;
+	 * C) For step 5), we only clear a PERFEVTSELx when it is not
+	 * currently in use;
+	 * D) Call x86_perf_event_set_period to restore PMCx.
+ */
+
+	/* We always operate on 4 pairs of PERF counters */
+ for (i = 0; i < 4; i++) {
+ event = cpuc->events[i];
+ if (event)
+ static_call(x86_pmu_update)(event);
+ }
+
+ for (i = 0; i < 4; i++) {
+ wrmsrq(MSR_ARCH_PERFMON_EVENTSEL0 + i, nhm_magic[i]);
+ wrmsrq(MSR_ARCH_PERFMON_PERFCTR0 + i, 0x0);
+ }
+
+ wrmsrq(MSR_CORE_PERF_GLOBAL_CTRL, 0xf);
+ wrmsrq(MSR_CORE_PERF_GLOBAL_CTRL, 0x0);
+
+ for (i = 0; i < 4; i++) {
+ event = cpuc->events[i];
+
+ if (event) {
+ static_call(x86_pmu_set_period)(event);
+ __x86_pmu_enable_event(&event->hw,
+ ARCH_PERFMON_EVENTSEL_ENABLE);
+ } else
+ wrmsrq(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0x0);
+ }
+}
+
+static void intel_pmu_nhm_enable_all(int added)
+{
+ if (added)
+ intel_pmu_nhm_workaround();
+ intel_pmu_enable_all(added);
+}
+
+static void intel_set_tfa(struct cpu_hw_events *cpuc, bool on)
+{
+ u64 val = on ? MSR_TFA_RTM_FORCE_ABORT : 0;
+
+ if (cpuc->tfa_shadow != val) {
+ cpuc->tfa_shadow = val;
+ wrmsrq(MSR_TSX_FORCE_ABORT, val);
+ }
+}
+
+static void intel_tfa_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr)
+{
+ /*
+ * We're going to use PMC3, make sure TFA is set before we touch it.
+ */
+ if (cntr == 3)
+ intel_set_tfa(cpuc, true);
+}
+
+static void intel_tfa_pmu_enable_all(int added)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ /*
+ * If we find PMC3 is no longer used when we enable the PMU, we can
+ * clear TFA.
+ */
+ if (!test_bit(3, cpuc->active_mask))
+ intel_set_tfa(cpuc, false);
+
+ intel_pmu_enable_all(added);
+}
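+
+/*
+ * Background: on parts with the TSX Force Abort erratum, PMC3 and RTM
+ * transactions interfere. Setting the TFA bit in MSR_TSX_FORCE_ABORT makes
+ * RTM transactions abort so that PMC3 counts reliably; clearing it once
+ * PMC3 is no longer scheduled gives RTM back to applications.
+ */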
+
+static inline u64 intel_pmu_get_status(void)
+{
+ u64 status;
+
+ rdmsrq(MSR_CORE_PERF_GLOBAL_STATUS, status);
+
+ return status;
+}
+
+static inline void intel_pmu_ack_status(u64 ack)
+{
+ wrmsrq(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
+}
+
+static inline bool event_is_checkpointed(struct perf_event *event)
+{
+ return unlikely(event->hw.config & HSW_IN_TX_CHECKPOINTED) != 0;
+}
+
+static inline void intel_set_masks(struct perf_event *event, int idx)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ if (event->attr.exclude_host)
+ __set_bit(idx, (unsigned long *)&cpuc->intel_ctrl_guest_mask);
+ if (event->attr.exclude_guest)
+ __set_bit(idx, (unsigned long *)&cpuc->intel_ctrl_host_mask);
+ if (event_is_checkpointed(event))
+ __set_bit(idx, (unsigned long *)&cpuc->intel_cp_status);
+}
+
+static inline void intel_clear_masks(struct perf_event *event, int idx)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ __clear_bit(idx, (unsigned long *)&cpuc->intel_ctrl_guest_mask);
+ __clear_bit(idx, (unsigned long *)&cpuc->intel_ctrl_host_mask);
+ __clear_bit(idx, (unsigned long *)&cpuc->intel_cp_status);
+}
+
+static void intel_pmu_disable_fixed(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+ int idx = hwc->idx;
+ u64 mask;
+
+	if (is_topdown_idx(idx)) {
+ /*
+ * When there are other active TopDown events,
+ * don't disable the fixed counter 3.
+ */
+ if (*(u64 *)cpuc->active_mask & INTEL_PMC_OTHER_TOPDOWN_BITS(idx))
+ return;
+ idx = INTEL_PMC_IDX_FIXED_SLOTS;
+ }
+
+ intel_clear_masks(event, idx);
+
+ mask = intel_fixed_bits_by_idx(idx - INTEL_PMC_IDX_FIXED, INTEL_FIXED_BITS_MASK);
+ cpuc->fixed_ctrl_val &= ~mask;
+}
+
+static inline void __intel_pmu_update_event_ext(int idx, u64 ext)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ u32 msr;
+
+ if (idx < INTEL_PMC_IDX_FIXED) {
+ msr = MSR_IA32_PMC_V6_GP0_CFG_C +
+ x86_pmu.addr_offset(idx, false);
+ } else {
+ msr = MSR_IA32_PMC_V6_FX0_CFG_C +
+ x86_pmu.addr_offset(idx - INTEL_PMC_IDX_FIXED, false);
+ }
+
+ cpuc->cfg_c_val[idx] = ext;
+ wrmsrq(msr, ext);
+}
+
+static void intel_pmu_disable_event_ext(struct perf_event *event)
+{
+ /*
+	 * Only clear the CFG_C MSR for PEBS counter group events.
+	 * This prevents the HW counter's value from being added into
+	 * other PEBS records incorrectly after the PEBS counter group
+	 * events are disabled.
+	 *
+	 * For other events, it's unnecessary to clear the CFG_C MSRs
+	 * since CFG_C doesn't take effect while the counter is in the
+	 * disabled state. That helps to reduce the WRMSR overhead
+	 * in context switches.
+ */
+ if (!is_pebs_counter_event_group(event))
+ return;
+
+ __intel_pmu_update_event_ext(event->hw.idx, 0);
+}
+
+DEFINE_STATIC_CALL_NULL(intel_pmu_disable_event_ext, intel_pmu_disable_event_ext);
+
+static void intel_pmu_disable_event(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ int idx = hwc->idx;
+
+ switch (idx) {
+ case 0 ... INTEL_PMC_IDX_FIXED - 1:
+ intel_clear_masks(event, idx);
+ static_call_cond(intel_pmu_disable_event_ext)(event);
+ x86_pmu_disable_event(event);
+ break;
+ case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1:
+ static_call_cond(intel_pmu_disable_event_ext)(event);
+ fallthrough;
+ case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END:
+ intel_pmu_disable_fixed(event);
+ break;
+ case INTEL_PMC_IDX_FIXED_BTS:
+ intel_pmu_disable_bts();
+ intel_pmu_drain_bts_buffer();
+ return;
+ case INTEL_PMC_IDX_FIXED_VLBR:
+ intel_clear_masks(event, idx);
+ break;
+ default:
+ intel_clear_masks(event, idx);
+ pr_warn("Failed to disable the event with invalid index %d\n",
+ idx);
+ return;
+ }
+
+ /*
+ * Needs to be called after x86_pmu_disable_event,
+ * so we don't trigger the event without PEBS bit set.
+ */
+ if (unlikely(event->attr.precise_ip))
+ static_call(x86_pmu_pebs_disable)(event);
+}
+
+static void intel_pmu_assign_event(struct perf_event *event, int idx)
+{
+ if (is_pebs_pt(event))
+ perf_report_aux_output_id(event, idx);
+}
+
+static __always_inline bool intel_pmu_needs_branch_stack(struct perf_event *event)
+{
+ return event->hw.flags & PERF_X86_EVENT_NEEDS_BRANCH_STACK;
+}
+
+static void intel_pmu_del_event(struct perf_event *event)
+{
+ if (intel_pmu_needs_branch_stack(event))
+ intel_pmu_lbr_del(event);
+ if (event->attr.precise_ip)
+ intel_pmu_pebs_del(event);
+ if (is_pebs_counter_event_group(event) ||
+ is_acr_event_group(event))
+ this_cpu_ptr(&cpu_hw_events)->n_late_setup--;
+}
+
+static int icl_set_topdown_event_period(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ s64 left = local64_read(&hwc->period_left);
+
+ /*
+	 * The values in the PERF_METRICS MSR are derived from fixed counter 3.
+	 * Software should start both registers, PERF_METRICS and fixed
+	 * counter 3, from zero.
+	 * Clear PERF_METRICS and fixed counter 3 during initialization.
+	 * After that, both MSRs are cleared on each read, so they don't
+	 * need to be cleared again.
+ */
+ if (left == x86_pmu.max_period) {
+ wrmsrq(MSR_CORE_PERF_FIXED_CTR3, 0);
+ wrmsrq(MSR_PERF_METRICS, 0);
+ hwc->saved_slots = 0;
+ hwc->saved_metric = 0;
+ }
+
+ if ((hwc->saved_slots) && is_slots_event(event)) {
+ wrmsrq(MSR_CORE_PERF_FIXED_CTR3, hwc->saved_slots);
+ wrmsrq(MSR_PERF_METRICS, hwc->saved_metric);
+ }
+
+ perf_event_update_userpage(event);
+
+ return 0;
+}
+
+DEFINE_STATIC_CALL(intel_pmu_set_topdown_event_period, x86_perf_event_set_period);
+
+static inline u64 icl_get_metrics_event_value(u64 metric, u64 slots, int idx)
+{
+ u32 val;
+
+ /*
+ * The metric is reported as an 8bit integer fraction
+ * summing up to 0xff.
+ * slots-in-metric = (Metric / 0xff) * slots
+ */
+ val = (metric >> ((idx - INTEL_PMC_IDX_METRIC_BASE) * 8)) & 0xff;
+ return mul_u64_u32_div(slots, val, 0xff);
+}
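+
+/*
+ * Worked example (illustrative numbers): if the PERF_METRICS byte for this
+ * index reads 0x40 and fixed counter 3 reads 1000 slots, the result is
+ *
+ *   1000 * 0x40 / 0xff ~= 251 slots
+ *
+ * attributed to this metric.
+ */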
+
+static u64 icl_get_topdown_value(struct perf_event *event,
+ u64 slots, u64 metrics)
+{
+ int idx = event->hw.idx;
+ u64 delta;
+
+ if (is_metric_idx(idx))
+ delta = icl_get_metrics_event_value(metrics, slots, idx);
+ else
+ delta = slots;
+
+ return delta;
+}
+
+static void __icl_update_topdown_event(struct perf_event *event,
+ u64 slots, u64 metrics,
+ u64 last_slots, u64 last_metrics)
+{
+ u64 delta, last = 0;
+
+ delta = icl_get_topdown_value(event, slots, metrics);
+ if (last_slots)
+ last = icl_get_topdown_value(event, last_slots, last_metrics);
+
+ /*
+	 * The 8bit integer fraction of a metric may not be accurate,
+	 * especially when the change is very small.
+	 * For example, if only a few bad_spec events happen, the fraction
+	 * may be reduced from 1 to 0. If so, the bad_spec event value
+	 * will be 0, which is definitely less than the last value.
+	 * Avoid updating event->count in this case.
+ */
+ if (delta > last) {
+ delta -= last;
+ local64_add(delta, &event->count);
+ }
+}
+
+static void update_saved_topdown_regs(struct perf_event *event, u64 slots,
+ u64 metrics, int metric_end)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct perf_event *other;
+ int idx;
+
+ event->hw.saved_slots = slots;
+ event->hw.saved_metric = metrics;
+
+ for_each_set_bit(idx, cpuc->active_mask, metric_end + 1) {
+ if (!is_topdown_idx(idx))
+ continue;
+ other = cpuc->events[idx];
+ other->hw.saved_slots = slots;
+ other->hw.saved_metric = metrics;
+ }
+}
+
+/*
+ * Update all active Topdown events.
+ *
+ * PERF_METRICS and fixed counter 3 are read separately. The values may be
+ * modified by an NMI. The PMU has to be disabled before calling this
+ * function.
+ */
+static u64 intel_update_topdown_event(struct perf_event *event, int metric_end, u64 *val)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct perf_event *other;
+ u64 slots, metrics;
+ bool reset = true;
+ int idx;
+
+ if (!val) {
+ /* read Fixed counter 3 */
+ slots = rdpmc(3 | INTEL_PMC_FIXED_RDPMC_BASE);
+ if (!slots)
+ return 0;
+
+ /* read PERF_METRICS */
+ metrics = rdpmc(INTEL_PMC_FIXED_RDPMC_METRICS);
+ } else {
+ slots = val[0];
+ metrics = val[1];
+ /*
+ * Don't reset the PERF_METRICS and Fixed counter 3
+ * for each PEBS record read. Utilize the RDPMC metrics
+ * clear mode.
+ */
+ reset = false;
+ }
+
+ for_each_set_bit(idx, cpuc->active_mask, metric_end + 1) {
+ if (!is_topdown_idx(idx))
+ continue;
+ other = cpuc->events[idx];
+ __icl_update_topdown_event(other, slots, metrics,
+ event ? event->hw.saved_slots : 0,
+ event ? event->hw.saved_metric : 0);
+ }
+
+ /*
+ * Check and update this event, which may have been cleared
+	 * from active_mask, e.g. by x86_pmu_stop().
+ */
+ if (event && !test_bit(event->hw.idx, cpuc->active_mask)) {
+ __icl_update_topdown_event(event, slots, metrics,
+ event->hw.saved_slots,
+ event->hw.saved_metric);
+
+ /*
+		 * In x86_pmu_stop(), the event is cleared in active_mask first,
+		 * then the delta is drained, which indicates a context switch
+		 * for counting.
+		 * Save the metric and slots values for the context switch.
+		 * There is no need to reset PERF_METRICS and fixed counter 3,
+		 * because the values will be restored on the next schedule in.
+ */
+ update_saved_topdown_regs(event, slots, metrics, metric_end);
+ reset = false;
+ }
+
+ if (reset) {
+ /* The fixed counter 3 has to be written before the PERF_METRICS. */
+ wrmsrq(MSR_CORE_PERF_FIXED_CTR3, 0);
+ wrmsrq(MSR_PERF_METRICS, 0);
+ if (event)
+ update_saved_topdown_regs(event, 0, 0, metric_end);
+ }
+
+ return slots;
+}
+
+static u64 icl_update_topdown_event(struct perf_event *event, u64 *val)
+{
+ return intel_update_topdown_event(event, INTEL_PMC_IDX_METRIC_BASE +
+ x86_pmu.num_topdown_events - 1,
+ val);
+}
+
+DEFINE_STATIC_CALL(intel_pmu_update_topdown_event, intel_pmu_topdown_event_update);
+
+static void intel_pmu_read_event(struct perf_event *event)
+{
+ if (event->hw.flags & (PERF_X86_EVENT_AUTO_RELOAD | PERF_X86_EVENT_TOPDOWN) ||
+ is_pebs_counter_event_group(event)) {
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ bool pmu_enabled = cpuc->enabled;
+
+ /* Only need to call update_topdown_event() once for group read. */
+ if (is_metric_event(event) && (cpuc->txn_flags & PERF_PMU_TXN_READ))
+ return;
+
+ cpuc->enabled = 0;
+ if (pmu_enabled)
+ intel_pmu_disable_all();
+
+ /*
+		 * If PEBS counter snapshotting is enabled, the topdown
+		 * event counts are available in the PEBS records.
+ */
+ if (is_topdown_count(event) && !is_pebs_counter_event_group(event))
+ static_call(intel_pmu_update_topdown_event)(event, NULL);
+ else
+ intel_pmu_drain_pebs_buffer();
+
+ cpuc->enabled = pmu_enabled;
+ if (pmu_enabled)
+ intel_pmu_enable_all(0);
+
+ return;
+ }
+
+ x86_perf_event_update(event);
+}
+
+static void intel_pmu_enable_fixed(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+ int idx = hwc->idx;
+ u64 bits = 0;
+
+	if (is_topdown_idx(idx)) {
+ /*
+ * When there are other active TopDown events,
+ * don't enable the fixed counter 3 again.
+ */
+ if (*(u64 *)cpuc->active_mask & INTEL_PMC_OTHER_TOPDOWN_BITS(idx))
+ return;
+
+ idx = INTEL_PMC_IDX_FIXED_SLOTS;
+
+ if (event->attr.config1 & INTEL_TD_CFG_METRIC_CLEAR)
+ bits |= INTEL_FIXED_3_METRICS_CLEAR;
+ }
+
+ intel_set_masks(event, idx);
+
+ /*
+ * Enable IRQ generation (0x8), if not PEBS,
+ * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
+ * if requested:
+ */
+ if (!event->attr.precise_ip)
+ bits |= INTEL_FIXED_0_ENABLE_PMI;
+ if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
+ bits |= INTEL_FIXED_0_USER;
+ if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
+ bits |= INTEL_FIXED_0_KERNEL;
+
+ /*
+ * ANY bit is supported in v3 and up
+ */
+ if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY)
+ bits |= INTEL_FIXED_0_ANYTHREAD;
+
+ idx -= INTEL_PMC_IDX_FIXED;
+ bits = intel_fixed_bits_by_idx(idx, bits);
+ if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip)
+ bits |= intel_fixed_bits_by_idx(idx, ICL_FIXED_0_ADAPTIVE);
+
+ cpuc->fixed_ctrl_val &= ~intel_fixed_bits_by_idx(idx, INTEL_FIXED_BITS_MASK);
+ cpuc->fixed_ctrl_val |= bits;
+}
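+
+/*
+ * Example (illustrative): counting in ring 0 and ring 3 with PMI delivery
+ * on fixed counter 0 sets bits 0xb (KERNEL|USER|PMI) in the low nibble of
+ * MSR_ARCH_PERFMON_FIXED_CTR_CTRL; fixed counter N uses the N-th nibble,
+ * which is what intel_fixed_bits_by_idx() shifts into place.
+ */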
+
+static void intel_pmu_config_acr(int idx, u64 mask, u32 reload)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int msr_b, msr_c;
+ int msr_offset;
+
+ if (!mask && !cpuc->acr_cfg_b[idx])
+ return;
+
+ if (idx < INTEL_PMC_IDX_FIXED) {
+ msr_b = MSR_IA32_PMC_V6_GP0_CFG_B;
+ msr_c = MSR_IA32_PMC_V6_GP0_CFG_C;
+ msr_offset = x86_pmu.addr_offset(idx, false);
+ } else {
+ msr_b = MSR_IA32_PMC_V6_FX0_CFG_B;
+ msr_c = MSR_IA32_PMC_V6_FX0_CFG_C;
+ msr_offset = x86_pmu.addr_offset(idx - INTEL_PMC_IDX_FIXED, false);
+ }
+
+ if (cpuc->acr_cfg_b[idx] != mask) {
+		wrmsrq(msr_b + msr_offset, mask);
+ cpuc->acr_cfg_b[idx] = mask;
+ }
+ /* Only need to update the reload value when there is a valid config value. */
+ if (mask && cpuc->acr_cfg_c[idx] != reload) {
+		wrmsrq(msr_c + msr_offset, reload);
+ cpuc->acr_cfg_c[idx] = reload;
+ }
+}
+
+static void intel_pmu_enable_acr(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (!is_acr_event_group(event) || !event->attr.config2) {
+ /*
+		 * Disabling an event doesn't clear the ACR CFG register,
+		 * so check and clear it here.
+ */
+ intel_pmu_config_acr(hwc->idx, 0, 0);
+ return;
+ }
+
+ intel_pmu_config_acr(hwc->idx, hwc->config1, -hwc->sample_period);
+}
+
+DEFINE_STATIC_CALL_NULL(intel_pmu_enable_acr_event, intel_pmu_enable_acr);
+
+static void intel_pmu_enable_event_ext(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+ union arch_pebs_index old, new;
+ struct arch_pebs_cap cap;
+ u64 ext = 0;
+
+ cap = hybrid(cpuc->pmu, arch_pebs_cap);
+
+ if (event->attr.precise_ip) {
+ u64 pebs_data_cfg = intel_get_arch_pebs_data_config(event);
+
+ ext |= ARCH_PEBS_EN;
+ if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD)
+ ext |= (-hwc->sample_period) & ARCH_PEBS_RELOAD;
+
+ if (pebs_data_cfg && cap.caps) {
+ if (pebs_data_cfg & PEBS_DATACFG_MEMINFO)
+ ext |= ARCH_PEBS_AUX & cap.caps;
+
+ if (pebs_data_cfg & PEBS_DATACFG_GP)
+ ext |= ARCH_PEBS_GPR & cap.caps;
+
+ if (pebs_data_cfg & PEBS_DATACFG_XMMS)
+ ext |= ARCH_PEBS_VECR_XMM & cap.caps;
+
+ if (pebs_data_cfg & PEBS_DATACFG_LBRS)
+ ext |= ARCH_PEBS_LBR & cap.caps;
+
+ if (pebs_data_cfg &
+ (PEBS_DATACFG_CNTR_MASK << PEBS_DATACFG_CNTR_SHIFT))
+ ext |= ARCH_PEBS_CNTR_GP & cap.caps;
+
+ if (pebs_data_cfg &
+ (PEBS_DATACFG_FIX_MASK << PEBS_DATACFG_FIX_SHIFT))
+ ext |= ARCH_PEBS_CNTR_FIXED & cap.caps;
+
+ if (pebs_data_cfg & PEBS_DATACFG_METRICS)
+ ext |= ARCH_PEBS_CNTR_METRICS & cap.caps;
+ }
+
+ if (cpuc->n_pebs == cpuc->n_large_pebs)
+ new.thresh = ARCH_PEBS_THRESH_MULTI;
+ else
+ new.thresh = ARCH_PEBS_THRESH_SINGLE;
+
+ rdmsrq(MSR_IA32_PEBS_INDEX, old.whole);
+ if (new.thresh != old.thresh || !old.en) {
+ if (old.thresh == ARCH_PEBS_THRESH_MULTI && old.wr > 0) {
+ /*
+ * Large PEBS was enabled.
+				 * Drain the PEBS buffer before switching to single-record PEBS.
+ */
+ intel_pmu_drain_pebs_buffer();
+ } else {
+ new.wr = 0;
+ new.full = 0;
+ new.en = 1;
+ wrmsrq(MSR_IA32_PEBS_INDEX, new.whole);
+ }
+ }
+ }
+
+ if (is_pebs_counter_event_group(event))
+ ext |= ARCH_PEBS_CNTR_ALLOW;
+
+ if (cpuc->cfg_c_val[hwc->idx] != ext)
+ __intel_pmu_update_event_ext(hwc->idx, ext);
+}
+
+DEFINE_STATIC_CALL_NULL(intel_pmu_enable_event_ext, intel_pmu_enable_event_ext);
+
+static void intel_pmu_enable_event(struct perf_event *event)
+{
+ u64 enable_mask = ARCH_PERFMON_EVENTSEL_ENABLE;
+ struct hw_perf_event *hwc = &event->hw;
+ int idx = hwc->idx;
+
+ if (unlikely(event->attr.precise_ip))
+ static_call(x86_pmu_pebs_enable)(event);
+
+ switch (idx) {
+ case 0 ... INTEL_PMC_IDX_FIXED - 1:
+ if (branch_sample_counters(event))
+ enable_mask |= ARCH_PERFMON_EVENTSEL_BR_CNTR;
+ intel_set_masks(event, idx);
+ static_call_cond(intel_pmu_enable_acr_event)(event);
+ static_call_cond(intel_pmu_enable_event_ext)(event);
+ __x86_pmu_enable_event(hwc, enable_mask);
+ break;
+ case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1:
+ static_call_cond(intel_pmu_enable_acr_event)(event);
+ static_call_cond(intel_pmu_enable_event_ext)(event);
+ fallthrough;
+ case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END:
+ intel_pmu_enable_fixed(event);
+ break;
+ case INTEL_PMC_IDX_FIXED_BTS:
+ if (!__this_cpu_read(cpu_hw_events.enabled))
+ return;
+ intel_pmu_enable_bts(hwc->config);
+ break;
+ case INTEL_PMC_IDX_FIXED_VLBR:
+ intel_set_masks(event, idx);
+ break;
+ default:
+ pr_warn("Failed to enable the event with invalid index %d\n",
+ idx);
+ }
+}
+
+static void intel_pmu_acr_late_setup(struct cpu_hw_events *cpuc)
+{
+ struct perf_event *event, *leader;
+ int i, j, idx;
+
+ for (i = 0; i < cpuc->n_events; i++) {
+ leader = cpuc->event_list[i];
+ if (!is_acr_event_group(leader))
+ continue;
+
+ /* The ACR events must be contiguous. */
+ for (j = i; j < cpuc->n_events; j++) {
+ event = cpuc->event_list[j];
+ if (event->group_leader != leader->group_leader)
+ break;
+ for_each_set_bit(idx, (unsigned long *)&event->attr.config2, X86_PMC_IDX_MAX) {
+ if (i + idx >= cpuc->n_events ||
+ !is_acr_event_group(cpuc->event_list[i + idx]))
+ return;
+ __set_bit(cpuc->assign[i + idx], (unsigned long *)&event->hw.config1);
+ }
+ }
+ i = j - 1;
+ }
+}
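+
+/*
+ * Example (illustrative reading of the loop above): in a two-event ACR
+ * group where an event's attr.config2 has bit 1 set, "group member 1" is
+ * translated into that member's assigned counter index and the matching
+ * bit is set in the event's hw.config1, which intel_pmu_config_acr()
+ * later writes to the CFG_B MSR as the reload-trigger counter mask.
+ */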
+
+void intel_pmu_late_setup(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ if (!cpuc->n_late_setup)
+ return;
+
+ intel_pmu_pebs_late_setup(cpuc);
+ intel_pmu_acr_late_setup(cpuc);
+}
+
+static void intel_pmu_add_event(struct perf_event *event)
+{
+ if (event->attr.precise_ip)
+ intel_pmu_pebs_add(event);
+ if (intel_pmu_needs_branch_stack(event))
+ intel_pmu_lbr_add(event);
+ if (is_pebs_counter_event_group(event) ||
+ is_acr_event_group(event))
+ this_cpu_ptr(&cpu_hw_events)->n_late_setup++;
+}
+
+/*
+ * Save and restart an expired event. Called from NMI context,
+ * so it has to be careful about preempting normal event ops:
+ */
+int intel_pmu_save_and_restart(struct perf_event *event)
+{
+ static_call(x86_pmu_update)(event);
+ /*
+ * For a checkpointed counter always reset back to 0. This
+ * avoids a situation where the counter overflows, aborts the
+ * transaction and is then set back to shortly before the
+ * overflow, and overflows and aborts again.
+ */
+ if (unlikely(event_is_checkpointed(event))) {
+ /* No race with NMIs because the counter should not be armed */
+ wrmsrq(event->hw.event_base, 0);
+ local64_set(&event->hw.prev_count, 0);
+ }
+ return static_call(x86_pmu_set_period)(event);
+}
+
+static int intel_pmu_set_period(struct perf_event *event)
+{
+ if (unlikely(is_topdown_count(event)))
+ return static_call(intel_pmu_set_topdown_event_period)(event);
+
+ return x86_perf_event_set_period(event);
+}
+
+static u64 intel_pmu_update(struct perf_event *event)
+{
+ if (unlikely(is_topdown_count(event)))
+ return static_call(intel_pmu_update_topdown_event)(event, NULL);
+
+ return x86_perf_event_update(event);
+}
+
+static void intel_pmu_reset(void)
+{
+ struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ unsigned long *cntr_mask = hybrid(cpuc->pmu, cntr_mask);
+ unsigned long *fixed_cntr_mask = hybrid(cpuc->pmu, fixed_cntr_mask);
+ unsigned long flags;
+ int idx;
+
+ if (!*(u64 *)cntr_mask)
+ return;
+
+ local_irq_save(flags);
+
+ pr_info("clearing PMU state on CPU#%d\n", smp_processor_id());
+
+ for_each_set_bit(idx, cntr_mask, INTEL_PMC_MAX_GENERIC) {
+ wrmsrq_safe(x86_pmu_config_addr(idx), 0ull);
+ wrmsrq_safe(x86_pmu_event_addr(idx), 0ull);
+ }
+ for_each_set_bit(idx, fixed_cntr_mask, INTEL_PMC_MAX_FIXED) {
+ if (fixed_counter_disabled(idx, cpuc->pmu))
+ continue;
+ wrmsrq_safe(x86_pmu_fixed_ctr_addr(idx), 0ull);
+ }
+
+ if (ds)
+ ds->bts_index = ds->bts_buffer_base;
+
+ /* Ack all overflows and disable fixed counters */
+ if (x86_pmu.version >= 2) {
+ intel_pmu_ack_status(intel_pmu_get_status());
+ wrmsrq(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+ }
+
+ /* Reset LBRs and LBR freezing */
+ if (x86_pmu.lbr_nr) {
+ update_debugctlmsr(get_debugctlmsr() &
+ ~(DEBUGCTLMSR_FREEZE_LBRS_ON_PMI|DEBUGCTLMSR_LBR));
+ }
+
+ local_irq_restore(flags);
+}
+
+/*
+ * We may be running with guest PEBS events created by KVM, and the
+ * PEBS records are logged into the guest's DS and invisible to host.
+ *
+ * In the case of guest PEBS overflow, we only trigger a fake event
+ * to emulate the PEBS overflow PMI for guest PEBS counters in KVM.
+ * The guest will then vm-enter and check the guest DS area to read
+ * the guest PEBS records.
+ *
+ * The contents and other behavior of the guest event do not matter.
+ */
+static void x86_pmu_handle_guest_pebs(struct pt_regs *regs,
+ struct perf_sample_data *data)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ u64 guest_pebs_idxs = cpuc->pebs_enabled & ~cpuc->intel_ctrl_host_mask;
+ struct perf_event *event = NULL;
+ int bit;
+
+ if (!unlikely(perf_guest_state()))
+ return;
+
+ if (!x86_pmu.pebs_ept || !x86_pmu.pebs_active ||
+ !guest_pebs_idxs)
+ return;
+
+ for_each_set_bit(bit, (unsigned long *)&guest_pebs_idxs, X86_PMC_IDX_MAX) {
+ event = cpuc->events[bit];
+ if (!event->attr.precise_ip)
+ continue;
+
+ perf_sample_data_init(data, 0, event->hw.last_period);
+ perf_event_overflow(event, data, regs);
+
+		/* Injecting one fake event is enough. */
+ break;
+ }
+}
+
+static int handle_pmi_common(struct pt_regs *regs, u64 status)
+{
+ struct perf_sample_data data;
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int bit;
+ int handled = 0;
+
+ inc_irq_stat(apic_perf_irqs);
+
+ /*
+ * Ignore a range of extra bits in status that do not indicate
+ * overflow by themselves.
+ */
+ status &= ~(GLOBAL_STATUS_COND_CHG |
+ GLOBAL_STATUS_ASIF |
+ GLOBAL_STATUS_LBRS_FROZEN);
+ if (!status)
+ return 0;
+	/*
+	 * In case multiple PEBS events are sampled at the same time,
+	 * it is possible to have GLOBAL_STATUS bit 62 set indicating
+	 * PEBS buffer overflow and also seeing at most 3 PEBS counters
+	 * having their bits set in the status register. This is a sign
+	 * that there was at least one PEBS record pending at the time
+	 * of the PMU interrupt. PEBS counters must only be processed
+	 * via the drain_pebs() calls and not via the regular sample
+	 * processing loop that comes after them; otherwise phony regular
+	 * samples not marked with the EXACT tag may be generated in the
+	 * sampling buffer. Another possibility is to have one PEBS event
+	 * and at least one non-PEBS event which overflows while PEBS is
+	 * armed. In this case, bit 62 of GLOBAL_STATUS will not be set,
+	 * yet on Skylake the overflow status bit for the PEBS counter
+	 * will be set.
+	 *
+	 * To avoid this problem, we systematically ignore the PEBS-enabled
+	 * counters from the GLOBAL_STATUS mask and we always process PEBS
+	 * events via drain_pebs().
+	 */
+ status &= ~(cpuc->pebs_enabled & x86_pmu.pebs_capable);
+
+ /*
+ * PEBS overflow sets bit 62 in the global status register
+ */
+ if (__test_and_clear_bit(GLOBAL_STATUS_BUFFER_OVF_BIT, (unsigned long *)&status)) {
+ u64 pebs_enabled = cpuc->pebs_enabled;
+
+ handled++;
+ x86_pmu_handle_guest_pebs(regs, &data);
+ static_call(x86_pmu_drain_pebs)(regs, &data);
+
+ /*
+		 * PMI throttling may be triggered, which stops the PEBS event.
+		 * Although cpuc->pebs_enabled is updated accordingly,
+		 * MSR_IA32_PEBS_ENABLE is not, because cpuc->enabled has been
+		 * forced to 0 in the PMI.
+		 * Update the MSR if pebs_enabled has changed.
+ */
+ if (pebs_enabled != cpuc->pebs_enabled)
+ wrmsrq(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
+
+ /*
+		 * The PEBS handler above (PEBS counter snapshotting) has already
+		 * updated fixed counter 3 and the perf metrics counts if they are
+		 * in a counter group, so there is no need to update them again.
+ */
+ if (cpuc->events[INTEL_PMC_IDX_FIXED_SLOTS] &&
+ is_pebs_counter_event_group(cpuc->events[INTEL_PMC_IDX_FIXED_SLOTS]))
+ status &= ~GLOBAL_STATUS_PERF_METRICS_OVF_BIT;
+ }
+
+ /*
+ * Arch PEBS sets bit 54 in the global status register
+ */
+ if (__test_and_clear_bit(GLOBAL_STATUS_ARCH_PEBS_THRESHOLD_BIT,
+ (unsigned long *)&status)) {
+ handled++;
+ static_call(x86_pmu_drain_pebs)(regs, &data);
+
+ if (cpuc->events[INTEL_PMC_IDX_FIXED_SLOTS] &&
+ is_pebs_counter_event_group(cpuc->events[INTEL_PMC_IDX_FIXED_SLOTS]))
+ status &= ~GLOBAL_STATUS_PERF_METRICS_OVF_BIT;
+ }
+
+ /*
+ * Intel PT
+ */
+ if (__test_and_clear_bit(GLOBAL_STATUS_TRACE_TOPAPMI_BIT, (unsigned long *)&status)) {
+ handled++;
+ if (!perf_guest_handle_intel_pt_intr())
+ intel_pt_interrupt();
+ }
+
+ /*
+ * Intel Perf metrics
+ */
+ if (__test_and_clear_bit(GLOBAL_STATUS_PERF_METRICS_OVF_BIT, (unsigned long *)&status)) {
+ handled++;
+ static_call(intel_pmu_update_topdown_event)(NULL, NULL);
+ }
+
+ status &= hybrid(cpuc->pmu, intel_ctrl);
+
+ /*
+ * Checkpointed counters can lead to 'spurious' PMIs because the
+ * rollback caused by the PMI will have cleared the overflow status
+ * bit. Therefore always force probe these counters.
+ */
+ status |= cpuc->intel_cp_status;
+
+ for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
+ struct perf_event *event = cpuc->events[bit];
+ u64 last_period;
+
+ handled++;
+
+ if (!test_bit(bit, cpuc->active_mask))
+ continue;
+
+ /*
+ * There may be unprocessed PEBS records in the PEBS buffer,
+ * which still stores the previous values.
+ * Process those records first before handling the latest value.
+ * For example,
+ * A is a regular counter
+ * B is a PEBS event which reads A
+ * C is a PEBS event
+ *
+ * The following can happen:
+ * B-assist A=1
+ * C A=2
+ * B-assist A=3
+ * A-overflow-PMI A=4
+ * C-assist-PMI (PEBS buffer) A=5
+ *
+ * The PEBS buffer has to be drained before handling the A-PMI
+ */
+ if (is_pebs_counter_event_group(event))
+ static_call(x86_pmu_drain_pebs)(regs, &data);
+
+ last_period = event->hw.last_period;
+
+ if (!intel_pmu_save_and_restart(event))
+ continue;
+
+ perf_sample_data_init(&data, 0, last_period);
+
+ if (has_branch_stack(event))
+ intel_pmu_lbr_save_brstack(&data, cpuc, event);
+
+ perf_event_overflow(event, &data, regs);
+ }
+
+ return handled;
+}
+
+/*
+ * This handler is triggered by the local APIC, so the APIC IRQ handling
+ * rules apply:
+ */
+static int intel_pmu_handle_irq(struct pt_regs *regs)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ bool late_ack = hybrid_bit(cpuc->pmu, late_ack);
+ bool mid_ack = hybrid_bit(cpuc->pmu, mid_ack);
+ int loops;
+ u64 status;
+ int handled;
+ int pmu_enabled;
+
+ /*
+ * Save the PMU state.
+ * It needs to be restored when leaving the handler.
+ */
+ pmu_enabled = cpuc->enabled;
+ /*
+	 * In general, the early ACK is only applied on old platforms.
+	 * For big cores starting from Haswell, the late ACK should be
+	 * applied.
+	 * For small cores after Tremont, we have to do the ACK right
+ * before re-enabling counters, which is in the middle of the
+ * NMI handler.
+ */
+ if (!late_ack && !mid_ack)
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ intel_bts_disable_local();
+ cpuc->enabled = 0;
+ __intel_pmu_disable_all(true);
+ handled = intel_pmu_drain_bts_buffer();
+ handled += intel_bts_interrupt();
+ status = intel_pmu_get_status();
+ if (!status)
+ goto done;
+
+ loops = 0;
+again:
+ intel_pmu_lbr_read();
+ intel_pmu_ack_status(status);
+ if (++loops > 100) {
+ static bool warned;
+
+ if (!warned) {
+ WARN(1, "perfevents: irq loop stuck!\n");
+ perf_event_print_debug();
+ warned = true;
+ }
+ intel_pmu_reset();
+ goto done;
+ }
+
+ handled += handle_pmi_common(regs, status);
+
+ /*
+ * Repeat if there is more work to be done:
+ */
+ status = intel_pmu_get_status();
+ if (status)
+ goto again;
+
+done:
+ if (mid_ack)
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ /* Only restore PMU state when it's active. See x86_pmu_disable(). */
+ cpuc->enabled = pmu_enabled;
+ if (pmu_enabled)
+ __intel_pmu_enable_all(0, true);
+ intel_bts_enable_local();
+
+ /*
+ * Only unmask the NMI after the overflow counters
+ * have been reset. This avoids spurious NMIs on
+ * Haswell CPUs.
+ */
+ if (late_ack)
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ return handled;
+}
+
+static struct event_constraint *
+intel_bts_constraints(struct perf_event *event)
+{
+ if (unlikely(intel_pmu_has_bts(event)))
+ return &bts_constraint;
+
+ return NULL;
+}
+
+/*
+ * Note: matches a fake event, like Fixed2.
+ */
+static struct event_constraint *
+intel_vlbr_constraints(struct perf_event *event)
+{
+ struct event_constraint *c = &vlbr_constraint;
+
+ if (unlikely(constraint_match(c, event->hw.config))) {
+ event->hw.flags |= c->flags;
+ return c;
+ }
+
+ return NULL;
+}
+
+static int intel_alt_er(struct cpu_hw_events *cpuc,
+ int idx, u64 config)
+{
+ struct extra_reg *extra_regs = hybrid(cpuc->pmu, extra_regs);
+ int alt_idx = idx;
+
+ if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1))
+ return idx;
+
+ if (idx == EXTRA_REG_RSP_0)
+ alt_idx = EXTRA_REG_RSP_1;
+
+ if (idx == EXTRA_REG_RSP_1)
+ alt_idx = EXTRA_REG_RSP_0;
+
+ if (config & ~extra_regs[alt_idx].valid_mask)
+ return idx;
+
+ return alt_idx;
+}
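+
+/*
+ * Example (illustrative): if two offcore-response events are scheduled and
+ * the first owns MSR_OFFCORE_RSP_0, intel_alt_er() lets the second fall
+ * back to MSR_OFFCORE_RSP_1, provided its config fits the alternate
+ * register's valid_mask.
+ */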
+
+static void intel_fixup_er(struct perf_event *event, int idx)
+{
+	struct extra_reg *extra_regs = hybrid(event->pmu, extra_regs);
+
+	event->hw.extra_reg.idx = idx;
+
+ if (idx == EXTRA_REG_RSP_0) {
+ event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
+ event->hw.config |= extra_regs[EXTRA_REG_RSP_0].event;
+ event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0;
+ } else if (idx == EXTRA_REG_RSP_1) {
+ event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
+ event->hw.config |= extra_regs[EXTRA_REG_RSP_1].event;
+ event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
+ }
+}
+
+/*
+ * manage allocation of shared extra msr for certain events
+ *
+ * sharing can be:
+ * per-cpu: to be shared between the various events on a single PMU
+ * per-core: per-cpu + shared by HT threads
+ */
+static struct event_constraint *
+__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event,
+ struct hw_perf_event_extra *reg)
+{
+ struct event_constraint *c = &emptyconstraint;
+ struct er_account *era;
+ unsigned long flags;
+ int idx = reg->idx;
+
+ /*
+ * reg->alloc can be set due to existing state, so for fake cpuc we
+ * need to ignore this, otherwise we might fail to allocate proper fake
+ * state for this extra reg constraint. Also see the comment below.
+ */
+ if (reg->alloc && !cpuc->is_fake)
+ return NULL; /* call x86_get_event_constraint() */
+
+again:
+ era = &cpuc->shared_regs->regs[idx];
+ /*
+ * we use spin_lock_irqsave() to avoid lockdep issues when
+ * passing a fake cpuc
+ */
+ raw_spin_lock_irqsave(&era->lock, flags);
+
+ if (!atomic_read(&era->ref) || era->config == reg->config) {
+
+ /*
+		 * If it's a fake cpuc -- as per validate_{group,event}() we
+ * shouldn't touch event state and we can avoid doing so
+ * since both will only call get_event_constraints() once
+ * on each event, this avoids the need for reg->alloc.
+ *
+ * Not doing the ER fixup will only result in era->reg being
+ * wrong, but since we won't actually try and program hardware
+ * this isn't a problem either.
+ */
+ if (!cpuc->is_fake) {
+ if (idx != reg->idx)
+ intel_fixup_er(event, idx);
+
+ /*
+ * x86_schedule_events() can call get_event_constraints()
+ * multiple times on events in the case of incremental
+			 * scheduling. reg->alloc ensures we only do the ER
+ * allocation once.
+ */
+ reg->alloc = 1;
+ }
+
+ /* lock in msr value */
+ era->config = reg->config;
+ era->reg = reg->reg;
+
+ /* one more user */
+ atomic_inc(&era->ref);
+
+ /*
+ * need to call x86_get_event_constraint()
+ * to check if associated event has constraints
+ */
+ c = NULL;
+ } else {
+ idx = intel_alt_er(cpuc, idx, reg->config);
+ if (idx != reg->idx) {
+ raw_spin_unlock_irqrestore(&era->lock, flags);
+ goto again;
+ }
+ }
+ raw_spin_unlock_irqrestore(&era->lock, flags);
+
+ return c;
+}
+
+static void
+__intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc,
+ struct hw_perf_event_extra *reg)
+{
+ struct er_account *era;
+
+ /*
+	 * Only put the constraint if the extra reg was actually allocated. Also
+	 * takes care of events which do not use an extra shared reg.
+ *
+ * Also, if this is a fake cpuc we shouldn't touch any event state
+ * (reg->alloc) and we don't care about leaving inconsistent cpuc state
+ * either since it'll be thrown out.
+ */
+ if (!reg->alloc || cpuc->is_fake)
+ return;
+
+ era = &cpuc->shared_regs->regs[reg->idx];
+
+ /* one fewer user */
+ atomic_dec(&era->ref);
+
+ /* allocate again next time */
+ reg->alloc = 0;
+}
+
+static struct event_constraint *
+intel_shared_regs_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+ struct event_constraint *c = NULL, *d;
+ struct hw_perf_event_extra *xreg, *breg;
+
+ xreg = &event->hw.extra_reg;
+ if (xreg->idx != EXTRA_REG_NONE) {
+ c = __intel_shared_reg_get_constraints(cpuc, event, xreg);
+ if (c == &emptyconstraint)
+ return c;
+ }
+ breg = &event->hw.branch_reg;
+ if (breg->idx != EXTRA_REG_NONE) {
+ d = __intel_shared_reg_get_constraints(cpuc, event, breg);
+ if (d == &emptyconstraint) {
+ __intel_shared_reg_put_constraints(cpuc, xreg);
+ c = d;
+ }
+ }
+ return c;
+}
+
+struct event_constraint *
+x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct event_constraint *event_constraints = hybrid(cpuc->pmu, event_constraints);
+ struct event_constraint *c;
+
+ if (event_constraints) {
+ for_each_event_constraint(c, event_constraints) {
+ if (constraint_match(c, event->hw.config)) {
+ event->hw.flags |= c->flags;
+ return c;
+ }
+ }
+ }
+
+ return &hybrid_var(cpuc->pmu, unconstrained);
+}
+
+static struct event_constraint *
+__intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct event_constraint *c;
+
+ c = intel_vlbr_constraints(event);
+ if (c)
+ return c;
+
+ c = intel_bts_constraints(event);
+ if (c)
+ return c;
+
+ c = intel_shared_regs_constraints(cpuc, event);
+ if (c)
+ return c;
+
+ c = intel_pebs_constraints(event);
+ if (c)
+ return c;
+
+ return x86_get_event_constraints(cpuc, idx, event);
+}
+
+static void
+intel_start_scheduling(struct cpu_hw_events *cpuc)
+{
+ struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+ struct intel_excl_states *xl;
+ int tid = cpuc->excl_thread_id;
+
+ /*
+ * nothing needed if in group validation mode
+ */
+ if (cpuc->is_fake || !is_ht_workaround_enabled())
+ return;
+
+ /*
+ * no exclusion needed
+ */
+ if (WARN_ON_ONCE(!excl_cntrs))
+ return;
+
+ xl = &excl_cntrs->states[tid];
+
+ xl->sched_started = true;
+ /*
+	 * lock shared state until we are done scheduling,
+	 * which happens in intel_stop_scheduling(); this
+	 * makes scheduling appear as a transaction
+ */
+ raw_spin_lock(&excl_cntrs->lock);
+}
+
+static void intel_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr)
+{
+ struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+ struct event_constraint *c = cpuc->event_constraint[idx];
+ struct intel_excl_states *xl;
+ int tid = cpuc->excl_thread_id;
+
+ if (cpuc->is_fake || !is_ht_workaround_enabled())
+ return;
+
+ if (WARN_ON_ONCE(!excl_cntrs))
+ return;
+
+ if (!(c->flags & PERF_X86_EVENT_DYNAMIC))
+ return;
+
+ xl = &excl_cntrs->states[tid];
+
+ lockdep_assert_held(&excl_cntrs->lock);
+
+ if (c->flags & PERF_X86_EVENT_EXCL)
+ xl->state[cntr] = INTEL_EXCL_EXCLUSIVE;
+ else
+ xl->state[cntr] = INTEL_EXCL_SHARED;
+}
+
+static void
+intel_stop_scheduling(struct cpu_hw_events *cpuc)
+{
+ struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+ struct intel_excl_states *xl;
+ int tid = cpuc->excl_thread_id;
+
+ /*
+ * nothing needed if in group validation mode
+ */
+ if (cpuc->is_fake || !is_ht_workaround_enabled())
+ return;
+ /*
+ * no exclusion needed
+ */
+ if (WARN_ON_ONCE(!excl_cntrs))
+ return;
+
+ xl = &excl_cntrs->states[tid];
+
+ xl->sched_started = false;
+ /*
+ * release shared state lock (acquired in intel_start_scheduling())
+ */
+ raw_spin_unlock(&excl_cntrs->lock);
+}
+
+static struct event_constraint *
+dyn_constraint(struct cpu_hw_events *cpuc, struct event_constraint *c, int idx)
+{
+ WARN_ON_ONCE(!cpuc->constraint_list);
+
+ if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) {
+ struct event_constraint *cx;
+
+ /*
+ * grab pre-allocated constraint entry
+ */
+ cx = &cpuc->constraint_list[idx];
+
+ /*
+ * initialize dynamic constraint
+ * with static constraint
+ */
+ *cx = *c;
+
+ /*
+ * mark constraint as dynamic
+ */
+ cx->flags |= PERF_X86_EVENT_DYNAMIC;
+ c = cx;
+ }
+
+ return c;
+}
+
+static struct event_constraint *
+intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
+ int idx, struct event_constraint *c)
+{
+ struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+ struct intel_excl_states *xlo;
+ int tid = cpuc->excl_thread_id;
+ int is_excl, i, w;
+
+ /*
+ * validating a group does not require
+ * enforcing cross-thread exclusion
+ */
+ if (cpuc->is_fake || !is_ht_workaround_enabled())
+ return c;
+
+ /*
+ * no exclusion needed
+ */
+ if (WARN_ON_ONCE(!excl_cntrs))
+ return c;
+
+ /*
+ * because we modify the constraint, we need
+ * to make a copy. Static constraints come
+ * from static const tables.
+ *
+ * only needed when constraint has not yet
+ * been cloned (marked dynamic)
+ */
+ c = dyn_constraint(cpuc, c, idx);
+
+ /*
+ * From here on, the constraint is dynamic.
+ * Either it was just allocated above, or it
+	 * was allocated during an earlier invocation
+ * of this function
+ */
+
+ /*
+ * state of sibling HT
+ */
+ xlo = &excl_cntrs->states[tid ^ 1];
+
+ /*
+ * event requires exclusive counter access
+ * across HT threads
+ */
+ is_excl = c->flags & PERF_X86_EVENT_EXCL;
+ if (is_excl && !(event->hw.flags & PERF_X86_EVENT_EXCL_ACCT)) {
+ event->hw.flags |= PERF_X86_EVENT_EXCL_ACCT;
+ if (!cpuc->n_excl++)
+ WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1);
+ }
+
+ /*
+ * Modify static constraint with current dynamic
+ * state of thread
+ *
+ * EXCLUSIVE: sibling counter measuring exclusive event
+ * SHARED : sibling counter measuring non-exclusive event
+ * UNUSED : sibling counter unused
+ */
+ w = c->weight;
+ for_each_set_bit(i, c->idxmsk, X86_PMC_IDX_MAX) {
+ /*
+ * exclusive event in sibling counter
+ * our corresponding counter cannot be used
+ * regardless of our event
+ */
+ if (xlo->state[i] == INTEL_EXCL_EXCLUSIVE) {
+ __clear_bit(i, c->idxmsk);
+ w--;
+ continue;
+ }
+ /*
+		 * if we are measuring an exclusive event and the
+		 * sibling is measuring a non-exclusive one, then
+		 * the counter cannot be used
+ */
+ if (is_excl && xlo->state[i] == INTEL_EXCL_SHARED) {
+ __clear_bit(i, c->idxmsk);
+ w--;
+ continue;
+ }
+ }
+
+ /*
+ * if we return an empty mask, then switch
+ * back to static empty constraint to avoid
+ * the cost of freeing later on
+ */
+ if (!w)
+ c = &emptyconstraint;
+
+ c->weight = w;
+
+ return c;
+}
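+
+/*
+ * Worked example for the mask filtering above (illustrative, not part
+ * of the original code): assume this event's constraint allows counters
+ * 0-3 (idxmsk = 0xf, weight 4) and the sibling thread currently holds
+ * counter 2 in the EXCLUSIVE state. The loop clears bit 2, leaving
+ * idxmsk = 0xb and weight 3, so only counters 0, 1 and 3 remain
+ * candidates for this event.
+ */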
+
+static struct event_constraint *
+intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct event_constraint *c1, *c2;
+
+ c1 = cpuc->event_constraint[idx];
+
+ /*
+ * first time only
+ * - static constraint: no change across incremental scheduling calls
+ * - dynamic constraint: handled by intel_get_excl_constraints()
+ */
+ c2 = __intel_get_event_constraints(cpuc, idx, event);
+ if (c1) {
+ WARN_ON_ONCE(!(c1->flags & PERF_X86_EVENT_DYNAMIC));
+ bitmap_copy(c1->idxmsk, c2->idxmsk, X86_PMC_IDX_MAX);
+ c1->weight = c2->weight;
+ c2 = c1;
+ }
+
+ if (cpuc->excl_cntrs)
+ return intel_get_excl_constraints(cpuc, event, idx, c2);
+
+ if (event->hw.dyn_constraint != ~0ULL) {
+ c2 = dyn_constraint(cpuc, c2, idx);
+ c2->idxmsk64 &= event->hw.dyn_constraint;
+ c2->weight = hweight64(c2->idxmsk64);
+ }
+
+ return c2;
+}
+
+static void intel_put_excl_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+ int tid = cpuc->excl_thread_id;
+ struct intel_excl_states *xl;
+
+ /*
+ * nothing needed if in group validation mode
+ */
+ if (cpuc->is_fake)
+ return;
+
+ if (WARN_ON_ONCE(!excl_cntrs))
+ return;
+
+ if (hwc->flags & PERF_X86_EVENT_EXCL_ACCT) {
+ hwc->flags &= ~PERF_X86_EVENT_EXCL_ACCT;
+ if (!--cpuc->n_excl)
+ WRITE_ONCE(excl_cntrs->has_exclusive[tid], 0);
+ }
+
+ /*
+ * If event was actually assigned, then mark the counter state as
+ * unused now.
+ */
+ if (hwc->idx >= 0) {
+ xl = &excl_cntrs->states[tid];
+
+ /*
+		 * put_constraint() may be called from x86_schedule_events(),
+		 * which already holds the lock, so make the locking
+		 * conditional here.
+ */
+ if (!xl->sched_started)
+ raw_spin_lock(&excl_cntrs->lock);
+
+ xl->state[hwc->idx] = INTEL_EXCL_UNUSED;
+
+ if (!xl->sched_started)
+ raw_spin_unlock(&excl_cntrs->lock);
+ }
+}
+
+static void
+intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg;
+
+ reg = &event->hw.extra_reg;
+ if (reg->idx != EXTRA_REG_NONE)
+ __intel_shared_reg_put_constraints(cpuc, reg);
+
+ reg = &event->hw.branch_reg;
+ if (reg->idx != EXTRA_REG_NONE)
+ __intel_shared_reg_put_constraints(cpuc, reg);
+}
+
+static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+ intel_put_shared_regs_event_constraints(cpuc, event);
+
+ /*
+	 * if the PMU has exclusive counter restrictions, then
+	 * all events are subject to them and must call the
+	 * put_excl_constraints() routine
+ */
+ if (cpuc->excl_cntrs)
+ intel_put_excl_constraints(cpuc, event);
+}
+
+static void intel_pebs_aliases_core2(struct perf_event *event)
+{
+ if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
+ /*
+ * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
+ * (0x003c) so that we can use it with PEBS.
+ *
+ * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
+ * PEBS capable. However we can use INST_RETIRED.ANY_P
+ * (0x00c0), which is a PEBS capable event, to get the same
+ * count.
+ *
+		 * INST_RETIRED.ANY_P counts the number of cycles that retire
+ * CNTMASK instructions. By setting CNTMASK to a value (16)
+ * larger than the maximum number of instructions that can be
+ * retired per cycle (4) and then inverting the condition, we
+		 * count all cycles that retire fewer than 16 instructions, which
+ * is every cycle.
+ *
+ * Thereby we gain a PEBS capable cycle counter.
+ */
+ u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16);
+
+ alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
+ event->hw.config = alt_config;
+ }
+}
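+
+/*
+ * Illustrative expansion of the alternative encoding above (assuming the
+ * architectural PERFEVTSEL layout: event select in bits 0-7, INV in bit
+ * 23, CMASK in bits 24-31):
+ *
+ *   X86_CONFIG(.event=0xc0, .inv=1, .cmask=16)
+ *     = 0xc0 | (1ULL << 23) | (16ULL << 24) = 0x108000c0
+ *
+ * i.e. count cycles in which fewer than 16 instructions retire, which on
+ * a machine that retires at most 4 per cycle is every cycle.
+ */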
+
+static void intel_pebs_aliases_snb(struct perf_event *event)
+{
+ if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
+ /*
+ * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
+ * (0x003c) so that we can use it with PEBS.
+ *
+ * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
+ * PEBS capable. However we can use UOPS_RETIRED.ALL
+ * (0x01c2), which is a PEBS capable event, to get the same
+ * count.
+ *
+		 * UOPS_RETIRED.ALL counts the number of cycles that retire
+ * CNTMASK micro-ops. By setting CNTMASK to a value (16)
+ * larger than the maximum number of micro-ops that can be
+ * retired per cycle (4) and then inverting the condition, we
+		 * count all cycles that retire fewer than 16 micro-ops, which
+ * is every cycle.
+ *
+ * Thereby we gain a PEBS capable cycle counter.
+ */
+ u64 alt_config = X86_CONFIG(.event=0xc2, .umask=0x01, .inv=1, .cmask=16);
+
+ alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
+ event->hw.config = alt_config;
+ }
+}
+
+static void intel_pebs_aliases_precdist(struct perf_event *event)
+{
+ if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
+ /*
+ * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
+ * (0x003c) so that we can use it with PEBS.
+ *
+ * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
+ * PEBS capable. However we can use INST_RETIRED.PREC_DIST
+ * (0x01c0), which is a PEBS capable event, to get the same
+ * count.
+ *
+ * The PREC_DIST event has special support to minimize sample
+		 * shadowing effects. One drawback is that it can only be
+		 * programmed on counter 1, but that seems like an
+		 * acceptable trade-off.
+ */
+ u64 alt_config = X86_CONFIG(.event=0xc0, .umask=0x01, .inv=1, .cmask=16);
+
+ alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
+ event->hw.config = alt_config;
+ }
+}
+
+static void intel_pebs_aliases_ivb(struct perf_event *event)
+{
+ if (event->attr.precise_ip < 3)
+ return intel_pebs_aliases_snb(event);
+ return intel_pebs_aliases_precdist(event);
+}
+
+static void intel_pebs_aliases_skl(struct perf_event *event)
+{
+ if (event->attr.precise_ip < 3)
+ return intel_pebs_aliases_core2(event);
+ return intel_pebs_aliases_precdist(event);
+}
+
+static unsigned long intel_pmu_large_pebs_flags(struct perf_event *event)
+{
+ unsigned long flags = x86_pmu.large_pebs_flags;
+
+ if (event->attr.use_clockid)
+ flags &= ~PERF_SAMPLE_TIME;
+ if (!event->attr.exclude_kernel)
+ flags &= ~PERF_SAMPLE_REGS_USER;
+ if (event->attr.sample_regs_user & ~PEBS_GP_REGS)
+ flags &= ~PERF_SAMPLE_REGS_USER;
+ if (event->attr.sample_regs_intr & ~PEBS_GP_REGS)
+ flags &= ~PERF_SAMPLE_REGS_INTR;
+ return flags;
+}
+
+static int intel_pmu_bts_config(struct perf_event *event)
+{
+ struct perf_event_attr *attr = &event->attr;
+
+ if (unlikely(intel_pmu_has_bts(event))) {
+ /* BTS is not supported by this architecture. */
+ if (!x86_pmu.bts_active)
+ return -EOPNOTSUPP;
+
+ /* BTS is currently only allowed for user-mode. */
+ if (!attr->exclude_kernel)
+ return -EOPNOTSUPP;
+
+ /* BTS is not allowed for precise events. */
+ if (attr->precise_ip)
+ return -EOPNOTSUPP;
+
+ /* disallow bts if conflicting events are present */
+ if (x86_add_exclusive(x86_lbr_exclusive_lbr))
+ return -EBUSY;
+
+ event->destroy = hw_perf_lbr_event_destroy;
+ }
+
+ return 0;
+}
+
+static int core_pmu_hw_config(struct perf_event *event)
+{
+ int ret = x86_pmu_hw_config(event);
+
+ if (ret)
+ return ret;
+
+ return intel_pmu_bts_config(event);
+}
+
+#define INTEL_TD_METRIC_AVAILABLE_MAX (INTEL_TD_METRIC_RETIRING + \
+ ((x86_pmu.num_topdown_events - 1) << 8))
+
+static bool is_available_metric_event(struct perf_event *event)
+{
+ return is_metric_event(event) &&
+ event->attr.config <= INTEL_TD_METRIC_AVAILABLE_MAX;
+}
+
+static inline bool is_mem_loads_event(struct perf_event *event)
+{
+ return (event->attr.config & INTEL_ARCH_EVENT_MASK) == X86_CONFIG(.event=0xcd, .umask=0x01);
+}
+
+static inline bool is_mem_loads_aux_event(struct perf_event *event)
+{
+ return (event->attr.config & INTEL_ARCH_EVENT_MASK) == X86_CONFIG(.event=0x03, .umask=0x82);
+}
+
+static inline bool require_mem_loads_aux_event(struct perf_event *event)
+{
+ if (!(x86_pmu.flags & PMU_FL_MEM_LOADS_AUX))
+ return false;
+
+ if (is_hybrid())
+ return hybrid_pmu(event->pmu)->pmu_type == hybrid_big;
+
+ return true;
+}
+
+static inline bool intel_pmu_has_cap(struct perf_event *event, int idx)
+{
+ union perf_capabilities *intel_cap = &hybrid(event->pmu, intel_cap);
+
+ return test_bit(idx, (unsigned long *)&intel_cap->capabilities);
+}
+
+static u64 intel_pmu_freq_start_period(struct perf_event *event)
+{
+ int type = event->attr.type;
+ u64 config, factor;
+ s64 start;
+
+ /*
+	 * The 127 is the lowest possible recommended SAV (sample after value)
+	 * for a 4000 freq (the default freq), according to the event list JSON
+	 * file. Also, assume the workload is idle 50% of the time. For event
+	 * types that keep this fallback factor, the default freq then yields
+	 * 64 * 4000 / 4000 - 1 = 63, roughly half of 127.
+ */
+ factor = 64 * 4000;
+ if (type != PERF_TYPE_HARDWARE && type != PERF_TYPE_HW_CACHE)
+ goto end;
+
+ /*
+ * The estimation of the start period in the freq mode is
+ * based on the below assumption.
+ *
+	 * For a cycles or an instructions event, assume 1GHz for the
+	 * underlying platform and 1 IPC. The workload is idle 50% of the time.
+ * The start period = 1,000,000,000 * 1 / freq / 2.
+ * = 500,000,000 / freq
+ *
+	 * Usually, the branch-related events occur less often than the
+ * instructions event. According to the Intel event list JSON
+ * file, the SAV (sample after value) of a branch-related event
+ * is usually 1/4 of an instruction event.
+ * The start period of branch-related events = 125,000,000 / freq.
+ *
+	 * The cache-related events occur even less often. The SAV is usually
+ * 1/20 of an instruction event.
+ * The start period of cache-related events = 25,000,000 / freq.
+ */
+ config = event->attr.config & PERF_HW_EVENT_MASK;
+ if (type == PERF_TYPE_HARDWARE) {
+ switch (config) {
+ case PERF_COUNT_HW_CPU_CYCLES:
+ case PERF_COUNT_HW_INSTRUCTIONS:
+ case PERF_COUNT_HW_BUS_CYCLES:
+ case PERF_COUNT_HW_STALLED_CYCLES_FRONTEND:
+ case PERF_COUNT_HW_STALLED_CYCLES_BACKEND:
+ case PERF_COUNT_HW_REF_CPU_CYCLES:
+ factor = 500000000;
+ break;
+ case PERF_COUNT_HW_BRANCH_INSTRUCTIONS:
+ case PERF_COUNT_HW_BRANCH_MISSES:
+ factor = 125000000;
+ break;
+ case PERF_COUNT_HW_CACHE_REFERENCES:
+ case PERF_COUNT_HW_CACHE_MISSES:
+ factor = 25000000;
+ break;
+ default:
+ goto end;
+ }
+ }
+
+ if (type == PERF_TYPE_HW_CACHE)
+ factor = 25000000;
+end:
+ /*
+	 * Usually, a prime or a number with few factors (close to prime)
+ * is chosen as an SAV, which makes it less likely that the sampling
+ * period synchronizes with some periodic event in the workload.
+	 * Subtract 1 so that, at least for the default freq, the result
+	 * avoids values at or near powers of two.
+ */
+ start = DIV_ROUND_UP_ULL(factor, event->attr.sample_freq) - 1;
+
+ if (start > x86_pmu.max_period)
+ start = x86_pmu.max_period;
+
+ if (x86_pmu.limit_period)
+ x86_pmu.limit_period(event, &start);
+
+ return start;
+}
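+
+/*
+ * Worked example for the estimate above (illustrative): a cycles event
+ * opened with the default sample_freq of 4000 Hz gets
+ *
+ *   start = DIV_ROUND_UP(500000000, 4000) - 1 = 124999
+ *
+ * while a cache-misses event at the same freq gets
+ * DIV_ROUND_UP(25000000, 4000) - 1 = 6249. Both values are then clamped
+ * by max_period and, if present, limit_period().
+ */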
+
+static inline bool intel_pmu_has_acr(struct pmu *pmu)
+{
+ return !!hybrid(pmu, acr_cause_mask64);
+}
+
+static bool intel_pmu_is_acr_group(struct perf_event *event)
+{
+ /* The group leader has the ACR flag set */
+ if (is_acr_event_group(event))
+ return true;
+
+ /* The acr_mask is set */
+ if (event->attr.config2)
+ return true;
+
+ return false;
+}
+
+static inline bool intel_pmu_has_pebs_counter_group(struct pmu *pmu)
+{
+ u64 caps;
+
+ if (x86_pmu.intel_cap.pebs_format >= 6 && x86_pmu.intel_cap.pebs_baseline)
+ return true;
+
+ caps = hybrid(pmu, arch_pebs_cap).caps;
+ if (x86_pmu.arch_pebs && (caps & ARCH_PEBS_CNTR_MASK))
+ return true;
+
+ return false;
+}
+
+static inline void intel_pmu_set_acr_cntr_constr(struct perf_event *event,
+ u64 *cause_mask, int *num)
+{
+ event->hw.dyn_constraint &= hybrid(event->pmu, acr_cntr_mask64);
+ *cause_mask |= event->attr.config2;
+ *num += 1;
+}
+
+static inline void intel_pmu_set_acr_caused_constr(struct perf_event *event,
+ int idx, u64 cause_mask)
+{
+ if (test_bit(idx, (unsigned long *)&cause_mask))
+ event->hw.dyn_constraint &= hybrid(event->pmu, acr_cause_mask64);
+}
+
+static int intel_pmu_hw_config(struct perf_event *event)
+{
+ int ret = x86_pmu_hw_config(event);
+
+ if (ret)
+ return ret;
+
+ ret = intel_pmu_bts_config(event);
+ if (ret)
+ return ret;
+
+ if (event->attr.freq && event->attr.sample_freq) {
+ event->hw.sample_period = intel_pmu_freq_start_period(event);
+ event->hw.last_period = event->hw.sample_period;
+ local64_set(&event->hw.period_left, event->hw.sample_period);
+ }
+
+ if (event->attr.precise_ip) {
+ struct arch_pebs_cap pebs_cap = hybrid(event->pmu, arch_pebs_cap);
+
+ if ((event->attr.config & INTEL_ARCH_EVENT_MASK) == INTEL_FIXED_VLBR_EVENT)
+ return -EINVAL;
+
+ if (!(event->attr.freq || (event->attr.wakeup_events && !event->attr.watermark))) {
+ event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
+ if (!(event->attr.sample_type & ~intel_pmu_large_pebs_flags(event)) &&
+ !has_aux_action(event)) {
+ event->hw.flags |= PERF_X86_EVENT_LARGE_PEBS;
+ event->attach_state |= PERF_ATTACH_SCHED_CB;
+ }
+ }
+ if (x86_pmu.pebs_aliases)
+ x86_pmu.pebs_aliases(event);
+
+ if (x86_pmu.arch_pebs) {
+ u64 cntr_mask = hybrid(event->pmu, intel_ctrl) &
+ ~GLOBAL_CTRL_EN_PERF_METRICS;
+ u64 pebs_mask = event->attr.precise_ip >= 3 ?
+ pebs_cap.pdists : pebs_cap.counters;
+ if (cntr_mask != pebs_mask)
+ event->hw.dyn_constraint &= pebs_mask;
+ }
+ }
+
+ if (needs_branch_stack(event)) {
+ /* Avoid branch stack setup for counting events in SAMPLE READ */
+ if (is_sampling_event(event) ||
+ !(event->attr.sample_type & PERF_SAMPLE_READ))
+ event->hw.flags |= PERF_X86_EVENT_NEEDS_BRANCH_STACK;
+ }
+
+ if (branch_sample_counters(event)) {
+ struct perf_event *leader, *sibling;
+ int num = 0;
+
+ if (!(x86_pmu.flags & PMU_FL_BR_CNTR) ||
+ (event->attr.config & ~INTEL_ARCH_EVENT_MASK))
+ return -EINVAL;
+
+ /*
+ * The branch counter logging is not supported in the call stack
+		 * mode yet, since we cannot simply flush the LBR during, e.g.,
+		 * multiplexing. Also, there is no obvious use case with the call
+		 * stack mode. Simply forbid it for now.
+ *
+ * If any events in the group enable the branch counter logging
+ * feature, the group is treated as a branch counter logging
+ * group, which requires the extra space to store the counters.
+ */
+ leader = event->group_leader;
+ if (branch_sample_call_stack(leader))
+ return -EINVAL;
+ if (branch_sample_counters(leader)) {
+ num++;
+ leader->hw.dyn_constraint &= x86_pmu.lbr_counters;
+ }
+ leader->hw.flags |= PERF_X86_EVENT_BRANCH_COUNTERS;
+
+ for_each_sibling_event(sibling, leader) {
+ if (branch_sample_call_stack(sibling))
+ return -EINVAL;
+ if (branch_sample_counters(sibling)) {
+ num++;
+ sibling->hw.dyn_constraint &= x86_pmu.lbr_counters;
+ }
+ }
+
+ if (num > fls(x86_pmu.lbr_counters))
+ return -EINVAL;
+ /*
+ * Only applying the PERF_SAMPLE_BRANCH_COUNTERS doesn't
+ * require any branch stack setup.
+ * Clear the bit to avoid unnecessary branch stack setup.
+ */
+ if (0 == (event->attr.branch_sample_type &
+ ~(PERF_SAMPLE_BRANCH_PLM_ALL |
+ PERF_SAMPLE_BRANCH_COUNTERS)))
+ event->hw.flags &= ~PERF_X86_EVENT_NEEDS_BRANCH_STACK;
+
+ /*
+ * Force the leader to be a LBR event. So LBRs can be reset
+ * with the leader event. See intel_pmu_lbr_del() for details.
+ */
+ if (!intel_pmu_needs_branch_stack(leader))
+ return -EINVAL;
+ }
+
+ if (intel_pmu_needs_branch_stack(event)) {
+ ret = intel_pmu_setup_lbr_filter(event);
+ if (ret)
+ return ret;
+ event->attach_state |= PERF_ATTACH_SCHED_CB;
+
+ /*
+ * BTS is set up earlier in this path, so don't account twice
+ */
+ if (!unlikely(intel_pmu_has_bts(event))) {
+ /* disallow lbr if conflicting events are present */
+ if (x86_add_exclusive(x86_lbr_exclusive_lbr))
+ return -EBUSY;
+
+ event->destroy = hw_perf_lbr_event_destroy;
+ }
+ }
+
+ if (event->attr.aux_output) {
+ if (!event->attr.precise_ip)
+ return -EINVAL;
+
+ event->hw.flags |= PERF_X86_EVENT_PEBS_VIA_PT;
+ }
+
+ if ((event->attr.sample_type & PERF_SAMPLE_READ) &&
+ intel_pmu_has_pebs_counter_group(event->pmu) &&
+ is_sampling_event(event) &&
+ event->attr.precise_ip)
+ event->group_leader->hw.flags |= PERF_X86_EVENT_PEBS_CNTR;
+
+ if (intel_pmu_has_acr(event->pmu) && intel_pmu_is_acr_group(event)) {
+ struct perf_event *sibling, *leader = event->group_leader;
+ struct pmu *pmu = event->pmu;
+ bool has_sw_event = false;
+ int num = 0, idx = 0;
+ u64 cause_mask = 0;
+
+		/* Perf metrics are not supported */
+ if (is_metric_event(event))
+ return -EINVAL;
+
+		/* Freq mode is not supported */
+ if (event->attr.freq)
+ return -EINVAL;
+
+ /* PDist is not supported */
+ if (event->attr.config2 && event->attr.precise_ip > 2)
+ return -EINVAL;
+
+		/* The reload value cannot exceed the max period */
+ if (event->attr.sample_period > x86_pmu.max_period)
+ return -EINVAL;
+ /*
+ * The counter-constraints of each event cannot be finalized
+ * unless the whole group is scanned. However, it's hard
+ * to know whether the event is the last one of the group.
+ * Recalculate the counter-constraints for each event when
+ * adding a new event.
+ *
+ * The group is traversed twice, which may be optimized later.
+ * In the first round,
+ * - Find all events which do reload when other events
+ * overflow and set the corresponding counter-constraints
+ * - Add all events, which can cause other events reload,
+ * in the cause_mask
+ * - Error out if the number of events exceeds the HW limit
+ * - The ACR events must be contiguous.
+ * Error out if there are non-X86 events between ACR events.
+ * This is not a HW limit, but a SW limit.
+		 * With that assumption, intel_pmu_acr_late_setup() can
+ * easily convert the event idx to counter idx without
+ * traversing the whole event list.
+ */
+ if (!is_x86_event(leader))
+ return -EINVAL;
+
+ if (leader->attr.config2)
+ intel_pmu_set_acr_cntr_constr(leader, &cause_mask, &num);
+
+ if (leader->nr_siblings) {
+ for_each_sibling_event(sibling, leader) {
+ if (!is_x86_event(sibling)) {
+ has_sw_event = true;
+ continue;
+ }
+ if (!sibling->attr.config2)
+ continue;
+ if (has_sw_event)
+ return -EINVAL;
+ intel_pmu_set_acr_cntr_constr(sibling, &cause_mask, &num);
+ }
+ }
+ if (leader != event && event->attr.config2) {
+ if (has_sw_event)
+ return -EINVAL;
+ intel_pmu_set_acr_cntr_constr(event, &cause_mask, &num);
+ }
+
+ if (hweight64(cause_mask) > hweight64(hybrid(pmu, acr_cause_mask64)) ||
+ num > hweight64(hybrid(event->pmu, acr_cntr_mask64)))
+ return -EINVAL;
+ /*
+ * In the second round, apply the counter-constraints for
+ * the events which can cause other events reload.
+ */
+ intel_pmu_set_acr_caused_constr(leader, idx++, cause_mask);
+
+ if (leader->nr_siblings) {
+ for_each_sibling_event(sibling, leader)
+ intel_pmu_set_acr_caused_constr(sibling, idx++, cause_mask);
+ }
+
+ if (leader != event)
+ intel_pmu_set_acr_caused_constr(event, idx, cause_mask);
+
+ leader->hw.flags |= PERF_X86_EVENT_ACR;
+ }
+
+ if ((event->attr.type == PERF_TYPE_HARDWARE) ||
+ (event->attr.type == PERF_TYPE_HW_CACHE))
+ return 0;
+
+ /*
+ * Config Topdown slots and metric events
+ *
+ * The slots event on Fixed Counter 3 can support sampling,
+ * which will be handled normally in x86_perf_event_update().
+ *
+ * Metric events don't support sampling and require being paired
+ * with a slots event as group leader. When the slots event
+ * is used in a metrics group, it too cannot support sampling.
+ */
+ if (intel_pmu_has_cap(event, PERF_CAP_METRICS_IDX) && is_topdown_event(event)) {
+ /* The metrics_clear can only be set for the slots event */
+ if (event->attr.config1 &&
+ (!is_slots_event(event) || (event->attr.config1 & ~INTEL_TD_CFG_METRIC_CLEAR)))
+ return -EINVAL;
+
+ if (event->attr.config2)
+ return -EINVAL;
+
+ /*
+ * The TopDown metrics events and slots event don't
+ * support any filters.
+ */
+ if (event->attr.config & X86_ALL_EVENT_FLAGS)
+ return -EINVAL;
+
+ if (is_available_metric_event(event)) {
+ struct perf_event *leader = event->group_leader;
+
+ /* The metric events don't support sampling. */
+ if (is_sampling_event(event))
+ return -EINVAL;
+
+ /* The metric events require a slots group leader. */
+ if (!is_slots_event(leader))
+ return -EINVAL;
+
+ /*
+ * The leader/SLOTS must not be a sampling event for
+ * metric use; hardware requires it starts at 0 when used
+ * in conjunction with MSR_PERF_METRICS.
+ */
+ if (is_sampling_event(leader))
+ return -EINVAL;
+
+ event->event_caps |= PERF_EV_CAP_SIBLING;
+ /*
+ * Only once we have a METRICs sibling do we
+ * need TopDown magic.
+ */
+ leader->hw.flags |= PERF_X86_EVENT_TOPDOWN;
+ event->hw.flags |= PERF_X86_EVENT_TOPDOWN;
+ }
+ }
+
+ /*
+ * The load latency event X86_CONFIG(.event=0xcd, .umask=0x01) on SPR
+ * doesn't function quite right. As a work-around it needs to always be
+	 * co-scheduled with an auxiliary event X86_CONFIG(.event=0x03, .umask=0x82).
+	 * The actual count of this second event is irrelevant; it just needs
+	 * to be active to make the first event function correctly.
+ *
+	 * In a group, the auxiliary event must come before the load latency
+	 * event. This rule exists to simplify the check, because perf does not
+	 * see the complete group at this point.
+ */
+ if (require_mem_loads_aux_event(event) &&
+ (event->attr.sample_type & PERF_SAMPLE_DATA_SRC) &&
+ is_mem_loads_event(event)) {
+ struct perf_event *leader = event->group_leader;
+ struct perf_event *sibling = NULL;
+
+ /*
+ * When this memload event is also the first event (no group
+ * exists yet), then there is no aux event before it.
+ */
+ if (leader == event)
+ return -ENODATA;
+
+ if (!is_mem_loads_aux_event(leader)) {
+ for_each_sibling_event(sibling, leader) {
+ if (is_mem_loads_aux_event(sibling))
+ break;
+ }
+ if (list_entry_is_head(sibling, &leader->sibling_list, sibling_list))
+ return -ENODATA;
+ }
+ }
+
+ if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY))
+ return 0;
+
+ if (x86_pmu.version < 3)
+ return -EINVAL;
+
+ ret = perf_allow_cpu();
+ if (ret)
+ return ret;
+
+ event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY;
+
+ return 0;
+}
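+
+/*
+ * Usage note (illustrative): the ARCH_PERFMON_EVENTSEL_ANY handling at
+ * the end of intel_pmu_hw_config() corresponds to the "any" format
+ * attribute defined further below (config:21), e.g. something like
+ *
+ *   perf stat -e cpu/event=0x3c,any=1/
+ *
+ * which counts unhalted cycles regardless of which hyper-thread of the
+ * core is active and is therefore gated by perf_allow_cpu().
+ */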
+
+/*
+ * Currently, the only caller of this function is atomic_switch_perf_msrs().
+ * The host perf context helps to prepare the values of the real hardware for
+ * a set of msrs that need to be switched atomically in a vmx transaction.
+ *
+ * For example, the pseudocode needed to add a new msr should look like:
+ *
+ * arr[(*nr)++] = (struct perf_guest_switch_msr){
+ * .msr = the hardware msr address,
+ * .host = the value the hardware has when it doesn't run a guest,
+ * .guest = the value the hardware has when it runs a guest,
+ * };
+ *
+ * These values have nothing to do with the emulated values the guest sees
+ * when it uses {RD,WR}MSR, which should be handled by the KVM context,
+ * specifically in the intel_pmu_{get,set}_msr().
+ */
+static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
+ struct kvm_pmu *kvm_pmu = (struct kvm_pmu *)data;
+ u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl);
+ u64 pebs_mask = cpuc->pebs_enabled & x86_pmu.pebs_capable;
+ int global_ctrl, pebs_enable;
+
+ /*
+ * In addition to obeying exclude_guest/exclude_host, remove bits being
+ * used for PEBS when running a guest, because PEBS writes to virtual
+ * addresses (not physical addresses).
+ */
+ *nr = 0;
+ global_ctrl = (*nr)++;
+ arr[global_ctrl] = (struct perf_guest_switch_msr){
+ .msr = MSR_CORE_PERF_GLOBAL_CTRL,
+ .host = intel_ctrl & ~cpuc->intel_ctrl_guest_mask,
+ .guest = intel_ctrl & ~cpuc->intel_ctrl_host_mask & ~pebs_mask,
+ };
+
+ if (!x86_pmu.ds_pebs)
+ return arr;
+
+ /*
+	 * If a PMU counter has PEBS enabled, it is not enough to
+	 * disable the counter on guest entry, since a PEBS memory
+	 * write can overshoot guest entry and corrupt guest
+	 * memory. Disabling PEBS solves the problem.
+ *
+ * Don't do this if the CPU already enforces it.
+ */
+ if (x86_pmu.pebs_no_isolation) {
+ arr[(*nr)++] = (struct perf_guest_switch_msr){
+ .msr = MSR_IA32_PEBS_ENABLE,
+ .host = cpuc->pebs_enabled,
+ .guest = 0,
+ };
+ return arr;
+ }
+
+ if (!kvm_pmu || !x86_pmu.pebs_ept)
+ return arr;
+
+ arr[(*nr)++] = (struct perf_guest_switch_msr){
+ .msr = MSR_IA32_DS_AREA,
+ .host = (unsigned long)cpuc->ds,
+ .guest = kvm_pmu->ds_area,
+ };
+
+ if (x86_pmu.intel_cap.pebs_baseline) {
+ arr[(*nr)++] = (struct perf_guest_switch_msr){
+ .msr = MSR_PEBS_DATA_CFG,
+ .host = cpuc->active_pebs_data_cfg,
+ .guest = kvm_pmu->pebs_data_cfg,
+ };
+ }
+
+ pebs_enable = (*nr)++;
+ arr[pebs_enable] = (struct perf_guest_switch_msr){
+ .msr = MSR_IA32_PEBS_ENABLE,
+ .host = cpuc->pebs_enabled & ~cpuc->intel_ctrl_guest_mask,
+ .guest = pebs_mask & ~cpuc->intel_ctrl_host_mask & kvm_pmu->pebs_enable,
+ };
+
+ if (arr[pebs_enable].host) {
+ /* Disable guest PEBS if host PEBS is enabled. */
+ arr[pebs_enable].guest = 0;
+ } else {
+ /* Disable guest PEBS thoroughly for cross-mapped PEBS counters. */
+ arr[pebs_enable].guest &= ~kvm_pmu->host_cross_mapped_mask;
+ arr[global_ctrl].guest &= ~kvm_pmu->host_cross_mapped_mask;
+ /* Set hw GLOBAL_CTRL bits for PEBS counter when it runs for guest */
+ arr[global_ctrl].guest |= arr[pebs_enable].guest;
+ }
+
+ return arr;
+}
+
+static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr, void *data)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
+ int idx;
+
+ for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
+ struct perf_event *event = cpuc->events[idx];
+
+ arr[idx].msr = x86_pmu_config_addr(idx);
+ arr[idx].host = arr[idx].guest = 0;
+
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+
+ arr[idx].host = arr[idx].guest =
+ event->hw.config | ARCH_PERFMON_EVENTSEL_ENABLE;
+
+ if (event->attr.exclude_host)
+ arr[idx].host &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
+ else if (event->attr.exclude_guest)
+ arr[idx].guest &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
+ }
+
+ *nr = x86_pmu_max_num_counters(cpuc->pmu);
+ return arr;
+}
+
+static void core_pmu_enable_event(struct perf_event *event)
+{
+ if (!event->attr.exclude_host)
+ x86_pmu_enable_event(event);
+}
+
+static void core_pmu_enable_all(int added)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int idx;
+
+ for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
+ struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
+
+ if (!test_bit(idx, cpuc->active_mask) ||
+ cpuc->events[idx]->attr.exclude_host)
+ continue;
+
+ __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
+ }
+}
+
+static int hsw_hw_config(struct perf_event *event)
+{
+ int ret = intel_pmu_hw_config(event);
+
+ if (ret)
+ return ret;
+ if (!boot_cpu_has(X86_FEATURE_RTM) && !boot_cpu_has(X86_FEATURE_HLE))
+ return 0;
+ event->hw.config |= event->attr.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED);
+
+ /*
+ * IN_TX/IN_TX-CP filters are not supported by the Haswell PMU with
+	 * PEBS or in ANY thread mode. Since the results are nonsensical, forbid
+	 * this combination.
+ */
+ if ((event->hw.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED)) &&
+ ((event->hw.config & ARCH_PERFMON_EVENTSEL_ANY) ||
+ event->attr.precise_ip > 0))
+ return -EOPNOTSUPP;
+
+ if (event_is_checkpointed(event)) {
+ /*
+ * Sampling of checkpointed events can cause situations where
+		 * the CPU constantly aborts because of an overflow, which is
+ * then checkpointed back and ignored. Forbid checkpointing
+ * for sampling.
+ *
+ * But still allow a long sampling period, so that perf stat
+ * from KVM works.
+ */
+ if (event->attr.sample_period > 0 &&
+ event->attr.sample_period < 0x7fffffff)
+ return -EOPNOTSUPP;
+ }
+ return 0;
+}
+
+static struct event_constraint counter0_constraint =
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0x1);
+
+static struct event_constraint counter1_constraint =
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0x2);
+
+static struct event_constraint counter0_1_constraint =
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0x3);
+
+static struct event_constraint counter2_constraint =
+ EVENT_CONSTRAINT(0, 0x4, 0);
+
+static struct event_constraint fixed0_constraint =
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0);
+
+static struct event_constraint fixed0_counter0_constraint =
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0x100000001ULL);
+
+static struct event_constraint fixed0_counter0_1_constraint =
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0x100000003ULL);
+
+static struct event_constraint counters_1_7_constraint =
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0xfeULL);
+
+static struct event_constraint *
+hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct event_constraint *c;
+
+ c = intel_get_event_constraints(cpuc, idx, event);
+
+ /* Handle special quirk on in_tx_checkpointed only in counter 2 */
+ if (event->hw.config & HSW_IN_TX_CHECKPOINTED) {
+ if (c->idxmsk64 & (1U << 2))
+ return &counter2_constraint;
+ return &emptyconstraint;
+ }
+
+ return c;
+}
+
+static struct event_constraint *
+icl_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ /*
+ * Fixed counter 0 has less skid.
+ * Force instruction:ppp in Fixed counter 0
+ */
+ if ((event->attr.precise_ip == 3) &&
+ constraint_match(&fixed0_constraint, event->hw.config))
+ return &fixed0_constraint;
+
+ return hsw_get_event_constraints(cpuc, idx, event);
+}
+
+static struct event_constraint *
+glc_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct event_constraint *c;
+
+ c = icl_get_event_constraints(cpuc, idx, event);
+
+ /*
+ * The :ppp indicates the Precise Distribution (PDist) facility, which
+	 * is only supported on GP counter 0. If a :ppp event cannot be placed
+	 * on GP counter 0, error out.
+ * Exception: Instruction PDIR is only available on the fixed counter 0.
+ */
+ if ((event->attr.precise_ip == 3) &&
+ !constraint_match(&fixed0_constraint, event->hw.config)) {
+ if (c->idxmsk64 & BIT_ULL(0))
+ return &counter0_constraint;
+
+ return &emptyconstraint;
+ }
+
+ return c;
+}
+
+static struct event_constraint *
+glp_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct event_constraint *c;
+
+ /* :ppp means to do reduced skid PEBS which is PMC0 only. */
+ if (event->attr.precise_ip == 3)
+ return &counter0_constraint;
+
+ c = intel_get_event_constraints(cpuc, idx, event);
+
+ return c;
+}
+
+static struct event_constraint *
+tnt_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct event_constraint *c;
+
+ c = intel_get_event_constraints(cpuc, idx, event);
+
+ /*
+ * :ppp means to do reduced skid PEBS,
+ * which is available on PMC0 and fixed counter 0.
+ */
+ if (event->attr.precise_ip == 3) {
+ /* Force instruction:ppp on PMC0 and Fixed counter 0 */
+ if (constraint_match(&fixed0_constraint, event->hw.config))
+ return &fixed0_counter0_constraint;
+
+ return &counter0_constraint;
+ }
+
+ return c;
+}
+
+static bool allow_tsx_force_abort = true;
+
+static struct event_constraint *
+tfa_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct event_constraint *c = hsw_get_event_constraints(cpuc, idx, event);
+
+ /*
+ * Without TFA we must not use PMC3.
+ */
+ if (!allow_tsx_force_abort && test_bit(3, c->idxmsk)) {
+ c = dyn_constraint(cpuc, c, idx);
+ c->idxmsk64 &= ~(1ULL << 3);
+ c->weight--;
+ }
+
+ return c;
+}
+
+static struct event_constraint *
+adl_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
+
+ if (pmu->pmu_type == hybrid_big)
+ return glc_get_event_constraints(cpuc, idx, event);
+ else if (pmu->pmu_type == hybrid_small)
+ return tnt_get_event_constraints(cpuc, idx, event);
+
+ WARN_ON(1);
+ return &emptyconstraint;
+}
+
+static struct event_constraint *
+cmt_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct event_constraint *c;
+
+ c = intel_get_event_constraints(cpuc, idx, event);
+
+ /*
+ * The :ppp indicates the Precise Distribution (PDist) facility, which
+ * is only supported on the GP counter 0 & 1 and Fixed counter 0.
+	 * If a :ppp event cannot be placed on any of those eligible counters,
+	 * error out.
+ */
+ if (event->attr.precise_ip == 3) {
+ /* Force instruction:ppp on PMC0, 1 and Fixed counter 0 */
+ if (constraint_match(&fixed0_constraint, event->hw.config)) {
+ /* The fixed counter 0 doesn't support LBR event logging. */
+ if (branch_sample_counters(event))
+ return &counter0_1_constraint;
+ else
+ return &fixed0_counter0_1_constraint;
+ }
+
+ switch (c->idxmsk64 & 0x3ull) {
+ case 0x1:
+ return &counter0_constraint;
+ case 0x2:
+ return &counter1_constraint;
+ case 0x3:
+ return &counter0_1_constraint;
+ }
+ return &emptyconstraint;
+ }
+
+ return c;
+}
+
+static struct event_constraint *
+rwc_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct event_constraint *c;
+
+ c = glc_get_event_constraints(cpuc, idx, event);
+
+ /* The Retire Latency is not supported by the fixed counter 0. */
+ if (event->attr.precise_ip &&
+ (event->attr.sample_type & PERF_SAMPLE_WEIGHT_TYPE) &&
+ constraint_match(&fixed0_constraint, event->hw.config)) {
+ /*
+ * The Instruction PDIR is only available
+ * on the fixed counter 0. Error out for this case.
+ */
+ if (event->attr.precise_ip == 3)
+ return &emptyconstraint;
+ return &counters_1_7_constraint;
+ }
+
+ return c;
+}
+
+static struct event_constraint *
+mtl_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
+
+ if (pmu->pmu_type == hybrid_big)
+ return rwc_get_event_constraints(cpuc, idx, event);
+ if (pmu->pmu_type == hybrid_small)
+ return cmt_get_event_constraints(cpuc, idx, event);
+
+ WARN_ON(1);
+ return &emptyconstraint;
+}
+
+static int adl_hw_config(struct perf_event *event)
+{
+ struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
+
+ if (pmu->pmu_type == hybrid_big)
+ return hsw_hw_config(event);
+ else if (pmu->pmu_type == hybrid_small)
+ return intel_pmu_hw_config(event);
+
+ WARN_ON(1);
+ return -EOPNOTSUPP;
+}
+
+static enum intel_cpu_type adl_get_hybrid_cpu_type(void)
+{
+ return INTEL_CPU_TYPE_CORE;
+}
+
+static inline bool erratum_hsw11(struct perf_event *event)
+{
+ return (event->hw.config & INTEL_ARCH_EVENT_MASK) ==
+ X86_CONFIG(.event=0xc0, .umask=0x01);
+}
+
+static struct event_constraint *
+arl_h_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
+
+ if (pmu->pmu_type == hybrid_tiny)
+ return cmt_get_event_constraints(cpuc, idx, event);
+
+ return mtl_get_event_constraints(cpuc, idx, event);
+}
+
+static int arl_h_hw_config(struct perf_event *event)
+{
+ struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
+
+ if (pmu->pmu_type == hybrid_tiny)
+ return intel_pmu_hw_config(event);
+
+ return adl_hw_config(event);
+}
+
+/*
+ * HSW11 requires a period larger than 100, the same as BDM11.
+ * A minimum period of 128 is therefore enforced for INST_RETIRED.ALL.
+ *
+ * The message 'interrupt took too long' can be observed on any counter which
+ * was armed with a period < 32 when two events expired in the same NMI.
+ * A minimum period of 32 is enforced for the rest of the events.
+ */
+static void hsw_limit_period(struct perf_event *event, s64 *left)
+{
+ *left = max(*left, erratum_hsw11(event) ? 128 : 32);
+}
+
+/*
+ * Broadwell:
+ *
+ * The INST_RETIRED.ALL period always needs to have lowest 6 bits cleared
+ * (BDM55) and it must not use a period smaller than 100 (BDM11). We combine
+ * the two to enforce a minimum period of 128 (the smallest value that has bits
+ * 0-5 cleared and >= 100).
+ *
+ * Because of how the code in x86_perf_event_set_period() works, the truncation
+ * of the lower 6 bits is 'harmless' as we'll occasionally add a longer period
+ * to make up for the 'lost' events due to carrying the 'error' in period_left.
+ *
+ * Therefore the effective (average) period matches the requested period,
+ * despite coarser hardware granularity.
+ */
+static void bdw_limit_period(struct perf_event *event, s64 *left)
+{
+ if (erratum_hsw11(event)) {
+ if (*left < 128)
+ *left = 128;
+ *left &= ~0x3fULL;
+ }
+}
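+
+/*
+ * Illustrative numbers for the BDM55/BDM11 fixup above: a requested
+ * period of 100 is first raised to 128 and keeps that value once the
+ * low 6 bits are cleared; a requested period of 1000 stays above 128
+ * but is truncated to 1000 & ~0x3f = 960. The truncation error is
+ * carried in period_left, so the average period still matches the
+ * request.
+ */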
+
+static void nhm_limit_period(struct perf_event *event, s64 *left)
+{
+ *left = max(*left, 32LL);
+}
+
+static void glc_limit_period(struct perf_event *event, s64 *left)
+{
+ if (event->attr.precise_ip == 3)
+ *left = max(*left, 128LL);
+}
+
+PMU_FORMAT_ATTR(event, "config:0-7" );
+PMU_FORMAT_ATTR(umask, "config:8-15" );
+PMU_FORMAT_ATTR(edge, "config:18" );
+PMU_FORMAT_ATTR(pc, "config:19" );
+PMU_FORMAT_ATTR(any, "config:21" ); /* v3 + */
+PMU_FORMAT_ATTR(inv, "config:23" );
+PMU_FORMAT_ATTR(cmask, "config:24-31" );
+PMU_FORMAT_ATTR(in_tx, "config:32" );
+PMU_FORMAT_ATTR(in_tx_cp, "config:33" );
+PMU_FORMAT_ATTR(eq, "config:36" ); /* v6 + */
+
+PMU_FORMAT_ATTR(metrics_clear, "config1:0"); /* PERF_CAPABILITIES.RDPMC_METRICS_CLEAR */
+
+static ssize_t umask2_show(struct device *dev,
+ struct device_attribute *attr,
+ char *page)
+{
+ u64 mask = hybrid(dev_get_drvdata(dev), config_mask) & ARCH_PERFMON_EVENTSEL_UMASK2;
+
+ if (mask == ARCH_PERFMON_EVENTSEL_UMASK2)
+ return sprintf(page, "config:8-15,40-47\n");
+
+ /* Roll back to the old format if umask2 is not supported. */
+ return sprintf(page, "config:8-15\n");
+}
+
+static struct device_attribute format_attr_umask2 =
+ __ATTR(umask, 0444, umask2_show, NULL);
+
+static struct attribute *format_evtsel_ext_attrs[] = {
+ &format_attr_umask2.attr,
+ &format_attr_eq.attr,
+ &format_attr_metrics_clear.attr,
+ NULL
+};
+
+static umode_t
+evtsel_ext_is_visible(struct kobject *kobj, struct attribute *attr, int i)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ u64 mask;
+
+ /*
+ * The umask and umask2 have different formats but share the
+ * same attr name. In update mode, the previous value of the
+ * umask is unconditionally removed before is_visible. If
+ * umask2 format is not enumerated, it's impossible to roll
+ * back to the old format.
+	 * So do the check in umask2_show() rather than in is_visible().
+ */
+ if (i == 0)
+ return attr->mode;
+
+ mask = hybrid(dev_get_drvdata(dev), config_mask);
+ if (i == 1)
+ return (mask & ARCH_PERFMON_EVENTSEL_EQ) ? attr->mode : 0;
+
+ /* PERF_CAPABILITIES.RDPMC_METRICS_CLEAR */
+ if (i == 2) {
+ union perf_capabilities intel_cap = hybrid(dev_get_drvdata(dev), intel_cap);
+
+ return intel_cap.rdpmc_metrics_clear ? attr->mode : 0;
+ }
+
+ return 0;
+}
+
+static struct attribute *intel_arch_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_pc.attr,
+ &format_attr_inv.attr,
+ &format_attr_cmask.attr,
+ NULL,
+};
+
+ssize_t intel_event_sysfs_show(char *page, u64 config)
+{
+ u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT);
+
+ return x86_event_sysfs_show(page, config, event);
+}
+
+static struct intel_shared_regs *allocate_shared_regs(int cpu)
+{
+ struct intel_shared_regs *regs;
+ int i;
+
+ regs = kzalloc_node(sizeof(struct intel_shared_regs),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (regs) {
+ /*
+ * initialize the locks to keep lockdep happy
+ */
+ for (i = 0; i < EXTRA_REG_MAX; i++)
+ raw_spin_lock_init(&regs->regs[i].lock);
+
+ regs->core_id = -1;
+ }
+ return regs;
+}
+
+static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu)
+{
+ struct intel_excl_cntrs *c;
+
+ c = kzalloc_node(sizeof(struct intel_excl_cntrs),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (c) {
+ raw_spin_lock_init(&c->lock);
+ c->core_id = -1;
+ }
+ return c;
+}
+
+
+int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu)
+{
+ cpuc->pebs_record_size = x86_pmu.pebs_record_size;
+
+ if (is_hybrid() || x86_pmu.extra_regs || x86_pmu.lbr_sel_map) {
+ cpuc->shared_regs = allocate_shared_regs(cpu);
+ if (!cpuc->shared_regs)
+ goto err;
+ }
+
+ if (x86_pmu.flags & (PMU_FL_EXCL_CNTRS | PMU_FL_TFA | PMU_FL_DYN_CONSTRAINT)) {
+ size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint);
+
+ cpuc->constraint_list = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu));
+ if (!cpuc->constraint_list)
+ goto err_shared_regs;
+ }
+
+ if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
+ cpuc->excl_cntrs = allocate_excl_cntrs(cpu);
+ if (!cpuc->excl_cntrs)
+ goto err_constraint_list;
+
+ cpuc->excl_thread_id = 0;
+ }
+
+ return 0;
+
+err_constraint_list:
+ kfree(cpuc->constraint_list);
+ cpuc->constraint_list = NULL;
+
+err_shared_regs:
+ kfree(cpuc->shared_regs);
+ cpuc->shared_regs = NULL;
+
+err:
+ return -ENOMEM;
+}
+
+static int intel_pmu_cpu_prepare(int cpu)
+{
+ int ret;
+
+ ret = intel_cpuc_prepare(&per_cpu(cpu_hw_events, cpu), cpu);
+ if (ret)
+ return ret;
+
+ return alloc_arch_pebs_buf_on_cpu(cpu);
+}
+
+static void flip_smm_bit(void *data)
+{
+ unsigned long set = *(unsigned long *)data;
+
+ if (set > 0) {
+ msr_set_bit(MSR_IA32_DEBUGCTLMSR,
+ DEBUGCTLMSR_FREEZE_IN_SMM_BIT);
+ } else {
+ msr_clear_bit(MSR_IA32_DEBUGCTLMSR,
+ DEBUGCTLMSR_FREEZE_IN_SMM_BIT);
+ }
+}
+
+static void intel_pmu_check_counters_mask(u64 *cntr_mask,
+ u64 *fixed_cntr_mask,
+ u64 *intel_ctrl)
+{
+ unsigned int bit;
+
+ bit = fls64(*cntr_mask);
+ if (bit > INTEL_PMC_MAX_GENERIC) {
+ WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
+ bit, INTEL_PMC_MAX_GENERIC);
+ *cntr_mask &= GENMASK_ULL(INTEL_PMC_MAX_GENERIC - 1, 0);
+ }
+ *intel_ctrl = *cntr_mask;
+
+ bit = fls64(*fixed_cntr_mask);
+ if (bit > INTEL_PMC_MAX_FIXED) {
+ WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
+ bit, INTEL_PMC_MAX_FIXED);
+ *fixed_cntr_mask &= GENMASK_ULL(INTEL_PMC_MAX_FIXED - 1, 0);
+ }
+
+ *intel_ctrl |= *fixed_cntr_mask << INTEL_PMC_IDX_FIXED;
+}
+
+static void intel_pmu_check_event_constraints(struct event_constraint *event_constraints,
+ u64 cntr_mask,
+ u64 fixed_cntr_mask,
+ u64 intel_ctrl);
+
+enum dyn_constr_type {
+ DYN_CONSTR_NONE,
+ DYN_CONSTR_BR_CNTR,
+ DYN_CONSTR_ACR_CNTR,
+ DYN_CONSTR_ACR_CAUSE,
+ DYN_CONSTR_PEBS,
+ DYN_CONSTR_PDIST,
+
+ DYN_CONSTR_MAX,
+};
+
+static const char * const dyn_constr_type_name[] = {
+ [DYN_CONSTR_NONE] = "a normal event",
+ [DYN_CONSTR_BR_CNTR] = "a branch counter logging event",
+ [DYN_CONSTR_ACR_CNTR] = "an auto-counter reload event",
+ [DYN_CONSTR_ACR_CAUSE] = "an auto-counter reload cause event",
+ [DYN_CONSTR_PEBS] = "a PEBS event",
+ [DYN_CONSTR_PDIST] = "a PEBS PDIST event",
+};
+
+static void __intel_pmu_check_dyn_constr(struct event_constraint *constr,
+ enum dyn_constr_type type, u64 mask)
+{
+ struct event_constraint *c1, *c2;
+ int new_weight, check_weight;
+ u64 new_mask, check_mask;
+
+ for_each_event_constraint(c1, constr) {
+ new_mask = c1->idxmsk64 & mask;
+ new_weight = hweight64(new_mask);
+
+ /* ignore topdown perf metrics event */
+ if (c1->idxmsk64 & INTEL_PMC_MSK_TOPDOWN)
+ continue;
+
+ if (!new_weight && fls64(c1->idxmsk64) < INTEL_PMC_IDX_FIXED) {
+ pr_info("The event 0x%llx is not supported as %s.\n",
+ c1->code, dyn_constr_type_name[type]);
+ }
+
+ if (new_weight <= 1)
+ continue;
+
+ for_each_event_constraint(c2, c1 + 1) {
+ bool check_fail = false;
+
+ check_mask = c2->idxmsk64 & mask;
+ check_weight = hweight64(check_mask);
+
+ if (c2->idxmsk64 & INTEL_PMC_MSK_TOPDOWN ||
+ !check_weight)
+ continue;
+
+ /* The same constraints or no overlap */
+ if (new_mask == check_mask ||
+ (new_mask ^ check_mask) == (new_mask | check_mask))
+ continue;
+
+ /*
+ * A scheduler issue may be triggered in the following cases.
+ * - Two overlap constraints have the same weight.
+ * E.g., A constraints: 0x3, B constraints: 0x6
+ * event counter failure case
+ * B PMC[2:1] 1
+ * A PMC[1:0] 0
+ * A PMC[1:0] FAIL
+ * - Two overlap constraints have different weight.
+ * The constraint has a low weight, but has high last bit.
+ * E.g., A constraints: 0x7, B constraints: 0xC
+ * event counter failure case
+ * B PMC[3:2] 2
+ * A PMC[2:0] 0
+ * A PMC[2:0] 1
+ * A PMC[2:0] FAIL
+ */
+ if (new_weight == check_weight) {
+ check_fail = true;
+ } else if (new_weight < check_weight) {
+ if ((new_mask | check_mask) != check_mask &&
+ fls64(new_mask) > fls64(check_mask))
+ check_fail = true;
+ } else {
+ if ((new_mask | check_mask) != new_mask &&
+ fls64(new_mask) < fls64(check_mask))
+ check_fail = true;
+ }
+
+ if (check_fail) {
+ pr_info("The two events 0x%llx and 0x%llx may not be "
+ "fully scheduled under some circumstances as "
+ "%s.\n",
+ c1->code, c2->code, dyn_constr_type_name[type]);
+ }
+ }
+ }
+}
+
+static void intel_pmu_check_dyn_constr(struct pmu *pmu,
+ struct event_constraint *constr,
+ u64 cntr_mask)
+{
+ enum dyn_constr_type i;
+ u64 mask;
+
+ for (i = DYN_CONSTR_NONE; i < DYN_CONSTR_MAX; i++) {
+ mask = 0;
+ switch (i) {
+ case DYN_CONSTR_NONE:
+ mask = cntr_mask;
+ break;
+ case DYN_CONSTR_BR_CNTR:
+ if (x86_pmu.flags & PMU_FL_BR_CNTR)
+ mask = x86_pmu.lbr_counters;
+ break;
+ case DYN_CONSTR_ACR_CNTR:
+ mask = hybrid(pmu, acr_cntr_mask64) & GENMASK_ULL(INTEL_PMC_MAX_GENERIC - 1, 0);
+ break;
+ case DYN_CONSTR_ACR_CAUSE:
+ if (hybrid(pmu, acr_cntr_mask64) == hybrid(pmu, acr_cause_mask64))
+ continue;
+ mask = hybrid(pmu, acr_cause_mask64) & GENMASK_ULL(INTEL_PMC_MAX_GENERIC - 1, 0);
+ break;
+ case DYN_CONSTR_PEBS:
+ if (x86_pmu.arch_pebs)
+ mask = hybrid(pmu, arch_pebs_cap).counters;
+ break;
+ case DYN_CONSTR_PDIST:
+ if (x86_pmu.arch_pebs)
+ mask = hybrid(pmu, arch_pebs_cap).pdists;
+ break;
+ default:
+ pr_warn("Unsupported dynamic constraint type %d\n", i);
+ }
+
+ if (mask)
+ __intel_pmu_check_dyn_constr(constr, i, mask);
+ }
+}
+
+static void intel_pmu_check_event_constraints_all(struct pmu *pmu)
+{
+ struct event_constraint *event_constraints = hybrid(pmu, event_constraints);
+ struct event_constraint *pebs_constraints = hybrid(pmu, pebs_constraints);
+ u64 cntr_mask = hybrid(pmu, cntr_mask64);
+ u64 fixed_cntr_mask = hybrid(pmu, fixed_cntr_mask64);
+ u64 intel_ctrl = hybrid(pmu, intel_ctrl);
+
+ intel_pmu_check_event_constraints(event_constraints, cntr_mask,
+ fixed_cntr_mask, intel_ctrl);
+
+ if (event_constraints)
+ intel_pmu_check_dyn_constr(pmu, event_constraints, cntr_mask);
+
+ if (pebs_constraints)
+ intel_pmu_check_dyn_constr(pmu, pebs_constraints, cntr_mask);
+}
+
+static void intel_pmu_check_extra_regs(struct extra_reg *extra_regs);
+
+static inline bool intel_pmu_broken_perf_cap(void)
+{
+ /* The Perf Metric (Bit 15) is always cleared */
+ if (boot_cpu_data.x86_vfm == INTEL_METEORLAKE ||
+ boot_cpu_data.x86_vfm == INTEL_METEORLAKE_L)
+ return true;
+
+ return false;
+}
+
+static inline void __intel_update_pmu_caps(struct pmu *pmu)
+{
+ struct pmu *dest_pmu = pmu ? pmu : x86_get_pmu(smp_processor_id());
+
+ if (hybrid(pmu, arch_pebs_cap).caps & ARCH_PEBS_VECR_XMM)
+ dest_pmu->capabilities |= PERF_PMU_CAP_EXTENDED_REGS;
+}
+
+static inline void __intel_update_large_pebs_flags(struct pmu *pmu)
+{
+ u64 caps = hybrid(pmu, arch_pebs_cap).caps;
+
+ x86_pmu.large_pebs_flags |= PERF_SAMPLE_TIME;
+ if (caps & ARCH_PEBS_LBR)
+ x86_pmu.large_pebs_flags |= PERF_SAMPLE_BRANCH_STACK;
+ if (caps & ARCH_PEBS_CNTR_MASK)
+ x86_pmu.large_pebs_flags |= PERF_SAMPLE_READ;
+
+ if (!(caps & ARCH_PEBS_AUX))
+ x86_pmu.large_pebs_flags &= ~PERF_SAMPLE_DATA_SRC;
+ if (!(caps & ARCH_PEBS_GPR)) {
+ x86_pmu.large_pebs_flags &=
+ ~(PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER);
+ }
+}
+
+#define counter_mask(_gp, _fixed) ((_gp) | ((u64)(_fixed) << INTEL_PMC_IDX_FIXED))
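+/*
+ * Example (illustrative, assuming INTEL_PMC_IDX_FIXED == 32): with 8 GP
+ * counters and 4 fixed counters enumerated by CPUID,
+ *
+ *   counter_mask(0xff, 0xf) == 0x0000000f000000ffULL
+ *
+ * i.e. GP counters occupy the low bits and fixed counters start at bit
+ * 32 of the combined mask.
+ */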
+
+static void update_pmu_cap(struct pmu *pmu)
+{
+ unsigned int eax, ebx, ecx, edx;
+ union cpuid35_eax eax_0;
+ union cpuid35_ebx ebx_0;
+ u64 cntrs_mask = 0;
+ u64 pebs_mask = 0;
+ u64 pdists_mask = 0;
+
+ cpuid(ARCH_PERFMON_EXT_LEAF, &eax_0.full, &ebx_0.full, &ecx, &edx);
+
+ if (ebx_0.split.umask2)
+ hybrid(pmu, config_mask) |= ARCH_PERFMON_EVENTSEL_UMASK2;
+ if (ebx_0.split.eq)
+ hybrid(pmu, config_mask) |= ARCH_PERFMON_EVENTSEL_EQ;
+
+ if (eax_0.split.cntr_subleaf) {
+ cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_NUM_COUNTER_LEAF,
+ &eax, &ebx, &ecx, &edx);
+ hybrid(pmu, cntr_mask64) = eax;
+ hybrid(pmu, fixed_cntr_mask64) = ebx;
+ cntrs_mask = counter_mask(eax, ebx);
+ }
+
+ if (eax_0.split.acr_subleaf) {
+ cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_ACR_LEAF,
+ &eax, &ebx, &ecx, &edx);
+ /* The mask of the counters which can be reloaded */
+ hybrid(pmu, acr_cntr_mask64) = counter_mask(eax, ebx);
+ /* The mask of the counters which can cause a reload of reloadable counters */
+ hybrid(pmu, acr_cause_mask64) = counter_mask(ecx, edx);
+ }
+
+ /* Bits[5:4] should be set simultaneously if arch-PEBS is supported */
+ if (eax_0.split.pebs_caps_subleaf && eax_0.split.pebs_cnts_subleaf) {
+ cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_PEBS_CAP_LEAF,
+ &eax, &ebx, &ecx, &edx);
+ hybrid(pmu, arch_pebs_cap).caps = (u64)ebx << 32;
+
+ cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_PEBS_COUNTER_LEAF,
+ &eax, &ebx, &ecx, &edx);
+ pebs_mask = counter_mask(eax, ecx);
+ pdists_mask = counter_mask(ebx, edx);
+ hybrid(pmu, arch_pebs_cap).counters = pebs_mask;
+ hybrid(pmu, arch_pebs_cap).pdists = pdists_mask;
+
+ if (WARN_ON((pebs_mask | pdists_mask) & ~cntrs_mask)) {
+ x86_pmu.arch_pebs = 0;
+ } else {
+ __intel_update_pmu_caps(pmu);
+ __intel_update_large_pebs_flags(pmu);
+ }
+ } else {
+ WARN_ON(x86_pmu.arch_pebs == 1);
+ x86_pmu.arch_pebs = 0;
+ }
+
+ if (!intel_pmu_broken_perf_cap()) {
+ /* Perf Metric (Bit 15) and PEBS via PT (Bit 16) are hybrid enumeration */
+ rdmsrq(MSR_IA32_PERF_CAPABILITIES, hybrid(pmu, intel_cap).capabilities);
+ }
+}
+
+static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu)
+{
+ intel_pmu_check_counters_mask(&pmu->cntr_mask64, &pmu->fixed_cntr_mask64,
+ &pmu->intel_ctrl);
+ pmu->pebs_events_mask = intel_pmu_pebs_mask(pmu->cntr_mask64);
+ pmu->unconstrained = (struct event_constraint)
+ __EVENT_CONSTRAINT(0, pmu->cntr_mask64,
+ 0, x86_pmu_num_counters(&pmu->pmu), 0, 0);
+
+ if (pmu->intel_cap.perf_metrics)
+ pmu->intel_ctrl |= GLOBAL_CTRL_EN_PERF_METRICS;
+ else
+ pmu->intel_ctrl &= ~GLOBAL_CTRL_EN_PERF_METRICS;
+
+ intel_pmu_check_event_constraints_all(&pmu->pmu);
+
+ intel_pmu_check_extra_regs(pmu->extra_regs);
+}
+
+static struct x86_hybrid_pmu *find_hybrid_pmu_for_cpu(void)
+{
+ struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
+ enum intel_cpu_type cpu_type = c->topo.intel_type;
+ int i;
+
+ /*
+ * This is running on a CPU model that is known to have hybrid
+ * configurations. But the CPU told us it is not hybrid, shame
+ * on it. There should be a fixup function provided for these
+ * troublesome CPUs (->get_hybrid_cpu_type).
+ */
+ if (cpu_type == INTEL_CPU_TYPE_UNKNOWN) {
+ if (x86_pmu.get_hybrid_cpu_type)
+ cpu_type = x86_pmu.get_hybrid_cpu_type();
+ else
+ return NULL;
+ }
+
+ /*
+ * This essentially just maps between the 'hybrid_cpu_type'
+ * and 'hybrid_pmu_type' enums except for ARL-H processor
+ * which needs to compare atom uarch native id since ARL-H
+ * contains two different atom uarchs.
+ */
+ for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
+ enum hybrid_pmu_type pmu_type = x86_pmu.hybrid_pmu[i].pmu_type;
+ u32 native_id;
+
+ if (cpu_type == INTEL_CPU_TYPE_CORE && pmu_type == hybrid_big)
+ return &x86_pmu.hybrid_pmu[i];
+ if (cpu_type == INTEL_CPU_TYPE_ATOM) {
+ if (x86_pmu.num_hybrid_pmus == 2 && pmu_type == hybrid_small)
+ return &x86_pmu.hybrid_pmu[i];
+
+ native_id = c->topo.intel_native_model_id;
+ if (native_id == INTEL_ATOM_SKT_NATIVE_ID && pmu_type == hybrid_small)
+ return &x86_pmu.hybrid_pmu[i];
+ if (native_id == INTEL_ATOM_CMT_NATIVE_ID && pmu_type == hybrid_tiny)
+ return &x86_pmu.hybrid_pmu[i];
+ }
+ }
+
+ return NULL;
+}
+
+static bool init_hybrid_pmu(int cpu)
+{
+ struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+ struct x86_hybrid_pmu *pmu = find_hybrid_pmu_for_cpu();
+
+ if (WARN_ON_ONCE(!pmu || (pmu->pmu.type == -1))) {
+ cpuc->pmu = NULL;
+ return false;
+ }
+
+ /* Only check and dump the PMU information for the first CPU */
+ if (!cpumask_empty(&pmu->supported_cpus))
+ goto end;
+
+ if (this_cpu_has(X86_FEATURE_ARCH_PERFMON_EXT))
+ update_pmu_cap(&pmu->pmu);
+
+ intel_pmu_check_hybrid_pmus(pmu);
+
+ if (!check_hw_exists(&pmu->pmu, pmu->cntr_mask, pmu->fixed_cntr_mask))
+ return false;
+
+ pr_info("%s PMU driver: ", pmu->name);
+
+ pr_cont("\n");
+
+ x86_pmu_show_pmu_cap(&pmu->pmu);
+
+end:
+ cpumask_set_cpu(cpu, &pmu->supported_cpus);
+ cpuc->pmu = &pmu->pmu;
+
+ return true;
+}
+
+static void intel_pmu_cpu_starting(int cpu)
+{
+ struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+ int core_id = topology_core_id(cpu);
+ int i;
+
+ if (is_hybrid() && !init_hybrid_pmu(cpu))
+ return;
+
+ init_debug_store_on_cpu(cpu);
+ init_arch_pebs_on_cpu(cpu);
+ /*
+ * Deal with CPUs that don't clear their LBRs on power-up, and that may
+ * even boot with LBRs enabled.
+ */
+ if (!static_cpu_has(X86_FEATURE_ARCH_LBR) && x86_pmu.lbr_nr)
+ msr_clear_bit(MSR_IA32_DEBUGCTLMSR, DEBUGCTLMSR_LBR_BIT);
+ intel_pmu_lbr_reset();
+
+ cpuc->lbr_sel = NULL;
+
+ if (x86_pmu.flags & PMU_FL_TFA) {
+ WARN_ON_ONCE(cpuc->tfa_shadow);
+ cpuc->tfa_shadow = ~0ULL;
+ intel_set_tfa(cpuc, false);
+ }
+
+ if (x86_pmu.version > 1)
+ flip_smm_bit(&x86_pmu.attr_freeze_on_smi);
+
+ /*
+ * Disable perf metrics if any added CPU doesn't support it.
+ *
+ * Turn off the check for a hybrid architecture, because the
+ * architectural MSR, MSR_IA32_PERF_CAPABILITIES, only indicates
+ * the architectural features. Perf metrics is a model-specific
+ * feature for now. The corresponding bit should always be 0 on
+ * a hybrid platform, e.g., Alder Lake.
+ */
+ if (!is_hybrid() && x86_pmu.intel_cap.perf_metrics) {
+ union perf_capabilities perf_cap;
+
+ rdmsrq(MSR_IA32_PERF_CAPABILITIES, perf_cap.capabilities);
+ if (!perf_cap.perf_metrics) {
+ x86_pmu.intel_cap.perf_metrics = 0;
+ x86_pmu.intel_ctrl &= ~GLOBAL_CTRL_EN_PERF_METRICS;
+ }
+ }
+
+ __intel_update_pmu_caps(cpuc->pmu);
+
+ if (!cpuc->shared_regs)
+ return;
+
+ if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) {
+ for_each_cpu(i, topology_sibling_cpumask(cpu)) {
+ struct intel_shared_regs *pc;
+
+ pc = per_cpu(cpu_hw_events, i).shared_regs;
+ if (pc && pc->core_id == core_id) {
+ cpuc->kfree_on_online[0] = cpuc->shared_regs;
+ cpuc->shared_regs = pc;
+ break;
+ }
+ }
+ cpuc->shared_regs->core_id = core_id;
+ cpuc->shared_regs->refcnt++;
+ }
+
+ if (x86_pmu.lbr_sel_map)
+ cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
+
+ if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
+ for_each_cpu(i, topology_sibling_cpumask(cpu)) {
+ struct cpu_hw_events *sibling;
+ struct intel_excl_cntrs *c;
+
+ sibling = &per_cpu(cpu_hw_events, i);
+ c = sibling->excl_cntrs;
+ if (c && c->core_id == core_id) {
+ cpuc->kfree_on_online[1] = cpuc->excl_cntrs;
+ cpuc->excl_cntrs = c;
+ if (!sibling->excl_thread_id)
+ cpuc->excl_thread_id = 1;
+ break;
+ }
+ }
+ cpuc->excl_cntrs->core_id = core_id;
+ cpuc->excl_cntrs->refcnt++;
+ }
+}
+
+static void free_excl_cntrs(struct cpu_hw_events *cpuc)
+{
+ struct intel_excl_cntrs *c;
+
+ c = cpuc->excl_cntrs;
+ if (c) {
+ if (c->core_id == -1 || --c->refcnt == 0)
+ kfree(c);
+ cpuc->excl_cntrs = NULL;
+ }
+
+ kfree(cpuc->constraint_list);
+ cpuc->constraint_list = NULL;
+}
+
+static void intel_pmu_cpu_dying(int cpu)
+{
+ fini_debug_store_on_cpu(cpu);
+ fini_arch_pebs_on_cpu(cpu);
+}
+
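+/* Drop a CPU's refcounted shared and exclusive register state. */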
+void intel_cpuc_finish(struct cpu_hw_events *cpuc)
+{
+ struct intel_shared_regs *pc;
+
+ pc = cpuc->shared_regs;
+ if (pc) {
+ if (pc->core_id == -1 || --pc->refcnt == 0)
+ kfree(pc);
+ cpuc->shared_regs = NULL;
+ }
+
+ free_excl_cntrs(cpuc);
+}
+
+static void intel_pmu_cpu_dead(int cpu)
+{
+ struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+
+ release_arch_pebs_buf_on_cpu(cpu);
+ intel_cpuc_finish(cpuc);
+
+ if (is_hybrid() && cpuc->pmu)
+ cpumask_clear_cpu(cpu, &hybrid_pmu(cpuc->pmu)->supported_cpus);
+}
+
+static void intel_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx,
+ struct task_struct *task, bool sched_in)
+{
+ intel_pmu_pebs_sched_task(pmu_ctx, sched_in);
+ intel_pmu_lbr_sched_task(pmu_ctx, task, sched_in);
+}
+
+static int intel_pmu_check_period(struct perf_event *event, u64 value)
+{
+ return intel_pmu_has_bts_period(event, value) ? -EINVAL : 0;
+}
+
+static void intel_aux_output_init(void)
+{
+ /* See also intel_pmu_aux_output_match() */
+ if (x86_pmu.intel_cap.pebs_output_pt_available)
+ x86_pmu.assign = intel_pmu_assign_event;
+}
+
+static int intel_pmu_aux_output_match(struct perf_event *event)
+{
+ /* intel_pmu_assign_event() is needed, see intel_aux_output_init() */
+ if (!x86_pmu.intel_cap.pebs_output_pt_available)
+ return 0;
+
+ return is_intel_pt_event(event);
+}
+
+static void intel_pmu_filter(struct pmu *pmu, int cpu, bool *ret)
+{
+ struct x86_hybrid_pmu *hpmu = hybrid_pmu(pmu);
+
+ *ret = !cpumask_test_cpu(cpu, &hpmu->supported_cpus);
+}
+
+PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
+
+PMU_FORMAT_ATTR(ldlat, "config1:0-15");
+
+PMU_FORMAT_ATTR(frontend, "config1:0-23");
+
+PMU_FORMAT_ATTR(snoop_rsp, "config1:0-63");
+
+static struct attribute *intel_arch3_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_pc.attr,
+ &format_attr_any.attr,
+ &format_attr_inv.attr,
+ &format_attr_cmask.attr,
+ NULL,
+};
+
+static struct attribute *hsw_format_attr[] = {
+ &format_attr_in_tx.attr,
+ &format_attr_in_tx_cp.attr,
+ &format_attr_offcore_rsp.attr,
+ &format_attr_ldlat.attr,
+ NULL
+};
+
+static struct attribute *nhm_format_attr[] = {
+ &format_attr_offcore_rsp.attr,
+ &format_attr_ldlat.attr,
+ NULL
+};
+
+static struct attribute *slm_format_attr[] = {
+ &format_attr_offcore_rsp.attr,
+ NULL
+};
+
+static struct attribute *cmt_format_attr[] = {
+ &format_attr_offcore_rsp.attr,
+ &format_attr_ldlat.attr,
+ &format_attr_snoop_rsp.attr,
+ NULL
+};
+
+static struct attribute *skl_format_attr[] = {
+ &format_attr_frontend.attr,
+ NULL,
+};
+
+static __initconst const struct x86_pmu core_pmu = {
+ .name = "core",
+ .handle_irq = x86_pmu_handle_irq,
+ .disable_all = x86_pmu_disable_all,
+ .enable_all = core_pmu_enable_all,
+ .enable = core_pmu_enable_event,
+ .disable = x86_pmu_disable_event,
+ .hw_config = core_pmu_hw_config,
+ .schedule_events = x86_schedule_events,
+ .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
+ .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
+ .fixedctr = MSR_ARCH_PERFMON_FIXED_CTR0,
+ .event_map = intel_pmu_event_map,
+ .max_events = ARRAY_SIZE(intel_perfmon_event_map),
+ .apic = 1,
+ .large_pebs_flags = LARGE_PEBS_FLAGS,
+
+ /*
+ * Intel PMCs cannot be accessed sanely above 32-bit width,
+ * so we install an artificial 1<<31 period regardless of
+ * the generic event period:
+ */
+ .max_period = (1ULL<<31) - 1,
+ .get_event_constraints = intel_get_event_constraints,
+ .put_event_constraints = intel_put_event_constraints,
+ .event_constraints = intel_core_event_constraints,
+ .guest_get_msrs = core_guest_get_msrs,
+ .format_attrs = intel_arch_formats_attr,
+ .events_sysfs_show = intel_event_sysfs_show,
+
+ /*
+ * A virtual (or funny metal) CPU can define x86_pmu.extra_regs
+ * together with PMU version 1 and thus be using core_pmu with
+ * shared_regs. We need the following callbacks here to allocate
+ * them properly.
+ */
+ .cpu_prepare = intel_pmu_cpu_prepare,
+ .cpu_starting = intel_pmu_cpu_starting,
+ .cpu_dying = intel_pmu_cpu_dying,
+ .cpu_dead = intel_pmu_cpu_dead,
+
+ .check_period = intel_pmu_check_period,
+
+ .lbr_reset = intel_pmu_lbr_reset_64,
+ .lbr_read = intel_pmu_lbr_read_64,
+ .lbr_save = intel_pmu_lbr_save,
+ .lbr_restore = intel_pmu_lbr_restore,
+};
+
+static __initconst const struct x86_pmu intel_pmu = {
+ .name = "Intel",
+ .handle_irq = intel_pmu_handle_irq,
+ .disable_all = intel_pmu_disable_all,
+ .enable_all = intel_pmu_enable_all,
+ .enable = intel_pmu_enable_event,
+ .disable = intel_pmu_disable_event,
+ .add = intel_pmu_add_event,
+ .del = intel_pmu_del_event,
+ .read = intel_pmu_read_event,
+ .set_period = intel_pmu_set_period,
+ .update = intel_pmu_update,
+ .hw_config = intel_pmu_hw_config,
+ .schedule_events = x86_schedule_events,
+ .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
+ .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
+ .fixedctr = MSR_ARCH_PERFMON_FIXED_CTR0,
+ .event_map = intel_pmu_event_map,
+ .max_events = ARRAY_SIZE(intel_perfmon_event_map),
+ .apic = 1,
+ .large_pebs_flags = LARGE_PEBS_FLAGS,
+ /*
+ * Intel PMCs cannot be accessed sanely above 32 bit width,
+ * so we install an artificial 1<<31 period regardless of
+ * the generic event period:
+ */
+ .max_period = (1ULL << 31) - 1,
+ .get_event_constraints = intel_get_event_constraints,
+ .put_event_constraints = intel_put_event_constraints,
+ .pebs_aliases = intel_pebs_aliases_core2,
+
+ .format_attrs = intel_arch3_formats_attr,
+ .events_sysfs_show = intel_event_sysfs_show,
+
+ .cpu_prepare = intel_pmu_cpu_prepare,
+ .cpu_starting = intel_pmu_cpu_starting,
+ .cpu_dying = intel_pmu_cpu_dying,
+ .cpu_dead = intel_pmu_cpu_dead,
+
+ .guest_get_msrs = intel_guest_get_msrs,
+ .sched_task = intel_pmu_sched_task,
+
+ .check_period = intel_pmu_check_period,
+
+ .aux_output_match = intel_pmu_aux_output_match,
+
+ .lbr_reset = intel_pmu_lbr_reset_64,
+ .lbr_read = intel_pmu_lbr_read_64,
+ .lbr_save = intel_pmu_lbr_save,
+ .lbr_restore = intel_pmu_lbr_restore,
+
+ /*
+ * SMM has access to all 4 rings and while traditionally SMM code only
+ * ran in CPL0, 2021-era firmware is starting to make use of CPL3 in SMM.
+ *
+ * Since the EVENTSEL.{USR,OS} CPL filtering makes no distinction
+ * between SMM or not, this results in what should be pure userspace
+ * counters including SMM data.
+ *
+ * This is a clear privilege issue, therefore globally disable
+ * counting SMM by default.
+ */
+ .attr_freeze_on_smi = 1,
+};
+
+static __init void intel_clovertown_quirk(void)
+{
+ /*
+ * PEBS is unreliable due to:
+ *
+ * AJ67 - PEBS may experience CPL leaks
+ * AJ68 - PEBS PMI may be delayed by one event
+ * AJ69 - GLOBAL_STATUS[62] will only be set when DEBUGCTL[12] is set
+ * AJ106 - FREEZE_LBRS_ON_PMI doesn't work in combination with PEBS
+ *
+ * AJ67 could be worked around by restricting the OS/USR flags.
+ * AJ69 could be worked around by setting PMU_FREEZE_ON_PMI.
+ *
+ * AJ106 could possibly be worked around by not allowing LBR
+ * usage from PEBS, including the fixup.
+ * AJ68 could possibly be worked around by always programming
+ * a pebs_event_reset[0] value and coping with the lost events.
+ *
+ * But taken together it might just make sense to not enable PEBS on
+ * these chips.
+ */
+ pr_warn("PEBS disabled due to CPU errata\n");
+ x86_pmu.ds_pebs = 0;
+ x86_pmu.pebs_constraints = NULL;
+}
+
+static const struct x86_cpu_id isolation_ucodes[] = {
+ X86_MATCH_VFM_STEPS(INTEL_HASWELL, 3, 3, 0x0000001f),
+ X86_MATCH_VFM_STEPS(INTEL_HASWELL_L, 1, 1, 0x0000001e),
+ X86_MATCH_VFM_STEPS(INTEL_HASWELL_G, 1, 1, 0x00000015),
+ X86_MATCH_VFM_STEPS(INTEL_HASWELL_X, 2, 2, 0x00000037),
+ X86_MATCH_VFM_STEPS(INTEL_HASWELL_X, 4, 4, 0x0000000a),
+ X86_MATCH_VFM_STEPS(INTEL_BROADWELL, 4, 4, 0x00000023),
+ X86_MATCH_VFM_STEPS(INTEL_BROADWELL_G, 1, 1, 0x00000014),
+ X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 2, 2, 0x00000010),
+ X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 3, 3, 0x07000009),
+ X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 4, 4, 0x0f000009),
+ X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 5, 5, 0x0e000002),
+ X86_MATCH_VFM_STEPS(INTEL_BROADWELL_X, 1, 1, 0x0b000014),
+ X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 3, 3, 0x00000021),
+ X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 4, 7, 0x00000000),
+ X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 11, 11, 0x00000000),
+ X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_L, 3, 3, 0x0000007c),
+ X86_MATCH_VFM_STEPS(INTEL_SKYLAKE, 3, 3, 0x0000007c),
+ X86_MATCH_VFM_STEPS(INTEL_KABYLAKE, 9, 13, 0x0000004e),
+ X86_MATCH_VFM_STEPS(INTEL_KABYLAKE_L, 9, 12, 0x0000004e),
+ {}
+};
+
+static void intel_check_pebs_isolation(void)
+{
+ x86_pmu.pebs_no_isolation = !x86_match_min_microcode_rev(isolation_ucodes);
+}
+
+static __init void intel_pebs_isolation_quirk(void)
+{
+ WARN_ON_ONCE(x86_pmu.check_microcode);
+ x86_pmu.check_microcode = intel_check_pebs_isolation;
+ intel_check_pebs_isolation();
+}
+
+static const struct x86_cpu_id pebs_ucodes[] = {
+ X86_MATCH_VFM_STEPS(INTEL_SANDYBRIDGE, 7, 7, 0x00000028),
+ X86_MATCH_VFM_STEPS(INTEL_SANDYBRIDGE_X, 6, 6, 0x00000618),
+ X86_MATCH_VFM_STEPS(INTEL_SANDYBRIDGE_X, 7, 7, 0x0000070c),
+ {}
+};
+
+static bool intel_snb_pebs_broken(void)
+{
+ return !x86_match_min_microcode_rev(pebs_ucodes);
+}
+
+static void intel_snb_check_microcode(void)
+{
+ if (intel_snb_pebs_broken() == x86_pmu.pebs_broken)
+ return;
+
+ /*
+ * Serialized by the microcode lock.
+ */
+ if (x86_pmu.pebs_broken) {
+ pr_info("PEBS enabled due to microcode update\n");
+ x86_pmu.pebs_broken = 0;
+ } else {
+ pr_info("PEBS disabled due to CPU errata, please upgrade microcode\n");
+ x86_pmu.pebs_broken = 1;
+ }
+}
+
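+/* Check whether @msr falls within the LBR_FROM MSR range. */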
+static bool is_lbr_from(unsigned long msr)
+{
+ unsigned long lbr_from_nr = x86_pmu.lbr_from + x86_pmu.lbr_nr;
+
+ return x86_pmu.lbr_from <= msr && msr < lbr_from_nr;
+}
+
+/*
+ * Under certain circumstances, accessing certain MSRs may cause a #GP.
+ * This function tests whether the input MSR can be safely accessed.
+ */
+static bool check_msr(unsigned long msr, u64 mask)
+{
+ u64 val_old, val_new, val_tmp;
+
+ /*
+ * Disable the check for real HW, so we don't
+ * mess with potentially enabled registers:
+ */
+ if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return true;
+
+ /*
+ * Read the current value, change it and read it back to see if it
+ * matches, this is needed to detect certain hardware emulators
+ * (qemu/kvm) that don't trap on the MSR access and always return 0s.
+ */
+ if (rdmsrq_safe(msr, &val_old))
+ return false;
+
+ /*
+ * Only change the bits which can be updated by wrmsrq.
+ */
+ val_tmp = val_old ^ mask;
+
+ if (is_lbr_from(msr))
+ val_tmp = lbr_from_signext_quirk_wr(val_tmp);
+
+ if (wrmsrq_safe(msr, val_tmp) ||
+ rdmsrq_safe(msr, &val_new))
+ return false;
+
+ /*
+ * Quirk only affects validation in wrmsr(), so wrmsrq()'s value
+ * should equal rdmsrq()'s even with the quirk.
+ */
+ if (val_new != val_tmp)
+ return false;
+
+ if (is_lbr_from(msr))
+ val_old = lbr_from_signext_quirk_wr(val_old);
+
+ /*
+ * At this point it is certain that the MSR can be safely accessed.
+ * Restore the old value and return.
+ */
+ wrmsrq(msr, val_old);
+
+ return true;
+}
+
+static __init void intel_sandybridge_quirk(void)
+{
+ x86_pmu.check_microcode = intel_snb_check_microcode;
+ cpus_read_lock();
+ intel_snb_check_microcode();
+ cpus_read_unlock();
+}
+
+static const struct { int id; char *name; } intel_arch_events_map[] __initconst = {
+ { PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" },
+ { PERF_COUNT_HW_INSTRUCTIONS, "instructions" },
+ { PERF_COUNT_HW_BUS_CYCLES, "bus cycles" },
+ { PERF_COUNT_HW_CACHE_REFERENCES, "cache references" },
+ { PERF_COUNT_HW_CACHE_MISSES, "cache misses" },
+ { PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" },
+ { PERF_COUNT_HW_BRANCH_MISSES, "branch misses" },
+};
+
+static __init void intel_arch_events_quirk(void)
+{
+ int bit;
+
+ /* Disable events that CPUID reports as not present */
+ for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) {
+ intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0;
+ pr_warn("CPUID marked event: '%s' unavailable\n",
+ intel_arch_events_map[bit].name);
+ }
+}
+
+static __init void intel_nehalem_quirk(void)
+{
+ union cpuid10_ebx ebx;
+
+ ebx.full = x86_pmu.events_maskl;
+ if (ebx.split.no_branch_misses_retired) {
+ /*
+ * Erratum AAJ80 detected, we work it around by using
+ * the BR_MISP_EXEC.ANY event. This will over-count
+ * branch-misses, but it's still much better than the
+ * architectural event which is often completely bogus:
+ */
+ intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
+ ebx.split.no_branch_misses_retired = 0;
+ x86_pmu.events_maskl = ebx.full;
+ pr_info("CPU erratum AAJ80 worked around\n");
+ }
+}
+
+/*
+ * enable software workaround for errata:
+ * SNB: BJ122
+ * IVB: BV98
+ * HSW: HSD29
+ *
+ * Only needed when HT is enabled. However, detecting whether HT is
+ * enabled is difficult (model specific). So instead, we enable the
+ * workaround at early boot, and verify whether it is needed in a later
+ * initcall phase, once we have valid topology information to check if
+ * HT is actually enabled.
+ */
+static __init void intel_ht_bug(void)
+{
+ x86_pmu.flags |= PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED;
+
+ x86_pmu.start_scheduling = intel_start_scheduling;
+ x86_pmu.commit_scheduling = intel_commit_scheduling;
+ x86_pmu.stop_scheduling = intel_stop_scheduling;
+}
+
+EVENT_ATTR_STR(mem-loads, mem_ld_hsw, "event=0xcd,umask=0x1,ldlat=3");
+EVENT_ATTR_STR(mem-stores, mem_st_hsw, "event=0xd0,umask=0x82");
+
+/* Haswell special events */
+EVENT_ATTR_STR(tx-start, tx_start, "event=0xc9,umask=0x1");
+EVENT_ATTR_STR(tx-commit, tx_commit, "event=0xc9,umask=0x2");
+EVENT_ATTR_STR(tx-abort, tx_abort, "event=0xc9,umask=0x4");
+EVENT_ATTR_STR(tx-capacity, tx_capacity, "event=0x54,umask=0x2");
+EVENT_ATTR_STR(tx-conflict, tx_conflict, "event=0x54,umask=0x1");
+EVENT_ATTR_STR(el-start, el_start, "event=0xc8,umask=0x1");
+EVENT_ATTR_STR(el-commit, el_commit, "event=0xc8,umask=0x2");
+EVENT_ATTR_STR(el-abort, el_abort, "event=0xc8,umask=0x4");
+EVENT_ATTR_STR(el-capacity, el_capacity, "event=0x54,umask=0x2");
+EVENT_ATTR_STR(el-conflict, el_conflict, "event=0x54,umask=0x1");
+EVENT_ATTR_STR(cycles-t, cycles_t, "event=0x3c,in_tx=1");
+EVENT_ATTR_STR(cycles-ct, cycles_ct, "event=0x3c,in_tx=1,in_tx_cp=1");
+
+static struct attribute *hsw_events_attrs[] = {
+ EVENT_PTR(td_slots_issued),
+ EVENT_PTR(td_slots_retired),
+ EVENT_PTR(td_fetch_bubbles),
+ EVENT_PTR(td_total_slots),
+ EVENT_PTR(td_total_slots_scale),
+ EVENT_PTR(td_recovery_bubbles),
+ EVENT_PTR(td_recovery_bubbles_scale),
+ NULL
+};
+
+static struct attribute *hsw_mem_events_attrs[] = {
+ EVENT_PTR(mem_ld_hsw),
+ EVENT_PTR(mem_st_hsw),
+ NULL,
+};
+
+static struct attribute *hsw_tsx_events_attrs[] = {
+ EVENT_PTR(tx_start),
+ EVENT_PTR(tx_commit),
+ EVENT_PTR(tx_abort),
+ EVENT_PTR(tx_capacity),
+ EVENT_PTR(tx_conflict),
+ EVENT_PTR(el_start),
+ EVENT_PTR(el_commit),
+ EVENT_PTR(el_abort),
+ EVENT_PTR(el_capacity),
+ EVENT_PTR(el_conflict),
+ EVENT_PTR(cycles_t),
+ EVENT_PTR(cycles_ct),
+ NULL
+};
+
+EVENT_ATTR_STR(tx-capacity-read, tx_capacity_read, "event=0x54,umask=0x80");
+EVENT_ATTR_STR(tx-capacity-write, tx_capacity_write, "event=0x54,umask=0x2");
+EVENT_ATTR_STR(el-capacity-read, el_capacity_read, "event=0x54,umask=0x80");
+EVENT_ATTR_STR(el-capacity-write, el_capacity_write, "event=0x54,umask=0x2");
+
+static struct attribute *icl_events_attrs[] = {
+ EVENT_PTR(mem_ld_hsw),
+ EVENT_PTR(mem_st_hsw),
+ NULL,
+};
+
+static struct attribute *icl_td_events_attrs[] = {
+ EVENT_PTR(slots),
+ EVENT_PTR(td_retiring),
+ EVENT_PTR(td_bad_spec),
+ EVENT_PTR(td_fe_bound),
+ EVENT_PTR(td_be_bound),
+ NULL,
+};
+
+static struct attribute *icl_tsx_events_attrs[] = {
+ EVENT_PTR(tx_start),
+ EVENT_PTR(tx_abort),
+ EVENT_PTR(tx_commit),
+ EVENT_PTR(tx_capacity_read),
+ EVENT_PTR(tx_capacity_write),
+ EVENT_PTR(tx_conflict),
+ EVENT_PTR(el_start),
+ EVENT_PTR(el_abort),
+ EVENT_PTR(el_commit),
+ EVENT_PTR(el_capacity_read),
+ EVENT_PTR(el_capacity_write),
+ EVENT_PTR(el_conflict),
+ EVENT_PTR(cycles_t),
+ EVENT_PTR(cycles_ct),
+ NULL,
+};
+
+EVENT_ATTR_STR(mem-stores, mem_st_spr, "event=0xcd,umask=0x2");
+EVENT_ATTR_STR(mem-loads-aux, mem_ld_aux, "event=0x03,umask=0x82");
+
+static struct attribute *glc_events_attrs[] = {
+ EVENT_PTR(mem_ld_hsw),
+ EVENT_PTR(mem_st_spr),
+ EVENT_PTR(mem_ld_aux),
+ NULL,
+};
+
+static struct attribute *glc_td_events_attrs[] = {
+ EVENT_PTR(slots),
+ EVENT_PTR(td_retiring),
+ EVENT_PTR(td_bad_spec),
+ EVENT_PTR(td_fe_bound),
+ EVENT_PTR(td_be_bound),
+ EVENT_PTR(td_heavy_ops),
+ EVENT_PTR(td_br_mispredict),
+ EVENT_PTR(td_fetch_lat),
+ EVENT_PTR(td_mem_bound),
+ NULL,
+};
+
+static struct attribute *glc_tsx_events_attrs[] = {
+ EVENT_PTR(tx_start),
+ EVENT_PTR(tx_abort),
+ EVENT_PTR(tx_commit),
+ EVENT_PTR(tx_capacity_read),
+ EVENT_PTR(tx_capacity_write),
+ EVENT_PTR(tx_conflict),
+ EVENT_PTR(cycles_t),
+ EVENT_PTR(cycles_ct),
+ NULL,
+};
+
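+/*
+ * sysfs interface for freeze_on_smi, e.g.:
+ * echo 1 > /sys/devices/cpu/freeze_on_smi
+ * (on hybrid parts the PMU device is cpu_core/cpu_atom instead).
+ */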
+static ssize_t freeze_on_smi_show(struct device *cdev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%lu\n", x86_pmu.attr_freeze_on_smi);
+}
+
+static DEFINE_MUTEX(freeze_on_smi_mutex);
+
+static ssize_t freeze_on_smi_store(struct device *cdev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long val;
+ ssize_t ret;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val > 1)
+ return -EINVAL;
+
+ mutex_lock(&freeze_on_smi_mutex);
+
+ if (x86_pmu.attr_freeze_on_smi == val)
+ goto done;
+
+ x86_pmu.attr_freeze_on_smi = val;
+
+ cpus_read_lock();
+ on_each_cpu(flip_smm_bit, &val, 1);
+ cpus_read_unlock();
+done:
+ mutex_unlock(&freeze_on_smi_mutex);
+
+ return count;
+}
+
+static void update_tfa_sched(void *ignored)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ /*
+ * Check if PMC3 is used and, if so, force a reschedule for all
+ * event types in all contexts.
+ */
+ if (test_bit(3, cpuc->active_mask))
+ perf_pmu_resched(x86_get_pmu(smp_processor_id()));
+}
+
+static ssize_t show_sysctl_tfa(struct device *cdev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, 40, "%d\n", allow_tsx_force_abort);
+}
+
+static ssize_t set_sysctl_tfa(struct device *cdev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ bool val;
+ ssize_t ret;
+
+ ret = kstrtobool(buf, &val);
+ if (ret)
+ return ret;
+
+ /* no change */
+ if (val == allow_tsx_force_abort)
+ return count;
+
+ allow_tsx_force_abort = val;
+
+ cpus_read_lock();
+ on_each_cpu(update_tfa_sched, NULL, 1);
+ cpus_read_unlock();
+
+ return count;
+}
+
+static DEVICE_ATTR_RW(freeze_on_smi);
+
+static ssize_t branches_show(struct device *cdev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu.lbr_nr);
+}
+
+static DEVICE_ATTR_RO(branches);
+
+static ssize_t branch_counter_nr_show(struct device *cdev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", fls(x86_pmu.lbr_counters));
+}
+
+static DEVICE_ATTR_RO(branch_counter_nr);
+
+static ssize_t branch_counter_width_show(struct device *cdev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", LBR_INFO_BR_CNTR_BITS);
+}
+
+static DEVICE_ATTR_RO(branch_counter_width);
+
+static struct attribute *lbr_attrs[] = {
+ &dev_attr_branches.attr,
+ &dev_attr_branch_counter_nr.attr,
+ &dev_attr_branch_counter_width.attr,
+ NULL
+};
+
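+/*
+ * lbr_attrs[0] ("branches") only requires a non-zero LBR depth; the
+ * branch counter attributes additionally require PMU_FL_BR_CNTR.
+ */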
+static umode_t
+lbr_is_visible(struct kobject *kobj, struct attribute *attr, int i)
+{
+ /* branches */
+ if (i == 0)
+ return x86_pmu.lbr_nr ? attr->mode : 0;
+
+ return (x86_pmu.flags & PMU_FL_BR_CNTR) ? attr->mode : 0;
+}
+
+static char pmu_name_str[30];
+
+static DEVICE_STRING_ATTR_RO(pmu_name, 0444, pmu_name_str);
+
+static struct attribute *intel_pmu_caps_attrs[] = {
+ &dev_attr_pmu_name.attr.attr,
+ NULL
+};
+
+static DEVICE_ATTR(allow_tsx_force_abort, 0644,
+ show_sysctl_tfa,
+ set_sysctl_tfa);
+
+static struct attribute *intel_pmu_attrs[] = {
+ &dev_attr_freeze_on_smi.attr,
+ &dev_attr_allow_tsx_force_abort.attr,
+ NULL,
+};
+
+static umode_t
+default_is_visible(struct kobject *kobj, struct attribute *attr, int i)
+{
+ if (attr == &dev_attr_allow_tsx_force_abort.attr)
+ return x86_pmu.flags & PMU_FL_TFA ? attr->mode : 0;
+
+ return attr->mode;
+}
+
+static umode_t
+tsx_is_visible(struct kobject *kobj, struct attribute *attr, int i)
+{
+ return boot_cpu_has(X86_FEATURE_RTM) ? attr->mode : 0;
+}
+
+static umode_t
+pebs_is_visible(struct kobject *kobj, struct attribute *attr, int i)
+{
+ return intel_pmu_has_pebs() ? attr->mode : 0;
+}
+
+static umode_t
+mem_is_visible(struct kobject *kobj, struct attribute *attr, int i)
+{
+ if (attr == &event_attr_mem_ld_aux.attr.attr)
+ return x86_pmu.flags & PMU_FL_MEM_LOADS_AUX ? attr->mode : 0;
+
+ return pebs_is_visible(kobj, attr, i);
+}
+
+static umode_t
+extra_is_visible(struct kobject *kobj, struct attribute *attr, int i)
+{
+ return x86_pmu.version >= 2 ? attr->mode : 0;
+}
+
+static umode_t
+td_is_visible(struct kobject *kobj, struct attribute *attr, int i)
+{
+ /*
+ * Hide the perf metrics topdown events
+ * if the feature is not enumerated.
+ */
+ if (x86_pmu.num_topdown_events)
+ return x86_pmu.intel_cap.perf_metrics ? attr->mode : 0;
+
+ return attr->mode;
+}
+
+PMU_FORMAT_ATTR(acr_mask, "config2:0-63");
+
+static struct attribute *format_acr_attrs[] = {
+ &format_attr_acr_mask.attr,
+ NULL
+};
+
+static umode_t
+acr_is_visible(struct kobject *kobj, struct attribute *attr, int i)
+{
+ struct device *dev = kobj_to_dev(kobj);
+
+ return intel_pmu_has_acr(dev_get_drvdata(dev)) ? attr->mode : 0;
+}
+
+static struct attribute_group group_events_td = {
+ .name = "events",
+ .is_visible = td_is_visible,
+};
+
+static struct attribute_group group_events_mem = {
+ .name = "events",
+ .is_visible = mem_is_visible,
+};
+
+static struct attribute_group group_events_tsx = {
+ .name = "events",
+ .is_visible = tsx_is_visible,
+};
+
+static struct attribute_group group_caps_gen = {
+ .name = "caps",
+ .attrs = intel_pmu_caps_attrs,
+};
+
+static struct attribute_group group_caps_lbr = {
+ .name = "caps",
+ .attrs = lbr_attrs,
+ .is_visible = lbr_is_visible,
+};
+
+static struct attribute_group group_format_extra = {
+ .name = "format",
+ .is_visible = extra_is_visible,
+};
+
+static struct attribute_group group_format_extra_skl = {
+ .name = "format",
+ .is_visible = extra_is_visible,
+};
+
+static struct attribute_group group_format_evtsel_ext = {
+ .name = "format",
+ .attrs = format_evtsel_ext_attrs,
+ .is_visible = evtsel_ext_is_visible,
+};
+
+static struct attribute_group group_format_acr = {
+ .name = "format",
+ .attrs = format_acr_attrs,
+ .is_visible = acr_is_visible,
+};
+
+static struct attribute_group group_default = {
+ .attrs = intel_pmu_attrs,
+ .is_visible = default_is_visible,
+};
+
+static const struct attribute_group *attr_update[] = {
+ &group_events_td,
+ &group_events_mem,
+ &group_events_tsx,
+ &group_caps_gen,
+ &group_caps_lbr,
+ &group_format_extra,
+ &group_format_extra_skl,
+ &group_format_evtsel_ext,
+ &group_format_acr,
+ &group_default,
+ NULL,
+};
+
+EVENT_ATTR_STR_HYBRID(slots, slots_adl, "event=0x00,umask=0x4", hybrid_big);
+EVENT_ATTR_STR_HYBRID(topdown-retiring, td_retiring_adl, "event=0xc2,umask=0x0;event=0x00,umask=0x80", hybrid_big_small);
+EVENT_ATTR_STR_HYBRID(topdown-bad-spec, td_bad_spec_adl, "event=0x73,umask=0x0;event=0x00,umask=0x81", hybrid_big_small);
+EVENT_ATTR_STR_HYBRID(topdown-fe-bound, td_fe_bound_adl, "event=0x71,umask=0x0;event=0x00,umask=0x82", hybrid_big_small);
+EVENT_ATTR_STR_HYBRID(topdown-be-bound, td_be_bound_adl, "event=0x74,umask=0x0;event=0x00,umask=0x83", hybrid_big_small);
+EVENT_ATTR_STR_HYBRID(topdown-heavy-ops, td_heavy_ops_adl, "event=0x00,umask=0x84", hybrid_big);
+EVENT_ATTR_STR_HYBRID(topdown-br-mispredict, td_br_mis_adl, "event=0x00,umask=0x85", hybrid_big);
+EVENT_ATTR_STR_HYBRID(topdown-fetch-lat, td_fetch_lat_adl, "event=0x00,umask=0x86", hybrid_big);
+EVENT_ATTR_STR_HYBRID(topdown-mem-bound, td_mem_bound_adl, "event=0x00,umask=0x87", hybrid_big);
+
+static struct attribute *adl_hybrid_events_attrs[] = {
+ EVENT_PTR(slots_adl),
+ EVENT_PTR(td_retiring_adl),
+ EVENT_PTR(td_bad_spec_adl),
+ EVENT_PTR(td_fe_bound_adl),
+ EVENT_PTR(td_be_bound_adl),
+ EVENT_PTR(td_heavy_ops_adl),
+ EVENT_PTR(td_br_mis_adl),
+ EVENT_PTR(td_fetch_lat_adl),
+ EVENT_PTR(td_mem_bound_adl),
+ NULL,
+};
+
+EVENT_ATTR_STR_HYBRID(topdown-retiring, td_retiring_lnl, "event=0xc2,umask=0x02;event=0x00,umask=0x80", hybrid_big_small);
+EVENT_ATTR_STR_HYBRID(topdown-fe-bound, td_fe_bound_lnl, "event=0x9c,umask=0x01;event=0x00,umask=0x82", hybrid_big_small);
+EVENT_ATTR_STR_HYBRID(topdown-be-bound, td_be_bound_lnl, "event=0xa4,umask=0x02;event=0x00,umask=0x83", hybrid_big_small);
+
+static struct attribute *lnl_hybrid_events_attrs[] = {
+ EVENT_PTR(slots_adl),
+ EVENT_PTR(td_retiring_lnl),
+ EVENT_PTR(td_bad_spec_adl),
+ EVENT_PTR(td_fe_bound_lnl),
+ EVENT_PTR(td_be_bound_lnl),
+ EVENT_PTR(td_heavy_ops_adl),
+ EVENT_PTR(td_br_mis_adl),
+ EVENT_PTR(td_fetch_lat_adl),
+ EVENT_PTR(td_mem_bound_adl),
+ NULL
+};
+
+/* The event string must be in PMU IDX order. */
+EVENT_ATTR_STR_HYBRID(topdown-retiring,
+ td_retiring_arl_h,
+ "event=0xc2,umask=0x02;event=0x00,umask=0x80;event=0xc2,umask=0x0",
+ hybrid_big_small_tiny);
+EVENT_ATTR_STR_HYBRID(topdown-bad-spec,
+ td_bad_spec_arl_h,
+ "event=0x73,umask=0x0;event=0x00,umask=0x81;event=0x73,umask=0x0",
+ hybrid_big_small_tiny);
+EVENT_ATTR_STR_HYBRID(topdown-fe-bound,
+ td_fe_bound_arl_h,
+ "event=0x9c,umask=0x01;event=0x00,umask=0x82;event=0x71,umask=0x0",
+ hybrid_big_small_tiny);
+EVENT_ATTR_STR_HYBRID(topdown-be-bound,
+ td_be_bound_arl_h,
+ "event=0xa4,umask=0x02;event=0x00,umask=0x83;event=0x74,umask=0x0",
+ hybrid_big_small_tiny);
+
+static struct attribute *arl_h_hybrid_events_attrs[] = {
+ EVENT_PTR(slots_adl),
+ EVENT_PTR(td_retiring_arl_h),
+ EVENT_PTR(td_bad_spec_arl_h),
+ EVENT_PTR(td_fe_bound_arl_h),
+ EVENT_PTR(td_be_bound_arl_h),
+ EVENT_PTR(td_heavy_ops_adl),
+ EVENT_PTR(td_br_mis_adl),
+ EVENT_PTR(td_fetch_lat_adl),
+ EVENT_PTR(td_mem_bound_adl),
+ NULL,
+};
+
+/* Must be in IDX order */
+EVENT_ATTR_STR_HYBRID(mem-loads, mem_ld_adl, "event=0xd0,umask=0x5,ldlat=3;event=0xcd,umask=0x1,ldlat=3", hybrid_big_small);
+EVENT_ATTR_STR_HYBRID(mem-stores, mem_st_adl, "event=0xd0,umask=0x6;event=0xcd,umask=0x2", hybrid_big_small);
+EVENT_ATTR_STR_HYBRID(mem-loads-aux, mem_ld_aux_adl, "event=0x03,umask=0x82", hybrid_big);
+
+static struct attribute *adl_hybrid_mem_attrs[] = {
+ EVENT_PTR(mem_ld_adl),
+ EVENT_PTR(mem_st_adl),
+ EVENT_PTR(mem_ld_aux_adl),
+ NULL,
+};
+
+static struct attribute *mtl_hybrid_mem_attrs[] = {
+ EVENT_PTR(mem_ld_adl),
+ EVENT_PTR(mem_st_adl),
+ NULL
+};
+
+EVENT_ATTR_STR_HYBRID(mem-loads,
+ mem_ld_arl_h,
+ "event=0xd0,umask=0x5,ldlat=3;event=0xcd,umask=0x1,ldlat=3;event=0xd0,umask=0x5,ldlat=3",
+ hybrid_big_small_tiny);
+EVENT_ATTR_STR_HYBRID(mem-stores,
+ mem_st_arl_h,
+ "event=0xd0,umask=0x6;event=0xcd,umask=0x2;event=0xd0,umask=0x6",
+ hybrid_big_small_tiny);
+
+static struct attribute *arl_h_hybrid_mem_attrs[] = {
+ EVENT_PTR(mem_ld_arl_h),
+ EVENT_PTR(mem_st_arl_h),
+ NULL,
+};
+
+EVENT_ATTR_STR_HYBRID(tx-start, tx_start_adl, "event=0xc9,umask=0x1", hybrid_big);
+EVENT_ATTR_STR_HYBRID(tx-commit, tx_commit_adl, "event=0xc9,umask=0x2", hybrid_big);
+EVENT_ATTR_STR_HYBRID(tx-abort, tx_abort_adl, "event=0xc9,umask=0x4", hybrid_big);
+EVENT_ATTR_STR_HYBRID(tx-conflict, tx_conflict_adl, "event=0x54,umask=0x1", hybrid_big);
+EVENT_ATTR_STR_HYBRID(cycles-t, cycles_t_adl, "event=0x3c,in_tx=1", hybrid_big);
+EVENT_ATTR_STR_HYBRID(cycles-ct, cycles_ct_adl, "event=0x3c,in_tx=1,in_tx_cp=1", hybrid_big);
+EVENT_ATTR_STR_HYBRID(tx-capacity-read, tx_capacity_read_adl, "event=0x54,umask=0x80", hybrid_big);
+EVENT_ATTR_STR_HYBRID(tx-capacity-write, tx_capacity_write_adl, "event=0x54,umask=0x2", hybrid_big);
+
+static struct attribute *adl_hybrid_tsx_attrs[] = {
+ EVENT_PTR(tx_start_adl),
+ EVENT_PTR(tx_abort_adl),
+ EVENT_PTR(tx_commit_adl),
+ EVENT_PTR(tx_capacity_read_adl),
+ EVENT_PTR(tx_capacity_write_adl),
+ EVENT_PTR(tx_conflict_adl),
+ EVENT_PTR(cycles_t_adl),
+ EVENT_PTR(cycles_ct_adl),
+ NULL,
+};
+
+FORMAT_ATTR_HYBRID(in_tx, hybrid_big);
+FORMAT_ATTR_HYBRID(in_tx_cp, hybrid_big);
+FORMAT_ATTR_HYBRID(offcore_rsp, hybrid_big_small_tiny);
+FORMAT_ATTR_HYBRID(ldlat, hybrid_big_small_tiny);
+FORMAT_ATTR_HYBRID(frontend, hybrid_big);
+
+#define ADL_HYBRID_RTM_FORMAT_ATTR \
+ FORMAT_HYBRID_PTR(in_tx), \
+ FORMAT_HYBRID_PTR(in_tx_cp)
+
+#define ADL_HYBRID_FORMAT_ATTR \
+ FORMAT_HYBRID_PTR(offcore_rsp), \
+ FORMAT_HYBRID_PTR(ldlat), \
+ FORMAT_HYBRID_PTR(frontend)
+
+static struct attribute *adl_hybrid_extra_attr_rtm[] = {
+ ADL_HYBRID_RTM_FORMAT_ATTR,
+ ADL_HYBRID_FORMAT_ATTR,
+ NULL
+};
+
+static struct attribute *adl_hybrid_extra_attr[] = {
+ ADL_HYBRID_FORMAT_ATTR,
+ NULL
+};
+
+FORMAT_ATTR_HYBRID(snoop_rsp, hybrid_small_tiny);
+
+static struct attribute *mtl_hybrid_extra_attr_rtm[] = {
+ ADL_HYBRID_RTM_FORMAT_ATTR,
+ ADL_HYBRID_FORMAT_ATTR,
+ FORMAT_HYBRID_PTR(snoop_rsp),
+ NULL
+};
+
+static struct attribute *mtl_hybrid_extra_attr[] = {
+ ADL_HYBRID_FORMAT_ATTR,
+ FORMAT_HYBRID_PTR(snoop_rsp),
+ NULL
+};
+
+static bool is_attr_for_this_pmu(struct kobject *kobj, struct attribute *attr)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct x86_hybrid_pmu *pmu =
+ container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu);
+ struct perf_pmu_events_hybrid_attr *pmu_attr =
+ container_of(attr, struct perf_pmu_events_hybrid_attr, attr.attr);
+
+ return pmu->pmu_type & pmu_attr->pmu_type;
+}
+
+static umode_t hybrid_events_is_visible(struct kobject *kobj,
+ struct attribute *attr, int i)
+{
+ return is_attr_for_this_pmu(kobj, attr) ? attr->mode : 0;
+}
+
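+/* Return the first online CPU of the hybrid PMU, or -1 if none is online. */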
+static inline int hybrid_find_supported_cpu(struct x86_hybrid_pmu *pmu)
+{
+ int cpu = cpumask_first(&pmu->supported_cpus);
+
+ return (cpu >= nr_cpu_ids) ? -1 : cpu;
+}
+
+static umode_t hybrid_tsx_is_visible(struct kobject *kobj,
+ struct attribute *attr, int i)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct x86_hybrid_pmu *pmu =
+ container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu);
+ int cpu = hybrid_find_supported_cpu(pmu);
+
+ return (cpu >= 0) && is_attr_for_this_pmu(kobj, attr) && cpu_has(&cpu_data(cpu), X86_FEATURE_RTM) ? attr->mode : 0;
+}
+
+static umode_t hybrid_format_is_visible(struct kobject *kobj,
+ struct attribute *attr, int i)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct x86_hybrid_pmu *pmu =
+ container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu);
+ struct perf_pmu_format_hybrid_attr *pmu_attr =
+ container_of(attr, struct perf_pmu_format_hybrid_attr, attr.attr);
+ int cpu = hybrid_find_supported_cpu(pmu);
+
+ return (cpu >= 0) && (pmu->pmu_type & pmu_attr->pmu_type) ? attr->mode : 0;
+}
+
+static umode_t hybrid_td_is_visible(struct kobject *kobj,
+ struct attribute *attr, int i)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct x86_hybrid_pmu *pmu =
+ container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu);
+
+ if (!is_attr_for_this_pmu(kobj, attr))
+ return 0;
+
+ /* Only the big core supports perf metrics */
+ if (pmu->pmu_type == hybrid_big)
+ return pmu->intel_cap.perf_metrics ? attr->mode : 0;
+
+ return attr->mode;
+}
+
+static struct attribute_group hybrid_group_events_td = {
+ .name = "events",
+ .is_visible = hybrid_td_is_visible,
+};
+
+static struct attribute_group hybrid_group_events_mem = {
+ .name = "events",
+ .is_visible = hybrid_events_is_visible,
+};
+
+static struct attribute_group hybrid_group_events_tsx = {
+ .name = "events",
+ .is_visible = hybrid_tsx_is_visible,
+};
+
+static struct attribute_group hybrid_group_format_extra = {
+ .name = "format",
+ .is_visible = hybrid_format_is_visible,
+};
+
+static ssize_t intel_hybrid_get_attr_cpus(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct x86_hybrid_pmu *pmu =
+ container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu);
+
+ return cpumap_print_to_pagebuf(true, buf, &pmu->supported_cpus);
+}
+
+static DEVICE_ATTR(cpus, S_IRUGO, intel_hybrid_get_attr_cpus, NULL);
+static struct attribute *intel_hybrid_cpus_attrs[] = {
+ &dev_attr_cpus.attr,
+ NULL,
+};
+
+static struct attribute_group hybrid_group_cpus = {
+ .attrs = intel_hybrid_cpus_attrs,
+};
+
+static const struct attribute_group *hybrid_attr_update[] = {
+ &hybrid_group_events_td,
+ &hybrid_group_events_mem,
+ &hybrid_group_events_tsx,
+ &group_caps_gen,
+ &group_caps_lbr,
+ &hybrid_group_format_extra,
+ &group_format_evtsel_ext,
+ &group_format_acr,
+ &group_default,
+ &hybrid_group_cpus,
+ NULL,
+};
+
+static struct attribute *empty_attrs;
+
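+/*
+ * Sanitize the static event constraint tables against the counters that
+ * CPUID actually enumerated: clear topdown and fixed-counter bits which
+ * do not exist on this CPU and recompute the constraint weights.
+ */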
+static void intel_pmu_check_event_constraints(struct event_constraint *event_constraints,
+ u64 cntr_mask,
+ u64 fixed_cntr_mask,
+ u64 intel_ctrl)
+{
+ struct event_constraint *c;
+
+ if (!event_constraints)
+ return;
+
+ /*
+ * The event on fixed counter 2 (REF_CYCLES) only works on that
+ * counter, so do not extend the mask to the generic counters.
+ */
+ for_each_event_constraint(c, event_constraints) {
+ /*
+ * Don't extend the topdown slots and metrics
+ * events to the generic counters.
+ */
+ if (c->idxmsk64 & INTEL_PMC_MSK_TOPDOWN) {
+ /*
+ * Disable the topdown slots and metrics events
+ * if the slots event is not in CPUID.
+ */
+ if (!(INTEL_PMC_MSK_FIXED_SLOTS & intel_ctrl))
+ c->idxmsk64 = 0;
+ c->weight = hweight64(c->idxmsk64);
+ continue;
+ }
+
+ if (c->cmask == FIXED_EVENT_FLAGS) {
+ /* Disable fixed counters which are not in CPUID */
+ c->idxmsk64 &= intel_ctrl;
+
+ /*
+ * Don't extend the pseudo-encoding to the
+ * generic counters
+ */
+ if (!use_fixed_pseudo_encoding(c->code))
+ c->idxmsk64 |= cntr_mask;
+ }
+ c->idxmsk64 &= cntr_mask | (fixed_cntr_mask << INTEL_PMC_IDX_FIXED);
+ c->weight = hweight64(c->idxmsk64);
+ }
+}
+
+static void intel_pmu_check_extra_regs(struct extra_reg *extra_regs)
+{
+ struct extra_reg *er;
+
+ /*
+ * Accessing an extra MSR may cause a #GP under certain circumstances,
+ * e.g. KVM doesn't support the offcore event.
+ * Check all extra_regs here.
+ */
+ if (!extra_regs)
+ return;
+
+ for (er = extra_regs; er->msr; er++) {
+ er->extra_msr_access = check_msr(er->msr, 0x11UL);
+ /* Disable LBR select mapping */
+ if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access)
+ x86_pmu.lbr_sel_map = NULL;
+ }
+}
+
+static inline int intel_pmu_v6_addr_offset(int index, bool eventsel)
+{
+ return MSR_IA32_PMC_V6_STEP * index;
+}
+
+static const struct { enum hybrid_pmu_type id; char *name; } intel_hybrid_pmu_type_map[] __initconst = {
+ { hybrid_small, "cpu_atom" },
+ { hybrid_big, "cpu_core" },
+ { hybrid_tiny, "cpu_lowpower" },
+};
+
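+/*
+ * Allocate one struct x86_hybrid_pmu per PMU type in @pmus and
+ * pre-initialize it with the boot CPU's counter masks and capabilities;
+ * the per-type details are refined when the first CPU comes online.
+ */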
+static __always_inline int intel_pmu_init_hybrid(enum hybrid_pmu_type pmus)
+{
+ unsigned long pmus_mask = pmus;
+ struct x86_hybrid_pmu *pmu;
+ int idx = 0, bit;
+
+ x86_pmu.num_hybrid_pmus = hweight_long(pmus_mask);
+ x86_pmu.hybrid_pmu = kcalloc(x86_pmu.num_hybrid_pmus,
+ sizeof(struct x86_hybrid_pmu),
+ GFP_KERNEL);
+ if (!x86_pmu.hybrid_pmu)
+ return -ENOMEM;
+
+ static_branch_enable(&perf_is_hybrid);
+ x86_pmu.filter = intel_pmu_filter;
+
+ for_each_set_bit(bit, &pmus_mask, ARRAY_SIZE(intel_hybrid_pmu_type_map)) {
+ pmu = &x86_pmu.hybrid_pmu[idx++];
+ pmu->pmu_type = intel_hybrid_pmu_type_map[bit].id;
+ pmu->name = intel_hybrid_pmu_type_map[bit].name;
+
+ pmu->cntr_mask64 = x86_pmu.cntr_mask64;
+ pmu->fixed_cntr_mask64 = x86_pmu.fixed_cntr_mask64;
+ pmu->pebs_events_mask = intel_pmu_pebs_mask(pmu->cntr_mask64);
+ pmu->config_mask = X86_RAW_EVENT_MASK;
+ pmu->unconstrained = (struct event_constraint)
+ __EVENT_CONSTRAINT(0, pmu->cntr_mask64,
+ 0, x86_pmu_num_counters(&pmu->pmu), 0, 0);
+
+ pmu->intel_cap.capabilities = x86_pmu.intel_cap.capabilities;
+ if (pmu->pmu_type & hybrid_small_tiny) {
+ pmu->intel_cap.perf_metrics = 0;
+ pmu->mid_ack = true;
+ } else if (pmu->pmu_type & hybrid_big) {
+ pmu->intel_cap.perf_metrics = 1;
+ pmu->late_ack = true;
+ }
+ }
+
+ return 0;
+}
+
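+/*
+ * If CPUID enumerates the architectural reference-cycles event, use its
+ * real encoding (0x013c) instead of the fixed-counter pseudo-encoding,
+ * so the event can also be scheduled on the generic counters.
+ */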
+static __always_inline void intel_pmu_ref_cycles_ext(void)
+{
+ if (!(x86_pmu.events_maskl & (INTEL_PMC_MSK_FIXED_REF_CYCLES >> INTEL_PMC_IDX_FIXED)))
+ intel_perfmon_event_map[PERF_COUNT_HW_REF_CPU_CYCLES] = 0x013c;
+}
+
+static __always_inline void intel_pmu_init_glc(struct pmu *pmu)
+{
+ x86_pmu.late_ack = true;
+ x86_pmu.limit_period = glc_limit_period;
+ x86_pmu.pebs_aliases = NULL;
+ x86_pmu.pebs_prec_dist = true;
+ x86_pmu.pebs_block = true;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+ x86_pmu.flags |= PMU_FL_INSTR_LATENCY;
+ x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04);
+ x86_pmu.lbr_pt_coexist = true;
+ x86_pmu.num_topdown_events = 8;
+ static_call_update(intel_pmu_update_topdown_event,
+ &icl_update_topdown_event);
+ static_call_update(intel_pmu_set_topdown_event_period,
+ &icl_set_topdown_event_period);
+
+ memcpy(hybrid_var(pmu, hw_cache_event_ids), glc_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hybrid_var(pmu, hw_cache_extra_regs), glc_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+ hybrid(pmu, event_constraints) = intel_glc_event_constraints;
+ hybrid(pmu, pebs_constraints) = intel_glc_pebs_event_constraints;
+
+ intel_pmu_ref_cycles_ext();
+}
+
+static __always_inline void intel_pmu_init_grt(struct pmu *pmu)
+{
+ x86_pmu.mid_ack = true;
+ x86_pmu.limit_period = glc_limit_period;
+ x86_pmu.pebs_aliases = NULL;
+ x86_pmu.pebs_prec_dist = true;
+ x86_pmu.pebs_block = true;
+ x86_pmu.lbr_pt_coexist = true;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_INSTR_LATENCY;
+
+ memcpy(hybrid_var(pmu, hw_cache_event_ids), glp_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hybrid_var(pmu, hw_cache_extra_regs), tnt_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+ hybrid_var(pmu, hw_cache_event_ids)[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1;
+ hybrid(pmu, event_constraints) = intel_grt_event_constraints;
+ hybrid(pmu, pebs_constraints) = intel_grt_pebs_event_constraints;
+ hybrid(pmu, extra_regs) = intel_grt_extra_regs;
+
+ intel_pmu_ref_cycles_ext();
+}
+
+static __always_inline void intel_pmu_init_lnc(struct pmu *pmu)
+{
+ intel_pmu_init_glc(pmu);
+ hybrid(pmu, event_constraints) = intel_lnc_event_constraints;
+ hybrid(pmu, pebs_constraints) = intel_lnc_pebs_event_constraints;
+ hybrid(pmu, extra_regs) = intel_lnc_extra_regs;
+}
+
+static __always_inline void intel_pmu_init_skt(struct pmu *pmu)
+{
+ intel_pmu_init_grt(pmu);
+ hybrid(pmu, event_constraints) = intel_skt_event_constraints;
+ hybrid(pmu, extra_regs) = intel_cmt_extra_regs;
+ static_call_update(intel_pmu_enable_acr_event, intel_pmu_enable_acr);
+}
+
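+/*
+ * Probe the PMU via CPUID leaf 0xA, select core_pmu (v1) or intel_pmu
+ * (v2+), and apply the model-specific event tables and quirks.
+ */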
+__init int intel_pmu_init(void)
+{
+ struct attribute **extra_skl_attr = &empty_attrs;
+ struct attribute **extra_attr = &empty_attrs;
+ struct attribute **td_attr = &empty_attrs;
+ struct attribute **mem_attr = &empty_attrs;
+ struct attribute **tsx_attr = &empty_attrs;
+ union cpuid10_edx edx;
+ union cpuid10_eax eax;
+ union cpuid10_ebx ebx;
+ unsigned int fixed_mask;
+ bool pmem = false;
+ int version, i;
+ char *name;
+ struct x86_hybrid_pmu *pmu;
+
+ /* Architectural Perfmon was introduced starting with Core "Yonah" */
+ if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
+ switch (boot_cpu_data.x86) {
+ case 6:
+ if (boot_cpu_data.x86_vfm < INTEL_CORE_YONAH)
+ return p6_pmu_init();
+ break;
+ case 11:
+ return knc_pmu_init();
+ case 15:
+ return p4_pmu_init();
+ }
+
+ pr_cont("unsupported CPU family %d model %d ",
+ boot_cpu_data.x86, boot_cpu_data.x86_model);
+ return -ENODEV;
+ }
+
+ /*
+ * Check whether the Architectural PerfMon supports
+ * the Branch Misses Retired hw_event or not.
+ */
+ cpuid(10, &eax.full, &ebx.full, &fixed_mask, &edx.full);
+ if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT)
+ return -ENODEV;
+
+ version = eax.split.version_id;
+ if (version < 2)
+ x86_pmu = core_pmu;
+ else
+ x86_pmu = intel_pmu;
+
+ x86_pmu.version = version;
+ x86_pmu.cntr_mask64 = GENMASK_ULL(eax.split.num_counters - 1, 0);
+ x86_pmu.cntval_bits = eax.split.bit_width;
+ x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1;
+
+ x86_pmu.events_maskl = ebx.full;
+ x86_pmu.events_mask_len = eax.split.mask_length;
+
+ x86_pmu.pebs_events_mask = intel_pmu_pebs_mask(x86_pmu.cntr_mask64);
+ x86_pmu.pebs_capable = PEBS_COUNTER_MASK;
+ x86_pmu.config_mask = X86_RAW_EVENT_MASK;
+
+ /*
+ * Quirk: v2 perfmon does not report fixed-purpose events, so
+ * assume at least 3 events, when not running in a hypervisor:
+ */
+ if (version > 1 && version < 5) {
+ int assume = 3 * !boot_cpu_has(X86_FEATURE_HYPERVISOR);
+
+ x86_pmu.fixed_cntr_mask64 =
+ GENMASK_ULL(max((int)edx.split.num_counters_fixed, assume) - 1, 0);
+ } else if (version >= 5)
+ x86_pmu.fixed_cntr_mask64 = fixed_mask;
+
+ if (boot_cpu_has(X86_FEATURE_PDCM)) {
+ u64 capabilities;
+
+ rdmsrq(MSR_IA32_PERF_CAPABILITIES, capabilities);
+ x86_pmu.intel_cap.capabilities = capabilities;
+ }
+
+ if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) {
+ x86_pmu.lbr_reset = intel_pmu_lbr_reset_32;
+ x86_pmu.lbr_read = intel_pmu_lbr_read_32;
+ }
+
+ if (boot_cpu_has(X86_FEATURE_ARCH_LBR))
+ intel_pmu_arch_lbr_init();
+
+ intel_pebs_init();
+
+ x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */
+
+ if (version >= 5) {
+ x86_pmu.intel_cap.anythread_deprecated = edx.split.anythread_deprecated;
+ if (x86_pmu.intel_cap.anythread_deprecated)
+ pr_cont(" AnyThread deprecated, ");
+ }
+
+ /*
+ * Many features on and after v6 require dynamic constraints,
+ * e.g., arch-PEBS and ACR.
+ */
+ if (version >= 6) {
+ x86_pmu.flags |= PMU_FL_DYN_CONSTRAINT;
+ x86_pmu.late_setup = intel_pmu_late_setup;
+ }
+
+ /*
+ * Install the hw-cache-events table:
+ */
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_CORE_YONAH:
+ pr_cont("Core events, ");
+ name = "core";
+ break;
+
+ case INTEL_CORE2_MEROM:
+ x86_add_quirk(intel_clovertown_quirk);
+ fallthrough;
+
+ case INTEL_CORE2_MEROM_L:
+ case INTEL_CORE2_PENRYN:
+ case INTEL_CORE2_DUNNINGTON:
+ memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ intel_pmu_lbr_init_core();
+
+ x86_pmu.event_constraints = intel_core2_event_constraints;
+ x86_pmu.pebs_constraints = intel_core2_pebs_event_constraints;
+ pr_cont("Core2 events, ");
+ name = "core2";
+ break;
+
+ case INTEL_NEHALEM:
+ case INTEL_NEHALEM_EP:
+ case INTEL_NEHALEM_EX:
+ memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
+ sizeof(hw_cache_extra_regs));
+
+ intel_pmu_lbr_init_nhm();
+
+ x86_pmu.event_constraints = intel_nehalem_event_constraints;
+ x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints;
+ x86_pmu.enable_all = intel_pmu_nhm_enable_all;
+ x86_pmu.extra_regs = intel_nehalem_extra_regs;
+ x86_pmu.limit_period = nhm_limit_period;
+
+ mem_attr = nhm_mem_events_attrs;
+
+ /* UOPS_ISSUED.STALLED_CYCLES */
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+ X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
+ /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+ X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
+
+ intel_pmu_pebs_data_source_nhm();
+ x86_add_quirk(intel_nehalem_quirk);
+ x86_pmu.pebs_no_tlb = 1;
+ extra_attr = nhm_format_attr;
+
+ pr_cont("Nehalem events, ");
+ name = "nehalem";
+ break;
+
+ case INTEL_ATOM_BONNELL:
+ case INTEL_ATOM_BONNELL_MID:
+ case INTEL_ATOM_SALTWELL:
+ case INTEL_ATOM_SALTWELL_MID:
+ case INTEL_ATOM_SALTWELL_TABLET:
+ memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ intel_pmu_lbr_init_atom();
+
+ x86_pmu.event_constraints = intel_gen_event_constraints;
+ x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_core2;
+ pr_cont("Atom events, ");
+ name = "bonnell";
+ break;
+
+ case INTEL_ATOM_SILVERMONT:
+ case INTEL_ATOM_SILVERMONT_D:
+ case INTEL_ATOM_SILVERMONT_MID:
+ case INTEL_ATOM_AIRMONT:
+ case INTEL_ATOM_SILVERMONT_MID2:
+ memcpy(hw_cache_event_ids, slm_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs,
+ sizeof(hw_cache_extra_regs));
+
+ intel_pmu_lbr_init_slm();
+
+ x86_pmu.event_constraints = intel_slm_event_constraints;
+ x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_slm_extra_regs;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ td_attr = slm_events_attrs;
+ extra_attr = slm_format_attr;
+ pr_cont("Silvermont events, ");
+ name = "silvermont";
+ break;
+
+ case INTEL_ATOM_GOLDMONT:
+ case INTEL_ATOM_GOLDMONT_D:
+ memcpy(hw_cache_event_ids, glm_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, glm_hw_cache_extra_regs,
+ sizeof(hw_cache_extra_regs));
+
+ intel_pmu_lbr_init_skl();
+
+ x86_pmu.event_constraints = intel_slm_event_constraints;
+ x86_pmu.pebs_constraints = intel_glm_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_glm_extra_regs;
+ /*
+ * It's recommended to use CPU_CLK_UNHALTED.CORE_P + NPEBS
+ * for precise cycles.
+ * :pp is identical to :ppp
+ */
+ x86_pmu.pebs_aliases = NULL;
+ x86_pmu.pebs_prec_dist = true;
+ x86_pmu.lbr_pt_coexist = true;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ td_attr = glm_events_attrs;
+ extra_attr = slm_format_attr;
+ pr_cont("Goldmont events, ");
+ name = "goldmont";
+ break;
+
+ case INTEL_ATOM_GOLDMONT_PLUS:
+ memcpy(hw_cache_event_ids, glp_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, glp_hw_cache_extra_regs,
+ sizeof(hw_cache_extra_regs));
+
+ intel_pmu_lbr_init_skl();
+
+ x86_pmu.event_constraints = intel_slm_event_constraints;
+ x86_pmu.extra_regs = intel_glm_extra_regs;
+ /*
+ * It's recommended to use CPU_CLK_UNHALTED.CORE_P + NPEBS
+ * for precise cycles.
+ */
+ x86_pmu.pebs_aliases = NULL;
+ x86_pmu.pebs_prec_dist = true;
+ x86_pmu.lbr_pt_coexist = true;
+ x86_pmu.pebs_capable = ~0ULL;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_PEBS_ALL;
+ x86_pmu.get_event_constraints = glp_get_event_constraints;
+ td_attr = glm_events_attrs;
+ /* Goldmont Plus has 4-wide pipeline */
+ event_attr_td_total_slots_scale_glm.event_str = "4";
+ extra_attr = slm_format_attr;
+ pr_cont("Goldmont plus events, ");
+ name = "goldmont_plus";
+ break;
+
+ case INTEL_ATOM_TREMONT_D:
+ case INTEL_ATOM_TREMONT:
+ case INTEL_ATOM_TREMONT_L:
+ x86_pmu.late_ack = true;
+ memcpy(hw_cache_event_ids, glp_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, tnt_hw_cache_extra_regs,
+ sizeof(hw_cache_extra_regs));
+ hw_cache_event_ids[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1;
+
+ intel_pmu_lbr_init_skl();
+
+ x86_pmu.event_constraints = intel_slm_event_constraints;
+ x86_pmu.extra_regs = intel_tnt_extra_regs;
+ /*
+ * It's recommended to use CPU_CLK_UNHALTED.CORE_P + NPEBS
+ * for precise cycles.
+ */
+ x86_pmu.pebs_aliases = NULL;
+ x86_pmu.pebs_prec_dist = true;
+ x86_pmu.lbr_pt_coexist = true;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.get_event_constraints = tnt_get_event_constraints;
+ td_attr = tnt_events_attrs;
+ extra_attr = slm_format_attr;
+ pr_cont("Tremont events, ");
+ name = "tremont";
+ break;
+
+ case INTEL_ATOM_GRACEMONT:
+ intel_pmu_init_grt(NULL);
+ intel_pmu_pebs_data_source_grt();
+ x86_pmu.pebs_latency_data = grt_latency_data;
+ x86_pmu.get_event_constraints = tnt_get_event_constraints;
+ td_attr = tnt_events_attrs;
+ mem_attr = grt_mem_attrs;
+ extra_attr = nhm_format_attr;
+ pr_cont("Gracemont events, ");
+ name = "gracemont";
+ break;
+
+ case INTEL_ATOM_CRESTMONT:
+ case INTEL_ATOM_CRESTMONT_X:
+ intel_pmu_init_grt(NULL);
+ x86_pmu.extra_regs = intel_cmt_extra_regs;
+ intel_pmu_pebs_data_source_cmt();
+ x86_pmu.pebs_latency_data = cmt_latency_data;
+ x86_pmu.get_event_constraints = cmt_get_event_constraints;
+ td_attr = cmt_events_attrs;
+ mem_attr = grt_mem_attrs;
+ extra_attr = cmt_format_attr;
+ pr_cont("Crestmont events, ");
+ name = "crestmont";
+ break;
+
+ case INTEL_ATOM_DARKMONT_X:
+ intel_pmu_init_skt(NULL);
+ intel_pmu_pebs_data_source_cmt();
+ x86_pmu.pebs_latency_data = cmt_latency_data;
+ x86_pmu.get_event_constraints = cmt_get_event_constraints;
+ td_attr = skt_events_attrs;
+ mem_attr = grt_mem_attrs;
+ extra_attr = cmt_format_attr;
+ pr_cont("Darkmont events, ");
+ name = "darkmont";
+ break;
+
+ case INTEL_WESTMERE:
+ case INTEL_WESTMERE_EP:
+ case INTEL_WESTMERE_EX:
+ memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
+ sizeof(hw_cache_extra_regs));
+
+ intel_pmu_lbr_init_nhm();
+
+ x86_pmu.event_constraints = intel_westmere_event_constraints;
+ x86_pmu.enable_all = intel_pmu_nhm_enable_all;
+ x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_westmere_extra_regs;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+
+ mem_attr = nhm_mem_events_attrs;
+
+ /* UOPS_ISSUED.STALLED_CYCLES */
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+ X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
+ /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+ X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
+
+ intel_pmu_pebs_data_source_nhm();
+ extra_attr = nhm_format_attr;
+ pr_cont("Westmere events, ");
+ name = "westmere";
+ break;
+
+ case INTEL_SANDYBRIDGE:
+ case INTEL_SANDYBRIDGE_X:
+ x86_add_quirk(intel_sandybridge_quirk);
+ x86_add_quirk(intel_ht_bug);
+ memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
+ sizeof(hw_cache_extra_regs));
+
+ intel_pmu_lbr_init_snb();
+
+ x86_pmu.event_constraints = intel_snb_event_constraints;
+ x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+ if (boot_cpu_data.x86_vfm == INTEL_SANDYBRIDGE_X)
+ x86_pmu.extra_regs = intel_snbep_extra_regs;
+ else
+ x86_pmu.extra_regs = intel_snb_extra_regs;
+
+ /* all extra regs are per-cpu when HT is on */
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+ td_attr = snb_events_attrs;
+ mem_attr = snb_mem_events_attrs;
+
+ /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+ X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
+ /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+ X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1);
+
+ extra_attr = nhm_format_attr;
+
+ pr_cont("SandyBridge events, ");
+ name = "sandybridge";
+ break;
+
+ case INTEL_IVYBRIDGE:
+ case INTEL_IVYBRIDGE_X:
+ x86_add_quirk(intel_ht_bug);
+ memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+ /* dTLB-load-misses on IVB is different than SNB */
+ hw_cache_event_ids[C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = 0x8108; /* DTLB_LOAD_MISSES.DEMAND_LD_MISS_CAUSES_A_WALK */
+
+ memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
+ sizeof(hw_cache_extra_regs));
+
+ intel_pmu_lbr_init_snb();
+
+ x86_pmu.event_constraints = intel_ivb_event_constraints;
+ x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
+ x86_pmu.pebs_prec_dist = true;
+ if (boot_cpu_data.x86_vfm == INTEL_IVYBRIDGE_X)
+ x86_pmu.extra_regs = intel_snbep_extra_regs;
+ else
+ x86_pmu.extra_regs = intel_snb_extra_regs;
+ /* all extra regs are per-cpu when HT is on */
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+ td_attr = snb_events_attrs;
+ mem_attr = snb_mem_events_attrs;
+
+ /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+ X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
+
+ extra_attr = nhm_format_attr;
+
+ pr_cont("IvyBridge events, ");
+ name = "ivybridge";
+ break;
+
+ case INTEL_HASWELL:
+ case INTEL_HASWELL_X:
+ case INTEL_HASWELL_L:
+ case INTEL_HASWELL_G:
+ x86_add_quirk(intel_ht_bug);
+ x86_add_quirk(intel_pebs_isolation_quirk);
+ x86_pmu.late_ack = true;
+ memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+
+ intel_pmu_lbr_init_hsw();
+
+ x86_pmu.event_constraints = intel_hsw_event_constraints;
+ x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_snbep_extra_regs;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
+ x86_pmu.pebs_prec_dist = true;
+ /* all extra regs are per-cpu when HT is on */
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+ x86_pmu.hw_config = hsw_hw_config;
+ x86_pmu.get_event_constraints = hsw_get_event_constraints;
+ x86_pmu.limit_period = hsw_limit_period;
+ x86_pmu.lbr_double_abort = true;
+ extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
+ hsw_format_attr : nhm_format_attr;
+ td_attr = hsw_events_attrs;
+ mem_attr = hsw_mem_events_attrs;
+ tsx_attr = hsw_tsx_events_attrs;
+ pr_cont("Haswell events, ");
+ name = "haswell";
+ break;
+
+ case INTEL_BROADWELL:
+ case INTEL_BROADWELL_D:
+ case INTEL_BROADWELL_G:
+ case INTEL_BROADWELL_X:
+ x86_add_quirk(intel_pebs_isolation_quirk);
+ x86_pmu.late_ack = true;
+ memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+
+ /* L3_MISS_LOCAL_DRAM is BIT(26) in Broadwell */
+ hw_cache_extra_regs[C(LL)][C(OP_READ)][C(RESULT_MISS)] = HSW_DEMAND_READ |
+ BDW_L3_MISS|HSW_SNOOP_DRAM;
+ hw_cache_extra_regs[C(LL)][C(OP_WRITE)][C(RESULT_MISS)] = HSW_DEMAND_WRITE|BDW_L3_MISS|
+ HSW_SNOOP_DRAM;
+ hw_cache_extra_regs[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = HSW_DEMAND_READ|
+ BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
+ hw_cache_extra_regs[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = HSW_DEMAND_WRITE|
+ BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
+
+ intel_pmu_lbr_init_hsw();
+
+ x86_pmu.event_constraints = intel_bdw_event_constraints;
+ x86_pmu.pebs_constraints = intel_bdw_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_snbep_extra_regs;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
+ x86_pmu.pebs_prec_dist = true;
+ /* all extra regs are per-cpu when HT is on */
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+ x86_pmu.hw_config = hsw_hw_config;
+ x86_pmu.get_event_constraints = hsw_get_event_constraints;
+ x86_pmu.limit_period = bdw_limit_period;
+ extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
+ hsw_format_attr : nhm_format_attr;
+ td_attr = hsw_events_attrs;
+ mem_attr = hsw_mem_events_attrs;
+ tsx_attr = hsw_tsx_events_attrs;
+ pr_cont("Broadwell events, ");
+ name = "broadwell";
+ break;
+
+ case INTEL_XEON_PHI_KNL:
+ case INTEL_XEON_PHI_KNM:
+ memcpy(hw_cache_event_ids,
+ slm_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs,
+ knl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+ intel_pmu_lbr_init_knl();
+
+ x86_pmu.event_constraints = intel_slm_event_constraints;
+ x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_knl_extra_regs;
+
+ /* all extra regs are per-cpu when HT is on */
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+ extra_attr = slm_format_attr;
+ pr_cont("Knights Landing/Mill events, ");
+ name = "knights-landing";
+ break;
+
+ case INTEL_SKYLAKE_X:
+ pmem = true;
+ fallthrough;
+ case INTEL_SKYLAKE_L:
+ case INTEL_SKYLAKE:
+ case INTEL_KABYLAKE_L:
+ case INTEL_KABYLAKE:
+ case INTEL_COMETLAKE_L:
+ case INTEL_COMETLAKE:
+ x86_add_quirk(intel_pebs_isolation_quirk);
+ x86_pmu.late_ack = true;
+ memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+ intel_pmu_lbr_init_skl();
+
+ /* INT_MISC.RECOVERY_CYCLES has umask 1 in Skylake */
+ event_attr_td_recovery_bubbles.event_str_noht =
+ "event=0xd,umask=0x1,cmask=1";
+ event_attr_td_recovery_bubbles.event_str_ht =
+ "event=0xd,umask=0x1,cmask=1,any=1";
+
+ x86_pmu.event_constraints = intel_skl_event_constraints;
+ x86_pmu.pebs_constraints = intel_skl_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_skl_extra_regs;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_skl;
+ x86_pmu.pebs_prec_dist = true;
+ /* all extra regs are per-cpu when HT is on */
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+ x86_pmu.hw_config = hsw_hw_config;
+ x86_pmu.get_event_constraints = hsw_get_event_constraints;
+ extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
+ hsw_format_attr : nhm_format_attr;
+ extra_skl_attr = skl_format_attr;
+ td_attr = hsw_events_attrs;
+ mem_attr = hsw_mem_events_attrs;
+ tsx_attr = hsw_tsx_events_attrs;
+ intel_pmu_pebs_data_source_skl(pmem);
+
+ /*
+ * Processors with CPUID.RTM_ALWAYS_ABORT have TSX deprecated by default.
+ * TSX force-abort hooks are not required on these systems. Only deploy the
+ * workaround when the microcode has not enabled X86_FEATURE_RTM_ALWAYS_ABORT.
+ */
+ if (boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT) &&
+ !boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT)) {
+ x86_pmu.flags |= PMU_FL_TFA;
+ x86_pmu.get_event_constraints = tfa_get_event_constraints;
+ x86_pmu.enable_all = intel_tfa_pmu_enable_all;
+ x86_pmu.commit_scheduling = intel_tfa_commit_scheduling;
+ }
+
+ pr_cont("Skylake events, ");
+ name = "skylake";
+ break;
+
+ case INTEL_ICELAKE_X:
+ case INTEL_ICELAKE_D:
+ x86_pmu.pebs_ept = 1;
+ pmem = true;
+ fallthrough;
+ case INTEL_ICELAKE_L:
+ case INTEL_ICELAKE:
+ case INTEL_TIGERLAKE_L:
+ case INTEL_TIGERLAKE:
+ case INTEL_ROCKETLAKE:
+ x86_pmu.late_ack = true;
+ memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+ hw_cache_event_ids[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1;
+ intel_pmu_lbr_init_skl();
+
+ x86_pmu.event_constraints = intel_icl_event_constraints;
+ x86_pmu.pebs_constraints = intel_icl_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_icl_extra_regs;
+ x86_pmu.pebs_aliases = NULL;
+ x86_pmu.pebs_prec_dist = true;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+ x86_pmu.hw_config = hsw_hw_config;
+ x86_pmu.get_event_constraints = icl_get_event_constraints;
+ extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
+ hsw_format_attr : nhm_format_attr;
+ extra_skl_attr = skl_format_attr;
+ mem_attr = icl_events_attrs;
+ td_attr = icl_td_events_attrs;
+ tsx_attr = icl_tsx_events_attrs;
+ x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04);
+ x86_pmu.lbr_pt_coexist = true;
+ intel_pmu_pebs_data_source_skl(pmem);
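+ /*
+ * Ice Lake introduces the four level-1 topdown metrics, exposed via
+ * the PERF_METRICS MSR; wire up the ICL-specific update and period
+ * helpers for them.
+ */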
+ x86_pmu.num_topdown_events = 4;
+ static_call_update(intel_pmu_update_topdown_event,
+ &icl_update_topdown_event);
+ static_call_update(intel_pmu_set_topdown_event_period,
+ &icl_set_topdown_event_period);
+ pr_cont("Icelake events, ");
+ name = "icelake";
+ break;
+
+ case INTEL_SAPPHIRERAPIDS_X:
+ case INTEL_EMERALDRAPIDS_X:
+ x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX;
+ x86_pmu.extra_regs = intel_glc_extra_regs;
+ pr_cont("Sapphire Rapids events, ");
+ name = "sapphire_rapids";
+ goto glc_common;
+
+ case INTEL_GRANITERAPIDS_X:
+ case INTEL_GRANITERAPIDS_D:
+ x86_pmu.extra_regs = intel_rwc_extra_regs;
+ pr_cont("Granite Rapids events, ");
+ name = "granite_rapids";
+
+ glc_common:
+ intel_pmu_init_glc(NULL);
+ x86_pmu.pebs_ept = 1;
+ x86_pmu.hw_config = hsw_hw_config;
+ x86_pmu.get_event_constraints = glc_get_event_constraints;
+ extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
+ hsw_format_attr : nhm_format_attr;
+ extra_skl_attr = skl_format_attr;
+ mem_attr = glc_events_attrs;
+ td_attr = glc_td_events_attrs;
+ tsx_attr = glc_tsx_events_attrs;
+ intel_pmu_pebs_data_source_skl(true);
+ break;
+
+ case INTEL_ALDERLAKE:
+ case INTEL_ALDERLAKE_L:
+ case INTEL_RAPTORLAKE:
+ case INTEL_RAPTORLAKE_P:
+ case INTEL_RAPTORLAKE_S:
+ /*
+ * Alder Lake has 2 types of CPU, core and atom.
+ *
+ * Initialize the common PerfMon capabilities here.
+ */
+ intel_pmu_init_hybrid(hybrid_big_small);
+
+ x86_pmu.pebs_latency_data = grt_latency_data;
+ x86_pmu.get_event_constraints = adl_get_event_constraints;
+ x86_pmu.hw_config = adl_hw_config;
+ x86_pmu.get_hybrid_cpu_type = adl_get_hybrid_cpu_type;
+
+ td_attr = adl_hybrid_events_attrs;
+ mem_attr = adl_hybrid_mem_attrs;
+ tsx_attr = adl_hybrid_tsx_attrs;
+ extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
+ adl_hybrid_extra_attr_rtm : adl_hybrid_extra_attr;
+
+ /* Initialize big core specific PerfMon capabilities. */
+ pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX];
+ intel_pmu_init_glc(&pmu->pmu);
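+ /*
+ * On a hybrid part, leaf 0xA enumerates only the counter set that is
+ * common to both core types; the shifts below add the two extra GP
+ * counters and one extra fixed counter the P-core has on top of that
+ * baseline (see the quirk comment further down).
+ */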
+ if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) {
+ pmu->cntr_mask64 <<= 2;
+ pmu->cntr_mask64 |= 0x3;
+ pmu->fixed_cntr_mask64 <<= 1;
+ pmu->fixed_cntr_mask64 |= 0x1;
+ } else {
+ pmu->cntr_mask64 = x86_pmu.cntr_mask64;
+ pmu->fixed_cntr_mask64 = x86_pmu.fixed_cntr_mask64;
+ }
+
+ /*
+ * Quirk: on some Alder Lake machines, when all E-cores are disabled in
+ * the BIOS, leaf 0xA enumerates all counters of the P-cores, yet
+ * X86_FEATURE_HYBRID_CPU is still set. The code above would then
+ * mistakenly add extra counters for the P-cores. Correct the number of
+ * counters here.
+ */
+ if ((x86_pmu_num_counters(&pmu->pmu) > 8) || (x86_pmu_num_counters_fixed(&pmu->pmu) > 4)) {
+ pmu->cntr_mask64 = x86_pmu.cntr_mask64;
+ pmu->fixed_cntr_mask64 = x86_pmu.fixed_cntr_mask64;
+ }
+
+ pmu->pebs_events_mask = intel_pmu_pebs_mask(pmu->cntr_mask64);
+ pmu->unconstrained = (struct event_constraint)
+ __EVENT_CONSTRAINT(0, pmu->cntr_mask64,
+ 0, x86_pmu_num_counters(&pmu->pmu), 0, 0);
+
+ pmu->extra_regs = intel_glc_extra_regs;
+
+ /* Initialize Atom core specific PerfMon capabilities. */
+ pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX];
+ intel_pmu_init_grt(&pmu->pmu);
+
+ x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX;
+ intel_pmu_pebs_data_source_adl();
+ pr_cont("Alderlake Hybrid events, ");
+ name = "alderlake_hybrid";
+ break;
+
+ case INTEL_METEORLAKE:
+ case INTEL_METEORLAKE_L:
+ case INTEL_ARROWLAKE_U:
+ intel_pmu_init_hybrid(hybrid_big_small);
+
+ x86_pmu.pebs_latency_data = cmt_latency_data;
+ x86_pmu.get_event_constraints = mtl_get_event_constraints;
+ x86_pmu.hw_config = adl_hw_config;
+
+ td_attr = adl_hybrid_events_attrs;
+ mem_attr = mtl_hybrid_mem_attrs;
+ tsx_attr = adl_hybrid_tsx_attrs;
+ extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
+ mtl_hybrid_extra_attr_rtm : mtl_hybrid_extra_attr;
+
+ /* Initialize big core specific PerfMon capabilities. */
+ pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX];
+ intel_pmu_init_glc(&pmu->pmu);
+ pmu->extra_regs = intel_rwc_extra_regs;
+
+ /* Initialize Atom core specific PerfMon capabilities. */
+ pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX];
+ intel_pmu_init_grt(&pmu->pmu);
+ pmu->extra_regs = intel_cmt_extra_regs;
+
+ intel_pmu_pebs_data_source_mtl();
+ pr_cont("Meteorlake Hybrid events, ");
+ name = "meteorlake_hybrid";
+ break;
+
+ case INTEL_PANTHERLAKE_L:
+ case INTEL_WILDCATLAKE_L:
+ pr_cont("Pantherlake Hybrid events, ");
+ name = "pantherlake_hybrid";
+ goto lnl_common;
+
+ case INTEL_LUNARLAKE_M:
+ case INTEL_ARROWLAKE:
+ pr_cont("Lunarlake Hybrid events, ");
+ name = "lunarlake_hybrid";
+
+ lnl_common:
+ intel_pmu_init_hybrid(hybrid_big_small);
+
+ x86_pmu.pebs_latency_data = lnl_latency_data;
+ x86_pmu.get_event_constraints = mtl_get_event_constraints;
+ x86_pmu.hw_config = adl_hw_config;
+
+ td_attr = lnl_hybrid_events_attrs;
+ mem_attr = mtl_hybrid_mem_attrs;
+ tsx_attr = adl_hybrid_tsx_attrs;
+ extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
+ mtl_hybrid_extra_attr_rtm : mtl_hybrid_extra_attr;
+
+ /* Initialize big core specific PerfMon capabilities. */
+ pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX];
+ intel_pmu_init_lnc(&pmu->pmu);
+
+ /* Initialize Atom core specific PerfMon capabilities. */
+ pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX];
+ intel_pmu_init_skt(&pmu->pmu);
+
+ intel_pmu_pebs_data_source_lnl();
+ break;
+
+ case INTEL_ARROWLAKE_H:
+ intel_pmu_init_hybrid(hybrid_big_small_tiny);
+
+ x86_pmu.pebs_latency_data = arl_h_latency_data;
+ x86_pmu.get_event_constraints = arl_h_get_event_constraints;
+ x86_pmu.hw_config = arl_h_hw_config;
+
+ td_attr = arl_h_hybrid_events_attrs;
+ mem_attr = arl_h_hybrid_mem_attrs;
+ tsx_attr = adl_hybrid_tsx_attrs;
+ extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
+ mtl_hybrid_extra_attr_rtm : mtl_hybrid_extra_attr;
+
+ /* Initialize big core specific PerfMon capabilities. */
+ pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX];
+ intel_pmu_init_lnc(&pmu->pmu);
+
+ /* Initialize Atom core specific PerfMon capabilities. */
+ pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX];
+ intel_pmu_init_skt(&pmu->pmu);
+
+ /* Initialize Lower Power Atom specific PerfMon capabilities. */
+ pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_TINY_IDX];
+ intel_pmu_init_grt(&pmu->pmu);
+ pmu->extra_regs = intel_cmt_extra_regs;
+
+ intel_pmu_pebs_data_source_arl_h();
+ pr_cont("ArrowLake-H Hybrid events, ");
+ name = "arrowlake_h_hybrid";
+ break;
+
+ default:
+ switch (x86_pmu.version) {
+ case 1:
+ x86_pmu.event_constraints = intel_v1_event_constraints;
+ pr_cont("generic architected perfmon v1, ");
+ name = "generic_arch_v1";
+ break;
+ case 2:
+ case 3:
+ case 4:
+ /*
+ * default constraints for v2 and up
+ */
+ x86_pmu.event_constraints = intel_gen_event_constraints;
+ pr_cont("generic architected perfmon, ");
+ name = "generic_arch_v2+";
+ break;
+ default:
+ /*
+ * The default constraints for v5 and up can support up to
+ * 16 fixed counters. For the fixed counters 4 and later,
+ * the pseudo-encoding is applied.
+ * The constraints may be cut according to the CPUID enumeration
+ * by inserting the EVENT_CONSTRAINT_END.
+ */
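+ /*
+ * (Illustration: the pseudo-encoding for fixed counter N >= 4 is
+ * event=0x00, umask=N+1, e.g. fixed counter 4 is encoded as 0x0500.)
+ */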
+ if (fls64(x86_pmu.fixed_cntr_mask64) > INTEL_PMC_MAX_FIXED)
+ x86_pmu.fixed_cntr_mask64 &= GENMASK_ULL(INTEL_PMC_MAX_FIXED - 1, 0);
+ intel_v5_gen_event_constraints[fls64(x86_pmu.fixed_cntr_mask64)].weight = -1;
+ x86_pmu.event_constraints = intel_v5_gen_event_constraints;
+ pr_cont("generic architected perfmon, ");
+ name = "generic_arch_v5+";
+ break;
+ }
+ }
+
+ snprintf(pmu_name_str, sizeof(pmu_name_str), "%s", name);
+
+ if (!is_hybrid()) {
+ group_events_td.attrs = td_attr;
+ group_events_mem.attrs = mem_attr;
+ group_events_tsx.attrs = tsx_attr;
+ group_format_extra.attrs = extra_attr;
+ group_format_extra_skl.attrs = extra_skl_attr;
+
+ x86_pmu.attr_update = attr_update;
+ } else {
+ hybrid_group_events_td.attrs = td_attr;
+ hybrid_group_events_mem.attrs = mem_attr;
+ hybrid_group_events_tsx.attrs = tsx_attr;
+ hybrid_group_format_extra.attrs = extra_attr;
+
+ x86_pmu.attr_update = hybrid_attr_update;
+ }
+
+ /*
+ * The archPerfmonExt leaf (0x23) includes an enhanced enumeration of
+ * PMU architectural features with a per-core view. For non-hybrid,
+ * each core has the same PMU capabilities, so it is sufficient to
+ * update x86_pmu from the booting CPU. For hybrid, x86_pmu holds the
+ * common capabilities, so keep the values from leaf 0xa here; the
+ * core-specific update is done later, when a new core type comes
+ * online.
+ */
+ if (!is_hybrid() && boot_cpu_has(X86_FEATURE_ARCH_PERFMON_EXT))
+ update_pmu_cap(NULL);
+
+ if (x86_pmu.arch_pebs) {
+ static_call_update(intel_pmu_disable_event_ext,
+ intel_pmu_disable_event_ext);
+ static_call_update(intel_pmu_enable_event_ext,
+ intel_pmu_enable_event_ext);
+ pr_cont("Architectural PEBS, ");
+ }
+
+ intel_pmu_check_counters_mask(&x86_pmu.cntr_mask64,
+ &x86_pmu.fixed_cntr_mask64,
+ &x86_pmu.intel_ctrl);
+
+ /* AnyThread may be deprecated on arch perfmon v5 or later */
+ if (x86_pmu.intel_cap.anythread_deprecated)
+ x86_pmu.format_attrs = intel_arch_formats_attr;
+
+ intel_pmu_check_event_constraints_all(NULL);
+
+ /*
+ * Access LBR MSR may cause #GP under certain circumstances.
+ * Check all LBR MSR here.
+ * Disable LBR access if any LBR MSRs can not be accessed.
+ */
+ if (x86_pmu.lbr_tos && !check_msr(x86_pmu.lbr_tos, 0x3UL))
+ x86_pmu.lbr_nr = 0;
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ if (!(check_msr(x86_pmu.lbr_from + i, 0xffffUL) &&
+ check_msr(x86_pmu.lbr_to + i, 0xffffUL)))
+ x86_pmu.lbr_nr = 0;
+ }
+
+ if (x86_pmu.lbr_nr) {
+ intel_pmu_lbr_init();
+
+ pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr);
+
+ /* only support branch_stack snapshot for perfmon >= v2 */
+ if (x86_pmu.disable_all == intel_pmu_disable_all) {
+ if (boot_cpu_has(X86_FEATURE_ARCH_LBR)) {
+ static_call_update(perf_snapshot_branch_stack,
+ intel_pmu_snapshot_arch_branch_stack);
+ } else {
+ static_call_update(perf_snapshot_branch_stack,
+ intel_pmu_snapshot_branch_stack);
+ }
+ }
+ }
+
+ intel_pmu_check_extra_regs(x86_pmu.extra_regs);
+
+ /* Support full width counters using alternative MSR range */
+ if (x86_pmu.intel_cap.full_width_write) {
+ x86_pmu.max_period = x86_pmu.cntval_mask >> 1;
+ x86_pmu.perfctr = MSR_IA32_PMC0;
+ pr_cont("full-width counters, ");
+ }
+
+ /* Support V6+ MSR Aliasing */
+ if (x86_pmu.version >= 6) {
+ x86_pmu.perfctr = MSR_IA32_PMC_V6_GP0_CTR;
+ x86_pmu.eventsel = MSR_IA32_PMC_V6_GP0_CFG_A;
+ x86_pmu.fixedctr = MSR_IA32_PMC_V6_FX0_CTR;
+ x86_pmu.addr_offset = intel_pmu_v6_addr_offset;
+ }
+
+ if (!is_hybrid() && x86_pmu.intel_cap.perf_metrics)
+ x86_pmu.intel_ctrl |= GLOBAL_CTRL_EN_PERF_METRICS;
+
+ if (x86_pmu.intel_cap.pebs_timing_info)
+ x86_pmu.flags |= PMU_FL_RETIRE_LATENCY;
+
+ intel_aux_output_init();
+
+ return 0;
+}
+
+/*
+ * HT bug: phase 2 init
+ * Called once we have valid topology information, to check
+ * whether or not HT is enabled.
+ * If HT is off, the workaround is disabled.
+ */
+static __init int fixup_ht_bug(void)
+{
+ int c;
+ /*
+ * problem not present on this CPU model, nothing to do
+ */
+ if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED))
+ return 0;
+
+ if (topology_max_smt_threads() > 1) {
+ pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n");
+ return 0;
+ }
+
+ cpus_read_lock();
+
+ hardlockup_detector_perf_stop();
+
+ x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED);
+
+ x86_pmu.start_scheduling = NULL;
+ x86_pmu.commit_scheduling = NULL;
+ x86_pmu.stop_scheduling = NULL;
+
+ hardlockup_detector_perf_restart();
+
+ for_each_online_cpu(c)
+ free_excl_cntrs(&per_cpu(cpu_hw_events, c));
+
+ cpus_read_unlock();
+ pr_info("PMU erratum BJ122, BV98, HSD29 workaround disabled, HT off\n");
+ return 0;
+}
+subsys_initcall(fixup_ht_bug)
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
new file mode 100644
index 000000000000..fa67fda6e45b
--- /dev/null
+++ b/arch/x86/events/intel/cstate.c
@@ -0,0 +1,768 @@
+/*
+ * Support cstate residency counters
+ *
+ * Copyright (C) 2015, Intel Corp.
+ * Author: Kan Liang (kan.liang@intel.com)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ */
+
+/*
+ * This file exports cstate-related free-running (read-only) counters
+ * for perf. These counters may be used simultaneously by other tools,
+ * such as turbostat. However, it still makes sense to implement them
+ * in perf, because we can conveniently collect them together with
+ * other events and use them from tools without special MSR access
+ * code.
+ *
+ * The events only support system-wide mode counting. There is no
+ * sampling support because it is not supported by the hardware.
+ *
+ * According to counters' scope and category, two PMUs are registered
+ * with the perf_event core subsystem.
+ * - 'cstate_core': The counter is available for each physical core.
+ * The counters include CORE_C*_RESIDENCY.
+ * - 'cstate_pkg': The counter is available for each physical package.
+ * The counters include PKG_C*_RESIDENCY.
+ *
+ * All of these counters are specified in the Intel® 64 and IA-32
+ * Architectures Software Developer's Manual Vol3b.
+ *
+ * Model specific counters:
+ * MSR_CORE_C1_RES: CORE C1 Residency Counter
+ * perf code: 0x00
+ * Available model: SLM,AMT,GLM,CNL,ICX,TNT,ADL,RPL
+ * MTL,SRF,GRR,ARL,LNL,PTL
+ * Scope: Core (each processor core has an MSR)
+ * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter
+ * perf code: 0x01
+ * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,GLM,
+ * CNL,KBL,CML,TNT
+ * Scope: Core
+ * MSR_CORE_C6_RESIDENCY: CORE C6 Residency Counter
+ * perf code: 0x02
+ * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,
+ * SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX,
+ * TGL,TNT,RKL,ADL,RPL,SPR,MTL,SRF,
+ * GRR,ARL,LNL,PTL
+ * Scope: Core
+ * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter
+ * perf code: 0x03
+ * Available model: SNB,IVB,HSW,BDW,SKL,CNL,KBL,CML,
+ * ICL,TGL,RKL,ADL,RPL,MTL,ARL,LNL,
+ * PTL
+ * Scope: Core
+ * MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter.
+ * perf code: 0x00
+ * Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL,
+ * KBL,CML,ICL,ICX,TGL,TNT,RKL,ADL,
+ * RPL,SPR,MTL,ARL,LNL,SRF,PTL
+ * Scope: Package (physical package)
+ * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter.
+ * perf code: 0x01
+ * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,KNL,
+ * GLM,CNL,KBL,CML,ICL,TGL,TNT,RKL,
+ * ADL,RPL,MTL,ARL
+ * Scope: Package (physical package)
+ * MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter.
+ * perf code: 0x02
+ * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,
+ * SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX,
+ * TGL,TNT,RKL,ADL,RPL,SPR,MTL,SRF,
+ * ARL,LNL,PTL
+ * Scope: Package (physical package)
+ * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter.
+ * perf code: 0x03
+ * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,CNL,
+ * KBL,CML,ICL,TGL,RKL
+ * Scope: Package (physical package)
+ * MSR_PKG_C8_RESIDENCY: Package C8 Residency Counter.
+ * perf code: 0x04
+ * Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL,
+ * ADL,RPL,MTL,ARL
+ * Scope: Package (physical package)
+ * MSR_PKG_C9_RESIDENCY: Package C9 Residency Counter.
+ * perf code: 0x05
+ * Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL
+ * Scope: Package (physical package)
+ * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter.
+ * perf code: 0x06
+ * Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL,
+ * TNT,RKL,ADL,RPL,MTL,ARL,LNL,PTL
+ * Scope: Package (physical package)
+ * MSR_MODULE_C6_RES_MS: Module C6 Residency Counter.
+ * perf code: 0x00
+ * Available model: SRF,GRR
+ * Scope: A cluster of cores sharing an L2 cache
+ *
+ */
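+
+/*
+ * Usage example (illustrative, not part of the original header): once
+ * registered, the detected events can be counted system-wide, e.g.
+ *
+ *   perf stat -a -e cstate_core/c6-residency/,cstate_pkg/c2-residency/ sleep 1
+ *
+ * Which events actually exist depends on the model table and on which
+ * MSRs the probe below finds readable.
+ */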
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/perf_event.h>
+#include <linux/nospec.h>
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+#include <asm/msr.h>
+#include "../perf_event.h"
+#include "../probe.h"
+
+MODULE_DESCRIPTION("Support for Intel cstate performance events");
+MODULE_LICENSE("GPL");
+
+#define DEFINE_CSTATE_FORMAT_ATTR(_var, _name, _format) \
+static ssize_t __cstate_##_var##_show(struct device *dev, \
+ struct device_attribute *attr, \
+ char *page) \
+{ \
+ BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \
+ return sprintf(page, _format "\n"); \
+} \
+static struct device_attribute format_attr_##_var = \
+ __ATTR(_name, 0444, __cstate_##_var##_show, NULL)
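+
+/*
+ * For example, the DEFINE_CSTATE_FORMAT_ATTR(cstate_event, event,
+ * "config:0-63") instance further down creates a read-only sysfs
+ * format attribute named "event" whose show() method prints
+ * "config:0-63".
+ */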
+
+/* Model -> events mapping */
+struct cstate_model {
+ unsigned long core_events;
+ unsigned long pkg_events;
+ unsigned long module_events;
+ unsigned long quirks;
+};
+
+/* Quirk flags */
+#define SLM_PKG_C6_USE_C7_MSR (1UL << 0)
+#define KNL_CORE_C6_MSR (1UL << 1)
+
+/* cstate_core PMU */
+static struct pmu cstate_core_pmu;
+static bool has_cstate_core;
+
+enum perf_cstate_core_events {
+ PERF_CSTATE_CORE_C1_RES = 0,
+ PERF_CSTATE_CORE_C3_RES,
+ PERF_CSTATE_CORE_C6_RES,
+ PERF_CSTATE_CORE_C7_RES,
+
+ PERF_CSTATE_CORE_EVENT_MAX,
+};
+
+PMU_EVENT_ATTR_STRING(c1-residency, attr_cstate_core_c1, "event=0x00");
+PMU_EVENT_ATTR_STRING(c3-residency, attr_cstate_core_c3, "event=0x01");
+PMU_EVENT_ATTR_STRING(c6-residency, attr_cstate_core_c6, "event=0x02");
+PMU_EVENT_ATTR_STRING(c7-residency, attr_cstate_core_c7, "event=0x03");
+
+static unsigned long core_msr_mask;
+
+PMU_EVENT_GROUP(events, cstate_core_c1);
+PMU_EVENT_GROUP(events, cstate_core_c3);
+PMU_EVENT_GROUP(events, cstate_core_c6);
+PMU_EVENT_GROUP(events, cstate_core_c7);
+
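+/*
+ * Probe callback for perf_msr_probe(): keep a counter only if its bit
+ * is set in the cstate_model event mask passed in via @data.
+ */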
+static bool test_msr(int idx, void *data)
+{
+ return test_bit(idx, (unsigned long *) data);
+}
+
+static struct perf_msr core_msr[] = {
+ [PERF_CSTATE_CORE_C1_RES] = { MSR_CORE_C1_RES, &group_cstate_core_c1, test_msr },
+ [PERF_CSTATE_CORE_C3_RES] = { MSR_CORE_C3_RESIDENCY, &group_cstate_core_c3, test_msr },
+ [PERF_CSTATE_CORE_C6_RES] = { MSR_CORE_C6_RESIDENCY, &group_cstate_core_c6, test_msr },
+ [PERF_CSTATE_CORE_C7_RES] = { MSR_CORE_C7_RESIDENCY, &group_cstate_core_c7, test_msr },
+};
+
+static struct attribute *attrs_empty[] = {
+ NULL,
+};
+
+/*
+ * There are no default events, but we need to create
+ * "events" group (with empty attrs) before updating
+ * it with detected events.
+ */
+static struct attribute_group cstate_events_attr_group = {
+ .name = "events",
+ .attrs = attrs_empty,
+};
+
+DEFINE_CSTATE_FORMAT_ATTR(cstate_event, event, "config:0-63");
+static struct attribute *cstate_format_attrs[] = {
+ &format_attr_cstate_event.attr,
+ NULL,
+};
+
+static struct attribute_group cstate_format_attr_group = {
+ .name = "format",
+ .attrs = cstate_format_attrs,
+};
+
+static const struct attribute_group *cstate_attr_groups[] = {
+ &cstate_events_attr_group,
+ &cstate_format_attr_group,
+ NULL,
+};
+
+/* cstate_pkg PMU */
+static struct pmu cstate_pkg_pmu;
+static bool has_cstate_pkg;
+
+enum perf_cstate_pkg_events {
+ PERF_CSTATE_PKG_C2_RES = 0,
+ PERF_CSTATE_PKG_C3_RES,
+ PERF_CSTATE_PKG_C6_RES,
+ PERF_CSTATE_PKG_C7_RES,
+ PERF_CSTATE_PKG_C8_RES,
+ PERF_CSTATE_PKG_C9_RES,
+ PERF_CSTATE_PKG_C10_RES,
+
+ PERF_CSTATE_PKG_EVENT_MAX,
+};
+
+PMU_EVENT_ATTR_STRING(c2-residency, attr_cstate_pkg_c2, "event=0x00");
+PMU_EVENT_ATTR_STRING(c3-residency, attr_cstate_pkg_c3, "event=0x01");
+PMU_EVENT_ATTR_STRING(c6-residency, attr_cstate_pkg_c6, "event=0x02");
+PMU_EVENT_ATTR_STRING(c7-residency, attr_cstate_pkg_c7, "event=0x03");
+PMU_EVENT_ATTR_STRING(c8-residency, attr_cstate_pkg_c8, "event=0x04");
+PMU_EVENT_ATTR_STRING(c9-residency, attr_cstate_pkg_c9, "event=0x05");
+PMU_EVENT_ATTR_STRING(c10-residency, attr_cstate_pkg_c10, "event=0x06");
+
+static unsigned long pkg_msr_mask;
+
+PMU_EVENT_GROUP(events, cstate_pkg_c2);
+PMU_EVENT_GROUP(events, cstate_pkg_c3);
+PMU_EVENT_GROUP(events, cstate_pkg_c6);
+PMU_EVENT_GROUP(events, cstate_pkg_c7);
+PMU_EVENT_GROUP(events, cstate_pkg_c8);
+PMU_EVENT_GROUP(events, cstate_pkg_c9);
+PMU_EVENT_GROUP(events, cstate_pkg_c10);
+
+static struct perf_msr pkg_msr[] = {
+ [PERF_CSTATE_PKG_C2_RES] = { MSR_PKG_C2_RESIDENCY, &group_cstate_pkg_c2, test_msr },
+ [PERF_CSTATE_PKG_C3_RES] = { MSR_PKG_C3_RESIDENCY, &group_cstate_pkg_c3, test_msr },
+ [PERF_CSTATE_PKG_C6_RES] = { MSR_PKG_C6_RESIDENCY, &group_cstate_pkg_c6, test_msr },
+ [PERF_CSTATE_PKG_C7_RES] = { MSR_PKG_C7_RESIDENCY, &group_cstate_pkg_c7, test_msr },
+ [PERF_CSTATE_PKG_C8_RES] = { MSR_PKG_C8_RESIDENCY, &group_cstate_pkg_c8, test_msr },
+ [PERF_CSTATE_PKG_C9_RES] = { MSR_PKG_C9_RESIDENCY, &group_cstate_pkg_c9, test_msr },
+ [PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY, &group_cstate_pkg_c10, test_msr },
+};
+
+/* cstate_module PMU */
+static struct pmu cstate_module_pmu;
+static bool has_cstate_module;
+
+enum perf_cstate_module_events {
+ PERF_CSTATE_MODULE_C6_RES = 0,
+
+ PERF_CSTATE_MODULE_EVENT_MAX,
+};
+
+PMU_EVENT_ATTR_STRING(c6-residency, attr_cstate_module_c6, "event=0x00");
+
+static unsigned long module_msr_mask;
+
+PMU_EVENT_GROUP(events, cstate_module_c6);
+
+static struct perf_msr module_msr[] = {
+ [PERF_CSTATE_MODULE_C6_RES] = { MSR_MODULE_C6_RES_MS, &group_cstate_module_c6, test_msr },
+};
+
+static int cstate_pmu_event_init(struct perf_event *event)
+{
+ u64 cfg = event->attr.config;
+
+ if (event->attr.type != event->pmu->type)
+ return -ENOENT;
+
+ /* unsupported modes and filters */
+ if (event->attr.sample_period) /* no sampling */
+ return -EINVAL;
+
+ if (event->cpu < 0)
+ return -EINVAL;
+
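+ /*
+ * The array_index_nospec() calls below clamp cfg under speculation,
+ * so a mispredicted bounds check cannot index the MSR tables out of
+ * range (Spectre-v1 hardening).
+ */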
+ if (event->pmu == &cstate_core_pmu) {
+ if (cfg >= PERF_CSTATE_CORE_EVENT_MAX)
+ return -EINVAL;
+ cfg = array_index_nospec((unsigned long)cfg, PERF_CSTATE_CORE_EVENT_MAX);
+ if (!(core_msr_mask & (1 << cfg)))
+ return -EINVAL;
+ event->hw.event_base = core_msr[cfg].msr;
+ } else if (event->pmu == &cstate_pkg_pmu) {
+ if (cfg >= PERF_CSTATE_PKG_EVENT_MAX)
+ return -EINVAL;
+ cfg = array_index_nospec((unsigned long)cfg, PERF_CSTATE_PKG_EVENT_MAX);
+ if (!(pkg_msr_mask & (1 << cfg)))
+ return -EINVAL;
+ event->hw.event_base = pkg_msr[cfg].msr;
+ } else if (event->pmu == &cstate_module_pmu) {
+ if (cfg >= PERF_CSTATE_MODULE_EVENT_MAX)
+ return -EINVAL;
+ cfg = array_index_nospec((unsigned long)cfg, PERF_CSTATE_MODULE_EVENT_MAX);
+ if (!(module_msr_mask & (1 << cfg)))
+ return -EINVAL;
+ event->hw.event_base = module_msr[cfg].msr;
+ } else {
+ return -ENOENT;
+ }
+
+ event->hw.config = cfg;
+ event->hw.idx = -1;
+ return 0;
+}
+
+static inline u64 cstate_pmu_read_counter(struct perf_event *event)
+{
+ u64 val;
+
+ rdmsrq(event->hw.event_base, val);
+ return val;
+}
+
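+/*
+ * Fold the delta since the last read into event->count. The
+ * local64_try_cmpxchg() loop lets concurrent readers race safely
+ * without a lock: whoever updates prev_count accounts the delta.
+ */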
+static void cstate_pmu_event_update(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u64 prev_raw_count, new_raw_count;
+
+ prev_raw_count = local64_read(&hwc->prev_count);
+ do {
+ new_raw_count = cstate_pmu_read_counter(event);
+ } while (!local64_try_cmpxchg(&hwc->prev_count,
+ &prev_raw_count, new_raw_count));
+
+ local64_add(new_raw_count - prev_raw_count, &event->count);
+}
+
+static void cstate_pmu_event_start(struct perf_event *event, int mode)
+{
+ local64_set(&event->hw.prev_count, cstate_pmu_read_counter(event));
+}
+
+static void cstate_pmu_event_stop(struct perf_event *event, int mode)
+{
+ cstate_pmu_event_update(event);
+}
+
+static void cstate_pmu_event_del(struct perf_event *event, int mode)
+{
+ cstate_pmu_event_stop(event, PERF_EF_UPDATE);
+}
+
+static int cstate_pmu_event_add(struct perf_event *event, int mode)
+{
+ if (mode & PERF_EF_START)
+ cstate_pmu_event_start(event, mode);
+
+ return 0;
+}
+
+static const struct attribute_group *core_attr_update[] = {
+ &group_cstate_core_c1,
+ &group_cstate_core_c3,
+ &group_cstate_core_c6,
+ &group_cstate_core_c7,
+ NULL,
+};
+
+static const struct attribute_group *pkg_attr_update[] = {
+ &group_cstate_pkg_c2,
+ &group_cstate_pkg_c3,
+ &group_cstate_pkg_c6,
+ &group_cstate_pkg_c7,
+ &group_cstate_pkg_c8,
+ &group_cstate_pkg_c9,
+ &group_cstate_pkg_c10,
+ NULL,
+};
+
+static const struct attribute_group *module_attr_update[] = {
+ &group_cstate_module_c6,
+ NULL
+};
+
+static struct pmu cstate_core_pmu = {
+ .attr_groups = cstate_attr_groups,
+ .attr_update = core_attr_update,
+ .name = "cstate_core",
+ .task_ctx_nr = perf_invalid_context,
+ .event_init = cstate_pmu_event_init,
+ .add = cstate_pmu_event_add,
+ .del = cstate_pmu_event_del,
+ .start = cstate_pmu_event_start,
+ .stop = cstate_pmu_event_stop,
+ .read = cstate_pmu_event_update,
+ .capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE,
+ .scope = PERF_PMU_SCOPE_CORE,
+ .module = THIS_MODULE,
+};
+
+static struct pmu cstate_pkg_pmu = {
+ .attr_groups = cstate_attr_groups,
+ .attr_update = pkg_attr_update,
+ .name = "cstate_pkg",
+ .task_ctx_nr = perf_invalid_context,
+ .event_init = cstate_pmu_event_init,
+ .add = cstate_pmu_event_add,
+ .del = cstate_pmu_event_del,
+ .start = cstate_pmu_event_start,
+ .stop = cstate_pmu_event_stop,
+ .read = cstate_pmu_event_update,
+ .capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE,
+ .scope = PERF_PMU_SCOPE_PKG,
+ .module = THIS_MODULE,
+};
+
+static struct pmu cstate_module_pmu = {
+ .attr_groups = cstate_attr_groups,
+ .attr_update = module_attr_update,
+ .name = "cstate_module",
+ .task_ctx_nr = perf_invalid_context,
+ .event_init = cstate_pmu_event_init,
+ .add = cstate_pmu_event_add,
+ .del = cstate_pmu_event_del,
+ .start = cstate_pmu_event_start,
+ .stop = cstate_pmu_event_stop,
+ .read = cstate_pmu_event_update,
+ .capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE,
+ .scope = PERF_PMU_SCOPE_CLUSTER,
+ .module = THIS_MODULE,
+};
+
+static const struct cstate_model nhm_cstates __initconst = {
+ .core_events = BIT(PERF_CSTATE_CORE_C3_RES) |
+ BIT(PERF_CSTATE_CORE_C6_RES),
+
+ .pkg_events = BIT(PERF_CSTATE_PKG_C3_RES) |
+ BIT(PERF_CSTATE_PKG_C6_RES) |
+ BIT(PERF_CSTATE_PKG_C7_RES),
+};
+
+static const struct cstate_model snb_cstates __initconst = {
+ .core_events = BIT(PERF_CSTATE_CORE_C3_RES) |
+ BIT(PERF_CSTATE_CORE_C6_RES) |
+ BIT(PERF_CSTATE_CORE_C7_RES),
+
+ .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) |
+ BIT(PERF_CSTATE_PKG_C3_RES) |
+ BIT(PERF_CSTATE_PKG_C6_RES) |
+ BIT(PERF_CSTATE_PKG_C7_RES),
+};
+
+static const struct cstate_model hswult_cstates __initconst = {
+ .core_events = BIT(PERF_CSTATE_CORE_C3_RES) |
+ BIT(PERF_CSTATE_CORE_C6_RES) |
+ BIT(PERF_CSTATE_CORE_C7_RES),
+
+ .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) |
+ BIT(PERF_CSTATE_PKG_C3_RES) |
+ BIT(PERF_CSTATE_PKG_C6_RES) |
+ BIT(PERF_CSTATE_PKG_C7_RES) |
+ BIT(PERF_CSTATE_PKG_C8_RES) |
+ BIT(PERF_CSTATE_PKG_C9_RES) |
+ BIT(PERF_CSTATE_PKG_C10_RES),
+};
+
+static const struct cstate_model cnl_cstates __initconst = {
+ .core_events = BIT(PERF_CSTATE_CORE_C1_RES) |
+ BIT(PERF_CSTATE_CORE_C3_RES) |
+ BIT(PERF_CSTATE_CORE_C6_RES) |
+ BIT(PERF_CSTATE_CORE_C7_RES),
+
+ .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) |
+ BIT(PERF_CSTATE_PKG_C3_RES) |
+ BIT(PERF_CSTATE_PKG_C6_RES) |
+ BIT(PERF_CSTATE_PKG_C7_RES) |
+ BIT(PERF_CSTATE_PKG_C8_RES) |
+ BIT(PERF_CSTATE_PKG_C9_RES) |
+ BIT(PERF_CSTATE_PKG_C10_RES),
+};
+
+static const struct cstate_model icl_cstates __initconst = {
+ .core_events = BIT(PERF_CSTATE_CORE_C6_RES) |
+ BIT(PERF_CSTATE_CORE_C7_RES),
+
+ .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) |
+ BIT(PERF_CSTATE_PKG_C3_RES) |
+ BIT(PERF_CSTATE_PKG_C6_RES) |
+ BIT(PERF_CSTATE_PKG_C7_RES) |
+ BIT(PERF_CSTATE_PKG_C8_RES) |
+ BIT(PERF_CSTATE_PKG_C9_RES) |
+ BIT(PERF_CSTATE_PKG_C10_RES),
+};
+
+static const struct cstate_model icx_cstates __initconst = {
+ .core_events = BIT(PERF_CSTATE_CORE_C1_RES) |
+ BIT(PERF_CSTATE_CORE_C6_RES),
+
+ .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) |
+ BIT(PERF_CSTATE_PKG_C6_RES),
+};
+
+static const struct cstate_model adl_cstates __initconst = {
+ .core_events = BIT(PERF_CSTATE_CORE_C1_RES) |
+ BIT(PERF_CSTATE_CORE_C6_RES) |
+ BIT(PERF_CSTATE_CORE_C7_RES),
+
+ .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) |
+ BIT(PERF_CSTATE_PKG_C3_RES) |
+ BIT(PERF_CSTATE_PKG_C6_RES) |
+ BIT(PERF_CSTATE_PKG_C8_RES) |
+ BIT(PERF_CSTATE_PKG_C10_RES),
+};
+
+static const struct cstate_model lnl_cstates __initconst = {
+ .core_events = BIT(PERF_CSTATE_CORE_C1_RES) |
+ BIT(PERF_CSTATE_CORE_C6_RES) |
+ BIT(PERF_CSTATE_CORE_C7_RES),
+
+ .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) |
+ BIT(PERF_CSTATE_PKG_C6_RES) |
+ BIT(PERF_CSTATE_PKG_C10_RES),
+};
+
+static const struct cstate_model slm_cstates __initconst = {
+ .core_events = BIT(PERF_CSTATE_CORE_C1_RES) |
+ BIT(PERF_CSTATE_CORE_C6_RES),
+
+ .pkg_events = BIT(PERF_CSTATE_PKG_C6_RES),
+ .quirks = SLM_PKG_C6_USE_C7_MSR,
+};
+
+static const struct cstate_model knl_cstates __initconst = {
+ .core_events = BIT(PERF_CSTATE_CORE_C6_RES),
+
+ .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) |
+ BIT(PERF_CSTATE_PKG_C3_RES) |
+ BIT(PERF_CSTATE_PKG_C6_RES),
+ .quirks = KNL_CORE_C6_MSR,
+};
+
+static const struct cstate_model glm_cstates __initconst = {
+ .core_events = BIT(PERF_CSTATE_CORE_C1_RES) |
+ BIT(PERF_CSTATE_CORE_C3_RES) |
+ BIT(PERF_CSTATE_CORE_C6_RES),
+
+ .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) |
+ BIT(PERF_CSTATE_PKG_C3_RES) |
+ BIT(PERF_CSTATE_PKG_C6_RES) |
+ BIT(PERF_CSTATE_PKG_C10_RES),
+};
+
+static const struct cstate_model grr_cstates __initconst = {
+ .core_events = BIT(PERF_CSTATE_CORE_C1_RES) |
+ BIT(PERF_CSTATE_CORE_C6_RES),
+
+ .module_events = BIT(PERF_CSTATE_MODULE_C6_RES),
+};
+
+static const struct cstate_model srf_cstates __initconst = {
+ .core_events = BIT(PERF_CSTATE_CORE_C1_RES) |
+ BIT(PERF_CSTATE_CORE_C6_RES),
+
+ .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) |
+ BIT(PERF_CSTATE_PKG_C6_RES),
+
+ .module_events = BIT(PERF_CSTATE_MODULE_C6_RES),
+};
+
+static const struct x86_cpu_id intel_cstates_match[] __initconst = {
+ X86_MATCH_VFM(INTEL_NEHALEM, &nhm_cstates),
+ X86_MATCH_VFM(INTEL_NEHALEM_EP, &nhm_cstates),
+ X86_MATCH_VFM(INTEL_NEHALEM_EX, &nhm_cstates),
+
+ X86_MATCH_VFM(INTEL_WESTMERE, &nhm_cstates),
+ X86_MATCH_VFM(INTEL_WESTMERE_EP, &nhm_cstates),
+ X86_MATCH_VFM(INTEL_WESTMERE_EX, &nhm_cstates),
+
+ X86_MATCH_VFM(INTEL_SANDYBRIDGE, &snb_cstates),
+ X86_MATCH_VFM(INTEL_SANDYBRIDGE_X, &snb_cstates),
+
+ X86_MATCH_VFM(INTEL_IVYBRIDGE, &snb_cstates),
+ X86_MATCH_VFM(INTEL_IVYBRIDGE_X, &snb_cstates),
+
+ X86_MATCH_VFM(INTEL_HASWELL, &snb_cstates),
+ X86_MATCH_VFM(INTEL_HASWELL_X, &snb_cstates),
+ X86_MATCH_VFM(INTEL_HASWELL_G, &snb_cstates),
+
+ X86_MATCH_VFM(INTEL_HASWELL_L, &hswult_cstates),
+
+ X86_MATCH_VFM(INTEL_ATOM_SILVERMONT, &slm_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_D, &slm_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_AIRMONT, &slm_cstates),
+
+ X86_MATCH_VFM(INTEL_BROADWELL, &snb_cstates),
+ X86_MATCH_VFM(INTEL_BROADWELL_D, &snb_cstates),
+ X86_MATCH_VFM(INTEL_BROADWELL_G, &snb_cstates),
+ X86_MATCH_VFM(INTEL_BROADWELL_X, &snb_cstates),
+
+ X86_MATCH_VFM(INTEL_SKYLAKE_L, &snb_cstates),
+ X86_MATCH_VFM(INTEL_SKYLAKE, &snb_cstates),
+ X86_MATCH_VFM(INTEL_SKYLAKE_X, &snb_cstates),
+
+ X86_MATCH_VFM(INTEL_KABYLAKE_L, &hswult_cstates),
+ X86_MATCH_VFM(INTEL_KABYLAKE, &hswult_cstates),
+ X86_MATCH_VFM(INTEL_COMETLAKE_L, &hswult_cstates),
+ X86_MATCH_VFM(INTEL_COMETLAKE, &hswult_cstates),
+
+ X86_MATCH_VFM(INTEL_CANNONLAKE_L, &cnl_cstates),
+
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNL, &knl_cstates),
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNM, &knl_cstates),
+
+ X86_MATCH_VFM(INTEL_ATOM_GOLDMONT, &glm_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_D, &glm_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_PLUS, &glm_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_TREMONT_D, &glm_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_TREMONT, &glm_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_TREMONT_L, &glm_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, &adl_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, &srf_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_CRESTMONT, &grr_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_DARKMONT_X, &srf_cstates),
+
+ X86_MATCH_VFM(INTEL_ICELAKE_L, &icl_cstates),
+ X86_MATCH_VFM(INTEL_ICELAKE, &icl_cstates),
+ X86_MATCH_VFM(INTEL_ICELAKE_X, &icx_cstates),
+ X86_MATCH_VFM(INTEL_ICELAKE_D, &icx_cstates),
+ X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &icx_cstates),
+ X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &icx_cstates),
+ X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, &icx_cstates),
+ X86_MATCH_VFM(INTEL_GRANITERAPIDS_D, &icx_cstates),
+
+ X86_MATCH_VFM(INTEL_TIGERLAKE_L, &icl_cstates),
+ X86_MATCH_VFM(INTEL_TIGERLAKE, &icl_cstates),
+ X86_MATCH_VFM(INTEL_ROCKETLAKE, &icl_cstates),
+ X86_MATCH_VFM(INTEL_ALDERLAKE, &adl_cstates),
+ X86_MATCH_VFM(INTEL_ALDERLAKE_L, &adl_cstates),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE, &adl_cstates),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_P, &adl_cstates),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_S, &adl_cstates),
+ X86_MATCH_VFM(INTEL_METEORLAKE, &adl_cstates),
+ X86_MATCH_VFM(INTEL_METEORLAKE_L, &adl_cstates),
+ X86_MATCH_VFM(INTEL_ARROWLAKE, &adl_cstates),
+ X86_MATCH_VFM(INTEL_ARROWLAKE_H, &adl_cstates),
+ X86_MATCH_VFM(INTEL_ARROWLAKE_U, &adl_cstates),
+ X86_MATCH_VFM(INTEL_LUNARLAKE_M, &lnl_cstates),
+ X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &lnl_cstates),
+ { },
+};
+MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match);
+
+static int __init cstate_probe(const struct cstate_model *cm)
+{
+ /* SLM has different MSR for PKG C6 */
+ if (cm->quirks & SLM_PKG_C6_USE_C7_MSR)
+ pkg_msr[PERF_CSTATE_PKG_C6_RES].msr = MSR_PKG_C7_RESIDENCY;
+
+ /* KNL has different MSR for CORE C6 */
+ if (cm->quirks & KNL_CORE_C6_MSR)
+ pkg_msr[PERF_CSTATE_CORE_C6_RES].msr = MSR_KNL_CORE_C6_RESIDENCY;
+
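+ /*
+ * perf_msr_probe() (see events/probe.c) returns a bitmask with a bit
+ * set for each requested counter whose MSR is actually readable here,
+ * and makes the matching sysfs event group visible; passing 'true'
+ * also accepts counters that currently read as zero.
+ */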
+ core_msr_mask = perf_msr_probe(core_msr, PERF_CSTATE_CORE_EVENT_MAX,
+ true, (void *) &cm->core_events);
+
+ pkg_msr_mask = perf_msr_probe(pkg_msr, PERF_CSTATE_PKG_EVENT_MAX,
+ true, (void *) &cm->pkg_events);
+
+ module_msr_mask = perf_msr_probe(module_msr, PERF_CSTATE_MODULE_EVENT_MAX,
+ true, (void *) &cm->module_events);
+
+ has_cstate_core = !!core_msr_mask;
+ has_cstate_pkg = !!pkg_msr_mask;
+ has_cstate_module = !!module_msr_mask;
+
+ return (has_cstate_core || has_cstate_pkg || has_cstate_module) ? 0 : -ENODEV;
+}
+
+static inline void cstate_cleanup(void)
+{
+ if (has_cstate_core)
+ perf_pmu_unregister(&cstate_core_pmu);
+
+ if (has_cstate_pkg)
+ perf_pmu_unregister(&cstate_pkg_pmu);
+
+ if (has_cstate_module)
+ perf_pmu_unregister(&cstate_module_pmu);
+}
+
+static int __init cstate_init(void)
+{
+ int err;
+
+ if (has_cstate_core) {
+ err = perf_pmu_register(&cstate_core_pmu, cstate_core_pmu.name, -1);
+ if (err) {
+ has_cstate_core = false;
+ pr_info("Failed to register cstate core pmu\n");
+ cstate_cleanup();
+ return err;
+ }
+ }
+
+ if (has_cstate_pkg) {
+ if (topology_max_dies_per_package() > 1) {
+ /* CLX-AP is multi-die and the cstate is die-scope */
+ cstate_pkg_pmu.scope = PERF_PMU_SCOPE_DIE;
+ err = perf_pmu_register(&cstate_pkg_pmu,
+ "cstate_die", -1);
+ } else {
+ err = perf_pmu_register(&cstate_pkg_pmu,
+ cstate_pkg_pmu.name, -1);
+ }
+ if (err) {
+ has_cstate_pkg = false;
+ pr_info("Failed to register cstate pkg pmu\n");
+ cstate_cleanup();
+ return err;
+ }
+ }
+
+ if (has_cstate_module) {
+ err = perf_pmu_register(&cstate_module_pmu, cstate_module_pmu.name, -1);
+ if (err) {
+ has_cstate_module = false;
+ pr_info("Failed to register cstate cluster pmu\n");
+ cstate_cleanup();
+ return err;
+ }
+ }
+ return 0;
+}
+
+static int __init cstate_pmu_init(void)
+{
+ const struct x86_cpu_id *id;
+ int err;
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return -ENODEV;
+
+ id = x86_match_cpu(intel_cstates_match);
+ if (!id)
+ return -ENODEV;
+
+ err = cstate_probe((const struct cstate_model *) id->driver_data);
+ if (err)
+ return err;
+
+ return cstate_init();
+}
+module_init(cstate_pmu_init);
+
+static void __exit cstate_pmu_exit(void)
+{
+ cstate_cleanup();
+}
+module_exit(cstate_pmu_exit);
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
new file mode 100644
index 000000000000..feb1c3cf63e4
--- /dev/null
+++ b/arch/x86/events/intel/ds.c
@@ -0,0 +1,3189 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/sched/clock.h>
+
+#include <asm/cpu_entry_area.h>
+#include <asm/debugreg.h>
+#include <asm/perf_event.h>
+#include <asm/tlbflush.h>
+#include <asm/insn.h>
+#include <asm/io.h>
+#include <asm/msr.h>
+#include <asm/timer.h>
+
+#include "../perf_event.h"
+
+/* Waste a full page so it can be mapped into the cpu_entry_area */
+DEFINE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store);
+
+/* The size of a BTS record in bytes: */
+#define BTS_RECORD_SIZE 24
+
+#define PEBS_FIXUP_SIZE PAGE_SIZE
+
+/*
+ * pebs_record_32 for p4 and core not supported
+
+struct pebs_record_32 {
+ u32 flags, ip;
+ u32 ax, bx, cx, dx;
+ u32 si, di, bp, sp;
+};
+
+ */
+
+union intel_x86_pebs_dse {
+ u64 val;
+ struct {
+ unsigned int ld_dse:4;
+ unsigned int ld_stlb_miss:1;
+ unsigned int ld_locked:1;
+ unsigned int ld_data_blk:1;
+ unsigned int ld_addr_blk:1;
+ unsigned int ld_reserved:24;
+ };
+ struct {
+ unsigned int st_l1d_hit:1;
+ unsigned int st_reserved1:3;
+ unsigned int st_stlb_miss:1;
+ unsigned int st_locked:1;
+ unsigned int st_reserved2:26;
+ };
+ struct {
+ unsigned int st_lat_dse:4;
+ unsigned int st_lat_stlb_miss:1;
+ unsigned int st_lat_locked:1;
+ unsigned int ld_reserved3:26;
+ };
+ struct {
+ unsigned int mtl_dse:5;
+ unsigned int mtl_locked:1;
+ unsigned int mtl_stlb_miss:1;
+ unsigned int mtl_fwd_blk:1;
+ unsigned int ld_reserved4:24;
+ };
+ struct {
+ unsigned int lnc_dse:8;
+ unsigned int ld_reserved5:2;
+ unsigned int lnc_stlb_miss:1;
+ unsigned int lnc_locked:1;
+ unsigned int lnc_data_blk:1;
+ unsigned int lnc_addr_blk:1;
+ unsigned int ld_reserved6:18;
+ };
+};
+
+
+/*
+ * Map PEBS Load Latency Data Source encodings to generic
+ * memory data source information
+ */
+#define P(a, b) PERF_MEM_S(a, b)
+#define OP_LH (P(OP, LOAD) | P(LVL, HIT))
+#define LEVEL(x) P(LVLNUM, x)
+#define REM P(REMOTE, REMOTE)
+#define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS))
+
+/* Version for Sandy Bridge and later */
+static u64 pebs_data_source[PERF_PEBS_DATA_SOURCE_MAX] = {
+ P(OP, LOAD) | P(LVL, MISS) | LEVEL(L3) | P(SNOOP, NA),/* 0x00:ukn L3 */
+ OP_LH | P(LVL, L1) | LEVEL(L1) | P(SNOOP, NONE), /* 0x01: L1 local */
+ OP_LH | P(LVL, LFB) | LEVEL(LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */
+ OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, NONE), /* 0x03: L2 hit */
+ OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, NONE), /* 0x04: L3 hit */
+ OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, MISS), /* 0x05: L3 hit, snoop miss */
+ OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT), /* 0x06: L3 hit, snoop hit */
+ OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM), /* 0x07: L3 hit, snoop hitm */
+ OP_LH | P(LVL, REM_CCE1) | REM | LEVEL(L3) | P(SNOOP, HIT), /* 0x08: L3 miss snoop hit */
+ OP_LH | P(LVL, REM_CCE1) | REM | LEVEL(L3) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/
+ OP_LH | P(LVL, LOC_RAM) | LEVEL(RAM) | P(SNOOP, HIT), /* 0x0a: L3 miss, shared */
+ OP_LH | P(LVL, REM_RAM1) | REM | LEVEL(L3) | P(SNOOP, HIT), /* 0x0b: L3 miss, shared */
+ OP_LH | P(LVL, LOC_RAM) | LEVEL(RAM) | SNOOP_NONE_MISS, /* 0x0c: L3 miss, excl */
+ OP_LH | P(LVL, REM_RAM1) | LEVEL(RAM) | REM | SNOOP_NONE_MISS, /* 0x0d: L3 miss, excl */
+ OP_LH | P(LVL, IO) | LEVEL(NA) | P(SNOOP, NONE), /* 0x0e: I/O */
+ OP_LH | P(LVL, UNC) | LEVEL(NA) | P(SNOOP, NONE), /* 0x0f: uncached */
+};
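+
+/*
+ * The low bits of the PEBS data-source field index this table; the
+ * model-specific init functions below patch individual entries rather
+ * than redefining the whole table.
+ */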
+
+/* Patch up minor differences in the bits */
+void __init intel_pmu_pebs_data_source_nhm(void)
+{
+ pebs_data_source[0x05] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT);
+ pebs_data_source[0x06] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM);
+ pebs_data_source[0x07] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM);
+}
+
+static void __init __intel_pmu_pebs_data_source_skl(bool pmem, u64 *data_source)
+{
+ u64 pmem_or_l4 = pmem ? LEVEL(PMEM) : LEVEL(L4);
+
+ data_source[0x08] = OP_LH | pmem_or_l4 | P(SNOOP, HIT);
+ data_source[0x09] = OP_LH | pmem_or_l4 | REM | P(SNOOP, HIT);
+ data_source[0x0b] = OP_LH | LEVEL(RAM) | REM | P(SNOOP, NONE);
+ data_source[0x0c] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOPX, FWD);
+ data_source[0x0d] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOP, HITM);
+}
+
+void __init intel_pmu_pebs_data_source_skl(bool pmem)
+{
+ __intel_pmu_pebs_data_source_skl(pmem, pebs_data_source);
+}
+
+static void __init __intel_pmu_pebs_data_source_grt(u64 *data_source)
+{
+ data_source[0x05] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT);
+ data_source[0x06] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM);
+ data_source[0x08] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOPX, FWD);
+}
+
+void __init intel_pmu_pebs_data_source_grt(void)
+{
+ __intel_pmu_pebs_data_source_grt(pebs_data_source);
+}
+
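+/*
+ * Hybrid parts keep a per-PMU copy of the data-source table: start
+ * from the common table above, then apply the core-type specific
+ * fixups.
+ */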
+void __init intel_pmu_pebs_data_source_adl(void)
+{
+ u64 *data_source;
+
+ data_source = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX].pebs_data_source;
+ memcpy(data_source, pebs_data_source, sizeof(pebs_data_source));
+ __intel_pmu_pebs_data_source_skl(false, data_source);
+
+ data_source = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX].pebs_data_source;
+ memcpy(data_source, pebs_data_source, sizeof(pebs_data_source));
+ __intel_pmu_pebs_data_source_grt(data_source);
+}
+
+static void __init __intel_pmu_pebs_data_source_cmt(u64 *data_source)
+{
+ data_source[0x07] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOPX, FWD);
+ data_source[0x08] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM);
+ data_source[0x0a] = OP_LH | P(LVL, LOC_RAM) | LEVEL(RAM) | P(SNOOP, NONE);
+ data_source[0x0b] = OP_LH | LEVEL(RAM) | REM | P(SNOOP, NONE);
+ data_source[0x0c] = OP_LH | LEVEL(RAM) | REM | P(SNOOPX, FWD);
+ data_source[0x0d] = OP_LH | LEVEL(RAM) | REM | P(SNOOP, HITM);
+}
+
+void __init intel_pmu_pebs_data_source_mtl(void)
+{
+ u64 *data_source;
+
+ data_source = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX].pebs_data_source;
+ memcpy(data_source, pebs_data_source, sizeof(pebs_data_source));
+ __intel_pmu_pebs_data_source_skl(false, data_source);
+
+ data_source = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX].pebs_data_source;
+ memcpy(data_source, pebs_data_source, sizeof(pebs_data_source));
+ __intel_pmu_pebs_data_source_cmt(data_source);
+}
+
+void __init intel_pmu_pebs_data_source_arl_h(void)
+{
+ u64 *data_source;
+
+ intel_pmu_pebs_data_source_lnl();
+
+ data_source = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_TINY_IDX].pebs_data_source;
+ memcpy(data_source, pebs_data_source, sizeof(pebs_data_source));
+ __intel_pmu_pebs_data_source_cmt(data_source);
+}
+
+void __init intel_pmu_pebs_data_source_cmt(void)
+{
+ __intel_pmu_pebs_data_source_cmt(pebs_data_source);
+}
+
+/* Version for Lion Cove and later */
+static u64 lnc_pebs_data_source[PERF_PEBS_DATA_SOURCE_MAX] = {
+ P(OP, LOAD) | P(LVL, MISS) | LEVEL(L3) | P(SNOOP, NA), /* 0x00: ukn L3 */
+ OP_LH | P(LVL, L1) | LEVEL(L1) | P(SNOOP, NONE), /* 0x01: L1 hit */
+ OP_LH | P(LVL, L1) | LEVEL(L1) | P(SNOOP, NONE), /* 0x02: L1 hit */
+ OP_LH | P(LVL, LFB) | LEVEL(LFB) | P(SNOOP, NONE), /* 0x03: LFB/L1 Miss Handling Buffer hit */
+ 0, /* 0x04: Reserved */
+ OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, NONE), /* 0x05: L2 Hit */
+ OP_LH | LEVEL(L2_MHB) | P(SNOOP, NONE), /* 0x06: L2 Miss Handling Buffer Hit */
+ 0, /* 0x07: Reserved */
+ OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, NONE), /* 0x08: L3 Hit */
+ 0, /* 0x09: Reserved */
+ 0, /* 0x0a: Reserved */
+ 0, /* 0x0b: Reserved */
+ OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOPX, FWD), /* 0x0c: L3 Hit Snoop Fwd */
+ OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM), /* 0x0d: L3 Hit Snoop HitM */
+ 0, /* 0x0e: Reserved */
+ P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM), /* 0x0f: L3 Miss Snoop HitM */
+ OP_LH | LEVEL(MSC) | P(SNOOP, NONE), /* 0x10: Memory-side Cache Hit */
+ OP_LH | P(LVL, LOC_RAM) | LEVEL(RAM) | P(SNOOP, NONE), /* 0x11: Local Memory Hit */
+};
+
+void __init intel_pmu_pebs_data_source_lnl(void)
+{
+ u64 *data_source;
+
+ data_source = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX].pebs_data_source;
+ memcpy(data_source, lnc_pebs_data_source, sizeof(lnc_pebs_data_source));
+
+ data_source = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX].pebs_data_source;
+ memcpy(data_source, pebs_data_source, sizeof(pebs_data_source));
+ __intel_pmu_pebs_data_source_cmt(data_source);
+}
+
+static u64 precise_store_data(u64 status)
+{
+ union intel_x86_pebs_dse dse;
+ u64 val = P(OP, STORE) | P(SNOOP, NA) | P(LVL, L1) | P(TLB, L2);
+
+ dse.val = status;
+
+ /*
+ * bit 4: TLB access
+ * 1 = store missed the 2nd level TLB
+ *
+ * so it either hit the page walker or the OS;
+ * otherwise it hit the 2nd level TLB
+ */
+ if (dse.st_stlb_miss)
+ val |= P(TLB, MISS);
+ else
+ val |= P(TLB, HIT);
+
+ /*
+ * bit 0: hit L1 data cache
+ * if not set, then all we know is that
+ * it missed L1D
+ */
+ if (dse.st_l1d_hit)
+ val |= P(LVL, HIT);
+ else
+ val |= P(LVL, MISS);
+
+ /*
+ * bit 5: Locked prefix
+ */
+ if (dse.st_locked)
+ val |= P(LOCK, LOCKED);
+
+ return val;
+}
+
+static u64 precise_datala_hsw(struct perf_event *event, u64 status)
+{
+ union perf_mem_data_src dse;
+
+ dse.val = PERF_MEM_NA;
+
+ if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW)
+ dse.mem_op = PERF_MEM_OP_STORE;
+ else if (event->hw.flags & PERF_X86_EVENT_PEBS_LD_HSW)
+ dse.mem_op = PERF_MEM_OP_LOAD;
+
+ /*
+ * L1 info only valid for following events:
+ *
+ * MEM_UOPS_RETIRED.STLB_MISS_STORES
+ * MEM_UOPS_RETIRED.LOCK_STORES
+ * MEM_UOPS_RETIRED.SPLIT_STORES
+ * MEM_UOPS_RETIRED.ALL_STORES
+ */
+ if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) {
+ if (status & 1)
+ dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
+ else
+ dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS;
+ }
+ return dse.val;
+}
+
+static inline void pebs_set_tlb_lock(u64 *val, bool tlb, bool lock)
+{
+ /*
+ * TLB access
+ * 0 = did not miss 2nd level TLB
+ * 1 = missed 2nd level TLB
+ */
+ if (tlb)
+ *val |= P(TLB, MISS) | P(TLB, L2);
+ else
+ *val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2);
+
+ /* locked prefix */
+ if (lock)
+ *val |= P(LOCK, LOCKED);
+}
+
+/* Retrieve the latency data for the e-core of ADL */
+static u64 __grt_latency_data(struct perf_event *event, u64 status,
+ u8 dse, bool tlb, bool lock, bool blk)
+{
+ u64 val;
+
+ WARN_ON_ONCE(is_hybrid() &&
+ hybrid_pmu(event->pmu)->pmu_type == hybrid_big);
+
+ dse &= PERF_PEBS_DATA_SOURCE_GRT_MASK;
+ val = hybrid_var(event->pmu, pebs_data_source)[dse];
+
+ pebs_set_tlb_lock(&val, tlb, lock);
+
+ if (blk)
+ val |= P(BLK, DATA);
+ else
+ val |= P(BLK, NA);
+
+ return val;
+}
+
+u64 grt_latency_data(struct perf_event *event, u64 status)
+{
+ union intel_x86_pebs_dse dse;
+
+ dse.val = status;
+
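+ /*
+ * Note the argument order: in the ADL e-core encoding, bit 4 is the
+ * lock bit and bit 5 is the TLB-access bit (swapped relative to the
+ * big-core layout), so ld_locked feeds the tlb parameter and
+ * ld_stlb_miss feeds the lock parameter of __grt_latency_data().
+ */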
+ return __grt_latency_data(event, status, dse.ld_dse,
+ dse.ld_locked, dse.ld_stlb_miss,
+ dse.ld_data_blk);
+}
+
+/* Retrieve the latency data for the e-core of MTL */
+u64 cmt_latency_data(struct perf_event *event, u64 status)
+{
+ union intel_x86_pebs_dse dse;
+
+ dse.val = status;
+
+ return __grt_latency_data(event, status, dse.mtl_dse,
+ dse.mtl_stlb_miss, dse.mtl_locked,
+ dse.mtl_fwd_blk);
+}
+
+static u64 lnc_latency_data(struct perf_event *event, u64 status)
+{
+ union intel_x86_pebs_dse dse;
+ union perf_mem_data_src src;
+ u64 val;
+
+ dse.val = status;
+
+ /* LNC core latency data */
+ val = hybrid_var(event->pmu, pebs_data_source)[status & PERF_PEBS_DATA_SOURCE_MASK];
+ if (!val)
+ val = P(OP, LOAD) | LEVEL(NA) | P(SNOOP, NA);
+
+ if (dse.lnc_stlb_miss)
+ val |= P(TLB, MISS) | P(TLB, L2);
+ else
+ val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2);
+
+ if (dse.lnc_locked)
+ val |= P(LOCK, LOCKED);
+
+ if (dse.lnc_data_blk)
+ val |= P(BLK, DATA);
+ if (dse.lnc_addr_blk)
+ val |= P(BLK, ADDR);
+ if (!dse.lnc_data_blk && !dse.lnc_addr_blk)
+ val |= P(BLK, NA);
+
+ src.val = val;
+ if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW)
+ src.mem_op = P(OP, STORE);
+
+ return src.val;
+}
+
+u64 lnl_latency_data(struct perf_event *event, u64 status)
+{
+ struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
+
+ if (pmu->pmu_type == hybrid_small)
+ return cmt_latency_data(event, status);
+
+ return lnc_latency_data(event, status);
+}
+
+u64 arl_h_latency_data(struct perf_event *event, u64 status)
+{
+ struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
+
+ if (pmu->pmu_type == hybrid_tiny)
+ return cmt_latency_data(event, status);
+
+ return lnl_latency_data(event, status);
+}
+
+static u64 load_latency_data(struct perf_event *event, u64 status)
+{
+ union intel_x86_pebs_dse dse;
+ u64 val;
+
+ dse.val = status;
+
+ /*
+ * use the mapping table for bits 0-3
+ */
+ val = hybrid_var(event->pmu, pebs_data_source)[dse.ld_dse];
+
+ /*
+ * Nehalem models do not provide TLB or lock information
+ */
+ if (x86_pmu.pebs_no_tlb) {
+ val |= P(TLB, NA) | P(LOCK, NA);
+ return val;
+ }
+
+ pebs_set_tlb_lock(&val, dse.ld_stlb_miss, dse.ld_locked);
+
+ /*
+ * Ice Lake and earlier models do not provide block information.
+ */
+ if (!x86_pmu.pebs_block) {
+ val |= P(BLK, NA);
+ return val;
+ }
+ /*
+ * bit 6: load was blocked since its data could not be forwarded
+ * from a preceding store
+ */
+ if (dse.ld_data_blk)
+ val |= P(BLK, DATA);
+
+ /*
+ * bit 7: load was blocked due to potential address conflict with
+ * a preceding store
+ */
+ if (dse.ld_addr_blk)
+ val |= P(BLK, ADDR);
+
+ if (!dse.ld_data_blk && !dse.ld_addr_blk)
+ val |= P(BLK, NA);
+
+ return val;
+}
+
+static u64 store_latency_data(struct perf_event *event, u64 status)
+{
+ union intel_x86_pebs_dse dse;
+ union perf_mem_data_src src;
+ u64 val;
+
+ dse.val = status;
+
+ /*
+ * use the mapping table for bits 0-3
+ */
+ val = hybrid_var(event->pmu, pebs_data_source)[dse.st_lat_dse];
+
+ pebs_set_tlb_lock(&val, dse.st_lat_stlb_miss, dse.st_lat_locked);
+
+ val |= P(BLK, NA);
+
+ /*
+ * the pebs_data_source table is only for loads
+ * so override the mem_op to say STORE instead
+ */
+ src.val = val;
+ src.mem_op = P(OP, STORE);
+
+ return src.val;
+}
+
+struct pebs_record_core {
+ u64 flags, ip;
+ u64 ax, bx, cx, dx;
+ u64 si, di, bp, sp;
+ u64 r8, r9, r10, r11;
+ u64 r12, r13, r14, r15;
+};
+
+struct pebs_record_nhm {
+ u64 flags, ip;
+ u64 ax, bx, cx, dx;
+ u64 si, di, bp, sp;
+ u64 r8, r9, r10, r11;
+ u64 r12, r13, r14, r15;
+ u64 status, dla, dse, lat;
+};
+
+/*
+ * Same as pebs_record_nhm, with two additional fields.
+ */
+struct pebs_record_hsw {
+ u64 flags, ip;
+ u64 ax, bx, cx, dx;
+ u64 si, di, bp, sp;
+ u64 r8, r9, r10, r11;
+ u64 r12, r13, r14, r15;
+ u64 status, dla, dse, lat;
+ u64 real_ip, tsx_tuning;
+};
+
+union hsw_tsx_tuning {
+ struct {
+ u32 cycles_last_block : 32,
+ hle_abort : 1,
+ rtm_abort : 1,
+ instruction_abort : 1,
+ non_instruction_abort : 1,
+ retry : 1,
+ data_conflict : 1,
+ capacity_writes : 1,
+ capacity_reads : 1;
+ };
+ u64 value;
+};
+
+#define PEBS_HSW_TSX_FLAGS 0xff00000000ULL
+
+/* Same as HSW, plus TSC */
+struct pebs_record_skl {
+ u64 flags, ip;
+ u64 ax, bx, cx, dx;
+ u64 si, di, bp, sp;
+ u64 r8, r9, r10, r11;
+ u64 r12, r13, r14, r15;
+ u64 status, dla, dse, lat;
+ u64 real_ip, tsx_tuning;
+ u64 tsc;
+};
+
+void init_debug_store_on_cpu(int cpu)
+{
+ struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+
+ if (!ds)
+ return;
+
+ wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
+ (u32)((u64)(unsigned long)ds),
+ (u32)((u64)(unsigned long)ds >> 32));
+}
+
+void fini_debug_store_on_cpu(int cpu)
+{
+ if (!per_cpu(cpu_hw_events, cpu).ds)
+ return;
+
+ wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
+}
+
+static DEFINE_PER_CPU(void *, insn_buffer);
+
+static void ds_update_cea(void *cea, void *addr, size_t size, pgprot_t prot)
+{
+ unsigned long start = (unsigned long)cea;
+ phys_addr_t pa;
+ size_t msz = 0;
+
+ pa = virt_to_phys(addr);
+
+ preempt_disable();
+ for (; msz < size; msz += PAGE_SIZE, pa += PAGE_SIZE, cea += PAGE_SIZE)
+ cea_set_pte(cea, pa, prot);
+
+ /*
+ * This is a cross-CPU update of the cpu_entry_area, we must shoot down
+ * all TLB entries for it.
+ */
+ flush_tlb_kernel_range(start, start + size);
+ preempt_enable();
+}
+
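+/* Undo ds_update_cea(): unmap the range and flush stale TLB entries. */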
+static void ds_clear_cea(void *cea, size_t size)
+{
+ unsigned long start = (unsigned long)cea;
+ size_t msz = 0;
+
+ preempt_disable();
+ for (; msz < size; msz += PAGE_SIZE, cea += PAGE_SIZE)
+ cea_set_pte(cea, 0, PAGE_NONE);
+
+ flush_tlb_kernel_range(start, start + size);
+ preempt_enable();
+}
+
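+/*
+ * Allocate zeroed, node-local pages for a DS/PEBS buffer; callers map
+ * the pages into the cpu_entry_area as needed.
+ */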
+static void *dsalloc_pages(size_t size, gfp_t flags, int cpu)
+{
+ unsigned int order = get_order(size);
+ int node = cpu_to_node(cpu);
+ struct page *page;
+
+ page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
+ return page ? page_address(page) : NULL;
+}
+
+static void dsfree_pages(const void *buffer, size_t size)
+{
+ if (buffer)
+ free_pages((unsigned long)buffer, get_order(size));
+}
+
+static int alloc_pebs_buffer(int cpu)
+{
+ struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
+ struct debug_store *ds = hwev->ds;
+ size_t bsiz = x86_pmu.pebs_buffer_size;
+ int max, node = cpu_to_node(cpu);
+ void *buffer, *insn_buff, *cea;
+
+ if (!intel_pmu_has_pebs())
+ return 0;
+
+ buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu);
+ if (unlikely(!buffer))
+ return -ENOMEM;
+
+ if (x86_pmu.arch_pebs) {
+ hwev->pebs_vaddr = buffer;
+ return 0;
+ }
+
+	/*
+	 * HSW+ already provides us the eventing IP; the fixup buffer is
+	 * only needed for PEBS format < 2.
+	 */
+ if (x86_pmu.intel_cap.pebs_format < 2) {
+ insn_buff = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
+ if (!insn_buff) {
+ dsfree_pages(buffer, bsiz);
+ return -ENOMEM;
+ }
+ per_cpu(insn_buffer, cpu) = insn_buff;
+ }
+ hwev->pebs_vaddr = buffer;
+ /* Update the cpu entry area mapping */
+ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
+ ds->pebs_buffer_base = (unsigned long) cea;
+ ds_update_cea(cea, buffer, bsiz, PAGE_KERNEL);
+ ds->pebs_index = ds->pebs_buffer_base;
+ max = x86_pmu.pebs_record_size * (bsiz / x86_pmu.pebs_record_size);
+ ds->pebs_absolute_maximum = ds->pebs_buffer_base + max;
+ return 0;
+}
+
+static void release_pebs_buffer(int cpu)
+{
+ struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
+ void *cea;
+
+ if (!intel_pmu_has_pebs())
+ return;
+
+ if (x86_pmu.ds_pebs) {
+ kfree(per_cpu(insn_buffer, cpu));
+ per_cpu(insn_buffer, cpu) = NULL;
+
+ /* Clear the fixmap */
+ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
+ ds_clear_cea(cea, x86_pmu.pebs_buffer_size);
+ }
+
+ dsfree_pages(hwev->pebs_vaddr, x86_pmu.pebs_buffer_size);
+ hwev->pebs_vaddr = NULL;
+}
+
+static int alloc_bts_buffer(int cpu)
+{
+ struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
+ struct debug_store *ds = hwev->ds;
+ void *buffer, *cea;
+ int max;
+
+ if (!x86_pmu.bts)
+ return 0;
+
+ buffer = dsalloc_pages(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, cpu);
+ if (unlikely(!buffer)) {
+ WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
+ return -ENOMEM;
+ }
+ hwev->ds_bts_vaddr = buffer;
+ /* Update the fixmap */
+ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
+ ds->bts_buffer_base = (unsigned long) cea;
+ ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL);
+ ds->bts_index = ds->bts_buffer_base;
+ max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
+ ds->bts_absolute_maximum = ds->bts_buffer_base +
+ max * BTS_RECORD_SIZE;
+ ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
+ (max / 16) * BTS_RECORD_SIZE;
+ return 0;
+}
+
+static void release_bts_buffer(int cpu)
+{
+ struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
+ void *cea;
+
+ if (!x86_pmu.bts)
+ return;
+
+ /* Clear the fixmap */
+ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
+ ds_clear_cea(cea, BTS_BUFFER_SIZE);
+ dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE);
+ hwev->ds_bts_vaddr = NULL;
+}
+
+static int alloc_ds_buffer(int cpu)
+{
+ struct debug_store *ds = &get_cpu_entry_area(cpu)->cpu_debug_store;
+
+ memset(ds, 0, sizeof(*ds));
+ per_cpu(cpu_hw_events, cpu).ds = ds;
+ return 0;
+}
+
+static void release_ds_buffer(int cpu)
+{
+ per_cpu(cpu_hw_events, cpu).ds = NULL;
+}
+
+void release_ds_buffers(void)
+{
+ int cpu;
+
+ if (!x86_pmu.bts && !x86_pmu.ds_pebs)
+ return;
+
+ for_each_possible_cpu(cpu)
+ release_ds_buffer(cpu);
+
+ for_each_possible_cpu(cpu) {
+		/*
+		 * Again, ignore errors from offline CPUs; they will no
+		 * longer observe cpu_hw_events.ds and thus will not program
+		 * the DS_AREA when they come up.
+		 */
+ fini_debug_store_on_cpu(cpu);
+ }
+
+ for_each_possible_cpu(cpu) {
+ if (x86_pmu.ds_pebs)
+ release_pebs_buffer(cpu);
+ release_bts_buffer(cpu);
+ }
+}
+
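+/*
+ * Allocate the DS, BTS and PEBS buffers for all possible CPUs up front.
+ * If either the BTS or the PEBS allocation fails on any CPU, that
+ * facility is released on all CPUs and left inactive.
+ */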
+void reserve_ds_buffers(void)
+{
+ int bts_err = 0, pebs_err = 0;
+ int cpu;
+
+ x86_pmu.bts_active = 0;
+
+ if (x86_pmu.ds_pebs)
+ x86_pmu.pebs_active = 0;
+
+ if (!x86_pmu.bts && !x86_pmu.ds_pebs)
+ return;
+
+ if (!x86_pmu.bts)
+ bts_err = 1;
+
+ if (!x86_pmu.ds_pebs)
+ pebs_err = 1;
+
+ for_each_possible_cpu(cpu) {
+ if (alloc_ds_buffer(cpu)) {
+ bts_err = 1;
+ pebs_err = 1;
+ }
+
+ if (!bts_err && alloc_bts_buffer(cpu))
+ bts_err = 1;
+
+ if (x86_pmu.ds_pebs && !pebs_err &&
+ alloc_pebs_buffer(cpu))
+ pebs_err = 1;
+
+ if (bts_err && pebs_err)
+ break;
+ }
+
+ if (bts_err) {
+ for_each_possible_cpu(cpu)
+ release_bts_buffer(cpu);
+ }
+
+ if (x86_pmu.ds_pebs && pebs_err) {
+ for_each_possible_cpu(cpu)
+ release_pebs_buffer(cpu);
+ }
+
+ if (bts_err && pebs_err) {
+ for_each_possible_cpu(cpu)
+ release_ds_buffer(cpu);
+ } else {
+ if (x86_pmu.bts && !bts_err)
+ x86_pmu.bts_active = 1;
+
+ if (x86_pmu.ds_pebs && !pebs_err)
+ x86_pmu.pebs_active = 1;
+
+ for_each_possible_cpu(cpu) {
+			/*
+			 * Ignore wrmsr_on_cpu() errors for offline CPUs; they
+			 * will get this call through intel_pmu_cpu_starting().
+			 */
+ init_debug_store_on_cpu(cpu);
+ }
+ }
+}
+
+inline int alloc_arch_pebs_buf_on_cpu(int cpu)
+{
+ if (!x86_pmu.arch_pebs)
+ return 0;
+
+ return alloc_pebs_buffer(cpu);
+}
+
+inline void release_arch_pebs_buf_on_cpu(int cpu)
+{
+ if (!x86_pmu.arch_pebs)
+ return;
+
+ release_pebs_buffer(cpu);
+}
+
+void init_arch_pebs_on_cpu(int cpu)
+{
+ struct cpu_hw_events *cpuc = per_cpu_ptr(&cpu_hw_events, cpu);
+ u64 arch_pebs_base;
+
+ if (!x86_pmu.arch_pebs)
+ return;
+
+ if (!cpuc->pebs_vaddr) {
+		WARN(1, "Failed to allocate PEBS buffer on CPU %d\n", cpu);
+ x86_pmu.pebs_active = 0;
+ return;
+ }
+
+	/*
+	 * MSR_IA32_PEBS_BASE takes the 4KB-aligned physical address of the
+	 * output buffer (__alloc_pages_node() returns a page-aligned,
+	 * physically contiguous allocation) with the size encoded in the
+	 * low bits: buffer size = 4KB * 2^SIZE.
+	 */
+ arch_pebs_base = virt_to_phys(cpuc->pebs_vaddr) | PEBS_BUFFER_SHIFT;
+ wrmsr_on_cpu(cpu, MSR_IA32_PEBS_BASE, (u32)arch_pebs_base,
+ (u32)(arch_pebs_base >> 32));
+ x86_pmu.pebs_active = 1;
+}
+
+inline void fini_arch_pebs_on_cpu(int cpu)
+{
+ if (!x86_pmu.arch_pebs)
+ return;
+
+ wrmsr_on_cpu(cpu, MSR_IA32_PEBS_BASE, 0, 0);
+}
+
+/*
+ * BTS
+ */
+
+struct event_constraint bts_constraint =
+ EVENT_CONSTRAINT(0, 1ULL << INTEL_PMC_IDX_FIXED_BTS, 0);
+
+void intel_pmu_enable_bts(u64 config)
+{
+ unsigned long debugctlmsr;
+
+ debugctlmsr = get_debugctlmsr();
+
+ debugctlmsr |= DEBUGCTLMSR_TR;
+ debugctlmsr |= DEBUGCTLMSR_BTS;
+ if (config & ARCH_PERFMON_EVENTSEL_INT)
+ debugctlmsr |= DEBUGCTLMSR_BTINT;
+
+ if (!(config & ARCH_PERFMON_EVENTSEL_OS))
+ debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS;
+
+ if (!(config & ARCH_PERFMON_EVENTSEL_USR))
+ debugctlmsr |= DEBUGCTLMSR_BTS_OFF_USR;
+
+ update_debugctlmsr(debugctlmsr);
+}
+
+void intel_pmu_disable_bts(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ unsigned long debugctlmsr;
+
+ if (!cpuc->ds)
+ return;
+
+ debugctlmsr = get_debugctlmsr();
+
+ debugctlmsr &=
+ ~(DEBUGCTLMSR_TR | DEBUGCTLMSR_BTS | DEBUGCTLMSR_BTINT |
+ DEBUGCTLMSR_BTS_OFF_OS | DEBUGCTLMSR_BTS_OFF_USR);
+
+ update_debugctlmsr(debugctlmsr);
+}
+
+int intel_pmu_drain_bts_buffer(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct debug_store *ds = cpuc->ds;
+ struct bts_record {
+ u64 from;
+ u64 to;
+ u64 flags;
+ };
+ struct perf_event *event = cpuc->events[INTEL_PMC_IDX_FIXED_BTS];
+ struct bts_record *at, *base, *top;
+ struct perf_output_handle handle;
+ struct perf_event_header header;
+ struct perf_sample_data data;
+ unsigned long skip = 0;
+ struct pt_regs regs;
+
+ if (!event)
+ return 0;
+
+ if (!x86_pmu.bts_active)
+ return 0;
+
+ base = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
+ top = (struct bts_record *)(unsigned long)ds->bts_index;
+
+ if (top <= base)
+ return 0;
+
+ memset(&regs, 0, sizeof(regs));
+
+ ds->bts_index = ds->bts_buffer_base;
+
+ perf_sample_data_init(&data, 0, event->hw.last_period);
+
+ /*
+ * BTS leaks kernel addresses in branches across the cpl boundary,
+ * such as traps or system calls, so unless the user is asking for
+ * kernel tracing (and right now it's not possible), we'd need to
+ * filter them out. But first we need to count how many of those we
+	 * have in the current batch. This is an extra O(n) pass; however,
+	 * it is much faster than the output pass, especially considering
+	 * that n <= 2560 (BTS_BUFFER_SIZE / BTS_RECORD_SIZE * 15/16; see
+	 * alloc_bts_buffer()).
+ */
+ for (at = base; at < top; at++) {
+ /*
+ * Note that right now *this* BTS code only works if
+ * attr::exclude_kernel is set, but let's keep this extra
+ * check here in case that changes.
+ */
+ if (event->attr.exclude_kernel &&
+ (kernel_ip(at->from) || kernel_ip(at->to)))
+ skip++;
+ }
+
+ /*
+ * Prepare a generic sample, i.e. fill in the invariant fields.
+ * We will overwrite the from and to address before we output
+ * the sample.
+ */
+ rcu_read_lock();
+ perf_prepare_sample(&data, event, &regs);
+ perf_prepare_header(&header, &data, event, &regs);
+
+ if (perf_output_begin(&handle, &data, event,
+ header.size * (top - base - skip)))
+ goto unlock;
+
+ for (at = base; at < top; at++) {
+ /* Filter out any records that contain kernel addresses. */
+ if (event->attr.exclude_kernel &&
+ (kernel_ip(at->from) || kernel_ip(at->to)))
+ continue;
+
+ data.ip = at->from;
+ data.addr = at->to;
+
+ perf_output_sample(&handle, &header, &data, event);
+ }
+
+ perf_output_end(&handle);
+
+ /* There's new data available. */
+ event->hw.interrupts++;
+ event->pending_kill = POLL_IN;
+unlock:
+ rcu_read_unlock();
+ return 1;
+}
+
+void intel_pmu_drain_pebs_buffer(void)
+{
+ struct perf_sample_data data;
+
+ static_call(x86_pmu_drain_pebs)(NULL, &data);
+}
+
+/*
+ * PEBS
+ */
+struct event_constraint intel_core2_pebs_event_constraints[] = {
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
+	INTEL_FLAGS_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETIRED.ANY */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */
+ /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108000c0, 0x01),
+ EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_atom_pebs_event_constraints[] = {
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */
+ /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108000c0, 0x01),
+ /* Allow all events as PEBS with no flags */
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0x1),
+ EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_slm_pebs_event_constraints[] = {
+ /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108000c0, 0x1),
+ /* Allow all events as PEBS with no flags */
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0x1),
+ EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_glm_pebs_event_constraints[] = {
+ /* Allow all events as PEBS with no flags */
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0x1),
+ EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_grt_pebs_event_constraints[] = {
+ /* Allow all events as PEBS with no flags */
+ INTEL_HYBRID_LAT_CONSTRAINT(0x5d0, 0x3),
+ INTEL_HYBRID_LAT_CONSTRAINT(0x6d0, 0xf),
+ EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_nehalem_pebs_event_constraints[] = {
+ INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xf), /* INST_RETIRED.ANY */
+ INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */
+ /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108000c0, 0x0f),
+ EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_westmere_pebs_event_constraints[] = {
+ INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xf), /* INSTR_RETIRED.* */
+ INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */
+ /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108000c0, 0x0f),
+ EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_snb_pebs_event_constraints[] = {
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
+ INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */
+ INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */
+ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c2, 0xf),
+ INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+ /* Allow all events as PEBS with no flags */
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
+ EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_ivb_pebs_event_constraints[] = {
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
+ INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */
+ INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */
+ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c2, 0xf),
+ /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c0, 0x2),
+ INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+ /* Allow all events as PEBS with no flags */
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
+ EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_hsw_pebs_event_constraints[] = {
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
+ INTEL_PLD_CONSTRAINT(0x01cd, 0xf), /* MEM_TRANS_RETIRED.* */
+ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c2, 0xf),
+ /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c0, 0x2),
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd2, 0xf), /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd3, 0xf), /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */
+ /* Allow all events as PEBS with no flags */
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
+ EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_bdw_pebs_event_constraints[] = {
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
+ INTEL_PLD_CONSTRAINT(0x01cd, 0xf), /* MEM_TRANS_RETIRED.* */
+ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c2, 0xf),
+ /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c0, 0x2),
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf), /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf), /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */
+ /* Allow all events as PEBS with no flags */
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
+ EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_skl_pebs_event_constraints[] = {
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */
+ /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c0, 0x2),
+ /* INST_RETIRED.TOTAL_CYCLES_PS (inv=1, cmask=16) (cycles:p). */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108000c0, 0x0f),
+ INTEL_PLD_CONSTRAINT(0x1cd, 0xf), /* MEM_TRANS_RETIRED.* */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x22d0, 0xf), /* MEM_INST_RETIRED.LOCK_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_INST_RETIRED.SPLIT_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_INST_RETIRED.SPLIT_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_INST_RETIRED.ALL_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_INST_RETIRED.ALL_STORES */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf), /* MEM_LOAD_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf), /* MEM_LOAD_L3_HIT_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf), /* MEM_LOAD_L3_MISS_RETIRED.* */
+ /* Allow all events as PEBS with no flags */
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
+ EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_icl_pebs_event_constraints[] = {
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x100000000ULL), /* old INST_RETIRED.PREC_DIST */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x0100, 0x100000000ULL), /* INST_RETIRED.PREC_DIST */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL), /* SLOTS */
+
+ INTEL_PLD_CONSTRAINT(0x1cd, 0xff), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_INST_RETIRED.SPLIT_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_INST_RETIRED.SPLIT_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_INST_RETIRED.ALL_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_INST_RETIRED.ALL_STORES */
+
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD_RANGE(0xd1, 0xd4, 0xf), /* MEM_LOAD_*_RETIRED.* */
+
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_INST_RETIRED.* */
+
+ /*
+ * Everything else is handled by PMU_FL_PEBS_ALL, because we
+ * need the full constraints from the main table.
+ */
+
+ EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_glc_pebs_event_constraints[] = {
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x100, 0x100000000ULL), /* INST_RETIRED.PREC_DIST */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL),
+
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xfe),
+ INTEL_PLD_CONSTRAINT(0x1cd, 0xfe),
+ INTEL_PSD_CONSTRAINT(0x2cd, 0x1),
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_INST_RETIRED.SPLIT_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_INST_RETIRED.SPLIT_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_INST_RETIRED.ALL_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_INST_RETIRED.ALL_STORES */
+
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD_RANGE(0xd1, 0xd4, 0xf),
+
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xd0, 0xf),
+
+ /*
+ * Everything else is handled by PMU_FL_PEBS_ALL, because we
+ * need the full constraints from the main table.
+ */
+
+ EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_lnc_pebs_event_constraints[] = {
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x100, 0x100000000ULL), /* INST_RETIRED.PREC_DIST */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL),
+
+ INTEL_HYBRID_LDLAT_CONSTRAINT(0x1cd, 0x3fc),
+ INTEL_HYBRID_STLAT_CONSTRAINT(0x2cd, 0x3),
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_INST_RETIRED.SPLIT_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_INST_RETIRED.SPLIT_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_INST_RETIRED.ALL_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_INST_RETIRED.ALL_STORES */
+
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD_RANGE(0xd1, 0xd4, 0xf),
+
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xd0, 0xf),
+
+ /*
+ * Everything else is handled by PMU_FL_PEBS_ALL, because we
+ * need the full constraints from the main table.
+ */
+
+ EVENT_CONSTRAINT_END
+};
+
+struct event_constraint *intel_pebs_constraints(struct perf_event *event)
+{
+ struct event_constraint *pebs_constraints = hybrid(event->pmu, pebs_constraints);
+ struct event_constraint *c;
+
+ if (!event->attr.precise_ip)
+ return NULL;
+
+ if (pebs_constraints) {
+ for_each_event_constraint(c, pebs_constraints) {
+ if (constraint_match(c, event->hw.config)) {
+ event->hw.flags |= c->flags;
+ return c;
+ }
+ }
+ }
+
+ /*
+ * Extended PEBS support
+ * Makes the PEBS code search the normal constraints.
+ */
+ if (x86_pmu.flags & PMU_FL_PEBS_ALL)
+ return NULL;
+
+ return &emptyconstraint;
+}
+
+/*
+ * We need the sched_task callback even for per-cpu events when we use
+ * the large interrupt threshold, such that we can provide PID and TID
+ * to PEBS samples.
+ */
+static inline bool pebs_needs_sched_cb(struct cpu_hw_events *cpuc)
+{
+ if (cpuc->n_pebs == cpuc->n_pebs_via_pt)
+ return false;
+
+ return cpuc->n_pebs && (cpuc->n_pebs == cpuc->n_large_pebs);
+}
+
+void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ if (!sched_in && pebs_needs_sched_cb(cpuc))
+ intel_pmu_drain_pebs_buffer();
+}
+
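+/*
+ * Set the PEBS interrupt threshold: with large PEBS, interrupt only when
+ * the buffer is nearly full (leaving room for one record per reserved
+ * counter); otherwise interrupt after every single record.
+ */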
+static inline void pebs_update_threshold(struct cpu_hw_events *cpuc)
+{
+ struct debug_store *ds = cpuc->ds;
+ int max_pebs_events = intel_pmu_max_num_pebs(cpuc->pmu);
+ u64 threshold;
+ int reserved;
+
+ if (cpuc->n_pebs_via_pt)
+ return;
+
+ if (x86_pmu.flags & PMU_FL_PEBS_ALL)
+ reserved = max_pebs_events + x86_pmu_max_num_counters_fixed(cpuc->pmu);
+ else
+ reserved = max_pebs_events;
+
+ if (cpuc->n_pebs == cpuc->n_large_pebs) {
+ threshold = ds->pebs_absolute_maximum -
+ reserved * cpuc->pebs_record_size;
+ } else {
+ threshold = ds->pebs_buffer_base + cpuc->pebs_record_size;
+ }
+
+ ds->pebs_interrupt_threshold = threshold;
+}
+
+#define PEBS_DATACFG_CNTRS(x) \
+	(((x) >> PEBS_DATACFG_CNTR_SHIFT) & PEBS_DATACFG_CNTR_MASK)
+
+#define PEBS_DATACFG_CNTR_BIT(x) \
+	(((1ULL << (x)) & PEBS_DATACFG_CNTR_MASK) << PEBS_DATACFG_CNTR_SHIFT)
+
+#define PEBS_DATACFG_FIX(x) \
+	(((x) >> PEBS_DATACFG_FIX_SHIFT) & PEBS_DATACFG_FIX_MASK)
+
+#define PEBS_DATACFG_FIX_BIT(x) \
+ (((1ULL << (x)) & PEBS_DATACFG_FIX_MASK) \
+ << PEBS_DATACFG_FIX_SHIFT)
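+
+/*
+ * For example (illustrative): PEBS_DATACFG_CNTR_BIT(3) sets the bit for
+ * GP counter 3 in the counter field of PEBS_DATA_CFG, and
+ * PEBS_DATACFG_CNTRS() extracts that counter bitmap back out of a
+ * config value.
+ */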
+
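+/*
+ * Recompute the per-record size from the current PEBS_DATA_CFG: each
+ * enabled group (meminfo, GPRs, XMMs, LBRs, counters/metrics) appends
+ * its fixed-size payload to the basic record.
+ */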
+static void adaptive_pebs_record_size_update(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ u64 pebs_data_cfg = cpuc->pebs_data_cfg;
+ int sz = sizeof(struct pebs_basic);
+
+ if (pebs_data_cfg & PEBS_DATACFG_MEMINFO)
+ sz += sizeof(struct pebs_meminfo);
+ if (pebs_data_cfg & PEBS_DATACFG_GP)
+ sz += sizeof(struct pebs_gprs);
+ if (pebs_data_cfg & PEBS_DATACFG_XMMS)
+ sz += sizeof(struct pebs_xmm);
+ if (pebs_data_cfg & PEBS_DATACFG_LBRS)
+ sz += x86_pmu.lbr_nr * sizeof(struct lbr_entry);
+ if (pebs_data_cfg & (PEBS_DATACFG_METRICS | PEBS_DATACFG_CNTR)) {
+ sz += sizeof(struct pebs_cntr_header);
+
+ /* Metrics base and Metrics Data */
+ if (pebs_data_cfg & PEBS_DATACFG_METRICS)
+ sz += 2 * sizeof(u64);
+
+ if (pebs_data_cfg & PEBS_DATACFG_CNTR) {
+ sz += (hweight64(PEBS_DATACFG_CNTRS(pebs_data_cfg)) +
+ hweight64(PEBS_DATACFG_FIX(pebs_data_cfg))) *
+ sizeof(u64);
+ }
+ }
+
+ cpuc->pebs_record_size = sz;
+}
+
+static void __intel_pmu_pebs_update_cfg(struct perf_event *event,
+ int idx, u64 *pebs_data_cfg)
+{
+ if (is_metric_event(event)) {
+ *pebs_data_cfg |= PEBS_DATACFG_METRICS;
+ return;
+ }
+
+ *pebs_data_cfg |= PEBS_DATACFG_CNTR;
+
+ if (idx >= INTEL_PMC_IDX_FIXED)
+ *pebs_data_cfg |= PEBS_DATACFG_FIX_BIT(idx - INTEL_PMC_IDX_FIXED);
+ else
+ *pebs_data_cfg |= PEBS_DATACFG_CNTR_BIT(idx);
+}
+
+void intel_pmu_pebs_late_setup(struct cpu_hw_events *cpuc)
+{
+ struct perf_event *event;
+ u64 pebs_data_cfg = 0;
+ int i;
+
+ for (i = 0; i < cpuc->n_events; i++) {
+ event = cpuc->event_list[i];
+ if (!is_pebs_counter_event_group(event))
+ continue;
+ __intel_pmu_pebs_update_cfg(event, cpuc->assign[i], &pebs_data_cfg);
+ }
+
+ if (pebs_data_cfg & ~cpuc->pebs_data_cfg)
+ cpuc->pebs_data_cfg |= pebs_data_cfg | PEBS_UPDATE_DS_SW;
+}
+
+#define PERF_PEBS_MEMINFO_TYPE (PERF_SAMPLE_ADDR | PERF_SAMPLE_DATA_SRC | \
+ PERF_SAMPLE_PHYS_ADDR | \
+ PERF_SAMPLE_WEIGHT_TYPE | \
+ PERF_SAMPLE_TRANSACTION | \
+ PERF_SAMPLE_DATA_PAGE_SIZE)
+
+static u64 pebs_update_adaptive_cfg(struct perf_event *event)
+{
+ struct perf_event_attr *attr = &event->attr;
+ u64 sample_type = attr->sample_type;
+ u64 pebs_data_cfg = 0;
+ bool gprs, tsx_weight;
+
+ if (!(sample_type & ~(PERF_SAMPLE_IP|PERF_SAMPLE_TIME)) &&
+ attr->precise_ip > 1)
+ return pebs_data_cfg;
+
+ if (sample_type & PERF_PEBS_MEMINFO_TYPE)
+ pebs_data_cfg |= PEBS_DATACFG_MEMINFO;
+
+	/*
+	 * We need GPRs when:
+	 * + the user requested them
+	 * + precise_ip < 2, for the non-eventing IP
+	 * + RTM TSX weight is requested, for the abort code
+	 */
+ gprs = ((sample_type & PERF_SAMPLE_REGS_INTR) &&
+ (attr->sample_regs_intr & PEBS_GP_REGS)) ||
+ ((sample_type & PERF_SAMPLE_REGS_USER) &&
+ (attr->sample_regs_user & PEBS_GP_REGS));
+
+ tsx_weight = (sample_type & PERF_SAMPLE_WEIGHT_TYPE) &&
+ ((attr->config & INTEL_ARCH_EVENT_MASK) ==
+ x86_pmu.rtm_abort_event);
+
+ if (gprs || (attr->precise_ip < 2) || tsx_weight)
+ pebs_data_cfg |= PEBS_DATACFG_GP;
+
+ if ((sample_type & PERF_SAMPLE_REGS_INTR) &&
+ (attr->sample_regs_intr & PERF_REG_EXTENDED_MASK))
+ pebs_data_cfg |= PEBS_DATACFG_XMMS;
+
+ if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
+ /*
+ * For now always log all LBRs. Could configure this
+ * later.
+ */
+ pebs_data_cfg |= PEBS_DATACFG_LBRS |
+ ((x86_pmu.lbr_nr-1) << PEBS_DATACFG_LBR_SHIFT);
+ }
+
+ return pebs_data_cfg;
+}
+
+static void
+pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc,
+ struct perf_event *event, bool add)
+{
+ struct pmu *pmu = event->pmu;
+
+ /*
+ * Make sure we get updated with the first PEBS event.
+ * During removal, ->pebs_data_cfg is still valid for
+ * the last PEBS event. Don't clear it.
+ */
+ if ((cpuc->n_pebs == 1) && add)
+ cpuc->pebs_data_cfg = PEBS_UPDATE_DS_SW;
+
+ if (needed_cb != pebs_needs_sched_cb(cpuc)) {
+ if (!needed_cb)
+ perf_sched_cb_inc(pmu);
+ else
+ perf_sched_cb_dec(pmu);
+
+ cpuc->pebs_data_cfg |= PEBS_UPDATE_DS_SW;
+ }
+
+ /*
+ * The PEBS record doesn't shrink on pmu::del(). Doing so would require
+ * iterating all remaining PEBS events to reconstruct the config.
+ */
+ if (x86_pmu.intel_cap.pebs_baseline && add) {
+ u64 pebs_data_cfg;
+
+ pebs_data_cfg = pebs_update_adaptive_cfg(event);
+ /*
+ * Be sure to update the thresholds when we change the record.
+ */
+ if (pebs_data_cfg & ~cpuc->pebs_data_cfg)
+ cpuc->pebs_data_cfg |= pebs_data_cfg | PEBS_UPDATE_DS_SW;
+ }
+}
+
+u64 intel_get_arch_pebs_data_config(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ u64 pebs_data_cfg = 0;
+ u64 cntr_mask;
+
+ if (WARN_ON(event->hw.idx < 0 || event->hw.idx >= X86_PMC_IDX_MAX))
+ return 0;
+
+ pebs_data_cfg |= pebs_update_adaptive_cfg(event);
+
+ cntr_mask = (PEBS_DATACFG_CNTR_MASK << PEBS_DATACFG_CNTR_SHIFT) |
+ (PEBS_DATACFG_FIX_MASK << PEBS_DATACFG_FIX_SHIFT) |
+ PEBS_DATACFG_CNTR | PEBS_DATACFG_METRICS;
+ pebs_data_cfg |= cpuc->pebs_data_cfg & cntr_mask;
+
+ return pebs_data_cfg;
+}
+
+void intel_pmu_pebs_add(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+ bool needed_cb = pebs_needs_sched_cb(cpuc);
+
+ cpuc->n_pebs++;
+ if (hwc->flags & PERF_X86_EVENT_LARGE_PEBS)
+ cpuc->n_large_pebs++;
+ if (hwc->flags & PERF_X86_EVENT_PEBS_VIA_PT)
+ cpuc->n_pebs_via_pt++;
+
+ pebs_update_state(needed_cb, cpuc, event, true);
+}
+
+static void intel_pmu_pebs_via_pt_disable(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ if (!is_pebs_pt(event))
+ return;
+
+ if (!(cpuc->pebs_enabled & ~PEBS_VIA_PT_MASK))
+ cpuc->pebs_enabled &= ~PEBS_VIA_PT_MASK;
+}
+
+static void intel_pmu_pebs_via_pt_enable(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+ struct debug_store *ds = cpuc->ds;
+ u64 value = ds->pebs_event_reset[hwc->idx];
+ u32 base = MSR_RELOAD_PMC0;
+ unsigned int idx = hwc->idx;
+
+ if (!is_pebs_pt(event))
+ return;
+
+ if (!(event->hw.flags & PERF_X86_EVENT_LARGE_PEBS))
+ cpuc->pebs_enabled |= PEBS_PMI_AFTER_EACH_RECORD;
+
+ cpuc->pebs_enabled |= PEBS_OUTPUT_PT;
+
+ if (hwc->idx >= INTEL_PMC_IDX_FIXED) {
+ base = MSR_RELOAD_FIXED_CTR0;
+ idx = hwc->idx - INTEL_PMC_IDX_FIXED;
+ if (x86_pmu.intel_cap.pebs_format < 5)
+ value = ds->pebs_event_reset[MAX_PEBS_EVENTS_FMT4 + idx];
+ else
+ value = ds->pebs_event_reset[MAX_PEBS_EVENTS + idx];
+ }
+ wrmsrq(base + idx, value);
+}
+
+static inline void intel_pmu_drain_large_pebs(struct cpu_hw_events *cpuc)
+{
+ if (cpuc->n_pebs == cpuc->n_large_pebs &&
+ cpuc->n_pebs != cpuc->n_pebs_via_pt)
+ intel_pmu_drain_pebs_buffer();
+}
+
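+/*
+ * A PEBS-enabled counter must not also raise a regular overflow PMI;
+ * the interrupt is driven by the PEBS threshold instead.
+ */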
+static void __intel_pmu_pebs_enable(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+
+ hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
+ cpuc->pebs_enabled |= 1ULL << hwc->idx;
+}
+
+void intel_pmu_pebs_enable(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ u64 pebs_data_cfg = cpuc->pebs_data_cfg & ~PEBS_UPDATE_DS_SW;
+ struct hw_perf_event *hwc = &event->hw;
+ struct debug_store *ds = cpuc->ds;
+ unsigned int idx = hwc->idx;
+
+ __intel_pmu_pebs_enable(event);
+
+ if ((event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) && (x86_pmu.version < 5))
+ cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32);
+ else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
+ cpuc->pebs_enabled |= 1ULL << 63;
+
+ if (x86_pmu.intel_cap.pebs_baseline) {
+ hwc->config |= ICL_EVENTSEL_ADAPTIVE;
+ if (pebs_data_cfg != cpuc->active_pebs_data_cfg) {
+ /*
+ * drain_pebs() assumes uniform record size;
+ * hence we need to drain when changing said
+ * size.
+ */
+ intel_pmu_drain_pebs_buffer();
+ adaptive_pebs_record_size_update();
+ wrmsrq(MSR_PEBS_DATA_CFG, pebs_data_cfg);
+ cpuc->active_pebs_data_cfg = pebs_data_cfg;
+ }
+ }
+ if (cpuc->pebs_data_cfg & PEBS_UPDATE_DS_SW) {
+ cpuc->pebs_data_cfg = pebs_data_cfg;
+ pebs_update_threshold(cpuc);
+ }
+
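+	/*
+	 * Fixed counters use pebs_event_reset slots above the general
+	 * purpose ones; the base of those slots depends on the PEBS
+	 * format version.
+	 */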
+ if (idx >= INTEL_PMC_IDX_FIXED) {
+ if (x86_pmu.intel_cap.pebs_format < 5)
+ idx = MAX_PEBS_EVENTS_FMT4 + (idx - INTEL_PMC_IDX_FIXED);
+ else
+ idx = MAX_PEBS_EVENTS + (idx - INTEL_PMC_IDX_FIXED);
+ }
+
+	/*
+	 * Use auto-reload if possible to save an MSR write in the PMI.
+	 * This must be done in pmu::start(), because PERF_EVENT_IOC_PERIOD
+	 * can change the sample period while the event is active.
+	 */
+ if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
+ ds->pebs_event_reset[idx] =
+ (u64)(-hwc->sample_period) & x86_pmu.cntval_mask;
+ } else {
+ ds->pebs_event_reset[idx] = 0;
+ }
+
+ intel_pmu_pebs_via_pt_enable(event);
+}
+
+void intel_pmu_pebs_del(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+ bool needed_cb = pebs_needs_sched_cb(cpuc);
+
+ cpuc->n_pebs--;
+ if (hwc->flags & PERF_X86_EVENT_LARGE_PEBS)
+ cpuc->n_large_pebs--;
+ if (hwc->flags & PERF_X86_EVENT_PEBS_VIA_PT)
+ cpuc->n_pebs_via_pt--;
+
+ pebs_update_state(needed_cb, cpuc, event, false);
+}
+
+static void __intel_pmu_pebs_disable(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+
+ intel_pmu_drain_large_pebs(cpuc);
+ cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
+ hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
+}
+
+void intel_pmu_pebs_disable(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+
+ __intel_pmu_pebs_disable(event);
+
+ if ((event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) &&
+ (x86_pmu.version < 5))
+ cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32));
+ else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
+ cpuc->pebs_enabled &= ~(1ULL << 63);
+
+ intel_pmu_pebs_via_pt_disable(event);
+
+ if (cpuc->enabled)
+ wrmsrq(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
+}
+
+void intel_pmu_pebs_enable_all(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ if (cpuc->pebs_enabled)
+ wrmsrq(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
+}
+
+void intel_pmu_pebs_disable_all(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ if (cpuc->pebs_enabled)
+ __intel_pmu_pebs_disable_all();
+}
+
+static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ unsigned long from = cpuc->lbr_entries[0].from;
+ unsigned long old_to, to = cpuc->lbr_entries[0].to;
+ unsigned long ip = regs->ip;
+ int is_64bit = 0;
+ void *kaddr;
+ int size;
+
+	/*
+	 * We don't need to fix up anything if the PEBS assist is fault-like.
+	 */
+ if (!x86_pmu.intel_cap.pebs_trap)
+ return 1;
+
+ /*
+ * No LBR entry, no basic block, no rewinding
+ */
+ if (!cpuc->lbr_stack.nr || !from || !to)
+ return 0;
+
+ /*
+ * Basic blocks should never cross user/kernel boundaries
+ */
+ if (kernel_ip(ip) != kernel_ip(to))
+ return 0;
+
+ /*
+ * unsigned math, either ip is before the start (impossible) or
+ * the basic block is larger than 1 page (sanity)
+ */
+ if ((ip - to) > PEBS_FIXUP_SIZE)
+ return 0;
+
+ /*
+ * We sampled a branch insn, rewind using the LBR stack
+ */
+ if (ip == to) {
+ set_linear_ip(regs, from);
+ return 1;
+ }
+
+ size = ip - to;
+ if (!kernel_ip(ip)) {
+ int bytes;
+ u8 *buf = this_cpu_read(insn_buffer);
+
+ /* 'size' must fit our buffer, see above */
+ bytes = copy_from_user_nmi(buf, (void __user *)to, size);
+ if (bytes != 0)
+ return 0;
+
+ kaddr = buf;
+ } else {
+ kaddr = (void *)to;
+ }
+
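+	/*
+	 * Decode forward from the branch target until we reach the sampled
+	 * IP; old_to then points at the instruction that actually caused
+	 * the event, undoing the off-by-1 skid.
+	 */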
+ do {
+ struct insn insn;
+
+ old_to = to;
+
+#ifdef CONFIG_X86_64
+ is_64bit = kernel_ip(to) || any_64bit_mode(regs);
+#endif
+ insn_init(&insn, kaddr, size, is_64bit);
+
+ /*
+ * Make sure there was not a problem decoding the instruction.
+ * This is doubly important because we have an infinite loop if
+ * insn.length=0.
+ */
+ if (insn_get_length(&insn))
+ break;
+
+ to += insn.length;
+ kaddr += insn.length;
+ size -= insn.length;
+ } while (to < ip);
+
+ if (to == ip) {
+ set_linear_ip(regs, old_to);
+ return 1;
+ }
+
+ /*
+ * Even though we decoded the basic block, the instruction stream
+ * never matched the given IP, either the TO or the IP got corrupted.
+ */
+ return 0;
+}
+
+static inline u64 intel_get_tsx_weight(u64 tsx_tuning)
+{
+ if (tsx_tuning) {
+ union hsw_tsx_tuning tsx = { .value = tsx_tuning };
+ return tsx.cycles_last_block;
+ }
+ return 0;
+}
+
+static inline u64 intel_get_tsx_transaction(u64 tsx_tuning, u64 ax)
+{
+ u64 txn = (tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32;
+
+ /* For RTM XABORTs also log the abort code from AX */
+ if ((txn & PERF_TXN_TRANSACTION) && (ax & 1))
+ txn |= ((ax >> 24) & 0xff) << PERF_TXN_ABORT_SHIFT;
+ return txn;
+}
+
+static inline u64 get_pebs_status(void *n)
+{
+ if (x86_pmu.intel_cap.pebs_format < 4)
+ return ((struct pebs_record_nhm *)n)->status;
+ return ((struct pebs_basic *)n)->applicable_counters;
+}
+
+#define PERF_X86_EVENT_PEBS_HSW_PREC \
+ (PERF_X86_EVENT_PEBS_ST_HSW | \
+ PERF_X86_EVENT_PEBS_LD_HSW | \
+ PERF_X86_EVENT_PEBS_NA_HSW)
+
+static u64 get_data_src(struct perf_event *event, u64 aux)
+{
+ u64 val = PERF_MEM_NA;
+ int fl = event->hw.flags;
+ bool fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC);
+
+ if (fl & PERF_X86_EVENT_PEBS_LDLAT)
+ val = load_latency_data(event, aux);
+ else if (fl & PERF_X86_EVENT_PEBS_STLAT)
+ val = store_latency_data(event, aux);
+ else if (fl & PERF_X86_EVENT_PEBS_LAT_HYBRID)
+ val = x86_pmu.pebs_latency_data(event, aux);
+ else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC))
+ val = precise_datala_hsw(event, aux);
+ else if (fst)
+ val = precise_store_data(aux);
+ return val;
+}
+
+static void setup_pebs_time(struct perf_event *event,
+ struct perf_sample_data *data,
+ u64 tsc)
+{
+ /* Converting to a user-defined clock is not supported yet. */
+ if (event->attr.use_clockid != 0)
+ return;
+
+ /*
+ * Doesn't support the conversion when the TSC is unstable.
+ * The TSC unstable case is a corner case and very unlikely to
+ * happen. If it happens, the TSC in a PEBS record will be
+ * dropped and fall back to perf_event_clock().
+ */
+ if (!using_native_sched_clock() || !sched_clock_stable())
+ return;
+
+ data->time = native_sched_clock_from_tsc(tsc) + __sched_clock_offset;
+ data->sample_flags |= PERF_SAMPLE_TIME;
+}
+
+#define PERF_SAMPLE_ADDR_TYPE (PERF_SAMPLE_ADDR | \
+ PERF_SAMPLE_PHYS_ADDR | \
+ PERF_SAMPLE_DATA_PAGE_SIZE)
+
+static void setup_pebs_fixed_sample_data(struct perf_event *event,
+ struct pt_regs *iregs, void *__pebs,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ /*
+ * We cast to the biggest pebs_record but are careful not to
+ * unconditionally access the 'extra' entries.
+ */
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct pebs_record_skl *pebs = __pebs;
+ u64 sample_type;
+ int fll;
+
+ if (pebs == NULL)
+ return;
+
+ sample_type = event->attr.sample_type;
+ fll = event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT;
+
+ perf_sample_data_init(data, 0, event->hw.last_period);
+
+ /*
+ * Use latency for weight (only avail with PEBS-LL)
+ */
+ if (fll && (sample_type & PERF_SAMPLE_WEIGHT_TYPE)) {
+ data->weight.full = pebs->lat;
+ data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE;
+ }
+
+ /*
+ * data.data_src encodes the data source
+ */
+ if (sample_type & PERF_SAMPLE_DATA_SRC) {
+ data->data_src.val = get_data_src(event, pebs->dse);
+ data->sample_flags |= PERF_SAMPLE_DATA_SRC;
+ }
+
+ /*
+ * We must however always use iregs for the unwinder to stay sane; the
+ * record BP,SP,IP can point into thin air when the record is from a
+ * previous PMI context or an (I)RET happened between the record and
+ * PMI.
+ */
+ perf_sample_save_callchain(data, event, iregs);
+
+ /*
+ * We use the interrupt regs as a base because the PEBS record does not
+	 * contain a full regs set; specifically, it seems to lack segment
+ * descriptors, which get used by things like user_mode().
+ *
+ * In the simple case fix up only the IP for PERF_SAMPLE_IP.
+ */
+ *regs = *iregs;
+
+ /*
+	/*
+	 * Initialize regs->flags from PEBS, and clear the exact bit
+	 * (which uses x86 EFLAGS reserved bit 3), i.e. do not rely on
+	 * it being zero:
+	 */
+ regs->flags = pebs->flags & ~PERF_EFLAGS_EXACT;
+
+ if (sample_type & PERF_SAMPLE_REGS_INTR) {
+ regs->ax = pebs->ax;
+ regs->bx = pebs->bx;
+ regs->cx = pebs->cx;
+ regs->dx = pebs->dx;
+ regs->si = pebs->si;
+ regs->di = pebs->di;
+
+ regs->bp = pebs->bp;
+ regs->sp = pebs->sp;
+
+#ifndef CONFIG_X86_32
+ regs->r8 = pebs->r8;
+ regs->r9 = pebs->r9;
+ regs->r10 = pebs->r10;
+ regs->r11 = pebs->r11;
+ regs->r12 = pebs->r12;
+ regs->r13 = pebs->r13;
+ regs->r14 = pebs->r14;
+ regs->r15 = pebs->r15;
+#endif
+ }
+
+ if (event->attr.precise_ip > 1) {
+ /*
+ * Haswell and later processors have an 'eventing IP'
+ * (real IP) which fixes the off-by-1 skid in hardware.
+		 * Use it when precise_ip >= 2:
+ */
+ if (x86_pmu.intel_cap.pebs_format >= 2) {
+ set_linear_ip(regs, pebs->real_ip);
+ regs->flags |= PERF_EFLAGS_EXACT;
+ } else {
+ /* Otherwise, use PEBS off-by-1 IP: */
+ set_linear_ip(regs, pebs->ip);
+
+ /*
+ * With precise_ip >= 2, try to fix up the off-by-1 IP
+ * using the LBR. If successful, the fixup function
+ * corrects regs->ip and calls set_linear_ip() on regs:
+ */
+ if (intel_pmu_pebs_fixup_ip(regs))
+ regs->flags |= PERF_EFLAGS_EXACT;
+ }
+ } else {
+ /*
+ * When precise_ip == 1, return the PEBS off-by-1 IP,
+ * no fixup attempted:
+ */
+ set_linear_ip(regs, pebs->ip);
+ }
+
+ if ((sample_type & PERF_SAMPLE_ADDR_TYPE) &&
+ x86_pmu.intel_cap.pebs_format >= 1) {
+ data->addr = pebs->dla;
+ data->sample_flags |= PERF_SAMPLE_ADDR;
+ }
+
+ if (x86_pmu.intel_cap.pebs_format >= 2) {
+ /* Only set the TSX weight when no memory weight. */
+ if ((sample_type & PERF_SAMPLE_WEIGHT_TYPE) && !fll) {
+ data->weight.full = intel_get_tsx_weight(pebs->tsx_tuning);
+ data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE;
+ }
+ if (sample_type & PERF_SAMPLE_TRANSACTION) {
+ data->txn = intel_get_tsx_transaction(pebs->tsx_tuning,
+ pebs->ax);
+ data->sample_flags |= PERF_SAMPLE_TRANSACTION;
+ }
+ }
+
+ /*
+ * v3 supplies an accurate time stamp, so we use that
+ * for the time stamp.
+ *
+ * We can only do this for the default trace clock.
+ */
+ if (x86_pmu.intel_cap.pebs_format >= 3)
+ setup_pebs_time(event, data, pebs->tsc);
+
+ perf_sample_save_brstack(data, event, &cpuc->lbr_stack, NULL);
+}
+
+static void adaptive_pebs_save_regs(struct pt_regs *regs,
+ struct pebs_gprs *gprs)
+{
+ regs->ax = gprs->ax;
+ regs->bx = gprs->bx;
+ regs->cx = gprs->cx;
+ regs->dx = gprs->dx;
+ regs->si = gprs->si;
+ regs->di = gprs->di;
+ regs->bp = gprs->bp;
+ regs->sp = gprs->sp;
+#ifndef CONFIG_X86_32
+ regs->r8 = gprs->r8;
+ regs->r9 = gprs->r9;
+ regs->r10 = gprs->r10;
+ regs->r11 = gprs->r11;
+ regs->r12 = gprs->r12;
+ regs->r13 = gprs->r13;
+ regs->r14 = gprs->r14;
+ regs->r15 = gprs->r15;
+#endif
+}
+
+static void intel_perf_event_update_pmc(struct perf_event *event, u64 pmc)
+{
+ int shift = 64 - x86_pmu.cntval_bits;
+ struct hw_perf_event *hwc;
+ u64 delta, prev_pmc;
+
+ /*
+ * A recorded counter may not have an assigned event in the
+ * following cases. The value should be dropped.
+ * - An event is deleted. There is still an active PEBS event.
+ * The PEBS record doesn't shrink on pmu::del().
+ * If the counter of the deleted event once occurred in a PEBS
+ * record, PEBS still records the counter until the counter is
+ * reassigned.
+ * - An event is stopped for some reason, e.g., throttled.
+ * During this period, another event is added and takes the
+ * counter of the stopped event. The stopped event is assigned
+ * to another new and uninitialized counter, since the
+ * x86_pmu_start(RELOAD) is not invoked for a stopped event.
+	 * The PEBS_DATA_CFG is updated regardless of the event state.
+ * The uninitialized counter can be recorded in a PEBS record.
+ * But the cpuc->events[uninitialized_counter] is always NULL,
+ * because the event is stopped. The uninitialized value is
+ * safely dropped.
+ */
+ if (!event)
+ return;
+
+ hwc = &event->hw;
+ prev_pmc = local64_read(&hwc->prev_count);
+
+ /* Only update the count when the PMU is disabled */
+ WARN_ON(this_cpu_read(cpu_hw_events.enabled));
+ local64_set(&hwc->prev_count, pmc);
+
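+	/*
+	 * Compute (pmc - prev_pmc) modulo the counter width; the shift
+	 * pair discards any bits above cntval_bits.
+	 */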
+ delta = (pmc << shift) - (prev_pmc << shift);
+ delta >>= shift;
+
+ local64_add(delta, &event->count);
+ local64_sub(delta, &hwc->period_left);
+}
+
+static inline void __setup_pebs_counter_group(struct cpu_hw_events *cpuc,
+ struct perf_event *event,
+ struct pebs_cntr_header *cntr,
+ void *next_record)
+{
+ int bit;
+
+ for_each_set_bit(bit, (unsigned long *)&cntr->cntr, INTEL_PMC_MAX_GENERIC) {
+ intel_perf_event_update_pmc(cpuc->events[bit], *(u64 *)next_record);
+ next_record += sizeof(u64);
+ }
+
+ for_each_set_bit(bit, (unsigned long *)&cntr->fixed, INTEL_PMC_MAX_FIXED) {
+ /* The slots event will be handled with perf_metric later */
+ if ((cntr->metrics == INTEL_CNTR_METRICS) &&
+ (bit + INTEL_PMC_IDX_FIXED == INTEL_PMC_IDX_FIXED_SLOTS)) {
+ next_record += sizeof(u64);
+ continue;
+ }
+ intel_perf_event_update_pmc(cpuc->events[bit + INTEL_PMC_IDX_FIXED],
+ *(u64 *)next_record);
+ next_record += sizeof(u64);
+ }
+
+ /* HW will reload the value right after the overflow. */
+ if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)
+ local64_set(&event->hw.prev_count, (u64)-event->hw.sample_period);
+
+ if (cntr->metrics == INTEL_CNTR_METRICS) {
+ static_call(intel_pmu_update_topdown_event)
+ (cpuc->events[INTEL_PMC_IDX_FIXED_SLOTS],
+ (u64 *)next_record);
+ next_record += 2 * sizeof(u64);
+ }
+}
+
+#define PEBS_LATENCY_MASK 0xffff
+
+static inline void __setup_perf_sample_data(struct perf_event *event,
+ struct pt_regs *iregs,
+ struct perf_sample_data *data)
+{
+ perf_sample_data_init(data, 0, event->hw.last_period);
+
+ /*
+ * We must however always use iregs for the unwinder to stay sane; the
+ * record BP,SP,IP can point into thin air when the record is from a
+ * previous PMI context or an (I)RET happened between the record and
+ * PMI.
+ */
+ perf_sample_save_callchain(data, event, iregs);
+}
+
+static inline void __setup_pebs_basic_group(struct perf_event *event,
+ struct pt_regs *regs,
+ struct perf_sample_data *data,
+ u64 sample_type, u64 ip,
+ u64 tsc, u16 retire)
+{
+ /* The ip in basic is EventingIP */
+ set_linear_ip(regs, ip);
+ regs->flags = PERF_EFLAGS_EXACT;
+ setup_pebs_time(event, data, tsc);
+
+ if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT)
+ data->weight.var3_w = retire;
+}
+
+static inline void __setup_pebs_gpr_group(struct perf_event *event,
+ struct pt_regs *regs,
+ struct pebs_gprs *gprs,
+ u64 sample_type)
+{
+ if (event->attr.precise_ip < 2) {
+ set_linear_ip(regs, gprs->ip);
+ regs->flags &= ~PERF_EFLAGS_EXACT;
+ }
+
+ if (sample_type & (PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER))
+ adaptive_pebs_save_regs(regs, gprs);
+}
+
+static inline void __setup_pebs_meminfo_group(struct perf_event *event,
+ struct perf_sample_data *data,
+ u64 sample_type, u64 latency,
+ u16 instr_latency, u64 address,
+ u64 aux, u64 tsx_tuning, u64 ax)
+{
+ if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) {
+ u64 tsx_latency = intel_get_tsx_weight(tsx_tuning);
+
+ data->weight.var2_w = instr_latency;
+
+ /*
+ * Although meminfo::latency is defined as a u64,
+ * only the lower 32 bits include the valid data
+ * in practice on Ice Lake and earlier platforms.
+ */
+ if (sample_type & PERF_SAMPLE_WEIGHT)
+ data->weight.full = latency ?: tsx_latency;
+ else
+ data->weight.var1_dw = (u32)latency ?: tsx_latency;
+
+ data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE;
+ }
+
+ if (sample_type & PERF_SAMPLE_DATA_SRC) {
+ data->data_src.val = get_data_src(event, aux);
+ data->sample_flags |= PERF_SAMPLE_DATA_SRC;
+ }
+
+ if (sample_type & PERF_SAMPLE_ADDR_TYPE) {
+ data->addr = address;
+ data->sample_flags |= PERF_SAMPLE_ADDR;
+ }
+
+ if (sample_type & PERF_SAMPLE_TRANSACTION) {
+ data->txn = intel_get_tsx_transaction(tsx_tuning, ax);
+ data->sample_flags |= PERF_SAMPLE_TRANSACTION;
+ }
+}
+
+/*
+ * With adaptive PEBS the layout depends on what fields are configured.
+ */
+static void setup_pebs_adaptive_sample_data(struct perf_event *event,
+ struct pt_regs *iregs, void *__pebs,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ u64 sample_type = event->attr.sample_type;
+ struct pebs_basic *basic = __pebs;
+ void *next_record = basic + 1;
+ struct pebs_meminfo *meminfo = NULL;
+ struct pebs_gprs *gprs = NULL;
+ struct x86_perf_regs *perf_regs;
+ u64 format_group;
+ u16 retire;
+
+ if (basic == NULL)
+ return;
+
+ perf_regs = container_of(regs, struct x86_perf_regs, regs);
+ perf_regs->xmm_regs = NULL;
+
+ format_group = basic->format_group;
+
+ __setup_perf_sample_data(event, iregs, data);
+
+ *regs = *iregs;
+
+ /* basic group */
+ retire = x86_pmu.flags & PMU_FL_RETIRE_LATENCY ?
+ basic->retire_latency : 0;
+ __setup_pebs_basic_group(event, regs, data, sample_type,
+ basic->ip, basic->tsc, retire);
+
+	/*
+	 * The MEMINFO record comes before the GP record, but
+	 * PERF_SAMPLE_TRANSACTION needs gprs->ax. Save the pointer here
+	 * and process it later.
+	 */
+ if (format_group & PEBS_DATACFG_MEMINFO) {
+ meminfo = next_record;
+ next_record = meminfo + 1;
+ }
+
+ if (format_group & PEBS_DATACFG_GP) {
+ gprs = next_record;
+ next_record = gprs + 1;
+
+ __setup_pebs_gpr_group(event, regs, gprs, sample_type);
+ }
+
+ if (format_group & PEBS_DATACFG_MEMINFO) {
+ u64 latency = x86_pmu.flags & PMU_FL_INSTR_LATENCY ?
+ meminfo->cache_latency : meminfo->mem_latency;
+ u64 instr_latency = x86_pmu.flags & PMU_FL_INSTR_LATENCY ?
+ meminfo->instr_latency : 0;
+ u64 ax = gprs ? gprs->ax : 0;
+
+ __setup_pebs_meminfo_group(event, data, sample_type, latency,
+ instr_latency, meminfo->address,
+ meminfo->aux, meminfo->tsx_tuning,
+ ax);
+ }
+
+ if (format_group & PEBS_DATACFG_XMMS) {
+ struct pebs_xmm *xmm = next_record;
+
+ next_record = xmm + 1;
+ perf_regs->xmm_regs = xmm->xmm;
+ }
+
+ if (format_group & PEBS_DATACFG_LBRS) {
+ struct lbr_entry *lbr = next_record;
+ int num_lbr = ((format_group >> PEBS_DATACFG_LBR_SHIFT)
+ & 0xff) + 1;
+ next_record = next_record + num_lbr * sizeof(struct lbr_entry);
+
+ if (has_branch_stack(event)) {
+ intel_pmu_store_pebs_lbrs(lbr);
+ intel_pmu_lbr_save_brstack(data, cpuc, event);
+ }
+ }
+
+ if (format_group & (PEBS_DATACFG_CNTR | PEBS_DATACFG_METRICS)) {
+ struct pebs_cntr_header *cntr = next_record;
+ unsigned int nr;
+
+ next_record += sizeof(struct pebs_cntr_header);
+ /*
+ * The PEBS_DATA_CFG is a global register, which is the
+ * superset configuration for all PEBS events.
+ * For the PEBS record of non-sample-read group, ignore
+ * the counter snapshot fields.
+ */
+ if (is_pebs_counter_event_group(event)) {
+ __setup_pebs_counter_group(cpuc, event, cntr, next_record);
+ data->sample_flags |= PERF_SAMPLE_READ;
+ }
+
+ nr = hweight32(cntr->cntr) + hweight32(cntr->fixed);
+ if (cntr->metrics == INTEL_CNTR_METRICS)
+ nr += 2;
+ next_record += nr * sizeof(u64);
+ }
+
+ WARN_ONCE(next_record != __pebs + basic->format_size,
+ "PEBS record size %u, expected %llu, config %llx\n",
+ basic->format_size,
+ (u64)(next_record - __pebs),
+ format_group);
+}
+
+static inline bool arch_pebs_record_continued(struct arch_pebs_header *header)
+{
+	/*
+	 * A set continue bit or a null PEBS record means another fragment
+	 * follows.
+	 */
+ return header->cont || !(header->format & GENMASK_ULL(63, 16));
+}
+
+static void setup_arch_pebs_sample_data(struct perf_event *event,
+ struct pt_regs *iregs,
+ void *__pebs,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ u64 sample_type = event->attr.sample_type;
+ struct arch_pebs_header *header = NULL;
+ struct arch_pebs_aux *meminfo = NULL;
+ struct arch_pebs_gprs *gprs = NULL;
+ struct x86_perf_regs *perf_regs;
+ void *next_record;
+ void *at = __pebs;
+
+ if (at == NULL)
+ return;
+
+ perf_regs = container_of(regs, struct x86_perf_regs, regs);
+ perf_regs->xmm_regs = NULL;
+
+ __setup_perf_sample_data(event, iregs, data);
+
+ *regs = *iregs;
+
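+	/* An arch-PEBS record may span multiple fragments; parse each one. */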
+again:
+ header = at;
+ next_record = at + sizeof(struct arch_pebs_header);
+ if (header->basic) {
+ struct arch_pebs_basic *basic = next_record;
+ u16 retire = 0;
+
+ next_record = basic + 1;
+
+ if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT)
+ retire = basic->valid ? basic->retire : 0;
+ __setup_pebs_basic_group(event, regs, data, sample_type,
+ basic->ip, basic->tsc, retire);
+ }
+
+	/*
+	 * The MEMINFO record comes before the GP record, but
+	 * PERF_SAMPLE_TRANSACTION needs gprs->ax. Save the pointer here
+	 * and process it later.
+	 */
+ if (header->aux) {
+ meminfo = next_record;
+ next_record = meminfo + 1;
+ }
+
+ if (header->gpr) {
+ gprs = next_record;
+ next_record = gprs + 1;
+
+ __setup_pebs_gpr_group(event, regs,
+ (struct pebs_gprs *)gprs,
+ sample_type);
+ }
+
+ if (header->aux) {
+ u64 ax = gprs ? gprs->ax : 0;
+
+ __setup_pebs_meminfo_group(event, data, sample_type,
+ meminfo->cache_latency,
+ meminfo->instr_latency,
+ meminfo->address, meminfo->aux,
+ meminfo->tsx_tuning, ax);
+ }
+
+ if (header->xmm) {
+ struct pebs_xmm *xmm;
+
+ next_record += sizeof(struct arch_pebs_xer_header);
+
+ xmm = next_record;
+ perf_regs->xmm_regs = xmm->xmm;
+ next_record = xmm + 1;
+ }
+
+ if (header->lbr) {
+ struct arch_pebs_lbr_header *lbr_header = next_record;
+ struct lbr_entry *lbr;
+ int num_lbr;
+
+ next_record = lbr_header + 1;
+ lbr = next_record;
+
+ num_lbr = header->lbr == ARCH_PEBS_LBR_NUM_VAR ?
+ lbr_header->depth :
+ header->lbr * ARCH_PEBS_BASE_LBR_ENTRIES;
+ next_record += num_lbr * sizeof(struct lbr_entry);
+
+ if (has_branch_stack(event)) {
+ intel_pmu_store_pebs_lbrs(lbr);
+ intel_pmu_lbr_save_brstack(data, cpuc, event);
+ }
+ }
+
+ if (header->cntr) {
+ struct arch_pebs_cntr_header *cntr = next_record;
+ unsigned int nr;
+
+ next_record += sizeof(struct arch_pebs_cntr_header);
+
+ if (is_pebs_counter_event_group(event)) {
+ __setup_pebs_counter_group(cpuc, event,
+ (struct pebs_cntr_header *)cntr, next_record);
+ data->sample_flags |= PERF_SAMPLE_READ;
+ }
+
+ nr = hweight32(cntr->cntr) + hweight32(cntr->fixed);
+ if (cntr->metrics == INTEL_CNTR_METRICS)
+ nr += 2;
+ next_record += nr * sizeof(u64);
+ }
+
+	/* Parse the following fragments, if any. */
+ if (arch_pebs_record_continued(header)) {
+ at = at + header->size;
+ goto again;
+ }
+}
+
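+/*
+ * Scan the PEBS buffer for the first record whose status attributes it
+ * to counter @bit; on pre-v3 PEBS the status bits are not accurate, so
+ * records that cannot be attributed unambiguously are skipped.
+ */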
+static inline void *
+get_next_pebs_record_by_bit(void *base, void *top, int bit)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ void *at;
+ u64 pebs_status;
+
+	/*
+	 * fmt0 does not have a status bitfield (does not use
+	 * the pebs_record_nhm format)
+	 */
+ if (x86_pmu.intel_cap.pebs_format < 1)
+ return base;
+
+ if (base == NULL)
+ return NULL;
+
+ for (at = base; at < top; at += cpuc->pebs_record_size) {
+ unsigned long status = get_pebs_status(at);
+
+ if (test_bit(bit, (unsigned long *)&status)) {
+ /* PEBS v3 has accurate status bits */
+ if (x86_pmu.intel_cap.pebs_format >= 3)
+ return at;
+
+ if (status == (1 << bit))
+ return at;
+
+ /* clear non-PEBS bit and re-check */
+ pebs_status = status & cpuc->pebs_enabled;
+ pebs_status &= PEBS_COUNTER_MASK;
+ if (pebs_status == (1 << bit))
+ return at;
+ }
+ }
+ return NULL;
+}
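+
+/*
+ * Editor's sketch (not part of this patch): the pre-v3 attribution rule
+ * above, reduced to a predicate. A record is credited to @bit only when,
+ * after masking with the enabled PEBS counters, exactly that one bit
+ * survives; demo_enabled is a made-up stand-in for cpuc->pebs_enabled.
+ */
+#if 0
+static int demo_record_is_for_bit(unsigned long status,
+				  unsigned long demo_enabled, int bit)
+{
+	return (status & demo_enabled) == (1UL << bit);
+}
+#endif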
+
+/*
+ * Special variant of intel_pmu_save_and_restart() for auto-reload.
+ */
+static int
+intel_pmu_save_and_restart_reload(struct perf_event *event, int count)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ int shift = 64 - x86_pmu.cntval_bits;
+ u64 period = hwc->sample_period;
+ u64 prev_raw_count, new_raw_count;
+ s64 new, old;
+
+ WARN_ON(!period);
+
+ /*
+ * drain_pebs() only happens when the PMU is disabled.
+ */
+ WARN_ON(this_cpu_read(cpu_hw_events.enabled));
+
+ prev_raw_count = local64_read(&hwc->prev_count);
+ new_raw_count = rdpmc(hwc->event_base_rdpmc);
+ local64_set(&hwc->prev_count, new_raw_count);
+
+ /*
+ * Since the counter increments a negative counter value and
+ * overflows on the sign switch, giving the interval:
+ *
+ * [-period, 0]
+ *
+ * the difference between two consecutive reads is:
+ *
+ * A) value2 - value1;
+ * when no overflows have happened in between,
+ *
+ * B) (0 - value1) + (value2 - (-period));
+ * when one overflow happened in between,
+ *
+ * C) (0 - value1) + (n - 1) * (period) + (value2 - (-period));
+ * when @n overflows happened in between.
+ *
+ * Here A) is the obvious difference, B) is the extension to the
+ * discrete interval, where the first term is to the top of the
+ * interval and the second term is from the bottom of the next
+ * interval and C) the extension to multiple intervals, where the
+ * middle term is the whole intervals covered.
+ *
+ * An equivalent of C, by reduction, is:
+ *
+ * value2 - value1 + n * period
+ */
+ new = ((s64)(new_raw_count << shift) >> shift);
+ old = ((s64)(prev_raw_count << shift) >> shift);
+ local64_add(new - old + count * period, &event->count);
+
+ local64_set(&hwc->period_left, -new);
+
+ perf_event_update_userpage(event);
+
+ return 0;
+}
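+
+/*
+ * Editor's worked example (not part of this patch): with period = 100,
+ * value1 = -30 and value2 = -90 after n = 2 overflows, case C gives
+ * (0 - (-30)) + (2 - 1) * 100 + ((-90) - (-100)) = 30 + 100 + 10 = 140,
+ * which matches the reduced form value2 - value1 + n * period =
+ * -90 + 30 + 200 = 140.
+ */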
+
+typedef void (*setup_fn)(struct perf_event *, struct pt_regs *, void *,
+ struct perf_sample_data *, struct pt_regs *);
+
+static struct pt_regs dummy_iregs;
+
+static __always_inline void
+__intel_pmu_pebs_event(struct perf_event *event,
+ struct pt_regs *iregs,
+ struct pt_regs *regs,
+ struct perf_sample_data *data,
+ void *at,
+ setup_fn setup_sample)
+{
+ setup_sample(event, iregs, at, data, regs);
+ perf_event_output(event, data, regs);
+}
+
+static __always_inline void
+__intel_pmu_pebs_last_event(struct perf_event *event,
+ struct pt_regs *iregs,
+ struct pt_regs *regs,
+ struct perf_sample_data *data,
+ void *at,
+ int count,
+ setup_fn setup_sample)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ setup_sample(event, iregs, at, data, regs);
+ if (iregs == &dummy_iregs) {
+		/*
+		 * The PEBS records may be drained in a non-overflow context,
+		 * e.g., large PEBS + context switch. Perf should treat the
+		 * last record the same as the other PEBS records and not
+		 * invoke the generic overflow handler.
+		 */
+ perf_event_output(event, data, regs);
+ } else {
+		/*
+		 * All but the last record have already been processed.
+		 * The last one is left so the overflow handler can be
+		 * invoked.
+		 */
+ perf_event_overflow(event, data, regs);
+ }
+
+ if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
+ if ((is_pebs_counter_event_group(event))) {
+			/*
+			 * The value of each sample has already been updated
+			 * while setting up the corresponding sample data.
+			 */
+ perf_event_update_userpage(event);
+ } else {
+			/*
+			 * For now, auto-reload is only enabled in fixed period
+			 * mode. The reload value is always hwc->sample_period.
+			 * This may need to change if auto-reload is ever
+			 * enabled in freq mode.
+			 */
+ intel_pmu_save_and_restart_reload(event, count);
+ }
+ } else {
+		/*
+		 * For a non-precise event, counters-snapshotting may record a
+		 * positive value for the overflowed event. The HW auto-reload
+		 * mechanism then resets the counter to 0 immediately, because
+		 * pebs_event_reset is cleared when PERF_X86_EVENT_AUTO_RELOAD
+		 * is not set. A PMI handler may therefore observe the counter
+		 * going backwards.
+		 *
+		 * Since the event value has already been updated while
+		 * processing the counters-snapshotting record, only a new
+		 * period needs to be set for the counter.
+		 */
+ if (is_pebs_counter_event_group(event))
+ static_call(x86_pmu_set_period)(event);
+ else
+ intel_pmu_save_and_restart(event);
+ }
+}
+
+static __always_inline void
+__intel_pmu_pebs_events(struct perf_event *event,
+ struct pt_regs *iregs,
+ struct perf_sample_data *data,
+ void *base, void *top,
+ int bit, int count,
+ setup_fn setup_sample)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct x86_perf_regs perf_regs;
+ struct pt_regs *regs = &perf_regs.regs;
+ void *at = get_next_pebs_record_by_bit(base, top, bit);
+ int cnt = count;
+
+ if (!iregs)
+ iregs = &dummy_iregs;
+
+ while (cnt > 1) {
+ __intel_pmu_pebs_event(event, iregs, regs, data, at, setup_sample);
+ at += cpuc->pebs_record_size;
+ at = get_next_pebs_record_by_bit(at, top, bit);
+ cnt--;
+ }
+
+ __intel_pmu_pebs_last_event(event, iregs, regs, data, at, count, setup_sample);
+}
+
+static void intel_pmu_drain_pebs_core(struct pt_regs *iregs, struct perf_sample_data *data)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct debug_store *ds = cpuc->ds;
+ struct perf_event *event = cpuc->events[0]; /* PMC0 only */
+ struct pebs_record_core *at, *top;
+ int n;
+
+ if (!x86_pmu.pebs_active)
+ return;
+
+ at = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base;
+ top = (struct pebs_record_core *)(unsigned long)ds->pebs_index;
+
+ /*
+ * Whatever else happens, drain the thing
+ */
+ ds->pebs_index = ds->pebs_buffer_base;
+
+ if (!test_bit(0, cpuc->active_mask))
+ return;
+
+ WARN_ON_ONCE(!event);
+
+ if (!event->attr.precise_ip)
+ return;
+
+ n = top - at;
+ if (n <= 0) {
+ if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)
+ intel_pmu_save_and_restart_reload(event, 0);
+ return;
+ }
+
+ __intel_pmu_pebs_events(event, iregs, data, at, top, 0, n,
+ setup_pebs_fixed_sample_data);
+}
+
+static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, u64 mask)
+{
+ u64 pebs_enabled = cpuc->pebs_enabled & mask;
+ struct perf_event *event;
+ int bit;
+
+	/*
+	 * drain_pebs() can be called twice in a short period for an
+	 * auto-reload event in pmu::read(), with no overflows having
+	 * happened in between. intel_pmu_save_and_restart_reload()
+	 * must still be called to update event->count in this case.
+	 */
+ for_each_set_bit(bit, (unsigned long *)&pebs_enabled, X86_PMC_IDX_MAX) {
+ event = cpuc->events[bit];
+ if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)
+ intel_pmu_save_and_restart_reload(event, 0);
+ }
+}
+
+static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_data *data)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct debug_store *ds = cpuc->ds;
+ struct perf_event *event;
+ void *base, *at, *top;
+ short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
+ short error[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
+ int max_pebs_events = intel_pmu_max_num_pebs(NULL);
+ int bit, i, size;
+ u64 mask;
+
+ if (!x86_pmu.pebs_active)
+ return;
+
+ base = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
+ top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
+
+ ds->pebs_index = ds->pebs_buffer_base;
+
+ mask = x86_pmu.pebs_events_mask;
+ size = max_pebs_events;
+ if (x86_pmu.flags & PMU_FL_PEBS_ALL) {
+ mask |= x86_pmu.fixed_cntr_mask64 << INTEL_PMC_IDX_FIXED;
+ size = INTEL_PMC_IDX_FIXED + x86_pmu_max_num_counters_fixed(NULL);
+ }
+
+ if (unlikely(base >= top)) {
+ intel_pmu_pebs_event_update_no_drain(cpuc, mask);
+ return;
+ }
+
+ for (at = base; at < top; at += x86_pmu.pebs_record_size) {
+ struct pebs_record_nhm *p = at;
+ u64 pebs_status;
+
+ pebs_status = p->status & cpuc->pebs_enabled;
+ pebs_status &= mask;
+
+ /* PEBS v3 has more accurate status bits */
+ if (x86_pmu.intel_cap.pebs_format >= 3) {
+ for_each_set_bit(bit, (unsigned long *)&pebs_status, size)
+ counts[bit]++;
+
+ continue;
+ }
+
+ /*
+ * On some CPUs the PEBS status can be zero when PEBS is
+ * racing with clearing of GLOBAL_STATUS.
+ *
+ * Normally we would drop that record, but in the
+ * case when there is only a single active PEBS event
+ * we can assume it's for that event.
+ */
+ if (!pebs_status && cpuc->pebs_enabled &&
+ !(cpuc->pebs_enabled & (cpuc->pebs_enabled-1)))
+ pebs_status = p->status = cpuc->pebs_enabled;
+
+ bit = find_first_bit((unsigned long *)&pebs_status,
+ max_pebs_events);
+
+ if (!(x86_pmu.pebs_events_mask & (1 << bit)))
+ continue;
+
+		/*
+		 * The PEBS hardware does not deal well with events that
+		 * happen close to each other and set multiple status bits.
+		 * Fortunately, this should happen rarely.
+		 *
+		 * If these events include one PEBS and multiple non-PEBS
+		 * events, the PEBS record is not affected and is handled
+		 * normally. (slow path)
+		 *
+		 * If these events include two or more PEBS events, their
+		 * records can be collapsed into a single one, and it is
+		 * impossible to reconstruct all the events that caused
+		 * the PEBS record. This is called a collision; when one
+		 * happens, the record is dropped.
+		 */
+ if (pebs_status != (1ULL << bit)) {
+ for_each_set_bit(i, (unsigned long *)&pebs_status, size)
+ error[i]++;
+ continue;
+ }
+
+ counts[bit]++;
+ }
+
+ for_each_set_bit(bit, (unsigned long *)&mask, size) {
+ if ((counts[bit] == 0) && (error[bit] == 0))
+ continue;
+
+ event = cpuc->events[bit];
+ if (WARN_ON_ONCE(!event))
+ continue;
+
+ if (WARN_ON_ONCE(!event->attr.precise_ip))
+ continue;
+
+		/* log the number of dropped samples */
+ if (error[bit]) {
+ perf_log_lost_samples(event, error[bit]);
+
+ if (iregs)
+ perf_event_account_interrupt(event);
+ }
+
+ if (counts[bit]) {
+ __intel_pmu_pebs_events(event, iregs, data, base,
+ top, bit, counts[bit],
+ setup_pebs_fixed_sample_data);
+ }
+ }
+}
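+
+/*
+ * Editor's note (not part of this patch): the single-active-event fixup
+ * above relies on the classic power-of-two test; x & (x - 1) clears the
+ * lowest set bit, so it is zero exactly when at most one bit is set in x.
+ */
+#if 0
+static inline int demo_exactly_one_bit(unsigned long long x)
+{
+	return x && !(x & (x - 1));
+}
+#endif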
+
+static __always_inline void
+__intel_pmu_handle_pebs_record(struct pt_regs *iregs,
+ struct pt_regs *regs,
+ struct perf_sample_data *data,
+ void *at, u64 pebs_status,
+ short *counts, void **last,
+ setup_fn setup_sample)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct perf_event *event;
+ int bit;
+
+ for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX) {
+ event = cpuc->events[bit];
+
+ if (WARN_ON_ONCE(!event) ||
+ WARN_ON_ONCE(!event->attr.precise_ip))
+ continue;
+
+ if (counts[bit]++) {
+ __intel_pmu_pebs_event(event, iregs, regs, data,
+ last[bit], setup_sample);
+ }
+
+ last[bit] = at;
+ }
+}
+
+static __always_inline void
+__intel_pmu_handle_last_pebs_record(struct pt_regs *iregs,
+ struct pt_regs *regs,
+ struct perf_sample_data *data,
+ u64 mask, short *counts, void **last,
+ setup_fn setup_sample)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct perf_event *event;
+ int bit;
+
+ for_each_set_bit(bit, (unsigned long *)&mask, X86_PMC_IDX_MAX) {
+ if (!counts[bit])
+ continue;
+
+ event = cpuc->events[bit];
+
+ __intel_pmu_pebs_last_event(event, iregs, regs, data, last[bit],
+ counts[bit], setup_sample);
+ }
+
+}
+
+static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_data *data)
+{
+ short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
+ void *last[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS];
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct debug_store *ds = cpuc->ds;
+ struct x86_perf_regs perf_regs;
+ struct pt_regs *regs = &perf_regs.regs;
+ struct pebs_basic *basic;
+ void *base, *at, *top;
+ u64 mask;
+
+ if (!x86_pmu.pebs_active)
+ return;
+
+ base = (struct pebs_basic *)(unsigned long)ds->pebs_buffer_base;
+ top = (struct pebs_basic *)(unsigned long)ds->pebs_index;
+
+ ds->pebs_index = ds->pebs_buffer_base;
+
+ mask = hybrid(cpuc->pmu, pebs_events_mask) |
+ (hybrid(cpuc->pmu, fixed_cntr_mask64) << INTEL_PMC_IDX_FIXED);
+ mask &= cpuc->pebs_enabled;
+
+ if (unlikely(base >= top)) {
+ intel_pmu_pebs_event_update_no_drain(cpuc, mask);
+ return;
+ }
+
+ if (!iregs)
+ iregs = &dummy_iregs;
+
+ /* Process all but the last event for each counter. */
+ for (at = base; at < top; at += basic->format_size) {
+ u64 pebs_status;
+
+ basic = at;
+ if (basic->format_size != cpuc->pebs_record_size)
+ continue;
+
+ pebs_status = mask & basic->applicable_counters;
+ __intel_pmu_handle_pebs_record(iregs, regs, data, at,
+ pebs_status, counts, last,
+ setup_pebs_adaptive_sample_data);
+ }
+
+ __intel_pmu_handle_last_pebs_record(iregs, regs, data, mask, counts, last,
+ setup_pebs_adaptive_sample_data);
+}
+
+static void intel_pmu_drain_arch_pebs(struct pt_regs *iregs,
+ struct perf_sample_data *data)
+{
+ short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
+ void *last[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS];
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ union arch_pebs_index index;
+ struct x86_perf_regs perf_regs;
+ struct pt_regs *regs = &perf_regs.regs;
+ void *base, *at, *top;
+ u64 mask;
+
+ rdmsrq(MSR_IA32_PEBS_INDEX, index.whole);
+
+ if (unlikely(!index.wr)) {
+ intel_pmu_pebs_event_update_no_drain(cpuc, X86_PMC_IDX_MAX);
+ return;
+ }
+
+ base = cpuc->pebs_vaddr;
+ top = cpuc->pebs_vaddr + (index.wr << ARCH_PEBS_INDEX_WR_SHIFT);
+
+ index.wr = 0;
+ index.full = 0;
+ index.en = 1;
+ if (cpuc->n_pebs == cpuc->n_large_pebs)
+ index.thresh = ARCH_PEBS_THRESH_MULTI;
+ else
+ index.thresh = ARCH_PEBS_THRESH_SINGLE;
+ wrmsrq(MSR_IA32_PEBS_INDEX, index.whole);
+
+ mask = hybrid(cpuc->pmu, arch_pebs_cap).counters & cpuc->pebs_enabled;
+
+ if (!iregs)
+ iregs = &dummy_iregs;
+
+ /* Process all but the last event for each counter. */
+ for (at = base; at < top;) {
+ struct arch_pebs_header *header;
+ struct arch_pebs_basic *basic;
+ u64 pebs_status;
+
+ header = at;
+
+ if (WARN_ON_ONCE(!header->size))
+ break;
+
+		/* The 1st fragment or a single record must have the basic group */
+ if (!header->basic) {
+ at += header->size;
+ continue;
+ }
+
+ basic = at + sizeof(struct arch_pebs_header);
+ pebs_status = mask & basic->applicable_counters;
+ __intel_pmu_handle_pebs_record(iregs, regs, data, at,
+ pebs_status, counts, last,
+ setup_arch_pebs_sample_data);
+
+ /* Skip non-last fragments */
+ while (arch_pebs_record_continued(header)) {
+ if (!header->size)
+ break;
+ at += header->size;
+ header = at;
+ }
+
+ /* Skip last fragment or the single record */
+ at += header->size;
+ }
+
+ __intel_pmu_handle_last_pebs_record(iregs, regs, data, mask,
+ counts, last,
+ setup_arch_pebs_sample_data);
+}
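+
+/*
+ * Editor's note (not part of this patch): index.wr is the hardware write
+ * pointer in units implied by ARCH_PEBS_INDEX_WR_SHIFT; the shift above
+ * converts it to a byte offset from pebs_vaddr, so [base, top) spans
+ * exactly the records written since the buffer was last rearmed.
+ */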
+
+static void __init intel_arch_pebs_init(void)
+{
+	/*
+	 * On current hybrid platforms, either all core types support
+	 * arch-PEBS or none of them do. So directly set the
+	 * x86_pmu.arch_pebs flag if the boot CPU supports arch-PEBS.
+	 */
+ x86_pmu.arch_pebs = 1;
+ x86_pmu.pebs_buffer_size = PEBS_BUFFER_SIZE;
+ x86_pmu.drain_pebs = intel_pmu_drain_arch_pebs;
+ x86_pmu.pebs_capable = ~0ULL;
+ x86_pmu.flags |= PMU_FL_PEBS_ALL;
+
+ x86_pmu.pebs_enable = __intel_pmu_pebs_enable;
+ x86_pmu.pebs_disable = __intel_pmu_pebs_disable;
+}
+
+/*
+ * PEBS probe and setup
+ */
+
+static void __init intel_ds_pebs_init(void)
+{
+	/*
+	 * No support for 32-bit formats
+	 */
+ if (!boot_cpu_has(X86_FEATURE_DTES64))
+ return;
+
+ x86_pmu.ds_pebs = boot_cpu_has(X86_FEATURE_PEBS);
+ x86_pmu.pebs_buffer_size = PEBS_BUFFER_SIZE;
+ if (x86_pmu.version <= 4)
+ x86_pmu.pebs_no_isolation = 1;
+
+ if (x86_pmu.ds_pebs) {
+ char pebs_type = x86_pmu.intel_cap.pebs_trap ? '+' : '-';
+ char *pebs_qual = "";
+ int format = x86_pmu.intel_cap.pebs_format;
+
+ if (format < 4)
+ x86_pmu.intel_cap.pebs_baseline = 0;
+
+ x86_pmu.pebs_enable = intel_pmu_pebs_enable;
+ x86_pmu.pebs_disable = intel_pmu_pebs_disable;
+ x86_pmu.pebs_enable_all = intel_pmu_pebs_enable_all;
+ x86_pmu.pebs_disable_all = intel_pmu_pebs_disable_all;
+
+ switch (format) {
+ case 0:
+ pr_cont("PEBS fmt0%c, ", pebs_type);
+ x86_pmu.pebs_record_size = sizeof(struct pebs_record_core);
+ /*
+ * Using >PAGE_SIZE buffers makes the WRMSR to
+ * PERF_GLOBAL_CTRL in intel_pmu_enable_all()
+ * mysteriously hang on Core2.
+ *
+ * As a workaround, we don't do this.
+ */
+ x86_pmu.pebs_buffer_size = PAGE_SIZE;
+ x86_pmu.drain_pebs = intel_pmu_drain_pebs_core;
+ break;
+
+ case 1:
+ pr_cont("PEBS fmt1%c, ", pebs_type);
+ x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm);
+ x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
+ break;
+
+ case 2:
+ pr_cont("PEBS fmt2%c, ", pebs_type);
+ x86_pmu.pebs_record_size = sizeof(struct pebs_record_hsw);
+ x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
+ break;
+
+ case 3:
+ pr_cont("PEBS fmt3%c, ", pebs_type);
+ x86_pmu.pebs_record_size =
+ sizeof(struct pebs_record_skl);
+ x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
+ x86_pmu.large_pebs_flags |= PERF_SAMPLE_TIME;
+ break;
+
+ case 6:
+ if (x86_pmu.intel_cap.pebs_baseline)
+ x86_pmu.large_pebs_flags |= PERF_SAMPLE_READ;
+ fallthrough;
+ case 5:
+ x86_pmu.pebs_ept = 1;
+ fallthrough;
+ case 4:
+ x86_pmu.drain_pebs = intel_pmu_drain_pebs_icl;
+ x86_pmu.pebs_record_size = sizeof(struct pebs_basic);
+ if (x86_pmu.intel_cap.pebs_baseline) {
+ x86_pmu.large_pebs_flags |=
+ PERF_SAMPLE_BRANCH_STACK |
+ PERF_SAMPLE_TIME;
+ x86_pmu.flags |= PMU_FL_PEBS_ALL;
+ x86_pmu.pebs_capable = ~0ULL;
+ pebs_qual = "-baseline";
+ x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_EXTENDED_REGS;
+ } else {
+ /* Only basic record supported */
+ x86_pmu.large_pebs_flags &=
+ ~(PERF_SAMPLE_ADDR |
+ PERF_SAMPLE_TIME |
+ PERF_SAMPLE_DATA_SRC |
+ PERF_SAMPLE_TRANSACTION |
+ PERF_SAMPLE_REGS_USER |
+ PERF_SAMPLE_REGS_INTR);
+ }
+ pr_cont("PEBS fmt%d%c%s, ", format, pebs_type, pebs_qual);
+
+		/*
+		 * PEBS-via-PT is not supported on hybrid platforms,
+		 * because not all CPUs of a hybrid machine support it.
+		 * The global x86_pmu.intel_cap, which only contains the
+		 * common capabilities, is used to check the availability
+		 * of the feature. The per-PMU pebs_output_pt_available
+		 * in a hybrid machine should be ignored.
+		 */
+ if (x86_pmu.intel_cap.pebs_output_pt_available) {
+ pr_cont("PEBS-via-PT, ");
+ x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_AUX_OUTPUT;
+ }
+
+ break;
+
+ default:
+ pr_cont("no PEBS fmt%d%c, ", format, pebs_type);
+ x86_pmu.ds_pebs = 0;
+ }
+ }
+}
+
+void __init intel_pebs_init(void)
+{
+ if (x86_pmu.intel_cap.pebs_format == 0xf)
+ intel_arch_pebs_init();
+ else
+ intel_ds_pebs_init();
+}
+
+void perf_restore_debug_store(void)
+{
+ struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds);
+
+ if (!x86_pmu.bts && !x86_pmu.ds_pebs)
+ return;
+
+ wrmsrq(MSR_IA32_DS_AREA, (unsigned long)ds);
+}
diff --git a/arch/x86/events/intel/knc.c b/arch/x86/events/intel/knc.c
new file mode 100644
index 000000000000..e614baf42926
--- /dev/null
+++ b/arch/x86/events/intel/knc.c
@@ -0,0 +1,324 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Driver for Intel Xeon Phi "Knights Corner" PMU */
+
+#include <linux/perf_event.h>
+#include <linux/types.h>
+
+#include <asm/hardirq.h>
+#include <asm/msr.h>
+
+#include "../perf_event.h"
+
+static const u64 knc_perfmon_event_map[] =
+{
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x002a,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x0016,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0028,
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x0029,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0012,
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x002b,
+};
+
+static const u64 __initconst knc_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+		/*
+		 * On Xeon Phi event "0" is a valid DATA_READ
+		 * (L1 Data Cache Reads) instruction. We code this as
+		 * ARCH_PERFMON_EVENTSEL_INT as this bit will always
+		 * be set in x86_pmu_hw_config().
+		 */
+ [ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT,
+ /* DATA_READ */
+ [ C(RESULT_MISS) ] = 0x0003, /* DATA_READ_MISS */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0001, /* DATA_WRITE */
+ [ C(RESULT_MISS) ] = 0x0004, /* DATA_WRITE_MISS */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0011, /* L1_DATA_PF1 */
+ [ C(RESULT_MISS) ] = 0x001c, /* L1_DATA_PF1_MISS */
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x000c, /* CODE_READ */
+ [ C(RESULT_MISS) ] = 0x000e, /* CODE_CACHE_MISS */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0x10cb, /* L2_READ_MISS */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x10cc, /* L2_WRITE_HIT */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x10fc, /* L2_DATA_PF2 */
+ [ C(RESULT_MISS) ] = 0x10fe, /* L2_DATA_PF2_MISS */
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT,
+ /* DATA_READ */
+ /* see note on L1 OP_READ */
+ [ C(RESULT_MISS) ] = 0x0002, /* DATA_PAGE_WALK */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0001, /* DATA_WRITE */
+ [ C(RESULT_MISS) ] = 0x0002, /* DATA_PAGE_WALK */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x000c, /* CODE_READ */
+ [ C(RESULT_MISS) ] = 0x000d, /* CODE_PAGE_WALK */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0012, /* BRANCHES */
+ [ C(RESULT_MISS) ] = 0x002b, /* BRANCHES_MISPREDICTED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+
+static u64 knc_pmu_event_map(int hw_event)
+{
+ return knc_perfmon_event_map[hw_event];
+}
+
+static struct event_constraint knc_event_constraints[] =
+{
+ INTEL_EVENT_CONSTRAINT(0xc3, 0x1), /* HWP_L2HIT */
+ INTEL_EVENT_CONSTRAINT(0xc4, 0x1), /* HWP_L2MISS */
+ INTEL_EVENT_CONSTRAINT(0xc8, 0x1), /* L2_READ_HIT_E */
+ INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* L2_READ_HIT_M */
+ INTEL_EVENT_CONSTRAINT(0xca, 0x1), /* L2_READ_HIT_S */
+ INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* L2_READ_MISS */
+ INTEL_EVENT_CONSTRAINT(0xcc, 0x1), /* L2_WRITE_HIT */
+ INTEL_EVENT_CONSTRAINT(0xce, 0x1), /* L2_STRONGLY_ORDERED_STREAMING_VSTORES_MISS */
+ INTEL_EVENT_CONSTRAINT(0xcf, 0x1), /* L2_WEAKLY_ORDERED_STREAMING_VSTORE_MISS */
+ INTEL_EVENT_CONSTRAINT(0xd7, 0x1), /* L2_VICTIM_REQ_WITH_DATA */
+ INTEL_EVENT_CONSTRAINT(0xe3, 0x1), /* SNP_HITM_BUNIT */
+ INTEL_EVENT_CONSTRAINT(0xe6, 0x1), /* SNP_HIT_L2 */
+ INTEL_EVENT_CONSTRAINT(0xe7, 0x1), /* SNP_HITM_L2 */
+ INTEL_EVENT_CONSTRAINT(0xf1, 0x1), /* L2_DATA_READ_MISS_CACHE_FILL */
+ INTEL_EVENT_CONSTRAINT(0xf2, 0x1), /* L2_DATA_WRITE_MISS_CACHE_FILL */
+ INTEL_EVENT_CONSTRAINT(0xf6, 0x1), /* L2_DATA_READ_MISS_MEM_FILL */
+ INTEL_EVENT_CONSTRAINT(0xf7, 0x1), /* L2_DATA_WRITE_MISS_MEM_FILL */
+ INTEL_EVENT_CONSTRAINT(0xfc, 0x1), /* L2_DATA_PF2 */
+ INTEL_EVENT_CONSTRAINT(0xfd, 0x1), /* L2_DATA_PF2_DROP */
+ INTEL_EVENT_CONSTRAINT(0xfe, 0x1), /* L2_DATA_PF2_MISS */
+ INTEL_EVENT_CONSTRAINT(0xff, 0x1), /* L2_DATA_HIT_INFLIGHT_PF2 */
+ EVENT_CONSTRAINT_END
+};
+
+#define MSR_KNC_IA32_PERF_GLOBAL_STATUS 0x0000002d
+#define MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL 0x0000002e
+#define MSR_KNC_IA32_PERF_GLOBAL_CTRL 0x0000002f
+
+#define KNC_ENABLE_COUNTER0 0x00000001
+#define KNC_ENABLE_COUNTER1 0x00000002
+
+static void knc_pmu_disable_all(void)
+{
+ u64 val;
+
+ rdmsrq(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
+ val &= ~(KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1);
+ wrmsrq(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
+}
+
+static void knc_pmu_enable_all(int added)
+{
+ u64 val;
+
+ rdmsrq(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
+ val |= (KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1);
+ wrmsrq(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
+}
+
+static inline void
+knc_pmu_disable_event(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u64 val;
+
+ val = hwc->config;
+ val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
+
+ (void)wrmsrq_safe(hwc->config_base + hwc->idx, val);
+}
+
+static void knc_pmu_enable_event(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u64 val;
+
+ val = hwc->config;
+ val |= ARCH_PERFMON_EVENTSEL_ENABLE;
+
+ (void)wrmsrq_safe(hwc->config_base + hwc->idx, val);
+}
+
+static inline u64 knc_pmu_get_status(void)
+{
+ u64 status;
+
+ rdmsrq(MSR_KNC_IA32_PERF_GLOBAL_STATUS, status);
+
+ return status;
+}
+
+static inline void knc_pmu_ack_status(u64 ack)
+{
+ wrmsrq(MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL, ack);
+}
+
+static int knc_pmu_handle_irq(struct pt_regs *regs)
+{
+ struct perf_sample_data data;
+ struct cpu_hw_events *cpuc;
+ int handled = 0;
+ int bit, loops;
+ u64 status;
+
+ cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ knc_pmu_disable_all();
+
+ status = knc_pmu_get_status();
+ if (!status) {
+ knc_pmu_enable_all(0);
+ return handled;
+ }
+
+ loops = 0;
+again:
+ knc_pmu_ack_status(status);
+ if (++loops > 100) {
+ WARN_ONCE(1, "perf: irq loop stuck!\n");
+ perf_event_print_debug();
+ goto done;
+ }
+
+ inc_irq_stat(apic_perf_irqs);
+
+ for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
+ struct perf_event *event = cpuc->events[bit];
+ u64 last_period;
+
+ handled++;
+
+ if (!test_bit(bit, cpuc->active_mask))
+ continue;
+
+ last_period = event->hw.last_period;
+ if (!intel_pmu_save_and_restart(event))
+ continue;
+
+ perf_sample_data_init(&data, 0, last_period);
+
+ perf_event_overflow(event, &data, regs);
+ }
+
+ /*
+ * Repeat if there is more work to be done:
+ */
+ status = knc_pmu_get_status();
+ if (status)
+ goto again;
+
+done:
+ /* Only restore PMU state when it's active. See x86_pmu_disable(). */
+ if (cpuc->enabled)
+ knc_pmu_enable_all(0);
+
+ return handled;
+}
+
+
+PMU_FORMAT_ATTR(event, "config:0-7" );
+PMU_FORMAT_ATTR(umask, "config:8-15" );
+PMU_FORMAT_ATTR(edge, "config:18" );
+PMU_FORMAT_ATTR(inv, "config:23" );
+PMU_FORMAT_ATTR(cmask, "config:24-31" );
+
+static struct attribute *intel_knc_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_cmask.attr,
+ NULL,
+};
+
+static const struct x86_pmu knc_pmu __initconst = {
+ .name = "knc",
+ .handle_irq = knc_pmu_handle_irq,
+ .disable_all = knc_pmu_disable_all,
+ .enable_all = knc_pmu_enable_all,
+ .enable = knc_pmu_enable_event,
+ .disable = knc_pmu_disable_event,
+ .hw_config = x86_pmu_hw_config,
+ .schedule_events = x86_schedule_events,
+ .eventsel = MSR_KNC_EVNTSEL0,
+ .perfctr = MSR_KNC_PERFCTR0,
+ .event_map = knc_pmu_event_map,
+ .max_events = ARRAY_SIZE(knc_perfmon_event_map),
+ .apic = 1,
+ .max_period = (1ULL << 39) - 1,
+ .version = 0,
+ .cntr_mask64 = 0x3,
+ .cntval_bits = 40,
+ .cntval_mask = (1ULL << 40) - 1,
+ .get_event_constraints = x86_get_event_constraints,
+ .event_constraints = knc_event_constraints,
+ .format_attrs = intel_knc_formats_attr,
+};
+
+__init int knc_pmu_init(void)
+{
+ x86_pmu = knc_pmu;
+
+ memcpy(hw_cache_event_ids, knc_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ return 0;
+}
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
new file mode 100644
index 000000000000..72f2adcda7c6
--- /dev/null
+++ b/arch/x86/events/intel/lbr.c
@@ -0,0 +1,1713 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kvm_types.h>
+#include <linux/perf_event.h>
+#include <linux/types.h>
+
+#include <asm/cpu_device_id.h>
+#include <asm/perf_event.h>
+#include <asm/msr.h>
+
+#include "../perf_event.h"
+
+/*
+ * Intel LBR_SELECT bits
+ * Intel Vol3a, April 2011, Section 16.7 Table 16-10
+ *
+ * Hardware branch filter (not available on all CPUs)
+ */
+#define LBR_KERNEL_BIT 0 /* do not capture at ring0 */
+#define LBR_USER_BIT 1 /* do not capture at ring > 0 */
+#define LBR_JCC_BIT 2 /* do not capture conditional branches */
+#define LBR_REL_CALL_BIT 3 /* do not capture relative calls */
+#define LBR_IND_CALL_BIT 4 /* do not capture indirect calls */
+#define LBR_RETURN_BIT 5 /* do not capture near returns */
+#define LBR_IND_JMP_BIT 6 /* do not capture indirect jumps */
+#define LBR_REL_JMP_BIT 7 /* do not capture relative jumps */
+#define LBR_FAR_BIT 8 /* do not capture far branches */
+#define LBR_CALL_STACK_BIT 9 /* enable call stack */
+
+/*
+ * The following bit only exists in Linux; we mask it out before writing
+ * it to the actual MSR. But it helps the perf constraint code to
+ * understand that this is a separate configuration.
+ */
+#define LBR_NO_INFO_BIT 63 /* don't read LBR_INFO. */
+
+#define LBR_KERNEL (1 << LBR_KERNEL_BIT)
+#define LBR_USER (1 << LBR_USER_BIT)
+#define LBR_JCC (1 << LBR_JCC_BIT)
+#define LBR_REL_CALL (1 << LBR_REL_CALL_BIT)
+#define LBR_IND_CALL (1 << LBR_IND_CALL_BIT)
+#define LBR_RETURN (1 << LBR_RETURN_BIT)
+#define LBR_REL_JMP (1 << LBR_REL_JMP_BIT)
+#define LBR_IND_JMP (1 << LBR_IND_JMP_BIT)
+#define LBR_FAR (1 << LBR_FAR_BIT)
+#define LBR_CALL_STACK (1 << LBR_CALL_STACK_BIT)
+#define LBR_NO_INFO (1ULL << LBR_NO_INFO_BIT)
+
+#define LBR_PLM (LBR_KERNEL | LBR_USER)
+
+#define LBR_SEL_MASK 0x3ff /* valid bits in LBR_SELECT */
+#define LBR_NOT_SUPP -1 /* LBR filter not supported */
+#define LBR_IGN 0 /* ignored */
+
+#define LBR_ANY \
+ (LBR_JCC |\
+ LBR_REL_CALL |\
+ LBR_IND_CALL |\
+ LBR_RETURN |\
+ LBR_REL_JMP |\
+ LBR_IND_JMP |\
+ LBR_FAR)
+
+#define LBR_FROM_FLAG_MISPRED BIT_ULL(63)
+#define LBR_FROM_FLAG_IN_TX BIT_ULL(62)
+#define LBR_FROM_FLAG_ABORT BIT_ULL(61)
+
+#define LBR_FROM_SIGNEXT_2MSB (BIT_ULL(60) | BIT_ULL(59))
+
+/*
+ * Intel LBR_CTL bits
+ *
+ * Hardware branch filter for Arch LBR
+ */
+#define ARCH_LBR_KERNEL_BIT 1 /* capture at ring0 */
+#define ARCH_LBR_USER_BIT 2 /* capture at ring > 0 */
+#define ARCH_LBR_CALL_STACK_BIT 3 /* enable call stack */
+#define ARCH_LBR_JCC_BIT 16 /* capture conditional branches */
+#define ARCH_LBR_REL_JMP_BIT 17 /* capture relative jumps */
+#define ARCH_LBR_IND_JMP_BIT 18 /* capture indirect jumps */
+#define ARCH_LBR_REL_CALL_BIT 19 /* capture relative calls */
+#define ARCH_LBR_IND_CALL_BIT 20 /* capture indirect calls */
+#define ARCH_LBR_RETURN_BIT 21 /* capture near returns */
+#define ARCH_LBR_OTHER_BRANCH_BIT 22 /* capture other branches */
+
+#define ARCH_LBR_KERNEL (1ULL << ARCH_LBR_KERNEL_BIT)
+#define ARCH_LBR_USER (1ULL << ARCH_LBR_USER_BIT)
+#define ARCH_LBR_CALL_STACK (1ULL << ARCH_LBR_CALL_STACK_BIT)
+#define ARCH_LBR_JCC (1ULL << ARCH_LBR_JCC_BIT)
+#define ARCH_LBR_REL_JMP (1ULL << ARCH_LBR_REL_JMP_BIT)
+#define ARCH_LBR_IND_JMP (1ULL << ARCH_LBR_IND_JMP_BIT)
+#define ARCH_LBR_REL_CALL (1ULL << ARCH_LBR_REL_CALL_BIT)
+#define ARCH_LBR_IND_CALL (1ULL << ARCH_LBR_IND_CALL_BIT)
+#define ARCH_LBR_RETURN (1ULL << ARCH_LBR_RETURN_BIT)
+#define ARCH_LBR_OTHER_BRANCH (1ULL << ARCH_LBR_OTHER_BRANCH_BIT)
+
+#define ARCH_LBR_ANY \
+ (ARCH_LBR_JCC |\
+ ARCH_LBR_REL_JMP |\
+ ARCH_LBR_IND_JMP |\
+ ARCH_LBR_REL_CALL |\
+ ARCH_LBR_IND_CALL |\
+ ARCH_LBR_RETURN |\
+ ARCH_LBR_OTHER_BRANCH)
+
+#define ARCH_LBR_CTL_MASK 0x7f000e
+
+static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
+
+static __always_inline bool is_lbr_call_stack_bit_set(u64 config)
+{
+ if (static_cpu_has(X86_FEATURE_ARCH_LBR))
+ return !!(config & ARCH_LBR_CALL_STACK);
+
+ return !!(config & LBR_CALL_STACK);
+}
+
+/*
+ * We only support LBR implementations that have FREEZE_LBRS_ON_PMI;
+ * otherwise it becomes nearly impossible to get a reliable stack.
+ */
+
+static void __intel_pmu_lbr_enable(bool pmi)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ u64 debugctl, lbr_select = 0, orig_debugctl;
+
+ /*
+ * No need to unfreeze manually, as v4 can do that as part
+ * of the GLOBAL_STATUS ack.
+ */
+ if (pmi && x86_pmu.version >= 4)
+ return;
+
+ /*
+ * No need to reprogram LBR_SELECT in a PMI, as it
+ * did not change.
+ */
+ if (cpuc->lbr_sel)
+ lbr_select = cpuc->lbr_sel->config & x86_pmu.lbr_sel_mask;
+ if (!static_cpu_has(X86_FEATURE_ARCH_LBR) && !pmi && cpuc->lbr_sel)
+ wrmsrq(MSR_LBR_SELECT, lbr_select);
+
+ rdmsrq(MSR_IA32_DEBUGCTLMSR, debugctl);
+ orig_debugctl = debugctl;
+
+ if (!static_cpu_has(X86_FEATURE_ARCH_LBR))
+ debugctl |= DEBUGCTLMSR_LBR;
+	/*
+	 * LBR callstack does not work well with FREEZE_LBRS_ON_PMI.
+	 * If FREEZE_LBRS_ON_PMI is set, a PMI near call/return
+	 * instructions may cause superfluous increases/decreases of
+	 * LBR_TOS.
+	 */
+ if (is_lbr_call_stack_bit_set(lbr_select))
+ debugctl &= ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
+ else
+ debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
+
+ if (orig_debugctl != debugctl)
+ wrmsrq(MSR_IA32_DEBUGCTLMSR, debugctl);
+
+ if (static_cpu_has(X86_FEATURE_ARCH_LBR))
+ wrmsrq(MSR_ARCH_LBR_CTL, lbr_select | ARCH_LBR_CTL_LBREN);
+}
+
+void intel_pmu_lbr_reset_32(void)
+{
+ int i;
+
+ for (i = 0; i < x86_pmu.lbr_nr; i++)
+ wrmsrq(x86_pmu.lbr_from + i, 0);
+}
+
+void intel_pmu_lbr_reset_64(void)
+{
+ int i;
+
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ wrmsrq(x86_pmu.lbr_from + i, 0);
+ wrmsrq(x86_pmu.lbr_to + i, 0);
+ if (x86_pmu.lbr_has_info)
+ wrmsrq(x86_pmu.lbr_info + i, 0);
+ }
+}
+
+static void intel_pmu_arch_lbr_reset(void)
+{
+	/* Writing to the ARCH_LBR_DEPTH MSR resets all LBR entries to 0 */
+ wrmsrq(MSR_ARCH_LBR_DEPTH, x86_pmu.lbr_nr);
+}
+
+void intel_pmu_lbr_reset(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ if (!x86_pmu.lbr_nr)
+ return;
+
+ x86_pmu.lbr_reset();
+
+ cpuc->last_task_ctx = NULL;
+ cpuc->last_log_id = 0;
+ if (!static_cpu_has(X86_FEATURE_ARCH_LBR) && cpuc->lbr_select)
+ wrmsrq(MSR_LBR_SELECT, 0);
+}
+
+/*
+ * TOS = most recently recorded branch
+ */
+static inline u64 intel_pmu_lbr_tos(void)
+{
+ u64 tos;
+
+ rdmsrq(x86_pmu.lbr_tos, tos);
+ return tos;
+}
+
+enum {
+ LBR_NONE,
+ LBR_VALID,
+};
+
+/*
+ * For format LBR_FORMAT_EIP_FLAGS2, bits 61:62 in MSR_LAST_BRANCH_FROM_x
+ * are the TSX flags when TSX is supported, but when TSX is not supported
+ * they have no consistent behavior:
+ *
+ * - For wrmsr(), bits 61:62 are considered part of the sign extension.
+ * - For HW updates (branch captures) bits 61:62 are always OFF and are not
+ * part of the sign extension.
+ *
+ * Therefore, if:
+ *
+ * 1) LBR format LBR_FORMAT_EIP_FLAGS2
+ * 2) CPU has no TSX support enabled
+ *
+ * ... then any value passed to wrmsr() must be sign extended to 63 bits and any
+ * value from rdmsr() must be converted to have a 61-bit sign extension,
+ * ignoring the TSX flags.
+ */
+static inline bool lbr_from_signext_quirk_needed(void)
+{
+ bool tsx_support = boot_cpu_has(X86_FEATURE_HLE) ||
+ boot_cpu_has(X86_FEATURE_RTM);
+
+ return !tsx_support;
+}
+
+static DEFINE_STATIC_KEY_FALSE(lbr_from_quirk_key);
+
+/* If quirk is enabled, ensure sign extension is 63 bits: */
+inline u64 lbr_from_signext_quirk_wr(u64 val)
+{
+ if (static_branch_unlikely(&lbr_from_quirk_key)) {
+ /*
+ * Sign extend into bits 61:62 while preserving bit 63.
+ *
+ * Quirk is enabled when TSX is disabled. Therefore TSX bits
+ * in val are always OFF and must be changed to be sign
+ * extension bits. Since bits 59:60 are guaranteed to be
+ * part of the sign extension bits, we can just copy them
+ * to 61:62.
+ */
+ val |= (LBR_FROM_SIGNEXT_2MSB & val) << 2;
+ }
+ return val;
+}
+
+/*
+ * If quirk is needed, ensure sign extension is 61 bits:
+ */
+static u64 lbr_from_signext_quirk_rd(u64 val)
+{
+ if (static_branch_unlikely(&lbr_from_quirk_key)) {
+ /*
+ * Quirk is on when TSX is not enabled. Therefore TSX
+ * flags must be read as OFF.
+ */
+ val &= ~(LBR_FROM_FLAG_IN_TX | LBR_FROM_FLAG_ABORT);
+ }
+ return val;
+}
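+
+/*
+ * Editor's worked example (not part of this patch): for a kernel address
+ * whose sign bits 59:60 are 0b11, the write-side quirk computes
+ * val |= (val & LBR_FROM_SIGNEXT_2MSB) << 2, copying those bits into
+ * 61:62 and restoring the full sign extension the MSR write expects;
+ * the read side simply clears bits 61:62 again.
+ */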
+
+static __always_inline void wrlbr_from(unsigned int idx, u64 val)
+{
+ val = lbr_from_signext_quirk_wr(val);
+ wrmsrq(x86_pmu.lbr_from + idx, val);
+}
+
+static __always_inline void wrlbr_to(unsigned int idx, u64 val)
+{
+ wrmsrq(x86_pmu.lbr_to + idx, val);
+}
+
+static __always_inline void wrlbr_info(unsigned int idx, u64 val)
+{
+ wrmsrq(x86_pmu.lbr_info + idx, val);
+}
+
+static __always_inline u64 rdlbr_from(unsigned int idx, struct lbr_entry *lbr)
+{
+ u64 val;
+
+ if (lbr)
+ return lbr->from;
+
+ rdmsrq(x86_pmu.lbr_from + idx, val);
+
+ return lbr_from_signext_quirk_rd(val);
+}
+
+static __always_inline u64 rdlbr_to(unsigned int idx, struct lbr_entry *lbr)
+{
+ u64 val;
+
+ if (lbr)
+ return lbr->to;
+
+ rdmsrq(x86_pmu.lbr_to + idx, val);
+
+ return val;
+}
+
+static __always_inline u64 rdlbr_info(unsigned int idx, struct lbr_entry *lbr)
+{
+ u64 val;
+
+ if (lbr)
+ return lbr->info;
+
+ rdmsrq(x86_pmu.lbr_info + idx, val);
+
+ return val;
+}
+
+static inline void
+wrlbr_all(struct lbr_entry *lbr, unsigned int idx, bool need_info)
+{
+ wrlbr_from(idx, lbr->from);
+ wrlbr_to(idx, lbr->to);
+ if (need_info)
+ wrlbr_info(idx, lbr->info);
+}
+
+static inline bool
+rdlbr_all(struct lbr_entry *lbr, unsigned int idx, bool need_info)
+{
+ u64 from = rdlbr_from(idx, NULL);
+
+ /* Don't read invalid entry */
+ if (!from)
+ return false;
+
+ lbr->from = from;
+ lbr->to = rdlbr_to(idx, NULL);
+ if (need_info)
+ lbr->info = rdlbr_info(idx, NULL);
+
+ return true;
+}
+
+void intel_pmu_lbr_restore(void *ctx)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct x86_perf_task_context *task_ctx = ctx;
+ bool need_info = x86_pmu.lbr_has_info;
+ u64 tos = task_ctx->tos;
+ unsigned lbr_idx, mask;
+ int i;
+
+ mask = x86_pmu.lbr_nr - 1;
+ for (i = 0; i < task_ctx->valid_lbrs; i++) {
+ lbr_idx = (tos - i) & mask;
+ wrlbr_all(&task_ctx->lbr[i], lbr_idx, need_info);
+ }
+
+ for (; i < x86_pmu.lbr_nr; i++) {
+ lbr_idx = (tos - i) & mask;
+ wrlbr_from(lbr_idx, 0);
+ wrlbr_to(lbr_idx, 0);
+ if (need_info)
+ wrlbr_info(lbr_idx, 0);
+ }
+
+ wrmsrq(x86_pmu.lbr_tos, tos);
+
+ if (cpuc->lbr_select)
+ wrmsrq(MSR_LBR_SELECT, task_ctx->lbr_sel);
+}
+
+static void intel_pmu_arch_lbr_restore(void *ctx)
+{
+ struct x86_perf_task_context_arch_lbr *task_ctx = ctx;
+ struct lbr_entry *entries = task_ctx->entries;
+ int i;
+
+ /* Fast reset the LBRs before restore if the call stack is not full. */
+ if (!entries[x86_pmu.lbr_nr - 1].from)
+ intel_pmu_arch_lbr_reset();
+
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ if (!entries[i].from)
+ break;
+ wrlbr_all(&entries[i], i, true);
+ }
+}
+
+/*
+ * Restore the Architecture LBR state from the xsave area in the perf
+ * context data for the task via the XRSTORS instruction.
+ */
+static void intel_pmu_arch_lbr_xrstors(void *ctx)
+{
+ struct x86_perf_task_context_arch_lbr_xsave *task_ctx = ctx;
+
+ xrstors(&task_ctx->xsave, XFEATURE_MASK_LBR);
+}
+
+static __always_inline bool lbr_is_reset_in_cstate(void *ctx)
+{
+ if (static_cpu_has(X86_FEATURE_ARCH_LBR))
+ return x86_pmu.lbr_deep_c_reset && !rdlbr_from(0, NULL);
+
+ return !rdlbr_from(((struct x86_perf_task_context *)ctx)->tos, NULL);
+}
+
+static inline bool has_lbr_callstack_users(void *ctx)
+{
+ return task_context_opt(ctx)->lbr_callstack_users ||
+ x86_pmu.lbr_callstack_users;
+}
+
+static void __intel_pmu_lbr_restore(void *ctx)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ if (!has_lbr_callstack_users(ctx) ||
+ task_context_opt(ctx)->lbr_stack_state == LBR_NONE) {
+ intel_pmu_lbr_reset();
+ return;
+ }
+
+	/*
+	 * Do not restore the LBR registers if:
+	 * - no one else touched them, and
+	 * - they were not cleared in a C-state.
+	 */
+ if ((ctx == cpuc->last_task_ctx) &&
+ (task_context_opt(ctx)->log_id == cpuc->last_log_id) &&
+ !lbr_is_reset_in_cstate(ctx)) {
+ task_context_opt(ctx)->lbr_stack_state = LBR_NONE;
+ return;
+ }
+
+ x86_pmu.lbr_restore(ctx);
+
+ task_context_opt(ctx)->lbr_stack_state = LBR_NONE;
+}
+
+void intel_pmu_lbr_save(void *ctx)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct x86_perf_task_context *task_ctx = ctx;
+ bool need_info = x86_pmu.lbr_has_info;
+ unsigned lbr_idx, mask;
+ u64 tos;
+ int i;
+
+ mask = x86_pmu.lbr_nr - 1;
+ tos = intel_pmu_lbr_tos();
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ lbr_idx = (tos - i) & mask;
+ if (!rdlbr_all(&task_ctx->lbr[i], lbr_idx, need_info))
+ break;
+ }
+ task_ctx->valid_lbrs = i;
+ task_ctx->tos = tos;
+
+ if (cpuc->lbr_select)
+ rdmsrq(MSR_LBR_SELECT, task_ctx->lbr_sel);
+}
+
+static void intel_pmu_arch_lbr_save(void *ctx)
+{
+ struct x86_perf_task_context_arch_lbr *task_ctx = ctx;
+ struct lbr_entry *entries = task_ctx->entries;
+ int i;
+
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ if (!rdlbr_all(&entries[i], i, true))
+ break;
+ }
+
+	/* The LBR call stack is not full; a reset is required on restore. */
+ if (i < x86_pmu.lbr_nr)
+ entries[x86_pmu.lbr_nr - 1].from = 0;
+}
+
+/*
+ * Save the Architecture LBR state to the xsave area in the perf
+ * context data for the task via the XSAVES instruction.
+ */
+static void intel_pmu_arch_lbr_xsaves(void *ctx)
+{
+ struct x86_perf_task_context_arch_lbr_xsave *task_ctx = ctx;
+
+ xsaves(&task_ctx->xsave, XFEATURE_MASK_LBR);
+}
+
+static void __intel_pmu_lbr_save(void *ctx)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ if (!has_lbr_callstack_users(ctx)) {
+ task_context_opt(ctx)->lbr_stack_state = LBR_NONE;
+ return;
+ }
+
+ x86_pmu.lbr_save(ctx);
+
+ task_context_opt(ctx)->lbr_stack_state = LBR_VALID;
+
+ cpuc->last_task_ctx = ctx;
+ cpuc->last_log_id = ++task_context_opt(ctx)->log_id;
+}
+
+void intel_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx,
+ struct task_struct *task, bool sched_in)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct perf_ctx_data *ctx_data;
+ void *task_ctx;
+
+ if (!cpuc->lbr_users)
+ return;
+
+	/*
+	 * If the LBR callstack feature is enabled and the stack was saved
+	 * when the task was scheduled out, restore the stack. Otherwise
+	 * flush the LBR stack.
+	 */
+ rcu_read_lock();
+ ctx_data = rcu_dereference(task->perf_ctx_data);
+ task_ctx = ctx_data ? ctx_data->data : NULL;
+ if (task_ctx) {
+ if (sched_in)
+ __intel_pmu_lbr_restore(task_ctx);
+ else
+ __intel_pmu_lbr_save(task_ctx);
+ rcu_read_unlock();
+ return;
+ }
+ rcu_read_unlock();
+
+ /*
+ * Since a context switch can flip the address space and LBR entries
+ * are not tagged with an identifier, we need to wipe the LBR, even for
+ * per-cpu events. You simply cannot resolve the branches from the old
+ * address space.
+ */
+ if (sched_in)
+ intel_pmu_lbr_reset();
+}
+
+static inline bool branch_user_callstack(unsigned br_sel)
+{
+ return (br_sel & X86_BR_USER) && (br_sel & X86_BR_CALL_STACK);
+}
+
+void intel_pmu_lbr_add(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ if (!x86_pmu.lbr_nr)
+ return;
+
+ if (event->hw.flags & PERF_X86_EVENT_LBR_SELECT)
+ cpuc->lbr_select = 1;
+
+ cpuc->br_sel = event->hw.branch_reg.reg;
+
+ if (branch_user_callstack(cpuc->br_sel)) {
+ if (event->attach_state & PERF_ATTACH_TASK) {
+ struct task_struct *task = event->hw.target;
+ struct perf_ctx_data *ctx_data;
+
+ rcu_read_lock();
+ ctx_data = rcu_dereference(task->perf_ctx_data);
+ if (ctx_data)
+ task_context_opt(ctx_data->data)->lbr_callstack_users++;
+ rcu_read_unlock();
+ } else
+ x86_pmu.lbr_callstack_users++;
+ }
+ /*
+ * Request pmu::sched_task() callback, which will fire inside the
+ * regular perf event scheduling, so that call will:
+ *
+ * - restore or wipe; when LBR-callstack,
+ * - wipe; otherwise,
+ *
+ * when this is from __perf_event_task_sched_in().
+ *
+ * However, if this is from perf_install_in_context(), no such callback
+ * will follow and we'll need to reset the LBR here if this is the
+ * first LBR event.
+ *
+ * The problem is, we cannot tell these cases apart... but we can
+ * exclude the biggest chunk of cases by looking at
+ * event->total_time_running. An event that has accrued runtime cannot
+ * be 'new'. Conversely, a new event can get installed through the
+ * context switch path for the first time.
+ */
+ if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip > 0)
+ cpuc->lbr_pebs_users++;
+ perf_sched_cb_inc(event->pmu);
+ if (!cpuc->lbr_users++ && !event->total_time_running)
+ intel_pmu_lbr_reset();
+}
+
+void release_lbr_buffers(void)
+{
+ struct kmem_cache *kmem_cache;
+ struct cpu_hw_events *cpuc;
+ int cpu;
+
+ if (!static_cpu_has(X86_FEATURE_ARCH_LBR))
+ return;
+
+ for_each_possible_cpu(cpu) {
+ cpuc = per_cpu_ptr(&cpu_hw_events, cpu);
+ kmem_cache = x86_get_pmu(cpu)->task_ctx_cache;
+ if (kmem_cache && cpuc->lbr_xsave) {
+ kmem_cache_free(kmem_cache, cpuc->lbr_xsave);
+ cpuc->lbr_xsave = NULL;
+ }
+ }
+}
+
+void reserve_lbr_buffers(void)
+{
+ struct kmem_cache *kmem_cache;
+ struct cpu_hw_events *cpuc;
+ int cpu;
+
+ if (!static_cpu_has(X86_FEATURE_ARCH_LBR))
+ return;
+
+ for_each_possible_cpu(cpu) {
+ cpuc = per_cpu_ptr(&cpu_hw_events, cpu);
+ kmem_cache = x86_get_pmu(cpu)->task_ctx_cache;
+ if (!kmem_cache || cpuc->lbr_xsave)
+ continue;
+
+ cpuc->lbr_xsave = kmem_cache_alloc_node(kmem_cache,
+ GFP_KERNEL | __GFP_ZERO,
+ cpu_to_node(cpu));
+ }
+}
+
+void intel_pmu_lbr_del(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ if (!x86_pmu.lbr_nr)
+ return;
+
+ if (branch_user_callstack(cpuc->br_sel)) {
+ if (event->attach_state & PERF_ATTACH_TASK) {
+ struct task_struct *task = event->hw.target;
+ struct perf_ctx_data *ctx_data;
+
+ rcu_read_lock();
+ ctx_data = rcu_dereference(task->perf_ctx_data);
+ if (ctx_data)
+ task_context_opt(ctx_data->data)->lbr_callstack_users--;
+ rcu_read_unlock();
+ } else
+ x86_pmu.lbr_callstack_users--;
+ }
+
+ if (event->hw.flags & PERF_X86_EVENT_LBR_SELECT)
+ cpuc->lbr_select = 0;
+
+ if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip > 0)
+ cpuc->lbr_pebs_users--;
+ cpuc->lbr_users--;
+ WARN_ON_ONCE(cpuc->lbr_users < 0);
+ WARN_ON_ONCE(cpuc->lbr_pebs_users < 0);
+ perf_sched_cb_dec(event->pmu);
+
+	/*
+	 * The logged occurrences information is only valid for the
+	 * current LBR group. If another LBR group is scheduled in
+	 * later, the information from the stale LBRs will be wrongly
+	 * interpreted. Reset the LBRs here.
+	 *
+	 * Only clear once, for a branch counter group with the leader
+	 * event, because:
+	 * - The LBRs cannot simply be reset on !cpuc->lbr_users, since
+	 *   it's possible that the last LBR user is not in a branch
+	 *   counter group, e.g., a branch_counters group + several
+	 *   normal LBR events.
+	 * - The LBR reset can be done with any one of the events in a
+	 *   branch counter group, since they are always scheduled
+	 *   together. It's easy to force the leader to be an LBR event.
+	 */
+ if (is_branch_counters_group(event) && event == event->group_leader)
+ intel_pmu_lbr_reset();
+}
+
+static inline bool vlbr_exclude_host(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ return test_bit(INTEL_PMC_IDX_FIXED_VLBR,
+ (unsigned long *)&cpuc->intel_ctrl_guest_mask);
+}
+
+void intel_pmu_lbr_enable_all(bool pmi)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ if (cpuc->lbr_users && !vlbr_exclude_host())
+ __intel_pmu_lbr_enable(pmi);
+}
+
+void intel_pmu_lbr_disable_all(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ if (cpuc->lbr_users && !vlbr_exclude_host()) {
+ if (static_cpu_has(X86_FEATURE_ARCH_LBR))
+ return __intel_pmu_arch_lbr_disable();
+
+ __intel_pmu_lbr_disable();
+ }
+}
+
+void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
+{
+ unsigned long mask = x86_pmu.lbr_nr - 1;
+ struct perf_branch_entry *br = cpuc->lbr_entries;
+ u64 tos = intel_pmu_lbr_tos();
+ int i;
+
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ unsigned long lbr_idx = (tos - i) & mask;
+ union {
+ struct {
+ u32 from;
+ u32 to;
+ };
+ u64 lbr;
+ } msr_lastbranch;
+
+ rdmsrq(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);
+
+ perf_clear_branch_entry_bitfields(br);
+
+ br->from = msr_lastbranch.from;
+ br->to = msr_lastbranch.to;
+ br++;
+ }
+ cpuc->lbr_stack.nr = i;
+ cpuc->lbr_stack.hw_idx = tos;
+}
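+
+/*
+ * Editor's sketch (not part of this patch): the TOS-relative walk above.
+ * With lbr_nr a power of two, (tos - i) & (lbr_nr - 1) visits the most
+ * recent branch first and wraps around the ring; demo_visit() is a
+ * made-up callback.
+ */
+#if 0
+static void demo_ring_walk(unsigned int tos, unsigned int lbr_nr,
+			   void (*demo_visit)(unsigned int idx))
+{
+	unsigned int mask = lbr_nr - 1;
+	unsigned int i;
+
+	for (i = 0; i < lbr_nr; i++)
+		demo_visit((tos - i) & mask);
+}
+#endif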
+
+/*
+ * Due to lack of segmentation in Linux the effective address (offset)
+ * is the same as the linear address, allowing us to merge the LIP and EIP
+ * LBR formats.
+ */
+void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
+{
+ bool need_info = false, call_stack = false;
+ unsigned long mask = x86_pmu.lbr_nr - 1;
+ struct perf_branch_entry *br = cpuc->lbr_entries;
+ u64 tos = intel_pmu_lbr_tos();
+ int i;
+ int out = 0;
+ int num = x86_pmu.lbr_nr;
+
+ if (cpuc->lbr_sel) {
+ need_info = !(cpuc->lbr_sel->config & LBR_NO_INFO);
+ if (cpuc->lbr_sel->config & LBR_CALL_STACK)
+ call_stack = true;
+ }
+
+ for (i = 0; i < num; i++) {
+ unsigned long lbr_idx = (tos - i) & mask;
+ u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0;
+ u16 cycles = 0;
+
+ from = rdlbr_from(lbr_idx, NULL);
+ to = rdlbr_to(lbr_idx, NULL);
+
+ /*
+ * Read LBR call stack entries
+ * until invalid entry (0s) is detected.
+ */
+ if (call_stack && !from)
+ break;
+
+ if (x86_pmu.lbr_has_info) {
+ if (need_info) {
+ u64 info;
+
+ info = rdlbr_info(lbr_idx, NULL);
+ mis = !!(info & LBR_INFO_MISPRED);
+ pred = !mis;
+ cycles = (info & LBR_INFO_CYCLES);
+ if (x86_pmu.lbr_has_tsx) {
+ in_tx = !!(info & LBR_INFO_IN_TX);
+ abort = !!(info & LBR_INFO_ABORT);
+ }
+ }
+ } else {
+ int skip = 0;
+
+ if (x86_pmu.lbr_from_flags) {
+ mis = !!(from & LBR_FROM_FLAG_MISPRED);
+ pred = !mis;
+ skip = 1;
+ }
+ if (x86_pmu.lbr_has_tsx) {
+ in_tx = !!(from & LBR_FROM_FLAG_IN_TX);
+ abort = !!(from & LBR_FROM_FLAG_ABORT);
+ skip = 3;
+ }
+ from = (u64)((((s64)from) << skip) >> skip);
+
+ if (x86_pmu.lbr_to_cycles) {
+ cycles = ((to >> 48) & LBR_INFO_CYCLES);
+ to = (u64)((((s64)to) << 16) >> 16);
+ }
+ }
+
+ /*
+ * Some CPUs report duplicated abort records,
+ * with the second entry not having an abort bit set.
+ * Skip them here. This loop runs backwards,
+ * so we need to undo the previous record.
+ * If the abort just happened outside the window
+ * the extra entry cannot be removed.
+ */
+ if (abort && x86_pmu.lbr_double_abort && out > 0)
+ out--;
+
+ perf_clear_branch_entry_bitfields(br+out);
+ br[out].from = from;
+ br[out].to = to;
+ br[out].mispred = mis;
+ br[out].predicted = pred;
+ br[out].in_tx = in_tx;
+ br[out].abort = abort;
+ br[out].cycles = cycles;
+ out++;
+ }
+ cpuc->lbr_stack.nr = out;
+ cpuc->lbr_stack.hw_idx = tos;
+}
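+
+/*
+ * Editor's worked example (not part of this patch): the
+ * (u64)((((s64)from) << skip) >> skip) idiom above strips the flag bits
+ * and arithmetically re-extends the sign. With skip = 3, bits 61:63 are
+ * discarded and bit 60 is replicated across them, e.g.
+ * 0xb000000000001000 becomes 0xf000000000001000.
+ */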
+
+static DEFINE_STATIC_KEY_FALSE(x86_lbr_mispred);
+static DEFINE_STATIC_KEY_FALSE(x86_lbr_cycles);
+static DEFINE_STATIC_KEY_FALSE(x86_lbr_type);
+
+static __always_inline int get_lbr_br_type(u64 info)
+{
+ int type = 0;
+
+ if (static_branch_likely(&x86_lbr_type))
+ type = (info & LBR_INFO_BR_TYPE) >> LBR_INFO_BR_TYPE_OFFSET;
+
+ return type;
+}
+
+static __always_inline bool get_lbr_mispred(u64 info)
+{
+ bool mispred = 0;
+
+ if (static_branch_likely(&x86_lbr_mispred))
+ mispred = !!(info & LBR_INFO_MISPRED);
+
+ return mispred;
+}
+
+static __always_inline u16 get_lbr_cycles(u64 info)
+{
+ u16 cycles = info & LBR_INFO_CYCLES;
+
+ if (static_cpu_has(X86_FEATURE_ARCH_LBR) &&
+ (!static_branch_likely(&x86_lbr_cycles) ||
+ !(info & LBR_INFO_CYC_CNT_VALID)))
+ cycles = 0;
+
+ return cycles;
+}
+
+static_assert((64 - PERF_BRANCH_ENTRY_INFO_BITS_MAX) > LBR_INFO_BR_CNTR_NUM * LBR_INFO_BR_CNTR_BITS);
+
+static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc,
+ struct lbr_entry *entries)
+{
+ struct perf_branch_entry *e;
+ struct lbr_entry *lbr;
+ u64 from, to, info;
+ int i;
+
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ lbr = entries ? &entries[i] : NULL;
+ e = &cpuc->lbr_entries[i];
+
+ from = rdlbr_from(i, lbr);
+ /*
+ * Read LBR entries until invalid entry (0s) is detected.
+ */
+ if (!from)
+ break;
+
+ to = rdlbr_to(i, lbr);
+ info = rdlbr_info(i, lbr);
+
+ perf_clear_branch_entry_bitfields(e);
+
+ e->from = from;
+ e->to = to;
+ e->mispred = get_lbr_mispred(info);
+ e->predicted = !e->mispred;
+ e->in_tx = !!(info & LBR_INFO_IN_TX);
+ e->abort = !!(info & LBR_INFO_ABORT);
+ e->cycles = get_lbr_cycles(info);
+ e->type = get_lbr_br_type(info);
+
+		/*
+		 * Leverage the reserved field of cpuc->lbr_entries[i] to
+		 * temporarily store the branch counters information.
+		 * Later code will decide what content can be disclosed
+		 * to the perf tool. Please see
+		 * intel_pmu_lbr_counters_reorder().
+		 */
+ e->reserved = (info >> LBR_INFO_BR_CNTR_OFFSET) & LBR_INFO_BR_CNTR_FULL_MASK;
+ }
+
+ cpuc->lbr_stack.nr = i;
+}
+
+/*
+ * The enabled order may be different from the counter order.
+ * Update the lbr_counters with the enabled order.
+ */
+static void intel_pmu_lbr_counters_reorder(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+ int i, j, pos = 0, order[X86_PMC_IDX_MAX];
+ struct perf_event *leader, *sibling;
+ u64 src, dst, cnt;
+
+ leader = event->group_leader;
+ if (branch_sample_counters(leader))
+ order[pos++] = leader->hw.idx;
+
+ for_each_sibling_event(sibling, leader) {
+ if (!branch_sample_counters(sibling))
+ continue;
+ order[pos++] = sibling->hw.idx;
+ }
+
+ WARN_ON_ONCE(!pos);
+
+ for (i = 0; i < cpuc->lbr_stack.nr; i++) {
+ src = cpuc->lbr_entries[i].reserved;
+ dst = 0;
+ for (j = 0; j < pos; j++) {
+ cnt = (src >> (order[j] * LBR_INFO_BR_CNTR_BITS)) & LBR_INFO_BR_CNTR_MASK;
+ dst |= cnt << j * LBR_INFO_BR_CNTR_BITS;
+ }
+ cpuc->lbr_counters[i] = dst;
+ cpuc->lbr_entries[i].reserved = 0;
+ }
+}
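+
+/*
+ * Editor's sketch (not part of this patch): the repacking loop above in
+ * isolation. Each LBR_INFO word packs one small counter field per
+ * hardware counter index; repacking extracts them in the group's
+ * enabled order. demo_repack() and its parameters are made-up names;
+ * bits and mask correspond to LBR_INFO_BR_CNTR_BITS and
+ * LBR_INFO_BR_CNTR_MASK.
+ */
+#if 0
+static unsigned long long demo_repack(unsigned long long src,
+				      const int *order, int pos,
+				      int bits, unsigned long long mask)
+{
+	unsigned long long dst = 0;
+	int j;
+
+	for (j = 0; j < pos; j++)
+		dst |= ((src >> (order[j] * bits)) & mask) << (j * bits);
+
+	return dst;
+}
+#endif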
+
+void intel_pmu_lbr_save_brstack(struct perf_sample_data *data,
+ struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+ if (is_branch_counters_group(event)) {
+ intel_pmu_lbr_counters_reorder(cpuc, event);
+ perf_sample_save_brstack(data, event, &cpuc->lbr_stack, cpuc->lbr_counters);
+ return;
+ }
+
+ perf_sample_save_brstack(data, event, &cpuc->lbr_stack, NULL);
+}
+
+static void intel_pmu_arch_lbr_read(struct cpu_hw_events *cpuc)
+{
+ intel_pmu_store_lbr(cpuc, NULL);
+}
+
+static void intel_pmu_arch_lbr_read_xsave(struct cpu_hw_events *cpuc)
+{
+ struct x86_perf_task_context_arch_lbr_xsave *xsave = cpuc->lbr_xsave;
+
+ if (!xsave) {
+ intel_pmu_store_lbr(cpuc, NULL);
+ return;
+ }
+ xsaves(&xsave->xsave, XFEATURE_MASK_LBR);
+
+ intel_pmu_store_lbr(cpuc, xsave->lbr.entries);
+}
+
+void intel_pmu_lbr_read(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+	/*
+	 * Don't read when all LBR users are using adaptive PEBS.
+	 *
+	 * This could be smarter and actually check the event,
+	 * but this simple approach seems to work for now.
+	 */
+ if (!cpuc->lbr_users || vlbr_exclude_host() ||
+ cpuc->lbr_users == cpuc->lbr_pebs_users)
+ return;
+
+ x86_pmu.lbr_read(cpuc);
+
+ intel_pmu_lbr_filter(cpuc);
+}
+
+/*
+ * SW filter is used:
+ * - in case there is no HW filter
+ * - in case the HW filter has errata or limitations
+ */
+static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
+{
+ u64 br_type = event->attr.branch_sample_type;
+ int mask = 0;
+
+ if (br_type & PERF_SAMPLE_BRANCH_USER)
+ mask |= X86_BR_USER;
+
+ if (br_type & PERF_SAMPLE_BRANCH_KERNEL)
+ mask |= X86_BR_KERNEL;
+
+ /* we ignore BRANCH_HV here */
+
+ if (br_type & PERF_SAMPLE_BRANCH_ANY)
+ mask |= X86_BR_ANY;
+
+ if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL)
+ mask |= X86_BR_ANY_CALL;
+
+ if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN)
+ mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET;
+
+ if (br_type & PERF_SAMPLE_BRANCH_IND_CALL)
+ mask |= X86_BR_IND_CALL;
+
+ if (br_type & PERF_SAMPLE_BRANCH_ABORT_TX)
+ mask |= X86_BR_ABORT;
+
+ if (br_type & PERF_SAMPLE_BRANCH_IN_TX)
+ mask |= X86_BR_IN_TX;
+
+ if (br_type & PERF_SAMPLE_BRANCH_NO_TX)
+ mask |= X86_BR_NO_TX;
+
+ if (br_type & PERF_SAMPLE_BRANCH_COND)
+ mask |= X86_BR_JCC;
+
+ if (br_type & PERF_SAMPLE_BRANCH_CALL_STACK) {
+ if (!x86_pmu_has_lbr_callstack())
+ return -EOPNOTSUPP;
+ if (mask & ~(X86_BR_USER | X86_BR_KERNEL))
+ return -EINVAL;
+ mask |= X86_BR_CALL | X86_BR_IND_CALL | X86_BR_RET |
+ X86_BR_CALL_STACK;
+ }
+
+ if (br_type & PERF_SAMPLE_BRANCH_IND_JUMP)
+ mask |= X86_BR_IND_JMP;
+
+ if (br_type & PERF_SAMPLE_BRANCH_CALL)
+ mask |= X86_BR_CALL | X86_BR_ZERO_CALL;
+
+ if (br_type & PERF_SAMPLE_BRANCH_TYPE_SAVE)
+ mask |= X86_BR_TYPE_SAVE;
+
+	/*
+	 * Stash the actual user request into reg; it may be used by
+	 * fixup code for some CPUs.
+	 */
+ event->hw.branch_reg.reg = mask;
+ return 0;
+}
+
+/*
+ * Set up the HW LBR filter.
+ * Used only when available; it may not be enough to disambiguate
+ * all branches and may need the help of the SW filter.
+ */
+static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg;
+ u64 br_type = event->attr.branch_sample_type;
+ u64 mask = 0, v;
+ int i;
+
+ for (i = 0; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) {
+ if (!(br_type & (1ULL << i)))
+ continue;
+
+ v = x86_pmu.lbr_sel_map[i];
+ if (v == LBR_NOT_SUPP)
+ return -EOPNOTSUPP;
+
+ if (v != LBR_IGN)
+ mask |= v;
+ }
+
+ reg = &event->hw.branch_reg;
+ reg->idx = EXTRA_REG_LBR;
+
+ if (static_cpu_has(X86_FEATURE_ARCH_LBR)) {
+ reg->config = mask;
+
+		/*
+		 * The Arch LBR HW can retrieve the common branch types
+		 * from the LBR_INFO. It doesn't require the high-overhead
+		 * SW disassembly.
+		 * Enable branch type reporting by default for the Arch LBR.
+		 */
+ reg->reg |= X86_BR_TYPE_SAVE;
+ return 0;
+ }
+
+ /*
+ * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate
+ * in suppress mode. So LBR_SELECT should be set to
+ * (~mask & LBR_SEL_MASK) | (mask & ~LBR_SEL_MASK)
+ * But the 10th bit LBR_CALL_STACK does not operate
+ * in suppress mode.
+ */
+ reg->config = mask ^ (x86_pmu.lbr_sel_mask & ~LBR_CALL_STACK);
+
+ if ((br_type & PERF_SAMPLE_BRANCH_NO_CYCLES) &&
+ (br_type & PERF_SAMPLE_BRANCH_NO_FLAGS) &&
+ x86_pmu.lbr_has_info)
+ reg->config |= LBR_NO_INFO;
+
+ return 0;
+}
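+
+/*
+ * Worked example of the suppress-mode inversion above (illustrative,
+ * assuming the 9-bit LBR_SEL_MASK): a request for kernel-only
+ * conditional branches yields mask = LBR_KERNEL | LBR_JCC; XOR-ing
+ * with the suppress-mode bits sets every bit *except* those two, so
+ * the value written to LBR_SELECT suppresses ring>0 execution and all
+ * branch classes other than JCC -- exactly "capture kernel JCCs".
+ */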
+
+int intel_pmu_setup_lbr_filter(struct perf_event *event)
+{
+ int ret = 0;
+
+ /*
+ * no LBR on this PMU
+ */
+ if (!x86_pmu.lbr_nr)
+ return -EOPNOTSUPP;
+
+	/*
+	 * set up the SW LBR filter
+	 */
+ ret = intel_pmu_setup_sw_lbr_filter(event);
+ if (ret)
+ return ret;
+
+	/*
+	 * set up the HW LBR filter, if any
+	 */
+ if (x86_pmu.lbr_sel_map)
+ ret = intel_pmu_setup_hw_lbr_filter(event);
+
+ return ret;
+}
+
+enum {
+ ARCH_LBR_BR_TYPE_JCC = 0,
+ ARCH_LBR_BR_TYPE_NEAR_IND_JMP = 1,
+ ARCH_LBR_BR_TYPE_NEAR_REL_JMP = 2,
+ ARCH_LBR_BR_TYPE_NEAR_IND_CALL = 3,
+ ARCH_LBR_BR_TYPE_NEAR_REL_CALL = 4,
+ ARCH_LBR_BR_TYPE_NEAR_RET = 5,
+ ARCH_LBR_BR_TYPE_KNOWN_MAX = ARCH_LBR_BR_TYPE_NEAR_RET,
+
+ ARCH_LBR_BR_TYPE_MAP_MAX = 16,
+};
+
+static const int arch_lbr_br_type_map[ARCH_LBR_BR_TYPE_MAP_MAX] = {
+ [ARCH_LBR_BR_TYPE_JCC] = X86_BR_JCC,
+ [ARCH_LBR_BR_TYPE_NEAR_IND_JMP] = X86_BR_IND_JMP,
+ [ARCH_LBR_BR_TYPE_NEAR_REL_JMP] = X86_BR_JMP,
+ [ARCH_LBR_BR_TYPE_NEAR_IND_CALL] = X86_BR_IND_CALL,
+ [ARCH_LBR_BR_TYPE_NEAR_REL_CALL] = X86_BR_CALL,
+ [ARCH_LBR_BR_TYPE_NEAR_RET] = X86_BR_RET,
+};
+
+/*
+ * Implement the actual branch filter based on user demand.
+ * Hardware may not exactly satisfy that request, thus
+ * we need to inspect opcodes. Mismatched branches are
+ * discarded. Therefore, the number of branches returned
+ * in a PERF_SAMPLE_BRANCH_STACK sample may vary.
+ */
+static void
+intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
+{
+ u64 from, to;
+ int br_sel = cpuc->br_sel;
+ int i, j, type, to_plm;
+ bool compress = false;
+
+ /* if sampling all branches, then nothing to filter */
+ if (((br_sel & X86_BR_ALL) == X86_BR_ALL) &&
+ ((br_sel & X86_BR_TYPE_SAVE) != X86_BR_TYPE_SAVE))
+ return;
+
+ for (i = 0; i < cpuc->lbr_stack.nr; i++) {
+
+ from = cpuc->lbr_entries[i].from;
+ to = cpuc->lbr_entries[i].to;
+ type = cpuc->lbr_entries[i].type;
+
+ /*
+		 * Parse the branch type recorded in the LBR_x_INFO MSR.
+		 * OTHER_BRANCH decoding is not supported for now; the
+		 * OTHER_BRANCH branch type still relies on software decoding.
+ */
+ if (static_cpu_has(X86_FEATURE_ARCH_LBR) &&
+ type <= ARCH_LBR_BR_TYPE_KNOWN_MAX) {
+ to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER;
+ type = arch_lbr_br_type_map[type] | to_plm;
+ } else
+ type = branch_type(from, to, cpuc->lbr_entries[i].abort);
+ if (type != X86_BR_NONE && (br_sel & X86_BR_ANYTX)) {
+ if (cpuc->lbr_entries[i].in_tx)
+ type |= X86_BR_IN_TX;
+ else
+ type |= X86_BR_NO_TX;
+ }
+
+ /* if type does not correspond, then discard */
+ if (type == X86_BR_NONE || (br_sel & type) != type) {
+ cpuc->lbr_entries[i].from = 0;
+ compress = true;
+ }
+
+ if ((br_sel & X86_BR_TYPE_SAVE) == X86_BR_TYPE_SAVE)
+ cpuc->lbr_entries[i].type = common_branch_type(type);
+ }
+
+ if (!compress)
+ return;
+
+ /* remove all entries with from=0 */
+ for (i = 0; i < cpuc->lbr_stack.nr; ) {
+ if (!cpuc->lbr_entries[i].from) {
+ j = i;
+ while (++j < cpuc->lbr_stack.nr) {
+ cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j];
+ cpuc->lbr_counters[j-1] = cpuc->lbr_counters[j];
+ }
+ cpuc->lbr_stack.nr--;
+ if (!cpuc->lbr_entries[i].from)
+ continue;
+ }
+ i++;
+ }
+}
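+
+/*
+ * Illustrative run (hypothetical br_sel): with br_sel = X86_BR_KERNEL |
+ * X86_BR_ANY_CALL, a captured stack of { call, jcc, call, ret } keeps
+ * only the two calls; the rejected entries get from = 0 and the
+ * compress pass slides the survivors (and their paired lbr_counters[]
+ * slots) down, shrinking lbr_stack.nr accordingly.
+ */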
+
+void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ /* Cannot get TOS for large PEBS and Arch LBR */
+ if (static_cpu_has(X86_FEATURE_ARCH_LBR) ||
+ (cpuc->n_pebs == cpuc->n_large_pebs))
+ cpuc->lbr_stack.hw_idx = -1ULL;
+ else
+ cpuc->lbr_stack.hw_idx = intel_pmu_lbr_tos();
+
+ intel_pmu_store_lbr(cpuc, lbr);
+ intel_pmu_lbr_filter(cpuc);
+}
+
+/*
+ * Map interface branch filters onto LBR filters
+ */
+static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
+ [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY,
+ [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER,
+ [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL,
+ [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN,
+ [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_REL_JMP
+ | LBR_IND_JMP | LBR_FAR,
+ /*
+ * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches
+ */
+ [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] =
+ LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR,
+ /*
+ * NHM/WSM erratum: must include IND_JMP to capture IND_CALL
+ */
+ [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL | LBR_IND_JMP,
+ [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC,
+ [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP,
+};
+
+static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
+ [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY,
+ [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER,
+ [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL,
+ [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN,
+ [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR,
+ [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL
+ | LBR_FAR,
+ [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL,
+ [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC,
+ [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP,
+ [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL,
+};
+
+static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
+ [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY,
+ [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER,
+ [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL,
+ [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN,
+ [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR,
+ [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL
+ | LBR_FAR,
+ [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL,
+ [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC,
+ [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_REL_CALL | LBR_IND_CALL
+ | LBR_RETURN | LBR_CALL_STACK,
+ [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP,
+ [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL,
+};
+
+static int arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
+ [PERF_SAMPLE_BRANCH_ANY_SHIFT] = ARCH_LBR_ANY,
+ [PERF_SAMPLE_BRANCH_USER_SHIFT] = ARCH_LBR_USER,
+ [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = ARCH_LBR_KERNEL,
+ [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN,
+ [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = ARCH_LBR_RETURN |
+ ARCH_LBR_OTHER_BRANCH,
+ [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = ARCH_LBR_REL_CALL |
+ ARCH_LBR_IND_CALL |
+ ARCH_LBR_OTHER_BRANCH,
+ [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = ARCH_LBR_IND_CALL,
+ [PERF_SAMPLE_BRANCH_COND_SHIFT] = ARCH_LBR_JCC,
+ [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = ARCH_LBR_REL_CALL |
+ ARCH_LBR_IND_CALL |
+ ARCH_LBR_RETURN |
+ ARCH_LBR_CALL_STACK,
+ [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = ARCH_LBR_IND_JMP,
+ [PERF_SAMPLE_BRANCH_CALL_SHIFT] = ARCH_LBR_REL_CALL,
+};
+
+/* core */
+void __init intel_pmu_lbr_init_core(void)
+{
+ x86_pmu.lbr_nr = 4;
+ x86_pmu.lbr_tos = MSR_LBR_TOS;
+ x86_pmu.lbr_from = MSR_LBR_CORE_FROM;
+ x86_pmu.lbr_to = MSR_LBR_CORE_TO;
+
+ /*
+ * SW branch filter usage:
+ * - compensate for lack of HW filter
+ */
+}
+
+/* nehalem/westmere */
+void __init intel_pmu_lbr_init_nhm(void)
+{
+ x86_pmu.lbr_nr = 16;
+ x86_pmu.lbr_tos = MSR_LBR_TOS;
+ x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+ x86_pmu.lbr_to = MSR_LBR_NHM_TO;
+
+ x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+ x86_pmu.lbr_sel_map = nhm_lbr_sel_map;
+
+	/*
+	 * SW branch filter usage:
+	 * - work around the LBR_SEL errata (see above)
+	 * - support syscall, sysret capture.
+	 *   That requires LBR_FAR, but that means far
+	 *   jmps need to be filtered out.
+	 */
+}
+
+/* sandy bridge */
+void __init intel_pmu_lbr_init_snb(void)
+{
+ x86_pmu.lbr_nr = 16;
+ x86_pmu.lbr_tos = MSR_LBR_TOS;
+ x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+ x86_pmu.lbr_to = MSR_LBR_NHM_TO;
+
+ x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+ x86_pmu.lbr_sel_map = snb_lbr_sel_map;
+
+	/*
+	 * SW branch filter usage:
+	 * - support syscall, sysret capture.
+	 *   That requires LBR_FAR, but that means far
+	 *   jmps need to be filtered out.
+	 */
+}
+
+static inline struct kmem_cache *
+create_lbr_kmem_cache(size_t size, size_t align)
+{
+ return kmem_cache_create("x86_lbr", size, align, 0, NULL);
+}
+
+/* haswell */
+void intel_pmu_lbr_init_hsw(void)
+{
+ size_t size = sizeof(struct x86_perf_task_context);
+
+ x86_pmu.lbr_nr = 16;
+ x86_pmu.lbr_tos = MSR_LBR_TOS;
+ x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+ x86_pmu.lbr_to = MSR_LBR_NHM_TO;
+
+ x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+ x86_pmu.lbr_sel_map = hsw_lbr_sel_map;
+
+ x86_get_pmu(smp_processor_id())->task_ctx_cache = create_lbr_kmem_cache(size, 0);
+}
+
+/* skylake */
+__init void intel_pmu_lbr_init_skl(void)
+{
+ size_t size = sizeof(struct x86_perf_task_context);
+
+ x86_pmu.lbr_nr = 32;
+ x86_pmu.lbr_tos = MSR_LBR_TOS;
+ x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+ x86_pmu.lbr_to = MSR_LBR_NHM_TO;
+ x86_pmu.lbr_info = MSR_LBR_INFO_0;
+
+ x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+ x86_pmu.lbr_sel_map = hsw_lbr_sel_map;
+
+ x86_get_pmu(smp_processor_id())->task_ctx_cache = create_lbr_kmem_cache(size, 0);
+
+	/*
+	 * SW branch filter usage:
+	 * - support syscall, sysret capture.
+	 *   That requires LBR_FAR, but that means far
+	 *   jmps need to be filtered out.
+	 */
+}
+
+/* atom */
+void __init intel_pmu_lbr_init_atom(void)
+{
+	/*
+	 * Only models starting at stepping 10 seem
+	 * to have an operational LBR which can freeze
+	 * on a PMU interrupt.
+	 */
+ if (boot_cpu_data.x86_vfm == INTEL_ATOM_BONNELL
+ && boot_cpu_data.x86_stepping < 10) {
+ pr_cont("LBR disabled due to erratum");
+ return;
+ }
+
+ x86_pmu.lbr_nr = 8;
+ x86_pmu.lbr_tos = MSR_LBR_TOS;
+ x86_pmu.lbr_from = MSR_LBR_CORE_FROM;
+ x86_pmu.lbr_to = MSR_LBR_CORE_TO;
+
+ /*
+ * SW branch filter usage:
+ * - compensate for lack of HW filter
+ */
+}
+
+/* slm */
+void __init intel_pmu_lbr_init_slm(void)
+{
+ x86_pmu.lbr_nr = 8;
+ x86_pmu.lbr_tos = MSR_LBR_TOS;
+ x86_pmu.lbr_from = MSR_LBR_CORE_FROM;
+ x86_pmu.lbr_to = MSR_LBR_CORE_TO;
+
+ x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+ x86_pmu.lbr_sel_map = nhm_lbr_sel_map;
+
+ /*
+ * SW branch filter usage:
+ * - compensate for lack of HW filter
+ */
+ pr_cont("8-deep LBR, ");
+}
+
+/* Knights Landing */
+void intel_pmu_lbr_init_knl(void)
+{
+ x86_pmu.lbr_nr = 8;
+ x86_pmu.lbr_tos = MSR_LBR_TOS;
+ x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+ x86_pmu.lbr_to = MSR_LBR_NHM_TO;
+
+ x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+ x86_pmu.lbr_sel_map = snb_lbr_sel_map;
+
+	/* Knights Landing does have the MISPREDICT bit */
+ if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_LIP)
+ x86_pmu.intel_cap.lbr_format = LBR_FORMAT_EIP_FLAGS;
+}
+
+void intel_pmu_lbr_init(void)
+{
+ switch (x86_pmu.intel_cap.lbr_format) {
+ case LBR_FORMAT_EIP_FLAGS2:
+ x86_pmu.lbr_has_tsx = 1;
+ x86_pmu.lbr_from_flags = 1;
+ if (lbr_from_signext_quirk_needed())
+ static_branch_enable(&lbr_from_quirk_key);
+ break;
+
+ case LBR_FORMAT_EIP_FLAGS:
+ x86_pmu.lbr_from_flags = 1;
+ break;
+
+ case LBR_FORMAT_INFO:
+ x86_pmu.lbr_has_tsx = 1;
+ fallthrough;
+ case LBR_FORMAT_INFO2:
+ x86_pmu.lbr_has_info = 1;
+ break;
+
+ case LBR_FORMAT_TIME:
+ x86_pmu.lbr_from_flags = 1;
+ x86_pmu.lbr_to_cycles = 1;
+ break;
+ }
+
+ if (x86_pmu.lbr_has_info) {
+ /*
+		 * Only used in combination with baseline PEBS.
+ */
+ static_branch_enable(&x86_lbr_mispred);
+ static_branch_enable(&x86_lbr_cycles);
+ }
+}
+
+/*
+ * LBR state size is variable based on the max number of registers.
+ * This calculates the expected state size, which should match
+ * what the hardware enumerates for the size of XFEATURE_LBR.
+ */
+static inline unsigned int get_lbr_state_size(void)
+{
+ return sizeof(struct arch_lbr_state) +
+ x86_pmu.lbr_nr * sizeof(struct lbr_entry);
+}
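+
+/*
+ * Sanity check of the arithmetic (illustrative numbers, assuming the
+ * three-u64, 24-byte struct lbr_entry): with lbr_nr = 32 the expected
+ * XFEATURE_LBR size is sizeof(struct arch_lbr_state) + 32 * 24 bytes,
+ * which must match what CPUID enumerates for the LBR xsave component.
+ */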
+
+static bool is_arch_lbr_xsave_available(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_XSAVES))
+ return false;
+
+ /*
+ * Check the LBR state with the corresponding software structure.
+ * Disable LBR XSAVES support if the size doesn't match.
+ */
+ if (xfeature_size(XFEATURE_LBR) == 0)
+ return false;
+
+ if (WARN_ON(xfeature_size(XFEATURE_LBR) != get_lbr_state_size()))
+ return false;
+
+ return true;
+}
+
+void __init intel_pmu_arch_lbr_init(void)
+{
+ struct pmu *pmu = x86_get_pmu(smp_processor_id());
+ union cpuid28_eax eax;
+ union cpuid28_ebx ebx;
+ union cpuid28_ecx ecx;
+ unsigned int unused_edx;
+ bool arch_lbr_xsave;
+ size_t size;
+ u64 lbr_nr;
+
+ /* Arch LBR Capabilities */
+ cpuid(28, &eax.full, &ebx.full, &ecx.full, &unused_edx);
+
+ lbr_nr = fls(eax.split.lbr_depth_mask) * 8;
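+	/*
+	 * CPUID leaf 0x1c reports the supported depths as a bitmask in
+	 * which bit n set means a depth of 8 * (n + 1) is supported; an
+	 * illustrative mask of 0x7 advertises depths 8, 16 and 24, and
+	 * fls(0x7) * 8 = 24 selects the deepest one.
+	 */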
+ if (!lbr_nr)
+ goto clear_arch_lbr;
+
+ /* Apply the max depth of Arch LBR */
+ if (wrmsrq_safe(MSR_ARCH_LBR_DEPTH, lbr_nr))
+ goto clear_arch_lbr;
+
+ x86_pmu.lbr_depth_mask = eax.split.lbr_depth_mask;
+ x86_pmu.lbr_deep_c_reset = eax.split.lbr_deep_c_reset;
+ x86_pmu.lbr_lip = eax.split.lbr_lip;
+ x86_pmu.lbr_cpl = ebx.split.lbr_cpl;
+ x86_pmu.lbr_filter = ebx.split.lbr_filter;
+ x86_pmu.lbr_call_stack = ebx.split.lbr_call_stack;
+ x86_pmu.lbr_mispred = ecx.split.lbr_mispred;
+ x86_pmu.lbr_timed_lbr = ecx.split.lbr_timed_lbr;
+ x86_pmu.lbr_br_type = ecx.split.lbr_br_type;
+ x86_pmu.lbr_counters = ecx.split.lbr_counters;
+ x86_pmu.lbr_nr = lbr_nr;
+
+ if (!!x86_pmu.lbr_counters)
+ x86_pmu.flags |= PMU_FL_BR_CNTR | PMU_FL_DYN_CONSTRAINT;
+
+ if (x86_pmu.lbr_mispred)
+ static_branch_enable(&x86_lbr_mispred);
+ if (x86_pmu.lbr_timed_lbr)
+ static_branch_enable(&x86_lbr_cycles);
+ if (x86_pmu.lbr_br_type)
+ static_branch_enable(&x86_lbr_type);
+
+ arch_lbr_xsave = is_arch_lbr_xsave_available();
+ if (arch_lbr_xsave) {
+ size = sizeof(struct x86_perf_task_context_arch_lbr_xsave) +
+ get_lbr_state_size();
+ pmu->task_ctx_cache = create_lbr_kmem_cache(size,
+ XSAVE_ALIGNMENT);
+ }
+
+ if (!pmu->task_ctx_cache) {
+ arch_lbr_xsave = false;
+
+ size = sizeof(struct x86_perf_task_context_arch_lbr) +
+ lbr_nr * sizeof(struct lbr_entry);
+ pmu->task_ctx_cache = create_lbr_kmem_cache(size, 0);
+ }
+
+ x86_pmu.lbr_from = MSR_ARCH_LBR_FROM_0;
+ x86_pmu.lbr_to = MSR_ARCH_LBR_TO_0;
+ x86_pmu.lbr_info = MSR_ARCH_LBR_INFO_0;
+
+ /* LBR callstack requires both CPL and Branch Filtering support */
+ if (!x86_pmu.lbr_cpl ||
+ !x86_pmu.lbr_filter ||
+ !x86_pmu.lbr_call_stack)
+ arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_NOT_SUPP;
+
+ if (!x86_pmu.lbr_cpl) {
+ arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_NOT_SUPP;
+ arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_NOT_SUPP;
+ } else if (!x86_pmu.lbr_filter) {
+ arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_NOT_SUPP;
+ arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_NOT_SUPP;
+ arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_NOT_SUPP;
+ arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_NOT_SUPP;
+ arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_NOT_SUPP;
+ arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_NOT_SUPP;
+ arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_NOT_SUPP;
+ }
+
+ x86_pmu.lbr_ctl_mask = ARCH_LBR_CTL_MASK;
+ x86_pmu.lbr_ctl_map = arch_lbr_ctl_map;
+
+ if (!x86_pmu.lbr_cpl && !x86_pmu.lbr_filter)
+ x86_pmu.lbr_ctl_map = NULL;
+
+ x86_pmu.lbr_reset = intel_pmu_arch_lbr_reset;
+ if (arch_lbr_xsave) {
+ x86_pmu.lbr_save = intel_pmu_arch_lbr_xsaves;
+ x86_pmu.lbr_restore = intel_pmu_arch_lbr_xrstors;
+ x86_pmu.lbr_read = intel_pmu_arch_lbr_read_xsave;
+ pr_cont("XSAVE ");
+ } else {
+ x86_pmu.lbr_save = intel_pmu_arch_lbr_save;
+ x86_pmu.lbr_restore = intel_pmu_arch_lbr_restore;
+ x86_pmu.lbr_read = intel_pmu_arch_lbr_read;
+ }
+
+ pr_cont("Architectural LBR, ");
+
+ return;
+
+clear_arch_lbr:
+ setup_clear_cpu_cap(X86_FEATURE_ARCH_LBR);
+}
+
+/**
+ * x86_perf_get_lbr - get the LBR records information
+ *
+ * @lbr: the caller's memory to store the LBR records information
+ */
+void x86_perf_get_lbr(struct x86_pmu_lbr *lbr)
+{
+ lbr->nr = x86_pmu.lbr_nr;
+ lbr->from = x86_pmu.lbr_from;
+ lbr->to = x86_pmu.lbr_to;
+ lbr->info = x86_pmu.lbr_info;
+ lbr->has_callstack = x86_pmu_has_lbr_callstack();
+}
+EXPORT_SYMBOL_FOR_KVM(x86_perf_get_lbr);
+
+struct event_constraint vlbr_constraint =
+ __EVENT_CONSTRAINT(INTEL_FIXED_VLBR_EVENT, (1ULL << INTEL_PMC_IDX_FIXED_VLBR),
+ FIXED_EVENT_FLAGS, 1, 0, PERF_X86_EVENT_LBR_SELECT);
diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c
new file mode 100644
index 000000000000..e5fd7367e45d
--- /dev/null
+++ b/arch/x86/events/intel/p4.c
@@ -0,0 +1,1405 @@
+/*
+ * Netburst Performance Events (P4, old Xeon)
+ *
+ * Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org>
+ * Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com>
+ *
+ * For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_event.h>
+
+#include <asm/perf_event_p4.h>
+#include <asm/cpu_device_id.h>
+#include <asm/hardirq.h>
+#include <asm/apic.h>
+#include <asm/msr.h>
+
+#include "../perf_event.h"
+
+#define P4_CNTR_LIMIT 3
+/*
+ * array indices: 0,1 - HT threads; used on an HT-enabled cpu
+ */
+struct p4_event_bind {
+ unsigned int opcode; /* Event code and ESCR selector */
+ unsigned int escr_msr[2]; /* ESCR MSR for this event */
+ unsigned int escr_emask; /* valid ESCR EventMask bits */
+ unsigned int shared; /* event is shared across threads */
+ signed char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on absence */
+};
+
+struct p4_pebs_bind {
+ unsigned int metric_pebs;
+ unsigned int metric_vert;
+};
+
+/* it sets P4_PEBS_ENABLE_UOP_TAG as well */
+#define P4_GEN_PEBS_BIND(name, pebs, vert) \
+ [P4_PEBS_METRIC__##name] = { \
+ .metric_pebs = pebs | P4_PEBS_ENABLE_UOP_TAG, \
+ .metric_vert = vert, \
+ }
+
+/*
+ * Note that we have P4_PEBS_ENABLE_UOP_TAG always set here.
+ *
+ * It's needed for mapping the P4_PEBS_CONFIG_METRIC_MASK bits of
+ * the event configuration to the values that are to be written
+ * into the MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT
+ * registers.
+ */
+static struct p4_pebs_bind p4_pebs_bind_map[] = {
+ P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired, 0x0000001, 0x0000001),
+ P4_GEN_PEBS_BIND(2ndl_cache_load_miss_retired, 0x0000002, 0x0000001),
+ P4_GEN_PEBS_BIND(dtlb_load_miss_retired, 0x0000004, 0x0000001),
+ P4_GEN_PEBS_BIND(dtlb_store_miss_retired, 0x0000004, 0x0000002),
+ P4_GEN_PEBS_BIND(dtlb_all_miss_retired, 0x0000004, 0x0000003),
+ P4_GEN_PEBS_BIND(tagged_mispred_branch, 0x0018000, 0x0000010),
+ P4_GEN_PEBS_BIND(mob_load_replay_retired, 0x0000200, 0x0000001),
+ P4_GEN_PEBS_BIND(split_load_retired, 0x0000400, 0x0000001),
+ P4_GEN_PEBS_BIND(split_store_retired, 0x0000400, 0x0000002),
+};
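+
+/*
+ * Illustrative use: an event whose config carries
+ * P4_PEBS_METRIC__dtlb_store_miss_retired makes p4_pmu_enable_pebs()
+ * (below) write 0x0000004 | P4_PEBS_ENABLE_UOP_TAG into
+ * MSR_IA32_PEBS_ENABLE and 0x0000002 into MSR_P4_PEBS_MATRIX_VERT.
+ */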
+
+/*
+ * Note that we don't use CCCR1 here; there is an
+ * exception for P4_BSQ_ALLOCATION, but we just have
+ * no workaround for it.
+ *
+ * Consider this binding as the resources a particular
+ * event may borrow; it doesn't contain the EventMask,
+ * Tags and friends -- they are left to the caller.
+ */
+static struct p4_event_bind p4_event_bind_map[] = {
+ [P4_EVENT_TC_DELIVER_MODE] = {
+ .opcode = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE),
+ .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DB) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DI) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BD) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BB) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BI) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, ID),
+ .shared = 1,
+ .cntr = { {4, 5, -1}, {6, 7, -1} },
+ },
+ [P4_EVENT_BPU_FETCH_REQUEST] = {
+ .opcode = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST),
+ .escr_msr = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_BPU_FETCH_REQUEST, TCMISS),
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_ITLB_REFERENCE] = {
+ .opcode = P4_OPCODE(P4_EVENT_ITLB_REFERENCE),
+ .escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, MISS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT_UK),
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_MEMORY_CANCEL] = {
+ .opcode = P4_OPCODE(P4_EVENT_MEMORY_CANCEL),
+ .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, ST_RB_FULL) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, 64K_CONF),
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_MEMORY_COMPLETE] = {
+ .opcode = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE),
+		.escr_msr	= { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, LSC) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, SSC),
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_LOAD_PORT_REPLAY] = {
+ .opcode = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY),
+ .escr_msr = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_LOAD_PORT_REPLAY, SPLIT_LD),
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_STORE_PORT_REPLAY] = {
+ .opcode = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY),
+		.escr_msr	= { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_STORE_PORT_REPLAY, SPLIT_ST),
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_MOB_LOAD_REPLAY] = {
+ .opcode = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY),
+ .escr_msr = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STA) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STD) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, PARTIAL_DATA) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, UNALGN_ADDR),
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_PAGE_WALK_TYPE] = {
+ .opcode = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE),
+ .escr_msr = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, DTMISS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, ITMISS),
+ .shared = 1,
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_BSQ_CACHE_REFERENCE] = {
+ .opcode = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE),
+ .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS),
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_IOQ_ALLOCATION] = {
+ .opcode = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION),
+ .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, DEFAULT) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_READ) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_WRITE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_UC) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WC) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WT) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WP) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WB) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OWN) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OTHER) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, PREFETCH),
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_IOQ_ACTIVE_ENTRIES] = { /* shared ESCR */
+ .opcode = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES),
+ .escr_msr = { MSR_P4_FSB_ESCR1, MSR_P4_FSB_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, DEFAULT) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_READ) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_WRITE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_UC) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WC) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WT) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WP) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WB) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OWN) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OTHER) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, PREFETCH),
+ .cntr = { {2, -1, -1}, {3, -1, -1} },
+ },
+ [P4_EVENT_FSB_DATA_ACTIVITY] = {
+ .opcode = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY),
+ .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OTHER) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_DRV) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OWN) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OTHER),
+ .shared = 1,
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_BSQ_ALLOCATION] = { /* shared ESCR, broken CCCR1 */
+ .opcode = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION),
+ .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_IO_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LOCK_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_CACHE_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_SPLIT_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_DEM_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_ORD_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE2),
+ .cntr = { {0, -1, -1}, {1, -1, -1} },
+ },
+ [P4_EVENT_BSQ_ACTIVE_ENTRIES] = { /* shared ESCR */
+ .opcode = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES),
+		.escr_msr	= { MSR_P4_BSU_ESCR1, MSR_P4_BSU_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE2),
+ .cntr = { {2, -1, -1}, {3, -1, -1} },
+ },
+ [P4_EVENT_SSE_INPUT_ASSIST] = {
+ .opcode = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST),
+ .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_SSE_INPUT_ASSIST, ALL),
+ .shared = 1,
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_PACKED_SP_UOP] = {
+ .opcode = P4_OPCODE(P4_EVENT_PACKED_SP_UOP),
+ .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_SP_UOP, ALL),
+ .shared = 1,
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_PACKED_DP_UOP] = {
+ .opcode = P4_OPCODE(P4_EVENT_PACKED_DP_UOP),
+ .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_DP_UOP, ALL),
+ .shared = 1,
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_SCALAR_SP_UOP] = {
+ .opcode = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP),
+ .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_SP_UOP, ALL),
+ .shared = 1,
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_SCALAR_DP_UOP] = {
+ .opcode = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP),
+ .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_DP_UOP, ALL),
+ .shared = 1,
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_64BIT_MMX_UOP] = {
+ .opcode = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP),
+ .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_64BIT_MMX_UOP, ALL),
+ .shared = 1,
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_128BIT_MMX_UOP] = {
+ .opcode = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP),
+ .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_128BIT_MMX_UOP, ALL),
+ .shared = 1,
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_X87_FP_UOP] = {
+ .opcode = P4_OPCODE(P4_EVENT_X87_FP_UOP),
+ .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_X87_FP_UOP, ALL),
+ .shared = 1,
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_TC_MISC] = {
+ .opcode = P4_OPCODE(P4_EVENT_TC_MISC),
+ .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_MISC, FLUSH),
+ .cntr = { {4, 5, -1}, {6, 7, -1} },
+ },
+ [P4_EVENT_GLOBAL_POWER_EVENTS] = {
+ .opcode = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS),
+ .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING),
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_TC_MS_XFER] = {
+ .opcode = P4_OPCODE(P4_EVENT_TC_MS_XFER),
+ .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_MS_XFER, CISC),
+ .cntr = { {4, 5, -1}, {6, 7, -1} },
+ },
+ [P4_EVENT_UOP_QUEUE_WRITES] = {
+ .opcode = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES),
+ .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_BUILD) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_DELIVER) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_ROM),
+ .cntr = { {4, 5, -1}, {6, 7, -1} },
+ },
+ [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = {
+ .opcode = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE),
+		.escr_msr	= { MSR_P4_TBPU_ESCR0, MSR_P4_TBPU_ESCR0 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CALL) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, RETURN) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT),
+ .cntr = { {4, 5, -1}, {6, 7, -1} },
+ },
+ [P4_EVENT_RETIRED_BRANCH_TYPE] = {
+ .opcode = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE),
+		.escr_msr	= { MSR_P4_TBPU_ESCR0, MSR_P4_TBPU_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT),
+ .cntr = { {4, 5, -1}, {6, 7, -1} },
+ },
+ [P4_EVENT_RESOURCE_STALL] = {
+ .opcode = P4_OPCODE(P4_EVENT_RESOURCE_STALL),
+ .escr_msr = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_RESOURCE_STALL, SBFULL),
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_WC_BUFFER] = {
+ .opcode = P4_OPCODE(P4_EVENT_WC_BUFFER),
+ .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_EVICTS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_FULL_EVICTS),
+ .shared = 1,
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_B2B_CYCLES] = {
+ .opcode = P4_OPCODE(P4_EVENT_B2B_CYCLES),
+ .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .escr_emask = 0,
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_BNR] = {
+ .opcode = P4_OPCODE(P4_EVENT_BNR),
+ .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .escr_emask = 0,
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_SNOOP] = {
+ .opcode = P4_OPCODE(P4_EVENT_SNOOP),
+ .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .escr_emask = 0,
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_RESPONSE] = {
+ .opcode = P4_OPCODE(P4_EVENT_RESPONSE),
+ .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .escr_emask = 0,
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_FRONT_END_EVENT] = {
+ .opcode = P4_OPCODE(P4_EVENT_FRONT_END_EVENT),
+ .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, NBOGUS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, BOGUS),
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_EXECUTION_EVENT] = {
+ .opcode = P4_OPCODE(P4_EVENT_EXECUTION_EVENT),
+ .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3),
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_REPLAY_EVENT] = {
+ .opcode = P4_OPCODE(P4_EVENT_REPLAY_EVENT),
+ .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, NBOGUS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, BOGUS),
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_INSTR_RETIRED] = {
+ .opcode = P4_OPCODE(P4_EVENT_INSTR_RETIRED),
+ .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSTAG) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSTAG),
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_UOPS_RETIRED] = {
+ .opcode = P4_OPCODE(P4_EVENT_UOPS_RETIRED),
+ .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, NBOGUS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, BOGUS),
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_UOP_TYPE] = {
+ .opcode = P4_OPCODE(P4_EVENT_UOP_TYPE),
+ .escr_msr = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGLOADS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGSTORES),
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_BRANCH_RETIRED] = {
+ .opcode = P4_OPCODE(P4_EVENT_BRANCH_RETIRED),
+ .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNP) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNM) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTP) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTM),
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_MISPRED_BRANCH_RETIRED] = {
+ .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED),
+ .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS),
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_X87_ASSIST] = {
+ .opcode = P4_OPCODE(P4_EVENT_X87_ASSIST),
+ .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSU) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSO) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAO) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAU) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, PREA),
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_MACHINE_CLEAR] = {
+ .opcode = P4_OPCODE(P4_EVENT_MACHINE_CLEAR),
+ .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, CLEAR) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, MOCLEAR) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, SMCLEAR),
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_INSTR_COMPLETED] = {
+ .opcode = P4_OPCODE(P4_EVENT_INSTR_COMPLETED),
+ .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, NBOGUS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, BOGUS),
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+};
+
+#define P4_GEN_CACHE_EVENT(event, bit, metric) \
+ p4_config_pack_escr(P4_ESCR_EVENT(event) | \
+ P4_ESCR_EMASK_BIT(event, bit)) | \
+ p4_config_pack_cccr(metric | \
+ P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event))))
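+
+/*
+ * Illustrative expansion: P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE,
+ * MISS, P4_PEBS_METRIC__none) packs the ITLB_REFERENCE event code and
+ * the MISS EventMask bit into the ESCR half of the config, and the
+ * (empty) PEBS metric plus the ESCR selector derived from the opcode
+ * into the CCCR half.
+ */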
+
+static __initconst const u64 p4_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+ P4_PEBS_METRIC__1stl_cache_load_miss_retired),
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+ P4_PEBS_METRIC__2ndl_cache_load_miss_retired),
+ },
+	},
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+ P4_PEBS_METRIC__dtlb_load_miss_retired),
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+ P4_PEBS_METRIC__dtlb_store_miss_retired),
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT,
+ P4_PEBS_METRIC__none),
+ [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS,
+ P4_PEBS_METRIC__none),
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+/*
+ * Because Netburst is quite restricted in how many
+ * identical events may run simultaneously, we introduce event aliases,
+ * ie different events which have the same functionality but
+ * utilize non-intersecting resources (ESCR/CCCR/counter registers).
+ *
+ * This allows us to relax the restrictions a bit and run two or more
+ * identical events together.
+ *
+ * Never set any custom internal bits such as P4_CONFIG_HT,
+ * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC; they are
+ * either kept up to date automatically or not applicable at all.
+ */
+static struct p4_event_alias {
+ u64 original;
+ u64 alternative;
+} p4_event_aliases[] = {
+ {
+ /*
+ * Non-halted cycles can be substituted with non-sleeping cycles (see
+ * Intel SDM Vol3b for details). We need this alias to be able
+ * to run nmi-watchdog and 'perf top' (or any other user space tool
+ * which is interested in running PERF_COUNT_HW_CPU_CYCLES)
+ * simultaneously.
+ */
+ .original =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)),
+ .alternative =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)|
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)|
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)|
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)|
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3))|
+ p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT |
+ P4_CCCR_COMPARE),
+ },
+};
+
+static u64 p4_get_alias_event(u64 config)
+{
+ u64 config_match;
+ int i;
+
+	/*
+	 * Only an event with the special mark is allowed;
+	 * this makes sure it didn't come in as a malformed
+	 * RAW event.
+	 */
+ if (!(config & P4_CONFIG_ALIASABLE))
+ return 0;
+
+ config_match = config & P4_CONFIG_EVENT_ALIAS_MASK;
+
+ for (i = 0; i < ARRAY_SIZE(p4_event_aliases); i++) {
+ if (config_match == p4_event_aliases[i].original) {
+ config_match = p4_event_aliases[i].alternative;
+ break;
+ } else if (config_match == p4_event_aliases[i].alternative) {
+ config_match = p4_event_aliases[i].original;
+ break;
+ }
+ }
+
+ if (i >= ARRAY_SIZE(p4_event_aliases))
+ return 0;
+
+ return config_match | (config & P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS);
+}
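+
+/*
+ * Usage sketch (illustrative, mirroring p4_pmu_schedule_events()
+ * below): when the scheduler runs out of counters for an ALIASABLE
+ * cycles event it does, in essence,
+ *
+ *	config = p4_get_alias_event(hwc->config);
+ *	if (config)
+ *		hwc->config = config;	(then retry the schedule)
+ *
+ * swapping the GLOBAL_POWER_EVENTS encoding for the equivalent
+ * EXECUTION_EVENT one (or back), while the IMMUTABLE bits preserve
+ * flags such as the HT thread bit.
+ */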
+
+static u64 p4_general_events[PERF_COUNT_HW_MAX] = {
+ /* non-halted CPU clocks */
+ [PERF_COUNT_HW_CPU_CYCLES] =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)) |
+ P4_CONFIG_ALIASABLE,
+
+	/*
+	 * retired instructions
+	 * for the sake of simplicity we don't use FSB tagging
+	 */
+ [PERF_COUNT_HW_INSTRUCTIONS] =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_INSTR_RETIRED) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)),
+
+ /* cache hits */
+ [PERF_COUNT_HW_CACHE_REFERENCES] =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM)),
+
+ /* cache misses */
+ [PERF_COUNT_HW_CACHE_MISSES] =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS)),
+
+ /* branch instructions retired */
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_RETIRED_BRANCH_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT)),
+
+ /* mispredicted branches retired */
+ [PERF_COUNT_HW_BRANCH_MISSES] =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_MISPRED_BRANCH_RETIRED) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS)),
+
+ /* bus ready clocks (cpu is driving #DRDY_DRV\#DRDY_OWN): */
+ [PERF_COUNT_HW_BUS_CYCLES] =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_FSB_DATA_ACTIVITY) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN)) |
+ p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE),
+};
+
+static struct p4_event_bind *p4_config_get_bind(u64 config)
+{
+ unsigned int evnt = p4_config_unpack_event(config);
+ struct p4_event_bind *bind = NULL;
+
+ if (evnt < ARRAY_SIZE(p4_event_bind_map))
+ bind = &p4_event_bind_map[evnt];
+
+ return bind;
+}
+
+static u64 p4_pmu_event_map(int hw_event)
+{
+ struct p4_event_bind *bind;
+ unsigned int esel;
+ u64 config;
+
+ config = p4_general_events[hw_event];
+ bind = p4_config_get_bind(config);
+ esel = P4_OPCODE_ESEL(bind->opcode);
+ config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel));
+
+ return config;
+}
+
+/* check cpu model specifics */
+static bool p4_event_match_cpu_model(unsigned int event_idx)
+{
+	/* The INSTR_COMPLETED event only exists for models 3, 4 and 6 (Prescott) */
+ if (event_idx == P4_EVENT_INSTR_COMPLETED) {
+ if (boot_cpu_data.x86_vfm != INTEL_P4_PRESCOTT &&
+ boot_cpu_data.x86_vfm != INTEL_P4_PRESCOTT_2M &&
+ boot_cpu_data.x86_vfm != INTEL_P4_CEDARMILL)
+ return false;
+ }
+
+	/*
+	 * For info:
+	 * - IQ_ESCR0 and IQ_ESCR1 exist only for models 1 and 2
+	 */
+
+ return true;
+}
+
+static int p4_validate_raw_event(struct perf_event *event)
+{
+ unsigned int v, emask;
+
+	/* User data may have an out-of-bounds event index */
+ v = p4_config_unpack_event(event->attr.config);
+ if (v >= ARRAY_SIZE(p4_event_bind_map))
+ return -EINVAL;
+
+ /* It may be unsupported: */
+ if (!p4_event_match_cpu_model(v))
+ return -EINVAL;
+
+	/*
+	 * NOTE: P4_CCCR_THREAD_ANY does not have the same meaning as
+	 * in Architectural Performance Monitoring: it selects not
+	 * _which_ logical cpu to count on but rather _when_ to count,
+	 * ie it depends on the logical cpu state -- count the event if
+	 * one cpu is active, none, both or any. So we just allow the
+	 * user to pass any value desired.
+	 *
+	 * In turn we always set the Tx_OS/Tx_USR bits bound to the
+	 * logical cpu, without propagating them to the other cpu.
+	 */
+
+	/*
+	 * If an event is shared across the logical threads,
+	 * the user needs special permissions to be able to use it.
+	 */
+ if (p4_ht_active() && p4_event_bind_map[v].shared) {
+ v = perf_allow_cpu();
+ if (v)
+ return v;
+ }
+
+ /* ESCR EventMask bits may be invalid */
+ emask = p4_config_unpack_escr(event->attr.config) & P4_ESCR_EVENTMASK_MASK;
+ if (emask & ~p4_event_bind_map[v].escr_emask)
+ return -EINVAL;
+
+ /*
+ * it may have some invalid PEBS bits
+ */
+ if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE))
+ return -EINVAL;
+
+ v = p4_config_unpack_metric(event->attr.config);
+ if (v >= ARRAY_SIZE(p4_pebs_bind_map))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int p4_hw_config(struct perf_event *event)
+{
+ int cpu = get_cpu();
+ int rc = 0;
+ u32 escr, cccr;
+
+	/*
+	 * The reason we grab the cpu this early is that if we get
+	 * scheduled for the first time on the same cpu, we will not
+	 * need to swap the thread-specific flags in the config
+	 * (and will save some cpu cycles).
+	 */
+
+ cccr = p4_default_cccr_conf(cpu);
+ escr = p4_default_escr_conf(cpu, event->attr.exclude_kernel,
+ event->attr.exclude_user);
+ event->hw.config = p4_config_pack_escr(escr) |
+ p4_config_pack_cccr(cccr);
+
+ if (p4_ht_active() && p4_ht_thread(cpu))
+ event->hw.config = p4_set_ht_bit(event->hw.config);
+
+ if (event->attr.type == PERF_TYPE_RAW) {
+ struct p4_event_bind *bind;
+ unsigned int esel;
+		/*
+		 * Clear bits we reserve to be managed by the kernel itself
+		 * and never allow from user space.
+		 */
+ event->attr.config &= P4_CONFIG_MASK;
+
+ rc = p4_validate_raw_event(event);
+ if (rc)
+ goto out;
+
+		/*
+		 * Note that for RAW events we allow the user to use P4_CCCR_RESERVED
+		 * bits, since we keep additional info there (for cache events etc.)
+		 */
+ event->hw.config |= event->attr.config;
+ bind = p4_config_get_bind(event->attr.config);
+ if (!bind) {
+ rc = -EINVAL;
+ goto out;
+ }
+ esel = P4_OPCODE_ESEL(bind->opcode);
+ event->hw.config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel));
+ }
+
+ rc = x86_setup_perfctr(event);
+out:
+ put_cpu();
+ return rc;
+}
+
+static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
+{
+ u64 v;
+
+	/* the official way of indicating an overflow */
+ rdmsrq(hwc->config_base, v);
+ if (v & P4_CCCR_OVF) {
+ wrmsrq(hwc->config_base, v & ~P4_CCCR_OVF);
+ return 1;
+ }
+
+	/*
+	 * In some circumstances the overflow might issue an NMI without
+	 * setting the P4_CCCR_OVF bit. Because a counter holds a negative
+	 * value, we simply check for the high bit being set; if it's
+	 * cleared, it means the counter has reached zero and continued
+	 * counting before the real NMI signal was received:
+	 */
+ rdmsrq(hwc->event_base, v);
+ if (!(v & ARCH_P4_UNFLAGGED_BIT))
+ return 1;
+
+ return 0;
+}
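+
+/*
+ * Illustrative (assuming the 40-bit Netburst counters): a counter
+ * programmed with a period of 1000 starts at -1000 with the top
+ * counter bit (ARCH_P4_UNFLAGGED_BIT) set; once it counts up through
+ * zero that bit reads back clear, so a clear bit is treated as an
+ * overflow even when P4_CCCR_OVF was never set.
+ */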
+
+static void p4_pmu_disable_pebs(void)
+{
+	/*
+	 * FIXME
+	 *
+	 * Two threads are still allowed to set up the same cache
+	 * events, so we can't simply clear the metrics until we know
+	 * no one is depending on us; we would need some kind of
+	 * refcount for "ReplayEvent" users.
+	 *
+	 * What is more complex are RAW events: if the user (for some
+	 * reason) passes a cache event metric with an improper event
+	 * opcode, it's fine from the hardware's point of view but
+	 * complete nonsense as far as the meaning of such an action
+	 * goes.
+	 *
+	 * So for the moment leave the metrics turned on forever -- it's
+	 * OK for now but needs to be revisited!
+	 *
+	 * (void)wrmsrq_safe(MSR_IA32_PEBS_ENABLE, 0);
+	 * (void)wrmsrq_safe(MSR_P4_PEBS_MATRIX_VERT, 0);
+	 */
+}
+
+static inline void p4_pmu_disable_event(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+	/*
+	 * If the event gets disabled while the counter is in the
+	 * overflowed state, we need to clear P4_CCCR_OVF, otherwise
+	 * the interrupt gets asserted again and again.
+	 */
+ (void)wrmsrq_safe(hwc->config_base,
+ p4_config_unpack_cccr(hwc->config) & ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED);
+}
+
+static void p4_pmu_disable_all(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int idx;
+
+ for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
+ struct perf_event *event = cpuc->events[idx];
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+ p4_pmu_disable_event(event);
+ }
+
+ p4_pmu_disable_pebs();
+}
+
+/* configuration must be valid */
+static void p4_pmu_enable_pebs(u64 config)
+{
+ struct p4_pebs_bind *bind;
+ unsigned int idx;
+
+ BUILD_BUG_ON(P4_PEBS_METRIC__max > P4_PEBS_CONFIG_METRIC_MASK);
+
+ idx = p4_config_unpack_metric(config);
+ if (idx == P4_PEBS_METRIC__none)
+ return;
+
+ bind = &p4_pebs_bind_map[idx];
+
+ (void)wrmsrq_safe(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs);
+ (void)wrmsrq_safe(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert);
+}
+
+static void __p4_pmu_enable_event(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ int thread = p4_ht_config_thread(hwc->config);
+ u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config));
+ unsigned int idx = p4_config_unpack_event(hwc->config);
+ struct p4_event_bind *bind;
+ u64 escr_addr, cccr;
+
+ bind = &p4_event_bind_map[idx];
+ escr_addr = bind->escr_msr[thread];
+
+	/*
+	 * - we don't support cascaded counters yet
+	 * - and counter 1 is broken (erratum)
+	 */
+ WARN_ON_ONCE(p4_is_event_cascaded(hwc->config));
+ WARN_ON_ONCE(hwc->idx == 1);
+
+ /* we need a real Event value */
+ escr_conf &= ~P4_ESCR_EVENT_MASK;
+ escr_conf |= P4_ESCR_EVENT(P4_OPCODE_EVNT(bind->opcode));
+
+ cccr = p4_config_unpack_cccr(hwc->config);
+
+	/*
+	 * It could be a cache event, so we need to write the metrics
+	 * into the additional MSRs.
+	 */
+ p4_pmu_enable_pebs(hwc->config);
+
+ (void)wrmsrq_safe(escr_addr, escr_conf);
+ (void)wrmsrq_safe(hwc->config_base,
+ (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
+}
+
+static DEFINE_PER_CPU(unsigned long [BITS_TO_LONGS(X86_PMC_IDX_MAX)], p4_running);
+
+static void p4_pmu_enable_event(struct perf_event *event)
+{
+ int idx = event->hw.idx;
+
+ __set_bit(idx, per_cpu(p4_running, smp_processor_id()));
+ __p4_pmu_enable_event(event);
+}
+
+static void p4_pmu_enable_all(int added)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int idx;
+
+ for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
+ struct perf_event *event = cpuc->events[idx];
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+ __p4_pmu_enable_event(event);
+ }
+}
+
+static int p4_pmu_set_period(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ s64 left = this_cpu_read(pmc_prev_left[hwc->idx]);
+ int ret;
+
+ ret = x86_perf_event_set_period(event);
+
+ if (hwc->event_base) {
+		/*
+		 * This handles erratum N15 in Intel doc 249199-029:
+		 * the counter may not be updated correctly on write,
+		 * so we need a second write operation to do the trick
+		 * (the official workaround didn't work).
+		 *
+		 * The idea is taken from the OProfile code.
+		 */
+ wrmsrq(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
+ }
+
+ return ret;
+}
+
+static int p4_pmu_handle_irq(struct pt_regs *regs)
+{
+ struct perf_sample_data data;
+ struct cpu_hw_events *cpuc;
+ struct perf_event *event;
+ struct hw_perf_event *hwc;
+ int idx, handled = 0;
+ u64 val;
+
+ cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
+ int overflow;
+
+ if (!test_bit(idx, cpuc->active_mask)) {
+ /* catch in-flight IRQs */
+ if (__test_and_clear_bit(idx, per_cpu(p4_running, smp_processor_id())))
+ handled++;
+ continue;
+ }
+
+ event = cpuc->events[idx];
+ hwc = &event->hw;
+
+ WARN_ON_ONCE(hwc->idx != idx);
+
+		/* it might be an unflagged overflow */
+ overflow = p4_pmu_clear_cccr_ovf(hwc);
+
+ val = x86_perf_event_update(event);
+ if (!overflow && (val & (1ULL << (x86_pmu.cntval_bits - 1))))
+ continue;
+
+ handled += overflow;
+
+ /* event overflow for sure */
+ perf_sample_data_init(&data, 0, hwc->last_period);
+
+ if (!static_call(x86_pmu_set_period)(event))
+ continue;
+
+
+ perf_event_overflow(event, &data, regs);
+ }
+
+ if (handled)
+ inc_irq_stat(apic_perf_irqs);
+
+ /*
+ * When dealing with the unmasking of the LVTPC on P4 perf hw, it has
+ * been observed that the OVF bit flag has to be cleared first _before_
+ * the LVTPC can be unmasked.
+ *
+ * The reason is the NMI line will continue to be asserted while the OVF
+ * bit is set. This causes a second NMI to generate if the LVTPC is
+ * unmasked before the OVF bit is cleared, leading to unknown NMI
+ * messages.
+ */
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+
+ return handled;
+}
+
+/*
+ * Swap the thread-specific fields according to the thread
+ * we are going to run on.
+ */
+static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu)
+{
+ u32 escr, cccr;
+
+	/*
+	 * Either we are lucky and continue on the same cpu, or there
+	 * is no HT support.
+	 */
+ if (!p4_should_swap_ts(hwc->config, cpu))
+ return;
+
+	/*
+	 * The event migrated from another logical cpu,
+	 * so we need to swap the thread-specific flags.
+	 */
+
+ escr = p4_config_unpack_escr(hwc->config);
+ cccr = p4_config_unpack_cccr(hwc->config);
+
+ if (p4_ht_thread(cpu)) {
+ cccr &= ~P4_CCCR_OVF_PMI_T0;
+ cccr |= P4_CCCR_OVF_PMI_T1;
+ if (escr & P4_ESCR_T0_OS) {
+ escr &= ~P4_ESCR_T0_OS;
+ escr |= P4_ESCR_T1_OS;
+ }
+ if (escr & P4_ESCR_T0_USR) {
+ escr &= ~P4_ESCR_T0_USR;
+ escr |= P4_ESCR_T1_USR;
+ }
+ hwc->config = p4_config_pack_escr(escr);
+ hwc->config |= p4_config_pack_cccr(cccr);
+ hwc->config |= P4_CONFIG_HT;
+ } else {
+ cccr &= ~P4_CCCR_OVF_PMI_T1;
+ cccr |= P4_CCCR_OVF_PMI_T0;
+ if (escr & P4_ESCR_T1_OS) {
+ escr &= ~P4_ESCR_T1_OS;
+ escr |= P4_ESCR_T0_OS;
+ }
+ if (escr & P4_ESCR_T1_USR) {
+ escr &= ~P4_ESCR_T1_USR;
+ escr |= P4_ESCR_T0_USR;
+ }
+ hwc->config = p4_config_pack_escr(escr);
+ hwc->config |= p4_config_pack_cccr(cccr);
+ hwc->config &= ~P4_CONFIG_HT;
+ }
+}
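+
+/*
+ * Illustrative: an event programmed on thread 0 with P4_ESCR_T0_OS and
+ * P4_CCCR_OVF_PMI_T0 that migrates to thread 1 has those bits rewritten
+ * to P4_ESCR_T1_OS and P4_CCCR_OVF_PMI_T1 (and P4_CONFIG_HT set), so
+ * the ESCR counts, and the PMI fires, on the logical cpu the event
+ * actually runs on.
+ */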
+
+/*
+ * ESCR address hashing is tricky: ESCRs are not sequential
+ * in memory, but they all start from MSR_P4_BSU_ESCR0 (0x03a0)
+ * and every ESCR address lies in the range [0x3a0, 0x3e1],
+ * so we end up with an ~70% filled hashtable.
+ */
+
+#define P4_ESCR_MSR_BASE 0x000003a0
+#define P4_ESCR_MSR_MAX 0x000003e1
+#define P4_ESCR_MSR_TABLE_SIZE (P4_ESCR_MSR_MAX - P4_ESCR_MSR_BASE + 1)
+#define P4_ESCR_MSR_IDX(msr) (msr - P4_ESCR_MSR_BASE)
+#define P4_ESCR_MSR_TABLE_ENTRY(msr) [P4_ESCR_MSR_IDX(msr)] = msr
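+
+/*
+ * Example (illustrative): MSR_P4_CRU_ESCR0 is 0x3b8, so
+ * P4_ESCR_MSR_IDX(MSR_P4_CRU_ESCR0) = 0x3b8 - 0x3a0 = 0x18; slot 0x18
+ * of the table below holds that MSR, while unused slots stay zero and
+ * let p4_get_escr_idx() reject addresses that are not real ESCRs.
+ */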
+
+static const unsigned int p4_escr_table[P4_ESCR_MSR_TABLE_SIZE] = {
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR2),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR3),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR4),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR5),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR1),
+};
+
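+/*
+ * Worked example, following from the macros above: MSR_P4_BSU_ESCR0
+ * (0x3a0) maps to index 0 and that table slot holds 0x3a0 itself, so
+ * the lookup below returns 0; an address outside [0x3a0, 0x3e1], or
+ * one whose slot was never populated, trips the WARN_ONCE() and
+ * returns -1.
+ */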
+static int p4_get_escr_idx(unsigned int addr)
+{
+ unsigned int idx = P4_ESCR_MSR_IDX(addr);
+
+ if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE ||
+ !p4_escr_table[idx] ||
+ p4_escr_table[idx] != addr)) {
+ WARN_ONCE(1, "P4 PMU: Wrong address passed: %x\n", addr);
+ return -1;
+ }
+
+ return idx;
+}
+
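+/*
+ * Return the first counter index bound to @thread for this event that
+ * is not yet claimed in @used_mask, or -1 if all candidates are taken.
+ */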
+static int p4_next_cntr(int thread, unsigned long *used_mask,
+ struct p4_event_bind *bind)
+{
+ int i, j;
+
+ for (i = 0; i < P4_CNTR_LIMIT; i++) {
+ j = bind->cntr[thread][i];
+ if (j != -1 && !test_bit(j, used_mask))
+ return j;
+ }
+
+ return -1;
+}
+
+static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
+{
+ unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ unsigned long escr_mask[BITS_TO_LONGS(P4_ESCR_MSR_TABLE_SIZE)];
+ int cpu = smp_processor_id();
+ struct hw_perf_event *hwc;
+ struct p4_event_bind *bind;
+ unsigned int i, thread, num;
+ int cntr_idx, escr_idx;
+ u64 config_alias;
+ int pass;
+
+ bitmap_zero(used_mask, X86_PMC_IDX_MAX);
+ bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE);
+
+ for (i = 0, num = n; i < n; i++, num--) {
+
+ hwc = &cpuc->event_list[i]->hw;
+ thread = p4_ht_thread(cpu);
+ pass = 0;
+
+again:
+ /*
+ * It's possible to hit a circular lock
+ * between original and alternative events
+ * if both are scheduled already.
+ */
+ if (pass > 2)
+ goto done;
+
+ bind = p4_config_get_bind(hwc->config);
+ escr_idx = p4_get_escr_idx(bind->escr_msr[thread]);
+ if (unlikely(escr_idx == -1))
+ goto done;
+
+ if (hwc->idx != -1 && !p4_should_swap_ts(hwc->config, cpu)) {
+ cntr_idx = hwc->idx;
+ if (assign)
+ assign[i] = hwc->idx;
+ goto reserve;
+ }
+
+ cntr_idx = p4_next_cntr(thread, used_mask, bind);
+ if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) {
+ /*
+ * Check whether an event alias is still available.
+ */
+ config_alias = p4_get_alias_event(hwc->config);
+ if (!config_alias)
+ goto done;
+ hwc->config = config_alias;
+ pass++;
+ goto again;
+ }
+ /*
+ * Perf does test runs to see if a whole group can be assigned
+ * together successfully. There can be multiple rounds of this.
+ * Unfortunately, p4_pmu_swap_config_ts touches the hwc->config
+ * bits, such that the next round of group assignments will
+ * cause the above p4_should_swap_ts to pass instead of fail.
+ * This leads to counters exclusive to thread0 being used by
+ * thread1.
+ *
+ * Solve this with a cheap hack, reset the idx back to -1 to
+ * force a new lookup (p4_next_cntr) to get the right counter
+ * for the right thread.
+ *
+ * This probably doesn't comply with the general spirit of how
+ * perf wants to work, but P4 is special. :-(
+ */
+ if (p4_should_swap_ts(hwc->config, cpu))
+ hwc->idx = -1;
+ p4_pmu_swap_config_ts(hwc, cpu);
+ if (assign)
+ assign[i] = cntr_idx;
+reserve:
+ set_bit(cntr_idx, used_mask);
+ set_bit(escr_idx, escr_mask);
+ }
+
+done:
+ return num ? -EINVAL : 0;
+}
+
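+/*
+ * Sysfs format attributes: the 64-bit perf config packs the CCCR into
+ * bits 31:0, the ESCR into bits 62:32 and the HT flag into bit 63,
+ * matching p4_config_pack_escr()/p4_config_pack_cccr() above.
+ */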
+PMU_FORMAT_ATTR(cccr, "config:0-31" );
+PMU_FORMAT_ATTR(escr, "config:32-62");
+PMU_FORMAT_ATTR(ht, "config:63" );
+
+static struct attribute *intel_p4_formats_attr[] = {
+ &format_attr_cccr.attr,
+ &format_attr_escr.attr,
+ &format_attr_ht.attr,
+ NULL,
+};
+
+static __initconst const struct x86_pmu p4_pmu = {
+ .name = "Netburst P4/Xeon",
+ .handle_irq = p4_pmu_handle_irq,
+ .disable_all = p4_pmu_disable_all,
+ .enable_all = p4_pmu_enable_all,
+ .enable = p4_pmu_enable_event,
+ .disable = p4_pmu_disable_event,
+
+ .set_period = p4_pmu_set_period,
+
+ .eventsel = MSR_P4_BPU_CCCR0,
+ .perfctr = MSR_P4_BPU_PERFCTR0,
+ .event_map = p4_pmu_event_map,
+ .max_events = ARRAY_SIZE(p4_general_events),
+ .get_event_constraints = x86_get_event_constraints,
+ /*
+	 * If HT is disabled we may need to use all
+	 * ARCH_P4_MAX_CCCR counters simultaneously;
+	 * for the moment leave it restricted, assuming
+	 * HT is on.
+ */
+ .cntr_mask64 = GENMASK_ULL(ARCH_P4_MAX_CCCR - 1, 0),
+ .apic = 1,
+ .cntval_bits = ARCH_P4_CNTRVAL_BITS,
+ .cntval_mask = ARCH_P4_CNTRVAL_MASK,
+ .max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1,
+ .hw_config = p4_hw_config,
+ .schedule_events = p4_pmu_schedule_events,
+
+ .format_attrs = intel_p4_formats_attr,
+};
+
+__init int p4_pmu_init(void)
+{
+ unsigned int low, high;
+ int i, reg;
+
+	/* If ARCH_P4_MAX_CCCR exceeds the generic counter limit, indexing fails */
+ BUILD_BUG_ON(ARCH_P4_MAX_CCCR > INTEL_PMC_MAX_GENERIC);
+
+ rdmsr(MSR_IA32_MISC_ENABLE, low, high);
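+	/* MSR_IA32_MISC_ENABLE bit 7: Performance Monitoring Available */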
+ if (!(low & (1 << 7))) {
+ pr_cont("unsupported Netburst CPU model %d ",
+ boot_cpu_data.x86_model);
+ return -ENODEV;
+ }
+
+ memcpy(hw_cache_event_ids, p4_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ pr_cont("Netburst events, ");
+
+ x86_pmu = p4_pmu;
+
+ /*
+ * Even though the counters are configured to interrupt a particular
+ * logical processor when an overflow happens, testing has shown that
+	 * on kdump kernels (which use a single CPU), thread1's counter
+ * continues to run and will report an NMI on thread0. Due to the
+ * overflow bug, this leads to a stream of unknown NMIs.
+ *
+	 * Solve this by zeroing out the registers to mimic a reset.
+ */
+ for_each_set_bit(i, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
+ reg = x86_pmu_config_addr(i);
+ wrmsrq_safe(reg, 0ULL);
+ }
+
+ return 0;
+}
diff --git a/arch/x86/events/intel/p6.c b/arch/x86/events/intel/p6.c
new file mode 100644
index 000000000000..6e41de355bd8
--- /dev/null
+++ b/arch/x86/events/intel/p6.c
@@ -0,0 +1,261 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/perf_event.h>
+#include <linux/types.h>
+
+#include <asm/cpu_device_id.h>
+#include <asm/msr.h>
+
+#include "../perf_event.h"
+
+/*
+ * Not sure about some of these
+ */
+static const u64 p6_perfmon_event_map[] =
+{
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, /* CPU_CLK_UNHALTED */
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, /* INST_RETIRED */
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, /* L2_RQSTS:M:E:S:I */
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, /* L2_RQSTS:I */
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, /* BR_INST_RETIRED */
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, /* BR_MISS_PRED_RETIRED */
+ [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, /* BUS_DRDY_CLOCKS */
+ [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00a2, /* RESOURCE_STALLS */
+
+};
+
+static const u64 __initconst p6_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0043, /* DATA_MEM_REFS */
+ [ C(RESULT_MISS) ] = 0x0045, /* DCU_LINES_IN */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0x0f29, /* L2_LD:M:E:S:I */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0080, /* IFU_IFETCH */
+ [ C(RESULT_MISS) ] = 0x0f28, /* L2_IFETCH:M:E:S:I */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0x0025, /* L2_M_LINES_INM */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0043, /* DATA_MEM_REFS */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0080, /* IFU_IFETCH */
+ [ C(RESULT_MISS) ] = 0x0085, /* ITLB_MISS */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED */
+ [ C(RESULT_MISS) ] = 0x00c5, /* BR_MISS_PRED_RETIRED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+static u64 p6_pmu_event_map(int hw_event)
+{
+ return p6_perfmon_event_map[hw_event];
+}
+
+/*
+ * Event setting that is specified not to count anything.
+ * We use this to effectively disable a counter.
+ *
+ * L2_RQSTS with 0 MESI unit mask.
+ */
+#define P6_NOP_EVENT 0x0000002EULL
+
+static struct event_constraint p6_event_constraints[] =
+{
+ INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
+ INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
+ INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
+ INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
+ INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
+ INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
+ EVENT_CONSTRAINT_END
+};
+
+static void p6_pmu_disable_all(void)
+{
+ u64 val;
+
+ /* p6 only has one enable register */
+ rdmsrq(MSR_P6_EVNTSEL0, val);
+ val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
+ wrmsrq(MSR_P6_EVNTSEL0, val);
+}
+
+static void p6_pmu_enable_all(int added)
+{
+	u64 val;
+
+ /* p6 only has one enable register */
+ rdmsrq(MSR_P6_EVNTSEL0, val);
+ val |= ARCH_PERFMON_EVENTSEL_ENABLE;
+ wrmsrq(MSR_P6_EVNTSEL0, val);
+}
+
+static inline void
+p6_pmu_disable_event(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u64 val = P6_NOP_EVENT;
+
+ (void)wrmsrq_safe(hwc->config_base, val);
+}
+
+static void p6_pmu_enable_event(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u64 val;
+
+ val = hwc->config;
+
+ /*
+	 * p6 only has a global event enable, set on PerfEvtSel0.
+	 * We "disable" events by programming P6_NOP_EVENT,
+ * and we rely on p6_pmu_enable_all() being called
+ * to actually enable the events.
+ */
+
+ (void)wrmsrq_safe(hwc->config_base, val);
+}
+
+PMU_FORMAT_ATTR(event, "config:0-7" );
+PMU_FORMAT_ATTR(umask, "config:8-15" );
+PMU_FORMAT_ATTR(edge, "config:18" );
+PMU_FORMAT_ATTR(pc, "config:19" );
+PMU_FORMAT_ATTR(inv, "config:23" );
+PMU_FORMAT_ATTR(cmask, "config:24-31" );
+
+static struct attribute *intel_p6_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_pc.attr,
+ &format_attr_inv.attr,
+ &format_attr_cmask.attr,
+ NULL,
+};
+
+static __initconst const struct x86_pmu p6_pmu = {
+ .name = "p6",
+ .handle_irq = x86_pmu_handle_irq,
+ .disable_all = p6_pmu_disable_all,
+ .enable_all = p6_pmu_enable_all,
+ .enable = p6_pmu_enable_event,
+ .disable = p6_pmu_disable_event,
+ .hw_config = x86_pmu_hw_config,
+ .schedule_events = x86_schedule_events,
+ .eventsel = MSR_P6_EVNTSEL0,
+ .perfctr = MSR_P6_PERFCTR0,
+ .event_map = p6_pmu_event_map,
+ .max_events = ARRAY_SIZE(p6_perfmon_event_map),
+ .apic = 1,
+ .max_period = (1ULL << 31) - 1,
+ .version = 0,
+ .cntr_mask64 = 0x3,
+ /*
+ * Events have 40 bits implemented. However they are designed such
+	 * that bits [32-39] are sign extensions of bit 31. As such, the
+	 * effective width of an event for a P6-like PMU is only 32 bits.
+	 *
+	 * See the IA-32 Intel Architecture Software Developer's Manual, Vol 3B.
+ */
+ .cntval_bits = 32,
+ .cntval_mask = (1ULL << 32) - 1,
+ .get_event_constraints = x86_get_event_constraints,
+ .event_constraints = p6_event_constraints,
+
+ .format_attrs = intel_p6_formats_attr,
+ .events_sysfs_show = intel_event_sysfs_show,
+
+};
+
+static __init void p6_pmu_rdpmc_quirk(void)
+{
+ if (boot_cpu_data.x86_stepping < 9) {
+ /*
+ * PPro erratum 26; fixed in stepping 9 and above.
+ */
+ pr_warn("Userspace RDPMC support disabled due to a CPU erratum\n");
+ x86_pmu.attr_rdpmc_broken = 1;
+ x86_pmu.attr_rdpmc = 0;
+ }
+}
+
+__init int p6_pmu_init(void)
+{
+ x86_pmu = p6_pmu;
+
+ if (boot_cpu_data.x86_vfm == INTEL_PENTIUM_PRO)
+ x86_add_quirk(p6_pmu_rdpmc_quirk);
+
+ memcpy(hw_cache_event_ids, p6_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ return 0;
+}
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
new file mode 100644
index 000000000000..44524a387c58
--- /dev/null
+++ b/arch/x86/events/intel/pt.c
@@ -0,0 +1,1894 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Intel(R) Processor Trace PMU driver for perf
+ * Copyright (c) 2013-2014, Intel Corporation.
+ *
+ * Intel PT is specified in the Intel Architecture Instruction Set Extensions
+ * Programming Reference:
+ * http://software.intel.com/en-us/intel-isa-extensions
+ */
+
+#undef DEBUG
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/bits.h>
+#include <linux/limits.h>
+#include <linux/slab.h>
+#include <linux/device.h>
+#include <linux/kvm_types.h>
+
+#include <asm/cpuid/api.h>
+#include <asm/perf_event.h>
+#include <asm/insn.h>
+#include <asm/io.h>
+#include <asm/intel_pt.h>
+#include <asm/cpu_device_id.h>
+#include <asm/msr.h>
+
+#include "../perf_event.h"
+#include "pt.h"
+
+static DEFINE_PER_CPU(struct pt, pt_ctx);
+
+static struct pt_pmu pt_pmu;
+
+/*
+ * Capabilities of Intel PT hardware, such as number of address bits or
+ * supported output schemes, are cached and exported to userspace as "caps"
+ * attribute group of pt pmu device
+ * (/sys/bus/event_source/devices/intel_pt/caps/) so that userspace can store
+ * relevant bits together with intel_pt traces.
+ *
+ * These are necessary for both trace decoding (payloads_lip, contains address
+ * width encoded in IP-related packets), and event configuration (bitmasks with
+ * permitted values for certain bit fields).
+ */
+#define PT_CAP(_n, _l, _r, _m) \
+ [PT_CAP_ ## _n] = { .name = __stringify(_n), .leaf = _l, \
+ .reg = _r, .mask = _m }
+
+static struct pt_cap_desc {
+ const char *name;
+ u32 leaf;
+ u8 reg;
+ u32 mask;
+} pt_caps[] = {
+ PT_CAP(max_subleaf, 0, CPUID_EAX, 0xffffffff),
+ PT_CAP(cr3_filtering, 0, CPUID_EBX, BIT(0)),
+ PT_CAP(psb_cyc, 0, CPUID_EBX, BIT(1)),
+ PT_CAP(ip_filtering, 0, CPUID_EBX, BIT(2)),
+ PT_CAP(mtc, 0, CPUID_EBX, BIT(3)),
+ PT_CAP(ptwrite, 0, CPUID_EBX, BIT(4)),
+ PT_CAP(power_event_trace, 0, CPUID_EBX, BIT(5)),
+ PT_CAP(event_trace, 0, CPUID_EBX, BIT(7)),
+ PT_CAP(tnt_disable, 0, CPUID_EBX, BIT(8)),
+ PT_CAP(topa_output, 0, CPUID_ECX, BIT(0)),
+ PT_CAP(topa_multiple_entries, 0, CPUID_ECX, BIT(1)),
+ PT_CAP(single_range_output, 0, CPUID_ECX, BIT(2)),
+ PT_CAP(output_subsys, 0, CPUID_ECX, BIT(3)),
+ PT_CAP(payloads_lip, 0, CPUID_ECX, BIT(31)),
+ PT_CAP(num_address_ranges, 1, CPUID_EAX, 0x7),
+ PT_CAP(mtc_periods, 1, CPUID_EAX, 0xffff0000),
+ PT_CAP(cycle_thresholds, 1, CPUID_EBX, 0xffff),
+ PT_CAP(psb_periods, 1, CPUID_EBX, 0xffff0000),
+};
+
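+/*
+ * Example: for PT_CAP_mtc_periods (leaf 1, CPUID_EAX, mask 0xffff0000),
+ * the helper below returns bits 31:16 of CPUID(0x14, 1).EAX, i.e. the
+ * bitmap of supported MTC periods.
+ */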
+u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities capability)
+{
+ struct pt_cap_desc *cd = &pt_caps[capability];
+ u32 c = caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg];
+ unsigned int shift = __ffs(cd->mask);
+
+ return (c & cd->mask) >> shift;
+}
+EXPORT_SYMBOL_FOR_KVM(intel_pt_validate_cap);
+
+u32 intel_pt_validate_hw_cap(enum pt_capabilities cap)
+{
+ return intel_pt_validate_cap(pt_pmu.caps, cap);
+}
+EXPORT_SYMBOL_FOR_KVM(intel_pt_validate_hw_cap);
+
+static ssize_t pt_cap_show(struct device *cdev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct dev_ext_attribute *ea =
+ container_of(attr, struct dev_ext_attribute, attr);
+ enum pt_capabilities cap = (long)ea->var;
+
+ return snprintf(buf, PAGE_SIZE, "%x\n", intel_pt_validate_hw_cap(cap));
+}
+
+static struct attribute_group pt_cap_group __ro_after_init = {
+ .name = "caps",
+};
+
+PMU_FORMAT_ATTR(pt, "config:0" );
+PMU_FORMAT_ATTR(cyc, "config:1" );
+PMU_FORMAT_ATTR(pwr_evt, "config:4" );
+PMU_FORMAT_ATTR(fup_on_ptw, "config:5" );
+PMU_FORMAT_ATTR(mtc, "config:9" );
+PMU_FORMAT_ATTR(tsc, "config:10" );
+PMU_FORMAT_ATTR(noretcomp, "config:11" );
+PMU_FORMAT_ATTR(ptw, "config:12" );
+PMU_FORMAT_ATTR(branch, "config:13" );
+PMU_FORMAT_ATTR(event, "config:31" );
+PMU_FORMAT_ATTR(notnt, "config:55" );
+PMU_FORMAT_ATTR(mtc_period, "config:14-17" );
+PMU_FORMAT_ATTR(cyc_thresh, "config:19-22" );
+PMU_FORMAT_ATTR(psb_period, "config:24-27" );
+
+static struct attribute *pt_formats_attr[] = {
+ &format_attr_pt.attr,
+ &format_attr_cyc.attr,
+ &format_attr_pwr_evt.attr,
+ &format_attr_event.attr,
+ &format_attr_notnt.attr,
+ &format_attr_fup_on_ptw.attr,
+ &format_attr_mtc.attr,
+ &format_attr_tsc.attr,
+ &format_attr_noretcomp.attr,
+ &format_attr_ptw.attr,
+ &format_attr_branch.attr,
+ &format_attr_mtc_period.attr,
+ &format_attr_cyc_thresh.attr,
+ &format_attr_psb_period.attr,
+ NULL,
+};
+
+static struct attribute_group pt_format_group = {
+ .name = "format",
+ .attrs = pt_formats_attr,
+};
+
+static ssize_t
+pt_timing_attr_show(struct device *dev, struct device_attribute *attr,
+ char *page)
+{
+ struct perf_pmu_events_attr *pmu_attr =
+ container_of(attr, struct perf_pmu_events_attr, attr);
+
+ switch (pmu_attr->id) {
+ case 0:
+ return sprintf(page, "%lu\n", pt_pmu.max_nonturbo_ratio);
+ case 1:
+ return sprintf(page, "%u:%u\n",
+ pt_pmu.tsc_art_num,
+ pt_pmu.tsc_art_den);
+ default:
+ break;
+ }
+
+ return -EINVAL;
+}
+
+PMU_EVENT_ATTR(max_nonturbo_ratio, timing_attr_max_nonturbo_ratio, 0,
+ pt_timing_attr_show);
+PMU_EVENT_ATTR(tsc_art_ratio, timing_attr_tsc_art_ratio, 1,
+ pt_timing_attr_show);
+
+static struct attribute *pt_timing_attr[] = {
+ &timing_attr_max_nonturbo_ratio.attr.attr,
+ &timing_attr_tsc_art_ratio.attr.attr,
+ NULL,
+};
+
+static struct attribute_group pt_timing_group = {
+ .attrs = pt_timing_attr,
+};
+
+static const struct attribute_group *pt_attr_groups[] = {
+ &pt_cap_group,
+ &pt_format_group,
+ &pt_timing_group,
+ NULL,
+};
+
+static int __init pt_pmu_hw_init(void)
+{
+ struct dev_ext_attribute *de_attrs;
+ struct attribute **attrs;
+ size_t size;
+ u64 reg;
+ int ret;
+ long i;
+
+ rdmsrq(MSR_PLATFORM_INFO, reg);
+ pt_pmu.max_nonturbo_ratio = (reg & 0xff00) >> 8;
+
+ /*
+	 * If available, read in the TSC to core crystal clock ratio;
+	 * otherwise, a zero numerator stands for "not enumerated",
+	 * as per the SDM.
+ */
+ if (boot_cpu_data.cpuid_level >= CPUID_LEAF_TSC) {
+ u32 eax, ebx, ecx, edx;
+
+ cpuid(CPUID_LEAF_TSC, &eax, &ebx, &ecx, &edx);
+
+ pt_pmu.tsc_art_num = ebx;
+ pt_pmu.tsc_art_den = eax;
+ }
+
+ /* model-specific quirks */
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_BROADWELL:
+ case INTEL_BROADWELL_D:
+ case INTEL_BROADWELL_G:
+ case INTEL_BROADWELL_X:
+ /* not setting BRANCH_EN will #GP, erratum BDM106 */
+ pt_pmu.branch_en_always_on = true;
+ break;
+ default:
+ break;
+ }
+
+ if (boot_cpu_has(X86_FEATURE_VMX)) {
+ /*
+ * Intel SDM, 36.5 "Tracing post-VMXON" says that
+ * "IA32_VMX_MISC[bit 14]" being 1 means PT can trace
+ * post-VMXON.
+ */
+ rdmsrq(MSR_IA32_VMX_MISC, reg);
+ if (reg & BIT(14))
+ pt_pmu.vmx = true;
+ }
+
+ for (i = 0; i < PT_CPUID_LEAVES; i++) {
+ cpuid_count(20, i,
+ &pt_pmu.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM],
+ &pt_pmu.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM],
+ &pt_pmu.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM],
+ &pt_pmu.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM]);
+ }
+
+ ret = -ENOMEM;
+ size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps)+1);
+ attrs = kzalloc(size, GFP_KERNEL);
+ if (!attrs)
+ goto fail;
+
+ size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps)+1);
+ de_attrs = kzalloc(size, GFP_KERNEL);
+ if (!de_attrs)
+ goto fail;
+
+ for (i = 0; i < ARRAY_SIZE(pt_caps); i++) {
+ struct dev_ext_attribute *de_attr = de_attrs + i;
+
+ de_attr->attr.attr.name = pt_caps[i].name;
+
+ sysfs_attr_init(&de_attr->attr.attr);
+
+ de_attr->attr.attr.mode = S_IRUGO;
+ de_attr->attr.show = pt_cap_show;
+ de_attr->var = (void *)i;
+
+ attrs[i] = &de_attr->attr.attr;
+ }
+
+ pt_cap_group.attrs = attrs;
+
+ return 0;
+
+fail:
+ kfree(attrs);
+
+ return ret;
+}
+
+#define RTIT_CTL_CYC_PSB (RTIT_CTL_CYCLEACC | \
+ RTIT_CTL_CYC_THRESH | \
+ RTIT_CTL_PSB_FREQ)
+
+#define RTIT_CTL_MTC (RTIT_CTL_MTC_EN | \
+ RTIT_CTL_MTC_RANGE)
+
+#define RTIT_CTL_PTW (RTIT_CTL_PTW_EN | \
+ RTIT_CTL_FUP_ON_PTW)
+
+/*
+ * Bit 0 (TraceEn) in the attr.config is meaningless as the
+ * corresponding bit in the RTIT_CTL can only be controlled
+ * by the driver; therefore, repurpose it to mean: pass
+ * through the bit that was previously assumed to be always
+ * on for PT, thereby allowing the user to *not* set it if
+ * they so wish. See also pt_event_valid() and pt_config().
+ */
+#define RTIT_CTL_PASSTHROUGH RTIT_CTL_TRACEEN
+
+#define PT_CONFIG_MASK (RTIT_CTL_TRACEEN | \
+ RTIT_CTL_TSC_EN | \
+ RTIT_CTL_DISRETC | \
+ RTIT_CTL_BRANCH_EN | \
+ RTIT_CTL_CYC_PSB | \
+ RTIT_CTL_MTC | \
+ RTIT_CTL_PWR_EVT_EN | \
+ RTIT_CTL_EVENT_EN | \
+ RTIT_CTL_NOTNT | \
+ RTIT_CTL_FUP_ON_PTW | \
+ RTIT_CTL_PTW_EN)
+
+static bool pt_event_valid(struct perf_event *event)
+{
+ u64 config = event->attr.config;
+ u64 allowed, requested;
+
+ if ((config & PT_CONFIG_MASK) != config)
+ return false;
+
+ if (config & RTIT_CTL_CYC_PSB) {
+ if (!intel_pt_validate_hw_cap(PT_CAP_psb_cyc))
+ return false;
+
+ allowed = intel_pt_validate_hw_cap(PT_CAP_psb_periods);
+ requested = (config & RTIT_CTL_PSB_FREQ) >>
+ RTIT_CTL_PSB_FREQ_OFFSET;
+ if (requested && (!(allowed & BIT(requested))))
+ return false;
+
+ allowed = intel_pt_validate_hw_cap(PT_CAP_cycle_thresholds);
+ requested = (config & RTIT_CTL_CYC_THRESH) >>
+ RTIT_CTL_CYC_THRESH_OFFSET;
+ if (requested && (!(allowed & BIT(requested))))
+ return false;
+ }
+
+ if (config & RTIT_CTL_MTC) {
+ /*
+ * In the unlikely case that CPUID lists valid mtc periods,
+ * but not the mtc capability, drop out here.
+ *
+ * Spec says that setting mtc period bits while mtc bit in
+ * CPUID is 0 will #GP, so better safe than sorry.
+ */
+ if (!intel_pt_validate_hw_cap(PT_CAP_mtc))
+ return false;
+
+ allowed = intel_pt_validate_hw_cap(PT_CAP_mtc_periods);
+ if (!allowed)
+ return false;
+
+ requested = (config & RTIT_CTL_MTC_RANGE) >>
+ RTIT_CTL_MTC_RANGE_OFFSET;
+
+ if (!(allowed & BIT(requested)))
+ return false;
+ }
+
+ if (config & RTIT_CTL_PWR_EVT_EN &&
+ !intel_pt_validate_hw_cap(PT_CAP_power_event_trace))
+ return false;
+
+ if (config & RTIT_CTL_EVENT_EN &&
+ !intel_pt_validate_hw_cap(PT_CAP_event_trace))
+ return false;
+
+ if (config & RTIT_CTL_NOTNT &&
+ !intel_pt_validate_hw_cap(PT_CAP_tnt_disable))
+ return false;
+
+ if (config & RTIT_CTL_PTW) {
+ if (!intel_pt_validate_hw_cap(PT_CAP_ptwrite))
+ return false;
+
+ /* FUPonPTW without PTW doesn't make sense */
+ if ((config & RTIT_CTL_FUP_ON_PTW) &&
+ !(config & RTIT_CTL_PTW_EN))
+ return false;
+ }
+
+ /*
+ * Setting bit 0 (TraceEn in RTIT_CTL MSR) in the attr.config
+ * clears the assumption that BranchEn must always be enabled,
+ * as was the case with the first implementation of PT.
+ * If this bit is not set, the legacy behavior is preserved
+ * for compatibility with the older userspace.
+ *
+ * Re-using bit 0 for this purpose is fine because it is never
+ * directly set by the user; previous attempts at setting it in
+ * the attr.config resulted in -EINVAL.
+ */
+ if (config & RTIT_CTL_PASSTHROUGH) {
+ /*
+ * Disallow not setting BRANCH_EN where BRANCH_EN is
+ * always required.
+ */
+ if (pt_pmu.branch_en_always_on &&
+ !(config & RTIT_CTL_BRANCH_EN))
+ return false;
+ } else {
+ /*
+ * Disallow BRANCH_EN without the PASSTHROUGH.
+ */
+ if (config & RTIT_CTL_BRANCH_EN)
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * PT configuration helpers
+ * These all are cpu affine and operate on a local PT
+ */
+
+static void pt_config_start(struct perf_event *event)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ u64 ctl = event->hw.aux_config;
+
+ if (READ_ONCE(event->hw.aux_paused))
+ return;
+
+ ctl |= RTIT_CTL_TRACEEN;
+ if (READ_ONCE(pt->vmx_on))
+ perf_aux_output_flag(&pt->handle, PERF_AUX_FLAG_PARTIAL);
+ else
+ wrmsrq(MSR_IA32_RTIT_CTL, ctl);
+
+ WRITE_ONCE(event->hw.aux_config, ctl);
+}
+
+/* Address ranges and their corresponding msr configuration registers */
+static const struct pt_address_range {
+ unsigned long msr_a;
+ unsigned long msr_b;
+ unsigned int reg_off;
+} pt_address_ranges[] = {
+ {
+ .msr_a = MSR_IA32_RTIT_ADDR0_A,
+ .msr_b = MSR_IA32_RTIT_ADDR0_B,
+ .reg_off = RTIT_CTL_ADDR0_OFFSET,
+ },
+ {
+ .msr_a = MSR_IA32_RTIT_ADDR1_A,
+ .msr_b = MSR_IA32_RTIT_ADDR1_B,
+ .reg_off = RTIT_CTL_ADDR1_OFFSET,
+ },
+ {
+ .msr_a = MSR_IA32_RTIT_ADDR2_A,
+ .msr_b = MSR_IA32_RTIT_ADDR2_B,
+ .reg_off = RTIT_CTL_ADDR2_OFFSET,
+ },
+ {
+ .msr_a = MSR_IA32_RTIT_ADDR3_A,
+ .msr_b = MSR_IA32_RTIT_ADDR3_B,
+ .reg_off = RTIT_CTL_ADDR3_OFFSET,
+ }
+};
+
+static u64 pt_config_filters(struct perf_event *event)
+{
+ struct pt_filters *filters = event->hw.addr_filters;
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ unsigned int range = 0;
+ u64 rtit_ctl = 0;
+
+ if (!filters)
+ return 0;
+
+ perf_event_addr_filters_sync(event);
+
+ for (range = 0; range < filters->nr_filters; range++) {
+ struct pt_filter *filter = &filters->filter[range];
+
+ /*
+ * Note, if the range has zero start/end addresses due
+ * to its dynamic object not being loaded yet, we just
+		 * go ahead and program a zeroed range, which will simply
+ * produce no data. Note^2: if executable code at 0x0
+ * is a concern, we can set up an "invalid" configuration
+ * such as msr_b < msr_a.
+ */
+
+ /* avoid redundant msr writes */
+ if (pt->filters.filter[range].msr_a != filter->msr_a) {
+ wrmsrq(pt_address_ranges[range].msr_a, filter->msr_a);
+ pt->filters.filter[range].msr_a = filter->msr_a;
+ }
+
+ if (pt->filters.filter[range].msr_b != filter->msr_b) {
+ wrmsrq(pt_address_ranges[range].msr_b, filter->msr_b);
+ pt->filters.filter[range].msr_b = filter->msr_b;
+ }
+
+ rtit_ctl |= (u64)filter->config << pt_address_ranges[range].reg_off;
+ }
+
+ return rtit_ctl;
+}
+
+static void pt_config(struct perf_event *event)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ struct pt_buffer *buf = perf_get_aux(&pt->handle);
+ u64 reg;
+
+ /* First round: clear STATUS, in particular the PSB byte counter. */
+ if (!event->hw.aux_config) {
+ perf_event_itrace_started(event);
+ wrmsrq(MSR_IA32_RTIT_STATUS, 0);
+ }
+
+ reg = pt_config_filters(event);
+ reg |= RTIT_CTL_TRACEEN;
+ if (!buf->single)
+ reg |= RTIT_CTL_TOPA;
+
+ /*
+ * Previously, we had BRANCH_EN on by default, but now that PT has
+ * grown features outside of branch tracing, it is useful to allow
+ * the user to disable it. Setting bit 0 in the event's attr.config
+ * allows BRANCH_EN to pass through instead of being always on. See
+ * also the comment in pt_event_valid().
+ */
+ if (event->attr.config & BIT(0)) {
+ reg |= event->attr.config & RTIT_CTL_BRANCH_EN;
+ } else {
+ reg |= RTIT_CTL_BRANCH_EN;
+ }
+
+ if (!event->attr.exclude_kernel)
+ reg |= RTIT_CTL_OS;
+ if (!event->attr.exclude_user)
+ reg |= RTIT_CTL_USR;
+
+ reg |= (event->attr.config & PT_CONFIG_MASK);
+
+ event->hw.aux_config = reg;
+
+ /*
+ * Allow resume before starting so as not to overwrite a value set by a
+ * PMI.
+ */
+ barrier();
+ WRITE_ONCE(pt->resume_allowed, 1);
+ /* Configuration is complete, it is now OK to handle an NMI */
+ barrier();
+ WRITE_ONCE(pt->handle_nmi, 1);
+ barrier();
+ pt_config_start(event);
+ barrier();
+ /*
+ * Allow pause after starting so its pt_config_stop() doesn't race with
+ * pt_config_start().
+ */
+ WRITE_ONCE(pt->pause_allowed, 1);
+}
+
+static void pt_config_stop(struct perf_event *event)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ u64 ctl = READ_ONCE(event->hw.aux_config);
+
+ /* may be already stopped by a PMI */
+ if (!(ctl & RTIT_CTL_TRACEEN))
+ return;
+
+ ctl &= ~RTIT_CTL_TRACEEN;
+ if (!READ_ONCE(pt->vmx_on))
+ wrmsrq(MSR_IA32_RTIT_CTL, ctl);
+
+ WRITE_ONCE(event->hw.aux_config, ctl);
+
+ /*
+ * A wrmsr that disables trace generation serializes other PT
+ * registers and causes all data packets to be written to memory,
+ * but a fence is required for the data to become globally visible.
+ *
+ * The below WMB, separating data store and aux_head store matches
+ * the consumer's RMB that separates aux_head load and data load.
+ */
+ wmb();
+}
+
+/**
+ * struct topa - ToPA metadata
+ * @list: linkage to struct pt_buffer's list of tables
+ * @offset: offset of the first entry in this table in the buffer
+ * @size: total size of all entries in this table
+ * @last: index of the last initialized entry in this table
+ * @z_count: how many times the first entry repeats
+ */
+struct topa {
+ struct list_head list;
+ u64 offset;
+ size_t size;
+ int last;
+ unsigned int z_count;
+};
+
+/*
+ * Keep ToPA table-related metadata on the same page as the actual table,
+ * taking up a few words from the top
+ */
+
+#define TENTS_PER_PAGE \
+ ((PAGE_SIZE - sizeof(struct topa)) / sizeof(struct topa_entry))
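+/* On 64-bit, with 8-byte entries and a ~40-byte struct topa, this is ~507. */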
+
+/**
+ * struct topa_page - page-sized ToPA table with metadata at the top
+ * @table: actual ToPA table entries, as understood by PT hardware
+ * @topa: metadata
+ */
+struct topa_page {
+ struct topa_entry table[TENTS_PER_PAGE];
+ struct topa topa;
+};
+
+static inline struct topa_page *topa_to_page(struct topa *topa)
+{
+ return container_of(topa, struct topa_page, topa);
+}
+
+static inline struct topa_page *topa_entry_to_page(struct topa_entry *te)
+{
+ return (struct topa_page *)((unsigned long)te & PAGE_MASK);
+}
+
+static inline phys_addr_t topa_pfn(struct topa *topa)
+{
+ return PFN_DOWN(virt_to_phys(topa_to_page(topa)));
+}
+
+/* make -1 stand for the last table entry */
+#define TOPA_ENTRY(t, i) \
+ ((i) == -1 \
+ ? &topa_to_page(t)->table[(t)->last] \
+ : &topa_to_page(t)->table[(i)])
+#define TOPA_ENTRY_SIZE(t, i) (sizes(TOPA_ENTRY((t), (i))->size))
+#define TOPA_ENTRY_PAGES(t, i) (1 << TOPA_ENTRY((t), (i))->size)
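+/*
+ * The entry's size field is the log2 of the region size in pages:
+ * size == 0 is a single 4 KiB page, size == 2 is four pages, etc.;
+ * TOPA_ENTRY_PAGES() yields pages and sizes() converts to bytes.
+ */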
+
+static void pt_config_buffer(struct pt_buffer *buf)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ u64 reg, mask;
+ void *base;
+
+ if (buf->single) {
+ base = buf->data_pages[0];
+ mask = (buf->nr_pages * PAGE_SIZE - 1) >> 7;
+ } else {
+ base = topa_to_page(buf->cur)->table;
+ mask = (u64)buf->cur_idx;
+ }
+
+ reg = virt_to_phys(base);
+ if (pt->output_base != reg) {
+ pt->output_base = reg;
+ wrmsrq(MSR_IA32_RTIT_OUTPUT_BASE, reg);
+ }
+
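+	/*
+	 * RTIT_OUTPUT_MASK layout: bits 6:0 are hardwired to 1, bits 31:7
+	 * hold the single-range mask or the current ToPA table offset, and
+	 * bits 63:32 hold the offset within the current output region.
+	 */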
+ reg = 0x7f | (mask << 7) | ((u64)buf->output_off << 32);
+ if (pt->output_mask != reg) {
+ pt->output_mask = reg;
+ wrmsrq(MSR_IA32_RTIT_OUTPUT_MASK, reg);
+ }
+}
+
+/**
+ * topa_alloc() - allocate page-sized ToPA table
+ * @cpu: CPU on which to allocate.
+ * @gfp: Allocation flags.
+ *
+ * Return: On success, return the pointer to ToPA table page.
+ */
+static struct topa *topa_alloc(int cpu, gfp_t gfp)
+{
+ int node = cpu_to_node(cpu);
+ struct topa_page *tp;
+ struct page *p;
+
+ p = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
+ if (!p)
+ return NULL;
+
+ tp = page_address(p);
+ tp->topa.last = 0;
+
+ /*
+	 * In case of single-entry ToPA, always put the self-referencing END
+ * link as the 2nd entry in the table
+ */
+ if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) {
+ TOPA_ENTRY(&tp->topa, 1)->base = page_to_phys(p) >> TOPA_SHIFT;
+ TOPA_ENTRY(&tp->topa, 1)->end = 1;
+ }
+
+ return &tp->topa;
+}
+
+/**
+ * topa_free() - free a page-sized ToPA table
+ * @topa: Table to deallocate.
+ */
+static void topa_free(struct topa *topa)
+{
+ free_page((unsigned long)topa);
+}
+
+/**
+ * topa_insert_table() - insert a ToPA table into a buffer
+ * @buf: PT buffer that's being extended.
+ * @topa: New topa table to be inserted.
+ *
+ * If it's the first table in this buffer, set up the buffer's pointers
+ * accordingly; otherwise, add an END=1 link entry pointing to @topa in the
+ * current "last" table and adjust the last table pointer to @topa.
+ */
+static void topa_insert_table(struct pt_buffer *buf, struct topa *topa)
+{
+ struct topa *last = buf->last;
+
+ list_add_tail(&topa->list, &buf->tables);
+
+ if (!buf->first) {
+ buf->first = buf->last = buf->cur = topa;
+ return;
+ }
+
+ topa->offset = last->offset + last->size;
+ buf->last = topa;
+
+ if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries))
+ return;
+
+ BUG_ON(last->last != TENTS_PER_PAGE - 1);
+
+ TOPA_ENTRY(last, -1)->base = topa_pfn(topa);
+ TOPA_ENTRY(last, -1)->end = 1;
+}
+
+/**
+ * topa_table_full() - check if a ToPA table is filled up
+ * @topa: ToPA table.
+ */
+static bool topa_table_full(struct topa *topa)
+{
+ /* single-entry ToPA is a special case */
+ if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries))
+ return !!topa->last;
+
+ return topa->last == TENTS_PER_PAGE - 1;
+}
+
+/**
+ * topa_insert_pages() - create a list of ToPA tables
+ * @buf: PT buffer being initialized.
+ * @cpu: CPU on which to allocate.
+ * @gfp: Allocation flags.
+ *
+ * This initializes a list of ToPA tables with entries from
+ * the data_pages provided by rb_alloc_aux().
+ *
+ * Return: 0 on success or error code.
+ */
+static int topa_insert_pages(struct pt_buffer *buf, int cpu, gfp_t gfp)
+{
+ struct topa *topa = buf->last;
+ int order = 0;
+ struct page *p;
+
+ p = virt_to_page(buf->data_pages[buf->nr_pages]);
+ if (PagePrivate(p))
+ order = page_private(p);
+
+ if (topa_table_full(topa)) {
+ topa = topa_alloc(cpu, gfp);
+ if (!topa)
+ return -ENOMEM;
+
+ topa_insert_table(buf, topa);
+ }
+
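+	/*
+	 * z_count tracks how many times the first entry's size repeats at
+	 * the start of the table, so that pt_topa_entry_for_page() can do
+	 * a fast arithmetic lookup instead of walking those entries.
+	 */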
+ if (topa->z_count == topa->last - 1) {
+ if (order == TOPA_ENTRY(topa, topa->last - 1)->size)
+ topa->z_count++;
+ }
+
+ TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT;
+ TOPA_ENTRY(topa, -1)->size = order;
+ if (!buf->snapshot &&
+ !intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) {
+ TOPA_ENTRY(topa, -1)->intr = 1;
+ TOPA_ENTRY(topa, -1)->stop = 1;
+ }
+
+ topa->last++;
+ topa->size += sizes(order);
+
+ buf->nr_pages += 1ul << order;
+
+ return 0;
+}
+
+/**
+ * pt_topa_dump() - print ToPA tables and their entries
+ * @buf: PT buffer.
+ */
+static void pt_topa_dump(struct pt_buffer *buf)
+{
+ struct topa *topa;
+
+ list_for_each_entry(topa, &buf->tables, list) {
+ struct topa_page *tp = topa_to_page(topa);
+ int i;
+
+ pr_debug("# table @%p, off %llx size %zx\n", tp->table,
+ topa->offset, topa->size);
+ for (i = 0; i < TENTS_PER_PAGE; i++) {
+ pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n",
+ &tp->table[i],
+ (unsigned long)tp->table[i].base << TOPA_SHIFT,
+ sizes(tp->table[i].size),
+ tp->table[i].end ? 'E' : ' ',
+ tp->table[i].intr ? 'I' : ' ',
+ tp->table[i].stop ? 'S' : ' ',
+ *(u64 *)&tp->table[i]);
+ if ((intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) &&
+ tp->table[i].stop) ||
+ tp->table[i].end)
+ break;
+ if (!i && topa->z_count)
+ i += topa->z_count;
+ }
+ }
+}
+
+/**
+ * pt_buffer_advance() - advance to the next output region
+ * @buf: PT buffer.
+ *
+ * Advance the current pointers in the buffer to the next ToPA entry.
+ */
+static void pt_buffer_advance(struct pt_buffer *buf)
+{
+ buf->output_off = 0;
+ buf->cur_idx++;
+
+ if (buf->cur_idx == buf->cur->last) {
+ if (buf->cur == buf->last) {
+ buf->cur = buf->first;
+ buf->wrapped = true;
+ } else {
+ buf->cur = list_entry(buf->cur->list.next, struct topa,
+ list);
+ }
+ buf->cur_idx = 0;
+ }
+}
+
+/**
+ * pt_update_head() - calculate current offsets and sizes
+ * @pt: Per-cpu pt context.
+ *
+ * Update buffer's current write pointer position and data size.
+ */
+static void pt_update_head(struct pt *pt)
+{
+ struct pt_buffer *buf = perf_get_aux(&pt->handle);
+ bool wrapped = buf->wrapped;
+ u64 topa_idx, base, old;
+
+ buf->wrapped = false;
+
+ if (buf->single) {
+ local_set(&buf->data_size, buf->output_off);
+ return;
+ }
+
+ /* offset of the first region in this table from the beginning of buf */
+ base = buf->cur->offset + buf->output_off;
+
+ /* offset of the current output region within this table */
+ for (topa_idx = 0; topa_idx < buf->cur_idx; topa_idx++)
+ base += TOPA_ENTRY_SIZE(buf->cur, topa_idx);
+
+ if (buf->snapshot) {
+ local_set(&buf->data_size, base);
+ } else {
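+		/*
+		 * The new head may be behind the old one if the trace has
+		 * wrapped; if so, unwrap it by one buffer length so that
+		 * data_size keeps growing monotonically.
+		 */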
+ old = (local64_xchg(&buf->head, base) &
+ ((buf->nr_pages << PAGE_SHIFT) - 1));
+ if (base < old || (base == old && wrapped))
+ base += buf->nr_pages << PAGE_SHIFT;
+
+ local_add(base - old, &buf->data_size);
+ }
+}
+
+/**
+ * pt_buffer_region() - obtain current output region's address
+ * @buf: PT buffer.
+ */
+static void *pt_buffer_region(struct pt_buffer *buf)
+{
+ return phys_to_virt((phys_addr_t)TOPA_ENTRY(buf->cur, buf->cur_idx)->base << TOPA_SHIFT);
+}
+
+/**
+ * pt_buffer_region_size() - obtain current output region's size
+ * @buf: PT buffer.
+ */
+static size_t pt_buffer_region_size(struct pt_buffer *buf)
+{
+ return TOPA_ENTRY_SIZE(buf->cur, buf->cur_idx);
+}
+
+/**
+ * pt_handle_status() - take care of possible status conditions
+ * @pt: Per-cpu pt context.
+ */
+static void pt_handle_status(struct pt *pt)
+{
+ struct pt_buffer *buf = perf_get_aux(&pt->handle);
+ int advance = 0;
+ u64 status;
+
+ rdmsrq(MSR_IA32_RTIT_STATUS, status);
+
+ if (status & RTIT_STATUS_ERROR) {
+ pr_err_ratelimited("ToPA ERROR encountered, trying to recover\n");
+ pt_topa_dump(buf);
+ status &= ~RTIT_STATUS_ERROR;
+ }
+
+ if (status & RTIT_STATUS_STOPPED) {
+ status &= ~RTIT_STATUS_STOPPED;
+
+ /*
+ * On systems that only do single-entry ToPA, hitting STOP
+ * means we are already losing data; need to let the decoder
+ * know.
+ */
+ if (!buf->single &&
+ (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) ||
+ buf->output_off == pt_buffer_region_size(buf))) {
+ perf_aux_output_flag(&pt->handle,
+ PERF_AUX_FLAG_TRUNCATED);
+ advance++;
+ }
+ }
+
+ /*
+	 * Also on single-entry ToPA implementations, the interrupt will come
+ * before the output reaches its output region's boundary.
+ */
+ if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) &&
+ !buf->snapshot &&
+ pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) {
+ void *head = pt_buffer_region(buf);
+
+ /* everything within this margin needs to be zeroed out */
+ memset(head + buf->output_off, 0,
+ pt_buffer_region_size(buf) -
+ buf->output_off);
+ advance++;
+ }
+
+ if (advance)
+ pt_buffer_advance(buf);
+
+ wrmsrq(MSR_IA32_RTIT_STATUS, status);
+}
+
+/**
+ * pt_read_offset() - translate registers into buffer pointers
+ * @buf: PT buffer.
+ *
+ * Set buffer's output pointers from MSR values.
+ */
+static void pt_read_offset(struct pt_buffer *buf)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ struct topa_page *tp;
+
+ if (!buf->single) {
+ rdmsrq(MSR_IA32_RTIT_OUTPUT_BASE, pt->output_base);
+ tp = phys_to_virt(pt->output_base);
+ buf->cur = &tp->topa;
+ }
+
+ rdmsrq(MSR_IA32_RTIT_OUTPUT_MASK, pt->output_mask);
+ /* offset within current output region */
+ buf->output_off = pt->output_mask >> 32;
+ /* index of current output region within this table */
+ if (!buf->single)
+ buf->cur_idx = (pt->output_mask & 0xffffff80) >> 7;
+}
+
+static struct topa_entry *
+pt_topa_entry_for_page(struct pt_buffer *buf, unsigned int pg)
+{
+ struct topa_page *tp;
+ struct topa *topa;
+ unsigned int idx, cur_pg = 0, z_pg = 0, start_idx = 0;
+
+ /*
+ * Indicates a bug in the caller.
+ */
+ if (WARN_ON_ONCE(pg >= buf->nr_pages))
+ return NULL;
+
+ /*
+ * First, find the ToPA table where @pg fits. With high
+ * order allocations, there shouldn't be many of these.
+ */
+ list_for_each_entry(topa, &buf->tables, list) {
+ if (topa->offset + topa->size > (unsigned long)pg << PAGE_SHIFT)
+ goto found;
+ }
+
+ /*
+ * Hitting this means we have a problem in the ToPA
+ * allocation code.
+ */
+ WARN_ON_ONCE(1);
+
+ return NULL;
+
+found:
+ /*
+ * Indicates a problem in the ToPA allocation code.
+ */
+ if (WARN_ON_ONCE(topa->last == -1))
+ return NULL;
+
+ tp = topa_to_page(topa);
+ cur_pg = PFN_DOWN(topa->offset);
+ if (topa->z_count) {
+ z_pg = TOPA_ENTRY_PAGES(topa, 0) * (topa->z_count + 1);
+ start_idx = topa->z_count + 1;
+ }
+
+ /*
+ * Multiple entries at the beginning of the table have the same size,
+ * ideally all of them; if @pg falls there, the search is done.
+ */
+ if (pg >= cur_pg && pg < cur_pg + z_pg) {
+ idx = (pg - cur_pg) / TOPA_ENTRY_PAGES(topa, 0);
+ return &tp->table[idx];
+ }
+
+ /*
+ * Otherwise, slow path: iterate through the remaining entries.
+ */
+ for (idx = start_idx, cur_pg += z_pg; idx < topa->last; idx++) {
+ if (cur_pg + TOPA_ENTRY_PAGES(topa, idx) > pg)
+ return &tp->table[idx];
+
+ cur_pg += TOPA_ENTRY_PAGES(topa, idx);
+ }
+
+ /*
+	 * This means we couldn't find a matching ToPA entry in the table.
+ */
+ WARN_ON_ONCE(1);
+
+ return NULL;
+}
+
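+/*
+ * Return the ToPA entry preceding @te, wrapping from a table's first
+ * entry to the last entry of the previous table (and from the first
+ * table to the last).
+ */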
+static struct topa_entry *
+pt_topa_prev_entry(struct pt_buffer *buf, struct topa_entry *te)
+{
+ unsigned long table = (unsigned long)te & ~(PAGE_SIZE - 1);
+ struct topa_page *tp;
+ struct topa *topa;
+
+ tp = (struct topa_page *)table;
+ if (tp->table != te)
+ return --te;
+
+ topa = &tp->topa;
+ if (topa == buf->first)
+ topa = buf->last;
+ else
+ topa = list_prev_entry(topa, list);
+
+ tp = topa_to_page(topa);
+
+ return &tp->table[topa->last - 1];
+}
+
+/**
+ * pt_buffer_reset_markers() - place interrupt and stop bits in the buffer
+ * @buf: PT buffer.
+ * @handle: Current output handle.
+ *
+ * Place INT and STOP marks to prevent overwriting old data that the consumer
+ * hasn't yet collected, and to wake up the consumer after a certain fraction
+ * of the buffer has filled up. Only needed and sensible for non-snapshot
+ * counters.
+ *
+ * This obviously relies on buf::head to figure out buffer markers, so it has
+ * to be called after pt_buffer_reset_offsets() and before the hardware tracing
+ * is enabled.
+ */
+static int pt_buffer_reset_markers(struct pt_buffer *buf,
+ struct perf_output_handle *handle)
+{
+ unsigned long head = local64_read(&buf->head);
+ unsigned long idx, npages, wakeup;
+
+ if (buf->single)
+ return 0;
+
+ /* can't stop in the middle of an output region */
+ if (buf->output_off + handle->size + 1 < pt_buffer_region_size(buf)) {
+ perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
+ return -EINVAL;
+ }
+
+ /* single entry ToPA is handled by marking all regions STOP=1 INT=1 */
+ if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries))
+ return 0;
+
+ /* clear STOP and INT from current entry */
+ if (buf->stop_te) {
+ buf->stop_te->stop = 0;
+ buf->stop_te->intr = 0;
+ }
+
+ if (buf->intr_te)
+ buf->intr_te->intr = 0;
+
+ /* how many pages till the STOP marker */
+ npages = handle->size >> PAGE_SHIFT;
+
+ /* if it's on a page boundary, fill up one more page */
+ if (!offset_in_page(head + handle->size + 1))
+ npages++;
+
+ idx = (head >> PAGE_SHIFT) + npages;
+ idx &= buf->nr_pages - 1;
+
+ if (idx != buf->stop_pos) {
+ buf->stop_pos = idx;
+ buf->stop_te = pt_topa_entry_for_page(buf, idx);
+ buf->stop_te = pt_topa_prev_entry(buf, buf->stop_te);
+ }
+
+ wakeup = handle->wakeup >> PAGE_SHIFT;
+
+ /* in the worst case, wake up the consumer one page before hard stop */
+ idx = (head >> PAGE_SHIFT) + npages - 1;
+ if (idx > wakeup)
+ idx = wakeup;
+
+ idx &= buf->nr_pages - 1;
+ if (idx != buf->intr_pos) {
+ buf->intr_pos = idx;
+ buf->intr_te = pt_topa_entry_for_page(buf, idx);
+ buf->intr_te = pt_topa_prev_entry(buf, buf->intr_te);
+ }
+
+ buf->stop_te->stop = 1;
+ buf->stop_te->intr = 1;
+ buf->intr_te->intr = 1;
+
+ return 0;
+}
+
+/**
+ * pt_buffer_reset_offsets() - adjust buffer's write pointers from aux_head
+ * @buf: PT buffer.
+ * @head: Write pointer (aux_head) from AUX buffer.
+ *
+ * Find the ToPA table and entry corresponding to given @head and set buffer's
+ * "current" pointers accordingly. This is done after we have obtained the
+ * current aux_head position from a successful call to perf_aux_output_begin()
+ * to make sure the hardware is writing to the right place.
+ *
+ * This function modifies buf::{cur,cur_idx,output_off} that will be programmed
+ * into PT msrs when the tracing is enabled and buf::head and buf::data_size,
+ * which are used to determine INT and STOP markers' locations by a subsequent
+ * call to pt_buffer_reset_markers().
+ */
+static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head)
+{
+ struct topa_page *cur_tp;
+ struct topa_entry *te;
+ int pg;
+
+ if (buf->snapshot)
+ head &= (buf->nr_pages << PAGE_SHIFT) - 1;
+
+ if (!buf->single) {
+ pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1);
+ te = pt_topa_entry_for_page(buf, pg);
+
+ cur_tp = topa_entry_to_page(te);
+ buf->cur = &cur_tp->topa;
+ buf->cur_idx = te - TOPA_ENTRY(buf->cur, 0);
+ buf->output_off = head & (pt_buffer_region_size(buf) - 1);
+ } else {
+ buf->output_off = head;
+ }
+
+ local64_set(&buf->head, head);
+ local_set(&buf->data_size, 0);
+}
+
+/**
+ * pt_buffer_fini_topa() - deallocate ToPA structure of a buffer
+ * @buf: PT buffer.
+ */
+static void pt_buffer_fini_topa(struct pt_buffer *buf)
+{
+ struct topa *topa, *iter;
+
+ if (buf->single)
+ return;
+
+ list_for_each_entry_safe(topa, iter, &buf->tables, list) {
+ /*
+ * right now, this is in free_aux() path only, so
+ * no need to unlink this table from the list
+ */
+ topa_free(topa);
+ }
+}
+
+/**
+ * pt_buffer_init_topa() - initialize ToPA table for pt buffer
+ * @buf: PT buffer.
+ * @cpu: CPU on which to allocate.
+ * @nr_pages: No. of pages to allocate.
+ * @gfp: Allocation flags.
+ *
+ * Return: 0 on success or error code.
+ */
+static int pt_buffer_init_topa(struct pt_buffer *buf, int cpu,
+ unsigned long nr_pages, gfp_t gfp)
+{
+ struct topa *topa;
+ int err;
+
+ topa = topa_alloc(cpu, gfp);
+ if (!topa)
+ return -ENOMEM;
+
+ topa_insert_table(buf, topa);
+
+ while (buf->nr_pages < nr_pages) {
+ err = topa_insert_pages(buf, cpu, gfp);
+ if (err) {
+ pt_buffer_fini_topa(buf);
+ return -ENOMEM;
+ }
+ }
+
+ /* link last table to the first one, unless we're double buffering */
+ if (intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) {
+ TOPA_ENTRY(buf->last, -1)->base = topa_pfn(buf->first);
+ TOPA_ENTRY(buf->last, -1)->end = 1;
+ }
+
+ pt_topa_dump(buf);
+ return 0;
+}
+
+static int pt_buffer_try_single(struct pt_buffer *buf, int nr_pages)
+{
+ struct page *p = virt_to_page(buf->data_pages[0]);
+ int ret = -ENOTSUPP, order = 0;
+
+ /*
+ * We can use single range output mode
+ * + in snapshot mode, where we don't need interrupts;
+ * + if the hardware supports it;
+ * + if the entire buffer is one contiguous allocation.
+ */
+ if (!buf->snapshot)
+ goto out;
+
+ if (!intel_pt_validate_hw_cap(PT_CAP_single_range_output))
+ goto out;
+
+ if (PagePrivate(p))
+ order = page_private(p);
+
+ if (1 << order != nr_pages)
+ goto out;
+
+ /*
+ * Some processors cannot always support single range for more than
+ * 4KB - refer errata TGL052, ADL037 and RPL017. Future processors might
+ * also be affected, so for now rather than trying to keep track of
+ * which ones, just disable it for all.
+ */
+ if (nr_pages > 1)
+ goto out;
+
+ buf->single = true;
+ buf->nr_pages = nr_pages;
+ ret = 0;
+out:
+ return ret;
+}
+
+/**
+ * pt_buffer_setup_aux() - set up topa tables for a PT buffer
+ * @event: Performance event
+ * @pages: Array of pointers to buffer pages passed from perf core.
+ * @nr_pages: Number of pages in the buffer.
+ * @snapshot: If this is a snapshot/overwrite counter.
+ *
+ * This is a pmu::setup_aux callback that sets up ToPA tables and all the
+ * bookkeeping for an AUX buffer.
+ *
+ * Return: Our private PT buffer structure.
+ */
+static void *
+pt_buffer_setup_aux(struct perf_event *event, void **pages,
+ int nr_pages, bool snapshot)
+{
+ struct pt_buffer *buf;
+ int node, ret, cpu = event->cpu;
+
+ if (!nr_pages)
+ return NULL;
+
+ /*
+ * Only support AUX sampling in snapshot mode, where we don't
+ * generate NMIs.
+ */
+ if (event->attr.aux_sample_size && !snapshot)
+ return NULL;
+
+ if (cpu == -1)
+ cpu = raw_smp_processor_id();
+ node = cpu_to_node(cpu);
+
+ buf = kzalloc_node(sizeof(struct pt_buffer), GFP_KERNEL, node);
+ if (!buf)
+ return NULL;
+
+ buf->snapshot = snapshot;
+ buf->data_pages = pages;
+ buf->stop_pos = -1;
+ buf->intr_pos = -1;
+
+ INIT_LIST_HEAD(&buf->tables);
+
+ ret = pt_buffer_try_single(buf, nr_pages);
+ if (!ret)
+ return buf;
+
+ ret = pt_buffer_init_topa(buf, cpu, nr_pages, GFP_KERNEL);
+ if (ret) {
+ kfree(buf);
+ return NULL;
+ }
+
+ return buf;
+}
+
+/**
+ * pt_buffer_free_aux() - perf AUX deallocation path callback
+ * @data: PT buffer.
+ */
+static void pt_buffer_free_aux(void *data)
+{
+ struct pt_buffer *buf = data;
+
+ pt_buffer_fini_topa(buf);
+ kfree(buf);
+}
+
+static int pt_addr_filters_init(struct perf_event *event)
+{
+ struct pt_filters *filters;
+ int node = event->cpu == -1 ? -1 : cpu_to_node(event->cpu);
+
+ if (!intel_pt_validate_hw_cap(PT_CAP_num_address_ranges))
+ return 0;
+
+ filters = kzalloc_node(sizeof(struct pt_filters), GFP_KERNEL, node);
+ if (!filters)
+ return -ENOMEM;
+
+ if (event->parent)
+ memcpy(filters, event->parent->hw.addr_filters,
+ sizeof(*filters));
+
+ event->hw.addr_filters = filters;
+
+ return 0;
+}
+
+static void pt_addr_filters_fini(struct perf_event *event)
+{
+ kfree(event->hw.addr_filters);
+ event->hw.addr_filters = NULL;
+}
+
+#ifdef CONFIG_X86_64
+/* Clamp to a canonical address greater-than-or-equal-to the address given */
+static u64 clamp_to_ge_canonical_addr(u64 vaddr, u8 vaddr_bits)
+{
+ return __is_canonical_address(vaddr, vaddr_bits) ?
+ vaddr :
+ -BIT_ULL(vaddr_bits - 1);
+}
+
+/* Clamp to a canonical address less-than-or-equal-to the address given */
+static u64 clamp_to_le_canonical_addr(u64 vaddr, u8 vaddr_bits)
+{
+ return __is_canonical_address(vaddr, vaddr_bits) ?
+ vaddr :
+ BIT_ULL(vaddr_bits - 1) - 1;
+}
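+
+/*
+ * E.g. with 48 virtual address bits, a non-canonical range start is
+ * clamped up to 0xffff800000000000 and a non-canonical range end down
+ * to 0x00007fffffffffff.
+ */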
+#else
+#define clamp_to_ge_canonical_addr(x, y) (x)
+#define clamp_to_le_canonical_addr(x, y) (x)
+#endif
+
+static int pt_event_addr_filters_validate(struct list_head *filters)
+{
+ struct perf_addr_filter *filter;
+ int range = 0;
+
+ list_for_each_entry(filter, filters, entry) {
+ /*
+ * PT doesn't support single address triggers and
+ * 'start' filters.
+ */
+ if (!filter->size ||
+ filter->action == PERF_ADDR_FILTER_ACTION_START)
+ return -EOPNOTSUPP;
+
+ if (++range > intel_pt_validate_hw_cap(PT_CAP_num_address_ranges))
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static void pt_event_addr_filters_sync(struct perf_event *event)
+{
+ struct perf_addr_filters_head *head = perf_event_addr_filters(event);
+ unsigned long msr_a, msr_b;
+ struct perf_addr_filter_range *fr = event->addr_filter_ranges;
+ struct pt_filters *filters = event->hw.addr_filters;
+ struct perf_addr_filter *filter;
+ int range = 0;
+
+ if (!filters)
+ return;
+
+ list_for_each_entry(filter, &head->list, entry) {
+ if (filter->path.dentry && !fr[range].start) {
+ msr_a = msr_b = 0;
+ } else {
+ unsigned long n = fr[range].size - 1;
+ unsigned long a = fr[range].start;
+ unsigned long b;
+
+ if (a > ULONG_MAX - n)
+ b = ULONG_MAX;
+ else
+ b = a + n;
+ /*
+ * Apply the offset. 64-bit addresses written to the
+ * MSRs must be canonical, but the range can encompass
+ * non-canonical addresses. Since software cannot
+ * execute at non-canonical addresses, adjusting to
+ * canonical addresses does not affect the result of the
+ * address filter.
+ */
+ msr_a = clamp_to_ge_canonical_addr(a, boot_cpu_data.x86_virt_bits);
+ msr_b = clamp_to_le_canonical_addr(b, boot_cpu_data.x86_virt_bits);
+ if (msr_b < msr_a)
+ msr_a = msr_b = 0;
+ }
+
+ filters->filter[range].msr_a = msr_a;
+ filters->filter[range].msr_b = msr_b;
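+		/* ADDRn_CFG encoding: 1 == trace in range, 2 == TraceStop */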
+ if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER)
+ filters->filter[range].config = 1;
+ else
+ filters->filter[range].config = 2;
+ range++;
+ }
+
+ filters->nr_filters = range;
+}
+
+/**
+ * intel_pt_interrupt() - PT PMI handler
+ */
+void intel_pt_interrupt(void)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ struct pt_buffer *buf;
+ struct perf_event *event = pt->handle.event;
+
+ /*
+ * There may be a dangling PT bit in the interrupt status register
+ * after PT has been disabled by pt_event_stop(). Make sure we don't
+ * do anything (particularly, re-enable) for this event here.
+ */
+ if (!READ_ONCE(pt->handle_nmi))
+ return;
+
+ if (!event)
+ return;
+
+ pt_config_stop(event);
+
+ buf = perf_get_aux(&pt->handle);
+ if (!buf)
+ return;
+
+ pt_read_offset(buf);
+
+ pt_handle_status(pt);
+
+ pt_update_head(pt);
+
+ perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0));
+
+ if (!event->hw.state) {
+ int ret;
+
+ buf = perf_aux_output_begin(&pt->handle, event);
+ if (!buf) {
+ event->hw.state = PERF_HES_STOPPED;
+ WRITE_ONCE(pt->resume_allowed, 0);
+ return;
+ }
+
+ pt_buffer_reset_offsets(buf, pt->handle.head);
+ /* snapshot counters don't use PMI, so it's safe */
+ ret = pt_buffer_reset_markers(buf, &pt->handle);
+ if (ret) {
+ perf_aux_output_end(&pt->handle, 0);
+ WRITE_ONCE(pt->resume_allowed, 0);
+ return;
+ }
+
+ pt_config_buffer(buf);
+ pt_config_start(event);
+ }
+}
+
+void intel_pt_handle_vmx(int on)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ struct perf_event *event;
+ unsigned long flags;
+
+ /* PT plays nice with VMX, do nothing */
+ if (pt_pmu.vmx)
+ return;
+
+ /*
+ * VMXON will clear RTIT_CTL.TraceEn; we need to make
+ * sure to not try to set it while VMX is on. Disable
+ * interrupts to avoid racing with pmu callbacks;
+ * concurrent PMI should be handled fine.
+ */
+ local_irq_save(flags);
+ WRITE_ONCE(pt->vmx_on, on);
+
+ /*
+ * If an AUX transaction is in progress, it will contain
+ * gap(s), so flag it PARTIAL to inform the user.
+ */
+ event = pt->handle.event;
+ if (event)
+ perf_aux_output_flag(&pt->handle,
+ PERF_AUX_FLAG_PARTIAL);
+
+	/* Turn PT back on */
+ if (!on && event)
+ wrmsrq(MSR_IA32_RTIT_CTL, event->hw.aux_config);
+
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_FOR_KVM(intel_pt_handle_vmx);
+
+/*
+ * PMU callbacks
+ */
+
+static void pt_event_start(struct perf_event *event, int mode)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ struct pt_buffer *buf;
+
+ if (mode & PERF_EF_RESUME) {
+ if (READ_ONCE(pt->resume_allowed)) {
+ u64 status;
+
+ /*
+			 * It is only safe to start if the trace is not active
+			 * and the error and stopped bits are clear, but a
+			 * PMI might have just cleared these, so resume_allowed
+			 * must be checked again as well.
+ */
+ rdmsrq(MSR_IA32_RTIT_STATUS, status);
+ if (!(status & (RTIT_STATUS_TRIGGEREN |
+ RTIT_STATUS_ERROR |
+ RTIT_STATUS_STOPPED)) &&
+ READ_ONCE(pt->resume_allowed))
+ pt_config_start(event);
+ }
+ return;
+ }
+
+ buf = perf_aux_output_begin(&pt->handle, event);
+ if (!buf)
+ goto fail_stop;
+
+ pt_buffer_reset_offsets(buf, pt->handle.head);
+ if (!buf->snapshot) {
+ if (pt_buffer_reset_markers(buf, &pt->handle))
+ goto fail_end_stop;
+ }
+
+ hwc->state = 0;
+
+ pt_config_buffer(buf);
+ pt_config(event);
+
+ return;
+
+fail_end_stop:
+ perf_aux_output_end(&pt->handle, 0);
+fail_stop:
+ hwc->state = PERF_HES_STOPPED;
+}
+
+static void pt_event_stop(struct perf_event *event, int mode)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+
+ if (mode & PERF_EF_PAUSE) {
+ if (READ_ONCE(pt->pause_allowed))
+ pt_config_stop(event);
+ return;
+ }
+
+ /*
+ * Protect against the PMI racing with disabling wrmsr,
+ * see comment in intel_pt_interrupt().
+ */
+ WRITE_ONCE(pt->handle_nmi, 0);
+ barrier();
+
+ /*
+ * Prevent a resume from attempting to restart tracing, or a pause
+ * during a subsequent start. Do this after clearing handle_nmi so that
+ * pt_event_snapshot_aux() will not re-allow them.
+ */
+ WRITE_ONCE(pt->pause_allowed, 0);
+ WRITE_ONCE(pt->resume_allowed, 0);
+ barrier();
+
+ pt_config_stop(event);
+
+ if (event->hw.state == PERF_HES_STOPPED)
+ return;
+
+ event->hw.state = PERF_HES_STOPPED;
+
+ if (mode & PERF_EF_UPDATE) {
+ struct pt_buffer *buf = perf_get_aux(&pt->handle);
+
+ if (!buf)
+ return;
+
+ if (WARN_ON_ONCE(pt->handle.event != event))
+ return;
+
+ pt_read_offset(buf);
+
+ pt_handle_status(pt);
+
+ pt_update_head(pt);
+
+ if (buf->snapshot)
+ pt->handle.head =
+ local_xchg(&buf->data_size,
+ buf->nr_pages << PAGE_SHIFT);
+ perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0));
+ }
+}
+
+static long pt_event_snapshot_aux(struct perf_event *event,
+ struct perf_output_handle *handle,
+ unsigned long size)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ struct pt_buffer *buf = perf_get_aux(&pt->handle);
+ unsigned long from = 0, to;
+ long ret;
+
+ if (WARN_ON_ONCE(!buf))
+ return 0;
+
+ /*
+ * Sampling is only allowed on snapshot events;
+ * see pt_buffer_setup_aux().
+ */
+ if (WARN_ON_ONCE(!buf->snapshot))
+ return 0;
+
+ /* Prevent pause/resume from attempting to start/stop tracing */
+ WRITE_ONCE(pt->pause_allowed, 0);
+ WRITE_ONCE(pt->resume_allowed, 0);
+ barrier();
+ /*
+ * There is no PT interrupt in this mode, so stop the trace and it will
+ * remain stopped while the buffer is copied.
+ */
+ pt_config_stop(event);
+ pt_read_offset(buf);
+ pt_update_head(pt);
+
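+ /*
+ * Copy the most recent @size bytes of trace: if fewer than @size
+ * bytes have been written so far, the start offset wraps around to
+ * the end of the snapshot buffer.
+ */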
+ to = local_read(&buf->data_size);
+ if (to < size)
+ from = buf->nr_pages << PAGE_SHIFT;
+ from += to - size;
+
+ ret = perf_output_copy_aux(&pt->handle, handle, from, to);
+
+ /*
+ * Here, handle_nmi tells us if the tracing was on.
+ * If the tracing was on, restart it.
+ */
+ if (READ_ONCE(pt->handle_nmi)) {
+ WRITE_ONCE(pt->resume_allowed, 1);
+ barrier();
+ pt_config_start(event);
+ barrier();
+ WRITE_ONCE(pt->pause_allowed, 1);
+ }
+
+ return ret;
+}
+
+static void pt_event_del(struct perf_event *event, int mode)
+{
+ pt_event_stop(event, PERF_EF_UPDATE);
+}
+
+static int pt_event_add(struct perf_event *event, int mode)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ struct hw_perf_event *hwc = &event->hw;
+ int ret = -EBUSY;
+
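+ /* Only one PT event can be scheduled on a given CPU at a time. */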
+ if (pt->handle.event)
+ goto fail;
+
+ if (mode & PERF_EF_START) {
+ pt_event_start(event, 0);
+ ret = -EINVAL;
+ if (hwc->state == PERF_HES_STOPPED)
+ goto fail;
+ } else {
+ hwc->state = PERF_HES_STOPPED;
+ }
+
+ ret = 0;
+fail:
+
+ return ret;
+}
+
+static void pt_event_read(struct perf_event *event)
+{
+}
+
+static void pt_event_destroy(struct perf_event *event)
+{
+ pt_addr_filters_fini(event);
+ x86_del_exclusive(x86_lbr_exclusive_pt);
+}
+
+static int pt_event_init(struct perf_event *event)
+{
+ if (event->attr.type != pt_pmu.pmu.type)
+ return -ENOENT;
+
+ if (!pt_event_valid(event))
+ return -EINVAL;
+
+ if (x86_add_exclusive(x86_lbr_exclusive_pt))
+ return -EBUSY;
+
+ if (pt_addr_filters_init(event)) {
+ x86_del_exclusive(x86_lbr_exclusive_pt);
+ return -ENOMEM;
+ }
+
+ event->destroy = pt_event_destroy;
+
+ return 0;
+}
+
+void cpu_emergency_stop_pt(void)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+
+ if (pt->handle.event)
+ pt_event_stop(pt->handle.event, PERF_EF_UPDATE);
+}
+
+int is_intel_pt_event(struct perf_event *event)
+{
+ return event->pmu == &pt_pmu.pmu;
+}
+
+static __init int pt_init(void)
+{
+ int ret, cpu, prior_warn = 0;
+
+ BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE);
+
+ if (!boot_cpu_has(X86_FEATURE_INTEL_PT))
+ return -ENODEV;
+
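+ /*
+ * If PT has already been enabled on any CPU (e.g. by firmware),
+ * refuse to take it over.
+ */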
+ cpus_read_lock();
+ for_each_online_cpu(cpu) {
+ u64 ctl;
+
+ ret = rdmsrq_safe_on_cpu(cpu, MSR_IA32_RTIT_CTL, &ctl);
+ if (!ret && (ctl & RTIT_CTL_TRACEEN))
+ prior_warn++;
+ }
+ cpus_read_unlock();
+
+ if (prior_warn) {
+ x86_add_exclusive(x86_lbr_exclusive_pt);
+ pr_warn("PT is enabled at boot time, doing nothing\n");
+
+ return -EBUSY;
+ }
+
+ ret = pt_pmu_hw_init();
+ if (ret)
+ return ret;
+
+ if (!intel_pt_validate_hw_cap(PT_CAP_topa_output)) {
+ pr_warn("ToPA output is not supported on this CPU\n");
+ return -ENODEV;
+ }
+
+ if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries))
+ pt_pmu.pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG;
+ else
+ pt_pmu.pmu.capabilities = PERF_PMU_CAP_AUX_PREFER_LARGE;
+
+ pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE |
+ PERF_PMU_CAP_ITRACE |
+ PERF_PMU_CAP_AUX_PAUSE;
+ pt_pmu.pmu.attr_groups = pt_attr_groups;
+ pt_pmu.pmu.task_ctx_nr = perf_sw_context;
+ pt_pmu.pmu.event_init = pt_event_init;
+ pt_pmu.pmu.add = pt_event_add;
+ pt_pmu.pmu.del = pt_event_del;
+ pt_pmu.pmu.start = pt_event_start;
+ pt_pmu.pmu.stop = pt_event_stop;
+ pt_pmu.pmu.snapshot_aux = pt_event_snapshot_aux;
+ pt_pmu.pmu.read = pt_event_read;
+ pt_pmu.pmu.setup_aux = pt_buffer_setup_aux;
+ pt_pmu.pmu.free_aux = pt_buffer_free_aux;
+ pt_pmu.pmu.addr_filters_sync = pt_event_addr_filters_sync;
+ pt_pmu.pmu.addr_filters_validate = pt_event_addr_filters_validate;
+ pt_pmu.pmu.nr_addr_filters =
+ intel_pt_validate_hw_cap(PT_CAP_num_address_ranges);
+
+ ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1);
+
+ return ret;
+}
+arch_initcall(pt_init);
diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h
new file mode 100644
index 000000000000..2ac36250b656
--- /dev/null
+++ b/arch/x86/events/intel/pt.h
@@ -0,0 +1,135 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Intel(R) Processor Trace PMU driver for perf
+ * Copyright (c) 2013-2014, Intel Corporation.
+ *
+ * Intel PT is specified in the Intel Architecture Instruction Set Extensions
+ * Programming Reference:
+ * http://software.intel.com/en-us/intel-isa-extensions
+ */
+
+#ifndef __INTEL_PT_H__
+#define __INTEL_PT_H__
+
+/*
+ * Single-entry ToPA: when the output gets this close to the region
+ * boundary, switch buffers to avoid losing data.
+ */
+#define TOPA_PMI_MARGIN 512
+
+#define TOPA_SHIFT 12
+
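+/* Convert a ToPA entry's size field into bytes: regions are 4KB << tsz. */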
+static inline unsigned int sizes(unsigned int tsz)
+{
+ return 1 << (tsz + TOPA_SHIFT);
+}
+
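+/* Hardware layout of a single ToPA table entry (one output region descriptor). */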
+struct topa_entry {
+ u64 end : 1;
+ u64 rsvd0 : 1;
+ u64 intr : 1;
+ u64 rsvd1 : 1;
+ u64 stop : 1;
+ u64 rsvd2 : 1;
+ u64 size : 4;
+ u64 rsvd3 : 2;
+ u64 base : 40;
+ u64 rsvd4 : 12;
+};
+
+struct pt_pmu {
+ struct pmu pmu;
+ u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
+ bool vmx;
+ bool branch_en_always_on;
+ unsigned long max_nonturbo_ratio;
+ unsigned int tsc_art_num;
+ unsigned int tsc_art_den;
+};
+
+/**
+ * struct pt_buffer - buffer configuration; one buffer per task_struct or
+ * cpu, depending on perf event configuration
+ * @tables: list of ToPA tables in this buffer
+ * @first: shorthand for first topa table
+ * @last: shorthand for last topa table
+ * @cur: current topa table
+ * @nr_pages: buffer size in pages
+ * @cur_idx: current output region's index within @cur table
+ * @output_off: offset within the current output region
+ * @data_size: running total of the amount of data in this buffer
+ * @head: logical write offset inside the buffer
+ * @snapshot: if this is for a snapshot/overwrite counter
+ * @single: use Single Range Output instead of ToPA
+ * @wrapped: buffer advance wrapped back to the first topa table
+ * @stop_pos: STOP topa entry index
+ * @intr_pos: INT topa entry index
+ * @stop_te: STOP topa entry pointer
+ * @intr_te: INT topa entry pointer
+ * @data_pages: array of pages from perf
+ */
+struct pt_buffer {
+ struct list_head tables;
+ struct topa *first, *last, *cur;
+ unsigned int cur_idx;
+ size_t output_off;
+ unsigned long nr_pages;
+ local_t data_size;
+ local64_t head;
+ bool snapshot;
+ bool single;
+ bool wrapped;
+ long stop_pos, intr_pos;
+ struct topa_entry *stop_te, *intr_te;
+ void **data_pages;
+};
+
+#define PT_FILTERS_NUM 4
+
+/**
+ * struct pt_filter - IP range filter configuration
+ * @msr_a: range start, goes to RTIT_ADDRn_A
+ * @msr_b: range end, goes to RTIT_ADDRn_B
+ * @config: 4-bit field in RTIT_CTL
+ */
+struct pt_filter {
+ unsigned long msr_a;
+ unsigned long msr_b;
+ unsigned long config;
+};
+
+/**
+ * struct pt_filters - IP range filtering context
+ * @filter: filters defined for this context
+ * @nr_filters: number of defined filters in the @filter array
+ */
+struct pt_filters {
+ struct pt_filter filter[PT_FILTERS_NUM];
+ unsigned int nr_filters;
+};
+
+/**
+ * struct pt - per-cpu pt context
+ * @handle: perf output handle
+ * @filters: last configured filters
+ * @handle_nmi: handle PT PMIs on this cpu; nonzero while there is an active event
+ * @vmx_on: 1 if VMX is ON on this cpu
+ * @pause_allowed: PERF_EF_PAUSE is allowed to stop tracing
+ * @resume_allowed: PERF_EF_RESUME is allowed to start tracing
+ * @output_base: cached RTIT_OUTPUT_BASE MSR value
+ * @output_mask: cached RTIT_OUTPUT_MASK MSR value
+ */
+struct pt {
+ struct perf_output_handle handle;
+ struct pt_filters filters;
+ int handle_nmi;
+ int vmx_on;
+ int pause_allowed;
+ int resume_allowed;
+ u64 output_base;
+ u64 output_mask;
+};
+
+#endif /* __INTEL_PT_H__ */
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
new file mode 100644
index 000000000000..e228e564b15e
--- /dev/null
+++ b/arch/x86/events/intel/uncore.c
@@ -0,0 +1,1985 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/module.h>
+
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+#include <asm/msr.h>
+#include "uncore.h"
+#include "uncore_discovery.h"
+
+static bool uncore_no_discover;
+module_param(uncore_no_discover, bool, 0);
+MODULE_PARM_DESC(uncore_no_discover, "Don't enable the Intel uncore PerfMon discovery mechanism "
+ "(default: enable the discovery mechanism).");
+struct intel_uncore_type *empty_uncore[] = { NULL, };
+struct intel_uncore_type **uncore_msr_uncores = empty_uncore;
+struct intel_uncore_type **uncore_pci_uncores = empty_uncore;
+struct intel_uncore_type **uncore_mmio_uncores = empty_uncore;
+
+static bool pcidrv_registered;
+struct pci_driver *uncore_pci_driver;
+/* The PCI sub driver for devices which the uncore driver doesn't own. */
+struct pci_driver *uncore_pci_sub_driver;
+/* pci bus to socket mapping */
+DEFINE_RAW_SPINLOCK(pci2phy_map_lock);
+struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head);
+struct pci_extra_dev *uncore_extra_pci_dev;
+int __uncore_max_dies;
+
+/* mask of cpus that collect uncore events */
+static cpumask_t uncore_cpu_mask;
+
+/* constraint for the fixed counter */
+static struct event_constraint uncore_constraint_fixed =
+ EVENT_CONSTRAINT(~0ULL, 1 << UNCORE_PMC_IDX_FIXED, ~0ULL);
+struct event_constraint uncore_constraint_empty =
+ EVENT_CONSTRAINT(0, 0, 0);
+
+MODULE_DESCRIPTION("Support for Intel uncore performance events");
+MODULE_LICENSE("GPL");
+
+int uncore_pcibus_to_dieid(struct pci_bus *bus)
+{
+ struct pci2phy_map *map;
+ int die_id = -1;
+
+ raw_spin_lock(&pci2phy_map_lock);
+ list_for_each_entry(map, &pci2phy_map_head, list) {
+ if (map->segment == pci_domain_nr(bus)) {
+ die_id = map->pbus_to_dieid[bus->number];
+ break;
+ }
+ }
+ raw_spin_unlock(&pci2phy_map_lock);
+
+ return die_id;
+}
+
+int uncore_die_to_segment(int die)
+{
+ struct pci_bus *bus = NULL;
+
+ /* Find the first pci bus that belongs to the specified die. */
+ while ((bus = pci_find_next_bus(bus)) &&
+ (die != uncore_pcibus_to_dieid(bus)))
+ ;
+
+ return bus ? pci_domain_nr(bus) : -EINVAL;
+}
+
+int uncore_device_to_die(struct pci_dev *dev)
+{
+ int node = pcibus_to_node(dev->bus);
+ int cpu;
+
+ for_each_cpu(cpu, cpumask_of_pcibus(dev->bus)) {
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ if (c->initialized && cpu_to_node(cpu) == node)
+ return c->topo.logical_die_id;
+ }
+
+ return -1;
+}
+
+static void uncore_free_pcibus_map(void)
+{
+ struct pci2phy_map *map, *tmp;
+
+ list_for_each_entry_safe(map, tmp, &pci2phy_map_head, list) {
+ list_del(&map->list);
+ kfree(map);
+ }
+}
+
+struct pci2phy_map *__find_pci2phy_map(int segment)
+{
+ struct pci2phy_map *map, *alloc = NULL;
+ int i;
+
+ lockdep_assert_held(&pci2phy_map_lock);
+
+lookup:
+ list_for_each_entry(map, &pci2phy_map_head, list) {
+ if (map->segment == segment)
+ goto end;
+ }
+
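+ /*
+ * Not found: drop the lock to allocate a new entry, then retake it
+ * and re-scan the list, since another CPU may have inserted the
+ * segment while the lock was released.
+ */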
+ if (!alloc) {
+ raw_spin_unlock(&pci2phy_map_lock);
+ alloc = kmalloc(sizeof(struct pci2phy_map), GFP_KERNEL);
+ raw_spin_lock(&pci2phy_map_lock);
+
+ if (!alloc)
+ return NULL;
+
+ goto lookup;
+ }
+
+ map = alloc;
+ alloc = NULL;
+ map->segment = segment;
+ for (i = 0; i < 256; i++)
+ map->pbus_to_dieid[i] = -1;
+ list_add_tail(&map->list, &pci2phy_map_head);
+
+end:
+ kfree(alloc);
+ return map;
+}
+
+ssize_t uncore_event_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct uncore_event_desc *event =
+ container_of(attr, struct uncore_event_desc, attr);
+ return sprintf(buf, "%s", event->config);
+}
+
+struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu)
+{
+ unsigned int dieid = topology_logical_die_id(cpu);
+
+ /*
+ * The unsigned check also catches the '-1' return value for
+ * non-existent mappings in the topology map.
+ */
+ return dieid < uncore_max_dies() ? pmu->boxes[dieid] : NULL;
+}
+
+u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+ u64 count;
+
+ rdmsrq(event->hw.event_base, count);
+
+ return count;
+}
+
+void uncore_mmio_exit_box(struct intel_uncore_box *box)
+{
+ if (box->io_addr)
+ iounmap(box->io_addr);
+}
+
+u64 uncore_mmio_read_counter(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ if (!box->io_addr)
+ return 0;
+
+ if (!uncore_mmio_is_valid_offset(box, event->hw.event_base))
+ return 0;
+
+ return readq(box->io_addr + event->hw.event_base);
+}
+
+/*
+ * generic get constraint function for shared match/mask registers.
+ */
+struct event_constraint *
+uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct intel_uncore_extra_reg *er;
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+ unsigned long flags;
+ bool ok = false;
+
+ /*
+ * reg->alloc can be set due to existing state, so for fake box we
+ * need to ignore this, otherwise we might fail to allocate proper
+ * fake state for this extra reg constraint.
+ */
+ if (reg1->idx == EXTRA_REG_NONE ||
+ (!uncore_box_is_fake(box) && reg1->alloc))
+ return NULL;
+
+ er = &box->shared_regs[reg1->idx];
+ raw_spin_lock_irqsave(&er->lock, flags);
+ if (!atomic_read(&er->ref) ||
+ (er->config1 == reg1->config && er->config2 == reg2->config)) {
+ atomic_inc(&er->ref);
+ er->config1 = reg1->config;
+ er->config2 = reg2->config;
+ ok = true;
+ }
+ raw_spin_unlock_irqrestore(&er->lock, flags);
+
+ if (ok) {
+ if (!uncore_box_is_fake(box))
+ reg1->alloc = 1;
+ return NULL;
+ }
+
+ return &uncore_constraint_empty;
+}
+
+void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct intel_uncore_extra_reg *er;
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+
+ /*
+ * Only put the constraint if the extra reg was actually allocated. This
+ * also takes care of events which do not use an extra shared reg.
+ *
+ * Also, if this is a fake box we shouldn't touch any event state
+ * (reg->alloc) and we don't care about leaving inconsistent box
+ * state either since it will be thrown out.
+ */
+ if (uncore_box_is_fake(box) || !reg1->alloc)
+ return;
+
+ er = &box->shared_regs[reg1->idx];
+ atomic_dec(&er->ref);
+ reg1->alloc = 0;
+}
+
+u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx)
+{
+ struct intel_uncore_extra_reg *er;
+ unsigned long flags;
+ u64 config;
+
+ er = &box->shared_regs[idx];
+
+ raw_spin_lock_irqsave(&er->lock, flags);
+ config = er->config;
+ raw_spin_unlock_irqrestore(&er->lock, flags);
+
+ return config;
+}
+
+static void uncore_assign_hw_event(struct intel_uncore_box *box,
+ struct perf_event *event, int idx)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ hwc->idx = idx;
+ hwc->last_tag = ++box->tags[idx];
+
+ if (uncore_pmc_fixed(hwc->idx)) {
+ hwc->event_base = uncore_fixed_ctr(box);
+ hwc->config_base = uncore_fixed_ctl(box);
+ return;
+ }
+
+ if (intel_generic_uncore_assign_hw_event(event, box))
+ return;
+
+ hwc->config_base = uncore_event_ctl(box, hwc->idx);
+ hwc->event_base = uncore_perf_ctr(box, hwc->idx);
+}
+
+void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event)
+{
+ u64 prev_count, new_count, delta;
+ int shift;
+
+ if (uncore_pmc_freerunning(event->hw.idx))
+ shift = 64 - uncore_freerunning_bits(box, event);
+ else if (uncore_pmc_fixed(event->hw.idx))
+ shift = 64 - uncore_fixed_ctr_bits(box);
+ else
+ shift = 64 - uncore_perf_ctr_bits(box);
+
+ /* the hrtimer might modify the previous event value */
+again:
+ prev_count = local64_read(&event->hw.prev_count);
+ new_count = uncore_read_counter(box, event);
+ if (local64_xchg(&event->hw.prev_count, new_count) != prev_count)
+ goto again;
+
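+ /*
+ * The counters are narrower than 64 bits: shift both values up to
+ * the top bit and back down so the subtraction wraps at the real
+ * counter width.
+ */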
+ delta = (new_count << shift) - (prev_count << shift);
+ delta >>= shift;
+
+ local64_add(delta, &event->count);
+}
+
+/*
+ * The overflow interrupt is unavailable for SandyBridge-EP and broken
+ * for SandyBridge, so we use an hrtimer to periodically poll the
+ * counter to avoid overflow.
+ */
+static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer)
+{
+ struct intel_uncore_box *box;
+ struct perf_event *event;
+ int bit;
+
+ box = container_of(hrtimer, struct intel_uncore_box, hrtimer);
+ if (!box->n_active || box->cpu != smp_processor_id())
+ return HRTIMER_NORESTART;
+
+ /*
+ * handle boxes with an active event list as opposed to active
+ * counters
+ */
+ list_for_each_entry(event, &box->active_list, active_entry) {
+ uncore_perf_event_update(box, event);
+ }
+
+ for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX)
+ uncore_perf_event_update(box, box->events[bit]);
+
+ hrtimer_forward_now(hrtimer, ns_to_ktime(box->hrtimer_duration));
+ return HRTIMER_RESTART;
+}
+
+void uncore_pmu_start_hrtimer(struct intel_uncore_box *box)
+{
+ hrtimer_start(&box->hrtimer, ns_to_ktime(box->hrtimer_duration),
+ HRTIMER_MODE_REL_PINNED_HARD);
+}
+
+void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box)
+{
+ hrtimer_cancel(&box->hrtimer);
+}
+
+static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box)
+{
+ hrtimer_setup(&box->hrtimer, uncore_pmu_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
+}
+
+static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type,
+ int node)
+{
+ int i, size, numshared = type->num_shared_regs;
+ struct intel_uncore_box *box;
+
+ size = sizeof(*box) + numshared * sizeof(struct intel_uncore_extra_reg);
+
+ box = kzalloc_node(size, GFP_KERNEL, node);
+ if (!box)
+ return NULL;
+
+ for (i = 0; i < numshared; i++)
+ raw_spin_lock_init(&box->shared_regs[i].lock);
+
+ uncore_pmu_init_hrtimer(box);
+ box->cpu = -1;
+ box->dieid = -1;
+
+ /* set default hrtimer timeout */
+ box->hrtimer_duration = UNCORE_PMU_HRTIMER_INTERVAL;
+
+ INIT_LIST_HEAD(&box->active_list);
+
+ return box;
+}
+
+/*
+ * The uncore_pmu_event_init() pmu event_init callback is used as a
+ * detection point for uncore events.
+ */
+static int uncore_pmu_event_init(struct perf_event *event);
+
+static bool is_box_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ return &box->pmu->pmu == event->pmu;
+}
+
+static int
+uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader,
+ bool dogrp)
+{
+ struct perf_event *event;
+ int n, max_count;
+
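+ /* One slot per generic counter, plus one for the fixed counter if present. */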
+ max_count = box->pmu->type->num_counters;
+ if (box->pmu->type->fixed_ctl)
+ max_count++;
+
+ if (box->n_events >= max_count)
+ return -EINVAL;
+
+ n = box->n_events;
+
+ if (is_box_event(box, leader)) {
+ box->event_list[n] = leader;
+ n++;
+ }
+
+ if (!dogrp)
+ return n;
+
+ for_each_sibling_event(event, leader) {
+ if (!is_box_event(box, event) ||
+ event->state <= PERF_EVENT_STATE_OFF)
+ continue;
+
+ if (n >= max_count)
+ return -EINVAL;
+
+ box->event_list[n] = event;
+ n++;
+ }
+ return n;
+}
+
+static struct event_constraint *
+uncore_get_event_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct intel_uncore_type *type = box->pmu->type;
+ struct event_constraint *c;
+
+ if (type->ops->get_constraint) {
+ c = type->ops->get_constraint(box, event);
+ if (c)
+ return c;
+ }
+
+ if (event->attr.config == UNCORE_FIXED_EVENT)
+ return &uncore_constraint_fixed;
+
+ if (type->constraints) {
+ for_each_event_constraint(c, type->constraints) {
+ if ((event->hw.config & c->cmask) == c->code)
+ return c;
+ }
+ }
+
+ return &type->unconstrainted;
+}
+
+static void uncore_put_event_constraint(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ if (box->pmu->type->ops->put_constraint)
+ box->pmu->type->ops->put_constraint(box, event);
+}
+
+static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int n)
+{
+ unsigned long used_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
+ struct event_constraint *c;
+ int i, wmin, wmax, ret = 0;
+ struct hw_perf_event *hwc;
+
+ bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX);
+
+ for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) {
+ c = uncore_get_event_constraint(box, box->event_list[i]);
+ box->event_constraint[i] = c;
+ wmin = min(wmin, c->weight);
+ wmax = max(wmax, c->weight);
+ }
+
+ /* fastpath, try to reuse previous register */
+ for (i = 0; i < n; i++) {
+ hwc = &box->event_list[i]->hw;
+ c = box->event_constraint[i];
+
+ /* never assigned */
+ if (hwc->idx == -1)
+ break;
+
+ /* constraint still honored */
+ if (!test_bit(hwc->idx, c->idxmsk))
+ break;
+
+ /* not already used */
+ if (test_bit(hwc->idx, used_mask))
+ break;
+
+ __set_bit(hwc->idx, used_mask);
+ if (assign)
+ assign[i] = hwc->idx;
+ }
+ /* slow path */
+ if (i != n)
+ ret = perf_assign_events(box->event_constraint, n,
+ wmin, wmax, n, assign);
+
+ if (!assign || ret) {
+ for (i = 0; i < n; i++)
+ uncore_put_event_constraint(box, box->event_list[i]);
+ }
+ return ret ? -EINVAL : 0;
+}
+
+void uncore_pmu_event_start(struct perf_event *event, int flags)
+{
+ struct intel_uncore_box *box = uncore_event_to_box(event);
+ int idx = event->hw.idx;
+
+ if (WARN_ON_ONCE(idx == -1 || idx >= UNCORE_PMC_IDX_MAX))
+ return;
+
+ /*
+ * A free running counter is read-only and always active.
+ * Use the current counter value as the start point.
+ * There is no overflow interrupt for a free running counter,
+ * so use the hrtimer to periodically poll the counter and avoid overflow.
+ */
+ if (uncore_pmc_freerunning(event->hw.idx)) {
+ list_add_tail(&event->active_entry, &box->active_list);
+ local64_set(&event->hw.prev_count,
+ uncore_read_counter(box, event));
+ if (box->n_active++ == 0)
+ uncore_pmu_start_hrtimer(box);
+ return;
+ }
+
+ if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+ return;
+
+ event->hw.state = 0;
+ box->events[idx] = event;
+ box->n_active++;
+ __set_bit(idx, box->active_mask);
+
+ local64_set(&event->hw.prev_count, uncore_read_counter(box, event));
+ uncore_enable_event(box, event);
+
+ if (box->n_active == 1)
+ uncore_pmu_start_hrtimer(box);
+}
+
+void uncore_pmu_event_stop(struct perf_event *event, int flags)
+{
+ struct intel_uncore_box *box = uncore_event_to_box(event);
+ struct hw_perf_event *hwc = &event->hw;
+
+ /* Cannot disable free running counter which is read-only */
+ if (uncore_pmc_freerunning(hwc->idx)) {
+ list_del(&event->active_entry);
+ if (--box->n_active == 0)
+ uncore_pmu_cancel_hrtimer(box);
+ uncore_perf_event_update(box, event);
+ return;
+ }
+
+ if (__test_and_clear_bit(hwc->idx, box->active_mask)) {
+ uncore_disable_event(box, event);
+ box->n_active--;
+ box->events[hwc->idx] = NULL;
+ WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+ hwc->state |= PERF_HES_STOPPED;
+
+ if (box->n_active == 0)
+ uncore_pmu_cancel_hrtimer(box);
+ }
+
+ if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+ /*
+ * Drain the remaining delta count out of an event
+ * that we are disabling:
+ */
+ uncore_perf_event_update(box, event);
+ hwc->state |= PERF_HES_UPTODATE;
+ }
+}
+
+int uncore_pmu_event_add(struct perf_event *event, int flags)
+{
+ struct intel_uncore_box *box = uncore_event_to_box(event);
+ struct hw_perf_event *hwc = &event->hw;
+ int assign[UNCORE_PMC_IDX_MAX];
+ int i, n, ret;
+
+ if (!box)
+ return -ENODEV;
+
+ /*
+ * The free running counter is assigned in event_init().
+ * The free running counter event and free running counter
+ * are 1:1 mapped. It doesn't need to be tracked in event_list.
+ */
+ if (uncore_pmc_freerunning(hwc->idx)) {
+ if (flags & PERF_EF_START)
+ uncore_pmu_event_start(event, 0);
+ return 0;
+ }
+
+ ret = n = uncore_collect_events(box, event, false);
+ if (ret < 0)
+ return ret;
+
+ hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+ if (!(flags & PERF_EF_START))
+ hwc->state |= PERF_HES_ARCH;
+
+ ret = uncore_assign_events(box, assign, n);
+ if (ret)
+ return ret;
+
+ /* save events moving to new counters */
+ for (i = 0; i < box->n_events; i++) {
+ event = box->event_list[i];
+ hwc = &event->hw;
+
+ if (hwc->idx == assign[i] &&
+ hwc->last_tag == box->tags[assign[i]])
+ continue;
+ /*
+ * Ensure we don't accidentally enable a stopped
+ * counter simply because we rescheduled.
+ */
+ if (hwc->state & PERF_HES_STOPPED)
+ hwc->state |= PERF_HES_ARCH;
+
+ uncore_pmu_event_stop(event, PERF_EF_UPDATE);
+ }
+
+ /* reprogram moved events into new counters */
+ for (i = 0; i < n; i++) {
+ event = box->event_list[i];
+ hwc = &event->hw;
+
+ if (hwc->idx != assign[i] ||
+ hwc->last_tag != box->tags[assign[i]])
+ uncore_assign_hw_event(box, event, assign[i]);
+ else if (i < box->n_events)
+ continue;
+
+ if (hwc->state & PERF_HES_ARCH)
+ continue;
+
+ uncore_pmu_event_start(event, 0);
+ }
+ box->n_events = n;
+
+ return 0;
+}
+
+void uncore_pmu_event_del(struct perf_event *event, int flags)
+{
+ struct intel_uncore_box *box = uncore_event_to_box(event);
+ int i;
+
+ uncore_pmu_event_stop(event, PERF_EF_UPDATE);
+
+ /*
+ * The event for a free running counter is not tracked by event_list.
+ * There is no need to force event->hw.idx = -1 to reassign the counter,
+ * because the event and the free running counter are 1:1 mapped.
+ */
+ if (uncore_pmc_freerunning(event->hw.idx))
+ return;
+
+ for (i = 0; i < box->n_events; i++) {
+ if (event == box->event_list[i]) {
+ uncore_put_event_constraint(box, event);
+
+ for (++i; i < box->n_events; i++)
+ box->event_list[i - 1] = box->event_list[i];
+
+ --box->n_events;
+ break;
+ }
+ }
+
+ event->hw.idx = -1;
+ event->hw.last_tag = ~0ULL;
+}
+
+void uncore_pmu_event_read(struct perf_event *event)
+{
+ struct intel_uncore_box *box = uncore_event_to_box(event);
+ uncore_perf_event_update(box, event);
+}
+
+/*
+ * validation ensures the group can be loaded onto the
+ * PMU if it was the only group available.
+ */
+static int uncore_validate_group(struct intel_uncore_pmu *pmu,
+ struct perf_event *event)
+{
+ struct perf_event *leader = event->group_leader;
+ struct intel_uncore_box *fake_box;
+ int ret = -EINVAL, n;
+
+ /* The free running counter is always active. */
+ if (uncore_pmc_freerunning(event->hw.idx))
+ return 0;
+
+ fake_box = uncore_alloc_box(pmu->type, NUMA_NO_NODE);
+ if (!fake_box)
+ return -ENOMEM;
+
+ fake_box->pmu = pmu;
+ /*
+ * The event is not yet connected with its siblings, therefore we
+ * must first collect the existing siblings, then add the new event
+ * before we can simulate the scheduling.
+ */
+ n = uncore_collect_events(fake_box, leader, true);
+ if (n < 0)
+ goto out;
+
+ fake_box->n_events = n;
+ n = uncore_collect_events(fake_box, event, false);
+ if (n < 0)
+ goto out;
+
+ fake_box->n_events = n;
+
+ ret = uncore_assign_events(fake_box, NULL, n);
+out:
+ kfree(fake_box);
+ return ret;
+}
+
+static int uncore_pmu_event_init(struct perf_event *event)
+{
+ struct intel_uncore_pmu *pmu;
+ struct intel_uncore_box *box;
+ struct hw_perf_event *hwc = &event->hw;
+ int ret;
+
+ if (event->attr.type != event->pmu->type)
+ return -ENOENT;
+
+ pmu = uncore_event_to_pmu(event);
+ /* no device found for this pmu */
+ if (!pmu->registered)
+ return -ENOENT;
+
+ /* Sampling not supported yet */
+ if (hwc->sample_period)
+ return -EINVAL;
+
+ /*
+ * Place all uncore events for a particular physical package
+ * onto a single cpu
+ */
+ if (event->cpu < 0)
+ return -EINVAL;
+ box = uncore_pmu_to_box(pmu, event->cpu);
+ if (!box || box->cpu < 0)
+ return -EINVAL;
+ event->cpu = box->cpu;
+ event->pmu_private = box;
+
+ event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
+
+ event->hw.idx = -1;
+ event->hw.last_tag = ~0ULL;
+ event->hw.extra_reg.idx = EXTRA_REG_NONE;
+ event->hw.branch_reg.idx = EXTRA_REG_NONE;
+
+ if (event->attr.config == UNCORE_FIXED_EVENT) {
+ /* no fixed counter */
+ if (!pmu->type->fixed_ctl)
+ return -EINVAL;
+ /*
+ * if there is only one fixed counter, only the first pmu
+ * can access the fixed counter
+ */
+ if (pmu->type->single_fixed && pmu->pmu_idx > 0)
+ return -EINVAL;
+
+ /* fixed counters have event field hardcoded to zero */
+ hwc->config = 0ULL;
+ } else if (is_freerunning_event(event)) {
+ hwc->config = event->attr.config;
+ if (!check_valid_freerunning_event(box, event))
+ return -EINVAL;
+ event->hw.idx = UNCORE_PMC_IDX_FREERUNNING;
+ /*
+ * The free running counter event and free running counter
+ * are always 1:1 mapped.
+ * The free running counter is always active.
+ * Assign the free running counter here.
+ */
+ event->hw.event_base = uncore_freerunning_counter(box, event);
+ } else {
+ hwc->config = event->attr.config &
+ (pmu->type->event_mask | ((u64)pmu->type->event_mask_ext << 32));
+ if (pmu->type->ops->hw_config) {
+ ret = pmu->type->ops->hw_config(box, event);
+ if (ret)
+ return ret;
+ }
+ }
+
+ if (event->group_leader != event)
+ ret = uncore_validate_group(pmu, event);
+ else
+ ret = 0;
+
+ return ret;
+}
+
+static void uncore_pmu_enable(struct pmu *pmu)
+{
+ struct intel_uncore_pmu *uncore_pmu;
+ struct intel_uncore_box *box;
+
+ uncore_pmu = container_of(pmu, struct intel_uncore_pmu, pmu);
+
+ box = uncore_pmu_to_box(uncore_pmu, smp_processor_id());
+ if (!box)
+ return;
+
+ if (uncore_pmu->type->ops->enable_box)
+ uncore_pmu->type->ops->enable_box(box);
+}
+
+static void uncore_pmu_disable(struct pmu *pmu)
+{
+ struct intel_uncore_pmu *uncore_pmu;
+ struct intel_uncore_box *box;
+
+ uncore_pmu = container_of(pmu, struct intel_uncore_pmu, pmu);
+
+ box = uncore_pmu_to_box(uncore_pmu, smp_processor_id());
+ if (!box)
+ return;
+
+ if (uncore_pmu->type->ops->disable_box)
+ uncore_pmu->type->ops->disable_box(box);
+}
+
+static ssize_t uncore_get_attr_cpumask(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct intel_uncore_pmu *pmu = container_of(dev_get_drvdata(dev), struct intel_uncore_pmu, pmu);
+
+ return cpumap_print_to_pagebuf(true, buf, &pmu->cpu_mask);
+}
+
+static DEVICE_ATTR(cpumask, S_IRUGO, uncore_get_attr_cpumask, NULL);
+
+static struct attribute *uncore_pmu_attrs[] = {
+ &dev_attr_cpumask.attr,
+ NULL,
+};
+
+static const struct attribute_group uncore_pmu_attr_group = {
+ .attrs = uncore_pmu_attrs,
+};
+
+static inline int uncore_get_box_id(struct intel_uncore_type *type,
+ struct intel_uncore_pmu *pmu)
+{
+ if (type->boxes)
+ return intel_uncore_find_discovery_unit_id(type->boxes, -1, pmu->pmu_idx);
+
+ return pmu->pmu_idx;
+}
+
+void uncore_get_alias_name(char *pmu_name, struct intel_uncore_pmu *pmu)
+{
+ struct intel_uncore_type *type = pmu->type;
+
+ if (type->num_boxes == 1)
+ sprintf(pmu_name, "uncore_type_%u", type->type_id);
+ else {
+ sprintf(pmu_name, "uncore_type_%u_%d",
+ type->type_id, uncore_get_box_id(type, pmu));
+ }
+}
+
+static void uncore_get_pmu_name(struct intel_uncore_pmu *pmu)
+{
+ struct intel_uncore_type *type = pmu->type;
+
+ /*
+ * There is no uncore block name in the discovery table.
+ * Use uncore_type_<typeid>_<boxid> as the name.
+ */
+ if (!type->name) {
+ uncore_get_alias_name(pmu->name, pmu);
+ return;
+ }
+
+ if (type->num_boxes == 1) {
+ if (strlen(type->name) > 0)
+ sprintf(pmu->name, "uncore_%s", type->name);
+ else
+ sprintf(pmu->name, "uncore");
+ } else {
+ /*
+ * Use the box ID from the discovery table if applicable.
+ */
+ sprintf(pmu->name, "uncore_%s_%d", type->name,
+ uncore_get_box_id(type, pmu));
+ }
+}
+
+static int uncore_pmu_register(struct intel_uncore_pmu *pmu)
+{
+ int ret;
+
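+ /*
+ * A type may provide its own pmu template; otherwise fill in the
+ * generic uncore callbacks.
+ */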
+ if (!pmu->type->pmu) {
+ pmu->pmu = (struct pmu) {
+ .attr_groups = pmu->type->attr_groups,
+ .task_ctx_nr = perf_invalid_context,
+ .pmu_enable = uncore_pmu_enable,
+ .pmu_disable = uncore_pmu_disable,
+ .event_init = uncore_pmu_event_init,
+ .add = uncore_pmu_event_add,
+ .del = uncore_pmu_event_del,
+ .start = uncore_pmu_event_start,
+ .stop = uncore_pmu_event_stop,
+ .read = uncore_pmu_event_read,
+ .module = THIS_MODULE,
+ .capabilities = PERF_PMU_CAP_NO_EXCLUDE,
+ .attr_update = pmu->type->attr_update,
+ };
+ } else {
+ pmu->pmu = *pmu->type->pmu;
+ pmu->pmu.attr_groups = pmu->type->attr_groups;
+ pmu->pmu.attr_update = pmu->type->attr_update;
+ }
+
+ uncore_get_pmu_name(pmu);
+
+ ret = perf_pmu_register(&pmu->pmu, pmu->name, -1);
+ if (!ret)
+ pmu->registered = true;
+ return ret;
+}
+
+static void uncore_pmu_unregister(struct intel_uncore_pmu *pmu)
+{
+ if (!pmu->registered)
+ return;
+ perf_pmu_unregister(&pmu->pmu);
+ pmu->registered = false;
+}
+
+static void uncore_free_boxes(struct intel_uncore_pmu *pmu)
+{
+ int die;
+
+ for (die = 0; die < uncore_max_dies(); die++)
+ kfree(pmu->boxes[die]);
+ kfree(pmu->boxes);
+}
+
+static void uncore_type_exit(struct intel_uncore_type *type)
+{
+ struct intel_uncore_pmu *pmu = type->pmus;
+ int i;
+
+ if (type->cleanup_mapping)
+ type->cleanup_mapping(type);
+
+ if (type->cleanup_extra_boxes)
+ type->cleanup_extra_boxes(type);
+
+ if (pmu) {
+ for (i = 0; i < type->num_boxes; i++, pmu++) {
+ uncore_pmu_unregister(pmu);
+ uncore_free_boxes(pmu);
+ }
+ kfree(type->pmus);
+ type->pmus = NULL;
+ }
+
+ kfree(type->events_group);
+ type->events_group = NULL;
+}
+
+static void uncore_types_exit(struct intel_uncore_type **types)
+{
+ for (; *types; types++)
+ uncore_type_exit(*types);
+}
+
+static int __init uncore_type_init(struct intel_uncore_type *type)
+{
+ struct intel_uncore_pmu *pmus;
+ size_t size;
+ int i, j;
+
+ pmus = kcalloc(type->num_boxes, sizeof(*pmus), GFP_KERNEL);
+ if (!pmus)
+ return -ENOMEM;
+
+ size = uncore_max_dies() * sizeof(struct intel_uncore_box *);
+
+ for (i = 0; i < type->num_boxes; i++) {
+ pmus[i].pmu_idx = i;
+ pmus[i].type = type;
+ pmus[i].boxes = kzalloc(size, GFP_KERNEL);
+ if (!pmus[i].boxes)
+ goto err;
+ }
+
+ type->pmus = pmus;
+ type->unconstrainted = (struct event_constraint)
+ __EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1,
+ 0, type->num_counters, 0, 0);
+
+ if (type->event_descs) {
+ struct {
+ struct attribute_group group;
+ struct attribute *attrs[];
+ } *attr_group;
+ for (i = 0; type->event_descs[i].attr.attr.name; i++);
+
+ attr_group = kzalloc(struct_size(attr_group, attrs, i + 1),
+ GFP_KERNEL);
+ if (!attr_group)
+ goto err;
+
+ attr_group->group.name = "events";
+ attr_group->group.attrs = attr_group->attrs;
+
+ for (j = 0; j < i; j++)
+ attr_group->attrs[j] = &type->event_descs[j].attr.attr;
+
+ type->events_group = &attr_group->group;
+ }
+
+ type->pmu_group = &uncore_pmu_attr_group;
+
+ if (type->set_mapping)
+ type->set_mapping(type);
+
+ return 0;
+
+err:
+ for (i = 0; i < type->num_boxes; i++)
+ kfree(pmus[i].boxes);
+ kfree(pmus);
+
+ return -ENOMEM;
+}
+
+static int __init
+uncore_types_init(struct intel_uncore_type **types)
+{
+ int ret;
+
+ for (; *types; types++) {
+ ret = uncore_type_init(*types);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+/*
+ * Get the die information of a PCI device.
+ * @pdev: The PCI device.
+ * @die: The die id which the device maps to.
+ */
+static int uncore_pci_get_dev_die_info(struct pci_dev *pdev, int *die)
+{
+ *die = uncore_pcibus_to_dieid(pdev->bus);
+ if (*die < 0)
+ return -EINVAL;
+
+ return 0;
+}
+
+static struct intel_uncore_pmu *
+uncore_pci_find_dev_pmu_from_types(struct pci_dev *pdev)
+{
+ struct intel_uncore_type **types = uncore_pci_uncores;
+ struct intel_uncore_discovery_unit *unit;
+ struct intel_uncore_type *type;
+ struct rb_node *node;
+
+ for (; *types; types++) {
+ type = *types;
+
+ for (node = rb_first(type->boxes); node; node = rb_next(node)) {
+ unit = rb_entry(node, struct intel_uncore_discovery_unit, node);
+ if (pdev->devfn == UNCORE_DISCOVERY_PCI_DEVFN(unit->addr) &&
+ pdev->bus->number == UNCORE_DISCOVERY_PCI_BUS(unit->addr) &&
+ pci_domain_nr(pdev->bus) == UNCORE_DISCOVERY_PCI_DOMAIN(unit->addr))
+ return &type->pmus[unit->pmu_idx];
+ }
+ }
+
+ return NULL;
+}
+
+/*
+ * Find the PMU of a PCI device.
+ * @pdev: The PCI device.
+ * @ids: The ID table of the available PCI devices with a PMU.
+ * If NULL, search the whole uncore_pci_uncores.
+ */
+static struct intel_uncore_pmu *
+uncore_pci_find_dev_pmu(struct pci_dev *pdev, const struct pci_device_id *ids)
+{
+ struct intel_uncore_pmu *pmu = NULL;
+ struct intel_uncore_type *type;
+ kernel_ulong_t data;
+ unsigned int devfn;
+
+ if (!ids)
+ return uncore_pci_find_dev_pmu_from_types(pdev);
+
+ while (ids && ids->vendor) {
+ if ((ids->vendor == pdev->vendor) &&
+ (ids->device == pdev->device)) {
+ data = ids->driver_data;
+ devfn = PCI_DEVFN(UNCORE_PCI_DEV_DEV(data),
+ UNCORE_PCI_DEV_FUNC(data));
+ if (devfn == pdev->devfn) {
+ type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(data)];
+ pmu = &type->pmus[UNCORE_PCI_DEV_IDX(data)];
+ break;
+ }
+ }
+ ids++;
+ }
+ return pmu;
+}
+
+/*
+ * Register the PMU for a PCI device
+ * @pdev: The PCI device.
+ * @type: The corresponding PMU type of the device.
+ * @pmu: The corresponding PMU of the device.
+ * @die: The die id which the device maps to.
+ */
+static int uncore_pci_pmu_register(struct pci_dev *pdev,
+ struct intel_uncore_type *type,
+ struct intel_uncore_pmu *pmu,
+ int die)
+{
+ struct intel_uncore_box *box;
+ int ret;
+
+ if (WARN_ON_ONCE(pmu->boxes[die] != NULL))
+ return -EINVAL;
+
+ box = uncore_alloc_box(type, NUMA_NO_NODE);
+ if (!box)
+ return -ENOMEM;
+
+ atomic_inc(&box->refcnt);
+ box->dieid = die;
+ box->pci_dev = pdev;
+ box->pmu = pmu;
+ uncore_box_init(box);
+
+ pmu->boxes[die] = box;
+ if (atomic_inc_return(&pmu->activeboxes) > 1)
+ return 0;
+
+ /* First active box registers the pmu */
+ ret = uncore_pmu_register(pmu);
+ if (ret) {
+ pmu->boxes[die] = NULL;
+ uncore_box_exit(box);
+ kfree(box);
+ }
+ return ret;
+}
+
+/*
+ * add a pci uncore device
+ */
+static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+ struct intel_uncore_type *type;
+ struct intel_uncore_pmu *pmu = NULL;
+ int die, ret;
+
+ ret = uncore_pci_get_dev_die_info(pdev, &die);
+ if (ret)
+ return ret;
+
+ if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) {
+ int idx = UNCORE_PCI_DEV_IDX(id->driver_data);
+
+ uncore_extra_pci_dev[die].dev[idx] = pdev;
+ pci_set_drvdata(pdev, NULL);
+ return 0;
+ }
+
+ type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)];
+
+ /*
+ * Some platforms, e.g. Knights Landing, use a common PCI device ID
+ * for multiple instances of an uncore PMU device type. Check the
+ * PCI slot and func to identify the uncore box.
+ */
+ if (id->driver_data & ~0xffff) {
+ struct pci_driver *pci_drv = to_pci_driver(pdev->dev.driver);
+
+ pmu = uncore_pci_find_dev_pmu(pdev, pci_drv->id_table);
+ if (pmu == NULL)
+ return -ENODEV;
+ } else {
+ /*
+ * For a performance monitoring unit with multiple boxes,
+ * each box has a different function id.
+ */
+ pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)];
+ }
+
+ ret = uncore_pci_pmu_register(pdev, type, pmu, die);
+
+ pci_set_drvdata(pdev, pmu->boxes[die]);
+
+ return ret;
+}
+
+/*
+ * Unregister the PMU of a PCI device
+ * @pmu: The corresponding PMU to be unregistered.
+ * @die: The die id which the device maps to.
+ */
+static void uncore_pci_pmu_unregister(struct intel_uncore_pmu *pmu, int die)
+{
+ struct intel_uncore_box *box = pmu->boxes[die];
+
+ pmu->boxes[die] = NULL;
+ if (atomic_dec_return(&pmu->activeboxes) == 0)
+ uncore_pmu_unregister(pmu);
+ uncore_box_exit(box);
+ kfree(box);
+}
+
+static void uncore_pci_remove(struct pci_dev *pdev)
+{
+ struct intel_uncore_box *box;
+ struct intel_uncore_pmu *pmu;
+ int i, die;
+
+ if (uncore_pci_get_dev_die_info(pdev, &die))
+ return;
+
+ box = pci_get_drvdata(pdev);
+ if (!box) {
+ for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) {
+ if (uncore_extra_pci_dev[die].dev[i] == pdev) {
+ uncore_extra_pci_dev[die].dev[i] = NULL;
+ break;
+ }
+ }
+ WARN_ON_ONCE(i >= UNCORE_EXTRA_PCI_DEV_MAX);
+ return;
+ }
+
+ pmu = box->pmu;
+
+ pci_set_drvdata(pdev, NULL);
+
+ uncore_pci_pmu_unregister(pmu, die);
+}
+
+static int uncore_bus_notify(struct notifier_block *nb,
+ unsigned long action, void *data,
+ const struct pci_device_id *ids)
+{
+ struct device *dev = data;
+ struct pci_dev *pdev = to_pci_dev(dev);
+ struct intel_uncore_pmu *pmu;
+ int die;
+
+ /* Unregister the PMU when the device is going to be deleted. */
+ if (action != BUS_NOTIFY_DEL_DEVICE)
+ return NOTIFY_DONE;
+
+ pmu = uncore_pci_find_dev_pmu(pdev, ids);
+ if (!pmu)
+ return NOTIFY_DONE;
+
+ if (uncore_pci_get_dev_die_info(pdev, &die))
+ return NOTIFY_DONE;
+
+ uncore_pci_pmu_unregister(pmu, die);
+
+ return NOTIFY_OK;
+}
+
+static int uncore_pci_sub_bus_notify(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ return uncore_bus_notify(nb, action, data,
+ uncore_pci_sub_driver->id_table);
+}
+
+static struct notifier_block uncore_pci_sub_notifier = {
+ .notifier_call = uncore_pci_sub_bus_notify,
+};
+
+static void uncore_pci_sub_driver_init(void)
+{
+ const struct pci_device_id *ids = uncore_pci_sub_driver->id_table;
+ struct intel_uncore_type *type;
+ struct intel_uncore_pmu *pmu;
+ struct pci_dev *pci_sub_dev;
+ bool notify = false;
+ unsigned int devfn;
+ int die;
+
+ while (ids && ids->vendor) {
+ pci_sub_dev = NULL;
+ type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(ids->driver_data)];
+ /*
+ * Search for the available devices, and register the
+ * corresponding PMUs.
+ */
+ while ((pci_sub_dev = pci_get_device(PCI_VENDOR_ID_INTEL,
+ ids->device, pci_sub_dev))) {
+ devfn = PCI_DEVFN(UNCORE_PCI_DEV_DEV(ids->driver_data),
+ UNCORE_PCI_DEV_FUNC(ids->driver_data));
+ if (devfn != pci_sub_dev->devfn)
+ continue;
+
+ pmu = &type->pmus[UNCORE_PCI_DEV_IDX(ids->driver_data)];
+
+ if (uncore_pci_get_dev_die_info(pci_sub_dev, &die))
+ continue;
+
+ if (!uncore_pci_pmu_register(pci_sub_dev, type, pmu,
+ die))
+ notify = true;
+ }
+ ids++;
+ }
+
+ if (notify && bus_register_notifier(&pci_bus_type, &uncore_pci_sub_notifier))
+ notify = false;
+
+ if (!notify)
+ uncore_pci_sub_driver = NULL;
+}
+
+static int uncore_pci_bus_notify(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ return uncore_bus_notify(nb, action, data, NULL);
+}
+
+static struct notifier_block uncore_pci_notifier = {
+ .notifier_call = uncore_pci_bus_notify,
+};
+
+
+static void uncore_pci_pmus_register(void)
+{
+ struct intel_uncore_type **types = uncore_pci_uncores;
+ struct intel_uncore_discovery_unit *unit;
+ struct intel_uncore_type *type;
+ struct intel_uncore_pmu *pmu;
+ struct rb_node *node;
+ struct pci_dev *pdev;
+
+ for (; *types; types++) {
+ type = *types;
+
+ for (node = rb_first(type->boxes); node; node = rb_next(node)) {
+ unit = rb_entry(node, struct intel_uncore_discovery_unit, node);
+ pdev = pci_get_domain_bus_and_slot(UNCORE_DISCOVERY_PCI_DOMAIN(unit->addr),
+ UNCORE_DISCOVERY_PCI_BUS(unit->addr),
+ UNCORE_DISCOVERY_PCI_DEVFN(unit->addr));
+
+ if (!pdev)
+ continue;
+ pmu = &type->pmus[unit->pmu_idx];
+ uncore_pci_pmu_register(pdev, type, pmu, unit->die);
+ }
+ }
+
+ bus_register_notifier(&pci_bus_type, &uncore_pci_notifier);
+}
+
+static int __init uncore_pci_init(void)
+{
+ size_t size;
+ int ret;
+
+ size = uncore_max_dies() * sizeof(struct pci_extra_dev);
+ uncore_extra_pci_dev = kzalloc(size, GFP_KERNEL);
+ if (!uncore_extra_pci_dev) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ ret = uncore_types_init(uncore_pci_uncores);
+ if (ret)
+ goto errtype;
+
+ if (uncore_pci_driver) {
+ uncore_pci_driver->probe = uncore_pci_probe;
+ uncore_pci_driver->remove = uncore_pci_remove;
+
+ ret = pci_register_driver(uncore_pci_driver);
+ if (ret)
+ goto errtype;
+ } else
+ uncore_pci_pmus_register();
+
+ if (uncore_pci_sub_driver)
+ uncore_pci_sub_driver_init();
+
+ pcidrv_registered = true;
+ return 0;
+
+errtype:
+ uncore_types_exit(uncore_pci_uncores);
+ kfree(uncore_extra_pci_dev);
+ uncore_extra_pci_dev = NULL;
+ uncore_free_pcibus_map();
+err:
+ uncore_pci_uncores = empty_uncore;
+ return ret;
+}
+
+static void uncore_pci_exit(void)
+{
+ if (pcidrv_registered) {
+ pcidrv_registered = false;
+ if (uncore_pci_sub_driver)
+ bus_unregister_notifier(&pci_bus_type, &uncore_pci_sub_notifier);
+ if (uncore_pci_driver)
+ pci_unregister_driver(uncore_pci_driver);
+ else
+ bus_unregister_notifier(&pci_bus_type, &uncore_pci_notifier);
+ uncore_types_exit(uncore_pci_uncores);
+ kfree(uncore_extra_pci_dev);
+ uncore_free_pcibus_map();
+ }
+}
+
+static bool uncore_die_has_box(struct intel_uncore_type *type,
+ int die, unsigned int pmu_idx)
+{
+ if (!type->boxes)
+ return true;
+
+ if (intel_uncore_find_discovery_unit_id(type->boxes, die, pmu_idx) < 0)
+ return false;
+
+ return true;
+}
+
+static void uncore_change_type_ctx(struct intel_uncore_type *type, int old_cpu,
+ int new_cpu)
+{
+ struct intel_uncore_pmu *pmu = type->pmus;
+ struct intel_uncore_box *box;
+ int i, die;
+
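+ /*
+ * old_cpu < 0 means the boxes are getting their first collector CPU;
+ * otherwise events are migrated from old_cpu to new_cpu, or the box
+ * is left without a collector if new_cpu < 0.
+ */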
+ die = topology_logical_die_id(old_cpu < 0 ? new_cpu : old_cpu);
+ for (i = 0; i < type->num_boxes; i++, pmu++) {
+ box = pmu->boxes[die];
+ if (!box)
+ continue;
+
+ if (old_cpu < 0) {
+ WARN_ON_ONCE(box->cpu != -1);
+ if (uncore_die_has_box(type, die, pmu->pmu_idx)) {
+ box->cpu = new_cpu;
+ cpumask_set_cpu(new_cpu, &pmu->cpu_mask);
+ }
+ continue;
+ }
+
+ WARN_ON_ONCE(box->cpu != -1 && box->cpu != old_cpu);
+ box->cpu = -1;
+ cpumask_clear_cpu(old_cpu, &pmu->cpu_mask);
+ if (new_cpu < 0)
+ continue;
+
+ if (!uncore_die_has_box(type, die, pmu->pmu_idx))
+ continue;
+ uncore_pmu_cancel_hrtimer(box);
+ perf_pmu_migrate_context(&pmu->pmu, old_cpu, new_cpu);
+ box->cpu = new_cpu;
+ cpumask_set_cpu(new_cpu, &pmu->cpu_mask);
+ }
+}
+
+static void uncore_change_context(struct intel_uncore_type **uncores,
+ int old_cpu, int new_cpu)
+{
+ for (; *uncores; uncores++)
+ uncore_change_type_ctx(*uncores, old_cpu, new_cpu);
+}
+
+static void uncore_box_unref(struct intel_uncore_type **types, int id)
+{
+ struct intel_uncore_type *type;
+ struct intel_uncore_pmu *pmu;
+ struct intel_uncore_box *box;
+ int i;
+
+ for (; *types; types++) {
+ type = *types;
+ pmu = type->pmus;
+ for (i = 0; i < type->num_boxes; i++, pmu++) {
+ box = pmu->boxes[id];
+ if (box && box->cpu >= 0 && atomic_dec_return(&box->refcnt) == 0)
+ uncore_box_exit(box);
+ }
+ }
+}
+
+static int uncore_event_cpu_offline(unsigned int cpu)
+{
+ int die, target;
+
+ /* Check if exiting cpu is used for collecting uncore events */
+ if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask))
+ goto unref;
+ /* Find a new cpu to collect uncore events */
+ target = cpumask_any_but(topology_die_cpumask(cpu), cpu);
+
+ /* Migrate uncore events to the new target */
+ if (target < nr_cpu_ids)
+ cpumask_set_cpu(target, &uncore_cpu_mask);
+ else
+ target = -1;
+
+ uncore_change_context(uncore_msr_uncores, cpu, target);
+ uncore_change_context(uncore_mmio_uncores, cpu, target);
+ uncore_change_context(uncore_pci_uncores, cpu, target);
+
+unref:
+ /* Clear the references */
+ die = topology_logical_die_id(cpu);
+ uncore_box_unref(uncore_msr_uncores, die);
+ uncore_box_unref(uncore_mmio_uncores, die);
+ return 0;
+}
+
+static int allocate_boxes(struct intel_uncore_type **types,
+ unsigned int die, unsigned int cpu)
+{
+ struct intel_uncore_box *box, *tmp;
+ struct intel_uncore_type *type;
+ struct intel_uncore_pmu *pmu;
+ LIST_HEAD(allocated);
+ int i;
+
+ /* Try to allocate all required boxes */
+ for (; *types; types++) {
+ type = *types;
+ pmu = type->pmus;
+ for (i = 0; i < type->num_boxes; i++, pmu++) {
+ if (pmu->boxes[die])
+ continue;
+ box = uncore_alloc_box(type, cpu_to_node(cpu));
+ if (!box)
+ goto cleanup;
+ box->pmu = pmu;
+ box->dieid = die;
+ list_add(&box->active_list, &allocated);
+ }
+ }
+ /* Install them in the pmus */
+ list_for_each_entry_safe(box, tmp, &allocated, active_list) {
+ list_del_init(&box->active_list);
+ box->pmu->boxes[die] = box;
+ }
+ return 0;
+
+cleanup:
+ list_for_each_entry_safe(box, tmp, &allocated, active_list) {
+ list_del_init(&box->active_list);
+ kfree(box);
+ }
+ return -ENOMEM;
+}
+
+static int uncore_box_ref(struct intel_uncore_type **types,
+ int id, unsigned int cpu)
+{
+ struct intel_uncore_type *type;
+ struct intel_uncore_pmu *pmu;
+ struct intel_uncore_box *box;
+ int i, ret;
+
+ ret = allocate_boxes(types, id, cpu);
+ if (ret)
+ return ret;
+
+ for (; *types; types++) {
+ type = *types;
+ pmu = type->pmus;
+ for (i = 0; i < type->num_boxes; i++, pmu++) {
+ box = pmu->boxes[id];
+ if (box && box->cpu >= 0 && atomic_inc_return(&box->refcnt) == 1)
+ uncore_box_init(box);
+ }
+ }
+ return 0;
+}
+
+static int uncore_event_cpu_online(unsigned int cpu)
+{
+ int die, target, msr_ret, mmio_ret;
+
+ die = topology_logical_die_id(cpu);
+ msr_ret = uncore_box_ref(uncore_msr_uncores, die, cpu);
+ mmio_ret = uncore_box_ref(uncore_mmio_uncores, die, cpu);
+ if (msr_ret && mmio_ret)
+ return -ENOMEM;
+
+ /*
+ * Check if there is an online cpu in the package
+ * which collects uncore events already.
+ */
+ target = cpumask_any_and(&uncore_cpu_mask, topology_die_cpumask(cpu));
+ if (target < nr_cpu_ids)
+ return 0;
+
+ cpumask_set_cpu(cpu, &uncore_cpu_mask);
+
+ if (!msr_ret)
+ uncore_change_context(uncore_msr_uncores, -1, cpu);
+ if (!mmio_ret)
+ uncore_change_context(uncore_mmio_uncores, -1, cpu);
+ uncore_change_context(uncore_pci_uncores, -1, cpu);
+ return 0;
+}
+
+static int __init type_pmu_register(struct intel_uncore_type *type)
+{
+ int i, ret;
+
+ for (i = 0; i < type->num_boxes; i++) {
+ ret = uncore_pmu_register(&type->pmus[i]);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+static int __init uncore_msr_pmus_register(void)
+{
+ struct intel_uncore_type **types = uncore_msr_uncores;
+ int ret;
+
+ for (; *types; types++) {
+ ret = type_pmu_register(*types);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+static int __init uncore_cpu_init(void)
+{
+ int ret;
+
+ ret = uncore_types_init(uncore_msr_uncores);
+ if (ret)
+ goto err;
+
+ ret = uncore_msr_pmus_register();
+ if (ret)
+ goto err;
+ return 0;
+err:
+ uncore_types_exit(uncore_msr_uncores);
+ uncore_msr_uncores = empty_uncore;
+ return ret;
+}
+
+static int __init uncore_mmio_init(void)
+{
+ struct intel_uncore_type **types = uncore_mmio_uncores;
+ int ret;
+
+ ret = uncore_types_init(types);
+ if (ret)
+ goto err;
+
+ for (; *types; types++) {
+ ret = type_pmu_register(*types);
+ if (ret)
+ goto err;
+ }
+ return 0;
+err:
+ uncore_types_exit(uncore_mmio_uncores);
+ uncore_mmio_uncores = empty_uncore;
+ return ret;
+}
+
+struct intel_uncore_init_fun {
+ void (*cpu_init)(void);
+ int (*pci_init)(void);
+ void (*mmio_init)(void);
+ /* Discovery table is required */
+ bool use_discovery;
+ /* The units in the discovery table should be ignored. */
+ int *uncore_units_ignore;
+};
+
+static const struct intel_uncore_init_fun nhm_uncore_init __initconst = {
+ .cpu_init = nhm_uncore_cpu_init,
+};
+
+static const struct intel_uncore_init_fun snb_uncore_init __initconst = {
+ .cpu_init = snb_uncore_cpu_init,
+ .pci_init = snb_uncore_pci_init,
+};
+
+static const struct intel_uncore_init_fun ivb_uncore_init __initconst = {
+ .cpu_init = snb_uncore_cpu_init,
+ .pci_init = ivb_uncore_pci_init,
+};
+
+static const struct intel_uncore_init_fun hsw_uncore_init __initconst = {
+ .cpu_init = snb_uncore_cpu_init,
+ .pci_init = hsw_uncore_pci_init,
+};
+
+static const struct intel_uncore_init_fun bdw_uncore_init __initconst = {
+ .cpu_init = snb_uncore_cpu_init,
+ .pci_init = bdw_uncore_pci_init,
+};
+
+static const struct intel_uncore_init_fun snbep_uncore_init __initconst = {
+ .cpu_init = snbep_uncore_cpu_init,
+ .pci_init = snbep_uncore_pci_init,
+};
+
+static const struct intel_uncore_init_fun nhmex_uncore_init __initconst = {
+ .cpu_init = nhmex_uncore_cpu_init,
+};
+
+static const struct intel_uncore_init_fun ivbep_uncore_init __initconst = {
+ .cpu_init = ivbep_uncore_cpu_init,
+ .pci_init = ivbep_uncore_pci_init,
+};
+
+static const struct intel_uncore_init_fun hswep_uncore_init __initconst = {
+ .cpu_init = hswep_uncore_cpu_init,
+ .pci_init = hswep_uncore_pci_init,
+};
+
+static const struct intel_uncore_init_fun bdx_uncore_init __initconst = {
+ .cpu_init = bdx_uncore_cpu_init,
+ .pci_init = bdx_uncore_pci_init,
+};
+
+static const struct intel_uncore_init_fun knl_uncore_init __initconst = {
+ .cpu_init = knl_uncore_cpu_init,
+ .pci_init = knl_uncore_pci_init,
+};
+
+static const struct intel_uncore_init_fun skl_uncore_init __initconst = {
+ .cpu_init = skl_uncore_cpu_init,
+ .pci_init = skl_uncore_pci_init,
+};
+
+static const struct intel_uncore_init_fun skx_uncore_init __initconst = {
+ .cpu_init = skx_uncore_cpu_init,
+ .pci_init = skx_uncore_pci_init,
+};
+
+static const struct intel_uncore_init_fun icl_uncore_init __initconst = {
+ .cpu_init = icl_uncore_cpu_init,
+ .pci_init = skl_uncore_pci_init,
+};
+
+static const struct intel_uncore_init_fun tgl_uncore_init __initconst = {
+ .cpu_init = tgl_uncore_cpu_init,
+ .mmio_init = tgl_uncore_mmio_init,
+};
+
+static const struct intel_uncore_init_fun tgl_l_uncore_init __initconst = {
+ .cpu_init = tgl_uncore_cpu_init,
+ .mmio_init = tgl_l_uncore_mmio_init,
+};
+
+static const struct intel_uncore_init_fun rkl_uncore_init __initconst = {
+ .cpu_init = tgl_uncore_cpu_init,
+ .pci_init = skl_uncore_pci_init,
+};
+
+static const struct intel_uncore_init_fun adl_uncore_init __initconst = {
+ .cpu_init = adl_uncore_cpu_init,
+ .mmio_init = adl_uncore_mmio_init,
+};
+
+static const struct intel_uncore_init_fun mtl_uncore_init __initconst = {
+ .cpu_init = mtl_uncore_cpu_init,
+ .mmio_init = adl_uncore_mmio_init,
+};
+
+static const struct intel_uncore_init_fun lnl_uncore_init __initconst = {
+ .cpu_init = lnl_uncore_cpu_init,
+ .mmio_init = lnl_uncore_mmio_init,
+};
+
+static const struct intel_uncore_init_fun ptl_uncore_init __initconst = {
+ .cpu_init = ptl_uncore_cpu_init,
+ .mmio_init = ptl_uncore_mmio_init,
+ .use_discovery = true,
+};
+
+static const struct intel_uncore_init_fun icx_uncore_init __initconst = {
+ .cpu_init = icx_uncore_cpu_init,
+ .pci_init = icx_uncore_pci_init,
+ .mmio_init = icx_uncore_mmio_init,
+};
+
+static const struct intel_uncore_init_fun snr_uncore_init __initconst = {
+ .cpu_init = snr_uncore_cpu_init,
+ .pci_init = snr_uncore_pci_init,
+ .mmio_init = snr_uncore_mmio_init,
+};
+
+static const struct intel_uncore_init_fun spr_uncore_init __initconst = {
+ .cpu_init = spr_uncore_cpu_init,
+ .pci_init = spr_uncore_pci_init,
+ .mmio_init = spr_uncore_mmio_init,
+ .use_discovery = true,
+ .uncore_units_ignore = spr_uncore_units_ignore,
+};
+
+static const struct intel_uncore_init_fun gnr_uncore_init __initconst = {
+ .cpu_init = gnr_uncore_cpu_init,
+ .pci_init = gnr_uncore_pci_init,
+ .mmio_init = gnr_uncore_mmio_init,
+ .use_discovery = true,
+ .uncore_units_ignore = gnr_uncore_units_ignore,
+};
+
+static const struct intel_uncore_init_fun generic_uncore_init __initconst = {
+ .cpu_init = intel_uncore_generic_uncore_cpu_init,
+ .pci_init = intel_uncore_generic_uncore_pci_init,
+ .mmio_init = intel_uncore_generic_uncore_mmio_init,
+};
+
+static const struct x86_cpu_id intel_uncore_match[] __initconst = {
+ X86_MATCH_VFM(INTEL_NEHALEM_EP, &nhm_uncore_init),
+ X86_MATCH_VFM(INTEL_NEHALEM, &nhm_uncore_init),
+ X86_MATCH_VFM(INTEL_WESTMERE, &nhm_uncore_init),
+ X86_MATCH_VFM(INTEL_WESTMERE_EP, &nhm_uncore_init),
+ X86_MATCH_VFM(INTEL_SANDYBRIDGE, &snb_uncore_init),
+ X86_MATCH_VFM(INTEL_IVYBRIDGE, &ivb_uncore_init),
+ X86_MATCH_VFM(INTEL_HASWELL, &hsw_uncore_init),
+ X86_MATCH_VFM(INTEL_HASWELL_L, &hsw_uncore_init),
+ X86_MATCH_VFM(INTEL_HASWELL_G, &hsw_uncore_init),
+ X86_MATCH_VFM(INTEL_BROADWELL, &bdw_uncore_init),
+ X86_MATCH_VFM(INTEL_BROADWELL_G, &bdw_uncore_init),
+ X86_MATCH_VFM(INTEL_SANDYBRIDGE_X, &snbep_uncore_init),
+ X86_MATCH_VFM(INTEL_NEHALEM_EX, &nhmex_uncore_init),
+ X86_MATCH_VFM(INTEL_WESTMERE_EX, &nhmex_uncore_init),
+ X86_MATCH_VFM(INTEL_IVYBRIDGE_X, &ivbep_uncore_init),
+ X86_MATCH_VFM(INTEL_HASWELL_X, &hswep_uncore_init),
+ X86_MATCH_VFM(INTEL_BROADWELL_X, &bdx_uncore_init),
+ X86_MATCH_VFM(INTEL_BROADWELL_D, &bdx_uncore_init),
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNL, &knl_uncore_init),
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNM, &knl_uncore_init),
+ X86_MATCH_VFM(INTEL_SKYLAKE, &skl_uncore_init),
+ X86_MATCH_VFM(INTEL_SKYLAKE_L, &skl_uncore_init),
+ X86_MATCH_VFM(INTEL_SKYLAKE_X, &skx_uncore_init),
+ X86_MATCH_VFM(INTEL_KABYLAKE_L, &skl_uncore_init),
+ X86_MATCH_VFM(INTEL_KABYLAKE, &skl_uncore_init),
+ X86_MATCH_VFM(INTEL_COMETLAKE_L, &skl_uncore_init),
+ X86_MATCH_VFM(INTEL_COMETLAKE, &skl_uncore_init),
+ X86_MATCH_VFM(INTEL_ICELAKE_L, &icl_uncore_init),
+ X86_MATCH_VFM(INTEL_ICELAKE_NNPI, &icl_uncore_init),
+ X86_MATCH_VFM(INTEL_ICELAKE, &icl_uncore_init),
+ X86_MATCH_VFM(INTEL_ICELAKE_D, &icx_uncore_init),
+ X86_MATCH_VFM(INTEL_ICELAKE_X, &icx_uncore_init),
+ X86_MATCH_VFM(INTEL_TIGERLAKE_L, &tgl_l_uncore_init),
+ X86_MATCH_VFM(INTEL_TIGERLAKE, &tgl_uncore_init),
+ X86_MATCH_VFM(INTEL_ROCKETLAKE, &rkl_uncore_init),
+ X86_MATCH_VFM(INTEL_ALDERLAKE, &adl_uncore_init),
+ X86_MATCH_VFM(INTEL_ALDERLAKE_L, &adl_uncore_init),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE, &adl_uncore_init),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_P, &adl_uncore_init),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_S, &adl_uncore_init),
+ X86_MATCH_VFM(INTEL_METEORLAKE, &mtl_uncore_init),
+ X86_MATCH_VFM(INTEL_METEORLAKE_L, &mtl_uncore_init),
+ X86_MATCH_VFM(INTEL_ARROWLAKE, &mtl_uncore_init),
+ X86_MATCH_VFM(INTEL_ARROWLAKE_U, &mtl_uncore_init),
+ X86_MATCH_VFM(INTEL_ARROWLAKE_H, &mtl_uncore_init),
+ X86_MATCH_VFM(INTEL_LUNARLAKE_M, &lnl_uncore_init),
+ X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &ptl_uncore_init),
+ X86_MATCH_VFM(INTEL_WILDCATLAKE_L, &ptl_uncore_init),
+ X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &spr_uncore_init),
+ X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &spr_uncore_init),
+ X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, &gnr_uncore_init),
+ X86_MATCH_VFM(INTEL_GRANITERAPIDS_D, &gnr_uncore_init),
+ X86_MATCH_VFM(INTEL_ATOM_TREMONT_D, &snr_uncore_init),
+ X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, &adl_uncore_init),
+ X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, &gnr_uncore_init),
+ X86_MATCH_VFM(INTEL_ATOM_CRESTMONT, &gnr_uncore_init),
+ X86_MATCH_VFM(INTEL_ATOM_DARKMONT_X, &gnr_uncore_init),
+ {},
+};
+MODULE_DEVICE_TABLE(x86cpu, intel_uncore_match);
+
+static int __init intel_uncore_init(void)
+{
+ const struct x86_cpu_id *id;
+ struct intel_uncore_init_fun *uncore_init;
+ int pret = 0, cret = 0, mret = 0, ret;
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return -ENODEV;
+
+ __uncore_max_dies =
+ topology_max_packages() * topology_max_dies_per_package();
+
+ id = x86_match_cpu(intel_uncore_match);
+ if (!id) {
+ if (!uncore_no_discover && intel_uncore_has_discovery_tables(NULL))
+ uncore_init = (struct intel_uncore_init_fun *)&generic_uncore_init;
+ else
+ return -ENODEV;
+ } else {
+ uncore_init = (struct intel_uncore_init_fun *)id->driver_data;
+ if (uncore_no_discover && uncore_init->use_discovery)
+ return -ENODEV;
+ if (uncore_init->use_discovery &&
+ !intel_uncore_has_discovery_tables(uncore_init->uncore_units_ignore))
+ return -ENODEV;
+ }
+
+ if (uncore_init->pci_init) {
+ pret = uncore_init->pci_init();
+ if (!pret)
+ pret = uncore_pci_init();
+ }
+
+ if (uncore_init->cpu_init) {
+ uncore_init->cpu_init();
+ cret = uncore_cpu_init();
+ }
+
+ if (uncore_init->mmio_init) {
+ uncore_init->mmio_init();
+ mret = uncore_mmio_init();
+ }
+
+ if (cret && pret && mret) {
+ ret = -ENODEV;
+ goto free_discovery;
+ }
+
+ /* Install hotplug callbacks to setup the targets for each package */
+ ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_UNCORE_ONLINE,
+ "perf/x86/intel/uncore:online",
+ uncore_event_cpu_online,
+ uncore_event_cpu_offline);
+ if (ret)
+ goto err;
+ return 0;
+
+err:
+ uncore_types_exit(uncore_msr_uncores);
+ uncore_types_exit(uncore_mmio_uncores);
+ uncore_pci_exit();
+free_discovery:
+ intel_uncore_clear_discovery_tables();
+ return ret;
+}
+module_init(intel_uncore_init);
+
+static void __exit intel_uncore_exit(void)
+{
+ cpuhp_remove_state(CPUHP_AP_PERF_X86_UNCORE_ONLINE);
+ uncore_types_exit(uncore_msr_uncores);
+ uncore_types_exit(uncore_mmio_uncores);
+ uncore_pci_exit();
+ intel_uncore_clear_discovery_tables();
+}
+module_exit(intel_uncore_exit);
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
new file mode 100644
index 000000000000..d8815fff7588
--- /dev/null
+++ b/arch/x86/events/intel/uncore.h
@@ -0,0 +1,650 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <asm/apicdef.h>
+#include <asm/intel-family.h>
+#include <linux/io-64-nonatomic-lo-hi.h>
+
+#include <linux/perf_event.h>
+#include "../perf_event.h"
+
+#define UNCORE_PMU_NAME_LEN 32
+#define UNCORE_PMU_HRTIMER_INTERVAL (60LL * NSEC_PER_SEC)
+#define UNCORE_SNB_IMC_HRTIMER_INTERVAL (5ULL * NSEC_PER_SEC)
+
+#define UNCORE_FIXED_EVENT 0xff
+#define UNCORE_PMC_IDX_MAX_GENERIC 8
+#define UNCORE_PMC_IDX_MAX_FIXED 1
+#define UNCORE_PMC_IDX_MAX_FREERUNNING 1
+#define UNCORE_PMC_IDX_FIXED UNCORE_PMC_IDX_MAX_GENERIC
+#define UNCORE_PMC_IDX_FREERUNNING (UNCORE_PMC_IDX_FIXED + \
+ UNCORE_PMC_IDX_MAX_FIXED)
+#define UNCORE_PMC_IDX_MAX (UNCORE_PMC_IDX_FREERUNNING + \
+ UNCORE_PMC_IDX_MAX_FREERUNNING)
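+
+/*
+ * With the sizes above, the counter index space has a small fixed layout,
+ * spelled out here purely as a worked example of the arithmetic:
+ *
+ *   generic counters:      idx 0..7  (UNCORE_PMC_IDX_MAX_GENERIC == 8)
+ *   fixed counter:         idx 8     (UNCORE_PMC_IDX_FIXED)
+ *   free running counter:  idx 9     (UNCORE_PMC_IDX_FREERUNNING)
+ *   UNCORE_PMC_IDX_MAX:    10
+ */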
+
+#define UNCORE_PCI_DEV_FULL_DATA(dev, func, type, idx) \
+ ((dev << 24) | (func << 16) | (type << 8) | idx)
+#define UNCORE_PCI_DEV_DATA(type, idx) ((type << 8) | idx)
+#define UNCORE_PCI_DEV_DEV(data) ((data >> 24) & 0xff)
+#define UNCORE_PCI_DEV_FUNC(data) ((data >> 16) & 0xff)
+#define UNCORE_PCI_DEV_TYPE(data) ((data >> 8) & 0xff)
+#define UNCORE_PCI_DEV_IDX(data) (data & 0xff)
+#define UNCORE_EXTRA_PCI_DEV 0xff
+#define UNCORE_EXTRA_PCI_DEV_MAX 4
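+
+/*
+ * Quick decode example for the packing macros above; the device, function,
+ * type and index values are arbitrary and only illustrate the byte layout:
+ *
+ *   UNCORE_PCI_DEV_FULL_DATA(0x1f, 0x2, 0x3, 0x0) == 0x1f020300
+ *   UNCORE_PCI_DEV_DEV(0x1f020300)  == 0x1f
+ *   UNCORE_PCI_DEV_FUNC(0x1f020300) == 0x02
+ *   UNCORE_PCI_DEV_TYPE(0x1f020300) == 0x03
+ *   UNCORE_PCI_DEV_IDX(0x1f020300)  == 0x00
+ */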
+
+#define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff)
+
+#define UNCORE_IGNORE_END -1
+
+struct pci_extra_dev {
+ struct pci_dev *dev[UNCORE_EXTRA_PCI_DEV_MAX];
+};
+
+struct intel_uncore_ops;
+struct intel_uncore_pmu;
+struct intel_uncore_box;
+struct uncore_event_desc;
+struct freerunning_counters;
+struct intel_uncore_topology;
+
+struct intel_uncore_type {
+ const char *name;
+ int num_counters;
+ int num_boxes;
+ int perf_ctr_bits;
+ int fixed_ctr_bits;
+ int num_freerunning_types;
+ int type_id;
+ unsigned perf_ctr;
+ unsigned event_ctl;
+ unsigned event_mask;
+ unsigned event_mask_ext;
+ unsigned fixed_ctr;
+ unsigned fixed_ctl;
+ unsigned box_ctl;
+ union {
+ unsigned msr_offset;
+ unsigned mmio_offset;
+ };
+ unsigned mmio_map_size;
+ unsigned num_shared_regs:8;
+ unsigned single_fixed:1;
+ unsigned pair_ctr_ctl:1;
+ union {
+ u64 *msr_offsets;
+ u64 *pci_offsets;
+ u64 *mmio_offsets;
+ };
+ struct event_constraint unconstrainted;
+ struct event_constraint *constraints;
+ struct intel_uncore_pmu *pmus;
+ struct intel_uncore_ops *ops;
+ struct uncore_event_desc *event_descs;
+ struct freerunning_counters *freerunning;
+ const struct attribute_group *attr_groups[4];
+ const struct attribute_group **attr_update;
+ struct pmu *pmu; /* for custom pmu ops */
+ struct rb_root *boxes;
+ /*
+	 * The uncore PMU stores the relevant platform topology configuration
+	 * here, to identify which platform component each PMON block of this
+	 * type is supposed to monitor.
+ */
+ struct intel_uncore_topology **topology;
+ /*
+ * Optional callbacks for managing mapping of Uncore units to PMONs
+ */
+ int (*get_topology)(struct intel_uncore_type *type);
+ void (*set_mapping)(struct intel_uncore_type *type);
+ void (*cleanup_mapping)(struct intel_uncore_type *type);
+ /*
+ * Optional callbacks for extra uncore units cleanup
+ */
+ void (*cleanup_extra_boxes)(struct intel_uncore_type *type);
+};
+
+#define pmu_group attr_groups[0]
+#define format_group attr_groups[1]
+#define events_group attr_groups[2]
+
+struct intel_uncore_ops {
+ void (*init_box)(struct intel_uncore_box *);
+ void (*exit_box)(struct intel_uncore_box *);
+ void (*disable_box)(struct intel_uncore_box *);
+ void (*enable_box)(struct intel_uncore_box *);
+ void (*disable_event)(struct intel_uncore_box *, struct perf_event *);
+ void (*enable_event)(struct intel_uncore_box *, struct perf_event *);
+ u64 (*read_counter)(struct intel_uncore_box *, struct perf_event *);
+ int (*hw_config)(struct intel_uncore_box *, struct perf_event *);
+ struct event_constraint *(*get_constraint)(struct intel_uncore_box *,
+ struct perf_event *);
+ void (*put_constraint)(struct intel_uncore_box *, struct perf_event *);
+};
+
+struct intel_uncore_pmu {
+ struct pmu pmu;
+ char name[UNCORE_PMU_NAME_LEN];
+ int pmu_idx;
+ bool registered;
+ atomic_t activeboxes;
+ cpumask_t cpu_mask;
+ struct intel_uncore_type *type;
+ struct intel_uncore_box **boxes;
+};
+
+struct intel_uncore_extra_reg {
+ raw_spinlock_t lock;
+ u64 config, config1, config2;
+ atomic_t ref;
+};
+
+struct intel_uncore_box {
+ int dieid; /* Logical die ID */
+ int n_active; /* number of active events */
+ int n_events;
+ int cpu; /* cpu to collect events */
+ unsigned long flags;
+ atomic_t refcnt;
+ struct perf_event *events[UNCORE_PMC_IDX_MAX];
+ struct perf_event *event_list[UNCORE_PMC_IDX_MAX];
+ struct event_constraint *event_constraint[UNCORE_PMC_IDX_MAX];
+ unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
+ u64 tags[UNCORE_PMC_IDX_MAX];
+ struct pci_dev *pci_dev;
+ struct intel_uncore_pmu *pmu;
+ u64 hrtimer_duration; /* hrtimer timeout for this box */
+ struct hrtimer hrtimer;
+ struct list_head list;
+ struct list_head active_list;
+ void __iomem *io_addr;
+ struct intel_uncore_extra_reg shared_regs[];
+};
+
+/* CFL uncore 8th cbox MSRs */
+#define CFL_UNC_CBO_7_PERFEVTSEL0 0xf70
+#define CFL_UNC_CBO_7_PER_CTR0 0xf76
+
+#define UNCORE_BOX_FLAG_INITIATED 0
+/* event config registers are 8-byte apart */
+#define UNCORE_BOX_FLAG_CTL_OFFS8 1
+/* CFL 8th CBOX has different MSR space */
+#define UNCORE_BOX_FLAG_CFL8_CBOX_MSR_OFFS 2
+
+struct uncore_event_desc {
+ struct device_attribute attr;
+ const char *config;
+};
+
+struct freerunning_counters {
+ unsigned int counter_base;
+ unsigned int counter_offset;
+ unsigned int box_offset;
+ unsigned int num_counters;
+ unsigned int bits;
+ unsigned *box_offsets;
+};
+
+struct uncore_iio_topology {
+ int pci_bus_no;
+ int segment;
+};
+
+struct uncore_upi_topology {
+ int die_to;
+ int pmu_idx_to;
+ int enabled;
+};
+
+struct intel_uncore_topology {
+ int pmu_idx;
+ union {
+ void *untyped;
+ struct uncore_iio_topology *iio;
+ struct uncore_upi_topology *upi;
+ };
+};
+
+struct pci2phy_map {
+ struct list_head list;
+ int segment;
+ int pbus_to_dieid[256];
+};
+
+struct pci2phy_map *__find_pci2phy_map(int segment);
+int uncore_pcibus_to_dieid(struct pci_bus *bus);
+int uncore_die_to_segment(int die);
+int uncore_device_to_die(struct pci_dev *dev);
+
+ssize_t uncore_event_show(struct device *dev,
+ struct device_attribute *attr, char *buf);
+
+static inline struct intel_uncore_pmu *dev_to_uncore_pmu(struct device *dev)
+{
+ return container_of(dev_get_drvdata(dev), struct intel_uncore_pmu, pmu);
+}
+
+#define to_device_attribute(n) container_of(n, struct device_attribute, attr)
+#define to_dev_ext_attribute(n) container_of(n, struct dev_ext_attribute, attr)
+#define attr_to_ext_attr(n) to_dev_ext_attribute(to_device_attribute(n))
+
+extern int __uncore_max_dies;
+#define uncore_max_dies() (__uncore_max_dies)
+
+#define INTEL_UNCORE_EVENT_DESC(_name, _config) \
+{ \
+ .attr = __ATTR(_name, 0444, uncore_event_show, NULL), \
+ .config = _config, \
+}
+
+#define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format) \
+static ssize_t __uncore_##_var##_show(struct device *dev, \
+ struct device_attribute *attr, \
+ char *page) \
+{ \
+ BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \
+ return sprintf(page, _format "\n"); \
+} \
+static struct device_attribute format_attr_##_var = \
+ __ATTR(_name, 0444, __uncore_##_var##_show, NULL)
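+
+/*
+ * For reference, DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7")
+ * expands to a read-only sysfs attribute named "event" whose show()
+ * callback prints "config:0-7\n"; each uncore driver instantiates the
+ * macro for the fields it exposes (see the uses in uncore_discovery.c
+ * and uncore_nhmex.c, for example).
+ */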
+
+static inline bool uncore_pmc_fixed(int idx)
+{
+ return idx == UNCORE_PMC_IDX_FIXED;
+}
+
+static inline bool uncore_pmc_freerunning(int idx)
+{
+ return idx == UNCORE_PMC_IDX_FREERUNNING;
+}
+
+static inline bool uncore_mmio_is_valid_offset(struct intel_uncore_box *box,
+ unsigned long offset)
+{
+ if (offset < box->pmu->type->mmio_map_size)
+ return true;
+
+ pr_warn_once("perf uncore: Invalid offset 0x%lx exceeds mapped area of %s.\n",
+ offset, box->pmu->type->name);
+
+ return false;
+}
+
+static inline
+unsigned int uncore_mmio_box_ctl(struct intel_uncore_box *box)
+{
+ return box->pmu->type->box_ctl +
+ box->pmu->type->mmio_offset * box->pmu->pmu_idx;
+}
+
+static inline unsigned uncore_pci_box_ctl(struct intel_uncore_box *box)
+{
+ return box->pmu->type->box_ctl;
+}
+
+static inline unsigned uncore_pci_fixed_ctl(struct intel_uncore_box *box)
+{
+ return box->pmu->type->fixed_ctl;
+}
+
+static inline unsigned uncore_pci_fixed_ctr(struct intel_uncore_box *box)
+{
+ return box->pmu->type->fixed_ctr;
+}
+
+static inline
+unsigned uncore_pci_event_ctl(struct intel_uncore_box *box, int idx)
+{
+ if (test_bit(UNCORE_BOX_FLAG_CTL_OFFS8, &box->flags))
+ return idx * 8 + box->pmu->type->event_ctl;
+
+ return idx * 4 + box->pmu->type->event_ctl;
+}
+
+static inline
+unsigned uncore_pci_perf_ctr(struct intel_uncore_box *box, int idx)
+{
+ return idx * 8 + box->pmu->type->perf_ctr;
+}
+
+static inline unsigned uncore_msr_box_offset(struct intel_uncore_box *box)
+{
+ struct intel_uncore_pmu *pmu = box->pmu;
+ return pmu->type->msr_offsets ?
+ pmu->type->msr_offsets[pmu->pmu_idx] :
+ pmu->type->msr_offset * pmu->pmu_idx;
+}
+
+static inline unsigned uncore_msr_box_ctl(struct intel_uncore_box *box)
+{
+ if (!box->pmu->type->box_ctl)
+ return 0;
+ return box->pmu->type->box_ctl + uncore_msr_box_offset(box);
+}
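+
+/*
+ * A concrete example, with values taken from uncore_nhmex.c: the NHM-EX
+ * cbox type uses msr_offsets[] = { 0x0, 0x80, 0x40, ... } and
+ * box_ctl = 0xd00, so uncore_msr_box_ctl() for the third cbox PMU
+ * (pmu_idx == 2) resolves to 0xd00 + 0x40 == 0xd40.
+ */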
+
+static inline unsigned uncore_msr_fixed_ctl(struct intel_uncore_box *box)
+{
+ if (!box->pmu->type->fixed_ctl)
+ return 0;
+ return box->pmu->type->fixed_ctl + uncore_msr_box_offset(box);
+}
+
+static inline unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box)
+{
+ return box->pmu->type->fixed_ctr + uncore_msr_box_offset(box);
+}
+
+
+/*
+ * In the uncore document, there is no event-code assigned to free running
+ * counters. Some events need to be defined to indicate the free running
+ * counters. The events are encoded as event-code + umask-code.
+ *
+ * The event-code for all free running counters is 0xff, which is the same as
+ * the fixed counters.
+ *
+ * The umask-code is used to distinguish a fixed counter from a free running
+ * counter, and to distinguish between the different types of free running
+ * counters.
+ * - For fixed counters, the umask-code is 0x0X.
+ * X indicates the index of the fixed counter, which starts from 0.
+ * - For free running counters, the umask-code uses the rest of the space.
+ *   It has the format 0xXY.
+ * X stands for the type of free running counters, which starts from 1.
+ * Y stands for the index of free running counters of same type, which
+ * starts from 0.
+ *
+ * For example, there are three types of IIO free running counters on Skylake
+ * server, IO CLOCKS counters, BANDWIDTH counters and UTILIZATION counters.
+ * The event-code for all the free running counters is 0xff.
+ * 'ioclk' is the first counter of IO CLOCKS. IO CLOCKS is the first type,
+ * which umask-code starts from 0x10.
+ * So 'ioclk' is encoded as event=0xff,umask=0x10
+ * 'bw_in_port2' is the third counter of BANDWIDTH counters. BANDWIDTH is
+ * the second type, which umask-code starts from 0x20.
+ * So 'bw_in_port2' is encoded as event=0xff,umask=0x22
+ */
+static inline unsigned int uncore_freerunning_idx(u64 config)
+{
+ return ((config >> 8) & 0xf);
+}
+
+#define UNCORE_FREERUNNING_UMASK_START 0x10
+
+static inline unsigned int uncore_freerunning_type(u64 config)
+{
+ return ((((config >> 8) - UNCORE_FREERUNNING_UMASK_START) >> 4) & 0xf);
+}
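+
+/*
+ * Worked example of the decode above, using the 'bw_in_port2' encoding
+ * from the comment (event=0xff, umask=0x22, i.e. config == 0x22ff):
+ *
+ *   uncore_freerunning_idx(0x22ff)  == 0x22 & 0xf                 == 2
+ *   uncore_freerunning_type(0x22ff) == ((0x22 - 0x10) >> 4) & 0xf == 1
+ *
+ * i.e. the third counter of the second free running type (BANDWIDTH).
+ */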
+
+static inline
+unsigned int uncore_freerunning_counter(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ unsigned int type = uncore_freerunning_type(event->hw.config);
+ unsigned int idx = uncore_freerunning_idx(event->hw.config);
+ struct intel_uncore_pmu *pmu = box->pmu;
+
+ return pmu->type->freerunning[type].counter_base +
+ pmu->type->freerunning[type].counter_offset * idx +
+ (pmu->type->freerunning[type].box_offsets ?
+ pmu->type->freerunning[type].box_offsets[pmu->pmu_idx] :
+ pmu->type->freerunning[type].box_offset * pmu->pmu_idx);
+}
+
+static inline
+unsigned uncore_msr_event_ctl(struct intel_uncore_box *box, int idx)
+{
+ if (test_bit(UNCORE_BOX_FLAG_CFL8_CBOX_MSR_OFFS, &box->flags)) {
+ return CFL_UNC_CBO_7_PERFEVTSEL0 +
+ (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx);
+ } else {
+ return box->pmu->type->event_ctl +
+ (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) +
+ uncore_msr_box_offset(box);
+ }
+}
+
+static inline
+unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx)
+{
+ if (test_bit(UNCORE_BOX_FLAG_CFL8_CBOX_MSR_OFFS, &box->flags)) {
+ return CFL_UNC_CBO_7_PER_CTR0 +
+ (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx);
+ } else {
+ return box->pmu->type->perf_ctr +
+ (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) +
+ uncore_msr_box_offset(box);
+ }
+}
+
+static inline
+unsigned uncore_fixed_ctl(struct intel_uncore_box *box)
+{
+ if (box->pci_dev || box->io_addr)
+ return uncore_pci_fixed_ctl(box);
+ else
+ return uncore_msr_fixed_ctl(box);
+}
+
+static inline
+unsigned uncore_fixed_ctr(struct intel_uncore_box *box)
+{
+ if (box->pci_dev || box->io_addr)
+ return uncore_pci_fixed_ctr(box);
+ else
+ return uncore_msr_fixed_ctr(box);
+}
+
+static inline
+unsigned uncore_event_ctl(struct intel_uncore_box *box, int idx)
+{
+ if (box->pci_dev || box->io_addr)
+ return uncore_pci_event_ctl(box, idx);
+ else
+ return uncore_msr_event_ctl(box, idx);
+}
+
+static inline
+unsigned uncore_perf_ctr(struct intel_uncore_box *box, int idx)
+{
+ if (box->pci_dev || box->io_addr)
+ return uncore_pci_perf_ctr(box, idx);
+ else
+ return uncore_msr_perf_ctr(box, idx);
+}
+
+static inline int uncore_perf_ctr_bits(struct intel_uncore_box *box)
+{
+ return box->pmu->type->perf_ctr_bits;
+}
+
+static inline int uncore_fixed_ctr_bits(struct intel_uncore_box *box)
+{
+ return box->pmu->type->fixed_ctr_bits;
+}
+
+static inline
+unsigned int uncore_freerunning_bits(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ unsigned int type = uncore_freerunning_type(event->hw.config);
+
+ return box->pmu->type->freerunning[type].bits;
+}
+
+static inline int uncore_num_freerunning(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ unsigned int type = uncore_freerunning_type(event->hw.config);
+
+ return box->pmu->type->freerunning[type].num_counters;
+}
+
+static inline int uncore_num_freerunning_types(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ return box->pmu->type->num_freerunning_types;
+}
+
+static inline bool check_valid_freerunning_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ unsigned int type = uncore_freerunning_type(event->hw.config);
+ unsigned int idx = uncore_freerunning_idx(event->hw.config);
+
+ return (type < uncore_num_freerunning_types(box, event)) &&
+ (idx < uncore_num_freerunning(box, event));
+}
+
+static inline int uncore_num_counters(struct intel_uncore_box *box)
+{
+ return box->pmu->type->num_counters;
+}
+
+static inline bool is_freerunning_event(struct perf_event *event)
+{
+ u64 cfg = event->attr.config;
+
+ return ((cfg & UNCORE_FIXED_EVENT) == UNCORE_FIXED_EVENT) &&
+ (((cfg >> 8) & 0xff) >= UNCORE_FREERUNNING_UMASK_START);
+}
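+
+/*
+ * For instance, with the Skylake server encodings from the free running
+ * counter comment above: attr.config == 0x10ff ('ioclk') passes both
+ * checks, a fixed counter encoding such as 0x00ff fails the umask check,
+ * and a generic event like 0x0112 fails the event-code check.
+ */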
+
+/* Check and reject invalid config */
+static inline int uncore_freerunning_hw_config(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ if (is_freerunning_event(event))
+ return 0;
+
+ return -EINVAL;
+}
+
+static inline void uncore_disable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ box->pmu->type->ops->disable_event(box, event);
+}
+
+static inline void uncore_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ box->pmu->type->ops->enable_event(box, event);
+}
+
+static inline u64 uncore_read_counter(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ return box->pmu->type->ops->read_counter(box, event);
+}
+
+static inline void uncore_box_init(struct intel_uncore_box *box)
+{
+ if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) {
+ if (box->pmu->type->ops->init_box)
+ box->pmu->type->ops->init_box(box);
+ }
+}
+
+static inline void uncore_box_exit(struct intel_uncore_box *box)
+{
+ if (test_and_clear_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) {
+ if (box->pmu->type->ops->exit_box)
+ box->pmu->type->ops->exit_box(box);
+ }
+}
+
+static inline bool uncore_box_is_fake(struct intel_uncore_box *box)
+{
+ return (box->dieid < 0);
+}
+
+static inline struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event)
+{
+ return container_of(event->pmu, struct intel_uncore_pmu, pmu);
+}
+
+static inline struct intel_uncore_box *uncore_event_to_box(struct perf_event *event)
+{
+ return event->pmu_private;
+}
+
+struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu);
+u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event);
+void uncore_mmio_exit_box(struct intel_uncore_box *box);
+u64 uncore_mmio_read_counter(struct intel_uncore_box *box,
+ struct perf_event *event);
+void uncore_pmu_start_hrtimer(struct intel_uncore_box *box);
+void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box);
+void uncore_pmu_event_start(struct perf_event *event, int flags);
+void uncore_pmu_event_stop(struct perf_event *event, int flags);
+int uncore_pmu_event_add(struct perf_event *event, int flags);
+void uncore_pmu_event_del(struct perf_event *event, int flags);
+void uncore_pmu_event_read(struct perf_event *event);
+void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event);
+struct event_constraint *
+uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event);
+void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event);
+u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx);
+void uncore_get_alias_name(char *pmu_name, struct intel_uncore_pmu *pmu);
+
+extern struct intel_uncore_type *empty_uncore[];
+extern struct intel_uncore_type **uncore_msr_uncores;
+extern struct intel_uncore_type **uncore_pci_uncores;
+extern struct intel_uncore_type **uncore_mmio_uncores;
+extern struct pci_driver *uncore_pci_driver;
+extern struct pci_driver *uncore_pci_sub_driver;
+extern raw_spinlock_t pci2phy_map_lock;
+extern struct list_head pci2phy_map_head;
+extern struct pci_extra_dev *uncore_extra_pci_dev;
+extern struct event_constraint uncore_constraint_empty;
+extern int spr_uncore_units_ignore[];
+extern int gnr_uncore_units_ignore[];
+
+/* uncore_snb.c */
+int snb_uncore_pci_init(void);
+int ivb_uncore_pci_init(void);
+int hsw_uncore_pci_init(void);
+int bdw_uncore_pci_init(void);
+int skl_uncore_pci_init(void);
+void snb_uncore_cpu_init(void);
+void nhm_uncore_cpu_init(void);
+void skl_uncore_cpu_init(void);
+void icl_uncore_cpu_init(void);
+void tgl_uncore_cpu_init(void);
+void adl_uncore_cpu_init(void);
+void lnl_uncore_cpu_init(void);
+void mtl_uncore_cpu_init(void);
+void ptl_uncore_cpu_init(void);
+void tgl_uncore_mmio_init(void);
+void tgl_l_uncore_mmio_init(void);
+void adl_uncore_mmio_init(void);
+void lnl_uncore_mmio_init(void);
+void ptl_uncore_mmio_init(void);
+int snb_pci2phy_map_init(int devid);
+
+/* uncore_snbep.c */
+int snbep_uncore_pci_init(void);
+void snbep_uncore_cpu_init(void);
+int ivbep_uncore_pci_init(void);
+void ivbep_uncore_cpu_init(void);
+int hswep_uncore_pci_init(void);
+void hswep_uncore_cpu_init(void);
+int bdx_uncore_pci_init(void);
+void bdx_uncore_cpu_init(void);
+int knl_uncore_pci_init(void);
+void knl_uncore_cpu_init(void);
+int skx_uncore_pci_init(void);
+void skx_uncore_cpu_init(void);
+int snr_uncore_pci_init(void);
+void snr_uncore_cpu_init(void);
+void snr_uncore_mmio_init(void);
+int icx_uncore_pci_init(void);
+void icx_uncore_cpu_init(void);
+void icx_uncore_mmio_init(void);
+int spr_uncore_pci_init(void);
+void spr_uncore_cpu_init(void);
+void spr_uncore_mmio_init(void);
+int gnr_uncore_pci_init(void);
+void gnr_uncore_cpu_init(void);
+void gnr_uncore_mmio_init(void);
+
+/* uncore_nhmex.c */
+void nhmex_uncore_cpu_init(void);
diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c
new file mode 100644
index 000000000000..7d57ce706feb
--- /dev/null
+++ b/arch/x86/events/intel/uncore_discovery.c
@@ -0,0 +1,793 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Support Intel uncore PerfMon discovery mechanism.
+ * Copyright(c) 2021 Intel Corporation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <asm/msr.h>
+#include "uncore.h"
+#include "uncore_discovery.h"
+
+static struct rb_root discovery_tables = RB_ROOT;
+static int num_discovered_types[UNCORE_ACCESS_MAX];
+
+static bool has_generic_discovery_table(void)
+{
+ struct pci_dev *dev;
+ int dvsec;
+
+ dev = pci_get_device(PCI_VENDOR_ID_INTEL, UNCORE_DISCOVERY_TABLE_DEVICE, NULL);
+ if (!dev)
+ return false;
+
+ /* A discovery table device has the unique capability ID. */
+ dvsec = pci_find_next_ext_capability(dev, 0, UNCORE_EXT_CAP_ID_DISCOVERY);
+ pci_dev_put(dev);
+ if (dvsec)
+ return true;
+
+ return false;
+}
+
+static int logical_die_id;
+
+static int get_device_die_id(struct pci_dev *dev)
+{
+ int node = pcibus_to_node(dev->bus);
+
+ /*
+	 * If the NUMA info is not available, assign logical die IDs
+	 * sequentially, in the order in which the discovery table devices
+	 * are detected.
+ */
+ if (node < 0)
+ return logical_die_id++;
+
+ return uncore_device_to_die(dev);
+}
+
+#define __node_2_type(cur) \
+ rb_entry((cur), struct intel_uncore_discovery_type, node)
+
+static inline int __type_cmp(const void *key, const struct rb_node *b)
+{
+ struct intel_uncore_discovery_type *type_b = __node_2_type(b);
+ const u16 *type_id = key;
+
+ if (type_b->type > *type_id)
+ return -1;
+ else if (type_b->type < *type_id)
+ return 1;
+
+ return 0;
+}
+
+static inline struct intel_uncore_discovery_type *
+search_uncore_discovery_type(u16 type_id)
+{
+ struct rb_node *node = rb_find(&type_id, &discovery_tables, __type_cmp);
+
+ return (node) ? __node_2_type(node) : NULL;
+}
+
+static inline bool __type_less(struct rb_node *a, const struct rb_node *b)
+{
+ return (__node_2_type(a)->type < __node_2_type(b)->type);
+}
+
+static struct intel_uncore_discovery_type *
+add_uncore_discovery_type(struct uncore_unit_discovery *unit)
+{
+ struct intel_uncore_discovery_type *type;
+
+ if (unit->access_type >= UNCORE_ACCESS_MAX) {
+ pr_warn("Unsupported access type %d\n", unit->access_type);
+ return NULL;
+ }
+
+ type = kzalloc(sizeof(struct intel_uncore_discovery_type), GFP_KERNEL);
+ if (!type)
+ return NULL;
+
+ type->units = RB_ROOT;
+
+ type->access_type = unit->access_type;
+ num_discovered_types[type->access_type]++;
+ type->type = unit->box_type;
+
+ rb_add(&type->node, &discovery_tables, __type_less);
+
+ return type;
+}
+
+static struct intel_uncore_discovery_type *
+get_uncore_discovery_type(struct uncore_unit_discovery *unit)
+{
+ struct intel_uncore_discovery_type *type;
+
+ type = search_uncore_discovery_type(unit->box_type);
+ if (type)
+ return type;
+
+ return add_uncore_discovery_type(unit);
+}
+
+static inline int pmu_idx_cmp(const void *key, const struct rb_node *b)
+{
+ struct intel_uncore_discovery_unit *unit;
+ const unsigned int *id = key;
+
+ unit = rb_entry(b, struct intel_uncore_discovery_unit, node);
+
+ if (unit->pmu_idx > *id)
+ return -1;
+ else if (unit->pmu_idx < *id)
+ return 1;
+
+ return 0;
+}
+
+static struct intel_uncore_discovery_unit *
+intel_uncore_find_discovery_unit(struct rb_root *units, int die,
+ unsigned int pmu_idx)
+{
+ struct intel_uncore_discovery_unit *unit;
+ struct rb_node *pos;
+
+ if (!units)
+ return NULL;
+
+ pos = rb_find_first(&pmu_idx, units, pmu_idx_cmp);
+ if (!pos)
+ return NULL;
+ unit = rb_entry(pos, struct intel_uncore_discovery_unit, node);
+
+ if (die < 0)
+ return unit;
+
+ for (; pos; pos = rb_next(pos)) {
+ unit = rb_entry(pos, struct intel_uncore_discovery_unit, node);
+
+ if (unit->pmu_idx != pmu_idx)
+ break;
+
+ if (unit->die == die)
+ return unit;
+ }
+
+ return NULL;
+}
+
+int intel_uncore_find_discovery_unit_id(struct rb_root *units, int die,
+ unsigned int pmu_idx)
+{
+ struct intel_uncore_discovery_unit *unit;
+
+ unit = intel_uncore_find_discovery_unit(units, die, pmu_idx);
+ if (unit)
+ return unit->id;
+
+ return -1;
+}
+
+static inline bool unit_less(struct rb_node *a, const struct rb_node *b)
+{
+ struct intel_uncore_discovery_unit *a_node, *b_node;
+
+ a_node = rb_entry(a, struct intel_uncore_discovery_unit, node);
+ b_node = rb_entry(b, struct intel_uncore_discovery_unit, node);
+
+ if (a_node->pmu_idx < b_node->pmu_idx)
+ return true;
+ if (a_node->pmu_idx > b_node->pmu_idx)
+ return false;
+
+ if (a_node->die < b_node->die)
+ return true;
+ if (a_node->die > b_node->die)
+ return false;
+
+	return false;
+}
+
+static inline struct intel_uncore_discovery_unit *
+uncore_find_unit(struct rb_root *root, unsigned int id)
+{
+ struct intel_uncore_discovery_unit *unit;
+ struct rb_node *node;
+
+ for (node = rb_first(root); node; node = rb_next(node)) {
+ unit = rb_entry(node, struct intel_uncore_discovery_unit, node);
+ if (unit->id == id)
+ return unit;
+ }
+
+ return NULL;
+}
+
+void uncore_find_add_unit(struct intel_uncore_discovery_unit *node,
+ struct rb_root *root, u16 *num_units)
+{
+ struct intel_uncore_discovery_unit *unit = uncore_find_unit(root, node->id);
+
+ if (unit)
+ node->pmu_idx = unit->pmu_idx;
+ else if (num_units)
+ node->pmu_idx = (*num_units)++;
+
+ rb_add(&node->node, root, unit_less);
+}
+
+static void
+uncore_insert_box_info(struct uncore_unit_discovery *unit,
+ int die)
+{
+ struct intel_uncore_discovery_unit *node;
+ struct intel_uncore_discovery_type *type;
+
+ if (!unit->ctl || !unit->ctl_offset || !unit->ctr_offset) {
+		pr_info("Invalid address detected for uncore type %d box %d, "
+			"disabling the uncore unit.\n",
+ unit->box_type, unit->box_id);
+ return;
+ }
+
+ node = kzalloc(sizeof(*node), GFP_KERNEL);
+ if (!node)
+ return;
+
+ node->die = die;
+ node->id = unit->box_id;
+ node->addr = unit->ctl;
+
+ type = get_uncore_discovery_type(unit);
+ if (!type) {
+ kfree(node);
+ return;
+ }
+
+ uncore_find_add_unit(node, &type->units, &type->num_units);
+
+ /* Store generic information for the first box */
+ if (type->num_units == 1) {
+ type->num_counters = unit->num_regs;
+ type->counter_width = unit->bit_width;
+ type->ctl_offset = unit->ctl_offset;
+ type->ctr_offset = unit->ctr_offset;
+ }
+}
+
+static bool
+uncore_ignore_unit(struct uncore_unit_discovery *unit, int *ignore)
+{
+ int i;
+
+ if (!ignore)
+ return false;
+
+ for (i = 0; ignore[i] != UNCORE_IGNORE_END ; i++) {
+ if (unit->box_type == ignore[i])
+ return true;
+ }
+
+ return false;
+}
+
+static int __parse_discovery_table(resource_size_t addr, int die,
+ bool *parsed, int *ignore)
+{
+ struct uncore_global_discovery global;
+ struct uncore_unit_discovery unit;
+ void __iomem *io_addr;
+ unsigned long size;
+ int i;
+
+ size = UNCORE_DISCOVERY_GLOBAL_MAP_SIZE;
+ io_addr = ioremap(addr, size);
+ if (!io_addr)
+ return -ENOMEM;
+
+ /* Read Global Discovery State */
+ memcpy_fromio(&global, io_addr, sizeof(struct uncore_global_discovery));
+ if (uncore_discovery_invalid_unit(global)) {
+ pr_info("Invalid Global Discovery State: 0x%llx 0x%llx 0x%llx\n",
+ global.table1, global.ctl, global.table3);
+ iounmap(io_addr);
+ return -EINVAL;
+ }
+ iounmap(io_addr);
+
+ size = (1 + global.max_units) * global.stride * 8;
+ io_addr = ioremap(addr, size);
+ if (!io_addr)
+ return -ENOMEM;
+
+ /* Parsing Unit Discovery State */
+ for (i = 0; i < global.max_units; i++) {
+ memcpy_fromio(&unit, io_addr + (i + 1) * (global.stride * 8),
+ sizeof(struct uncore_unit_discovery));
+
+ if (uncore_discovery_invalid_unit(unit))
+ continue;
+
+ if (unit.access_type >= UNCORE_ACCESS_MAX)
+ continue;
+
+ if (uncore_ignore_unit(&unit, ignore))
+ continue;
+
+ uncore_insert_box_info(&unit, die);
+ }
+
+ *parsed = true;
+ iounmap(io_addr);
+ return 0;
+}
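+
+/*
+ * Layout sketch for the table parsed above, assuming a (hypothetical)
+ * global.stride of 3, i.e. 24-byte entries:
+ *
+ *   bytes  0..23 : global discovery state (struct uncore_global_discovery)
+ *   bytes 24..47 : unit 0 (struct uncore_unit_discovery)
+ *   bytes 48..71 : unit 1
+ *   ...
+ *   total mapped : (1 + global.max_units) * 24 bytes
+ */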
+
+static int parse_discovery_table(struct pci_dev *dev, int die,
+ u32 bar_offset, bool *parsed,
+ int *ignore)
+{
+ resource_size_t addr;
+ u32 val;
+
+ pci_read_config_dword(dev, bar_offset, &val);
+
+ if (val & ~PCI_BASE_ADDRESS_MEM_MASK & ~PCI_BASE_ADDRESS_MEM_TYPE_64)
+ return -EINVAL;
+
+ addr = (resource_size_t)(val & PCI_BASE_ADDRESS_MEM_MASK);
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+ if ((val & PCI_BASE_ADDRESS_MEM_TYPE_MASK) == PCI_BASE_ADDRESS_MEM_TYPE_64) {
+ u32 val2;
+
+ pci_read_config_dword(dev, bar_offset + 4, &val2);
+ addr |= ((resource_size_t)val2) << 32;
+ }
+#endif
+
+ return __parse_discovery_table(addr, die, parsed, ignore);
+}
+
+static bool intel_uncore_has_discovery_tables_pci(int *ignore)
+{
+ u32 device, val, entry_id, bar_offset;
+ int die, dvsec = 0, ret = true;
+ struct pci_dev *dev = NULL;
+ bool parsed = false;
+
+ if (has_generic_discovery_table())
+ device = UNCORE_DISCOVERY_TABLE_DEVICE;
+ else
+ device = PCI_ANY_ID;
+
+ /*
+	 * Start a new search and iterate through the list of
+	 * discovery table devices.
+ */
+ while ((dev = pci_get_device(PCI_VENDOR_ID_INTEL, device, dev)) != NULL) {
+ while ((dvsec = pci_find_next_ext_capability(dev, dvsec, UNCORE_EXT_CAP_ID_DISCOVERY))) {
+ pci_read_config_dword(dev, dvsec + UNCORE_DISCOVERY_DVSEC_OFFSET, &val);
+ entry_id = val & UNCORE_DISCOVERY_DVSEC_ID_MASK;
+ if (entry_id != UNCORE_DISCOVERY_DVSEC_ID_PMON)
+ continue;
+
+ pci_read_config_dword(dev, dvsec + UNCORE_DISCOVERY_DVSEC2_OFFSET, &val);
+
+ if (val & ~UNCORE_DISCOVERY_DVSEC2_BIR_MASK) {
+ ret = false;
+ goto err;
+ }
+ bar_offset = UNCORE_DISCOVERY_BIR_BASE +
+ (val & UNCORE_DISCOVERY_DVSEC2_BIR_MASK) * UNCORE_DISCOVERY_BIR_STEP;
+
+ die = get_device_die_id(dev);
+ if (die < 0)
+ continue;
+
+ parse_discovery_table(dev, die, bar_offset, &parsed, ignore);
+ }
+ }
+
+ /* None of the discovery tables are available */
+ if (!parsed)
+ ret = false;
+err:
+ pci_dev_put(dev);
+
+ return ret;
+}
+
+static bool intel_uncore_has_discovery_tables_msr(int *ignore)
+{
+ unsigned long *die_mask;
+ bool parsed = false;
+ int cpu, die;
+ u64 base;
+
+ die_mask = kcalloc(BITS_TO_LONGS(uncore_max_dies()),
+ sizeof(unsigned long), GFP_KERNEL);
+ if (!die_mask)
+ return false;
+
+ cpus_read_lock();
+ for_each_online_cpu(cpu) {
+ die = topology_logical_die_id(cpu);
+ if (__test_and_set_bit(die, die_mask))
+ continue;
+
+ if (rdmsrq_safe_on_cpu(cpu, UNCORE_DISCOVERY_MSR, &base))
+ continue;
+
+ if (!base)
+ continue;
+
+ __parse_discovery_table(base, die, &parsed, ignore);
+ }
+
+ cpus_read_unlock();
+
+ kfree(die_mask);
+ return parsed;
+}
+
+bool intel_uncore_has_discovery_tables(int *ignore)
+{
+ return intel_uncore_has_discovery_tables_msr(ignore) ||
+ intel_uncore_has_discovery_tables_pci(ignore);
+}
+
+void intel_uncore_clear_discovery_tables(void)
+{
+ struct intel_uncore_discovery_type *type, *next;
+ struct intel_uncore_discovery_unit *pos;
+ struct rb_node *node;
+
+ rbtree_postorder_for_each_entry_safe(type, next, &discovery_tables, node) {
+ while (!RB_EMPTY_ROOT(&type->units)) {
+ node = rb_first(&type->units);
+ pos = rb_entry(node, struct intel_uncore_discovery_unit, node);
+ rb_erase(node, &type->units);
+ kfree(pos);
+ }
+ kfree(type);
+ }
+}
+
+DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
+DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
+DEFINE_UNCORE_FORMAT_ATTR(thresh, thresh, "config:24-31");
+
+static struct attribute *generic_uncore_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh.attr,
+ NULL,
+};
+
+static const struct attribute_group generic_uncore_format_group = {
+ .name = "format",
+ .attrs = generic_uncore_formats_attr,
+};
+
+static u64 intel_generic_uncore_box_ctl(struct intel_uncore_box *box)
+{
+ struct intel_uncore_discovery_unit *unit;
+
+ unit = intel_uncore_find_discovery_unit(box->pmu->type->boxes,
+ -1, box->pmu->pmu_idx);
+ if (WARN_ON_ONCE(!unit))
+ return 0;
+
+ return unit->addr;
+}
+
+void intel_generic_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+ wrmsrq(intel_generic_uncore_box_ctl(box), GENERIC_PMON_BOX_CTL_INT);
+}
+
+void intel_generic_uncore_msr_disable_box(struct intel_uncore_box *box)
+{
+ wrmsrq(intel_generic_uncore_box_ctl(box), GENERIC_PMON_BOX_CTL_FRZ);
+}
+
+void intel_generic_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+ wrmsrq(intel_generic_uncore_box_ctl(box), 0);
+}
+
+static void intel_generic_uncore_msr_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ wrmsrq(hwc->config_base, hwc->config);
+}
+
+static void intel_generic_uncore_msr_disable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ wrmsrq(hwc->config_base, 0);
+}
+
+static struct intel_uncore_ops generic_uncore_msr_ops = {
+ .init_box = intel_generic_uncore_msr_init_box,
+ .disable_box = intel_generic_uncore_msr_disable_box,
+ .enable_box = intel_generic_uncore_msr_enable_box,
+ .disable_event = intel_generic_uncore_msr_disable_event,
+ .enable_event = intel_generic_uncore_msr_enable_event,
+ .read_counter = uncore_msr_read_counter,
+};
+
+bool intel_generic_uncore_assign_hw_event(struct perf_event *event,
+ struct intel_uncore_box *box)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u64 box_ctl;
+
+ if (!box->pmu->type->boxes)
+ return false;
+
+ if (box->io_addr) {
+ hwc->config_base = uncore_pci_event_ctl(box, hwc->idx);
+ hwc->event_base = uncore_pci_perf_ctr(box, hwc->idx);
+ return true;
+ }
+
+ box_ctl = intel_generic_uncore_box_ctl(box);
+ if (!box_ctl)
+ return false;
+
+ if (box->pci_dev) {
+ box_ctl = UNCORE_DISCOVERY_PCI_BOX_CTRL(box_ctl);
+ hwc->config_base = box_ctl + uncore_pci_event_ctl(box, hwc->idx);
+ hwc->event_base = box_ctl + uncore_pci_perf_ctr(box, hwc->idx);
+ return true;
+ }
+
+ hwc->config_base = box_ctl + box->pmu->type->event_ctl + hwc->idx;
+ hwc->event_base = box_ctl + box->pmu->type->perf_ctr + hwc->idx;
+
+ return true;
+}
+
+static inline int intel_pci_uncore_box_ctl(struct intel_uncore_box *box)
+{
+ return UNCORE_DISCOVERY_PCI_BOX_CTRL(intel_generic_uncore_box_ctl(box));
+}
+
+void intel_generic_uncore_pci_init_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ int box_ctl = intel_pci_uncore_box_ctl(box);
+
+ __set_bit(UNCORE_BOX_FLAG_CTL_OFFS8, &box->flags);
+ pci_write_config_dword(pdev, box_ctl, GENERIC_PMON_BOX_CTL_INT);
+}
+
+void intel_generic_uncore_pci_disable_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ int box_ctl = intel_pci_uncore_box_ctl(box);
+
+ pci_write_config_dword(pdev, box_ctl, GENERIC_PMON_BOX_CTL_FRZ);
+}
+
+void intel_generic_uncore_pci_enable_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ int box_ctl = intel_pci_uncore_box_ctl(box);
+
+ pci_write_config_dword(pdev, box_ctl, 0);
+}
+
+static void intel_generic_uncore_pci_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+
+ pci_write_config_dword(pdev, hwc->config_base, hwc->config);
+}
+
+void intel_generic_uncore_pci_disable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+
+ pci_write_config_dword(pdev, hwc->config_base, 0);
+}
+
+u64 intel_generic_uncore_pci_read_counter(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+ u64 count = 0;
+
+ pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count);
+ pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1);
+
+ return count;
+}
+
+static struct intel_uncore_ops generic_uncore_pci_ops = {
+ .init_box = intel_generic_uncore_pci_init_box,
+ .disable_box = intel_generic_uncore_pci_disable_box,
+ .enable_box = intel_generic_uncore_pci_enable_box,
+ .disable_event = intel_generic_uncore_pci_disable_event,
+ .enable_event = intel_generic_uncore_pci_enable_event,
+ .read_counter = intel_generic_uncore_pci_read_counter,
+};
+
+#define UNCORE_GENERIC_MMIO_SIZE 0x4000
+
+void intel_generic_uncore_mmio_init_box(struct intel_uncore_box *box)
+{
+	struct intel_uncore_discovery_unit *unit;
+ struct intel_uncore_type *type = box->pmu->type;
+ resource_size_t addr;
+
+ unit = intel_uncore_find_discovery_unit(type->boxes, box->dieid, box->pmu->pmu_idx);
+ if (!unit) {
+ pr_warn("Uncore type %d id %d: Cannot find box control address.\n",
+ type->type_id, box->pmu->pmu_idx);
+ return;
+ }
+
+ if (!unit->addr) {
+ pr_warn("Uncore type %d box %d: Invalid box control address.\n",
+ type->type_id, unit->id);
+ return;
+ }
+
+ addr = unit->addr;
+ box->io_addr = ioremap(addr, type->mmio_map_size);
+ if (!box->io_addr) {
+ pr_warn("Uncore type %d box %d: ioremap error for 0x%llx.\n",
+ type->type_id, unit->id, (unsigned long long)addr);
+ return;
+ }
+
+ writel(GENERIC_PMON_BOX_CTL_INT, box->io_addr);
+}
+
+void intel_generic_uncore_mmio_disable_box(struct intel_uncore_box *box)
+{
+ if (!box->io_addr)
+ return;
+
+ writel(GENERIC_PMON_BOX_CTL_FRZ, box->io_addr);
+}
+
+void intel_generic_uncore_mmio_enable_box(struct intel_uncore_box *box)
+{
+ if (!box->io_addr)
+ return;
+
+ writel(0, box->io_addr);
+}
+
+void intel_generic_uncore_mmio_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (!box->io_addr)
+ return;
+
+ writel(hwc->config, box->io_addr + hwc->config_base);
+}
+
+void intel_generic_uncore_mmio_disable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (!box->io_addr)
+ return;
+
+ writel(0, box->io_addr + hwc->config_base);
+}
+
+static struct intel_uncore_ops generic_uncore_mmio_ops = {
+ .init_box = intel_generic_uncore_mmio_init_box,
+ .exit_box = uncore_mmio_exit_box,
+ .disable_box = intel_generic_uncore_mmio_disable_box,
+ .enable_box = intel_generic_uncore_mmio_enable_box,
+ .disable_event = intel_generic_uncore_mmio_disable_event,
+ .enable_event = intel_generic_uncore_mmio_enable_event,
+ .read_counter = uncore_mmio_read_counter,
+};
+
+static bool uncore_update_uncore_type(enum uncore_access_type type_id,
+ struct intel_uncore_type *uncore,
+ struct intel_uncore_discovery_type *type)
+{
+ uncore->type_id = type->type;
+ uncore->num_counters = type->num_counters;
+ uncore->perf_ctr_bits = type->counter_width;
+ uncore->perf_ctr = (unsigned int)type->ctr_offset;
+ uncore->event_ctl = (unsigned int)type->ctl_offset;
+ uncore->boxes = &type->units;
+ uncore->num_boxes = type->num_units;
+
+ switch (type_id) {
+ case UNCORE_ACCESS_MSR:
+ uncore->ops = &generic_uncore_msr_ops;
+ break;
+ case UNCORE_ACCESS_PCI:
+ uncore->ops = &generic_uncore_pci_ops;
+ break;
+ case UNCORE_ACCESS_MMIO:
+ uncore->ops = &generic_uncore_mmio_ops;
+ uncore->mmio_map_size = UNCORE_GENERIC_MMIO_SIZE;
+ break;
+ default:
+ return false;
+ }
+
+ return true;
+}
+
+struct intel_uncore_type **
+intel_uncore_generic_init_uncores(enum uncore_access_type type_id, int num_extra)
+{
+ struct intel_uncore_discovery_type *type;
+ struct intel_uncore_type **uncores;
+ struct intel_uncore_type *uncore;
+ struct rb_node *node;
+ int i = 0;
+
+ uncores = kcalloc(num_discovered_types[type_id] + num_extra + 1,
+ sizeof(struct intel_uncore_type *), GFP_KERNEL);
+ if (!uncores)
+ return empty_uncore;
+
+ for (node = rb_first(&discovery_tables); node; node = rb_next(node)) {
+ type = rb_entry(node, struct intel_uncore_discovery_type, node);
+ if (type->access_type != type_id)
+ continue;
+
+ uncore = kzalloc(sizeof(struct intel_uncore_type), GFP_KERNEL);
+ if (!uncore)
+ break;
+
+ uncore->event_mask = GENERIC_PMON_RAW_EVENT_MASK;
+ uncore->format_group = &generic_uncore_format_group;
+
+ if (!uncore_update_uncore_type(type_id, uncore, type)) {
+ kfree(uncore);
+ continue;
+ }
+ uncores[i++] = uncore;
+ }
+
+ return uncores;
+}
+
+void intel_uncore_generic_uncore_cpu_init(void)
+{
+ uncore_msr_uncores = intel_uncore_generic_init_uncores(UNCORE_ACCESS_MSR, 0);
+}
+
+int intel_uncore_generic_uncore_pci_init(void)
+{
+ uncore_pci_uncores = intel_uncore_generic_init_uncores(UNCORE_ACCESS_PCI, 0);
+
+ return 0;
+}
+
+void intel_uncore_generic_uncore_mmio_init(void)
+{
+ uncore_mmio_uncores = intel_uncore_generic_init_uncores(UNCORE_ACCESS_MMIO, 0);
+}
diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h
new file mode 100644
index 000000000000..dff75c98e22f
--- /dev/null
+++ b/arch/x86/events/intel/uncore_discovery.h
@@ -0,0 +1,177 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+/* Store the full address of the global discovery table */
+#define UNCORE_DISCOVERY_MSR 0x201e
+
+/* Generic device ID of a discovery table device */
+#define UNCORE_DISCOVERY_TABLE_DEVICE 0x09a7
+/* Capability ID for a discovery table device */
+#define UNCORE_EXT_CAP_ID_DISCOVERY 0x23
+/* First DVSEC offset */
+#define UNCORE_DISCOVERY_DVSEC_OFFSET 0x8
+/* Mask of the supported discovery entry type */
+#define UNCORE_DISCOVERY_DVSEC_ID_MASK 0xffff
+/* PMON discovery entry type ID */
+#define UNCORE_DISCOVERY_DVSEC_ID_PMON 0x1
+/* Second DVSEC offset */
+#define UNCORE_DISCOVERY_DVSEC2_OFFSET 0xc
+/* Mask of the discovery table BAR offset */
+#define UNCORE_DISCOVERY_DVSEC2_BIR_MASK 0x7
+/* Discovery table BAR base offset */
+#define UNCORE_DISCOVERY_BIR_BASE 0x10
+/* Discovery table BAR step */
+#define UNCORE_DISCOVERY_BIR_STEP 0x4
+/* Global discovery table size */
+#define UNCORE_DISCOVERY_GLOBAL_MAP_SIZE 0x20
+
+#define UNCORE_DISCOVERY_PCI_DOMAIN_OFFSET 28
+#define UNCORE_DISCOVERY_PCI_DOMAIN(data) \
+ ((data >> UNCORE_DISCOVERY_PCI_DOMAIN_OFFSET) & 0x7)
+#define UNCORE_DISCOVERY_PCI_BUS_OFFSET 20
+#define UNCORE_DISCOVERY_PCI_BUS(data) \
+ ((data >> UNCORE_DISCOVERY_PCI_BUS_OFFSET) & 0xff)
+#define UNCORE_DISCOVERY_PCI_DEVFN_OFFSET 12
+#define UNCORE_DISCOVERY_PCI_DEVFN(data) \
+ ((data >> UNCORE_DISCOVERY_PCI_DEVFN_OFFSET) & 0xff)
+#define UNCORE_DISCOVERY_PCI_BOX_CTRL(data) (data & 0xfff)
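+
+/*
+ * Example decode of a PCI-type unit control address with the helpers
+ * above (the 0x12345678 value is arbitrary):
+ *
+ *   UNCORE_DISCOVERY_PCI_DOMAIN(0x12345678)   == 0x1
+ *   UNCORE_DISCOVERY_PCI_BUS(0x12345678)      == 0x23
+ *   UNCORE_DISCOVERY_PCI_DEVFN(0x12345678)    == 0x45
+ *   UNCORE_DISCOVERY_PCI_BOX_CTRL(0x12345678) == 0x678
+ */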
+
+
+#define uncore_discovery_invalid_unit(unit) \
+ (!unit.table1 || !unit.ctl || \
+ unit.table1 == -1ULL || unit.ctl == -1ULL || \
+ unit.table3 == -1ULL)
+
+#define GENERIC_PMON_CTL_EV_SEL_MASK 0x000000ff
+#define GENERIC_PMON_CTL_UMASK_MASK 0x0000ff00
+#define GENERIC_PMON_CTL_EDGE_DET (1 << 18)
+#define GENERIC_PMON_CTL_INVERT (1 << 23)
+#define GENERIC_PMON_CTL_TRESH_MASK 0xff000000
+#define GENERIC_PMON_RAW_EVENT_MASK (GENERIC_PMON_CTL_EV_SEL_MASK | \
+ GENERIC_PMON_CTL_UMASK_MASK | \
+ GENERIC_PMON_CTL_EDGE_DET | \
+ GENERIC_PMON_CTL_INVERT | \
+ GENERIC_PMON_CTL_TRESH_MASK)
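+
+/*
+ * For reference, the combined mask above works out to 0xff84ffff
+ * (event | umask | edge | invert | threshold bits).
+ */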
+
+#define GENERIC_PMON_BOX_CTL_FRZ (1 << 0)
+#define GENERIC_PMON_BOX_CTL_RST_CTRL (1 << 8)
+#define GENERIC_PMON_BOX_CTL_RST_CTRS (1 << 9)
+#define GENERIC_PMON_BOX_CTL_INT (GENERIC_PMON_BOX_CTL_RST_CTRL | \
+ GENERIC_PMON_BOX_CTL_RST_CTRS)
+
+enum uncore_access_type {
+ UNCORE_ACCESS_MSR = 0,
+ UNCORE_ACCESS_MMIO,
+ UNCORE_ACCESS_PCI,
+
+ UNCORE_ACCESS_MAX,
+};
+
+struct uncore_global_discovery {
+ union {
+ u64 table1;
+ struct {
+ u64 type : 8,
+ stride : 8,
+ max_units : 10,
+ __reserved_1 : 36,
+ access_type : 2;
+ };
+ };
+
+ u64 ctl; /* Global Control Address */
+
+ union {
+ u64 table3;
+ struct {
+ u64 status_offset : 8,
+ num_status : 16,
+ __reserved_2 : 40;
+ };
+ };
+};
+
+struct uncore_unit_discovery {
+ union {
+ u64 table1;
+ struct {
+ u64 num_regs : 8,
+ ctl_offset : 8,
+ bit_width : 8,
+ ctr_offset : 8,
+ status_offset : 8,
+ __reserved_1 : 22,
+ access_type : 2;
+ };
+ };
+
+ u64 ctl; /* Unit Control Address */
+
+ union {
+ u64 table3;
+ struct {
+ u64 box_type : 16,
+ box_id : 16,
+ __reserved_2 : 32;
+ };
+ };
+};
+
+struct intel_uncore_discovery_unit {
+ struct rb_node node;
+ unsigned int pmu_idx; /* The idx of the corresponding PMU */
+ unsigned int id; /* Unit ID */
+ unsigned int die; /* Die ID */
+ u64 addr; /* Unit Control Address */
+};
+
+struct intel_uncore_discovery_type {
+ struct rb_node node;
+ enum uncore_access_type access_type;
+ struct rb_root units; /* Unit ctrl addr for all units */
+ u16 type; /* Type ID of the uncore block */
+ u8 num_counters;
+ u8 counter_width;
+ u8 ctl_offset; /* Counter Control 0 offset */
+ u8 ctr_offset; /* Counter 0 offset */
+ u16 num_units; /* number of units */
+};
+
+bool intel_uncore_has_discovery_tables(int *ignore);
+void intel_uncore_clear_discovery_tables(void);
+void intel_uncore_generic_uncore_cpu_init(void);
+int intel_uncore_generic_uncore_pci_init(void);
+void intel_uncore_generic_uncore_mmio_init(void);
+
+void intel_generic_uncore_msr_init_box(struct intel_uncore_box *box);
+void intel_generic_uncore_msr_disable_box(struct intel_uncore_box *box);
+void intel_generic_uncore_msr_enable_box(struct intel_uncore_box *box);
+
+void intel_generic_uncore_mmio_init_box(struct intel_uncore_box *box);
+void intel_generic_uncore_mmio_disable_box(struct intel_uncore_box *box);
+void intel_generic_uncore_mmio_enable_box(struct intel_uncore_box *box);
+void intel_generic_uncore_mmio_disable_event(struct intel_uncore_box *box,
+ struct perf_event *event);
+void intel_generic_uncore_mmio_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event);
+
+void intel_generic_uncore_pci_init_box(struct intel_uncore_box *box);
+void intel_generic_uncore_pci_disable_box(struct intel_uncore_box *box);
+void intel_generic_uncore_pci_enable_box(struct intel_uncore_box *box);
+void intel_generic_uncore_pci_disable_event(struct intel_uncore_box *box,
+ struct perf_event *event);
+u64 intel_generic_uncore_pci_read_counter(struct intel_uncore_box *box,
+ struct perf_event *event);
+
+struct intel_uncore_type **
+intel_uncore_generic_init_uncores(enum uncore_access_type type_id, int num_extra);
+
+int intel_uncore_find_discovery_unit_id(struct rb_root *units, int die,
+ unsigned int pmu_idx);
+bool intel_generic_uncore_assign_hw_event(struct perf_event *event,
+ struct intel_uncore_box *box);
+void uncore_find_add_unit(struct intel_uncore_discovery_unit *node,
+ struct rb_root *root, u16 *num_units);
+struct intel_uncore_type **
+uncore_get_uncores(enum uncore_access_type type_id, int num_extra,
+ struct intel_uncore_type **extra, int max_num_types,
+ struct intel_uncore_type **uncores);
diff --git a/arch/x86/events/intel/uncore_nhmex.c b/arch/x86/events/intel/uncore_nhmex.c
new file mode 100644
index 000000000000..8962e7cb21e3
--- /dev/null
+++ b/arch/x86/events/intel/uncore_nhmex.c
@@ -0,0 +1,1230 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Nehalem-EX/Westmere-EX uncore support */
+#include <asm/cpu_device_id.h>
+#include <asm/msr.h>
+#include "uncore.h"
+
+/* NHM-EX event control */
+#define NHMEX_PMON_CTL_EV_SEL_MASK 0x000000ff
+#define NHMEX_PMON_CTL_UMASK_MASK 0x0000ff00
+#define NHMEX_PMON_CTL_EN_BIT0 (1 << 0)
+#define NHMEX_PMON_CTL_EDGE_DET (1 << 18)
+#define NHMEX_PMON_CTL_PMI_EN (1 << 20)
+#define NHMEX_PMON_CTL_EN_BIT22 (1 << 22)
+#define NHMEX_PMON_CTL_INVERT (1 << 23)
+#define NHMEX_PMON_CTL_TRESH_MASK 0xff000000
+#define NHMEX_PMON_RAW_EVENT_MASK (NHMEX_PMON_CTL_EV_SEL_MASK | \
+ NHMEX_PMON_CTL_UMASK_MASK | \
+ NHMEX_PMON_CTL_EDGE_DET | \
+ NHMEX_PMON_CTL_INVERT | \
+ NHMEX_PMON_CTL_TRESH_MASK)
+
+/* NHM-EX Ubox */
+#define NHMEX_U_MSR_PMON_GLOBAL_CTL 0xc00
+#define NHMEX_U_MSR_PMON_CTR 0xc11
+#define NHMEX_U_MSR_PMON_EV_SEL 0xc10
+
+#define NHMEX_U_PMON_GLOBAL_EN (1 << 0)
+#define NHMEX_U_PMON_GLOBAL_PMI_CORE_SEL 0x0000001e
+#define NHMEX_U_PMON_GLOBAL_EN_ALL (1 << 28)
+#define NHMEX_U_PMON_GLOBAL_RST_ALL (1 << 29)
+#define NHMEX_U_PMON_GLOBAL_FRZ_ALL (1 << 31)
+
+#define NHMEX_U_PMON_RAW_EVENT_MASK \
+ (NHMEX_PMON_CTL_EV_SEL_MASK | \
+ NHMEX_PMON_CTL_EDGE_DET)
+
+/* NHM-EX Cbox */
+#define NHMEX_C0_MSR_PMON_GLOBAL_CTL 0xd00
+#define NHMEX_C0_MSR_PMON_CTR0 0xd11
+#define NHMEX_C0_MSR_PMON_EV_SEL0 0xd10
+#define NHMEX_C_MSR_OFFSET 0x20
+
+/* NHM-EX Bbox */
+#define NHMEX_B0_MSR_PMON_GLOBAL_CTL 0xc20
+#define NHMEX_B0_MSR_PMON_CTR0 0xc31
+#define NHMEX_B0_MSR_PMON_CTL0 0xc30
+#define NHMEX_B_MSR_OFFSET 0x40
+#define NHMEX_B0_MSR_MATCH 0xe45
+#define NHMEX_B0_MSR_MASK 0xe46
+#define NHMEX_B1_MSR_MATCH 0xe4d
+#define NHMEX_B1_MSR_MASK 0xe4e
+
+#define NHMEX_B_PMON_CTL_EN (1 << 0)
+#define NHMEX_B_PMON_CTL_EV_SEL_SHIFT 1
+#define NHMEX_B_PMON_CTL_EV_SEL_MASK \
+ (0x1f << NHMEX_B_PMON_CTL_EV_SEL_SHIFT)
+#define NHMEX_B_PMON_CTR_SHIFT 6
+#define NHMEX_B_PMON_CTR_MASK \
+ (0x3 << NHMEX_B_PMON_CTR_SHIFT)
+#define NHMEX_B_PMON_RAW_EVENT_MASK \
+ (NHMEX_B_PMON_CTL_EV_SEL_MASK | \
+ NHMEX_B_PMON_CTR_MASK)
+
+/* NHM-EX Sbox */
+#define NHMEX_S0_MSR_PMON_GLOBAL_CTL 0xc40
+#define NHMEX_S0_MSR_PMON_CTR0 0xc51
+#define NHMEX_S0_MSR_PMON_CTL0 0xc50
+#define NHMEX_S_MSR_OFFSET 0x80
+#define NHMEX_S0_MSR_MM_CFG 0xe48
+#define NHMEX_S0_MSR_MATCH 0xe49
+#define NHMEX_S0_MSR_MASK 0xe4a
+#define NHMEX_S1_MSR_MM_CFG 0xe58
+#define NHMEX_S1_MSR_MATCH 0xe59
+#define NHMEX_S1_MSR_MASK 0xe5a
+
+#define NHMEX_S_PMON_MM_CFG_EN (0x1ULL << 63)
+#define NHMEX_S_EVENT_TO_R_PROG_EV 0
+
+/* NHM-EX Mbox */
+#define NHMEX_M0_MSR_GLOBAL_CTL 0xca0
+#define NHMEX_M0_MSR_PMU_DSP 0xca5
+#define NHMEX_M0_MSR_PMU_ISS 0xca6
+#define NHMEX_M0_MSR_PMU_MAP 0xca7
+#define NHMEX_M0_MSR_PMU_MSC_THR 0xca8
+#define NHMEX_M0_MSR_PMU_PGT 0xca9
+#define NHMEX_M0_MSR_PMU_PLD 0xcaa
+#define NHMEX_M0_MSR_PMU_ZDP_CTL_FVC 0xcab
+#define NHMEX_M0_MSR_PMU_CTL0 0xcb0
+#define NHMEX_M0_MSR_PMU_CNT0 0xcb1
+#define NHMEX_M_MSR_OFFSET 0x40
+#define NHMEX_M0_MSR_PMU_MM_CFG 0xe54
+#define NHMEX_M1_MSR_PMU_MM_CFG 0xe5c
+
+#define NHMEX_M_PMON_MM_CFG_EN (1ULL << 63)
+#define NHMEX_M_PMON_ADDR_MATCH_MASK 0x3ffffffffULL
+#define NHMEX_M_PMON_ADDR_MASK_MASK 0x7ffffffULL
+#define NHMEX_M_PMON_ADDR_MASK_SHIFT 34
+
+#define NHMEX_M_PMON_CTL_EN (1 << 0)
+#define NHMEX_M_PMON_CTL_PMI_EN (1 << 1)
+#define NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT 2
+#define NHMEX_M_PMON_CTL_COUNT_MODE_MASK \
+ (0x3 << NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT)
+#define NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT 4
+#define NHMEX_M_PMON_CTL_STORAGE_MODE_MASK \
+ (0x3 << NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT)
+#define NHMEX_M_PMON_CTL_WRAP_MODE (1 << 6)
+#define NHMEX_M_PMON_CTL_FLAG_MODE (1 << 7)
+#define NHMEX_M_PMON_CTL_INC_SEL_SHIFT 9
+#define NHMEX_M_PMON_CTL_INC_SEL_MASK \
+ (0x1f << NHMEX_M_PMON_CTL_INC_SEL_SHIFT)
+#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT 19
+#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK \
+ (0x7 << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT)
+#define NHMEX_M_PMON_RAW_EVENT_MASK \
+ (NHMEX_M_PMON_CTL_COUNT_MODE_MASK | \
+ NHMEX_M_PMON_CTL_STORAGE_MODE_MASK | \
+ NHMEX_M_PMON_CTL_WRAP_MODE | \
+ NHMEX_M_PMON_CTL_FLAG_MODE | \
+ NHMEX_M_PMON_CTL_INC_SEL_MASK | \
+ NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK)
+
+#define NHMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 11) - 1) | (1 << 23))
+#define NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (11 + 3 * (n)))
+
+#define WSMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 12) - 1) | (1 << 24))
+#define WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (12 + 3 * (n)))
+
+/*
+ * Use bits 9~13 to select the event if the 7th bit is not set;
+ * otherwise, use bits 19~21 to select the event.
+ */
+#define MBOX_INC_SEL(x) ((x) << NHMEX_M_PMON_CTL_INC_SEL_SHIFT)
+#define MBOX_SET_FLAG_SEL(x) (((x) << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) | \
+ NHMEX_M_PMON_CTL_FLAG_MODE)
+#define MBOX_INC_SEL_MASK (NHMEX_M_PMON_CTL_INC_SEL_MASK | \
+ NHMEX_M_PMON_CTL_FLAG_MODE)
+#define MBOX_SET_FLAG_SEL_MASK (NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK | \
+ NHMEX_M_PMON_CTL_FLAG_MODE)
+#define MBOX_INC_SEL_EXTAR_REG(c, r) \
+ EVENT_EXTRA_REG(MBOX_INC_SEL(c), NHMEX_M0_MSR_PMU_##r, \
+ MBOX_INC_SEL_MASK, (u64)-1, NHMEX_M_##r)
+#define MBOX_SET_FLAG_SEL_EXTRA_REG(c, r) \
+ EVENT_EXTRA_REG(MBOX_SET_FLAG_SEL(c), NHMEX_M0_MSR_PMU_##r, \
+ MBOX_SET_FLAG_SEL_MASK, \
+ (u64)-1, NHMEX_M_##r)
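+
+/*
+ * For example, MBOX_INC_SEL(0x9) places 0x9 in bits 9~13 (config value
+ * 0x1200), while MBOX_SET_FLAG_SEL(0x3) places 0x3 in bits 19~21 and
+ * additionally sets the flag-mode bit (bit 7).
+ */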
+
+/* NHM-EX Rbox */
+#define NHMEX_R_MSR_GLOBAL_CTL 0xe00
+#define NHMEX_R_MSR_PMON_CTL0 0xe10
+#define NHMEX_R_MSR_PMON_CNT0 0xe11
+#define NHMEX_R_MSR_OFFSET 0x20
+
+#define NHMEX_R_MSR_PORTN_QLX_CFG(n) \
+ ((n) < 4 ? (0xe0c + (n)) : (0xe2c + (n) - 4))
+#define NHMEX_R_MSR_PORTN_IPERF_CFG0(n) (0xe04 + (n))
+#define NHMEX_R_MSR_PORTN_IPERF_CFG1(n) (0xe24 + (n))
+#define NHMEX_R_MSR_PORTN_XBR_OFFSET(n) \
+ (((n) < 4 ? 0 : 0x10) + (n) * 4)
+#define NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) \
+ (0xe60 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n))
+#define NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(n) \
+ (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 1)
+#define NHMEX_R_MSR_PORTN_XBR_SET1_MASK(n) \
+ (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 2)
+#define NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) \
+ (0xe70 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n))
+#define NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(n) \
+ (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 1)
+#define NHMEX_R_MSR_PORTN_XBR_SET2_MASK(n) \
+ (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 2)
+
+#define NHMEX_R_PMON_CTL_EN (1 << 0)
+#define NHMEX_R_PMON_CTL_EV_SEL_SHIFT 1
+#define NHMEX_R_PMON_CTL_EV_SEL_MASK \
+ (0x1f << NHMEX_R_PMON_CTL_EV_SEL_SHIFT)
+#define NHMEX_R_PMON_CTL_PMI_EN (1 << 6)
+#define NHMEX_R_PMON_RAW_EVENT_MASK NHMEX_R_PMON_CTL_EV_SEL_MASK
+
+/* NHM-EX Wbox */
+#define NHMEX_W_MSR_GLOBAL_CTL 0xc80
+#define NHMEX_W_MSR_PMON_CNT0 0xc90
+#define NHMEX_W_MSR_PMON_EVT_SEL0 0xc91
+#define NHMEX_W_MSR_PMON_FIXED_CTR 0x394
+#define NHMEX_W_MSR_PMON_FIXED_CTL 0x395
+
+#define NHMEX_W_PMON_GLOBAL_FIXED_EN (1ULL << 31)
+
+#define __BITS_VALUE(x, i, n) ((typeof(x))(((x) >> ((i) * (n))) & \
+ ((1ULL << (n)) - 1)))
+
+DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(event5, event, "config:1-5");
+DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
+DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
+DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31");
+DEFINE_UNCORE_FORMAT_ATTR(counter, counter, "config:6-7");
+DEFINE_UNCORE_FORMAT_ATTR(match, match, "config1:0-63");
+DEFINE_UNCORE_FORMAT_ATTR(mask, mask, "config2:0-63");
+
+static void nhmex_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+ wrmsrq(NHMEX_U_MSR_PMON_GLOBAL_CTL, NHMEX_U_PMON_GLOBAL_EN_ALL);
+}
+
+static void nhmex_uncore_msr_exit_box(struct intel_uncore_box *box)
+{
+ wrmsrq(NHMEX_U_MSR_PMON_GLOBAL_CTL, 0);
+}
+
+static void nhmex_uncore_msr_disable_box(struct intel_uncore_box *box)
+{
+ unsigned msr = uncore_msr_box_ctl(box);
+ u64 config;
+
+ if (msr) {
+ rdmsrq(msr, config);
+ config &= ~((1ULL << uncore_num_counters(box)) - 1);
+ /* WBox has a fixed counter */
+ if (uncore_msr_fixed_ctl(box))
+ config &= ~NHMEX_W_PMON_GLOBAL_FIXED_EN;
+ wrmsrq(msr, config);
+ }
+}
+
+static void nhmex_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+ unsigned msr = uncore_msr_box_ctl(box);
+ u64 config;
+
+ if (msr) {
+ rdmsrq(msr, config);
+ config |= (1ULL << uncore_num_counters(box)) - 1;
+ /* WBox has a fixed counter */
+ if (uncore_msr_fixed_ctl(box))
+ config |= NHMEX_W_PMON_GLOBAL_FIXED_EN;
+ wrmsrq(msr, config);
+ }
+}
+
+static void nhmex_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ wrmsrq(event->hw.config_base, 0);
+}
+
+static void nhmex_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (hwc->idx == UNCORE_PMC_IDX_FIXED)
+ wrmsrq(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0);
+ else if (box->pmu->type->event_mask & NHMEX_PMON_CTL_EN_BIT0)
+ wrmsrq(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22);
+ else
+ wrmsrq(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0);
+}
+
+#define NHMEX_UNCORE_OPS_COMMON_INIT() \
+ .init_box = nhmex_uncore_msr_init_box, \
+ .exit_box = nhmex_uncore_msr_exit_box, \
+ .disable_box = nhmex_uncore_msr_disable_box, \
+ .enable_box = nhmex_uncore_msr_enable_box, \
+ .disable_event = nhmex_uncore_msr_disable_event, \
+ .read_counter = uncore_msr_read_counter
+
+static struct intel_uncore_ops nhmex_uncore_ops = {
+ NHMEX_UNCORE_OPS_COMMON_INIT(),
+ .enable_event = nhmex_uncore_msr_enable_event,
+};
+
+static struct attribute *nhmex_uncore_ubox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_edge.attr,
+ NULL,
+};
+
+static const struct attribute_group nhmex_uncore_ubox_format_group = {
+ .name = "format",
+ .attrs = nhmex_uncore_ubox_formats_attr,
+};
+
+static struct intel_uncore_type nhmex_uncore_ubox = {
+ .name = "ubox",
+ .num_counters = 1,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .event_ctl = NHMEX_U_MSR_PMON_EV_SEL,
+ .perf_ctr = NHMEX_U_MSR_PMON_CTR,
+ .event_mask = NHMEX_U_PMON_RAW_EVENT_MASK,
+ .box_ctl = NHMEX_U_MSR_PMON_GLOBAL_CTL,
+ .ops = &nhmex_uncore_ops,
+ .format_group = &nhmex_uncore_ubox_format_group
+};
+
+static struct attribute *nhmex_uncore_cbox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ NULL,
+};
+
+static const struct attribute_group nhmex_uncore_cbox_format_group = {
+ .name = "format",
+ .attrs = nhmex_uncore_cbox_formats_attr,
+};
+
+/* msr offset for each instance of cbox */
+static u64 nhmex_cbox_msr_offsets[] = {
+ 0x0, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x240, 0x2c0,
+};
+
+static struct intel_uncore_type nhmex_uncore_cbox = {
+ .name = "cbox",
+ .num_counters = 6,
+ .num_boxes = 10,
+ .perf_ctr_bits = 48,
+ .event_ctl = NHMEX_C0_MSR_PMON_EV_SEL0,
+ .perf_ctr = NHMEX_C0_MSR_PMON_CTR0,
+ .event_mask = NHMEX_PMON_RAW_EVENT_MASK,
+ .box_ctl = NHMEX_C0_MSR_PMON_GLOBAL_CTL,
+ .msr_offsets = nhmex_cbox_msr_offsets,
+ .pair_ctr_ctl = 1,
+ .ops = &nhmex_uncore_ops,
+ .format_group = &nhmex_uncore_cbox_format_group
+};
+
+static struct uncore_event_desc nhmex_uncore_wbox_events[] = {
+ INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0"),
+ { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_type nhmex_uncore_wbox = {
+ .name = "wbox",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .event_ctl = NHMEX_W_MSR_PMON_CNT0,
+ .perf_ctr = NHMEX_W_MSR_PMON_EVT_SEL0,
+ .fixed_ctr = NHMEX_W_MSR_PMON_FIXED_CTR,
+ .fixed_ctl = NHMEX_W_MSR_PMON_FIXED_CTL,
+ .event_mask = NHMEX_PMON_RAW_EVENT_MASK,
+ .box_ctl = NHMEX_W_MSR_GLOBAL_CTL,
+ .pair_ctr_ctl = 1,
+ .event_descs = nhmex_uncore_wbox_events,
+ .ops = &nhmex_uncore_ops,
+ .format_group = &nhmex_uncore_cbox_format_group
+};
+
+static int nhmex_bbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+ int ctr, ev_sel;
+
+ ctr = (hwc->config & NHMEX_B_PMON_CTR_MASK) >>
+ NHMEX_B_PMON_CTR_SHIFT;
+ ev_sel = (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK) >>
+ NHMEX_B_PMON_CTL_EV_SEL_SHIFT;
+
+ /* events that do not use the match/mask registers */
+ if ((ctr == 0 && ev_sel > 0x3) || (ctr == 1 && ev_sel > 0x6) ||
+ (ctr == 2 && ev_sel != 0x4) || ctr == 3)
+ return 0;
+
+ if (box->pmu->pmu_idx == 0)
+ reg1->reg = NHMEX_B0_MSR_MATCH;
+ else
+ reg1->reg = NHMEX_B1_MSR_MATCH;
+ reg1->idx = 0;
+ reg1->config = event->attr.config1;
+ reg2->config = event->attr.config2;
+ return 0;
+}
+
+static void nhmex_bbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+ if (reg1->idx != EXTRA_REG_NONE) {
+ wrmsrq(reg1->reg, reg1->config);
+ wrmsrq(reg1->reg + 1, reg2->config);
+ }
+ wrmsrq(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 |
+ (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK));
+}
+
+/*
+ * The Bbox has 4 counters, but each counter monitors different events.
+ * Use bits 6-7 in the event config to select the counter.
+ */
+static struct event_constraint nhmex_uncore_bbox_constraints[] = {
+ EVENT_CONSTRAINT(0 , 1, 0xc0),
+ EVENT_CONSTRAINT(0x40, 2, 0xc0),
+ EVENT_CONSTRAINT(0x80, 4, 0xc0),
+ EVENT_CONSTRAINT(0xc0, 8, 0xc0),
+ EVENT_CONSTRAINT_END,
+};
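+
+/*
+ * i.e. counter-select values 0x00/0x40/0x80/0xc0 in config bits 6-7 pin
+ * an event to counter 0/1/2/3 (counter masks 0x1/0x2/0x4/0x8 above).
+ */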
+
+static struct attribute *nhmex_uncore_bbox_formats_attr[] = {
+ &format_attr_event5.attr,
+ &format_attr_counter.attr,
+ &format_attr_match.attr,
+ &format_attr_mask.attr,
+ NULL,
+};
+
+static const struct attribute_group nhmex_uncore_bbox_format_group = {
+ .name = "format",
+ .attrs = nhmex_uncore_bbox_formats_attr,
+};
+
+static struct intel_uncore_ops nhmex_uncore_bbox_ops = {
+ NHMEX_UNCORE_OPS_COMMON_INIT(),
+ .enable_event = nhmex_bbox_msr_enable_event,
+ .hw_config = nhmex_bbox_hw_config,
+ .get_constraint = uncore_get_constraint,
+ .put_constraint = uncore_put_constraint,
+};
+
+static struct intel_uncore_type nhmex_uncore_bbox = {
+ .name = "bbox",
+ .num_counters = 4,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .event_ctl = NHMEX_B0_MSR_PMON_CTL0,
+ .perf_ctr = NHMEX_B0_MSR_PMON_CTR0,
+ .event_mask = NHMEX_B_PMON_RAW_EVENT_MASK,
+ .box_ctl = NHMEX_B0_MSR_PMON_GLOBAL_CTL,
+ .msr_offset = NHMEX_B_MSR_OFFSET,
+ .pair_ctr_ctl = 1,
+ .num_shared_regs = 1,
+ .constraints = nhmex_uncore_bbox_constraints,
+ .ops = &nhmex_uncore_bbox_ops,
+ .format_group = &nhmex_uncore_bbox_format_group
+};
+
+static int nhmex_sbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+ /* only TO_R_PROG_EV event uses the match/mask register */
+ if ((hwc->config & NHMEX_PMON_CTL_EV_SEL_MASK) !=
+ NHMEX_S_EVENT_TO_R_PROG_EV)
+ return 0;
+
+ if (box->pmu->pmu_idx == 0)
+ reg1->reg = NHMEX_S0_MSR_MM_CFG;
+ else
+ reg1->reg = NHMEX_S1_MSR_MM_CFG;
+ reg1->idx = 0;
+ reg1->config = event->attr.config1;
+ reg2->config = event->attr.config2;
+ return 0;
+}
+
+static void nhmex_sbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+ if (reg1->idx != EXTRA_REG_NONE) {
+ wrmsrq(reg1->reg, 0);
+ wrmsrq(reg1->reg + 1, reg1->config);
+ wrmsrq(reg1->reg + 2, reg2->config);
+ wrmsrq(reg1->reg, NHMEX_S_PMON_MM_CFG_EN);
+ }
+ wrmsrq(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22);
+}
+
+static struct attribute *nhmex_uncore_sbox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_match.attr,
+ &format_attr_mask.attr,
+ NULL,
+};
+
+static const struct attribute_group nhmex_uncore_sbox_format_group = {
+ .name = "format",
+ .attrs = nhmex_uncore_sbox_formats_attr,
+};
+
+static struct intel_uncore_ops nhmex_uncore_sbox_ops = {
+ NHMEX_UNCORE_OPS_COMMON_INIT(),
+ .enable_event = nhmex_sbox_msr_enable_event,
+ .hw_config = nhmex_sbox_hw_config,
+ .get_constraint = uncore_get_constraint,
+ .put_constraint = uncore_put_constraint,
+};
+
+static struct intel_uncore_type nhmex_uncore_sbox = {
+ .name = "sbox",
+ .num_counters = 4,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .event_ctl = NHMEX_S0_MSR_PMON_CTL0,
+ .perf_ctr = NHMEX_S0_MSR_PMON_CTR0,
+ .event_mask = NHMEX_PMON_RAW_EVENT_MASK,
+ .box_ctl = NHMEX_S0_MSR_PMON_GLOBAL_CTL,
+ .msr_offset = NHMEX_S_MSR_OFFSET,
+ .pair_ctr_ctl = 1,
+ .num_shared_regs = 1,
+ .ops = &nhmex_uncore_sbox_ops,
+ .format_group = &nhmex_uncore_sbox_format_group
+};
+
+enum {
+ EXTRA_REG_NHMEX_M_FILTER,
+ EXTRA_REG_NHMEX_M_DSP,
+ EXTRA_REG_NHMEX_M_ISS,
+ EXTRA_REG_NHMEX_M_MAP,
+ EXTRA_REG_NHMEX_M_MSC_THR,
+ EXTRA_REG_NHMEX_M_PGT,
+ EXTRA_REG_NHMEX_M_PLD,
+ EXTRA_REG_NHMEX_M_ZDP_CTL_FVC,
+};
+
+static struct extra_reg nhmex_uncore_mbox_extra_regs[] = {
+	MBOX_INC_SEL_EXTRA_REG(0x0, DSP),
+	MBOX_INC_SEL_EXTRA_REG(0x4, MSC_THR),
+	MBOX_INC_SEL_EXTRA_REG(0x5, MSC_THR),
+	MBOX_INC_SEL_EXTRA_REG(0x9, ISS),
+	/* event 0xa uses two extra registers */
+	MBOX_INC_SEL_EXTRA_REG(0xa, ISS),
+	MBOX_INC_SEL_EXTRA_REG(0xa, PLD),
+	MBOX_INC_SEL_EXTRA_REG(0xb, PLD),
+	/* events 0xd ~ 0x10 use the same extra register */
+	MBOX_INC_SEL_EXTRA_REG(0xd, ZDP_CTL_FVC),
+	MBOX_INC_SEL_EXTRA_REG(0xe, ZDP_CTL_FVC),
+	MBOX_INC_SEL_EXTRA_REG(0xf, ZDP_CTL_FVC),
+	MBOX_INC_SEL_EXTRA_REG(0x10, ZDP_CTL_FVC),
+	MBOX_INC_SEL_EXTRA_REG(0x16, PGT),
+ MBOX_SET_FLAG_SEL_EXTRA_REG(0x0, DSP),
+ MBOX_SET_FLAG_SEL_EXTRA_REG(0x1, ISS),
+ MBOX_SET_FLAG_SEL_EXTRA_REG(0x5, PGT),
+ MBOX_SET_FLAG_SEL_EXTRA_REG(0x6, MAP),
+ EVENT_EXTRA_END
+};
+
+/* Nehalem-EX or Westmere-EX ? */
+static bool uncore_nhmex;
+
+static bool nhmex_mbox_get_shared_reg(struct intel_uncore_box *box, int idx, u64 config)
+{
+ struct intel_uncore_extra_reg *er;
+ unsigned long flags;
+ bool ret = false;
+ u64 mask;
+
+ if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
+ er = &box->shared_regs[idx];
+ raw_spin_lock_irqsave(&er->lock, flags);
+ if (!atomic_read(&er->ref) || er->config == config) {
+ atomic_inc(&er->ref);
+ er->config = config;
+ ret = true;
+ }
+ raw_spin_unlock_irqrestore(&er->lock, flags);
+
+ return ret;
+ }
+ /*
+ * The ZDP_CTL_FVC MSR has 4 fields which are used to control
+ * events 0xd ~ 0x10. Besides these 4 fields, there are additional
+ * fields which are shared.
+ */
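+	/*
+	 * er->ref packs one 8-bit reference count per FVC field: field
+	 * idx's count lives at bits idx*8 ~ idx*8+7, hence the
+	 * atomic_add(1 << (idx * 8), &er->ref) and
+	 * __BITS_VALUE(atomic_read(&er->ref), idx, 8) below.
+	 */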
+ idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+ if (WARN_ON_ONCE(idx >= 4))
+ return false;
+
+ /* mask of the shared fields */
+ if (uncore_nhmex)
+ mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK;
+ else
+ mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK;
+ er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
+
+ raw_spin_lock_irqsave(&er->lock, flags);
+ /* add mask of the non-shared field if it's in use */
+ if (__BITS_VALUE(atomic_read(&er->ref), idx, 8)) {
+ if (uncore_nhmex)
+ mask |= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+ else
+ mask |= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+ }
+
+ if (!atomic_read(&er->ref) || !((er->config ^ config) & mask)) {
+ atomic_add(1 << (idx * 8), &er->ref);
+ if (uncore_nhmex)
+ mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK |
+ NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+ else
+ mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK |
+ WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+ er->config &= ~mask;
+ er->config |= (config & mask);
+ ret = true;
+ }
+ raw_spin_unlock_irqrestore(&er->lock, flags);
+
+ return ret;
+}
+
+static void nhmex_mbox_put_shared_reg(struct intel_uncore_box *box, int idx)
+{
+ struct intel_uncore_extra_reg *er;
+
+ if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
+ er = &box->shared_regs[idx];
+ atomic_dec(&er->ref);
+ return;
+ }
+
+ idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+ er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
+ atomic_sub(1 << (idx * 8), &er->ref);
+}
+
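+/*
+ * Move an event between the four ZDP_CTL_FVC fields: each field holds a
+ * 3-bit event select (0x7 << (12 + 3 * n) on WSM-EX, per the mask at the
+ * top of this file), so switching from field orig_idx to new_idx shifts
+ * the select by 3 * (new_idx - orig_idx) bits and adjusts inc_sel by the
+ * same delta.
+ */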
+static u64 nhmex_mbox_alter_er(struct perf_event *event, int new_idx, bool modify)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ u64 idx, orig_idx = __BITS_VALUE(reg1->idx, 0, 8);
+ u64 config = reg1->config;
+
+ /* get the non-shared control bits and shift them */
+ idx = orig_idx - EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+ if (uncore_nhmex)
+ config &= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+ else
+ config &= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+ if (new_idx > orig_idx) {
+ idx = new_idx - orig_idx;
+ config <<= 3 * idx;
+ } else {
+ idx = orig_idx - new_idx;
+ config >>= 3 * idx;
+ }
+
+ /* add the shared control bits back */
+ if (uncore_nhmex)
+ config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
+ else
+ config |= WSMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
+ if (modify) {
+ /* adjust the main event selector */
+ if (new_idx > orig_idx)
+ hwc->config += idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT;
+ else
+ hwc->config -= idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT;
+ reg1->config = config;
+ reg1->idx = ~0xff | new_idx;
+ }
+ return config;
+}
+
+static struct event_constraint *
+nhmex_mbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+ int i, idx[2], alloc = 0;
+ u64 config1 = reg1->config;
+
+ idx[0] = __BITS_VALUE(reg1->idx, 0, 8);
+ idx[1] = __BITS_VALUE(reg1->idx, 1, 8);
+again:
+ for (i = 0; i < 2; i++) {
+ if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i)))
+ idx[i] = 0xff;
+
+ if (idx[i] == 0xff)
+ continue;
+
+ if (!nhmex_mbox_get_shared_reg(box, idx[i],
+ __BITS_VALUE(config1, i, 32)))
+ goto fail;
+ alloc |= (0x1 << i);
+ }
+
+ /* for the match/mask registers */
+ if (reg2->idx != EXTRA_REG_NONE &&
+ (uncore_box_is_fake(box) || !reg2->alloc) &&
+ !nhmex_mbox_get_shared_reg(box, reg2->idx, reg2->config))
+ goto fail;
+
+	/*
+	 * If it's a fake box -- as per validate_{group,event}() we
+	 * shouldn't touch event state and we can avoid doing so
+	 * since both will only call get_event_constraints() once
+	 * on each event; this avoids the need for reg->alloc.
+	 */
+ if (!uncore_box_is_fake(box)) {
+ if (idx[0] != 0xff && idx[0] != __BITS_VALUE(reg1->idx, 0, 8))
+ nhmex_mbox_alter_er(event, idx[0], true);
+ reg1->alloc |= alloc;
+ if (reg2->idx != EXTRA_REG_NONE)
+ reg2->alloc = 1;
+ }
+ return NULL;
+fail:
+ if (idx[0] != 0xff && !(alloc & 0x1) &&
+ idx[0] >= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
+		/*
+		 * events 0xd ~ 0x10 are functionally identical, but are
+		 * controlled by different fields in the ZDP_CTL_FVC
+		 * register. If we fail to take one field, try the
+		 * other 3 choices.
+		 */
+ BUG_ON(__BITS_VALUE(reg1->idx, 1, 8) != 0xff);
+ idx[0] -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+ idx[0] = (idx[0] + 1) % 4;
+ idx[0] += EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+ if (idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) {
+ config1 = nhmex_mbox_alter_er(event, idx[0], false);
+ goto again;
+ }
+ }
+
+ if (alloc & 0x1)
+ nhmex_mbox_put_shared_reg(box, idx[0]);
+ if (alloc & 0x2)
+ nhmex_mbox_put_shared_reg(box, idx[1]);
+ return &uncore_constraint_empty;
+}
+
+static void nhmex_mbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+
+ if (uncore_box_is_fake(box))
+ return;
+
+ if (reg1->alloc & 0x1)
+ nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 0, 8));
+ if (reg1->alloc & 0x2)
+ nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 1, 8));
+ reg1->alloc = 0;
+
+ if (reg2->alloc) {
+ nhmex_mbox_put_shared_reg(box, reg2->idx);
+ reg2->alloc = 0;
+ }
+}
+
+static int nhmex_mbox_extra_reg_idx(struct extra_reg *er)
+{
+ if (er->idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC)
+ return er->idx;
+ return er->idx + (er->event >> NHMEX_M_PMON_CTL_INC_SEL_SHIFT) - 0xd;
+}
+
+static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct intel_uncore_type *type = box->pmu->type;
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+ struct extra_reg *er;
+ unsigned msr;
+ int reg_idx = 0;
+	/*
+	 * The mbox events may require 2 extra MSRs at most. But only
+	 * the lower 32 bits in these MSRs are significant, so we can use
+	 * config1 to pass two MSRs' config.
+	 */
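+	/*
+	 * Similarly, reg1->idx packs two 8-bit extra-reg indices and
+	 * reg1->reg packs two 16-bit MSR addresses -- see the
+	 * << (reg_idx * 8) and << (reg_idx * 16) shifts below.
+	 */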
+ for (er = nhmex_uncore_mbox_extra_regs; er->msr; er++) {
+ if (er->event != (event->hw.config & er->config_mask))
+ continue;
+ if (event->attr.config1 & ~er->valid_mask)
+ return -EINVAL;
+
+ msr = er->msr + type->msr_offset * box->pmu->pmu_idx;
+ if (WARN_ON_ONCE(msr >= 0xffff || er->idx >= 0xff))
+ return -EINVAL;
+
+		/* always use bits 32~63 to pass the PLD config */
+ if (er->idx == EXTRA_REG_NHMEX_M_PLD)
+ reg_idx = 1;
+ else if (WARN_ON_ONCE(reg_idx > 0))
+ return -EINVAL;
+
+ reg1->idx &= ~(0xff << (reg_idx * 8));
+ reg1->reg &= ~(0xffff << (reg_idx * 16));
+ reg1->idx |= nhmex_mbox_extra_reg_idx(er) << (reg_idx * 8);
+ reg1->reg |= msr << (reg_idx * 16);
+ reg1->config = event->attr.config1;
+ reg_idx++;
+ }
+	/*
+	 * The mbox only provides the ability to perform address matching
+	 * for the PLD events.
+	 */
+ if (reg_idx == 2) {
+ reg2->idx = EXTRA_REG_NHMEX_M_FILTER;
+ if (event->attr.config2 & NHMEX_M_PMON_MM_CFG_EN)
+ reg2->config = event->attr.config2;
+ else
+ reg2->config = ~0ULL;
+ if (box->pmu->pmu_idx == 0)
+ reg2->reg = NHMEX_M0_MSR_PMU_MM_CFG;
+ else
+ reg2->reg = NHMEX_M1_MSR_PMU_MM_CFG;
+ }
+ return 0;
+}
+
+static u64 nhmex_mbox_shared_reg_config(struct intel_uncore_box *box, int idx)
+{
+ struct intel_uncore_extra_reg *er;
+ unsigned long flags;
+ u64 config;
+
+ if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC)
+ return box->shared_regs[idx].config;
+
+ er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
+ raw_spin_lock_irqsave(&er->lock, flags);
+ config = er->config;
+ raw_spin_unlock_irqrestore(&er->lock, flags);
+ return config;
+}
+
+static void nhmex_mbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+ int idx;
+
+ idx = __BITS_VALUE(reg1->idx, 0, 8);
+ if (idx != 0xff)
+ wrmsrq(__BITS_VALUE(reg1->reg, 0, 16),
+ nhmex_mbox_shared_reg_config(box, idx));
+ idx = __BITS_VALUE(reg1->idx, 1, 8);
+ if (idx != 0xff)
+ wrmsrq(__BITS_VALUE(reg1->reg, 1, 16),
+ nhmex_mbox_shared_reg_config(box, idx));
+
+ if (reg2->idx != EXTRA_REG_NONE) {
+ wrmsrq(reg2->reg, 0);
+ if (reg2->config != ~0ULL) {
+ wrmsrq(reg2->reg + 1,
+ reg2->config & NHMEX_M_PMON_ADDR_MATCH_MASK);
+ wrmsrq(reg2->reg + 2, NHMEX_M_PMON_ADDR_MASK_MASK &
+ (reg2->config >> NHMEX_M_PMON_ADDR_MASK_SHIFT));
+ wrmsrq(reg2->reg, NHMEX_M_PMON_MM_CFG_EN);
+ }
+ }
+
+ wrmsrq(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0);
+}
+
+DEFINE_UNCORE_FORMAT_ATTR(count_mode, count_mode, "config:2-3");
+DEFINE_UNCORE_FORMAT_ATTR(storage_mode, storage_mode, "config:4-5");
+DEFINE_UNCORE_FORMAT_ATTR(wrap_mode, wrap_mode, "config:6");
+DEFINE_UNCORE_FORMAT_ATTR(flag_mode, flag_mode, "config:7");
+DEFINE_UNCORE_FORMAT_ATTR(inc_sel, inc_sel, "config:9-13");
+DEFINE_UNCORE_FORMAT_ATTR(set_flag_sel, set_flag_sel, "config:19-21");
+DEFINE_UNCORE_FORMAT_ATTR(filter_cfg_en, filter_cfg_en, "config2:63");
+DEFINE_UNCORE_FORMAT_ATTR(filter_match, filter_match, "config2:0-33");
+DEFINE_UNCORE_FORMAT_ATTR(filter_mask, filter_mask, "config2:34-61");
+DEFINE_UNCORE_FORMAT_ATTR(dsp, dsp, "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(thr, thr, "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(fvc, fvc, "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(pgt, pgt, "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(map, map, "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(iss, iss, "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(pld, pld, "config1:32-63");
+
+static struct attribute *nhmex_uncore_mbox_formats_attr[] = {
+ &format_attr_count_mode.attr,
+ &format_attr_storage_mode.attr,
+ &format_attr_wrap_mode.attr,
+ &format_attr_flag_mode.attr,
+ &format_attr_inc_sel.attr,
+ &format_attr_set_flag_sel.attr,
+ &format_attr_filter_cfg_en.attr,
+ &format_attr_filter_match.attr,
+ &format_attr_filter_mask.attr,
+ &format_attr_dsp.attr,
+ &format_attr_thr.attr,
+ &format_attr_fvc.attr,
+ &format_attr_pgt.attr,
+ &format_attr_map.attr,
+ &format_attr_iss.attr,
+ &format_attr_pld.attr,
+ NULL,
+};
+
+static const struct attribute_group nhmex_uncore_mbox_format_group = {
+ .name = "format",
+ .attrs = nhmex_uncore_mbox_formats_attr,
+};
+
+static struct uncore_event_desc nhmex_uncore_mbox_events[] = {
+ INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x2800"),
+ INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x2820"),
+ { /* end: all zeroes */ },
+};
+
+static struct uncore_event_desc wsmex_uncore_mbox_events[] = {
+ INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x5000"),
+ INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x5040"),
+ { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_ops nhmex_uncore_mbox_ops = {
+ NHMEX_UNCORE_OPS_COMMON_INIT(),
+ .enable_event = nhmex_mbox_msr_enable_event,
+ .hw_config = nhmex_mbox_hw_config,
+ .get_constraint = nhmex_mbox_get_constraint,
+ .put_constraint = nhmex_mbox_put_constraint,
+};
+
+static struct intel_uncore_type nhmex_uncore_mbox = {
+ .name = "mbox",
+ .num_counters = 6,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .event_ctl = NHMEX_M0_MSR_PMU_CTL0,
+ .perf_ctr = NHMEX_M0_MSR_PMU_CNT0,
+ .event_mask = NHMEX_M_PMON_RAW_EVENT_MASK,
+ .box_ctl = NHMEX_M0_MSR_GLOBAL_CTL,
+ .msr_offset = NHMEX_M_MSR_OFFSET,
+ .pair_ctr_ctl = 1,
+ .num_shared_regs = 8,
+ .event_descs = nhmex_uncore_mbox_events,
+ .ops = &nhmex_uncore_mbox_ops,
+ .format_group = &nhmex_uncore_mbox_format_group,
+};
+
+static void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+
+ /* adjust the main event selector and extra register index */
+ if (reg1->idx % 2) {
+ reg1->idx--;
+ hwc->config -= 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
+ } else {
+ reg1->idx++;
+ hwc->config += 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
+ }
+
+ /* adjust extra register config */
+ switch (reg1->idx % 6) {
+ case 2:
+		/* shift bits 8~15 down to bits 0~7 */
+ reg1->config >>= 8;
+ break;
+ case 3:
+		/* shift bits 0~7 up to bits 8~15 */
+ reg1->config <<= 8;
+ break;
+ }
+}
+
+/*
+ * Each rbox has 4 event sets, which monitor PQI ports 0~3 or 4~7.
+ * An event set consists of 6 events; the 3rd and 4th events in
+ * an event set use the same extra register, so an event set uses
+ * 5 extra registers.
+ */
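+/*
+ * Worked example (illustrative): reg1->idx == 9 is the 4th event of the
+ * 2nd set, so idx = 9 % 6 = 3; it shares a register with idx 2, giving
+ * er_idx = (3 - 1) + (9 / 6) * 5 = 7.
+ */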
+static struct event_constraint *
+nhmex_rbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+ struct intel_uncore_extra_reg *er;
+ unsigned long flags;
+ int idx, er_idx;
+ u64 config1;
+ bool ok = false;
+
+ if (!uncore_box_is_fake(box) && reg1->alloc)
+ return NULL;
+
+ idx = reg1->idx % 6;
+ config1 = reg1->config;
+again:
+ er_idx = idx;
+ /* the 3rd and 4th events use the same extra register */
+ if (er_idx > 2)
+ er_idx--;
+ er_idx += (reg1->idx / 6) * 5;
+
+ er = &box->shared_regs[er_idx];
+ raw_spin_lock_irqsave(&er->lock, flags);
+ if (idx < 2) {
+ if (!atomic_read(&er->ref) || er->config == reg1->config) {
+ atomic_inc(&er->ref);
+ er->config = reg1->config;
+ ok = true;
+ }
+ } else if (idx == 2 || idx == 3) {
+		/*
+		 * these two events use different fields in an extra
+		 * register, bits 0~7 and bits 8~15 respectively.
+		 */
+ u64 mask = 0xff << ((idx - 2) * 8);
+ if (!__BITS_VALUE(atomic_read(&er->ref), idx - 2, 8) ||
+ !((er->config ^ config1) & mask)) {
+ atomic_add(1 << ((idx - 2) * 8), &er->ref);
+ er->config &= ~mask;
+ er->config |= config1 & mask;
+ ok = true;
+ }
+ } else {
+ if (!atomic_read(&er->ref) ||
+ (er->config == (hwc->config >> 32) &&
+ er->config1 == reg1->config &&
+ er->config2 == reg2->config)) {
+ atomic_inc(&er->ref);
+ er->config = (hwc->config >> 32);
+ er->config1 = reg1->config;
+ er->config2 = reg2->config;
+ ok = true;
+ }
+ }
+ raw_spin_unlock_irqrestore(&er->lock, flags);
+
+ if (!ok) {
+		/*
+		 * The Rbox events always come in pairs. The paired
+		 * events are functionally identical, but use different
+		 * extra registers. If we fail to take an extra
+		 * register, try the alternative.
+		 */
+ idx ^= 1;
+ if (idx != reg1->idx % 6) {
+ if (idx == 2)
+ config1 >>= 8;
+ else if (idx == 3)
+ config1 <<= 8;
+ goto again;
+ }
+ } else {
+ if (!uncore_box_is_fake(box)) {
+ if (idx != reg1->idx % 6)
+ nhmex_rbox_alter_er(box, event);
+ reg1->alloc = 1;
+ }
+ return NULL;
+ }
+ return &uncore_constraint_empty;
+}
+
+static void nhmex_rbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct intel_uncore_extra_reg *er;
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ int idx, er_idx;
+
+ if (uncore_box_is_fake(box) || !reg1->alloc)
+ return;
+
+ idx = reg1->idx % 6;
+ er_idx = idx;
+ if (er_idx > 2)
+ er_idx--;
+ er_idx += (reg1->idx / 6) * 5;
+
+ er = &box->shared_regs[er_idx];
+ if (idx == 2 || idx == 3)
+ atomic_sub(1 << ((idx - 2) * 8), &er->ref);
+ else
+ atomic_dec(&er->ref);
+
+ reg1->alloc = 0;
+}
+
+static int nhmex_rbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+ int idx;
+
+ idx = (event->hw.config & NHMEX_R_PMON_CTL_EV_SEL_MASK) >>
+ NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
+ if (idx >= 0x18)
+ return -EINVAL;
+
+ reg1->idx = idx;
+ reg1->config = event->attr.config1;
+
+ switch (idx % 6) {
+ case 4:
+ case 5:
+ hwc->config |= event->attr.config & (~0ULL << 32);
+ reg2->config = event->attr.config2;
+ break;
+ }
+ return 0;
+}
+
+static void nhmex_rbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+ int idx, port;
+
+ idx = reg1->idx;
+ port = idx / 6 + box->pmu->pmu_idx * 4;
+
+ switch (idx % 6) {
+ case 0:
+ wrmsrq(NHMEX_R_MSR_PORTN_IPERF_CFG0(port), reg1->config);
+ break;
+ case 1:
+ wrmsrq(NHMEX_R_MSR_PORTN_IPERF_CFG1(port), reg1->config);
+ break;
+ case 2:
+ case 3:
+ wrmsrq(NHMEX_R_MSR_PORTN_QLX_CFG(port),
+ uncore_shared_reg_config(box, 2 + (idx / 6) * 5));
+ break;
+ case 4:
+ wrmsrq(NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port),
+ hwc->config >> 32);
+ wrmsrq(NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(port), reg1->config);
+ wrmsrq(NHMEX_R_MSR_PORTN_XBR_SET1_MASK(port), reg2->config);
+ break;
+ case 5:
+ wrmsrq(NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port),
+ hwc->config >> 32);
+ wrmsrq(NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(port), reg1->config);
+ wrmsrq(NHMEX_R_MSR_PORTN_XBR_SET2_MASK(port), reg2->config);
+ break;
+ }
+
+ wrmsrq(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 |
+ (hwc->config & NHMEX_R_PMON_CTL_EV_SEL_MASK));
+}
+
+DEFINE_UNCORE_FORMAT_ATTR(xbr_mm_cfg, xbr_mm_cfg, "config:32-63");
+DEFINE_UNCORE_FORMAT_ATTR(xbr_match, xbr_match, "config1:0-63");
+DEFINE_UNCORE_FORMAT_ATTR(xbr_mask, xbr_mask, "config2:0-63");
+DEFINE_UNCORE_FORMAT_ATTR(qlx_cfg, qlx_cfg, "config1:0-15");
+DEFINE_UNCORE_FORMAT_ATTR(iperf_cfg, iperf_cfg, "config1:0-31");
+
+static struct attribute *nhmex_uncore_rbox_formats_attr[] = {
+ &format_attr_event5.attr,
+ &format_attr_xbr_mm_cfg.attr,
+ &format_attr_xbr_match.attr,
+ &format_attr_xbr_mask.attr,
+ &format_attr_qlx_cfg.attr,
+ &format_attr_iperf_cfg.attr,
+ NULL,
+};
+
+static const struct attribute_group nhmex_uncore_rbox_format_group = {
+ .name = "format",
+ .attrs = nhmex_uncore_rbox_formats_attr,
+};
+
+static struct uncore_event_desc nhmex_uncore_rbox_events[] = {
+ INTEL_UNCORE_EVENT_DESC(qpi0_flit_send, "event=0x0,iperf_cfg=0x80000000"),
+ INTEL_UNCORE_EVENT_DESC(qpi1_filt_send, "event=0x6,iperf_cfg=0x80000000"),
+ INTEL_UNCORE_EVENT_DESC(qpi0_idle_filt, "event=0x0,iperf_cfg=0x40000000"),
+ INTEL_UNCORE_EVENT_DESC(qpi1_idle_filt, "event=0x6,iperf_cfg=0x40000000"),
+ INTEL_UNCORE_EVENT_DESC(qpi0_date_response, "event=0x0,iperf_cfg=0xc4"),
+ INTEL_UNCORE_EVENT_DESC(qpi1_date_response, "event=0x6,iperf_cfg=0xc4"),
+ { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_ops nhmex_uncore_rbox_ops = {
+ NHMEX_UNCORE_OPS_COMMON_INIT(),
+ .enable_event = nhmex_rbox_msr_enable_event,
+ .hw_config = nhmex_rbox_hw_config,
+ .get_constraint = nhmex_rbox_get_constraint,
+ .put_constraint = nhmex_rbox_put_constraint,
+};
+
+static struct intel_uncore_type nhmex_uncore_rbox = {
+ .name = "rbox",
+ .num_counters = 8,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .event_ctl = NHMEX_R_MSR_PMON_CTL0,
+ .perf_ctr = NHMEX_R_MSR_PMON_CNT0,
+ .event_mask = NHMEX_R_PMON_RAW_EVENT_MASK,
+ .box_ctl = NHMEX_R_MSR_GLOBAL_CTL,
+ .msr_offset = NHMEX_R_MSR_OFFSET,
+ .pair_ctr_ctl = 1,
+ .num_shared_regs = 20,
+ .event_descs = nhmex_uncore_rbox_events,
+ .ops = &nhmex_uncore_rbox_ops,
+ .format_group = &nhmex_uncore_rbox_format_group
+};
+
+static struct intel_uncore_type *nhmex_msr_uncores[] = {
+ &nhmex_uncore_ubox,
+ &nhmex_uncore_cbox,
+ &nhmex_uncore_bbox,
+ &nhmex_uncore_sbox,
+ &nhmex_uncore_mbox,
+ &nhmex_uncore_rbox,
+ &nhmex_uncore_wbox,
+ NULL,
+};
+
+void nhmex_uncore_cpu_init(void)
+{
+ if (boot_cpu_data.x86_vfm == INTEL_NEHALEM_EX)
+ uncore_nhmex = true;
+ else
+ nhmex_uncore_mbox.event_descs = wsmex_uncore_mbox_events;
+ if (nhmex_uncore_cbox.num_boxes > topology_num_cores_per_package())
+ nhmex_uncore_cbox.num_boxes = topology_num_cores_per_package();
+ uncore_msr_uncores = nhmex_msr_uncores;
+}
+/* end of Nehalem-EX uncore support */
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
new file mode 100644
index 000000000000..807e582b8f17
--- /dev/null
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -0,0 +1,1936 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Nehalem/SandyBridge/Haswell/Broadwell/Skylake uncore support */
+#include <asm/msr.h>
+#include "uncore.h"
+#include "uncore_discovery.h"
+
+/* Uncore IMC PCI IDs */
+#define PCI_DEVICE_ID_INTEL_SNB_IMC 0x0100
+#define PCI_DEVICE_ID_INTEL_IVB_IMC 0x0154
+#define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150
+#define PCI_DEVICE_ID_INTEL_HSW_IMC 0x0c00
+#define PCI_DEVICE_ID_INTEL_HSW_U_IMC 0x0a04
+#define PCI_DEVICE_ID_INTEL_BDW_IMC 0x1604
+#define PCI_DEVICE_ID_INTEL_SKL_U_IMC 0x1904
+#define PCI_DEVICE_ID_INTEL_SKL_Y_IMC 0x190c
+#define PCI_DEVICE_ID_INTEL_SKL_HD_IMC 0x1900
+#define PCI_DEVICE_ID_INTEL_SKL_HQ_IMC 0x1910
+#define PCI_DEVICE_ID_INTEL_SKL_SD_IMC 0x190f
+#define PCI_DEVICE_ID_INTEL_SKL_SQ_IMC 0x191f
+#define PCI_DEVICE_ID_INTEL_SKL_E3_IMC 0x1918
+#define PCI_DEVICE_ID_INTEL_KBL_Y_IMC 0x590c
+#define PCI_DEVICE_ID_INTEL_KBL_U_IMC 0x5904
+#define PCI_DEVICE_ID_INTEL_KBL_UQ_IMC 0x5914
+#define PCI_DEVICE_ID_INTEL_KBL_SD_IMC 0x590f
+#define PCI_DEVICE_ID_INTEL_KBL_SQ_IMC 0x591f
+#define PCI_DEVICE_ID_INTEL_KBL_HQ_IMC 0x5910
+#define PCI_DEVICE_ID_INTEL_KBL_WQ_IMC 0x5918
+#define PCI_DEVICE_ID_INTEL_CFL_2U_IMC 0x3ecc
+#define PCI_DEVICE_ID_INTEL_CFL_4U_IMC 0x3ed0
+#define PCI_DEVICE_ID_INTEL_CFL_4H_IMC 0x3e10
+#define PCI_DEVICE_ID_INTEL_CFL_6H_IMC 0x3ec4
+#define PCI_DEVICE_ID_INTEL_CFL_2S_D_IMC 0x3e0f
+#define PCI_DEVICE_ID_INTEL_CFL_4S_D_IMC 0x3e1f
+#define PCI_DEVICE_ID_INTEL_CFL_6S_D_IMC 0x3ec2
+#define PCI_DEVICE_ID_INTEL_CFL_8S_D_IMC 0x3e30
+#define PCI_DEVICE_ID_INTEL_CFL_4S_W_IMC 0x3e18
+#define PCI_DEVICE_ID_INTEL_CFL_6S_W_IMC 0x3ec6
+#define PCI_DEVICE_ID_INTEL_CFL_8S_W_IMC 0x3e31
+#define PCI_DEVICE_ID_INTEL_CFL_4S_S_IMC 0x3e33
+#define PCI_DEVICE_ID_INTEL_CFL_6S_S_IMC 0x3eca
+#define PCI_DEVICE_ID_INTEL_CFL_8S_S_IMC 0x3e32
+#define PCI_DEVICE_ID_INTEL_AML_YD_IMC 0x590c
+#define PCI_DEVICE_ID_INTEL_AML_YQ_IMC 0x590d
+#define PCI_DEVICE_ID_INTEL_WHL_UQ_IMC 0x3ed0
+#define PCI_DEVICE_ID_INTEL_WHL_4_UQ_IMC 0x3e34
+#define PCI_DEVICE_ID_INTEL_WHL_UD_IMC 0x3e35
+#define PCI_DEVICE_ID_INTEL_CML_H1_IMC 0x9b44
+#define PCI_DEVICE_ID_INTEL_CML_H2_IMC 0x9b54
+#define PCI_DEVICE_ID_INTEL_CML_H3_IMC 0x9b64
+#define PCI_DEVICE_ID_INTEL_CML_U1_IMC 0x9b51
+#define PCI_DEVICE_ID_INTEL_CML_U2_IMC 0x9b61
+#define PCI_DEVICE_ID_INTEL_CML_U3_IMC 0x9b71
+#define PCI_DEVICE_ID_INTEL_CML_S1_IMC 0x9b33
+#define PCI_DEVICE_ID_INTEL_CML_S2_IMC 0x9b43
+#define PCI_DEVICE_ID_INTEL_CML_S3_IMC 0x9b53
+#define PCI_DEVICE_ID_INTEL_CML_S4_IMC 0x9b63
+#define PCI_DEVICE_ID_INTEL_CML_S5_IMC 0x9b73
+#define PCI_DEVICE_ID_INTEL_ICL_U_IMC 0x8a02
+#define PCI_DEVICE_ID_INTEL_ICL_U2_IMC 0x8a12
+#define PCI_DEVICE_ID_INTEL_TGL_U1_IMC 0x9a02
+#define PCI_DEVICE_ID_INTEL_TGL_U2_IMC 0x9a04
+#define PCI_DEVICE_ID_INTEL_TGL_U3_IMC 0x9a12
+#define PCI_DEVICE_ID_INTEL_TGL_U4_IMC 0x9a14
+#define PCI_DEVICE_ID_INTEL_TGL_H_IMC 0x9a36
+#define PCI_DEVICE_ID_INTEL_RKL_1_IMC 0x4c43
+#define PCI_DEVICE_ID_INTEL_RKL_2_IMC 0x4c53
+#define PCI_DEVICE_ID_INTEL_ADL_1_IMC 0x4660
+#define PCI_DEVICE_ID_INTEL_ADL_2_IMC 0x4641
+#define PCI_DEVICE_ID_INTEL_ADL_3_IMC 0x4601
+#define PCI_DEVICE_ID_INTEL_ADL_4_IMC 0x4602
+#define PCI_DEVICE_ID_INTEL_ADL_5_IMC 0x4609
+#define PCI_DEVICE_ID_INTEL_ADL_6_IMC 0x460a
+#define PCI_DEVICE_ID_INTEL_ADL_7_IMC 0x4621
+#define PCI_DEVICE_ID_INTEL_ADL_8_IMC 0x4623
+#define PCI_DEVICE_ID_INTEL_ADL_9_IMC 0x4629
+#define PCI_DEVICE_ID_INTEL_ADL_10_IMC 0x4637
+#define PCI_DEVICE_ID_INTEL_ADL_11_IMC 0x463b
+#define PCI_DEVICE_ID_INTEL_ADL_12_IMC 0x4648
+#define PCI_DEVICE_ID_INTEL_ADL_13_IMC 0x4649
+#define PCI_DEVICE_ID_INTEL_ADL_14_IMC 0x4650
+#define PCI_DEVICE_ID_INTEL_ADL_15_IMC 0x4668
+#define PCI_DEVICE_ID_INTEL_ADL_16_IMC 0x4670
+#define PCI_DEVICE_ID_INTEL_ADL_17_IMC 0x4614
+#define PCI_DEVICE_ID_INTEL_ADL_18_IMC 0x4617
+#define PCI_DEVICE_ID_INTEL_ADL_19_IMC 0x4618
+#define PCI_DEVICE_ID_INTEL_ADL_20_IMC 0x461B
+#define PCI_DEVICE_ID_INTEL_ADL_21_IMC 0x461C
+#define PCI_DEVICE_ID_INTEL_RPL_1_IMC 0xA700
+#define PCI_DEVICE_ID_INTEL_RPL_2_IMC 0xA702
+#define PCI_DEVICE_ID_INTEL_RPL_3_IMC 0xA706
+#define PCI_DEVICE_ID_INTEL_RPL_4_IMC 0xA709
+#define PCI_DEVICE_ID_INTEL_RPL_5_IMC 0xA701
+#define PCI_DEVICE_ID_INTEL_RPL_6_IMC 0xA703
+#define PCI_DEVICE_ID_INTEL_RPL_7_IMC 0xA704
+#define PCI_DEVICE_ID_INTEL_RPL_8_IMC 0xA705
+#define PCI_DEVICE_ID_INTEL_RPL_9_IMC 0xA706
+#define PCI_DEVICE_ID_INTEL_RPL_10_IMC 0xA707
+#define PCI_DEVICE_ID_INTEL_RPL_11_IMC 0xA708
+#define PCI_DEVICE_ID_INTEL_RPL_12_IMC 0xA709
+#define PCI_DEVICE_ID_INTEL_RPL_13_IMC 0xA70a
+#define PCI_DEVICE_ID_INTEL_RPL_14_IMC 0xA70b
+#define PCI_DEVICE_ID_INTEL_RPL_15_IMC 0xA715
+#define PCI_DEVICE_ID_INTEL_RPL_16_IMC 0xA716
+#define PCI_DEVICE_ID_INTEL_RPL_17_IMC 0xA717
+#define PCI_DEVICE_ID_INTEL_RPL_18_IMC 0xA718
+#define PCI_DEVICE_ID_INTEL_RPL_19_IMC 0xA719
+#define PCI_DEVICE_ID_INTEL_RPL_20_IMC 0xA71A
+#define PCI_DEVICE_ID_INTEL_RPL_21_IMC 0xA71B
+#define PCI_DEVICE_ID_INTEL_RPL_22_IMC 0xA71C
+#define PCI_DEVICE_ID_INTEL_RPL_23_IMC 0xA728
+#define PCI_DEVICE_ID_INTEL_RPL_24_IMC 0xA729
+#define PCI_DEVICE_ID_INTEL_RPL_25_IMC 0xA72A
+#define PCI_DEVICE_ID_INTEL_MTL_1_IMC 0x7d00
+#define PCI_DEVICE_ID_INTEL_MTL_2_IMC 0x7d01
+#define PCI_DEVICE_ID_INTEL_MTL_3_IMC 0x7d02
+#define PCI_DEVICE_ID_INTEL_MTL_4_IMC 0x7d05
+#define PCI_DEVICE_ID_INTEL_MTL_5_IMC 0x7d10
+#define PCI_DEVICE_ID_INTEL_MTL_6_IMC 0x7d14
+#define PCI_DEVICE_ID_INTEL_MTL_7_IMC 0x7d15
+#define PCI_DEVICE_ID_INTEL_MTL_8_IMC 0x7d16
+#define PCI_DEVICE_ID_INTEL_MTL_9_IMC 0x7d21
+#define PCI_DEVICE_ID_INTEL_MTL_10_IMC 0x7d22
+#define PCI_DEVICE_ID_INTEL_MTL_11_IMC 0x7d23
+#define PCI_DEVICE_ID_INTEL_MTL_12_IMC 0x7d24
+#define PCI_DEVICE_ID_INTEL_MTL_13_IMC 0x7d28
+
+
+#define IMC_UNCORE_DEV(a) \
+{ \
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_##a##_IMC), \
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), \
+}
+
+/* SNB event control */
+#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff
+#define SNB_UNC_CTL_UMASK_MASK 0x0000ff00
+#define SNB_UNC_CTL_EDGE_DET (1 << 18)
+#define SNB_UNC_CTL_EN (1 << 22)
+#define SNB_UNC_CTL_INVERT (1 << 23)
+#define SNB_UNC_CTL_CMASK_MASK 0x1f000000
+#define NHM_UNC_CTL_CMASK_MASK 0xff000000
+#define NHM_UNC_FIXED_CTR_CTL_EN (1 << 0)
+
+#define SNB_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
+ SNB_UNC_CTL_UMASK_MASK | \
+ SNB_UNC_CTL_EDGE_DET | \
+ SNB_UNC_CTL_INVERT | \
+ SNB_UNC_CTL_CMASK_MASK)
+
+#define NHM_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
+ SNB_UNC_CTL_UMASK_MASK | \
+ SNB_UNC_CTL_EDGE_DET | \
+ SNB_UNC_CTL_INVERT | \
+ NHM_UNC_CTL_CMASK_MASK)
+
+/* SNB global control register */
+#define SNB_UNC_PERF_GLOBAL_CTL 0x391
+#define SNB_UNC_FIXED_CTR_CTRL 0x394
+#define SNB_UNC_FIXED_CTR 0x395
+
+/* SNB uncore global control */
+#define SNB_UNC_GLOBAL_CTL_CORE_ALL ((1 << 4) - 1)
+#define SNB_UNC_GLOBAL_CTL_EN (1 << 29)
+
+/* SNB Cbo register */
+#define SNB_UNC_CBO_0_PERFEVTSEL0 0x700
+#define SNB_UNC_CBO_0_PER_CTR0 0x706
+#define SNB_UNC_CBO_MSR_OFFSET 0x10
+
+/* SNB ARB register */
+#define SNB_UNC_ARB_PER_CTR0 0x3b0
+#define SNB_UNC_ARB_PERFEVTSEL0 0x3b2
+#define SNB_UNC_ARB_MSR_OFFSET 0x10
+
+/* NHM global control register */
+#define NHM_UNC_PERF_GLOBAL_CTL 0x391
+#define NHM_UNC_FIXED_CTR 0x394
+#define NHM_UNC_FIXED_CTR_CTRL 0x395
+
+/* NHM uncore global control */
+#define NHM_UNC_GLOBAL_CTL_EN_PC_ALL ((1ULL << 8) - 1)
+#define NHM_UNC_GLOBAL_CTL_EN_FC (1ULL << 32)
+
+/* NHM uncore register */
+#define NHM_UNC_PERFEVTSEL0 0x3c0
+#define NHM_UNC_UNCORE_PMC0 0x3b0
+
+/* SKL uncore global control */
+#define SKL_UNC_PERF_GLOBAL_CTL 0xe01
+#define SKL_UNC_GLOBAL_CTL_CORE_ALL ((1 << 5) - 1)
+
+/* ICL Cbo register */
+#define ICL_UNC_CBO_CONFIG 0x396
+#define ICL_UNC_NUM_CBO_MASK 0xf
+#define ICL_UNC_CBO_0_PER_CTR0 0x702
+#define ICL_UNC_CBO_MSR_OFFSET 0x8
+
+/* ICL ARB register */
+#define ICL_UNC_ARB_PER_CTR 0x3b1
+#define ICL_UNC_ARB_PERFEVTSEL 0x3b3
+
+/* ADL uncore global control */
+#define ADL_UNC_PERF_GLOBAL_CTL 0x2ff0
+#define ADL_UNC_FIXED_CTR_CTRL 0x2fde
+#define ADL_UNC_FIXED_CTR 0x2fdf
+
+/* ADL Cbo register */
+#define ADL_UNC_CBO_0_PER_CTR0 0x2002
+#define ADL_UNC_CBO_0_PERFEVTSEL0 0x2000
+#define ADL_UNC_CTL_THRESHOLD 0x3f000000
+#define ADL_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
+ SNB_UNC_CTL_UMASK_MASK | \
+ SNB_UNC_CTL_EDGE_DET | \
+ SNB_UNC_CTL_INVERT | \
+ ADL_UNC_CTL_THRESHOLD)
+
+/* ADL ARB register */
+#define ADL_UNC_ARB_PER_CTR0 0x2FD2
+#define ADL_UNC_ARB_PERFEVTSEL0 0x2FD0
+#define ADL_UNC_ARB_MSR_OFFSET 0x8
+
+/* MTL Cbo register */
+#define MTL_UNC_CBO_0_PER_CTR0 0x2448
+#define MTL_UNC_CBO_0_PERFEVTSEL0 0x2442
+
+/* MTL HAC_ARB register */
+#define MTL_UNC_HAC_ARB_CTR 0x2018
+#define MTL_UNC_HAC_ARB_CTRL 0x2012
+
+/* MTL ARB register */
+#define MTL_UNC_ARB_CTR 0x2418
+#define MTL_UNC_ARB_CTRL 0x2412
+
+/* MTL cNCU register */
+#define MTL_UNC_CNCU_FIXED_CTR 0x2408
+#define MTL_UNC_CNCU_FIXED_CTRL 0x2402
+#define MTL_UNC_CNCU_BOX_CTL 0x240e
+
+/* MTL sNCU register */
+#define MTL_UNC_SNCU_FIXED_CTR 0x2008
+#define MTL_UNC_SNCU_FIXED_CTRL 0x2002
+#define MTL_UNC_SNCU_BOX_CTL 0x200e
+
+/* MTL HAC_CBO register */
+#define MTL_UNC_HBO_CTR 0x2048
+#define MTL_UNC_HBO_CTRL 0x2042
+
+DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(chmask, chmask, "config:8-11");
+DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
+DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
+DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28");
+DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31");
+DEFINE_UNCORE_FORMAT_ATTR(threshold, threshold, "config:24-29");
+DEFINE_UNCORE_FORMAT_ATTR(threshold2, threshold, "config:24-31");
+
+/* Sandy Bridge uncore support */
+static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (hwc->idx < UNCORE_PMC_IDX_FIXED)
+ wrmsrq(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
+ else
+ wrmsrq(hwc->config_base, SNB_UNC_CTL_EN);
+}
+
+static void snb_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ wrmsrq(event->hw.config_base, 0);
+}
+
+static void snb_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+ if (box->pmu->pmu_idx == 0) {
+ wrmsrq(SNB_UNC_PERF_GLOBAL_CTL,
+ SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL);
+ }
+}
+
+static void snb_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+ wrmsrq(SNB_UNC_PERF_GLOBAL_CTL,
+ SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL);
+}
+
+static void snb_uncore_msr_exit_box(struct intel_uncore_box *box)
+{
+ if (box->pmu->pmu_idx == 0)
+ wrmsrq(SNB_UNC_PERF_GLOBAL_CTL, 0);
+}
+
+static struct uncore_event_desc snb_uncore_events[] = {
+ INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
+ { /* end: all zeroes */ },
+};
+
+static struct attribute *snb_uncore_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_cmask5.attr,
+ NULL,
+};
+
+static const struct attribute_group snb_uncore_format_group = {
+ .name = "format",
+ .attrs = snb_uncore_formats_attr,
+};
+
+static struct intel_uncore_ops snb_uncore_msr_ops = {
+ .init_box = snb_uncore_msr_init_box,
+ .enable_box = snb_uncore_msr_enable_box,
+ .exit_box = snb_uncore_msr_exit_box,
+ .disable_event = snb_uncore_msr_disable_event,
+ .enable_event = snb_uncore_msr_enable_event,
+ .read_counter = uncore_msr_read_counter,
+};
+
+static struct event_constraint snb_uncore_arb_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x80, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x83, 0x1),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type snb_uncore_cbox = {
+ .name = "cbox",
+ .num_counters = 2,
+ .num_boxes = 4,
+ .perf_ctr_bits = 44,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = SNB_UNC_CBO_0_PER_CTR0,
+ .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0,
+ .fixed_ctr = SNB_UNC_FIXED_CTR,
+ .fixed_ctl = SNB_UNC_FIXED_CTR_CTRL,
+ .single_fixed = 1,
+ .event_mask = SNB_UNC_RAW_EVENT_MASK,
+ .msr_offset = SNB_UNC_CBO_MSR_OFFSET,
+ .ops = &snb_uncore_msr_ops,
+ .format_group = &snb_uncore_format_group,
+ .event_descs = snb_uncore_events,
+};
+
+static struct intel_uncore_type snb_uncore_arb = {
+ .name = "arb",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .perf_ctr_bits = 44,
+ .perf_ctr = SNB_UNC_ARB_PER_CTR0,
+ .event_ctl = SNB_UNC_ARB_PERFEVTSEL0,
+ .event_mask = SNB_UNC_RAW_EVENT_MASK,
+ .msr_offset = SNB_UNC_ARB_MSR_OFFSET,
+ .constraints = snb_uncore_arb_constraints,
+ .ops = &snb_uncore_msr_ops,
+ .format_group = &snb_uncore_format_group,
+};
+
+static struct intel_uncore_type *snb_msr_uncores[] = {
+ &snb_uncore_cbox,
+ &snb_uncore_arb,
+ NULL,
+};
+
+void snb_uncore_cpu_init(void)
+{
+ uncore_msr_uncores = snb_msr_uncores;
+ if (snb_uncore_cbox.num_boxes > topology_num_cores_per_package())
+ snb_uncore_cbox.num_boxes = topology_num_cores_per_package();
+}
+
+static void skl_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+ if (box->pmu->pmu_idx == 0) {
+ wrmsrq(SKL_UNC_PERF_GLOBAL_CTL,
+ SNB_UNC_GLOBAL_CTL_EN | SKL_UNC_GLOBAL_CTL_CORE_ALL);
+ }
+
+	/* The 8th CBOX has a different MSR space */
+ if (box->pmu->pmu_idx == 7)
+ __set_bit(UNCORE_BOX_FLAG_CFL8_CBOX_MSR_OFFS, &box->flags);
+}
+
+static void skl_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+ wrmsrq(SKL_UNC_PERF_GLOBAL_CTL,
+ SNB_UNC_GLOBAL_CTL_EN | SKL_UNC_GLOBAL_CTL_CORE_ALL);
+}
+
+static void skl_uncore_msr_exit_box(struct intel_uncore_box *box)
+{
+ if (box->pmu->pmu_idx == 0)
+ wrmsrq(SKL_UNC_PERF_GLOBAL_CTL, 0);
+}
+
+static struct intel_uncore_ops skl_uncore_msr_ops = {
+ .init_box = skl_uncore_msr_init_box,
+ .enable_box = skl_uncore_msr_enable_box,
+ .exit_box = skl_uncore_msr_exit_box,
+ .disable_event = snb_uncore_msr_disable_event,
+ .enable_event = snb_uncore_msr_enable_event,
+ .read_counter = uncore_msr_read_counter,
+};
+
+static struct intel_uncore_type skl_uncore_cbox = {
+ .name = "cbox",
+ .num_counters = 4,
+ .num_boxes = 8,
+ .perf_ctr_bits = 44,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = SNB_UNC_CBO_0_PER_CTR0,
+ .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0,
+ .fixed_ctr = SNB_UNC_FIXED_CTR,
+ .fixed_ctl = SNB_UNC_FIXED_CTR_CTRL,
+ .single_fixed = 1,
+ .event_mask = SNB_UNC_RAW_EVENT_MASK,
+ .msr_offset = SNB_UNC_CBO_MSR_OFFSET,
+ .ops = &skl_uncore_msr_ops,
+ .format_group = &snb_uncore_format_group,
+ .event_descs = snb_uncore_events,
+};
+
+static struct intel_uncore_type *skl_msr_uncores[] = {
+ &skl_uncore_cbox,
+ &snb_uncore_arb,
+ NULL,
+};
+
+void skl_uncore_cpu_init(void)
+{
+ uncore_msr_uncores = skl_msr_uncores;
+ if (skl_uncore_cbox.num_boxes > topology_num_cores_per_package())
+ skl_uncore_cbox.num_boxes = topology_num_cores_per_package();
+ snb_uncore_arb.ops = &skl_uncore_msr_ops;
+}
+
+static struct intel_uncore_ops icl_uncore_msr_ops = {
+ .disable_event = snb_uncore_msr_disable_event,
+ .enable_event = snb_uncore_msr_enable_event,
+ .read_counter = uncore_msr_read_counter,
+};
+
+static struct intel_uncore_type icl_uncore_cbox = {
+ .name = "cbox",
+ .num_counters = 2,
+ .perf_ctr_bits = 44,
+ .perf_ctr = ICL_UNC_CBO_0_PER_CTR0,
+ .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0,
+ .event_mask = SNB_UNC_RAW_EVENT_MASK,
+ .msr_offset = ICL_UNC_CBO_MSR_OFFSET,
+ .ops = &icl_uncore_msr_ops,
+ .format_group = &snb_uncore_format_group,
+};
+
+static struct uncore_event_desc icl_uncore_events[] = {
+ INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff"),
+ { /* end: all zeroes */ },
+};
+
+static struct attribute *icl_uncore_clock_formats_attr[] = {
+ &format_attr_event.attr,
+ NULL,
+};
+
+static struct attribute_group icl_uncore_clock_format_group = {
+ .name = "format",
+ .attrs = icl_uncore_clock_formats_attr,
+};
+
+static struct intel_uncore_type icl_uncore_clockbox = {
+ .name = "clock",
+ .num_counters = 1,
+ .num_boxes = 1,
+ .fixed_ctr_bits = 48,
+ .fixed_ctr = SNB_UNC_FIXED_CTR,
+ .fixed_ctl = SNB_UNC_FIXED_CTR_CTRL,
+ .single_fixed = 1,
+ .event_mask = SNB_UNC_CTL_EV_SEL_MASK,
+ .format_group = &icl_uncore_clock_format_group,
+ .ops = &icl_uncore_msr_ops,
+ .event_descs = icl_uncore_events,
+};
+
+static struct intel_uncore_type icl_uncore_arb = {
+ .name = "arb",
+ .num_counters = 1,
+ .num_boxes = 1,
+ .perf_ctr_bits = 44,
+ .perf_ctr = ICL_UNC_ARB_PER_CTR,
+ .event_ctl = ICL_UNC_ARB_PERFEVTSEL,
+ .event_mask = SNB_UNC_RAW_EVENT_MASK,
+ .ops = &icl_uncore_msr_ops,
+ .format_group = &snb_uncore_format_group,
+};
+
+static struct intel_uncore_type *icl_msr_uncores[] = {
+ &icl_uncore_cbox,
+ &icl_uncore_arb,
+ &icl_uncore_clockbox,
+ NULL,
+};
+
+static int icl_get_cbox_num(void)
+{
+ u64 num_boxes;
+
+ rdmsrq(ICL_UNC_CBO_CONFIG, num_boxes);
+
+ return num_boxes & ICL_UNC_NUM_CBO_MASK;
+}
+
+void icl_uncore_cpu_init(void)
+{
+ uncore_msr_uncores = icl_msr_uncores;
+ icl_uncore_cbox.num_boxes = icl_get_cbox_num();
+}
+
+static struct intel_uncore_type *tgl_msr_uncores[] = {
+ &icl_uncore_cbox,
+ &snb_uncore_arb,
+ &icl_uncore_clockbox,
+ NULL,
+};
+
+static void rkl_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+ if (box->pmu->pmu_idx == 0)
+ wrmsrq(SKL_UNC_PERF_GLOBAL_CTL, SNB_UNC_GLOBAL_CTL_EN);
+}
+
+void tgl_uncore_cpu_init(void)
+{
+ uncore_msr_uncores = tgl_msr_uncores;
+ icl_uncore_cbox.num_boxes = icl_get_cbox_num();
+ icl_uncore_cbox.ops = &skl_uncore_msr_ops;
+ icl_uncore_clockbox.ops = &skl_uncore_msr_ops;
+ snb_uncore_arb.ops = &skl_uncore_msr_ops;
+ skl_uncore_msr_ops.init_box = rkl_uncore_msr_init_box;
+}
+
+static void adl_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+ if (box->pmu->pmu_idx == 0)
+ wrmsrq(ADL_UNC_PERF_GLOBAL_CTL, SNB_UNC_GLOBAL_CTL_EN);
+}
+
+static void adl_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+ wrmsrq(ADL_UNC_PERF_GLOBAL_CTL, SNB_UNC_GLOBAL_CTL_EN);
+}
+
+static void adl_uncore_msr_disable_box(struct intel_uncore_box *box)
+{
+ if (box->pmu->pmu_idx == 0)
+ wrmsrq(ADL_UNC_PERF_GLOBAL_CTL, 0);
+}
+
+static void adl_uncore_msr_exit_box(struct intel_uncore_box *box)
+{
+ if (box->pmu->pmu_idx == 0)
+ wrmsrq(ADL_UNC_PERF_GLOBAL_CTL, 0);
+}
+
+static struct intel_uncore_ops adl_uncore_msr_ops = {
+ .init_box = adl_uncore_msr_init_box,
+ .enable_box = adl_uncore_msr_enable_box,
+ .disable_box = adl_uncore_msr_disable_box,
+ .exit_box = adl_uncore_msr_exit_box,
+ .disable_event = snb_uncore_msr_disable_event,
+ .enable_event = snb_uncore_msr_enable_event,
+ .read_counter = uncore_msr_read_counter,
+};
+
+static struct attribute *adl_uncore_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_threshold.attr,
+ NULL,
+};
+
+static const struct attribute_group adl_uncore_format_group = {
+ .name = "format",
+ .attrs = adl_uncore_formats_attr,
+};
+
+static struct intel_uncore_type adl_uncore_cbox = {
+ .name = "cbox",
+ .num_counters = 2,
+ .perf_ctr_bits = 44,
+ .perf_ctr = ADL_UNC_CBO_0_PER_CTR0,
+ .event_ctl = ADL_UNC_CBO_0_PERFEVTSEL0,
+ .event_mask = ADL_UNC_RAW_EVENT_MASK,
+ .msr_offset = ICL_UNC_CBO_MSR_OFFSET,
+ .ops = &adl_uncore_msr_ops,
+ .format_group = &adl_uncore_format_group,
+};
+
+static struct intel_uncore_type adl_uncore_arb = {
+ .name = "arb",
+ .num_counters = 2,
+ .num_boxes = 2,
+ .perf_ctr_bits = 44,
+ .perf_ctr = ADL_UNC_ARB_PER_CTR0,
+ .event_ctl = ADL_UNC_ARB_PERFEVTSEL0,
+ .event_mask = SNB_UNC_RAW_EVENT_MASK,
+ .msr_offset = ADL_UNC_ARB_MSR_OFFSET,
+ .constraints = snb_uncore_arb_constraints,
+ .ops = &adl_uncore_msr_ops,
+ .format_group = &snb_uncore_format_group,
+};
+
+static struct intel_uncore_type adl_uncore_clockbox = {
+ .name = "clock",
+ .num_counters = 1,
+ .num_boxes = 1,
+ .fixed_ctr_bits = 48,
+ .fixed_ctr = ADL_UNC_FIXED_CTR,
+ .fixed_ctl = ADL_UNC_FIXED_CTR_CTRL,
+ .single_fixed = 1,
+ .event_mask = SNB_UNC_CTL_EV_SEL_MASK,
+ .format_group = &icl_uncore_clock_format_group,
+ .ops = &adl_uncore_msr_ops,
+ .event_descs = icl_uncore_events,
+};
+
+static struct intel_uncore_type *adl_msr_uncores[] = {
+ &adl_uncore_cbox,
+ &adl_uncore_arb,
+ &adl_uncore_clockbox,
+ NULL,
+};
+
+void adl_uncore_cpu_init(void)
+{
+ adl_uncore_cbox.num_boxes = icl_get_cbox_num();
+ uncore_msr_uncores = adl_msr_uncores;
+}
+
+static struct intel_uncore_type mtl_uncore_cbox = {
+ .name = "cbox",
+ .num_counters = 2,
+ .perf_ctr_bits = 48,
+ .perf_ctr = MTL_UNC_CBO_0_PER_CTR0,
+ .event_ctl = MTL_UNC_CBO_0_PERFEVTSEL0,
+ .event_mask = ADL_UNC_RAW_EVENT_MASK,
+ .msr_offset = SNB_UNC_CBO_MSR_OFFSET,
+ .ops = &icl_uncore_msr_ops,
+ .format_group = &adl_uncore_format_group,
+};
+
+static struct intel_uncore_type mtl_uncore_hac_arb = {
+ .name = "hac_arb",
+ .num_counters = 2,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .perf_ctr = MTL_UNC_HAC_ARB_CTR,
+ .event_ctl = MTL_UNC_HAC_ARB_CTRL,
+ .event_mask = ADL_UNC_RAW_EVENT_MASK,
+ .msr_offset = SNB_UNC_CBO_MSR_OFFSET,
+ .ops = &icl_uncore_msr_ops,
+ .format_group = &adl_uncore_format_group,
+};
+
+static struct intel_uncore_type mtl_uncore_arb = {
+ .name = "arb",
+ .num_counters = 2,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .perf_ctr = MTL_UNC_ARB_CTR,
+ .event_ctl = MTL_UNC_ARB_CTRL,
+ .event_mask = ADL_UNC_RAW_EVENT_MASK,
+ .msr_offset = SNB_UNC_CBO_MSR_OFFSET,
+ .ops = &icl_uncore_msr_ops,
+ .format_group = &adl_uncore_format_group,
+};
+
+static struct intel_uncore_type mtl_uncore_hac_cbox = {
+ .name = "hac_cbox",
+ .num_counters = 2,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .perf_ctr = MTL_UNC_HBO_CTR,
+ .event_ctl = MTL_UNC_HBO_CTRL,
+ .event_mask = ADL_UNC_RAW_EVENT_MASK,
+ .msr_offset = SNB_UNC_CBO_MSR_OFFSET,
+ .ops = &icl_uncore_msr_ops,
+ .format_group = &adl_uncore_format_group,
+};
+
+static void mtl_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+ wrmsrq(uncore_msr_box_ctl(box), SNB_UNC_GLOBAL_CTL_EN);
+}
+
+static struct intel_uncore_ops mtl_uncore_msr_ops = {
+ .init_box = mtl_uncore_msr_init_box,
+ .disable_event = snb_uncore_msr_disable_event,
+ .enable_event = snb_uncore_msr_enable_event,
+ .read_counter = uncore_msr_read_counter,
+};
+
+static struct intel_uncore_type mtl_uncore_cncu = {
+ .name = "cncu",
+ .num_counters = 1,
+ .num_boxes = 1,
+ .box_ctl = MTL_UNC_CNCU_BOX_CTL,
+ .fixed_ctr_bits = 48,
+ .fixed_ctr = MTL_UNC_CNCU_FIXED_CTR,
+ .fixed_ctl = MTL_UNC_CNCU_FIXED_CTRL,
+ .single_fixed = 1,
+ .event_mask = SNB_UNC_CTL_EV_SEL_MASK,
+ .format_group = &icl_uncore_clock_format_group,
+ .ops = &mtl_uncore_msr_ops,
+ .event_descs = icl_uncore_events,
+};
+
+static struct intel_uncore_type mtl_uncore_sncu = {
+ .name = "sncu",
+ .num_counters = 1,
+ .num_boxes = 1,
+ .box_ctl = MTL_UNC_SNCU_BOX_CTL,
+ .fixed_ctr_bits = 48,
+ .fixed_ctr = MTL_UNC_SNCU_FIXED_CTR,
+ .fixed_ctl = MTL_UNC_SNCU_FIXED_CTRL,
+ .single_fixed = 1,
+ .event_mask = SNB_UNC_CTL_EV_SEL_MASK,
+ .format_group = &icl_uncore_clock_format_group,
+ .ops = &mtl_uncore_msr_ops,
+ .event_descs = icl_uncore_events,
+};
+
+static struct intel_uncore_type *mtl_msr_uncores[] = {
+ &mtl_uncore_cbox,
+ &mtl_uncore_hac_arb,
+ &mtl_uncore_arb,
+ &mtl_uncore_hac_cbox,
+ &mtl_uncore_cncu,
+ &mtl_uncore_sncu,
+ NULL
+};
+
+void mtl_uncore_cpu_init(void)
+{
+ mtl_uncore_cbox.num_boxes = icl_get_cbox_num();
+ uncore_msr_uncores = mtl_msr_uncores;
+}
+
+static struct intel_uncore_type *lnl_msr_uncores[] = {
+ &mtl_uncore_cbox,
+ &mtl_uncore_arb,
+ NULL
+};
+
+#define LNL_UNC_MSR_GLOBAL_CTL 0x240e
+
+static void lnl_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+ if (box->pmu->pmu_idx == 0)
+ wrmsrq(LNL_UNC_MSR_GLOBAL_CTL, SNB_UNC_GLOBAL_CTL_EN);
+}
+
+static struct intel_uncore_ops lnl_uncore_msr_ops = {
+ .init_box = lnl_uncore_msr_init_box,
+ .disable_event = snb_uncore_msr_disable_event,
+ .enable_event = snb_uncore_msr_enable_event,
+ .read_counter = uncore_msr_read_counter,
+};
+
+void lnl_uncore_cpu_init(void)
+{
+ mtl_uncore_cbox.num_boxes = 4;
+ mtl_uncore_cbox.ops = &lnl_uncore_msr_ops;
+ uncore_msr_uncores = lnl_msr_uncores;
+}
+
+enum {
+ SNB_PCI_UNCORE_IMC,
+};
+
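+/*
+ * The 6.103515625e-5 scale below equals 64 / 2^20: each free-running
+ * count appears to represent one 64-byte line, so the scaled output
+ * reads in MiB.
+ */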
+static struct uncore_event_desc snb_uncore_imc_events[] = {
+ INTEL_UNCORE_EVENT_DESC(data_reads, "event=0x01"),
+ INTEL_UNCORE_EVENT_DESC(data_reads.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(data_reads.unit, "MiB"),
+
+ INTEL_UNCORE_EVENT_DESC(data_writes, "event=0x02"),
+ INTEL_UNCORE_EVENT_DESC(data_writes.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(data_writes.unit, "MiB"),
+
+ INTEL_UNCORE_EVENT_DESC(gt_requests, "event=0x03"),
+ INTEL_UNCORE_EVENT_DESC(gt_requests.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(gt_requests.unit, "MiB"),
+
+ INTEL_UNCORE_EVENT_DESC(ia_requests, "event=0x04"),
+ INTEL_UNCORE_EVENT_DESC(ia_requests.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(ia_requests.unit, "MiB"),
+
+ INTEL_UNCORE_EVENT_DESC(io_requests, "event=0x05"),
+ INTEL_UNCORE_EVENT_DESC(io_requests.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(io_requests.unit, "MiB"),
+
+ { /* end: all zeroes */ },
+};
+
+#define SNB_UNCORE_PCI_IMC_EVENT_MASK 0xff
+#define SNB_UNCORE_PCI_IMC_BAR_OFFSET 0x48
+
+/* page size multiple covering all config regs */
+#define SNB_UNCORE_PCI_IMC_MAP_SIZE 0x6000
+
+#define SNB_UNCORE_PCI_IMC_DATA_READS 0x1
+#define SNB_UNCORE_PCI_IMC_DATA_READS_BASE 0x5050
+#define SNB_UNCORE_PCI_IMC_DATA_WRITES 0x2
+#define SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE 0x5054
+#define SNB_UNCORE_PCI_IMC_CTR_BASE SNB_UNCORE_PCI_IMC_DATA_READS_BASE
+
+/* BW breakdown - legacy counters */
+#define SNB_UNCORE_PCI_IMC_GT_REQUESTS 0x3
+#define SNB_UNCORE_PCI_IMC_GT_REQUESTS_BASE 0x5040
+#define SNB_UNCORE_PCI_IMC_IA_REQUESTS 0x4
+#define SNB_UNCORE_PCI_IMC_IA_REQUESTS_BASE 0x5044
+#define SNB_UNCORE_PCI_IMC_IO_REQUESTS 0x5
+#define SNB_UNCORE_PCI_IMC_IO_REQUESTS_BASE 0x5048
+
+enum perf_snb_uncore_imc_freerunning_types {
+ SNB_PCI_UNCORE_IMC_DATA_READS = 0,
+ SNB_PCI_UNCORE_IMC_DATA_WRITES,
+ SNB_PCI_UNCORE_IMC_GT_REQUESTS,
+ SNB_PCI_UNCORE_IMC_IA_REQUESTS,
+ SNB_PCI_UNCORE_IMC_IO_REQUESTS,
+
+ SNB_PCI_UNCORE_IMC_FREERUNNING_TYPE_MAX,
+};
+
+static struct freerunning_counters snb_uncore_imc_freerunning[] = {
+ [SNB_PCI_UNCORE_IMC_DATA_READS] = { SNB_UNCORE_PCI_IMC_DATA_READS_BASE,
+ 0x0, 0x0, 1, 32 },
+ [SNB_PCI_UNCORE_IMC_DATA_WRITES] = { SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE,
+ 0x0, 0x0, 1, 32 },
+ [SNB_PCI_UNCORE_IMC_GT_REQUESTS] = { SNB_UNCORE_PCI_IMC_GT_REQUESTS_BASE,
+ 0x0, 0x0, 1, 32 },
+ [SNB_PCI_UNCORE_IMC_IA_REQUESTS] = { SNB_UNCORE_PCI_IMC_IA_REQUESTS_BASE,
+ 0x0, 0x0, 1, 32 },
+ [SNB_PCI_UNCORE_IMC_IO_REQUESTS] = { SNB_UNCORE_PCI_IMC_IO_REQUESTS_BASE,
+ 0x0, 0x0, 1, 32 },
+};
+
+static struct attribute *snb_uncore_imc_formats_attr[] = {
+ &format_attr_event.attr,
+ NULL,
+};
+
+static const struct attribute_group snb_uncore_imc_format_group = {
+ .name = "format",
+ .attrs = snb_uncore_imc_formats_attr,
+};
+
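+/*
+ * The IMC counters live in MMIO space behind the BAR read from config
+ * offset SNB_UNCORE_PCI_IMC_BAR_OFFSET (0x48). The low BAR flag bits
+ * are stripped by page-aligning the address before it is ioremap()ed
+ * with the type's mmio_map_size.
+ */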
+static void snb_uncore_imc_init_box(struct intel_uncore_box *box)
+{
+ struct intel_uncore_type *type = box->pmu->type;
+ struct pci_dev *pdev = box->pci_dev;
+ int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET;
+ resource_size_t addr;
+ u32 pci_dword;
+
+ pci_read_config_dword(pdev, where, &pci_dword);
+ addr = pci_dword;
+
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+ pci_read_config_dword(pdev, where + 4, &pci_dword);
+ addr |= ((resource_size_t)pci_dword << 32);
+#endif
+
+ addr &= ~(PAGE_SIZE - 1);
+
+ box->io_addr = ioremap(addr, type->mmio_map_size);
+ if (!box->io_addr)
+ pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name);
+
+ box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL;
+}
+
+static void snb_uncore_imc_enable_box(struct intel_uncore_box *box)
+{}
+
+static void snb_uncore_imc_disable_box(struct intel_uncore_box *box)
+{}
+
+static void snb_uncore_imc_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{}
+
+static void snb_uncore_imc_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{}
+
+/*
+ * Keep the custom event_init() function compatible with old event
+ * encoding for free running counters.
+ */
+static int snb_uncore_imc_event_init(struct perf_event *event)
+{
+ struct intel_uncore_pmu *pmu;
+ struct intel_uncore_box *box;
+ struct hw_perf_event *hwc = &event->hw;
+ u64 cfg = event->attr.config & SNB_UNCORE_PCI_IMC_EVENT_MASK;
+ int idx, base;
+
+ if (event->attr.type != event->pmu->type)
+ return -ENOENT;
+
+ pmu = uncore_event_to_pmu(event);
+ /* no device found for this pmu */
+ if (!pmu->registered)
+ return -ENOENT;
+
+ /* Sampling not supported yet */
+ if (hwc->sample_period)
+ return -EINVAL;
+
+ /* unsupported modes and filters */
+ if (event->attr.sample_period) /* no sampling */
+ return -EINVAL;
+
+ /*
+ * Place all uncore events for a particular physical package
+ * onto a single cpu
+ */
+ if (event->cpu < 0)
+ return -EINVAL;
+
+ /* check only supported bits are set */
+ if (event->attr.config & ~SNB_UNCORE_PCI_IMC_EVENT_MASK)
+ return -EINVAL;
+
+ box = uncore_pmu_to_box(pmu, event->cpu);
+ if (!box || box->cpu < 0)
+ return -EINVAL;
+
+ event->cpu = box->cpu;
+ event->pmu_private = box;
+
+ event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
+
+ event->hw.idx = -1;
+ event->hw.last_tag = ~0ULL;
+ event->hw.extra_reg.idx = EXTRA_REG_NONE;
+ event->hw.branch_reg.idx = EXTRA_REG_NONE;
+ /*
+ * check event is known (whitelist, determines counter)
+ */
+ switch (cfg) {
+ case SNB_UNCORE_PCI_IMC_DATA_READS:
+ base = SNB_UNCORE_PCI_IMC_DATA_READS_BASE;
+ idx = UNCORE_PMC_IDX_FREERUNNING;
+ break;
+ case SNB_UNCORE_PCI_IMC_DATA_WRITES:
+ base = SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE;
+ idx = UNCORE_PMC_IDX_FREERUNNING;
+ break;
+ case SNB_UNCORE_PCI_IMC_GT_REQUESTS:
+ base = SNB_UNCORE_PCI_IMC_GT_REQUESTS_BASE;
+ idx = UNCORE_PMC_IDX_FREERUNNING;
+ break;
+ case SNB_UNCORE_PCI_IMC_IA_REQUESTS:
+ base = SNB_UNCORE_PCI_IMC_IA_REQUESTS_BASE;
+ idx = UNCORE_PMC_IDX_FREERUNNING;
+ break;
+ case SNB_UNCORE_PCI_IMC_IO_REQUESTS:
+ base = SNB_UNCORE_PCI_IMC_IO_REQUESTS_BASE;
+ idx = UNCORE_PMC_IDX_FREERUNNING;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ /* must be done before validate_group */
+ event->hw.event_base = base;
+ event->hw.idx = idx;
+
+ /* Convert to standard encoding format for freerunning counters */
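+ /*
+ * For example, data_reads (cfg == 0x1) becomes 0x10ff and io_requests
+ * (cfg == 0x5) becomes 0x14ff, i.e. event=0xff with a umask in the
+ * 0x10-0x1f range, matching the free-running encoding used by e.g.
+ * tgl_uncore_imc_events below.
+ */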
+ event->hw.config = ((cfg - 1) << 8) | 0x10ff;
+
+ /* no group validation needed, we have free running counters */
+
+ return 0;
+}
+
+static int snb_uncore_imc_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ return 0;
+}
+
+int snb_pci2phy_map_init(int devid)
+{
+ struct pci_dev *dev = NULL;
+ struct pci2phy_map *map;
+ int bus, segment;
+
+ dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, dev);
+ if (!dev)
+ return -ENOTTY;
+
+ bus = dev->bus->number;
+ segment = pci_domain_nr(dev->bus);
+
+ raw_spin_lock(&pci2phy_map_lock);
+ map = __find_pci2phy_map(segment);
+ if (!map) {
+ raw_spin_unlock(&pci2phy_map_lock);
+ pci_dev_put(dev);
+ return -ENOMEM;
+ }
+ map->pbus_to_dieid[bus] = 0;
+ raw_spin_unlock(&pci2phy_map_lock);
+
+ pci_dev_put(dev);
+
+ return 0;
+}
+
+static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ /*
+ * SNB IMC counters are 32-bit and are laid out back to back
+ * in MMIO space. Therefore we must use a 32-bit accessor function;
+ * using readq() from uncore_mmio_read_counter() causes problems
+ * because it reads 64 bits at a time. That is okay for
+ * uncore_perf_event_update(), which drops the upper 32 bits, but not
+ * for plain uncore_read_counter() as invoked from
+ * uncore_pmu_event_start().
+ */
+ return (u64)readl(box->io_addr + hwc->event_base);
+}
+
+static struct pmu snb_uncore_imc_pmu = {
+ .task_ctx_nr = perf_invalid_context,
+ .event_init = snb_uncore_imc_event_init,
+ .add = uncore_pmu_event_add,
+ .del = uncore_pmu_event_del,
+ .start = uncore_pmu_event_start,
+ .stop = uncore_pmu_event_stop,
+ .read = uncore_pmu_event_read,
+ .capabilities = PERF_PMU_CAP_NO_EXCLUDE,
+};
+
+static struct intel_uncore_ops snb_uncore_imc_ops = {
+ .init_box = snb_uncore_imc_init_box,
+ .exit_box = uncore_mmio_exit_box,
+ .enable_box = snb_uncore_imc_enable_box,
+ .disable_box = snb_uncore_imc_disable_box,
+ .disable_event = snb_uncore_imc_disable_event,
+ .enable_event = snb_uncore_imc_enable_event,
+ .hw_config = snb_uncore_imc_hw_config,
+ .read_counter = snb_uncore_imc_read_counter,
+};
+
+static struct intel_uncore_type snb_uncore_imc = {
+ .name = "imc",
+ .num_counters = 5,
+ .num_boxes = 1,
+ .num_freerunning_types = SNB_PCI_UNCORE_IMC_FREERUNNING_TYPE_MAX,
+ .mmio_map_size = SNB_UNCORE_PCI_IMC_MAP_SIZE,
+ .freerunning = snb_uncore_imc_freerunning,
+ .event_descs = snb_uncore_imc_events,
+ .format_group = &snb_uncore_imc_format_group,
+ .ops = &snb_uncore_imc_ops,
+ .pmu = &snb_uncore_imc_pmu,
+};
+
+static struct intel_uncore_type *snb_pci_uncores[] = {
+ [SNB_PCI_UNCORE_IMC] = &snb_uncore_imc,
+ NULL,
+};
+
+static const struct pci_device_id snb_uncore_pci_ids[] = {
+ IMC_UNCORE_DEV(SNB),
+ { /* end: all zeroes */ },
+};
+
+static const struct pci_device_id ivb_uncore_pci_ids[] = {
+ IMC_UNCORE_DEV(IVB),
+ IMC_UNCORE_DEV(IVB_E3),
+ { /* end: all zeroes */ },
+};
+
+static const struct pci_device_id hsw_uncore_pci_ids[] = {
+ IMC_UNCORE_DEV(HSW),
+ IMC_UNCORE_DEV(HSW_U),
+ { /* end: all zeroes */ },
+};
+
+static const struct pci_device_id bdw_uncore_pci_ids[] = {
+ IMC_UNCORE_DEV(BDW),
+ { /* end: all zeroes */ },
+};
+
+static const struct pci_device_id skl_uncore_pci_ids[] = {
+ IMC_UNCORE_DEV(SKL_Y),
+ IMC_UNCORE_DEV(SKL_U),
+ IMC_UNCORE_DEV(SKL_HD),
+ IMC_UNCORE_DEV(SKL_HQ),
+ IMC_UNCORE_DEV(SKL_SD),
+ IMC_UNCORE_DEV(SKL_SQ),
+ IMC_UNCORE_DEV(SKL_E3),
+ IMC_UNCORE_DEV(KBL_Y),
+ IMC_UNCORE_DEV(KBL_U),
+ IMC_UNCORE_DEV(KBL_UQ),
+ IMC_UNCORE_DEV(KBL_SD),
+ IMC_UNCORE_DEV(KBL_SQ),
+ IMC_UNCORE_DEV(KBL_HQ),
+ IMC_UNCORE_DEV(KBL_WQ),
+ IMC_UNCORE_DEV(CFL_2U),
+ IMC_UNCORE_DEV(CFL_4U),
+ IMC_UNCORE_DEV(CFL_4H),
+ IMC_UNCORE_DEV(CFL_6H),
+ IMC_UNCORE_DEV(CFL_2S_D),
+ IMC_UNCORE_DEV(CFL_4S_D),
+ IMC_UNCORE_DEV(CFL_6S_D),
+ IMC_UNCORE_DEV(CFL_8S_D),
+ IMC_UNCORE_DEV(CFL_4S_W),
+ IMC_UNCORE_DEV(CFL_6S_W),
+ IMC_UNCORE_DEV(CFL_8S_W),
+ IMC_UNCORE_DEV(CFL_4S_S),
+ IMC_UNCORE_DEV(CFL_6S_S),
+ IMC_UNCORE_DEV(CFL_8S_S),
+ IMC_UNCORE_DEV(AML_YD),
+ IMC_UNCORE_DEV(AML_YQ),
+ IMC_UNCORE_DEV(WHL_UQ),
+ IMC_UNCORE_DEV(WHL_4_UQ),
+ IMC_UNCORE_DEV(WHL_UD),
+ IMC_UNCORE_DEV(CML_H1),
+ IMC_UNCORE_DEV(CML_H2),
+ IMC_UNCORE_DEV(CML_H3),
+ IMC_UNCORE_DEV(CML_U1),
+ IMC_UNCORE_DEV(CML_U2),
+ IMC_UNCORE_DEV(CML_U3),
+ IMC_UNCORE_DEV(CML_S1),
+ IMC_UNCORE_DEV(CML_S2),
+ IMC_UNCORE_DEV(CML_S3),
+ IMC_UNCORE_DEV(CML_S4),
+ IMC_UNCORE_DEV(CML_S5),
+ { /* end: all zeroes */ },
+};
+
+static const struct pci_device_id icl_uncore_pci_ids[] = {
+ IMC_UNCORE_DEV(ICL_U),
+ IMC_UNCORE_DEV(ICL_U2),
+ IMC_UNCORE_DEV(RKL_1),
+ IMC_UNCORE_DEV(RKL_2),
+ { /* end: all zeroes */ },
+};
+
+static struct pci_driver snb_uncore_pci_driver = {
+ .name = "snb_uncore",
+ .id_table = snb_uncore_pci_ids,
+};
+
+static struct pci_driver ivb_uncore_pci_driver = {
+ .name = "ivb_uncore",
+ .id_table = ivb_uncore_pci_ids,
+};
+
+static struct pci_driver hsw_uncore_pci_driver = {
+ .name = "hsw_uncore",
+ .id_table = hsw_uncore_pci_ids,
+};
+
+static struct pci_driver bdw_uncore_pci_driver = {
+ .name = "bdw_uncore",
+ .id_table = bdw_uncore_pci_ids,
+};
+
+static struct pci_driver skl_uncore_pci_driver = {
+ .name = "skl_uncore",
+ .id_table = skl_uncore_pci_ids,
+};
+
+static struct pci_driver icl_uncore_pci_driver = {
+ .name = "icl_uncore",
+ .id_table = icl_uncore_pci_ids,
+};
+
+struct imc_uncore_pci_dev {
+ __u32 pci_id;
+ struct pci_driver *driver;
+};
+#define IMC_DEV(a, d) \
+ { .pci_id = PCI_DEVICE_ID_INTEL_##a, .driver = (d) }
+
+static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = {
+ IMC_DEV(SNB_IMC, &snb_uncore_pci_driver),
+ IMC_DEV(IVB_IMC, &ivb_uncore_pci_driver), /* 3rd Gen Core processor */
+ IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 v2/3rd Gen Core processor */
+ IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core Processor */
+ IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core ULT Mobile Processor */
+ IMC_DEV(BDW_IMC, &bdw_uncore_pci_driver), /* 5th Gen Core U */
+ IMC_DEV(SKL_Y_IMC, &skl_uncore_pci_driver), /* 6th Gen Core Y */
+ IMC_DEV(SKL_U_IMC, &skl_uncore_pci_driver), /* 6th Gen Core U */
+ IMC_DEV(SKL_HD_IMC, &skl_uncore_pci_driver), /* 6th Gen Core H Dual Core */
+ IMC_DEV(SKL_HQ_IMC, &skl_uncore_pci_driver), /* 6th Gen Core H Quad Core */
+ IMC_DEV(SKL_SD_IMC, &skl_uncore_pci_driver), /* 6th Gen Core S Dual Core */
+ IMC_DEV(SKL_SQ_IMC, &skl_uncore_pci_driver), /* 6th Gen Core S Quad Core */
+ IMC_DEV(SKL_E3_IMC, &skl_uncore_pci_driver), /* Xeon E3 V5 Gen Core processor */
+ IMC_DEV(KBL_Y_IMC, &skl_uncore_pci_driver), /* 7th Gen Core Y */
+ IMC_DEV(KBL_U_IMC, &skl_uncore_pci_driver), /* 7th Gen Core U */
+ IMC_DEV(KBL_UQ_IMC, &skl_uncore_pci_driver), /* 7th Gen Core U Quad Core */
+ IMC_DEV(KBL_SD_IMC, &skl_uncore_pci_driver), /* 7th Gen Core S Dual Core */
+ IMC_DEV(KBL_SQ_IMC, &skl_uncore_pci_driver), /* 7th Gen Core S Quad Core */
+ IMC_DEV(KBL_HQ_IMC, &skl_uncore_pci_driver), /* 7th Gen Core H Quad Core */
+ IMC_DEV(KBL_WQ_IMC, &skl_uncore_pci_driver), /* 7th Gen Core S 4 cores Work Station */
+ IMC_DEV(CFL_2U_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U 2 Cores */
+ IMC_DEV(CFL_4U_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U 4 Cores */
+ IMC_DEV(CFL_4H_IMC, &skl_uncore_pci_driver), /* 8th Gen Core H 4 Cores */
+ IMC_DEV(CFL_6H_IMC, &skl_uncore_pci_driver), /* 8th Gen Core H 6 Cores */
+ IMC_DEV(CFL_2S_D_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 2 Cores Desktop */
+ IMC_DEV(CFL_4S_D_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 4 Cores Desktop */
+ IMC_DEV(CFL_6S_D_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 6 Cores Desktop */
+ IMC_DEV(CFL_8S_D_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 8 Cores Desktop */
+ IMC_DEV(CFL_4S_W_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 4 Cores Work Station */
+ IMC_DEV(CFL_6S_W_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 6 Cores Work Station */
+ IMC_DEV(CFL_8S_W_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 8 Cores Work Station */
+ IMC_DEV(CFL_4S_S_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 4 Cores Server */
+ IMC_DEV(CFL_6S_S_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 6 Cores Server */
+ IMC_DEV(CFL_8S_S_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 8 Cores Server */
+ IMC_DEV(AML_YD_IMC, &skl_uncore_pci_driver), /* 8th Gen Core Y Mobile Dual Core */
+ IMC_DEV(AML_YQ_IMC, &skl_uncore_pci_driver), /* 8th Gen Core Y Mobile Quad Core */
+ IMC_DEV(WHL_UQ_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U Mobile Quad Core */
+ IMC_DEV(WHL_4_UQ_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U Mobile Quad Core */
+ IMC_DEV(WHL_UD_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U Mobile Dual Core */
+ IMC_DEV(CML_H1_IMC, &skl_uncore_pci_driver),
+ IMC_DEV(CML_H2_IMC, &skl_uncore_pci_driver),
+ IMC_DEV(CML_H3_IMC, &skl_uncore_pci_driver),
+ IMC_DEV(CML_U1_IMC, &skl_uncore_pci_driver),
+ IMC_DEV(CML_U2_IMC, &skl_uncore_pci_driver),
+ IMC_DEV(CML_U3_IMC, &skl_uncore_pci_driver),
+ IMC_DEV(CML_S1_IMC, &skl_uncore_pci_driver),
+ IMC_DEV(CML_S2_IMC, &skl_uncore_pci_driver),
+ IMC_DEV(CML_S3_IMC, &skl_uncore_pci_driver),
+ IMC_DEV(CML_S4_IMC, &skl_uncore_pci_driver),
+ IMC_DEV(CML_S5_IMC, &skl_uncore_pci_driver),
+ IMC_DEV(ICL_U_IMC, &icl_uncore_pci_driver), /* 10th Gen Core Mobile */
+ IMC_DEV(ICL_U2_IMC, &icl_uncore_pci_driver), /* 10th Gen Core Mobile */
+ IMC_DEV(RKL_1_IMC, &icl_uncore_pci_driver),
+ IMC_DEV(RKL_2_IMC, &icl_uncore_pci_driver),
+ { /* end marker */ }
+};
+
+
+#define for_each_imc_pci_id(x, t) \
+ for (x = (t); (x)->pci_id; x++)
+
+static struct pci_driver *imc_uncore_find_dev(void)
+{
+ const struct imc_uncore_pci_dev *p;
+ int ret;
+
+ for_each_imc_pci_id(p, desktop_imc_pci_ids) {
+ ret = snb_pci2phy_map_init(p->pci_id);
+ if (ret == 0)
+ return p->driver;
+ }
+ return NULL;
+}
+
+static int imc_uncore_pci_init(void)
+{
+ struct pci_driver *imc_drv = imc_uncore_find_dev();
+
+ if (!imc_drv)
+ return -ENODEV;
+
+ uncore_pci_uncores = snb_pci_uncores;
+ uncore_pci_driver = imc_drv;
+
+ return 0;
+}
+
+int snb_uncore_pci_init(void)
+{
+ return imc_uncore_pci_init();
+}
+
+int ivb_uncore_pci_init(void)
+{
+ return imc_uncore_pci_init();
+}
+
+int hsw_uncore_pci_init(void)
+{
+ return imc_uncore_pci_init();
+}
+
+int bdw_uncore_pci_init(void)
+{
+ return imc_uncore_pci_init();
+}
+
+int skl_uncore_pci_init(void)
+{
+ return imc_uncore_pci_init();
+}
+
+/* end of Sandy Bridge uncore support */
+
+/* Nehalem uncore support */
+static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box)
+{
+ wrmsrq(NHM_UNC_PERF_GLOBAL_CTL, 0);
+}
+
+static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+ wrmsrq(NHM_UNC_PERF_GLOBAL_CTL, NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC);
+}
+
+static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (hwc->idx < UNCORE_PMC_IDX_FIXED)
+ wrmsrq(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
+ else
+ wrmsrq(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN);
+}
+
+static struct attribute *nhm_uncore_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_cmask8.attr,
+ NULL,
+};
+
+static const struct attribute_group nhm_uncore_format_group = {
+ .name = "format",
+ .attrs = nhm_uncore_formats_attr,
+};
+
+static struct uncore_event_desc nhm_uncore_events[] = {
+ INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
+ INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any, "event=0x2f,umask=0x0f"),
+ INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any, "event=0x2c,umask=0x0f"),
+ INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads, "event=0x20,umask=0x01"),
+ INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_writes, "event=0x20,umask=0x02"),
+ INTEL_UNCORE_EVENT_DESC(qhl_request_remote_reads, "event=0x20,umask=0x04"),
+ INTEL_UNCORE_EVENT_DESC(qhl_request_remote_writes, "event=0x20,umask=0x08"),
+ INTEL_UNCORE_EVENT_DESC(qhl_request_local_reads, "event=0x20,umask=0x10"),
+ INTEL_UNCORE_EVENT_DESC(qhl_request_local_writes, "event=0x20,umask=0x20"),
+ { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_ops nhm_uncore_msr_ops = {
+ .disable_box = nhm_uncore_msr_disable_box,
+ .enable_box = nhm_uncore_msr_enable_box,
+ .disable_event = snb_uncore_msr_disable_event,
+ .enable_event = nhm_uncore_msr_enable_event,
+ .read_counter = uncore_msr_read_counter,
+};
+
+static struct intel_uncore_type nhm_uncore = {
+ .name = "",
+ .num_counters = 8,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .event_ctl = NHM_UNC_PERFEVTSEL0,
+ .perf_ctr = NHM_UNC_UNCORE_PMC0,
+ .fixed_ctr = NHM_UNC_FIXED_CTR,
+ .fixed_ctl = NHM_UNC_FIXED_CTR_CTRL,
+ .event_mask = NHM_UNC_RAW_EVENT_MASK,
+ .event_descs = nhm_uncore_events,
+ .ops = &nhm_uncore_msr_ops,
+ .format_group = &nhm_uncore_format_group,
+};
+
+static struct intel_uncore_type *nhm_msr_uncores[] = {
+ &nhm_uncore,
+ NULL,
+};
+
+void nhm_uncore_cpu_init(void)
+{
+ uncore_msr_uncores = nhm_msr_uncores;
+}
+
+/* end of Nehalem uncore support */
+
+/* Tiger Lake MMIO uncore support */
+
+static const struct pci_device_id tgl_uncore_pci_ids[] = {
+ IMC_UNCORE_DEV(TGL_U1),
+ IMC_UNCORE_DEV(TGL_U2),
+ IMC_UNCORE_DEV(TGL_U3),
+ IMC_UNCORE_DEV(TGL_U4),
+ IMC_UNCORE_DEV(TGL_H),
+ IMC_UNCORE_DEV(ADL_1),
+ IMC_UNCORE_DEV(ADL_2),
+ IMC_UNCORE_DEV(ADL_3),
+ IMC_UNCORE_DEV(ADL_4),
+ IMC_UNCORE_DEV(ADL_5),
+ IMC_UNCORE_DEV(ADL_6),
+ IMC_UNCORE_DEV(ADL_7),
+ IMC_UNCORE_DEV(ADL_8),
+ IMC_UNCORE_DEV(ADL_9),
+ IMC_UNCORE_DEV(ADL_10),
+ IMC_UNCORE_DEV(ADL_11),
+ IMC_UNCORE_DEV(ADL_12),
+ IMC_UNCORE_DEV(ADL_13),
+ IMC_UNCORE_DEV(ADL_14),
+ IMC_UNCORE_DEV(ADL_15),
+ IMC_UNCORE_DEV(ADL_16),
+ IMC_UNCORE_DEV(ADL_17),
+ IMC_UNCORE_DEV(ADL_18),
+ IMC_UNCORE_DEV(ADL_19),
+ IMC_UNCORE_DEV(ADL_20),
+ IMC_UNCORE_DEV(ADL_21),
+ IMC_UNCORE_DEV(RPL_1),
+ IMC_UNCORE_DEV(RPL_2),
+ IMC_UNCORE_DEV(RPL_3),
+ IMC_UNCORE_DEV(RPL_4),
+ IMC_UNCORE_DEV(RPL_5),
+ IMC_UNCORE_DEV(RPL_6),
+ IMC_UNCORE_DEV(RPL_7),
+ IMC_UNCORE_DEV(RPL_8),
+ IMC_UNCORE_DEV(RPL_9),
+ IMC_UNCORE_DEV(RPL_10),
+ IMC_UNCORE_DEV(RPL_11),
+ IMC_UNCORE_DEV(RPL_12),
+ IMC_UNCORE_DEV(RPL_13),
+ IMC_UNCORE_DEV(RPL_14),
+ IMC_UNCORE_DEV(RPL_15),
+ IMC_UNCORE_DEV(RPL_16),
+ IMC_UNCORE_DEV(RPL_17),
+ IMC_UNCORE_DEV(RPL_18),
+ IMC_UNCORE_DEV(RPL_19),
+ IMC_UNCORE_DEV(RPL_20),
+ IMC_UNCORE_DEV(RPL_21),
+ IMC_UNCORE_DEV(RPL_22),
+ IMC_UNCORE_DEV(RPL_23),
+ IMC_UNCORE_DEV(RPL_24),
+ IMC_UNCORE_DEV(RPL_25),
+ IMC_UNCORE_DEV(MTL_1),
+ IMC_UNCORE_DEV(MTL_2),
+ IMC_UNCORE_DEV(MTL_3),
+ IMC_UNCORE_DEV(MTL_4),
+ IMC_UNCORE_DEV(MTL_5),
+ IMC_UNCORE_DEV(MTL_6),
+ IMC_UNCORE_DEV(MTL_7),
+ IMC_UNCORE_DEV(MTL_8),
+ IMC_UNCORE_DEV(MTL_9),
+ IMC_UNCORE_DEV(MTL_10),
+ IMC_UNCORE_DEV(MTL_11),
+ IMC_UNCORE_DEV(MTL_12),
+ IMC_UNCORE_DEV(MTL_13),
+ { /* end: all zeroes */ }
+};
+
+enum perf_tgl_uncore_imc_freerunning_types {
+ TGL_MMIO_UNCORE_IMC_DATA_TOTAL,
+ TGL_MMIO_UNCORE_IMC_DATA_READ,
+ TGL_MMIO_UNCORE_IMC_DATA_WRITE,
+ TGL_MMIO_UNCORE_IMC_FREERUNNING_TYPE_MAX
+};
+
+static struct freerunning_counters tgl_l_uncore_imc_freerunning[] = {
+ [TGL_MMIO_UNCORE_IMC_DATA_TOTAL] = { 0x5040, 0x0, 0x0, 1, 64 },
+ [TGL_MMIO_UNCORE_IMC_DATA_READ] = { 0x5058, 0x0, 0x0, 1, 64 },
+ [TGL_MMIO_UNCORE_IMC_DATA_WRITE] = { 0x50A0, 0x0, 0x0, 1, 64 },
+};
+
+static struct freerunning_counters tgl_uncore_imc_freerunning[] = {
+ [TGL_MMIO_UNCORE_IMC_DATA_TOTAL] = { 0xd840, 0x0, 0x0, 1, 64 },
+ [TGL_MMIO_UNCORE_IMC_DATA_READ] = { 0xd858, 0x0, 0x0, 1, 64 },
+ [TGL_MMIO_UNCORE_IMC_DATA_WRITE] = { 0xd8A0, 0x0, 0x0, 1, 64 },
+};
+
+static struct uncore_event_desc tgl_uncore_imc_events[] = {
+ INTEL_UNCORE_EVENT_DESC(data_total, "event=0xff,umask=0x10"),
+ INTEL_UNCORE_EVENT_DESC(data_total.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(data_total.unit, "MiB"),
+
+ INTEL_UNCORE_EVENT_DESC(data_read, "event=0xff,umask=0x20"),
+ INTEL_UNCORE_EVENT_DESC(data_read.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(data_read.unit, "MiB"),
+
+ INTEL_UNCORE_EVENT_DESC(data_write, "event=0xff,umask=0x30"),
+ INTEL_UNCORE_EVENT_DESC(data_write.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(data_write.unit, "MiB"),
+
+ { /* end: all zeroes */ }
+};
+
+static struct pci_dev *tgl_uncore_get_mc_dev(void)
+{
+ const struct pci_device_id *ids = tgl_uncore_pci_ids;
+ struct pci_dev *mc_dev = NULL;
+
+ while (ids && ids->vendor) {
+ mc_dev = pci_get_device(PCI_VENDOR_ID_INTEL, ids->device, NULL);
+ if (mc_dev)
+ return mc_dev;
+ ids++;
+ }
+
+ /* Just try to grab 00:00.0 device */
+ if (!mc_dev)
+ mc_dev = pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(0, 0));
+
+ return mc_dev;
+}
+
+#define TGL_UNCORE_MMIO_IMC_MEM_OFFSET 0x10000
+#define TGL_UNCORE_PCI_IMC_MAP_SIZE 0xe000
+
+static void
+uncore_get_box_mmio_addr(struct intel_uncore_box *box,
+ unsigned int base_offset,
+ int bar_offset, int step)
+{
+ struct pci_dev *pdev = tgl_uncore_get_mc_dev();
+ struct intel_uncore_pmu *pmu = box->pmu;
+ struct intel_uncore_type *type = pmu->type;
+ resource_size_t addr;
+ u32 bar;
+
+ if (!pdev) {
+ pr_warn("perf uncore: Cannot find matched IMC device.\n");
+ return;
+ }
+
+ pci_read_config_dword(pdev, bar_offset, &bar);
+ if (!(bar & BIT(0))) {
+ pr_warn("perf uncore: BAR 0x%x is disabled. Failed to map %s counters.\n",
+ bar_offset, type->name);
+ pci_dev_put(pdev);
+ return;
+ }
+ bar &= ~BIT(0);
+ addr = (resource_size_t)(bar + step * pmu->pmu_idx);
+
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+ pci_read_config_dword(pdev, bar_offset + 4, &bar);
+ addr |= ((resource_size_t)bar << 32);
+#endif
+
+ addr += base_offset;
+ box->io_addr = ioremap(addr, type->mmio_map_size);
+ if (!box->io_addr)
+ pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name);
+
+ pci_dev_put(pdev);
+}
+
+static void __uncore_imc_init_box(struct intel_uncore_box *box,
+ unsigned int base_offset)
+{
+ uncore_get_box_mmio_addr(box, base_offset,
+ SNB_UNCORE_PCI_IMC_BAR_OFFSET,
+ TGL_UNCORE_MMIO_IMC_MEM_OFFSET);
+}
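+/*
+ * For example, with base_offset 0 and TGL_UNCORE_MMIO_IMC_MEM_OFFSET as
+ * the step, box 0 of the free-running IMC presumably maps at the BAR
+ * address itself and box 1 at BAR + 0x10000.
+ */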
+
+static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box)
+{
+ __uncore_imc_init_box(box, 0);
+}
+
+static struct intel_uncore_ops tgl_uncore_imc_freerunning_ops = {
+ .init_box = tgl_uncore_imc_freerunning_init_box,
+ .exit_box = uncore_mmio_exit_box,
+ .read_counter = uncore_mmio_read_counter,
+ .hw_config = uncore_freerunning_hw_config,
+};
+
+static struct attribute *tgl_uncore_imc_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ NULL
+};
+
+static const struct attribute_group tgl_uncore_imc_format_group = {
+ .name = "format",
+ .attrs = tgl_uncore_imc_formats_attr,
+};
+
+static struct intel_uncore_type tgl_uncore_imc_free_running = {
+ .name = "imc_free_running",
+ .num_counters = 3,
+ .num_boxes = 2,
+ .num_freerunning_types = TGL_MMIO_UNCORE_IMC_FREERUNNING_TYPE_MAX,
+ .mmio_map_size = TGL_UNCORE_PCI_IMC_MAP_SIZE,
+ .freerunning = tgl_uncore_imc_freerunning,
+ .ops = &tgl_uncore_imc_freerunning_ops,
+ .event_descs = tgl_uncore_imc_events,
+ .format_group = &tgl_uncore_imc_format_group,
+};
+
+static struct intel_uncore_type *tgl_mmio_uncores[] = {
+ &tgl_uncore_imc_free_running,
+ NULL
+};
+
+void tgl_l_uncore_mmio_init(void)
+{
+ tgl_uncore_imc_free_running.freerunning = tgl_l_uncore_imc_freerunning;
+ uncore_mmio_uncores = tgl_mmio_uncores;
+}
+
+void tgl_uncore_mmio_init(void)
+{
+ uncore_mmio_uncores = tgl_mmio_uncores;
+}
+
+/* end of Tiger Lake MMIO uncore support */
+
+/* Alder Lake MMIO uncore support */
+#define ADL_UNCORE_IMC_BASE 0xd900
+#define ADL_UNCORE_IMC_MAP_SIZE 0x200
+#define ADL_UNCORE_IMC_CTR 0xe8
+#define ADL_UNCORE_IMC_CTRL 0xd0
+#define ADL_UNCORE_IMC_GLOBAL_CTL 0xc0
+#define ADL_UNCORE_IMC_BOX_CTL 0xc4
+#define ADL_UNCORE_IMC_FREERUNNING_BASE 0xd800
+#define ADL_UNCORE_IMC_FREERUNNING_MAP_SIZE 0x100
+
+#define ADL_UNCORE_IMC_CTL_FRZ (1 << 0)
+#define ADL_UNCORE_IMC_CTL_RST_CTRL (1 << 1)
+#define ADL_UNCORE_IMC_CTL_RST_CTRS (1 << 2)
+#define ADL_UNCORE_IMC_CTL_INT (ADL_UNCORE_IMC_CTL_RST_CTRL | \
+ ADL_UNCORE_IMC_CTL_RST_CTRS)
+
+static void adl_uncore_imc_init_box(struct intel_uncore_box *box)
+{
+ __uncore_imc_init_box(box, ADL_UNCORE_IMC_BASE);
+
+ /* The global control in MC1 can control both MCs. */
+ if (box->io_addr && (box->pmu->pmu_idx == 1))
+ writel(ADL_UNCORE_IMC_CTL_INT, box->io_addr + ADL_UNCORE_IMC_GLOBAL_CTL);
+}
+
+static void adl_uncore_mmio_disable_box(struct intel_uncore_box *box)
+{
+ if (!box->io_addr)
+ return;
+
+ writel(ADL_UNCORE_IMC_CTL_FRZ, box->io_addr + uncore_mmio_box_ctl(box));
+}
+
+static void adl_uncore_mmio_enable_box(struct intel_uncore_box *box)
+{
+ if (!box->io_addr)
+ return;
+
+ writel(0, box->io_addr + uncore_mmio_box_ctl(box));
+}
+
+#define MMIO_UNCORE_COMMON_OPS() \
+ .exit_box = uncore_mmio_exit_box, \
+ .disable_box = adl_uncore_mmio_disable_box, \
+ .enable_box = adl_uncore_mmio_enable_box, \
+ .disable_event = intel_generic_uncore_mmio_disable_event, \
+ .enable_event = intel_generic_uncore_mmio_enable_event, \
+ .read_counter = uncore_mmio_read_counter,
+
+static struct intel_uncore_ops adl_uncore_mmio_ops = {
+ .init_box = adl_uncore_imc_init_box,
+ MMIO_UNCORE_COMMON_OPS()
+};
+
+#define ADL_UNC_CTL_CHMASK_MASK 0x00000f00
+#define ADL_UNC_IMC_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
+ ADL_UNC_CTL_CHMASK_MASK | \
+ SNB_UNC_CTL_EDGE_DET)
+
+static struct attribute *adl_uncore_imc_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_chmask.attr,
+ &format_attr_edge.attr,
+ NULL,
+};
+
+static const struct attribute_group adl_uncore_imc_format_group = {
+ .name = "format",
+ .attrs = adl_uncore_imc_formats_attr,
+};
+
+static struct intel_uncore_type adl_uncore_imc = {
+ .name = "imc",
+ .num_counters = 5,
+ .num_boxes = 2,
+ .perf_ctr_bits = 64,
+ .perf_ctr = ADL_UNCORE_IMC_CTR,
+ .event_ctl = ADL_UNCORE_IMC_CTRL,
+ .event_mask = ADL_UNC_IMC_EVENT_MASK,
+ .box_ctl = ADL_UNCORE_IMC_BOX_CTL,
+ .mmio_offset = 0,
+ .mmio_map_size = ADL_UNCORE_IMC_MAP_SIZE,
+ .ops = &adl_uncore_mmio_ops,
+ .format_group = &adl_uncore_imc_format_group,
+};
+
+enum perf_adl_uncore_imc_freerunning_types {
+ ADL_MMIO_UNCORE_IMC_DATA_TOTAL,
+ ADL_MMIO_UNCORE_IMC_DATA_READ,
+ ADL_MMIO_UNCORE_IMC_DATA_WRITE,
+ ADL_MMIO_UNCORE_IMC_FREERUNNING_TYPE_MAX
+};
+
+static struct freerunning_counters adl_uncore_imc_freerunning[] = {
+ [ADL_MMIO_UNCORE_IMC_DATA_TOTAL] = { 0x40, 0x0, 0x0, 1, 64 },
+ [ADL_MMIO_UNCORE_IMC_DATA_READ] = { 0x58, 0x0, 0x0, 1, 64 },
+ [ADL_MMIO_UNCORE_IMC_DATA_WRITE] = { 0xA0, 0x0, 0x0, 1, 64 },
+};
+
+static void adl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box)
+{
+ __uncore_imc_init_box(box, ADL_UNCORE_IMC_FREERUNNING_BASE);
+}
+
+static struct intel_uncore_ops adl_uncore_imc_freerunning_ops = {
+ .init_box = adl_uncore_imc_freerunning_init_box,
+ .exit_box = uncore_mmio_exit_box,
+ .read_counter = uncore_mmio_read_counter,
+ .hw_config = uncore_freerunning_hw_config,
+};
+
+static struct intel_uncore_type adl_uncore_imc_free_running = {
+ .name = "imc_free_running",
+ .num_counters = 3,
+ .num_boxes = 2,
+ .num_freerunning_types = ADL_MMIO_UNCORE_IMC_FREERUNNING_TYPE_MAX,
+ .mmio_map_size = ADL_UNCORE_IMC_FREERUNNING_MAP_SIZE,
+ .freerunning = adl_uncore_imc_freerunning,
+ .ops = &adl_uncore_imc_freerunning_ops,
+ .event_descs = tgl_uncore_imc_events,
+ .format_group = &tgl_uncore_imc_format_group,
+};
+
+static struct intel_uncore_type *adl_mmio_uncores[] = {
+ &adl_uncore_imc,
+ &adl_uncore_imc_free_running,
+ NULL
+};
+
+void adl_uncore_mmio_init(void)
+{
+ uncore_mmio_uncores = adl_mmio_uncores;
+}
+
+/* end of Alder Lake MMIO uncore support */
+
+/* Lunar Lake MMIO uncore support */
+#define LNL_UNCORE_PCI_SAFBAR_OFFSET 0x68
+#define LNL_UNCORE_MAP_SIZE 0x1000
+#define LNL_UNCORE_SNCU_BASE 0xE4B000
+#define LNL_UNCORE_SNCU_CTR 0x390
+#define LNL_UNCORE_SNCU_CTRL 0x398
+#define LNL_UNCORE_SNCU_BOX_CTL 0x380
+#define LNL_UNCORE_GLOBAL_CTL 0x700
+#define LNL_UNCORE_HBO_BASE 0xE54000
+#define LNL_UNCORE_HBO_OFFSET -4096
+#define LNL_UNCORE_HBO_CTR 0x570
+#define LNL_UNCORE_HBO_CTRL 0x550
+#define LNL_UNCORE_HBO_BOX_CTL 0x548
+
+#define LNL_UNC_CTL_THRESHOLD 0xff000000
+#define LNL_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
+ SNB_UNC_CTL_UMASK_MASK | \
+ SNB_UNC_CTL_EDGE_DET | \
+ SNB_UNC_CTL_INVERT | \
+ LNL_UNC_CTL_THRESHOLD)
+
+static struct attribute *lnl_uncore_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_threshold2.attr,
+ NULL
+};
+
+static const struct attribute_group lnl_uncore_format_group = {
+ .name = "format",
+ .attrs = lnl_uncore_formats_attr,
+};
+
+static void lnl_uncore_hbo_init_box(struct intel_uncore_box *box)
+{
+ uncore_get_box_mmio_addr(box, LNL_UNCORE_HBO_BASE,
+ LNL_UNCORE_PCI_SAFBAR_OFFSET,
+ LNL_UNCORE_HBO_OFFSET);
+}
+
+static struct intel_uncore_ops lnl_uncore_hbo_ops = {
+ .init_box = lnl_uncore_hbo_init_box,
+ MMIO_UNCORE_COMMON_OPS()
+};
+
+static struct intel_uncore_type lnl_uncore_hbo = {
+ .name = "hbo",
+ .num_counters = 4,
+ .num_boxes = 2,
+ .perf_ctr_bits = 64,
+ .perf_ctr = LNL_UNCORE_HBO_CTR,
+ .event_ctl = LNL_UNCORE_HBO_CTRL,
+ .event_mask = LNL_UNC_RAW_EVENT_MASK,
+ .box_ctl = LNL_UNCORE_HBO_BOX_CTL,
+ .mmio_map_size = LNL_UNCORE_MAP_SIZE,
+ .ops = &lnl_uncore_hbo_ops,
+ .format_group = &lnl_uncore_format_group,
+};
+
+static void lnl_uncore_sncu_init_box(struct intel_uncore_box *box)
+{
+ uncore_get_box_mmio_addr(box, LNL_UNCORE_SNCU_BASE,
+ LNL_UNCORE_PCI_SAFBAR_OFFSET,
+ 0);
+
+ if (box->io_addr)
+ writel(ADL_UNCORE_IMC_CTL_INT, box->io_addr + LNL_UNCORE_GLOBAL_CTL);
+}
+
+static struct intel_uncore_ops lnl_uncore_sncu_ops = {
+ .init_box = lnl_uncore_sncu_init_box,
+ MMIO_UNCORE_COMMON_OPS()
+};
+
+static struct intel_uncore_type lnl_uncore_sncu = {
+ .name = "sncu",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .perf_ctr_bits = 64,
+ .perf_ctr = LNL_UNCORE_SNCU_CTR,
+ .event_ctl = LNL_UNCORE_SNCU_CTRL,
+ .event_mask = LNL_UNC_RAW_EVENT_MASK,
+ .box_ctl = LNL_UNCORE_SNCU_BOX_CTL,
+ .mmio_map_size = LNL_UNCORE_MAP_SIZE,
+ .ops = &lnl_uncore_sncu_ops,
+ .format_group = &lnl_uncore_format_group,
+};
+
+static struct intel_uncore_type *lnl_mmio_uncores[] = {
+ &adl_uncore_imc,
+ &lnl_uncore_hbo,
+ &lnl_uncore_sncu,
+ &adl_uncore_imc_free_running,
+ NULL
+};
+
+void lnl_uncore_mmio_init(void)
+{
+ uncore_mmio_uncores = lnl_mmio_uncores;
+}
+
+/* end of Lunar Lake MMIO uncore support */
+
+/* Panther Lake uncore support */
+
+#define UNCORE_PTL_MAX_NUM_UNCORE_TYPES 42
+#define UNCORE_PTL_TYPE_IMC 6
+#define UNCORE_PTL_TYPE_SNCU 34
+#define UNCORE_PTL_TYPE_HBO 41
+
+#define PTL_UNCORE_GLOBAL_CTL_OFFSET 0x380
+
+static struct intel_uncore_type ptl_uncore_imc = {
+ .name = "imc",
+ .mmio_map_size = 0xf00,
+};
+
+static void ptl_uncore_sncu_init_box(struct intel_uncore_box *box)
+{
+ intel_generic_uncore_mmio_init_box(box);
+
+ /* Clear the global freeze bit */
+ if (box->io_addr)
+ writel(0, box->io_addr + PTL_UNCORE_GLOBAL_CTL_OFFSET);
+}
+
+static struct intel_uncore_ops ptl_uncore_sncu_ops = {
+ .init_box = ptl_uncore_sncu_init_box,
+ .exit_box = uncore_mmio_exit_box,
+ .disable_box = intel_generic_uncore_mmio_disable_box,
+ .enable_box = intel_generic_uncore_mmio_enable_box,
+ .disable_event = intel_generic_uncore_mmio_disable_event,
+ .enable_event = intel_generic_uncore_mmio_enable_event,
+ .read_counter = uncore_mmio_read_counter,
+};
+
+static struct intel_uncore_type ptl_uncore_sncu = {
+ .name = "sncu",
+ .ops = &ptl_uncore_sncu_ops,
+ .mmio_map_size = 0xf00,
+};
+
+static struct intel_uncore_type ptl_uncore_hbo = {
+ .name = "hbo",
+ .mmio_map_size = 0xf00,
+};
+
+static struct intel_uncore_type *ptl_uncores[UNCORE_PTL_MAX_NUM_UNCORE_TYPES] = {
+ [UNCORE_PTL_TYPE_IMC] = &ptl_uncore_imc,
+ [UNCORE_PTL_TYPE_SNCU] = &ptl_uncore_sncu,
+ [UNCORE_PTL_TYPE_HBO] = &ptl_uncore_hbo,
+};
+
+#define UNCORE_PTL_MMIO_EXTRA_UNCORES 1
+
+static struct intel_uncore_type *ptl_mmio_extra_uncores[UNCORE_PTL_MMIO_EXTRA_UNCORES] = {
+ &adl_uncore_imc_free_running,
+};
+
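+/*
+ * Presumably uncore_get_uncores() builds the MMIO uncore list from the
+ * platform discovery table, using the entries in ptl_uncores[] (indexed
+ * by the UNCORE_PTL_TYPE_* ids above) as overrides and appending the
+ * extra free-running IMC type.
+ */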
+void ptl_uncore_mmio_init(void)
+{
+ uncore_mmio_uncores = uncore_get_uncores(UNCORE_ACCESS_MMIO,
+ UNCORE_PTL_MMIO_EXTRA_UNCORES,
+ ptl_mmio_extra_uncores,
+ UNCORE_PTL_MAX_NUM_UNCORE_TYPES,
+ ptl_uncores);
+}
+
+static struct intel_uncore_type *ptl_msr_uncores[] = {
+ &mtl_uncore_cbox,
+ NULL
+};
+
+void ptl_uncore_cpu_init(void)
+{
+ mtl_uncore_cbox.num_boxes = 6;
+ mtl_uncore_cbox.ops = &lnl_uncore_msr_ops;
+ uncore_msr_uncores = ptl_msr_uncores;
+}
+
+/* end of Panther Lake uncore support */
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
new file mode 100644
index 000000000000..e1f370b8d065
--- /dev/null
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -0,0 +1,6711 @@
+// SPDX-License-Identifier: GPL-2.0
+/* SandyBridge-EP/IvyTown uncore support */
+#include <asm/cpu_device_id.h>
+#include <asm/msr.h>
+#include "uncore.h"
+#include "uncore_discovery.h"
+
+/* SNB-EP pci bus to socket mapping */
+#define SNBEP_CPUNODEID 0x40
+#define SNBEP_GIDNIDMAP 0x54
+
+/* SNB-EP Box level control */
+#define SNBEP_PMON_BOX_CTL_RST_CTRL (1 << 0)
+#define SNBEP_PMON_BOX_CTL_RST_CTRS (1 << 1)
+#define SNBEP_PMON_BOX_CTL_FRZ (1 << 8)
+#define SNBEP_PMON_BOX_CTL_FRZ_EN (1 << 16)
+#define SNBEP_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \
+ SNBEP_PMON_BOX_CTL_RST_CTRS | \
+ SNBEP_PMON_BOX_CTL_FRZ_EN)
+/* SNB-EP event control */
+#define SNBEP_PMON_CTL_EV_SEL_MASK 0x000000ff
+#define SNBEP_PMON_CTL_UMASK_MASK 0x0000ff00
+#define SNBEP_PMON_CTL_RST (1 << 17)
+#define SNBEP_PMON_CTL_EDGE_DET (1 << 18)
+#define SNBEP_PMON_CTL_EV_SEL_EXT (1 << 21)
+#define SNBEP_PMON_CTL_EN (1 << 22)
+#define SNBEP_PMON_CTL_INVERT (1 << 23)
+#define SNBEP_PMON_CTL_TRESH_MASK 0xff000000
+#define SNBEP_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \
+ SNBEP_PMON_CTL_UMASK_MASK | \
+ SNBEP_PMON_CTL_EDGE_DET | \
+ SNBEP_PMON_CTL_INVERT | \
+ SNBEP_PMON_CTL_TRESH_MASK)
+
+/* SNB-EP Ubox event control */
+#define SNBEP_U_MSR_PMON_CTL_TRESH_MASK 0x1f000000
+#define SNBEP_U_MSR_PMON_RAW_EVENT_MASK \
+ (SNBEP_PMON_CTL_EV_SEL_MASK | \
+ SNBEP_PMON_CTL_UMASK_MASK | \
+ SNBEP_PMON_CTL_EDGE_DET | \
+ SNBEP_PMON_CTL_INVERT | \
+ SNBEP_U_MSR_PMON_CTL_TRESH_MASK)
+
+#define SNBEP_CBO_PMON_CTL_TID_EN (1 << 19)
+#define SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \
+ SNBEP_CBO_PMON_CTL_TID_EN)
+
+/* SNB-EP PCU event control */
+#define SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK 0x0000c000
+#define SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK 0x1f000000
+#define SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT (1 << 30)
+#define SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET (1 << 31)
+#define SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK \
+ (SNBEP_PMON_CTL_EV_SEL_MASK | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
+ SNBEP_PMON_CTL_EDGE_DET | \
+ SNBEP_PMON_CTL_INVERT | \
+ SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
+
+#define SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK \
+ (SNBEP_PMON_RAW_EVENT_MASK | \
+ SNBEP_PMON_CTL_EV_SEL_EXT)
+
+/* SNB-EP pci control register */
+#define SNBEP_PCI_PMON_BOX_CTL 0xf4
+#define SNBEP_PCI_PMON_CTL0 0xd8
+/* SNB-EP pci counter register */
+#define SNBEP_PCI_PMON_CTR0 0xa0
+
+/* SNB-EP home agent register */
+#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH0 0x40
+#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH1 0x44
+#define SNBEP_HA_PCI_PMON_BOX_OPCODEMATCH 0x48
+/* SNB-EP memory controller register */
+#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTL 0xf0
+#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTR 0xd0
+/* SNB-EP QPI register */
+#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH0 0x228
+#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH1 0x22c
+#define SNBEP_Q_Py_PCI_PMON_PKT_MASK0 0x238
+#define SNBEP_Q_Py_PCI_PMON_PKT_MASK1 0x23c
+
+/* SNB-EP Ubox register */
+#define SNBEP_U_MSR_PMON_CTR0 0xc16
+#define SNBEP_U_MSR_PMON_CTL0 0xc10
+
+#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTL 0xc08
+#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTR 0xc09
+
+/* SNB-EP Cbo register */
+#define SNBEP_C0_MSR_PMON_CTR0 0xd16
+#define SNBEP_C0_MSR_PMON_CTL0 0xd10
+#define SNBEP_C0_MSR_PMON_BOX_CTL 0xd04
+#define SNBEP_C0_MSR_PMON_BOX_FILTER 0xd14
+#define SNBEP_CBO_MSR_OFFSET 0x20
+
+#define SNBEP_CB0_MSR_PMON_BOX_FILTER_TID 0x1f
+#define SNBEP_CB0_MSR_PMON_BOX_FILTER_NID 0x3fc00
+#define SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE 0x7c0000
+#define SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC 0xff800000
+
+#define SNBEP_CBO_EVENT_EXTRA_REG(e, m, i) { \
+ .event = (e), \
+ .msr = SNBEP_C0_MSR_PMON_BOX_FILTER, \
+ .config_mask = (m), \
+ .idx = (i) \
+}
+
+/* SNB-EP PCU register */
+#define SNBEP_PCU_MSR_PMON_CTR0 0xc36
+#define SNBEP_PCU_MSR_PMON_CTL0 0xc30
+#define SNBEP_PCU_MSR_PMON_BOX_CTL 0xc24
+#define SNBEP_PCU_MSR_PMON_BOX_FILTER 0xc34
+#define SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK 0xffffffff
+#define SNBEP_PCU_MSR_CORE_C3_CTR 0x3fc
+#define SNBEP_PCU_MSR_CORE_C6_CTR 0x3fd
+
+/* IVBEP event control */
+#define IVBEP_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \
+ SNBEP_PMON_BOX_CTL_RST_CTRS)
+#define IVBEP_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \
+ SNBEP_PMON_CTL_UMASK_MASK | \
+ SNBEP_PMON_CTL_EDGE_DET | \
+ SNBEP_PMON_CTL_TRESH_MASK)
+/* IVBEP Ubox */
+#define IVBEP_U_MSR_PMON_GLOBAL_CTL 0xc00
+#define IVBEP_U_PMON_GLOBAL_FRZ_ALL (1 << 31)
+#define IVBEP_U_PMON_GLOBAL_UNFRZ_ALL (1 << 29)
+
+#define IVBEP_U_MSR_PMON_RAW_EVENT_MASK \
+ (SNBEP_PMON_CTL_EV_SEL_MASK | \
+ SNBEP_PMON_CTL_UMASK_MASK | \
+ SNBEP_PMON_CTL_EDGE_DET | \
+ SNBEP_U_MSR_PMON_CTL_TRESH_MASK)
+/* IVBEP Cbo */
+#define IVBEP_CBO_MSR_PMON_RAW_EVENT_MASK (IVBEP_PMON_RAW_EVENT_MASK | \
+ SNBEP_CBO_PMON_CTL_TID_EN)
+
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_TID (0x1fULL << 0)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_LINK (0xfULL << 5)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_STATE (0x3fULL << 17)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_NID (0xffffULL << 32)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_OPC (0x1ffULL << 52)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_C6 (0x1ULL << 61)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_NC (0x1ULL << 62)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_ISOC (0x1ULL << 63)
+
+/* IVBEP home agent */
+#define IVBEP_HA_PCI_PMON_CTL_Q_OCC_RST (1 << 16)
+#define IVBEP_HA_PCI_PMON_RAW_EVENT_MASK \
+ (IVBEP_PMON_RAW_EVENT_MASK | \
+ IVBEP_HA_PCI_PMON_CTL_Q_OCC_RST)
+/* IVBEP PCU */
+#define IVBEP_PCU_MSR_PMON_RAW_EVENT_MASK \
+ (SNBEP_PMON_CTL_EV_SEL_MASK | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
+ SNBEP_PMON_CTL_EDGE_DET | \
+ SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
+/* IVBEP QPI */
+#define IVBEP_QPI_PCI_PMON_RAW_EVENT_MASK \
+ (IVBEP_PMON_RAW_EVENT_MASK | \
+ SNBEP_PMON_CTL_EV_SEL_EXT)
+
+#define __BITS_VALUE(x, i, n) ((typeof(x))(((x) >> ((i) * (n))) & \
+ ((1ULL << (n)) - 1)))
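+/*
+ * __BITS_VALUE(x, i, n) extracts the i-th n-bit wide field of x, e.g.
+ * __BITS_VALUE(msr, 2, 8) returns bits 23:16.
+ */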
+
+/* Haswell-EP Ubox */
+#define HSWEP_U_MSR_PMON_CTR0 0x709
+#define HSWEP_U_MSR_PMON_CTL0 0x705
+#define HSWEP_U_MSR_PMON_FILTER 0x707
+
+#define HSWEP_U_MSR_PMON_UCLK_FIXED_CTL 0x703
+#define HSWEP_U_MSR_PMON_UCLK_FIXED_CTR 0x704
+
+#define HSWEP_U_MSR_PMON_BOX_FILTER_TID (0x1 << 0)
+#define HSWEP_U_MSR_PMON_BOX_FILTER_CID (0x1fULL << 1)
+#define HSWEP_U_MSR_PMON_BOX_FILTER_MASK \
+ (HSWEP_U_MSR_PMON_BOX_FILTER_TID | \
+ HSWEP_U_MSR_PMON_BOX_FILTER_CID)
+
+/* Haswell-EP CBo */
+#define HSWEP_C0_MSR_PMON_CTR0 0xe08
+#define HSWEP_C0_MSR_PMON_CTL0 0xe01
+#define HSWEP_C0_MSR_PMON_BOX_CTL 0xe00
+#define HSWEP_C0_MSR_PMON_BOX_FILTER0 0xe05
+#define HSWEP_CBO_MSR_OFFSET 0x10
+
+
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_TID (0x3fULL << 0)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_LINK (0xfULL << 6)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_STATE (0x7fULL << 17)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_NID (0xffffULL << 32)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_OPC (0x1ffULL << 52)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_C6 (0x1ULL << 61)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_NC (0x1ULL << 62)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_ISOC (0x1ULL << 63)
+
+
+/* Haswell-EP Sbox */
+#define HSWEP_S0_MSR_PMON_CTR0 0x726
+#define HSWEP_S0_MSR_PMON_CTL0 0x721
+#define HSWEP_S0_MSR_PMON_BOX_CTL 0x720
+#define HSWEP_SBOX_MSR_OFFSET 0xa
+#define HSWEP_S_MSR_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \
+ SNBEP_CBO_PMON_CTL_TID_EN)
+
+/* Haswell-EP PCU */
+#define HSWEP_PCU_MSR_PMON_CTR0 0x717
+#define HSWEP_PCU_MSR_PMON_CTL0 0x711
+#define HSWEP_PCU_MSR_PMON_BOX_CTL 0x710
+#define HSWEP_PCU_MSR_PMON_BOX_FILTER 0x715
+
+/* KNL Ubox */
+#define KNL_U_MSR_PMON_RAW_EVENT_MASK \
+ (SNBEP_U_MSR_PMON_RAW_EVENT_MASK | \
+ SNBEP_CBO_PMON_CTL_TID_EN)
+/* KNL CHA */
+#define KNL_CHA_MSR_OFFSET 0xc
+#define KNL_CHA_MSR_PMON_CTL_QOR (1 << 16)
+#define KNL_CHA_MSR_PMON_RAW_EVENT_MASK \
+ (SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK | \
+ KNL_CHA_MSR_PMON_CTL_QOR)
+#define KNL_CHA_MSR_PMON_BOX_FILTER_TID 0x1ff
+#define KNL_CHA_MSR_PMON_BOX_FILTER_STATE (7 << 18)
+#define KNL_CHA_MSR_PMON_BOX_FILTER_OP (0xfffffe2aULL << 32)
+#define KNL_CHA_MSR_PMON_BOX_FILTER_REMOTE_NODE (0x1ULL << 32)
+#define KNL_CHA_MSR_PMON_BOX_FILTER_LOCAL_NODE (0x1ULL << 33)
+#define KNL_CHA_MSR_PMON_BOX_FILTER_NNC (0x1ULL << 37)
+
+/* KNL EDC/MC UCLK */
+#define KNL_UCLK_MSR_PMON_CTR0_LOW 0x400
+#define KNL_UCLK_MSR_PMON_CTL0 0x420
+#define KNL_UCLK_MSR_PMON_BOX_CTL 0x430
+#define KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW 0x44c
+#define KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL 0x454
+#define KNL_PMON_FIXED_CTL_EN 0x1
+
+/* KNL EDC */
+#define KNL_EDC0_ECLK_MSR_PMON_CTR0_LOW 0xa00
+#define KNL_EDC0_ECLK_MSR_PMON_CTL0 0xa20
+#define KNL_EDC0_ECLK_MSR_PMON_BOX_CTL 0xa30
+#define KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_LOW 0xa3c
+#define KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_CTL 0xa44
+
+/* KNL MC */
+#define KNL_MC0_CH0_MSR_PMON_CTR0_LOW 0xb00
+#define KNL_MC0_CH0_MSR_PMON_CTL0 0xb20
+#define KNL_MC0_CH0_MSR_PMON_BOX_CTL 0xb30
+#define KNL_MC0_CH0_MSR_PMON_FIXED_LOW 0xb3c
+#define KNL_MC0_CH0_MSR_PMON_FIXED_CTL 0xb44
+
+/* KNL IRP */
+#define KNL_IRP_PCI_PMON_BOX_CTL 0xf0
+#define KNL_IRP_PCI_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \
+ KNL_CHA_MSR_PMON_CTL_QOR)
+/* KNL PCU */
+#define KNL_PCU_PMON_CTL_EV_SEL_MASK 0x0000007f
+#define KNL_PCU_PMON_CTL_USE_OCC_CTR (1 << 7)
+#define KNL_PCU_MSR_PMON_CTL_TRESH_MASK 0x3f000000
+#define KNL_PCU_MSR_PMON_RAW_EVENT_MASK \
+ (KNL_PCU_PMON_CTL_EV_SEL_MASK | \
+ KNL_PCU_PMON_CTL_USE_OCC_CTR | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
+ SNBEP_PMON_CTL_EDGE_DET | \
+ SNBEP_CBO_PMON_CTL_TID_EN | \
+ SNBEP_PMON_CTL_INVERT | \
+ KNL_PCU_MSR_PMON_CTL_TRESH_MASK | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
+
+/* SKX pci bus to socket mapping */
+#define SKX_CPUNODEID 0xc0
+#define SKX_GIDNIDMAP 0xd4
+
+/*
+ * The CPU_BUS_NUMBER MSR returns the values of the respective CPUBUSNO CSR
+ * that BIOS programmed. MSR has package scope.
+ * |  Bit  | Default | Description
+ * |  [63] |   00h   | VALID - When set, indicates the CPU bus
+ *                     numbers have been initialized. (RO)
+ * |[62:48]|   ---   | Reserved
+ * |[47:40]|   00h   | BUS_NUM_5 - Return the bus number BIOS assigned
+ *                     CPUBUSNO(5). (RO)
+ * |[39:32]|   00h   | BUS_NUM_4 - Return the bus number BIOS assigned
+ *                     CPUBUSNO(4). (RO)
+ * |[31:24]|   00h   | BUS_NUM_3 - Return the bus number BIOS assigned
+ *                     CPUBUSNO(3). (RO)
+ * |[23:16]|   00h   | BUS_NUM_2 - Return the bus number BIOS assigned
+ *                     CPUBUSNO(2). (RO)
+ * |[15:8] |   00h   | BUS_NUM_1 - Return the bus number BIOS assigned
+ *                     CPUBUSNO(1). (RO)
+ * | [7:0] |   00h   | BUS_NUM_0 - Return the bus number BIOS assigned
+ *                     CPUBUSNO(0). (RO)
+ */
+#define SKX_MSR_CPU_BUS_NUMBER 0x300
+#define SKX_MSR_CPU_BUS_VALID_BIT (1ULL << 63)
+#define BUS_NUM_STRIDE 8
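+/*
+ * So, assuming the VALID bit is set, the bus number for CPUBUSNO(n) can
+ * be read as (msr >> (n * BUS_NUM_STRIDE)) & 0xff.
+ */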
+
+/* SKX CHA */
+#define SKX_CHA_MSR_PMON_BOX_FILTER_TID (0x1ffULL << 0)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_LINK (0xfULL << 9)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_STATE (0x3ffULL << 17)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_REM (0x1ULL << 32)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_LOC (0x1ULL << 33)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_ALL_OPC (0x1ULL << 35)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_NM (0x1ULL << 36)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_NOT_NM (0x1ULL << 37)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_OPC0 (0x3ffULL << 41)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_OPC1 (0x3ffULL << 51)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_C6 (0x1ULL << 61)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_NC (0x1ULL << 62)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_ISOC (0x1ULL << 63)
+
+/* SKX IIO */
+#define SKX_IIO0_MSR_PMON_CTL0 0xa48
+#define SKX_IIO0_MSR_PMON_CTR0 0xa41
+#define SKX_IIO0_MSR_PMON_BOX_CTL 0xa40
+#define SKX_IIO_MSR_OFFSET 0x20
+
+#define SKX_PMON_CTL_TRESH_MASK (0xff << 24)
+#define SKX_PMON_CTL_TRESH_MASK_EXT (0xf)
+#define SKX_PMON_CTL_CH_MASK (0xff << 4)
+#define SKX_PMON_CTL_FC_MASK (0x7 << 12)
+#define SKX_IIO_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \
+ SNBEP_PMON_CTL_UMASK_MASK | \
+ SNBEP_PMON_CTL_EDGE_DET | \
+ SNBEP_PMON_CTL_INVERT | \
+ SKX_PMON_CTL_TRESH_MASK)
+#define SKX_IIO_PMON_RAW_EVENT_MASK_EXT (SKX_PMON_CTL_TRESH_MASK_EXT | \
+ SKX_PMON_CTL_CH_MASK | \
+ SKX_PMON_CTL_FC_MASK)
+
+/* SKX IRP */
+#define SKX_IRP0_MSR_PMON_CTL0 0xa5b
+#define SKX_IRP0_MSR_PMON_CTR0 0xa59
+#define SKX_IRP0_MSR_PMON_BOX_CTL 0xa58
+#define SKX_IRP_MSR_OFFSET 0x20
+
+/* SKX UPI */
+#define SKX_UPI_PCI_PMON_CTL0 0x350
+#define SKX_UPI_PCI_PMON_CTR0 0x318
+#define SKX_UPI_PCI_PMON_BOX_CTL 0x378
+#define SKX_UPI_CTL_UMASK_EXT 0xffefff
+
+/* SKX M2M */
+#define SKX_M2M_PCI_PMON_CTL0 0x228
+#define SKX_M2M_PCI_PMON_CTR0 0x200
+#define SKX_M2M_PCI_PMON_BOX_CTL 0x258
+
+/* Memory Map registers device ID */
+#define SNR_ICX_MESH2IIO_MMAP_DID 0x9a2
+#define SNR_ICX_SAD_CONTROL_CFG 0x3f4
+
+/* Getting I/O stack id in SAD_CONTROL_CFG notation */
+#define SAD_CONTROL_STACK_ID(data) (((data) >> 4) & 0x7)
+
+/* SNR Ubox */
+#define SNR_U_MSR_PMON_CTR0 0x1f98
+#define SNR_U_MSR_PMON_CTL0 0x1f91
+#define SNR_U_MSR_PMON_UCLK_FIXED_CTL 0x1f93
+#define SNR_U_MSR_PMON_UCLK_FIXED_CTR 0x1f94
+
+/* SNR CHA */
+#define SNR_CHA_RAW_EVENT_MASK_EXT 0x3ffffff
+#define SNR_CHA_MSR_PMON_CTL0 0x1c01
+#define SNR_CHA_MSR_PMON_CTR0 0x1c08
+#define SNR_CHA_MSR_PMON_BOX_CTL 0x1c00
+#define SNR_C0_MSR_PMON_BOX_FILTER0 0x1c05
+
+
+/* SNR IIO */
+#define SNR_IIO_MSR_PMON_CTL0 0x1e08
+#define SNR_IIO_MSR_PMON_CTR0 0x1e01
+#define SNR_IIO_MSR_PMON_BOX_CTL 0x1e00
+#define SNR_IIO_MSR_OFFSET 0x10
+#define SNR_IIO_PMON_RAW_EVENT_MASK_EXT 0x7ffff
+
+/* SNR IRP */
+#define SNR_IRP0_MSR_PMON_CTL0 0x1ea8
+#define SNR_IRP0_MSR_PMON_CTR0 0x1ea1
+#define SNR_IRP0_MSR_PMON_BOX_CTL 0x1ea0
+#define SNR_IRP_MSR_OFFSET 0x10
+
+/* SNR M2PCIE */
+#define SNR_M2PCIE_MSR_PMON_CTL0 0x1e58
+#define SNR_M2PCIE_MSR_PMON_CTR0 0x1e51
+#define SNR_M2PCIE_MSR_PMON_BOX_CTL 0x1e50
+#define SNR_M2PCIE_MSR_OFFSET 0x10
+
+/* SNR PCU */
+#define SNR_PCU_MSR_PMON_CTL0 0x1ef1
+#define SNR_PCU_MSR_PMON_CTR0 0x1ef8
+#define SNR_PCU_MSR_PMON_BOX_CTL 0x1ef0
+#define SNR_PCU_MSR_PMON_BOX_FILTER 0x1efc
+
+/* SNR M2M */
+#define SNR_M2M_PCI_PMON_CTL0 0x468
+#define SNR_M2M_PCI_PMON_CTR0 0x440
+#define SNR_M2M_PCI_PMON_BOX_CTL 0x438
+#define SNR_M2M_PCI_PMON_UMASK_EXT 0xff
+
+/* SNR PCIE3 */
+#define SNR_PCIE3_PCI_PMON_CTL0 0x508
+#define SNR_PCIE3_PCI_PMON_CTR0 0x4e8
+#define SNR_PCIE3_PCI_PMON_BOX_CTL 0x4e0
+
+/* SNR IMC */
+#define SNR_IMC_MMIO_PMON_FIXED_CTL 0x54
+#define SNR_IMC_MMIO_PMON_FIXED_CTR 0x38
+#define SNR_IMC_MMIO_PMON_CTL0 0x40
+#define SNR_IMC_MMIO_PMON_CTR0 0x8
+#define SNR_IMC_MMIO_PMON_BOX_CTL 0x22800
+#define SNR_IMC_MMIO_OFFSET 0x4000
+#define SNR_IMC_MMIO_SIZE 0x4000
+#define SNR_IMC_MMIO_BASE_OFFSET 0xd0
+#define SNR_IMC_MMIO_BASE_MASK 0x1FFFFFFF
+#define SNR_IMC_MMIO_MEM0_OFFSET 0xd8
+#define SNR_IMC_MMIO_MEM0_MASK 0x7FF
+
+/* ICX CHA */
+#define ICX_C34_MSR_PMON_CTR0 0xb68
+#define ICX_C34_MSR_PMON_CTL0 0xb61
+#define ICX_C34_MSR_PMON_BOX_CTL 0xb60
+#define ICX_C34_MSR_PMON_BOX_FILTER0 0xb65
+
+/* ICX IIO */
+#define ICX_IIO_MSR_PMON_CTL0 0xa58
+#define ICX_IIO_MSR_PMON_CTR0 0xa51
+#define ICX_IIO_MSR_PMON_BOX_CTL 0xa50
+
+/* ICX IRP */
+#define ICX_IRP0_MSR_PMON_CTL0 0xa4d
+#define ICX_IRP0_MSR_PMON_CTR0 0xa4b
+#define ICX_IRP0_MSR_PMON_BOX_CTL 0xa4a
+
+/* ICX M2PCIE */
+#define ICX_M2PCIE_MSR_PMON_CTL0 0xa46
+#define ICX_M2PCIE_MSR_PMON_CTR0 0xa41
+#define ICX_M2PCIE_MSR_PMON_BOX_CTL 0xa40
+
+/* ICX UPI */
+#define ICX_UPI_PCI_PMON_CTL0 0x350
+#define ICX_UPI_PCI_PMON_CTR0 0x320
+#define ICX_UPI_PCI_PMON_BOX_CTL 0x318
+#define ICX_UPI_CTL_UMASK_EXT 0xffffff
+#define ICX_UBOX_DID 0x3450
+
+/* ICX M3UPI */
+#define ICX_M3UPI_PCI_PMON_CTL0 0xd8
+#define ICX_M3UPI_PCI_PMON_CTR0 0xa8
+#define ICX_M3UPI_PCI_PMON_BOX_CTL 0xa0
+
+/* ICX IMC */
+#define ICX_NUMBER_IMC_CHN 3
+#define ICX_IMC_MEM_STRIDE 0x4
+
+/* SPR */
+#define SPR_RAW_EVENT_MASK_EXT 0xffffff
+#define SPR_UBOX_DID 0x3250
+
+/* SPR CHA */
+#define SPR_CHA_EVENT_MASK_EXT 0xffffffff
+#define SPR_CHA_PMON_CTL_TID_EN (1 << 16)
+#define SPR_CHA_PMON_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \
+ SPR_CHA_PMON_CTL_TID_EN)
+#define SPR_CHA_PMON_BOX_FILTER_TID 0x3ff
+
+#define SPR_C0_MSR_PMON_BOX_FILTER0 0x200e
+
+DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(event2, event, "config:0-6");
+DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21");
+DEFINE_UNCORE_FORMAT_ATTR(use_occ_ctr, use_occ_ctr, "config:7");
+DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(umask_ext, umask, "config:8-15,32-43,45-55");
+DEFINE_UNCORE_FORMAT_ATTR(umask_ext2, umask, "config:8-15,32-57");
+DEFINE_UNCORE_FORMAT_ATTR(umask_ext3, umask, "config:8-15,32-39");
+DEFINE_UNCORE_FORMAT_ATTR(umask_ext4, umask, "config:8-15,32-55");
+DEFINE_UNCORE_FORMAT_ATTR(umask_ext5, umask, "config:8-15,32-63");
+DEFINE_UNCORE_FORMAT_ATTR(qor, qor, "config:16");
+DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
+DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19");
+DEFINE_UNCORE_FORMAT_ATTR(tid_en2, tid_en, "config:16");
+DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
+DEFINE_UNCORE_FORMAT_ATTR(thresh9, thresh, "config:24-35");
+DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31");
+DEFINE_UNCORE_FORMAT_ATTR(thresh6, thresh, "config:24-29");
+DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28");
+DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15");
+DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30");
+DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51");
+DEFINE_UNCORE_FORMAT_ATTR(occ_edge_det, occ_edge_det, "config:31");
+DEFINE_UNCORE_FORMAT_ATTR(ch_mask, ch_mask, "config:36-43");
+DEFINE_UNCORE_FORMAT_ATTR(ch_mask2, ch_mask, "config:36-47");
+DEFINE_UNCORE_FORMAT_ATTR(fc_mask, fc_mask, "config:44-46");
+DEFINE_UNCORE_FORMAT_ATTR(fc_mask2, fc_mask, "config:48-50");
+DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4");
+DEFINE_UNCORE_FORMAT_ATTR(filter_tid2, filter_tid, "config1:0");
+DEFINE_UNCORE_FORMAT_ATTR(filter_tid3, filter_tid, "config1:0-5");
+DEFINE_UNCORE_FORMAT_ATTR(filter_tid4, filter_tid, "config1:0-8");
+DEFINE_UNCORE_FORMAT_ATTR(filter_tid5, filter_tid, "config1:0-9");
+DEFINE_UNCORE_FORMAT_ATTR(filter_cid, filter_cid, "config1:5");
+DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8");
+DEFINE_UNCORE_FORMAT_ATTR(filter_link2, filter_link, "config1:6-8");
+DEFINE_UNCORE_FORMAT_ATTR(filter_link3, filter_link, "config1:12");
+DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17");
+DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47");
+DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22");
+DEFINE_UNCORE_FORMAT_ATTR(filter_state2, filter_state, "config1:17-22");
+DEFINE_UNCORE_FORMAT_ATTR(filter_state3, filter_state, "config1:17-23");
+DEFINE_UNCORE_FORMAT_ATTR(filter_state4, filter_state, "config1:18-20");
+DEFINE_UNCORE_FORMAT_ATTR(filter_state5, filter_state, "config1:17-26");
+DEFINE_UNCORE_FORMAT_ATTR(filter_rem, filter_rem, "config1:32");
+DEFINE_UNCORE_FORMAT_ATTR(filter_loc, filter_loc, "config1:33");
+DEFINE_UNCORE_FORMAT_ATTR(filter_nm, filter_nm, "config1:36");
+DEFINE_UNCORE_FORMAT_ATTR(filter_not_nm, filter_not_nm, "config1:37");
+DEFINE_UNCORE_FORMAT_ATTR(filter_local, filter_local, "config1:33");
+DEFINE_UNCORE_FORMAT_ATTR(filter_all_op, filter_all_op, "config1:35");
+DEFINE_UNCORE_FORMAT_ATTR(filter_nnm, filter_nnm, "config1:37");
+DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31");
+DEFINE_UNCORE_FORMAT_ATTR(filter_opc2, filter_opc, "config1:52-60");
+DEFINE_UNCORE_FORMAT_ATTR(filter_opc3, filter_opc, "config1:41-60");
+DEFINE_UNCORE_FORMAT_ATTR(filter_opc_0, filter_opc0, "config1:41-50");
+DEFINE_UNCORE_FORMAT_ATTR(filter_opc_1, filter_opc1, "config1:51-60");
+DEFINE_UNCORE_FORMAT_ATTR(filter_nc, filter_nc, "config1:62");
+DEFINE_UNCORE_FORMAT_ATTR(filter_c6, filter_c6, "config1:61");
+DEFINE_UNCORE_FORMAT_ATTR(filter_isoc, filter_isoc, "config1:63");
+DEFINE_UNCORE_FORMAT_ATTR(filter_band0, filter_band0, "config1:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(filter_band1, filter_band1, "config1:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(filter_band2, filter_band2, "config1:16-23");
+DEFINE_UNCORE_FORMAT_ATTR(filter_band3, filter_band3, "config1:24-31");
+DEFINE_UNCORE_FORMAT_ATTR(match_rds, match_rds, "config1:48-51");
+DEFINE_UNCORE_FORMAT_ATTR(match_rnid30, match_rnid30, "config1:32-35");
+DEFINE_UNCORE_FORMAT_ATTR(match_rnid4, match_rnid4, "config1:31");
+DEFINE_UNCORE_FORMAT_ATTR(match_dnid, match_dnid, "config1:13-17");
+DEFINE_UNCORE_FORMAT_ATTR(match_mc, match_mc, "config1:9-12");
+DEFINE_UNCORE_FORMAT_ATTR(match_opc, match_opc, "config1:5-8");
+DEFINE_UNCORE_FORMAT_ATTR(match_vnw, match_vnw, "config1:3-4");
+DEFINE_UNCORE_FORMAT_ATTR(match0, match0, "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(match1, match1, "config1:32-63");
+DEFINE_UNCORE_FORMAT_ATTR(mask_rds, mask_rds, "config2:48-51");
+DEFINE_UNCORE_FORMAT_ATTR(mask_rnid30, mask_rnid30, "config2:32-35");
+DEFINE_UNCORE_FORMAT_ATTR(mask_rnid4, mask_rnid4, "config2:31");
+DEFINE_UNCORE_FORMAT_ATTR(mask_dnid, mask_dnid, "config2:13-17");
+DEFINE_UNCORE_FORMAT_ATTR(mask_mc, mask_mc, "config2:9-12");
+DEFINE_UNCORE_FORMAT_ATTR(mask_opc, mask_opc, "config2:5-8");
+DEFINE_UNCORE_FORMAT_ATTR(mask_vnw, mask_vnw, "config2:3-4");
+DEFINE_UNCORE_FORMAT_ATTR(mask0, mask0, "config2:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(mask1, mask1, "config2:32-63");
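+/*
+ * Each DEFINE_UNCORE_FORMAT_ATTR() above exposes a sysfs "format" entry
+ * whose value is a bit-range spec; e.g. "config:0-7" tells perf that the
+ * event select occupies bits 7:0 of perf_event_attr::config, and
+ * "config1:0-4" places filter_tid in the low bits of config1.
+ */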
+
+static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ int box_ctl = uncore_pci_box_ctl(box);
+ u32 config = 0;
+
+ if (!pci_read_config_dword(pdev, box_ctl, &config)) {
+ config |= SNBEP_PMON_BOX_CTL_FRZ;
+ pci_write_config_dword(pdev, box_ctl, config);
+ }
+}
+
+static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ int box_ctl = uncore_pci_box_ctl(box);
+ u32 config = 0;
+
+ if (!pci_read_config_dword(pdev, box_ctl, &config)) {
+ config &= ~SNBEP_PMON_BOX_CTL_FRZ;
+ pci_write_config_dword(pdev, box_ctl, config);
+ }
+}
+
+static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+
+ pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static void snbep_uncore_pci_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+
+ pci_write_config_dword(pdev, hwc->config_base, hwc->config);
+}
+
+static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+ u64 count = 0;
+
+ pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count);
+ pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1);
+
+ return count;
+}
+
+static void snbep_uncore_pci_init_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ int box_ctl = uncore_pci_box_ctl(box);
+
+ pci_write_config_dword(pdev, box_ctl, SNBEP_PMON_BOX_CTL_INT);
+}
+
+static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box)
+{
+ u64 config;
+ unsigned msr;
+
+ msr = uncore_msr_box_ctl(box);
+ if (msr) {
+ rdmsrq(msr, config);
+ config |= SNBEP_PMON_BOX_CTL_FRZ;
+ wrmsrq(msr, config);
+ }
+}
+
+static void snbep_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+ u64 config;
+ unsigned msr;
+
+ msr = uncore_msr_box_ctl(box);
+ if (msr) {
+ rdmsrq(msr, config);
+ config &= ~SNBEP_PMON_BOX_CTL_FRZ;
+ wrmsrq(msr, config);
+ }
+}
+
+static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+
+ if (reg1->idx != EXTRA_REG_NONE)
+ wrmsrq(reg1->reg, uncore_shared_reg_config(box, 0));
+
+ wrmsrq(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static void snbep_uncore_msr_disable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ wrmsrq(hwc->config_base, hwc->config);
+}
+
+static void snbep_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+ unsigned msr = uncore_msr_box_ctl(box);
+
+ if (msr)
+ wrmsrq(msr, SNBEP_PMON_BOX_CTL_INT);
+}
+
+static struct attribute *snbep_uncore_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ NULL,
+};
+
+static struct attribute *snbep_uncore_ubox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh5.attr,
+ NULL,
+};
+
+static struct attribute *snbep_uncore_cbox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_tid_en.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_filter_tid.attr,
+ &format_attr_filter_nid.attr,
+ &format_attr_filter_state.attr,
+ &format_attr_filter_opc.attr,
+ NULL,
+};
+
+static struct attribute *snbep_uncore_pcu_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_occ_sel.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh5.attr,
+ &format_attr_occ_invert.attr,
+ &format_attr_occ_edge.attr,
+ &format_attr_filter_band0.attr,
+ &format_attr_filter_band1.attr,
+ &format_attr_filter_band2.attr,
+ &format_attr_filter_band3.attr,
+ NULL,
+};
+
+static struct attribute *snbep_uncore_qpi_formats_attr[] = {
+ &format_attr_event_ext.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_match_rds.attr,
+ &format_attr_match_rnid30.attr,
+ &format_attr_match_rnid4.attr,
+ &format_attr_match_dnid.attr,
+ &format_attr_match_mc.attr,
+ &format_attr_match_opc.attr,
+ &format_attr_match_vnw.attr,
+ &format_attr_match0.attr,
+ &format_attr_match1.attr,
+ &format_attr_mask_rds.attr,
+ &format_attr_mask_rnid30.attr,
+ &format_attr_mask_rnid4.attr,
+ &format_attr_mask_dnid.attr,
+ &format_attr_mask_mc.attr,
+ &format_attr_mask_opc.attr,
+ &format_attr_mask_vnw.attr,
+ &format_attr_mask0.attr,
+ &format_attr_mask1.attr,
+ NULL,
+};
+
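+/*
+ * The .scale value 6.103515625e-5 below is 64/2^20, i.e. it converts a count
+ * of 64-byte CAS transfers into MiB.  Assuming the PMU shows up in sysfs as
+ * "uncore_imc_<n>", a typical invocation would be:
+ *   perf stat -a -e uncore_imc_0/cas_count_read/ sleep 1
+ */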
+static struct uncore_event_desc snbep_uncore_imc_events[] = {
+ INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"),
+ { /* end: all zeroes */ },
+};
+
+static struct uncore_event_desc snbep_uncore_qpi_events[] = {
+ INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x14"),
+ INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"),
+ INTEL_UNCORE_EVENT_DESC(drs_data, "event=0x102,umask=0x08"),
+ INTEL_UNCORE_EVENT_DESC(ncb_data, "event=0x103,umask=0x04"),
+ { /* end: all zeroes */ },
+};
+
+static const struct attribute_group snbep_uncore_format_group = {
+ .name = "format",
+ .attrs = snbep_uncore_formats_attr,
+};
+
+static const struct attribute_group snbep_uncore_ubox_format_group = {
+ .name = "format",
+ .attrs = snbep_uncore_ubox_formats_attr,
+};
+
+static const struct attribute_group snbep_uncore_cbox_format_group = {
+ .name = "format",
+ .attrs = snbep_uncore_cbox_formats_attr,
+};
+
+static const struct attribute_group snbep_uncore_pcu_format_group = {
+ .name = "format",
+ .attrs = snbep_uncore_pcu_formats_attr,
+};
+
+static const struct attribute_group snbep_uncore_qpi_format_group = {
+ .name = "format",
+ .attrs = snbep_uncore_qpi_formats_attr,
+};
+
+#define __SNBEP_UNCORE_MSR_OPS_COMMON_INIT() \
+ .disable_box = snbep_uncore_msr_disable_box, \
+ .enable_box = snbep_uncore_msr_enable_box, \
+ .disable_event = snbep_uncore_msr_disable_event, \
+ .enable_event = snbep_uncore_msr_enable_event, \
+ .read_counter = uncore_msr_read_counter
+
+#define SNBEP_UNCORE_MSR_OPS_COMMON_INIT() \
+ __SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), \
+ .init_box = snbep_uncore_msr_init_box
+
+static struct intel_uncore_ops snbep_uncore_msr_ops = {
+ SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+};
+
+#define SNBEP_UNCORE_PCI_OPS_COMMON_INIT() \
+ .init_box = snbep_uncore_pci_init_box, \
+ .disable_box = snbep_uncore_pci_disable_box, \
+ .enable_box = snbep_uncore_pci_enable_box, \
+ .disable_event = snbep_uncore_pci_disable_event, \
+ .read_counter = snbep_uncore_pci_read_counter
+
+static struct intel_uncore_ops snbep_uncore_pci_ops = {
+ SNBEP_UNCORE_PCI_OPS_COMMON_INIT(),
+ .enable_event = snbep_uncore_pci_enable_event,
+};
+
+static struct event_constraint snbep_uncore_cbox_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x01, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x02, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x04, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x05, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x07, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x09, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x13, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x1b, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0x1c, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0x1d, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0x1e, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0x1f, 0xe),
+ UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x35, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x3b, 0x1),
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint snbep_uncore_r2pcie_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x12, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint snbep_uncore_r3qpi_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2a, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2b, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2e, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2f, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x30, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type snbep_uncore_ubox = {
+ .name = "ubox",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .perf_ctr_bits = 44,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = SNBEP_U_MSR_PMON_CTR0,
+ .event_ctl = SNBEP_U_MSR_PMON_CTL0,
+ .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR,
+ .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL,
+ .ops = &snbep_uncore_msr_ops,
+ .format_group = &snbep_uncore_ubox_format_group,
+};
+
+static struct extra_reg snbep_uncore_cbox_extra_regs[] = {
+ SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
+ SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0x6),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0x6),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0x6),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x6),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xa),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xa),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xa),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xa),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x2),
+ EVENT_EXTRA_END
+};
+
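+/*
+ * Each C-Box has one shared filter register.  The get/put constraint
+ * callbacks below arbitrate access to its fields: every filter field has a
+ * 6-bit reference count packed into er->ref, and an event may only claim a
+ * field that is unused or already programmed with an identical value.
+ */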
+static void snbep_cbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct intel_uncore_extra_reg *er = &box->shared_regs[0];
+ int i;
+
+ if (uncore_box_is_fake(box))
+ return;
+
+ for (i = 0; i < 5; i++) {
+ if (reg1->alloc & (0x1 << i))
+ atomic_sub(1 << (i * 6), &er->ref);
+ }
+ reg1->alloc = 0;
+}
+
+static struct event_constraint *
+__snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event,
+ u64 (*cbox_filter_mask)(int fields))
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct intel_uncore_extra_reg *er = &box->shared_regs[0];
+ int i, alloc = 0;
+ unsigned long flags;
+ u64 mask;
+
+ if (reg1->idx == EXTRA_REG_NONE)
+ return NULL;
+
+ raw_spin_lock_irqsave(&er->lock, flags);
+ for (i = 0; i < 5; i++) {
+ if (!(reg1->idx & (0x1 << i)))
+ continue;
+ if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i)))
+ continue;
+
+ mask = cbox_filter_mask(0x1 << i);
+ if (!__BITS_VALUE(atomic_read(&er->ref), i, 6) ||
+ !((reg1->config ^ er->config) & mask)) {
+ atomic_add(1 << (i * 6), &er->ref);
+ er->config &= ~mask;
+ er->config |= reg1->config & mask;
+ alloc |= (0x1 << i);
+ } else {
+ break;
+ }
+ }
+ raw_spin_unlock_irqrestore(&er->lock, flags);
+ if (i < 5)
+ goto fail;
+
+ if (!uncore_box_is_fake(box))
+ reg1->alloc |= alloc;
+
+ return NULL;
+fail:
+ for (; i >= 0; i--) {
+ if (alloc & (0x1 << i))
+ atomic_sub(1 << (i * 6), &er->ref);
+ }
+ return &uncore_constraint_empty;
+}
+
+static u64 snbep_cbox_filter_mask(int fields)
+{
+ u64 mask = 0;
+
+ if (fields & 0x1)
+ mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_TID;
+ if (fields & 0x2)
+ mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_NID;
+ if (fields & 0x4)
+ mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE;
+ if (fields & 0x8)
+ mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC;
+
+ return mask;
+}
+
+static struct event_constraint *
+snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ return __snbep_cbox_get_constraint(box, event, snbep_cbox_filter_mask);
+}
+
+static int snbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct extra_reg *er;
+ int idx = 0;
+
+ for (er = snbep_uncore_cbox_extra_regs; er->msr; er++) {
+ if (er->event != (event->hw.config & er->config_mask))
+ continue;
+ idx |= er->idx;
+ }
+
+ if (idx) {
+ reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER +
+ SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
+ reg1->config = event->attr.config1 & snbep_cbox_filter_mask(idx);
+ reg1->idx = idx;
+ }
+ return 0;
+}
+
+static struct intel_uncore_ops snbep_uncore_cbox_ops = {
+ SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+ .hw_config = snbep_cbox_hw_config,
+ .get_constraint = snbep_cbox_get_constraint,
+ .put_constraint = snbep_cbox_put_constraint,
+};
+
+static struct intel_uncore_type snbep_uncore_cbox = {
+ .name = "cbox",
+ .num_counters = 4,
+ .num_boxes = 8,
+ .perf_ctr_bits = 44,
+ .event_ctl = SNBEP_C0_MSR_PMON_CTL0,
+ .perf_ctr = SNBEP_C0_MSR_PMON_CTR0,
+ .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL,
+ .msr_offset = SNBEP_CBO_MSR_OFFSET,
+ .num_shared_regs = 1,
+ .constraints = snbep_uncore_cbox_constraints,
+ .ops = &snbep_uncore_cbox_ops,
+ .format_group = &snbep_uncore_cbox_format_group,
+};
+
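+/*
+ * The PCU filter register holds four 8-bit filter bands, one per event
+ * select 0xb-0xe (see snbep_pcu_hw_config()).  When the preferred band is
+ * already taken, snbep_pcu_alter_er() moves the event's filter byte to a
+ * free band and adjusts the event select to match.
+ */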
+static u64 snbep_pcu_alter_er(struct perf_event *event, int new_idx, bool modify)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ u64 config = reg1->config;
+
+ if (new_idx > reg1->idx)
+ config <<= 8 * (new_idx - reg1->idx);
+ else
+ config >>= 8 * (reg1->idx - new_idx);
+
+ if (modify) {
+ hwc->config += new_idx - reg1->idx;
+ reg1->config = config;
+ reg1->idx = new_idx;
+ }
+ return config;
+}
+
+static struct event_constraint *
+snbep_pcu_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct intel_uncore_extra_reg *er = &box->shared_regs[0];
+ unsigned long flags;
+ int idx = reg1->idx;
+ u64 mask, config1 = reg1->config;
+ bool ok = false;
+
+ if (reg1->idx == EXTRA_REG_NONE ||
+ (!uncore_box_is_fake(box) && reg1->alloc))
+ return NULL;
+again:
+ mask = 0xffULL << (idx * 8);
+ raw_spin_lock_irqsave(&er->lock, flags);
+ if (!__BITS_VALUE(atomic_read(&er->ref), idx, 8) ||
+ !((config1 ^ er->config) & mask)) {
+ atomic_add(1 << (idx * 8), &er->ref);
+ er->config &= ~mask;
+ er->config |= config1 & mask;
+ ok = true;
+ }
+ raw_spin_unlock_irqrestore(&er->lock, flags);
+
+ if (!ok) {
+ idx = (idx + 1) % 4;
+ if (idx != reg1->idx) {
+ config1 = snbep_pcu_alter_er(event, idx, false);
+ goto again;
+ }
+ return &uncore_constraint_empty;
+ }
+
+ if (!uncore_box_is_fake(box)) {
+ if (idx != reg1->idx)
+ snbep_pcu_alter_er(event, idx, true);
+ reg1->alloc = 1;
+ }
+ return NULL;
+}
+
+static void snbep_pcu_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct intel_uncore_extra_reg *er = &box->shared_regs[0];
+
+ if (uncore_box_is_fake(box) || !reg1->alloc)
+ return;
+
+ atomic_sub(1 << (reg1->idx * 8), &er->ref);
+ reg1->alloc = 0;
+}
+
+static int snbep_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK;
+
+ if (ev_sel >= 0xb && ev_sel <= 0xe) {
+ reg1->reg = SNBEP_PCU_MSR_PMON_BOX_FILTER;
+ reg1->idx = ev_sel - 0xb;
+ reg1->config = event->attr.config1 & (0xff << (reg1->idx * 8));
+ }
+ return 0;
+}
+
+static struct intel_uncore_ops snbep_uncore_pcu_ops = {
+ SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+ .hw_config = snbep_pcu_hw_config,
+ .get_constraint = snbep_pcu_get_constraint,
+ .put_constraint = snbep_pcu_put_constraint,
+};
+
+static struct intel_uncore_type snbep_uncore_pcu = {
+ .name = "pcu",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0,
+ .event_ctl = SNBEP_PCU_MSR_PMON_CTL0,
+ .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL,
+ .num_shared_regs = 1,
+ .ops = &snbep_uncore_pcu_ops,
+ .format_group = &snbep_uncore_pcu_format_group,
+};
+
+static struct intel_uncore_type *snbep_msr_uncores[] = {
+ &snbep_uncore_ubox,
+ &snbep_uncore_cbox,
+ &snbep_uncore_pcu,
+ NULL,
+};
+
+void snbep_uncore_cpu_init(void)
+{
+ if (snbep_uncore_cbox.num_boxes > topology_num_cores_per_package())
+ snbep_uncore_cbox.num_boxes = topology_num_cores_per_package();
+ uncore_msr_uncores = snbep_msr_uncores;
+}
+
+enum {
+ SNBEP_PCI_QPI_PORT0_FILTER,
+ SNBEP_PCI_QPI_PORT1_FILTER,
+ BDX_PCI_QPI_PORT2_FILTER,
+};
+
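+/*
+ * QPI event 0x38 supports packet match/mask filtering.  The 64-bit match and
+ * mask values are taken from attr.config1 and attr.config2 and are written to
+ * a companion "filter" PCI device (the SNBEP_PCI_QPI_PORTx_FILTER entries in
+ * the device table) rather than to the QPI PMON device itself.
+ */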
+static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+ if ((hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK) == 0x38) {
+ reg1->idx = 0;
+ reg1->reg = SNBEP_Q_Py_PCI_PMON_PKT_MATCH0;
+ reg1->config = event->attr.config1;
+ reg2->reg = SNBEP_Q_Py_PCI_PMON_PKT_MASK0;
+ reg2->config = event->attr.config2;
+ }
+ return 0;
+}
+
+static void snbep_qpi_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+ if (reg1->idx != EXTRA_REG_NONE) {
+ int idx = box->pmu->pmu_idx + SNBEP_PCI_QPI_PORT0_FILTER;
+ int die = box->dieid;
+ struct pci_dev *filter_pdev = uncore_extra_pci_dev[die].dev[idx];
+
+ if (filter_pdev) {
+ pci_write_config_dword(filter_pdev, reg1->reg,
+ (u32)reg1->config);
+ pci_write_config_dword(filter_pdev, reg1->reg + 4,
+ (u32)(reg1->config >> 32));
+ pci_write_config_dword(filter_pdev, reg2->reg,
+ (u32)reg2->config);
+ pci_write_config_dword(filter_pdev, reg2->reg + 4,
+ (u32)(reg2->config >> 32));
+ }
+ }
+
+ pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static struct intel_uncore_ops snbep_uncore_qpi_ops = {
+ SNBEP_UNCORE_PCI_OPS_COMMON_INIT(),
+ .enable_event = snbep_qpi_enable_event,
+ .hw_config = snbep_qpi_hw_config,
+ .get_constraint = uncore_get_constraint,
+ .put_constraint = uncore_put_constraint,
+};
+
+#define SNBEP_UNCORE_PCI_COMMON_INIT() \
+ .perf_ctr = SNBEP_PCI_PMON_CTR0, \
+ .event_ctl = SNBEP_PCI_PMON_CTL0, \
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK, \
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \
+ .ops = &snbep_uncore_pci_ops, \
+ .format_group = &snbep_uncore_format_group
+
+static struct intel_uncore_type snbep_uncore_ha = {
+ .name = "ha",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type snbep_uncore_imc = {
+ .name = "imc",
+ .num_counters = 4,
+ .num_boxes = 4,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
+ .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
+ .event_descs = snbep_uncore_imc_events,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type snbep_uncore_qpi = {
+ .name = "qpi",
+ .num_counters = 4,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SNBEP_PCI_PMON_CTR0,
+ .event_ctl = SNBEP_PCI_PMON_CTL0,
+ .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .num_shared_regs = 1,
+ .ops = &snbep_uncore_qpi_ops,
+ .event_descs = snbep_uncore_qpi_events,
+ .format_group = &snbep_uncore_qpi_format_group,
+};
+
+static struct intel_uncore_type snbep_uncore_r2pcie = {
+ .name = "r2pcie",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 44,
+ .constraints = snbep_uncore_r2pcie_constraints,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type snbep_uncore_r3qpi = {
+ .name = "r3qpi",
+ .num_counters = 3,
+ .num_boxes = 2,
+ .perf_ctr_bits = 44,
+ .constraints = snbep_uncore_r3qpi_constraints,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+enum {
+ SNBEP_PCI_UNCORE_HA,
+ SNBEP_PCI_UNCORE_IMC,
+ SNBEP_PCI_UNCORE_QPI,
+ SNBEP_PCI_UNCORE_R2PCIE,
+ SNBEP_PCI_UNCORE_R3QPI,
+};
+
+static struct intel_uncore_type *snbep_pci_uncores[] = {
+ [SNBEP_PCI_UNCORE_HA] = &snbep_uncore_ha,
+ [SNBEP_PCI_UNCORE_IMC] = &snbep_uncore_imc,
+ [SNBEP_PCI_UNCORE_QPI] = &snbep_uncore_qpi,
+ [SNBEP_PCI_UNCORE_R2PCIE] = &snbep_uncore_r2pcie,
+ [SNBEP_PCI_UNCORE_R3QPI] = &snbep_uncore_r3qpi,
+ NULL,
+};
+
+static const struct pci_device_id snbep_uncore_pci_ids[] = {
+ { /* Home Agent */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_HA, 0),
+ },
+ { /* MC Channel 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 0),
+ },
+ { /* MC Channel 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 1),
+ },
+ { /* MC Channel 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 2),
+ },
+ { /* MC Channel 3 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 3),
+ },
+ { /* QPI Port 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 0),
+ },
+ { /* QPI Port 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 1),
+ },
+ { /* R2PCIe */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R2PCIE, 0),
+ },
+ { /* R3QPI Link 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 0),
+ },
+ { /* R3QPI Link 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 1),
+ },
+ { /* QPI Port 0 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c86),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ SNBEP_PCI_QPI_PORT0_FILTER),
+ },
+ { /* QPI Port 0 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c96),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ SNBEP_PCI_QPI_PORT1_FILTER),
+ },
+ { /* end: all zeroes */ }
+};
+
+static struct pci_driver snbep_uncore_pci_driver = {
+ .name = "snbep_uncore",
+ .id_table = snbep_uncore_pci_ids,
+};
+
+#define NODE_ID_MASK 0x7
+
+/* Each 3-bit field in bits 0-23 of the GIDNIDMAP register maps to a Node ID. */
+#define GIDNIDMAP(config, id) (((config) >> (3 * (id))) & 0x7)
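+/* e.g. GIDNIDMAP(config, 2) extracts bits 8:6 of the mapping register. */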
+
+static int upi_nodeid_groupid(struct pci_dev *ubox_dev, int nodeid_loc, int idmap_loc,
+ int *nodeid, int *groupid)
+{
+ int ret;
+
+ /* get the Node ID of the local register */
+ ret = pci_read_config_dword(ubox_dev, nodeid_loc, nodeid);
+ if (ret)
+ goto err;
+
+ *nodeid = *nodeid & NODE_ID_MASK;
+ /* get the Node ID mapping */
+ ret = pci_read_config_dword(ubox_dev, idmap_loc, groupid);
+ if (ret)
+ goto err;
+err:
+ return ret;
+}
+
+static int topology_gidnid_map(int nodeid, u32 gidnid)
+{
+ int i, die_id = -1;
+
+ /*
+ * Each 3-bit field in the Node ID mapping register maps
+ * to a particular node.
+ */
+ for (i = 0; i < 8; i++) {
+ if (nodeid == GIDNIDMAP(gidnid, i)) {
+ if (topology_max_dies_per_package() > 1)
+ die_id = i;
+ else
+ die_id = topology_phys_to_logical_pkg(i);
+ if (die_id < 0)
+ die_id = -ENODEV;
+ break;
+ }
+ }
+
+ return die_id;
+}
+
+/*
+ * build pci bus to socket mapping
+ */
+static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool reverse)
+{
+ struct pci_dev *ubox_dev = NULL;
+ int i, bus, nodeid, segment, die_id;
+ struct pci2phy_map *map;
+ int err = 0;
+ u32 config = 0;
+
+ while (1) {
+ /* find the UBOX device */
+ ubox_dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, ubox_dev);
+ if (!ubox_dev)
+ break;
+ bus = ubox_dev->bus->number;
+ /*
+ * The nodeid and idmap registers only contain enough
+ * information to handle 8 nodes. On systems with more
+ * than 8 nodes, we need to rely on NUMA information
+ * supplied by the BIOS to determine the topology.
+ */
+ if (nr_node_ids <= 8) {
+ err = upi_nodeid_groupid(ubox_dev, nodeid_loc, idmap_loc,
+ &nodeid, &config);
+ if (err)
+ break;
+
+ segment = pci_domain_nr(ubox_dev->bus);
+ raw_spin_lock(&pci2phy_map_lock);
+ map = __find_pci2phy_map(segment);
+ if (!map) {
+ raw_spin_unlock(&pci2phy_map_lock);
+ err = -ENOMEM;
+ break;
+ }
+
+ map->pbus_to_dieid[bus] = topology_gidnid_map(nodeid, config);
+ raw_spin_unlock(&pci2phy_map_lock);
+ } else {
+ segment = pci_domain_nr(ubox_dev->bus);
+ raw_spin_lock(&pci2phy_map_lock);
+ map = __find_pci2phy_map(segment);
+ if (!map) {
+ raw_spin_unlock(&pci2phy_map_lock);
+ err = -ENOMEM;
+ break;
+ }
+
+ map->pbus_to_dieid[bus] = die_id = uncore_device_to_die(ubox_dev);
+
+ raw_spin_unlock(&pci2phy_map_lock);
+
+ if (WARN_ON_ONCE(die_id == -1)) {
+ err = -EINVAL;
+ break;
+ }
+ }
+ }
+
+ if (!err) {
+ /*
+ * For a PCI bus with no UBOX device, find the next bus
+ * that has a UBOX device and use its mapping.
+ */
+ raw_spin_lock(&pci2phy_map_lock);
+ list_for_each_entry(map, &pci2phy_map_head, list) {
+ i = -1;
+ if (reverse) {
+ for (bus = 255; bus >= 0; bus--) {
+ if (map->pbus_to_dieid[bus] != -1)
+ i = map->pbus_to_dieid[bus];
+ else
+ map->pbus_to_dieid[bus] = i;
+ }
+ } else {
+ for (bus = 0; bus <= 255; bus++) {
+ if (map->pbus_to_dieid[bus] != -1)
+ i = map->pbus_to_dieid[bus];
+ else
+ map->pbus_to_dieid[bus] = i;
+ }
+ }
+ }
+ raw_spin_unlock(&pci2phy_map_lock);
+ }
+
+ pci_dev_put(ubox_dev);
+
+ return pcibios_err_to_errno(err);
+}
+
+int snbep_uncore_pci_init(void)
+{
+ int ret = snbep_pci2phy_map_init(0x3ce0, SNBEP_CPUNODEID, SNBEP_GIDNIDMAP, true);
+ if (ret)
+ return ret;
+ uncore_pci_uncores = snbep_pci_uncores;
+ uncore_pci_driver = &snbep_uncore_pci_driver;
+ return 0;
+}
+/* end of Sandy Bridge-EP uncore support */
+
+/* IvyTown uncore support */
+static void ivbep_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+ unsigned msr = uncore_msr_box_ctl(box);
+ if (msr)
+ wrmsrq(msr, IVBEP_PMON_BOX_CTL_INT);
+}
+
+static void ivbep_uncore_pci_init_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+
+ pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, IVBEP_PMON_BOX_CTL_INT);
+}
+
+#define IVBEP_UNCORE_MSR_OPS_COMMON_INIT() \
+ .init_box = ivbep_uncore_msr_init_box, \
+ .disable_box = snbep_uncore_msr_disable_box, \
+ .enable_box = snbep_uncore_msr_enable_box, \
+ .disable_event = snbep_uncore_msr_disable_event, \
+ .enable_event = snbep_uncore_msr_enable_event, \
+ .read_counter = uncore_msr_read_counter
+
+static struct intel_uncore_ops ivbep_uncore_msr_ops = {
+ IVBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+};
+
+static struct intel_uncore_ops ivbep_uncore_pci_ops = {
+ .init_box = ivbep_uncore_pci_init_box,
+ .disable_box = snbep_uncore_pci_disable_box,
+ .enable_box = snbep_uncore_pci_enable_box,
+ .disable_event = snbep_uncore_pci_disable_event,
+ .enable_event = snbep_uncore_pci_enable_event,
+ .read_counter = snbep_uncore_pci_read_counter,
+};
+
+#define IVBEP_UNCORE_PCI_COMMON_INIT() \
+ .perf_ctr = SNBEP_PCI_PMON_CTR0, \
+ .event_ctl = SNBEP_PCI_PMON_CTL0, \
+ .event_mask = IVBEP_PMON_RAW_EVENT_MASK, \
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \
+ .ops = &ivbep_uncore_pci_ops, \
+ .format_group = &ivbep_uncore_format_group
+
+static struct attribute *ivbep_uncore_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ NULL,
+};
+
+static struct attribute *ivbep_uncore_ubox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh5.attr,
+ NULL,
+};
+
+static struct attribute *ivbep_uncore_cbox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_tid_en.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_filter_tid.attr,
+ &format_attr_filter_link.attr,
+ &format_attr_filter_state2.attr,
+ &format_attr_filter_nid2.attr,
+ &format_attr_filter_opc2.attr,
+ &format_attr_filter_nc.attr,
+ &format_attr_filter_c6.attr,
+ &format_attr_filter_isoc.attr,
+ NULL,
+};
+
+static struct attribute *ivbep_uncore_pcu_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_occ_sel.attr,
+ &format_attr_edge.attr,
+ &format_attr_thresh5.attr,
+ &format_attr_occ_invert.attr,
+ &format_attr_occ_edge.attr,
+ &format_attr_filter_band0.attr,
+ &format_attr_filter_band1.attr,
+ &format_attr_filter_band2.attr,
+ &format_attr_filter_band3.attr,
+ NULL,
+};
+
+static struct attribute *ivbep_uncore_qpi_formats_attr[] = {
+ &format_attr_event_ext.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_match_rds.attr,
+ &format_attr_match_rnid30.attr,
+ &format_attr_match_rnid4.attr,
+ &format_attr_match_dnid.attr,
+ &format_attr_match_mc.attr,
+ &format_attr_match_opc.attr,
+ &format_attr_match_vnw.attr,
+ &format_attr_match0.attr,
+ &format_attr_match1.attr,
+ &format_attr_mask_rds.attr,
+ &format_attr_mask_rnid30.attr,
+ &format_attr_mask_rnid4.attr,
+ &format_attr_mask_dnid.attr,
+ &format_attr_mask_mc.attr,
+ &format_attr_mask_opc.attr,
+ &format_attr_mask_vnw.attr,
+ &format_attr_mask0.attr,
+ &format_attr_mask1.attr,
+ NULL,
+};
+
+static const struct attribute_group ivbep_uncore_format_group = {
+ .name = "format",
+ .attrs = ivbep_uncore_formats_attr,
+};
+
+static const struct attribute_group ivbep_uncore_ubox_format_group = {
+ .name = "format",
+ .attrs = ivbep_uncore_ubox_formats_attr,
+};
+
+static const struct attribute_group ivbep_uncore_cbox_format_group = {
+ .name = "format",
+ .attrs = ivbep_uncore_cbox_formats_attr,
+};
+
+static const struct attribute_group ivbep_uncore_pcu_format_group = {
+ .name = "format",
+ .attrs = ivbep_uncore_pcu_formats_attr,
+};
+
+static const struct attribute_group ivbep_uncore_qpi_format_group = {
+ .name = "format",
+ .attrs = ivbep_uncore_qpi_formats_attr,
+};
+
+static struct intel_uncore_type ivbep_uncore_ubox = {
+ .name = "ubox",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .perf_ctr_bits = 44,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = SNBEP_U_MSR_PMON_CTR0,
+ .event_ctl = SNBEP_U_MSR_PMON_CTL0,
+ .event_mask = IVBEP_U_MSR_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR,
+ .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL,
+ .ops = &ivbep_uncore_msr_ops,
+ .format_group = &ivbep_uncore_ubox_format_group,
+};
+
+static struct extra_reg ivbep_uncore_cbox_extra_regs[] = {
+ SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
+ SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x1031, 0x10ff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x5134, 0xffff, 0xc),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0xc),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0xc),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0xc),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2335, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0x18),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0x18),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x8135, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x5036, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x8136, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x8336, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x8),
+ EVENT_EXTRA_END
+};
+
+static u64 ivbep_cbox_filter_mask(int fields)
+{
+ u64 mask = 0;
+
+ if (fields & 0x1)
+ mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_TID;
+ if (fields & 0x2)
+ mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_LINK;
+ if (fields & 0x4)
+ mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_STATE;
+ if (fields & 0x8)
+ mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_NID;
+ if (fields & 0x10) {
+ mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_OPC;
+ mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_NC;
+ mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_C6;
+ mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_ISOC;
+ }
+
+ return mask;
+}
+
+static struct event_constraint *
+ivbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ return __snbep_cbox_get_constraint(box, event, ivbep_cbox_filter_mask);
+}
+
+static int ivbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct extra_reg *er;
+ int idx = 0;
+
+ for (er = ivbep_uncore_cbox_extra_regs; er->msr; er++) {
+ if (er->event != (event->hw.config & er->config_mask))
+ continue;
+ idx |= er->idx;
+ }
+
+ if (idx) {
+ reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER +
+ SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
+ reg1->config = event->attr.config1 & ivbep_cbox_filter_mask(idx);
+ reg1->idx = idx;
+ }
+ return 0;
+}
+
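+/*
+ * The IvyTown C-Box filter value is 64 bits wide: the low 32 bits are written
+ * to the filter register itself and the high 32 bits to the MSR 6 above it.
+ */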
+static void ivbep_cbox_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+
+ if (reg1->idx != EXTRA_REG_NONE) {
+ u64 filter = uncore_shared_reg_config(box, 0);
+ wrmsrq(reg1->reg, filter & 0xffffffff);
+ wrmsrq(reg1->reg + 6, filter >> 32);
+ }
+
+ wrmsrq(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static struct intel_uncore_ops ivbep_uncore_cbox_ops = {
+ .init_box = ivbep_uncore_msr_init_box,
+ .disable_box = snbep_uncore_msr_disable_box,
+ .enable_box = snbep_uncore_msr_enable_box,
+ .disable_event = snbep_uncore_msr_disable_event,
+ .enable_event = ivbep_cbox_enable_event,
+ .read_counter = uncore_msr_read_counter,
+ .hw_config = ivbep_cbox_hw_config,
+ .get_constraint = ivbep_cbox_get_constraint,
+ .put_constraint = snbep_cbox_put_constraint,
+};
+
+static struct intel_uncore_type ivbep_uncore_cbox = {
+ .name = "cbox",
+ .num_counters = 4,
+ .num_boxes = 15,
+ .perf_ctr_bits = 44,
+ .event_ctl = SNBEP_C0_MSR_PMON_CTL0,
+ .perf_ctr = SNBEP_C0_MSR_PMON_CTR0,
+ .event_mask = IVBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL,
+ .msr_offset = SNBEP_CBO_MSR_OFFSET,
+ .num_shared_regs = 1,
+ .constraints = snbep_uncore_cbox_constraints,
+ .ops = &ivbep_uncore_cbox_ops,
+ .format_group = &ivbep_uncore_cbox_format_group,
+};
+
+static struct intel_uncore_ops ivbep_uncore_pcu_ops = {
+ IVBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+ .hw_config = snbep_pcu_hw_config,
+ .get_constraint = snbep_pcu_get_constraint,
+ .put_constraint = snbep_pcu_put_constraint,
+};
+
+static struct intel_uncore_type ivbep_uncore_pcu = {
+ .name = "pcu",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0,
+ .event_ctl = SNBEP_PCU_MSR_PMON_CTL0,
+ .event_mask = IVBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL,
+ .num_shared_regs = 1,
+ .ops = &ivbep_uncore_pcu_ops,
+ .format_group = &ivbep_uncore_pcu_format_group,
+};
+
+static struct intel_uncore_type *ivbep_msr_uncores[] = {
+ &ivbep_uncore_ubox,
+ &ivbep_uncore_cbox,
+ &ivbep_uncore_pcu,
+ NULL,
+};
+
+void ivbep_uncore_cpu_init(void)
+{
+ if (ivbep_uncore_cbox.num_boxes > topology_num_cores_per_package())
+ ivbep_uncore_cbox.num_boxes = topology_num_cores_per_package();
+ uncore_msr_uncores = ivbep_msr_uncores;
+}
+
+static struct intel_uncore_type ivbep_uncore_ha = {
+ .name = "ha",
+ .num_counters = 4,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ IVBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type ivbep_uncore_imc = {
+ .name = "imc",
+ .num_counters = 4,
+ .num_boxes = 8,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
+ .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
+ .event_descs = snbep_uncore_imc_events,
+ IVBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+/*
+ * The event control and counter registers in the IRP box are not laid out at
+ * regular offsets, so explicit per-counter offset tables are used instead.
+ */
+static unsigned ivbep_uncore_irp_ctls[] = {0xd8, 0xdc, 0xe0, 0xe4};
+static unsigned ivbep_uncore_irp_ctrs[] = {0xa0, 0xb0, 0xb8, 0xc0};
+
+static void ivbep_uncore_irp_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+
+ pci_write_config_dword(pdev, ivbep_uncore_irp_ctls[hwc->idx],
+ hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static void ivbep_uncore_irp_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+
+ pci_write_config_dword(pdev, ivbep_uncore_irp_ctls[hwc->idx], hwc->config);
+}
+
+static u64 ivbep_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+ u64 count = 0;
+
+ pci_read_config_dword(pdev, ivbep_uncore_irp_ctrs[hwc->idx], (u32 *)&count);
+ pci_read_config_dword(pdev, ivbep_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1);
+
+ return count;
+}
+
+static struct intel_uncore_ops ivbep_uncore_irp_ops = {
+ .init_box = ivbep_uncore_pci_init_box,
+ .disable_box = snbep_uncore_pci_disable_box,
+ .enable_box = snbep_uncore_pci_enable_box,
+ .disable_event = ivbep_uncore_irp_disable_event,
+ .enable_event = ivbep_uncore_irp_enable_event,
+ .read_counter = ivbep_uncore_irp_read_counter,
+};
+
+static struct intel_uncore_type ivbep_uncore_irp = {
+ .name = "irp",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .event_mask = IVBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .ops = &ivbep_uncore_irp_ops,
+ .format_group = &ivbep_uncore_format_group,
+};
+
+static struct intel_uncore_ops ivbep_uncore_qpi_ops = {
+ .init_box = ivbep_uncore_pci_init_box,
+ .disable_box = snbep_uncore_pci_disable_box,
+ .enable_box = snbep_uncore_pci_enable_box,
+ .disable_event = snbep_uncore_pci_disable_event,
+ .enable_event = snbep_qpi_enable_event,
+ .read_counter = snbep_uncore_pci_read_counter,
+ .hw_config = snbep_qpi_hw_config,
+ .get_constraint = uncore_get_constraint,
+ .put_constraint = uncore_put_constraint,
+};
+
+static struct intel_uncore_type ivbep_uncore_qpi = {
+ .name = "qpi",
+ .num_counters = 4,
+ .num_boxes = 3,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SNBEP_PCI_PMON_CTR0,
+ .event_ctl = SNBEP_PCI_PMON_CTL0,
+ .event_mask = IVBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .num_shared_regs = 1,
+ .ops = &ivbep_uncore_qpi_ops,
+ .format_group = &ivbep_uncore_qpi_format_group,
+};
+
+static struct intel_uncore_type ivbep_uncore_r2pcie = {
+ .name = "r2pcie",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 44,
+ .constraints = snbep_uncore_r2pcie_constraints,
+ IVBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type ivbep_uncore_r3qpi = {
+ .name = "r3qpi",
+ .num_counters = 3,
+ .num_boxes = 2,
+ .perf_ctr_bits = 44,
+ .constraints = snbep_uncore_r3qpi_constraints,
+ IVBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+enum {
+ IVBEP_PCI_UNCORE_HA,
+ IVBEP_PCI_UNCORE_IMC,
+ IVBEP_PCI_UNCORE_IRP,
+ IVBEP_PCI_UNCORE_QPI,
+ IVBEP_PCI_UNCORE_R2PCIE,
+ IVBEP_PCI_UNCORE_R3QPI,
+};
+
+static struct intel_uncore_type *ivbep_pci_uncores[] = {
+ [IVBEP_PCI_UNCORE_HA] = &ivbep_uncore_ha,
+ [IVBEP_PCI_UNCORE_IMC] = &ivbep_uncore_imc,
+ [IVBEP_PCI_UNCORE_IRP] = &ivbep_uncore_irp,
+ [IVBEP_PCI_UNCORE_QPI] = &ivbep_uncore_qpi,
+ [IVBEP_PCI_UNCORE_R2PCIE] = &ivbep_uncore_r2pcie,
+ [IVBEP_PCI_UNCORE_R3QPI] = &ivbep_uncore_r3qpi,
+ NULL,
+};
+
+static const struct pci_device_id ivbep_uncore_pci_ids[] = {
+ { /* Home Agent 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe30),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_HA, 0),
+ },
+ { /* Home Agent 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe38),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_HA, 1),
+ },
+ { /* MC0 Channel 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb4),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 0),
+ },
+ { /* MC0 Channel 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb5),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 1),
+ },
+ { /* MC0 Channel 3 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb0),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 2),
+ },
+ { /* MC0 Channel 4 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb1),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 3),
+ },
+ { /* MC1 Channel 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef4),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 4),
+ },
+ { /* MC1 Channel 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef5),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 5),
+ },
+ { /* MC1 Channel 3 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef0),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 6),
+ },
+ { /* MC1 Channel 4 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef1),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 7),
+ },
+ { /* IRP */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe39),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IRP, 0),
+ },
+ { /* QPI0 Port 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe32),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 0),
+ },
+ { /* QPI0 Port 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe33),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 1),
+ },
+ { /* QPI1 Port 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3a),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 2),
+ },
+ { /* R2PCIe */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe34),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R2PCIE, 0),
+ },
+ { /* R3QPI0 Link 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe36),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 0),
+ },
+ { /* R3QPI0 Link 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe37),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 1),
+ },
+ { /* R3QPI1 Link 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3e),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 2),
+ },
+ { /* QPI Port 0 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe86),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ SNBEP_PCI_QPI_PORT0_FILTER),
+ },
+ { /* QPI Port 0 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe96),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ SNBEP_PCI_QPI_PORT1_FILTER),
+ },
+ { /* end: all zeroes */ }
+};
+
+static struct pci_driver ivbep_uncore_pci_driver = {
+ .name = "ivbep_uncore",
+ .id_table = ivbep_uncore_pci_ids,
+};
+
+int ivbep_uncore_pci_init(void)
+{
+ int ret = snbep_pci2phy_map_init(0x0e1e, SNBEP_CPUNODEID, SNBEP_GIDNIDMAP, true);
+ if (ret)
+ return ret;
+ uncore_pci_uncores = ivbep_pci_uncores;
+ uncore_pci_driver = &ivbep_uncore_pci_driver;
+ return 0;
+}
+/* end of IvyTown uncore support */
+
+/* KNL uncore support */
+static struct attribute *knl_uncore_ubox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_tid_en.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh5.attr,
+ NULL,
+};
+
+static const struct attribute_group knl_uncore_ubox_format_group = {
+ .name = "format",
+ .attrs = knl_uncore_ubox_formats_attr,
+};
+
+static struct intel_uncore_type knl_uncore_ubox = {
+ .name = "ubox",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = HSWEP_U_MSR_PMON_CTR0,
+ .event_ctl = HSWEP_U_MSR_PMON_CTL0,
+ .event_mask = KNL_U_MSR_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR,
+ .fixed_ctl = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL,
+ .ops = &snbep_uncore_msr_ops,
+ .format_group = &knl_uncore_ubox_format_group,
+};
+
+static struct attribute *knl_uncore_cha_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_qor.attr,
+ &format_attr_edge.attr,
+ &format_attr_tid_en.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_filter_tid4.attr,
+ &format_attr_filter_link3.attr,
+ &format_attr_filter_state4.attr,
+ &format_attr_filter_local.attr,
+ &format_attr_filter_all_op.attr,
+ &format_attr_filter_nnm.attr,
+ &format_attr_filter_opc3.attr,
+ &format_attr_filter_nc.attr,
+ &format_attr_filter_isoc.attr,
+ NULL,
+};
+
+static const struct attribute_group knl_uncore_cha_format_group = {
+ .name = "format",
+ .attrs = knl_uncore_cha_formats_attr,
+};
+
+static struct event_constraint knl_uncore_cha_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x1f, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+ EVENT_CONSTRAINT_END
+};
+
+static struct extra_reg knl_uncore_cha_extra_regs[] = {
+ SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
+ SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x3d, 0xff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x35, 0xff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x36, 0xff, 0x4),
+ EVENT_EXTRA_END
+};
+
+static u64 knl_cha_filter_mask(int fields)
+{
+ u64 mask = 0;
+
+ if (fields & 0x1)
+ mask |= KNL_CHA_MSR_PMON_BOX_FILTER_TID;
+ if (fields & 0x2)
+ mask |= KNL_CHA_MSR_PMON_BOX_FILTER_STATE;
+ if (fields & 0x4)
+ mask |= KNL_CHA_MSR_PMON_BOX_FILTER_OP;
+ return mask;
+}
+
+static struct event_constraint *
+knl_cha_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ return __snbep_cbox_get_constraint(box, event, knl_cha_filter_mask);
+}
+
+static int knl_cha_hw_config(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct extra_reg *er;
+ int idx = 0;
+
+ for (er = knl_uncore_cha_extra_regs; er->msr; er++) {
+ if (er->event != (event->hw.config & er->config_mask))
+ continue;
+ idx |= er->idx;
+ }
+
+ if (idx) {
+ reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 +
+ KNL_CHA_MSR_OFFSET * box->pmu->pmu_idx;
+ reg1->config = event->attr.config1 & knl_cha_filter_mask(idx);
+
+ reg1->config |= KNL_CHA_MSR_PMON_BOX_FILTER_REMOTE_NODE;
+ reg1->config |= KNL_CHA_MSR_PMON_BOX_FILTER_LOCAL_NODE;
+ reg1->config |= KNL_CHA_MSR_PMON_BOX_FILTER_NNC;
+ reg1->idx = idx;
+ }
+ return 0;
+}
+
+static void hswep_cbox_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event);
+
+static struct intel_uncore_ops knl_uncore_cha_ops = {
+ .init_box = snbep_uncore_msr_init_box,
+ .disable_box = snbep_uncore_msr_disable_box,
+ .enable_box = snbep_uncore_msr_enable_box,
+ .disable_event = snbep_uncore_msr_disable_event,
+ .enable_event = hswep_cbox_enable_event,
+ .read_counter = uncore_msr_read_counter,
+ .hw_config = knl_cha_hw_config,
+ .get_constraint = knl_cha_get_constraint,
+ .put_constraint = snbep_cbox_put_constraint,
+};
+
+static struct intel_uncore_type knl_uncore_cha = {
+ .name = "cha",
+ .num_counters = 4,
+ .num_boxes = 38,
+ .perf_ctr_bits = 48,
+ .event_ctl = HSWEP_C0_MSR_PMON_CTL0,
+ .perf_ctr = HSWEP_C0_MSR_PMON_CTR0,
+ .event_mask = KNL_CHA_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = HSWEP_C0_MSR_PMON_BOX_CTL,
+ .msr_offset = KNL_CHA_MSR_OFFSET,
+ .num_shared_regs = 1,
+ .constraints = knl_uncore_cha_constraints,
+ .ops = &knl_uncore_cha_ops,
+ .format_group = &knl_uncore_cha_format_group,
+};
+
+static struct attribute *knl_uncore_pcu_formats_attr[] = {
+ &format_attr_event2.attr,
+ &format_attr_use_occ_ctr.attr,
+ &format_attr_occ_sel.attr,
+ &format_attr_edge.attr,
+ &format_attr_tid_en.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh6.attr,
+ &format_attr_occ_invert.attr,
+ &format_attr_occ_edge_det.attr,
+ NULL,
+};
+
+static const struct attribute_group knl_uncore_pcu_format_group = {
+ .name = "format",
+ .attrs = knl_uncore_pcu_formats_attr,
+};
+
+static struct intel_uncore_type knl_uncore_pcu = {
+ .name = "pcu",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .perf_ctr = HSWEP_PCU_MSR_PMON_CTR0,
+ .event_ctl = HSWEP_PCU_MSR_PMON_CTL0,
+ .event_mask = KNL_PCU_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = HSWEP_PCU_MSR_PMON_BOX_CTL,
+ .ops = &snbep_uncore_msr_ops,
+ .format_group = &knl_uncore_pcu_format_group,
+};
+
+static struct intel_uncore_type *knl_msr_uncores[] = {
+ &knl_uncore_ubox,
+ &knl_uncore_cha,
+ &knl_uncore_pcu,
+ NULL,
+};
+
+void knl_uncore_cpu_init(void)
+{
+ uncore_msr_uncores = knl_msr_uncores;
+}
+
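+/*
+ * KNL IMC/EDC boxes are enabled by clearing the entire box control register
+ * rather than just the FRZ bit, and fixed events are enabled with
+ * KNL_PMON_FIXED_CTL_EN instead of the common SNBEP_PMON_CTL_EN bit.
+ */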
+static void knl_uncore_imc_enable_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ int box_ctl = uncore_pci_box_ctl(box);
+
+ pci_write_config_dword(pdev, box_ctl, 0);
+}
+
+static void knl_uncore_imc_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+
+ if ((event->attr.config & SNBEP_PMON_CTL_EV_SEL_MASK)
+ == UNCORE_FIXED_EVENT)
+ pci_write_config_dword(pdev, hwc->config_base,
+ hwc->config | KNL_PMON_FIXED_CTL_EN);
+ else
+ pci_write_config_dword(pdev, hwc->config_base,
+ hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static struct intel_uncore_ops knl_uncore_imc_ops = {
+ .init_box = snbep_uncore_pci_init_box,
+ .disable_box = snbep_uncore_pci_disable_box,
+ .enable_box = knl_uncore_imc_enable_box,
+ .read_counter = snbep_uncore_pci_read_counter,
+ .enable_event = knl_uncore_imc_enable_event,
+ .disable_event = snbep_uncore_pci_disable_event,
+};
+
+static struct intel_uncore_type knl_uncore_imc_uclk = {
+ .name = "imc_uclk",
+ .num_counters = 4,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = KNL_UCLK_MSR_PMON_CTR0_LOW,
+ .event_ctl = KNL_UCLK_MSR_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW,
+ .fixed_ctl = KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL,
+ .box_ctl = KNL_UCLK_MSR_PMON_BOX_CTL,
+ .ops = &knl_uncore_imc_ops,
+ .format_group = &snbep_uncore_format_group,
+};
+
+static struct intel_uncore_type knl_uncore_imc_dclk = {
+ .name = "imc",
+ .num_counters = 4,
+ .num_boxes = 6,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = KNL_MC0_CH0_MSR_PMON_CTR0_LOW,
+ .event_ctl = KNL_MC0_CH0_MSR_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = KNL_MC0_CH0_MSR_PMON_FIXED_LOW,
+ .fixed_ctl = KNL_MC0_CH0_MSR_PMON_FIXED_CTL,
+ .box_ctl = KNL_MC0_CH0_MSR_PMON_BOX_CTL,
+ .ops = &knl_uncore_imc_ops,
+ .format_group = &snbep_uncore_format_group,
+};
+
+static struct intel_uncore_type knl_uncore_edc_uclk = {
+ .name = "edc_uclk",
+ .num_counters = 4,
+ .num_boxes = 8,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = KNL_UCLK_MSR_PMON_CTR0_LOW,
+ .event_ctl = KNL_UCLK_MSR_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW,
+ .fixed_ctl = KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL,
+ .box_ctl = KNL_UCLK_MSR_PMON_BOX_CTL,
+ .ops = &knl_uncore_imc_ops,
+ .format_group = &snbep_uncore_format_group,
+};
+
+static struct intel_uncore_type knl_uncore_edc_eclk = {
+ .name = "edc_eclk",
+ .num_counters = 4,
+ .num_boxes = 8,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = KNL_EDC0_ECLK_MSR_PMON_CTR0_LOW,
+ .event_ctl = KNL_EDC0_ECLK_MSR_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_LOW,
+ .fixed_ctl = KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_CTL,
+ .box_ctl = KNL_EDC0_ECLK_MSR_PMON_BOX_CTL,
+ .ops = &knl_uncore_imc_ops,
+ .format_group = &snbep_uncore_format_group,
+};
+
+static struct event_constraint knl_uncore_m2pcie_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type knl_uncore_m2pcie = {
+ .name = "m2pcie",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .constraints = knl_uncore_m2pcie_constraints,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct attribute *knl_uncore_irp_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_qor.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ NULL,
+};
+
+static const struct attribute_group knl_uncore_irp_format_group = {
+ .name = "format",
+ .attrs = knl_uncore_irp_formats_attr,
+};
+
+static struct intel_uncore_type knl_uncore_irp = {
+ .name = "irp",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SNBEP_PCI_PMON_CTR0,
+ .event_ctl = SNBEP_PCI_PMON_CTL0,
+ .event_mask = KNL_IRP_PCI_PMON_RAW_EVENT_MASK,
+ .box_ctl = KNL_IRP_PCI_PMON_BOX_CTL,
+ .ops = &snbep_uncore_pci_ops,
+ .format_group = &knl_uncore_irp_format_group,
+};
+
+enum {
+ KNL_PCI_UNCORE_MC_UCLK,
+ KNL_PCI_UNCORE_MC_DCLK,
+ KNL_PCI_UNCORE_EDC_UCLK,
+ KNL_PCI_UNCORE_EDC_ECLK,
+ KNL_PCI_UNCORE_M2PCIE,
+ KNL_PCI_UNCORE_IRP,
+};
+
+static struct intel_uncore_type *knl_pci_uncores[] = {
+ [KNL_PCI_UNCORE_MC_UCLK] = &knl_uncore_imc_uclk,
+ [KNL_PCI_UNCORE_MC_DCLK] = &knl_uncore_imc_dclk,
+ [KNL_PCI_UNCORE_EDC_UCLK] = &knl_uncore_edc_uclk,
+ [KNL_PCI_UNCORE_EDC_ECLK] = &knl_uncore_edc_eclk,
+ [KNL_PCI_UNCORE_M2PCIE] = &knl_uncore_m2pcie,
+ [KNL_PCI_UNCORE_IRP] = &knl_uncore_irp,
+ NULL,
+};
+
+/*
+ * KNL uses a common PCI device ID for multiple instances of an Uncore PMU
+ * device type. Prior to KNL, each instance of a PMU device type had a unique
+ * device ID.
+ *
+ * PCI Device ID Uncore PMU Devices
+ * ----------------------------------
+ * 0x7841 MC0 UClk, MC1 UClk
+ * 0x7843 MC0 DClk CH 0, MC0 DClk CH 1, MC0 DClk CH 2,
+ * MC1 DClk CH 0, MC1 DClk CH 1, MC1 DClk CH 2
+ * 0x7833 EDC0 UClk, EDC1 UClk, EDC2 UClk, EDC3 UClk,
+ * EDC4 UClk, EDC5 UClk, EDC6 UClk, EDC7 UClk
+ * 0x7835 EDC0 EClk, EDC1 EClk, EDC2 EClk, EDC3 EClk,
+ * EDC4 EClk, EDC5 EClk, EDC6 EClk, EDC7 EClk
+ * 0x7817 M2PCIe
+ * 0x7814 IRP
+ */
+
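+/*
+ * Since the device ID is shared, the table below distinguishes the instances
+ * by PCI device/function number via UNCORE_PCI_DEV_FULL_DATA().
+ */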
+static const struct pci_device_id knl_uncore_pci_ids[] = {
+ { /* MC0 UClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7841),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(10, 0, KNL_PCI_UNCORE_MC_UCLK, 0),
+ },
+ { /* MC1 UClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7841),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(11, 0, KNL_PCI_UNCORE_MC_UCLK, 1),
+ },
+ { /* MC0 DClk CH 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(8, 2, KNL_PCI_UNCORE_MC_DCLK, 0),
+ },
+ { /* MC0 DClk CH 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(8, 3, KNL_PCI_UNCORE_MC_DCLK, 1),
+ },
+ { /* MC0 DClk CH 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(8, 4, KNL_PCI_UNCORE_MC_DCLK, 2),
+ },
+ { /* MC1 DClk CH 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(9, 2, KNL_PCI_UNCORE_MC_DCLK, 3),
+ },
+ { /* MC1 DClk CH 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(9, 3, KNL_PCI_UNCORE_MC_DCLK, 4),
+ },
+ { /* MC1 DClk CH 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(9, 4, KNL_PCI_UNCORE_MC_DCLK, 5),
+ },
+ { /* EDC0 UClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(15, 0, KNL_PCI_UNCORE_EDC_UCLK, 0),
+ },
+ { /* EDC1 UClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(16, 0, KNL_PCI_UNCORE_EDC_UCLK, 1),
+ },
+ { /* EDC2 UClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(17, 0, KNL_PCI_UNCORE_EDC_UCLK, 2),
+ },
+ { /* EDC3 UClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 0, KNL_PCI_UNCORE_EDC_UCLK, 3),
+ },
+ { /* EDC4 UClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(19, 0, KNL_PCI_UNCORE_EDC_UCLK, 4),
+ },
+ { /* EDC5 UClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(20, 0, KNL_PCI_UNCORE_EDC_UCLK, 5),
+ },
+ { /* EDC6 UClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(21, 0, KNL_PCI_UNCORE_EDC_UCLK, 6),
+ },
+ { /* EDC7 UClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(22, 0, KNL_PCI_UNCORE_EDC_UCLK, 7),
+ },
+ { /* EDC0 EClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(24, 2, KNL_PCI_UNCORE_EDC_ECLK, 0),
+ },
+ { /* EDC1 EClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(25, 2, KNL_PCI_UNCORE_EDC_ECLK, 1),
+ },
+ { /* EDC2 EClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(26, 2, KNL_PCI_UNCORE_EDC_ECLK, 2),
+ },
+ { /* EDC3 EClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(27, 2, KNL_PCI_UNCORE_EDC_ECLK, 3),
+ },
+ { /* EDC4 EClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(28, 2, KNL_PCI_UNCORE_EDC_ECLK, 4),
+ },
+ { /* EDC5 EClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(29, 2, KNL_PCI_UNCORE_EDC_ECLK, 5),
+ },
+ { /* EDC6 EClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(30, 2, KNL_PCI_UNCORE_EDC_ECLK, 6),
+ },
+ { /* EDC7 EClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(31, 2, KNL_PCI_UNCORE_EDC_ECLK, 7),
+ },
+ { /* M2PCIe */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7817),
+ .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_M2PCIE, 0),
+ },
+ { /* IRP */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7814),
+ .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_IRP, 0),
+ },
+ { /* end: all zeroes */ }
+};
+
+static struct pci_driver knl_uncore_pci_driver = {
+ .name = "knl_uncore",
+ .id_table = knl_uncore_pci_ids,
+};
+
+int knl_uncore_pci_init(void)
+{
+ int ret;
+
+ /* All KNL PCI based PMON units are on the same PCI bus except IRP */
+ ret = snb_pci2phy_map_init(0x7814); /* IRP */
+ if (ret)
+ return ret;
+ ret = snb_pci2phy_map_init(0x7817); /* M2PCIe */
+ if (ret)
+ return ret;
+ uncore_pci_uncores = knl_pci_uncores;
+ uncore_pci_driver = &knl_uncore_pci_driver;
+ return 0;
+}
+
+/* end of KNL uncore support */
+
+/* Haswell-EP uncore support */
+static struct attribute *hswep_uncore_ubox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh5.attr,
+ &format_attr_filter_tid2.attr,
+ &format_attr_filter_cid.attr,
+ NULL,
+};
+
+static const struct attribute_group hswep_uncore_ubox_format_group = {
+ .name = "format",
+ .attrs = hswep_uncore_ubox_formats_attr,
+};
+
+static int hswep_ubox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+
+ reg1->reg = HSWEP_U_MSR_PMON_FILTER;
+ reg1->config = event->attr.config1 & HSWEP_U_MSR_PMON_BOX_FILTER_MASK;
+ reg1->idx = 0;
+ return 0;
+}
+
+static struct intel_uncore_ops hswep_uncore_ubox_ops = {
+ SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+ .hw_config = hswep_ubox_hw_config,
+ .get_constraint = uncore_get_constraint,
+ .put_constraint = uncore_put_constraint,
+};
+
+static struct intel_uncore_type hswep_uncore_ubox = {
+ .name = "ubox",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .perf_ctr_bits = 44,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = HSWEP_U_MSR_PMON_CTR0,
+ .event_ctl = HSWEP_U_MSR_PMON_CTL0,
+ .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR,
+ .fixed_ctl = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL,
+ .num_shared_regs = 1,
+ .ops = &hswep_uncore_ubox_ops,
+ .format_group = &hswep_uncore_ubox_format_group,
+};
+
+static struct attribute *hswep_uncore_cbox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_tid_en.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_filter_tid3.attr,
+ &format_attr_filter_link2.attr,
+ &format_attr_filter_state3.attr,
+ &format_attr_filter_nid2.attr,
+ &format_attr_filter_opc2.attr,
+ &format_attr_filter_nc.attr,
+ &format_attr_filter_c6.attr,
+ &format_attr_filter_isoc.attr,
+ NULL,
+};
+
+static const struct attribute_group hswep_uncore_cbox_format_group = {
+ .name = "format",
+ .attrs = hswep_uncore_cbox_formats_attr,
+};
+
+static struct event_constraint hswep_uncore_cbox_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x01, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x09, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x3b, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x3e, 0x1),
+ EVENT_CONSTRAINT_END
+};
+
+static struct extra_reg hswep_uncore_cbox_extra_regs[] = {
+ SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
+ SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2134, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4028, 0x40ff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4032, 0x40ff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4029, 0x40ff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4033, 0x40ff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x402A, 0x40ff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x12),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0x18),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0x18),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2335, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x8135, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x8336, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x8136, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x5036, 0xffff, 0x8),
+ EVENT_EXTRA_END
+};
+
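+/*
+ * Translate extra_reg index bits into the corresponding bits of the Cbox
+ * box-filter register mask.
+ */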
+static u64 hswep_cbox_filter_mask(int fields)
+{
+ u64 mask = 0;
+
+ if (fields & 0x1)
+ mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_TID;
+ if (fields & 0x2)
+ mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_LINK;
+ if (fields & 0x4)
+ mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_STATE;
+ if (fields & 0x8)
+ mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_NID;
+ if (fields & 0x10) {
+ mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_OPC;
+ mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_NC;
+ mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_C6;
+ mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_ISOC;
+ }
+ return mask;
+}
+
+static struct event_constraint *
+hswep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ return __snbep_cbox_get_constraint(box, event, hswep_cbox_filter_mask);
+}
+
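+/*
+ * OR together the filter index bits of every extra_reg entry matching the
+ * event, then stage the per-Cbox filter register and the config1 filter
+ * value in the event's extra_reg.
+ */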
+static int hswep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct extra_reg *er;
+ int idx = 0;
+
+ for (er = hswep_uncore_cbox_extra_regs; er->msr; er++) {
+ if (er->event != (event->hw.config & er->config_mask))
+ continue;
+ idx |= er->idx;
+ }
+
+ if (idx) {
+ reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 +
+ HSWEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
+ reg1->config = event->attr.config1 & hswep_cbox_filter_mask(idx);
+ reg1->idx = idx;
+ }
+ return 0;
+}
+
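+/* Write the 64-bit box filter as two 32-bit MSRs before enabling the counter. */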
+static void hswep_cbox_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+
+ if (reg1->idx != EXTRA_REG_NONE) {
+ u64 filter = uncore_shared_reg_config(box, 0);
+ wrmsrq(reg1->reg, filter & 0xffffffff);
+ wrmsrq(reg1->reg + 1, filter >> 32);
+ }
+
+ wrmsrq(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static struct intel_uncore_ops hswep_uncore_cbox_ops = {
+ .init_box = snbep_uncore_msr_init_box,
+ .disable_box = snbep_uncore_msr_disable_box,
+ .enable_box = snbep_uncore_msr_enable_box,
+ .disable_event = snbep_uncore_msr_disable_event,
+ .enable_event = hswep_cbox_enable_event,
+ .read_counter = uncore_msr_read_counter,
+ .hw_config = hswep_cbox_hw_config,
+ .get_constraint = hswep_cbox_get_constraint,
+ .put_constraint = snbep_cbox_put_constraint,
+};
+
+static struct intel_uncore_type hswep_uncore_cbox = {
+ .name = "cbox",
+ .num_counters = 4,
+ .num_boxes = 18,
+ .perf_ctr_bits = 48,
+ .event_ctl = HSWEP_C0_MSR_PMON_CTL0,
+ .perf_ctr = HSWEP_C0_MSR_PMON_CTR0,
+ .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = HSWEP_C0_MSR_PMON_BOX_CTL,
+ .msr_offset = HSWEP_CBO_MSR_OFFSET,
+ .num_shared_regs = 1,
+ .constraints = hswep_uncore_cbox_constraints,
+ .ops = &hswep_uncore_cbox_ops,
+ .format_group = &hswep_uncore_cbox_format_group,
+};
+
+/*
+ * Write SBOX Initialization register bit by bit to avoid spurious #GPs
+ */
+static void hswep_uncore_sbox_msr_init_box(struct intel_uncore_box *box)
+{
+ unsigned msr = uncore_msr_box_ctl(box);
+
+ if (msr) {
+ u64 init = SNBEP_PMON_BOX_CTL_INT;
+ u64 flags = 0;
+ int i;
+
+ for_each_set_bit(i, (unsigned long *)&init, 64) {
+ flags |= (1ULL << i);
+ wrmsrq(msr, flags);
+ }
+ }
+}
+
+static struct intel_uncore_ops hswep_uncore_sbox_msr_ops = {
+ __SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+ .init_box = hswep_uncore_sbox_msr_init_box
+};
+
+static struct attribute *hswep_uncore_sbox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_tid_en.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ NULL,
+};
+
+static const struct attribute_group hswep_uncore_sbox_format_group = {
+ .name = "format",
+ .attrs = hswep_uncore_sbox_formats_attr,
+};
+
+static struct intel_uncore_type hswep_uncore_sbox = {
+ .name = "sbox",
+ .num_counters = 4,
+ .num_boxes = 4,
+ .perf_ctr_bits = 44,
+ .event_ctl = HSWEP_S0_MSR_PMON_CTL0,
+ .perf_ctr = HSWEP_S0_MSR_PMON_CTR0,
+ .event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = HSWEP_S0_MSR_PMON_BOX_CTL,
+ .msr_offset = HSWEP_SBOX_MSR_OFFSET,
+ .ops = &hswep_uncore_sbox_msr_ops,
+ .format_group = &hswep_uncore_sbox_format_group,
+};
+
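+/* PCU frequency-band events 0xb-0xe take their band cutoff filter from config1. */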
+static int hswep_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK;
+
+ if (ev_sel >= 0xb && ev_sel <= 0xe) {
+ reg1->reg = HSWEP_PCU_MSR_PMON_BOX_FILTER;
+ reg1->idx = ev_sel - 0xb;
+ reg1->config = event->attr.config1 & (0xff << reg1->idx);
+ }
+ return 0;
+}
+
+static struct intel_uncore_ops hswep_uncore_pcu_ops = {
+ SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+ .hw_config = hswep_pcu_hw_config,
+ .get_constraint = snbep_pcu_get_constraint,
+ .put_constraint = snbep_pcu_put_constraint,
+};
+
+static struct intel_uncore_type hswep_uncore_pcu = {
+ .name = "pcu",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .perf_ctr = HSWEP_PCU_MSR_PMON_CTR0,
+ .event_ctl = HSWEP_PCU_MSR_PMON_CTL0,
+ .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = HSWEP_PCU_MSR_PMON_BOX_CTL,
+ .num_shared_regs = 1,
+ .ops = &hswep_uncore_pcu_ops,
+ .format_group = &snbep_uncore_pcu_format_group,
+};
+
+static struct intel_uncore_type *hswep_msr_uncores[] = {
+ &hswep_uncore_ubox,
+ &hswep_uncore_cbox,
+ &hswep_uncore_sbox,
+ &hswep_uncore_pcu,
+ NULL,
+};
+
+#define HSWEP_PCU_DID 0x2fc0
+#define HSWEP_PCU_CAPID4_OFFSET 0x94
+#define hswep_get_chop(_cap) (((_cap) >> 6) & 0x3)
+
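+/*
+ * Read the PCU CAPID4 register; a zero "chop" field identifies parts that
+ * expose fewer SBOXes.
+ */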
+static bool hswep_has_limit_sbox(unsigned int device)
+{
+ struct pci_dev *dev = pci_get_device(PCI_VENDOR_ID_INTEL, device, NULL);
+ u32 capid4;
+
+ if (!dev)
+ return false;
+
+ pci_read_config_dword(dev, HSWEP_PCU_CAPID4_OFFSET, &capid4);
+ pci_dev_put(dev);
+ if (!hswep_get_chop(capid4))
+ return true;
+
+ return false;
+}
+
+void hswep_uncore_cpu_init(void)
+{
+ if (hswep_uncore_cbox.num_boxes > topology_num_cores_per_package())
+ hswep_uncore_cbox.num_boxes = topology_num_cores_per_package();
+
+ /* Detect 6-8 core systems with only two SBOXes */
+ if (hswep_has_limit_sbox(HSWEP_PCU_DID))
+ hswep_uncore_sbox.num_boxes = 2;
+
+ uncore_msr_uncores = hswep_msr_uncores;
+}
+
+static struct intel_uncore_type hswep_uncore_ha = {
+ .name = "ha",
+ .num_counters = 4,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct uncore_event_desc hswep_uncore_imc_events[] = {
+ INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x00,umask=0x00"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"),
+ { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_type hswep_uncore_imc = {
+ .name = "imc",
+ .num_counters = 4,
+ .num_boxes = 8,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
+ .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
+ .event_descs = hswep_uncore_imc_events,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static unsigned hswep_uncore_irp_ctrs[] = {0xa0, 0xa8, 0xb0, 0xb8};
+
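+/* IRP counters live in PCI config space; read the 64-bit count as two 32-bit halves. */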
+static u64 hswep_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+ u64 count = 0;
+
+ pci_read_config_dword(pdev, hswep_uncore_irp_ctrs[hwc->idx], (u32 *)&count);
+ pci_read_config_dword(pdev, hswep_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1);
+
+ return count;
+}
+
+static struct intel_uncore_ops hswep_uncore_irp_ops = {
+ .init_box = snbep_uncore_pci_init_box,
+ .disable_box = snbep_uncore_pci_disable_box,
+ .enable_box = snbep_uncore_pci_enable_box,
+ .disable_event = ivbep_uncore_irp_disable_event,
+ .enable_event = ivbep_uncore_irp_enable_event,
+ .read_counter = hswep_uncore_irp_read_counter,
+};
+
+static struct intel_uncore_type hswep_uncore_irp = {
+ .name = "irp",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .ops = &hswep_uncore_irp_ops,
+ .format_group = &snbep_uncore_format_group,
+};
+
+static struct intel_uncore_type hswep_uncore_qpi = {
+ .name = "qpi",
+ .num_counters = 4,
+ .num_boxes = 3,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SNBEP_PCI_PMON_CTR0,
+ .event_ctl = SNBEP_PCI_PMON_CTL0,
+ .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .num_shared_regs = 1,
+ .ops = &snbep_uncore_qpi_ops,
+ .format_group = &snbep_uncore_qpi_format_group,
+};
+
+static struct event_constraint hswep_uncore_r2pcie_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x24, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x25, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x27, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2a, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x2b, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x35, 0x3),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type hswep_uncore_r2pcie = {
+ .name = "r2pcie",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .constraints = hswep_uncore_r2pcie_constraints,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct event_constraint hswep_uncore_r3qpi_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x01, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x07, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x08, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x09, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x0a, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x0e, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x14, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x15, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x1f, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2e, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2f, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type hswep_uncore_r3qpi = {
+ .name = "r3qpi",
+ .num_counters = 3,
+ .num_boxes = 3,
+ .perf_ctr_bits = 44,
+ .constraints = hswep_uncore_r3qpi_constraints,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+enum {
+ HSWEP_PCI_UNCORE_HA,
+ HSWEP_PCI_UNCORE_IMC,
+ HSWEP_PCI_UNCORE_IRP,
+ HSWEP_PCI_UNCORE_QPI,
+ HSWEP_PCI_UNCORE_R2PCIE,
+ HSWEP_PCI_UNCORE_R3QPI,
+};
+
+static struct intel_uncore_type *hswep_pci_uncores[] = {
+ [HSWEP_PCI_UNCORE_HA] = &hswep_uncore_ha,
+ [HSWEP_PCI_UNCORE_IMC] = &hswep_uncore_imc,
+ [HSWEP_PCI_UNCORE_IRP] = &hswep_uncore_irp,
+ [HSWEP_PCI_UNCORE_QPI] = &hswep_uncore_qpi,
+ [HSWEP_PCI_UNCORE_R2PCIE] = &hswep_uncore_r2pcie,
+ [HSWEP_PCI_UNCORE_R3QPI] = &hswep_uncore_r3qpi,
+ NULL,
+};
+
+static const struct pci_device_id hswep_uncore_pci_ids[] = {
+ { /* Home Agent 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f30),
+ .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_HA, 0),
+ },
+ { /* Home Agent 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f38),
+ .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_HA, 1),
+ },
+ { /* MC0 Channel 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb0),
+ .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 0),
+ },
+ { /* MC0 Channel 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb1),
+ .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 1),
+ },
+ { /* MC0 Channel 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb4),
+ .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 2),
+ },
+ { /* MC0 Channel 3 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb5),
+ .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 3),
+ },
+ { /* MC1 Channel 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd0),
+ .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 4),
+ },
+ { /* MC1 Channel 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd1),
+ .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 5),
+ },
+ { /* MC1 Channel 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd4),
+ .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 6),
+ },
+ { /* MC1 Channel 3 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd5),
+ .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 7),
+ },
+ { /* IRP */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f39),
+ .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IRP, 0),
+ },
+ { /* QPI0 Port 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f32),
+ .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 0),
+ },
+ { /* QPI0 Port 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f33),
+ .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 1),
+ },
+ { /* QPI1 Port 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f3a),
+ .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 2),
+ },
+ { /* R2PCIe */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f34),
+ .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R2PCIE, 0),
+ },
+ { /* R3QPI0 Link 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f36),
+ .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 0),
+ },
+ { /* R3QPI0 Link 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f37),
+ .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 1),
+ },
+ { /* R3QPI1 Link 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f3e),
+ .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 2),
+ },
+ { /* QPI Port 0 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f86),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ SNBEP_PCI_QPI_PORT0_FILTER),
+ },
+ { /* QPI Port 1 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f96),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ SNBEP_PCI_QPI_PORT1_FILTER),
+ },
+ { /* end: all zeroes */ }
+};
+
+static struct pci_driver hswep_uncore_pci_driver = {
+ .name = "hswep_uncore",
+ .id_table = hswep_uncore_pci_ids,
+};
+
+int hswep_uncore_pci_init(void)
+{
+ int ret = snbep_pci2phy_map_init(0x2f1e, SNBEP_CPUNODEID, SNBEP_GIDNIDMAP, true);
+ if (ret)
+ return ret;
+ uncore_pci_uncores = hswep_pci_uncores;
+ uncore_pci_driver = &hswep_uncore_pci_driver;
+ return 0;
+}
+/* end of Haswell-EP uncore support */
+
+/* BDX uncore support */
+
+static struct intel_uncore_type bdx_uncore_ubox = {
+ .name = "ubox",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = HSWEP_U_MSR_PMON_CTR0,
+ .event_ctl = HSWEP_U_MSR_PMON_CTL0,
+ .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR,
+ .fixed_ctl = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL,
+ .num_shared_regs = 1,
+ .ops = &ivbep_uncore_msr_ops,
+ .format_group = &ivbep_uncore_ubox_format_group,
+};
+
+static struct event_constraint bdx_uncore_cbox_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x09, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x3e, 0x1),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type bdx_uncore_cbox = {
+ .name = "cbox",
+ .num_counters = 4,
+ .num_boxes = 24,
+ .perf_ctr_bits = 48,
+ .event_ctl = HSWEP_C0_MSR_PMON_CTL0,
+ .perf_ctr = HSWEP_C0_MSR_PMON_CTR0,
+ .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = HSWEP_C0_MSR_PMON_BOX_CTL,
+ .msr_offset = HSWEP_CBO_MSR_OFFSET,
+ .num_shared_regs = 1,
+ .constraints = bdx_uncore_cbox_constraints,
+ .ops = &hswep_uncore_cbox_ops,
+ .format_group = &hswep_uncore_cbox_format_group,
+};
+
+static struct intel_uncore_type bdx_uncore_sbox = {
+ .name = "sbox",
+ .num_counters = 4,
+ .num_boxes = 4,
+ .perf_ctr_bits = 48,
+ .event_ctl = HSWEP_S0_MSR_PMON_CTL0,
+ .perf_ctr = HSWEP_S0_MSR_PMON_CTR0,
+ .event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = HSWEP_S0_MSR_PMON_BOX_CTL,
+ .msr_offset = HSWEP_SBOX_MSR_OFFSET,
+ .ops = &hswep_uncore_sbox_msr_ops,
+ .format_group = &hswep_uncore_sbox_format_group,
+};
+
+#define BDX_MSR_UNCORE_SBOX 3
+
+static struct intel_uncore_type *bdx_msr_uncores[] = {
+ &bdx_uncore_ubox,
+ &bdx_uncore_cbox,
+ &hswep_uncore_pcu,
+ &bdx_uncore_sbox,
+ NULL,
+};
+
+/* Bit 7 'Use Occupancy' is not available for counter 0 on BDX */
+static struct event_constraint bdx_uncore_pcu_constraints[] = {
+ EVENT_CONSTRAINT(0x80, 0xe, 0x80),
+ EVENT_CONSTRAINT_END
+};
+
+#define BDX_PCU_DID 0x6fc0
+
+void bdx_uncore_cpu_init(void)
+{
+ if (bdx_uncore_cbox.num_boxes > topology_num_cores_per_package())
+ bdx_uncore_cbox.num_boxes = topology_num_cores_per_package();
+ uncore_msr_uncores = bdx_msr_uncores;
+
+ /* Detect systems with no SBOXes */
+ if (boot_cpu_data.x86_vfm == INTEL_BROADWELL_D || hswep_has_limit_sbox(BDX_PCU_DID))
+ uncore_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL;
+
+ hswep_uncore_pcu.constraints = bdx_uncore_pcu_constraints;
+}
+
+static struct intel_uncore_type bdx_uncore_ha = {
+ .name = "ha",
+ .num_counters = 4,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type bdx_uncore_imc = {
+ .name = "imc",
+ .num_counters = 4,
+ .num_boxes = 8,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
+ .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
+ .event_descs = hswep_uncore_imc_events,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type bdx_uncore_irp = {
+ .name = "irp",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .ops = &hswep_uncore_irp_ops,
+ .format_group = &snbep_uncore_format_group,
+};
+
+static struct intel_uncore_type bdx_uncore_qpi = {
+ .name = "qpi",
+ .num_counters = 4,
+ .num_boxes = 3,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SNBEP_PCI_PMON_CTR0,
+ .event_ctl = SNBEP_PCI_PMON_CTL0,
+ .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .num_shared_regs = 1,
+ .ops = &snbep_uncore_qpi_ops,
+ .format_group = &snbep_uncore_qpi_format_group,
+};
+
+static struct event_constraint bdx_uncore_r2pcie_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x25, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type bdx_uncore_r2pcie = {
+ .name = "r2pcie",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .constraints = bdx_uncore_r2pcie_constraints,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct event_constraint bdx_uncore_r3qpi_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x01, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x07, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x08, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x09, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x0a, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x0e, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x14, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x15, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x1f, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2e, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2f, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type bdx_uncore_r3qpi = {
+ .name = "r3qpi",
+ .num_counters = 3,
+ .num_boxes = 3,
+ .perf_ctr_bits = 48,
+ .constraints = bdx_uncore_r3qpi_constraints,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+enum {
+ BDX_PCI_UNCORE_HA,
+ BDX_PCI_UNCORE_IMC,
+ BDX_PCI_UNCORE_IRP,
+ BDX_PCI_UNCORE_QPI,
+ BDX_PCI_UNCORE_R2PCIE,
+ BDX_PCI_UNCORE_R3QPI,
+};
+
+static struct intel_uncore_type *bdx_pci_uncores[] = {
+ [BDX_PCI_UNCORE_HA] = &bdx_uncore_ha,
+ [BDX_PCI_UNCORE_IMC] = &bdx_uncore_imc,
+ [BDX_PCI_UNCORE_IRP] = &bdx_uncore_irp,
+ [BDX_PCI_UNCORE_QPI] = &bdx_uncore_qpi,
+ [BDX_PCI_UNCORE_R2PCIE] = &bdx_uncore_r2pcie,
+ [BDX_PCI_UNCORE_R3QPI] = &bdx_uncore_r3qpi,
+ NULL,
+};
+
+static const struct pci_device_id bdx_uncore_pci_ids[] = {
+ { /* Home Agent 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f30),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 0),
+ },
+ { /* Home Agent 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f38),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 1),
+ },
+ { /* MC0 Channel 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb0),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 0),
+ },
+ { /* MC0 Channel 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb1),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 1),
+ },
+ { /* MC0 Channel 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb4),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 2),
+ },
+ { /* MC0 Channel 3 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb5),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 3),
+ },
+ { /* MC1 Channel 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd0),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 4),
+ },
+ { /* MC1 Channel 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd1),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 5),
+ },
+ { /* MC1 Channel 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd4),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 6),
+ },
+ { /* MC1 Channel 3 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd5),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 7),
+ },
+ { /* IRP */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f39),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IRP, 0),
+ },
+ { /* QPI0 Port 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f32),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 0),
+ },
+ { /* QPI0 Port 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f33),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 1),
+ },
+ { /* QPI1 Port 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f3a),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 2),
+ },
+ { /* R2PCIe */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f34),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R2PCIE, 0),
+ },
+ { /* R3QPI0 Link 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f36),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 0),
+ },
+ { /* R3QPI0 Link 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f37),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 1),
+ },
+ { /* R3QPI1 Link 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f3e),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 2),
+ },
+ { /* QPI Port 0 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f86),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ SNBEP_PCI_QPI_PORT0_FILTER),
+ },
+ { /* QPI Port 1 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f96),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ SNBEP_PCI_QPI_PORT1_FILTER),
+ },
+ { /* QPI Port 2 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f46),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ BDX_PCI_QPI_PORT2_FILTER),
+ },
+ { /* end: all zeroes */ }
+};
+
+static struct pci_driver bdx_uncore_pci_driver = {
+ .name = "bdx_uncore",
+ .id_table = bdx_uncore_pci_ids,
+};
+
+int bdx_uncore_pci_init(void)
+{
+ int ret = snbep_pci2phy_map_init(0x6f1e, SNBEP_CPUNODEID, SNBEP_GIDNIDMAP, true);
+
+ if (ret)
+ return ret;
+ uncore_pci_uncores = bdx_pci_uncores;
+ uncore_pci_driver = &bdx_uncore_pci_driver;
+ return 0;
+}
+
+/* end of BDX uncore support */
+
+/* SKX uncore support */
+
+static struct intel_uncore_type skx_uncore_ubox = {
+ .name = "ubox",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = HSWEP_U_MSR_PMON_CTR0,
+ .event_ctl = HSWEP_U_MSR_PMON_CTL0,
+ .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR,
+ .fixed_ctl = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL,
+ .ops = &ivbep_uncore_msr_ops,
+ .format_group = &ivbep_uncore_ubox_format_group,
+};
+
+static struct attribute *skx_uncore_cha_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_tid_en.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_filter_tid4.attr,
+ &format_attr_filter_state5.attr,
+ &format_attr_filter_rem.attr,
+ &format_attr_filter_loc.attr,
+ &format_attr_filter_nm.attr,
+ &format_attr_filter_all_op.attr,
+ &format_attr_filter_not_nm.attr,
+ &format_attr_filter_opc_0.attr,
+ &format_attr_filter_opc_1.attr,
+ &format_attr_filter_nc.attr,
+ &format_attr_filter_isoc.attr,
+ NULL,
+};
+
+static const struct attribute_group skx_uncore_chabox_format_group = {
+ .name = "format",
+ .attrs = skx_uncore_cha_formats_attr,
+};
+
+static struct event_constraint skx_uncore_chabox_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+ EVENT_CONSTRAINT_END
+};
+
+static struct extra_reg skx_uncore_cha_extra_regs[] = {
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x3134, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x9134, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x35, 0xff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x36, 0xff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x38, 0xff, 0x3),
+ EVENT_EXTRA_END
+};
+
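+/*
+ * Translate extra_reg index bits into the corresponding bits of the CHA
+ * box-filter register mask.
+ */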
+static u64 skx_cha_filter_mask(int fields)
+{
+ u64 mask = 0;
+
+ if (fields & 0x1)
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_TID;
+ if (fields & 0x2)
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_LINK;
+ if (fields & 0x4)
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_STATE;
+ if (fields & 0x8) {
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_REM;
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_LOC;
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_ALL_OPC;
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_NM;
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_NOT_NM;
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_OPC0;
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_OPC1;
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_NC;
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_ISOC;
+ }
+ return mask;
+}
+
+static struct event_constraint *
+skx_cha_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ return __snbep_cbox_get_constraint(box, event, skx_cha_filter_mask);
+}
+
+static int skx_cha_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct extra_reg *er;
+ int idx = 0;
+
+ /* Any of the CHA events may be filtered by Thread/Core-ID. */
+ if (event->hw.config & SNBEP_CBO_PMON_CTL_TID_EN)
+ idx = SKX_CHA_MSR_PMON_BOX_FILTER_TID;
+
+ for (er = skx_uncore_cha_extra_regs; er->msr; er++) {
+ if (er->event != (event->hw.config & er->config_mask))
+ continue;
+ idx |= er->idx;
+ }
+
+ if (idx) {
+ reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 +
+ HSWEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
+ reg1->config = event->attr.config1 & skx_cha_filter_mask(idx);
+ reg1->idx = idx;
+ }
+ return 0;
+}
+
+static struct intel_uncore_ops skx_uncore_chabox_ops = {
+ /* There is no frz_en for chabox ctl */
+ .init_box = ivbep_uncore_msr_init_box,
+ .disable_box = snbep_uncore_msr_disable_box,
+ .enable_box = snbep_uncore_msr_enable_box,
+ .disable_event = snbep_uncore_msr_disable_event,
+ .enable_event = hswep_cbox_enable_event,
+ .read_counter = uncore_msr_read_counter,
+ .hw_config = skx_cha_hw_config,
+ .get_constraint = skx_cha_get_constraint,
+ .put_constraint = snbep_cbox_put_constraint,
+};
+
+static struct intel_uncore_type skx_uncore_chabox = {
+ .name = "cha",
+ .num_counters = 4,
+ .perf_ctr_bits = 48,
+ .event_ctl = HSWEP_C0_MSR_PMON_CTL0,
+ .perf_ctr = HSWEP_C0_MSR_PMON_CTR0,
+ .event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = HSWEP_C0_MSR_PMON_BOX_CTL,
+ .msr_offset = HSWEP_CBO_MSR_OFFSET,
+ .num_shared_regs = 1,
+ .constraints = skx_uncore_chabox_constraints,
+ .ops = &skx_uncore_chabox_ops,
+ .format_group = &skx_uncore_chabox_format_group,
+};
+
+static struct attribute *skx_uncore_iio_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh9.attr,
+ &format_attr_ch_mask.attr,
+ &format_attr_fc_mask.attr,
+ NULL,
+};
+
+static const struct attribute_group skx_uncore_iio_format_group = {
+ .name = "format",
+ .attrs = skx_uncore_iio_formats_attr,
+};
+
+static struct event_constraint skx_uncore_iio_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x83, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x88, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0x95, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0xc0, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0xc5, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0xd4, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0xd5, 0xc),
+ EVENT_CONSTRAINT_END
+};
+
+static void skx_iio_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ wrmsrq(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static struct intel_uncore_ops skx_uncore_iio_ops = {
+ .init_box = ivbep_uncore_msr_init_box,
+ .disable_box = snbep_uncore_msr_disable_box,
+ .enable_box = snbep_uncore_msr_enable_box,
+ .disable_event = snbep_uncore_msr_disable_event,
+ .enable_event = skx_iio_enable_event,
+ .read_counter = uncore_msr_read_counter,
+};
+
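+/* Look up the topology entry that belongs to this PMU instance on the given die. */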
+static struct intel_uncore_topology *pmu_topology(struct intel_uncore_pmu *pmu, int die)
+{
+ int idx;
+
+ for (idx = 0; idx < pmu->type->num_boxes; idx++) {
+ if (pmu->type->topology[die][idx].pmu_idx == pmu->pmu_idx)
+ return &pmu->type->topology[die][idx];
+ }
+
+ return NULL;
+}
+
+static umode_t
+pmu_iio_mapping_visible(struct kobject *kobj, struct attribute *attr,
+ int die, int zero_bus_pmu)
+{
+ struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(kobj_to_dev(kobj));
+ struct intel_uncore_topology *pmut = pmu_topology(pmu, die);
+
+ return (pmut && !pmut->iio->pci_bus_no && pmu->pmu_idx != zero_bus_pmu) ? 0 : attr->mode;
+}
+
+static umode_t
+skx_iio_mapping_visible(struct kobject *kobj, struct attribute *attr, int die)
+{
+ /* Root bus 0x00 is valid only for pmu_idx = 0. */
+ return pmu_iio_mapping_visible(kobj, attr, die, 0);
+}
+
+static ssize_t skx_iio_mapping_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(dev);
+ struct dev_ext_attribute *ea = to_dev_ext_attribute(attr);
+ long die = (long)ea->var;
+ struct intel_uncore_topology *pmut = pmu_topology(pmu, die);
+
+ return sprintf(buf, "%04x:%02x\n", pmut ? pmut->iio->segment : 0,
+ pmut ? pmut->iio->pci_bus_no : 0);
+}
+
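+/* Read the CPU bus-number MSR on @cpu; fail unless the valid bit is set. */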
+static int skx_msr_cpu_bus_read(int cpu, u64 *topology)
+{
+ u64 msr_value;
+
+ if (rdmsrq_on_cpu(cpu, SKX_MSR_CPU_BUS_NUMBER, &msr_value) ||
+ !(msr_value & SKX_MSR_CPU_BUS_VALID_BIT))
+ return -ENXIO;
+
+ *topology = msr_value;
+
+ return 0;
+}
+
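+/* Return an online CPU on the given logical die, or CPU 0 if none is found. */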
+static int die_to_cpu(int die)
+{
+ int res = 0, cpu, current_die;
+
+ /*
+ * Take cpus_read_lock() so that a CPU cannot go offline while we are
+ * walking cpu_online_mask.
+ */
+ cpus_read_lock();
+ for_each_online_cpu(cpu) {
+ current_die = topology_logical_die_id(cpu);
+ if (current_die == die) {
+ res = cpu;
+ break;
+ }
+ }
+ cpus_read_unlock();
+ return res;
+}
+
+enum {
+ IIO_TOPOLOGY_TYPE,
+ UPI_TOPOLOGY_TYPE,
+ TOPOLOGY_MAX
+};
+
+static const size_t topology_size[TOPOLOGY_MAX] = {
+ sizeof(*((struct intel_uncore_topology *)NULL)->iio),
+ sizeof(*((struct intel_uncore_topology *)NULL)->upi)
+};
+
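+/*
+ * Allocate per-die, per-box topology storage of the requested type; on
+ * failure everything allocated so far is freed again.
+ */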
+static int pmu_alloc_topology(struct intel_uncore_type *type, int topology_type)
+{
+ int die, idx;
+ struct intel_uncore_topology **topology;
+
+ if (!type->num_boxes)
+ return -EPERM;
+
+ topology = kcalloc(uncore_max_dies(), sizeof(*topology), GFP_KERNEL);
+ if (!topology)
+ goto err;
+
+ for (die = 0; die < uncore_max_dies(); die++) {
+ topology[die] = kcalloc(type->num_boxes, sizeof(**topology), GFP_KERNEL);
+ if (!topology[die])
+ goto clear;
+ for (idx = 0; idx < type->num_boxes; idx++) {
+ topology[die][idx].untyped = kcalloc(type->num_boxes,
+ topology_size[topology_type],
+ GFP_KERNEL);
+ if (!topology[die][idx].untyped)
+ goto clear;
+ }
+ }
+
+ type->topology = topology;
+
+ return 0;
+clear:
+ for (; die >= 0; die--) {
+ for (idx = 0; idx < type->num_boxes; idx++)
+ kfree(topology[die][idx].untyped);
+ kfree(topology[die]);
+ }
+ kfree(topology);
+err:
+ return -ENOMEM;
+}
+
+static void pmu_free_topology(struct intel_uncore_type *type)
+{
+ int die, idx;
+
+ if (type->topology) {
+ for (die = 0; die < uncore_max_dies(); die++) {
+ for (idx = 0; idx < type->num_boxes; idx++)
+ kfree(type->topology[die][idx].untyped);
+ kfree(type->topology[die]);
+ }
+ kfree(type->topology);
+ type->topology = NULL;
+ }
+}
+
+static int skx_pmu_get_topology(struct intel_uncore_type *type,
+ int (*topology_cb)(struct intel_uncore_type*, int, int, u64))
+{
+ int die, ret = -EPERM;
+ u64 cpu_bus_msr;
+
+ for (die = 0; die < uncore_max_dies(); die++) {
+ ret = skx_msr_cpu_bus_read(die_to_cpu(die), &cpu_bus_msr);
+ if (ret)
+ break;
+
+ ret = uncore_die_to_segment(die);
+ if (ret < 0)
+ break;
+
+ ret = topology_cb(type, ret, die, cpu_bus_msr);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+static int skx_iio_topology_cb(struct intel_uncore_type *type, int segment,
+ int die, u64 cpu_bus_msr)
+{
+ int idx;
+ struct intel_uncore_topology *t;
+
+ for (idx = 0; idx < type->num_boxes; idx++) {
+ t = &type->topology[die][idx];
+ t->pmu_idx = idx;
+ t->iio->segment = segment;
+ t->iio->pci_bus_no = (cpu_bus_msr >> (idx * BUS_NUM_STRIDE)) & 0xff;
+ }
+
+ return 0;
+}
+
+static int skx_iio_get_topology(struct intel_uncore_type *type)
+{
+ return skx_pmu_get_topology(type, skx_iio_topology_cb);
+}
+
+static struct attribute_group skx_iio_mapping_group = {
+ .is_visible = skx_iio_mapping_visible,
+};
+
+static const struct attribute_group *skx_iio_attr_update[] = {
+ &skx_iio_mapping_group,
+ NULL,
+};
+
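+/* Drop @ag from the attr_update list by shifting the remaining entries down. */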
+static void pmu_clear_mapping_attr(const struct attribute_group **groups,
+ struct attribute_group *ag)
+{
+ int i;
+
+ for (i = 0; groups[i]; i++) {
+ if (groups[i] == ag) {
+ for (i++; groups[i]; i++)
+ groups[i - 1] = groups[i];
+ groups[i - 1] = NULL;
+ break;
+ }
+ }
+}
+
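+/*
+ * Create one read-only "dieN" attribute per die showing the platform mapping
+ * for this PMU.  On any failure, free what was allocated and remove the
+ * mapping group from attr_update so it is never exposed.
+ */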
+static void
+pmu_set_mapping(struct intel_uncore_type *type, struct attribute_group *ag,
+ ssize_t (*show)(struct device*, struct device_attribute*, char*),
+ int topology_type)
+{
+ char buf[64];
+ int ret;
+ long die = -1;
+ struct attribute **attrs = NULL;
+ struct dev_ext_attribute *eas = NULL;
+
+ ret = pmu_alloc_topology(type, topology_type);
+ if (ret < 0)
+ goto clear_attr_update;
+
+ ret = type->get_topology(type);
+ if (ret < 0)
+ goto clear_topology;
+
+ /* One more for NULL. */
+ attrs = kcalloc((uncore_max_dies() + 1), sizeof(*attrs), GFP_KERNEL);
+ if (!attrs)
+ goto clear_topology;
+
+ eas = kcalloc(uncore_max_dies(), sizeof(*eas), GFP_KERNEL);
+ if (!eas)
+ goto clear_attrs;
+
+ for (die = 0; die < uncore_max_dies(); die++) {
+ snprintf(buf, sizeof(buf), "die%ld", die);
+ sysfs_attr_init(&eas[die].attr.attr);
+ eas[die].attr.attr.name = kstrdup(buf, GFP_KERNEL);
+ if (!eas[die].attr.attr.name)
+ goto err;
+ eas[die].attr.attr.mode = 0444;
+ eas[die].attr.show = show;
+ eas[die].attr.store = NULL;
+ eas[die].var = (void *)die;
+ attrs[die] = &eas[die].attr.attr;
+ }
+ ag->attrs = attrs;
+
+ return;
+err:
+ for (; die >= 0; die--)
+ kfree(eas[die].attr.attr.name);
+ kfree(eas);
+clear_attrs:
+ kfree(attrs);
+clear_topology:
+ pmu_free_topology(type);
+clear_attr_update:
+ pmu_clear_mapping_attr(type->attr_update, ag);
+}
+
+static void
+pmu_cleanup_mapping(struct intel_uncore_type *type, struct attribute_group *ag)
+{
+ struct attribute **attr = ag->attrs;
+
+ if (!attr)
+ return;
+
+ for (; *attr; attr++)
+ kfree((*attr)->name);
+ kfree(attr_to_ext_attr(*ag->attrs));
+ kfree(ag->attrs);
+ ag->attrs = NULL;
+ pmu_free_topology(type);
+}
+
+static void
+pmu_iio_set_mapping(struct intel_uncore_type *type, struct attribute_group *ag)
+{
+ pmu_set_mapping(type, ag, skx_iio_mapping_show, IIO_TOPOLOGY_TYPE);
+}
+
+static void skx_iio_set_mapping(struct intel_uncore_type *type)
+{
+ pmu_iio_set_mapping(type, &skx_iio_mapping_group);
+}
+
+static void skx_iio_cleanup_mapping(struct intel_uncore_type *type)
+{
+ pmu_cleanup_mapping(type, &skx_iio_mapping_group);
+}
+
+static struct intel_uncore_type skx_uncore_iio = {
+ .name = "iio",
+ .num_counters = 4,
+ .num_boxes = 6,
+ .perf_ctr_bits = 48,
+ .event_ctl = SKX_IIO0_MSR_PMON_CTL0,
+ .perf_ctr = SKX_IIO0_MSR_PMON_CTR0,
+ .event_mask = SKX_IIO_PMON_RAW_EVENT_MASK,
+ .event_mask_ext = SKX_IIO_PMON_RAW_EVENT_MASK_EXT,
+ .box_ctl = SKX_IIO0_MSR_PMON_BOX_CTL,
+ .msr_offset = SKX_IIO_MSR_OFFSET,
+ .constraints = skx_uncore_iio_constraints,
+ .ops = &skx_uncore_iio_ops,
+ .format_group = &skx_uncore_iio_format_group,
+ .attr_update = skx_iio_attr_update,
+ .get_topology = skx_iio_get_topology,
+ .set_mapping = skx_iio_set_mapping,
+ .cleanup_mapping = skx_iio_cleanup_mapping,
+};
+
+enum perf_uncore_iio_freerunning_type_id {
+ SKX_IIO_MSR_IOCLK = 0,
+ SKX_IIO_MSR_BW = 1,
+ SKX_IIO_MSR_UTIL = 2,
+
+ SKX_IIO_FREERUNNING_TYPE_MAX,
+};
+
+
+static struct freerunning_counters skx_iio_freerunning[] = {
+ [SKX_IIO_MSR_IOCLK] = { 0xa45, 0x1, 0x20, 1, 36 },
+ [SKX_IIO_MSR_BW] = { 0xb00, 0x1, 0x10, 8, 36 },
+ [SKX_IIO_MSR_UTIL] = { 0xb08, 0x1, 0x10, 8, 36 },
+};
+
+static struct uncore_event_desc skx_uncore_iio_freerunning_events[] = {
+ /* Free-Running IO CLOCKS Counter */
+ INTEL_UNCORE_EVENT_DESC(ioclk, "event=0xff,umask=0x10"),
+ /* Free-Running IIO BANDWIDTH Counters */
+ INTEL_UNCORE_EVENT_DESC(bw_in_port0, "event=0xff,umask=0x20"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port0.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port0.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port1, "event=0xff,umask=0x21"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port1.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port1.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port2, "event=0xff,umask=0x22"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port2.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port2.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port3, "event=0xff,umask=0x23"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port3.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port3.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port0, "event=0xff,umask=0x24"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port0.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port0.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port1, "event=0xff,umask=0x25"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port1.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port1.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port2, "event=0xff,umask=0x26"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port2.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port2.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port3, "event=0xff,umask=0x27"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port3.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port3.unit, "MiB"),
+ /* Free-running IIO UTILIZATION Counters */
+ INTEL_UNCORE_EVENT_DESC(util_in_port0, "event=0xff,umask=0x30"),
+ INTEL_UNCORE_EVENT_DESC(util_out_port0, "event=0xff,umask=0x31"),
+ INTEL_UNCORE_EVENT_DESC(util_in_port1, "event=0xff,umask=0x32"),
+ INTEL_UNCORE_EVENT_DESC(util_out_port1, "event=0xff,umask=0x33"),
+ INTEL_UNCORE_EVENT_DESC(util_in_port2, "event=0xff,umask=0x34"),
+ INTEL_UNCORE_EVENT_DESC(util_out_port2, "event=0xff,umask=0x35"),
+ INTEL_UNCORE_EVENT_DESC(util_in_port3, "event=0xff,umask=0x36"),
+ INTEL_UNCORE_EVENT_DESC(util_out_port3, "event=0xff,umask=0x37"),
+ { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_ops skx_uncore_iio_freerunning_ops = {
+ .read_counter = uncore_msr_read_counter,
+ .hw_config = uncore_freerunning_hw_config,
+};
+
+static struct attribute *skx_uncore_iio_freerunning_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ NULL,
+};
+
+static const struct attribute_group skx_uncore_iio_freerunning_format_group = {
+ .name = "format",
+ .attrs = skx_uncore_iio_freerunning_formats_attr,
+};
+
+static struct intel_uncore_type skx_uncore_iio_free_running = {
+ .name = "iio_free_running",
+ .num_counters = 17,
+ .num_boxes = 6,
+ .num_freerunning_types = SKX_IIO_FREERUNNING_TYPE_MAX,
+ .freerunning = skx_iio_freerunning,
+ .ops = &skx_uncore_iio_freerunning_ops,
+ .event_descs = skx_uncore_iio_freerunning_events,
+ .format_group = &skx_uncore_iio_freerunning_format_group,
+};
+
+static struct attribute *skx_uncore_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ NULL,
+};
+
+static const struct attribute_group skx_uncore_format_group = {
+ .name = "format",
+ .attrs = skx_uncore_formats_attr,
+};
+
+static struct intel_uncore_type skx_uncore_irp = {
+ .name = "irp",
+ .num_counters = 2,
+ .num_boxes = 6,
+ .perf_ctr_bits = 48,
+ .event_ctl = SKX_IRP0_MSR_PMON_CTL0,
+ .perf_ctr = SKX_IRP0_MSR_PMON_CTR0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SKX_IRP0_MSR_PMON_BOX_CTL,
+ .msr_offset = SKX_IRP_MSR_OFFSET,
+ .ops = &skx_uncore_iio_ops,
+ .format_group = &skx_uncore_format_group,
+};
+
+static struct attribute *skx_uncore_pcu_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_occ_invert.attr,
+ &format_attr_occ_edge_det.attr,
+ &format_attr_filter_band0.attr,
+ &format_attr_filter_band1.attr,
+ &format_attr_filter_band2.attr,
+ &format_attr_filter_band3.attr,
+ NULL,
+};
+
+static struct attribute_group skx_uncore_pcu_format_group = {
+ .name = "format",
+ .attrs = skx_uncore_pcu_formats_attr,
+};
+
+static struct intel_uncore_ops skx_uncore_pcu_ops = {
+ IVBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+ .hw_config = hswep_pcu_hw_config,
+ .get_constraint = snbep_pcu_get_constraint,
+ .put_constraint = snbep_pcu_put_constraint,
+};
+
+static struct intel_uncore_type skx_uncore_pcu = {
+ .name = "pcu",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .perf_ctr = HSWEP_PCU_MSR_PMON_CTR0,
+ .event_ctl = HSWEP_PCU_MSR_PMON_CTL0,
+ .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = HSWEP_PCU_MSR_PMON_BOX_CTL,
+ .num_shared_regs = 1,
+ .ops = &skx_uncore_pcu_ops,
+ .format_group = &skx_uncore_pcu_format_group,
+};
+
+static struct intel_uncore_type *skx_msr_uncores[] = {
+ &skx_uncore_ubox,
+ &skx_uncore_chabox,
+ &skx_uncore_iio,
+ &skx_uncore_iio_free_running,
+ &skx_uncore_irp,
+ &skx_uncore_pcu,
+ NULL,
+};
+
+/*
+ * To determine the number of CHAs, read bits 27:0 of the CAPID6 register,
+ * which is located at Device 30, Function 3, Offset 0x9C (PCI ID 0x2083).
+ */
+#define SKX_CAPID6 0x9c
+#define SKX_CHA_BIT_MASK GENMASK(27, 0)
+
+static int skx_count_chabox(void)
+{
+ struct pci_dev *dev = NULL;
+ u32 val = 0;
+
+ dev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x2083, dev);
+ if (!dev)
+ goto out;
+
+ pci_read_config_dword(dev, SKX_CAPID6, &val);
+ val &= SKX_CHA_BIT_MASK;
+out:
+ pci_dev_put(dev);
+ return hweight32(val);
+}
+
+void skx_uncore_cpu_init(void)
+{
+ skx_uncore_chabox.num_boxes = skx_count_chabox();
+ uncore_msr_uncores = skx_msr_uncores;
+}
+
+static struct intel_uncore_type skx_uncore_imc = {
+ .name = "imc",
+ .num_counters = 4,
+ .num_boxes = 6,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
+ .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
+ .event_descs = hswep_uncore_imc_events,
+ .perf_ctr = SNBEP_PCI_PMON_CTR0,
+ .event_ctl = SNBEP_PCI_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .ops = &ivbep_uncore_pci_ops,
+ .format_group = &skx_uncore_format_group,
+};
+
+static struct attribute *skx_upi_uncore_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask_ext.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ NULL,
+};
+
+static const struct attribute_group skx_upi_uncore_format_group = {
+ .name = "format",
+ .attrs = skx_upi_uncore_formats_attr,
+};
+
+static void skx_upi_uncore_pci_init_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+
+ __set_bit(UNCORE_BOX_FLAG_CTL_OFFS8, &box->flags);
+ pci_write_config_dword(pdev, SKX_UPI_PCI_PMON_BOX_CTL, IVBEP_PMON_BOX_CTL_INT);
+}
+
+static struct intel_uncore_ops skx_upi_uncore_pci_ops = {
+ .init_box = skx_upi_uncore_pci_init_box,
+ .disable_box = snbep_uncore_pci_disable_box,
+ .enable_box = snbep_uncore_pci_enable_box,
+ .disable_event = snbep_uncore_pci_disable_event,
+ .enable_event = snbep_uncore_pci_enable_event,
+ .read_counter = snbep_uncore_pci_read_counter,
+};
+
+static umode_t
+skx_upi_mapping_visible(struct kobject *kobj, struct attribute *attr, int die)
+{
+ struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(kobj_to_dev(kobj));
+
+ return pmu->type->topology[die][pmu->pmu_idx].upi->enabled ? attr->mode : 0;
+}
+
+static ssize_t skx_upi_mapping_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(dev);
+ struct dev_ext_attribute *ea = to_dev_ext_attribute(attr);
+ long die = (long)ea->var;
+ struct uncore_upi_topology *upi = pmu->type->topology[die][pmu->pmu_idx].upi;
+
+ return sysfs_emit(buf, "upi_%d,die_%d\n", upi->pmu_idx_to, upi->die_to);
+}
+
+#define SKX_UPI_REG_DID 0x2058
+#define SKX_UPI_REGS_ADDR_DEVICE_LINK0 0x0e
+#define SKX_UPI_REGS_ADDR_FUNCTION 0x00
+
+/*
+ * UPI Link Parameter 0
+ * | Bit | Default | Description
+ * | 19:16 | 0h | base_nodeid - The NodeID of the sending socket.
+ * | 12:8 | 00h | sending_port - The processor die port number of the sending port.
+ */
+#define SKX_KTILP0_OFFSET 0x94
+
+/*
+ * UPI Pcode Status. This register is used by PCode to store the link training status.
+ * | Bit | Default | Description
+ * | 4 | 0h | ll_status_valid - Bit indicates the valid training status
+ * logged from PCode to the BIOS.
+ */
+#define SKX_KTIPCSTS_OFFSET 0x120
+
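+/*
+ * Record whether the UPI link is trained (KTIPCSTS) and, if so, the peer
+ * die and port it connects to (KTILP0).
+ */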
+static int upi_fill_topology(struct pci_dev *dev, struct intel_uncore_topology *tp,
+ int pmu_idx)
+{
+ int ret;
+ u32 upi_conf;
+ struct uncore_upi_topology *upi = tp->upi;
+
+ tp->pmu_idx = pmu_idx;
+ ret = pci_read_config_dword(dev, SKX_KTIPCSTS_OFFSET, &upi_conf);
+ if (ret) {
+ ret = pcibios_err_to_errno(ret);
+ goto err;
+ }
+ upi->enabled = (upi_conf >> 4) & 1;
+ if (upi->enabled) {
+ ret = pci_read_config_dword(dev, SKX_KTILP0_OFFSET,
+ &upi_conf);
+ if (ret) {
+ ret = pcibios_err_to_errno(ret);
+ goto err;
+ }
+ upi->die_to = (upi_conf >> 16) & 0xf;
+ upi->pmu_idx_to = (upi_conf >> 8) & 0x1f;
+ }
+err:
+ return ret;
+}
+
+static int skx_upi_topology_cb(struct intel_uncore_type *type, int segment,
+ int die, u64 cpu_bus_msr)
+{
+ int idx, ret = 0; /* stays 0 if no UPI device is found for this die */
+ struct intel_uncore_topology *upi;
+ unsigned int devfn;
+ struct pci_dev *dev = NULL;
+ u8 bus = cpu_bus_msr >> (3 * BUS_NUM_STRIDE);
+
+ for (idx = 0; idx < type->num_boxes; idx++) {
+ upi = &type->topology[die][idx];
+ devfn = PCI_DEVFN(SKX_UPI_REGS_ADDR_DEVICE_LINK0 + idx,
+ SKX_UPI_REGS_ADDR_FUNCTION);
+ dev = pci_get_domain_bus_and_slot(segment, bus, devfn);
+ if (dev) {
+ ret = upi_fill_topology(dev, upi, idx);
+ /* drop the reference taken by pci_get_domain_bus_and_slot() */
+ pci_dev_put(dev);
+ if (ret)
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static int skx_upi_get_topology(struct intel_uncore_type *type)
+{
+ /* CPX case is not supported */
+ if (boot_cpu_data.x86_stepping == 11)
+ return -EPERM;
+
+ return skx_pmu_get_topology(type, skx_upi_topology_cb);
+}
+
+static struct attribute_group skx_upi_mapping_group = {
+ .is_visible = skx_upi_mapping_visible,
+};
+
+static const struct attribute_group *skx_upi_attr_update[] = {
+ &skx_upi_mapping_group,
+ NULL
+};
+
+static void
+pmu_upi_set_mapping(struct intel_uncore_type *type, struct attribute_group *ag)
+{
+ pmu_set_mapping(type, ag, skx_upi_mapping_show, UPI_TOPOLOGY_TYPE);
+}
+
+static void skx_upi_set_mapping(struct intel_uncore_type *type)
+{
+ pmu_upi_set_mapping(type, &skx_upi_mapping_group);
+}
+
+static void skx_upi_cleanup_mapping(struct intel_uncore_type *type)
+{
+ pmu_cleanup_mapping(type, &skx_upi_mapping_group);
+}
+
+static struct intel_uncore_type skx_uncore_upi = {
+ .name = "upi",
+ .num_counters = 4,
+ .num_boxes = 3,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SKX_UPI_PCI_PMON_CTR0,
+ .event_ctl = SKX_UPI_PCI_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .event_mask_ext = SKX_UPI_CTL_UMASK_EXT,
+ .box_ctl = SKX_UPI_PCI_PMON_BOX_CTL,
+ .ops = &skx_upi_uncore_pci_ops,
+ .format_group = &skx_upi_uncore_format_group,
+ .attr_update = skx_upi_attr_update,
+ .get_topology = skx_upi_get_topology,
+ .set_mapping = skx_upi_set_mapping,
+ .cleanup_mapping = skx_upi_cleanup_mapping,
+};
+
+static void skx_m2m_uncore_pci_init_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+
+ __set_bit(UNCORE_BOX_FLAG_CTL_OFFS8, &box->flags);
+ pci_write_config_dword(pdev, SKX_M2M_PCI_PMON_BOX_CTL, IVBEP_PMON_BOX_CTL_INT);
+}
+
+static struct intel_uncore_ops skx_m2m_uncore_pci_ops = {
+ .init_box = skx_m2m_uncore_pci_init_box,
+ .disable_box = snbep_uncore_pci_disable_box,
+ .enable_box = snbep_uncore_pci_enable_box,
+ .disable_event = snbep_uncore_pci_disable_event,
+ .enable_event = snbep_uncore_pci_enable_event,
+ .read_counter = snbep_uncore_pci_read_counter,
+};
+
+static struct intel_uncore_type skx_uncore_m2m = {
+ .name = "m2m",
+ .num_counters = 4,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SKX_M2M_PCI_PMON_CTR0,
+ .event_ctl = SKX_M2M_PCI_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SKX_M2M_PCI_PMON_BOX_CTL,
+ .ops = &skx_m2m_uncore_pci_ops,
+ .format_group = &skx_uncore_format_group,
+};
+
+static struct event_constraint skx_uncore_m2pcie_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type skx_uncore_m2pcie = {
+ .name = "m2pcie",
+ .num_counters = 4,
+ .num_boxes = 4,
+ .perf_ctr_bits = 48,
+ .constraints = skx_uncore_m2pcie_constraints,
+ .perf_ctr = SNBEP_PCI_PMON_CTR0,
+ .event_ctl = SNBEP_PCI_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .ops = &ivbep_uncore_pci_ops,
+ .format_group = &skx_uncore_format_group,
+};
+
+static struct event_constraint skx_uncore_m3upi_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x1d, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x1e, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x40, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x4e, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x4f, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x50, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x51, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x52, 0x7),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type skx_uncore_m3upi = {
+ .name = "m3upi",
+ .num_counters = 3,
+ .num_boxes = 3,
+ .perf_ctr_bits = 48,
+ .constraints = skx_uncore_m3upi_constraints,
+ .perf_ctr = SNBEP_PCI_PMON_CTR0,
+ .event_ctl = SNBEP_PCI_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .ops = &ivbep_uncore_pci_ops,
+ .format_group = &skx_uncore_format_group,
+};
+
+enum {
+ SKX_PCI_UNCORE_IMC,
+ SKX_PCI_UNCORE_M2M,
+ SKX_PCI_UNCORE_UPI,
+ SKX_PCI_UNCORE_M2PCIE,
+ SKX_PCI_UNCORE_M3UPI,
+};
+
+static struct intel_uncore_type *skx_pci_uncores[] = {
+ [SKX_PCI_UNCORE_IMC] = &skx_uncore_imc,
+ [SKX_PCI_UNCORE_M2M] = &skx_uncore_m2m,
+ [SKX_PCI_UNCORE_UPI] = &skx_uncore_upi,
+ [SKX_PCI_UNCORE_M2PCIE] = &skx_uncore_m2pcie,
+ [SKX_PCI_UNCORE_M3UPI] = &skx_uncore_m3upi,
+ NULL,
+};
+
+static const struct pci_device_id skx_uncore_pci_ids[] = {
+ { /* MC0 Channel 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2042),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(10, 2, SKX_PCI_UNCORE_IMC, 0),
+ },
+ { /* MC0 Channel 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2046),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(10, 6, SKX_PCI_UNCORE_IMC, 1),
+ },
+ { /* MC0 Channel 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204a),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(11, 2, SKX_PCI_UNCORE_IMC, 2),
+ },
+ { /* MC1 Channel 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2042),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(12, 2, SKX_PCI_UNCORE_IMC, 3),
+ },
+ { /* MC1 Channel 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2046),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(12, 6, SKX_PCI_UNCORE_IMC, 4),
+ },
+ { /* MC1 Channel 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204a),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(13, 2, SKX_PCI_UNCORE_IMC, 5),
+ },
+ { /* M2M0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2066),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(8, 0, SKX_PCI_UNCORE_M2M, 0),
+ },
+ { /* M2M1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2066),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(9, 0, SKX_PCI_UNCORE_M2M, 1),
+ },
+ { /* UPI0 Link 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2058),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(14, 0, SKX_PCI_UNCORE_UPI, 0),
+ },
+ { /* UPI0 Link 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2058),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(15, 0, SKX_PCI_UNCORE_UPI, 1),
+ },
+ { /* UPI1 Link 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2058),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(16, 0, SKX_PCI_UNCORE_UPI, 2),
+ },
+ { /* M2PCIe 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2088),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(21, 1, SKX_PCI_UNCORE_M2PCIE, 0),
+ },
+ { /* M2PCIe 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2088),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(22, 1, SKX_PCI_UNCORE_M2PCIE, 1),
+ },
+ { /* M2PCIe 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2088),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(23, 1, SKX_PCI_UNCORE_M2PCIE, 2),
+ },
+ { /* M2PCIe 3 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2088),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(21, 5, SKX_PCI_UNCORE_M2PCIE, 3),
+ },
+ { /* M3UPI0 Link 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204D),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 1, SKX_PCI_UNCORE_M3UPI, 0),
+ },
+ { /* M3UPI0 Link 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204E),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 2, SKX_PCI_UNCORE_M3UPI, 1),
+ },
+ { /* M3UPI1 Link 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204D),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 5, SKX_PCI_UNCORE_M3UPI, 2),
+ },
+ { /* end: all zeroes */ }
+};
+
+static struct pci_driver skx_uncore_pci_driver = {
+ .name = "skx_uncore",
+ .id_table = skx_uncore_pci_ids,
+};
+
+int skx_uncore_pci_init(void)
+{
+ /* SKX UBOX DID */
+ int ret = snbep_pci2phy_map_init(0x2014, SKX_CPUNODEID, SKX_GIDNIDMAP, false);
+
+ if (ret)
+ return ret;
+
+ uncore_pci_uncores = skx_pci_uncores;
+ uncore_pci_driver = &skx_uncore_pci_driver;
+ return 0;
+}
+
+/* end of SKX uncore support */
+
+/* SNR uncore support */
+
+static struct intel_uncore_type snr_uncore_ubox = {
+ .name = "ubox",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = SNR_U_MSR_PMON_CTR0,
+ .event_ctl = SNR_U_MSR_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = SNR_U_MSR_PMON_UCLK_FIXED_CTR,
+ .fixed_ctl = SNR_U_MSR_PMON_UCLK_FIXED_CTL,
+ .ops = &ivbep_uncore_msr_ops,
+ .format_group = &ivbep_uncore_format_group,
+};
+
+static struct attribute *snr_uncore_cha_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask_ext2.attr,
+ &format_attr_edge.attr,
+ &format_attr_tid_en.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_filter_tid5.attr,
+ NULL,
+};
+static const struct attribute_group snr_uncore_chabox_format_group = {
+ .name = "format",
+ .attrs = snr_uncore_cha_formats_attr,
+};
+
+static int snr_cha_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+
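+ /* Each CHA exposes its own FILTER0 register, spaced msr_offset apart. */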
+ reg1->reg = SNR_C0_MSR_PMON_BOX_FILTER0 +
+ box->pmu->type->msr_offset * box->pmu->pmu_idx;
+ reg1->config = event->attr.config1 & SKX_CHA_MSR_PMON_BOX_FILTER_TID;
+ reg1->idx = 0;
+
+ return 0;
+}
+
+static void snr_cha_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+
+ if (reg1->idx != EXTRA_REG_NONE)
+ wrmsrq(reg1->reg, reg1->config);
+
+ wrmsrq(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static struct intel_uncore_ops snr_uncore_chabox_ops = {
+ .init_box = ivbep_uncore_msr_init_box,
+ .disable_box = snbep_uncore_msr_disable_box,
+ .enable_box = snbep_uncore_msr_enable_box,
+ .disable_event = snbep_uncore_msr_disable_event,
+ .enable_event = snr_cha_enable_event,
+ .read_counter = uncore_msr_read_counter,
+ .hw_config = snr_cha_hw_config,
+};
+
+static struct intel_uncore_type snr_uncore_chabox = {
+ .name = "cha",
+ .num_counters = 4,
+ .num_boxes = 6,
+ .perf_ctr_bits = 48,
+ .event_ctl = SNR_CHA_MSR_PMON_CTL0,
+ .perf_ctr = SNR_CHA_MSR_PMON_CTR0,
+ .box_ctl = SNR_CHA_MSR_PMON_BOX_CTL,
+ .msr_offset = HSWEP_CBO_MSR_OFFSET,
+ .event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK,
+ .event_mask_ext = SNR_CHA_RAW_EVENT_MASK_EXT,
+ .ops = &snr_uncore_chabox_ops,
+ .format_group = &snr_uncore_chabox_format_group,
+};
+
+static struct attribute *snr_uncore_iio_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh9.attr,
+ &format_attr_ch_mask2.attr,
+ &format_attr_fc_mask2.attr,
+ NULL,
+};
+
+static const struct attribute_group snr_uncore_iio_format_group = {
+ .name = "format",
+ .attrs = snr_uncore_iio_formats_attr,
+};
+
+static umode_t
+snr_iio_mapping_visible(struct kobject *kobj, struct attribute *attr, int die)
+{
+ /* Root bus 0x00 is valid only for pmu_idx = 1. */
+ return pmu_iio_mapping_visible(kobj, attr, die, 1);
+}
+
+static struct attribute_group snr_iio_mapping_group = {
+ .is_visible = snr_iio_mapping_visible,
+};
+
+static const struct attribute_group *snr_iio_attr_update[] = {
+ &snr_iio_mapping_group,
+ NULL,
+};
+
+static int sad_cfg_iio_topology(struct intel_uncore_type *type, u8 *sad_pmon_mapping)
+{
+ u32 sad_cfg;
+ int die, stack_id, ret = -EPERM;
+ struct pci_dev *dev = NULL;
+
+ while ((dev = pci_get_device(PCI_VENDOR_ID_INTEL, SNR_ICX_MESH2IIO_MMAP_DID, dev))) {
+ ret = pci_read_config_dword(dev, SNR_ICX_SAD_CONTROL_CFG, &sad_cfg);
+ if (ret) {
+ ret = pcibios_err_to_errno(ret);
+ break;
+ }
+
+ die = uncore_pcibus_to_dieid(dev->bus);
+ stack_id = SAD_CONTROL_STACK_ID(sad_cfg);
+ if (die < 0 || stack_id >= type->num_boxes) {
+ ret = -EPERM;
+ break;
+ }
+
+ /* Convert stack id from SAD_CONTROL to PMON notation. */
+ stack_id = sad_pmon_mapping[stack_id];
+
+ type->topology[die][stack_id].iio->segment = pci_domain_nr(dev->bus);
+ type->topology[die][stack_id].pmu_idx = stack_id;
+ type->topology[die][stack_id].iio->pci_bus_no = dev->bus->number;
+ }
+
+ pci_dev_put(dev);
+
+ return ret;
+}
+
+/*
+ * SNR has a static mapping of stack IDs from SAD_CONTROL_CFG notation to PMON
+ */
+enum {
+ SNR_QAT_PMON_ID,
+ SNR_CBDMA_DMI_PMON_ID,
+ SNR_NIS_PMON_ID,
+ SNR_DLB_PMON_ID,
+ SNR_PCIE_GEN3_PMON_ID
+};
+
+static u8 snr_sad_pmon_mapping[] = {
+ SNR_CBDMA_DMI_PMON_ID,
+ SNR_PCIE_GEN3_PMON_ID,
+ SNR_DLB_PMON_ID,
+ SNR_NIS_PMON_ID,
+ SNR_QAT_PMON_ID
+};
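+
+/*
+ * For instance, SAD stack 0 (the CBDMA/DMI stack) is counted by PMON box 1:
+ * snr_sad_pmon_mapping[0] == SNR_CBDMA_DMI_PMON_ID == 1.
+ */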
+
+static int snr_iio_get_topology(struct intel_uncore_type *type)
+{
+ return sad_cfg_iio_topology(type, snr_sad_pmon_mapping);
+}
+
+static void snr_iio_set_mapping(struct intel_uncore_type *type)
+{
+ pmu_iio_set_mapping(type, &snr_iio_mapping_group);
+}
+
+static void snr_iio_cleanup_mapping(struct intel_uncore_type *type)
+{
+ pmu_cleanup_mapping(type, &snr_iio_mapping_group);
+}
+
+static struct event_constraint snr_uncore_iio_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x83, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0xc0, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0xd5, 0xc),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type snr_uncore_iio = {
+ .name = "iio",
+ .num_counters = 4,
+ .num_boxes = 5,
+ .perf_ctr_bits = 48,
+ .event_ctl = SNR_IIO_MSR_PMON_CTL0,
+ .perf_ctr = SNR_IIO_MSR_PMON_CTR0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .event_mask_ext = SNR_IIO_PMON_RAW_EVENT_MASK_EXT,
+ .box_ctl = SNR_IIO_MSR_PMON_BOX_CTL,
+ .msr_offset = SNR_IIO_MSR_OFFSET,
+ .constraints = snr_uncore_iio_constraints,
+ .ops = &ivbep_uncore_msr_ops,
+ .format_group = &snr_uncore_iio_format_group,
+ .attr_update = snr_iio_attr_update,
+ .get_topology = snr_iio_get_topology,
+ .set_mapping = snr_iio_set_mapping,
+ .cleanup_mapping = snr_iio_cleanup_mapping,
+};
+
+static struct intel_uncore_type snr_uncore_irp = {
+ .name = "irp",
+ .num_counters = 2,
+ .num_boxes = 5,
+ .perf_ctr_bits = 48,
+ .event_ctl = SNR_IRP0_MSR_PMON_CTL0,
+ .perf_ctr = SNR_IRP0_MSR_PMON_CTR0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNR_IRP0_MSR_PMON_BOX_CTL,
+ .msr_offset = SNR_IRP_MSR_OFFSET,
+ .ops = &ivbep_uncore_msr_ops,
+ .format_group = &ivbep_uncore_format_group,
+};
+
+static struct intel_uncore_type snr_uncore_m2pcie = {
+ .name = "m2pcie",
+ .num_counters = 4,
+ .num_boxes = 5,
+ .perf_ctr_bits = 48,
+ .event_ctl = SNR_M2PCIE_MSR_PMON_CTL0,
+ .perf_ctr = SNR_M2PCIE_MSR_PMON_CTR0,
+ .box_ctl = SNR_M2PCIE_MSR_PMON_BOX_CTL,
+ .msr_offset = SNR_M2PCIE_MSR_OFFSET,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .ops = &ivbep_uncore_msr_ops,
+ .format_group = &ivbep_uncore_format_group,
+};
+
+static int snr_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK;
+
+ if (ev_sel >= 0xb && ev_sel <= 0xe) {
+ reg1->reg = SNR_PCU_MSR_PMON_BOX_FILTER;
+ reg1->idx = ev_sel - 0xb;
+ reg1->config = event->attr.config1 & (0xff << reg1->idx);
+ }
+ return 0;
+}
+
+static struct intel_uncore_ops snr_uncore_pcu_ops = {
+ IVBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+ .hw_config = snr_pcu_hw_config,
+ .get_constraint = snbep_pcu_get_constraint,
+ .put_constraint = snbep_pcu_put_constraint,
+};
+
+static struct intel_uncore_type snr_uncore_pcu = {
+ .name = "pcu",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SNR_PCU_MSR_PMON_CTR0,
+ .event_ctl = SNR_PCU_MSR_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNR_PCU_MSR_PMON_BOX_CTL,
+ .num_shared_regs = 1,
+ .ops = &snr_uncore_pcu_ops,
+ .format_group = &skx_uncore_pcu_format_group,
+};
+
+enum perf_uncore_snr_iio_freerunning_type_id {
+ SNR_IIO_MSR_IOCLK,
+ SNR_IIO_MSR_BW_IN,
+
+ SNR_IIO_FREERUNNING_TYPE_MAX,
+};
+
+static struct freerunning_counters snr_iio_freerunning[] = {
+ [SNR_IIO_MSR_IOCLK] = { 0x1eac, 0x1, 0x10, 1, 48 },
+ [SNR_IIO_MSR_BW_IN] = { 0x1f00, 0x1, 0x10, 8, 48 },
+};
+
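+/*
+ * The bw_in scale below converts the raw count to MiB: each increment
+ * accounts for 32 bytes, and 32 / 2^20 = 3.0517578125e-5.
+ */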
+static struct uncore_event_desc snr_uncore_iio_freerunning_events[] = {
+ /* Free-Running IIO CLOCKS Counter */
+ INTEL_UNCORE_EVENT_DESC(ioclk, "event=0xff,umask=0x10"),
+ /* Free-Running IIO BANDWIDTH IN Counters */
+ INTEL_UNCORE_EVENT_DESC(bw_in_port0, "event=0xff,umask=0x20"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port0.scale, "3.0517578125e-5"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port0.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port1, "event=0xff,umask=0x21"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port1.scale, "3.0517578125e-5"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port1.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port2, "event=0xff,umask=0x22"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port2.scale, "3.0517578125e-5"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port2.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port3, "event=0xff,umask=0x23"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port3.scale, "3.0517578125e-5"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port3.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port4, "event=0xff,umask=0x24"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port4.scale, "3.0517578125e-5"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port4.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port5, "event=0xff,umask=0x25"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port5.scale, "3.0517578125e-5"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port5.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port6, "event=0xff,umask=0x26"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port6.scale, "3.0517578125e-5"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port6.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port7, "event=0xff,umask=0x27"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port7.scale, "3.0517578125e-5"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port7.unit, "MiB"),
+ { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_type snr_uncore_iio_free_running = {
+ .name = "iio_free_running",
+ .num_counters = 9,
+ .num_boxes = 5,
+ .num_freerunning_types = SNR_IIO_FREERUNNING_TYPE_MAX,
+ .freerunning = snr_iio_freerunning,
+ .ops = &skx_uncore_iio_freerunning_ops,
+ .event_descs = snr_uncore_iio_freerunning_events,
+ .format_group = &skx_uncore_iio_freerunning_format_group,
+};
+
+static struct intel_uncore_type *snr_msr_uncores[] = {
+ &snr_uncore_ubox,
+ &snr_uncore_chabox,
+ &snr_uncore_iio,
+ &snr_uncore_irp,
+ &snr_uncore_m2pcie,
+ &snr_uncore_pcu,
+ &snr_uncore_iio_free_running,
+ NULL,
+};
+
+void snr_uncore_cpu_init(void)
+{
+ uncore_msr_uncores = snr_msr_uncores;
+}
+
+static void snr_m2m_uncore_pci_init_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ int box_ctl = uncore_pci_box_ctl(box);
+
+ __set_bit(UNCORE_BOX_FLAG_CTL_OFFS8, &box->flags);
+ pci_write_config_dword(pdev, box_ctl, IVBEP_PMON_BOX_CTL_INT);
+}
+
+static struct intel_uncore_ops snr_m2m_uncore_pci_ops = {
+ .init_box = snr_m2m_uncore_pci_init_box,
+ .disable_box = snbep_uncore_pci_disable_box,
+ .enable_box = snbep_uncore_pci_enable_box,
+ .disable_event = snbep_uncore_pci_disable_event,
+ .enable_event = snbep_uncore_pci_enable_event,
+ .read_counter = snbep_uncore_pci_read_counter,
+};
+
+static struct attribute *snr_m2m_uncore_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask_ext3.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ NULL,
+};
+
+static const struct attribute_group snr_m2m_uncore_format_group = {
+ .name = "format",
+ .attrs = snr_m2m_uncore_formats_attr,
+};
+
+static struct intel_uncore_type snr_uncore_m2m = {
+ .name = "m2m",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SNR_M2M_PCI_PMON_CTR0,
+ .event_ctl = SNR_M2M_PCI_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .event_mask_ext = SNR_M2M_PCI_PMON_UMASK_EXT,
+ .box_ctl = SNR_M2M_PCI_PMON_BOX_CTL,
+ .ops = &snr_m2m_uncore_pci_ops,
+ .format_group = &snr_m2m_uncore_format_group,
+};
+
+static void snr_uncore_pci_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+
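+ /* The 64-bit event control is split across two 32-bit config writes. */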
+ pci_write_config_dword(pdev, hwc->config_base, (u32)(hwc->config | SNBEP_PMON_CTL_EN));
+ pci_write_config_dword(pdev, hwc->config_base + 4, (u32)(hwc->config >> 32));
+}
+
+static struct intel_uncore_ops snr_pcie3_uncore_pci_ops = {
+ .init_box = snr_m2m_uncore_pci_init_box,
+ .disable_box = snbep_uncore_pci_disable_box,
+ .enable_box = snbep_uncore_pci_enable_box,
+ .disable_event = snbep_uncore_pci_disable_event,
+ .enable_event = snr_uncore_pci_enable_event,
+ .read_counter = snbep_uncore_pci_read_counter,
+};
+
+static struct intel_uncore_type snr_uncore_pcie3 = {
+ .name = "pcie3",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SNR_PCIE3_PCI_PMON_CTR0,
+ .event_ctl = SNR_PCIE3_PCI_PMON_CTL0,
+ .event_mask = SKX_IIO_PMON_RAW_EVENT_MASK,
+ .event_mask_ext = SKX_IIO_PMON_RAW_EVENT_MASK_EXT,
+ .box_ctl = SNR_PCIE3_PCI_PMON_BOX_CTL,
+ .ops = &snr_pcie3_uncore_pci_ops,
+ .format_group = &skx_uncore_iio_format_group,
+};
+
+enum {
+ SNR_PCI_UNCORE_M2M,
+ SNR_PCI_UNCORE_PCIE3,
+};
+
+static struct intel_uncore_type *snr_pci_uncores[] = {
+ [SNR_PCI_UNCORE_M2M] = &snr_uncore_m2m,
+ [SNR_PCI_UNCORE_PCIE3] = &snr_uncore_pcie3,
+ NULL,
+};
+
+static const struct pci_device_id snr_uncore_pci_ids[] = {
+ { /* M2M */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x344a),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(12, 0, SNR_PCI_UNCORE_M2M, 0),
+ },
+ { /* end: all zeroes */ }
+};
+
+static struct pci_driver snr_uncore_pci_driver = {
+ .name = "snr_uncore",
+ .id_table = snr_uncore_pci_ids,
+};
+
+static const struct pci_device_id snr_uncore_pci_sub_ids[] = {
+ { /* PCIe3 RP */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x334a),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(4, 0, SNR_PCI_UNCORE_PCIE3, 0),
+ },
+ { /* end: all zeroes */ }
+};
+
+static struct pci_driver snr_uncore_pci_sub_driver = {
+ .name = "snr_uncore_sub",
+ .id_table = snr_uncore_pci_sub_ids,
+};
+
+int snr_uncore_pci_init(void)
+{
+ /* SNR UBOX DID */
+ int ret = snbep_pci2phy_map_init(0x3460, SKX_CPUNODEID,
+ SKX_GIDNIDMAP, true);
+
+ if (ret)
+ return ret;
+
+ uncore_pci_uncores = snr_pci_uncores;
+ uncore_pci_driver = &snr_uncore_pci_driver;
+ uncore_pci_sub_driver = &snr_uncore_pci_sub_driver;
+ return 0;
+}
+
+#define SNR_MC_DEVICE_ID 0x3451
+
+static struct pci_dev *snr_uncore_get_mc_dev(unsigned int device, int id)
+{
+ struct pci_dev *mc_dev = NULL;
+ int pkg;
+
+ while (1) {
+ mc_dev = pci_get_device(PCI_VENDOR_ID_INTEL, device, mc_dev);
+ if (!mc_dev)
+ break;
+ pkg = uncore_pcibus_to_dieid(mc_dev->bus);
+ if (pkg == id)
+ break;
+ }
+ return mc_dev;
+}
+
+static int snr_uncore_mmio_map(struct intel_uncore_box *box,
+ unsigned int box_ctl, int mem_offset,
+ unsigned int device)
+{
+ struct pci_dev *pdev = snr_uncore_get_mc_dev(device, box->dieid);
+ struct intel_uncore_type *type = box->pmu->type;
+ resource_size_t addr;
+ u32 pci_dword;
+
+ if (!pdev)
+ return -ENODEV;
+
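+ /*
+ * Assemble the MC PMON MMIO address from two config registers: the
+ * base (8 MiB granularity, hence the << 23) and the per-MC MEM
+ * offset (4 KiB granularity, hence the << 12).
+ */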
+ pci_read_config_dword(pdev, SNR_IMC_MMIO_BASE_OFFSET, &pci_dword);
+ addr = ((resource_size_t)pci_dword & SNR_IMC_MMIO_BASE_MASK) << 23;
+
+ pci_read_config_dword(pdev, mem_offset, &pci_dword);
+ addr |= (pci_dword & SNR_IMC_MMIO_MEM0_MASK) << 12;
+
+ addr += box_ctl;
+
+ pci_dev_put(pdev);
+
+ box->io_addr = ioremap(addr, type->mmio_map_size);
+ if (!box->io_addr) {
+ pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void __snr_uncore_mmio_init_box(struct intel_uncore_box *box,
+ unsigned int box_ctl, int mem_offset,
+ unsigned int device)
+{
+ if (!snr_uncore_mmio_map(box, box_ctl, mem_offset, device))
+ writel(IVBEP_PMON_BOX_CTL_INT, box->io_addr);
+}
+
+static void snr_uncore_mmio_init_box(struct intel_uncore_box *box)
+{
+ __snr_uncore_mmio_init_box(box, uncore_mmio_box_ctl(box),
+ SNR_IMC_MMIO_MEM0_OFFSET,
+ SNR_MC_DEVICE_ID);
+}
+
+static void snr_uncore_mmio_disable_box(struct intel_uncore_box *box)
+{
+ u32 config;
+
+ if (!box->io_addr)
+ return;
+
+ config = readl(box->io_addr);
+ config |= SNBEP_PMON_BOX_CTL_FRZ;
+ writel(config, box->io_addr);
+}
+
+static void snr_uncore_mmio_enable_box(struct intel_uncore_box *box)
+{
+ u32 config;
+
+ if (!box->io_addr)
+ return;
+
+ config = readl(box->io_addr);
+ config &= ~SNBEP_PMON_BOX_CTL_FRZ;
+ writel(config, box->io_addr);
+}
+
+static void snr_uncore_mmio_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (!box->io_addr)
+ return;
+
+ if (!uncore_mmio_is_valid_offset(box, hwc->config_base))
+ return;
+
+ writel(hwc->config | SNBEP_PMON_CTL_EN,
+ box->io_addr + hwc->config_base);
+}
+
+static void snr_uncore_mmio_disable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (!box->io_addr)
+ return;
+
+ if (!uncore_mmio_is_valid_offset(box, hwc->config_base))
+ return;
+
+ writel(hwc->config, box->io_addr + hwc->config_base);
+}
+
+static struct intel_uncore_ops snr_uncore_mmio_ops = {
+ .init_box = snr_uncore_mmio_init_box,
+ .exit_box = uncore_mmio_exit_box,
+ .disable_box = snr_uncore_mmio_disable_box,
+ .enable_box = snr_uncore_mmio_enable_box,
+ .disable_event = snr_uncore_mmio_disable_event,
+ .enable_event = snr_uncore_mmio_enable_event,
+ .read_counter = uncore_mmio_read_counter,
+};
+
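+/*
+ * Each cas_count increment corresponds to a 64-byte cache line, so the
+ * scale of 64 / 2^20 = 6.103515625e-5 reports bandwidth in MiB.
+ */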
+static struct uncore_event_desc snr_uncore_imc_events[] = {
+ INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x00,umask=0x00"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x0f"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x30"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"),
+ { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_type snr_uncore_imc = {
+ .name = "imc",
+ .num_counters = 4,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .fixed_ctr = SNR_IMC_MMIO_PMON_FIXED_CTR,
+ .fixed_ctl = SNR_IMC_MMIO_PMON_FIXED_CTL,
+ .event_descs = snr_uncore_imc_events,
+ .perf_ctr = SNR_IMC_MMIO_PMON_CTR0,
+ .event_ctl = SNR_IMC_MMIO_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNR_IMC_MMIO_PMON_BOX_CTL,
+ .mmio_offset = SNR_IMC_MMIO_OFFSET,
+ .mmio_map_size = SNR_IMC_MMIO_SIZE,
+ .ops = &snr_uncore_mmio_ops,
+ .format_group = &skx_uncore_format_group,
+};
+
+enum perf_uncore_snr_imc_freerunning_type_id {
+ SNR_IMC_DCLK,
+ SNR_IMC_DDR,
+
+ SNR_IMC_FREERUNNING_TYPE_MAX,
+};
+
+static struct freerunning_counters snr_imc_freerunning[] = {
+ [SNR_IMC_DCLK] = { 0x22b0, 0x0, 0, 1, 48 },
+ [SNR_IMC_DDR] = { 0x2290, 0x8, 0, 2, 48 },
+};
+
+static struct uncore_event_desc snr_uncore_imc_freerunning_events[] = {
+ INTEL_UNCORE_EVENT_DESC(dclk, "event=0xff,umask=0x10"),
+
+ INTEL_UNCORE_EVENT_DESC(read, "event=0xff,umask=0x20"),
+ INTEL_UNCORE_EVENT_DESC(read.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(read.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(write, "event=0xff,umask=0x21"),
+ INTEL_UNCORE_EVENT_DESC(write.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(write.unit, "MiB"),
+ { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_ops snr_uncore_imc_freerunning_ops = {
+ .init_box = snr_uncore_mmio_init_box,
+ .exit_box = uncore_mmio_exit_box,
+ .read_counter = uncore_mmio_read_counter,
+ .hw_config = uncore_freerunning_hw_config,
+};
+
+static struct intel_uncore_type snr_uncore_imc_free_running = {
+ .name = "imc_free_running",
+ .num_counters = 3,
+ .num_boxes = 1,
+ .num_freerunning_types = SNR_IMC_FREERUNNING_TYPE_MAX,
+ .mmio_map_size = SNR_IMC_MMIO_SIZE,
+ .freerunning = snr_imc_freerunning,
+ .ops = &snr_uncore_imc_freerunning_ops,
+ .event_descs = snr_uncore_imc_freerunning_events,
+ .format_group = &skx_uncore_iio_freerunning_format_group,
+};
+
+static struct intel_uncore_type *snr_mmio_uncores[] = {
+ &snr_uncore_imc,
+ &snr_uncore_imc_free_running,
+ NULL,
+};
+
+void snr_uncore_mmio_init(void)
+{
+ uncore_mmio_uncores = snr_mmio_uncores;
+}
+
+/* end of SNR uncore support */
+
+/* ICX uncore support */
+
+static u64 icx_cha_msr_offsets[] = {
+ 0x2a0, 0x2ae, 0x2bc, 0x2ca, 0x2d8, 0x2e6, 0x2f4, 0x302, 0x310,
+ 0x31e, 0x32c, 0x33a, 0x348, 0x356, 0x364, 0x372, 0x380, 0x38e,
+ 0x3aa, 0x3b8, 0x3c6, 0x3d4, 0x3e2, 0x3f0, 0x3fe, 0x40c, 0x41a,
+ 0x428, 0x436, 0x444, 0x452, 0x460, 0x46e, 0x47c, 0x0, 0xe,
+ 0x1c, 0x2a, 0x38, 0x46,
+};
+
+static int icx_cha_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ bool tie_en = !!(event->hw.config & SNBEP_CBO_PMON_CTL_TID_EN);
+
+ if (tie_en) {
+ reg1->reg = ICX_C34_MSR_PMON_BOX_FILTER0 +
+ icx_cha_msr_offsets[box->pmu->pmu_idx];
+ reg1->config = event->attr.config1 & SKX_CHA_MSR_PMON_BOX_FILTER_TID;
+ reg1->idx = 0;
+ }
+
+ return 0;
+}
+
+static struct intel_uncore_ops icx_uncore_chabox_ops = {
+ .init_box = ivbep_uncore_msr_init_box,
+ .disable_box = snbep_uncore_msr_disable_box,
+ .enable_box = snbep_uncore_msr_enable_box,
+ .disable_event = snbep_uncore_msr_disable_event,
+ .enable_event = snr_cha_enable_event,
+ .read_counter = uncore_msr_read_counter,
+ .hw_config = icx_cha_hw_config,
+};
+
+static struct intel_uncore_type icx_uncore_chabox = {
+ .name = "cha",
+ .num_counters = 4,
+ .perf_ctr_bits = 48,
+ .event_ctl = ICX_C34_MSR_PMON_CTL0,
+ .perf_ctr = ICX_C34_MSR_PMON_CTR0,
+ .box_ctl = ICX_C34_MSR_PMON_BOX_CTL,
+ .msr_offsets = icx_cha_msr_offsets,
+ .event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK,
+ .event_mask_ext = SNR_CHA_RAW_EVENT_MASK_EXT,
+ .constraints = skx_uncore_chabox_constraints,
+ .ops = &icx_uncore_chabox_ops,
+ .format_group = &snr_uncore_chabox_format_group,
+};
+
+static u64 icx_msr_offsets[] = {
+ 0x0, 0x20, 0x40, 0x90, 0xb0, 0xd0,
+};
+
+static struct event_constraint icx_uncore_iio_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x02, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x03, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x83, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x88, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0xc0, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0xc5, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0xd5, 0xc),
+ EVENT_CONSTRAINT_END
+};
+
+static umode_t
+icx_iio_mapping_visible(struct kobject *kobj, struct attribute *attr, int die)
+{
+ /* Root bus 0x00 is valid only for pmu_idx = 5. */
+ return pmu_iio_mapping_visible(kobj, attr, die, 5);
+}
+
+static struct attribute_group icx_iio_mapping_group = {
+ .is_visible = icx_iio_mapping_visible,
+};
+
+static const struct attribute_group *icx_iio_attr_update[] = {
+ &icx_iio_mapping_group,
+ NULL,
+};
+
+/*
+ * ICX has a static mapping of stack IDs from SAD_CONTROL_CFG notation to PMON
+ */
+enum {
+ ICX_PCIE1_PMON_ID,
+ ICX_PCIE2_PMON_ID,
+ ICX_PCIE3_PMON_ID,
+ ICX_PCIE4_PMON_ID,
+ ICX_PCIE5_PMON_ID,
+ ICX_CBDMA_DMI_PMON_ID
+};
+
+static u8 icx_sad_pmon_mapping[] = {
+ ICX_CBDMA_DMI_PMON_ID,
+ ICX_PCIE1_PMON_ID,
+ ICX_PCIE2_PMON_ID,
+ ICX_PCIE3_PMON_ID,
+ ICX_PCIE4_PMON_ID,
+ ICX_PCIE5_PMON_ID,
+};
+
+static int icx_iio_get_topology(struct intel_uncore_type *type)
+{
+ return sad_cfg_iio_topology(type, icx_sad_pmon_mapping);
+}
+
+static void icx_iio_set_mapping(struct intel_uncore_type *type)
+{
+ /* Detect an ICX-D system; this case is not supported */
+ if (boot_cpu_data.x86_vfm == INTEL_ICELAKE_D) {
+ pmu_clear_mapping_attr(type->attr_update, &icx_iio_mapping_group);
+ return;
+ }
+ pmu_iio_set_mapping(type, &icx_iio_mapping_group);
+}
+
+static void icx_iio_cleanup_mapping(struct intel_uncore_type *type)
+{
+ pmu_cleanup_mapping(type, &icx_iio_mapping_group);
+}
+
+static struct intel_uncore_type icx_uncore_iio = {
+ .name = "iio",
+ .num_counters = 4,
+ .num_boxes = 6,
+ .perf_ctr_bits = 48,
+ .event_ctl = ICX_IIO_MSR_PMON_CTL0,
+ .perf_ctr = ICX_IIO_MSR_PMON_CTR0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .event_mask_ext = SNR_IIO_PMON_RAW_EVENT_MASK_EXT,
+ .box_ctl = ICX_IIO_MSR_PMON_BOX_CTL,
+ .msr_offsets = icx_msr_offsets,
+ .constraints = icx_uncore_iio_constraints,
+ .ops = &skx_uncore_iio_ops,
+ .format_group = &snr_uncore_iio_format_group,
+ .attr_update = icx_iio_attr_update,
+ .get_topology = icx_iio_get_topology,
+ .set_mapping = icx_iio_set_mapping,
+ .cleanup_mapping = icx_iio_cleanup_mapping,
+};
+
+static struct intel_uncore_type icx_uncore_irp = {
+ .name = "irp",
+ .num_counters = 2,
+ .num_boxes = 6,
+ .perf_ctr_bits = 48,
+ .event_ctl = ICX_IRP0_MSR_PMON_CTL0,
+ .perf_ctr = ICX_IRP0_MSR_PMON_CTR0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = ICX_IRP0_MSR_PMON_BOX_CTL,
+ .msr_offsets = icx_msr_offsets,
+ .ops = &ivbep_uncore_msr_ops,
+ .format_group = &ivbep_uncore_format_group,
+};
+
+static struct event_constraint icx_uncore_m2pcie_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x14, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type icx_uncore_m2pcie = {
+ .name = "m2pcie",
+ .num_counters = 4,
+ .num_boxes = 6,
+ .perf_ctr_bits = 48,
+ .event_ctl = ICX_M2PCIE_MSR_PMON_CTL0,
+ .perf_ctr = ICX_M2PCIE_MSR_PMON_CTR0,
+ .box_ctl = ICX_M2PCIE_MSR_PMON_BOX_CTL,
+ .msr_offsets = icx_msr_offsets,
+ .constraints = icx_uncore_m2pcie_constraints,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .ops = &ivbep_uncore_msr_ops,
+ .format_group = &ivbep_uncore_format_group,
+};
+
+enum perf_uncore_icx_iio_freerunning_type_id {
+ ICX_IIO_MSR_IOCLK,
+ ICX_IIO_MSR_BW_IN,
+
+ ICX_IIO_FREERUNNING_TYPE_MAX,
+};
+
+static unsigned icx_iio_clk_freerunning_box_offsets[] = {
+ 0x0, 0x20, 0x40, 0x90, 0xb0, 0xd0,
+};
+
+static unsigned icx_iio_bw_freerunning_box_offsets[] = {
+ 0x0, 0x10, 0x20, 0x90, 0xa0, 0xb0,
+};
+
+static struct freerunning_counters icx_iio_freerunning[] = {
+ [ICX_IIO_MSR_IOCLK] = { 0xa55, 0x1, 0x20, 1, 48, icx_iio_clk_freerunning_box_offsets },
+ [ICX_IIO_MSR_BW_IN] = { 0xaa0, 0x1, 0x10, 8, 48, icx_iio_bw_freerunning_box_offsets },
+};
+
+static struct intel_uncore_type icx_uncore_iio_free_running = {
+ .name = "iio_free_running",
+ .num_counters = 9,
+ .num_boxes = 6,
+ .num_freerunning_types = ICX_IIO_FREERUNNING_TYPE_MAX,
+ .freerunning = icx_iio_freerunning,
+ .ops = &skx_uncore_iio_freerunning_ops,
+ .event_descs = snr_uncore_iio_freerunning_events,
+ .format_group = &skx_uncore_iio_freerunning_format_group,
+};
+
+static struct intel_uncore_type *icx_msr_uncores[] = {
+ &skx_uncore_ubox,
+ &icx_uncore_chabox,
+ &icx_uncore_iio,
+ &icx_uncore_irp,
+ &icx_uncore_m2pcie,
+ &skx_uncore_pcu,
+ &icx_uncore_iio_free_running,
+ NULL,
+};
+
+/*
+ * To determine the number of CHAs, read the CAPID6 (low) and CAPID7 (high)
+ * registers, which are located at Device 30, Function 3.
+ */
+#define ICX_CAPID6 0x9c
+#define ICX_CAPID7 0xa0
+
+static u64 icx_count_chabox(void)
+{
+ struct pci_dev *dev = NULL;
+ u64 caps = 0;
+
+ dev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x345b, dev);
+ if (!dev)
+ goto out;
+
+ pci_read_config_dword(dev, ICX_CAPID6, (u32 *)&caps);
+ pci_read_config_dword(dev, ICX_CAPID7, (u32 *)&caps + 1);
+out:
+ pci_dev_put(dev);
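+ /* Each set bit in the CAPID6/CAPID7 pair marks one enabled CHA. */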
+ return hweight64(caps);
+}
+
+void icx_uncore_cpu_init(void)
+{
+ u64 num_boxes = icx_count_chabox();
+
+ if (WARN_ON(num_boxes > ARRAY_SIZE(icx_cha_msr_offsets)))
+ return;
+ icx_uncore_chabox.num_boxes = num_boxes;
+ uncore_msr_uncores = icx_msr_uncores;
+}
+
+static struct intel_uncore_type icx_uncore_m2m = {
+ .name = "m2m",
+ .num_counters = 4,
+ .num_boxes = 4,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SNR_M2M_PCI_PMON_CTR0,
+ .event_ctl = SNR_M2M_PCI_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .event_mask_ext = SNR_M2M_PCI_PMON_UMASK_EXT,
+ .box_ctl = SNR_M2M_PCI_PMON_BOX_CTL,
+ .ops = &snr_m2m_uncore_pci_ops,
+ .format_group = &snr_m2m_uncore_format_group,
+};
+
+static struct attribute *icx_upi_uncore_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask_ext4.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ NULL,
+};
+
+static const struct attribute_group icx_upi_uncore_format_group = {
+ .name = "format",
+ .attrs = icx_upi_uncore_formats_attr,
+};
+
+#define ICX_UPI_REGS_ADDR_DEVICE_LINK0 0x02
+#define ICX_UPI_REGS_ADDR_FUNCTION 0x01
+
+static int discover_upi_topology(struct intel_uncore_type *type, int ubox_did, int dev_link0)
+{
+ struct pci_dev *ubox = NULL;
+ struct pci_dev *dev = NULL;
+ u32 nid, gid;
+ int idx, lgc_pkg, ret = -EPERM;
+ struct intel_uncore_topology *upi;
+ unsigned int devfn;
+
+ /* The GIDNIDMAP method supports machines with at most 8 sockets. */
+ if (uncore_max_dies() > 8)
+ goto err;
+
+ while ((ubox = pci_get_device(PCI_VENDOR_ID_INTEL, ubox_did, ubox))) {
+ ret = upi_nodeid_groupid(ubox, SKX_CPUNODEID, SKX_GIDNIDMAP, &nid, &gid);
+ if (ret) {
+ ret = pcibios_err_to_errno(ret);
+ break;
+ }
+
+ lgc_pkg = topology_gidnid_map(nid, gid);
+ if (lgc_pkg < 0) {
+ ret = -EPERM;
+ goto err;
+ }
+ for (idx = 0; idx < type->num_boxes; idx++) {
+ upi = &type->topology[lgc_pkg][idx];
+ devfn = PCI_DEVFN(dev_link0 + idx, ICX_UPI_REGS_ADDR_FUNCTION);
+ dev = pci_get_domain_bus_and_slot(pci_domain_nr(ubox->bus),
+ ubox->bus->number,
+ devfn);
+ if (dev) {
+ ret = upi_fill_topology(dev, upi, idx);
+ if (ret)
+ goto err;
+ }
+ }
+ }
+err:
+ pci_dev_put(ubox);
+ pci_dev_put(dev);
+ return ret;
+}
+
+static int icx_upi_get_topology(struct intel_uncore_type *type)
+{
+ return discover_upi_topology(type, ICX_UBOX_DID, ICX_UPI_REGS_ADDR_DEVICE_LINK0);
+}
+
+static struct attribute_group icx_upi_mapping_group = {
+ .is_visible = skx_upi_mapping_visible,
+};
+
+static const struct attribute_group *icx_upi_attr_update[] = {
+ &icx_upi_mapping_group,
+ NULL
+};
+
+static void icx_upi_set_mapping(struct intel_uncore_type *type)
+{
+ pmu_upi_set_mapping(type, &icx_upi_mapping_group);
+}
+
+static void icx_upi_cleanup_mapping(struct intel_uncore_type *type)
+{
+ pmu_cleanup_mapping(type, &icx_upi_mapping_group);
+}
+
+static struct intel_uncore_type icx_uncore_upi = {
+ .name = "upi",
+ .num_counters = 4,
+ .num_boxes = 3,
+ .perf_ctr_bits = 48,
+ .perf_ctr = ICX_UPI_PCI_PMON_CTR0,
+ .event_ctl = ICX_UPI_PCI_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .event_mask_ext = ICX_UPI_CTL_UMASK_EXT,
+ .box_ctl = ICX_UPI_PCI_PMON_BOX_CTL,
+ .ops = &skx_upi_uncore_pci_ops,
+ .format_group = &icx_upi_uncore_format_group,
+ .attr_update = icx_upi_attr_update,
+ .get_topology = icx_upi_get_topology,
+ .set_mapping = icx_upi_set_mapping,
+ .cleanup_mapping = icx_upi_cleanup_mapping,
+};
+
+static struct event_constraint icx_uncore_m3upi_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x1c, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x1d, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x1e, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x1f, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x40, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x4e, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x4f, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x50, 0x7),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type icx_uncore_m3upi = {
+ .name = "m3upi",
+ .num_counters = 4,
+ .num_boxes = 3,
+ .perf_ctr_bits = 48,
+ .perf_ctr = ICX_M3UPI_PCI_PMON_CTR0,
+ .event_ctl = ICX_M3UPI_PCI_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = ICX_M3UPI_PCI_PMON_BOX_CTL,
+ .constraints = icx_uncore_m3upi_constraints,
+ .ops = &ivbep_uncore_pci_ops,
+ .format_group = &skx_uncore_format_group,
+};
+
+enum {
+ ICX_PCI_UNCORE_M2M,
+ ICX_PCI_UNCORE_UPI,
+ ICX_PCI_UNCORE_M3UPI,
+};
+
+static struct intel_uncore_type *icx_pci_uncores[] = {
+ [ICX_PCI_UNCORE_M2M] = &icx_uncore_m2m,
+ [ICX_PCI_UNCORE_UPI] = &icx_uncore_upi,
+ [ICX_PCI_UNCORE_M3UPI] = &icx_uncore_m3upi,
+ NULL,
+};
+
+static const struct pci_device_id icx_uncore_pci_ids[] = {
+ { /* M2M 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x344a),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(12, 0, ICX_PCI_UNCORE_M2M, 0),
+ },
+ { /* M2M 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x344a),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(13, 0, ICX_PCI_UNCORE_M2M, 1),
+ },
+ { /* M2M 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x344a),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(14, 0, ICX_PCI_UNCORE_M2M, 2),
+ },
+ { /* M2M 3 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x344a),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(15, 0, ICX_PCI_UNCORE_M2M, 3),
+ },
+ { /* UPI Link 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3441),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(2, 1, ICX_PCI_UNCORE_UPI, 0),
+ },
+ { /* UPI Link 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3441),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(3, 1, ICX_PCI_UNCORE_UPI, 1),
+ },
+ { /* UPI Link 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3441),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(4, 1, ICX_PCI_UNCORE_UPI, 2),
+ },
+ { /* M3UPI Link 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3446),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(5, 1, ICX_PCI_UNCORE_M3UPI, 0),
+ },
+ { /* M3UPI Link 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3446),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(6, 1, ICX_PCI_UNCORE_M3UPI, 1),
+ },
+ { /* M3UPI Link 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3446),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(7, 1, ICX_PCI_UNCORE_M3UPI, 2),
+ },
+ { /* end: all zeroes */ }
+};
+
+static struct pci_driver icx_uncore_pci_driver = {
+ .name = "icx_uncore",
+ .id_table = icx_uncore_pci_ids,
+};
+
+int icx_uncore_pci_init(void)
+{
+ /* ICX UBOX DID */
+ int ret = snbep_pci2phy_map_init(0x3450, SKX_CPUNODEID,
+ SKX_GIDNIDMAP, true);
+
+ if (ret)
+ return ret;
+
+ uncore_pci_uncores = icx_pci_uncores;
+ uncore_pci_driver = &icx_uncore_pci_driver;
+ return 0;
+}
+
+static void icx_uncore_imc_init_box(struct intel_uncore_box *box)
+{
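+ /*
+ * pmu_idx enumerates IMC channels across all memory controllers:
+ * the remainder selects the channel within a controller (box_ctl
+ * stride) and the quotient selects the controller (mem_offset stride).
+ */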
+ unsigned int box_ctl = box->pmu->type->box_ctl +
+ box->pmu->type->mmio_offset * (box->pmu->pmu_idx % ICX_NUMBER_IMC_CHN);
+ int mem_offset = (box->pmu->pmu_idx / ICX_NUMBER_IMC_CHN) * ICX_IMC_MEM_STRIDE +
+ SNR_IMC_MMIO_MEM0_OFFSET;
+
+ __snr_uncore_mmio_init_box(box, box_ctl, mem_offset,
+ SNR_MC_DEVICE_ID);
+}
+
+static struct intel_uncore_ops icx_uncore_mmio_ops = {
+ .init_box = icx_uncore_imc_init_box,
+ .exit_box = uncore_mmio_exit_box,
+ .disable_box = snr_uncore_mmio_disable_box,
+ .enable_box = snr_uncore_mmio_enable_box,
+ .disable_event = snr_uncore_mmio_disable_event,
+ .enable_event = snr_uncore_mmio_enable_event,
+ .read_counter = uncore_mmio_read_counter,
+};
+
+static struct intel_uncore_type icx_uncore_imc = {
+ .name = "imc",
+ .num_counters = 4,
+ .num_boxes = 12,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .fixed_ctr = SNR_IMC_MMIO_PMON_FIXED_CTR,
+ .fixed_ctl = SNR_IMC_MMIO_PMON_FIXED_CTL,
+ .event_descs = snr_uncore_imc_events,
+ .perf_ctr = SNR_IMC_MMIO_PMON_CTR0,
+ .event_ctl = SNR_IMC_MMIO_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNR_IMC_MMIO_PMON_BOX_CTL,
+ .mmio_offset = SNR_IMC_MMIO_OFFSET,
+ .mmio_map_size = SNR_IMC_MMIO_SIZE,
+ .ops = &icx_uncore_mmio_ops,
+ .format_group = &skx_uncore_format_group,
+};
+
+enum perf_uncore_icx_imc_freerunning_type_id {
+ ICX_IMC_DCLK,
+ ICX_IMC_DDR,
+ ICX_IMC_DDRT,
+
+ ICX_IMC_FREERUNNING_TYPE_MAX,
+};
+
+static struct freerunning_counters icx_imc_freerunning[] = {
+ [ICX_IMC_DCLK] = { 0x22b0, 0x0, 0, 1, 48 },
+ [ICX_IMC_DDR] = { 0x2290, 0x8, 0, 2, 48 },
+ [ICX_IMC_DDRT] = { 0x22a0, 0x8, 0, 2, 48 },
+};
+
+static struct uncore_event_desc icx_uncore_imc_freerunning_events[] = {
+ INTEL_UNCORE_EVENT_DESC(dclk, "event=0xff,umask=0x10"),
+
+ INTEL_UNCORE_EVENT_DESC(read, "event=0xff,umask=0x20"),
+ INTEL_UNCORE_EVENT_DESC(read.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(read.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(write, "event=0xff,umask=0x21"),
+ INTEL_UNCORE_EVENT_DESC(write.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(write.unit, "MiB"),
+
+ INTEL_UNCORE_EVENT_DESC(ddrt_read, "event=0xff,umask=0x30"),
+ INTEL_UNCORE_EVENT_DESC(ddrt_read.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(ddrt_read.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(ddrt_write, "event=0xff,umask=0x31"),
+ INTEL_UNCORE_EVENT_DESC(ddrt_write.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(ddrt_write.unit, "MiB"),
+ { /* end: all zeroes */ },
+};
+
+static void icx_uncore_imc_freerunning_init_box(struct intel_uncore_box *box)
+{
+ int mem_offset = box->pmu->pmu_idx * ICX_IMC_MEM_STRIDE +
+ SNR_IMC_MMIO_MEM0_OFFSET;
+
+ snr_uncore_mmio_map(box, uncore_mmio_box_ctl(box),
+ mem_offset, SNR_MC_DEVICE_ID);
+}
+
+static struct intel_uncore_ops icx_uncore_imc_freerunning_ops = {
+ .init_box = icx_uncore_imc_freerunning_init_box,
+ .exit_box = uncore_mmio_exit_box,
+ .read_counter = uncore_mmio_read_counter,
+ .hw_config = uncore_freerunning_hw_config,
+};
+
+static struct intel_uncore_type icx_uncore_imc_free_running = {
+ .name = "imc_free_running",
+ .num_counters = 5,
+ .num_boxes = 4,
+ .num_freerunning_types = ICX_IMC_FREERUNNING_TYPE_MAX,
+ .mmio_map_size = SNR_IMC_MMIO_SIZE,
+ .freerunning = icx_imc_freerunning,
+ .ops = &icx_uncore_imc_freerunning_ops,
+ .event_descs = icx_uncore_imc_freerunning_events,
+ .format_group = &skx_uncore_iio_freerunning_format_group,
+};
+
+static struct intel_uncore_type *icx_mmio_uncores[] = {
+ &icx_uncore_imc,
+ &icx_uncore_imc_free_running,
+ NULL,
+};
+
+void icx_uncore_mmio_init(void)
+{
+ uncore_mmio_uncores = icx_mmio_uncores;
+}
+
+/* end of ICX uncore support */
+
+/* SPR uncore support */
+
+static void spr_uncore_msr_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+
+ if (reg1->idx != EXTRA_REG_NONE)
+ wrmsrq(reg1->reg, reg1->config);
+
+ wrmsrq(hwc->config_base, hwc->config);
+}
+
+static void spr_uncore_msr_disable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+
+ if (reg1->idx != EXTRA_REG_NONE)
+ wrmsrq(reg1->reg, 0);
+
+ wrmsrq(hwc->config_base, 0);
+}
+
+static int spr_cha_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ bool tie_en = !!(event->hw.config & SPR_CHA_PMON_CTL_TID_EN);
+ struct intel_uncore_type *type = box->pmu->type;
+ int id = intel_uncore_find_discovery_unit_id(type->boxes, -1, box->pmu->pmu_idx);
+
+ if (tie_en) {
+ reg1->reg = SPR_C0_MSR_PMON_BOX_FILTER0 +
+ HSWEP_CBO_MSR_OFFSET * id;
+ reg1->config = event->attr.config1 & SPR_CHA_PMON_BOX_FILTER_TID;
+ reg1->idx = 0;
+ }
+
+ return 0;
+}
+
+static struct intel_uncore_ops spr_uncore_chabox_ops = {
+ .init_box = intel_generic_uncore_msr_init_box,
+ .disable_box = intel_generic_uncore_msr_disable_box,
+ .enable_box = intel_generic_uncore_msr_enable_box,
+ .disable_event = spr_uncore_msr_disable_event,
+ .enable_event = spr_uncore_msr_enable_event,
+ .read_counter = uncore_msr_read_counter,
+ .hw_config = spr_cha_hw_config,
+ .get_constraint = uncore_get_constraint,
+ .put_constraint = uncore_put_constraint,
+};
+
+static struct attribute *spr_uncore_cha_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask_ext5.attr,
+ &format_attr_tid_en2.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_filter_tid5.attr,
+ NULL,
+};
+static const struct attribute_group spr_uncore_chabox_format_group = {
+ .name = "format",
+ .attrs = spr_uncore_cha_formats_attr,
+};
+
+static ssize_t alias_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(dev);
+ char pmu_name[UNCORE_PMU_NAME_LEN];
+
+ uncore_get_alias_name(pmu_name, pmu);
+ return sysfs_emit(buf, "%s\n", pmu_name);
+}
+
+static DEVICE_ATTR_RO(alias);
+
+static struct attribute *uncore_alias_attrs[] = {
+ &dev_attr_alias.attr,
+ NULL
+};
+
+ATTRIBUTE_GROUPS(uncore_alias);
+
+static struct intel_uncore_type spr_uncore_chabox = {
+ .name = "cha",
+ .event_mask = SPR_CHA_PMON_EVENT_MASK,
+ .event_mask_ext = SPR_CHA_EVENT_MASK_EXT,
+ .num_shared_regs = 1,
+ .constraints = skx_uncore_chabox_constraints,
+ .ops = &spr_uncore_chabox_ops,
+ .format_group = &spr_uncore_chabox_format_group,
+ .attr_update = uncore_alias_groups,
+};
+
+static struct intel_uncore_type spr_uncore_iio = {
+ .name = "iio",
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .event_mask_ext = SNR_IIO_PMON_RAW_EVENT_MASK_EXT,
+ .format_group = &snr_uncore_iio_format_group,
+ .attr_update = uncore_alias_groups,
+ .constraints = icx_uncore_iio_constraints,
+};
+
+static struct attribute *spr_uncore_raw_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask_ext4.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ NULL,
+};
+
+static const struct attribute_group spr_uncore_raw_format_group = {
+ .name = "format",
+ .attrs = spr_uncore_raw_formats_attr,
+};
+
+#define SPR_UNCORE_COMMON_FORMAT() \
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK, \
+ .event_mask_ext = SPR_RAW_EVENT_MASK_EXT, \
+ .format_group = &spr_uncore_raw_format_group, \
+ .attr_update = uncore_alias_groups
+
+static struct intel_uncore_type spr_uncore_irp = {
+ SPR_UNCORE_COMMON_FORMAT(),
+ .name = "irp",
+
+};
+
+static struct event_constraint spr_uncore_m2pcie_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x14, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type spr_uncore_m2pcie = {
+ SPR_UNCORE_COMMON_FORMAT(),
+ .name = "m2pcie",
+ .constraints = spr_uncore_m2pcie_constraints,
+};
+
+static struct intel_uncore_type spr_uncore_pcu = {
+ .name = "pcu",
+ .attr_update = uncore_alias_groups,
+};
+
+static void spr_uncore_mmio_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (!box->io_addr)
+ return;
+
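+ /* Fixed counters take only the EN bit; there is no event select to program. */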
+ if (uncore_pmc_fixed(hwc->idx))
+ writel(SNBEP_PMON_CTL_EN, box->io_addr + hwc->config_base);
+ else
+ writel(hwc->config, box->io_addr + hwc->config_base);
+}
+
+static struct intel_uncore_ops spr_uncore_mmio_ops = {
+ .init_box = intel_generic_uncore_mmio_init_box,
+ .exit_box = uncore_mmio_exit_box,
+ .disable_box = intel_generic_uncore_mmio_disable_box,
+ .enable_box = intel_generic_uncore_mmio_enable_box,
+ .disable_event = intel_generic_uncore_mmio_disable_event,
+ .enable_event = spr_uncore_mmio_enable_event,
+ .read_counter = uncore_mmio_read_counter,
+};
+
+static struct uncore_event_desc spr_uncore_imc_events[] = {
+ INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x01,umask=0x00"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x05,umask=0xcf"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x05,umask=0xf0"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"),
+ { /* end: all zeroes */ },
+};
+
+#define SPR_UNCORE_MMIO_COMMON_FORMAT() \
+ SPR_UNCORE_COMMON_FORMAT(), \
+ .ops = &spr_uncore_mmio_ops
+
+static struct intel_uncore_type spr_uncore_imc = {
+ SPR_UNCORE_MMIO_COMMON_FORMAT(),
+ .name = "imc",
+ .fixed_ctr_bits = 48,
+ .fixed_ctr = SNR_IMC_MMIO_PMON_FIXED_CTR,
+ .fixed_ctl = SNR_IMC_MMIO_PMON_FIXED_CTL,
+ .event_descs = spr_uncore_imc_events,
+};
+
+static void spr_uncore_pci_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+
+ pci_write_config_dword(pdev, hwc->config_base + 4, (u32)(hwc->config >> 32));
+ pci_write_config_dword(pdev, hwc->config_base, (u32)hwc->config);
+}
+
+static struct intel_uncore_ops spr_uncore_pci_ops = {
+ .init_box = intel_generic_uncore_pci_init_box,
+ .disable_box = intel_generic_uncore_pci_disable_box,
+ .enable_box = intel_generic_uncore_pci_enable_box,
+ .disable_event = intel_generic_uncore_pci_disable_event,
+ .enable_event = spr_uncore_pci_enable_event,
+ .read_counter = intel_generic_uncore_pci_read_counter,
+};
+
+#define SPR_UNCORE_PCI_COMMON_FORMAT() \
+ SPR_UNCORE_COMMON_FORMAT(), \
+ .ops = &spr_uncore_pci_ops
+
+static struct intel_uncore_type spr_uncore_m2m = {
+ SPR_UNCORE_PCI_COMMON_FORMAT(),
+ .name = "m2m",
+};
+
+static struct attribute_group spr_upi_mapping_group = {
+ .is_visible = skx_upi_mapping_visible,
+};
+
+static const struct attribute_group *spr_upi_attr_update[] = {
+ &uncore_alias_group,
+ &spr_upi_mapping_group,
+ NULL
+};
+
+#define SPR_UPI_REGS_ADDR_DEVICE_LINK0 0x01
+
+static void spr_upi_set_mapping(struct intel_uncore_type *type)
+{
+ pmu_upi_set_mapping(type, &spr_upi_mapping_group);
+}
+
+static void spr_upi_cleanup_mapping(struct intel_uncore_type *type)
+{
+ pmu_cleanup_mapping(type, &spr_upi_mapping_group);
+}
+
+static int spr_upi_get_topology(struct intel_uncore_type *type)
+{
+ return discover_upi_topology(type, SPR_UBOX_DID, SPR_UPI_REGS_ADDR_DEVICE_LINK0);
+}
+
+static struct intel_uncore_type spr_uncore_mdf = {
+ SPR_UNCORE_COMMON_FORMAT(),
+ .name = "mdf",
+};
+
+static void spr_uncore_mmio_offs8_init_box(struct intel_uncore_box *box)
+{
+ __set_bit(UNCORE_BOX_FLAG_CTL_OFFS8, &box->flags);
+ intel_generic_uncore_mmio_init_box(box);
+}
+
+static struct intel_uncore_ops spr_uncore_mmio_offs8_ops = {
+ .init_box = spr_uncore_mmio_offs8_init_box,
+ .exit_box = uncore_mmio_exit_box,
+ .disable_box = intel_generic_uncore_mmio_disable_box,
+ .enable_box = intel_generic_uncore_mmio_enable_box,
+ .disable_event = intel_generic_uncore_mmio_disable_event,
+ .enable_event = spr_uncore_mmio_enable_event,
+ .read_counter = uncore_mmio_read_counter,
+};
+
+#define SPR_UNCORE_MMIO_OFFS8_COMMON_FORMAT() \
+ SPR_UNCORE_COMMON_FORMAT(), \
+ .ops = &spr_uncore_mmio_offs8_ops
+
+static struct event_constraint spr_uncore_cxlcm_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x02, 0x0f),
+ UNCORE_EVENT_CONSTRAINT(0x05, 0x0f),
+ UNCORE_EVENT_CONSTRAINT(0x40, 0xf0),
+ UNCORE_EVENT_CONSTRAINT(0x41, 0xf0),
+ UNCORE_EVENT_CONSTRAINT(0x42, 0xf0),
+ UNCORE_EVENT_CONSTRAINT(0x43, 0xf0),
+ UNCORE_EVENT_CONSTRAINT(0x4b, 0xf0),
+ UNCORE_EVENT_CONSTRAINT(0x52, 0xf0),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type spr_uncore_cxlcm = {
+ SPR_UNCORE_MMIO_OFFS8_COMMON_FORMAT(),
+ .name = "cxlcm",
+ .constraints = spr_uncore_cxlcm_constraints,
+};
+
+static struct intel_uncore_type spr_uncore_cxldp = {
+ SPR_UNCORE_MMIO_OFFS8_COMMON_FORMAT(),
+ .name = "cxldp",
+};
+
+static struct intel_uncore_type spr_uncore_hbm = {
+ SPR_UNCORE_COMMON_FORMAT(),
+ .name = "hbm",
+};
+
+#define UNCORE_SPR_NUM_UNCORE_TYPES 15
+#define UNCORE_SPR_CHA 0
+#define UNCORE_SPR_IIO 1
+#define UNCORE_SPR_IMC 6
+#define UNCORE_SPR_UPI 8
+#define UNCORE_SPR_M3UPI 9
+
+/*
+ * The uncore units that are supported by the discovery table are
+ * defined here.
+ */
+static struct intel_uncore_type *spr_uncores[UNCORE_SPR_NUM_UNCORE_TYPES] = {
+ &spr_uncore_chabox,
+ &spr_uncore_iio,
+ &spr_uncore_irp,
+ &spr_uncore_m2pcie,
+ &spr_uncore_pcu,
+ NULL,
+ &spr_uncore_imc,
+ &spr_uncore_m2m,
+ NULL,
+ NULL,
+ NULL,
+ &spr_uncore_mdf,
+ &spr_uncore_cxlcm,
+ &spr_uncore_cxldp,
+ &spr_uncore_hbm,
+};
+
+/*
+ * The uncore units that are not supported by the discovery table are
+ * implemented below.
+ */
+#define SPR_UNCORE_UPI_NUM_BOXES 4
+
+static u64 spr_upi_pci_offsets[SPR_UNCORE_UPI_NUM_BOXES] = {
+ 0, 0x8000, 0x10000, 0x18000
+};
+
+static void spr_extra_boxes_cleanup(struct intel_uncore_type *type)
+{
+ struct intel_uncore_discovery_unit *pos;
+ struct rb_node *node;
+
+ if (!type->boxes)
+ return;
+
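+ /* Free every discovery unit hanging off the rb tree, then the root itself. */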
+ while (!RB_EMPTY_ROOT(type->boxes)) {
+ node = rb_first(type->boxes);
+ pos = rb_entry(node, struct intel_uncore_discovery_unit, node);
+ rb_erase(node, type->boxes);
+ kfree(pos);
+ }
+ kfree(type->boxes);
+ type->boxes = NULL;
+}
+
+static struct intel_uncore_type spr_uncore_upi = {
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .event_mask_ext = SPR_RAW_EVENT_MASK_EXT,
+ .format_group = &spr_uncore_raw_format_group,
+ .ops = &spr_uncore_pci_ops,
+ .name = "upi",
+ .attr_update = spr_upi_attr_update,
+ .get_topology = spr_upi_get_topology,
+ .set_mapping = spr_upi_set_mapping,
+ .cleanup_mapping = spr_upi_cleanup_mapping,
+ .type_id = UNCORE_SPR_UPI,
+ .num_counters = 4,
+ .num_boxes = SPR_UNCORE_UPI_NUM_BOXES,
+ .perf_ctr_bits = 48,
+ .perf_ctr = ICX_UPI_PCI_PMON_CTR0 - ICX_UPI_PCI_PMON_BOX_CTL,
+ .event_ctl = ICX_UPI_PCI_PMON_CTL0 - ICX_UPI_PCI_PMON_BOX_CTL,
+ .box_ctl = ICX_UPI_PCI_PMON_BOX_CTL,
+ .pci_offsets = spr_upi_pci_offsets,
+ .cleanup_extra_boxes = spr_extra_boxes_cleanup,
+};
+
+static struct intel_uncore_type spr_uncore_m3upi = {
+ SPR_UNCORE_PCI_COMMON_FORMAT(),
+ .name = "m3upi",
+ .type_id = UNCORE_SPR_M3UPI,
+ .num_counters = 4,
+ .num_boxes = SPR_UNCORE_UPI_NUM_BOXES,
+ .perf_ctr_bits = 48,
+ .perf_ctr = ICX_M3UPI_PCI_PMON_CTR0 - ICX_M3UPI_PCI_PMON_BOX_CTL,
+ .event_ctl = ICX_M3UPI_PCI_PMON_CTL0 - ICX_M3UPI_PCI_PMON_BOX_CTL,
+ .box_ctl = ICX_M3UPI_PCI_PMON_BOX_CTL,
+ .pci_offsets = spr_upi_pci_offsets,
+ .constraints = icx_uncore_m3upi_constraints,
+ .cleanup_extra_boxes = spr_extra_boxes_cleanup,
+};
+
+enum perf_uncore_spr_iio_freerunning_type_id {
+ SPR_IIO_MSR_IOCLK,
+ SPR_IIO_MSR_BW_IN,
+ SPR_IIO_MSR_BW_OUT,
+
+ SPR_IIO_FREERUNNING_TYPE_MAX,
+};
+
+static struct freerunning_counters spr_iio_freerunning[] = {
+ [SPR_IIO_MSR_IOCLK] = { 0x340e, 0x1, 0x10, 1, 48 },
+ [SPR_IIO_MSR_BW_IN] = { 0x3800, 0x1, 0x10, 8, 48 },
+ [SPR_IIO_MSR_BW_OUT] = { 0x3808, 0x1, 0x10, 8, 48 },
+};
+
+static struct intel_uncore_type spr_uncore_iio_free_running = {
+ .name = "iio_free_running",
+ .num_counters = 17,
+ .num_freerunning_types = SPR_IIO_FREERUNNING_TYPE_MAX,
+ .freerunning = spr_iio_freerunning,
+ .ops = &skx_uncore_iio_freerunning_ops,
+ .event_descs = snr_uncore_iio_freerunning_events,
+ .format_group = &skx_uncore_iio_freerunning_format_group,
+};
+
+enum perf_uncore_spr_imc_freerunning_type_id {
+ SPR_IMC_DCLK,
+ SPR_IMC_PQ_CYCLES,
+
+ SPR_IMC_FREERUNNING_TYPE_MAX,
+};
+
+static struct freerunning_counters spr_imc_freerunning[] = {
+ [SPR_IMC_DCLK] = { 0x22b0, 0x0, 0, 1, 48 },
+ [SPR_IMC_PQ_CYCLES] = { 0x2318, 0x8, 0, 2, 48 },
+};
+
+static struct uncore_event_desc spr_uncore_imc_freerunning_events[] = {
+ INTEL_UNCORE_EVENT_DESC(dclk, "event=0xff,umask=0x10"),
+
+ INTEL_UNCORE_EVENT_DESC(rpq_cycles, "event=0xff,umask=0x20"),
+ INTEL_UNCORE_EVENT_DESC(wpq_cycles, "event=0xff,umask=0x21"),
+ { /* end: all zeroes */ },
+};
+
+#define SPR_MC_DEVICE_ID 0x3251
+
+static void spr_uncore_imc_freerunning_init_box(struct intel_uncore_box *box)
+{
+ int mem_offset = box->pmu->pmu_idx * ICX_IMC_MEM_STRIDE + SNR_IMC_MMIO_MEM0_OFFSET;
+
+ snr_uncore_mmio_map(box, uncore_mmio_box_ctl(box),
+ mem_offset, SPR_MC_DEVICE_ID);
+}
+
+static struct intel_uncore_ops spr_uncore_imc_freerunning_ops = {
+ .init_box = spr_uncore_imc_freerunning_init_box,
+ .exit_box = uncore_mmio_exit_box,
+ .read_counter = uncore_mmio_read_counter,
+ .hw_config = uncore_freerunning_hw_config,
+};
+
+static struct intel_uncore_type spr_uncore_imc_free_running = {
+ .name = "imc_free_running",
+ .num_counters = 3,
+ .mmio_map_size = SNR_IMC_MMIO_SIZE,
+ .num_freerunning_types = SPR_IMC_FREERUNNING_TYPE_MAX,
+ .freerunning = spr_imc_freerunning,
+ .ops = &spr_uncore_imc_freerunning_ops,
+ .event_descs = spr_uncore_imc_freerunning_events,
+ .format_group = &skx_uncore_iio_freerunning_format_group,
+};
+
+#define UNCORE_SPR_MSR_EXTRA_UNCORES 1
+#define UNCORE_SPR_MMIO_EXTRA_UNCORES 1
+#define UNCORE_SPR_PCI_EXTRA_UNCORES 2
+
+static struct intel_uncore_type *spr_msr_uncores[UNCORE_SPR_MSR_EXTRA_UNCORES] = {
+ &spr_uncore_iio_free_running,
+};
+
+static struct intel_uncore_type *spr_mmio_uncores[UNCORE_SPR_MMIO_EXTRA_UNCORES] = {
+ &spr_uncore_imc_free_running,
+};
+
+static struct intel_uncore_type *spr_pci_uncores[UNCORE_SPR_PCI_EXTRA_UNCORES] = {
+ &spr_uncore_upi,
+ &spr_uncore_m3upi
+};
+
+int spr_uncore_units_ignore[] = {
+ UNCORE_SPR_UPI,
+ UNCORE_SPR_M3UPI,
+ UNCORE_IGNORE_END
+};
+
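+/*
+ * Overlay the statically defined quirks of a platform type onto the
+ * skeleton built from the discovery table; only non-zero fields of
+ * from_type override the generated defaults.
+ */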
+static void uncore_type_customized_copy(struct intel_uncore_type *to_type,
+ struct intel_uncore_type *from_type)
+{
+ if (!to_type || !from_type)
+ return;
+
+ if (from_type->name)
+ to_type->name = from_type->name;
+ if (from_type->fixed_ctr_bits)
+ to_type->fixed_ctr_bits = from_type->fixed_ctr_bits;
+ if (from_type->event_mask)
+ to_type->event_mask = from_type->event_mask;
+ if (from_type->event_mask_ext)
+ to_type->event_mask_ext = from_type->event_mask_ext;
+ if (from_type->fixed_ctr)
+ to_type->fixed_ctr = from_type->fixed_ctr;
+ if (from_type->fixed_ctl)
+ to_type->fixed_ctl = from_type->fixed_ctl;
+ if (from_type->num_shared_regs)
+ to_type->num_shared_regs = from_type->num_shared_regs;
+ if (from_type->constraints)
+ to_type->constraints = from_type->constraints;
+ if (from_type->ops)
+ to_type->ops = from_type->ops;
+ if (from_type->event_descs)
+ to_type->event_descs = from_type->event_descs;
+ if (from_type->format_group)
+ to_type->format_group = from_type->format_group;
+ if (from_type->attr_update)
+ to_type->attr_update = from_type->attr_update;
+ if (from_type->set_mapping)
+ to_type->set_mapping = from_type->set_mapping;
+ if (from_type->get_topology)
+ to_type->get_topology = from_type->get_topology;
+ if (from_type->cleanup_mapping)
+ to_type->cleanup_mapping = from_type->cleanup_mapping;
+ if (from_type->mmio_map_size)
+ to_type->mmio_map_size = from_type->mmio_map_size;
+}
+
+struct intel_uncore_type **
+uncore_get_uncores(enum uncore_access_type type_id, int num_extra,
+ struct intel_uncore_type **extra, int max_num_types,
+ struct intel_uncore_type **uncores)
+{
+ struct intel_uncore_type **types, **start_types;
+ int i;
+
+ start_types = types = intel_uncore_generic_init_uncores(type_id, num_extra);
+
+ /* Only copy the customized features */
+ for (; *types; types++) {
+ if ((*types)->type_id >= max_num_types)
+ continue;
+ uncore_type_customized_copy(*types, uncores[(*types)->type_id]);
+ }
+
+ for (i = 0; i < num_extra; i++, types++)
+ *types = extra[i];
+
+ return start_types;
+}
+
+static struct intel_uncore_type *
+uncore_find_type_by_id(struct intel_uncore_type **types, int type_id)
+{
+ for (; *types; types++) {
+ if (type_id == (*types)->type_id)
+ return *types;
+ }
+
+ return NULL;
+}
+
+static int uncore_type_max_boxes(struct intel_uncore_type **types,
+ int type_id)
+{
+ struct intel_uncore_discovery_unit *unit;
+ struct intel_uncore_type *type;
+ struct rb_node *node;
+ int max = 0;
+
+ type = uncore_find_type_by_id(types, type_id);
+ if (!type)
+ return 0;
+
+ for (node = rb_first(type->boxes); node; node = rb_next(node)) {
+ unit = rb_entry(node, struct intel_uncore_discovery_unit, node);
+
+ if (unit->id > max)
+ max = unit->id;
+ }
+ return max + 1;
+}
+
+#define SPR_MSR_UNC_CBO_CONFIG 0x2FFE
+
+void spr_uncore_cpu_init(void)
+{
+ struct intel_uncore_type *type;
+ u64 num_cbo;
+
+ uncore_msr_uncores = uncore_get_uncores(UNCORE_ACCESS_MSR,
+ UNCORE_SPR_MSR_EXTRA_UNCORES,
+ spr_msr_uncores,
+ UNCORE_SPR_NUM_UNCORE_TYPES,
+ spr_uncores);
+
+ type = uncore_find_type_by_id(uncore_msr_uncores, UNCORE_SPR_CHA);
+ if (type) {
+ /*
+ * The value from the discovery table (stored in the type->num_boxes
+ * of UNCORE_SPR_CHA) is incorrect on some SPR variants because of a
+		 * firmware bug. Use the value from SPR_MSR_UNC_CBO_CONFIG to replace it.
+ */
+ rdmsrq(SPR_MSR_UNC_CBO_CONFIG, num_cbo);
+ /*
+ * The MSR doesn't work on the EMR XCC, but the firmware bug doesn't impact
+ * the EMR XCC. Don't let the value from the MSR replace the existing value.
+ */
+ if (num_cbo)
+ type->num_boxes = num_cbo;
+ }
+ spr_uncore_iio_free_running.num_boxes = uncore_type_max_boxes(uncore_msr_uncores, UNCORE_SPR_IIO);
+}
+
+#define SPR_UNCORE_UPI_PCIID 0x3241
+#define SPR_UNCORE_UPI0_DEVFN 0x9
+#define SPR_UNCORE_M3UPI_PCIID 0x3246
+#define SPR_UNCORE_M3UPI0_DEVFN 0x29
+
+static void spr_update_device_location(int type_id)
+{
+ struct intel_uncore_discovery_unit *unit;
+ struct intel_uncore_type *type;
+ struct pci_dev *dev = NULL;
+ struct rb_root *root;
+ u32 device, devfn;
+ int die;
+
+ if (type_id == UNCORE_SPR_UPI) {
+ type = &spr_uncore_upi;
+ device = SPR_UNCORE_UPI_PCIID;
+ devfn = SPR_UNCORE_UPI0_DEVFN;
+ } else if (type_id == UNCORE_SPR_M3UPI) {
+ type = &spr_uncore_m3upi;
+ device = SPR_UNCORE_M3UPI_PCIID;
+ devfn = SPR_UNCORE_M3UPI0_DEVFN;
+ } else
+ return;
+
+ root = kzalloc(sizeof(struct rb_root), GFP_KERNEL);
+ if (!root) {
+ type->num_boxes = 0;
+ return;
+ }
+ *root = RB_ROOT;
+
+ while ((dev = pci_get_device(PCI_VENDOR_ID_INTEL, device, dev)) != NULL) {
+
+ die = uncore_device_to_die(dev);
+ if (die < 0)
+ continue;
+
+ unit = kzalloc(sizeof(*unit), GFP_KERNEL);
+ if (!unit)
+ continue;
+ unit->die = die;
+ unit->id = PCI_SLOT(dev->devfn) - PCI_SLOT(devfn);
+ unit->addr = pci_domain_nr(dev->bus) << UNCORE_DISCOVERY_PCI_DOMAIN_OFFSET |
+ dev->bus->number << UNCORE_DISCOVERY_PCI_BUS_OFFSET |
+ devfn << UNCORE_DISCOVERY_PCI_DEVFN_OFFSET |
+ type->box_ctl;
+
+ unit->pmu_idx = unit->id;
+
+ uncore_find_add_unit(unit, root, NULL);
+ }
+
+ type->boxes = root;
+}
+
+int spr_uncore_pci_init(void)
+{
+ /*
+	 * The discovery table of UPI on some SPR variants is broken,
+	 * which impacts the detection of both the UPI and M3UPI uncore PMON.
+	 * Use the pre-defined UPI and M3UPI tables instead.
+	 *
+	 * The exact location, e.g. the domain and bus number,
+	 * can only be retrieved at load time.
+	 * Update the location of UPI and M3UPI here.
+ */
+ spr_update_device_location(UNCORE_SPR_UPI);
+ spr_update_device_location(UNCORE_SPR_M3UPI);
+ uncore_pci_uncores = uncore_get_uncores(UNCORE_ACCESS_PCI,
+ UNCORE_SPR_PCI_EXTRA_UNCORES,
+ spr_pci_uncores,
+ UNCORE_SPR_NUM_UNCORE_TYPES,
+ spr_uncores);
+ return 0;
+}
+
+void spr_uncore_mmio_init(void)
+{
+ int ret = snbep_pci2phy_map_init(0x3250, SKX_CPUNODEID, SKX_GIDNIDMAP, true);
+
+ if (ret) {
+ uncore_mmio_uncores = uncore_get_uncores(UNCORE_ACCESS_MMIO, 0, NULL,
+ UNCORE_SPR_NUM_UNCORE_TYPES,
+ spr_uncores);
+ } else {
+ uncore_mmio_uncores = uncore_get_uncores(UNCORE_ACCESS_MMIO,
+ UNCORE_SPR_MMIO_EXTRA_UNCORES,
+ spr_mmio_uncores,
+ UNCORE_SPR_NUM_UNCORE_TYPES,
+ spr_uncores);
+
+ spr_uncore_imc_free_running.num_boxes = uncore_type_max_boxes(uncore_mmio_uncores, UNCORE_SPR_IMC) / 2;
+ }
+}
+
+/* end of SPR uncore support */
+
+/* GNR uncore support */
+
+#define UNCORE_GNR_NUM_UNCORE_TYPES 23
+
+int gnr_uncore_units_ignore[] = {
+ UNCORE_IGNORE_END
+};
+
+static struct intel_uncore_type gnr_uncore_ubox = {
+ .name = "ubox",
+ .attr_update = uncore_alias_groups,
+};
+
+static struct intel_uncore_type gnr_uncore_pciex8 = {
+ SPR_UNCORE_PCI_COMMON_FORMAT(),
+ .name = "pciex8",
+};
+
+static struct intel_uncore_type gnr_uncore_pciex16 = {
+ SPR_UNCORE_PCI_COMMON_FORMAT(),
+ .name = "pciex16",
+};
+
+static struct intel_uncore_type gnr_uncore_upi = {
+ SPR_UNCORE_PCI_COMMON_FORMAT(),
+ .name = "upi",
+};
+
+static struct intel_uncore_type gnr_uncore_b2upi = {
+ SPR_UNCORE_PCI_COMMON_FORMAT(),
+ .name = "b2upi",
+};
+
+static struct intel_uncore_type gnr_uncore_b2hot = {
+ .name = "b2hot",
+ .attr_update = uncore_alias_groups,
+};
+
+static struct intel_uncore_type gnr_uncore_b2cmi = {
+ SPR_UNCORE_PCI_COMMON_FORMAT(),
+ .name = "b2cmi",
+};
+
+static struct intel_uncore_type gnr_uncore_b2cxl = {
+ SPR_UNCORE_MMIO_OFFS8_COMMON_FORMAT(),
+ .name = "b2cxl",
+};
+
+static struct intel_uncore_type gnr_uncore_mdf_sbo = {
+ .name = "mdf_sbo",
+ .attr_update = uncore_alias_groups,
+};
+
+static struct intel_uncore_type *gnr_uncores[UNCORE_GNR_NUM_UNCORE_TYPES] = {
+ &spr_uncore_chabox,
+ &spr_uncore_iio,
+ &spr_uncore_irp,
+ NULL,
+ &spr_uncore_pcu,
+ &gnr_uncore_ubox,
+ &spr_uncore_imc,
+ NULL,
+ &gnr_uncore_upi,
+ NULL,
+ NULL,
+ NULL,
+ &spr_uncore_cxlcm,
+ &spr_uncore_cxldp,
+ NULL,
+ &gnr_uncore_b2hot,
+ &gnr_uncore_b2cmi,
+ &gnr_uncore_b2cxl,
+ &gnr_uncore_b2upi,
+ NULL,
+ &gnr_uncore_mdf_sbo,
+ &gnr_uncore_pciex16,
+ &gnr_uncore_pciex8,
+};
+
+static struct freerunning_counters gnr_iio_freerunning[] = {
+ [SPR_IIO_MSR_IOCLK] = { 0x290e, 0x01, 0x10, 1, 48 },
+ [SPR_IIO_MSR_BW_IN] = { 0x360e, 0x10, 0x80, 8, 48 },
+ [SPR_IIO_MSR_BW_OUT] = { 0x2e0e, 0x10, 0x80, 8, 48 },
+};
+
+void gnr_uncore_cpu_init(void)
+{
+ uncore_msr_uncores = uncore_get_uncores(UNCORE_ACCESS_MSR,
+ UNCORE_SPR_MSR_EXTRA_UNCORES,
+ spr_msr_uncores,
+ UNCORE_GNR_NUM_UNCORE_TYPES,
+ gnr_uncores);
+ spr_uncore_iio_free_running.num_boxes = uncore_type_max_boxes(uncore_msr_uncores, UNCORE_SPR_IIO);
+ spr_uncore_iio_free_running.freerunning = gnr_iio_freerunning;
+}
+
+int gnr_uncore_pci_init(void)
+{
+ uncore_pci_uncores = uncore_get_uncores(UNCORE_ACCESS_PCI, 0, NULL,
+ UNCORE_GNR_NUM_UNCORE_TYPES,
+ gnr_uncores);
+ return 0;
+}
+
+void gnr_uncore_mmio_init(void)
+{
+ uncore_mmio_uncores = uncore_get_uncores(UNCORE_ACCESS_MMIO, 0, NULL,
+ UNCORE_GNR_NUM_UNCORE_TYPES,
+ gnr_uncores);
+}
+
+/* end of GNR uncore support */
diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c
new file mode 100644
index 000000000000..7f5007a4752a
--- /dev/null
+++ b/arch/x86/events/msr.c
@@ -0,0 +1,318 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/perf_event.h>
+#include <linux/sysfs.h>
+#include <linux/nospec.h>
+#include <asm/cpu_device_id.h>
+#include <asm/msr.h>
+
+#include "probe.h"
+
+enum perf_msr_id {
+ PERF_MSR_TSC = 0,
+ PERF_MSR_APERF = 1,
+ PERF_MSR_MPERF = 2,
+ PERF_MSR_PPERF = 3,
+ PERF_MSR_SMI = 4,
+ PERF_MSR_PTSC = 5,
+ PERF_MSR_IRPERF = 6,
+ PERF_MSR_THERM = 7,
+ PERF_MSR_EVENT_MAX,
+};
+
+static bool test_aperfmperf(int idx, void *data)
+{
+ return boot_cpu_has(X86_FEATURE_APERFMPERF);
+}
+
+static bool test_ptsc(int idx, void *data)
+{
+ return boot_cpu_has(X86_FEATURE_PTSC);
+}
+
+static bool test_irperf(int idx, void *data)
+{
+ return boot_cpu_has(X86_FEATURE_IRPERF);
+}
+
+static bool test_therm_status(int idx, void *data)
+{
+ return boot_cpu_has(X86_FEATURE_DTHERM);
+}
+
+static bool test_intel(int idx, void *data)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+ boot_cpu_data.x86 != 6)
+ return false;
+
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_NEHALEM:
+ case INTEL_NEHALEM_G:
+ case INTEL_NEHALEM_EP:
+ case INTEL_NEHALEM_EX:
+
+ case INTEL_WESTMERE:
+ case INTEL_WESTMERE_EP:
+ case INTEL_WESTMERE_EX:
+
+ case INTEL_SANDYBRIDGE:
+ case INTEL_SANDYBRIDGE_X:
+
+ case INTEL_IVYBRIDGE:
+ case INTEL_IVYBRIDGE_X:
+
+ case INTEL_HASWELL:
+ case INTEL_HASWELL_X:
+ case INTEL_HASWELL_L:
+ case INTEL_HASWELL_G:
+
+ case INTEL_BROADWELL:
+ case INTEL_BROADWELL_D:
+ case INTEL_BROADWELL_G:
+ case INTEL_BROADWELL_X:
+ case INTEL_SAPPHIRERAPIDS_X:
+ case INTEL_EMERALDRAPIDS_X:
+ case INTEL_GRANITERAPIDS_X:
+ case INTEL_GRANITERAPIDS_D:
+
+ case INTEL_ATOM_SILVERMONT:
+ case INTEL_ATOM_SILVERMONT_D:
+ case INTEL_ATOM_AIRMONT:
+
+ case INTEL_ATOM_GOLDMONT:
+ case INTEL_ATOM_GOLDMONT_D:
+ case INTEL_ATOM_GOLDMONT_PLUS:
+ case INTEL_ATOM_TREMONT_D:
+ case INTEL_ATOM_TREMONT:
+ case INTEL_ATOM_TREMONT_L:
+
+ case INTEL_XEON_PHI_KNL:
+ case INTEL_XEON_PHI_KNM:
+ if (idx == PERF_MSR_SMI)
+ return true;
+ break;
+
+ case INTEL_SKYLAKE_L:
+ case INTEL_SKYLAKE:
+ case INTEL_SKYLAKE_X:
+ case INTEL_KABYLAKE_L:
+ case INTEL_KABYLAKE:
+ case INTEL_COMETLAKE_L:
+ case INTEL_COMETLAKE:
+ case INTEL_ICELAKE_L:
+ case INTEL_ICELAKE:
+ case INTEL_ICELAKE_X:
+ case INTEL_ICELAKE_D:
+ case INTEL_TIGERLAKE_L:
+ case INTEL_TIGERLAKE:
+ case INTEL_ROCKETLAKE:
+ case INTEL_ALDERLAKE:
+ case INTEL_ALDERLAKE_L:
+ case INTEL_ATOM_GRACEMONT:
+ case INTEL_RAPTORLAKE:
+ case INTEL_RAPTORLAKE_P:
+ case INTEL_RAPTORLAKE_S:
+ case INTEL_METEORLAKE:
+ case INTEL_METEORLAKE_L:
+ if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF)
+ return true;
+ break;
+ }
+
+ return false;
+}
+
+PMU_EVENT_ATTR_STRING(tsc, attr_tsc, "event=0x00" );
+PMU_EVENT_ATTR_STRING(aperf, attr_aperf, "event=0x01" );
+PMU_EVENT_ATTR_STRING(mperf, attr_mperf, "event=0x02" );
+PMU_EVENT_ATTR_STRING(pperf, attr_pperf, "event=0x03" );
+PMU_EVENT_ATTR_STRING(smi, attr_smi, "event=0x04" );
+PMU_EVENT_ATTR_STRING(ptsc, attr_ptsc, "event=0x05" );
+PMU_EVENT_ATTR_STRING(irperf, attr_irperf, "event=0x06" );
+PMU_EVENT_ATTR_STRING(cpu_thermal_margin, attr_therm, "event=0x07" );
+PMU_EVENT_ATTR_STRING(cpu_thermal_margin.snapshot, attr_therm_snap, "1" );
+PMU_EVENT_ATTR_STRING(cpu_thermal_margin.unit, attr_therm_unit, "C" );
+
+static unsigned long msr_mask;
+
+PMU_EVENT_GROUP(events, aperf);
+PMU_EVENT_GROUP(events, mperf);
+PMU_EVENT_GROUP(events, pperf);
+PMU_EVENT_GROUP(events, smi);
+PMU_EVENT_GROUP(events, ptsc);
+PMU_EVENT_GROUP(events, irperf);
+
+static struct attribute *attrs_therm[] = {
+ &attr_therm.attr.attr,
+ &attr_therm_snap.attr.attr,
+ &attr_therm_unit.attr.attr,
+ NULL,
+};
+
+static struct attribute_group group_therm = {
+ .name = "events",
+ .attrs = attrs_therm,
+};
+
+static struct perf_msr msr[] = {
+ [PERF_MSR_TSC] = { .no_check = true, },
+ [PERF_MSR_APERF] = { MSR_IA32_APERF, &group_aperf, test_aperfmperf, },
+ [PERF_MSR_MPERF] = { MSR_IA32_MPERF, &group_mperf, test_aperfmperf, },
+ [PERF_MSR_PPERF] = { MSR_PPERF, &group_pperf, test_intel, },
+ [PERF_MSR_SMI] = { MSR_SMI_COUNT, &group_smi, test_intel, },
+ [PERF_MSR_PTSC] = { MSR_F15H_PTSC, &group_ptsc, test_ptsc, },
+ [PERF_MSR_IRPERF] = { MSR_F17H_IRPERF, &group_irperf, test_irperf, },
+ [PERF_MSR_THERM] = { MSR_IA32_THERM_STATUS, &group_therm, test_therm_status, },
+};
+
+static struct attribute *events_attrs[] = {
+ &attr_tsc.attr.attr,
+ NULL,
+};
+
+static struct attribute_group events_attr_group = {
+ .name = "events",
+ .attrs = events_attrs,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-63");
+static struct attribute *format_attrs[] = {
+ &format_attr_event.attr,
+ NULL,
+};
+static struct attribute_group format_attr_group = {
+ .name = "format",
+ .attrs = format_attrs,
+};
+
+static const struct attribute_group *attr_groups[] = {
+ &events_attr_group,
+ &format_attr_group,
+ NULL,
+};
+
+static const struct attribute_group *attr_update[] = {
+ &group_aperf,
+ &group_mperf,
+ &group_pperf,
+ &group_smi,
+ &group_ptsc,
+ &group_irperf,
+ &group_therm,
+ NULL,
+};
+
+static int msr_event_init(struct perf_event *event)
+{
+ u64 cfg = event->attr.config;
+
+ if (event->attr.type != event->pmu->type)
+ return -ENOENT;
+
+ /* unsupported modes and filters */
+ if (event->attr.sample_period) /* no sampling */
+ return -EINVAL;
+
+ if (cfg >= PERF_MSR_EVENT_MAX)
+ return -EINVAL;
+
+ cfg = array_index_nospec((unsigned long)cfg, PERF_MSR_EVENT_MAX);
+
+ if (!(msr_mask & (1 << cfg)))
+ return -EINVAL;
+
+ event->hw.idx = -1;
+ event->hw.event_base = msr[cfg].msr;
+ event->hw.config = cfg;
+
+ return 0;
+}
+
+static inline u64 msr_read_counter(struct perf_event *event)
+{
+ u64 now;
+
+ if (event->hw.event_base)
+ rdmsrq(event->hw.event_base, now);
+ else
+ now = rdtsc_ordered();
+
+ return now;
+}
+
+static void msr_event_update(struct perf_event *event)
+{
+ u64 prev, now;
+ s64 delta;
+
+ /* Careful, an NMI might modify the previous event value: */
+ prev = local64_read(&event->hw.prev_count);
+ do {
+ now = msr_read_counter(event);
+ } while (!local64_try_cmpxchg(&event->hw.prev_count, &prev, now));
+
+ delta = now - prev;
+ if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) {
+ delta = sign_extend64(delta, 31);
+ local64_add(delta, &event->count);
+ } else if (unlikely(event->hw.event_base == MSR_IA32_THERM_STATUS)) {
+ /* If valid, extract digital readout, otherwise set to -1: */
+ now = now & (1ULL << 31) ? (now >> 16) & 0x3f : -1;
+ local64_set(&event->count, now);
+ } else {
+ local64_add(delta, &event->count);
+ }
+}
+
+static void msr_event_start(struct perf_event *event, int flags)
+{
+ u64 now = msr_read_counter(event);
+
+ local64_set(&event->hw.prev_count, now);
+}
+
+static void msr_event_stop(struct perf_event *event, int flags)
+{
+ msr_event_update(event);
+}
+
+static void msr_event_del(struct perf_event *event, int flags)
+{
+ msr_event_stop(event, PERF_EF_UPDATE);
+}
+
+static int msr_event_add(struct perf_event *event, int flags)
+{
+ if (flags & PERF_EF_START)
+ msr_event_start(event, flags);
+
+ return 0;
+}
+
+static struct pmu pmu_msr = {
+ .task_ctx_nr = perf_sw_context,
+ .attr_groups = attr_groups,
+ .event_init = msr_event_init,
+ .add = msr_event_add,
+ .del = msr_event_del,
+ .start = msr_event_start,
+ .stop = msr_event_stop,
+ .read = msr_event_update,
+ .capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE,
+ .attr_update = attr_update,
+};
+
+static int __init msr_init(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_TSC)) {
+ pr_cont("no MSR PMU driver.\n");
+ return 0;
+ }
+
+ msr_mask = perf_msr_probe(msr, PERF_MSR_EVENT_MAX, true, NULL);
+
+ perf_pmu_register(&pmu_msr, "msr", -1);
+
+ return 0;
+}
+device_initcall(msr_init);
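For illustration only, not part of the patch: once registered, the events above appear under /sys/bus/event_source/devices/msr/ and can be counted from user space. A minimal user-space sketch in C, assuming the dynamic PMU type has already been read from the sysfs "type" file (the function and variable names are hypothetical):

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>

/* msr_pmu_type holds whatever /sys/bus/event_source/devices/msr/type reports. */
static int open_msr_event(int msr_pmu_type, unsigned long event_id)
{
	struct perf_event_attr attr = {
		.type   = msr_pmu_type,
		.size   = sizeof(attr),
		.config = event_id,	/* e.g. 0x01 for aperf, 0x02 for mperf */
	};

	/* Count for the calling thread on any CPU; read() on the returned fd
	 * yields the accumulated count. */
	return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
}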
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
new file mode 100644
index 000000000000..3161ec0a3416
--- /dev/null
+++ b/arch/x86/events/perf_event.h
@@ -0,0 +1,1871 @@
+/*
+ * Performance events x86 architecture header
+ *
+ * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
+ * Copyright (C) 2009 Jaswinder Singh Rajput
+ * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
+ * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra
+ * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
+ * Copyright (C) 2009 Google, Inc., Stephane Eranian
+ *
+ * For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_event.h>
+
+#include <asm/fpu/xstate.h>
+#include <asm/intel_ds.h>
+#include <asm/cpu.h>
+#include <asm/msr.h>
+
+/* To enable MSR tracing please use the generic trace points. */
+
+/*
+ *          |   NHM/WSM    |      SNB     |
+ * register -------------------------------
+ *          |  HT  | no HT |  HT  | no HT |
+ *-----------------------------------------
+ * offcore  | core | core  | cpu  | core  |
+ * lbr_sel  | core | core  | cpu  | core  |
+ * ld_lat   | cpu  | core  | cpu  | core  |
+ *-----------------------------------------
+ *
+ * Given that there is a small number of shared regs,
+ * we can pre-allocate their slot in the per-cpu
+ * per-core reg tables.
+ */
+enum extra_reg_type {
+ EXTRA_REG_NONE = -1, /* not used */
+
+ EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */
+ EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */
+ EXTRA_REG_LBR = 2, /* lbr_select */
+ EXTRA_REG_LDLAT = 3, /* ld_lat_threshold */
+ EXTRA_REG_FE = 4, /* fe_* */
+ EXTRA_REG_SNOOP_0 = 5, /* snoop response 0 */
+ EXTRA_REG_SNOOP_1 = 6, /* snoop response 1 */
+
+ EXTRA_REG_MAX /* number of entries needed */
+};
+
+struct event_constraint {
+ union {
+ unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ u64 idxmsk64;
+ };
+ u64 code;
+ u64 cmask;
+ int weight;
+ int overlap;
+ int flags;
+ unsigned int size;
+};
+
+static inline bool constraint_match(struct event_constraint *c, u64 ecode)
+{
+ return ((ecode & c->cmask) - c->code) <= (u64)c->size;
+}
+
+#define PERF_ARCH(name, val) \
+ PERF_X86_EVENT_##name = val,
+
+/*
+ * struct hw_perf_event.flags flags
+ */
+enum {
+#include "perf_event_flags.h"
+};
+
+#undef PERF_ARCH
+
+#define PERF_ARCH(name, val) \
+ static_assert((PERF_X86_EVENT_##name & PERF_EVENT_FLAG_ARCH) == \
+ PERF_X86_EVENT_##name);
+
+#include "perf_event_flags.h"
+
+#undef PERF_ARCH
+
+static inline bool is_topdown_count(struct perf_event *event)
+{
+ return event->hw.flags & PERF_X86_EVENT_TOPDOWN;
+}
+
+static inline bool is_metric_event(struct perf_event *event)
+{
+ u64 config = event->attr.config;
+
+ return ((config & ARCH_PERFMON_EVENTSEL_EVENT) == 0) &&
+ ((config & INTEL_ARCH_EVENT_MASK) >= INTEL_TD_METRIC_RETIRING) &&
+ ((config & INTEL_ARCH_EVENT_MASK) <= INTEL_TD_METRIC_MAX);
+}
+
+static inline bool is_slots_event(struct perf_event *event)
+{
+ return (event->attr.config & INTEL_ARCH_EVENT_MASK) == INTEL_TD_SLOTS;
+}
+
+static inline bool is_topdown_event(struct perf_event *event)
+{
+ return is_metric_event(event) || is_slots_event(event);
+}
+
+int is_x86_event(struct perf_event *event);
+
+static inline bool check_leader_group(struct perf_event *leader, int flags)
+{
+ return is_x86_event(leader) ? !!(leader->hw.flags & flags) : false;
+}
+
+static inline bool is_branch_counters_group(struct perf_event *event)
+{
+ return check_leader_group(event->group_leader, PERF_X86_EVENT_BRANCH_COUNTERS);
+}
+
+static inline bool is_pebs_counter_event_group(struct perf_event *event)
+{
+ return check_leader_group(event->group_leader, PERF_X86_EVENT_PEBS_CNTR);
+}
+
+static inline bool is_acr_event_group(struct perf_event *event)
+{
+ return check_leader_group(event->group_leader, PERF_X86_EVENT_ACR);
+}
+
+struct amd_nb {
+ int nb_id; /* NorthBridge id */
+ int refcnt; /* reference count */
+ struct perf_event *owners[X86_PMC_IDX_MAX];
+ struct event_constraint event_constraints[X86_PMC_IDX_MAX];
+};
+
+#define PEBS_COUNTER_MASK ((1ULL << MAX_PEBS_EVENTS) - 1)
+#define PEBS_PMI_AFTER_EACH_RECORD BIT_ULL(60)
+#define PEBS_OUTPUT_OFFSET 61
+#define PEBS_OUTPUT_MASK (3ull << PEBS_OUTPUT_OFFSET)
+#define PEBS_OUTPUT_PT (1ull << PEBS_OUTPUT_OFFSET)
+#define PEBS_VIA_PT_MASK (PEBS_OUTPUT_PT | PEBS_PMI_AFTER_EACH_RECORD)
+
+/*
+ * Flags PEBS can handle without a PMI.
+ *
+ * TID can only be handled by flushing at context switch.
+ * REGS_USER can be handled for events limited to ring 3.
+ *
+ */
+#define LARGE_PEBS_FLAGS \
+ (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \
+ PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \
+ PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
+ PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \
+ PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER | \
+ PERF_SAMPLE_PERIOD | PERF_SAMPLE_CODE_PAGE_SIZE | \
+ PERF_SAMPLE_WEIGHT_TYPE)
+
+#define PEBS_GP_REGS \
+ ((1ULL << PERF_REG_X86_AX) | \
+ (1ULL << PERF_REG_X86_BX) | \
+ (1ULL << PERF_REG_X86_CX) | \
+ (1ULL << PERF_REG_X86_DX) | \
+ (1ULL << PERF_REG_X86_DI) | \
+ (1ULL << PERF_REG_X86_SI) | \
+ (1ULL << PERF_REG_X86_SP) | \
+ (1ULL << PERF_REG_X86_BP) | \
+ (1ULL << PERF_REG_X86_IP) | \
+ (1ULL << PERF_REG_X86_FLAGS) | \
+ (1ULL << PERF_REG_X86_R8) | \
+ (1ULL << PERF_REG_X86_R9) | \
+ (1ULL << PERF_REG_X86_R10) | \
+ (1ULL << PERF_REG_X86_R11) | \
+ (1ULL << PERF_REG_X86_R12) | \
+ (1ULL << PERF_REG_X86_R13) | \
+ (1ULL << PERF_REG_X86_R14) | \
+ (1ULL << PERF_REG_X86_R15))
+
+/*
+ * Per register state.
+ */
+struct er_account {
+ raw_spinlock_t lock; /* per-core: protect structure */
+ u64 config; /* extra MSR config */
+ u64 reg; /* extra MSR number */
+ atomic_t ref; /* reference count */
+};
+
+/*
+ * Per core/cpu state
+ *
+ * Used to coordinate shared registers between HT threads or
+ * among events on a single PMU.
+ */
+struct intel_shared_regs {
+ struct er_account regs[EXTRA_REG_MAX];
+ int refcnt; /* per-core: #HT threads */
+ unsigned core_id; /* per-core: core id */
+};
+
+enum intel_excl_state_type {
+ INTEL_EXCL_UNUSED = 0, /* counter is unused */
+ INTEL_EXCL_SHARED = 1, /* counter can be used by both threads */
+ INTEL_EXCL_EXCLUSIVE = 2, /* counter can be used by one thread only */
+};
+
+struct intel_excl_states {
+ enum intel_excl_state_type state[X86_PMC_IDX_MAX];
+ bool sched_started; /* true if scheduling has started */
+};
+
+struct intel_excl_cntrs {
+ raw_spinlock_t lock;
+
+ struct intel_excl_states states[2];
+
+ union {
+ u16 has_exclusive[2];
+ u32 exclusive_present;
+ };
+
+ int refcnt; /* per-core: #HT threads */
+ unsigned core_id; /* per-core: core id */
+};
+
+struct x86_perf_task_context;
+#define MAX_LBR_ENTRIES 32
+
+enum {
+ LBR_FORMAT_32 = 0x00,
+ LBR_FORMAT_LIP = 0x01,
+ LBR_FORMAT_EIP = 0x02,
+ LBR_FORMAT_EIP_FLAGS = 0x03,
+ LBR_FORMAT_EIP_FLAGS2 = 0x04,
+ LBR_FORMAT_INFO = 0x05,
+ LBR_FORMAT_TIME = 0x06,
+ LBR_FORMAT_INFO2 = 0x07,
+ LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_INFO2,
+};
+
+enum {
+ X86_PERF_KFREE_SHARED = 0,
+ X86_PERF_KFREE_EXCL = 1,
+ X86_PERF_KFREE_MAX
+};
+
+struct cpu_hw_events {
+ /*
+ * Generic x86 PMC bits
+ */
+ struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
+ unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ unsigned long dirty[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ int enabled;
+
+ int n_events; /* the # of events in the below arrays */
+	int			n_added; /* the # of events most recently added to the below arrays;
+					    they've never been enabled yet */
+	int			n_txn; /* the # of events added to the below arrays
+					  in the current transaction */
+ int n_txn_pair;
+ int n_txn_metric;
+ int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
+ u64 tags[X86_PMC_IDX_MAX];
+
+ struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
+ struct event_constraint *event_constraint[X86_PMC_IDX_MAX];
+
+ int n_excl; /* the number of exclusive events */
+	int			n_late_setup; /* the # of events that need late setup */
+
+ unsigned int txn_flags;
+ int is_fake;
+
+ /*
+ * Intel DebugStore bits
+ */
+ struct debug_store *ds;
+ void *ds_bts_vaddr;
+ /* DS based PEBS or arch-PEBS buffer address */
+ void *pebs_vaddr;
+ u64 pebs_enabled;
+ int n_pebs;
+ int n_large_pebs;
+ int n_pebs_via_pt;
+ int pebs_output;
+
+	/* Current superset of the events' hardware configuration */
+ u64 pebs_data_cfg;
+ u64 active_pebs_data_cfg;
+ int pebs_record_size;
+
+ /* Intel Fixed counter configuration */
+ u64 fixed_ctrl_val;
+ u64 active_fixed_ctrl_val;
+
+ /* Intel ACR configuration */
+ u64 acr_cfg_b[X86_PMC_IDX_MAX];
+ u64 acr_cfg_c[X86_PMC_IDX_MAX];
+ /* Cached CFG_C values */
+ u64 cfg_c_val[X86_PMC_IDX_MAX];
+
+ /*
+ * Intel LBR bits
+ */
+ int lbr_users;
+ int lbr_pebs_users;
+ struct perf_branch_stack lbr_stack;
+ struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
+ u64 lbr_counters[MAX_LBR_ENTRIES]; /* branch stack extra */
+ union {
+ struct er_account *lbr_sel;
+ struct er_account *lbr_ctl;
+ };
+ u64 br_sel;
+ void *last_task_ctx;
+ int last_log_id;
+ int lbr_select;
+ void *lbr_xsave;
+
+ /*
+ * Intel host/guest exclude bits
+ */
+ u64 intel_ctrl_guest_mask;
+ u64 intel_ctrl_host_mask;
+ struct perf_guest_switch_msr guest_switch_msrs[X86_PMC_IDX_MAX];
+
+ /*
+ * Intel checkpoint mask
+ */
+ u64 intel_cp_status;
+
+ /*
+ * manage shared (per-core, per-cpu) registers
+ * used on Intel NHM/WSM/SNB
+ */
+ struct intel_shared_regs *shared_regs;
+ /*
+	 * manage exclusive counter access between hyperthreads
+ */
+ struct event_constraint *constraint_list; /* in enable order */
+ struct intel_excl_cntrs *excl_cntrs;
+ int excl_thread_id; /* 0 or 1 */
+
+ /*
+ * SKL TSX_FORCE_ABORT shadow
+ */
+ u64 tfa_shadow;
+
+ /*
+ * Perf Metrics
+ */
+ /* number of accepted metrics events */
+ int n_metric;
+
+ /*
+ * AMD specific bits
+ */
+ struct amd_nb *amd_nb;
+ int brs_active; /* BRS is enabled */
+
+ /* Inverted mask of bits to clear in the perf_ctr ctrl registers */
+ u64 perf_ctr_virt_mask;
+ int n_pair; /* Large increment events */
+
+ void *kfree_on_online[X86_PERF_KFREE_MAX];
+
+ struct pmu *pmu;
+};
+
+#define __EVENT_CONSTRAINT_RANGE(c, e, n, m, w, o, f) { \
+ { .idxmsk64 = (n) }, \
+ .code = (c), \
+ .size = (e) - (c), \
+ .cmask = (m), \
+ .weight = (w), \
+ .overlap = (o), \
+ .flags = f, \
+}
+
+#define __EVENT_CONSTRAINT(c, n, m, w, o, f) \
+ __EVENT_CONSTRAINT_RANGE(c, c, n, m, w, o, f)
+
+#define EVENT_CONSTRAINT(c, n, m) \
+ __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0)
+
+/*
+ * The constraint_match() function only works for 'simple' event codes
+ * and not for extended (AMD64_EVENTSEL_EVENT) event codes.
+ */
+#define EVENT_CONSTRAINT_RANGE(c, e, n, m) \
+ __EVENT_CONSTRAINT_RANGE(c, e, n, m, HWEIGHT(n), 0, 0)
+
+#define INTEL_EXCLEVT_CONSTRAINT(c, n) \
+ __EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT, HWEIGHT(n),\
+ 0, PERF_X86_EVENT_EXCL)
+
+/*
+ * The overlap flag marks event constraints with overlapping counter
+ * masks. This is the case if the counter mask of such an event is not
+ * a subset of any other counter mask of a constraint with an equal or
+ * higher weight, e.g.:
+ *
+ * c_overlaps = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0);
+ * c_another1 = EVENT_CONSTRAINT(0, 0x07, 0);
+ * c_another2 = EVENT_CONSTRAINT(0, 0x38, 0);
+ *
+ * The event scheduler may not select the correct counter in the first
+ * cycle because it needs to know which subsequent events will be
+ * scheduled. It may fail to schedule the events then. So we set the
+ * overlap flag for such constraints to give the scheduler a hint which
+ * events to select for counter rescheduling.
+ *
+ * Care must be taken as the rescheduling algorithm is O(n!) which
+ * will increase scheduling cycles for an over-committed system
+ * dramatically. The number of such EVENT_CONSTRAINT_OVERLAP() macros
+ * and its counter masks must be kept at a minimum.
+ */
+#define EVENT_CONSTRAINT_OVERLAP(c, n, m) \
+ __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 1, 0)
+
+/*
+ * Constraint on the Event code.
+ */
+#define INTEL_EVENT_CONSTRAINT(c, n) \
+ EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)
+
+/*
+ * Constraint on a range of Event codes
+ */
+#define INTEL_EVENT_CONSTRAINT_RANGE(c, e, n) \
+ EVENT_CONSTRAINT_RANGE(c, e, n, ARCH_PERFMON_EVENTSEL_EVENT)
+
+/*
+ * Constraint on the Event code + UMask + fixed-mask
+ *
+ * Filter mask to validate fixed counter events.
+ * The following filters disqualify an event for fixed counters:
+ * - inv
+ * - edge
+ * - cnt-mask
+ * - in_tx
+ * - in_tx_checkpointed
+ * The other filters are supported by fixed counters.
+ * The any-thread option is supported starting with v3.
+ */
+#define FIXED_EVENT_FLAGS (X86_RAW_EVENT_MASK|HSW_IN_TX|HSW_IN_TX_CHECKPOINTED)
+#define FIXED_EVENT_CONSTRAINT(c, n) \
+ EVENT_CONSTRAINT(c, (1ULL << (32+n)), FIXED_EVENT_FLAGS)
+
+/*
+ * The special metric counters do not actually exist. They are calculated from
+ * the combination of the FxCtr3 + MSR_PERF_METRICS.
+ *
+ * The special metric counters are mapped to a dummy offset for the scheduler.
+ * The sharing between multiple users of the same metric without multiplexing
+ * is not allowed, even though the hardware supports that in principle.
+ */
+
+#define METRIC_EVENT_CONSTRAINT(c, n) \
+ EVENT_CONSTRAINT(c, (1ULL << (INTEL_PMC_IDX_METRIC_BASE + n)), \
+ INTEL_ARCH_EVENT_MASK)
+
+/*
+ * Constraint on the Event code + UMask
+ */
+#define INTEL_UEVENT_CONSTRAINT(c, n) \
+ EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
+
+/* Constraint on specific umask bit only + event */
+#define INTEL_UBIT_EVENT_CONSTRAINT(c, n) \
+ EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT|(c))
+
+/* Like UEVENT_CONSTRAINT, but match flags too */
+#define INTEL_FLAGS_UEVENT_CONSTRAINT(c, n) \
+ EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS)
+
+#define INTEL_EXCLUEVT_CONSTRAINT(c, n) \
+ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_EXCL)
+
+#define INTEL_PLD_CONSTRAINT(c, n) \
+ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT)
+
+#define INTEL_PSD_CONSTRAINT(c, n) \
+ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_STLAT)
+
+#define INTEL_PST_CONSTRAINT(c, n) \
+ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST)
+
+#define INTEL_HYBRID_LAT_CONSTRAINT(c, n) \
+ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LAT_HYBRID)
+
+#define INTEL_HYBRID_LDLAT_CONSTRAINT(c, n) \
+ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LAT_HYBRID|PERF_X86_EVENT_PEBS_LD_HSW)
+
+#define INTEL_HYBRID_STLAT_CONSTRAINT(c, n) \
+ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LAT_HYBRID|PERF_X86_EVENT_PEBS_ST_HSW)
+
+/* Event constraint, but match on all event flags too. */
+#define INTEL_FLAGS_EVENT_CONSTRAINT(c, n) \
+ EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS)
+
+#define INTEL_FLAGS_EVENT_CONSTRAINT_RANGE(c, e, n) \
+ EVENT_CONSTRAINT_RANGE(c, e, n, ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS)
+
+/* Check only flags, but allow all event/umask */
+#define INTEL_ALL_EVENT_CONSTRAINT(code, n) \
+ EVENT_CONSTRAINT(code, n, X86_ALL_EVENT_FLAGS)
+
+/* Check flags and event code, and set the HSW store flag */
+#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_ST(code, n) \
+ __EVENT_CONSTRAINT(code, n, \
+ ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW)
+
+/* Check flags and event code, and set the HSW load flag */
+#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(code, n) \
+ __EVENT_CONSTRAINT(code, n, \
+ ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
+
+#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD_RANGE(code, end, n) \
+ __EVENT_CONSTRAINT_RANGE(code, end, n, \
+ ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
+
+#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(code, n) \
+ __EVENT_CONSTRAINT(code, n, \
+ ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, \
+ PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL)
+
+/* Check flags and event code/umask, and set the HSW store flag */
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(code, n) \
+ __EVENT_CONSTRAINT(code, n, \
+ INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW)
+
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(code, n) \
+ __EVENT_CONSTRAINT(code, n, \
+ INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, \
+ PERF_X86_EVENT_PEBS_ST_HSW|PERF_X86_EVENT_EXCL)
+
+/* Check flags and event code/umask, and set the HSW load flag */
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(code, n) \
+ __EVENT_CONSTRAINT(code, n, \
+ INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
+
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(code, n) \
+ __EVENT_CONSTRAINT(code, n, \
+ INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, \
+ PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL)
+
+/* Check flags and event code/umask, and set the HSW N/A flag */
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \
+ __EVENT_CONSTRAINT(code, n, \
+ INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_NA_HSW)
+
+
+/*
+ * We define the end marker as having a weight of -1
+ * to enable blacklisting of events using a counter bitmask
+ * of zero and thus a weight of zero.
+ * The end marker has a weight that cannot possibly be
+ * obtained from counting the bits in the bitmask.
+ */
+#define EVENT_CONSTRAINT_END { .weight = -1 }
+
+/*
+ * Check for end marker with weight == -1
+ */
+#define for_each_event_constraint(e, c) \
+ for ((e) = (c); (e)->weight != -1; (e)++)
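A hedged sketch of how the constraint macros above typically combine in a per-model table (the identifiers below are hypothetical and would live in one of the .c files that include this header; the encodings mirror the architectural fixed-counter events used by the model tables elsewhere in the tree):

static struct event_constraint example_event_constraints[] = {
	FIXED_EVENT_CONSTRAINT(0x00c0, 0),	/* INST_RETIRED.ANY on fixed counter 0 */
	FIXED_EVENT_CONSTRAINT(0x003c, 1),	/* CPU_CLK_UNHALTED.CORE on fixed counter 1 */
	INTEL_EVENT_CONSTRAINT(0xcd, 0x8),	/* event 0xcd limited to general counter 3 */
	EVENT_CONSTRAINT_END
};

static void example_walk_constraints(void)
{
	struct event_constraint *c;

	for_each_event_constraint(c, example_event_constraints) {
		/* c->idxmsk64 is the bitmask of counters the event may be scheduled on */
	}
}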
+
+/*
+ * Extra registers for specific events.
+ *
+ * Some events need large masks and require external MSRs.
+ * Those extra MSRs end up being shared for all events on
+ * a PMU and sometimes between PMU of sibling HT threads.
+ * In either case, the kernel needs to handle conflicting
+ * accesses to those extra, shared, regs. The data structure
+ * to manage those registers is stored in cpu_hw_event.
+ */
+struct extra_reg {
+ unsigned int event;
+ unsigned int msr;
+ u64 config_mask;
+ u64 valid_mask;
+ int idx; /* per_xxx->regs[] reg index */
+ bool extra_msr_access;
+};
+
+#define EVENT_EXTRA_REG(e, ms, m, vm, i) { \
+ .event = (e), \
+ .msr = (ms), \
+ .config_mask = (m), \
+ .valid_mask = (vm), \
+ .idx = EXTRA_REG_##i, \
+ .extra_msr_access = true, \
+ }
+
+#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx) \
+ EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx)
+
+#define INTEL_UEVENT_EXTRA_REG(event, msr, vm, idx) \
+ EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT | \
+ ARCH_PERFMON_EVENTSEL_UMASK, vm, idx)
+
+#define INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(c) \
+ INTEL_UEVENT_EXTRA_REG(c, \
+ MSR_PEBS_LD_LAT_THRESHOLD, \
+ 0xffff, \
+ LDLAT)
+
+#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0)
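A sketch, for illustration only, of a per-model extra-register table built from these helpers (the table name is hypothetical and the valid masks are illustrative; the offcore-response and load-latency encodings follow the pattern of the Sandy Bridge-era tables):

static struct extra_reg example_extra_regs[] __read_mostly = {
	/* OFFCORE_RESPONSE_0/1: the response mask lives in a dedicated extra MSR */
	INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0),
	INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1),
	/* MEM_TRANS_RETIRED.LOAD_LATENCY: programs MSR_PEBS_LD_LAT_THRESHOLD */
	INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
	EVENT_EXTRA_END
};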
+
+union perf_capabilities {
+ struct {
+ u64 lbr_format:6;
+ u64 pebs_trap:1;
+ u64 pebs_arch_reg:1;
+ u64 pebs_format:4;
+ u64 smm_freeze:1;
+ /*
+ * PMU supports separate counter range for writing
+ * values > 32bit.
+ */
+ u64 full_width_write:1;
+ u64 pebs_baseline:1;
+ u64 perf_metrics:1;
+ u64 pebs_output_pt_available:1;
+ u64 pebs_timing_info:1;
+ u64 anythread_deprecated:1;
+ u64 rdpmc_metrics_clear:1;
+ };
+ u64 capabilities;
+};
+
+struct x86_pmu_quirk {
+ struct x86_pmu_quirk *next;
+ void (*func)(void);
+};
+
+union x86_pmu_config {
+ struct {
+ u64 event:8,
+ umask:8,
+ usr:1,
+ os:1,
+ edge:1,
+ pc:1,
+ interrupt:1,
+ __reserved1:1,
+ en:1,
+ inv:1,
+ cmask:8,
+ event2:4,
+ __reserved2:4,
+ go:1,
+ ho:1;
+ } bits;
+ u64 value;
+};
+
+#define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value
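X86_CONFIG() assembles a raw EVENTSEL value from named bitfields. A hedged example (the helper name is hypothetical; the inverted-cmask encoding shown is the style used for precise-distribution workarounds in the Intel code):

static inline u64 example_prec_dist_config(void)
{
	/* Event 0xc0, umask 0x01, cmask 16 with invert: count cycles in which
	 * fewer than 16 such operations retired. */
	return X86_CONFIG(.event = 0xc0, .umask = 0x01, .inv = 1, .cmask = 16);
}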
+
+enum {
+ x86_lbr_exclusive_lbr,
+ x86_lbr_exclusive_bts,
+ x86_lbr_exclusive_pt,
+ x86_lbr_exclusive_max,
+};
+
+#define PERF_PEBS_DATA_SOURCE_MAX 0x100
+#define PERF_PEBS_DATA_SOURCE_MASK (PERF_PEBS_DATA_SOURCE_MAX - 1)
+#define PERF_PEBS_DATA_SOURCE_GRT_MAX 0x10
+#define PERF_PEBS_DATA_SOURCE_GRT_MASK (PERF_PEBS_DATA_SOURCE_GRT_MAX - 1)
+
+#define X86_HYBRID_PMU_ATOM_IDX 0
+#define X86_HYBRID_PMU_CORE_IDX 1
+#define X86_HYBRID_PMU_TINY_IDX 2
+
+enum hybrid_pmu_type {
+ not_hybrid,
+ hybrid_small = BIT(X86_HYBRID_PMU_ATOM_IDX),
+ hybrid_big = BIT(X86_HYBRID_PMU_CORE_IDX),
+ hybrid_tiny = BIT(X86_HYBRID_PMU_TINY_IDX),
+
+	/* The below values are only used for matching */
+ hybrid_big_small = hybrid_big | hybrid_small,
+ hybrid_small_tiny = hybrid_small | hybrid_tiny,
+ hybrid_big_small_tiny = hybrid_big | hybrid_small_tiny,
+};
+
+struct arch_pebs_cap {
+ u64 caps;
+ u64 counters;
+ u64 pdists;
+};
+
+struct x86_hybrid_pmu {
+ struct pmu pmu;
+ const char *name;
+ enum hybrid_pmu_type pmu_type;
+ cpumask_t supported_cpus;
+ union perf_capabilities intel_cap;
+ u64 intel_ctrl;
+ u64 pebs_events_mask;
+ u64 config_mask;
+ union {
+ u64 cntr_mask64;
+ unsigned long cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ };
+ union {
+ u64 fixed_cntr_mask64;
+ unsigned long fixed_cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ };
+
+ union {
+ u64 acr_cntr_mask64;
+ unsigned long acr_cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ };
+ union {
+ u64 acr_cause_mask64;
+ unsigned long acr_cause_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ };
+ struct event_constraint unconstrained;
+
+ u64 hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX];
+ u64 hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX];
+ struct event_constraint *event_constraints;
+ struct event_constraint *pebs_constraints;
+ struct extra_reg *extra_regs;
+
+ unsigned int late_ack :1,
+ mid_ack :1,
+ enabled_ack :1;
+
+ struct arch_pebs_cap arch_pebs_cap;
+
+ u64 pebs_data_source[PERF_PEBS_DATA_SOURCE_MAX];
+};
+
+static __always_inline struct x86_hybrid_pmu *hybrid_pmu(struct pmu *pmu)
+{
+ return container_of(pmu, struct x86_hybrid_pmu, pmu);
+}
+
+extern struct static_key_false perf_is_hybrid;
+#define is_hybrid() static_branch_unlikely(&perf_is_hybrid)
+
+#define hybrid(_pmu, _field) \
+(*({ \
+ typeof(&x86_pmu._field) __Fp = &x86_pmu._field; \
+ \
+ if (is_hybrid() && (_pmu)) \
+ __Fp = &hybrid_pmu(_pmu)->_field; \
+ \
+ __Fp; \
+}))
+
+#define hybrid_var(_pmu, _var) \
+(*({ \
+ typeof(&_var) __Fp = &_var; \
+ \
+ if (is_hybrid() && (_pmu)) \
+ __Fp = &hybrid_pmu(_pmu)->_var; \
+ \
+ __Fp; \
+}))
+
+#define hybrid_bit(_pmu, _field) \
+({ \
+ bool __Fp = x86_pmu._field; \
+ \
+ if (is_hybrid() && (_pmu)) \
+ __Fp = hybrid_pmu(_pmu)->_field; \
+ \
+ __Fp; \
+})
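A brief usage sketch (the helper is hypothetical and belongs in a .c file that includes this header): the accessors redirect a field lookup to the event's hybrid PMU when one exists and fall back to the global x86_pmu otherwise:

static inline u64 example_intel_ctrl(struct perf_event *event)
{
	/* hybrid_pmu(event->pmu)->intel_ctrl on hybrid parts,
	 * x86_pmu.intel_ctrl everywhere else. */
	return hybrid(event->pmu, intel_ctrl);
}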
+
+/*
+ * struct x86_pmu - generic x86 pmu
+ */
+struct x86_pmu {
+ /*
+ * Generic x86 PMC bits
+ */
+ const char *name;
+ int version;
+ int (*handle_irq)(struct pt_regs *);
+ void (*disable_all)(void);
+ void (*enable_all)(int added);
+ void (*enable)(struct perf_event *);
+ void (*disable)(struct perf_event *);
+ void (*assign)(struct perf_event *event, int idx);
+ void (*add)(struct perf_event *);
+ void (*del)(struct perf_event *);
+ void (*read)(struct perf_event *event);
+ int (*set_period)(struct perf_event *event);
+ u64 (*update)(struct perf_event *event);
+ int (*hw_config)(struct perf_event *event);
+ int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
+ void (*late_setup)(void);
+ void (*pebs_enable)(struct perf_event *event);
+ void (*pebs_disable)(struct perf_event *event);
+ void (*pebs_enable_all)(void);
+ void (*pebs_disable_all)(void);
+ unsigned eventsel;
+ unsigned perfctr;
+ unsigned fixedctr;
+ int (*addr_offset)(int index, bool eventsel);
+ int (*rdpmc_index)(int index);
+ u64 (*event_map)(int);
+ int max_events;
+ u64 config_mask;
+ union {
+ u64 cntr_mask64;
+ unsigned long cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ };
+ union {
+ u64 fixed_cntr_mask64;
+ unsigned long fixed_cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ };
+ union {
+ u64 acr_cntr_mask64;
+ unsigned long acr_cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ };
+ union {
+ u64 acr_cause_mask64;
+ unsigned long acr_cause_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ };
+ int cntval_bits;
+ u64 cntval_mask;
+ union {
+ unsigned long events_maskl;
+ unsigned long events_mask[BITS_TO_LONGS(ARCH_PERFMON_EVENTS_COUNT)];
+ };
+ int events_mask_len;
+ int apic;
+ u64 max_period;
+ struct event_constraint *
+ (*get_event_constraints)(struct cpu_hw_events *cpuc,
+ int idx,
+ struct perf_event *event);
+
+ void (*put_event_constraints)(struct cpu_hw_events *cpuc,
+ struct perf_event *event);
+
+ void (*start_scheduling)(struct cpu_hw_events *cpuc);
+
+ void (*commit_scheduling)(struct cpu_hw_events *cpuc, int idx, int cntr);
+
+ void (*stop_scheduling)(struct cpu_hw_events *cpuc);
+
+ struct event_constraint *event_constraints;
+ struct x86_pmu_quirk *quirks;
+ void (*limit_period)(struct perf_event *event, s64 *l);
+
+ /* PMI handler bits */
+ unsigned int late_ack :1,
+ mid_ack :1,
+ enabled_ack :1;
+ /*
+ * sysfs attrs
+ */
+ int attr_rdpmc_broken;
+ int attr_rdpmc;
+ struct attribute **format_attrs;
+
+ ssize_t (*events_sysfs_show)(char *page, u64 config);
+ const struct attribute_group **attr_update;
+
+ unsigned long attr_freeze_on_smi;
+
+ /*
+ * CPU Hotplug hooks
+ */
+ int (*cpu_prepare)(int cpu);
+ void (*cpu_starting)(int cpu);
+ void (*cpu_dying)(int cpu);
+ void (*cpu_dead)(int cpu);
+
+ void (*check_microcode)(void);
+ void (*sched_task)(struct perf_event_pmu_context *pmu_ctx,
+ struct task_struct *task, bool sched_in);
+
+ /*
+ * Intel Arch Perfmon v2+
+ */
+ u64 intel_ctrl;
+ union perf_capabilities intel_cap;
+
+ /*
+ * Intel DebugStore and PEBS bits
+ */
+ unsigned int bts :1,
+ bts_active :1,
+ ds_pebs :1,
+ pebs_active :1,
+ pebs_broken :1,
+ pebs_prec_dist :1,
+ pebs_no_tlb :1,
+ pebs_no_isolation :1,
+ pebs_block :1,
+ pebs_ept :1,
+ arch_pebs :1;
+ int pebs_record_size;
+ int pebs_buffer_size;
+ u64 pebs_events_mask;
+ void (*drain_pebs)(struct pt_regs *regs, struct perf_sample_data *data);
+ struct event_constraint *pebs_constraints;
+ void (*pebs_aliases)(struct perf_event *event);
+ u64 (*pebs_latency_data)(struct perf_event *event, u64 status);
+ unsigned long large_pebs_flags;
+ u64 rtm_abort_event;
+ u64 pebs_capable;
+
+ /*
+ * Intel Architectural PEBS
+ */
+ struct arch_pebs_cap arch_pebs_cap;
+
+ /*
+ * Intel LBR
+ */
+ unsigned int lbr_tos, lbr_from, lbr_to,
+ lbr_info, lbr_nr; /* LBR base regs and size */
+ union {
+ u64 lbr_sel_mask; /* LBR_SELECT valid bits */
+ u64 lbr_ctl_mask; /* LBR_CTL valid bits */
+ };
+ union {
+ const int *lbr_sel_map; /* lbr_select mappings */
+ int *lbr_ctl_map; /* LBR_CTL mappings */
+ };
+ u64 lbr_callstack_users; /* lbr callstack system wide users */
+ bool lbr_double_abort; /* duplicated lbr aborts */
+ bool lbr_pt_coexist; /* (LBR|BTS) may coexist with PT */
+
+ unsigned int lbr_has_info:1;
+ unsigned int lbr_has_tsx:1;
+ unsigned int lbr_from_flags:1;
+ unsigned int lbr_to_cycles:1;
+
+ /*
+ * Intel Architectural LBR CPUID Enumeration
+ */
+ unsigned int lbr_depth_mask:8;
+ unsigned int lbr_deep_c_reset:1;
+ unsigned int lbr_lip:1;
+ unsigned int lbr_cpl:1;
+ unsigned int lbr_filter:1;
+ unsigned int lbr_call_stack:1;
+ unsigned int lbr_mispred:1;
+ unsigned int lbr_timed_lbr:1;
+ unsigned int lbr_br_type:1;
+ unsigned int lbr_counters:4;
+
+ void (*lbr_reset)(void);
+ void (*lbr_read)(struct cpu_hw_events *cpuc);
+ void (*lbr_save)(void *ctx);
+ void (*lbr_restore)(void *ctx);
+
+ /*
+ * Intel PT/LBR/BTS are exclusive
+ */
+ atomic_t lbr_exclusive[x86_lbr_exclusive_max];
+
+ /*
+ * Intel perf metrics
+ */
+ int num_topdown_events;
+
+ /*
+ * AMD bits
+ */
+ unsigned int amd_nb_constraints : 1;
+ u64 perf_ctr_pair_en;
+
+ /*
+ * Extra registers for events
+ */
+ struct extra_reg *extra_regs;
+ unsigned int flags;
+
+ /*
+ * Intel host/guest support (KVM)
+ */
+ struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr, void *data);
+
+ /*
+ * Check period value for PERF_EVENT_IOC_PERIOD ioctl.
+ */
+ int (*check_period) (struct perf_event *event, u64 period);
+
+ int (*aux_output_match) (struct perf_event *event);
+
+ void (*filter)(struct pmu *pmu, int cpu, bool *ret);
+ /*
+ * Hybrid support
+ *
+ * Most PMU capabilities are the same among different hybrid PMUs.
+ * The global x86_pmu saves the architecture capabilities, which
+ * are available for all PMUs. The hybrid_pmu only includes the
+ * unique capabilities.
+ */
+ int num_hybrid_pmus;
+ struct x86_hybrid_pmu *hybrid_pmu;
+ enum intel_cpu_type (*get_hybrid_cpu_type) (void);
+};
+
+struct x86_perf_task_context_opt {
+ int lbr_callstack_users;
+ int lbr_stack_state;
+ int log_id;
+};
+
+struct x86_perf_task_context {
+ u64 lbr_sel;
+ int tos;
+ int valid_lbrs;
+ struct x86_perf_task_context_opt opt;
+ struct lbr_entry lbr[MAX_LBR_ENTRIES];
+};
+
+struct x86_perf_task_context_arch_lbr {
+ struct x86_perf_task_context_opt opt;
+ struct lbr_entry entries[];
+};
+
+/*
+ * Add padding to guarantee the 64-byte alignment of the state buffer.
+ *
+ * The structure is dynamically allocated. The size of the LBR state may vary
+ * based on the number of LBR registers.
+ *
+ * Do not put anything after the LBR state.
+ */
+struct x86_perf_task_context_arch_lbr_xsave {
+ struct x86_perf_task_context_opt opt;
+
+ union {
+ struct xregs_state xsave;
+ struct {
+ struct fxregs_state i387;
+ struct xstate_header header;
+ struct arch_lbr_state lbr;
+ } __attribute__ ((packed, aligned (XSAVE_ALIGNMENT)));
+ };
+};
+
+#define x86_add_quirk(func_) \
+do { \
+ static struct x86_pmu_quirk __quirk __initdata = { \
+ .func = func_, \
+ }; \
+ __quirk.next = x86_pmu.quirks; \
+ x86_pmu.quirks = &__quirk; \
+} while (0)
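For illustration (both function names are hypothetical and would sit in a model-specific .c file): an init path can queue a one-shot fixup that the core code runs after the PMU has been probed:

static __init void example_model_quirk(void)
{
	/* e.g. adjust an event encoding or a counter mask for a known erratum */
}

static __init int example_setup(void)
{
	x86_add_quirk(example_model_quirk);
	return 0;
}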
+
+/*
+ * x86_pmu flags
+ */
+#define PMU_FL_NO_HT_SHARING 0x1 /* no hyper-threading resource sharing */
+#define PMU_FL_HAS_RSP_1 0x2 /* has 2 equivalent offcore_rsp regs */
+#define PMU_FL_EXCL_CNTRS 0x4 /* has exclusive counter requirements */
+#define PMU_FL_EXCL_ENABLED 0x8 /* exclusive counter active */
+#define PMU_FL_PEBS_ALL 0x10 /* all events are valid PEBS events */
+#define PMU_FL_TFA 0x20 /* deal with TSX force abort */
+#define PMU_FL_PAIR 0x40 /* merge counters for large incr. events */
+#define PMU_FL_INSTR_LATENCY 0x80 /* Support Instruction Latency in PEBS Memory Info Record */
+#define PMU_FL_MEM_LOADS_AUX 0x100 /* Require an auxiliary event for the complete memory info */
+#define PMU_FL_RETIRE_LATENCY 0x200 /* Support Retire Latency in PEBS */
+#define PMU_FL_BR_CNTR 0x400 /* Support branch counter logging */
+#define PMU_FL_DYN_CONSTRAINT 0x800 /* Needs dynamic constraint */
+
+#define EVENT_VAR(_id) event_attr_##_id
+#define EVENT_PTR(_id) &event_attr_##_id.attr.attr
+
+#define EVENT_ATTR(_name, _id) \
+static struct perf_pmu_events_attr EVENT_VAR(_id) = { \
+ .attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \
+ .id = PERF_COUNT_HW_##_id, \
+ .event_str = NULL, \
+};
+
+#define EVENT_ATTR_STR(_name, v, str) \
+static struct perf_pmu_events_attr event_attr_##v = { \
+ .attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \
+ .id = 0, \
+ .event_str = str, \
+};
+
+#define EVENT_ATTR_STR_HT(_name, v, noht, ht) \
+static struct perf_pmu_events_ht_attr event_attr_##v = { \
+ .attr = __ATTR(_name, 0444, events_ht_sysfs_show, NULL),\
+ .id = 0, \
+ .event_str_noht = noht, \
+ .event_str_ht = ht, \
+}
+
+#define EVENT_ATTR_STR_HYBRID(_name, v, str, _pmu) \
+static struct perf_pmu_events_hybrid_attr event_attr_##v = { \
+ .attr = __ATTR(_name, 0444, events_hybrid_sysfs_show, NULL),\
+ .id = 0, \
+ .event_str = str, \
+ .pmu_type = _pmu, \
+}
+
+#define FORMAT_HYBRID_PTR(_id) (&format_attr_hybrid_##_id.attr.attr)
+
+#define FORMAT_ATTR_HYBRID(_name, _pmu) \
+static struct perf_pmu_format_hybrid_attr format_attr_hybrid_##_name = {\
+ .attr = __ATTR_RO(_name), \
+ .pmu_type = _pmu, \
+}
+
+struct pmu *x86_get_pmu(unsigned int cpu);
+extern struct x86_pmu x86_pmu __read_mostly;
+
+DECLARE_STATIC_CALL(x86_pmu_set_period, *x86_pmu.set_period);
+DECLARE_STATIC_CALL(x86_pmu_update, *x86_pmu.update);
+DECLARE_STATIC_CALL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs);
+DECLARE_STATIC_CALL(x86_pmu_late_setup, *x86_pmu.late_setup);
+DECLARE_STATIC_CALL(x86_pmu_pebs_enable, *x86_pmu.pebs_enable);
+DECLARE_STATIC_CALL(x86_pmu_pebs_disable, *x86_pmu.pebs_disable);
+DECLARE_STATIC_CALL(x86_pmu_pebs_enable_all, *x86_pmu.pebs_enable_all);
+DECLARE_STATIC_CALL(x86_pmu_pebs_disable_all, *x86_pmu.pebs_disable_all);
+
+static __always_inline struct x86_perf_task_context_opt *task_context_opt(void *ctx)
+{
+ if (static_cpu_has(X86_FEATURE_ARCH_LBR))
+ return &((struct x86_perf_task_context_arch_lbr *)ctx)->opt;
+
+ return &((struct x86_perf_task_context *)ctx)->opt;
+}
+
+static inline bool x86_pmu_has_lbr_callstack(void)
+{
+ return x86_pmu.lbr_sel_map &&
+ x86_pmu.lbr_sel_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] > 0;
+}
+
+DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
+DECLARE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
+
+int x86_perf_event_set_period(struct perf_event *event);
+
+/*
+ * Generalized hw caching related hw_event table, filled
+ * in on a per model basis. A value of 0 means
+ * 'not supported', -1 means 'hw_event makes no sense on
+ * this CPU', any other value means the raw hw_event
+ * ID.
+ */
+
+#define C(x) PERF_COUNT_HW_CACHE_##x
+
+extern u64 __read_mostly hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX];
+extern u64 __read_mostly hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX];
+
+u64 x86_perf_event_update(struct perf_event *event);
+
+static inline u64 intel_pmu_topdown_event_update(struct perf_event *event, u64 *val)
+{
+ return x86_perf_event_update(event);
+}
+DECLARE_STATIC_CALL(intel_pmu_update_topdown_event, intel_pmu_topdown_event_update);
+
+static inline unsigned int x86_pmu_config_addr(int index)
+{
+ return x86_pmu.eventsel + (x86_pmu.addr_offset ?
+ x86_pmu.addr_offset(index, true) : index);
+}
+
+static inline unsigned int x86_pmu_event_addr(int index)
+{
+ return x86_pmu.perfctr + (x86_pmu.addr_offset ?
+ x86_pmu.addr_offset(index, false) : index);
+}
+
+static inline unsigned int x86_pmu_fixed_ctr_addr(int index)
+{
+ return x86_pmu.fixedctr + (x86_pmu.addr_offset ?
+ x86_pmu.addr_offset(index, false) : index);
+}
+
+static inline int x86_pmu_rdpmc_index(int index)
+{
+ return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index;
+}
+
+bool check_hw_exists(struct pmu *pmu, unsigned long *cntr_mask,
+ unsigned long *fixed_cntr_mask);
+
+int x86_add_exclusive(unsigned int what);
+
+void x86_del_exclusive(unsigned int what);
+
+int x86_reserve_hardware(void);
+
+void x86_release_hardware(void);
+
+int x86_pmu_max_precise(struct pmu *pmu);
+
+void hw_perf_lbr_event_destroy(struct perf_event *event);
+
+int x86_setup_perfctr(struct perf_event *event);
+
+int x86_pmu_hw_config(struct perf_event *event);
+
+void x86_pmu_disable_all(void);
+
+static inline bool has_amd_brs(struct hw_perf_event *hwc)
+{
+ return hwc->flags & PERF_X86_EVENT_AMD_BRS;
+}
+
+static inline bool is_counter_pair(struct hw_perf_event *hwc)
+{
+ return hwc->flags & PERF_X86_EVENT_PAIR;
+}
+
+static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
+ u64 enable_mask)
+{
+ u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask);
+
+ if (hwc->extra_reg.reg)
+ wrmsrq(hwc->extra_reg.reg, hwc->extra_reg.config);
+
+ /*
+	 * Add the enabled Merge event on the next counter
+	 * if a large increment event is being enabled on this counter.
+ */
+ if (is_counter_pair(hwc))
+ wrmsrq(x86_pmu_config_addr(hwc->idx + 1), x86_pmu.perf_ctr_pair_en);
+
+ wrmsrq(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask);
+}
+
+void x86_pmu_enable_all(int added);
+
+int perf_assign_events(struct event_constraint **constraints, int n,
+ int wmin, int wmax, int gpmax, int *assign);
+int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign);
+
+void x86_pmu_stop(struct perf_event *event, int flags);
+
+static inline void x86_pmu_disable_event(struct perf_event *event)
+{
+ u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask);
+ struct hw_perf_event *hwc = &event->hw;
+
+ wrmsrq(hwc->config_base, hwc->config & ~disable_mask);
+
+ if (is_counter_pair(hwc))
+ wrmsrq(x86_pmu_config_addr(hwc->idx + 1), 0);
+}
+
+void x86_pmu_enable_event(struct perf_event *event);
+
+int x86_pmu_handle_irq(struct pt_regs *regs);
+
+void x86_pmu_show_pmu_cap(struct pmu *pmu);
+
+static inline int x86_pmu_num_counters(struct pmu *pmu)
+{
+ return hweight64(hybrid(pmu, cntr_mask64));
+}
+
+static inline int x86_pmu_max_num_counters(struct pmu *pmu)
+{
+ return fls64(hybrid(pmu, cntr_mask64));
+}
+
+static inline int x86_pmu_num_counters_fixed(struct pmu *pmu)
+{
+ return hweight64(hybrid(pmu, fixed_cntr_mask64));
+}
+
+static inline int x86_pmu_max_num_counters_fixed(struct pmu *pmu)
+{
+ return fls64(hybrid(pmu, fixed_cntr_mask64));
+}
+
+static inline u64 x86_pmu_get_event_config(struct perf_event *event)
+{
+ return event->attr.config & hybrid(event->pmu, config_mask);
+}
+
+extern struct event_constraint emptyconstraint;
+
+extern struct event_constraint unconstrained;
+
+static inline bool kernel_ip(unsigned long ip)
+{
+#ifdef CONFIG_X86_32
+ return ip > PAGE_OFFSET;
+#else
+ return (long)ip < 0;
+#endif
+}
+
+/*
+ * Not all PMUs provide the right context information to place the reported IP
+ * into full context. Specifically segment registers are typically not
+ * supplied.
+ *
+ * Assuming the address is a linear address (it is for IBS), we fake the CS and
+ * vm86 mode using the known zero-based code segment and 'fix up' the registers
+ * to reflect this.
+ *
+ * Intel PEBS/LBR appear to typically provide the effective address, nothing
+ * much we can do about that but pray and treat it like a linear address.
+ */
+static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip)
+{
+ regs->cs = kernel_ip(ip) ? __KERNEL_CS : __USER_CS;
+ if (regs->flags & X86_VM_MASK)
+ regs->flags ^= (PERF_EFLAGS_VM | X86_VM_MASK);
+ regs->ip = ip;
+}
+
+/*
+ * x86 control flow change classification
+ * x86 control flow changes include branches, interrupts, traps and faults
+ */
+enum {
+ X86_BR_NONE = 0, /* unknown */
+
+ X86_BR_USER = 1 << 0, /* branch target is user */
+ X86_BR_KERNEL = 1 << 1, /* branch target is kernel */
+
+ X86_BR_CALL = 1 << 2, /* call */
+ X86_BR_RET = 1 << 3, /* return */
+ X86_BR_SYSCALL = 1 << 4, /* syscall */
+ X86_BR_SYSRET = 1 << 5, /* syscall return */
+ X86_BR_INT = 1 << 6, /* sw interrupt */
+ X86_BR_IRET = 1 << 7, /* return from interrupt */
+ X86_BR_JCC = 1 << 8, /* conditional */
+ X86_BR_JMP = 1 << 9, /* jump */
+ X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */
+ X86_BR_IND_CALL = 1 << 11,/* indirect calls */
+ X86_BR_ABORT = 1 << 12,/* transaction abort */
+ X86_BR_IN_TX = 1 << 13,/* in transaction */
+ X86_BR_NO_TX = 1 << 14,/* not in transaction */
+ X86_BR_ZERO_CALL = 1 << 15,/* zero length call */
+ X86_BR_CALL_STACK = 1 << 16,/* call stack */
+ X86_BR_IND_JMP = 1 << 17,/* indirect jump */
+
+ X86_BR_TYPE_SAVE = 1 << 18,/* indicate to save branch type */
+
+};
+
+#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
+#define X86_BR_ANYTX (X86_BR_NO_TX | X86_BR_IN_TX)
+
+#define X86_BR_ANY \
+ (X86_BR_CALL |\
+ X86_BR_RET |\
+ X86_BR_SYSCALL |\
+ X86_BR_SYSRET |\
+ X86_BR_INT |\
+ X86_BR_IRET |\
+ X86_BR_JCC |\
+ X86_BR_JMP |\
+ X86_BR_IRQ |\
+ X86_BR_ABORT |\
+ X86_BR_IND_CALL |\
+ X86_BR_IND_JMP |\
+ X86_BR_ZERO_CALL)
+
+#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY)
+
+#define X86_BR_ANY_CALL \
+ (X86_BR_CALL |\
+ X86_BR_IND_CALL |\
+ X86_BR_ZERO_CALL |\
+ X86_BR_SYSCALL |\
+ X86_BR_IRQ |\
+ X86_BR_INT)
+
+int common_branch_type(int type);
+int branch_type(unsigned long from, unsigned long to, int abort);
+int branch_type_fused(unsigned long from, unsigned long to, int abort,
+ int *offset);
+
+ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event);
+ssize_t intel_event_sysfs_show(char *page, u64 config);
+
+ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
+ char *page);
+ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr,
+ char *page);
+ssize_t events_hybrid_sysfs_show(struct device *dev,
+ struct device_attribute *attr,
+ char *page);
+
+static inline bool fixed_counter_disabled(int i, struct pmu *pmu)
+{
+ u64 intel_ctrl = hybrid(pmu, intel_ctrl);
+
+ return !(intel_ctrl >> (i + INTEL_PMC_IDX_FIXED));
+}
+
+#ifdef CONFIG_CPU_SUP_AMD
+
+int amd_pmu_init(void);
+
+int amd_pmu_lbr_init(void);
+void amd_pmu_lbr_reset(void);
+void amd_pmu_lbr_read(void);
+void amd_pmu_lbr_add(struct perf_event *event);
+void amd_pmu_lbr_del(struct perf_event *event);
+void amd_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx,
+ struct task_struct *task, bool sched_in);
+void amd_pmu_lbr_enable_all(void);
+void amd_pmu_lbr_disable_all(void);
+int amd_pmu_lbr_hw_config(struct perf_event *event);
+
+static __always_inline void __amd_pmu_lbr_disable(void)
+{
+ u64 dbg_ctl, dbg_extn_cfg;
+
+ rdmsrq(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg);
+ wrmsrq(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg & ~DBG_EXTN_CFG_LBRV2EN);
+
+ if (cpu_feature_enabled(X86_FEATURE_AMD_LBR_PMC_FREEZE)) {
+ rdmsrq(MSR_IA32_DEBUGCTLMSR, dbg_ctl);
+ wrmsrq(MSR_IA32_DEBUGCTLMSR, dbg_ctl & ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
+ }
+}
+
+#ifdef CONFIG_PERF_EVENTS_AMD_BRS
+
+#define AMD_FAM19H_BRS_EVENT 0xc4 /* RETIRED_TAKEN_BRANCH_INSTRUCTIONS */
+
+int amd_brs_init(void);
+void amd_brs_disable(void);
+void amd_brs_enable(void);
+void amd_brs_enable_all(void);
+void amd_brs_disable_all(void);
+void amd_brs_drain(void);
+void amd_brs_lopwr_init(void);
+int amd_brs_hw_config(struct perf_event *event);
+void amd_brs_reset(void);
+
+static inline void amd_pmu_brs_add(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ perf_sched_cb_inc(event->pmu);
+ cpuc->lbr_users++;
+ /*
+ * No need to reset BRS because it is reset
+ * on brs_enable() and it is saturating
+ */
+}
+
+static inline void amd_pmu_brs_del(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ cpuc->lbr_users--;
+ WARN_ON_ONCE(cpuc->lbr_users < 0);
+
+ perf_sched_cb_dec(event->pmu);
+}
+
+void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx,
+ struct task_struct *task, bool sched_in);
+#else
+static inline int amd_brs_init(void)
+{
+ return 0;
+}
+static inline void amd_brs_disable(void) {}
+static inline void amd_brs_enable(void) {}
+static inline void amd_brs_drain(void) {}
+static inline void amd_brs_lopwr_init(void) {}
+static inline void amd_brs_disable_all(void) {}
+static inline int amd_brs_hw_config(struct perf_event *event)
+{
+ return 0;
+}
+static inline void amd_brs_reset(void) {}
+
+static inline void amd_pmu_brs_add(struct perf_event *event)
+{
+}
+
+static inline void amd_pmu_brs_del(struct perf_event *event)
+{
+}
+
+static inline void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx,
+ struct task_struct *task, bool sched_in)
+{
+}
+
+static inline void amd_brs_enable_all(void)
+{
+}
+
+#endif
+
+#else /* CONFIG_CPU_SUP_AMD */
+
+static inline int amd_pmu_init(void)
+{
+ return 0;
+}
+
+static inline int amd_brs_init(void)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void amd_brs_drain(void)
+{
+}
+
+static inline void amd_brs_enable_all(void)
+{
+}
+
+static inline void amd_brs_disable_all(void)
+{
+}
+#endif /* CONFIG_CPU_SUP_AMD */
+
+static inline int is_pebs_pt(struct perf_event *event)
+{
+ return !!(event->hw.flags & PERF_X86_EVENT_PEBS_VIA_PT);
+}
+
+#ifdef CONFIG_CPU_SUP_INTEL
+
+static inline bool intel_pmu_has_bts_period(struct perf_event *event, u64 period)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ unsigned int hw_event, bts_event;
+
+ if (event->attr.freq)
+ return false;
+
+ hw_event = hwc->config & INTEL_ARCH_EVENT_MASK;
+ bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
+
+ return hw_event == bts_event && period == 1;
+}
+
+static inline bool intel_pmu_has_bts(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ return intel_pmu_has_bts_period(event, hwc->sample_period);
+}
+
+static __always_inline void __intel_pmu_pebs_disable_all(void)
+{
+ wrmsrq(MSR_IA32_PEBS_ENABLE, 0);
+}
+
+static __always_inline void __intel_pmu_arch_lbr_disable(void)
+{
+ wrmsrq(MSR_ARCH_LBR_CTL, 0);
+}
+
+static __always_inline void __intel_pmu_lbr_disable(void)
+{
+ u64 debugctl;
+
+ rdmsrq(MSR_IA32_DEBUGCTLMSR, debugctl);
+ debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
+ wrmsrq(MSR_IA32_DEBUGCTLMSR, debugctl);
+}
+
+int intel_pmu_save_and_restart(struct perf_event *event);
+
+struct event_constraint *
+x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event);
+
+extern int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu);
+extern void intel_cpuc_finish(struct cpu_hw_events *cpuc);
+
+int intel_pmu_init(void);
+
+int alloc_arch_pebs_buf_on_cpu(int cpu);
+
+void release_arch_pebs_buf_on_cpu(int cpu);
+
+void init_arch_pebs_on_cpu(int cpu);
+
+void fini_arch_pebs_on_cpu(int cpu);
+
+void init_debug_store_on_cpu(int cpu);
+
+void fini_debug_store_on_cpu(int cpu);
+
+void release_ds_buffers(void);
+
+void reserve_ds_buffers(void);
+
+void release_lbr_buffers(void);
+
+void reserve_lbr_buffers(void);
+
+extern struct event_constraint bts_constraint;
+extern struct event_constraint vlbr_constraint;
+
+void intel_pmu_enable_bts(u64 config);
+
+void intel_pmu_disable_bts(void);
+
+int intel_pmu_drain_bts_buffer(void);
+
+void intel_pmu_late_setup(void);
+
+u64 grt_latency_data(struct perf_event *event, u64 status);
+
+u64 cmt_latency_data(struct perf_event *event, u64 status);
+
+u64 lnl_latency_data(struct perf_event *event, u64 status);
+
+u64 arl_h_latency_data(struct perf_event *event, u64 status);
+
+extern struct event_constraint intel_core2_pebs_event_constraints[];
+
+extern struct event_constraint intel_atom_pebs_event_constraints[];
+
+extern struct event_constraint intel_slm_pebs_event_constraints[];
+
+extern struct event_constraint intel_glm_pebs_event_constraints[];
+
+extern struct event_constraint intel_glp_pebs_event_constraints[];
+
+extern struct event_constraint intel_grt_pebs_event_constraints[];
+
+extern struct event_constraint intel_nehalem_pebs_event_constraints[];
+
+extern struct event_constraint intel_westmere_pebs_event_constraints[];
+
+extern struct event_constraint intel_snb_pebs_event_constraints[];
+
+extern struct event_constraint intel_ivb_pebs_event_constraints[];
+
+extern struct event_constraint intel_hsw_pebs_event_constraints[];
+
+extern struct event_constraint intel_bdw_pebs_event_constraints[];
+
+extern struct event_constraint intel_skl_pebs_event_constraints[];
+
+extern struct event_constraint intel_icl_pebs_event_constraints[];
+
+extern struct event_constraint intel_glc_pebs_event_constraints[];
+
+extern struct event_constraint intel_lnc_pebs_event_constraints[];
+
+struct event_constraint *intel_pebs_constraints(struct perf_event *event);
+
+void intel_pmu_pebs_add(struct perf_event *event);
+
+void intel_pmu_pebs_del(struct perf_event *event);
+
+void intel_pmu_pebs_enable(struct perf_event *event);
+
+void intel_pmu_pebs_disable(struct perf_event *event);
+
+void intel_pmu_pebs_enable_all(void);
+
+void intel_pmu_pebs_disable_all(void);
+
+void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in);
+
+void intel_pmu_pebs_late_setup(struct cpu_hw_events *cpuc);
+
+void intel_pmu_drain_pebs_buffer(void);
+
+void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr);
+
+void intel_pebs_init(void);
+
+void intel_pmu_lbr_save_brstack(struct perf_sample_data *data,
+ struct cpu_hw_events *cpuc,
+ struct perf_event *event);
+
+void intel_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx,
+ struct task_struct *task, bool sched_in);
+
+u64 lbr_from_signext_quirk_wr(u64 val);
+
+void intel_pmu_lbr_reset(void);
+
+void intel_pmu_lbr_reset_32(void);
+
+void intel_pmu_lbr_reset_64(void);
+
+void intel_pmu_lbr_add(struct perf_event *event);
+
+void intel_pmu_lbr_del(struct perf_event *event);
+
+void intel_pmu_lbr_enable_all(bool pmi);
+
+void intel_pmu_lbr_disable_all(void);
+
+void intel_pmu_lbr_read(void);
+
+void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc);
+
+void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc);
+
+void intel_pmu_lbr_save(void *ctx);
+
+void intel_pmu_lbr_restore(void *ctx);
+
+void intel_pmu_lbr_init_core(void);
+
+void intel_pmu_lbr_init_nhm(void);
+
+void intel_pmu_lbr_init_atom(void);
+
+void intel_pmu_lbr_init_slm(void);
+
+void intel_pmu_lbr_init_snb(void);
+
+void intel_pmu_lbr_init_hsw(void);
+
+void intel_pmu_lbr_init_skl(void);
+
+void intel_pmu_lbr_init_knl(void);
+
+void intel_pmu_lbr_init(void);
+
+void intel_pmu_arch_lbr_init(void);
+
+void intel_pmu_pebs_data_source_nhm(void);
+
+void intel_pmu_pebs_data_source_skl(bool pmem);
+
+void intel_pmu_pebs_data_source_adl(void);
+
+void intel_pmu_pebs_data_source_grt(void);
+
+void intel_pmu_pebs_data_source_mtl(void);
+
+void intel_pmu_pebs_data_source_arl_h(void);
+
+void intel_pmu_pebs_data_source_cmt(void);
+
+void intel_pmu_pebs_data_source_lnl(void);
+
+u64 intel_get_arch_pebs_data_config(struct perf_event *event);
+
+int intel_pmu_setup_lbr_filter(struct perf_event *event);
+
+void intel_pt_interrupt(void);
+
+int intel_bts_interrupt(void);
+
+void intel_bts_enable_local(void);
+
+void intel_bts_disable_local(void);
+
+int p4_pmu_init(void);
+
+int p6_pmu_init(void);
+
+int knc_pmu_init(void);
+
+static inline int is_ht_workaround_enabled(void)
+{
+ return !!(x86_pmu.flags & PMU_FL_EXCL_ENABLED);
+}
+
+static inline u64 intel_pmu_pebs_mask(u64 cntr_mask)
+{
+ return MAX_PEBS_EVENTS_MASK & cntr_mask;
+}
+
+static inline int intel_pmu_max_num_pebs(struct pmu *pmu)
+{
+ static_assert(MAX_PEBS_EVENTS == 32);
+ return fls((u32)hybrid(pmu, pebs_events_mask));
+}
+
+static inline bool intel_pmu_has_pebs(void)
+{
+ return x86_pmu.ds_pebs || x86_pmu.arch_pebs;
+}
+
+#else /* CONFIG_CPU_SUP_INTEL */
+
+static inline void reserve_ds_buffers(void)
+{
+}
+
+static inline void release_ds_buffers(void)
+{
+}
+
+static inline void release_lbr_buffers(void)
+{
+}
+
+static inline void reserve_lbr_buffers(void)
+{
+}
+
+static inline int intel_pmu_init(void)
+{
+ return 0;
+}
+
+static inline int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu)
+{
+ return 0;
+}
+
+static inline void intel_cpuc_finish(struct cpu_hw_events *cpuc)
+{
+}
+
+static inline int is_ht_workaround_enabled(void)
+{
+ return 0;
+}
+#endif /* CONFIG_CPU_SUP_INTEL */
+
+#if ((defined CONFIG_CPU_SUP_CENTAUR) || (defined CONFIG_CPU_SUP_ZHAOXIN))
+int zhaoxin_pmu_init(void);
+#else
+static inline int zhaoxin_pmu_init(void)
+{
+ return 0;
+}
+#endif /* CONFIG_CPU_SUP_CENTAUR or CONFIG_CPU_SUP_ZHAOXIN */
diff --git a/arch/x86/events/perf_event_flags.h b/arch/x86/events/perf_event_flags.h
new file mode 100644
index 000000000000..70078334e4a3
--- /dev/null
+++ b/arch/x86/events/perf_event_flags.h
@@ -0,0 +1,25 @@
+
+/*
+ * struct hw_perf_event.flags flags
+ */
+PERF_ARCH(PEBS_LDLAT, 0x0000001) /* ld+ldlat data address sampling */
+PERF_ARCH(PEBS_ST, 0x0000002) /* st data address sampling */
+PERF_ARCH(PEBS_ST_HSW, 0x0000004) /* haswell style datala, store */
+PERF_ARCH(PEBS_LD_HSW, 0x0000008) /* haswell style datala, load */
+PERF_ARCH(PEBS_NA_HSW, 0x0000010) /* haswell style datala, unknown */
+PERF_ARCH(EXCL, 0x0000020) /* HT exclusivity on counter */
+PERF_ARCH(DYNAMIC, 0x0000040) /* dynamic alloc'd constraint */
+PERF_ARCH(PEBS_CNTR, 0x0000080) /* PEBS counters snapshot */
+PERF_ARCH(EXCL_ACCT, 0x0000100) /* accounted EXCL event */
+PERF_ARCH(AUTO_RELOAD, 0x0000200) /* use PEBS auto-reload */
+PERF_ARCH(LARGE_PEBS, 0x0000400) /* use large PEBS */
+PERF_ARCH(PEBS_VIA_PT, 0x0000800) /* use PT buffer for PEBS */
+PERF_ARCH(PAIR, 0x0001000) /* Large Increment per Cycle */
+PERF_ARCH(LBR_SELECT, 0x0002000) /* Save/Restore MSR_LBR_SELECT */
+PERF_ARCH(TOPDOWN, 0x0004000) /* Count Topdown slots/metrics events */
+PERF_ARCH(PEBS_STLAT, 0x0008000) /* st+stlat data address sampling */
+PERF_ARCH(AMD_BRS, 0x0010000) /* AMD Branch Sampling */
+PERF_ARCH(PEBS_LAT_HYBRID, 0x0020000) /* ld and st lat for hybrid */
+PERF_ARCH(NEEDS_BRANCH_STACK, 0x0040000) /* require branch stack setup */
+PERF_ARCH(BRANCH_COUNTERS, 0x0080000) /* logs the counters in the extra space of each branch */
+PERF_ARCH(ACR, 0x0100000) /* Auto counter reload */
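perf_event_flags.h is an x-macro table: each PERF_ARCH(name, value) line is meant to be expanded by whichever file includes it. A minimal sketch of how such a table is typically consumed, turning the entries into flag constants (the wrapper macro and anonymous enum below are illustrative, not necessarily the exact definitions used in perf_event.h):

#define PERF_ARCH(name, val)    PERF_X86_EVENT_##name = val,    /* illustrative wrapper */
enum {
#include "perf_event_flags.h"
};
#undef PERF_ARCH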
diff --git a/arch/x86/events/probe.c b/arch/x86/events/probe.c
new file mode 100644
index 000000000000..bb719d0d3f0b
--- /dev/null
+++ b/arch/x86/events/probe.c
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/export.h>
+#include <linux/types.h>
+#include <linux/bits.h>
+
+#include <asm/msr.h>
+#include "probe.h"
+
+static umode_t
+not_visible(struct kobject *kobj, struct attribute *attr, int i)
+{
+ return 0;
+}
+
+/*
+ * Accepts an msr[] array with non-populated entries as long as either
+ * msr[i].msr is 0 or msr[i].grp is NULL. Unless no_check is set, each
+ * group starts out hidden (is_visible = not_visible) and is made visible
+ * again only once its MSR probes successfully.
+ */
+unsigned long
+perf_msr_probe(struct perf_msr *msr, int cnt, bool zero, void *data)
+{
+ unsigned long avail = 0;
+ unsigned int bit;
+ u64 val;
+
+ if (cnt >= BITS_PER_LONG)
+ return 0;
+
+ for (bit = 0; bit < cnt; bit++) {
+ if (!msr[bit].no_check) {
+ struct attribute_group *grp = msr[bit].grp;
+ u64 mask;
+
+ /* skip entry with no group */
+ if (!grp)
+ continue;
+
+ grp->is_visible = not_visible;
+
+ /* skip unpopulated entry */
+ if (!msr[bit].msr)
+ continue;
+
+ if (msr[bit].test && !msr[bit].test(bit, data))
+ continue;
+ /* Virt sucks; you cannot tell if a R/O MSR is present :/ */
+ if (rdmsrq_safe(msr[bit].msr, &val))
+ continue;
+
+ mask = msr[bit].mask;
+ if (!mask)
+ mask = ~0ULL;
+ /* Disable zero counters if requested. */
+ if (!zero && !(val & mask))
+ continue;
+
+ grp->is_visible = NULL;
+ }
+ avail |= BIT(bit);
+ }
+
+ return avail;
+}
+EXPORT_SYMBOL_GPL(perf_msr_probe);
diff --git a/arch/x86/events/probe.h b/arch/x86/events/probe.h
new file mode 100644
index 000000000000..261b9bda24e3
--- /dev/null
+++ b/arch/x86/events/probe.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ARCH_X86_EVENTS_PROBE_H__
+#define __ARCH_X86_EVENTS_PROBE_H__
+#include <linux/sysfs.h>
+
+struct perf_msr {
+ u64 msr;
+ struct attribute_group *grp;
+ bool (*test)(int idx, void *data);
+ bool no_check;
+ u64 mask;
+};
+
+unsigned long
+perf_msr_probe(struct perf_msr *msr, int cnt, bool no_zero, void *data);
+
+#define __PMU_EVENT_GROUP(_name) \
+static struct attribute *attrs_##_name[] = { \
+ &attr_##_name.attr.attr, \
+ NULL, \
+}
+
+#define PMU_EVENT_GROUP(_grp, _name) \
+__PMU_EVENT_GROUP(_name); \
+static struct attribute_group group_##_name = { \
+ .name = #_grp, \
+ .attrs = attrs_##_name, \
+}
+
+#endif /* __ARCH_X86_EVENTS_PROBE_H__ */
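PMU_EVENT_GROUP stamps out both the attribute array and the struct attribute_group for one sysfs event. As an illustration, a hypothetical PMU_EVENT_GROUP(events, foo), assuming an attr_foo perf attribute is already defined elsewhere, would expand to roughly:

/* expansion sketch for the hypothetical PMU_EVENT_GROUP(events, foo) */
static struct attribute *attrs_foo[] = {
        &attr_foo.attr.attr,
        NULL,
};
static struct attribute_group group_foo = {
        .name = "events",
        .attrs = attrs_foo,
};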
diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
new file mode 100644
index 000000000000..defd86137f12
--- /dev/null
+++ b/arch/x86/events/rapl.c
@@ -0,0 +1,965 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Support Intel/AMD RAPL energy consumption counters
+ * Copyright (C) 2013 Google, Inc., Stephane Eranian
+ *
+ * Intel RAPL interface is specified in the IA-32 Manual Vol3b
+ * section 14.7.1 (September 2013)
+ *
+ * AMD RAPL interface for Fam17h is described in the public PPR:
+ * https://bugzilla.kernel.org/show_bug.cgi?id=206537
+ *
+ * RAPL provides more controls than just reporting energy consumption
+ * however here we only expose the 3 energy consumption free running
+ * counters (pp0, pkg, dram).
+ *
+ * Each of those counters increments in a power unit defined by the
+ * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
+ * but it can vary.
+ *
+ * Counter to rapl events mappings:
+ *
+ * pp0 counter: consumption of all physical cores (power plane 0)
+ * event: rapl_energy_cores
+ * perf code: 0x1
+ *
+ * pkg counter: consumption of the whole processor package
+ * event: rapl_energy_pkg
+ * perf code: 0x2
+ *
+ * dram counter: consumption of the dram domain (servers only)
+ * event: rapl_energy_dram
+ * perf code: 0x3
+ *
+ * gpu counter: consumption of the builtin-gpu domain (client only)
+ * event: rapl_energy_gpu
+ * perf code: 0x4
+ *
+ * psys counter: consumption of the builtin-psys domain (client only)
+ * event: rapl_energy_psys
+ * perf code: 0x5
+ *
+ * core counter: consumption of a single physical core
+ * event: rapl_energy_core (power_core PMU)
+ * perf code: 0x1
+ *
+ * We manage those counters as free running (read-only). They may be
+ * used simultaneously by other tools, such as turbostat.
+ *
+ * The events only support system-wide mode counting. There is no
+ * sampling support because it does not make sense and is not
+ * supported by the RAPL hardware.
+ *
+ * Because we want to avoid floating-point operations in the kernel,
+ * the events are all reported in fixed point arithmetic (32.32).
+ * Tools must adjust the counts to convert them to Watts using
+ * the duration of the measurement. Tools may use a function such as
+ * ldexp(raw_count, -32);
+ */
+
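Since the raw counts are exported in 32.32 fixed point, a user-space reader only needs an ldexp() call to turn them into Joules. A minimal stand-alone sketch of that conversion (the raw_count value below is made up for illustration; a real tool would read it from the opened perf counter):

#include <math.h>
#include <stdio.h>

int main(void)
{
        unsigned long long raw_count = 3ULL << 32;      /* hypothetical raw reading */
        double joules = ldexp((double)raw_count, -32);  /* each unit is 2^-32 J */

        printf("%.6f J\n", joules);                     /* prints 3.000000 J */
        return 0;
}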
+#define pr_fmt(fmt) "RAPL PMU: " fmt
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/perf_event.h>
+#include <linux/nospec.h>
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+#include <asm/msr.h>
+#include "perf_event.h"
+#include "probe.h"
+
+MODULE_DESCRIPTION("Support Intel/AMD RAPL energy consumption counters");
+MODULE_LICENSE("GPL");
+
+/*
+ * RAPL energy status counters
+ */
+enum perf_rapl_pkg_events {
+ PERF_RAPL_PP0 = 0, /* all cores */
+ PERF_RAPL_PKG, /* entire package */
+ PERF_RAPL_RAM, /* DRAM */
+ PERF_RAPL_PP1, /* gpu */
+ PERF_RAPL_PSYS, /* psys */
+
+ PERF_RAPL_PKG_EVENTS_MAX,
+ NR_RAPL_PKG_DOMAINS = PERF_RAPL_PKG_EVENTS_MAX,
+};
+
+#define PERF_RAPL_CORE 0 /* single core */
+#define PERF_RAPL_CORE_EVENTS_MAX 1
+#define NR_RAPL_CORE_DOMAINS PERF_RAPL_CORE_EVENTS_MAX
+
+static const char *const rapl_pkg_domain_names[NR_RAPL_PKG_DOMAINS] __initconst = {
+ "pp0-core",
+ "package",
+ "dram",
+ "pp1-gpu",
+ "psys",
+};
+
+static const char *const rapl_core_domain_name __initconst = "core";
+
+/*
+ * event code: LSB 8 bits, passed in attr->config
+ * any other bit is reserved
+ */
+#define RAPL_EVENT_MASK 0xFFULL
+#define RAPL_CNTR_WIDTH 32
+
+#define RAPL_EVENT_ATTR_STR(_name, v, str) \
+static struct perf_pmu_events_attr event_attr_##v = { \
+ .attr = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \
+ .id = 0, \
+ .event_str = str, \
+};
+
+/*
+ * RAPL Package energy counter scope:
+ * 1. AMD/HYGON platforms have a per-PKG package energy counter
+ * 2. For Intel platforms
+ * 2.1. CLX-AP is multi-die and its RAPL MSRs are die-scope
+ * 2.2. Other Intel platforms are single die systems so the scope can be
+ * considered as either pkg-scope or die-scope, and we are considering
+ * them as die-scope.
+ */
+#define rapl_pkg_pmu_is_pkg_scope() \
+ (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || \
+ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
+
+struct rapl_pmu {
+ raw_spinlock_t lock;
+ int n_active;
+ int cpu;
+ struct list_head active_list;
+ struct pmu *pmu;
+ ktime_t timer_interval;
+ struct hrtimer hrtimer;
+};
+
+struct rapl_pmus {
+ struct pmu pmu;
+ unsigned int nr_rapl_pmu;
+ unsigned int cntr_mask;
+ struct rapl_pmu *rapl_pmu[] __counted_by(nr_rapl_pmu);
+};
+
+enum rapl_unit_quirk {
+ RAPL_UNIT_QUIRK_NONE,
+ RAPL_UNIT_QUIRK_INTEL_HSW,
+ RAPL_UNIT_QUIRK_INTEL_SPR,
+};
+
+struct rapl_model {
+ struct perf_msr *rapl_pkg_msrs;
+ struct perf_msr *rapl_core_msrs;
+ unsigned long pkg_events;
+ unsigned long core_events;
+ unsigned int msr_power_unit;
+ enum rapl_unit_quirk unit_quirk;
+};
+
+ /* 1/2^hw_unit Joule */
+static int rapl_pkg_hw_unit[NR_RAPL_PKG_DOMAINS] __read_mostly;
+static int rapl_core_hw_unit __read_mostly;
+static struct rapl_pmus *rapl_pmus_pkg;
+static struct rapl_pmus *rapl_pmus_core;
+static u64 rapl_timer_ms;
+static struct rapl_model *rapl_model;
+
+/*
+ * Helper function to get the correct topology id according to the
+ * RAPL PMU scope.
+ */
+static inline unsigned int get_rapl_pmu_idx(int cpu, int scope)
+{
+ /*
+ * Returns unsigned int, which converts the '-1' return value
+ * (for non-existent mappings in topology map) to UINT_MAX, so
+ * the error check in the caller is simplified.
+ */
+ switch (scope) {
+ case PERF_PMU_SCOPE_PKG:
+ return topology_logical_package_id(cpu);
+ case PERF_PMU_SCOPE_DIE:
+ return topology_logical_die_id(cpu);
+ case PERF_PMU_SCOPE_CORE:
+ return topology_logical_core_id(cpu);
+ default:
+ return -EINVAL;
+ }
+}
+
+static inline u64 rapl_read_counter(struct perf_event *event)
+{
+ u64 raw;
+ rdmsrq(event->hw.event_base, raw);
+ return raw;
+}
+
+static inline u64 rapl_scale(u64 v, struct perf_event *event)
+{
+ int hw_unit = rapl_pkg_hw_unit[event->hw.config - 1];
+
+ if (event->pmu->scope == PERF_PMU_SCOPE_CORE)
+ hw_unit = rapl_core_hw_unit;
+
+ /*
+ * scale delta to smallest unit (1/2^32)
+ * users must then scale back: count * 1/(1e9*2^32) to get Joules
+ * or use ldexp(count, -32).
+ * Watts = Joules/Time delta
+ */
+ return v << (32 - hw_unit);
+}
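To make the scaling concrete: with the SandyBridge energy unit of 2^-16 J (hw_unit = 16) mentioned in the header comment, one hardware tick is returned as 1 << (32 - 16) = 65536 in 2^-32 J units, which user space scales back to 65536 * 2^-32 = 2^-16 J.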
+
+static u64 rapl_event_update(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u64 prev_raw_count, new_raw_count;
+ s64 delta, sdelta;
+ int shift = RAPL_CNTR_WIDTH;
+
+ prev_raw_count = local64_read(&hwc->prev_count);
+ do {
+ rdmsrq(event->hw.event_base, new_raw_count);
+ } while (!local64_try_cmpxchg(&hwc->prev_count,
+ &prev_raw_count, new_raw_count));
+
+ /*
+ * Now we have the new raw value and have updated the prev
+ * timestamp already. We can now calculate the elapsed delta
+ * (event-)time and add that to the generic event.
+ *
+ * Careful, not all hw sign-extends above the physical width
+ * of the count.
+ */
+ delta = (new_raw_count << shift) - (prev_raw_count << shift);
+ delta >>= shift;
+
+ sdelta = rapl_scale(delta, event);
+
+ local64_add(sdelta, &event->count);
+
+ return new_raw_count;
+}
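The shift by RAPL_CNTR_WIDTH = 32 is what makes counter wraparound harmless: if prev_raw_count is 0xffffffff and new_raw_count is 0x00000001, subtracting the two shifted 64-bit values gives 0x200000000, and the arithmetic shift right by 32 leaves a delta of 2 rather than a huge negative number.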
+
+static void rapl_start_hrtimer(struct rapl_pmu *pmu)
+{
+ hrtimer_start(&pmu->hrtimer, pmu->timer_interval,
+ HRTIMER_MODE_REL_PINNED);
+}
+
+static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
+{
+ struct rapl_pmu *rapl_pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
+ struct perf_event *event;
+ unsigned long flags;
+
+ if (!rapl_pmu->n_active)
+ return HRTIMER_NORESTART;
+
+ raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
+
+ list_for_each_entry(event, &rapl_pmu->active_list, active_entry)
+ rapl_event_update(event);
+
+ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
+
+ hrtimer_forward_now(hrtimer, rapl_pmu->timer_interval);
+
+ return HRTIMER_RESTART;
+}
+
+static void rapl_hrtimer_init(struct rapl_pmu *rapl_pmu)
+{
+ struct hrtimer *hr = &rapl_pmu->hrtimer;
+
+ hrtimer_setup(hr, rapl_hrtimer_handle, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+}
+
+static void __rapl_pmu_event_start(struct rapl_pmu *rapl_pmu,
+ struct perf_event *event)
+{
+ if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+ return;
+
+ event->hw.state = 0;
+
+ list_add_tail(&event->active_entry, &rapl_pmu->active_list);
+
+ local64_set(&event->hw.prev_count, rapl_read_counter(event));
+
+ rapl_pmu->n_active++;
+ if (rapl_pmu->n_active == 1)
+ rapl_start_hrtimer(rapl_pmu);
+}
+
+static void rapl_pmu_event_start(struct perf_event *event, int mode)
+{
+ struct rapl_pmu *rapl_pmu = event->pmu_private;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
+ __rapl_pmu_event_start(rapl_pmu, event);
+ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
+}
+
+static void rapl_pmu_event_stop(struct perf_event *event, int mode)
+{
+ struct rapl_pmu *rapl_pmu = event->pmu_private;
+ struct hw_perf_event *hwc = &event->hw;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
+
+ /* mark event as deactivated and stopped */
+ if (!(hwc->state & PERF_HES_STOPPED)) {
+ WARN_ON_ONCE(rapl_pmu->n_active <= 0);
+ rapl_pmu->n_active--;
+ if (rapl_pmu->n_active == 0)
+ hrtimer_cancel(&rapl_pmu->hrtimer);
+
+ list_del(&event->active_entry);
+
+ WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+ hwc->state |= PERF_HES_STOPPED;
+ }
+
+ /* check if update of sw counter is necessary */
+ if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+ /*
+ * Drain the remaining delta count out of an event
+ * that we are disabling:
+ */
+ rapl_event_update(event);
+ hwc->state |= PERF_HES_UPTODATE;
+ }
+
+ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
+}
+
+static int rapl_pmu_event_add(struct perf_event *event, int mode)
+{
+ struct rapl_pmu *rapl_pmu = event->pmu_private;
+ struct hw_perf_event *hwc = &event->hw;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
+
+ hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+ if (mode & PERF_EF_START)
+ __rapl_pmu_event_start(rapl_pmu, event);
+
+ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
+
+ return 0;
+}
+
+static void rapl_pmu_event_del(struct perf_event *event, int flags)
+{
+ rapl_pmu_event_stop(event, PERF_EF_UPDATE);
+}
+
+static int rapl_pmu_event_init(struct perf_event *event)
+{
+ u64 cfg = event->attr.config & RAPL_EVENT_MASK;
+ int bit, rapl_pmus_scope, ret = 0;
+ struct rapl_pmu *rapl_pmu;
+ unsigned int rapl_pmu_idx;
+ struct rapl_pmus *rapl_pmus;
+
+ /* only look at RAPL events */
+ if (event->attr.type != event->pmu->type)
+ return -ENOENT;
+
+ /* unsupported modes and filters */
+ if (event->attr.sample_period) /* no sampling */
+ return -EINVAL;
+
+ /* check only supported bits are set */
+ if (event->attr.config & ~RAPL_EVENT_MASK)
+ return -EINVAL;
+
+ if (event->cpu < 0)
+ return -EINVAL;
+
+ rapl_pmus = container_of(event->pmu, struct rapl_pmus, pmu);
+ if (!rapl_pmus)
+ return -EINVAL;
+ rapl_pmus_scope = rapl_pmus->pmu.scope;
+
+ if (rapl_pmus_scope == PERF_PMU_SCOPE_PKG || rapl_pmus_scope == PERF_PMU_SCOPE_DIE) {
+ cfg = array_index_nospec((long)cfg, NR_RAPL_PKG_DOMAINS + 1);
+ if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1)
+ return -EINVAL;
+
+ bit = cfg - 1;
+ event->hw.event_base = rapl_model->rapl_pkg_msrs[bit].msr;
+ } else if (rapl_pmus_scope == PERF_PMU_SCOPE_CORE) {
+ cfg = array_index_nospec((long)cfg, NR_RAPL_CORE_DOMAINS + 1);
+ if (!cfg || cfg >= NR_RAPL_CORE_DOMAINS + 1)
+ return -EINVAL;
+
+ bit = cfg - 1;
+ event->hw.event_base = rapl_model->rapl_core_msrs[bit].msr;
+ } else
+ return -EINVAL;
+
+ /* check event supported */
+ if (!(rapl_pmus->cntr_mask & (1 << bit)))
+ return -EINVAL;
+
+ rapl_pmu_idx = get_rapl_pmu_idx(event->cpu, rapl_pmus_scope);
+ if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu)
+ return -EINVAL;
+ /* must be done before validate_group */
+ rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx];
+ if (!rapl_pmu)
+ return -EINVAL;
+
+ event->pmu_private = rapl_pmu;
+ event->hw.config = cfg;
+ event->hw.idx = bit;
+
+ return ret;
+}
+
+static void rapl_pmu_event_read(struct perf_event *event)
+{
+ rapl_event_update(event);
+}
+
+RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
+RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02");
+RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03");
+RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04");
+RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05");
+RAPL_EVENT_ATTR_STR(energy-core, rapl_core, "event=0x01");
+
+RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
+RAPL_EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules");
+RAPL_EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules");
+RAPL_EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules");
+RAPL_EVENT_ATTR_STR(energy-psys.unit, rapl_psys_unit, "Joules");
+RAPL_EVENT_ATTR_STR(energy-core.unit, rapl_core_unit, "Joules");
+
+/*
+ * we compute in 0.23 nJ increments regardless of MSR
+ */
+RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_psys_scale, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-core.scale, rapl_core_scale, "2.3283064365386962890625e-10");
+
+/*
+ * There are no default events, but we need to create
+ * "events" group (with empty attrs) before updating
+ * it with detected events.
+ */
+static struct attribute *attrs_empty[] = {
+ NULL,
+};
+
+static struct attribute_group rapl_pmu_events_group = {
+ .name = "events",
+ .attrs = attrs_empty,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-7");
+static struct attribute *rapl_formats_attr[] = {
+ &format_attr_event.attr,
+ NULL,
+};
+
+static struct attribute_group rapl_pmu_format_group = {
+ .name = "format",
+ .attrs = rapl_formats_attr,
+};
+
+static const struct attribute_group *rapl_attr_groups[] = {
+ &rapl_pmu_format_group,
+ &rapl_pmu_events_group,
+ NULL,
+};
+
+static const struct attribute_group *rapl_core_attr_groups[] = {
+ &rapl_pmu_format_group,
+ &rapl_pmu_events_group,
+ NULL,
+};
+
+static struct attribute *rapl_events_cores[] = {
+ EVENT_PTR(rapl_cores),
+ EVENT_PTR(rapl_cores_unit),
+ EVENT_PTR(rapl_cores_scale),
+ NULL,
+};
+
+static struct attribute_group rapl_events_cores_group = {
+ .name = "events",
+ .attrs = rapl_events_cores,
+};
+
+static struct attribute *rapl_events_pkg[] = {
+ EVENT_PTR(rapl_pkg),
+ EVENT_PTR(rapl_pkg_unit),
+ EVENT_PTR(rapl_pkg_scale),
+ NULL,
+};
+
+static struct attribute_group rapl_events_pkg_group = {
+ .name = "events",
+ .attrs = rapl_events_pkg,
+};
+
+static struct attribute *rapl_events_ram[] = {
+ EVENT_PTR(rapl_ram),
+ EVENT_PTR(rapl_ram_unit),
+ EVENT_PTR(rapl_ram_scale),
+ NULL,
+};
+
+static struct attribute_group rapl_events_ram_group = {
+ .name = "events",
+ .attrs = rapl_events_ram,
+};
+
+static struct attribute *rapl_events_gpu[] = {
+ EVENT_PTR(rapl_gpu),
+ EVENT_PTR(rapl_gpu_unit),
+ EVENT_PTR(rapl_gpu_scale),
+ NULL,
+};
+
+static struct attribute_group rapl_events_gpu_group = {
+ .name = "events",
+ .attrs = rapl_events_gpu,
+};
+
+static struct attribute *rapl_events_psys[] = {
+ EVENT_PTR(rapl_psys),
+ EVENT_PTR(rapl_psys_unit),
+ EVENT_PTR(rapl_psys_scale),
+ NULL,
+};
+
+static struct attribute_group rapl_events_psys_group = {
+ .name = "events",
+ .attrs = rapl_events_psys,
+};
+
+static struct attribute *rapl_events_core[] = {
+ EVENT_PTR(rapl_core),
+ EVENT_PTR(rapl_core_unit),
+ EVENT_PTR(rapl_core_scale),
+ NULL,
+};
+
+static struct attribute_group rapl_events_core_group = {
+ .name = "events",
+ .attrs = rapl_events_core,
+};
+
+static bool test_msr(int idx, void *data)
+{
+ return test_bit(idx, (unsigned long *) data);
+}
+
+/* Only lower 32bits of the MSR represents the energy counter */
+#define RAPL_MSR_MASK 0xFFFFFFFF
+
+static struct perf_msr intel_rapl_msrs[] = {
+ [PERF_RAPL_PP0] = { MSR_PP0_ENERGY_STATUS, &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_PKG] = { MSR_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_RAM] = { MSR_DRAM_ENERGY_STATUS, &rapl_events_ram_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_PP1] = { MSR_PP1_ENERGY_STATUS, &rapl_events_gpu_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr, false, RAPL_MSR_MASK },
+};
+
+static struct perf_msr intel_rapl_spr_msrs[] = {
+ [PERF_RAPL_PP0] = { MSR_PP0_ENERGY_STATUS, &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_PKG] = { MSR_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_RAM] = { MSR_DRAM_ENERGY_STATUS, &rapl_events_ram_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_PP1] = { MSR_PP1_ENERGY_STATUS, &rapl_events_gpu_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr, true, RAPL_MSR_MASK },
+};
+
+/*
+ * Force to PERF_RAPL_PKG_EVENTS_MAX size due to:
+ * - perf_msr_probe(PERF_RAPL_PKG_EVENTS_MAX)
+ * - want to use same event codes across both architectures
+ */
+static struct perf_msr amd_rapl_pkg_msrs[] = {
+ [PERF_RAPL_PP0] = { 0, &rapl_events_cores_group, NULL, false, 0 },
+ [PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_RAM] = { 0, &rapl_events_ram_group, NULL, false, 0 },
+ [PERF_RAPL_PP1] = { 0, &rapl_events_gpu_group, NULL, false, 0 },
+ [PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 },
+};
+
+static struct perf_msr amd_rapl_core_msrs[] = {
+ [PERF_RAPL_CORE] = { MSR_AMD_CORE_ENERGY_STATUS, &rapl_events_core_group,
+ test_msr, false, RAPL_MSR_MASK },
+};
+
+static int rapl_check_hw_unit(void)
+{
+ u64 msr_rapl_power_unit_bits;
+ int i;
+
+ /* protect rdmsrq() to handle virtualization */
+ if (rdmsrq_safe(rapl_model->msr_power_unit, &msr_rapl_power_unit_bits))
+ return -1;
+ for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++)
+ rapl_pkg_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
+
+ rapl_core_hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
+
+ switch (rapl_model->unit_quirk) {
+ /*
+ * The DRAM domain on HSW servers and KNL has a fixed energy unit which can
+ * differ from the unit in the power unit MSR. See
+ * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2
+ * of 2. Datasheet, September 2014, Reference Number: 330784-001 "
+ */
+ case RAPL_UNIT_QUIRK_INTEL_HSW:
+ rapl_pkg_hw_unit[PERF_RAPL_RAM] = 16;
+ break;
+ /* SPR uses a fixed energy unit for Psys domain. */
+ case RAPL_UNIT_QUIRK_INTEL_SPR:
+ rapl_pkg_hw_unit[PERF_RAPL_PSYS] = 0;
+ break;
+ default:
+ break;
+ }
+
+ /*
+ * Calculate the timer rate:
+ * Use reference of 200W for scaling the timeout to avoid counter
+ * overflows. 200W = 200 Joules/sec
+ * Divide the interval by 2 to avoid lockstep (2 * 100);
+ * if the hw unit is 32, we keep the default of 2 ms (1/200/2 s)
+ */
+ rapl_timer_ms = 2;
+ if (rapl_pkg_hw_unit[0] < 32) {
+ rapl_timer_ms = (1000 / (2 * 100));
+ rapl_timer_ms *= (1ULL << (32 - rapl_pkg_hw_unit[0] - 1));
+ }
+ return 0;
+}
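Worked through for the common 2^-16 J unit: the 32-bit counter overflows after 2^32 * 2^-16 = 65536 J, roughly 327 s at the 200 W reference, so the timer is armed at half that, rapl_timer_ms = (1000 / 200) * 2^(32 - 16 - 1) = 163840 ms (about 164 s); the 2 ms default only applies when the unit is 2^-32 J or finer.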
+
+static void __init rapl_advertise(void)
+{
+ int i;
+ int num_counters = hweight32(rapl_pmus_pkg->cntr_mask);
+
+ if (rapl_pmus_core)
+ num_counters += hweight32(rapl_pmus_core->cntr_mask);
+
+ pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n",
+ num_counters, rapl_timer_ms);
+
+ for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) {
+ if (rapl_pmus_pkg->cntr_mask & (1 << i)) {
+ pr_info("hw unit of domain %s 2^-%d Joules\n",
+ rapl_pkg_domain_names[i], rapl_pkg_hw_unit[i]);
+ }
+ }
+
+ if (rapl_pmus_core && (rapl_pmus_core->cntr_mask & (1 << PERF_RAPL_CORE)))
+ pr_info("hw unit of domain %s 2^-%d Joules\n",
+ rapl_core_domain_name, rapl_core_hw_unit);
+}
+
+static void cleanup_rapl_pmus(struct rapl_pmus *rapl_pmus)
+{
+ int i;
+
+ for (i = 0; i < rapl_pmus->nr_rapl_pmu; i++)
+ kfree(rapl_pmus->rapl_pmu[i]);
+ kfree(rapl_pmus);
+}
+
+static const struct attribute_group *rapl_attr_update[] = {
+ &rapl_events_cores_group,
+ &rapl_events_pkg_group,
+ &rapl_events_ram_group,
+ &rapl_events_gpu_group,
+ &rapl_events_psys_group,
+ NULL,
+};
+
+static const struct attribute_group *rapl_core_attr_update[] = {
+ &rapl_events_core_group,
+ NULL,
+};
+
+static int __init init_rapl_pmu(struct rapl_pmus *rapl_pmus)
+{
+ struct rapl_pmu *rapl_pmu;
+ int idx;
+
+ for (idx = 0; idx < rapl_pmus->nr_rapl_pmu; idx++) {
+ rapl_pmu = kzalloc(sizeof(*rapl_pmu), GFP_KERNEL);
+ if (!rapl_pmu)
+ goto free;
+
+ raw_spin_lock_init(&rapl_pmu->lock);
+ INIT_LIST_HEAD(&rapl_pmu->active_list);
+ rapl_pmu->pmu = &rapl_pmus->pmu;
+ rapl_pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
+ rapl_hrtimer_init(rapl_pmu);
+
+ rapl_pmus->rapl_pmu[idx] = rapl_pmu;
+ }
+
+ return 0;
+free:
+ for (; idx > 0; idx--)
+ kfree(rapl_pmus->rapl_pmu[idx - 1]);
+ return -ENOMEM;
+}
+
+static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int rapl_pmu_scope,
+ const struct attribute_group **rapl_attr_groups,
+ const struct attribute_group **rapl_attr_update)
+{
+ int nr_rapl_pmu = topology_max_packages();
+ struct rapl_pmus *rapl_pmus;
+ int ret;
+
+ /*
+ * rapl_pmu_scope must be either PKG, DIE or CORE
+ */
+ if (rapl_pmu_scope == PERF_PMU_SCOPE_DIE)
+ nr_rapl_pmu *= topology_max_dies_per_package();
+ else if (rapl_pmu_scope == PERF_PMU_SCOPE_CORE)
+ nr_rapl_pmu *= topology_num_cores_per_package();
+ else if (rapl_pmu_scope != PERF_PMU_SCOPE_PKG)
+ return -EINVAL;
+
+ rapl_pmus = kzalloc(struct_size(rapl_pmus, rapl_pmu, nr_rapl_pmu), GFP_KERNEL);
+ if (!rapl_pmus)
+ return -ENOMEM;
+
+ *rapl_pmus_ptr = rapl_pmus;
+
+ rapl_pmus->nr_rapl_pmu = nr_rapl_pmu;
+ rapl_pmus->pmu.attr_groups = rapl_attr_groups;
+ rapl_pmus->pmu.attr_update = rapl_attr_update;
+ rapl_pmus->pmu.task_ctx_nr = perf_invalid_context;
+ rapl_pmus->pmu.event_init = rapl_pmu_event_init;
+ rapl_pmus->pmu.add = rapl_pmu_event_add;
+ rapl_pmus->pmu.del = rapl_pmu_event_del;
+ rapl_pmus->pmu.start = rapl_pmu_event_start;
+ rapl_pmus->pmu.stop = rapl_pmu_event_stop;
+ rapl_pmus->pmu.read = rapl_pmu_event_read;
+ rapl_pmus->pmu.scope = rapl_pmu_scope;
+ rapl_pmus->pmu.module = THIS_MODULE;
+ rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
+
+ ret = init_rapl_pmu(rapl_pmus);
+ if (ret)
+ kfree(rapl_pmus);
+
+ return ret;
+}
+
+static struct rapl_model model_snb = {
+ .pkg_events = BIT(PERF_RAPL_PP0) |
+ BIT(PERF_RAPL_PKG) |
+ BIT(PERF_RAPL_PP1),
+ .msr_power_unit = MSR_RAPL_POWER_UNIT,
+ .rapl_pkg_msrs = intel_rapl_msrs,
+};
+
+static struct rapl_model model_snbep = {
+ .pkg_events = BIT(PERF_RAPL_PP0) |
+ BIT(PERF_RAPL_PKG) |
+ BIT(PERF_RAPL_RAM),
+ .msr_power_unit = MSR_RAPL_POWER_UNIT,
+ .rapl_pkg_msrs = intel_rapl_msrs,
+};
+
+static struct rapl_model model_hsw = {
+ .pkg_events = BIT(PERF_RAPL_PP0) |
+ BIT(PERF_RAPL_PKG) |
+ BIT(PERF_RAPL_RAM) |
+ BIT(PERF_RAPL_PP1),
+ .msr_power_unit = MSR_RAPL_POWER_UNIT,
+ .rapl_pkg_msrs = intel_rapl_msrs,
+};
+
+static struct rapl_model model_hsx = {
+ .pkg_events = BIT(PERF_RAPL_PP0) |
+ BIT(PERF_RAPL_PKG) |
+ BIT(PERF_RAPL_RAM),
+ .unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW,
+ .msr_power_unit = MSR_RAPL_POWER_UNIT,
+ .rapl_pkg_msrs = intel_rapl_msrs,
+};
+
+static struct rapl_model model_knl = {
+ .pkg_events = BIT(PERF_RAPL_PKG) |
+ BIT(PERF_RAPL_RAM),
+ .unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW,
+ .msr_power_unit = MSR_RAPL_POWER_UNIT,
+ .rapl_pkg_msrs = intel_rapl_msrs,
+};
+
+static struct rapl_model model_skl = {
+ .pkg_events = BIT(PERF_RAPL_PP0) |
+ BIT(PERF_RAPL_PKG) |
+ BIT(PERF_RAPL_RAM) |
+ BIT(PERF_RAPL_PP1) |
+ BIT(PERF_RAPL_PSYS),
+ .msr_power_unit = MSR_RAPL_POWER_UNIT,
+ .rapl_pkg_msrs = intel_rapl_msrs,
+};
+
+static struct rapl_model model_spr = {
+ .pkg_events = BIT(PERF_RAPL_PP0) |
+ BIT(PERF_RAPL_PKG) |
+ BIT(PERF_RAPL_RAM) |
+ BIT(PERF_RAPL_PSYS),
+ .unit_quirk = RAPL_UNIT_QUIRK_INTEL_SPR,
+ .msr_power_unit = MSR_RAPL_POWER_UNIT,
+ .rapl_pkg_msrs = intel_rapl_spr_msrs,
+};
+
+static struct rapl_model model_amd_hygon = {
+ .pkg_events = BIT(PERF_RAPL_PKG),
+ .core_events = BIT(PERF_RAPL_CORE),
+ .msr_power_unit = MSR_AMD_RAPL_POWER_UNIT,
+ .rapl_pkg_msrs = amd_rapl_pkg_msrs,
+ .rapl_core_msrs = amd_rapl_core_msrs,
+};
+
+static const struct x86_cpu_id rapl_model_match[] __initconst = {
+ X86_MATCH_FEATURE(X86_FEATURE_RAPL, &model_amd_hygon),
+ X86_MATCH_VFM(INTEL_SANDYBRIDGE, &model_snb),
+ X86_MATCH_VFM(INTEL_SANDYBRIDGE_X, &model_snbep),
+ X86_MATCH_VFM(INTEL_IVYBRIDGE, &model_snb),
+ X86_MATCH_VFM(INTEL_IVYBRIDGE_X, &model_snbep),
+ X86_MATCH_VFM(INTEL_HASWELL, &model_hsw),
+ X86_MATCH_VFM(INTEL_HASWELL_X, &model_hsx),
+ X86_MATCH_VFM(INTEL_HASWELL_L, &model_hsw),
+ X86_MATCH_VFM(INTEL_HASWELL_G, &model_hsw),
+ X86_MATCH_VFM(INTEL_BROADWELL, &model_hsw),
+ X86_MATCH_VFM(INTEL_BROADWELL_G, &model_hsw),
+ X86_MATCH_VFM(INTEL_BROADWELL_X, &model_hsx),
+ X86_MATCH_VFM(INTEL_BROADWELL_D, &model_hsx),
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNL, &model_knl),
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNM, &model_knl),
+ X86_MATCH_VFM(INTEL_SKYLAKE_L, &model_skl),
+ X86_MATCH_VFM(INTEL_SKYLAKE, &model_skl),
+ X86_MATCH_VFM(INTEL_SKYLAKE_X, &model_hsx),
+ X86_MATCH_VFM(INTEL_KABYLAKE_L, &model_skl),
+ X86_MATCH_VFM(INTEL_KABYLAKE, &model_skl),
+ X86_MATCH_VFM(INTEL_CANNONLAKE_L, &model_skl),
+ X86_MATCH_VFM(INTEL_ATOM_GOLDMONT, &model_hsw),
+ X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_D, &model_hsw),
+ X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_PLUS, &model_hsw),
+ X86_MATCH_VFM(INTEL_ICELAKE_L, &model_skl),
+ X86_MATCH_VFM(INTEL_ICELAKE, &model_skl),
+ X86_MATCH_VFM(INTEL_ICELAKE_D, &model_hsx),
+ X86_MATCH_VFM(INTEL_ICELAKE_X, &model_hsx),
+ X86_MATCH_VFM(INTEL_COMETLAKE_L, &model_skl),
+ X86_MATCH_VFM(INTEL_COMETLAKE, &model_skl),
+ X86_MATCH_VFM(INTEL_TIGERLAKE_L, &model_skl),
+ X86_MATCH_VFM(INTEL_TIGERLAKE, &model_skl),
+ X86_MATCH_VFM(INTEL_ALDERLAKE, &model_skl),
+ X86_MATCH_VFM(INTEL_ALDERLAKE_L, &model_skl),
+ X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, &model_skl),
+ X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &model_spr),
+ X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &model_spr),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE, &model_skl),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_P, &model_skl),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_S, &model_skl),
+ X86_MATCH_VFM(INTEL_METEORLAKE, &model_skl),
+ X86_MATCH_VFM(INTEL_METEORLAKE_L, &model_skl),
+ X86_MATCH_VFM(INTEL_ARROWLAKE_H, &model_skl),
+ X86_MATCH_VFM(INTEL_ARROWLAKE, &model_skl),
+ X86_MATCH_VFM(INTEL_ARROWLAKE_U, &model_skl),
+ X86_MATCH_VFM(INTEL_LUNARLAKE_M, &model_skl),
+ {},
+};
+MODULE_DEVICE_TABLE(x86cpu, rapl_model_match);
+
+static int __init rapl_pmu_init(void)
+{
+ const struct x86_cpu_id *id;
+ int rapl_pkg_pmu_scope = PERF_PMU_SCOPE_DIE;
+ int ret;
+
+ if (rapl_pkg_pmu_is_pkg_scope())
+ rapl_pkg_pmu_scope = PERF_PMU_SCOPE_PKG;
+
+ id = x86_match_cpu(rapl_model_match);
+ if (!id)
+ return -ENODEV;
+
+ rapl_model = (struct rapl_model *) id->driver_data;
+
+ ret = rapl_check_hw_unit();
+ if (ret)
+ return ret;
+
+ ret = init_rapl_pmus(&rapl_pmus_pkg, rapl_pkg_pmu_scope, rapl_attr_groups,
+ rapl_attr_update);
+ if (ret)
+ return ret;
+
+ rapl_pmus_pkg->cntr_mask = perf_msr_probe(rapl_model->rapl_pkg_msrs,
+ PERF_RAPL_PKG_EVENTS_MAX, false,
+ (void *) &rapl_model->pkg_events);
+
+ ret = perf_pmu_register(&rapl_pmus_pkg->pmu, "power", -1);
+ if (ret)
+ goto out;
+
+ if (rapl_model->core_events) {
+ ret = init_rapl_pmus(&rapl_pmus_core, PERF_PMU_SCOPE_CORE,
+ rapl_core_attr_groups,
+ rapl_core_attr_update);
+ if (ret) {
+ pr_warn("power-core PMU initialization failed (%d)\n", ret);
+ goto core_init_failed;
+ }
+
+ rapl_pmus_core->cntr_mask = perf_msr_probe(rapl_model->rapl_core_msrs,
+ PERF_RAPL_CORE_EVENTS_MAX, false,
+ (void *) &rapl_model->core_events);
+
+ ret = perf_pmu_register(&rapl_pmus_core->pmu, "power_core", -1);
+ if (ret) {
+ pr_warn("power-core PMU registration failed (%d)\n", ret);
+ cleanup_rapl_pmus(rapl_pmus_core);
+ }
+ }
+
+core_init_failed:
+ rapl_advertise();
+ return 0;
+
+out:
+ pr_warn("Initialization failed (%d), disabled\n", ret);
+ cleanup_rapl_pmus(rapl_pmus_pkg);
+ return ret;
+}
+module_init(rapl_pmu_init);
+
+static void __exit intel_rapl_exit(void)
+{
+ if (rapl_pmus_core) {
+ perf_pmu_unregister(&rapl_pmus_core->pmu);
+ cleanup_rapl_pmus(rapl_pmus_core);
+ }
+ perf_pmu_unregister(&rapl_pmus_pkg->pmu);
+ cleanup_rapl_pmus(rapl_pmus_pkg);
+}
+module_exit(intel_rapl_exit);
diff --git a/arch/x86/events/utils.c b/arch/x86/events/utils.c
new file mode 100644
index 000000000000..77fd00b3305e
--- /dev/null
+++ b/arch/x86/events/utils.c
@@ -0,0 +1,253 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <asm/insn.h>
+#include <linux/mm.h>
+
+#include <asm/msr.h>
+#include "perf_event.h"
+
+static int decode_branch_type(struct insn *insn)
+{
+ int ext;
+
+ if (insn_get_opcode(insn))
+ return X86_BR_ABORT;
+
+ switch (insn->opcode.bytes[0]) {
+ case 0xf:
+ switch (insn->opcode.bytes[1]) {
+ case 0x05: /* syscall */
+ case 0x34: /* sysenter */
+ return X86_BR_SYSCALL;
+ case 0x07: /* sysret */
+ case 0x35: /* sysexit */
+ return X86_BR_SYSRET;
+ case 0x80 ... 0x8f: /* conditional */
+ return X86_BR_JCC;
+ }
+ return X86_BR_NONE;
+ case 0x70 ... 0x7f: /* conditional */
+ return X86_BR_JCC;
+ case 0xc2: /* near ret */
+ case 0xc3: /* near ret */
+ case 0xca: /* far ret */
+ case 0xcb: /* far ret */
+ return X86_BR_RET;
+ case 0xcf: /* iret */
+ return X86_BR_IRET;
+ case 0xcc ... 0xce: /* int */
+ return X86_BR_INT;
+ case 0xe8: /* call near rel */
+ if (insn_get_immediate(insn) || insn->immediate1.value == 0) {
+ /* zero length call */
+ return X86_BR_ZERO_CALL;
+ }
+ fallthrough;
+ case 0x9a: /* call far absolute */
+ return X86_BR_CALL;
+ case 0xe0 ... 0xe3: /* loop jmp */
+ return X86_BR_JCC;
+ case 0xe9 ... 0xeb: /* jmp */
+ return X86_BR_JMP;
+ case 0xff: /* call near absolute, call far absolute ind */
+ if (insn_get_modrm(insn))
+ return X86_BR_ABORT;
+
+ ext = (insn->modrm.bytes[0] >> 3) & 0x7;
+ switch (ext) {
+ case 2: /* near ind call */
+ case 3: /* far ind call */
+ return X86_BR_IND_CALL;
+ case 4:
+ case 5:
+ return X86_BR_IND_JMP;
+ }
+ return X86_BR_NONE;
+ }
+
+ return X86_BR_NONE;
+}
+
+/*
+ * Return the type of control flow change at address "from".
+ * The instruction is not necessarily a branch (in case of interrupt).
+ *
+ * The branch type returned also includes the priv level of the
+ * target of the control flow change (X86_BR_USER, X86_BR_KERNEL).
+ *
+ * If a branch type is unknown OR the instruction cannot be
+ * decoded (e.g., text page not present), then X86_BR_NONE is
+ * returned.
+ *
+ * While recording branches, some processors can report the "from"
+ * address to be that of an instruction preceding the actual branch
+ * when instruction fusion occurs. If fusion is expected, attempt to
+ * find the type of the first branch instruction within the next
+ * MAX_INSN_SIZE bytes and if found, provide the offset between the
+ * reported "from" address and the actual branch instruction address.
+ */
+static int get_branch_type(unsigned long from, unsigned long to, int abort,
+ bool fused, int *offset)
+{
+ struct insn insn;
+ void *addr;
+ int bytes_read, bytes_left, insn_offset;
+ int ret = X86_BR_NONE;
+ int to_plm, from_plm;
+ u8 buf[MAX_INSN_SIZE];
+ int is64 = 0;
+
+ /* make sure we initialize offset */
+ if (offset)
+ *offset = 0;
+
+ to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER;
+ from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER;
+
+ /*
+ * from/to may be zero if the LBR did not fill up after a reset by the
+ * time we get a PMU interrupt
+ */
+ if (from == 0 || to == 0)
+ return X86_BR_NONE;
+
+ if (abort)
+ return X86_BR_ABORT | to_plm;
+
+ if (from_plm == X86_BR_USER) {
+ /*
+ * can happen if measuring at the user level only
+ * and we interrupt in a kernel thread, e.g., idle.
+ */
+ if (!current->mm)
+ return X86_BR_NONE;
+
+ /* may fail if text not present */
+ bytes_left = copy_from_user_nmi(buf, (void __user *)from,
+ MAX_INSN_SIZE);
+ bytes_read = MAX_INSN_SIZE - bytes_left;
+ if (!bytes_read)
+ return X86_BR_NONE;
+
+ addr = buf;
+ } else {
+ /*
+ * The LBR logs any address in the IP, even if the IP just
+ * faulted. This means userspace can control the from address.
+ * Ensure we don't blindly read any address by validating it is
+ * a known text address and not a vsyscall address.
+ */
+ if (kernel_text_address(from) && !in_gate_area_no_mm(from)) {
+ addr = (void *)from;
+ /*
+ * Assume we can get the maximum possible size
+ * when grabbing kernel data. This is not
+ * _strictly_ true since we could possibly be
+ * executing up next to a memory hole, but
+ * it is very unlikely to be a problem.
+ */
+ bytes_read = MAX_INSN_SIZE;
+ } else {
+ return X86_BR_NONE;
+ }
+ }
+
+ /*
+ * decoder needs to know the ABI especially
+ * on 64-bit systems running 32-bit apps
+ */
+#ifdef CONFIG_X86_64
+ is64 = kernel_ip((unsigned long)addr) || any_64bit_mode(current_pt_regs());
+#endif
+ insn_init(&insn, addr, bytes_read, is64);
+ ret = decode_branch_type(&insn);
+ insn_offset = 0;
+
+ /* Check for the possibility of branch fusion */
+ while (fused && ret == X86_BR_NONE) {
+ /* Check for decoding errors */
+ if (insn_get_length(&insn) || !insn.length)
+ break;
+
+ insn_offset += insn.length;
+ bytes_read -= insn.length;
+ if (bytes_read < 0)
+ break;
+
+ insn_init(&insn, addr + insn_offset, bytes_read, is64);
+ ret = decode_branch_type(&insn);
+ }
+
+ if (offset)
+ *offset = insn_offset;
+
+ /*
+ * interrupts, traps, faults (and thus ring transitions) may
+ * occur on any instruction. Thus, to classify them correctly,
+ * we need to first look at the from and to priv levels. If they
+ * are different and to is in the kernel, then it indicates
+ * a ring transition. If the from instruction is not a ring
+ * transition instr (syscall, sysenter, int), then it means
+ * it was an irq, trap or fault.
+ *
+ * We have no way of detecting kernel to kernel faults.
+ */
+ if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL
+ && ret != X86_BR_SYSCALL && ret != X86_BR_INT)
+ ret = X86_BR_IRQ;
+
+ /*
+ * branch priv level determined by target as
+ * is done by HW when LBR_SELECT is implemented
+ */
+ if (ret != X86_BR_NONE)
+ ret |= to_plm;
+
+ return ret;
+}
+
+int branch_type(unsigned long from, unsigned long to, int abort)
+{
+ return get_branch_type(from, to, abort, false, NULL);
+}
+
+int branch_type_fused(unsigned long from, unsigned long to, int abort,
+ int *offset)
+{
+ return get_branch_type(from, to, abort, true, offset);
+}
+
+#define X86_BR_TYPE_MAP_MAX 16
+
+static int branch_map[X86_BR_TYPE_MAP_MAX] = {
+ PERF_BR_CALL, /* X86_BR_CALL */
+ PERF_BR_RET, /* X86_BR_RET */
+ PERF_BR_SYSCALL, /* X86_BR_SYSCALL */
+ PERF_BR_SYSRET, /* X86_BR_SYSRET */
+ PERF_BR_UNKNOWN, /* X86_BR_INT */
+ PERF_BR_ERET, /* X86_BR_IRET */
+ PERF_BR_COND, /* X86_BR_JCC */
+ PERF_BR_UNCOND, /* X86_BR_JMP */
+ PERF_BR_IRQ, /* X86_BR_IRQ */
+ PERF_BR_IND_CALL, /* X86_BR_IND_CALL */
+ PERF_BR_UNKNOWN, /* X86_BR_ABORT */
+ PERF_BR_UNKNOWN, /* X86_BR_IN_TX */
+ PERF_BR_NO_TX, /* X86_BR_NO_TX */
+ PERF_BR_CALL, /* X86_BR_ZERO_CALL */
+ PERF_BR_UNKNOWN, /* X86_BR_CALL_STACK */
+ PERF_BR_IND, /* X86_BR_IND_JMP */
+};
+
+int common_branch_type(int type)
+{
+ int i;
+
+ type >>= 2; /* skip X86_BR_USER and X86_BR_KERNEL */
+
+ if (type) {
+ i = __ffs(type);
+ if (i < X86_BR_TYPE_MAP_MAX)
+ return branch_map[i];
+ }
+
+ return PERF_BR_UNKNOWN;
+}
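For example, a conditional branch whose target is in user space is classified as X86_BR_JCC | X86_BR_USER = (1 << 8) | (1 << 0); the >> 2 drops the two privilege bits, __ffs() then returns 6, and branch_map[6] maps the entry to PERF_BR_COND.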
diff --git a/arch/x86/events/zhaoxin/Makefile b/arch/x86/events/zhaoxin/Makefile
new file mode 100644
index 000000000000..642c1174d662
--- /dev/null
+++ b/arch/x86/events/zhaoxin/Makefile
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-y += core.o
diff --git a/arch/x86/events/zhaoxin/core.c b/arch/x86/events/zhaoxin/core.c
new file mode 100644
index 000000000000..4bdfcf091200
--- /dev/null
+++ b/arch/x86/events/zhaoxin/core.c
@@ -0,0 +1,619 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Zhaoxin PMU; like Intel Architectural PerfMon-v2
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/stddef.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/nmi.h>
+
+#include <asm/cpufeature.h>
+#include <asm/hardirq.h>
+#include <asm/apic.h>
+#include <asm/msr.h>
+
+#include "../perf_event.h"
+
+/*
+ * Zhaoxin PerfMon, used on zxc and later.
+ */
+static u64 zx_pmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = {
+
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x0082,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0515,
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x051a,
+ [PERF_COUNT_HW_BUS_CYCLES] = 0x0083,
+};
+
+static struct event_constraint zxc_event_constraints[] __read_mostly = {
+
+ FIXED_EVENT_CONSTRAINT(0x0082, 1), /* unhalted core clock cycles */
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint zxd_event_constraints[] __read_mostly = {
+
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* retired instructions */
+ FIXED_EVENT_CONSTRAINT(0x0082, 1), /* unhalted core clock cycles */
+ FIXED_EVENT_CONSTRAINT(0x0083, 2), /* unhalted bus clock cycles */
+ EVENT_CONSTRAINT_END
+};
+
+static __initconst const u64 zxd_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+[C(L1D)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x0042,
+ [C(RESULT_MISS)] = 0x0538,
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = 0x0043,
+ [C(RESULT_MISS)] = 0x0562,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+},
+[C(L1I)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x0300,
+ [C(RESULT_MISS)] = 0x0301,
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = 0x030a,
+ [C(RESULT_MISS)] = 0x030b,
+ },
+},
+[C(LL)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+},
+[C(DTLB)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x0042,
+ [C(RESULT_MISS)] = 0x052c,
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = 0x0043,
+ [C(RESULT_MISS)] = 0x0530,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = 0x0564,
+ [C(RESULT_MISS)] = 0x0565,
+ },
+},
+[C(ITLB)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x00c0,
+ [C(RESULT_MISS)] = 0x0534,
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+},
+[C(BPU)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x0700,
+ [C(RESULT_MISS)] = 0x0709,
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+},
+[C(NODE)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+},
+};
+
+static __initconst const u64 zxe_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+[C(L1D)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x0568,
+ [C(RESULT_MISS)] = 0x054b,
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = 0x0669,
+ [C(RESULT_MISS)] = 0x0562,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+},
+[C(L1I)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x0300,
+ [C(RESULT_MISS)] = 0x0301,
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = 0x030a,
+ [C(RESULT_MISS)] = 0x030b,
+ },
+},
+[C(LL)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x0,
+ [C(RESULT_MISS)] = 0x0,
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = 0x0,
+ [C(RESULT_MISS)] = 0x0,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = 0x0,
+ [C(RESULT_MISS)] = 0x0,
+ },
+},
+[C(DTLB)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x0568,
+ [C(RESULT_MISS)] = 0x052c,
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = 0x0669,
+ [C(RESULT_MISS)] = 0x0530,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = 0x0564,
+ [C(RESULT_MISS)] = 0x0565,
+ },
+},
+[C(ITLB)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x00c0,
+ [C(RESULT_MISS)] = 0x0534,
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+},
+[C(BPU)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x0028,
+ [C(RESULT_MISS)] = 0x0029,
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+},
+[C(NODE)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+},
+};
+
+static void zhaoxin_pmu_disable_all(void)
+{
+ wrmsrq(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+}
+
+static void zhaoxin_pmu_enable_all(int added)
+{
+ wrmsrq(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
+}
+
+static inline u64 zhaoxin_pmu_get_status(void)
+{
+ u64 status;
+
+ rdmsrq(MSR_CORE_PERF_GLOBAL_STATUS, status);
+
+ return status;
+}
+
+static inline void zhaoxin_pmu_ack_status(u64 ack)
+{
+ wrmsrq(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
+}
+
+static inline void zxc_pmu_ack_status(u64 ack)
+{
+ /*
+ * ZXC needs global control enabled in order to clear status bits.
+ */
+ zhaoxin_pmu_enable_all(0);
+ zhaoxin_pmu_ack_status(ack);
+ zhaoxin_pmu_disable_all();
+}
+
+static void zhaoxin_pmu_disable_fixed(struct hw_perf_event *hwc)
+{
+ int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
+ u64 ctrl_val, mask;
+
+ mask = 0xfULL << (idx * 4);
+
+ rdmsrq(hwc->config_base, ctrl_val);
+ ctrl_val &= ~mask;
+ wrmsrq(hwc->config_base, ctrl_val);
+}
+
+static void zhaoxin_pmu_disable_event(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+ zhaoxin_pmu_disable_fixed(hwc);
+ return;
+ }
+
+ x86_pmu_disable_event(event);
+}
+
+static void zhaoxin_pmu_enable_fixed(struct hw_perf_event *hwc)
+{
+ int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
+ u64 ctrl_val, bits, mask;
+
+ /*
+ * Enable IRQ generation (0x8),
+ * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
+ * if requested:
+ */
+ bits = 0x8ULL;
+ if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
+ bits |= 0x2;
+ if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
+ bits |= 0x1;
+
+ bits <<= (idx * 4);
+ mask = 0xfULL << (idx * 4);
+
+ rdmsrq(hwc->config_base, ctrl_val);
+ ctrl_val &= ~mask;
+ ctrl_val |= bits;
+ wrmsrq(hwc->config_base, ctrl_val);
+}
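A hedged aside on the fixed-counter control layout used in the function above (one 4-bit field per fixed counter: bit 0 ring-0, bit 1 ring-3, bit 3 PMI). The stand-alone user-space sketch below is not part of the patch; it only reproduces the bits/mask arithmetic for an assumed counter index:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int idx = 1;				/* fixed counter 1, assumed for the example */
	uint64_t bits = 0x8ULL | 0x2 | 0x1;	/* PMI + ring-3 + ring-0, as in the code above */
	uint64_t mask = 0xfULL << (idx * 4);

	bits <<= idx * 4;
	printf("ctrl |= %#llx (field mask %#llx)\n",
	       (unsigned long long)bits, (unsigned long long)mask);
	return 0;				/* prints: ctrl |= 0xb0 (field mask 0xf0) */
}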
+
+static void zhaoxin_pmu_enable_event(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+ zhaoxin_pmu_enable_fixed(hwc);
+ return;
+ }
+
+ __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
+}
+
+/*
+ * This handler is triggered by the local APIC, so the APIC IRQ handling
+ * rules apply:
+ */
+static int zhaoxin_pmu_handle_irq(struct pt_regs *regs)
+{
+ struct perf_sample_data data;
+ struct cpu_hw_events *cpuc;
+ int handled = 0;
+ u64 status;
+ int bit;
+
+ cpuc = this_cpu_ptr(&cpu_hw_events);
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ zhaoxin_pmu_disable_all();
+ status = zhaoxin_pmu_get_status();
+ if (!status)
+ goto done;
+
+again:
+ if (x86_pmu.enabled_ack)
+ zxc_pmu_ack_status(status);
+ else
+ zhaoxin_pmu_ack_status(status);
+
+ inc_irq_stat(apic_perf_irqs);
+
+ /*
+ * CondChgd bit 63 doesn't mean any overflow status. Ignore
+ * and clear the bit.
+ */
+ if (__test_and_clear_bit(63, (unsigned long *)&status)) {
+ if (!status)
+ goto done;
+ }
+
+ for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
+ struct perf_event *event = cpuc->events[bit];
+
+ handled++;
+
+ if (!test_bit(bit, cpuc->active_mask))
+ continue;
+
+ x86_perf_event_update(event);
+ perf_sample_data_init(&data, 0, event->hw.last_period);
+
+ if (!x86_perf_event_set_period(event))
+ continue;
+
+ perf_event_overflow(event, &data, regs);
+ }
+
+ /*
+ * Repeat if there is more work to be done:
+ */
+ status = zhaoxin_pmu_get_status();
+ if (status)
+ goto again;
+
+done:
+ zhaoxin_pmu_enable_all(0);
+ return handled;
+}
+
+static u64 zhaoxin_pmu_event_map(int hw_event)
+{
+ return zx_pmon_event_map[hw_event];
+}
+
+static struct event_constraint *
+zhaoxin_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct event_constraint *c;
+
+ if (x86_pmu.event_constraints) {
+ for_each_event_constraint(c, x86_pmu.event_constraints) {
+ if ((event->hw.config & c->cmask) == c->code)
+ return c;
+ }
+ }
+
+ return &unconstrained;
+}
+
+PMU_FORMAT_ATTR(event, "config:0-7");
+PMU_FORMAT_ATTR(umask, "config:8-15");
+PMU_FORMAT_ATTR(edge, "config:18");
+PMU_FORMAT_ATTR(inv, "config:23");
+PMU_FORMAT_ATTR(cmask, "config:24-31");
+
+static struct attribute *zx_arch_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_cmask.attr,
+ NULL,
+};
+
+static ssize_t zhaoxin_event_sysfs_show(char *page, u64 config)
+{
+ u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT);
+
+ return x86_event_sysfs_show(page, config, event);
+}
+
+static const struct x86_pmu zhaoxin_pmu __initconst = {
+ .name = "zhaoxin",
+ .handle_irq = zhaoxin_pmu_handle_irq,
+ .disable_all = zhaoxin_pmu_disable_all,
+ .enable_all = zhaoxin_pmu_enable_all,
+ .enable = zhaoxin_pmu_enable_event,
+ .disable = zhaoxin_pmu_disable_event,
+ .hw_config = x86_pmu_hw_config,
+ .schedule_events = x86_schedule_events,
+ .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
+ .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
+ .event_map = zhaoxin_pmu_event_map,
+ .max_events = ARRAY_SIZE(zx_pmon_event_map),
+ .apic = 1,
+ /*
+ * For zxd/zxe, read/write operations on the PMCx MSRs are 48 bits wide.
+ */
+ .max_period = (1ULL << 47) - 1,
+ .get_event_constraints = zhaoxin_get_event_constraints,
+
+ .format_attrs = zx_arch_formats_attr,
+ .events_sysfs_show = zhaoxin_event_sysfs_show,
+};
+
+static const struct { int id; char *name; } zx_arch_events_map[] __initconst = {
+ { PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" },
+ { PERF_COUNT_HW_INSTRUCTIONS, "instructions" },
+ { PERF_COUNT_HW_BUS_CYCLES, "bus cycles" },
+ { PERF_COUNT_HW_CACHE_REFERENCES, "cache references" },
+ { PERF_COUNT_HW_CACHE_MISSES, "cache misses" },
+ { PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" },
+ { PERF_COUNT_HW_BRANCH_MISSES, "branch misses" },
+};
+
+static __init void zhaoxin_arch_events_quirk(void)
+{
+ int bit;
+
+ /* disable events reported as not present by CPUID */
+ for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(zx_arch_events_map)) {
+ zx_pmon_event_map[zx_arch_events_map[bit].id] = 0;
+ pr_warn("CPUID marked event: \'%s\' unavailable\n",
+ zx_arch_events_map[bit].name);
+ }
+}
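For context, the events_mask walked above is filled from CPUID leaf 0xA EBX in zhaoxin_pmu_init() below, where a set bit means the corresponding architectural event is not available. A minimal kernel-style sketch of that check follows; the helper name is made up here for illustration and is not part of the patch:

#include <linux/types.h>
#include <linux/bits.h>
#include <asm/processor.h>

/* Illustrative only: a set EBX bit in CPUID leaf 0xA marks an event as absent */
static bool zx_arch_event_unavailable(int bit)
{
	unsigned int eax, ebx, ecx, edx;

	cpuid(0xa, &eax, &ebx, &ecx, &edx);
	return ebx & BIT(bit);
}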
+
+__init int zhaoxin_pmu_init(void)
+{
+ union cpuid10_edx edx;
+ union cpuid10_eax eax;
+ union cpuid10_ebx ebx;
+ struct event_constraint *c;
+ unsigned int unused;
+ int version;
+
+ pr_info("Welcome to zhaoxin pmu!\n");
+
+ /*
+ * Check whether the Architectural PerfMon supports
+ * hw_event or not.
+ */
+ cpuid(10, &eax.full, &ebx.full, &unused, &edx.full);
+
+ if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT - 1)
+ return -ENODEV;
+
+ version = eax.split.version_id;
+ if (version != 2)
+ return -ENODEV;
+
+ x86_pmu = zhaoxin_pmu;
+ pr_info("Version check pass!\n");
+
+ x86_pmu.version = version;
+ x86_pmu.cntr_mask64 = GENMASK_ULL(eax.split.num_counters - 1, 0);
+ x86_pmu.cntval_bits = eax.split.bit_width;
+ x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1;
+ x86_pmu.events_maskl = ebx.full;
+ x86_pmu.events_mask_len = eax.split.mask_length;
+
+ x86_pmu.fixed_cntr_mask64 = GENMASK_ULL(edx.split.num_counters_fixed - 1, 0);
+ x86_add_quirk(zhaoxin_arch_events_quirk);
+
+ switch (boot_cpu_data.x86) {
+ case 0x06:
+ /*
+ * Support Zhaoxin CPU from ZXC series, exclude Nano series through FMS.
+ * Nano FMS: Family=6, Model=F, Stepping=[0-A][C-D]
+ * ZXC FMS: Family=6, Model=F, Stepping=E-F OR Family=6, Model=0x19, Stepping=0-3
+ */
+ if ((boot_cpu_data.x86_model == 0x0f && boot_cpu_data.x86_stepping >= 0x0e) ||
+ boot_cpu_data.x86_model == 0x19) {
+
+ x86_pmu.max_period = x86_pmu.cntval_mask >> 1;
+
+ /* Clearing status works only if the global control is enabled on zxc. */
+ x86_pmu.enabled_ack = 1;
+
+ x86_pmu.event_constraints = zxc_event_constraints;
+ zx_pmon_event_map[PERF_COUNT_HW_INSTRUCTIONS] = 0;
+ zx_pmon_event_map[PERF_COUNT_HW_CACHE_REFERENCES] = 0;
+ zx_pmon_event_map[PERF_COUNT_HW_CACHE_MISSES] = 0;
+ zx_pmon_event_map[PERF_COUNT_HW_BUS_CYCLES] = 0;
+
+ pr_cont("ZXC events, ");
+ break;
+ }
+ return -ENODEV;
+
+ case 0x07:
+ zx_pmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+ X86_CONFIG(.event = 0x01, .umask = 0x01, .inv = 0x01, .cmask = 0x01);
+
+ zx_pmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+ X86_CONFIG(.event = 0x0f, .umask = 0x04, .inv = 0, .cmask = 0);
+
+ switch (boot_cpu_data.x86_model) {
+ case 0x1b:
+ memcpy(hw_cache_event_ids, zxd_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ x86_pmu.event_constraints = zxd_event_constraints;
+
+ zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0700;
+ zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x0709;
+
+ pr_cont("ZXD events, ");
+ break;
+ case 0x3b:
+ memcpy(hw_cache_event_ids, zxe_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ x86_pmu.event_constraints = zxd_event_constraints;
+
+ zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0028;
+ zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x0029;
+
+ pr_cont("ZXE events, ");
+ break;
+ default:
+ return -ENODEV;
+ }
+ break;
+
+ default:
+ return -ENODEV;
+ }
+
+ x86_pmu.intel_ctrl = x86_pmu.cntr_mask64;
+ x86_pmu.intel_ctrl |= x86_pmu.fixed_cntr_mask64 << INTEL_PMC_IDX_FIXED;
+
+ if (x86_pmu.event_constraints) {
+ for_each_event_constraint(c, x86_pmu.event_constraints) {
+ c->idxmsk64 |= x86_pmu.cntr_mask64;
+ c->weight += x86_pmu_num_counters(NULL);
+ }
+ }
+
+ return 0;
+}
+
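Before the Hyper-V files, a hedged note on how the raw ids in the tables above relate to the format attributes declared earlier (event = config:0-7, umask = config:8-15): the ZXE L1D OP_READ RESULT_MISS entry 0x054b, for example, is event 0x4b with umask 0x05. A minimal sketch, not part of the patch:

#include <stdint.h>

/* Illustrative only: compose a raw config value per the format attributes above */
static inline uint64_t zx_raw_config(uint8_t event, uint8_t umask)
{
	return (uint64_t)event | ((uint64_t)umask << 8);
}

/* zx_raw_config(0x4b, 0x05) == 0x054b, the ZXE L1D read-miss table entry */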
diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile
new file mode 100644
index 000000000000..56292102af62
--- /dev/null
+++ b/arch/x86/hyperv/Makefile
@@ -0,0 +1,22 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-y := hv_init.o mmu.o nested.o irqdomain.o ivm.o
+obj-$(CONFIG_X86_64) += hv_apic.o
+obj-$(CONFIG_HYPERV_VTL_MODE) += hv_vtl.o mshv_vtl_asm.o
+
+$(obj)/mshv_vtl_asm.o: $(obj)/mshv-asm-offsets.h
+
+$(obj)/mshv-asm-offsets.h: $(obj)/mshv-asm-offsets.s FORCE
+ $(call filechk,offsets,__MSHV_ASM_OFFSETS_H__)
+
+ifdef CONFIG_X86_64
+obj-$(CONFIG_PARAVIRT_SPINLOCKS) += hv_spinlock.o
+
+ ifdef CONFIG_MSHV_ROOT
+ CFLAGS_REMOVE_hv_trampoline.o += -pg
+ CFLAGS_hv_trampoline.o += -fno-stack-protector
+ obj-$(CONFIG_CRASH_DUMP) += hv_crash.o hv_trampoline.o
+ endif
+endif
+
+targets += mshv-asm-offsets.s
+clean-files += mshv-asm-offsets.h
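The mshv-asm-offsets.h rule above follows the usual kbuild asm-offsets pattern: a small C file is compiled to assembly and the filechk "offsets" helper turns the markers it emits into a generated header. The real mshv-asm-offsets.c is not part of this hunk, so the sketch below is only a hypothetical illustration of the pattern; the symbol name and value are made up:

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical mshv-asm-offsets.c sketch -- names and values are illustrative */
#include <linux/kbuild.h>

static void __used common(void)
{
	/* DEFINE() emits "#define MSHV_EXAMPLE_CONST 8" into the generated header */
	DEFINE(MSHV_EXAMPLE_CONST, 8);
}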
diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
new file mode 100644
index 000000000000..a8de503def37
--- /dev/null
+++ b/arch/x86/hyperv/hv_apic.c
@@ -0,0 +1,341 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Hyper-V specific APIC code.
+ *
+ * Copyright (C) 2018, Microsoft, Inc.
+ *
+ * Author : K. Y. Srinivasan <kys@microsoft.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/clockchips.h>
+#include <linux/slab.h>
+#include <linux/cpuhotplug.h>
+#include <asm/hypervisor.h>
+#include <asm/mshyperv.h>
+#include <asm/apic.h>
+#include <asm/msr.h>
+
+#include <asm/trace/hyperv.h>
+
+static struct apic orig_apic;
+
+static u64 hv_apic_icr_read(void)
+{
+ u64 reg_val;
+
+ rdmsrq(HV_X64_MSR_ICR, reg_val);
+ return reg_val;
+}
+
+static void hv_apic_icr_write(u32 low, u32 id)
+{
+ u64 reg_val;
+
+ reg_val = SET_XAPIC_DEST_FIELD(id);
+ reg_val = reg_val << 32;
+ reg_val |= low;
+
+ wrmsrq(HV_X64_MSR_ICR, reg_val);
+}
+
+void hv_enable_coco_interrupt(unsigned int cpu, unsigned int vector, bool set)
+{
+ apic_update_vector(cpu, vector, set);
+}
+
+static u32 hv_apic_read(u32 reg)
+{
+ u32 reg_val, hi;
+
+ switch (reg) {
+ case APIC_EOI:
+ rdmsr(HV_X64_MSR_EOI, reg_val, hi);
+ (void)hi;
+ return reg_val;
+ case APIC_TASKPRI:
+ rdmsr(HV_X64_MSR_TPR, reg_val, hi);
+ (void)hi;
+ return reg_val;
+
+ default:
+ return native_apic_mem_read(reg);
+ }
+}
+
+static void hv_apic_write(u32 reg, u32 val)
+{
+ switch (reg) {
+ case APIC_EOI:
+ wrmsrq(HV_X64_MSR_EOI, val);
+ break;
+ case APIC_TASKPRI:
+ wrmsrq(HV_X64_MSR_TPR, val);
+ break;
+ default:
+ native_apic_mem_write(reg, val);
+ }
+}
+
+static void hv_apic_eoi_write(void)
+{
+ struct hv_vp_assist_page *hvp = hv_vp_assist_page[smp_processor_id()];
+
+ if (hvp && (xchg(&hvp->apic_assist, 0) & 0x1))
+ return;
+
+ wrmsrq(HV_X64_MSR_EOI, APIC_EOI_ACK);
+}
+
+static bool cpu_is_self(int cpu)
+{
+ return cpu == smp_processor_id();
+}
+
+/*
+ * IPI implementation on Hyper-V.
+ */
+static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector,
+ bool exclude_self)
+{
+ struct hv_send_ipi_ex *ipi_arg;
+ unsigned long flags;
+ int nr_bank = 0;
+ u64 status = HV_STATUS_INVALID_PARAMETER;
+
+ if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
+ return false;
+
+ local_irq_save(flags);
+ ipi_arg = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ if (unlikely(!ipi_arg))
+ goto ipi_mask_ex_done;
+
+ ipi_arg->vector = vector;
+ ipi_arg->reserved = 0;
+ ipi_arg->vp_set.valid_bank_mask = 0;
+
+ /*
+ * Use HV_GENERIC_SET_ALL and avoid converting cpumask to VP_SET
+ * when the IPI is sent to all currently present CPUs.
+ */
+ if (!cpumask_equal(mask, cpu_present_mask) || exclude_self) {
+ ipi_arg->vp_set.format = HV_GENERIC_SET_SPARSE_4K;
+
+ nr_bank = cpumask_to_vpset_skip(&ipi_arg->vp_set, mask,
+ exclude_self ? cpu_is_self : NULL);
+
+ /*
+ * 'nr_bank <= 0' means some CPUs in cpumask can't be
+ * represented in VP_SET. Return an error and fall back to
+ * native (architectural) method of sending IPIs.
+ */
+ if (nr_bank <= 0)
+ goto ipi_mask_ex_done;
+ } else {
+ ipi_arg->vp_set.format = HV_GENERIC_SET_ALL;
+ }
+
+ /*
+ * For this hypercall, Hyper-V treats the valid_bank_mask field
+ * of ipi_arg->vp_set as part of the fixed size input header.
+ * So the variable input header size is equal to nr_bank.
+ */
+ status = hv_do_rep_hypercall(HVCALL_SEND_IPI_EX, 0, nr_bank,
+ ipi_arg, NULL);
+
+ipi_mask_ex_done:
+ local_irq_restore(flags);
+ return hv_result_success(status);
+}
+
+static bool __send_ipi_mask(const struct cpumask *mask, int vector,
+ bool exclude_self)
+{
+ int cur_cpu, vcpu, this_cpu = smp_processor_id();
+ struct hv_send_ipi ipi_arg;
+ u64 status;
+ unsigned int weight;
+
+ trace_hyperv_send_ipi_mask(mask, vector);
+
+ weight = cpumask_weight(mask);
+
+ /*
+ * Do nothing if
+ * 1. the mask is empty
+ * 2. the mask only contains self when exclude_self is true
+ */
+ if (weight == 0 ||
+ (exclude_self && weight == 1 && cpumask_test_cpu(this_cpu, mask)))
+ return true;
+
+ /* A fully enlightened TDX VM uses GHCI rather than hv_hypercall_pg. */
+ if (!hv_hypercall_pg) {
+ if (ms_hyperv.paravisor_present || !hv_isolation_type_tdx())
+ return false;
+ }
+
+ if (vector < HV_IPI_LOW_VECTOR || vector > HV_IPI_HIGH_VECTOR)
+ return false;
+
+ /*
+ * From the supplied CPU set we need to figure out if we can get away
+ * with cheaper HVCALL_SEND_IPI hypercall. This is possible when the
+ * highest VP number in the set is < 64. As VP numbers are usually in
+ * ascending order and match Linux CPU ids, here is an optimization:
+ * we check the VP number for the highest bit in the supplied set first
+ * so we can quickly find out if using HVCALL_SEND_IPI_EX hypercall is
+ * a must. We will also check all VP numbers when walking the supplied
+ * CPU set to remain correct in all cases.
+ */
+ if (hv_cpu_number_to_vp_number(cpumask_last(mask)) >= 64)
+ goto do_ex_hypercall;
+
+ ipi_arg.vector = vector;
+ ipi_arg.cpu_mask = 0;
+
+ for_each_cpu(cur_cpu, mask) {
+ if (exclude_self && cur_cpu == this_cpu)
+ continue;
+ vcpu = hv_cpu_number_to_vp_number(cur_cpu);
+ if (vcpu == VP_INVAL)
+ return false;
+
+ /*
+ * This particular version of the IPI hypercall can
+ * only target up to 64 CPUs.
+ */
+ if (vcpu >= 64)
+ goto do_ex_hypercall;
+
+ __set_bit(vcpu, (unsigned long *)&ipi_arg.cpu_mask);
+ }
+
+ status = hv_do_fast_hypercall16(HVCALL_SEND_IPI, ipi_arg.vector,
+ ipi_arg.cpu_mask);
+ return hv_result_success(status);
+
+do_ex_hypercall:
+ return __send_ipi_mask_ex(mask, vector, exclude_self);
+}
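As a hedged illustration of the "highest VP < 64" fast path used above, the stand-alone sketch below shows the decision in isolation; can_use_fast_ipi() is a name invented here and mirrors, but is not part of, the code above:

#include <linux/types.h>
#include <linux/bits.h>
#include <linux/cpumask.h>
#include <asm/mshyperv.h>

/* Illustrative only: can this mask use the fixed 64-bit HVCALL_SEND_IPI form? */
static bool can_use_fast_ipi(const struct cpumask *mask, u64 *out_mask)
{
	u64 m = 0;
	int cpu, vp;

	/* VP numbers usually ascend with CPU ids, so check the last one first */
	if (hv_cpu_number_to_vp_number(cpumask_last(mask)) >= 64)
		return false;

	for_each_cpu(cpu, mask) {
		vp = hv_cpu_number_to_vp_number(cpu);
		if (vp == VP_INVAL || vp >= 64)
			return false;	/* caller falls back to HVCALL_SEND_IPI_EX */
		m |= BIT_ULL(vp);
	}

	*out_mask = m;
	return true;
}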
+
+static bool __send_ipi_one(int cpu, int vector)
+{
+ int vp = hv_cpu_number_to_vp_number(cpu);
+ u64 status;
+
+ trace_hyperv_send_ipi_one(cpu, vector);
+
+ if (vp == VP_INVAL)
+ return false;
+
+ /* A fully enlightened TDX VM uses GHCI rather than hv_hypercall_pg. */
+ if (!hv_hypercall_pg) {
+ if (ms_hyperv.paravisor_present || !hv_isolation_type_tdx())
+ return false;
+ }
+
+ if (vector < HV_IPI_LOW_VECTOR || vector > HV_IPI_HIGH_VECTOR)
+ return false;
+
+ if (vp >= 64)
+ return __send_ipi_mask_ex(cpumask_of(cpu), vector, false);
+
+ status = hv_do_fast_hypercall16(HVCALL_SEND_IPI, vector, BIT_ULL(vp));
+ return hv_result_success(status);
+}
+
+static void hv_send_ipi(int cpu, int vector)
+{
+ if (!__send_ipi_one(cpu, vector))
+ orig_apic.send_IPI(cpu, vector);
+}
+
+static void hv_send_ipi_mask(const struct cpumask *mask, int vector)
+{
+ if (!__send_ipi_mask(mask, vector, false))
+ orig_apic.send_IPI_mask(mask, vector);
+}
+
+static void hv_send_ipi_mask_allbutself(const struct cpumask *mask, int vector)
+{
+ if (!__send_ipi_mask(mask, vector, true))
+ orig_apic.send_IPI_mask_allbutself(mask, vector);
+}
+
+static void hv_send_ipi_allbutself(int vector)
+{
+ hv_send_ipi_mask_allbutself(cpu_online_mask, vector);
+}
+
+static void hv_send_ipi_all(int vector)
+{
+ if (!__send_ipi_mask(cpu_online_mask, vector, false))
+ orig_apic.send_IPI_all(vector);
+}
+
+static void hv_send_ipi_self(int vector)
+{
+ if (!__send_ipi_one(smp_processor_id(), vector))
+ orig_apic.send_IPI_self(vector);
+}
+
+void __init hv_apic_init(void)
+{
+ if (cc_platform_has(CC_ATTR_SNP_SECURE_AVIC))
+ return;
+
+ if (ms_hyperv.hints & HV_X64_CLUSTER_IPI_RECOMMENDED) {
+ pr_info("Hyper-V: Using IPI hypercalls\n");
+ /*
+ * Set the IPI entry points.
+ */
+ orig_apic = *apic;
+
+ apic_update_callback(send_IPI, hv_send_ipi);
+ apic_update_callback(send_IPI_mask, hv_send_ipi_mask);
+ apic_update_callback(send_IPI_mask_allbutself, hv_send_ipi_mask_allbutself);
+ apic_update_callback(send_IPI_allbutself, hv_send_ipi_allbutself);
+ apic_update_callback(send_IPI_all, hv_send_ipi_all);
+ apic_update_callback(send_IPI_self, hv_send_ipi_self);
+ }
+
+ if (ms_hyperv.hints & HV_X64_APIC_ACCESS_RECOMMENDED) {
+ pr_info("Hyper-V: Using enlightened APIC (%s mode)\n",
+ x2apic_enabled() ? "x2apic" : "xapic");
+ /*
+ * When in x2apic mode, don't use the Hyper-V specific APIC
+ * accessors since the field layout in the ICR register is
+ * different in x2apic mode. Furthermore, the architectural
+ * x2apic MSRs function just as well as the Hyper-V
+ * synthetic APIC MSRs, so there's no benefit in having
+ * separate Hyper-V accessors for x2apic mode. The only
+ * exception is hv_apic_eoi_write, because it benefits from
+ * lazy EOI when available, but the same accessor works for
+ * both xapic and x2apic because the field layout is the same.
+ */
+ apic_update_callback(eoi, hv_apic_eoi_write);
+ if (!x2apic_enabled()) {
+ apic_update_callback(read, hv_apic_read);
+ apic_update_callback(write, hv_apic_write);
+ apic_update_callback(icr_write, hv_apic_icr_write);
+ apic_update_callback(icr_read, hv_apic_icr_read);
+ }
+ }
+}
diff --git a/arch/x86/hyperv/hv_crash.c b/arch/x86/hyperv/hv_crash.c
new file mode 100644
index 000000000000..c0e22921ace1
--- /dev/null
+++ b/arch/x86/hyperv/hv_crash.c
@@ -0,0 +1,642 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * X86 specific Hyper-V root partition kdump/crash support module
+ *
+ * Copyright (C) 2025, Microsoft, Inc.
+ *
+ * This module implements hypervisor RAM collection into vmcore for both
+ * cases of the hypervisor crash and Linux root crash. Hyper-V implements
+ * a disable hypercall with a 32bit protected mode ABI callback. This
+ * mechanism must be used to unlock hypervisor RAM. Since the hypervisor RAM
+ * is already mapped in Linux, it is automatically collected into Linux vmcore,
+ * and can be examined by the crash command (raw RAM dump) or windbg.
+ *
+ * At a high level:
+ *
+ * Hypervisor Crash:
+ * Upon crash, hypervisor goes into an emergency minimal dispatch loop, a
+ * restrictive mode with very limited hypercall and MSR support. Each cpu
+ * then injects NMIs into root vcpus. A shared page is used to check
+ * by Linux in the NMI handler if the hypervisor has crashed. This shared
+ * page is setup in hv_root_crash_init during boot.
+ *
+ * Linux Crash:
+ * In case of Linux crash, the callback hv_crash_stop_other_cpus will send
+ * NMIs to all cpus, then proceed to the crash_nmi_callback where it waits
+ * for all cpus to be in NMI.
+ *
+ * NMI Handler (upon quorum):
+ * Eventually, in both cases, all cpus will end up in the NMI handler.
+ * Hyper-V requires that disabling the hypervisor be done from the BSP. So
+ * the BSP NMI handler saves the current context, does some fixups and makes
+ * the hypercall to disable the hypervisor, i.e., devirtualize. The hypervisor
+ * at that point will suspend all vcpus (except the BSP), unlock all its
+ * RAM, and return to Linux at the 32bit mode entry RIP.
+ *
+ * Linux 32bit entry trampoline will then restore long mode and call C
+ * function here to restore context and continue execution to crash kexec.
+ */
+
+#include <linux/delay.h>
+#include <linux/kexec.h>
+#include <linux/crash_dump.h>
+#include <linux/panic.h>
+#include <asm/apic.h>
+#include <asm/desc.h>
+#include <asm/page.h>
+#include <asm/pgalloc.h>
+#include <asm/mshyperv.h>
+#include <asm/nmi.h>
+#include <asm/idtentry.h>
+#include <asm/reboot.h>
+#include <asm/intel_pt.h>
+
+bool hv_crash_enabled;
+EXPORT_SYMBOL_GPL(hv_crash_enabled);
+
+struct hv_crash_ctxt {
+ ulong rsp;
+ ulong cr0;
+ ulong cr2;
+ ulong cr4;
+ ulong cr8;
+
+ u16 cs;
+ u16 ss;
+ u16 ds;
+ u16 es;
+ u16 fs;
+ u16 gs;
+
+ u16 gdt_fill;
+ struct desc_ptr gdtr;
+ char idt_fill[6];
+ struct desc_ptr idtr;
+
+ u64 gsbase;
+ u64 efer;
+ u64 pat;
+};
+static struct hv_crash_ctxt hv_crash_ctxt;
+
+/* Shared hypervisor page that contains crash dump area we peek into.
+ * NB: windbg looks for "hv_cda" symbol so don't change it.
+ */
+static struct hv_crashdump_area *hv_cda;
+
+static u32 trampoline_pa, devirt_arg;
+static atomic_t crash_cpus_wait;
+static void *hv_crash_ptpgs[4];
+static bool hv_has_crashed, lx_has_crashed;
+
+static void __noreturn hv_panic_timeout_reboot(void)
+{
+ #define PANIC_TIMER_STEP 100
+
+ if (panic_timeout > 0) {
+ int i;
+
+ for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP)
+ mdelay(PANIC_TIMER_STEP);
+ }
+
+ if (panic_timeout)
+ native_wrmsrq(HV_X64_MSR_RESET, 1); /* get hyp to reboot */
+
+ for (;;)
+ cpu_relax();
+}
+
+/* This cannot be inlined as it needs stack */
+static noinline __noclone void hv_crash_restore_tss(void)
+{
+ load_TR_desc();
+}
+
+/* This cannot be inlined as it needs stack */
+static noinline void hv_crash_clear_kernpt(void)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+
+ /* Clear entry so it's not confusing to someone looking at the core */
+ pgd = pgd_offset_k(trampoline_pa);
+ p4d = p4d_offset(pgd, trampoline_pa);
+ native_p4d_clear(p4d);
+}
+
+/*
+ * This is the C entry point from the asm glue code after the disable hypercall.
+ * We enter here in IA32-e long mode, ie, full 64bit mode running on kernel
+ * page tables with our below 4G page identity mapped, but using a temporary
+ * GDT. ds/fs/gs/es are null. ss is not usable. bp is null. stack is not
+ * available. We restore the kernel GDT and the rest of the context, and continue
+ * to kexec.
+ */
+static asmlinkage void __noreturn hv_crash_c_entry(void)
+{
+ struct hv_crash_ctxt *ctxt = &hv_crash_ctxt;
+
+ /* first thing, restore kernel gdt */
+ native_load_gdt(&ctxt->gdtr);
+
+ asm volatile("movw %%ax, %%ss" : : "a"(ctxt->ss));
+ asm volatile("movq %0, %%rsp" : : "m"(ctxt->rsp));
+
+ asm volatile("movw %%ax, %%ds" : : "a"(ctxt->ds));
+ asm volatile("movw %%ax, %%es" : : "a"(ctxt->es));
+ asm volatile("movw %%ax, %%fs" : : "a"(ctxt->fs));
+ asm volatile("movw %%ax, %%gs" : : "a"(ctxt->gs));
+
+ native_wrmsrq(MSR_IA32_CR_PAT, ctxt->pat);
+ asm volatile("movq %0, %%cr0" : : "r"(ctxt->cr0));
+
+ asm volatile("movq %0, %%cr8" : : "r"(ctxt->cr8));
+ asm volatile("movq %0, %%cr4" : : "r"(ctxt->cr4));
+ asm volatile("movq %0, %%cr2" : : "r"(ctxt->cr2));
+
+ native_load_idt(&ctxt->idtr);
+ native_wrmsrq(MSR_GS_BASE, ctxt->gsbase);
+ native_wrmsrq(MSR_EFER, ctxt->efer);
+
+ /* restore the original kernel CS now via far return */
+ asm volatile("movzwq %0, %%rax\n\t"
+ "pushq %%rax\n\t"
+ "pushq $1f\n\t"
+ "lretq\n\t"
+ "1:nop\n\t" : : "m"(ctxt->cs) : "rax");
+
+ /* We arrived here via asmlinkage without a stack frame, hence make C
+ * function calls, which will set up stack frames.
+ */
+ hv_crash_restore_tss();
+ hv_crash_clear_kernpt();
+
+ /* we are now fully in devirtualized normal kernel mode */
+ __crash_kexec(NULL);
+
+ hv_panic_timeout_reboot();
+}
+/* Tell objtool that the lretq long jump in the above function is intentional */
+STACK_FRAME_NON_STANDARD(hv_crash_c_entry);
+
+static void hv_mark_tss_not_busy(void)
+{
+ struct desc_struct *desc = get_current_gdt_rw();
+ tss_desc tss;
+
+ memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc));
+ tss.type = 0x9; /* available 64-bit TSS. 0xB is busy TSS */
+ write_gdt_entry(desc, GDT_ENTRY_TSS, &tss, DESC_TSS);
+}
+
+/* Save essential context */
+static void hv_hvcrash_ctxt_save(void)
+{
+ struct hv_crash_ctxt *ctxt = &hv_crash_ctxt;
+
+ asm volatile("movq %%rsp,%0" : "=m"(ctxt->rsp));
+
+ ctxt->cr0 = native_read_cr0();
+ ctxt->cr4 = native_read_cr4();
+
+ asm volatile("movq %%cr2, %0" : "=a"(ctxt->cr2));
+ asm volatile("movq %%cr8, %0" : "=a"(ctxt->cr8));
+
+ asm volatile("movl %%cs, %%eax" : "=a"(ctxt->cs));
+ asm volatile("movl %%ss, %%eax" : "=a"(ctxt->ss));
+ asm volatile("movl %%ds, %%eax" : "=a"(ctxt->ds));
+ asm volatile("movl %%es, %%eax" : "=a"(ctxt->es));
+ asm volatile("movl %%fs, %%eax" : "=a"(ctxt->fs));
+ asm volatile("movl %%gs, %%eax" : "=a"(ctxt->gs));
+
+ native_store_gdt(&ctxt->gdtr);
+ store_idt(&ctxt->idtr);
+
+ ctxt->gsbase = __rdmsr(MSR_GS_BASE);
+ ctxt->efer = __rdmsr(MSR_EFER);
+ ctxt->pat = __rdmsr(MSR_IA32_CR_PAT);
+}
+
+/* Add trampoline page to the kernel pagetable for transition to kernel PT */
+static void hv_crash_fixup_kernpt(void)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+
+ pgd = pgd_offset_k(trampoline_pa);
+ p4d = p4d_offset(pgd, trampoline_pa);
+
+ /* trampoline_pa is below 4G, so no pre-existing entry to clobber */
+ p4d_populate(&init_mm, p4d, (pud_t *)hv_crash_ptpgs[1]);
+ p4d->p4d = p4d->p4d & ~(_PAGE_NX); /* enable execute */
+}
+
+/*
+ * Notify the hyp that Linux has crashed. This will cause the hyp to quiesce
+ * and suspend all guest VPs.
+ */
+static void hv_notify_prepare_hyp(void)
+{
+ u64 status;
+ struct hv_input_notify_partition_event *input;
+ struct hv_partition_event_root_crashdump_input *cda;
+
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ cda = &input->input.crashdump_input;
+ memset(input, 0, sizeof(*input));
+ input->event = HV_PARTITION_EVENT_ROOT_CRASHDUMP;
+
+ cda->crashdump_action = HV_CRASHDUMP_ENTRY;
+ status = hv_do_hypercall(HVCALL_NOTIFY_PARTITION_EVENT, input, NULL);
+ if (!hv_result_success(status))
+ return;
+
+ cda->crashdump_action = HV_CRASHDUMP_SUSPEND_ALL_VPS;
+ hv_do_hypercall(HVCALL_NOTIFY_PARTITION_EVENT, input, NULL);
+}
+
+/*
+ * Common function for all cpus before devirtualization.
+ *
+ * Hypervisor crash: all cpus get here in NMI context.
+ * Linux crash: the panicking cpu gets here at base level, all others in NMI
+ * context. Note, the panicking cpu may not be the BSP.
+ *
+ * The function is not inlined so it shows up on the stack. It is named this
+ * way because the crash command looks for certain well-known function names
+ * on the stack before looking into the cpu notes saved in the ELF section,
+ * and that work is currently incomplete.
+ *
+ * Notes:
+ * Hypervisor crash:
+ * - the hypervisor is in a very restrictive mode at this point and any
+ * vmexit it cannot handle would result in reboot. So, no mumbo jumbo,
+ * just get to kexec as quickly as possible.
+ *
+ * Devirtualization is supported from the BSP only at present.
+ */
+static noinline __noclone void crash_nmi_callback(struct pt_regs *regs)
+{
+ struct hv_input_disable_hyp_ex *input;
+ u64 status;
+ int msecs = 1000, ccpu = smp_processor_id();
+
+ if (ccpu == 0) {
+ /* crash_save_cpu() will be done in the kexec path */
+ cpu_emergency_stop_pt(); /* disable performance trace */
+ atomic_inc(&crash_cpus_wait);
+ } else {
+ crash_save_cpu(regs, ccpu);
+ cpu_emergency_stop_pt(); /* disable performance trace */
+ atomic_inc(&crash_cpus_wait);
+ for (;;)
+ cpu_relax();
+ }
+
+ while (atomic_read(&crash_cpus_wait) < num_online_cpus() && msecs--)
+ mdelay(1);
+
+ stop_nmi();
+ if (!hv_has_crashed)
+ hv_notify_prepare_hyp();
+
+ if (crashing_cpu == -1)
+ crashing_cpu = ccpu; /* crash cmd uses this */
+
+ hv_hvcrash_ctxt_save();
+ hv_mark_tss_not_busy();
+ hv_crash_fixup_kernpt();
+
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ memset(input, 0, sizeof(*input));
+ input->rip = trampoline_pa;
+ input->arg = devirt_arg;
+
+ status = hv_do_hypercall(HVCALL_DISABLE_HYP_EX, input, NULL);
+
+ hv_panic_timeout_reboot();
+}
+
+
+static DEFINE_SPINLOCK(hv_crash_reboot_lk);
+
+/*
+ * Generic NMI callback handler: could also be called without any crash.
+ * hv crash: the hypervisor injects NMIs into all cpus
+ * lx crash: the panicking cpu sends NMIs to all but itself via crash_stop_other_cpus
+ */
+static int hv_crash_nmi_local(unsigned int cmd, struct pt_regs *regs)
+{
+ if (!hv_has_crashed && hv_cda && hv_cda->cda_valid)
+ hv_has_crashed = true;
+
+ if (!hv_has_crashed && !lx_has_crashed)
+ return NMI_DONE; /* ignore the NMI */
+
+ if (hv_has_crashed && !kexec_crash_loaded()) {
+ if (spin_trylock(&hv_crash_reboot_lk))
+ hv_panic_timeout_reboot();
+ else
+ for (;;)
+ cpu_relax();
+ }
+
+ crash_nmi_callback(regs);
+
+ return NMI_DONE;
+}
+
+/*
+ * hv_crash_stop_other_cpus() == smp_ops.crash_stop_other_cpus
+ *
+ * On normal Linux panic, this is called twice: first from panic and then again
+ * from native_machine_crash_shutdown.
+ *
+ * In case of hyperv, 3 ways to get here:
+ * 1. hv crash (only BSP will get here):
+ * BSP : NMI callback -> DisableHv -> hv_crash_asm32 -> hv_crash_c_entry
+ * -> __crash_kexec -> native_machine_crash_shutdown
+ * -> crash_smp_send_stop -> smp_ops.crash_stop_other_cpus
+ * Linux panic:
+ * 2. panic cpu x: panic() -> crash_smp_send_stop
+ * -> smp_ops.crash_stop_other_cpus
+ * 3. BSP: native_machine_crash_shutdown -> crash_smp_send_stop
+ *
+ * NB: noclone and non standard stack because of call to crash_setup_regs().
+ */
+static void __noclone hv_crash_stop_other_cpus(void)
+{
+ static bool crash_stop_done;
+ struct pt_regs lregs;
+ int ccpu = smp_processor_id();
+
+ if (hv_has_crashed)
+ return; /* all cpus already in NMI handler path */
+
+ if (!kexec_crash_loaded()) {
+ hv_notify_prepare_hyp();
+ hv_panic_timeout_reboot(); /* no return */
+ }
+
+ /* If the hv crashes also, we could come here again before cpus_stopped
+ * is set in crash_smp_send_stop(). So use our own check.
+ */
+ if (crash_stop_done)
+ return;
+ crash_stop_done = true;
+
+ /* Linux has crashed: hv is healthy, we can IPI safely */
+ lx_has_crashed = true;
+ wmb(); /* NMI handlers look at lx_has_crashed */
+
+ apic->send_IPI_allbutself(NMI_VECTOR);
+
+ if (crashing_cpu == -1)
+ crashing_cpu = ccpu; /* crash cmd uses this */
+
+ /* crash_setup_regs() also happens in the kexec path, but for the kexec
+ * cpu, which is the BSP. We could be here on a non-BSP cpu, so collect
+ * the regs in that case.
+ */
+ if (ccpu)
+ crash_setup_regs(&lregs, NULL);
+
+ crash_nmi_callback(&lregs);
+}
+STACK_FRAME_NON_STANDARD(hv_crash_stop_other_cpus);
+
+/* This GDT is accessed in IA32-e compat mode, which uses 32-bit addresses */
+struct hv_gdtreg_32 {
+ u16 fill;
+ u16 limit;
+ u32 address;
+} __packed;
+
+/* We need a CS with the L bit set to enter IA32-e long mode from 32-bit compat mode */
+struct hv_crash_tramp_gdt {
+ u64 null; /* index 0, selector 0, null selector */
+ u64 cs64; /* index 1, selector 8, cs64 selector */
+} __packed;
+
+/* No stack, so jump via far ptr in memory to load the 64bit CS */
+struct hv_cs_jmptgt {
+ u32 address;
+ u16 csval;
+ u16 fill;
+} __packed;
+
+/* Linux use only, hypervisor doesn't look at this struct */
+struct hv_crash_tramp_data {
+ u64 tramp32_cr3;
+ u64 kernel_cr3;
+ struct hv_gdtreg_32 gdtr32;
+ struct hv_crash_tramp_gdt tramp_gdt;
+ struct hv_cs_jmptgt cs_jmptgt;
+ u64 c_entry_addr;
+} __packed;
+
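For reference, with the __packed attribute the fields above land at the byte offsets that the BUILD_BUG_ON()s in hv_crash_setup_trampdata() below assert; the worked layout here is derived from the declarations and adds nothing to the ABI:

/*
 *   tramp32_cr3         @  0  (u64)
 *   kernel_cr3          @  8  (u64)
 *   gdtr32.fill         @ 16  (u16),  gdtr32.limit @ 18 (u16),  gdtr32.address @ 20 (u32)
 *   tramp_gdt.null      @ 24  (u64),  tramp_gdt.cs64 @ 32 (u64)
 *   cs_jmptgt.address   @ 40  (u32),  csval @ 44 (u16),  fill @ 46 (u16)
 *   c_entry_addr        @ 48  (u64)
 */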
+/*
+ * Set up a temporary gdt to allow the asm code to switch to long mode.
+ * Since the asm code is relocated/copied to a below-4G page, it cannot use
+ * rip-relative addressing, hence we must use trampoline_pa here. Also save
+ * other info, like the jmp and C entry targets, for the same reason.
+ *
+ * Returns: 0 on success, -1 on error
+ */
+static int hv_crash_setup_trampdata(u64 trampoline_va)
+{
+ int size, offs;
+ void *dest;
+ struct hv_crash_tramp_data *tramp;
+
+ /* These must match exactly the ones in the corresponding asm file */
+ BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, tramp32_cr3) != 0);
+ BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, kernel_cr3) != 8);
+ BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, gdtr32.limit) != 18);
+ BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data,
+ cs_jmptgt.address) != 40);
+ BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, c_entry_addr) != 48);
+
+ /* hv_crash_asm_end is beyond last byte by 1 */
+ size = &hv_crash_asm_end - &hv_crash_asm32;
+ if (size + sizeof(struct hv_crash_tramp_data) > PAGE_SIZE) {
+ pr_err("%s: trampoline page overflow\n", __func__);
+ return -1;
+ }
+
+ dest = (void *)trampoline_va;
+ memcpy(dest, &hv_crash_asm32, size);
+
+ dest += size;
+ dest = (void *)round_up((ulong)dest, 16);
+ tramp = (struct hv_crash_tramp_data *)dest;
+
+ /* see MAX_ASID_AVAILABLE in tlb.c: "PCID 0 is reserved for use by
+ * non-PCID-aware users". Build cr3 with pcid 0
+ */
+ tramp->tramp32_cr3 = __sme_pa(hv_crash_ptpgs[0]);
+
+ /* Note, when restoring X86_CR4_PCIDE, cr3[11:0] must be zero */
+ tramp->kernel_cr3 = __sme_pa(init_mm.pgd);
+
+ tramp->gdtr32.limit = sizeof(struct hv_crash_tramp_gdt);
+ tramp->gdtr32.address = trampoline_pa +
+ (ulong)&tramp->tramp_gdt - trampoline_va;
+
+ /* base:0 limit:0xfffff type:b dpl:0 P:1 L:1 D:0 avl:0 G:1 */
+ tramp->tramp_gdt.cs64 = 0x00af9a000000ffff;
+
+ tramp->cs_jmptgt.csval = 0x8;
+ offs = (ulong)&hv_crash_asm64 - (ulong)&hv_crash_asm32;
+ tramp->cs_jmptgt.address = trampoline_pa + offs;
+
+ tramp->c_entry_addr = (u64)&hv_crash_c_entry;
+
+ devirt_arg = trampoline_pa + (ulong)dest - trampoline_va;
+
+ return 0;
+}
+
+/*
+ * Build 32bit trampoline page table for transition from protected mode
+ * non-paging to long-mode paging. This transition needs pagetables below 4G.
+ */
+static void hv_crash_build_tramp_pt(void)
+{
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ u64 pa, addr = trampoline_pa;
+
+ p4d = hv_crash_ptpgs[0] + pgd_index(addr) * sizeof(p4d);
+ pa = virt_to_phys(hv_crash_ptpgs[1]);
+ set_p4d(p4d, __p4d(_PAGE_TABLE | pa));
+ p4d->p4d &= ~(_PAGE_NX); /* enable execute */
+
+ pud = hv_crash_ptpgs[1] + pud_index(addr) * sizeof(pud);
+ pa = virt_to_phys(hv_crash_ptpgs[2]);
+ set_pud(pud, __pud(_PAGE_TABLE | pa));
+
+ pmd = hv_crash_ptpgs[2] + pmd_index(addr) * sizeof(pmd);
+ pa = virt_to_phys(hv_crash_ptpgs[3]);
+ set_pmd(pmd, __pmd(_PAGE_TABLE | pa));
+
+ pte = hv_crash_ptpgs[3] + pte_index(addr) * sizeof(pte);
+ set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
+}
+
+/*
+ * Setup trampoline for devirtualization:
+ * - a page below 4G, ie 32bit addr containing asm glue code that hyp jmps to
+ * in protected mode.
+ * - 4 pages for a temporary page table that asm code uses to turn paging on
+ * - a temporary gdt to use in the compat mode.
+ *
+ * Returns: 0 on success
+ */
+static int hv_crash_trampoline_setup(void)
+{
+ int i, rc, order;
+ struct page *page;
+ u64 trampoline_va;
+ gfp_t flags32 = GFP_KERNEL | GFP_DMA32 | __GFP_ZERO;
+
+ /* page for 32bit trampoline assembly code + hv_crash_tramp_data */
+ page = alloc_page(flags32);
+ if (page == NULL) {
+ pr_err("%s: failed to alloc asm stub page\n", __func__);
+ return -1;
+ }
+
+ trampoline_va = (u64)page_to_virt(page);
+ trampoline_pa = (u32)page_to_phys(page);
+
+ order = 2; /* alloc 2^2 pages */
+ page = alloc_pages(flags32, order);
+ if (page == NULL) {
+ pr_err("%s: failed to alloc pt pages\n", __func__);
+ free_page(trampoline_va);
+ return -1;
+ }
+
+ for (i = 0; i < 4; i++, page++)
+ hv_crash_ptpgs[i] = page_to_virt(page);
+
+ hv_crash_build_tramp_pt();
+
+ rc = hv_crash_setup_trampdata(trampoline_va);
+ if (rc)
+ goto errout;
+
+ return 0;
+
+errout:
+ free_page(trampoline_va);
+ free_pages((ulong)hv_crash_ptpgs[0], order);
+
+ return rc;
+}
+
+/* Setup for kdump kexec to collect hypervisor RAM when running as root */
+void hv_root_crash_init(void)
+{
+ int rc;
+ struct hv_input_get_system_property *input;
+ struct hv_output_get_system_property *output;
+ unsigned long flags;
+ u64 status;
+ union hv_pfn_range cda_info;
+
+ if (pgtable_l5_enabled()) {
+ pr_err("Hyper-V: crash dump not yet supported on 5level PTs\n");
+ return;
+ }
+
+ rc = register_nmi_handler(NMI_LOCAL, hv_crash_nmi_local, NMI_FLAG_FIRST,
+ "hv_crash_nmi");
+ if (rc) {
+ pr_err("Hyper-V: failed to register crash nmi handler\n");
+ return;
+ }
+
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+ memset(input, 0, sizeof(*input));
+ input->property_id = HV_SYSTEM_PROPERTY_CRASHDUMPAREA;
+
+ status = hv_do_hypercall(HVCALL_GET_SYSTEM_PROPERTY, input, output);
+ cda_info.as_uint64 = output->hv_cda_info.as_uint64;
+ local_irq_restore(flags);
+
+ if (!hv_result_success(status)) {
+ pr_err("Hyper-V: %s: property:%d %s\n", __func__,
+ input->property_id, hv_result_to_string(status));
+ goto err_out;
+ }
+
+ if (cda_info.base_pfn == 0) {
+ pr_err("Hyper-V: hypervisor crash dump area pfn is 0\n");
+ goto err_out;
+ }
+
+ hv_cda = phys_to_virt(cda_info.base_pfn << HV_HYP_PAGE_SHIFT);
+
+ rc = hv_crash_trampoline_setup();
+ if (rc)
+ goto err_out;
+
+ smp_ops.crash_stop_other_cpus = hv_crash_stop_other_cpus;
+
+ crash_kexec_post_notifiers = true;
+ hv_crash_enabled = true;
+ pr_info("Hyper-V: both linux and hypervisor kdump support enabled\n");
+
+ return;
+
+err_out:
+ unregister_nmi_handler(NMI_LOCAL, "hv_crash_nmi");
+ pr_err("Hyper-V: only linux root kdump support enabled\n");
+}
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
new file mode 100644
index 000000000000..14de43f4bc6c
--- /dev/null
+++ b/arch/x86/hyperv/hv_init.c
@@ -0,0 +1,741 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * X86 specific Hyper-V initialization code.
+ *
+ * Copyright (C) 2016, Microsoft, Inc.
+ *
+ * Author : K. Y. Srinivasan <kys@microsoft.com>
+ */
+
+#define pr_fmt(fmt) "Hyper-V: " fmt
+
+#include <linux/efi.h>
+#include <linux/types.h>
+#include <linux/bitfield.h>
+#include <linux/io.h>
+#include <asm/apic.h>
+#include <asm/desc.h>
+#include <asm/e820/api.h>
+#include <asm/sev.h>
+#include <asm/hypervisor.h>
+#include <hyperv/hvhdk.h>
+#include <asm/mshyperv.h>
+#include <asm/msr.h>
+#include <asm/idtentry.h>
+#include <asm/set_memory.h>
+#include <linux/kexec.h>
+#include <linux/version.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/cpuhotplug.h>
+#include <linux/syscore_ops.h>
+#include <clocksource/hyperv_timer.h>
+#include <linux/highmem.h>
+#include <linux/export.h>
+
+void *hv_hypercall_pg;
+
+#ifdef CONFIG_X86_64
+static u64 __hv_hyperfail(u64 control, u64 param1, u64 param2)
+{
+ return U64_MAX;
+}
+
+DEFINE_STATIC_CALL(__hv_hypercall, __hv_hyperfail);
+
+u64 hv_std_hypercall(u64 control, u64 param1, u64 param2)
+{
+ u64 hv_status;
+
+ register u64 __r8 asm("r8") = param2;
+ asm volatile ("call " STATIC_CALL_TRAMP_STR(__hv_hypercall)
+ : "=a" (hv_status), ASM_CALL_CONSTRAINT,
+ "+c" (control), "+d" (param1), "+r" (__r8)
+ : : "cc", "memory", "r9", "r10", "r11");
+
+ return hv_status;
+}
+
+typedef u64 (*hv_hypercall_f)(u64 control, u64 param1, u64 param2);
+
+static inline void hv_set_hypercall_pg(void *ptr)
+{
+ hv_hypercall_pg = ptr;
+
+ if (!ptr)
+ ptr = &__hv_hyperfail;
+ static_call_update(__hv_hypercall, (hv_hypercall_f)ptr);
+}
+#else
+static inline void hv_set_hypercall_pg(void *ptr)
+{
+ hv_hypercall_pg = ptr;
+}
+EXPORT_SYMBOL_GPL(hv_hypercall_pg);
+#endif
+
+union hv_ghcb * __percpu *hv_ghcb_pg;
+
+/* Storage to save the hypercall page temporarily for hibernation */
+static void *hv_hypercall_pg_saved;
+
+struct hv_vp_assist_page **hv_vp_assist_page;
+EXPORT_SYMBOL_GPL(hv_vp_assist_page);
+
+static int hyperv_init_ghcb(void)
+{
+ u64 ghcb_gpa;
+ void *ghcb_va;
+ void **ghcb_base;
+
+ if (!ms_hyperv.paravisor_present || !hv_isolation_type_snp())
+ return 0;
+
+ if (!hv_ghcb_pg)
+ return -EINVAL;
+
+ /*
+ * The GHCB page is allocated by the paravisor. The address
+ * returned by MSR_AMD64_SEV_ES_GHCB is above the shared
+ * memory boundary, so map it here.
+ */
+ rdmsrq(MSR_AMD64_SEV_ES_GHCB, ghcb_gpa);
+
+ /* Mask out vTOM bit. ioremap_cache() maps decrypted */
+ ghcb_gpa &= ~ms_hyperv.shared_gpa_boundary;
+ ghcb_va = (void *)ioremap_cache(ghcb_gpa, HV_HYP_PAGE_SIZE);
+ if (!ghcb_va)
+ return -ENOMEM;
+
+ ghcb_base = (void **)this_cpu_ptr(hv_ghcb_pg);
+ *ghcb_base = ghcb_va;
+
+ return 0;
+}
+
+static int hv_cpu_init(unsigned int cpu)
+{
+ union hv_vp_assist_msr_contents msr = { 0 };
+ struct hv_vp_assist_page **hvp;
+ int ret;
+
+ ret = hv_common_cpu_init(cpu);
+ if (ret)
+ return ret;
+
+ if (!hv_vp_assist_page)
+ return 0;
+
+ hvp = &hv_vp_assist_page[cpu];
+ if (hv_root_partition()) {
+ /*
+ * For root partition we get the hypervisor provided VP assist
+ * page, instead of allocating a new page.
+ */
+ rdmsrq(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64);
+ *hvp = memremap(msr.pfn << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT,
+ PAGE_SIZE, MEMREMAP_WB);
+ } else {
+ /*
+ * The VP assist page is an "overlay" page (see Hyper-V TLFS's
+ * Section 5.2.1 "GPA Overlay Pages"). Here it must be zeroed
+ * out to make sure we always write the EOI MSR in
+ * hv_apic_eoi_write() *after* the EOI optimization is disabled
+ * in hv_cpu_die(), otherwise a CPU may not be stopped in the
+ * case of CPU offlining and the VM will hang.
+ */
+ if (!*hvp) {
+ *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO);
+
+ /*
+ * Hyper-V should never specify a VM that is both a Confidential
+ * VM and the root partition: the root partition is not allowed
+ * to run in a Confidential VM. So only decrypt the assist
+ * page in the non-root partition here.
+ */
+ if (*hvp && !ms_hyperv.paravisor_present && hv_isolation_type_snp()) {
+ WARN_ON_ONCE(set_memory_decrypted((unsigned long)(*hvp), 1));
+ memset(*hvp, 0, PAGE_SIZE);
+ }
+ }
+
+ if (*hvp)
+ msr.pfn = vmalloc_to_pfn(*hvp);
+
+ }
+ if (!WARN_ON(!(*hvp))) {
+ msr.enable = 1;
+ wrmsrq(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64);
+ }
+
+ /* Allow Hyper-V stimer vector to be injected from Hypervisor. */
+ if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE)
+ apic_update_vector(cpu, HYPERV_STIMER0_VECTOR, true);
+
+ return hyperv_init_ghcb();
+}
+
+static void (*hv_reenlightenment_cb)(void);
+
+static void hv_reenlightenment_notify(struct work_struct *dummy)
+{
+ struct hv_tsc_emulation_status emu_status;
+
+ rdmsrq(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status);
+
+ /* Don't issue the callback if TSC accesses are not emulated */
+ if (hv_reenlightenment_cb && emu_status.inprogress)
+ hv_reenlightenment_cb();
+}
+static DECLARE_DELAYED_WORK(hv_reenlightenment_work, hv_reenlightenment_notify);
+
+void hyperv_stop_tsc_emulation(void)
+{
+ u64 freq;
+ struct hv_tsc_emulation_status emu_status;
+
+ rdmsrq(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status);
+ emu_status.inprogress = 0;
+ wrmsrq(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status);
+
+ rdmsrq(HV_X64_MSR_TSC_FREQUENCY, freq);
+ tsc_khz = div64_u64(freq, 1000);
+}
+EXPORT_SYMBOL_GPL(hyperv_stop_tsc_emulation);
+
+static inline bool hv_reenlightenment_available(void)
+{
+ /*
+ * Check for required features and privileges to make TSC frequency
+ * change notifications work.
+ */
+ return ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS &&
+ ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE &&
+ ms_hyperv.features & HV_ACCESS_REENLIGHTENMENT;
+}
+
+DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_reenlightenment)
+{
+ apic_eoi();
+ inc_irq_stat(irq_hv_reenlightenment_count);
+ schedule_delayed_work(&hv_reenlightenment_work, HZ/10);
+}
+
+void set_hv_tscchange_cb(void (*cb)(void))
+{
+ struct hv_reenlightenment_control re_ctrl = {
+ .vector = HYPERV_REENLIGHTENMENT_VECTOR,
+ .enabled = 1,
+ };
+ struct hv_tsc_emulation_control emu_ctrl = {.enabled = 1};
+
+ if (!hv_reenlightenment_available()) {
+ pr_warn("reenlightenment support is unavailable\n");
+ return;
+ }
+
+ if (!hv_vp_index)
+ return;
+
+ hv_reenlightenment_cb = cb;
+
+ /* Make sure callback is registered before we write to MSRs */
+ wmb();
+
+ re_ctrl.target_vp = hv_vp_index[get_cpu()];
+
+ wrmsrq(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl));
+ wrmsrq(HV_X64_MSR_TSC_EMULATION_CONTROL, *((u64 *)&emu_ctrl));
+
+ put_cpu();
+}
+EXPORT_SYMBOL_GPL(set_hv_tscchange_cb);
+
+void clear_hv_tscchange_cb(void)
+{
+ struct hv_reenlightenment_control re_ctrl;
+
+ if (!hv_reenlightenment_available())
+ return;
+
+ rdmsrq(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl);
+ re_ctrl.enabled = 0;
+ wrmsrq(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl);
+
+ hv_reenlightenment_cb = NULL;
+}
+EXPORT_SYMBOL_GPL(clear_hv_tscchange_cb);
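A hedged usage sketch for the reenlightenment interface exported above: a client (such as KVM) registers a callback that fires when Hyper-V starts emulating TSC accesses after a migration, refreshes anything derived from tsc_khz, and then stops the slow emulation. The my_ function names are invented for illustration; this is not part of the patch:

#include <asm/mshyperv.h>

static void my_update_clocks(void)
{
	/* recompute anything derived from tsc_khz, then turn emulation off */
	hyperv_stop_tsc_emulation();
}

static void my_driver_init(void)
{
	set_hv_tscchange_cb(my_update_clocks);
}

static void my_driver_exit(void)
{
	clear_hv_tscchange_cb();
}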
+
+static int hv_cpu_die(unsigned int cpu)
+{
+ struct hv_reenlightenment_control re_ctrl;
+ unsigned int new_cpu;
+ void **ghcb_va;
+
+ if (hv_ghcb_pg) {
+ ghcb_va = (void **)this_cpu_ptr(hv_ghcb_pg);
+ if (*ghcb_va)
+ iounmap(*ghcb_va);
+ *ghcb_va = NULL;
+ }
+
+ if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE)
+ apic_update_vector(cpu, HYPERV_STIMER0_VECTOR, false);
+
+ hv_common_cpu_die(cpu);
+
+ if (hv_vp_assist_page && hv_vp_assist_page[cpu]) {
+ union hv_vp_assist_msr_contents msr = { 0 };
+ if (hv_root_partition()) {
+ /*
+ * For the root partition the VP assist page is mapped to a
+ * hypervisor-provided page, so unmap the page here and
+ * nullify the pointer, so that the correct page address is
+ * mapped again in hv_cpu_init().
+ */
+ memunmap(hv_vp_assist_page[cpu]);
+ hv_vp_assist_page[cpu] = NULL;
+ rdmsrq(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64);
+ msr.enable = 0;
+ }
+ wrmsrq(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64);
+ }
+
+ if (hv_reenlightenment_cb == NULL)
+ return 0;
+
+ rdmsrq(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl));
+ if (re_ctrl.target_vp == hv_vp_index[cpu]) {
+ /*
+ * Reassign reenlightenment notifications to some other online
+ * CPU or just disable the feature if there are no online CPUs
+ * left (happens on hibernation).
+ */
+ new_cpu = cpumask_any_but(cpu_online_mask, cpu);
+
+ if (new_cpu < nr_cpu_ids)
+ re_ctrl.target_vp = hv_vp_index[new_cpu];
+ else
+ re_ctrl.enabled = 0;
+
+ wrmsrq(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl));
+ }
+
+ return 0;
+}
+
+static int __init hv_pci_init(void)
+{
+ bool gen2vm = efi_enabled(EFI_BOOT);
+
+ /*
+ * A Generation-2 VM doesn't support legacy PCI/PCIe, so both
+ * raw_pci_ops and raw_pci_ext_ops are NULL, and pci_subsys_init() ->
+ * pcibios_init() doesn't call pcibios_resource_survey() ->
+ * e820__reserve_resources_late(); as a result, any emulated persistent
+ * memory of E820_TYPE_PRAM (12) via the kernel parameter
+ * memmap=nn[KMG]!ss is not added into iomem_resource and hence can't be
+ * detected by register_e820_pmem(). Fix this by directly calling
+ * e820__reserve_resources_late() here: e820__reserve_resources_late()
+ * depends on e820__reserve_resources(), which has been called earlier
+ * from setup_arch(). Note: e820__reserve_resources_late() also adds
+ * any memory of E820_TYPE_PMEM (7) into iomem_resource, and
+ * acpi_nfit_register_region() -> acpi_nfit_insert_resource() ->
+ * region_intersects() returns REGION_INTERSECTS, so the memory of
+ * E820_TYPE_PMEM won't get added twice.
+ *
+ * We return 0 here so that pci_arch_init() won't print the warning:
+ * "PCI: Fatal: No config space access function found"
+ */
+ if (gen2vm) {
+ e820__reserve_resources_late();
+ return 0;
+ }
+
+ /* For Generation-1 VM, we'll proceed in pci_arch_init(). */
+ return 1;
+}
+
+static int hv_suspend(void *data)
+{
+ union hv_x64_msr_hypercall_contents hypercall_msr;
+ int ret;
+
+ if (hv_root_partition())
+ return -EPERM;
+
+ /*
+ * Reset the hypercall page as it is going to be invalidated
+ * across hibernation. Setting hv_hypercall_pg to NULL ensures
+ * that any subsequent hypercall operation fails safely instead of
+ * crashing due to an access of an invalid page. The hypercall page
+ * pointer is restored on resume.
+ */
+ hv_hypercall_pg_saved = hv_hypercall_pg;
+ hv_set_hypercall_pg(NULL);
+
+ /* Disable the hypercall page in the hypervisor */
+ rdmsrq(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
+ hypercall_msr.enable = 0;
+ wrmsrq(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
+
+ ret = hv_cpu_die(0);
+ return ret;
+}
+
+static void hv_resume(void *data)
+{
+ union hv_x64_msr_hypercall_contents hypercall_msr;
+ int ret;
+
+ ret = hv_cpu_init(0);
+ WARN_ON(ret);
+
+ /* Re-enable the hypercall page */
+ rdmsrq(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
+ hypercall_msr.enable = 1;
+ hypercall_msr.guest_physical_address =
+ vmalloc_to_pfn(hv_hypercall_pg_saved);
+ wrmsrq(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
+
+ hv_set_hypercall_pg(hv_hypercall_pg_saved);
+ hv_hypercall_pg_saved = NULL;
+
+ /*
+ * Reenlightenment notifications are disabled by hv_cpu_die(0),
+ * reenable them here if hv_reenlightenment_cb was previously set.
+ */
+ if (hv_reenlightenment_cb)
+ set_hv_tscchange_cb(hv_reenlightenment_cb);
+}
+
+/* Note: when the ops are called, only CPU0 is online and IRQs are disabled. */
+static const struct syscore_ops hv_syscore_ops = {
+ .suspend = hv_suspend,
+ .resume = hv_resume,
+};
+
+static struct syscore hv_syscore = {
+ .ops = &hv_syscore_ops,
+};
+
+static void (* __initdata old_setup_percpu_clockev)(void);
+
+static void __init hv_stimer_setup_percpu_clockev(void)
+{
+ /*
+ * Ignore any errors in setting up stimer clockevents
+ * as we can run with the LAPIC timer as a fallback.
+ */
+ (void)hv_stimer_alloc(false);
+
+ /*
+ * Still register the LAPIC timer, because the direct-mode STIMER is
+ * not supported by old versions of Hyper-V. This also allows users
+ * to switch to LAPIC timer via /sys, if they want to.
+ */
+ if (old_setup_percpu_clockev)
+ old_setup_percpu_clockev();
+}
+
+/*
+ * This function is to be invoked early in the boot sequence after the
+ * hypervisor has been detected.
+ *
+ * 1. Setup the hypercall page.
+ * 2. Register Hyper-V specific clocksource.
+ * 3. Setup Hyper-V specific APIC entry points.
+ */
+void __init hyperv_init(void)
+{
+ u64 guest_id;
+ union hv_x64_msr_hypercall_contents hypercall_msr;
+ int cpuhp;
+
+ if (x86_hyper_type != X86_HYPER_MS_HYPERV)
+ return;
+
+ if (hv_common_init())
+ return;
+
+ /*
+ * The VP assist page is useless to a TDX guest: the only use we
+ * would have for it is lazy EOI, which can not be used with TDX.
+ */
+ if (hv_isolation_type_tdx())
+ hv_vp_assist_page = NULL;
+ else
+ hv_vp_assist_page = kcalloc(nr_cpu_ids,
+ sizeof(*hv_vp_assist_page),
+ GFP_KERNEL);
+ if (!hv_vp_assist_page) {
+ ms_hyperv.hints &= ~HV_X64_ENLIGHTENED_VMCS_RECOMMENDED;
+
+ if (!hv_isolation_type_tdx())
+ goto common_free;
+ }
+
+ if (ms_hyperv.paravisor_present && hv_isolation_type_snp()) {
+ /* Negotiate GHCB Version. */
+ if (!hv_ghcb_negotiate_protocol())
+ hv_ghcb_terminate(SEV_TERM_SET_GEN,
+ GHCB_SEV_ES_PROT_UNSUPPORTED);
+
+ hv_ghcb_pg = alloc_percpu(union hv_ghcb *);
+ if (!hv_ghcb_pg)
+ goto free_vp_assist_page;
+ }
+
+ cpuhp = cpuhp_setup_state(CPUHP_AP_HYPERV_ONLINE, "x86/hyperv_init:online",
+ hv_cpu_init, hv_cpu_die);
+ if (cpuhp < 0)
+ goto free_ghcb_page;
+
+ /*
+ * Setup the hypercall page and enable hypercalls.
+ * 1. Register the guest ID
+ * 2. Enable the hypercall and register the hypercall page
+ *
+ * A TDX VM with no paravisor only uses TDX GHCI rather than hv_hypercall_pg:
+ * when the hypercall input is a page, such a VM must pass a decrypted
+ * page to Hyper-V, e.g. hv_post_message() uses the per-CPU page
+ * hyperv_pcpu_input_arg, which is decrypted if no paravisor is present.
+ *
+ * A TDX VM with the paravisor uses hv_hypercall_pg for most hypercalls,
+ * which are handled by the paravisor and the VM must use an encrypted
+ * input page: in such a VM, the hyperv_pcpu_input_arg is encrypted and
+ * used in the hypercalls, e.g. see hv_mark_gpa_visibility() and
+ * hv_arch_irq_unmask(). Such a VM uses TDX GHCI for two hypercalls:
+ * 1. HVCALL_SIGNAL_EVENT: see vmbus_set_event() and _hv_do_fast_hypercall8().
+ * 2. HVCALL_POST_MESSAGE: the input page must be a decrypted page, i.e.
+ * hv_post_message() in such a VM can't use the encrypted hyperv_pcpu_input_arg;
+ * instead, hv_post_message() uses the post_msg_page, which is decrypted
+ * in such a VM and is only used in such a VM.
+ */
+ guest_id = hv_generate_guest_id(LINUX_VERSION_CODE);
+ wrmsrq(HV_X64_MSR_GUEST_OS_ID, guest_id);
+
+ /* With the paravisor, the VM must also write the ID via GHCB/GHCI */
+ hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, guest_id);
+
+ /* A TDX VM with no paravisor only uses TDX GHCI rather than hv_hypercall_pg */
+ if (hv_isolation_type_tdx() && !ms_hyperv.paravisor_present)
+ goto skip_hypercall_pg_init;
+
+ hv_hypercall_pg = __vmalloc_node_range(PAGE_SIZE, 1, MODULES_VADDR,
+ MODULES_END, GFP_KERNEL, PAGE_KERNEL_ROX,
+ VM_FLUSH_RESET_PERMS, NUMA_NO_NODE,
+ __builtin_return_address(0));
+ if (hv_hypercall_pg == NULL)
+ goto clean_guest_os_id;
+
+ rdmsrq(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
+ hypercall_msr.enable = 1;
+
+ if (hv_root_partition()) {
+ struct page *pg;
+ void *src;
+
+ /*
+ * For the root partition, the hypervisor will set up its
+ * hypercall page. The hypervisor guarantees it will not show
+ * up in the root's address space. The root can't change the
+ * location of the hypercall page.
+ *
+ * Order is important here. We must enable the hypercall page
+ * so it is populated with code, then copy the code to an
+ * executable page.
+ */
+ wrmsrq(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
+
+ pg = vmalloc_to_page(hv_hypercall_pg);
+ src = memremap(hypercall_msr.guest_physical_address << PAGE_SHIFT, PAGE_SIZE,
+ MEMREMAP_WB);
+ BUG_ON(!src);
+ memcpy_to_page(pg, 0, src, HV_HYP_PAGE_SIZE);
+ memunmap(src);
+
+ hv_remap_tsc_clocksource();
+ hv_root_crash_init();
+ hv_sleep_notifiers_register();
+ } else {
+ hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg);
+ wrmsrq(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
+ }
+
+ hv_set_hypercall_pg(hv_hypercall_pg);
+
+skip_hypercall_pg_init:
+ /*
+ * hyperv_init() is called before LAPIC is initialized: see
+ * apic_intr_mode_init() -> x86_platform.apic_post_init() and
+ * apic_bsp_setup() -> setup_local_APIC(). The direct-mode STIMER
+ * depends on LAPIC, so hv_stimer_alloc() should be called from
+ * x86_init.timers.setup_percpu_clockev.
+ */
+ old_setup_percpu_clockev = x86_init.timers.setup_percpu_clockev;
+ x86_init.timers.setup_percpu_clockev = hv_stimer_setup_percpu_clockev;
+
+ hv_apic_init();
+
+ x86_init.pci.arch_init = hv_pci_init;
+
+ register_syscore(&hv_syscore);
+
+ if (ms_hyperv.priv_high & HV_ACCESS_PARTITION_ID)
+ hv_get_partition_id();
+
+#ifdef CONFIG_PCI_MSI
+ /*
+ * If we're running as root, we want to create our own PCI MSI domain.
+ * We can't set this in hv_pci_init because that would be too late.
+ */
+ if (hv_root_partition())
+ x86_init.irqs.create_pci_msi_domain = hv_create_pci_msi_domain;
+#endif
+
+ /* Query the VM's extended capability once, so that it can be cached. */
+ hv_query_ext_cap(0);
+
+ /* Find the VTL */
+ ms_hyperv.vtl = get_vtl();
+
+ if (ms_hyperv.vtl > 0) /* non default VTL */
+ hv_vtl_early_init();
+
+ return;
+
+clean_guest_os_id:
+ wrmsrq(HV_X64_MSR_GUEST_OS_ID, 0);
+ hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, 0);
+ cpuhp_remove_state(CPUHP_AP_HYPERV_ONLINE);
+free_ghcb_page:
+ free_percpu(hv_ghcb_pg);
+free_vp_assist_page:
+ kfree(hv_vp_assist_page);
+ hv_vp_assist_page = NULL;
+common_free:
+ hv_common_free();
+}
+
+/*
+ * This routine is called before kexec/kdump; it does the required cleanup.
+ */
+void hyperv_cleanup(void)
+{
+ union hv_x64_msr_hypercall_contents hypercall_msr;
+ union hv_reference_tsc_msr tsc_msr;
+
+ /* Reset our OS id */
+ wrmsrq(HV_X64_MSR_GUEST_OS_ID, 0);
+ hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, 0);
+
+ /*
+ * Reset the hypercall page reference before resetting the page,
+ * so that hypercall operations fail safely rather than
+ * panicking the kernel due to use of an invalid hypercall page.
+ */
+ hv_hypercall_pg = NULL;
+
+ /* Reset the hypercall page */
+ hypercall_msr.as_uint64 = hv_get_msr(HV_X64_MSR_HYPERCALL);
+ hypercall_msr.enable = 0;
+ hv_set_msr(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
+
+ /* Reset the TSC page */
+ tsc_msr.as_uint64 = hv_get_msr(HV_X64_MSR_REFERENCE_TSC);
+ tsc_msr.enable = 0;
+ hv_set_msr(HV_X64_MSR_REFERENCE_TSC, tsc_msr.as_uint64);
+}
+
+void hyperv_report_panic(struct pt_regs *regs, long err, bool in_die)
+{
+ static bool panic_reported;
+ u64 guest_id;
+
+ if (in_die && !panic_on_oops)
+ return;
+
+ /*
+ * We prefer to report panic on 'die' chain as we have proper
+ * registers to report, but if we miss it (e.g. on BUG()) we need
+ * to report it on 'panic'.
+ */
+ if (panic_reported)
+ return;
+ panic_reported = true;
+
+ rdmsrq(HV_X64_MSR_GUEST_OS_ID, guest_id);
+
+ wrmsrq(HV_X64_MSR_CRASH_P0, err);
+ wrmsrq(HV_X64_MSR_CRASH_P1, guest_id);
+ wrmsrq(HV_X64_MSR_CRASH_P2, regs->ip);
+ wrmsrq(HV_X64_MSR_CRASH_P3, regs->ax);
+ wrmsrq(HV_X64_MSR_CRASH_P4, regs->sp);
+
+ /*
+ * Let Hyper-V know there is crash data available
+ */
+ wrmsrq(HV_X64_MSR_CRASH_CTL, HV_CRASH_CTL_CRASH_NOTIFY);
+}
+EXPORT_SYMBOL_GPL(hyperv_report_panic);
+
+bool hv_is_hyperv_initialized(void)
+{
+ union hv_x64_msr_hypercall_contents hypercall_msr;
+
+ /*
+ * Ensure that we're really on Hyper-V, and not a KVM or Xen
+ * emulation of Hyper-V
+ */
+ if (x86_hyper_type != X86_HYPER_MS_HYPERV)
+ return false;
+
+	/* A TDX VM with no paravisor uses the TDX GHCI call rather than hv_hypercall_pg */
+ if (hv_isolation_type_tdx() && !ms_hyperv.paravisor_present)
+ return true;
+ /*
+ * Verify that earlier initialization succeeded by checking
+	 * that the hypercall page is set up.
+ */
+ hypercall_msr.as_uint64 = 0;
+ rdmsrq(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
+
+ return hypercall_msr.enable;
+}
+EXPORT_SYMBOL_GPL(hv_is_hyperv_initialized);
+
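+/*
+ * Translate an APIC ID into a Hyper-V VP index using a single-element
+ * HVCALL_GET_VP_INDEX_FROM_APIC_ID rep hypercall.
+ */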
+int hv_apicid_to_vp_index(u32 apic_id)
+{
+ u64 control;
+ u64 status;
+ unsigned long irq_flags;
+ struct hv_get_vp_from_apic_id_in *input;
+ u32 *output, ret;
+
+ local_irq_save(irq_flags);
+
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ memset(input, 0, sizeof(*input));
+ input->partition_id = HV_PARTITION_ID_SELF;
+ input->apic_ids[0] = apic_id;
+
+ output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+ control = HV_HYPERCALL_REP_COMP_1 | HVCALL_GET_VP_INDEX_FROM_APIC_ID;
+ status = hv_do_hypercall(control, input, output);
+ ret = output[0];
+
+ local_irq_restore(irq_flags);
+
+ if (!hv_result_success(status)) {
+ pr_err("failed to get vp index from apic id %d, status %#llx\n",
+ apic_id, status);
+ return -EINVAL;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(hv_apicid_to_vp_index);
diff --git a/arch/x86/hyperv/hv_spinlock.c b/arch/x86/hyperv/hv_spinlock.c
new file mode 100644
index 000000000000..81b006601370
--- /dev/null
+++ b/arch/x86/hyperv/hv_spinlock.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Hyper-V specific spinlock code.
+ *
+ * Copyright (C) 2018, Intel, Inc.
+ *
+ * Author : Yi Sun <yi.y.sun@intel.com>
+ */
+
+#define pr_fmt(fmt) "Hyper-V: " fmt
+
+#include <linux/spinlock.h>
+
+#include <asm/mshyperv.h>
+#include <asm/paravirt.h>
+#include <asm/apic.h>
+#include <asm/msr.h>
+
+static bool hv_pvspin __initdata = true;
+
+static void hv_qlock_kick(int cpu)
+{
+ __apic_send_IPI(cpu, X86_PLATFORM_IPI_VECTOR);
+}
+
+static void hv_qlock_wait(u8 *byte, u8 val)
+{
+ unsigned long flags;
+
+ if (in_nmi())
+ return;
+
+ /*
+ * Reading HV_X64_MSR_GUEST_IDLE MSR tells the hypervisor that the
+ * vCPU can be put into 'idle' state. This 'idle' state is
+ * terminated by an IPI, usually from hv_qlock_kick(), even if
+ * interrupts are disabled on the vCPU.
+ *
+ * To prevent a race against the unlock path it is required to
+ * disable interrupts before accessing the HV_X64_MSR_GUEST_IDLE
+ * MSR. Otherwise, if the IPI from hv_qlock_kick() arrives between
+ * the lock value check and the rdmsrq() then the vCPU might be put
+ * into 'idle' state by the hypervisor and kept in that state for
+ * an unspecified amount of time.
+ */
+ local_irq_save(flags);
+ /*
+ * Only issue the rdmsrq() when the lock state has not changed.
+ */
+ if (READ_ONCE(*byte) == val) {
+ unsigned long msr_val;
+
+ rdmsrq(HV_X64_MSR_GUEST_IDLE, msr_val);
+
+ (void)msr_val;
+ }
+ local_irq_restore(flags);
+}
+
+/*
+ * Hyper-V does not support vCPU preemption detection so far.
+ */
+__visible bool hv_vcpu_is_preempted(int vcpu)
+{
+ return false;
+}
+
+PV_CALLEE_SAVE_REGS_THUNK(hv_vcpu_is_preempted);
+
+void __init hv_init_spinlocks(void)
+{
+ if (!hv_pvspin || !apic ||
+ !(ms_hyperv.hints & HV_X64_CLUSTER_IPI_RECOMMENDED) ||
+ !(ms_hyperv.features & HV_MSR_GUEST_IDLE_AVAILABLE)) {
+ pr_info("PV spinlocks disabled\n");
+ return;
+ }
+ pr_info("PV spinlocks enabled\n");
+
+ __pv_init_lock_hash();
+ pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
+ pv_ops.lock.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
+ pv_ops.lock.wait = hv_qlock_wait;
+ pv_ops.lock.kick = hv_qlock_kick;
+ pv_ops.lock.vcpu_is_preempted = PV_CALLEE_SAVE(hv_vcpu_is_preempted);
+}
+
+static __init int hv_parse_nopvspin(char *arg)
+{
+ hv_pvspin = false;
+ return 0;
+}
+early_param("hv_nopvspin", hv_parse_nopvspin);
diff --git a/arch/x86/hyperv/hv_trampoline.S b/arch/x86/hyperv/hv_trampoline.S
new file mode 100644
index 000000000000..25f02ff12286
--- /dev/null
+++ b/arch/x86/hyperv/hv_trampoline.S
@@ -0,0 +1,101 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * X86 specific Hyper-V kdump/crash related code.
+ *
+ * Copyright (C) 2025, Microsoft, Inc.
+ *
+ */
+#include <linux/linkage.h>
+#include <asm/alternative.h>
+#include <asm/msr.h>
+#include <asm/processor-flags.h>
+#include <asm/nospec-branch.h>
+
+/*
+ * void noreturn hv_crash_asm32(arg1)
+ * arg1 == edi == 32bit PA of struct hv_crash_tramp_data
+ *
+ * The hypervisor jumps here upon devirtualization in protected mode. This
+ * code gets copied to a page in the low 4G, i.e. 32-bit space, so it can run
+ * in protected mode. Hence we cannot use any compile/link time offsets or
+ * addresses. It restores long mode via a temporary GDT and page tables and
+ * eventually jumps to the kernel code entry stored at HV_CRASHDATA_OFFS_C_entry.
+ *
+ * PreCondition (ie, Hypervisor call back ABI):
+ * o CR0 is set to 0x0021: PE(prot mode) and NE are set, paging is disabled
+ * o CR4 is set to 0x0
+ * o IA32_EFER is set to 0x901 (SCE and NXE are set)
+ * o EDI is set to the Arg passed to HVCALL_DISABLE_HYP_EX.
+ * o CS, DS, ES, FS, GS are all initialized with a base of 0 and limit 0xFFFF
+ * o IDTR, TR and GDTR are initialized with a base of 0 and limit of 0xFFFF
+ * o LDTR is initialized as invalid (limit of 0)
+ * o MSR PAT is power on default.
+ * o Other state/registers are cleared. All TLBs flushed.
+ */
+
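+/*
+ * Byte offsets into the struct hv_crash_tramp_data that EDI points at,
+ * presumably defined on the C side; keep the two layouts in sync.
+ */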
+#define HV_CRASHDATA_OFFS_TRAMPCR3 0x0 /* 0 */
+#define HV_CRASHDATA_OFFS_KERNCR3 0x8 /* 8 */
+#define HV_CRASHDATA_OFFS_GDTRLIMIT 0x12 /* 18 */
+#define HV_CRASHDATA_OFFS_CS_JMPTGT 0x28 /* 40 */
+#define HV_CRASHDATA_OFFS_C_entry 0x30 /* 48 */
+
+ .text
+ .code32
+
+SYM_CODE_START(hv_crash_asm32)
+ UNWIND_HINT_UNDEFINED
+ ENDBR
+ movl $X86_CR4_PAE, %ecx
+ movl %ecx, %cr4
+
+ movl %edi, %ebx
+ add $HV_CRASHDATA_OFFS_TRAMPCR3, %ebx
+ movl %cs:(%ebx), %eax
+ movl %eax, %cr3
+
+ /* Setup EFER for long mode now */
+ movl $MSR_EFER, %ecx
+ rdmsr
+ btsl $_EFER_LME, %eax
+ wrmsr
+
+ /* Turn paging on using the temp 32bit trampoline page table */
+ movl %cr0, %eax
+ orl $(X86_CR0_PG), %eax
+ movl %eax, %cr0
+
+	/*
+	 * Since the kernel CR3 could be above 4G, we need to be in long mode
+	 * before we can load all 64 bits of the kernel CR3. We use a temp GDT
+	 * for that with CS.L=1 and CS.D=0.
+	 */
+ mov %edi, %eax
+ add $HV_CRASHDATA_OFFS_GDTRLIMIT, %eax
+ lgdtl %cs:(%eax)
+
+ /* not done yet, restore CS now to switch to CS.L=1 */
+ mov %edi, %eax
+ add $HV_CRASHDATA_OFFS_CS_JMPTGT, %eax
+ ljmp %cs:*(%eax)
+SYM_CODE_END(hv_crash_asm32)
+
+	/* We now run in full 64-bit IA-32e long mode, CS.L=1 and CS.D=0 */
+ .code64
+ .balign 8
+SYM_CODE_START(hv_crash_asm64)
+ UNWIND_HINT_UNDEFINED
+ ENDBR
+ /* restore kernel page tables so we can jump to kernel code */
+ mov %edi, %eax
+ add $HV_CRASHDATA_OFFS_KERNCR3, %eax
+ movq %cs:(%eax), %rbx
+ movq %rbx, %cr3
+
+ mov %edi, %eax
+ add $HV_CRASHDATA_OFFS_C_entry, %eax
+ movq %cs:(%eax), %rbx
+ ANNOTATE_RETPOLINE_SAFE
+ jmp *%rbx
+
+ int $3
+
+SYM_INNER_LABEL(hv_crash_asm_end, SYM_L_GLOBAL)
+SYM_CODE_END(hv_crash_asm64)
diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c
new file mode 100644
index 000000000000..c0edaed0efb3
--- /dev/null
+++ b/arch/x86/hyperv/hv_vtl.c
@@ -0,0 +1,281 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023, Microsoft Corporation.
+ *
+ * Author:
+ * Saurabh Sengar <ssengar@microsoft.com>
+ */
+
+#include <asm/apic.h>
+#include <asm/boot.h>
+#include <asm/desc.h>
+#include <asm/fpu/api.h>
+#include <asm/fpu/types.h>
+#include <asm/i8259.h>
+#include <asm/mshyperv.h>
+#include <asm/msr.h>
+#include <asm/realmode.h>
+#include <asm/reboot.h>
+#include <asm/smap.h>
+#include <linux/export.h>
+#include <../kernel/smpboot.h>
+#include "../../kernel/fpu/legacy.h"
+
+extern struct boot_params boot_params;
+static struct real_mode_header hv_vtl_real_mode_header;
+
+static bool __init hv_vtl_msi_ext_dest_id(void)
+{
+ return true;
+}
+
+/*
+ * The `native_machine_emergency_restart` function from `reboot.c` writes
+ * to the physical address 0x472 to indicate the type of reboot for the
+ * firmware. We cannot have that in VSM as the memory composition might
+ * be more generic, and such a write effectively corrupts the memory,
+ * making diagnostics harder at the very least.
+ */
+static void __noreturn hv_vtl_emergency_restart(void)
+{
+ /*
+	 * Cause a triple fault and an immediate reset. The code here does not
+	 * run on top of any firmware, so it cannot reach out to firmware
+	 * services. The infinite loop is for the improbable case that the
+	 * triple fault does not work, and we have to preserve the state
+	 * intact for debugging.
+ */
+ for (;;) {
+ idt_invalidate();
+ __asm__ __volatile__("int3");
+ }
+}
+
+/*
+ * The only way to restart in VTL mode is to triple fault, as the kernel
+ * itself runs as the firmware.
+ */
+static void __noreturn hv_vtl_restart(char __maybe_unused *cmd)
+{
+ hv_vtl_emergency_restart();
+}
+
+void __init hv_vtl_init_platform(void)
+{
+ /*
+ * This function is a no-op if the VTL mode is not enabled.
+	 * If it is, this function runs if and only if the kernel boots in
+	 * VTL2, which the x86 hv initialization path makes sure of.
+ */
+ pr_info("Linux runs in Hyper-V Virtual Trust Level %d\n", ms_hyperv.vtl);
+
+ x86_platform.realmode_reserve = x86_init_noop;
+ x86_platform.realmode_init = x86_init_noop;
+ x86_init.irqs.pre_vector_init = x86_init_noop;
+ x86_init.timers.timer_init = x86_init_noop;
+ x86_init.resources.probe_roms = x86_init_noop;
+
+ /* Avoid searching for BIOS MP tables */
+ x86_init.mpparse.find_mptable = x86_init_noop;
+ x86_init.mpparse.early_parse_smp_cfg = x86_init_noop;
+
+ x86_platform.get_wallclock = get_rtc_noop;
+ x86_platform.set_wallclock = set_rtc_noop;
+ x86_platform.get_nmi_reason = hv_get_nmi_reason;
+
+ x86_platform.legacy.i8042 = X86_LEGACY_I8042_PLATFORM_ABSENT;
+ x86_platform.legacy.rtc = 0;
+ x86_platform.legacy.warm_reset = 0;
+ x86_platform.legacy.reserve_bios_regions = 0;
+ x86_platform.legacy.devices.pnpbios = 0;
+
+ x86_init.hyper.msi_ext_dest_id = hv_vtl_msi_ext_dest_id;
+}
+
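+/*
+ * Reassemble the base and limit of a system segment descriptor (LDT/TSS)
+ * from the split fields of struct ldttss_desc.
+ */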
+static inline u64 hv_vtl_system_desc_base(struct ldttss_desc *desc)
+{
+ return ((u64)desc->base3 << 32) | ((u64)desc->base2 << 24) |
+ (desc->base1 << 16) | desc->base0;
+}
+
+static inline u32 hv_vtl_system_desc_limit(struct ldttss_desc *desc)
+{
+ return ((u32)desc->limit1 << 16) | (u32)desc->limit0;
+}
+
+typedef void (*secondary_startup_64_fn)(void*, void*);
+static void hv_vtl_ap_entry(void)
+{
+ ((secondary_startup_64_fn)secondary_startup_64)(&boot_params, &boot_params);
+}
+
+static int hv_vtl_bringup_vcpu(u32 target_vp_index, int cpu, u64 eip_ignored)
+{
+ u64 status;
+ int ret = 0;
+ struct hv_enable_vp_vtl *input;
+ unsigned long irq_flags;
+
+ struct desc_ptr gdt_ptr;
+ struct desc_ptr idt_ptr;
+
+ struct ldttss_desc *tss;
+ struct ldttss_desc *ldt;
+ struct desc_struct *gdt;
+
+ struct task_struct *idle = idle_thread_get(cpu);
+ u64 rsp = (unsigned long)idle->thread.sp;
+
+ u64 rip = (u64)&hv_vtl_ap_entry;
+
+ native_store_gdt(&gdt_ptr);
+ store_idt(&idt_ptr);
+
+ gdt = (struct desc_struct *)((void *)(gdt_ptr.address));
+ tss = (struct ldttss_desc *)(gdt + GDT_ENTRY_TSS);
+ ldt = (struct ldttss_desc *)(gdt + GDT_ENTRY_LDT);
+
+ local_irq_save(irq_flags);
+
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ memset(input, 0, sizeof(*input));
+
+ input->partition_id = HV_PARTITION_ID_SELF;
+ input->vp_index = target_vp_index;
+ input->target_vtl.target_vtl = HV_VTL_MGMT;
+
+ /*
+ * The x86_64 Linux kernel follows the 16-bit -> 32-bit -> 64-bit
+ * mode transition sequence after waking up an AP with SIPI whose
+ * vector points to the 16-bit AP startup trampoline code. Here in
+ * VTL2, we can't perform that sequence as the AP has to start in
+ * the 64-bit mode.
+ *
+ * To make this happen, we tell the hypervisor to load a valid 64-bit
+ * context (most of which is just magic numbers from the CPU manual)
+	 * so that the AP jumps right to the 64-bit entry of the kernel, and the
+ * control registers are loaded with values that let the AP fetch the
+ * code and data and carry on with work it gets assigned.
+ */
+
+ input->vp_context.rip = rip;
+ input->vp_context.rsp = rsp;
+ input->vp_context.rflags = 0x0000000000000002;
+ input->vp_context.efer = native_rdmsrq(MSR_EFER);
+ input->vp_context.cr0 = native_read_cr0();
+ input->vp_context.cr3 = __native_read_cr3();
+ input->vp_context.cr4 = native_read_cr4();
+ input->vp_context.msr_cr_pat = native_rdmsrq(MSR_IA32_CR_PAT);
+ input->vp_context.idtr.limit = idt_ptr.size;
+ input->vp_context.idtr.base = idt_ptr.address;
+ input->vp_context.gdtr.limit = gdt_ptr.size;
+ input->vp_context.gdtr.base = gdt_ptr.address;
+
+ /* Non-system desc (64bit), long, code, present */
+ input->vp_context.cs.selector = __KERNEL_CS;
+ input->vp_context.cs.base = 0;
+ input->vp_context.cs.limit = 0xffffffff;
+ input->vp_context.cs.attributes = 0xa09b;
+ /* Non-system desc (64bit), data, present, granularity, default */
+ input->vp_context.ss.selector = __KERNEL_DS;
+ input->vp_context.ss.base = 0;
+ input->vp_context.ss.limit = 0xffffffff;
+ input->vp_context.ss.attributes = 0xc093;
+
+ /* System desc (128bit), present, LDT */
+ input->vp_context.ldtr.selector = GDT_ENTRY_LDT * 8;
+ input->vp_context.ldtr.base = hv_vtl_system_desc_base(ldt);
+ input->vp_context.ldtr.limit = hv_vtl_system_desc_limit(ldt);
+ input->vp_context.ldtr.attributes = 0x82;
+
+ /* System desc (128bit), present, TSS, 0x8b - busy, 0x89 -- default */
+ input->vp_context.tr.selector = GDT_ENTRY_TSS * 8;
+ input->vp_context.tr.base = hv_vtl_system_desc_base(tss);
+ input->vp_context.tr.limit = hv_vtl_system_desc_limit(tss);
+ input->vp_context.tr.attributes = 0x8b;
+
+ status = hv_do_hypercall(HVCALL_ENABLE_VP_VTL, input, NULL);
+
+ if (!hv_result_success(status) &&
+ hv_result(status) != HV_STATUS_VTL_ALREADY_ENABLED) {
+		pr_err("HVCALL_ENABLE_VP_VTL failed for VP : %d ! [Err: %#llx]\n",
+ target_vp_index, status);
+ ret = -EINVAL;
+ goto free_lock;
+ }
+
+ status = hv_do_hypercall(HVCALL_START_VP, input, NULL);
+
+ if (!hv_result_success(status)) {
+ pr_err("HVCALL_START_VP failed for VP : %d ! [Err: %#llx]\n",
+ target_vp_index, status);
+ ret = -EINVAL;
+ }
+
+free_lock:
+ local_irq_restore(irq_flags);
+
+ return ret;
+}
+
+static int hv_vtl_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip, unsigned int cpu)
+{
+ int vp_index;
+
+ pr_debug("Bringing up CPU with APIC ID %d in VTL2...\n", apicid);
+ vp_index = hv_apicid_to_vp_index(apicid);
+
+ if (vp_index < 0) {
+ pr_err("Couldn't find CPU with APIC ID %d\n", apicid);
+ return -EINVAL;
+ }
+ if (vp_index > ms_hyperv.max_vp_index) {
+ pr_err("Invalid CPU id %d for APIC ID %d\n", vp_index, apicid);
+ return -EINVAL;
+ }
+
+ return hv_vtl_bringup_vcpu(vp_index, cpu, start_eip);
+}
+
+int __init hv_vtl_early_init(void)
+{
+ machine_ops.emergency_restart = hv_vtl_emergency_restart;
+ machine_ops.restart = hv_vtl_restart;
+
+ /*
+ * `boot_cpu_has` returns the runtime feature support,
+	 * and this is the earliest point at which it can be used.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_XSAVE))
+ panic("XSAVE has to be disabled as it is not supported by this module.\n"
+ "Please add 'noxsave' to the kernel command line.\n");
+
+ real_mode_header = &hv_vtl_real_mode_header;
+ apic_update_callback(wakeup_secondary_cpu_64, hv_vtl_wakeup_secondary_cpu);
+
+ return 0;
+}
+
+DEFINE_STATIC_CALL_NULL(__mshv_vtl_return_hypercall, void (*)(void));
+
+void mshv_vtl_return_call_init(u64 vtl_return_offset)
+{
+ static_call_update(__mshv_vtl_return_hypercall,
+ (void *)((u8 *)hv_hypercall_pg + vtl_return_offset));
+}
+EXPORT_SYMBOL(mshv_vtl_return_call_init);
+
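+/*
+ * Return to the lower VTL: stash the VTL0 rax/rcx in the VP assist page
+ * for the VTL-return hypercall, restore the VTL0 FPU state around the
+ * call, and save it back once this VTL regains control.
+ */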
+void mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0)
+{
+ struct hv_vp_assist_page *hvp;
+
+ hvp = hv_vp_assist_page[smp_processor_id()];
+ hvp->vtl_ret_x64rax = vtl0->rax;
+ hvp->vtl_ret_x64rcx = vtl0->rcx;
+
+ kernel_fpu_begin_mask(0);
+ fxrstor(&vtl0->fx_state);
+ __mshv_vtl_return_call(vtl0);
+ fxsave(&vtl0->fx_state);
+ kernel_fpu_end();
+}
+EXPORT_SYMBOL(mshv_vtl_return_call);
diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c
new file mode 100644
index 000000000000..c3ba12b1bc07
--- /dev/null
+++ b/arch/x86/hyperv/irqdomain.c
@@ -0,0 +1,418 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Irqdomain for Linux to run as the root partition on Microsoft Hypervisor.
+ *
+ * Authors:
+ * Sunil Muthuswamy <sunilmut@microsoft.com>
+ * Wei Liu <wei.liu@kernel.org>
+ */
+
+#include <linux/pci.h>
+#include <linux/irq.h>
+#include <linux/export.h>
+#include <linux/irqchip/irq-msi-lib.h>
+#include <asm/mshyperv.h>
+
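+/*
+ * Ask the hypervisor to map a device interrupt to the given CPU and vector
+ * via HVCALL_MAP_DEVICE_INTERRUPT; the target CPU is expressed as a
+ * single-entry sparse VP set.
+ */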
+static int hv_map_interrupt(union hv_device_id device_id, bool level,
+ int cpu, int vector, struct hv_interrupt_entry *entry)
+{
+ struct hv_input_map_device_interrupt *input;
+ struct hv_output_map_device_interrupt *output;
+ struct hv_device_interrupt_descriptor *intr_desc;
+ unsigned long flags;
+ u64 status;
+ int nr_bank, var_size;
+
+ local_irq_save(flags);
+
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+ intr_desc = &input->interrupt_descriptor;
+ memset(input, 0, sizeof(*input));
+ input->partition_id = hv_current_partition_id;
+ input->device_id = device_id.as_uint64;
+ intr_desc->interrupt_type = HV_X64_INTERRUPT_TYPE_FIXED;
+ intr_desc->vector_count = 1;
+ intr_desc->target.vector = vector;
+
+ if (level)
+ intr_desc->trigger_mode = HV_INTERRUPT_TRIGGER_MODE_LEVEL;
+ else
+ intr_desc->trigger_mode = HV_INTERRUPT_TRIGGER_MODE_EDGE;
+
+ intr_desc->target.vp_set.valid_bank_mask = 0;
+ intr_desc->target.vp_set.format = HV_GENERIC_SET_SPARSE_4K;
+ nr_bank = cpumask_to_vpset(&(intr_desc->target.vp_set), cpumask_of(cpu));
+ if (nr_bank < 0) {
+ local_irq_restore(flags);
+ pr_err("%s: unable to generate VP set\n", __func__);
+ return -EINVAL;
+ }
+ intr_desc->target.flags = HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET;
+
+ /*
+	 * Var-sized hypercall; the variable part starts after vp_mask (thus
+ * vp_set.format does not count, but vp_set.valid_bank_mask
+ * does).
+ */
+ var_size = nr_bank + 1;
+
+ status = hv_do_rep_hypercall(HVCALL_MAP_DEVICE_INTERRUPT, 0, var_size,
+ input, output);
+ *entry = output->interrupt_entry;
+
+ local_irq_restore(flags);
+
+ if (!hv_result_success(status))
+ hv_status_err(status, "\n");
+
+ return hv_result_to_errno(status);
+}
+
+static int hv_unmap_interrupt(u64 id, struct hv_interrupt_entry *old_entry)
+{
+ unsigned long flags;
+ struct hv_input_unmap_device_interrupt *input;
+ struct hv_interrupt_entry *intr_entry;
+ u64 status;
+
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ memset(input, 0, sizeof(*input));
+ intr_entry = &input->interrupt_entry;
+ input->partition_id = hv_current_partition_id;
+ input->device_id = id;
+ *intr_entry = *old_entry;
+
+ status = hv_do_hypercall(HVCALL_UNMAP_DEVICE_INTERRUPT, input, NULL);
+ local_irq_restore(flags);
+
+ if (!hv_result_success(status))
+ hv_status_err(status, "\n");
+
+ return hv_result_to_errno(status);
+}
+
+#ifdef CONFIG_PCI_MSI
+struct rid_data {
+ struct pci_dev *bridge;
+ u32 rid;
+};
+
+static int get_rid_cb(struct pci_dev *pdev, u16 alias, void *data)
+{
+ struct rid_data *rd = data;
+ u8 bus = PCI_BUS_NUM(rd->rid);
+
+ if (pdev->bus->number != bus || PCI_BUS_NUM(alias) != bus) {
+ rd->bridge = pdev;
+ rd->rid = alias;
+ }
+
+ return 0;
+}
+
+static union hv_device_id hv_build_pci_dev_id(struct pci_dev *dev)
+{
+ union hv_device_id dev_id;
+ struct rid_data data = {
+ .bridge = NULL,
+ .rid = PCI_DEVID(dev->bus->number, dev->devfn)
+ };
+
+ pci_for_each_dma_alias(dev, get_rid_cb, &data);
+
+ dev_id.as_uint64 = 0;
+ dev_id.device_type = HV_DEVICE_TYPE_PCI;
+ dev_id.pci.segment = pci_domain_nr(dev->bus);
+
+ dev_id.pci.bdf.bus = PCI_BUS_NUM(data.rid);
+ dev_id.pci.bdf.device = PCI_SLOT(data.rid);
+ dev_id.pci.bdf.function = PCI_FUNC(data.rid);
+ dev_id.pci.source_shadow = HV_SOURCE_SHADOW_NONE;
+
+ if (data.bridge) {
+ int pos;
+
+ /*
+ * Microsoft Hypervisor requires a bus range when the bridge is
+ * running in PCI-X mode.
+ *
+ * To distinguish conventional vs PCI-X bridge, we can check
+ * the bridge's PCI-X Secondary Status Register, Secondary Bus
+ * Mode and Frequency bits. See PCI Express to PCI/PCI-X Bridge
+ * Specification Revision 1.0 5.2.2.1.3.
+ *
+ * Value zero means it is in conventional mode, otherwise it is
+ * in PCI-X mode.
+ */
+
+ pos = pci_find_capability(data.bridge, PCI_CAP_ID_PCIX);
+ if (pos) {
+ u16 status;
+
+ pci_read_config_word(data.bridge, pos +
+ PCI_X_BRIDGE_SSTATUS, &status);
+
+ if (status & PCI_X_SSTATUS_FREQ) {
+ /* Non-zero, PCI-X mode */
+ u8 sec_bus, sub_bus;
+
+ dev_id.pci.source_shadow = HV_SOURCE_SHADOW_BRIDGE_BUS_RANGE;
+
+ pci_read_config_byte(data.bridge, PCI_SECONDARY_BUS, &sec_bus);
+ dev_id.pci.shadow_bus_range.secondary_bus = sec_bus;
+ pci_read_config_byte(data.bridge, PCI_SUBORDINATE_BUS, &sub_bus);
+ dev_id.pci.shadow_bus_range.subordinate_bus = sub_bus;
+ }
+ }
+ }
+
+ return dev_id;
+}
+
+/**
+ * hv_map_msi_interrupt() - "Map" the MSI IRQ in the hypervisor.
+ * @data: Describes the IRQ
+ * @out_entry: Hypervisor (MSI) interrupt entry (can be NULL)
+ *
+ * Map the IRQ in the hypervisor by issuing a MAP_DEVICE_INTERRUPT hypercall.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int hv_map_msi_interrupt(struct irq_data *data,
+ struct hv_interrupt_entry *out_entry)
+{
+ struct irq_cfg *cfg = irqd_cfg(data);
+ struct hv_interrupt_entry dummy;
+ union hv_device_id device_id;
+ struct msi_desc *msidesc;
+ struct pci_dev *dev;
+ int cpu;
+
+ msidesc = irq_data_get_msi_desc(data);
+ dev = msi_desc_to_pci_dev(msidesc);
+ device_id = hv_build_pci_dev_id(dev);
+ cpu = cpumask_first(irq_data_get_effective_affinity_mask(data));
+
+ return hv_map_interrupt(device_id, false, cpu, cfg->vector,
+ out_entry ? out_entry : &dummy);
+}
+EXPORT_SYMBOL_GPL(hv_map_msi_interrupt);
+
+static inline void entry_to_msi_msg(struct hv_interrupt_entry *entry, struct msi_msg *msg)
+{
+ /* High address is always 0 */
+ msg->address_hi = 0;
+ msg->address_lo = entry->msi_entry.address.as_uint32;
+ msg->data = entry->msi_entry.data.as_uint32;
+}
+
+static int hv_unmap_msi_interrupt(struct pci_dev *dev, struct hv_interrupt_entry *old_entry);
+static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
+{
+ struct hv_interrupt_entry *stored_entry;
+ struct irq_cfg *cfg = irqd_cfg(data);
+ struct msi_desc *msidesc;
+ struct pci_dev *dev;
+ int ret;
+
+ msidesc = irq_data_get_msi_desc(data);
+ dev = msi_desc_to_pci_dev(msidesc);
+
+ if (!cfg) {
+		pr_debug("%s: cfg is NULL\n", __func__);
+ return;
+ }
+
+ if (data->chip_data) {
+ /*
+ * This interrupt is already mapped. Let's unmap first.
+ *
+ * We don't use retarget interrupt hypercalls here because
+ * Microsoft Hypervisor doesn't allow root to change the vector
+ * or specify VPs outside of the set that is initially used
+ * during mapping.
+ */
+ stored_entry = data->chip_data;
+ data->chip_data = NULL;
+
+ ret = hv_unmap_msi_interrupt(dev, stored_entry);
+
+ kfree(stored_entry);
+
+ if (ret)
+ return;
+ }
+
+ stored_entry = kzalloc(sizeof(*stored_entry), GFP_ATOMIC);
+ if (!stored_entry) {
+ pr_debug("%s: failed to allocate chip data\n", __func__);
+ return;
+ }
+
+ ret = hv_map_msi_interrupt(data, stored_entry);
+ if (ret) {
+ kfree(stored_entry);
+ return;
+ }
+
+ data->chip_data = stored_entry;
+ entry_to_msi_msg(data->chip_data, msg);
+
+ return;
+}
+
+static int hv_unmap_msi_interrupt(struct pci_dev *dev, struct hv_interrupt_entry *old_entry)
+{
+ return hv_unmap_interrupt(hv_build_pci_dev_id(dev).as_uint64, old_entry);
+}
+
+static void hv_teardown_msi_irq(struct pci_dev *dev, struct irq_data *irqd)
+{
+ struct hv_interrupt_entry old_entry;
+ struct msi_msg msg;
+
+ if (!irqd->chip_data) {
+		pr_debug("%s: no chip data!\n", __func__);
+ return;
+ }
+
+ old_entry = *(struct hv_interrupt_entry *)irqd->chip_data;
+ entry_to_msi_msg(&old_entry, &msg);
+
+ kfree(irqd->chip_data);
+ irqd->chip_data = NULL;
+
+ (void)hv_unmap_msi_interrupt(dev, &old_entry);
+}
+
+/*
+ * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
+ * which implement the MSI or MSI-X Capability Structure.
+ */
+static struct irq_chip hv_pci_msi_controller = {
+ .name = "HV-PCI-MSI",
+ .irq_ack = irq_chip_ack_parent,
+ .irq_compose_msi_msg = hv_irq_compose_msi_msg,
+ .irq_set_affinity = irq_chip_set_affinity_parent,
+};
+
+static bool hv_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
+ struct irq_domain *real_parent, struct msi_domain_info *info)
+{
+ struct irq_chip *chip = info->chip;
+
+ if (!msi_lib_init_dev_msi_info(dev, domain, real_parent, info))
+ return false;
+
+ chip->flags |= IRQCHIP_SKIP_SET_WAKE | IRQCHIP_MOVE_DEFERRED;
+
+ info->ops->msi_prepare = pci_msi_prepare;
+
+ return true;
+}
+
+#define HV_MSI_FLAGS_SUPPORTED (MSI_GENERIC_FLAGS_MASK | MSI_FLAG_PCI_MSIX)
+#define HV_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS)
+
+static struct msi_parent_ops hv_msi_parent_ops = {
+ .supported_flags = HV_MSI_FLAGS_SUPPORTED,
+ .required_flags = HV_MSI_FLAGS_REQUIRED,
+ .bus_select_token = DOMAIN_BUS_NEXUS,
+ .bus_select_mask = MATCH_PCI_MSI,
+ .chip_flags = MSI_CHIP_FLAG_SET_ACK,
+ .prefix = "HV-",
+ .init_dev_msi_info = hv_init_dev_msi_info,
+};
+
+static int hv_msi_domain_alloc(struct irq_domain *d, unsigned int virq, unsigned int nr_irqs,
+ void *arg)
+{
+ /*
+ * TODO: The allocation bits of hv_irq_compose_msi_msg(), i.e. everything except
+ * entry_to_msi_msg() should be in here.
+ */
+
+ int ret;
+
+ ret = irq_domain_alloc_irqs_parent(d, virq, nr_irqs, arg);
+ if (ret)
+ return ret;
+
+ for (int i = 0; i < nr_irqs; ++i) {
+ irq_domain_set_info(d, virq + i, 0, &hv_pci_msi_controller, NULL,
+ handle_edge_irq, NULL, "edge");
+ }
+ return 0;
+}
+
+static void hv_msi_domain_free(struct irq_domain *d, unsigned int virq, unsigned int nr_irqs)
+{
+ for (int i = 0; i < nr_irqs; ++i) {
+ struct irq_data *irqd = irq_domain_get_irq_data(d, virq);
+ struct msi_desc *desc;
+
+ desc = irq_data_get_msi_desc(irqd);
+ if (!desc || !desc->irq || WARN_ON_ONCE(!dev_is_pci(desc->dev)))
+ continue;
+
+ hv_teardown_msi_irq(to_pci_dev(desc->dev), irqd);
+ }
+ irq_domain_free_irqs_top(d, virq, nr_irqs);
+}
+
+static const struct irq_domain_ops hv_msi_domain_ops = {
+ .select = msi_lib_irq_domain_select,
+ .alloc = hv_msi_domain_alloc,
+ .free = hv_msi_domain_free,
+};
+
+struct irq_domain * __init hv_create_pci_msi_domain(void)
+{
+ struct irq_domain *d = NULL;
+
+ struct irq_domain_info info = {
+ .fwnode = irq_domain_alloc_named_fwnode("HV-PCI-MSI"),
+ .ops = &hv_msi_domain_ops,
+ .parent = x86_vector_domain,
+ };
+
+ if (info.fwnode)
+ d = msi_create_parent_irq_domain(&info, &hv_msi_parent_ops);
+
+ /* No point in going further if we can't get an irq domain */
+ BUG_ON(!d);
+
+ return d;
+}
+
+#endif /* CONFIG_PCI_MSI */
+
+int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry)
+{
+ union hv_device_id device_id;
+
+ device_id.as_uint64 = 0;
+ device_id.device_type = HV_DEVICE_TYPE_IOAPIC;
+ device_id.ioapic.ioapic_id = (u8)ioapic_id;
+
+ return hv_unmap_interrupt(device_id.as_uint64, entry);
+}
+EXPORT_SYMBOL_GPL(hv_unmap_ioapic_interrupt);
+
+int hv_map_ioapic_interrupt(int ioapic_id, bool level, int cpu, int vector,
+ struct hv_interrupt_entry *entry)
+{
+ union hv_device_id device_id;
+
+ device_id.as_uint64 = 0;
+ device_id.device_type = HV_DEVICE_TYPE_IOAPIC;
+ device_id.ioapic.ioapic_id = (u8)ioapic_id;
+
+ return hv_map_interrupt(device_id, level, cpu, vector, entry);
+}
+EXPORT_SYMBOL_GPL(hv_map_ioapic_interrupt);
diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c
new file mode 100644
index 000000000000..651771534cae
--- /dev/null
+++ b/arch/x86/hyperv/ivm.c
@@ -0,0 +1,945 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Hyper-V Isolation VM interface with paravisor and hypervisor
+ *
+ * Author:
+ * Tianyu Lan <Tianyu.Lan@microsoft.com>
+ */
+
+#include <linux/bitfield.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/cpu.h>
+#include <linux/export.h>
+#include <asm/svm.h>
+#include <asm/sev.h>
+#include <asm/io.h>
+#include <asm/coco.h>
+#include <asm/mem_encrypt.h>
+#include <asm/set_memory.h>
+#include <asm/mshyperv.h>
+#include <asm/hypervisor.h>
+#include <asm/mtrr.h>
+#include <asm/io_apic.h>
+#include <asm/realmode.h>
+#include <asm/e820/api.h>
+#include <asm/desc.h>
+#include <asm/msr.h>
+#include <uapi/asm/vmx.h>
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+
+#define GHCB_USAGE_HYPERV_CALL 1
+
+union hv_ghcb {
+ struct ghcb ghcb;
+ struct {
+ u64 hypercalldata[509];
+ u64 outputgpa;
+ union {
+ union {
+ struct {
+ u32 callcode : 16;
+ u32 isfast : 1;
+ u32 reserved1 : 14;
+ u32 isnested : 1;
+ u32 countofelements : 12;
+ u32 reserved2 : 4;
+ u32 repstartindex : 12;
+ u32 reserved3 : 4;
+ };
+ u64 asuint64;
+ } hypercallinput;
+ union {
+ struct {
+ u16 callstatus;
+ u16 reserved1;
+ u32 elementsprocessed : 12;
+ u32 reserved2 : 20;
+ };
+				u64 asuint64;
+ } hypercalloutput;
+ };
+ u64 reserved2;
+ } hypercall;
+} __packed __aligned(HV_HYP_PAGE_SIZE);
+
+/* Only used in an SNP VM with the paravisor */
+static u16 hv_ghcb_version __ro_after_init;
+
+/* Functions only used in an SNP VM with the paravisor go here. */
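+/*
+ * Issue a hypercall through the GHCB page: the input block is copied into
+ * the GHCB hypercall area, the page usage is set to GHCB_USAGE_HYPERV_CALL,
+ * and VMGEXIT hands the request to the hypervisor.
+ */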
+u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size)
+{
+ union hv_ghcb *hv_ghcb;
+ void **ghcb_base;
+ unsigned long flags;
+ u64 status;
+
+ if (!hv_ghcb_pg)
+ return -EFAULT;
+
+ WARN_ON(in_nmi());
+
+ local_irq_save(flags);
+ ghcb_base = (void **)this_cpu_ptr(hv_ghcb_pg);
+ hv_ghcb = (union hv_ghcb *)*ghcb_base;
+ if (!hv_ghcb) {
+ local_irq_restore(flags);
+ return -EFAULT;
+ }
+
+ hv_ghcb->ghcb.protocol_version = GHCB_PROTOCOL_MAX;
+ hv_ghcb->ghcb.ghcb_usage = GHCB_USAGE_HYPERV_CALL;
+
+ hv_ghcb->hypercall.outputgpa = (u64)output;
+ hv_ghcb->hypercall.hypercallinput.asuint64 = 0;
+ hv_ghcb->hypercall.hypercallinput.callcode = control;
+
+ if (input_size)
+ memcpy(hv_ghcb->hypercall.hypercalldata, input, input_size);
+
+ VMGEXIT();
+
+ hv_ghcb->ghcb.ghcb_usage = 0xffffffff;
+ memset(hv_ghcb->ghcb.save.valid_bitmap, 0,
+ sizeof(hv_ghcb->ghcb.save.valid_bitmap));
+
+ status = hv_ghcb->hypercall.hypercalloutput.callstatus;
+
+ local_irq_restore(flags);
+
+ return status;
+}
+
+static inline u64 rd_ghcb_msr(void)
+{
+ return native_rdmsrq(MSR_AMD64_SEV_ES_GHCB);
+}
+
+static inline void wr_ghcb_msr(u64 val)
+{
+ native_wrmsrq(MSR_AMD64_SEV_ES_GHCB, val);
+}
+
+static enum es_result hv_ghcb_hv_call(struct ghcb *ghcb, u64 exit_code,
+ u64 exit_info_1, u64 exit_info_2)
+{
+ /* Fill in protocol and format specifiers */
+ ghcb->protocol_version = hv_ghcb_version;
+ ghcb->ghcb_usage = GHCB_DEFAULT_USAGE;
+
+ ghcb_set_sw_exit_code(ghcb, exit_code);
+ ghcb_set_sw_exit_info_1(ghcb, exit_info_1);
+ ghcb_set_sw_exit_info_2(ghcb, exit_info_2);
+
+ VMGEXIT();
+
+ if (ghcb->save.sw_exit_info_1 & GENMASK_ULL(31, 0))
+ return ES_VMM_ERROR;
+ else
+ return ES_OK;
+}
+
+void __noreturn hv_ghcb_terminate(unsigned int set, unsigned int reason)
+{
+ u64 val = GHCB_MSR_TERM_REQ;
+
+ /* Tell the hypervisor what went wrong. */
+ val |= GHCB_SEV_TERM_REASON(set, reason);
+
+ /* Request Guest Termination from Hypervisor */
+ wr_ghcb_msr(val);
+ VMGEXIT();
+
+ while (true)
+ asm volatile("hlt\n" : : : "memory");
+}
+
+bool hv_ghcb_negotiate_protocol(void)
+{
+ u64 ghcb_gpa;
+ u64 val;
+
+ /* Save ghcb page gpa. */
+ ghcb_gpa = rd_ghcb_msr();
+
+ /* Do the GHCB protocol version negotiation */
+ wr_ghcb_msr(GHCB_MSR_SEV_INFO_REQ);
+ VMGEXIT();
+ val = rd_ghcb_msr();
+
+ if (GHCB_MSR_INFO(val) != GHCB_MSR_SEV_INFO_RESP)
+ return false;
+
+ if (GHCB_MSR_PROTO_MAX(val) < GHCB_PROTOCOL_MIN ||
+ GHCB_MSR_PROTO_MIN(val) > GHCB_PROTOCOL_MAX)
+ return false;
+
+ hv_ghcb_version = min_t(size_t, GHCB_MSR_PROTO_MAX(val),
+ GHCB_PROTOCOL_MAX);
+
+ /* Write ghcb page back after negotiating protocol. */
+ wr_ghcb_msr(ghcb_gpa);
+ VMGEXIT();
+
+ return true;
+}
+
+static void hv_ghcb_msr_write(u64 msr, u64 value)
+{
+ union hv_ghcb *hv_ghcb;
+ void **ghcb_base;
+ unsigned long flags;
+
+ if (!hv_ghcb_pg)
+ return;
+
+ WARN_ON(in_nmi());
+
+ local_irq_save(flags);
+ ghcb_base = (void **)this_cpu_ptr(hv_ghcb_pg);
+ hv_ghcb = (union hv_ghcb *)*ghcb_base;
+ if (!hv_ghcb) {
+ local_irq_restore(flags);
+ return;
+ }
+
+ ghcb_set_rcx(&hv_ghcb->ghcb, msr);
+ ghcb_set_rax(&hv_ghcb->ghcb, lower_32_bits(value));
+ ghcb_set_rdx(&hv_ghcb->ghcb, upper_32_bits(value));
+
+ if (hv_ghcb_hv_call(&hv_ghcb->ghcb, SVM_EXIT_MSR, 1, 0))
+		pr_warn("Failed to write MSR %llx via GHCB\n", msr);
+
+ local_irq_restore(flags);
+}
+
+static void hv_ghcb_msr_read(u64 msr, u64 *value)
+{
+ union hv_ghcb *hv_ghcb;
+ void **ghcb_base;
+ unsigned long flags;
+
+ /* Check size of union hv_ghcb here. */
+ BUILD_BUG_ON(sizeof(union hv_ghcb) != HV_HYP_PAGE_SIZE);
+
+ if (!hv_ghcb_pg)
+ return;
+
+ WARN_ON(in_nmi());
+
+ local_irq_save(flags);
+ ghcb_base = (void **)this_cpu_ptr(hv_ghcb_pg);
+ hv_ghcb = (union hv_ghcb *)*ghcb_base;
+ if (!hv_ghcb) {
+ local_irq_restore(flags);
+ return;
+ }
+
+ ghcb_set_rcx(&hv_ghcb->ghcb, msr);
+ if (hv_ghcb_hv_call(&hv_ghcb->ghcb, SVM_EXIT_MSR, 0, 0))
+		pr_warn("Failed to read MSR %llx via GHCB\n", msr);
+ else
+ *value = (u64)lower_32_bits(hv_ghcb->ghcb.save.rax)
+ | ((u64)lower_32_bits(hv_ghcb->ghcb.save.rdx) << 32);
+ local_irq_restore(flags);
+}
+
+/* Only used in a fully enlightened SNP VM, i.e. without the paravisor */
+static u8 ap_start_input_arg[PAGE_SIZE] __bss_decrypted __aligned(PAGE_SIZE);
+static u8 ap_start_stack[PAGE_SIZE] __aligned(PAGE_SIZE);
+static DEFINE_PER_CPU(struct sev_es_save_area *, hv_sev_vmsa);
+
+/* Functions only used in an SNP VM without the paravisor go here. */
+
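+/*
+ * Fill a VMSA segment register image from the live GDT: the 16 bits at
+ * byte offset 5 of the descriptor hold the access rights, which are
+ * repacked into the compressed attribute format the VMSA expects.
+ */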
+#define hv_populate_vmcb_seg(seg, gdtr_base) \
+do { \
+ if (seg.selector) { \
+ seg.base = 0; \
+ seg.limit = HV_AP_SEGMENT_LIMIT; \
+ seg.attrib = *(u16 *)(gdtr_base + seg.selector + 5); \
+ seg.attrib = (seg.attrib & 0xFF) | ((seg.attrib >> 4) & 0xF00); \
+ } \
+} while (0) \
+
+static int snp_set_vmsa(void *va, bool vmsa)
+{
+ u64 attrs;
+
+ /*
+ * Running at VMPL0 allows the kernel to change the VMSA bit for a page
+ * using the RMPADJUST instruction. However, for the instruction to
+ * succeed it must target the permissions of a lesser privileged
+ * (higher numbered) VMPL level, so use VMPL1 (refer to the RMPADJUST
+ * instruction in the AMD64 APM Volume 3).
+ */
+ attrs = 1;
+ if (vmsa)
+ attrs |= RMPADJUST_VMSA_PAGE_BIT;
+
+ return rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs);
+}
+
+static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa)
+{
+ int err;
+
+ err = snp_set_vmsa(vmsa, false);
+ if (err)
+ pr_err("clear VMSA page failed (%u), leaking page\n", err);
+ else
+ free_page((unsigned long)vmsa);
+}
+
+int hv_snp_boot_ap(u32 apic_id, unsigned long start_ip, unsigned int cpu)
+{
+ struct sev_es_save_area *vmsa = (struct sev_es_save_area *)
+ __get_free_page(GFP_KERNEL | __GFP_ZERO);
+ struct sev_es_save_area *cur_vmsa;
+ struct desc_ptr gdtr;
+ u64 ret, retry = 5;
+ struct hv_enable_vp_vtl *start_vp_input;
+ unsigned long flags;
+ int vp_index;
+
+ if (!vmsa)
+ return -ENOMEM;
+
+	/* Find the Hyper-V VP index, which might not be the same as the APIC ID */
+ vp_index = hv_apicid_to_vp_index(apic_id);
+ if (vp_index < 0 || vp_index > ms_hyperv.max_vp_index)
+ return -EINVAL;
+
+ native_store_gdt(&gdtr);
+
+ vmsa->gdtr.base = gdtr.address;
+ vmsa->gdtr.limit = gdtr.size;
+
+ asm volatile("movl %%es, %%eax;" : "=a" (vmsa->es.selector));
+ hv_populate_vmcb_seg(vmsa->es, vmsa->gdtr.base);
+
+ asm volatile("movl %%cs, %%eax;" : "=a" (vmsa->cs.selector));
+ hv_populate_vmcb_seg(vmsa->cs, vmsa->gdtr.base);
+
+ asm volatile("movl %%ss, %%eax;" : "=a" (vmsa->ss.selector));
+ hv_populate_vmcb_seg(vmsa->ss, vmsa->gdtr.base);
+
+ asm volatile("movl %%ds, %%eax;" : "=a" (vmsa->ds.selector));
+ hv_populate_vmcb_seg(vmsa->ds, vmsa->gdtr.base);
+
+ vmsa->efer = native_read_msr(MSR_EFER);
+
+ vmsa->cr4 = native_read_cr4();
+ vmsa->cr3 = __native_read_cr3();
+ vmsa->cr0 = native_read_cr0();
+
+ vmsa->xcr0 = 1;
+ vmsa->g_pat = HV_AP_INIT_GPAT_DEFAULT;
+ vmsa->rip = (u64)secondary_startup_64_no_verify;
+ vmsa->rsp = (u64)&ap_start_stack[PAGE_SIZE];
+
+ /*
+ * Set the SNP-specific fields for this VMSA:
+ * VMPL level
+ * SEV_FEATURES (matches the SEV STATUS MSR right shifted 2 bits)
+ */
+ vmsa->vmpl = 0;
+ vmsa->sev_features = sev_status >> 2;
+
+ ret = snp_set_vmsa(vmsa, true);
+ if (ret) {
+ pr_err("RMPADJUST(%llx) failed: %llx\n", (u64)vmsa, ret);
+ free_page((u64)vmsa);
+ return ret;
+ }
+
+ local_irq_save(flags);
+ start_vp_input = (struct hv_enable_vp_vtl *)ap_start_input_arg;
+ memset(start_vp_input, 0, sizeof(*start_vp_input));
+ start_vp_input->partition_id = -1;
+ start_vp_input->vp_index = vp_index;
+ start_vp_input->target_vtl.target_vtl = ms_hyperv.vtl;
+ *(u64 *)&start_vp_input->vp_context = __pa(vmsa) | 1;
+
+ do {
+ ret = hv_do_hypercall(HVCALL_START_VP,
+ start_vp_input, NULL);
+ } while (hv_result(ret) == HV_STATUS_TIME_OUT && retry--);
+
+ local_irq_restore(flags);
+
+ if (!hv_result_success(ret)) {
+ pr_err("HvCallStartVirtualProcessor failed: %llx\n", ret);
+ snp_cleanup_vmsa(vmsa);
+ vmsa = NULL;
+ }
+
+ cur_vmsa = per_cpu(hv_sev_vmsa, cpu);
+ /* Free up any previous VMSA page */
+ if (cur_vmsa)
+ snp_cleanup_vmsa(cur_vmsa);
+
+ /* Record the current VMSA page */
+ per_cpu(hv_sev_vmsa, cpu) = vmsa;
+
+ return ret;
+}
+
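+/*
+ * Direct hypercall for an SNP VM without a paravisor: the control word goes
+ * in RCX, the parameters in RDX and R8, and VMMCALL traps to the hypervisor,
+ * which returns the status in RAX.
+ */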
+u64 hv_snp_hypercall(u64 control, u64 param1, u64 param2)
+{
+ u64 hv_status;
+
+ register u64 __r8 asm("r8") = param2;
+ asm volatile("vmmcall"
+ : "=a" (hv_status), ASM_CALL_CONSTRAINT,
+ "+c" (control), "+d" (param1), "+r" (__r8)
+ : : "cc", "memory", "r9", "r10", "r11");
+
+ return hv_status;
+}
+
+#else
+static inline void hv_ghcb_msr_write(u64 msr, u64 value) {}
+static inline void hv_ghcb_msr_read(u64 msr, u64 *value) {}
+u64 hv_snp_hypercall(u64 control, u64 param1, u64 param2) { return U64_MAX; }
+#endif /* CONFIG_AMD_MEM_ENCRYPT */
+
+#ifdef CONFIG_INTEL_TDX_GUEST
+static void hv_tdx_msr_write(u64 msr, u64 val)
+{
+ struct tdx_module_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = EXIT_REASON_MSR_WRITE,
+ .r12 = msr,
+ .r13 = val,
+ };
+
+ u64 ret = __tdx_hypercall(&args);
+
+ WARN_ONCE(ret, "Failed to emulate MSR write: %lld\n", ret);
+}
+
+static void hv_tdx_msr_read(u64 msr, u64 *val)
+{
+ struct tdx_module_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = EXIT_REASON_MSR_READ,
+ .r12 = msr,
+ };
+
+ u64 ret = __tdx_hypercall(&args);
+
+ if (WARN_ONCE(ret, "Failed to emulate MSR read: %lld\n", ret))
+ *val = 0;
+ else
+ *val = args.r11;
+}
+
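+/*
+ * Direct hypercall for a TDX VM without a paravisor: the control word goes
+ * in R10, the parameters in RDX and R8, and the status comes back in R11 of
+ * the TDVMCALL.
+ */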
+u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2)
+{
+ struct tdx_module_args args = { };
+
+ args.r10 = control;
+ args.rdx = param1;
+ args.r8 = param2;
+
+ (void)__tdx_hypercall(&args);
+
+ return args.r11;
+}
+
+#else
+static inline void hv_tdx_msr_write(u64 msr, u64 value) {}
+static inline void hv_tdx_msr_read(u64 msr, u64 *value) {}
+u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2) { return U64_MAX; }
+#endif /* CONFIG_INTEL_TDX_GUEST */
+
+#if defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST)
+void hv_ivm_msr_write(u64 msr, u64 value)
+{
+ if (!ms_hyperv.paravisor_present)
+ return;
+
+ if (hv_isolation_type_tdx())
+ hv_tdx_msr_write(msr, value);
+ else if (hv_isolation_type_snp())
+ hv_ghcb_msr_write(msr, value);
+}
+
+void hv_ivm_msr_read(u64 msr, u64 *value)
+{
+ if (!ms_hyperv.paravisor_present)
+ return;
+
+ if (hv_isolation_type_tdx())
+ hv_tdx_msr_read(msr, value);
+ else if (hv_isolation_type_snp())
+ hv_ghcb_msr_read(msr, value);
+}
+
+/*
+ * Keep track of the PFN regions which were shared with the host. The access
+ * must be revoked upon kexec/kdump (see hv_vtom_kexec_finish()).
+ */
+struct hv_enc_pfn_region {
+ struct list_head list;
+ u64 pfn;
+ int count;
+};
+
+static LIST_HEAD(hv_list_enc);
+static DEFINE_RAW_SPINLOCK(hv_list_enc_lock);
+
+static int hv_list_enc_add(const u64 *pfn_list, int count)
+{
+ struct hv_enc_pfn_region *ent;
+ unsigned long flags;
+ u64 pfn;
+ int i;
+
+ for (i = 0; i < count; i++) {
+ pfn = pfn_list[i];
+
+ raw_spin_lock_irqsave(&hv_list_enc_lock, flags);
+ /* Check if the PFN already exists in some region first */
+ list_for_each_entry(ent, &hv_list_enc, list) {
+ if ((ent->pfn <= pfn) && (ent->pfn + ent->count - 1 >= pfn))
+ /* Nothing to do - pfn is already in the list */
+ goto unlock_done;
+ }
+
+ /*
+ * Check if the PFN is adjacent to an existing region. Growing
+ * a region can make it adjacent to another one but merging is
+ * not (yet) implemented for simplicity. A PFN cannot be added
+ * to two regions to keep the logic in hv_list_enc_remove()
+ * correct.
+ */
+ list_for_each_entry(ent, &hv_list_enc, list) {
+ if (ent->pfn + ent->count == pfn) {
+ /* Grow existing region up */
+ ent->count++;
+ goto unlock_done;
+ } else if (pfn + 1 == ent->pfn) {
+ /* Grow existing region down */
+ ent->pfn--;
+ ent->count++;
+ goto unlock_done;
+ }
+ }
+ raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags);
+
+ /* No adjacent region found -- create a new one */
+ ent = kzalloc(sizeof(struct hv_enc_pfn_region), GFP_KERNEL);
+ if (!ent)
+ return -ENOMEM;
+
+ ent->pfn = pfn;
+ ent->count = 1;
+
+ raw_spin_lock_irqsave(&hv_list_enc_lock, flags);
+ list_add(&ent->list, &hv_list_enc);
+
+unlock_done:
+ raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags);
+ }
+
+ return 0;
+}
+
+static int hv_list_enc_remove(const u64 *pfn_list, int count)
+{
+ struct hv_enc_pfn_region *ent, *t;
+ struct hv_enc_pfn_region new_region;
+ unsigned long flags;
+ u64 pfn;
+ int i;
+
+ for (i = 0; i < count; i++) {
+ pfn = pfn_list[i];
+
+ raw_spin_lock_irqsave(&hv_list_enc_lock, flags);
+ list_for_each_entry_safe(ent, t, &hv_list_enc, list) {
+ if (pfn == ent->pfn + ent->count - 1) {
+ /* Removing tail pfn */
+ ent->count--;
+ if (!ent->count) {
+ list_del(&ent->list);
+ kfree(ent);
+ }
+ goto unlock_done;
+ } else if (pfn == ent->pfn) {
+ /* Removing head pfn */
+ ent->count--;
+ ent->pfn++;
+ if (!ent->count) {
+ list_del(&ent->list);
+ kfree(ent);
+ }
+ goto unlock_done;
+ } else if (pfn > ent->pfn && pfn < ent->pfn + ent->count - 1) {
+ /*
+ * Removing a pfn in the middle. Cut off the tail
+ * of the existing region and create a template for
+ * the new one.
+ */
+ new_region.pfn = pfn + 1;
+ new_region.count = ent->count - (pfn - ent->pfn + 1);
+ ent->count = pfn - ent->pfn;
+ goto unlock_split;
+ }
+
+ }
+unlock_done:
+ raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags);
+ continue;
+
+unlock_split:
+ raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags);
+
+ ent = kzalloc(sizeof(struct hv_enc_pfn_region), GFP_KERNEL);
+ if (!ent)
+ return -ENOMEM;
+
+ ent->pfn = new_region.pfn;
+ ent->count = new_region.count;
+
+ raw_spin_lock_irqsave(&hv_list_enc_lock, flags);
+ list_add(&ent->list, &hv_list_enc);
+ raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags);
+ }
+
+ return 0;
+}
+
+/* Stop new private<->shared conversions */
+static void hv_vtom_kexec_begin(void)
+{
+ if (!IS_ENABLED(CONFIG_KEXEC_CORE))
+ return;
+
+ /*
+	 * The crash kernel reaches here with interrupts disabled: we can't
+	 * wait for conversions to finish.
+	 *
+	 * If a race happens, just report and proceed.
+ */
+ if (!set_memory_enc_stop_conversion())
+ pr_warn("Failed to stop shared<->private conversions\n");
+}
+
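+/*
+ * Walk the list of PFN regions that were shared with the host and revoke
+ * the host's access to them, batching up to HV_MAX_MODIFY_GPA_REP_COUNT
+ * pages per hypercall.
+ */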
+static void hv_vtom_kexec_finish(void)
+{
+ struct hv_gpa_range_for_visibility *input;
+ struct hv_enc_pfn_region *ent;
+ unsigned long flags;
+ u64 hv_status;
+ int cur, i;
+
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ if (unlikely(!input))
+ goto out;
+
+ list_for_each_entry(ent, &hv_list_enc, list) {
+ for (i = 0, cur = 0; i < ent->count; i++) {
+ input->gpa_page_list[cur] = ent->pfn + i;
+ cur++;
+
+ if (cur == HV_MAX_MODIFY_GPA_REP_COUNT || i == ent->count - 1) {
+ input->partition_id = HV_PARTITION_ID_SELF;
+ input->host_visibility = VMBUS_PAGE_NOT_VISIBLE;
+ input->reserved0 = 0;
+ input->reserved1 = 0;
+ hv_status = hv_do_rep_hypercall(
+ HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY,
+ cur, 0, input, NULL);
+ WARN_ON_ONCE(!hv_result_success(hv_status));
+ cur = 0;
+ }
+ }
+
+ }
+
+out:
+ local_irq_restore(flags);
+}
+
+/*
+ * hv_mark_gpa_visibility - Set pages visible to host via hvcall.
+ *
+ * In Isolation VM, all guest memory is encrypted from host and guest
+ * needs to set memory visible to host via hvcall before sharing memory
+ * with host.
+ */
+static int hv_mark_gpa_visibility(u16 count, const u64 pfn[],
+ enum hv_mem_host_visibility visibility)
+{
+ struct hv_gpa_range_for_visibility *input;
+ u64 hv_status;
+ unsigned long flags;
+ int ret;
+
+ /* no-op if partition isolation is not enabled */
+ if (!hv_is_isolation_supported())
+ return 0;
+
+ if (count > HV_MAX_MODIFY_GPA_REP_COUNT) {
+ pr_err("Hyper-V: GPA count:%d exceeds supported:%lu\n", count,
+ HV_MAX_MODIFY_GPA_REP_COUNT);
+ return -EINVAL;
+ }
+
+ if (visibility == VMBUS_PAGE_NOT_VISIBLE)
+ ret = hv_list_enc_remove(pfn, count);
+ else
+ ret = hv_list_enc_add(pfn, count);
+ if (ret)
+ return ret;
+
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ if (unlikely(!input)) {
+ local_irq_restore(flags);
+ return -EINVAL;
+ }
+
+ input->partition_id = HV_PARTITION_ID_SELF;
+ input->host_visibility = visibility;
+ input->reserved0 = 0;
+ input->reserved1 = 0;
+ memcpy((void *)input->gpa_page_list, pfn, count * sizeof(*pfn));
+ hv_status = hv_do_rep_hypercall(
+ HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY, count,
+ 0, input, NULL);
+ local_irq_restore(flags);
+
+ if (hv_result_success(hv_status))
+ return 0;
+
+ if (visibility == VMBUS_PAGE_NOT_VISIBLE)
+ ret = hv_list_enc_add(pfn, count);
+ else
+ ret = hv_list_enc_remove(pfn, count);
+ /*
+	 * There's no good way to recover from -ENOMEM here; the accounting is
+ * wrong either way.
+ */
+ WARN_ON_ONCE(ret);
+
+ return -EFAULT;
+}
+
+/*
+ * When transitioning memory between encrypted and decrypted, the caller
+ * of set_memory_encrypted() or set_memory_decrypted() is responsible for
+ * ensuring that the memory isn't in use and isn't referenced while the
+ * transition is in progress. The transition has multiple steps, and the
+ * memory is in an inconsistent state until all steps are complete. A
+ * reference while the state is inconsistent could result in an exception
+ * that can't be cleanly fixed up.
+ *
+ * But the Linux kernel load_unaligned_zeropad() mechanism could cause a
+ * stray reference that can't be prevented by the caller, so Linux has
+ * specific code to handle this case. But when the #VC and #VE exceptions
+ * are routed to a paravisor, the specific code doesn't work. To avoid this
+ * problem, mark the pages as "not present" while the transition is in
+ * progress. If load_unaligned_zeropad() causes a stray reference, a normal
+ * page fault is generated instead of #VC or #VE, and the page-fault-based
+ * handlers for load_unaligned_zeropad() resolve the reference. When the
+ * transition is complete, hv_vtom_set_host_visibility() marks the pages
+ * as "present" again.
+ */
+static int hv_vtom_clear_present(unsigned long kbuffer, int pagecount, bool enc)
+{
+ return set_memory_np(kbuffer, pagecount);
+}
+
+/*
+ * hv_vtom_set_host_visibility - Set specified memory visible to host.
+ *
+ * In Isolation VM, all guest memory is encrypted from host and guest
+ * needs to set memory visible to host via hvcall before sharing memory
+ * with host. This function works as wrap of hv_mark_gpa_visibility()
+ * with memory base and size.
+ */
+static int hv_vtom_set_host_visibility(unsigned long kbuffer, int pagecount, bool enc)
+{
+ enum hv_mem_host_visibility visibility = enc ?
+ VMBUS_PAGE_NOT_VISIBLE : VMBUS_PAGE_VISIBLE_READ_WRITE;
+ u64 *pfn_array;
+ phys_addr_t paddr;
+ int i, pfn, err;
+ void *vaddr;
+ int ret = 0;
+
+ pfn_array = kmalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL);
+ if (!pfn_array) {
+ ret = -ENOMEM;
+ goto err_set_memory_p;
+ }
+
+ for (i = 0, pfn = 0; i < pagecount; i++) {
+ /*
+ * Use slow_virt_to_phys() because the PRESENT bit has been
+ * temporarily cleared in the PTEs. slow_virt_to_phys() works
+ * without the PRESENT bit while virt_to_hvpfn() or similar
+ * does not.
+ */
+ vaddr = (void *)kbuffer + (i * HV_HYP_PAGE_SIZE);
+ paddr = slow_virt_to_phys(vaddr);
+ pfn_array[pfn] = paddr >> HV_HYP_PAGE_SHIFT;
+ pfn++;
+
+ if (pfn == HV_MAX_MODIFY_GPA_REP_COUNT || i == pagecount - 1) {
+ ret = hv_mark_gpa_visibility(pfn, pfn_array,
+ visibility);
+ if (ret)
+ goto err_free_pfn_array;
+ pfn = 0;
+ }
+ }
+
+err_free_pfn_array:
+ kfree(pfn_array);
+
+err_set_memory_p:
+ /*
+ * Set the PTE PRESENT bits again to revert what hv_vtom_clear_present()
+ * did. Do this even if there is an error earlier in this function in
+ * order to avoid leaving the memory range in a "broken" state. Setting
+ * the PRESENT bits shouldn't fail, but return an error if it does.
+ */
+ err = set_memory_p(kbuffer, pagecount);
+ if (err && !ret)
+ ret = err;
+
+ return ret;
+}
+
+static bool hv_vtom_tlb_flush_required(bool private)
+{
+ /*
+ * Since hv_vtom_clear_present() marks the PTEs as "not present"
+ * and flushes the TLB, they can't be in the TLB. That makes the
+ * flush controlled by this function redundant, so return "false".
+ */
+ return false;
+}
+
+static bool hv_vtom_cache_flush_required(void)
+{
+ return false;
+}
+
+static bool hv_is_private_mmio(u64 addr)
+{
+ /*
+ * Hyper-V always provides a single IO-APIC in a guest VM.
+ * When a paravisor is used, it is emulated by the paravisor
+ * in the guest context and must be mapped private.
+ */
+ if (addr >= HV_IOAPIC_BASE_ADDRESS &&
+ addr < (HV_IOAPIC_BASE_ADDRESS + PAGE_SIZE))
+ return true;
+
+ /* Same with a vTPM */
+ if (addr >= VTPM_BASE_ADDRESS &&
+ addr < (VTPM_BASE_ADDRESS + PAGE_SIZE))
+ return true;
+
+ return false;
+}
+
+void __init hv_vtom_init(void)
+{
+ enum hv_isolation_type type = hv_get_isolation_type();
+
+ switch (type) {
+ case HV_ISOLATION_TYPE_VBS:
+ fallthrough;
+ /*
+ * By design, a VM using vTOM doesn't see the SEV setting,
+ * so SEV initialization is bypassed and sev_status isn't set.
+ * Set it here to indicate a vTOM VM.
+ *
+ * Note: if CONFIG_AMD_MEM_ENCRYPT is not set, sev_status is
+	 * defined as 0ULL, to which we can't assign a value.
+ */
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ case HV_ISOLATION_TYPE_SNP:
+ sev_status = MSR_AMD64_SNP_VTOM;
+ cc_vendor = CC_VENDOR_AMD;
+ break;
+#endif
+
+ case HV_ISOLATION_TYPE_TDX:
+ cc_vendor = CC_VENDOR_INTEL;
+ break;
+
+ default:
+ panic("hv_vtom_init: unsupported isolation type %d\n", type);
+ }
+
+ cc_set_mask(ms_hyperv.shared_gpa_boundary);
+ physical_mask &= ms_hyperv.shared_gpa_boundary - 1;
+
+ x86_platform.hyper.is_private_mmio = hv_is_private_mmio;
+ x86_platform.guest.enc_cache_flush_required = hv_vtom_cache_flush_required;
+ x86_platform.guest.enc_tlb_flush_required = hv_vtom_tlb_flush_required;
+ x86_platform.guest.enc_status_change_prepare = hv_vtom_clear_present;
+ x86_platform.guest.enc_status_change_finish = hv_vtom_set_host_visibility;
+ x86_platform.guest.enc_kexec_begin = hv_vtom_kexec_begin;
+ x86_platform.guest.enc_kexec_finish = hv_vtom_kexec_finish;
+
+ /* Set WB as the default cache mode. */
+ guest_force_mtrr_state(NULL, 0, MTRR_TYPE_WRBACK);
+}
+
+#endif /* defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST) */
+
+enum hv_isolation_type hv_get_isolation_type(void)
+{
+ if (!(ms_hyperv.priv_high & HV_ISOLATION))
+ return HV_ISOLATION_TYPE_NONE;
+ return FIELD_GET(HV_ISOLATION_TYPE, ms_hyperv.isolation_config_b);
+}
+EXPORT_SYMBOL_GPL(hv_get_isolation_type);
+
+/*
+ * hv_is_isolation_supported - Check whether the system runs in a Hyper-V
+ * isolation VM.
+ */
+bool hv_is_isolation_supported(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR))
+ return false;
+
+ if (!hypervisor_is_type(X86_HYPER_MS_HYPERV))
+ return false;
+
+ return hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE;
+}
+
+DEFINE_STATIC_KEY_FALSE(isolation_type_snp);
+
+/*
+ * hv_isolation_type_snp - Check if the system runs in an AMD SEV-SNP based
+ * isolation VM.
+ */
+bool hv_isolation_type_snp(void)
+{
+ return static_branch_unlikely(&isolation_type_snp);
+}
+
+DEFINE_STATIC_KEY_FALSE(isolation_type_tdx);
+/*
+ * hv_isolation_type_tdx - Check if the system runs in an Intel TDX based
+ * isolated VM.
+ */
+bool hv_isolation_type_tdx(void)
+{
+ return static_branch_unlikely(&isolation_type_tdx);
+}
diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
new file mode 100644
index 000000000000..cfcb60468b01
--- /dev/null
+++ b/arch/x86/hyperv/mmu.c
@@ -0,0 +1,246 @@
+#define pr_fmt(fmt) "Hyper-V: " fmt
+
+#include <linux/log2.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+#include <asm/fpu/api.h>
+#include <asm/mshyperv.h>
+#include <asm/msr.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+
+#define CREATE_TRACE_POINTS
+#include <asm/trace/hyperv.h>
+
+/* Each gva in gva_list encodes up to 4096 pages to flush */
+#define HV_TLB_FLUSH_UNIT (4096 * PAGE_SIZE)
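+/*
+ * For example, a gva_list entry of (addr & PAGE_MASK) | 0xfff asks the
+ * hypervisor to flush 'addr' plus the 4095 pages that follow it, i.e. one
+ * full HV_TLB_FLUSH_UNIT.
+ */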
+
+static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
+ const struct flush_tlb_info *info);
+
+/*
+ * Fills in gva_list starting from offset. Returns the number of items added.
+ */
+static inline int fill_gva_list(u64 gva_list[], int offset,
+ unsigned long start, unsigned long end)
+{
+ int gva_n = offset;
+ unsigned long cur = start, diff;
+
+ do {
+ diff = end > cur ? end - cur : 0;
+
+ gva_list[gva_n] = cur & PAGE_MASK;
+ /*
+ * Lower 12 bits encode the number of additional
+ * pages to flush (in addition to the 'cur' page).
+ */
+ if (diff >= HV_TLB_FLUSH_UNIT) {
+ gva_list[gva_n] |= ~PAGE_MASK;
+ cur += HV_TLB_FLUSH_UNIT;
+ } else if (diff) {
+ gva_list[gva_n] |= (diff - 1) >> PAGE_SHIFT;
+ cur = end;
+ }
+
+ gva_n++;
+
+ } while (cur < end);
+
+ return gva_n - offset;
+}
+
+static bool cpu_is_lazy(int cpu)
+{
+ return per_cpu(cpu_tlbstate_shared.is_lazy, cpu);
+}
+
+static void hyperv_flush_tlb_multi(const struct cpumask *cpus,
+ const struct flush_tlb_info *info)
+{
+ int cpu, vcpu, gva_n, max_gvas;
+ struct hv_tlb_flush *flush;
+ u64 status;
+ unsigned long flags;
+ bool do_lazy = !info->freed_tables;
+
+ trace_hyperv_mmu_flush_tlb_multi(cpus, info);
+
+ if (!hv_hypercall_pg)
+ goto do_native;
+
+ local_irq_save(flags);
+
+ flush = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ if (unlikely(!flush)) {
+ local_irq_restore(flags);
+ goto do_native;
+ }
+
+ if (info->mm) {
+ /*
+ * AddressSpace argument must match the CR3 with PCID bits
+ * stripped out.
+ */
+ flush->address_space = virt_to_phys(info->mm->pgd);
+ flush->address_space &= CR3_ADDR_MASK;
+ flush->flags = 0;
+ } else {
+ flush->address_space = 0;
+ flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES;
+ }
+
+ flush->processor_mask = 0;
+ if (cpumask_equal(cpus, cpu_present_mask)) {
+ flush->flags |= HV_FLUSH_ALL_PROCESSORS;
+ } else {
+ /*
+ * From the supplied CPU set we need to figure out if we can get
+ * away with cheaper HVCALL_FLUSH_VIRTUAL_ADDRESS_{LIST,SPACE}
+ * hypercalls. This is possible when the highest VP number in
+ * the set is < 64. As VP numbers are usually in ascending order
+ * and match Linux CPU ids, here is an optimization: we check
+ * the VP number for the highest bit in the supplied set first
+ * so we can quickly find out if using *_EX hypercalls is a
+ * must. We will also check all VP numbers when walking the
+ * supplied CPU set to remain correct in all cases.
+ */
+ cpu = cpumask_last(cpus);
+
+ if (cpu < nr_cpumask_bits && hv_cpu_number_to_vp_number(cpu) >= 64)
+ goto do_ex_hypercall;
+
+ for_each_cpu(cpu, cpus) {
+ if (do_lazy && cpu_is_lazy(cpu))
+ continue;
+ vcpu = hv_cpu_number_to_vp_number(cpu);
+ if (vcpu == VP_INVAL) {
+ local_irq_restore(flags);
+ goto do_native;
+ }
+
+ if (vcpu >= 64)
+ goto do_ex_hypercall;
+
+ __set_bit(vcpu, (unsigned long *)
+ &flush->processor_mask);
+ }
+
+ /* nothing to flush if 'processor_mask' ends up being empty */
+ if (!flush->processor_mask) {
+ local_irq_restore(flags);
+ return;
+ }
+ }
+
+ /*
+	 * We can flush at most max_gvas with one hypercall. Flush the
+ * whole address space if we were asked to do more.
+ */
+ max_gvas = (PAGE_SIZE - sizeof(*flush)) / sizeof(flush->gva_list[0]);
+
+ if (info->end == TLB_FLUSH_ALL) {
+ flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY;
+ status = hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE,
+ flush, NULL);
+ } else if (info->end &&
+ ((info->end - info->start)/HV_TLB_FLUSH_UNIT) > max_gvas) {
+ status = hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE,
+ flush, NULL);
+ } else {
+ gva_n = fill_gva_list(flush->gva_list, 0,
+ info->start, info->end);
+ status = hv_do_rep_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST,
+ gva_n, 0, flush, NULL);
+ }
+ goto check_status;
+
+do_ex_hypercall:
+ status = hyperv_flush_tlb_others_ex(cpus, info);
+
+check_status:
+ local_irq_restore(flags);
+
+ if (hv_result_success(status))
+ return;
+do_native:
+ native_flush_tlb_multi(cpus, info);
+}
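
For a rough feel of the cutoff above: max_gvas is simply how many 8-byte gva_list entries fit in the hypercall input page after the fixed hv_tlb_flush header, and each entry covers at most HV_TLB_FLUSH_UNIT bytes. The sketch below computes the bound assuming a 4 KiB page and an illustrative 24-byte header; the real header size comes from the hyperv headers, so the numbers are indicative only.

/* Back-of-the-envelope bound; the 24-byte header is an assumption. */
#include <stdio.h>

int main(void)
{
	unsigned long long page_size = 4096, hdr = 24, entry = 8;
	unsigned long long flush_unit = 4096 * page_size;	/* bytes covered per entry */
	unsigned long long max_gvas = (page_size - hdr) / entry;

	printf("max_gvas = %llu, largest range per LIST hypercall ~ %llu MiB\n",
	       max_gvas, (max_gvas * flush_unit) >> 20);
	return 0;
}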
+
+static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
+ const struct flush_tlb_info *info)
+{
+ int nr_bank = 0, max_gvas, gva_n;
+ struct hv_tlb_flush_ex *flush;
+ u64 status;
+
+ if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
+ return HV_STATUS_INVALID_PARAMETER;
+
+ flush = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ if (info->mm) {
+ /*
+ * AddressSpace argument must match the CR3 with PCID bits
+ * stripped out.
+ */
+ flush->address_space = virt_to_phys(info->mm->pgd);
+ flush->address_space &= CR3_ADDR_MASK;
+ flush->flags = 0;
+ } else {
+ flush->address_space = 0;
+ flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES;
+ }
+
+ flush->hv_vp_set.valid_bank_mask = 0;
+
+ flush->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K;
+ nr_bank = cpumask_to_vpset_skip(&flush->hv_vp_set, cpus,
+ info->freed_tables ? NULL : cpu_is_lazy);
+ if (nr_bank < 0)
+ return HV_STATUS_INVALID_PARAMETER;
+
+ /*
+ * We can flush at most max_gvas gva_list entries with one hypercall. Flush
+ * the whole address space if we were asked to do more.
+ *
+ * For these hypercalls, Hyper-V treats the valid_bank_mask field
+ * of flush->hv_vp_set as part of the fixed-size input header, so the
+ * variable input header size is equal to nr_bank.
+ */
+ max_gvas =
+ (PAGE_SIZE - sizeof(*flush) - nr_bank *
+ sizeof(flush->hv_vp_set.bank_contents[0])) /
+ sizeof(flush->gva_list[0]);
+
+ if (info->end == TLB_FLUSH_ALL) {
+ flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY;
+ status = hv_do_rep_hypercall(
+ HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX,
+ 0, nr_bank, flush, NULL);
+ } else if (info->end &&
+ ((info->end - info->start)/HV_TLB_FLUSH_UNIT) > max_gvas) {
+ status = hv_do_rep_hypercall(
+ HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX,
+ 0, nr_bank, flush, NULL);
+ } else {
+ gva_n = fill_gva_list(flush->gva_list, nr_bank,
+ info->start, info->end);
+ status = hv_do_rep_hypercall(
+ HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX,
+ gva_n, nr_bank, flush, NULL);
+ }
+
+ return status;
+}
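
The *_EX variants describe CPUs with a sparse VP set: valid_bank_mask says which groups of 64 VPs are present, and those nr_bank groups are carried as bank_contents entries that count toward the variable-size input header. The illustrative sketch below derives the mask and bank count for a handful of VP numbers; unlike the real format, which packs only the present banks back to back, it indexes banks directly for brevity.

/* Illustrative sparse-VP-set construction; simplified layout, not the real one. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int vps[] = { 1, 5, 70, 130 };		/* VP numbers to target */
	uint64_t valid_bank_mask = 0, banks[4] = { 0 };
	int nr_bank = 0;

	for (unsigned int i = 0; i < sizeof(vps) / sizeof(vps[0]); i++) {
		int bank = vps[i] / 64, bit = vps[i] % 64;

		if (!(valid_bank_mask & (1ULL << bank))) {
			valid_bank_mask |= 1ULL << bank;
			nr_bank++;
		}
		banks[bank] |= 1ULL << bit;	/* real format stores present banks contiguously */
	}
	printf("valid_bank_mask=%#llx nr_bank=%d\n",
	       (unsigned long long)valid_bank_mask, nr_bank);
	return 0;
}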
+
+void hyperv_setup_mmu_ops(void)
+{
+ if (!(ms_hyperv.hints & HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED))
+ return;
+
+ pr_info("Using hypercall for remote TLB flush\n");
+ pv_ops.mmu.flush_tlb_multi = hyperv_flush_tlb_multi;
+}
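
hyperv_setup_mmu_ops() is a plain function-pointer override: when the hypervisor advertises HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED, the paravirt flush_tlb_multi slot is repointed at the hypercall-based implementation, otherwise the native IPI path stays in place. A generic, illustrative sketch of that pattern (not the real pv_ops types):

/* Illustrative ops-table override; names and types invented for the example. */
#include <stdio.h>

struct ex_mmu_ops {
	void (*flush_tlb_multi)(void);
};

static void ex_native_flush(void)    { puts("native IPI-based flush"); }
static void ex_hypercall_flush(void) { puts("hypercall-based flush"); }

static struct ex_mmu_ops ex_ops = { .flush_tlb_multi = ex_native_flush };

int main(void)
{
	int remote_flush_recommended = 1;	/* stand-in for the ms_hyperv hint */

	if (remote_flush_recommended)
		ex_ops.flush_tlb_multi = ex_hypercall_flush;
	ex_ops.flush_tlb_multi();
	return 0;
}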
diff --git a/arch/x86/hyperv/mshv-asm-offsets.c b/arch/x86/hyperv/mshv-asm-offsets.c
new file mode 100644
index 000000000000..882c1db6df16
--- /dev/null
+++ b/arch/x86/hyperv/mshv-asm-offsets.c
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Generate definitions needed by assembly language modules.
+ * This code generates raw asm output which is post-processed to extract
+ * and format the required data.
+ *
+ * Copyright (c) 2025, Microsoft Corporation.
+ *
+ * Author:
+ * Naman Jain <namjain@microsoft.com>
+ */
+#define COMPILE_OFFSETS
+
+#include <linux/kbuild.h>
+#include <asm/mshyperv.h>
+
+static void __used common(void)
+{
+ if (IS_ENABLED(CONFIG_HYPERV_VTL_MODE)) {
+ OFFSET(MSHV_VTL_CPU_CONTEXT_rax, mshv_vtl_cpu_context, rax);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_rcx, mshv_vtl_cpu_context, rcx);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_rdx, mshv_vtl_cpu_context, rdx);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_rbx, mshv_vtl_cpu_context, rbx);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_rbp, mshv_vtl_cpu_context, rbp);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_rsi, mshv_vtl_cpu_context, rsi);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_rdi, mshv_vtl_cpu_context, rdi);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_r8, mshv_vtl_cpu_context, r8);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_r9, mshv_vtl_cpu_context, r9);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_r10, mshv_vtl_cpu_context, r10);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_r11, mshv_vtl_cpu_context, r11);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_r12, mshv_vtl_cpu_context, r12);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_r13, mshv_vtl_cpu_context, r13);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_r14, mshv_vtl_cpu_context, r14);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_r15, mshv_vtl_cpu_context, r15);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_cr2, mshv_vtl_cpu_context, cr2);
+ }
+}
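
The OFFSET() entries above reduce to compile-time offsetof() values; kbuild post-processes the generated assembly into #defines (mshv-asm-offsets.h) that mshv_vtl_asm.S uses as memory-operand displacements. A userspace-only sketch of the idea, using an assumed struct layout rather than the real mshv_vtl_cpu_context:

/* Illustrative offsetof() dump; the struct layout is an assumption. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct ex_cpu_context {
	uint64_t rax, rcx, rdx, rbx;
	/* ... remaining GPRs and cr2 in the real structure ... */
};

int main(void)
{
	printf("#define MSHV_VTL_CPU_CONTEXT_rax %zu\n",
	       offsetof(struct ex_cpu_context, rax));
	printf("#define MSHV_VTL_CPU_CONTEXT_rcx %zu\n",
	       offsetof(struct ex_cpu_context, rcx));
	return 0;
}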
diff --git a/arch/x86/hyperv/mshv_vtl_asm.S b/arch/x86/hyperv/mshv_vtl_asm.S
new file mode 100644
index 000000000000..f595eefad9ab
--- /dev/null
+++ b/arch/x86/hyperv/mshv_vtl_asm.S
@@ -0,0 +1,116 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Assembly level code for mshv_vtl VTL transition
+ *
+ * Copyright (c) 2025, Microsoft Corporation.
+ *
+ * Author:
+ * Naman Jain <namjain@microsoft.com>
+ */
+
+#include <linux/linkage.h>
+#include <linux/static_call_types.h>
+#include <asm/asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/frame.h>
+#include "mshv-asm-offsets.h"
+
+ .text
+ .section .noinstr.text, "ax"
+/*
+ * void __mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0)
+ *
+ * This function context-switches between different Virtual Trust Levels. It is
+ * marked 'noinstr' to keep instrumentation and debugging facilities out of it.
+ * NMIs aren't a problem because the NMI handler saves/restores CR2 specifically to guard
+ * against #PFs in NMI context clobbering the guest state.
+ */
+SYM_FUNC_START(__mshv_vtl_return_call)
+ /* Push callee save registers */
+ pushq %rbp
+ mov %rsp, %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rbx
+
+ /* register switch to VTL0 clobbers all registers except rax/rcx */
+ mov %_ASM_ARG1, %rax
+
+ /* grab rbx/rbp/rsi/rdi/r8-r15 */
+ mov MSHV_VTL_CPU_CONTEXT_rbx(%rax), %rbx
+ mov MSHV_VTL_CPU_CONTEXT_rbp(%rax), %rbp
+ mov MSHV_VTL_CPU_CONTEXT_rsi(%rax), %rsi
+ mov MSHV_VTL_CPU_CONTEXT_rdi(%rax), %rdi
+ mov MSHV_VTL_CPU_CONTEXT_r8(%rax), %r8
+ mov MSHV_VTL_CPU_CONTEXT_r9(%rax), %r9
+ mov MSHV_VTL_CPU_CONTEXT_r10(%rax), %r10
+ mov MSHV_VTL_CPU_CONTEXT_r11(%rax), %r11
+ mov MSHV_VTL_CPU_CONTEXT_r12(%rax), %r12
+ mov MSHV_VTL_CPU_CONTEXT_r13(%rax), %r13
+ mov MSHV_VTL_CPU_CONTEXT_r14(%rax), %r14
+ mov MSHV_VTL_CPU_CONTEXT_r15(%rax), %r15
+
+ mov MSHV_VTL_CPU_CONTEXT_cr2(%rax), %rdx
+ mov %rdx, %cr2
+ mov MSHV_VTL_CPU_CONTEXT_rdx(%rax), %rdx
+
+ /* stash host registers on stack */
+ pushq %rax
+ pushq %rcx
+
+ xor %ecx, %ecx
+
+ /* make a hypercall to switch VTL */
+ call STATIC_CALL_TRAMP_STR(__mshv_vtl_return_hypercall)
+
+ /* stash guest registers on stack, restore saved host copies */
+ pushq %rax
+ pushq %rcx
+ mov 16(%rsp), %rcx
+ mov 24(%rsp), %rax
+
+ mov %rdx, MSHV_VTL_CPU_CONTEXT_rdx(%rax)
+ mov %cr2, %rdx
+ mov %rdx, MSHV_VTL_CPU_CONTEXT_cr2(%rax)
+ pop MSHV_VTL_CPU_CONTEXT_rcx(%rax)
+ pop MSHV_VTL_CPU_CONTEXT_rax(%rax)
+ add $16, %rsp
+
+ /* save rbx/rbp/rsi/rdi/r8-r15 */
+ mov %rbx, MSHV_VTL_CPU_CONTEXT_rbx(%rax)
+ mov %rbp, MSHV_VTL_CPU_CONTEXT_rbp(%rax)
+ mov %rsi, MSHV_VTL_CPU_CONTEXT_rsi(%rax)
+ mov %rdi, MSHV_VTL_CPU_CONTEXT_rdi(%rax)
+ mov %r8, MSHV_VTL_CPU_CONTEXT_r8(%rax)
+ mov %r9, MSHV_VTL_CPU_CONTEXT_r9(%rax)
+ mov %r10, MSHV_VTL_CPU_CONTEXT_r10(%rax)
+ mov %r11, MSHV_VTL_CPU_CONTEXT_r11(%rax)
+ mov %r12, MSHV_VTL_CPU_CONTEXT_r12(%rax)
+ mov %r13, MSHV_VTL_CPU_CONTEXT_r13(%rax)
+ mov %r14, MSHV_VTL_CPU_CONTEXT_r14(%rax)
+ mov %r15, MSHV_VTL_CPU_CONTEXT_r15(%rax)
+
+ /* pop callee-save registers r12-r15, rbx */
+ pop %rbx
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+
+ pop %rbp
+ RET
+SYM_FUNC_END(__mshv_vtl_return_call)
+/*
+ * Make sure the static_call_key symbol __SCK____mshv_vtl_return_hypercall is accessible here.
+ * The code below is modelled on the __ADDRESSABLE(sym) macro. The symbol name is kept simple
+ * to avoid a generated name like "__UNIQUE_ID_addressable___SCK____mshv_vtl_return_hypercall_662.0",
+ * which the macro would otherwise produce.
+ */
+ .section .discard.addressable,"aw"
+ .align 8
+ .type mshv_vtl_return_sym, @object
+ .size mshv_vtl_return_sym, 8
+mshv_vtl_return_sym:
+ .quad __SCK____mshv_vtl_return_hypercall
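
The trailing .discard.addressable stanza is a hand-rolled equivalent of the __ADDRESSABLE() trick: emit a pointer to the symbol in a discarded section so the toolchain keeps a reference to the static_call key even though its only user is the assembly trampoline call above. A minimal C sketch of the same pattern, using a stand-in symbol rather than the real key:

/* Minimal sketch of the __ADDRESSABLE-style reference; the symbol is a stand-in. */
void ex_symbol_we_must_reference(void) { }	/* stands in for the static_call key */

static void *ex_addressable_ref
	__attribute__((used, section(".discard.addressable"))) =
	(void *)&ex_symbol_we_must_reference;

int main(void) { return 0; }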
diff --git a/arch/x86/hyperv/nested.c b/arch/x86/hyperv/nested.c
new file mode 100644
index 000000000000..8ccbb7c4fc27
--- /dev/null
+++ b/arch/x86/hyperv/nested.c
@@ -0,0 +1,130 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Hyper-V nested virtualization code.
+ *
+ * Copyright (C) 2018, Microsoft, Inc.
+ *
+ * Author : Lan Tianyu <Tianyu.Lan@microsoft.com>
+ */
+#define pr_fmt(fmt) "Hyper-V: " fmt
+
+
+#include <linux/types.h>
+#include <linux/export.h>
+#include <hyperv/hvhdk.h>
+#include <asm/mshyperv.h>
+#include <asm/tlbflush.h>
+
+#include <asm/trace/hyperv.h>
+
+int hyperv_flush_guest_mapping(u64 as)
+{
+ struct hv_guest_mapping_flush *flush;
+ u64 status;
+ unsigned long flags;
+ int ret = -ENOTSUPP;
+
+ if (!hv_hypercall_pg)
+ goto fault;
+
+ local_irq_save(flags);
+
+ flush = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ if (unlikely(!flush)) {
+ local_irq_restore(flags);
+ goto fault;
+ }
+
+ flush->address_space = as;
+ flush->flags = 0;
+
+ status = hv_do_hypercall(HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE,
+ flush, NULL);
+ local_irq_restore(flags);
+
+ if (hv_result_success(status))
+ ret = 0;
+
+fault:
+ trace_hyperv_nested_flush_guest_mapping(as, ret);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(hyperv_flush_guest_mapping);
+
+int hyperv_fill_flush_guest_mapping_list(
+ struct hv_guest_mapping_flush_list *flush,
+ u64 start_gfn, u64 pages)
+{
+ u64 cur = start_gfn;
+ u64 additional_pages;
+ int gpa_n = 0;
+
+ do {
+ /*
+ * If the flush requests exceed the maximum rep count, fall back
+ * to flushing the TLB without a range.
+ */
+ if (gpa_n >= HV_MAX_FLUSH_REP_COUNT)
+ return -ENOSPC;
+
+ additional_pages = min_t(u64, pages, HV_MAX_FLUSH_PAGES) - 1;
+
+ flush->gpa_list[gpa_n].page.additional_pages = additional_pages;
+ flush->gpa_list[gpa_n].page.largepage = false;
+ flush->gpa_list[gpa_n].page.basepfn = cur;
+
+ pages -= additional_pages + 1;
+ cur += additional_pages + 1;
+ gpa_n++;
+ } while (pages > 0);
+
+ return gpa_n;
+}
+EXPORT_SYMBOL_GPL(hyperv_fill_flush_guest_mapping_list);
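
hyperv_fill_flush_guest_mapping_list() slices a GFN range into gpa_list entries, each covering up to HV_MAX_FLUSH_PAGES pages, and reports -ENOSPC once HV_MAX_FLUSH_REP_COUNT entries would be exceeded so the caller can fall back to a range-less flush. The illustrative sketch below walks a 5000-page range with assumed limits (2048 pages per entry, 510 entries per hypercall); the real constants live in the hyperv headers.

/* Illustrative range-to-entries breakdown; the limits are assumptions. */
#include <stdint.h>
#include <stdio.h>

#define EX_MAX_FLUSH_PAGES	2048ULL
#define EX_MAX_FLUSH_REP_COUNT	510

int main(void)
{
	uint64_t cur = 0x1000, pages = 5000, additional;
	int gpa_n = 0;

	do {
		if (gpa_n >= EX_MAX_FLUSH_REP_COUNT)
			return 1;	/* caller would fall back to a full flush */

		additional = (pages < EX_MAX_FLUSH_PAGES ? pages : EX_MAX_FLUSH_PAGES) - 1;
		printf("entry %d: basepfn=%#llx additional_pages=%llu\n",
		       gpa_n, (unsigned long long)cur, (unsigned long long)additional);
		pages -= additional + 1;
		cur += additional + 1;
		gpa_n++;
	} while (pages > 0);

	printf("%d entries\n", gpa_n);
	return 0;
}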
+
+int hyperv_flush_guest_mapping_range(u64 as,
+ hyperv_fill_flush_list_func fill_flush_list_func, void *data)
+{
+ struct hv_guest_mapping_flush_list *flush;
+ u64 status;
+ unsigned long flags;
+ int ret = -ENOTSUPP;
+ int gpa_n = 0;
+
+ if (!hv_hypercall_pg || !fill_flush_list_func)
+ goto fault;
+
+ local_irq_save(flags);
+
+ flush = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ if (unlikely(!flush)) {
+ local_irq_restore(flags);
+ goto fault;
+ }
+
+ flush->address_space = as;
+ flush->flags = 0;
+
+ gpa_n = fill_flush_list_func(flush, data);
+ if (gpa_n < 0) {
+ local_irq_restore(flags);
+ goto fault;
+ }
+
+ status = hv_do_rep_hypercall(HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST,
+ gpa_n, 0, flush, NULL);
+
+ local_irq_restore(flags);
+
+ if (hv_result_success(status))
+ ret = 0;
+ else
+ ret = hv_result(status);
+fault:
+ trace_hyperv_nested_flush_guest_mapping_range(as, ret);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(hyperv_flush_guest_mapping_range);
diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile
index 52d0ccfcf6ea..333556a86b2a 100644
--- a/arch/x86/ia32/Makefile
+++ b/arch/x86/ia32/Makefile
@@ -1,13 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Makefile for the ia32 kernel emulation subsystem.
#
-obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o
-
-sysv-$(CONFIG_SYSVIPC) := ipc32.o
-obj-$(CONFIG_IA32_EMULATION) += $(sysv-y)
-
-obj-$(CONFIG_IA32_AOUT) += ia32_aout.o
-
audit-class-$(CONFIG_AUDIT) := audit.o
obj-$(CONFIG_IA32_EMULATION) += $(audit-class-y)
diff --git a/arch/x86/ia32/audit.c b/arch/x86/ia32/audit.c
index 5d7b381da692..59e19549e759 100644
--- a/arch/x86/ia32/audit.c
+++ b/arch/x86/ia32/audit.c
@@ -1,4 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/audit_arch.h>
#include <asm/unistd_32.h>
+#include <asm/audit.h>
unsigned ia32_dir_class[] = {
#include <asm-generic/audit_dir_write.h>
@@ -29,14 +32,17 @@ int ia32_classify_syscall(unsigned syscall)
{
switch (syscall) {
case __NR_open:
- return 2;
+ return AUDITSC_OPEN;
case __NR_openat:
- return 3;
+ return AUDITSC_OPENAT;
case __NR_socketcall:
- return 4;
+ return AUDITSC_SOCKETCALL;
case __NR_execve:
- return 5;
+ case __NR_execveat:
+ return AUDITSC_EXECVE;
+ case __NR_openat2:
+ return AUDITSC_OPENAT2;
default:
- return 1;
+ return AUDITSC_COMPAT;
}
}
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
deleted file mode 100644
index 2a4d073d2cf1..000000000000
--- a/arch/x86/ia32/ia32_aout.c
+++ /dev/null
@@ -1,544 +0,0 @@
-/*
- * a.out loader for x86-64
- *
- * Copyright (C) 1991, 1992, 1996 Linus Torvalds
- * Hacked together by Andi Kleen
- */
-
-#include <linux/module.h>
-
-#include <linux/time.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/mman.h>
-#include <linux/a.out.h>
-#include <linux/errno.h>
-#include <linux/signal.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/stat.h>
-#include <linux/fcntl.h>
-#include <linux/ptrace.h>
-#include <linux/user.h>
-#include <linux/slab.h>
-#include <linux/binfmts.h>
-#include <linux/personality.h>
-#include <linux/init.h>
-#include <linux/jiffies.h>
-
-#include <asm/system.h>
-#include <asm/uaccess.h>
-#include <asm/pgalloc.h>
-#include <asm/cacheflush.h>
-#include <asm/user32.h>
-#include <asm/ia32.h>
-
-#undef WARN_OLD
-#undef CORE_DUMP /* probably broken */
-
-static int load_aout_binary(struct linux_binprm *, struct pt_regs *regs);
-static int load_aout_library(struct file *);
-
-#ifdef CORE_DUMP
-static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file,
- unsigned long limit);
-
-/*
- * fill in the user structure for a core dump..
- */
-static void dump_thread32(struct pt_regs *regs, struct user32 *dump)
-{
- u32 fs, gs;
-
-/* changed the size calculations - should hopefully work better. lbt */
- dump->magic = CMAGIC;
- dump->start_code = 0;
- dump->start_stack = regs->sp & ~(PAGE_SIZE - 1);
- dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
- dump->u_dsize = ((unsigned long)
- (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
- dump->u_dsize -= dump->u_tsize;
- dump->u_ssize = 0;
- dump->u_debugreg[0] = current->thread.debugreg0;
- dump->u_debugreg[1] = current->thread.debugreg1;
- dump->u_debugreg[2] = current->thread.debugreg2;
- dump->u_debugreg[3] = current->thread.debugreg3;
- dump->u_debugreg[4] = 0;
- dump->u_debugreg[5] = 0;
- dump->u_debugreg[6] = current->thread.debugreg6;
- dump->u_debugreg[7] = current->thread.debugreg7;
-
- if (dump->start_stack < 0xc0000000) {
- unsigned long tmp;
-
- tmp = (unsigned long) (0xc0000000 - dump->start_stack);
- dump->u_ssize = tmp >> PAGE_SHIFT;
- }
-
- dump->regs.bx = regs->bx;
- dump->regs.cx = regs->cx;
- dump->regs.dx = regs->dx;
- dump->regs.si = regs->si;
- dump->regs.di = regs->di;
- dump->regs.bp = regs->bp;
- dump->regs.ax = regs->ax;
- dump->regs.ds = current->thread.ds;
- dump->regs.es = current->thread.es;
- savesegment(fs, fs);
- dump->regs.fs = fs;
- savesegment(gs, gs);
- dump->regs.gs = gs;
- dump->regs.orig_ax = regs->orig_ax;
- dump->regs.ip = regs->ip;
- dump->regs.cs = regs->cs;
- dump->regs.flags = regs->flags;
- dump->regs.sp = regs->sp;
- dump->regs.ss = regs->ss;
-
-#if 1 /* FIXME */
- dump->u_fpvalid = 0;
-#else
- dump->u_fpvalid = dump_fpu(regs, &dump->i387);
-#endif
-}
-
-#endif
-
-static struct linux_binfmt aout_format = {
- .module = THIS_MODULE,
- .load_binary = load_aout_binary,
- .load_shlib = load_aout_library,
-#ifdef CORE_DUMP
- .core_dump = aout_core_dump,
-#endif
- .min_coredump = PAGE_SIZE
-};
-
-static void set_brk(unsigned long start, unsigned long end)
-{
- start = PAGE_ALIGN(start);
- end = PAGE_ALIGN(end);
- if (end <= start)
- return;
- down_write(&current->mm->mmap_sem);
- do_brk(start, end - start);
- up_write(&current->mm->mmap_sem);
-}
-
-#ifdef CORE_DUMP
-/*
- * These are the only things you should do on a core-file: use only these
- * macros to write out all the necessary info.
- */
-
-static int dump_write(struct file *file, const void *addr, int nr)
-{
- return file->f_op->write(file, addr, nr, &file->f_pos) == nr;
-}
-
-#define DUMP_WRITE(addr, nr) \
- if (!dump_write(file, (void *)(addr), (nr))) \
- goto end_coredump;
-
-#define DUMP_SEEK(offset) \
- if (file->f_op->llseek) { \
- if (file->f_op->llseek(file, (offset), 0) != (offset)) \
- goto end_coredump; \
- } else \
- file->f_pos = (offset)
-
-#define START_DATA() (u.u_tsize << PAGE_SHIFT)
-#define START_STACK(u) (u.start_stack)
-
-/*
- * Routine writes a core dump image in the current directory.
- * Currently only a stub-function.
- *
- * Note that setuid/setgid files won't make a core-dump if the uid/gid
- * changed due to the set[u|g]id. It's enforced by the "current->mm->dumpable"
- * field, which also makes sure the core-dumps won't be recursive if the
- * dumping of the process results in another error..
- */
-
-static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file,
- unsigned long limit)
-{
- mm_segment_t fs;
- int has_dumped = 0;
- unsigned long dump_start, dump_size;
- struct user32 dump;
-
- fs = get_fs();
- set_fs(KERNEL_DS);
- has_dumped = 1;
- current->flags |= PF_DUMPCORE;
- strncpy(dump.u_comm, current->comm, sizeof(current->comm));
- dump.u_ar0 = offsetof(struct user32, regs);
- dump.signal = signr;
- dump_thread32(regs, &dump);
-
- /*
- * If the size of the dump file exceeds the rlimit, then see
- * what would happen if we wrote the stack, but not the data
- * area.
- */
- if ((dump.u_dsize + dump.u_ssize + 1) * PAGE_SIZE > limit)
- dump.u_dsize = 0;
-
- /* Make sure we have enough room to write the stack and data areas. */
- if ((dump.u_ssize + 1) * PAGE_SIZE > limit)
- dump.u_ssize = 0;
-
- /* make sure we actually have a data and stack area to dump */
- set_fs(USER_DS);
- if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_DATA(dump),
- dump.u_dsize << PAGE_SHIFT))
- dump.u_dsize = 0;
- if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_STACK(dump),
- dump.u_ssize << PAGE_SHIFT))
- dump.u_ssize = 0;
-
- set_fs(KERNEL_DS);
- /* struct user */
- DUMP_WRITE(&dump, sizeof(dump));
- /* Now dump all of the user data. Include malloced stuff as well */
- DUMP_SEEK(PAGE_SIZE);
- /* now we start writing out the user space info */
- set_fs(USER_DS);
- /* Dump the data area */
- if (dump.u_dsize != 0) {
- dump_start = START_DATA(dump);
- dump_size = dump.u_dsize << PAGE_SHIFT;
- DUMP_WRITE(dump_start, dump_size);
- }
- /* Now prepare to dump the stack area */
- if (dump.u_ssize != 0) {
- dump_start = START_STACK(dump);
- dump_size = dump.u_ssize << PAGE_SHIFT;
- DUMP_WRITE(dump_start, dump_size);
- }
- /*
- * Finally dump the task struct. Not be used by gdb, but
- * could be useful
- */
- set_fs(KERNEL_DS);
- DUMP_WRITE(current, sizeof(*current));
-end_coredump:
- set_fs(fs);
- return has_dumped;
-}
-#endif
-
-/*
- * create_aout_tables() parses the env- and arg-strings in new user
- * memory and creates the pointer tables from them, and puts their
- * addresses on the "stack", returning the new stack pointer value.
- */
-static u32 __user *create_aout_tables(char __user *p, struct linux_binprm *bprm)
-{
- u32 __user *argv, *envp, *sp;
- int argc = bprm->argc, envc = bprm->envc;
-
- sp = (u32 __user *) ((-(unsigned long)sizeof(u32)) & (unsigned long) p);
- sp -= envc+1;
- envp = sp;
- sp -= argc+1;
- argv = sp;
- put_user((unsigned long) envp, --sp);
- put_user((unsigned long) argv, --sp);
- put_user(argc, --sp);
- current->mm->arg_start = (unsigned long) p;
- while (argc-- > 0) {
- char c;
-
- put_user((u32)(unsigned long)p, argv++);
- do {
- get_user(c, p++);
- } while (c);
- }
- put_user(0, argv);
- current->mm->arg_end = current->mm->env_start = (unsigned long) p;
- while (envc-- > 0) {
- char c;
-
- put_user((u32)(unsigned long)p, envp++);
- do {
- get_user(c, p++);
- } while (c);
- }
- put_user(0, envp);
- current->mm->env_end = (unsigned long) p;
- return sp;
-}
-
-/*
- * These are the functions used to load a.out style executables and shared
- * libraries. There is no binary dependent code anywhere else.
- */
-static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs)
-{
- unsigned long error, fd_offset, rlim;
- struct exec ex;
- int retval;
-
- ex = *((struct exec *) bprm->buf); /* exec-header */
- if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC &&
- N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) ||
- N_TRSIZE(ex) || N_DRSIZE(ex) ||
- i_size_read(bprm->file->f_path.dentry->d_inode) <
- ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
- return -ENOEXEC;
- }
-
- fd_offset = N_TXTOFF(ex);
-
- /* Check initial limits. This avoids letting people circumvent
- * size limits imposed on them by creating programs with large
- * arrays in the data or bss.
- */
- rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur;
- if (rlim >= RLIM_INFINITY)
- rlim = ~0;
- if (ex.a_data + ex.a_bss > rlim)
- return -ENOMEM;
-
- /* Flush all traces of the currently running executable */
- retval = flush_old_exec(bprm);
- if (retval)
- return retval;
-
- regs->cs = __USER32_CS;
- regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 =
- regs->r13 = regs->r14 = regs->r15 = 0;
-
- /* OK, This is the point of no return */
- set_personality(PER_LINUX);
- set_thread_flag(TIF_IA32);
- clear_thread_flag(TIF_ABI_PENDING);
-
- current->mm->end_code = ex.a_text +
- (current->mm->start_code = N_TXTADDR(ex));
- current->mm->end_data = ex.a_data +
- (current->mm->start_data = N_DATADDR(ex));
- current->mm->brk = ex.a_bss +
- (current->mm->start_brk = N_BSSADDR(ex));
- current->mm->free_area_cache = TASK_UNMAPPED_BASE;
- current->mm->cached_hole_size = 0;
-
- current->mm->mmap = NULL;
- install_exec_creds(bprm);
- current->flags &= ~PF_FORKNOEXEC;
-
- if (N_MAGIC(ex) == OMAGIC) {
- unsigned long text_addr, map_size;
- loff_t pos;
-
- text_addr = N_TXTADDR(ex);
-
- pos = 32;
- map_size = ex.a_text+ex.a_data;
-
- down_write(&current->mm->mmap_sem);
- error = do_brk(text_addr & PAGE_MASK, map_size);
- up_write(&current->mm->mmap_sem);
-
- if (error != (text_addr & PAGE_MASK)) {
- send_sig(SIGKILL, current, 0);
- return error;
- }
-
- error = bprm->file->f_op->read(bprm->file,
- (char __user *)text_addr,
- ex.a_text+ex.a_data, &pos);
- if ((signed long)error < 0) {
- send_sig(SIGKILL, current, 0);
- return error;
- }
-
- flush_icache_range(text_addr, text_addr+ex.a_text+ex.a_data);
- } else {
-#ifdef WARN_OLD
- static unsigned long error_time, error_time2;
- if ((ex.a_text & 0xfff || ex.a_data & 0xfff) &&
- (N_MAGIC(ex) != NMAGIC) &&
- time_after(jiffies, error_time2 + 5*HZ)) {
- printk(KERN_NOTICE "executable not page aligned\n");
- error_time2 = jiffies;
- }
-
- if ((fd_offset & ~PAGE_MASK) != 0 &&
- time_after(jiffies, error_time + 5*HZ)) {
- printk(KERN_WARNING
- "fd_offset is not page aligned. Please convert "
- "program: %s\n",
- bprm->file->f_path.dentry->d_name.name);
- error_time = jiffies;
- }
-#endif
-
- if (!bprm->file->f_op->mmap || (fd_offset & ~PAGE_MASK) != 0) {
- loff_t pos = fd_offset;
-
- down_write(&current->mm->mmap_sem);
- do_brk(N_TXTADDR(ex), ex.a_text+ex.a_data);
- up_write(&current->mm->mmap_sem);
- bprm->file->f_op->read(bprm->file,
- (char __user *)N_TXTADDR(ex),
- ex.a_text+ex.a_data, &pos);
- flush_icache_range((unsigned long) N_TXTADDR(ex),
- (unsigned long) N_TXTADDR(ex) +
- ex.a_text+ex.a_data);
- goto beyond_if;
- }
-
- down_write(&current->mm->mmap_sem);
- error = do_mmap(bprm->file, N_TXTADDR(ex), ex.a_text,
- PROT_READ | PROT_EXEC,
- MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE |
- MAP_EXECUTABLE | MAP_32BIT,
- fd_offset);
- up_write(&current->mm->mmap_sem);
-
- if (error != N_TXTADDR(ex)) {
- send_sig(SIGKILL, current, 0);
- return error;
- }
-
- down_write(&current->mm->mmap_sem);
- error = do_mmap(bprm->file, N_DATADDR(ex), ex.a_data,
- PROT_READ | PROT_WRITE | PROT_EXEC,
- MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE |
- MAP_EXECUTABLE | MAP_32BIT,
- fd_offset + ex.a_text);
- up_write(&current->mm->mmap_sem);
- if (error != N_DATADDR(ex)) {
- send_sig(SIGKILL, current, 0);
- return error;
- }
- }
-beyond_if:
- set_binfmt(&aout_format);
-
- set_brk(current->mm->start_brk, current->mm->brk);
-
- retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT);
- if (retval < 0) {
- /* Someone check-me: is this error path enough? */
- send_sig(SIGKILL, current, 0);
- return retval;
- }
-
- current->mm->start_stack =
- (unsigned long)create_aout_tables((char __user *)bprm->p, bprm);
- /* start thread */
- loadsegment(fs, 0);
- loadsegment(ds, __USER32_DS);
- loadsegment(es, __USER32_DS);
- load_gs_index(0);
- (regs)->ip = ex.a_entry;
- (regs)->sp = current->mm->start_stack;
- (regs)->flags = 0x200;
- (regs)->cs = __USER32_CS;
- (regs)->ss = __USER32_DS;
- regs->r8 = regs->r9 = regs->r10 = regs->r11 =
- regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;
- set_fs(USER_DS);
- return 0;
-}
-
-static int load_aout_library(struct file *file)
-{
- struct inode *inode;
- unsigned long bss, start_addr, len, error;
- int retval;
- struct exec ex;
-
- inode = file->f_path.dentry->d_inode;
-
- retval = -ENOEXEC;
- error = kernel_read(file, 0, (char *) &ex, sizeof(ex));
- if (error != sizeof(ex))
- goto out;
-
- /* We come in here for the regular a.out style of shared libraries */
- if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || N_TRSIZE(ex) ||
- N_DRSIZE(ex) || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) ||
- i_size_read(inode) <
- ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
- goto out;
- }
-
- if (N_FLAGS(ex))
- goto out;
-
- /* For QMAGIC, the starting address is 0x20 into the page. We mask
- this off to get the starting address for the page */
-
- start_addr = ex.a_entry & 0xfffff000;
-
- if ((N_TXTOFF(ex) & ~PAGE_MASK) != 0) {
- loff_t pos = N_TXTOFF(ex);
-
-#ifdef WARN_OLD
- static unsigned long error_time;
- if (time_after(jiffies, error_time + 5*HZ)) {
- printk(KERN_WARNING
- "N_TXTOFF is not page aligned. Please convert "
- "library: %s\n",
- file->f_path.dentry->d_name.name);
- error_time = jiffies;
- }
-#endif
- down_write(&current->mm->mmap_sem);
- do_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss);
- up_write(&current->mm->mmap_sem);
-
- file->f_op->read(file, (char __user *)start_addr,
- ex.a_text + ex.a_data, &pos);
- flush_icache_range((unsigned long) start_addr,
- (unsigned long) start_addr + ex.a_text +
- ex.a_data);
-
- retval = 0;
- goto out;
- }
- /* Now use mmap to map the library into memory. */
- down_write(&current->mm->mmap_sem);
- error = do_mmap(file, start_addr, ex.a_text + ex.a_data,
- PROT_READ | PROT_WRITE | PROT_EXEC,
- MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_32BIT,
- N_TXTOFF(ex));
- up_write(&current->mm->mmap_sem);
- retval = error;
- if (error != start_addr)
- goto out;
-
- len = PAGE_ALIGN(ex.a_text + ex.a_data);
- bss = ex.a_text + ex.a_data + ex.a_bss;
- if (bss > len) {
- down_write(&current->mm->mmap_sem);
- error = do_brk(start_addr + len, bss - len);
- up_write(&current->mm->mmap_sem);
- retval = error;
- if (error != start_addr + len)
- goto out;
- }
- retval = 0;
-out:
- return retval;
-}
-
-static int __init init_aout_binfmt(void)
-{
- return register_binfmt(&aout_format);
-}
-
-static void __exit exit_aout_binfmt(void)
-{
- unregister_binfmt(&aout_format);
-}
-
-module_init(init_aout_binfmt);
-module_exit(exit_aout_binfmt);
-MODULE_LICENSE("GPL");
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
deleted file mode 100644
index 588a7aa937e1..000000000000
--- a/arch/x86/ia32/ia32_signal.c
+++ /dev/null
@@ -1,567 +0,0 @@
-/*
- * linux/arch/x86_64/ia32/ia32_signal.c
- *
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
- * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
- * 2000-12-* x86-64 compatibility mode signal handling by Andi Kleen
- */
-
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/kernel.h>
-#include <linux/signal.h>
-#include <linux/errno.h>
-#include <linux/wait.h>
-#include <linux/ptrace.h>
-#include <linux/unistd.h>
-#include <linux/stddef.h>
-#include <linux/personality.h>
-#include <linux/compat.h>
-#include <linux/binfmts.h>
-#include <asm/ucontext.h>
-#include <asm/uaccess.h>
-#include <asm/i387.h>
-#include <asm/ptrace.h>
-#include <asm/ia32_unistd.h>
-#include <asm/user32.h>
-#include <asm/sigcontext32.h>
-#include <asm/proto.h>
-#include <asm/vdso.h>
-#include <asm/sigframe.h>
-#include <asm/sys_ia32.h>
-
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
-#define FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \
- X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \
- X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \
- X86_EFLAGS_CF)
-
-void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
-
-int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
-{
- int err = 0;
-
- if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t)))
- return -EFAULT;
-
- put_user_try {
- /* If you change siginfo_t structure, please make sure that
- this code is fixed accordingly.
- It should never copy any pad contained in the structure
- to avoid security leaks, but must copy the generic
- 3 ints plus the relevant union member. */
- put_user_ex(from->si_signo, &to->si_signo);
- put_user_ex(from->si_errno, &to->si_errno);
- put_user_ex((short)from->si_code, &to->si_code);
-
- if (from->si_code < 0) {
- put_user_ex(from->si_pid, &to->si_pid);
- put_user_ex(from->si_uid, &to->si_uid);
- put_user_ex(ptr_to_compat(from->si_ptr), &to->si_ptr);
- } else {
- /*
- * First 32bits of unions are always present:
- * si_pid === si_band === si_tid === si_addr(LS half)
- */
- put_user_ex(from->_sifields._pad[0],
- &to->_sifields._pad[0]);
- switch (from->si_code >> 16) {
- case __SI_FAULT >> 16:
- break;
- case __SI_CHLD >> 16:
- put_user_ex(from->si_utime, &to->si_utime);
- put_user_ex(from->si_stime, &to->si_stime);
- put_user_ex(from->si_status, &to->si_status);
- /* FALL THROUGH */
- default:
- case __SI_KILL >> 16:
- put_user_ex(from->si_uid, &to->si_uid);
- break;
- case __SI_POLL >> 16:
- put_user_ex(from->si_fd, &to->si_fd);
- break;
- case __SI_TIMER >> 16:
- put_user_ex(from->si_overrun, &to->si_overrun);
- put_user_ex(ptr_to_compat(from->si_ptr),
- &to->si_ptr);
- break;
- /* This is not generated by the kernel as of now. */
- case __SI_RT >> 16:
- case __SI_MESGQ >> 16:
- put_user_ex(from->si_uid, &to->si_uid);
- put_user_ex(from->si_int, &to->si_int);
- break;
- }
- }
- } put_user_catch(err);
-
- return err;
-}
-
-int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
-{
- int err = 0;
- u32 ptr32;
-
- if (!access_ok(VERIFY_READ, from, sizeof(compat_siginfo_t)))
- return -EFAULT;
-
- get_user_try {
- get_user_ex(to->si_signo, &from->si_signo);
- get_user_ex(to->si_errno, &from->si_errno);
- get_user_ex(to->si_code, &from->si_code);
-
- get_user_ex(to->si_pid, &from->si_pid);
- get_user_ex(to->si_uid, &from->si_uid);
- get_user_ex(ptr32, &from->si_ptr);
- to->si_ptr = compat_ptr(ptr32);
- } get_user_catch(err);
-
- return err;
-}
-
-asmlinkage long sys32_sigsuspend(int history0, int history1, old_sigset_t mask)
-{
- mask &= _BLOCKABLE;
- spin_lock_irq(&current->sighand->siglock);
- current->saved_sigmask = current->blocked;
- siginitset(&current->blocked, mask);
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
-
- current->state = TASK_INTERRUPTIBLE;
- schedule();
- set_restore_sigmask();
- return -ERESTARTNOHAND;
-}
-
-asmlinkage long sys32_sigaltstack(const stack_ia32_t __user *uss_ptr,
- stack_ia32_t __user *uoss_ptr,
- struct pt_regs *regs)
-{
- stack_t uss, uoss;
- int ret, err = 0;
- mm_segment_t seg;
-
- if (uss_ptr) {
- u32 ptr;
-
- memset(&uss, 0, sizeof(stack_t));
- if (!access_ok(VERIFY_READ, uss_ptr, sizeof(stack_ia32_t)))
- return -EFAULT;
-
- get_user_try {
- get_user_ex(ptr, &uss_ptr->ss_sp);
- get_user_ex(uss.ss_flags, &uss_ptr->ss_flags);
- get_user_ex(uss.ss_size, &uss_ptr->ss_size);
- } get_user_catch(err);
-
- if (err)
- return -EFAULT;
- uss.ss_sp = compat_ptr(ptr);
- }
- seg = get_fs();
- set_fs(KERNEL_DS);
- ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss, regs->sp);
- set_fs(seg);
- if (ret >= 0 && uoss_ptr) {
- if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(stack_ia32_t)))
- return -EFAULT;
-
- put_user_try {
- put_user_ex(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp);
- put_user_ex(uoss.ss_flags, &uoss_ptr->ss_flags);
- put_user_ex(uoss.ss_size, &uoss_ptr->ss_size);
- } put_user_catch(err);
-
- if (err)
- ret = -EFAULT;
- }
- return ret;
-}
-
-/*
- * Do a signal return; undo the signal stack.
- */
-#define loadsegment_gs(v) load_gs_index(v)
-#define loadsegment_fs(v) loadsegment(fs, v)
-#define loadsegment_ds(v) loadsegment(ds, v)
-#define loadsegment_es(v) loadsegment(es, v)
-
-#define get_user_seg(seg) ({ unsigned int v; savesegment(seg, v); v; })
-#define set_user_seg(seg, v) loadsegment_##seg(v)
-
-#define COPY(x) { \
- get_user_ex(regs->x, &sc->x); \
-}
-
-#define GET_SEG(seg) ({ \
- unsigned short tmp; \
- get_user_ex(tmp, &sc->seg); \
- tmp; \
-})
-
-#define COPY_SEG_CPL3(seg) do { \
- regs->seg = GET_SEG(seg) | 3; \
-} while (0)
-
-#define RELOAD_SEG(seg) { \
- unsigned int pre = GET_SEG(seg); \
- unsigned int cur = get_user_seg(seg); \
- pre |= 3; \
- if (pre != cur) \
- set_user_seg(seg, pre); \
-}
-
-static int ia32_restore_sigcontext(struct pt_regs *regs,
- struct sigcontext_ia32 __user *sc,
- unsigned int *pax)
-{
- unsigned int tmpflags, err = 0;
- void __user *buf;
- u32 tmp;
-
- /* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
-
- get_user_try {
- /*
- * Reload fs and gs if they have changed in the signal
- * handler. This does not handle long fs/gs base changes in
- * the handler, but does not clobber them at least in the
- * normal case.
- */
- RELOAD_SEG(gs);
- RELOAD_SEG(fs);
- RELOAD_SEG(ds);
- RELOAD_SEG(es);
-
- COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
- COPY(dx); COPY(cx); COPY(ip);
- /* Don't touch extended registers */
-
- COPY_SEG_CPL3(cs);
- COPY_SEG_CPL3(ss);
-
- get_user_ex(tmpflags, &sc->flags);
- regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
- /* disable syscall checks */
- regs->orig_ax = -1;
-
- get_user_ex(tmp, &sc->fpstate);
- buf = compat_ptr(tmp);
- err |= restore_i387_xstate_ia32(buf);
-
- get_user_ex(*pax, &sc->ax);
- } get_user_catch(err);
-
- return err;
-}
-
-asmlinkage long sys32_sigreturn(struct pt_regs *regs)
-{
- struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8);
- sigset_t set;
- unsigned int ax;
-
- if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
- goto badframe;
- if (__get_user(set.sig[0], &frame->sc.oldmask)
- || (_COMPAT_NSIG_WORDS > 1
- && __copy_from_user((((char *) &set.sig) + 4),
- &frame->extramask,
- sizeof(frame->extramask))))
- goto badframe;
-
- sigdelsetmask(&set, ~_BLOCKABLE);
- spin_lock_irq(&current->sighand->siglock);
- current->blocked = set;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
-
- if (ia32_restore_sigcontext(regs, &frame->sc, &ax))
- goto badframe;
- return ax;
-
-badframe:
- signal_fault(regs, frame, "32bit sigreturn");
- return 0;
-}
-
-asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs)
-{
- struct rt_sigframe_ia32 __user *frame;
- sigset_t set;
- unsigned int ax;
- struct pt_regs tregs;
-
- frame = (struct rt_sigframe_ia32 __user *)(regs->sp - 4);
-
- if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
- goto badframe;
- if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
- goto badframe;
-
- sigdelsetmask(&set, ~_BLOCKABLE);
- spin_lock_irq(&current->sighand->siglock);
- current->blocked = set;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
-
- if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
- goto badframe;
-
- tregs = *regs;
- if (sys32_sigaltstack(&frame->uc.uc_stack, NULL, &tregs) == -EFAULT)
- goto badframe;
-
- return ax;
-
-badframe:
- signal_fault(regs, frame, "32bit rt sigreturn");
- return 0;
-}
-
-/*
- * Set up a signal frame.
- */
-
-static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc,
- void __user *fpstate,
- struct pt_regs *regs, unsigned int mask)
-{
- int err = 0;
-
- put_user_try {
- put_user_ex(get_user_seg(gs), (unsigned int __user *)&sc->gs);
- put_user_ex(get_user_seg(fs), (unsigned int __user *)&sc->fs);
- put_user_ex(get_user_seg(ds), (unsigned int __user *)&sc->ds);
- put_user_ex(get_user_seg(es), (unsigned int __user *)&sc->es);
-
- put_user_ex(regs->di, &sc->di);
- put_user_ex(regs->si, &sc->si);
- put_user_ex(regs->bp, &sc->bp);
- put_user_ex(regs->sp, &sc->sp);
- put_user_ex(regs->bx, &sc->bx);
- put_user_ex(regs->dx, &sc->dx);
- put_user_ex(regs->cx, &sc->cx);
- put_user_ex(regs->ax, &sc->ax);
- put_user_ex(current->thread.trap_no, &sc->trapno);
- put_user_ex(current->thread.error_code, &sc->err);
- put_user_ex(regs->ip, &sc->ip);
- put_user_ex(regs->cs, (unsigned int __user *)&sc->cs);
- put_user_ex(regs->flags, &sc->flags);
- put_user_ex(regs->sp, &sc->sp_at_signal);
- put_user_ex(regs->ss, (unsigned int __user *)&sc->ss);
-
- put_user_ex(ptr_to_compat(fpstate), &sc->fpstate);
-
- /* non-iBCS2 extensions.. */
- put_user_ex(mask, &sc->oldmask);
- put_user_ex(current->thread.cr2, &sc->cr2);
- } put_user_catch(err);
-
- return err;
-}
-
-/*
- * Determine which stack to use..
- */
-static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs,
- size_t frame_size,
- void **fpstate)
-{
- unsigned long sp;
-
- /* Default to using normal stack */
- sp = regs->sp;
-
- /* This is the X/Open sanctioned signal stack switching. */
- if (ka->sa.sa_flags & SA_ONSTACK) {
- if (sas_ss_flags(sp) == 0)
- sp = current->sas_ss_sp + current->sas_ss_size;
- }
-
- /* This is the legacy signal stack switching. */
- else if ((regs->ss & 0xffff) != __USER32_DS &&
- !(ka->sa.sa_flags & SA_RESTORER) &&
- ka->sa.sa_restorer)
- sp = (unsigned long) ka->sa.sa_restorer;
-
- if (used_math()) {
- sp = sp - sig_xstate_ia32_size;
- *fpstate = (struct _fpstate_ia32 *) sp;
- if (save_i387_xstate_ia32(*fpstate) < 0)
- return (void __user *) -1L;
- }
-
- sp -= frame_size;
- /* Align the stack pointer according to the i386 ABI,
- * i.e. so that on function entry ((sp + 4) & 15) == 0. */
- sp = ((sp + 4) & -16ul) - 4;
- return (void __user *) sp;
-}
-
-int ia32_setup_frame(int sig, struct k_sigaction *ka,
- compat_sigset_t *set, struct pt_regs *regs)
-{
- struct sigframe_ia32 __user *frame;
- void __user *restorer;
- int err = 0;
- void __user *fpstate = NULL;
-
- /* copy_to_user optimizes that into a single 8 byte store */
- static const struct {
- u16 poplmovl;
- u32 val;
- u16 int80;
- } __attribute__((packed)) code = {
- 0xb858, /* popl %eax ; movl $...,%eax */
- __NR_ia32_sigreturn,
- 0x80cd, /* int $0x80 */
- };
-
- frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
-
- if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
- return -EFAULT;
-
- if (__put_user(sig, &frame->sig))
- return -EFAULT;
-
- if (ia32_setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]))
- return -EFAULT;
-
- if (_COMPAT_NSIG_WORDS > 1) {
- if (__copy_to_user(frame->extramask, &set->sig[1],
- sizeof(frame->extramask)))
- return -EFAULT;
- }
-
- if (ka->sa.sa_flags & SA_RESTORER) {
- restorer = ka->sa.sa_restorer;
- } else {
- /* Return stub is in 32bit vsyscall page */
- if (current->mm->context.vdso)
- restorer = VDSO32_SYMBOL(current->mm->context.vdso,
- sigreturn);
- else
- restorer = &frame->retcode;
- }
-
- put_user_try {
- put_user_ex(ptr_to_compat(restorer), &frame->pretcode);
-
- /*
- * These are actually not used anymore, but left because some
- * gdb versions depend on them as a marker.
- */
- put_user_ex(*((u64 *)&code), (u64 *)frame->retcode);
- } put_user_catch(err);
-
- if (err)
- return -EFAULT;
-
- /* Set up registers for signal handler */
- regs->sp = (unsigned long) frame;
- regs->ip = (unsigned long) ka->sa.sa_handler;
-
- /* Make -mregparm=3 work */
- regs->ax = sig;
- regs->dx = 0;
- regs->cx = 0;
-
- loadsegment(ds, __USER32_DS);
- loadsegment(es, __USER32_DS);
-
- regs->cs = __USER32_CS;
- regs->ss = __USER32_DS;
-
- return 0;
-}
-
-int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
- compat_sigset_t *set, struct pt_regs *regs)
-{
- struct rt_sigframe_ia32 __user *frame;
- void __user *restorer;
- int err = 0;
- void __user *fpstate = NULL;
-
- /* __copy_to_user optimizes that into a single 8 byte store */
- static const struct {
- u8 movl;
- u32 val;
- u16 int80;
- u8 pad;
- } __attribute__((packed)) code = {
- 0xb8,
- __NR_ia32_rt_sigreturn,
- 0x80cd,
- 0,
- };
-
- frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
-
- if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
- return -EFAULT;
-
- put_user_try {
- put_user_ex(sig, &frame->sig);
- put_user_ex(ptr_to_compat(&frame->info), &frame->pinfo);
- put_user_ex(ptr_to_compat(&frame->uc), &frame->puc);
- err |= copy_siginfo_to_user32(&frame->info, info);
-
- /* Create the ucontext. */
- if (cpu_has_xsave)
- put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags);
- else
- put_user_ex(0, &frame->uc.uc_flags);
- put_user_ex(0, &frame->uc.uc_link);
- put_user_ex(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
- put_user_ex(sas_ss_flags(regs->sp),
- &frame->uc.uc_stack.ss_flags);
- put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
- err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
- regs, set->sig[0]);
- err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
-
- if (ka->sa.sa_flags & SA_RESTORER)
- restorer = ka->sa.sa_restorer;
- else
- restorer = VDSO32_SYMBOL(current->mm->context.vdso,
- rt_sigreturn);
- put_user_ex(ptr_to_compat(restorer), &frame->pretcode);
-
- /*
- * Not actually used anymore, but left because some gdb
- * versions need it.
- */
- put_user_ex(*((u64 *)&code), (u64 *)frame->retcode);
- } put_user_catch(err);
-
- if (err)
- return -EFAULT;
-
- /* Set up registers for signal handler */
- regs->sp = (unsigned long) frame;
- regs->ip = (unsigned long) ka->sa.sa_handler;
-
- /* Make -mregparm=3 work */
- regs->ax = sig;
- regs->dx = (unsigned long) &frame->info;
- regs->cx = (unsigned long) &frame->uc;
-
- loadsegment(ds, __USER32_DS);
- loadsegment(es, __USER32_DS);
-
- regs->cs = __USER32_CS;
- regs->ss = __USER32_DS;
-
- return 0;
-}
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
deleted file mode 100644
index e590261ba059..000000000000
--- a/arch/x86/ia32/ia32entry.S
+++ /dev/null
@@ -1,835 +0,0 @@
-/*
- * Compatibility mode system call entry point for x86-64.
- *
- * Copyright 2000-2002 Andi Kleen, SuSE Labs.
- */
-
-#include <asm/dwarf2.h>
-#include <asm/calling.h>
-#include <asm/asm-offsets.h>
-#include <asm/current.h>
-#include <asm/errno.h>
-#include <asm/ia32_unistd.h>
-#include <asm/thread_info.h>
-#include <asm/segment.h>
-#include <asm/irqflags.h>
-#include <linux/linkage.h>
-
-/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
-#include <linux/elf-em.h>
-#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
-#define __AUDIT_ARCH_LE 0x40000000
-
-#ifndef CONFIG_AUDITSYSCALL
-#define sysexit_audit int_ret_from_sys_call
-#define sysretl_audit int_ret_from_sys_call
-#endif
-
-#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
-
- .macro IA32_ARG_FIXUP noebp=0
- movl %edi,%r8d
- .if \noebp
- .else
- movl %ebp,%r9d
- .endif
- xchg %ecx,%esi
- movl %ebx,%edi
- movl %edx,%edx /* zero extension */
- .endm
-
- /* clobbers %eax */
- .macro CLEAR_RREGS _r9=rax
- xorl %eax,%eax
- movq %rax,R11(%rsp)
- movq %rax,R10(%rsp)
- movq %\_r9,R9(%rsp)
- movq %rax,R8(%rsp)
- .endm
-
- /*
- * Reload arg registers from stack in case ptrace changed them.
- * We don't reload %eax because syscall_trace_enter() returned
- * the value it wants us to use in the table lookup.
- */
- .macro LOAD_ARGS32 offset, _r9=0
- .if \_r9
- movl \offset+16(%rsp),%r9d
- .endif
- movl \offset+40(%rsp),%ecx
- movl \offset+48(%rsp),%edx
- movl \offset+56(%rsp),%esi
- movl \offset+64(%rsp),%edi
- .endm
-
- .macro CFI_STARTPROC32 simple
- CFI_STARTPROC \simple
- CFI_UNDEFINED r8
- CFI_UNDEFINED r9
- CFI_UNDEFINED r10
- CFI_UNDEFINED r11
- CFI_UNDEFINED r12
- CFI_UNDEFINED r13
- CFI_UNDEFINED r14
- CFI_UNDEFINED r15
- .endm
-
-#ifdef CONFIG_PARAVIRT
-ENTRY(native_usergs_sysret32)
- swapgs
- sysretl
-ENDPROC(native_usergs_sysret32)
-
-ENTRY(native_irq_enable_sysexit)
- swapgs
- sti
- sysexit
-ENDPROC(native_irq_enable_sysexit)
-#endif
-
-/*
- * 32bit SYSENTER instruction entry.
- *
- * Arguments:
- * %eax System call number.
- * %ebx Arg1
- * %ecx Arg2
- * %edx Arg3
- * %esi Arg4
- * %edi Arg5
- * %ebp user stack
- * 0(%ebp) Arg6
- *
- * Interrupts off.
- *
- * This is purely a fast path. For anything complicated we use the int 0x80
- * path below. Set up a complete hardware stack frame to share code
- * with the int 0x80 path.
- */
-ENTRY(ia32_sysenter_target)
- CFI_STARTPROC32 simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,0
- CFI_REGISTER rsp,rbp
- SWAPGS_UNSAFE_STACK
- movq PER_CPU_VAR(kernel_stack), %rsp
- addq $(KERNEL_STACK_OFFSET),%rsp
- /*
- * No need to follow this irqs on/off section: the syscall
- * disabled irqs, here we enable it straight after entry:
- */
- ENABLE_INTERRUPTS(CLBR_NONE)
- movl %ebp,%ebp /* zero extension */
- pushq $__USER32_DS
- CFI_ADJUST_CFA_OFFSET 8
- /*CFI_REL_OFFSET ss,0*/
- pushq %rbp
- CFI_ADJUST_CFA_OFFSET 8
- CFI_REL_OFFSET rsp,0
- pushfq
- CFI_ADJUST_CFA_OFFSET 8
- /*CFI_REL_OFFSET rflags,0*/
- movl 8*3-THREAD_SIZE+TI_sysenter_return(%rsp), %r10d
- CFI_REGISTER rip,r10
- pushq $__USER32_CS
- CFI_ADJUST_CFA_OFFSET 8
- /*CFI_REL_OFFSET cs,0*/
- movl %eax, %eax
- pushq %r10
- CFI_ADJUST_CFA_OFFSET 8
- CFI_REL_OFFSET rip,0
- pushq %rax
- CFI_ADJUST_CFA_OFFSET 8
- cld
- SAVE_ARGS 0,0,1
- /* no need to do an access_ok check here because rbp has been
- 32bit zero extended */
-1: movl (%rbp),%ebp
- .section __ex_table,"a"
- .quad 1b,ia32_badarg
- .previous
- GET_THREAD_INFO(%r10)
- orl $TS_COMPAT,TI_status(%r10)
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
- CFI_REMEMBER_STATE
- jnz sysenter_tracesys
- cmpl $(IA32_NR_syscalls-1),%eax
- ja ia32_badsys
-sysenter_do_call:
- IA32_ARG_FIXUP
-sysenter_dispatch:
- call *ia32_sys_call_table(,%rax,8)
- movq %rax,RAX-ARGOFFSET(%rsp)
- GET_THREAD_INFO(%r10)
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- testl $_TIF_ALLWORK_MASK,TI_flags(%r10)
- jnz sysexit_audit
-sysexit_from_sys_call:
- andl $~TS_COMPAT,TI_status(%r10)
- /* clear IF, that popfq doesn't enable interrupts early */
- andl $~0x200,EFLAGS-R11(%rsp)
- movl RIP-R11(%rsp),%edx /* User %eip */
- CFI_REGISTER rip,rdx
- RESTORE_ARGS 1,24,1,1,1,1
- popfq
- CFI_ADJUST_CFA_OFFSET -8
- /*CFI_RESTORE rflags*/
- popq %rcx /* User %esp */
- CFI_ADJUST_CFA_OFFSET -8
- CFI_REGISTER rsp,rcx
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS_SYSEXIT32
-
-#ifdef CONFIG_AUDITSYSCALL
- .macro auditsys_entry_common
- movl %esi,%r9d /* 6th arg: 4th syscall arg */
- movl %edx,%r8d /* 5th arg: 3rd syscall arg */
- /* (already in %ecx) 4th arg: 2nd syscall arg */
- movl %ebx,%edx /* 3rd arg: 1st syscall arg */
- movl %eax,%esi /* 2nd arg: syscall number */
- movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */
- call audit_syscall_entry
- movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */
- cmpl $(IA32_NR_syscalls-1),%eax
- ja ia32_badsys
- movl %ebx,%edi /* reload 1st syscall arg */
- movl RCX-ARGOFFSET(%rsp),%esi /* reload 2nd syscall arg */
- movl RDX-ARGOFFSET(%rsp),%edx /* reload 3rd syscall arg */
- movl RSI-ARGOFFSET(%rsp),%ecx /* reload 4th syscall arg */
- movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */
- .endm
-
- .macro auditsys_exit exit,ebpsave=RBP
- testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
- jnz int_ret_from_sys_call
- TRACE_IRQS_ON
- sti
- movl %eax,%esi /* second arg, syscall return value */
- cmpl $0,%eax /* is it < 0? */
- setl %al /* 1 if so, 0 if not */
- movzbl %al,%edi /* zero-extend that into %edi */
- inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
- call audit_syscall_exit
- GET_THREAD_INFO(%r10)
- movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */
- movl \ebpsave-ARGOFFSET(%rsp),%ebp /* reload user register value */
- movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
- cli
- TRACE_IRQS_OFF
- testl %edi,TI_flags(%r10)
- jnz int_with_check
- jmp \exit
- .endm
-
-sysenter_auditsys:
- CFI_RESTORE_STATE
- auditsys_entry_common
- movl %ebp,%r9d /* reload 6th syscall arg */
- jmp sysenter_dispatch
-
-sysexit_audit:
- auditsys_exit sysexit_from_sys_call
-#endif
-
-sysenter_tracesys:
-#ifdef CONFIG_AUDITSYSCALL
- testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
- jz sysenter_auditsys
-#endif
- SAVE_REST
- CLEAR_RREGS
- movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */
- movq %rsp,%rdi /* &pt_regs -> arg1 */
- call syscall_trace_enter
- LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
- RESTORE_REST
- cmpl $(IA32_NR_syscalls-1),%eax
- ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
- jmp sysenter_do_call
- CFI_ENDPROC
-ENDPROC(ia32_sysenter_target)
-
-/*
- * 32bit SYSCALL instruction entry.
- *
- * Arguments:
- * %eax System call number.
- * %ebx Arg1
- * %ecx return EIP
- * %edx Arg3
- * %esi Arg4
- * %edi Arg5
- * %ebp Arg2 [note: not saved in the stack frame, should not be touched]
- * %esp user stack
- * 0(%esp) Arg6
- *
- * Interrupts off.
- *
- * This is purely a fast path. For anything complicated we use the int 0x80
- * path below. Set up a complete hardware stack frame to share code
- * with the int 0x80 path.
- */
-ENTRY(ia32_cstar_target)
- CFI_STARTPROC32 simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET
- CFI_REGISTER rip,rcx
- /*CFI_REGISTER rflags,r11*/
- SWAPGS_UNSAFE_STACK
- movl %esp,%r8d
- CFI_REGISTER rsp,r8
- movq PER_CPU_VAR(kernel_stack),%rsp
- /*
- * No need to follow this irqs on/off section: the syscall
- * disabled irqs and here we enable it straight after entry:
- */
- ENABLE_INTERRUPTS(CLBR_NONE)
- SAVE_ARGS 8,1,1
- movl %eax,%eax /* zero extension */
- movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
- movq %rcx,RIP-ARGOFFSET(%rsp)
- CFI_REL_OFFSET rip,RIP-ARGOFFSET
- movq %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */
- movl %ebp,%ecx
- movq $__USER32_CS,CS-ARGOFFSET(%rsp)
- movq $__USER32_DS,SS-ARGOFFSET(%rsp)
- movq %r11,EFLAGS-ARGOFFSET(%rsp)
- /*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
- movq %r8,RSP-ARGOFFSET(%rsp)
- CFI_REL_OFFSET rsp,RSP-ARGOFFSET
- /* no need to do an access_ok check here because r8 has been
- 32bit zero extended */
- /* hardware stack frame is complete now */
-1: movl (%r8),%r9d
- .section __ex_table,"a"
- .quad 1b,ia32_badarg
- .previous
- GET_THREAD_INFO(%r10)
- orl $TS_COMPAT,TI_status(%r10)
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
- CFI_REMEMBER_STATE
- jnz cstar_tracesys
- cmpl $IA32_NR_syscalls-1,%eax
- ja ia32_badsys
-cstar_do_call:
- IA32_ARG_FIXUP 1
-cstar_dispatch:
- call *ia32_sys_call_table(,%rax,8)
- movq %rax,RAX-ARGOFFSET(%rsp)
- GET_THREAD_INFO(%r10)
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- testl $_TIF_ALLWORK_MASK,TI_flags(%r10)
- jnz sysretl_audit
-sysretl_from_sys_call:
- andl $~TS_COMPAT,TI_status(%r10)
- RESTORE_ARGS 1,-ARG_SKIP,1,1,1
- movl RIP-ARGOFFSET(%rsp),%ecx
- CFI_REGISTER rip,rcx
- movl EFLAGS-ARGOFFSET(%rsp),%r11d
- /*CFI_REGISTER rflags,r11*/
- TRACE_IRQS_ON
- movl RSP-ARGOFFSET(%rsp),%esp
- CFI_RESTORE rsp
- USERGS_SYSRET32
-
-#ifdef CONFIG_AUDITSYSCALL
-cstar_auditsys:
- CFI_RESTORE_STATE
- movl %r9d,R9-ARGOFFSET(%rsp) /* register to be clobbered by call */
- auditsys_entry_common
- movl R9-ARGOFFSET(%rsp),%r9d /* reload 6th syscall arg */
- jmp cstar_dispatch
-
-sysretl_audit:
- auditsys_exit sysretl_from_sys_call, RCX /* user %ebp in RCX slot */
-#endif
-
-cstar_tracesys:
-#ifdef CONFIG_AUDITSYSCALL
- testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
- jz cstar_auditsys
-#endif
- xchgl %r9d,%ebp
- SAVE_REST
- CLEAR_RREGS r9
- movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
- movq %rsp,%rdi /* &pt_regs -> arg1 */
- call syscall_trace_enter
- LOAD_ARGS32 ARGOFFSET, 1 /* reload args from stack in case ptrace changed it */
- RESTORE_REST
- xchgl %ebp,%r9d
- cmpl $(IA32_NR_syscalls-1),%eax
- ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */
- jmp cstar_do_call
-END(ia32_cstar_target)
-
-ia32_badarg:
- movq $-EFAULT,%rax
- jmp ia32_sysret
- CFI_ENDPROC
-
-/*
- * Emulated IA32 system calls via int 0x80.
- *
- * Arguments:
- * %eax System call number.
- * %ebx Arg1
- * %ecx Arg2
- * %edx Arg3
- * %esi Arg4
- * %edi Arg5
- * %ebp Arg6 [note: not saved in the stack frame, should not be touched]
- *
- * Notes:
- * Uses the same stack frame as the x86-64 version.
- * All registers except %eax must be saved (but ptrace may violate that)
- * Arguments are zero extended. For system calls that want sign extension and
- * take long arguments a wrapper is needed. Most calls can just be called
- * directly.
- * Assumes it is only called from user space and entered with interrupts off.
- */
-
-ENTRY(ia32_syscall)
- CFI_STARTPROC32 simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,SS+8-RIP
- /*CFI_REL_OFFSET ss,SS-RIP*/
- CFI_REL_OFFSET rsp,RSP-RIP
- /*CFI_REL_OFFSET rflags,EFLAGS-RIP*/
- /*CFI_REL_OFFSET cs,CS-RIP*/
- CFI_REL_OFFSET rip,RIP-RIP
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- SWAPGS
- /*
- * No need to follow this irqs on/off section: the syscall
- * disabled irqs and here we enable it straight after entry:
- */
- ENABLE_INTERRUPTS(CLBR_NONE)
- movl %eax,%eax
- pushq %rax
- CFI_ADJUST_CFA_OFFSET 8
- cld
- /* note the registers are not zero extended to the sf.
- this could be a problem. */
- SAVE_ARGS 0,0,1
- GET_THREAD_INFO(%r10)
- orl $TS_COMPAT,TI_status(%r10)
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
- jnz ia32_tracesys
- cmpl $(IA32_NR_syscalls-1),%eax
- ja ia32_badsys
-ia32_do_call:
- IA32_ARG_FIXUP
- call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
-ia32_sysret:
- movq %rax,RAX-ARGOFFSET(%rsp)
- jmp int_ret_from_sys_call
-
-ia32_tracesys:
- SAVE_REST
- CLEAR_RREGS
- movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
- movq %rsp,%rdi /* &pt_regs -> arg1 */
- call syscall_trace_enter
- LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
- RESTORE_REST
- cmpl $(IA32_NR_syscalls-1),%eax
- ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */
- jmp ia32_do_call
-END(ia32_syscall)
-
-ia32_badsys:
- movq $0,ORIG_RAX-ARGOFFSET(%rsp)
- movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
- jmp int_ret_from_sys_call
-
-quiet_ni_syscall:
- movq $-ENOSYS,%rax
- ret
- CFI_ENDPROC
-
- .macro PTREGSCALL label, func, arg
- .globl \label
-\label:
- leaq \func(%rip),%rax
- leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
- jmp ia32_ptregs_common
- .endm
-
- CFI_STARTPROC32
-
- PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi
- PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi
- PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx
- PTREGSCALL stub32_execve, sys32_execve, %rcx
- PTREGSCALL stub32_fork, sys_fork, %rdi
- PTREGSCALL stub32_clone, sys32_clone, %rdx
- PTREGSCALL stub32_vfork, sys_vfork, %rdi
- PTREGSCALL stub32_iopl, sys_iopl, %rsi
-
-ENTRY(ia32_ptregs_common)
- popq %r11
- CFI_ENDPROC
- CFI_STARTPROC32 simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,SS+8-ARGOFFSET
- CFI_REL_OFFSET rax,RAX-ARGOFFSET
- CFI_REL_OFFSET rcx,RCX-ARGOFFSET
- CFI_REL_OFFSET rdx,RDX-ARGOFFSET
- CFI_REL_OFFSET rsi,RSI-ARGOFFSET
- CFI_REL_OFFSET rdi,RDI-ARGOFFSET
- CFI_REL_OFFSET rip,RIP-ARGOFFSET
-/* CFI_REL_OFFSET cs,CS-ARGOFFSET*/
-/* CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
- CFI_REL_OFFSET rsp,RSP-ARGOFFSET
-/* CFI_REL_OFFSET ss,SS-ARGOFFSET*/
- SAVE_REST
- call *%rax
- RESTORE_REST
- jmp ia32_sysret /* misbalances the return cache */
- CFI_ENDPROC
-END(ia32_ptregs_common)
-
- .section .rodata,"a"
- .align 8
-ia32_sys_call_table:
- .quad sys_restart_syscall
- .quad sys_exit
- .quad stub32_fork
- .quad sys_read
- .quad sys_write
- .quad compat_sys_open /* 5 */
- .quad sys_close
- .quad sys32_waitpid
- .quad sys_creat
- .quad sys_link
- .quad sys_unlink /* 10 */
- .quad stub32_execve
- .quad sys_chdir
- .quad compat_sys_time
- .quad sys_mknod
- .quad sys_chmod /* 15 */
- .quad sys_lchown16
- .quad quiet_ni_syscall /* old break syscall holder */
- .quad sys_stat
- .quad sys32_lseek
- .quad sys_getpid /* 20 */
- .quad compat_sys_mount /* mount */
- .quad sys_oldumount /* old_umount */
- .quad sys_setuid16
- .quad sys_getuid16
- .quad compat_sys_stime /* stime */ /* 25 */
- .quad compat_sys_ptrace /* ptrace */
- .quad sys_alarm
- .quad sys_fstat /* (old)fstat */
- .quad sys_pause
- .quad compat_sys_utime /* 30 */
- .quad quiet_ni_syscall /* old stty syscall holder */
- .quad quiet_ni_syscall /* old gtty syscall holder */
- .quad sys_access
- .quad sys_nice
- .quad quiet_ni_syscall /* 35 */ /* old ftime syscall holder */
- .quad sys_sync
- .quad sys32_kill
- .quad sys_rename
- .quad sys_mkdir
- .quad sys_rmdir /* 40 */
- .quad sys_dup
- .quad sys32_pipe
- .quad compat_sys_times
- .quad quiet_ni_syscall /* old prof syscall holder */
- .quad sys_brk /* 45 */
- .quad sys_setgid16
- .quad sys_getgid16
- .quad sys_signal
- .quad sys_geteuid16
- .quad sys_getegid16 /* 50 */
- .quad sys_acct
- .quad sys_umount /* new_umount */
- .quad quiet_ni_syscall /* old lock syscall holder */
- .quad compat_sys_ioctl
- .quad compat_sys_fcntl64 /* 55 */
- .quad quiet_ni_syscall /* old mpx syscall holder */
- .quad sys_setpgid
- .quad quiet_ni_syscall /* old ulimit syscall holder */
- .quad sys32_olduname
- .quad sys_umask /* 60 */
- .quad sys_chroot
- .quad compat_sys_ustat
- .quad sys_dup2
- .quad sys_getppid
- .quad sys_getpgrp /* 65 */
- .quad sys_setsid
- .quad sys32_sigaction
- .quad sys_sgetmask
- .quad sys_ssetmask
- .quad sys_setreuid16 /* 70 */
- .quad sys_setregid16
- .quad sys32_sigsuspend
- .quad compat_sys_sigpending
- .quad sys_sethostname
- .quad compat_sys_setrlimit /* 75 */
- .quad compat_sys_old_getrlimit /* old_getrlimit */
- .quad compat_sys_getrusage
- .quad compat_sys_gettimeofday
- .quad compat_sys_settimeofday
- .quad sys_getgroups16 /* 80 */
- .quad sys_setgroups16
- .quad sys32_old_select
- .quad sys_symlink
- .quad sys_lstat
- .quad sys_readlink /* 85 */
- .quad sys_uselib
- .quad sys_swapon
- .quad sys_reboot
- .quad compat_sys_old_readdir
- .quad sys32_mmap /* 90 */
- .quad sys_munmap
- .quad sys_truncate
- .quad sys_ftruncate
- .quad sys_fchmod
- .quad sys_fchown16 /* 95 */
- .quad sys_getpriority
- .quad sys_setpriority
- .quad quiet_ni_syscall /* old profil syscall holder */
- .quad compat_sys_statfs
- .quad compat_sys_fstatfs /* 100 */
- .quad sys_ioperm
- .quad compat_sys_socketcall
- .quad sys_syslog
- .quad compat_sys_setitimer
- .quad compat_sys_getitimer /* 105 */
- .quad compat_sys_newstat
- .quad compat_sys_newlstat
- .quad compat_sys_newfstat
- .quad sys32_uname
- .quad stub32_iopl /* 110 */
- .quad sys_vhangup
- .quad quiet_ni_syscall /* old "idle" system call */
- .quad sys32_vm86_warning /* vm86old */
- .quad compat_sys_wait4
- .quad sys_swapoff /* 115 */
- .quad compat_sys_sysinfo
- .quad sys32_ipc
- .quad sys_fsync
- .quad stub32_sigreturn
- .quad stub32_clone /* 120 */
- .quad sys_setdomainname
- .quad sys_uname
- .quad sys_modify_ldt
- .quad compat_sys_adjtimex
- .quad sys32_mprotect /* 125 */
- .quad compat_sys_sigprocmask
- .quad quiet_ni_syscall /* create_module */
- .quad sys_init_module
- .quad sys_delete_module
- .quad quiet_ni_syscall /* 130 get_kernel_syms */
- .quad sys32_quotactl
- .quad sys_getpgid
- .quad sys_fchdir
- .quad quiet_ni_syscall /* bdflush */
- .quad sys_sysfs /* 135 */
- .quad sys_personality
- .quad quiet_ni_syscall /* for afs_syscall */
- .quad sys_setfsuid16
- .quad sys_setfsgid16
- .quad sys_llseek /* 140 */
- .quad compat_sys_getdents
- .quad compat_sys_select
- .quad sys_flock
- .quad sys_msync
- .quad compat_sys_readv /* 145 */
- .quad compat_sys_writev
- .quad sys_getsid
- .quad sys_fdatasync
- .quad sys32_sysctl /* sysctl */
- .quad sys_mlock /* 150 */
- .quad sys_munlock
- .quad sys_mlockall
- .quad sys_munlockall
- .quad sys_sched_setparam
- .quad sys_sched_getparam /* 155 */
- .quad sys_sched_setscheduler
- .quad sys_sched_getscheduler
- .quad sys_sched_yield
- .quad sys_sched_get_priority_max
- .quad sys_sched_get_priority_min /* 160 */
- .quad sys32_sched_rr_get_interval
- .quad compat_sys_nanosleep
- .quad sys_mremap
- .quad sys_setresuid16
- .quad sys_getresuid16 /* 165 */
- .quad sys32_vm86_warning /* vm86 */
- .quad quiet_ni_syscall /* query_module */
- .quad sys_poll
- .quad compat_sys_nfsservctl
- .quad sys_setresgid16 /* 170 */
- .quad sys_getresgid16
- .quad sys_prctl
- .quad stub32_rt_sigreturn
- .quad sys32_rt_sigaction
- .quad sys32_rt_sigprocmask /* 175 */
- .quad sys32_rt_sigpending
- .quad compat_sys_rt_sigtimedwait
- .quad sys32_rt_sigqueueinfo
- .quad sys_rt_sigsuspend
- .quad sys32_pread /* 180 */
- .quad sys32_pwrite
- .quad sys_chown16
- .quad sys_getcwd
- .quad sys_capget
- .quad sys_capset
- .quad stub32_sigaltstack
- .quad sys32_sendfile
- .quad quiet_ni_syscall /* streams1 */
- .quad quiet_ni_syscall /* streams2 */
- .quad stub32_vfork /* 190 */
- .quad compat_sys_getrlimit
- .quad sys32_mmap2
- .quad sys32_truncate64
- .quad sys32_ftruncate64
- .quad sys32_stat64 /* 195 */
- .quad sys32_lstat64
- .quad sys32_fstat64
- .quad sys_lchown
- .quad sys_getuid
- .quad sys_getgid /* 200 */
- .quad sys_geteuid
- .quad sys_getegid
- .quad sys_setreuid
- .quad sys_setregid
- .quad sys_getgroups /* 205 */
- .quad sys_setgroups
- .quad sys_fchown
- .quad sys_setresuid
- .quad sys_getresuid
- .quad sys_setresgid /* 210 */
- .quad sys_getresgid
- .quad sys_chown
- .quad sys_setuid
- .quad sys_setgid
- .quad sys_setfsuid /* 215 */
- .quad sys_setfsgid
- .quad sys_pivot_root
- .quad sys_mincore
- .quad sys_madvise
- .quad compat_sys_getdents64 /* 220 getdents64 */
- .quad compat_sys_fcntl64
- .quad quiet_ni_syscall /* tux */
- .quad quiet_ni_syscall /* security */
- .quad sys_gettid
- .quad sys32_readahead /* 225 */
- .quad sys_setxattr
- .quad sys_lsetxattr
- .quad sys_fsetxattr
- .quad sys_getxattr
- .quad sys_lgetxattr /* 230 */
- .quad sys_fgetxattr
- .quad sys_listxattr
- .quad sys_llistxattr
- .quad sys_flistxattr
- .quad sys_removexattr /* 235 */
- .quad sys_lremovexattr
- .quad sys_fremovexattr
- .quad sys_tkill
- .quad sys_sendfile64
- .quad compat_sys_futex /* 240 */
- .quad compat_sys_sched_setaffinity
- .quad compat_sys_sched_getaffinity
- .quad sys_set_thread_area
- .quad sys_get_thread_area
- .quad compat_sys_io_setup /* 245 */
- .quad sys_io_destroy
- .quad compat_sys_io_getevents
- .quad compat_sys_io_submit
- .quad sys_io_cancel
- .quad sys32_fadvise64 /* 250 */
- .quad quiet_ni_syscall /* free_huge_pages */
- .quad sys_exit_group
- .quad sys32_lookup_dcookie
- .quad sys_epoll_create
- .quad sys_epoll_ctl /* 255 */
- .quad sys_epoll_wait
- .quad sys_remap_file_pages
- .quad sys_set_tid_address
- .quad compat_sys_timer_create
- .quad compat_sys_timer_settime /* 260 */
- .quad compat_sys_timer_gettime
- .quad sys_timer_getoverrun
- .quad sys_timer_delete
- .quad compat_sys_clock_settime
- .quad compat_sys_clock_gettime /* 265 */
- .quad compat_sys_clock_getres
- .quad compat_sys_clock_nanosleep
- .quad compat_sys_statfs64
- .quad compat_sys_fstatfs64
- .quad sys_tgkill /* 270 */
- .quad compat_sys_utimes
- .quad sys32_fadvise64_64
- .quad quiet_ni_syscall /* sys_vserver */
- .quad sys_mbind
- .quad compat_sys_get_mempolicy /* 275 */
- .quad sys_set_mempolicy
- .quad compat_sys_mq_open
- .quad sys_mq_unlink
- .quad compat_sys_mq_timedsend
- .quad compat_sys_mq_timedreceive /* 280 */
- .quad compat_sys_mq_notify
- .quad compat_sys_mq_getsetattr
- .quad compat_sys_kexec_load /* reserved for kexec */
- .quad compat_sys_waitid
- .quad quiet_ni_syscall /* 285: sys_altroot */
- .quad sys_add_key
- .quad sys_request_key
- .quad sys_keyctl
- .quad sys_ioprio_set
- .quad sys_ioprio_get /* 290 */
- .quad sys_inotify_init
- .quad sys_inotify_add_watch
- .quad sys_inotify_rm_watch
- .quad sys_migrate_pages
- .quad compat_sys_openat /* 295 */
- .quad sys_mkdirat
- .quad sys_mknodat
- .quad sys_fchownat
- .quad compat_sys_futimesat
- .quad sys32_fstatat /* 300 */
- .quad sys_unlinkat
- .quad sys_renameat
- .quad sys_linkat
- .quad sys_symlinkat
- .quad sys_readlinkat /* 305 */
- .quad sys_fchmodat
- .quad sys_faccessat
- .quad compat_sys_pselect6
- .quad compat_sys_ppoll
- .quad sys_unshare /* 310 */
- .quad compat_sys_set_robust_list
- .quad compat_sys_get_robust_list
- .quad sys_splice
- .quad sys32_sync_file_range
- .quad sys_tee /* 315 */
- .quad compat_sys_vmsplice
- .quad compat_sys_move_pages
- .quad sys_getcpu
- .quad sys_epoll_pwait
- .quad compat_sys_utimensat /* 320 */
- .quad compat_sys_signalfd
- .quad sys_timerfd_create
- .quad sys_eventfd
- .quad sys32_fallocate
- .quad compat_sys_timerfd_settime /* 325 */
- .quad compat_sys_timerfd_gettime
- .quad compat_sys_signalfd4
- .quad sys_eventfd2
- .quad sys_epoll_create1
- .quad sys_dup3 /* 330 */
- .quad sys_pipe2
- .quad sys_inotify_init1
- .quad compat_sys_preadv
- .quad compat_sys_pwritev
- .quad compat_sys_rt_tgsigqueueinfo /* 335 */
- .quad sys_perf_counter_open
-ia32_syscall_end:
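
The deleted entry path above bounds-checks the 32-bit syscall number against IA32_NR_syscalls-1 and then jumps through ia32_sys_call_table, a flat array of 8-byte pointers scaled by %rax. A minimal standalone C model of that dispatch (illustrative sketch only; the names and table contents are invented):

#include <stdio.h>
#include <errno.h>

typedef long (*syscall_fn)(long, long, long);

static long demo_sys_exit(long a, long b, long c) { (void)b; (void)c; return a; }
static long demo_sys_read(long a, long b, long c) { return a + b + c; }

static syscall_fn demo_table[] = { demo_sys_exit, demo_sys_read };
#define DEMO_NR_SYSCALLS (sizeof(demo_table) / sizeof(demo_table[0]))

static long demo_dispatch(unsigned long nr, long a, long b, long c)
{
	if (nr > DEMO_NR_SYSCALLS - 1)		/* cmpl $(NR-1),%eax ; ja badsys */
		return -ENOSYS;
	return demo_table[nr](a, b, c);		/* call *table(,%rax,8) */
}

int main(void)
{
	printf("%ld %ld\n", demo_dispatch(1, 1, 2, 3), demo_dispatch(99, 0, 0, 0));
	return 0;
}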
diff --git a/arch/x86/ia32/ipc32.c b/arch/x86/ia32/ipc32.c
deleted file mode 100644
index 29cdcd02ead3..000000000000
--- a/arch/x86/ia32/ipc32.c
+++ /dev/null
@@ -1,54 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/spinlock.h>
-#include <linux/list.h>
-#include <linux/syscalls.h>
-#include <linux/time.h>
-#include <linux/sem.h>
-#include <linux/msg.h>
-#include <linux/shm.h>
-#include <linux/ipc.h>
-#include <linux/compat.h>
-#include <asm/sys_ia32.h>
-
-asmlinkage long sys32_ipc(u32 call, int first, int second, int third,
- compat_uptr_t ptr, u32 fifth)
-{
- int version;
-
- version = call >> 16; /* hack for backward compatibility */
- call &= 0xffff;
-
- switch (call) {
- case SEMOP:
- /* struct sembuf is the same on 32 and 64bit :)) */
- return sys_semtimedop(first, compat_ptr(ptr), second, NULL);
- case SEMTIMEDOP:
- return compat_sys_semtimedop(first, compat_ptr(ptr), second,
- compat_ptr(fifth));
- case SEMGET:
- return sys_semget(first, second, third);
- case SEMCTL:
- return compat_sys_semctl(first, second, third, compat_ptr(ptr));
-
- case MSGSND:
- return compat_sys_msgsnd(first, second, third, compat_ptr(ptr));
- case MSGRCV:
- return compat_sys_msgrcv(first, second, fifth, third,
- version, compat_ptr(ptr));
- case MSGGET:
- return sys_msgget((key_t) first, second);
- case MSGCTL:
- return compat_sys_msgctl(first, second, compat_ptr(ptr));
-
- case SHMAT:
- return compat_sys_shmat(first, second, third, version,
- compat_ptr(ptr));
- case SHMDT:
- return sys_shmdt(compat_ptr(ptr));
- case SHMGET:
- return sys_shmget(first, (unsigned)second, third);
- case SHMCTL:
- return compat_sys_shmctl(first, second, compat_ptr(ptr));
- }
- return -ENOSYS;
-}
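
The removed sys32_ipc() is a demultiplexer: the low 16 bits of its first argument select the IPC operation and the high 16 bits carry the legacy "version". A tiny standalone sketch of that decoding (the constant is a demo value; the real operation numbers live in <linux/ipc.h>):

#include <stdio.h>

#define DEMO_SEMOP 1	/* stand-in value for the demo */

int main(void)
{
	unsigned int call = (1u << 16) | DEMO_SEMOP;	/* "version" 1 in the top half, op in the bottom */
	unsigned int version = call >> 16;		/* hack for backward compatibility */
	unsigned int op = call & 0xffff;

	printf("version=%u op=%u\n", version, op);	/* version=1 op=1 */
	return 0;
}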
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
deleted file mode 100644
index 085a8c35f149..000000000000
--- a/arch/x86/ia32/sys_ia32.c
+++ /dev/null
@@ -1,728 +0,0 @@
-/*
- * sys_ia32.c: Conversion between 32bit and 64bit native syscalls. Based on
- * sys_sparc32
- *
- * Copyright (C) 2000 VA Linux Co
- * Copyright (C) 2000 Don Dugger <n0ano@valinux.com>
- * Copyright (C) 1999 Arun Sharma <arun.sharma@intel.com>
- * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
- * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
- * Copyright (C) 2000 Hewlett-Packard Co.
- * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 2000,2001,2002 Andi Kleen, SuSE Labs (x86-64 port)
- *
- * These routines maintain argument size conversion between 32bit and 64bit
- * environment. In 2.5 most of this should be moved to a generic directory.
- *
- * This file assumes that there is a hole at the end of user address space.
- *
- * Some of the functions are LE specific currently. These are
- * hopefully all marked. This should be fixed.
- */
-
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/signal.h>
-#include <linux/syscalls.h>
-#include <linux/times.h>
-#include <linux/utsname.h>
-#include <linux/smp_lock.h>
-#include <linux/mm.h>
-#include <linux/uio.h>
-#include <linux/poll.h>
-#include <linux/personality.h>
-#include <linux/stat.h>
-#include <linux/rwsem.h>
-#include <linux/compat.h>
-#include <linux/vfs.h>
-#include <linux/ptrace.h>
-#include <linux/highuid.h>
-#include <linux/sysctl.h>
-#include <asm/mman.h>
-#include <asm/types.h>
-#include <asm/uaccess.h>
-#include <asm/atomic.h>
-#include <asm/vgtod.h>
-#include <asm/sys_ia32.h>
-
-#define AA(__x) ((unsigned long)(__x))
-
-
-asmlinkage long sys32_truncate64(char __user *filename,
- unsigned long offset_low,
- unsigned long offset_high)
-{
- return sys_truncate(filename, ((loff_t) offset_high << 32) | offset_low);
-}
-
-asmlinkage long sys32_ftruncate64(unsigned int fd, unsigned long offset_low,
- unsigned long offset_high)
-{
- return sys_ftruncate(fd, ((loff_t) offset_high << 32) | offset_low);
-}
-
-/*
- * Another set for IA32/LFS -- x86_64 struct stat is different due to
- * support for 64bit inode numbers.
- */
-static int cp_stat64(struct stat64 __user *ubuf, struct kstat *stat)
-{
- typeof(ubuf->st_uid) uid = 0;
- typeof(ubuf->st_gid) gid = 0;
- SET_UID(uid, stat->uid);
- SET_GID(gid, stat->gid);
- if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct stat64)) ||
- __put_user(huge_encode_dev(stat->dev), &ubuf->st_dev) ||
- __put_user(stat->ino, &ubuf->__st_ino) ||
- __put_user(stat->ino, &ubuf->st_ino) ||
- __put_user(stat->mode, &ubuf->st_mode) ||
- __put_user(stat->nlink, &ubuf->st_nlink) ||
- __put_user(uid, &ubuf->st_uid) ||
- __put_user(gid, &ubuf->st_gid) ||
- __put_user(huge_encode_dev(stat->rdev), &ubuf->st_rdev) ||
- __put_user(stat->size, &ubuf->st_size) ||
- __put_user(stat->atime.tv_sec, &ubuf->st_atime) ||
- __put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec) ||
- __put_user(stat->mtime.tv_sec, &ubuf->st_mtime) ||
- __put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec) ||
- __put_user(stat->ctime.tv_sec, &ubuf->st_ctime) ||
- __put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec) ||
- __put_user(stat->blksize, &ubuf->st_blksize) ||
- __put_user(stat->blocks, &ubuf->st_blocks))
- return -EFAULT;
- return 0;
-}
-
-asmlinkage long sys32_stat64(char __user *filename,
- struct stat64 __user *statbuf)
-{
- struct kstat stat;
- int ret = vfs_stat(filename, &stat);
-
- if (!ret)
- ret = cp_stat64(statbuf, &stat);
- return ret;
-}
-
-asmlinkage long sys32_lstat64(char __user *filename,
- struct stat64 __user *statbuf)
-{
- struct kstat stat;
- int ret = vfs_lstat(filename, &stat);
- if (!ret)
- ret = cp_stat64(statbuf, &stat);
- return ret;
-}
-
-asmlinkage long sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf)
-{
- struct kstat stat;
- int ret = vfs_fstat(fd, &stat);
- if (!ret)
- ret = cp_stat64(statbuf, &stat);
- return ret;
-}
-
-asmlinkage long sys32_fstatat(unsigned int dfd, char __user *filename,
- struct stat64 __user *statbuf, int flag)
-{
- struct kstat stat;
- int error;
-
- error = vfs_fstatat(dfd, filename, &stat, flag);
- if (error)
- return error;
- return cp_stat64(statbuf, &stat);
-}
-
-/*
- * Linux/i386 didn't use to be able to handle more than
- * 4 system call parameters, so these system calls used a memory
- * block for parameter passing..
- */
-
-struct mmap_arg_struct {
- unsigned int addr;
- unsigned int len;
- unsigned int prot;
- unsigned int flags;
- unsigned int fd;
- unsigned int offset;
-};
-
-asmlinkage long sys32_mmap(struct mmap_arg_struct __user *arg)
-{
- struct mmap_arg_struct a;
- struct file *file = NULL;
- unsigned long retval;
- struct mm_struct *mm ;
-
- if (copy_from_user(&a, arg, sizeof(a)))
- return -EFAULT;
-
- if (a.offset & ~PAGE_MASK)
- return -EINVAL;
-
- if (!(a.flags & MAP_ANONYMOUS)) {
- file = fget(a.fd);
- if (!file)
- return -EBADF;
- }
-
- mm = current->mm;
- down_write(&mm->mmap_sem);
- retval = do_mmap_pgoff(file, a.addr, a.len, a.prot, a.flags,
- a.offset>>PAGE_SHIFT);
- if (file)
- fput(file);
-
- up_write(&mm->mmap_sem);
-
- return retval;
-}
-
-asmlinkage long sys32_mprotect(unsigned long start, size_t len,
- unsigned long prot)
-{
- return sys_mprotect(start, len, prot);
-}
-
-asmlinkage long sys32_pipe(int __user *fd)
-{
- int retval;
- int fds[2];
-
- retval = do_pipe_flags(fds, 0);
- if (retval)
- goto out;
- if (copy_to_user(fd, fds, sizeof(fds)))
- retval = -EFAULT;
-out:
- return retval;
-}
-
-asmlinkage long sys32_rt_sigaction(int sig, struct sigaction32 __user *act,
- struct sigaction32 __user *oact,
- unsigned int sigsetsize)
-{
- struct k_sigaction new_ka, old_ka;
- int ret;
- compat_sigset_t set32;
-
- /* XXX: Don't preclude handling different sized sigset_t's. */
- if (sigsetsize != sizeof(compat_sigset_t))
- return -EINVAL;
-
- if (act) {
- compat_uptr_t handler, restorer;
-
- if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
- __get_user(handler, &act->sa_handler) ||
- __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
- __get_user(restorer, &act->sa_restorer) ||
- __copy_from_user(&set32, &act->sa_mask,
- sizeof(compat_sigset_t)))
- return -EFAULT;
- new_ka.sa.sa_handler = compat_ptr(handler);
- new_ka.sa.sa_restorer = compat_ptr(restorer);
-
- /*
-		 * FIXME: here we rely on _COMPAT_NSIG_WORDS to be >=
-		 * _NSIG_WORDS << 1
- */
- switch (_NSIG_WORDS) {
- case 4: new_ka.sa.sa_mask.sig[3] = set32.sig[6]
- | (((long)set32.sig[7]) << 32);
- case 3: new_ka.sa.sa_mask.sig[2] = set32.sig[4]
- | (((long)set32.sig[5]) << 32);
- case 2: new_ka.sa.sa_mask.sig[1] = set32.sig[2]
- | (((long)set32.sig[3]) << 32);
- case 1: new_ka.sa.sa_mask.sig[0] = set32.sig[0]
- | (((long)set32.sig[1]) << 32);
- }
- }
-
- ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
-
- if (!ret && oact) {
- /*
-		 * FIXME: here we rely on _COMPAT_NSIG_WORDS to be >=
-		 * _NSIG_WORDS << 1
- */
- switch (_NSIG_WORDS) {
- case 4:
- set32.sig[7] = (old_ka.sa.sa_mask.sig[3] >> 32);
- set32.sig[6] = old_ka.sa.sa_mask.sig[3];
- case 3:
- set32.sig[5] = (old_ka.sa.sa_mask.sig[2] >> 32);
- set32.sig[4] = old_ka.sa.sa_mask.sig[2];
- case 2:
- set32.sig[3] = (old_ka.sa.sa_mask.sig[1] >> 32);
- set32.sig[2] = old_ka.sa.sa_mask.sig[1];
- case 1:
- set32.sig[1] = (old_ka.sa.sa_mask.sig[0] >> 32);
- set32.sig[0] = old_ka.sa.sa_mask.sig[0];
- }
- if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
- __put_user(ptr_to_compat(old_ka.sa.sa_handler),
- &oact->sa_handler) ||
- __put_user(ptr_to_compat(old_ka.sa.sa_restorer),
- &oact->sa_restorer) ||
- __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
- __copy_to_user(&oact->sa_mask, &set32,
- sizeof(compat_sigset_t)))
- return -EFAULT;
- }
-
- return ret;
-}
-
-asmlinkage long sys32_sigaction(int sig, struct old_sigaction32 __user *act,
- struct old_sigaction32 __user *oact)
-{
- struct k_sigaction new_ka, old_ka;
- int ret;
-
- if (act) {
- compat_old_sigset_t mask;
- compat_uptr_t handler, restorer;
-
- if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
- __get_user(handler, &act->sa_handler) ||
- __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
- __get_user(restorer, &act->sa_restorer) ||
- __get_user(mask, &act->sa_mask))
- return -EFAULT;
-
- new_ka.sa.sa_handler = compat_ptr(handler);
- new_ka.sa.sa_restorer = compat_ptr(restorer);
-
- siginitset(&new_ka.sa.sa_mask, mask);
- }
-
- ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
-
- if (!ret && oact) {
- if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
- __put_user(ptr_to_compat(old_ka.sa.sa_handler),
- &oact->sa_handler) ||
- __put_user(ptr_to_compat(old_ka.sa.sa_restorer),
- &oact->sa_restorer) ||
- __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
- __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
- return -EFAULT;
- }
-
- return ret;
-}
-
-asmlinkage long sys32_rt_sigprocmask(int how, compat_sigset_t __user *set,
- compat_sigset_t __user *oset,
- unsigned int sigsetsize)
-{
- sigset_t s;
- compat_sigset_t s32;
- int ret;
- mm_segment_t old_fs = get_fs();
-
- if (set) {
- if (copy_from_user(&s32, set, sizeof(compat_sigset_t)))
- return -EFAULT;
- switch (_NSIG_WORDS) {
- case 4: s.sig[3] = s32.sig[6] | (((long)s32.sig[7]) << 32);
- case 3: s.sig[2] = s32.sig[4] | (((long)s32.sig[5]) << 32);
- case 2: s.sig[1] = s32.sig[2] | (((long)s32.sig[3]) << 32);
- case 1: s.sig[0] = s32.sig[0] | (((long)s32.sig[1]) << 32);
- }
- }
- set_fs(KERNEL_DS);
- ret = sys_rt_sigprocmask(how,
- set ? (sigset_t __user *)&s : NULL,
- oset ? (sigset_t __user *)&s : NULL,
- sigsetsize);
- set_fs(old_fs);
- if (ret)
- return ret;
- if (oset) {
- switch (_NSIG_WORDS) {
- case 4: s32.sig[7] = (s.sig[3] >> 32); s32.sig[6] = s.sig[3];
- case 3: s32.sig[5] = (s.sig[2] >> 32); s32.sig[4] = s.sig[2];
- case 2: s32.sig[3] = (s.sig[1] >> 32); s32.sig[2] = s.sig[1];
- case 1: s32.sig[1] = (s.sig[0] >> 32); s32.sig[0] = s.sig[0];
- }
- if (copy_to_user(oset, &s32, sizeof(compat_sigset_t)))
- return -EFAULT;
- }
- return 0;
-}
-
-asmlinkage long sys32_alarm(unsigned int seconds)
-{
- return alarm_setitimer(seconds);
-}
-
-struct sel_arg_struct {
- unsigned int n;
- unsigned int inp;
- unsigned int outp;
- unsigned int exp;
- unsigned int tvp;
-};
-
-asmlinkage long sys32_old_select(struct sel_arg_struct __user *arg)
-{
- struct sel_arg_struct a;
-
- if (copy_from_user(&a, arg, sizeof(a)))
- return -EFAULT;
- return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp),
- compat_ptr(a.exp), compat_ptr(a.tvp));
-}
-
-asmlinkage long sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr,
- int options)
-{
- return compat_sys_wait4(pid, stat_addr, options, NULL);
-}
-
-/* 32-bit timeval and related flotsam. */
-
-asmlinkage long sys32_sysfs(int option, u32 arg1, u32 arg2)
-{
- return sys_sysfs(option, arg1, arg2);
-}
-
-asmlinkage long sys32_sched_rr_get_interval(compat_pid_t pid,
- struct compat_timespec __user *interval)
-{
- struct timespec t;
- int ret;
- mm_segment_t old_fs = get_fs();
-
- set_fs(KERNEL_DS);
- ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t);
- set_fs(old_fs);
- if (put_compat_timespec(&t, interval))
- return -EFAULT;
- return ret;
-}
-
-asmlinkage long sys32_rt_sigpending(compat_sigset_t __user *set,
- compat_size_t sigsetsize)
-{
- sigset_t s;
- compat_sigset_t s32;
- int ret;
- mm_segment_t old_fs = get_fs();
-
- set_fs(KERNEL_DS);
- ret = sys_rt_sigpending((sigset_t __user *)&s, sigsetsize);
- set_fs(old_fs);
- if (!ret) {
- switch (_NSIG_WORDS) {
- case 4: s32.sig[7] = (s.sig[3] >> 32); s32.sig[6] = s.sig[3];
- case 3: s32.sig[5] = (s.sig[2] >> 32); s32.sig[4] = s.sig[2];
- case 2: s32.sig[3] = (s.sig[1] >> 32); s32.sig[2] = s.sig[1];
- case 1: s32.sig[1] = (s.sig[0] >> 32); s32.sig[0] = s.sig[0];
- }
- if (copy_to_user(set, &s32, sizeof(compat_sigset_t)))
- return -EFAULT;
- }
- return ret;
-}
-
-asmlinkage long sys32_rt_sigqueueinfo(int pid, int sig,
- compat_siginfo_t __user *uinfo)
-{
- siginfo_t info;
- int ret;
- mm_segment_t old_fs = get_fs();
-
- if (copy_siginfo_from_user32(&info, uinfo))
- return -EFAULT;
- set_fs(KERNEL_DS);
- ret = sys_rt_sigqueueinfo(pid, sig, (siginfo_t __user *)&info);
- set_fs(old_fs);
- return ret;
-}
-
-#ifdef CONFIG_SYSCTL_SYSCALL
-struct sysctl_ia32 {
- unsigned int name;
- int nlen;
- unsigned int oldval;
- unsigned int oldlenp;
- unsigned int newval;
- unsigned int newlen;
- unsigned int __unused[4];
-};
-
-
-asmlinkage long sys32_sysctl(struct sysctl_ia32 __user *args32)
-{
- struct sysctl_ia32 a32;
- mm_segment_t old_fs = get_fs();
- void __user *oldvalp, *newvalp;
- size_t oldlen;
- int __user *namep;
- long ret;
-
- if (copy_from_user(&a32, args32, sizeof(a32)))
- return -EFAULT;
-
- /*
- * We need to pre-validate these because we have to disable
- * address checking before calling do_sysctl() because of
- * OLDLEN but we can't run the risk of the user specifying bad
- * addresses here. Well, since we're dealing with 32 bit
- * addresses, we KNOW that access_ok() will always succeed, so
- * this is an expensive NOP, but so what...
- */
- namep = compat_ptr(a32.name);
- oldvalp = compat_ptr(a32.oldval);
- newvalp = compat_ptr(a32.newval);
-
- if ((oldvalp && get_user(oldlen, (int __user *)compat_ptr(a32.oldlenp)))
- || !access_ok(VERIFY_WRITE, namep, 0)
- || !access_ok(VERIFY_WRITE, oldvalp, 0)
- || !access_ok(VERIFY_WRITE, newvalp, 0))
- return -EFAULT;
-
- set_fs(KERNEL_DS);
- lock_kernel();
- ret = do_sysctl(namep, a32.nlen, oldvalp, (size_t __user *)&oldlen,
- newvalp, (size_t) a32.newlen);
- unlock_kernel();
- set_fs(old_fs);
-
- if (oldvalp && put_user(oldlen, (int __user *)compat_ptr(a32.oldlenp)))
- return -EFAULT;
-
- return ret;
-}
-#endif
-
-/* warning: next two assume little endian */
-asmlinkage long sys32_pread(unsigned int fd, char __user *ubuf, u32 count,
- u32 poslo, u32 poshi)
-{
- return sys_pread64(fd, ubuf, count,
- ((loff_t)AA(poshi) << 32) | AA(poslo));
-}
-
-asmlinkage long sys32_pwrite(unsigned int fd, char __user *ubuf, u32 count,
- u32 poslo, u32 poshi)
-{
- return sys_pwrite64(fd, ubuf, count,
- ((loff_t)AA(poshi) << 32) | AA(poslo));
-}
-
-
-asmlinkage long sys32_personality(unsigned long personality)
-{
- int ret;
-
- if (personality(current->personality) == PER_LINUX32 &&
- personality == PER_LINUX)
- personality = PER_LINUX32;
- ret = sys_personality(personality);
- if (ret == PER_LINUX32)
- ret = PER_LINUX;
- return ret;
-}
-
-asmlinkage long sys32_sendfile(int out_fd, int in_fd,
- compat_off_t __user *offset, s32 count)
-{
- mm_segment_t old_fs = get_fs();
- int ret;
- off_t of;
-
- if (offset && get_user(of, offset))
- return -EFAULT;
-
- set_fs(KERNEL_DS);
- ret = sys_sendfile(out_fd, in_fd, offset ? (off_t __user *)&of : NULL,
- count);
- set_fs(old_fs);
-
- if (offset && put_user(of, offset))
- return -EFAULT;
- return ret;
-}
-
-asmlinkage long sys32_mmap2(unsigned long addr, unsigned long len,
- unsigned long prot, unsigned long flags,
- unsigned long fd, unsigned long pgoff)
-{
- struct mm_struct *mm = current->mm;
- unsigned long error;
- struct file *file = NULL;
-
- flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
- if (!(flags & MAP_ANONYMOUS)) {
- file = fget(fd);
- if (!file)
- return -EBADF;
- }
-
- down_write(&mm->mmap_sem);
- error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
- up_write(&mm->mmap_sem);
-
- if (file)
- fput(file);
- return error;
-}
-
-asmlinkage long sys32_olduname(struct oldold_utsname __user *name)
-{
- char *arch = "x86_64";
- int err;
-
- if (!name)
- return -EFAULT;
- if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
- return -EFAULT;
-
- down_read(&uts_sem);
-
- err = __copy_to_user(&name->sysname, &utsname()->sysname,
- __OLD_UTS_LEN);
- err |= __put_user(0, name->sysname+__OLD_UTS_LEN);
- err |= __copy_to_user(&name->nodename, &utsname()->nodename,
- __OLD_UTS_LEN);
- err |= __put_user(0, name->nodename+__OLD_UTS_LEN);
- err |= __copy_to_user(&name->release, &utsname()->release,
- __OLD_UTS_LEN);
- err |= __put_user(0, name->release+__OLD_UTS_LEN);
- err |= __copy_to_user(&name->version, &utsname()->version,
- __OLD_UTS_LEN);
- err |= __put_user(0, name->version+__OLD_UTS_LEN);
-
- if (personality(current->personality) == PER_LINUX32)
- arch = "i686";
-
- err |= __copy_to_user(&name->machine, arch, strlen(arch) + 1);
-
- up_read(&uts_sem);
-
- err = err ? -EFAULT : 0;
-
- return err;
-}
-
-long sys32_uname(struct old_utsname __user *name)
-{
- int err;
-
- if (!name)
- return -EFAULT;
- down_read(&uts_sem);
- err = copy_to_user(name, utsname(), sizeof(*name));
- up_read(&uts_sem);
- if (personality(current->personality) == PER_LINUX32)
- err |= copy_to_user(&name->machine, "i686", 5);
-
- return err ? -EFAULT : 0;
-}
-
-asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv,
- compat_uptr_t __user *envp, struct pt_regs *regs)
-{
- long error;
- char *filename;
-
- filename = getname(name);
- error = PTR_ERR(filename);
- if (IS_ERR(filename))
- return error;
- error = compat_do_execve(filename, argv, envp, regs);
- putname(filename);
- return error;
-}
-
-asmlinkage long sys32_clone(unsigned int clone_flags, unsigned int newsp,
- struct pt_regs *regs)
-{
- void __user *parent_tid = (void __user *)regs->dx;
- void __user *child_tid = (void __user *)regs->di;
-
- if (!newsp)
- newsp = regs->sp;
- return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
-}
-
-/*
- * Some system calls that need sign extended arguments. This could be
- * done by a generic wrapper.
- */
-long sys32_lseek(unsigned int fd, int offset, unsigned int whence)
-{
- return sys_lseek(fd, offset, whence);
-}
-
-long sys32_kill(int pid, int sig)
-{
- return sys_kill(pid, sig);
-}
-
-long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high,
- __u32 len_low, __u32 len_high, int advice)
-{
- return sys_fadvise64_64(fd,
- (((u64)offset_high)<<32) | offset_low,
- (((u64)len_high)<<32) | len_low,
- advice);
-}
-
-long sys32_vm86_warning(void)
-{
- struct task_struct *me = current;
- static char lastcomm[sizeof(me->comm)];
-
- if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) {
- compat_printk(KERN_INFO
- "%s: vm86 mode not supported on 64 bit kernel\n",
- me->comm);
- strncpy(lastcomm, me->comm, sizeof(lastcomm));
- }
- return -ENOSYS;
-}
-
-long sys32_lookup_dcookie(u32 addr_low, u32 addr_high,
- char __user *buf, size_t len)
-{
- return sys_lookup_dcookie(((u64)addr_high << 32) | addr_low, buf, len);
-}
-
-asmlinkage ssize_t sys32_readahead(int fd, unsigned off_lo, unsigned off_hi,
- size_t count)
-{
- return sys_readahead(fd, ((u64)off_hi << 32) | off_lo, count);
-}
-
-asmlinkage long sys32_sync_file_range(int fd, unsigned off_low, unsigned off_hi,
- unsigned n_low, unsigned n_hi, int flags)
-{
- return sys_sync_file_range(fd,
- ((u64)off_hi << 32) | off_low,
- ((u64)n_hi << 32) | n_low, flags);
-}
-
-asmlinkage long sys32_fadvise64(int fd, unsigned offset_lo, unsigned offset_hi,
- size_t len, int advice)
-{
- return sys_fadvise64_64(fd, ((u64)offset_hi << 32) | offset_lo,
- len, advice);
-}
-
-asmlinkage long sys32_fallocate(int fd, int mode, unsigned offset_lo,
- unsigned offset_hi, unsigned len_lo,
- unsigned len_hi)
-{
- return sys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo,
- ((u64)len_hi << 32) | len_lo);
-}
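
Most of the wrappers removed above exist only to reassemble a 64-bit offset or length that 32-bit userspace passes as two 32-bit halves, using ((u64)hi << 32) | lo. A standalone sketch of the pattern:

#include <stdio.h>
#include <stdint.h>

static uint64_t join64(uint32_t hi, uint32_t lo)
{
	return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
	uint32_t offset_high = 0x01234567, offset_low = 0x89abcdef;

	printf("0x%016llx\n", (unsigned long long)join64(offset_high, offset_low));
	/* prints 0x0123456789abcdef */
	return 0;
}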
diff --git a/arch/x86/include/asm/GEN-for-each-reg.h b/arch/x86/include/asm/GEN-for-each-reg.h
new file mode 100644
index 000000000000..07949102a08d
--- /dev/null
+++ b/arch/x86/include/asm/GEN-for-each-reg.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * These are in machine order; things rely on that.
+ */
+#ifdef CONFIG_64BIT
+GEN(rax)
+GEN(rcx)
+GEN(rdx)
+GEN(rbx)
+GEN(rsp)
+GEN(rbp)
+GEN(rsi)
+GEN(rdi)
+GEN(r8)
+GEN(r9)
+GEN(r10)
+GEN(r11)
+GEN(r12)
+GEN(r13)
+GEN(r14)
+GEN(r15)
+#else
+GEN(eax)
+GEN(ecx)
+GEN(edx)
+GEN(ebx)
+GEN(esp)
+GEN(ebp)
+GEN(esi)
+GEN(edi)
+#endif
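
The new header is an X-macro list: each includer defines GEN() to whatever per-register expansion it needs, includes the file, and undefines GEN() again; repeating the include with a different GEN() stamps out a different construct for the same register list. A sketch of the consuming pattern (the declared name is hypothetical, not taken from the tree):

/* Hypothetical consumer: declare one symbol per general-purpose register. */
#define GEN(reg) extern void my_per_reg_thunk_ ## reg (void);
#include <asm/GEN-for-each-reg.h>
#undef GEN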
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 4a8e80cdcfa5..4566000e15c4 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -1,24 +1,16 @@
-include include/asm-generic/Kbuild.asm
+# SPDX-License-Identifier: GPL-2.0
-header-y += boot.h
-header-y += bootparam.h
-header-y += debugreg.h
-header-y += ldt.h
-header-y += msr-index.h
-header-y += prctl.h
-header-y += ptrace-abi.h
-header-y += sigcontext32.h
-header-y += ucontext.h
-header-y += processor-flags.h
-unifdef-y += e820.h
-unifdef-y += ist.h
-unifdef-y += mce.h
-unifdef-y += msr.h
-unifdef-y += mtrr.h
-unifdef-y += posix_types_32.h
-unifdef-y += posix_types_64.h
-unifdef-y += unistd_32.h
-unifdef-y += unistd_64.h
-unifdef-y += vm86.h
-unifdef-y += vsyscall.h
+generated-y += orc_hash.h
+generated-y += syscalls_32.h
+generated-y += syscalls_64.h
+generated-y += syscalls_x32.h
+generated-y += unistd_32_ia32.h
+generated-y += unistd_64_x32.h
+generated-y += xen-hypercalls.h
+generated-y += cpufeaturemasks.h
+
+generic-y += early_ioremap.h
+generic-y += fprobe.h
+generic-y += mcs_spinlock.h
+generic-y += mmzone.h
diff --git a/arch/x86/include/asm/a.out-core.h b/arch/x86/include/asm/a.out-core.h
deleted file mode 100644
index bb70e397aa84..000000000000
--- a/arch/x86/include/asm/a.out-core.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/* a.out coredump register dumper
- *
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-#ifndef _ASM_X86_A_OUT_CORE_H
-#define _ASM_X86_A_OUT_CORE_H
-
-#ifdef __KERNEL__
-#ifdef CONFIG_X86_32
-
-#include <linux/user.h>
-#include <linux/elfcore.h>
-
-/*
- * fill in the user structure for an a.out core dump
- */
-static inline void aout_dump_thread(struct pt_regs *regs, struct user *dump)
-{
-/* changed the size calculations - should hopefully work better. lbt */
- dump->magic = CMAGIC;
- dump->start_code = 0;
- dump->start_stack = regs->sp & ~(PAGE_SIZE - 1);
- dump->u_tsize = ((unsigned long)current->mm->end_code) >> PAGE_SHIFT;
- dump->u_dsize = ((unsigned long)(current->mm->brk + (PAGE_SIZE - 1)))
- >> PAGE_SHIFT;
- dump->u_dsize -= dump->u_tsize;
- dump->u_ssize = 0;
- dump->u_debugreg[0] = current->thread.debugreg0;
- dump->u_debugreg[1] = current->thread.debugreg1;
- dump->u_debugreg[2] = current->thread.debugreg2;
- dump->u_debugreg[3] = current->thread.debugreg3;
- dump->u_debugreg[4] = 0;
- dump->u_debugreg[5] = 0;
- dump->u_debugreg[6] = current->thread.debugreg6;
- dump->u_debugreg[7] = current->thread.debugreg7;
-
- if (dump->start_stack < TASK_SIZE)
- dump->u_ssize = ((unsigned long)(TASK_SIZE - dump->start_stack))
- >> PAGE_SHIFT;
-
- dump->regs.bx = regs->bx;
- dump->regs.cx = regs->cx;
- dump->regs.dx = regs->dx;
- dump->regs.si = regs->si;
- dump->regs.di = regs->di;
- dump->regs.bp = regs->bp;
- dump->regs.ax = regs->ax;
- dump->regs.ds = (u16)regs->ds;
- dump->regs.es = (u16)regs->es;
- dump->regs.fs = (u16)regs->fs;
- dump->regs.gs = get_user_gs(regs);
- dump->regs.orig_ax = regs->orig_ax;
- dump->regs.ip = regs->ip;
- dump->regs.cs = (u16)regs->cs;
- dump->regs.flags = regs->flags;
- dump->regs.sp = regs->sp;
- dump->regs.ss = (u16)regs->ss;
-
- dump->u_fpvalid = dump_fpu(regs, &dump->i387);
-}
-
-#endif /* CONFIG_X86_32 */
-#endif /* __KERNEL__ */
-#endif /* _ASM_X86_A_OUT_CORE_H */
diff --git a/arch/x86/include/asm/acenv.h b/arch/x86/include/asm/acenv.h
new file mode 100644
index 000000000000..d937c55e717e
--- /dev/null
+++ b/arch/x86/include/asm/acenv.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * X86 specific ACPICA environments and implementation
+ *
+ * Copyright (C) 2014, Intel Corporation
+ * Author: Lv Zheng <lv.zheng@intel.com>
+ */
+
+#ifndef _ASM_X86_ACENV_H
+#define _ASM_X86_ACENV_H
+
+#include <asm/special_insns.h>
+
+/* Asm macros */
+
+/*
+ * ACPI_FLUSH_CPU_CACHE() flushes caches on entering sleep states.
+ * It is required to prevent data loss.
+ *
+ * While running inside a virtual machine, the kernel can bypass cache flushing.
+ * Changing sleep state in a virtual machine doesn't affect the host system
+ * sleep state and cannot lead to data loss.
+ */
+#define ACPI_FLUSH_CPU_CACHE() \
+do { \
+ if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) \
+ wbinvd(); \
+} while (0)
+
+int __acpi_acquire_global_lock(unsigned int *lock);
+int __acpi_release_global_lock(unsigned int *lock);
+
+#define ACPI_ACQUIRE_GLOBAL_LOCK(facs, Acq) \
+ ((Acq) = __acpi_acquire_global_lock(&facs->global_lock))
+
+#define ACPI_RELEASE_GLOBAL_LOCK(facs, Acq) \
+ ((Acq) = __acpi_release_global_lock(&facs->global_lock))
+
+/*
+ * Math helper asm macros
+ */
+#define ACPI_DIV_64_BY_32(n_hi, n_lo, d32, q32, r32) \
+ asm("divl %2;" \
+ : "=a"(q32), "=d"(r32) \
+ : "r"(d32), \
+ "0"(n_lo), "1"(n_hi))
+
+#define ACPI_SHIFT_RIGHT_64(n_hi, n_lo) \
+ asm("shrl $1,%2 ;" \
+ "rcrl $1,%3;" \
+ : "=r"(n_hi), "=r"(n_lo) \
+ : "0"(n_hi), "1"(n_lo))
+
+#endif /* _ASM_X86_ACENV_H */
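
ACPI_DIV_64_BY_32() above wraps a single divl step with the dividend in EDX:EAX; a full 64-by-32 division chains two such steps, feeding the remainder of the high-word step into the low-word step. A standalone C sketch of that two-step long division in base 2^32 (illustrative only):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t n = 0x0000000500000007ULL;	/* n_hi = 5, n_lo = 7 */
	uint32_t n_hi = (uint32_t)(n >> 32), n_lo = (uint32_t)n, d = 3;

	uint32_t q_hi = n_hi / d;		/* first step: 0:n_hi / d */
	uint32_t r    = n_hi % d;
	uint64_t t    = ((uint64_t)r << 32) | n_lo;
	uint32_t q_lo = (uint32_t)(t / d);	/* second step: r:n_lo / d, quotient fits in 32 bits */
	uint32_t rem  = (uint32_t)(t % d);

	uint64_t q = ((uint64_t)q_hi << 32) | q_lo;
	printf("%d %d\n", q == n / d, rem == (uint32_t)(n % d));	/* prints: 1 1 */
	return 0;
}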
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 20d1465a2ab0..a03aa6f999d1 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -1,83 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _ASM_X86_ACPI_H
#define _ASM_X86_ACPI_H
/*
* Copyright (C) 2001 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
* Copyright (C) 2001 Patrick Mochel <mochel@osdl.org>
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
-#include <acpi/pdc_intel.h>
+#include <acpi/proc_cap_intel.h>
#include <asm/numa.h>
+#include <asm/fixmap.h>
#include <asm/processor.h>
#include <asm/mmu.h>
#include <asm/mpspec.h>
+#include <asm/x86_init.h>
+#include <asm/cpufeature.h>
+#include <asm/irq_vectors.h>
+#include <asm/xen/hypervisor.h>
-#define COMPILER_DEPENDENT_INT64 long long
-#define COMPILER_DEPENDENT_UINT64 unsigned long long
+#include <xen/xen.h>
-/*
- * Calling conventions:
- *
- * ACPI_SYSTEM_XFACE - Interfaces to host OS (handlers, threads)
- * ACPI_EXTERNAL_XFACE - External ACPI interfaces
- * ACPI_INTERNAL_XFACE - Internal ACPI interfaces
- * ACPI_INTERNAL_VAR_XFACE - Internal variable-parameter list interfaces
- */
-#define ACPI_SYSTEM_XFACE
-#define ACPI_EXTERNAL_XFACE
-#define ACPI_INTERNAL_XFACE
-#define ACPI_INTERNAL_VAR_XFACE
-
-/* Asm macros */
-
-#define ACPI_ASM_MACROS
-#define BREAKPOINT3
-#define ACPI_DISABLE_IRQS() local_irq_disable()
-#define ACPI_ENABLE_IRQS() local_irq_enable()
-#define ACPI_FLUSH_CPU_CACHE() wbinvd()
-
-int __acpi_acquire_global_lock(unsigned int *lock);
-int __acpi_release_global_lock(unsigned int *lock);
-
-#define ACPI_ACQUIRE_GLOBAL_LOCK(facs, Acq) \
- ((Acq) = __acpi_acquire_global_lock(&facs->global_lock))
-
-#define ACPI_RELEASE_GLOBAL_LOCK(facs, Acq) \
- ((Acq) = __acpi_release_global_lock(&facs->global_lock))
-
-/*
- * Math helper asm macros
- */
-#define ACPI_DIV_64_BY_32(n_hi, n_lo, d32, q32, r32) \
- asm("divl %2;" \
- : "=a"(q32), "=d"(r32) \
- : "r"(d32), \
- "0"(n_lo), "1"(n_hi))
-
-
-#define ACPI_SHIFT_RIGHT_64(n_hi, n_lo) \
- asm("shrl $1,%2 ;" \
- "rcrl $1,%3;" \
- : "=r"(n_hi), "=r"(n_lo) \
- : "0"(n_hi), "1"(n_lo))
+#ifdef CONFIG_ACPI_APEI
+# include <asm/pgtable_types.h>
+#endif
#ifdef CONFIG_ACPI
extern int acpi_lapic;
@@ -85,25 +30,34 @@ extern int acpi_ioapic;
extern int acpi_noirq;
extern int acpi_strict;
extern int acpi_disabled;
-extern int acpi_ht;
extern int acpi_pci_disabled;
extern int acpi_skip_timer_override;
extern int acpi_use_timer_override;
+extern int acpi_fix_pin2_polarity;
+extern int acpi_disable_cmcff;
+extern bool acpi_int_src_ovr[NR_IRQS_LEGACY];
extern u8 acpi_sci_flags;
-extern int acpi_sci_override_gsi;
+extern u32 acpi_sci_override_gsi;
void acpi_pic_sci_set_trigger(unsigned int, u16);
+struct device;
+
+extern int (*__acpi_register_gsi)(struct device *dev, u32 gsi,
+ int trigger, int polarity);
+extern void (*__acpi_unregister_gsi)(u32 gsi);
+
static inline void disable_acpi(void)
{
acpi_disabled = 1;
- acpi_ht = 0;
acpi_pci_disabled = 1;
acpi_noirq = 1;
}
extern int acpi_gsi_to_irq(u32 gsi, unsigned int *irq);
+extern int acpi_blacklisted(void);
+
static inline void acpi_noirq_set(void) { acpi_noirq = 1; }
static inline void acpi_disable_pci(void)
{
@@ -111,14 +65,25 @@ static inline void acpi_disable_pci(void)
acpi_noirq_set();
}
-/* routines for saving/restoring kernel state */
-extern int acpi_save_state_mem(void);
-extern void acpi_restore_state_mem(void);
+/* Low-level suspend routine. */
+extern int (*acpi_suspend_lowlevel)(void);
-extern unsigned long acpi_wakeup_address;
+/* Physical address to resume after wakeup */
+unsigned long acpi_get_wakeup_address(void);
-/* early initialization routine */
-extern void acpi_reserve_bootmem(void);
+static inline bool acpi_skip_set_wakeup_address(void)
+{
+ return cpu_feature_enabled(X86_FEATURE_XENPV);
+}
+
+#define acpi_skip_set_wakeup_address acpi_skip_set_wakeup_address
+
+union acpi_subtable_headers;
+
+int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,
+ const unsigned long end);
+
+void asm_acpi_mp_play_dead(u64 reset_vector, u64 pgd_pa);
/*
* Check if the CPU can handle C2 and deeper
@@ -134,42 +99,146 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate)
if (boot_cpu_data.x86 == 0x0F &&
boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
boot_cpu_data.x86_model <= 0x05 &&
- boot_cpu_data.x86_mask < 0x0A)
+ boot_cpu_data.x86_stepping < 0x0A)
return 1;
- else if (boot_cpu_has(X86_FEATURE_AMDC1E))
+ else if (boot_cpu_has(X86_BUG_AMD_APIC_C1E))
return 1;
else
return max_cstate;
}
+static inline bool arch_has_acpi_pdc(void)
+{
+ struct cpuinfo_x86 *c = &cpu_data(0);
+ return (c->x86_vendor == X86_VENDOR_INTEL ||
+ c->x86_vendor == X86_VENDOR_CENTAUR);
+}
+
+static inline void arch_acpi_set_proc_cap_bits(u32 *cap)
+{
+ struct cpuinfo_x86 *c = &cpu_data(0);
+
+ *cap |= ACPI_PROC_CAP_C_CAPABILITY_SMP;
+
+ /* Enable coordination with firmware's _TSD info */
+ *cap |= ACPI_PROC_CAP_SMP_T_SWCOORD;
+
+ if (cpu_has(c, X86_FEATURE_EST))
+ *cap |= ACPI_PROC_CAP_EST_CAPABILITY_SWSMP;
+
+ if (cpu_has(c, X86_FEATURE_ACPI))
+ *cap |= ACPI_PROC_CAP_T_FFH;
+
+ if (cpu_has(c, X86_FEATURE_HWP))
+ *cap |= ACPI_PROC_CAP_COLLAB_PROC_PERF;
+
+ /*
+ * If mwait/monitor is unsupported, C_C1_FFH and
+ * C2/C3_FFH will be disabled.
+ */
+ if (!cpu_has(c, X86_FEATURE_MWAIT) ||
+ boot_option_idle_override == IDLE_NOMWAIT)
+ *cap &= ~(ACPI_PROC_CAP_C_C1_FFH | ACPI_PROC_CAP_C_C2C3_FFH);
+
+ if (xen_initial_domain()) {
+ /*
+ * When Linux is running as Xen dom0, the hypervisor is the
+ * entity in charge of the processor power management, and so
+		 * Xen needs to check that the OS capabilities reported in the
+		 * processor capabilities buffer match what the hypervisor
+ * driver supports.
+ */
+ xen_sanitize_proc_cap_bits(cap);
+ }
+}
+
+static inline bool acpi_has_cpu_in_madt(void)
+{
+ return !!acpi_lapic;
+}
+
+#define ACPI_HAVE_ARCH_SET_ROOT_POINTER
+static __always_inline void acpi_arch_set_root_pointer(u64 addr)
+{
+ x86_init.acpi.set_root_pointer(addr);
+}
+
+#define ACPI_HAVE_ARCH_GET_ROOT_POINTER
+static __always_inline u64 acpi_arch_get_root_pointer(void)
+{
+ return x86_init.acpi.get_root_pointer();
+}
+
+void acpi_generic_reduced_hw_init(void);
+
+void x86_default_set_root_pointer(u64 addr);
+u64 x86_default_get_root_pointer(void);
+
+#ifdef CONFIG_XEN_PV
+/* A Xen PV domain needs special acpi_os_ioremap() handling. */
+extern void __iomem * (*acpi_os_ioremap)(acpi_physical_address phys,
+ acpi_size size);
+void __iomem *x86_acpi_os_ioremap(acpi_physical_address phys, acpi_size size);
+#define acpi_os_ioremap acpi_os_ioremap
+#endif
+
#else /* !CONFIG_ACPI */
-#define acpi_disabled 1
#define acpi_lapic 0
#define acpi_ioapic 0
+#define acpi_disable_cmcff 0
static inline void acpi_noirq_set(void) { }
static inline void acpi_disable_pci(void) { }
static inline void disable_acpi(void) { }
+static inline void acpi_generic_reduced_hw_init(void) { }
+
+static inline void x86_default_set_root_pointer(u64 addr) { }
+
+static inline u64 x86_default_get_root_pointer(void)
+{
+ return 0;
+}
+
#endif /* !CONFIG_ACPI */
#define ARCH_HAS_POWER_INIT 1
-struct bootnode;
-
#ifdef CONFIG_ACPI_NUMA
-extern int acpi_numa;
-extern int acpi_scan_nodes(unsigned long start, unsigned long end);
-#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
-extern void acpi_fake_nodes(const struct bootnode *fake_nodes,
- int num_nodes);
+extern int x86_acpi_numa_init(void);
+#endif /* CONFIG_ACPI_NUMA */
+
+struct cper_ia_proc_ctx;
+
+#ifdef CONFIG_ACPI_APEI
+static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr)
+{
+ /*
+ * We currently have no way to look up the EFI memory map
+ * attributes for a region in a consistent way, because the
+ * memmap is discarded after efi_free_boot_services(). So if
+ * you call efi_mem_attributes() during boot and at runtime,
+ * you could theoretically see different attributes.
+ *
+ * We are yet to see any x86 platforms that require anything
+ * other than PAGE_KERNEL (some ARM64 platforms require the
+ * equivalent of PAGE_KERNEL_NOCACHE). Additionally, if SME
+ * is active, the ACPI information will not be encrypted,
+ * so return PAGE_KERNEL_NOENC until we know differently.
+ */
+ return PAGE_KERNEL_NOENC;
+}
+
+int arch_apei_report_x86_error(struct cper_ia_proc_ctx *ctx_info,
+ u64 lapic_id);
#else
-static inline void acpi_fake_nodes(const struct bootnode *fake_nodes,
- int num_nodes)
+static inline int arch_apei_report_x86_error(struct cper_ia_proc_ctx *ctx_info,
+ u64 lapic_id)
{
+ return -EINVAL;
}
#endif
-#define acpi_unlazy_tlb(x) leave_mm(x)
+#define ACPI_TABLE_UPGRADE_MAX_PHYS (max_low_pfn_mapped << PAGE_SHIFT)
#endif /* _ASM_X86_ACPI_H */
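
acpi_arch_get_root_pointer() above indirects the RSDP lookup through x86_init.acpi.get_root_pointer. A hypothetical sketch (all myplat_* names are invented) of a platform overriding that hook while falling back to the default:

#include <linux/init.h>
#include <asm/acpi.h>
#include <asm/x86_init.h>

static u64 myplat_rsdp_pa;	/* hypothetical: physical RSDP address found earlier */

static u64 myplat_get_root_pointer(void)
{
	return myplat_rsdp_pa ? myplat_rsdp_pa : x86_default_get_root_pointer();
}

static void __init myplat_acpi_init(void)	/* hypothetical platform init hook */
{
	x86_init.acpi.get_root_pointer = myplat_get_root_pointer;
}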
diff --git a/arch/x86/include/asm/acrn.h b/arch/x86/include/asm/acrn.h
new file mode 100644
index 000000000000..fab11192c60a
--- /dev/null
+++ b/arch/x86/include/asm/acrn.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_ACRN_H
+#define _ASM_X86_ACRN_H
+
+/*
+ * This CPUID returns feature bitmaps in EAX.
+ * Guest VM uses this to detect the appropriate feature bit.
+ */
+#define ACRN_CPUID_FEATURES 0x40000001
+/* Bit 0 indicates whether guest VM is privileged */
+#define ACRN_FEATURE_PRIVILEGED_VM BIT(0)
+
+/*
+ * Timing Information.
+ * This leaf returns the current TSC frequency in kHz.
+ *
+ * EAX: (Virtual) TSC frequency in kHz.
+ * EBX, ECX, EDX: RESERVED (reserved fields are set to zero).
+ */
+#define ACRN_CPUID_TIMING_INFO 0x40000010
+
+void acrn_setup_intr_handler(void (*handler)(void));
+void acrn_remove_intr_handler(void);
+
+static inline u32 acrn_cpuid_base(void)
+{
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return cpuid_base_hypervisor("ACRNACRNACRN", 0);
+
+ return 0;
+}
+
+static inline unsigned long acrn_get_tsc_khz(void)
+{
+ return cpuid_eax(ACRN_CPUID_TIMING_INFO);
+}
+
+/*
+ * Hypercalls for ACRN
+ *
+ * - VMCALL instruction is used to implement ACRN hypercalls.
+ * - ACRN hypercall ABI:
+ * - Hypercall number is passed in R8 register.
+ * - Up to 2 arguments are passed in RDI, RSI.
+ * - Return value will be placed in RAX.
+ *
+ * Because GCC doesn't support the R8 register as a direct register constraint,
+ * pass the hypercall ID via a supported constraint and MOV it into R8 explicitly
+ * at the beginning of the asm.
+ */
+static inline long acrn_hypercall0(unsigned long hcall_id)
+{
+ long result;
+
+ asm volatile("movl %1, %%r8d\n\t"
+ "vmcall\n\t"
+ : "=a" (result)
+ : "g" (hcall_id)
+ : "r8", "memory");
+
+ return result;
+}
+
+static inline long acrn_hypercall1(unsigned long hcall_id,
+ unsigned long param1)
+{
+ long result;
+
+ asm volatile("movl %1, %%r8d\n\t"
+ "vmcall\n\t"
+ : "=a" (result)
+ : "g" (hcall_id), "D" (param1)
+ : "r8", "memory");
+
+ return result;
+}
+
+static inline long acrn_hypercall2(unsigned long hcall_id,
+ unsigned long param1,
+ unsigned long param2)
+{
+ long result;
+
+ asm volatile("movl %1, %%r8d\n\t"
+ "vmcall\n\t"
+ : "=a" (result)
+ : "g" (hcall_id), "D" (param1), "S" (param2)
+ : "r8", "memory");
+
+ return result;
+}
+
+#endif /* _ASM_X86_ACRN_H */
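
A hedged usage sketch of the hypercall wrappers above; the hypercall ID and its meaning are invented for illustration, real IDs come from the ACRN ABI headers:

#include <asm/acrn.h>

#define DEMO_HC_SET_IRQ_VECTOR	0x80000010UL	/* hypothetical hypercall ID */

static int demo_set_irq_vector(unsigned long vector)
{
	long ret = acrn_hypercall1(DEMO_HC_SET_IRQ_VECTOR, vector);

	return ret < 0 ? (int)ret : 0;	/* result comes back in RAX */
}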
diff --git a/arch/x86/include/asm/aes.h b/arch/x86/include/asm/aes.h
deleted file mode 100644
index 80545a1cbe39..000000000000
--- a/arch/x86/include/asm/aes.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef ASM_X86_AES_H
-#define ASM_X86_AES_H
-
-#include <linux/crypto.h>
-#include <crypto/aes.h>
-
-void crypto_aes_encrypt_x86(struct crypto_aes_ctx *ctx, u8 *dst,
- const u8 *src);
-void crypto_aes_decrypt_x86(struct crypto_aes_ctx *ctx, u8 *dst,
- const u8 *src);
-#endif
diff --git a/arch/x86/include/asm/agp.h b/arch/x86/include/asm/agp.h
index 9825cd64c9b6..c8c111d8fbd7 100644
--- a/arch/x86/include/asm/agp.h
+++ b/arch/x86/include/asm/agp.h
@@ -1,14 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_AGP_H
#define _ASM_X86_AGP_H
-#include <asm/pgtable.h>
+#include <linux/pgtable.h>
#include <asm/cacheflush.h>
/*
* Functions to keep the agpgart mappings coherent with the MMU. The
* GART gives the CPU a physical alias of pages in memory. The alias
* region is mapped uncacheable. Make sure there are no conflicting
- * mappings with different cachability attributes for the same
+ * mappings with different cacheability attributes for the same
* page. This avoids data corruption on some CPUs.
*/
@@ -22,14 +23,4 @@
*/
#define flush_agp_cache() wbinvd()
-/* Convert a physical address to an address suitable for the GART. */
-#define phys_to_gart(x) (x)
-#define gart_to_phys(x) (x)
-
-/* GATT allocation. Returns/accepts GATT kernel virtual address. */
-#define alloc_gatt_pages(order) \
- ((char *)__get_free_pages(GFP_KERNEL, (order)))
-#define free_gatt_pages(table, order) \
- free_pages((unsigned long)(table), (order))
-
#endif /* _ASM_X86_AGP_H */
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
deleted file mode 100644
index e2077d343c33..000000000000
--- a/arch/x86/include/asm/alternative-asm.h
+++ /dev/null
@@ -1,22 +0,0 @@
-#ifdef __ASSEMBLY__
-
-#ifdef CONFIG_X86_32
-# define X86_ALIGN .long
-#else
-# define X86_ALIGN .quad
-#endif
-
-#ifdef CONFIG_SMP
- .macro LOCK_PREFIX
-1: lock
- .section .smp_locks,"a"
- .align 4
- X86_ALIGN 1b
- .previous
- .endm
-#else
- .macro LOCK_PREFIX
- .endm
-#endif
-
-#endif /* __ASSEMBLY__ */
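
The assembler-side LOCK_PREFIX macro removed here mirrors the C-side one added to alternative.h below: on SMP it emits a lock prefix and records its location in .smp_locks so the prefix can be patched out on uniprocessor kernels. Its typical consumers are the atomics; a sketch close to (but not a quote of) the real pattern:

#include <linux/types.h>
#include <asm/alternative.h>

/* Sketch of an SMP-safe increment-by-i; the real helpers live in asm/atomic.h. */
static __always_inline void demo_atomic_add(int i, atomic_t *v)
{
	asm volatile(LOCK_PREFIX "addl %1,%0"
		     : "+m" (v->counter)
		     : "ir" (i));
}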
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 1a37bcdc8606..03364510d5fe 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -1,10 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_ALTERNATIVE_H
#define _ASM_X86_ALTERNATIVE_H
#include <linux/types.h>
-#include <linux/stddef.h>
#include <linux/stringify.h>
+#include <linux/objtool.h>
#include <asm/asm.h>
+#include <asm/bug.h>
+
+#define ALT_FLAGS_SHIFT 16
+
+#define ALT_FLAG_NOT (1 << 0)
+#define ALT_NOT(feature) ((ALT_FLAG_NOT << ALT_FLAGS_SHIFT) | (feature))
+#define ALT_FLAG_DIRECT_CALL (1 << 1)
+#define ALT_DIRECT_CALL(feature) ((ALT_FLAG_DIRECT_CALL << ALT_FLAGS_SHIFT) | (feature))
+#define ALT_CALL_ALWAYS ALT_DIRECT_CALL(X86_FEATURE_ALWAYS)
+
+#ifndef __ASSEMBLER__
+
+#include <linux/stddef.h>
/*
* Alternative inline assembly for SMP.
@@ -28,68 +42,194 @@
*/
#ifdef CONFIG_SMP
-#define LOCK_PREFIX \
- ".section .smp_locks,\"a\"\n" \
- _ASM_ALIGN "\n" \
- _ASM_PTR "661f\n" /* address */ \
- ".previous\n" \
- "661:\n\tlock; "
+#define LOCK_PREFIX_HERE \
+ ".pushsection .smp_locks,\"a\"\n" \
+ ".balign 4\n" \
+ ".long 671f - .\n" /* offset */ \
+ ".popsection\n" \
+ "671:"
+
+#define LOCK_PREFIX LOCK_PREFIX_HERE "\n\tlock "
#else /* ! CONFIG_SMP */
+#define LOCK_PREFIX_HERE ""
#define LOCK_PREFIX ""
#endif
-/* This must be included *after* the definition of LOCK_PREFIX */
-#include <asm/cpufeature.h>
-
+/*
+ * The patching flags are part of the upper bits of the @ft_flags parameter when
+ * specifying them. The split is currently like this:
+ *
+ * [31... flags ...16][15... CPUID feature bit ...0]
+ *
+ * but since this is all hidden in how the macros split the argument, those fields can be
+ * extended in the future to fit in a u64 or however the need arises.
+ */
struct alt_instr {
- u8 *instr; /* original instruction */
- u8 *replacement;
- u8 cpuid; /* cpuid bit set for replacement */
+ s32 instr_offset; /* original instruction */
+ s32 repl_offset; /* offset to replacement instruction */
+
+ union {
+ struct {
+ u32 cpuid: 16; /* CPUID bit set for replacement */
+ u32 flags: 16; /* patching control flags */
+ };
+ u32 ft_flags;
+ };
+
u8 instrlen; /* length of original instruction */
- u8 replacementlen; /* length of new instruction, <= instrlen */
- u8 pad1;
-#ifdef CONFIG_X86_64
- u32 pad2;
-#endif
-};
+ u8 replacementlen; /* length of new instruction */
+} __packed;
+
+extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
+
+extern s32 __retpoline_sites[], __retpoline_sites_end[];
+extern s32 __return_sites[], __return_sites_end[];
+extern s32 __cfi_sites[], __cfi_sites_end[];
+extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[];
+extern s32 __smp_locks[], __smp_locks_end[];
+
+/*
+ * Debug flag that can be tested to see whether alternative
+ * instructions were patched in already:
+ */
+extern int alternatives_patched;
extern void alternative_instructions(void);
extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
+extern void apply_retpolines(s32 *start, s32 *end);
+extern void apply_returns(s32 *start, s32 *end);
+extern void apply_seal_endbr(s32 *start, s32 *end);
+extern void apply_fineibt(s32 *start_retpoline, s32 *end_retpoine,
+ s32 *start_cfi, s32 *end_cfi);
struct module;
+struct callthunk_sites {
+ s32 *call_start, *call_end;
+};
+
+#ifdef CONFIG_CALL_THUNKS
+extern void callthunks_patch_builtin_calls(void);
+extern void callthunks_patch_module_calls(struct callthunk_sites *sites,
+ struct module *mod);
+extern void *callthunks_translate_call_dest(void *dest);
+extern int x86_call_depth_emit_accounting(u8 **pprog, void *func, void *ip);
+#else
+static __always_inline void callthunks_patch_builtin_calls(void) {}
+static __always_inline void
+callthunks_patch_module_calls(struct callthunk_sites *sites,
+ struct module *mod) {}
+static __always_inline void *callthunks_translate_call_dest(void *dest)
+{
+ return dest;
+}
+static __always_inline int x86_call_depth_emit_accounting(u8 **pprog,
+ void *func, void *ip)
+{
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_MITIGATION_ITS
+extern void its_init_mod(struct module *mod);
+extern void its_fini_mod(struct module *mod);
+extern void its_free_mod(struct module *mod);
+extern u8 *its_static_thunk(int reg);
+#else /* CONFIG_MITIGATION_ITS */
+static inline void its_init_mod(struct module *mod) { }
+static inline void its_fini_mod(struct module *mod) { }
+static inline void its_free_mod(struct module *mod) { }
+static inline u8 *its_static_thunk(int reg)
+{
+ WARN_ONCE(1, "ITS not compiled in");
+
+ return NULL;
+}
+#endif
+
+#if defined(CONFIG_MITIGATION_RETHUNK) && defined(CONFIG_OBJTOOL)
+extern bool cpu_wants_rethunk(void);
+extern bool cpu_wants_rethunk_at(void *addr);
+#else
+static __always_inline bool cpu_wants_rethunk(void)
+{
+ return false;
+}
+static __always_inline bool cpu_wants_rethunk_at(void *addr)
+{
+ return false;
+}
+#endif
+
#ifdef CONFIG_SMP
extern void alternatives_smp_module_add(struct module *mod, char *name,
void *locks, void *locks_end,
void *text, void *text_end);
extern void alternatives_smp_module_del(struct module *mod);
-extern void alternatives_smp_switch(int smp);
+extern void alternatives_enable_smp(void);
+extern int alternatives_text_reserved(void *start, void *end);
+extern bool skip_smp_alternatives;
#else
static inline void alternatives_smp_module_add(struct module *mod, char *name,
void *locks, void *locks_end,
void *text, void *text_end) {}
static inline void alternatives_smp_module_del(struct module *mod) {}
-static inline void alternatives_smp_switch(int smp) {}
+static inline void alternatives_enable_smp(void) {}
+static inline int alternatives_text_reserved(void *start, void *end)
+{
+ return 0;
+}
#endif /* CONFIG_SMP */
-const unsigned char *const *find_nop_table(void);
+#define ALT_CALL_INSTR "call BUG_func"
+
+#define alt_slen "772b-771b"
+#define alt_total_slen "773b-771b"
+#define alt_rlen "775f-774f"
+
+#define OLDINSTR(oldinstr) \
+ "# ALT: oldinstr\n" \
+ "771:\n\t" oldinstr "\n772:\n" \
+ "# ALT: padding\n" \
+ ".skip -(((" alt_rlen ")-(" alt_slen ")) > 0) * " \
+ "((" alt_rlen ")-(" alt_slen ")),0x90\n" \
+ "773:\n"
+
+#define ALTINSTR_ENTRY(ft_flags) \
+ ".pushsection .altinstructions, \"aM\", @progbits, " \
+ __stringify(ALT_INSTR_SIZE) "\n" \
+ " .long 771b - .\n" /* label */ \
+ " .long 774f - .\n" /* new instruction */ \
+ " .4byte " __stringify(ft_flags) "\n" /* feature + flags */ \
+ " .byte " alt_total_slen "\n" /* source len */ \
+ " .byte " alt_rlen "\n" /* replacement len */ \
+ ".popsection\n"
+
+#define ALTINSTR_REPLACEMENT(newinstr) /* replacement */ \
+ ".pushsection .altinstr_replacement, \"ax\"\n" \
+ ANNOTATE_DATA_SPECIAL "\n" \
+ "# ALT: replacement\n" \
+ "774:\n\t" newinstr "\n775:\n" \
+ ".popsection\n"
/* alternative assembly primitive: */
-#define ALTERNATIVE(oldinstr, newinstr, feature) \
- \
- "661:\n\t" oldinstr "\n662:\n" \
- ".section .altinstructions,\"a\"\n" \
- _ASM_ALIGN "\n" \
- _ASM_PTR "661b\n" /* label */ \
- _ASM_PTR "663f\n" /* new instruction */ \
- " .byte " __stringify(feature) "\n" /* feature bit */ \
- " .byte 662b-661b\n" /* sourcelen */ \
- " .byte 664f-663f\n" /* replacementlen */ \
- ".previous\n" \
- ".section .altinstr_replacement, \"ax\"\n" \
- "663:\n\t" newinstr "\n664:\n" /* replacement */ \
- ".previous"
+#define ALTERNATIVE(oldinstr, newinstr, ft_flags) \
+ OLDINSTR(oldinstr) \
+ ALTINSTR_ENTRY(ft_flags) \
+ ALTINSTR_REPLACEMENT(newinstr)
+
+#define ALTERNATIVE_2(oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2) \
+ ALTERNATIVE(ALTERNATIVE(oldinstr, newinstr1, ft_flags1), newinstr2, ft_flags2)
+
+/* If @feature is set, patch in @newinstr_yes, otherwise @newinstr_no. */
+#define ALTERNATIVE_TERNARY(oldinstr, ft_flags, newinstr_yes, newinstr_no) \
+ ALTERNATIVE_2(oldinstr, newinstr_no, X86_FEATURE_ALWAYS, newinstr_yes, ft_flags)
+
+#define ALTERNATIVE_3(oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2, \
+ newinstr3, ft_flags3) \
+ ALTERNATIVE(ALTERNATIVE_2(oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2), \
+ newinstr3, ft_flags3)
/*
* Alternative instructions for different CPU types or capabilities.
@@ -103,68 +243,159 @@ const unsigned char *const *find_nop_table(void);
* For non barrier like inlines please define new variants
* without volatile and memory clobber.
*/
-#define alternative(oldinstr, newinstr, feature) \
- asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory")
+#define alternative(oldinstr, newinstr, ft_flags) \
+ asm_inline volatile(ALTERNATIVE(oldinstr, newinstr, ft_flags) : : : "memory")
+
+#define alternative_2(oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2) \
+ asm_inline volatile(ALTERNATIVE_2(oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2) ::: "memory")
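For orientation only (not part of this patch), a minimal usage sketch of the C-level alternative() wrapper: the default instruction is emitted inline and the patcher swaps in the replacement at boot when the named feature flag is set. The choice of instructions and feature flag below is purely illustrative.

	/* Illustrative sketch: patch MFENCE down to LFENCE when the CPU allows it. */
	static __always_inline void barrier_example(void)
	{
		alternative("mfence", "lfence", X86_FEATURE_LFENCE_RDTSC);
	}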
/*
* Alternative inline assembly with input.
*
- * Pecularities:
+ * Peculiarities:
* No memory clobber here.
* Argument numbers start with 1.
- * Best is to use constraints that are fixed size (like (%1) ... "r")
- * If you use variable sized constraints like "m" or "g" in the
- * replacement make sure to pad to the worst case length.
* Leaving an unused argument 0 to keep API compatibility.
*/
-#define alternative_input(oldinstr, newinstr, feature, input...) \
- asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
+#define alternative_input(oldinstr, newinstr, ft_flags, input...) \
+ asm_inline volatile(ALTERNATIVE(oldinstr, newinstr, ft_flags) \
: : "i" (0), ## input)
/* Like alternative_input, but with a single output argument */
-#define alternative_io(oldinstr, newinstr, feature, output, input...) \
- asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
+#define alternative_io(oldinstr, newinstr, ft_flags, output, input...) \
+ asm_inline volatile(ALTERNATIVE(oldinstr, newinstr, ft_flags) \
: output : "i" (0), ## input)
/*
- * use this macro(s) if you need more than one output parameter
- * in alternative_io
+ * Like alternative_io, but for replacing a direct call with another one.
+ *
+ * Use the %c operand modifier which is the generic way to print a bare
+ * constant expression with all syntax-specific punctuation omitted. %P
+ * is the x86-specific variant which can handle constants too, for
+ * historical reasons, but it should be used primarily for PIC
+ * references: i.e., if used for a function, it would add the PLT
+ * suffix.
*/
-#define ASM_OUTPUT2(a, b) a, b
+#define alternative_call(oldfunc, newfunc, ft_flags, output, input, clobbers...) \
+ asm_inline volatile(ALTERNATIVE("call %c[old]", "call %c[new]", ft_flags) \
+ : ALT_OUTPUT_SP(output) \
+ : [old] "i" (oldfunc), [new] "i" (newfunc) \
+ COMMA(input) \
+ : clobbers)
+
+/*
+ * Like alternative_call, but there are two features and respective functions.
+ * If CPU has feature2, function2 is used.
+ * Otherwise, if CPU has feature1, function1 is used.
+ * Otherwise, old function is used.
+ */
+#define alternative_call_2(oldfunc, newfunc1, ft_flags1, newfunc2, ft_flags2, \
+ output, input, clobbers...) \
+ asm_inline volatile(ALTERNATIVE_2("call %c[old]", "call %c[new1]", ft_flags1, \
+ "call %c[new2]", ft_flags2) \
+ : ALT_OUTPUT_SP(output) \
+ : [old] "i" (oldfunc), [new1] "i" (newfunc1), \
+ [new2] "i" (newfunc2) \
+ COMMA(input) \
+ : clobbers)
+
+#define ALT_OUTPUT_SP(...) ASM_CALL_CONSTRAINT, ## __VA_ARGS__
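As a usage sketch (not part of this patch), alternative_call() patches one direct call into another at boot. The two functions below are hypothetical, and the feature flag is only an illustrative choice; register constraints follow the usual call-wrapper pattern.

	extern void generic_flush(unsigned long addr);	/* hypothetical */
	extern void fast_flush(unsigned long addr);	/* hypothetical */

	/* Call generic_flush() by default; patch to fast_flush() when the flag is set. */
	static inline void flush_example(unsigned long addr)
	{
		alternative_call(generic_flush, fast_flush, X86_FEATURE_CLFLUSHOPT,
				 "=D" (addr), "D" (addr), "memory", "cc");
	}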
-struct paravirt_patch_site;
-#ifdef CONFIG_PARAVIRT
-void apply_paravirt(struct paravirt_patch_site *start,
- struct paravirt_patch_site *end);
+/* Macro for creating assembler functions avoiding any C magic. */
+#define DEFINE_ASM_FUNC(func, instr, sec) \
+ asm (".pushsection " #sec ", \"ax\"\n" \
+ ".global " #func "\n\t" \
+ ".type " #func ", @function\n\t" \
+ ASM_FUNC_ALIGN "\n" \
+ #func ":\n\t" \
+ ASM_ENDBR \
+ instr "\n\t" \
+ ASM_RET \
+ ".size " #func ", . - " #func "\n\t" \
+ ".popsection")
+
+void BUG_func(void);
+void nop_func(void);
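A usage sketch of DEFINE_ASM_FUNC (not part of this patch): it emits a tiny, aligned, ENDBR-annotated assembler function from a single instruction string. The function name below is made up for illustration; the section is one plausible choice.

	/* Hypothetical stub: a callable function that executes STI and returns. */
	DEFINE_ASM_FUNC(irq_enable_stub, "sti", .noinstr.text);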
+
+#else /* __ASSEMBLER__ */
+
+#ifdef CONFIG_SMP
+ .macro LOCK_PREFIX
+672: lock
+ .pushsection .smp_locks,"a"
+ .balign 4
+ .long 672b - .
+ .popsection
+ .endm
#else
-static inline void apply_paravirt(struct paravirt_patch_site *start,
- struct paravirt_patch_site *end)
-{}
-#define __parainstructions NULL
-#define __parainstructions_end NULL
+ .macro LOCK_PREFIX
+ .endm
#endif
-extern void add_nops(void *insns, unsigned int len);
+/*
+ * Issue one struct alt_instr descriptor entry (need to put it into
+ * the section .altinstructions, see below). This entry contains
+ * enough information for the alternatives patching code to patch an
+ * instruction. See apply_alternatives().
+ */
+.macro altinstr_entry orig alt ft_flags orig_len alt_len
+ .long \orig - .
+ .long \alt - .
+ .4byte \ft_flags
+ .byte \orig_len
+ .byte \alt_len
+.endm
+
+.macro ALT_CALL_INSTR
+ call BUG_func
+.endm
/*
- * Clear and restore the kernel write-protection flag on the local CPU.
- * Allows the kernel to edit read-only pages.
- * Side-effect: any interrupt handler running between save and restore will have
- * the ability to write to read-only pages.
- *
- * Warning:
- * Code patching in the UP case is safe if NMIs and MCE handlers are stopped and
- * no thread can be preempted in the instructions being modified (no iret to an
- * invalid instruction possible) or if the instructions are changed from a
- * consistent state to another consistent state atomically.
- * More care must be taken when modifying code in the SMP case because of
- * Intel's errata.
- * On the local CPU you need to be protected again NMI or MCE handlers seeing an
- * inconsistent instruction while you patch.
- * The _early version expects the memory to already be RW.
+ * Define an alternative between two instructions. If @feature is
+ * present, early code in apply_alternatives() replaces @oldinstr with
+ * @newinstr. ".skip" directive takes care of proper instruction padding
+ * in case @newinstr is longer than @oldinstr.
*/
+#define __ALTERNATIVE(oldinst, newinst, flag) \
+740: \
+ oldinst ; \
+741: \
+ .skip -(((744f-743f)-(741b-740b)) > 0) * ((744f-743f)-(741b-740b)),0x90 ;\
+742: \
+ .pushsection .altinstructions, "aM", @progbits, ALT_INSTR_SIZE ;\
+ altinstr_entry 740b,743f,flag,742b-740b,744f-743f ; \
+ .popsection ; \
+ .pushsection .altinstr_replacement,"ax" ; \
+743: \
+ ANNOTATE_DATA_SPECIAL ; \
+ newinst ; \
+744: \
+ .popsection ;
+
+.macro ALTERNATIVE oldinstr, newinstr, ft_flags
+ __ALTERNATIVE(\oldinstr, \newinstr, \ft_flags)
+.endm
+
+/*
+ * Same as ALTERNATIVE macro above but for two alternatives. If CPU
+ * has @feature1, it replaces @oldinstr with @newinstr1. If CPU has
+ * @feature2, it replaces @oldinstr with @newinstr2.
+ */
+.macro ALTERNATIVE_2 oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2
+ __ALTERNATIVE(__ALTERNATIVE(\oldinstr, \newinstr1, \ft_flags1),
+ \newinstr2, \ft_flags2)
+.endm
+
+.macro ALTERNATIVE_3 oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2, newinstr3, ft_flags3
+ __ALTERNATIVE(ALTERNATIVE_2(\oldinstr, \newinstr1, \ft_flags1, \newinstr2, \ft_flags2),
+ \newinstr3, \ft_flags3)
+.endm
+
+/* If @feature is set, patch in @newinstr_yes, otherwise @newinstr_no. */
+#define ALTERNATIVE_TERNARY(oldinstr, ft_flags, newinstr_yes, newinstr_no) \
+ ALTERNATIVE_2 oldinstr, newinstr_no, X86_FEATURE_ALWAYS, \
+ newinstr_yes, ft_flags
-extern void *text_poke(void *addr, const void *opcode, size_t len);
-extern void *text_poke_early(void *addr, const void *opcode, size_t len);
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_X86_ALTERNATIVE_H */
diff --git a/arch/x86/include/asm/amd/hsmp.h b/arch/x86/include/asm/amd/hsmp.h
new file mode 100644
index 000000000000..2137f62853ed
--- /dev/null
+++ b/arch/x86/include/asm/amd/hsmp.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_AMD_HSMP_H_
+#define _ASM_X86_AMD_HSMP_H_
+
+#include <uapi/asm/amd_hsmp.h>
+
+#if IS_ENABLED(CONFIG_AMD_HSMP)
+int hsmp_send_message(struct hsmp_message *msg);
+#else
+static inline int hsmp_send_message(struct hsmp_message *msg)
+{
+ return -ENODEV;
+}
+#endif
+
+#endif /*_ASM_X86_AMD_HSMP_H_*/
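For context, a rough calling sketch for the new hsmp_send_message() wrapper. The message ID is made up, and the struct hsmp_message field names (msg_id, num_args, response_sz, sock_ind, args[]) are assumed from the UAPI header rather than shown in this patch.

	/* Hypothetical: send one mailbox message to socket 0, read one response word. */
	static int hsmp_query_example(u32 *out)
	{
		struct hsmp_message msg = {
			.msg_id		= 0x3,	/* illustrative message ID */
			.num_args	= 0,
			.response_sz	= 1,
			.sock_ind	= 0,
		};
		int ret;

		ret = hsmp_send_message(&msg);
		if (ret)
			return ret;

		*out = msg.args[0];
		return 0;
	}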
diff --git a/arch/x86/include/asm/amd/ibs.h b/arch/x86/include/asm/amd/ibs.h
new file mode 100644
index 000000000000..3ee5903982c2
--- /dev/null
+++ b/arch/x86/include/asm/amd/ibs.h
@@ -0,0 +1,158 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_AMD_IBS_H
+#define _ASM_X86_AMD_IBS_H
+
+/*
+ * From PPR Vol 1 for AMD Family 19h Model 01h B1
+ * 55898 Rev 0.35 - Feb 5, 2021
+ */
+
+#include <asm/msr-index.h>
+
+/* IBS_OP_DATA2 DataSrc */
+#define IBS_DATA_SRC_LOC_CACHE 2
+#define IBS_DATA_SRC_DRAM 3
+#define IBS_DATA_SRC_REM_CACHE 4
+#define IBS_DATA_SRC_IO 7
+
+/* IBS_OP_DATA2 DataSrc Extension */
+#define IBS_DATA_SRC_EXT_LOC_CACHE 1
+#define IBS_DATA_SRC_EXT_NEAR_CCX_CACHE 2
+#define IBS_DATA_SRC_EXT_DRAM 3
+#define IBS_DATA_SRC_EXT_FAR_CCX_CACHE 5
+#define IBS_DATA_SRC_EXT_PMEM 6
+#define IBS_DATA_SRC_EXT_IO 7
+#define IBS_DATA_SRC_EXT_EXT_MEM 8
+#define IBS_DATA_SRC_EXT_PEER_AGENT_MEM 12
+
+/*
+ * IBS Hardware MSRs
+ */
+
+/* MSR 0xc0011030: IBS Fetch Control */
+union ibs_fetch_ctl {
+ __u64 val;
+ struct {
+ __u64 fetch_maxcnt:16,/* 0-15: instruction fetch max. count */
+ fetch_cnt:16, /* 16-31: instruction fetch count */
+ fetch_lat:16, /* 32-47: instruction fetch latency */
+ fetch_en:1, /* 48: instruction fetch enable */
+ fetch_val:1, /* 49: instruction fetch valid */
+ fetch_comp:1, /* 50: instruction fetch complete */
+ ic_miss:1, /* 51: i-cache miss */
+ phy_addr_valid:1,/* 52: physical address valid */
+ l1tlb_pgsz:2, /* 53-54: i-cache L1TLB page size
+ * (needs IbsPhyAddrValid) */
+ l1tlb_miss:1, /* 55: i-cache fetch missed in L1TLB */
+ l2tlb_miss:1, /* 56: i-cache fetch missed in L2TLB */
+ rand_en:1, /* 57: random tagging enable */
+ fetch_l2_miss:1,/* 58: L2 miss for sampled fetch
+ * (needs IbsFetchComp) */
+ l3_miss_only:1, /* 59: Collect L3 miss samples only */
+ fetch_oc_miss:1,/* 60: Op cache miss for the sampled fetch */
+ fetch_l3_miss:1,/* 61: L3 cache miss for the sampled fetch */
+ reserved:2; /* 62-63: reserved */
+ };
+};
+
+/* MSR 0xc0011033: IBS Execution Control */
+union ibs_op_ctl {
+ __u64 val;
+ struct {
+ __u64 opmaxcnt:16, /* 0-15: periodic op max. count */
+ l3_miss_only:1, /* 16: Collect L3 miss samples only */
+ op_en:1, /* 17: op sampling enable */
+ op_val:1, /* 18: op sample valid */
+ cnt_ctl:1, /* 19: periodic op counter control */
+ opmaxcnt_ext:7, /* 20-26: upper 7 bits of periodic op maximum count */
+ reserved0:5, /* 27-31: reserved */
+ opcurcnt:27, /* 32-58: periodic op counter current count */
+ ldlat_thrsh:4, /* 59-62: Load Latency threshold */
+ ldlat_en:1; /* 63: Load Latency enabled */
+ };
+};
+
+/* MSR 0xc0011035: IBS Op Data 1 */
+union ibs_op_data {
+ __u64 val;
+ struct {
+ __u64 comp_to_ret_ctr:16, /* 0-15: op completion to retire count */
+ tag_to_ret_ctr:16, /* 16-31: op tag to retire count */
+ reserved1:2, /* 32-33: reserved */
+ op_return:1, /* 34: return op */
+ op_brn_taken:1, /* 35: taken branch op */
+ op_brn_misp:1, /* 36: mispredicted branch op */
+ op_brn_ret:1, /* 37: branch op retired */
+ op_rip_invalid:1, /* 38: RIP is invalid */
+ op_brn_fuse:1, /* 39: fused branch op */
+ op_microcode:1, /* 40: microcode op */
+ reserved2:23; /* 41-63: reserved */
+ };
+};
+
+/* MSR 0xc0011036: IBS Op Data 2 */
+union ibs_op_data2 {
+ __u64 val;
+ struct {
+ __u64 data_src_lo:3, /* 0-2: data source low */
+ reserved0:1, /* 3: reserved */
+ rmt_node:1, /* 4: destination node */
+ cache_hit_st:1, /* 5: cache hit state */
+ data_src_hi:2, /* 6-7: data source high */
+ reserved1:56; /* 8-63: reserved */
+ };
+};
+
+/* MSR 0xc0011037: IBS Op Data 3 */
+union ibs_op_data3 {
+ __u64 val;
+ struct {
+ __u64 ld_op:1, /* 0: load op */
+ st_op:1, /* 1: store op */
+ dc_l1tlb_miss:1, /* 2: data cache L1TLB miss */
+ dc_l2tlb_miss:1, /* 3: data cache L2TLB miss in 2M page */
+ dc_l1tlb_hit_2m:1, /* 4: data cache L1TLB hit in 2M page */
+ dc_l1tlb_hit_1g:1, /* 5: data cache L1TLB hit in 1G page */
+ dc_l2tlb_hit_2m:1, /* 6: data cache L2TLB hit in 2M page */
+ dc_miss:1, /* 7: data cache miss */
+ dc_mis_acc:1, /* 8: misaligned access */
+ reserved:4, /* 9-12: reserved */
+ dc_wc_mem_acc:1, /* 13: write combining memory access */
+ dc_uc_mem_acc:1, /* 14: uncacheable memory access */
+ dc_locked_op:1, /* 15: locked operation */
+ dc_miss_no_mab_alloc:1, /* 16: DC miss with no MAB allocated */
+ dc_lin_addr_valid:1, /* 17: data cache linear address valid */
+ dc_phy_addr_valid:1, /* 18: data cache physical address valid */
+ dc_l2_tlb_hit_1g:1, /* 19: data cache L2 hit in 1GB page */
+ l2_miss:1, /* 20: L2 cache miss */
+ sw_pf:1, /* 21: software prefetch */
+ op_mem_width:4, /* 22-25: load/store size in bytes */
+ op_dc_miss_open_mem_reqs:6, /* 26-31: outstanding mem reqs on DC fill */
+ dc_miss_lat:16, /* 32-47: data cache miss latency */
+ tlb_refill_lat:16; /* 48-63: L1 TLB refill latency */
+ };
+};
+
+/* MSR 0xc001103c: IBS Fetch Control Extended */
+union ic_ibs_extd_ctl {
+ __u64 val;
+ struct {
+ __u64 itlb_refill_lat:16, /* 0-15: ITLB Refill latency for sampled fetch */
+ reserved:48; /* 16-63: reserved */
+ };
+};
+
+/*
+ * IBS driver related
+ */
+
+struct perf_ibs_data {
+ u32 size;
+ union {
+ u32 data[0]; /* data buffer starts here */
+ u32 caps;
+ };
+ u64 regs[MSR_AMD64_IBS_REG_COUNT_MAX];
+};
+
+#endif /* _ASM_X86_AMD_IBS_H */
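As a decoding sketch (not part of this patch), the bitfield unions above let a consumer interpret raw IBS MSR values captured from a sample. The helper below is illustrative and uses only fields defined in union ibs_op_data3.

	/* Hypothetical: report data-cache miss latency for a sampled load op. */
	static void ibs_report_example(u64 raw_op_data3)
	{
		union ibs_op_data3 op_data3 = { .val = raw_op_data3 };

		if (op_data3.ld_op && op_data3.dc_miss)
			pr_debug("IBS: load missed DC, latency %u cycles\n",
				 (unsigned int)op_data3.dc_miss_lat);
	}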
diff --git a/arch/x86/include/asm/amd/nb.h b/arch/x86/include/asm/amd/nb.h
new file mode 100644
index 000000000000..ddb5108cf46c
--- /dev/null
+++ b/arch/x86/include/asm/amd/nb.h
@@ -0,0 +1,77 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_AMD_NB_H
+#define _ASM_X86_AMD_NB_H
+
+#include <linux/ioport.h>
+#include <linux/pci.h>
+#include <asm/amd/node.h>
+
+struct amd_nb_bus_dev_range {
+ u8 bus;
+ u8 dev_base;
+ u8 dev_limit;
+};
+
+extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[];
+
+extern bool early_is_amd_nb(u32 value);
+extern struct resource *amd_get_mmconfig_range(struct resource *res);
+extern void amd_flush_garts(void);
+extern int amd_numa_init(void);
+extern int amd_get_subcaches(int);
+extern int amd_set_subcaches(int, unsigned long);
+
+struct amd_l3_cache {
+ unsigned indices;
+ u8 subcaches[4];
+};
+
+struct amd_northbridge {
+ struct pci_dev *misc;
+ struct pci_dev *link;
+ struct amd_l3_cache l3_cache;
+};
+
+struct amd_northbridge_info {
+ u16 num;
+ u64 flags;
+ struct amd_northbridge *nb;
+};
+
+#define AMD_NB_GART BIT(0)
+#define AMD_NB_L3_INDEX_DISABLE BIT(1)
+#define AMD_NB_L3_PARTITIONING BIT(2)
+
+#ifdef CONFIG_AMD_NB
+
+u16 amd_nb_num(void);
+bool amd_nb_has_feature(unsigned int feature);
+struct amd_northbridge *node_to_amd_nb(int node);
+
+static inline bool amd_gart_present(void)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+ return false;
+
+ /* GART present only on Fam15h, up to model 0fh */
+ if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 ||
+ (boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model < 0x10))
+ return true;
+
+ return false;
+}
+
+#else
+
+#define amd_nb_num(x) 0
+#define amd_nb_has_feature(x) false
+static inline struct amd_northbridge *node_to_amd_nb(int node)
+{
+ return NULL;
+}
+#define amd_gart_present(x) false
+
+#endif
+
+
+#endif /* _ASM_X86_AMD_NB_H */
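A minimal iteration sketch (not part of this patch) over the northbridge descriptors exposed above; the feature check is an illustrative choice and error handling is elided.

	/* Hypothetical: walk all AMD northbridges that support L3 partitioning. */
	static void nb_walk_example(void)
	{
		struct amd_northbridge *nb;
		u16 i;

		if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
			return;

		for (i = 0; i < amd_nb_num(); i++) {
			nb = node_to_amd_nb(i);
			if (!nb || !nb->misc)
				continue;
			/* nb->misc is the PCI misc function for this node */
		}
	}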
diff --git a/arch/x86/include/asm/amd/node.h b/arch/x86/include/asm/amd/node.h
new file mode 100644
index 000000000000..a672b8765fa8
--- /dev/null
+++ b/arch/x86/include/asm/amd/node.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * AMD Node helper functions and common defines
+ *
+ * Copyright (c) 2024, Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Author: Yazen Ghannam <Yazen.Ghannam@amd.com>
+ *
+ * Note:
+ * Items in this file may only be used in a single place.
+ * However, it's prudent to keep all AMD Node functionality
+ * in a unified place rather than spreading throughout the
+ * kernel.
+ */
+
+#ifndef _ASM_X86_AMD_NODE_H_
+#define _ASM_X86_AMD_NODE_H_
+
+#include <linux/pci.h>
+
+#define MAX_AMD_NUM_NODES 8
+#define AMD_NODE0_PCI_SLOT 0x18
+
+struct pci_dev *amd_node_get_func(u16 node, u8 func);
+
+static inline u16 amd_num_nodes(void)
+{
+ return topology_amd_nodes_per_pkg() * topology_max_packages();
+}
+
+#ifdef CONFIG_AMD_NODE
+int __must_check amd_smn_read(u16 node, u32 address, u32 *value);
+int __must_check amd_smn_write(u16 node, u32 address, u32 value);
+
+/* Should only be used by the HSMP driver. */
+int __must_check amd_smn_hsmp_rdwr(u16 node, u32 address, u32 *value, bool write);
+#else
+static inline int __must_check amd_smn_read(u16 node, u32 address, u32 *value) { return -ENODEV; }
+static inline int __must_check amd_smn_write(u16 node, u32 address, u32 value) { return -ENODEV; }
+
+static inline int __must_check amd_smn_hsmp_rdwr(u16 node, u32 address, u32 *value, bool write)
+{
+ return -ENODEV;
+}
+#endif /* CONFIG_AMD_NODE */
+
+/* helper for use with read_poll_timeout */
+static inline int smn_read_register(u32 reg)
+{
+ int data, rc;
+
+ rc = amd_smn_read(0, reg, &data);
+ if (rc)
+ return rc;
+
+ return data;
+}
+#endif /*_ASM_X86_AMD_NODE_H_*/
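A polling sketch (not part of this patch) showing how smn_read_register() plugs into read_poll_timeout(), as the comment above suggests. The SMN register offset and the ready bit are hypothetical, and handling of a failed SMN read is elided.

	#include <linux/iopoll.h>

	/* Hypothetical: poll a made-up SMN status register on node 0 until bit 0
	 * clears, sampling every 10 us for at most 1 ms. */
	static int smn_wait_ready_example(void)
	{
		int val;

		return read_poll_timeout(smn_read_register, val, !(val & BIT(0)),
					 10, 1000, false, 0x50a00);
	}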
diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
deleted file mode 100644
index bdf96f119f06..000000000000
--- a/arch/x86/include/asm/amd_iommu.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
- * Author: Joerg Roedel <joerg.roedel@amd.com>
- * Leo Duran <leo.duran@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#ifndef _ASM_X86_AMD_IOMMU_H
-#define _ASM_X86_AMD_IOMMU_H
-
-#include <linux/irqreturn.h>
-
-#ifdef CONFIG_AMD_IOMMU
-extern int amd_iommu_init(void);
-extern int amd_iommu_init_dma_ops(void);
-extern void amd_iommu_detect(void);
-extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
-extern void amd_iommu_flush_all_domains(void);
-extern void amd_iommu_flush_all_devices(void);
-extern void amd_iommu_shutdown(void);
-#else
-static inline int amd_iommu_init(void) { return -ENODEV; }
-static inline void amd_iommu_detect(void) { }
-static inline void amd_iommu_shutdown(void) { }
-#endif
-
-#endif /* _ASM_X86_AMD_IOMMU_H */
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
deleted file mode 100644
index 0c878caaa0a2..000000000000
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ /dev/null
@@ -1,460 +0,0 @@
-/*
- * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
- * Author: Joerg Roedel <joerg.roedel@amd.com>
- * Leo Duran <leo.duran@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#ifndef _ASM_X86_AMD_IOMMU_TYPES_H
-#define _ASM_X86_AMD_IOMMU_TYPES_H
-
-#include <linux/types.h>
-#include <linux/list.h>
-#include <linux/spinlock.h>
-
-/*
- * some size calculation constants
- */
-#define DEV_TABLE_ENTRY_SIZE 32
-#define ALIAS_TABLE_ENTRY_SIZE 2
-#define RLOOKUP_TABLE_ENTRY_SIZE (sizeof(void *))
-
-/* Length of the MMIO region for the AMD IOMMU */
-#define MMIO_REGION_LENGTH 0x4000
-
-/* Capability offsets used by the driver */
-#define MMIO_CAP_HDR_OFFSET 0x00
-#define MMIO_RANGE_OFFSET 0x0c
-#define MMIO_MISC_OFFSET 0x10
-
-/* Masks, shifts and macros to parse the device range capability */
-#define MMIO_RANGE_LD_MASK 0xff000000
-#define MMIO_RANGE_FD_MASK 0x00ff0000
-#define MMIO_RANGE_BUS_MASK 0x0000ff00
-#define MMIO_RANGE_LD_SHIFT 24
-#define MMIO_RANGE_FD_SHIFT 16
-#define MMIO_RANGE_BUS_SHIFT 8
-#define MMIO_GET_LD(x) (((x) & MMIO_RANGE_LD_MASK) >> MMIO_RANGE_LD_SHIFT)
-#define MMIO_GET_FD(x) (((x) & MMIO_RANGE_FD_MASK) >> MMIO_RANGE_FD_SHIFT)
-#define MMIO_GET_BUS(x) (((x) & MMIO_RANGE_BUS_MASK) >> MMIO_RANGE_BUS_SHIFT)
-#define MMIO_MSI_NUM(x) ((x) & 0x1f)
-
-/* Flag masks for the AMD IOMMU exclusion range */
-#define MMIO_EXCL_ENABLE_MASK 0x01ULL
-#define MMIO_EXCL_ALLOW_MASK 0x02ULL
-
-/* Used offsets into the MMIO space */
-#define MMIO_DEV_TABLE_OFFSET 0x0000
-#define MMIO_CMD_BUF_OFFSET 0x0008
-#define MMIO_EVT_BUF_OFFSET 0x0010
-#define MMIO_CONTROL_OFFSET 0x0018
-#define MMIO_EXCL_BASE_OFFSET 0x0020
-#define MMIO_EXCL_LIMIT_OFFSET 0x0028
-#define MMIO_CMD_HEAD_OFFSET 0x2000
-#define MMIO_CMD_TAIL_OFFSET 0x2008
-#define MMIO_EVT_HEAD_OFFSET 0x2010
-#define MMIO_EVT_TAIL_OFFSET 0x2018
-#define MMIO_STATUS_OFFSET 0x2020
-
-/* MMIO status bits */
-#define MMIO_STATUS_COM_WAIT_INT_MASK 0x04
-
-/* event logging constants */
-#define EVENT_ENTRY_SIZE 0x10
-#define EVENT_TYPE_SHIFT 28
-#define EVENT_TYPE_MASK 0xf
-#define EVENT_TYPE_ILL_DEV 0x1
-#define EVENT_TYPE_IO_FAULT 0x2
-#define EVENT_TYPE_DEV_TAB_ERR 0x3
-#define EVENT_TYPE_PAGE_TAB_ERR 0x4
-#define EVENT_TYPE_ILL_CMD 0x5
-#define EVENT_TYPE_CMD_HARD_ERR 0x6
-#define EVENT_TYPE_IOTLB_INV_TO 0x7
-#define EVENT_TYPE_INV_DEV_REQ 0x8
-#define EVENT_DEVID_MASK 0xffff
-#define EVENT_DEVID_SHIFT 0
-#define EVENT_DOMID_MASK 0xffff
-#define EVENT_DOMID_SHIFT 0
-#define EVENT_FLAGS_MASK 0xfff
-#define EVENT_FLAGS_SHIFT 0x10
-
-/* feature control bits */
-#define CONTROL_IOMMU_EN 0x00ULL
-#define CONTROL_HT_TUN_EN 0x01ULL
-#define CONTROL_EVT_LOG_EN 0x02ULL
-#define CONTROL_EVT_INT_EN 0x03ULL
-#define CONTROL_COMWAIT_EN 0x04ULL
-#define CONTROL_PASSPW_EN 0x08ULL
-#define CONTROL_RESPASSPW_EN 0x09ULL
-#define CONTROL_COHERENT_EN 0x0aULL
-#define CONTROL_ISOC_EN 0x0bULL
-#define CONTROL_CMDBUF_EN 0x0cULL
-#define CONTROL_PPFLOG_EN 0x0dULL
-#define CONTROL_PPFINT_EN 0x0eULL
-
-/* command specific defines */
-#define CMD_COMPL_WAIT 0x01
-#define CMD_INV_DEV_ENTRY 0x02
-#define CMD_INV_IOMMU_PAGES 0x03
-
-#define CMD_COMPL_WAIT_STORE_MASK 0x01
-#define CMD_COMPL_WAIT_INT_MASK 0x02
-#define CMD_INV_IOMMU_PAGES_SIZE_MASK 0x01
-#define CMD_INV_IOMMU_PAGES_PDE_MASK 0x02
-
-#define CMD_INV_IOMMU_ALL_PAGES_ADDRESS 0x7fffffffffffffffULL
-
-/* macros and definitions for device table entries */
-#define DEV_ENTRY_VALID 0x00
-#define DEV_ENTRY_TRANSLATION 0x01
-#define DEV_ENTRY_IR 0x3d
-#define DEV_ENTRY_IW 0x3e
-#define DEV_ENTRY_NO_PAGE_FAULT 0x62
-#define DEV_ENTRY_EX 0x67
-#define DEV_ENTRY_SYSMGT1 0x68
-#define DEV_ENTRY_SYSMGT2 0x69
-#define DEV_ENTRY_INIT_PASS 0xb8
-#define DEV_ENTRY_EINT_PASS 0xb9
-#define DEV_ENTRY_NMI_PASS 0xba
-#define DEV_ENTRY_LINT0_PASS 0xbe
-#define DEV_ENTRY_LINT1_PASS 0xbf
-#define DEV_ENTRY_MODE_MASK 0x07
-#define DEV_ENTRY_MODE_SHIFT 0x09
-
-/* constants to configure the command buffer */
-#define CMD_BUFFER_SIZE 8192
-#define CMD_BUFFER_ENTRIES 512
-#define MMIO_CMD_SIZE_SHIFT 56
-#define MMIO_CMD_SIZE_512 (0x9ULL << MMIO_CMD_SIZE_SHIFT)
-
-/* constants for event buffer handling */
-#define EVT_BUFFER_SIZE 8192 /* 512 entries */
-#define EVT_LEN_MASK (0x9ULL << 56)
-
-#define PAGE_MODE_1_LEVEL 0x01
-#define PAGE_MODE_2_LEVEL 0x02
-#define PAGE_MODE_3_LEVEL 0x03
-
-#define IOMMU_PDE_NL_0 0x000ULL
-#define IOMMU_PDE_NL_1 0x200ULL
-#define IOMMU_PDE_NL_2 0x400ULL
-#define IOMMU_PDE_NL_3 0x600ULL
-
-#define IOMMU_PTE_L2_INDEX(address) (((address) >> 30) & 0x1ffULL)
-#define IOMMU_PTE_L1_INDEX(address) (((address) >> 21) & 0x1ffULL)
-#define IOMMU_PTE_L0_INDEX(address) (((address) >> 12) & 0x1ffULL)
-
-#define IOMMU_MAP_SIZE_L1 (1ULL << 21)
-#define IOMMU_MAP_SIZE_L2 (1ULL << 30)
-#define IOMMU_MAP_SIZE_L3 (1ULL << 39)
-
-#define IOMMU_PTE_P (1ULL << 0)
-#define IOMMU_PTE_TV (1ULL << 1)
-#define IOMMU_PTE_U (1ULL << 59)
-#define IOMMU_PTE_FC (1ULL << 60)
-#define IOMMU_PTE_IR (1ULL << 61)
-#define IOMMU_PTE_IW (1ULL << 62)
-
-#define IOMMU_L1_PDE(address) \
- ((address) | IOMMU_PDE_NL_1 | IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW)
-#define IOMMU_L2_PDE(address) \
- ((address) | IOMMU_PDE_NL_2 | IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW)
-
-#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
-#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P)
-#define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK))
-#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07)
-
-#define IOMMU_PROT_MASK 0x03
-#define IOMMU_PROT_IR 0x01
-#define IOMMU_PROT_IW 0x02
-
-/* IOMMU capabilities */
-#define IOMMU_CAP_IOTLB 24
-#define IOMMU_CAP_NPCACHE 26
-
-#define MAX_DOMAIN_ID 65536
-
-/* FIXME: move this macro to <linux/pci.h> */
-#define PCI_BUS(x) (((x) >> 8) & 0xff)
-
-/* Protection domain flags */
-#define PD_DMA_OPS_MASK (1UL << 0) /* domain used for dma_ops */
-#define PD_DEFAULT_MASK (1UL << 1) /* domain is a default dma_ops
- domain for an IOMMU */
-extern bool amd_iommu_dump;
-#define DUMP_printk(format, arg...) \
- do { \
- if (amd_iommu_dump) \
- printk(KERN_INFO "AMD IOMMU: " format, ## arg); \
- } while(0);
-
-/*
- * Make iterating over all IOMMUs easier
- */
-#define for_each_iommu(iommu) \
- list_for_each_entry((iommu), &amd_iommu_list, list)
-#define for_each_iommu_safe(iommu, next) \
- list_for_each_entry_safe((iommu), (next), &amd_iommu_list, list)
-
-#define APERTURE_RANGE_SHIFT 27 /* 128 MB */
-#define APERTURE_RANGE_SIZE (1ULL << APERTURE_RANGE_SHIFT)
-#define APERTURE_RANGE_PAGES (APERTURE_RANGE_SIZE >> PAGE_SHIFT)
-#define APERTURE_MAX_RANGES 32 /* allows 4GB of DMA address space */
-#define APERTURE_RANGE_INDEX(a) ((a) >> APERTURE_RANGE_SHIFT)
-#define APERTURE_PAGE_INDEX(a) (((a) >> 21) & 0x3fULL)
-
-/*
- * This structure contains generic data for IOMMU protection domains
- * independent of their use.
- */
-struct protection_domain {
- spinlock_t lock; /* mostly used to lock the page table*/
- u16 id; /* the domain id written to the device table */
- int mode; /* paging mode (0-6 levels) */
- u64 *pt_root; /* page table root pointer */
- unsigned long flags; /* flags to find out type of domain */
- unsigned dev_cnt; /* devices assigned to this domain */
- void *priv; /* private data */
-};
-
-/*
- * For dynamic growth the aperture size is split into ranges of 128MB of
- * DMA address space each. This struct represents one such range.
- */
-struct aperture_range {
-
- /* address allocation bitmap */
- unsigned long *bitmap;
-
- /*
- * Array of PTE pages for the aperture. In this array we save all the
- * leaf pages of the domain page table used for the aperture. This way
- * we don't need to walk the page table to find a specific PTE. We can
- * just calculate its address in constant time.
- */
- u64 *pte_pages[64];
-
- unsigned long offset;
-};
-
-/*
- * Data container for a dma_ops specific protection domain
- */
-struct dma_ops_domain {
- struct list_head list;
-
- /* generic protection domain information */
- struct protection_domain domain;
-
- /* size of the aperture for the mappings */
- unsigned long aperture_size;
-
- /* address we start to search for free addresses */
- unsigned long next_address;
-
- /* address space relevant data */
- struct aperture_range *aperture[APERTURE_MAX_RANGES];
-
- /* This will be set to true when TLB needs to be flushed */
- bool need_flush;
-
- /*
- * if this is a preallocated domain, keep the device for which it was
- * preallocated in this variable
- */
- u16 target_dev;
-};
-
-/*
- * Structure where we save information about one hardware AMD IOMMU in the
- * system.
- */
-struct amd_iommu {
- struct list_head list;
-
- /* locks the accesses to the hardware */
- spinlock_t lock;
-
- /* Pointer to PCI device of this IOMMU */
- struct pci_dev *dev;
-
- /* physical address of MMIO space */
- u64 mmio_phys;
- /* virtual address of MMIO space */
- u8 *mmio_base;
-
- /* capabilities of that IOMMU read from ACPI */
- u32 cap;
-
- /*
- * Capability pointer. There could be more than one IOMMU per PCI
- * device function if there are more than one AMD IOMMU capability
- * pointers.
- */
- u16 cap_ptr;
-
- /* pci domain of this IOMMU */
- u16 pci_seg;
-
- /* first device this IOMMU handles. read from PCI */
- u16 first_device;
- /* last device this IOMMU handles. read from PCI */
- u16 last_device;
-
- /* start of exclusion range of that IOMMU */
- u64 exclusion_start;
- /* length of exclusion range of that IOMMU */
- u64 exclusion_length;
-
- /* command buffer virtual address */
- u8 *cmd_buf;
- /* size of command buffer */
- u32 cmd_buf_size;
-
- /* size of event buffer */
- u32 evt_buf_size;
- /* event buffer virtual address */
- u8 *evt_buf;
- /* MSI number for event interrupt */
- u16 evt_msi_num;
-
- /* true if interrupts for this IOMMU are already enabled */
- bool int_enabled;
-
- /* if one, we need to send a completion wait command */
- bool need_sync;
-
- /* default dma_ops domain for that IOMMU */
- struct dma_ops_domain *default_dom;
-};
-
-/*
- * List with all IOMMUs in the system. This list is not locked because it is
- * only written and read at driver initialization or suspend time
- */
-extern struct list_head amd_iommu_list;
-
-/*
- * Structure defining one entry in the device table
- */
-struct dev_table_entry {
- u32 data[8];
-};
-
-/*
- * One entry for unity mappings parsed out of the ACPI table.
- */
-struct unity_map_entry {
- struct list_head list;
-
- /* starting device id this entry is used for (including) */
- u16 devid_start;
- /* end device id this entry is used for (including) */
- u16 devid_end;
-
- /* start address to unity map (including) */
- u64 address_start;
- /* end address to unity map (including) */
- u64 address_end;
-
- /* required protection */
- int prot;
-};
-
-/*
- * List of all unity mappings. It is not locked because as runtime it is only
- * read. It is created at ACPI table parsing time.
- */
-extern struct list_head amd_iommu_unity_map;
-
-/*
- * Data structures for device handling
- */
-
-/*
- * Device table used by hardware. Read and write accesses by software are
- * locked with the amd_iommu_pd_table lock.
- */
-extern struct dev_table_entry *amd_iommu_dev_table;
-
-/*
- * Alias table to find requestor ids to device ids. Not locked because only
- * read on runtime.
- */
-extern u16 *amd_iommu_alias_table;
-
-/*
- * Reverse lookup table to find the IOMMU which translates a specific device.
- */
-extern struct amd_iommu **amd_iommu_rlookup_table;
-
-/* size of the dma_ops aperture as power of 2 */
-extern unsigned amd_iommu_aperture_order;
-
-/* largest PCI device id we expect translation requests for */
-extern u16 amd_iommu_last_bdf;
-
-/* data structures for protection domain handling */
-extern struct protection_domain **amd_iommu_pd_table;
-
-/* allocation bitmap for domain ids */
-extern unsigned long *amd_iommu_pd_alloc_bitmap;
-
-/* will be 1 if device isolation is enabled */
-extern bool amd_iommu_isolate;
-
-/*
- * If true, the addresses will be flushed on unmap time, not when
- * they are reused
- */
-extern bool amd_iommu_unmap_flush;
-
-/* takes bus and device/function and returns the device id
- * FIXME: should that be in generic PCI code? */
-static inline u16 calc_devid(u8 bus, u8 devfn)
-{
- return (((u16)bus) << 8) | devfn;
-}
-
-#ifdef CONFIG_AMD_IOMMU_STATS
-
-struct __iommu_counter {
- char *name;
- struct dentry *dent;
- u64 value;
-};
-
-#define DECLARE_STATS_COUNTER(nm) \
- static struct __iommu_counter nm = { \
- .name = #nm, \
- }
-
-#define INC_STATS_COUNTER(name) name.value += 1
-#define ADD_STATS_COUNTER(name, x) name.value += (x)
-#define SUB_STATS_COUNTER(name, x) name.value -= (x)
-
-#else /* CONFIG_AMD_IOMMU_STATS */
-
-#define DECLARE_STATS_COUNTER(name)
-#define INC_STATS_COUNTER(name)
-#define ADD_STATS_COUNTER(name, x)
-#define SUB_STATS_COUNTER(name, x)
-
-static inline void amd_iommu_stats_init(void) { }
-
-#endif /* CONFIG_AMD_IOMMU_STATS */
-
-#endif /* _ASM_X86_AMD_IOMMU_TYPES_H */
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index bb7d47925847..a26e66d66444 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -1,22 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _ASM_X86_APIC_H
#define _ASM_X86_APIC_H
#include <linux/cpumask.h>
-#include <linux/delay.h>
-#include <linux/pm.h>
+#include <linux/static_call.h>
#include <asm/alternative.h>
#include <asm/cpufeature.h>
-#include <asm/processor.h>
#include <asm/apicdef.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <asm/fixmap.h>
#include <asm/mpspec.h>
-#include <asm/system.h>
#include <asm/msr.h>
+#include <asm/hardirq.h>
+#include <asm/io.h>
+#include <asm/posted_intr.h>
#define ARCH_APICTIMER_STOPS_ON_C3 1
+/* Macros for apic_extnmi which controls external NMI masking */
+#define APIC_EXTNMI_BSP 0 /* Default */
+#define APIC_EXTNMI_ALL 1
+#define APIC_EXTNMI_NONE 2
+
/*
* Debugging macros
*/
@@ -25,44 +31,60 @@
#define APIC_DEBUG 2
/*
- * Define the default level of output to be very little
- * This can be turned up by using apic=verbose for more
- * information and apic=debug for _lots_ of information.
- * apic_verbosity is defined in apic.c
+ * Define the default level of output to be very little. This can be turned
+ * up by using apic=verbose for more information and apic=debug for _lots_
+ * of information. apic_verbosity is defined in apic.c
*/
-#define apic_printk(v, s, a...) do { \
- if ((v) <= apic_verbosity) \
- printk(s, ##a); \
- } while (0)
-
+#define apic_printk(v, s, a...) \
+do { \
+ if ((v) <= apic_verbosity) \
+ printk(s, ##a); \
+} while (0)
+
+#define apic_pr_verbose(s, a...) apic_printk(APIC_VERBOSE, KERN_INFO s, ##a)
+#define apic_pr_debug(s, a...) apic_printk(APIC_DEBUG, KERN_DEBUG s, ##a)
+#define apic_pr_debug_cont(s, a...) apic_printk(APIC_DEBUG, KERN_CONT s, ##a)
+/* Unconditional debug prints for code which is guarded by apic_verbosity already */
+#define apic_dbg(s, a...) printk(KERN_DEBUG s, ##a)
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
-extern void generic_apic_probe(void);
+extern void x86_32_probe_apic(void);
#else
-static inline void generic_apic_probe(void)
-{
-}
+static inline void x86_32_probe_apic(void) { }
#endif
+extern u32 cpuid_to_apicid[];
+
+#define CPU_ACPIID_INVALID U32_MAX
+
#ifdef CONFIG_X86_LOCAL_APIC
-extern unsigned int apic_verbosity;
+extern int apic_verbosity;
extern int local_apic_timer_c2_ok;
-extern int disable_apic;
+extern bool apic_is_disabled;
+extern unsigned int lapic_timer_period;
-#ifdef CONFIG_SMP
-extern void __inquire_remote_apic(int apicid);
-#else /* CONFIG_SMP */
-static inline void __inquire_remote_apic(int apicid)
-{
-}
-#endif /* CONFIG_SMP */
+extern enum apic_intr_mode_id apic_intr_mode;
+enum apic_intr_mode_id {
+ APIC_PIC,
+ APIC_VIRTUAL_WIRE,
+ APIC_VIRTUAL_WIRE_NO_CONFIG,
+ APIC_SYMMETRIC_IO,
+ APIC_SYMMETRIC_IO_NO_ROUTING
+};
-static inline void default_inquire_remote_apic(int apicid)
+/*
+ * With the 82489DX we can't rely on the APIC feature bit
+ * retrieved via CPUID, but we still have to deal with such
+ * an APIC chip, so we assume the SMP configuration is found
+ * in the MP table (the 64-bit case mostly uses ACPI, which
+ * sets the SMP presence flag as well, so this helper is safe
+ * to use there too).
+ */
+static inline bool apic_from_smp_config(void)
{
- if (apic_verbosity >= APIC_DEBUG)
- __inquire_remote_apic(apicid);
+ return smp_found_config && !apic_is_disabled;
}
/*
@@ -70,163 +92,72 @@ static inline void default_inquire_remote_apic(int apicid)
*/
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
-#else
-#define setup_boot_clock setup_boot_APIC_clock
-#define setup_secondary_clock setup_secondary_APIC_clock
-#endif
-
-#ifdef CONFIG_X86_64
-extern int is_vsmp_box(void);
-#else
-static inline int is_vsmp_box(void)
-{
- return 0;
-}
#endif
-extern void xapic_wait_icr_idle(void);
-extern u32 safe_xapic_wait_icr_idle(void);
-extern void xapic_icr_write(u32, u32);
-extern int setup_profiling_timer(unsigned int);
static inline void native_apic_mem_write(u32 reg, u32 v)
{
volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg);
- alternative_io("movl %0, %1", "xchgl %0, %1", X86_FEATURE_11AP,
- ASM_OUTPUT2("=r" (v), "=m" (*addr)),
- ASM_OUTPUT2("0" (v), "m" (*addr)));
+ alternative_io("movl %0, %1", "xchgl %0, %1", X86_BUG_11AP,
+ ASM_OUTPUT("=r" (v), "=m" (*addr)),
+ ASM_INPUT("0" (v), "m" (*addr)));
}
static inline u32 native_apic_mem_read(u32 reg)
{
- return *((volatile u32 *)(APIC_BASE + reg));
-}
-
-extern void native_apic_wait_icr_idle(void);
-extern u32 native_safe_apic_wait_icr_idle(void);
-extern void native_apic_icr_write(u32 low, u32 id);
-extern u64 native_apic_icr_read(void);
-
-extern int x2apic_mode;
-
-#ifdef CONFIG_X86_X2APIC
-/*
- * Make previous memory operations globally visible before
- * sending the IPI through x2apic wrmsr. We need a serializing instruction or
- * mfence for this.
- */
-static inline void x2apic_wrmsr_fence(void)
-{
- asm volatile("mfence" : : : "memory");
-}
-
-static inline void native_apic_msr_write(u32 reg, u32 v)
-{
- if (reg == APIC_DFR || reg == APIC_ID || reg == APIC_LDR ||
- reg == APIC_LVR)
- return;
-
- wrmsr(APIC_BASE_MSR + (reg >> 4), v, 0);
-}
-
-static inline u32 native_apic_msr_read(u32 reg)
-{
- u32 low, high;
-
- if (reg == APIC_DFR)
- return -1;
-
- rdmsr(APIC_BASE_MSR + (reg >> 4), low, high);
- return low;
-}
-
-static inline void native_x2apic_wait_icr_idle(void)
-{
- /* no need to wait for icr idle in x2apic */
- return;
+ return readl((void __iomem *)(APIC_BASE + reg));
}
-static inline u32 native_safe_x2apic_wait_icr_idle(void)
+static inline void native_apic_mem_eoi(void)
{
- /* no need to wait for icr idle in x2apic */
- return 0;
+ native_apic_mem_write(APIC_EOI, APIC_EOI_ACK);
}
-static inline void native_x2apic_icr_write(u32 low, u32 id)
-{
- wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low);
-}
-
-static inline u64 native_x2apic_icr_read(void)
-{
- unsigned long val;
-
- rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val);
- return val;
-}
+extern void native_apic_icr_write(u32 low, u32 id);
+extern u64 native_apic_icr_read(void);
-extern int x2apic_phys;
-extern void check_x2apic(void);
-extern void enable_x2apic(void);
-extern void x2apic_icr_write(u32 low, u32 id);
-static inline int x2apic_enabled(void)
+static inline bool apic_is_x2apic_enabled(void)
{
- int msr, msr2;
+ u64 msr;
- if (!cpu_has_x2apic)
- return 0;
-
- rdmsr(MSR_IA32_APICBASE, msr, msr2);
- if (msr & X2APIC_ENABLE)
- return 1;
- return 0;
-}
-
-#define x2apic_supported() (cpu_has_x2apic)
-#else
-static inline void check_x2apic(void)
-{
-}
-static inline void enable_x2apic(void)
-{
-}
-static inline int x2apic_enabled(void)
-{
- return 0;
+ if (rdmsrq_safe(MSR_IA32_APICBASE, &msr))
+ return false;
+ return msr & X2APIC_ENABLE;
}
-#define x2apic_preenabled 0
-#define x2apic_supported() 0
-#endif
-
extern void enable_IR_x2apic(void);
-extern int get_physical_broadcast(void);
-
-extern void apic_disable(void);
extern int lapic_get_maxlvt(void);
extern void clear_local_APIC(void);
-extern void connect_bsp_APIC(void);
extern void disconnect_bsp_APIC(int virt_wire_setup);
extern void disable_local_APIC(void);
+extern void apic_soft_disable(void);
extern void lapic_shutdown(void);
-extern int verify_local_APIC(void);
-extern void cache_APIC_registers(void);
extern void sync_Arb_IDs(void);
extern void init_bsp_APIC(void);
-extern void setup_local_APIC(void);
-extern void end_local_APIC_setup(void);
+extern void apic_intr_mode_select(void);
+extern void apic_intr_mode_init(void);
extern void init_apic_mappings(void);
+void register_lapic_address(unsigned long address);
extern void setup_boot_APIC_clock(void);
extern void setup_secondary_APIC_clock(void);
-extern int APIC_init_uniprocessor(void);
-extern void enable_NMI_through_LVT0(void);
+extern void lapic_update_tsc_freq(void);
+
+#ifdef CONFIG_X86_64
+static inline bool apic_force_enable(unsigned long addr)
+{
+ return false;
+}
+#else
+extern bool apic_force_enable(unsigned long addr);
+#endif
+
+extern void apic_ap_setup(void);
/*
* On 32bit this is mach-xxx local
*/
#ifdef CONFIG_X86_64
-extern void early_init_lapic_mapping(void);
extern int apic_is_clustered_box(void);
#else
static inline int apic_is_clustered_box(void)
@@ -235,27 +166,104 @@ static inline int apic_is_clustered_box(void)
}
#endif
-extern u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask);
-extern u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask);
+extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask);
+extern void lapic_assign_system_vectors(void);
+extern void lapic_assign_legacy_vector(unsigned int isairq, bool replace);
+extern void lapic_update_legacy_vectors(void);
+extern void lapic_online(void);
+extern void lapic_offline(void);
+extern bool apic_needs_pit(void);
+
+extern void apic_send_IPI_allbutself(unsigned int vector);
+extern void topology_register_apic(u32 apic_id, u32 acpi_id, bool present);
+extern void topology_register_boot_apic(u32 apic_id);
+extern int topology_hotplug_apic(u32 apic_id, u32 acpi_id);
+extern void topology_hotunplug_apic(unsigned int cpu);
+extern void topology_apply_cmdline_limits_early(void);
+extern void topology_init_possible_cpus(void);
+extern void topology_reset_possible_cpus_up(void);
#else /* !CONFIG_X86_LOCAL_APIC */
static inline void lapic_shutdown(void) { }
#define local_apic_timer_c2_ok 1
static inline void init_apic_mappings(void) { }
static inline void disable_local_APIC(void) { }
-static inline void apic_disable(void) { }
+# define setup_boot_APIC_clock x86_init_noop
+# define setup_secondary_APIC_clock x86_init_noop
+static inline void lapic_update_tsc_freq(void) { }
+static inline void init_bsp_APIC(void) { }
+static inline void apic_intr_mode_select(void) { }
+static inline void apic_intr_mode_init(void) { }
+static inline void lapic_assign_system_vectors(void) { }
+static inline void lapic_assign_legacy_vector(unsigned int i, bool r) { }
+static inline bool apic_needs_pit(void) { return true; }
+static inline void topology_apply_cmdline_limits_early(void) { }
+static inline void topology_init_possible_cpus(void) { }
#endif /* !CONFIG_X86_LOCAL_APIC */
-#ifdef CONFIG_X86_64
-#define SET_APIC_ID(x) (apic->set_apic_id(x))
-#else
+#ifdef CONFIG_X86_X2APIC
+static inline void native_apic_msr_write(u32 reg, u32 v)
+{
+ if (reg == APIC_DFR || reg == APIC_ID || reg == APIC_LDR ||
+ reg == APIC_LVR)
+ return;
-#endif
+ wrmsrq(APIC_BASE_MSR + (reg >> 4), v);
+}
+
+static inline void native_apic_msr_eoi(void)
+{
+ native_wrmsrq(APIC_BASE_MSR + (APIC_EOI >> 4), APIC_EOI_ACK);
+}
+
+static inline u32 native_apic_msr_read(u32 reg)
+{
+ u64 msr;
+
+ if (reg == APIC_DFR)
+ return -1;
+
+ rdmsrq(APIC_BASE_MSR + (reg >> 4), msr);
+ return (u32)msr;
+}
+
+static inline void native_x2apic_icr_write(u32 low, u32 id)
+{
+ wrmsrq(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low);
+}
+
+static inline u64 native_x2apic_icr_read(void)
+{
+ unsigned long val;
+
+ rdmsrq(APIC_BASE_MSR + (APIC_ICR >> 4), val);
+ return val;
+}
+
+extern int x2apic_mode;
+extern int x2apic_phys;
+extern void __init x2apic_set_max_apicid(u32 apicid);
+extern void x2apic_setup(void);
+static inline int x2apic_enabled(void)
+{
+ return boot_cpu_has(X86_FEATURE_X2APIC) && apic_is_x2apic_enabled();
+}
+
+#define x2apic_supported() (boot_cpu_has(X86_FEATURE_X2APIC))
+#else /* !CONFIG_X86_X2APIC */
+static inline void x2apic_setup(void) { }
+static inline int x2apic_enabled(void) { return 0; }
+static inline u32 native_apic_msr_read(u32 reg) { BUG(); }
+#define x2apic_mode (0)
+#define x2apic_supported() (0)
+#endif /* !CONFIG_X86_X2APIC */
+extern void __init check_x2apic(void);
+
+struct irq_data;
/*
* Copyright 2004 James Cleverdon, IBM.
- * Subject to the GNU Public License, v.2
*
* Generic APIC sub-arch data struct.
*
@@ -264,79 +272,73 @@ static inline void apic_disable(void) { }
* James Cleverdon.
*/
struct apic {
- char *name;
-
- int (*probe)(void);
- int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
- int (*apic_id_registered)(void);
-
- u32 irq_delivery_mode;
- u32 irq_dest_mode;
-
- const struct cpumask *(*target_cpus)(void);
-
- int disable_esr;
-
- int dest_logical;
- unsigned long (*check_apicid_used)(physid_mask_t bitmap, int apicid);
- unsigned long (*check_apicid_present)(int apicid);
-
- void (*vector_allocation_domain)(int cpu, struct cpumask *retmask);
- void (*init_apic_ldr)(void);
-
- physid_mask_t (*ioapic_phys_id_map)(physid_mask_t map);
-
- void (*setup_apic_routing)(void);
- int (*multi_timer_check)(int apic, int irq);
- int (*apicid_to_node)(int logical_apicid);
- int (*cpu_to_logical_apicid)(int cpu);
- int (*cpu_present_to_apicid)(int mps_cpu);
- physid_mask_t (*apicid_to_cpu_present)(int phys_apicid);
- void (*setup_portio_remap)(void);
- int (*check_phys_apicid_present)(int boot_cpu_physical_apicid);
- void (*enable_apic_mode)(void);
- int (*phys_pkg_id)(int cpuid_apic, int index_msb);
-
- /*
- * When one of the next two hooks returns 1 the apic
- * is switched to this. Essentially they are additional
- * probe functions:
- */
- int (*mps_oem_check)(struct mpc_table *mpc, char *oem, char *productid);
-
- unsigned int (*get_apic_id)(unsigned long x);
- unsigned long (*set_apic_id)(unsigned int id);
- unsigned long apic_id_mask;
-
- unsigned int (*cpu_mask_to_apicid)(const struct cpumask *cpumask);
- unsigned int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask,
- const struct cpumask *andmask);
-
- /* ipi */
- void (*send_IPI_mask)(const struct cpumask *mask, int vector);
- void (*send_IPI_mask_allbutself)(const struct cpumask *mask,
- int vector);
- void (*send_IPI_allbutself)(int vector);
- void (*send_IPI_all)(int vector);
- void (*send_IPI_self)(int vector);
+ /* Hotpath functions first */
+ void (*eoi)(void);
+ void (*native_eoi)(void);
+ void (*write)(u32 reg, u32 v);
+ u32 (*read)(u32 reg);
+
+ /* IPI related functions */
+ void (*wait_icr_idle)(void);
+ u32 (*safe_wait_icr_idle)(void);
+
+ void (*send_IPI)(int cpu, int vector);
+ void (*send_IPI_mask)(const struct cpumask *mask, int vector);
+ void (*send_IPI_mask_allbutself)(const struct cpumask *msk, int vec);
+ void (*send_IPI_allbutself)(int vector);
+ void (*send_IPI_all)(int vector);
+ void (*send_IPI_self)(int vector);
+
+ u32 disable_esr : 1,
+ dest_mode_logical : 1,
+ x2apic_set_max_apicid : 1,
+ nmi_to_offline_cpu : 1;
+
+ u32 (*calc_dest_apicid)(unsigned int cpu);
+
+ /* ICR related functions */
+ u64 (*icr_read)(void);
+ void (*icr_write)(u32 low, u32 high);
+
+ /* The limit of the APIC ID space. */
+ u32 max_apic_id;
+
+ /* Probe, setup and smpboot functions */
+ int (*probe)(void);
+ void (*setup)(void);
+ void (*teardown)(void);
+ int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
+
+ void (*init_apic_ldr)(void);
+ u32 (*cpu_present_to_apicid)(int mps_cpu);
+
+ u32 (*get_apic_id)(u32 id);
/* wakeup_secondary_cpu */
- int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip);
-
- int trampoline_phys_low;
- int trampoline_phys_high;
-
- void (*wait_for_init_deassert)(atomic_t *deassert);
- void (*smp_callin_clear_local_apic)(void);
- void (*inquire_remote_apic)(int apicid);
-
- /* apic ops */
- u32 (*read)(u32 reg);
- void (*write)(u32 reg, u32 v);
- u64 (*icr_read)(void);
- void (*icr_write)(u32 low, u32 high);
- void (*wait_icr_idle)(void);
- u32 (*safe_wait_icr_idle)(void);
+ int (*wakeup_secondary_cpu)(u32 apicid, unsigned long start_eip, unsigned int cpu);
+ /* wakeup secondary CPU using 64-bit wakeup point */
+ int (*wakeup_secondary_cpu_64)(u32 apicid, unsigned long start_eip, unsigned int cpu);
+
+ void (*update_vector)(unsigned int cpu, unsigned int vector, bool set);
+
+ char *name;
+};
+
+struct apic_override {
+ void (*eoi)(void);
+ void (*native_eoi)(void);
+ void (*write)(u32 reg, u32 v);
+ u32 (*read)(u32 reg);
+ void (*send_IPI)(int cpu, int vector);
+ void (*send_IPI_mask)(const struct cpumask *mask, int vector);
+ void (*send_IPI_mask_allbutself)(const struct cpumask *msk, int vec);
+ void (*send_IPI_allbutself)(int vector);
+ void (*send_IPI_all)(int vector);
+ void (*send_IPI_self)(int vector);
+ u64 (*icr_read)(void);
+ void (*icr_write)(u32 low, u32 high);
+ int (*wakeup_secondary_cpu)(u32 apicid, unsigned long start_eip, unsigned int cpu);
+ int (*wakeup_secondary_cpu_64)(u32 apicid, unsigned long start_eip, unsigned int cpu);
};
/*
@@ -347,232 +349,283 @@ struct apic {
extern struct apic *apic;
/*
+ * APIC drivers are probed based on how they are listed in the .apicdrivers
+ * section. So the order is important and enforced by the ordering
+ * of different apic driver files in the Makefile.
+ */
+#define apic_driver(sym) \
+ static const struct apic *__apicdrivers_##sym __used \
+ __aligned(sizeof(struct apic *)) \
+ __section(".apicdrivers") = { &sym }
+
+extern struct apic *__apicdrivers[], *__apicdrivers_end[];
+
+/*
* APIC functionality to boot other CPUs - only used on SMP:
*/
#ifdef CONFIG_SMP
-extern atomic_t init_deasserted;
-extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip);
+extern int lapic_can_unplug_cpu(void);
#endif
-static inline u32 apic_read(u32 reg)
-{
- return apic->read(reg);
+#ifdef CONFIG_X86_LOCAL_APIC
+extern struct apic_override __x86_apic_override;
+
+void __init apic_setup_apic_calls(void);
+void __init apic_install_driver(struct apic *driver);
+
+#define apic_update_callback(_callback, _fn) { \
+ __x86_apic_override._callback = _fn; \
+ apic->_callback = _fn; \
+ static_call_update(apic_call_##_callback, _fn); \
+ pr_info("APIC: %s() replaced with %ps()\n", #_callback, _fn); \
}
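A usage sketch (not part of this patch): a platform or hypervisor guest driver can reroute a single APIC callback at boot through apic_update_callback(). The replacement function and the setup routine below are hypothetical.

	/* Hypothetical replacement: acknowledge EOI through the MMIO window. */
	static void eoi_override_example(void)
	{
		native_apic_mem_write(APIC_EOI, APIC_EOI_ACK);
	}

	static void __init apic_override_example(void)
	{
		apic_update_callback(eoi, eoi_override_example);
	}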
-static inline void apic_write(u32 reg, u32 val)
+#define DECLARE_APIC_CALL(__cb) \
+ DECLARE_STATIC_CALL(apic_call_##__cb, *apic->__cb)
+
+DECLARE_APIC_CALL(eoi);
+DECLARE_APIC_CALL(native_eoi);
+DECLARE_APIC_CALL(icr_read);
+DECLARE_APIC_CALL(icr_write);
+DECLARE_APIC_CALL(read);
+DECLARE_APIC_CALL(send_IPI);
+DECLARE_APIC_CALL(send_IPI_mask);
+DECLARE_APIC_CALL(send_IPI_mask_allbutself);
+DECLARE_APIC_CALL(send_IPI_allbutself);
+DECLARE_APIC_CALL(send_IPI_all);
+DECLARE_APIC_CALL(send_IPI_self);
+DECLARE_APIC_CALL(wait_icr_idle);
+DECLARE_APIC_CALL(wakeup_secondary_cpu);
+DECLARE_APIC_CALL(wakeup_secondary_cpu_64);
+DECLARE_APIC_CALL(write);
+
+static __always_inline u32 apic_read(u32 reg)
{
- apic->write(reg, val);
+ return static_call(apic_call_read)(reg);
}
-static inline u64 apic_icr_read(void)
+static __always_inline void apic_write(u32 reg, u32 val)
{
- return apic->icr_read();
+ static_call(apic_call_write)(reg, val);
}
-static inline void apic_icr_write(u32 low, u32 high)
+static __always_inline void apic_eoi(void)
{
- apic->icr_write(low, high);
+ static_call(apic_call_eoi)();
}
-static inline void apic_wait_icr_idle(void)
+static __always_inline void apic_native_eoi(void)
{
- apic->wait_icr_idle();
+ static_call(apic_call_native_eoi)();
}
-static inline u32 safe_apic_wait_icr_idle(void)
+static __always_inline u64 apic_icr_read(void)
{
- return apic->safe_wait_icr_idle();
+ return static_call(apic_call_icr_read)();
}
-
-static inline void ack_APIC_irq(void)
+static __always_inline void apic_icr_write(u32 low, u32 high)
{
-#ifdef CONFIG_X86_LOCAL_APIC
- /*
- * ack_APIC_irq() actually gets compiled as a single instruction
- * ... yummie.
- */
-
- /* Docs say use 0 for future compatibility */
- apic_write(APIC_EOI, 0);
-#endif
+ static_call(apic_call_icr_write)(low, high);
}
-static inline unsigned default_get_apic_id(unsigned long x)
+static __always_inline void __apic_send_IPI(int cpu, int vector)
{
- unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
-
- if (APIC_XAPIC(ver) || boot_cpu_has(X86_FEATURE_EXTD_APICID))
- return (x >> 24) & 0xFF;
- else
- return (x >> 24) & 0x0F;
+ static_call(apic_call_send_IPI)(cpu, vector);
}
-/*
- * Warm reset vector default position:
- */
-#define DEFAULT_TRAMPOLINE_PHYS_LOW 0x467
-#define DEFAULT_TRAMPOLINE_PHYS_HIGH 0x469
-
-#ifdef CONFIG_X86_64
-extern struct apic apic_flat;
-extern struct apic apic_physflat;
-extern struct apic apic_x2apic_cluster;
-extern struct apic apic_x2apic_phys;
-extern int default_acpi_madt_oem_check(char *, char *);
-
-extern void apic_send_IPI_self(int vector);
-
-extern struct apic apic_x2apic_uv_x;
-DECLARE_PER_CPU(int, x2apic_extra_bits);
-
-extern int default_cpu_present_to_apicid(int mps_cpu);
-extern int default_check_phys_apicid_present(int boot_cpu_physical_apicid);
-#endif
-
-static inline void default_wait_for_init_deassert(atomic_t *deassert)
+static __always_inline void __apic_send_IPI_mask(const struct cpumask *mask, int vector)
{
- while (!atomic_read(deassert))
- cpu_relax();
- return;
+ static_call_mod(apic_call_send_IPI_mask)(mask, vector);
}
-extern void generic_bigsmp_probe(void);
-
+static __always_inline void __apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
+{
+ static_call(apic_call_send_IPI_mask_allbutself)(mask, vector);
+}
-#ifdef CONFIG_X86_LOCAL_APIC
+static __always_inline void __apic_send_IPI_allbutself(int vector)
+{
+ static_call(apic_call_send_IPI_allbutself)(vector);
+}
-#include <asm/smp.h>
+static __always_inline void __apic_send_IPI_all(int vector)
+{
+ static_call(apic_call_send_IPI_all)(vector);
+}
-#define APIC_DFR_VALUE (APIC_DFR_FLAT)
+static __always_inline void __apic_send_IPI_self(int vector)
+{
+ static_call_mod(apic_call_send_IPI_self)(vector);
+}
-static inline const struct cpumask *default_target_cpus(void)
+static __always_inline void apic_wait_icr_idle(void)
{
-#ifdef CONFIG_SMP
- return cpu_online_mask;
-#else
- return cpumask_of(0);
-#endif
+ static_call_cond(apic_call_wait_icr_idle)();
}
-DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid);
+static __always_inline u32 safe_apic_wait_icr_idle(void)
+{
+ return apic->safe_wait_icr_idle ? apic->safe_wait_icr_idle() : 0;
+}
+static __always_inline bool apic_id_valid(u32 apic_id)
+{
+ return apic_id <= apic->max_apic_id;
+}
-static inline unsigned int read_apic_id(void)
+static __always_inline void apic_update_vector(unsigned int cpu, unsigned int vector, bool set)
{
- unsigned int reg;
+ if (apic->update_vector)
+ apic->update_vector(cpu, vector, set);
+}
- reg = apic_read(APIC_ID);
+#else /* CONFIG_X86_LOCAL_APIC */
- return apic->get_apic_id(reg);
-}
+static inline u32 apic_read(u32 reg) { return 0; }
+static inline void apic_write(u32 reg, u32 val) { }
+static inline void apic_eoi(void) { }
+static inline u64 apic_icr_read(void) { return 0; }
+static inline void apic_icr_write(u32 low, u32 high) { }
+static inline void apic_wait_icr_idle(void) { }
+static inline u32 safe_apic_wait_icr_idle(void) { return 0; }
+static inline void apic_native_eoi(void) { WARN_ON_ONCE(1); }
+static inline void apic_setup_apic_calls(void) { }
+static inline void apic_update_vector(unsigned int cpu, unsigned int vector, bool set) { }
-extern void default_setup_apic_routing(void);
+#define apic_update_callback(_callback, _fn) do { } while (0)
-#ifdef CONFIG_X86_32
+#endif /* CONFIG_X86_LOCAL_APIC */
-extern struct apic apic_default;
+extern void apic_ack_irq(struct irq_data *data);
-/*
- * Set up the logical destination ID.
- *
- * Intel recommends to set DFR, LDR and TPR before enabling
- * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
- * document number 292116). So here it goes...
- */
-extern void default_init_apic_ldr(void);
+#define APIC_VECTOR_TO_BIT_NUMBER(v) ((unsigned int)(v) % 32)
+#define APIC_VECTOR_TO_REG_OFFSET(v) ((unsigned int)(v) / 32 * 0x10)
-static inline int default_apic_id_registered(void)
+static inline bool lapic_vector_set_in_irr(unsigned int vector)
{
- return physid_isset(read_apic_id(), phys_cpu_present_map);
+ u32 irr = apic_read(APIC_IRR + APIC_VECTOR_TO_REG_OFFSET(vector));
+
+ return !!(irr & (1U << APIC_VECTOR_TO_BIT_NUMBER(vector)));
}
-static inline int default_phys_pkg_id(int cpuid_apic, int index_msb)
+static inline bool is_vector_pending(unsigned int vector)
{
- return cpuid_apic >> index_msb;
+ return lapic_vector_set_in_irr(vector) || pi_pending_this_cpu(vector);
}
-extern int default_apicid_to_node(int logical_apicid);
+#define MAX_APIC_VECTOR 256
+#define APIC_VECTORS_PER_REG 32
-#endif
-
-static inline unsigned int
-default_cpu_mask_to_apicid(const struct cpumask *cpumask)
+/*
+ * Vector states are maintained by the APIC in 32-bit registers that
+ * are 16-byte aligned. The status of each vector is kept in a single
+ * bit.
+ */
+static inline int apic_find_highest_vector(void *bitmap)
{
- return cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
-}
+ int vec;
+ u32 *reg;
-static inline unsigned int
-default_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
- const struct cpumask *andmask)
-{
- unsigned long mask1 = cpumask_bits(cpumask)[0];
- unsigned long mask2 = cpumask_bits(andmask)[0];
- unsigned long mask3 = cpumask_bits(cpu_online_mask)[0];
+ for (vec = MAX_APIC_VECTOR - APIC_VECTORS_PER_REG; vec >= 0; vec -= APIC_VECTORS_PER_REG) {
+ reg = bitmap + APIC_VECTOR_TO_REG_OFFSET(vec);
+ if (*reg)
+ return __fls(*reg) + vec;
+ }
- return (unsigned int)(mask1 & mask2 & mask3);
+ return -1;
}
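
As a concrete illustration of the layout described in the comment above: vector 0x41 lands in the third 32-bit bank, so APIC_VECTOR_TO_REG_OFFSET(0x41) is (0x41 / 32) * 0x10 = 0x20 and APIC_VECTOR_TO_BIT_NUMBER(0x41) is 0x41 % 32 = 1. A hedged sketch of a direct IRR query using that mapping, equivalent to lapic_vector_set_in_irr(0x41):

/* Illustrative only: test whether vector 0x41 is pending in the IRR. */
static inline bool example_vector_0x41_in_irr(void)
{
	u32 reg = apic_read(APIC_IRR + 0x20);	/* bank covering vectors 64-95 */

	return reg & BIT(1);			/* bit 1 == vector 0x41 */
}
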
-static inline unsigned long default_check_apicid_used(physid_mask_t bitmap, int apicid)
+static inline u32 apic_get_reg(void *regs, int reg)
{
- return physid_isset(apicid, bitmap);
+ return *((u32 *) (regs + reg));
}
-static inline unsigned long default_check_apicid_present(int bit)
+static inline void apic_set_reg(void *regs, int reg, u32 val)
{
- return physid_isset(bit, phys_cpu_present_map);
+ *((u32 *) (regs + reg)) = val;
}
-static inline physid_mask_t default_ioapic_phys_id_map(physid_mask_t phys_map)
+static __always_inline u64 apic_get_reg64(void *regs, int reg)
{
- return phys_map;
+ BUILD_BUG_ON(reg != APIC_ICR);
+ return *((u64 *) (regs + reg));
}
-/* Mapping from cpu number to logical apicid */
-static inline int default_cpu_to_logical_apicid(int cpu)
+static __always_inline void apic_set_reg64(void *regs, int reg, u64 val)
{
- return 1 << cpu;
+ BUILD_BUG_ON(reg != APIC_ICR);
+ *((u64 *) (regs + reg)) = val;
}
-static inline int __default_cpu_present_to_apicid(int mps_cpu)
+static inline void apic_clear_vector(int vec, void *bitmap)
{
- if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu))
- return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
- else
- return BAD_APICID;
+ clear_bit(APIC_VECTOR_TO_BIT_NUMBER(vec), bitmap + APIC_VECTOR_TO_REG_OFFSET(vec));
}
-static inline int
-__default_check_phys_apicid_present(int boot_cpu_physical_apicid)
+static inline void apic_set_vector(int vec, void *bitmap)
{
- return physid_isset(boot_cpu_physical_apicid, phys_cpu_present_map);
+ set_bit(APIC_VECTOR_TO_BIT_NUMBER(vec), bitmap + APIC_VECTOR_TO_REG_OFFSET(vec));
}
-#ifdef CONFIG_X86_32
-static inline int default_cpu_present_to_apicid(int mps_cpu)
+static inline int apic_test_vector(int vec, void *bitmap)
{
- return __default_cpu_present_to_apicid(mps_cpu);
+ return test_bit(APIC_VECTOR_TO_BIT_NUMBER(vec), bitmap + APIC_VECTOR_TO_REG_OFFSET(vec));
}
-static inline int
-default_check_phys_apicid_present(int boot_cpu_physical_apicid)
+/*
+ * Warm reset vector position:
+ */
+#define TRAMPOLINE_PHYS_LOW 0x467
+#define TRAMPOLINE_PHYS_HIGH 0x469
+
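
A hedged sketch of how SMP boot code typically consumes these addresses (mirroring smpboot_setup_warm_reset_vector(); not part of this hunk): after a warm reset with CMOS shutdown code 0xA, the BIOS jumps to the real-mode segment:offset stored at physical 0x469/0x467, so the kernel points that vector at the AP trampoline.

/* Illustrative only: aim the warm-reset vector at the trampoline. */
static void example_setup_warm_reset_vector(unsigned long start_eip)
{
	/* segment goes to 0x469, offset to 0x467 */
	*((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) =
		start_eip >> 4;
	*((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) =
		start_eip & 0xf;
}
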
+#ifdef CONFIG_X86_LOCAL_APIC
+
+#include <asm/smp.h>
+
+extern struct apic apic_noop;
+
+static inline u32 read_apic_id(void)
{
- return __default_check_phys_apicid_present(boot_cpu_physical_apicid);
+ u32 reg = apic_read(APIC_ID);
+
+ return apic->get_apic_id(reg);
}
+
+#ifdef CONFIG_X86_64
+typedef int (*wakeup_cpu_handler)(int apicid, unsigned long start_eip);
+extern int default_acpi_madt_oem_check(char *, char *);
+extern void x86_64_probe_apic(void);
#else
-extern int default_cpu_present_to_apicid(int mps_cpu);
-extern int default_check_phys_apicid_present(int boot_cpu_physical_apicid);
+static inline int default_acpi_madt_oem_check(char *a, char *b) { return 0; }
+static inline void x86_64_probe_apic(void) { }
#endif
-static inline physid_mask_t default_apicid_to_cpu_present(int phys_apicid)
-{
- return physid_mask_of_physid(phys_apicid);
-}
+extern u32 apic_default_calc_apicid(unsigned int cpu);
+extern u32 apic_flat_calc_apicid(unsigned int cpu);
-#endif /* CONFIG_X86_LOCAL_APIC */
+extern u32 default_cpu_present_to_apicid(int mps_cpu);
+
+void apic_send_nmi_to_offline_cpu(unsigned int cpu);
+
+#else /* CONFIG_X86_LOCAL_APIC */
+
+static inline u32 read_apic_id(void) { return 0; }
-#ifdef CONFIG_X86_32
-extern u8 cpu_2_logical_apicid[NR_CPUS];
+#endif /* !CONFIG_X86_LOCAL_APIC */
+
+#ifdef CONFIG_SMP
+void apic_smt_update(void);
+#else
+static inline void apic_smt_update(void) { }
#endif
+struct msi_msg;
+struct irq_cfg;
+
+extern void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg,
+ bool dmar);
+
+extern void ioapic_zap_locks(void);
+
#endif /* _ASM_X86_APIC_H */
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 7ddb36ab933b..be39a543fbe5 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -1,6 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_APICDEF_H
#define _ASM_X86_APICDEF_H
+#include <linux/bits.h>
+
/*
* Constants for various Intel APICs. (local APIC, IOAPIC, etc.)
*
@@ -8,12 +11,27 @@
* Ingo Molnar <mingo@redhat.com>, 1999, 2000
*/
-#define APIC_DEFAULT_PHYS_BASE 0xfee00000
+#define IO_APIC_DEFAULT_PHYS_BASE 0xfec00000
+#define APIC_DEFAULT_PHYS_BASE 0xfee00000
+
+/*
+ * This is the IO-APIC register space as specified
+ * by Intel docs:
+ */
+#define IO_APIC_SLOT_SIZE 1024
+
+#define APIC_DELIVERY_MODE_FIXED 0
+#define APIC_DELIVERY_MODE_LOWESTPRIO 1
+#define APIC_DELIVERY_MODE_SMI 2
+#define APIC_DELIVERY_MODE_NMI 4
+#define APIC_DELIVERY_MODE_INIT 5
+#define APIC_DELIVERY_MODE_EXTINT 7
#define APIC_ID 0x20
#define APIC_LVR 0x30
#define APIC_LVR_MASK 0xFF00FF
+#define APIC_LVR_DIRECTED_EOI (1 << 24)
#define GET_APIC_VERSION(x) ((x) & 0xFFu)
#define GET_APIC_MAXLVT(x) (((x) >> 16) & 0xFFu)
#ifdef CONFIG_X86_32
@@ -29,7 +47,7 @@
#define APIC_ARBPRI_MASK 0xFFu
#define APIC_PROCPRI 0xA0
#define APIC_EOI 0xB0
-#define APIC_EIO_ACK 0x0
+#define APIC_EOI_ACK 0x0 /* Docs say 0 for future compat. */
#define APIC_RRR 0xC0
#define APIC_LDR 0xD0
#define APIC_LDR_MASK (0xFFu << 24)
@@ -40,6 +58,7 @@
#define APIC_DFR_CLUSTER 0x0FFFFFFFul
#define APIC_DFR_FLAT 0xFFFFFFFFul
#define APIC_SPIV 0xF0
+#define APIC_SPIV_DIRECTED_EOI (1 << 12)
#define APIC_SPIV_FOCUS_DISABLED (1 << 9)
#define APIC_SPIV_APIC_ENABLED (1 << 8)
#define APIC_ISR 0x100
@@ -69,6 +88,7 @@
#define APIC_DEST_LOGICAL 0x00800
#define APIC_DEST_PHYSICAL 0x00000
#define APIC_DM_FIXED 0x00000
+#define APIC_DM_FIXED_MASK 0x00700
#define APIC_DM_LOWEST 0x00100
#define APIC_DM_SMI 0x00200
#define APIC_DM_REMRD 0x00300
@@ -78,19 +98,15 @@
#define APIC_DM_EXTINT 0x00700
#define APIC_VECTOR_MASK 0x000FF
#define APIC_ICR2 0x310
-#define GET_APIC_DEST_FIELD(x) (((x) >> 24) & 0xFF)
-#define SET_APIC_DEST_FIELD(x) ((x) << 24)
+#define GET_XAPIC_DEST_FIELD(x) (((x) >> 24) & 0xFF)
+#define SET_XAPIC_DEST_FIELD(x) ((x) << 24)
#define APIC_LVTT 0x320
#define APIC_LVTTHMR 0x330
#define APIC_LVTPC 0x340
#define APIC_LVT0 0x350
-#define APIC_LVT_TIMER_BASE_MASK (0x3 << 18)
-#define GET_APIC_TIMER_BASE(x) (((x) >> 18) & 0x3)
-#define SET_APIC_TIMER_BASE(x) (((x) << 18))
-#define APIC_TIMER_BASE_CLKIN 0x0
-#define APIC_TIMER_BASE_TMBASE 0x1
-#define APIC_TIMER_BASE_DIV 0x2
+#define APIC_LVT_TIMER_ONESHOT (0 << 17)
#define APIC_LVT_TIMER_PERIODIC (1 << 17)
+#define APIC_LVT_TIMER_TSCDEADLINE (2 << 17)
#define APIC_LVT_MASKED (1 << 16)
#define APIC_LVT_LEVEL_TRIGGER (1 << 15)
#define APIC_LVT_REMOTE_IRR (1 << 14)
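
The three LVTT mode encodings added above (one-shot, periodic, TSC-deadline) select how the local APIC timer counts. A hedged sketch of arming the timer in periodic mode; LOCAL_TIMER_VECTOR, the divisor, and the count value are assumptions here, and the real setup lives in apic.c:

/* Illustrative only: periodic local APIC timer at the given count. */
static void example_start_periodic_timer(unsigned int initial_count)
{
	apic_write(APIC_LVTT, LOCAL_TIMER_VECTOR | APIC_LVT_TIMER_PERIODIC);
	apic_write(APIC_TDCR, APIC_TDR_DIV_16);	/* divide bus clock by 16 */
	apic_write(APIC_TMICT, initial_count);	/* writing TMICT starts it */
}
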
@@ -119,9 +135,12 @@
#define APIC_TDR_DIV_128 0xA
#define APIC_EFEAT 0x400
#define APIC_ECTRL 0x410
+#define APIC_SEOI 0x420
+#define APIC_IER 0x480
#define APIC_EILVTn(n) (0x500 + 0x10 * n)
#define APIC_EILVT_NR_AMD_K8 1 /* # of extended interrupts */
#define APIC_EILVT_NR_AMD_10H 4
+#define APIC_EILVT_NR_MAX APIC_EILVT_NR_AMD_10H
#define APIC_EILVT_LVTOFF(x) (((x) >> 4) & 0xF)
#define APIC_EILVT_MSG_FIX 0x0
#define APIC_EILVT_MSG_SMI 0x2
@@ -130,11 +149,14 @@
#define APIC_EILVT_MASKED (1 << 16)
#define APIC_BASE (fix_to_virt(FIX_APIC_BASE))
-#define APIC_BASE_MSR 0x800
-#define X2APIC_ENABLE (1UL << 10)
+#define APIC_BASE_MSR 0x800
+#define APIC_X2APIC_ID_MSR 0x802
+#define XAPIC_ENABLE BIT(11)
+#define X2APIC_ENABLE BIT(10)
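
These constants encode the x2APIC register mapping: each xAPIC MMIO offset maps to MSR APIC_BASE_MSR + (offset >> 4), which is why APIC_ID (0x20) becomes MSR 0x802, i.e. APIC_X2APIC_ID_MSR. Enabling x2APIC requires setting both XAPIC_ENABLE and X2APIC_ENABLE in IA32_APIC_BASE. A hedged sketch; MSR_IA32_APICBASE and the rdmsrl()/wrmsrl() helpers are assumed from <asm/msr.h> / <asm/msr-index.h>:

/* Illustrative only: read an x2APIC register and enable x2APIC mode. */
static u32 example_x2apic_read(u32 mmio_offset)
{
	u64 val;

	rdmsrl(APIC_BASE_MSR + (mmio_offset >> 4), val);
	return (u32)val;	/* e.g. offset 0x20 -> MSR 0x802 (APIC ID) */
}

static void example_enable_x2apic(void)
{
	u64 base;

	rdmsrl(MSR_IA32_APICBASE, base);
	wrmsrl(MSR_IA32_APICBASE, base | XAPIC_ENABLE | X2APIC_ENABLE);
}
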
#ifdef CONFIG_X86_32
# define MAX_IO_APICS 64
+# define MAX_LOCAL_APIC 256
#else
# define MAX_IO_APICS 128
# define MAX_LOCAL_APIC 32768
@@ -152,267 +174,10 @@
#define APIC_CPUID(apicid) ((apicid) & XAPIC_DEST_CPUS_MASK)
#define NUM_APIC_CLUSTERS ((BAD_APICID + 1) >> XAPIC_DEST_CPUS_SHIFT)
-/*
- * the local APIC register structure, memory mapped. Not terribly well
- * tested, but we might eventually use this one in the future - the
- * problem why we cannot use it right now is the P5 APIC, it has an
- * errata which cannot take 8-bit reads and writes, only 32-bit ones ...
- */
-#define u32 unsigned int
-
-struct local_apic {
-
-/*000*/ struct { u32 __reserved[4]; } __reserved_01;
-
-/*010*/ struct { u32 __reserved[4]; } __reserved_02;
-
-/*020*/ struct { /* APIC ID Register */
- u32 __reserved_1 : 24,
- phys_apic_id : 4,
- __reserved_2 : 4;
- u32 __reserved[3];
- } id;
-
-/*030*/ const
- struct { /* APIC Version Register */
- u32 version : 8,
- __reserved_1 : 8,
- max_lvt : 8,
- __reserved_2 : 8;
- u32 __reserved[3];
- } version;
-
-/*040*/ struct { u32 __reserved[4]; } __reserved_03;
-
-/*050*/ struct { u32 __reserved[4]; } __reserved_04;
-
-/*060*/ struct { u32 __reserved[4]; } __reserved_05;
-
-/*070*/ struct { u32 __reserved[4]; } __reserved_06;
-
-/*080*/ struct { /* Task Priority Register */
- u32 priority : 8,
- __reserved_1 : 24;
- u32 __reserved_2[3];
- } tpr;
-
-/*090*/ const
- struct { /* Arbitration Priority Register */
- u32 priority : 8,
- __reserved_1 : 24;
- u32 __reserved_2[3];
- } apr;
-
-/*0A0*/ const
- struct { /* Processor Priority Register */
- u32 priority : 8,
- __reserved_1 : 24;
- u32 __reserved_2[3];
- } ppr;
-
-/*0B0*/ struct { /* End Of Interrupt Register */
- u32 eoi;
- u32 __reserved[3];
- } eoi;
-
-/*0C0*/ struct { u32 __reserved[4]; } __reserved_07;
-
-/*0D0*/ struct { /* Logical Destination Register */
- u32 __reserved_1 : 24,
- logical_dest : 8;
- u32 __reserved_2[3];
- } ldr;
-
-/*0E0*/ struct { /* Destination Format Register */
- u32 __reserved_1 : 28,
- model : 4;
- u32 __reserved_2[3];
- } dfr;
-
-/*0F0*/ struct { /* Spurious Interrupt Vector Register */
- u32 spurious_vector : 8,
- apic_enabled : 1,
- focus_cpu : 1,
- __reserved_2 : 22;
- u32 __reserved_3[3];
- } svr;
-
-/*100*/ struct { /* In Service Register */
-/*170*/ u32 bitfield;
- u32 __reserved[3];
- } isr [8];
-
-/*180*/ struct { /* Trigger Mode Register */
-/*1F0*/ u32 bitfield;
- u32 __reserved[3];
- } tmr [8];
-
-/*200*/ struct { /* Interrupt Request Register */
-/*270*/ u32 bitfield;
- u32 __reserved[3];
- } irr [8];
-
-/*280*/ union { /* Error Status Register */
- struct {
- u32 send_cs_error : 1,
- receive_cs_error : 1,
- send_accept_error : 1,
- receive_accept_error : 1,
- __reserved_1 : 1,
- send_illegal_vector : 1,
- receive_illegal_vector : 1,
- illegal_register_address : 1,
- __reserved_2 : 24;
- u32 __reserved_3[3];
- } error_bits;
- struct {
- u32 errors;
- u32 __reserved_3[3];
- } all_errors;
- } esr;
-
-/*290*/ struct { u32 __reserved[4]; } __reserved_08;
-
-/*2A0*/ struct { u32 __reserved[4]; } __reserved_09;
-
-/*2B0*/ struct { u32 __reserved[4]; } __reserved_10;
-
-/*2C0*/ struct { u32 __reserved[4]; } __reserved_11;
-
-/*2D0*/ struct { u32 __reserved[4]; } __reserved_12;
-
-/*2E0*/ struct { u32 __reserved[4]; } __reserved_13;
-
-/*2F0*/ struct { u32 __reserved[4]; } __reserved_14;
-
-/*300*/ struct { /* Interrupt Command Register 1 */
- u32 vector : 8,
- delivery_mode : 3,
- destination_mode : 1,
- delivery_status : 1,
- __reserved_1 : 1,
- level : 1,
- trigger : 1,
- __reserved_2 : 2,
- shorthand : 2,
- __reserved_3 : 12;
- u32 __reserved_4[3];
- } icr1;
-
-/*310*/ struct { /* Interrupt Command Register 2 */
- union {
- u32 __reserved_1 : 24,
- phys_dest : 4,
- __reserved_2 : 4;
- u32 __reserved_3 : 24,
- logical_dest : 8;
- } dest;
- u32 __reserved_4[3];
- } icr2;
-
-/*320*/ struct { /* LVT - Timer */
- u32 vector : 8,
- __reserved_1 : 4,
- delivery_status : 1,
- __reserved_2 : 3,
- mask : 1,
- timer_mode : 1,
- __reserved_3 : 14;
- u32 __reserved_4[3];
- } lvt_timer;
-
-/*330*/ struct { /* LVT - Thermal Sensor */
- u32 vector : 8,
- delivery_mode : 3,
- __reserved_1 : 1,
- delivery_status : 1,
- __reserved_2 : 3,
- mask : 1,
- __reserved_3 : 15;
- u32 __reserved_4[3];
- } lvt_thermal;
-
-/*340*/ struct { /* LVT - Performance Counter */
- u32 vector : 8,
- delivery_mode : 3,
- __reserved_1 : 1,
- delivery_status : 1,
- __reserved_2 : 3,
- mask : 1,
- __reserved_3 : 15;
- u32 __reserved_4[3];
- } lvt_pc;
-
-/*350*/ struct { /* LVT - LINT0 */
- u32 vector : 8,
- delivery_mode : 3,
- __reserved_1 : 1,
- delivery_status : 1,
- polarity : 1,
- remote_irr : 1,
- trigger : 1,
- mask : 1,
- __reserved_2 : 15;
- u32 __reserved_3[3];
- } lvt_lint0;
-
-/*360*/ struct { /* LVT - LINT1 */
- u32 vector : 8,
- delivery_mode : 3,
- __reserved_1 : 1,
- delivery_status : 1,
- polarity : 1,
- remote_irr : 1,
- trigger : 1,
- mask : 1,
- __reserved_2 : 15;
- u32 __reserved_3[3];
- } lvt_lint1;
-
-/*370*/ struct { /* LVT - Error */
- u32 vector : 8,
- __reserved_1 : 4,
- delivery_status : 1,
- __reserved_2 : 3,
- mask : 1,
- __reserved_3 : 15;
- u32 __reserved_4[3];
- } lvt_error;
-
-/*380*/ struct { /* Timer Initial Count Register */
- u32 initial_count;
- u32 __reserved_2[3];
- } timer_icr;
-
-/*390*/ const
- struct { /* Timer Current Count Register */
- u32 curr_count;
- u32 __reserved_2[3];
- } timer_ccr;
-
-/*3A0*/ struct { u32 __reserved[4]; } __reserved_16;
-
-/*3B0*/ struct { u32 __reserved[4]; } __reserved_17;
-
-/*3C0*/ struct { u32 __reserved[4]; } __reserved_18;
-
-/*3D0*/ struct { u32 __reserved[4]; } __reserved_19;
-
-/*3E0*/ struct { /* Timer Divide Configuration Register */
- u32 divisor : 4,
- __reserved_1 : 28;
- u32 __reserved_2[3];
- } timer_dcr;
-
-/*3F0*/ struct { u32 __reserved[4]; } __reserved_20;
-
-} __attribute__ ((packed));
-
-#undef u32
-
#ifdef CONFIG_X86_32
#define BAD_APICID 0xFFu
#else
#define BAD_APICID 0xFFFFu
#endif
+
#endif /* _ASM_X86_APICDEF_H */
diff --git a/arch/x86/include/asm/apicnum.h b/arch/x86/include/asm/apicnum.h
deleted file mode 100644
index 82f613c607ce..000000000000
--- a/arch/x86/include/asm/apicnum.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef _ASM_X86_APICNUM_H
-#define _ASM_X86_APICNUM_H
-
-/* define MAX_IO_APICS */
-#ifdef CONFIG_X86_32
-# define MAX_IO_APICS 64
-#else
-# define MAX_IO_APICS 128
-# define MAX_LOCAL_APIC 32768
-#endif
-
-#endif /* _ASM_X86_APICNUM_H */
diff --git a/arch/x86/include/asm/apm.h b/arch/x86/include/asm/apm.h
index 20370c6db74b..4d4015ddcf26 100644
--- a/arch/x86/include/asm/apm.h
+++ b/arch/x86/include/asm/apm.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Machine specific APM BIOS functions for generic.
* Split out from apm.c by Osamu Tomita <tomita@cinet.co.jp>
@@ -45,11 +46,11 @@ static inline void apm_bios_call_asm(u32 func, u32 ebx_in, u32 ecx_in,
: "memory", "cc");
}
-static inline u8 apm_bios_call_simple_asm(u32 func, u32 ebx_in,
- u32 ecx_in, u32 *eax)
+static inline bool apm_bios_call_simple_asm(u32 func, u32 ebx_in,
+ u32 ecx_in, u32 *eax)
{
int cx, dx, si;
- u8 error;
+ bool error;
/*
* N.B. We do NOT need a cld after the BIOS call
diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h
new file mode 100644
index 000000000000..b5982b94bdba
--- /dev/null
+++ b/arch/x86/include/asm/arch_hweight.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_HWEIGHT_H
+#define _ASM_X86_HWEIGHT_H
+
+#include <asm/cpufeatures.h>
+
+#ifdef CONFIG_64BIT
+#define REG_IN "D"
+#define REG_OUT "a"
+#else
+#define REG_IN "a"
+#define REG_OUT "a"
+#endif
+
+static __always_inline unsigned int __arch_hweight32(unsigned int w)
+{
+ unsigned int res;
+
+ asm_inline (ALTERNATIVE("call __sw_hweight32",
+ "popcntl %[val], %[cnt]", X86_FEATURE_POPCNT)
+ : [cnt] "=" REG_OUT (res), ASM_CALL_CONSTRAINT
+ : [val] REG_IN (w));
+
+ return res;
+}
+
+static inline unsigned int __arch_hweight16(unsigned int w)
+{
+ return __arch_hweight32(w & 0xffff);
+}
+
+static inline unsigned int __arch_hweight8(unsigned int w)
+{
+ return __arch_hweight32(w & 0xff);
+}
+
+#ifdef CONFIG_X86_32
+static inline unsigned long __arch_hweight64(__u64 w)
+{
+ return __arch_hweight32((u32)w) +
+ __arch_hweight32((u32)(w >> 32));
+}
+#else
+static __always_inline unsigned long __arch_hweight64(__u64 w)
+{
+ unsigned long res;
+
+ asm_inline (ALTERNATIVE("call __sw_hweight64",
+ "popcntq %[val], %[cnt]", X86_FEATURE_POPCNT)
+ : [cnt] "=" REG_OUT (res), ASM_CALL_CONSTRAINT
+ : [val] REG_IN (w));
+
+ return res;
+}
+#endif /* CONFIG_X86_32 */
+
+#endif
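
Generic code does not call __arch_hweight*() directly; it reaches these helpers through the hweight*() wrappers, which pick the popcnt alternative when X86_FEATURE_POPCNT is present and otherwise call the __sw_hweight* fallbacks. A hedged usage sketch:

#include <linux/bitops.h>

/* Illustrative only: count set bits in a CPU mask word. */
static unsigned int example_count_bits(u64 mask)
{
	return hweight64(mask);		/* popcntq when the CPU supports it */
}
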
diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h
new file mode 100644
index 000000000000..4c305305871b
--- /dev/null
+++ b/arch/x86/include/asm/archrandom.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * This file is part of the Linux kernel.
+ *
+ * Copyright (c) 2011-2014, Intel Corporation
+ * Authors: Fenghua Yu <fenghua.yu@intel.com>,
+ * H. Peter Anvin <hpa@linux.intel.com>
+ */
+
+#ifndef ASM_X86_ARCHRANDOM_H
+#define ASM_X86_ARCHRANDOM_H
+
+#include <asm/processor.h>
+#include <asm/cpufeature.h>
+
+#define RDRAND_RETRY_LOOPS 10
+
+/* Unconditional execution of RDRAND and RDSEED */
+
+static inline bool __must_check rdrand_long(unsigned long *v)
+{
+ bool ok;
+ unsigned int retry = RDRAND_RETRY_LOOPS;
+ do {
+ asm volatile("rdrand %[out]"
+ : "=@ccc" (ok), [out] "=r" (*v));
+ if (ok)
+ return true;
+ } while (--retry);
+ return false;
+}
+
+static inline bool __must_check rdseed_long(unsigned long *v)
+{
+ bool ok;
+ asm volatile("rdseed %[out]"
+ : "=@ccc" (ok), [out] "=r" (*v));
+ return ok;
+}
+
+/*
+ * These are the generic interfaces; they must not be declared if the
+ * stubs in <linux/random.h> are to be invoked.
+ */
+
+static inline size_t __must_check arch_get_random_longs(unsigned long *v, size_t max_longs)
+{
+ return max_longs && static_cpu_has(X86_FEATURE_RDRAND) && rdrand_long(v) ? 1 : 0;
+}
+
+static inline size_t __must_check arch_get_random_seed_longs(unsigned long *v, size_t max_longs)
+{
+ return max_longs && static_cpu_has(X86_FEATURE_RDSEED) && rdseed_long(v) ? 1 : 0;
+}
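
A hedged sketch of how the interfaces above are typically consumed; callers must handle a zero return, since RDRAND/RDSEED may be absent or may fail even after retries:

/* Illustrative only: best-effort hardware entropy, preferring RDSEED. */
static unsigned long example_hw_random(void)
{
	unsigned long v = 0;

	if (!arch_get_random_seed_longs(&v, 1))
		(void)arch_get_random_longs(&v, 1);

	return v;	/* still 0 if neither instruction delivered a value */
}
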
+
+#ifndef CONFIG_UML
+void x86_init_rdrand(struct cpuinfo_x86 *c);
+#endif
+
+#endif /* ASM_X86_ARCHRANDOM_H */
diff --git a/arch/x86/include/asm/asm-offsets.h b/arch/x86/include/asm/asm-offsets.h
new file mode 100644
index 000000000000..d370ee36a182
--- /dev/null
+++ b/arch/x86/include/asm/asm-offsets.h
@@ -0,0 +1 @@
+#include <generated/asm-offsets.h>
diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h
new file mode 100644
index 000000000000..11c6fecc3ad7
--- /dev/null
+++ b/arch/x86/include/asm/asm-prototypes.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <asm/ftrace.h>
+#include <linux/uaccess.h>
+#include <linux/pgtable.h>
+#include <asm/string.h>
+#include <asm/page.h>
+#include <asm/checksum.h>
+#include <asm/mce.h>
+
+#include <asm-generic/asm-prototypes.h>
+
+#include <asm/special_insns.h>
+#include <asm/preempt.h>
+#include <asm/asm.h>
+#include <asm/fred.h>
+#include <asm/gsseg.h>
+#include <asm/nospec-branch.h>
+
+#ifndef CONFIG_X86_CX8
+extern void cmpxchg8b_emu(void);
+#endif
+
+#ifdef CONFIG_STACKPROTECTOR
+extern unsigned long __ref_stack_chk_guard;
+#endif
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 56be78f582f0..0e8c611bc9e2 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -1,22 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_ASM_H
#define _ASM_X86_ASM_H
-#ifdef __ASSEMBLY__
-# define __ASM_FORM(x) x
-# define __ASM_EX_SEC .section __ex_table
+#include <linux/annotate.h>
+
+#ifdef __ASSEMBLER__
+# define __ASM_FORM(x, ...) x,## __VA_ARGS__
+# define __ASM_FORM_RAW(x, ...) x,## __VA_ARGS__
+# define __ASM_FORM_COMMA(x, ...) x,## __VA_ARGS__,
+# define __ASM_REGPFX %
#else
-# define __ASM_FORM(x) " " #x " "
-# define __ASM_EX_SEC " .section __ex_table,\"a\"\n"
+#include <linux/stringify.h>
+# define __ASM_FORM(x, ...) " " __stringify(x,##__VA_ARGS__) " "
+# define __ASM_FORM_RAW(x, ...) __stringify(x,##__VA_ARGS__)
+# define __ASM_FORM_COMMA(x, ...) " " __stringify(x,##__VA_ARGS__) ","
+# define __ASM_REGPFX %%
#endif
-#ifdef CONFIG_X86_32
-# define __ASM_SEL(a,b) __ASM_FORM(a)
+#define _ASM_BYTES(x, ...) __ASM_FORM(.byte x,##__VA_ARGS__ ;)
+
+#ifndef __x86_64__
+/* 32 bit */
+# define __ASM_SEL(a,b) __ASM_FORM(a)
+# define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(a)
#else
-# define __ASM_SEL(a,b) __ASM_FORM(b)
+/* 64 bit */
+# define __ASM_SEL(a,b) __ASM_FORM(b)
+# define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(b)
#endif
-#define __ASM_SIZE(inst) __ASM_SEL(inst##l, inst##q)
-#define __ASM_REG(reg) __ASM_SEL(e##reg, r##reg)
+#define __ASM_SIZE(inst, ...) __ASM_SEL(inst##l##__VA_ARGS__, \
+ inst##q##__VA_ARGS__)
+#define __ASM_REG(reg) __ASM_SEL_RAW(e##reg, r##reg)
#define _ASM_PTR __ASM_SEL(.long, .quad)
#define _ASM_ALIGN __ASM_SEL(.balign 4, .balign 8)
@@ -27,6 +42,7 @@
#define _ASM_ADD __ASM_SIZE(add)
#define _ASM_SUB __ASM_SIZE(sub)
#define _ASM_XADD __ASM_SIZE(xadd)
+#define _ASM_MUL __ASM_SIZE(mul)
#define _ASM_AX __ASM_REG(ax)
#define _ASM_BX __ASM_REG(bx)
@@ -37,11 +53,205 @@
#define _ASM_SI __ASM_REG(si)
#define _ASM_DI __ASM_REG(di)
+/* Adds a (%rip) suffix on 64 bits only; for immediate memory references */
+#define _ASM_RIP(x) __ASM_SEL_RAW(x, x (__ASM_REGPFX rip))
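
_ASM_RIP() lets a single inline-asm template reference a global symbol RIP-relative on 64-bit while degrading to an absolute reference on 32-bit. A hedged sketch; example_flag is a hypothetical symbol, not part of this patch:

extern unsigned char example_flag;

/* Illustrative only: test a byte flag with a position-independent access. */
static __always_inline bool example_flag_is_set(void)
{
	bool ret;

	asm volatile("cmpb $0, " _ASM_RIP(example_flag)
		     : "=@ccne" (ret)
		     : "m" (example_flag));
	return ret;
}
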
+
+#ifndef __x86_64__
+/* 32 bit */
+
+#define _ASM_ARG1 _ASM_AX
+#define _ASM_ARG2 _ASM_DX
+#define _ASM_ARG3 _ASM_CX
+
+#define _ASM_ARG1L eax
+#define _ASM_ARG2L edx
+#define _ASM_ARG3L ecx
+
+#define _ASM_ARG1W ax
+#define _ASM_ARG2W dx
+#define _ASM_ARG3W cx
+
+#define _ASM_ARG1B al
+#define _ASM_ARG2B dl
+#define _ASM_ARG3B cl
+
+#else
+/* 64 bit */
+
+#define _ASM_ARG1 _ASM_DI
+#define _ASM_ARG2 _ASM_SI
+#define _ASM_ARG3 _ASM_DX
+#define _ASM_ARG4 _ASM_CX
+#define _ASM_ARG5 r8
+#define _ASM_ARG6 r9
+
+#define _ASM_ARG1Q rdi
+#define _ASM_ARG2Q rsi
+#define _ASM_ARG3Q rdx
+#define _ASM_ARG4Q rcx
+#define _ASM_ARG5Q r8
+#define _ASM_ARG6Q r9
+
+#define _ASM_ARG1L edi
+#define _ASM_ARG2L esi
+#define _ASM_ARG3L edx
+#define _ASM_ARG4L ecx
+#define _ASM_ARG5L r8d
+#define _ASM_ARG6L r9d
+
+#define _ASM_ARG1W di
+#define _ASM_ARG2W si
+#define _ASM_ARG3W dx
+#define _ASM_ARG4W cx
+#define _ASM_ARG5W r8w
+#define _ASM_ARG6W r9w
+
+#define _ASM_ARG1B dil
+#define _ASM_ARG2B sil
+#define _ASM_ARG3B dl
+#define _ASM_ARG4B cl
+#define _ASM_ARG5B r8b
+#define _ASM_ARG6B r9b
+
+#endif
+
+#ifndef __ASSEMBLER__
+static __always_inline __pure void *rip_rel_ptr(void *p)
+{
+ asm("leaq %c1(%%rip), %0" : "=r"(p) : "i"(p));
+
+ return p;
+}
+#endif
+
+#ifdef __KERNEL__
+
+#ifndef COMPILE_OFFSETS
+#include <asm/asm-offsets.h>
+#endif
+
+# include <asm/extable_fixup_types.h>
+
/* Exception table entry */
-# define _ASM_EXTABLE(from,to) \
- __ASM_EX_SEC \
- _ASM_ALIGN "\n" \
- _ASM_PTR #from "," #to "\n" \
- " .previous\n"
+#ifdef __ASSEMBLER__
+
+# define _ASM_EXTABLE_TYPE(from, to, type) \
+ .pushsection "__ex_table", "aM", @progbits, EXTABLE_SIZE ; \
+ .balign 4 ; \
+ .long (from) - . ; \
+ .long (to) - . ; \
+ .long type ; \
+ .popsection
+
+# ifdef CONFIG_KPROBES
+# define _ASM_NOKPROBE(entry) \
+ .pushsection "_kprobe_blacklist","aw" ; \
+ _ASM_ALIGN ; \
+ _ASM_PTR (entry); \
+ .popsection
+# else
+# define _ASM_NOKPROBE(entry)
+# endif
+
+#else /* ! __ASSEMBLER__ */
+
+# define DEFINE_EXTABLE_TYPE_REG \
+ ".macro extable_type_reg type:req reg:req\n" \
+ ".set .Lfound, 0\n" \
+ ".set .Lregnr, 0\n" \
+ ".irp rs,rax,rcx,rdx,rbx,rsp,rbp,rsi,rdi,r8,r9,r10,r11,r12,r13,r14,r15\n" \
+ ".ifc \\reg, %%\\rs\n" \
+ ".set .Lfound, .Lfound+1\n" \
+ ".long \\type + (.Lregnr << 8)\n" \
+ ".endif\n" \
+ ".set .Lregnr, .Lregnr+1\n" \
+ ".endr\n" \
+ ".set .Lregnr, 0\n" \
+ ".irp rs,eax,ecx,edx,ebx,esp,ebp,esi,edi,r8d,r9d,r10d,r11d,r12d,r13d,r14d,r15d\n" \
+ ".ifc \\reg, %%\\rs\n" \
+ ".set .Lfound, .Lfound+1\n" \
+ ".long \\type + (.Lregnr << 8)\n" \
+ ".endif\n" \
+ ".set .Lregnr, .Lregnr+1\n" \
+ ".endr\n" \
+ ".if (.Lfound != 1)\n" \
+ ".error \"extable_type_reg: bad register argument\"\n" \
+ ".endif\n" \
+ ".endm\n"
+
+# define UNDEFINE_EXTABLE_TYPE_REG \
+ ".purgem extable_type_reg\n"
+
+# define _ASM_EXTABLE_TYPE(from, to, type) \
+ " .pushsection __ex_table, \"aM\", @progbits, " \
+ __stringify(EXTABLE_SIZE) "\n" \
+ " .balign 4\n" \
+ " .long (" #from ") - .\n" \
+ " .long (" #to ") - .\n" \
+ " .long " __stringify(type) " \n" \
+ " .popsection\n"
+
+# define _ASM_EXTABLE_TYPE_REG(from, to, type, reg) \
+ " .pushsection __ex_table, \"aM\", @progbits, " \
+ __stringify(EXTABLE_SIZE) "\n" \
+ " .balign 4\n" \
+ " .long (" #from ") - .\n" \
+ " .long (" #to ") - .\n" \
+ DEFINE_EXTABLE_TYPE_REG \
+ "extable_type_reg reg=" __stringify(reg) ", type=" __stringify(type) " \n"\
+ UNDEFINE_EXTABLE_TYPE_REG \
+ " .popsection\n"
+
+/* For C files, we already have the NOKPROBE_SYMBOL macro */
+
+/* Insert a comma if args are non-empty */
+#define COMMA(x...) __COMMA(x)
+#define __COMMA(...) , ##__VA_ARGS__
+
+/*
+ * Combine multiple asm inline constraint args into a single arg for passing to
+ * another macro.
+ */
+#define ASM_OUTPUT(x...) x
+#define ASM_INPUT(x...) x
+
+/*
+ * This output constraint should be used for any inline asm which has a "call"
+ * instruction. Otherwise the asm may be inserted before the frame pointer
+ * gets set up by the containing function. If you forget to do this, objtool
+ * may print a "call without frame pointer save/setup" warning.
+ */
+register unsigned long current_stack_pointer asm(_ASM_SP);
+#define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer)
+#endif /* __ASSEMBLER__ */
+
+#define _ASM_EXTABLE(from, to) \
+ _ASM_EXTABLE_TYPE(from, to, EX_TYPE_DEFAULT)
+
+#define _ASM_EXTABLE_UA(from, to) \
+ _ASM_EXTABLE_TYPE(from, to, EX_TYPE_UACCESS)
+
+#define _ASM_EXTABLE_FAULT(from, to) \
+ _ASM_EXTABLE_TYPE(from, to, EX_TYPE_FAULT)
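
A hedged sketch of the C-side usage (illustrative, not from this patch): an _ASM_EXTABLE() entry makes a faulting access resume at the fixup label instead of oopsing, which is the EX_TYPE_DEFAULT behaviour.

/* Illustrative only: a store that is silently skipped if it faults. */
static inline void example_store_may_fault(u32 *addr, u32 val)
{
	asm volatile("1:	movl %1, %0\n"
		     "2:\n"
		     _ASM_EXTABLE(1b, 2b)
		     : "=m" (*addr)
		     : "r" (val));
}
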
+
+/*
+ * Both i386 and x86_64 return 64-bit values in edx:eax for certain
+ * instructions, but GCC's "A" constraint has different meanings.
+ * For i386, "A" means exactly edx:eax, while for x86_64 it
+ * means rax *or* rdx.
+ *
+ * These helpers wrap those semantic differences and save one
+ * instruction clearing the high half of 'low':
+ */
+#ifdef CONFIG_X86_64
+# define EAX_EDX_DECLARE_ARGS(val, low, high) unsigned long low, high
+# define EAX_EDX_VAL(val, low, high) ((low) | (high) << 32)
+# define EAX_EDX_RET(val, low, high) "=a" (low), "=d" (high)
+#else
+# define EAX_EDX_DECLARE_ARGS(val, low, high) u64 val
+# define EAX_EDX_VAL(val, low, high) (val)
+# define EAX_EDX_RET(val, low, high) "=A" (val)
+#endif
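
A hedged usage sketch in the style of <asm/msr.h>: the same RDTSC wrapper compiles on both 32-bit (where "A" means edx:eax) and 64-bit (where the value is assembled from two registers):

/* Illustrative only: read the TSC with the EAX_EDX_* helpers. */
static __always_inline u64 example_rdtsc(void)
{
	EAX_EDX_DECLARE_ARGS(val, low, high);

	asm volatile("rdtsc" : EAX_EDX_RET(val, low, high));

	return EAX_EDX_VAL(val, low, high);
}
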
+#endif /* __KERNEL__ */
#endif /* _ASM_X86_ASM_H */
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 4e1b8873c474..75743f1dfd4e 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -1,5 +1,177 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_ATOMIC_H
+#define _ASM_X86_ATOMIC_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include <asm/alternative.h>
+#include <asm/cmpxchg.h>
+#include <asm/rmwcc.h>
+#include <asm/barrier.h>
+
+/*
+ * Atomic operations that C can't guarantee us. Useful for
+ * resource counting etc..
+ */
+
+static __always_inline int arch_atomic_read(const atomic_t *v)
+{
+ /*
+ * Note for KASAN: we deliberately don't use READ_ONCE_NOCHECK() here,
+	 * it's a non-inlined function that increases binary size and stack usage.
+ */
+ return __READ_ONCE((v)->counter);
+}
+
+static __always_inline void arch_atomic_set(atomic_t *v, int i)
+{
+ __WRITE_ONCE(v->counter, i);
+}
+
+static __always_inline void arch_atomic_add(int i, atomic_t *v)
+{
+ asm_inline volatile(LOCK_PREFIX "addl %1, %0"
+ : "+m" (v->counter)
+ : "ir" (i) : "memory");
+}
+
+static __always_inline void arch_atomic_sub(int i, atomic_t *v)
+{
+ asm_inline volatile(LOCK_PREFIX "subl %1, %0"
+ : "+m" (v->counter)
+ : "ir" (i) : "memory");
+}
+
+static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v)
+{
+ return GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, e, "er", i);
+}
+#define arch_atomic_sub_and_test arch_atomic_sub_and_test
+
+static __always_inline void arch_atomic_inc(atomic_t *v)
+{
+ asm_inline volatile(LOCK_PREFIX "incl %0"
+ : "+m" (v->counter) :: "memory");
+}
+#define arch_atomic_inc arch_atomic_inc
+
+static __always_inline void arch_atomic_dec(atomic_t *v)
+{
+ asm_inline volatile(LOCK_PREFIX "decl %0"
+ : "+m" (v->counter) :: "memory");
+}
+#define arch_atomic_dec arch_atomic_dec
+
+static __always_inline bool arch_atomic_dec_and_test(atomic_t *v)
+{
+ return GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, e);
+}
+#define arch_atomic_dec_and_test arch_atomic_dec_and_test
+
+static __always_inline bool arch_atomic_inc_and_test(atomic_t *v)
+{
+ return GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, e);
+}
+#define arch_atomic_inc_and_test arch_atomic_inc_and_test
+
+static __always_inline bool arch_atomic_add_negative(int i, atomic_t *v)
+{
+ return GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, s, "er", i);
+}
+#define arch_atomic_add_negative arch_atomic_add_negative
+
+static __always_inline int arch_atomic_add_return(int i, atomic_t *v)
+{
+ return i + xadd(&v->counter, i);
+}
+#define arch_atomic_add_return arch_atomic_add_return
+
+#define arch_atomic_sub_return(i, v) arch_atomic_add_return(-(i), v)
+
+static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v)
+{
+ return xadd(&v->counter, i);
+}
+#define arch_atomic_fetch_add arch_atomic_fetch_add
+
+#define arch_atomic_fetch_sub(i, v) arch_atomic_fetch_add(-(i), v)
+
+static __always_inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new)
+{
+ return arch_cmpxchg(&v->counter, old, new);
+}
+#define arch_atomic_cmpxchg arch_atomic_cmpxchg
+
+static __always_inline bool arch_atomic_try_cmpxchg(atomic_t *v, int *old, int new)
+{
+ return arch_try_cmpxchg(&v->counter, old, new);
+}
+#define arch_atomic_try_cmpxchg arch_atomic_try_cmpxchg
+
+static __always_inline int arch_atomic_xchg(atomic_t *v, int new)
+{
+ return arch_xchg(&v->counter, new);
+}
+#define arch_atomic_xchg arch_atomic_xchg
+
+static __always_inline void arch_atomic_and(int i, atomic_t *v)
+{
+ asm_inline volatile(LOCK_PREFIX "andl %1, %0"
+ : "+m" (v->counter)
+ : "ir" (i)
+ : "memory");
+}
+
+static __always_inline int arch_atomic_fetch_and(int i, atomic_t *v)
+{
+ int val = arch_atomic_read(v);
+
+ do { } while (!arch_atomic_try_cmpxchg(v, &val, val & i));
+
+ return val;
+}
+#define arch_atomic_fetch_and arch_atomic_fetch_and
+
+static __always_inline void arch_atomic_or(int i, atomic_t *v)
+{
+ asm_inline volatile(LOCK_PREFIX "orl %1, %0"
+ : "+m" (v->counter)
+ : "ir" (i)
+ : "memory");
+}
+
+static __always_inline int arch_atomic_fetch_or(int i, atomic_t *v)
+{
+ int val = arch_atomic_read(v);
+
+ do { } while (!arch_atomic_try_cmpxchg(v, &val, val | i));
+
+ return val;
+}
+#define arch_atomic_fetch_or arch_atomic_fetch_or
+
+static __always_inline void arch_atomic_xor(int i, atomic_t *v)
+{
+ asm_inline volatile(LOCK_PREFIX "xorl %1, %0"
+ : "+m" (v->counter)
+ : "ir" (i)
+ : "memory");
+}
+
+static __always_inline int arch_atomic_fetch_xor(int i, atomic_t *v)
+{
+ int val = arch_atomic_read(v);
+
+ do { } while (!arch_atomic_try_cmpxchg(v, &val, val ^ i));
+
+ return val;
+}
+#define arch_atomic_fetch_xor arch_atomic_fetch_xor
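
Kernel code does not call these arch_atomic_*() helpers directly; the instrumented wrappers in <linux/atomic.h> forward to them on x86. A hedged usage sketch:

#include <linux/atomic.h>

static atomic_t example_refcount = ATOMIC_INIT(1);

/* Illustrative only: the wrappers end up as "lock incl"/"lock decl". */
static void example_get(void)
{
	atomic_inc(&example_refcount);
}

static bool example_put(void)
{
	return atomic_dec_and_test(&example_refcount);
}
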
+
#ifdef CONFIG_X86_32
-# include "atomic_32.h"
+# include <asm/atomic64_32.h>
#else
-# include "atomic_64.h"
+# include <asm/atomic64_64.h>
#endif
+
+#endif /* _ASM_X86_ATOMIC_H */
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
new file mode 100644
index 000000000000..ab838205c1c6
--- /dev/null
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -0,0 +1,312 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_ATOMIC64_32_H
+#define _ASM_X86_ATOMIC64_32_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+//#include <asm/cmpxchg.h>
+
+/* A 64-bit atomic type */
+
+typedef struct {
+ s64 __aligned(8) counter;
+} atomic64_t;
+
+#define ATOMIC64_INIT(val) { (val) }
+
+/*
+ * Read an atomic64_t non-atomically.
+ *
+ * This is intended to be used in cases where a subsequent atomic operation
+ * will handle the torn value, and can be used to prime the first iteration
+ * of unconditional try_cmpxchg() loops, e.g.:
+ *
+ * s64 val = arch_atomic64_read_nonatomic(v);
+ * do { } while (!arch_atomic64_try_cmpxchg(v, &val, val OP i));
+ *
+ * This is NOT safe to use where the value is not always checked by a
+ * subsequent atomic operation, such as in conditional try_cmpxchg() loops
+ * that can break before the atomic operation, e.g.:
+ *
+ * s64 val = arch_atomic64_read_nonatomic(v);
+ * do {
+ * if (condition(val))
+ * break;
+ * } while (!arch_atomic64_try_cmpxchg(v, &val, val OP i));
+ */
+static __always_inline s64 arch_atomic64_read_nonatomic(const atomic64_t *v)
+{
+ /* See comment in arch_atomic_read(). */
+ return __READ_ONCE(v->counter);
+}
+
+#define __ATOMIC64_DECL(sym) void atomic64_##sym(atomic64_t *, ...)
+#ifndef ATOMIC64_EXPORT
+#define ATOMIC64_DECL_ONE __ATOMIC64_DECL
+#else
+#define ATOMIC64_DECL_ONE(sym) __ATOMIC64_DECL(sym); \
+ ATOMIC64_EXPORT(atomic64_##sym)
+#endif
+
+#ifdef CONFIG_X86_CX8
+#define __alternative_atomic64(f, g, out, in, clobbers...) \
+ asm volatile("call %c[func]" \
+ : ALT_OUTPUT_SP(out) \
+ : [func] "i" (atomic64_##g##_cx8) \
+ COMMA(in) \
+ : clobbers)
+
+#define ATOMIC64_DECL(sym) ATOMIC64_DECL_ONE(sym##_cx8)
+#else
+#define __alternative_atomic64(f, g, out, in, clobbers...) \
+ alternative_call(atomic64_##f##_386, atomic64_##g##_cx8, \
+ X86_FEATURE_CX8, ASM_OUTPUT(out), \
+ ASM_INPUT(in), clobbers)
+
+#define ATOMIC64_DECL(sym) ATOMIC64_DECL_ONE(sym##_cx8); \
+ ATOMIC64_DECL_ONE(sym##_386)
+
+ATOMIC64_DECL_ONE(add_386);
+ATOMIC64_DECL_ONE(sub_386);
+ATOMIC64_DECL_ONE(inc_386);
+ATOMIC64_DECL_ONE(dec_386);
+#endif
+
+#define alternative_atomic64(f, out, in, clobbers...) \
+ __alternative_atomic64(f, f, ASM_OUTPUT(out), ASM_INPUT(in), clobbers)
+
+ATOMIC64_DECL(read);
+ATOMIC64_DECL(set);
+ATOMIC64_DECL(xchg);
+ATOMIC64_DECL(add_return);
+ATOMIC64_DECL(sub_return);
+ATOMIC64_DECL(inc_return);
+ATOMIC64_DECL(dec_return);
+ATOMIC64_DECL(dec_if_positive);
+ATOMIC64_DECL(inc_not_zero);
+ATOMIC64_DECL(add_unless);
+
+#undef ATOMIC64_DECL
+#undef ATOMIC64_DECL_ONE
+#undef __ATOMIC64_DECL
+#undef ATOMIC64_EXPORT
+
+static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
+{
+ return arch_cmpxchg64(&v->counter, old, new);
+}
+#define arch_atomic64_cmpxchg arch_atomic64_cmpxchg
+
+static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new)
+{
+ return arch_try_cmpxchg64(&v->counter, old, new);
+}
+#define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg
+
+static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 n)
+{
+ s64 o;
+ unsigned high = (unsigned)(n >> 32);
+ unsigned low = (unsigned)n;
+ alternative_atomic64(xchg,
+ "=&A" (o),
+ ASM_INPUT("S" (v), "b" (low), "c" (high)),
+ "memory");
+ return o;
+}
+#define arch_atomic64_xchg arch_atomic64_xchg
+
+static __always_inline void arch_atomic64_set(atomic64_t *v, s64 i)
+{
+ unsigned high = (unsigned)(i >> 32);
+ unsigned low = (unsigned)i;
+ alternative_atomic64(set,
+ /* no output */,
+ ASM_INPUT("S" (v), "b" (low), "c" (high)),
+ "eax", "edx", "memory");
+}
+
+static __always_inline s64 arch_atomic64_read(const atomic64_t *v)
+{
+ s64 r;
+ alternative_atomic64(read, "=&A" (r), "c" (v), "memory");
+ return r;
+}
+
+static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v)
+{
+ alternative_atomic64(add_return,
+ ASM_OUTPUT("+A" (i), "+c" (v)),
+ /* no input */,
+ "memory");
+ return i;
+}
+#define arch_atomic64_add_return arch_atomic64_add_return
+
+static __always_inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v)
+{
+ alternative_atomic64(sub_return,
+ ASM_OUTPUT("+A" (i), "+c" (v)),
+ /* no input */,
+ "memory");
+ return i;
+}
+#define arch_atomic64_sub_return arch_atomic64_sub_return
+
+static __always_inline s64 arch_atomic64_inc_return(atomic64_t *v)
+{
+ s64 a;
+ alternative_atomic64(inc_return,
+ "=&A" (a),
+ "S" (v),
+ "memory", "ecx");
+ return a;
+}
+#define arch_atomic64_inc_return arch_atomic64_inc_return
+
+static __always_inline s64 arch_atomic64_dec_return(atomic64_t *v)
+{
+ s64 a;
+ alternative_atomic64(dec_return,
+ "=&A" (a),
+ "S" (v),
+ "memory", "ecx");
+ return a;
+}
+#define arch_atomic64_dec_return arch_atomic64_dec_return
+
+static __always_inline void arch_atomic64_add(s64 i, atomic64_t *v)
+{
+ __alternative_atomic64(add, add_return,
+ ASM_OUTPUT("+A" (i), "+c" (v)),
+ /* no input */,
+ "memory");
+}
+
+static __always_inline void arch_atomic64_sub(s64 i, atomic64_t *v)
+{
+ __alternative_atomic64(sub, sub_return,
+ ASM_OUTPUT("+A" (i), "+c" (v)),
+ /* no input */,
+ "memory");
+}
+
+static __always_inline void arch_atomic64_inc(atomic64_t *v)
+{
+ __alternative_atomic64(inc, inc_return,
+ /* no output */,
+ "S" (v),
+ "memory", "eax", "ecx", "edx");
+}
+#define arch_atomic64_inc arch_atomic64_inc
+
+static __always_inline void arch_atomic64_dec(atomic64_t *v)
+{
+ __alternative_atomic64(dec, dec_return,
+ /* no output */,
+ "S" (v),
+ "memory", "eax", "ecx", "edx");
+}
+#define arch_atomic64_dec arch_atomic64_dec
+
+static __always_inline int arch_atomic64_add_unless(atomic64_t *v, s64 a, s64 u)
+{
+ unsigned low = (unsigned)u;
+ unsigned high = (unsigned)(u >> 32);
+ alternative_atomic64(add_unless,
+ ASM_OUTPUT("+A" (a), "+c" (low), "+D" (high)),
+ "S" (v),
+ "memory");
+ return (int)a;
+}
+#define arch_atomic64_add_unless arch_atomic64_add_unless
+
+static __always_inline int arch_atomic64_inc_not_zero(atomic64_t *v)
+{
+ int r;
+ alternative_atomic64(inc_not_zero,
+ "=&a" (r),
+ "S" (v),
+ "ecx", "edx", "memory");
+ return r;
+}
+#define arch_atomic64_inc_not_zero arch_atomic64_inc_not_zero
+
+static __always_inline s64 arch_atomic64_dec_if_positive(atomic64_t *v)
+{
+ s64 r;
+ alternative_atomic64(dec_if_positive,
+ "=&A" (r),
+ "S" (v),
+ "ecx", "memory");
+ return r;
+}
+#define arch_atomic64_dec_if_positive arch_atomic64_dec_if_positive
+
+#undef alternative_atomic64
+#undef __alternative_atomic64
+
+static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v)
+{
+ s64 val = arch_atomic64_read_nonatomic(v);
+
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val & i));
+}
+
+static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
+{
+ s64 val = arch_atomic64_read_nonatomic(v);
+
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val & i));
+
+ return val;
+}
+#define arch_atomic64_fetch_and arch_atomic64_fetch_and
+
+static __always_inline void arch_atomic64_or(s64 i, atomic64_t *v)
+{
+ s64 val = arch_atomic64_read_nonatomic(v);
+
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val | i));
+}
+
+static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
+{
+ s64 val = arch_atomic64_read_nonatomic(v);
+
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val | i));
+
+ return val;
+}
+#define arch_atomic64_fetch_or arch_atomic64_fetch_or
+
+static __always_inline void arch_atomic64_xor(s64 i, atomic64_t *v)
+{
+ s64 val = arch_atomic64_read_nonatomic(v);
+
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val ^ i));
+}
+
+static __always_inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v)
+{
+ s64 val = arch_atomic64_read_nonatomic(v);
+
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val ^ i));
+
+ return val;
+}
+#define arch_atomic64_fetch_xor arch_atomic64_fetch_xor
+
+static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
+{
+ s64 val = arch_atomic64_read_nonatomic(v);
+
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val + i));
+
+ return val;
+}
+#define arch_atomic64_fetch_add arch_atomic64_fetch_add
+
+#define arch_atomic64_fetch_sub(i, v) arch_atomic64_fetch_add(-(i), (v))
+
+#endif /* _ASM_X86_ATOMIC64_32_H */
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
new file mode 100644
index 000000000000..87b496325b5b
--- /dev/null
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -0,0 +1,165 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_ATOMIC64_64_H
+#define _ASM_X86_ATOMIC64_64_H
+
+#include <linux/types.h>
+#include <asm/alternative.h>
+#include <asm/cmpxchg.h>
+
+/* The 64-bit atomic type */
+
+#define ATOMIC64_INIT(i) { (i) }
+
+static __always_inline s64 arch_atomic64_read(const atomic64_t *v)
+{
+ return __READ_ONCE((v)->counter);
+}
+
+static __always_inline void arch_atomic64_set(atomic64_t *v, s64 i)
+{
+ __WRITE_ONCE(v->counter, i);
+}
+
+static __always_inline void arch_atomic64_add(s64 i, atomic64_t *v)
+{
+ asm_inline volatile(LOCK_PREFIX "addq %1, %0"
+ : "=m" (v->counter)
+ : "er" (i), "m" (v->counter) : "memory");
+}
+
+static __always_inline void arch_atomic64_sub(s64 i, atomic64_t *v)
+{
+ asm_inline volatile(LOCK_PREFIX "subq %1, %0"
+ : "=m" (v->counter)
+ : "er" (i), "m" (v->counter) : "memory");
+}
+
+static __always_inline bool arch_atomic64_sub_and_test(s64 i, atomic64_t *v)
+{
+ return GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, e, "er", i);
+}
+#define arch_atomic64_sub_and_test arch_atomic64_sub_and_test
+
+static __always_inline void arch_atomic64_inc(atomic64_t *v)
+{
+ asm_inline volatile(LOCK_PREFIX "incq %0"
+ : "=m" (v->counter)
+ : "m" (v->counter) : "memory");
+}
+#define arch_atomic64_inc arch_atomic64_inc
+
+static __always_inline void arch_atomic64_dec(atomic64_t *v)
+{
+ asm_inline volatile(LOCK_PREFIX "decq %0"
+ : "=m" (v->counter)
+ : "m" (v->counter) : "memory");
+}
+#define arch_atomic64_dec arch_atomic64_dec
+
+static __always_inline bool arch_atomic64_dec_and_test(atomic64_t *v)
+{
+ return GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, e);
+}
+#define arch_atomic64_dec_and_test arch_atomic64_dec_and_test
+
+static __always_inline bool arch_atomic64_inc_and_test(atomic64_t *v)
+{
+ return GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, e);
+}
+#define arch_atomic64_inc_and_test arch_atomic64_inc_and_test
+
+static __always_inline bool arch_atomic64_add_negative(s64 i, atomic64_t *v)
+{
+ return GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, s, "er", i);
+}
+#define arch_atomic64_add_negative arch_atomic64_add_negative
+
+static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v)
+{
+ return i + xadd(&v->counter, i);
+}
+#define arch_atomic64_add_return arch_atomic64_add_return
+
+#define arch_atomic64_sub_return(i, v) arch_atomic64_add_return(-(i), v)
+
+static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
+{
+ return xadd(&v->counter, i);
+}
+#define arch_atomic64_fetch_add arch_atomic64_fetch_add
+
+#define arch_atomic64_fetch_sub(i, v) arch_atomic64_fetch_add(-(i), v)
+
+static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
+{
+ return arch_cmpxchg(&v->counter, old, new);
+}
+#define arch_atomic64_cmpxchg arch_atomic64_cmpxchg
+
+static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new)
+{
+ return arch_try_cmpxchg(&v->counter, old, new);
+}
+#define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg
+
+static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 new)
+{
+ return arch_xchg(&v->counter, new);
+}
+#define arch_atomic64_xchg arch_atomic64_xchg
+
+static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v)
+{
+ asm_inline volatile(LOCK_PREFIX "andq %1, %0"
+ : "+m" (v->counter)
+ : "er" (i)
+ : "memory");
+}
+
+static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
+{
+ s64 val = arch_atomic64_read(v);
+
+ do {
+ } while (!arch_atomic64_try_cmpxchg(v, &val, val & i));
+ return val;
+}
+#define arch_atomic64_fetch_and arch_atomic64_fetch_and
+
+static __always_inline void arch_atomic64_or(s64 i, atomic64_t *v)
+{
+ asm_inline volatile(LOCK_PREFIX "orq %1, %0"
+ : "+m" (v->counter)
+ : "er" (i)
+ : "memory");
+}
+
+static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
+{
+ s64 val = arch_atomic64_read(v);
+
+ do {
+ } while (!arch_atomic64_try_cmpxchg(v, &val, val | i));
+ return val;
+}
+#define arch_atomic64_fetch_or arch_atomic64_fetch_or
+
+static __always_inline void arch_atomic64_xor(s64 i, atomic64_t *v)
+{
+ asm_inline volatile(LOCK_PREFIX "xorq %1, %0"
+ : "+m" (v->counter)
+ : "er" (i)
+ : "memory");
+}
+
+static __always_inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v)
+{
+ s64 val = arch_atomic64_read(v);
+
+ do {
+ } while (!arch_atomic64_try_cmpxchg(v, &val, val ^ i));
+ return val;
+}
+#define arch_atomic64_fetch_xor arch_atomic64_fetch_xor
+
+#endif /* _ASM_X86_ATOMIC64_64_H */
diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
deleted file mode 100644
index dc5a667ff791..000000000000
--- a/arch/x86/include/asm/atomic_32.h
+++ /dev/null
@@ -1,415 +0,0 @@
-#ifndef _ASM_X86_ATOMIC_32_H
-#define _ASM_X86_ATOMIC_32_H
-
-#include <linux/compiler.h>
-#include <linux/types.h>
-#include <asm/processor.h>
-#include <asm/cmpxchg.h>
-
-/*
- * Atomic operations that C can't guarantee us. Useful for
- * resource counting etc..
- */
-
-#define ATOMIC_INIT(i) { (i) }
-
-/**
- * atomic_read - read atomic variable
- * @v: pointer of type atomic_t
- *
- * Atomically reads the value of @v.
- */
-static inline int atomic_read(const atomic_t *v)
-{
- return v->counter;
-}
-
-/**
- * atomic_set - set atomic variable
- * @v: pointer of type atomic_t
- * @i: required value
- *
- * Atomically sets the value of @v to @i.
- */
-static inline void atomic_set(atomic_t *v, int i)
-{
- v->counter = i;
-}
-
-/**
- * atomic_add - add integer to atomic variable
- * @i: integer value to add
- * @v: pointer of type atomic_t
- *
- * Atomically adds @i to @v.
- */
-static inline void atomic_add(int i, atomic_t *v)
-{
- asm volatile(LOCK_PREFIX "addl %1,%0"
- : "+m" (v->counter)
- : "ir" (i));
-}
-
-/**
- * atomic_sub - subtract integer from atomic variable
- * @i: integer value to subtract
- * @v: pointer of type atomic_t
- *
- * Atomically subtracts @i from @v.
- */
-static inline void atomic_sub(int i, atomic_t *v)
-{
- asm volatile(LOCK_PREFIX "subl %1,%0"
- : "+m" (v->counter)
- : "ir" (i));
-}
-
-/**
- * atomic_sub_and_test - subtract value from variable and test result
- * @i: integer value to subtract
- * @v: pointer of type atomic_t
- *
- * Atomically subtracts @i from @v and returns
- * true if the result is zero, or false for all
- * other cases.
- */
-static inline int atomic_sub_and_test(int i, atomic_t *v)
-{
- unsigned char c;
-
- asm volatile(LOCK_PREFIX "subl %2,%0; sete %1"
- : "+m" (v->counter), "=qm" (c)
- : "ir" (i) : "memory");
- return c;
-}
-
-/**
- * atomic_inc - increment atomic variable
- * @v: pointer of type atomic_t
- *
- * Atomically increments @v by 1.
- */
-static inline void atomic_inc(atomic_t *v)
-{
- asm volatile(LOCK_PREFIX "incl %0"
- : "+m" (v->counter));
-}
-
-/**
- * atomic_dec - decrement atomic variable
- * @v: pointer of type atomic_t
- *
- * Atomically decrements @v by 1.
- */
-static inline void atomic_dec(atomic_t *v)
-{
- asm volatile(LOCK_PREFIX "decl %0"
- : "+m" (v->counter));
-}
-
-/**
- * atomic_dec_and_test - decrement and test
- * @v: pointer of type atomic_t
- *
- * Atomically decrements @v by 1 and
- * returns true if the result is 0, or false for all other
- * cases.
- */
-static inline int atomic_dec_and_test(atomic_t *v)
-{
- unsigned char c;
-
- asm volatile(LOCK_PREFIX "decl %0; sete %1"
- : "+m" (v->counter), "=qm" (c)
- : : "memory");
- return c != 0;
-}
-
-/**
- * atomic_inc_and_test - increment and test
- * @v: pointer of type atomic_t
- *
- * Atomically increments @v by 1
- * and returns true if the result is zero, or false for all
- * other cases.
- */
-static inline int atomic_inc_and_test(atomic_t *v)
-{
- unsigned char c;
-
- asm volatile(LOCK_PREFIX "incl %0; sete %1"
- : "+m" (v->counter), "=qm" (c)
- : : "memory");
- return c != 0;
-}
-
-/**
- * atomic_add_negative - add and test if negative
- * @v: pointer of type atomic_t
- * @i: integer value to add
- *
- * Atomically adds @i to @v and returns true
- * if the result is negative, or false when
- * result is greater than or equal to zero.
- */
-static inline int atomic_add_negative(int i, atomic_t *v)
-{
- unsigned char c;
-
- asm volatile(LOCK_PREFIX "addl %2,%0; sets %1"
- : "+m" (v->counter), "=qm" (c)
- : "ir" (i) : "memory");
- return c;
-}
-
-/**
- * atomic_add_return - add integer and return
- * @v: pointer of type atomic_t
- * @i: integer value to add
- *
- * Atomically adds @i to @v and returns @i + @v
- */
-static inline int atomic_add_return(int i, atomic_t *v)
-{
- int __i;
-#ifdef CONFIG_M386
- unsigned long flags;
- if (unlikely(boot_cpu_data.x86 <= 3))
- goto no_xadd;
-#endif
- /* Modern 486+ processor */
- __i = i;
- asm volatile(LOCK_PREFIX "xaddl %0, %1"
- : "+r" (i), "+m" (v->counter)
- : : "memory");
- return i + __i;
-
-#ifdef CONFIG_M386
-no_xadd: /* Legacy 386 processor */
- local_irq_save(flags);
- __i = atomic_read(v);
- atomic_set(v, i + __i);
- local_irq_restore(flags);
- return i + __i;
-#endif
-}
-
-/**
- * atomic_sub_return - subtract integer and return
- * @v: pointer of type atomic_t
- * @i: integer value to subtract
- *
- * Atomically subtracts @i from @v and returns @v - @i
- */
-static inline int atomic_sub_return(int i, atomic_t *v)
-{
- return atomic_add_return(-i, v);
-}
-
-static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
-{
- return cmpxchg(&v->counter, old, new);
-}
-
-static inline int atomic_xchg(atomic_t *v, int new)
-{
- return xchg(&v->counter, new);
-}
-
-/**
- * atomic_add_unless - add unless the number is already a given value
- * @v: pointer of type atomic_t
- * @a: the amount to add to v...
- * @u: ...unless v is equal to u.
- *
- * Atomically adds @a to @v, so long as @v was not already @u.
- * Returns non-zero if @v was not @u, and zero otherwise.
- */
-static inline int atomic_add_unless(atomic_t *v, int a, int u)
-{
- int c, old;
- c = atomic_read(v);
- for (;;) {
- if (unlikely(c == (u)))
- break;
- old = atomic_cmpxchg((v), c, c + (a));
- if (likely(old == c))
- break;
- c = old;
- }
- return c != (u);
-}
-
-#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
-
-#define atomic_inc_return(v) (atomic_add_return(1, v))
-#define atomic_dec_return(v) (atomic_sub_return(1, v))
-
-/* These are x86-specific, used by some header files */
-#define atomic_clear_mask(mask, addr) \
- asm volatile(LOCK_PREFIX "andl %0,%1" \
- : : "r" (~(mask)), "m" (*(addr)) : "memory")
-
-#define atomic_set_mask(mask, addr) \
- asm volatile(LOCK_PREFIX "orl %0,%1" \
- : : "r" (mask), "m" (*(addr)) : "memory")
-
-/* Atomic operations are already serializing on x86 */
-#define smp_mb__before_atomic_dec() barrier()
-#define smp_mb__after_atomic_dec() barrier()
-#define smp_mb__before_atomic_inc() barrier()
-#define smp_mb__after_atomic_inc() barrier()
-
-/* An 64bit atomic type */
-
-typedef struct {
- u64 __aligned(8) counter;
-} atomic64_t;
-
-#define ATOMIC64_INIT(val) { (val) }
-
-extern u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val);
-
-/**
- * atomic64_xchg - xchg atomic64 variable
- * @ptr: pointer to type atomic64_t
- * @new_val: value to assign
- *
- * Atomically xchgs the value of @ptr to @new_val and returns
- * the old value.
- */
-extern u64 atomic64_xchg(atomic64_t *ptr, u64 new_val);
-
-/**
- * atomic64_set - set atomic64 variable
- * @ptr: pointer to type atomic64_t
- * @new_val: value to assign
- *
- * Atomically sets the value of @ptr to @new_val.
- */
-extern void atomic64_set(atomic64_t *ptr, u64 new_val);
-
-/**
- * atomic64_read - read atomic64 variable
- * @ptr: pointer to type atomic64_t
- *
- * Atomically reads the value of @ptr and returns it.
- */
-static inline u64 atomic64_read(atomic64_t *ptr)
-{
- u64 res;
-
- /*
- * Note, we inline this atomic64_t primitive because
- * it only clobbers EAX/EDX and leaves the others
- * untouched. We also (somewhat subtly) rely on the
- * fact that cmpxchg8b returns the current 64-bit value
- * of the memory location we are touching:
- */
- asm volatile(
- "mov %%ebx, %%eax\n\t"
- "mov %%ecx, %%edx\n\t"
- LOCK_PREFIX "cmpxchg8b %1\n"
- : "=&A" (res)
- : "m" (*ptr)
- );
-
- return res;
-}
-
-extern u64 atomic64_read(atomic64_t *ptr);
-
-/**
- * atomic64_add_return - add and return
- * @delta: integer value to add
- * @ptr: pointer to type atomic64_t
- *
- * Atomically adds @delta to @ptr and returns @delta + *@ptr
- */
-extern u64 atomic64_add_return(u64 delta, atomic64_t *ptr);
-
-/*
- * Other variants with different arithmetic operators:
- */
-extern u64 atomic64_sub_return(u64 delta, atomic64_t *ptr);
-extern u64 atomic64_inc_return(atomic64_t *ptr);
-extern u64 atomic64_dec_return(atomic64_t *ptr);
-
-/**
- * atomic64_add - add integer to atomic64 variable
- * @delta: integer value to add
- * @ptr: pointer to type atomic64_t
- *
- * Atomically adds @delta to @ptr.
- */
-extern void atomic64_add(u64 delta, atomic64_t *ptr);
-
-/**
- * atomic64_sub - subtract the atomic64 variable
- * @delta: integer value to subtract
- * @ptr: pointer to type atomic64_t
- *
- * Atomically subtracts @delta from @ptr.
- */
-extern void atomic64_sub(u64 delta, atomic64_t *ptr);
-
-/**
- * atomic64_sub_and_test - subtract value from variable and test result
- * @delta: integer value to subtract
- * @ptr: pointer to type atomic64_t
- *
- * Atomically subtracts @delta from @ptr and returns
- * true if the result is zero, or false for all
- * other cases.
- */
-extern int atomic64_sub_and_test(u64 delta, atomic64_t *ptr);
-
-/**
- * atomic64_inc - increment atomic64 variable
- * @ptr: pointer to type atomic64_t
- *
- * Atomically increments @ptr by 1.
- */
-extern void atomic64_inc(atomic64_t *ptr);
-
-/**
- * atomic64_dec - decrement atomic64 variable
- * @ptr: pointer to type atomic64_t
- *
- * Atomically decrements @ptr by 1.
- */
-extern void atomic64_dec(atomic64_t *ptr);
-
-/**
- * atomic64_dec_and_test - decrement and test
- * @ptr: pointer to type atomic64_t
- *
- * Atomically decrements @ptr by 1 and
- * returns true if the result is 0, or false for all other
- * cases.
- */
-extern int atomic64_dec_and_test(atomic64_t *ptr);
-
-/**
- * atomic64_inc_and_test - increment and test
- * @ptr: pointer to type atomic64_t
- *
- * Atomically increments @ptr by 1
- * and returns true if the result is zero, or false for all
- * other cases.
- */
-extern int atomic64_inc_and_test(atomic64_t *ptr);
-
-/**
- * atomic64_add_negative - add and test if negative
- * @delta: integer value to add
- * @ptr: pointer to type atomic64_t
- *
- * Atomically adds @delta to @ptr and returns true
- * if the result is negative, or false when
- * result is greater than or equal to zero.
- */
-extern int atomic64_add_negative(u64 delta, atomic64_t *ptr);
-
-#include <asm-generic/atomic-long.h>
-#endif /* _ASM_X86_ATOMIC_32_H */
diff --git a/arch/x86/include/asm/atomic_64.h b/arch/x86/include/asm/atomic_64.h
deleted file mode 100644
index d605dc268e79..000000000000
--- a/arch/x86/include/asm/atomic_64.h
+++ /dev/null
@@ -1,485 +0,0 @@
-#ifndef _ASM_X86_ATOMIC_64_H
-#define _ASM_X86_ATOMIC_64_H
-
-#include <linux/types.h>
-#include <asm/alternative.h>
-#include <asm/cmpxchg.h>
-
-/*
- * Atomic operations that C can't guarantee us. Useful for
- * resource counting etc..
- */
-
-#define ATOMIC_INIT(i) { (i) }
-
-/**
- * atomic_read - read atomic variable
- * @v: pointer of type atomic_t
- *
- * Atomically reads the value of @v.
- */
-static inline int atomic_read(const atomic_t *v)
-{
- return v->counter;
-}
-
-/**
- * atomic_set - set atomic variable
- * @v: pointer of type atomic_t
- * @i: required value
- *
- * Atomically sets the value of @v to @i.
- */
-static inline void atomic_set(atomic_t *v, int i)
-{
- v->counter = i;
-}
-
-/**
- * atomic_add - add integer to atomic variable
- * @i: integer value to add
- * @v: pointer of type atomic_t
- *
- * Atomically adds @i to @v.
- */
-static inline void atomic_add(int i, atomic_t *v)
-{
- asm volatile(LOCK_PREFIX "addl %1,%0"
- : "=m" (v->counter)
- : "ir" (i), "m" (v->counter));
-}
-
-/**
- * atomic_sub - subtract the atomic variable
- * @i: integer value to subtract
- * @v: pointer of type atomic_t
- *
- * Atomically subtracts @i from @v.
- */
-static inline void atomic_sub(int i, atomic_t *v)
-{
- asm volatile(LOCK_PREFIX "subl %1,%0"
- : "=m" (v->counter)
- : "ir" (i), "m" (v->counter));
-}
-
-/**
- * atomic_sub_and_test - subtract value from variable and test result
- * @i: integer value to subtract
- * @v: pointer of type atomic_t
- *
- * Atomically subtracts @i from @v and returns
- * true if the result is zero, or false for all
- * other cases.
- */
-static inline int atomic_sub_and_test(int i, atomic_t *v)
-{
- unsigned char c;
-
- asm volatile(LOCK_PREFIX "subl %2,%0; sete %1"
- : "=m" (v->counter), "=qm" (c)
- : "ir" (i), "m" (v->counter) : "memory");
- return c;
-}
-
-/**
- * atomic_inc - increment atomic variable
- * @v: pointer of type atomic_t
- *
- * Atomically increments @v by 1.
- */
-static inline void atomic_inc(atomic_t *v)
-{
- asm volatile(LOCK_PREFIX "incl %0"
- : "=m" (v->counter)
- : "m" (v->counter));
-}
-
-/**
- * atomic_dec - decrement atomic variable
- * @v: pointer of type atomic_t
- *
- * Atomically decrements @v by 1.
- */
-static inline void atomic_dec(atomic_t *v)
-{
- asm volatile(LOCK_PREFIX "decl %0"
- : "=m" (v->counter)
- : "m" (v->counter));
-}
-
-/**
- * atomic_dec_and_test - decrement and test
- * @v: pointer of type atomic_t
- *
- * Atomically decrements @v by 1 and
- * returns true if the result is 0, or false for all other
- * cases.
- */
-static inline int atomic_dec_and_test(atomic_t *v)
-{
- unsigned char c;
-
- asm volatile(LOCK_PREFIX "decl %0; sete %1"
- : "=m" (v->counter), "=qm" (c)
- : "m" (v->counter) : "memory");
- return c != 0;
-}
-
-/**
- * atomic_inc_and_test - increment and test
- * @v: pointer of type atomic_t
- *
- * Atomically increments @v by 1
- * and returns true if the result is zero, or false for all
- * other cases.
- */
-static inline int atomic_inc_and_test(atomic_t *v)
-{
- unsigned char c;
-
- asm volatile(LOCK_PREFIX "incl %0; sete %1"
- : "=m" (v->counter), "=qm" (c)
- : "m" (v->counter) : "memory");
- return c != 0;
-}
-
-/**
- * atomic_add_negative - add and test if negative
- * @i: integer value to add
- * @v: pointer of type atomic_t
- *
- * Atomically adds @i to @v and returns true
- * if the result is negative, or false when
- * result is greater than or equal to zero.
- */
-static inline int atomic_add_negative(int i, atomic_t *v)
-{
- unsigned char c;
-
- asm volatile(LOCK_PREFIX "addl %2,%0; sets %1"
- : "=m" (v->counter), "=qm" (c)
- : "ir" (i), "m" (v->counter) : "memory");
- return c;
-}
-
-/**
- * atomic_add_return - add and return
- * @i: integer value to add
- * @v: pointer of type atomic_t
- *
- * Atomically adds @i to @v and returns @i + @v
- */
-static inline int atomic_add_return(int i, atomic_t *v)
-{
- int __i = i;
- asm volatile(LOCK_PREFIX "xaddl %0, %1"
- : "+r" (i), "+m" (v->counter)
- : : "memory");
- return i + __i;
-}
-
-static inline int atomic_sub_return(int i, atomic_t *v)
-{
- return atomic_add_return(-i, v);
-}
-
-#define atomic_inc_return(v) (atomic_add_return(1, v))
-#define atomic_dec_return(v) (atomic_sub_return(1, v))
-
-/* The 64-bit atomic type */
-
-#define ATOMIC64_INIT(i) { (i) }
-
-/**
- * atomic64_read - read atomic64 variable
- * @v: pointer of type atomic64_t
- *
- * Atomically reads the value of @v.
- * Doesn't imply a read memory barrier.
- */
-static inline long atomic64_read(const atomic64_t *v)
-{
- return v->counter;
-}
-
-/**
- * atomic64_set - set atomic64 variable
- * @v: pointer to type atomic64_t
- * @i: required value
- *
- * Atomically sets the value of @v to @i.
- */
-static inline void atomic64_set(atomic64_t *v, long i)
-{
- v->counter = i;
-}
-
-/**
- * atomic64_add - add integer to atomic64 variable
- * @i: integer value to add
- * @v: pointer to type atomic64_t
- *
- * Atomically adds @i to @v.
- */
-static inline void atomic64_add(long i, atomic64_t *v)
-{
- asm volatile(LOCK_PREFIX "addq %1,%0"
- : "=m" (v->counter)
- : "er" (i), "m" (v->counter));
-}
-
-/**
- * atomic64_sub - subtract the atomic64 variable
- * @i: integer value to subtract
- * @v: pointer to type atomic64_t
- *
- * Atomically subtracts @i from @v.
- */
-static inline void atomic64_sub(long i, atomic64_t *v)
-{
- asm volatile(LOCK_PREFIX "subq %1,%0"
- : "=m" (v->counter)
- : "er" (i), "m" (v->counter));
-}
-
-/**
- * atomic64_sub_and_test - subtract value from variable and test result
- * @i: integer value to subtract
- * @v: pointer to type atomic64_t
- *
- * Atomically subtracts @i from @v and returns
- * true if the result is zero, or false for all
- * other cases.
- */
-static inline int atomic64_sub_and_test(long i, atomic64_t *v)
-{
- unsigned char c;
-
- asm volatile(LOCK_PREFIX "subq %2,%0; sete %1"
- : "=m" (v->counter), "=qm" (c)
- : "er" (i), "m" (v->counter) : "memory");
- return c;
-}
-
-/**
- * atomic64_inc - increment atomic64 variable
- * @v: pointer to type atomic64_t
- *
- * Atomically increments @v by 1.
- */
-static inline void atomic64_inc(atomic64_t *v)
-{
- asm volatile(LOCK_PREFIX "incq %0"
- : "=m" (v->counter)
- : "m" (v->counter));
-}
-
-/**
- * atomic64_dec - decrement atomic64 variable
- * @v: pointer to type atomic64_t
- *
- * Atomically decrements @v by 1.
- */
-static inline void atomic64_dec(atomic64_t *v)
-{
- asm volatile(LOCK_PREFIX "decq %0"
- : "=m" (v->counter)
- : "m" (v->counter));
-}
-
-/**
- * atomic64_dec_and_test - decrement and test
- * @v: pointer to type atomic64_t
- *
- * Atomically decrements @v by 1 and
- * returns true if the result is 0, or false for all other
- * cases.
- */
-static inline int atomic64_dec_and_test(atomic64_t *v)
-{
- unsigned char c;
-
- asm volatile(LOCK_PREFIX "decq %0; sete %1"
- : "=m" (v->counter), "=qm" (c)
- : "m" (v->counter) : "memory");
- return c != 0;
-}
-
-/**
- * atomic64_inc_and_test - increment and test
- * @v: pointer to type atomic64_t
- *
- * Atomically increments @v by 1
- * and returns true if the result is zero, or false for all
- * other cases.
- */
-static inline int atomic64_inc_and_test(atomic64_t *v)
-{
- unsigned char c;
-
- asm volatile(LOCK_PREFIX "incq %0; sete %1"
- : "=m" (v->counter), "=qm" (c)
- : "m" (v->counter) : "memory");
- return c != 0;
-}
-
-/**
- * atomic64_add_negative - add and test if negative
- * @i: integer value to add
- * @v: pointer to type atomic64_t
- *
- * Atomically adds @i to @v and returns true
- * if the result is negative, or false when
- * result is greater than or equal to zero.
- */
-static inline int atomic64_add_negative(long i, atomic64_t *v)
-{
- unsigned char c;
-
- asm volatile(LOCK_PREFIX "addq %2,%0; sets %1"
- : "=m" (v->counter), "=qm" (c)
- : "er" (i), "m" (v->counter) : "memory");
- return c;
-}
-
-/**
- * atomic64_add_return - add and return
- * @i: integer value to add
- * @v: pointer to type atomic64_t
- *
- * Atomically adds @i to @v and returns @i + @v
- */
-static inline long atomic64_add_return(long i, atomic64_t *v)
-{
- long __i = i;
- asm volatile(LOCK_PREFIX "xaddq %0, %1;"
- : "+r" (i), "+m" (v->counter)
- : : "memory");
- return i + __i;
-}
-
-static inline long atomic64_sub_return(long i, atomic64_t *v)
-{
- return atomic64_add_return(-i, v);
-}
-
-#define atomic64_inc_return(v) (atomic64_add_return(1, (v)))
-#define atomic64_dec_return(v) (atomic64_sub_return(1, (v)))
-
-static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new)
-{
- return cmpxchg(&v->counter, old, new);
-}
-
-static inline long atomic64_xchg(atomic64_t *v, long new)
-{
- return xchg(&v->counter, new);
-}
-
-static inline long atomic_cmpxchg(atomic_t *v, int old, int new)
-{
- return cmpxchg(&v->counter, old, new);
-}
-
-static inline long atomic_xchg(atomic_t *v, int new)
-{
- return xchg(&v->counter, new);
-}
-
-/**
- * atomic_add_unless - add unless the number is a given value
- * @v: pointer of type atomic_t
- * @a: the amount to add to v...
- * @u: ...unless v is equal to u.
- *
- * Atomically adds @a to @v, so long as it was not @u.
- * Returns non-zero if @v was not @u, and zero otherwise.
- */
-static inline int atomic_add_unless(atomic_t *v, int a, int u)
-{
- int c, old;
- c = atomic_read(v);
- for (;;) {
- if (unlikely(c == (u)))
- break;
- old = atomic_cmpxchg((v), c, c + (a));
- if (likely(old == c))
- break;
- c = old;
- }
- return c != (u);
-}
-
-#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
-
-/**
- * atomic64_add_unless - add unless the number is a given value
- * @v: pointer of type atomic64_t
- * @a: the amount to add to v...
- * @u: ...unless v is equal to u.
- *
- * Atomically adds @a to @v, so long as it was not @u.
- * Returns non-zero if @v was not @u, and zero otherwise.
- */
-static inline int atomic64_add_unless(atomic64_t *v, long a, long u)
-{
- long c, old;
- c = atomic64_read(v);
- for (;;) {
- if (unlikely(c == (u)))
- break;
- old = atomic64_cmpxchg((v), c, c + (a));
- if (likely(old == c))
- break;
- c = old;
- }
- return c != (u);
-}
-
-/**
- * atomic_inc_short - increment of a short integer
- * @v: pointer to type short int
- *
- * Atomically adds 1 to @v
- * Returns the new value of @v
- */
-static inline short int atomic_inc_short(short int *v)
-{
- asm(LOCK_PREFIX "addw $1, %0" : "+m" (*v));
- return *v;
-}
-
-/**
- * atomic_or_long - OR of two long integers
- * @v1: pointer to type unsigned long
- * @v2: value of type unsigned long to OR into @v1
- *
- * Atomically ORs @v2 into *@v1.
- */
-static inline void atomic_or_long(unsigned long *v1, unsigned long v2)
-{
- asm(LOCK_PREFIX "orq %1, %0" : "+m" (*v1) : "r" (v2));
-}
-
-#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0)
-
-/* These are x86-specific, used by some header files */
-#define atomic_clear_mask(mask, addr) \
- asm volatile(LOCK_PREFIX "andl %0,%1" \
- : : "r" (~(mask)), "m" (*(addr)) : "memory")
-
-#define atomic_set_mask(mask, addr) \
- asm volatile(LOCK_PREFIX "orl %0,%1" \
- : : "r" ((unsigned)(mask)), "m" (*(addr)) \
- : "memory")
-
-/* Atomic operations are already serializing on x86 */
-#define smp_mb__before_atomic_dec() barrier()
-#define smp_mb__after_atomic_dec() barrier()
-#define smp_mb__before_atomic_inc() barrier()
-#define smp_mb__after_atomic_inc() barrier()
-
-#include <asm-generic/atomic-long.h>
-#endif /* _ASM_X86_ATOMIC_64_H */
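
The atomic_add_unless()/atomic64_add_unless() helpers deleted above are both instances of the classic cmpxchg retry loop: read the counter, bail out if it already holds the forbidden value, otherwise attempt a compare-and-swap and retry with the freshly observed value on failure. A minimal sketch of the same pattern in portable C11 atomics (illustrative only, not the kernel implementation):

#include <stdatomic.h>
#include <stdbool.h>

/* Add 'a' to *v unless *v == u; returns true if the add was performed. */
static bool add_unless_sketch(_Atomic long *v, long a, long u)
{
	long c = atomic_load(v);

	while (c != u) {
		/*
		 * On failure the CAS writes the current value back into 'c',
		 * so the next iteration retries against fresh data.
		 */
		if (atomic_compare_exchange_weak(v, &c, c + a))
			return true;
	}
	return false;
}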
diff --git a/arch/x86/include/asm/audit.h b/arch/x86/include/asm/audit.h
new file mode 100644
index 000000000000..fa918f01333e
--- /dev/null
+++ b/arch/x86/include/asm/audit.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_AUDIT_H
+#define _ASM_X86_AUDIT_H
+
+int ia32_classify_syscall(unsigned int syscall);
+
+extern unsigned ia32_dir_class[];
+extern unsigned ia32_write_class[];
+extern unsigned ia32_read_class[];
+extern unsigned ia32_chattr_class[];
+extern unsigned ia32_signal_class[];
+
+
+#endif /* _ASM_X86_AUDIT_H */
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
new file mode 100644
index 000000000000..db70832232d4
--- /dev/null
+++ b/arch/x86/include/asm/barrier.h
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_BARRIER_H
+#define _ASM_X86_BARRIER_H
+
+#include <asm/alternative.h>
+#include <asm/nops.h>
+
+/*
+ * Force strict CPU ordering.
+ * And yes, this might be required on UP too when we're talking
+ * to devices.
+ */
+
+#ifdef CONFIG_X86_32
+#define mb() asm volatile(ALTERNATIVE("lock addl $0,-4(%%esp)", "mfence", \
+ X86_FEATURE_XMM2) ::: "memory", "cc")
+#define rmb() asm volatile(ALTERNATIVE("lock addl $0,-4(%%esp)", "lfence", \
+ X86_FEATURE_XMM2) ::: "memory", "cc")
+#define wmb() asm volatile(ALTERNATIVE("lock addl $0,-4(%%esp)", "sfence", \
+ X86_FEATURE_XMM2) ::: "memory", "cc")
+#else
+#define __mb() asm volatile("mfence":::"memory")
+#define __rmb() asm volatile("lfence":::"memory")
+#define __wmb() asm volatile("sfence" ::: "memory")
+#endif
+
+/**
+ * array_index_mask_nospec() - generate a mask that is ~0UL when the
+ * bounds check succeeds and 0 otherwise
+ * @index: array element index
+ * @size: number of elements in array
+ *
+ * Returns:
+ * 0 - (index < size)
+ */
+#define array_index_mask_nospec(idx,sz) ({ \
+ typeof((idx)+(sz)) __idx = (idx); \
+ typeof(__idx) __sz = (sz); \
+ unsigned long __mask; \
+ asm volatile ("cmp %1,%2; sbb %0,%0" \
+ :"=r" (__mask) \
+ :ASM_INPUT_G (__sz), \
+ "r" (__idx) \
+ :"cc"); \
+ __mask; })
+
+/* Prevent speculative execution past this barrier. */
+#define barrier_nospec() alternative("", "lfence", X86_FEATURE_LFENCE_RDTSC)
+
+#define __dma_rmb() barrier()
+#define __dma_wmb() barrier()
+
+#define __smp_mb() asm volatile("lock addl $0,-4(%%" _ASM_SP ")" ::: "memory", "cc")
+
+#define __smp_rmb() dma_rmb()
+#define __smp_wmb() barrier()
+#define __smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0)
+
+#define __smp_store_release(p, v) \
+do { \
+ compiletime_assert_atomic_type(*p); \
+ barrier(); \
+ WRITE_ONCE(*p, v); \
+} while (0)
+
+#define __smp_load_acquire(p) \
+({ \
+ typeof(*p) ___p1 = READ_ONCE(*p); \
+ compiletime_assert_atomic_type(*p); \
+ barrier(); \
+ ___p1; \
+})
+
+/* Atomic operations are already serializing on x86 */
+#define __smp_mb__before_atomic() do { } while (0)
+#define __smp_mb__after_atomic() do { } while (0)
+
+/* Writing to CR3 provides a full memory barrier in switch_mm(). */
+#define smp_mb__after_switch_mm() do { } while (0)
+
+#include <asm-generic/barrier.h>
+
+#endif /* _ASM_X86_BARRIER_H */
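
The cmp/sbb sequence in array_index_mask_nospec() above produces an all-ones mask when the index is in bounds (the compare borrows, so subtracting a register from itself with borrow yields -1) and zero otherwise. A hedged sketch of how such a mask is typically consumed, mirroring the generic array_index_nospec() pattern; the asm version exists precisely because a plain C comparison may be compiled into a predictable branch:

/* Illustrative only: clamp an index to 0 when the bounds check fails. */
static inline unsigned long clamp_index_sketch(unsigned long idx,
					       unsigned long size)
{
	/* 0UL - 1 == ~0UL when in bounds, 0UL - 0 == 0 when out of bounds. */
	unsigned long mask = 0UL - (unsigned long)(idx < size);

	/*
	 * ANDing forces an out-of-bounds index to 0, so a mispredicted
	 * speculative load cannot dereference attacker-controlled offsets.
	 */
	return idx & mask;
}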
diff --git a/arch/x86/include/asm/bios_ebda.h b/arch/x86/include/asm/bios_ebda.h
index 3c7521063d3f..4d5a17e2febe 100644
--- a/arch/x86/include/asm/bios_ebda.h
+++ b/arch/x86/include/asm/bios_ebda.h
@@ -1,20 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_BIOS_EBDA_H
#define _ASM_X86_BIOS_EBDA_H
#include <asm/io.h>
/*
- * there is a real-mode segmented pointer pointing to the
- * 4K EBDA area at 0x40E.
+ * Returns physical address of EBDA. Returns 0 if there is no EBDA.
*/
static inline unsigned int get_bios_ebda(void)
{
+ /*
+ * There is a real-mode segmented pointer pointing to the
+ * 4K EBDA area at 0x40E.
+ */
unsigned int address = *(unsigned short *)phys_to_virt(0x40E);
address <<= 4;
return address; /* 0 means none */
}
-void reserve_ebda_region(void);
+void reserve_bios_regions(void);
#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
/*
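
get_bios_ebda() above converts the real-mode segment value stored at physical address 0x40E into a linear address by shifting it left four bits, the usual real-mode segment*16 rule. A tiny sketch of that conversion (the function name is illustrative):

/* Real-mode segment to physical address: segment * 16. */
static inline unsigned int ebda_phys_from_segment(unsigned short seg)
{
	return (unsigned int)seg << 4;	/* e.g. 0x9FC0 -> 0x0009FC00 */
}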
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 02b47a603fc8..c2ce213f2b9b 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_BITOPS_H
#define _ASM_X86_BITOPS_H
@@ -14,6 +15,18 @@
#include <linux/compiler.h>
#include <asm/alternative.h>
+#include <asm/rmwcc.h>
+#include <asm/barrier.h>
+
+#if BITS_PER_LONG == 32
+# define _BITOPS_LONG_SHIFT 5
+#elif BITS_PER_LONG == 64
+# define _BITOPS_LONG_SHIFT 6
+#else
+# error "Unexpected BITS_PER_LONG"
+#endif
+
+#define BIT_64(n) (U64_C(1) << (n))
/*
* These have to be done with inline assembly: that way the bit-setting
@@ -23,320 +36,217 @@
* bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
*/
-#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1)
-/* Technically wrong, but this avoids compilation errors on some gcc
- versions. */
-#define BITOP_ADDR(x) "=m" (*(volatile long *) (x))
-#else
-#define BITOP_ADDR(x) "+m" (*(volatile long *) (x))
-#endif
+#define RLONG_ADDR(x) "m" (*(volatile long *) (x))
+#define WBYTE_ADDR(x) "+m" (*(volatile char *) (x))
-#define ADDR BITOP_ADDR(addr)
+#define ADDR RLONG_ADDR(addr)
/*
* We do the locked ops that don't return the old value as
* a mask operation on a byte.
*/
-#define IS_IMMEDIATE(nr) (__builtin_constant_p(nr))
-#define CONST_MASK_ADDR(nr, addr) BITOP_ADDR((void *)(addr) + ((nr)>>3))
+#define CONST_MASK_ADDR(nr, addr) WBYTE_ADDR((void *)(addr) + ((nr)>>3))
#define CONST_MASK(nr) (1 << ((nr) & 7))
-/**
- * set_bit - Atomically set a bit in memory
- * @nr: the bit to set
- * @addr: the address to start counting from
- *
- * This function is atomic and may not be reordered. See __set_bit()
- * if you do not require the atomic guarantees.
- *
- * Note: there are no guarantees that this function will not be reordered
- * on non x86 architectures, so if you are writing portable code,
- * make sure not to rely on its reordering guarantees.
- *
- * Note that @nr may be almost arbitrarily large; this function is not
- * restricted to acting on a single-word quantity.
- */
static __always_inline void
-set_bit(unsigned int nr, volatile unsigned long *addr)
+arch_set_bit(long nr, volatile unsigned long *addr)
{
- if (IS_IMMEDIATE(nr)) {
- asm volatile(LOCK_PREFIX "orb %1,%0"
+ if (__builtin_constant_p(nr)) {
+ asm_inline volatile(LOCK_PREFIX "orb %b1,%0"
: CONST_MASK_ADDR(nr, addr)
- : "iq" ((u8)CONST_MASK(nr))
+ : "iq" (CONST_MASK(nr))
: "memory");
} else {
- asm volatile(LOCK_PREFIX "bts %1,%0"
- : BITOP_ADDR(addr) : "Ir" (nr) : "memory");
+ asm_inline volatile(LOCK_PREFIX __ASM_SIZE(bts) " %1,%0"
+ : : RLONG_ADDR(addr), "Ir" (nr) : "memory");
}
}
-/**
- * __set_bit - Set a bit in memory
- * @nr: the bit to set
- * @addr: the address to start counting from
- *
- * Unlike set_bit(), this function is non-atomic and may be reordered.
- * If it's called on the same region of memory simultaneously, the effect
- * may be that only one operation succeeds.
- */
-static inline void __set_bit(int nr, volatile unsigned long *addr)
+static __always_inline void
+arch___set_bit(unsigned long nr, volatile unsigned long *addr)
{
- asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory");
+ asm volatile(__ASM_SIZE(bts) " %1,%0" : : ADDR, "Ir" (nr) : "memory");
}
-/**
- * clear_bit - Clears a bit in memory
- * @nr: Bit to clear
- * @addr: Address to start counting from
- *
- * clear_bit() is atomic and may not be reordered. However, it does
- * not contain a memory barrier, so if it is used for locking purposes,
- * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
- * in order to ensure changes are visible on other processors.
- */
static __always_inline void
-clear_bit(int nr, volatile unsigned long *addr)
+arch_clear_bit(long nr, volatile unsigned long *addr)
{
- if (IS_IMMEDIATE(nr)) {
- asm volatile(LOCK_PREFIX "andb %1,%0"
+ if (__builtin_constant_p(nr)) {
+ asm_inline volatile(LOCK_PREFIX "andb %b1,%0"
: CONST_MASK_ADDR(nr, addr)
- : "iq" ((u8)~CONST_MASK(nr)));
+ : "iq" (~CONST_MASK(nr)));
} else {
- asm volatile(LOCK_PREFIX "btr %1,%0"
- : BITOP_ADDR(addr)
- : "Ir" (nr));
+ asm_inline volatile(LOCK_PREFIX __ASM_SIZE(btr) " %1,%0"
+ : : RLONG_ADDR(addr), "Ir" (nr) : "memory");
}
}
-/*
- * clear_bit_unlock - Clears a bit in memory
- * @nr: Bit to clear
- * @addr: Address to start counting from
- *
- * clear_bit() is atomic and implies release semantics before the memory
- * operation. It can be used for an unlock.
- */
-static inline void clear_bit_unlock(unsigned nr, volatile unsigned long *addr)
+static __always_inline void
+arch_clear_bit_unlock(long nr, volatile unsigned long *addr)
{
barrier();
- clear_bit(nr, addr);
+ arch_clear_bit(nr, addr);
}
-static inline void __clear_bit(int nr, volatile unsigned long *addr)
+static __always_inline void
+arch___clear_bit(unsigned long nr, volatile unsigned long *addr)
{
- asm volatile("btr %1,%0" : ADDR : "Ir" (nr));
+ asm volatile(__ASM_SIZE(btr) " %1,%0" : : ADDR, "Ir" (nr) : "memory");
}
-/*
- * __clear_bit_unlock - Clears a bit in memory
- * @nr: Bit to clear
- * @addr: Address to start counting from
- *
- * __clear_bit() is non-atomic and implies release semantics before the memory
- * operation. It can be used for an unlock if no other CPUs can concurrently
- * modify other bits in the word.
- *
- * No memory barrier is required here, because x86 cannot reorder stores past
- * older loads. Same principle as spin_unlock.
- */
-static inline void __clear_bit_unlock(unsigned nr, volatile unsigned long *addr)
+static __always_inline bool arch_xor_unlock_is_negative_byte(unsigned long mask,
+ volatile unsigned long *addr)
{
- barrier();
- __clear_bit(nr, addr);
+ bool negative;
+ asm_inline volatile(LOCK_PREFIX "xorb %2,%1"
+ : "=@ccs" (negative), WBYTE_ADDR(addr)
+ : "iq" ((char)mask) : "memory");
+ return negative;
}
+#define arch_xor_unlock_is_negative_byte arch_xor_unlock_is_negative_byte
-#define smp_mb__before_clear_bit() barrier()
-#define smp_mb__after_clear_bit() barrier()
+static __always_inline void
+arch___clear_bit_unlock(long nr, volatile unsigned long *addr)
+{
+ arch___clear_bit(nr, addr);
+}
-/**
- * __change_bit - Toggle a bit in memory
- * @nr: the bit to change
- * @addr: the address to start counting from
- *
- * Unlike change_bit(), this function is non-atomic and may be reordered.
- * If it's called on the same region of memory simultaneously, the effect
- * may be that only one operation succeeds.
- */
-static inline void __change_bit(int nr, volatile unsigned long *addr)
+static __always_inline void
+arch___change_bit(unsigned long nr, volatile unsigned long *addr)
{
- asm volatile("btc %1,%0" : ADDR : "Ir" (nr));
+ asm volatile(__ASM_SIZE(btc) " %1,%0" : : ADDR, "Ir" (nr) : "memory");
}
-/**
- * change_bit - Toggle a bit in memory
- * @nr: Bit to change
- * @addr: Address to start counting from
- *
- * change_bit() is atomic and may not be reordered.
- * Note that @nr may be almost arbitrarily large; this function is not
- * restricted to acting on a single-word quantity.
- */
-static inline void change_bit(int nr, volatile unsigned long *addr)
+static __always_inline void
+arch_change_bit(long nr, volatile unsigned long *addr)
{
- if (IS_IMMEDIATE(nr)) {
- asm volatile(LOCK_PREFIX "xorb %1,%0"
+ if (__builtin_constant_p(nr)) {
+ asm_inline volatile(LOCK_PREFIX "xorb %b1,%0"
: CONST_MASK_ADDR(nr, addr)
- : "iq" ((u8)CONST_MASK(nr)));
+ : "iq" (CONST_MASK(nr)));
} else {
- asm volatile(LOCK_PREFIX "btc %1,%0"
- : BITOP_ADDR(addr)
- : "Ir" (nr));
+ asm_inline volatile(LOCK_PREFIX __ASM_SIZE(btc) " %1,%0"
+ : : RLONG_ADDR(addr), "Ir" (nr) : "memory");
}
}
-/**
- * test_and_set_bit - Set a bit and return its old value
- * @nr: Bit to set
- * @addr: Address to count from
- *
- * This operation is atomic and cannot be reordered.
- * It also implies a memory barrier.
- */
-static inline int test_and_set_bit(int nr, volatile unsigned long *addr)
+static __always_inline bool
+arch_test_and_set_bit(long nr, volatile unsigned long *addr)
{
- int oldbit;
+ return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(bts), *addr, c, "Ir", nr);
+}
- asm volatile(LOCK_PREFIX "bts %2,%1\n\t"
- "sbb %0,%0" : "=r" (oldbit), ADDR : "Ir" (nr) : "memory");
+static __always_inline bool
+arch_test_and_set_bit_lock(long nr, volatile unsigned long *addr)
+{
+ return arch_test_and_set_bit(nr, addr);
+}
+
+static __always_inline bool
+arch___test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
+{
+ bool oldbit;
+ asm(__ASM_SIZE(bts) " %2,%1"
+ : "=@ccc" (oldbit)
+ : ADDR, "Ir" (nr) : "memory");
return oldbit;
}
-/**
- * test_and_set_bit_lock - Set a bit and return its old value for lock
- * @nr: Bit to set
- * @addr: Address to count from
- *
- * This is the same as test_and_set_bit on x86.
- */
-static __always_inline int
-test_and_set_bit_lock(int nr, volatile unsigned long *addr)
+static __always_inline bool
+arch_test_and_clear_bit(long nr, volatile unsigned long *addr)
{
- return test_and_set_bit(nr, addr);
+ return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btr), *addr, c, "Ir", nr);
}
-/**
- * __test_and_set_bit - Set a bit and return its old value
- * @nr: Bit to set
- * @addr: Address to count from
- *
- * This operation is non-atomic and can be reordered.
- * If two examples of this operation race, one can appear to succeed
- * but actually fail. You must protect multiple accesses with a lock.
+/*
+ * Note: the operation is performed atomically with respect to
+ * the local CPU, but not other CPUs. Portable code should not
+ * rely on this behaviour.
+ * KVM relies on this behaviour on x86 for modifying memory that is also
+ * accessed from a hypervisor on the same CPU if running in a VM: don't change
+ * this without also updating arch/x86/kernel/kvm.c
*/
-static inline int __test_and_set_bit(int nr, volatile unsigned long *addr)
+static __always_inline bool
+arch___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
{
- int oldbit;
+ bool oldbit;
- asm("bts %2,%1\n\t"
- "sbb %0,%0"
- : "=r" (oldbit), ADDR
- : "Ir" (nr));
+ asm volatile(__ASM_SIZE(btr) " %2,%1"
+ : "=@ccc" (oldbit)
+ : ADDR, "Ir" (nr) : "memory");
return oldbit;
}
-/**
- * test_and_clear_bit - Clear a bit and return its old value
- * @nr: Bit to clear
- * @addr: Address to count from
- *
- * This operation is atomic and cannot be reordered.
- * It also implies a memory barrier.
- */
-static inline int test_and_clear_bit(int nr, volatile unsigned long *addr)
+static __always_inline bool
+arch___test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
{
- int oldbit;
+ bool oldbit;
- asm volatile(LOCK_PREFIX "btr %2,%1\n\t"
- "sbb %0,%0"
- : "=r" (oldbit), ADDR : "Ir" (nr) : "memory");
+ asm volatile(__ASM_SIZE(btc) " %2,%1"
+ : "=@ccc" (oldbit)
+ : ADDR, "Ir" (nr) : "memory");
return oldbit;
}
-/**
- * __test_and_clear_bit - Clear a bit and return its old value
- * @nr: Bit to clear
- * @addr: Address to count from
- *
- * This operation is non-atomic and can be reordered.
- * If two examples of this operation race, one can appear to succeed
- * but actually fail. You must protect multiple accesses with a lock.
- */
-static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
+static __always_inline bool
+arch_test_and_change_bit(long nr, volatile unsigned long *addr)
{
- int oldbit;
+ return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btc), *addr, c, "Ir", nr);
+}
- asm volatile("btr %2,%1\n\t"
- "sbb %0,%0"
- : "=r" (oldbit), ADDR
- : "Ir" (nr));
- return oldbit;
+static __always_inline bool constant_test_bit(long nr, const volatile unsigned long *addr)
+{
+ return ((1UL << (nr & (BITS_PER_LONG-1))) &
+ (addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
}
-/* WARNING: non atomic and it can be reordered! */
-static inline int __test_and_change_bit(int nr, volatile unsigned long *addr)
+static __always_inline bool constant_test_bit_acquire(long nr, const volatile unsigned long *addr)
{
- int oldbit;
+ bool oldbit;
- asm volatile("btc %2,%1\n\t"
- "sbb %0,%0"
- : "=r" (oldbit), ADDR
- : "Ir" (nr) : "memory");
+ asm volatile("testb %2,%1"
+ : "=@ccnz" (oldbit)
+ : "m" (((unsigned char *)addr)[nr >> 3]),
+ "i" (1 << (nr & 7))
+ :"memory");
return oldbit;
}
-/**
- * test_and_change_bit - Change a bit and return its old value
- * @nr: Bit to change
- * @addr: Address to count from
- *
- * This operation is atomic and cannot be reordered.
- * It also implies a memory barrier.
- */
-static inline int test_and_change_bit(int nr, volatile unsigned long *addr)
+static __always_inline bool variable_test_bit(long nr, volatile const unsigned long *addr)
{
- int oldbit;
+ bool oldbit;
- asm volatile(LOCK_PREFIX "btc %2,%1\n\t"
- "sbb %0,%0"
- : "=r" (oldbit), ADDR : "Ir" (nr) : "memory");
+ asm volatile(__ASM_SIZE(bt) " %2,%1"
+ : "=@ccc" (oldbit)
+ : "m" (*(unsigned long *)addr), "Ir" (nr) : "memory");
return oldbit;
}
-static __always_inline int constant_test_bit(unsigned int nr, const volatile unsigned long *addr)
+static __always_inline bool
+arch_test_bit(unsigned long nr, const volatile unsigned long *addr)
{
- return ((1UL << (nr % BITS_PER_LONG)) &
- (((unsigned long *)addr)[nr / BITS_PER_LONG])) != 0;
+ return __builtin_constant_p(nr) ? constant_test_bit(nr, addr) :
+ variable_test_bit(nr, addr);
}
-static inline int variable_test_bit(int nr, volatile const unsigned long *addr)
+static __always_inline bool
+arch_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr)
{
- int oldbit;
-
- asm volatile("bt %2,%1\n\t"
- "sbb %0,%0"
- : "=r" (oldbit)
- : "m" (*(unsigned long *)addr), "Ir" (nr));
-
- return oldbit;
+ return __builtin_constant_p(nr) ? constant_test_bit_acquire(nr, addr) :
+ variable_test_bit(nr, addr);
}
-#if 0 /* Fool kernel-doc since it doesn't do macros yet */
-/**
- * test_bit - Determine whether a bit is set
- * @nr: bit number to test
- * @addr: Address to start counting from
- */
-static int test_bit(int nr, const volatile unsigned long *addr);
-#endif
-
-#define test_bit(nr, addr) \
- (__builtin_constant_p((nr)) \
- ? constant_test_bit((nr), (addr)) \
- : variable_test_bit((nr), (addr)))
+static __always_inline __attribute_const__ unsigned long variable__ffs(unsigned long word)
+{
+ asm("tzcnt %1,%0"
+ : "=r" (word)
+ : ASM_INPUT_RM (word));
+ return word;
+}
/**
* __ffs - find first set bit in word
@@ -344,12 +254,14 @@ static int test_bit(int nr, const volatile unsigned long *addr);
*
* Undefined if no bit exists, so code should check against 0 first.
*/
-static inline unsigned long __ffs(unsigned long word)
+#define __ffs(word) \
+ (__builtin_constant_p(word) ? \
+ (unsigned long)__builtin_ctzl(word) : \
+ variable__ffs(word))
+
+static __always_inline __attribute_const__ unsigned long variable_ffz(unsigned long word)
{
- asm("bsf %1,%0"
- : "=r" (word)
- : "rm" (word));
- return word;
+ return variable__ffs(~word);
}
/**
@@ -358,13 +270,10 @@ static inline unsigned long __ffs(unsigned long word)
*
* Undefined if no zero exists, so code should check against ~0UL first.
*/
-static inline unsigned long ffz(unsigned long word)
-{
- asm("bsf %1,%0"
- : "=r" (word)
- : "r" (~word));
- return word;
-}
+#define ffz(word) \
+ (__builtin_constant_p(word) ? \
+ (unsigned long)__builtin_ctzl(~word) : \
+ variable_ffz(word))
/*
* __fls: find last set bit in word
@@ -372,33 +281,41 @@ static inline unsigned long ffz(unsigned long word)
*
* Undefined if no set bit exists, so code should check against 0 first.
*/
-static inline unsigned long __fls(unsigned long word)
+static __always_inline __attribute_const__ unsigned long __fls(unsigned long word)
{
+ if (__builtin_constant_p(word))
+ return BITS_PER_LONG - 1 - __builtin_clzl(word);
+
asm("bsr %1,%0"
: "=r" (word)
- : "rm" (word));
+ : ASM_INPUT_RM (word));
return word;
}
+#undef ADDR
+
#ifdef __KERNEL__
-/**
- * ffs - find first set bit in word
- * @x: the word to search
- *
- * This is defined the same way as the libc and compiler builtin ffs
- * routines, therefore differs in spirit from the other bitops.
- *
- * ffs(value) returns 0 if value is 0 or the position of the first
- * set bit if value is nonzero. The first (least significant) bit
- * is at position 1.
- */
-static inline int ffs(int x)
+static __always_inline __attribute_const__ int variable_ffs(int x)
{
int r;
-#ifdef CONFIG_X86_CMOV
+
+#ifdef CONFIG_X86_64
+ /*
+ * AMD64 says BSFL won't clobber the dest reg if x==0; Intel64 says the
+ * dest reg is undefined if x==0, but their CPU architect says its
+ * value is written to set it to the same as before, except that the
+ * top 32 bits will be cleared.
+ *
+ * We cannot do this on 32 bits because at the very least some
+ * 486 CPUs did not behave this way.
+ */
+ asm("bsfl %1,%0"
+ : "=r" (r)
+ : ASM_INPUT_RM (x), "0" (-1));
+#elif defined(CONFIG_X86_CMOV)
asm("bsfl %1,%0\n\t"
"cmovzl %2,%0"
- : "=r" (r) : "rm" (x), "r" (-1));
+ : "=&r" (r) : "rm" (x), "r" (-1));
#else
asm("bsfl %1,%0\n\t"
"jnz 1f\n\t"
@@ -409,6 +326,19 @@ static inline int ffs(int x)
}
/**
+ * ffs - find first set bit in word
+ * @x: the word to search
+ *
+ * This is defined the same way as the libc and compiler builtin ffs
+ * routines, therefore differs in spirit from the other bitops.
+ *
+ * ffs(value) returns 0 if value is 0 or the position of the first
+ * set bit if value is nonzero. The first (least significant) bit
+ * is at position 1.
+ */
+#define ffs(x) (__builtin_constant_p(x) ? __builtin_ffs(x) : variable_ffs(x))
+
+/**
* fls - find last set bit in word
* @x: the word to search
*
@@ -419,10 +349,27 @@ static inline int ffs(int x)
* set bit if value is nonzero. The last (most significant) bit is
* at position 32.
*/
-static inline int fls(int x)
+static __always_inline __attribute_const__ int fls(unsigned int x)
{
int r;
-#ifdef CONFIG_X86_CMOV
+
+ if (__builtin_constant_p(x))
+ return x ? 32 - __builtin_clz(x) : 0;
+
+#ifdef CONFIG_X86_64
+ /*
+ * AMD64 says BSRL won't clobber the dest reg if x==0; Intel64 says the
+ * dest reg is undefined if x==0, but their CPU architect says its
+ * value is written to set it to the same as before, except that the
+ * top 32 bits will be cleared.
+ *
+ * We cannot do this on 32 bits because at the very least some
+ * 486 CPUs did not behave this way.
+ */
+ asm("bsrl %1,%0"
+ : "=r" (r)
+ : ASM_INPUT_RM (x), "0" (-1));
+#elif defined(CONFIG_X86_CMOV)
asm("bsrl %1,%0\n\t"
"cmovzl %2,%0"
: "=&r" (r) : "rm" (x), "rm" (-1));
@@ -434,32 +381,52 @@ static inline int fls(int x)
#endif
return r + 1;
}
-#endif /* __KERNEL__ */
-
-#undef ADDR
-#ifdef __KERNEL__
+/**
+ * fls64 - find last set bit in a 64-bit word
+ * @x: the word to search
+ *
+ * This is defined in a similar way as the libc and compiler builtin
+ * ffsll, but returns the position of the most significant set bit.
+ *
+ * fls64(value) returns 0 if value is 0 or the position of the last
+ * set bit if value is nonzero. The last (most significant) bit is
+ * at position 64.
+ */
+#ifdef CONFIG_X86_64
+static __always_inline __attribute_const__ int fls64(__u64 x)
+{
+ int bitpos = -1;
+
+ if (__builtin_constant_p(x))
+ return x ? 64 - __builtin_clzll(x) : 0;
+ /*
+ * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the
+ * dest reg is undefined if x==0, but their CPU architect says its
+ * value is written to set it to the same as before.
+ */
+ asm("bsrq %1,%q0"
+ : "+r" (bitpos)
+ : ASM_INPUT_RM (x));
+ return bitpos + 1;
+}
+#else
+#include <asm-generic/bitops/fls64.h>
+#endif
#include <asm-generic/bitops/sched.h>
-#define ARCH_HAS_FAST_MULTIPLIER 1
-
-#include <asm-generic/bitops/hweight.h>
-
-#endif /* __KERNEL__ */
-
-#include <asm-generic/bitops/fls64.h>
+#include <asm/arch_hweight.h>
-#ifdef __KERNEL__
+#include <asm-generic/bitops/const_hweight.h>
-#include <asm-generic/bitops/ext2-non-atomic.h>
+#include <asm-generic/bitops/instrumented-atomic.h>
+#include <asm-generic/bitops/instrumented-non-atomic.h>
+#include <asm-generic/bitops/instrumented-lock.h>
-#define ext2_set_bit_atomic(lock, nr, addr) \
- test_and_set_bit((nr), (unsigned long *)(addr))
-#define ext2_clear_bit_atomic(lock, nr, addr) \
- test_and_clear_bit((nr), (unsigned long *)(addr))
+#include <asm-generic/bitops/le.h>
-#include <asm-generic/bitops/minix.h>
+#include <asm-generic/bitops/ext2-atomic-setbit.h>
#endif /* __KERNEL__ */
#endif /* _ASM_X86_BITOPS_H */
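
For a compile-time constant bit number, the atomic bitops above avoid bts/btr/btc entirely and operate on the single byte containing the bit: CONST_MASK_ADDR() adds nr>>3 to the base address and CONST_MASK() builds a 1<<(nr&7) mask for a lock-prefixed byte instruction. A non-atomic plain-C sketch of just the address and mask arithmetic (illustrative; the kernel keeps the byte operation atomic):

#include <stdint.h>

static inline void set_bit_byte_sketch(unsigned long nr, volatile uint8_t *base)
{
	volatile uint8_t *byte = base + (nr >> 3);	/* byte holding bit nr  */
	uint8_t mask = (uint8_t)(1u << (nr & 7));	/* bit within that byte */

	*byte |= mask;	/* the real code uses a LOCK-prefixed orb/andb/xorb */
}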
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 7a1065958ba9..f7b67cb73915 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -1,51 +1,100 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_BOOT_H
#define _ASM_X86_BOOT_H
-/* Internal svga startup constants */
-#define NORMAL_VGA 0xffff /* 80x25 mode */
-#define EXTENDED_VGA 0xfffe /* 80x50 mode */
-#define ASK_VGA 0xfffd /* ask for it at bootup */
-
-#ifdef __KERNEL__
#include <asm/pgtable_types.h>
-
-/* Physical address where kernel should be loaded. */
-#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \
- + (CONFIG_PHYSICAL_ALIGN - 1)) \
- & ~(CONFIG_PHYSICAL_ALIGN - 1))
+#include <uapi/asm/boot.h>
/* Minimum kernel alignment, as a power of two */
#ifdef CONFIG_X86_64
-#define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT
+# define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT
#else
-#define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT + THREAD_ORDER)
+# define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT + THREAD_SIZE_ORDER)
#endif
#define MIN_KERNEL_ALIGN (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2)
#if (CONFIG_PHYSICAL_ALIGN & (CONFIG_PHYSICAL_ALIGN-1)) || \
- (CONFIG_PHYSICAL_ALIGN < (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2))
-#error "Invalid value for CONFIG_PHYSICAL_ALIGN"
+ (CONFIG_PHYSICAL_ALIGN < MIN_KERNEL_ALIGN)
+# error "Invalid value for CONFIG_PHYSICAL_ALIGN"
#endif
-#ifdef CONFIG_KERNEL_BZIP2
-#define BOOT_HEAP_SIZE 0x400000
-#else /* !CONFIG_KERNEL_BZIP2 */
-
-#ifdef CONFIG_X86_64
-#define BOOT_HEAP_SIZE 0x7000
+#if defined(CONFIG_KERNEL_BZIP2)
+# define BOOT_HEAP_SIZE 0x400000
+#elif defined(CONFIG_KERNEL_ZSTD)
+/*
+ * Zstd needs to allocate the ZSTD_DCtx in order to decompress the kernel.
+ * The ZSTD_DCtx is ~160KB, so set the heap size to 192KB because it is a
+ * round number and to allow some slack.
+ */
+# define BOOT_HEAP_SIZE 0x30000
#else
-#define BOOT_HEAP_SIZE 0x4000
+# define BOOT_HEAP_SIZE 0x10000
#endif
-#endif /* !CONFIG_KERNEL_BZIP2 */
-
#ifdef CONFIG_X86_64
-#define BOOT_STACK_SIZE 0x4000
-#else
-#define BOOT_STACK_SIZE 0x1000
+# define BOOT_STACK_SIZE 0x4000
+
+/*
+ * Used by decompressor's startup_32() to allocate page tables for identity
+ * mapping of the 4G of RAM in 4-level paging mode:
+ * - 1 level4 table;
+ * - 1 level3 table;
+ * - 4 level2 table that maps everything with 2M pages;
+ *
+ * The additional level5 table needed for 5-level paging is allocated from
+ * trampoline_32bit memory.
+ */
+# define BOOT_INIT_PGT_SIZE (6*4096)
+
+/*
+ * Total number of page tables kernel_add_identity_map() can allocate,
+ * including page tables consumed by startup_32().
+ *
+ * Worst-case scenario:
+ * - 5-level paging needs 1 level5 table;
+ * - KASLR needs to map kernel, boot_params, cmdline and randomized kernel,
+ * assuming all of them cross 256T boundary:
+ * + 4*2 level4 table;
+ * + 4*2 level3 table;
+ * + 4*2 level2 table;
+ * - X86_VERBOSE_BOOTUP needs to map the first 2M (video RAM):
+ * + 1 level4 table;
+ * + 1 level3 table;
+ * + 1 level2 table;
+ * Total: 28 tables
+ *
+ * Add 4 spare tables in case the decompressor touches anything beyond what is
+ * accounted for above. Warn if it happens.
+ */
+# define BOOT_PGT_SIZE_WARN (28*4096)
+# define BOOT_PGT_SIZE (32*4096)
+
+#else /* !CONFIG_X86_64 */
+# define BOOT_STACK_SIZE 0x1000
#endif
-#endif /* __KERNEL__ */
+#define TRAMPOLINE_32BIT_SIZE (2 * PAGE_SIZE)
+
+#define TRAMPOLINE_32BIT_CODE_OFFSET PAGE_SIZE
+#define TRAMPOLINE_32BIT_CODE_SIZE 0xA0
+
+#ifndef __ASSEMBLER__
+extern unsigned int output_len;
+extern const unsigned long kernel_text_size;
+extern const unsigned long kernel_inittext_offset;
+extern const unsigned long kernel_inittext_size;
+extern const unsigned long kernel_total_size;
+
+unsigned long decompress_kernel(unsigned char *outbuf, unsigned long virt_addr,
+ void (*error)(char *x));
+
+extern struct boot_params *boot_params_ptr;
+extern unsigned long *trampoline_32bit;
+extern const u16 trampoline_ljmp_imm_offset;
+
+void trampoline_32bit_src(void *trampoline, bool enable_5lvl);
+
+#endif
#endif /* _ASM_X86_BOOT_H */
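
The BOOT_PGT_SIZE_WARN accounting above works out as follows: one level-5 table, plus four KASLR-mapped regions that may each straddle a 256T boundary and therefore need two level-4, two level-3 and two level-2 tables apiece, plus three tables for the verbose-boot video RAM mapping. A one-line check of that arithmetic (illustrative only):

/* 1 (level5) + 4 regions * 2 crossings * 3 levels + 3 (video RAM) == 28 */
enum { BOOT_PGT_WORST_CASE_SKETCH = 1 + 4 * 2 * 3 + 3 };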
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
deleted file mode 100644
index 1724e8de317c..000000000000
--- a/arch/x86/include/asm/bootparam.h
+++ /dev/null
@@ -1,112 +0,0 @@
-#ifndef _ASM_X86_BOOTPARAM_H
-#define _ASM_X86_BOOTPARAM_H
-
-#include <linux/types.h>
-#include <linux/screen_info.h>
-#include <linux/apm_bios.h>
-#include <linux/edd.h>
-#include <asm/e820.h>
-#include <asm/ist.h>
-#include <video/edid.h>
-
-/* setup data types */
-#define SETUP_NONE 0
-#define SETUP_E820_EXT 1
-
-/* extensible setup data list node */
-struct setup_data {
- __u64 next;
- __u32 type;
- __u32 len;
- __u8 data[0];
-};
-
-struct setup_header {
- __u8 setup_sects;
- __u16 root_flags;
- __u32 syssize;
- __u16 ram_size;
-#define RAMDISK_IMAGE_START_MASK 0x07FF
-#define RAMDISK_PROMPT_FLAG 0x8000
-#define RAMDISK_LOAD_FLAG 0x4000
- __u16 vid_mode;
- __u16 root_dev;
- __u16 boot_flag;
- __u16 jump;
- __u32 header;
- __u16 version;
- __u32 realmode_swtch;
- __u16 start_sys;
- __u16 kernel_version;
- __u8 type_of_loader;
- __u8 loadflags;
-#define LOADED_HIGH (1<<0)
-#define QUIET_FLAG (1<<5)
-#define KEEP_SEGMENTS (1<<6)
-#define CAN_USE_HEAP (1<<7)
- __u16 setup_move_size;
- __u32 code32_start;
- __u32 ramdisk_image;
- __u32 ramdisk_size;
- __u32 bootsect_kludge;
- __u16 heap_end_ptr;
- __u8 ext_loader_ver;
- __u8 ext_loader_type;
- __u32 cmd_line_ptr;
- __u32 initrd_addr_max;
- __u32 kernel_alignment;
- __u8 relocatable_kernel;
- __u8 _pad2[3];
- __u32 cmdline_size;
- __u32 hardware_subarch;
- __u64 hardware_subarch_data;
- __u32 payload_offset;
- __u32 payload_length;
- __u64 setup_data;
-} __attribute__((packed));
-
-struct sys_desc_table {
- __u16 length;
- __u8 table[14];
-};
-
-struct efi_info {
- __u32 efi_loader_signature;
- __u32 efi_systab;
- __u32 efi_memdesc_size;
- __u32 efi_memdesc_version;
- __u32 efi_memmap;
- __u32 efi_memmap_size;
- __u32 efi_systab_hi;
- __u32 efi_memmap_hi;
-};
-
-/* The so-called "zeropage" */
-struct boot_params {
- struct screen_info screen_info; /* 0x000 */
- struct apm_bios_info apm_bios_info; /* 0x040 */
- __u8 _pad2[12]; /* 0x054 */
- struct ist_info ist_info; /* 0x060 */
- __u8 _pad3[16]; /* 0x070 */
- __u8 hd0_info[16]; /* obsolete! */ /* 0x080 */
- __u8 hd1_info[16]; /* obsolete! */ /* 0x090 */
- struct sys_desc_table sys_desc_table; /* 0x0a0 */
- __u8 _pad4[144]; /* 0x0b0 */
- struct edid_info edid_info; /* 0x140 */
- struct efi_info efi_info; /* 0x1c0 */
- __u32 alt_mem_k; /* 0x1e0 */
- __u32 scratch; /* Scratch field! */ /* 0x1e4 */
- __u8 e820_entries; /* 0x1e8 */
- __u8 eddbuf_entries; /* 0x1e9 */
- __u8 edd_mbr_sig_buf_entries; /* 0x1ea */
- __u8 _pad6[6]; /* 0x1eb */
- struct setup_header hdr; /* setup header */ /* 0x1f1 */
- __u8 _pad7[0x290-0x1f1-sizeof(struct setup_header)];
- __u32 edd_mbr_sig_buffer[EDD_MBR_SIG_MAX]; /* 0x290 */
- struct e820entry e820_map[E820MAX]; /* 0x2d0 */
- __u8 _pad8[48]; /* 0xcd0 */
- struct edd_info eddbuf[EDDMAXNR]; /* 0xd00 */
- __u8 _pad9[276]; /* 0xeec */
-} __attribute__((packed));
-
-#endif /* _ASM_X86_BOOTPARAM_H */
diff --git a/arch/x86/include/asm/bootparam_utils.h b/arch/x86/include/asm/bootparam_utils.h
new file mode 100644
index 000000000000..d90ae472fb76
--- /dev/null
+++ b/arch/x86/include/asm/bootparam_utils.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_BOOTPARAM_UTILS_H
+#define _ASM_X86_BOOTPARAM_UTILS_H
+
+#include <asm/bootparam.h>
+
+/*
+ * This file is included from multiple environments. Do not
+ * add completing #includes to make it standalone.
+ */
+
+/*
+ * Deal with bootloaders which fail to initialize unknown fields in
+ * boot_params to zero. The fields in this list are taken from
+ * analysis of kexec-tools; if other broken bootloaders initialize a
+ * different set of fields we will need to figure out how to disambiguate.
+ *
+ * Note: efi_info is commonly left uninitialized, but that field has a
+ * private magic, so it is better to leave it unchanged.
+ */
+
+#define sizeof_mbr(type, member) ({ sizeof(((type *)0)->member); })
+
+#define BOOT_PARAM_PRESERVE(struct_member) \
+ { \
+ .start = offsetof(struct boot_params, struct_member), \
+ .len = sizeof_mbr(struct boot_params, struct_member), \
+ }
+
+struct boot_params_to_save {
+ unsigned int start;
+ unsigned int len;
+};
+
+static void sanitize_boot_params(struct boot_params *boot_params)
+{
+ /*
+ * IMPORTANT NOTE TO BOOTLOADER AUTHORS: do not simply clear
+ * this field. The purpose of this field is to guarantee
+ * compliance with the x86 boot spec located in
+ * Documentation/arch/x86/boot.rst . That spec says that the
+ * *whole* structure should be cleared, after which only the
+ * portion defined by struct setup_header (boot_params->hdr)
+ * should be copied in.
+ *
+ * If you're having an issue because the sentinel is set, you
+ * need to change the whole structure to be cleared, not this
+ * (or any other) individual field, or you will soon have
+ * problems again.
+ */
+ if (boot_params->sentinel) {
+ static struct boot_params scratch;
+ char *bp_base = (char *)boot_params;
+ char *save_base = (char *)&scratch;
+ int i;
+
+ const struct boot_params_to_save to_save[] = {
+ BOOT_PARAM_PRESERVE(screen_info),
+ BOOT_PARAM_PRESERVE(apm_bios_info),
+ BOOT_PARAM_PRESERVE(tboot_addr),
+ BOOT_PARAM_PRESERVE(ist_info),
+ BOOT_PARAM_PRESERVE(hd0_info),
+ BOOT_PARAM_PRESERVE(hd1_info),
+ BOOT_PARAM_PRESERVE(sys_desc_table),
+ BOOT_PARAM_PRESERVE(olpc_ofw_header),
+ BOOT_PARAM_PRESERVE(efi_info),
+ BOOT_PARAM_PRESERVE(alt_mem_k),
+ BOOT_PARAM_PRESERVE(scratch),
+ BOOT_PARAM_PRESERVE(e820_entries),
+ BOOT_PARAM_PRESERVE(eddbuf_entries),
+ BOOT_PARAM_PRESERVE(edd_mbr_sig_buf_entries),
+ BOOT_PARAM_PRESERVE(edd_mbr_sig_buffer),
+ BOOT_PARAM_PRESERVE(secure_boot),
+ BOOT_PARAM_PRESERVE(hdr),
+ BOOT_PARAM_PRESERVE(e820_table),
+ BOOT_PARAM_PRESERVE(eddbuf),
+ BOOT_PARAM_PRESERVE(cc_blob_address),
+ };
+
+ memset(&scratch, 0, sizeof(scratch));
+
+ for (i = 0; i < ARRAY_SIZE(to_save); i++) {
+ memcpy(save_base + to_save[i].start,
+ bp_base + to_save[i].start, to_save[i].len);
+ }
+
+ memcpy(boot_params, save_base, sizeof(*boot_params));
+ }
+}
+
+#endif /* _ASM_X86_BOOTPARAM_UTILS_H */
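
sanitize_boot_params() above implements a general pattern: describe the fields to keep as (offset, length) pairs built with offsetof()/sizeof(), copy only those spans into a zeroed scratch buffer, then copy the scratch buffer back so every unlisted field ends up zero. A self-contained sketch of the pattern applied to a made-up struct (all names here are illustrative):

#include <stddef.h>
#include <string.h>

struct example { int keep_a; int drop_b; long keep_c; };
struct field_span { size_t start, len; };

#define PRESERVE(m) \
	{ offsetof(struct example, m), sizeof(((struct example *)0)->m) }

static void sanitize_example(struct example *e)
{
	static const struct field_span keep[] = { PRESERVE(keep_a), PRESERVE(keep_c) };
	struct example scratch;
	size_t i;

	memset(&scratch, 0, sizeof(scratch));		/* unlisted fields -> 0 */
	for (i = 0; i < sizeof(keep) / sizeof(keep[0]); i++)
		memcpy((char *)&scratch + keep[i].start,
		       (const char *)e + keep[i].start, keep[i].len);
	memcpy(e, &scratch, sizeof(*e));
}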
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index d9cf1cd156d2..d561a8443c13 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -1,39 +1,195 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_BUG_H
#define _ASM_X86_BUG_H
-#ifdef CONFIG_BUG
-#define HAVE_ARCH_BUG
+#include <linux/stringify.h>
+#include <linux/instrumentation.h>
+#include <linux/objtool.h>
+#include <asm/asm.h>
+
+#ifndef __ASSEMBLY__
+struct bug_entry;
+extern void __WARN_trap(struct bug_entry *bug, ...);
+#endif
+
+/*
+ * Although some emulators terminate on UD2, we still use it for WARN().
+ */
+#define ASM_UD2 _ASM_BYTES(0x0f, 0x0b)
+#define INSN_UD2 0x0b0f
+#define LEN_UD2 2
+
+#define ASM_UDB _ASM_BYTES(0xd6)
+#define INSN_UDB 0xd6
+#define LEN_UDB 1
+
+/*
+ * With clang, UD1 instructions report UBSAN failures on x86, both 64-bit and 32-bit.
+ */
+#define INSN_ASOP 0x67
+#define INSN_LOCK 0xf0
+#define OPCODE_ESCAPE 0x0f
+#define SECOND_BYTE_OPCODE_UD1 0xb9
+#define SECOND_BYTE_OPCODE_UD2 0x0b
+
+#define BUG_NONE 0xffff
+#define BUG_UD2 0xfffe
+#define BUG_UD1 0xfffd
+#define BUG_UD1_UBSAN 0xfffc
+#define BUG_UD1_WARN 0xfffb
+#define BUG_UDB 0xffd6
+#define BUG_LOCK 0xfff0
+
+#ifdef CONFIG_GENERIC_BUG
#ifdef CONFIG_DEBUG_BUGVERBOSE
+#define __BUG_ENTRY_VERBOSE(file, line) \
+ "\t.long " file " - .\t# bug_entry::file\n" \
+ "\t.word " line "\t# bug_entry::line\n"
+#else
+#define __BUG_ENTRY_VERBOSE(file, line)
+#endif
-#ifdef CONFIG_X86_32
-# define __BUG_C0 "2:\t.long 1b, %c0\n"
+#if defined(CONFIG_X86_64) || defined(CONFIG_DEBUG_BUGVERBOSE_DETAILED)
+#define HAVE_ARCH_BUG_FORMAT
+#define __BUG_ENTRY_FORMAT(format) \
+ "\t.long " format " - .\t# bug_entry::format\n"
#else
-# define __BUG_C0 "2:\t.long 1b - 2b, %c0 - 2b\n"
+#define __BUG_ENTRY_FORMAT(format)
#endif
-#define BUG() \
-do { \
- asm volatile("1:\tud2\n" \
- ".pushsection __bug_table,\"a\"\n" \
- __BUG_C0 \
- "\t.word %c1, 0\n" \
- "\t.org 2b+%c2\n" \
- ".popsection" \
- : : "i" (__FILE__), "i" (__LINE__), \
- "i" (sizeof(struct bug_entry))); \
- for (;;) ; \
+#ifdef CONFIG_X86_64
+#define HAVE_ARCH_BUG_FORMAT_ARGS
+#endif
+
+#define __BUG_ENTRY(format, file, line, flags) \
+ "\t.long 1b - ." "\t# bug_entry::bug_addr\n" \
+ __BUG_ENTRY_FORMAT(format) \
+ __BUG_ENTRY_VERBOSE(file, line) \
+ "\t.word " flags "\t# bug_entry::flags\n"
+
+#define _BUG_FLAGS_ASM(format, file, line, flags, size, extra) \
+ ".pushsection __bug_table,\"aw\"\n\t" \
+ ANNOTATE_DATA_SPECIAL "\n\t" \
+ "2:\n\t" \
+ __BUG_ENTRY(format, file, line, flags) \
+ "\t.org 2b + " size "\n" \
+ ".popsection\n" \
+ extra
+
+#ifdef CONFIG_DEBUG_BUGVERBOSE_DETAILED
+#define WARN_CONDITION_STR(cond_str) cond_str
+#else
+#define WARN_CONDITION_STR(cond_str) ""
+#endif
+
+#define _BUG_FLAGS(cond_str, ins, flags, extra) \
+do { \
+ asm_inline volatile("1:\t" ins "\n" \
+ _BUG_FLAGS_ASM("%c[fmt]", "%c[file]", \
+ "%c[line]", "%c[fl]", \
+ "%c[size]", extra) \
+ : : [fmt] "i" (WARN_CONDITION_STR(cond_str)), \
+ [file] "i" (__FILE__), \
+ [line] "i" (__LINE__), \
+ [fl] "i" (flags), \
+ [size] "i" (sizeof(struct bug_entry))); \
} while (0)
+#define ARCH_WARN_ASM(file, line, flags, size) \
+ ".pushsection .rodata.str1.1, \"aMS\", @progbits, 1\n" \
+ "99:\n" \
+ "\t.string \"\"\n" \
+ ".popsection\n" \
+ "1:\t " ASM_UD2 "\n" \
+ _BUG_FLAGS_ASM("99b", file, line, flags, size, "")
+
#else
+
+#define _BUG_FLAGS(cond_str, ins, flags, extra) asm volatile(ins)
+
+#endif /* CONFIG_GENERIC_BUG */
+
+#define HAVE_ARCH_BUG
#define BUG() \
do { \
- asm volatile("ud2"); \
- for (;;) ; \
+ instrumentation_begin(); \
+ _BUG_FLAGS("", ASM_UD2, 0, ""); \
+ __builtin_unreachable(); \
+} while (0)
+
+/*
+ * This instrumentation_begin() is strictly speaking incorrect; but it
+ * suppresses the complaints from WARN()s in noinstr code. If such a WARN()
+ * were to trigger, we'd rather wreck the machine in an attempt to get the
+ * message out than not know about it.
+ */
+
+#define ARCH_WARN_REACHABLE ANNOTATE_REACHABLE(1b)
+
+#define __WARN_FLAGS(cond_str, flags) \
+do { \
+ auto __flags = BUGFLAG_WARNING|(flags); \
+ instrumentation_begin(); \
+ _BUG_FLAGS(cond_str, ASM_UD2, __flags, ARCH_WARN_REACHABLE); \
+ instrumentation_end(); \
} while (0)
-#endif
-#endif /* !CONFIG_BUG */
+#ifdef HAVE_ARCH_BUG_FORMAT_ARGS
+
+#ifndef __ASSEMBLY__
+#include <linux/static_call_types.h>
+DECLARE_STATIC_CALL(WARN_trap, __WARN_trap);
+
+struct pt_regs;
+struct sysv_va_list { /* from AMD64 System V ABI */
+ unsigned int gp_offset;
+ unsigned int fp_offset;
+ void *overflow_arg_area;
+ void *reg_save_area;
+};
+struct arch_va_list {
+ unsigned long regs[6];
+ struct sysv_va_list args;
+};
+extern void *__warn_args(struct arch_va_list *args, struct pt_regs *regs);
+#endif /* __ASSEMBLY__ */
+
+#define __WARN_bug_entry(flags, format) ({ \
+ struct bug_entry *bug; \
+ asm_inline volatile("lea (2f)(%%rip), %[addr]\n1:\n" \
+ _BUG_FLAGS_ASM("%c[fmt]", "%c[file]", \
+ "%c[line]", "%c[fl]", \
+ "%c[size]", "") \
+ : [addr] "=r" (bug) \
+ : [fmt] "i" (format), \
+ [file] "i" (__FILE__), \
+ [line] "i" (__LINE__), \
+ [fl] "i" (flags), \
+ [size] "i" (sizeof(struct bug_entry))); \
+ bug; })
+
+#define __WARN_print_arg(flags, format, arg...) \
+do { \
+ int __flags = (flags) | BUGFLAG_WARNING | BUGFLAG_ARGS ; \
+ static_call_mod(WARN_trap)(__WARN_bug_entry(__flags, format), ## arg); \
+ asm (""); /* inhibit tail-call optimization */ \
+} while (0)
+
+#define __WARN_printf(taint, fmt, arg...) \
+ __WARN_print_arg(BUGFLAG_TAINT(taint), fmt, ## arg)
+
+#define WARN_ONCE(cond, format, arg...) ({ \
+ int __ret_warn_on = !!(cond); \
+ if (unlikely(__ret_warn_on)) { \
+ __WARN_print_arg(BUGFLAG_ONCE|BUGFLAG_TAINT(TAINT_WARN),\
+ format, ## arg); \
+ } \
+ __ret_warn_on; \
+})
+
+#endif /* HAVE_ARCH_BUG_FORMAT_ARGS */
#include <asm-generic/bug.h>
+
#endif /* _ASM_X86_BUG_H */
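
The asm emitted by __BUG_ENTRY() above records, for every BUG()/WARN() site, a __bug_table entry containing PC-relative displacements plus a line number and flags; the trap handler uses it to recover the file, line and warning flags when the ud2 faults. A rough C view of one verbose entry, read off the field comments in the asm (the authoritative struct bug_entry lives in include/asm-generic/bug.h, and the format field is only present when __BUG_ENTRY_FORMAT() emits it):

struct bug_entry_sketch {
	int		bug_addr_disp;	/* ".long 1b - ."     : ud2 address, PC-relative     */
	/* int	format_disp; */		/* ".long format - ." : only with __BUG_ENTRY_FORMAT */
	int		file_disp;	/* ".long file - ."   : source file, PC-relative     */
	unsigned short	line;		/* ".word line"       : source line number           */
	unsigned short	flags;		/* ".word flags"      : BUGFLAG_WARNING, taint, ...  */
};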
diff --git a/arch/x86/include/asm/bugs.h b/arch/x86/include/asm/bugs.h
index 08abf639075f..f25ca2d709d4 100644
--- a/arch/x86/include/asm/bugs.h
+++ b/arch/x86/include/asm/bugs.h
@@ -1,7 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_BUGS_H
#define _ASM_X86_BUGS_H
-extern void check_bugs(void);
+#include <asm/processor.h>
#if defined(CONFIG_CPU_SUP_INTEL) && defined(CONFIG_X86_32)
int ppro_with_ram_bug(void);
@@ -9,4 +10,6 @@ int ppro_with_ram_bug(void);
static inline int ppro_with_ram_bug(void) { return 0; }
#endif
+extern void cpu_bugs_smt_update(void);
+
#endif /* _ASM_X86_BUGS_H */
diff --git a/arch/x86/include/asm/cache.h b/arch/x86/include/asm/cache.h
index 5d367caa0e36..69404eae9983 100644
--- a/arch/x86/include/asm/cache.h
+++ b/arch/x86/include/asm/cache.h
@@ -1,19 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_CACHE_H
#define _ASM_X86_CACHE_H
+#include <linux/linkage.h>
+
/* L1 cache line size */
#define L1_CACHE_SHIFT (CONFIG_X86_L1_CACHE_SHIFT)
#define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT)
-#define __read_mostly __attribute__((__section__(".data.read_mostly")))
+#define __read_mostly __section(".data..read_mostly")
+
+#define INTERNODE_CACHE_SHIFT CONFIG_X86_INTERNODE_CACHE_SHIFT
+#define INTERNODE_CACHE_BYTES (1 << INTERNODE_CACHE_SHIFT)
#ifdef CONFIG_X86_VSMP
-/* vSMP Internode cacheline shift */
-#define INTERNODE_CACHE_SHIFT (12)
#ifdef CONFIG_SMP
#define __cacheline_aligned_in_smp \
- __attribute__((__aligned__(1 << (INTERNODE_CACHE_SHIFT)))) \
- __attribute__((__section__(".data.page_aligned")))
+ __attribute__((__aligned__(INTERNODE_CACHE_BYTES))) \
+ __page_aligned_data
#endif
#endif
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index e55dfc1ad453..b192d917a6d0 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -1,145 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_CACHEFLUSH_H
#define _ASM_X86_CACHEFLUSH_H
-/* Keep includes the same across arches. */
#include <linux/mm.h>
/* Caches aren't brain-dead on the intel. */
-static inline void flush_cache_all(void) { }
-static inline void flush_cache_mm(struct mm_struct *mm) { }
-static inline void flush_cache_dup_mm(struct mm_struct *mm) { }
-static inline void flush_cache_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end) { }
-static inline void flush_cache_page(struct vm_area_struct *vma,
- unsigned long vmaddr, unsigned long pfn) { }
-static inline void flush_dcache_page(struct page *page) { }
-static inline void flush_dcache_mmap_lock(struct address_space *mapping) { }
-static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { }
-static inline void flush_icache_range(unsigned long start,
- unsigned long end) { }
-static inline void flush_icache_page(struct vm_area_struct *vma,
- struct page *page) { }
-static inline void flush_icache_user_range(struct vm_area_struct *vma,
- struct page *page,
- unsigned long addr,
- unsigned long len) { }
-static inline void flush_cache_vmap(unsigned long start, unsigned long end) { }
-static inline void flush_cache_vunmap(unsigned long start,
- unsigned long end) { }
-
-static inline void copy_to_user_page(struct vm_area_struct *vma,
- struct page *page, unsigned long vaddr,
- void *dst, const void *src,
- unsigned long len)
-{
- memcpy(dst, src, len);
-}
-
-static inline void copy_from_user_page(struct vm_area_struct *vma,
- struct page *page, unsigned long vaddr,
- void *dst, const void *src,
- unsigned long len)
-{
- memcpy(dst, src, len);
-}
-
-#define PG_non_WB PG_arch_1
-PAGEFLAG(NonWB, non_WB)
-
-/*
- * The set_memory_* API can be used to change various attributes of a virtual
- * address range. The attributes include:
- * Cacheability : UnCached, WriteCombining, WriteBack
- * Executability : eXecutable, NoteXecutable
- * Read/Write : ReadOnly, ReadWrite
- * Presence : NotPresent
- *
- * Within a category, the attributes are mutually exclusive.
- *
- * The implementation of this API will take care of various aspects that
- * are associated with changing such attributes, such as:
- * - Flushing TLBs
- * - Flushing CPU caches
- * - Making sure aliases of the memory behind the mapping don't violate
- * coherency rules as defined by the CPU in the system.
- *
- * What this API does not do:
- * - Provide exclusion between various callers - including callers that
- * operate on other mappings of the same physical page
- * - Restore default attributes when a page is freed
- * - Guarantee that mappings other than the requested one are
- * in any state, other than that these do not violate rules for
- * the CPU you have. Do not depend on any effects on other mappings,
- * CPUs other than the one you have may have more relaxed rules.
- * The caller is required to take care of these.
- */
-
-int _set_memory_uc(unsigned long addr, int numpages);
-int _set_memory_wc(unsigned long addr, int numpages);
-int _set_memory_wb(unsigned long addr, int numpages);
-int set_memory_uc(unsigned long addr, int numpages);
-int set_memory_wc(unsigned long addr, int numpages);
-int set_memory_wb(unsigned long addr, int numpages);
-int set_memory_x(unsigned long addr, int numpages);
-int set_memory_nx(unsigned long addr, int numpages);
-int set_memory_ro(unsigned long addr, int numpages);
-int set_memory_rw(unsigned long addr, int numpages);
-int set_memory_np(unsigned long addr, int numpages);
-int set_memory_4k(unsigned long addr, int numpages);
-
-int set_memory_array_uc(unsigned long *addr, int addrinarray);
-int set_memory_array_wb(unsigned long *addr, int addrinarray);
-
-int set_pages_array_uc(struct page **pages, int addrinarray);
-int set_pages_array_wb(struct page **pages, int addrinarray);
-
-/*
- * For legacy compatibility with the old APIs, a few functions
- * are provided that work on a "struct page".
- * These functions operate ONLY on the 1:1 kernel mapping of the
- * memory that the struct page represents, and internally just
- * call the set_memory_* function. See the description of the
- * set_memory_* function for more details on conventions.
- *
- * These APIs should be considered *deprecated* and are likely going to
- * be removed in the future.
- * The reason for this is the implicit operation on the 1:1 mapping only,
- * making this not a generally useful API.
- *
- * Specifically, many users of the old APIs had a virtual address,
- * called virt_to_page() or vmalloc_to_page() on that address to
- * get a struct page* that the old API required.
- * To convert these cases, use set_memory_*() on the original
- * virtual address, do not use these functions.
- */
-
-int set_pages_uc(struct page *page, int numpages);
-int set_pages_wb(struct page *page, int numpages);
-int set_pages_x(struct page *page, int numpages);
-int set_pages_nx(struct page *page, int numpages);
-int set_pages_ro(struct page *page, int numpages);
-int set_pages_rw(struct page *page, int numpages);
-
+#include <asm-generic/cacheflush.h>
+#include <asm/special_insns.h>
void clflush_cache_range(void *addr, unsigned int size);
-#ifdef CONFIG_DEBUG_RODATA
-void mark_rodata_ro(void);
-extern const int rodata_test_data;
-void set_kernel_text_rw(void);
-void set_kernel_text_ro(void);
-#else
-static inline void set_kernel_text_rw(void) { }
-static inline void set_kernel_text_ro(void) { }
-#endif
-
-#ifdef CONFIG_DEBUG_RODATA_TEST
-int rodata_test(void);
-#else
-static inline int rodata_test(void)
-{
- return 0;
-}
-#endif
-
#endif /* _ASM_X86_CACHEFLUSH_H */
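
After this change the header keeps only clflush_cache_range() beside the asm-generic include. As a rough userspace analogue (not the kernel implementation), flushing a buffer line by line with the CLFLUSH intrinsic looks like the sketch below; the 64-byte line size is an assumption for illustration, whereas the kernel derives the real value at boot.

    #include <stddef.h>
    #include <stdint.h>
    #include <immintrin.h>   /* _mm_clflush, _mm_mfence */

    /* Illustrative userspace analogue of clflush_cache_range():
     * flush every cache line overlapping [addr, addr + size).
     * LINE_SIZE is assumed to be 64 bytes here for the example.
     */
    #define LINE_SIZE 64

    static void clflush_range(void *addr, size_t size)
    {
            uintptr_t p   = (uintptr_t)addr & ~(uintptr_t)(LINE_SIZE - 1);
            uintptr_t end = (uintptr_t)addr + size;

            _mm_mfence();                    /* order prior stores before flushing */
            for (; p < end; p += LINE_SIZE)
                    _mm_clflush((void *)p);
            _mm_mfence();                    /* make the flushes globally visible */
    }
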
diff --git a/arch/x86/include/asm/cacheinfo.h b/arch/x86/include/asm/cacheinfo.h
new file mode 100644
index 000000000000..5aa061199866
--- /dev/null
+++ b/arch/x86/include/asm/cacheinfo.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_CACHEINFO_H
+#define _ASM_X86_CACHEINFO_H
+
+/* Kernel controls MTRR and/or PAT MSRs. */
+extern unsigned int memory_caching_control;
+#define CACHE_MTRR 0x01
+#define CACHE_PAT 0x02
+
+void cache_disable(void);
+void cache_enable(void);
+void set_cache_aps_delayed_init(bool val);
+bool get_cache_aps_delayed_init(void);
+void cache_bp_init(void);
+void cache_bp_restore(void);
+void cache_aps_init(void);
+
+#endif /* _ASM_X86_CACHEINFO_H */
diff --git a/arch/x86/include/asm/calgary.h b/arch/x86/include/asm/calgary.h
deleted file mode 100644
index b03bedb62aa7..000000000000
--- a/arch/x86/include/asm/calgary.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Derived from include/asm-powerpc/iommu.h
- *
- * Copyright IBM Corporation, 2006-2007
- *
- * Author: Jon Mason <jdmason@us.ibm.com>
- * Author: Muli Ben-Yehuda <muli@il.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#ifndef _ASM_X86_CALGARY_H
-#define _ASM_X86_CALGARY_H
-
-#include <linux/spinlock.h>
-#include <linux/device.h>
-#include <linux/dma-mapping.h>
-#include <linux/timer.h>
-#include <asm/types.h>
-
-struct iommu_table {
- struct cal_chipset_ops *chip_ops; /* chipset specific funcs */
- unsigned long it_base; /* mapped address of tce table */
- unsigned long it_hint; /* Hint for next alloc */
- unsigned long *it_map; /* A simple allocation bitmap for now */
- void __iomem *bbar; /* Bridge BAR */
- u64 tar_val; /* Table Address Register */
- struct timer_list watchdog_timer;
- spinlock_t it_lock; /* Protects it_map */
- unsigned int it_size; /* Size of iommu table in entries */
- unsigned char it_busno; /* Bus number this table belongs to */
-};
-
-struct cal_chipset_ops {
- void (*handle_quirks)(struct iommu_table *tbl, struct pci_dev *dev);
- void (*tce_cache_blast)(struct iommu_table *tbl);
- void (*dump_error_regs)(struct iommu_table *tbl);
-};
-
-#define TCE_TABLE_SIZE_UNSPECIFIED ~0
-#define TCE_TABLE_SIZE_64K 0
-#define TCE_TABLE_SIZE_128K 1
-#define TCE_TABLE_SIZE_256K 2
-#define TCE_TABLE_SIZE_512K 3
-#define TCE_TABLE_SIZE_1M 4
-#define TCE_TABLE_SIZE_2M 5
-#define TCE_TABLE_SIZE_4M 6
-#define TCE_TABLE_SIZE_8M 7
-
-extern int use_calgary;
-
-#ifdef CONFIG_CALGARY_IOMMU
-extern int calgary_iommu_init(void);
-extern void detect_calgary(void);
-#else
-static inline int calgary_iommu_init(void) { return 1; }
-static inline void detect_calgary(void) { return; }
-#endif
-
-#endif /* _ASM_X86_CALGARY_H */
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
deleted file mode 100644
index 0e63c9a2a8d0..000000000000
--- a/arch/x86/include/asm/calling.h
+++ /dev/null
@@ -1,220 +0,0 @@
-/*
-
- x86 function call convention, 64-bit:
- -------------------------------------
- arguments | callee-saved | extra caller-saved | return
- [callee-clobbered] | | [callee-clobbered] |
- ---------------------------------------------------------------------------
- rdi rsi rdx rcx r8-9 | rbx rbp [*] r12-15 | r10-11 | rax, rdx [**]
-
- ( rsp is obviously invariant across normal function calls. (gcc can 'merge'
- functions when it sees tail-call optimization possibilities) rflags is
- clobbered. Leftover arguments are passed over the stack frame.)
-
- [*] In the frame-pointers case rbp is fixed to the stack frame.
-
- [**] for struct return values wider than 64 bits the return convention is a
- bit more complex: up to 128 bits width we return small structures
- straight in rax, rdx. For structures larger than that (3 words or
- larger) the caller puts a pointer to an on-stack return struct
- [allocated in the caller's stack frame] into the first argument - i.e.
- into rdi. All other arguments shift up by one in this case.
- Fortunately this case is rare in the kernel.
-
-For 32-bit we have the following conventions - kernel is built with
--mregparm=3 and -freg-struct-return:
-
- x86 function calling convention, 32-bit:
- ----------------------------------------
- arguments | callee-saved | extra caller-saved | return
- [callee-clobbered] | | [callee-clobbered] |
- -------------------------------------------------------------------------
- eax edx ecx | ebx edi esi ebp [*] | <none> | eax, edx [**]
-
- ( here too esp is obviously invariant across normal function calls. eflags
- is clobbered. Leftover arguments are passed over the stack frame. )
-
- [*] In the frame-pointers case ebp is fixed to the stack frame.
-
- [**] We build with -freg-struct-return, which on 32-bit means similar
- semantics as on 64-bit: edx can be used for a second return value
- (i.e. covering integer and structure sizes up to 64 bits) - after that
- it gets more complex and more expensive: 3-word or larger struct returns
- get done in the caller's frame and the pointer to the return struct goes
- into regparm0, i.e. eax - the other arguments shift up and the
- function's register parameters degenerate to regparm=2 in essence.
-
-*/
-
-
-/*
- * 64-bit system call stack frame layout defines and helpers,
- * for assembly code:
- */
-
-#define R15 0
-#define R14 8
-#define R13 16
-#define R12 24
-#define RBP 32
-#define RBX 40
-
-/* arguments: interrupts/non tracing syscalls only save up to here: */
-#define R11 48
-#define R10 56
-#define R9 64
-#define R8 72
-#define RAX 80
-#define RCX 88
-#define RDX 96
-#define RSI 104
-#define RDI 112
-#define ORIG_RAX 120 /* + error_code */
-/* end of arguments */
-
-/* cpu exception frame or undefined in case of fast syscall: */
-#define RIP 128
-#define CS 136
-#define EFLAGS 144
-#define RSP 152
-#define SS 160
-
-#define ARGOFFSET R11
-#define SWFRAME ORIG_RAX
-
- .macro SAVE_ARGS addskip=0, norcx=0, nor891011=0
- subq $9*8+\addskip, %rsp
- CFI_ADJUST_CFA_OFFSET 9*8+\addskip
- movq %rdi, 8*8(%rsp)
- CFI_REL_OFFSET rdi, 8*8
- movq %rsi, 7*8(%rsp)
- CFI_REL_OFFSET rsi, 7*8
- movq %rdx, 6*8(%rsp)
- CFI_REL_OFFSET rdx, 6*8
- .if \norcx
- .else
- movq %rcx, 5*8(%rsp)
- CFI_REL_OFFSET rcx, 5*8
- .endif
- movq %rax, 4*8(%rsp)
- CFI_REL_OFFSET rax, 4*8
- .if \nor891011
- .else
- movq %r8, 3*8(%rsp)
- CFI_REL_OFFSET r8, 3*8
- movq %r9, 2*8(%rsp)
- CFI_REL_OFFSET r9, 2*8
- movq %r10, 1*8(%rsp)
- CFI_REL_OFFSET r10, 1*8
- movq %r11, (%rsp)
- CFI_REL_OFFSET r11, 0*8
- .endif
- .endm
-
-#define ARG_SKIP 9*8
-
- .macro RESTORE_ARGS skiprax=0, addskip=0, skiprcx=0, skipr11=0, \
- skipr8910=0, skiprdx=0
- .if \skipr11
- .else
- movq (%rsp), %r11
- CFI_RESTORE r11
- .endif
- .if \skipr8910
- .else
- movq 1*8(%rsp), %r10
- CFI_RESTORE r10
- movq 2*8(%rsp), %r9
- CFI_RESTORE r9
- movq 3*8(%rsp), %r8
- CFI_RESTORE r8
- .endif
- .if \skiprax
- .else
- movq 4*8(%rsp), %rax
- CFI_RESTORE rax
- .endif
- .if \skiprcx
- .else
- movq 5*8(%rsp), %rcx
- CFI_RESTORE rcx
- .endif
- .if \skiprdx
- .else
- movq 6*8(%rsp), %rdx
- CFI_RESTORE rdx
- .endif
- movq 7*8(%rsp), %rsi
- CFI_RESTORE rsi
- movq 8*8(%rsp), %rdi
- CFI_RESTORE rdi
- .if ARG_SKIP+\addskip > 0
- addq $ARG_SKIP+\addskip, %rsp
- CFI_ADJUST_CFA_OFFSET -(ARG_SKIP+\addskip)
- .endif
- .endm
-
- .macro LOAD_ARGS offset, skiprax=0
- movq \offset(%rsp), %r11
- movq \offset+8(%rsp), %r10
- movq \offset+16(%rsp), %r9
- movq \offset+24(%rsp), %r8
- movq \offset+40(%rsp), %rcx
- movq \offset+48(%rsp), %rdx
- movq \offset+56(%rsp), %rsi
- movq \offset+64(%rsp), %rdi
- .if \skiprax
- .else
- movq \offset+72(%rsp), %rax
- .endif
- .endm
-
-#define REST_SKIP 6*8
-
- .macro SAVE_REST
- subq $REST_SKIP, %rsp
- CFI_ADJUST_CFA_OFFSET REST_SKIP
- movq %rbx, 5*8(%rsp)
- CFI_REL_OFFSET rbx, 5*8
- movq %rbp, 4*8(%rsp)
- CFI_REL_OFFSET rbp, 4*8
- movq %r12, 3*8(%rsp)
- CFI_REL_OFFSET r12, 3*8
- movq %r13, 2*8(%rsp)
- CFI_REL_OFFSET r13, 2*8
- movq %r14, 1*8(%rsp)
- CFI_REL_OFFSET r14, 1*8
- movq %r15, (%rsp)
- CFI_REL_OFFSET r15, 0*8
- .endm
-
- .macro RESTORE_REST
- movq (%rsp), %r15
- CFI_RESTORE r15
- movq 1*8(%rsp), %r14
- CFI_RESTORE r14
- movq 2*8(%rsp), %r13
- CFI_RESTORE r13
- movq 3*8(%rsp), %r12
- CFI_RESTORE r12
- movq 4*8(%rsp), %rbp
- CFI_RESTORE rbp
- movq 5*8(%rsp), %rbx
- CFI_RESTORE rbx
- addq $REST_SKIP, %rsp
- CFI_ADJUST_CFA_OFFSET -(REST_SKIP)
- .endm
-
- .macro SAVE_ALL
- SAVE_ARGS
- SAVE_REST
- .endm
-
- .macro RESTORE_ALL addskip=0
- RESTORE_REST
- RESTORE_ARGS 0, \addskip
- .endm
-
- .macro icebp
- .byte 0xf1
- .endm
diff --git a/arch/x86/include/asm/ce4100.h b/arch/x86/include/asm/ce4100.h
new file mode 100644
index 000000000000..e1f965bb1e31
--- /dev/null
+++ b/arch/x86/include/asm/ce4100.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_CE4100_H_
+#define _ASM_CE4100_H_
+
+int ce4100_pci_init(void);
+
+#ifdef CONFIG_SERIAL_8250
+void __init sdv_serial_fixup(void);
+#else
+static inline void sdv_serial_fixup(void) {};
+#endif
+
+#endif
diff --git a/arch/x86/include/asm/cfi.h b/arch/x86/include/asm/cfi.h
new file mode 100644
index 000000000000..c40b9ebc1fb4
--- /dev/null
+++ b/arch/x86/include/asm/cfi.h
@@ -0,0 +1,164 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_CFI_H
+#define _ASM_X86_CFI_H
+
+/*
+ * Clang Control Flow Integrity (CFI) support.
+ *
+ * Copyright (C) 2022 Google LLC
+ */
+#include <linux/bug.h>
+#include <asm/ibt.h>
+
+/*
+ * An overview of the various calling conventions...
+ *
+ * Traditional:
+ *
+ * foo:
+ * ... code here ...
+ * ret
+ *
+ * direct caller:
+ * call foo
+ *
+ * indirect caller:
+ * lea foo(%rip), %r11
+ * ...
+ * call *%r11
+ *
+ *
+ * IBT:
+ *
+ * foo:
+ * endbr64
+ * ... code here ...
+ * ret
+ *
+ * direct caller:
+ * call foo / call foo+4
+ *
+ * indirect caller:
+ * lea foo(%rip), %r11
+ * ...
+ * call *%r11
+ *
+ *
+ * kCFI:
+ *
+ * __cfi_foo:
+ * movl $0x12345678, %eax
+ * # 11 nops when CONFIG_CALL_PADDING
+ * foo:
+ * endbr64 # when IBT
+ * ... code here ...
+ * ret
+ *
+ * direct call:
+ * call foo # / call foo+4 when IBT
+ *
+ * indirect call:
+ * lea foo(%rip), %r11
+ * ...
+ * movl $(-0x12345678), %r10d
+ * addl -4(%r11), %r10d # -15 when CONFIG_CALL_PADDING
+ * jz 1f
+ * ud2
+ * 1:call *%r11
+ *
+ *
+ * FineIBT (builds as kCFI + CALL_PADDING + IBT + RETPOLINE, and is patched at runtime into):
+ *
+ * __cfi_foo:
+ * endbr64
+ * subl 0x12345678, %eax
+ * jne.32,pn foo+3
+ * foo:
+ * nopl -42(%rax) # was endbr64
+ * ... code here ...
+ * ret
+ *
+ * direct caller:
+ * call foo / call foo+4
+ *
+ * indirect caller:
+ * lea foo(%rip), %r11
+ * ...
+ * movl $0x12345678, %eax
+ * lea -0x10(%r11), %r11
+ * nop5
+ * call *%r11
+ *
+ */
+enum cfi_mode {
+ CFI_AUTO, /* FineIBT if hardware has IBT, otherwise kCFI */
+ CFI_OFF, /* Traditional / IBT depending on .config */
+ CFI_KCFI, /* Optionally CALL_PADDING, IBT, RETPOLINE */
+ CFI_FINEIBT, /* see arch/x86/kernel/alternative.c */
+};
+
+extern enum cfi_mode cfi_mode;
+
+#ifdef CONFIG_FINEIBT_BHI
+extern bool cfi_bhi;
+#else
+#define cfi_bhi (0)
+#endif
+
+typedef u8 bhi_thunk[32];
+extern bhi_thunk __bhi_args[];
+extern bhi_thunk __bhi_args_end[];
+
+struct pt_regs;
+
+#ifdef CONFIG_CFI
+enum bug_trap_type handle_cfi_failure(struct pt_regs *regs);
+#define __bpfcall
+
+static inline int cfi_get_offset(void)
+{
+ switch (cfi_mode) {
+ case CFI_FINEIBT:
+ return 16;
+ case CFI_KCFI:
+ if (IS_ENABLED(CONFIG_CALL_PADDING))
+ return 16;
+ return 5;
+ default:
+ return 0;
+ }
+}
+#define cfi_get_offset cfi_get_offset
+
+extern u32 cfi_get_func_hash(void *func);
+#define cfi_get_func_hash cfi_get_func_hash
+
+extern int cfi_get_func_arity(void *func);
+
+#ifdef CONFIG_FINEIBT
+extern bool decode_fineibt_insn(struct pt_regs *regs, unsigned long *target, u32 *type);
+#else
+static inline bool
+decode_fineibt_insn(struct pt_regs *regs, unsigned long *target, u32 *type)
+{
+ return false;
+}
+
+#endif
+
+#else
+static inline enum bug_trap_type handle_cfi_failure(struct pt_regs *regs)
+{
+ return BUG_TRAP_TYPE_NONE;
+}
+static inline int cfi_get_func_arity(void *func)
+{
+ return 0;
+}
+#endif /* CONFIG_CFI */
+
+#if HAS_KERNEL_IBT == 1
+#define CFI_NOSEAL(x) asm(IBT_NOSEAL(__stringify(x)))
+#endif
+
+#endif /* _ASM_X86_CFI_H */
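
The kCFI scheme documented in the comment above stores a 32-bit type hash in the "movl $hash, %eax" immediate just before the function entry, and the indirect caller checks it before jumping. A rough userspace model of that check is sketched below; the preamble buffer, hash value and offsets follow the comment (4 bytes back, or 15 with CONFIG_CALL_PADDING) and nothing here inspects real kernel text.

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    /* Model of the kCFI caller-side check: read the 32-bit immediate stored
     * just before the entry point and compare it with the hash the caller
     * expects for the function's type.  A mismatch corresponds to the ud2 trap.
     */
    static int kcfi_check(const uint8_t *func, uint32_t expected_hash, int call_padding)
    {
            int off = call_padding ? 15 : 4;   /* immediate sits at func - off */
            uint32_t stored;

            memcpy(&stored, func - off, sizeof(stored));
            return stored == expected_hash;
    }

    int main(void)
    {
            /* Fake preamble + entry: movl $0x12345678, %eax ; <entry> */
            uint8_t blob[16] = { 0xb8, 0x78, 0x56, 0x34, 0x12 };
            const uint8_t *entry = blob + 5;   /* no CALL_PADDING in this model */

            printf("match: %d\n", kcfi_check(entry, 0x12345678, 0));
            return 0;
    }
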
diff --git a/arch/x86/include/asm/checksum.h b/arch/x86/include/asm/checksum.h
index 848850fd7d62..6df6ece8a28e 100644
--- a/arch/x86/include/asm/checksum.h
+++ b/arch/x86/include/asm/checksum.h
@@ -1,5 +1,13 @@
-#ifdef CONFIG_X86_32
-# include "checksum_32.h"
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifdef CONFIG_GENERIC_CSUM
+# include <asm-generic/checksum.h>
#else
-# include "checksum_64.h"
+# define _HAVE_ARCH_COPY_AND_CSUM_FROM_USER 1
+# define HAVE_CSUM_COPY_USER
+# define _HAVE_ARCH_CSUM_AND_COPY
+# ifdef CONFIG_X86_32
+# include <asm/checksum_32.h>
+# else
+# include <asm/checksum_64.h>
+# endif
#endif
diff --git a/arch/x86/include/asm/checksum_32.h b/arch/x86/include/asm/checksum_32.h
index 7c5ef8b14d92..17da95387997 100644
--- a/arch/x86/include/asm/checksum_32.h
+++ b/arch/x86/include/asm/checksum_32.h
@@ -1,9 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_CHECKSUM_32_H
#define _ASM_X86_CHECKSUM_32_H
#include <linux/in6.h>
-
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
/*
* computes the checksum of a memory block at buff, length len,
@@ -27,9 +27,7 @@ asmlinkage __wsum csum_partial(const void *buff, int len, __wsum sum);
* better 64-bit) boundary
*/
-asmlinkage __wsum csum_partial_copy_generic(const void *src, void *dst,
- int len, __wsum sum,
- int *src_err_ptr, int *dst_err_ptr);
+asmlinkage __wsum csum_partial_copy_generic(const void *src, void *dst, int len);
/*
* Note: when you get a NULL pointer exception here this means someone
@@ -38,20 +36,23 @@ asmlinkage __wsum csum_partial_copy_generic(const void *src, void *dst,
* If you use these functions directly please don't forget the
* access_ok().
*/
-static inline __wsum csum_partial_copy_nocheck(const void *src, void *dst,
- int len, __wsum sum)
+static inline __wsum csum_partial_copy_nocheck(const void *src, void *dst, int len)
{
- return csum_partial_copy_generic(src, dst, len, sum, NULL, NULL);
+ return csum_partial_copy_generic(src, dst, len);
}
-static inline __wsum csum_partial_copy_from_user(const void __user *src,
- void *dst,
- int len, __wsum sum,
- int *err_ptr)
+static inline __wsum csum_and_copy_from_user(const void __user *src,
+ void *dst, int len)
{
+ __wsum ret;
+
might_sleep();
- return csum_partial_copy_generic((__force void *)src, dst,
- len, sum, err_ptr, NULL);
+ if (!user_access_begin(src, len))
+ return 0;
+ ret = csum_partial_copy_generic((__force void *)src, dst, len);
+ user_access_end();
+
+ return ret;
}
/*
@@ -106,8 +107,7 @@ static inline __sum16 csum_fold(__wsum sum)
}
static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
- unsigned short len,
- unsigned short proto,
+ __u32 len, __u8 proto,
__wsum sum)
{
asm("addl %1, %0 ;\n"
@@ -125,8 +125,7 @@ static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
* returns a 16-bit checksum, already complemented
*/
static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
- unsigned short len,
- unsigned short proto,
+ __u32 len, __u8 proto,
__wsum sum)
{
return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
@@ -145,8 +144,7 @@ static inline __sum16 ip_compute_csum(const void *buff, int len)
#define _HAVE_ARCH_IPV6_CSUM
static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
const struct in6_addr *daddr,
- __u32 len, unsigned short proto,
- __wsum sum)
+ __u32 len, __u8 proto, __wsum sum)
{
asm("addl 0(%1), %0 ;\n"
"adcl 4(%1), %0 ;\n"
@@ -161,7 +159,8 @@ static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
"adcl $0, %0 ;\n"
: "=&r" (sum)
: "r" (saddr), "r" (daddr),
- "r" (htonl(len)), "r" (htonl(proto)), "0" (sum));
+ "r" (htonl(len)), "r" (htonl(proto)), "0" (sum)
+ : "memory");
return csum_fold(sum);
}
@@ -169,21 +168,19 @@ static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
/*
* Copy and checksum to user
*/
-#define HAVE_CSUM_COPY_USER
static inline __wsum csum_and_copy_to_user(const void *src,
void __user *dst,
- int len, __wsum sum,
- int *err_ptr)
+ int len)
{
- might_sleep();
- if (access_ok(VERIFY_WRITE, dst, len))
- return csum_partial_copy_generic(src, (__force void *)dst,
- len, sum, NULL, err_ptr);
+ __wsum ret;
- if (len)
- *err_ptr = -EFAULT;
+ might_sleep();
+ if (!user_access_begin(dst, len))
+ return 0;
- return (__force __wsum)-1; /* invalid checksum */
+ ret = csum_partial_copy_generic(src, (__force void *)dst, len);
+ user_access_end();
+ return ret;
}
#endif /* _ASM_X86_CHECKSUM_32_H */
diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h
index 9bfdc41629ec..4d4a47a3a8ab 100644
--- a/arch/x86/include/asm/checksum_64.h
+++ b/arch/x86/include/asm/checksum_64.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_CHECKSUM_64_H
#define _ASM_X86_CHECKSUM_64_H
@@ -8,7 +9,6 @@
*/
#include <linux/compiler.h>
-#include <asm/uaccess.h>
#include <asm/byteorder.h>
/**
@@ -84,8 +84,8 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
* 32bit unfolded.
*/
static inline __wsum
-csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len,
- unsigned short proto, __wsum sum)
+csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len,
+ __u8 proto, __wsum sum)
{
asm(" addl %1, %0\n"
" adcl %2, %0\n"
@@ -110,8 +110,8 @@ csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len,
* complemented and ready to be filled in.
*/
static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
- unsigned short len,
- unsigned short proto, __wsum sum)
+ __u32 len, __u8 proto,
+ __wsum sum)
{
return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
}
@@ -128,26 +128,12 @@ static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
*/
extern __wsum csum_partial(const void *buff, int len, __wsum sum);
-#define _HAVE_ARCH_COPY_AND_CSUM_FROM_USER 1
-#define HAVE_CSUM_COPY_USER 1
-
-
/* Do not call this directly. Use the wrappers below */
-extern __wsum csum_partial_copy_generic(const void *src, const void *dst,
- int len, __wsum sum,
- int *src_err_ptr, int *dst_err_ptr);
-
-
-extern __wsum csum_partial_copy_from_user(const void __user *src, void *dst,
- int len, __wsum isum, int *errp);
-extern __wsum csum_partial_copy_to_user(const void *src, void __user *dst,
- int len, __wsum isum, int *errp);
-extern __wsum csum_partial_copy_nocheck(const void *src, void *dst,
- int len, __wsum sum);
+extern __visible __wsum csum_partial_copy_generic(const void *src, void *dst, int len);
-/* Old names. To be removed. */
-#define csum_and_copy_to_user csum_partial_copy_to_user
-#define csum_and_copy_from_user csum_partial_copy_from_user
+extern __wsum csum_and_copy_from_user(const void __user *src, void *dst, int len);
+extern __wsum csum_and_copy_to_user(const void *src, void __user *dst, int len);
+extern __wsum csum_partial_copy_nocheck(const void *src, void *dst, int len);
/**
* ip_compute_csum - Compute an 16bit IP checksum.
@@ -177,15 +163,22 @@ struct in6_addr;
#define _HAVE_ARCH_IPV6_CSUM 1
extern __sum16
csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr,
- __u32 len, unsigned short proto, __wsum sum);
+ __u32 len, __u8 proto, __wsum sum);
static inline unsigned add32_with_carry(unsigned a, unsigned b)
{
asm("addl %2,%0\n\t"
"adcl $0,%0"
: "=r" (a)
- : "0" (a), "r" (b));
+ : "0" (a), "rm" (b));
return a;
}
+#define HAVE_ARCH_CSUM_ADD
+static inline __wsum csum_add(__wsum csum, __wsum addend)
+{
+ return (__force __wsum)add32_with_carry((__force unsigned)csum,
+ (__force unsigned)addend);
+}
+
#endif /* _ASM_X86_CHECKSUM_64_H */
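
The add32_with_carry()/csum_add() and csum_fold() primitives above implement the end-around-carry arithmetic of the Internet checksum: carries out of bit 31 are folded back in, and the 32-bit partial sum is finally reduced to an inverted 16-bit value. A portable sketch of the same arithmetic, without the inline asm, is shown below for illustration.

    #include <stdint.h>
    #include <stdio.h>

    /* add32_with_carry(): 32-bit add with the carry fed back in. */
    static uint32_t add32_with_carry(uint32_t a, uint32_t b)
    {
            uint64_t sum = (uint64_t)a + b;
            return (uint32_t)sum + (uint32_t)(sum >> 32);
    }

    /* csum_fold(): reduce the 32-bit partial sum to the final 16-bit checksum. */
    static uint16_t csum_fold(uint32_t sum)
    {
            sum = (sum & 0xffff) + (sum >> 16);   /* fold high half into low half */
            sum = (sum & 0xffff) + (sum >> 16);   /* second fold catches the carry */
            return (uint16_t)~sum;
    }

    int main(void)
    {
            uint32_t partial = add32_with_carry(0x45000034, 0x40067ccd);
            printf("checksum: 0x%04x\n", csum_fold(partial));
            return 0;
    }
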
diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h
new file mode 100644
index 000000000000..dc9dc7b3911a
--- /dev/null
+++ b/arch/x86/include/asm/clocksource.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* x86-specific clocksource additions */
+
+#ifndef _ASM_X86_CLOCKSOURCE_H
+#define _ASM_X86_CLOCKSOURCE_H
+
+#include <asm/vdso/clocksource.h>
+
+extern unsigned int vclocks_used;
+
+static inline bool vclock_was_used(int vclock)
+{
+ return READ_ONCE(vclocks_used) & (1U << vclock);
+}
+
+static inline void vclocks_set_used(unsigned int which)
+{
+ WRITE_ONCE(vclocks_used, READ_ONCE(vclocks_used) | (1 << which));
+}
+
+#endif /* _ASM_X86_CLOCKSOURCE_H */
diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h
new file mode 100644
index 000000000000..6cbd9ae58b21
--- /dev/null
+++ b/arch/x86/include/asm/cmdline.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_CMDLINE_H
+#define _ASM_X86_CMDLINE_H
+
+#include <asm/setup.h>
+
+extern char builtin_cmdline[COMMAND_LINE_SIZE];
+
+int cmdline_find_option_bool(const char *cmdline_ptr, const char *option);
+int cmdline_find_option(const char *cmdline_ptr, const char *option,
+ char *buffer, int bufsize);
+
+#endif /* _ASM_X86_CMDLINE_H */
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index a460fa088d4c..a88b06f1c35e 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -1,5 +1,244 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ASM_X86_CMPXCHG_H
+#define ASM_X86_CMPXCHG_H
+
+#include <linux/compiler.h>
+#include <asm/cpufeatures.h>
+#include <asm/alternative.h> /* Provides LOCK_PREFIX */
+
+/*
+ * Non-existent functions to indicate usage errors at link time
+ * (or compile-time if the compiler implements __compiletime_error()).
+ */
+extern void __xchg_wrong_size(void)
+ __compiletime_error("Bad argument size for xchg");
+extern void __cmpxchg_wrong_size(void)
+ __compiletime_error("Bad argument size for cmpxchg");
+extern void __xadd_wrong_size(void)
+ __compiletime_error("Bad argument size for xadd");
+extern void __add_wrong_size(void)
+ __compiletime_error("Bad argument size for add");
+
+/*
+ * Constants for operation sizes. On 32-bit, the 64-bit size is set to
+ * -1 because sizeof will never return -1, thereby making those switch
+ * case statements guaranteed dead code which the compiler will
+ * eliminate, and allowing the "missing symbol in the default case" to
+ * indicate a usage error.
+ */
+#define __X86_CASE_B 1
+#define __X86_CASE_W 2
+#define __X86_CASE_L 4
+#ifdef CONFIG_64BIT
+#define __X86_CASE_Q 8
+#else
+#define __X86_CASE_Q -1 /* sizeof will never return -1 */
+#endif
+
+/*
+ * An exchange-type operation, which takes a value and a pointer, and
+ * returns the old value.
+ */
+#define __xchg_op(ptr, arg, op, lock) \
+ ({ \
+ __typeof__ (*(ptr)) __ret = (arg); \
+ switch (sizeof(*(ptr))) { \
+ case __X86_CASE_B: \
+ asm_inline volatile (lock #op "b %b0, %1" \
+ : "+q" (__ret), "+m" (*(ptr)) \
+ : : "memory", "cc"); \
+ break; \
+ case __X86_CASE_W: \
+ asm_inline volatile (lock #op "w %w0, %1" \
+ : "+r" (__ret), "+m" (*(ptr)) \
+ : : "memory", "cc"); \
+ break; \
+ case __X86_CASE_L: \
+ asm_inline volatile (lock #op "l %0, %1" \
+ : "+r" (__ret), "+m" (*(ptr)) \
+ : : "memory", "cc"); \
+ break; \
+ case __X86_CASE_Q: \
+ asm_inline volatile (lock #op "q %q0, %1" \
+ : "+r" (__ret), "+m" (*(ptr)) \
+ : : "memory", "cc"); \
+ break; \
+ default: \
+ __ ## op ## _wrong_size(); \
+ } \
+ __ret; \
+ })
+
+/*
+ * Note: no "lock" prefix even on SMP: xchg always implies lock anyway.
+ * Since this is generally used to protect other memory information, we
+ * use "asm volatile" and "memory" clobbers to prevent gcc from moving
+ * information around.
+ */
+#define arch_xchg(ptr, v) __xchg_op((ptr), (v), xchg, "")
+
+/*
+ * Atomic compare and exchange. Compare OLD with MEM, if identical,
+ * store NEW in MEM. Return the initial value in MEM. Success is
+ * indicated by comparing RETURN with OLD.
+ */
+#define __raw_cmpxchg(ptr, old, new, size, lock) \
+({ \
+ __typeof__(*(ptr)) __ret; \
+ __typeof__(*(ptr)) __old = (old); \
+ __typeof__(*(ptr)) __new = (new); \
+ switch (size) { \
+ case __X86_CASE_B: \
+ { \
+ volatile u8 *__ptr = (volatile u8 *)(ptr); \
+ asm_inline volatile(lock "cmpxchgb %2, %1" \
+ : "=a" (__ret), "+m" (*__ptr) \
+ : "q" (__new), "0" (__old) \
+ : "memory"); \
+ break; \
+ } \
+ case __X86_CASE_W: \
+ { \
+ volatile u16 *__ptr = (volatile u16 *)(ptr); \
+ asm_inline volatile(lock "cmpxchgw %2, %1" \
+ : "=a" (__ret), "+m" (*__ptr) \
+ : "r" (__new), "0" (__old) \
+ : "memory"); \
+ break; \
+ } \
+ case __X86_CASE_L: \
+ { \
+ volatile u32 *__ptr = (volatile u32 *)(ptr); \
+ asm_inline volatile(lock "cmpxchgl %2, %1" \
+ : "=a" (__ret), "+m" (*__ptr) \
+ : "r" (__new), "0" (__old) \
+ : "memory"); \
+ break; \
+ } \
+ case __X86_CASE_Q: \
+ { \
+ volatile u64 *__ptr = (volatile u64 *)(ptr); \
+ asm_inline volatile(lock "cmpxchgq %2, %1" \
+ : "=a" (__ret), "+m" (*__ptr) \
+ : "r" (__new), "0" (__old) \
+ : "memory"); \
+ break; \
+ } \
+ default: \
+ __cmpxchg_wrong_size(); \
+ } \
+ __ret; \
+})
+
+#define __cmpxchg(ptr, old, new, size) \
+ __raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX)
+
+#define __sync_cmpxchg(ptr, old, new, size) \
+ __raw_cmpxchg((ptr), (old), (new), (size), "lock ")
+
+#define __cmpxchg_local(ptr, old, new, size) \
+ __raw_cmpxchg((ptr), (old), (new), (size), "")
+
#ifdef CONFIG_X86_32
-# include "cmpxchg_32.h"
+# include <asm/cmpxchg_32.h>
#else
-# include "cmpxchg_64.h"
+# include <asm/cmpxchg_64.h>
#endif
+
+#define arch_cmpxchg(ptr, old, new) \
+ __cmpxchg(ptr, old, new, sizeof(*(ptr)))
+
+#define arch_sync_cmpxchg(ptr, old, new) \
+ __sync_cmpxchg(ptr, old, new, sizeof(*(ptr)))
+
+#define arch_cmpxchg_local(ptr, old, new) \
+ __cmpxchg_local(ptr, old, new, sizeof(*(ptr)))
+
+
+#define __raw_try_cmpxchg(_ptr, _pold, _new, size, lock) \
+({ \
+ bool success; \
+ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \
+ __typeof__(*(_ptr)) __old = *_old; \
+ __typeof__(*(_ptr)) __new = (_new); \
+ switch (size) { \
+ case __X86_CASE_B: \
+ { \
+ volatile u8 *__ptr = (volatile u8 *)(_ptr); \
+ asm_inline volatile(lock "cmpxchgb %[new], %[ptr]" \
+ : "=@ccz" (success), \
+ [ptr] "+m" (*__ptr), \
+ [old] "+a" (__old) \
+ : [new] "q" (__new) \
+ : "memory"); \
+ break; \
+ } \
+ case __X86_CASE_W: \
+ { \
+ volatile u16 *__ptr = (volatile u16 *)(_ptr); \
+ asm_inline volatile(lock "cmpxchgw %[new], %[ptr]" \
+ : "=@ccz" (success), \
+ [ptr] "+m" (*__ptr), \
+ [old] "+a" (__old) \
+ : [new] "r" (__new) \
+ : "memory"); \
+ break; \
+ } \
+ case __X86_CASE_L: \
+ { \
+ volatile u32 *__ptr = (volatile u32 *)(_ptr); \
+ asm_inline volatile(lock "cmpxchgl %[new], %[ptr]" \
+ : "=@ccz" (success), \
+ [ptr] "+m" (*__ptr), \
+ [old] "+a" (__old) \
+ : [new] "r" (__new) \
+ : "memory"); \
+ break; \
+ } \
+ case __X86_CASE_Q: \
+ { \
+ volatile u64 *__ptr = (volatile u64 *)(_ptr); \
+ asm_inline volatile(lock "cmpxchgq %[new], %[ptr]" \
+ : "=@ccz" (success), \
+ [ptr] "+m" (*__ptr), \
+ [old] "+a" (__old) \
+ : [new] "r" (__new) \
+ : "memory"); \
+ break; \
+ } \
+ default: \
+ __cmpxchg_wrong_size(); \
+ } \
+ if (unlikely(!success)) \
+ *_old = __old; \
+ likely(success); \
+})
+
+#define __try_cmpxchg(ptr, pold, new, size) \
+ __raw_try_cmpxchg((ptr), (pold), (new), (size), LOCK_PREFIX)
+
+#define __sync_try_cmpxchg(ptr, pold, new, size) \
+ __raw_try_cmpxchg((ptr), (pold), (new), (size), "lock ")
+
+#define __try_cmpxchg_local(ptr, pold, new, size) \
+ __raw_try_cmpxchg((ptr), (pold), (new), (size), "")
+
+#define arch_try_cmpxchg(ptr, pold, new) \
+ __try_cmpxchg((ptr), (pold), (new), sizeof(*(ptr)))
+
+#define arch_sync_try_cmpxchg(ptr, pold, new) \
+ __sync_try_cmpxchg((ptr), (pold), (new), sizeof(*(ptr)))
+
+#define arch_try_cmpxchg_local(ptr, pold, new) \
+ __try_cmpxchg_local((ptr), (pold), (new), sizeof(*(ptr)))
+
+/*
+ * xadd() adds "inc" to "*ptr" and atomically returns the previous
+ * value of "*ptr".
+ *
+ * xadd() is locked when multiple CPUs are online
+ */
+#define __xadd(ptr, inc, lock) __xchg_op((ptr), (inc), xadd, lock)
+#define xadd(ptr, inc) __xadd((ptr), (inc), LOCK_PREFIX)
+
+#endif /* ASM_X86_CMPXCHG_H */
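
The arch_try_cmpxchg() family returns a boolean and, on failure, writes the value it observed back into *old, which is what makes the usual update-retry loop cheap: no extra reload is needed between attempts. The sketch below shows that pattern in portable userspace C, with the GCC/Clang __atomic builtin standing in for arch_try_cmpxchg(); it is illustrative, not the kernel macro.

    #include <stdio.h>

    static unsigned long counter;

    /* try_cmpxchg-style retry loop: read once, then let a failed
     * compare-exchange refresh "old" for the next attempt.
     */
    static unsigned long add_return(unsigned long *ptr, unsigned long inc)
    {
            unsigned long old = __atomic_load_n(ptr, __ATOMIC_RELAXED);
            unsigned long new;

            do {
                    new = old + inc;
                    /* On failure, "old" is updated with the current value of *ptr. */
            } while (!__atomic_compare_exchange_n(ptr, &old, new, 0,
                                                  __ATOMIC_SEQ_CST, __ATOMIC_RELAXED));
            return new;
    }

    int main(void)
    {
            printf("%lu\n", add_return(&counter, 5));   /* prints 5 */
            return 0;
    }
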
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index 82ceb788a981..1f80a62be969 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -1,344 +1,155 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_CMPXCHG_32_H
#define _ASM_X86_CMPXCHG_32_H
-#include <linux/bitops.h> /* for LOCK_PREFIX */
-
/*
- * Note: if you use set64_bit(), __cmpxchg64(), or their variants, you
+ * Note: if you use __cmpxchg64(), or their variants,
* you need to test for the feature in boot_cpu_data.
*/
-#define xchg(ptr, v) \
- ((__typeof__(*(ptr)))__xchg((unsigned long)(v), (ptr), sizeof(*(ptr))))
-
-struct __xchg_dummy {
- unsigned long a[100];
+union __u64_halves {
+ u64 full;
+ struct {
+ u32 low, high;
+ };
};
-#define __xg(x) ((struct __xchg_dummy *)(x))
-/*
- * The semantics of XCHGCMP8B are a bit strange, this is why
- * there is a loop and the loading of %%eax and %%edx has to
- * be inside. This inlines well in most cases, the cached
- * cost is around ~38 cycles. (in the future we might want
- * to do an SIMD/3DNOW!/MMX/FPU 64-bit store here, but that
- * might have an implicit FPU-save as a cost, so it's not
- * clear which path to go.)
- *
- * cmpxchg8b must be used with the lock prefix here to allow
- * the instruction to be executed atomically, see page 3-102
- * of the instruction set reference 24319102.pdf. We need
- * the reader side to see the coherent 64bit value.
- */
-static inline void __set_64bit(unsigned long long *ptr,
- unsigned int low, unsigned int high)
+#define __arch_cmpxchg64(_ptr, _old, _new, _lock) \
+({ \
+ union __u64_halves o = { .full = (_old), }, \
+ n = { .full = (_new), }; \
+ \
+ asm_inline volatile(_lock "cmpxchg8b %[ptr]" \
+ : [ptr] "+m" (*(_ptr)), \
+ "+a" (o.low), "+d" (o.high) \
+ : "b" (n.low), "c" (n.high) \
+ : "memory"); \
+ \
+ o.full; \
+})
+
+
+static __always_inline u64 __cmpxchg64(volatile u64 *ptr, u64 old, u64 new)
{
- asm volatile("\n1:\t"
- "movl (%0), %%eax\n\t"
- "movl 4(%0), %%edx\n\t"
- LOCK_PREFIX "cmpxchg8b (%0)\n\t"
- "jnz 1b"
- : /* no outputs */
- : "D"(ptr),
- "b"(low),
- "c"(high)
- : "ax", "dx", "memory");
+ return __arch_cmpxchg64(ptr, old, new, LOCK_PREFIX);
}
-static inline void __set_64bit_constant(unsigned long long *ptr,
- unsigned long long value)
+static __always_inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new)
{
- __set_64bit(ptr, (unsigned int)value, (unsigned int)(value >> 32));
+ return __arch_cmpxchg64(ptr, old, new,);
}
-#define ll_low(x) *(((unsigned int *)&(x)) + 0)
-#define ll_high(x) *(((unsigned int *)&(x)) + 1)
+#define __arch_try_cmpxchg64(_ptr, _oldp, _new, _lock) \
+({ \
+ union __u64_halves o = { .full = *(_oldp), }, \
+ n = { .full = (_new), }; \
+ bool ret; \
+ \
+ asm_inline volatile(_lock "cmpxchg8b %[ptr]" \
+ : "=@ccz" (ret), \
+ [ptr] "+m" (*(_ptr)), \
+ "+a" (o.low), "+d" (o.high) \
+ : "b" (n.low), "c" (n.high) \
+ : "memory"); \
+ \
+ if (unlikely(!ret)) \
+ *(_oldp) = o.full; \
+ \
+ likely(ret); \
+})
-static inline void __set_64bit_var(unsigned long long *ptr,
- unsigned long long value)
+static __always_inline bool __try_cmpxchg64(volatile u64 *ptr, u64 *oldp, u64 new)
{
- __set_64bit(ptr, ll_low(value), ll_high(value));
+ return __arch_try_cmpxchg64(ptr, oldp, new, LOCK_PREFIX);
}
-#define set_64bit(ptr, value) \
- (__builtin_constant_p((value)) \
- ? __set_64bit_constant((ptr), (value)) \
- : __set_64bit_var((ptr), (value)))
-
-#define _set_64bit(ptr, value) \
- (__builtin_constant_p(value) \
- ? __set_64bit(ptr, (unsigned int)(value), \
- (unsigned int)((value) >> 32)) \
- : __set_64bit(ptr, ll_low((value)), ll_high((value))))
-
-/*
- * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
- * Note 2: xchg has side effect, so that attribute volatile is necessary,
- * but generally the primitive is invalid, *ptr is output argument. --ANK
- */
-static inline unsigned long __xchg(unsigned long x, volatile void *ptr,
- int size)
+static __always_inline bool __try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp, u64 new)
{
- switch (size) {
- case 1:
- asm volatile("xchgb %b0,%1"
- : "=q" (x)
- : "m" (*__xg(ptr)), "0" (x)
- : "memory");
- break;
- case 2:
- asm volatile("xchgw %w0,%1"
- : "=r" (x)
- : "m" (*__xg(ptr)), "0" (x)
- : "memory");
- break;
- case 4:
- asm volatile("xchgl %0,%1"
- : "=r" (x)
- : "m" (*__xg(ptr)), "0" (x)
- : "memory");
- break;
- }
- return x;
+ return __arch_try_cmpxchg64(ptr, oldp, new,);
}
-/*
- * Atomic compare and exchange. Compare OLD with MEM, if identical,
- * store NEW in MEM. Return the initial value in MEM. Success is
- * indicated by comparing RETURN with OLD.
- */
+#ifdef CONFIG_X86_CX8
-#ifdef CONFIG_X86_CMPXCHG
-#define __HAVE_ARCH_CMPXCHG 1
-#define cmpxchg(ptr, o, n) \
- ((__typeof__(*(ptr)))__cmpxchg((ptr), (unsigned long)(o), \
- (unsigned long)(n), \
- sizeof(*(ptr))))
-#define sync_cmpxchg(ptr, o, n) \
- ((__typeof__(*(ptr)))__sync_cmpxchg((ptr), (unsigned long)(o), \
- (unsigned long)(n), \
- sizeof(*(ptr))))
-#define cmpxchg_local(ptr, o, n) \
- ((__typeof__(*(ptr)))__cmpxchg_local((ptr), (unsigned long)(o), \
- (unsigned long)(n), \
- sizeof(*(ptr))))
-#endif
+#define arch_cmpxchg64 __cmpxchg64
-#ifdef CONFIG_X86_CMPXCHG64
-#define cmpxchg64(ptr, o, n) \
- ((__typeof__(*(ptr)))__cmpxchg64((ptr), (unsigned long long)(o), \
- (unsigned long long)(n)))
-#define cmpxchg64_local(ptr, o, n) \
- ((__typeof__(*(ptr)))__cmpxchg64_local((ptr), (unsigned long long)(o), \
- (unsigned long long)(n)))
-#endif
+#define arch_cmpxchg64_local __cmpxchg64_local
-static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
- unsigned long new, int size)
-{
- unsigned long prev;
- switch (size) {
- case 1:
- asm volatile(LOCK_PREFIX "cmpxchgb %b1,%2"
- : "=a"(prev)
- : "q"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 2:
- asm volatile(LOCK_PREFIX "cmpxchgw %w1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 4:
- asm volatile(LOCK_PREFIX "cmpxchgl %1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- }
- return old;
-}
+#define arch_try_cmpxchg64 __try_cmpxchg64
+
+#define arch_try_cmpxchg64_local __try_cmpxchg64_local
+
+#else
/*
- * Always use locked operations when touching memory shared with a
- * hypervisor, since the system may be SMP even if the guest kernel
- * isn't.
+ * Building a kernel capable of running on 80386 and 80486. It may be necessary
+ * to simulate the cmpxchg8b on the 80386 and 80486 CPUs.
*/
-static inline unsigned long __sync_cmpxchg(volatile void *ptr,
- unsigned long old,
- unsigned long new, int size)
-{
- unsigned long prev;
- switch (size) {
- case 1:
- asm volatile("lock; cmpxchgb %b1,%2"
- : "=a"(prev)
- : "q"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 2:
- asm volatile("lock; cmpxchgw %w1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 4:
- asm volatile("lock; cmpxchgl %1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- }
- return old;
-}
-static inline unsigned long __cmpxchg_local(volatile void *ptr,
- unsigned long old,
- unsigned long new, int size)
-{
- unsigned long prev;
- switch (size) {
- case 1:
- asm volatile("cmpxchgb %b1,%2"
- : "=a"(prev)
- : "q"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 2:
- asm volatile("cmpxchgw %w1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 4:
- asm volatile("cmpxchgl %1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- }
- return old;
-}
+#define __arch_cmpxchg64_emu(_ptr, _old, _new, _lock_loc, _lock) \
+({ \
+ union __u64_halves o = { .full = (_old), }, \
+ n = { .full = (_new), }; \
+ \
+ asm_inline volatile( \
+ ALTERNATIVE(_lock_loc \
+ "call cmpxchg8b_emu", \
+ _lock "cmpxchg8b %a[ptr]", X86_FEATURE_CX8) \
+ : ALT_OUTPUT_SP("+a" (o.low), "+d" (o.high)) \
+ : "b" (n.low), "c" (n.high), \
+ [ptr] "S" (_ptr) \
+ : "memory"); \
+ \
+ o.full; \
+})
-static inline unsigned long long __cmpxchg64(volatile void *ptr,
- unsigned long long old,
- unsigned long long new)
+static __always_inline u64 arch_cmpxchg64(volatile u64 *ptr, u64 old, u64 new)
{
- unsigned long long prev;
- asm volatile(LOCK_PREFIX "cmpxchg8b %3"
- : "=A"(prev)
- : "b"((unsigned long)new),
- "c"((unsigned long)(new >> 32)),
- "m"(*__xg(ptr)),
- "0"(old)
- : "memory");
- return prev;
+ return __arch_cmpxchg64_emu(ptr, old, new, LOCK_PREFIX_HERE, "lock ");
}
+#define arch_cmpxchg64 arch_cmpxchg64
-static inline unsigned long long __cmpxchg64_local(volatile void *ptr,
- unsigned long long old,
- unsigned long long new)
+static __always_inline u64 arch_cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new)
{
- unsigned long long prev;
- asm volatile("cmpxchg8b %3"
- : "=A"(prev)
- : "b"((unsigned long)new),
- "c"((unsigned long)(new >> 32)),
- "m"(*__xg(ptr)),
- "0"(old)
- : "memory");
- return prev;
+ return __arch_cmpxchg64_emu(ptr, old, new, ,);
}
+#define arch_cmpxchg64_local arch_cmpxchg64_local
-#ifndef CONFIG_X86_CMPXCHG
-/*
- * Building a kernel capable running on 80386. It may be necessary to
- * simulate the cmpxchg on the 80386 CPU. For that purpose we define
- * a function for each of the sizes we support.
- */
+#define __arch_try_cmpxchg64_emu(_ptr, _oldp, _new, _lock_loc, _lock) \
+({ \
+ union __u64_halves o = { .full = *(_oldp), }, \
+ n = { .full = (_new), }; \
+ bool ret; \
+ \
+ asm_inline volatile( \
+ ALTERNATIVE(_lock_loc \
+ "call cmpxchg8b_emu", \
+ _lock "cmpxchg8b %a[ptr]", X86_FEATURE_CX8) \
+ : ALT_OUTPUT_SP("=@ccz" (ret), \
+ "+a" (o.low), "+d" (o.high)) \
+ : "b" (n.low), "c" (n.high), \
+ [ptr] "S" (_ptr) \
+ : "memory"); \
+ \
+ if (unlikely(!ret)) \
+ *(_oldp) = o.full; \
+ \
+ likely(ret); \
+})
-extern unsigned long cmpxchg_386_u8(volatile void *, u8, u8);
-extern unsigned long cmpxchg_386_u16(volatile void *, u16, u16);
-extern unsigned long cmpxchg_386_u32(volatile void *, u32, u32);
+static __always_inline bool arch_try_cmpxchg64(volatile u64 *ptr, u64 *oldp, u64 new)
+{
+ return __arch_try_cmpxchg64_emu(ptr, oldp, new, LOCK_PREFIX_HERE, "lock ");
+}
+#define arch_try_cmpxchg64 arch_try_cmpxchg64
-static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
- unsigned long new, int size)
+static __always_inline bool arch_try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp, u64 new)
{
- switch (size) {
- case 1:
- return cmpxchg_386_u8(ptr, old, new);
- case 2:
- return cmpxchg_386_u16(ptr, old, new);
- case 4:
- return cmpxchg_386_u32(ptr, old, new);
- }
- return old;
+ return __arch_try_cmpxchg64_emu(ptr, oldp, new, ,);
}
+#define arch_try_cmpxchg64_local arch_try_cmpxchg64_local
-#define cmpxchg(ptr, o, n) \
-({ \
- __typeof__(*(ptr)) __ret; \
- if (likely(boot_cpu_data.x86 > 3)) \
- __ret = (__typeof__(*(ptr)))__cmpxchg((ptr), \
- (unsigned long)(o), (unsigned long)(n), \
- sizeof(*(ptr))); \
- else \
- __ret = (__typeof__(*(ptr)))cmpxchg_386((ptr), \
- (unsigned long)(o), (unsigned long)(n), \
- sizeof(*(ptr))); \
- __ret; \
-})
-#define cmpxchg_local(ptr, o, n) \
-({ \
- __typeof__(*(ptr)) __ret; \
- if (likely(boot_cpu_data.x86 > 3)) \
- __ret = (__typeof__(*(ptr)))__cmpxchg_local((ptr), \
- (unsigned long)(o), (unsigned long)(n), \
- sizeof(*(ptr))); \
- else \
- __ret = (__typeof__(*(ptr)))cmpxchg_386((ptr), \
- (unsigned long)(o), (unsigned long)(n), \
- sizeof(*(ptr))); \
- __ret; \
-})
#endif
-#ifndef CONFIG_X86_CMPXCHG64
-/*
- * Building a kernel capable running on 80386 and 80486. It may be necessary
- * to simulate the cmpxchg8b on the 80386 and 80486 CPU.
- */
-
-extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64);
-
-#define cmpxchg64(ptr, o, n) \
-({ \
- __typeof__(*(ptr)) __ret; \
- if (likely(boot_cpu_data.x86 > 4)) \
- __ret = (__typeof__(*(ptr)))__cmpxchg64((ptr), \
- (unsigned long long)(o), \
- (unsigned long long)(n)); \
- else \
- __ret = (__typeof__(*(ptr)))cmpxchg_486_u64((ptr), \
- (unsigned long long)(o), \
- (unsigned long long)(n)); \
- __ret; \
-})
-#define cmpxchg64_local(ptr, o, n) \
-({ \
- __typeof__(*(ptr)) __ret; \
- if (likely(boot_cpu_data.x86 > 4)) \
- __ret = (__typeof__(*(ptr)))__cmpxchg64_local((ptr), \
- (unsigned long long)(o), \
- (unsigned long long)(n)); \
- else \
- __ret = (__typeof__(*(ptr)))cmpxchg_486_u64((ptr), \
- (unsigned long long)(o), \
- (unsigned long long)(n)); \
- __ret; \
-})
-
-#endif
+#define system_has_cmpxchg64() boot_cpu_has(X86_FEATURE_CX8)
#endif /* _ASM_X86_CMPXCHG_32_H */
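
cmpxchg8b moves the 64-bit old and new values through the edx:eax and ecx:ebx register pairs, which is why the rewritten header splits u64 operands with the __u64_halves union; when CX8 is absent the ALTERNATIVE falls back to cmpxchg8b_emu. The sketch below illustrates the halves split and a software fallback of the same shape, with a mutex standing in for the kernel's interrupt-off emulation; it is a userspace illustration only.

    #include <stdint.h>
    #include <stdio.h>
    #include <pthread.h>

    union u64_halves {
            uint64_t full;
            struct { uint32_t low, high; };   /* little-endian layout */
    };

    static pthread_mutex_t emu_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Rough analogue of the cmpxchg8b_emu fallback: compare and swap a
     * 64-bit value under a lock instead of the missing CX8 instruction.
     */
    static uint64_t cmpxchg64_emu(volatile uint64_t *ptr, uint64_t old, uint64_t new)
    {
            uint64_t cur;

            pthread_mutex_lock(&emu_lock);
            cur = *ptr;
            if (cur == old)
                    *ptr = new;
            pthread_mutex_unlock(&emu_lock);
            return cur;                       /* caller compares against "old" */
    }

    int main(void)
    {
            union u64_halves v = { .full = 0x1122334455667788ULL };
            volatile uint64_t word = 1;

            printf("low=%#x high=%#x\n", v.low, v.high);
            printf("prev=%llu now=%llu\n",
                   (unsigned long long)cmpxchg64_emu(&word, 1, 2),
                   (unsigned long long)word);
            return 0;
    }
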
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index 52de72e0de8c..5afea056fb20 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -1,185 +1,95 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_CMPXCHG_64_H
#define _ASM_X86_CMPXCHG_64_H
-#include <asm/alternative.h> /* Provides LOCK_PREFIX */
+#define arch_cmpxchg64(ptr, o, n) \
+({ \
+ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
+ arch_cmpxchg((ptr), (o), (n)); \
+})
-#define xchg(ptr, v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v), \
- (ptr), sizeof(*(ptr))))
+#define arch_cmpxchg64_local(ptr, o, n) \
+({ \
+ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
+ arch_cmpxchg_local((ptr), (o), (n)); \
+})
-#define __xg(x) ((volatile long *)(x))
+#define arch_try_cmpxchg64(ptr, po, n) \
+({ \
+ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
+ arch_try_cmpxchg((ptr), (po), (n)); \
+})
-static inline void set_64bit(volatile unsigned long *ptr, unsigned long val)
-{
- *ptr = val;
-}
+#define arch_try_cmpxchg64_local(ptr, po, n) \
+({ \
+ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
+ arch_try_cmpxchg_local((ptr), (po), (n)); \
+})
-#define _set_64bit set_64bit
+union __u128_halves {
+ u128 full;
+ struct {
+ u64 low, high;
+ };
+};
-/*
- * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
- * Note 2: xchg has side effect, so that attribute volatile is necessary,
- * but generally the primitive is invalid, *ptr is output argument. --ANK
- */
-static inline unsigned long __xchg(unsigned long x, volatile void *ptr,
- int size)
+#define __arch_cmpxchg128(_ptr, _old, _new, _lock) \
+({ \
+ union __u128_halves o = { .full = (_old), }, \
+ n = { .full = (_new), }; \
+ \
+ asm_inline volatile(_lock "cmpxchg16b %[ptr]" \
+ : [ptr] "+m" (*(_ptr)), \
+ "+a" (o.low), "+d" (o.high) \
+ : "b" (n.low), "c" (n.high) \
+ : "memory"); \
+ \
+ o.full; \
+})
+
+static __always_inline u128 arch_cmpxchg128(volatile u128 *ptr, u128 old, u128 new)
{
- switch (size) {
- case 1:
- asm volatile("xchgb %b0,%1"
- : "=q" (x)
- : "m" (*__xg(ptr)), "0" (x)
- : "memory");
- break;
- case 2:
- asm volatile("xchgw %w0,%1"
- : "=r" (x)
- : "m" (*__xg(ptr)), "0" (x)
- : "memory");
- break;
- case 4:
- asm volatile("xchgl %k0,%1"
- : "=r" (x)
- : "m" (*__xg(ptr)), "0" (x)
- : "memory");
- break;
- case 8:
- asm volatile("xchgq %0,%1"
- : "=r" (x)
- : "m" (*__xg(ptr)), "0" (x)
- : "memory");
- break;
- }
- return x;
+ return __arch_cmpxchg128(ptr, old, new, LOCK_PREFIX);
}
+#define arch_cmpxchg128 arch_cmpxchg128
-/*
- * Atomic compare and exchange. Compare OLD with MEM, if identical,
- * store NEW in MEM. Return the initial value in MEM. Success is
- * indicated by comparing RETURN with OLD.
- */
-
-#define __HAVE_ARCH_CMPXCHG 1
-
-static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
- unsigned long new, int size)
+static __always_inline u128 arch_cmpxchg128_local(volatile u128 *ptr, u128 old, u128 new)
{
- unsigned long prev;
- switch (size) {
- case 1:
- asm volatile(LOCK_PREFIX "cmpxchgb %b1,%2"
- : "=a"(prev)
- : "q"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 2:
- asm volatile(LOCK_PREFIX "cmpxchgw %w1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 4:
- asm volatile(LOCK_PREFIX "cmpxchgl %k1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 8:
- asm volatile(LOCK_PREFIX "cmpxchgq %1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- }
- return old;
+ return __arch_cmpxchg128(ptr, old, new,);
}
+#define arch_cmpxchg128_local arch_cmpxchg128_local
+
+#define __arch_try_cmpxchg128(_ptr, _oldp, _new, _lock) \
+({ \
+ union __u128_halves o = { .full = *(_oldp), }, \
+ n = { .full = (_new), }; \
+ bool ret; \
+ \
+ asm_inline volatile(_lock "cmpxchg16b %[ptr]" \
+ : "=@ccz" (ret), \
+ [ptr] "+m" (*(_ptr)), \
+ "+a" (o.low), "+d" (o.high) \
+ : "b" (n.low), "c" (n.high) \
+ : "memory"); \
+ \
+ if (unlikely(!ret)) \
+ *(_oldp) = o.full; \
+ \
+ likely(ret); \
+})
-/*
- * Always use locked operations when touching memory shared with a
- * hypervisor, since the system may be SMP even if the guest kernel
- * isn't.
- */
-static inline unsigned long __sync_cmpxchg(volatile void *ptr,
- unsigned long old,
- unsigned long new, int size)
+static __always_inline bool arch_try_cmpxchg128(volatile u128 *ptr, u128 *oldp, u128 new)
{
- unsigned long prev;
- switch (size) {
- case 1:
- asm volatile("lock; cmpxchgb %b1,%2"
- : "=a"(prev)
- : "q"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 2:
- asm volatile("lock; cmpxchgw %w1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 4:
- asm volatile("lock; cmpxchgl %1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- }
- return old;
+ return __arch_try_cmpxchg128(ptr, oldp, new, LOCK_PREFIX);
}
+#define arch_try_cmpxchg128 arch_try_cmpxchg128
-static inline unsigned long __cmpxchg_local(volatile void *ptr,
- unsigned long old,
- unsigned long new, int size)
+static __always_inline bool arch_try_cmpxchg128_local(volatile u128 *ptr, u128 *oldp, u128 new)
{
- unsigned long prev;
- switch (size) {
- case 1:
- asm volatile("cmpxchgb %b1,%2"
- : "=a"(prev)
- : "q"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 2:
- asm volatile("cmpxchgw %w1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 4:
- asm volatile("cmpxchgl %k1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 8:
- asm volatile("cmpxchgq %1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- }
- return old;
+ return __arch_try_cmpxchg128(ptr, oldp, new,);
}
+#define arch_try_cmpxchg128_local arch_try_cmpxchg128_local
-#define cmpxchg(ptr, o, n) \
- ((__typeof__(*(ptr)))__cmpxchg((ptr), (unsigned long)(o), \
- (unsigned long)(n), sizeof(*(ptr))))
-#define cmpxchg64(ptr, o, n) \
-({ \
- BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
- cmpxchg((ptr), (o), (n)); \
-})
-#define cmpxchg_local(ptr, o, n) \
- ((__typeof__(*(ptr)))__cmpxchg_local((ptr), (unsigned long)(o), \
- (unsigned long)(n), \
- sizeof(*(ptr))))
-#define sync_cmpxchg(ptr, o, n) \
- ((__typeof__(*(ptr)))__sync_cmpxchg((ptr), (unsigned long)(o), \
- (unsigned long)(n), \
- sizeof(*(ptr))))
-#define cmpxchg64_local(ptr, o, n) \
-({ \
- BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
- cmpxchg_local((ptr), (o), (n)); \
-})
+#define system_has_cmpxchg128() boot_cpu_has(X86_FEATURE_CX16)
#endif /* _ASM_X86_CMPXCHG_64_H */
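
A typical use of the 128-bit try_cmpxchg is updating a pointer together with a tag or sequence word in one shot, so a recycled pointer value (the ABA problem) is rejected by the mismatching tag. The sketch below shows that pattern with GCC's unsigned __int128 and the __atomic builtin as a stand-in for arch_try_cmpxchg128(); it assumes a target where a 16-byte compare-exchange is available (x86-64 with -mcx16, possibly plus libatomic).

    #include <stdint.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* {pointer, sequence} pair updated atomically as one 128-bit word. */
    union tagged_ptr {
            unsigned __int128 full;
            struct {
                    uint64_t ptr;   /* low 8 bytes  */
                    uint64_t seq;   /* high 8 bytes */
            };
    };

    static bool set_head(volatile unsigned __int128 *head, uint64_t new_ptr)
    {
            union tagged_ptr old = { .full = __atomic_load_n(head, __ATOMIC_RELAXED) };
            union tagged_ptr new;

            do {
                    new.ptr = new_ptr;
                    new.seq = old.seq + 1;   /* bump the tag on every update */
            } while (!__atomic_compare_exchange_n(head, &old.full, new.full, false,
                                                  __ATOMIC_SEQ_CST, __ATOMIC_RELAXED));
            return true;
    }

    int main(void)
    {
            static volatile unsigned __int128 head;   /* naturally 16-byte aligned */

            set_head(&head, 0xdeadbeef);
            printf("seq=%llu\n", (unsigned long long)(uint64_t)(head >> 64));
            return 0;
    }
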
diff --git a/arch/x86/include/asm/coco.h b/arch/x86/include/asm/coco.h
new file mode 100644
index 000000000000..e1dbf8df1b69
--- /dev/null
+++ b/arch/x86/include/asm/coco.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_COCO_H
+#define _ASM_X86_COCO_H
+
+#include <asm/asm.h>
+#include <asm/types.h>
+
+enum cc_vendor {
+ CC_VENDOR_NONE,
+ CC_VENDOR_AMD,
+ CC_VENDOR_INTEL,
+};
+
+#ifdef CONFIG_ARCH_HAS_CC_PLATFORM
+extern enum cc_vendor cc_vendor;
+extern u64 cc_mask;
+
+static inline u64 cc_get_mask(void)
+{
+ return cc_mask;
+}
+
+static inline void cc_set_mask(u64 mask)
+{
+ cc_mask = mask;
+}
+
+u64 cc_mkenc(u64 val);
+u64 cc_mkdec(u64 val);
+void cc_random_init(void);
+#else
+#define cc_vendor (CC_VENDOR_NONE)
+static inline u64 cc_get_mask(void)
+{
+ return 0;
+}
+
+static inline u64 cc_mkenc(u64 val)
+{
+ return val;
+}
+
+static inline u64 cc_mkdec(u64 val)
+{
+ return val;
+}
+static inline void cc_random_init(void) { }
+#endif
+
+#endif /* _ASM_X86_COCO_H */
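
cc_mkenc() and cc_mkdec() apply cc_mask to a page-table-entry value, with the polarity depending on the vendor: for AMD the mask marks an encrypted (private) page and is OR-ed in, while for Intel TDX the mask is a shared bit, so a private mapping clears it. The sketch below shows that shape only; the vendor choice and the bit position are invented for illustration, not real SME/TDX values.

    #include <stdint.h>

    enum cc_vendor { CC_VENDOR_NONE, CC_VENDOR_AMD, CC_VENDOR_INTEL };

    static enum cc_vendor cc_vendor = CC_VENDOR_AMD;   /* assumption for the example */
    static uint64_t cc_mask = 1ULL << 51;              /* made-up bit position */

    /* Encode: mark a PTE value as encrypted/private. */
    static uint64_t cc_mkenc(uint64_t val)
    {
            switch (cc_vendor) {
            case CC_VENDOR_AMD:
                    return val | cc_mask;      /* C-bit set => encrypted */
            case CC_VENDOR_INTEL:
                    return val & ~cc_mask;     /* shared bit clear => private */
            default:
                    return val;
            }
    }

    /* Decode: mark a PTE value as decrypted/shared. */
    static uint64_t cc_mkdec(uint64_t val)
    {
            switch (cc_vendor) {
            case CC_VENDOR_AMD:
                    return val & ~cc_mask;
            case CC_VENDOR_INTEL:
                    return val | cc_mask;
            default:
                    return val;
            }
    }
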
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index 9a9c7bdc923d..b1221da477b7 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_COMPAT_H
#define _ASM_X86_COMPAT_H
@@ -6,59 +7,40 @@
*/
#include <linux/types.h>
#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <asm/processor.h>
#include <asm/user32.h>
+#include <asm/unistd.h>
-#define COMPAT_USER_HZ 100
+#define compat_mode_t compat_mode_t
+typedef u16 compat_mode_t;
-typedef u32 compat_size_t;
-typedef s32 compat_ssize_t;
-typedef s32 compat_time_t;
-typedef s32 compat_clock_t;
-typedef s32 compat_pid_t;
+#define __compat_uid_t __compat_uid_t
typedef u16 __compat_uid_t;
typedef u16 __compat_gid_t;
-typedef u32 __compat_uid32_t;
-typedef u32 __compat_gid32_t;
-typedef u16 compat_mode_t;
-typedef u32 compat_ino_t;
+
+#define compat_dev_t compat_dev_t
typedef u16 compat_dev_t;
-typedef s32 compat_off_t;
-typedef s64 compat_loff_t;
-typedef u16 compat_nlink_t;
-typedef u16 compat_ipc_pid_t;
-typedef s32 compat_daddr_t;
-typedef u32 compat_caddr_t;
-typedef __kernel_fsid_t compat_fsid_t;
-typedef s32 compat_timer_t;
-typedef s32 compat_key_t;
-
-typedef s32 compat_int_t;
-typedef s32 compat_long_t;
-typedef s64 __attribute__((aligned(4))) compat_s64;
-typedef u32 compat_uint_t;
-typedef u32 compat_ulong_t;
-typedef u64 __attribute__((aligned(4))) compat_u64;
-
-struct compat_timespec {
- compat_time_t tv_sec;
- s32 tv_nsec;
-};
-struct compat_timeval {
- compat_time_t tv_sec;
- s32 tv_usec;
-};
+#define compat_ipc_pid_t compat_ipc_pid_t
+typedef u16 compat_ipc_pid_t;
+
+#define compat_statfs compat_statfs
+
+#include <asm-generic/compat.h>
+
+#define COMPAT_UTS_MACHINE "i686\0\0"
+
+typedef u16 compat_nlink_t;
struct compat_stat {
- compat_dev_t st_dev;
- u16 __pad1;
+ u32 st_dev;
compat_ino_t st_ino;
compat_mode_t st_mode;
compat_nlink_t st_nlink;
__compat_uid_t st_uid;
__compat_gid_t st_gid;
- compat_dev_t st_rdev;
- u16 __pad2;
+ u32 st_rdev;
u32 st_size;
u32 st_blksize;
u32 st_blocks;
@@ -72,29 +54,11 @@ struct compat_stat {
u32 __unused5;
};
-struct compat_flock {
- short l_type;
- short l_whence;
- compat_off_t l_start;
- compat_off_t l_len;
- compat_pid_t l_pid;
-};
-
-#define F_GETLK64 12 /* using 'struct flock64' */
-#define F_SETLK64 13
-#define F_SETLKW64 14
-
/*
- * IA32 uses 4 byte alignment for 64 bit quantities,
- * so we need to pack this structure.
+ * IA32 uses 4 byte alignment for 64 bit quantities, so we need to pack the
+ * compat flock64 structure.
*/
-struct compat_flock64 {
- short l_type;
- short l_whence;
- compat_loff_t l_start;
- compat_loff_t l_len;
- compat_pid_t l_pid;
-} __attribute__((packed));
+#define __ARCH_NEED_COMPAT_FLOCK64_PACKED
struct compat_statfs {
int f_type;
@@ -107,112 +71,44 @@ struct compat_statfs {
compat_fsid_t f_fsid;
int f_namelen; /* SunOS ignores this field. */
int f_frsize;
- int f_spare[5];
+ int f_flags;
+ int f_spare[4];
};
-#define COMPAT_RLIM_OLD_INFINITY 0x7fffffff
-#define COMPAT_RLIM_INFINITY 0xffffffff
-
-typedef u32 compat_old_sigset_t; /* at least 32 bits */
-
-#define _COMPAT_NSIG 64
-#define _COMPAT_NSIG_BPW 32
-
-typedef u32 compat_sigset_word;
-
-#define COMPAT_OFF_T_MAX 0x7fffffff
-#define COMPAT_LOFF_T_MAX 0x7fffffffffffffffL
+#ifdef CONFIG_X86_X32_ABI
+#define COMPAT_USE_64BIT_TIME \
+ (!!(task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT))
+#endif
-struct compat_ipc64_perm {
- compat_key_t key;
- __compat_uid32_t uid;
- __compat_gid32_t gid;
- __compat_uid32_t cuid;
- __compat_gid32_t cgid;
- unsigned short mode;
- unsigned short __pad1;
- unsigned short seq;
- unsigned short __pad2;
- compat_ulong_t unused1;
- compat_ulong_t unused2;
-};
-
-struct compat_semid64_ds {
- struct compat_ipc64_perm sem_perm;
- compat_time_t sem_otime;
- compat_ulong_t __unused1;
- compat_time_t sem_ctime;
- compat_ulong_t __unused2;
- compat_ulong_t sem_nsems;
- compat_ulong_t __unused3;
- compat_ulong_t __unused4;
-};
-
-struct compat_msqid64_ds {
- struct compat_ipc64_perm msg_perm;
- compat_time_t msg_stime;
- compat_ulong_t __unused1;
- compat_time_t msg_rtime;
- compat_ulong_t __unused2;
- compat_time_t msg_ctime;
- compat_ulong_t __unused3;
- compat_ulong_t msg_cbytes;
- compat_ulong_t msg_qnum;
- compat_ulong_t msg_qbytes;
- compat_pid_t msg_lspid;
- compat_pid_t msg_lrpid;
- compat_ulong_t __unused4;
- compat_ulong_t __unused5;
-};
-
-struct compat_shmid64_ds {
- struct compat_ipc64_perm shm_perm;
- compat_size_t shm_segsz;
- compat_time_t shm_atime;
- compat_ulong_t __unused1;
- compat_time_t shm_dtime;
- compat_ulong_t __unused2;
- compat_time_t shm_ctime;
- compat_ulong_t __unused3;
- compat_pid_t shm_cpid;
- compat_pid_t shm_lpid;
- compat_ulong_t shm_nattch;
- compat_ulong_t __unused4;
- compat_ulong_t __unused5;
-};
-
-/*
- * The type of struct elf_prstatus.pr_reg in compatible core dumps.
- */
-typedef struct user_regs_struct32 compat_elf_gregset_t;
-
-/*
- * A pointer passed in from user mode. This should not
- * be used for syscall parameters, just declare them
- * as pointers because the syscall entry code will have
- * appropriately converted them already.
- */
-typedef u32 compat_uptr_t;
-
-static inline void __user *compat_ptr(compat_uptr_t uptr)
+static inline bool in_x32_syscall(void)
{
- return (void __user *)(unsigned long)uptr;
+#ifdef CONFIG_X86_X32_ABI
+ if (task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT)
+ return true;
+#endif
+ return false;
}
-static inline compat_uptr_t ptr_to_compat(void __user *uptr)
+static inline bool in_32bit_syscall(void)
{
- return (u32)(unsigned long)uptr;
+ return in_ia32_syscall() || in_x32_syscall();
}
-static inline void __user *compat_alloc_user_space(long len)
+#ifdef CONFIG_COMPAT
+static inline bool in_compat_syscall(void)
{
- struct pt_regs *regs = task_pt_regs(current);
- return (void __user *)regs->sp - len;
+ return in_32bit_syscall();
}
+#define in_compat_syscall in_compat_syscall /* override the generic impl */
+#define compat_need_64bit_alignment_fixup in_ia32_syscall
+#endif
-static inline int is_compat_task(void)
-{
- return current_thread_info()->status & TS_COMPAT;
-}
+struct compat_siginfo;
+
+#ifdef CONFIG_X86_X32_ABI
+int copy_siginfo_to_user32(struct compat_siginfo __user *to,
+ const kernel_siginfo_t *from);
+#define copy_siginfo_to_user32 copy_siginfo_to_user32
+#endif /* CONFIG_X86_X32_ABI */
#endif /* _ASM_X86_COMPAT_H */
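Illustration (not part of the patch): the compat helpers reworked above are what ABI-sensitive code branches on. A minimal sketch, assuming a hypothetical handler that only needs to pick a structure layout, could look like this; handle_compat()/handle_native() are hypothetical stand-ins.

	/* Illustrative sketch only: dispatch on the calling ABI using the
	 * helpers defined above. in_compat_syscall() covers both ia32 and
	 * x32 callers on x86_64. */
	static long dispatch_by_abi(void __user *argp)
	{
		if (in_compat_syscall())		/* ia32 or x32 caller */
			return handle_compat(argp);	/* hypothetical */
		return handle_native(argp);		/* native 64-bit caller, hypothetical */
	}
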
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index b185091bf19c..ad235dda1ded 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_CPU_H
#define _ASM_X86_CPU_H
@@ -6,32 +7,64 @@
#include <linux/topology.h>
#include <linux/nodemask.h>
#include <linux/percpu.h>
+#include <asm/ibt.h>
-#ifdef CONFIG_SMP
+#ifndef CONFIG_SMP
+#define cpu_physical_id(cpu) boot_cpu_physical_apicid
+#define cpu_acpi_id(cpu) 0
+#endif /* CONFIG_SMP */
-extern void prefill_possible_map(void);
+#ifdef CONFIG_HOTPLUG_CPU
+extern void soft_restart_cpu(void);
+#endif
-#else /* CONFIG_SMP */
+extern void ap_init_aperfmperf(void);
-static inline void prefill_possible_map(void) {}
+int mwait_usable(const struct cpuinfo_x86 *);
-#define cpu_physical_id(cpu) boot_cpu_physical_apicid
-#define safe_smp_processor_id() 0
-#define stack_smp_processor_id() 0
+unsigned int x86_family(unsigned int sig);
+unsigned int x86_model(unsigned int sig);
+unsigned int x86_stepping(unsigned int sig);
+#ifdef CONFIG_X86_BUS_LOCK_DETECT
+extern void __init sld_setup(struct cpuinfo_x86 *c);
+extern bool handle_user_split_lock(struct pt_regs *regs, long error_code);
+extern bool handle_guest_split_lock(unsigned long ip);
+extern void handle_bus_lock(struct pt_regs *regs);
+void split_lock_init(void);
+void bus_lock_init(void);
+#else
+static inline void __init sld_setup(struct cpuinfo_x86 *c) {}
+static inline bool handle_user_split_lock(struct pt_regs *regs, long error_code)
+{
+ return false;
+}
-#endif /* CONFIG_SMP */
+static inline bool handle_guest_split_lock(unsigned long ip)
+{
+ return false;
+}
-struct x86_cpu {
- struct cpu cpu;
-};
+static inline void handle_bus_lock(struct pt_regs *regs) {}
+static inline void split_lock_init(void) {}
+static inline void bus_lock_init(void) {}
+#endif
-#ifdef CONFIG_HOTPLUG_CPU
-extern int arch_register_cpu(int num);
-extern void arch_unregister_cpu(int);
+#ifdef CONFIG_IA32_FEAT_CTL
+void init_ia32_feat_ctl(struct cpuinfo_x86 *c);
+#else
+static inline void init_ia32_feat_ctl(struct cpuinfo_x86 *c) {}
#endif
-DECLARE_PER_CPU(int, cpu_state);
+extern __noendbr void cet_disable(void);
+
+struct cpu_signature;
+
+void intel_collect_cpu_info(struct cpu_signature *sig);
+
+extern u64 x86_read_arch_cap_msr(void);
+bool intel_find_matching_signature(void *mc, struct cpu_signature *sig);
+int intel_microcode_sanity_check(void *mc, bool print_err, int hdr_type);
-extern unsigned int boot_cpu_id;
+extern struct cpumask cpus_stop_mask;
#endif /* _ASM_X86_CPU_H */
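Illustration (not part of the patch): x86_family(), x86_model() and x86_stepping() declared above decode a raw CPUID leaf 1 EAX signature. The standalone user-space sketch below mirrors the conventional decoding (extended family kicks in for family 0xf, extended model for family >= 6); it is illustrative, not the kernel implementation itself.

	#include <stdio.h>

	/* Conventional CPUID signature decoding (illustrative). */
	static unsigned int fam(unsigned int sig)
	{
		unsigned int f = (sig >> 8) & 0xf;
		return (f == 0xf) ? f + ((sig >> 20) & 0xff) : f;
	}

	static unsigned int model(unsigned int sig)
	{
		unsigned int m = (sig >> 4) & 0xf;
		if (fam(sig) >= 6)
			m += ((sig >> 16) & 0xf) << 4;
		return m;
	}

	int main(void)
	{
		unsigned int sig = 0x000906ea;	/* example signature */
		printf("family %#x model %#x stepping %#x\n",
		       fam(sig), model(sig), sig & 0xf);
		return 0;	/* prints: family 0x6 model 0x9e stepping 0xa */
	}
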
diff --git a/arch/x86/include/asm/cpu_debug.h b/arch/x86/include/asm/cpu_debug.h
deleted file mode 100644
index d96c1ee3a95c..000000000000
--- a/arch/x86/include/asm/cpu_debug.h
+++ /dev/null
@@ -1,127 +0,0 @@
-#ifndef _ASM_X86_CPU_DEBUG_H
-#define _ASM_X86_CPU_DEBUG_H
-
-/*
- * CPU x86 architecture debug
- *
- * Copyright(C) 2009 Jaswinder Singh Rajput
- */
-
-/* Register flags */
-enum cpu_debug_bit {
-/* Model Specific Registers (MSRs) */
- CPU_MC_BIT, /* Machine Check */
- CPU_MONITOR_BIT, /* Monitor */
- CPU_TIME_BIT, /* Time */
- CPU_PMC_BIT, /* Performance Monitor */
- CPU_PLATFORM_BIT, /* Platform */
- CPU_APIC_BIT, /* APIC */
- CPU_POWERON_BIT, /* Power-on */
- CPU_CONTROL_BIT, /* Control */
- CPU_FEATURES_BIT, /* Features control */
- CPU_LBRANCH_BIT, /* Last Branch */
- CPU_BIOS_BIT, /* BIOS */
- CPU_FREQ_BIT, /* Frequency */
- CPU_MTTR_BIT, /* MTRR */
- CPU_PERF_BIT, /* Performance */
- CPU_CACHE_BIT, /* Cache */
- CPU_SYSENTER_BIT, /* Sysenter */
- CPU_THERM_BIT, /* Thermal */
- CPU_MISC_BIT, /* Miscellaneous */
- CPU_DEBUG_BIT, /* Debug */
- CPU_PAT_BIT, /* PAT */
- CPU_VMX_BIT, /* VMX */
- CPU_CALL_BIT, /* System Call */
- CPU_BASE_BIT, /* BASE Address */
- CPU_VER_BIT, /* Version ID */
- CPU_CONF_BIT, /* Configuration */
- CPU_SMM_BIT, /* System mgmt mode */
- CPU_SVM_BIT, /*Secure Virtual Machine*/
- CPU_OSVM_BIT, /* OS-Visible Workaround*/
-/* Standard Registers */
- CPU_TSS_BIT, /* Task Stack Segment */
- CPU_CR_BIT, /* Control Registers */
- CPU_DT_BIT, /* Descriptor Table */
-/* End of Registers flags */
- CPU_REG_ALL_BIT, /* Select all Registers */
-};
-
-#define CPU_REG_ALL (~0) /* Select all Registers */
-
-#define CPU_MC (1 << CPU_MC_BIT)
-#define CPU_MONITOR (1 << CPU_MONITOR_BIT)
-#define CPU_TIME (1 << CPU_TIME_BIT)
-#define CPU_PMC (1 << CPU_PMC_BIT)
-#define CPU_PLATFORM (1 << CPU_PLATFORM_BIT)
-#define CPU_APIC (1 << CPU_APIC_BIT)
-#define CPU_POWERON (1 << CPU_POWERON_BIT)
-#define CPU_CONTROL (1 << CPU_CONTROL_BIT)
-#define CPU_FEATURES (1 << CPU_FEATURES_BIT)
-#define CPU_LBRANCH (1 << CPU_LBRANCH_BIT)
-#define CPU_BIOS (1 << CPU_BIOS_BIT)
-#define CPU_FREQ (1 << CPU_FREQ_BIT)
-#define CPU_MTRR (1 << CPU_MTTR_BIT)
-#define CPU_PERF (1 << CPU_PERF_BIT)
-#define CPU_CACHE (1 << CPU_CACHE_BIT)
-#define CPU_SYSENTER (1 << CPU_SYSENTER_BIT)
-#define CPU_THERM (1 << CPU_THERM_BIT)
-#define CPU_MISC (1 << CPU_MISC_BIT)
-#define CPU_DEBUG (1 << CPU_DEBUG_BIT)
-#define CPU_PAT (1 << CPU_PAT_BIT)
-#define CPU_VMX (1 << CPU_VMX_BIT)
-#define CPU_CALL (1 << CPU_CALL_BIT)
-#define CPU_BASE (1 << CPU_BASE_BIT)
-#define CPU_VER (1 << CPU_VER_BIT)
-#define CPU_CONF (1 << CPU_CONF_BIT)
-#define CPU_SMM (1 << CPU_SMM_BIT)
-#define CPU_SVM (1 << CPU_SVM_BIT)
-#define CPU_OSVM (1 << CPU_OSVM_BIT)
-#define CPU_TSS (1 << CPU_TSS_BIT)
-#define CPU_CR (1 << CPU_CR_BIT)
-#define CPU_DT (1 << CPU_DT_BIT)
-
-/* Register file flags */
-enum cpu_file_bit {
- CPU_INDEX_BIT, /* index */
- CPU_VALUE_BIT, /* value */
-};
-
-#define CPU_FILE_VALUE (1 << CPU_VALUE_BIT)
-
-#define MAX_CPU_FILES 512
-
-struct cpu_private {
- unsigned cpu;
- unsigned type;
- unsigned reg;
- unsigned file;
-};
-
-struct cpu_debug_base {
- char *name; /* Register name */
- unsigned flag; /* Register flag */
- unsigned write; /* Register write flag */
-};
-
-/*
- * Currently it looks similar to cpu_debug_base but once we add more files
- * cpu_file_base will go in different direction
- */
-struct cpu_file_base {
- char *name; /* Register file name */
- unsigned flag; /* Register file flag */
- unsigned write; /* Register write flag */
-};
-
-struct cpu_cpuX_base {
- struct dentry *dentry; /* Register dentry */
- int init; /* Register index file */
-};
-
-struct cpu_debug_range {
- unsigned min; /* Register range min */
- unsigned max; /* Register range max */
- unsigned flag; /* Supported flags */
-};
-
-#endif /* _ASM_X86_CPU_DEBUG_H */
diff --git a/arch/x86/include/asm/cpu_device_id.h b/arch/x86/include/asm/cpu_device_id.h
new file mode 100644
index 000000000000..6be777a06944
--- /dev/null
+++ b/arch/x86/include/asm/cpu_device_id.h
@@ -0,0 +1,207 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_CPU_DEVICE_ID
+#define _ASM_X86_CPU_DEVICE_ID
+
+/*
+ * Can't use <linux/bitfield.h> because it generates expressions that
+ * cannot be used in structure initializers. Bitfield construction
+ * here must match the union in struct cpuinfo_x86:
+ * union {
+ * struct {
+ * __u8 x86_model;
+ * __u8 x86;
+ * __u8 x86_vendor;
+ * __u8 x86_reserved;
+ * };
+ * __u32 x86_vfm;
+ * };
+ */
+#define VFM_MODEL_BIT 0
+#define VFM_FAMILY_BIT 8
+#define VFM_VENDOR_BIT 16
+#define VFM_RSVD_BIT 24
+
+#define VFM_MODEL_MASK GENMASK(VFM_FAMILY_BIT - 1, VFM_MODEL_BIT)
+#define VFM_FAMILY_MASK GENMASK(VFM_VENDOR_BIT - 1, VFM_FAMILY_BIT)
+#define VFM_VENDOR_MASK GENMASK(VFM_RSVD_BIT - 1, VFM_VENDOR_BIT)
+
+#define VFM_MODEL(vfm) (((vfm) & VFM_MODEL_MASK) >> VFM_MODEL_BIT)
+#define VFM_FAMILY(vfm) (((vfm) & VFM_FAMILY_MASK) >> VFM_FAMILY_BIT)
+#define VFM_VENDOR(vfm) (((vfm) & VFM_VENDOR_MASK) >> VFM_VENDOR_BIT)
+
+#define VFM_MAKE(_vendor, _family, _model) ( \
+ ((_model) << VFM_MODEL_BIT) | \
+ ((_family) << VFM_FAMILY_BIT) | \
+ ((_vendor) << VFM_VENDOR_BIT) \
+)
+
+/*
+ * Declare drivers belonging to specific x86 CPUs
+ * Similar in spirit to pci_device_id and related PCI functions
+ *
+ * The wildcard initializers are in mod_devicetable.h because
+ * file2alias needs them. Sigh.
+ */
+#include <linux/mod_devicetable.h>
+/* Get the INTEL_FAM* model defines */
+#include <asm/intel-family.h>
+/* And the X86_VENDOR_* ones */
+#include <asm/processor.h>
+
+/* Centaur FAM6 models */
+#define X86_CENTAUR_FAM6_C7_A 0xa
+#define X86_CENTAUR_FAM6_C7_D 0xd
+#define X86_CENTAUR_FAM6_NANO 0xf
+
+/* x86_cpu_id::flags */
+#define X86_CPU_ID_FLAG_ENTRY_VALID BIT(0)
+
+/**
+ * X86_MATCH_CPU - Base macro for CPU matching
+ * @_vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY
+ * The name is expanded to X86_VENDOR_@_vendor
+ * @_family: The family number or X86_FAMILY_ANY
+ * @_model: The model number, model constant or X86_MODEL_ANY
+ * @_steppings: Bitmask for steppings, stepping constant or X86_STEPPING_ANY
+ * @_feature: A X86_FEATURE bit or X86_FEATURE_ANY
+ * @_data: Driver specific data or NULL. The internal storage
+ * format is unsigned long. The supplied value, pointer
+ * etc. is cast to unsigned long internally.
+ *
+ * Use only if you need all selectors. Otherwise use one of the shorter
+ * macros of the X86_MATCH_* family. If there is no matching shorthand
+ * macro, consider adding one. If you really need to wrap one of the macros
+ * into another macro at the usage site for good reasons, then please
+ * start this local macro with X86_MATCH to allow easy grepping.
+ */
+#define X86_MATCH_CPU(_vendor, _family, _model, _steppings, _feature, _type, _data) { \
+ .vendor = _vendor, \
+ .family = _family, \
+ .model = _model, \
+ .steppings = _steppings, \
+ .feature = _feature, \
+ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, \
+ .type = _type, \
+ .driver_data = (unsigned long) _data \
+}
+
+/**
+ * X86_MATCH_VENDOR_FAM_FEATURE - Macro for matching vendor, family and CPU feature
+ * @vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY
+ * The name is expanded to X86_VENDOR_@vendor
+ * @family: The family number or X86_FAMILY_ANY
+ * @feature: A X86_FEATURE bit
+ * @data: Driver specific data or NULL. The internal storage
+ * format is unsigned long. The supplied value, pointer
+ * etc. is cast to unsigned long internally.
+ */
+#define X86_MATCH_VENDOR_FAM_FEATURE(vendor, family, feature, data) \
+ X86_MATCH_CPU(X86_VENDOR_##vendor, family, X86_MODEL_ANY, \
+ X86_STEPPING_ANY, feature, X86_CPU_TYPE_ANY, data)
+
+/**
+ * X86_MATCH_VENDOR_FEATURE - Macro for matching vendor and CPU feature
+ * @vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY
+ * The name is expanded to X86_VENDOR_@vendor
+ * @feature: A X86_FEATURE bit
+ * @data: Driver specific data or NULL. The internal storage
+ * format is unsigned long. The supplied value, pointer
+ * etc. is cast to unsigned long internally.
+ */
+#define X86_MATCH_VENDOR_FEATURE(vendor, feature, data) \
+ X86_MATCH_CPU(X86_VENDOR_##vendor, X86_FAMILY_ANY, X86_MODEL_ANY, \
+ X86_STEPPING_ANY, feature, X86_CPU_TYPE_ANY, data)
+
+/**
+ * X86_MATCH_FEATURE - Macro for matching a CPU feature
+ * @feature: A X86_FEATURE bit
+ * @data: Driver specific data or NULL. The internal storage
+ * format is unsigned long. The supplied value, pointer
+ * etc. is cast to unsigned long internally.
+ */
+#define X86_MATCH_FEATURE(feature, data) \
+ X86_MATCH_CPU(X86_VENDOR_ANY, X86_FAMILY_ANY, X86_MODEL_ANY, \
+ X86_STEPPING_ANY, feature, X86_CPU_TYPE_ANY, data)
+
+/**
+ * X86_MATCH_VENDOR_FAM_MODEL - Match vendor, family and model
+ * @vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY
+ * The name is expanded to X86_VENDOR_@vendor
+ * @family: The family number or X86_FAMILY_ANY
+ * @model: The model number, model constant or X86_MODEL_ANY
+ * @data: Driver specific data or NULL. The internal storage
+ * format is unsigned long. The supplied value, pointer
+ * etc. is cast to unsigned long internally.
+ */
+#define X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, data) \
+ X86_MATCH_CPU(X86_VENDOR_##vendor, family, model, X86_STEPPING_ANY, \
+ X86_FEATURE_ANY, X86_CPU_TYPE_ANY, data)
+
+/**
+ * X86_MATCH_VENDOR_FAM - Match vendor and family
+ * @vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY
+ * The name is expanded to X86_VENDOR_@vendor
+ * @family: The family number or X86_FAMILY_ANY
+ * @data: Driver specific data or NULL. The internal storage
+ * format is unsigned long. The supplied value, pointer
+ * etc. is cast to unsigned long internally.
+ */
+#define X86_MATCH_VENDOR_FAM(vendor, family, data) \
+ X86_MATCH_CPU(X86_VENDOR_##vendor, family, X86_MODEL_ANY, \
+ X86_STEPPING_ANY, X86_FEATURE_ANY, X86_CPU_TYPE_ANY, data)
+
+/**
+ * X86_MATCH_VFM - Match encoded vendor/family/model
+ * @vfm: Encoded 8-bits each for vendor, family, model
+ * @data: Driver specific data or NULL. The internal storage
+ * format is unsigned long. The supplied value, pointer
+ * etc. is cast to unsigned long internally.
+ */
+#define X86_MATCH_VFM(vfm, data) \
+ X86_MATCH_CPU(VFM_VENDOR(vfm), VFM_FAMILY(vfm), VFM_MODEL(vfm), \
+ X86_STEPPING_ANY, X86_FEATURE_ANY, X86_CPU_TYPE_ANY, data)
+
+#define __X86_STEPPINGS(mins, maxs) GENMASK(maxs, mins)
+/**
+ * X86_MATCH_VFM_STEPS - Match encoded vendor/family/model and steppings
+ * range.
+ * @vfm: Encoded 8-bits each for vendor, family, model
+ * @min_step: Lowest stepping number to match
+ * @max_step: Highest stepping number to match
+ * @data: Driver specific data or NULL. The internal storage
+ * format is unsigned long. The supplied value, pointer
+ * etc. is cast to unsigned long internally.
+ */
+#define X86_MATCH_VFM_STEPS(vfm, min_step, max_step, data) \
+ X86_MATCH_CPU(VFM_VENDOR(vfm), VFM_FAMILY(vfm), VFM_MODEL(vfm), \
+ __X86_STEPPINGS(min_step, max_step), X86_FEATURE_ANY, \
+ X86_CPU_TYPE_ANY, data)
+
+/**
+ * X86_MATCH_VFM_FEATURE - Match encoded vendor/family/model/feature
+ * @vfm: Encoded 8-bits each for vendor, family, model
+ * @feature: A X86_FEATURE bit
+ * @data: Driver specific data or NULL. The internal storage
+ * format is unsigned long. The supplied value, pointer
+ * etc. is cast to unsigned long internally.
+ */
+#define X86_MATCH_VFM_FEATURE(vfm, feature, data) \
+ X86_MATCH_CPU(VFM_VENDOR(vfm), VFM_FAMILY(vfm), VFM_MODEL(vfm), \
+ X86_STEPPING_ANY, feature, X86_CPU_TYPE_ANY, data)
+
+/**
+ * X86_MATCH_VFM_CPU_TYPE - Match encoded vendor/family/model/type
+ * @vfm: Encoded 8-bits each for vendor, family, model
+ * @type: CPU type e.g. P-core, E-core
+ * @data: Driver specific data or NULL. The internal storage
+ * format is unsigned long. The supplied value, pointer
+ * etc. is cast to unsigned long internally.
+ */
+#define X86_MATCH_VFM_CPU_TYPE(vfm, type, data) \
+ X86_MATCH_CPU(VFM_VENDOR(vfm), VFM_FAMILY(vfm), VFM_MODEL(vfm), \
+ X86_STEPPING_ANY, X86_FEATURE_ANY, type, data)
+
+extern const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match);
+extern bool x86_match_min_microcode_rev(const struct x86_cpu_id *table);
+
+#endif /* _ASM_X86_CPU_DEVICE_ID */
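Illustration (not part of the patch): a typical consumer of these macros is a driver that refuses to load on unsupported CPUs. The sketch below is illustrative; the driver name is hypothetical, while x86_match_cpu() and MODULE_DEVICE_TABLE(x86cpu, ...) are the standard pairing for such tables.

	#include <linux/module.h>
	#include <asm/cpu_device_id.h>
	#include <asm/cpufeatures.h>

	/* Hypothetical example driver: load only on CPUs with APERF/MPERF,
	 * or on any AMD family 0x19 part. */
	static const struct x86_cpu_id exdrv_cpu_ids[] = {
		X86_MATCH_FEATURE(X86_FEATURE_APERFMPERF, NULL),
		X86_MATCH_VENDOR_FAM(AMD, 0x19, NULL),
		{ }
	};
	MODULE_DEVICE_TABLE(x86cpu, exdrv_cpu_ids);

	static int __init exdrv_init(void)
	{
		if (!x86_match_cpu(exdrv_cpu_ids))
			return -ENODEV;
		return 0;
	}
	module_init(exdrv_init);
	MODULE_LICENSE("GPL");
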
diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
new file mode 100644
index 000000000000..462fc34f1317
--- /dev/null
+++ b/arch/x86/include/asm/cpu_entry_area.h
@@ -0,0 +1,153 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _ASM_X86_CPU_ENTRY_AREA_H
+#define _ASM_X86_CPU_ENTRY_AREA_H
+
+#include <linux/percpu-defs.h>
+#include <asm/processor.h>
+#include <asm/intel_ds.h>
+#include <asm/pgtable_areas.h>
+
+#ifdef CONFIG_X86_64
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+#define VC_EXCEPTION_STKSZ EXCEPTION_STKSZ
+#else
+#define VC_EXCEPTION_STKSZ 0
+#endif
+
+/* Macro to enforce the same ordering and stack sizes */
+#define ESTACKS_MEMBERS(guardsize, optional_stack_size) \
+ char DF_stack_guard[guardsize]; \
+ char DF_stack[EXCEPTION_STKSZ]; \
+ char NMI_stack_guard[guardsize]; \
+ char NMI_stack[EXCEPTION_STKSZ]; \
+ char DB_stack_guard[guardsize]; \
+ char DB_stack[EXCEPTION_STKSZ]; \
+ char MCE_stack_guard[guardsize]; \
+ char MCE_stack[EXCEPTION_STKSZ]; \
+ char VC_stack_guard[guardsize]; \
+ char VC_stack[optional_stack_size]; \
+ char VC2_stack_guard[guardsize]; \
+ char VC2_stack[optional_stack_size]; \
+ char IST_top_guard[guardsize]; \
+
+/* The exception stacks' physical storage. No guard pages required */
+struct exception_stacks {
+ ESTACKS_MEMBERS(0, VC_EXCEPTION_STKSZ)
+};
+
+/* The effective cpu entry area mapping with guard pages. */
+struct cea_exception_stacks {
+ ESTACKS_MEMBERS(PAGE_SIZE, EXCEPTION_STKSZ)
+};
+
+/*
+ * The exception stack ordering in [cea_]exception_stacks
+ */
+enum exception_stack_ordering {
+ ESTACK_DF,
+ ESTACK_NMI,
+ ESTACK_DB,
+ ESTACK_MCE,
+ ESTACK_VC,
+ ESTACK_VC2,
+ N_EXCEPTION_STACKS
+};
+
+#define CEA_ESTACK_SIZE(st) \
+ sizeof(((struct cea_exception_stacks *)0)->st## _stack)
+
+#define CEA_ESTACK_BOT(ceastp, st) \
+ ((unsigned long)&(ceastp)->st## _stack)
+
+#define CEA_ESTACK_TOP(ceastp, st) \
+ (CEA_ESTACK_BOT(ceastp, st) + CEA_ESTACK_SIZE(st))
+
+#define CEA_ESTACK_OFFS(st) \
+ offsetof(struct cea_exception_stacks, st## _stack)
+
+#define CEA_ESTACK_PAGES \
+ (sizeof(struct cea_exception_stacks) / PAGE_SIZE)
+
+#endif
+
+#ifdef CONFIG_X86_32
+struct doublefault_stack {
+ unsigned long stack[(PAGE_SIZE - sizeof(struct x86_hw_tss)) / sizeof(unsigned long)];
+ struct x86_hw_tss tss;
+} __aligned(PAGE_SIZE);
+#endif
+
+/*
+ * cpu_entry_area is a percpu region that contains things needed by the CPU
+ * and early entry/exit code. Real types aren't used for all fields here
+ * to avoid circular header dependencies.
+ *
+ * Every field is a virtual alias of some other allocated backing store.
+ * There is no direct allocation of a struct cpu_entry_area.
+ */
+struct cpu_entry_area {
+ char gdt[PAGE_SIZE];
+
+ /*
+ * The GDT is just below entry_stack and thus serves (on x86_64) as
+ * a read-only guard page. On 32-bit the GDT must be writeable, so
+ * it needs an extra guard page.
+ */
+#ifdef CONFIG_X86_32
+ char guard_entry_stack[PAGE_SIZE];
+#endif
+ struct entry_stack_page entry_stack_page;
+
+#ifdef CONFIG_X86_32
+ char guard_doublefault_stack[PAGE_SIZE];
+ struct doublefault_stack doublefault_stack;
+#endif
+
+ /*
+ * On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because
+ * we need task switches to work, and task switches write to the TSS.
+ */
+ struct tss_struct tss;
+
+#ifdef CONFIG_X86_64
+ /*
+ * Exception stacks used for IST entries with guard pages.
+ */
+ struct cea_exception_stacks estacks;
+#endif
+ /*
+ * Per CPU debug store for Intel performance monitoring. Wastes a
+ * full page at the moment.
+ */
+ struct debug_store cpu_debug_store;
+ /*
+ * The actual PEBS/BTS buffers must be mapped to user space
+ * Reserve enough fixmap PTEs.
+ */
+ struct debug_store_buffers cpu_debug_buffers;
+};
+
+#define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area))
+
+DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
+DECLARE_PER_CPU(struct cea_exception_stacks *, cea_exception_stacks);
+
+extern void setup_cpu_entry_areas(void);
+extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags);
+
+extern struct cpu_entry_area *get_cpu_entry_area(int cpu);
+
+static __always_inline struct entry_stack *cpu_entry_stack(int cpu)
+{
+ return &get_cpu_entry_area(cpu)->entry_stack_page.stack;
+}
+
+#define __this_cpu_ist_top_va(name) \
+ CEA_ESTACK_TOP(__this_cpu_read(cea_exception_stacks), name)
+
+#define __this_cpu_ist_bottom_va(name) \
+ CEA_ESTACK_BOT(__this_cpu_read(cea_exception_stacks), name)
+
+#endif
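Illustration (not part of the patch): the __this_cpu_ist_top_va()/__this_cpu_ist_bottom_va() accessors defined above are how x86_64 code locates a given IST stack for the current CPU. A minimal sketch, with a hypothetical helper name, of testing whether a stack pointer lies on the NMI IST stack:

	/* Illustrative only: bounds-check an address against the current
	 * CPU's NMI exception stack. */
	static bool on_nmi_ist_stack(unsigned long sp)
	{
		unsigned long top = __this_cpu_ist_top_va(NMI);
		unsigned long bot = __this_cpu_ist_bottom_va(NMI);

		return sp >= bot && sp < top;
	}
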
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 4a28d22d4793..3ddc1d33399b 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -1,280 +1,142 @@
-/*
- * Defines x86 CPU feature bits
- */
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_CPUFEATURE_H
#define _ASM_X86_CPUFEATURE_H
-#include <asm/required-features.h>
-
-#define NCAPINTS 9 /* N 32-bit words worth of info */
-
-/*
- * Note: If the comment begins with a quoted string, that string is used
- * in /proc/cpuinfo instead of the macro name. If the string is "",
- * this feature bit is not displayed in /proc/cpuinfo at all.
- */
-
-/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */
-#define X86_FEATURE_FPU (0*32+ 0) /* Onboard FPU */
-#define X86_FEATURE_VME (0*32+ 1) /* Virtual Mode Extensions */
-#define X86_FEATURE_DE (0*32+ 2) /* Debugging Extensions */
-#define X86_FEATURE_PSE (0*32+ 3) /* Page Size Extensions */
-#define X86_FEATURE_TSC (0*32+ 4) /* Time Stamp Counter */
-#define X86_FEATURE_MSR (0*32+ 5) /* Model-Specific Registers */
-#define X86_FEATURE_PAE (0*32+ 6) /* Physical Address Extensions */
-#define X86_FEATURE_MCE (0*32+ 7) /* Machine Check Exception */
-#define X86_FEATURE_CX8 (0*32+ 8) /* CMPXCHG8 instruction */
-#define X86_FEATURE_APIC (0*32+ 9) /* Onboard APIC */
-#define X86_FEATURE_SEP (0*32+11) /* SYSENTER/SYSEXIT */
-#define X86_FEATURE_MTRR (0*32+12) /* Memory Type Range Registers */
-#define X86_FEATURE_PGE (0*32+13) /* Page Global Enable */
-#define X86_FEATURE_MCA (0*32+14) /* Machine Check Architecture */
-#define X86_FEATURE_CMOV (0*32+15) /* CMOV instructions */
- /* (plus FCMOVcc, FCOMI with FPU) */
-#define X86_FEATURE_PAT (0*32+16) /* Page Attribute Table */
-#define X86_FEATURE_PSE36 (0*32+17) /* 36-bit PSEs */
-#define X86_FEATURE_PN (0*32+18) /* Processor serial number */
-#define X86_FEATURE_CLFLSH (0*32+19) /* "clflush" CLFLUSH instruction */
-#define X86_FEATURE_DS (0*32+21) /* "dts" Debug Store */
-#define X86_FEATURE_ACPI (0*32+22) /* ACPI via MSR */
-#define X86_FEATURE_MMX (0*32+23) /* Multimedia Extensions */
-#define X86_FEATURE_FXSR (0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */
-#define X86_FEATURE_XMM (0*32+25) /* "sse" */
-#define X86_FEATURE_XMM2 (0*32+26) /* "sse2" */
-#define X86_FEATURE_SELFSNOOP (0*32+27) /* "ss" CPU self snoop */
-#define X86_FEATURE_HT (0*32+28) /* Hyper-Threading */
-#define X86_FEATURE_ACC (0*32+29) /* "tm" Automatic clock control */
-#define X86_FEATURE_IA64 (0*32+30) /* IA-64 processor */
-#define X86_FEATURE_PBE (0*32+31) /* Pending Break Enable */
-
-/* AMD-defined CPU features, CPUID level 0x80000001, word 1 */
-/* Don't duplicate feature flags which are redundant with Intel! */
-#define X86_FEATURE_SYSCALL (1*32+11) /* SYSCALL/SYSRET */
-#define X86_FEATURE_MP (1*32+19) /* MP Capable. */
-#define X86_FEATURE_NX (1*32+20) /* Execute Disable */
-#define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */
-#define X86_FEATURE_FXSR_OPT (1*32+25) /* FXSAVE/FXRSTOR optimizations */
-#define X86_FEATURE_GBPAGES (1*32+26) /* "pdpe1gb" GB pages */
-#define X86_FEATURE_RDTSCP (1*32+27) /* RDTSCP */
-#define X86_FEATURE_LM (1*32+29) /* Long Mode (x86-64) */
-#define X86_FEATURE_3DNOWEXT (1*32+30) /* AMD 3DNow! extensions */
-#define X86_FEATURE_3DNOW (1*32+31) /* 3DNow! */
-
-/* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */
-#define X86_FEATURE_RECOVERY (2*32+ 0) /* CPU in recovery mode */
-#define X86_FEATURE_LONGRUN (2*32+ 1) /* Longrun power control */
-#define X86_FEATURE_LRTI (2*32+ 3) /* LongRun table interface */
-
-/* Other features, Linux-defined mapping, word 3 */
-/* This range is used for feature bits which conflict or are synthesized */
-#define X86_FEATURE_CXMMX (3*32+ 0) /* Cyrix MMX extensions */
-#define X86_FEATURE_K6_MTRR (3*32+ 1) /* AMD K6 nonstandard MTRRs */
-#define X86_FEATURE_CYRIX_ARR (3*32+ 2) /* Cyrix ARRs (= MTRRs) */
-#define X86_FEATURE_CENTAUR_MCR (3*32+ 3) /* Centaur MCRs (= MTRRs) */
-/* cpu types for specific tunings: */
-#define X86_FEATURE_K8 (3*32+ 4) /* "" Opteron, Athlon64 */
-#define X86_FEATURE_K7 (3*32+ 5) /* "" Athlon */
-#define X86_FEATURE_P3 (3*32+ 6) /* "" P3 */
-#define X86_FEATURE_P4 (3*32+ 7) /* "" P4 */
-#define X86_FEATURE_CONSTANT_TSC (3*32+ 8) /* TSC ticks at a constant rate */
-#define X86_FEATURE_UP (3*32+ 9) /* smp kernel running on up */
-#define X86_FEATURE_FXSAVE_LEAK (3*32+10) /* "" FXSAVE leaks FOP/FIP/FOP */
-#define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */
-#define X86_FEATURE_PEBS (3*32+12) /* Precise-Event Based Sampling */
-#define X86_FEATURE_BTS (3*32+13) /* Branch Trace Store */
-#define X86_FEATURE_SYSCALL32 (3*32+14) /* "" syscall in ia32 userspace */
-#define X86_FEATURE_SYSENTER32 (3*32+15) /* "" sysenter in ia32 userspace */
-#define X86_FEATURE_REP_GOOD (3*32+16) /* rep microcode works well */
-#define X86_FEATURE_MFENCE_RDTSC (3*32+17) /* "" Mfence synchronizes RDTSC */
-#define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* "" Lfence synchronizes RDTSC */
-#define X86_FEATURE_11AP (3*32+19) /* "" Bad local APIC aka 11AP */
-#define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */
-#define X86_FEATURE_AMDC1E (3*32+21) /* AMD C1E detected */
-#define X86_FEATURE_XTOPOLOGY (3*32+22) /* cpu topology enum extensions */
-#define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */
-#define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */
-#define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */
-#define X86_FEATURE_EXTD_APICID (3*32+26) /* has extended APICID (8 bits) */
+#include <asm/processor.h>
-/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
-#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */
-#define X86_FEATURE_PCLMULQDQ (4*32+ 1) /* PCLMULQDQ instruction */
-#define X86_FEATURE_DTES64 (4*32+ 2) /* 64-bit Debug Store */
-#define X86_FEATURE_MWAIT (4*32+ 3) /* "monitor" Monitor/Mwait support */
-#define X86_FEATURE_DSCPL (4*32+ 4) /* "ds_cpl" CPL Qual. Debug Store */
-#define X86_FEATURE_VMX (4*32+ 5) /* Hardware virtualization */
-#define X86_FEATURE_SMX (4*32+ 6) /* Safer mode */
-#define X86_FEATURE_EST (4*32+ 7) /* Enhanced SpeedStep */
-#define X86_FEATURE_TM2 (4*32+ 8) /* Thermal Monitor 2 */
-#define X86_FEATURE_SSSE3 (4*32+ 9) /* Supplemental SSE-3 */
-#define X86_FEATURE_CID (4*32+10) /* Context ID */
-#define X86_FEATURE_FMA (4*32+12) /* Fused multiply-add */
-#define X86_FEATURE_CX16 (4*32+13) /* CMPXCHG16B */
-#define X86_FEATURE_XTPR (4*32+14) /* Send Task Priority Messages */
-#define X86_FEATURE_PDCM (4*32+15) /* Performance Capabilities */
-#define X86_FEATURE_DCA (4*32+18) /* Direct Cache Access */
-#define X86_FEATURE_XMM4_1 (4*32+19) /* "sse4_1" SSE-4.1 */
-#define X86_FEATURE_XMM4_2 (4*32+20) /* "sse4_2" SSE-4.2 */
-#define X86_FEATURE_X2APIC (4*32+21) /* x2APIC */
-#define X86_FEATURE_MOVBE (4*32+22) /* MOVBE instruction */
-#define X86_FEATURE_POPCNT (4*32+23) /* POPCNT instruction */
-#define X86_FEATURE_AES (4*32+25) /* AES instructions */
-#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
-#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */
-#define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */
-#define X86_FEATURE_HYPERVISOR (4*32+31) /* Running on a hypervisor */
-
-/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
-#define X86_FEATURE_XSTORE (5*32+ 2) /* "rng" RNG present (xstore) */
-#define X86_FEATURE_XSTORE_EN (5*32+ 3) /* "rng_en" RNG enabled */
-#define X86_FEATURE_XCRYPT (5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */
-#define X86_FEATURE_XCRYPT_EN (5*32+ 7) /* "ace_en" on-CPU crypto enabled */
-#define X86_FEATURE_ACE2 (5*32+ 8) /* Advanced Cryptography Engine v2 */
-#define X86_FEATURE_ACE2_EN (5*32+ 9) /* ACE v2 enabled */
-#define X86_FEATURE_PHE (5*32+10) /* PadLock Hash Engine */
-#define X86_FEATURE_PHE_EN (5*32+11) /* PHE enabled */
-#define X86_FEATURE_PMM (5*32+12) /* PadLock Montgomery Multiplier */
-#define X86_FEATURE_PMM_EN (5*32+13) /* PMM enabled */
-
-/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */
-#define X86_FEATURE_LAHF_LM (6*32+ 0) /* LAHF/SAHF in long mode */
-#define X86_FEATURE_CMP_LEGACY (6*32+ 1) /* If yes HyperThreading not valid */
-#define X86_FEATURE_SVM (6*32+ 2) /* Secure virtual machine */
-#define X86_FEATURE_EXTAPIC (6*32+ 3) /* Extended APIC space */
-#define X86_FEATURE_CR8_LEGACY (6*32+ 4) /* CR8 in 32-bit mode */
-#define X86_FEATURE_ABM (6*32+ 5) /* Advanced bit manipulation */
-#define X86_FEATURE_SSE4A (6*32+ 6) /* SSE-4A */
-#define X86_FEATURE_MISALIGNSSE (6*32+ 7) /* Misaligned SSE mode */
-#define X86_FEATURE_3DNOWPREFETCH (6*32+ 8) /* 3DNow prefetch instructions */
-#define X86_FEATURE_OSVW (6*32+ 9) /* OS Visible Workaround */
-#define X86_FEATURE_IBS (6*32+10) /* Instruction Based Sampling */
-#define X86_FEATURE_SSE5 (6*32+11) /* SSE-5 */
-#define X86_FEATURE_SKINIT (6*32+12) /* SKINIT/STGI instructions */
-#define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */
-
-/*
- * Auxiliary flags: Linux defined - For features scattered in various
- * CPUID levels like 0x6, 0xA etc
- */
-#define X86_FEATURE_IDA (7*32+ 0) /* Intel Dynamic Acceleration */
-#define X86_FEATURE_ARAT (7*32+ 1) /* Always Running APIC Timer */
-
-/* Virtualization flags: Linux defined */
-#define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */
-#define X86_FEATURE_VNMI (8*32+ 1) /* Intel Virtual NMI */
-#define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */
-#define X86_FEATURE_EPT (8*32+ 3) /* Intel Extended Page Table */
-#define X86_FEATURE_VPID (8*32+ 4) /* Intel Virtual Processor ID */
-
-#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
+#if defined(__KERNEL__) && !defined(__ASSEMBLER__)
+#include <asm/asm.h>
#include <linux/bitops.h>
+#include <asm/alternative.h>
+#include <asm/cpufeaturemasks.h>
+
+enum cpuid_leafs
+{
+ CPUID_1_EDX = 0,
+ CPUID_8000_0001_EDX,
+ CPUID_8086_0001_EDX,
+ CPUID_LNX_1,
+ CPUID_1_ECX,
+ CPUID_C000_0001_EDX,
+ CPUID_8000_0001_ECX,
+ CPUID_LNX_2,
+ CPUID_LNX_3,
+ CPUID_7_0_EBX,
+ CPUID_D_1_EAX,
+ CPUID_LNX_4,
+ CPUID_7_1_EAX,
+ CPUID_8000_0008_EBX,
+ CPUID_6_EAX,
+ CPUID_8000_000A_EDX,
+ CPUID_7_ECX,
+ CPUID_LNX_6,
+ CPUID_7_EDX,
+ CPUID_8000_001F_EAX,
+ CPUID_8000_0021_EAX,
+ CPUID_LNX_5,
+ NR_CPUID_WORDS,
+};
extern const char * const x86_cap_flags[NCAPINTS*32];
extern const char * const x86_power_flags[32];
+/*
+ * In order to save room, we index into this array by doing
+ * X86_BUG_<name> - NCAPINTS*32.
+ */
+extern const char * const x86_bug_flags[NBUGINTS*32];
+#define x86_bug_flag(flag) x86_bug_flags[flag]
+
#define test_cpu_cap(c, bit) \
- test_bit(bit, (unsigned long *)((c)->x86_capability))
+ arch_test_bit(bit, (unsigned long *)((c)->x86_capability))
#define cpu_has(c, bit) \
- (__builtin_constant_p(bit) && \
- ( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0)) || \
- (((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1)) || \
- (((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2)) || \
- (((bit)>>5)==3 && (1UL<<((bit)&31) & REQUIRED_MASK3)) || \
- (((bit)>>5)==4 && (1UL<<((bit)&31) & REQUIRED_MASK4)) || \
- (((bit)>>5)==5 && (1UL<<((bit)&31) & REQUIRED_MASK5)) || \
- (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \
- (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) ) \
- ? 1 : \
+ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
test_cpu_cap(c, bit))
-#define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit)
-
-#define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability))
-#define clear_cpu_cap(c, bit) clear_bit(bit, (unsigned long *)((c)->x86_capability))
-#define setup_clear_cpu_cap(bit) do { \
- clear_cpu_cap(&boot_cpu_data, bit); \
- set_bit(bit, (unsigned long *)cpu_caps_cleared); \
-} while (0)
-#define setup_force_cpu_cap(bit) do { \
- set_cpu_cap(&boot_cpu_data, bit); \
- set_bit(bit, (unsigned long *)cpu_caps_set); \
-} while (0)
-
-#define cpu_has_fpu boot_cpu_has(X86_FEATURE_FPU)
-#define cpu_has_vme boot_cpu_has(X86_FEATURE_VME)
-#define cpu_has_de boot_cpu_has(X86_FEATURE_DE)
-#define cpu_has_pse boot_cpu_has(X86_FEATURE_PSE)
-#define cpu_has_tsc boot_cpu_has(X86_FEATURE_TSC)
-#define cpu_has_pae boot_cpu_has(X86_FEATURE_PAE)
-#define cpu_has_pge boot_cpu_has(X86_FEATURE_PGE)
-#define cpu_has_apic boot_cpu_has(X86_FEATURE_APIC)
-#define cpu_has_sep boot_cpu_has(X86_FEATURE_SEP)
-#define cpu_has_mtrr boot_cpu_has(X86_FEATURE_MTRR)
-#define cpu_has_mmx boot_cpu_has(X86_FEATURE_MMX)
-#define cpu_has_fxsr boot_cpu_has(X86_FEATURE_FXSR)
-#define cpu_has_xmm boot_cpu_has(X86_FEATURE_XMM)
-#define cpu_has_xmm2 boot_cpu_has(X86_FEATURE_XMM2)
-#define cpu_has_xmm3 boot_cpu_has(X86_FEATURE_XMM3)
-#define cpu_has_aes boot_cpu_has(X86_FEATURE_AES)
-#define cpu_has_ht boot_cpu_has(X86_FEATURE_HT)
-#define cpu_has_mp boot_cpu_has(X86_FEATURE_MP)
-#define cpu_has_nx boot_cpu_has(X86_FEATURE_NX)
-#define cpu_has_k6_mtrr boot_cpu_has(X86_FEATURE_K6_MTRR)
-#define cpu_has_cyrix_arr boot_cpu_has(X86_FEATURE_CYRIX_ARR)
-#define cpu_has_centaur_mcr boot_cpu_has(X86_FEATURE_CENTAUR_MCR)
-#define cpu_has_xstore boot_cpu_has(X86_FEATURE_XSTORE)
-#define cpu_has_xstore_enabled boot_cpu_has(X86_FEATURE_XSTORE_EN)
-#define cpu_has_xcrypt boot_cpu_has(X86_FEATURE_XCRYPT)
-#define cpu_has_xcrypt_enabled boot_cpu_has(X86_FEATURE_XCRYPT_EN)
-#define cpu_has_ace2 boot_cpu_has(X86_FEATURE_ACE2)
-#define cpu_has_ace2_enabled boot_cpu_has(X86_FEATURE_ACE2_EN)
-#define cpu_has_phe boot_cpu_has(X86_FEATURE_PHE)
-#define cpu_has_phe_enabled boot_cpu_has(X86_FEATURE_PHE_EN)
-#define cpu_has_pmm boot_cpu_has(X86_FEATURE_PMM)
-#define cpu_has_pmm_enabled boot_cpu_has(X86_FEATURE_PMM_EN)
-#define cpu_has_ds boot_cpu_has(X86_FEATURE_DS)
-#define cpu_has_pebs boot_cpu_has(X86_FEATURE_PEBS)
-#define cpu_has_clflush boot_cpu_has(X86_FEATURE_CLFLSH)
-#define cpu_has_bts boot_cpu_has(X86_FEATURE_BTS)
-#define cpu_has_gbpages boot_cpu_has(X86_FEATURE_GBPAGES)
-#define cpu_has_arch_perfmon boot_cpu_has(X86_FEATURE_ARCH_PERFMON)
-#define cpu_has_pat boot_cpu_has(X86_FEATURE_PAT)
-#define cpu_has_xmm4_1 boot_cpu_has(X86_FEATURE_XMM4_1)
-#define cpu_has_xmm4_2 boot_cpu_has(X86_FEATURE_XMM4_2)
-#define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC)
-#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE)
-#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR)
-
-#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64)
-# define cpu_has_invlpg 1
-#else
-# define cpu_has_invlpg (boot_cpu_data.x86 > 3)
-#endif
-
-#ifdef CONFIG_X86_64
-
-#undef cpu_has_vme
-#define cpu_has_vme 0
+#define this_cpu_has(bit) \
+ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
+ x86_this_cpu_test_bit(bit, cpu_info.x86_capability))
-#undef cpu_has_pae
-#define cpu_has_pae ___BUG___
-
-#undef cpu_has_mp
-#define cpu_has_mp 1
+/*
+ * This is the default CPU features testing macro to use in code.
+ *
+ * It is for detection of features which need kernel infrastructure to be
+ * used. It may *not* directly test the CPU itself. Use the cpu_has() family
+ * if you want true runtime testing of CPU features, like in hypervisor code
+ * where you are supporting a possible guest feature where host support for it
+ * is not relevant.
+ */
+#define cpu_feature_enabled(bit) \
+ (__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 0 : static_cpu_has(bit))
-#undef cpu_has_k6_mtrr
-#define cpu_has_k6_mtrr 0
+#define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit)
-#undef cpu_has_cyrix_arr
-#define cpu_has_cyrix_arr 0
+#define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability))
-#undef cpu_has_centaur_mcr
-#define cpu_has_centaur_mcr 0
+extern void setup_clear_cpu_cap(unsigned int bit);
+extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
+void check_cpufeature_deps(struct cpuinfo_x86 *c);
-#endif /* CONFIG_X86_64 */
+#define setup_force_cpu_cap(bit) do { \
+ \
+ if (!boot_cpu_has(bit)) \
+ WARN_ON(alternatives_patched); \
+ \
+ set_cpu_cap(&boot_cpu_data, bit); \
+ set_bit(bit, (unsigned long *)cpu_caps_set); \
+} while (0)
-#endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */
+#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit)
+/*
+ * Do not use an "m" constraint for [cap_byte] here: gcc doesn't know
+ * that this is only used on a fallback path and will sometimes cause
+ * it to manifest the address of boot_cpu_data in a register, fouling
+ * the mainline (post-initialization) code.
+ */
+static __always_inline bool _static_cpu_has(u16 bit)
+{
+ asm goto(ALTERNATIVE_TERNARY("jmp 6f", %c[feature], "", "jmp %l[t_no]")
+ ".pushsection .altinstr_aux,\"ax\"\n"
+ "6:\n"
+ ANNOTATE_DATA_SPECIAL "\n"
+ " testb %[bitnum], %a[cap_byte]\n"
+ " jnz %l[t_yes]\n"
+ " jmp %l[t_no]\n"
+ ".popsection\n"
+ : : [feature] "i" (bit),
+ [bitnum] "i" (1 << (bit & 7)),
+ [cap_byte] "i" (&((const char *)boot_cpu_data.x86_capability)[bit >> 3])
+ : : t_yes, t_no);
+t_yes:
+ return true;
+t_no:
+ return false;
+}
+
+#define static_cpu_has(bit) \
+( \
+ __builtin_constant_p(boot_cpu_has(bit)) ? \
+ boot_cpu_has(bit) : \
+ _static_cpu_has(bit) \
+)
+
+#define cpu_has_bug(c, bit) cpu_has(c, (bit))
+#define set_cpu_bug(c, bit) set_cpu_cap(c, (bit))
+#define clear_cpu_bug(c, bit) clear_cpu_cap(c, (bit))
+
+#define static_cpu_has_bug(bit) static_cpu_has((bit))
+#define boot_cpu_has_bug(bit) cpu_has_bug(&boot_cpu_data, (bit))
+#define boot_cpu_set_bug(bit) set_cpu_cap(&boot_cpu_data, (bit))
+
+#define MAX_CPU_FEATURES (NCAPINTS * 32)
+#define cpu_have_feature boot_cpu_has
+
+#define CPU_FEATURE_TYPEFMT "x86,ven%04Xfam%04Xmod%04X"
+#define CPU_FEATURE_TYPEVAL boot_cpu_data.x86_vendor, boot_cpu_data.x86, \
+ boot_cpu_data.x86_model
+
+#endif /* defined(__KERNEL__) && !defined(__ASSEMBLER__) */
#endif /* _ASM_X86_CPUFEATURE_H */
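Illustration (not part of the patch): the header above offers several tiers of feature tests: cpu_has() against a specific cpuinfo_x86, boot_cpu_has() against the boot CPU, and cpu_feature_enabled()/static_cpu_has(), which add compile-time pruning via the DISABLED mask and runtime patching via alternatives. A brief, hypothetical sketch of the common idioms:

	#include <linux/printk.h>
	#include <asm/cpufeature.h>
	#include <asm/cpufeatures.h>

	/* Hypothetical init-time helper showing the usual patterns. */
	static void __init example_feature_checks(void)
	{
		/* Slow path / init code: a plain boot-CPU bitmap test. */
		if (boot_cpu_has(X86_FEATURE_MTRR))
			pr_info("MTRRs available\n");

		/* Preferred for hot paths: compile-time DISABLED-mask pruning
		 * plus runtime patching via alternatives. */
		if (cpu_feature_enabled(X86_FEATURE_XSAVE))
			pr_info("XSAVE in use\n");
	}
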
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
new file mode 100644
index 000000000000..c3b53beb1300
--- /dev/null
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -0,0 +1,573 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_CPUFEATURES_H
+#define _ASM_X86_CPUFEATURES_H
+
+/*
+ * Defines x86 CPU feature bits
+ */
+#define NCAPINTS 22 /* N 32-bit words worth of info */
+#define NBUGINTS 2 /* N 32-bit bug flags */
+
+/*
+ * Note: If the comment begins with a quoted string, that string is used
+ * in /proc/cpuinfo instead of the macro name. Otherwise, this feature
+ * bit is not displayed in /proc/cpuinfo at all.
+ *
+ * When adding new features here that depend on other features,
+ * please update the table in kernel/cpu/cpuid-deps.c as well.
+ */
+
+/* Intel-defined CPU features, CPUID level 0x00000001 (EDX), word 0 */
+#define X86_FEATURE_FPU ( 0*32+ 0) /* "fpu" Onboard FPU */
+#define X86_FEATURE_VME ( 0*32+ 1) /* "vme" Virtual Mode Extensions */
+#define X86_FEATURE_DE ( 0*32+ 2) /* "de" Debugging Extensions */
+#define X86_FEATURE_PSE ( 0*32+ 3) /* "pse" Page Size Extensions */
+#define X86_FEATURE_TSC ( 0*32+ 4) /* "tsc" Time Stamp Counter */
+#define X86_FEATURE_MSR ( 0*32+ 5) /* "msr" Model-Specific Registers */
+#define X86_FEATURE_PAE ( 0*32+ 6) /* "pae" Physical Address Extensions */
+#define X86_FEATURE_MCE ( 0*32+ 7) /* "mce" Machine Check Exception */
+#define X86_FEATURE_CX8 ( 0*32+ 8) /* "cx8" CMPXCHG8 instruction */
+#define X86_FEATURE_APIC ( 0*32+ 9) /* "apic" Onboard APIC */
+#define X86_FEATURE_SEP ( 0*32+11) /* "sep" SYSENTER/SYSEXIT */
+#define X86_FEATURE_MTRR ( 0*32+12) /* "mtrr" Memory Type Range Registers */
+#define X86_FEATURE_PGE ( 0*32+13) /* "pge" Page Global Enable */
+#define X86_FEATURE_MCA ( 0*32+14) /* "mca" Machine Check Architecture */
+#define X86_FEATURE_CMOV ( 0*32+15) /* "cmov" CMOV instructions (plus FCMOVcc, FCOMI with FPU) */
+#define X86_FEATURE_PAT ( 0*32+16) /* "pat" Page Attribute Table */
+#define X86_FEATURE_PSE36 ( 0*32+17) /* "pse36" 36-bit PSEs */
+#define X86_FEATURE_PN ( 0*32+18) /* "pn" Processor serial number */
+#define X86_FEATURE_CLFLUSH ( 0*32+19) /* "clflush" CLFLUSH instruction */
+#define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */
+#define X86_FEATURE_ACPI ( 0*32+22) /* "acpi" ACPI via MSR */
+#define X86_FEATURE_MMX ( 0*32+23) /* "mmx" Multimedia Extensions */
+#define X86_FEATURE_FXSR ( 0*32+24) /* "fxsr" FXSAVE/FXRSTOR, CR4.OSFXSR */
+#define X86_FEATURE_XMM ( 0*32+25) /* "sse" */
+#define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */
+#define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */
+#define X86_FEATURE_HT ( 0*32+28) /* "ht" Hyper-Threading */
+#define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */
+#define X86_FEATURE_IA64 ( 0*32+30) /* "ia64" IA-64 processor */
+#define X86_FEATURE_PBE ( 0*32+31) /* "pbe" Pending Break Enable */
+
+/* AMD-defined CPU features, CPUID level 0x80000001, word 1 */
+/* Don't duplicate feature flags which are redundant with Intel! */
+#define X86_FEATURE_SYSCALL ( 1*32+11) /* "syscall" SYSCALL/SYSRET */
+#define X86_FEATURE_MP ( 1*32+19) /* "mp" MP Capable */
+#define X86_FEATURE_NX ( 1*32+20) /* "nx" Execute Disable */
+#define X86_FEATURE_MMXEXT ( 1*32+22) /* "mmxext" AMD MMX extensions */
+#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* "fxsr_opt" FXSAVE/FXRSTOR optimizations */
+#define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */
+#define X86_FEATURE_RDTSCP ( 1*32+27) /* "rdtscp" RDTSCP */
+#define X86_FEATURE_LM ( 1*32+29) /* "lm" Long Mode (x86-64, 64-bit support) */
+#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* "3dnowext" AMD 3DNow extensions */
+#define X86_FEATURE_3DNOW ( 1*32+31) /* "3dnow" 3DNow */
+
+/* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */
+#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* "recovery" CPU in recovery mode */
+#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* "longrun" Longrun power control */
+#define X86_FEATURE_LRTI ( 2*32+ 3) /* "lrti" LongRun table interface */
+
+/* Other features, Linux-defined mapping, word 3 */
+/* This range is used for feature bits which conflict or are synthesized */
+#define X86_FEATURE_CXMMX ( 3*32+ 0) /* "cxmmx" Cyrix MMX extensions */
+#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* "k6_mtrr" AMD K6 nonstandard MTRRs */
+#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* "cyrix_arr" Cyrix ARRs (= MTRRs) */
+#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* "centaur_mcr" Centaur MCRs (= MTRRs) */
+#define X86_FEATURE_K8 ( 3*32+ 4) /* Opteron, Athlon64 */
+#define X86_FEATURE_ZEN5 ( 3*32+ 5) /* CPU based on Zen5 microarchitecture */
+#define X86_FEATURE_ZEN6 ( 3*32+ 6) /* CPU based on Zen6 microarchitecture */
+/* Free ( 3*32+ 7) */
+#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* "constant_tsc" TSC ticks at a constant rate */
+#define X86_FEATURE_UP ( 3*32+ 9) /* "up" SMP kernel running on UP */
+#define X86_FEATURE_ART ( 3*32+10) /* "art" Always running timer (ART) */
+#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* "arch_perfmon" Intel Architectural PerfMon */
+#define X86_FEATURE_PEBS ( 3*32+12) /* "pebs" Precise-Event Based Sampling */
+#define X86_FEATURE_BTS ( 3*32+13) /* "bts" Branch Trace Store */
+#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* syscall in IA32 userspace */
+#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* sysenter in IA32 userspace */
+#define X86_FEATURE_REP_GOOD ( 3*32+16) /* "rep_good" REP microcode works well */
+#define X86_FEATURE_AMD_LBR_V2 ( 3*32+17) /* "amd_lbr_v2" AMD Last Branch Record Extension Version 2 */
+#define X86_FEATURE_CLEAR_CPU_BUF ( 3*32+18) /* Clear CPU buffers using VERW */
+#define X86_FEATURE_ACC_POWER ( 3*32+19) /* "acc_power" AMD Accumulated Power Mechanism */
+#define X86_FEATURE_NOPL ( 3*32+20) /* "nopl" The NOPL (0F 1F) instructions */
+#define X86_FEATURE_ALWAYS ( 3*32+21) /* Always-present feature */
+#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* "xtopology" CPU topology enum extensions */
+#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* "tsc_reliable" TSC is known to be reliable */
+#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* "nonstop_tsc" TSC does not stop in C states */
+#define X86_FEATURE_CPUID ( 3*32+25) /* "cpuid" CPU has CPUID instruction itself */
+#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* "extd_apicid" Extended APICID (8 bits) */
+#define X86_FEATURE_AMD_DCM ( 3*32+27) /* "amd_dcm" AMD multi-node processor */
+#define X86_FEATURE_APERFMPERF ( 3*32+28) /* "aperfmperf" P-State hardware coordination feedback capability (APERF/MPERF MSRs) */
+#define X86_FEATURE_RAPL ( 3*32+29) /* "rapl" AMD/Hygon RAPL interface */
+#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* "nonstop_tsc_s3" TSC doesn't stop in S3 state */
+#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* "tsc_known_freq" TSC has known frequency */
+
+/* Intel-defined CPU features, CPUID level 0x00000001 (ECX), word 4 */
+#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */
+#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* "pclmulqdq" PCLMULQDQ instruction */
+#define X86_FEATURE_DTES64 ( 4*32+ 2) /* "dtes64" 64-bit Debug Store */
+#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" MONITOR/MWAIT support */
+#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL-qualified (filtered) Debug Store */
+#define X86_FEATURE_VMX ( 4*32+ 5) /* "vmx" Hardware virtualization */
+#define X86_FEATURE_SMX ( 4*32+ 6) /* "smx" Safer Mode eXtensions */
+#define X86_FEATURE_EST ( 4*32+ 7) /* "est" Enhanced SpeedStep */
+#define X86_FEATURE_TM2 ( 4*32+ 8) /* "tm2" Thermal Monitor 2 */
+#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* "ssse3" Supplemental SSE-3 */
+#define X86_FEATURE_CID ( 4*32+10) /* "cid" Context ID */
+#define X86_FEATURE_SDBG ( 4*32+11) /* "sdbg" Silicon Debug */
+#define X86_FEATURE_FMA ( 4*32+12) /* "fma" Fused multiply-add */
+#define X86_FEATURE_CX16 ( 4*32+13) /* "cx16" CMPXCHG16B instruction */
+#define X86_FEATURE_XTPR ( 4*32+14) /* "xtpr" Send Task Priority Messages */
+#define X86_FEATURE_PDCM ( 4*32+15) /* "pdcm" Perf/Debug Capabilities MSR */
+#define X86_FEATURE_PCID ( 4*32+17) /* "pcid" Process Context Identifiers */
+#define X86_FEATURE_DCA ( 4*32+18) /* "dca" Direct Cache Access */
+#define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */
+#define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */
+#define X86_FEATURE_X2APIC ( 4*32+21) /* "x2apic" X2APIC */
+#define X86_FEATURE_MOVBE ( 4*32+22) /* "movbe" MOVBE instruction */
+#define X86_FEATURE_POPCNT ( 4*32+23) /* "popcnt" POPCNT instruction */
+#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* "tsc_deadline_timer" TSC deadline timer */
+#define X86_FEATURE_AES ( 4*32+25) /* "aes" AES instructions */
+#define X86_FEATURE_XSAVE ( 4*32+26) /* "xsave" XSAVE/XRSTOR/XSETBV/XGETBV instructions */
+#define X86_FEATURE_OSXSAVE ( 4*32+27) /* XSAVE instruction enabled in the OS */
+#define X86_FEATURE_AVX ( 4*32+28) /* "avx" Advanced Vector Extensions */
+#define X86_FEATURE_F16C ( 4*32+29) /* "f16c" 16-bit FP conversions */
+#define X86_FEATURE_RDRAND ( 4*32+30) /* "rdrand" RDRAND instruction */
+#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* "hypervisor" Running on a hypervisor */
+
+/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
+#define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */
+#define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */
+#define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */
+#define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */
+#define X86_FEATURE_ACE2 ( 5*32+ 8) /* "ace2" Advanced Cryptography Engine v2 */
+#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* "ace2_en" ACE v2 enabled */
+#define X86_FEATURE_PHE ( 5*32+10) /* "phe" PadLock Hash Engine */
+#define X86_FEATURE_PHE_EN ( 5*32+11) /* "phe_en" PHE enabled */
+#define X86_FEATURE_PMM ( 5*32+12) /* "pmm" PadLock Montgomery Multiplier */
+#define X86_FEATURE_PMM_EN ( 5*32+13) /* "pmm_en" PMM enabled */
+
+/* More extended AMD flags: CPUID level 0x80000001, ECX, word 6 */
+#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* "lahf_lm" LAHF/SAHF in long mode */
+#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* "cmp_legacy" If yes HyperThreading not valid */
+#define X86_FEATURE_SVM ( 6*32+ 2) /* "svm" Secure Virtual Machine */
+#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* "extapic" Extended APIC space */
+#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* "cr8_legacy" CR8 in 32-bit mode */
+#define X86_FEATURE_ABM ( 6*32+ 5) /* "abm" Advanced bit manipulation */
+#define X86_FEATURE_SSE4A ( 6*32+ 6) /* "sse4a" SSE-4A */
+#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* "misalignsse" Misaligned SSE mode */
+#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* "3dnowprefetch" 3DNow prefetch instructions */
+#define X86_FEATURE_OSVW ( 6*32+ 9) /* "osvw" OS Visible Workaround */
+#define X86_FEATURE_IBS ( 6*32+10) /* "ibs" Instruction Based Sampling */
+#define X86_FEATURE_XOP ( 6*32+11) /* "xop" Extended AVX instructions */
+#define X86_FEATURE_SKINIT ( 6*32+12) /* "skinit" SKINIT/STGI instructions */
+#define X86_FEATURE_WDT ( 6*32+13) /* "wdt" Watchdog timer */
+#define X86_FEATURE_LWP ( 6*32+15) /* "lwp" Light Weight Profiling */
+#define X86_FEATURE_FMA4 ( 6*32+16) /* "fma4" 4 operands MAC instructions */
+#define X86_FEATURE_TCE ( 6*32+17) /* "tce" Translation Cache Extension */
+#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* "nodeid_msr" NodeId MSR */
+#define X86_FEATURE_TBM ( 6*32+21) /* "tbm" Trailing Bit Manipulations */
+#define X86_FEATURE_TOPOEXT ( 6*32+22) /* "topoext" Topology extensions CPUID leafs */
+#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* "perfctr_core" Core performance counter extensions */
+#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* "perfctr_nb" NB performance counter extensions */
+#define X86_FEATURE_BPEXT ( 6*32+26) /* "bpext" Data breakpoint extension */
+#define X86_FEATURE_PTSC ( 6*32+27) /* "ptsc" Performance time-stamp counter */
+#define X86_FEATURE_PERFCTR_LLC ( 6*32+28) /* "perfctr_llc" Last Level Cache performance counter extensions */
+#define X86_FEATURE_MWAITX ( 6*32+29) /* "mwaitx" MWAIT extension (MONITORX/MWAITX instructions) */
+
+/*
+ * Auxiliary flags: Linux defined - For features scattered in various
+ * CPUID levels like 0x6, 0xA etc, word 7.
+ *
+ * Reuse free bits when adding new feature flags!
+ */
+#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* "ring3mwait" Ring 3 MONITOR/MWAIT instructions */
+#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* "cpuid_fault" Intel CPUID faulting */
+#define X86_FEATURE_CPB ( 7*32+ 2) /* "cpb" AMD Core Performance Boost */
+#define X86_FEATURE_EPB ( 7*32+ 3) /* "epb" IA32_ENERGY_PERF_BIAS support */
+#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* "cat_l3" Cache Allocation Technology L3 */
+#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* "cat_l2" Cache Allocation Technology L2 */
+#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* "cdp_l3" Code and Data Prioritization L3 */
+#define X86_FEATURE_TDX_HOST_PLATFORM ( 7*32+ 7) /* "tdx_host_platform" Platform supports being a TDX host */
+#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* "hw_pstate" AMD HW-PState */
+#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* "proc_feedback" AMD ProcFeedbackInterface */
+#define X86_FEATURE_XCOMPACTED ( 7*32+10) /* Use compacted XSTATE (XSAVES or XSAVEC) */
+#define X86_FEATURE_PTI ( 7*32+11) /* "pti" Kernel Page Table Isolation enabled */
+#define X86_FEATURE_KERNEL_IBRS ( 7*32+12) /* Set/clear IBRS on kernel entry/exit */
+#define X86_FEATURE_RSB_VMEXIT ( 7*32+13) /* Fill RSB on VM-Exit */
+#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* "intel_ppin" Intel Processor Inventory Number */
+#define X86_FEATURE_CDP_L2 ( 7*32+15) /* "cdp_l2" Code and Data Prioritization L2 */
+#define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* MSR SPEC_CTRL is implemented */
+#define X86_FEATURE_SSBD ( 7*32+17) /* "ssbd" Speculative Store Bypass Disable */
+#define X86_FEATURE_MBA ( 7*32+18) /* "mba" Memory Bandwidth Allocation */
+#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */
+#define X86_FEATURE_PERFMON_V2 ( 7*32+20) /* "perfmon_v2" AMD Performance Monitoring Version 2 */
+#define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* Use IBRS during runtime firmware calls */
+#define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* Disable Speculative Store Bypass. */
+#define X86_FEATURE_LS_CFG_SSBD ( 7*32+24) /* AMD SSBD implementation via LS_CFG MSR */
+#define X86_FEATURE_IBRS ( 7*32+25) /* "ibrs" Indirect Branch Restricted Speculation */
+#define X86_FEATURE_IBPB ( 7*32+26) /* "ibpb" Indirect Branch Prediction Barrier without a guaranteed RSB flush */
+#define X86_FEATURE_STIBP ( 7*32+27) /* "stibp" Single Thread Indirect Branch Predictors */
+#define X86_FEATURE_ZEN ( 7*32+28) /* Generic flag for all Zen and newer */
+#define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* L1TF workaround PTE inversion */
+#define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* "ibrs_enhanced" Enhanced IBRS */
+#define X86_FEATURE_MSR_IA32_FEAT_CTL ( 7*32+31) /* MSR IA32_FEAT_CTL configured */
+
+/* Virtualization flags: Linux defined, word 8 */
+#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* "tpr_shadow" Intel TPR Shadow */
+#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 1) /* "flexpriority" Intel FlexPriority */
+#define X86_FEATURE_EPT ( 8*32+ 2) /* "ept" Intel Extended Page Table */
+#define X86_FEATURE_VPID ( 8*32+ 3) /* "vpid" Intel Virtual Processor ID */
+#define X86_FEATURE_COHERENCY_SFW_NO ( 8*32+ 4) /* SNP cache coherency software work around not needed */
+
+#define X86_FEATURE_VMMCALL ( 8*32+15) /* "vmmcall" Prefer VMMCALL to VMCALL */
+#define X86_FEATURE_XENPV ( 8*32+16) /* Xen paravirtual guest */
+#define X86_FEATURE_EPT_AD ( 8*32+17) /* "ept_ad" Intel Extended Page Table access-dirty bit */
+#define X86_FEATURE_VMCALL ( 8*32+18) /* Hypervisor supports the VMCALL instruction */
+#define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* VMware prefers VMMCALL hypercall instruction */
+#define X86_FEATURE_PVUNLOCK ( 8*32+20) /* PV unlock function */
+#define X86_FEATURE_VCPUPREEMPT ( 8*32+21) /* PV vcpu_is_preempted function */
+#define X86_FEATURE_TDX_GUEST ( 8*32+22) /* "tdx_guest" Intel Trust Domain Extensions Guest */
+
+/* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */
+#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* "fsgsbase" RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/
+#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* "tsc_adjust" TSC adjustment MSR 0x3B */
+#define X86_FEATURE_SGX ( 9*32+ 2) /* "sgx" Software Guard Extensions */
+#define X86_FEATURE_BMI1 ( 9*32+ 3) /* "bmi1" 1st group bit manipulation extensions */
+#define X86_FEATURE_HLE ( 9*32+ 4) /* "hle" Hardware Lock Elision */
+#define X86_FEATURE_AVX2 ( 9*32+ 5) /* "avx2" AVX2 instructions */
+#define X86_FEATURE_FDP_EXCPTN_ONLY ( 9*32+ 6) /* FPU data pointer updated only on x87 exceptions */
+#define X86_FEATURE_SMEP ( 9*32+ 7) /* "smep" Supervisor Mode Execution Protection */
+#define X86_FEATURE_BMI2 ( 9*32+ 8) /* "bmi2" 2nd group bit manipulation extensions */
+#define X86_FEATURE_ERMS ( 9*32+ 9) /* "erms" Enhanced REP MOVSB/STOSB instructions */
+#define X86_FEATURE_INVPCID ( 9*32+10) /* "invpcid" Invalidate Processor Context ID */
+#define X86_FEATURE_RTM ( 9*32+11) /* "rtm" Restricted Transactional Memory */
+#define X86_FEATURE_CQM ( 9*32+12) /* "cqm" Cache QoS Monitoring */
+#define X86_FEATURE_ZERO_FCS_FDS ( 9*32+13) /* Zero out FPU CS and FPU DS */
+#define X86_FEATURE_MPX ( 9*32+14) /* "mpx" Memory Protection Extension */
+#define X86_FEATURE_RDT_A ( 9*32+15) /* "rdt_a" Resource Director Technology Allocation */
+#define X86_FEATURE_AVX512F ( 9*32+16) /* "avx512f" AVX-512 Foundation */
+#define X86_FEATURE_AVX512DQ ( 9*32+17) /* "avx512dq" AVX-512 DQ (Double/Quad granular) Instructions */
+#define X86_FEATURE_RDSEED ( 9*32+18) /* "rdseed" RDSEED instruction */
+#define X86_FEATURE_ADX ( 9*32+19) /* "adx" ADCX and ADOX instructions */
+#define X86_FEATURE_SMAP ( 9*32+20) /* "smap" Supervisor Mode Access Prevention */
+#define X86_FEATURE_AVX512IFMA ( 9*32+21) /* "avx512ifma" AVX-512 Integer Fused Multiply-Add instructions */
+#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* "clflushopt" CLFLUSHOPT instruction */
+#define X86_FEATURE_CLWB ( 9*32+24) /* "clwb" CLWB instruction */
+#define X86_FEATURE_INTEL_PT ( 9*32+25) /* "intel_pt" Intel Processor Trace */
+#define X86_FEATURE_AVX512PF ( 9*32+26) /* "avx512pf" AVX-512 Prefetch */
+#define X86_FEATURE_AVX512ER ( 9*32+27) /* "avx512er" AVX-512 Exponential and Reciprocal */
+#define X86_FEATURE_AVX512CD ( 9*32+28) /* "avx512cd" AVX-512 Conflict Detection */
+#define X86_FEATURE_SHA_NI ( 9*32+29) /* "sha_ni" SHA1/SHA256 Instruction Extensions */
+#define X86_FEATURE_AVX512BW ( 9*32+30) /* "avx512bw" AVX-512 BW (Byte/Word granular) Instructions */
+#define X86_FEATURE_AVX512VL ( 9*32+31) /* "avx512vl" AVX-512 VL (128/256 Vector Length) Extensions */
+
+/* Extended state features, CPUID level 0x0000000d:1 (EAX), word 10 */
+#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* "xsaveopt" XSAVEOPT instruction */
+#define X86_FEATURE_XSAVEC (10*32+ 1) /* "xsavec" XSAVEC instruction */
+#define X86_FEATURE_XGETBV1 (10*32+ 2) /* "xgetbv1" XGETBV with ECX = 1 instruction */
+#define X86_FEATURE_XSAVES (10*32+ 3) /* "xsaves" XSAVES/XRSTORS instructions */
+#define X86_FEATURE_XFD (10*32+ 4) /* eXtended Feature Disabling */
+
+/*
+ * Extended auxiliary flags: Linux defined - for features scattered in various
+ * CPUID levels like 0xf, etc.
+ *
+ * Reuse free bits when adding new feature flags!
+ */
+#define X86_FEATURE_CQM_LLC (11*32+ 0) /* "cqm_llc" LLC QoS if 1 */
+#define X86_FEATURE_CQM_OCCUP_LLC (11*32+ 1) /* "cqm_occup_llc" LLC occupancy monitoring */
+#define X86_FEATURE_CQM_MBM_TOTAL (11*32+ 2) /* "cqm_mbm_total" LLC Total MBM monitoring */
+#define X86_FEATURE_CQM_MBM_LOCAL (11*32+ 3) /* "cqm_mbm_local" LLC Local MBM monitoring */
+#define X86_FEATURE_FENCE_SWAPGS_USER (11*32+ 4) /* LFENCE in user entry SWAPGS path */
+#define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* LFENCE in kernel entry SWAPGS path */
+#define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* "split_lock_detect" #AC for split lock */
+#define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* Per-thread Memory Bandwidth Allocation */
+#define X86_FEATURE_SGX1 (11*32+ 8) /* Basic SGX */
+#define X86_FEATURE_SGX2 (11*32+ 9) /* SGX Enclave Dynamic Memory Management (EDMM) */
+#define X86_FEATURE_ENTRY_IBPB (11*32+10) /* Issue an IBPB on kernel entry */
+#define X86_FEATURE_RRSBA_CTRL (11*32+11) /* RET prediction control */
+#define X86_FEATURE_RETPOLINE (11*32+12) /* Generic Retpoline mitigation for Spectre variant 2 */
+#define X86_FEATURE_RETPOLINE_LFENCE (11*32+13) /* Use LFENCE for Spectre variant 2 */
+#define X86_FEATURE_RETHUNK (11*32+14) /* Use REturn THUNK */
+#define X86_FEATURE_UNRET (11*32+15) /* AMD BTB untrain return */
+#define X86_FEATURE_USE_IBPB_FW (11*32+16) /* Use IBPB during runtime firmware calls */
+#define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* Fill RSB on VM exit when EIBRS is enabled */
+#define X86_FEATURE_SGX_EDECCSSA (11*32+18) /* SGX EDECCSSA user leaf function */
+#define X86_FEATURE_CALL_DEPTH (11*32+19) /* Call depth tracking for RSB stuffing */
+#define X86_FEATURE_MSR_TSX_CTRL (11*32+20) /* MSR IA32_TSX_CTRL (Intel) implemented */
+#define X86_FEATURE_SMBA (11*32+21) /* Slow Memory Bandwidth Allocation */
+#define X86_FEATURE_BMEC (11*32+22) /* Bandwidth Monitoring Event Configuration */
+#define X86_FEATURE_USER_SHSTK (11*32+23) /* "user_shstk" Shadow stack support for user mode applications */
+#define X86_FEATURE_SRSO (11*32+24) /* AMD BTB untrain RETs */
+#define X86_FEATURE_SRSO_ALIAS (11*32+25) /* AMD BTB untrain RETs through aliasing */
+#define X86_FEATURE_IBPB_ON_VMEXIT (11*32+26) /* Issue an IBPB only on VMEXIT */
+#define X86_FEATURE_APIC_MSRS_FENCE (11*32+27) /* IA32_TSC_DEADLINE and X2APIC MSRs need fencing */
+#define X86_FEATURE_ZEN2 (11*32+28) /* CPU based on Zen2 microarchitecture */
+#define X86_FEATURE_ZEN3 (11*32+29) /* CPU based on Zen3 microarchitecture */
+#define X86_FEATURE_ZEN4 (11*32+30) /* CPU based on Zen4 microarchitecture */
+#define X86_FEATURE_ZEN1 (11*32+31) /* CPU based on Zen1 microarchitecture */
+
+/* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
+#define X86_FEATURE_SHA512 (12*32+ 0) /* SHA512 instructions */
+#define X86_FEATURE_SM3 (12*32+ 1) /* SM3 instructions */
+#define X86_FEATURE_SM4 (12*32+ 2) /* SM4 instructions */
+#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* "avx_vnni" AVX VNNI instructions */
+#define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* "avx512_bf16" AVX512 BFLOAT16 instructions */
+#define X86_FEATURE_LASS (12*32+ 6) /* "lass" Linear Address Space Separation */
+#define X86_FEATURE_CMPCCXADD (12*32+ 7) /* CMPccXADD instructions */
+#define X86_FEATURE_ARCH_PERFMON_EXT (12*32+ 8) /* Intel Architectural PerfMon Extension */
+#define X86_FEATURE_FZRM (12*32+10) /* Fast zero-length REP MOVSB */
+#define X86_FEATURE_FSRS (12*32+11) /* Fast short REP STOSB */
+#define X86_FEATURE_FSRC (12*32+12) /* Fast short REP {CMPSB,SCASB} */
+#define X86_FEATURE_FRED (12*32+17) /* "fred" Flexible Return and Event Delivery */
+#define X86_FEATURE_LKGS (12*32+18) /* Like MOV_GS except MSR_KERNEL_GS_BASE = GS.base */
+#define X86_FEATURE_WRMSRNS (12*32+19) /* Non-serializing WRMSR */
+#define X86_FEATURE_AMX_FP16 (12*32+21) /* AMX fp16 Support */
+#define X86_FEATURE_AVX_IFMA (12*32+23) /* Support for VPMADD52[H,L]UQ */
+#define X86_FEATURE_LAM (12*32+26) /* "lam" Linear Address Masking */
+
+/* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */
+#define X86_FEATURE_CLZERO (13*32+ 0) /* "clzero" CLZERO instruction */
+#define X86_FEATURE_IRPERF (13*32+ 1) /* "irperf" Instructions Retired Count */
+#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* "xsaveerptr" Always save/restore FP error pointers */
+#define X86_FEATURE_INVLPGB (13*32+ 3) /* INVLPGB and TLBSYNC instructions supported */
+#define X86_FEATURE_RDPRU (13*32+ 4) /* "rdpru" Read processor register at user level */
+#define X86_FEATURE_WBNOINVD (13*32+ 9) /* "wbnoinvd" WBNOINVD instruction */
+#define X86_FEATURE_AMD_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */
+#define X86_FEATURE_AMD_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */
+#define X86_FEATURE_AMD_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */
+#define X86_FEATURE_AMD_STIBP_ALWAYS_ON (13*32+17) /* Single Thread Indirect Branch Predictors always-on preferred */
+#define X86_FEATURE_AMD_IBRS_SAME_MODE (13*32+19) /* Indirect Branch Restricted Speculation same mode protection */
+#define X86_FEATURE_EFER_LMSLE_MBZ (13*32+20) /* EFER.LMSLE must be zero */
+#define X86_FEATURE_AMD_PPIN (13*32+23) /* "amd_ppin" Protected Processor Inventory Number */
+#define X86_FEATURE_AMD_SSBD (13*32+24) /* Speculative Store Bypass Disable */
+#define X86_FEATURE_VIRT_SSBD (13*32+25) /* "virt_ssbd" Virtualized Speculative Store Bypass Disable */
+#define X86_FEATURE_AMD_SSB_NO (13*32+26) /* Speculative Store Bypass is fixed in hardware. */
+#define X86_FEATURE_CPPC (13*32+27) /* "cppc" Collaborative Processor Performance Control */
+#define X86_FEATURE_AMD_PSFD (13*32+28) /* Predictive Store Forwarding Disable */
+#define X86_FEATURE_BTC_NO (13*32+29) /* Not vulnerable to Branch Type Confusion */
+#define X86_FEATURE_AMD_IBPB_RET (13*32+30) /* IBPB clears return address predictor */
+#define X86_FEATURE_BRS (13*32+31) /* "brs" Branch Sampling available */
+
+/* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */
+#define X86_FEATURE_DTHERM (14*32+ 0) /* "dtherm" Digital Thermal Sensor */
+#define X86_FEATURE_IDA (14*32+ 1) /* "ida" Intel Dynamic Acceleration */
+#define X86_FEATURE_ARAT (14*32+ 2) /* "arat" Always Running APIC Timer */
+#define X86_FEATURE_PLN (14*32+ 4) /* "pln" Intel Power Limit Notification */
+#define X86_FEATURE_PTS (14*32+ 6) /* "pts" Intel Package Thermal Status */
+#define X86_FEATURE_HWP (14*32+ 7) /* "hwp" Intel Hardware P-states */
+#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* "hwp_notify" HWP Notification */
+#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* "hwp_act_window" HWP Activity Window */
+#define X86_FEATURE_HWP_EPP (14*32+10) /* "hwp_epp" HWP Energy Perf. Preference */
+#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* "hwp_pkg_req" HWP Package Level Request */
+#define X86_FEATURE_HWP_HIGHEST_PERF_CHANGE (14*32+15) /* HWP Highest perf change */
+#define X86_FEATURE_HFI (14*32+19) /* "hfi" Hardware Feedback Interface */
+
+/* AMD SVM Feature Identification, CPUID level 0x8000000a (EDX), word 15 */
+#define X86_FEATURE_NPT (15*32+ 0) /* "npt" Nested Page Table support */
+#define X86_FEATURE_LBRV (15*32+ 1) /* "lbrv" LBR Virtualization support */
+#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */
+#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */
+#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */
+#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */
+#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* "flushbyasid" Flush-by-ASID support */
+#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* "decodeassists" Decode Assists support */
+#define X86_FEATURE_PAUSEFILTER (15*32+10) /* "pausefilter" Filtered pause intercept */
+#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* "pfthreshold" Pause filter threshold */
+#define X86_FEATURE_AVIC (15*32+13) /* "avic" Virtual Interrupt Controller */
+#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* "v_vmsave_vmload" Virtual VMSAVE VMLOAD */
+#define X86_FEATURE_VGIF (15*32+16) /* "vgif" Virtual GIF */
+#define X86_FEATURE_X2AVIC (15*32+18) /* "x2avic" Virtual x2apic */
+#define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* "v_spec_ctrl" Virtual SPEC_CTRL */
+#define X86_FEATURE_VNMI (15*32+25) /* "vnmi" Virtual NMI */
+#define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* SVME addr check */
+#define X86_FEATURE_BUS_LOCK_THRESHOLD (15*32+29) /* Bus lock threshold */
+#define X86_FEATURE_IDLE_HLT (15*32+30) /* IDLE HLT intercept */
+
+/* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */
+#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* "avx512vbmi" AVX512 Vector Bit Manipulation instructions */
+#define X86_FEATURE_UMIP (16*32+ 2) /* "umip" User Mode Instruction Protection */
+#define X86_FEATURE_PKU (16*32+ 3) /* "pku" Protection Keys for Userspace */
+#define X86_FEATURE_OSPKE (16*32+ 4) /* "ospke" OS Protection Keys Enable */
+#define X86_FEATURE_WAITPKG (16*32+ 5) /* "waitpkg" UMONITOR/UMWAIT/TPAUSE Instructions */
+#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* "avx512_vbmi2" Additional AVX512 Vector Bit Manipulation Instructions */
+#define X86_FEATURE_SHSTK (16*32+ 7) /* Shadow stack */
+#define X86_FEATURE_GFNI (16*32+ 8) /* "gfni" Galois Field New Instructions */
+#define X86_FEATURE_VAES (16*32+ 9) /* "vaes" Vector AES */
+#define X86_FEATURE_VPCLMULQDQ (16*32+10) /* "vpclmulqdq" Carry-Less Multiplication Double Quadword */
+#define X86_FEATURE_AVX512_VNNI (16*32+11) /* "avx512_vnni" Vector Neural Network Instructions */
+#define X86_FEATURE_AVX512_BITALG (16*32+12) /* "avx512_bitalg" Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */
+#define X86_FEATURE_TME (16*32+13) /* "tme" Intel Total Memory Encryption */
+#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* "avx512_vpopcntdq" POPCNT for vectors of DW/QW */
+#define X86_FEATURE_LA57 (16*32+16) /* "la57" 5-level page tables */
+#define X86_FEATURE_RDPID (16*32+22) /* "rdpid" RDPID instruction */
+#define X86_FEATURE_BUS_LOCK_DETECT (16*32+24) /* "bus_lock_detect" Bus Lock detect */
+#define X86_FEATURE_CLDEMOTE (16*32+25) /* "cldemote" CLDEMOTE instruction */
+#define X86_FEATURE_MOVDIRI (16*32+27) /* "movdiri" MOVDIRI instruction */
+#define X86_FEATURE_MOVDIR64B (16*32+28) /* "movdir64b" MOVDIR64B instruction */
+#define X86_FEATURE_ENQCMD (16*32+29) /* "enqcmd" ENQCMD and ENQCMDS instructions */
+#define X86_FEATURE_SGX_LC (16*32+30) /* "sgx_lc" Software Guard Extensions Launch Control */
+
+/*
+ * Linux-defined word for use with scattered/synthetic bits.
+ */
+#define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* "overflow_recov" MCA overflow recovery support */
+#define X86_FEATURE_SUCCOR (17*32+ 1) /* "succor" Uncorrectable error containment and recovery */
+
+#define X86_FEATURE_SMCA (17*32+ 3) /* "smca" Scalable MCA */
+
+/* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */
+#define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* "avx512_4vnniw" AVX-512 Neural Network Instructions */
+#define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* "avx512_4fmaps" AVX-512 Multiply Accumulation Single precision */
+#define X86_FEATURE_FSRM (18*32+ 4) /* "fsrm" Fast Short Rep Mov */
+#define X86_FEATURE_AVX512_VP2INTERSECT (18*32+ 8) /* "avx512_vp2intersect" AVX-512 Intersect for D/Q */
+#define X86_FEATURE_SRBDS_CTRL (18*32+ 9) /* SRBDS mitigation MSR available */
+#define X86_FEATURE_MD_CLEAR (18*32+10) /* "md_clear" VERW clears CPU buffers */
+#define X86_FEATURE_RTM_ALWAYS_ABORT (18*32+11) /* RTM transaction always aborts */
+#define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* TSX_FORCE_ABORT */
+#define X86_FEATURE_SERIALIZE (18*32+14) /* "serialize" SERIALIZE instruction */
+#define X86_FEATURE_HYBRID_CPU (18*32+15) /* This part has CPUs of more than one type */
+#define X86_FEATURE_TSXLDTRK (18*32+16) /* "tsxldtrk" TSX Suspend Load Address Tracking */
+#define X86_FEATURE_PCONFIG (18*32+18) /* "pconfig" Intel PCONFIG */
+#define X86_FEATURE_ARCH_LBR (18*32+19) /* "arch_lbr" Intel ARCH LBR */
+#define X86_FEATURE_IBT (18*32+20) /* "ibt" Indirect Branch Tracking */
+#define X86_FEATURE_AMX_BF16 (18*32+22) /* "amx_bf16" AMX bf16 Support */
+#define X86_FEATURE_AVX512_FP16 (18*32+23) /* "avx512_fp16" AVX512 FP16 */
+#define X86_FEATURE_AMX_TILE (18*32+24) /* "amx_tile" AMX tile Support */
+#define X86_FEATURE_AMX_INT8 (18*32+25) /* "amx_int8" AMX int8 Support */
+#define X86_FEATURE_SPEC_CTRL (18*32+26) /* Speculation Control (IBRS + IBPB) */
+#define X86_FEATURE_INTEL_STIBP (18*32+27) /* Single Thread Indirect Branch Predictors */
+#define X86_FEATURE_FLUSH_L1D (18*32+28) /* "flush_l1d" Flush L1D cache */
+#define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* "arch_capabilities" IA32_ARCH_CAPABILITIES MSR (Intel) */
+#define X86_FEATURE_CORE_CAPABILITIES (18*32+30) /* IA32_CORE_CAPABILITIES MSR */
+#define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* Speculative Store Bypass Disable */
+
+/* AMD-defined memory encryption features, CPUID level 0x8000001f (EAX), word 19 */
+#define X86_FEATURE_SME (19*32+ 0) /* "sme" Secure Memory Encryption */
+#define X86_FEATURE_SEV (19*32+ 1) /* "sev" Secure Encrypted Virtualization */
+#define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* VM Page Flush MSR is supported */
+#define X86_FEATURE_SEV_ES (19*32+ 3) /* "sev_es" Secure Encrypted Virtualization - Encrypted State */
+#define X86_FEATURE_SEV_SNP (19*32+ 4) /* "sev_snp" Secure Encrypted Virtualization - Secure Nested Paging */
+#define X86_FEATURE_SNP_SECURE_TSC (19*32+ 8) /* SEV-SNP Secure TSC */
+#define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* Virtual TSC_AUX */
+#define X86_FEATURE_SME_COHERENT (19*32+10) /* hardware-enforced cache coherency */
+#define X86_FEATURE_DEBUG_SWAP (19*32+14) /* "debug_swap" SEV-ES full debug state swap support */
+#define X86_FEATURE_RMPREAD (19*32+21) /* RMPREAD instruction */
+#define X86_FEATURE_SEGMENTED_RMP (19*32+23) /* Segmented RMP support */
+#define X86_FEATURE_ALLOWED_SEV_FEATURES (19*32+27) /* Allowed SEV Features */
+#define X86_FEATURE_SVSM (19*32+28) /* "svsm" SVSM present */
+#define X86_FEATURE_HV_INUSE_WR_ALLOWED (19*32+30) /* Allow Write to in-use hypervisor-owned pages */
+
+/* AMD-defined Extended Feature 2 EAX, CPUID level 0x80000021 (EAX), word 20 */
+#define X86_FEATURE_NO_NESTED_DATA_BP (20*32+ 0) /* No Nested Data Breakpoints */
+#define X86_FEATURE_WRMSR_XX_BASE_NS (20*32+ 1) /* WRMSR to {FS,GS,KERNEL_GS}_BASE is non-serializing */
+#define X86_FEATURE_LFENCE_RDTSC (20*32+ 2) /* LFENCE always serializing / synchronizes RDTSC */
+#define X86_FEATURE_VERW_CLEAR (20*32+ 5) /* The memory form of VERW mitigates TSA */
+#define X86_FEATURE_NULL_SEL_CLR_BASE (20*32+ 6) /* Null Selector Clears Base */
+
+#define X86_FEATURE_AUTOIBRS (20*32+ 8) /* Automatic IBRS */
+#define X86_FEATURE_NO_SMM_CTL_MSR (20*32+ 9) /* SMM_CTL MSR is not present */
+
+#define X86_FEATURE_GP_ON_USER_CPUID (20*32+17) /* User CPUID faulting */
+
+#define X86_FEATURE_PREFETCHI (20*32+20) /* Prefetch Data/Instruction to Cache Level */
+#define X86_FEATURE_SBPB (20*32+27) /* Selective Branch Prediction Barrier */
+#define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* MSR_PRED_CMD[IBPB] flushes all branch type predictions */
+#define X86_FEATURE_SRSO_NO (20*32+29) /* CPU is not affected by SRSO */
+#define X86_FEATURE_SRSO_USER_KERNEL_NO (20*32+30) /* CPU is not affected by SRSO across user/kernel boundaries */
+#define X86_FEATURE_SRSO_BP_SPEC_REDUCE (20*32+31) /*
+ * BP_CFG[BpSpecReduce] can be used to mitigate SRSO for VMs.
+ * (SRSO_MSR_FIX in the official doc).
+ */
+
+/*
+ * Extended auxiliary flags: Linux defined - for features scattered in various
+ * CPUID levels like 0x80000022, etc., and Linux-defined features.
+ *
+ * Reuse free bits when adding new feature flags!
+ */
+#define X86_FEATURE_AMD_LBR_PMC_FREEZE (21*32+ 0) /* "amd_lbr_pmc_freeze" AMD LBR and PMC Freeze */
+#define X86_FEATURE_CLEAR_BHB_LOOP (21*32+ 1) /* Clear branch history at syscall entry using SW loop */
+#define X86_FEATURE_BHI_CTRL (21*32+ 2) /* BHI_DIS_S HW control available */
+#define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* BHI_DIS_S HW control enabled */
+#define X86_FEATURE_CLEAR_BHB_VMEXIT (21*32+ 4) /* Clear branch history at vmexit using SW loop */
+#define X86_FEATURE_AMD_FAST_CPPC (21*32+ 5) /* Fast CPPC */
+#define X86_FEATURE_AMD_HTR_CORES (21*32+ 6) /* Heterogeneous Core Topology */
+#define X86_FEATURE_AMD_WORKLOAD_CLASS (21*32+ 7) /* Workload Classification */
+#define X86_FEATURE_PREFER_YMM (21*32+ 8) /* Avoid ZMM registers due to downclocking */
+#define X86_FEATURE_APX (21*32+ 9) /* Advanced Performance Extensions */
+#define X86_FEATURE_INDIRECT_THUNK_ITS (21*32+10) /* Use thunk for indirect branches in lower half of cacheline */
+#define X86_FEATURE_TSA_SQ_NO (21*32+11) /* AMD CPU not vulnerable to TSA-SQ */
+#define X86_FEATURE_TSA_L1_NO (21*32+12) /* AMD CPU not vulnerable to TSA-L1 */
+#define X86_FEATURE_CLEAR_CPU_BUF_VM (21*32+13) /* Clear CPU buffers using VERW before VMRUN */
+#define X86_FEATURE_IBPB_EXIT_TO_USER (21*32+14) /* Use IBPB on exit-to-userspace, see VMSCAPE bug */
+#define X86_FEATURE_ABMC (21*32+15) /* Assignable Bandwidth Monitoring Counters */
+#define X86_FEATURE_MSR_IMM (21*32+16) /* MSR immediate form instructions */
+#define X86_FEATURE_SGX_EUPDATESVN (21*32+17) /* Support for ENCLS[EUPDATESVN] instruction */
+
+#define X86_FEATURE_SDCIAE (21*32+18) /* L3 Smart Data Cache Injection Allocation Enforcement */
+#define X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO (21*32+19) /*
+ * Clear CPU buffers before VM-Enter if the vCPU
+ * can access host MMIO (ignored for all intents
+ * and purposes if CLEAR_CPU_BUF_VM is set).
+ */
+#define X86_FEATURE_X2AVIC_EXT (21*32+20) /* AMD SVM x2AVIC support for 4k vCPUs */
+
+/*
+ * BUG word(s)
+ */
+#define X86_BUG(x) (NCAPINTS*32 + (x))
+
+#define X86_BUG_F00F X86_BUG(0) /* "f00f" Intel F00F */
+#define X86_BUG_FDIV X86_BUG(1) /* "fdiv" FPU FDIV */
+#define X86_BUG_COMA X86_BUG(2) /* "coma" Cyrix 6x86 coma */
+#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */
+#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */
+#define X86_BUG_11AP X86_BUG(5) /* "11ap" Bad local APIC aka 11AP */
+#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* "fxsave_leak" FXSAVE leaks FOP/FIP/FDP */
+#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* "clflush_monitor" AAI65, CLFLUSH required before MONITOR */
+#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* "sysret_ss_attrs" SYSRET doesn't fix up SS attrs */
+#ifdef CONFIG_X86_32
+/*
+ * 64-bit kernels don't use X86_BUG_ESPFIX. Make the define conditional
+ * to avoid confusion.
+ */
+#define X86_BUG_ESPFIX X86_BUG(9) /* IRET to 16-bit SS corrupts ESP/RSP high bits */
+#endif
+#define X86_BUG_NULL_SEG X86_BUG(10) /* "null_seg" Nulling a selector preserves the base */
+#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* "swapgs_fence" SWAPGS without input dep on GS */
+#define X86_BUG_MONITOR X86_BUG(12) /* "monitor" IPI required to wake up remote CPU */
+#define X86_BUG_AMD_E400 X86_BUG(13) /* "amd_e400" CPU is among the affected by Erratum 400 */
+#define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* "cpu_meltdown" CPU is affected by meltdown attack and needs kernel page table isolation */
+#define X86_BUG_SPECTRE_V1 X86_BUG(15) /* "spectre_v1" CPU is affected by Spectre variant 1 attack with conditional branches */
+#define X86_BUG_SPECTRE_V2 X86_BUG(16) /* "spectre_v2" CPU is affected by Spectre variant 2 attack with indirect branches */
+#define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* "spec_store_bypass" CPU is affected by speculative store bypass attack */
+#define X86_BUG_L1TF X86_BUG(18) /* "l1tf" CPU is affected by L1 Terminal Fault */
+#define X86_BUG_MDS X86_BUG(19) /* "mds" CPU is affected by Microarchitectural data sampling */
+#define X86_BUG_MSBDS_ONLY X86_BUG(20) /* "msbds_only" CPU is only affected by the MSBDS variant of BUG_MDS */
+#define X86_BUG_SWAPGS X86_BUG(21) /* "swapgs" CPU is affected by speculation through SWAPGS */
+#define X86_BUG_TAA X86_BUG(22) /* "taa" CPU is affected by TSX Async Abort(TAA) */
+#define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* "itlb_multihit" CPU may incur MCE during certain page attribute changes */
+#define X86_BUG_SRBDS X86_BUG(24) /* "srbds" CPU may leak RNG bits if not mitigated */
+#define X86_BUG_MMIO_STALE_DATA X86_BUG(25) /* "mmio_stale_data" CPU is affected by Processor MMIO Stale Data vulnerabilities */
+/* unused, was #define X86_BUG_MMIO_UNKNOWN X86_BUG(26) "mmio_unknown" CPU is too old and its MMIO Stale Data status is unknown */
+#define X86_BUG_RETBLEED X86_BUG(27) /* "retbleed" CPU is affected by RETBleed */
+#define X86_BUG_EIBRS_PBRSB X86_BUG(28) /* "eibrs_pbrsb" EIBRS is vulnerable to Post Barrier RSB Predictions */
+#define X86_BUG_SMT_RSB X86_BUG(29) /* "smt_rsb" CPU is vulnerable to Cross-Thread Return Address Predictions */
+#define X86_BUG_GDS X86_BUG(30) /* "gds" CPU is affected by Gather Data Sampling */
+#define X86_BUG_TDX_PW_MCE X86_BUG(31) /* "tdx_pw_mce" CPU may incur #MC if non-TD software does partial write to TDX private memory */
+
+/* BUG word 2 */
+#define X86_BUG_SRSO X86_BUG( 1*32+ 0) /* "srso" AMD SRSO bug */
+#define X86_BUG_DIV0 X86_BUG( 1*32+ 1) /* "div0" AMD DIV0 speculation bug */
+#define X86_BUG_RFDS X86_BUG( 1*32+ 2) /* "rfds" CPU is vulnerable to Register File Data Sampling */
+#define X86_BUG_BHI X86_BUG( 1*32+ 3) /* "bhi" CPU is affected by Branch History Injection */
+#define X86_BUG_IBPB_NO_RET X86_BUG( 1*32+ 4) /* "ibpb_no_ret" IBPB omits return target predictions */
+#define X86_BUG_SPECTRE_V2_USER X86_BUG( 1*32+ 5) /* "spectre_v2_user" CPU is affected by Spectre variant 2 attack between user processes */
+#define X86_BUG_OLD_MICROCODE X86_BUG( 1*32+ 6) /* "old_microcode" CPU has old microcode; it is surely vulnerable to something */
+#define X86_BUG_ITS X86_BUG( 1*32+ 7) /* "its" CPU is affected by Indirect Target Selection */
+#define X86_BUG_ITS_NATIVE_ONLY X86_BUG( 1*32+ 8) /* "its_native_only" CPU is affected by ITS, VMX is not affected */
+#define X86_BUG_TSA X86_BUG( 1*32+ 9) /* "tsa" CPU is affected by Transient Scheduler Attacks */
+#define X86_BUG_VMSCAPE X86_BUG( 1*32+10) /* "vmscape" CPU is affected by VMSCAPE attacks from guests */
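+
+/*
+ * Illustrative usage sketch, not part of this patch: mitigation code
+ * typically tests these bug bits with the *_cpu_has_bug() helpers from
+ * <asm/cpufeature.h>, e.g.:
+ *
+ *	if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
+ *		pr_info("Spectre v2 mitigation required\n");
+ */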
+#endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/cpuid/api.h b/arch/x86/include/asm/cpuid/api.h
new file mode 100644
index 000000000000..44fa82e1267c
--- /dev/null
+++ b/arch/x86/include/asm/cpuid/api.h
@@ -0,0 +1,292 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_CPUID_API_H
+#define _ASM_X86_CPUID_API_H
+
+#include <asm/cpuid/types.h>
+
+#include <linux/build_bug.h>
+#include <linux/types.h>
+
+#include <asm/string.h>
+
+/*
+ * Raw CPUID accessors:
+ */
+
+#ifdef CONFIG_X86_32
+bool cpuid_feature(void);
+#else
+static inline bool cpuid_feature(void)
+{
+ return true;
+}
+#endif
+
+static inline void native_cpuid(u32 *eax, u32 *ebx,
+ u32 *ecx, u32 *edx)
+{
+ /* ecx is often an input as well as an output. */
+ asm volatile("cpuid"
+ : "=a" (*eax),
+ "=b" (*ebx),
+ "=c" (*ecx),
+ "=d" (*edx)
+ : "0" (*eax), "2" (*ecx)
+ : "memory");
+}
+
+#define NATIVE_CPUID_REG(reg) \
+static inline u32 native_cpuid_##reg(u32 op) \
+{ \
+ u32 eax = op, ebx, ecx = 0, edx; \
+ \
+ native_cpuid(&eax, &ebx, &ecx, &edx); \
+ \
+ return reg; \
+}
+
+/*
+ * Native CPUID functions returning a single datum:
+ */
+NATIVE_CPUID_REG(eax)
+NATIVE_CPUID_REG(ebx)
+NATIVE_CPUID_REG(ecx)
+NATIVE_CPUID_REG(edx)
+
+#ifdef CONFIG_PARAVIRT_XXL
+# include <asm/paravirt.h>
+#else
+# define __cpuid native_cpuid
+#endif
+
+/*
+ * Generic CPUID function
+ *
+ * Clear ECX, since some CPUs (Cyrix MII) do not set or clear ECX,
+ * resulting in stale register contents being returned.
+ */
+static inline void cpuid(u32 op,
+ u32 *eax, u32 *ebx,
+ u32 *ecx, u32 *edx)
+{
+ *eax = op;
+ *ecx = 0;
+ __cpuid(eax, ebx, ecx, edx);
+}
+
+/* Some CPUID calls want 'count' to be placed in ECX */
+static inline void cpuid_count(u32 op, int count,
+ u32 *eax, u32 *ebx,
+ u32 *ecx, u32 *edx)
+{
+ *eax = op;
+ *ecx = count;
+ __cpuid(eax, ebx, ecx, edx);
+}
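+
+/*
+ * Illustrative usage sketch, not part of this patch: query the maximum
+ * standard leaf, then a subleaf of the structured feature leaf 0x7:
+ *
+ *	u32 eax, ebx, ecx, edx;
+ *
+ *	cpuid(0x0, &eax, &ebx, &ecx, &edx);
+ *	cpuid_count(0x7, 0, &eax, &ebx, &ecx, &edx);
+ *
+ * After the first call, EAX holds the highest supported standard leaf;
+ * after the second, EBX holds the word-9 feature bits defined in
+ * <asm/cpufeatures.h>.
+ */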
+
+/*
+ * CPUID functions returning a single datum:
+ */
+
+static inline u32 cpuid_eax(u32 op)
+{
+ u32 eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
+
+ return eax;
+}
+
+static inline u32 cpuid_ebx(u32 op)
+{
+ u32 eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
+
+ return ebx;
+}
+
+static inline u32 cpuid_ecx(u32 op)
+{
+ u32 eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
+
+ return ecx;
+}
+
+static inline u32 cpuid_edx(u32 op)
+{
+ u32 eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
+
+ return edx;
+}
+
+static inline void __cpuid_read(u32 leaf, u32 subleaf, u32 *regs)
+{
+ regs[CPUID_EAX] = leaf;
+ regs[CPUID_ECX] = subleaf;
+ __cpuid(regs + CPUID_EAX, regs + CPUID_EBX, regs + CPUID_ECX, regs + CPUID_EDX);
+}
+
+#define cpuid_subleaf(leaf, subleaf, regs) { \
+ static_assert(sizeof(*(regs)) == 16); \
+ __cpuid_read(leaf, subleaf, (u32 *)(regs)); \
+}
+
+#define cpuid_leaf(leaf, regs) { \
+ static_assert(sizeof(*(regs)) == 16); \
+ __cpuid_read(leaf, 0, (u32 *)(regs)); \
+}
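+
+/*
+ * A minimal usage sketch, not part of this patch: the static_assert()s
+ * above require a 16-byte output object, which struct cpuid_regs from
+ * <asm/cpuid/types.h> satisfies:
+ *
+ *	struct cpuid_regs regs;
+ *
+ *	cpuid_leaf(0x1, &regs);
+ *	cpuid_subleaf(0x7, 1, &regs);
+ */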
+
+static inline void __cpuid_read_reg(u32 leaf, u32 subleaf,
+ enum cpuid_regs_idx regidx, u32 *reg)
+{
+ u32 regs[4];
+
+ __cpuid_read(leaf, subleaf, regs);
+ *reg = regs[regidx];
+}
+
+#define cpuid_subleaf_reg(leaf, subleaf, regidx, reg) { \
+ static_assert(sizeof(*(reg)) == 4); \
+ __cpuid_read_reg(leaf, subleaf, regidx, (u32 *)(reg)); \
+}
+
+#define cpuid_leaf_reg(leaf, regidx, reg) { \
+ static_assert(sizeof(*(reg)) == 4); \
+ __cpuid_read_reg(leaf, 0, regidx, (u32 *)(reg)); \
+}
+
+/*
+ * Hypervisor-related APIs:
+ */
+
+/* Does the output of this CPUID leaf depend on the ECX subleaf index? */
+static __always_inline bool cpuid_function_is_indexed(u32 function)
+{
+ switch (function) {
+ case 4:
+ case 7:
+ case 0xb:
+ case 0xd:
+ case 0xf:
+ case 0x10:
+ case 0x12:
+ case 0x14:
+ case 0x17:
+ case 0x18:
+ case 0x1d:
+ case 0x1e:
+ case 0x1f:
+ case 0x24:
+ case 0x8000001d:
+ return true;
+ }
+
+ return false;
+}
+
+#define for_each_possible_cpuid_base_hypervisor(function) \
+ for (function = 0x40000000; function < 0x40010000; function += 0x100)
+
+static inline u32 cpuid_base_hypervisor(const char *sig, u32 leaves)
+{
+ u32 base, eax, signature[3];
+
+ for_each_possible_cpuid_base_hypervisor(base) {
+ cpuid(base, &eax, &signature[0], &signature[1], &signature[2]);
+
+ /*
+ * This must not compile to "call memcmp" because it's called
+ * from PVH early boot code before instrumentation is set up
+ * and memcmp() itself may be instrumented.
+ */
+ if (!__builtin_memcmp(sig, signature, 12) &&
+ (leaves == 0 || ((eax - base) >= leaves)))
+ return base;
+ }
+
+ return 0;
+}
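+
+/*
+ * Usage sketch, not part of this patch: a guest can probe for a specific
+ * hypervisor by its 12-byte signature, e.g. KVM's "KVMKVMKVM\0\0\0":
+ *
+ *	u32 base = cpuid_base_hypervisor("KVMKVMKVM\0\0\0", 0);
+ *
+ * A non-zero return value is the base leaf of the matching 0x100-aligned
+ * hypervisor CPUID range.
+ */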
+
+/*
+ * CPUID(0x2) parsing:
+ */
+
+/**
+ * cpuid_leaf_0x2() - Return sanitized CPUID(0x2) register output
+ * @regs: Output parameter
+ *
+ * Query CPUID(0x2) and store its output in @regs. Force set any
+ * invalid 1-byte descriptor returned by the hardware to zero (the NULL
+ * cache/TLB descriptor) before returning it to the caller.
+ *
+ * Use for_each_cpuid_0x2_desc() to iterate over the register output in
+ * parsed form.
+ */
+static inline void cpuid_leaf_0x2(union leaf_0x2_regs *regs)
+{
+ cpuid_leaf(0x2, regs);
+
+ /*
+ * All Intel CPUs must report an iteration count of 1. In case
+ * of bogus hardware, treat all returned descriptors as NULL.
+ */
+ if (regs->desc[0] != 0x01) {
+ for (int i = 0; i < 4; i++)
+ regs->regv[i] = 0;
+ return;
+ }
+
+ /*
+ * The most significant bit (MSB) of each register must be clear:
+ * if a register's MSB is set, the register is invalid and all of
+ * its descriptors are replaced with the NULL descriptor.
+ */
+ for (int i = 0; i < 4; i++) {
+ if (regs->reg[i].invalid)
+ regs->regv[i] = 0;
+ }
+}
+
+/**
+ * for_each_cpuid_0x2_desc() - Iterator for parsed CPUID(0x2) descriptors
+ * @_regs: CPUID(0x2) register output, as returned by cpuid_leaf_0x2()
+ * @_ptr: u8 pointer, for macro internal use only
+ * @_desc: Pointer to the parsed CPUID(0x2) descriptor at each iteration
+ *
+ * Loop over the 1-byte descriptors in the passed CPUID(0x2) output registers
+ * @_regs. Provide the parsed information for each descriptor through @_desc.
+ *
+ * To handle cache-specific descriptors, switch on @_desc->c_type. For TLB
+ * descriptors, switch on @_desc->t_type.
+ *
+ * Example usage for cache descriptors::
+ *
+ * const struct leaf_0x2_table *desc;
+ * union leaf_0x2_regs regs;
+ * u8 *ptr;
+ *
+ * cpuid_leaf_0x2(&regs);
+ * for_each_cpuid_0x2_desc(regs, ptr, desc) {
+ * switch (desc->c_type) {
+ * ...
+ * }
+ * }
+ */
+#define for_each_cpuid_0x2_desc(_regs, _ptr, _desc) \
+ for (_ptr = &(_regs).desc[1]; \
+ _ptr < &(_regs).desc[16] && (_desc = &cpuid_0x2_table[*_ptr]); \
+ _ptr++)
+
+/*
+ * CPUID(0x80000006) parsing:
+ */
+
+/* CPUID(0x80000006) EDX describes the L3 cache; it is all-zero when no L3 is present. */
+static inline bool cpuid_amd_hygon_has_l3_cache(void)
+{
+	return cpuid_edx(0x80000006);
+}
+
+#endif /* _ASM_X86_CPUID_API_H */
diff --git a/arch/x86/include/asm/cpuid/types.h b/arch/x86/include/asm/cpuid/types.h
new file mode 100644
index 000000000000..8a00364b79de
--- /dev/null
+++ b/arch/x86/include/asm/cpuid/types.h
@@ -0,0 +1,127 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_CPUID_TYPES_H
+#define _ASM_X86_CPUID_TYPES_H
+
+#include <linux/build_bug.h>
+#include <linux/types.h>
+
+/*
+ * Types for raw CPUID access:
+ */
+
+struct cpuid_regs {
+ u32 eax;
+ u32 ebx;
+ u32 ecx;
+ u32 edx;
+};
+
+enum cpuid_regs_idx {
+ CPUID_EAX = 0,
+ CPUID_EBX,
+ CPUID_ECX,
+ CPUID_EDX,
+};
+
+#define CPUID_LEAF_MWAIT 0x05
+#define CPUID_LEAF_DCA 0x09
+#define CPUID_LEAF_XSTATE 0x0d
+#define CPUID_LEAF_TSC 0x15
+#define CPUID_LEAF_FREQ 0x16
+#define CPUID_LEAF_TILE 0x1d
+
+/*
+ * Types for CPUID(0x2) parsing:
+ */
+
+struct leaf_0x2_reg {
+ u32 : 31,
+ invalid : 1;
+};
+
+union leaf_0x2_regs {
+ struct leaf_0x2_reg reg[4];
+ u32 regv[4];
+ u8 desc[16];
+};
+
+/*
+ * Leaf 0x2 1-byte descriptors' cache types
+ * To be used for their mappings at cpuid_0x2_table[]
+ *
+ * Start at 1 since type 0 is reserved for HW byte descriptors which are
+ * not recognized by the kernel; i.e., those without an explicit mapping.
+ */
+enum _cache_table_type {
+ CACHE_L1_INST = 1,
+ CACHE_L1_DATA,
+ CACHE_L2,
+ CACHE_L3
+ /* Adjust __TLB_TABLE_TYPE_BEGIN before adding more types */
+} __packed;
+#ifndef __CHECKER__
+static_assert(sizeof(enum _cache_table_type) == 1);
+#endif
+
+/*
+ * Ensure that leaf 0x2 cache and TLB type values do not intersect,
+ * since they share the same type field in struct leaf_0x2_table.
+ */
+#define __TLB_TABLE_TYPE_BEGIN (CACHE_L3 + 1)
+
+/*
+ * Leaf 0x2 1-byte descriptors' TLB types
+ * To be used for their mappings at cpuid_0x2_table[]
+ */
+enum _tlb_table_type {
+ TLB_INST_4K = __TLB_TABLE_TYPE_BEGIN,
+ TLB_INST_4M,
+ TLB_INST_2M_4M,
+ TLB_INST_ALL,
+
+ TLB_DATA_4K,
+ TLB_DATA_4M,
+ TLB_DATA_2M_4M,
+ TLB_DATA_4K_4M,
+ TLB_DATA_1G,
+ TLB_DATA_1G_2M_4M,
+
+ TLB_DATA0_4K,
+ TLB_DATA0_4M,
+ TLB_DATA0_2M_4M,
+
+ STLB_4K,
+ STLB_4K_2M,
+} __packed;
+#ifndef __CHECKER__
+static_assert(sizeof(enum _tlb_table_type) == 1);
+#endif
+
+/*
+ * Combined parsing table for leaf 0x2 cache and TLB descriptors.
+ */
+
+struct leaf_0x2_table {
+ union {
+ enum _cache_table_type c_type;
+ enum _tlb_table_type t_type;
+ };
+ union {
+ short c_size;
+ short entries;
+ };
+};
+
+extern const struct leaf_0x2_table cpuid_0x2_table[256];
+
+/*
+ * Each of leaf 0x2's one-byte TLB descriptors implies the same number of
+ * entries for all of the TLB types it maps to. TLB descriptor 0x63 is an
+ * exception: it implies 4 dTLB entries for 1GB pages and 32 dTLB entries
+ * for 2MB or 4MB pages.
+ *
+ * Encode that descriptor's dTLB entry count for 2MB/4MB pages here, as the
+ * entry count for dTLB 1GB pages is already encoded in its cpuid_0x2_table[]
+ * mapping.
+ */
+#define TLB_0x63_2M_4M_ENTRIES 32
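+
+/*
+ * A hedged sketch of how a parser might consume this, not part of this
+ * patch; tlb_lld_1g and tlb_lld_2m_4m are hypothetical accumulators:
+ *
+ *	if (*ptr == 0x63) {
+ *		tlb_lld_1g = desc->entries;
+ *		tlb_lld_2m_4m = TLB_0x63_2M_4M_ENTRIES;
+ *	}
+ */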
+
+#endif /* _ASM_X86_CPUID_TYPES_H */
diff --git a/arch/x86/include/asm/cpuidle_haltpoll.h b/arch/x86/include/asm/cpuidle_haltpoll.h
new file mode 100644
index 000000000000..c8b39c6716ff
--- /dev/null
+++ b/arch/x86/include/asm/cpuidle_haltpoll.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ARCH_HALTPOLL_H
+#define _ARCH_HALTPOLL_H
+
+void arch_haltpoll_enable(unsigned int cpu);
+void arch_haltpoll_disable(unsigned int cpu);
+
+#endif
diff --git a/arch/x86/include/asm/cpumask.h b/arch/x86/include/asm/cpumask.h
index 61c852fa346b..9df9e9cde670 100644
--- a/arch/x86/include/asm/cpumask.h
+++ b/arch/x86/include/asm/cpumask.h
@@ -1,14 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_CPUMASK_H
#define _ASM_X86_CPUMASK_H
-#ifndef __ASSEMBLY__
-#include <linux/cpumask.h>
+#ifndef __ASSEMBLER__
-extern cpumask_var_t cpu_callin_mask;
-extern cpumask_var_t cpu_callout_mask;
-extern cpumask_var_t cpu_initialized_mask;
-extern cpumask_var_t cpu_sibling_setup_mask;
+#include <linux/compiler.h>
+#include <linux/cpumask.h>
extern void setup_cpu_local_masks(void);
-#endif /* __ASSEMBLY__ */
+/*
+ * NMI and MCE exceptions need cpu_is_offline() _really_ early; provide
+ * arch_ variants for them that avoid instrumentation.
+ */
+#if NR_CPUS > 1
+static __always_inline bool arch_cpu_online(int cpu)
+{
+ return arch_test_bit(cpu, cpumask_bits(cpu_online_mask));
+}
+
+static __always_inline void arch_cpumask_clear_cpu(int cpu, struct cpumask *dstp)
+{
+ arch_clear_bit(cpumask_check(cpu), cpumask_bits(dstp));
+}
+#else
+static __always_inline bool arch_cpu_online(int cpu)
+{
+ return cpu == 0;
+}
+
+static __always_inline void arch_cpumask_clear_cpu(int cpu, struct cpumask *dstp)
+{
+ return;
+}
+#endif
+
+#define arch_cpu_is_offline(cpu) unlikely(!arch_cpu_online(cpu))
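+
+/*
+ * Illustrative only, not part of this patch: an early NMI path can bail
+ * out on an offline CPU without tripping instrumentation, e.g.:
+ *
+ *	if (arch_cpu_is_offline(smp_processor_id()))
+ *		return;
+ */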
+
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_X86_CPUMASK_H */
diff --git a/arch/x86/include/asm/cputime.h b/arch/x86/include/asm/cputime.h
deleted file mode 100644
index 6d68ad7e0ea3..000000000000
--- a/arch/x86/include/asm/cputime.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/cputime.h>
diff --git a/arch/x86/include/asm/crash.h b/arch/x86/include/asm/crash.h
new file mode 100644
index 000000000000..8b6bd63530dc
--- /dev/null
+++ b/arch/x86/include/asm/crash.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_CRASH_H
+#define _ASM_X86_CRASH_H
+
+struct kimage;
+
+int crash_load_segments(struct kimage *image);
+int crash_setup_memmap_entries(struct kimage *image,
+ struct boot_params *params);
+void crash_smp_send_stop(void);
+
+#endif /* _ASM_X86_CRASH_H */
diff --git a/arch/x86/include/asm/crash_reserve.h b/arch/x86/include/asm/crash_reserve.h
new file mode 100644
index 000000000000..7835b2cdff04
--- /dev/null
+++ b/arch/x86/include/asm/crash_reserve.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _X86_CRASH_RESERVE_H
+#define _X86_CRASH_RESERVE_H
+
+/* 16M alignment for crash kernel regions */
+#define CRASH_ALIGN SZ_16M
+
+/*
+ * Keep the crash kernel below this limit.
+ *
+ * Earlier 32-bit kernels limited the crash kernel to the low 512 MB range
+ * due to mapping restrictions.
+ *
+ * 64-bit kdump kernels need to stay below 64 TB, the upper limit of system
+ * RAM in 4-level paging mode. The kdump jump could be from 5-level to
+ * 4-level paging, in which case it will fail if the kernel is placed above
+ * 64 TB; during the first kernel's bootup there is no good way to detect
+ * the paging mode of the target kernel that will be loaded for dumping.
+ */
+extern unsigned long swiotlb_size_or_default(void);
+
+#ifdef CONFIG_X86_32
+# define CRASH_ADDR_LOW_MAX SZ_512M
+# define CRASH_ADDR_HIGH_MAX SZ_512M
+#else
+# define CRASH_ADDR_LOW_MAX SZ_4G
+# define CRASH_ADDR_HIGH_MAX SZ_64T
+#endif
+
+# define DEFAULT_CRASH_KERNEL_LOW_SIZE crash_low_size_default()
+
+static inline unsigned long crash_low_size_default(void)
+{
+#ifdef CONFIG_X86_64
+ return max(swiotlb_size_or_default() + (8UL << 20), 256UL << 20);
+#else
+ return 0;
+#endif
+}
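+
+/*
+ * Worked example, assuming the default 64MB SWIOTLB: 64MB + 8MB = 72MB,
+ * so the max() above picks the 256MB floor as the default low reservation.
+ */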
+
+#define HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY
+
+#endif /* _X86_CRASH_RESERVE_H */
diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h
index c68c361697e1..cc4a3f725b37 100644
--- a/arch/x86/include/asm/current.h
+++ b/arch/x86/include/asm/current.h
@@ -1,21 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_CURRENT_H
#define _ASM_X86_CURRENT_H
+#include <linux/build_bug.h>
#include <linux/compiler.h>
+
+#ifndef __ASSEMBLER__
+
+#include <linux/cache.h>
#include <asm/percpu.h>
-#ifndef __ASSEMBLY__
struct task_struct;
-DECLARE_PER_CPU(struct task_struct *, current_task);
+DECLARE_PER_CPU_CACHE_HOT(struct task_struct *, current_task);
+/* const-qualified alias provided by the linker. */
+DECLARE_PER_CPU_CACHE_HOT(struct task_struct * const __percpu_seg_override,
+ const_current_task);
static __always_inline struct task_struct *get_current(void)
{
- return percpu_read(current_task);
+ if (IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT))
+ return this_cpu_read_const(const_current_task);
+
+ return this_cpu_read_stable(current_task);
}
#define current get_current()
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_X86_CURRENT_H */
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index 3ea6f37be9e2..a2c1f2d24b64 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -1,70 +1,197 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_DEBUGREG_H
#define _ASM_X86_DEBUGREG_H
+#include <linux/bug.h>
+#include <linux/percpu.h>
+#include <uapi/asm/debugreg.h>
+
+#include <asm/cpufeature.h>
+#include <asm/msr.h>
+
+/*
+ * Define the bits that are always set to 1 in DR7; only bit 10 is
+ * architecturally reserved to '1'.
+ *
+ * This is also the init/reset value for DR7.
+ */
+#define DR7_FIXED_1 0x00000400
+
+DECLARE_PER_CPU(unsigned long, cpu_dr7);
+
+#ifndef CONFIG_PARAVIRT_XXL
+/*
+ * These special macros can be used to get or set a debugging register
+ */
+#define get_debugreg(var, register) \
+ (var) = native_get_debugreg(register)
+#define set_debugreg(value, register) \
+ native_set_debugreg(register, value)
+#endif
-/* Indicate the register numbers for a number of the specific
- debug registers. Registers 0-3 contain the addresses we wish to trap on */
-#define DR_FIRSTADDR 0 /* u_debugreg[DR_FIRSTADDR] */
-#define DR_LASTADDR 3 /* u_debugreg[DR_LASTADDR] */
-
-#define DR_STATUS 6 /* u_debugreg[DR_STATUS] */
-#define DR_CONTROL 7 /* u_debugreg[DR_CONTROL] */
-
-/* Define a few things for the status register. We can use this to determine
- which debugging register was responsible for the trap. The other bits
- are either reserved or not of interest to us. */
-
-#define DR_TRAP0 (0x1) /* db0 */
-#define DR_TRAP1 (0x2) /* db1 */
-#define DR_TRAP2 (0x4) /* db2 */
-#define DR_TRAP3 (0x8) /* db3 */
-
-#define DR_STEP (0x4000) /* single-step */
-#define DR_SWITCH (0x8000) /* task switch */
-
-/* Now define a bunch of things for manipulating the control register.
- The top two bytes of the control register consist of 4 fields of 4
- bits - each field corresponds to one of the four debug registers,
- and indicates what types of access we trap on, and how large the data
- field is that we are looking at */
-
-#define DR_CONTROL_SHIFT 16 /* Skip this many bits in ctl register */
-#define DR_CONTROL_SIZE 4 /* 4 control bits per register */
-
-#define DR_RW_EXECUTE (0x0) /* Settings for the access types to trap on */
-#define DR_RW_WRITE (0x1)
-#define DR_RW_READ (0x3)
-
-#define DR_LEN_1 (0x0) /* Settings for data length to trap on */
-#define DR_LEN_2 (0x4)
-#define DR_LEN_4 (0xC)
-#define DR_LEN_8 (0x8)
-
-/* The low byte to the control register determine which registers are
- enabled. There are 4 fields of two bits. One bit is "local", meaning
- that the processor will reset the bit after a task switch and the other
- is global meaning that we have to explicitly reset the bit. With linux,
- you can use either one, since we explicitly zero the register when we enter
- kernel mode. */
+static __always_inline unsigned long native_get_debugreg(int regno)
+{
+ unsigned long val;
+
+ switch (regno) {
+ case 0:
+ asm("mov %%db0, %0" :"=r" (val));
+ break;
+ case 1:
+ asm("mov %%db1, %0" :"=r" (val));
+ break;
+ case 2:
+ asm("mov %%db2, %0" :"=r" (val));
+ break;
+ case 3:
+ asm("mov %%db3, %0" :"=r" (val));
+ break;
+ case 6:
+ asm("mov %%db6, %0" :"=r" (val));
+ break;
+ case 7:
+ /*
+ * Use "asm volatile" for DR7 reads to forbid re-ordering them
+ * with other code.
+ *
+ * This is needed because a DR7 access can cause a #VC exception
+ * when running under SEV-ES. Taking a #VC exception is not a
+ * safe thing to do just anywhere in the entry code and
+ * re-ordering might place the access into an unsafe location.
+ *
+ * This happened in the NMI handler, where the DR7 read was
+ * re-ordered to happen before the call to sev_es_ist_enter(),
+ * causing stack recursion.
+ */
+ asm volatile("mov %%db7, %0" : "=r" (val));
+ break;
+ default:
+ BUG();
+ }
+ return val;
+}
+
+static __always_inline void native_set_debugreg(int regno, unsigned long value)
+{
+ switch (regno) {
+ case 0:
+ asm("mov %0, %%db0" ::"r" (value));
+ break;
+ case 1:
+ asm("mov %0, %%db1" ::"r" (value));
+ break;
+ case 2:
+ asm("mov %0, %%db2" ::"r" (value));
+ break;
+ case 3:
+ asm("mov %0, %%db3" ::"r" (value));
+ break;
+ case 6:
+ asm("mov %0, %%db6" ::"r" (value));
+ break;
+ case 7:
+ /*
+ * Use "asm volatile" for DR7 writes to forbid re-ordering them
+ * with other code.
+ *
+ * While it didn't happen with a DR7 write (see the DR7 read
+ * comment above, which explains where it happened), add the
+ * "asm volatile" here too to avoid similar problems in the
+ * future.
+ */
+ asm volatile("mov %0, %%db7" ::"r" (value));
+ break;
+ default:
+ BUG();
+ }
+}
+
+static inline void hw_breakpoint_disable(void)
+{
+ /* Reset the control register for HW Breakpoint */
+ set_debugreg(DR7_FIXED_1, 7);
+
+ /* Zero-out the individual HW breakpoint address registers */
+ set_debugreg(0UL, 0);
+ set_debugreg(0UL, 1);
+ set_debugreg(0UL, 2);
+ set_debugreg(0UL, 3);
+}
+
+static __always_inline bool hw_breakpoint_active(void)
+{
+ return __this_cpu_read(cpu_dr7) & DR_GLOBAL_ENABLE_MASK;
+}
+
+extern void hw_breakpoint_restore(void);
+
+static __always_inline unsigned long local_db_save(void)
+{
+ unsigned long dr7;
+
+ if (static_cpu_has(X86_FEATURE_HYPERVISOR) && !hw_breakpoint_active())
+ return 0;
+
+ get_debugreg(dr7, 7);
+
+ /* Architecturally set bit */
+ dr7 &= ~DR7_FIXED_1;
+ if (dr7)
+ set_debugreg(DR7_FIXED_1, 7);
+
+ /*
+ * Ensure the compiler doesn't lower the above statements into
+ * the critical section; disabling breakpoints late would not
+ * be good.
+ */
+ barrier();
+
+ return dr7;
+}
+
+static __always_inline void local_db_restore(unsigned long dr7)
+{
+ /*
+ * Ensure the compiler doesn't raise this statement into
+ * the critical section; enabling breakpoints early would
+ * not be good.
+ */
+ barrier();
+ if (dr7)
+ set_debugreg(dr7, 7);
+}
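+
+/*
+ * A minimal usage sketch, not part of this patch, for a breakpoint-free
+ * critical section of the kind the entry code needs:
+ *
+ *	unsigned long dr7 = local_db_save();
+ *
+ *	... code that must not trigger hardware breakpoints ...
+ *
+ *	local_db_restore(dr7);
+ */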
+
+#ifdef CONFIG_CPU_SUP_AMD
+extern void amd_set_dr_addr_mask(unsigned long mask, unsigned int dr);
+extern unsigned long amd_get_dr_addr_mask(unsigned int dr);
+#else
+static inline void amd_set_dr_addr_mask(unsigned long mask, unsigned int dr) { }
+static inline unsigned long amd_get_dr_addr_mask(unsigned int dr)
+{
+ return 0;
+}
+#endif
-#define DR_LOCAL_ENABLE_SHIFT 0 /* Extra shift to the local enable bit */
-#define DR_GLOBAL_ENABLE_SHIFT 1 /* Extra shift to the global enable bit */
-#define DR_ENABLE_SIZE 2 /* 2 enable bits per register */
+static inline unsigned long get_debugctlmsr(void)
+{
+ unsigned long debugctlmsr = 0;
-#define DR_LOCAL_ENABLE_MASK (0x55) /* Set local bits for all 4 regs */
-#define DR_GLOBAL_ENABLE_MASK (0xAA) /* Set global bits for all 4 regs */
+#ifndef CONFIG_X86_DEBUGCTLMSR
+ if (boot_cpu_data.x86 < 6)
+ return 0;
+#endif
+ rdmsrq(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
-/* The second byte to the control register has a few special things.
- We can slow the instruction pipeline for instructions coming via the
- gdt or the ldt if we want to. I am not sure why this is an advantage */
+ return debugctlmsr;
+}
-#ifdef __i386__
-#define DR_CONTROL_RESERVED (0xFC00) /* Reserved by Intel */
-#else
-#define DR_CONTROL_RESERVED (0xFFFFFFFF0000FC00UL) /* Reserved */
+static inline void update_debugctlmsr(unsigned long debugctlmsr)
+{
+#ifndef CONFIG_X86_DEBUGCTLMSR
+ if (boot_cpu_data.x86 < 6)
+ return;
#endif
-
-#define DR_LOCAL_SLOWDOWN (0x100) /* Local slow the pipeline */
-#define DR_GLOBAL_SLOWDOWN (0x200) /* Global slow the pipeline */
+ wrmsrq(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
+}
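+
+/*
+ * Usage sketch, not part of this patch: setting the BTF (single-step on
+ * branches) bit is a read-modify-write of the MSR:
+ *
+ *	update_debugctlmsr(get_debugctlmsr() | DEBUGCTLMSR_BTF);
+ */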
#endif /* _ASM_X86_DEBUGREG_H */
diff --git a/arch/x86/include/asm/delay.h b/arch/x86/include/asm/delay.h
index 409a649204aa..630891d25819 100644
--- a/arch/x86/include/asm/delay.h
+++ b/arch/x86/include/asm/delay.h
@@ -1,31 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_DELAY_H
#define _ASM_X86_DELAY_H
-/*
- * Copyright (C) 1993 Linus Torvalds
- *
- * Delay routines calling functions in arch/x86/lib/delay.c
- */
+#include <asm-generic/delay.h>
+#include <linux/init.h>
-/* Undefined functions to get compile-time errors */
-extern void __bad_udelay(void);
-extern void __bad_ndelay(void);
-
-extern void __udelay(unsigned long usecs);
-extern void __ndelay(unsigned long nsecs);
-extern void __const_udelay(unsigned long xloops);
-extern void __delay(unsigned long loops);
-
-/* 0x10c7 is 2**32 / 1000000 (rounded up) */
-#define udelay(n) (__builtin_constant_p(n) ? \
- ((n) > 20000 ? __bad_udelay() : __const_udelay((n) * 0x10c7ul)) : \
- __udelay(n))
-
-/* 0x5 is 2**32 / 1000000000 (rounded up) */
-#define ndelay(n) (__builtin_constant_p(n) ? \
- ((n) > 20000 ? __bad_ndelay() : __const_udelay((n) * 5ul)) : \
- __ndelay(n))
-
-void use_tsc_delay(void);
+void __init use_tsc_delay(void);
+void __init use_tpause_delay(void);
+void use_mwaitx_delay(void);
#endif /* _ASM_X86_DELAY_H */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index c993e9e0fed4..ec95fe44fa3a 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -1,105 +1,127 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_DESC_H
#define _ASM_X86_DESC_H
#include <asm/desc_defs.h>
#include <asm/ldt.h>
#include <asm/mmu.h>
+#include <asm/fixmap.h>
+#include <asm/irq_vectors.h>
+#include <asm/cpu_entry_area.h>
+
+#include <linux/debug_locks.h>
#include <linux/smp.h>
+#include <linux/percpu.h>
+
+static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *info)
+{
+ desc->limit0 = info->limit & 0x0ffff;
+
+ desc->base0 = (info->base_addr & 0x0000ffff);
+ desc->base1 = (info->base_addr & 0x00ff0000) >> 16;
+
+ desc->type = (info->read_exec_only ^ 1) << 1;
+ desc->type |= info->contents << 2;
+ /* Set the ACCESS bit so it can be mapped RO */
+ desc->type |= 1;
-static inline void fill_ldt(struct desc_struct *desc,
- const struct user_desc *info)
-{
- desc->limit0 = info->limit & 0x0ffff;
- desc->base0 = info->base_addr & 0x0000ffff;
-
- desc->base1 = (info->base_addr & 0x00ff0000) >> 16;
- desc->type = (info->read_exec_only ^ 1) << 1;
- desc->type |= info->contents << 2;
- desc->s = 1;
- desc->dpl = 0x3;
- desc->p = info->seg_not_present ^ 1;
- desc->limit = (info->limit & 0xf0000) >> 16;
- desc->avl = info->useable;
- desc->d = info->seg_32bit;
- desc->g = info->limit_in_pages;
- desc->base2 = (info->base_addr & 0xff000000) >> 24;
+ desc->s = 1;
+ desc->dpl = 0x3;
+ desc->p = info->seg_not_present ^ 1;
+ desc->limit1 = (info->limit & 0xf0000) >> 16;
+ desc->avl = info->useable;
+ desc->d = info->seg_32bit;
+ desc->g = info->limit_in_pages;
+
+ desc->base2 = (info->base_addr & 0xff000000) >> 24;
/*
- * Don't allow setting of the lm bit. It is useless anyway
- * because 64bit system calls require __USER_CS:
+ * Don't allow setting of the lm bit. It would confuse
+ * user_64bit_mode and would get overridden by sysret anyway.
*/
- desc->l = 0;
+ desc->l = 0;
}
-extern struct desc_ptr idt_descr;
-extern gate_desc idt_table[];
-
struct gdt_page {
struct desc_struct gdt[GDT_ENTRIES];
} __attribute__((aligned(PAGE_SIZE)));
+
DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
-static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
+/* Provide the original GDT */
+static inline struct desc_struct *get_cpu_gdt_rw(unsigned int cpu)
{
return per_cpu(gdt_page, cpu).gdt;
}
-#ifdef CONFIG_X86_64
+/* Provide the current original GDT */
+static inline struct desc_struct *get_current_gdt_rw(void)
+{
+ return this_cpu_ptr(&gdt_page)->gdt;
+}
-static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
- unsigned dpl, unsigned ist, unsigned seg)
+/* Provide the fixmap address of the remapped GDT */
+static inline struct desc_struct *get_cpu_gdt_ro(int cpu)
{
- gate->offset_low = PTR_LOW(func);
- gate->segment = __KERNEL_CS;
- gate->ist = ist;
- gate->p = 1;
- gate->dpl = dpl;
- gate->zero0 = 0;
- gate->zero1 = 0;
- gate->type = type;
- gate->offset_middle = PTR_MIDDLE(func);
- gate->offset_high = PTR_HIGH(func);
+ return (struct desc_struct *)&get_cpu_entry_area(cpu)->gdt;
}
-#else
-static inline void pack_gate(gate_desc *gate, unsigned char type,
- unsigned long base, unsigned dpl, unsigned flags,
- unsigned short seg)
+/* Provide the current read-only GDT */
+static inline struct desc_struct *get_current_gdt_ro(void)
+{
+ return get_cpu_gdt_ro(smp_processor_id());
+}
+
+/* Provide the physical address of the GDT page. */
+static inline phys_addr_t get_cpu_gdt_paddr(unsigned int cpu)
{
- gate->a = (seg << 16) | (base & 0xffff);
- gate->b = (base & 0xffff0000) |
- (((0x80 | type | (dpl << 5)) & 0xff) << 8);
+ return per_cpu_ptr_to_phys(get_cpu_gdt_rw(cpu));
}
+static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
+ unsigned dpl, unsigned ist, unsigned seg)
+{
+ gate->offset_low = (u16) func;
+ gate->bits.p = 1;
+ gate->bits.dpl = dpl;
+ gate->bits.zero = 0;
+ gate->bits.type = type;
+ gate->offset_middle = (u16) (func >> 16);
+#ifdef CONFIG_X86_64
+ gate->segment = __KERNEL_CS;
+ gate->bits.ist = ist;
+ gate->reserved = 0;
+ gate->offset_high = (u32) (func >> 32);
+#else
+ gate->segment = seg;
+ gate->bits.ist = 0;
#endif
+}
static inline int desc_empty(const void *ptr)
{
const u32 *desc = ptr;
+
return !(desc[0] | desc[1]);
}
-#ifdef CONFIG_PARAVIRT
+#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else
-#define load_TR_desc() native_load_tr_desc()
-#define load_gdt(dtr) native_load_gdt(dtr)
-#define load_idt(dtr) native_load_idt(dtr)
-#define load_tr(tr) asm volatile("ltr %0"::"m" (tr))
-#define load_ldt(ldt) asm volatile("lldt %0"::"m" (ldt))
-
-#define store_gdt(dtr) native_store_gdt(dtr)
-#define store_idt(dtr) native_store_idt(dtr)
-#define store_tr(tr) (tr = native_store_tr())
-
-#define load_TLS(t, cpu) native_load_tls(t, cpu)
-#define set_ldt native_set_ldt
-
-#define write_ldt_entry(dt, entry, desc) \
- native_write_ldt_entry(dt, entry, desc)
-#define write_gdt_entry(dt, entry, desc, type) \
- native_write_gdt_entry(dt, entry, desc, type)
-#define write_idt_entry(dt, entry, g) \
- native_write_idt_entry(dt, entry, g)
+#define load_TR_desc() native_load_tr_desc()
+#define load_gdt(dtr) native_load_gdt(dtr)
+#define load_idt(dtr) native_load_idt(dtr)
+#define load_tr(tr) asm volatile("ltr %0"::"m" (tr))
+#define load_ldt(ldt) asm volatile("lldt %0"::"m" (ldt))
+
+#define store_gdt(dtr) native_store_gdt(dtr)
+#define store_tr(tr) (tr = native_store_tr())
+
+#define load_TLS(t, cpu) native_load_tls(t, cpu)
+#define set_ldt native_set_ldt
+
+#define write_ldt_entry(dt, entry, desc) native_write_ldt_entry(dt, entry, desc)
+#define write_gdt_entry(dt, entry, desc, type) native_write_gdt_entry(dt, entry, desc, type)
+#define write_idt_entry(dt, entry, g) native_write_idt_entry(dt, entry, g)
static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
{
@@ -108,86 +130,60 @@ static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries)
{
}
-#endif /* CONFIG_PARAVIRT */
+#endif /* CONFIG_PARAVIRT_XXL */
#define store_ldt(ldt) asm("sldt %0" : "=m"(ldt))
-static inline void native_write_idt_entry(gate_desc *idt, int entry,
- const gate_desc *gate)
+static inline void native_write_idt_entry(gate_desc *idt, int entry, const gate_desc *gate)
{
memcpy(&idt[entry], gate, sizeof(*gate));
}
-static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry,
- const void *desc)
+static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry, const void *desc)
{
memcpy(&ldt[entry], desc, 8);
}
-static inline void native_write_gdt_entry(struct desc_struct *gdt, int entry,
- const void *desc, int type)
+static inline void
+native_write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc, int type)
{
unsigned int size;
+
switch (type) {
- case DESC_TSS:
- size = sizeof(tss_desc);
- break;
- case DESC_LDT:
- size = sizeof(ldt_desc);
- break;
- default:
- size = sizeof(struct desc_struct);
- break;
+ case DESC_TSS: size = sizeof(tss_desc); break;
+ case DESC_LDT: size = sizeof(ldt_desc); break;
+ default: size = sizeof(*gdt); break;
}
- memcpy(&gdt[entry], desc, size);
-}
-static inline void pack_descriptor(struct desc_struct *desc, unsigned long base,
- unsigned long limit, unsigned char type,
- unsigned char flags)
-{
- desc->a = ((base & 0xffff) << 16) | (limit & 0xffff);
- desc->b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
- (limit & 0x000f0000) | ((type & 0xff) << 8) |
- ((flags & 0xf) << 20);
- desc->p = 1;
+ memcpy(&gdt[entry], desc, size);
}
-
static inline void set_tssldt_descriptor(void *d, unsigned long addr,
unsigned type, unsigned size)
{
-#ifdef CONFIG_X86_64
- struct ldttss_desc64 *desc = d;
+ struct ldttss_desc *desc = d;
+
memset(desc, 0, sizeof(*desc));
- desc->limit0 = size & 0xFFFF;
- desc->base0 = PTR_LOW(addr);
- desc->base1 = PTR_MIDDLE(addr) & 0xFF;
- desc->type = type;
- desc->p = 1;
- desc->limit1 = (size >> 16) & 0xF;
- desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF;
- desc->base3 = PTR_HIGH(addr);
-#else
- pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0);
+
+ desc->limit0 = (u16) size;
+ desc->base0 = (u16) addr;
+ desc->base1 = (addr >> 16) & 0xFF;
+ desc->type = type;
+ desc->p = 1;
+ desc->limit1 = (size >> 16) & 0xF;
+ desc->base2 = (addr >> 24) & 0xFF;
+#ifdef CONFIG_X86_64
+ desc->base3 = (u32) (addr >> 32);
#endif
}
-static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr)
+static inline void __set_tss_desc(unsigned cpu, unsigned int entry, struct x86_hw_tss *addr)
{
- struct desc_struct *d = get_cpu_gdt_table(cpu);
+ struct desc_struct *d = get_cpu_gdt_rw(cpu);
tss_desc tss;
- /*
- * sizeof(unsigned long) coming from an extra "long" at the end
- * of the iobitmap. See tss_struct definition in processor.h
- *
- * -1? seg base+limit should be pointing to the address of the
- * last valid byte
- */
set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS,
- IO_BITMAP_OFFSET + IO_BITMAP_BYTES +
- sizeof(unsigned long) - 1);
+ __KERNEL_TSS_LIMIT);
write_gdt_entry(d, entry, &tss, DESC_TSS);
}
@@ -203,23 +199,18 @@ static inline void native_set_ldt(const void *addr, unsigned int entries)
set_tssldt_descriptor(&ldt, (unsigned long)addr, DESC_LDT,
entries * LDT_ENTRY_SIZE - 1);
- write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT,
+ write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_LDT,
&ldt, DESC_LDT);
asm volatile("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
}
}
-static inline void native_load_tr_desc(void)
-{
- asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
-}
-
static inline void native_load_gdt(const struct desc_ptr *dtr)
{
asm volatile("lgdt %0"::"m" (*dtr));
}
-static inline void native_load_idt(const struct desc_ptr *dtr)
+static __always_inline void native_load_idt(const struct desc_ptr *dtr)
{
asm volatile("lidt %0"::"m" (*dtr));
}
@@ -229,154 +220,230 @@ static inline void native_store_gdt(struct desc_ptr *dtr)
asm volatile("sgdt %0":"=m" (*dtr));
}
-static inline void native_store_idt(struct desc_ptr *dtr)
+static inline void store_idt(struct desc_ptr *dtr)
{
asm volatile("sidt %0":"=m" (*dtr));
}
+static inline void native_gdt_invalidate(void)
+{
+ const struct desc_ptr invalid_gdt = {
+ .address = 0,
+ .size = 0
+ };
+
+ native_load_gdt(&invalid_gdt);
+}
+
+static inline void native_idt_invalidate(void)
+{
+ const struct desc_ptr invalid_idt = {
+ .address = 0,
+ .size = 0
+ };
+
+ native_load_idt(&invalid_idt);
+}
+
+/*
+ * The LTR instruction marks the TSS GDT entry as busy. On 64-bit, the GDT is
+ * a read-only remapping. To prevent a page fault, the GDT is switched to the
+ * original writeable version when needed.
+ */
+#ifdef CONFIG_X86_64
+static inline void native_load_tr_desc(void)
+{
+ struct desc_ptr gdt;
+ int cpu = raw_smp_processor_id();
+ bool restore = 0;
+ struct desc_struct *fixmap_gdt;
+
+ native_store_gdt(&gdt);
+ fixmap_gdt = get_cpu_gdt_ro(cpu);
+
+ /*
+ * If the current GDT is the read-only fixmap, swap to the original
+ * writeable version. Swap back at the end.
+ */
+ if (gdt.address == (unsigned long)fixmap_gdt) {
+ load_direct_gdt(cpu);
+ restore = 1;
+ }
+ asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
+ if (restore)
+ load_fixmap_gdt(cpu);
+}
+#else
+static inline void native_load_tr_desc(void)
+{
+ asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
+}
+#endif
+
static inline unsigned long native_store_tr(void)
{
unsigned long tr;
+
asm volatile("str %0":"=r" (tr));
+
return tr;
}
static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
{
+ struct desc_struct *gdt = get_cpu_gdt_rw(cpu);
unsigned int i;
- struct desc_struct *gdt = get_cpu_gdt_table(cpu);
for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
}
-#define _LDT_empty(info) \
- ((info)->base_addr == 0 && \
- (info)->limit == 0 && \
- (info)->contents == 0 && \
- (info)->read_exec_only == 1 && \
- (info)->seg_32bit == 0 && \
- (info)->limit_in_pages == 0 && \
- (info)->seg_not_present == 1 && \
- (info)->useable == 0)
-
-#ifdef CONFIG_X86_64
-#define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0))
-#else
-#define LDT_empty(info) (_LDT_empty(info))
-#endif
+DECLARE_PER_CPU(bool, __tss_limit_invalid);
-static inline void clear_LDT(void)
+static inline void force_reload_TR(void)
{
- set_ldt(NULL, 0);
-}
+ struct desc_struct *d = get_current_gdt_rw();
+ tss_desc tss;
-/*
- * load one particular LDT into the current CPU
- */
-static inline void load_LDT_nolock(mm_context_t *pc)
-{
- set_ldt(pc->ldt, pc->size);
-}
+ memcpy(&tss, &d[GDT_ENTRY_TSS], sizeof(tss_desc));
-static inline void load_LDT(mm_context_t *pc)
-{
- preempt_disable();
- load_LDT_nolock(pc);
- preempt_enable();
-}
+ /*
+ * LTR requires an available TSS, and the TSS is currently
+	 * busy. Make it available so that LTR will work.
+ */
+ tss.type = DESC_TSS;
+ write_gdt_entry(d, GDT_ENTRY_TSS, &tss, DESC_TSS);
-static inline unsigned long get_desc_base(const struct desc_struct *desc)
-{
- return desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24);
+ load_TR_desc();
+ this_cpu_write(__tss_limit_invalid, false);
}
-static inline unsigned long get_desc_limit(const struct desc_struct *desc)
+/*
+ * Call this if you need the TSS limit to be correct, which should be the case
+ * if and only if you have TIF_IO_BITMAP set or you're switching to a task
+ * with TIF_IO_BITMAP set.
+ */
+static inline void refresh_tss_limit(void)
{
- return desc->limit0 | (desc->limit << 16);
-}
+ DEBUG_LOCKS_WARN_ON(preemptible());
-static inline void _set_gate(int gate, unsigned type, void *addr,
- unsigned dpl, unsigned ist, unsigned seg)
-{
- gate_desc s;
- pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
- /*
- * does not need to be atomic because it is only done once at
- * setup time
- */
- write_idt_entry(idt_table, gate, &s);
+ if (unlikely(this_cpu_read(__tss_limit_invalid)))
+ force_reload_TR();
}
/*
- * This needs to use 'idt_table' rather than 'idt', and
- * thus use the _nonmapped_ version of the IDT, as the
- * Pentium F0 0F bugfix can have resulted in the mapped
- * IDT being write-protected.
+ * If you do something evil that corrupts the cached TSS limit (I'm looking
+ * at you, VMX exits), call this function.
+ *
+ * The optimization here is that the TSS limit only matters for Linux if the
+ * IO bitmap is in use. If the TSS limit gets forced to its minimum value,
+ * everything works except that IO bitmap will be ignored and all CPL 3 IO
+ * instructions will #GP, which is exactly what we want for normal tasks.
*/
-static inline void set_intr_gate(unsigned int n, void *addr)
+static inline void invalidate_tss_limit(void)
{
- BUG_ON((unsigned)n > 0xFF);
- _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
+ DEBUG_LOCKS_WARN_ON(preemptible());
+
+ if (unlikely(test_thread_flag(TIF_IO_BITMAP)))
+ force_reload_TR();
+ else
+ this_cpu_write(__tss_limit_invalid, true);
}
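
The comments above describe a lazy scheme: invalidation is cheap unless the current task already depends on the limit, and the cost of rewriting the GDT entry and reloading TR is paid only when an IO-bitmap user shows up. A stripped-down, single-threaded analogue of that pattern (names and printout are purely illustrative; there is no per-CPU state or LTR here):

/*
 * Hedged sketch of the invalidate/refresh pattern, reduced to plain C.
 * expensive_reload() stands in for force_reload_TR().
 */
#include <stdbool.h>
#include <stdio.h>

static bool limit_invalid;

static void expensive_reload(void)
{
	/* In the kernel this would rewrite the GDT entry and execute LTR. */
	puts("reloading");
	limit_invalid = false;
}

static void invalidate(bool caller_needs_it_now)
{
	if (caller_needs_it_now)
		expensive_reload();	/* pay the cost immediately */
	else
		limit_invalid = true;	/* defer until someone cares */
}

static void refresh_if_needed(void)
{
	if (limit_invalid)
		expensive_reload();
}

int main(void)
{
	invalidate(false);	/* cheap: just sets the flag */
	refresh_if_needed();	/* first consumer pays for the reload */
	refresh_if_needed();	/* already valid: no work */
	return 0;
}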
-extern int first_system_vector;
-/* used_vectors is BITMAP for irq is not managed by percpu vector_irq */
-extern unsigned long used_vectors[];
+/* This intentionally ignores lm, since 32-bit apps don't have that field. */
+#define LDT_empty(info) \
+ ((info)->base_addr == 0 && \
+ (info)->limit == 0 && \
+ (info)->contents == 0 && \
+ (info)->read_exec_only == 1 && \
+ (info)->seg_32bit == 0 && \
+ (info)->limit_in_pages == 0 && \
+ (info)->seg_not_present == 1 && \
+ (info)->useable == 0)
-static inline void alloc_system_vector(int vector)
+/* Lots of programs expect an all-zero user_desc to mean "no segment at all". */
+static inline bool LDT_zero(const struct user_desc *info)
{
- if (!test_bit(vector, used_vectors)) {
- set_bit(vector, used_vectors);
- if (first_system_vector > vector)
- first_system_vector = vector;
- } else
- BUG();
+ return (info->base_addr == 0 &&
+ info->limit == 0 &&
+ info->contents == 0 &&
+ info->read_exec_only == 0 &&
+ info->seg_32bit == 0 &&
+ info->limit_in_pages == 0 &&
+ info->seg_not_present == 0 &&
+ info->useable == 0);
}
-static inline void alloc_intr_gate(unsigned int n, void *addr)
+static inline void clear_LDT(void)
{
- alloc_system_vector(n);
- set_intr_gate(n, addr);
+ set_ldt(NULL, 0);
}
-/*
- * This routine sets up an interrupt gate at directory privilege level 3.
- */
-static inline void set_system_intr_gate(unsigned int n, void *addr)
+static inline unsigned long get_desc_base(const struct desc_struct *desc)
{
- BUG_ON((unsigned)n > 0xFF);
- _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS);
+ return (unsigned)(desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24));
}
-static inline void set_system_trap_gate(unsigned int n, void *addr)
+static inline void set_desc_base(struct desc_struct *desc, unsigned long base)
{
- BUG_ON((unsigned)n > 0xFF);
- _set_gate(n, GATE_TRAP, addr, 0x3, 0, __KERNEL_CS);
+ desc->base0 = base & 0xffff;
+ desc->base1 = (base >> 16) & 0xff;
+ desc->base2 = (base >> 24) & 0xff;
}
-static inline void set_trap_gate(unsigned int n, void *addr)
+static inline unsigned long get_desc_limit(const struct desc_struct *desc)
{
- BUG_ON((unsigned)n > 0xFF);
- _set_gate(n, GATE_TRAP, addr, 0, 0, __KERNEL_CS);
+ return desc->limit0 | (desc->limit1 << 16);
}
-static inline void set_task_gate(unsigned int n, unsigned int gdt_entry)
+static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit)
{
- BUG_ON((unsigned)n > 0xFF);
- _set_gate(n, GATE_TASK, (void *)0, 0, 0, (gdt_entry<<3));
+ desc->limit0 = limit & 0xffff;
+ desc->limit1 = (limit >> 16) & 0xf;
}
-static inline void set_intr_gate_ist(int n, void *addr, unsigned ist)
+static inline void init_idt_data(struct idt_data *data, unsigned int n,
+ const void *addr)
{
- BUG_ON((unsigned)n > 0xFF);
- _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS);
+ BUG_ON(n > 0xFF);
+
+ memset(data, 0, sizeof(*data));
+ data->vector = n;
+ data->addr = addr;
+ data->segment = __KERNEL_CS;
+ data->bits.type = GATE_INTERRUPT;
+ data->bits.p = 1;
}
-static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist)
+static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d)
{
- BUG_ON((unsigned)n > 0xFF);
- _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS);
+ unsigned long addr = (unsigned long) d->addr;
+
+ gate->offset_low = (u16) addr;
+ gate->segment = (u16) d->segment;
+ gate->bits = d->bits;
+ gate->offset_middle = (u16) (addr >> 16);
+#ifdef CONFIG_X86_64
+ gate->offset_high = (u32) (addr >> 32);
+ gate->reserved = 0;
+#endif
}
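
idt_init_desc() above scatters the handler address across offset_low/offset_middle/offset_high; gate_offset() in desc_defs.h, further down in this patch, reverses it. A tiny stand-alone sketch of that round trip, with a made-up handler address:

/*
 * Illustrative sketch (not kernel code): split a 64-bit handler address
 * the way idt_init_desc() does, then reassemble it like gate_offset().
 */
#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t handler = 0xffffffff81a01230ULL;	/* hypothetical address */

	uint16_t offset_low    = (uint16_t)handler;
	uint16_t offset_middle = (uint16_t)(handler >> 16);
	uint32_t offset_high   = (uint32_t)(handler >> 32);

	uint64_t rebuilt = offset_low |
			   ((uint64_t)offset_middle << 16) |
			   ((uint64_t)offset_high << 32);

	assert(rebuilt == handler);
	return 0;
}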
+extern unsigned long system_vectors[];
+
+extern void load_current_idt(void);
+extern void idt_setup_early_handler(void);
+extern void idt_setup_early_traps(void);
+extern void idt_setup_traps(void);
+extern void idt_setup_apic_and_irq_gates(void);
+extern bool idt_is_f00f_address(unsigned long address);
+
+#ifdef CONFIG_X86_64
+extern void idt_setup_early_pf(void);
+#else
+static inline void idt_setup_early_pf(void) { }
+#endif
+
+extern void idt_invalidate(void);
+
#endif /* _ASM_X86_DESC_H */
diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h
index a6adefa28b94..7e6b9314758a 100644
--- a/arch/x86/include/asm/desc_defs.h
+++ b/arch/x86/include/asm/desc_defs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/* Written 2000 by Andi Kleen */
#ifndef _ASM_X86_DESC_DEFS_H
#define _ASM_X86_DESC_DEFS_H
@@ -7,33 +8,85 @@
* archs.
*/
-#ifndef __ASSEMBLY__
+/*
+ * Low-level interface mapping flags/field names to bits
+ */
-#include <linux/types.h>
+/* Flags for _DESC_S (non-system) descriptors */
+#define _DESC_ACCESSED 0x0001
+#define _DESC_DATA_WRITABLE 0x0002
+#define _DESC_CODE_READABLE 0x0002
+#define _DESC_DATA_EXPAND_DOWN 0x0004
+#define _DESC_CODE_CONFORMING 0x0004
+#define _DESC_CODE_EXECUTABLE 0x0008
+
+/* Common flags */
+#define _DESC_S 0x0010
+#define _DESC_DPL(dpl) ((dpl) << 5)
+#define _DESC_PRESENT 0x0080
+
+#define _DESC_LONG_CODE 0x2000
+#define _DESC_DB 0x4000
+#define _DESC_GRANULARITY_4K 0x8000
+
+/* System descriptors have a numeric "type" field instead of flags */
+#define _DESC_SYSTEM(code) (code)
/*
- * FIXME: Acessing the desc_struct through its fields is more elegant,
- * and should be the one valid thing to do. However, a lot of open code
- * still touches the a and b acessors, and doing this allow us to do it
- * incrementally. We keep the signature as a struct, rather than an union,
- * so we can get rid of it transparently in the future -- glommer
+ * High-level interface mapping intended usage to low-level combinations
+ * of flags
*/
+
+#define _DESC_DATA (_DESC_S | _DESC_PRESENT | _DESC_ACCESSED | \
+ _DESC_DATA_WRITABLE)
+#define _DESC_CODE (_DESC_S | _DESC_PRESENT | _DESC_ACCESSED | \
+ _DESC_CODE_READABLE | _DESC_CODE_EXECUTABLE)
+
+#define DESC_DATA16 (_DESC_DATA)
+#define DESC_CODE16 (_DESC_CODE)
+
+#define DESC_DATA32 (_DESC_DATA | _DESC_GRANULARITY_4K | _DESC_DB)
+#define DESC_DATA32_BIOS (_DESC_DATA | _DESC_DB)
+
+#define DESC_CODE32 (_DESC_CODE | _DESC_GRANULARITY_4K | _DESC_DB)
+#define DESC_CODE32_BIOS (_DESC_CODE | _DESC_DB)
+
+#define DESC_TSS32 (_DESC_SYSTEM(9) | _DESC_PRESENT)
+
+#define DESC_DATA64 (_DESC_DATA | _DESC_GRANULARITY_4K | _DESC_DB)
+#define DESC_CODE64 (_DESC_CODE | _DESC_GRANULARITY_4K | _DESC_LONG_CODE)
+
+#define DESC_USER (_DESC_DPL(3))
+
+#ifndef __ASSEMBLER__
+
+#include <linux/types.h>
+
/* 8 byte segment descriptor */
struct desc_struct {
- union {
- struct {
- unsigned int a;
- unsigned int b;
- };
- struct {
- u16 limit0;
- u16 base0;
- unsigned base1: 8, type: 4, s: 1, dpl: 2, p: 1;
- unsigned limit: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8;
- };
- };
+ u16 limit0;
+ u16 base0;
+ u16 base1: 8, type: 4, s: 1, dpl: 2, p: 1;
+ u16 limit1: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8;
} __attribute__((packed));
+#define GDT_ENTRY_INIT(flags, base, limit) \
+ { \
+ .limit0 = ((limit) >> 0) & 0xFFFF, \
+ .limit1 = ((limit) >> 16) & 0x000F, \
+ .base0 = ((base) >> 0) & 0xFFFF, \
+ .base1 = ((base) >> 16) & 0x00FF, \
+ .base2 = ((base) >> 24) & 0x00FF, \
+ .type = ((flags) >> 0) & 0x000F, \
+ .s = ((flags) >> 4) & 0x0001, \
+ .dpl = ((flags) >> 5) & 0x0003, \
+ .p = ((flags) >> 7) & 0x0001, \
+ .avl = ((flags) >> 12) & 0x0001, \
+ .l = ((flags) >> 13) & 0x0001, \
+ .d = ((flags) >> 14) & 0x0001, \
+ .g = ((flags) >> 15) & 0x0001, \
+ }
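+
As a sanity check of the GDT_ENTRY_INIT() field mapping, the sketch below encodes a flat 4 GiB code segment (flags 0xc09b, base 0, limit 0xfffff) and compares it against the classic raw descriptor value 0x00cf9b000000ffff. It relies on GCC/Clang bitfield layout on little-endian x86, so treat it as illustrative rather than portable:

/*
 * User-space sketch: same field split as desc_struct/GDT_ENTRY_INIT.
 * flags 0xc09b: type=0xb, s=1, dpl=0, p=1, avl=0, l=0, d=1, g=1.
 */
#include <assert.h>
#include <stdint.h>
#include <string.h>

struct desc {
	uint16_t limit0;
	uint16_t base0;
	uint16_t base1 : 8, type : 4, s : 1, dpl : 2, p : 1;
	uint16_t limit1 : 4, avl : 1, l : 1, d : 1, g : 1, base2 : 8;
} __attribute__((packed));

int main(void)
{
	struct desc code32 = {
		.limit0 = 0xffff, .limit1 = 0xf,
		.base0 = 0, .base1 = 0, .base2 = 0,
		.type = 0xb, .s = 1, .dpl = 0, .p = 1,
		.avl = 0, .l = 0, .d = 1, .g = 1,
	};
	uint64_t raw;

	memcpy(&raw, &code32, sizeof(raw));
	assert(raw == 0x00cf9b000000ffffULL);
	return 0;
}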
+
enum {
GATE_INTERRUPT = 0xE,
GATE_TRAP = 0xF,
@@ -41,48 +94,71 @@ enum {
GATE_TASK = 0x5,
};
-/* 16byte gate */
-struct gate_struct64 {
- u16 offset_low;
- u16 segment;
- unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1;
- u16 offset_middle;
- u32 offset_high;
- u32 zero1;
-} __attribute__((packed));
-
-#define PTR_LOW(x) ((unsigned long long)(x) & 0xFFFF)
-#define PTR_MIDDLE(x) (((unsigned long long)(x) >> 16) & 0xFFFF)
-#define PTR_HIGH(x) ((unsigned long long)(x) >> 32)
-
enum {
DESC_TSS = 0x9,
DESC_LDT = 0x2,
DESCTYPE_S = 0x10, /* !system */
};
-/* LDT or TSS descriptor in the GDT. 16 bytes. */
-struct ldttss_desc64 {
- u16 limit0;
- u16 base0;
- unsigned base1 : 8, type : 5, dpl : 2, p : 1;
- unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
- u32 base3;
- u32 zero1;
+/* LDT or TSS descriptor in the GDT. */
+struct ldttss_desc {
+ u16 limit0;
+ u16 base0;
+
+ u16 base1 : 8, type : 5, dpl : 2, p : 1;
+ u16 limit1 : 4, zero0 : 3, g : 1, base2 : 8;
+#ifdef CONFIG_X86_64
+ u32 base3;
+ u32 zero1;
+#endif
+} __attribute__((packed));
+
+typedef struct ldttss_desc ldt_desc;
+typedef struct ldttss_desc tss_desc;
+
+struct idt_bits {
+ u16 ist : 3,
+ zero : 5,
+ type : 5,
+ dpl : 2,
+ p : 1;
+} __attribute__((packed));
+
+struct idt_data {
+ unsigned int vector;
+ unsigned int segment;
+ struct idt_bits bits;
+ const void *addr;
+};
+
+struct gate_struct {
+ u16 offset_low;
+ u16 segment;
+ struct idt_bits bits;
+ u16 offset_middle;
+#ifdef CONFIG_X86_64
+ u32 offset_high;
+ u32 reserved;
+#endif
} __attribute__((packed));
+typedef struct gate_struct gate_desc;
+
+#ifndef _SETUP
+static inline unsigned long gate_offset(const gate_desc *g)
+{
#ifdef CONFIG_X86_64
-typedef struct gate_struct64 gate_desc;
-typedef struct ldttss_desc64 ldt_desc;
-typedef struct ldttss_desc64 tss_desc;
-#define gate_offset(g) ((g).offset_low | ((unsigned long)(g).offset_middle << 16) | ((unsigned long)(g).offset_high << 32))
-#define gate_segment(g) ((g).segment)
+ return g->offset_low | ((unsigned long)g->offset_middle << 16) |
+ ((unsigned long) g->offset_high << 32);
#else
-typedef struct desc_struct gate_desc;
-typedef struct desc_struct ldt_desc;
-typedef struct desc_struct tss_desc;
-#define gate_offset(g) (((g).b & 0xffff0000) | ((g).a & 0x0000ffff))
-#define gate_segment(g) ((g).a >> 16)
+ return g->offset_low | ((unsigned long)g->offset_middle << 16);
+#endif
+}
+
+static inline unsigned long gate_segment(const gate_desc *g)
+{
+ return g->segment;
+}
#endif
struct desc_ptr {
@@ -90,6 +166,32 @@ struct desc_ptr {
unsigned long address;
} __attribute__((packed)) ;
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
+
+/* Boot IDT definitions */
+#define BOOT_IDT_ENTRIES 32
+
+/* Access rights as returned by LAR */
+#define AR_TYPE_RODATA (0 * (1 << 9))
+#define AR_TYPE_RWDATA (1 * (1 << 9))
+#define AR_TYPE_RODATA_EXPDOWN (2 * (1 << 9))
+#define AR_TYPE_RWDATA_EXPDOWN (3 * (1 << 9))
+#define AR_TYPE_XOCODE (4 * (1 << 9))
+#define AR_TYPE_XRCODE (5 * (1 << 9))
+#define AR_TYPE_XOCODE_CONF (6 * (1 << 9))
+#define AR_TYPE_XRCODE_CONF (7 * (1 << 9))
+#define AR_TYPE_MASK (7 * (1 << 9))
+
+#define AR_DPL0 (0 * (1 << 13))
+#define AR_DPL3 (3 * (1 << 13))
+#define AR_DPL_MASK (3 * (1 << 13))
+
+#define AR_A (1 << 8) /* "Accessed" */
+#define AR_S (1 << 12) /* If clear, "System" segment */
+#define AR_P (1 << 15) /* "Present" */
+#define AR_AVL (1 << 20) /* "AVaiLable" (no HW effect) */
+#define AR_L (1 << 21) /* "Long mode" for code segments */
+#define AR_DB (1 << 22) /* D/B, effect depends on type */
+#define AR_G (1 << 23) /* "Granularity" (limit in pages) */
#endif /* _ASM_X86_DESC_DEFS_H */
diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h
index 4994a20acbcb..7c0a52ca2f4d 100644
--- a/arch/x86/include/asm/device.h
+++ b/arch/x86/include/asm/device.h
@@ -1,16 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_DEVICE_H
#define _ASM_X86_DEVICE_H
struct dev_archdata {
-#ifdef CONFIG_ACPI
- void *acpi_handle;
-#endif
-#ifdef CONFIG_X86_64
-struct dma_map_ops *dma_ops;
-#endif
-#ifdef CONFIG_DMAR
- void *iommu; /* hook for IOMMU specific extension */
-#endif
+};
+
+struct pdev_archdata {
};
#endif /* _ASM_X86_DEVICE_H */
diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h
index 9a2d644c08ef..30fd06ede751 100644
--- a/arch/x86/include/asm/div64.h
+++ b/arch/x86/include/asm/div64.h
@@ -1,9 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_DIV64_H
#define _ASM_X86_DIV64_H
#ifdef CONFIG_X86_32
#include <linux/types.h>
+#include <linux/log2.h>
/*
* do_div() is NOT a C function. It wants to return
@@ -21,15 +23,20 @@
({ \
unsigned long __upper, __low, __high, __mod, __base; \
__base = (base); \
- asm("":"=a" (__low), "=d" (__high) : "A" (n)); \
- __upper = __high; \
- if (__high) { \
- __upper = __high % (__base); \
- __high = __high / (__base); \
+ if (__builtin_constant_p(__base) && is_power_of_2(__base)) { \
+ __mod = n & (__base - 1); \
+ n >>= ilog2(__base); \
+ } else { \
+ asm("" : "=a" (__low), "=d" (__high) : "A" (n));\
+ __upper = __high; \
+ if (__high) { \
+ __upper = __high % (__base); \
+ __high = __high / (__base); \
+ } \
+ asm("divl %2" : "=a" (__low), "=d" (__mod) \
+ : "rm" (__base), "0" (__low), "1" (__upper)); \
+ asm("" : "=A" (n) : "a" (__low), "d" (__high)); \
} \
- asm("divl %2":"=a" (__low), "=d" (__mod) \
- : "rm" (__base), "0" (__low), "1" (__upper)); \
- asm("":"=A" (n) : "a" (__low), "d" (__high)); \
__mod; \
})
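
The new branch above lets do_div() skip the divl sequence entirely when the divisor is a compile-time power of two. The equivalence it relies on is easy to check in plain C; the divisor 4096 below is just an example:

/*
 * Sketch of the do_div() fast path: dividing by a power of two reduces
 * to a shift for the quotient and a mask for the remainder.
 */
#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t n = 0x123456789abcdefULL;
	const uint32_t base = 4096;		/* power of two */
	const unsigned int shift = 12;		/* ilog2(4096) */

	uint32_t mod_fast = n & (base - 1);	/* remainder via mask */
	uint64_t div_fast = n >> shift;		/* quotient via shift */

	assert(mod_fast == n % base);
	assert(div_fast == n / base);
	return 0;
}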
@@ -53,8 +60,71 @@ static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
}
#define div_u64_rem div_u64_rem
+/*
+ * gcc tends to zero extend 32bit values and do full 64bit maths.
+ * Define asm functions that avoid this.
+ * (clang generates better code for the C versions.)
+ */
+#ifndef __clang__
+static inline u64 mul_u32_u32(u32 a, u32 b)
+{
+ u32 high, low;
+
+ asm ("mull %[b]" : "=a" (low), "=d" (high)
+ : [a] "a" (a), [b] "rm" (b) );
+
+ return low | ((u64)high) << 32;
+}
+#define mul_u32_u32 mul_u32_u32
+
+static inline u64 add_u64_u32(u64 a, u32 b)
+{
+ u32 high = a >> 32, low = a;
+
+ asm ("addl %[b], %[low]; adcl $0, %[high]"
+ : [low] "+r" (low), [high] "+r" (high)
+ : [b] "rm" (b) );
+
+ return low | (u64)high << 32;
+}
+#define add_u64_u32 add_u64_u32
+#endif
+
+/*
+ * __div64_32() is never called on x86, so prevent the
+ * generic definition from getting built.
+ */
+#define __div64_32
+
#else
# include <asm-generic/div64.h>
+
+/*
+ * Will generate a #DE when the result doesn't fit in a u64; this could be
+ * fixed with an __ex_table[] entry if it ever becomes an issue.
+ */
+static inline u64 mul_u64_add_u64_div_u64(u64 rax, u64 mul, u64 add, u64 div)
+{
+ u64 rdx;
+
+ asm ("mulq %[mul]" : "+a" (rax), "=d" (rdx) : [mul] "rm" (mul));
+
+ if (!statically_true(!add))
+ asm ("addq %[add], %[lo]; adcq $0, %[hi]" :
+ [lo] "+r" (rax), [hi] "+r" (rdx) : [add] "irm" (add));
+
+ asm ("divq %[div]" : "+a" (rax), "+d" (rdx) : [div] "rm" (div));
+
+ return rax;
+}
+#define mul_u64_add_u64_div_u64 mul_u64_add_u64_div_u64
+
+static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 div)
+{
+ return mul_u64_add_u64_div_u64(a, mul, 0, div);
+}
+#define mul_u64_u32_div mul_u64_u32_div
+
#endif /* CONFIG_X86_32 */
#endif /* _ASM_X86_DIV64_H */
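
mul_u64_add_u64_div_u64() above computes (a * mul + add) / div with a full 128-bit intermediate, so the product may exceed 64 bits as long as the final quotient fits. A portable reference version using GCC/Clang __int128, plus a made-up cycles-to-nanoseconds style use (the clock numbers are purely illustrative):

/*
 * Reference sketch of what the mulq/addq/divq sequence computes.
 */
#include <assert.h>
#include <stdint.h>

static uint64_t mul_u64_add_u64_div_u64_ref(uint64_t a, uint64_t mul,
					    uint64_t add, uint64_t div)
{
	return (uint64_t)(((unsigned __int128)a * mul + add) / div);
}

int main(void)
{
	/* e.g. convert 10^12 cycles at 2.5 GHz to nanoseconds */
	uint64_t cycles = 1000000000000ULL;
	uint64_t ns = mul_u64_add_u64_div_u64_ref(cycles, 1000000000ULL, 0,
						  2500000000ULL);

	assert(ns == 400000000000ULL);	/* 400 seconds, in ns */
	return 0;
}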
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 1c3f9435f1c9..d1dac96ee30b 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -1,142 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_DMA_MAPPING_H
#define _ASM_X86_DMA_MAPPING_H
-/*
- * IOMMU interface. See Documentation/PCI/PCI-DMA-mapping.txt and
- * Documentation/DMA-API.txt for documentation.
- */
+extern const struct dma_map_ops *dma_ops;
-#include <linux/kmemcheck.h>
-#include <linux/scatterlist.h>
-#include <linux/dma-debug.h>
-#include <linux/dma-attrs.h>
-#include <asm/io.h>
-#include <asm/swiotlb.h>
-#include <asm-generic/dma-coherent.h>
-
-extern dma_addr_t bad_dma_address;
-extern int iommu_merge;
-extern struct device x86_dma_fallback_dev;
-extern int panic_on_overflow;
-
-extern struct dma_map_ops *dma_ops;
-
-static inline struct dma_map_ops *get_dma_ops(struct device *dev)
+static inline const struct dma_map_ops *get_arch_dma_ops(void)
{
-#ifdef CONFIG_X86_32
return dma_ops;
-#else
- if (unlikely(!dev) || !dev->archdata.dma_ops)
- return dma_ops;
- else
- return dev->archdata.dma_ops;
-#endif
-}
-
-#include <asm-generic/dma-mapping-common.h>
-
-/* Make sure we keep the same behaviour */
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
- struct dma_map_ops *ops = get_dma_ops(dev);
- if (ops->mapping_error)
- return ops->mapping_error(dev, dma_addr);
-
- return (dma_addr == bad_dma_address);
-}
-
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
-#define dma_is_consistent(d, h) (1)
-
-extern int dma_supported(struct device *hwdev, u64 mask);
-extern int dma_set_mask(struct device *dev, u64 mask);
-
-extern void *dma_generic_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t *dma_addr, gfp_t flag);
-
-static inline void
-dma_cache_sync(struct device *dev, void *vaddr, size_t size,
- enum dma_data_direction dir)
-{
- flush_write_buffers();
-}
-
-static inline int dma_get_cache_alignment(void)
-{
- /* no easy way to get cache size on all x86, so return the
- * maximum possible, to be safe */
- return boot_cpu_data.x86_clflush_size;
-}
-
-static inline unsigned long dma_alloc_coherent_mask(struct device *dev,
- gfp_t gfp)
-{
- unsigned long dma_mask = 0;
-
- dma_mask = dev->coherent_dma_mask;
- if (!dma_mask)
- dma_mask = (gfp & GFP_DMA) ? DMA_BIT_MASK(24) : DMA_BIT_MASK(32);
-
- return dma_mask;
-}
-
-static inline gfp_t dma_alloc_coherent_gfp_flags(struct device *dev, gfp_t gfp)
-{
- unsigned long dma_mask = dma_alloc_coherent_mask(dev, gfp);
-
- if (dma_mask <= DMA_BIT_MASK(24))
- gfp |= GFP_DMA;
-#ifdef CONFIG_X86_64
- if (dma_mask <= DMA_BIT_MASK(32) && !(gfp & GFP_DMA))
- gfp |= GFP_DMA32;
-#endif
- return gfp;
-}
-
-static inline void *
-dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
- gfp_t gfp)
-{
- struct dma_map_ops *ops = get_dma_ops(dev);
- void *memory;
-
- gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
-
- if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
- return memory;
-
- if (!dev) {
- dev = &x86_dma_fallback_dev;
- gfp |= GFP_DMA;
- }
-
- if (!is_device_dma_capable(dev))
- return NULL;
-
- if (!ops->alloc_coherent)
- return NULL;
-
- memory = ops->alloc_coherent(dev, size, dma_handle,
- dma_alloc_coherent_gfp_flags(dev, gfp));
- debug_dma_alloc_coherent(dev, size, *dma_handle, memory);
-
- return memory;
-}
-
-static inline void dma_free_coherent(struct device *dev, size_t size,
- void *vaddr, dma_addr_t bus)
-{
- struct dma_map_ops *ops = get_dma_ops(dev);
-
- WARN_ON(irqs_disabled()); /* for portability */
-
- if (dma_release_from_coherent(dev, get_order(size), vaddr))
- return;
-
- debug_dma_free_coherent(dev, size, vaddr, bus);
- if (ops->free_coherent)
- ops->free_coherent(dev, size, vaddr, bus);
}
#endif
diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h
index ca1098a7e580..8ae6e0e11b8b 100644
--- a/arch/x86/include/asm/dma.h
+++ b/arch/x86/include/asm/dma.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* linux/include/asm/dma.h: Defines for using and allocating dma channels.
* Written by Hennus Bergman, 1992.
@@ -10,7 +11,6 @@
#include <linux/spinlock.h> /* And spinlocks */
#include <asm/io.h> /* need byte IO */
-#include <linux/delay.h>
#ifdef HAVE_REALLY_SLOW_DMA_CONTROLLER
#define dma_outb outb_p
@@ -70,22 +70,18 @@
#define MAX_DMA_CHANNELS 8
-#ifdef CONFIG_X86_32
-
-/* The maximum address that we can perform a DMA transfer to on this platform */
-#define MAX_DMA_ADDRESS (PAGE_OFFSET + 0x1000000)
-
-#else
-
/* 16MB ISA DMA zone */
-#define MAX_DMA_PFN ((16 * 1024 * 1024) >> PAGE_SHIFT)
+#define MAX_DMA_PFN ((16UL * 1024 * 1024) >> PAGE_SHIFT)
/* 4GB broken PCI/AGP hardware bus master zone */
-#define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT)
+#define MAX_DMA32_PFN (1UL << (32 - PAGE_SHIFT))
+#ifdef CONFIG_X86_32
+/* The maximum address that we can perform a DMA transfer to on this platform */
+#define MAX_DMA_ADDRESS (PAGE_OFFSET + 0x1000000)
+#else
/* Compat define for old dma zone */
#define MAX_DMA_ADDRESS ((unsigned long)__va(MAX_DMA_PFN << PAGE_SHIFT))
-
#endif
/* 8237 DMA controllers */
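
With the usual 4 KiB pages these limits work out to 4096 PFNs for the 16 MiB ISA zone and 1048576 PFNs for the 4 GiB boundary; writing the latter as 1UL << (32 - PAGE_SHIFT) also avoids a 4 GiB constant that would overflow a 32-bit unsigned long. A quick user-space check, with PAGE_SHIFT of 12 assumed:

/* Sketch only: PAGE_SHIFT hard-coded for illustration. */
#include <assert.h>

#define PAGE_SHIFT 12

int main(void)
{
	unsigned long max_dma_pfn   = (16UL * 1024 * 1024) >> PAGE_SHIFT;
	unsigned long max_dma32_pfn = 1UL << (32 - PAGE_SHIFT);

	assert(max_dma_pfn == 4096);		/* 16 MiB / 4 KiB */
	assert(max_dma32_pfn == 1048576);	/* 4 GiB / 4 KiB */
	return 0;
}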
@@ -151,6 +147,7 @@
#define DMA_AUTOINIT 0x10
+#ifdef CONFIG_ISA_DMA_API
extern spinlock_t dma_spin_lock;
static inline unsigned long claim_dma_lock(void)
@@ -164,6 +161,7 @@ static inline void release_dma_lock(unsigned long flags)
{
spin_unlock_irqrestore(&dma_spin_lock, flags);
}
+#endif /* CONFIG_ISA_DMA_API */
/* enable/disable a specific DMA channel */
static inline void enable_dma(unsigned int dmanr)
@@ -303,16 +301,10 @@ static inline int get_dma_residue(unsigned int dmanr)
}
-/* These are in kernel/dma.c: */
+/* These are in kernel/dma.c because x86 uses CONFIG_GENERIC_ISA_DMA */
+#ifdef CONFIG_ISA_DMA_API
extern int request_dma(unsigned int dmanr, const char *device_id);
extern void free_dma(unsigned int dmanr);
-
-/* From PCI */
-
-#ifdef CONFIG_PCI
-extern int isa_dma_bridge_buggy;
-#else
-#define isa_dma_bridge_buggy (0)
#endif
#endif /* _ASM_X86_DMA_H */
diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h
index fd8f9e2ca35f..b825cb201251 100644
--- a/arch/x86/include/asm/dmi.h
+++ b/arch/x86/include/asm/dmi.h
@@ -1,10 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_DMI_H
#define _ASM_X86_DMI_H
#include <linux/compiler.h>
#include <linux/init.h>
+#include <linux/io.h>
-#include <asm/io.h>
#include <asm/setup.h>
static __always_inline __init void *dmi_alloc(unsigned len)
@@ -13,7 +14,9 @@ static __always_inline __init void *dmi_alloc(unsigned len)
}
/* Use early IO mappings for DMI because it's initialized early */
-#define dmi_ioremap early_ioremap
-#define dmi_iounmap early_iounmap
+#define dmi_early_remap early_memremap
+#define dmi_early_unmap early_memunmap
+#define dmi_remap(_x, _l) memremap(_x, _l, MEMREMAP_WB)
+#define dmi_unmap(_x) memunmap(_x)
#endif /* _ASM_X86_DMI_H */
diff --git a/arch/x86/include/asm/do_timer.h b/arch/x86/include/asm/do_timer.h
deleted file mode 100644
index 23ecda0b28a0..000000000000
--- a/arch/x86/include/asm/do_timer.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* defines for inline arch setup functions */
-#include <linux/clockchips.h>
-
-#include <asm/i8259.h>
-#include <asm/i8253.h>
-
-/**
- * do_timer_interrupt_hook - hook into timer tick
- *
- * Call the pit clock event handler. see asm/i8253.h
- **/
-
-static inline void do_timer_interrupt_hook(void)
-{
- global_clock_event->event_handler(global_clock_event);
-}
diff --git a/arch/x86/include/asm/doublefault.h b/arch/x86/include/asm/doublefault.h
new file mode 100644
index 000000000000..de0e88b32207
--- /dev/null
+++ b/arch/x86/include/asm/doublefault.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_DOUBLEFAULT_H
+#define _ASM_X86_DOUBLEFAULT_H
+
+#include <linux/linkage.h>
+
+#ifdef CONFIG_X86_32
+extern void doublefault_init_cpu_tss(void);
+#else
+static inline void doublefault_init_cpu_tss(void)
+{
+}
+#endif
+
+asmlinkage void __noreturn doublefault_shim(void);
+
+#endif /* _ASM_X86_DOUBLEFAULT_H */
diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h
deleted file mode 100644
index 70dac199b093..000000000000
--- a/arch/x86/include/asm/ds.h
+++ /dev/null
@@ -1,302 +0,0 @@
-/*
- * Debug Store (DS) support
- *
- * This provides a low-level interface to the hardware's Debug Store
- * feature that is used for branch trace store (BTS) and
- * precise-event based sampling (PEBS).
- *
- * It manages:
- * - DS and BTS hardware configuration
- * - buffer overflow handling (to be done)
- * - buffer access
- *
- * It does not do:
- * - security checking (is the caller allowed to trace the task)
- * - buffer allocation (memory accounting)
- *
- *
- * Copyright (C) 2007-2009 Intel Corporation.
- * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
- */
-
-#ifndef _ASM_X86_DS_H
-#define _ASM_X86_DS_H
-
-
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/err.h>
-
-
-#ifdef CONFIG_X86_DS
-
-struct task_struct;
-struct ds_context;
-struct ds_tracer;
-struct bts_tracer;
-struct pebs_tracer;
-
-typedef void (*bts_ovfl_callback_t)(struct bts_tracer *);
-typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *);
-
-
-/*
- * A list of features plus corresponding macros to talk about them in
- * the ds_request function's flags parameter.
- *
- * We use the enum to index an array of corresponding control bits;
- * we use the macro to index a flags bit-vector.
- */
-enum ds_feature {
- dsf_bts = 0,
- dsf_bts_kernel,
-#define BTS_KERNEL (1 << dsf_bts_kernel)
- /* trace kernel-mode branches */
-
- dsf_bts_user,
-#define BTS_USER (1 << dsf_bts_user)
- /* trace user-mode branches */
-
- dsf_bts_overflow,
- dsf_bts_max,
- dsf_pebs = dsf_bts_max,
-
- dsf_pebs_max,
- dsf_ctl_max = dsf_pebs_max,
- dsf_bts_timestamps = dsf_ctl_max,
-#define BTS_TIMESTAMPS (1 << dsf_bts_timestamps)
- /* add timestamps into BTS trace */
-
-#define BTS_USER_FLAGS (BTS_KERNEL | BTS_USER | BTS_TIMESTAMPS)
-};
-
-
-/*
- * Request BTS or PEBS
- *
- * Due to alignement constraints, the actual buffer may be slightly
- * smaller than the requested or provided buffer.
- *
- * Returns a pointer to a tracer structure on success, or
- * ERR_PTR(errcode) on failure.
- *
- * The interrupt threshold is independent from the overflow callback
- * to allow users to use their own overflow interrupt handling mechanism.
- *
- * The function might sleep.
- *
- * task: the task to request recording for
- * cpu: the cpu to request recording for
- * base: the base pointer for the (non-pageable) buffer;
- * size: the size of the provided buffer in bytes
- * ovfl: pointer to a function to be called on buffer overflow;
- * NULL if cyclic buffer requested
- * th: the interrupt threshold in records from the end of the buffer;
- * -1 if no interrupt threshold is requested.
- * flags: a bit-mask of the above flags
- */
-extern struct bts_tracer *ds_request_bts_task(struct task_struct *task,
- void *base, size_t size,
- bts_ovfl_callback_t ovfl,
- size_t th, unsigned int flags);
-extern struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size,
- bts_ovfl_callback_t ovfl,
- size_t th, unsigned int flags);
-extern struct pebs_tracer *ds_request_pebs_task(struct task_struct *task,
- void *base, size_t size,
- pebs_ovfl_callback_t ovfl,
- size_t th, unsigned int flags);
-extern struct pebs_tracer *ds_request_pebs_cpu(int cpu,
- void *base, size_t size,
- pebs_ovfl_callback_t ovfl,
- size_t th, unsigned int flags);
-
-/*
- * Release BTS or PEBS resources
- * Suspend and resume BTS or PEBS tracing
- *
- * Must be called with irq's enabled.
- *
- * tracer: the tracer handle returned from ds_request_~()
- */
-extern void ds_release_bts(struct bts_tracer *tracer);
-extern void ds_suspend_bts(struct bts_tracer *tracer);
-extern void ds_resume_bts(struct bts_tracer *tracer);
-extern void ds_release_pebs(struct pebs_tracer *tracer);
-extern void ds_suspend_pebs(struct pebs_tracer *tracer);
-extern void ds_resume_pebs(struct pebs_tracer *tracer);
-
-/*
- * Release BTS or PEBS resources
- * Suspend and resume BTS or PEBS tracing
- *
- * Cpu tracers must call this on the traced cpu.
- * Task tracers must call ds_release_~_noirq() for themselves.
- *
- * May be called with irq's disabled.
- *
- * Returns 0 if successful;
- * -EPERM if the cpu tracer does not trace the current cpu.
- * -EPERM if the task tracer does not trace itself.
- *
- * tracer: the tracer handle returned from ds_request_~()
- */
-extern int ds_release_bts_noirq(struct bts_tracer *tracer);
-extern int ds_suspend_bts_noirq(struct bts_tracer *tracer);
-extern int ds_resume_bts_noirq(struct bts_tracer *tracer);
-extern int ds_release_pebs_noirq(struct pebs_tracer *tracer);
-extern int ds_suspend_pebs_noirq(struct pebs_tracer *tracer);
-extern int ds_resume_pebs_noirq(struct pebs_tracer *tracer);
-
-
-/*
- * The raw DS buffer state as it is used for BTS and PEBS recording.
- *
- * This is the low-level, arch-dependent interface for working
- * directly on the raw trace data.
- */
-struct ds_trace {
- /* the number of bts/pebs records */
- size_t n;
- /* the size of a bts/pebs record in bytes */
- size_t size;
- /* pointers into the raw buffer:
- - to the first entry */
- void *begin;
- /* - one beyond the last entry */
- void *end;
- /* - one beyond the newest entry */
- void *top;
- /* - the interrupt threshold */
- void *ith;
- /* flags given on ds_request() */
- unsigned int flags;
-};
-
-/*
- * An arch-independent view on branch trace data.
- */
-enum bts_qualifier {
- bts_invalid,
-#define BTS_INVALID bts_invalid
-
- bts_branch,
-#define BTS_BRANCH bts_branch
-
- bts_task_arrives,
-#define BTS_TASK_ARRIVES bts_task_arrives
-
- bts_task_departs,
-#define BTS_TASK_DEPARTS bts_task_departs
-
- bts_qual_bit_size = 4,
- bts_qual_max = (1 << bts_qual_bit_size),
-};
-
-struct bts_struct {
- __u64 qualifier;
- union {
- /* BTS_BRANCH */
- struct {
- __u64 from;
- __u64 to;
- } lbr;
- /* BTS_TASK_ARRIVES or BTS_TASK_DEPARTS */
- struct {
- __u64 clock;
- pid_t pid;
- } event;
- } variant;
-};
-
-
-/*
- * The BTS state.
- *
- * This gives access to the raw DS state and adds functions to provide
- * an arch-independent view of the BTS data.
- */
-struct bts_trace {
- struct ds_trace ds;
-
- int (*read)(struct bts_tracer *tracer, const void *at,
- struct bts_struct *out);
- int (*write)(struct bts_tracer *tracer, const struct bts_struct *in);
-};
-
-
-/*
- * The PEBS state.
- *
- * This gives access to the raw DS state and the PEBS-specific counter
- * reset value.
- */
-struct pebs_trace {
- struct ds_trace ds;
-
- /* the number of valid counters in the below array */
- unsigned int counters;
-
-#define MAX_PEBS_COUNTERS 4
- /* the counter reset value */
- unsigned long long counter_reset[MAX_PEBS_COUNTERS];
-};
-
-
-/*
- * Read the BTS or PEBS trace.
- *
- * Returns a view on the trace collected for the parameter tracer.
- *
- * The view remains valid as long as the traced task is not running or
- * the tracer is suspended.
- * Writes into the trace buffer are not reflected.
- *
- * tracer: the tracer handle returned from ds_request_~()
- */
-extern const struct bts_trace *ds_read_bts(struct bts_tracer *tracer);
-extern const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer);
-
-
-/*
- * Reset the write pointer of the BTS/PEBS buffer.
- *
- * Returns 0 on success; -Eerrno on error
- *
- * tracer: the tracer handle returned from ds_request_~()
- */
-extern int ds_reset_bts(struct bts_tracer *tracer);
-extern int ds_reset_pebs(struct pebs_tracer *tracer);
-
-/*
- * Set the PEBS counter reset value.
- *
- * Returns 0 on success; -Eerrno on error
- *
- * tracer: the tracer handle returned from ds_request_pebs()
- * counter: the index of the counter
- * value: the new counter reset value
- */
-extern int ds_set_pebs_reset(struct pebs_tracer *tracer,
- unsigned int counter, u64 value);
-
-/*
- * Initialization
- */
-struct cpuinfo_x86;
-extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *);
-
-/*
- * Context switch work
- */
-extern void ds_switch_to(struct task_struct *prev, struct task_struct *next);
-
-#else /* CONFIG_X86_DS */
-
-struct cpuinfo_x86;
-static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {}
-static inline void ds_switch_to(struct task_struct *prev,
- struct task_struct *next) {}
-
-#endif /* CONFIG_X86_DS */
-#endif /* _ASM_X86_DS_H */
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h
index 3afc5e87cfdd..302e11b15da8 100644
--- a/arch/x86/include/asm/dwarf2.h
+++ b/arch/x86/include/asm/dwarf2.h
@@ -1,19 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_DWARF2_H
#define _ASM_X86_DWARF2_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#warning "asm/dwarf2.h should be only included in pure assembly files"
#endif
-/*
- * Macros for dwarf2 CFI unwind table entries.
- * See "as.info" for details on these pseudo ops. Unfortunately
- * they are only supported in very new binutils, so define them
- * away for older version.
- */
-
-#ifdef CONFIG_AS_CFI
-
#define CFI_STARTPROC .cfi_startproc
#define CFI_ENDPROC .cfi_endproc
#define CFI_DEF_CFA .cfi_def_cfa
@@ -27,70 +19,23 @@
#define CFI_REMEMBER_STATE .cfi_remember_state
#define CFI_RESTORE_STATE .cfi_restore_state
#define CFI_UNDEFINED .cfi_undefined
-
-#ifdef CONFIG_AS_CFI_SIGNAL_FRAME
-#define CFI_SIGNAL_FRAME .cfi_signal_frame
+#define CFI_ESCAPE .cfi_escape
+
+#ifndef BUILD_VDSO
+ /*
+ * Emit CFI data in .debug_frame sections, not .eh_frame sections.
+ * The latter we currently just discard since we don't do DWARF
+ * unwinding at runtime. So only the offline DWARF information is
+ * useful to anyone. Note we should not use this directive if we
+ * ever decide to enable DWARF unwinding at runtime.
+ */
+ .cfi_sections .debug_frame
#else
-#define CFI_SIGNAL_FRAME
+ /*
+ * For the vDSO, emit both runtime unwind information and debug
+ * symbols for the .dbg file.
+ */
+ .cfi_sections .eh_frame, .debug_frame
#endif
-#else
-
-/*
- * Due to the structure of pre-exisiting code, don't use assembler line
- * comment character # to ignore the arguments. Instead, use a dummy macro.
- */
-.macro cfi_ignore a=0, b=0, c=0, d=0
-.endm
-
-#define CFI_STARTPROC cfi_ignore
-#define CFI_ENDPROC cfi_ignore
-#define CFI_DEF_CFA cfi_ignore
-#define CFI_DEF_CFA_REGISTER cfi_ignore
-#define CFI_DEF_CFA_OFFSET cfi_ignore
-#define CFI_ADJUST_CFA_OFFSET cfi_ignore
-#define CFI_OFFSET cfi_ignore
-#define CFI_REL_OFFSET cfi_ignore
-#define CFI_REGISTER cfi_ignore
-#define CFI_RESTORE cfi_ignore
-#define CFI_REMEMBER_STATE cfi_ignore
-#define CFI_RESTORE_STATE cfi_ignore
-#define CFI_UNDEFINED cfi_ignore
-#define CFI_SIGNAL_FRAME cfi_ignore
-
-#endif
-
-/*
- * An attempt to make CFI annotations more or less
- * correct and shorter. It is implied that you know
- * what you're doing if you use them.
- */
-#ifdef __ASSEMBLY__
-#ifdef CONFIG_X86_64
- .macro pushq_cfi reg
- pushq \reg
- CFI_ADJUST_CFA_OFFSET 8
- .endm
-
- .macro popq_cfi reg
- popq \reg
- CFI_ADJUST_CFA_OFFSET -8
- .endm
-
- .macro movq_cfi reg offset=0
- movq %\reg, \offset(%rsp)
- CFI_REL_OFFSET \reg, \offset
- .endm
-
- .macro movq_cfi_restore offset reg
- movq \offset(%rsp), %\reg
- CFI_RESTORE \reg
- .endm
-#else /*!CONFIG_X86_64*/
-
- /* 32bit defenitions are missed yet */
-
-#endif /*!CONFIG_X86_64*/
-#endif /*__ASSEMBLY__*/
-
#endif /* _ASM_X86_DWARF2_H */
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
deleted file mode 100644
index 7ecba4d85089..000000000000
--- a/arch/x86/include/asm/e820.h
+++ /dev/null
@@ -1,147 +0,0 @@
-#ifndef _ASM_X86_E820_H
-#define _ASM_X86_E820_H
-#define E820MAP 0x2d0 /* our map */
-#define E820MAX 128 /* number of entries in E820MAP */
-
-/*
- * Legacy E820 BIOS limits us to 128 (E820MAX) nodes due to the
- * constrained space in the zeropage. If we have more nodes than
- * that, and if we've booted off EFI firmware, then the EFI tables
- * passed us from the EFI firmware can list more nodes. Size our
- * internal memory map tables to have room for these additional
- * nodes, based on up to three entries per node for which the
- * kernel was built: MAX_NUMNODES == (1 << CONFIG_NODES_SHIFT),
- * plus E820MAX, allowing space for the possible duplicate E820
- * entries that might need room in the same arrays, prior to the
- * call to sanitize_e820_map() to remove duplicates. The allowance
- * of three memory map entries per node is "enough" entries for
- * the initial hardware platform motivating this mechanism to make
- * use of additional EFI map entries. Future platforms may want
- * to allow more than three entries per node or otherwise refine
- * this size.
- */
-
-/*
- * Odd: 'make headers_check' complains about numa.h if I try
- * to collapse the next two #ifdef lines to a single line:
- * #if defined(__KERNEL__) && defined(CONFIG_EFI)
- */
-#ifdef __KERNEL__
-#ifdef CONFIG_EFI
-#include <linux/numa.h>
-#define E820_X_MAX (E820MAX + 3 * MAX_NUMNODES)
-#else /* ! CONFIG_EFI */
-#define E820_X_MAX E820MAX
-#endif
-#else /* ! __KERNEL__ */
-#define E820_X_MAX E820MAX
-#endif
-
-#define E820NR 0x1e8 /* # entries in E820MAP */
-
-#define E820_RAM 1
-#define E820_RESERVED 2
-#define E820_ACPI 3
-#define E820_NVS 4
-#define E820_UNUSABLE 5
-
-/* reserved RAM used by kernel itself */
-#define E820_RESERVED_KERN 128
-
-#ifndef __ASSEMBLY__
-#include <linux/types.h>
-struct e820entry {
- __u64 addr; /* start of memory segment */
- __u64 size; /* size of memory segment */
- __u32 type; /* type of memory segment */
-} __attribute__((packed));
-
-struct e820map {
- __u32 nr_map;
- struct e820entry map[E820_X_MAX];
-};
-
-#ifdef __KERNEL__
-/* see comment in arch/x86/kernel/e820.c */
-extern struct e820map e820;
-extern struct e820map e820_saved;
-
-extern unsigned long pci_mem_start;
-extern int e820_any_mapped(u64 start, u64 end, unsigned type);
-extern int e820_all_mapped(u64 start, u64 end, unsigned type);
-extern void e820_add_region(u64 start, u64 size, int type);
-extern void e820_print_map(char *who);
-extern int
-sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, u32 *pnr_map);
-extern u64 e820_update_range(u64 start, u64 size, unsigned old_type,
- unsigned new_type);
-extern u64 e820_remove_range(u64 start, u64 size, unsigned old_type,
- int checktype);
-extern void update_e820(void);
-extern void e820_setup_gap(void);
-extern int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
- unsigned long start_addr, unsigned long long end_addr);
-struct setup_data;
-extern void parse_e820_ext(struct setup_data *data, unsigned long pa_data);
-
-#if defined(CONFIG_X86_64) || \
- (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
-extern void e820_mark_nosave_regions(unsigned long limit_pfn);
-#else
-static inline void e820_mark_nosave_regions(unsigned long limit_pfn)
-{
-}
-#endif
-
-#ifdef CONFIG_MEMTEST
-extern void early_memtest(unsigned long start, unsigned long end);
-#else
-static inline void early_memtest(unsigned long start, unsigned long end)
-{
-}
-#endif
-
-extern unsigned long end_user_pfn;
-
-extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align);
-extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align);
-extern void reserve_early(u64 start, u64 end, char *name);
-extern void reserve_early_overlap_ok(u64 start, u64 end, char *name);
-extern void free_early(u64 start, u64 end);
-extern void early_res_to_bootmem(u64 start, u64 end);
-extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align);
-
-extern unsigned long e820_end_of_ram_pfn(void);
-extern unsigned long e820_end_of_low_ram_pfn(void);
-extern int e820_find_active_region(const struct e820entry *ei,
- unsigned long start_pfn,
- unsigned long last_pfn,
- unsigned long *ei_startpfn,
- unsigned long *ei_endpfn);
-extern void e820_register_active_regions(int nid, unsigned long start_pfn,
- unsigned long end_pfn);
-extern u64 e820_hole_size(u64 start, u64 end);
-extern void finish_e820_parsing(void);
-extern void e820_reserve_resources(void);
-extern void e820_reserve_resources_late(void);
-extern void setup_memory_map(void);
-extern char *default_machine_specific_memory_setup(void);
-extern char *machine_specific_memory_setup(void);
-extern char *memory_setup(void);
-#endif /* __KERNEL__ */
-#endif /* __ASSEMBLY__ */
-
-#define ISA_START_ADDRESS 0xa0000
-#define ISA_END_ADDRESS 0x100000
-#define is_ISA_range(s, e) ((s) >= ISA_START_ADDRESS && (e) < ISA_END_ADDRESS)
-
-#define BIOS_BEGIN 0x000a0000
-#define BIOS_END 0x00100000
-
-#ifdef __KERNEL__
-#include <linux/ioport.h>
-
-#define HIGH_MEMORY (1024*1024)
-#endif /* __KERNEL__ */
-
-#endif /* _ASM_X86_E820_H */
diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h
new file mode 100644
index 000000000000..c83645d5b2a8
--- /dev/null
+++ b/arch/x86/include/asm/e820/api.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_E820_API_H
+#define _ASM_E820_API_H
+
+#include <asm/e820/types.h>
+
+extern struct e820_table *e820_table;
+extern struct e820_table *e820_table_kexec;
+extern struct e820_table *e820_table_firmware;
+
+extern unsigned long pci_mem_start;
+
+extern bool e820__mapped_raw_any(u64 start, u64 end, enum e820_type type);
+extern bool e820__mapped_any(u64 start, u64 end, enum e820_type type);
+extern bool e820__mapped_all(u64 start, u64 end, enum e820_type type);
+
+extern void e820__range_add (u64 start, u64 size, enum e820_type type);
+extern u64 e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type);
+extern u64 e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type);
+extern u64 e820__range_update_table(struct e820_table *t, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type);
+
+extern void e820__print_table(char *who);
+extern int e820__update_table(struct e820_table *table);
+extern void e820__update_table_print(void);
+
+extern unsigned long e820__end_of_ram_pfn(void);
+extern unsigned long e820__end_of_low_ram_pfn(void);
+
+extern u64 e820__memblock_alloc_reserved(u64 size, u64 align);
+extern void e820__memblock_setup(void);
+
+extern void e820__finish_early_params(void);
+extern void e820__reserve_resources(void);
+extern void e820__reserve_resources_late(void);
+
+extern void e820__memory_setup(void);
+extern void e820__memory_setup_extended(u64 phys_addr, u32 data_len);
+extern char *e820__memory_setup_default(void);
+extern void e820__setup_pci_gap(void);
+
+extern void e820__reallocate_tables(void);
+extern void e820__register_nosave_regions(unsigned long limit_pfn);
+
+extern int e820__get_entry_type(u64 start, u64 end);
+
+/*
+ * Returns true iff the specified range [start,end) is completely contained inside
+ * the ISA region.
+ */
+static inline bool is_ISA_range(u64 start, u64 end)
+{
+ return start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS;
+}
+
+#endif /* _ASM_E820_API_H */
diff --git a/arch/x86/include/asm/e820/types.h b/arch/x86/include/asm/e820/types.h
new file mode 100644
index 000000000000..80c4a7266629
--- /dev/null
+++ b/arch/x86/include/asm/e820/types.h
@@ -0,0 +1,104 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_E820_TYPES_H
+#define _ASM_E820_TYPES_H
+
+#include <uapi/asm/bootparam.h>
+
+/*
+ * These are the E820 types known to the kernel:
+ */
+enum e820_type {
+ E820_TYPE_RAM = 1,
+ E820_TYPE_RESERVED = 2,
+ E820_TYPE_ACPI = 3,
+ E820_TYPE_NVS = 4,
+ E820_TYPE_UNUSABLE = 5,
+ E820_TYPE_PMEM = 7,
+
+ /*
+ * This is a non-standardized way to represent ADR or
+ * NVDIMM regions that persist over a reboot.
+ *
+ * The kernel will ignore their special capabilities
+ * unless the CONFIG_X86_PMEM_LEGACY=y option is set.
+ *
+ * ( Note that older platforms also used 6 for the same
+ * type of memory, but newer versions switched to 12 as
+ * 6 was assigned differently. Some time they will learn... )
+ */
+ E820_TYPE_PRAM = 12,
+
+ /*
+ * Special-purpose memory is indicated to the system via the
+ * EFI_MEMORY_SP attribute. Define an e820 translation of this
+ * memory type for the purpose of reserving this range and
+ * marking it with the IORES_DESC_SOFT_RESERVED designation.
+ */
+ E820_TYPE_SOFT_RESERVED = 0xefffffff,
+};
+
+/*
+ * A single E820 map entry, describing a memory range of [addr...addr+size-1],
+ * of 'type' memory type:
+ *
+ * (We pack it because there can be thousands of them on large systems.)
+ */
+struct e820_entry {
+ u64 addr;
+ u64 size;
+ enum e820_type type;
+} __attribute__((packed));
+
+/*
+ * The legacy E820 BIOS limits us to 128 (E820_MAX_ENTRIES_ZEROPAGE) nodes
+ * due to the constrained space in the zeropage.
+ *
+ * On large systems we can easily have thousands of nodes with RAM,
+ * which cannot be fit into so few entries - so we have a mechanism
+ * to extend the e820 table size at build-time, via the E820_MAX_ENTRIES
+ * define below.
+ *
+ * ( Those extra entries are enumerated via the EFI memory map, not
+ * via the legacy zeropage mechanism. )
+ *
+ * Size our internal memory map tables to have room for these additional
+ * entries, based on a heuristic calculation: up to three entries per
+ * NUMA node, plus E820_MAX_ENTRIES_ZEROPAGE for some extra space.
+ *
+ * This allows for bootstrap/firmware quirks such as possible duplicate
+ * E820 entries that might need room in the same arrays, prior to the
+ * call to e820__update_table() to remove duplicates. The allowance
+ * of three memory map entries per node is "enough" entries for
+ * the initial hardware platform motivating this mechanism to make
+ * use of additional EFI map entries. Future platforms may want
+ * to allow more than three entries per node or otherwise refine
+ * this size.
+ */
+
+#include <linux/numa.h>
+
+#define E820_MAX_ENTRIES (E820_MAX_ENTRIES_ZEROPAGE + 3*MAX_NUMNODES)
+
+/*
+ * The whole array of E820 entries:
+ */
+struct e820_table {
+ __u32 nr_entries;
+ struct e820_entry entries[E820_MAX_ENTRIES];
+};
+
+/*
+ * Various well-known legacy memory ranges in physical memory:
+ */
+#define ISA_START_ADDRESS 0x000a0000
+#define ISA_END_ADDRESS 0x00100000
+
+#define BIOS_BEGIN 0x000a0000
+#define BIOS_END 0x00100000
+
+#define HIGH_MEMORY 0x00100000
+
+#define BIOS_ROM_BASE 0xffe00000
+#define BIOS_ROM_END 0xffffffff
+
+#endif /* _ASM_E820_TYPES_H */
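
For a feel of what this sizing costs, a back-of-the-envelope sketch; E820_MAX_ENTRIES_ZEROPAGE is assumed to be 128 (the legacy E820MAX) and NODES_SHIFT = 10 is just an example configuration:

/* Illustrative arithmetic only; the constants are assumptions. */
#include <stdio.h>

int main(void)
{
	unsigned int zeropage_entries = 128;		/* legacy zeropage limit */
	unsigned int max_numnodes = 1 << 10;		/* example NODES_SHIFT=10 */
	unsigned int max_entries = zeropage_entries + 3 * max_numnodes;
	unsigned long entry_bytes = 8 + 8 + 4;		/* packed e820_entry */

	printf("E820_MAX_ENTRIES = %u (~%lu KiB per table)\n",
	       max_entries, max_entries * entry_bytes / 1024);
	return 0;
}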
diff --git a/arch/x86/include/asm/edac.h b/arch/x86/include/asm/edac.h
index e9b57ecc70c5..dfbd1ebb9f10 100644
--- a/arch/x86/include/asm/edac.h
+++ b/arch/x86/include/asm/edac.h
@@ -1,9 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_EDAC_H
#define _ASM_X86_EDAC_H
/* ECC atomic, DMA, SMP and interrupt safe scrub function */
-static inline void atomic_scrub(void *va, u32 size)
+static inline void edac_atomic_scrub(void *va, u32 size)
{
u32 i, *virt_addr = va;
@@ -12,7 +13,7 @@ static inline void atomic_scrub(void *va, u32 size)
* are interrupt, DMA and SMP safe.
*/
for (i = 0; i < size / 4; i++, virt_addr++)
- asm volatile("lock; addl $0, %0"::"m" (*virt_addr));
+ asm volatile("lock addl $0, %0"::"m" (*virt_addr));
}
#endif /* _ASM_X86_EDAC_H */
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 8406ed7f9926..f227a70ac91f 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -1,110 +1,430 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_EFI_H
#define _ASM_X86_EFI_H
-#ifdef CONFIG_X86_32
+#include <asm/fpu/api.h>
+#include <asm/processor-flags.h>
+#include <asm/tlb.h>
+#include <asm/nospec-branch.h>
+#include <asm/mmu_context.h>
+#include <asm/ibt.h>
+#include <linux/build_bug.h>
+#include <linux/kernel.h>
+#include <linux/pgtable.h>
+
+extern unsigned long efi_fw_vendor, efi_config_table;
+extern unsigned long efi_mixed_mode_stack_pa;
+
+/*
+ * We map the EFI regions needed for runtime services non-contiguously,
+ * with preserved alignment on virtual addresses starting from -4G down
+ * for a total max space of 64G. This way, we provide for stable runtime
+ * services addresses across kernels so that a kexec'd kernel can still
+ * use them.
+ *
+ * This is the main reason why we're doing stable VA mappings for RT
+ * services.
+ */
+
+#define EFI32_LOADER_SIGNATURE "EL32"
+#define EFI64_LOADER_SIGNATURE "EL64"
+
+#define ARCH_EFI_IRQ_FLAGS_MASK X86_EFLAGS_IF
+
+#define EFI_UNACCEPTED_UNIT_SIZE PMD_SIZE
+
+/*
+ * The EFI services are called through variadic functions in many cases. These
+ * functions are implemented in assembler and support only a fixed number of
+ * arguments. The macros below allow us to check at build time that we don't
+ * try to call them with too many arguments.
+ *
+ * __efi_nargs() will return the number of arguments if it is 7 or less, and
+ * cause a BUILD_BUG otherwise. The limitations of the C preprocessor make it
+ * impossible to calculate the exact number of arguments beyond some
+ * pre-defined limit. The maximum number of arguments currently supported by
+ * any of the thunks is 7, so this is good enough for now and can be extended
+ * in the obvious way if we ever need more.
+ */
+
+#define __efi_nargs(...) __efi_nargs_(__VA_ARGS__)
+#define __efi_nargs_(...) __efi_nargs__(0, ##__VA_ARGS__, \
+ __efi_arg_sentinel(9), __efi_arg_sentinel(8), \
+ __efi_arg_sentinel(7), __efi_arg_sentinel(6), \
+ __efi_arg_sentinel(5), __efi_arg_sentinel(4), \
+ __efi_arg_sentinel(3), __efi_arg_sentinel(2), \
+ __efi_arg_sentinel(1), __efi_arg_sentinel(0))
+#define __efi_nargs__(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, n, ...) \
+ __take_second_arg(n, \
+ ({ BUILD_BUG_ON_MSG(1, "__efi_nargs limit exceeded"); 10; }))
+#define __efi_arg_sentinel(n) , n
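
The sentinel list above implements the usual preprocessor argument-counting trick. A reduced stand-alone version, compile-time only (GNU C ', ##__VA_ARGS__' behaviour assumed; the real macro additionally turns an over-long list into a BUILD_BUG):

/* Sketch: count up to 9 arguments at preprocessing time. */
#define NARGS(...)	NARGS_(0, ##__VA_ARGS__, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
#define NARGS_(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, n, ...)	n

_Static_assert(NARGS() == 0,        "empty list counts as zero");
_Static_assert(NARGS(a) == 1,       "one argument");
_Static_assert(NARGS(a, b, c) == 3, "three arguments");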
-extern unsigned long asmlinkage efi_call_phys(void *, ...);
-
-#define efi_call_phys0(f) efi_call_phys(f)
-#define efi_call_phys1(f, a1) efi_call_phys(f, a1)
-#define efi_call_phys2(f, a1, a2) efi_call_phys(f, a1, a2)
-#define efi_call_phys3(f, a1, a2, a3) efi_call_phys(f, a1, a2, a3)
-#define efi_call_phys4(f, a1, a2, a3, a4) \
- efi_call_phys(f, a1, a2, a3, a4)
-#define efi_call_phys5(f, a1, a2, a3, a4, a5) \
- efi_call_phys(f, a1, a2, a3, a4, a5)
-#define efi_call_phys6(f, a1, a2, a3, a4, a5, a6) \
- efi_call_phys(f, a1, a2, a3, a4, a5, a6)
/*
- * Wrap all the virtual calls in a way that forces the parameters on the stack.
+ * __efi_nargs_check(f, n, ...) will cause a BUILD_BUG if the ellipsis
+ * represents more than n arguments.
*/
-#define efi_call_virt(f, args...) \
- ((efi_##f##_t __attribute__((regparm(0)))*)efi.systab->runtime->f)(args)
+#define __efi_nargs_check(f, n, ...) \
+ __efi_nargs_check_(f, __efi_nargs(__VA_ARGS__), n)
+#define __efi_nargs_check_(f, p, n) __efi_nargs_check__(f, p, n)
+#define __efi_nargs_check__(f, p, n) ({ \
+ BUILD_BUG_ON_MSG( \
+ (p) > (n), \
+ #f " called with too many arguments (" #p ">" #n ")"); \
+})
-#define efi_call_virt0(f) efi_call_virt(f)
-#define efi_call_virt1(f, a1) efi_call_virt(f, a1)
-#define efi_call_virt2(f, a1, a2) efi_call_virt(f, a1, a2)
-#define efi_call_virt3(f, a1, a2, a3) efi_call_virt(f, a1, a2, a3)
-#define efi_call_virt4(f, a1, a2, a3, a4) \
- efi_call_virt(f, a1, a2, a3, a4)
-#define efi_call_virt5(f, a1, a2, a3, a4, a5) \
- efi_call_virt(f, a1, a2, a3, a4, a5)
-#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \
- efi_call_virt(f, a1, a2, a3, a4, a5, a6)
+static inline void efi_fpu_begin(void)
+{
+ /*
+ * The UEFI calling convention (UEFI spec 2.3.2 and 2.3.4) requires
+ * that FCW and MXCSR (64-bit) must be initialized prior to calling
+ * UEFI code. (Oddly the spec does not require that the FPU stack
+ * be empty.)
+ */
+ kernel_fpu_begin_mask(KFPU_387 | KFPU_MXCSR);
+}
-#define efi_ioremap(addr, size, type) ioremap_cache(addr, size)
+static inline void efi_fpu_end(void)
+{
+ kernel_fpu_end();
+}
+#ifdef CONFIG_X86_32
+#define EFI_X86_KERNEL_ALLOC_LIMIT (SZ_512M - 1)
#else /* !CONFIG_X86_32 */
+#define EFI_X86_KERNEL_ALLOC_LIMIT EFI_ALLOC_LIMIT
+
+extern asmlinkage u64 __efi_call(void *fp, ...);
-extern u64 efi_call0(void *fp);
-extern u64 efi_call1(void *fp, u64 arg1);
-extern u64 efi_call2(void *fp, u64 arg1, u64 arg2);
-extern u64 efi_call3(void *fp, u64 arg1, u64 arg2, u64 arg3);
-extern u64 efi_call4(void *fp, u64 arg1, u64 arg2, u64 arg3, u64 arg4);
-extern u64 efi_call5(void *fp, u64 arg1, u64 arg2, u64 arg3,
- u64 arg4, u64 arg5);
-extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3,
- u64 arg4, u64 arg5, u64 arg6);
-
-#define efi_call_phys0(f) \
- efi_call0((void *)(f))
-#define efi_call_phys1(f, a1) \
- efi_call1((void *)(f), (u64)(a1))
-#define efi_call_phys2(f, a1, a2) \
- efi_call2((void *)(f), (u64)(a1), (u64)(a2))
-#define efi_call_phys3(f, a1, a2, a3) \
- efi_call3((void *)(f), (u64)(a1), (u64)(a2), (u64)(a3))
-#define efi_call_phys4(f, a1, a2, a3, a4) \
- efi_call4((void *)(f), (u64)(a1), (u64)(a2), (u64)(a3), \
- (u64)(a4))
-#define efi_call_phys5(f, a1, a2, a3, a4, a5) \
- efi_call5((void *)(f), (u64)(a1), (u64)(a2), (u64)(a3), \
- (u64)(a4), (u64)(a5))
-#define efi_call_phys6(f, a1, a2, a3, a4, a5, a6) \
- efi_call6((void *)(f), (u64)(a1), (u64)(a2), (u64)(a3), \
- (u64)(a4), (u64)(a5), (u64)(a6))
-
-#define efi_call_virt0(f) \
- efi_call0((void *)(efi.systab->runtime->f))
-#define efi_call_virt1(f, a1) \
- efi_call1((void *)(efi.systab->runtime->f), (u64)(a1))
-#define efi_call_virt2(f, a1, a2) \
- efi_call2((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2))
-#define efi_call_virt3(f, a1, a2, a3) \
- efi_call3((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
- (u64)(a3))
-#define efi_call_virt4(f, a1, a2, a3, a4) \
- efi_call4((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
- (u64)(a3), (u64)(a4))
-#define efi_call_virt5(f, a1, a2, a3, a4, a5) \
- efi_call5((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
- (u64)(a3), (u64)(a4), (u64)(a5))
-#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \
- efi_call6((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
- (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6))
-
-extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size,
- u32 type);
+extern bool efi_disable_ibt_for_runtime;
+
+#define efi_call(...) ({ \
+ __efi_nargs_check(efi_call, 7, __VA_ARGS__); \
+ __efi_call(__VA_ARGS__); \
+})
+
+#undef arch_efi_call_virt
+#define arch_efi_call_virt(p, f, args...) ({ \
+ u64 ret, ibt = ibt_save(efi_disable_ibt_for_runtime); \
+ ret = efi_call((void *)p->f, args); \
+ ibt_restore(ibt); \
+ ret; \
+})
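/*
 * Editorial note, not part of the patch: for a hypothetical runtime table
 * pointer rt, arch_efi_call_virt(rt, get_time, tm, tc) above effectively
 * performs
 *
 *	ibt = ibt_save(efi_disable_ibt_for_runtime);
 *	ret = efi_call((void *)rt->get_time, tm, tc);
 *	ibt_restore(ibt);
 *
 * i.e. IBT enforcement is suspended around firmware code that may lack
 * ENDBR landing pads, and restored before returning to the kernel.
 */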
+
+#ifdef CONFIG_KASAN
+/*
+ * CONFIG_KASAN may redefine memset to __memset. The __memset() function is
+ * present only in the kernel binary. Since the EFI stub is linked into a
+ * separate binary, it doesn't have __memset(). So we should use the standard
+ * memset from arch/x86/boot/compressed/string.c. The same applies to memcpy
+ * and memmove.
+ */
+#undef memcpy
+#undef memset
+#undef memmove
+#endif
#endif /* CONFIG_X86_32 */
-extern int add_efi_memmap;
-extern void efi_reserve_early(void);
-extern void efi_call_phys_prelog(void);
-extern void efi_call_phys_epilog(void);
+extern int __init efi_memblock_x86_reserve_range(void);
+extern void __init efi_print_memmap(void);
+extern void __init efi_map_region(efi_memory_desc_t *md);
+extern void __init efi_map_region_fixed(efi_memory_desc_t *md);
+extern void efi_sync_low_kernel_mappings(void);
+extern int __init efi_alloc_page_tables(void);
+extern int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages);
+extern void __init efi_runtime_update_mappings(void);
+extern void __init efi_dump_pagetable(void);
+extern void __init efi_apply_memmap_quirks(void);
+extern int __init efi_reuse_config(u64 tables, int nr_tables);
+extern void efi_delete_dummy_variable(void);
+extern void efi_crash_gracefully_on_page_fault(unsigned long phys_addr);
+extern void efi_free_boot_services(void);
+
+void arch_efi_call_virt_setup(void);
+void arch_efi_call_virt_teardown(void);
+
+extern u64 efi_setup;
+
+#ifdef CONFIG_EFI
+extern u64 __efi64_thunk(u32, ...);
+
+#define efi64_thunk(...) ({ \
+ u64 __pad[3]; /* must have space for 3 args on the stack */ \
+ __efi_nargs_check(efi64_thunk, 9, __VA_ARGS__); \
+ __efi64_thunk(__VA_ARGS__, __pad); \
+})
+
+static inline bool efi_is_mixed(void)
+{
+ if (!IS_ENABLED(CONFIG_EFI_MIXED))
+ return false;
+ return IS_ENABLED(CONFIG_X86_64) && !efi_enabled(EFI_64BIT);
+}
+
+static inline bool efi_runtime_supported(void)
+{
+ if (IS_ENABLED(CONFIG_X86_64) == efi_enabled(EFI_64BIT))
+ return true;
+
+ return IS_ENABLED(CONFIG_EFI_MIXED);
+}
+
+extern void parse_efi_setup(u64 phys_addr, u32 data_len);
+
+extern void efi_thunk_runtime_setup(void);
+efi_status_t efi_set_virtual_address_map(unsigned long memory_map_size,
+ unsigned long descriptor_size,
+ u32 descriptor_version,
+ efi_memory_desc_t *virtual_map,
+ unsigned long systab_phys);
+
+/* arch specific definitions used by the stub code */
+
+#ifdef CONFIG_EFI_MIXED
+
+#define EFI_ALLOC_LIMIT (efi_is_64bit() ? ULONG_MAX : U32_MAX)
+
+#define ARCH_HAS_EFISTUB_WRAPPERS
+
+static inline bool efi_is_64bit(void)
+{
+ extern const bool efi_is64;
+
+ return efi_is64;
+}
+
+static inline bool efi_is_native(void)
+{
+ return efi_is_64bit();
+}
+
+#define efi_table_attr(inst, attr) \
+ (efi_is_native() ? (inst)->attr \
+ : efi_mixed_table_attr((inst), attr))
+
+#define efi_mixed_table_attr(inst, attr) \
+ (__typeof__(inst->attr)) \
+ _Generic(inst->mixed_mode.attr, \
+ u32: (unsigned long)(inst->mixed_mode.attr), \
+ default: (inst->mixed_mode.attr))
+
+/*
+ * The following macros allow translating arguments if necessary from native to
+ * mixed mode. The use cases for this are initializing the upper 32 bits of
+ * output parameters, and splitting up a 64-bit argument into two 32-bit
+ * arguments where the 32-bit method still takes a 64-bit parameter, so that
+ * it can be thunked properly.
+ *
+ * As examples, the AllocatePool boot service returns the address of the
+ * allocation, but it will not set the high 32 bits of the address. To ensure
+ * that the full 64-bit address is initialized, we zero-init the address before
+ * calling the thunk.
+ *
+ * The FreePages boot service takes a 64-bit physical address even in 32-bit
+ * mode. For the thunk to work correctly, a native 64-bit call of
+ * free_pages(addr, size)
+ * must be translated to
+ * efi64_thunk(free_pages, addr & U32_MAX, addr >> 32, size)
+ * so that the two 32-bit halves of addr get pushed onto the stack separately.
+ */
+
+static inline void *efi64_zero_upper(void *p)
+{
+ if (p)
+ ((u32 *)p)[1] = 0;
+ return p;
+}
+
+static inline u32 efi64_convert_status(efi_status_t status)
+{
+ return (u32)(status | (u64)status >> 32);
+}
+
+#define __efi64_split(val) (val) & U32_MAX, (u64)(val) >> 32
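/*
 * Editorial worked example, not part of the patch:
 * __efi64_split(0x123456789ULL) expands to the argument pair
 * 0x23456789, 0x1 - the low and high 32-bit halves - which is how a single
 * 64-bit parameter is handed to a 32-bit, stack-based EFI method through
 * the thunk.
 */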
-#ifndef CONFIG_EFI
+#define __efi64_argmap_free_pages(addr, size) \
+ ((addr), 0, (size))
+
+#define __efi64_argmap_get_memory_map(mm_size, mm, key, size, ver) \
+ ((mm_size), (mm), efi64_zero_upper(key), efi64_zero_upper(size), (ver))
+
+#define __efi64_argmap_allocate_pool(type, size, buffer) \
+ ((type), (size), efi64_zero_upper(buffer))
+
+#define __efi64_argmap_locate_handle_buffer(type, proto, key, num, buf) \
+ ((type), (proto), (key), efi64_zero_upper(num), efi64_zero_upper(buf))
+
+#define __efi64_argmap_create_event(type, tpl, f, c, event) \
+ ((type), (tpl), (f), (c), efi64_zero_upper(event))
+
+#define __efi64_argmap_set_timer(event, type, time) \
+ ((event), (type), lower_32_bits(time), upper_32_bits(time))
+
+#define __efi64_argmap_wait_for_event(num, event, index) \
+ ((num), (event), efi64_zero_upper(index))
+
+#define __efi64_argmap_handle_protocol(handle, protocol, interface) \
+ ((handle), (protocol), efi64_zero_upper(interface))
+
+#define __efi64_argmap_locate_protocol(protocol, reg, interface) \
+ ((protocol), (reg), efi64_zero_upper(interface))
+
+#define __efi64_argmap_locate_device_path(protocol, path, handle) \
+ ((protocol), (path), efi64_zero_upper(handle))
+
+#define __efi64_argmap_exit(handle, status, size, data) \
+ ((handle), efi64_convert_status(status), (size), (data))
+
+/* PCI I/O */
+#define __efi64_argmap_get_location(protocol, seg, bus, dev, func) \
+ ((protocol), efi64_zero_upper(seg), efi64_zero_upper(bus), \
+ efi64_zero_upper(dev), efi64_zero_upper(func))
+
+/* LoadFile */
+#define __efi64_argmap_load_file(protocol, path, policy, bufsize, buf) \
+ ((protocol), (path), (policy), efi64_zero_upper(bufsize), (buf))
+
+/* Graphics Output Protocol */
+#define __efi64_argmap_query_mode(gop, mode, size, info) \
+ ((gop), (mode), efi64_zero_upper(size), efi64_zero_upper(info))
+
+/* TCG2 protocol */
+#define __efi64_argmap_hash_log_extend_event(prot, fl, addr, size, ev) \
+ ((prot), (fl), 0ULL, (u64)(addr), 0ULL, (u64)(size), 0ULL, ev)
+
+/* DXE services */
+#define __efi64_argmap_get_memory_space_descriptor(phys, desc) \
+ (__efi64_split(phys), (desc))
+
+#define __efi64_argmap_set_memory_space_attributes(phys, size, flags) \
+ (__efi64_split(phys), __efi64_split(size), __efi64_split(flags))
+
+/* file protocol */
+#define __efi64_argmap_open(prot, newh, fname, mode, attr) \
+ ((prot), efi64_zero_upper(newh), (fname), __efi64_split(mode), \
+ __efi64_split(attr))
+
+#define __efi64_argmap_set_position(pos) (__efi64_split(pos))
+
+/* file system protocol */
+#define __efi64_argmap_open_volume(prot, file) \
+ ((prot), efi64_zero_upper(file))
+
+/* Memory Attribute Protocol */
+#define __efi64_argmap_get_memory_attributes(protocol, phys, size, flags) \
+ ((protocol), __efi64_split(phys), __efi64_split(size), (flags))
+
+#define __efi64_argmap_set_memory_attributes(protocol, phys, size, flags) \
+ ((protocol), __efi64_split(phys), __efi64_split(size), __efi64_split(flags))
+
+#define __efi64_argmap_clear_memory_attributes(protocol, phys, size, flags) \
+ ((protocol), __efi64_split(phys), __efi64_split(size), __efi64_split(flags))
+
+/* EFI SMBIOS protocol */
+#define __efi64_argmap_get_next(protocol, smbioshandle, type, record, phandle) \
+ ((protocol), (smbioshandle), (type), efi64_zero_upper(record), \
+ efi64_zero_upper(phandle))
/*
- * IF EFI is not configured, have the EFI calls return -ENOSYS.
+ * The macros below handle the plumbing for the argument mapping. To add a
+ * mapping for a specific EFI method, simply define a macro
+ * __efi64_argmap_<method name>, following the examples above.
*/
-#define efi_call0(_f) (-ENOSYS)
-#define efi_call1(_f, _a1) (-ENOSYS)
-#define efi_call2(_f, _a1, _a2) (-ENOSYS)
-#define efi_call3(_f, _a1, _a2, _a3) (-ENOSYS)
-#define efi_call4(_f, _a1, _a2, _a3, _a4) (-ENOSYS)
-#define efi_call5(_f, _a1, _a2, _a3, _a4, _a5) (-ENOSYS)
-#define efi_call6(_f, _a1, _a2, _a3, _a4, _a5, _a6) (-ENOSYS)
+
+#define __efi64_thunk_map(inst, func, ...) \
+ efi64_thunk(inst->mixed_mode.func, \
+ __efi64_argmap(__efi64_argmap_ ## func(__VA_ARGS__), \
+ (__VA_ARGS__)))
+
+#define __efi64_argmap(mapped, args) \
+ __PASTE(__efi64_argmap__, __efi_nargs(__efi_eat mapped))(mapped, args)
+#define __efi64_argmap__0(mapped, args) __efi_eval mapped
+#define __efi64_argmap__1(mapped, args) __efi_eval args
+
+#define __efi_eat(...)
+#define __efi_eval(...) __VA_ARGS__
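/*
 * Editorial sketch, not part of the patch: supporting a hypothetical boot
 * service ->frob(handle, u64 flags, u32 *out) in mixed mode would only need
 * one more mapping alongside the ones above, e.g.
 *
 *	#define __efi64_argmap_frob(handle, flags, out) \
 *		((handle), __efi64_split(flags), efi64_zero_upper(out))
 *
 * __efi64_thunk_map() finds it through the __efi64_argmap_ ## func token
 * paste: when a mapping exists, __efi_eat() swallows its parenthesized
 * expansion, __efi_nargs() sees 0 and __efi64_argmap__0() emits the mapped
 * argument list; when none exists, __efi_nargs() sees 1 and
 * __efi64_argmap__1() passes the original arguments through unchanged.
 */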
+
+static inline efi_status_t __efi64_widen_efi_status(u64 status)
+{
+ /* use rotate to move the value of bit #31 into position #63 */
+ return ror64(rol32(status, 1), 1);
+}
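/*
 * Editorial worked example, not part of the patch: a 32-bit
 * EFI_INVALID_PARAMETER is 0x80000002. rol32(0x80000002, 1) moves the error
 * bit (#31) down to bit #0, giving 0x00000005, and ror64() of that 64-bit
 * value parks it in bit #63, yielding 0x8000000000000002 - the 64-bit
 * encoding of the same status code.
 */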
+
+/* The macro below handles dispatching via the thunk if needed */
+
+#define efi_fn_call(inst, func, ...) \
+ (efi_is_native() ? (inst)->func(__VA_ARGS__) \
+ : efi_mixed_call((inst), func, ##__VA_ARGS__))
+
+#define efi_mixed_call(inst, func, ...) \
+ _Generic(inst->func(__VA_ARGS__), \
+ efi_status_t: \
+ __efi64_widen_efi_status( \
+ __efi64_thunk_map(inst, func, ##__VA_ARGS__)), \
+ u64: ({ BUILD_BUG(); ULONG_MAX; }), \
+ default: \
+ (__typeof__(inst->func(__VA_ARGS__))) \
+ __efi64_thunk_map(inst, func, ##__VA_ARGS__))
+
+#else /* CONFIG_EFI_MIXED */
+
+static inline bool efi_is_64bit(void)
+{
+ return IS_ENABLED(CONFIG_X86_64);
+}
+
+#endif /* CONFIG_EFI_MIXED */
+
+extern bool efi_reboot_required(void);
+extern bool efi_is_table_address(unsigned long phys_addr);
+
+extern void efi_reserve_boot_services(void);
+#else
+static inline void parse_efi_setup(u64 phys_addr, u32 data_len) {}
+static inline bool efi_reboot_required(void)
+{
+ return false;
+}
+static inline bool efi_is_table_address(unsigned long phys_addr)
+{
+ return false;
+}
+static inline void efi_reserve_boot_services(void)
+{
+}
#endif /* CONFIG_EFI */
+extern int __init efi_memmap_alloc(unsigned int num_entries,
+ struct efi_memory_map_data *data);
+
+extern int __init efi_memmap_install(struct efi_memory_map_data *data);
+extern int __init efi_memmap_split_count(efi_memory_desc_t *md,
+ struct range *range);
+extern void __init efi_memmap_insert(struct efi_memory_map *old_memmap,
+ void *buf, struct efi_mem_range *mem);
+
+extern enum efi_secureboot_mode __x86_ima_efi_boot_mode(void);
+
+#define arch_ima_efi_boot_mode __x86_ima_efi_boot_mode()
+
+#ifdef CONFIG_EFI_RUNTIME_MAP
+int efi_get_runtime_map_size(void);
+int efi_get_runtime_map_desc_size(void);
+int efi_runtime_map_copy(void *buf, size_t bufsz);
+#else
+static inline int efi_get_runtime_map_size(void)
+{
+ return 0;
+}
+
+static inline int efi_get_runtime_map_desc_size(void)
+{
+ return 0;
+}
+
+static inline int efi_runtime_map_copy(void *buf, size_t bufsz)
+{
+ return 0;
+}
+
+#endif
+
#endif /* _ASM_X86_EFI_H */
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 83c1bc8d2e8a..6c8fdc96be7e 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -1,13 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_ELF_H
#define _ASM_X86_ELF_H
/*
* ELF register definitions..
*/
+#include <linux/thread_info.h>
+#include <asm/ia32.h>
#include <asm/ptrace.h>
#include <asm/user.h>
#include <asm/auxvec.h>
+#include <asm/fsgsbase.h>
typedef unsigned long elf_greg_t;
@@ -18,8 +22,6 @@ typedef struct user_i387_struct elf_fpregset_t;
#ifdef __i386__
-typedef struct user_fxsr_struct elf_fpxregset_t;
-
#define R_386_NONE 0
#define R_386_32 1
#define R_386_PC32 2
@@ -52,16 +54,16 @@ typedef struct user_fxsr_struct elf_fpxregset_t;
#define R_X86_64_GLOB_DAT 6 /* Create GOT entry */
#define R_X86_64_JUMP_SLOT 7 /* Create PLT entry */
#define R_X86_64_RELATIVE 8 /* Adjust by program base */
-#define R_X86_64_GOTPCREL 9 /* 32 bit signed pc relative
- offset to GOT */
+#define R_X86_64_GOTPCREL 9 /* 32 bit signed pc relative offset to GOT */
+#define R_X86_64_GOTPCRELX 41
+#define R_X86_64_REX_GOTPCRELX 42
#define R_X86_64_32 10 /* Direct 32 bit zero extended */
#define R_X86_64_32S 11 /* Direct 32 bit sign extended */
#define R_X86_64_16 12 /* Direct 16 bit zero extended */
#define R_X86_64_PC16 13 /* 16 bit sign extended pc relative */
#define R_X86_64_8 14 /* Direct 8 bit sign extended */
#define R_X86_64_PC8 15 /* 8 bit sign extended pc relative */
-
-#define R_X86_64_NUM 16
+#define R_X86_64_PC64 24 /* Place relative 64-bit signed */
/*
* These are used to set parameters in the core dumps.
@@ -74,7 +76,8 @@ typedef struct user_fxsr_struct elf_fpxregset_t;
#include <asm/vdso.h>
-extern unsigned int vdso_enabled;
+extern unsigned int vdso64_enabled;
+extern unsigned int vdso32_enabled;
/*
* This is used to ensure we don't load something for the wrong architecture.
@@ -83,7 +86,6 @@ extern unsigned int vdso_enabled;
(((x)->e_machine == EM_386) || ((x)->e_machine == EM_486))
#include <asm/processor.h>
-#include <asm/system.h>
#ifdef CONFIG_X86_32
#include <asm/desc.h>
@@ -112,7 +114,7 @@ extern unsigned int vdso_enabled;
* now struct_user_regs, they are different)
*/
-#define ELF_CORE_COPY_REGS_COMMON(pr_reg, regs) \
+#define ELF_CORE_COPY_REGS(pr_reg, regs) \
do { \
pr_reg[0] = regs->bx; \
pr_reg[1] = regs->cx; \
@@ -121,27 +123,16 @@ do { \
pr_reg[4] = regs->di; \
pr_reg[5] = regs->bp; \
pr_reg[6] = regs->ax; \
- pr_reg[7] = regs->ds & 0xffff; \
- pr_reg[8] = regs->es & 0xffff; \
- pr_reg[9] = regs->fs & 0xffff; \
+ pr_reg[7] = regs->ds; \
+ pr_reg[8] = regs->es; \
+ pr_reg[9] = regs->fs; \
+ savesegment(gs, pr_reg[10]); \
pr_reg[11] = regs->orig_ax; \
pr_reg[12] = regs->ip; \
- pr_reg[13] = regs->cs & 0xffff; \
+ pr_reg[13] = regs->cs; \
pr_reg[14] = regs->flags; \
pr_reg[15] = regs->sp; \
- pr_reg[16] = regs->ss & 0xffff; \
-} while (0);
-
-#define ELF_CORE_COPY_REGS(pr_reg, regs) \
-do { \
- ELF_CORE_COPY_REGS_COMMON(pr_reg, regs);\
- pr_reg[10] = get_user_gs(regs); \
-} while (0);
-
-#define ELF_CORE_COPY_KERNEL_REGS(pr_reg, regs) \
-do { \
- ELF_CORE_COPY_REGS_COMMON(pr_reg, regs);\
- savesegment(gs, pr_reg[10]); \
+ pr_reg[16] = regs->ss; \
} while (0);
#define ELF_PLATFORM (utsname()->machine)
@@ -155,56 +146,36 @@ do { \
#define elf_check_arch(x) \
((x)->e_machine == EM_X86_64)
-#define compat_elf_check_arch(x) elf_check_arch_ia32(x)
-
-static inline void start_ia32_thread(struct pt_regs *regs, u32 ip, u32 sp)
-{
- loadsegment(fs, 0);
- loadsegment(ds, __USER32_DS);
- loadsegment(es, __USER32_DS);
- load_gs_index(0);
- regs->ip = ip;
- regs->sp = sp;
- regs->flags = X86_EFLAGS_IF;
- regs->cs = __USER32_CS;
- regs->ss = __USER32_DS;
-}
+#define compat_elf_check_arch(x) \
+ ((elf_check_arch_ia32(x) && ia32_enabled_verbose()) || \
+ (IS_ENABLED(CONFIG_X86_X32_ABI) && (x)->e_machine == EM_X86_64))
static inline void elf_common_init(struct thread_struct *t,
struct pt_regs *regs, const u16 ds)
{
- regs->ax = regs->bx = regs->cx = regs->dx = 0;
+ /* ax gets execve's return value. */
+ /*regs->ax = */ regs->bx = regs->cx = regs->dx = 0;
regs->si = regs->di = regs->bp = 0;
regs->r8 = regs->r9 = regs->r10 = regs->r11 = 0;
regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;
- t->fs = t->gs = 0;
+ t->fsbase = t->gsbase = 0;
t->fsindex = t->gsindex = 0;
t->ds = t->es = ds;
}
#define ELF_PLAT_INIT(_r, load_addr) \
-do { \
- elf_common_init(&current->thread, _r, 0); \
- clear_thread_flag(TIF_IA32); \
-} while (0)
+ elf_common_init(&current->thread, _r, 0)
#define COMPAT_ELF_PLAT_INIT(regs, load_addr) \
elf_common_init(&current->thread, regs, __USER_DS)
-#define compat_start_thread(regs, ip, sp) \
-do { \
- start_ia32_thread(regs, ip, sp); \
- set_fs(USER_DS); \
-} while (0)
+void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32);
+#define COMPAT_START_THREAD(ex, regs, new_ip, new_sp) \
+ compat_start_thread(regs, new_ip, new_sp, ex->e_machine == EM_X86_64)
+void set_personality_ia32(bool);
#define COMPAT_SET_PERSONALITY(ex) \
-do { \
- if (test_thread_flag(TIF_IA32)) \
- clear_thread_flag(TIF_ABI_PENDING); \
- else \
- set_thread_flag(TIF_ABI_PENDING); \
- current->personality |= force_personality32; \
-} while (0)
+ set_personality_ia32((ex).e_machine == EM_X86_64)
#define COMPAT_ELF_PLATFORM ("i686")
@@ -238,8 +209,8 @@ do { \
(pr_reg)[18] = (regs)->flags; \
(pr_reg)[19] = (regs)->sp; \
(pr_reg)[20] = (regs)->ss; \
- (pr_reg)[21] = current->thread.fs; \
- (pr_reg)[22] = current->thread.gs; \
+ (pr_reg)[21] = x86_fsbase_read_cpu(); \
+ (pr_reg)[22] = x86_gsbase_read_cpu_inactive(); \
asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \
asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \
asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \
@@ -249,27 +220,35 @@ do { \
/* I'm not sure if we can use '-' here */
#define ELF_PLATFORM ("x86_64")
extern void set_personality_64bit(void);
-extern unsigned int sysctl_vsyscall32;
extern int force_personality32;
#endif /* !CONFIG_X86_32 */
#define CORE_DUMP_USE_REGSET
-#define USE_ELF_CORE_DUMP
#define ELF_EXEC_PAGESIZE 4096
-/* This is the location that an ET_DYN program is loaded if exec'ed. Typical
- use of this is to invoke "./ld.so someprog" to test out a new version of
- the loader. We need to make sure that it is out of the way of the program
- that it will "exec", and that there is sufficient room for the brk. */
-
-#define ELF_ET_DYN_BASE (TASK_SIZE / 3 * 2)
+/*
+ * This is the base location for PIE (ET_DYN with INTERP) loads. On
+ * 64-bit, this is above 4GB to leave the entire 32-bit address
+ * space open for things that want to use the area for 32-bit pointers.
+ */
+#define ELF_ET_DYN_BASE (mmap_is_ia32() ? 0x000400000UL : \
+ (DEFAULT_MAP_WINDOW / 3 * 2))
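/*
 * Editorial note, not part of the patch: with 4-level paging, where
 * DEFAULT_MAP_WINDOW is ((1UL << 47) - PAGE_SIZE), the 64-bit branch above
 * evaluates to 0x555555554aaa, which after page rounding by the ELF loader
 * gives the familiar 0x5555_5555_xxxx PIE load addresses seen in
 * /proc/<pid>/maps.
 */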
/* This yields a mask that user programs can use to figure out what
instruction set this CPU supports. This could be done in user space,
but it's not easy, and we've already done it here. */
-#define ELF_HWCAP (boot_cpu_data.x86_capability[0])
+#define ELF_HWCAP (boot_cpu_data.x86_capability[CPUID_1_EDX])
+
+extern u32 elf_hwcap2;
+
+/*
+ * HWCAP2 supplies a mask of kernel-enabled CPU features, so that
+ * the application can discover that it can safely use them.
+ * The bits are defined in uapi/asm/hwcap2.h.
+ */
+#define ELF_HWCAP2 (elf_hwcap2)
/* This yields a string that ld.so will use to load implementation
specific libraries for optimization. This is more specific in
@@ -283,45 +262,96 @@ extern int force_personality32;
/*
* An executable for which elf_read_implies_exec() returns TRUE will
* have the READ_IMPLIES_EXEC personality flag set automatically.
+ *
+ * The decision process for determining the results is:
+ *
+ * CPU: | lacks NX* | has NX, ia32 | has NX, x86_64 |
+ * ELF: | | | |
+ * ---------------------|------------|------------------|----------------|
+ * missing PT_GNU_STACK | exec-all | exec-all | exec-none |
+ * PT_GNU_STACK == RWX | exec-stack | exec-stack | exec-stack |
+ * PT_GNU_STACK == RW | exec-none | exec-none | exec-none |
+ *
+ * exec-all : all PROT_READ user mappings are executable, except when
+ * backed by files on a noexec-filesystem.
+ * exec-none : only PROT_EXEC user mappings are executable.
+ * exec-stack: only the stack and PROT_EXEC user mappings are executable.
+ *
+ * *this column has no architectural effect: NX markings are ignored by
+ * hardware, but may have behavioral effects when "wants X" collides with
+ * "cannot be X" constraints in memory permission flags, as in
+ * https://lkml.kernel.org/r/20190418055759.GA3155@mellanox.com
+ *
*/
#define elf_read_implies_exec(ex, executable_stack) \
- (executable_stack != EXSTACK_DISABLE_X)
+ (mmap_is_ia32() && executable_stack == EXSTACK_DEFAULT)
struct task_struct;
-#define ARCH_DLINFO_IA32(vdso_enabled) \
+#define ARCH_DLINFO_IA32 \
do { \
- if (vdso_enabled) { \
+ if (VDSO_CURRENT_BASE) { \
NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY); \
NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_CURRENT_BASE); \
} \
+ NEW_AUX_ENT(AT_MINSIGSTKSZ, get_sigframe_size()); \
} while (0)
+/*
+ * True on X86_32 or when emulating IA32 on X86_64
+ */
+static inline int mmap_is_ia32(void)
+{
+ return IS_ENABLED(CONFIG_X86_32) ||
+ (IS_ENABLED(CONFIG_COMPAT) &&
+ test_thread_flag(TIF_ADDR32));
+}
+
+extern unsigned long task_size_32bit(void);
+extern unsigned long task_size_64bit(int full_addr_space);
+extern unsigned long get_mmap_base(int is_legacy);
+extern bool mmap_address_hint_valid(unsigned long addr, unsigned long len);
+extern unsigned long get_sigframe_size(void);
+
#ifdef CONFIG_X86_32
-#define VDSO_HIGH_BASE (__fix_to_virt(FIX_VDSO))
+#define __STACK_RND_MASK(is32bit) (0x7ff)
+#define STACK_RND_MASK (0x7ff)
-#define ARCH_DLINFO ARCH_DLINFO_IA32(vdso_enabled)
+#define ARCH_DLINFO ARCH_DLINFO_IA32
/* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */
#else /* CONFIG_X86_32 */
-#define VDSO_HIGH_BASE 0xffffe000U /* CONFIG_COMPAT_VDSO address */
-
/* 1GB for 64bit, 8MB for 32bit */
-#define STACK_RND_MASK (test_thread_flag(TIF_IA32) ? 0x7ff : 0x3fffff)
+#define __STACK_RND_MASK(is32bit) ((is32bit) ? 0x7ff : 0x3fffff)
+#define STACK_RND_MASK __STACK_RND_MASK(mmap_is_ia32())
#define ARCH_DLINFO \
do { \
- if (vdso_enabled) \
+ if (vdso64_enabled) \
NEW_AUX_ENT(AT_SYSINFO_EHDR, \
- (unsigned long)current->mm->context.vdso); \
+ (unsigned long __force)current->mm->context.vdso); \
+ NEW_AUX_ENT(AT_MINSIGSTKSZ, get_sigframe_size()); \
+} while (0)
+
+/* As a historical oddity, the x32 and x86_64 vDSOs are controlled together. */
+#define ARCH_DLINFO_X32 \
+do { \
+ if (vdso64_enabled) \
+ NEW_AUX_ENT(AT_SYSINFO_EHDR, \
+ (unsigned long __force)current->mm->context.vdso); \
+ NEW_AUX_ENT(AT_MINSIGSTKSZ, get_sigframe_size()); \
} while (0)
#define AT_SYSINFO 32
-#define COMPAT_ARCH_DLINFO ARCH_DLINFO_IA32(sysctl_vsyscall32)
+#define COMPAT_ARCH_DLINFO \
+if (exec->e_machine == EM_X86_64) \
+ ARCH_DLINFO_X32; \
+else if (IS_ENABLED(CONFIG_IA32_EMULATION)) \
+ ARCH_DLINFO_IA32
#define COMPAT_ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000)
@@ -330,18 +360,33 @@ do { \
#define VDSO_CURRENT_BASE ((unsigned long)current->mm->context.vdso)
#define VDSO_ENTRY \
- ((unsigned long)VDSO32_SYMBOL(VDSO_CURRENT_BASE, vsyscall))
+ ((unsigned long)current->mm->context.vdso + \
+ vdso_image_32.sym___kernel_vsyscall)
struct linux_binprm;
#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
extern int arch_setup_additional_pages(struct linux_binprm *bprm,
int uses_interp);
-
-extern int syscall32_setup_pages(struct linux_binprm *, int exstack);
-#define compat_arch_setup_additional_pages syscall32_setup_pages
-
-extern unsigned long arch_randomize_brk(struct mm_struct *mm);
-#define arch_randomize_brk arch_randomize_brk
-
+extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
+ int uses_interp, bool x32);
+#define COMPAT_ARCH_SETUP_ADDITIONAL_PAGES(bprm, ex, interpreter) \
+ compat_arch_setup_additional_pages(bprm, interpreter, \
+ (ex->e_machine == EM_X86_64))
+
+extern bool arch_syscall_is_vdso_sigreturn(struct pt_regs *regs);
+
+/* Do not change the values. See get_align_mask() */
+enum align_flags {
+ ALIGN_VA_32 = BIT(0),
+ ALIGN_VA_64 = BIT(1),
+};
+
+struct va_alignment {
+ int flags;
+ unsigned long mask;
+ unsigned long bits;
+} ____cacheline_aligned;
+
+extern struct va_alignment va_align;
#endif /* _ASM_X86_ELF_H */
diff --git a/arch/x86/include/asm/elfcore-compat.h b/arch/x86/include/asm/elfcore-compat.h
new file mode 100644
index 000000000000..f1b6c7a8d8fc
--- /dev/null
+++ b/arch/x86/include/asm/elfcore-compat.h
@@ -0,0 +1,31 @@
+#ifndef _ASM_X86_ELFCORE_COMPAT_H
+#define _ASM_X86_ELFCORE_COMPAT_H
+
+#include <asm/user32.h>
+
+/*
+ * On amd64 we have two 32-bit ABIs - i386 and x32. The latter
+ * has bigger registers, so we use it for compat_elf_regset_t.
+ * The former uses i386_elf_prstatus, and PRSTATUS_SIZE/SET_PR_FPVALID
+ * are used to choose the size and the location of ->pr_fpvalid in
+ * the layout actually used.
+ */
+typedef struct user_regs_struct compat_elf_gregset_t;
+
+struct i386_elf_prstatus
+{
+ struct compat_elf_prstatus_common common;
+ struct user_regs_struct32 pr_reg;
+ compat_int_t pr_fpvalid;
+};
+
+#define PRSTATUS_SIZE \
+ (user_64bit_mode(task_pt_regs(current)) \
+ ? sizeof(struct compat_elf_prstatus) \
+ : sizeof(struct i386_elf_prstatus))
+#define SET_PR_FPVALID(S) \
+ (*(user_64bit_mode(task_pt_regs(current)) \
+ ? &(S)->pr_fpvalid \
+ : &((struct i386_elf_prstatus *)(S))->pr_fpvalid) = 1)
+
+#endif
diff --git a/arch/x86/include/asm/emergency-restart.h b/arch/x86/include/asm/emergency-restart.h
index cc70c1c78ca4..2abde717db31 100644
--- a/arch/x86/include/asm/emergency-restart.h
+++ b/arch/x86/include/asm/emergency-restart.h
@@ -1,20 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_EMERGENCY_RESTART_H
#define _ASM_X86_EMERGENCY_RESTART_H
-enum reboot_type {
- BOOT_TRIPLE = 't',
- BOOT_KBD = 'k',
-#ifdef CONFIG_X86_32
- BOOT_BIOS = 'b',
-#endif
- BOOT_ACPI = 'a',
- BOOT_EFI = 'e',
- BOOT_CF9 = 'p',
- BOOT_CF9_COND = 'q',
-};
-
-extern enum reboot_type reboot_type;
-
extern void machine_emergency_restart(void);
#endif /* _ASM_X86_EMERGENCY_RESTART_H */
diff --git a/arch/x86/include/asm/emulate_prefix.h b/arch/x86/include/asm/emulate_prefix.h
new file mode 100644
index 000000000000..70f5b98a5286
--- /dev/null
+++ b/arch/x86/include/asm/emulate_prefix.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_EMULATE_PREFIX_H
+#define _ASM_X86_EMULATE_PREFIX_H
+
+/*
+ * Virt escape sequences to trigger instruction emulation;
+ * ideally these would decode to a 'whole' instruction and not destroy
+ * the instruction stream; sadly this is not true for the 'kvm' one :/
+ */
+
+#define __XEN_EMULATE_PREFIX 0x0f,0x0b,0x78,0x65,0x6e /* ud2 ; .ascii "xen" */
+#define __KVM_EMULATE_PREFIX 0x0f,0x0b,0x6b,0x76,0x6d /* ud2 ; .ascii "kvm" */
+
+#endif
diff --git a/arch/x86/include/asm/enclu.h b/arch/x86/include/asm/enclu.h
new file mode 100644
index 000000000000..b1314e41a744
--- /dev/null
+++ b/arch/x86/include/asm/enclu.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_ENCLU_H
+#define _ASM_X86_ENCLU_H
+
+#define EENTER 0x02
+#define ERESUME 0x03
+#define EEXIT 0x04
+
+#endif /* _ASM_X86_ENCLU_H */
diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h
new file mode 100644
index 000000000000..ce3eb6d5fdf9
--- /dev/null
+++ b/arch/x86/include/asm/entry-common.h
@@ -0,0 +1,112 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _ASM_X86_ENTRY_COMMON_H
+#define _ASM_X86_ENTRY_COMMON_H
+
+#include <linux/randomize_kstack.h>
+#include <linux/user-return-notifier.h>
+
+#include <asm/nospec-branch.h>
+#include <asm/io_bitmap.h>
+#include <asm/fpu/api.h>
+#include <asm/fred.h>
+
+/* Check that the stack and regs on entry from user mode are sane. */
+static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs)
+{
+ if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) {
+ /*
+ * Make sure that the entry code gave us a sensible EFLAGS
+ * register. Native because we want to check the actual CPU
+ * state, not the interrupt state as imagined by Xen.
+ */
+ unsigned long flags = native_save_fl();
+ unsigned long mask = X86_EFLAGS_DF | X86_EFLAGS_NT;
+
+ /*
+ * For !SMAP hardware we patch out CLAC on entry.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_SMAP) ||
+ cpu_feature_enabled(X86_FEATURE_XENPV))
+ mask |= X86_EFLAGS_AC;
+
+ WARN_ON_ONCE(flags & mask);
+
+ /* We think we came from user mode. Make sure pt_regs agrees. */
+ WARN_ON_ONCE(!user_mode(regs));
+
+ /*
+ * All entries from user mode (except #DF) should be on the
+ * normal thread stack and should have user pt_regs in the
+ * correct location.
+ */
+ WARN_ON_ONCE(!on_thread_stack());
+ WARN_ON_ONCE(regs != task_pt_regs(current));
+ }
+}
+#define arch_enter_from_user_mode arch_enter_from_user_mode
+
+static inline void arch_exit_work(unsigned long ti_work)
+{
+ if (ti_work & _TIF_USER_RETURN_NOTIFY)
+ fire_user_return_notifiers();
+
+ if (unlikely(ti_work & _TIF_IO_BITMAP))
+ tss_update_io_bitmap();
+
+ if (unlikely(ti_work & _TIF_NEED_FPU_LOAD))
+ switch_fpu_return();
+}
+
+static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
+ unsigned long ti_work)
+{
+ fpregs_assert_state_consistent();
+
+ if (unlikely(ti_work))
+ arch_exit_work(ti_work);
+
+ fred_update_rsp0();
+
+#ifdef CONFIG_COMPAT
+ /*
+ * Compat syscalls set TS_COMPAT. Make sure we clear it before
+ * returning to user mode. We need to clear it *after* signal
+ * handling, because syscall restart has a fixup for compat
+ * syscalls. The fixup is exercised by the ptrace_syscall_32
+ * selftest.
+ *
+ * We also need to clear TS_REGS_POKED_I386: the 32-bit tracer
+ * special case only applies after poking regs and before the
+ * very next return to user mode.
+ */
+ current_thread_info()->status &= ~(TS_COMPAT | TS_I386_REGS_POKED);
+#endif
+
+ /*
+ * This value will get limited by KSTACK_OFFSET_MAX(), which is 10
+ * bits. The actual entropy will be further reduced by the compiler
+ * when applying stack alignment constraints (see cc_stack_align4/8 in
+ * arch/x86/Makefile), which will remove the 3 (x86_64) or 2 (ia32)
+ * low bits from any entropy chosen here.
+ *
+ * Therefore, final stack offset entropy will be 7 (x86_64) or
+ * 8 (ia32) bits.
+ */
+ choose_random_kstack_offset(rdtsc());
+
+ /* Avoid unnecessary reads of 'x86_ibpb_exit_to_user' */
+ if (cpu_feature_enabled(X86_FEATURE_IBPB_EXIT_TO_USER) &&
+ this_cpu_read(x86_ibpb_exit_to_user)) {
+ indirect_branch_prediction_barrier();
+ this_cpu_write(x86_ibpb_exit_to_user, false);
+ }
+}
+#define arch_exit_to_user_mode_prepare arch_exit_to_user_mode_prepare
+
+static __always_inline void arch_exit_to_user_mode(void)
+{
+ amd_clear_divider();
+}
+#define arch_exit_to_user_mode arch_exit_to_user_mode
+
+#endif
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
deleted file mode 100644
index ff8cbfa07851..000000000000
--- a/arch/x86/include/asm/entry_arch.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * This file is designed to contain the BUILD_INTERRUPT specifications for
- * all of the extra named interrupt vectors used by the architecture.
- * Usually this is the Inter Process Interrupts (IPIs)
- */
-
-/*
- * The following vectors are part of the Linux architecture, there
- * is no hardware IRQ pin equivalent for them, they are triggered
- * through the ICC by us (IPIs)
- */
-#ifdef CONFIG_SMP
-BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR)
-BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
-BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
-BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
-BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR)
-
-BUILD_INTERRUPT3(invalidate_interrupt0,INVALIDATE_TLB_VECTOR_START+0,
- smp_invalidate_interrupt)
-BUILD_INTERRUPT3(invalidate_interrupt1,INVALIDATE_TLB_VECTOR_START+1,
- smp_invalidate_interrupt)
-BUILD_INTERRUPT3(invalidate_interrupt2,INVALIDATE_TLB_VECTOR_START+2,
- smp_invalidate_interrupt)
-BUILD_INTERRUPT3(invalidate_interrupt3,INVALIDATE_TLB_VECTOR_START+3,
- smp_invalidate_interrupt)
-BUILD_INTERRUPT3(invalidate_interrupt4,INVALIDATE_TLB_VECTOR_START+4,
- smp_invalidate_interrupt)
-BUILD_INTERRUPT3(invalidate_interrupt5,INVALIDATE_TLB_VECTOR_START+5,
- smp_invalidate_interrupt)
-BUILD_INTERRUPT3(invalidate_interrupt6,INVALIDATE_TLB_VECTOR_START+6,
- smp_invalidate_interrupt)
-BUILD_INTERRUPT3(invalidate_interrupt7,INVALIDATE_TLB_VECTOR_START+7,
- smp_invalidate_interrupt)
-#endif
-
-BUILD_INTERRUPT(generic_interrupt, GENERIC_INTERRUPT_VECTOR)
-
-/*
- * every pentium local APIC has two 'local interrupts', with a
- * soft-definable vector attached to both interrupts, one of
- * which is a timer interrupt, the other one is error counter
- * overflow. Linux uses the local APIC timer interrupt to get
- * a much simpler SMP time architecture:
- */
-#ifdef CONFIG_X86_LOCAL_APIC
-
-BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR)
-BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
-BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
-
-#ifdef CONFIG_PERF_COUNTERS
-BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR)
-#endif
-
-#ifdef CONFIG_X86_THERMAL_VECTOR
-BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
-#endif
-
-#ifdef CONFIG_X86_MCE_THRESHOLD
-BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR)
-#endif
-
-#ifdef CONFIG_X86_NEW_MCE
-BUILD_INTERRUPT(mce_self_interrupt,MCE_SELF_VECTOR)
-#endif
-
-#endif
diff --git a/arch/x86/include/asm/errno.h b/arch/x86/include/asm/errno.h
deleted file mode 100644
index 4c82b503d92f..000000000000
--- a/arch/x86/include/asm/errno.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/errno.h>
diff --git a/arch/x86/include/asm/espfix.h b/arch/x86/include/asm/espfix.h
new file mode 100644
index 000000000000..6777480d8a42
--- /dev/null
+++ b/arch/x86/include/asm/espfix.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_ESPFIX_H
+#define _ASM_X86_ESPFIX_H
+
+#ifdef CONFIG_X86_ESPFIX64
+
+#include <asm/percpu.h>
+
+DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
+DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
+
+extern void init_espfix_bsp(void);
+extern void init_espfix_ap(int cpu);
+#else
+static inline void init_espfix_ap(int cpu) { }
+#endif
+
+#endif /* _ASM_X86_ESPFIX_H */
diff --git a/arch/x86/include/asm/exec.h b/arch/x86/include/asm/exec.h
new file mode 100644
index 000000000000..54c2e1db274a
--- /dev/null
+++ b/arch/x86/include/asm/exec.h
@@ -0,0 +1 @@
+/* define arch_align_stack() here */
diff --git a/arch/x86/include/asm/extable.h b/arch/x86/include/asm/extable.h
new file mode 100644
index 000000000000..a0e0c6b50155
--- /dev/null
+++ b/arch/x86/include/asm/extable.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_EXTABLE_H
+#define _ASM_X86_EXTABLE_H
+
+#include <asm/extable_fixup_types.h>
+
+/*
+ * The exception table consists of two addresses relative to the
+ * exception table entry itself and a type selector field.
+ *
+ * The first address is of an instruction that is allowed to fault, the
+ * second is the target at which the program should continue.
+ *
+ * The type entry is used by fixup_exception() to select the handler to
+ * deal with the fault caused by the instruction in the first field.
+ *
+ * All the routines below use bits of fixup code that are out of line
+ * with the main instruction path. This means when everything is well,
+ * we don't even have to jump over them. Further, they do not intrude
+ * on our cache or tlb entries.
+ */
+
+struct exception_table_entry {
+ int insn, fixup, data;
+};
+struct pt_regs;
+
+#define ARCH_HAS_RELATIVE_EXTABLE
+
+#define swap_ex_entry_fixup(a, b, tmp, delta) \
+ do { \
+ (a)->fixup = (b)->fixup + (delta); \
+ (b)->fixup = (tmp).fixup - (delta); \
+ (a)->data = (b)->data; \
+ (b)->data = (tmp).data; \
+ } while (0)
+
+extern int fixup_exception(struct pt_regs *regs, int trapnr,
+ unsigned long error_code, unsigned long fault_addr);
+extern int ex_get_fixup_type(unsigned long ip);
+extern void early_fixup_exception(struct pt_regs *regs, int trapnr);
+
+#ifdef CONFIG_X86_MCE
+extern void __noreturn ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr);
+#else
+static inline void __noreturn ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr)
+{
+ for (;;)
+ cpu_relax();
+}
+#endif
+
+#if defined(CONFIG_BPF_JIT) && defined(CONFIG_X86_64)
+bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs);
+#else
+static inline bool ex_handler_bpf(const struct exception_table_entry *x,
+ struct pt_regs *regs) { return false; }
+#endif
+
+#endif
diff --git a/arch/x86/include/asm/extable_fixup_types.h b/arch/x86/include/asm/extable_fixup_types.h
new file mode 100644
index 000000000000..906b0d5541e8
--- /dev/null
+++ b/arch/x86/include/asm/extable_fixup_types.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_EXTABLE_FIXUP_TYPES_H
+#define _ASM_X86_EXTABLE_FIXUP_TYPES_H
+
+/*
+ * Our IMM is signed, as such it must live at the top end of the word. Also,
+ * since C99 hex constants are of ambiguous type, force cast the mask to 'int'
+ * so that FIELD_GET() will DTRT and sign extend the value when it extracts it.
+ */
+#define EX_DATA_TYPE_MASK ((int)0x000000FF)
+#define EX_DATA_REG_MASK ((int)0x00000F00)
+#define EX_DATA_FLAG_MASK ((int)0x0000F000)
+#define EX_DATA_IMM_MASK ((int)0xFFFF0000)
+
+#define EX_DATA_REG_SHIFT 8
+#define EX_DATA_FLAG_SHIFT 12
+#define EX_DATA_IMM_SHIFT 16
+
+#define EX_DATA_REG(reg) ((reg) << EX_DATA_REG_SHIFT)
+#define EX_DATA_FLAG(flag) ((flag) << EX_DATA_FLAG_SHIFT)
+#define EX_DATA_IMM(imm) ((imm) << EX_DATA_IMM_SHIFT)
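/*
 * Editorial worked example, not part of the patch: EX_TYPE_EFAULT_REG below
 * is EX_TYPE_IMM_REG | EX_DATA_IMM(-EFAULT). With -EFAULT == -14 this packs
 * to 0xfff20011: type 17 (0x11) in the low byte and -14 in the signed
 * 16-bit immediate field, which FIELD_GET() sign-extends back to -14 when
 * the fixup runs.
 */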
+
+/* segment regs */
+#define EX_REG_DS EX_DATA_REG(8)
+#define EX_REG_ES EX_DATA_REG(9)
+#define EX_REG_FS EX_DATA_REG(10)
+#define EX_REG_GS EX_DATA_REG(11)
+
+/* flags */
+#define EX_FLAG_CLEAR_AX EX_DATA_FLAG(1)
+#define EX_FLAG_CLEAR_DX EX_DATA_FLAG(2)
+#define EX_FLAG_CLEAR_AX_DX EX_DATA_FLAG(3)
+
+/* types */
+#define EX_TYPE_NONE 0
+#define EX_TYPE_DEFAULT 1
+#define EX_TYPE_FAULT 2
+#define EX_TYPE_UACCESS 3
+/* unused, was: #define EX_TYPE_COPY 4 */
+#define EX_TYPE_CLEAR_FS 5
+#define EX_TYPE_FPU_RESTORE 6
+#define EX_TYPE_BPF 7
+#define EX_TYPE_WRMSR 8
+#define EX_TYPE_RDMSR 9
+#define EX_TYPE_WRMSR_SAFE 10 /* reg := -EIO */
+#define EX_TYPE_RDMSR_SAFE 11 /* reg := -EIO */
+#define EX_TYPE_WRMSR_IN_MCE 12
+#define EX_TYPE_RDMSR_IN_MCE 13
+#define EX_TYPE_DEFAULT_MCE_SAFE 14
+#define EX_TYPE_FAULT_MCE_SAFE 15
+
+#define EX_TYPE_POP_REG 16 /* sp += sizeof(long) */
+#define EX_TYPE_POP_ZERO (EX_TYPE_POP_REG | EX_DATA_IMM(0))
+
+#define EX_TYPE_IMM_REG 17 /* reg := (long)imm */
+#define EX_TYPE_EFAULT_REG (EX_TYPE_IMM_REG | EX_DATA_IMM(-EFAULT))
+#define EX_TYPE_ZERO_REG (EX_TYPE_IMM_REG | EX_DATA_IMM(0))
+#define EX_TYPE_ONE_REG (EX_TYPE_IMM_REG | EX_DATA_IMM(1))
+
+#define EX_TYPE_FAULT_SGX 18
+
+#define EX_TYPE_UCOPY_LEN 19 /* cx := reg + imm*cx */
+#define EX_TYPE_UCOPY_LEN1 (EX_TYPE_UCOPY_LEN | EX_DATA_IMM(1))
+#define EX_TYPE_UCOPY_LEN4 (EX_TYPE_UCOPY_LEN | EX_DATA_IMM(4))
+#define EX_TYPE_UCOPY_LEN8 (EX_TYPE_UCOPY_LEN | EX_DATA_IMM(8))
+
+#define EX_TYPE_ZEROPAD 20 /* longword load with zeropad on fault */
+
+#define EX_TYPE_ERETU 21
+
+#endif
diff --git a/arch/x86/include/asm/fb.h b/arch/x86/include/asm/fb.h
deleted file mode 100644
index 53018464aea6..000000000000
--- a/arch/x86/include/asm/fb.h
+++ /dev/null
@@ -1,21 +0,0 @@
-#ifndef _ASM_X86_FB_H
-#define _ASM_X86_FB_H
-
-#include <linux/fb.h>
-#include <linux/fs.h>
-#include <asm/page.h>
-
-static inline void fb_pgprotect(struct file *file, struct vm_area_struct *vma,
- unsigned long off)
-{
- if (boot_cpu_data.x86 > 3)
- pgprot_val(vma->vm_page_prot) |= _PAGE_PCD;
-}
-
-#ifdef CONFIG_X86_32
-extern int fb_is_primary_device(struct fb_info *info);
-#else
-static inline int fb_is_primary_device(struct fb_info *info) { return 0; }
-#endif
-
-#endif /* _ASM_X86_FB_H */
diff --git a/arch/x86/include/asm/fcntl.h b/arch/x86/include/asm/fcntl.h
deleted file mode 100644
index 46ab12db5739..000000000000
--- a/arch/x86/include/asm/fcntl.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/fcntl.h>
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 7b2d71df39a6..4519c9f35ba0 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -14,16 +14,32 @@
#ifndef _ASM_X86_FIXMAP_H
#define _ASM_X86_FIXMAP_H
-#ifndef __ASSEMBLY__
+#include <asm/kmap_size.h>
+
+/*
+ * Exposed to assembly code for setting up initial page tables. Cannot be
+ * calculated in assembly code (fixmap entries are an enum), but is sanity
+ * checked in the actual fixmap C code to make sure that the fixmap is
+ * covered fully.
+ */
+#ifndef CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP
+# define FIXMAP_PMD_NUM 2
+#else
+# define KM_PMDS (KM_MAX_IDX * ((CONFIG_NR_CPUS + 511) / 512))
+# define FIXMAP_PMD_NUM (KM_PMDS + 2)
+#endif
+/* fixmap starts downwards from the 507th entry in level2_fixmap_pgt */
+#define FIXMAP_PMD_TOP 507
+
+#ifndef __ASSEMBLER__
#include <linux/kernel.h>
-#include <asm/acpi.h>
#include <asm/apicdef.h>
#include <asm/page.h>
+#include <asm/pgtable_types.h>
#ifdef CONFIG_X86_32
#include <linux/threads.h>
-#include <asm/kmap_types.h>
#else
-#include <asm/vsyscall.h>
+#include <uapi/asm/vsyscall.h>
#endif
/*
@@ -32,25 +48,17 @@
* Because of this, FIXADDR_TOP x86 integration was left as later work.
*/
#ifdef CONFIG_X86_32
-/* used by vmalloc.c, vsyscall.lds.S.
- *
+/*
* Leave one empty page between vmalloc'ed areas and
* the start of the fixmap.
*/
extern unsigned long __FIXADDR_TOP;
#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
-
-#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO)
-#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1)
#else
-#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
-
-/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */
-#define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL)
-#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE)
+#define FIXADDR_TOP (round_up(VSYSCALL_ADDR + PAGE_SIZE, 1<<PMD_SHIFT) - \
+ PAGE_SIZE)
#endif
-
/*
* Here we define all the compile-time 'special' virtual
* addresses. The point is to have a constant address at
@@ -73,15 +81,16 @@ extern unsigned long __FIXADDR_TOP;
enum fixed_addresses {
#ifdef CONFIG_X86_32
FIX_HOLE,
- FIX_VDSO,
#else
- VSYSCALL_LAST_PAGE,
- VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
- + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
- VSYSCALL_HPET,
+#ifdef CONFIG_X86_VSYSCALL_EMULATION
+ VSYSCALL_PAGE = (FIXADDR_TOP - VSYSCALL_ADDR) >> PAGE_SHIFT,
+#endif
#endif
FIX_DBGP_BASE,
FIX_EARLYCON_MEM_BASE,
+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+ FIX_OHCI1394_BASE,
+#endif
#ifdef CONFIG_X86_LOCAL_APIC
FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
#endif
@@ -89,71 +98,69 @@ enum fixed_addresses {
FIX_IO_APIC_BASE_0,
FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
#endif
-#ifdef CONFIG_X86_VISWS_APIC
- FIX_CO_CPU, /* Cobalt timer */
- FIX_CO_APIC, /* Cobalt APIC Redirection Table */
- FIX_LI_PCIA, /* Lithium PCI Bridge A */
- FIX_LI_PCIB, /* Lithium PCI Bridge B */
-#endif
-#ifdef CONFIG_X86_F00F_BUG
- FIX_F00F_IDT, /* Virtual mapping for IDT */
-#endif
-#ifdef CONFIG_X86_CYCLONE_TIMER
- FIX_CYCLONE_TIMER, /*cyclone timer register*/
-#endif
-#ifdef CONFIG_X86_32
+#ifdef CONFIG_KMAP_LOCAL
FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
- FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
+ FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * NR_CPUS) - 1,
#ifdef CONFIG_PCI_MMCONFIG
FIX_PCIE_MCFG,
#endif
#endif
-#ifdef CONFIG_PARAVIRT
+#ifdef CONFIG_PARAVIRT_XXL
FIX_PARAVIRT_BOOTMAP,
#endif
- FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */
- FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
+
+#ifdef CONFIG_ACPI_APEI_GHES
+ /* Used for GHES mapping from assorted contexts */
+ FIX_APEI_GHES_IRQ,
+ FIX_APEI_GHES_NMI,
+#endif
+
__end_of_permanent_fixed_addresses,
+
/*
- * 256 temporary boot-time mappings, used by early_ioremap(),
+ * 512 temporary boot-time mappings, used by early_ioremap(),
* before ioremap() is functional.
*
- * We round it up to the next 256 pages boundary so that we
- * can have a single pgd entry and a single pte table:
+ * If necessary we round it up to the next 512-page boundary so
+ * that we can have a single pmd entry and a single pte table:
*/
#define NR_FIX_BTMAPS 64
-#define FIX_BTMAPS_SLOTS 4
- FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
- (__end_of_permanent_fixed_addresses & 255),
- FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1,
-#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
- FIX_OHCI1394_BASE,
-#endif
+#define FIX_BTMAPS_SLOTS 8
+#define TOTAL_FIX_BTMAPS (NR_FIX_BTMAPS * FIX_BTMAPS_SLOTS)
+ FIX_BTMAP_END =
+ (__end_of_permanent_fixed_addresses ^
+ (__end_of_permanent_fixed_addresses + TOTAL_FIX_BTMAPS - 1)) &
+ -PTRS_PER_PTE
+ ? __end_of_permanent_fixed_addresses + TOTAL_FIX_BTMAPS -
+ (__end_of_permanent_fixed_addresses & (TOTAL_FIX_BTMAPS - 1))
+ : __end_of_permanent_fixed_addresses,
+ FIX_BTMAP_BEGIN = FIX_BTMAP_END + TOTAL_FIX_BTMAPS - 1,
#ifdef CONFIG_X86_32
FIX_WP_TEST,
#endif
+#ifdef CONFIG_INTEL_TXT
+ FIX_TBOOT_BASE,
+#endif
__end_of_fixed_addresses
};
extern void reserve_top_address(unsigned long reserve);
-#define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
-#define FIXADDR_BOOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
+#define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
-#define FIXADDR_BOOT_START (FIXADDR_TOP - FIXADDR_BOOT_SIZE)
+#define FIXADDR_TOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
+#define FIXADDR_TOT_START (FIXADDR_TOP - FIXADDR_TOT_SIZE)
extern int fixmaps_set;
-extern pte_t *kmap_pte;
-extern pgprot_t kmap_prot;
extern pte_t *pkmap_page_table;
void __native_set_fixmap(enum fixed_addresses idx, pte_t pte);
-void native_set_fixmap(enum fixed_addresses idx,
+void native_set_fixmap(unsigned /* enum fixed_addresses */ idx,
phys_addr_t phys, pgprot_t flags);
-#ifndef CONFIG_PARAVIRT
+#ifndef CONFIG_PARAVIRT_XXL
static inline void __set_fixmap(enum fixed_addresses idx,
phys_addr_t phys, pgprot_t flags)
{
@@ -161,49 +168,33 @@ static inline void __set_fixmap(enum fixed_addresses idx,
}
#endif
-#define set_fixmap(idx, phys) \
- __set_fixmap(idx, phys, PAGE_KERNEL)
-
/*
- * Some hardware wants to get fixmapped without caching.
+ * FIXMAP_PAGE_NOCACHE is used for MMIO. Memory encryption is not
+ * supported for MMIO addresses, so make sure that the memory encryption
+ * mask is not part of the page attributes.
*/
-#define set_fixmap_nocache(idx, phys) \
- __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
-
-#define clear_fixmap(idx) \
- __set_fixmap(idx, 0, __pgprot(0))
-
-#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
-#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
-
-extern void __this_fixmap_does_not_exist(void);
+#define FIXMAP_PAGE_NOCACHE PAGE_KERNEL_IO_NOCACHE
/*
- * 'index to address' translation. If anyone tries to use the idx
- * directly without translation, we catch the bug with a NULL-deference
- * kernel oops. Illegal ranges of incoming indices are caught too.
+ * Early memremap routines used for in-place encryption. The mappings created
+ * by these routines are intended to be used as temporary mappings.
*/
-static __always_inline unsigned long fix_to_virt(const unsigned int idx)
-{
- /*
- * this branch gets completely eliminated after inlining,
- * except when someone tries to use fixaddr indices in an
- * illegal way. (such as mixing up address types or using
- * out-of-range indices).
- *
- * If it doesn't get removed, the linker will complain
- * loudly with a reasonably clear error message..
- */
- if (idx >= __end_of_fixed_addresses)
- __this_fixmap_does_not_exist();
+void __init *early_memremap_encrypted(resource_size_t phys_addr,
+ unsigned long size);
+void __init *early_memremap_encrypted_wp(resource_size_t phys_addr,
+ unsigned long size);
+void __init *early_memremap_decrypted(resource_size_t phys_addr,
+ unsigned long size);
+void __init *early_memremap_decrypted_wp(resource_size_t phys_addr,
+ unsigned long size);
- return __fix_to_virt(idx);
-}
+#include <asm-generic/fixmap.h>
-static inline unsigned long virt_to_fix(const unsigned long vaddr)
-{
- BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
- return __virt_to_fix(vaddr);
-}
-#endif /* !__ASSEMBLY__ */
+#define __late_set_fixmap(idx, phys, flags) __set_fixmap(idx, phys, flags)
+#define __late_clear_fixmap(idx) __set_fixmap(idx, 0, __pgprot(0))
+
+void __early_set_fixmap(enum fixed_addresses idx,
+ phys_addr_t phys, pgprot_t flags);
+
+#endif /* !__ASSEMBLER__ */
#endif /* _ASM_X86_FIXMAP_H */
diff --git a/arch/x86/include/asm/floppy.h b/arch/x86/include/asm/floppy.h
index dbe82a5c5eac..e7a244051c62 100644
--- a/arch/x86/include/asm/floppy.h
+++ b/arch/x86/include/asm/floppy.h
@@ -10,6 +10,7 @@
#ifndef _ASM_X86_FLOPPY_H
#define _ASM_X86_FLOPPY_H
+#include <linux/sizes.h>
#include <linux/vmalloc.h>
/*
@@ -22,17 +23,14 @@
*/
#define _CROSS_64KB(a, s, vdma) \
(!(vdma) && \
- ((unsigned long)(a)/K_64 != ((unsigned long)(a) + (s) - 1) / K_64))
-
-#define CROSS_64KB(a, s) _CROSS_64KB(a, s, use_virtual_dma & 1)
-
+ ((unsigned long)(a) / SZ_64K != ((unsigned long)(a) + (s) - 1) / SZ_64K))
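/*
 * Editorial worked example, not part of the patch: ISA DMA transfers must
 * not straddle a 64 KiB boundary. For a 0x20-byte buffer at 0xfff0,
 * 0xfff0 / SZ_64K == 0 while (0xfff0 + 0x1f) / SZ_64K == 1, so
 * _CROSS_64KB(0xfff0, 0x20, 0) is true; with a non-zero vdma argument it is
 * always false, since virtual DMA is copied by the CPU and has no such
 * restriction.
 */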
#define SW fd_routine[use_virtual_dma & 1]
#define CSW fd_routine[can_use_virtual_dma & 1]
-#define fd_inb(port) inb_p(port)
-#define fd_outb(value, port) outb_p(value, port)
+#define fd_inb(base, reg) inb_p((base) + (reg))
+#define fd_outb(value, base, reg) outb_p(value, (base) + (reg))
#define fd_request_dma() CSW._request_dma(FLOPPY_DMA, "floppy")
#define fd_free_dma() CSW._free_dma(FLOPPY_DMA)
@@ -74,32 +72,32 @@ static irqreturn_t floppy_hardint(int irq, void *dev_id)
int lcount;
char *lptr;
- st = 1;
for (lcount = virtual_dma_count, lptr = virtual_dma_addr;
lcount; lcount--, lptr++) {
- st = inb(virtual_dma_port + 4) & 0xa0;
- if (st != 0xa0)
+ st = inb(virtual_dma_port + FD_STATUS);
+ st &= STATUS_DMA | STATUS_READY;
+ if (st != (STATUS_DMA | STATUS_READY))
break;
if (virtual_dma_mode)
- outb_p(*lptr, virtual_dma_port + 5);
+ outb_p(*lptr, virtual_dma_port + FD_DATA);
else
- *lptr = inb_p(virtual_dma_port + 5);
+ *lptr = inb_p(virtual_dma_port + FD_DATA);
}
virtual_dma_count = lcount;
virtual_dma_addr = lptr;
- st = inb(virtual_dma_port + 4);
+ st = inb(virtual_dma_port + FD_STATUS);
}
#ifdef TRACE_FLPY_INT
calls++;
#endif
- if (st == 0x20)
+ if (st == STATUS_DMA)
return IRQ_HANDLED;
- if (!(st & 0x20)) {
+ if (!(st & STATUS_DMA)) {
virtual_dma_residue += virtual_dma_count;
virtual_dma_count = 0;
#ifdef TRACE_FLPY_INT
- printk("count=%x, residue=%x calls=%d bytes=%d dma_wait=%d\n",
+ printk(KERN_DEBUG "count=%x, residue=%x calls=%d bytes=%d dma_wait=%d\n",
virtual_dma_count, virtual_dma_residue, calls, bytes,
dma_wait);
calls = 0;
@@ -145,10 +143,10 @@ static int fd_request_irq(void)
{
if (can_use_virtual_dma)
return request_irq(FLOPPY_IRQ, floppy_hardint,
- IRQF_DISABLED, "floppy", NULL);
+ 0, "floppy", NULL);
else
return request_irq(FLOPPY_IRQ, floppy_interrupt,
- IRQF_DISABLED, "floppy", NULL);
+ 0, "floppy", NULL);
}
static unsigned long dma_mem_alloc(unsigned long size)
@@ -206,7 +204,7 @@ static int vdma_dma_setup(char *addr, unsigned long size, int mode, int io)
static int hard_dma_setup(char *addr, unsigned long size, int mode, int io)
{
#ifdef FLOPPY_SANITY_CHECK
- if (CROSS_64KB(addr, size)) {
+ if (_CROSS_64KB(addr, size, use_virtual_dma & 1)) {
printk("DMA crossing 64-K boundary %p-%p\n", addr, addr+size);
return -1;
}
@@ -229,18 +227,18 @@ static struct fd_routine_l {
int (*_dma_setup)(char *addr, unsigned long size, int mode, int io);
} fd_routine[] = {
{
- request_dma,
- free_dma,
- get_dma_residue,
- dma_mem_alloc,
- hard_dma_setup
+ ._request_dma = request_dma,
+ ._free_dma = free_dma,
+ ._get_dma_residue = get_dma_residue,
+ ._dma_mem_alloc = dma_mem_alloc,
+ ._dma_setup = hard_dma_setup
},
{
- vdma_request_dma,
- vdma_nop,
- vdma_get_dma_residue,
- vdma_mem_alloc,
- vdma_dma_setup
+ ._request_dma = vdma_request_dma,
+ ._free_dma = vdma_nop,
+ ._get_dma_residue = vdma_get_dma_residue,
+ ._dma_mem_alloc = vdma_mem_alloc,
+ ._dma_setup = vdma_dma_setup
}
};
diff --git a/arch/x86/include/asm/fpu.h b/arch/x86/include/asm/fpu.h
new file mode 100644
index 000000000000..b2743fe19339
--- /dev/null
+++ b/arch/x86/include/asm/fpu.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2023 SiFive
+ */
+
+#ifndef _ASM_X86_FPU_H
+#define _ASM_X86_FPU_H
+
+#include <asm/fpu/api.h>
+
+#define kernel_fpu_available() true
+
+#endif /* ! _ASM_X86_FPU_H */
diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
new file mode 100644
index 000000000000..cd6f194a912b
--- /dev/null
+++ b/arch/x86/include/asm/fpu/api.h
@@ -0,0 +1,180 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 1994 Linus Torvalds
+ *
+ * Pentium III FXSR, SSE support
+ * General FPU state handling cleanups
+ * Gareth Hughes <gareth@valinux.com>, May 2000
+ * x86-64 work by Andi Kleen 2002
+ */
+
+#ifndef _ASM_X86_FPU_API_H
+#define _ASM_X86_FPU_API_H
+#include <linux/bottom_half.h>
+
+#include <asm/fpu/types.h>
+
+/*
+ * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It
+ * disables preemption and softirq processing, so be careful if you intend to
+ * use it for long periods of time. Kernel-mode FPU cannot be used in all
+ * contexts -- see irq_fpu_usable() for details.
+ */
+
+/* Kernel FPU states to initialize in kernel_fpu_begin_mask() */
+#define KFPU_387 _BITUL(0) /* 387 state will be initialized */
+#define KFPU_MXCSR _BITUL(1) /* MXCSR will be initialized */
+
+extern void kernel_fpu_begin_mask(unsigned int kfpu_mask);
+extern void kernel_fpu_end(void);
+extern bool irq_fpu_usable(void);
+extern void fpregs_mark_activate(void);
+
+/* Code that is unaware of kernel_fpu_begin_mask() can use this */
+static inline void kernel_fpu_begin(void)
+{
+#ifdef CONFIG_X86_64
+ /*
+ * Any 64-bit code that uses 387 instructions must explicitly request
+ * KFPU_387.
+ */
+ kernel_fpu_begin_mask(KFPU_MXCSR);
+#else
+ /*
+ * 32-bit kernel code may use 387 operations as well as SSE2, etc,
+ * as long as it checks that the CPU has the required capability.
+ */
+ kernel_fpu_begin_mask(KFPU_387 | KFPU_MXCSR);
+#endif
+}
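/*
 * Editorial sketch, not part of the patch: a typical caller brackets its
 * SIMD work like this, checking irq_fpu_usable() first in contexts where
 * kernel-mode FPU use is not guaranteed to be allowed:
 *
 *	if (irq_fpu_usable()) {
 *		kernel_fpu_begin();
 *		... SSE/AVX work on kernel buffers ...
 *		kernel_fpu_end();
 *	} else {
 *		... scalar fallback ...
 *	}
 */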
+
+/*
+ * Use fpregs_lock() while editing CPU's FPU registers or fpu->fpstate, or while
+ * using the FPU in kernel mode. A context switch will (and softirq might) save
+ * CPU's FPU registers to fpu->fpstate.regs and set TIF_NEED_FPU_LOAD leaving
+ * CPU's FPU registers in a random state.
+ *
+ * local_bh_disable() protects against both preemption and soft interrupts
+ * on !RT kernels.
+ *
+ * On RT kernels local_bh_disable() is not sufficient because it only
+ * serializes soft interrupt related sections via a local lock, but stays
+ * preemptible. Disabling preemption is the right choice here as bottom
+ * half processing is always in thread context on RT kernels so it
+ * implicitly prevents bottom half processing as well.
+ */
+static inline void fpregs_lock(void)
+{
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_bh_disable();
+ else
+ preempt_disable();
+}
+
+static inline void fpregs_unlock(void)
+{
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_bh_enable();
+ else
+ preempt_enable();
+}
+
+/*
+ * FPU state gets lazily restored before returning to userspace. So when in the
+ * kernel, the valid FPU state may be kept in the buffer. This function will
+ * force-restore all the FPU state to the registers early if needed, and lock them from
+ * being automatically saved/restored. Then FPU state can be modified safely in the
+ * registers, before unlocking with fpregs_unlock().
+ */
+void fpregs_lock_and_load(void);
+
+#ifdef CONFIG_X86_DEBUG_FPU
+extern void fpregs_assert_state_consistent(void);
+#else
+static inline void fpregs_assert_state_consistent(void) { }
+#endif
+
+/*
+ * Load the task FPU state before returning to userspace.
+ */
+extern void switch_fpu_return(void);
+
+/*
+ * Query the presence of one or more xfeatures. Works on any legacy CPU as well.
+ *
+ * If 'feature_name' is set then put a human-readable description of
+ * the feature there as well - this can be used to print error (or success)
+ * messages.
+ */
+extern int cpu_has_xfeatures(u64 xfeatures_mask, const char **feature_name);
+
+/* Trap handling */
+extern int fpu__exception_code(struct fpu *fpu, int trap_nr);
+extern void fpu_sync_fpstate(struct fpu *fpu);
+extern void fpu_reset_from_exception_fixup(void);
+
+/* Boot, hotplug and resume */
+extern void fpu__init_cpu(void);
+extern void fpu__init_system(void);
+extern void fpu__init_check_bugs(void);
+extern void fpu__resume_cpu(void);
+
+#ifdef CONFIG_MATH_EMULATION
+extern void fpstate_init_soft(struct swregs_state *soft);
+#else
+static inline void fpstate_init_soft(struct swregs_state *soft) {}
+#endif
+
+/* State tracking */
+DECLARE_PER_CPU(bool, kernel_fpu_allowed);
+DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);
+
+/* Process cleanup */
+#ifdef CONFIG_X86_64
+extern void fpstate_free(struct fpu *fpu);
+#else
+static inline void fpstate_free(struct fpu *fpu) { }
+#endif
+
+/* fpstate-related functions which are exported to KVM */
+extern void fpstate_clear_xstate_component(struct fpstate *fpstate, unsigned int xfeature);
+
+extern u64 xstate_get_guest_group_perm(void);
+
+extern void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr);
+
+
+/* KVM specific functions */
+extern bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu);
+extern void fpu_free_guest_fpstate(struct fpu_guest *gfpu);
+extern int fpu_swap_kvm_fpstate(struct fpu_guest *gfpu, bool enter_guest);
+extern int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures);
+
+#ifdef CONFIG_X86_64
+extern void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd);
+extern void fpu_sync_guest_vmexit_xfd_state(void);
+#else
+static inline void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) { }
+static inline void fpu_sync_guest_vmexit_xfd_state(void) { }
+#endif
+
+extern void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf,
+ unsigned int size, u64 xfeatures, u32 pkru);
+extern int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, u64 xcr0, u32 *vpkru);
+
+static inline void fpstate_set_confidential(struct fpu_guest *gfpu)
+{
+ gfpu->fpstate->is_confidential = true;
+}
+
+static inline bool fpstate_is_confidential(struct fpu_guest *gfpu)
+{
+ return gfpu->fpstate->is_confidential;
+}
+
+/* prctl */
+extern long fpu_xstate_prctl(int option, unsigned long arg2);
+
+extern void fpu_idle_fpregs(void);
+
+#endif /* _ASM_X86_FPU_API_H */
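
As a quick illustration of the API introduced above -- a minimal sketch, not part of this patch -- kernel code that wants to execute SIMD instructions would bracket them as follows (do_simd_work() is a hypothetical helper):

    #include <asm/fpu/api.h>

    static void example_simd_section(void)
    {
            /*
             * Kernel-mode FPU is not usable in every context (e.g. in
             * some interrupt handlers), so check first.
             */
            if (!irq_fpu_usable())
                    return;

            kernel_fpu_begin();     /* disables preemption and softirqs */
            do_simd_work();         /* hypothetical SSE/AVX routine */
            kernel_fpu_end();
    }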
diff --git a/arch/x86/include/asm/fpu/regset.h b/arch/x86/include/asm/fpu/regset.h
new file mode 100644
index 000000000000..697b77e96025
--- /dev/null
+++ b/arch/x86/include/asm/fpu/regset.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * FPU regset handling methods:
+ */
+#ifndef _ASM_X86_FPU_REGSET_H
+#define _ASM_X86_FPU_REGSET_H
+
+#include <linux/regset.h>
+
+extern user_regset_active_fn regset_fpregs_active, regset_xregset_fpregs_active,
+ ssp_active;
+extern user_regset_get2_fn fpregs_get, xfpregs_get, fpregs_soft_get,
+ xstateregs_get, ssp_get;
+extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set,
+ xstateregs_set, ssp_set;
+
+/*
+ * xstateregs_active == regset_fpregs_active. Please refer to the comment
+ * at the definition of regset_fpregs_active.
+ */
+#define xstateregs_active regset_fpregs_active
+
+#endif /* _ASM_X86_FPU_REGSET_H */
diff --git a/arch/x86/include/asm/fpu/sched.h b/arch/x86/include/asm/fpu/sched.h
new file mode 100644
index 000000000000..89004f4ca208
--- /dev/null
+++ b/arch/x86/include/asm/fpu/sched.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_FPU_SCHED_H
+#define _ASM_X86_FPU_SCHED_H
+
+#include <linux/sched.h>
+
+#include <asm/cpufeature.h>
+#include <asm/fpu/types.h>
+
+#include <asm/trace/fpu.h>
+
+extern void save_fpregs_to_fpstate(struct fpu *fpu);
+extern void fpu__drop(struct task_struct *tsk);
+extern int fpu_clone(struct task_struct *dst, u64 clone_flags, bool minimal,
+ unsigned long shstk_addr);
+extern void fpu_flush_thread(void);
+
+/*
+ * FPU state switching for scheduling.
+ *
+ * switch_fpu() saves the old state and sets TIF_NEED_FPU_LOAD if
+ * TIF_NEED_FPU_LOAD is not set. This is done within the context
+ * of the old process.
+ *
+ * Once TIF_NEED_FPU_LOAD is set, the registers must be loaded
+ * from memory before returning to userland or before their
+ * content is otherwise used.
+ *
+ * The FPU context is only stored/restored for a user task and
+ * PF_KTHREAD is used to distinguish between kernel and user threads.
+ */
+static inline void switch_fpu(struct task_struct *old, int cpu)
+{
+ if (!test_tsk_thread_flag(old, TIF_NEED_FPU_LOAD) &&
+ cpu_feature_enabled(X86_FEATURE_FPU) &&
+ !(old->flags & (PF_KTHREAD | PF_USER_WORKER))) {
+ struct fpu *old_fpu = x86_task_fpu(old);
+
+ set_tsk_thread_flag(old, TIF_NEED_FPU_LOAD);
+ save_fpregs_to_fpstate(old_fpu);
+ /*
+ * The save operation preserved register state, so the
+ * fpu_fpregs_owner_ctx is still @old_fpu. Store the
+ * current CPU number in @old_fpu, so the next return
+ * to user space can avoid the FPU register restore
+	 * when it returns on the same CPU and still owns the
+ * context. See fpregs_restore_userregs().
+ */
+ old_fpu->last_cpu = cpu;
+
+ trace_x86_fpu_regs_deactivated(old_fpu);
+ }
+}
+
+#endif /* _ASM_X86_FPU_SCHED_H */
diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h
new file mode 100644
index 000000000000..eccc75bc9c4f
--- /dev/null
+++ b/arch/x86/include/asm/fpu/signal.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * x86 FPU signal frame handling methods:
+ */
+#ifndef _ASM_X86_FPU_SIGNAL_H
+#define _ASM_X86_FPU_SIGNAL_H
+
+#include <linux/compat.h>
+#include <linux/user.h>
+
+#include <asm/fpu/types.h>
+
+#ifdef CONFIG_X86_64
+# include <uapi/asm/sigcontext.h>
+# include <asm/user32.h>
+#else
+# define user_i387_ia32_struct user_i387_struct
+# define user32_fxsr_struct user_fxsr_struct
+#endif
+
+extern void convert_from_fxsr(struct user_i387_ia32_struct *env,
+ struct task_struct *tsk);
+extern void convert_to_fxsr(struct fxregs_state *fxsave,
+ const struct user_i387_ia32_struct *env);
+
+unsigned long
+fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
+ unsigned long *buf_fx, unsigned long *size);
+
+unsigned long fpu__get_fpstate_size(void);
+
+extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size, u32 pkru);
+extern void fpu__clear_user_states(struct fpu *fpu);
+extern bool fpu__restore_sig(void __user *buf, int ia32_frame);
+
+extern void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask);
+#endif /* _ASM_X86_FPU_SIGNAL_H */
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
new file mode 100644
index 000000000000..93e99d2583d6
--- /dev/null
+++ b/arch/x86/include/asm/fpu/types.h
@@ -0,0 +1,647 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * FPU data structures:
+ */
+#ifndef _ASM_X86_FPU_TYPES_H
+#define _ASM_X86_FPU_TYPES_H
+
+#include <asm/page_types.h>
+
+/*
+ * The legacy x87 FPU state format, as saved by FSAVE and
+ * restored by the FRSTOR instructions:
+ */
+struct fregs_state {
+ u32 cwd; /* FPU Control Word */
+ u32 swd; /* FPU Status Word */
+ u32 twd; /* FPU Tag Word */
+ u32 fip; /* FPU IP Offset */
+ u32 fcs; /* FPU IP Selector */
+ u32 foo; /* FPU Operand Pointer Offset */
+ u32 fos; /* FPU Operand Pointer Selector */
+
+ /* 8*10 bytes for each FP-reg = 80 bytes: */
+ u32 st_space[20];
+
+ /* Software status information [not touched by FSAVE]: */
+ u32 status;
+};
+
+/*
+ * The legacy fx SSE/MMX FPU state format, as saved by FXSAVE and
+ * restored by the FXRSTOR instructions. It's similar to the FSAVE
+ * format, but differs in some areas, plus has extensions at
+ * the end for the XMM registers.
+ */
+struct fxregs_state {
+ u16 cwd; /* Control Word */
+ u16 swd; /* Status Word */
+ u16 twd; /* Tag Word */
+ u16 fop; /* Last Instruction Opcode */
+ union {
+ struct {
+ u64 rip; /* Instruction Pointer */
+ u64 rdp; /* Data Pointer */
+ };
+ struct {
+ u32 fip; /* FPU IP Offset */
+ u32 fcs; /* FPU IP Selector */
+ u32 foo; /* FPU Operand Offset */
+ u32 fos; /* FPU Operand Selector */
+ };
+ };
+ u32 mxcsr; /* MXCSR Register State */
+ u32 mxcsr_mask; /* MXCSR Mask */
+
+ /* 8*16 bytes for each FP-reg = 128 bytes: */
+ u32 st_space[32];
+
+ /* 16*16 bytes for each XMM-reg = 256 bytes: */
+ u32 xmm_space[64];
+
+ u32 padding[12];
+
+ union {
+ u32 padding1[12];
+ u32 sw_reserved[12];
+ };
+
+} __attribute__((aligned(16)));
+
+/* Default value for fxregs_state.mxcsr: */
+#define MXCSR_DEFAULT 0x1f80
+
+/* Copy both mxcsr & mxcsr_mask with a single u64 memcpy: */
+#define MXCSR_AND_FLAGS_SIZE sizeof(u64)
+
+/*
+ * Software-based FPU emulation state. The layout is really arbitrary;
+ * it matches the x87 format to make it easier to understand:
+ */
+struct swregs_state {
+ u32 cwd;
+ u32 swd;
+ u32 twd;
+ u32 fip;
+ u32 fcs;
+ u32 foo;
+ u32 fos;
+ /* 8*10 bytes for each FP-reg = 80 bytes: */
+ u32 st_space[20];
+ u8 ftop;
+ u8 changed;
+ u8 lookahead;
+ u8 no_update;
+ u8 rm;
+ u8 alimit;
+ struct math_emu_info *info;
+ u32 entry_eip;
+};
+
+/*
+ * List of XSAVE features Linux knows about:
+ */
+enum xfeature {
+ XFEATURE_FP,
+ XFEATURE_SSE,
+ /*
+ * Values above here are "legacy states".
+ * Those below are "extended states".
+ */
+ XFEATURE_YMM,
+ XFEATURE_BNDREGS,
+ XFEATURE_BNDCSR,
+ XFEATURE_OPMASK,
+ XFEATURE_ZMM_Hi256,
+ XFEATURE_Hi16_ZMM,
+ XFEATURE_PT_UNIMPLEMENTED_SO_FAR,
+ XFEATURE_PKRU,
+ XFEATURE_PASID,
+ XFEATURE_CET_USER,
+ XFEATURE_CET_KERNEL,
+ XFEATURE_RSRVD_COMP_13,
+ XFEATURE_RSRVD_COMP_14,
+ XFEATURE_LBR,
+ XFEATURE_RSRVD_COMP_16,
+ XFEATURE_XTILE_CFG,
+ XFEATURE_XTILE_DATA,
+ XFEATURE_APX,
+
+ XFEATURE_MAX,
+};
+
+#define XFEATURE_MASK_FP (1 << XFEATURE_FP)
+#define XFEATURE_MASK_SSE (1 << XFEATURE_SSE)
+#define XFEATURE_MASK_YMM (1 << XFEATURE_YMM)
+#define XFEATURE_MASK_BNDREGS (1 << XFEATURE_BNDREGS)
+#define XFEATURE_MASK_BNDCSR (1 << XFEATURE_BNDCSR)
+#define XFEATURE_MASK_OPMASK (1 << XFEATURE_OPMASK)
+#define XFEATURE_MASK_ZMM_Hi256 (1 << XFEATURE_ZMM_Hi256)
+#define XFEATURE_MASK_Hi16_ZMM (1 << XFEATURE_Hi16_ZMM)
+#define XFEATURE_MASK_PT (1 << XFEATURE_PT_UNIMPLEMENTED_SO_FAR)
+#define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU)
+#define XFEATURE_MASK_PASID (1 << XFEATURE_PASID)
+#define XFEATURE_MASK_CET_USER (1 << XFEATURE_CET_USER)
+#define XFEATURE_MASK_CET_KERNEL (1 << XFEATURE_CET_KERNEL)
+#define XFEATURE_MASK_LBR (1 << XFEATURE_LBR)
+#define XFEATURE_MASK_XTILE_CFG (1 << XFEATURE_XTILE_CFG)
+#define XFEATURE_MASK_XTILE_DATA (1 << XFEATURE_XTILE_DATA)
+#define XFEATURE_MASK_APX (1 << XFEATURE_APX)
+
+#define XFEATURE_MASK_FPSSE (XFEATURE_MASK_FP | XFEATURE_MASK_SSE)
+#define XFEATURE_MASK_AVX512 (XFEATURE_MASK_OPMASK \
+ | XFEATURE_MASK_ZMM_Hi256 \
+ | XFEATURE_MASK_Hi16_ZMM)
+
+#ifdef CONFIG_X86_64
+# define XFEATURE_MASK_XTILE (XFEATURE_MASK_XTILE_DATA \
+ | XFEATURE_MASK_XTILE_CFG)
+#else
+# define XFEATURE_MASK_XTILE (0)
+#endif
+
+#define FIRST_EXTENDED_XFEATURE XFEATURE_YMM
+
+struct reg_128_bit {
+ u8 regbytes[128/8];
+};
+struct reg_256_bit {
+ u8 regbytes[256/8];
+};
+struct reg_512_bit {
+ u8 regbytes[512/8];
+};
+struct reg_1024_byte {
+ u8 regbytes[1024];
+};
+
+/*
+ * State component 2:
+ *
+ * There are 16x 256-bit AVX registers named YMM0-YMM15.
+ * The low 128 bits are aliased to the 16 SSE registers (XMM0-XMM15)
+ * and are stored in 'struct fxregs_state::xmm_space[]' in the
+ * "legacy" area.
+ *
+ * The high 128 bits are stored here.
+ */
+struct ymmh_struct {
+ struct reg_128_bit hi_ymm[16];
+} __packed;
+
+/* Intel MPX support: */
+
+struct mpx_bndreg {
+ u64 lower_bound;
+ u64 upper_bound;
+} __packed;
+/*
+ * State component 3 is used for the 4 128-bit bounds registers
+ */
+struct mpx_bndreg_state {
+ struct mpx_bndreg bndreg[4];
+} __packed;
+
+/*
+ * State component 4 is used for the 64-bit user-mode MPX
+ * configuration register BNDCFGU and the 64-bit MPX status
+ * register BNDSTATUS. We call the pair "BNDCSR".
+ */
+struct mpx_bndcsr {
+ u64 bndcfgu;
+ u64 bndstatus;
+} __packed;
+
+/*
+ * The BNDCSR state is padded out to be 64 bytes in size.
+ */
+struct mpx_bndcsr_state {
+ union {
+ struct mpx_bndcsr bndcsr;
+ u8 pad_to_64_bytes[64];
+ };
+} __packed;
+
+/* AVX-512 Components: */
+
+/*
+ * State component 5 is used for the 8 64-bit opmask registers
+ * k0-k7 (opmask state).
+ */
+struct avx_512_opmask_state {
+ u64 opmask_reg[8];
+} __packed;
+
+/*
+ * State component 6 is used for the upper 256 bits of the
+ * registers ZMM0-ZMM15. These 16 256-bit values are denoted
+ * ZMM0_H-ZMM15_H (ZMM_Hi256 state).
+ */
+struct avx_512_zmm_uppers_state {
+ struct reg_256_bit zmm_upper[16];
+} __packed;
+
+/*
+ * State component 7 is used for the 16 512-bit registers
+ * ZMM16-ZMM31 (Hi16_ZMM state).
+ */
+struct avx_512_hi16_state {
+ struct reg_512_bit hi16_zmm[16];
+} __packed;
+
+/*
+ * State component 9: 32-bit PKRU register. The state is
+ * 8 bytes long but only 4 bytes are used currently.
+ */
+struct pkru_state {
+ u32 pkru;
+ u32 pad;
+} __packed;
+
+/*
+ * State component 11 is Control-flow Enforcement user states
+ */
+struct cet_user_state {
+ /* user control-flow settings */
+ u64 user_cet;
+ /* user shadow stack pointer */
+ u64 user_ssp;
+};
+
+/*
+ * State component 12 is Control-flow Enforcement supervisor states.
+ * This state includes SSP pointers for privilege levels 0 through 2.
+ */
+struct cet_supervisor_state {
+ u64 pl0_ssp;
+ u64 pl1_ssp;
+ u64 pl2_ssp;
+} __packed;
+
+/*
+ * State component 15: Architectural LBR configuration state.
+ * The size of Arch LBR state depends on the number of LBRs (lbr_depth).
+ */
+
+struct lbr_entry {
+ u64 from;
+ u64 to;
+ u64 info;
+};
+
+struct arch_lbr_state {
+ u64 lbr_ctl;
+ u64 lbr_depth;
+ u64 ler_from;
+ u64 ler_to;
+ u64 ler_info;
+ struct lbr_entry entries[];
+};
+
+/*
+ * State component 17: 64-byte tile configuration register.
+ */
+struct xtile_cfg {
+ u64 tcfg[8];
+} __packed;
+
+/*
+ * State component 18: 1KB tile data register.
+ * Each register represents 16 64-byte rows of the matrix
+ * data. But the number of registers depends on the actual
+ * implementation.
+ */
+struct xtile_data {
+ struct reg_1024_byte tmm;
+} __packed;
+
+/*
+ * State component 19: 16 8-byte extended general purpose registers (APX).
+ */
+struct apx_state {
+ u64 egpr[16];
+} __packed;
+
+/*
+ * State component 10 is supervisor state used for context-switching the
+ * PASID state.
+ */
+struct ia32_pasid_state {
+ u64 pasid;
+} __packed;
+
+struct xstate_header {
+ u64 xfeatures;
+ u64 xcomp_bv;
+ u64 reserved[6];
+} __attribute__((packed));
+
+/*
+ * xstate_header.xcomp_bv[63] indicates that the extended_state_area
+ * is in compacted format.
+ */
+#define XCOMP_BV_COMPACTED_FORMAT ((u64)1 << 63)
+
+/*
+ * This is our most modern FPU state format, as saved by the XSAVE
+ * and restored by the XRSTOR instructions.
+ *
+ * It consists of a legacy fxregs portion, an xstate header and
+ * subsequent areas as defined by the xstate header. Not all CPUs
+ * support all the extensions, so the size of the extended area
+ * can vary quite a bit between CPUs.
+ */
+struct xregs_state {
+ struct fxregs_state i387;
+ struct xstate_header header;
+ u8 extended_state_area[];
+} __attribute__ ((packed, aligned (64)));
+
+/*
+ * This is a union of all the possible FPU state formats
+ * put together, so that we can pick the right one at runtime.
+ *
+ * The size of the structure is determined by the largest
+ * member - which is the xsave area. The padding is there
+ * to ensure that statically-allocated task_structs (just
+ * the init_task today) have enough space.
+ */
+union fpregs_state {
+ struct fregs_state fsave;
+ struct fxregs_state fxsave;
+ struct swregs_state soft;
+ struct xregs_state xsave;
+ u8 __padding[PAGE_SIZE];
+};
+
+struct fpstate {
+ /* @kernel_size: The size of the kernel register image */
+ unsigned int size;
+
+ /* @user_size: The size in non-compacted UABI format */
+ unsigned int user_size;
+
+ /* @xfeatures: xfeatures for which the storage is sized */
+ u64 xfeatures;
+
+ /* @user_xfeatures: xfeatures valid in UABI buffers */
+ u64 user_xfeatures;
+
+ /* @xfd: xfeatures disabled to trap userspace use. */
+ u64 xfd;
+
+ /* @is_valloc: Indicator for dynamically allocated state */
+ unsigned int is_valloc : 1;
+
+ /* @is_guest: Indicator for guest state (KVM) */
+ unsigned int is_guest : 1;
+
+ /*
+ * @is_confidential: Indicator for KVM confidential mode.
+ * The FPU registers are restored by the
+ * vmentry firmware from encrypted guest
+ * memory. On vmexit the FPU registers are
+ * saved by firmware to encrypted guest memory
+ * and the registers are scrubbed before
+ * returning to the host. So there is no
+ * content which is worth saving and restoring.
+ * The fpstate has to be there so that
+ * preemption and softirq FPU usage works
+ * without special casing.
+ */
+ unsigned int is_confidential : 1;
+
+ /* @in_use: State is in use */
+ unsigned int in_use : 1;
+
+ /* @regs: The register state union for all supported formats */
+ union fpregs_state regs;
+
+ /* @regs is dynamically sized! Don't add anything after @regs! */
+} __aligned(64);
+
+#define FPU_GUEST_PERM_LOCKED BIT_ULL(63)
+
+struct fpu_state_perm {
+ /*
+ * @__state_perm:
+ *
+ * This bitmap indicates the permission for state components
+ * available to a thread group, including both user and supervisor
+ * components and software-defined bits like FPU_GUEST_PERM_LOCKED.
+ * The permission prctl() sets the enabled state bits in
+ * thread_group_leader()->thread.fpu.
+ *
+ * All run time operations use the per thread information in the
+ * currently active fpu.fpstate which contains the xfeature masks
+ * and sizes for kernel and user space.
+ *
+	 * This master permission field is only to be used when
+	 * task.fpu.fpstate based checks fail to validate whether the task
+	 * is allowed to expand its xfeatures set, which requires
+	 * allocating a larger fpstate buffer.
+ *
+ * Do not access this field directly. Use the provided helper
+ * function. Unlocked access is possible for quick checks.
+ */
+ u64 __state_perm;
+
+ /*
+ * @__state_size:
+ *
+ * The size required for @__state_perm. Only valid to access
+ * with sighand locked.
+ */
+ unsigned int __state_size;
+
+ /*
+ * @__user_state_size:
+ *
+ * The size required for @__state_perm user part. Only valid to
+ * access with sighand locked.
+ */
+ unsigned int __user_state_size;
+};
+
+/*
+ * Highest level per task FPU state data structure that
+ * contains the FPU register state plus various FPU
+ * state fields:
+ */
+struct fpu {
+ /*
+ * @last_cpu:
+ *
+ * Records the last CPU on which this context was loaded into
+ * FPU registers. (In the lazy-restore case we might be
+ * able to reuse FPU registers across multiple context switches
+ * this way, if no intermediate task used the FPU.)
+ *
+ * A value of -1 is used to indicate that the FPU state in context
+ * memory is newer than the FPU state in registers, and that the
+ * FPU state should be reloaded next time the task is run.
+ */
+ unsigned int last_cpu;
+
+ /*
+ * @avx512_timestamp:
+ *
+ * Records the timestamp of AVX512 use during last context switch.
+ */
+ unsigned long avx512_timestamp;
+
+ /*
+ * @fpstate:
+ *
+ * Pointer to the active struct fpstate. Initialized to
+ * point at @__fpstate below.
+ */
+ struct fpstate *fpstate;
+
+ /*
+ * @__task_fpstate:
+ *
+ * Pointer to an inactive struct fpstate. Initialized to NULL. Is
+ * used only for KVM support to swap out the regular task fpstate.
+ */
+ struct fpstate *__task_fpstate;
+
+ /*
+ * @perm:
+ *
+ * Permission related information
+ */
+ struct fpu_state_perm perm;
+
+ /*
+ * @guest_perm:
+ *
+ * Permission related information for guest pseudo FPUs
+ */
+ struct fpu_state_perm guest_perm;
+
+ /*
+ * @__fpstate:
+ *
+ * Initial in-memory storage for FPU registers which are saved in
+ * context switch and when the kernel uses the FPU. The registers
+ * are restored from this storage on return to user space if they
+	 * no longer contain the task's FPU register state.
+ */
+ struct fpstate __fpstate;
+ /*
+ * WARNING: '__fpstate' is dynamically-sized. Do not put
+ * anything after it here.
+ */
+};
+
+/*
+ * Guest pseudo FPU container
+ */
+struct fpu_guest {
+ /*
+ * @xfeatures: xfeature bitmap of features which are
+ * currently enabled for the guest vCPU.
+ */
+ u64 xfeatures;
+
+ /*
+	 * @xfd_err: Saved guest MSR_IA32_XFD_ERR value.
+ */
+ u64 xfd_err;
+
+ /*
+ * @uabi_size: Size required for save/restore
+ */
+ unsigned int uabi_size;
+
+ /*
+ * @fpstate: Pointer to the allocated guest fpstate
+ */
+ struct fpstate *fpstate;
+};
+
+/*
+ * FPU state configuration data for fpu_guest.
+ * Initialized at boot time. Read only after init.
+ */
+struct vcpu_fpu_config {
+ /*
+ * @size:
+ *
+ * The default size of the register state buffer in guest FPUs.
+ * Includes all supported features except independent managed
+ * features and features which have to be requested by user space
+ * before usage.
+ */
+ unsigned int size;
+
+ /*
+ * @features:
+ *
+ * The default supported features bitmap in guest FPUs. Does not
+ * include independent managed features and features which have to
+ * be requested by user space before usage.
+ */
+ u64 features;
+};
+
+/*
+ * FPU state configuration data. Initialized at boot time. Read only after init.
+ */
+struct fpu_state_config {
+ /*
+ * @max_size:
+ *
+ * The maximum size of the register state buffer. Includes all
+ * supported features except independent managed features.
+ */
+ unsigned int max_size;
+
+ /*
+ * @default_size:
+ *
+ * The default size of the register state buffer. Includes all
+ * supported features except independent managed features,
+ * guest-only features and features which have to be requested by
+ * user space before usage.
+ */
+ unsigned int default_size;
+
+ /*
+ * @max_features:
+ *
+ * The maximum supported features bitmap. Does not include
+ * independent managed features.
+ */
+ u64 max_features;
+
+ /*
+ * @default_features:
+ *
+ * The default supported features bitmap. Does not include
+ * independent managed features, guest-only features and features
+ * which have to be requested by user space before usage.
+ */
+ u64 default_features;
+ /*
+ * @legacy_features:
+ *
+ * Features which can be reported back to user space
+ * even without XSAVE support, i.e. legacy features FP + SSE
+ */
+ u64 legacy_features;
+ /*
+ * @independent_features:
+ *
+ * Features that are supported by XSAVES, but not managed as part of
+ * the FPU core, such as LBR
+ */
+ u64 independent_features;
+};
+
+/* FPU state configuration information */
+extern struct fpu_state_config fpu_kernel_cfg, fpu_user_cfg;
+extern struct vcpu_fpu_config guest_default_cfg;
+
+#endif /* _ASM_X86_FPU_TYPES_H */
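
A minimal sketch of how the XFEATURE_MASK_* constants above combine with cpu_has_xfeatures() from <asm/fpu/api.h>; the caller and message are illustrative only:

    #include <linux/errno.h>
    #include <linux/printk.h>
    #include <asm/fpu/api.h>
    #include <asm/fpu/types.h>

    static int example_require_avx512(void)
    {
            const char *name;

            /* All three AVX-512 state components must be present together. */
            if (!cpu_has_xfeatures(XFEATURE_MASK_AVX512, &name)) {
                    pr_info("missing xfeature: %s\n", name);
                    return -ENODEV;
            }
            return 0;
    }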
diff --git a/arch/x86/include/asm/fpu/xcr.h b/arch/x86/include/asm/fpu/xcr.h
new file mode 100644
index 000000000000..9a710c060445
--- /dev/null
+++ b/arch/x86/include/asm/fpu/xcr.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_FPU_XCR_H
+#define _ASM_X86_FPU_XCR_H
+
+#define XCR_XFEATURE_ENABLED_MASK 0x00000000
+#define XCR_XFEATURE_IN_USE_MASK 0x00000001
+
+static __always_inline u64 xgetbv(u32 index)
+{
+ u32 eax, edx;
+
+ asm volatile("xgetbv" : "=a" (eax), "=d" (edx) : "c" (index));
+ return eax + ((u64)edx << 32);
+}
+
+static inline void xsetbv(u32 index, u64 value)
+{
+ u32 eax = value;
+ u32 edx = value >> 32;
+
+ asm volatile("xsetbv" :: "a" (eax), "d" (edx), "c" (index));
+}
+
+/*
+ * Return a mask of xfeatures which are currently being tracked
+ * by the processor as being in the initial configuration.
+ *
+ * Callers should check X86_FEATURE_XGETBV1.
+ */
+static __always_inline u64 xfeatures_in_use(void)
+{
+ return xgetbv(XCR_XFEATURE_IN_USE_MASK);
+}
+
+#endif /* _ASM_X86_FPU_XCR_H */
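
For orientation, a hedged sketch of reading XCR0 through the xgetbv() helper above to check whether the OS has enabled AVX state management (assumes OSXSAVE is set):

    #include <asm/fpu/types.h>
    #include <asm/fpu/xcr.h>

    static bool example_os_enabled_avx(void)
    {
            /* XCR0 enumerates the xfeatures the OS lets XSAVE manage. */
            u64 xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
            u64 want = XFEATURE_MASK_FPSSE | XFEATURE_MASK_YMM;

            return (xcr0 & want) == want;
    }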
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
new file mode 100644
index 000000000000..7a7dc9d56027
--- /dev/null
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -0,0 +1,130 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_X86_XSAVE_H
+#define __ASM_X86_XSAVE_H
+
+#include <linux/uaccess.h>
+#include <linux/types.h>
+
+#include <asm/processor.h>
+#include <asm/fpu/api.h>
+#include <asm/user.h>
+
+/* Bit 63 of XCR0 is reserved for future expansion */
+#define XFEATURE_MASK_EXTEND (~(XFEATURE_MASK_FPSSE | (1ULL << 63)))
+
+#define FXSAVE_SIZE 512
+
+#define XSAVE_HDR_SIZE 64
+#define XSAVE_HDR_OFFSET FXSAVE_SIZE
+
+#define XSAVE_YMM_SIZE 256
+#define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)
+
+#define XSAVE_ALIGNMENT 64
+
+/* All currently supported user features */
+#define XFEATURE_MASK_USER_SUPPORTED (XFEATURE_MASK_FP | \
+ XFEATURE_MASK_SSE | \
+ XFEATURE_MASK_YMM | \
+ XFEATURE_MASK_OPMASK | \
+ XFEATURE_MASK_ZMM_Hi256 | \
+ XFEATURE_MASK_Hi16_ZMM | \
+ XFEATURE_MASK_PKRU | \
+ XFEATURE_MASK_BNDREGS | \
+ XFEATURE_MASK_BNDCSR | \
+ XFEATURE_MASK_XTILE | \
+ XFEATURE_MASK_APX)
+
+/*
+ * Features which are restored when returning to user space.
+ * PKRU is not restored on return to user space because PKRU
+ * is switched eagerly in switch_to() and flush_thread()
+ */
+#define XFEATURE_MASK_USER_RESTORE \
+ (XFEATURE_MASK_USER_SUPPORTED & ~XFEATURE_MASK_PKRU)
+
+/* Features which are dynamically enabled for a process on request */
+#define XFEATURE_MASK_USER_DYNAMIC XFEATURE_MASK_XTILE_DATA
+
+/* Supervisor features which are enabled only in guest FPUs */
+#define XFEATURE_MASK_GUEST_SUPERVISOR XFEATURE_MASK_CET_KERNEL
+
+/* All currently supported supervisor features */
+#define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID | \
+ XFEATURE_MASK_CET_USER | \
+ XFEATURE_MASK_GUEST_SUPERVISOR)
+
+/*
+ * A supervisor state component may not always contain valuable information,
+ * and its size may be huge. Saving/restoring such supervisor state components
+ * at each context switch can cause high CPU and space overhead, which should
+ * be avoided. Such supervisor state components should only be saved/restored
+ * on demand. The on-demand supervisor features are set in this mask.
+ *
+ * Unlike the existing supported supervisor features, an independent supervisor
+ * feature does not allocate a buffer in task->fpu, and the corresponding
+ * supervisor state component cannot be saved/restored at each context switch.
+ *
+ * To support an independent supervisor feature, a developer should follow the
+ * dos and don'ts as below:
+ * - Do dynamically allocate a buffer for the supervisor state component.
+ * - Do manually invoke the XSAVES/XRSTORS instruction to save/restore the
+ * state component to/from the buffer.
+ * - Don't set the bit corresponding to the independent supervisor feature in
+ * IA32_XSS at run time, since it has been set at boot time.
+ */
+#define XFEATURE_MASK_INDEPENDENT (XFEATURE_MASK_LBR)
+
+/*
+ * Unsupported supervisor features. When a supervisor feature in this mask is
+ * supported in the future, move it to the supported supervisor feature mask.
+ */
+#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT)
+
+/* All supervisor states including supported and unsupported states. */
+#define XFEATURE_MASK_SUPERVISOR_ALL (XFEATURE_MASK_SUPERVISOR_SUPPORTED | \
+ XFEATURE_MASK_INDEPENDENT | \
+ XFEATURE_MASK_SUPERVISOR_UNSUPPORTED)
+
+/*
+ * The feature mask required to restore FPU state:
+ * - All user states which are not eagerly switched in switch_to()/exec()
+ * - The supervisor states
+ */
+#define XFEATURE_MASK_FPSTATE (XFEATURE_MASK_USER_RESTORE | \
+ XFEATURE_MASK_SUPERVISOR_SUPPORTED)
+
+/*
+ * Features in this mask have space allocated in the signal frame, but may not
+ * have that space initialized when the feature is in its init state.
+ */
+#define XFEATURE_MASK_SIGFRAME_INITOPT (XFEATURE_MASK_XTILE | \
+ XFEATURE_MASK_USER_DYNAMIC)
+
+extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
+
+extern void __init update_regset_xstate_info(unsigned int size,
+ u64 xstate_mask);
+
+int xfeature_size(int xfeature_nr);
+
+void xsaves(struct xregs_state *xsave, u64 mask);
+void xrstors(struct xregs_state *xsave, u64 mask);
+
+int xfd_enable_feature(u64 xfd_err);
+
+#ifdef CONFIG_X86_64
+DECLARE_STATIC_KEY_FALSE(__fpu_state_size_dynamic);
+
+static __always_inline __pure bool fpu_state_size_dynamic(void)
+{
+ return static_branch_unlikely(&__fpu_state_size_dynamic);
+}
+#else
+static __always_inline __pure bool fpu_state_size_dynamic(void)
+{
+ return false;
+}
+#endif
+
+#endif
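
The dos and don'ts for independent features above translate into roughly the following pattern -- an illustrative sketch, not taken from an in-tree user (the real consumer is the Arch LBR code, which keeps per-CPU buffers):

    #include <linux/slab.h>
    #include <asm/fpu/xstate.h>

    static struct kmem_cache *lbr_xstate_cache;     /* illustrative */

    static int example_lbr_buf_init(void)
    {
            /* Legacy area + header + the LBR component itself. */
            unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE +
                                xfeature_size(XFEATURE_LBR);

            /* XSAVES demands a 64-byte aligned buffer. */
            lbr_xstate_cache = kmem_cache_create("example_lbr_xstate", size,
                                                 XSAVE_ALIGNMENT, 0, NULL);
            return lbr_xstate_cache ? 0 : -ENOMEM;
    }

    static void example_lbr_save(struct xregs_state *buf)
    {
            /* Manually save only the independent LBR component. */
            xsaves(buf, XFEATURE_MASK_LBR);
    }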
diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h
index 06850a7194e1..0ab65073c1cc 100644
--- a/arch/x86/include/asm/frame.h
+++ b/arch/x86/include/asm/frame.h
@@ -1,27 +1,113 @@
-#ifdef __ASSEMBLY__
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_FRAME_H
+#define _ASM_X86_FRAME_H
-#include <asm/dwarf2.h>
+#include <asm/asm.h>
+
+/*
+ * These are stack frame creation macros. They should be used by every
+ * callable non-leaf asm function to make kernel stack traces more reliable.
+ */
-/* The annotation hides the frame from the unwinder and makes it look
- like a ordinary ebp save/restore. This avoids some special cases for
- frame pointer later */
#ifdef CONFIG_FRAME_POINTER
- .macro FRAME
- pushl %ebp
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ebp,0
- movl %esp,%ebp
- .endm
- .macro ENDFRAME
- popl %ebp
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE ebp
- .endm
-#else
- .macro FRAME
- .endm
- .macro ENDFRAME
- .endm
+
+#ifdef __ASSEMBLER__
+
+.macro FRAME_BEGIN
+ push %_ASM_BP
+ _ASM_MOV %_ASM_SP, %_ASM_BP
+.endm
+
+.macro FRAME_END
+ pop %_ASM_BP
+.endm
+
+#ifdef CONFIG_X86_64
+/*
+ * This is a sneaky trick to help the unwinder find pt_regs on the stack. The
+ * frame pointer is replaced with an encoded pointer to pt_regs. The encoding
+ * is just setting the LSB, which makes it an invalid stack address and is also
+ * a signal to the unwinder that it's a pt_regs pointer in disguise.
+ *
+ * NOTE: This macro must be used *after* PUSH_AND_CLEAR_REGS because it corrupts
+ * the original rbp.
+ */
+.macro ENCODE_FRAME_POINTER ptregs_offset=0
+ leaq 1+\ptregs_offset(%rsp), %rbp
+.endm
+#else /* !CONFIG_X86_64 */
+/*
+ * This is a sneaky trick to help the unwinder find pt_regs on the stack. The
+ * frame pointer is replaced with an encoded pointer to pt_regs. The encoding
+ * is just clearing the MSB, which makes it an invalid stack address and is also
+ * a signal to the unwinder that it's a pt_regs pointer in disguise.
+ *
+ * NOTE: This macro must be used *after* SAVE_ALL because it corrupts the
+ * original ebp.
+ */
+.macro ENCODE_FRAME_POINTER
+ mov %esp, %ebp
+ andl $0x7fffffff, %ebp
+.endm
+#endif /* CONFIG_X86_64 */
+
+#else /* !__ASSEMBLER__ */
+
+#define FRAME_BEGIN \
+ "push %" _ASM_BP "\n" \
+ _ASM_MOV "%" _ASM_SP ", %" _ASM_BP "\n"
+
+#define FRAME_END "pop %" _ASM_BP "\n"
+
+#ifdef CONFIG_X86_64
+
+#define ENCODE_FRAME_POINTER \
+ "lea 1(%rsp), %rbp\n\t"
+
+static inline unsigned long encode_frame_pointer(struct pt_regs *regs)
+{
+ return (unsigned long)regs + 1;
+}
+
+#else /* !CONFIG_X86_64 */
+
+#define ENCODE_FRAME_POINTER \
+ "movl %esp, %ebp\n\t" \
+ "andl $0x7fffffff, %ebp\n\t"
+
+static inline unsigned long encode_frame_pointer(struct pt_regs *regs)
+{
+ return (unsigned long)regs & 0x7fffffff;
+}
+
+#endif /* CONFIG_X86_64 */
+
+#endif /* __ASSEMBLER__ */
+
+#define FRAME_OFFSET __ASM_SEL(4, 8)
+
+#else /* !CONFIG_FRAME_POINTER */
+
+#ifdef __ASSEMBLER__
+
+.macro ENCODE_FRAME_POINTER ptregs_offset=0
+.endm
+
+#else /* !__ASSEMBLER__ */
+
+#define ENCODE_FRAME_POINTER
+
+static inline unsigned long encode_frame_pointer(struct pt_regs *regs)
+{
+ return 0;
+}
+
#endif
-#endif /* __ASSEMBLY__ */
+#define FRAME_BEGIN
+#define FRAME_END
+#define FRAME_OFFSET 0
+
+#endif /* CONFIG_FRAME_POINTER */
+
+#endif /* _ASM_X86_FRAME_H */
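
For the C-string forms above, a hedged x86-64 sketch of wrapping a non-leaf inline-asm sequence so the unwinder sees a proper frame; the called symbol is hypothetical and the clobber list names the remaining call-clobbered registers:

    #include <asm/frame.h>

    static unsigned long example_call_thunk(unsigned long arg)
    {
            unsigned long ret;

            asm volatile(FRAME_BEGIN
                         "call example_asm_helper\n\t"  /* hypothetical */
                         FRAME_END
                         : "=a" (ret)
                         : "D" (arg)
                         : "rcx", "rdx", "rsi", "r8", "r9",
                           "r10", "r11", "memory", "cc");
            return ret;
    }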
diff --git a/arch/x86/include/asm/fred.h b/arch/x86/include/asm/fred.h
new file mode 100644
index 000000000000..2bb65677c079
--- /dev/null
+++ b/arch/x86/include/asm/fred.h
@@ -0,0 +1,119 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Macros for Flexible Return and Event Delivery (FRED)
+ */
+
+#ifndef ASM_X86_FRED_H
+#define ASM_X86_FRED_H
+
+#include <linux/const.h>
+
+#include <asm/asm.h>
+#include <asm/msr.h>
+#include <asm/trapnr.h>
+
+/*
+ * FRED event return instruction opcodes for ERET{S,U}; supported in
+ * binutils >= 2.41.
+ */
+#define ERETS _ASM_BYTES(0xf2,0x0f,0x01,0xca)
+#define ERETU _ASM_BYTES(0xf3,0x0f,0x01,0xca)
+
+/*
+ * RSP is aligned to a 64-byte boundary before used to push a new stack frame
+ */
+#define FRED_STACK_FRAME_RSP_MASK _AT(unsigned long, (~0x3f))
+
+/*
+ * Used for the return address for call emulation during code patching,
+ * and measured in 64-byte cache lines.
+ */
+#define FRED_CONFIG_REDZONE_AMOUNT 1
+#define FRED_CONFIG_REDZONE (_AT(unsigned long, FRED_CONFIG_REDZONE_AMOUNT) << 6)
+#define FRED_CONFIG_INT_STKLVL(l) (_AT(unsigned long, l) << 9)
+#define FRED_CONFIG_ENTRYPOINT(p) _AT(unsigned long, (p))
+
+#ifndef __ASSEMBLER__
+
+#ifdef CONFIG_X86_FRED
+#include <linux/kernel.h>
+#include <linux/sched/task_stack.h>
+
+#include <asm/ptrace.h>
+
+struct fred_info {
+ /* Event data: CR2, DR6, ... */
+ unsigned long edata;
+ unsigned long resv;
+};
+
+/* Full format of the FRED stack frame */
+struct fred_frame {
+ struct pt_regs regs;
+ struct fred_info info;
+};
+
+static __always_inline struct fred_info *fred_info(struct pt_regs *regs)
+{
+ return &container_of(regs, struct fred_frame, regs)->info;
+}
+
+static __always_inline unsigned long fred_event_data(struct pt_regs *regs)
+{
+ return fred_info(regs)->edata;
+}
+
+void asm_fred_entrypoint_user(void);
+void asm_fred_entrypoint_kernel(void);
+void asm_fred_entry_from_kvm(struct fred_ss);
+
+__visible void fred_entry_from_user(struct pt_regs *regs);
+__visible void fred_entry_from_kernel(struct pt_regs *regs);
+__visible void __fred_entry_from_kvm(struct pt_regs *regs);
+
+/* Can be called from noinstr code, thus __always_inline */
+static __always_inline void fred_entry_from_kvm(unsigned int type, unsigned int vector)
+{
+ struct fred_ss ss = {
+		.ss = __KERNEL_DS,
+ .type = type,
+ .vector = vector,
+ .nmi = type == EVENT_TYPE_NMI,
+ .l = 1,
+ };
+
+ asm_fred_entry_from_kvm(ss);
+}
+
+void cpu_init_fred_exceptions(void);
+void cpu_init_fred_rsps(void);
+void fred_complete_exception_setup(void);
+
+DECLARE_PER_CPU(unsigned long, fred_rsp0);
+
+static __always_inline void fred_sync_rsp0(unsigned long rsp0)
+{
+ __this_cpu_write(fred_rsp0, rsp0);
+}
+
+static __always_inline void fred_update_rsp0(void)
+{
+ unsigned long rsp0 = (unsigned long) task_stack_page(current) + THREAD_SIZE;
+
+ if (cpu_feature_enabled(X86_FEATURE_FRED) && (__this_cpu_read(fred_rsp0) != rsp0)) {
+ wrmsrns(MSR_IA32_FRED_RSP0, rsp0);
+ __this_cpu_write(fred_rsp0, rsp0);
+ }
+}
+#else /* CONFIG_X86_FRED */
+static __always_inline unsigned long fred_event_data(struct pt_regs *regs) { return 0; }
+static inline void cpu_init_fred_exceptions(void) { }
+static inline void cpu_init_fred_rsps(void) { }
+static inline void fred_complete_exception_setup(void) { }
+static inline void fred_entry_from_kvm(unsigned int type, unsigned int vector) { }
+static inline void fred_sync_rsp0(unsigned long rsp0) { }
+static inline void fred_update_rsp0(void) { }
+#endif /* CONFIG_X86_FRED */
+#endif /* !__ASSEMBLER__ */
+
+#endif /* ASM_X86_FRED_H */
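
To show how the FRED_CONFIG_* pieces above compose, a sketch modelled on what cpu_init_fred_exceptions() is expected to program (MSR_IA32_FRED_CONFIG comes from <asm/msr-index.h>):

    #include <asm/fred.h>
    #include <asm/msr.h>

    static void example_program_fred_config(void)
    {
            /*
             * Ring-3 entry point, one 64-byte redzone for call emulation
             * during code patching, stack level 0 for regular events.
             */
            u64 cfg = FRED_CONFIG_ENTRYPOINT(asm_fred_entrypoint_user) |
                      FRED_CONFIG_REDZONE |
                      FRED_CONFIG_INT_STKLVL(0);

            wrmsrq(MSR_IA32_FRED_CONFIG, cfg);
    }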
diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h
new file mode 100644
index 000000000000..ab2547f97c2c
--- /dev/null
+++ b/arch/x86/include/asm/fsgsbase.h
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_FSGSBASE_H
+#define _ASM_FSGSBASE_H
+
+#ifndef __ASSEMBLER__
+
+#ifdef CONFIG_X86_64
+
+#include <asm/msr.h>
+
+/*
+ * Read/write a task's FSBASE or GSBASE. This returns the value that
+ * the FS/GS base would have (if the task were to be resumed). These
+ * work on the current task or on a non-running (typically stopped
+ * ptrace child) task.
+ */
+extern unsigned long x86_fsbase_read_task(struct task_struct *task);
+extern unsigned long x86_gsbase_read_task(struct task_struct *task);
+extern void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase);
+extern void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase);
+
+/* Must be protected by X86_FEATURE_FSGSBASE check. */
+
+static __always_inline unsigned long rdfsbase(void)
+{
+ unsigned long fsbase;
+
+ asm volatile("rdfsbase %0" : "=r" (fsbase) :: "memory");
+
+ return fsbase;
+}
+
+static __always_inline unsigned long rdgsbase(void)
+{
+ unsigned long gsbase;
+
+ asm volatile("rdgsbase %0" : "=r" (gsbase) :: "memory");
+
+ return gsbase;
+}
+
+static __always_inline void wrfsbase(unsigned long fsbase)
+{
+ asm volatile("wrfsbase %0" :: "r" (fsbase) : "memory");
+}
+
+static __always_inline void wrgsbase(unsigned long gsbase)
+{
+ asm volatile("wrgsbase %0" :: "r" (gsbase) : "memory");
+}
+
+#include <asm/cpufeature.h>
+
+/* Helper functions for reading/writing FS/GS base */
+
+static inline unsigned long x86_fsbase_read_cpu(void)
+{
+ unsigned long fsbase;
+
+ if (boot_cpu_has(X86_FEATURE_FSGSBASE))
+ fsbase = rdfsbase();
+ else
+ rdmsrq(MSR_FS_BASE, fsbase);
+
+ return fsbase;
+}
+
+static inline void x86_fsbase_write_cpu(unsigned long fsbase)
+{
+ if (boot_cpu_has(X86_FEATURE_FSGSBASE))
+ wrfsbase(fsbase);
+ else
+ wrmsrq(MSR_FS_BASE, fsbase);
+}
+
+extern unsigned long x86_gsbase_read_cpu_inactive(void);
+extern void x86_gsbase_write_cpu_inactive(unsigned long gsbase);
+extern unsigned long x86_fsgsbase_read_task(struct task_struct *task,
+ unsigned short selector);
+
+#endif /* CONFIG_X86_64 */
+
+#endif /* __ASSEMBLER__ */
+
+#endif /* _ASM_FSGSBASE_H */
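
The split between the *_cpu and *_task helpers above matters mostly to ptrace; a minimal x86-64 sketch of each side (assumes the tracee is stopped, per the comment above):

    #include <linux/printk.h>
    #include <linux/sched.h>
    #include <asm/fsgsbase.h>

    static void example_read_bases(struct task_struct *stopped_child)
    {
            /* Current CPU: RDFSBASE when available, MSR_FS_BASE otherwise. */
            unsigned long self_fs = x86_fsbase_read_cpu();

            /* Stopped tracee: the value its GS base will resume with. */
            unsigned long child_gs = x86_gsbase_read_task(stopped_child);

            pr_debug("fs=%lx child gs=%lx\n", self_fs, child_gs);
    }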
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index bd2c6511c887..b08c95872eed 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -1,57 +1,109 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_FTRACE_H
#define _ASM_X86_FTRACE_H
-#ifdef __ASSEMBLY__
-
- .macro MCOUNT_SAVE_FRAME
- /* taken from glibc */
- subq $0x38, %rsp
- movq %rax, (%rsp)
- movq %rcx, 8(%rsp)
- movq %rdx, 16(%rsp)
- movq %rsi, 24(%rsp)
- movq %rdi, 32(%rsp)
- movq %r8, 40(%rsp)
- movq %r9, 48(%rsp)
- .endm
-
- .macro MCOUNT_RESTORE_FRAME
- movq 48(%rsp), %r9
- movq 40(%rsp), %r8
- movq 32(%rsp), %rdi
- movq 24(%rsp), %rsi
- movq 16(%rsp), %rdx
- movq 8(%rsp), %rcx
- movq (%rsp), %rax
- addq $0x38, %rsp
- .endm
+#include <asm/ptrace.h>
+#ifdef CONFIG_FUNCTION_TRACER
+#ifndef CC_USING_FENTRY
+# error Compiler does not support fentry?
#endif
+# define MCOUNT_ADDR ((unsigned long)(__fentry__))
+#define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */
-/* FIXME: I don't want to stay hardcoded */
-#ifdef CONFIG_X86_64
-# define FTRACE_SYSCALL_MAX 296
-#else
-# define FTRACE_SYSCALL_MAX 333
+/* Ignore unused weak functions which will have non-zero offsets */
+#ifdef CONFIG_HAVE_FENTRY
+# include <asm/ibt.h>
+/* Add offset for endbr64 if IBT enabled */
+# define FTRACE_MCOUNT_MAX_OFFSET ENDBR_INSN_SIZE
#endif
-#ifdef CONFIG_FUNCTION_TRACER
-#define MCOUNT_ADDR ((long)(mcount))
-#define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */
+#ifdef CONFIG_DYNAMIC_FTRACE
+#define ARCH_SUPPORTS_FTRACE_OPS 1
+#endif
-#ifndef __ASSEMBLY__
-extern void mcount(void);
+#ifndef __ASSEMBLER__
+extern void __fentry__(void);
static inline unsigned long ftrace_call_adjust(unsigned long addr)
{
/*
- * call mcount is "e8 <4 byte offset>"
- * The addr points to the 4 byte offset and the caller of this
- * function wants the pointer to e8. Simply subtract one.
+ * addr is the address of the mcount call instruction.
+ * recordmcount does the necessary offset calculation.
*/
- return addr - 1;
+ return addr;
+}
+
+static inline unsigned long arch_ftrace_get_symaddr(unsigned long fentry_ip)
+{
+	if (is_endbr((void *)(fentry_ip - ENDBR_INSN_SIZE)))
+ fentry_ip -= ENDBR_INSN_SIZE;
+
+ return fentry_ip;
+}
+#define ftrace_get_symaddr(fentry_ip) arch_ftrace_get_symaddr(fentry_ip)
+
+#ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
+
+#include <linux/ftrace_regs.h>
+
+static __always_inline struct pt_regs *
+arch_ftrace_get_regs(struct ftrace_regs *fregs)
+{
+	/* cs is only non-zero when FL_SAVE_REGS is set */
+ if (!arch_ftrace_regs(fregs)->regs.cs)
+ return NULL;
+ return &arch_ftrace_regs(fregs)->regs;
+}
+
+#define arch_ftrace_partial_regs(regs) do { \
+ regs->flags &= ~X86_EFLAGS_FIXED; \
+ regs->cs = __KERNEL_CS; \
+} while (0)
+
+#define arch_ftrace_fill_perf_regs(fregs, _regs) do { \
+ (_regs)->ip = arch_ftrace_regs(fregs)->regs.ip; \
+ (_regs)->sp = arch_ftrace_regs(fregs)->regs.sp; \
+ (_regs)->cs = __KERNEL_CS; \
+ (_regs)->flags = 0; \
+ } while (0)
+
+#define ftrace_regs_set_instruction_pointer(fregs, _ip) \
+ do { arch_ftrace_regs(fregs)->regs.ip = (_ip); } while (0)
+
+
+static __always_inline unsigned long
+ftrace_regs_get_return_address(struct ftrace_regs *fregs)
+{
+ return *(unsigned long *)ftrace_regs_get_stack_pointer(fregs);
}
+struct ftrace_ops;
+#define ftrace_graph_func ftrace_graph_func
+void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct ftrace_regs *fregs);
+#else
+#define FTRACE_GRAPH_TRAMP_ADDR FTRACE_GRAPH_ADDR
+#endif
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+/*
+ * When a ftrace registered caller is tracing a function that is
+ * also set by a register_ftrace_direct() call, it needs to be
+ * differentiated in the ftrace_caller trampoline. To do this, we
+ * place the direct caller in the ORIG_AX part of pt_regs. This
+ * tells the ftrace_caller that there's a direct caller.
+ */
+static inline void
+__arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned long addr)
+{
+ /* Emulate a call */
+ regs->orig_ax = addr;
+}
+#define arch_ftrace_set_direct_caller(fregs, addr) \
+ __arch_ftrace_set_direct_caller(&arch_ftrace_regs(fregs)->regs, addr)
+#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
+
#ifdef CONFIG_DYNAMIC_FTRACE
struct dyn_arch_ftrace {
@@ -59,7 +111,54 @@ struct dyn_arch_ftrace {
};
#endif /* CONFIG_DYNAMIC_FTRACE */
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* CONFIG_FUNCTION_TRACER */
+
+#ifndef __ASSEMBLER__
+
+void prepare_ftrace_return(unsigned long ip, unsigned long *parent,
+ unsigned long frame_pointer);
+
+#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE)
+extern void set_ftrace_ops_ro(void);
+#else
+static inline void set_ftrace_ops_ro(void) { }
+#endif
+
+#define ARCH_HAS_SYSCALL_MATCH_SYM_NAME
+static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
+{
+ /*
+ * Compare the symbol name with the system call name. Skip the
+ * "__x64_sys", "__ia32_sys", "__do_sys" or simple "sys" prefix.
+ */
+ return !strcmp(sym + 3, name + 3) ||
+ (!strncmp(sym, "__x64_", 6) && !strcmp(sym + 9, name + 3)) ||
+ (!strncmp(sym, "__ia32_", 7) && !strcmp(sym + 10, name + 3)) ||
+ (!strncmp(sym, "__do_sys", 8) && !strcmp(sym + 8, name + 3));
+}
+
+#ifndef COMPILE_OFFSETS
+
+#if defined(CONFIG_FTRACE_SYSCALLS) && defined(CONFIG_IA32_EMULATION)
+#include <linux/compat.h>
+
+/*
+ * Because ia32 syscalls do not map to x86_64 syscall numbers
+ * this screws up the trace output when tracing an ia32 task.
+ * Instead of reporting bogus syscalls, just do not trace them.
+ *
+ * If the user really wants these, then they should use the
+ * raw syscall tracepoints with filtering.
+ */
+#define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS 1
+static inline bool arch_trace_is_compat_syscall(struct pt_regs *regs)
+{
+ return in_32bit_syscall();
+}
+#endif /* CONFIG_FTRACE_SYSCALLS && CONFIG_IA32_EMULATION */
+#endif /* !COMPILE_OFFSETS */
+#endif /* !__ASSEMBLER__ */
+
#endif /* _ASM_X86_FTRACE_H */
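
To make the prefix handling in arch_syscall_match_sym_name() concrete, an illustrative sketch showing the pairs the helper is meant to accept:

    #include <linux/types.h>
    #include <asm/ftrace.h>

    static void example_match(void)
    {
            /* The helper skips the arch prefixes before comparing: */
            bool a = arch_syscall_match_sym_name("__x64_sys_read", "sys_read");  /* true */
            bool b = arch_syscall_match_sym_name("__ia32_sys_read", "sys_read"); /* true */
            bool c = arch_syscall_match_sym_name("sys_write", "sys_read");       /* false */

            (void)a; (void)b; (void)c;
    }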
diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h
index 1f11ce44e956..fe5d9a10d900 100644
--- a/arch/x86/include/asm/futex.h
+++ b/arch/x86/include/asm/futex.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_FUTEX_H
#define _ASM_X86_FUTEX_H
@@ -9,131 +10,88 @@
#include <asm/asm.h>
#include <asm/errno.h>
#include <asm/processor.h>
-#include <asm/system.h>
+#include <asm/smap.h>
-#define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg) \
+#define unsafe_atomic_op1(insn, oval, uaddr, oparg, label) \
+do { \
+ int oldval = 0, ret; \
asm volatile("1:\t" insn "\n" \
- "2:\t.section .fixup,\"ax\"\n" \
- "3:\tmov\t%3, %1\n" \
- "\tjmp\t2b\n" \
- "\t.previous\n" \
- _ASM_EXTABLE(1b, 3b) \
+ "2:\n" \
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %1) \
: "=r" (oldval), "=r" (ret), "+m" (*uaddr) \
- : "i" (-EFAULT), "0" (oparg), "1" (0))
+ : "0" (oparg), "1" (0)); \
+ if (ret) \
+ goto label; \
+ *oval = oldval; \
+} while(0)
-#define __futex_atomic_op2(insn, ret, oldval, uaddr, oparg) \
+
+#define unsafe_atomic_op2(insn, oval, uaddr, oparg, label) \
+do { \
+ int oldval = 0, ret, tem; \
asm volatile("1:\tmovl %2, %0\n" \
- "\tmovl\t%0, %3\n" \
+ "2:\tmovl\t%0, %3\n" \
"\t" insn "\n" \
- "2:\t" LOCK_PREFIX "cmpxchgl %3, %2\n" \
- "\tjnz\t1b\n" \
- "3:\t.section .fixup,\"ax\"\n" \
- "4:\tmov\t%5, %1\n" \
- "\tjmp\t3b\n" \
- "\t.previous\n" \
- _ASM_EXTABLE(1b, 4b) \
- _ASM_EXTABLE(2b, 4b) \
+ "3:\t" LOCK_PREFIX "cmpxchgl %3, %2\n" \
+ "\tjnz\t2b\n" \
+ "4:\n" \
+ _ASM_EXTABLE_TYPE_REG(1b, 4b, EX_TYPE_EFAULT_REG, %1) \
+ _ASM_EXTABLE_TYPE_REG(3b, 4b, EX_TYPE_EFAULT_REG, %1) \
: "=&a" (oldval), "=&r" (ret), \
"+m" (*uaddr), "=&r" (tem) \
- : "r" (oparg), "i" (-EFAULT), "1" (0))
-
-static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
+ : "r" (oparg), "1" (0)); \
+ if (ret) \
+ goto label; \
+ *oval = oldval; \
+} while(0)
+
+static __always_inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
+ u32 __user *uaddr)
{
- int op = (encoded_op >> 28) & 7;
- int cmp = (encoded_op >> 24) & 15;
- int oparg = (encoded_op << 8) >> 20;
- int cmparg = (encoded_op << 20) >> 20;
- int oldval = 0, ret, tem;
-
- if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
- oparg = 1 << oparg;
-
- if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
- return -EFAULT;
-
-#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP)
- /* Real i386 machines can only support FUTEX_OP_SET */
- if (op != FUTEX_OP_SET && boot_cpu_data.x86 == 3)
- return -ENOSYS;
-#endif
-
- pagefault_disable();
-
- switch (op) {
- case FUTEX_OP_SET:
- __futex_atomic_op1("xchgl %0, %2", ret, oldval, uaddr, oparg);
- break;
- case FUTEX_OP_ADD:
- __futex_atomic_op1(LOCK_PREFIX "xaddl %0, %2", ret, oldval,
- uaddr, oparg);
- break;
- case FUTEX_OP_OR:
- __futex_atomic_op2("orl %4, %3", ret, oldval, uaddr, oparg);
- break;
- case FUTEX_OP_ANDN:
- __futex_atomic_op2("andl %4, %3", ret, oldval, uaddr, ~oparg);
- break;
- case FUTEX_OP_XOR:
- __futex_atomic_op2("xorl %4, %3", ret, oldval, uaddr, oparg);
- break;
- default:
- ret = -ENOSYS;
- }
-
- pagefault_enable();
-
- if (!ret) {
- switch (cmp) {
- case FUTEX_OP_CMP_EQ:
- ret = (oldval == cmparg);
- break;
- case FUTEX_OP_CMP_NE:
- ret = (oldval != cmparg);
+ scoped_user_rw_access(uaddr, Efault) {
+ switch (op) {
+ case FUTEX_OP_SET:
+ unsafe_atomic_op1("xchgl %0, %2", oval, uaddr, oparg, Efault);
break;
- case FUTEX_OP_CMP_LT:
- ret = (oldval < cmparg);
+ case FUTEX_OP_ADD:
+ unsafe_atomic_op1(LOCK_PREFIX "xaddl %0, %2", oval, uaddr, oparg, Efault);
break;
- case FUTEX_OP_CMP_GE:
- ret = (oldval >= cmparg);
+ case FUTEX_OP_OR:
+ unsafe_atomic_op2("orl %4, %3", oval, uaddr, oparg, Efault);
break;
- case FUTEX_OP_CMP_LE:
- ret = (oldval <= cmparg);
+ case FUTEX_OP_ANDN:
+ unsafe_atomic_op2("andl %4, %3", oval, uaddr, ~oparg, Efault);
break;
- case FUTEX_OP_CMP_GT:
- ret = (oldval > cmparg);
+ case FUTEX_OP_XOR:
+ unsafe_atomic_op2("xorl %4, %3", oval, uaddr, oparg, Efault);
break;
default:
- ret = -ENOSYS;
+ return -ENOSYS;
}
}
- return ret;
+ return 0;
+Efault:
+ return -EFAULT;
}
-static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval,
- int newval)
+static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
+ u32 oldval, u32 newval)
{
-
-#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP)
- /* Real i386 machines have no cmpxchg instruction */
- if (boot_cpu_data.x86 == 3)
- return -ENOSYS;
-#endif
-
- if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
- return -EFAULT;
-
- asm volatile("1:\t" LOCK_PREFIX "cmpxchgl %3, %1\n"
- "2:\t.section .fixup, \"ax\"\n"
- "3:\tmov %2, %0\n"
- "\tjmp 2b\n"
- "\t.previous\n"
- _ASM_EXTABLE(1b, 3b)
- : "=a" (oldval), "+m" (*uaddr)
- : "i" (-EFAULT), "r" (newval), "0" (oldval)
- : "memory"
- );
-
- return oldval;
+ int ret = 0;
+
+ scoped_user_rw_access(uaddr, Efault) {
+ asm_inline volatile("\n"
+ "1:\t" LOCK_PREFIX "cmpxchgl %3, %2\n"
+ "2:\n"
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %0)
+ : "+r" (ret), "=a" (oldval), "+m" (*uaddr)
+ : "r" (newval), "1" (oldval)
+ : "memory");
+ *uval = oldval;
+ }
+ return ret;
+Efault:
+ return -EFAULT;
}
#endif
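
A hedged sketch of the calling convention for the reworked arch_futex_atomic_op_inuser(); in reality only the generic futex code calls this, with page faults disabled around it:

    #include <linux/futex.h>
    #include <asm/futex.h>

    static int example_futex_add_one(u32 __user *uaddr)
    {
            int oldval, ret;

            /* Atomically *uaddr += 1 in user memory; old value via &oldval. */
            ret = arch_futex_atomic_op_inuser(FUTEX_OP_ADD, 1, &oldval, uaddr);

            return ret ? ret : oldval;
    }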
diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h
index 6cfdafa409d8..5af8088a10df 100644
--- a/arch/x86/include/asm/gart.h
+++ b/arch/x86/include/asm/gart.h
@@ -1,7 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_GART_H
#define _ASM_X86_GART_H
-#include <asm/e820.h>
+#include <asm/e820/api.h>
extern void set_up_gart_resume(u32, u32);
@@ -17,6 +18,7 @@ extern int fix_aperture;
#define GARTEN (1<<0)
#define DISGARTCPU (1<<4)
#define DISGARTIO (1<<5)
+#define DISTLBWALKPRB (1<<6)
/* GART cache control register bits. */
#define INVGART (1<<0)
@@ -27,7 +29,6 @@ extern int fix_aperture;
#define AMD64_GARTAPERTUREBASE 0x94
#define AMD64_GARTTABLEBASE 0x98
#define AMD64_GARTCACHECTL 0x9c
-#define AMD64_GARTEN (1<<0)
#ifdef CONFIG_GART_IOMMU
extern int gart_iommu_aperture;
@@ -35,10 +36,9 @@ extern int gart_iommu_aperture_allowed;
extern int gart_iommu_aperture_disabled;
extern void early_gart_iommu_check(void);
-extern void gart_iommu_init(void);
-extern void gart_iommu_shutdown(void);
+extern int gart_iommu_init(void);
extern void __init gart_parse_options(char *);
-extern void gart_iommu_hole_init(void);
+void gart_iommu_hole_init(void);
#else
#define gart_iommu_aperture 0
@@ -48,12 +48,6 @@ extern void gart_iommu_hole_init(void);
static inline void early_gart_iommu_check(void)
{
}
-static inline void gart_iommu_init(void)
-{
-}
-static inline void gart_iommu_shutdown(void)
-{
-}
static inline void gart_parse_options(char *options)
{
}
@@ -64,21 +58,34 @@ static inline void gart_iommu_hole_init(void)
extern int agp_amd64_init(void);
+static inline void gart_set_size_and_enable(struct pci_dev *dev, u32 order)
+{
+ u32 ctl;
+
+ /*
+ * Don't enable translation but enable GART IO and CPU accesses.
+	 * Also, set DISTLBWALKPRB since GART table memory is UC.
+ */
+ ctl = order << 1;
+
+ pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
+}
+
static inline void enable_gart_translation(struct pci_dev *dev, u64 addr)
{
u32 tmp, ctl;
- /* address of the mappings table */
- addr >>= 12;
- tmp = (u32) addr<<4;
- tmp &= ~0xf;
- pci_write_config_dword(dev, AMD64_GARTTABLEBASE, tmp);
-
- /* Enable GART translation for this hammer. */
- pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
- ctl |= GARTEN;
- ctl &= ~(DISGARTCPU | DISGARTIO);
- pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
+ /* address of the mappings table */
+ addr >>= 12;
+ tmp = (u32) addr<<4;
+ tmp &= ~0xf;
+ pci_write_config_dword(dev, AMD64_GARTTABLEBASE, tmp);
+
+ /* Enable GART translation for this hammer. */
+ pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
+ ctl |= GARTEN | DISTLBWALKPRB;
+ ctl &= ~(DISGARTCPU | DISGARTIO);
+ pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
}
static inline int aperture_valid(u64 aper_base, u32 aper_size, u32 min_size)
@@ -90,7 +97,7 @@ static inline int aperture_valid(u64 aper_base, u32 aper_size, u32 min_size)
printk(KERN_INFO "Aperture beyond 4GB. Ignoring.\n");
return 0;
}
- if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) {
+ if (e820__mapped_any(aper_base, aper_base + aper_size, E820_TYPE_RAM)) {
printk(KERN_INFO "Aperture pointing to e820 RAM. Ignoring.\n");
return 0;
}
diff --git a/arch/x86/include/asm/geode.h b/arch/x86/include/asm/geode.h
index ad3c2ed75481..3c7267ef4a9e 100644
--- a/arch/x86/include/asm/geode.h
+++ b/arch/x86/include/asm/geode.h
@@ -1,10 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* AMD Geode definitions
* Copyright (C) 2006, Advanced Micro Devices, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public License
- * as published by the Free Software Foundation.
*/
#ifndef _ASM_X86_GEODE_H
@@ -12,160 +9,7 @@
#include <asm/processor.h>
#include <linux/io.h>
-
-/* Generic southbridge functions */
-
-#define GEODE_DEV_PMS 0
-#define GEODE_DEV_ACPI 1
-#define GEODE_DEV_GPIO 2
-#define GEODE_DEV_MFGPT 3
-
-extern int geode_get_dev_base(unsigned int dev);
-
-/* Useful macros */
-#define geode_pms_base() geode_get_dev_base(GEODE_DEV_PMS)
-#define geode_acpi_base() geode_get_dev_base(GEODE_DEV_ACPI)
-#define geode_gpio_base() geode_get_dev_base(GEODE_DEV_GPIO)
-#define geode_mfgpt_base() geode_get_dev_base(GEODE_DEV_MFGPT)
-
-/* MSRS */
-
-#define MSR_GLIU_P2D_RO0 0x10000029
-
-#define MSR_LX_GLD_MSR_CONFIG 0x48002001
-#define MSR_LX_MSR_PADSEL 0x48002011 /* NOT 0x48000011; the data
- * sheet has the wrong value */
-#define MSR_GLCP_SYS_RSTPLL 0x4C000014
-#define MSR_GLCP_DOTPLL 0x4C000015
-
-#define MSR_LBAR_SMB 0x5140000B
-#define MSR_LBAR_GPIO 0x5140000C
-#define MSR_LBAR_MFGPT 0x5140000D
-#define MSR_LBAR_ACPI 0x5140000E
-#define MSR_LBAR_PMS 0x5140000F
-
-#define MSR_DIVIL_SOFT_RESET 0x51400017
-
-#define MSR_PIC_YSEL_LOW 0x51400020
-#define MSR_PIC_YSEL_HIGH 0x51400021
-#define MSR_PIC_ZSEL_LOW 0x51400022
-#define MSR_PIC_ZSEL_HIGH 0x51400023
-#define MSR_PIC_IRQM_LPC 0x51400025
-
-#define MSR_MFGPT_IRQ 0x51400028
-#define MSR_MFGPT_NR 0x51400029
-#define MSR_MFGPT_SETUP 0x5140002B
-
-#define MSR_LX_SPARE_MSR 0x80000011 /* DC-specific */
-
-#define MSR_GX_GLD_MSR_CONFIG 0xC0002001
-#define MSR_GX_MSR_PADSEL 0xC0002011
-
-/* Resource Sizes */
-
-#define LBAR_GPIO_SIZE 0xFF
-#define LBAR_MFGPT_SIZE 0x40
-#define LBAR_ACPI_SIZE 0x40
-#define LBAR_PMS_SIZE 0x80
-
-/* ACPI registers (PMS block) */
-
-/*
- * PM1_EN is only valid when VSA is enabled for 16 bit reads.
- * When VSA is not enabled, *always* read both PM1_STS and PM1_EN
- * with a 32 bit read at offset 0x0
- */
-
-#define PM1_STS 0x00
-#define PM1_EN 0x02
-#define PM1_CNT 0x08
-#define PM2_CNT 0x0C
-#define PM_TMR 0x10
-#define PM_GPE0_STS 0x18
-#define PM_GPE0_EN 0x1C
-
-/* PMC registers (PMS block) */
-
-#define PM_SSD 0x00
-#define PM_SCXA 0x04
-#define PM_SCYA 0x08
-#define PM_OUT_SLPCTL 0x0C
-#define PM_SCLK 0x10
-#define PM_SED 0x1
-#define PM_SCXD 0x18
-#define PM_SCYD 0x1C
-#define PM_IN_SLPCTL 0x20
-#define PM_WKD 0x30
-#define PM_WKXD 0x34
-#define PM_RD 0x38
-#define PM_WKXA 0x3C
-#define PM_FSD 0x40
-#define PM_TSD 0x44
-#define PM_PSD 0x48
-#define PM_NWKD 0x4C
-#define PM_AWKD 0x50
-#define PM_SSC 0x54
-
-/* VSA2 magic values */
-
-#define VSA_VRC_INDEX 0xAC1C
-#define VSA_VRC_DATA 0xAC1E
-#define VSA_VR_UNLOCK 0xFC53 /* unlock virtual register */
-#define VSA_VR_SIGNATURE 0x0003
-#define VSA_VR_MEM_SIZE 0x0200
-#define AMD_VSA_SIG 0x4132 /* signature is ascii 'VSA2' */
-#define GSW_VSA_SIG 0x534d /* General Software signature */
-/* GPIO */
-
-#define GPIO_OUTPUT_VAL 0x00
-#define GPIO_OUTPUT_ENABLE 0x04
-#define GPIO_OUTPUT_OPEN_DRAIN 0x08
-#define GPIO_OUTPUT_INVERT 0x0C
-#define GPIO_OUTPUT_AUX1 0x10
-#define GPIO_OUTPUT_AUX2 0x14
-#define GPIO_PULL_UP 0x18
-#define GPIO_PULL_DOWN 0x1C
-#define GPIO_INPUT_ENABLE 0x20
-#define GPIO_INPUT_INVERT 0x24
-#define GPIO_INPUT_FILTER 0x28
-#define GPIO_INPUT_EVENT_COUNT 0x2C
-#define GPIO_READ_BACK 0x30
-#define GPIO_INPUT_AUX1 0x34
-#define GPIO_EVENTS_ENABLE 0x38
-#define GPIO_LOCK_ENABLE 0x3C
-#define GPIO_POSITIVE_EDGE_EN 0x40
-#define GPIO_NEGATIVE_EDGE_EN 0x44
-#define GPIO_POSITIVE_EDGE_STS 0x48
-#define GPIO_NEGATIVE_EDGE_STS 0x4C
-
-#define GPIO_MAP_X 0xE0
-#define GPIO_MAP_Y 0xE4
-#define GPIO_MAP_Z 0xE8
-#define GPIO_MAP_W 0xEC
-
-static inline u32 geode_gpio(unsigned int nr)
-{
- BUG_ON(nr > 28);
- return 1 << nr;
-}
-
-extern void geode_gpio_set(u32, unsigned int);
-extern void geode_gpio_clear(u32, unsigned int);
-extern int geode_gpio_isset(u32, unsigned int);
-extern void geode_gpio_setup_event(unsigned int, int, int);
-extern void geode_gpio_set_irq(unsigned int, unsigned int);
-
-static inline void geode_gpio_event_irq(unsigned int gpio, int pair)
-{
- geode_gpio_setup_event(gpio, pair, 0);
-}
-
-static inline void geode_gpio_event_pme(unsigned int gpio, int pair)
-{
- geode_gpio_setup_event(gpio, pair, 1);
-}
-
-/* Specific geode tests */
+#include <linux/cs5535.h>
static inline int is_geode_gx(void)
{
@@ -186,68 +30,4 @@ static inline int is_geode(void)
return (is_geode_gx() || is_geode_lx());
}
-#ifdef CONFIG_MGEODE_LX
-extern int geode_has_vsa2(void);
-#else
-static inline int geode_has_vsa2(void)
-{
- return 0;
-}
-#endif
-
-/* MFGPTs */
-
-#define MFGPT_MAX_TIMERS 8
-#define MFGPT_TIMER_ANY (-1)
-
-#define MFGPT_DOMAIN_WORKING 1
-#define MFGPT_DOMAIN_STANDBY 2
-#define MFGPT_DOMAIN_ANY (MFGPT_DOMAIN_WORKING | MFGPT_DOMAIN_STANDBY)
-
-#define MFGPT_CMP1 0
-#define MFGPT_CMP2 1
-
-#define MFGPT_EVENT_IRQ 0
-#define MFGPT_EVENT_NMI 1
-#define MFGPT_EVENT_RESET 3
-
-#define MFGPT_REG_CMP1 0
-#define MFGPT_REG_CMP2 2
-#define MFGPT_REG_COUNTER 4
-#define MFGPT_REG_SETUP 6
-
-#define MFGPT_SETUP_CNTEN (1 << 15)
-#define MFGPT_SETUP_CMP2 (1 << 14)
-#define MFGPT_SETUP_CMP1 (1 << 13)
-#define MFGPT_SETUP_SETUP (1 << 12)
-#define MFGPT_SETUP_STOPEN (1 << 11)
-#define MFGPT_SETUP_EXTEN (1 << 10)
-#define MFGPT_SETUP_REVEN (1 << 5)
-#define MFGPT_SETUP_CLKSEL (1 << 4)
-
-static inline void geode_mfgpt_write(int timer, u16 reg, u16 value)
-{
- u32 base = geode_get_dev_base(GEODE_DEV_MFGPT);
- outw(value, base + reg + (timer * 8));
-}
-
-static inline u16 geode_mfgpt_read(int timer, u16 reg)
-{
- u32 base = geode_get_dev_base(GEODE_DEV_MFGPT);
- return inw(base + reg + (timer * 8));
-}
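Each MFGPT occupies an 8-byte register window, which is why both accessors add reg + (timer * 8) to the LBAR base. A hedged sketch of claiming and arming a timer with the helpers declared here (the compare value and setup bits are example choices):

static void example_mfgpt_arm(void)
{
	int timer = geode_mfgpt_alloc_timer(MFGPT_TIMER_ANY, MFGPT_DOMAIN_WORKING);

	if (timer < 0)
		return;						/* no free timer */

	geode_mfgpt_write(timer, MFGPT_REG_CMP2, 0xffff);	/* comparator 2 value */
	geode_mfgpt_write(timer, MFGPT_REG_COUNTER, 0);		/* start counting from 0 */
	geode_mfgpt_write(timer, MFGPT_REG_SETUP,
			  MFGPT_SETUP_CNTEN | MFGPT_SETUP_CMP2);	/* enable counter + CMP2 */
}
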
-
-extern int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable);
-extern int geode_mfgpt_set_irq(int timer, int cmp, int *irq, int enable);
-extern int geode_mfgpt_alloc_timer(int timer, int domain);
-
-#define geode_mfgpt_setup_irq(t, c, i) geode_mfgpt_set_irq((t), (c), (i), 1)
-#define geode_mfgpt_release_irq(t, c, i) geode_mfgpt_set_irq((t), (c), (i), 0)
-
-#ifdef CONFIG_GEODE_MFGPT_TIMER
-extern int __init mfgpt_timer_setup(void);
-#else
-static inline int mfgpt_timer_setup(void) { return 0; }
-#endif
-
#endif /* _ASM_X86_GEODE_H */
diff --git a/arch/x86/include/asm/gpio.h b/arch/x86/include/asm/gpio.h
deleted file mode 100644
index 49dbfdfa50f9..000000000000
--- a/arch/x86/include/asm/gpio.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Generic GPIO API implementation for x86.
- *
- * Derived from the generic GPIO API for powerpc:
- *
- * Copyright (c) 2007-2008 MontaVista Software, Inc.
- *
- * Author: Anton Vorontsov <avorontsov@ru.mvista.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#ifndef _ASM_X86_GPIO_H
-#define _ASM_X86_GPIO_H
-
-#include <asm-generic/gpio.h>
-
-#ifdef CONFIG_GPIOLIB
-
-/*
- * Just call gpiolib.
- */
-static inline int gpio_get_value(unsigned int gpio)
-{
- return __gpio_get_value(gpio);
-}
-
-static inline void gpio_set_value(unsigned int gpio, int value)
-{
- __gpio_set_value(gpio, value);
-}
-
-static inline int gpio_cansleep(unsigned int gpio)
-{
- return __gpio_cansleep(gpio);
-}
-
-/*
- * Not implemented, yet.
- */
-static inline int gpio_to_irq(unsigned int gpio)
-{
- return -ENOSYS;
-}
-
-static inline int irq_to_gpio(unsigned int irq)
-{
- return -EINVAL;
-}
-
-#endif /* CONFIG_GPIOLIB */
-
-#endif /* _ASM_X86_GPIO_H */
diff --git a/arch/x86/include/asm/gsseg.h b/arch/x86/include/asm/gsseg.h
new file mode 100644
index 000000000000..ab6a595cea70
--- /dev/null
+++ b/arch/x86/include/asm/gsseg.h
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _ASM_X86_GSSEG_H
+#define _ASM_X86_GSSEG_H
+
+#include <linux/types.h>
+
+#include <asm/asm.h>
+#include <asm/cpufeature.h>
+#include <asm/alternative.h>
+#include <asm/processor.h>
+#include <asm/nops.h>
+
+#ifdef CONFIG_X86_64
+
+extern asmlinkage void asm_load_gs_index(u16 selector);
+
+/* Replace with "lkgs %di" once binutils support LKGS instruction */
+#define LKGS_DI _ASM_BYTES(0xf2,0x0f,0x00,0xf7)
+
+static inline void native_lkgs(unsigned int selector)
+{
+ u16 sel = selector;
+ asm_inline volatile("1: " LKGS_DI
+ _ASM_EXTABLE_TYPE_REG(1b, 1b, EX_TYPE_ZERO_REG, %k[sel])
+ : [sel] "+D" (sel));
+}
+
+static inline void native_load_gs_index(unsigned int selector)
+{
+ if (cpu_feature_enabled(X86_FEATURE_LKGS)) {
+ native_lkgs(selector);
+ } else {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ asm_load_gs_index(selector);
+ local_irq_restore(flags);
+ }
+}
+
+#endif /* CONFIG_X86_64 */
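native_load_gs_index() hides the choice between the LKGS fast path and the IRQ-protected asm_load_gs_index() fallback; the extable entry attached to the LKGS bytes zeroes the selector if the load faults. Callers simply go through the generic wrapper, as in this minimal sketch (the selector value is whatever the caller has validated):

static void example_set_gs(u16 sel)
{
	/* Resolves to native_load_gs_index() or the paravirt hook. */
	load_gs_index(sel);
}
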
+
+static inline void __init lkgs_init(void)
+{
+#ifdef CONFIG_PARAVIRT_XXL
+#ifdef CONFIG_X86_64
+ if (cpu_feature_enabled(X86_FEATURE_LKGS))
+ pv_ops.cpu.load_gs_index = native_lkgs;
+#endif
+#endif
+}
+
+#ifndef CONFIG_PARAVIRT_XXL
+
+static inline void load_gs_index(unsigned int selector)
+{
+#ifdef CONFIG_X86_64
+ native_load_gs_index(selector);
+#else
+ loadsegment(gs, selector);
+#endif
+}
+
+#endif /* CONFIG_PARAVIRT_XXL */
+
+#endif /* _ASM_X86_GSSEG_H */
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 82e3e8f01043..6b6d472baa0b 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -1,48 +1,61 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_HARDIRQ_H
#define _ASM_X86_HARDIRQ_H
#include <linux/threads.h>
-#include <linux/irq.h>
typedef struct {
- unsigned int __softirq_pending;
+#if IS_ENABLED(CONFIG_CPU_MITIGATIONS) && IS_ENABLED(CONFIG_KVM_INTEL)
+ u8 kvm_cpu_l1tf_flush_l1d;
+#endif
unsigned int __nmi_count; /* arch dependent */
- unsigned int irq0_irqs;
#ifdef CONFIG_X86_LOCAL_APIC
unsigned int apic_timer_irqs; /* arch dependent */
unsigned int irq_spurious_count;
+ unsigned int icr_read_retry_count;
+#endif
+#if IS_ENABLED(CONFIG_KVM)
+ unsigned int kvm_posted_intr_ipis;
+ unsigned int kvm_posted_intr_wakeup_ipis;
+ unsigned int kvm_posted_intr_nested_ipis;
#endif
- unsigned int generic_irqs; /* arch dependent */
+ unsigned int x86_platform_ipis; /* arch dependent */
unsigned int apic_perf_irqs;
- unsigned int apic_pending_irqs;
+ unsigned int apic_irq_work_irqs;
#ifdef CONFIG_SMP
unsigned int irq_resched_count;
unsigned int irq_call_count;
- unsigned int irq_tlb_count;
#endif
-#ifdef CONFIG_X86_MCE
+ unsigned int irq_tlb_count;
+#ifdef CONFIG_X86_THERMAL_VECTOR
unsigned int irq_thermal_count;
-# ifdef CONFIG_X86_MCE_THRESHOLD
+#endif
+#ifdef CONFIG_X86_MCE_THRESHOLD
unsigned int irq_threshold_count;
-# endif
+#endif
+#ifdef CONFIG_X86_MCE_AMD
+ unsigned int irq_deferred_error_count;
+#endif
+#ifdef CONFIG_X86_HV_CALLBACK_VECTOR
+ unsigned int irq_hv_callback_count;
+#endif
+#if IS_ENABLED(CONFIG_HYPERV)
+ unsigned int irq_hv_reenlightenment_count;
+ unsigned int hyperv_stimer0_count;
+#endif
+#ifdef CONFIG_X86_POSTED_MSI
+ unsigned int posted_msi_notification_count;
#endif
} ____cacheline_aligned irq_cpustat_t;
DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
-/* We can have at most NR_VECTORS irqs routed to a cpu at a time */
-#define MAX_HARDIRQS_PER_CPU NR_VECTORS
-
+#ifdef CONFIG_X86_POSTED_MSI
+DECLARE_PER_CPU_ALIGNED(struct pi_desc, posted_msi_pi_desc);
+#endif
#define __ARCH_IRQ_STAT
-#define inc_irq_stat(member) percpu_add(irq_stat.member, 1)
-
-#define local_softirq_pending() percpu_read(irq_stat.__softirq_pending)
-
-#define __ARCH_SET_SOFTIRQ_PENDING
-
-#define set_softirq_pending(x) percpu_write(irq_stat.__softirq_pending, (x))
-#define or_softirq_pending(x) percpu_or(irq_stat.__softirq_pending, (x))
+#define inc_irq_stat(member) this_cpu_inc(irq_stat.member)
extern void ack_bad_irq(unsigned int irq);
@@ -52,4 +65,30 @@ extern u64 arch_irq_stat_cpu(unsigned int cpu);
extern u64 arch_irq_stat(void);
#define arch_irq_stat arch_irq_stat
+DECLARE_PER_CPU_CACHE_HOT(u16, __softirq_pending);
+#define local_softirq_pending_ref __softirq_pending
+
+#if IS_ENABLED(CONFIG_CPU_MITIGATIONS) && IS_ENABLED(CONFIG_KVM_INTEL)
+/*
+ * This function is called from noinstr interrupt contexts
+ * and must be inlined to not get instrumentation.
+ */
+static __always_inline void kvm_set_cpu_l1tf_flush_l1d(void)
+{
+ __this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 1);
+}
+
+static __always_inline void kvm_clear_cpu_l1tf_flush_l1d(void)
+{
+ __this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 0);
+}
+
+static __always_inline bool kvm_get_cpu_l1tf_flush_l1d(void)
+{
+ return __this_cpu_read(irq_stat.kvm_cpu_l1tf_flush_l1d);
+}
+#else /* !IS_ENABLED(CONFIG_KVM_INTEL) */
+static __always_inline void kvm_set_cpu_l1tf_flush_l1d(void) { }
+#endif /* IS_ENABLED(CONFIG_KVM_INTEL) */
+
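The kvm_cpu_l1tf_flush_l1d byte added to irq_cpustat_t is a one-way hint from interrupt entry to KVM: entry code sets it, and KVM samples and clears it before VM entry to decide whether an L1D flush is due. A hedged sketch of the consumer side (the function name is illustrative, not the actual KVM code):

static void example_vcpu_pre_run(void)
{
	if (kvm_get_cpu_l1tf_flush_l1d()) {
		kvm_clear_cpu_l1tf_flush_l1d();
		/* ...perform the L1D flush before entering the guest... */
	}
}
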
#endif /* _ASM_X86_HARDIRQ_H */
diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h
index 014c2b85ae45..585bdadba47d 100644
--- a/arch/x86/include/asm/highmem.h
+++ b/arch/x86/include/asm/highmem.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* highmem.h: virtual kernel memory mappings for high memory
*
@@ -22,10 +23,10 @@
#include <linux/interrupt.h>
#include <linux/threads.h>
-#include <asm/kmap_types.h>
#include <asm/tlbflush.h>
#include <asm/paravirt.h>
#include <asm/fixmap.h>
+#include <asm/pgtable_areas.h>
/* declarations for highmem.c */
extern unsigned long highstart_pfn, highend_pfn;
@@ -38,42 +39,35 @@ extern unsigned long highstart_pfn, highend_pfn;
/*
* Ordering is:
*
- * FIXADDR_TOP
- * fixed_addresses
- * FIXADDR_START
- * temp fixed addresses
- * FIXADDR_BOOT_START
- * Persistent kmap area
- * PKMAP_BASE
- * VMALLOC_END
- * Vmalloc area
- * VMALLOC_START
- * high_memory
+ * high memory on:                            high_memory off:
+ * FIXADDR_TOP                                FIXADDR_TOP
+ * fixed addresses                            fixed addresses
+ * FIXADDR_START                              FIXADDR_START
+ * temp fixed addresses/persistent kmap area  VMALLOC_END
+ * PKMAP_BASE                                 temp fixed addresses/vmalloc area
+ * VMALLOC_END                                VMALLOC_START
+ * vmalloc area                               high_memory
+ * VMALLOC_START
+ * high_memory
+ *
+ * The temp fixed area is only used during boot for early_ioremap(), and
+ * it is unused when the ioremap() is functional. vmalloc/pkmap area become
+ * available after early boot so the temp fixed area is available for re-use.
*/
#define LAST_PKMAP_MASK (LAST_PKMAP-1)
#define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT)
#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT))
-extern void *kmap_high(struct page *page);
-extern void kunmap_high(struct page *page);
-
-void *kmap(struct page *page);
-void kunmap(struct page *page);
-void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot);
-void *kmap_atomic(struct page *page, enum km_type type);
-void kunmap_atomic(void *kvaddr, enum km_type type);
-void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
-void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
-struct page *kmap_atomic_to_page(void *ptr);
-
-#ifndef CONFIG_PARAVIRT
-#define kmap_atomic_pte(page, type) kmap_atomic(page, type)
-#endif
-
#define flush_cache_kmaps() do { } while (0)
-extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn,
- unsigned long end_pfn);
+#define arch_kmap_local_post_map(vaddr, pteval) \
+ arch_flush_lazy_mmu_mode()
+
+#define arch_kmap_local_post_unmap(vaddr) \
+ do { \
+ flush_tlb_one_kernel((vaddr)); \
+ arch_flush_lazy_mmu_mode(); \
+ } while (0)
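With the generic kmap_local code, the architecture only provides these two hooks: flush lazy MMU batching after a map, and additionally shoot down the single kernel TLB entry after an unmap. A hedged sketch of a highmem page passing through that machinery via the generic API (not part of this header):

#include <linux/highmem.h>
#include <linux/string.h>

static void example_zero_highmem_page(struct page *page)
{
	void *vaddr = kmap_local_page(page);	/* highmem pages go through arch_kmap_local_post_map() */

	memset(vaddr, 0, PAGE_SIZE);
	kunmap_local(vaddr);			/* unmap path ends in arch_kmap_local_post_unmap() */
}
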
#endif /* __KERNEL__ */
diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
index 1c22cb05ad6a..ab0c78855ecb 100644
--- a/arch/x86/include/asm/hpet.h
+++ b/arch/x86/include/asm/hpet.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_HPET_H
#define _ASM_X86_HPET_H
@@ -35,8 +36,6 @@
#define HPET_ID_NUMBER_SHIFT 8
#define HPET_ID_VENDOR_SHIFT 16
-#define HPET_ID_VENDOR_8086 0x8086
-
#define HPET_CFG_ENABLE 0x001
#define HPET_CFG_LEGACY 0x002
#define HPET_LEGACY_8254 2
@@ -65,27 +64,16 @@
/* hpet memory map physical address */
extern unsigned long hpet_address;
extern unsigned long force_hpet_address;
-extern int hpet_force_user;
+extern bool boot_hpet_disable;
+extern u8 hpet_blockid;
+extern bool hpet_force_user;
+extern bool hpet_msi_disable;
extern int is_hpet_enabled(void);
extern int hpet_enable(void);
extern void hpet_disable(void);
-extern unsigned long hpet_readl(unsigned long a);
+extern unsigned int hpet_readl(unsigned int a);
extern void force_hpet_resume(void);
-extern void hpet_msi_unmask(unsigned int irq);
-extern void hpet_msi_mask(unsigned int irq);
-extern void hpet_msi_write(unsigned int irq, struct msi_msg *msg);
-extern void hpet_msi_read(unsigned int irq, struct msi_msg *msg);
-
-#ifdef CONFIG_PCI_MSI
-extern int arch_setup_hpet_msi(unsigned int irq);
-#else
-static inline int arch_setup_hpet_msi(unsigned int irq)
-{
- return -EINVAL;
-}
-#endif
-
#ifdef CONFIG_HPET_EMULATE_RTC
#include <linux/interrupt.h>
@@ -96,7 +84,6 @@ extern int hpet_set_rtc_irq_bit(unsigned long bit_mask);
extern int hpet_set_alarm_time(unsigned char hrs, unsigned char min,
unsigned char sec);
extern int hpet_set_periodic_freq(unsigned long freq);
-extern int hpet_rtc_dropped_irq(void);
extern int hpet_rtc_timer_init(void);
extern irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id);
extern int hpet_register_irq_handler(rtc_irq_handler handler);
@@ -109,6 +96,7 @@ extern void hpet_unregister_irq_handler(rtc_irq_handler handler);
static inline int hpet_enable(void) { return 0; }
static inline int is_hpet_enabled(void) { return 0; }
#define hpet_readl(a) 0
+#define default_setup_hpet_msi NULL
#endif
#endif /* _ASM_X86_HPET_H */
diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
index 439a9acc132d..1721b1aadeb1 100644
--- a/arch/x86/include/asm/hugetlb.h
+++ b/arch/x86/include/asm/hugetlb.h
@@ -1,93 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_HUGETLB_H
#define _ASM_X86_HUGETLB_H
#include <asm/page.h>
+#include <asm-generic/hugetlb.h>
-
-static inline int is_hugepage_only_range(struct mm_struct *mm,
- unsigned long addr,
- unsigned long len) {
- return 0;
-}
-
-/*
- * If the arch doesn't supply something else, assume that hugepage
- * size aligned regions are ok without further preparation.
- */
-static inline int prepare_hugepage_range(struct file *file,
- unsigned long addr, unsigned long len)
-{
- struct hstate *h = hstate_file(file);
- if (len & ~huge_page_mask(h))
- return -EINVAL;
- if (addr & ~huge_page_mask(h))
- return -EINVAL;
- return 0;
-}
-
-static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) {
-}
-
-static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
- unsigned long addr, unsigned long end,
- unsigned long floor,
- unsigned long ceiling)
-{
- free_pgd_range(tlb, addr, end, floor, ceiling);
-}
-
-static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pte)
-{
- set_pte_at(mm, addr, ptep, pte);
-}
-
-static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep)
-{
- return ptep_get_and_clear(mm, addr, ptep);
-}
-
-static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
-{
-}
-
-static inline int huge_pte_none(pte_t pte)
-{
- return pte_none(pte);
-}
-
-static inline pte_t huge_pte_wrprotect(pte_t pte)
-{
- return pte_wrprotect(pte);
-}
-
-static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep)
-{
- ptep_set_wrprotect(mm, addr, ptep);
-}
-
-static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep,
- pte_t pte, int dirty)
-{
- return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
-}
-
-static inline pte_t huge_ptep_get(pte_t *ptep)
-{
- return *ptep;
-}
-
-static inline int arch_prepare_hugepage(struct page *page)
-{
- return 0;
-}
-
-static inline void arch_release_hugepage(struct page *page)
-{
-}
+#define hugepages_supported() boot_cpu_has(X86_FEATURE_PSE)
#endif /* _ASM_X86_HUGETLB_H */
diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h
new file mode 100644
index 000000000000..0bc931cd0698
--- /dev/null
+++ b/arch/x86/include/asm/hw_breakpoint.h
@@ -0,0 +1,77 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _I386_HW_BREAKPOINT_H
+#define _I386_HW_BREAKPOINT_H
+
+#include <uapi/asm/hw_breakpoint.h>
+
+#define __ARCH_HW_BREAKPOINT_H
+
+/*
+ * The name should probably be something dealt in
+ * a higher level. While dealing with the user
+ * (display/resolving)
+ */
+struct arch_hw_breakpoint {
+ unsigned long address;
+ unsigned long mask;
+ u8 len;
+ u8 type;
+};
+
+#include <linux/kdebug.h>
+#include <linux/percpu.h>
+#include <linux/list.h>
+
+/* Available HW breakpoint length encodings */
+#define X86_BREAKPOINT_LEN_X 0x40
+#define X86_BREAKPOINT_LEN_1 0x40
+#define X86_BREAKPOINT_LEN_2 0x44
+#define X86_BREAKPOINT_LEN_4 0x4c
+
+#ifdef CONFIG_X86_64
+#define X86_BREAKPOINT_LEN_8 0x48
+#endif
+
+/* Available HW breakpoint type encodings */
+
+/* trigger on instruction execute */
+#define X86_BREAKPOINT_EXECUTE 0x80
+/* trigger on memory write */
+#define X86_BREAKPOINT_WRITE 0x81
+/* trigger on memory read or write */
+#define X86_BREAKPOINT_RW 0x83
+
+/* Total number of available HW breakpoint registers */
+#define HBP_NUM 4
+
+#define hw_breakpoint_slots(type) (HBP_NUM)
+
+struct perf_event_attr;
+struct perf_event;
+struct pmu;
+
+extern int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw);
+extern int hw_breakpoint_arch_parse(struct perf_event *bp,
+ const struct perf_event_attr *attr,
+ struct arch_hw_breakpoint *hw);
+extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused,
+ unsigned long val, void *data);
+
+
+int arch_install_hw_breakpoint(struct perf_event *bp);
+void arch_uninstall_hw_breakpoint(struct perf_event *bp);
+void hw_breakpoint_pmu_read(struct perf_event *bp);
+void hw_breakpoint_pmu_unthrottle(struct perf_event *bp);
+
+extern void
+arch_fill_perf_breakpoint(struct perf_event *bp);
+
+unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type);
+int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type);
+
+extern int arch_bp_generic_fields(int x86_len, int x86_type,
+ int *gen_len, int *gen_type);
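encode_dr7()/decode_dr7() translate between the X86_BREAKPOINT_* length/type encodings stored in struct arch_hw_breakpoint and the bit layout of the DR7 debug control register, while arch_bp_generic_fields() maps the x86 encodings onto perf's generic ones. A hedged round-trip sketch for debug register slot 0:

static void example_dr7_roundtrip(void)
{
	unsigned int len, type;
	int gen_len, gen_type;
	unsigned long dr7;

	dr7 = encode_dr7(0, X86_BREAKPOINT_LEN_4, X86_BREAKPOINT_WRITE);
	decode_dr7(dr7, 0, &len, &type);	/* recovers the X86_BREAKPOINT_* values */
	arch_bp_generic_fields(len, type, &gen_len, &gen_type);
}
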
+
+extern struct pmu perf_ops_bp;
+
+#endif /* _I386_HW_BREAKPOINT_H */
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index ba180d93b08c..cbe19e669080 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_HW_IRQ_H
#define _ASM_X86_HW_IRQ_H
@@ -15,123 +16,120 @@
#include <asm/irq_vectors.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/percpu.h>
#include <linux/profile.h>
#include <linux/smp.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <asm/irq.h>
#include <asm/sections.h>
-/* Interrupt handlers registered during init_IRQ */
-extern void apic_timer_interrupt(void);
-extern void generic_interrupt(void);
-extern void error_interrupt(void);
-extern void perf_pending_interrupt(void);
-
-extern void spurious_interrupt(void);
-extern void thermal_interrupt(void);
-extern void reschedule_interrupt(void);
-extern void mce_self_interrupt(void);
-
-extern void invalidate_interrupt(void);
-extern void invalidate_interrupt0(void);
-extern void invalidate_interrupt1(void);
-extern void invalidate_interrupt2(void);
-extern void invalidate_interrupt3(void);
-extern void invalidate_interrupt4(void);
-extern void invalidate_interrupt5(void);
-extern void invalidate_interrupt6(void);
-extern void invalidate_interrupt7(void);
-
-extern void irq_move_cleanup_interrupt(void);
-extern void reboot_interrupt(void);
-extern void threshold_interrupt(void);
-
-extern void call_function_interrupt(void);
-extern void call_function_single_interrupt(void);
-
-/* PIC specific functions */
-extern void disable_8259A_irq(unsigned int irq);
-extern void enable_8259A_irq(unsigned int irq);
-extern int i8259A_irq_pending(unsigned int irq);
-extern void make_8259A_irq(unsigned int irq);
-extern void init_8259A(int aeoi);
-
-/* IOAPIC */
-#define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1<<(x)) & io_apic_irqs))
-extern unsigned long io_apic_irqs;
-
-extern void init_VISWS_APIC_irqs(void);
-extern void setup_IO_APIC(void);
-extern void disable_IO_APIC(void);
-
-struct io_apic_irq_attr {
- int ioapic;
- int ioapic_pin;
- int trigger;
- int polarity;
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+struct irq_data;
+struct pci_dev;
+struct msi_desc;
+
+enum irq_alloc_type {
+ X86_IRQ_ALLOC_TYPE_IOAPIC = 1,
+ X86_IRQ_ALLOC_TYPE_HPET,
+ X86_IRQ_ALLOC_TYPE_PCI_MSI,
+ X86_IRQ_ALLOC_TYPE_PCI_MSIX,
+ X86_IRQ_ALLOC_TYPE_DMAR,
+ X86_IRQ_ALLOC_TYPE_AMDVI,
+ X86_IRQ_ALLOC_TYPE_UV,
};
-static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr,
- int ioapic, int ioapic_pin,
- int trigger, int polarity)
-{
- irq_attr->ioapic = ioapic;
- irq_attr->ioapic_pin = ioapic_pin;
- irq_attr->trigger = trigger;
- irq_attr->polarity = polarity;
-}
+struct ioapic_alloc_info {
+ int pin;
+ int node;
+ u32 is_level : 1;
+ u32 active_low : 1;
+ u32 valid : 1;
+};
-extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin,
- struct io_apic_irq_attr *irq_attr);
-extern void setup_ioapic_dest(void);
+struct uv_alloc_info {
+ int limit;
+ int blade;
+ unsigned long offset;
+ char *name;
-extern void enable_IO_APIC(void);
+};
-/* Statistics */
-extern atomic_t irq_err_count;
-extern atomic_t irq_mis_count;
+/**
+ * irq_alloc_info - X86 specific interrupt allocation info
+ * @type: X86 specific allocation type
+ * @flags: Flags for allocation tweaks
+ * @devid: Device ID for allocations
+ * @hwirq: Associated hw interrupt number in the domain
+ * @mask: CPU mask for vector allocation
+ * @desc: Pointer to msi descriptor
+ * @data: Allocation specific data
+ *
+ * @ioapic: IOAPIC specific allocation data
+ * @uv: UV specific allocation data
+*/
+struct irq_alloc_info {
+ enum irq_alloc_type type;
+ u32 flags;
+ u32 devid;
+ irq_hw_number_t hwirq;
+ const struct cpumask *mask;
+ struct msi_desc *desc;
+ void *data;
+
+ union {
+ struct ioapic_alloc_info ioapic;
+ struct uv_alloc_info uv;
+ };
+};
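struct irq_alloc_info is the cookie handed down the hierarchical irqdomain stack; only the fields that match the chosen type are meaningful. A hedged sketch of describing a level-triggered, active-low IOAPIC pin (the id/pin numbers are example values; real callers normally start from init_irq_alloc_info()):

static void example_fill_ioapic_info(struct irq_alloc_info *info)
{
	memset(info, 0, sizeof(*info));
	info->type		= X86_IRQ_ALLOC_TYPE_IOAPIC;
	info->devid		= 0;		/* IOAPIC id (example) */
	info->ioapic.pin	= 9;		/* pin on that IOAPIC (example) */
	info->ioapic.node	= NUMA_NO_NODE;
	info->ioapic.is_level	= 1;		/* level triggered */
	info->ioapic.active_low	= 1;		/* active low */
	info->ioapic.valid	= 1;
}
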
-/* EISA */
-extern void eisa_set_level_irq(unsigned int irq);
+struct irq_cfg {
+ unsigned int dest_apicid;
+ unsigned int vector;
+};
-/* SMP */
-extern void smp_apic_timer_interrupt(struct pt_regs *);
-extern void smp_spurious_interrupt(struct pt_regs *);
-extern void smp_generic_interrupt(struct pt_regs *);
-extern void smp_error_interrupt(struct pt_regs *);
-#ifdef CONFIG_X86_IO_APIC
-extern asmlinkage void smp_irq_move_cleanup_interrupt(void);
-#endif
+extern struct irq_cfg *irq_cfg(unsigned int irq);
+extern struct irq_cfg *irqd_cfg(struct irq_data *irq_data);
#ifdef CONFIG_SMP
-extern void smp_reschedule_interrupt(struct pt_regs *);
-extern void smp_call_function_interrupt(struct pt_regs *);
-extern void smp_call_function_single_interrupt(struct pt_regs *);
-#ifdef CONFIG_X86_32
-extern void smp_invalidate_interrupt(struct pt_regs *);
+extern void vector_schedule_cleanup(struct irq_cfg *);
+extern void irq_complete_move(struct irq_cfg *cfg);
#else
-extern asmlinkage void smp_invalidate_interrupt(struct pt_regs *);
+static inline void vector_schedule_cleanup(struct irq_cfg *c) { }
+static inline void irq_complete_move(struct irq_cfg *c) { }
#endif
-#endif
-
-extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void);
-
-typedef int vector_irq_t[NR_VECTORS];
-DECLARE_PER_CPU(vector_irq_t, vector_irq);
+extern void apic_ack_edge(struct irq_data *data);
+#endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */
-#ifdef CONFIG_X86_IO_APIC
+#ifdef CONFIG_X86_LOCAL_APIC
extern void lock_vector_lock(void);
extern void unlock_vector_lock(void);
-extern void __setup_vector_irq(int cpu);
#else
static inline void lock_vector_lock(void) {}
static inline void unlock_vector_lock(void) {}
-static inline void __setup_vector_irq(int cpu) {}
#endif
-#endif /* !ASSEMBLY_ */
+/* Statistics */
+extern atomic_t irq_err_count;
+extern atomic_t irq_mis_count;
+
+extern void elcr_set_level_irq(unsigned int irq);
+
+extern char irq_entries_start[];
+#ifdef CONFIG_TRACING
+#define trace_irq_entries_start irq_entries_start
+#endif
+
+extern char spurious_entries_start[];
+
+#define VECTOR_UNUSED NULL
+#define VECTOR_SHUTDOWN ((void *)-1L)
+#define VECTOR_RETRIGGERED ((void *)-2L)
+
+typedef struct irq_desc* vector_irq_t[NR_VECTORS];
+DECLARE_PER_CPU(vector_irq_t, vector_irq);
+
+#endif /* !__ASSEMBLER__ */
#endif /* _ASM_X86_HW_IRQ_H */
diff --git a/arch/x86/include/asm/hypertransport.h b/arch/x86/include/asm/hypertransport.h
deleted file mode 100644
index 334b1a885d9c..000000000000
--- a/arch/x86/include/asm/hypertransport.h
+++ /dev/null
@@ -1,45 +0,0 @@
-#ifndef _ASM_X86_HYPERTRANSPORT_H
-#define _ASM_X86_HYPERTRANSPORT_H
-
-/*
- * Constants for x86 Hypertransport Interrupts.
- */
-
-#define HT_IRQ_LOW_BASE 0xf8000000
-
-#define HT_IRQ_LOW_VECTOR_SHIFT 16
-#define HT_IRQ_LOW_VECTOR_MASK 0x00ff0000
-#define HT_IRQ_LOW_VECTOR(v) \
- (((v) << HT_IRQ_LOW_VECTOR_SHIFT) & HT_IRQ_LOW_VECTOR_MASK)
-
-#define HT_IRQ_LOW_DEST_ID_SHIFT 8
-#define HT_IRQ_LOW_DEST_ID_MASK 0x0000ff00
-#define HT_IRQ_LOW_DEST_ID(v) \
- (((v) << HT_IRQ_LOW_DEST_ID_SHIFT) & HT_IRQ_LOW_DEST_ID_MASK)
-
-#define HT_IRQ_LOW_DM_PHYSICAL 0x0000000
-#define HT_IRQ_LOW_DM_LOGICAL 0x0000040
-
-#define HT_IRQ_LOW_RQEOI_EDGE 0x0000000
-#define HT_IRQ_LOW_RQEOI_LEVEL 0x0000020
-
-
-#define HT_IRQ_LOW_MT_FIXED 0x0000000
-#define HT_IRQ_LOW_MT_ARBITRATED 0x0000004
-#define HT_IRQ_LOW_MT_SMI 0x0000008
-#define HT_IRQ_LOW_MT_NMI 0x000000c
-#define HT_IRQ_LOW_MT_INIT 0x0000010
-#define HT_IRQ_LOW_MT_STARTUP 0x0000014
-#define HT_IRQ_LOW_MT_EXTINT 0x0000018
-#define HT_IRQ_LOW_MT_LINT1 0x000008c
-#define HT_IRQ_LOW_MT_LINT0 0x0000098
-
-#define HT_IRQ_LOW_IRQ_MASKED 0x0000001
-
-
-#define HT_IRQ_HIGH_DEST_ID_SHIFT 0
-#define HT_IRQ_HIGH_DEST_ID_MASK 0x00ffffff
-#define HT_IRQ_HIGH_DEST_ID(v) \
- ((((v) >> 8) << HT_IRQ_HIGH_DEST_ID_SHIFT) & HT_IRQ_HIGH_DEST_ID_MASK)
-
-#endif /* _ASM_X86_HYPERTRANSPORT_H */
diff --git a/arch/x86/include/asm/hyperv_timer.h b/arch/x86/include/asm/hyperv_timer.h
new file mode 100644
index 000000000000..388fa81b8f38
--- /dev/null
+++ b/arch/x86/include/asm/hyperv_timer.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_HYPERV_TIMER_H
+#define _ASM_X86_HYPERV_TIMER_H
+
+#include <asm/msr.h>
+
+#define hv_get_raw_timer() rdtsc_ordered()
+
+#endif
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index 369f5c5d09a1..9ad86a7d13f6 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -17,10 +17,69 @@
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
*/
-#ifndef ASM_X86__HYPERVISOR_H
-#define ASM_X86__HYPERVISOR_H
+#ifndef _ASM_X86_HYPERVISOR_H
+#define _ASM_X86_HYPERVISOR_H
-extern unsigned long get_hypervisor_tsc_freq(void);
-extern void init_hypervisor(struct cpuinfo_x86 *c);
+/* x86 hypervisor types */
+enum x86_hypervisor_type {
+ X86_HYPER_NATIVE = 0,
+ X86_HYPER_VMWARE,
+ X86_HYPER_MS_HYPERV,
+ X86_HYPER_XEN_PV,
+ X86_HYPER_XEN_HVM,
+ X86_HYPER_KVM,
+ X86_HYPER_JAILHOUSE,
+ X86_HYPER_ACRN,
+ X86_HYPER_BHYVE,
+};
-#endif
+#ifdef CONFIG_HYPERVISOR_GUEST
+
+#include <asm/kvm_para.h>
+#include <asm/x86_init.h>
+#include <asm/xen/hypervisor.h>
+
+struct hypervisor_x86 {
+ /* Hypervisor name */
+ const char *name;
+
+ /* Detection routine */
+ uint32_t (*detect)(void);
+
+ /* Hypervisor type */
+ enum x86_hypervisor_type type;
+
+ /* init time callbacks */
+ struct x86_hyper_init init;
+
+ /* runtime callbacks */
+ struct x86_hyper_runtime runtime;
+
+ /* ignore nopv parameter */
+ bool ignore_nopv;
+};
+
+extern const struct hypervisor_x86 x86_hyper_vmware;
+extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
+extern const struct hypervisor_x86 x86_hyper_xen_pv;
+extern const struct hypervisor_x86 x86_hyper_kvm;
+extern const struct hypervisor_x86 x86_hyper_jailhouse;
+extern const struct hypervisor_x86 x86_hyper_acrn;
+extern const struct hypervisor_x86 x86_hyper_bhyve;
+extern struct hypervisor_x86 x86_hyper_xen_hvm;
+
+extern bool nopv;
+extern enum x86_hypervisor_type x86_hyper_type;
+extern void init_hypervisor_platform(void);
+static inline bool hypervisor_is_type(enum x86_hypervisor_type type)
+{
+ return x86_hyper_type == type;
+}
+#else
+static inline void init_hypervisor_platform(void) { }
+static inline bool hypervisor_is_type(enum x86_hypervisor_type type)
+{
+ return type == X86_HYPER_NATIVE;
+}
+#endif /* CONFIG_HYPERVISOR_GUEST */
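hypervisor_is_type() compares against the detected x86_hyper_type and, without CONFIG_HYPERVISOR_GUEST, collapses to a compile-time "native only" check. A short usage sketch:

static void example_report_hypervisor(void)
{
	if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
		pr_info("running as a Hyper-V guest\n");
	else if (hypervisor_is_type(X86_HYPER_NATIVE))
		pr_info("running on bare metal\n");
}
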
+#endif /* _ASM_X86_HYPERVISOR_H */
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
deleted file mode 100644
index 175adf58dd4f..000000000000
--- a/arch/x86/include/asm/i387.h
+++ /dev/null
@@ -1,405 +0,0 @@
-/*
- * Copyright (C) 1994 Linus Torvalds
- *
- * Pentium III FXSR, SSE support
- * General FPU state handling cleanups
- * Gareth Hughes <gareth@valinux.com>, May 2000
- * x86-64 work by Andi Kleen 2002
- */
-
-#ifndef _ASM_X86_I387_H
-#define _ASM_X86_I387_H
-
-#include <linux/sched.h>
-#include <linux/kernel_stat.h>
-#include <linux/regset.h>
-#include <linux/hardirq.h>
-#include <asm/asm.h>
-#include <asm/processor.h>
-#include <asm/sigcontext.h>
-#include <asm/user.h>
-#include <asm/uaccess.h>
-#include <asm/xsave.h>
-
-extern unsigned int sig_xstate_size;
-extern void fpu_init(void);
-extern void mxcsr_feature_mask_init(void);
-extern int init_fpu(struct task_struct *child);
-extern asmlinkage void math_state_restore(void);
-extern void init_thread_xstate(void);
-extern int dump_fpu(struct pt_regs *, struct user_i387_struct *);
-
-extern user_regset_active_fn fpregs_active, xfpregs_active;
-extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get;
-extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set;
-
-extern struct _fpx_sw_bytes fx_sw_reserved;
-#ifdef CONFIG_IA32_EMULATION
-extern unsigned int sig_xstate_ia32_size;
-extern struct _fpx_sw_bytes fx_sw_reserved_ia32;
-struct _fpstate_ia32;
-struct _xstate_ia32;
-extern int save_i387_xstate_ia32(void __user *buf);
-extern int restore_i387_xstate_ia32(void __user *buf);
-#endif
-
-#define X87_FSW_ES (1 << 7) /* Exception Summary */
-
-#ifdef CONFIG_X86_64
-
-/* Ignore delayed exceptions from user space */
-static inline void tolerant_fwait(void)
-{
- asm volatile("1: fwait\n"
- "2:\n"
- _ASM_EXTABLE(1b, 2b));
-}
-
-static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
-{
- int err;
-
- asm volatile("1: rex64/fxrstor (%[fx])\n\t"
- "2:\n"
- ".section .fixup,\"ax\"\n"
- "3: movl $-1,%[err]\n"
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b, 3b)
- : [err] "=r" (err)
-#if 0 /* See comment in fxsave() below. */
- : [fx] "r" (fx), "m" (*fx), "0" (0));
-#else
- : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0));
-#endif
- return err;
-}
-
-/* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception
- is pending. Clear the x87 state here by setting it to fixed
- values. The kernel data segment can be sometimes 0 and sometimes
- new user value. Both should be ok.
- Use the PDA as safe address because it should be already in L1. */
-static inline void clear_fpu_state(struct task_struct *tsk)
-{
- struct xsave_struct *xstate = &tsk->thread.xstate->xsave;
- struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave;
-
- /*
- * xsave header may indicate the init state of the FP.
- */
- if ((task_thread_info(tsk)->status & TS_XSAVE) &&
- !(xstate->xsave_hdr.xstate_bv & XSTATE_FP))
- return;
-
- if (unlikely(fx->swd & X87_FSW_ES))
- asm volatile("fnclex");
- alternative_input(ASM_NOP8 ASM_NOP2,
- " emms\n" /* clear stack tags */
- " fildl %%gs:0", /* load to clear state */
- X86_FEATURE_FXSAVE_LEAK);
-}
-
-static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
-{
- int err;
-
- asm volatile("1: rex64/fxsave (%[fx])\n\t"
- "2:\n"
- ".section .fixup,\"ax\"\n"
- "3: movl $-1,%[err]\n"
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b, 3b)
- : [err] "=r" (err), "=m" (*fx)
-#if 0 /* See comment in fxsave() below. */
- : [fx] "r" (fx), "0" (0));
-#else
- : [fx] "cdaSDb" (fx), "0" (0));
-#endif
- if (unlikely(err) &&
- __clear_user(fx, sizeof(struct i387_fxsave_struct)))
- err = -EFAULT;
- /* No need to clear here because the caller clears USED_MATH */
- return err;
-}
-
-static inline void fxsave(struct task_struct *tsk)
-{
- /* Using "rex64; fxsave %0" is broken because, if the memory operand
- uses any extended registers for addressing, a second REX prefix
- will be generated (to the assembler, rex64 followed by semicolon
- is a separate instruction), and hence the 64-bitness is lost. */
-#if 0
- /* Using "fxsaveq %0" would be the ideal choice, but is only supported
- starting with gas 2.16. */
- __asm__ __volatile__("fxsaveq %0"
- : "=m" (tsk->thread.xstate->fxsave));
-#elif 0
- /* Using, as a workaround, the properly prefixed form below isn't
- accepted by any binutils version so far released, complaining that
- the same type of prefix is used twice if an extended register is
- needed for addressing (fix submitted to mainline 2005-11-21). */
- __asm__ __volatile__("rex64/fxsave %0"
- : "=m" (tsk->thread.xstate->fxsave));
-#else
- /* This, however, we can work around by forcing the compiler to select
- an addressing mode that doesn't require extended registers. */
- __asm__ __volatile__("rex64/fxsave (%1)"
- : "=m" (tsk->thread.xstate->fxsave)
- : "cdaSDb" (&tsk->thread.xstate->fxsave));
-#endif
-}
-
-static inline void __save_init_fpu(struct task_struct *tsk)
-{
- if (task_thread_info(tsk)->status & TS_XSAVE)
- xsave(tsk);
- else
- fxsave(tsk);
-
- clear_fpu_state(tsk);
- task_thread_info(tsk)->status &= ~TS_USEDFPU;
-}
-
-#else /* CONFIG_X86_32 */
-
-#ifdef CONFIG_MATH_EMULATION
-extern void finit_task(struct task_struct *tsk);
-#else
-static inline void finit_task(struct task_struct *tsk)
-{
-}
-#endif
-
-static inline void tolerant_fwait(void)
-{
- asm volatile("fnclex ; fwait");
-}
-
-/* perform fxrstor iff the processor has extended states, otherwise frstor */
-static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
-{
- /*
- * The "nop" is needed to make the instructions the same
- * length.
- */
- alternative_input(
- "nop ; frstor %1",
- "fxrstor %1",
- X86_FEATURE_FXSR,
- "m" (*fx));
-
- return 0;
-}
-
-/* We need a safe address that is cheap to find and that is already
- in L1 during context switch. The best choices are unfortunately
- different for UP and SMP */
-#ifdef CONFIG_SMP
-#define safe_address (__per_cpu_offset[0])
-#else
-#define safe_address (kstat_cpu(0).cpustat.user)
-#endif
-
-/*
- * These must be called with preempt disabled
- */
-static inline void __save_init_fpu(struct task_struct *tsk)
-{
- if (task_thread_info(tsk)->status & TS_XSAVE) {
- struct xsave_struct *xstate = &tsk->thread.xstate->xsave;
- struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave;
-
- xsave(tsk);
-
- /*
- * xsave header may indicate the init state of the FP.
- */
- if (!(xstate->xsave_hdr.xstate_bv & XSTATE_FP))
- goto end;
-
- if (unlikely(fx->swd & X87_FSW_ES))
- asm volatile("fnclex");
-
- /*
- * we can do a simple return here or be paranoid :)
- */
- goto clear_state;
- }
-
- /* Use more nops than strictly needed in case the compiler
- varies code */
- alternative_input(
- "fnsave %[fx] ;fwait;" GENERIC_NOP8 GENERIC_NOP4,
- "fxsave %[fx]\n"
- "bt $7,%[fsw] ; jnc 1f ; fnclex\n1:",
- X86_FEATURE_FXSR,
- [fx] "m" (tsk->thread.xstate->fxsave),
- [fsw] "m" (tsk->thread.xstate->fxsave.swd) : "memory");
-clear_state:
- /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
- is pending. Clear the x87 state here by setting it to fixed
- values. safe_address is a random variable that should be in L1 */
- alternative_input(
- GENERIC_NOP8 GENERIC_NOP2,
- "emms\n\t" /* clear stack tags */
- "fildl %[addr]", /* set F?P to defined value */
- X86_FEATURE_FXSAVE_LEAK,
- [addr] "m" (safe_address));
-end:
- task_thread_info(tsk)->status &= ~TS_USEDFPU;
-}
-
-#endif /* CONFIG_X86_64 */
-
-static inline int restore_fpu_checking(struct task_struct *tsk)
-{
- if (task_thread_info(tsk)->status & TS_XSAVE)
- return xrstor_checking(&tsk->thread.xstate->xsave);
- else
- return fxrstor_checking(&tsk->thread.xstate->fxsave);
-}
-
-/*
- * Signal frame handlers...
- */
-extern int save_i387_xstate(void __user *buf);
-extern int restore_i387_xstate(void __user *buf);
-
-static inline void __unlazy_fpu(struct task_struct *tsk)
-{
- if (task_thread_info(tsk)->status & TS_USEDFPU) {
- __save_init_fpu(tsk);
- stts();
- } else
- tsk->fpu_counter = 0;
-}
-
-static inline void __clear_fpu(struct task_struct *tsk)
-{
- if (task_thread_info(tsk)->status & TS_USEDFPU) {
- tolerant_fwait();
- task_thread_info(tsk)->status &= ~TS_USEDFPU;
- stts();
- }
-}
-
-static inline void kernel_fpu_begin(void)
-{
- struct thread_info *me = current_thread_info();
- preempt_disable();
- if (me->status & TS_USEDFPU)
- __save_init_fpu(me->task);
- else
- clts();
-}
-
-static inline void kernel_fpu_end(void)
-{
- stts();
- preempt_enable();
-}
-
-/*
- * Some instructions like VIA's padlock instructions generate a spurious
- * DNA fault but don't modify SSE registers. And these instructions
- * get used from interrupt context as well. To prevent these kernel instructions
- * in interrupt context interacting wrongly with other user/kernel fpu usage, we
- * should use them only in the context of irq_ts_save/restore()
- */
-static inline int irq_ts_save(void)
-{
- /*
- * If in process context and not atomic, we can take a spurious DNA fault.
- * Otherwise, doing clts() in process context requires disabling preemption
- * or some heavy lifting like kernel_fpu_begin()
- */
- if (!in_atomic())
- return 0;
-
- if (read_cr0() & X86_CR0_TS) {
- clts();
- return 1;
- }
-
- return 0;
-}
-
-static inline void irq_ts_restore(int TS_state)
-{
- if (TS_state)
- stts();
-}
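The comment above describes the intended bracket: code that issues FPU-adjacent instructions from interrupt context (e.g. VIA PadLock) wraps them in irq_ts_save()/irq_ts_restore() so a set CR0.TS is temporarily cleared and then put back. A minimal sketch of that pattern:

static void example_padlock_section(void)
{
	int ts_state = irq_ts_save();	/* clears CR0.TS only in atomic context */

	/* ...execute the instruction that would otherwise raise a spurious #NM... */

	irq_ts_restore(ts_state);	/* sets CR0.TS again only if it was cleared */
}
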
-
-#ifdef CONFIG_X86_64
-
-static inline void save_init_fpu(struct task_struct *tsk)
-{
- __save_init_fpu(tsk);
- stts();
-}
-
-#define unlazy_fpu __unlazy_fpu
-#define clear_fpu __clear_fpu
-
-#else /* CONFIG_X86_32 */
-
-/*
- * These disable preemption on their own and are safe
- */
-static inline void save_init_fpu(struct task_struct *tsk)
-{
- preempt_disable();
- __save_init_fpu(tsk);
- stts();
- preempt_enable();
-}
-
-static inline void unlazy_fpu(struct task_struct *tsk)
-{
- preempt_disable();
- __unlazy_fpu(tsk);
- preempt_enable();
-}
-
-static inline void clear_fpu(struct task_struct *tsk)
-{
- preempt_disable();
- __clear_fpu(tsk);
- preempt_enable();
-}
-
-#endif /* CONFIG_X86_64 */
-
-/*
- * i387 state interaction
- */
-static inline unsigned short get_fpu_cwd(struct task_struct *tsk)
-{
- if (cpu_has_fxsr) {
- return tsk->thread.xstate->fxsave.cwd;
- } else {
- return (unsigned short)tsk->thread.xstate->fsave.cwd;
- }
-}
-
-static inline unsigned short get_fpu_swd(struct task_struct *tsk)
-{
- if (cpu_has_fxsr) {
- return tsk->thread.xstate->fxsave.swd;
- } else {
- return (unsigned short)tsk->thread.xstate->fsave.swd;
- }
-}
-
-static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk)
-{
- if (cpu_has_xmm) {
- return tsk->thread.xstate->fxsave.mxcsr;
- } else {
- return MXCSR_DEFAULT;
- }
-}
-
-#endif /* _ASM_X86_I387_H */
diff --git a/arch/x86/include/asm/i8253.h b/arch/x86/include/asm/i8253.h
deleted file mode 100644
index 1edbf89680fd..000000000000
--- a/arch/x86/include/asm/i8253.h
+++ /dev/null
@@ -1,18 +0,0 @@
-#ifndef _ASM_X86_I8253_H
-#define _ASM_X86_I8253_H
-
-/* i8253A PIT registers */
-#define PIT_MODE 0x43
-#define PIT_CH0 0x40
-#define PIT_CH2 0x42
-
-extern spinlock_t i8253_lock;
-
-extern struct clock_event_device *global_clock_event;
-
-extern void setup_pit_timer(void);
-
-#define inb_pit inb_p
-#define outb_pit outb_p
-
-#endif /* _ASM_X86_I8253_H */
diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h
index 58d7091eeb1f..c715097e92fd 100644
--- a/arch/x86/include/asm/i8259.h
+++ b/arch/x86/include/asm/i8259.h
@@ -1,7 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_I8259_H
#define _ASM_X86_I8259_H
#include <linux/delay.h>
+#include <asm/io.h>
extern unsigned int cached_irq_mask;
@@ -17,6 +19,8 @@ extern unsigned int cached_irq_mask;
#define PIC_MASTER_OCW3 PIC_MASTER_ISR
#define PIC_SLAVE_CMD 0xa0
#define PIC_SLAVE_IMR 0xa1
+#define PIC_ELCR1 0x4d0
+#define PIC_ELCR2 0x4d1
/* i8259A PIC related value */
#define PIC_CASCADE_IR 2
@@ -24,12 +28,7 @@ extern unsigned int cached_irq_mask;
#define SLAVE_ICW4_DEFAULT 0x01
#define PIC_ICW4_AEOI 2
-extern spinlock_t i8259A_lock;
-
-extern void init_8259A(int auto_eoi);
-extern void enable_8259A_irq(unsigned int irq);
-extern void disable_8259A_irq(unsigned int irq);
-extern unsigned int startup_8259A_irq(unsigned int irq);
+extern raw_spinlock_t i8259A_lock;
/* the PIC may need a careful delay on some platforms, hence specific calls */
static inline unsigned char inb_pic(unsigned int port)
@@ -57,7 +56,32 @@ static inline void outb_pic(unsigned char value, unsigned int port)
extern struct irq_chip i8259A_chip;
-extern void mask_8259A(void);
-extern void unmask_8259A(void);
+struct legacy_pic {
+ int nr_legacy_irqs;
+ struct irq_chip *chip;
+ void (*mask)(unsigned int irq);
+ void (*unmask)(unsigned int irq);
+ void (*mask_all)(void);
+ void (*restore_mask)(void);
+ void (*init)(int auto_eoi);
+ int (*probe)(void);
+ int (*irq_pending)(unsigned int irq);
+ void (*make_irq)(unsigned int irq);
+};
+
+void legacy_pic_pcat_compat(void);
+
+extern struct legacy_pic *legacy_pic;
+extern struct legacy_pic null_legacy_pic;
+
+static inline bool has_legacy_pic(void)
+{
+ return legacy_pic != &null_legacy_pic;
+}
+
+static inline int nr_legacy_irqs(void)
+{
+ return legacy_pic->nr_legacy_irqs;
+}
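struct legacy_pic lets platforms without an i8259 plug in null_legacy_pic, so callers ask has_legacy_pic()/nr_legacy_irqs() instead of assuming the hardware exists. A short usage sketch:

static void example_quiesce_pic(void)
{
	if (!has_legacy_pic())
		return;				/* e.g. PIC-less platforms */

	legacy_pic->mask_all();			/* mask every legacy IRQ */
	pr_info("masked %d legacy IRQs\n", nr_legacy_irqs());
}
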
#endif /* _ASM_X86_I8259_H */
diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h
index 1f7e62517284..9d69f3f8dbab 100644
--- a/arch/x86/include/asm/ia32.h
+++ b/arch/x86/include/asm/ia32.h
@@ -1,7 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_IA32_H
#define _ASM_X86_IA32_H
-
#ifdef CONFIG_IA32_EMULATION
#include <linux/compat.h>
@@ -10,36 +10,15 @@
* 32 bit structures for IA32 support.
*/
-#include <asm/sigcontext32.h>
+#include <uapi/asm/sigcontext.h>
/* signal.h */
-struct sigaction32 {
- unsigned int sa_handler; /* Really a pointer, but need to deal
- with 32 bits */
- unsigned int sa_flags;
- unsigned int sa_restorer; /* Another 32 bit pointer */
- compat_sigset_t sa_mask; /* A 32 bit mask */
-};
-
-struct old_sigaction32 {
- unsigned int sa_handler; /* Really a pointer, but need to deal
- with 32 bits */
- compat_old_sigset_t sa_mask; /* A 32 bit mask */
- unsigned int sa_flags;
- unsigned int sa_restorer; /* Another 32 bit pointer */
-};
-
-typedef struct sigaltstack_ia32 {
- unsigned int ss_sp;
- int ss_flags;
- unsigned int ss_size;
-} stack_ia32_t;
struct ucontext_ia32 {
unsigned int uc_flags;
unsigned int uc_link;
- stack_ia32_t uc_stack;
- struct sigcontext_ia32 uc_mcontext;
+ compat_stack_t uc_stack;
+ struct sigcontext_32 uc_mcontext;
compat_sigset_t uc_sigmask; /* mask last for extensibility */
};
@@ -77,69 +56,37 @@ struct stat64 {
unsigned long long st_ino;
} __attribute__((packed));
-typedef struct compat_siginfo {
- int si_signo;
- int si_errno;
- int si_code;
-
- union {
- int _pad[((128 / sizeof(int)) - 3)];
-
- /* kill() */
- struct {
- unsigned int _pid; /* sender's pid */
- unsigned int _uid; /* sender's uid */
- } _kill;
-
- /* POSIX.1b timers */
- struct {
- compat_timer_t _tid; /* timer id */
- int _overrun; /* overrun count */
- compat_sigval_t _sigval; /* same as below */
- int _sys_private; /* not to be passed to user */
- int _overrun_incr; /* amount to add to overrun */
- } _timer;
-
- /* POSIX.1b signals */
- struct {
- unsigned int _pid; /* sender's pid */
- unsigned int _uid; /* sender's uid */
- compat_sigval_t _sigval;
- } _rt;
-
- /* SIGCHLD */
- struct {
- unsigned int _pid; /* which child */
- unsigned int _uid; /* sender's uid */
- int _status; /* exit code */
- compat_clock_t _utime;
- compat_clock_t _stime;
- } _sigchld;
-
- /* SIGILL, SIGFPE, SIGSEGV, SIGBUS */
- struct {
- unsigned int _addr; /* faulting insn/memory ref. */
- } _sigfault;
-
- /* SIGPOLL */
- struct {
- int _band; /* POLL_IN, POLL_OUT, POLL_MSG */
- int _fd;
- } _sigpoll;
- } _sifields;
-} compat_siginfo_t;
-
-#define IA32_STACK_TOP IA32_PAGE_OFFSET
-
-#ifdef __KERNEL__
-struct linux_binprm;
-extern int ia32_setup_arg_pages(struct linux_binprm *bprm,
- unsigned long stack_top, int exec_stack);
-struct mm_struct;
-extern void ia32_pick_mmap_layout(struct mm_struct *mm);
+extern bool __ia32_enabled;
+
+static __always_inline bool ia32_enabled(void)
+{
+ return __ia32_enabled;
+}
+
+static inline void ia32_disable(void)
+{
+ __ia32_enabled = false;
+}
+
+#else /* !CONFIG_IA32_EMULATION */
+
+static __always_inline bool ia32_enabled(void)
+{
+ return IS_ENABLED(CONFIG_X86_32);
+}
+
+static inline void ia32_disable(void) {}
#endif
-#endif /* !CONFIG_IA32_SUPPORT */
+static inline bool ia32_enabled_verbose(void)
+{
+ bool enabled = ia32_enabled();
+
+ if (IS_ENABLED(CONFIG_IA32_EMULATION) && !enabled)
+ pr_notice_once("32-bit emulation disabled. You can reenable with ia32_emulation=on\n");
+
+ return enabled;
+}
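ia32_enabled() reports whether 32-bit compat support is currently live (it can be switched off at boot, see the ia32_emulation hint above), and ia32_enabled_verbose() additionally prints that hint once when something trips over it being disabled. A hedged usage sketch (the function name is illustrative):

static int example_setup_compat(void)
{
	if (!ia32_enabled_verbose())
		return -ENOSYS;		/* 32-bit emulation is switched off */

	/* ...continue with the 32-bit specific setup... */
	return 0;
}
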
#endif /* _ASM_X86_IA32_H */
diff --git a/arch/x86/include/asm/ia32_unistd.h b/arch/x86/include/asm/ia32_unistd.h
deleted file mode 100644
index 976f6ecd2ce6..000000000000
--- a/arch/x86/include/asm/ia32_unistd.h
+++ /dev/null
@@ -1,18 +0,0 @@
-#ifndef _ASM_X86_IA32_UNISTD_H
-#define _ASM_X86_IA32_UNISTD_H
-
-/*
- * This file contains the system call numbers of the ia32 port,
- * this is for the kernel only.
- * Only add syscalls here where some part of the kernel needs to know
- * the number. This should be otherwise in sync with asm-x86/unistd_32.h. -AK
- */
-
-#define __NR_ia32_restart_syscall 0
-#define __NR_ia32_exit 1
-#define __NR_ia32_read 3
-#define __NR_ia32_write 4
-#define __NR_ia32_sigreturn 119
-#define __NR_ia32_rt_sigreturn 173
-
-#endif /* _ASM_X86_IA32_UNISTD_H */
diff --git a/arch/x86/include/asm/ibt.h b/arch/x86/include/asm/ibt.h
new file mode 100644
index 000000000000..5e45d6424722
--- /dev/null
+++ b/arch/x86/include/asm/ibt.h
@@ -0,0 +1,117 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_IBT_H
+#define _ASM_X86_IBT_H
+
+#include <linux/types.h>
+
+/*
+ * The rules for enabling IBT are:
+ *
+ * - CC_HAS_IBT: the toolchain supports it
+ * - X86_KERNEL_IBT: it is selected in Kconfig
+ * - !__DISABLE_EXPORTS: this is regular kernel code
+ *
+ * Esp. that latter one is a bit non-obvious, but some code like compressed,
+ * purgatory, realmode etc.. is built with custom CFLAGS that do not include
+ * -fcf-protection=branch and things will go *bang*.
+ *
+ * When all the above are satisfied, HAS_KERNEL_IBT will be 1, otherwise 0.
+ */
+#if defined(CONFIG_X86_KERNEL_IBT) && !defined(__DISABLE_EXPORTS)
+
+#define HAS_KERNEL_IBT 1
+
+#ifndef __ASSEMBLER__
+
+#ifdef CONFIG_X86_64
+#define ASM_ENDBR "endbr64\n\t"
+#else
+#define ASM_ENDBR "endbr32\n\t"
+#endif
+
+#define __noendbr __attribute__((nocf_check))
+
+/*
+ * Create a dummy function pointer reference to prevent objtool from marking
+ * the function as needing to be "sealed" (i.e. ENDBR converted to NOP by
+ * apply_seal_endbr()).
+ */
+#define IBT_NOSEAL(fname) \
+ ".pushsection .discard.ibt_endbr_noseal\n\t" \
+ _ASM_PTR fname "\n\t" \
+ ".popsection\n\t"
+
+static __always_inline __attribute_const__ u32 gen_endbr(void)
+{
+ u32 endbr;
+
+ /*
+ * Generate ENDBR64 in a way that is sure to not result in
+ * an ENDBR64 instruction as immediate.
+ */
+ asm ( "mov $~0xfa1e0ff3, %[endbr]\n\t"
+ "not %[endbr]\n\t"
+ : [endbr] "=&r" (endbr) );
+
+ return endbr;
+}
+
+static __always_inline __attribute_const__ u32 gen_endbr_poison(void)
+{
+ /*
+ * 4 byte NOP that isn't NOP4, such that it will be unique to (former)
+ * ENDBR sites. Additionally it carries UDB as immediate.
+ */
+ return 0xd6401f0f; /* nopl -42(%rax) */
+}
+
+static inline bool __is_endbr(u32 val)
+{
+ if (val == gen_endbr_poison())
+ return true;
+
+ val &= ~0x01000000U; /* ENDBR32 -> ENDBR64 */
+ return val == gen_endbr();
+}
+
+extern __noendbr bool is_endbr(u32 *val);
+extern __noendbr u64 ibt_save(bool disable);
+extern __noendbr void ibt_restore(u64 save);
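is_endbr() answers whether the four bytes at a code address are an ENDBR (or the poison an ENDBR was sealed into), and gen_endbr() builds the opcode with a mov/not pair so the literal never appears as an instruction immediate in kernel text. A hedged sketch of a caller-side check (the helper name is illustrative):

static bool example_valid_ibt_target(void *addr)
{
	return !HAS_KERNEL_IBT || is_endbr((u32 *)addr);
}
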
+
+#else /* __ASSEMBLER__ */
+
+#ifdef CONFIG_X86_64
+#define ENDBR endbr64
+#else
+#define ENDBR endbr32
+#endif
+
+#endif /* __ASSEMBLER__ */
+
+#else /* !IBT */
+
+#define HAS_KERNEL_IBT 0
+
+#ifndef __ASSEMBLER__
+
+#define ASM_ENDBR
+#define IBT_NOSEAL(name)
+
+#define __noendbr
+
+static inline bool is_endbr(u32 *val) { return false; }
+
+static inline u64 ibt_save(bool disable) { return 0; }
+static inline void ibt_restore(u64 save) { }
+
+#else /* __ASSEMBLER__ */
+
+#define ENDBR
+
+#endif /* __ASSEMBLER__ */
+
+#endif /* CONFIG_X86_KERNEL_IBT */
+
+#define ENDBR_INSN_SIZE (4*HAS_KERNEL_IBT)
+
+#endif /* _ASM_X86_IBT_H */
diff --git a/arch/x86/include/asm/idle.h b/arch/x86/include/asm/idle.h
deleted file mode 100644
index 38d87379e270..000000000000
--- a/arch/x86/include/asm/idle.h
+++ /dev/null
@@ -1,21 +0,0 @@
-#ifndef _ASM_X86_IDLE_H
-#define _ASM_X86_IDLE_H
-
-#define IDLE_START 1
-#define IDLE_END 2
-
-struct notifier_block;
-void idle_notifier_register(struct notifier_block *n);
-void idle_notifier_unregister(struct notifier_block *n);
-
-#ifdef CONFIG_X86_64
-void enter_idle(void);
-void exit_idle(void);
-#else /* !CONFIG_X86_64 */
-static inline void enter_idle(void) { }
-static inline void exit_idle(void) { }
-#endif /* CONFIG_X86_64 */
-
-void c1e_remove_cpu(int cpu);
-
-#endif /* _ASM_X86_IDLE_H */
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
new file mode 100644
index 000000000000..3218770670d3
--- /dev/null
+++ b/arch/x86/include/asm/idtentry.h
@@ -0,0 +1,775 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_IDTENTRY_H
+#define _ASM_X86_IDTENTRY_H
+
+/* Interrupts/Exceptions */
+#include <asm/trapnr.h>
+
+#define IDT_ALIGN (8 * (1 + HAS_KERNEL_IBT))
+
+#ifndef __ASSEMBLER__
+#include <linux/entry-common.h>
+#include <linux/hardirq.h>
+
+#include <asm/irq_stack.h>
+
+typedef void (*idtentry_t)(struct pt_regs *regs);
+
+/**
+ * DECLARE_IDTENTRY - Declare functions for simple IDT entry points
+ * No error code pushed by hardware
+ * @vector: Vector number (ignored for C)
+ * @func: Function name of the entry point
+ *
+ * Declares four functions:
+ * - The ASM entry point: asm_##func
+ * - The XEN PV trap entry point: xen_##func (maybe unused)
+ * - The C handler called from the FRED event dispatcher (maybe unused)
+ * - The C handler called from the ASM entry point
+ *
+ * Note: This is the C variant of DECLARE_IDTENTRY(). As the name says it
+ * declares the entry points for usage in C code. There is an ASM variant
+ * as well which is used to emit the entry stubs in entry_32/64.S.
+ */
+#define DECLARE_IDTENTRY(vector, func) \
+ asmlinkage void asm_##func(void); \
+ asmlinkage void xen_asm_##func(void); \
+ void fred_##func(struct pt_regs *regs); \
+ __visible void func(struct pt_regs *regs)
+
+/**
+ * DEFINE_IDTENTRY - Emit code for simple IDT entry points
+ * @func: Function name of the entry point
+ *
+ * @func is called from ASM entry code with interrupts disabled.
+ *
+ * The macro is written so it acts as function definition. Append the
+ * body with a pair of curly brackets.
+ *
+ * irqentry_enter() contains common code which has to be invoked before
+ * arbitrary code in the body. irqentry_exit() contains common code
+ * which has to run before returning to the low level assembly code.
+ */
+#define DEFINE_IDTENTRY(func) \
+static __always_inline void __##func(struct pt_regs *regs); \
+ \
+__visible noinstr void func(struct pt_regs *regs) \
+{ \
+ irqentry_state_t state = irqentry_enter(regs); \
+ \
+ instrumentation_begin(); \
+ __##func (regs); \
+ instrumentation_end(); \
+ irqentry_exit(regs, state); \
+} \
+ \
+static __always_inline void __##func(struct pt_regs *regs)
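Paired up, DECLARE_IDTENTRY() in a header and DEFINE_IDTENTRY() in a C file produce the ASM stub name, the noinstr wrapper with the irqentry_enter()/exit() and instrumentation bookkeeping, and the inline body. A hedged sketch with made-up names (X86_TRAP_XX and exc_example are illustrative only; the real users are the exception handlers in arch/x86/kernel/traps.c):

/* In a header: */
DECLARE_IDTENTRY(X86_TRAP_XX, exc_example);

/* In a .c file: */
DEFINE_IDTENTRY(exc_example)
{
	/* Runs with interrupts disabled, bracketed by irqentry_enter()/exit(). */
	pr_err("example trap at %pS\n", (void *)regs->ip);
}
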
+
+/* Special case for 32bit IRET 'trap' */
+#define DECLARE_IDTENTRY_SW DECLARE_IDTENTRY
+#define DEFINE_IDTENTRY_SW DEFINE_IDTENTRY
+
+/**
+ * DECLARE_IDTENTRY_ERRORCODE - Declare functions for simple IDT entry points
+ * Error code pushed by hardware
+ * @vector: Vector number (ignored for C)
+ * @func: Function name of the entry point
+ *
+ * Declares three functions:
+ * - The ASM entry point: asm_##func
+ * - The XEN PV trap entry point: xen_##func (maybe unused)
+ * - The C handler called from the ASM entry point
+ *
+ * Same as DECLARE_IDTENTRY, but has an extra error_code argument for the
+ * C-handler.
+ */
+#define DECLARE_IDTENTRY_ERRORCODE(vector, func) \
+ asmlinkage void asm_##func(void); \
+ asmlinkage void xen_asm_##func(void); \
+ __visible void func(struct pt_regs *regs, unsigned long error_code)
+
+/**
+ * DEFINE_IDTENTRY_ERRORCODE - Emit code for simple IDT entry points
+ * Error code pushed by hardware
+ * @func: Function name of the entry point
+ *
+ * Same as DEFINE_IDTENTRY, but has an extra error_code argument
+ */
+#define DEFINE_IDTENTRY_ERRORCODE(func) \
+static __always_inline void __##func(struct pt_regs *regs, \
+ unsigned long error_code); \
+ \
+__visible noinstr void func(struct pt_regs *regs, \
+ unsigned long error_code) \
+{ \
+ irqentry_state_t state = irqentry_enter(regs); \
+ \
+ instrumentation_begin(); \
+ __##func (regs, error_code); \
+ instrumentation_end(); \
+ irqentry_exit(regs, state); \
+} \
+ \
+static __always_inline void __##func(struct pt_regs *regs, \
+ unsigned long error_code)
+
+/**
+ * DECLARE_IDTENTRY_RAW - Declare functions for raw IDT entry points
+ * No error code pushed by hardware
+ * @vector: Vector number (ignored for C)
+ * @func: Function name of the entry point
+ *
+ * Maps to DECLARE_IDTENTRY().
+ */
+#define DECLARE_IDTENTRY_RAW(vector, func) \
+ DECLARE_IDTENTRY(vector, func)
+
+/**
+ * DEFINE_IDTENTRY_RAW - Emit code for raw IDT entry points
+ * @func: Function name of the entry point
+ *
+ * @func is called from ASM entry code with interrupts disabled.
+ *
+ * The macro is written so it acts as function definition. Append the
+ * body with a pair of curly brackets.
+ *
+ * Contrary to DEFINE_IDTENTRY() this does not invoke the
+ * idtentry_enter/exit() helpers before and after the body invocation. This
+ * needs to be done in the body itself if applicable. Use if extra work
+ * is required before the enter/exit() helpers are invoked.
+ */
+#define DEFINE_IDTENTRY_RAW(func) \
+__visible noinstr void func(struct pt_regs *regs)
+
+/**
+ * DEFINE_FREDENTRY_RAW - Emit code for raw FRED entry points
+ * @func: Function name of the entry point
+ *
+ * @func is called from the FRED event dispatcher with interrupts disabled.
+ *
+ * See @DEFINE_IDTENTRY_RAW for further details.
+ */
+#define DEFINE_FREDENTRY_RAW(func) \
+noinstr void fred_##func(struct pt_regs *regs)
+
+/**
+ * DECLARE_IDTENTRY_RAW_ERRORCODE - Declare functions for raw IDT entry points
+ * Error code pushed by hardware
+ * @vector: Vector number (ignored for C)
+ * @func: Function name of the entry point
+ *
+ * Maps to DECLARE_IDTENTRY_ERRORCODE()
+ */
+#define DECLARE_IDTENTRY_RAW_ERRORCODE(vector, func) \
+ DECLARE_IDTENTRY_ERRORCODE(vector, func)
+
+/**
+ * DEFINE_IDTENTRY_RAW_ERRORCODE - Emit code for raw IDT entry points
+ * @func: Function name of the entry point
+ *
+ * @func is called from ASM entry code with interrupts disabled.
+ *
+ * The macro is written so it acts as function definition. Append the
+ * body with a pair of curly brackets.
+ *
+ * Contrary to DEFINE_IDTENTRY_ERRORCODE() this does not invoke the
+ * irqentry_enter/exit() helpers before and after the body invocation. This
+ * needs to be done in the body itself if applicable. Use if extra work
+ * is required before the enter/exit() helpers are invoked.
+ */
+#define DEFINE_IDTENTRY_RAW_ERRORCODE(func) \
+__visible noinstr void func(struct pt_regs *regs, unsigned long error_code)
+
+/**
+ * DECLARE_IDTENTRY_IRQ - Declare functions for device interrupt IDT entry
+ * points (common/spurious)
+ * @vector: Vector number (ignored for C)
+ * @func: Function name of the entry point
+ *
+ * Maps to DECLARE_IDTENTRY_ERRORCODE()
+ */
+#define DECLARE_IDTENTRY_IRQ(vector, func) \
+ DECLARE_IDTENTRY_ERRORCODE(vector, func)
+
+/**
+ * DEFINE_IDTENTRY_IRQ - Emit code for device interrupt IDT entry points
+ * @func: Function name of the entry point
+ *
+ * The vector number is pushed by the low level entry stub and handed
+ * to the function as error_code argument which needs to be truncated
+ * to an u8 because the push is sign extending.
+ *
+ * irq_enter/exit_rcu() are invoked before the function body and the
+ * KVM L1D flush request is set. Stack switching to the interrupt stack
+ * has to be done in the function body if necessary.
+ */
+#define DEFINE_IDTENTRY_IRQ(func) \
+static void __##func(struct pt_regs *regs, u32 vector); \
+ \
+__visible noinstr void func(struct pt_regs *regs, \
+ unsigned long error_code) \
+{ \
+ irqentry_state_t state = irqentry_enter(regs); \
+ u32 vector = (u32)(u8)error_code; \
+ \
+ kvm_set_cpu_l1tf_flush_l1d(); \
+ instrumentation_begin(); \
+ run_irq_on_irqstack_cond(__##func, regs, vector); \
+ instrumentation_end(); \
+ irqentry_exit(regs, state); \
+} \
+ \
+static noinline void __##func(struct pt_regs *regs, u32 vector)
+
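+/*
+ * Illustrative usage sketch (editorial addition, not from the original
+ * header); the handler name is hypothetical:
+ *
+ *   DEFINE_IDTENTRY_IRQ(example_interrupt)
+ *   {
+ *       // 'vector' holds the truncated vector number pushed by the stub;
+ *       // dispatch to the matching device handler here
+ *   }
+ */
+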
+/**
+ * DECLARE_IDTENTRY_SYSVEC - Declare functions for system vector entry points
+ * @vector: Vector number (ignored for C)
+ * @func: Function name of the entry point
+ *
+ * Declares three functions:
+ * - The ASM entry point: asm_##func
+ * - The XEN PV trap entry point: xen_##func (maybe unused)
+ * - The C handler called from the ASM entry point
+ *
+ * Maps to DECLARE_IDTENTRY().
+ */
+#define DECLARE_IDTENTRY_SYSVEC(vector, func) \
+ DECLARE_IDTENTRY(vector, func)
+
+/**
+ * DEFINE_IDTENTRY_SYSVEC - Emit code for system vector IDT entry points
+ * @func: Function name of the entry point
+ *
+ * irqentry_enter/exit() and irq_enter/exit_rcu() are invoked before the
+ * function body. KVM L1D flush request is set.
+ *
+ * Runs the function on the interrupt stack if the entry hit kernel mode.
+ */
+#define DEFINE_IDTENTRY_SYSVEC(func) \
+static void __##func(struct pt_regs *regs); \
+ \
+static __always_inline void instr_##func(struct pt_regs *regs) \
+{ \
+ run_sysvec_on_irqstack_cond(__##func, regs); \
+} \
+ \
+__visible noinstr void func(struct pt_regs *regs) \
+{ \
+ irqentry_state_t state = irqentry_enter(regs); \
+ \
+ kvm_set_cpu_l1tf_flush_l1d(); \
+ instrumentation_begin(); \
+ instr_##func (regs); \
+ instrumentation_end(); \
+ irqentry_exit(regs, state); \
+} \
+ \
+void fred_##func(struct pt_regs *regs) \
+{ \
+ instr_##func (regs); \
+} \
+ \
+static noinline void __##func(struct pt_regs *regs)
+
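+/*
+ * Illustrative usage sketch (editorial addition, not from the original
+ * header); the handler name is hypothetical:
+ *
+ *   DEFINE_IDTENTRY_SYSVEC(sysvec_example_ipi)
+ *   {
+ *       // acknowledge the interrupt at the local APIC and handle it;
+ *       // the macro already runs this on the IRQ stack if appropriate
+ *   }
+ */
+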
+/**
+ * DEFINE_IDTENTRY_SYSVEC_SIMPLE - Emit code for simple system vector IDT
+ * entry points
+ * @func: Function name of the entry point
+ *
+ * Runs the function on the interrupted stack. No switch to IRQ stack and
+ * only the minimal __irq_enter/exit() handling.
+ *
+ * Only use for 'empty' vectors like reschedule IPI and KVM posted
+ * interrupt vectors.
+ */
+#define DEFINE_IDTENTRY_SYSVEC_SIMPLE(func) \
+static __always_inline void __##func(struct pt_regs *regs); \
+ \
+static __always_inline void instr_##func(struct pt_regs *regs) \
+{ \
+ __irq_enter_raw(); \
+ __##func (regs); \
+ __irq_exit_raw(); \
+} \
+ \
+__visible noinstr void func(struct pt_regs *regs) \
+{ \
+ irqentry_state_t state = irqentry_enter(regs); \
+ \
+ kvm_set_cpu_l1tf_flush_l1d(); \
+ instrumentation_begin(); \
+ instr_##func (regs); \
+ instrumentation_end(); \
+ irqentry_exit(regs, state); \
+} \
+ \
+void fred_##func(struct pt_regs *regs) \
+{ \
+ instr_##func (regs); \
+} \
+ \
+static __always_inline void __##func(struct pt_regs *regs)
+
+/**
+ * DECLARE_IDTENTRY_XENCB - Declare functions for XEN HV callback entry point
+ * @vector: Vector number (ignored for C)
+ * @func: Function name of the entry point
+ *
+ * Declares three functions:
+ * - The ASM entry point: asm_##func
+ * - The XEN PV trap entry point: xen_##func (maybe unused)
+ * - The C handler called from the ASM entry point
+ *
+ * Maps to DECLARE_IDTENTRY(). Distinct entry point to handle the 32/64-bit
+ * difference
+ */
+#define DECLARE_IDTENTRY_XENCB(vector, func) \
+ DECLARE_IDTENTRY(vector, func)
+
+#ifdef CONFIG_X86_64
+/**
+ * DECLARE_IDTENTRY_IST - Declare functions for IST handling IDT entry points
+ * @vector: Vector number (ignored for C)
+ * @func: Function name of the entry point
+ *
+ * Maps to DECLARE_IDTENTRY_RAW, but declares also the NOIST C handler
+ * which is called from the ASM entry point on user mode entry
+ */
+#define DECLARE_IDTENTRY_IST(vector, func) \
+ DECLARE_IDTENTRY_RAW(vector, func); \
+ __visible void noist_##func(struct pt_regs *regs)
+
+/**
+ * DECLARE_IDTENTRY_VC - Declare functions for the VC entry point
+ * @vector: Vector number (ignored for C)
+ * @func: Function name of the entry point
+ *
+ * Maps to DECLARE_IDTENTRY_RAW_ERRORCODE, but declares also the
+ * safe_stack C handler.
+ */
+#define DECLARE_IDTENTRY_VC(vector, func) \
+ DECLARE_IDTENTRY_RAW_ERRORCODE(vector, func); \
+ __visible noinstr void kernel_##func(struct pt_regs *regs, unsigned long error_code); \
+ __visible noinstr void user_##func(struct pt_regs *regs, unsigned long error_code)
+
+/**
+ * DEFINE_IDTENTRY_IST - Emit code for IST entry points
+ * @func: Function name of the entry point
+ *
+ * Maps to DEFINE_IDTENTRY_RAW
+ */
+#define DEFINE_IDTENTRY_IST(func) \
+ DEFINE_IDTENTRY_RAW(func)
+
+/**
+ * DEFINE_IDTENTRY_NOIST - Emit code for NOIST entry points which
+ * belong to an IST entry point (MCE, DB)
+ * @func: Function name of the entry point. Must be the same as
+ * the function name of the corresponding IST variant
+ *
+ * Maps to DEFINE_IDTENTRY_RAW().
+ */
+#define DEFINE_IDTENTRY_NOIST(func) \
+ DEFINE_IDTENTRY_RAW(noist_##func)
+
+/**
+ * DECLARE_IDTENTRY_DF - Declare functions for double fault
+ * @vector: Vector number (ignored for C)
+ * @func: Function name of the entry point
+ *
+ * Maps to DECLARE_IDTENTRY_RAW_ERRORCODE
+ */
+#define DECLARE_IDTENTRY_DF(vector, func) \
+ DECLARE_IDTENTRY_RAW_ERRORCODE(vector, func)
+
+/**
+ * DEFINE_IDTENTRY_DF - Emit code for double fault
+ * @func: Function name of the entry point
+ *
+ * Maps to DEFINE_IDTENTRY_RAW_ERRORCODE
+ */
+#define DEFINE_IDTENTRY_DF(func) \
+ DEFINE_IDTENTRY_RAW_ERRORCODE(func)
+
+/**
+ * DEFINE_IDTENTRY_VC_KERNEL - Emit code for VMM communication handler
+ * when raised from kernel mode
+ * @func: Function name of the entry point
+ *
+ * Maps to DEFINE_IDTENTRY_RAW_ERRORCODE
+ */
+#define DEFINE_IDTENTRY_VC_KERNEL(func) \
+ DEFINE_IDTENTRY_RAW_ERRORCODE(kernel_##func)
+
+/**
+ * DEFINE_IDTENTRY_VC_USER - Emit code for VMM communication handler
+ * when raised from user mode
+ * @func: Function name of the entry point
+ *
+ * Maps to DEFINE_IDTENTRY_RAW_ERRORCODE
+ */
+#define DEFINE_IDTENTRY_VC_USER(func) \
+ DEFINE_IDTENTRY_RAW_ERRORCODE(user_##func)
+
+#else /* CONFIG_X86_64 */
+
+/**
+ * DECLARE_IDTENTRY_DF - Declare functions for double fault 32bit variant
+ * @vector: Vector number (ignored for C)
+ * @func: Function name of the entry point
+ *
+ * Declares two functions:
+ * - The ASM entry point: asm_##func
+ * - The C handler called from the C shim
+ */
+#define DECLARE_IDTENTRY_DF(vector, func) \
+ asmlinkage void asm_##func(void); \
+ __visible void func(struct pt_regs *regs, \
+ unsigned long error_code, \
+ unsigned long address)
+
+/**
+ * DEFINE_IDTENTRY_DF - Emit code for double fault on 32bit
+ * @func: Function name of the entry point
+ *
+ * This is called through the doublefault shim which already provides
+ * cr2 in the address argument.
+ */
+#define DEFINE_IDTENTRY_DF(func) \
+__visible noinstr void func(struct pt_regs *regs, \
+ unsigned long error_code, \
+ unsigned long address)
+
+#endif /* !CONFIG_X86_64 */
+
+/* C-Code mapping */
+#define DECLARE_IDTENTRY_NMI DECLARE_IDTENTRY_RAW
+#define DEFINE_IDTENTRY_NMI DEFINE_IDTENTRY_RAW
+#define DEFINE_FREDENTRY_NMI DEFINE_FREDENTRY_RAW
+
+#ifdef CONFIG_X86_64
+#define DECLARE_IDTENTRY_MCE DECLARE_IDTENTRY_IST
+#define DEFINE_IDTENTRY_MCE DEFINE_IDTENTRY_IST
+#define DEFINE_IDTENTRY_MCE_USER DEFINE_IDTENTRY_NOIST
+#define DEFINE_FREDENTRY_MCE DEFINE_FREDENTRY_RAW
+
+#define DECLARE_IDTENTRY_DEBUG DECLARE_IDTENTRY_IST
+#define DEFINE_IDTENTRY_DEBUG DEFINE_IDTENTRY_IST
+#define DEFINE_IDTENTRY_DEBUG_USER DEFINE_IDTENTRY_NOIST
+#define DEFINE_FREDENTRY_DEBUG DEFINE_FREDENTRY_RAW
+#endif
+
+void idt_install_sysvec(unsigned int n, const void *function);
+void fred_install_sysvec(unsigned int vector, const idtentry_t function);
+
+#define sysvec_install(vector, function) { \
+ if (IS_ENABLED(CONFIG_X86_FRED)) \
+ fred_install_sysvec(vector, function); \
+ if (!cpu_feature_enabled(X86_FEATURE_FRED)) \
+ idt_install_sysvec(vector, asm_##function); \
+}
+
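+/*
+ * Illustrative usage sketch (editorial addition, not from the original
+ * header); the handler name is hypothetical, the vector is real:
+ *
+ *   sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_example_callback);
+ *
+ * Callers pass the bare handler name; the macro selects the FRED dispatch
+ * table or the IDT stub (asm_##function) depending on config and CPU.
+ */
+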
+#else /* !__ASSEMBLER__ */
+
+/*
+ * The ASM variants for DECLARE_IDTENTRY*() which emit the ASM entry stubs.
+ */
+#define DECLARE_IDTENTRY(vector, func) \
+ idtentry vector asm_##func func has_error_code=0
+
+#define DECLARE_IDTENTRY_ERRORCODE(vector, func) \
+ idtentry vector asm_##func func has_error_code=1
+
+/* Special case for 32bit IRET 'trap'. Do not emit ASM code */
+#define DECLARE_IDTENTRY_SW(vector, func)
+
+#define DECLARE_IDTENTRY_RAW(vector, func) \
+ DECLARE_IDTENTRY(vector, func)
+
+#define DECLARE_IDTENTRY_RAW_ERRORCODE(vector, func) \
+ DECLARE_IDTENTRY_ERRORCODE(vector, func)
+
+/* Entries for common/spurious (device) interrupts */
+#define DECLARE_IDTENTRY_IRQ(vector, func) \
+ idtentry_irq vector func
+
+/* System vector entries */
+#define DECLARE_IDTENTRY_SYSVEC(vector, func) \
+ DECLARE_IDTENTRY(vector, func)
+
+#ifdef CONFIG_X86_64
+# define DECLARE_IDTENTRY_MCE(vector, func) \
+ idtentry_mce_db vector asm_##func func
+
+# define DECLARE_IDTENTRY_DEBUG(vector, func) \
+ idtentry_mce_db vector asm_##func func
+
+# define DECLARE_IDTENTRY_DF(vector, func) \
+ idtentry_df vector asm_##func func
+
+# define DECLARE_IDTENTRY_XENCB(vector, func) \
+ DECLARE_IDTENTRY(vector, func)
+
+# define DECLARE_IDTENTRY_VC(vector, func) \
+ idtentry_vc vector asm_##func func
+
+#else
+# define DECLARE_IDTENTRY_MCE(vector, func) \
+ DECLARE_IDTENTRY(vector, func)
+
+/* No ASM emitted for DF as this goes through a C shim */
+# define DECLARE_IDTENTRY_DF(vector, func)
+
+/* No ASM emitted for XEN hypervisor callback */
+# define DECLARE_IDTENTRY_XENCB(vector, func)
+
+#endif
+
+/* No ASM code emitted for NMI */
+#define DECLARE_IDTENTRY_NMI(vector, func)
+
+/*
+ * ASM code to emit the common vector entry stubs where each stub is
+ * packed into IDT_ALIGN bytes.
+ *
+ * Note, that the 'pushq imm8' is emitted via '.byte 0x6a, vector' because
+ * GCC treats the local vector variable as unsigned int and would expand
+ * all vectors above 0x7F to a 5 byte push. The original code did an
+ * adjustment of the vector number to be in the signed byte range to avoid
+ * this. While clever, it's mindbogglingly counterintuitive and requires an
+ * odd conversion back to the real vector number in the C entry points. Using
+ * .byte achieves the same thing and the only fixup needed in the C entry
+ * point is to mask off the bits above bit 7 because the push is sign
+ * extending.
+ */
+ .align IDT_ALIGN
+SYM_CODE_START(irq_entries_start)
+ vector=FIRST_EXTERNAL_VECTOR
+ .rept NR_EXTERNAL_VECTORS
+ UNWIND_HINT_IRET_REGS
+0 :
+ ENDBR
+ .byte 0x6a, vector
+ jmp asm_common_interrupt
+ /* Ensure that the above is IDT_ALIGN bytes max */
+ .fill 0b + IDT_ALIGN - ., 1, 0xcc
+ vector = vector+1
+ .endr
+SYM_CODE_END(irq_entries_start)
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ .align IDT_ALIGN
+SYM_CODE_START(spurious_entries_start)
+ vector=FIRST_SYSTEM_VECTOR
+ .rept NR_SYSTEM_VECTORS
+ UNWIND_HINT_IRET_REGS
+0 :
+ ENDBR
+ .byte 0x6a, vector
+ jmp asm_spurious_interrupt
+ /* Ensure that the above is IDT_ALIGN bytes max */
+ .fill 0b + IDT_ALIGN - ., 1, 0xcc
+ vector = vector+1
+ .endr
+SYM_CODE_END(spurious_entries_start)
+#endif
+
+#endif /* __ASSEMBLER__ */
+
+/*
+ * The actual entry points. Note that DECLARE_IDTENTRY*() serves two
+ * purposes:
+ * - provide the function declarations when included from C-Code
+ * - emit the ASM stubs when included from entry_32/64.S
+ *
+ * This avoids duplicate defines and ensures that everything is consistent.
+ */
+
+/*
+ * Dummy trap number so the low level ASM macro vector number checks do not
+ * match, which results in emitting plain IDTENTRY stubs without bells and
+ * whistles.
+ */
+#define X86_TRAP_OTHER 0xFFFF
+
+/* Simple exception entry points. No hardware error code */
+DECLARE_IDTENTRY(X86_TRAP_DE, exc_divide_error);
+DECLARE_IDTENTRY(X86_TRAP_OF, exc_overflow);
+DECLARE_IDTENTRY(X86_TRAP_BR, exc_bounds);
+DECLARE_IDTENTRY(X86_TRAP_NM, exc_device_not_available);
+DECLARE_IDTENTRY(X86_TRAP_OLD_MF, exc_coproc_segment_overrun);
+DECLARE_IDTENTRY(X86_TRAP_SPURIOUS, exc_spurious_interrupt_bug);
+DECLARE_IDTENTRY(X86_TRAP_MF, exc_coprocessor_error);
+DECLARE_IDTENTRY(X86_TRAP_XF, exc_simd_coprocessor_error);
+
+/* 32bit software IRET trap. Do not emit ASM code */
+DECLARE_IDTENTRY_SW(X86_TRAP_IRET, iret_error);
+
+/* Simple exception entries with error code pushed by hardware */
+DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_TS, exc_invalid_tss);
+DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_NP, exc_segment_not_present);
+DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_SS, exc_stack_segment);
+DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_GP, exc_general_protection);
+DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_AC, exc_alignment_check);
+
+/* Raw exception entries which need extra work */
+DECLARE_IDTENTRY_RAW(X86_TRAP_UD, exc_invalid_op);
+DECLARE_IDTENTRY_RAW(X86_TRAP_BP, exc_int3);
+DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_PF, exc_page_fault);
+
+#if defined(CONFIG_IA32_EMULATION)
+DECLARE_IDTENTRY_RAW(IA32_SYSCALL_VECTOR, int80_emulation);
+#endif
+
+#ifdef CONFIG_X86_MCE
+#ifdef CONFIG_X86_64
+DECLARE_IDTENTRY_MCE(X86_TRAP_MC, exc_machine_check);
+#else
+DECLARE_IDTENTRY_RAW(X86_TRAP_MC, exc_machine_check);
+#endif
+#ifdef CONFIG_XEN_PV
+DECLARE_IDTENTRY_RAW(X86_TRAP_MC, xenpv_exc_machine_check);
+#endif
+#endif
+
+/* NMI */
+
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+/*
+ * Special entry point for VMX which invokes this on the kernel stack, even for
+ * 64-bit, i.e. without using an IST. asm_exc_nmi() requires an IST to work
+ * correctly vs. the NMI 'executing' marker. Used for 32-bit kernels as well
+ * to avoid more ifdeffery.
+ */
+DECLARE_IDTENTRY(X86_TRAP_NMI, exc_nmi_kvm_vmx);
+#endif
+
+DECLARE_IDTENTRY_NMI(X86_TRAP_NMI, exc_nmi);
+#ifdef CONFIG_XEN_PV
+DECLARE_IDTENTRY_RAW(X86_TRAP_NMI, xenpv_exc_nmi);
+#endif
+
+/* #DB */
+#ifdef CONFIG_X86_64
+DECLARE_IDTENTRY_DEBUG(X86_TRAP_DB, exc_debug);
+#else
+DECLARE_IDTENTRY_RAW(X86_TRAP_DB, exc_debug);
+#endif
+#ifdef CONFIG_XEN_PV
+DECLARE_IDTENTRY_RAW(X86_TRAP_DB, xenpv_exc_debug);
+#endif
+
+/* #DF */
+DECLARE_IDTENTRY_DF(X86_TRAP_DF, exc_double_fault);
+#ifdef CONFIG_XEN_PV
+DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_DF, xenpv_exc_double_fault);
+#endif
+
+/* #CP */
+#ifdef CONFIG_X86_CET
+DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_CP, exc_control_protection);
+#endif
+
+/* #VC */
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+DECLARE_IDTENTRY_VC(X86_TRAP_VC, exc_vmm_communication);
+#endif
+
+#ifdef CONFIG_XEN_PV
+DECLARE_IDTENTRY_XENCB(X86_TRAP_OTHER, exc_xen_hypervisor_callback);
+DECLARE_IDTENTRY_RAW(X86_TRAP_OTHER, exc_xen_unknown_trap);
+#endif
+
+#ifdef CONFIG_INTEL_TDX_GUEST
+DECLARE_IDTENTRY(X86_TRAP_VE, exc_virtualization_exception);
+#endif
+
+/* Device interrupts common/spurious */
+DECLARE_IDTENTRY_IRQ(X86_TRAP_OTHER, common_interrupt);
+#ifdef CONFIG_X86_LOCAL_APIC
+DECLARE_IDTENTRY_IRQ(X86_TRAP_OTHER, spurious_interrupt);
+#endif
+
+/* System vector entry points */
+#ifdef CONFIG_X86_LOCAL_APIC
+DECLARE_IDTENTRY_SYSVEC(ERROR_APIC_VECTOR, sysvec_error_interrupt);
+DECLARE_IDTENTRY_SYSVEC(SPURIOUS_APIC_VECTOR, sysvec_spurious_apic_interrupt);
+DECLARE_IDTENTRY_SYSVEC(LOCAL_TIMER_VECTOR, sysvec_apic_timer_interrupt);
+DECLARE_IDTENTRY_SYSVEC(X86_PLATFORM_IPI_VECTOR, sysvec_x86_platform_ipi);
+#endif
+
+#ifdef CONFIG_SMP
+DECLARE_IDTENTRY(RESCHEDULE_VECTOR, sysvec_reschedule_ipi);
+DECLARE_IDTENTRY_SYSVEC(REBOOT_VECTOR, sysvec_reboot);
+DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_SINGLE_VECTOR, sysvec_call_function_single);
+DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_VECTOR, sysvec_call_function);
+#else
+# define fred_sysvec_reschedule_ipi NULL
+# define fred_sysvec_reboot NULL
+# define fred_sysvec_call_function_single NULL
+# define fred_sysvec_call_function NULL
+#endif
+
+#ifdef CONFIG_X86_LOCAL_APIC
+# ifdef CONFIG_X86_MCE_THRESHOLD
+DECLARE_IDTENTRY_SYSVEC(THRESHOLD_APIC_VECTOR, sysvec_threshold);
+# else
+# define fred_sysvec_threshold NULL
+# endif
+
+# ifdef CONFIG_X86_MCE_AMD
+DECLARE_IDTENTRY_SYSVEC(DEFERRED_ERROR_VECTOR, sysvec_deferred_error);
+# else
+# define fred_sysvec_deferred_error NULL
+# endif
+
+# ifdef CONFIG_X86_THERMAL_VECTOR
+DECLARE_IDTENTRY_SYSVEC(THERMAL_APIC_VECTOR, sysvec_thermal);
+# else
+# define fred_sysvec_thermal NULL
+# endif
+
+# ifdef CONFIG_IRQ_WORK
+DECLARE_IDTENTRY_SYSVEC(IRQ_WORK_VECTOR, sysvec_irq_work);
+# else
+# define fred_sysvec_irq_work NULL
+# endif
+#endif
+
+#if IS_ENABLED(CONFIG_KVM)
+DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_VECTOR, sysvec_kvm_posted_intr_ipi);
+DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_WAKEUP_VECTOR, sysvec_kvm_posted_intr_wakeup_ipi);
+DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_NESTED_VECTOR, sysvec_kvm_posted_intr_nested_ipi);
+#else
+# define fred_sysvec_kvm_posted_intr_ipi NULL
+# define fred_sysvec_kvm_posted_intr_wakeup_ipi NULL
+# define fred_sysvec_kvm_posted_intr_nested_ipi NULL
+#endif
+
+#ifdef CONFIG_X86_POSTED_MSI
+DECLARE_IDTENTRY_SYSVEC(POSTED_MSI_NOTIFICATION_VECTOR, sysvec_posted_msi_notification);
+#else
+# define fred_sysvec_posted_msi_notification NULL
+#endif
+
+#if IS_ENABLED(CONFIG_HYPERV)
+DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_hyperv_callback);
+DECLARE_IDTENTRY_SYSVEC(HYPERV_REENLIGHTENMENT_VECTOR, sysvec_hyperv_reenlightenment);
+DECLARE_IDTENTRY_SYSVEC(HYPERV_STIMER0_VECTOR, sysvec_hyperv_stimer0);
+#endif
+
+#if IS_ENABLED(CONFIG_ACRN_GUEST)
+DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_acrn_hv_callback);
+#endif
+
+#ifdef CONFIG_XEN_PVHVM
+DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_xen_hvm_callback);
+#endif
+
+#ifdef CONFIG_KVM_GUEST
+DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_kvm_asyncpf_interrupt);
+#endif
+
+#undef X86_TRAP_OTHER
+
+#endif
diff --git a/arch/x86/include/asm/imr.h b/arch/x86/include/asm/imr.h
new file mode 100644
index 000000000000..0d1dbf235679
--- /dev/null
+++ b/arch/x86/include/asm/imr.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * imr.h: Isolated Memory Region API
+ *
+ * Copyright(c) 2013 Intel Corporation.
+ * Copyright(c) 2015 Bryan O'Donoghue <pure.logic@nexus-software.ie>
+ */
+#ifndef _IMR_H
+#define _IMR_H
+
+#include <linux/types.h>
+
+/*
+ * IMR agent access mask bits
+ * See section 12.7.4.7 from quark-x1000-datasheet.pdf for register
+ * definitions.
+ */
+#define IMR_ESRAM_FLUSH BIT(31)
+#define IMR_CPU_SNOOP BIT(30) /* Applicable only to write */
+#define IMR_RMU BIT(29)
+#define IMR_VC1_SAI_ID3 BIT(15)
+#define IMR_VC1_SAI_ID2 BIT(14)
+#define IMR_VC1_SAI_ID1 BIT(13)
+#define IMR_VC1_SAI_ID0 BIT(12)
+#define IMR_VC0_SAI_ID3 BIT(11)
+#define IMR_VC0_SAI_ID2 BIT(10)
+#define IMR_VC0_SAI_ID1 BIT(9)
+#define IMR_VC0_SAI_ID0 BIT(8)
+#define IMR_CPU_0 BIT(1) /* SMM mode */
+#define IMR_CPU BIT(0) /* Non SMM mode */
+#define IMR_ACCESS_NONE 0
+
+/*
+ * Read/Write access-all bits here include some reserved bits.
+ * These are the values firmware uses and are accepted by hardware.
+ * The kernel defines read/write access-all in the same way as firmware
+ * in order to have a consistent and crisp definition across firmware,
+ * bootloader and kernel.
+ */
+#define IMR_READ_ACCESS_ALL 0xBFFFFFFF
+#define IMR_WRITE_ACCESS_ALL 0xFFFFFFFF
+
+/* Number of IMRs provided by Quark X1000 SoC */
+#define QUARK_X1000_IMR_MAX 0x08
+#define QUARK_X1000_IMR_REGBASE 0x40
+
+/* IMR alignment bits - only bits 31:10 are checked for IMR validity */
+#define IMR_ALIGN 0x400
+#define IMR_MASK (IMR_ALIGN - 1)
+
+int imr_add_range(phys_addr_t base, size_t size,
+ unsigned int rmask, unsigned int wmask);
+
+int imr_remove_range(phys_addr_t base, size_t size);
+
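+/*
+ * Illustrative usage sketch (editorial addition, not from the original
+ * header); base/size are hypothetical and should be IMR_ALIGN aligned:
+ *
+ *   ret = imr_add_range(base, size, IMR_CPU, IMR_CPU);
+ *   if (ret)
+ *       return ret;
+ *   ...
+ *   imr_remove_range(base, size);
+ */
+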
+#endif /* _IMR_H */
diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h
new file mode 100644
index 000000000000..1b3060a3425c
--- /dev/null
+++ b/arch/x86/include/asm/inat.h
@@ -0,0 +1,266 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _ASM_X86_INAT_H
+#define _ASM_X86_INAT_H
+/*
+ * x86 instruction attributes
+ *
+ * Written by Masami Hiramatsu <mhiramat@redhat.com>
+ */
+#include <asm/inat_types.h> /* __ignore_sync_check__ */
+
+/*
+ * Internal bits. Don't use bitmasks directly, because these bits are
+ * unstable. You should use checking functions.
+ */
+
+#define INAT_OPCODE_TABLE_SIZE 256
+#define INAT_GROUP_TABLE_SIZE 8
+
+/* Legacy last prefixes */
+#define INAT_PFX_OPNDSZ 1 /* 0x66 */ /* LPFX1 */
+#define INAT_PFX_REPE 2 /* 0xF3 */ /* LPFX2 */
+#define INAT_PFX_REPNE 3 /* 0xF2 */ /* LPFX3 */
+/* Other Legacy prefixes */
+#define INAT_PFX_LOCK 4 /* 0xF0 */
+#define INAT_PFX_CS 5 /* 0x2E */
+#define INAT_PFX_DS 6 /* 0x3E */
+#define INAT_PFX_ES 7 /* 0x26 */
+#define INAT_PFX_FS 8 /* 0x64 */
+#define INAT_PFX_GS 9 /* 0x65 */
+#define INAT_PFX_SS 10 /* 0x36 */
+#define INAT_PFX_ADDRSZ 11 /* 0x67 */
+/* x86-64 REX prefix */
+#define INAT_PFX_REX 12 /* 0x4X */
+/* AVX VEX prefixes */
+#define INAT_PFX_VEX2 13 /* 2-bytes VEX prefix */
+#define INAT_PFX_VEX3 14 /* 3-bytes VEX prefix */
+#define INAT_PFX_EVEX 15 /* EVEX prefix */
+/* x86-64 REX2 prefix */
+#define INAT_PFX_REX2 16 /* 0xD5 */
+/* AMD XOP prefix */
+#define INAT_PFX_XOP 17 /* 0x8F */
+
+#define INAT_LSTPFX_MAX 3
+#define INAT_LGCPFX_MAX 11
+
+/* Immediate size */
+#define INAT_IMM_BYTE 1
+#define INAT_IMM_WORD 2
+#define INAT_IMM_DWORD 3
+#define INAT_IMM_QWORD 4
+#define INAT_IMM_PTR 5
+#define INAT_IMM_VWORD32 6
+#define INAT_IMM_VWORD 7
+
+/* Legacy prefix */
+#define INAT_PFX_OFFS 0
+#define INAT_PFX_BITS 5
+#define INAT_PFX_MAX ((1 << INAT_PFX_BITS) - 1)
+#define INAT_PFX_MASK (INAT_PFX_MAX << INAT_PFX_OFFS)
+/* Escape opcodes */
+#define INAT_ESC_OFFS (INAT_PFX_OFFS + INAT_PFX_BITS)
+#define INAT_ESC_BITS 2
+#define INAT_ESC_MAX ((1 << INAT_ESC_BITS) - 1)
+#define INAT_ESC_MASK (INAT_ESC_MAX << INAT_ESC_OFFS)
+/* Group opcodes (1-16) */
+#define INAT_GRP_OFFS (INAT_ESC_OFFS + INAT_ESC_BITS)
+#define INAT_GRP_BITS 5
+#define INAT_GRP_MAX ((1 << INAT_GRP_BITS) - 1)
+#define INAT_GRP_MASK (INAT_GRP_MAX << INAT_GRP_OFFS)
+/* Immediates */
+#define INAT_IMM_OFFS (INAT_GRP_OFFS + INAT_GRP_BITS)
+#define INAT_IMM_BITS 3
+#define INAT_IMM_MASK (((1 << INAT_IMM_BITS) - 1) << INAT_IMM_OFFS)
+/* Flags */
+#define INAT_FLAG_OFFS (INAT_IMM_OFFS + INAT_IMM_BITS)
+#define INAT_MODRM (1 << (INAT_FLAG_OFFS))
+#define INAT_FORCE64 (1 << (INAT_FLAG_OFFS + 1))
+#define INAT_SCNDIMM (1 << (INAT_FLAG_OFFS + 2))
+#define INAT_MOFFSET (1 << (INAT_FLAG_OFFS + 3))
+#define INAT_VARIANT (1 << (INAT_FLAG_OFFS + 4))
+#define INAT_VEXOK (1 << (INAT_FLAG_OFFS + 5))
+#define INAT_XOPOK INAT_VEXOK
+#define INAT_VEXONLY (1 << (INAT_FLAG_OFFS + 6))
+#define INAT_EVEXONLY (1 << (INAT_FLAG_OFFS + 7))
+#define INAT_NO_REX2 (1 << (INAT_FLAG_OFFS + 8))
+#define INAT_REX2_VARIANT (1 << (INAT_FLAG_OFFS + 9))
+#define INAT_EVEX_SCALABLE (1 << (INAT_FLAG_OFFS + 10))
+#define INAT_INV64 (1 << (INAT_FLAG_OFFS + 11))
+/* Attribute making macros for attribute tables */
+#define INAT_MAKE_PREFIX(pfx) (pfx << INAT_PFX_OFFS)
+#define INAT_MAKE_ESCAPE(esc) (esc << INAT_ESC_OFFS)
+#define INAT_MAKE_GROUP(grp) ((grp << INAT_GRP_OFFS) | INAT_MODRM)
+#define INAT_MAKE_IMM(imm) (imm << INAT_IMM_OFFS)
+
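+/*
+ * Illustrative sketch (editorial addition, not from the original header):
+ * generated attribute table entries compose the fields above, e.g.
+ *
+ *   INAT_MAKE_PREFIX(INAT_PFX_OPNDSZ)                   // 0x66 legacy prefix
+ *   INAT_MAKE_GROUP(1) | INAT_MAKE_IMM(INAT_IMM_BYTE)   // hypothetical entry
+ *
+ * and the inat_*() checking helpers below decode them back into prefix ids,
+ * group numbers and immediate sizes.
+ */
+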
+/* Identifiers for segment registers */
+#define INAT_SEG_REG_IGNORE 0
+#define INAT_SEG_REG_DEFAULT 1
+#define INAT_SEG_REG_CS 2
+#define INAT_SEG_REG_SS 3
+#define INAT_SEG_REG_DS 4
+#define INAT_SEG_REG_ES 5
+#define INAT_SEG_REG_FS 6
+#define INAT_SEG_REG_GS 7
+
+/* Attribute search APIs */
+extern insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode);
+extern int inat_get_last_prefix_id(insn_byte_t last_pfx);
+extern insn_attr_t inat_get_escape_attribute(insn_byte_t opcode,
+ int lpfx_id,
+ insn_attr_t esc_attr);
+extern insn_attr_t inat_get_group_attribute(insn_byte_t modrm,
+ int lpfx_id,
+ insn_attr_t esc_attr);
+extern insn_attr_t inat_get_avx_attribute(insn_byte_t opcode,
+ insn_byte_t vex_m,
+ insn_byte_t vex_pp);
+extern insn_attr_t inat_get_xop_attribute(insn_byte_t opcode,
+ insn_byte_t map_select);
+
+/* Attribute checking functions */
+static inline int inat_is_legacy_prefix(insn_attr_t attr)
+{
+ attr &= INAT_PFX_MASK;
+ return attr && attr <= INAT_LGCPFX_MAX;
+}
+
+static inline int inat_is_address_size_prefix(insn_attr_t attr)
+{
+ return (attr & INAT_PFX_MASK) == INAT_PFX_ADDRSZ;
+}
+
+static inline int inat_is_operand_size_prefix(insn_attr_t attr)
+{
+ return (attr & INAT_PFX_MASK) == INAT_PFX_OPNDSZ;
+}
+
+static inline int inat_is_rex_prefix(insn_attr_t attr)
+{
+ return (attr & INAT_PFX_MASK) == INAT_PFX_REX;
+}
+
+static inline int inat_is_rex2_prefix(insn_attr_t attr)
+{
+ return (attr & INAT_PFX_MASK) == INAT_PFX_REX2;
+}
+
+static inline int inat_last_prefix_id(insn_attr_t attr)
+{
+ if ((attr & INAT_PFX_MASK) > INAT_LSTPFX_MAX)
+ return 0;
+ else
+ return attr & INAT_PFX_MASK;
+}
+
+static inline int inat_is_vex_prefix(insn_attr_t attr)
+{
+ attr &= INAT_PFX_MASK;
+ return attr == INAT_PFX_VEX2 || attr == INAT_PFX_VEX3 ||
+ attr == INAT_PFX_EVEX;
+}
+
+static inline int inat_is_evex_prefix(insn_attr_t attr)
+{
+ return (attr & INAT_PFX_MASK) == INAT_PFX_EVEX;
+}
+
+static inline int inat_is_vex3_prefix(insn_attr_t attr)
+{
+ return (attr & INAT_PFX_MASK) == INAT_PFX_VEX3;
+}
+
+static inline int inat_is_xop_prefix(insn_attr_t attr)
+{
+ return (attr & INAT_PFX_MASK) == INAT_PFX_XOP;
+}
+
+static inline int inat_is_escape(insn_attr_t attr)
+{
+ return attr & INAT_ESC_MASK;
+}
+
+static inline int inat_escape_id(insn_attr_t attr)
+{
+ return (attr & INAT_ESC_MASK) >> INAT_ESC_OFFS;
+}
+
+static inline int inat_is_group(insn_attr_t attr)
+{
+ return attr & INAT_GRP_MASK;
+}
+
+static inline int inat_group_id(insn_attr_t attr)
+{
+ return (attr & INAT_GRP_MASK) >> INAT_GRP_OFFS;
+}
+
+static inline int inat_group_common_attribute(insn_attr_t attr)
+{
+ return attr & ~INAT_GRP_MASK;
+}
+
+static inline int inat_has_immediate(insn_attr_t attr)
+{
+ return attr & INAT_IMM_MASK;
+}
+
+static inline int inat_immediate_size(insn_attr_t attr)
+{
+ return (attr & INAT_IMM_MASK) >> INAT_IMM_OFFS;
+}
+
+static inline int inat_has_modrm(insn_attr_t attr)
+{
+ return attr & INAT_MODRM;
+}
+
+static inline int inat_is_force64(insn_attr_t attr)
+{
+ return attr & INAT_FORCE64;
+}
+
+static inline int inat_has_second_immediate(insn_attr_t attr)
+{
+ return attr & INAT_SCNDIMM;
+}
+
+static inline int inat_has_moffset(insn_attr_t attr)
+{
+ return attr & INAT_MOFFSET;
+}
+
+static inline int inat_has_variant(insn_attr_t attr)
+{
+ return attr & INAT_VARIANT;
+}
+
+static inline int inat_accept_vex(insn_attr_t attr)
+{
+ return attr & INAT_VEXOK;
+}
+
+static inline int inat_accept_xop(insn_attr_t attr)
+{
+ return attr & INAT_XOPOK;
+}
+
+static inline int inat_must_vex(insn_attr_t attr)
+{
+ return attr & (INAT_VEXONLY | INAT_EVEXONLY);
+}
+
+static inline int inat_must_evex(insn_attr_t attr)
+{
+ return attr & INAT_EVEXONLY;
+}
+
+static inline int inat_evex_scalable(insn_attr_t attr)
+{
+ return attr & INAT_EVEX_SCALABLE;
+}
+
+static inline int inat_is_invalid64(insn_attr_t attr)
+{
+ return attr & INAT_INV64;
+}
+#endif
diff --git a/arch/x86/include/asm/inat_types.h b/arch/x86/include/asm/inat_types.h
new file mode 100644
index 000000000000..b047efa9ddc2
--- /dev/null
+++ b/arch/x86/include/asm/inat_types.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _ASM_X86_INAT_TYPES_H
+#define _ASM_X86_INAT_TYPES_H
+/*
+ * x86 instruction attributes
+ *
+ * Written by Masami Hiramatsu <mhiramat@redhat.com>
+ */
+
+/* Instruction attributes */
+typedef unsigned int insn_attr_t;
+typedef unsigned char insn_byte_t;
+typedef signed int insn_value_t;
+
+#endif
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index 36fb1a6a5109..01ccdd168df0 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -1,18 +1,20 @@
-#ifndef _ASM_X86_INIT_32_H
-#define _ASM_X86_INIT_32_H
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_INIT_H
+#define _ASM_X86_INIT_H
-#ifdef CONFIG_X86_32
-extern void __init early_ioremap_page_table_range_init(void);
-#endif
+struct x86_mapping_info {
+ void *(*alloc_pgt_page)(void *); /* allocate buf for page table */
+ void (*free_pgt_page)(void *, void *); /* free buf for page table */
+ void *context; /* context for alloc_pgt_page */
+ unsigned long page_flag; /* page flag for PMD or PUD entry */
+ unsigned long offset; /* ident mapping offset */
+ bool direct_gbpages; /* PUD level 1GB page support */
+ unsigned long kernpg_flag; /* kernel pagetable flag override */
+};
-extern unsigned long __init
-kernel_physical_mapping_init(unsigned long start,
- unsigned long end,
- unsigned long page_size_mask);
+int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
+ unsigned long pstart, unsigned long pend);
+void kernel_ident_mapping_free(struct x86_mapping_info *info, pgd_t *pgd);
-extern unsigned long __initdata e820_table_start;
-extern unsigned long __meminitdata e820_table_end;
-extern unsigned long __meminitdata e820_table_top;
-
-#endif /* _ASM_X86_INIT_32_H */
+#endif /* _ASM_X86_INIT_H */
diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h
new file mode 100644
index 000000000000..4733e9064ee5
--- /dev/null
+++ b/arch/x86/include/asm/insn-eval.h
@@ -0,0 +1,49 @@
+#ifndef _ASM_X86_INSN_EVAL_H
+#define _ASM_X86_INSN_EVAL_H
+/*
+ * A collection of utility functions for x86 instruction analysis to be
+ * used in a kernel context. Useful when, for instance, making sense
+ * of the registers indicated by operands.
+ */
+
+#include <linux/compiler.h>
+#include <linux/bug.h>
+#include <linux/err.h>
+#include <asm/ptrace.h>
+
+#define INSN_CODE_SEG_ADDR_SZ(params) ((params >> 4) & 0xf)
+#define INSN_CODE_SEG_OPND_SZ(params) (params & 0xf)
+#define INSN_CODE_SEG_PARAMS(oper_sz, addr_sz) (oper_sz | (addr_sz << 4))
+
+int pt_regs_offset(struct pt_regs *regs, int regno);
+
+bool insn_has_rep_prefix(struct insn *insn);
+void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs);
+int insn_get_modrm_rm_off(struct insn *insn, struct pt_regs *regs);
+int insn_get_modrm_reg_off(struct insn *insn, struct pt_regs *regs);
+unsigned long *insn_get_modrm_reg_ptr(struct insn *insn, struct pt_regs *regs);
+unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx);
+int insn_get_code_seg_params(struct pt_regs *regs);
+int insn_get_effective_ip(struct pt_regs *regs, unsigned long *ip);
+int insn_fetch_from_user(struct pt_regs *regs,
+ unsigned char buf[MAX_INSN_SIZE]);
+int insn_fetch_from_user_inatomic(struct pt_regs *regs,
+ unsigned char buf[MAX_INSN_SIZE]);
+bool insn_decode_from_regs(struct insn *insn, struct pt_regs *regs,
+ unsigned char buf[MAX_INSN_SIZE], int buf_size);
+
+enum insn_mmio_type {
+ INSN_MMIO_DECODE_FAILED,
+ INSN_MMIO_WRITE,
+ INSN_MMIO_WRITE_IMM,
+ INSN_MMIO_READ,
+ INSN_MMIO_READ_ZERO_EXTEND,
+ INSN_MMIO_READ_SIGN_EXTEND,
+ INSN_MMIO_MOVS,
+};
+
+enum insn_mmio_type insn_decode_mmio(struct insn *insn, int *bytes);
+
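+/*
+ * Illustrative usage sketch (editorial addition, not from the original
+ * header); error handling is elided:
+ *
+ *   int bytes;
+ *
+ *   switch (insn_decode_mmio(insn, &bytes)) {
+ *   case INSN_MMIO_WRITE:
+ *       // emulate a 'bytes'-wide write from the source register
+ *       break;
+ *   case INSN_MMIO_READ:
+ *       // emulate a 'bytes'-wide read into the destination register
+ *       break;
+ *   default:
+ *       break;
+ *   }
+ */
+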
+bool insn_is_nop(struct insn *insn);
+
+#endif /* _ASM_X86_INSN_EVAL_H */
diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
new file mode 100644
index 000000000000..846d21c1a7f8
--- /dev/null
+++ b/arch/x86/include/asm/insn.h
@@ -0,0 +1,344 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _ASM_X86_INSN_H
+#define _ASM_X86_INSN_H
+/*
+ * x86 instruction analysis
+ *
+ * Copyright (C) IBM Corporation, 2009
+ */
+
+#include <asm/byteorder.h>
+/* insn_attr_t is defined in inat.h */
+#include <asm/inat.h> /* __ignore_sync_check__ */
+
+#if defined(__BYTE_ORDER) ? __BYTE_ORDER == __LITTLE_ENDIAN : defined(__LITTLE_ENDIAN)
+
+struct insn_field {
+ union {
+ insn_value_t value;
+ insn_byte_t bytes[4];
+ };
+ /* !0 if we've run insn_get_xxx() for this field */
+ unsigned char got;
+ unsigned char nbytes;
+};
+
+static inline void insn_field_set(struct insn_field *p, insn_value_t v,
+ unsigned char n)
+{
+ p->value = v;
+ p->nbytes = n;
+}
+
+static inline void insn_set_byte(struct insn_field *p, unsigned char n,
+ insn_byte_t v)
+{
+ p->bytes[n] = v;
+}
+
+#else
+
+struct insn_field {
+ insn_value_t value;
+ union {
+ insn_value_t little;
+ insn_byte_t bytes[4];
+ };
+ /* !0 if we've run insn_get_xxx() for this field */
+ unsigned char got;
+ unsigned char nbytes;
+};
+
+static inline void insn_field_set(struct insn_field *p, insn_value_t v,
+ unsigned char n)
+{
+ p->value = v;
+ p->little = __cpu_to_le32(v);
+ p->nbytes = n;
+}
+
+static inline void insn_set_byte(struct insn_field *p, unsigned char n,
+ insn_byte_t v)
+{
+ p->bytes[n] = v;
+ p->value = __le32_to_cpu(p->little);
+}
+#endif
+
+struct insn {
+ struct insn_field prefixes; /*
+ * Prefixes
+ * prefixes.bytes[3]: last prefix
+ */
+ struct insn_field rex_prefix; /* REX prefix */
+ union {
+ struct insn_field vex_prefix; /* VEX prefix */
+ struct insn_field xop_prefix; /* XOP prefix */
+ };
+ struct insn_field opcode; /*
+ * opcode.bytes[0]: opcode1
+ * opcode.bytes[1]: opcode2
+ * opcode.bytes[2]: opcode3
+ */
+ struct insn_field modrm;
+ struct insn_field sib;
+ struct insn_field displacement;
+ union {
+ struct insn_field immediate;
+ struct insn_field moffset1; /* for 64bit MOV */
+ struct insn_field immediate1; /* for 64bit imm or off16/32 */
+ };
+ union {
+ struct insn_field moffset2; /* for 64bit MOV */
+ struct insn_field immediate2; /* for 64bit imm or seg16 */
+ };
+
+ int emulate_prefix_size;
+ insn_attr_t attr;
+ unsigned char opnd_bytes;
+ unsigned char addr_bytes;
+ unsigned char length;
+ unsigned char x86_64;
+
+ const insn_byte_t *kaddr; /* kernel address of insn to analyze */
+ const insn_byte_t *end_kaddr; /* kernel address of last insn in buffer */
+ const insn_byte_t *next_byte;
+};
+
+#define MAX_INSN_SIZE 15
+
+#define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6)
+#define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3)
+#define X86_MODRM_RM(modrm) ((modrm) & 0x07)
+
+#define X86_SIB_SCALE(sib) (((sib) & 0xc0) >> 6)
+#define X86_SIB_INDEX(sib) (((sib) & 0x38) >> 3)
+#define X86_SIB_BASE(sib) ((sib) & 0x07)
+
+#define X86_REX2_M(rex) ((rex) & 0x80) /* REX2 M0 */
+#define X86_REX2_R(rex) ((rex) & 0x40) /* REX2 R4 */
+#define X86_REX2_X(rex) ((rex) & 0x20) /* REX2 X4 */
+#define X86_REX2_B(rex) ((rex) & 0x10) /* REX2 B4 */
+
+#define X86_REX_W(rex) ((rex) & 8) /* REX or REX2 W */
+#define X86_REX_R(rex) ((rex) & 4) /* REX or REX2 R3 */
+#define X86_REX_X(rex) ((rex) & 2) /* REX or REX2 X3 */
+#define X86_REX_B(rex) ((rex) & 1) /* REX or REX2 B3 */
+
+/* VEX bit flags */
+#define X86_VEX_W(vex) ((vex) & 0x80) /* VEX3 Byte2 */
+#define X86_VEX_R(vex) ((vex) & 0x80) /* VEX2/3 Byte1 */
+#define X86_VEX_X(vex) ((vex) & 0x40) /* VEX3 Byte1 */
+#define X86_VEX_B(vex) ((vex) & 0x20) /* VEX3 Byte1 */
+#define X86_VEX_L(vex) ((vex) & 0x04) /* VEX3 Byte2, VEX2 Byte1 */
+/* VEX bit fields */
+#define X86_EVEX_M(vex) ((vex) & 0x07) /* EVEX Byte1 */
+#define X86_VEX3_M(vex) ((vex) & 0x1f) /* VEX3 Byte1 */
+#define X86_VEX2_M 1 /* VEX2.M always 1 */
+#define X86_VEX_V(vex) (((vex) & 0x78) >> 3) /* VEX3 Byte2, VEX2 Byte1 */
+#define X86_VEX_P(vex) ((vex) & 0x03) /* VEX3 Byte2, VEX2 Byte1 */
+#define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */
+/* XOP bit fields */
+#define X86_XOP_R(xop) ((xop) & 0x80) /* XOP Byte2 */
+#define X86_XOP_X(xop) ((xop) & 0x40) /* XOP Byte2 */
+#define X86_XOP_B(xop) ((xop) & 0x20) /* XOP Byte2 */
+#define X86_XOP_M(xop) ((xop) & 0x1f) /* XOP Byte2 */
+#define X86_XOP_W(xop) ((xop) & 0x80) /* XOP Byte3 */
+#define X86_XOP_V(xop) ((xop) & 0x78) /* XOP Byte3 */
+#define X86_XOP_L(xop) ((xop) & 0x04) /* XOP Byte3 */
+#define X86_XOP_P(xop) ((xop) & 0x03) /* XOP Byte3 */
+#define X86_XOP_M_MIN 0x08 /* Min of XOP.M */
+#define X86_XOP_M_MAX 0x1f /* Max of XOP.M */
+
+extern void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64);
+extern int insn_get_prefixes(struct insn *insn);
+extern int insn_get_opcode(struct insn *insn);
+extern int insn_get_modrm(struct insn *insn);
+extern int insn_get_sib(struct insn *insn);
+extern int insn_get_displacement(struct insn *insn);
+extern int insn_get_immediate(struct insn *insn);
+extern int insn_get_length(struct insn *insn);
+
+enum insn_mode {
+ INSN_MODE_32,
+ INSN_MODE_64,
+ /* Mode is determined by the current kernel build. */
+ INSN_MODE_KERN,
+ INSN_NUM_MODES,
+};
+
+extern int insn_decode(struct insn *insn, const void *kaddr, int buf_len, enum insn_mode m);
+
+#define insn_decode_kernel(_insn, _ptr) insn_decode((_insn), (_ptr), MAX_INSN_SIZE, INSN_MODE_KERN)
+
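+/*
+ * Illustrative usage sketch (editorial addition, not from the original
+ * header); 'addr' is a hypothetical kernel pointer:
+ *
+ *   struct insn insn;
+ *
+ *   if (insn_decode_kernel(&insn, addr))
+ *       return -EINVAL;
+ *   // insn.length, insn.opcode, insn.modrm etc. are now populated
+ */
+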
+/* Attribute will be determined after getting ModRM (for opcode groups) */
+static inline void insn_get_attribute(struct insn *insn)
+{
+ insn_get_modrm(insn);
+}
+
+/* Instruction uses RIP-relative addressing */
+extern int insn_rip_relative(struct insn *insn);
+
+static inline int insn_is_rex2(struct insn *insn)
+{
+ if (!insn->prefixes.got)
+ insn_get_prefixes(insn);
+ return insn->rex_prefix.nbytes == 2;
+}
+
+static inline insn_byte_t insn_rex2_m_bit(struct insn *insn)
+{
+ return X86_REX2_M(insn->rex_prefix.bytes[1]);
+}
+
+static inline int insn_is_avx_or_xop(struct insn *insn)
+{
+ if (!insn->prefixes.got)
+ insn_get_prefixes(insn);
+ return (insn->vex_prefix.value != 0);
+}
+
+static inline int insn_is_evex(struct insn *insn)
+{
+ if (!insn->prefixes.got)
+ insn_get_prefixes(insn);
+ return (insn->vex_prefix.nbytes == 4);
+}
+
+/* If we already know this is AVX/XOP encoded */
+static inline int avx_insn_is_xop(struct insn *insn)
+{
+ insn_attr_t attr = inat_get_opcode_attribute(insn->vex_prefix.bytes[0]);
+
+ return inat_is_xop_prefix(attr);
+}
+
+static inline int insn_is_xop(struct insn *insn)
+{
+ if (!insn_is_avx_or_xop(insn))
+ return 0;
+
+ return avx_insn_is_xop(insn);
+}
+
+static inline int insn_has_emulate_prefix(struct insn *insn)
+{
+ return !!insn->emulate_prefix_size;
+}
+
+static inline insn_byte_t insn_vex_m_bits(struct insn *insn)
+{
+ if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */
+ return X86_VEX2_M;
+ else if (insn->vex_prefix.nbytes == 3) /* 3 bytes VEX */
+ return X86_VEX3_M(insn->vex_prefix.bytes[1]);
+ else /* EVEX */
+ return X86_EVEX_M(insn->vex_prefix.bytes[1]);
+}
+
+static inline insn_byte_t insn_vex_p_bits(struct insn *insn)
+{
+ if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */
+ return X86_VEX_P(insn->vex_prefix.bytes[1]);
+ else
+ return X86_VEX_P(insn->vex_prefix.bytes[2]);
+}
+
+static inline insn_byte_t insn_vex_w_bit(struct insn *insn)
+{
+ if (insn->vex_prefix.nbytes < 3)
+ return 0;
+ return X86_VEX_W(insn->vex_prefix.bytes[2]);
+}
+
+static inline insn_byte_t insn_xop_map_bits(struct insn *insn)
+{
+ if (insn->xop_prefix.nbytes < 3) /* XOP is 3 bytes */
+ return 0;
+ return X86_XOP_M(insn->xop_prefix.bytes[1]);
+}
+
+static inline insn_byte_t insn_xop_p_bits(struct insn *insn)
+{
+ return X86_XOP_P(insn->vex_prefix.bytes[2]);
+}
+
+/* Get the last prefix id from last prefix or VEX prefix */
+static inline int insn_last_prefix_id(struct insn *insn)
+{
+ if (insn_is_avx_or_xop(insn)) {
+ if (avx_insn_is_xop(insn))
+ return insn_xop_p_bits(insn);
+ return insn_vex_p_bits(insn); /* VEX_p is a SIMD prefix id */
+ }
+
+ if (insn->prefixes.bytes[3])
+ return inat_get_last_prefix_id(insn->prefixes.bytes[3]);
+
+ return 0;
+}
+
+/* Offset of each field from kaddr */
+static inline int insn_offset_rex_prefix(struct insn *insn)
+{
+ return insn->prefixes.nbytes;
+}
+static inline int insn_offset_vex_prefix(struct insn *insn)
+{
+ return insn_offset_rex_prefix(insn) + insn->rex_prefix.nbytes;
+}
+static inline int insn_offset_opcode(struct insn *insn)
+{
+ return insn_offset_vex_prefix(insn) + insn->vex_prefix.nbytes;
+}
+static inline int insn_offset_modrm(struct insn *insn)
+{
+ return insn_offset_opcode(insn) + insn->opcode.nbytes;
+}
+static inline int insn_offset_sib(struct insn *insn)
+{
+ return insn_offset_modrm(insn) + insn->modrm.nbytes;
+}
+static inline int insn_offset_displacement(struct insn *insn)
+{
+ return insn_offset_sib(insn) + insn->sib.nbytes;
+}
+static inline int insn_offset_immediate(struct insn *insn)
+{
+ return insn_offset_displacement(insn) + insn->displacement.nbytes;
+}
+
+/**
+ * for_each_insn_prefix() -- Iterate prefixes in the instruction
+ * @insn: Pointer to struct insn.
+ * @prefix: Prefix byte.
+ *
+ * Iterate prefix bytes of given @insn. Each prefix byte is stored in @prefix
+ * on every iteration; the loop cursor is an index internal to the macro.
+ * Since prefixes.nbytes can be bigger than 4 if some prefixes
+ * are repeated, it cannot be used for looping over the prefixes.
+ */
+#define for_each_insn_prefix(insn, prefix) \
+ for (int idx = 0; idx < ARRAY_SIZE(insn->prefixes.bytes) && (prefix = insn->prefixes.bytes[idx]) != 0; idx++)
+
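+/*
+ * Illustrative usage sketch (editorial addition, not from the original
+ * header):
+ *
+ *   insn_byte_t prefix;
+ *   bool lock_seen = false;
+ *
+ *   for_each_insn_prefix(insn, prefix) {
+ *       if (prefix == 0xf0)
+ *           lock_seen = true;    // instruction carries a LOCK prefix
+ *   }
+ */
+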
+#define POP_SS_OPCODE 0x1f
+#define MOV_SREG_OPCODE 0x8e
+
+/*
+ * Intel SDM Vol.3A 6.8.3 states:
+ * "Any single-step trap that would be delivered following the MOV to SS
+ * instruction or POP to SS instruction (because EFLAGS.TF is 1) is
+ * suppressed."
+ * This function returns true if @insn is MOV SS or POP SS. On these
+ * instructions, single stepping is suppressed.
+ */
+static inline int insn_masking_exception(struct insn *insn)
+{
+ return insn->opcode.bytes[0] == POP_SS_OPCODE ||
+ (insn->opcode.bytes[0] == MOV_SREG_OPCODE &&
+ X86_MODRM_REG(insn->modrm.bytes[0]) == 2);
+}
+
+#endif /* _ASM_X86_INSN_H */
diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h
new file mode 100644
index 000000000000..e48a00b3311d
--- /dev/null
+++ b/arch/x86/include/asm/inst.h
@@ -0,0 +1,148 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Generate .byte code for some instructions not supported by old
+ * binutils.
+ */
+#ifndef X86_ASM_INST_H
+#define X86_ASM_INST_H
+
+#ifdef __ASSEMBLER__
+
+#define REG_NUM_INVALID 100
+
+#define REG_TYPE_R32 0
+#define REG_TYPE_R64 1
+#define REG_TYPE_INVALID 100
+
+ .macro R32_NUM opd r32
+ \opd = REG_NUM_INVALID
+ .ifc \r32,%eax
+ \opd = 0
+ .endif
+ .ifc \r32,%ecx
+ \opd = 1
+ .endif
+ .ifc \r32,%edx
+ \opd = 2
+ .endif
+ .ifc \r32,%ebx
+ \opd = 3
+ .endif
+ .ifc \r32,%esp
+ \opd = 4
+ .endif
+ .ifc \r32,%ebp
+ \opd = 5
+ .endif
+ .ifc \r32,%esi
+ \opd = 6
+ .endif
+ .ifc \r32,%edi
+ \opd = 7
+ .endif
+#ifdef CONFIG_X86_64
+ .ifc \r32,%r8d
+ \opd = 8
+ .endif
+ .ifc \r32,%r9d
+ \opd = 9
+ .endif
+ .ifc \r32,%r10d
+ \opd = 10
+ .endif
+ .ifc \r32,%r11d
+ \opd = 11
+ .endif
+ .ifc \r32,%r12d
+ \opd = 12
+ .endif
+ .ifc \r32,%r13d
+ \opd = 13
+ .endif
+ .ifc \r32,%r14d
+ \opd = 14
+ .endif
+ .ifc \r32,%r15d
+ \opd = 15
+ .endif
+#endif
+ .endm
+
+ .macro R64_NUM opd r64
+ \opd = REG_NUM_INVALID
+#ifdef CONFIG_X86_64
+ .ifc \r64,%rax
+ \opd = 0
+ .endif
+ .ifc \r64,%rcx
+ \opd = 1
+ .endif
+ .ifc \r64,%rdx
+ \opd = 2
+ .endif
+ .ifc \r64,%rbx
+ \opd = 3
+ .endif
+ .ifc \r64,%rsp
+ \opd = 4
+ .endif
+ .ifc \r64,%rbp
+ \opd = 5
+ .endif
+ .ifc \r64,%rsi
+ \opd = 6
+ .endif
+ .ifc \r64,%rdi
+ \opd = 7
+ .endif
+ .ifc \r64,%r8
+ \opd = 8
+ .endif
+ .ifc \r64,%r9
+ \opd = 9
+ .endif
+ .ifc \r64,%r10
+ \opd = 10
+ .endif
+ .ifc \r64,%r11
+ \opd = 11
+ .endif
+ .ifc \r64,%r12
+ \opd = 12
+ .endif
+ .ifc \r64,%r13
+ \opd = 13
+ .endif
+ .ifc \r64,%r14
+ \opd = 14
+ .endif
+ .ifc \r64,%r15
+ \opd = 15
+ .endif
+#endif
+ .endm
+
+ .macro REG_TYPE type reg
+ R32_NUM reg_type_r32 \reg
+ R64_NUM reg_type_r64 \reg
+ .if reg_type_r64 <> REG_NUM_INVALID
+ \type = REG_TYPE_R64
+ .elseif reg_type_r32 <> REG_NUM_INVALID
+ \type = REG_TYPE_R32
+ .else
+ \type = REG_TYPE_INVALID
+ .endif
+ .endm
+
+ .macro PFX_REX opd1 opd2 W=0
+ .if ((\opd1 | \opd2) & 8) || \W
+ .byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1) | (\W << 3)
+ .endif
+ .endm
+
+ .macro MODRM mod opd1 opd2
+ .byte \mod | (\opd1 & 7) | ((\opd2 & 7) << 3)
+ .endm
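+
+/*
+ * Illustrative sketch (editorial addition, not from the original header):
+ * a wrapper macro resolves register names to numbers and emits raw bytes.
+ * The macro name and opcode bytes below are placeholders, not a real
+ * encoding.
+ *
+ *   .macro EXAMPLE_INSN opd1 opd2
+ *   R64_NUM example_opd1 \opd1
+ *   R64_NUM example_opd2 \opd2
+ *   PFX_REX example_opd1 example_opd2 1
+ *   .byte 0x0f, 0x00                     // placeholder opcode bytes
+ *   MODRM 0xc0 example_opd1 example_opd2
+ *   .endm
+ */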
+#endif
+
+#endif
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
new file mode 100644
index 000000000000..950bfd006905
--- /dev/null
+++ b/arch/x86/include/asm/intel-family.h
@@ -0,0 +1,227 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_INTEL_FAMILY_H
+#define _ASM_X86_INTEL_FAMILY_H
+
+/*
+ * "Big Core" Processors (Branded as Core, Xeon, etc...)
+ *
+ * While adding a new CPUID for a new microarchitecture, add a new
+ * group to keep the list logically sorted in chronological order. Within
+ * that group keep the CPUID for the variants sorted by model number.
+ *
+ * The defined symbol names have the following form:
+ * INTEL_{OPTFAMILY}_{MICROARCH}{OPTDIFF}
+ * where:
+ * OPTFAMILY Describes the family of CPUs that this belongs to. Default
+ * is assumed to be "_CORE" (and should be omitted). Other values
+ * currently in use are _ATOM and _XEON_PHI
+ * MICROARCH Is the code name for the micro-architecture for this core.
+ * N.B. Not the platform name.
+ * OPTDIFF If needed, a short string to differentiate by market segment.
+ *
+ * Common OPTDIFFs:
+ *
+ * - regular client parts
+ * _L - regular mobile parts
+ * _G - parts with extra graphics on
+ * _X - regular server parts
+ * _D - micro server parts
+ * _N,_P - other mobile parts
+ * _H - premium mobile parts
+ * _S - other client parts
+ *
+ * Historical OPTDIFFs:
+ *
+ * _EP - 2 socket server parts
+ * _EX - 4+ socket server parts
+ *
+ * The #define line may optionally include a comment including platform or core
+ * names. An exception is made for skylake/kabylake where steppings seem to have gotten
+ * their own names :-(
+ */
+
+#define IFM(_fam, _model) VFM_MAKE(X86_VENDOR_INTEL, _fam, _model)
+
+/* Wildcard match so X86_MATCH_VFM(ANY) works */
+#define INTEL_ANY IFM(X86_FAMILY_ANY, X86_MODEL_ANY)
+
+/* Family 5 */
+#define INTEL_FAM5_START IFM(5, 0x00) /* Notational marker, also P5 A-step */
+#define INTEL_PENTIUM_75 IFM(5, 0x02) /* P54C */
+#define INTEL_PENTIUM_MMX IFM(5, 0x04) /* P55C */
+#define INTEL_QUARK_X1000 IFM(5, 0x09) /* Quark X1000 SoC */
+
+/* Family 6, 18, 19 */
+#define INTEL_PENTIUM_PRO IFM(6, 0x01)
+#define INTEL_PENTIUM_II_KLAMATH IFM(6, 0x03)
+#define INTEL_PENTIUM_III_DESCHUTES IFM(6, 0x05)
+#define INTEL_PENTIUM_III_TUALATIN IFM(6, 0x0B)
+#define INTEL_PENTIUM_M_DOTHAN IFM(6, 0x0D)
+
+#define INTEL_CORE_YONAH IFM(6, 0x0E)
+
+#define INTEL_CORE2_MEROM IFM(6, 0x0F)
+#define INTEL_CORE2_MEROM_L IFM(6, 0x16)
+#define INTEL_CORE2_PENRYN IFM(6, 0x17)
+#define INTEL_CORE2_DUNNINGTON IFM(6, 0x1D)
+
+#define INTEL_NEHALEM IFM(6, 0x1E)
+#define INTEL_NEHALEM_G IFM(6, 0x1F) /* Auburndale / Havendale */
+#define INTEL_NEHALEM_EP IFM(6, 0x1A)
+#define INTEL_NEHALEM_EX IFM(6, 0x2E)
+
+#define INTEL_WESTMERE IFM(6, 0x25)
+#define INTEL_WESTMERE_EP IFM(6, 0x2C)
+#define INTEL_WESTMERE_EX IFM(6, 0x2F)
+
+#define INTEL_SANDYBRIDGE IFM(6, 0x2A)
+#define INTEL_SANDYBRIDGE_X IFM(6, 0x2D)
+#define INTEL_IVYBRIDGE IFM(6, 0x3A)
+#define INTEL_IVYBRIDGE_X IFM(6, 0x3E)
+
+#define INTEL_HASWELL IFM(6, 0x3C)
+#define INTEL_HASWELL_X IFM(6, 0x3F)
+#define INTEL_HASWELL_L IFM(6, 0x45)
+#define INTEL_HASWELL_G IFM(6, 0x46)
+
+#define INTEL_BROADWELL IFM(6, 0x3D)
+#define INTEL_BROADWELL_G IFM(6, 0x47)
+#define INTEL_BROADWELL_X IFM(6, 0x4F)
+#define INTEL_BROADWELL_D IFM(6, 0x56)
+
+#define INTEL_SKYLAKE_L IFM(6, 0x4E) /* Sky Lake */
+#define INTEL_SKYLAKE IFM(6, 0x5E) /* Sky Lake */
+#define INTEL_SKYLAKE_X IFM(6, 0x55) /* Sky Lake */
+/* CASCADELAKE_X 0x55 Sky Lake -- s: 7 */
+/* COOPERLAKE_X 0x55 Sky Lake -- s: 11 */
+
+#define INTEL_KABYLAKE_L IFM(6, 0x8E) /* Sky Lake */
+/* AMBERLAKE_L 0x8E Sky Lake -- s: 9 */
+/* COFFEELAKE_L 0x8E Sky Lake -- s: 10 */
+/* WHISKEYLAKE_L 0x8E Sky Lake -- s: 11,12 */
+
+#define INTEL_KABYLAKE IFM(6, 0x9E) /* Sky Lake */
+/* COFFEELAKE 0x9E Sky Lake -- s: 10-13 */
+
+#define INTEL_COMETLAKE IFM(6, 0xA5) /* Sky Lake */
+#define INTEL_COMETLAKE_L IFM(6, 0xA6) /* Sky Lake */
+
+#define INTEL_CANNONLAKE_L IFM(6, 0x66) /* Palm Cove */
+
+#define INTEL_ICELAKE_X IFM(6, 0x6A) /* Sunny Cove */
+#define INTEL_ICELAKE_D IFM(6, 0x6C) /* Sunny Cove */
+#define INTEL_ICELAKE IFM(6, 0x7D) /* Sunny Cove */
+#define INTEL_ICELAKE_L IFM(6, 0x7E) /* Sunny Cove */
+#define INTEL_ICELAKE_NNPI IFM(6, 0x9D) /* Sunny Cove */
+
+#define INTEL_ROCKETLAKE IFM(6, 0xA7) /* Cypress Cove */
+
+#define INTEL_TIGERLAKE_L IFM(6, 0x8C) /* Willow Cove */
+#define INTEL_TIGERLAKE IFM(6, 0x8D) /* Willow Cove */
+
+#define INTEL_SAPPHIRERAPIDS_X IFM(6, 0x8F) /* Golden Cove */
+
+#define INTEL_EMERALDRAPIDS_X IFM(6, 0xCF) /* Raptor Cove */
+
+#define INTEL_GRANITERAPIDS_X IFM(6, 0xAD) /* Redwood Cove */
+#define INTEL_GRANITERAPIDS_D IFM(6, 0xAE)
+
+#define INTEL_DIAMONDRAPIDS_X IFM(19, 0x01) /* Panther Cove */
+
+#define INTEL_BARTLETTLAKE IFM(6, 0xD7) /* Raptor Cove */
+
+/* "Hybrid" Processors (P-Core/E-Core) */
+
+#define INTEL_LAKEFIELD IFM(6, 0x8A) /* Sunny Cove / Tremont */
+
+#define INTEL_ALDERLAKE IFM(6, 0x97) /* Golden Cove / Gracemont */
+#define INTEL_ALDERLAKE_L IFM(6, 0x9A) /* Golden Cove / Gracemont */
+
+#define INTEL_RAPTORLAKE IFM(6, 0xB7) /* Raptor Cove / Enhanced Gracemont */
+#define INTEL_RAPTORLAKE_P IFM(6, 0xBA)
+#define INTEL_RAPTORLAKE_S IFM(6, 0xBF)
+
+#define INTEL_METEORLAKE IFM(6, 0xAC) /* Redwood Cove / Crestmont */
+#define INTEL_METEORLAKE_L IFM(6, 0xAA)
+
+#define INTEL_ARROWLAKE_H IFM(6, 0xC5) /* Lion Cove / Skymont */
+#define INTEL_ARROWLAKE IFM(6, 0xC6)
+#define INTEL_ARROWLAKE_U IFM(6, 0xB5)
+
+#define INTEL_LUNARLAKE_M IFM(6, 0xBD) /* Lion Cove / Skymont */
+
+#define INTEL_PANTHERLAKE_L IFM(6, 0xCC) /* Cougar Cove / Darkmont */
+
+#define INTEL_WILDCATLAKE_L IFM(6, 0xD5)
+
+#define INTEL_NOVALAKE IFM(18, 0x01) /* Coyote Cove / Arctic Wolf */
+#define INTEL_NOVALAKE_L IFM(18, 0x03) /* Coyote Cove / Arctic Wolf */
+
+/* "Small Core" Processors (Atom/E-Core) */
+
+#define INTEL_ATOM_BONNELL IFM(6, 0x1C) /* Diamondville, Pineview */
+#define INTEL_ATOM_BONNELL_MID IFM(6, 0x26) /* Silverthorne, Lincroft */
+
+#define INTEL_ATOM_SALTWELL IFM(6, 0x36) /* Cedarview */
+#define INTEL_ATOM_SALTWELL_MID IFM(6, 0x27) /* Penwell */
+#define INTEL_ATOM_SALTWELL_TABLET IFM(6, 0x35) /* Cloverview */
+
+#define INTEL_ATOM_SILVERMONT IFM(6, 0x37) /* Bay Trail, Valleyview */
+#define INTEL_ATOM_SILVERMONT_D IFM(6, 0x4D) /* Avoton, Rangeley */
+#define INTEL_ATOM_SILVERMONT_MID IFM(6, 0x4A) /* Merrifield */
+#define INTEL_ATOM_SILVERMONT_MID2 IFM(6, 0x5A) /* Anniedale */
+
+#define INTEL_ATOM_AIRMONT IFM(6, 0x4C) /* Cherry Trail, Braswell */
+#define INTEL_ATOM_AIRMONT_NP IFM(6, 0x75) /* Lightning Mountain */
+
+#define INTEL_ATOM_GOLDMONT IFM(6, 0x5C) /* Apollo Lake */
+#define INTEL_ATOM_GOLDMONT_D IFM(6, 0x5F) /* Denverton */
+
+/* Note: the micro-architecture is "Goldmont Plus" */
+#define INTEL_ATOM_GOLDMONT_PLUS IFM(6, 0x7A) /* Gemini Lake */
+
+#define INTEL_ATOM_TREMONT_D IFM(6, 0x86) /* Jacobsville */
+#define INTEL_ATOM_TREMONT IFM(6, 0x96) /* Elkhart Lake */
+#define INTEL_ATOM_TREMONT_L IFM(6, 0x9C) /* Jasper Lake */
+
+#define INTEL_ATOM_GRACEMONT IFM(6, 0xBE) /* Alderlake N */
+
+#define INTEL_ATOM_CRESTMONT_X IFM(6, 0xAF) /* Sierra Forest */
+#define INTEL_ATOM_CRESTMONT IFM(6, 0xB6) /* Grand Ridge */
+
+#define INTEL_ATOM_DARKMONT_X IFM(6, 0xDD) /* Clearwater Forest */
+
+/* Xeon Phi */
+
+#define INTEL_XEON_PHI_KNL IFM(6, 0x57) /* Knights Landing */
+#define INTEL_XEON_PHI_KNM IFM(6, 0x85) /* Knights Mill */
+
+/* Notational marker denoting the last Family 6 model */
+#define INTEL_FAM6_LAST IFM(6, 0xFF)
+
+/* Family 15 - NetBurst */
+#define INTEL_P4_WILLAMETTE IFM(15, 0x01) /* Also Xeon Foster */
+#define INTEL_P4_PRESCOTT IFM(15, 0x03)
+#define INTEL_P4_PRESCOTT_2M IFM(15, 0x04)
+#define INTEL_P4_CEDARMILL IFM(15, 0x06) /* Also Xeon Dempsey */
+
+/*
+ * Intel CPU core types
+ *
+ * CPUID.1AH.EAX[31:0] uniquely identifies the microarchitecture
+ * of the core. Bits [31:24] indicate its core type (Core or Atom)
+ * and bits [23:0] indicate the native model ID of the core.
+ * Core type and native model ID are defined in the enumerations below.
+ */
+enum intel_cpu_type {
+ INTEL_CPU_TYPE_UNKNOWN,
+ INTEL_CPU_TYPE_ATOM = 0x20,
+ INTEL_CPU_TYPE_CORE = 0x40,
+};
+
+enum intel_native_id {
+ INTEL_ATOM_CMT_NATIVE_ID = 0x2, /* Crestmont */
+ INTEL_ATOM_SKT_NATIVE_ID = 0x3, /* Skymont */
+};
+
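+/*
+ * Illustrative sketch (editorial addition, not from the original header):
+ * splitting CPUID.1AH.EAX on a hybrid part; 'eax' is a hypothetical local:
+ *
+ *   u8  core_type = eax >> 24;
+ *   u32 native_id = eax & GENMASK(23, 0);
+ *   bool is_skymont_ecore = core_type == INTEL_CPU_TYPE_ATOM &&
+ *                           native_id == INTEL_ATOM_SKT_NATIVE_ID;
+ */
+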
+#endif /* _ASM_X86_INTEL_FAMILY_H */
diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h
new file mode 100644
index 000000000000..a3abdcd89a32
--- /dev/null
+++ b/arch/x86/include/asm/intel-mid.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Intel MID specific setup code
+ *
+ * (C) Copyright 2009, 2021 Intel Corporation
+ */
+#ifndef _ASM_X86_INTEL_MID_H
+#define _ASM_X86_INTEL_MID_H
+
+#include <linux/pci.h>
+
+extern int intel_mid_pci_init(void);
+extern int intel_mid_pci_set_power_state(struct pci_dev *pdev, pci_power_t state);
+extern pci_power_t intel_mid_pci_get_power_state(struct pci_dev *pdev);
+
+extern void intel_mid_pwr_power_off(void);
+
+#define INTEL_MID_PWR_LSS_OFFSET 4
+#define INTEL_MID_PWR_LSS_TYPE (1 << 7)
+
+extern int intel_mid_pwr_get_lss_id(struct pci_dev *pdev);
+
+#endif /* _ASM_X86_INTEL_MID_H */
diff --git a/arch/x86/include/asm/intel_ds.h b/arch/x86/include/asm/intel_ds.h
new file mode 100644
index 000000000000..695f87efbeb8
--- /dev/null
+++ b/arch/x86/include/asm/intel_ds.h
@@ -0,0 +1,47 @@
+#ifndef _ASM_INTEL_DS_H
+#define _ASM_INTEL_DS_H
+
+#include <linux/percpu-defs.h>
+
+#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
+#define PEBS_BUFFER_SHIFT 4
+#define PEBS_BUFFER_SIZE (PAGE_SIZE << PEBS_BUFFER_SHIFT)
+
+/*
+ * The largest PEBS record could consume a page; ensure that at least
+ * one record can still be written after triggering a PMI.
+ */
+#define ARCH_PEBS_THRESH_MULTI ((PEBS_BUFFER_SIZE - PAGE_SIZE) >> PEBS_BUFFER_SHIFT)
+#define ARCH_PEBS_THRESH_SINGLE 1
+
+/* The maximal number of PEBS events: */
+#define MAX_PEBS_EVENTS_FMT4 8
+#define MAX_PEBS_EVENTS 32
+#define MAX_PEBS_EVENTS_MASK GENMASK_ULL(MAX_PEBS_EVENTS - 1, 0)
+#define MAX_FIXED_PEBS_EVENTS 16
+
+/*
+ * A debug store configuration.
+ *
+ * We only support architectures that use 64bit fields.
+ */
+struct debug_store {
+ u64 bts_buffer_base;
+ u64 bts_index;
+ u64 bts_absolute_maximum;
+ u64 bts_interrupt_threshold;
+ u64 pebs_buffer_base;
+ u64 pebs_index;
+ u64 pebs_absolute_maximum;
+ u64 pebs_interrupt_threshold;
+ u64 pebs_event_reset[MAX_PEBS_EVENTS + MAX_FIXED_PEBS_EVENTS];
+} __aligned(PAGE_SIZE);
+
+DECLARE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store);
+
+struct debug_store_buffers {
+ char bts_buffer[BTS_BUFFER_SIZE];
+ char pebs_buffer[PEBS_BUFFER_SIZE];
+};
+
+#endif
diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h
new file mode 100644
index 000000000000..c796e9bc98b6
--- /dev/null
+++ b/arch/x86/include/asm/intel_pt.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_INTEL_PT_H
+#define _ASM_X86_INTEL_PT_H
+
+#define PT_CPUID_LEAVES 2
+#define PT_CPUID_REGS_NUM 4 /* number of registers (eax, ebx, ecx, edx) */
+
+enum pt_capabilities {
+ PT_CAP_max_subleaf = 0,
+ PT_CAP_cr3_filtering,
+ PT_CAP_psb_cyc,
+ PT_CAP_ip_filtering,
+ PT_CAP_mtc,
+ PT_CAP_ptwrite,
+ PT_CAP_power_event_trace,
+ PT_CAP_event_trace,
+ PT_CAP_tnt_disable,
+ PT_CAP_topa_output,
+ PT_CAP_topa_multiple_entries,
+ PT_CAP_single_range_output,
+ PT_CAP_output_subsys,
+ PT_CAP_payloads_lip,
+ PT_CAP_num_address_ranges,
+ PT_CAP_mtc_periods,
+ PT_CAP_cycle_thresholds,
+ PT_CAP_psb_periods,
+};
+
+#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
+void cpu_emergency_stop_pt(void);
+extern u32 intel_pt_validate_hw_cap(enum pt_capabilities cap);
+extern u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities cap);
+extern int is_intel_pt_event(struct perf_event *event);
+#else
+static inline void cpu_emergency_stop_pt(void) {}
+static inline u32 intel_pt_validate_hw_cap(enum pt_capabilities cap) { return 0; }
+static inline u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities capability) { return 0; }
+static inline int is_intel_pt_event(struct perf_event *event) { return 0; }
+#endif
+
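+/*
+ * Illustrative usage sketch (editorial addition, not from the original
+ * header): optional PT features are gated on the enumerated capabilities,
+ * e.g.
+ *
+ *   bool multi_topa = intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries);
+ */
+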
+#endif /* _ASM_X86_INTEL_PT_H */
diff --git a/arch/x86/include/asm/intel_punit_ipc.h b/arch/x86/include/asm/intel_punit_ipc.h
new file mode 100644
index 000000000000..1f9b5d225912
--- /dev/null
+++ b/arch/x86/include/asm/intel_punit_ipc.h
@@ -0,0 +1,95 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_INTEL_PUNIT_IPC_H_
+#define _ASM_X86_INTEL_PUNIT_IPC_H_
+
+/*
+ * Three types of 8bit P-Unit IPC commands are supported,
+ * bit[7:6]: [00]: BIOS; [01]: GTD; [10]: ISPD.
+ */
+typedef enum {
+ BIOS_IPC = 0,
+ GTDRIVER_IPC,
+ ISPDRIVER_IPC,
+ RESERVED_IPC,
+} IPC_TYPE;
+
+#define IPC_TYPE_OFFSET 6
+#define IPC_PUNIT_BIOS_CMD_BASE (BIOS_IPC << IPC_TYPE_OFFSET)
+#define IPC_PUNIT_GTD_CMD_BASE (GTDRIVER_IPC << IPC_TYPE_OFFSET)
+#define IPC_PUNIT_ISPD_CMD_BASE (ISPDRIVER_IPC << IPC_TYPE_OFFSET)
+#define IPC_PUNIT_CMD_TYPE_MASK (RESERVED_IPC << IPC_TYPE_OFFSET)
+
+/* BIOS => Pcode commands */
+#define IPC_PUNIT_BIOS_ZERO (IPC_PUNIT_BIOS_CMD_BASE | 0x00)
+#define IPC_PUNIT_BIOS_VR_INTERFACE (IPC_PUNIT_BIOS_CMD_BASE | 0x01)
+#define IPC_PUNIT_BIOS_READ_PCS (IPC_PUNIT_BIOS_CMD_BASE | 0x02)
+#define IPC_PUNIT_BIOS_WRITE_PCS (IPC_PUNIT_BIOS_CMD_BASE | 0x03)
+#define IPC_PUNIT_BIOS_READ_PCU_CONFIG (IPC_PUNIT_BIOS_CMD_BASE | 0x04)
+#define IPC_PUNIT_BIOS_WRITE_PCU_CONFIG (IPC_PUNIT_BIOS_CMD_BASE | 0x05)
+#define IPC_PUNIT_BIOS_READ_PL1_SETTING (IPC_PUNIT_BIOS_CMD_BASE | 0x06)
+#define IPC_PUNIT_BIOS_WRITE_PL1_SETTING (IPC_PUNIT_BIOS_CMD_BASE | 0x07)
+#define IPC_PUNIT_BIOS_TRIGGER_VDD_RAM (IPC_PUNIT_BIOS_CMD_BASE | 0x08)
+#define IPC_PUNIT_BIOS_READ_TELE_INFO (IPC_PUNIT_BIOS_CMD_BASE | 0x09)
+#define IPC_PUNIT_BIOS_READ_TELE_TRACE_CTRL (IPC_PUNIT_BIOS_CMD_BASE | 0x0a)
+#define IPC_PUNIT_BIOS_WRITE_TELE_TRACE_CTRL (IPC_PUNIT_BIOS_CMD_BASE | 0x0b)
+#define IPC_PUNIT_BIOS_READ_TELE_EVENT_CTRL (IPC_PUNIT_BIOS_CMD_BASE | 0x0c)
+#define IPC_PUNIT_BIOS_WRITE_TELE_EVENT_CTRL (IPC_PUNIT_BIOS_CMD_BASE | 0x0d)
+#define IPC_PUNIT_BIOS_READ_TELE_TRACE (IPC_PUNIT_BIOS_CMD_BASE | 0x0e)
+#define IPC_PUNIT_BIOS_WRITE_TELE_TRACE (IPC_PUNIT_BIOS_CMD_BASE | 0x0f)
+#define IPC_PUNIT_BIOS_READ_TELE_EVENT (IPC_PUNIT_BIOS_CMD_BASE | 0x10)
+#define IPC_PUNIT_BIOS_WRITE_TELE_EVENT (IPC_PUNIT_BIOS_CMD_BASE | 0x11)
+#define IPC_PUNIT_BIOS_READ_MODULE_TEMP (IPC_PUNIT_BIOS_CMD_BASE | 0x12)
+#define IPC_PUNIT_BIOS_RESERVED (IPC_PUNIT_BIOS_CMD_BASE | 0x13)
+#define IPC_PUNIT_BIOS_READ_VOLTAGE_OVER (IPC_PUNIT_BIOS_CMD_BASE | 0x14)
+#define IPC_PUNIT_BIOS_WRITE_VOLTAGE_OVER (IPC_PUNIT_BIOS_CMD_BASE | 0x15)
+#define IPC_PUNIT_BIOS_READ_RATIO_OVER (IPC_PUNIT_BIOS_CMD_BASE | 0x16)
+#define IPC_PUNIT_BIOS_WRITE_RATIO_OVER (IPC_PUNIT_BIOS_CMD_BASE | 0x17)
+#define IPC_PUNIT_BIOS_READ_VF_GL_CTRL (IPC_PUNIT_BIOS_CMD_BASE | 0x18)
+#define IPC_PUNIT_BIOS_WRITE_VF_GL_CTRL (IPC_PUNIT_BIOS_CMD_BASE | 0x19)
+#define IPC_PUNIT_BIOS_READ_FM_SOC_TEMP_THRESH (IPC_PUNIT_BIOS_CMD_BASE | 0x1a)
+#define IPC_PUNIT_BIOS_WRITE_FM_SOC_TEMP_THRESH (IPC_PUNIT_BIOS_CMD_BASE | 0x1b)
+
+/* GT Driver => Pcode commands */
+#define IPC_PUNIT_GTD_ZERO (IPC_PUNIT_GTD_CMD_BASE | 0x00)
+#define IPC_PUNIT_GTD_CONFIG (IPC_PUNIT_GTD_CMD_BASE | 0x01)
+#define IPC_PUNIT_GTD_READ_ICCP_LIC_CDYN_SCAL (IPC_PUNIT_GTD_CMD_BASE | 0x02)
+#define IPC_PUNIT_GTD_WRITE_ICCP_LIC_CDYN_SCAL (IPC_PUNIT_GTD_CMD_BASE | 0x03)
+#define IPC_PUNIT_GTD_GET_WM_VAL (IPC_PUNIT_GTD_CMD_BASE | 0x06)
+#define IPC_PUNIT_GTD_WRITE_CONFIG_WISHREQ (IPC_PUNIT_GTD_CMD_BASE | 0x07)
+#define IPC_PUNIT_GTD_READ_REQ_DUTY_CYCLE (IPC_PUNIT_GTD_CMD_BASE | 0x16)
+#define IPC_PUNIT_GTD_DIS_VOL_FREQ_CHG_REQUEST (IPC_PUNIT_GTD_CMD_BASE | 0x17)
+#define IPC_PUNIT_GTD_DYNA_DUTY_CYCLE_CTRL (IPC_PUNIT_GTD_CMD_BASE | 0x1a)
+#define IPC_PUNIT_GTD_DYNA_DUTY_CYCLE_TUNING (IPC_PUNIT_GTD_CMD_BASE | 0x1c)
+
+/* ISP Driver => Pcode commands */
+#define IPC_PUNIT_ISPD_ZERO (IPC_PUNIT_ISPD_CMD_BASE | 0x00)
+#define IPC_PUNIT_ISPD_CONFIG (IPC_PUNIT_ISPD_CMD_BASE | 0x01)
+#define IPC_PUNIT_ISPD_GET_ISP_LTR_VAL (IPC_PUNIT_ISPD_CMD_BASE | 0x02)
+#define IPC_PUNIT_ISPD_ACCESS_IU_FREQ_BOUNDS (IPC_PUNIT_ISPD_CMD_BASE | 0x03)
+#define IPC_PUNIT_ISPD_READ_CDYN_LEVEL (IPC_PUNIT_ISPD_CMD_BASE | 0x04)
+#define IPC_PUNIT_ISPD_WRITE_CDYN_LEVEL (IPC_PUNIT_ISPD_CMD_BASE | 0x05)
+
+/* Error codes */
+#define IPC_PUNIT_ERR_SUCCESS 0
+#define IPC_PUNIT_ERR_INVALID_CMD 1
+#define IPC_PUNIT_ERR_INVALID_PARAMETER 2
+#define IPC_PUNIT_ERR_CMD_TIMEOUT 3
+#define IPC_PUNIT_ERR_CMD_LOCKED 4
+#define IPC_PUNIT_ERR_INVALID_VR_ID 5
+#define IPC_PUNIT_ERR_VR_ERR 6
+
+#if IS_ENABLED(CONFIG_INTEL_PUNIT_IPC)
+
+int intel_punit_ipc_command(u32 cmd, u32 para1, u32 para2, u32 *in, u32 *out);
+
+#else
+
+static inline int intel_punit_ipc_command(u32 cmd, u32 para1, u32 para2,
+ u32 *in, u32 *out)
+{
+ return -ENODEV;
+}
+
+#endif /* CONFIG_INTEL_PUNIT_IPC */
+
+#endif
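
A minimal sketch of issuing one of the BIOS mailbox commands declared above (illustrative only: the wrapper name is invented, passing NULL for the unused input payload is an assumption, and the stub above returns -ENODEV when CONFIG_INTEL_PUNIT_IPC is off):

#include <linux/errno.h>
#include <asm/intel_punit_ipc.h>

/* Read the module temperature telemetry word via the BIOS IPC channel. */
static int demo_punit_read_module_temp(u32 *temp)
{
	return intel_punit_ipc_command(IPC_PUNIT_BIOS_READ_MODULE_TEMP,
				       0, 0, NULL, temp);
}
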
diff --git a/arch/x86/include/asm/intel_telemetry.h b/arch/x86/include/asm/intel_telemetry.h
new file mode 100644
index 000000000000..944637a4e6de
--- /dev/null
+++ b/arch/x86/include/asm/intel_telemetry.h
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Intel SOC Telemetry Driver Header File
+ * Copyright (C) 2015, Intel Corporation.
+ * All Rights Reserved.
+ */
+#ifndef INTEL_TELEMETRY_H
+#define INTEL_TELEMETRY_H
+
+#define TELEM_MAX_EVENTS_SRAM 28
+#define TELEM_MAX_OS_ALLOCATED_EVENTS 20
+
+#include <linux/platform_data/x86/intel_scu_ipc.h>
+
+enum telemetry_unit {
+ TELEM_PSS = 0,
+ TELEM_IOSS,
+ TELEM_UNIT_NONE
+};
+
+struct telemetry_evtlog {
+ u32 telem_evtid;
+ u64 telem_evtlog;
+};
+
+struct telemetry_evtconfig {
+ /* Array of Event-IDs to Enable */
+ u32 *evtmap;
+
+ /* Number of Events (<29) in evtmap */
+ u8 num_evts;
+
+ /* Sampling period */
+ u8 period;
+};
+
+struct telemetry_evtmap {
+ const char *name;
+ u32 evt_id;
+};
+
+struct telemetry_unit_config {
+ struct telemetry_evtmap *telem_evts;
+ void __iomem *regmap;
+ u8 ssram_evts_used;
+ u8 curr_period;
+ u8 max_period;
+ u8 min_period;
+};
+
+struct telemetry_plt_config {
+ struct telemetry_unit_config pss_config;
+ struct telemetry_unit_config ioss_config;
+ struct mutex telem_trace_lock;
+ struct mutex telem_lock;
+ struct intel_pmc_dev *pmc;
+ struct intel_scu_ipc_dev *scu;
+ bool telem_in_use;
+};
+
+struct telemetry_core_ops {
+ int (*get_trace_verbosity)(enum telemetry_unit telem_unit,
+ u32 *verbosity);
+
+ int (*set_trace_verbosity)(enum telemetry_unit telem_unit,
+ u32 verbosity);
+
+ int (*raw_read_eventlog)(enum telemetry_unit telem_unit,
+ struct telemetry_evtlog *evtlog,
+ int len, int log_all_evts);
+
+ int (*read_eventlog)(enum telemetry_unit telem_unit,
+ struct telemetry_evtlog *evtlog,
+ int len, int log_all_evts);
+};
+
+int telemetry_set_pltdata(const struct telemetry_core_ops *ops,
+ struct telemetry_plt_config *pltconfig);
+
+int telemetry_clear_pltdata(void);
+
+struct telemetry_plt_config *telemetry_get_pltdata(void);
+
+int telemetry_get_evtname(enum telemetry_unit telem_unit,
+ const char **name, int len);
+
+int telemetry_read_events(enum telemetry_unit telem_unit,
+ struct telemetry_evtlog *evtlog, int len);
+
+int telemetry_read_eventlog(enum telemetry_unit telem_unit,
+ struct telemetry_evtlog *evtlog, int len);
+
+int telemetry_raw_read_eventlog(enum telemetry_unit telem_unit,
+ struct telemetry_evtlog *evtlog, int len);
+
+int telemetry_set_trace_verbosity(enum telemetry_unit telem_unit,
+ u32 verbosity);
+
+int telemetry_get_trace_verbosity(enum telemetry_unit telem_unit,
+ u32 *verbosity);
+
+#endif /* INTEL_TELEMETRY_H */
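
A sketch of how a consumer might walk the event-log API above (assumptions: the function name is invented, and telemetry_read_eventlog() is taken to return the number of events read or a negative errno):

#include <linux/kernel.h>
#include <asm/intel_telemetry.h>

static int demo_dump_pss_events(void)
{
	struct telemetry_evtlog log[TELEM_MAX_OS_ALLOCATED_EVENTS];
	int i, num;

	num = telemetry_read_eventlog(TELEM_PSS, log, ARRAY_SIZE(log));
	if (num < 0)
		return num;

	for (i = 0; i < num; i++)
		pr_info("telemetry event %#x = %llu\n", log[i].telem_evtid,
			(unsigned long long)log[i].telem_evtlog);

	return 0;
}
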
diff --git a/arch/x86/include/asm/invpcid.h b/arch/x86/include/asm/invpcid.h
new file mode 100644
index 000000000000..734482afbf81
--- /dev/null
+++ b/arch/x86/include/asm/invpcid.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_INVPCID
+#define _ASM_X86_INVPCID
+
+static inline void __invpcid(unsigned long pcid, unsigned long addr,
+ unsigned long type)
+{
+ struct { u64 d[2]; } desc = { { pcid, addr } };
+
+ /*
+ * The memory clobber is because the whole point is to invalidate
+ * stale TLB entries and, especially if we're flushing global
+ * mappings, we don't want the compiler to reorder any subsequent
+ * memory accesses before the TLB flush.
+ */
+ asm volatile("invpcid %[desc], %[type]"
+ :: [desc] "m" (desc), [type] "r" (type) : "memory");
+}
+
+#define INVPCID_TYPE_INDIV_ADDR 0
+#define INVPCID_TYPE_SINGLE_CTXT 1
+#define INVPCID_TYPE_ALL_INCL_GLOBAL 2
+#define INVPCID_TYPE_ALL_NON_GLOBAL 3
+
+/* Flush all mappings for a given pcid and addr, not including globals. */
+static inline void invpcid_flush_one(unsigned long pcid,
+ unsigned long addr)
+{
+ __invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR);
+}
+
+/* Flush all mappings for a given PCID, not including globals. */
+static inline void invpcid_flush_single_context(unsigned long pcid)
+{
+ __invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT);
+}
+
+/* Flush all mappings, including globals, for all PCIDs. */
+static inline void invpcid_flush_all(void)
+{
+ __invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL);
+}
+
+/* Flush all mappings for all PCIDs except globals. */
+static inline void invpcid_flush_all_nonglobals(void)
+{
+ __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL);
+}
+
+#endif /* _ASM_X86_INVPCID */
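
For illustration (not from the patch), the single-context flavour is what a caller would reach for when retiring a PCID, after confirming the CPU actually advertises INVPCID:

#include <asm/cpufeature.h>
#include <asm/invpcid.h>

/* Drop every non-global TLB entry tagged with @pcid. */
static void demo_retire_pcid(unsigned long pcid)
{
	if (boot_cpu_has(X86_FEATURE_INVPCID))
		invpcid_flush_single_context(pcid);
}
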
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 73739322b6d0..ca309a3227c7 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -1,11 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_IO_H
#define _ASM_X86_IO_H
-#define ARCH_HAS_IOREMAP_WC
+/*
+ * This file contains the definitions for the x86 IO instructions
+ * inb/inw/inl/outb/outw/outl and the "string versions" of the same
+ * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
+ * versions of the single-IO instructions (inb_p/inw_p/..).
+ *
+ * This file is not meant to be obfuscating: it's just complicated
+ * to (a) handle it all in a way that makes gcc able to optimize it
+ * as well as possible and (b) trying to avoid writing the same thing
+ * over and over again with slight variations and possibly making a
+ * mistake somewhere.
+ */
+
+/*
+ * Thanks to James van Artsdalen for a better timing-fix than
+ * the two short jumps: using outb's to a nonexistent port seems
+ * to guarantee better timings even on fast machines.
+ *
+ * On the other hand, I'd like to be sure of a non-existent port:
+ * I feel a bit unsafe about using 0x80 (should be safe, though)
+ *
+ * Linus
+ */
+ /*
+ * Bit simplified and optimized by Jan Hubicka
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
+ *
+ * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
+ * isa_read[wl] and isa_write[wl] fixed
+ * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ */
+
+#include <linux/string.h>
#include <linux/compiler.h>
-#include <asm-generic/int-ll64.h>
+#include <linux/cc_platform.h>
#include <asm/page.h>
+#include <asm/early_ioremap.h>
+#include <asm/pgtable_types.h>
+#include <asm/shared/io.h>
+#include <asm/special_insns.h>
#define build_mmio_read(name, size, type, reg, barrier) \
static inline type name(const volatile void __iomem *addr) \
@@ -33,6 +70,9 @@ build_mmio_write(__writeb, "b", unsigned char, "q", )
build_mmio_write(__writew, "w", unsigned short, "r", )
build_mmio_write(__writel, "l", unsigned int, "r", )
+#define readb readb
+#define readw readw
+#define readl readl
#define readb_relaxed(a) __readb(a)
#define readw_relaxed(a) __readw(a)
#define readl_relaxed(a) __readl(a)
@@ -40,47 +80,39 @@ build_mmio_write(__writel, "l", unsigned int, "r", )
#define __raw_readw __readw
#define __raw_readl __readl
+#define writeb writeb
+#define writew writew
+#define writel writel
+#define writeb_relaxed(v, a) __writeb(v, a)
+#define writew_relaxed(v, a) __writew(v, a)
+#define writel_relaxed(v, a) __writel(v, a)
#define __raw_writeb __writeb
#define __raw_writew __writew
#define __raw_writel __writel
-#define mmiowb() barrier()
-
#ifdef CONFIG_X86_64
-build_mmio_read(readq, "q", unsigned long, "=r", :"memory")
-build_mmio_write(writeq, "q", unsigned long, "r", :"memory")
+build_mmio_read(readq, "q", u64, "=r", :"memory")
+build_mmio_read(__readq, "q", u64, "=r", )
+build_mmio_write(writeq, "q", u64, "r", :"memory")
+build_mmio_write(__writeq, "q", u64, "r", )
-#else
-
-static inline __u64 readq(const volatile void __iomem *addr)
-{
- const volatile u32 __iomem *p = addr;
- u32 low, high;
-
- low = readl(p);
- high = readl(p + 1);
-
- return low + ((u64)high << 32);
-}
+#define readq_relaxed(a) __readq(a)
+#define writeq_relaxed(v, a) __writeq(v, a)
-static inline void writeq(__u64 val, volatile void __iomem *addr)
-{
- writel(val, addr);
- writel(val >> 32, addr+4);
-}
-
-#endif
-
-#define readq_relaxed(a) readq(a)
-
-#define __raw_readq(a) readq(a)
-#define __raw_writeq(val, addr) writeq(val, addr)
+#define __raw_readq __readq
+#define __raw_writeq __writeq
/* Let people know that we have them */
#define readq readq
#define writeq writeq
+#endif
+
+#define ARCH_HAS_VALID_PHYS_ADDR_RANGE
+extern int valid_phys_addr_range(phys_addr_t addr, size_t size);
+extern int valid_mmap_phys_addr_range(unsigned long pfn, size_t size);
+
/**
* virt_to_phys - map virtual addresses to physical
* @address: address to remap
@@ -98,6 +130,7 @@ static inline phys_addr_t virt_to_phys(volatile void *address)
{
return __pa(address);
}
+#define virt_to_phys virt_to_phys
/**
* phys_to_virt - map physical address to virtual
@@ -116,32 +149,34 @@ static inline void *phys_to_virt(phys_addr_t address)
{
return __va(address);
}
-
-/*
- * Change "struct page" to physical address.
- */
-#define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
+#define phys_to_virt phys_to_virt
/*
* ISA I/O bus memory addresses are 1:1 with the physical address.
* However, we truncate the address to unsigned int to avoid undesirable
- * promitions in legacy drivers.
+ * promotions in legacy drivers.
*/
static inline unsigned int isa_virt_to_bus(volatile void *address)
{
return (unsigned int)virt_to_phys(address);
}
-#define isa_page_to_bus(page) ((unsigned int)page_to_phys(page))
#define isa_bus_to_virt phys_to_virt
/*
- * However PCI ones are not necessarily 1:1 and therefore these interfaces
- * are forbidden in portable PCI drivers.
- *
- * Allow them on x86 for legacy drivers, though.
+ * The default ioremap() behavior is non-cached; if you need something
+ * else, you probably want one of the following.
*/
-#define virt_to_bus virt_to_phys
-#define bus_to_virt phys_to_virt
+extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size);
+#define ioremap_uc ioremap_uc
+extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
+#define ioremap_cache ioremap_cache
+extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, pgprot_t prot);
+#define ioremap_prot ioremap_prot
+extern void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size);
+#define ioremap_encrypted ioremap_encrypted
+
+void *arch_memremap_wb(phys_addr_t phys_addr, size_t size, unsigned long flags);
+#define arch_memremap_wb arch_memremap_wb
/**
* ioremap - map bus memory into CPU space
@@ -157,48 +192,212 @@ static inline unsigned int isa_virt_to_bus(volatile void *address)
* If the area you are trying to map is a PCI BAR you should have a
* look at pci_iomap().
*/
-extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
-extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
-extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
- unsigned long prot_val);
+void __iomem *ioremap(resource_size_t offset, unsigned long size);
+#define ioremap ioremap
+
+extern void iounmap(volatile void __iomem *addr);
+#define iounmap iounmap
+
+#ifdef __KERNEL__
+void memcpy_fromio(void *, const volatile void __iomem *, size_t);
+void memcpy_toio(volatile void __iomem *, const void *, size_t);
+void memset_io(volatile void __iomem *, int, size_t);
+
+#define memcpy_fromio memcpy_fromio
+#define memcpy_toio memcpy_toio
+#define memset_io memset_io
+
+#ifdef CONFIG_X86_64
/*
- * The default ioremap() behavior is non-cached:
+ * Commit 0f07496144c2 ("[PATCH] Add faster __iowrite32_copy routine for
+ * x86_64") says that circa 2006 rep movsl is noticeably faster than a copy
+ * loop.
*/
-static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
+static inline void __iowrite32_copy(void __iomem *to, const void *from,
+ size_t count)
{
- return ioremap_nocache(offset, size);
+ asm volatile("rep movsl"
+ : "=&c"(count), "=&D"(to), "=&S"(from)
+ : "0"(count), "1"(to), "2"(from)
+ : "memory");
}
+#define __iowrite32_copy __iowrite32_copy
+#endif
-extern void iounmap(volatile void __iomem *addr);
+/*
+ * ISA space is 'always mapped' on a typical x86 system, no need to
+ * explicitly ioremap() it. The fact that the ISA IO space is mapped
+ * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
+ * are physical addresses. The following constant pointer can be
+ * used as the IO-area pointer (it can be iounmapped as well, so the
+ * analogy with PCI is quite large):
+ */
+#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET))
+
+#endif /* __KERNEL__ */
+
+extern void native_io_delay(void);
+extern int io_delay_type;
+extern void io_delay_init(void);
-#ifdef CONFIG_X86_32
-# include "io_32.h"
+#if defined(CONFIG_PARAVIRT)
+#include <asm/paravirt.h>
#else
-# include "io_64.h"
+
+static inline void slow_down_io(void)
+{
+ native_io_delay();
+#ifdef REALLY_SLOW_IO
+ native_io_delay();
+ native_io_delay();
+ native_io_delay();
#endif
+}
-extern void *xlate_dev_mem_ptr(unsigned long phys);
-extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr);
+#endif
+
+#define BUILDIO(bwl, type) \
+static inline void out##bwl##_p(type value, u16 port) \
+{ \
+ out##bwl(value, port); \
+ slow_down_io(); \
+} \
+ \
+static inline type in##bwl##_p(u16 port) \
+{ \
+ type value = in##bwl(port); \
+ slow_down_io(); \
+ return value; \
+} \
+ \
+static inline void outs##bwl(u16 port, const void *addr, unsigned long count) \
+{ \
+ if (cc_platform_has(CC_ATTR_GUEST_UNROLL_STRING_IO)) { \
+ type *value = (type *)addr; \
+ while (count) { \
+ out##bwl(*value, port); \
+ value++; \
+ count--; \
+ } \
+ } else { \
+ asm volatile("rep outs" #bwl \
+ : "+S"(addr), "+c"(count) \
+ : "d"(port) : "memory"); \
+ } \
+} \
+ \
+static inline void ins##bwl(u16 port, void *addr, unsigned long count) \
+{ \
+ if (cc_platform_has(CC_ATTR_GUEST_UNROLL_STRING_IO)) { \
+ type *value = (type *)addr; \
+ while (count) { \
+ *value = in##bwl(port); \
+ value++; \
+ count--; \
+ } \
+ } else { \
+ asm volatile("rep ins" #bwl \
+ : "+D"(addr), "+c"(count) \
+ : "d"(port) : "memory"); \
+ } \
+}
+
+BUILDIO(b, u8)
+BUILDIO(w, u16)
+BUILDIO(l, u32)
+#undef BUILDIO
+
+#define inb_p inb_p
+#define inw_p inw_p
+#define inl_p inl_p
+#define insb insb
+#define insw insw
+#define insl insl
+
+#define outb_p outb_p
+#define outw_p outw_p
+#define outl_p outl_p
+#define outsb outsb
+#define outsw outsw
+#define outsl outsl
+
+extern void *xlate_dev_mem_ptr(phys_addr_t phys);
+extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr);
+
+#define xlate_dev_mem_ptr xlate_dev_mem_ptr
+#define unxlate_dev_mem_ptr unxlate_dev_mem_ptr
extern int ioremap_change_attr(unsigned long vaddr, unsigned long size,
- unsigned long prot_val);
+ enum page_cache_mode pcm);
extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size);
+#define ioremap_wc ioremap_wc
+extern void __iomem *ioremap_wt(resource_size_t offset, unsigned long size);
+#define ioremap_wt ioremap_wt
-/*
- * early_ioremap() and early_iounmap() are for temporary early boot-time
- * mappings, before the real ioremap() is functional.
- * A boot-time mapping is currently limited to at most 16 pages.
- */
-extern void early_ioremap_init(void);
-extern void early_ioremap_reset(void);
-extern void __iomem *early_ioremap(resource_size_t phys_addr,
- unsigned long size);
-extern void __iomem *early_memremap(resource_size_t phys_addr,
- unsigned long size);
-extern void early_iounmap(void __iomem *addr, unsigned long size);
+extern bool is_early_ioremap_ptep(pte_t *ptep);
#define IO_SPACE_LIMIT 0xffff
+#include <asm-generic/io.h>
+#undef PCI_IOBASE
+
+#ifdef CONFIG_MTRR
+extern int __must_check arch_phys_wc_index(int handle);
+#define arch_phys_wc_index arch_phys_wc_index
+
+extern int __must_check arch_phys_wc_add(unsigned long base,
+ unsigned long size);
+extern void arch_phys_wc_del(int handle);
+#define arch_phys_wc_add arch_phys_wc_add
+#endif
+
+#ifdef CONFIG_X86_PAT
+extern int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size);
+extern void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size);
+#define arch_io_reserve_memtype_wc arch_io_reserve_memtype_wc
+#endif
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+extern bool arch_memremap_can_ram_remap(resource_size_t offset,
+ unsigned long size,
+ unsigned long flags);
+#define arch_memremap_can_ram_remap arch_memremap_can_ram_remap
+
+extern bool phys_mem_access_encrypted(unsigned long phys_addr,
+ unsigned long size);
+#else
+static inline bool phys_mem_access_encrypted(unsigned long phys_addr,
+ unsigned long size)
+{
+ return true;
+}
+#endif
+
+/**
+ * iosubmit_cmds512 - copy data to single MMIO location, in 512-bit units
+ * @dst: destination, in MMIO space (must be 512-bit aligned)
+ * @src: source
+ * @count: number of 512-bit quantities to submit
+ *
+ * Submit data from kernel space to MMIO space, in units of 512 bits at a
+ * time. Order of access is not guaranteed, nor is a memory barrier
+ * performed afterwards.
+ *
+ * Warning: Do not use this helper unless your driver has checked that the CPU
+ * instruction is supported on the platform.
+ */
+static inline void iosubmit_cmds512(void __iomem *dst, const void *src,
+ size_t count)
+{
+ const u8 *from = src;
+ const u8 *end = from + count * 64;
+
+ while (from < end) {
+ movdir64b_io(dst, from);
+ from += 64;
+ }
+}
+
#endif /* _ASM_X86_IO_H */
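
As a usage sketch for iosubmit_cmds512() (the struct and function names below are invented; per the warning in the kernel-doc, the caller is assumed to have already verified MOVDIR64B support on the platform):

#include <asm/io.h>

/* Hypothetical 64-byte device descriptor. */
struct demo_desc {
	u64 words[8];
};

/* Push a single 512-bit descriptor to a device submission portal. */
static void demo_submit_desc(void __iomem *portal, const struct demo_desc *desc)
{
	iosubmit_cmds512(portal, desc, 1);
}
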
diff --git a/arch/x86/include/asm/io_32.h b/arch/x86/include/asm/io_32.h
deleted file mode 100644
index a299900f5920..000000000000
--- a/arch/x86/include/asm/io_32.h
+++ /dev/null
@@ -1,196 +0,0 @@
-#ifndef _ASM_X86_IO_32_H
-#define _ASM_X86_IO_32_H
-
-#include <linux/string.h>
-#include <linux/compiler.h>
-
-/*
- * This file contains the definitions for the x86 IO instructions
- * inb/inw/inl/outb/outw/outl and the "string versions" of the same
- * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
- * versions of the single-IO instructions (inb_p/inw_p/..).
- *
- * This file is not meant to be obfuscating: it's just complicated
- * to (a) handle it all in a way that makes gcc able to optimize it
- * as well as possible and (b) trying to avoid writing the same thing
- * over and over again with slight variations and possibly making a
- * mistake somewhere.
- */
-
-/*
- * Thanks to James van Artsdalen for a better timing-fix than
- * the two short jumps: using outb's to a nonexistent port seems
- * to guarantee better timings even on fast machines.
- *
- * On the other hand, I'd like to be sure of a non-existent port:
- * I feel a bit unsafe about using 0x80 (should be safe, though)
- *
- * Linus
- */
-
- /*
- * Bit simplified and optimized by Jan Hubicka
- * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
- *
- * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
- * isa_read[wl] and isa_write[wl] fixed
- * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- */
-
-#define XQUAD_PORTIO_BASE 0xfe400000
-#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */
-
-#ifdef __KERNEL__
-
-#include <asm-generic/iomap.h>
-
-#include <linux/vmalloc.h>
-
-/*
- * Convert a virtual cached pointer to an uncached pointer
- */
-#define xlate_dev_kmem_ptr(p) p
-
-static inline void
-memset_io(volatile void __iomem *addr, unsigned char val, int count)
-{
- memset((void __force *)addr, val, count);
-}
-
-static inline void
-memcpy_fromio(void *dst, const volatile void __iomem *src, int count)
-{
- __memcpy(dst, (const void __force *)src, count);
-}
-
-static inline void
-memcpy_toio(volatile void __iomem *dst, const void *src, int count)
-{
- __memcpy((void __force *)dst, src, count);
-}
-
-/*
- * ISA space is 'always mapped' on a typical x86 system, no need to
- * explicitly ioremap() it. The fact that the ISA IO space is mapped
- * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
- * are physical addresses. The following constant pointer can be
- * used as the IO-area pointer (it can be iounmapped as well, so the
- * analogy with PCI is quite large):
- */
-#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET))
-
-/*
- * Cache management
- *
- * This needed for two cases
- * 1. Out of order aware processors
- * 2. Accidentally out of order processors (PPro errata #51)
- */
-
-#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)
-
-static inline void flush_write_buffers(void)
-{
- asm volatile("lock; addl $0,0(%%esp)": : :"memory");
-}
-
-#else
-
-#define flush_write_buffers() do { } while (0)
-
-#endif
-
-#endif /* __KERNEL__ */
-
-extern void native_io_delay(void);
-
-extern int io_delay_type;
-extern void io_delay_init(void);
-
-#if defined(CONFIG_PARAVIRT)
-#include <asm/paravirt.h>
-#else
-
-static inline void slow_down_io(void)
-{
- native_io_delay();
-#ifdef REALLY_SLOW_IO
- native_io_delay();
- native_io_delay();
- native_io_delay();
-#endif
-}
-
-#endif
-
-#define __BUILDIO(bwl, bw, type) \
-static inline void out##bwl(unsigned type value, int port) \
-{ \
- out##bwl##_local(value, port); \
-} \
- \
-static inline unsigned type in##bwl(int port) \
-{ \
- return in##bwl##_local(port); \
-}
-
-#define BUILDIO(bwl, bw, type) \
-static inline void out##bwl##_local(unsigned type value, int port) \
-{ \
- asm volatile("out" #bwl " %" #bw "0, %w1" \
- : : "a"(value), "Nd"(port)); \
-} \
- \
-static inline unsigned type in##bwl##_local(int port) \
-{ \
- unsigned type value; \
- asm volatile("in" #bwl " %w1, %" #bw "0" \
- : "=a"(value) : "Nd"(port)); \
- return value; \
-} \
- \
-static inline void out##bwl##_local_p(unsigned type value, int port) \
-{ \
- out##bwl##_local(value, port); \
- slow_down_io(); \
-} \
- \
-static inline unsigned type in##bwl##_local_p(int port) \
-{ \
- unsigned type value = in##bwl##_local(port); \
- slow_down_io(); \
- return value; \
-} \
- \
-__BUILDIO(bwl, bw, type) \
- \
-static inline void out##bwl##_p(unsigned type value, int port) \
-{ \
- out##bwl(value, port); \
- slow_down_io(); \
-} \
- \
-static inline unsigned type in##bwl##_p(int port) \
-{ \
- unsigned type value = in##bwl(port); \
- slow_down_io(); \
- return value; \
-} \
- \
-static inline void outs##bwl(int port, const void *addr, unsigned long count) \
-{ \
- asm volatile("rep; outs" #bwl \
- : "+S"(addr), "+c"(count) : "d"(port)); \
-} \
- \
-static inline void ins##bwl(int port, void *addr, unsigned long count) \
-{ \
- asm volatile("rep; ins" #bwl \
- : "+D"(addr), "+c"(count) : "d"(port)); \
-}
-
-BUILDIO(b, b, char)
-BUILDIO(w, w, short)
-BUILDIO(l, , int)
-
-#endif /* _ASM_X86_IO_32_H */
diff --git a/arch/x86/include/asm/io_64.h b/arch/x86/include/asm/io_64.h
deleted file mode 100644
index 244067893af4..000000000000
--- a/arch/x86/include/asm/io_64.h
+++ /dev/null
@@ -1,181 +0,0 @@
-#ifndef _ASM_X86_IO_64_H
-#define _ASM_X86_IO_64_H
-
-
-/*
- * This file contains the definitions for the x86 IO instructions
- * inb/inw/inl/outb/outw/outl and the "string versions" of the same
- * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
- * versions of the single-IO instructions (inb_p/inw_p/..).
- *
- * This file is not meant to be obfuscating: it's just complicated
- * to (a) handle it all in a way that makes gcc able to optimize it
- * as well as possible and (b) trying to avoid writing the same thing
- * over and over again with slight variations and possibly making a
- * mistake somewhere.
- */
-
-/*
- * Thanks to James van Artsdalen for a better timing-fix than
- * the two short jumps: using outb's to a nonexistent port seems
- * to guarantee better timings even on fast machines.
- *
- * On the other hand, I'd like to be sure of a non-existent port:
- * I feel a bit unsafe about using 0x80 (should be safe, though)
- *
- * Linus
- */
-
- /*
- * Bit simplified and optimized by Jan Hubicka
- * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
- *
- * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
- * isa_read[wl] and isa_write[wl] fixed
- * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- */
-
-extern void native_io_delay(void);
-
-extern int io_delay_type;
-extern void io_delay_init(void);
-
-#if defined(CONFIG_PARAVIRT)
-#include <asm/paravirt.h>
-#else
-
-static inline void slow_down_io(void)
-{
- native_io_delay();
-#ifdef REALLY_SLOW_IO
- native_io_delay();
- native_io_delay();
- native_io_delay();
-#endif
-}
-#endif
-
-/*
- * Talk about misusing macros..
- */
-#define __OUT1(s, x) \
-static inline void out##s(unsigned x value, unsigned short port) {
-
-#define __OUT2(s, s1, s2) \
-asm volatile ("out" #s " %" s1 "0,%" s2 "1"
-
-#ifndef REALLY_SLOW_IO
-#define REALLY_SLOW_IO
-#define UNSET_REALLY_SLOW_IO
-#endif
-
-#define __OUT(s, s1, x) \
- __OUT1(s, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \
- } \
- __OUT1(s##_p, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \
- slow_down_io(); \
-}
-
-#define __IN1(s) \
-static inline RETURN_TYPE in##s(unsigned short port) \
-{ \
- RETURN_TYPE _v;
-
-#define __IN2(s, s1, s2) \
- asm volatile ("in" #s " %" s2 "1,%" s1 "0"
-
-#define __IN(s, s1, i...) \
- __IN1(s) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); \
- return _v; \
- } \
- __IN1(s##_p) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); \
- slow_down_io(); \
- return _v; }
-
-#ifdef UNSET_REALLY_SLOW_IO
-#undef REALLY_SLOW_IO
-#endif
-
-#define __INS(s) \
-static inline void ins##s(unsigned short port, void *addr, \
- unsigned long count) \
-{ \
- asm volatile ("rep ; ins" #s \
- : "=D" (addr), "=c" (count) \
- : "d" (port), "0" (addr), "1" (count)); \
-}
-
-#define __OUTS(s) \
-static inline void outs##s(unsigned short port, const void *addr, \
- unsigned long count) \
-{ \
- asm volatile ("rep ; outs" #s \
- : "=S" (addr), "=c" (count) \
- : "d" (port), "0" (addr), "1" (count)); \
-}
-
-#define RETURN_TYPE unsigned char
-__IN(b, "")
-#undef RETURN_TYPE
-#define RETURN_TYPE unsigned short
-__IN(w, "")
-#undef RETURN_TYPE
-#define RETURN_TYPE unsigned int
-__IN(l, "")
-#undef RETURN_TYPE
-
-__OUT(b, "b", char)
-__OUT(w, "w", short)
-__OUT(l, , int)
-
-__INS(b)
-__INS(w)
-__INS(l)
-
-__OUTS(b)
-__OUTS(w)
-__OUTS(l)
-
-#if defined(__KERNEL__) && defined(__x86_64__)
-
-#include <linux/vmalloc.h>
-
-#include <asm-generic/iomap.h>
-
-void __memcpy_fromio(void *, unsigned long, unsigned);
-void __memcpy_toio(unsigned long, const void *, unsigned);
-
-static inline void memcpy_fromio(void *to, const volatile void __iomem *from,
- unsigned len)
-{
- __memcpy_fromio(to, (unsigned long)from, len);
-}
-
-static inline void memcpy_toio(volatile void __iomem *to, const void *from,
- unsigned len)
-{
- __memcpy_toio((unsigned long)to, from, len);
-}
-
-void memset_io(volatile void __iomem *a, int b, size_t c);
-
-/*
- * ISA space is 'always mapped' on a typical x86 system, no need to
- * explicitly ioremap() it. The fact that the ISA IO space is mapped
- * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
- * are physical addresses. The following constant pointer can be
- * used as the IO-area pointer (it can be iounmapped as well, so the
- * analogy with PCI is quite large):
- */
-#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET))
-
-#define flush_write_buffers()
-
-/*
- * Convert a virtual cached pointer to an uncached pointer
- */
-#define xlate_dev_kmem_ptr(p) p
-
-#endif /* __KERNEL__ */
-
-#endif /* _ASM_X86_IO_64_H */
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 330ee807f89e..0d806513c4b3 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_IO_APIC_H
#define _ASM_X86_IO_APIC_H
@@ -5,22 +6,13 @@
#include <asm/mpspec.h>
#include <asm/apicdef.h>
#include <asm/irq_vectors.h>
-
+#include <asm/x86_init.h>
/*
* Intel IO-APIC support for SMP and UP systems.
*
* Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar
*/
-/* I/O Unit Redirection Table */
-#define IO_APIC_REDIR_VECTOR_MASK 0x000FF
-#define IO_APIC_REDIR_DEST_LOGICAL 0x00800
-#define IO_APIC_REDIR_DEST_PHYSICAL 0x00000
-#define IO_APIC_REDIR_SEND_PENDING (1 << 12)
-#define IO_APIC_REDIR_REMOTE_IRR (1 << 14)
-#define IO_APIC_REDIR_LEVEL_TRIGGER (1 << 15)
-#define IO_APIC_REDIR_MASKED (1 << 16)
-
/*
* The structure of the IO-APIC:
*/
@@ -63,48 +55,43 @@ union IO_APIC_reg_03 {
} __attribute__ ((packed)) bits;
};
-enum ioapic_irq_destination_types {
- dest_Fixed = 0,
- dest_LowestPrio = 1,
- dest_SMI = 2,
- dest__reserved_1 = 3,
- dest_NMI = 4,
- dest_INIT = 5,
- dest__reserved_2 = 6,
- dest_ExtINT = 7
-};
-
struct IO_APIC_route_entry {
- __u32 vector : 8,
- delivery_mode : 3, /* 000: FIXED
- * 001: lowest prio
- * 111: ExtINT
- */
- dest_mode : 1, /* 0: physical, 1: logical */
- delivery_status : 1,
- polarity : 1,
- irr : 1,
- trigger : 1, /* 0: edge, 1: level */
- mask : 1, /* 0: enabled, 1: disabled */
- __reserved_2 : 15;
-
- __u32 __reserved_3 : 24,
- dest : 8;
+ union {
+ struct {
+ u64 vector : 8,
+ delivery_mode : 3,
+ dest_mode_logical : 1,
+ delivery_status : 1,
+ active_low : 1,
+ irr : 1,
+ is_level : 1,
+ masked : 1,
+ reserved_0 : 15,
+ reserved_1 : 17,
+ virt_destid_8_14 : 7,
+ destid_0_7 : 8;
+ };
+ struct {
+ u64 ir_shared_0 : 8,
+ ir_zero : 3,
+ ir_index_15 : 1,
+ ir_shared_1 : 5,
+ ir_reserved_0 : 31,
+ ir_format : 1,
+ ir_index_0_14 : 15;
+ };
+ struct {
+ u64 w1 : 32,
+ w2 : 32;
+ };
+ };
} __attribute__ ((packed));
-struct IR_IO_APIC_route_entry {
- __u64 vector : 8,
- zero : 3,
- index2 : 1,
- delivery_status : 1,
- polarity : 1,
- irr : 1,
- trigger : 1,
- mask : 1,
- reserved : 31,
- format : 1,
- index : 15;
-} __attribute__ ((packed));
+struct irq_alloc_info;
+struct ioapic_domain_cfg;
+
+#define IOAPIC_MAP_ALLOC 0x1
+#define IOAPIC_MAP_CHECK 0x2
#ifdef CONFIG_X86_IO_APIC
@@ -112,12 +99,9 @@ struct IR_IO_APIC_route_entry {
* # of IO-APICs and # of IRQ routing registers
*/
extern int nr_ioapics;
-extern int nr_ioapic_registers[MAX_IO_APICS];
-
-#define MP_MAX_IOAPIC_PIN 127
-/* I/O APIC entries */
-extern struct mpc_ioapic mp_ioapics[MAX_IO_APICS];
+extern int mpc_ioapic_id(int ioapic);
+extern unsigned int mpc_ioapic_addr(int ioapic);
/* # of MP IRQ source entries */
extern int mp_irq_entries;
@@ -125,14 +109,8 @@ extern int mp_irq_entries;
/* MP IRQ source entries */
extern struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
-/* non-0 if default (table-less) MP configuration */
-extern int mpc_default_type;
-
-/* Older SiS APIC requires we rewrite the index register */
-extern int sis_apic_bug;
-
-/* 1 if "noapic" boot option passed */
-extern int skip_ioapic_setup;
+/* True if "noapic" boot option passed */
+extern bool ioapic_is_disabled;
/* 1 if "noapic" boot option passed */
extern int noioapicquirk;
@@ -140,50 +118,101 @@ extern int noioapicquirk;
/* -1 if "noapic" boot option passed */
extern int noioapicreroute;
-/* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */
-extern int timer_through_8259;
+extern u32 gsi_top;
+
+extern unsigned long io_apic_irqs;
+
+#define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1 << (x)) & io_apic_irqs))
/*
* If we use the IO-APIC for IRQ routing, disable automatic
* assignment of PCI IRQ's.
*/
#define io_apic_assign_pci_irqs \
- (mp_irq_entries && !skip_ioapic_setup && io_apic_irqs)
-
-#ifdef CONFIG_ACPI
-extern int io_apic_get_unique_id(int ioapic, int apic_id);
-extern int io_apic_get_version(int ioapic);
-extern int io_apic_get_redir_entries(int ioapic);
-#endif /* CONFIG_ACPI */
-
-struct io_apic_irq_attr;
-extern int io_apic_set_pci_routing(struct device *dev, int irq,
- struct io_apic_irq_attr *irq_attr);
-extern int (*ioapic_renumber_irq)(int ioapic, int irq);
-extern void ioapic_init_mappings(void);
-extern void ioapic_insert_resources(void);
+ (mp_irq_entries && !ioapic_is_disabled && io_apic_irqs)
-extern struct IO_APIC_route_entry **alloc_ioapic_entries(void);
-extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries);
-extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
-extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
-extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
-
-extern void probe_nr_irqs_gsi(void);
-
-extern int setup_ioapic_entry(int apic, int irq,
- struct IO_APIC_route_entry *entry,
- unsigned int destination, int trigger,
- int polarity, int vector, int pin);
-extern void ioapic_write_entry(int apic, int pin,
- struct IO_APIC_route_entry e);
+struct irq_cfg;
+extern void ioapic_insert_resources(void);
+extern int arch_early_ioapic_init(void);
+
+extern int save_ioapic_entries(void);
+extern void mask_ioapic_entries(void);
+extern int restore_ioapic_entries(void);
+
+extern void setup_ioapic_ids_from_mpc(void);
+
+extern int mp_find_ioapic(u32 gsi);
+extern int mp_find_ioapic_pin(int ioapic, u32 gsi);
+extern int mp_map_gsi_to_irq(u32 gsi, unsigned int flags,
+ struct irq_alloc_info *info);
+extern void mp_unmap_irq(int irq);
+extern int mp_register_ioapic(int id, u32 address, u32 gsi_base,
+ struct ioapic_domain_cfg *cfg);
+extern int mp_unregister_ioapic(u32 gsi_base);
+extern int mp_ioapic_registered(u32 gsi_base);
+
+extern void ioapic_set_alloc_attr(struct irq_alloc_info *info,
+ int node, int trigger, int polarity);
+
+extern void mp_save_irq(struct mpc_intsrc *m);
+
+extern void disable_ioapic_support(void);
+
+extern void __init io_apic_init_mappings(void);
+extern unsigned int native_io_apic_read(unsigned int apic, unsigned int reg);
+extern void native_restore_boot_irq_mode(void);
+
+static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
+{
+ return x86_apic_ops.io_apic_read(apic, reg);
+}
+
+extern void setup_IO_APIC(void);
+extern void enable_IO_APIC(void);
+extern void clear_IO_APIC(void);
+extern void restore_boot_irq_mode(void);
+extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin);
+extern void print_IO_APICs(void);
#else /* !CONFIG_X86_IO_APIC */
+
+#define IO_APIC_IRQ(x) 0
#define io_apic_assign_pci_irqs 0
-static const int timer_through_8259 = 0;
-static inline void ioapic_init_mappings(void) { }
+#define setup_ioapic_ids_from_mpc x86_init_noop
+#define nr_ioapics (0)
static inline void ioapic_insert_resources(void) { }
+static inline int arch_early_ioapic_init(void) { return 0; }
+static inline void print_IO_APICs(void) {}
+#define gsi_top (NR_IRQS_LEGACY)
+static inline int mp_find_ioapic(u32 gsi) { return 0; }
+static inline int mp_map_gsi_to_irq(u32 gsi, unsigned int flags,
+ struct irq_alloc_info *info)
+{
+ return gsi;
+}
+
+static inline void mp_unmap_irq(int irq) { }
+
+static inline int save_ioapic_entries(void)
+{
+ return -ENOMEM;
+}
+
+static inline void mask_ioapic_entries(void) { }
+static inline int restore_ioapic_entries(void)
+{
+ return -ENOMEM;
+}
+
+static inline void mp_save_irq(struct mpc_intsrc *m) { }
+static inline void disable_ioapic_support(void) { }
+static inline void io_apic_init_mappings(void) { }
+#define native_io_apic_read NULL
+#define native_restore_boot_irq_mode NULL
+
+static inline void setup_IO_APIC(void) { }
+static inline void enable_IO_APIC(void) { }
+static inline void restore_boot_irq_mode(void) { }
-static inline void probe_nr_irqs_gsi(void) { }
#endif
#endif /* _ASM_X86_IO_APIC_H */
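
To illustrate the new unionised route-entry layout (a sketch, not from the patch; the field values are arbitrary), an edge-triggered, active-high, initially masked entry with fixed delivery can be composed field by field:

#include <asm/io_apic.h>

static struct IO_APIC_route_entry demo_route_entry(u8 vector, u8 dest)
{
	struct IO_APIC_route_entry entry = { };

	entry.vector		= vector;
	entry.masked		= 1;	/* unmask once the handler is in place */
	entry.is_level		= 0;	/* edge triggered */
	entry.active_low	= 0;
	entry.destid_0_7	= dest;

	return entry;
}
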
diff --git a/arch/x86/include/asm/io_bitmap.h b/arch/x86/include/asm/io_bitmap.h
new file mode 100644
index 000000000000..7f080f5c7def
--- /dev/null
+++ b/arch/x86/include/asm/io_bitmap.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_IOBITMAP_H
+#define _ASM_X86_IOBITMAP_H
+
+#include <linux/refcount.h>
+#include <asm/processor.h>
+
+struct io_bitmap {
+ u64 sequence;
+ refcount_t refcnt;
+ /* The maximum number of bytes to copy so all zero bits are covered */
+ unsigned int max;
+ unsigned long bitmap[IO_BITMAP_LONGS];
+};
+
+struct task_struct;
+
+#ifdef CONFIG_X86_IOPL_IOPERM
+void io_bitmap_share(struct task_struct *tsk);
+void io_bitmap_exit(struct task_struct *tsk);
+
+static inline void native_tss_invalidate_io_bitmap(void)
+{
+ /*
+ * Invalidate the I/O bitmap by moving io_bitmap_base outside the
+ * TSS limit so any subsequent I/O access from user space will
+ * trigger a #GP.
+ *
+ * This is correct even when VMEXIT rewrites the TSS limit
+ * to 0x67 as the only requirement is that the base points
+ * outside the limit.
+ */
+ this_cpu_write(cpu_tss_rw.x86_tss.io_bitmap_base,
+ IO_BITMAP_OFFSET_INVALID);
+}
+
+void native_tss_update_io_bitmap(void);
+
+#ifdef CONFIG_PARAVIRT_XXL
+#include <asm/paravirt.h>
+#else
+#define tss_update_io_bitmap native_tss_update_io_bitmap
+#define tss_invalidate_io_bitmap native_tss_invalidate_io_bitmap
+#endif
+
+#else
+static inline void io_bitmap_share(struct task_struct *tsk) { }
+static inline void io_bitmap_exit(struct task_struct *tsk) { }
+static inline void tss_update_io_bitmap(void) { }
+#endif
+
+#endif
diff --git a/arch/x86/include/asm/ioctl.h b/arch/x86/include/asm/ioctl.h
deleted file mode 100644
index b279fe06dfe5..000000000000
--- a/arch/x86/include/asm/ioctl.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/ioctl.h>
diff --git a/arch/x86/include/asm/ioctls.h b/arch/x86/include/asm/ioctls.h
deleted file mode 100644
index 0d5b23b7b06e..000000000000
--- a/arch/x86/include/asm/ioctls.h
+++ /dev/null
@@ -1,94 +0,0 @@
-#ifndef _ASM_X86_IOCTLS_H
-#define _ASM_X86_IOCTLS_H
-
-#include <asm/ioctl.h>
-
-/* 0x54 is just a magic number to make these relatively unique ('T') */
-
-#define TCGETS 0x5401
-#define TCSETS 0x5402 /* Clashes with SNDCTL_TMR_START sound ioctl */
-#define TCSETSW 0x5403
-#define TCSETSF 0x5404
-#define TCGETA 0x5405
-#define TCSETA 0x5406
-#define TCSETAW 0x5407
-#define TCSETAF 0x5408
-#define TCSBRK 0x5409
-#define TCXONC 0x540A
-#define TCFLSH 0x540B
-#define TIOCEXCL 0x540C
-#define TIOCNXCL 0x540D
-#define TIOCSCTTY 0x540E
-#define TIOCGPGRP 0x540F
-#define TIOCSPGRP 0x5410
-#define TIOCOUTQ 0x5411
-#define TIOCSTI 0x5412
-#define TIOCGWINSZ 0x5413
-#define TIOCSWINSZ 0x5414
-#define TIOCMGET 0x5415
-#define TIOCMBIS 0x5416
-#define TIOCMBIC 0x5417
-#define TIOCMSET 0x5418
-#define TIOCGSOFTCAR 0x5419
-#define TIOCSSOFTCAR 0x541A
-#define FIONREAD 0x541B
-#define TIOCINQ FIONREAD
-#define TIOCLINUX 0x541C
-#define TIOCCONS 0x541D
-#define TIOCGSERIAL 0x541E
-#define TIOCSSERIAL 0x541F
-#define TIOCPKT 0x5420
-#define FIONBIO 0x5421
-#define TIOCNOTTY 0x5422
-#define TIOCSETD 0x5423
-#define TIOCGETD 0x5424
-#define TCSBRKP 0x5425 /* Needed for POSIX tcsendbreak() */
-/* #define TIOCTTYGSTRUCT 0x5426 - Former debugging-only ioctl */
-#define TIOCSBRK 0x5427 /* BSD compatibility */
-#define TIOCCBRK 0x5428 /* BSD compatibility */
-#define TIOCGSID 0x5429 /* Return the session ID of FD */
-#define TCGETS2 _IOR('T', 0x2A, struct termios2)
-#define TCSETS2 _IOW('T', 0x2B, struct termios2)
-#define TCSETSW2 _IOW('T', 0x2C, struct termios2)
-#define TCSETSF2 _IOW('T', 0x2D, struct termios2)
-#define TIOCGRS485 0x542E
-#define TIOCSRS485 0x542F
-#define TIOCGPTN _IOR('T', 0x30, unsigned int)
- /* Get Pty Number (of pty-mux device) */
-#define TIOCSPTLCK _IOW('T', 0x31, int) /* Lock/unlock Pty */
-#define TCGETX 0x5432 /* SYS5 TCGETX compatibility */
-#define TCSETX 0x5433
-#define TCSETXF 0x5434
-#define TCSETXW 0x5435
-
-#define FIONCLEX 0x5450
-#define FIOCLEX 0x5451
-#define FIOASYNC 0x5452
-#define TIOCSERCONFIG 0x5453
-#define TIOCSERGWILD 0x5454
-#define TIOCSERSWILD 0x5455
-#define TIOCGLCKTRMIOS 0x5456
-#define TIOCSLCKTRMIOS 0x5457
-#define TIOCSERGSTRUCT 0x5458 /* For debugging only */
-#define TIOCSERGETLSR 0x5459 /* Get line status register */
-#define TIOCSERGETMULTI 0x545A /* Get multiport config */
-#define TIOCSERSETMULTI 0x545B /* Set multiport config */
-
-#define TIOCMIWAIT 0x545C /* wait for a change on serial input line(s) */
-#define TIOCGICOUNT 0x545D /* read serial port inline interrupt counts */
-#define TIOCGHAYESESP 0x545E /* Get Hayes ESP configuration */
-#define TIOCSHAYESESP 0x545F /* Set Hayes ESP configuration */
-#define FIOQSIZE 0x5460
-
-/* Used for packet mode */
-#define TIOCPKT_DATA 0
-#define TIOCPKT_FLUSHREAD 1
-#define TIOCPKT_FLUSHWRITE 2
-#define TIOCPKT_STOP 4
-#define TIOCPKT_START 8
-#define TIOCPKT_NOSTOP 16
-#define TIOCPKT_DOSTOP 32
-
-#define TIOCSER_TEMT 0x01 /* Transmitter physically empty */
-
-#endif /* _ASM_X86_IOCTLS_H */
diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h
index 0e9fe1d9d971..e2de092fc38c 100644
--- a/arch/x86/include/asm/iomap.h
+++ b/arch/x86/include/asm/iomap.h
@@ -1,38 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _ASM_X86_IOMAP_H
#define _ASM_X86_IOMAP_H
/*
* Copyright © 2008 Ingo Molnar
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
*/
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
+#include <linux/highmem.h>
#include <asm/cacheflush.h>
-#include <asm/pgtable.h>
#include <asm/tlbflush.h>
-int
-is_io_mapping_possible(resource_size_t base, unsigned long size);
+void __iomem *__iomap_local_pfn_prot(unsigned long pfn, pgprot_t prot);
-void *
-iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
+int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot);
-void
-iounmap_atomic(void *kvaddr, enum km_type type);
+void iomap_free(resource_size_t base, unsigned long size);
#endif /* _ASM_X86_IOMAP_H */
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index fd6d21bbee6c..3be2451e7bc8 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -1,14 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_IOMMU_H
#define _ASM_X86_IOMMU_H
-extern void pci_iommu_shutdown(void);
-extern void no_iommu_init(void);
-extern struct dma_map_ops nommu_dma_ops;
+#include <linux/acpi.h>
+
+#include <asm/e820/api.h>
+
extern int force_iommu, no_iommu;
extern int iommu_detected;
-extern int iommu_pass_through;
+extern int iommu_merge;
+extern int panic_on_overflow;
+extern bool amd_iommu_snp_en;
+
+#ifdef CONFIG_SWIOTLB
+extern bool x86_swiotlb_enable;
+#else
+#define x86_swiotlb_enable false
+#endif
/* 10 seconds */
#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000)
+static inline int __init
+arch_rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
+{
+ u64 start = rmrr->base_address;
+ u64 end = rmrr->end_address + 1;
+ int entry_type;
+
+ entry_type = e820__get_entry_type(start, end);
+ if (entry_type == E820_TYPE_RESERVED || entry_type == E820_TYPE_NVS)
+ return 0;
+
+ pr_err(FW_BUG "No firmware reserved region can cover this RMRR [%#018Lx-%#018Lx], contact BIOS vendor for fixes\n",
+ start, end - 1);
+ return -EINVAL;
+}
+
#endif /* _ASM_X86_IOMMU_H */
diff --git a/arch/x86/include/asm/iosf_mbi.h b/arch/x86/include/asm/iosf_mbi.h
new file mode 100644
index 000000000000..8ace6559d399
--- /dev/null
+++ b/arch/x86/include/asm/iosf_mbi.h
@@ -0,0 +1,246 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Intel OnChip System Fabric MailBox access support
+ */
+
+#ifndef IOSF_MBI_SYMS_H
+#define IOSF_MBI_SYMS_H
+
+#include <linux/notifier.h>
+
+#define MBI_MCR_OFFSET 0xD0
+#define MBI_MDR_OFFSET 0xD4
+#define MBI_MCRX_OFFSET 0xD8
+
+#define MBI_RD_MASK 0xFEFFFFFF
+#define MBI_WR_MASK 0X01000000
+
+#define MBI_MASK_HI 0xFFFFFF00
+#define MBI_MASK_LO 0x000000FF
+#define MBI_ENABLE 0xF0
+
+/* IOSF SB read/write opcodes */
+#define MBI_MMIO_READ 0x00
+#define MBI_MMIO_WRITE 0x01
+#define MBI_CFG_READ 0x04
+#define MBI_CFG_WRITE 0x05
+#define MBI_CR_READ 0x06
+#define MBI_CR_WRITE 0x07
+#define MBI_REG_READ 0x10
+#define MBI_REG_WRITE 0x11
+#define MBI_ESRAM_READ 0x12
+#define MBI_ESRAM_WRITE 0x13
+
+/* Baytrail available units */
+#define BT_MBI_UNIT_AUNIT 0x00
+#define BT_MBI_UNIT_SMC 0x01
+#define BT_MBI_UNIT_CPU 0x02
+#define BT_MBI_UNIT_BUNIT 0x03
+#define BT_MBI_UNIT_PMC 0x04
+#define BT_MBI_UNIT_GFX 0x06
+#define BT_MBI_UNIT_SMI 0x0C
+#define BT_MBI_UNIT_CCK 0x14
+#define BT_MBI_UNIT_USB 0x43
+#define BT_MBI_UNIT_SATA 0xA3
+#define BT_MBI_UNIT_PCIE 0xA6
+
+/* Quark available units */
+#define QRK_MBI_UNIT_HBA 0x00
+#define QRK_MBI_UNIT_HB 0x03
+#define QRK_MBI_UNIT_RMU 0x04
+#define QRK_MBI_UNIT_MM 0x05
+#define QRK_MBI_UNIT_SOC 0x31
+
+/* Action values for the pmic_bus_access_notifier functions */
+#define MBI_PMIC_BUS_ACCESS_BEGIN 1
+#define MBI_PMIC_BUS_ACCESS_END 2
+
+#if IS_ENABLED(CONFIG_IOSF_MBI)
+
+bool iosf_mbi_available(void);
+
+/**
+ * iosf_mbi_read() - MailBox Interface read command
+ * @port: port indicating subunit being accessed
+ * @opcode: port specific read or write opcode
+ * @offset: register address offset
+ * @mdr: register data to be read
+ *
+ * Locking is handled by spinlock - cannot sleep.
+ * Return: Nonzero on error
+ */
+int iosf_mbi_read(u8 port, u8 opcode, u32 offset, u32 *mdr);
+
+/**
+ * iosf_mbi_write() - MailBox unmasked write command
+ * @port: port indicating subunit being accessed
+ * @opcode: port specific read or write opcode
+ * @offset: register address offset
+ * @mdr: register data to be written
+ *
+ * Locking is handled by spinlock - cannot sleep.
+ * Return: Nonzero on error
+ */
+int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr);
+
+/**
+ * iosf_mbi_modify() - MailBox masked write command
+ * @port: port indicating subunit being accessed
+ * @opcode: port specific read or write opcode
+ * @offset: register address offset
+ * @mdr: register data being modified
+ * @mask: mask indicating bits in mdr to be modified
+ *
+ * Locking is handled by spinlock - cannot sleep.
+ * Return: Nonzero on error
+ */
+int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask);
+
+/**
+ * iosf_mbi_punit_acquire() - Acquire access to the P-Unit
+ *
+ * On some systems the P-Unit accesses the PMIC to change various voltages
+ * through the same bus as other kernel drivers use for e.g. battery monitoring.
+ *
+ * If a driver sends requests to the P-Unit which require the P-Unit to access
+ * the PMIC bus while another driver is also accessing the PMIC bus various bad
+ * things happen.
+ *
+ * Call this function before sending requests to the P-Unit which may make it
+ * access the PMIC, be it through iosf_mbi* functions or through other means.
+ * This function will block all kernel access to the PMIC I2C bus, so that the
+ * P-Unit can safely access the PMIC over the shared I2C bus.
+ *
+ * Note that on these systems the i2c-bus driver will request a semaphore from
+ * the P-Unit for exclusive access to the PMIC bus when i2c drivers are
+ * accessing it, but this does not appear to be sufficient; we still need to
+ * avoid making certain P-Unit requests during the access window.
+ *
+ * This function locks a mutex, as such it may sleep.
+ */
+void iosf_mbi_punit_acquire(void);
+
+/**
+ * iosf_mbi_punit_release() - Release access to the P-Unit
+ */
+void iosf_mbi_punit_release(void);
+
+/**
+ * iosf_mbi_block_punit_i2c_access() - Block P-Unit accesses to the PMIC bus
+ *
+ * Call this function to block P-Unit access to the PMIC I2C bus, so that the
+ * kernel can safely access the PMIC over the shared I2C bus.
+ *
+ * This function acquires the P-Unit bus semaphore and notifies
+ * pmic_bus_access_notifier listeners that they may no longer access the
+ * P-Unit in a way which may cause it to access the shared I2C bus.
+ *
+ * Note this function may be called multiple times and the bus will not
+ * be released until iosf_mbi_unblock_punit_i2c_access() has been called the
+ * same number of times.
+ *
+ * Return: Nonzero on error
+ */
+int iosf_mbi_block_punit_i2c_access(void);
+
+/*
+ * iosf_mbi_unblock_punit_i2c_access() - Release PMIC I2C bus block
+ *
+ * Release i2c access block gotten through iosf_mbi_block_punit_i2c_access().
+ */
+void iosf_mbi_unblock_punit_i2c_access(void);
+
+/**
+ * iosf_mbi_register_pmic_bus_access_notifier - Register PMIC bus notifier
+ *
+ * This function can be used by drivers which may need to acquire P-Unit
+ * managed resources from interrupt context, where iosf_mbi_punit_acquire()
+ * can not be used.
+ *
+ * This function allows a driver to register a notifier to get notified (in a
+ * process context) before other drivers start accessing the PMIC bus.
+ *
+ * This allows the driver to acquire, beforehand, any resources which it may
+ * need during the window in which the other driver is accessing the PMIC.
+ *
+ * @nb: notifier_block to register
+ */
+int iosf_mbi_register_pmic_bus_access_notifier(struct notifier_block *nb);
+
+/**
+ * iosf_mbi_unregister_pmic_bus_access_notifier_unlocked - Unregister PMIC bus
+ * notifier, unlocked
+ *
+ * Like iosf_mbi_unregister_pmic_bus_access_notifier(), but for use when the
+ * caller has already called iosf_mbi_punit_acquire() itself.
+ *
+ * @nb: notifier_block to unregister
+ */
+int iosf_mbi_unregister_pmic_bus_access_notifier_unlocked(
+ struct notifier_block *nb);
+
+/**
+ * iosf_mbi_assert_punit_acquired - Assert that the P-Unit has been acquired.
+ */
+void iosf_mbi_assert_punit_acquired(void);
+
+#else /* CONFIG_IOSF_MBI is not enabled */
+static inline
+bool iosf_mbi_available(void)
+{
+ return false;
+}
+
+static inline
+int iosf_mbi_read(u8 port, u8 opcode, u32 offset, u32 *mdr)
+{
+ WARN(1, "IOSF_MBI driver not available");
+ return -EPERM;
+}
+
+static inline
+int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr)
+{
+ WARN(1, "IOSF_MBI driver not available");
+ return -EPERM;
+}
+
+static inline
+int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask)
+{
+ WARN(1, "IOSF_MBI driver not available");
+ return -EPERM;
+}
+
+static inline void iosf_mbi_punit_acquire(void) {}
+static inline void iosf_mbi_punit_release(void) {}
+
+static inline
+int iosf_mbi_register_pmic_bus_access_notifier(struct notifier_block *nb)
+{
+ return 0;
+}
+
+static inline
+int iosf_mbi_unregister_pmic_bus_access_notifier(struct notifier_block *nb)
+{
+ return 0;
+}
+
+static inline int
+iosf_mbi_unregister_pmic_bus_access_notifier_unlocked(struct notifier_block *nb)
+{
+ return 0;
+}
+
+static inline
+int iosf_mbi_call_pmic_bus_access_notifier_chain(unsigned long val, void *v)
+{
+ return 0;
+}
+
+static inline void iosf_mbi_assert_punit_acquired(void) {}
+
+#endif /* CONFIG_IOSF_MBI */
+
+#endif /* IOSF_MBI_SYMS_H */
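
A sketch of the intended calling pattern for the MBI accessors together with the P-Unit serialisation helpers (the unit, opcode, register offset and mask below are made-up values, and the function name is invented):

#include <linux/errno.h>
#include <asm/iosf_mbi.h>

static int demo_set_pmc_bit(void)
{
	int ret;

	if (!iosf_mbi_available())
		return -ENODEV;

	iosf_mbi_punit_acquire();
	ret = iosf_mbi_modify(BT_MBI_UNIT_PMC, MBI_REG_WRITE,
			      0x3c /* made-up register offset */, 0x1, 0x1);
	iosf_mbi_punit_release();

	return ret;
}
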
diff --git a/arch/x86/include/asm/ipcbuf.h b/arch/x86/include/asm/ipcbuf.h
deleted file mode 100644
index ee678fd51594..000000000000
--- a/arch/x86/include/asm/ipcbuf.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#ifndef _ASM_X86_IPCBUF_H
-#define _ASM_X86_IPCBUF_H
-
-/*
- * The ipc64_perm structure for x86 architecture.
- * Note extra padding because this structure is passed back and forth
- * between kernel and user space.
- *
- * Pad space is left for:
- * - 32-bit mode_t and seq
- * - 2 miscellaneous 32-bit values
- */
-
-struct ipc64_perm {
- __kernel_key_t key;
- __kernel_uid32_t uid;
- __kernel_gid32_t gid;
- __kernel_uid32_t cuid;
- __kernel_gid32_t cgid;
- __kernel_mode_t mode;
- unsigned short __pad1;
- unsigned short seq;
- unsigned short __pad2;
- unsigned long __unused1;
- unsigned long __unused2;
-};
-
-#endif /* _ASM_X86_IPCBUF_H */
diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h
deleted file mode 100644
index 0b7228268a63..000000000000
--- a/arch/x86/include/asm/ipi.h
+++ /dev/null
@@ -1,162 +0,0 @@
-#ifndef _ASM_X86_IPI_H
-#define _ASM_X86_IPI_H
-
-#ifdef CONFIG_X86_LOCAL_APIC
-
-/*
- * Copyright 2004 James Cleverdon, IBM.
- * Subject to the GNU Public License, v.2
- *
- * Generic APIC InterProcessor Interrupt code.
- *
- * Moved to include file by James Cleverdon from
- * arch/x86-64/kernel/smp.c
- *
- * Copyrights from kernel/smp.c:
- *
- * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
- * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
- * (c) 2002,2003 Andi Kleen, SuSE Labs.
- * Subject to the GNU Public License, v.2
- */
-
-#include <asm/hw_irq.h>
-#include <asm/apic.h>
-#include <asm/smp.h>
-
-/*
- * the following functions deal with sending IPIs between CPUs.
- *
- * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
- */
-
-static inline unsigned int __prepare_ICR(unsigned int shortcut, int vector,
- unsigned int dest)
-{
- unsigned int icr = shortcut | dest;
-
- switch (vector) {
- default:
- icr |= APIC_DM_FIXED | vector;
- break;
- case NMI_VECTOR:
- icr |= APIC_DM_NMI;
- break;
- }
- return icr;
-}
-
-static inline int __prepare_ICR2(unsigned int mask)
-{
- return SET_APIC_DEST_FIELD(mask);
-}
-
-static inline void __xapic_wait_icr_idle(void)
-{
- while (native_apic_mem_read(APIC_ICR) & APIC_ICR_BUSY)
- cpu_relax();
-}
-
-static inline void
-__default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest)
-{
- /*
- * Subtle. In the case of the 'never do double writes' workaround
- * we have to lock out interrupts to be safe. As we don't care
- * of the value read we use an atomic rmw access to avoid costly
- * cli/sti. Otherwise we use an even cheaper single atomic write
- * to the APIC.
- */
- unsigned int cfg;
-
- /*
- * Wait for idle.
- */
- __xapic_wait_icr_idle();
-
- /*
- * No need to touch the target chip field
- */
- cfg = __prepare_ICR(shortcut, vector, dest);
-
- /*
- * Send the IPI. The write to APIC_ICR fires this off.
- */
- native_apic_mem_write(APIC_ICR, cfg);
-}
-
-/*
- * This is used to send an IPI with no shorthand notation (the destination is
- * specified in bits 56 to 63 of the ICR).
- */
-static inline void
- __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest)
-{
- unsigned long cfg;
-
- /*
- * Wait for idle.
- */
- if (unlikely(vector == NMI_VECTOR))
- safe_apic_wait_icr_idle();
- else
- __xapic_wait_icr_idle();
-
- /*
- * prepare target chip field
- */
- cfg = __prepare_ICR2(mask);
- native_apic_mem_write(APIC_ICR2, cfg);
-
- /*
- * program the ICR
- */
- cfg = __prepare_ICR(0, vector, dest);
-
- /*
- * Send the IPI. The write to APIC_ICR fires this off.
- */
- native_apic_mem_write(APIC_ICR, cfg);
-}
-
-extern void default_send_IPI_mask_sequence_phys(const struct cpumask *mask,
- int vector);
-extern void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
- int vector);
-extern void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
- int vector);
-extern void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
- int vector);
-
-/* Avoid include hell */
-#define NMI_VECTOR 0x02
-
-extern int no_broadcast;
-
-static inline void __default_local_send_IPI_allbutself(int vector)
-{
- if (no_broadcast || vector == NMI_VECTOR)
- apic->send_IPI_mask_allbutself(cpu_online_mask, vector);
- else
- __default_send_IPI_shortcut(APIC_DEST_ALLBUT, vector, apic->dest_logical);
-}
-
-static inline void __default_local_send_IPI_all(int vector)
-{
- if (no_broadcast || vector == NMI_VECTOR)
- apic->send_IPI_mask(cpu_online_mask, vector);
- else
- __default_send_IPI_shortcut(APIC_DEST_ALLINC, vector, apic->dest_logical);
-}
-
-#ifdef CONFIG_X86_32
-extern void default_send_IPI_mask_logical(const struct cpumask *mask,
- int vector);
-extern void default_send_IPI_allbutself(int vector);
-extern void default_send_IPI_all(int vector);
-extern void default_send_IPI_self(int vector);
-#endif
-
-#endif
-
-#endif /* _ASM_X86_IPI_H */
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index f38481bcd455..194dfff84cb1 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_IRQ_H
#define _ASM_X86_IRQ_H
/*
@@ -10,41 +11,40 @@
#include <asm/apicdef.h>
#include <asm/irq_vectors.h>
+/*
+ * The irq entry code is in the noinstr section and the start/end of
+ * __irqentry_text is emitted via labels. Make the build fail if
+ * something moves a C function into the __irq_entry section.
+ */
+#define __irq_entry __invalid_section
+
static inline int irq_canonicalize(int irq)
{
return ((irq == 2) ? 9 : irq);
}
-#ifdef CONFIG_X86_LOCAL_APIC
-# define ARCH_HAS_NMI_WATCHDOG
-#endif
+extern int irq_init_percpu_irqstack(unsigned int cpu);
-#ifdef CONFIG_4KSTACKS
- extern void irq_ctx_init(int cpu);
- extern void irq_ctx_exit(int cpu);
-# define __ARCH_HAS_DO_SOFTIRQ
-#else
-# define irq_ctx_init(cpu) do { } while (0)
-# define irq_ctx_exit(cpu) do { } while (0)
-# ifdef CONFIG_X86_64
-# define __ARCH_HAS_DO_SOFTIRQ
-# endif
-#endif
+struct irq_desc;
-#ifdef CONFIG_HOTPLUG_CPU
-#include <linux/cpumask.h>
extern void fixup_irqs(void);
+
+#if IS_ENABLED(CONFIG_KVM)
+extern void kvm_set_posted_intr_wakeup_handler(void (*handler)(void));
#endif
-extern void (*generic_interrupt_extension)(void);
-extern void init_IRQ(void);
+extern void (*x86_platform_ipi_callback)(void);
extern void native_init_IRQ(void);
-extern bool handle_irq(unsigned irq, struct pt_regs *regs);
-extern unsigned int do_IRQ(struct pt_regs *regs);
+extern void __handle_irq(struct irq_desc *desc, struct pt_regs *regs);
+
+extern void init_ISA_irqs(void);
-/* Interrupt vector management */
-extern DECLARE_BITMAP(used_vectors, NR_VECTORS);
-extern int vector_used_by_percpu_irq(unsigned int vector);
+#ifdef CONFIG_X86_LOCAL_APIC
+void arch_trigger_cpumask_backtrace(const struct cpumask *mask,
+ int exclude_cpu);
+
+#define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace
+#endif
#endif /* _ASM_X86_IRQ_H */
diff --git a/arch/x86/include/asm/irq_regs.h b/arch/x86/include/asm/irq_regs.h
deleted file mode 100644
index 77843225b7ea..000000000000
--- a/arch/x86/include/asm/irq_regs.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Per-cpu current frame pointer - the location of the last exception frame on
- * the stack, stored in the per-cpu area.
- *
- * Jeremy Fitzhardinge <jeremy@goop.org>
- */
-#ifndef _ASM_X86_IRQ_REGS_H
-#define _ASM_X86_IRQ_REGS_H
-
-#include <asm/percpu.h>
-
-#define ARCH_HAS_OWN_IRQ_REGS
-
-DECLARE_PER_CPU(struct pt_regs *, irq_regs);
-
-static inline struct pt_regs *get_irq_regs(void)
-{
- return percpu_read(irq_regs);
-}
-
-static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs)
-{
- struct pt_regs *old_regs;
-
- old_regs = get_irq_regs();
- percpu_write(irq_regs, new_regs);
-
- return old_regs;
-}
-
-#endif /* _ASM_X86_IRQ_REGS_32_H */
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index f275e2244505..5a0d42464d44 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -1,6 +1,90 @@
-#ifndef _ASM_X86_IRQ_REMAPPING_H
-#define _ASM_X86_IRQ_REMAPPING_H
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2012 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <joerg.roedel@amd.com>
+ *
+ * This header file contains the interface of the interrupt remapping code to
+ * the x86 interrupt management code.
+ */
-#define IRTE_DEST(dest) ((x2apic_mode) ? dest : dest << 8)
+#ifndef __X86_IRQ_REMAPPING_H
+#define __X86_IRQ_REMAPPING_H
-#endif /* _ASM_X86_IRQ_REMAPPING_H */
+#include <asm/irqdomain.h>
+#include <asm/hw_irq.h>
+#include <asm/io_apic.h>
+
+struct msi_msg;
+struct irq_alloc_info;
+
+enum irq_remap_cap {
+ IRQ_POSTING_CAP = 0,
+};
+
+enum {
+ IRQ_REMAP_XAPIC_MODE,
+ IRQ_REMAP_X2APIC_MODE,
+};
+
+/*
+ * This is mainly used to communicate information back and forth
+ * between SVM and the IOMMU for setting up and tearing down posted
+ * interrupts.
+ */
+struct amd_iommu_pi_data {
+ u64 vapic_addr; /* Physical address of the vCPU's vAPIC. */
+ u32 ga_tag;
+ u32 vector; /* Guest vector of the interrupt */
+ int cpu;
+ bool ga_log_intr;
+ bool is_guest_mode;
+ void *ir_data;
+};
+
+struct intel_iommu_pi_data {
+ u64 pi_desc_addr; /* Physical address of PI Descriptor */
+ u32 vector; /* Guest vector of the interrupt */
+};
+
+#ifdef CONFIG_IRQ_REMAP
+
+extern raw_spinlock_t irq_2_ir_lock;
+
+extern bool irq_remapping_cap(enum irq_remap_cap cap);
+extern void set_irq_remapping_broken(void);
+extern int irq_remapping_prepare(void);
+extern int irq_remapping_enable(void);
+extern void irq_remapping_disable(void);
+extern int irq_remapping_reenable(int);
+extern int irq_remap_enable_fault_handling(void);
+extern void panic_if_irq_remap(const char *msg);
+
+/* Get parent irqdomain for interrupt remapping irqdomain */
+static inline struct irq_domain *arch_get_ir_parent_domain(void)
+{
+ return x86_vector_domain;
+}
+
+extern bool enable_posted_msi;
+
+static inline bool posted_msi_supported(void)
+{
+ return enable_posted_msi && irq_remapping_cap(IRQ_POSTING_CAP);
+}
+
+#else /* CONFIG_IRQ_REMAP */
+
+static inline bool irq_remapping_cap(enum irq_remap_cap cap) { return 0; }
+static inline void set_irq_remapping_broken(void) { }
+static inline int irq_remapping_prepare(void) { return -ENODEV; }
+static inline int irq_remapping_enable(void) { return -ENODEV; }
+static inline void irq_remapping_disable(void) { }
+static inline int irq_remapping_reenable(int eim) { return -ENODEV; }
+static inline int irq_remap_enable_fault_handling(void) { return -ENODEV; }
+
+static inline void panic_if_irq_remap(const char *msg)
+{
+}
+
+#endif /* CONFIG_IRQ_REMAP */
+#endif /* __X86_IRQ_REMAPPING_H */
diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h
new file mode 100644
index 000000000000..8325b79f2ac6
--- /dev/null
+++ b/arch/x86/include/asm/irq_stack.h
@@ -0,0 +1,241 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_IRQ_STACK_H
+#define _ASM_X86_IRQ_STACK_H
+
+#include <linux/ptrace.h>
+#include <linux/objtool.h>
+
+#include <asm/processor.h>
+
+#ifdef CONFIG_X86_64
+
+/*
+ * Macro to inline switching to an interrupt stack and invoking function
+ * calls from there. The following rules apply:
+ *
+ * - Ordering:
+ *
+ *  1. Write the stack pointer into the topmost place of the irq
+ * stack. This ensures that the various unwinders can link back to the
+ * original stack.
+ *
+ * 2. Switch the stack pointer to the top of the irq stack.
+ *
+ * 3. Invoke whatever needs to be done (@asm_call argument)
+ *
+ * 4. Pop the original stack pointer from the top of the irq stack
+ * which brings it back to the original stack where it left off.
+ *
+ * - Function invocation:
+ *
+ * To allow flexible usage of the macro, the actual function code including
+ * the store of the arguments in the call ABI registers is handed in via
+ * the @asm_call argument.
+ *
+ * - Local variables:
+ *
+ * @tos:
+ * The @tos variable holds a pointer to the top of the irq stack and
+ * _must_ be allocated in a non-callee saved register as this is a
+ * restriction coming from objtool.
+ *
+ *	Note that (tos) is in both the input and output constraints to ensure
+ *	that the compiler does not assume that R11 is left untouched in
+ *	case this macro is used in some place where the per-CPU interrupt
+ *	stack pointer is used again afterwards.
+ *
+ * - Function arguments:
+ * The function argument(s), if any, have to be defined in register
+ * variables at the place where this is invoked. Storing the
+ *	argument(s) in the proper register(s) is part of the @asm_call.
+ *
+ * - Constraints:
+ *
+ * The constraints have to be done very carefully because the compiler
+ * does not know about the assembly call.
+ *
+ * output:
+ *    As documented above, the @tos variable is required to be in
+ * the output constraints to make the compiler aware that R11 cannot be
+ * reused after the asm() statement.
+ *
+ * For builds with CONFIG_UNWINDER_FRAME_POINTER, ASM_CALL_CONSTRAINT is
+ *    required as well, as this prevents certain creative GCC variants from
+ * misplacing the ASM code.
+ *
+ * input:
+ * - func:
+ * Immediate, which tells the compiler that the function is referenced.
+ *
+ * - tos:
+ * Register. The actual register is defined by the variable declaration.
+ *
+ * - function arguments:
+ * The constraints are handed in via the 'argconstr' argument list. They
+ * describe the register arguments which are used in @asm_call.
+ *
+ * clobbers:
+ * Function calls can clobber anything except the callee-saved
+ * registers. Tell the compiler.
+ */
+#define call_on_stack(stack, func, asm_call, argconstr...) \
+{ \
+ register void *tos asm("r11"); \
+ \
+ tos = ((void *)(stack)); \
+ \
+ asm_inline volatile( \
+ "movq %%rsp, (%[tos]) \n" \
+ "movq %[tos], %%rsp \n" \
+ \
+ asm_call \
+ \
+ "popq %%rsp \n" \
+ \
+ : "+r" (tos), ASM_CALL_CONSTRAINT \
+ : [__func] "i" (func), [tos] "r" (tos) argconstr \
+ : "cc", "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", \
+ "memory" \
+ ); \
+}
+
+#define ASM_CALL_ARG0 \
+ "1: call %c[__func] \n" \
+ ANNOTATE_REACHABLE(1b) " \n"
+
+#define ASM_CALL_ARG1 \
+ "movq %[arg1], %%rdi \n" \
+ ASM_CALL_ARG0
+
+#define ASM_CALL_ARG2 \
+ "movq %[arg2], %%rsi \n" \
+ ASM_CALL_ARG1
+
+#define ASM_CALL_ARG3 \
+ "movq %[arg3], %%rdx \n" \
+ ASM_CALL_ARG2
+
+#define call_on_irqstack(func, asm_call, argconstr...) \
+ call_on_stack(__this_cpu_read(hardirq_stack_ptr), \
+ func, asm_call, argconstr)
+
+/* Macros to assert type correctness for run_*_on_irqstack macros */
+#define assert_function_type(func, proto) \
+ static_assert(__builtin_types_compatible_p(typeof(&func), proto))
+
+#define assert_arg_type(arg, proto) \
+ static_assert(__builtin_types_compatible_p(typeof(arg), proto))
+
+/*
+ * Macro to invoke system vector and device interrupt C handlers.
+ */
+#define call_on_irqstack_cond(func, regs, asm_call, constr, c_args...) \
+{ \
+ /* \
+ * User mode entry and interrupt on the irq stack do not \
+ * switch stacks. If from user mode the task stack is empty. \
+ */ \
+ if (user_mode(regs) || __this_cpu_read(hardirq_stack_inuse)) { \
+ irq_enter_rcu(); \
+ func(c_args); \
+ irq_exit_rcu(); \
+ } else { \
+ /* \
+ * Mark the irq stack inuse _before_ and unmark _after_ \
+ * switching stacks. Interrupts are disabled in both \
+ * places. Invoke the stack switch macro with the call \
+ * sequence which matches the above direct invocation. \
+ */ \
+ __this_cpu_write(hardirq_stack_inuse, true); \
+ call_on_irqstack(func, asm_call, constr); \
+ __this_cpu_write(hardirq_stack_inuse, false); \
+ } \
+}
+
+/*
+ * Function call sequence for __call_on_irqstack() for system vectors.
+ *
+ * Note that irq_enter_rcu() and irq_exit_rcu() do not use the input
+ * mechanism because these functions are global and cannot be optimized out
+ * when compiling a particular source file which uses one of these macros.
+ *
+ * The argument (regs) does not need to be pushed or stashed in a callee
+ * saved register to be safe vs. the irq_enter_rcu() call because the
+ * clobbers already prevent the compiler from storing it in a callee
+ * clobbered register. As the compiler has to preserve @regs for the final
+ * call to idtentry_exit() anyway, it's likely that it does not cause extra
+ * effort for this asm magic.
+ */
+#define ASM_CALL_SYSVEC \
+ "call irq_enter_rcu \n" \
+ ASM_CALL_ARG1 \
+ "call irq_exit_rcu \n"
+
+#define SYSVEC_CONSTRAINTS , [arg1] "r" (regs)
+
+#define run_sysvec_on_irqstack_cond(func, regs) \
+{ \
+ assert_function_type(func, void (*)(struct pt_regs *)); \
+ assert_arg_type(regs, struct pt_regs *); \
+ \
+ call_on_irqstack_cond(func, regs, ASM_CALL_SYSVEC, \
+ SYSVEC_CONSTRAINTS, regs); \
+}
+
+/*
+ * As in ASM_CALL_SYSVEC above, the clobbers force the compiler to store
+ * @regs and @vector in callee-saved registers.
+ */
+#define ASM_CALL_IRQ \
+ "call irq_enter_rcu \n" \
+ ASM_CALL_ARG2 \
+ "call irq_exit_rcu \n"
+
+#define IRQ_CONSTRAINTS , [arg1] "r" (regs), [arg2] "r" ((unsigned long)vector)
+
+#define run_irq_on_irqstack_cond(func, regs, vector) \
+{ \
+ assert_function_type(func, void (*)(struct pt_regs *, u32)); \
+ assert_arg_type(regs, struct pt_regs *); \
+ assert_arg_type(vector, u32); \
+ \
+ call_on_irqstack_cond(func, regs, ASM_CALL_IRQ, \
+ IRQ_CONSTRAINTS, regs, vector); \
+}
+
+#ifdef CONFIG_SOFTIRQ_ON_OWN_STACK
+/*
+ * Macro to invoke __do_softirq on the irq stack. This is only called from
+ * task context when bottom halves are about to be reenabled and soft
+ * interrupts are pending to be processed. The interrupt stack cannot be in
+ * use here.
+ */
+#define do_softirq_own_stack() \
+{ \
+ __this_cpu_write(hardirq_stack_inuse, true); \
+ call_on_irqstack(__do_softirq, ASM_CALL_ARG0); \
+ __this_cpu_write(hardirq_stack_inuse, false); \
+}
+
+#endif
+
+#else /* CONFIG_X86_64 */
+/* System vector handlers always run on the stack they interrupted. */
+#define run_sysvec_on_irqstack_cond(func, regs) \
+{ \
+ irq_enter_rcu(); \
+ func(regs); \
+ irq_exit_rcu(); \
+}
+
+/* Switches to the irq stack within func() */
+#define run_irq_on_irqstack_cond(func, regs, vector) \
+{ \
+ irq_enter_rcu(); \
+ func(regs, vector); \
+ irq_exit_rcu(); \
+}
+
+#endif /* !CONFIG_X86_64 */
+
+#endif
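
For intuition, the stack-switch ordering documented for call_on_stack() above
(1. store the old stack pointer in the topmost slot of the new stack,
2. switch, 3. call, 4. pop the old pointer back) can be reproduced in a
stand-alone program. The sketch below is illustrative only and is not part of
this patch; the stack size, alignment and function names are assumptions, and
it omits the objtool annotations and per-CPU bookkeeping the real macro needs.

#include <stdio.h>
#include <stdlib.h>

static void on_other_stack(void)
{
	int marker;

	/* The address of a local shows we are running on the new stack. */
	printf("running near %p\n", (void *)&marker);
}

int main(void)
{
	char *stack = malloc(64 * 1024);	/* replacement stack */
	void *tos;

	if (!stack)
		return 1;
	tos = stack + 64 * 1024 - 16;		/* keep the top 16-byte aligned */

	asm volatile("movq %%rsp, (%[tos])\n\t"	/* 1. link back to the old stack */
		     "movq %[tos], %%rsp\n\t"	/* 2. switch to the new stack    */
		     "call *%[func]\n\t"	/* 3. do the work                */
		     "popq %%rsp\n\t"		/* 4. restore the old stack      */
		     :
		     : [tos] "r" (tos), [func] "r" (on_other_stack)
		     : "rax", "rcx", "rdx", "rsi", "rdi",
		       "r8", "r9", "r10", "r11", "memory", "cc");

	free(stack);
	return 0;
}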
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 5b21f0ec3df2..47051871b436 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -1,6 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_IRQ_VECTORS_H
#define _ASM_X86_IRQ_VECTORS_H
+#include <linux/threads.h>
/*
* Linux IRQ vector layout.
*
@@ -16,56 +18,30 @@
* Vectors 0 ... 31 : system traps and exceptions - hardcoded events
* Vectors 32 ... 127 : device interrupts
* Vector 128 : legacy int80 syscall interface
- * Vectors 129 ... 237 : device interrupts
- * Vectors 238 ... 255 : special interrupts
+ * Vectors 129 ... FIRST_SYSTEM_VECTOR-1 : device interrupts
+ * Vectors FIRST_SYSTEM_VECTOR ... 255 : special interrupts
*
* 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table.
*
* This file enumerates the exact layout of them:
*/
+/* This is used as an interrupt vector when programming the APIC. */
#define NMI_VECTOR 0x02
-#define MCE_VECTOR 0x12
/*
- * IDT vectors usable for external interrupt sources start
- * at 0x20:
+ * IDT vectors usable for external interrupt sources start at 0x20.
+ * (0x80 is the syscall vector, 0x30-0x3f are for ISA)
*/
#define FIRST_EXTERNAL_VECTOR 0x20
-#ifdef CONFIG_X86_32
-# define SYSCALL_VECTOR 0x80
-# define IA32_SYSCALL_VECTOR 0x80
-#else
-# define IA32_SYSCALL_VECTOR 0x80
-#endif
-
-/*
- * Reserve the lowest usable priority level 0x20 - 0x2f for triggering
- * cleanup after irq migration.
- */
-#define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR
+#define IA32_SYSCALL_VECTOR 0x80
/*
* Vectors 0x30-0x3f are used for ISA interrupts.
+ *	Round up to the next 16-vector boundary.
*/
-#define IRQ0_VECTOR (FIRST_EXTERNAL_VECTOR + 0x10)
-
-#define IRQ1_VECTOR (IRQ0_VECTOR + 1)
-#define IRQ2_VECTOR (IRQ0_VECTOR + 2)
-#define IRQ3_VECTOR (IRQ0_VECTOR + 3)
-#define IRQ4_VECTOR (IRQ0_VECTOR + 4)
-#define IRQ5_VECTOR (IRQ0_VECTOR + 5)
-#define IRQ6_VECTOR (IRQ0_VECTOR + 6)
-#define IRQ7_VECTOR (IRQ0_VECTOR + 7)
-#define IRQ8_VECTOR (IRQ0_VECTOR + 8)
-#define IRQ9_VECTOR (IRQ0_VECTOR + 9)
-#define IRQ10_VECTOR (IRQ0_VECTOR + 10)
-#define IRQ11_VECTOR (IRQ0_VECTOR + 11)
-#define IRQ12_VECTOR (IRQ0_VECTOR + 12)
-#define IRQ13_VECTOR (IRQ0_VECTOR + 13)
-#define IRQ14_VECTOR (IRQ0_VECTOR + 14)
-#define IRQ15_VECTOR (IRQ0_VECTOR + 15)
+#define ISA_IRQ_VECTOR(irq) (((FIRST_EXTERNAL_VECTOR + 16) & ~15) + irq)
/*
* Special IRQ vectors used by the SMP architecture, 0xf0-0xff
@@ -91,56 +67,53 @@
#define THRESHOLD_APIC_VECTOR 0xf9
#define REBOOT_VECTOR 0xf8
-/* f0-f7 used for spreading out TLB flushes: */
-#define INVALIDATE_TLB_VECTOR_END 0xf7
-#define INVALIDATE_TLB_VECTOR_START 0xf0
-#define NUM_INVALIDATE_TLB_VECTORS 8
-
-/*
- * Local APIC timer IRQ vector is on a different priority level,
- * to work around the 'lost local interrupt if more than 2 IRQ
- * sources per level' errata.
- */
-#define LOCAL_TIMER_VECTOR 0xef
-
/*
* Generic system vector for platform specific use
*/
-#define GENERIC_INTERRUPT_VECTOR 0xed
+#define X86_PLATFORM_IPI_VECTOR 0xf7
/*
- * Performance monitoring pending work vector:
+ * IRQ work vector:
*/
-#define LOCAL_PENDING_VECTOR 0xec
+#define IRQ_WORK_VECTOR 0xf6
-#define UV_BAU_MESSAGE 0xec
+/* 0xf5 - unused, was UV_BAU_MESSAGE */
+#define DEFERRED_ERROR_VECTOR 0xf4
-/*
- * Self IPI vector for machine checks
- */
-#define MCE_SELF_VECTOR 0xeb
+/* Vector on which hypervisor callbacks will be delivered */
+#define HYPERVISOR_CALLBACK_VECTOR 0xf3
+
+/* Vector for KVM to deliver posted interrupt IPI */
+#define POSTED_INTR_VECTOR 0xf2
+#define POSTED_INTR_WAKEUP_VECTOR 0xf1
+#define POSTED_INTR_NESTED_VECTOR 0xf0
+
+#define MANAGED_IRQ_SHUTDOWN_VECTOR 0xef
+
+#if IS_ENABLED(CONFIG_HYPERV)
+#define HYPERV_REENLIGHTENMENT_VECTOR 0xee
+#define HYPERV_STIMER0_VECTOR 0xed
+#endif
+
+#define LOCAL_TIMER_VECTOR 0xec
/*
- * First APIC vector available to drivers: (vectors 0x30-0xee) we
- * start at 0x31(0x41) to spread out vectors evenly between priority
- * levels. (0x80 is the syscall vector)
+ * Posted interrupt notification vector for all device MSIs delivered to
+ * the host kernel.
*/
-#define FIRST_DEVICE_VECTOR (IRQ15_VECTOR + 2)
+#define POSTED_MSI_NOTIFICATION_VECTOR 0xeb
#define NR_VECTORS 256
-#define FPU_IRQ 13
-
-#define FIRST_VM86_IRQ 3
-#define LAST_VM86_IRQ 15
-
-#ifndef __ASSEMBLY__
-static inline int invalid_vm86_irq(int irq)
-{
- return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ;
-}
+#ifdef CONFIG_X86_LOCAL_APIC
+#define FIRST_SYSTEM_VECTOR POSTED_MSI_NOTIFICATION_VECTOR
+#else
+#define FIRST_SYSTEM_VECTOR NR_VECTORS
#endif
+#define NR_EXTERNAL_VECTORS (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
+#define NR_SYSTEM_VECTORS (NR_VECTORS - FIRST_SYSTEM_VECTOR)
+
/*
* Size the maximum number of interrupts.
*
@@ -152,26 +125,22 @@ static inline int invalid_vm86_irq(int irq)
* static arrays.
*/
-#define NR_IRQS_LEGACY 16
+#define NR_IRQS_LEGACY 16
-#define CPU_VECTOR_LIMIT ( 8 * NR_CPUS )
-#define IO_APIC_VECTOR_LIMIT ( 32 * MAX_IO_APICS )
+#define CPU_VECTOR_LIMIT (64 * NR_CPUS)
+#define IO_APIC_VECTOR_LIMIT (32 * MAX_IO_APICS)
-#ifdef CONFIG_X86_IO_APIC
-# ifdef CONFIG_SPARSE_IRQ
-# define NR_IRQS \
+#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_PCI_MSI)
+#define NR_IRQS \
(CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ? \
(NR_VECTORS + CPU_VECTOR_LIMIT) : \
(NR_VECTORS + IO_APIC_VECTOR_LIMIT))
-# else
-# if NR_CPUS < MAX_IO_APICS
-# define NR_IRQS (NR_VECTORS + 4*CPU_VECTOR_LIMIT)
-# else
-# define NR_IRQS (NR_VECTORS + IO_APIC_VECTOR_LIMIT)
-# endif
-# endif
-#else /* !CONFIG_X86_IO_APIC: */
-# define NR_IRQS NR_IRQS_LEGACY
+#elif defined(CONFIG_X86_IO_APIC)
+#define NR_IRQS (NR_VECTORS + IO_APIC_VECTOR_LIMIT)
+#elif defined(CONFIG_PCI_MSI)
+#define NR_IRQS (NR_VECTORS + CPU_VECTOR_LIMIT)
+#else
+#define NR_IRQS NR_IRQS_LEGACY
#endif
#endif /* _ASM_X86_IRQ_VECTORS_H */
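
To sanity-check the vector arithmetic above, the derived constants can be
evaluated outside the kernel. The following stand-alone program is
illustrative only (not part of this patch) and assumes a
CONFIG_X86_LOCAL_APIC=y build, where FIRST_SYSTEM_VECTOR is the posted-MSI
notification vector:

#include <stdio.h>

#define FIRST_EXTERNAL_VECTOR		0x20
#define ISA_IRQ_VECTOR(irq)		(((FIRST_EXTERNAL_VECTOR + 16) & ~15) + irq)
#define POSTED_MSI_NOTIFICATION_VECTOR	0xeb
#define FIRST_SYSTEM_VECTOR		POSTED_MSI_NOTIFICATION_VECTOR
#define NR_VECTORS			256
#define NR_EXTERNAL_VECTORS		(FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
#define NR_SYSTEM_VECTORS		(NR_VECTORS - FIRST_SYSTEM_VECTOR)

int main(void)
{
	/* ISA IRQs land on 0x30..0x3f, the next 16-vector boundary above 0x20. */
	printf("ISA IRQ0 -> 0x%x, ISA IRQ15 -> 0x%x\n",
	       ISA_IRQ_VECTOR(0), ISA_IRQ_VECTOR(15));

	/* 203 vectors remain for devices, 21 for system vectors. */
	printf("external: %d, system: %d\n",
	       NR_EXTERNAL_VECTORS, NR_SYSTEM_VECTORS);
	return 0;
}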
diff --git a/arch/x86/include/asm/irq_work.h b/arch/x86/include/asm/irq_work.h
new file mode 100644
index 000000000000..6b4d36c95165
--- /dev/null
+++ b/arch/x86/include/asm/irq_work.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_IRQ_WORK_H
+#define _ASM_IRQ_WORK_H
+
+#include <asm/cpufeature.h>
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static inline bool arch_irq_work_has_interrupt(void)
+{
+ return boot_cpu_has(X86_FEATURE_APIC);
+}
+#else
+static inline bool arch_irq_work_has_interrupt(void)
+{
+ return false;
+}
+#endif
+
+#endif /* _ASM_IRQ_WORK_H */
diff --git a/arch/x86/include/asm/irqdomain.h b/arch/x86/include/asm/irqdomain.h
new file mode 100644
index 000000000000..30c325c235c0
--- /dev/null
+++ b/arch/x86/include/asm/irqdomain.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_IRQDOMAIN_H
+#define _ASM_IRQDOMAIN_H
+
+#include <linux/irqdomain.h>
+#include <asm/hw_irq.h>
+
+#ifdef CONFIG_X86_LOCAL_APIC
+enum {
+ X86_IRQ_ALLOC_LEGACY = 0x1,
+};
+
+extern int x86_fwspec_is_ioapic(struct irq_fwspec *fwspec);
+extern int x86_fwspec_is_hpet(struct irq_fwspec *fwspec);
+
+extern struct irq_domain *x86_vector_domain;
+
+extern void init_irq_alloc_info(struct irq_alloc_info *info,
+ const struct cpumask *mask);
+extern void copy_irq_alloc_info(struct irq_alloc_info *dst,
+ struct irq_alloc_info *src);
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+#ifdef CONFIG_X86_IO_APIC
+struct device_node;
+struct irq_data;
+
+enum ioapic_domain_type {
+ IOAPIC_DOMAIN_INVALID,
+ IOAPIC_DOMAIN_LEGACY,
+ IOAPIC_DOMAIN_STRICT,
+ IOAPIC_DOMAIN_DYNAMIC,
+};
+
+struct ioapic_domain_cfg {
+ enum ioapic_domain_type type;
+ const struct irq_domain_ops *ops;
+ struct device_node *dev;
+};
+
+extern const struct irq_domain_ops mp_ioapic_irqdomain_ops;
+
+extern int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *arg);
+extern void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs);
+extern int mp_irqdomain_activate(struct irq_domain *domain,
+ struct irq_data *irq_data, bool reserve);
+extern void mp_irqdomain_deactivate(struct irq_domain *domain,
+ struct irq_data *irq_data);
+extern int mp_irqdomain_ioapic_idx(struct irq_domain *domain);
+#endif /* CONFIG_X86_IO_APIC */
+
+#ifdef CONFIG_PCI_MSI
+void x86_create_pci_msi_domain(void);
+struct irq_domain *native_create_pci_msi_domain(void);
+extern struct irq_domain *x86_pci_msi_default_domain;
+#else
+static inline void x86_create_pci_msi_domain(void) { }
+#define native_create_pci_msi_domain NULL
+#define x86_pci_msi_default_domain NULL
+#endif
+
+#endif
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index c6ccbe7e81ad..b30e5474c18e 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -1,92 +1,88 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _X86_IRQFLAGS_H_
#define _X86_IRQFLAGS_H_
#include <asm/processor-flags.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
+
+#include <asm/nospec-branch.h>
+
/*
* Interrupt control:
*/
-static inline unsigned long native_save_fl(void)
+/* Declaration required for gcc < 4.9 to prevent -Werror=missing-prototypes */
+extern inline unsigned long native_save_fl(void);
+extern __always_inline unsigned long native_save_fl(void)
{
unsigned long flags;
/*
- * Note: this needs to be "=r" not "=rm", because we have the
- * stack offset from what gcc expects at the time the "pop" is
- * executed, and so a memory reference with respect to the stack
- * would end up using the wrong address.
+ * "=rm" is safe here, because "pop" adjusts the stack before
+ * it evaluates its effective address -- this is part of the
+ * documented behavior of the "pop" instruction.
*/
asm volatile("# __raw_save_flags\n\t"
"pushf ; pop %0"
- : "=r" (flags)
+ : "=rm" (flags)
: /* no input */
: "memory");
return flags;
}
-static inline void native_restore_fl(unsigned long flags)
-{
- asm volatile("push %0 ; popf"
- : /* no output */
- :"g" (flags)
- :"memory", "cc");
-}
-
-static inline void native_irq_disable(void)
+static __always_inline void native_irq_disable(void)
{
asm volatile("cli": : :"memory");
}
-static inline void native_irq_enable(void)
+static __always_inline void native_irq_enable(void)
{
asm volatile("sti": : :"memory");
}
-static inline void native_safe_halt(void)
+static __always_inline void native_safe_halt(void)
{
+ x86_idle_clear_cpu_buffers();
asm volatile("sti; hlt": : :"memory");
}
-static inline void native_halt(void)
+static __always_inline void native_halt(void)
{
+ x86_idle_clear_cpu_buffers();
asm volatile("hlt": : :"memory");
}
-#endif
-
-#ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
-#else
-#ifndef __ASSEMBLY__
-
-static inline unsigned long __raw_local_save_flags(void)
+static __always_inline int native_irqs_disabled_flags(unsigned long flags)
{
- return native_save_fl();
+ return !(flags & X86_EFLAGS_IF);
}
-static inline void raw_local_irq_restore(unsigned long flags)
+static __always_inline unsigned long native_local_irq_save(void)
{
- native_restore_fl(flags);
-}
+ unsigned long flags = native_save_fl();
-static inline void raw_local_irq_disable(void)
-{
native_irq_disable();
+
+ return flags;
}
-static inline void raw_local_irq_enable(void)
+static __always_inline void native_local_irq_restore(unsigned long flags)
{
- native_irq_enable();
+ if (!native_irqs_disabled_flags(flags))
+ native_irq_enable();
}
+#endif
+
+#ifndef CONFIG_PARAVIRT
+#ifndef __ASSEMBLY__
/*
* Used in the idle loop; sti takes one instruction cycle
* to complete:
*/
-static inline void raw_safe_halt(void)
+static __always_inline void arch_safe_halt(void)
{
native_safe_halt();
}
@@ -95,123 +91,73 @@ static inline void raw_safe_halt(void)
* Used when interrupts are already enabled or to
* shutdown the processor:
*/
-static inline void halt(void)
+static __always_inline void halt(void)
{
native_halt();
}
+#endif /* __ASSEMBLY__ */
+#endif /* CONFIG_PARAVIRT */
+
+#ifdef CONFIG_PARAVIRT_XXL
+#include <asm/paravirt.h>
+#else
+#ifndef __ASSEMBLER__
+#include <linux/types.h>
+
+static __always_inline unsigned long arch_local_save_flags(void)
+{
+ return native_save_fl();
+}
+
+static __always_inline void arch_local_irq_disable(void)
+{
+ native_irq_disable();
+}
+
+static __always_inline void arch_local_irq_enable(void)
+{
+ native_irq_enable();
+}
/*
* For spinlocks, etc:
*/
-static inline unsigned long __raw_local_irq_save(void)
+static __always_inline unsigned long arch_local_irq_save(void)
{
- unsigned long flags = __raw_local_save_flags();
-
- raw_local_irq_disable();
-
+ unsigned long flags = arch_local_save_flags();
+ arch_local_irq_disable();
return flags;
}
#else
-#define ENABLE_INTERRUPTS(x) sti
-#define DISABLE_INTERRUPTS(x) cli
-
#ifdef CONFIG_X86_64
-#define SWAPGS swapgs
-/*
- * Currently paravirt can't handle swapgs nicely when we
- * don't have a stack we can rely on (such as a user space
- * stack). So we either find a way around these or just fault
- * and emulate if a guest tries to call swapgs directly.
- *
- * Either way, this is a good way to document that we don't
- * have a reliable stack. x86_64 only.
- */
-#define SWAPGS_UNSAFE_STACK swapgs
-
-#define PARAVIRT_ADJUST_EXCEPTION_FRAME /* */
-
-#define INTERRUPT_RETURN iretq
-#define USERGS_SYSRET64 \
- swapgs; \
- sysretq;
-#define USERGS_SYSRET32 \
- swapgs; \
- sysretl
-#define ENABLE_INTERRUPTS_SYSEXIT32 \
- swapgs; \
- sti; \
- sysexit
-
-#else
-#define INTERRUPT_RETURN iret
-#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit
-#define GET_CR0_INTO_EAX movl %cr0, %eax
+#ifdef CONFIG_DEBUG_ENTRY
+#define SAVE_FLAGS pushfq; popq %rax
#endif
+#endif
-#endif /* __ASSEMBLY__ */
-#endif /* CONFIG_PARAVIRT */
-
-#ifndef __ASSEMBLY__
-#define raw_local_save_flags(flags) \
- do { (flags) = __raw_local_save_flags(); } while (0)
-
-#define raw_local_irq_save(flags) \
- do { (flags) = __raw_local_irq_save(); } while (0)
+#endif /* __ASSEMBLER__ */
+#endif /* CONFIG_PARAVIRT_XXL */
-static inline int raw_irqs_disabled_flags(unsigned long flags)
+#ifndef __ASSEMBLER__
+static __always_inline int arch_irqs_disabled_flags(unsigned long flags)
{
return !(flags & X86_EFLAGS_IF);
}
-static inline int raw_irqs_disabled(void)
+static __always_inline int arch_irqs_disabled(void)
{
- unsigned long flags = __raw_local_save_flags();
+ unsigned long flags = arch_local_save_flags();
- return raw_irqs_disabled_flags(flags);
+ return arch_irqs_disabled_flags(flags);
}
-#else
-
-#ifdef CONFIG_X86_64
-#define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk
-#define ARCH_LOCKDEP_SYS_EXIT_IRQ \
- TRACE_IRQS_ON; \
- sti; \
- SAVE_REST; \
- LOCKDEP_SYS_EXIT; \
- RESTORE_REST; \
- cli; \
- TRACE_IRQS_OFF;
-
-#else
-#define ARCH_LOCKDEP_SYS_EXIT \
- pushl %eax; \
- pushl %ecx; \
- pushl %edx; \
- call lockdep_sys_exit; \
- popl %edx; \
- popl %ecx; \
- popl %eax;
-
-#define ARCH_LOCKDEP_SYS_EXIT_IRQ
-#endif
-
-#ifdef CONFIG_TRACE_IRQFLAGS
-# define TRACE_IRQS_ON call trace_hardirqs_on_thunk;
-# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk;
-#else
-# define TRACE_IRQS_ON
-# define TRACE_IRQS_OFF
-#endif
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-# define LOCKDEP_SYS_EXIT ARCH_LOCKDEP_SYS_EXIT
-# define LOCKDEP_SYS_EXIT_IRQ ARCH_LOCKDEP_SYS_EXIT_IRQ
-# else
-# define LOCKDEP_SYS_EXIT
-# define LOCKDEP_SYS_EXIT_IRQ
-# endif
+static __always_inline void arch_local_irq_restore(unsigned long flags)
+{
+ if (!arch_irqs_disabled_flags(flags))
+ arch_local_irq_enable();
+}
+#endif /* !__ASSEMBLER__ */
-#endif /* __ASSEMBLY__ */
#endif
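
The "=rm" reasoning in the native_save_fl() comment above can be exercised
from user space, where the same pushf/pop pattern is legal. This sketch is
illustrative only and not part of this patch; in user mode IF is always set,
so it only demonstrates the mechanics:

#include <stdio.h>

static unsigned long save_flags(void)
{
	unsigned long flags;

	/* "=rm" is fine for the reason given above: pop adjusts the stack
	 * pointer before it evaluates its effective address. */
	asm volatile("pushf ; pop %0" : "=rm" (flags) : : "memory");
	return flags;
}

int main(void)
{
	unsigned long flags = save_flags();

	/* Bit 9 of EFLAGS is IF. */
	printf("EFLAGS=%#lx IF=%lu\n", flags, (flags >> 9) & 1);
	return 0;
}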
diff --git a/arch/x86/include/asm/ist.h b/arch/x86/include/asm/ist.h
index 7e5dff1de0e9..7ede2731dc92 100644
--- a/arch/x86/include/asm/ist.h
+++ b/arch/x86/include/asm/ist.h
@@ -1,34 +1,14 @@
-#ifndef _ASM_X86_IST_H
-#define _ASM_X86_IST_H
-
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Include file for the interface to IST BIOS
* Copyright 2002 Andy Grover <andrew.grover@intel.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2, or (at your option) any
- * later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
+#ifndef _ASM_X86_IST_H
+#define _ASM_X86_IST_H
+#include <uapi/asm/ist.h>
-#include <linux/types.h>
-
-struct ist_info {
- __u32 signature;
- __u32 command;
- __u32 event;
- __u32 perf_level;
-};
-
-#ifdef __KERNEL__
extern struct ist_info ist_info;
-#endif /* __KERNEL__ */
#endif /* _ASM_X86_IST_H */
diff --git a/arch/x86/include/asm/jailhouse_para.h b/arch/x86/include/asm/jailhouse_para.h
new file mode 100644
index 000000000000..a34897aef2c2
--- /dev/null
+++ b/arch/x86/include/asm/jailhouse_para.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Jailhouse paravirt detection
+ *
+ * Copyright (c) Siemens AG, 2015-2017
+ *
+ * Authors:
+ * Jan Kiszka <jan.kiszka@siemens.com>
+ */
+
+#ifndef _ASM_X86_JAILHOUSE_PARA_H
+#define _ASM_X86_JAILHOUSE_PARA_H
+
+#include <linux/types.h>
+
+#ifdef CONFIG_JAILHOUSE_GUEST
+bool jailhouse_paravirt(void);
+#else
+static inline bool jailhouse_paravirt(void)
+{
+ return false;
+}
+#endif
+
+#endif /* _ASM_X86_JAILHOUSE_PARA_H */
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
new file mode 100644
index 000000000000..05b16299588d
--- /dev/null
+++ b/arch/x86/include/asm/jump_label.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_JUMP_LABEL_H
+#define _ASM_X86_JUMP_LABEL_H
+
+#define HAVE_JUMP_LABEL_BATCH
+
+#include <asm/asm.h>
+#include <asm/nops.h>
+
+#ifndef __ASSEMBLER__
+
+#include <linux/stringify.h>
+#include <linux/types.h>
+
+#define JUMP_TABLE_ENTRY(key, label) \
+ ".pushsection __jump_table, \"aw\" \n\t" \
+ _ASM_ALIGN "\n\t" \
+ ANNOTATE_DATA_SPECIAL "\n" \
+ ".long 1b - . \n\t" \
+ ".long " label " - . \n\t" \
+ _ASM_PTR " " key " - . \n\t" \
+ ".popsection \n\t"
+
+/* This macro is also expanded on the Rust side. */
+#ifdef CONFIG_HAVE_JUMP_LABEL_HACK
+#define ARCH_STATIC_BRANCH_ASM(key, label) \
+ "1: jmp " label " # objtool NOPs this \n\t" \
+ JUMP_TABLE_ENTRY(key " + 2", label)
+#else /* !CONFIG_HAVE_JUMP_LABEL_HACK */
+#define ARCH_STATIC_BRANCH_ASM(key, label) \
+ "1: .byte " __stringify(BYTES_NOP5) "\n\t" \
+ JUMP_TABLE_ENTRY(key, label)
+#endif /* CONFIG_HAVE_JUMP_LABEL_HACK */
+
+static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch)
+{
+ asm goto(ARCH_STATIC_BRANCH_ASM("%c0 + %c1", "%l[l_yes]")
+ : : "i" (key), "i" (branch) : : l_yes);
+
+ return false;
+l_yes:
+ return true;
+}
+
+static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch)
+{
+ asm goto("1:"
+ "jmp %l[l_yes]\n\t"
+ JUMP_TABLE_ENTRY("%c0 + %c1", "%l[l_yes]")
+ : : "i" (key), "i" (branch) : : l_yes);
+
+ return false;
+l_yes:
+ return true;
+}
+
+extern int arch_jump_entry_size(struct jump_entry *entry);
+
+#endif /* __ASSEMBLER__ */
+
+#endif
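
The arch_static_branch()/arch_static_branch_jump() primitives above are
normally reached through the generic static-key API rather than called
directly. A minimal, hypothetical usage sketch (the key and function names
are invented for illustration and are not part of this patch):

#include <linux/jump_label.h>
#include <linux/printk.h>

static DEFINE_STATIC_KEY_FALSE(my_feature_key);

void my_hot_path(void)
{
	/* Compiles to a NOP (or a jump with the "hack" variant) until the
	 * key is enabled; the jump table entry records the patch site. */
	if (static_branch_unlikely(&my_feature_key))
		pr_info("feature enabled\n");
}

void my_feature_enable(void)
{
	/* Patches every branch site registered for this key at runtime. */
	static_branch_enable(&my_feature_key);
}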
diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h
deleted file mode 100644
index c2d1f3b58e5f..000000000000
--- a/arch/x86/include/asm/k8.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#ifndef _ASM_X86_K8_H
-#define _ASM_X86_K8_H
-
-#include <linux/pci.h>
-
-extern struct pci_device_id k8_nb_ids[];
-
-extern int early_is_k8_nb(u32 value);
-extern struct pci_dev **k8_northbridges;
-extern int num_k8_northbridges;
-extern int cache_k8_northbridges(void);
-extern void k8_flush_garts(void);
-extern int k8_scan_nodes(unsigned long start, unsigned long end);
-
-#ifdef CONFIG_K8_NB
-static inline struct pci_dev *node_to_k8_nb_misc(int node)
-{
- return (node < num_k8_northbridges) ? k8_northbridges[node] : NULL;
-}
-#else
-static inline struct pci_dev *node_to_k8_nb_misc(int node)
-{
- return NULL;
-}
-#endif
-
-
-#endif /* _ASM_X86_K8_H */
diff --git a/arch/x86/include/asm/kasan.h b/arch/x86/include/asm/kasan.h
new file mode 100644
index 000000000000..d7e33c7f096b
--- /dev/null
+++ b/arch/x86/include/asm/kasan.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_KASAN_H
+#define _ASM_X86_KASAN_H
+
+#include <linux/const.h>
+#define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL)
+#define KASAN_SHADOW_SCALE_SHIFT 3
+
+/*
+ * The compiler uses the shadow offset assuming that addresses start
+ * from 0. Kernel addresses don't start from 0, so the shadow for the
+ * kernel really starts from the compiler's shadow offset +
+ * ('kernel address space start' >> KASAN_SHADOW_SCALE_SHIFT).
+ */
+#define KASAN_SHADOW_START (KASAN_SHADOW_OFFSET + \
+ ((-1UL << __VIRTUAL_MASK_SHIFT) >> \
+ KASAN_SHADOW_SCALE_SHIFT))
+/*
+ * 47 bits for kernel address -> (47 - KASAN_SHADOW_SCALE_SHIFT) bits for shadow
+ * 56 bits for kernel address -> (56 - KASAN_SHADOW_SCALE_SHIFT) bits for shadow
+ */
+#define KASAN_SHADOW_END (KASAN_SHADOW_START + \
+ (1ULL << (__VIRTUAL_MASK_SHIFT - \
+ KASAN_SHADOW_SCALE_SHIFT)))
+
+#ifndef __ASSEMBLER__
+
+#ifdef CONFIG_KASAN
+void __init kasan_early_init(void);
+void __init kasan_init(void);
+void __init kasan_populate_shadow_for_vaddr(void *va, size_t size, int nid);
+#else
+static inline void kasan_early_init(void) { }
+static inline void kasan_init(void) { }
+static inline void kasan_populate_shadow_for_vaddr(void *va, size_t size,
+ int nid) { }
+#endif
+
+#endif
+
+#endif
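
Worked out for a 4-level-paging kernel (where __VIRTUAL_MASK_SHIFT is 47 and
every 8 bytes of address space map to one shadow byte), the layout above can
be checked with a small stand-alone program. Illustrative only, not part of
this patch; the offset is the value typically used by the default x86-64
configuration:

#include <stdio.h>

#define KASAN_SHADOW_SCALE_SHIFT	3
#define VIRTUAL_MASK_SHIFT		47			/* 4-level paging */
#define KASAN_SHADOW_OFFSET		0xdffffc0000000000UL	/* typical config value */

int main(void)
{
	unsigned long start = KASAN_SHADOW_OFFSET +
			      ((-1UL << VIRTUAL_MASK_SHIFT) >> KASAN_SHADOW_SCALE_SHIFT);
	unsigned long end = start +
			    (1UL << (VIRTUAL_MASK_SHIFT - KASAN_SHADOW_SCALE_SHIFT));

	/* Prints 0xffffec0000000000 - 0xfffffc0000000000, i.e. 16 TiB of shadow. */
	printf("shadow: %#lx - %#lx (%lu TiB)\n", start, end, (end - start) >> 40);
	return 0;
}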
diff --git a/arch/x86/include/asm/kaslr.h b/arch/x86/include/asm/kaslr.h
new file mode 100644
index 000000000000..0648190467ba
--- /dev/null
+++ b/arch/x86/include/asm/kaslr.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_KASLR_H_
+#define _ASM_KASLR_H_
+
+unsigned long kaslr_get_random_long(const char *purpose);
+
+#ifdef CONFIG_RANDOMIZE_MEMORY
+void kernel_randomize_memory(void);
+void init_trampoline_kaslr(void);
+#else
+static inline void kernel_randomize_memory(void) { }
+static inline void init_trampoline_kaslr(void) {}
+#endif /* CONFIG_RANDOMIZE_MEMORY */
+
+#endif
diff --git a/arch/x86/include/asm/kbdleds.h b/arch/x86/include/asm/kbdleds.h
new file mode 100644
index 000000000000..197ea4fedd53
--- /dev/null
+++ b/arch/x86/include/asm/kbdleds.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_KBDLEDS_H
+#define _ASM_X86_KBDLEDS_H
+
+/*
+ * Some laptops take the 789uiojklm,. keys as number pad when NumLock is on.
+ * This seems like a good reason to start with NumLock off. That's why on x86
+ * we ask the BIOS for the correct state.
+ */
+
+#include <asm/setup.h>
+
+static inline int kbd_defleds(void)
+{
+ return boot_params.kbd_status & 0x20 ? (1 << VC_NUMLOCK) : 0;
+}
+
+#endif /* _ASM_X86_KBDLEDS_H */
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index fa7c0b974761..d1514e70477b 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_KDEBUG_H
#define _ASM_X86_KDEBUG_H
@@ -13,24 +14,31 @@ enum die_val {
DIE_PANIC,
DIE_NMI,
DIE_DIE,
- DIE_NMIWATCHDOG,
DIE_KERNELDEBUG,
DIE_TRAP,
DIE_GPF,
DIE_CALL,
- DIE_NMI_IPI,
DIE_PAGE_FAULT,
DIE_NMIUNKNOWN,
};
-extern void printk_address(unsigned long address, int reliable);
+enum show_regs_mode {
+ SHOW_REGS_SHORT,
+ /*
+ * For when userspace crashed, but we don't think it's our fault, and
+ * therefore don't print kernel registers.
+ */
+ SHOW_REGS_USER,
+ SHOW_REGS_ALL
+};
+
extern void die(const char *, struct pt_regs *,long);
+void die_addr(const char *str, struct pt_regs *regs, long err, long gp_addr);
extern int __must_check __die(const char *, struct pt_regs *, long);
-extern void show_registers(struct pt_regs *regs);
-extern void show_trace(struct task_struct *t, struct pt_regs *regs,
- unsigned long *sp, unsigned long bp);
-extern void __show_regs(struct pt_regs *regs, int all);
-extern void show_regs(struct pt_regs *regs);
+extern void show_stack_regs(struct pt_regs *regs);
+extern void __show_regs(struct pt_regs *regs, enum show_regs_mode,
+ const char *log_lvl);
+extern void show_iret_regs(struct pt_regs *regs, const char *log_lvl);
extern unsigned long oops_begin(void);
extern void oops_end(unsigned long, struct pt_regs *, int signr);
diff --git a/arch/x86/include/asm/kexec-bzimage64.h b/arch/x86/include/asm/kexec-bzimage64.h
new file mode 100644
index 000000000000..df89ee7d3e9e
--- /dev/null
+++ b/arch/x86/include/asm/kexec-bzimage64.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_KEXEC_BZIMAGE64_H
+#define _ASM_KEXEC_BZIMAGE64_H
+
+extern const struct kexec_file_ops kexec_bzImage64_ops;
+
+#endif /* _ASM_KEXEC_BZIMAGE64_H */
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 317ff1703d0b..5cfb27f26583 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_KEXEC_H
#define _ASM_X86_KEXEC_H
@@ -8,22 +9,33 @@
# define PA_SWAP_PAGE 3
# define PAGES_NR 4
#else
-# define PA_CONTROL_PAGE 0
-# define VA_CONTROL_PAGE 1
-# define PA_TABLE_PAGE 2
-# define PA_SWAP_PAGE 3
-# define PAGES_NR 4
+/* Size of each exception handler referenced by the IDT */
+# define KEXEC_DEBUG_EXC_HANDLER_SIZE 6 /* PUSHI, PUSHI, 2-byte JMP */
+#endif
+
+#ifdef CONFIG_X86_64
+
+#include <linux/bits.h>
+
+#define RELOC_KERNEL_PRESERVE_CONTEXT BIT(0)
+#define RELOC_KERNEL_CACHE_INCOHERENT BIT(1)
+
#endif
+# define KEXEC_CONTROL_PAGE_SIZE 4096
# define KEXEC_CONTROL_CODE_MAX_SIZE 2048
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/string.h>
+#include <linux/kernel.h>
+#include <asm/asm.h>
#include <asm/page.h>
#include <asm/ptrace.h>
+struct kimage;
+
/*
* KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return.
* I.e. Maximum page that is mapped directly into kernel memory,
@@ -39,7 +51,6 @@
/* Maximum address we can use for the control code buffer */
# define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE
-# define KEXEC_CONTROL_PAGE_SIZE 4096
/* The native architecture */
# define KEXEC_ARCH KEXEC_ARCH_386
@@ -48,34 +59,23 @@
# define vmcore_elf_check_arch_cross(x) ((x)->e_machine == EM_X86_64)
#else
/* Maximum physical address we can use pages from */
-# define KEXEC_SOURCE_MEMORY_LIMIT (0xFFFFFFFFFFUL)
+# define KEXEC_SOURCE_MEMORY_LIMIT (MAXMEM-1)
/* Maximum address we can reach in physical address mode */
-# define KEXEC_DESTINATION_MEMORY_LIMIT (0xFFFFFFFFFFUL)
+# define KEXEC_DESTINATION_MEMORY_LIMIT (MAXMEM-1)
/* Maximum address we can use for the control pages */
-# define KEXEC_CONTROL_MEMORY_LIMIT (0xFFFFFFFFFFUL)
-
-/* Allocate one page for the pdp and the second for the code */
-# define KEXEC_CONTROL_PAGE_SIZE (4096UL + 4096UL)
+# define KEXEC_CONTROL_MEMORY_LIMIT (MAXMEM-1)
/* The native architecture */
# define KEXEC_ARCH KEXEC_ARCH_X86_64
-#endif
-/*
- * CPU does not save ss and sp on stack if execution is already
- * running in kernel mode at the time of NMI occurrence. This code
- * fixes it.
- */
-static inline void crash_fixup_ss_esp(struct pt_regs *newregs,
- struct pt_regs *oldregs)
-{
-#ifdef CONFIG_X86_32
- newregs->sp = (unsigned long)&(oldregs->sp);
- asm volatile("xorl %%eax, %%eax\n\t"
- "movw %%ss, %%ax\n\t"
- :"=a"(newregs->ss));
+extern unsigned long kexec_va_control_page;
+extern unsigned long kexec_pa_table_page;
+extern unsigned long kexec_pa_swap_page;
+extern gate_desc kexec_debug_idt[];
+extern unsigned char kexec_debug_exc_vectors[];
+extern uint16_t kexec_debug_8250_port;
+extern unsigned long kexec_debug_8250_mmio32;
#endif
-}
/*
* This function is responsible for capturing register states if coming
@@ -87,62 +87,52 @@ static inline void crash_setup_regs(struct pt_regs *newregs,
{
if (oldregs) {
memcpy(newregs, oldregs, sizeof(*newregs));
- crash_fixup_ss_esp(newregs, oldregs);
} else {
+ asm volatile("mov %%" _ASM_BX ",%0" : "=m"(newregs->bx));
+ asm volatile("mov %%" _ASM_CX ",%0" : "=m"(newregs->cx));
+ asm volatile("mov %%" _ASM_DX ",%0" : "=m"(newregs->dx));
+ asm volatile("mov %%" _ASM_SI ",%0" : "=m"(newregs->si));
+ asm volatile("mov %%" _ASM_DI ",%0" : "=m"(newregs->di));
+ asm volatile("mov %%" _ASM_BP ",%0" : "=m"(newregs->bp));
+ asm volatile("mov %%" _ASM_AX ",%0" : "=m"(newregs->ax));
+ asm volatile("mov %%" _ASM_SP ",%0" : "=m"(newregs->sp));
+#ifdef CONFIG_X86_64
+ asm volatile("mov %%r8,%0" : "=m"(newregs->r8));
+ asm volatile("mov %%r9,%0" : "=m"(newregs->r9));
+ asm volatile("mov %%r10,%0" : "=m"(newregs->r10));
+ asm volatile("mov %%r11,%0" : "=m"(newregs->r11));
+ asm volatile("mov %%r12,%0" : "=m"(newregs->r12));
+ asm volatile("mov %%r13,%0" : "=m"(newregs->r13));
+ asm volatile("mov %%r14,%0" : "=m"(newregs->r14));
+ asm volatile("mov %%r15,%0" : "=m"(newregs->r15));
+#endif
+ asm volatile("mov %%ss,%k0" : "=a"(newregs->ss));
+ asm volatile("mov %%cs,%k0" : "=a"(newregs->cs));
#ifdef CONFIG_X86_32
- asm volatile("movl %%ebx,%0" : "=m"(newregs->bx));
- asm volatile("movl %%ecx,%0" : "=m"(newregs->cx));
- asm volatile("movl %%edx,%0" : "=m"(newregs->dx));
- asm volatile("movl %%esi,%0" : "=m"(newregs->si));
- asm volatile("movl %%edi,%0" : "=m"(newregs->di));
- asm volatile("movl %%ebp,%0" : "=m"(newregs->bp));
- asm volatile("movl %%eax,%0" : "=m"(newregs->ax));
- asm volatile("movl %%esp,%0" : "=m"(newregs->sp));
- asm volatile("movl %%ss, %%eax;" :"=a"(newregs->ss));
- asm volatile("movl %%cs, %%eax;" :"=a"(newregs->cs));
- asm volatile("movl %%ds, %%eax;" :"=a"(newregs->ds));
- asm volatile("movl %%es, %%eax;" :"=a"(newregs->es));
- asm volatile("pushfl; popl %0" :"=m"(newregs->flags));
-#else
- asm volatile("movq %%rbx,%0" : "=m"(newregs->bx));
- asm volatile("movq %%rcx,%0" : "=m"(newregs->cx));
- asm volatile("movq %%rdx,%0" : "=m"(newregs->dx));
- asm volatile("movq %%rsi,%0" : "=m"(newregs->si));
- asm volatile("movq %%rdi,%0" : "=m"(newregs->di));
- asm volatile("movq %%rbp,%0" : "=m"(newregs->bp));
- asm volatile("movq %%rax,%0" : "=m"(newregs->ax));
- asm volatile("movq %%rsp,%0" : "=m"(newregs->sp));
- asm volatile("movq %%r8,%0" : "=m"(newregs->r8));
- asm volatile("movq %%r9,%0" : "=m"(newregs->r9));
- asm volatile("movq %%r10,%0" : "=m"(newregs->r10));
- asm volatile("movq %%r11,%0" : "=m"(newregs->r11));
- asm volatile("movq %%r12,%0" : "=m"(newregs->r12));
- asm volatile("movq %%r13,%0" : "=m"(newregs->r13));
- asm volatile("movq %%r14,%0" : "=m"(newregs->r14));
- asm volatile("movq %%r15,%0" : "=m"(newregs->r15));
- asm volatile("movl %%ss, %%eax;" :"=a"(newregs->ss));
- asm volatile("movl %%cs, %%eax;" :"=a"(newregs->cs));
- asm volatile("pushfq; popq %0" :"=m"(newregs->flags));
+ asm volatile("mov %%ds,%k0" : "=a"(newregs->ds));
+ asm volatile("mov %%es,%k0" : "=a"(newregs->es));
#endif
- newregs->ip = (unsigned long)current_text_addr();
+ asm volatile("pushf\n\t"
+ "pop %0" : "=m"(newregs->flags));
+ newregs->ip = _THIS_IP_;
}
}
#ifdef CONFIG_X86_32
-asmlinkage unsigned long
-relocate_kernel(unsigned long indirection_page,
- unsigned long control_page,
- unsigned long start_address,
- unsigned int has_pae,
- unsigned int preserve_context);
+typedef asmlinkage unsigned long
+relocate_kernel_fn(unsigned long indirection_page,
+ unsigned long control_page,
+ unsigned long start_address,
+ unsigned int has_pae,
+ unsigned int preserve_context);
#else
-unsigned long
-relocate_kernel(unsigned long indirection_page,
- unsigned long page_list,
- unsigned long start_address,
- unsigned int preserve_context);
+typedef unsigned long
+relocate_kernel_fn(unsigned long indirection_page,
+ unsigned long pa_control_page,
+ unsigned long start_address,
+ unsigned int flags);
#endif
-
+extern relocate_kernel_fn relocate_kernel;
#define ARCH_HAS_KIMAGE_ARCH
#ifdef CONFIG_X86_32
@@ -157,12 +147,91 @@ struct kimage_arch {
};
#else
struct kimage_arch {
+ /*
+ * This is a kimage control page, as it must not overlap with either
+ * source or destination address ranges.
+ */
+ pgd_t *pgd;
+ /*
+ * The virtual mapping of the control code page itself is used only
+ * during the transition, while the current kernel's pages are all
+ * in place. Thus the intermediate page table pages used to map it
+ * are not control pages, but instead just normal pages obtained
+	 * with get_zeroed_page(). They have to be tracked (below) so that
+ * they can be freed.
+ */
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
};
+#endif /* CONFIG_X86_32 */
+
+#ifdef CONFIG_X86_64
+/*
+ * Number of elements and order of elements in this structure should match
+ * with the ones in arch/x86/purgatory/entry64.S. If you make a change here
+ * make an appropriate change in purgatory too.
+ */
+struct kexec_entry64_regs {
+ uint64_t rax;
+ uint64_t rcx;
+ uint64_t rdx;
+ uint64_t rbx;
+ uint64_t rsp;
+ uint64_t rbp;
+ uint64_t rsi;
+ uint64_t rdi;
+ uint64_t r8;
+ uint64_t r9;
+ uint64_t r10;
+ uint64_t r11;
+ uint64_t r12;
+ uint64_t r13;
+ uint64_t r14;
+ uint64_t r15;
+ uint64_t rip;
+};
+
+extern int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages,
+ gfp_t gfp);
+#define arch_kexec_post_alloc_pages arch_kexec_post_alloc_pages
+
+extern void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages);
+#define arch_kexec_pre_free_pages arch_kexec_pre_free_pages
+
+void arch_kexec_protect_crashkres(void);
+#define arch_kexec_protect_crashkres arch_kexec_protect_crashkres
+
+void arch_kexec_unprotect_crashkres(void);
+#define arch_kexec_unprotect_crashkres arch_kexec_unprotect_crashkres
+
+#ifdef CONFIG_KEXEC_FILE
+struct purgatory_info;
+int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
+ Elf_Shdr *section,
+ const Elf_Shdr *relsec,
+ const Elf_Shdr *symtab);
+#define arch_kexec_apply_relocations_add arch_kexec_apply_relocations_add
+
+int arch_kimage_file_post_load_cleanup(struct kimage *image);
+#define arch_kimage_file_post_load_cleanup arch_kimage_file_post_load_cleanup
+#endif
+#endif
+
+extern void kdump_nmi_shootdown_cpus(void);
+
+#ifdef CONFIG_CRASH_HOTPLUG
+void arch_crash_handle_hotplug_event(struct kimage *image, void *arg);
+#define arch_crash_handle_hotplug_event arch_crash_handle_hotplug_event
+
+int arch_crash_hotplug_support(struct kimage *image, unsigned long kexec_flags);
+#define arch_crash_hotplug_support arch_crash_hotplug_support
+
+unsigned int arch_crash_get_elfcorehdr_size(void);
+#define crash_get_elfcorehdr_size arch_crash_get_elfcorehdr_size
#endif
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_X86_KEXEC_H */
diff --git a/arch/x86/include/asm/kfence.h b/arch/x86/include/asm/kfence.h
new file mode 100644
index 000000000000..ff5c7134a37a
--- /dev/null
+++ b/arch/x86/include/asm/kfence.h
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * x86 KFENCE support.
+ *
+ * Copyright (C) 2020, Google LLC.
+ */
+
+#ifndef _ASM_X86_KFENCE_H
+#define _ASM_X86_KFENCE_H
+
+#ifndef MODULE
+
+#include <linux/bug.h>
+#include <linux/kfence.h>
+
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/set_memory.h>
+#include <asm/tlbflush.h>
+
+/* Force 4K pages for __kfence_pool. */
+static inline bool arch_kfence_init_pool(void)
+{
+ unsigned long addr;
+
+ for (addr = (unsigned long)__kfence_pool; is_kfence_address((void *)addr);
+ addr += PAGE_SIZE) {
+ unsigned int level;
+
+ if (!lookup_address(addr, &level))
+ return false;
+
+ if (level != PG_LEVEL_4K)
+ set_memory_4k(addr, 1);
+ }
+
+ return true;
+}
+
+/* Protect the given page and flush TLB. */
+static inline bool kfence_protect_page(unsigned long addr, bool protect)
+{
+ unsigned int level;
+ pte_t *pte = lookup_address(addr, &level);
+
+ if (WARN_ON(!pte || level != PG_LEVEL_4K))
+ return false;
+
+ /*
+ * We need to avoid IPIs, as we may get KFENCE allocations or faults
+ * with interrupts disabled. Therefore, the below is best-effort, and
+ * does not flush TLBs on all CPUs. We can tolerate some inaccuracy;
+ * lazy fault handling takes care of faults after the page is PRESENT.
+ */
+
+ if (protect)
+ set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
+ else
+ set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
+
+ /*
+ * Flush this CPU's TLB, assuming whoever did the allocation/free is
+ * likely to continue running on this CPU.
+ */
+ preempt_disable();
+ flush_tlb_one_kernel(addr);
+ preempt_enable();
+ return true;
+}
+
+#endif /* !MODULE */
+
+#endif /* _ASM_X86_KFENCE_H */
diff --git a/arch/x86/include/asm/kgdb.h b/arch/x86/include/asm/kgdb.h
index e6c6c808489f..aacaf2502bd2 100644
--- a/arch/x86/include/asm/kgdb.h
+++ b/arch/x86/include/asm/kgdb.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_KGDB_H
#define _ASM_X86_KGDB_H
@@ -6,6 +7,8 @@
* Copyright (C) 2008 Wind River Systems, Inc.
*/
+#include <asm/ptrace.h>
+
/*
* BUFMAX defines the maximum number of characters in inbound/outbound
* buffers at least NUMREGBYTES*2 are needed for register packets
@@ -39,9 +42,11 @@ enum regnames {
GDB_FS, /* 14 */
GDB_GS, /* 15 */
};
+#define GDB_ORIG_AX 41
+#define DBG_MAX_REG_NUM 16
#define NUMREGBYTES ((GDB_GS+1)*4)
#else /* ! CONFIG_X86_32 */
-enum regnames64 {
+enum regnames {
GDB_AX, /* 0 */
GDB_BX, /* 1 */
GDB_CX, /* 2 */
@@ -59,15 +64,19 @@ enum regnames64 {
GDB_R14, /* 14 */
GDB_R15, /* 15 */
GDB_PC, /* 16 */
+ GDB_PS, /* 17 */
+ GDB_CS, /* 18 */
+ GDB_SS, /* 19 */
+ GDB_DS, /* 20 */
+ GDB_ES, /* 21 */
+ GDB_FS, /* 22 */
+ GDB_GS, /* 23 */
};
-
-enum regnames32 {
- GDB_PS = 34,
- GDB_CS,
- GDB_SS,
-};
-#define NUMREGBYTES ((GDB_SS+1)*4)
-#endif /* CONFIG_X86_32 */
+#define GDB_ORIG_AX 57
+#define DBG_MAX_REG_NUM 24
+/* 17 64-bit regs and 5 32-bit regs */
+#define NUMREGBYTES ((17 * 8) + (5 * 4))
+#endif /* ! CONFIG_X86_32 */
static inline void arch_kgdb_breakpoint(void)
{
@@ -75,5 +84,9 @@ static inline void arch_kgdb_breakpoint(void)
}
#define BREAK_INSTR_SIZE 1
#define CACHE_FLUSH_IS_SAFE 1
+#define GDB_ADJUSTS_BREAK_OFFSET
+
+extern int kgdb_ll_trap(int cmd, const char *str,
+ struct pt_regs *regs, long err, int trap, int sig);
#endif /* _ASM_X86_KGDB_H */
diff --git a/arch/x86/include/asm/kmap_types.h b/arch/x86/include/asm/kmap_types.h
deleted file mode 100644
index 9e00a731a7fb..000000000000
--- a/arch/x86/include/asm/kmap_types.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef _ASM_X86_KMAP_TYPES_H
-#define _ASM_X86_KMAP_TYPES_H
-
-#if defined(CONFIG_X86_32) && defined(CONFIG_DEBUG_HIGHMEM)
-#define __WITH_KM_FENCE
-#endif
-
-#include <asm-generic/kmap_types.h>
-
-#undef __WITH_KM_FENCE
-
-#endif /* _ASM_X86_KMAP_TYPES_H */
diff --git a/arch/x86/include/asm/kmemcheck.h b/arch/x86/include/asm/kmemcheck.h
deleted file mode 100644
index ed01518f297e..000000000000
--- a/arch/x86/include/asm/kmemcheck.h
+++ /dev/null
@@ -1,42 +0,0 @@
-#ifndef ASM_X86_KMEMCHECK_H
-#define ASM_X86_KMEMCHECK_H
-
-#include <linux/types.h>
-#include <asm/ptrace.h>
-
-#ifdef CONFIG_KMEMCHECK
-bool kmemcheck_active(struct pt_regs *regs);
-
-void kmemcheck_show(struct pt_regs *regs);
-void kmemcheck_hide(struct pt_regs *regs);
-
-bool kmemcheck_fault(struct pt_regs *regs,
- unsigned long address, unsigned long error_code);
-bool kmemcheck_trap(struct pt_regs *regs);
-#else
-static inline bool kmemcheck_active(struct pt_regs *regs)
-{
- return false;
-}
-
-static inline void kmemcheck_show(struct pt_regs *regs)
-{
-}
-
-static inline void kmemcheck_hide(struct pt_regs *regs)
-{
-}
-
-static inline bool kmemcheck_fault(struct pt_regs *regs,
- unsigned long address, unsigned long error_code)
-{
- return false;
-}
-
-static inline bool kmemcheck_trap(struct pt_regs *regs)
-{
- return false;
-}
-#endif /* CONFIG_KMEMCHECK */
-
-#endif
diff --git a/arch/x86/include/asm/kmsan.h b/arch/x86/include/asm/kmsan.h
new file mode 100644
index 000000000000..d91b37f5b4bb
--- /dev/null
+++ b/arch/x86/include/asm/kmsan.h
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * x86 KMSAN support.
+ *
+ * Copyright (C) 2022, Google LLC
+ * Author: Alexander Potapenko <glider@google.com>
+ */
+
+#ifndef _ASM_X86_KMSAN_H
+#define _ASM_X86_KMSAN_H
+
+#ifndef MODULE
+
+#include <asm/cpu_entry_area.h>
+#include <asm/processor.h>
+#include <linux/mmzone.h>
+
+DECLARE_PER_CPU(char[CPU_ENTRY_AREA_SIZE], cpu_entry_area_shadow);
+DECLARE_PER_CPU(char[CPU_ENTRY_AREA_SIZE], cpu_entry_area_origin);
+
+/*
+ * Functions below are declared in the header to make sure they are inlined.
+ * They all are called from kmsan_get_metadata() for every memory access in
+ * the kernel, so speed is important here.
+ */
+
+/*
+ * Compute metadata addresses for the CPU entry area on x86.
+ */
+static inline void *arch_kmsan_get_meta_or_null(void *addr, bool is_origin)
+{
+ unsigned long addr64 = (unsigned long)addr;
+ char *metadata_array;
+ unsigned long off;
+ int cpu;
+
+ if ((addr64 < CPU_ENTRY_AREA_BASE) ||
+ (addr64 >= (CPU_ENTRY_AREA_BASE + CPU_ENTRY_AREA_MAP_SIZE)))
+ return NULL;
+ cpu = (addr64 - CPU_ENTRY_AREA_BASE) / CPU_ENTRY_AREA_SIZE;
+ off = addr64 - (unsigned long)get_cpu_entry_area(cpu);
+ if ((off < 0) || (off >= CPU_ENTRY_AREA_SIZE))
+ return NULL;
+ metadata_array = is_origin ? cpu_entry_area_origin :
+ cpu_entry_area_shadow;
+ return &per_cpu(metadata_array[off], cpu);
+}
+
+/*
+ * Taken from arch/x86/mm/physaddr.h to avoid using an instrumented version.
+ */
+static inline bool kmsan_phys_addr_valid(unsigned long addr)
+{
+ if (IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
+ return !(addr >> boot_cpu_data.x86_phys_bits);
+ else
+ return true;
+}
+
+/*
+ * Taken from arch/x86/mm/physaddr.c to avoid using an instrumented version.
+ */
+static inline bool kmsan_virt_addr_valid(void *addr)
+{
+ unsigned long x = (unsigned long)addr;
+ unsigned long y = x - __START_KERNEL_map;
+ bool ret;
+
+ /* use the carry flag to determine if x was < __START_KERNEL_map */
+ if (unlikely(x > y)) {
+ x = y + phys_base;
+
+ if (y >= KERNEL_IMAGE_SIZE)
+ return false;
+ } else {
+ x = y + (__START_KERNEL_map - PAGE_OFFSET);
+
+ /* carry flag will be set if starting x was >= PAGE_OFFSET */
+ if ((x > y) || !kmsan_phys_addr_valid(x))
+ return false;
+ }
+
+ /*
+ * pfn_valid() relies on RCU, and may call into the scheduler on exiting
+ * the critical section. However, this would result in recursion with
+ * KMSAN. Therefore, disable preemption here, and re-enable preemption
+ * below while suppressing reschedules to avoid recursion.
+ *
+ * Note, this occasionally sacrifices scheduling guarantees. That said,
+ * a kernel compiled with KMSAN has already given up on any performance
+ * guarantees due to being heavily instrumented.
+ */
+ preempt_disable();
+ ret = pfn_valid(x >> PAGE_SHIFT);
+ preempt_enable_no_resched();
+
+ return ret;
+}
+
+#endif /* !MODULE */
+
+#endif /* _ASM_X86_KMSAN_H */
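The helpers above are consumed by KMSAN's generic metadata lookup, which the header comment says is run for every kernel memory access. A minimal sketch of such a caller, assuming the arch hook is simply tried before the generic shadow/origin lookup (the sketch below is hypothetical; the real kmsan_get_metadata() lives in mm/kmsan/ and is more involved):

    /* Sketch only, not the kernel's implementation. */
    static void *kmsan_get_metadata_sketch(void *addr, bool is_origin)
    {
    	void *meta = arch_kmsan_get_meta_or_null(addr, is_origin);

    	if (meta)				/* address was inside the CPU entry area */
    		return meta;
    	if (!kmsan_virt_addr_valid(addr))
    		return NULL;			/* not a mapped kernel address */
    	/* ... fall back to the generic page-based shadow/origin lookup ... */
    	return NULL;
    }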
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 4fe681de1e76..5939694dfb28 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -1,29 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _ASM_X86_KPROBES_H
#define _ASM_X86_KPROBES_H
/*
* Kernel Probes (KProbes)
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
* Copyright (C) IBM Corporation, 2002, 2004
*
* See arch/x86/kernel/kprobes.c for x86 kprobes history.
*/
+
+#include <asm-generic/kprobes.h>
+
+#ifdef CONFIG_KPROBES
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/percpu.h>
+#include <asm/text-patching.h>
+#include <asm/insn.h>
#define __ARCH_WANT_KPROBES_INSN_SLOT
@@ -31,39 +24,79 @@ struct pt_regs;
struct kprobe;
typedef u8 kprobe_opcode_t;
-#define BREAKPOINT_INSTRUCTION 0xcc
-#define RELATIVEJUMP_INSTRUCTION 0xe9
-#define MAX_INSN_SIZE 16
+
#define MAX_STACK_SIZE 64
-#define MIN_STACK_SIZE(ADDR) \
- (((MAX_STACK_SIZE) < (((unsigned long)current_thread_info()) + \
- THREAD_SIZE - (unsigned long)(ADDR))) \
- ? (MAX_STACK_SIZE) \
- : (((unsigned long)current_thread_info()) + \
- THREAD_SIZE - (unsigned long)(ADDR)))
+#define CUR_STACK_SIZE(ADDR) \
+ (current_top_of_stack() - (unsigned long)(ADDR))
+#define MIN_STACK_SIZE(ADDR) \
+ (MAX_STACK_SIZE < CUR_STACK_SIZE(ADDR) ? \
+ MAX_STACK_SIZE : CUR_STACK_SIZE(ADDR))
#define flush_insn_slot(p) do { } while (0)
+/* optinsn template addresses */
+extern __visible kprobe_opcode_t optprobe_template_entry[];
+extern __visible kprobe_opcode_t optprobe_template_clac[];
+extern __visible kprobe_opcode_t optprobe_template_val[];
+extern __visible kprobe_opcode_t optprobe_template_call[];
+extern __visible kprobe_opcode_t optprobe_template_end[];
+#define MAX_OPTIMIZED_LENGTH (MAX_INSN_SIZE + DISP32_SIZE)
+#define MAX_OPTINSN_SIZE \
+ (((unsigned long)optprobe_template_end - \
+ (unsigned long)optprobe_template_entry) + \
+ MAX_OPTIMIZED_LENGTH + JMP32_INSN_SIZE)
+
extern const int kretprobe_blacklist_size;
void arch_remove_kprobe(struct kprobe *p);
-void kretprobe_trampoline(void);
/* Architecture specific copy of original instruction*/
struct arch_specific_insn {
/* copy of the original instruction */
kprobe_opcode_t *insn;
/*
- * boostable = -1: This instruction type is not boostable.
- * boostable = 0: This instruction type is boostable.
+ * boostable = 0: This instruction type is not boostable.
* boostable = 1: This instruction has been boosted: we have
* added a relative jump after the instruction copy in insn,
* so no single-step and fixup are needed (unless there's
- * a post_handler or break_handler).
+ * a post_handler).
*/
- int boostable;
+ unsigned boostable:1;
+ unsigned char size; /* The size of insn */
+ union {
+ unsigned char opcode;
+ struct {
+ unsigned char type;
+ } jcc;
+ struct {
+ unsigned char type;
+ unsigned char asize;
+ } loop;
+ struct {
+ unsigned char reg;
+ } indirect;
+ };
+ s32 rel32; /* relative offset must be s32, s16, or s8 */
+ void (*emulate_op)(struct kprobe *p, struct pt_regs *regs);
+ /* Number of bytes of text poked */
+ int tp_len;
};
+struct arch_optimized_insn {
+ /* copy of the original instructions */
+ kprobe_opcode_t copied_insn[DISP32_SIZE];
+ /* detour code buffer */
+ kprobe_opcode_t *insn;
+ /* the size of instructions copied to detour code buffer */
+ size_t size;
+};
+
+/* Return true (!0) if optinsn is prepared for optimization. */
+static inline int arch_prepared_optinsn(struct arch_optimized_insn *optinsn)
+{
+ return optinsn->size;
+}
+
struct prev_kprobe {
struct kprobe *kp;
unsigned long status;
@@ -76,13 +109,15 @@ struct kprobe_ctlblk {
unsigned long kprobe_status;
unsigned long kprobe_old_flags;
unsigned long kprobe_saved_flags;
- unsigned long *jprobe_saved_sp;
- struct pt_regs jprobe_saved_regs;
- kprobe_opcode_t jprobes_stack[MAX_STACK_SIZE];
struct prev_kprobe prev_kprobe;
};
extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
-extern int kprobe_exceptions_notify(struct notifier_block *self,
- unsigned long val, void *data);
+extern int kprobe_int3_handler(struct pt_regs *regs);
+
+#else
+
+static inline int kprobe_debug_handler(struct pt_regs *regs) { return 0; }
+
+#endif /* CONFIG_KPROBES */
#endif /* _ASM_X86_KPROBES_H */
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
new file mode 100644
index 000000000000..de709fb5bd76
--- /dev/null
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -0,0 +1,154 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if !defined(KVM_X86_OP) || !defined(KVM_X86_OP_OPTIONAL)
+BUILD_BUG_ON(1)
+#endif
+
+/*
+ * KVM_X86_OP() and KVM_X86_OP_OPTIONAL() are used to help generate
+ * both DECLARE/DEFINE_STATIC_CALL() invocations and
+ * "static_call_update()" calls.
+ *
+ * KVM_X86_OP_OPTIONAL() can be used for those functions that can have
+ * a NULL definition. KVM_X86_OP_OPTIONAL_RET0() can be used likewise
+ * to make a definition optional, but in this case the default will
+ * be __static_call_return0.
+ */
+KVM_X86_OP(check_processor_compatibility)
+KVM_X86_OP(enable_virtualization_cpu)
+KVM_X86_OP(disable_virtualization_cpu)
+KVM_X86_OP(hardware_unsetup)
+KVM_X86_OP(has_emulated_msr)
+KVM_X86_OP(vcpu_after_set_cpuid)
+KVM_X86_OP(vm_init)
+KVM_X86_OP_OPTIONAL(vm_destroy)
+KVM_X86_OP_OPTIONAL(vm_pre_destroy)
+KVM_X86_OP_OPTIONAL_RET0(vcpu_precreate)
+KVM_X86_OP(vcpu_create)
+KVM_X86_OP(vcpu_free)
+KVM_X86_OP(vcpu_reset)
+KVM_X86_OP(prepare_switch_to_guest)
+KVM_X86_OP(vcpu_load)
+KVM_X86_OP(vcpu_put)
+KVM_X86_OP(update_exception_bitmap)
+KVM_X86_OP(get_msr)
+KVM_X86_OP(set_msr)
+KVM_X86_OP(get_segment_base)
+KVM_X86_OP(get_segment)
+KVM_X86_OP(get_cpl)
+KVM_X86_OP(get_cpl_no_cache)
+KVM_X86_OP(set_segment)
+KVM_X86_OP(get_cs_db_l_bits)
+KVM_X86_OP(is_valid_cr0)
+KVM_X86_OP(set_cr0)
+KVM_X86_OP_OPTIONAL(post_set_cr3)
+KVM_X86_OP(is_valid_cr4)
+KVM_X86_OP(set_cr4)
+KVM_X86_OP(set_efer)
+KVM_X86_OP(get_idt)
+KVM_X86_OP(set_idt)
+KVM_X86_OP(get_gdt)
+KVM_X86_OP(set_gdt)
+KVM_X86_OP(sync_dirty_debug_regs)
+KVM_X86_OP(set_dr7)
+KVM_X86_OP(cache_reg)
+KVM_X86_OP(get_rflags)
+KVM_X86_OP(set_rflags)
+KVM_X86_OP(get_if_flag)
+KVM_X86_OP(flush_tlb_all)
+KVM_X86_OP(flush_tlb_current)
+#if IS_ENABLED(CONFIG_HYPERV)
+KVM_X86_OP_OPTIONAL(flush_remote_tlbs)
+KVM_X86_OP_OPTIONAL(flush_remote_tlbs_range)
+#endif
+KVM_X86_OP(flush_tlb_gva)
+KVM_X86_OP(flush_tlb_guest)
+KVM_X86_OP(vcpu_pre_run)
+KVM_X86_OP(vcpu_run)
+KVM_X86_OP(handle_exit)
+KVM_X86_OP(skip_emulated_instruction)
+KVM_X86_OP_OPTIONAL(update_emulated_instruction)
+KVM_X86_OP(set_interrupt_shadow)
+KVM_X86_OP(get_interrupt_shadow)
+KVM_X86_OP(patch_hypercall)
+KVM_X86_OP(inject_irq)
+KVM_X86_OP(inject_nmi)
+KVM_X86_OP_OPTIONAL_RET0(is_vnmi_pending)
+KVM_X86_OP_OPTIONAL_RET0(set_vnmi_pending)
+KVM_X86_OP(inject_exception)
+KVM_X86_OP(cancel_injection)
+KVM_X86_OP(interrupt_allowed)
+KVM_X86_OP(nmi_allowed)
+KVM_X86_OP(get_nmi_mask)
+KVM_X86_OP(set_nmi_mask)
+KVM_X86_OP(enable_nmi_window)
+KVM_X86_OP(enable_irq_window)
+KVM_X86_OP_OPTIONAL(update_cr8_intercept)
+KVM_X86_OP(refresh_apicv_exec_ctrl)
+KVM_X86_OP_OPTIONAL(hwapic_isr_update)
+KVM_X86_OP_OPTIONAL(load_eoi_exitmap)
+KVM_X86_OP_OPTIONAL(set_virtual_apic_mode)
+KVM_X86_OP_OPTIONAL(set_apic_access_page_addr)
+KVM_X86_OP(deliver_interrupt)
+KVM_X86_OP_OPTIONAL(sync_pir_to_irr)
+KVM_X86_OP_OPTIONAL_RET0(set_tss_addr)
+KVM_X86_OP_OPTIONAL_RET0(set_identity_map_addr)
+KVM_X86_OP_OPTIONAL_RET0(get_mt_mask)
+KVM_X86_OP(load_mmu_pgd)
+KVM_X86_OP_OPTIONAL(link_external_spt)
+KVM_X86_OP_OPTIONAL(set_external_spte)
+KVM_X86_OP_OPTIONAL(free_external_spt)
+KVM_X86_OP_OPTIONAL(remove_external_spte)
+KVM_X86_OP(has_wbinvd_exit)
+KVM_X86_OP(get_l2_tsc_offset)
+KVM_X86_OP(get_l2_tsc_multiplier)
+KVM_X86_OP(write_tsc_offset)
+KVM_X86_OP(write_tsc_multiplier)
+KVM_X86_OP(get_exit_info)
+KVM_X86_OP(get_entry_info)
+KVM_X86_OP(check_intercept)
+KVM_X86_OP(handle_exit_irqoff)
+KVM_X86_OP_OPTIONAL(update_cpu_dirty_logging)
+KVM_X86_OP_OPTIONAL(vcpu_blocking)
+KVM_X86_OP_OPTIONAL(vcpu_unblocking)
+KVM_X86_OP_OPTIONAL(pi_update_irte)
+KVM_X86_OP_OPTIONAL(pi_start_bypass)
+KVM_X86_OP_OPTIONAL(apicv_pre_state_restore)
+KVM_X86_OP_OPTIONAL(apicv_post_state_restore)
+KVM_X86_OP_OPTIONAL_RET0(dy_apicv_has_pending_interrupt)
+KVM_X86_OP_OPTIONAL(protected_apic_has_interrupt)
+KVM_X86_OP_OPTIONAL(set_hv_timer)
+KVM_X86_OP_OPTIONAL(cancel_hv_timer)
+KVM_X86_OP(setup_mce)
+#ifdef CONFIG_KVM_SMM
+KVM_X86_OP(smi_allowed)
+KVM_X86_OP(enter_smm)
+KVM_X86_OP(leave_smm)
+KVM_X86_OP(enable_smi_window)
+#endif
+KVM_X86_OP_OPTIONAL(dev_get_attr)
+KVM_X86_OP_OPTIONAL(mem_enc_ioctl)
+KVM_X86_OP_OPTIONAL(vcpu_mem_enc_ioctl)
+KVM_X86_OP_OPTIONAL(vcpu_mem_enc_unlocked_ioctl)
+KVM_X86_OP_OPTIONAL(mem_enc_register_region)
+KVM_X86_OP_OPTIONAL(mem_enc_unregister_region)
+KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from)
+KVM_X86_OP_OPTIONAL(vm_move_enc_context_from)
+KVM_X86_OP_OPTIONAL(guest_memory_reclaimed)
+KVM_X86_OP(get_feature_msr)
+KVM_X86_OP(check_emulate_instruction)
+KVM_X86_OP(apic_init_signal_blocked)
+KVM_X86_OP_OPTIONAL(enable_l2_tlb_flush)
+KVM_X86_OP_OPTIONAL(migrate_timers)
+KVM_X86_OP(recalc_intercepts)
+KVM_X86_OP(complete_emulated_msr)
+KVM_X86_OP(vcpu_deliver_sipi_vector)
+KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons);
+KVM_X86_OP_OPTIONAL(get_untagged_addr)
+KVM_X86_OP_OPTIONAL(alloc_apic_backing_page)
+KVM_X86_OP_OPTIONAL_RET0(gmem_prepare)
+KVM_X86_OP_OPTIONAL_RET0(gmem_max_mapping_level)
+KVM_X86_OP_OPTIONAL(gmem_invalidate)
+
+#undef KVM_X86_OP
+#undef KVM_X86_OP_OPTIONAL
+#undef KVM_X86_OP_OPTIONAL_RET0
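Because the header #undefs the macros at its end, every include site must (re)define them first. A hedged sketch of one such consumer, generating a static-call declaration per op (the actual definitions in kvm_host.h differ in detail, e.g. in how optional and RET0 ops are treated):

    /* Sketch only: declare one static call per vendor op. */
    #define KVM_X86_OP(func) \
    	DECLARE_STATIC_CALL(kvm_x86_##func, *(((struct kvm_x86_ops *)0)->func));
    #define KVM_X86_OP_OPTIONAL		KVM_X86_OP
    #define KVM_X86_OP_OPTIONAL_RET0	KVM_X86_OP
    #include <asm/kvm-x86-ops.h>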
diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h
new file mode 100644
index 000000000000..9159bf1a4730
--- /dev/null
+++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if !defined(KVM_X86_PMU_OP) || !defined(KVM_X86_PMU_OP_OPTIONAL)
+BUILD_BUG_ON(1)
+#endif
+
+/*
+ * KVM_X86_PMU_OP() and KVM_X86_PMU_OP_OPTIONAL() are used to help generate
+ * both DECLARE/DEFINE_STATIC_CALL() invocations and
+ * "static_call_update()" calls.
+ *
+ * KVM_X86_PMU_OP_OPTIONAL() can be used for those functions that can have
+ * a NULL definition.
+ */
+KVM_X86_PMU_OP(rdpmc_ecx_to_pmc)
+KVM_X86_PMU_OP(msr_idx_to_pmc)
+KVM_X86_PMU_OP_OPTIONAL(check_rdpmc_early)
+KVM_X86_PMU_OP(is_valid_msr)
+KVM_X86_PMU_OP(get_msr)
+KVM_X86_PMU_OP(set_msr)
+KVM_X86_PMU_OP(refresh)
+KVM_X86_PMU_OP(init)
+KVM_X86_PMU_OP_OPTIONAL(reset)
+KVM_X86_PMU_OP_OPTIONAL(deliver_pmi)
+KVM_X86_PMU_OP_OPTIONAL(cleanup)
+
+#undef KVM_X86_PMU_OP
+#undef KVM_X86_PMU_OP_OPTIONAL
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
deleted file mode 100644
index 125be8b19568..000000000000
--- a/arch/x86/include/asm/kvm.h
+++ /dev/null
@@ -1,243 +0,0 @@
-#ifndef _ASM_X86_KVM_H
-#define _ASM_X86_KVM_H
-
-/*
- * KVM x86 specific structures and definitions
- *
- */
-
-#include <linux/types.h>
-#include <linux/ioctl.h>
-
-/* Select x86 specific features in <linux/kvm.h> */
-#define __KVM_HAVE_PIT
-#define __KVM_HAVE_IOAPIC
-#define __KVM_HAVE_DEVICE_ASSIGNMENT
-#define __KVM_HAVE_MSI
-#define __KVM_HAVE_USER_NMI
-#define __KVM_HAVE_GUEST_DEBUG
-#define __KVM_HAVE_MSIX
-
-/* Architectural interrupt line count. */
-#define KVM_NR_INTERRUPTS 256
-
-struct kvm_memory_alias {
- __u32 slot; /* this has a different namespace than memory slots */
- __u32 flags;
- __u64 guest_phys_addr;
- __u64 memory_size;
- __u64 target_phys_addr;
-};
-
-/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */
-struct kvm_pic_state {
- __u8 last_irr; /* edge detection */
- __u8 irr; /* interrupt request register */
- __u8 imr; /* interrupt mask register */
- __u8 isr; /* interrupt service register */
- __u8 priority_add; /* highest irq priority */
- __u8 irq_base;
- __u8 read_reg_select;
- __u8 poll;
- __u8 special_mask;
- __u8 init_state;
- __u8 auto_eoi;
- __u8 rotate_on_auto_eoi;
- __u8 special_fully_nested_mode;
- __u8 init4; /* true if 4 byte init */
- __u8 elcr; /* PIIX edge/trigger selection */
- __u8 elcr_mask;
-};
-
-#define KVM_IOAPIC_NUM_PINS 24
-struct kvm_ioapic_state {
- __u64 base_address;
- __u32 ioregsel;
- __u32 id;
- __u32 irr;
- __u32 pad;
- union {
- __u64 bits;
- struct {
- __u8 vector;
- __u8 delivery_mode:3;
- __u8 dest_mode:1;
- __u8 delivery_status:1;
- __u8 polarity:1;
- __u8 remote_irr:1;
- __u8 trig_mode:1;
- __u8 mask:1;
- __u8 reserve:7;
- __u8 reserved[4];
- __u8 dest_id;
- } fields;
- } redirtbl[KVM_IOAPIC_NUM_PINS];
-};
-
-#define KVM_IRQCHIP_PIC_MASTER 0
-#define KVM_IRQCHIP_PIC_SLAVE 1
-#define KVM_IRQCHIP_IOAPIC 2
-
-/* for KVM_GET_REGS and KVM_SET_REGS */
-struct kvm_regs {
- /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
- __u64 rax, rbx, rcx, rdx;
- __u64 rsi, rdi, rsp, rbp;
- __u64 r8, r9, r10, r11;
- __u64 r12, r13, r14, r15;
- __u64 rip, rflags;
-};
-
-/* for KVM_GET_LAPIC and KVM_SET_LAPIC */
-#define KVM_APIC_REG_SIZE 0x400
-struct kvm_lapic_state {
- char regs[KVM_APIC_REG_SIZE];
-};
-
-struct kvm_segment {
- __u64 base;
- __u32 limit;
- __u16 selector;
- __u8 type;
- __u8 present, dpl, db, s, l, g, avl;
- __u8 unusable;
- __u8 padding;
-};
-
-struct kvm_dtable {
- __u64 base;
- __u16 limit;
- __u16 padding[3];
-};
-
-
-/* for KVM_GET_SREGS and KVM_SET_SREGS */
-struct kvm_sregs {
- /* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */
- struct kvm_segment cs, ds, es, fs, gs, ss;
- struct kvm_segment tr, ldt;
- struct kvm_dtable gdt, idt;
- __u64 cr0, cr2, cr3, cr4, cr8;
- __u64 efer;
- __u64 apic_base;
- __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
-};
-
-/* for KVM_GET_FPU and KVM_SET_FPU */
-struct kvm_fpu {
- __u8 fpr[8][16];
- __u16 fcw;
- __u16 fsw;
- __u8 ftwx; /* in fxsave format */
- __u8 pad1;
- __u16 last_opcode;
- __u64 last_ip;
- __u64 last_dp;
- __u8 xmm[16][16];
- __u32 mxcsr;
- __u32 pad2;
-};
-
-struct kvm_msr_entry {
- __u32 index;
- __u32 reserved;
- __u64 data;
-};
-
-/* for KVM_GET_MSRS and KVM_SET_MSRS */
-struct kvm_msrs {
- __u32 nmsrs; /* number of msrs in entries */
- __u32 pad;
-
- struct kvm_msr_entry entries[0];
-};
-
-/* for KVM_GET_MSR_INDEX_LIST */
-struct kvm_msr_list {
- __u32 nmsrs; /* number of msrs in entries */
- __u32 indices[0];
-};
-
-
-struct kvm_cpuid_entry {
- __u32 function;
- __u32 eax;
- __u32 ebx;
- __u32 ecx;
- __u32 edx;
- __u32 padding;
-};
-
-/* for KVM_SET_CPUID */
-struct kvm_cpuid {
- __u32 nent;
- __u32 padding;
- struct kvm_cpuid_entry entries[0];
-};
-
-struct kvm_cpuid_entry2 {
- __u32 function;
- __u32 index;
- __u32 flags;
- __u32 eax;
- __u32 ebx;
- __u32 ecx;
- __u32 edx;
- __u32 padding[3];
-};
-
-#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1
-#define KVM_CPUID_FLAG_STATEFUL_FUNC 2
-#define KVM_CPUID_FLAG_STATE_READ_NEXT 4
-
-/* for KVM_SET_CPUID2 */
-struct kvm_cpuid2 {
- __u32 nent;
- __u32 padding;
- struct kvm_cpuid_entry2 entries[0];
-};
-
-/* for KVM_GET_PIT and KVM_SET_PIT */
-struct kvm_pit_channel_state {
- __u32 count; /* can be 65536 */
- __u16 latched_count;
- __u8 count_latched;
- __u8 status_latched;
- __u8 status;
- __u8 read_state;
- __u8 write_state;
- __u8 write_latch;
- __u8 rw_mode;
- __u8 mode;
- __u8 bcd;
- __u8 gate;
- __s64 count_load_time;
-};
-
-struct kvm_debug_exit_arch {
- __u32 exception;
- __u32 pad;
- __u64 pc;
- __u64 dr6;
- __u64 dr7;
-};
-
-#define KVM_GUESTDBG_USE_SW_BP 0x00010000
-#define KVM_GUESTDBG_USE_HW_BP 0x00020000
-#define KVM_GUESTDBG_INJECT_DB 0x00040000
-#define KVM_GUESTDBG_INJECT_BP 0x00080000
-
-/* for KVM_SET_GUEST_DEBUG */
-struct kvm_guest_debug_arch {
- __u64 debugreg[8];
-};
-
-struct kvm_pit_state {
- struct kvm_pit_channel_state channels[3];
-};
-
-struct kvm_reinject_control {
- __u8 pit_reinject;
- __u8 reserved[31];
-};
-#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index eabdc1cfab5c..5a3bfa293e8b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1,11 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Kernel-based Virtual Machine driver for Linux
*
* This header defines architecture specific interfaces, x86 version
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
*/
#ifndef _ASM_X86_KVM_HOST_H
@@ -14,110 +11,196 @@
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/mmu_notifier.h>
+#include <linux/tracepoint.h>
+#include <linux/cpumask.h>
+#include <linux/irq_work.h>
+#include <linux/irq.h>
+#include <linux/workqueue.h>
#include <linux/kvm.h>
#include <linux/kvm_para.h>
#include <linux/kvm_types.h>
-
+#include <linux/perf_event.h>
+#include <linux/pvclock_gtod.h>
+#include <linux/clocksource.h>
+#include <linux/irqbypass.h>
+#include <linux/kfifo.h>
+#include <linux/sched/vhost_task.h>
+#include <linux/call_once.h>
+#include <linux/atomic.h>
+
+#include <asm/apic.h>
#include <asm/pvclock-abi.h>
+#include <asm/debugreg.h>
#include <asm/desc.h>
#include <asm/mtrr.h>
#include <asm/msr-index.h>
+#include <asm/msr.h>
+#include <asm/asm.h>
+#include <asm/irq_remapping.h>
+#include <asm/kvm_page_track.h>
+#include <asm/kvm_vcpu_regs.h>
+#include <asm/reboot.h>
+#include <hyperv/hvhdk.h>
+
+#define __KVM_HAVE_ARCH_VCPU_DEBUGFS
+
+/*
+ * CONFIG_KVM_MAX_NR_VCPUS is defined iff CONFIG_KVM!=n; provide a dummy max if
+ * KVM is disabled (arbitrarily use the default from CONFIG_KVM_MAX_NR_VCPUS).
+ */
+#ifdef CONFIG_KVM_MAX_NR_VCPUS
+#define KVM_MAX_VCPUS CONFIG_KVM_MAX_NR_VCPUS
+#else
+#define KVM_MAX_VCPUS 1024
+#endif
+
+/*
+ * In x86, the VCPU ID corresponds to the APIC ID, and APIC IDs
+ * might be larger than the actual number of VCPUs because the
+ * APIC ID encodes CPU topology information.
+ *
+ * In the worst case, we'll need less than one extra bit for the
+ * Core ID, and less than one extra bit for the Package (Die) ID,
+ * so a ratio of 4 should be enough.
+ */
+#define KVM_VCPU_ID_RATIO 4
+#define KVM_MAX_VCPU_IDS (KVM_MAX_VCPUS * KVM_VCPU_ID_RATIO)
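For concreteness: with the default KVM_MAX_VCPUS of 1024, KVM_MAX_VCPU_IDS = 1024 * 4 = 4096, i.e. the APIC ID space is allowed to be two bits wider than the vCPU count, matching the one spare bit each for the Core ID and the Package (Die) ID described above.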
+
+/* memory slots that are not exposed to userspace */
+#define KVM_INTERNAL_MEM_SLOTS 3
+
+#define KVM_HALT_POLL_NS_DEFAULT 200000
+
+#define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS
+
+#define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \
+ KVM_DIRTY_LOG_INITIALLY_SET)
+
+#define KVM_BUS_LOCK_DETECTION_VALID_MODE (KVM_BUS_LOCK_DETECTION_OFF | \
+ KVM_BUS_LOCK_DETECTION_EXIT)
+
+#define KVM_X86_NOTIFY_VMEXIT_VALID_BITS (KVM_X86_NOTIFY_VMEXIT_ENABLED | \
+ KVM_X86_NOTIFY_VMEXIT_USER)
+
+/* x86-specific vcpu->requests bit members */
+#define KVM_REQ_MIGRATE_TIMER KVM_ARCH_REQ(0)
+#define KVM_REQ_REPORT_TPR_ACCESS KVM_ARCH_REQ(1)
+#define KVM_REQ_TRIPLE_FAULT KVM_ARCH_REQ(2)
+#define KVM_REQ_MMU_SYNC KVM_ARCH_REQ(3)
+#define KVM_REQ_CLOCK_UPDATE KVM_ARCH_REQ(4)
+#define KVM_REQ_LOAD_MMU_PGD KVM_ARCH_REQ(5)
+#define KVM_REQ_EVENT KVM_ARCH_REQ(6)
+#define KVM_REQ_APF_HALT KVM_ARCH_REQ(7)
+#define KVM_REQ_STEAL_UPDATE KVM_ARCH_REQ(8)
+#define KVM_REQ_NMI KVM_ARCH_REQ(9)
+#define KVM_REQ_PMU KVM_ARCH_REQ(10)
+#define KVM_REQ_PMI KVM_ARCH_REQ(11)
+#ifdef CONFIG_KVM_SMM
+#define KVM_REQ_SMI KVM_ARCH_REQ(12)
+#endif
+#define KVM_REQ_MASTERCLOCK_UPDATE KVM_ARCH_REQ(13)
+#define KVM_REQ_MCLOCK_INPROGRESS \
+ KVM_ARCH_REQ_FLAGS(14, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_SCAN_IOAPIC \
+ KVM_ARCH_REQ_FLAGS(15, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_GLOBAL_CLOCK_UPDATE KVM_ARCH_REQ(16)
+#define KVM_REQ_APIC_PAGE_RELOAD \
+ KVM_ARCH_REQ_FLAGS(17, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_HV_CRASH KVM_ARCH_REQ(18)
+#define KVM_REQ_IOAPIC_EOI_EXIT KVM_ARCH_REQ(19)
+#define KVM_REQ_HV_RESET KVM_ARCH_REQ(20)
+#define KVM_REQ_HV_EXIT KVM_ARCH_REQ(21)
+#define KVM_REQ_HV_STIMER KVM_ARCH_REQ(22)
+#define KVM_REQ_LOAD_EOI_EXITMAP KVM_ARCH_REQ(23)
+#define KVM_REQ_GET_NESTED_STATE_PAGES KVM_ARCH_REQ(24)
+#define KVM_REQ_APICV_UPDATE \
+ KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_TLB_FLUSH_CURRENT KVM_ARCH_REQ(26)
+#define KVM_REQ_TLB_FLUSH_GUEST \
+ KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_APF_READY KVM_ARCH_REQ(28)
+#define KVM_REQ_RECALC_INTERCEPTS KVM_ARCH_REQ(29)
+#define KVM_REQ_UPDATE_CPU_DIRTY_LOGGING \
+ KVM_ARCH_REQ_FLAGS(30, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_MMU_FREE_OBSOLETE_ROOTS \
+ KVM_ARCH_REQ_FLAGS(31, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_HV_TLB_FLUSH \
+ KVM_ARCH_REQ_FLAGS(32, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_UPDATE_PROTECTED_GUEST_STATE \
+ KVM_ARCH_REQ_FLAGS(34, KVM_REQUEST_WAIT)
+
+#define CR0_RESERVED_BITS \
+ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
+ | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
+ | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
+
+#define CR4_RESERVED_BITS \
+ (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
+ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
+ | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \
+ | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \
+ | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_VMXE \
+ | X86_CR4_SMAP | X86_CR4_PKE | X86_CR4_UMIP \
+ | X86_CR4_LAM_SUP | X86_CR4_CET))
+
+#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
+
-#define KVM_MAX_VCPUS 16
-#define KVM_MEMORY_SLOTS 32
-/* memory slots that does not exposed to userspace */
-#define KVM_PRIVATE_MEM_SLOTS 4
-
-#define KVM_PIO_PAGE_OFFSET 1
-#define KVM_COALESCED_MMIO_PAGE_OFFSET 2
-
-#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
-#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
-#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \
- 0xFFFFFF0000000000ULL)
-
-#define KVM_GUEST_CR0_MASK \
- (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE \
- | X86_CR0_NW | X86_CR0_CD)
-#define KVM_VM_CR0_ALWAYS_ON \
- (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE | X86_CR0_TS \
- | X86_CR0_MP)
-#define KVM_GUEST_CR4_MASK \
- (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE)
-#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
-#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
#define INVALID_PAGE (~(hpa_t)0)
-#define UNMAPPED_GVA (~(gpa_t)0)
-
-/* shadow tables are PAE even on non-PAE hosts */
-#define KVM_HPAGE_SHIFT 21
-#define KVM_HPAGE_SIZE (1UL << KVM_HPAGE_SHIFT)
-#define KVM_HPAGE_MASK (~(KVM_HPAGE_SIZE - 1))
-
-#define KVM_PAGES_PER_HPAGE (KVM_HPAGE_SIZE / PAGE_SIZE)
-
-#define DE_VECTOR 0
-#define DB_VECTOR 1
-#define BP_VECTOR 3
-#define OF_VECTOR 4
-#define BR_VECTOR 5
-#define UD_VECTOR 6
-#define NM_VECTOR 7
-#define DF_VECTOR 8
-#define TS_VECTOR 10
-#define NP_VECTOR 11
-#define SS_VECTOR 12
-#define GP_VECTOR 13
-#define PF_VECTOR 14
-#define MF_VECTOR 16
-#define MC_VECTOR 18
-
-#define SELECTOR_TI_MASK (1 << 2)
-#define SELECTOR_RPL_MASK 0x03
-
-#define IOPL_SHIFT 12
-
-#define KVM_ALIAS_SLOTS 4
-
-#define KVM_PERMILLE_MMU_PAGES 20
-#define KVM_MIN_ALLOC_MMU_PAGES 64
-#define KVM_MMU_HASH_SHIFT 10
+#define VALID_PAGE(x) ((x) != INVALID_PAGE)
+
+/* KVM Hugepage definitions for x86 */
+#define KVM_MAX_HUGEPAGE_LEVEL PG_LEVEL_1G
+#define KVM_NR_PAGE_SIZES (KVM_MAX_HUGEPAGE_LEVEL - PG_LEVEL_4K + 1)
+#define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 9)
+#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x))
+#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x))
+#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1))
+#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
+
+#define KVM_MEMSLOT_PAGES_TO_MMU_PAGES_RATIO 50
+#define KVM_MIN_ALLOC_MMU_PAGES 64UL
+#define KVM_MMU_HASH_SHIFT 12
#define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT)
#define KVM_MIN_FREE_MMU_PAGES 5
#define KVM_REFILL_PAGES 25
-#define KVM_MAX_CPUID_ENTRIES 40
-#define KVM_NR_FIXED_MTRR_REGION 88
+#define KVM_MAX_CPUID_ENTRIES 256
#define KVM_NR_VAR_MTRR 8
-extern spinlock_t kvm_lock;
-extern struct list_head vm_list;
-
-struct kvm_vcpu;
-struct kvm;
+#define ASYNC_PF_PER_VCPU 64
enum kvm_reg {
- VCPU_REGS_RAX = 0,
- VCPU_REGS_RCX = 1,
- VCPU_REGS_RDX = 2,
- VCPU_REGS_RBX = 3,
- VCPU_REGS_RSP = 4,
- VCPU_REGS_RBP = 5,
- VCPU_REGS_RSI = 6,
- VCPU_REGS_RDI = 7,
+ VCPU_REGS_RAX = __VCPU_REGS_RAX,
+ VCPU_REGS_RCX = __VCPU_REGS_RCX,
+ VCPU_REGS_RDX = __VCPU_REGS_RDX,
+ VCPU_REGS_RBX = __VCPU_REGS_RBX,
+ VCPU_REGS_RSP = __VCPU_REGS_RSP,
+ VCPU_REGS_RBP = __VCPU_REGS_RBP,
+ VCPU_REGS_RSI = __VCPU_REGS_RSI,
+ VCPU_REGS_RDI = __VCPU_REGS_RDI,
#ifdef CONFIG_X86_64
- VCPU_REGS_R8 = 8,
- VCPU_REGS_R9 = 9,
- VCPU_REGS_R10 = 10,
- VCPU_REGS_R11 = 11,
- VCPU_REGS_R12 = 12,
- VCPU_REGS_R13 = 13,
- VCPU_REGS_R14 = 14,
- VCPU_REGS_R15 = 15,
+ VCPU_REGS_R8 = __VCPU_REGS_R8,
+ VCPU_REGS_R9 = __VCPU_REGS_R9,
+ VCPU_REGS_R10 = __VCPU_REGS_R10,
+ VCPU_REGS_R11 = __VCPU_REGS_R11,
+ VCPU_REGS_R12 = __VCPU_REGS_R12,
+ VCPU_REGS_R13 = __VCPU_REGS_R13,
+ VCPU_REGS_R14 = __VCPU_REGS_R14,
+ VCPU_REGS_R15 = __VCPU_REGS_R15,
#endif
VCPU_REGS_RIP,
- NR_VCPU_REGS
+ NR_VCPU_REGS,
+
+ VCPU_EXREG_PDPTR = NR_VCPU_REGS,
+ VCPU_EXREG_CR0,
+ VCPU_EXREG_CR3,
+ VCPU_EXREG_CR4,
+ VCPU_EXREG_RFLAGS,
+ VCPU_EXREG_SEGMENTS,
+ VCPU_EXREG_EXIT_INFO_1,
+ VCPU_EXREG_EXIT_INFO_2,
};
enum {
@@ -131,141 +214,574 @@ enum {
VCPU_SREG_LDTR,
};
-#include <asm/kvm_x86_emulate.h>
+enum exit_fastpath_completion {
+ EXIT_FASTPATH_NONE,
+ EXIT_FASTPATH_REENTER_GUEST,
+ EXIT_FASTPATH_EXIT_HANDLED,
+ EXIT_FASTPATH_EXIT_USERSPACE,
+};
+typedef enum exit_fastpath_completion fastpath_t;
-#define KVM_NR_MEM_OBJS 40
+struct x86_emulate_ctxt;
+struct x86_exception;
+union kvm_smram;
+enum x86_intercept;
+enum x86_intercept_stage;
#define KVM_NR_DB_REGS 4
+#define DR6_BUS_LOCK (1 << 11)
#define DR6_BD (1 << 13)
#define DR6_BS (1 << 14)
-#define DR6_FIXED_1 0xffff0ff0
-#define DR6_VOLATILE 0x0000e00f
+#define DR6_BT (1 << 15)
+#define DR6_RTM (1 << 16)
+/*
+ * DR6_ACTIVE_LOW combines fixed-1 and active-low bits.
+ * We can regard all the bits in DR6_FIXED_1 as active_low bits;
+ * they will never be 0 for now, but when they are defined
+ * in the future it will require no code change.
+ *
+ * DR6_ACTIVE_LOW is also used as the init/reset value for DR6.
+ */
+#define DR6_ACTIVE_LOW 0xffff0ff0
+#define DR6_VOLATILE 0x0001e80f
+#define DR6_FIXED_1 (DR6_ACTIVE_LOW & ~DR6_VOLATILE)
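A minimal sketch of what the active-low convention means when folding a #DB payload into DR6, assuming a payload encoding where a set bit means "this bit differs from its non-triggered value" (illustrative only, not KVM's exception-delivery code):

    static unsigned long dr6_apply_payload(unsigned long dr6, unsigned long payload)
    {
    	dr6 |= payload;				/* raise the active-high bits */
    	dr6 ^= payload & DR6_ACTIVE_LOW;	/* lower the active-low bits */
    	return dr6;
    }

Starting from the init value DR6_ACTIVE_LOW, a payload of DR6_BS simply sets BS, while a payload bit that falls inside DR6_ACTIVE_LOW (e.g. DR6_RTM) clears the corresponding active-low bit.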
#define DR7_BP_EN_MASK 0x000000ff
#define DR7_GE (1 << 9)
#define DR7_GD (1 << 13)
-#define DR7_FIXED_1 0x00000400
-#define DR7_VOLATILE 0xffff23ff
+#define DR7_VOLATILE 0xffff2bff
+
+#define KVM_GUESTDBG_VALID_MASK \
+ (KVM_GUESTDBG_ENABLE | \
+ KVM_GUESTDBG_SINGLESTEP | \
+ KVM_GUESTDBG_USE_HW_BP | \
+ KVM_GUESTDBG_USE_SW_BP | \
+ KVM_GUESTDBG_INJECT_BP | \
+ KVM_GUESTDBG_INJECT_DB | \
+ KVM_GUESTDBG_BLOCKIRQ)
+
+#define PFERR_PRESENT_MASK BIT(0)
+#define PFERR_WRITE_MASK BIT(1)
+#define PFERR_USER_MASK BIT(2)
+#define PFERR_RSVD_MASK BIT(3)
+#define PFERR_FETCH_MASK BIT(4)
+#define PFERR_PK_MASK BIT(5)
+#define PFERR_SS_MASK BIT(6)
+#define PFERR_SGX_MASK BIT(15)
+#define PFERR_GUEST_RMP_MASK BIT_ULL(31)
+#define PFERR_GUEST_FINAL_MASK BIT_ULL(32)
+#define PFERR_GUEST_PAGE_MASK BIT_ULL(33)
+#define PFERR_GUEST_ENC_MASK BIT_ULL(34)
+#define PFERR_GUEST_SIZEM_MASK BIT_ULL(35)
+#define PFERR_GUEST_VMPL_MASK BIT_ULL(36)
/*
- * We don't want allocation failures within the mmu code, so we preallocate
- * enough memory for a single page fault in a cache.
+ * IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP checks
+ * when emulating instructions that triggers implicit access.
*/
-struct kvm_mmu_memory_cache {
- int nobjs;
- void *objects[KVM_NR_MEM_OBJS];
-};
+#define PFERR_IMPLICIT_ACCESS BIT_ULL(48)
+/*
+ * PRIVATE_ACCESS is a KVM-defined flag used to indicate that a fault occurred
+ * when the guest was accessing private memory.
+ */
+#define PFERR_PRIVATE_ACCESS BIT_ULL(49)
+#define PFERR_SYNTHETIC_MASK (PFERR_IMPLICIT_ACCESS | PFERR_PRIVATE_ACCESS)
-#define NR_PTE_CHAIN_ENTRIES 5
+/* apic attention bits */
+#define KVM_APIC_CHECK_VAPIC 0
+/*
+ * The following bit is set with PV-EOI, unset on EOI.
+ * We detect PV-EOI changes by guest by comparing
+ * this bit with PV-EOI in guest memory.
+ * See the implementation in apic_update_pv_eoi.
+ */
+#define KVM_APIC_PV_EOI_PENDING 1
-struct kvm_pte_chain {
- u64 *parent_ptes[NR_PTE_CHAIN_ENTRIES];
- struct hlist_node link;
-};
+struct kvm_kernel_irqfd;
+struct kvm_kernel_irq_routing_entry;
/*
- * kvm_mmu_page_role, below, is defined as:
+ * kvm_mmu_page_role tracks the properties of a shadow page (where shadow page
+ * also includes TDP pages) to determine whether or not a page can be used in
+ * the given MMU context. This is a subset of the overall kvm_cpu_role to
+ * minimize the size of kvm_memory_slot.arch.gfn_write_track, i.e. allows
+ * allocating 2 bytes per gfn instead of 4 bytes per gfn.
+ *
+ * Upper-level shadow pages having gptes are tracked for write-protection via
+ * gfn_write_track. As above, gfn_write_track is a 16 bit counter, so KVM must
+ * not create more than 2^16-1 upper-level shadow pages at a single gfn,
+ * otherwise gfn_write_track will overflow and explosions will ensue.
+ *
+ * A unique shadow page (SP) for a gfn is created if and only if an existing SP
+ * cannot be reused. The ability to reuse a SP is tracked by its role, which
+ * incorporates various mode bits and properties of the SP. Roughly speaking,
+ * the number of unique SPs that can theoretically be created is 2^n, where n
+ * is the number of bits that are used to compute the role.
+ *
+ * But, even though there are 20 bits in the mask below, not all combinations
+ * of modes and flags are possible:
+ *
+ * - invalid shadow pages are not accounted, mirror pages are not shadowed,
+ * so the bits are effectively 18.
+ *
+ * - quadrant will only be used if has_4_byte_gpte=1 (non-PAE paging);
+ * execonly and ad_disabled are only used for nested EPT which has
+ * has_4_byte_gpte=0. Therefore, 2 bits are always unused.
+ *
+ * - the 4 bits of level are effectively limited to the values 2/3/4/5,
+ * as 4k SPs are not tracked (allowed to go unsync). In addition non-PAE
+ * paging has exactly one upper level, making level completely redundant
+ * when has_4_byte_gpte=1.
*
- * bits 0:3 - total guest paging levels (2-4, or zero for real mode)
- * bits 4:7 - page table level for this shadow (1-4)
- * bits 8:9 - page table quadrant for 2-level guests
- * bit 16 - direct mapping of virtual to physical mapping at gfn
- * used for real mode and two-dimensional paging
- * bits 17:19 - common access permissions for all ptes in this shadow page
+ * - on top of this, smep_andnot_wp and smap_andnot_wp are only set if
+ * cr0_wp=0, therefore these three bits only give rise to 5 possibilities.
+ *
+ * Therefore, the maximum number of possible upper-level shadow pages for a
+ * single gfn is a bit less than 2^13.
*/
union kvm_mmu_page_role {
- unsigned word;
+ u32 word;
struct {
- unsigned glevels:4;
unsigned level:4;
+ unsigned has_4_byte_gpte:1;
unsigned quadrant:2;
- unsigned pad_for_nice_hex_output:6;
unsigned direct:1;
unsigned access:3;
unsigned invalid:1;
- unsigned cr4_pge:1;
- unsigned nxe:1;
+ unsigned efer_nx:1;
+ unsigned cr0_wp:1;
+ unsigned smep_andnot_wp:1;
+ unsigned smap_andnot_wp:1;
+ unsigned ad_disabled:1;
+ unsigned guest_mode:1;
+ unsigned passthrough:1;
+ unsigned is_mirror:1;
+ unsigned :4;
+
+ /*
+ * This is left at the top of the word so that
+ * kvm_memslots_for_spte_role can extract it with a
+ * simple shift. While there is room, give it a whole
+ * byte so it is also faster to load it from memory.
+ */
+ unsigned smm:8;
};
};
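Because x86 allocates bitfields starting from the least significant bit, the 24 bits declared ahead of smm (including the 4 unnamed padding bits) leave it in the top byte of word, so the "simple shift" mentioned in the comment is literally a shift by 24. An illustrative sketch, not the kernel's helper:

    /* Sketch: pull the smm byte out of the packed role word. */
    static inline unsigned int role_smm_byte(union kvm_mmu_page_role role)
    {
    	return role.word >> 24;
    }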
-struct kvm_mmu_page {
- struct list_head link;
- struct hlist_node hash_link;
-
- struct list_head oos_link;
-
- /*
- * The following two entries are used to key the shadow page in the
- * hash table.
- */
- gfn_t gfn;
- union kvm_mmu_page_role role;
+/*
+ * kvm_mmu_extended_role complements kvm_mmu_page_role, tracking properties
+ * relevant to the current MMU configuration. When loading CR0, CR4, or EFER,
+ * including on nested transitions, if nothing in the full role changes then
+ * MMU re-configuration can be skipped. The @valid bit is set on first usage
+ * so we don't treat an all-zero structure as valid data.
+ *
+ * The properties that are tracked in the extended role but not the page role
+ * are for things that either (a) do not affect the validity of the shadow page
+ * or (b) are indirectly reflected in the shadow page's role. For example,
+ * CR4.PKE only affects permission checks for software walks of the guest page
+ * tables (because KVM doesn't support Protection Keys with shadow paging), and
+ * CR0.PG, CR4.PAE, and CR4.PSE are indirectly reflected in role.level.
+ *
+ * Note, SMEP and SMAP are not redundant with sm*p_andnot_wp in the page role.
+ * If CR0.WP=1, KVM can reuse shadow pages for the guest regardless of SMEP and
+ * SMAP, but the MMU's permission checks for software walks need to be SMEP and
+ * SMAP aware regardless of CR0.WP.
+ */
+union kvm_mmu_extended_role {
+ u32 word;
+ struct {
+ unsigned int valid:1;
+ unsigned int execonly:1;
+ unsigned int cr4_pse:1;
+ unsigned int cr4_pke:1;
+ unsigned int cr4_smap:1;
+ unsigned int cr4_smep:1;
+ unsigned int cr4_la57:1;
+ unsigned int efer_lma:1;
+ };
+};
- u64 *spt;
- /* hold the gfn of each spte inside spt */
- gfn_t *gfns;
- /*
- * One bit set per slot which has memory
- * in this shadow page.
- */
- DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
- int multimapped; /* More than one parent_pte? */
- int root_count; /* Currently serving as active root */
- bool unsync;
- unsigned int unsync_children;
- union {
- u64 *parent_pte; /* !multimapped */
- struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
+union kvm_cpu_role {
+ u64 as_u64;
+ struct {
+ union kvm_mmu_page_role base;
+ union kvm_mmu_extended_role ext;
};
- DECLARE_BITMAP(unsync_child_bitmap, 512);
};
-struct kvm_pv_mmu_op_buffer {
- void *ptr;
- unsigned len;
- unsigned processed;
- char buf[512] __aligned(sizeof(long));
+struct kvm_rmap_head {
+ atomic_long_t val;
};
struct kvm_pio_request {
unsigned long count;
- int cur_count;
- gva_t guest_gva;
int in;
int port;
int size;
- int string;
- int down;
- int rep;
};
+#define PT64_ROOT_MAX_LEVEL 5
+
+struct rsvd_bits_validate {
+ u64 rsvd_bits_mask[2][PT64_ROOT_MAX_LEVEL];
+ u64 bad_mt_xwr;
+};
+
+struct kvm_mmu_root_info {
+ gpa_t pgd;
+ hpa_t hpa;
+};
+
+#define KVM_MMU_ROOT_INFO_INVALID \
+ ((struct kvm_mmu_root_info) { .pgd = INVALID_PAGE, .hpa = INVALID_PAGE })
+
+#define KVM_MMU_NUM_PREV_ROOTS 3
+
+#define KVM_MMU_ROOT_CURRENT BIT(0)
+#define KVM_MMU_ROOT_PREVIOUS(i) BIT(1+i)
+#define KVM_MMU_ROOTS_ALL (BIT(1 + KVM_MMU_NUM_PREV_ROOTS) - 1)
+
+#define KVM_HAVE_MMU_RWLOCK
+
+struct kvm_mmu_page;
+struct kvm_page_fault;
+
/*
- * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level
- * 32-bit). The kvm_mmu structure abstracts the details of the current mmu
- * mode.
+ * x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit,
+ * and 2-level 32-bit). The kvm_mmu structure abstracts the details of the
+ * current mmu mode.
*/
struct kvm_mmu {
- void (*new_cr3)(struct kvm_vcpu *vcpu);
- int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
- void (*free)(struct kvm_vcpu *vcpu);
- gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
- void (*prefetch_page)(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *page);
- int (*sync_page)(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp);
- void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
- hpa_t root_hpa;
- int root_level;
- int shadow_root_level;
- union kvm_mmu_page_role base_role;
+ unsigned long (*get_guest_pgd)(struct kvm_vcpu *vcpu);
+ u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index);
+ int (*page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
+ void (*inject_page_fault)(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault);
+ gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ gpa_t gva_or_gpa, u64 access,
+ struct x86_exception *exception);
+ int (*sync_spte)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, int i);
+ struct kvm_mmu_root_info root;
+ hpa_t mirror_root_hpa;
+ union kvm_cpu_role cpu_role;
+ union kvm_mmu_page_role root_role;
+
+ /*
+ * The pkru_mask indicates if protection key checks are needed. It
+ * consists of 16 domains indexed by page fault error code bits [4:1],
+ * with PFEC.RSVD replaced by ACC_USER_MASK from the page tables.
+ * Each domain has 2 bits which are ANDed with AD and WD from PKRU.
+ */
+ u32 pkru_mask;
+
+ struct kvm_mmu_root_info prev_roots[KVM_MMU_NUM_PREV_ROOTS];
+
+ /*
+ * Bitmap; bit set = permission fault
+ * Byte index: page fault error code [4:1]
+ * Bit index: pte permissions in ACC_* format
+ */
+ u8 permissions[16];
u64 *pae_root;
- u64 rsvd_bits_mask[2][4];
+ u64 *pml4_root;
+ u64 *pml5_root;
+
+ /*
+ * check zero bits on shadow page table entries, these
+ * bits include not only hardware reserved bits but also
+ * the bits spte never used.
+ */
+ struct rsvd_bits_validate shadow_zero_check;
+
+ struct rsvd_bits_validate guest_rsvd_check;
+
+ u64 pdptrs[4]; /* pae */
+};
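Following the comment on the permissions bitmap above, a permission check reduces to a table lookup indexed by the error-code bits and the pte's ACC_* access mask. A rough sketch under that assumption (the kernel's real permission_fault() additionally folds in the pkru_mask handling):

    /* Sketch: a set bit in the table means "this access faults". */
    static bool permission_fault_sketch(struct kvm_mmu *mmu, unsigned int pfec,
    				    unsigned int pte_access)
    {
    	return (mmu->permissions[(pfec >> 1) & 0xf] >> pte_access) & 1;
    }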
+
+enum pmc_type {
+ KVM_PMC_GP = 0,
+ KVM_PMC_FIXED,
+};
+
+struct kvm_pmc {
+ enum pmc_type type;
+ u8 idx;
+ bool is_paused;
+ bool intr;
+ /*
+ * Base value of the PMC counter, relative to the *consumed* count in
+ * the associated perf_event. This value includes counter updates from
+ * the perf_event and emulated_count since the last time the counter
+ * was reprogrammed, but it is *not* the current value as seen by the
+ * guest or userspace.
+ *
+ * The count is relative to the associated perf_event so that KVM
+ * doesn't need to reprogram the perf_event every time the guest writes
+ * to the counter.
+ */
+ u64 counter;
+ /*
+ * PMC events triggered by KVM emulation that haven't been fully
+ * processed, i.e. haven't undergone overflow detection.
+ */
+ u64 emulated_counter;
+ u64 eventsel;
+ struct perf_event *perf_event;
+ struct kvm_vcpu *vcpu;
+ /*
+ * only for creating or reusing perf_event,
+ * eventsel value for general purpose counters,
+ * ctrl value for fixed counters.
+ */
+ u64 current_config;
+};
+
+/* More counters may conflict with other existing Architectural MSRs */
+#define KVM_MAX(a, b) ((a) >= (b) ? (a) : (b))
+#define KVM_MAX_NR_INTEL_GP_COUNTERS 8
+#define KVM_MAX_NR_AMD_GP_COUNTERS 6
+#define KVM_MAX_NR_GP_COUNTERS KVM_MAX(KVM_MAX_NR_INTEL_GP_COUNTERS, \
+ KVM_MAX_NR_AMD_GP_COUNTERS)
+
+#define KVM_MAX_NR_INTEL_FIXED_COUNTERS 3
+#define KVM_MAX_NR_AMD_FIXED_COUNTERS 0
+#define KVM_MAX_NR_FIXED_COUNTERS KVM_MAX(KVM_MAX_NR_INTEL_FIXED_COUNTERS, \
+ KVM_MAX_NR_AMD_FIXED_COUNTERS)
+
+struct kvm_pmu {
+ u8 version;
+ unsigned nr_arch_gp_counters;
+ unsigned nr_arch_fixed_counters;
+ unsigned available_event_types;
+ u64 fixed_ctr_ctrl;
+ u64 fixed_ctr_ctrl_rsvd;
+ u64 global_ctrl;
+ u64 global_status;
+ u64 counter_bitmask[2];
+ u64 global_ctrl_rsvd;
+ u64 global_status_rsvd;
+ u64 reserved_bits;
+ u64 raw_event_mask;
+ struct kvm_pmc gp_counters[KVM_MAX_NR_GP_COUNTERS];
+ struct kvm_pmc fixed_counters[KVM_MAX_NR_FIXED_COUNTERS];
+
+ /*
+ * Overlay the bitmap with a 64-bit atomic so that all bits can be
+ * set in a single access, e.g. to reprogram all counters when the PMU
+ * filter changes.
+ */
+ union {
+ DECLARE_BITMAP(reprogram_pmi, X86_PMC_IDX_MAX);
+ atomic64_t __reprogram_pmi;
+ };
+ DECLARE_BITMAP(all_valid_pmc_idx, X86_PMC_IDX_MAX);
+ DECLARE_BITMAP(pmc_in_use, X86_PMC_IDX_MAX);
+
+ DECLARE_BITMAP(pmc_counting_instructions, X86_PMC_IDX_MAX);
+ DECLARE_BITMAP(pmc_counting_branches, X86_PMC_IDX_MAX);
+
+ u64 ds_area;
+ u64 pebs_enable;
+ u64 pebs_enable_rsvd;
+ u64 pebs_data_cfg;
+ u64 pebs_data_cfg_rsvd;
+
+ /*
+ * If a guest counter is cross-mapped to a host counter with a
+ * different index, its PEBS capability will be temporarily disabled.
+ *
+ * The user should make sure that this mask is updated
+ * after disabling interrupts and before perf_guest_get_msrs();
+ */
+ u64 host_cross_mapped_mask;
+
+ /*
+ * The gate to release perf_events not marked in
+ * pmc_in_use only once in a vcpu time slice.
+ */
+ bool need_cleanup;
+
+ /*
+ * The total number of programmed perf_events; it helps to avoid a
+ * redundant check before cleanup if the guest doesn't use the vPMU at all.
+ */
+ u8 event_count;
+};
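The reprogram_pmi/__reprogram_pmi overlay in struct kvm_pmu above is what makes "reprogram every counter" a single store. A hedged sketch of such a caller (the real call sites live in KVM's PMU code and may differ):

    /* Sketch: flag all counters for reprogramming, then kick the vCPU. */
    static void pmu_mark_all_for_reprogram(struct kvm_vcpu *vcpu, struct kvm_pmu *pmu)
    {
    	atomic64_set(&pmu->__reprogram_pmi, ~0ull);
    	kvm_make_request(KVM_REQ_PMU, vcpu);
    }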
+
+struct kvm_pmu_ops;
+
+enum {
+ KVM_DEBUGREG_BP_ENABLED = BIT(0),
+ KVM_DEBUGREG_WONT_EXIT = BIT(1),
+ /*
+ * Guest debug registers (DR0-3, DR6 and DR7) are saved/restored by
+ * hardware on exit from or entry to the guest. KVM needn't switch them.
+ * DR0-3, DR6 and DR7 are set to their architectural INIT value on VM
+ * exit, so host values need to be restored.
+ */
+ KVM_DEBUGREG_AUTO_SWITCH = BIT(2),
+};
+
+struct kvm_mtrr {
+ u64 var[KVM_NR_VAR_MTRR * 2];
+ u64 fixed_64k;
+ u64 fixed_16k[2];
+ u64 fixed_4k[8];
+ u64 deftype;
+};
+
+/* Hyper-V SynIC timer */
+struct kvm_vcpu_hv_stimer {
+ struct hrtimer timer;
+ int index;
+ union hv_stimer_config config;
+ u64 count;
+ u64 exp_time;
+ struct hv_message msg;
+ bool msg_pending;
+};
+
+/* Hyper-V synthetic interrupt controller (SynIC) */
+struct kvm_vcpu_hv_synic {
+ u64 version;
+ u64 control;
+ u64 msg_page;
+ u64 evt_page;
+ atomic64_t sint[HV_SYNIC_SINT_COUNT];
+ atomic_t sint_to_gsi[HV_SYNIC_SINT_COUNT];
+ DECLARE_BITMAP(auto_eoi_bitmap, 256);
+ DECLARE_BITMAP(vec_bitmap, 256);
+ bool active;
+ bool dont_zero_synic_pages;
+};
+
+/* The maximum number of entries on the TLB flush fifo. */
+#define KVM_HV_TLB_FLUSH_FIFO_SIZE (16)
+/*
+ * Note: the following 'magic' entry is made up by KVM to avoid putting
+ * anything besides GVA on the TLB flush fifo. It is theoretically possible
+ * to observe a request to flush 4095 PFNs starting from 0xfffffffffffff000
+ * which will look identical. KVM's action to 'flush everything' instead of
+ * flushing these particular addresses is, however, fully legitimate as
+ * flushing more than requested is always OK.
+ */
+#define KVM_HV_TLB_FLUSHALL_ENTRY ((u64)-1)
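To see the collision concretely: assuming an entry packs a page-aligned GVA in its upper bits and a page count in its low 12 bits (an encoding assumed here only for the arithmetic), 0xfffffffffffff000 | 0xfff == 0xffffffffffffffff == (u64)-1, i.e. exactly KVM_HV_TLB_FLUSHALL_ENTRY; treating that rare request as "flush everything" merely over-flushes, which is always safe.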
+
+enum hv_tlb_flush_fifos {
+ HV_L1_TLB_FLUSH_FIFO,
+ HV_L2_TLB_FLUSH_FIFO,
+ HV_NR_TLB_FLUSH_FIFOS,
+};
+
+struct kvm_vcpu_hv_tlb_flush_fifo {
+ spinlock_t write_lock;
+ DECLARE_KFIFO(entries, u64, KVM_HV_TLB_FLUSH_FIFO_SIZE);
+};
+
+/* Hyper-V per vcpu emulation context */
+struct kvm_vcpu_hv {
+ struct kvm_vcpu *vcpu;
+ u32 vp_index;
+ u64 hv_vapic;
+ s64 runtime_offset;
+ struct kvm_vcpu_hv_synic synic;
+ struct kvm_hyperv_exit exit;
+ struct kvm_vcpu_hv_stimer stimer[HV_SYNIC_STIMER_COUNT];
+ DECLARE_BITMAP(stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT);
+ bool enforce_cpuid;
+ struct {
+ u32 features_eax; /* HYPERV_CPUID_FEATURES.EAX */
+ u32 features_ebx; /* HYPERV_CPUID_FEATURES.EBX */
+ u32 features_edx; /* HYPERV_CPUID_FEATURES.EDX */
+ u32 enlightenments_eax; /* HYPERV_CPUID_ENLIGHTMENT_INFO.EAX */
+ u32 enlightenments_ebx; /* HYPERV_CPUID_ENLIGHTMENT_INFO.EBX */
+ u32 syndbg_cap_eax; /* HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX */
+ u32 nested_eax; /* HYPERV_CPUID_NESTED_FEATURES.EAX */
+ u32 nested_ebx; /* HYPERV_CPUID_NESTED_FEATURES.EBX */
+ } cpuid_cache;
+
+ struct kvm_vcpu_hv_tlb_flush_fifo tlb_flush_fifo[HV_NR_TLB_FLUSH_FIFOS];
+
+ /*
+ * Preallocated buffers for handling hypercalls that pass sparse vCPU
+ * sets (for high vCPU counts, they're too large to comfortably fit on
+ * the stack).
+ */
+ u64 sparse_banks[HV_MAX_SPARSE_VCPU_BANKS];
+ DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS);
+
+ struct hv_vp_assist_page vp_assist_page;
+
+ struct {
+ u64 pa_page_gpa;
+ u64 vm_id;
+ u32 vp_id;
+ } nested;
+};
+
+struct kvm_hypervisor_cpuid {
+ u32 base;
+ u32 limit;
+};
+
+#ifdef CONFIG_KVM_XEN
+/* Xen HVM per vcpu emulation context */
+struct kvm_vcpu_xen {
+ u64 hypercall_rip;
+ u32 current_runstate;
+ u8 upcall_vector;
+ struct gfn_to_pfn_cache vcpu_info_cache;
+ struct gfn_to_pfn_cache vcpu_time_info_cache;
+ struct gfn_to_pfn_cache runstate_cache;
+ struct gfn_to_pfn_cache runstate2_cache;
+ u64 last_steal;
+ u64 runstate_entry_time;
+ u64 runstate_times[4];
+ unsigned long evtchn_pending_sel;
+ u32 vcpu_id; /* The Xen / ACPI vCPU ID */
+ u32 timer_virq;
+ u64 timer_expires; /* In guest epoch */
+ atomic_t timer_pending;
+ struct hrtimer timer;
+ int poll_evtchn;
+ struct timer_list poll_timer;
+ struct kvm_hypervisor_cpuid cpuid;
+};
+#endif
+
+struct kvm_queued_exception {
+ bool pending;
+ bool injected;
+ bool has_error_code;
+ u8 vector;
+ u32 error_code;
+ unsigned long payload;
+ bool has_payload;
+};
+
+/*
+ * Hardware-defined CPUID leafs that are either scattered by the kernel or are
+ * unknown to the kernel, but need to be directly used by KVM. Note, these
+ * word values conflict with the kernel's "bug" caps, but KVM doesn't use those.
+ */
+enum kvm_only_cpuid_leafs {
+ CPUID_12_EAX = NCAPINTS,
+ CPUID_7_1_EDX,
+ CPUID_8000_0007_EDX,
+ CPUID_8000_0022_EAX,
+ CPUID_7_2_EDX,
+ CPUID_24_0_EBX,
+ CPUID_8000_0021_ECX,
+ CPUID_7_1_ECX,
+ NR_KVM_CPU_CAPS,
+
+ NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS,
};
struct kvm_vcpu_arch {
- u64 host_tsc;
/*
* rip and regs accesses must go through
* kvm_{register,rip}_{read,write} functions.
@@ -275,409 +791,1528 @@ struct kvm_vcpu_arch {
u32 regs_dirty;
unsigned long cr0;
+ unsigned long cr0_guest_owned_bits;
unsigned long cr2;
unsigned long cr3;
unsigned long cr4;
+ unsigned long cr4_guest_owned_bits;
+ unsigned long cr4_guest_rsvd_bits;
unsigned long cr8;
+ u32 host_pkru;
+ u32 pkru;
u32 hflags;
- u64 pdptrs[4]; /* pae */
- u64 shadow_efer;
+ u64 efer;
+ u64 host_debugctl;
u64 apic_base;
struct kvm_lapic *apic; /* kernel irqchip context */
+ bool load_eoi_exitmap_pending;
+ DECLARE_BITMAP(ioapic_handled_vectors, 256);
+ unsigned long apic_attention;
int32_t apic_arb_prio;
int mp_state;
- int sipi_vector;
u64 ia32_misc_enable_msr;
+ u64 smbase;
+ u64 smi_count;
+ bool at_instruction_boundary;
bool tpr_access_reporting;
+ bool xfd_no_write_intercept;
+ u64 microcode_version;
+ u64 arch_capabilities;
+ u64 perf_capabilities;
- struct kvm_mmu mmu;
- /* only needed in kvm_pv_mmu_op() path, but it's hot so
- * put it here to avoid allocation */
- struct kvm_pv_mmu_op_buffer mmu_op_buffer;
+ /*
+ * Paging state of the vcpu
+ *
+ * If the vcpu runs in guest mode with two-level paging, this still saves
+ * the paging mode of the l1 guest. This context is always used to
+ * handle faults.
+ */
+ struct kvm_mmu *mmu;
- struct kvm_mmu_memory_cache mmu_pte_chain_cache;
- struct kvm_mmu_memory_cache mmu_rmap_desc_cache;
- struct kvm_mmu_memory_cache mmu_page_cache;
- struct kvm_mmu_memory_cache mmu_page_header_cache;
+ /* Non-nested MMU for L1 */
+ struct kvm_mmu root_mmu;
- gfn_t last_pt_write_gfn;
- int last_pt_write_count;
- u64 *last_pte_updated;
- gfn_t last_pte_gfn;
+ /* L1 MMU when running nested */
+ struct kvm_mmu guest_mmu;
- struct {
- gfn_t gfn; /* presumed gfn during guest pte update */
- pfn_t pfn; /* pfn corresponding to that gfn */
- int largepage;
- unsigned long mmu_seq;
- } update_pte;
+ /*
+ * Paging state of an L2 guest (used for nested npt)
+ *
+ * This context will save all necessary information to walk page tables
+ * of an L2 guest. This context is only initialized for page table
+ * walking and not for faulting since we never handle l2 page faults on
+ * the host.
+ */
+ struct kvm_mmu nested_mmu;
+
+ /*
+ * Pointer to the mmu context currently used for
+ * gva_to_gpa translations.
+ */
+ struct kvm_mmu *walk_mmu;
+
+ struct kvm_mmu_memory_cache mmu_pte_list_desc_cache;
+ struct kvm_mmu_memory_cache mmu_shadow_page_cache;
+ struct kvm_mmu_memory_cache mmu_shadowed_info_cache;
+ struct kvm_mmu_memory_cache mmu_page_header_cache;
+ /*
+ * This cache is used to allocate external page tables, e.g. the private
+ * EPT used by the TDX module.
+ */
+ struct kvm_mmu_memory_cache mmu_external_spt_cache;
+
+ /*
+ * QEMU userspace and the guest each have their own FPU state.
+ * In vcpu_run, we switch between the user and guest FPU contexts.
+ * While running a VCPU, the VCPU thread will have the guest FPU
+ * context.
+ *
+ * Note that while the PKRU state lives inside the fpu registers,
+ * it is switched out separately at VMENTER and VMEXIT time. The
+ * "guest_fpstate" state here contains the guest FPU context, with the
+ * host PKRU bits.
+ */
+ struct fpu_guest guest_fpu;
- struct i387_fxsave_struct host_fx_image;
- struct i387_fxsave_struct guest_fx_image;
+ u64 xcr0;
+ u64 guest_supported_xcr0;
+ u64 ia32_xss;
+ u64 guest_supported_xss;
- gva_t mmio_fault_cr2;
struct kvm_pio_request pio;
void *pio_data;
+ void *sev_pio_data;
+ unsigned sev_pio_count;
u8 event_exit_inst_len;
- struct kvm_queued_exception {
- bool pending;
- bool has_error_code;
- u8 nr;
- u32 error_code;
- } exception;
+ bool exception_from_userspace;
+
+ /* Exceptions to be injected to the guest. */
+ struct kvm_queued_exception exception;
+ /* Exception VM-Exits to be synthesized to L1. */
+ struct kvm_queued_exception exception_vmexit;
struct kvm_queued_interrupt {
- bool pending;
+ bool injected;
bool soft;
u8 nr;
} interrupt;
- struct {
- int vm86_active;
- u8 save_iopl;
- struct kvm_save_segment {
- u16 selector;
- unsigned long base;
- u32 limit;
- u32 ar;
- } tr, es, ds, fs, gs;
- } rmode;
int halt_request; /* real mode on Intel only */
int cpuid_nent;
- struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES];
+ struct kvm_cpuid_entry2 *cpuid_entries;
+ bool cpuid_dynamic_bits_dirty;
+ bool is_amd_compatible;
+
+ /*
+ * cpu_caps holds the effective guest capabilities, i.e. the features
+ * the vCPU is allowed to use. Typically, but not always, features can
+ * be used by the guest if and only if both KVM and userspace want to
+ * expose the feature to the guest.
+ *
+ * A common exception is for virtualization holes, i.e. when KVM can't
+ * prevent the guest from using a feature, in which case the vCPU "has"
+ * the feature regardless of what KVM or userspace desires.
+ *
+ * Note, features that don't require KVM involvement in any way are
+ * NOT enforced/sanitized by KVM, i.e. are taken verbatim from the
+ * guest CPUID provided by userspace.
+ */
+ u32 cpu_caps[NR_KVM_CPU_CAPS];
+
+ u64 reserved_gpa_bits;
+ int maxphyaddr;
+
/* emulate context */
- struct x86_emulate_ctxt emulate_ctxt;
+ struct x86_emulate_ctxt *emulate_ctxt;
+ bool emulate_regs_need_sync_to_vcpu;
+ bool emulate_regs_need_sync_from_vcpu;
+ int (*complete_userspace_io)(struct kvm_vcpu *vcpu);
+ unsigned long cui_linear_rip;
+ int cui_rdmsr_imm_reg;
gpa_t time;
- struct pvclock_vcpu_time_info hv_clock;
- unsigned int hv_clock_tsc_khz;
- unsigned int time_offset;
- struct page *time_page;
-
- bool singlestep; /* guest is single stepped by KVM */
- bool nmi_pending;
- bool nmi_injected;
-
- struct mtrr_state_type mtrr_state;
- u32 pat;
-
- int switch_db_regs;
- unsigned long host_db[KVM_NR_DB_REGS];
- unsigned long host_dr6;
- unsigned long host_dr7;
+ s8 pvclock_tsc_shift;
+ u32 pvclock_tsc_mul;
+ unsigned int hw_tsc_khz;
+ struct gfn_to_pfn_cache pv_time;
+ /* set guest stopped flag in pvclock flags field */
+ bool pvclock_set_guest_stopped_request;
+
+ struct {
+ u8 preempted;
+ u64 msr_val;
+ u64 last_steal;
+ struct gfn_to_hva_cache cache;
+ } st;
+
+ u64 l1_tsc_offset;
+ u64 tsc_offset; /* current tsc offset */
+ u64 last_guest_tsc;
+ u64 last_host_tsc;
+ u64 tsc_offset_adjustment;
+ u64 this_tsc_nsec;
+ u64 this_tsc_write;
+ u64 this_tsc_generation;
+ bool tsc_catchup;
+ bool tsc_always_catchup;
+ s8 virtual_tsc_shift;
+ u32 virtual_tsc_mult;
+ u32 virtual_tsc_khz;
+ s64 ia32_tsc_adjust_msr;
+ u64 msr_ia32_power_ctl;
+ u64 l1_tsc_scaling_ratio;
+ u64 tsc_scaling_ratio; /* current scaling ratio */
+
+ atomic_t nmi_queued; /* unprocessed asynchronous NMIs */
+ /* Number of NMIs pending injection, not including hardware vNMIs. */
+ unsigned int nmi_pending;
+ bool nmi_injected; /* Trying to inject an NMI this entry */
+ bool smi_pending; /* SMI queued after currently running handler */
+ u8 handling_intr_from_guest;
+
+ struct kvm_mtrr mtrr_state;
+ u64 pat;
+
+ unsigned switch_db_regs;
unsigned long db[KVM_NR_DB_REGS];
unsigned long dr6;
unsigned long dr7;
unsigned long eff_db[KVM_NR_DB_REGS];
+ unsigned long guest_debug_dr7;
+ u64 msr_platform_info;
+ u64 msr_misc_features_enables;
+
+ u64 mcg_cap;
+ u64 mcg_status;
+ u64 mcg_ctl;
+ u64 mcg_ext_ctl;
+ u64 *mce_banks;
+ u64 *mci_ctl2_banks;
+
+ /* Cache MMIO info */
+ u64 mmio_gva;
+ unsigned mmio_access;
+ gfn_t mmio_gfn;
+ u64 mmio_gen;
+
+ struct kvm_pmu pmu;
+
+ /* used for guest single stepping over the given code position */
+ unsigned long singlestep_rip;
+
+#ifdef CONFIG_KVM_HYPERV
+ bool hyperv_enabled;
+ struct kvm_vcpu_hv *hyperv;
+#endif
+#ifdef CONFIG_KVM_XEN
+ struct kvm_vcpu_xen xen;
+#endif
+ cpumask_var_t wbinvd_dirty_mask;
+
+ unsigned long last_retry_eip;
+ unsigned long last_retry_addr;
+
+ struct {
+ bool halted;
+ gfn_t gfns[ASYNC_PF_PER_VCPU];
+ struct gfn_to_hva_cache data;
+ u64 msr_en_val; /* MSR_KVM_ASYNC_PF_EN */
+ u64 msr_int_val; /* MSR_KVM_ASYNC_PF_INT */
+ u16 vec;
+ u32 id;
+ u32 host_apf_flags;
+ bool send_always;
+ bool delivery_as_pf_vmexit;
+ bool pageready_pending;
+ } apf;
+
+ /* OSVW MSRs (AMD only) */
+ struct {
+ u64 length;
+ u64 status;
+ } osvw;
+
+ struct {
+ u64 msr_val;
+ struct gfn_to_hva_cache data;
+ } pv_eoi;
+
+ u64 msr_kvm_poll_control;
+
+ /* pv related host specific info */
+ struct {
+ bool pv_unhalted;
+ } pv;
+
+ int pending_ioapic_eoi;
+ int pending_external_vector;
+ int highest_stale_pending_ioapic_eoi;
+
+	/* be preempted when it's in kernel mode (cpl=0) */
+ bool preempted_in_kernel;
+
+ /* Host CPU on which VM-entry was most recently attempted */
+ int last_vmentry_cpu;
+
+ /* AMD MSRC001_0015 Hardware Configuration */
+ u64 msr_hwcr;
+
+ /* pv related cpuid info */
+ struct {
+ /*
+ * value of the eax register in the KVM_CPUID_FEATURES CPUID
+ * leaf.
+ */
+ u32 features;
+
+ /*
+ * indicates whether pv emulation should be disabled if features
+ * are not present in the guest's cpuid
+ */
+ bool enforce;
+ } pv_cpuid;
+
+ /* Protected Guests */
+ bool guest_state_protected;
+ bool guest_tsc_protected;
+
+ /*
+ * Set when PDPTS were loaded directly by the userspace without
+ * reading the guest memory
+ */
+ bool pdptrs_from_userspace;
+
+#if IS_ENABLED(CONFIG_HYPERV)
+ hpa_t hv_root_tdp;
+#endif
};
-struct kvm_mem_alias {
- gfn_t base_gfn;
- unsigned long npages;
- gfn_t target_gfn;
+struct kvm_lpage_info {
+ int disallow_lpage;
};
-struct kvm_arch{
- int naliases;
- struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
+struct kvm_arch_memory_slot {
+ struct kvm_rmap_head *rmap[KVM_NR_PAGE_SIZES];
+ struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
+ unsigned short *gfn_write_track;
+};
+
+/*
+ * Track the mode of the optimized logical map, as the rules for decoding the
+ * destination vary per mode. Enabling the optimized logical map requires all
+ * software-enabled local APICs to be in the same mode, each addressable APIC to
+ * be mapped to only one MDA, and each MDA to map to at most one APIC.
+ */
+enum kvm_apic_logical_mode {
+ /* All local APICs are software disabled. */
+ KVM_APIC_MODE_SW_DISABLED,
+ /* All software enabled local APICs in xAPIC cluster addressing mode. */
+ KVM_APIC_MODE_XAPIC_CLUSTER,
+ /* All software enabled local APICs in xAPIC flat addressing mode. */
+ KVM_APIC_MODE_XAPIC_FLAT,
+ /* All software enabled local APICs in x2APIC mode. */
+ KVM_APIC_MODE_X2APIC,
+ /*
+ * Optimized map disabled, e.g. not all local APICs in the same logical
+ * mode, same logical ID assigned to multiple APICs, etc.
+ */
+ KVM_APIC_MODE_MAP_DISABLED,
+};
+
+struct kvm_apic_map {
+ struct rcu_head rcu;
+ enum kvm_apic_logical_mode logical_mode;
+ u32 max_apic_id;
+ union {
+ struct kvm_lapic *xapic_flat_map[8];
+ struct kvm_lapic *xapic_cluster_map[16][4];
+ };
+ struct kvm_lapic *phys_map[];
+};
+
+/* Hyper-V synthetic debugger (SynDbg) */
+struct kvm_hv_syndbg {
+ struct {
+ u64 control;
+ u64 status;
+ u64 send_page;
+ u64 recv_page;
+ u64 pending_page;
+ } control;
+ u64 options;
+};
+
+/* Current state of Hyper-V TSC page clocksource */
+enum hv_tsc_page_status {
+ /* TSC page was not set up or disabled */
+ HV_TSC_PAGE_UNSET = 0,
+ /* TSC page MSR was written by the guest, update pending */
+ HV_TSC_PAGE_GUEST_CHANGED,
+ /* TSC page update was triggered from the host side */
+ HV_TSC_PAGE_HOST_CHANGED,
+ /* TSC page was properly set up and is currently active */
+ HV_TSC_PAGE_SET,
+ /* TSC page was set up with an inaccessible GPA */
+ HV_TSC_PAGE_BROKEN,
+};
+
+#ifdef CONFIG_KVM_HYPERV
+/* Hyper-V emulation context */
+struct kvm_hv {
+ struct mutex hv_lock;
+ u64 hv_guest_os_id;
+ u64 hv_hypercall;
+ u64 hv_tsc_page;
+ enum hv_tsc_page_status hv_tsc_page_status;
+
+ /* Hyper-v based guest crash (NT kernel bugcheck) parameters */
+ u64 hv_crash_param[HV_X64_MSR_CRASH_PARAMS];
+ u64 hv_crash_ctl;
+
+ struct ms_hyperv_tsc_page tsc_ref;
+
+ struct idr conn_to_evt;
+
+ u64 hv_reenlightenment_control;
+ u64 hv_tsc_emulation_control;
+ u64 hv_tsc_emulation_status;
+ u64 hv_invtsc_control;
+
+ /* How many vCPUs have VP index != vCPU index */
+ atomic_t num_mismatched_vp_indexes;
+
+ /*
+ * How many SynICs use 'AutoEOI' feature
+ * (protected by arch.apicv_update_lock)
+ */
+ unsigned int synic_auto_eoi_used;
+
+ struct kvm_hv_syndbg hv_syndbg;
+
+ bool xsaves_xsavec_checked;
+};
+#endif
+
+struct msr_bitmap_range {
+ u32 flags;
+ u32 nmsrs;
+ u32 base;
+ unsigned long *bitmap;
+};
+
+#ifdef CONFIG_KVM_XEN
+/* Xen emulation context */
+struct kvm_xen {
+ struct mutex xen_lock;
+ u32 xen_version;
+ bool long_mode;
+ bool runstate_update_flag;
+ u8 upcall_vector;
+ struct gfn_to_pfn_cache shinfo_cache;
+ struct idr evtchn_ports;
+ unsigned long poll_mask[BITS_TO_LONGS(KVM_MAX_VCPUS)];
+
+ struct kvm_xen_hvm_config hvm_config;
+};
+#endif
+
+enum kvm_irqchip_mode {
+ KVM_IRQCHIP_NONE,
+ KVM_IRQCHIP_KERNEL, /* created with KVM_CREATE_IRQCHIP */
+ KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */
+};
+
+struct kvm_x86_msr_filter {
+ u8 count;
+ bool default_allow:1;
+ struct msr_bitmap_range ranges[16];
+};
+
+struct kvm_x86_pmu_event_filter {
+ __u32 action;
+ __u32 nevents;
+ __u32 fixed_counter_bitmap;
+ __u32 flags;
+ __u32 nr_includes;
+ __u32 nr_excludes;
+ __u64 *includes;
+ __u64 *excludes;
+ __u64 events[];
+};
+
+enum kvm_apicv_inhibit {
+
+ /********************************************************************/
+ /* INHIBITs that are relevant to both Intel's APICv and AMD's AVIC. */
+ /********************************************************************/
+
+ /*
+ * APIC acceleration is disabled by a module parameter
+ * and/or not supported in hardware.
+ */
+ APICV_INHIBIT_REASON_DISABLED,
+
+ /*
+ * APIC acceleration is inhibited because AutoEOI feature is
+ * being used by a HyperV guest.
+ */
+ APICV_INHIBIT_REASON_HYPERV,
+
+ /*
+ * APIC acceleration is inhibited because the userspace didn't yet
+ * enable the kernel/split irqchip.
+ */
+ APICV_INHIBIT_REASON_ABSENT,
+
+	/*
+	 * APIC acceleration is inhibited because KVM_GUESTDBG_BLOCKIRQ (an
+	 * out-of-band debug measure that blocks all interrupts on this vCPU)
+	 * was enabled, to avoid AVIC/APICv bypassing it.
+	 */
+ APICV_INHIBIT_REASON_BLOCKIRQ,
+
+ /*
+ * APICv is disabled because not all vCPUs have a 1:1 mapping between
+ * APIC ID and vCPU, _and_ KVM is not applying its x2APIC hotplug hack.
+ */
+ APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED,
+
+ /*
+	 * For simplicity, APIC acceleration is inhibited the first time either
+	 * the APIC ID or the APIC base is changed by the guest from its reset
+	 * value.
+ */
+ APICV_INHIBIT_REASON_APIC_ID_MODIFIED,
+ APICV_INHIBIT_REASON_APIC_BASE_MODIFIED,
+
+ /******************************************************/
+ /* INHIBITs that are relevant only to the AMD's AVIC. */
+ /******************************************************/
+
+ /*
+ * AVIC is inhibited on a vCPU because it runs a nested guest.
+ *
+ * This is needed because unlike APICv, the peers of this vCPU
+ * cannot use the doorbell mechanism to signal interrupts via AVIC when
+ * a vCPU runs nested.
+ */
+ APICV_INHIBIT_REASON_NESTED,
+
+ /*
+	 * On SVM, waiting for the IRQ window is implemented with a pending
+	 * vIRQ, which cannot be injected while AVIC is enabled; thus AVIC is
+	 * inhibited while KVM waits for the IRQ window.
+ */
+ APICV_INHIBIT_REASON_IRQWIN,
+
+ /*
+ * PIT (i8254) 're-inject' mode, relies on EOI intercept,
+ * which AVIC doesn't support for edge triggered interrupts.
+ */
+ APICV_INHIBIT_REASON_PIT_REINJ,
+
+ /*
+ * AVIC is disabled because SEV doesn't support it.
+ */
+ APICV_INHIBIT_REASON_SEV,
+
+ /*
+ * AVIC is disabled because not all vCPUs with a valid LDR have a 1:1
+ * mapping between logical ID and vCPU.
+ */
+ APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED,
- unsigned int n_free_mmu_pages;
- unsigned int n_requested_mmu_pages;
- unsigned int n_alloc_mmu_pages;
- struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
/*
- * Hash table of struct kvm_mmu_page.
+ * AVIC is disabled because the vCPU's APIC ID is beyond the max
+ * supported by AVIC/x2AVIC, i.e. the vCPU is unaddressable.
*/
+ APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG,
+
+ NR_APICV_INHIBIT_REASONS,
+};
+
+#define __APICV_INHIBIT_REASON(reason) \
+ { BIT(APICV_INHIBIT_REASON_##reason), #reason }
+
+#define APICV_INHIBIT_REASONS \
+ __APICV_INHIBIT_REASON(DISABLED), \
+ __APICV_INHIBIT_REASON(HYPERV), \
+ __APICV_INHIBIT_REASON(ABSENT), \
+ __APICV_INHIBIT_REASON(BLOCKIRQ), \
+ __APICV_INHIBIT_REASON(PHYSICAL_ID_ALIASED), \
+ __APICV_INHIBIT_REASON(APIC_ID_MODIFIED), \
+ __APICV_INHIBIT_REASON(APIC_BASE_MODIFIED), \
+ __APICV_INHIBIT_REASON(NESTED), \
+ __APICV_INHIBIT_REASON(IRQWIN), \
+ __APICV_INHIBIT_REASON(PIT_REINJ), \
+ __APICV_INHIBIT_REASON(SEV), \
+ __APICV_INHIBIT_REASON(LOGICAL_ID_ALIASED), \
+ __APICV_INHIBIT_REASON(PHYSICAL_ID_TOO_BIG)
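The { BIT(reason), "name" } pairs generated by __APICV_INHIBIT_REASON() have the shape of the kernel's struct trace_print_flags, so the table can be iterated or fed to trace formatting. A minimal sketch, assuming <linux/tracepoint-defs.h> and standard helpers (ARRAY_SIZE, pr_info); the helper name is invented for illustration:

/*
 * Sketch only: decode an inhibit bitmask using the { mask, name } pairs
 * expanded from APICV_INHIBIT_REASONS.
 */
static const struct trace_print_flags apicv_inhibit_names[] = {
	APICV_INHIBIT_REASONS
};

static void dump_apicv_inhibits(unsigned long inhibits)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(apicv_inhibit_names); i++)
		if (inhibits & apicv_inhibit_names[i].mask)
			pr_info("APICv inhibited: %s\n",
				apicv_inhibit_names[i].name);
}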
+
+struct kvm_possible_nx_huge_pages {
+ /*
+ * A list of kvm_mmu_page structs that, if zapped, could possibly be
+ * replaced by an NX huge page. A shadow page is on this list if its
+ * existence disallows an NX huge page (nx_huge_page_disallowed is set)
+ * and there are no other conditions that prevent a huge page, e.g.
+	 * the backing host page is huge, dirty logging is not enabled for its
+ * memslot, etc... Note, zapping shadow pages on this list doesn't
+ * guarantee an NX huge page will be created in its stead, e.g. if the
+ * guest attempts to execute from the region then KVM obviously can't
+ * create an NX huge page (without hanging the guest).
+ */
+ struct list_head pages;
+ u64 nr_pages;
+};
+
+enum kvm_mmu_type {
+ KVM_SHADOW_MMU,
+#ifdef CONFIG_X86_64
+ KVM_TDP_MMU,
+#endif
+ KVM_NR_MMU_TYPES,
+};
+
+struct kvm_arch {
+ unsigned long n_used_mmu_pages;
+ unsigned long n_requested_mmu_pages;
+ unsigned long n_max_mmu_pages;
+ unsigned int indirect_shadow_pages;
+ u8 mmu_valid_gen;
+ u8 vm_type;
+ bool has_private_mem;
+ bool has_protected_state;
+ bool has_protected_eoi;
+ bool pre_fault_allowed;
+ struct hlist_head *mmu_page_hash;
struct list_head active_mmu_pages;
- struct list_head assigned_dev_head;
- struct iommu_domain *iommu_domain;
- int iommu_flags;
+ struct kvm_possible_nx_huge_pages possible_nx_huge_pages[KVM_NR_MMU_TYPES];
+#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
+ struct kvm_page_track_notifier_head track_notifier_head;
+#endif
+ /*
+ * Protects marking pages unsync during page faults, as TDP MMU page
+ * faults only take mmu_lock for read. For simplicity, the unsync
+ * pages lock is always taken when marking pages unsync regardless of
+ * whether mmu_lock is held for read or write.
+ */
+ spinlock_t mmu_unsync_pages_lock;
+
+ u64 shadow_mmio_value;
+
+#define __KVM_HAVE_ARCH_NONCOHERENT_DMA
+ atomic_t noncoherent_dma_count;
+ unsigned long nr_possible_bypass_irqs;
+
+#ifdef CONFIG_KVM_IOAPIC
struct kvm_pic *vpic;
struct kvm_ioapic *vioapic;
struct kvm_pit *vpit;
- struct hlist_head irq_ack_notifier_list;
- int vapics_in_nmi_mode;
+#endif
+ atomic_t vapics_in_nmi_mode;
+ struct mutex apic_map_lock;
+ struct kvm_apic_map __rcu *apic_map;
+ atomic_t apic_map_dirty;
- unsigned int tss_addr;
- struct page *apic_access_page;
+ bool apic_access_memslot_enabled;
+ bool apic_access_memslot_inhibited;
+
+ /* Protects apicv_inhibit_reasons */
+ struct rw_semaphore apicv_update_lock;
+ unsigned long apicv_inhibit_reasons;
gpa_t wall_clock;
- struct page *ept_identity_pagetable;
- bool ept_identity_pagetable_done;
+ u64 disabled_exits;
+
+ s64 kvmclock_offset;
+
+ /*
+ * This also protects nr_vcpus_matched_tsc which is read from a
+ * preemption-disabled region, so it must be a raw spinlock.
+ */
+ raw_spinlock_t tsc_write_lock;
+ u64 last_tsc_nsec;
+ u64 last_tsc_write;
+ u32 last_tsc_khz;
+ u64 last_tsc_offset;
+ u64 cur_tsc_nsec;
+ u64 cur_tsc_write;
+ u64 cur_tsc_offset;
+ u64 cur_tsc_generation;
+ int nr_vcpus_matched_tsc;
+
+ u32 default_tsc_khz;
+ bool user_set_tsc;
+ u64 apic_bus_cycle_ns;
+
+ seqcount_raw_spinlock_t pvclock_sc;
+ bool use_master_clock;
+ u64 master_kernel_ns;
+ u64 master_cycle_now;
+
+#ifdef CONFIG_KVM_HYPERV
+ struct kvm_hv hyperv;
+#endif
+
+#ifdef CONFIG_KVM_XEN
+ struct kvm_xen xen;
+#endif
+
+ bool backwards_tsc_observed;
+ bool boot_vcpu_runs_old_kvmclock;
+ u32 bsp_vcpu_id;
+
+ u64 disabled_quirks;
+
+ enum kvm_irqchip_mode irqchip_mode;
+ u8 nr_reserved_ioapic_pins;
+
+ bool disabled_lapic_found;
+
+ bool x2apic_format;
+ bool x2apic_broadcast_quirk_disabled;
+
+ bool has_mapped_host_mmio;
+ bool guest_can_read_msr_platform_info;
+ bool exception_payload_enabled;
+
+ bool triple_fault_event;
+
+ bool bus_lock_detection_enabled;
+ bool enable_pmu;
+
+ u32 notify_window;
+ u32 notify_vmexit_flags;
+ /*
+ * If exit_on_emulation_error is set, and the in-kernel instruction
+ * emulator fails to emulate an instruction, allow userspace
+ * the opportunity to look at it.
+ */
+ bool exit_on_emulation_error;
+
+ /* Deflect RDMSR and WRMSR to user space when they trigger a #GP */
+ u32 user_space_msr_mask;
+ struct kvm_x86_msr_filter __rcu *msr_filter;
+
+ u32 hypercall_exit_enabled;
+
+ /* Guest can access the SGX PROVISIONKEY. */
+ bool sgx_provisioning_allowed;
- unsigned long irq_sources_bitmap;
- unsigned long irq_states[KVM_IOAPIC_NUM_PINS];
- u64 vm_init_tsc;
+ struct kvm_x86_pmu_event_filter __rcu *pmu_event_filter;
+ struct vhost_task *nx_huge_page_recovery_thread;
+ u64 nx_huge_page_last;
+ struct once nx_once;
+
+#ifdef CONFIG_X86_64
+#ifdef CONFIG_KVM_PROVE_MMU
+ /*
+ * The number of TDP MMU pages across all roots. Used only to sanity
+ * check that KVM isn't leaking TDP MMU pages.
+ */
+ atomic64_t tdp_mmu_pages;
+#endif
+
+ /*
+ * List of struct kvm_mmu_pages being used as roots.
+ * All struct kvm_mmu_pages in the list should have
+ * tdp_mmu_page set.
+ *
+ * For reads, this list is protected by:
+ * RCU alone or
+ * the MMU lock in read mode + RCU or
+ * the MMU lock in write mode
+ *
+ * For writes, this list is protected by tdp_mmu_pages_lock; see
+ * below for the details.
+ *
+ * Roots will remain in the list until their tdp_mmu_root_count
+ * drops to zero, at which point the thread that decremented the
+	 * count to zero should remove the root from the list and clean
+ * it up, freeing the root after an RCU grace period.
+ */
+ struct list_head tdp_mmu_roots;
+
+ /*
+ * Protects accesses to the following fields when the MMU lock
+ * is held in read mode:
+ * - tdp_mmu_roots (above)
+ * - the link field of kvm_mmu_page structs used by the TDP MMU
+ * - possible_nx_huge_pages[KVM_TDP_MMU];
+ * - the possible_nx_huge_page_link field of kvm_mmu_page structs used
+ * by the TDP MMU
+ * Because the lock is only taken within the MMU lock, strictly
+ * speaking it is redundant to acquire this lock when the thread
+ * holds the MMU lock in write mode. However it often simplifies
+ * the code to do so.
+ */
+ spinlock_t tdp_mmu_pages_lock;
+#endif /* CONFIG_X86_64 */
+
+ /*
+ * If set, at least one shadow root has been allocated. This flag
+ * is used as one input when determining whether certain memslot
+ * related allocations are necessary.
+ */
+ bool shadow_root_allocated;
+
+#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
+ /*
+ * If set, the VM has (or had) an external write tracking user, and
+ * thus all write tracking metadata has been allocated, even if KVM
+ * itself isn't using write tracking.
+ */
+ bool external_write_tracking_enabled;
+#endif
+
+#if IS_ENABLED(CONFIG_HYPERV)
+ hpa_t hv_root_tdp;
+ spinlock_t hv_root_tdp_lock;
+ struct hv_partition_assist_pg *hv_pa_pg;
+#endif
+ /*
+ * VM-scope maximum vCPU ID. Used to determine the size of structures
+	 * that grow with the maximum vCPU ID, for which using the global
+	 * KVM_MAX_VCPU_IDS could waste a significant amount of memory.
+ */
+ u32 max_vcpu_ids;
+
+ bool disable_nx_huge_pages;
+
+ /*
+ * Memory caches used to allocate shadow pages when performing eager
+ * page splitting. No need for a shadowed_info_cache since eager page
+ * splitting only allocates direct shadow pages.
+ *
+ * Protected by kvm->slots_lock.
+ */
+ struct kvm_mmu_memory_cache split_shadow_page_cache;
+ struct kvm_mmu_memory_cache split_page_header_cache;
+
+ /*
+ * Memory cache used to allocate pte_list_desc structs while splitting
+ * huge pages. In the worst case, to split one huge page, 512
+ * pte_list_desc structs are needed to add each lower level leaf sptep
+ * to the rmap plus 1 to extend the parent_ptes rmap of the lower level
+ * page table.
+ *
+ * Protected by kvm->slots_lock.
+ */
+#define SPLIT_DESC_CACHE_MIN_NR_OBJECTS (SPTE_ENT_PER_PAGE + 1)
+ struct kvm_mmu_memory_cache split_desc_cache;
+
+ gfn_t gfn_direct_bits;
+
+ /*
+	 * Size of the CPU's dirty log buffer, i.e. VMX's PML buffer.  A zero
+	 * value indicates CPU dirty logging is unsupported or disabled for the
+	 * current VM.
+ */
+ int cpu_dirty_log_size;
};
struct kvm_vm_stat {
- u32 mmu_shadow_zapped;
- u32 mmu_pte_write;
- u32 mmu_pte_updated;
- u32 mmu_pde_zapped;
- u32 mmu_flooded;
- u32 mmu_recycled;
- u32 mmu_cache_miss;
- u32 mmu_unsync;
- u32 remote_tlb_flush;
- u32 lpages;
+ struct kvm_vm_stat_generic generic;
+ u64 mmu_shadow_zapped;
+ u64 mmu_pte_write;
+ u64 mmu_pde_zapped;
+ u64 mmu_flooded;
+ u64 mmu_recycled;
+ u64 mmu_cache_miss;
+ u64 mmu_unsync;
+ union {
+ struct {
+ atomic64_t pages_4k;
+ atomic64_t pages_2m;
+ atomic64_t pages_1g;
+ };
+ atomic64_t pages[KVM_NR_PAGE_SIZES];
+ };
+ u64 nx_lpage_splits;
+ u64 max_mmu_page_hash_collisions;
+ u64 max_mmu_rmap_size;
};
struct kvm_vcpu_stat {
- u32 pf_fixed;
- u32 pf_guest;
- u32 tlb_flush;
- u32 invlpg;
-
- u32 exits;
- u32 io_exits;
- u32 mmio_exits;
- u32 signal_exits;
- u32 irq_window_exits;
- u32 nmi_window_exits;
- u32 halt_exits;
- u32 halt_wakeup;
- u32 request_irq_exits;
- u32 irq_exits;
- u32 host_state_reload;
- u32 efer_reload;
- u32 fpu_reload;
- u32 insn_emulation;
- u32 insn_emulation_fail;
- u32 hypercalls;
- u32 irq_injections;
- u32 nmi_injections;
-};
-
-struct descriptor_table {
- u16 limit;
- unsigned long base;
-} __attribute__((packed));
+ struct kvm_vcpu_stat_generic generic;
+ u64 pf_taken;
+ u64 pf_fixed;
+ u64 pf_emulate;
+ u64 pf_spurious;
+ u64 pf_fast;
+ u64 pf_mmio_spte_created;
+ u64 pf_guest;
+ u64 tlb_flush;
+ u64 invlpg;
+
+ u64 exits;
+ u64 io_exits;
+ u64 mmio_exits;
+ u64 signal_exits;
+ u64 irq_window_exits;
+ u64 nmi_window_exits;
+ u64 l1d_flush;
+ u64 halt_exits;
+ u64 request_irq_exits;
+ u64 irq_exits;
+ u64 host_state_reload;
+ u64 fpu_reload;
+ u64 insn_emulation;
+ u64 insn_emulation_fail;
+ u64 hypercalls;
+ u64 irq_injections;
+ u64 nmi_injections;
+ u64 req_event;
+ u64 nested_run;
+ u64 directed_yield_attempted;
+ u64 directed_yield_successful;
+ u64 preemption_reported;
+ u64 preemption_other;
+ u64 guest_mode;
+ u64 notify_window_exits;
+};
+
+struct x86_instruction_info;
+
+struct msr_data {
+ bool host_initiated;
+ u32 index;
+ u64 data;
+};
+
+struct kvm_lapic_irq {
+ u32 vector;
+ u16 delivery_mode;
+ u16 dest_mode;
+ bool level;
+ u16 trig_mode;
+ u32 shorthand;
+ u32 dest_id;
+ bool msi_redir_hint;
+};
+
+static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical)
+{
+ return dest_mode_logical ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL;
+}
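As a hedged sketch of how this helper and struct kvm_lapic_irq fit together, the following fills in a fixed, edge-triggered interrupt to one physical APIC ID. APIC_DM_FIXED and APIC_DEST_NOSHORT are assumed to come from the usual x86 APIC definitions, and delivery of the populated irq is not shown.

/*
 * Illustration only: populate a struct kvm_lapic_irq for a fixed-delivery,
 * edge-triggered interrupt aimed at a single physical APIC ID.
 */
static void fill_fixed_irq(struct kvm_lapic_irq *irq, u32 apic_id, u32 vector)
{
	memset(irq, 0, sizeof(*irq));
	irq->vector = vector;
	irq->delivery_mode = APIC_DM_FIXED;
	irq->dest_mode = kvm_lapic_irq_dest_mode(false);	/* physical */
	irq->trig_mode = 0;					/* edge */
	irq->shorthand = APIC_DEST_NOSHORT;
	irq->dest_id = apic_id;
}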
+
+enum kvm_x86_run_flags {
+ KVM_RUN_FORCE_IMMEDIATE_EXIT = BIT(0),
+ KVM_RUN_LOAD_GUEST_DR6 = BIT(1),
+ KVM_RUN_LOAD_DEBUGCTL = BIT(2),
+};
struct kvm_x86_ops {
- int (*cpu_has_kvm_support)(void); /* __init */
- int (*disabled_by_bios)(void); /* __init */
- void (*hardware_enable)(void *dummy); /* __init */
- void (*hardware_disable)(void *dummy);
- void (*check_processor_compatibility)(void *rtn);
- int (*hardware_setup)(void); /* __init */
- void (*hardware_unsetup)(void); /* __exit */
- bool (*cpu_has_accelerated_tpr)(void);
+ const char *name;
+
+ int (*check_processor_compatibility)(void);
+
+ int (*enable_virtualization_cpu)(void);
+ void (*disable_virtualization_cpu)(void);
+ cpu_emergency_virt_cb *emergency_disable_virtualization_cpu;
+
+ void (*hardware_unsetup)(void);
+ bool (*has_emulated_msr)(struct kvm *kvm, u32 index);
+ void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu);
+
+ unsigned int vm_size;
+ int (*vm_init)(struct kvm *kvm);
+ void (*vm_destroy)(struct kvm *kvm);
+ void (*vm_pre_destroy)(struct kvm *kvm);
/* Create, but do not attach this VCPU */
- struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
+ int (*vcpu_precreate)(struct kvm *kvm);
+ int (*vcpu_create)(struct kvm_vcpu *vcpu);
void (*vcpu_free)(struct kvm_vcpu *vcpu);
- int (*vcpu_reset)(struct kvm_vcpu *vcpu);
+ void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event);
- void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
+ void (*prepare_switch_to_guest)(struct kvm_vcpu *vcpu);
void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
void (*vcpu_put)(struct kvm_vcpu *vcpu);
- int (*set_guest_debug)(struct kvm_vcpu *vcpu,
- struct kvm_guest_debug *dbg);
- int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
- int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
+ /*
+ * Mask of DEBUGCTL bits that are owned by the host, i.e. that need to
+ * match the host's value even while the guest is active.
+ */
+ const u64 HOST_OWNED_DEBUGCTL;
+
+ void (*update_exception_bitmap)(struct kvm_vcpu *vcpu);
+ int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
+ int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
void (*get_segment)(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
int (*get_cpl)(struct kvm_vcpu *vcpu);
+ int (*get_cpl_no_cache)(struct kvm_vcpu *vcpu);
void (*set_segment)(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
- void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
+ bool (*is_valid_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
- void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
+ void (*post_set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
+ bool (*is_valid_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
- void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
- void (*get_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
- void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
- void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
- void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
- unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr);
- void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value,
- int *exception);
+ int (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
+ void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+ void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+ void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+ void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+ void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu);
+ void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value);
void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
+ bool (*get_if_flag)(struct kvm_vcpu *vcpu);
+
+ void (*flush_tlb_all)(struct kvm_vcpu *vcpu);
+ void (*flush_tlb_current)(struct kvm_vcpu *vcpu);
+#if IS_ENABLED(CONFIG_HYPERV)
+ int (*flush_remote_tlbs)(struct kvm *kvm);
+ int (*flush_remote_tlbs_range)(struct kvm *kvm, gfn_t gfn,
+ gfn_t nr_pages);
+#endif
- void (*tlb_flush)(struct kvm_vcpu *vcpu);
+ /*
+ * Flush any TLB entries associated with the given GVA.
+ * Does not need to flush GPA->HPA mappings.
+ * Can potentially get non-canonical addresses through INVLPGs, which
+ * the implementation may choose to ignore if appropriate.
+ */
+ void (*flush_tlb_gva)(struct kvm_vcpu *vcpu, gva_t addr);
- void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
- int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu);
- void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
+ /*
+ * Flush any TLB entries created by the guest. Like tlb_flush_gva(),
+ * does not need to flush GPA->HPA mappings.
+ */
+ void (*flush_tlb_guest)(struct kvm_vcpu *vcpu);
+
+ int (*vcpu_pre_run)(struct kvm_vcpu *vcpu);
+ enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu,
+ u64 run_flags);
+ int (*handle_exit)(struct kvm_vcpu *vcpu,
+ enum exit_fastpath_completion exit_fastpath);
+ int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
+ void (*update_emulated_instruction)(struct kvm_vcpu *vcpu);
void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
- u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
+ u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu);
void (*patch_hypercall)(struct kvm_vcpu *vcpu,
unsigned char *hypercall_addr);
- void (*set_irq)(struct kvm_vcpu *vcpu);
- void (*set_nmi)(struct kvm_vcpu *vcpu);
- void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
- bool has_error_code, u32 error_code);
- int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
- int (*nmi_allowed)(struct kvm_vcpu *vcpu);
+ void (*inject_irq)(struct kvm_vcpu *vcpu, bool reinjected);
+ void (*inject_nmi)(struct kvm_vcpu *vcpu);
+ void (*inject_exception)(struct kvm_vcpu *vcpu);
+ void (*cancel_injection)(struct kvm_vcpu *vcpu);
+ int (*interrupt_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
+ int (*nmi_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
+ bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
+ void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked);
+ /* Whether or not a virtual NMI is pending in hardware. */
+ bool (*is_vnmi_pending)(struct kvm_vcpu *vcpu);
+ /*
+ * Attempt to pend a virtual NMI in hardware. Returns %true on success
+ * to allow using static_call_ret0 as the fallback.
+ */
+ bool (*set_vnmi_pending)(struct kvm_vcpu *vcpu);
void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
void (*enable_irq_window)(struct kvm_vcpu *vcpu);
void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
+
+ const bool x2apic_icr_is_split;
+ const unsigned long required_apicv_inhibits;
+ bool allow_apicv_in_x2apic_without_x2apic_virtualization;
+ void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
+ void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr);
+ void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
+ void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu);
+ void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu);
+ void (*deliver_interrupt)(struct kvm_lapic *apic, int delivery_mode,
+ int trig_mode, int vector);
+ int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
- int (*get_tdp_level)(void);
- u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
+ int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr);
+ u8 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
+
+ void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa,
+ int root_level);
+
+ /* Update external mapping with page table link. */
+ int (*link_external_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
+ void *external_spt);
+ /* Update the external page table from spte getting set. */
+ int (*set_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
+ u64 mirror_spte);
+
+ /* Update external page tables for page table about to be freed. */
+ int (*free_external_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
+ void *external_spt);
+
+ /* Update external page table from spte getting removed, and flush TLB. */
+ void (*remove_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
+ u64 mirror_spte);
+
+ bool (*has_wbinvd_exit)(void);
+
+ u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu);
+ u64 (*get_l2_tsc_multiplier)(struct kvm_vcpu *vcpu);
+ void (*write_tsc_offset)(struct kvm_vcpu *vcpu);
+ void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu);
+
+ /*
+ * Retrieve somewhat arbitrary exit/entry information. Intended to
+ * be used only from within tracepoints or error paths.
+ */
+ void (*get_exit_info)(struct kvm_vcpu *vcpu, u32 *reason,
+ u64 *info1, u64 *info2,
+ u32 *intr_info, u32 *error_code);
+
+ void (*get_entry_info)(struct kvm_vcpu *vcpu,
+ u32 *intr_info, u32 *error_code);
+
+ int (*check_intercept)(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage,
+ struct x86_exception *exception);
+ void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu);
+
+ void (*update_cpu_dirty_logging)(struct kvm_vcpu *vcpu);
+
+ const struct kvm_x86_nested_ops *nested_ops;
+
+ void (*vcpu_blocking)(struct kvm_vcpu *vcpu);
+ void (*vcpu_unblocking)(struct kvm_vcpu *vcpu);
+
+ int (*pi_update_irte)(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+ unsigned int host_irq, uint32_t guest_irq,
+ struct kvm_vcpu *vcpu, u32 vector);
+ void (*pi_start_bypass)(struct kvm *kvm);
+ void (*apicv_pre_state_restore)(struct kvm_vcpu *vcpu);
+ void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu);
+ bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu);
+ bool (*protected_apic_has_interrupt)(struct kvm_vcpu *vcpu);
+
+ int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
+ bool *expired);
+ void (*cancel_hv_timer)(struct kvm_vcpu *vcpu);
+
+ void (*setup_mce)(struct kvm_vcpu *vcpu);
+
+#ifdef CONFIG_KVM_SMM
+ int (*smi_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
+ int (*enter_smm)(struct kvm_vcpu *vcpu, union kvm_smram *smram);
+ int (*leave_smm)(struct kvm_vcpu *vcpu, const union kvm_smram *smram);
+ void (*enable_smi_window)(struct kvm_vcpu *vcpu);
+#endif
+
+ int (*dev_get_attr)(u32 group, u64 attr, u64 *val);
+ int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp);
+ int (*vcpu_mem_enc_ioctl)(struct kvm_vcpu *vcpu, void __user *argp);
+ int (*vcpu_mem_enc_unlocked_ioctl)(struct kvm_vcpu *vcpu, void __user *argp);
+ int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp);
+ int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp);
+ int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
+ int (*vm_move_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
+ void (*guest_memory_reclaimed)(struct kvm *kvm);
+
+ int (*get_feature_msr)(u32 msr, u64 *data);
+
+ int (*check_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len);
+
+ bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu);
+ int (*enable_l2_tlb_flush)(struct kvm_vcpu *vcpu);
+
+ void (*migrate_timers)(struct kvm_vcpu *vcpu);
+ void (*recalc_intercepts)(struct kvm_vcpu *vcpu);
+ int (*complete_emulated_msr)(struct kvm_vcpu *vcpu, int err);
+
+ void (*vcpu_deliver_sipi_vector)(struct kvm_vcpu *vcpu, u8 vector);
+
+ /*
+ * Returns vCPU specific APICv inhibit reasons
+ */
+ unsigned long (*vcpu_get_apicv_inhibit_reasons)(struct kvm_vcpu *vcpu);
+
+ gva_t (*get_untagged_addr)(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags);
+ void *(*alloc_apic_backing_page)(struct kvm_vcpu *vcpu);
+ int (*gmem_prepare)(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order);
+ void (*gmem_invalidate)(kvm_pfn_t start, kvm_pfn_t end);
+ int (*gmem_max_mapping_level)(struct kvm *kvm, kvm_pfn_t pfn, bool is_private);
+};
+
+struct kvm_x86_nested_ops {
+ void (*leave_nested)(struct kvm_vcpu *vcpu);
+ bool (*is_exception_vmexit)(struct kvm_vcpu *vcpu, u8 vector,
+ u32 error_code);
+ int (*check_events)(struct kvm_vcpu *vcpu);
+ bool (*has_events)(struct kvm_vcpu *vcpu, bool for_injection);
+ void (*triple_fault)(struct kvm_vcpu *vcpu);
+ int (*get_state)(struct kvm_vcpu *vcpu,
+ struct kvm_nested_state __user *user_kvm_nested_state,
+ unsigned user_data_size);
+ int (*set_state)(struct kvm_vcpu *vcpu,
+ struct kvm_nested_state __user *user_kvm_nested_state,
+ struct kvm_nested_state *kvm_state);
+ bool (*get_nested_state_pages)(struct kvm_vcpu *vcpu);
+ int (*write_log_dirty)(struct kvm_vcpu *vcpu, gpa_t l2_gpa);
+
+ int (*enable_evmcs)(struct kvm_vcpu *vcpu,
+ uint16_t *vmcs_version);
+ uint16_t (*get_evmcs_version)(struct kvm_vcpu *vcpu);
+ void (*hv_inject_synthetic_vmexit_post_tlb_flush)(struct kvm_vcpu *vcpu);
+};
+
+struct kvm_x86_init_ops {
+ int (*hardware_setup)(void);
+ unsigned int (*handle_intel_pt_intr)(void);
+
+ struct kvm_x86_ops *runtime_ops;
+ struct kvm_pmu_ops *pmu_ops;
+};
+
+struct kvm_arch_async_pf {
+ u32 token;
+ gfn_t gfn;
+ unsigned long cr3;
+ bool direct_map;
+ u64 error_code;
+};
+
+extern u32 __read_mostly kvm_nr_uret_msrs;
+extern bool __read_mostly allow_smaller_maxphyaddr;
+extern bool __read_mostly enable_apicv;
+extern bool __read_mostly enable_ipiv;
+extern bool __read_mostly enable_device_posted_irqs;
+extern struct kvm_x86_ops kvm_x86_ops;
+
+#define kvm_x86_call(func) static_call(kvm_x86_##func)
+#define kvm_pmu_call(func) static_call(kvm_x86_pmu_##func)
+
+#define KVM_X86_OP(func) \
+ DECLARE_STATIC_CALL(kvm_x86_##func, *(((struct kvm_x86_ops *)0)->func));
+#define KVM_X86_OP_OPTIONAL KVM_X86_OP
+#define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP
+#include <asm/kvm-x86-ops.h>
+
+int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops);
+void kvm_x86_vendor_exit(void);
+
+#define __KVM_HAVE_ARCH_VM_ALLOC
+static inline struct kvm *kvm_arch_alloc_vm(void)
+{
+ return kvzalloc(kvm_x86_ops.vm_size, GFP_KERNEL_ACCOUNT);
+}
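kvm_arch_alloc_vm() sizes the allocation from kvm_x86_ops.vm_size rather than sizeof(struct kvm), which allows a vendor module to embed struct kvm at the start of its own VM structure. A sketch under made-up vendor names (this is not the actual VMX/SVM layout):

/*
 * Sketch: a vendor module embeds struct kvm as the first member and reports
 * the outer size via kvm_x86_ops.vm_size, so kvm_arch_alloc_vm() returns
 * enough memory for both the generic and the vendor-private state.
 */
struct my_vendor_vm {
	struct kvm kvm;		/* must be first so (struct kvm *) casts work */
	u64 vendor_private_state;
};

static struct kvm_x86_ops my_vendor_ops = {
	.name    = "my-vendor",
	.vm_size = sizeof(struct my_vendor_vm),
	/* remaining callbacks omitted from this sketch */
};

#define to_my_vendor_vm(kvm) container_of(kvm, struct my_vendor_vm, kvm)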
+
+#define __KVM_HAVE_ARCH_VM_FREE
+void kvm_arch_free_vm(struct kvm *kvm);
+
+#if IS_ENABLED(CONFIG_HYPERV)
+#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS
+static inline int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
+{
+ if (kvm_x86_ops.flush_remote_tlbs &&
+ !kvm_x86_call(flush_remote_tlbs)(kvm))
+ return 0;
+ else
+ return -ENOTSUPP;
+}
+
+#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS_RANGE
+static inline int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn,
+ u64 nr_pages)
+{
+ if (!kvm_x86_ops.flush_remote_tlbs_range)
+ return -EOPNOTSUPP;
+
+ return kvm_x86_call(flush_remote_tlbs_range)(kvm, gfn, nr_pages);
+}
+#endif /* CONFIG_HYPERV */
+
+enum kvm_intr_type {
+ /* Values are arbitrary, but must be non-zero. */
+ KVM_HANDLING_IRQ = 1,
+ KVM_HANDLING_NMI,
};
-extern struct kvm_x86_ops *kvm_x86_ops;
+/* Enable perf NMI and timer modes to work, and minimise false positives. */
+#define kvm_arch_pmi_in_guest(vcpu) \
+ ((vcpu) && (vcpu)->arch.handling_intr_from_guest && \
+ (!!in_nmi() == ((vcpu)->arch.handling_intr_from_guest == KVM_HANDLING_NMI)))
-int kvm_mmu_module_init(void);
-void kvm_mmu_module_exit(void);
+void __init kvm_mmu_x86_module_init(void);
+int kvm_mmu_vendor_module_init(void);
+void kvm_mmu_vendor_module_exit(void);
void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
int kvm_mmu_create(struct kvm_vcpu *vcpu);
-int kvm_mmu_setup(struct kvm_vcpu *vcpu);
-void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
-void kvm_mmu_set_base_ptes(u64 base_pte);
-void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
- u64 dirty_mask, u64 nx_mask, u64 x_mask);
-
-int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
-void kvm_mmu_zap_all(struct kvm *kvm);
-unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
-void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
+int kvm_mmu_init_vm(struct kvm *kvm);
+void kvm_mmu_uninit_vm(struct kvm *kvm);
+
+void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm,
+ struct kvm_memory_slot *slot);
+
+void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu);
+void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
+void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot,
+ int start_level);
+void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot,
+ int target_level);
+void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot,
+ u64 start, u64 end,
+ int target_level);
+void kvm_mmu_recover_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot);
+void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot);
+void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
+void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages);
+void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
const void *val, int bytes);
-int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
- gpa_t addr, unsigned long *ret);
-u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
extern bool tdp_enabled;
-enum emulation_result {
- EMULATE_DONE, /* no further processing */
- EMULATE_DO_MMIO, /* kvm_run filled with mmio request */
- EMULATE_FAIL, /* can't emulate this instruction */
-};
+u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
+/*
+ * EMULTYPE_NO_DECODE - Set when re-emulating an instruction (after completing
+ * userspace I/O) to indicate that the emulation context
+ * should be reused as is, i.e. skip initialization of
+ * emulation context, instruction fetch and decode.
+ *
+ * EMULTYPE_TRAP_UD - Set when emulating an intercepted #UD from hardware.
+ * Indicates that only select instructions (tagged with
+ * EmulateOnUD) should be emulated (to minimize the emulator
+ * attack surface). See also EMULTYPE_TRAP_UD_FORCED.
+ *
+ * EMULTYPE_SKIP - Set when emulating solely to skip an instruction, i.e. to
+ * decode the instruction length. For use *only* by
+ * kvm_x86_ops.skip_emulated_instruction() implementations if
+ * EMULTYPE_COMPLETE_USER_EXIT is not set.
+ *
+ * EMULTYPE_ALLOW_RETRY_PF - Set when the emulator should resume the guest to
+ * retry native execution under certain conditions,
+ * Can only be set in conjunction with EMULTYPE_PF.
+ *
+ * EMULTYPE_TRAP_UD_FORCED - Set when emulating an intercepted #UD that was
+ * triggered by KVM's magic "force emulation" prefix,
+ * which is opt in via module param (off by default).
+ * Bypasses EmulateOnUD restriction despite emulating
+ * due to an intercepted #UD (see EMULTYPE_TRAP_UD).
+ * Used to test the full emulator from userspace.
+ *
+ * EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware
+ * backdoor emulation, which is opt in via module param.
+ * VMware backdoor emulation handles select instructions
+ * and reinjects the #GP for all other cases.
+ *
+ * EMULTYPE_PF - Set when an intercepted #PF triggers the emulation, in which case
+ *		 the CR2/GPA value passed on the stack is valid.
+ *
+ * EMULTYPE_COMPLETE_USER_EXIT - Set when the emulator should update interruptibility
+ * state and inject single-step #DBs after skipping
+ * an instruction (after completing userspace I/O).
+ *
+ * EMULTYPE_WRITE_PF_TO_SP - Set when emulating an intercepted page fault that
+ * is attempting to write a gfn that contains one or
+ * more of the PTEs used to translate the write itself,
+ * and the owning page table is being shadowed by KVM.
+ * If emulation of the faulting instruction fails and
+ * this flag is set, KVM will exit to userspace instead
+ * of retrying emulation as KVM cannot make forward
+ * progress.
+ *
+ * If emulation fails for a write to guest page tables,
+ * KVM unprotects (zaps) the shadow page for the target
+ * gfn and resumes the guest to retry the non-emulatable
+ * instruction (on hardware). Unprotecting the gfn
+ * doesn't allow forward progress for a self-changing
+ * access because doing so also zaps the translation for
+ * the gfn, i.e. retrying the instruction will hit a
+ * !PRESENT fault, which results in a new shadow page
+ * and sends KVM back to square one.
+ *
+ * EMULTYPE_SKIP_SOFT_INT - Set in combination with EMULTYPE_SKIP to only skip
+ * an instruction if it could generate a given software
+ * interrupt, which must be encoded via
+ * EMULTYPE_SET_SOFT_INT_VECTOR().
+ */
#define EMULTYPE_NO_DECODE (1 << 0)
#define EMULTYPE_TRAP_UD (1 << 1)
#define EMULTYPE_SKIP (1 << 2)
-int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run,
- unsigned long cr2, u16 error_code, int emulation_type);
-void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
-void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
-void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
-void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
- unsigned long *rflags);
-
-unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr);
-void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value,
- unsigned long *rflags);
-void kvm_enable_efer_bits(u64);
-int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
-int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
+#define EMULTYPE_ALLOW_RETRY_PF (1 << 3)
+#define EMULTYPE_TRAP_UD_FORCED (1 << 4)
+#define EMULTYPE_VMWARE_GP (1 << 5)
+#define EMULTYPE_PF (1 << 6)
+#define EMULTYPE_COMPLETE_USER_EXIT (1 << 7)
+#define EMULTYPE_WRITE_PF_TO_SP (1 << 8)
+#define EMULTYPE_SKIP_SOFT_INT (1 << 9)
+
+#define EMULTYPE_SET_SOFT_INT_VECTOR(v) ((u32)((v) & 0xff) << 16)
+#define EMULTYPE_GET_SOFT_INT_VECTOR(e) (((e) >> 16) & 0xff)
+
+static inline bool kvm_can_emulate_event_vectoring(int emul_type)
+{
+ return !(emul_type & EMULTYPE_PF);
+}
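The EMULTYPE_* values form a bitmask, with an optional soft-interrupt vector encoded in bits 16-23 via the macros above. A small sketch of composing and decoding such a word, using only the macros defined in this header (the callers that would consume the value are assumed):

/* Illustration only: build and decode an emulation-type word. */
static int build_skip_softint_emul_type(u8 vector)
{
	return EMULTYPE_SKIP | EMULTYPE_SKIP_SOFT_INT |
	       EMULTYPE_SET_SOFT_INT_VECTOR(vector);
}

static bool emul_type_skips_vector(int emul_type, u8 vector)
{
	return (emul_type & EMULTYPE_SKIP_SOFT_INT) &&
	       EMULTYPE_GET_SOFT_INT_VECTOR(emul_type) == vector;
}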
-struct x86_emulate_ctxt;
+int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
+int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
+ void *insn, int insn_len);
+void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu,
+ u64 *data, u8 ndata);
+void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu);
+
+void kvm_prepare_event_vectoring_exit(struct kvm_vcpu *vcpu, gpa_t gpa);
+void kvm_prepare_unexpected_reason_exit(struct kvm_vcpu *vcpu, u64 exit_reason);
-int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
- int size, unsigned port);
-int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
- int size, unsigned long count, int down,
- gva_t address, int rep, unsigned port);
-void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
+void kvm_enable_efer_bits(u64);
+bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer);
+int kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data);
+int kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data);
+int __kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data);
+int __kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data);
+int kvm_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data);
+int kvm_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data);
+int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu);
+int kvm_emulate_rdmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg);
+int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu);
+int kvm_emulate_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg);
+int kvm_emulate_as_nop(struct kvm_vcpu *vcpu);
+int kvm_emulate_invd(struct kvm_vcpu *vcpu);
+int kvm_emulate_mwait(struct kvm_vcpu *vcpu);
+int kvm_handle_invalid_op(struct kvm_vcpu *vcpu);
+int kvm_emulate_monitor(struct kvm_vcpu *vcpu);
+
+int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in);
+int kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
int kvm_emulate_halt(struct kvm_vcpu *vcpu);
-int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
-int emulate_clts(struct kvm_vcpu *vcpu);
-int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
- unsigned long *dest);
-int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
- unsigned long value);
+int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu);
+int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu);
+int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
-int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
- int type_bits, int seg);
-
-int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason);
-
-void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
-void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
-void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
-void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
+void kvm_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
+int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
+void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
+
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
+ int reason, bool has_error_code, u32 error_code);
+
+void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0);
+void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4);
+int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
+int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
+int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
+int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
+unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr);
unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
-void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
+int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr);
+int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu);
+
+int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
+int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
-int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
-int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
+void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
+int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu);
void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
-void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
- u32 error_code);
+void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, unsigned long payload);
+void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned int nr,
+ bool has_error_code, u32 error_code);
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
+void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault);
+bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
+bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr);
+
+static inline int __kvm_irq_line_state(unsigned long *irq_state,
+ int irq_source_id, int level)
+{
+ /* Logical OR for level trig interrupt */
+ if (level)
+ __set_bit(irq_source_id, irq_state);
+ else
+ __clear_bit(irq_source_id, irq_state);
-int kvm_pic_set_irq(void *opaque, int irq, int level);
+ return !!(*irq_state);
+}
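Each bit of *irq_state tracks one IRQ source, so a level-triggered line stays asserted until every source that raised it has dropped it. A short sketch of the resulting behaviour (the source IDs are arbitrary):

/*
 * Sketch only: two independent sources drive the same level-triggered line;
 * the helper keeps reporting the line asserted until both have deasserted.
 */
static void irq_line_state_example(void)
{
	unsigned long line_state = 0;
	int asserted;

	asserted = __kvm_irq_line_state(&line_state, /*irq_source_id=*/0, 1);
	asserted = __kvm_irq_line_state(&line_state, /*irq_source_id=*/1, 1);
	asserted = __kvm_irq_line_state(&line_state, /*irq_source_id=*/0, 0);
	/* asserted is still 1 here: source 1 has not deasserted yet. */
	asserted = __kvm_irq_line_state(&line_state, /*irq_source_id=*/1, 0);
	/* now asserted is 0 and the line can be lowered. */
	(void)asserted;
}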
void kvm_inject_nmi(struct kvm_vcpu *vcpu);
+int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu);
-void fx_init(struct kvm_vcpu *vcpu);
-
-int emulator_write_emulated(unsigned long addr,
- const void *val,
- unsigned int bytes,
- struct kvm_vcpu *vcpu);
-
-unsigned long segment_base(u16 selector);
-
-void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
-void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
- const u8 *new, int bytes,
- bool guest_initiated);
-int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
-void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
-int kvm_mmu_load(struct kvm_vcpu *vcpu);
-void kvm_mmu_unload(struct kvm_vcpu *vcpu);
-void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
-
-int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
+void kvm_update_dr7(struct kvm_vcpu *vcpu);
-int kvm_fix_hypercall(struct kvm_vcpu *vcpu);
+bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ bool always_retry);
-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code);
-void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
-
-void kvm_enable_tdp(void);
-void kvm_disable_tdp(void);
-
-int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
-int complete_pio(struct kvm_vcpu *vcpu);
-
-struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn);
-
-static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
+static inline bool kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu,
+ gpa_t cr2_or_gpa)
{
- struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
-
- return (struct kvm_mmu_page *)page_private(page);
+ return __kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa, false);
}
-static inline u16 kvm_read_fs(void)
+void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
+ ulong roots_to_free);
+void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu);
+gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception);
+gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception);
+gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception);
+
+bool kvm_apicv_activated(struct kvm *kvm);
+bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu);
+void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu);
+void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
+ enum kvm_apicv_inhibit reason, bool set);
+void kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
+ enum kvm_apicv_inhibit reason, bool set);
+
+static inline void kvm_set_apicv_inhibit(struct kvm *kvm,
+ enum kvm_apicv_inhibit reason)
{
- u16 seg;
- asm("mov %%fs, %0" : "=g"(seg));
- return seg;
+ kvm_set_or_clear_apicv_inhibit(kvm, reason, true);
}
-static inline u16 kvm_read_gs(void)
+static inline void kvm_clear_apicv_inhibit(struct kvm *kvm,
+ enum kvm_apicv_inhibit reason)
{
- u16 seg;
- asm("mov %%gs, %0" : "=g"(seg));
- return seg;
+ kvm_set_or_clear_apicv_inhibit(kvm, reason, false);
}
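The set/clear wrappers pair naturally around a feature that conflicts with APIC virtualization. As a hedged example, toggling the PIT re-inject inhibit might look like the sketch below; the real logic lives in KVM's i8254 code and is only approximated here.

/*
 * Sketch: inhibit APICv/AVIC while PIT "re-inject" mode (which relies on EOI
 * intercepts) is in use, and lift the inhibit when it is turned off again.
 */
static void example_set_pit_reinject(struct kvm *kvm, bool reinject)
{
	if (reinject)
		kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PIT_REINJ);
	else
		kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PIT_REINJ);
}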
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
+ void *insn, int insn_len);
+void kvm_mmu_print_sptes(struct kvm_vcpu *vcpu, gpa_t gpa, const char *msg);
+void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
+void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ u64 addr, unsigned long roots);
+void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
+void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd);
+
+void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
+ int tdp_max_root_level, int tdp_huge_page_level);
+
+
+#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+#define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem)
+#endif
+
+#define kvm_arch_has_readonly_mem(kvm) (!(kvm)->arch.has_protected_state)
+
static inline u16 kvm_read_ldt(void)
{
u16 ldt;
@@ -685,75 +2320,26 @@ static inline u16 kvm_read_ldt(void)
return ldt;
}
-static inline void kvm_load_fs(u16 sel)
-{
- asm("mov %0, %%fs" : : "rm"(sel));
-}
-
-static inline void kvm_load_gs(u16 sel)
-{
- asm("mov %0, %%gs" : : "rm"(sel));
-}
-
static inline void kvm_load_ldt(u16 sel)
{
asm("lldt %0" : : "rm"(sel));
}
-static inline void kvm_get_idt(struct descriptor_table *table)
-{
- asm("sidt %0" : "=m"(*table));
-}
-
-static inline void kvm_get_gdt(struct descriptor_table *table)
-{
- asm("sgdt %0" : "=m"(*table));
-}
-
-static inline unsigned long kvm_read_tr_base(void)
-{
- u16 tr;
- asm("str %0" : "=g"(tr));
- return segment_base(tr);
-}
-
#ifdef CONFIG_X86_64
static inline unsigned long read_msr(unsigned long msr)
{
u64 value;
- rdmsrl(msr, value);
+ rdmsrq(msr, value);
return value;
}
#endif
-static inline void kvm_fx_save(struct i387_fxsave_struct *image)
-{
- asm("fxsave (%0)":: "r" (image));
-}
-
-static inline void kvm_fx_restore(struct i387_fxsave_struct *image)
-{
- asm("fxrstor (%0)":: "r" (image));
-}
-
-static inline void kvm_fx_finit(void)
-{
- asm("finit");
-}
-
-static inline u32 get_rdx_init_val(void)
-{
- return 0x600; /* P6 family */
-}
-
static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
{
kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
}
-#define MSR_IA32_TIME_STAMP_COUNTER 0x010
-
#define TSS_IOPB_BASE_OFFSET 0x66
#define TSS_BASE_SIZE 0x68
#define TSS_IOPB_SIZE (65536 / 8)
@@ -768,33 +2354,131 @@ enum {
TASK_SWITCH_GATE = 3,
};
-#define HF_GIF_MASK (1 << 0)
-#define HF_HIF_MASK (1 << 1)
-#define HF_VINTR_MASK (1 << 2)
-#define HF_NMI_MASK (1 << 3)
-#define HF_IRET_MASK (1 << 4)
+#define HF_GUEST_MASK (1 << 0) /* VCPU is in guest-mode */
+
+#ifdef CONFIG_KVM_SMM
+#define HF_SMM_MASK (1 << 1)
+#define HF_SMM_INSIDE_NMI_MASK (1 << 2)
+
+# define KVM_MAX_NR_ADDRESS_SPACES 2
+/* SMM is currently unsupported for guests with private memory. */
+# define kvm_arch_nr_memslot_as_ids(kvm) (kvm_arch_has_private_mem(kvm) ? 1 : 2)
+# define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0)
+# define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm)
+#else
+# define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, 0)
+#endif
+
+int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
+int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
+int kvm_cpu_has_extint(struct kvm_vcpu *v);
+int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
+int kvm_cpu_get_extint(struct kvm_vcpu *v);
+int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
+void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
+
+int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
+ unsigned long ipi_bitmap_high, u32 min,
+ unsigned long icr, int op_64_bit);
+
+int kvm_add_user_return_msr(u32 msr);
+int kvm_find_user_return_msr(u32 msr);
+int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask);
+u64 kvm_get_user_return_msr(unsigned int slot);
+
+static inline bool kvm_is_supported_user_return_msr(u32 msr)
+{
+ return kvm_find_user_return_msr(msr) >= 0;
+}
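The user-return MSR helpers follow a register-once, look-up, write-on-entry pattern. A hedged sketch of how a vendor module might use them; MSR_STAR is only an example and error handling is reduced to the minimum.

/*
 * Sketch of the expected calling pattern, not lifted from any vendor module:
 * register the MSR once at setup, then write the guest value through the
 * slot so the host value is restored on return to userspace.
 */
static int example_setup_user_return_msr(void)
{
	return kvm_add_user_return_msr(MSR_STAR);
}

static void example_load_guest_msr(u64 guest_val)
{
	int slot = kvm_find_user_return_msr(MSR_STAR);

	if (slot >= 0)
		kvm_set_user_return_msr(slot, guest_val, -1ull);
}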
+
+u64 kvm_scale_tsc(u64 tsc, u64 ratio);
+u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc);
+u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier);
+u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier);
+
+unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu);
+bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
+
+void kvm_make_scan_ioapic_request(struct kvm *kvm);
+void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
+ unsigned long *vcpu_bitmap);
+
+bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work);
+void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work);
+void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work);
+void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu);
+bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu);
+extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
+
+int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu);
+int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
+
+void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
+ u32 size);
+bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu);
+bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu);
+
+static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq)
+{
+ /* We can only post Fixed and LowPrio IRQs */
+ return (irq->delivery_mode == APIC_DM_FIXED ||
+ irq->delivery_mode == APIC_DM_LOWEST);
+}
+
+static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
+{
+ kvm_x86_call(vcpu_blocking)(vcpu);
+}
+
+static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
+{
+ kvm_x86_call(vcpu_unblocking)(vcpu);
+}
+
+static inline int kvm_cpu_get_apicid(int mps_cpu)
+{
+#ifdef CONFIG_X86_LOCAL_APIC
+ return default_cpu_present_to_apicid(mps_cpu);
+#else
+ WARN_ON_ONCE(1);
+ return BAD_APICID;
+#endif
+}
+
+int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
+
+#define KVM_CLOCK_VALID_FLAGS \
+ (KVM_CLOCK_TSC_STABLE | KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC)
+
+#define KVM_X86_VALID_QUIRKS \
+ (KVM_X86_QUIRK_LINT0_REENABLED | \
+ KVM_X86_QUIRK_CD_NW_CLEARED | \
+ KVM_X86_QUIRK_LAPIC_MMIO_HOLE | \
+ KVM_X86_QUIRK_OUT_7E_INC_RIP | \
+ KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT | \
+ KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \
+ KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS | \
+ KVM_X86_QUIRK_SLOT_ZAP_ALL | \
+ KVM_X86_QUIRK_STUFF_FEATURE_MSRS | \
+ KVM_X86_QUIRK_IGNORE_GUEST_PAT)
+
+#define KVM_X86_CONDITIONAL_QUIRKS \
+ (KVM_X86_QUIRK_CD_NW_CLEARED | \
+ KVM_X86_QUIRK_IGNORE_GUEST_PAT)
/*
- * Hardware virtualization extension instructions may fault if a
- * reboot turns off virtualization while processes are running.
- * Trap the fault and ignore the instruction if that happens.
+ * KVM previously used a u32 field in kvm_run to indicate the hypercall was
+ * initiated from long mode. KVM now sets bit 0 to indicate long mode, but the
+ * remaining 31 lower bits must be 0 to preserve ABI.
*/
-asmlinkage void kvm_handle_fault_on_reboot(void);
-
-#define __kvm_handle_fault_on_reboot(insn) \
- "666: " insn "\n\t" \
- ".pushsection .fixup, \"ax\" \n" \
- "667: \n\t" \
- __ASM_SIZE(push) " $666b \n\t" \
- "jmp kvm_handle_fault_on_reboot \n\t" \
- ".popsection \n\t" \
- ".pushsection __ex_table, \"a\" \n\t" \
- _ASM_PTR " 666b, 667b \n\t" \
- ".popsection"
-
-#define KVM_ARCH_WANT_MMU_NOTIFIER
-int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
-int kvm_age_hva(struct kvm *kvm, unsigned long hva);
-int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
+#define KVM_EXIT_HYPERCALL_MBZ GENMASK_ULL(31, 1)
+
+static inline bool kvm_arch_has_irq_bypass(void)
+{
+ return enable_device_posted_irqs;
+}
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h
new file mode 100644
index 000000000000..3d040741044b
--- /dev/null
+++ b/arch/x86/include/asm/kvm_page_track.h
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_KVM_PAGE_TRACK_H
+#define _ASM_X86_KVM_PAGE_TRACK_H
+
+#include <linux/kvm_types.h>
+
+#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
+/*
+ * The notifier represented by @kvm_page_track_notifier_node is linked into
+ * the head, which is notified when the guest triggers a tracked event.
+ *
+ * Write access on the head is protected by kvm->mmu_lock, read access
+ * is protected by track_srcu.
+ */
+struct kvm_page_track_notifier_head {
+ struct srcu_struct track_srcu;
+ struct hlist_head track_notifier_list;
+};
+
+struct kvm_page_track_notifier_node {
+ struct hlist_node node;
+
+ /*
+	 * Called when the guest writes to a write-tracked page, after write
+	 * emulation of that access has finished.
+	 *
+	 * @gpa: the physical address written by the guest.
+	 * @new: the data written to the address.
+ * @bytes: the written length.
+ * @node: this node
+ */
+ void (*track_write)(gpa_t gpa, const u8 *new, int bytes,
+ struct kvm_page_track_notifier_node *node);
+
+ /*
+ * Invoked when a memory region is removed from the guest. Or in KVM
+ * terms, when a memslot is deleted.
+ *
+ * @gfn: base gfn of the region being removed
+ * @nr_pages: number of pages in the to-be-removed region
+ * @node: this node
+ */
+ void (*track_remove_region)(gfn_t gfn, unsigned long nr_pages,
+ struct kvm_page_track_notifier_node *node);
+};
+
+int kvm_page_track_register_notifier(struct kvm *kvm,
+ struct kvm_page_track_notifier_node *n);
+void kvm_page_track_unregister_notifier(struct kvm *kvm,
+ struct kvm_page_track_notifier_node *n);
+
+int kvm_write_track_add_gfn(struct kvm *kvm, gfn_t gfn);
+int kvm_write_track_remove_gfn(struct kvm *kvm, gfn_t gfn);
+#else
+/*
+ * Allow defining a node in a structure even if page tracking is disabled, e.g.
+ * to play nice with testing headers via direct inclusion from the command line.
+ */
+struct kvm_page_track_notifier_node {};
+#endif /* CONFIG_KVM_EXTERNAL_WRITE_TRACKING */
+
+#endif
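
The page-track API above is small: a user registers a
kvm_page_track_notifier_node, marks gfns with kvm_write_track_add_gfn(), and
receives ->track_write() callbacks once write emulation completes. A hedged
sketch of a hypothetical in-kernel user (assumes
CONFIG_KVM_EXTERNAL_WRITE_TRACKING; all example_* names are illustrative):

static void example_track_write(gpa_t gpa, const u8 *new, int bytes,
				struct kvm_page_track_notifier_node *node)
{
	/* Runs after the emulated write to a tracked gfn has completed. */
	pr_debug("tracked write: gpa=0x%llx bytes=%d\n", gpa, bytes);
}

static struct kvm_page_track_notifier_node example_node = {
	.track_write = example_track_write,
};

static int example_track_gfn(struct kvm *kvm, gfn_t gfn)
{
	int r = kvm_page_track_register_notifier(kvm, &example_node);

	if (r)
		return r;
	return kvm_write_track_add_gfn(kvm, gfn);
}
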
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index b8a3305ae093..4a47c16e2df8 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -1,74 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_KVM_PARA_H
#define _ASM_X86_KVM_PARA_H
-/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It
- * should be used to determine that a VM is running under KVM.
- */
-#define KVM_CPUID_SIGNATURE 0x40000000
-
-/* This CPUID returns a feature bitmap in eax. Before enabling a particular
- * paravirtualization, the appropriate feature bit should be checked.
- */
-#define KVM_CPUID_FEATURES 0x40000001
-#define KVM_FEATURE_CLOCKSOURCE 0
-#define KVM_FEATURE_NOP_IO_DELAY 1
-#define KVM_FEATURE_MMU_OP 2
-
-#define MSR_KVM_WALL_CLOCK 0x11
-#define MSR_KVM_SYSTEM_TIME 0x12
-
-#define KVM_MAX_MMU_OP_BATCH 32
-
-/* Operations for KVM_HC_MMU_OP */
-#define KVM_MMU_OP_WRITE_PTE 1
-#define KVM_MMU_OP_FLUSH_TLB 2
-#define KVM_MMU_OP_RELEASE_PT 3
-
-/* Payload for KVM_HC_MMU_OP */
-struct kvm_mmu_op_header {
- __u32 op;
- __u32 pad;
-};
-
-struct kvm_mmu_op_write_pte {
- struct kvm_mmu_op_header header;
- __u64 pte_phys;
- __u64 pte_val;
-};
-
-struct kvm_mmu_op_flush_tlb {
- struct kvm_mmu_op_header header;
-};
-
-struct kvm_mmu_op_release_pt {
- struct kvm_mmu_op_header header;
- __u64 pt_phys;
-};
-
-#ifdef __KERNEL__
#include <asm/processor.h>
+#include <asm/alternative.h>
+#include <linux/interrupt.h>
+#include <uapi/asm/kvm_para.h>
-extern void kvmclock_init(void);
+#include <asm/tdx.h>
+#ifdef CONFIG_KVM_GUEST
+bool kvm_check_and_clear_guest_paused(void);
+#else
+static inline bool kvm_check_and_clear_guest_paused(void)
+{
+ return false;
+}
+#endif /* CONFIG_KVM_GUEST */
-/* This instruction is vmcall. On non-VT architectures, it will generate a
- * trap that we will then rewrite to the appropriate instruction.
- */
-#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1"
+#define KVM_HYPERCALL \
+ ALTERNATIVE("vmcall", "vmmcall", X86_FEATURE_VMMCALL)
-/* For KVM hypercalls, a three-byte sequence of either the vmrun or the vmmrun
+/* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall
* instruction. The hypervisor may replace it with something else but only the
* instructions are guaranteed to be supported.
*
* Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively.
* The hypercall number should be placed in rax and the return value will be
- * placed in rax. No other registers will be clobbered unless explicited
+ * placed in rax. No other registers will be clobbered unless explicitly
* noted by the particular hypercall.
*/
static inline long kvm_hypercall0(unsigned int nr)
{
long ret;
+
+ if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST))
+ return tdx_kvm_hypercall(nr, 0, 0, 0, 0);
+
asm volatile(KVM_HYPERCALL
: "=a"(ret)
: "a"(nr)
@@ -79,6 +48,10 @@ static inline long kvm_hypercall0(unsigned int nr)
static inline long kvm_hypercall1(unsigned int nr, unsigned long p1)
{
long ret;
+
+ if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST))
+ return tdx_kvm_hypercall(nr, p1, 0, 0, 0);
+
asm volatile(KVM_HYPERCALL
: "=a"(ret)
: "a"(nr), "b"(p1)
@@ -90,6 +63,10 @@ static inline long kvm_hypercall2(unsigned int nr, unsigned long p1,
unsigned long p2)
{
long ret;
+
+ if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST))
+ return tdx_kvm_hypercall(nr, p1, p2, 0, 0);
+
asm volatile(KVM_HYPERCALL
: "=a"(ret)
: "a"(nr), "b"(p1), "c"(p2)
@@ -101,6 +78,10 @@ static inline long kvm_hypercall3(unsigned int nr, unsigned long p1,
unsigned long p2, unsigned long p3)
{
long ret;
+
+ if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST))
+ return tdx_kvm_hypercall(nr, p1, p2, p3, 0);
+
asm volatile(KVM_HYPERCALL
: "=a"(ret)
: "a"(nr), "b"(p1), "c"(p2), "d"(p3)
@@ -113,6 +94,10 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
unsigned long p4)
{
long ret;
+
+ if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST))
+ return tdx_kvm_hypercall(nr, p1, p2, p3, p4);
+
asm volatile(KVM_HYPERCALL
: "=a"(ret)
: "a"(nr), "b"(p1), "c"(p2), "d"(p3), "S"(p4)
@@ -120,28 +105,73 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
return ret;
}
-static inline int kvm_para_available(void)
+static inline long kvm_sev_hypercall3(unsigned int nr, unsigned long p1,
+ unsigned long p2, unsigned long p3)
{
- unsigned int eax, ebx, ecx, edx;
- char signature[13];
+ long ret;
- cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx);
- memcpy(signature + 0, &ebx, 4);
- memcpy(signature + 4, &ecx, 4);
- memcpy(signature + 8, &edx, 4);
- signature[12] = 0;
+ asm volatile("vmmcall"
+ : "=a"(ret)
+ : "a"(nr), "b"(p1), "c"(p2), "d"(p3)
+ : "memory");
+ return ret;
+}
- if (strcmp(signature, "KVMKVMKVM") == 0)
- return 1;
+#ifdef CONFIG_KVM_GUEST
+void kvmclock_init(void);
+void kvmclock_disable(void);
+bool kvm_para_available(void);
+unsigned int kvm_arch_para_features(void);
+unsigned int kvm_arch_para_hints(void);
+void kvm_async_pf_task_wait_schedule(u32 token);
+u32 kvm_read_and_reset_apf_flags(void);
+bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token);
- return 0;
+DECLARE_STATIC_KEY_FALSE(kvm_async_pf_enabled);
+
+static __always_inline bool kvm_handle_async_pf(struct pt_regs *regs, u32 token)
+{
+ if (static_branch_unlikely(&kvm_async_pf_enabled))
+ return __kvm_handle_async_pf(regs, token);
+ else
+ return false;
+}
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+void __init kvm_spinlock_init(void);
+#else /* !CONFIG_PARAVIRT_SPINLOCKS */
+static inline void kvm_spinlock_init(void)
+{
+}
+#endif /* CONFIG_PARAVIRT_SPINLOCKS */
+
+#else /* CONFIG_KVM_GUEST */
+#define kvm_async_pf_task_wait_schedule(T) do {} while(0)
+
+static inline bool kvm_para_available(void)
+{
+ return false;
}
static inline unsigned int kvm_arch_para_features(void)
{
- return cpuid_eax(KVM_CPUID_FEATURES);
+ return 0;
}
+static inline unsigned int kvm_arch_para_hints(void)
+{
+ return 0;
+}
+
+static inline u32 kvm_read_and_reset_apf_flags(void)
+{
+ return 0;
+}
+
+static __always_inline bool kvm_handle_async_pf(struct pt_regs *regs, u32 token)
+{
+ return false;
+}
#endif
#endif /* _ASM_X86_KVM_PARA_H */
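
The hypercall wrappers above follow the convention documented in the comment
(number in rax, arguments in rbx/rcx/rdx/rsi, result in rax) and transparently
reroute TDX guests through tdx_kvm_hypercall(). A minimal guest-side sketch,
using KVM_HC_KICK_CPU purely for illustration and gating on the advertised
feature bit (the example_* helper is not part of this patch):

/* Illustrative only: kick a halted vCPU if the PV unhalt feature is exposed. */
static void example_kick_cpu(int apicid)
{
	if (!kvm_para_available())
		return;

	if (kvm_arch_para_features() & (1 << KVM_FEATURE_PV_UNHALT))
		kvm_hypercall2(KVM_HC_KICK_CPU, 0, apicid);
}
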
diff --git a/arch/x86/include/asm/kvm_types.h b/arch/x86/include/asm/kvm_types.h
new file mode 100644
index 000000000000..d7c704ed1be9
--- /dev/null
+++ b/arch/x86/include/asm/kvm_types.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_KVM_TYPES_H
+#define _ASM_X86_KVM_TYPES_H
+
+#if IS_MODULE(CONFIG_KVM_AMD) && IS_MODULE(CONFIG_KVM_INTEL)
+#define KVM_SUB_MODULES kvm-amd,kvm-intel
+#elif IS_MODULE(CONFIG_KVM_AMD)
+#define KVM_SUB_MODULES kvm-amd
+#elif IS_MODULE(CONFIG_KVM_INTEL)
+#define KVM_SUB_MODULES kvm-intel
+#else
+#undef KVM_SUB_MODULES
+/*
+ * Don't export symbols for KVM without vendor modules, as kvm.ko is built iff
+ * at least one vendor module is enabled.
+ */
+#define EXPORT_SYMBOL_FOR_KVM(symbol)
+#endif
+
+#define KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE 40
+
+#endif /* _ASM_X86_KVM_TYPES_H */
diff --git a/arch/x86/include/asm/kvm_vcpu_regs.h b/arch/x86/include/asm/kvm_vcpu_regs.h
new file mode 100644
index 000000000000..1af2cb59233b
--- /dev/null
+++ b/arch/x86/include/asm/kvm_vcpu_regs.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_KVM_VCPU_REGS_H
+#define _ASM_X86_KVM_VCPU_REGS_H
+
+#define __VCPU_REGS_RAX 0
+#define __VCPU_REGS_RCX 1
+#define __VCPU_REGS_RDX 2
+#define __VCPU_REGS_RBX 3
+#define __VCPU_REGS_RSP 4
+#define __VCPU_REGS_RBP 5
+#define __VCPU_REGS_RSI 6
+#define __VCPU_REGS_RDI 7
+
+#ifdef CONFIG_X86_64
+#define __VCPU_REGS_R8 8
+#define __VCPU_REGS_R9 9
+#define __VCPU_REGS_R10 10
+#define __VCPU_REGS_R11 11
+#define __VCPU_REGS_R12 12
+#define __VCPU_REGS_R13 13
+#define __VCPU_REGS_R14 14
+#define __VCPU_REGS_R15 15
+#endif
+
+#endif /* _ASM_X86_KVM_VCPU_REGS_H */
diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_x86_emulate.h
deleted file mode 100644
index b7ed2c423116..000000000000
--- a/arch/x86/include/asm/kvm_x86_emulate.h
+++ /dev/null
@@ -1,187 +0,0 @@
-/******************************************************************************
- * x86_emulate.h
- *
- * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
- *
- * Copyright (c) 2005 Keir Fraser
- *
- * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
- */
-
-#ifndef _ASM_X86_KVM_X86_EMULATE_H
-#define _ASM_X86_KVM_X86_EMULATE_H
-
-struct x86_emulate_ctxt;
-
-/*
- * x86_emulate_ops:
- *
- * These operations represent the instruction emulator's interface to memory.
- * There are two categories of operation: those that act on ordinary memory
- * regions (*_std), and those that act on memory regions known to require
- * special treatment or emulation (*_emulated).
- *
- * The emulator assumes that an instruction accesses only one 'emulated memory'
- * location, that this location is the given linear faulting address (cr2), and
- * that this is one of the instruction's data operands. Instruction fetches and
- * stack operations are assumed never to access emulated memory. The emulator
- * automatically deduces which operand of a string-move operation is accessing
- * emulated memory, and assumes that the other operand accesses normal memory.
- *
- * NOTES:
- * 1. The emulator isn't very smart about emulated vs. standard memory.
- * 'Emulated memory' access addresses should be checked for sanity.
- * 'Normal memory' accesses may fault, and the caller must arrange to
- * detect and handle reentrancy into the emulator via recursive faults.
- * Accesses may be unaligned and may cross page boundaries.
- * 2. If the access fails (cannot emulate, or a standard access faults) then
- * it is up to the memop to propagate the fault to the guest VM via
- * some out-of-band mechanism, unknown to the emulator. The memop signals
- * failure by returning X86EMUL_PROPAGATE_FAULT to the emulator, which will
- * then immediately bail.
- * 3. Valid access sizes are 1, 2, 4 and 8 bytes. On x86/32 systems only
- * cmpxchg8b_emulated need support 8-byte accesses.
- * 4. The emulator cannot handle 64-bit mode emulation on an x86/32 system.
- */
-/* Access completed successfully: continue emulation as normal. */
-#define X86EMUL_CONTINUE 0
-/* Access is unhandleable: bail from emulation and return error to caller. */
-#define X86EMUL_UNHANDLEABLE 1
-/* Terminate emulation but return success to the caller. */
-#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */
-#define X86EMUL_RETRY_INSTR 2 /* retry the instruction for some reason */
-#define X86EMUL_CMPXCHG_FAILED 2 /* cmpxchg did not see expected value */
-struct x86_emulate_ops {
- /*
- * read_std: Read bytes of standard (non-emulated/special) memory.
- * Used for instruction fetch, stack operations, and others.
- * @addr: [IN ] Linear address from which to read.
- * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
- * @bytes: [IN ] Number of bytes to read from memory.
- */
- int (*read_std)(unsigned long addr, void *val,
- unsigned int bytes, struct kvm_vcpu *vcpu);
-
- /*
- * read_emulated: Read bytes from emulated/special memory area.
- * @addr: [IN ] Linear address from which to read.
- * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
- * @bytes: [IN ] Number of bytes to read from memory.
- */
- int (*read_emulated)(unsigned long addr,
- void *val,
- unsigned int bytes,
- struct kvm_vcpu *vcpu);
-
- /*
- * write_emulated: Read bytes from emulated/special memory area.
- * @addr: [IN ] Linear address to which to write.
- * @val: [IN ] Value to write to memory (low-order bytes used as
- * required).
- * @bytes: [IN ] Number of bytes to write to memory.
- */
- int (*write_emulated)(unsigned long addr,
- const void *val,
- unsigned int bytes,
- struct kvm_vcpu *vcpu);
-
- /*
- * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an
- * emulated/special memory area.
- * @addr: [IN ] Linear address to access.
- * @old: [IN ] Value expected to be current at @addr.
- * @new: [IN ] Value to write to @addr.
- * @bytes: [IN ] Number of bytes to access using CMPXCHG.
- */
- int (*cmpxchg_emulated)(unsigned long addr,
- const void *old,
- const void *new,
- unsigned int bytes,
- struct kvm_vcpu *vcpu);
-
-};
-
-/* Type, address-of, and value of an instruction's operand. */
-struct operand {
- enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type;
- unsigned int bytes;
- unsigned long val, orig_val, *ptr;
-};
-
-struct fetch_cache {
- u8 data[15];
- unsigned long start;
- unsigned long end;
-};
-
-struct decode_cache {
- u8 twobyte;
- u8 b;
- u8 lock_prefix;
- u8 rep_prefix;
- u8 op_bytes;
- u8 ad_bytes;
- u8 rex_prefix;
- struct operand src;
- struct operand src2;
- struct operand dst;
- bool has_seg_override;
- u8 seg_override;
- unsigned int d;
- unsigned long regs[NR_VCPU_REGS];
- unsigned long eip;
- /* modrm */
- u8 modrm;
- u8 modrm_mod;
- u8 modrm_reg;
- u8 modrm_rm;
- u8 use_modrm_ea;
- bool rip_relative;
- unsigned long modrm_ea;
- void *modrm_ptr;
- unsigned long modrm_val;
- struct fetch_cache fetch;
-};
-
-#define X86_SHADOW_INT_MOV_SS 1
-#define X86_SHADOW_INT_STI 2
-
-struct x86_emulate_ctxt {
- /* Register state before/after emulation. */
- struct kvm_vcpu *vcpu;
-
- unsigned long eflags;
- /* Emulated execution mode, represented by an X86EMUL_MODE value. */
- int mode;
- u32 cs_base;
-
- /* interruptibility state, as a result of execution of STI or MOV SS */
- int interruptibility;
-
- /* decode cache */
- struct decode_cache decode;
-};
-
-/* Repeat String Operation Prefix */
-#define REPE_PREFIX 1
-#define REPNE_PREFIX 2
-
-/* Execution mode, passed to the emulator. */
-#define X86EMUL_MODE_REAL 0 /* Real mode. */
-#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */
-#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */
-#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */
-
-/* Host execution mode. */
-#if defined(CONFIG_X86_32)
-#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32
-#elif defined(CONFIG_X86_64)
-#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
-#endif
-
-int x86_decode_insn(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops);
-int x86_emulate_insn(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops);
-
-#endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/include/asm/kvmclock.h b/arch/x86/include/asm/kvmclock.h
new file mode 100644
index 000000000000..f163176d6f7f
--- /dev/null
+++ b/arch/x86/include/asm/kvmclock.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_KVM_CLOCK_H
+#define _ASM_X86_KVM_CLOCK_H
+
+#include <linux/percpu.h>
+
+DECLARE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
+
+static __always_inline struct pvclock_vcpu_time_info *this_cpu_pvti(void)
+{
+ return &this_cpu_read(hv_clock_per_cpu)->pvti;
+}
+
+static inline struct pvclock_vsyscall_time_info *this_cpu_hvclock(void)
+{
+ return this_cpu_read(hv_clock_per_cpu);
+}
+
+#endif /* _ASM_X86_KVM_CLOCK_H */
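
this_cpu_pvti() hands back the per-CPU pvclock area published by the
hypervisor. A simplified sketch of the usual version/seqcount read protocol
(the real readers also sample the TSC and apply the scale/shift; preemption is
assumed to be disabled by the caller):

static u64 example_read_system_time(void)
{
	struct pvclock_vcpu_time_info *pvti = this_cpu_pvti();
	u32 version;
	u64 ret;

	do {
		version = pvti->version;
		smp_rmb();
		ret = pvti->system_time;
		smp_rmb();
	} while ((version & 1) || version != pvti->version);

	return ret;
}
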
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
deleted file mode 100644
index 5136dad57cbb..000000000000
--- a/arch/x86/include/asm/lguest.h
+++ /dev/null
@@ -1,98 +0,0 @@
-#ifndef _ASM_X86_LGUEST_H
-#define _ASM_X86_LGUEST_H
-
-#define GDT_ENTRY_LGUEST_CS 10
-#define GDT_ENTRY_LGUEST_DS 11
-#define LGUEST_CS (GDT_ENTRY_LGUEST_CS * 8)
-#define LGUEST_DS (GDT_ENTRY_LGUEST_DS * 8)
-
-#ifndef __ASSEMBLY__
-#include <asm/desc.h>
-
-#define GUEST_PL 1
-
-/* Every guest maps the core switcher code. */
-#define SHARED_SWITCHER_PAGES \
- DIV_ROUND_UP(end_switcher_text - start_switcher_text, PAGE_SIZE)
-/* Pages for switcher itself, then two pages per cpu */
-#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids)
-
-/* We map at -4M (-2M for PAE) for ease of mapping (one PTE page). */
-#ifdef CONFIG_X86_PAE
-#define SWITCHER_ADDR 0xFFE00000
-#else
-#define SWITCHER_ADDR 0xFFC00000
-#endif
-
-/* Found in switcher.S */
-extern unsigned long default_idt_entries[];
-
-/* Declarations for definitions in lguest_guest.S */
-extern char lguest_noirq_start[], lguest_noirq_end[];
-extern const char lgstart_cli[], lgend_cli[];
-extern const char lgstart_sti[], lgend_sti[];
-extern const char lgstart_popf[], lgend_popf[];
-extern const char lgstart_pushf[], lgend_pushf[];
-extern const char lgstart_iret[], lgend_iret[];
-
-extern void lguest_iret(void);
-extern void lguest_init(void);
-
-struct lguest_regs {
- /* Manually saved part. */
- unsigned long eax, ebx, ecx, edx;
- unsigned long esi, edi, ebp;
- unsigned long gs;
- unsigned long fs, ds, es;
- unsigned long trapnum, errcode;
- /* Trap pushed part */
- unsigned long eip;
- unsigned long cs;
- unsigned long eflags;
- unsigned long esp;
- unsigned long ss;
-};
-
-/* This is a guest-specific page (mapped ro) into the guest. */
-struct lguest_ro_state {
- /* Host information we need to restore when we switch back. */
- u32 host_cr3;
- struct desc_ptr host_idt_desc;
- struct desc_ptr host_gdt_desc;
- u32 host_sp;
-
- /* Fields which are used when guest is running. */
- struct desc_ptr guest_idt_desc;
- struct desc_ptr guest_gdt_desc;
- struct x86_hw_tss guest_tss;
- struct desc_struct guest_idt[IDT_ENTRIES];
- struct desc_struct guest_gdt[GDT_ENTRIES];
-};
-
-struct lg_cpu_arch {
- /* The GDT entries copied into lguest_ro_state when running. */
- struct desc_struct gdt[GDT_ENTRIES];
-
- /* The IDT entries: some copied into lguest_ro_state when running. */
- struct desc_struct idt[IDT_ENTRIES];
-
- /* The address of the last guest-visible pagefault (ie. cr2). */
- unsigned long last_pagefault;
-};
-
-static inline void lguest_set_ts(void)
-{
- u32 cr0;
-
- cr0 = read_cr0();
- if (!(cr0 & 8))
- write_cr0(cr0 | 8);
-}
-
-/* Full 4G segment descriptors, suitable for CS and DS. */
-#define FULL_EXEC_SEGMENT ((struct desc_struct){ { {0x0000ffff, 0x00cf9b00} } })
-#define FULL_SEGMENT ((struct desc_struct){ { {0x0000ffff, 0x00cf9300} } })
-
-#endif /* __ASSEMBLY__ */
-
-#endif /* _ASM_X86_LGUEST_H */
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
deleted file mode 100644
index ba0eed8aa1a6..000000000000
--- a/arch/x86/include/asm/lguest_hcall.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/* Architecture specific portion of the lguest hypercalls */
-#ifndef _ASM_X86_LGUEST_HCALL_H
-#define _ASM_X86_LGUEST_HCALL_H
-
-#define LHCALL_FLUSH_ASYNC 0
-#define LHCALL_LGUEST_INIT 1
-#define LHCALL_SHUTDOWN 2
-#define LHCALL_NEW_PGTABLE 4
-#define LHCALL_FLUSH_TLB 5
-#define LHCALL_LOAD_IDT_ENTRY 6
-#define LHCALL_SET_STACK 7
-#define LHCALL_TS 8
-#define LHCALL_SET_CLOCKEVENT 9
-#define LHCALL_HALT 10
-#define LHCALL_SET_PMD 13
-#define LHCALL_SET_PTE 14
-#define LHCALL_SET_PGD 15
-#define LHCALL_LOAD_TLS 16
-#define LHCALL_NOTIFY 17
-#define LHCALL_LOAD_GDT_ENTRY 18
-#define LHCALL_SEND_INTERRUPTS 19
-
-#define LGUEST_TRAP_ENTRY 0x1F
-
-/* Argument number 3 to LHCALL_LGUEST_SHUTDOWN */
-#define LGUEST_SHUTDOWN_POWEROFF 1
-#define LGUEST_SHUTDOWN_RESTART 2
-
-#ifndef __ASSEMBLY__
-#include <asm/hw_irq.h>
-#include <asm/kvm_para.h>
-
-/*G:030
- * But first, how does our Guest contact the Host to ask for privileged
- * operations? There are two ways: the direct way is to make a "hypercall",
- * to make requests of the Host Itself.
- *
- * We use the KVM hypercall mechanism, though completely different hypercall
- * numbers. Seventeen hypercalls are available: the hypercall number is put in
- * the %eax register, and the arguments (when required) are placed in %ebx,
- * %ecx, %edx and %esi. If a return value makes sense, it's returned in %eax.
- *
- * Grossly invalid calls result in Sudden Death at the hands of the vengeful
- * Host, rather than returning failure. This reflects Winston Churchill's
- * definition of a gentleman: "someone who is only rude intentionally".
-:*/
-
-/* Can't use our min() macro here: needs to be a constant */
-#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)
-
-#define LHCALL_RING_SIZE 64
-struct hcall_args {
- /* These map directly onto eax/ebx/ecx/edx/esi in struct lguest_regs */
- unsigned long arg0, arg1, arg2, arg3, arg4;
-};
-
-#endif /* !__ASSEMBLY__ */
-#endif /* _ASM_X86_LGUEST_HCALL_H */
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
index 12d55e773eb6..9d38ae744a2e 100644
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
@@ -1,66 +1,155 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_LINKAGE_H
#define _ASM_X86_LINKAGE_H
#include <linux/stringify.h>
+#include <asm/ibt.h>
#undef notrace
#define notrace __attribute__((no_instrument_function))
-#ifdef CONFIG_X86_32
-#define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0)))
+#ifdef CONFIG_64BIT
/*
- * For 32-bit UML - mark functions implemented in assembly that use
- * regparm input parameters:
+ * The generic version tends to create spurious ENDBR instructions under
+ * certain conditions.
*/
-#define asmregparm __attribute__((regparm(3)))
+#define _THIS_IP_ ({ unsigned long __here; asm ("lea 0(%%rip), %0" : "=r" (__here)); __here; })
+#endif
+
+#ifdef CONFIG_X86_32
+#define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0)))
+#endif /* CONFIG_X86_32 */
+
+#define __ALIGN .balign CONFIG_FUNCTION_ALIGNMENT, 0x90;
+#define __ALIGN_STR __stringify(__ALIGN)
+
+#if defined(CONFIG_CALL_PADDING) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO)
+#define FUNCTION_PADDING .skip CONFIG_FUNCTION_ALIGNMENT, 0x90;
+#else
+#define FUNCTION_PADDING
+#endif
+
+#if (CONFIG_FUNCTION_ALIGNMENT > 8) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO)
+# define __FUNC_ALIGN __ALIGN; FUNCTION_PADDING
+#else
+# define __FUNC_ALIGN __ALIGN
+#endif
+
+#define ASM_FUNC_ALIGN __stringify(__FUNC_ALIGN)
+#define SYM_F_ALIGN __FUNC_ALIGN
+
+#ifdef __ASSEMBLER__
+
+#if defined(CONFIG_MITIGATION_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO)
+#define RET jmp __x86_return_thunk
+#else /* CONFIG_MITIGATION_RETPOLINE */
+#ifdef CONFIG_MITIGATION_SLS
+#define RET ret; int3
+#else
+#define RET ret
+#endif
+#endif /* CONFIG_MITIGATION_RETPOLINE */
+
+#else /* __ASSEMBLER__ */
+
+#if defined(CONFIG_MITIGATION_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO)
+#define ASM_RET "jmp __x86_return_thunk\n\t"
+#else /* CONFIG_MITIGATION_RETPOLINE */
+#ifdef CONFIG_MITIGATION_SLS
+#define ASM_RET "ret; int3\n\t"
+#else
+#define ASM_RET "ret\n\t"
+#endif
+#endif /* CONFIG_MITIGATION_RETPOLINE */
+
+#endif /* __ASSEMBLER__ */
/*
- * Make sure the compiler doesn't do anything stupid with the
- * arguments on the stack - they are owned by the *caller*, not
- * the callee. This just fools gcc into not spilling into them,
- * and keeps it from doing tailcall recursion and/or using the
- * stack slots for temporaries, since they are live and "used"
- * all the way to the end of the function.
+ * Depending on -fpatchable-function-entry=N,N usage (CONFIG_CALL_PADDING) the
+ * CFI symbol layout changes.
+ *
+ * Without CALL_THUNKS:
+ *
+ * .align FUNCTION_ALIGNMENT
+ * __cfi_##name:
+ * .skip FUNCTION_PADDING, 0x90
+ * .byte 0xb8
+ * .long __kcfi_typeid_##name
+ * name:
+ *
+ * With CALL_THUNKS:
+ *
+ * .align FUNCTION_ALIGNMENT
+ * __cfi_##name:
+ * .byte 0xb8
+ * .long __kcfi_typeid_##name
+ * .skip FUNCTION_PADDING, 0x90
+ * name:
*
- * NOTE! On x86-64, all the arguments are in registers, so this
- * only matters on a 32-bit kernel.
+ * In both cases the whole thing is FUNCTION_ALIGNMENT aligned and sized.
*/
-#define asmlinkage_protect(n, ret, args...) \
- __asmlinkage_protect##n(ret, ##args)
-#define __asmlinkage_protect_n(ret, args...) \
- __asm__ __volatile__ ("" : "=r" (ret) : "0" (ret), ##args)
-#define __asmlinkage_protect0(ret) \
- __asmlinkage_protect_n(ret)
-#define __asmlinkage_protect1(ret, arg1) \
- __asmlinkage_protect_n(ret, "g" (arg1))
-#define __asmlinkage_protect2(ret, arg1, arg2) \
- __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2))
-#define __asmlinkage_protect3(ret, arg1, arg2, arg3) \
- __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3))
-#define __asmlinkage_protect4(ret, arg1, arg2, arg3, arg4) \
- __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \
- "g" (arg4))
-#define __asmlinkage_protect5(ret, arg1, arg2, arg3, arg4, arg5) \
- __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \
- "g" (arg4), "g" (arg5))
-#define __asmlinkage_protect6(ret, arg1, arg2, arg3, arg4, arg5, arg6) \
- __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \
- "g" (arg4), "g" (arg5), "g" (arg6))
-#endif /* CONFIG_X86_32 */
-
-#ifdef __ASSEMBLY__
+#ifdef CONFIG_CALL_PADDING
+#define CFI_PRE_PADDING
+#define CFI_POST_PADDING .skip CONFIG_FUNCTION_PADDING_BYTES, 0x90;
+#else
+#define CFI_PRE_PADDING .skip CONFIG_FUNCTION_PADDING_BYTES, 0x90;
+#define CFI_POST_PADDING
+#endif
-#define GLOBAL(name) \
- .globl name; \
- name:
+#define __CFI_TYPE(name) \
+ SYM_START(__cfi_##name, SYM_L_LOCAL, SYM_A_NONE) \
+ CFI_PRE_PADDING \
+ .byte 0xb8 ASM_NL \
+ .long __kcfi_typeid_##name ASM_NL \
+ CFI_POST_PADDING \
+ SYM_FUNC_END(__cfi_##name)
-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_ALIGNMENT_16)
-#define __ALIGN .p2align 4, 0x90
-#define __ALIGN_STR __stringify(__ALIGN)
+/* UML needs to be able to override memcpy() and friends for KASAN. */
+#ifdef CONFIG_UML
+# define SYM_FUNC_ALIAS_MEMFUNC SYM_FUNC_ALIAS_WEAK
+#else
+# define SYM_FUNC_ALIAS_MEMFUNC SYM_FUNC_ALIAS
#endif
-#endif /* __ASSEMBLY__ */
+/* SYM_TYPED_FUNC_START -- use for indirectly called globals, w/ CFI type */
+#define SYM_TYPED_FUNC_START(name) \
+ SYM_TYPED_START(name, SYM_L_GLOBAL, SYM_F_ALIGN) \
+ ENDBR
+
+/* SYM_FUNC_START -- use for global functions */
+#define SYM_FUNC_START(name) \
+ SYM_START(name, SYM_L_GLOBAL, SYM_F_ALIGN)
+
+/* SYM_FUNC_START_NOALIGN -- use for global functions, w/o alignment */
+#define SYM_FUNC_START_NOALIGN(name) \
+ SYM_START(name, SYM_L_GLOBAL, SYM_A_NONE)
+
+/* SYM_FUNC_START_LOCAL -- use for local functions */
+#define SYM_FUNC_START_LOCAL(name) \
+ SYM_START(name, SYM_L_LOCAL, SYM_F_ALIGN)
+
+/* SYM_FUNC_START_LOCAL_NOALIGN -- use for local functions, w/o alignment */
+#define SYM_FUNC_START_LOCAL_NOALIGN(name) \
+ SYM_START(name, SYM_L_LOCAL, SYM_A_NONE)
+
+/* SYM_FUNC_START_WEAK -- use for weak functions */
+#define SYM_FUNC_START_WEAK(name) \
+ SYM_START(name, SYM_L_WEAK, SYM_F_ALIGN)
+
+/* SYM_FUNC_START_WEAK_NOALIGN -- use for weak functions, w/o alignment */
+#define SYM_FUNC_START_WEAK_NOALIGN(name) \
+ SYM_START(name, SYM_L_WEAK, SYM_A_NONE)
+
+/*
+ * Expose 'sym' to the startup code in arch/x86/boot/startup/, by emitting an
+ * alias prefixed with __pi_
+ */
+#ifdef __ASSEMBLER__
+#define SYM_PIC_ALIAS(sym) SYM_ALIAS(__pi_ ## sym, sym, SYM_L_GLOBAL)
+#else
+#define SYM_PIC_ALIAS(sym) extern typeof(sym) __PASTE(__pi_, sym) __alias(sym)
+#endif
#endif /* _ASM_X86_LINKAGE_H */
diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h
index 47b9b6f19057..59aa966dc212 100644
--- a/arch/x86/include/asm/local.h
+++ b/arch/x86/include/asm/local.h
@@ -1,10 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_LOCAL_H
#define _ASM_X86_LOCAL_H
#include <linux/percpu.h>
-#include <asm/system.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <asm/asm.h>
typedef struct {
@@ -51,14 +51,9 @@ static inline void local_sub(long i, local_t *l)
* true if the result is zero, or false for all
* other cases.
*/
-static inline int local_sub_and_test(long i, local_t *l)
+static inline bool local_sub_and_test(long i, local_t *l)
{
- unsigned char c;
-
- asm volatile(_ASM_SUB "%2,%0; sete %1"
- : "+m" (l->a.counter), "=qm" (c)
- : "ir" (i) : "memory");
- return c;
+ return GEN_BINARY_RMWcc(_ASM_SUB, l->a.counter, e, "er", i);
}
/**
@@ -69,14 +64,9 @@ static inline int local_sub_and_test(long i, local_t *l)
* returns true if the result is 0, or false for all other
* cases.
*/
-static inline int local_dec_and_test(local_t *l)
+static inline bool local_dec_and_test(local_t *l)
{
- unsigned char c;
-
- asm volatile(_ASM_DEC "%0; sete %1"
- : "+m" (l->a.counter), "=qm" (c)
- : : "memory");
- return c != 0;
+ return GEN_UNARY_RMWcc(_ASM_DEC, l->a.counter, e);
}
/**
@@ -87,14 +77,9 @@ static inline int local_dec_and_test(local_t *l)
* and returns true if the result is zero, or false for all
* other cases.
*/
-static inline int local_inc_and_test(local_t *l)
+static inline bool local_inc_and_test(local_t *l)
{
- unsigned char c;
-
- asm volatile(_ASM_INC "%0; sete %1"
- : "+m" (l->a.counter), "=qm" (c)
- : : "memory");
- return c != 0;
+ return GEN_UNARY_RMWcc(_ASM_INC, l->a.counter, e);
}
/**
@@ -106,14 +91,9 @@ static inline int local_inc_and_test(local_t *l)
* if the result is negative, or false when
* result is greater than or equal to zero.
*/
-static inline int local_add_negative(long i, local_t *l)
+static inline bool local_add_negative(long i, local_t *l)
{
- unsigned char c;
-
- asm volatile(_ASM_ADD "%2,%0; sets %1"
- : "+m" (l->a.counter), "=qm" (c)
- : "ir" (i) : "memory");
- return c;
+ return GEN_BINARY_RMWcc(_ASM_ADD, l->a.counter, s, "er", i);
}
/**
@@ -125,27 +105,11 @@ static inline int local_add_negative(long i, local_t *l)
*/
static inline long local_add_return(long i, local_t *l)
{
- long __i;
-#ifdef CONFIG_M386
- unsigned long flags;
- if (unlikely(boot_cpu_data.x86 <= 3))
- goto no_xadd;
-#endif
- /* Modern 486+ processor */
- __i = i;
+ long __i = i;
asm volatile(_ASM_XADD "%0, %1;"
: "+r" (i), "+m" (l->a.counter)
: : "memory");
return i + __i;
-
-#ifdef CONFIG_M386
-no_xadd: /* Legacy 386 processor */
- local_irq_save(flags);
- __i = local_read(l);
- local_set(l, i + __i);
- local_irq_restore(flags);
- return i + __i;
-#endif
}
static inline long local_sub_return(long i, local_t *l)
@@ -156,34 +120,54 @@ static inline long local_sub_return(long i, local_t *l)
#define local_inc_return(l) (local_add_return(1, l))
#define local_dec_return(l) (local_sub_return(1, l))
-#define local_cmpxchg(l, o, n) \
- (cmpxchg_local(&((l)->a.counter), (o), (n)))
-/* Always has a lock prefix */
-#define local_xchg(l, n) (xchg(&((l)->a.counter), (n)))
+static inline long local_cmpxchg(local_t *l, long old, long new)
+{
+ return cmpxchg_local(&l->a.counter, old, new);
+}
+
+static inline bool local_try_cmpxchg(local_t *l, long *old, long new)
+{
+ return try_cmpxchg_local(&l->a.counter,
+ (typeof(l->a.counter) *) old, new);
+}
+
+/*
+ * Implement local_xchg using CMPXCHG instruction without the LOCK prefix.
+ * XCHG is expensive due to the implied LOCK prefix. The processor
+ * cannot prefetch cachelines if XCHG is used.
+ */
+static __always_inline long
+local_xchg(local_t *l, long n)
+{
+ long c = local_read(l);
+
+ do { } while (!local_try_cmpxchg(l, &c, n));
+
+ return c;
+}
/**
- * local_add_unless - add unless the number is a given value
+ * local_add_unless - add unless the number is already a given value
* @l: pointer of type local_t
* @a: the amount to add to l...
* @u: ...unless l is equal to u.
*
- * Atomically adds @a to @l, so long as it was not @u.
- * Returns non-zero if @l was not @u, and zero otherwise.
+ * Atomically adds @a to @l, if @l was not already @u.
+ * Returns true if the addition was done.
*/
-#define local_add_unless(l, a, u) \
-({ \
- long c, old; \
- c = local_read((l)); \
- for (;;) { \
- if (unlikely(c == (u))) \
- break; \
- old = local_cmpxchg((l), c, c + (a)); \
- if (likely(old == c)) \
- break; \
- c = old; \
- } \
- c != (u); \
-})
+static __always_inline bool
+local_add_unless(local_t *l, long a, long u)
+{
+ long c = local_read(l);
+
+ do {
+ if (unlikely(c == u))
+ return false;
+ } while (!local_try_cmpxchg(l, &c, c + a));
+
+ return true;
+}
+
#define local_inc_not_zero(l) local_add_unless((l), 1, 0)
/* On x86_32, these are no better than the atomic variants.
@@ -195,41 +179,4 @@ static inline long local_sub_return(long i, local_t *l)
#define __local_add(i, l) local_add((i), (l))
#define __local_sub(i, l) local_sub((i), (l))
-/* Use these for per-cpu local_t variables: on some archs they are
- * much more efficient than these naive implementations. Note they take
- * a variable, not an address.
- *
- * X86_64: This could be done better if we moved the per cpu data directly
- * after GS.
- */
-
-/* Need to disable preemption for the cpu local counters otherwise we could
- still access a variable of a previous CPU in a non atomic way. */
-#define cpu_local_wrap_v(l) \
-({ \
- local_t res__; \
- preempt_disable(); \
- res__ = (l); \
- preempt_enable(); \
- res__; \
-})
-#define cpu_local_wrap(l) \
-({ \
- preempt_disable(); \
- (l); \
- preempt_enable(); \
-}) \
-
-#define cpu_local_read(l) cpu_local_wrap_v(local_read(&__get_cpu_var((l))))
-#define cpu_local_set(l, i) cpu_local_wrap(local_set(&__get_cpu_var((l)), (i)))
-#define cpu_local_inc(l) cpu_local_wrap(local_inc(&__get_cpu_var((l))))
-#define cpu_local_dec(l) cpu_local_wrap(local_dec(&__get_cpu_var((l))))
-#define cpu_local_add(i, l) cpu_local_wrap(local_add((i), &__get_cpu_var((l))))
-#define cpu_local_sub(i, l) cpu_local_wrap(local_sub((i), &__get_cpu_var((l))))
-
-#define __cpu_local_inc(l) cpu_local_inc((l))
-#define __cpu_local_dec(l) cpu_local_dec((l))
-#define __cpu_local_add(i, l) cpu_local_add((i), (l))
-#define __cpu_local_sub(i, l) cpu_local_sub((i), (l))
-
#endif /* _ASM_X86_LOCAL_H */
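
local_t trades the LOCK prefix for the guarantee that all updates happen on
the owning CPU, which is why the cmpxchg-based helpers above avoid XCHG. A
small usage sketch for a per-CPU counter bumped from interrupt context (the
example_* names are illustrative only):

static DEFINE_PER_CPU(local_t, example_events);

/* Called on the owning CPU only, e.g. from an interrupt handler. */
static void example_count_event(void)
{
	local_inc(this_cpu_ptr(&example_events));
}

/* Cross-CPU readers only observe the counter, they never modify it. */
static long example_read_events(int cpu)
{
	return local_read(per_cpu_ptr(&example_events, cpu));
}
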
diff --git a/arch/x86/include/asm/mach_timer.h b/arch/x86/include/asm/mach_timer.h
index 853728519ae9..044daf6fb8f6 100644
--- a/arch/x86/include/asm/mach_timer.h
+++ b/arch/x86/include/asm/mach_timer.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Machine specific calibrate_tsc() for generic.
* Split out from timer_tsc.c by Osamu Tomita <tomita@cinet.co.jp>
@@ -15,7 +16,7 @@
#define CALIBRATE_TIME_MSEC 30 /* 30 msecs */
#define CALIBRATE_LATCH \
- ((CLOCK_TICK_RATE * CALIBRATE_TIME_MSEC + 1000/2)/1000)
+ ((PIT_TICK_RATE * CALIBRATE_TIME_MSEC + 1000/2)/1000)
static inline void mach_prepare_counter(void)
{
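
For reference: with PIT_TICK_RATE at 1193182 Hz and CALIBRATE_TIME_MSEC of 30,
the rounded expression above gives CALIBRATE_LATCH = (1193182 * 30 + 500) /
1000 = 35795 PIT ticks for the calibration window; switching the macro from
CLOCK_TICK_RATE to PIT_TICK_RATE makes the PIT rate explicit.
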
diff --git a/arch/x86/include/asm/mach_traps.h b/arch/x86/include/asm/mach_traps.h
index f7920601e472..e39a517467cd 100644
--- a/arch/x86/include/asm/mach_traps.h
+++ b/arch/x86/include/asm/mach_traps.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Machine specific NMI handling for generic.
* Split out from traps.c by Osamu Tomita <tomita@cinet.co.jp>
@@ -7,9 +8,19 @@
#include <asm/mc146818rtc.h>
-static inline unsigned char get_nmi_reason(void)
+#define NMI_REASON_PORT 0x61
+
+#define NMI_REASON_SERR 0x80
+#define NMI_REASON_IOCHK 0x40
+#define NMI_REASON_MASK (NMI_REASON_SERR | NMI_REASON_IOCHK)
+
+#define NMI_REASON_CLEAR_SERR 0x04
+#define NMI_REASON_CLEAR_IOCHK 0x08
+#define NMI_REASON_CLEAR_MASK 0x0f
+
+static inline unsigned char default_get_nmi_reason(void)
{
- return inb(0x61);
+ return inb(NMI_REASON_PORT);
}
static inline void reassert_nmi(void)
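
The new NMI_REASON_* constants name the bits in the byte read from I/O port
0x61. A simplified sketch of how a handler might decode and acknowledge them
(the in-kernel NMI code pulses the clear bits more carefully; the example_*
helper is illustrative only):

static void example_handle_nmi_reason(void)
{
	unsigned char reason = default_get_nmi_reason();

	if (reason & NMI_REASON_SERR)
		pr_emerg("NMI: PCI SERR asserted\n");
	if (reason & NMI_REASON_IOCHK)
		pr_emerg("NMI: I/O channel check\n");

	/* Set the clear bits to acknowledge the latched condition (simplified). */
	reason &= NMI_REASON_CLEAR_MASK;
	outb(reason | NMI_REASON_CLEAR_SERR | NMI_REASON_CLEAR_IOCHK,
	     NMI_REASON_PORT);
}
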
diff --git a/arch/x86/include/asm/math_emu.h b/arch/x86/include/asm/math_emu.h
index 031f6266f425..3c42743083ed 100644
--- a/arch/x86/include/asm/math_emu.h
+++ b/arch/x86/include/asm/math_emu.h
@@ -1,8 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_MATH_EMU_H
#define _ASM_X86_MATH_EMU_H
#include <asm/ptrace.h>
-#include <asm/vm86.h>
/* This structure matches the layout of the data saved to the stack
following a device-not-present interrupt, part of it saved
@@ -10,9 +10,6 @@
*/
struct math_emu_info {
long ___orig_eip;
- union {
- struct pt_regs *regs;
- struct kernel_vm86_regs *vm86;
- };
+ struct pt_regs *regs;
};
#endif /* _ASM_X86_MATH_EMU_H */
diff --git a/arch/x86/include/asm/mc146818rtc.h b/arch/x86/include/asm/mc146818rtc.h
index 01fdf5674e24..6115bb3d5795 100644
--- a/arch/x86/include/asm/mc146818rtc.h
+++ b/arch/x86/include/asm/mc146818rtc.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Machine dependent access functions for RTC registers.
*/
@@ -5,16 +6,14 @@
#define _ASM_X86_MC146818RTC_H
#include <asm/io.h>
-#include <asm/system.h>
#include <asm/processor.h>
-#include <linux/mc146818rtc.h>
#ifndef RTC_PORT
#define RTC_PORT(x) (0x70 + (x))
#define RTC_ALWAYS_BCD 1 /* RTC operates in binary mode */
#endif
-#if defined(CONFIG_X86_32) && defined(__HAVE_ARCH_CMPXCHG)
+#if defined(CONFIG_X86_32)
/*
* This lock provides nmi access to the CMOS/RTC registers. It has some
* special properties. It is owned by a CPU and stores the index register
@@ -81,8 +80,8 @@ static inline unsigned char current_lock_cmos_reg(void)
#else
#define lock_cmos_prefix(reg) do {} while (0)
#define lock_cmos_suffix(reg) do {} while (0)
-#define lock_cmos(reg)
-#define unlock_cmos()
+#define lock_cmos(reg) do { } while (0)
+#define unlock_cmos() do { } while (0)
#define do_i_have_lock_cmos() 0
#define current_lock_cmos_reg() 0
#endif
@@ -96,8 +95,8 @@ static inline unsigned char current_lock_cmos_reg(void)
unsigned char rtc_cmos_read(unsigned char addr);
void rtc_cmos_write(unsigned char val, unsigned char addr);
-extern int mach_set_rtc_mmss(unsigned long nowtime);
-extern unsigned long mach_get_cmos_time(void);
+extern int mach_set_cmos_time(const struct timespec64 *now);
+extern void mach_get_cmos_time(struct timespec64 *now);
#define RTC_IRQ 8
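
rtc_cmos_read()/rtc_cmos_write() wrap the index/data register pair at
RTC_PORT(0)/RTC_PORT(1). A tiny sketch reading the RTC seconds register (CMOS
register 0x00); per the RTC_ALWAYS_BCD note above the raw value is BCD, so the
sketch assumes <linux/bcd.h> for the conversion:

/* Illustrative only: fetch the current RTC seconds value. */
static unsigned int example_read_rtc_seconds(void)
{
	unsigned char sec = rtc_cmos_read(0x00);

	return bcd2bin(sec);
}
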
diff --git a/arch/x86/include/asm/mca.h b/arch/x86/include/asm/mca.h
deleted file mode 100644
index eedbb6cc1efb..000000000000
--- a/arch/x86/include/asm/mca.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8 -*- */
-
-/* Platform specific MCA defines */
-#ifndef _ASM_X86_MCA_H
-#define _ASM_X86_MCA_H
-
-/* Maximal number of MCA slots - actually, some machines have less, but
- * they all have sufficient number of POS registers to cover 8.
- */
-#define MCA_MAX_SLOT_NR 8
-
-/* Most machines have only one MCA bus. The only multiple bus machines
- * I know have at most two */
-#define MAX_MCA_BUSSES 2
-
-#define MCA_PRIMARY_BUS 0
-#define MCA_SECONDARY_BUS 1
-
-/* Dummy slot numbers on primary MCA for integrated functions */
-#define MCA_INTEGSCSI (MCA_MAX_SLOT_NR)
-#define MCA_INTEGVIDEO (MCA_MAX_SLOT_NR+1)
-#define MCA_MOTHERBOARD (MCA_MAX_SLOT_NR+2)
-
-/* Dummy POS values for integrated functions */
-#define MCA_DUMMY_POS_START 0x10000
-#define MCA_INTEGSCSI_POS (MCA_DUMMY_POS_START+1)
-#define MCA_INTEGVIDEO_POS (MCA_DUMMY_POS_START+2)
-#define MCA_MOTHERBOARD_POS (MCA_DUMMY_POS_START+3)
-
-/* MCA registers */
-
-#define MCA_MOTHERBOARD_SETUP_REG 0x94
-#define MCA_ADAPTER_SETUP_REG 0x96
-#define MCA_POS_REG(n) (0x100+(n))
-
-#define MCA_ENABLED 0x01 /* POS 2, set if adapter enabled */
-
-/* Max number of adapters, including both slots and various integrated
- * things.
- */
-#define MCA_NUMADAPTERS (MCA_MAX_SLOT_NR+3)
-
-#endif /* _ASM_X86_MCA_H */
diff --git a/arch/x86/include/asm/mca_dma.h b/arch/x86/include/asm/mca_dma.h
deleted file mode 100644
index 45271aef82dd..000000000000
--- a/arch/x86/include/asm/mca_dma.h
+++ /dev/null
@@ -1,201 +0,0 @@
-#ifndef _ASM_X86_MCA_DMA_H
-#define _ASM_X86_MCA_DMA_H
-
-#include <asm/io.h>
-#include <linux/ioport.h>
-
-/*
- * Microchannel specific DMA stuff. DMA on an MCA machine is fairly similar to
- * standard PC dma, but it certainly has its quirks. DMA register addresses
- * are in a different place and there are some added functions. Most of this
- * should be pretty obvious on inspection. Note that the user must divide
- * count by 2 when using 16-bit dma; that is not handled by these functions.
- *
- * Ramen Noodles are yummy.
- *
- * 1998 Tymm Twillman <tymm@computer.org>
- */
-
-/*
- * Registers that are used by the DMA controller; FN is the function register
- * (tell the controller what to do) and EXE is the execution register (how
- * to do it)
- */
-
-#define MCA_DMA_REG_FN 0x18
-#define MCA_DMA_REG_EXE 0x1A
-
-/*
- * Functions that the DMA controller can do
- */
-
-#define MCA_DMA_FN_SET_IO 0x00
-#define MCA_DMA_FN_SET_ADDR 0x20
-#define MCA_DMA_FN_GET_ADDR 0x30
-#define MCA_DMA_FN_SET_COUNT 0x40
-#define MCA_DMA_FN_GET_COUNT 0x50
-#define MCA_DMA_FN_GET_STATUS 0x60
-#define MCA_DMA_FN_SET_MODE 0x70
-#define MCA_DMA_FN_SET_ARBUS 0x80
-#define MCA_DMA_FN_MASK 0x90
-#define MCA_DMA_FN_RESET_MASK 0xA0
-#define MCA_DMA_FN_MASTER_CLEAR 0xD0
-
-/*
- * Modes (used by setting MCA_DMA_FN_MODE in the function register)
- *
- * Note that the MODE_READ is read from memory (write to device), and
- * MODE_WRITE is vice-versa.
- */
-
-#define MCA_DMA_MODE_XFER 0x04 /* read by default */
-#define MCA_DMA_MODE_READ 0x04 /* same as XFER */
-#define MCA_DMA_MODE_WRITE 0x08 /* OR with MODE_XFER to use */
-#define MCA_DMA_MODE_IO 0x01 /* DMA from IO register */
-#define MCA_DMA_MODE_16 0x40 /* 16 bit xfers */
-
-
-/**
- * mca_enable_dma - channel to enable DMA on
- * @dmanr: DMA channel
- *
- * Enable the MCA bus DMA on a channel. This can be called from
- * IRQ context.
- */
-
-static inline void mca_enable_dma(unsigned int dmanr)
-{
- outb(MCA_DMA_FN_RESET_MASK | dmanr, MCA_DMA_REG_FN);
-}
-
-/**
- * mca_disble_dma - channel to disable DMA on
- * @dmanr: DMA channel
- *
- * Enable the MCA bus DMA on a channel. This can be called from
- * IRQ context.
- */
-
-static inline void mca_disable_dma(unsigned int dmanr)
-{
- outb(MCA_DMA_FN_MASK | dmanr, MCA_DMA_REG_FN);
-}
-
-/**
- * mca_set_dma_addr - load a 24bit DMA address
- * @dmanr: DMA channel
- * @a: 24bit bus address
- *
- * Load the address register in the DMA controller. This has a 24bit
- * limitation (16Mb).
- */
-
-static inline void mca_set_dma_addr(unsigned int dmanr, unsigned int a)
-{
- outb(MCA_DMA_FN_SET_ADDR | dmanr, MCA_DMA_REG_FN);
- outb(a & 0xff, MCA_DMA_REG_EXE);
- outb((a >> 8) & 0xff, MCA_DMA_REG_EXE);
- outb((a >> 16) & 0xff, MCA_DMA_REG_EXE);
-}
-
-/**
- * mca_get_dma_addr - load a 24bit DMA address
- * @dmanr: DMA channel
- *
- * Read the address register in the DMA controller. This has a 24bit
- * limitation (16Mb). The return is a bus address.
- */
-
-static inline unsigned int mca_get_dma_addr(unsigned int dmanr)
-{
- unsigned int addr;
-
- outb(MCA_DMA_FN_GET_ADDR | dmanr, MCA_DMA_REG_FN);
- addr = inb(MCA_DMA_REG_EXE);
- addr |= inb(MCA_DMA_REG_EXE) << 8;
- addr |= inb(MCA_DMA_REG_EXE) << 16;
-
- return addr;
-}
-
-/**
- * mca_set_dma_count - load a 16bit transfer count
- * @dmanr: DMA channel
- * @count: count
- *
- * Set the DMA count for this channel. This can be up to 64Kbytes.
- * Setting a count of zero will not do what you expect.
- */
-
-static inline void mca_set_dma_count(unsigned int dmanr, unsigned int count)
-{
- count--; /* transfers one more than count -- correct for this */
-
- outb(MCA_DMA_FN_SET_COUNT | dmanr, MCA_DMA_REG_FN);
- outb(count & 0xff, MCA_DMA_REG_EXE);
- outb((count >> 8) & 0xff, MCA_DMA_REG_EXE);
-}
-
-/**
- * mca_get_dma_residue - get the remaining bytes to transfer
- * @dmanr: DMA channel
- *
- * This function returns the number of bytes left to transfer
- * on this DMA channel.
- */
-
-static inline unsigned int mca_get_dma_residue(unsigned int dmanr)
-{
- unsigned short count;
-
- outb(MCA_DMA_FN_GET_COUNT | dmanr, MCA_DMA_REG_FN);
- count = 1 + inb(MCA_DMA_REG_EXE);
- count += inb(MCA_DMA_REG_EXE) << 8;
-
- return count;
-}
-
-/**
- * mca_set_dma_io - set the port for an I/O transfer
- * @dmanr: DMA channel
- * @io_addr: an I/O port number
- *
- * Unlike the ISA bus DMA controllers the DMA on MCA bus can transfer
- * with an I/O port target.
- */
-
-static inline void mca_set_dma_io(unsigned int dmanr, unsigned int io_addr)
-{
- /*
- * DMA from a port address -- set the io address
- */
-
- outb(MCA_DMA_FN_SET_IO | dmanr, MCA_DMA_REG_FN);
- outb(io_addr & 0xff, MCA_DMA_REG_EXE);
- outb((io_addr >> 8) & 0xff, MCA_DMA_REG_EXE);
-}
-
-/**
- * mca_set_dma_mode - set the DMA mode
- * @dmanr: DMA channel
- * @mode: mode to set
- *
- * The DMA controller supports several modes. The mode values you can
- * set are-
- *
- * %MCA_DMA_MODE_READ when reading from the DMA device.
- *
- * %MCA_DMA_MODE_WRITE to writing to the DMA device.
- *
- * %MCA_DMA_MODE_IO to do DMA to or from an I/O port.
- *
- * %MCA_DMA_MODE_16 to do 16bit transfers.
- */
-
-static inline void mca_set_dma_mode(unsigned int dmanr, unsigned int mode)
-{
- outb(MCA_DMA_FN_SET_MODE | dmanr, MCA_DMA_REG_FN);
- outb(mode, MCA_DMA_REG_EXE);
-}
-
-#endif /* _ASM_X86_MCA_DMA_H */
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 5cdd8d100ec9..2d98886de09a 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -1,65 +1,177 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_MCE_H
#define _ASM_X86_MCE_H
-#include <linux/types.h>
-#include <asm/ioctls.h>
+#include <uapi/asm/mce.h>
/*
* Machine Check support for x86
*/
+/* MCG_CAP register defines */
#define MCG_BANKCNT_MASK 0xff /* Number of Banks */
-#define MCG_CTL_P (1ULL<<8) /* MCG_CAP register available */
-#define MCG_EXT_P (1ULL<<9) /* Extended registers available */
-#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */
+#define MCG_CTL_P BIT_ULL(8) /* MCG_CTL register available */
+#define MCG_EXT_P BIT_ULL(9) /* Extended registers available */
+#define MCG_CMCI_P BIT_ULL(10) /* CMCI supported */
+#define MCG_SEAM_NR BIT_ULL(12) /* MCG_STATUS_SEAM_NR supported */
#define MCG_EXT_CNT_MASK 0xff0000 /* Number of Extended registers */
#define MCG_EXT_CNT_SHIFT 16
#define MCG_EXT_CNT(c) (((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT)
-#define MCG_SER_P (1ULL<<24) /* MCA recovery/new status bits */
-
-#define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */
-#define MCG_STATUS_EIPV (1ULL<<1) /* ip points to correct instruction */
-#define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */
-
-#define MCI_STATUS_VAL (1ULL<<63) /* valid error */
-#define MCI_STATUS_OVER (1ULL<<62) /* previous errors lost */
-#define MCI_STATUS_UC (1ULL<<61) /* uncorrected error */
-#define MCI_STATUS_EN (1ULL<<60) /* error enabled */
-#define MCI_STATUS_MISCV (1ULL<<59) /* misc error reg. valid */
-#define MCI_STATUS_ADDRV (1ULL<<58) /* addr reg. valid */
-#define MCI_STATUS_PCC (1ULL<<57) /* processor context corrupt */
-#define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */
-#define MCI_STATUS_AR (1ULL<<55) /* Action required */
-
-/* MISC register defines */
-#define MCM_ADDR_SEGOFF 0 /* segment offset */
-#define MCM_ADDR_LINEAR 1 /* linear address */
-#define MCM_ADDR_PHYS 2 /* physical address */
-#define MCM_ADDR_MEM 3 /* memory address */
-#define MCM_ADDR_GENERIC 7 /* generic */
-
-/* Fields are zero when not available */
-struct mce {
- __u64 status;
- __u64 misc;
- __u64 addr;
- __u64 mcgstatus;
- __u64 ip;
- __u64 tsc; /* cpu time stamp counter */
- __u64 time; /* wall time_t when error was detected */
- __u8 cpuvendor; /* cpu vendor as encoded in system.h */
- __u8 pad1;
- __u16 pad2;
- __u32 cpuid; /* CPUID 1 EAX */
- __u8 cs; /* code segment */
- __u8 bank; /* machine check bank */
- __u8 cpu; /* cpu number; obsolete; use extcpu now */
- __u8 finished; /* entry is valid */
- __u32 extcpu; /* linux cpu number that detected the error */
- __u32 socketid; /* CPU socket ID */
- __u32 apicid; /* CPU initial apic ID */
- __u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */
-};
+#define MCG_SER_P BIT_ULL(24) /* MCA recovery/new status bits */
+#define MCG_ELOG_P BIT_ULL(26) /* Extended error log supported */
+#define MCG_LMCE_P BIT_ULL(27) /* Local machine check supported */
+
+/* MCG_STATUS register defines */
+#define MCG_STATUS_RIPV BIT_ULL(0) /* restart ip valid */
+#define MCG_STATUS_EIPV BIT_ULL(1) /* ip points to correct instruction */
+#define MCG_STATUS_MCIP BIT_ULL(2) /* machine check in progress */
+#define MCG_STATUS_LMCES BIT_ULL(3) /* LMCE signaled */
+#define MCG_STATUS_SEAM_NR BIT_ULL(12) /* Machine check inside SEAM non-root mode */
+
+/* MCG_EXT_CTL register defines */
+#define MCG_EXT_CTL_LMCE_EN BIT_ULL(0) /* Enable LMCE */
+
+/* MCi_STATUS register defines */
+#define MCI_STATUS_VAL BIT_ULL(63) /* valid error */
+#define MCI_STATUS_OVER BIT_ULL(62) /* previous errors lost */
+#define MCI_STATUS_UC BIT_ULL(61) /* uncorrected error */
+#define MCI_STATUS_EN BIT_ULL(60) /* error enabled */
+#define MCI_STATUS_MISCV BIT_ULL(59) /* misc error reg. valid */
+#define MCI_STATUS_ADDRV BIT_ULL(58) /* addr reg. valid */
+#define MCI_STATUS_PCC BIT_ULL(57) /* processor context corrupt */
+#define MCI_STATUS_S BIT_ULL(56) /* Signaled machine check */
+#define MCI_STATUS_AR BIT_ULL(55) /* Action required */
+#define MCI_STATUS_CEC_SHIFT 38 /* Corrected Error Count */
+#define MCI_STATUS_CEC_MASK GENMASK_ULL(52,38)
+#define MCI_STATUS_CEC(c) (((c) & MCI_STATUS_CEC_MASK) >> MCI_STATUS_CEC_SHIFT)
+#define MCI_STATUS_MSCOD(m) (((m) >> 16) & 0xffff)
+
+/* AMD-specific bits */
+#define MCI_STATUS_TCC BIT_ULL(55) /* Task context corrupt */
+#define MCI_STATUS_PADDRV BIT_ULL(54) /* Valid System Physical Address */
+#define MCI_STATUS_SYNDV BIT_ULL(53) /* synd reg. valid */
+#define MCI_STATUS_DEFERRED BIT_ULL(44) /* uncorrected error, deferred exception */
+#define MCI_STATUS_POISON BIT_ULL(43) /* access poisonous data */
+#define MCI_STATUS_SCRUB BIT_ULL(40) /* Error detected during scrub operation */
+
+/*
+ * McaX field if set indicates a given bank supports MCA extensions:
+ * - Deferred error interrupt type is specifiable by bank.
+ * - MCx_MISC0[BlkPtr] field indicates presence of extended MISC registers,
+ *   but should not be used to determine MSR numbers.
+ * - TCC bit is present in MCx_STATUS.
+ */
+#define MCI_CONFIG_MCAX 0x1
+#define MCI_CONFIG_FRUTEXT BIT_ULL(9)
+#define MCI_CONFIG_PADDRV BIT_ULL(11)
+#define MCI_IPID_MCATYPE 0xFFFF0000
+#define MCI_IPID_HWID 0xFFF
+
+/*
+ * Note that the full MCACOD field of IA32_MCi_STATUS MSR is
+ * bits 15:0. But bit 12 is the 'F' bit, defined for corrected
+ * errors to indicate that errors are being filtered by hardware.
+ * We should mask out bit 12 when looking for specific signatures
+ * of uncorrected errors - so the F bit is deliberately skipped
+ * in this #define.
+ */
+#define MCACOD 0xefff /* MCA Error Code */
+
+/* Architecturally defined codes from SDM Vol. 3B Chapter 15 */
+#define MCACOD_SCRUB 0x00C0 /* 0xC0-0xCF Memory Scrubbing */
+#define MCACOD_SCRUBMSK 0xeff0 /* Skip bit 12 ('F' bit) */
+#define MCACOD_L3WB 0x017A /* L3 Explicit Writeback */
+#define MCACOD_DATA 0x0134 /* Data Load */
+#define MCACOD_INSTR 0x0150 /* Instruction Fetch */
+
+/* MCi_MISC register defines */
+#define MCI_MISC_ADDR_LSB(m) ((m) & 0x3f)
+#define MCI_MISC_ADDR_MODE(m) (((m) >> 6) & 7)
+#define MCI_MISC_ADDR_SEGOFF 0 /* segment offset */
+#define MCI_MISC_ADDR_LINEAR 1 /* linear address */
+#define MCI_MISC_ADDR_PHYS 2 /* physical address */
+#define MCI_MISC_ADDR_MEM 3 /* memory address */
+#define MCI_MISC_ADDR_GENERIC 7 /* generic */
+
+/* MCi_ADDR register defines */
+#define MCI_ADDR_PHYSADDR GENMASK_ULL(boot_cpu_data.x86_phys_bits - 1, 0)
+
+/* CTL2 register defines */
+#define MCI_CTL2_CMCI_EN BIT_ULL(30)
+#define MCI_CTL2_CMCI_THRESHOLD_MASK 0x7fffULL
+
+#define MCJ_CTX_MASK 3
+#define MCJ_CTX(flags) ((flags) & MCJ_CTX_MASK)
+#define MCJ_CTX_RANDOM 0 /* inject context: random */
+#define MCJ_CTX_PROCESS 0x1 /* inject context: process */
+#define MCJ_CTX_IRQ 0x2 /* inject context: IRQ */
+#define MCJ_NMI_BROADCAST 0x4 /* do NMI broadcasting */
+#define MCJ_EXCEPTION 0x8 /* raise as exception */
+#define MCJ_IRQ_BROADCAST 0x10 /* do IRQ broadcasting */
+
+#define MCE_OVERFLOW 0 /* bit 0 in flags means overflow */
+
+#define MCE_LOG_MIN_LEN 32U
+#define MCE_LOG_SIGNATURE "MACHINECHECK"
+
+/* AMD Scalable MCA */
+#define MSR_AMD64_SMCA_MC0_CTL 0xc0002000
+#define MSR_AMD64_SMCA_MC0_STATUS 0xc0002001
+#define MSR_AMD64_SMCA_MC0_ADDR 0xc0002002
+#define MSR_AMD64_SMCA_MC0_MISC0 0xc0002003
+#define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004
+#define MSR_AMD64_SMCA_MC0_IPID 0xc0002005
+#define MSR_AMD64_SMCA_MC0_SYND 0xc0002006
+#define MSR_AMD64_SMCA_MC0_DESTAT 0xc0002008
+#define MSR_AMD64_SMCA_MC0_DEADDR 0xc0002009
+#define MSR_AMD64_SMCA_MC0_MISC1 0xc000200a
+/* Registers MISC2 to MISC4 are at offsets B to D. */
+#define MSR_AMD64_SMCA_MC0_SYND1 0xc000200e
+#define MSR_AMD64_SMCA_MC0_SYND2 0xc000200f
+#define MSR_AMD64_SMCA_MCx_CTL(x) (MSR_AMD64_SMCA_MC0_CTL + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_STATUS(x) (MSR_AMD64_SMCA_MC0_STATUS + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_ADDR(x) (MSR_AMD64_SMCA_MC0_ADDR + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_MISC(x) (MSR_AMD64_SMCA_MC0_MISC0 + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_IPID(x) (MSR_AMD64_SMCA_MC0_IPID + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_SYND(x) (MSR_AMD64_SMCA_MC0_SYND + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_DESTAT(x) (MSR_AMD64_SMCA_MC0_DESTAT + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_DEADDR(x) (MSR_AMD64_SMCA_MC0_DEADDR + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_MISCy(x, y) ((MSR_AMD64_SMCA_MC0_MISC1 + y) + (0x10*(x)))
+#define MSR_AMD64_SMCA_MCx_SYND1(x) (MSR_AMD64_SMCA_MC0_SYND1 + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_SYND2(x) (MSR_AMD64_SMCA_MC0_SYND2 + 0x10*(x))
+
+#define XEC(x, mask) (((x) >> 16) & mask)
+
+/* mce.kflags flag bits for logging etc. */
+#define MCE_HANDLED_CEC BIT_ULL(0)
+#define MCE_HANDLED_UC BIT_ULL(1)
+#define MCE_HANDLED_EXTLOG BIT_ULL(2)
+#define MCE_HANDLED_NFIT BIT_ULL(3)
+#define MCE_HANDLED_EDAC BIT_ULL(4)
+#define MCE_HANDLED_MCELOG BIT_ULL(5)
+
+/*
+ * Indicates an MCE which has happened in kernel space but from
+ * which the kernel can recover simply by executing fixup_exception()
+ * so that an error is returned to the caller of the function that
+ * hit the machine check.
+ */
+#define MCE_IN_KERNEL_RECOV BIT_ULL(6)
+
+/*
+ * Indicates an MCE that happened in kernel space while copying data
+ * from user. In this case fixup_exception() gets the kernel to the
+ * error exit for the copy function. Machine check handler can then
+ * treat it like a fault taken in user mode.
+ */
+#define MCE_IN_KERNEL_COPYIN BIT_ULL(7)
+
+/*
+ * Indicates that handler should check and clear Deferred error registers
+ * rather than common ones.
+ */
+#define MCE_CHECK_DFR_REGS BIT_ULL(8)
/*
* This structure contains all data related to the MCE log. Also
@@ -67,152 +179,220 @@ struct mce {
* debugging tools. Each entry is only valid when its finished flag
* is set.
*/
-
-#define MCE_LOG_LEN 32
-
-struct mce_log {
+struct mce_log_buffer {
char signature[12]; /* "MACHINECHECK" */
- unsigned len; /* = MCE_LOG_LEN */
+	unsigned len;	    /* = elements in .entry[] */
unsigned next;
unsigned flags;
unsigned recordlen; /* length of struct mce */
- struct mce entry[MCE_LOG_LEN];
+ struct mce entry[];
};
-#define MCE_OVERFLOW 0 /* bit 0 in flags means overflow */
-
-#define MCE_LOG_SIGNATURE "MACHINECHECK"
-
-#define MCE_GET_RECORD_LEN _IOR('M', 1, int)
-#define MCE_GET_LOG_LEN _IOR('M', 2, int)
-#define MCE_GETCLEAR_FLAGS _IOR('M', 3, int)
+/* Highest last */
+enum mce_notifier_prios {
+ MCE_PRIO_LOWEST,
+ MCE_PRIO_MCELOG,
+ MCE_PRIO_EDAC,
+ MCE_PRIO_NFIT,
+ MCE_PRIO_EXTLOG,
+ MCE_PRIO_UC,
+ MCE_PRIO_EARLY,
+ MCE_PRIO_CEC,
+ MCE_PRIO_HIGHEST = MCE_PRIO_CEC
+};
-/* Software defined banks */
-#define MCE_EXTENDED_BANK 128
-#define MCE_THERMAL_BANK MCE_EXTENDED_BANK + 0
+/**
+ * struct mce_hw_err - Hardware Error Record.
+ * @m: Machine Check record.
+ * @vendor: Vendor-specific error information.
+ *
+ * Vendor-specific fields should not be added to struct mce. Instead, vendors
+ * should export their vendor-specific data through their structure in the
+ * vendor union below.
+ *
+ * AMD's vendor data is parsed by error decoding tools for supplemental error
+ * information. Thus, current offsets of existing fields must be maintained.
+ * Only add new fields at the end of AMD's vendor structure.
+ */
+struct mce_hw_err {
+ struct mce m;
+
+ union vendor_info {
+ struct {
+ u64 synd1; /* MCA_SYND1 MSR */
+ u64 synd2; /* MCA_SYND2 MSR */
+ } amd;
+ } vendor;
+};
-#define K8_MCE_THRESHOLD_BASE (MCE_EXTENDED_BANK + 1) /* MCE_AMD */
-#define K8_MCE_THRESHOLD_BANK_0 (MCE_THRESHOLD_BASE + 0 * 9)
-#define K8_MCE_THRESHOLD_BANK_1 (MCE_THRESHOLD_BASE + 1 * 9)
-#define K8_MCE_THRESHOLD_BANK_2 (MCE_THRESHOLD_BASE + 2 * 9)
-#define K8_MCE_THRESHOLD_BANK_3 (MCE_THRESHOLD_BASE + 3 * 9)
-#define K8_MCE_THRESHOLD_BANK_4 (MCE_THRESHOLD_BASE + 4 * 9)
-#define K8_MCE_THRESHOLD_BANK_5 (MCE_THRESHOLD_BASE + 5 * 9)
-#define K8_MCE_THRESHOLD_DRAM_ECC (MCE_THRESHOLD_BANK_4 + 0)
+#define to_mce_hw_err(mce) container_of(mce, struct mce_hw_err, m)
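
As a short illustrative accessor for the layout above: a consumer that only holds the struct mce pointer handed to it (for example from the decode chain) can use to_mce_hw_err() to reach the vendor union. The helper name is made up for illustration and assumes an AMD/SMCA system where the syndromes are populated.

static void example_read_amd_syndromes(struct mce *m, u64 *synd1, u64 *synd2)
{
	struct mce_hw_err *err = to_mce_hw_err(m);

	*synd1 = err->vendor.amd.synd1;	/* MCA_SYND1 */
	*synd2 = err->vendor.amd.synd2;	/* MCA_SYND2 */
}
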
-#ifdef __KERNEL__
+struct notifier_block;
+extern void mce_register_decode_chain(struct notifier_block *nb);
+extern void mce_unregister_decode_chain(struct notifier_block *nb);
#include <linux/percpu.h>
-#include <linux/init.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
-extern int mce_disabled;
extern int mce_p5_enabled;
-#ifdef CONFIG_X86_MCE
-void mcheck_init(struct cpuinfo_x86 *c);
+#ifdef CONFIG_ARCH_HAS_COPY_MC
+extern void enable_copy_mc_fragile(void);
+unsigned long __must_check copy_mc_fragile(void *dst, const void *src, unsigned cnt);
#else
-static inline void mcheck_init(struct cpuinfo_x86 *c) {}
+static inline void enable_copy_mc_fragile(void)
+{
+}
#endif
-#ifdef CONFIG_X86_OLD_MCE
-extern int nr_mce_banks;
-void amd_mcheck_init(struct cpuinfo_x86 *c);
-void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
-void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
-#endif
+struct cper_ia_proc_ctx;
-#ifdef CONFIG_X86_ANCIENT_MCE
-void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
-void winchip_mcheck_init(struct cpuinfo_x86 *c);
-static inline void enable_p5_mce(void) { mce_p5_enabled = 1; }
+#ifdef CONFIG_X86_MCE
+int mcheck_init(void);
+void mca_bsp_init(struct cpuinfo_x86 *c);
+void mcheck_cpu_init(struct cpuinfo_x86 *c);
+void mcheck_cpu_clear(struct cpuinfo_x86 *c);
+int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info,
+ u64 lapic_id);
#else
-static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {}
-static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {}
-static inline void enable_p5_mce(void) {}
+static inline int mcheck_init(void) { return 0; }
+static inline void mca_bsp_init(struct cpuinfo_x86 *c) {}
+static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {}
+static inline void mcheck_cpu_clear(struct cpuinfo_x86 *c) {}
+static inline int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info,
+ u64 lapic_id) { return -EINVAL; }
#endif
-void mce_setup(struct mce *m);
-void mce_log(struct mce *m);
-DECLARE_PER_CPU(struct sys_device, mce_dev);
+void mce_prep_record(struct mce_hw_err *err);
+void mce_log(struct mce_hw_err *err);
+DECLARE_PER_CPU(struct device *, mce_device);
-/*
- * To support more than 128 would need to escape the predefined
- * Linux defined extended banks first.
- */
-#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1)
+/* Maximum number of MCA banks per CPU. */
+#define MAX_NR_BANKS 64
#ifdef CONFIG_X86_MCE_INTEL
-extern int mce_cmci_disabled;
-extern int mce_ignore_ce;
void mce_intel_feature_init(struct cpuinfo_x86 *c);
+void mce_intel_feature_clear(struct cpuinfo_x86 *c);
void cmci_clear(void);
void cmci_reenable(void);
-void cmci_rediscover(int dying);
+void cmci_rediscover(void);
void cmci_recheck(void);
#else
static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { }
+static inline void mce_intel_feature_clear(struct cpuinfo_x86 *c) { }
static inline void cmci_clear(void) {}
static inline void cmci_reenable(void) {}
-static inline void cmci_rediscover(int dying) {}
+static inline void cmci_rediscover(void) {}
static inline void cmci_recheck(void) {}
#endif
-#ifdef CONFIG_X86_MCE_AMD
-void mce_amd_feature_init(struct cpuinfo_x86 *c);
-#else
-static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
-#endif
-
-int mce_available(struct cpuinfo_x86 *c);
+bool mce_available(struct cpuinfo_x86 *c);
+bool mce_is_memory_error(struct mce *m);
+bool mce_is_correctable(struct mce *m);
+bool mce_usable_address(struct mce *m);
DECLARE_PER_CPU(unsigned, mce_exception_count);
DECLARE_PER_CPU(unsigned, mce_poll_count);
-extern atomic_t mce_entry;
-
typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS);
DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);
enum mcp_flags {
- MCP_TIMESTAMP = (1 << 0), /* log time stamp */
- MCP_UC = (1 << 1), /* log uncorrected errors */
- MCP_DONTLOG = (1 << 2), /* only clear, don't log */
+ MCP_TIMESTAMP = BIT(0), /* log time stamp */
+ MCP_UC = BIT(1), /* log uncorrected errors */
+ MCP_QUEUE_LOG = BIT(2), /* only queue to genpool */
};
-void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
-int mce_notify_irq(void);
-void mce_notify_process(void);
+void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
DECLARE_PER_CPU(struct mce, injectm);
-extern struct file_operations mce_chrdev_ops;
+
+/* Disable CMCI/polling for MCA bank claimed by firmware */
+extern void mce_disable_bank(int bank);
+
+#ifdef CONFIG_X86_MCE_THRESHOLD
+void mce_save_apei_thr_limit(u32 thr_limit);
+#else
+static inline void mce_save_apei_thr_limit(u32 thr_limit) { }
+#endif /* CONFIG_X86_MCE_THRESHOLD */
/*
* Exception handler
*/
-
-/* Call the installed machine check handler for this CPU setup. */
-extern void (*machine_check_vector)(struct pt_regs *, long error_code);
-void do_machine_check(struct pt_regs *, long);
+void do_machine_check(struct pt_regs *pt_regs);
/*
* Threshold handler
*/
-
extern void (*mce_threshold_vector)(void);
-extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
+
+/* Deferred error interrupt handler */
+extern void (*deferred_error_int_vector)(void);
+
+/*
+ * Used by APEI to report memory error via /dev/mcelog
+ */
+
+struct cper_sec_mem_err;
+extern void apei_mce_report_mem_error(int corrected,
+ struct cper_sec_mem_err *mem_err);
/*
- * Thermal handler
+ * Enumerate new IP types and HWID values in AMD processors which support
+ * Scalable MCA.
*/
+#ifdef CONFIG_X86_MCE_AMD
-void intel_init_thermal(struct cpuinfo_x86 *c);
+/* These may be used by multiple smca_hwid_mcatypes */
+enum smca_bank_types {
+ SMCA_LS = 0, /* Load Store */
+ SMCA_LS_V2,
+ SMCA_IF, /* Instruction Fetch */
+ SMCA_L2_CACHE, /* L2 Cache */
+ SMCA_DE, /* Decoder Unit */
+ SMCA_RESERVED, /* Reserved */
+ SMCA_EX, /* Execution Unit */
+ SMCA_FP, /* Floating Point */
+ SMCA_L3_CACHE, /* L3 Cache */
+ SMCA_CS, /* Coherent Slave */
+ SMCA_CS_V2,
+ SMCA_PIE, /* Power, Interrupts, etc. */
+ SMCA_UMC, /* Unified Memory Controller */
+ SMCA_UMC_V2,
+ SMCA_MA_LLC, /* Memory Attached Last Level Cache */
+ SMCA_PB, /* Parameter Block */
+ SMCA_PSP, /* Platform Security Processor */
+ SMCA_PSP_V2,
+ SMCA_SMU, /* System Management Unit */
+ SMCA_SMU_V2,
+ SMCA_MP5, /* Microprocessor 5 Unit */
+ SMCA_MPDMA, /* MPDMA Unit */
+ SMCA_NBIO, /* Northbridge IO Unit */
+ SMCA_PCIE, /* PCI Express Unit */
+ SMCA_PCIE_V2,
+ SMCA_XGMI_PCS, /* xGMI PCS Unit */
+ SMCA_NBIF, /* NBIF Unit */
+ SMCA_SHUB, /* System HUB Unit */
+ SMCA_SATA, /* SATA Unit */
+ SMCA_USB, /* USB Unit */
+ SMCA_USR_DP, /* Ultra Short Reach Data Plane Controller */
+ SMCA_USR_CP, /* Ultra Short Reach Control Plane Controller */
+ SMCA_GMI_PCS, /* GMI PCS Unit */
+ SMCA_XGMI_PHY, /* xGMI PHY Unit */
+ SMCA_WAFL_PHY, /* WAFL PHY Unit */
+ SMCA_GMI_PHY, /* GMI PHY Unit */
+ N_SMCA_BANK_TYPES
+};
-#ifdef CONFIG_X86_NEW_MCE
-void mce_log_therm_throt_event(__u64 status);
+extern bool amd_mce_is_memory_error(struct mce *m);
+
+void mce_amd_feature_init(struct cpuinfo_x86 *c);
+enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank);
#else
-static inline void mce_log_therm_throt_event(__u64 status) {}
+static inline bool amd_mce_is_memory_error(struct mce *m) { return false; };
+static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
#endif
-#endif /* __KERNEL__ */
+unsigned long copy_mc_fragile_handle_tail(char *to, char *from, unsigned len);
+
#endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
new file mode 100644
index 000000000000..ea6494628cb0
--- /dev/null
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -0,0 +1,119 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ */
+
+#ifndef __X86_MEM_ENCRYPT_H__
+#define __X86_MEM_ENCRYPT_H__
+
+#ifndef __ASSEMBLER__
+
+#include <linux/init.h>
+#include <linux/cc_platform.h>
+
+#include <asm/asm.h>
+struct boot_params;
+
+#ifdef CONFIG_X86_MEM_ENCRYPT
+void __init mem_encrypt_init(void);
+void __init mem_encrypt_setup_arch(void);
+#else
+static inline void mem_encrypt_init(void) { }
+static inline void __init mem_encrypt_setup_arch(void) { }
+#endif
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+
+extern u64 sme_me_mask;
+extern u64 sev_status;
+
+void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr,
+ unsigned long decrypted_kernel_vaddr,
+ unsigned long kernel_len,
+ unsigned long encryption_wa,
+ unsigned long encryption_pgd);
+
+void __init sme_early_encrypt(resource_size_t paddr,
+ unsigned long size);
+void __init sme_early_decrypt(resource_size_t paddr,
+ unsigned long size);
+
+void __init sme_map_bootdata(char *real_mode_data);
+void __init sme_unmap_bootdata(char *real_mode_data);
+
+void __init sme_early_init(void);
+
+void sme_encrypt_kernel(struct boot_params *bp);
+void sme_enable(struct boot_params *bp);
+
+int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size);
+int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size);
+void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr,
+ unsigned long size, bool enc);
+
+void __init mem_encrypt_free_decrypted_mem(void);
+
+void __init sev_es_init_vc_handling(void);
+
+static inline u64 sme_get_me_mask(void)
+{
+ return sme_me_mask;
+}
+
+#define __bss_decrypted __section(".bss..decrypted")
+
+#else /* !CONFIG_AMD_MEM_ENCRYPT */
+
+#define sme_me_mask 0ULL
+#define sev_status 0ULL
+
+static inline void __init sme_early_encrypt(resource_size_t paddr,
+ unsigned long size) { }
+static inline void __init sme_early_decrypt(resource_size_t paddr,
+ unsigned long size) { }
+
+static inline void __init sme_map_bootdata(char *real_mode_data) { }
+static inline void __init sme_unmap_bootdata(char *real_mode_data) { }
+
+static inline void __init sme_early_init(void) { }
+
+static inline void sme_encrypt_kernel(struct boot_params *bp) { }
+static inline void sme_enable(struct boot_params *bp) { }
+
+static inline void sev_es_init_vc_handling(void) { }
+
+static inline int __init
+early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0; }
+static inline int __init
+early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; }
+static inline void __init
+early_set_mem_enc_dec_hypercall(unsigned long vaddr, unsigned long size, bool enc) {}
+
+static inline void mem_encrypt_free_decrypted_mem(void) { }
+
+static inline u64 sme_get_me_mask(void) { return 0; }
+
+#define __bss_decrypted
+
+#endif /* CONFIG_AMD_MEM_ENCRYPT */
+
+void add_encrypt_protection_map(void);
+
+/*
+ * The __sme_pa() and __sme_pa_nodebug() macros are meant for use when
+ * writing to or comparing values from the cr3 register. Having the
+ * encryption mask set in cr3 enables the PGD entry to be encrypted and
+ * avoid special case handling of PGD allocations.
+ */
+#define __sme_pa(x) (__pa(x) | sme_me_mask)
+#define __sme_pa_nodebug(x) (__pa_nodebug(x) | sme_me_mask)
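
A usage sketch of the comment above (illustrative, not from the patch; write_cr3() is assumed from <asm/special_insns.h>): loading a top-level page table goes through __sme_pa() so the encryption mask ends up in CR3.

static void example_load_pgd(pgd_t *pgd)
{
	/* Physical address of the PGD with sme_me_mask OR-ed in. */
	write_cr3(__sme_pa(pgd));
}
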
+
+extern char __start_bss_decrypted[], __end_bss_decrypted[], __start_bss_decrypted_unused[];
+
+#endif /* __ASSEMBLER__ */
+
+#endif /* __X86_MEM_ENCRYPT_H__ */
diff --git a/arch/x86/include/asm/memtype.h b/arch/x86/include/asm/memtype.h
new file mode 100644
index 000000000000..113b2fa51849
--- /dev/null
+++ b/arch/x86/include/asm/memtype.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_MEMTYPE_H
+#define _ASM_X86_MEMTYPE_H
+
+#include <linux/types.h>
+#include <asm/pgtable_types.h>
+
+extern bool pat_enabled(void);
+extern void pat_bp_init(void);
+extern void pat_cpu_init(void);
+
+extern int memtype_reserve(u64 start, u64 end,
+ enum page_cache_mode req_pcm, enum page_cache_mode *ret_pcm);
+extern int memtype_free(u64 start, u64 end);
+
+extern int memtype_kernel_map_sync(u64 base, unsigned long size,
+ enum page_cache_mode pcm);
+
+extern int memtype_reserve_io(resource_size_t start, resource_size_t end,
+ enum page_cache_mode *pcm);
+
+extern void memtype_free_io(resource_size_t start, resource_size_t end);
+
+extern bool pat_pfn_immune_to_uc_mtrr(unsigned long pfn);
+
+bool x86_has_pat_wp(void);
+enum page_cache_mode pgprot2cachemode(pgprot_t pgprot);
+
+#endif /* _ASM_X86_MEMTYPE_H */
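
A hedged sketch of the reservation API declared above: request a write-combining attribute for a physical range, observe the cache mode actually granted, and release the reservation again. The helper name and range handling are illustrative assumptions.

static int example_reserve_wc(u64 start, u64 size)
{
	enum page_cache_mode pcm = _PAGE_CACHE_MODE_WC;
	int ret;

	ret = memtype_reserve(start, start + size, _PAGE_CACHE_MODE_WC, &pcm);
	if (ret)
		return ret;

	/* pcm now holds the mode PAT actually granted (may differ from WC). */

	memtype_free(start, start + size);
	return 0;
}
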
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index ef51b501e22a..8b41f26f003b 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -1,58 +1,93 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_MICROCODE_H
#define _ASM_X86_MICROCODE_H
+#include <asm/msr.h>
+
struct cpu_signature {
unsigned int sig;
unsigned int pf;
unsigned int rev;
};
-struct device;
-
-enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND };
-
-struct microcode_ops {
- enum ucode_state (*request_microcode_user) (int cpu,
- const void __user *buf, size_t size);
+struct ucode_cpu_info {
+ struct cpu_signature cpu_sig;
+ void *mc;
+};
- enum ucode_state (*request_microcode_fw) (int cpu,
- struct device *device);
+#ifdef CONFIG_MICROCODE
+void load_ucode_bsp(void);
+void load_ucode_ap(void);
+void microcode_bsp_resume(void);
+bool __init microcode_loader_disabled(void);
+#else
+static inline void load_ucode_bsp(void) { }
+static inline void load_ucode_ap(void) { }
+static inline void microcode_bsp_resume(void) { }
+static inline bool __init microcode_loader_disabled(void) { return false; }
+#endif
- void (*microcode_fini_cpu) (int cpu);
+extern unsigned long initrd_start_early;
- /*
- * The generic 'microcode_core' part guarantees that
- * the callbacks below run on a target cpu when they
- * are being called.
- * See also the "Synchronization" section in microcode_core.c.
- */
- int (*apply_microcode) (int cpu);
- int (*collect_cpu_info) (int cpu, struct cpu_signature *csig);
+#ifdef CONFIG_CPU_SUP_INTEL
+/* Intel specific microcode defines. Public for IFS */
+struct microcode_header_intel {
+ unsigned int hdrver;
+ unsigned int rev;
+ unsigned int date;
+ unsigned int sig;
+ unsigned int cksum;
+ unsigned int ldrver;
+ unsigned int pf;
+ unsigned int datasize;
+ unsigned int totalsize;
+ unsigned int metasize;
+ unsigned int min_req_ver;
+ unsigned int reserved;
};
-struct ucode_cpu_info {
- struct cpu_signature cpu_sig;
- int valid;
- void *mc;
+struct microcode_intel {
+ struct microcode_header_intel hdr;
+ unsigned int bits[];
};
-extern struct ucode_cpu_info ucode_cpu_info[];
-#ifdef CONFIG_MICROCODE_INTEL
-extern struct microcode_ops * __init init_intel_microcode(void);
-#else
-static inline struct microcode_ops * __init init_intel_microcode(void)
+#define DEFAULT_UCODE_DATASIZE (2000)
+#define MC_HEADER_SIZE (sizeof(struct microcode_header_intel))
+#define MC_HEADER_TYPE_MICROCODE 1
+#define MC_HEADER_TYPE_IFS 2
+
+static inline int intel_microcode_get_datasize(struct microcode_header_intel *hdr)
{
- return NULL;
+ return hdr->datasize ? : DEFAULT_UCODE_DATASIZE;
}
-#endif /* CONFIG_MICROCODE_INTEL */
-#ifdef CONFIG_MICROCODE_AMD
-extern struct microcode_ops * __init init_amd_microcode(void);
-#else
-static inline struct microcode_ops * __init init_amd_microcode(void)
+static inline u32 intel_get_microcode_revision(void)
+{
+ u32 rev, dummy;
+
+ native_wrmsrq(MSR_IA32_UCODE_REV, 0);
+
+ /* As documented in the SDM: Do a CPUID 1 here */
+ native_cpuid_eax(1);
+
+ /* get the current revision from MSR 0x8B */
+ native_rdmsr(MSR_IA32_UCODE_REV, dummy, rev);
+
+ return rev;
+}
+#endif /* !CONFIG_CPU_SUP_INTEL */
+
+bool microcode_nmi_handler(void);
+void microcode_offline_nmi_handler(void);
+
+#ifdef CONFIG_MICROCODE_LATE_LOADING
+DECLARE_STATIC_KEY_FALSE(microcode_nmi_handler_enable);
+static __always_inline bool microcode_nmi_handler_enabled(void)
{
- return NULL;
+ return static_branch_unlikely(&microcode_nmi_handler_enable);
}
+#else
+static __always_inline bool microcode_nmi_handler_enabled(void) { return false; }
#endif
#endif /* _ASM_X86_MICROCODE_H */
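
A small illustration (not part of the patch) of the Intel helpers above: the offset of the optional extended signature table in a microcode blob is the fixed header size plus the data size, with intel_microcode_get_datasize() applying the documented 2000-byte default.

static unsigned int example_ext_sig_offset(struct microcode_header_intel *hdr)
{
	return MC_HEADER_SIZE + intel_microcode_get_datasize(hdr);
}
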
diff --git a/arch/x86/include/asm/misc.h b/arch/x86/include/asm/misc.h
new file mode 100644
index 000000000000..bb049cca3729
--- /dev/null
+++ b/arch/x86/include/asm/misc.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_MISC_H
+#define _ASM_X86_MISC_H
+
+int num_digits(int val);
+
+#endif /* _ASM_X86_MISC_H */
diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/asm/mman.h
index 751af2550ed9..12b820259b9f 100644
--- a/arch/x86/include/asm/mman.h
+++ b/arch/x86/include/asm/mman.h
@@ -1,20 +1,15 @@
-#ifndef _ASM_X86_MMAN_H
-#define _ASM_X86_MMAN_H
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_MMAN_H__
+#define __ASM_MMAN_H__
-#include <asm-generic/mman-common.h>
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+#define arch_calc_vm_prot_bits(prot, key) ( \
+ ((key) & 0x1 ? VM_PKEY_BIT0 : 0) | \
+ ((key) & 0x2 ? VM_PKEY_BIT1 : 0) | \
+ ((key) & 0x4 ? VM_PKEY_BIT2 : 0) | \
+ ((key) & 0x8 ? VM_PKEY_BIT3 : 0))
+#endif
-#define MAP_32BIT 0x40 /* only give out 32bit addresses */
+#include <uapi/asm/mman.h>
-#define MAP_GROWSDOWN 0x0100 /* stack-like segment */
-#define MAP_DENYWRITE 0x0800 /* ETXTBSY */
-#define MAP_EXECUTABLE 0x1000 /* mark it as an executable */
-#define MAP_LOCKED 0x2000 /* pages are locked */
-#define MAP_NORESERVE 0x4000 /* don't check for reservations */
-#define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */
-#define MAP_NONBLOCK 0x10000 /* do not block on IO */
-#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */
-
-#define MCL_CURRENT 1 /* lock all current mappings */
-#define MCL_FUTURE 2 /* lock all future mappings */
-
-#endif /* _ASM_X86_MMAN_H */
+#endif /* __ASM_MMAN_H__ */
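
A worked example of the pkey encoding above (illustrative, not from the patch): protection key 5 (binary 0101) contributes VM_PKEY_BIT0 | VM_PKEY_BIT2 to the VMA flags; the generic calc_vm_prot_bits() is what normally invokes this arch hook.

static unsigned long example_pkey5_vm_flags(void)
{
	return arch_calc_vm_prot_bits(PROT_READ, 5);
}
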
diff --git a/arch/x86/include/asm/mmconfig.h b/arch/x86/include/asm/mmconfig.h
index 9b119da1d105..976486447e10 100644
--- a/arch/x86/include/asm/mmconfig.h
+++ b/arch/x86/include/asm/mmconfig.h
@@ -1,9 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_MMCONFIG_H
#define _ASM_X86_MMCONFIG_H
#ifdef CONFIG_PCI_MMCONFIG
-extern void __cpuinit fam10h_check_enable_mmcfg(void);
-extern void __cpuinit check_enable_amd_mmconf_dmi(void);
+extern void fam10h_check_enable_mmcfg(void);
+extern void check_enable_amd_mmconf_dmi(void);
#else
static inline void fam10h_check_enable_mmcfg(void) { }
static inline void check_enable_amd_mmconf_dmi(void) { }
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 80a1dee5bea5..0fe9c569d171 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -1,26 +1,95 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_MMU_H
#define _ASM_X86_MMU_H
#include <linux/spinlock.h>
+#include <linux/rwsem.h>
#include <linux/mutex.h>
+#include <linux/atomic.h>
+#include <linux/bits.h>
+
+/* Uprobes on this MM assume 32-bit code */
+#define MM_CONTEXT_UPROBE_IA32 0
+/* vsyscall page is accessible on this MM */
+#define MM_CONTEXT_HAS_VSYSCALL 1
+/* Do not allow changing LAM mode */
+#define MM_CONTEXT_LOCK_LAM 2
+/* Allow LAM and SVA coexisting */
+#define MM_CONTEXT_FORCE_TAGGED_SVA 3
+/* Tracks mm_cpumask */
+#define MM_CONTEXT_NOTRACK 4
/*
- * The x86 doesn't have a mmu context, but
- * we put the segment information here.
+ * x86 has arch-specific MMU state beyond what lives in mm_struct.
*/
typedef struct {
- void *ldt;
- int size;
+ /*
+ * ctx_id uniquely identifies this mm_struct. A ctx_id will never
+ * be reused, and zero is not a valid ctx_id.
+ */
+ u64 ctx_id;
+
+ /*
+ * Any code that needs to do any sort of TLB flushing for this
+ * mm will first make its changes to the page tables, then
+ * increment tlb_gen, then flush. This lets the low-level
+ * flushing code keep track of what needs flushing.
+ *
+ * This is not used on Xen PV.
+ */
+ atomic64_t tlb_gen;
+
+ unsigned long next_trim_cpumask;
+
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ struct rw_semaphore ldt_usr_sem;
+ struct ldt_struct *ldt;
+#endif
+
+ unsigned long flags;
+
+#ifdef CONFIG_ADDRESS_MASKING
+ /* Active LAM mode: X86_CR3_LAM_U48 or X86_CR3_LAM_U57 or 0 (disabled) */
+ unsigned long lam_cr3_mask;
+
+ /* Significant bits of the virtual address. Excludes tag bits. */
+ u64 untag_mask;
+#endif
+
struct mutex lock;
- void *vdso;
-} mm_context_t;
+ void __user *vdso; /* vdso base address */
+ const struct vdso_image *vdso_image; /* vdso image in use */
+
+ atomic_t perf_rdpmc_allowed; /* nonzero if rdpmc is allowed */
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+ /*
+ * One bit per protection key says whether userspace can
+ * use it or not. protected by mmap_lock.
+ */
+ u16 pkey_allocation_map;
+ s16 execute_only_pkey;
+#endif
-#ifdef CONFIG_SMP
-void leave_mm(int cpu);
-#else
-static inline void leave_mm(int cpu)
-{
-}
+#ifdef CONFIG_BROADCAST_TLB_FLUSH
+ /*
+ * The global ASID will be a non-zero value when the process has
+ * the same ASID across all CPUs, allowing it to make use of
+ * hardware-assisted remote TLB invalidation like AMD INVLPGB.
+ */
+ u16 global_asid;
+
+ /* The process is transitioning to a new global ASID number. */
+ bool asid_transition;
#endif
+} mm_context_t;
+
+#define INIT_MM_CONTEXT(mm) \
+ .context = { \
+ .ctx_id = 1, \
+ .lock = __MUTEX_INITIALIZER(mm.context.lock), \
+ }
+
+void leave_mm(void);
+#define leave_mm leave_mm
#endif /* _ASM_X86_MMU_H */
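
To make the tlb_gen comment above concrete, a minimal sketch (not from the patch) of the consumer side: flushing code remembers the generation it last brought a CPU up to and compares it with the mm's current tlb_gen to decide whether further flushing is needed.

static bool example_cpu_needs_flush(struct mm_struct *mm, u64 cpu_seen_gen)
{
	return cpu_seen_gen < atomic64_read(&mm->context.tlb_gen);
}
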
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index f923203dc39a..73bf3b1b44e8 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -1,92 +1,288 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_MMU_CONTEXT_H
#define _ASM_X86_MMU_CONTEXT_H
-#include <asm/desc.h>
-#include <asm/atomic.h>
-#include <asm/pgalloc.h>
+#include <linux/atomic.h>
+#include <linux/mm_types.h>
+#include <linux/pkeys.h>
+
+#include <trace/events/tlb.h>
+
#include <asm/tlbflush.h>
#include <asm/paravirt.h>
-#ifndef CONFIG_PARAVIRT
-#include <asm-generic/mm_hooks.h>
+#include <asm/debugreg.h>
+#include <asm/gsseg.h>
+#include <asm/desc.h>
-static inline void paravirt_activate_mm(struct mm_struct *prev,
- struct mm_struct *next)
-{
-}
-#endif /* !CONFIG_PARAVIRT */
+extern atomic64_t last_mm_ctx_id;
+#ifdef CONFIG_PERF_EVENTS
+DECLARE_STATIC_KEY_FALSE(rdpmc_never_available_key);
+DECLARE_STATIC_KEY_FALSE(rdpmc_always_available_key);
+void cr4_update_pce(void *ignored);
+#endif
+
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
/*
- * Used for LDT copy/destruction.
+ * ldt_structs can be allocated, used, and freed, but they are never
+ * modified while live.
*/
-int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
-void destroy_context(struct mm_struct *mm);
+struct ldt_struct {
+ /*
+ * Xen requires page-aligned LDTs with special permissions. This is
+ * needed to prevent us from installing evil descriptors such as
+ * call gates. On native, we could merge the ldt_struct and LDT
+ * allocations, but it's not worth trying to optimize.
+ */
+ struct desc_struct *entries;
+ unsigned int nr_entries;
+ /*
+ * If PTI is in use, then the entries array is not mapped while we're
+	 * in user mode. The whole array will be aliased at the address
+ * given by ldt_slot_va(slot). We use two slots so that we can allocate
+ * and map, and enable a new LDT without invalidating the mapping
+ * of an older, still-in-use LDT.
+ *
+ * slot will be -1 if this LDT doesn't have an alias mapping.
+ */
+ int slot;
+};
-static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
+/*
+ * Used for LDT copy/destruction.
+ */
+static inline void init_new_context_ldt(struct mm_struct *mm)
{
-#ifdef CONFIG_SMP
- if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
- percpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
+ mm->context.ldt = NULL;
+ init_rwsem(&mm->context.ldt_usr_sem);
+}
+int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
+void destroy_context_ldt(struct mm_struct *mm);
+void ldt_arch_exit_mmap(struct mm_struct *mm);
+#else /* CONFIG_MODIFY_LDT_SYSCALL */
+static inline void init_new_context_ldt(struct mm_struct *mm) { }
+static inline int ldt_dup_context(struct mm_struct *oldmm,
+ struct mm_struct *mm)
+{
+ return 0;
+}
+static inline void destroy_context_ldt(struct mm_struct *mm) { }
+static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { }
#endif
+
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+extern void load_mm_ldt(struct mm_struct *mm);
+extern void switch_ldt(struct mm_struct *prev, struct mm_struct *next);
+#else
+static inline void load_mm_ldt(struct mm_struct *mm)
+{
+ clear_LDT();
}
+static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
+{
+ DEBUG_LOCKS_WARN_ON(preemptible());
+}
+#endif
-static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
- struct task_struct *tsk)
+#ifdef CONFIG_ADDRESS_MASKING
+static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm)
{
- unsigned cpu = smp_processor_id();
+ /*
+ * When switch_mm_irqs_off() is called for a kthread, it may race with
+ * LAM enablement. switch_mm_irqs_off() uses the LAM mask to do two
+ * things: populate CR3 and populate 'cpu_tlbstate.lam'. Make sure it
+ * reads a single value for both.
+ */
+ return READ_ONCE(mm->context.lam_cr3_mask);
+}
- if (likely(prev != next)) {
- /* stop flush ipis for the previous mm */
- cpu_clear(cpu, prev->cpu_vm_mask);
-#ifdef CONFIG_SMP
- percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
- percpu_write(cpu_tlbstate.active_mm, next);
+static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
+{
+ mm->context.lam_cr3_mask = oldmm->context.lam_cr3_mask;
+ mm->context.untag_mask = oldmm->context.untag_mask;
+}
+
+#define mm_untag_mask mm_untag_mask
+static inline unsigned long mm_untag_mask(struct mm_struct *mm)
+{
+ return mm->context.untag_mask;
+}
+
+static inline void mm_reset_untag_mask(struct mm_struct *mm)
+{
+ mm->context.untag_mask = -1UL;
+}
+
+#define arch_pgtable_dma_compat arch_pgtable_dma_compat
+static inline bool arch_pgtable_dma_compat(struct mm_struct *mm)
+{
+ return !mm_lam_cr3_mask(mm) ||
+ test_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &mm->context.flags);
+}
+#else
+
+static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm)
+{
+ return 0;
+}
+
+static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
+{
+}
+
+static inline void mm_reset_untag_mask(struct mm_struct *mm)
+{
+}
#endif
- cpu_set(cpu, next->cpu_vm_mask);
- /* Re-load page tables */
- load_cr3(next->pgd);
+#define enter_lazy_tlb enter_lazy_tlb
+extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
- /*
- * load the LDT, if the LDT is different:
- */
- if (unlikely(prev->context.ldt != next->context.ldt))
- load_LDT_nolock(&next->context);
- }
-#ifdef CONFIG_SMP
- else {
- percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
- BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next);
-
- if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
- /* We were in lazy tlb mode and leave_mm disabled
- * tlb flush IPI delivery. We must reload CR3
- * to make sure to use no freed page tables.
- */
- load_cr3(next->pgd);
- load_LDT_nolock(&next->context);
- }
+#define mm_init_global_asid mm_init_global_asid
+extern void mm_init_global_asid(struct mm_struct *mm);
+
+extern void mm_free_global_asid(struct mm_struct *mm);
+
+/*
+ * Init a new mm. Used on mm copies, like at fork()
+ * and on mm's that are brand-new, like at execve().
+ */
+#define init_new_context init_new_context
+static inline int init_new_context(struct task_struct *tsk,
+ struct mm_struct *mm)
+{
+ mutex_init(&mm->context.lock);
+
+ mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id);
+ atomic64_set(&mm->context.tlb_gen, 0);
+ mm->context.next_trim_cpumask = jiffies + HZ;
+
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+ if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
+ /* pkey 0 is the default and allocated implicitly */
+ mm->context.pkey_allocation_map = 0x1;
+ /* -1 means unallocated or invalid */
+ mm->context.execute_only_pkey = -1;
}
#endif
+
+ mm_init_global_asid(mm);
+ mm_reset_untag_mask(mm);
+ init_new_context_ldt(mm);
+ return 0;
}
+#define destroy_context destroy_context
+static inline void destroy_context(struct mm_struct *mm)
+{
+ destroy_context_ldt(mm);
+ mm_free_global_asid(mm);
+}
+
+extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk);
+
+extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk);
+#define switch_mm_irqs_off switch_mm_irqs_off
+
#define activate_mm(prev, next) \
do { \
- paravirt_activate_mm((prev), (next)); \
- switch_mm((prev), (next), NULL); \
+ paravirt_enter_mmap(next); \
+ switch_mm_irqs_off((prev), (next), NULL); \
} while (0);
#ifdef CONFIG_X86_32
#define deactivate_mm(tsk, mm) \
do { \
- lazy_load_gs(0); \
+ loadsegment(gs, 0); \
} while (0)
#else
#define deactivate_mm(tsk, mm) \
do { \
+ shstk_free(tsk); \
load_gs_index(0); \
loadsegment(fs, 0); \
} while (0)
#endif
+static inline void arch_dup_pkeys(struct mm_struct *oldmm,
+ struct mm_struct *mm)
+{
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+ if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
+ return;
+
+ /* Duplicate the oldmm pkey state in mm: */
+ mm->context.pkey_allocation_map = oldmm->context.pkey_allocation_map;
+ mm->context.execute_only_pkey = oldmm->context.execute_only_pkey;
+#endif
+}
+
+static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
+{
+ arch_dup_pkeys(oldmm, mm);
+ paravirt_enter_mmap(mm);
+ dup_lam(oldmm, mm);
+ return ldt_dup_context(oldmm, mm);
+}
+
+static inline void arch_exit_mmap(struct mm_struct *mm)
+{
+ paravirt_arch_exit_mmap(mm);
+ ldt_arch_exit_mmap(mm);
+}
+
+#ifdef CONFIG_X86_64
+static inline bool is_64bit_mm(struct mm_struct *mm)
+{
+ return !IS_ENABLED(CONFIG_IA32_EMULATION) ||
+ !test_bit(MM_CONTEXT_UPROBE_IA32, &mm->context.flags);
+}
+#else
+static inline bool is_64bit_mm(struct mm_struct *mm)
+{
+ return false;
+}
+#endif
+
+static inline bool is_notrack_mm(struct mm_struct *mm)
+{
+ return test_bit(MM_CONTEXT_NOTRACK, &mm->context.flags);
+}
+
+static inline void set_notrack_mm(struct mm_struct *mm)
+{
+ set_bit(MM_CONTEXT_NOTRACK, &mm->context.flags);
+}
+
+/*
+ * We only want to enforce protection keys on the current process
+ * because we effectively have no access to PKRU for other
+ * processes or any way to tell *which* PKRU in a threaded
+ * process we could use.
+ *
+ * So do not enforce things if the VMA is not from the current
+ * mm, or if we are in a kernel thread.
+ */
+static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
+ bool write, bool execute, bool foreign)
+{
+ /* pkeys never affect instruction fetches */
+ if (execute)
+ return true;
+ /* allow access if the VMA is not one from this process */
+ if (foreign || vma_is_foreign(vma))
+ return true;
+ return __pkru_allows_pkey(vma_pkey(vma), write);
+}
+
+unsigned long __get_current_cr3_fast(void);
+
+#include <asm-generic/mmu_context.h>
+
+extern struct mm_struct *use_temporary_mm(struct mm_struct *temp_mm);
+extern void unuse_temporary_mm(struct mm_struct *prev_mm);
+
#endif /* _ASM_X86_MMU_CONTEXT_H */
diff --git a/arch/x86/include/asm/mmx.h b/arch/x86/include/asm/mmx.h
deleted file mode 100644
index 5cbf3135b971..000000000000
--- a/arch/x86/include/asm/mmx.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef _ASM_X86_MMX_H
-#define _ASM_X86_MMX_H
-
-/*
- * MMX 3Dnow! helper operations
- */
-
-#include <linux/types.h>
-
-extern void *_mmx_memcpy(void *to, const void *from, size_t size);
-extern void mmx_clear_page(void *page);
-extern void mmx_copy_page(void *to, void *from);
-
-#endif /* _ASM_X86_MMX_H */
diff --git a/arch/x86/include/asm/mmzone.h b/arch/x86/include/asm/mmzone.h
deleted file mode 100644
index 64217ea16a36..000000000000
--- a/arch/x86/include/asm/mmzone.h
+++ /dev/null
@@ -1,5 +0,0 @@
-#ifdef CONFIG_X86_32
-# include "mmzone_32.h"
-#else
-# include "mmzone_64.h"
-#endif
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
deleted file mode 100644
index ede6998bd92c..000000000000
--- a/arch/x86/include/asm/mmzone_32.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Written by Pat Gaughen (gone@us.ibm.com) Mar 2002
- *
- */
-
-#ifndef _ASM_X86_MMZONE_32_H
-#define _ASM_X86_MMZONE_32_H
-
-#include <asm/smp.h>
-
-#ifdef CONFIG_NUMA
-extern struct pglist_data *node_data[];
-#define NODE_DATA(nid) (node_data[nid])
-
-#include <asm/numaq.h>
-/* summit or generic arch */
-#include <asm/srat.h>
-
-extern int get_memcfg_numa_flat(void);
-/*
- * This allows any one NUMA architecture to be compiled
- * for, and still fall back to the flat function if it
- * fails.
- */
-static inline void get_memcfg_numa(void)
-{
-
- if (get_memcfg_numaq())
- return;
- if (get_memcfg_from_srat())
- return;
- get_memcfg_numa_flat();
-}
-
-extern void resume_map_numa_kva(pgd_t *pgd);
-
-#else /* !CONFIG_NUMA */
-
-#define get_memcfg_numa get_memcfg_numa_flat
-
-static inline void resume_map_numa_kva(pgd_t *pgd) {}
-
-#endif /* CONFIG_NUMA */
-
-#ifdef CONFIG_DISCONTIGMEM
-
-/*
- * generic node memory support, the following assumptions apply:
- *
- * 1) memory comes in 64Mb contigious chunks which are either present or not
- * 2) we will not have more than 64Gb in total
- *
- * for now assume that 64Gb is max amount of RAM for whole system
- * 64Gb / 4096bytes/page = 16777216 pages
- */
-#define MAX_NR_PAGES 16777216
-#define MAX_ELEMENTS 1024
-#define PAGES_PER_ELEMENT (MAX_NR_PAGES/MAX_ELEMENTS)
-
-extern s8 physnode_map[];
-
-static inline int pfn_to_nid(unsigned long pfn)
-{
-#ifdef CONFIG_NUMA
- return((int) physnode_map[(pfn) / PAGES_PER_ELEMENT]);
-#else
- return 0;
-#endif
-}
-
-/*
- * Following are macros that each numa implmentation must define.
- */
-
-#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
-#define node_end_pfn(nid) \
-({ \
- pg_data_t *__pgdat = NODE_DATA(nid); \
- __pgdat->node_start_pfn + __pgdat->node_spanned_pages; \
-})
-
-static inline int pfn_valid(int pfn)
-{
- int nid = pfn_to_nid(pfn);
-
- if (nid >= 0)
- return (pfn < node_end_pfn(nid));
- return 0;
-}
-
-#endif /* CONFIG_DISCONTIGMEM */
-
-#ifdef CONFIG_NEED_MULTIPLE_NODES
-/* always use node 0 for bootmem on this numa platform */
-#define bootmem_arch_preferred_node(__bdata, size, align, goal, limit) \
- (NODE_DATA(0)->bdata)
-#endif /* CONFIG_NEED_MULTIPLE_NODES */
-
-#endif /* _ASM_X86_MMZONE_32_H */
diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h
deleted file mode 100644
index a29f48c2a322..000000000000
--- a/arch/x86/include/asm/mmzone_64.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* K8 NUMA support */
-/* Copyright 2002,2003 by Andi Kleen, SuSE Labs */
-/* 2.5 Version loosely based on the NUMAQ Code by Pat Gaughen. */
-#ifndef _ASM_X86_MMZONE_64_H
-#define _ASM_X86_MMZONE_64_H
-
-
-#ifdef CONFIG_NUMA
-
-#include <linux/mmdebug.h>
-
-#include <asm/smp.h>
-
-/* Simple perfect hash to map physical addresses to node numbers */
-struct memnode {
- int shift;
- unsigned int mapsize;
- s16 *map;
- s16 embedded_map[64 - 8];
-} ____cacheline_aligned; /* total size = 128 bytes */
-extern struct memnode memnode;
-#define memnode_shift memnode.shift
-#define memnodemap memnode.map
-#define memnodemapsize memnode.mapsize
-
-extern struct pglist_data *node_data[];
-
-static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
-{
- unsigned nid;
- VIRTUAL_BUG_ON(!memnodemap);
- nid = memnodemap[addr >> memnode_shift];
- VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]);
- return nid;
-}
-
-#define NODE_DATA(nid) (node_data[nid])
-
-#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
-#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \
- NODE_DATA(nid)->node_spanned_pages)
-
-#ifdef CONFIG_NUMA_EMU
-#define FAKE_NODE_MIN_SIZE (64 * 1024 * 1024)
-#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
-#endif
-
-#endif
-#endif /* _ASM_X86_MMZONE_64_H */
diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h
index 47d62743c4d5..3c2de4ce3b10 100644
--- a/arch/x86/include/asm/module.h
+++ b/arch/x86/include/asm/module.h
@@ -1,80 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_MODULE_H
#define _ASM_X86_MODULE_H
-/* x86_32/64 are simple */
-struct mod_arch_specific {};
+#include <asm-generic/module.h>
+#include <asm/orc_types.h>
-#ifdef CONFIG_X86_32
-# define Elf_Shdr Elf32_Shdr
-# define Elf_Sym Elf32_Sym
-# define Elf_Ehdr Elf32_Ehdr
-#else
-# define Elf_Shdr Elf64_Shdr
-# define Elf_Sym Elf64_Sym
-# define Elf_Ehdr Elf64_Ehdr
+struct its_array {
+#ifdef CONFIG_MITIGATION_ITS
+ void **pages;
+ int num;
#endif
+};
-#ifdef CONFIG_X86_64
-/* X86_64 does not define MODULE_PROC_FAMILY */
-#elif defined CONFIG_M386
-#define MODULE_PROC_FAMILY "386 "
-#elif defined CONFIG_M486
-#define MODULE_PROC_FAMILY "486 "
-#elif defined CONFIG_M586
-#define MODULE_PROC_FAMILY "586 "
-#elif defined CONFIG_M586TSC
-#define MODULE_PROC_FAMILY "586TSC "
-#elif defined CONFIG_M586MMX
-#define MODULE_PROC_FAMILY "586MMX "
-#elif defined CONFIG_MCORE2
-#define MODULE_PROC_FAMILY "CORE2 "
-#elif defined CONFIG_M686
-#define MODULE_PROC_FAMILY "686 "
-#elif defined CONFIG_MPENTIUMII
-#define MODULE_PROC_FAMILY "PENTIUMII "
-#elif defined CONFIG_MPENTIUMIII
-#define MODULE_PROC_FAMILY "PENTIUMIII "
-#elif defined CONFIG_MPENTIUMM
-#define MODULE_PROC_FAMILY "PENTIUMM "
-#elif defined CONFIG_MPENTIUM4
-#define MODULE_PROC_FAMILY "PENTIUM4 "
-#elif defined CONFIG_MK6
-#define MODULE_PROC_FAMILY "K6 "
-#elif defined CONFIG_MK7
-#define MODULE_PROC_FAMILY "K7 "
-#elif defined CONFIG_MK8
-#define MODULE_PROC_FAMILY "K8 "
-#elif defined CONFIG_X86_ELAN
-#define MODULE_PROC_FAMILY "ELAN "
-#elif defined CONFIG_MCRUSOE
-#define MODULE_PROC_FAMILY "CRUSOE "
-#elif defined CONFIG_MEFFICEON
-#define MODULE_PROC_FAMILY "EFFICEON "
-#elif defined CONFIG_MWINCHIPC6
-#define MODULE_PROC_FAMILY "WINCHIPC6 "
-#elif defined CONFIG_MWINCHIP3D
-#define MODULE_PROC_FAMILY "WINCHIP3D "
-#elif defined CONFIG_MCYRIXIII
-#define MODULE_PROC_FAMILY "CYRIXIII "
-#elif defined CONFIG_MVIAC3_2
-#define MODULE_PROC_FAMILY "VIAC3-2 "
-#elif defined CONFIG_MVIAC7
-#define MODULE_PROC_FAMILY "VIAC7 "
-#elif defined CONFIG_MGEODEGX1
-#define MODULE_PROC_FAMILY "GEODEGX1 "
-#elif defined CONFIG_MGEODE_LX
-#define MODULE_PROC_FAMILY "GEODE "
-#else
-#error unknown processor family
-#endif
-
-#ifdef CONFIG_X86_32
-# ifdef CONFIG_4KSTACKS
-# define MODULE_STACKSIZE "4KSTACKS "
-# else
-# define MODULE_STACKSIZE ""
-# endif
-# define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_STACKSIZE
+struct mod_arch_specific {
+#ifdef CONFIG_UNWINDER_ORC
+ unsigned int num_orcs;
+ int *orc_unwind_ip;
+ struct orc_entry *orc_unwind;
#endif
+ struct its_array its_pages;
+};
#endif /* _ASM_X86_MODULE_H */
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index e2a1bb6d71ea..d593e52e6635 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -1,11 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_MPSPEC_H
#define _ASM_X86_MPSPEC_H
-#include <linux/init.h>
+#include <linux/types.h>
#include <asm/mpspec_def.h>
+#include <asm/x86_init.h>
+#include <asm/apicdef.h>
-extern int apic_version[MAX_APICS];
extern int pic_mode;
#ifdef CONFIG_X86_32
@@ -14,25 +16,14 @@ extern int pic_mode;
* Summit or generic (i.e. installer) kernels need lots of bus entries.
* Maximum 256 PCI busses, plus 1 ISA bus in each of 4 cabinets.
*/
-#if CONFIG_BASE_SMALL == 0
-# define MAX_MP_BUSSES 260
-#else
+#ifdef CONFIG_BASE_SMALL
# define MAX_MP_BUSSES 32
+#else
+# define MAX_MP_BUSSES 260
#endif
#define MAX_IRQ_SOURCES 256
-extern unsigned int def_to_bigsmp;
-extern u8 apicid_2_node[];
-
-#ifdef CONFIG_X86_NUMAQ
-extern int mp_bus_id_to_node[MAX_MP_BUSSES];
-extern int mp_bus_id_to_local[MAX_MP_BUSSES];
-extern int quad_local_to_mp_bus_id [NR_CPUS/4][4];
-#endif
-
-#define MAX_APICID 256
-
#else /* CONFIG_X86_64: */
#define MAX_MP_BUSSES 256
@@ -41,125 +32,46 @@ extern int quad_local_to_mp_bus_id [NR_CPUS/4][4];
#endif /* CONFIG_X86_64 */
-extern void early_find_smp_config(void);
-extern void early_get_smp_config(void);
-
-#if defined(CONFIG_MCA) || defined(CONFIG_EISA)
+#ifdef CONFIG_EISA
extern int mp_bus_id_to_type[MAX_MP_BUSSES];
#endif
extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
-extern unsigned int boot_cpu_physical_apicid;
-extern unsigned int max_physical_apicid;
-extern int smp_found_config;
-extern int mpc_default_type;
-extern unsigned long mp_lapic_addr;
+extern u32 boot_cpu_physical_apicid;
+extern u8 boot_cpu_apic_version;
-extern void get_smp_config(void);
+#ifdef CONFIG_X86_LOCAL_APIC
+extern int smp_found_config;
+#else
+# define smp_found_config 0
+#endif
#ifdef CONFIG_X86_MPPARSE
-extern void find_smp_config(void);
-extern void early_reserve_e820_mpc_new(void);
+extern void e820__memblock_alloc_reserved_mpc_new(void);
extern int enable_update_mptable;
+extern void mpparse_find_mptable(void);
+extern void mpparse_parse_early_smp_config(void);
+extern void mpparse_parse_smp_config(void);
#else
-static inline void find_smp_config(void) { }
-static inline void early_reserve_e820_mpc_new(void) { }
-#define enable_update_mptable 0
+static inline void e820__memblock_alloc_reserved_mpc_new(void) { }
+#define enable_update_mptable 0
+#define mpparse_find_mptable x86_init_noop
+#define mpparse_parse_early_smp_config x86_init_noop
+#define mpparse_parse_smp_config x86_init_noop
#endif
-void __cpuinit generic_processor_info(int apicid, int version);
-#ifdef CONFIG_ACPI
-extern void mp_register_ioapic(int id, u32 address, u32 gsi_base);
-extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
- u32 gsi);
-extern void mp_config_acpi_legacy_irqs(void);
-struct device;
-extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level,
- int active_high_low);
-extern int acpi_probe_gsi(void);
-#ifdef CONFIG_X86_IO_APIC
-extern int mp_find_ioapic(int gsi);
-extern int mp_find_ioapic_pin(int ioapic, int gsi);
-#endif
-#else /* !CONFIG_ACPI: */
-static inline int acpi_probe_gsi(void)
+extern DECLARE_BITMAP(phys_cpu_present_map, MAX_LOCAL_APIC);
+
+static inline void reset_phys_cpu_present_map(u32 apicid)
{
- return 0;
+ bitmap_zero(phys_cpu_present_map, MAX_LOCAL_APIC);
+ set_bit(apicid, phys_cpu_present_map);
}
-#endif /* CONFIG_ACPI */
-
-#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_APICS)
-
-struct physid_mask {
- unsigned long mask[PHYSID_ARRAY_SIZE];
-};
-
-typedef struct physid_mask physid_mask_t;
-#define physid_set(physid, map) set_bit(physid, (map).mask)
-#define physid_clear(physid, map) clear_bit(physid, (map).mask)
-#define physid_isset(physid, map) test_bit(physid, (map).mask)
-#define physid_test_and_set(physid, map) \
- test_and_set_bit(physid, (map).mask)
-
-#define physids_and(dst, src1, src2) \
- bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
-
-#define physids_or(dst, src1, src2) \
- bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
-
-#define physids_clear(map) \
- bitmap_zero((map).mask, MAX_APICS)
-
-#define physids_complement(dst, src) \
- bitmap_complement((dst).mask, (src).mask, MAX_APICS)
-
-#define physids_empty(map) \
- bitmap_empty((map).mask, MAX_APICS)
-
-#define physids_equal(map1, map2) \
- bitmap_equal((map1).mask, (map2).mask, MAX_APICS)
-
-#define physids_weight(map) \
- bitmap_weight((map).mask, MAX_APICS)
-
-#define physids_shift_right(d, s, n) \
- bitmap_shift_right((d).mask, (s).mask, n, MAX_APICS)
-
-#define physids_shift_left(d, s, n) \
- bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS)
-
-#define physids_coerce(map) ((map).mask[0])
-
-#define physids_promote(physids) \
- ({ \
- physid_mask_t __physid_mask = PHYSID_MASK_NONE; \
- __physid_mask.mask[0] = physids; \
- __physid_mask; \
- })
-
-/* Note: will create very large stack frames if physid_mask_t is big */
-#define physid_mask_of_physid(physid) \
- ({ \
- physid_mask_t __physid_mask = PHYSID_MASK_NONE; \
- physid_set(physid, __physid_mask); \
- __physid_mask; \
- })
-
-static inline void physid_set_mask_of_physid(int physid, physid_mask_t *map)
+static inline void copy_phys_cpu_present_map(unsigned long *dst)
{
- physids_clear(*map);
- physid_set(physid, *map);
+ bitmap_copy(dst, phys_cpu_present_map, MAX_LOCAL_APIC);
}
-#define PHYSID_MASK_ALL { {[0 ... PHYSID_ARRAY_SIZE-1] = ~0UL} }
-#define PHYSID_MASK_NONE { {[0 ... PHYSID_ARRAY_SIZE-1] = 0UL} }
-
-extern physid_mask_t phys_cpu_present_map;
-
-extern int generic_mps_oem_check(struct mpc_table *, char *, char *);
-
-extern int default_acpi_madt_oem_check(char *, char *);
-
#endif /* _ASM_X86_MPSPEC_H */
diff --git a/arch/x86/include/asm/mpspec_def.h b/arch/x86/include/asm/mpspec_def.h
index 4a7f96d7c188..6fb923a34309 100644
--- a/arch/x86/include/asm/mpspec_def.h
+++ b/arch/x86/include/asm/mpspec_def.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_MPSPEC_DEF_H
#define _ASM_X86_MPSPEC_DEF_H
@@ -15,13 +16,6 @@
#ifdef CONFIG_X86_32
# define MAX_MPC_ENTRY 1024
-# define MAX_APICS 256
-#else
-# if NR_CPUS <= 255
-# define MAX_APICS 255
-# else
-# define MAX_APICS 32768
-# endif
#endif
/* Intel MP Floating Pointer Structure */
@@ -65,7 +59,7 @@ struct mpc_table {
#define MP_TRANSLATION 192
#define CPU_ENABLED 1 /* Processor is available */
-#define CPU_BOOTPROCESSOR 2 /* Processor is the BP */
+#define CPU_BOOTPROCESSOR 2 /* Processor is the boot CPU */
#define CPU_STEPPING_MASK 0x000F
#define CPU_MODEL_MASK 0x00F0
@@ -91,7 +85,7 @@ struct mpc_bus {
#define BUSTYPE_EISA "EISA"
#define BUSTYPE_ISA "ISA"
#define BUSTYPE_INTERN "INTERN" /* Internal BUS */
-#define BUSTYPE_MCA "MCA"
+#define BUSTYPE_MCA "MCA" /* Obsolete */
#define BUSTYPE_VL "VL" /* Local bus */
#define BUSTYPE_PCI "PCI"
#define BUSTYPE_PCMCIA "PCMCIA"
@@ -134,9 +128,17 @@ enum mp_irq_source_types {
mp_ExtINT = 3
};
-#define MP_IRQDIR_DEFAULT 0
-#define MP_IRQDIR_HIGH 1
-#define MP_IRQDIR_LOW 3
+#define MP_IRQPOL_DEFAULT 0x0
+#define MP_IRQPOL_ACTIVE_HIGH 0x1
+#define MP_IRQPOL_RESERVED 0x2
+#define MP_IRQPOL_ACTIVE_LOW 0x3
+#define MP_IRQPOL_MASK 0x3
+
+#define MP_IRQTRIG_DEFAULT 0x0
+#define MP_IRQTRIG_EDGE 0x4
+#define MP_IRQTRIG_RESERVED 0x8
+#define MP_IRQTRIG_LEVEL 0xc
+#define MP_IRQTRIG_MASK 0xc
#define MP_APIC_ALL 0xFF
@@ -176,6 +178,5 @@ enum mp_bustype {
MP_BUS_ISA = 1,
MP_BUS_EISA,
MP_BUS_PCI,
- MP_BUS_MCA,
};
#endif /* _ASM_X86_MPSPEC_DEF_H */
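
A hedged example of consuming the new polarity/trigger masks: given an MP-table interrupt source entry (struct mpc_intsrc, defined earlier in this header), the flags decode as below. The helper is illustrative only.

static bool example_irq_is_level_low(const struct mpc_intsrc *irq)
{
	return (irq->irqflag & MP_IRQPOL_MASK) == MP_IRQPOL_ACTIVE_LOW &&
	       (irq->irqflag & MP_IRQTRIG_MASK) == MP_IRQTRIG_LEVEL;
}
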
diff --git a/arch/x86/include/asm/msgbuf.h b/arch/x86/include/asm/msgbuf.h
deleted file mode 100644
index 7e4e9481f51c..000000000000
--- a/arch/x86/include/asm/msgbuf.h
+++ /dev/null
@@ -1,39 +0,0 @@
-#ifndef _ASM_X86_MSGBUF_H
-#define _ASM_X86_MSGBUF_H
-
-/*
- * The msqid64_ds structure for i386 architecture.
- * Note extra padding because this structure is passed back and forth
- * between kernel and user space.
- *
- * Pad space on i386 is left for:
- * - 64-bit time_t to solve y2038 problem
- * - 2 miscellaneous 32-bit values
- *
- * Pad space on x8664 is left for:
- * - 2 miscellaneous 64-bit values
- */
-struct msqid64_ds {
- struct ipc64_perm msg_perm;
- __kernel_time_t msg_stime; /* last msgsnd time */
-#ifdef __i386__
- unsigned long __unused1;
-#endif
- __kernel_time_t msg_rtime; /* last msgrcv time */
-#ifdef __i386__
- unsigned long __unused2;
-#endif
- __kernel_time_t msg_ctime; /* last change time */
-#ifdef __i386__
- unsigned long __unused3;
-#endif
- unsigned long msg_cbytes; /* current number of bytes on queue */
- unsigned long msg_qnum; /* number of messages in queue */
- unsigned long msg_qbytes; /* max number of bytes on queue */
- __kernel_pid_t msg_lspid; /* pid of last msgsnd */
- __kernel_pid_t msg_lrpid; /* last receive pid */
- unsigned long __unused4;
- unsigned long __unused5;
-};
-
-#endif /* _ASM_X86_MSGBUF_H */
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
new file mode 100644
index 000000000000..eef4c3a5ba28
--- /dev/null
+++ b/arch/x86/include/asm/mshyperv.h
@@ -0,0 +1,319 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_MSHYPER_H
+#define _ASM_X86_MSHYPER_H
+
+#include <linux/types.h>
+#include <linux/nmi.h>
+#include <linux/msi.h>
+#include <linux/io.h>
+#include <linux/static_call.h>
+#include <asm/nospec-branch.h>
+#include <asm/paravirt.h>
+#include <asm/msr.h>
+#include <hyperv/hvhdk.h>
+#include <asm/fpu/types.h>
+
+/*
+ * Hyper-V always provides a single IO-APIC at this MMIO address.
+ * Ideally, the value should be looked up in ACPI tables, but it
+ * is needed for mapping the IO-APIC early in boot on Confidential
+ * VMs, before ACPI functions can be used.
+ */
+#define HV_IOAPIC_BASE_ADDRESS 0xfec00000
+
+#define HV_VTL_NORMAL 0x0
+#define HV_VTL_SECURE 0x1
+#define HV_VTL_MGMT 0x2
+
+union hv_ghcb;
+
+DECLARE_STATIC_KEY_FALSE(isolation_type_snp);
+DECLARE_STATIC_KEY_FALSE(isolation_type_tdx);
+
+typedef int (*hyperv_fill_flush_list_func)(
+ struct hv_guest_mapping_flush_list *flush,
+ void *data);
+
+void hyperv_vector_handler(struct pt_regs *regs);
+
+static inline unsigned char hv_get_nmi_reason(void)
+{
+ return 0;
+}
+
+extern u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2);
+extern u64 hv_snp_hypercall(u64 control, u64 param1, u64 param2);
+extern u64 hv_std_hypercall(u64 control, u64 param1, u64 param2);
+
+#if IS_ENABLED(CONFIG_HYPERV)
+extern void *hv_hypercall_pg;
+
+extern union hv_ghcb * __percpu *hv_ghcb_pg;
+
+bool hv_isolation_type_snp(void);
+bool hv_isolation_type_tdx(void);
+
+#ifdef CONFIG_X86_64
+DECLARE_STATIC_CALL(hv_hypercall, hv_std_hypercall);
+#endif
+
+/*
+ * DEFAULT INIT GPAT and SEGMENT LIMIT values in struct VMSA
+ * used to start an AP in an enlightened SEV guest.
+ */
+#define HV_AP_INIT_GPAT_DEFAULT 0x0007040600070406ULL
+#define HV_AP_SEGMENT_LIMIT 0xffffffff
+
+/*
+ * If the hypercall involves no input or output parameters, the hypervisor
+ * ignores the corresponding GPA pointer.
+ */
+static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
+{
+ u64 input_address = input ? virt_to_phys(input) : 0;
+ u64 output_address = output ? virt_to_phys(output) : 0;
+
+#ifdef CONFIG_X86_64
+ return static_call_mod(hv_hypercall)(control, input_address, output_address);
+#else
+ u32 input_address_hi = upper_32_bits(input_address);
+ u32 input_address_lo = lower_32_bits(input_address);
+ u32 output_address_hi = upper_32_bits(output_address);
+ u32 output_address_lo = lower_32_bits(output_address);
+ u64 hv_status;
+
+ if (!hv_hypercall_pg)
+ return U64_MAX;
+
+ __asm__ __volatile__(CALL_NOSPEC
+ : "=A" (hv_status),
+ "+c" (input_address_lo), ASM_CALL_CONSTRAINT
+ : "A" (control),
+ "b" (input_address_hi),
+ "D"(output_address_hi), "S"(output_address_lo),
+ THUNK_TARGET(hv_hypercall_pg)
+ : "cc", "memory");
+ return hv_status;
+#endif /* !x86_64 */
+}
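
A usage sketch for the rule stated above the function (illustrative, not part of the patch): a hypercall that takes an input page but produces no output can simply pass NULL for the output GPA. HVCALL_POST_MESSAGE is assumed to be provided by the hvhdk/TLFS headers included above.

static u64 example_post_message(void *msg_page)
{
	return hv_do_hypercall(HVCALL_POST_MESSAGE, msg_page, NULL);
}
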
+
+/* Fast hypercall with 8 bytes of input and no output */
+static inline u64 _hv_do_fast_hypercall8(u64 control, u64 input1)
+{
+#ifdef CONFIG_X86_64
+ return static_call_mod(hv_hypercall)(control, input1, 0);
+#else
+ u32 input1_hi = upper_32_bits(input1);
+ u32 input1_lo = lower_32_bits(input1);
+ u64 hv_status;
+
+ __asm__ __volatile__ (CALL_NOSPEC
+ : "=A"(hv_status),
+ "+c"(input1_lo),
+ ASM_CALL_CONSTRAINT
+ : "A" (control),
+ "b" (input1_hi),
+ THUNK_TARGET(hv_hypercall_pg)
+ : "cc", "edi", "esi");
+ return hv_status;
+#endif
+}
+
+static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
+{
+ u64 control = (u64)code | HV_HYPERCALL_FAST_BIT;
+
+ return _hv_do_fast_hypercall8(control, input1);
+}
+
+/* Fast hypercall with 16 bytes of input */
+static inline u64 _hv_do_fast_hypercall16(u64 control, u64 input1, u64 input2)
+{
+#ifdef CONFIG_X86_64
+ return static_call_mod(hv_hypercall)(control, input1, input2);
+#else
+ u32 input1_hi = upper_32_bits(input1);
+ u32 input1_lo = lower_32_bits(input1);
+ u32 input2_hi = upper_32_bits(input2);
+ u32 input2_lo = lower_32_bits(input2);
+ u64 hv_status;
+
+ __asm__ __volatile__ (CALL_NOSPEC
+ : "=A"(hv_status),
+ "+c"(input1_lo), ASM_CALL_CONSTRAINT
+ : "A" (control), "b" (input1_hi),
+ "D"(input2_hi), "S"(input2_lo),
+ THUNK_TARGET(hv_hypercall_pg)
+ : "cc");
+ return hv_status;
+#endif
+}
+
+static inline u64 hv_do_fast_hypercall16(u16 code, u64 input1, u64 input2)
+{
+ u64 control = (u64)code | HV_HYPERCALL_FAST_BIT;
+
+ return _hv_do_fast_hypercall16(control, input1, input2);
+}
+
+extern struct hv_vp_assist_page **hv_vp_assist_page;
+
+static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu)
+{
+ if (!hv_vp_assist_page)
+ return NULL;
+
+ return hv_vp_assist_page[cpu];
+}
+
+void __init hyperv_init(void);
+void hyperv_setup_mmu_ops(void);
+void set_hv_tscchange_cb(void (*cb)(void));
+void clear_hv_tscchange_cb(void);
+void hyperv_stop_tsc_emulation(void);
+int hyperv_flush_guest_mapping(u64 as);
+int hyperv_flush_guest_mapping_range(u64 as,
+ hyperv_fill_flush_list_func fill_func, void *data);
+int hyperv_fill_flush_guest_mapping_list(
+ struct hv_guest_mapping_flush_list *flush,
+ u64 start_gfn, u64 end_gfn);
+void hv_sleep_notifiers_register(void);
+void hv_machine_power_off(void);
+
+#ifdef CONFIG_X86_64
+void hv_apic_init(void);
+void __init hv_init_spinlocks(void);
+bool hv_vcpu_is_preempted(int vcpu);
+#else
+static inline void hv_apic_init(void) {}
+#endif
+
+struct irq_domain *hv_create_pci_msi_domain(void);
+
+int hv_map_msi_interrupt(struct irq_data *data,
+ struct hv_interrupt_entry *out_entry);
+int hv_map_ioapic_interrupt(int ioapic_id, bool level, int vcpu, int vector,
+ struct hv_interrupt_entry *entry);
+int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry);
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+bool hv_ghcb_negotiate_protocol(void);
+void __noreturn hv_ghcb_terminate(unsigned int set, unsigned int reason);
+int hv_snp_boot_ap(u32 apic_id, unsigned long start_ip, unsigned int cpu);
+#else
+static inline bool hv_ghcb_negotiate_protocol(void) { return false; }
+static inline void hv_ghcb_terminate(unsigned int set, unsigned int reason) {}
+static inline int hv_snp_boot_ap(u32 apic_id, unsigned long start_ip,
+ unsigned int cpu) { return 0; }
+#endif
+
+#if defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST)
+void hv_vtom_init(void);
+void hv_ivm_msr_write(u64 msr, u64 value);
+void hv_ivm_msr_read(u64 msr, u64 *value);
+#else
+static inline void hv_vtom_init(void) {}
+static inline void hv_ivm_msr_write(u64 msr, u64 value) {}
+static inline void hv_ivm_msr_read(u64 msr, u64 *value) {}
+#endif
+
+static inline bool hv_is_synic_msr(unsigned int reg)
+{
+ return (reg >= HV_X64_MSR_SCONTROL) &&
+ (reg <= HV_X64_MSR_SINT15);
+}
+
+static inline bool hv_is_sint_msr(unsigned int reg)
+{
+ return (reg >= HV_X64_MSR_SINT0) &&
+ (reg <= HV_X64_MSR_SINT15);
+}
+
+u64 hv_get_msr(unsigned int reg);
+void hv_set_msr(unsigned int reg, u64 value);
+u64 hv_get_non_nested_msr(unsigned int reg);
+void hv_set_non_nested_msr(unsigned int reg, u64 value);
+
+static __always_inline u64 hv_raw_get_msr(unsigned int reg)
+{
+ return native_rdmsrq(reg);
+}
+int hv_apicid_to_vp_index(u32 apic_id);
+
+#if IS_ENABLED(CONFIG_MSHV_ROOT) && IS_ENABLED(CONFIG_CRASH_DUMP)
+void hv_root_crash_init(void);
+void hv_crash_asm32(void);
+void hv_crash_asm64(void);
+void hv_crash_asm_end(void);
+#else /* CONFIG_MSHV_ROOT && CONFIG_CRASH_DUMP */
+static inline void hv_root_crash_init(void) {}
+#endif /* CONFIG_MSHV_ROOT && CONFIG_CRASH_DUMP */
+
+#else /* CONFIG_HYPERV */
+static inline void hyperv_init(void) {}
+static inline void hyperv_setup_mmu_ops(void) {}
+static inline void set_hv_tscchange_cb(void (*cb)(void)) {}
+static inline void clear_hv_tscchange_cb(void) {}
+static inline void hyperv_stop_tsc_emulation(void) {};
+static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu)
+{
+ return NULL;
+}
+static inline int hyperv_flush_guest_mapping(u64 as) { return -1; }
+static inline int hyperv_flush_guest_mapping_range(u64 as,
+ hyperv_fill_flush_list_func fill_func, void *data)
+{
+ return -1;
+}
+static inline void hv_set_msr(unsigned int reg, u64 value) { }
+static inline u64 hv_get_msr(unsigned int reg) { return 0; }
+static inline void hv_set_non_nested_msr(unsigned int reg, u64 value) { }
+static inline u64 hv_get_non_nested_msr(unsigned int reg) { return 0; }
+static inline int hv_apicid_to_vp_index(u32 apic_id) { return -EINVAL; }
+#endif /* CONFIG_HYPERV */
+
+struct mshv_vtl_cpu_context {
+ union {
+ struct {
+ u64 rax;
+ u64 rcx;
+ u64 rdx;
+ u64 rbx;
+ u64 cr2;
+ u64 rbp;
+ u64 rsi;
+ u64 rdi;
+ u64 r8;
+ u64 r9;
+ u64 r10;
+ u64 r11;
+ u64 r12;
+ u64 r13;
+ u64 r14;
+ u64 r15;
+ };
+ u64 gp_regs[16];
+ };
+
+ struct fxregs_state fx_state;
+};
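/*
 * Illustrative sketch (not part of the patch): the anonymous union above lets
 * callers address the VTL0 GPRs either by name or by index.
 * mshv_example_fill_regs() is a hypothetical helper, shown only to make the
 * aliasing concrete.
 */
static inline void mshv_example_fill_regs(struct mshv_vtl_cpu_context *ctx)
{
	ctx->rax = 0;		/* same storage as ctx->gp_regs[0] */
	ctx->gp_regs[4] = 0;	/* same storage as ctx->cr2 (5th entry in the list) */
	ctx->r15 = 0;		/* same storage as ctx->gp_regs[15] */
}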
+
+#ifdef CONFIG_HYPERV_VTL_MODE
+void __init hv_vtl_init_platform(void);
+int __init hv_vtl_early_init(void);
+void mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0);
+void mshv_vtl_return_call_init(u64 vtl_return_offset);
+void mshv_vtl_return_hypercall(void);
+void __mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0);
+#else
+static inline void __init hv_vtl_init_platform(void) {}
+static inline int __init hv_vtl_early_init(void) { return 0; }
+static inline void mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0) {}
+static inline void mshv_vtl_return_call_init(u64 vtl_return_offset) {}
+static inline void mshv_vtl_return_hypercall(void) {}
+static inline void __mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0) {}
+#endif
+
+#include <asm-generic/mshyperv.h>
+
+#endif
diff --git a/arch/x86/include/asm/msi.h b/arch/x86/include/asm/msi.h
new file mode 100644
index 000000000000..935c6d470341
--- /dev/null
+++ b/arch/x86/include/asm/msi.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_MSI_H
+#define _ASM_X86_MSI_H
+#include <asm/hw_irq.h>
+#include <asm/irqdomain.h>
+
+typedef struct irq_alloc_info msi_alloc_info_t;
+
+int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec,
+ msi_alloc_info_t *arg);
+
+/* Structs and defines for the X86 specific MSI message format */
+
+typedef struct x86_msi_data {
+ union {
+ struct {
+ u32 vector : 8,
+ delivery_mode : 3,
+ dest_mode_logical : 1,
+ reserved : 2,
+ active_low : 1,
+ is_level : 1;
+ };
+ u32 dmar_subhandle;
+ };
+} __attribute__ ((packed)) arch_msi_msg_data_t;
+#define arch_msi_msg_data x86_msi_data
+
+typedef struct x86_msi_addr_lo {
+ union {
+ struct {
+ u32 reserved_0 : 2,
+ dest_mode_logical : 1,
+ redirect_hint : 1,
+ reserved_1 : 1,
+ virt_destid_8_14 : 7,
+ destid_0_7 : 8,
+ base_address : 12;
+ };
+ struct {
+ u32 dmar_reserved_0 : 2,
+ dmar_index_15 : 1,
+ dmar_subhandle_valid : 1,
+ dmar_format : 1,
+ dmar_index_0_14 : 15,
+ dmar_base_address : 12;
+ };
+ };
+} __attribute__ ((packed)) arch_msi_msg_addr_lo_t;
+#define arch_msi_msg_addr_lo x86_msi_addr_lo
+
+#define X86_MSI_BASE_ADDRESS_LOW (0xfee00000 >> 20)
+
+typedef struct x86_msi_addr_hi {
+ u32 reserved : 8,
+ destid_8_31 : 24;
+} __attribute__ ((packed)) arch_msi_msg_addr_hi_t;
+#define arch_msi_msg_addr_hi x86_msi_addr_hi
+
+#define X86_MSI_BASE_ADDRESS_HIGH (0)
+
+struct msi_msg;
+u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid);
+
+#define X86_VECTOR_MSI_FLAGS_SUPPORTED \
+ (MSI_GENERIC_FLAGS_MASK | MSI_FLAG_PCI_MSIX | MSI_FLAG_PCI_MSIX_ALLOC_DYN)
+
+#define X86_VECTOR_MSI_FLAGS_REQUIRED \
+ (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS)
+
+#endif /* _ASM_X86_MSI_H */
diff --git a/arch/x86/include/asm/msidef.h b/arch/x86/include/asm/msidef.h
deleted file mode 100644
index 4cc48af23fef..000000000000
--- a/arch/x86/include/asm/msidef.h
+++ /dev/null
@@ -1,56 +0,0 @@
-#ifndef _ASM_X86_MSIDEF_H
-#define _ASM_X86_MSIDEF_H
-
-/*
- * Constants for Intel APIC based MSI messages.
- */
-
-/*
- * Shifts for MSI data
- */
-
-#define MSI_DATA_VECTOR_SHIFT 0
-#define MSI_DATA_VECTOR_MASK 0x000000ff
-#define MSI_DATA_VECTOR(v) (((v) << MSI_DATA_VECTOR_SHIFT) & \
- MSI_DATA_VECTOR_MASK)
-
-#define MSI_DATA_DELIVERY_MODE_SHIFT 8
-#define MSI_DATA_DELIVERY_FIXED (0 << MSI_DATA_DELIVERY_MODE_SHIFT)
-#define MSI_DATA_DELIVERY_LOWPRI (1 << MSI_DATA_DELIVERY_MODE_SHIFT)
-
-#define MSI_DATA_LEVEL_SHIFT 14
-#define MSI_DATA_LEVEL_DEASSERT (0 << MSI_DATA_LEVEL_SHIFT)
-#define MSI_DATA_LEVEL_ASSERT (1 << MSI_DATA_LEVEL_SHIFT)
-
-#define MSI_DATA_TRIGGER_SHIFT 15
-#define MSI_DATA_TRIGGER_EDGE (0 << MSI_DATA_TRIGGER_SHIFT)
-#define MSI_DATA_TRIGGER_LEVEL (1 << MSI_DATA_TRIGGER_SHIFT)
-
-/*
- * Shift/mask fields for msi address
- */
-
-#define MSI_ADDR_BASE_HI 0
-#define MSI_ADDR_BASE_LO 0xfee00000
-
-#define MSI_ADDR_DEST_MODE_SHIFT 2
-#define MSI_ADDR_DEST_MODE_PHYSICAL (0 << MSI_ADDR_DEST_MODE_SHIFT)
-#define MSI_ADDR_DEST_MODE_LOGICAL (1 << MSI_ADDR_DEST_MODE_SHIFT)
-
-#define MSI_ADDR_REDIRECTION_SHIFT 3
-#define MSI_ADDR_REDIRECTION_CPU (0 << MSI_ADDR_REDIRECTION_SHIFT)
- /* dedicated cpu */
-#define MSI_ADDR_REDIRECTION_LOWPRI (1 << MSI_ADDR_REDIRECTION_SHIFT)
- /* lowest priority */
-
-#define MSI_ADDR_DEST_ID_SHIFT 12
-#define MSI_ADDR_DEST_ID_MASK 0x00ffff0
-#define MSI_ADDR_DEST_ID(dest) (((dest) << MSI_ADDR_DEST_ID_SHIFT) & \
- MSI_ADDR_DEST_ID_MASK)
-#define MSI_ADDR_EXT_DEST_ID(dest) ((dest) & 0xffffff00)
-
-#define MSI_ADDR_IR_EXT_INT (1 << 4)
-#define MSI_ADDR_IR_SHV (1 << 3)
-#define MSI_ADDR_IR_INDEX1(index) ((index & 0x8000) >> 13)
-#define MSI_ADDR_IR_INDEX2(index) ((index & 0x7fff) << 5)
-#endif /* _ASM_X86_MSIDEF_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 6be7fc254b59..3d0a0950d20a 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -1,7 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_MSR_INDEX_H
#define _ASM_X86_MSR_INDEX_H
-/* CPU model specific register (MSR) numbers */
+#include <linux/bits.h>
+
+/* CPU model specific register (MSR) numbers. */
/* x86-64 specific MSRs */
#define MSR_EFER 0xc0000080 /* extended feature register */
@@ -12,6 +15,7 @@
#define MSR_FS_BASE 0xc0000100 /* 64bit FS base */
#define MSR_GS_BASE 0xc0000101 /* 64bit GS base */
#define MSR_KERNEL_GS_BASE 0xc0000102 /* SwapGS GS shadow */
+#define MSR_TSC_AUX 0xc0000103 /* Auxiliary TSC */
/* EFER bits: */
#define _EFER_SCE 0 /* SYSCALL/SYSRET */
@@ -19,22 +23,226 @@
#define _EFER_LMA 10 /* Long mode active (read-only) */
#define _EFER_NX 11 /* No execute enable */
#define _EFER_SVME 12 /* Enable virtualization */
+#define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */
#define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */
+#define _EFER_TCE 15 /* Enable Translation Cache Extensions */
+#define _EFER_AUTOIBRS 21 /* Enable Automatic IBRS */
#define EFER_SCE (1<<_EFER_SCE)
#define EFER_LME (1<<_EFER_LME)
#define EFER_LMA (1<<_EFER_LMA)
#define EFER_NX (1<<_EFER_NX)
#define EFER_SVME (1<<_EFER_SVME)
+#define EFER_LMSLE (1<<_EFER_LMSLE)
#define EFER_FFXSR (1<<_EFER_FFXSR)
+#define EFER_TCE (1<<_EFER_TCE)
+#define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS)
+
+/*
+ * Architectural memory types that are common to MTRRs, PAT, VMX MSRs, etc.
+ * Most MSRs support/allow only a subset of memory types, but the values
+ * themselves are common across all relevant MSRs.
+ */
+#define X86_MEMTYPE_UC 0ull /* Uncacheable, a.k.a. Strong Uncacheable */
+#define X86_MEMTYPE_WC 1ull /* Write Combining */
+/* RESERVED 2 */
+/* RESERVED 3 */
+#define X86_MEMTYPE_WT 4ull /* Write Through */
+#define X86_MEMTYPE_WP 5ull /* Write Protected */
+#define X86_MEMTYPE_WB 6ull /* Write Back */
+#define X86_MEMTYPE_UC_MINUS	7ull	/* Weak Uncacheable (PAT only) */
+
+/* FRED MSRs */
+#define MSR_IA32_FRED_RSP0 0x1cc /* Level 0 stack pointer */
+#define MSR_IA32_FRED_RSP1 0x1cd /* Level 1 stack pointer */
+#define MSR_IA32_FRED_RSP2 0x1ce /* Level 2 stack pointer */
+#define MSR_IA32_FRED_RSP3 0x1cf /* Level 3 stack pointer */
+#define MSR_IA32_FRED_STKLVLS 0x1d0 /* Exception stack levels */
+#define MSR_IA32_FRED_SSP0 MSR_IA32_PL0_SSP /* Level 0 shadow stack pointer */
+#define MSR_IA32_FRED_SSP1 0x1d1 /* Level 1 shadow stack pointer */
+#define MSR_IA32_FRED_SSP2 0x1d2 /* Level 2 shadow stack pointer */
+#define MSR_IA32_FRED_SSP3 0x1d3 /* Level 3 shadow stack pointer */
+#define MSR_IA32_FRED_CONFIG 0x1d4 /* Entrypoint and interrupt stack level */
/* Intel MSRs. Some also available on other CPUs */
+#define MSR_TEST_CTRL 0x00000033
+#define MSR_TEST_CTRL_SPLIT_LOCK_DETECT_BIT 29
+#define MSR_TEST_CTRL_SPLIT_LOCK_DETECT BIT(MSR_TEST_CTRL_SPLIT_LOCK_DETECT_BIT)
+
+#define MSR_IA32_SPEC_CTRL 0x00000048 /* Speculation Control */
+#define SPEC_CTRL_IBRS BIT(0) /* Indirect Branch Restricted Speculation */
+#define SPEC_CTRL_STIBP_SHIFT 1 /* Single Thread Indirect Branch Predictor (STIBP) bit */
+#define SPEC_CTRL_STIBP BIT(SPEC_CTRL_STIBP_SHIFT) /* STIBP mask */
+#define SPEC_CTRL_SSBD_SHIFT 2 /* Speculative Store Bypass Disable bit */
+#define SPEC_CTRL_SSBD BIT(SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */
+#define SPEC_CTRL_RRSBA_DIS_S_SHIFT 6 /* Disable RRSBA behavior */
+#define SPEC_CTRL_RRSBA_DIS_S BIT(SPEC_CTRL_RRSBA_DIS_S_SHIFT)
+#define SPEC_CTRL_BHI_DIS_S_SHIFT 10 /* Disable Branch History Injection behavior */
+#define SPEC_CTRL_BHI_DIS_S BIT(SPEC_CTRL_BHI_DIS_S_SHIFT)
+
+/* A mask for bits which the kernel toggles when controlling mitigations */
+#define SPEC_CTRL_MITIGATIONS_MASK (SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD \
+ | SPEC_CTRL_RRSBA_DIS_S \
+ | SPEC_CTRL_BHI_DIS_S)
+
+#define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */
+#define PRED_CMD_IBPB BIT(0) /* Indirect Branch Prediction Barrier */
+#define PRED_CMD_SBPB BIT(7) /* Selective Branch Prediction Barrier */
+
+#define MSR_PPIN_CTL 0x0000004e
+#define MSR_PPIN 0x0000004f
+
#define MSR_IA32_PERFCTR0 0x000000c1
#define MSR_IA32_PERFCTR1 0x000000c2
#define MSR_FSB_FREQ 0x000000cd
+#define MSR_PLATFORM_INFO 0x000000ce
+#define MSR_PLATFORM_INFO_CPUID_FAULT_BIT 31
+#define MSR_PLATFORM_INFO_CPUID_FAULT BIT_ULL(MSR_PLATFORM_INFO_CPUID_FAULT_BIT)
+
+#define MSR_IA32_UMWAIT_CONTROL 0xe1
+#define MSR_IA32_UMWAIT_CONTROL_C02_DISABLE BIT(0)
+#define MSR_IA32_UMWAIT_CONTROL_RESERVED BIT(1)
+/*
+ * The time field occupies bits [31:2], but it represents a 32-bit
+ * value whose bits [1:0] are always zero.
+ */
+#define MSR_IA32_UMWAIT_CONTROL_TIME_MASK (~0x03U)
+
+/* Abbreviated from Intel SDM name IA32_CORE_CAPABILITIES */
+#define MSR_IA32_CORE_CAPS 0x000000cf
+#define MSR_IA32_CORE_CAPS_INTEGRITY_CAPS_BIT 2
+#define MSR_IA32_CORE_CAPS_INTEGRITY_CAPS BIT(MSR_IA32_CORE_CAPS_INTEGRITY_CAPS_BIT)
+#define MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT_BIT 5
+#define MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT BIT(MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT_BIT)
+
+#define MSR_PKG_CST_CONFIG_CONTROL 0x000000e2
+#define NHM_C3_AUTO_DEMOTE (1UL << 25)
+#define NHM_C1_AUTO_DEMOTE (1UL << 26)
+#define ATM_LNC_C6_AUTO_DEMOTE (1UL << 25)
+#define SNB_C3_AUTO_UNDEMOTE (1UL << 27)
+#define SNB_C1_AUTO_UNDEMOTE (1UL << 28)
#define MSR_MTRRcap 0x000000fe
+
+#define MSR_IA32_ARCH_CAPABILITIES 0x0000010a
+#define ARCH_CAP_RDCL_NO BIT(0) /* Not susceptible to Meltdown */
+#define ARCH_CAP_IBRS_ALL BIT(1) /* Enhanced IBRS support */
+#define ARCH_CAP_RSBA BIT(2) /* RET may use alternative branch predictors */
+#define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH BIT(3) /* Skip L1D flush on vmentry */
+#define ARCH_CAP_SSB_NO BIT(4) /*
+ * Not susceptible to Speculative Store Bypass
+ * attack, so no Speculative Store Bypass
+ * control required.
+ */
+#define ARCH_CAP_MDS_NO BIT(5) /*
+ * Not susceptible to
+ * Microarchitectural Data
+ * Sampling (MDS) vulnerabilities.
+ */
+#define ARCH_CAP_PSCHANGE_MC_NO BIT(6) /*
+ * The processor is not susceptible to a
+ * machine check error due to modifying the
+ * code page size along with either the
+ * physical address or cache type
+ * without TLB invalidation.
+ */
+#define ARCH_CAP_TSX_CTRL_MSR BIT(7) /* MSR for TSX control is available. */
+#define ARCH_CAP_TAA_NO BIT(8) /*
+ * Not susceptible to
+ * TSX Async Abort (TAA) vulnerabilities.
+ */
+#define ARCH_CAP_SBDR_SSDP_NO BIT(13) /*
+ * Not susceptible to SBDR and SSDP
+ * variants of Processor MMIO stale data
+ * vulnerabilities.
+ */
+#define ARCH_CAP_FBSDP_NO BIT(14) /*
+ * Not susceptible to FBSDP variant of
+ * Processor MMIO stale data
+ * vulnerabilities.
+ */
+#define ARCH_CAP_PSDP_NO BIT(15) /*
+ * Not susceptible to PSDP variant of
+ * Processor MMIO stale data
+ * vulnerabilities.
+ */
+#define ARCH_CAP_MCU_ENUM BIT(16) /*
+ * Indicates the presence of microcode update
+ * feature enumeration and status information.
+ */
+#define ARCH_CAP_FB_CLEAR BIT(17) /*
+ * VERW clears CPU fill buffer
+ * even on MDS_NO CPUs.
+ */
+#define ARCH_CAP_FB_CLEAR_CTRL BIT(18) /*
+ * MSR_IA32_MCU_OPT_CTRL[FB_CLEAR_DIS]
+ * bit available to control VERW
+ * behavior.
+ */
+#define ARCH_CAP_RRSBA BIT(19) /*
+ * Indicates RET may use predictors
+ * other than the RSB. With eIBRS
+ * enabled predictions in kernel mode
+ * are restricted to targets in
+ * kernel.
+ */
+#define ARCH_CAP_BHI_NO BIT(20) /*
+ * CPU is not affected by Branch
+ * History Injection.
+ */
+#define ARCH_CAP_XAPIC_DISABLE BIT(21) /*
+ * IA32_XAPIC_DISABLE_STATUS MSR
+ * supported
+ */
+#define ARCH_CAP_PBRSB_NO BIT(24) /*
+ * Not susceptible to Post-Barrier
+ * Return Stack Buffer Predictions.
+ */
+#define ARCH_CAP_GDS_CTRL BIT(25) /*
+ * CPU is vulnerable to Gather
+ * Data Sampling (GDS) and
+ * has controls for mitigation.
+ */
+#define ARCH_CAP_GDS_NO BIT(26) /*
+ * CPU is not vulnerable to Gather
+ * Data Sampling (GDS).
+ */
+#define ARCH_CAP_RFDS_NO BIT(27) /*
+ * Not susceptible to Register
+ * File Data Sampling.
+ */
+#define ARCH_CAP_RFDS_CLEAR BIT(28) /*
+ * VERW clears CPU Register
+ * File.
+ */
+#define ARCH_CAP_ITS_NO BIT_ULL(62) /*
+ * Not susceptible to
+ * Indirect Target Selection.
+ * This bit is not set by
+ * HW, but is synthesized by
+ * VMMs for guests to know
+ * their affected status.
+ */
+
+#define MSR_IA32_FLUSH_CMD 0x0000010b
+#define L1D_FLUSH BIT(0) /*
+ * Writeback and invalidate the
+ * L1 data cache.
+ */
+
#define MSR_IA32_BBL_CR_CTL 0x00000119
+#define MSR_IA32_BBL_CR_CTL3 0x0000011e
+
+#define MSR_IA32_TSX_CTRL 0x00000122
+#define TSX_CTRL_RTM_DISABLE BIT(0) /* Disable RTM feature */
+#define TSX_CTRL_CPUID_CLEAR BIT(1) /* Disable TSX enumeration */
+
+#define MSR_IA32_MCU_OPT_CTRL 0x00000123
+#define RNGDS_MITG_DIS BIT(0) /* SRBDS support */
+#define RTM_ALLOW BIT(1) /* TSX development mode */
+#define FB_CLEAR_DIS BIT(3) /* CPU Fill buffer clear disable */
+#define GDS_MITG_DIS BIT(4) /* Disable GDS mitigation */
+#define GDS_MITG_LOCKED BIT(5) /* GDS mitigation locked */
#define MSR_IA32_SYSENTER_CS 0x00000174
#define MSR_IA32_SYSENTER_ESP 0x00000175
@@ -43,10 +251,157 @@
#define MSR_IA32_MCG_CAP 0x00000179
#define MSR_IA32_MCG_STATUS 0x0000017a
#define MSR_IA32_MCG_CTL 0x0000017b
+#define MSR_ERROR_CONTROL 0x0000017f
+#define MSR_IA32_MCG_EXT_CTL 0x000004d0
+
+#define MSR_OFFCORE_RSP_0 0x000001a6
+#define MSR_OFFCORE_RSP_1 0x000001a7
+#define MSR_TURBO_RATIO_LIMIT 0x000001ad
+#define MSR_TURBO_RATIO_LIMIT1 0x000001ae
+#define MSR_TURBO_RATIO_LIMIT2 0x000001af
+
+#define MSR_SNOOP_RSP_0 0x00001328
+#define MSR_SNOOP_RSP_1 0x00001329
+
+#define MSR_LBR_SELECT 0x000001c8
+#define MSR_LBR_TOS 0x000001c9
+
+#define MSR_IA32_POWER_CTL 0x000001fc
+#define MSR_IA32_POWER_CTL_BIT_EE 19
+
+/* Abbreviated from Intel SDM name IA32_INTEGRITY_CAPABILITIES */
+#define MSR_INTEGRITY_CAPS 0x000002d9
+#define MSR_INTEGRITY_CAPS_ARRAY_BIST_BIT 2
+#define MSR_INTEGRITY_CAPS_ARRAY_BIST BIT(MSR_INTEGRITY_CAPS_ARRAY_BIST_BIT)
+#define MSR_INTEGRITY_CAPS_PERIODIC_BIST_BIT 4
+#define MSR_INTEGRITY_CAPS_PERIODIC_BIST BIT(MSR_INTEGRITY_CAPS_PERIODIC_BIST_BIT)
+#define MSR_INTEGRITY_CAPS_SBAF_BIT 8
+#define MSR_INTEGRITY_CAPS_SBAF BIT(MSR_INTEGRITY_CAPS_SBAF_BIT)
+#define MSR_INTEGRITY_CAPS_SAF_GEN_MASK GENMASK_ULL(10, 9)
+
+#define MSR_LBR_NHM_FROM 0x00000680
+#define MSR_LBR_NHM_TO 0x000006c0
+#define MSR_LBR_CORE_FROM 0x00000040
+#define MSR_LBR_CORE_TO 0x00000060
+
+#define MSR_LBR_INFO_0 0x00000dc0 /* ... 0xddf for _31 */
+#define LBR_INFO_MISPRED BIT_ULL(63)
+#define LBR_INFO_IN_TX BIT_ULL(62)
+#define LBR_INFO_ABORT BIT_ULL(61)
+#define LBR_INFO_CYC_CNT_VALID BIT_ULL(60)
+#define LBR_INFO_CYCLES 0xffff
+#define LBR_INFO_BR_TYPE_OFFSET 56
+#define LBR_INFO_BR_TYPE (0xfull << LBR_INFO_BR_TYPE_OFFSET)
+#define LBR_INFO_BR_CNTR_OFFSET 32
+#define LBR_INFO_BR_CNTR_NUM 4
+#define LBR_INFO_BR_CNTR_BITS 2
+#define LBR_INFO_BR_CNTR_MASK GENMASK_ULL(LBR_INFO_BR_CNTR_BITS - 1, 0)
+#define LBR_INFO_BR_CNTR_FULL_MASK GENMASK_ULL(LBR_INFO_BR_CNTR_NUM * LBR_INFO_BR_CNTR_BITS - 1, 0)
+
+#define MSR_ARCH_LBR_CTL 0x000014ce
+#define ARCH_LBR_CTL_LBREN BIT(0)
+#define ARCH_LBR_CTL_CPL_OFFSET 1
+#define ARCH_LBR_CTL_CPL (0x3ull << ARCH_LBR_CTL_CPL_OFFSET)
+#define ARCH_LBR_CTL_STACK_OFFSET 3
+#define ARCH_LBR_CTL_STACK (0x1ull << ARCH_LBR_CTL_STACK_OFFSET)
+#define ARCH_LBR_CTL_FILTER_OFFSET 16
+#define ARCH_LBR_CTL_FILTER (0x7full << ARCH_LBR_CTL_FILTER_OFFSET)
+#define MSR_ARCH_LBR_DEPTH 0x000014cf
+#define MSR_ARCH_LBR_FROM_0 0x00001500
+#define MSR_ARCH_LBR_TO_0 0x00001600
+#define MSR_ARCH_LBR_INFO_0 0x00001200
#define MSR_IA32_PEBS_ENABLE 0x000003f1
+#define MSR_PEBS_DATA_CFG 0x000003f2
#define MSR_IA32_DS_AREA 0x00000600
#define MSR_IA32_PERF_CAPABILITIES 0x00000345
+#define PERF_CAP_METRICS_IDX 15
+#define PERF_CAP_PT_IDX 16
+
+#define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6
+
+#define PERF_CAP_LBR_FMT 0x3f
+#define PERF_CAP_PEBS_TRAP BIT_ULL(6)
+#define PERF_CAP_ARCH_REG BIT_ULL(7)
+#define PERF_CAP_PEBS_FORMAT 0xf00
+#define PERF_CAP_FW_WRITES BIT_ULL(13)
+#define PERF_CAP_PEBS_BASELINE BIT_ULL(14)
+#define PERF_CAP_PEBS_TIMING_INFO BIT_ULL(17)
+#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \
+ PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE | \
+ PERF_CAP_PEBS_TIMING_INFO)
+
+/* Arch PEBS */
+#define MSR_IA32_PEBS_BASE 0x000003f4
+#define MSR_IA32_PEBS_INDEX 0x000003f5
+#define ARCH_PEBS_OFFSET_MASK 0x7fffff
+#define ARCH_PEBS_INDEX_WR_SHIFT 4
+
+#define ARCH_PEBS_RELOAD 0xffffffff
+#define ARCH_PEBS_CNTR_ALLOW BIT_ULL(35)
+#define ARCH_PEBS_CNTR_GP BIT_ULL(36)
+#define ARCH_PEBS_CNTR_FIXED BIT_ULL(37)
+#define ARCH_PEBS_CNTR_METRICS BIT_ULL(38)
+#define ARCH_PEBS_LBR_SHIFT 40
+#define ARCH_PEBS_LBR (0x3ull << ARCH_PEBS_LBR_SHIFT)
+#define ARCH_PEBS_VECR_XMM BIT_ULL(49)
+#define ARCH_PEBS_GPR BIT_ULL(61)
+#define ARCH_PEBS_AUX BIT_ULL(62)
+#define ARCH_PEBS_EN BIT_ULL(63)
+#define ARCH_PEBS_CNTR_MASK (ARCH_PEBS_CNTR_GP | ARCH_PEBS_CNTR_FIXED | \
+ ARCH_PEBS_CNTR_METRICS)
+
+#define MSR_IA32_RTIT_CTL 0x00000570
+#define RTIT_CTL_TRACEEN BIT(0)
+#define RTIT_CTL_CYCLEACC BIT(1)
+#define RTIT_CTL_OS BIT(2)
+#define RTIT_CTL_USR BIT(3)
+#define RTIT_CTL_PWR_EVT_EN BIT(4)
+#define RTIT_CTL_FUP_ON_PTW BIT(5)
+#define RTIT_CTL_FABRIC_EN BIT(6)
+#define RTIT_CTL_CR3EN BIT(7)
+#define RTIT_CTL_TOPA BIT(8)
+#define RTIT_CTL_MTC_EN BIT(9)
+#define RTIT_CTL_TSC_EN BIT(10)
+#define RTIT_CTL_DISRETC BIT(11)
+#define RTIT_CTL_PTW_EN BIT(12)
+#define RTIT_CTL_BRANCH_EN BIT(13)
+#define RTIT_CTL_EVENT_EN BIT(31)
+#define RTIT_CTL_NOTNT BIT_ULL(55)
+#define RTIT_CTL_MTC_RANGE_OFFSET 14
+#define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET)
+#define RTIT_CTL_CYC_THRESH_OFFSET 19
+#define RTIT_CTL_CYC_THRESH (0x0full << RTIT_CTL_CYC_THRESH_OFFSET)
+#define RTIT_CTL_PSB_FREQ_OFFSET 24
+#define RTIT_CTL_PSB_FREQ (0x0full << RTIT_CTL_PSB_FREQ_OFFSET)
+#define RTIT_CTL_ADDR0_OFFSET 32
+#define RTIT_CTL_ADDR0 (0x0full << RTIT_CTL_ADDR0_OFFSET)
+#define RTIT_CTL_ADDR1_OFFSET 36
+#define RTIT_CTL_ADDR1 (0x0full << RTIT_CTL_ADDR1_OFFSET)
+#define RTIT_CTL_ADDR2_OFFSET 40
+#define RTIT_CTL_ADDR2 (0x0full << RTIT_CTL_ADDR2_OFFSET)
+#define RTIT_CTL_ADDR3_OFFSET 44
+#define RTIT_CTL_ADDR3 (0x0full << RTIT_CTL_ADDR3_OFFSET)
+#define MSR_IA32_RTIT_STATUS 0x00000571
+#define RTIT_STATUS_FILTEREN BIT(0)
+#define RTIT_STATUS_CONTEXTEN BIT(1)
+#define RTIT_STATUS_TRIGGEREN BIT(2)
+#define RTIT_STATUS_BUFFOVF BIT(3)
+#define RTIT_STATUS_ERROR BIT(4)
+#define RTIT_STATUS_STOPPED BIT(5)
+#define RTIT_STATUS_BYTECNT_OFFSET 32
+#define RTIT_STATUS_BYTECNT (0x1ffffull << RTIT_STATUS_BYTECNT_OFFSET)
+#define MSR_IA32_RTIT_ADDR0_A 0x00000580
+#define MSR_IA32_RTIT_ADDR0_B 0x00000581
+#define MSR_IA32_RTIT_ADDR1_A 0x00000582
+#define MSR_IA32_RTIT_ADDR1_B 0x00000583
+#define MSR_IA32_RTIT_ADDR2_A 0x00000584
+#define MSR_IA32_RTIT_ADDR2_B 0x00000585
+#define MSR_IA32_RTIT_ADDR3_A 0x00000586
+#define MSR_IA32_RTIT_ADDR3_B 0x00000587
+#define MSR_IA32_RTIT_CR3_MATCH 0x00000572
+#define MSR_IA32_RTIT_OUTPUT_BASE 0x00000560
+#define MSR_IA32_RTIT_OUTPUT_MASK 0x00000561
#define MSR_MTRRfix64K_00000 0x00000250
#define MSR_MTRRfix16K_80000 0x00000258
@@ -63,43 +418,268 @@
#define MSR_IA32_CR_PAT 0x00000277
+#define PAT_VALUE(p0, p1, p2, p3, p4, p5, p6, p7) \
+ ((X86_MEMTYPE_ ## p0) | (X86_MEMTYPE_ ## p1 << 8) | \
+ (X86_MEMTYPE_ ## p2 << 16) | (X86_MEMTYPE_ ## p3 << 24) | \
+ (X86_MEMTYPE_ ## p4 << 32) | (X86_MEMTYPE_ ## p5 << 40) | \
+ (X86_MEMTYPE_ ## p6 << 48) | (X86_MEMTYPE_ ## p7 << 56))
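/*
 * Illustrative worked example (not part of the patch): with the
 * X86_MEMTYPE_* values defined earlier, an arbitrary PAT layout such as
 *
 *	PAT_VALUE(WB, WC, UC_MINUS, UC, WB, WC, UC_MINUS, UC)
 *
 * expands to 0x0007010600070106ULL -- one memory-type byte per PAT entry,
 * entry 0 in bits 7:0 and entry 7 in bits 63:56. The argument list here is
 * only an example, not necessarily the kernel's default PAT programming.
 */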
+
#define MSR_IA32_DEBUGCTLMSR 0x000001d9
#define MSR_IA32_LASTBRANCHFROMIP 0x000001db
#define MSR_IA32_LASTBRANCHTOIP 0x000001dc
#define MSR_IA32_LASTINTFROMIP 0x000001dd
#define MSR_IA32_LASTINTTOIP 0x000001de
+#define MSR_IA32_PASID 0x00000d93
+#define MSR_IA32_PASID_VALID BIT_ULL(31)
+
/* DEBUGCTLMSR bits (others vary by model): */
-#define _DEBUGCTLMSR_LBR 0 /* last branch recording */
-#define _DEBUGCTLMSR_BTF 1 /* single-step on branches */
+#define DEBUGCTLMSR_LBR_BIT 0 /* last branch recording */
+#define DEBUGCTLMSR_LBR (1UL << DEBUGCTLMSR_LBR_BIT)
+#define DEBUGCTLMSR_BTF_SHIFT 1
+#define DEBUGCTLMSR_BTF (1UL << 1) /* single-step on branches */
+#define DEBUGCTLMSR_BUS_LOCK_DETECT (1UL << 2)
+#define DEBUGCTLMSR_TR (1UL << 6)
+#define DEBUGCTLMSR_BTS (1UL << 7)
+#define DEBUGCTLMSR_BTINT (1UL << 8)
+#define DEBUGCTLMSR_BTS_OFF_OS (1UL << 9)
+#define DEBUGCTLMSR_BTS_OFF_USR (1UL << 10)
+#define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI (1UL << 11)
+#define DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI (1UL << 12)
+#define DEBUGCTLMSR_FREEZE_IN_SMM_BIT 14
+#define DEBUGCTLMSR_FREEZE_IN_SMM (1UL << DEBUGCTLMSR_FREEZE_IN_SMM_BIT)
+#define DEBUGCTLMSR_RTM_DEBUG BIT(15)
-#define DEBUGCTLMSR_LBR (1UL << _DEBUGCTLMSR_LBR)
-#define DEBUGCTLMSR_BTF (1UL << _DEBUGCTLMSR_BTF)
+#define MSR_PEBS_FRONTEND 0x000003f7
#define MSR_IA32_MC0_CTL 0x00000400
#define MSR_IA32_MC0_STATUS 0x00000401
#define MSR_IA32_MC0_ADDR 0x00000402
#define MSR_IA32_MC0_MISC 0x00000403
+/* C-state Residency Counters */
+#define MSR_PKG_C3_RESIDENCY 0x000003f8
+#define MSR_PKG_C6_RESIDENCY 0x000003f9
+#define MSR_ATOM_PKG_C6_RESIDENCY 0x000003fa
+#define MSR_PKG_C7_RESIDENCY 0x000003fa
+#define MSR_CORE_C3_RESIDENCY 0x000003fc
+#define MSR_CORE_C6_RESIDENCY 0x000003fd
+#define MSR_CORE_C7_RESIDENCY 0x000003fe
+#define MSR_KNL_CORE_C6_RESIDENCY 0x000003ff
+#define MSR_PKG_C2_RESIDENCY 0x0000060d
+#define MSR_PKG_C8_RESIDENCY 0x00000630
+#define MSR_PKG_C9_RESIDENCY 0x00000631
+#define MSR_PKG_C10_RESIDENCY 0x00000632
+
+/* Interrupt Response Limit */
+#define MSR_PKGC3_IRTL 0x0000060a
+#define MSR_PKGC6_IRTL 0x0000060b
+#define MSR_PKGC7_IRTL 0x0000060c
+#define MSR_PKGC8_IRTL 0x00000633
+#define MSR_PKGC9_IRTL 0x00000634
+#define MSR_PKGC10_IRTL 0x00000635
+
+/* Run Time Average Power Limiting (RAPL) Interface */
+
+#define MSR_VR_CURRENT_CONFIG 0x00000601
+#define MSR_RAPL_POWER_UNIT 0x00000606
+
+#define MSR_PKG_POWER_LIMIT 0x00000610
+#define MSR_PKG_ENERGY_STATUS 0x00000611
+#define MSR_PKG_PERF_STATUS 0x00000613
+#define MSR_PKG_POWER_INFO 0x00000614
+
+#define MSR_DRAM_POWER_LIMIT 0x00000618
+#define MSR_DRAM_ENERGY_STATUS 0x00000619
+#define MSR_DRAM_PERF_STATUS 0x0000061b
+#define MSR_DRAM_POWER_INFO 0x0000061c
+
+#define MSR_PP0_POWER_LIMIT 0x00000638
+#define MSR_PP0_ENERGY_STATUS 0x00000639
+#define MSR_PP0_POLICY 0x0000063a
+#define MSR_PP0_PERF_STATUS 0x0000063b
+
+#define MSR_PP1_POWER_LIMIT 0x00000640
+#define MSR_PP1_ENERGY_STATUS 0x00000641
+#define MSR_PP1_POLICY 0x00000642
+
+#define MSR_AMD_RAPL_POWER_UNIT 0xc0010299
+#define MSR_AMD_CORE_ENERGY_STATUS 0xc001029a
+#define MSR_AMD_PKG_ENERGY_STATUS 0xc001029b
+
+/* Config TDP MSRs */
+#define MSR_CONFIG_TDP_NOMINAL 0x00000648
+#define MSR_CONFIG_TDP_LEVEL_1 0x00000649
+#define MSR_CONFIG_TDP_LEVEL_2 0x0000064A
+#define MSR_CONFIG_TDP_CONTROL 0x0000064B
+#define MSR_TURBO_ACTIVATION_RATIO 0x0000064C
+
+#define MSR_PLATFORM_ENERGY_STATUS 0x0000064D
+#define MSR_SECONDARY_TURBO_RATIO_LIMIT 0x00000650
+
+#define MSR_PKG_WEIGHTED_CORE_C0_RES 0x00000658
+#define MSR_PKG_ANY_CORE_C0_RES 0x00000659
+#define MSR_PKG_ANY_GFXE_C0_RES 0x0000065A
+#define MSR_PKG_BOTH_CORE_GFXE_C0_RES 0x0000065B
+
+#define MSR_CORE_C1_RES 0x00000660
+#define MSR_MODULE_C6_RES_MS 0x00000664
+
+#define MSR_CC6_DEMOTION_POLICY_CONFIG 0x00000668
+#define MSR_MC6_DEMOTION_POLICY_CONFIG 0x00000669
+
+#define MSR_ATOM_CORE_RATIOS 0x0000066a
+#define MSR_ATOM_CORE_VIDS 0x0000066b
+#define MSR_ATOM_CORE_TURBO_RATIOS 0x0000066c
+#define MSR_ATOM_CORE_TURBO_VIDS 0x0000066d
+
+#define MSR_CORE_PERF_LIMIT_REASONS 0x00000690
+#define MSR_GFX_PERF_LIMIT_REASONS 0x000006B0
+#define MSR_RING_PERF_LIMIT_REASONS 0x000006B1
+
+/* Control-flow Enforcement Technology MSRs */
+#define MSR_IA32_U_CET 0x000006a0 /* user mode cet */
+#define MSR_IA32_S_CET 0x000006a2 /* kernel mode cet */
+#define CET_SHSTK_EN BIT_ULL(0)
+#define CET_WRSS_EN BIT_ULL(1)
+#define CET_ENDBR_EN BIT_ULL(2)
+#define CET_LEG_IW_EN BIT_ULL(3)
+#define CET_NO_TRACK_EN BIT_ULL(4)
+#define CET_SUPPRESS_DISABLE BIT_ULL(5)
+#define CET_RESERVED (BIT_ULL(6) | BIT_ULL(7) | BIT_ULL(8) | BIT_ULL(9))
+#define CET_SUPPRESS BIT_ULL(10)
+#define CET_WAIT_ENDBR BIT_ULL(11)
+
+#define MSR_IA32_PL0_SSP 0x000006a4 /* ring-0 shadow stack pointer */
+#define MSR_IA32_PL1_SSP 0x000006a5 /* ring-1 shadow stack pointer */
+#define MSR_IA32_PL2_SSP 0x000006a6 /* ring-2 shadow stack pointer */
+#define MSR_IA32_PL3_SSP 0x000006a7 /* ring-3 shadow stack pointer */
+#define MSR_IA32_INT_SSP_TAB 0x000006a8 /* exception shadow stack table */
+
+/* Hardware P state interface */
+#define MSR_PPERF 0x0000064e
+#define MSR_PERF_LIMIT_REASONS 0x0000064f
+#define MSR_PM_ENABLE 0x00000770
+#define MSR_HWP_CAPABILITIES 0x00000771
+#define MSR_HWP_REQUEST_PKG 0x00000772
+#define MSR_HWP_INTERRUPT 0x00000773
+#define MSR_HWP_REQUEST 0x00000774
+#define MSR_HWP_STATUS 0x00000777
+
+/* CPUID.6.EAX */
+#define HWP_BASE_BIT (1<<7)
+#define HWP_NOTIFICATIONS_BIT (1<<8)
+#define HWP_ACTIVITY_WINDOW_BIT (1<<9)
+#define HWP_ENERGY_PERF_PREFERENCE_BIT (1<<10)
+#define HWP_PACKAGE_LEVEL_REQUEST_BIT (1<<11)
+
+/* IA32_HWP_CAPABILITIES */
+#define HWP_HIGHEST_PERF(x) (((x) >> 0) & 0xff)
+#define HWP_GUARANTEED_PERF(x) (((x) >> 8) & 0xff)
+#define HWP_MOSTEFFICIENT_PERF(x) (((x) >> 16) & 0xff)
+#define HWP_LOWEST_PERF(x) (((x) >> 24) & 0xff)
+
+/* IA32_HWP_REQUEST */
+#define HWP_MIN_PERF(x) (x & 0xff)
+#define HWP_MAX_PERF(x) ((x & 0xff) << 8)
+#define HWP_DESIRED_PERF(x) ((x & 0xff) << 16)
+#define HWP_ENERGY_PERF_PREFERENCE(x) (((u64)x & 0xff) << 24)
+#define HWP_EPP_PERFORMANCE 0x00
+#define HWP_EPP_BALANCE_PERFORMANCE 0x80
+#define HWP_EPP_BALANCE_POWERSAVE 0xC0
+#define HWP_EPP_POWERSAVE 0xFF
+#define HWP_ACTIVITY_WINDOW(x) ((u64)(x & 0xff3) << 32)
+#define HWP_PACKAGE_CONTROL(x) ((u64)(x & 0x1) << 42)
+
+/* IA32_HWP_STATUS */
+#define HWP_GUARANTEED_CHANGE(x) (x & 0x1)
+#define HWP_EXCURSION_TO_MINIMUM(x) (x & 0x4)
+
+/* IA32_HWP_INTERRUPT */
+#define HWP_CHANGE_TO_GUARANTEED_INT(x) (x & 0x1)
+#define HWP_EXCURSION_TO_MINIMUM_INT(x) (x & 0x2)
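/*
 * Illustrative sketch (not part of the patch): composing an IA32_HWP_REQUEST
 * value from the helpers above. The performance levels are arbitrary and
 * hwp_example_request() is a hypothetical helper.
 */
static inline u64 hwp_example_request(void)
{
	/* min 0x10, max 0xff, no desired hint, balance-performance EPP */
	return HWP_MIN_PERF(0x10) | HWP_MAX_PERF(0xff) |
	       HWP_DESIRED_PERF(0) |
	       HWP_ENERGY_PERF_PREFERENCE(HWP_EPP_BALANCE_PERFORMANCE);
	/* == 0x000000008000ff10 */
}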
+
+#define MSR_AMD64_MC0_MASK 0xc0010044
+
+#define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x))
+#define MSR_IA32_MCx_STATUS(x) (MSR_IA32_MC0_STATUS + 4*(x))
+#define MSR_IA32_MCx_ADDR(x) (MSR_IA32_MC0_ADDR + 4*(x))
+#define MSR_IA32_MCx_MISC(x) (MSR_IA32_MC0_MISC + 4*(x))
+
+#define MSR_AMD64_MCx_MASK(x) (MSR_AMD64_MC0_MASK + (x))
+
/* These are consecutive and not in the normal 4er MCE bank block */
#define MSR_IA32_MC0_CTL2 0x00000280
-#define CMCI_EN (1ULL << 30)
-#define CMCI_THRESHOLD_MASK 0xffffULL
+#define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x))
#define MSR_P6_PERFCTR0 0x000000c1
#define MSR_P6_PERFCTR1 0x000000c2
#define MSR_P6_EVNTSEL0 0x00000186
#define MSR_P6_EVNTSEL1 0x00000187
-/* AMD64 MSRs. Not complete. See the architecture manual for a more
- complete list. */
+#define MSR_KNC_PERFCTR0 0x00000020
+#define MSR_KNC_PERFCTR1 0x00000021
+#define MSR_KNC_EVNTSEL0 0x00000028
+#define MSR_KNC_EVNTSEL1 0x00000029
+
+/* Alternative perfctr range with full access. */
+#define MSR_IA32_PMC0 0x000004c1
+/* Auto-reload via MSR instead of DS area */
+#define MSR_RELOAD_PMC0 0x000014c1
+#define MSR_RELOAD_FIXED_CTR0 0x00001309
+
+/* V6 PMON MSR range */
+#define MSR_IA32_PMC_V6_GP0_CTR 0x1900
+#define MSR_IA32_PMC_V6_GP0_CFG_A 0x1901
+#define MSR_IA32_PMC_V6_GP0_CFG_B 0x1902
+#define MSR_IA32_PMC_V6_GP0_CFG_C 0x1903
+#define MSR_IA32_PMC_V6_FX0_CTR 0x1980
+#define MSR_IA32_PMC_V6_FX0_CFG_B 0x1982
+#define MSR_IA32_PMC_V6_FX0_CFG_C 0x1983
+#define MSR_IA32_PMC_V6_STEP 4
+
+/* KeyID partitioning between MKTME and TDX */
+#define MSR_IA32_MKTME_KEYID_PARTITIONING 0x00000087
+
+/*
+ * AMD64 MSRs. Not complete. See the architecture manual for a more
+ * complete list.
+ */
#define MSR_AMD64_PATCH_LEVEL 0x0000008b
+#define MSR_AMD64_TSC_RATIO 0xc0000104
#define MSR_AMD64_NB_CFG 0xc001001f
#define MSR_AMD64_PATCH_LOADER 0xc0010020
+#define MSR_AMD_PERF_CTL 0xc0010062
+#define MSR_AMD_PERF_STATUS 0xc0010063
+#define MSR_AMD_PSTATE_DEF_BASE 0xc0010064
+#define MSR_AMD64_GUEST_TSC_FREQ 0xc0010134
+#define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140
+#define MSR_AMD64_OSVW_STATUS 0xc0010141
+#define MSR_AMD_PPIN_CTL 0xc00102f0
+#define MSR_AMD_PPIN 0xc00102f1
+#define MSR_AMD64_CPUID_FN_7 0xc0011002
+#define MSR_AMD64_CPUID_FN_1 0xc0011004
+
+#define MSR_AMD64_CPUID_EXT_FEAT 0xc0011005
+#define MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT_BIT 54
+#define MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT BIT_ULL(MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT_BIT)
+
+#define MSR_AMD64_LS_CFG 0xc0011020
+#define MSR_AMD64_DC_CFG 0xc0011022
+#define MSR_AMD64_TW_CFG 0xc0011023
+
+#define MSR_AMD64_DE_CFG 0xc0011029
+#define MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT 1
+#define MSR_AMD64_DE_CFG_LFENCE_SERIALIZE BIT_ULL(MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT)
+#define MSR_AMD64_DE_CFG_ZEN2_FP_BACKUP_FIX_BIT 9
+
+#define MSR_AMD64_BU_CFG2 0xc001102a
#define MSR_AMD64_IBSFETCHCTL 0xc0011030
#define MSR_AMD64_IBSFETCHLINAD 0xc0011031
#define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032
+#define MSR_AMD64_IBSFETCH_REG_COUNT 3
+#define MSR_AMD64_IBSFETCH_REG_MASK ((1UL<<MSR_AMD64_IBSFETCH_REG_COUNT)-1)
#define MSR_AMD64_IBSOPCTL 0xc0011033
#define MSR_AMD64_IBSOPRIP 0xc0011034
#define MSR_AMD64_IBSOPDATA 0xc0011035
@@ -107,24 +687,179 @@
#define MSR_AMD64_IBSOPDATA3 0xc0011037
#define MSR_AMD64_IBSDCLINAD 0xc0011038
#define MSR_AMD64_IBSDCPHYSAD 0xc0011039
+#define MSR_AMD64_IBSOP_REG_COUNT 7
+#define MSR_AMD64_IBSOP_REG_MASK ((1UL<<MSR_AMD64_IBSOP_REG_COUNT)-1)
#define MSR_AMD64_IBSCTL 0xc001103a
+#define MSR_AMD64_IBSBRTARGET 0xc001103b
+#define MSR_AMD64_ICIBSEXTDCTL 0xc001103c
+#define MSR_AMD64_IBSOPDATA4 0xc001103d
+#define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */
+#define MSR_AMD64_SVM_AVIC_DOORBELL 0xc001011b
+#define MSR_AMD64_VM_PAGE_FLUSH 0xc001011e
+#define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f
+#define MSR_AMD64_SEV_ES_GHCB 0xc0010130
+#define MSR_AMD64_SEV 0xc0010131
+#define MSR_AMD64_SEV_ENABLED_BIT 0
+#define MSR_AMD64_SEV_ENABLED BIT_ULL(MSR_AMD64_SEV_ENABLED_BIT)
+#define MSR_AMD64_SEV_ES_ENABLED_BIT 1
+#define MSR_AMD64_SEV_ES_ENABLED BIT_ULL(MSR_AMD64_SEV_ES_ENABLED_BIT)
+#define MSR_AMD64_SEV_SNP_ENABLED_BIT 2
+#define MSR_AMD64_SEV_SNP_ENABLED BIT_ULL(MSR_AMD64_SEV_SNP_ENABLED_BIT)
+#define MSR_AMD64_SNP_VTOM_BIT 3
+#define MSR_AMD64_SNP_VTOM BIT_ULL(MSR_AMD64_SNP_VTOM_BIT)
+#define MSR_AMD64_SNP_REFLECT_VC_BIT 4
+#define MSR_AMD64_SNP_REFLECT_VC BIT_ULL(MSR_AMD64_SNP_REFLECT_VC_BIT)
+#define MSR_AMD64_SNP_RESTRICTED_INJ_BIT 5
+#define MSR_AMD64_SNP_RESTRICTED_INJ BIT_ULL(MSR_AMD64_SNP_RESTRICTED_INJ_BIT)
+#define MSR_AMD64_SNP_ALT_INJ_BIT 6
+#define MSR_AMD64_SNP_ALT_INJ BIT_ULL(MSR_AMD64_SNP_ALT_INJ_BIT)
+#define MSR_AMD64_SNP_DEBUG_SWAP_BIT 7
+#define MSR_AMD64_SNP_DEBUG_SWAP BIT_ULL(MSR_AMD64_SNP_DEBUG_SWAP_BIT)
+#define MSR_AMD64_SNP_PREVENT_HOST_IBS_BIT 8
+#define MSR_AMD64_SNP_PREVENT_HOST_IBS BIT_ULL(MSR_AMD64_SNP_PREVENT_HOST_IBS_BIT)
+#define MSR_AMD64_SNP_BTB_ISOLATION_BIT 9
+#define MSR_AMD64_SNP_BTB_ISOLATION BIT_ULL(MSR_AMD64_SNP_BTB_ISOLATION_BIT)
+#define MSR_AMD64_SNP_VMPL_SSS_BIT 10
+#define MSR_AMD64_SNP_VMPL_SSS BIT_ULL(MSR_AMD64_SNP_VMPL_SSS_BIT)
+#define MSR_AMD64_SNP_SECURE_TSC_BIT 11
+#define MSR_AMD64_SNP_SECURE_TSC BIT_ULL(MSR_AMD64_SNP_SECURE_TSC_BIT)
+#define MSR_AMD64_SNP_VMGEXIT_PARAM_BIT 12
+#define MSR_AMD64_SNP_VMGEXIT_PARAM BIT_ULL(MSR_AMD64_SNP_VMGEXIT_PARAM_BIT)
+#define MSR_AMD64_SNP_RESERVED_BIT13 BIT_ULL(13)
+#define MSR_AMD64_SNP_IBS_VIRT_BIT 14
+#define MSR_AMD64_SNP_IBS_VIRT BIT_ULL(MSR_AMD64_SNP_IBS_VIRT_BIT)
+#define MSR_AMD64_SNP_RESERVED_BIT15 BIT_ULL(15)
+#define MSR_AMD64_SNP_VMSA_REG_PROT_BIT 16
+#define MSR_AMD64_SNP_VMSA_REG_PROT BIT_ULL(MSR_AMD64_SNP_VMSA_REG_PROT_BIT)
+#define MSR_AMD64_SNP_SMT_PROT_BIT 17
+#define MSR_AMD64_SNP_SMT_PROT BIT_ULL(MSR_AMD64_SNP_SMT_PROT_BIT)
+#define MSR_AMD64_SNP_SECURE_AVIC_BIT 18
+#define MSR_AMD64_SNP_SECURE_AVIC BIT_ULL(MSR_AMD64_SNP_SECURE_AVIC_BIT)
+#define MSR_AMD64_SNP_RESV_BIT 19
+#define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, MSR_AMD64_SNP_RESV_BIT)
+#define MSR_AMD64_SAVIC_CONTROL 0xc0010138
+#define MSR_AMD64_SAVIC_EN_BIT 0
+#define MSR_AMD64_SAVIC_EN BIT_ULL(MSR_AMD64_SAVIC_EN_BIT)
+#define MSR_AMD64_SAVIC_ALLOWEDNMI_BIT 1
+#define MSR_AMD64_SAVIC_ALLOWEDNMI BIT_ULL(MSR_AMD64_SAVIC_ALLOWEDNMI_BIT)
+#define MSR_AMD64_RMP_BASE 0xc0010132
+#define MSR_AMD64_RMP_END 0xc0010133
+#define MSR_AMD64_RMP_CFG 0xc0010136
+#define MSR_AMD64_SEG_RMP_ENABLED_BIT 0
+#define MSR_AMD64_SEG_RMP_ENABLED BIT_ULL(MSR_AMD64_SEG_RMP_ENABLED_BIT)
+#define MSR_AMD64_RMP_SEGMENT_SHIFT(x) (((x) & GENMASK_ULL(13, 8)) >> 8)
+
+#define MSR_SVSM_CAA 0xc001f000
+
+/* AMD Collaborative Processor Performance Control MSRs */
+#define MSR_AMD_CPPC_CAP1 0xc00102b0
+#define MSR_AMD_CPPC_ENABLE 0xc00102b1
+#define MSR_AMD_CPPC_CAP2 0xc00102b2
+#define MSR_AMD_CPPC_REQ 0xc00102b3
+#define MSR_AMD_CPPC_STATUS 0xc00102b4
+
+/* Masks for use with MSR_AMD_CPPC_CAP1 */
+#define AMD_CPPC_LOWEST_PERF_MASK GENMASK(7, 0)
+#define AMD_CPPC_LOWNONLIN_PERF_MASK GENMASK(15, 8)
+#define AMD_CPPC_NOMINAL_PERF_MASK GENMASK(23, 16)
+#define AMD_CPPC_HIGHEST_PERF_MASK GENMASK(31, 24)
+
+/* Masks for use with MSR_AMD_CPPC_REQ */
+#define AMD_CPPC_MAX_PERF_MASK GENMASK(7, 0)
+#define AMD_CPPC_MIN_PERF_MASK GENMASK(15, 8)
+#define AMD_CPPC_DES_PERF_MASK GENMASK(23, 16)
+#define AMD_CPPC_EPP_PERF_MASK GENMASK(31, 24)
+
+/* AMD Performance Counter Global Status and Control MSRs */
+#define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS 0xc0000300
+#define MSR_AMD64_PERF_CNTR_GLOBAL_CTL 0xc0000301
+#define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR 0xc0000302
+#define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET 0xc0000303
+
+/* AMD Hardware Feedback Support MSRs */
+#define MSR_AMD_WORKLOAD_CLASS_CONFIG 0xc0000500
+#define MSR_AMD_WORKLOAD_CLASS_ID 0xc0000501
+#define MSR_AMD_WORKLOAD_HRST 0xc0000502
+
+/* AMD Last Branch Record MSRs */
+#define MSR_AMD64_LBR_SELECT 0xc000010e
+
+/* Zen4 */
+#define MSR_ZEN4_BP_CFG 0xc001102e
+#define MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT 4
+#define MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT 5
+
+/* Fam 19h MSRs */
+#define MSR_F19H_UMC_PERF_CTL 0xc0010800
+#define MSR_F19H_UMC_PERF_CTR 0xc0010801
+
+/* Zen 2 */
+#define MSR_ZEN2_SPECTRAL_CHICKEN 0xc00110e3
+#define MSR_ZEN2_SPECTRAL_CHICKEN_BIT BIT_ULL(1)
+
+/* Fam 17h MSRs */
+#define MSR_F17H_IRPERF 0xc00000e9
+
+/* Fam 16h MSRs */
+#define MSR_F16H_L2I_PERF_CTL 0xc0010230
+#define MSR_F16H_L2I_PERF_CTR 0xc0010231
+#define MSR_F16H_DR1_ADDR_MASK 0xc0011019
+#define MSR_F16H_DR2_ADDR_MASK 0xc001101a
+#define MSR_F16H_DR3_ADDR_MASK 0xc001101b
+#define MSR_F16H_DR0_ADDR_MASK 0xc0011027
+
+/* Fam 15h MSRs */
+#define MSR_F15H_CU_PWR_ACCUMULATOR 0xc001007a
+#define MSR_F15H_CU_MAX_PWR_ACCUMULATOR 0xc001007b
+#define MSR_F15H_PERF_CTL 0xc0010200
+#define MSR_F15H_PERF_CTL0 MSR_F15H_PERF_CTL
+#define MSR_F15H_PERF_CTL1 (MSR_F15H_PERF_CTL + 2)
+#define MSR_F15H_PERF_CTL2 (MSR_F15H_PERF_CTL + 4)
+#define MSR_F15H_PERF_CTL3 (MSR_F15H_PERF_CTL + 6)
+#define MSR_F15H_PERF_CTL4 (MSR_F15H_PERF_CTL + 8)
+#define MSR_F15H_PERF_CTL5 (MSR_F15H_PERF_CTL + 10)
+
+#define MSR_F15H_PERF_CTR 0xc0010201
+#define MSR_F15H_PERF_CTR0 MSR_F15H_PERF_CTR
+#define MSR_F15H_PERF_CTR1 (MSR_F15H_PERF_CTR + 2)
+#define MSR_F15H_PERF_CTR2 (MSR_F15H_PERF_CTR + 4)
+#define MSR_F15H_PERF_CTR3 (MSR_F15H_PERF_CTR + 6)
+#define MSR_F15H_PERF_CTR4 (MSR_F15H_PERF_CTR + 8)
+#define MSR_F15H_PERF_CTR5 (MSR_F15H_PERF_CTR + 10)
+
+#define MSR_F15H_NB_PERF_CTL 0xc0010240
+#define MSR_F15H_NB_PERF_CTR 0xc0010241
+#define MSR_F15H_PTSC 0xc0010280
+#define MSR_F15H_IC_CFG 0xc0011021
+#define MSR_F15H_EX_CFG 0xc001102c
/* Fam 10h MSRs */
#define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058
#define FAM10H_MMIO_CONF_ENABLE (1<<0)
#define FAM10H_MMIO_CONF_BUSRANGE_MASK 0xf
#define FAM10H_MMIO_CONF_BUSRANGE_SHIFT 2
-#define FAM10H_MMIO_CONF_BASE_MASK 0xfffffff
+#define FAM10H_MMIO_CONF_BASE_MASK 0xfffffffULL
#define FAM10H_MMIO_CONF_BASE_SHIFT 20
+#define MSR_FAM10H_NODE_ID 0xc001100c
/* K8 MSRs */
#define MSR_K8_TOP_MEM1 0xc001001a
#define MSR_K8_TOP_MEM2 0xc001001d
-#define MSR_K8_SYSCFG 0xc0010010
+#define MSR_AMD64_SYSCFG 0xc0010010
+#define MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT 23
+#define MSR_AMD64_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT)
+#define MSR_AMD64_SYSCFG_SNP_EN_BIT 24
+#define MSR_AMD64_SYSCFG_SNP_EN BIT_ULL(MSR_AMD64_SYSCFG_SNP_EN_BIT)
+#define MSR_AMD64_SYSCFG_SNP_VMPL_EN_BIT 25
+#define MSR_AMD64_SYSCFG_SNP_VMPL_EN BIT_ULL(MSR_AMD64_SYSCFG_SNP_VMPL_EN_BIT)
+#define MSR_AMD64_SYSCFG_MFDM_BIT 19
+#define MSR_AMD64_SYSCFG_MFDM BIT_ULL(MSR_AMD64_SYSCFG_MFDM_BIT)
+
#define MSR_K8_INT_PENDING_MSG 0xc0010055
/* C1E active bits in int pending message */
#define K8_INTP_C1E_ACTIVE_MASK 0x18000000
#define MSR_K8_TSEG_ADDR 0xc0010112
+#define MSR_K8_TSEG_MASK 0xc0010113
#define K8_MTRRFIXRANGE_DRAM_ENABLE 0x00040000 /* MtrrFixDramEn bit */
#define K8_MTRRFIXRANGE_DRAM_MODIFY 0x00080000 /* MtrrFixDramModEn bit */
#define K8_MTRR_RDMEM_WRMEM_MASK 0x18181818 /* Mask: RdMem|WrMem */
@@ -140,12 +875,17 @@
#define MSR_K7_PERFCTR3 0xc0010007
#define MSR_K7_CLK_CTL 0xc001001b
#define MSR_K7_HWCR 0xc0010015
+#define MSR_K7_HWCR_SMMLOCK_BIT 0
+#define MSR_K7_HWCR_SMMLOCK BIT_ULL(MSR_K7_HWCR_SMMLOCK_BIT)
+#define MSR_K7_HWCR_IRPERF_EN_BIT 30
+#define MSR_K7_HWCR_IRPERF_EN BIT_ULL(MSR_K7_HWCR_IRPERF_EN_BIT)
+#define MSR_K7_HWCR_CPUID_USER_DIS_BIT 35
#define MSR_K7_FID_VID_CTL 0xc0010041
#define MSR_K7_FID_VID_STATUS 0xc0010042
+#define MSR_K7_HWCR_CPB_DIS_BIT 25
+#define MSR_K7_HWCR_CPB_DIS BIT_ULL(MSR_K7_HWCR_CPB_DIS_BIT)
/* K6 MSRs */
-#define MSR_K6_EFER 0xc0000080
-#define MSR_K6_STAR 0xc0000081
#define MSR_K6_WHCR 0xc0000082
#define MSR_K6_UWCCR 0xc0000085
#define MSR_K6_EPMR 0xc0000086
@@ -186,10 +926,26 @@
#define MSR_IA32_TSC 0x00000010
#define MSR_IA32_PLATFORM_ID 0x00000017
#define MSR_IA32_EBL_CR_POWERON 0x0000002a
-#define MSR_IA32_FEATURE_CONTROL 0x0000003a
+#define MSR_EBC_FREQUENCY_ID 0x0000002c
+#define MSR_SMI_COUNT 0x00000034
+
+/* Referred to as IA32_FEATURE_CONTROL in Intel's SDM. */
+#define MSR_IA32_FEAT_CTL 0x0000003a
+#define FEAT_CTL_LOCKED BIT(0)
+#define FEAT_CTL_VMX_ENABLED_INSIDE_SMX BIT(1)
+#define FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX BIT(2)
+#define FEAT_CTL_SGX_LC_ENABLED BIT(17)
+#define FEAT_CTL_SGX_ENABLED BIT(18)
+#define FEAT_CTL_LMCE_ENABLED BIT(20)
+
+#define MSR_IA32_TSC_ADJUST 0x0000003b
+#define MSR_IA32_BNDCFGS 0x00000d90
+
+#define MSR_IA32_BNDCFGS_RSVD 0x00000ffc
-#define FEATURE_CONTROL_LOCKED (1<<0)
-#define FEATURE_CONTROL_VMXON_ENABLED (1<<2)
+#define MSR_IA32_XFD 0x000001c4
+#define MSR_IA32_XFD_ERR 0x000001c5
+#define MSR_IA32_XSS 0x00000da0
#define MSR_IA32_APICBASE 0x0000001b
#define MSR_IA32_APICBASE_BSP (1<<8)
@@ -197,10 +953,30 @@
#define MSR_IA32_APICBASE_BASE (0xfffff<<12)
#define MSR_IA32_UCODE_WRITE 0x00000079
+
+#define MSR_IA32_MCU_ENUMERATION 0x0000007b
+#define MCU_STAGING BIT(4)
+
#define MSR_IA32_UCODE_REV 0x0000008b
+/* Intel SGX Launch Enclave Public Key Hash MSRs */
+#define MSR_IA32_SGXLEPUBKEYHASH0 0x0000008C
+#define MSR_IA32_SGXLEPUBKEYHASH1 0x0000008D
+#define MSR_IA32_SGXLEPUBKEYHASH2 0x0000008E
+#define MSR_IA32_SGXLEPUBKEYHASH3 0x0000008F
+
+#define MSR_IA32_SMM_MONITOR_CTL 0x0000009b
+#define MSR_IA32_SMBASE 0x0000009e
+
#define MSR_IA32_PERF_STATUS 0x00000198
#define MSR_IA32_PERF_CTL 0x00000199
+#define INTEL_PERF_CTL_MASK 0xffff
+
+/* AMD Branch Sampling configuration */
+#define MSR_AMD_DBG_EXTN_CFG 0xc000010f
+#define MSR_AMD_SAMP_BR_FROM 0xc0010300
+
+#define DBG_EXTN_CFG_LBRV2EN BIT_ULL(6)
#define MSR_IA32_MPERF 0x000000e7
#define MSR_IA32_APERF 0x000000e8
@@ -208,43 +984,131 @@
#define MSR_IA32_THERM_CONTROL 0x0000019a
#define MSR_IA32_THERM_INTERRUPT 0x0000019b
-#define THERM_INT_LOW_ENABLE (1 << 0)
-#define THERM_INT_HIGH_ENABLE (1 << 1)
+#define THERM_INT_HIGH_ENABLE (1 << 0)
+#define THERM_INT_LOW_ENABLE (1 << 1)
+#define THERM_INT_PLN_ENABLE (1 << 24)
#define MSR_IA32_THERM_STATUS 0x0000019c
#define THERM_STATUS_PROCHOT (1 << 0)
+#define THERM_STATUS_POWER_LIMIT (1 << 10)
+
+#define MSR_THERM2_CTL 0x0000019d
+
+#define MSR_THERM2_CTL_TM_SELECT (1ULL << 16)
#define MSR_IA32_MISC_ENABLE 0x000001a0
+#define MSR_IA32_TEMPERATURE_TARGET 0x000001a2
+
+#define MSR_MISC_FEATURE_CONTROL 0x000001a4
+#define MSR_MISC_PWR_MGMT 0x000001aa
+
+#define MSR_IA32_ENERGY_PERF_BIAS 0x000001b0
+#define ENERGY_PERF_BIAS_PERFORMANCE 0
+#define ENERGY_PERF_BIAS_BALANCE_PERFORMANCE 4
+#define ENERGY_PERF_BIAS_NORMAL 6
+#define ENERGY_PERF_BIAS_NORMAL_POWERSAVE 7
+#define ENERGY_PERF_BIAS_BALANCE_POWERSAVE 8
+#define ENERGY_PERF_BIAS_POWERSAVE 15
+
+#define MSR_IA32_PACKAGE_THERM_STATUS 0x000001b1
+
+#define PACKAGE_THERM_STATUS_PROCHOT (1 << 0)
+#define PACKAGE_THERM_STATUS_POWER_LIMIT (1 << 10)
+#define PACKAGE_THERM_STATUS_HFI_UPDATED (1 << 26)
+
+#define MSR_IA32_PACKAGE_THERM_INTERRUPT 0x000001b2
+
+#define PACKAGE_THERM_INT_HIGH_ENABLE (1 << 0)
+#define PACKAGE_THERM_INT_LOW_ENABLE (1 << 1)
+#define PACKAGE_THERM_INT_PLN_ENABLE (1 << 24)
+#define PACKAGE_THERM_INT_HFI_ENABLE (1 << 25)
+
+/* Thermal Thresholds Support */
+#define THERM_INT_THRESHOLD0_ENABLE (1 << 15)
+#define THERM_SHIFT_THRESHOLD0 8
+#define THERM_MASK_THRESHOLD0 (0x7f << THERM_SHIFT_THRESHOLD0)
+#define THERM_INT_THRESHOLD1_ENABLE (1 << 23)
+#define THERM_SHIFT_THRESHOLD1 16
+#define THERM_MASK_THRESHOLD1 (0x7f << THERM_SHIFT_THRESHOLD1)
+#define THERM_STATUS_THRESHOLD0 (1 << 6)
+#define THERM_LOG_THRESHOLD0 (1 << 7)
+#define THERM_STATUS_THRESHOLD1 (1 << 8)
+#define THERM_LOG_THRESHOLD1 (1 << 9)
+
/* MISC_ENABLE bits: architectural */
-#define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0)
-#define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1)
-#define MSR_IA32_MISC_ENABLE_EMON (1ULL << 7)
-#define MSR_IA32_MISC_ENABLE_BTS_UNAVAIL (1ULL << 11)
-#define MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL (1ULL << 12)
-#define MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP (1ULL << 16)
-#define MSR_IA32_MISC_ENABLE_MWAIT (1ULL << 18)
-#define MSR_IA32_MISC_ENABLE_LIMIT_CPUID (1ULL << 22)
-#define MSR_IA32_MISC_ENABLE_XTPR_DISABLE (1ULL << 23)
-#define MSR_IA32_MISC_ENABLE_XD_DISABLE (1ULL << 34)
+#define MSR_IA32_MISC_ENABLE_FAST_STRING_BIT 0
+#define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << MSR_IA32_MISC_ENABLE_FAST_STRING_BIT)
+#define MSR_IA32_MISC_ENABLE_TCC_BIT 1
+#define MSR_IA32_MISC_ENABLE_TCC (1ULL << MSR_IA32_MISC_ENABLE_TCC_BIT)
+#define MSR_IA32_MISC_ENABLE_EMON_BIT 7
+#define MSR_IA32_MISC_ENABLE_EMON (1ULL << MSR_IA32_MISC_ENABLE_EMON_BIT)
+#define MSR_IA32_MISC_ENABLE_BTS_UNAVAIL_BIT 11
+#define MSR_IA32_MISC_ENABLE_BTS_UNAVAIL (1ULL << MSR_IA32_MISC_ENABLE_BTS_UNAVAIL_BIT)
+#define MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL_BIT 12
+#define MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL (1ULL << MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL_BIT)
+#define MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP_BIT 16
+#define MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP (1ULL << MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP_BIT)
+#define MSR_IA32_MISC_ENABLE_MWAIT_BIT 18
+#define MSR_IA32_MISC_ENABLE_MWAIT (1ULL << MSR_IA32_MISC_ENABLE_MWAIT_BIT)
+#define MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT 22
+#define MSR_IA32_MISC_ENABLE_LIMIT_CPUID (1ULL << MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT)
+#define MSR_IA32_MISC_ENABLE_XTPR_DISABLE_BIT 23
+#define MSR_IA32_MISC_ENABLE_XTPR_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_XTPR_DISABLE_BIT)
+#define MSR_IA32_MISC_ENABLE_XD_DISABLE_BIT 34
+#define MSR_IA32_MISC_ENABLE_XD_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_XD_DISABLE_BIT)
/* MISC_ENABLE bits: model-specific, meaning may vary from core to core */
-#define MSR_IA32_MISC_ENABLE_X87_COMPAT (1ULL << 2)
-#define MSR_IA32_MISC_ENABLE_TM1 (1ULL << 3)
-#define MSR_IA32_MISC_ENABLE_SPLIT_LOCK_DISABLE (1ULL << 4)
-#define MSR_IA32_MISC_ENABLE_L3CACHE_DISABLE (1ULL << 6)
-#define MSR_IA32_MISC_ENABLE_SUPPRESS_LOCK (1ULL << 8)
-#define MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE (1ULL << 9)
-#define MSR_IA32_MISC_ENABLE_FERR (1ULL << 10)
-#define MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX (1ULL << 10)
-#define MSR_IA32_MISC_ENABLE_TM2 (1ULL << 13)
-#define MSR_IA32_MISC_ENABLE_ADJ_PREF_DISABLE (1ULL << 19)
-#define MSR_IA32_MISC_ENABLE_SPEEDSTEP_LOCK (1ULL << 20)
-#define MSR_IA32_MISC_ENABLE_L1D_CONTEXT (1ULL << 24)
-#define MSR_IA32_MISC_ENABLE_DCU_PREF_DISABLE (1ULL << 37)
-#define MSR_IA32_MISC_ENABLE_TURBO_DISABLE (1ULL << 38)
-#define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE (1ULL << 39)
+#define MSR_IA32_MISC_ENABLE_X87_COMPAT_BIT 2
+#define MSR_IA32_MISC_ENABLE_X87_COMPAT (1ULL << MSR_IA32_MISC_ENABLE_X87_COMPAT_BIT)
+#define MSR_IA32_MISC_ENABLE_TM1_BIT 3
+#define MSR_IA32_MISC_ENABLE_TM1 (1ULL << MSR_IA32_MISC_ENABLE_TM1_BIT)
+#define MSR_IA32_MISC_ENABLE_SPLIT_LOCK_DISABLE_BIT 4
+#define MSR_IA32_MISC_ENABLE_SPLIT_LOCK_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_SPLIT_LOCK_DISABLE_BIT)
+#define MSR_IA32_MISC_ENABLE_L3CACHE_DISABLE_BIT 6
+#define MSR_IA32_MISC_ENABLE_L3CACHE_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_L3CACHE_DISABLE_BIT)
+#define MSR_IA32_MISC_ENABLE_SUPPRESS_LOCK_BIT 8
+#define MSR_IA32_MISC_ENABLE_SUPPRESS_LOCK (1ULL << MSR_IA32_MISC_ENABLE_SUPPRESS_LOCK_BIT)
+#define MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT 9
+#define MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT)
+#define MSR_IA32_MISC_ENABLE_FERR_BIT 10
+#define MSR_IA32_MISC_ENABLE_FERR (1ULL << MSR_IA32_MISC_ENABLE_FERR_BIT)
+#define MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX_BIT 10
+#define MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX (1ULL << MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX_BIT)
+#define MSR_IA32_MISC_ENABLE_TM2_BIT 13
+#define MSR_IA32_MISC_ENABLE_TM2 (1ULL << MSR_IA32_MISC_ENABLE_TM2_BIT)
+#define MSR_IA32_MISC_ENABLE_ADJ_PREF_DISABLE_BIT 19
+#define MSR_IA32_MISC_ENABLE_ADJ_PREF_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_ADJ_PREF_DISABLE_BIT)
+#define MSR_IA32_MISC_ENABLE_SPEEDSTEP_LOCK_BIT 20
+#define MSR_IA32_MISC_ENABLE_SPEEDSTEP_LOCK (1ULL << MSR_IA32_MISC_ENABLE_SPEEDSTEP_LOCK_BIT)
+#define MSR_IA32_MISC_ENABLE_L1D_CONTEXT_BIT 24
+#define MSR_IA32_MISC_ENABLE_L1D_CONTEXT (1ULL << MSR_IA32_MISC_ENABLE_L1D_CONTEXT_BIT)
+#define MSR_IA32_MISC_ENABLE_DCU_PREF_DISABLE_BIT 37
+#define MSR_IA32_MISC_ENABLE_DCU_PREF_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_DCU_PREF_DISABLE_BIT)
+#define MSR_IA32_MISC_ENABLE_TURBO_DISABLE_BIT 38
+#define MSR_IA32_MISC_ENABLE_TURBO_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_TURBO_DISABLE_BIT)
+#define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE_BIT 39
+#define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE_BIT)
+
+/* MISC_FEATURES_ENABLES non-architectural features */
+#define MSR_MISC_FEATURES_ENABLES 0x00000140
+
+#define MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT 0
+#define MSR_MISC_FEATURES_ENABLES_CPUID_FAULT BIT_ULL(MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT)
+#define MSR_MISC_FEATURES_ENABLES_RING3MWAIT_BIT 1
+
+#define MSR_IA32_TSC_DEADLINE 0x000006E0
+
+
+#define MSR_TSX_FORCE_ABORT 0x0000010F
+
+#define MSR_TFA_RTM_FORCE_ABORT_BIT 0
+#define MSR_TFA_RTM_FORCE_ABORT BIT_ULL(MSR_TFA_RTM_FORCE_ABORT_BIT)
+#define MSR_TFA_TSX_CPUID_CLEAR_BIT 1
+#define MSR_TFA_TSX_CPUID_CLEAR BIT_ULL(MSR_TFA_TSX_CPUID_CLEAR_BIT)
+#define MSR_TFA_SDV_ENABLE_RTM_BIT 2
+#define MSR_TFA_SDV_ENABLE_RTM BIT_ULL(MSR_TFA_SDV_ENABLE_RTM_BIT)
/* P4/Xeon+ specific */
#define MSR_IA32_MCG_EAX 0x00000180
@@ -344,15 +1208,28 @@
#define MSR_P4_U2L_ESCR0 0x000003b0
#define MSR_P4_U2L_ESCR1 0x000003b1
+#define MSR_P4_PEBS_MATRIX_VERT 0x000003f2
+
/* Intel Core-based CPU performance counters */
#define MSR_CORE_PERF_FIXED_CTR0 0x00000309
#define MSR_CORE_PERF_FIXED_CTR1 0x0000030a
#define MSR_CORE_PERF_FIXED_CTR2 0x0000030b
+#define MSR_CORE_PERF_FIXED_CTR3 0x0000030c
#define MSR_CORE_PERF_FIXED_CTR_CTRL 0x0000038d
#define MSR_CORE_PERF_GLOBAL_STATUS 0x0000038e
#define MSR_CORE_PERF_GLOBAL_CTRL 0x0000038f
#define MSR_CORE_PERF_GLOBAL_OVF_CTRL 0x00000390
+#define MSR_PERF_METRICS 0x00000329
+
+/* PERF_GLOBAL_OVF_CTL bits */
+#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT 55
+#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI (1ULL << MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT)
+#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF_BIT 62
+#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF (1ULL << MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF_BIT)
+#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD_BIT 63
+#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD (1ULL << MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD_BIT)
+
/* Geode defined MSRs */
#define MSR_GEODE_BUSCONT_CONF0 0x00001900
@@ -370,10 +1247,53 @@
#define MSR_IA32_VMX_VMCS_ENUM 0x0000048a
#define MSR_IA32_VMX_PROCBASED_CTLS2 0x0000048b
#define MSR_IA32_VMX_EPT_VPID_CAP 0x0000048c
+#define MSR_IA32_VMX_TRUE_PINBASED_CTLS 0x0000048d
+#define MSR_IA32_VMX_TRUE_PROCBASED_CTLS 0x0000048e
+#define MSR_IA32_VMX_TRUE_EXIT_CTLS 0x0000048f
+#define MSR_IA32_VMX_TRUE_ENTRY_CTLS 0x00000490
+#define MSR_IA32_VMX_VMFUNC 0x00000491
+#define MSR_IA32_VMX_PROCBASED_CTLS3 0x00000492
-/* AMD-V MSRs */
+#define MSR_IA32_MCU_STAGING_MBOX_ADDR 0x000007a5
+
+/* Resctrl MSRs: */
+/* - Intel: */
+#define MSR_IA32_L3_QOS_CFG 0xc81
+#define MSR_IA32_L2_QOS_CFG 0xc82
+#define MSR_IA32_QM_EVTSEL 0xc8d
+#define MSR_IA32_QM_CTR 0xc8e
+#define MSR_IA32_PQR_ASSOC 0xc8f
+#define MSR_IA32_L3_CBM_BASE 0xc90
+#define MSR_RMID_SNC_CONFIG 0xca0
+#define MSR_IA32_L2_CBM_BASE 0xd10
+#define MSR_IA32_MBA_THRTL_BASE 0xd50
+/* - AMD: */
+#define MSR_IA32_MBA_BW_BASE 0xc0000200
+#define MSR_IA32_SMBA_BW_BASE 0xc0000280
+#define MSR_IA32_L3_QOS_ABMC_CFG 0xc00003fd
+#define MSR_IA32_L3_QOS_EXT_CFG 0xc00003ff
+#define MSR_IA32_EVT_CFG_BASE 0xc0000400
+
+/* AMD-V MSRs */
#define MSR_VM_CR 0xc0010114
+#define MSR_VM_IGNNE 0xc0010115
#define MSR_VM_HSAVE_PA 0xc0010117
+#define SVM_VM_CR_VALID_MASK 0x001fULL
+#define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL
+#define SVM_VM_CR_SVM_DIS_MASK 0x0010ULL
+
+/* Hardware Feedback Interface */
+#define MSR_IA32_HW_FEEDBACK_PTR 0x17d0
+#define MSR_IA32_HW_FEEDBACK_CONFIG 0x17d1
+
+/* x2APIC locked status */
+#define MSR_IA32_XAPIC_DISABLE_STATUS 0xBD
+#define LEGACY_XAPIC_DISABLED BIT(0) /*
+ * x2APIC mode is locked and
+ * disabling x2APIC will cause
+ * a #GP
+ */
+
#endif /* _ASM_X86_MSR_INDEX_H */
diff --git a/arch/x86/include/asm/msr-trace.h b/arch/x86/include/asm/msr-trace.h
new file mode 100644
index 000000000000..f6adbe96856a
--- /dev/null
+++ b/arch/x86/include/asm/msr-trace.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM msr
+
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE msr-trace
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH asm/
+
+#if !defined(_TRACE_MSR_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_MSR_H
+
+#include <linux/tracepoint.h>
+
+/*
+ * Tracing for x86 model specific registers. Directly maps to the
+ * RDMSR/WRMSR instructions.
+ */
+
+DECLARE_EVENT_CLASS(msr_trace_class,
+ TP_PROTO(unsigned msr, u64 val, int failed),
+ TP_ARGS(msr, val, failed),
+ TP_STRUCT__entry(
+ __field( unsigned, msr )
+ __field( u64, val )
+ __field( int, failed )
+ ),
+ TP_fast_assign(
+ __entry->msr = msr;
+ __entry->val = val;
+ __entry->failed = failed;
+ ),
+ TP_printk("%x, value %llx%s",
+ __entry->msr,
+ __entry->val,
+ __entry->failed ? " #GP" : "")
+);
+
+DEFINE_EVENT(msr_trace_class, read_msr,
+ TP_PROTO(unsigned msr, u64 val, int failed),
+ TP_ARGS(msr, val, failed)
+);
+
+DEFINE_EVENT(msr_trace_class, write_msr,
+ TP_PROTO(unsigned msr, u64 val, int failed),
+ TP_ARGS(msr, val, failed)
+);
+
+DEFINE_EVENT(msr_trace_class, rdpmc,
+ TP_PROTO(unsigned msr, u64 val, int failed),
+ TP_ARGS(msr, val, failed)
+);
+
+#endif /* _TRACE_MSR_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 48ad9d29484a..9c2ea29e12a9 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -1,136 +1,175 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_MSR_H
#define _ASM_X86_MSR_H
-#include <asm/msr-index.h>
+#include "msr-index.h"
-#ifdef __KERNEL__
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
-#include <linux/types.h>
#include <asm/asm.h>
#include <asm/errno.h>
#include <asm/cpumask.h>
+#include <uapi/asm/msr.h>
+#include <asm/shared/msr.h>
+
+#include <linux/types.h>
+#include <linux/percpu.h>
-struct msr {
- union {
- struct {
- u32 l;
- u32 h;
- };
- u64 q;
- };
+struct msr_info {
+ u32 msr_no;
+ struct msr reg;
+ struct msr __percpu *msrs;
+ int err;
};
-static inline unsigned long long native_read_tscp(unsigned int *aux)
-{
- unsigned long low, high;
- asm volatile(".byte 0x0f,0x01,0xf9"
- : "=a" (low), "=d" (high), "=c" (*aux));
- return low | ((u64)high << 32);
-}
+struct msr_regs_info {
+ u32 *regs;
+ int err;
+};
+
+struct saved_msr {
+ bool valid;
+ struct msr_info info;
+};
+
+struct saved_msrs {
+ unsigned int num;
+ struct saved_msr *array;
+};
/*
- * both i386 and x86_64 returns 64-bit value in edx:eax, but gcc's "A"
- * constraint has different meanings. For i386, "A" means exactly
- * edx:eax, while for x86_64 it doesn't mean rdx:rax or edx:eax. Instead,
- * it means rax *or* rdx.
+ * Be very careful with includes. This header is prone to include loops.
*/
-#ifdef CONFIG_X86_64
-#define DECLARE_ARGS(val, low, high) unsigned low, high
-#define EAX_EDX_VAL(val, low, high) ((low) | ((u64)(high) << 32))
-#define EAX_EDX_ARGS(val, low, high) "a" (low), "d" (high)
-#define EAX_EDX_RET(val, low, high) "=a" (low), "=d" (high)
+#include <asm/atomic.h>
+#include <linux/tracepoint-defs.h>
+
+#ifdef CONFIG_TRACEPOINTS
+DECLARE_TRACEPOINT(read_msr);
+DECLARE_TRACEPOINT(write_msr);
+DECLARE_TRACEPOINT(rdpmc);
+extern void do_trace_write_msr(u32 msr, u64 val, int failed);
+extern void do_trace_read_msr(u32 msr, u64 val, int failed);
+extern void do_trace_rdpmc(u32 msr, u64 val, int failed);
#else
-#define DECLARE_ARGS(val, low, high) unsigned long long val
-#define EAX_EDX_VAL(val, low, high) (val)
-#define EAX_EDX_ARGS(val, low, high) "A" (val)
-#define EAX_EDX_RET(val, low, high) "=A" (val)
+static inline void do_trace_write_msr(u32 msr, u64 val, int failed) {}
+static inline void do_trace_read_msr(u32 msr, u64 val, int failed) {}
+static inline void do_trace_rdpmc(u32 msr, u64 val, int failed) {}
#endif
-static inline unsigned long long native_read_msr(unsigned int msr)
+/*
+ * __rdmsr() and __wrmsr() are the two primitives which are the bare minimum MSR
+ * accessors and should not have any tracing or other functionality piggybacking
+ * on them - those are *purely* for accessing MSRs and nothing more. So don't even
+ * think of extending them - you will be slapped with a stinking trout or a frozen
+ * shark will reach you, wherever you are! You've been warned.
+ */
+static __always_inline u64 __rdmsr(u32 msr)
{
- DECLARE_ARGS(val, low, high);
+ EAX_EDX_DECLARE_ARGS(val, low, high);
+
+ asm volatile("1: rdmsr\n"
+ "2:\n"
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR)
+ : EAX_EDX_RET(val, low, high) : "c" (msr));
- asm volatile("rdmsr" : EAX_EDX_RET(val, low, high) : "c" (msr));
return EAX_EDX_VAL(val, low, high);
}
-static inline unsigned long long native_read_msr_safe(unsigned int msr,
- int *err)
+static __always_inline void __wrmsrq(u32 msr, u64 val)
{
- DECLARE_ARGS(val, low, high);
-
- asm volatile("2: rdmsr ; xor %[err],%[err]\n"
- "1:\n\t"
- ".section .fixup,\"ax\"\n\t"
- "3: mov %[fault],%[err] ; jmp 1b\n\t"
- ".previous\n\t"
- _ASM_EXTABLE(2b, 3b)
- : [err] "=r" (*err), EAX_EDX_RET(val, low, high)
- : "c" (msr), [fault] "i" (-EFAULT));
- return EAX_EDX_VAL(val, low, high);
+ asm volatile("1: wrmsr\n"
+ "2:\n"
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR)
+ : : "c" (msr), "a" ((u32)val), "d" ((u32)(val >> 32)) : "memory");
}
-static inline unsigned long long native_read_msr_amd_safe(unsigned int msr,
- int *err)
+#define native_rdmsr(msr, val1, val2) \
+do { \
+ u64 __val = __rdmsr((msr)); \
+ (void)((val1) = (u32)__val); \
+ (void)((val2) = (u32)(__val >> 32)); \
+} while (0)
+
+static __always_inline u64 native_rdmsrq(u32 msr)
{
- DECLARE_ARGS(val, low, high);
-
- asm volatile("2: rdmsr ; xor %0,%0\n"
- "1:\n\t"
- ".section .fixup,\"ax\"\n\t"
- "3: mov %3,%0 ; jmp 1b\n\t"
- ".previous\n\t"
- _ASM_EXTABLE(2b, 3b)
- : "=r" (*err), EAX_EDX_RET(val, low, high)
- : "c" (msr), "D" (0x9c5a203a), "i" (-EFAULT));
- return EAX_EDX_VAL(val, low, high);
+ return __rdmsr(msr);
}
-static inline void native_write_msr(unsigned int msr,
- unsigned low, unsigned high)
+#define native_wrmsr(msr, low, high) \
+ __wrmsrq((msr), (u64)(high) << 32 | (low))
+
+#define native_wrmsrq(msr, val) \
+ __wrmsrq((msr), (val))
+
+static inline u64 native_read_msr(u32 msr)
{
- asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory");
+ u64 val;
+
+ val = __rdmsr(msr);
+
+ if (tracepoint_enabled(read_msr))
+ do_trace_read_msr(msr, val, 0);
+
+ return val;
}
-/* Can be uninlined because referenced by paravirt */
-notrace static inline int native_write_msr_safe(unsigned int msr,
- unsigned low, unsigned high)
+static inline int native_read_msr_safe(u32 msr, u64 *p)
{
int err;
- asm volatile("2: wrmsr ; xor %[err],%[err]\n"
- "1:\n\t"
- ".section .fixup,\"ax\"\n\t"
- "3: mov %[fault],%[err] ; jmp 1b\n\t"
- ".previous\n\t"
- _ASM_EXTABLE(2b, 3b)
- : [err] "=a" (err)
- : "c" (msr), "0" (low), "d" (high),
- [fault] "i" (-EFAULT)
- : "memory");
+ EAX_EDX_DECLARE_ARGS(val, low, high);
+
+ asm volatile("1: rdmsr ; xor %[err],%[err]\n"
+ "2:\n\t"
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_RDMSR_SAFE, %[err])
+ : [err] "=r" (err), EAX_EDX_RET(val, low, high)
+ : "c" (msr));
+ if (tracepoint_enabled(read_msr))
+ do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), err);
+
+ *p = EAX_EDX_VAL(val, low, high);
+
return err;
}
-extern unsigned long long native_read_tsc(void);
-
-static __always_inline unsigned long long __native_read_tsc(void)
+/* Can be uninlined because referenced by paravirt */
+static inline void notrace native_write_msr(u32 msr, u64 val)
{
- DECLARE_ARGS(val, low, high);
+ native_wrmsrq(msr, val);
- asm volatile("rdtsc" : EAX_EDX_RET(val, low, high));
+ if (tracepoint_enabled(write_msr))
+ do_trace_write_msr(msr, val, 0);
+}
- return EAX_EDX_VAL(val, low, high);
+/* Can be uninlined because referenced by paravirt */
+static inline int notrace native_write_msr_safe(u32 msr, u64 val)
+{
+ int err;
+
+ asm volatile("1: wrmsr ; xor %[err],%[err]\n"
+ "2:\n\t"
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_WRMSR_SAFE, %[err])
+ : [err] "=a" (err)
+ : "c" (msr), "0" ((u32)val), "d" ((u32)(val >> 32))
+ : "memory");
+ if (tracepoint_enabled(write_msr))
+ do_trace_write_msr(msr, val, err);
+ return err;
}
-static inline unsigned long long native_read_pmc(int counter)
+extern int rdmsr_safe_regs(u32 regs[8]);
+extern int wrmsr_safe_regs(u32 regs[8]);
+
+static inline u64 native_read_pmc(int counter)
{
- DECLARE_ARGS(val, low, high);
+ EAX_EDX_DECLARE_ARGS(val, low, high);
asm volatile("rdpmc" : EAX_EDX_RET(val, low, high) : "c" (counter));
+ if (tracepoint_enabled(rdpmc))
+ do_trace_rdpmc(counter, EAX_EDX_VAL(val, low, high), 0);
return EAX_EDX_VAL(val, low, high);
}
-#ifdef CONFIG_PARAVIRT
+#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else
#include <linux/errno.h>
@@ -140,94 +179,95 @@ static inline unsigned long long native_read_pmc(int counter)
* pointer indirection), this allows gcc to optimize better
*/
-#define rdmsr(msr, val1, val2) \
+#define rdmsr(msr, low, high) \
do { \
u64 __val = native_read_msr((msr)); \
- (val1) = (u32)__val; \
- (val2) = (u32)(__val >> 32); \
+ (void)((low) = (u32)__val); \
+ (void)((high) = (u32)(__val >> 32)); \
} while (0)
-static inline void wrmsr(unsigned msr, unsigned low, unsigned high)
+static inline void wrmsr(u32 msr, u32 low, u32 high)
{
- native_write_msr(msr, low, high);
+ native_write_msr(msr, (u64)high << 32 | low);
}
-#define rdmsrl(msr, val) \
+#define rdmsrq(msr, val) \
((val) = native_read_msr((msr)))
-#define wrmsrl(msr, val) \
- native_write_msr((msr), (u32)((u64)(val)), (u32)((u64)(val) >> 32))
+static inline void wrmsrq(u32 msr, u64 val)
+{
+ native_write_msr(msr, val);
+}
/* wrmsr with exception handling */
-static inline int wrmsr_safe(unsigned msr, unsigned low, unsigned high)
+static inline int wrmsrq_safe(u32 msr, u64 val)
{
- return native_write_msr_safe(msr, low, high);
+ return native_write_msr_safe(msr, val);
}
/* rdmsr with exception handling */
-#define rdmsr_safe(msr, p1, p2) \
+#define rdmsr_safe(msr, low, high) \
({ \
- int __err; \
- u64 __val = native_read_msr_safe((msr), &__err); \
- (*p1) = (u32)__val; \
- (*p2) = (u32)(__val >> 32); \
+ u64 __val; \
+ int __err = native_read_msr_safe((msr), &__val); \
+ (*low) = (u32)__val; \
+ (*high) = (u32)(__val >> 32); \
__err; \
})
-static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
+static inline int rdmsrq_safe(u32 msr, u64 *p)
{
- int err;
-
- *p = native_read_msr_safe(msr, &err);
- return err;
+ return native_read_msr_safe(msr, p);
}
-static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
-{
- int err;
- *p = native_read_msr_amd_safe(msr, &err);
- return err;
+static __always_inline u64 rdpmc(int counter)
+{
+ return native_read_pmc(counter);
}
-#define rdtscl(low) \
- ((low) = (u32)__native_read_tsc())
+#endif /* !CONFIG_PARAVIRT_XXL */
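Since the hunk above renames the 64-bit accessors (rdmsrl/wrmsrl become rdmsrq/wrmsrq), a minimal usage sketch may help; the MSR and the bit written below are illustrative only:

    u64 misc;

    rdmsrq(MSR_IA32_MISC_ENABLE, misc);              /* plain read, faults if the MSR is absent */
    wrmsrq(MSR_IA32_MISC_ENABLE, misc | BIT_ULL(0)); /* plain write */

    /* The _safe variants return an error instead of faulting: */
    if (rdmsrq_safe(MSR_IA32_MISC_ENABLE, &misc))
        pr_warn("MSR not readable on this CPU\n");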
-#define rdtscll(val) \
- ((val) = __native_read_tsc())
+/* Instruction opcode for WRMSRNS supported in binutils >= 2.40 */
+#define ASM_WRMSRNS _ASM_BYTES(0x0f,0x01,0xc6)
-#define rdpmc(counter, low, high) \
-do { \
- u64 _l = native_read_pmc((counter)); \
- (low) = (u32)_l; \
- (high) = (u32)(_l >> 32); \
-} while (0)
-
-#define rdtscp(low, high, aux) \
-do { \
- unsigned long long _val = native_read_tscp(&(aux)); \
- (low) = (u32)_val; \
- (high) = (u32)(_val >> 32); \
-} while (0)
-
-#define rdtscpll(val, aux) (val) = native_read_tscp(&(aux))
-
-#endif /* !CONFIG_PARAVIRT */
-
-
-#define checking_wrmsrl(msr, val) wrmsr_safe((msr), (u32)(val), \
- (u32)((val) >> 32))
+/* Non-serializing WRMSR, when available. Falls back to a serializing WRMSR. */
+static __always_inline void wrmsrns(u32 msr, u64 val)
+{
+ /*
+ * WRMSR is 2 bytes. WRMSRNS is 3 bytes. Pad WRMSR with a redundant
+ * DS prefix to avoid a trailing NOP.
+ */
+ asm volatile("1: " ALTERNATIVE("ds wrmsr", ASM_WRMSRNS, X86_FEATURE_WRMSRNS)
+ "2: " _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR)
+ : : "c" (msr), "a" ((u32)val), "d" ((u32)(val >> 32)));
+}
-#define write_tsc(val1, val2) wrmsr(0x10, (val1), (val2))
+/*
+ * Dual u32 version of wrmsrq_safe():
+ */
+static inline int wrmsr_safe(u32 msr, u32 low, u32 high)
+{
+ return wrmsrq_safe(msr, (u64)high << 32 | low);
+}
-#define write_rdtscp_aux(val) wrmsr(0xc0000103, (val), 0)
+struct msr __percpu *msrs_alloc(void);
+void msrs_free(struct msr __percpu *msrs);
+int msr_set_bit(u32 msr, u8 bit);
+int msr_clear_bit(u32 msr, u8 bit);
#ifdef CONFIG_SMP
int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
-void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs);
-void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs);
+int rdmsrq_on_cpu(unsigned int cpu, u32 msr_no, u64 *q);
+int wrmsrq_on_cpu(unsigned int cpu, u32 msr_no, u64 q);
+void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr __percpu *msrs);
+void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr __percpu *msrs);
int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
+int rdmsrq_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q);
+int wrmsrq_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q);
+int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]);
+int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]);
#else /* CONFIG_SMP */
static inline int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
{
@@ -239,15 +279,25 @@ static inline int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
wrmsr(msr_no, l, h);
return 0;
}
-static inline void rdmsr_on_cpus(const cpumask_t *m, u32 msr_no,
- struct msr *msrs)
+static inline int rdmsrq_on_cpu(unsigned int cpu, u32 msr_no, u64 *q)
+{
+ rdmsrq(msr_no, *q);
+ return 0;
+}
+static inline int wrmsrq_on_cpu(unsigned int cpu, u32 msr_no, u64 q)
+{
+ wrmsrq(msr_no, q);
+ return 0;
+}
+static inline void rdmsr_on_cpus(const struct cpumask *m, u32 msr_no,
+ struct msr __percpu *msrs)
{
- rdmsr_on_cpu(0, msr_no, &(msrs[0].l), &(msrs[0].h));
+ rdmsr_on_cpu(0, msr_no, raw_cpu_ptr(&msrs->l), raw_cpu_ptr(&msrs->h));
}
-static inline void wrmsr_on_cpus(const cpumask_t *m, u32 msr_no,
- struct msr *msrs)
+static inline void wrmsr_on_cpus(const struct cpumask *m, u32 msr_no,
+ struct msr __percpu *msrs)
{
- wrmsr_on_cpu(0, msr_no, msrs[0].l, msrs[0].h);
+ wrmsr_on_cpu(0, msr_no, raw_cpu_read(msrs->l), raw_cpu_read(msrs->h));
}
static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no,
u32 *l, u32 *h)
@@ -258,7 +308,28 @@ static inline int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
{
return wrmsr_safe(msr_no, l, h);
}
+static inline int rdmsrq_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q)
+{
+ return rdmsrq_safe(msr_no, q);
+}
+static inline int wrmsrq_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q)
+{
+ return wrmsrq_safe(msr_no, q);
+}
+static inline int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8])
+{
+ return rdmsr_safe_regs(regs);
+}
+static inline int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8])
+{
+ return wrmsr_safe_regs(regs);
+}
#endif /* CONFIG_SMP */
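A hedged sketch of how a driver might use the cross-CPU helpers and the now-__percpu bulk interface declared above (coretemp-style usage is the model; the MSR choice and CPU number are illustrative):

    u64 therm;
    struct msr __percpu *msrs;

    /* Read one MSR on a specific CPU; runs via IPI when CONFIG_SMP=y. */
    if (rdmsrq_safe_on_cpu(2, MSR_IA32_THERM_STATUS, &therm))
        pr_warn("CPU 2: MSR read failed\n");

    /* Or gather the same MSR from every online CPU into a per-CPU buffer. */
    msrs = msrs_alloc();
    if (msrs) {
        rdmsr_on_cpus(cpu_online_mask, MSR_IA32_THERM_STATUS, msrs);
        msrs_free(msrs);
    }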
-#endif /* __ASSEMBLY__ */
-#endif /* __KERNEL__ */
+
+/* Compatibility wrappers: */
+#define rdmsrl(msr, val) rdmsrq(msr, val)
+#define wrmsrl(msr, val) wrmsrq(msr, val)
+#define rdmsrl_on_cpu(cpu, msr, q) rdmsrq_on_cpu(cpu, msr, q)
+
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_X86_MSR_H */
diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h
index a51ada8467de..76b95bd1a405 100644
--- a/arch/x86/include/asm/mtrr.h
+++ b/arch/x86/include/asm/mtrr.h
@@ -1,21 +1,8 @@
+/* SPDX-License-Identifier: LGPL-2.0+ */
/* Generic MTRR (Memory Type Range Register) ioctls.
Copyright (C) 1997-1999 Richard Gooch
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Library General Public
- License as published by the Free Software Foundation; either
- version 2 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Library General Public License for more details.
-
- You should have received a copy of the GNU Library General Public
- License along with this library; if not, write to the Free
- Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
Richard Gooch may be reached by email at rgooch@atnf.csiro.au
The postal address is:
Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia.
@@ -23,93 +10,44 @@
#ifndef _ASM_X86_MTRR_H
#define _ASM_X86_MTRR_H
-#include <linux/types.h>
-#include <linux/ioctl.h>
-#include <linux/errno.h>
+#include <linux/bits.h>
+#include <uapi/asm/mtrr.h>
-#define MTRR_IOCTL_BASE 'M'
+/* Defines for hardware MTRR registers. */
+#define MTRR_CAP_VCNT GENMASK(7, 0)
+#define MTRR_CAP_FIX BIT_MASK(8)
+#define MTRR_CAP_WC BIT_MASK(10)
-struct mtrr_sentry {
- unsigned long base; /* Base address */
- unsigned int size; /* Size of region */
- unsigned int type; /* Type of region */
-};
+#define MTRR_DEF_TYPE_TYPE GENMASK(7, 0)
+#define MTRR_DEF_TYPE_FE BIT_MASK(10)
+#define MTRR_DEF_TYPE_E BIT_MASK(11)
-/* Warning: this structure has a different order from i386
- on x86-64. The 32bit emulation code takes care of that.
- But you need to use this for 64bit, otherwise your X server
- will break. */
-
-#ifdef __i386__
-struct mtrr_gentry {
- unsigned int regnum; /* Register number */
- unsigned long base; /* Base address */
- unsigned int size; /* Size of region */
- unsigned int type; /* Type of region */
-};
+#define MTRR_DEF_TYPE_ENABLE (MTRR_DEF_TYPE_FE | MTRR_DEF_TYPE_E)
+#define MTRR_DEF_TYPE_DISABLE ~(MTRR_DEF_TYPE_TYPE | MTRR_DEF_TYPE_ENABLE)
-#else /* __i386__ */
-
-struct mtrr_gentry {
- unsigned long base; /* Base address */
- unsigned int size; /* Size of region */
- unsigned int regnum; /* Register number */
- unsigned int type; /* Type of region */
-};
-#endif /* !__i386__ */
-
-struct mtrr_var_range {
- __u32 base_lo;
- __u32 base_hi;
- __u32 mask_lo;
- __u32 mask_hi;
-};
+#define MTRR_PHYSBASE_TYPE GENMASK(7, 0)
+#define MTRR_PHYSBASE_RSVD GENMASK(11, 8)
-/* In the Intel processor's MTRR interface, the MTRR type is always held in
- an 8 bit field: */
-typedef __u8 mtrr_type;
-
-#define MTRR_NUM_FIXED_RANGES 88
-#define MTRR_MAX_VAR_RANGES 256
+#define MTRR_PHYSMASK_RSVD GENMASK(10, 0)
+#define MTRR_PHYSMASK_V BIT_MASK(11)
struct mtrr_state_type {
struct mtrr_var_range var_ranges[MTRR_MAX_VAR_RANGES];
mtrr_type fixed_ranges[MTRR_NUM_FIXED_RANGES];
unsigned char enabled;
- unsigned char have_fixed;
+ bool have_fixed;
mtrr_type def_type;
};
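The GENMASK()/BIT_MASK() based definitions above replace the removed open-coded constants; a hedged sketch of decoding MSR_MTRRdefType with them (the printout is illustrative):

    u64 def_type;

    rdmsrq(MSR_MTRRdefType, def_type);

    if (def_type & MTRR_DEF_TYPE_E)          /* bit 11: MTRRs enabled */
        pr_info("default memory type: %llu\n", def_type & MTRR_DEF_TYPE_TYPE);
    if (def_type & MTRR_DEF_TYPE_FE)         /* bit 10: fixed ranges enabled */
        pr_info("fixed-range MTRRs enabled\n");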
-#define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg))
-#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1)
-
-/* These are the various ioctls */
-#define MTRRIOC_ADD_ENTRY _IOW(MTRR_IOCTL_BASE, 0, struct mtrr_sentry)
-#define MTRRIOC_SET_ENTRY _IOW(MTRR_IOCTL_BASE, 1, struct mtrr_sentry)
-#define MTRRIOC_DEL_ENTRY _IOW(MTRR_IOCTL_BASE, 2, struct mtrr_sentry)
-#define MTRRIOC_GET_ENTRY _IOWR(MTRR_IOCTL_BASE, 3, struct mtrr_gentry)
-#define MTRRIOC_KILL_ENTRY _IOW(MTRR_IOCTL_BASE, 4, struct mtrr_sentry)
-#define MTRRIOC_ADD_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 5, struct mtrr_sentry)
-#define MTRRIOC_SET_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 6, struct mtrr_sentry)
-#define MTRRIOC_DEL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 7, struct mtrr_sentry)
-#define MTRRIOC_GET_PAGE_ENTRY _IOWR(MTRR_IOCTL_BASE, 8, struct mtrr_gentry)
-#define MTRRIOC_KILL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 9, struct mtrr_sentry)
-
-/* These are the region types */
-#define MTRR_TYPE_UNCACHABLE 0
-#define MTRR_TYPE_WRCOMB 1
-/*#define MTRR_TYPE_ 2*/
-/*#define MTRR_TYPE_ 3*/
-#define MTRR_TYPE_WRTHROUGH 4
-#define MTRR_TYPE_WRPROT 5
-#define MTRR_TYPE_WRBACK 6
-#define MTRR_NUM_TYPES 7
-
-#ifdef __KERNEL__
-
-/* The following functions are for use by other drivers */
+/*
+ * The following functions are for use by other drivers that cannot use
+ * arch_phys_wc_add and arch_phys_wc_del.
+ */
# ifdef CONFIG_MTRR
-extern u8 mtrr_type_lookup(u64 addr, u64 end);
+void mtrr_bp_init(void);
+void guest_force_mtrr_state(struct mtrr_var_range *var, unsigned int num_var,
+ mtrr_type def_type);
+extern u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform);
extern void mtrr_save_fixed_ranges(void *);
extern void mtrr_save_state(void);
extern int mtrr_add(unsigned long base, unsigned long size,
@@ -118,18 +56,27 @@ extern int mtrr_add_page(unsigned long base, unsigned long size,
unsigned int type, bool increment);
extern int mtrr_del(int reg, unsigned long base, unsigned long size);
extern int mtrr_del_page(int reg, unsigned long base, unsigned long size);
-extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi);
-extern void mtrr_ap_init(void);
-extern void mtrr_bp_init(void);
extern int mtrr_trim_uncached_memory(unsigned long end_pfn);
extern int amd_special_default_mtrr(void);
+void mtrr_disable(void);
+void mtrr_enable(void);
+void mtrr_generic_set_state(void);
# else
-static inline u8 mtrr_type_lookup(u64 addr, u64 end)
+static inline void guest_force_mtrr_state(struct mtrr_var_range *var,
+ unsigned int num_var,
+ mtrr_type def_type)
+{
+}
+
+static inline u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform)
{
/*
- * Return no-MTRRs:
+ * Return the default MTRR type, without any known other types in
+ * that range.
*/
- return 0xff;
+ *uniform = 1;
+
+ return MTRR_TYPE_UNCACHABLE;
}
#define mtrr_save_fixed_ranges(arg) do {} while (0)
#define mtrr_save_state() do {} while (0)
@@ -155,12 +102,10 @@ static inline int mtrr_trim_uncached_memory(unsigned long end_pfn)
{
return 0;
}
-static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
-{
-}
-
-#define mtrr_ap_init() do {} while (0)
#define mtrr_bp_init() do {} while (0)
+#define mtrr_disable() do {} while (0)
+#define mtrr_enable() do {} while (0)
+#define mtrr_generic_set_state() do {} while (0)
# endif
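As the comment above says, these exports are only for drivers that cannot use arch_phys_wc_add()/arch_phys_wc_del(); a hedged sketch of the classic pairing (base address and size are illustrative):

    int reg;

    /* Mark a 1 MiB framebuffer aperture write-combining. */
    reg = mtrr_add(0xd0000000, 0x100000, MTRR_TYPE_WRCOMB, true);

    /* ... later, on teardown: */
    if (reg >= 0)
        mtrr_del(reg, 0xd0000000, 0x100000);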
#ifdef CONFIG_COMPAT
@@ -194,6 +139,9 @@ struct mtrr_gentry32 {
_IOW(MTRR_IOCTL_BASE, 9, struct mtrr_sentry32)
#endif /* CONFIG_COMPAT */
-#endif /* __KERNEL__ */
+/* Bit fields for enabled in struct mtrr_state_type */
+#define MTRR_STATE_SHIFT 10
+#define MTRR_STATE_MTRR_FIXED_ENABLED (MTRR_DEF_TYPE_FE >> MTRR_STATE_SHIFT)
+#define MTRR_STATE_MTRR_ENABLED (MTRR_DEF_TYPE_E >> MTRR_STATE_SHIFT)
#endif /* _ASM_X86_MTRR_H */
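The MTRR_STATE_* values just above are MSR_MTRRdefType bits 11:10 shifted down into the 8-bit 'enabled' field of struct mtrr_state_type; a sketch of the intended derivation, assuming lo already holds the low half of that MSR and mtrr_state is the state instance:

    mtrr_state.enabled = (lo & MTRR_DEF_TYPE_ENABLE) >> MTRR_STATE_SHIFT;

    if (mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED)
        pr_debug("variable-range MTRRs active\n");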
diff --git a/arch/x86/include/asm/mutex.h b/arch/x86/include/asm/mutex.h
deleted file mode 100644
index a731b9c573a6..000000000000
--- a/arch/x86/include/asm/mutex.h
+++ /dev/null
@@ -1,5 +0,0 @@
-#ifdef CONFIG_X86_32
-# include "mutex_32.h"
-#else
-# include "mutex_64.h"
-#endif
diff --git a/arch/x86/include/asm/mutex_32.h b/arch/x86/include/asm/mutex_32.h
deleted file mode 100644
index 03f90c8a5a7c..000000000000
--- a/arch/x86/include/asm/mutex_32.h
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * Assembly implementation of the mutex fastpath, based on atomic
- * decrement/increment.
- *
- * started by Ingo Molnar:
- *
- * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- */
-#ifndef _ASM_X86_MUTEX_32_H
-#define _ASM_X86_MUTEX_32_H
-
-#include <asm/alternative.h>
-
-/**
- * __mutex_fastpath_lock - try to take the lock by moving the count
- * from 1 to a 0 value
- * @count: pointer of type atomic_t
- * @fn: function to call if the original value was not 1
- *
- * Change the count from 1 to a value lower than 1, and call <fn> if it
- * wasn't 1 originally. This function MUST leave the value lower than 1
- * even when the "1" assertion wasn't true.
- */
-#define __mutex_fastpath_lock(count, fail_fn) \
-do { \
- unsigned int dummy; \
- \
- typecheck(atomic_t *, count); \
- typecheck_fn(void (*)(atomic_t *), fail_fn); \
- \
- asm volatile(LOCK_PREFIX " decl (%%eax)\n" \
- " jns 1f \n" \
- " call " #fail_fn "\n" \
- "1:\n" \
- : "=a" (dummy) \
- : "a" (count) \
- : "memory", "ecx", "edx"); \
-} while (0)
-
-
-/**
- * __mutex_fastpath_lock_retval - try to take the lock by moving the count
- * from 1 to a 0 value
- * @count: pointer of type atomic_t
- * @fail_fn: function to call if the original value was not 1
- *
- * Change the count from 1 to a value lower than 1, and call <fail_fn> if it
- * wasn't 1 originally. This function returns 0 if the fastpath succeeds,
- * or anything the slow path function returns
- */
-static inline int __mutex_fastpath_lock_retval(atomic_t *count,
- int (*fail_fn)(atomic_t *))
-{
- if (unlikely(atomic_dec_return(count) < 0))
- return fail_fn(count);
- else
- return 0;
-}
-
-/**
- * __mutex_fastpath_unlock - try to promote the mutex from 0 to 1
- * @count: pointer of type atomic_t
- * @fail_fn: function to call if the original value was not 0
- *
- * try to promote the mutex from 0 to 1. if it wasn't 0, call <fail_fn>.
- * In the failure case, this function is allowed to either set the value
- * to 1, or to set it to a value lower than 1.
- *
- * If the implementation sets it to a value of lower than 1, the
- * __mutex_slowpath_needs_to_unlock() macro needs to return 1, it needs
- * to return 0 otherwise.
- */
-#define __mutex_fastpath_unlock(count, fail_fn) \
-do { \
- unsigned int dummy; \
- \
- typecheck(atomic_t *, count); \
- typecheck_fn(void (*)(atomic_t *), fail_fn); \
- \
- asm volatile(LOCK_PREFIX " incl (%%eax)\n" \
- " jg 1f\n" \
- " call " #fail_fn "\n" \
- "1:\n" \
- : "=a" (dummy) \
- : "a" (count) \
- : "memory", "ecx", "edx"); \
-} while (0)
-
-#define __mutex_slowpath_needs_to_unlock() 1
-
-/**
- * __mutex_fastpath_trylock - try to acquire the mutex, without waiting
- *
- * @count: pointer of type atomic_t
- * @fail_fn: fallback function
- *
- * Change the count from 1 to a value lower than 1, and return 0 (failure)
- * if it wasn't 1 originally, or return 1 (success) otherwise. This function
- * MUST leave the value lower than 1 even when the "1" assertion wasn't true.
- * Additionally, if the value was < 0 originally, this function must not leave
- * it to 0 on failure.
- */
-static inline int __mutex_fastpath_trylock(atomic_t *count,
- int (*fail_fn)(atomic_t *))
-{
- /*
- * We have two variants here. The cmpxchg based one is the best one
- * because it never induce a false contention state. It is included
- * here because architectures using the inc/dec algorithms over the
- * xchg ones are much more likely to support cmpxchg natively.
- *
- * If not we fall back to the spinlock based variant - that is
- * just as efficient (and simpler) as a 'destructive' probing of
- * the mutex state would be.
- */
-#ifdef __HAVE_ARCH_CMPXCHG
- if (likely(atomic_cmpxchg(count, 1, 0) == 1))
- return 1;
- return 0;
-#else
- return fail_fn(count);
-#endif
-}
-
-#endif /* _ASM_X86_MUTEX_32_H */
diff --git a/arch/x86/include/asm/mutex_64.h b/arch/x86/include/asm/mutex_64.h
deleted file mode 100644
index 68a87b0f8e29..000000000000
--- a/arch/x86/include/asm/mutex_64.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Assembly implementation of the mutex fastpath, based on atomic
- * decrement/increment.
- *
- * started by Ingo Molnar:
- *
- * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- */
-#ifndef _ASM_X86_MUTEX_64_H
-#define _ASM_X86_MUTEX_64_H
-
-/**
- * __mutex_fastpath_lock - decrement and call function if negative
- * @v: pointer of type atomic_t
- * @fail_fn: function to call if the result is negative
- *
- * Atomically decrements @v and calls <fail_fn> if the result is negative.
- */
-#define __mutex_fastpath_lock(v, fail_fn) \
-do { \
- unsigned long dummy; \
- \
- typecheck(atomic_t *, v); \
- typecheck_fn(void (*)(atomic_t *), fail_fn); \
- \
- asm volatile(LOCK_PREFIX " decl (%%rdi)\n" \
- " jns 1f \n" \
- " call " #fail_fn "\n" \
- "1:" \
- : "=D" (dummy) \
- : "D" (v) \
- : "rax", "rsi", "rdx", "rcx", \
- "r8", "r9", "r10", "r11", "memory"); \
-} while (0)
-
-/**
- * __mutex_fastpath_lock_retval - try to take the lock by moving the count
- * from 1 to a 0 value
- * @count: pointer of type atomic_t
- * @fail_fn: function to call if the original value was not 1
- *
- * Change the count from 1 to a value lower than 1, and call <fail_fn> if
- * it wasn't 1 originally. This function returns 0 if the fastpath succeeds,
- * or anything the slow path function returns
- */
-static inline int __mutex_fastpath_lock_retval(atomic_t *count,
- int (*fail_fn)(atomic_t *))
-{
- if (unlikely(atomic_dec_return(count) < 0))
- return fail_fn(count);
- else
- return 0;
-}
-
-/**
- * __mutex_fastpath_unlock - increment and call function if nonpositive
- * @v: pointer of type atomic_t
- * @fail_fn: function to call if the result is nonpositive
- *
- * Atomically increments @v and calls <fail_fn> if the result is nonpositive.
- */
-#define __mutex_fastpath_unlock(v, fail_fn) \
-do { \
- unsigned long dummy; \
- \
- typecheck(atomic_t *, v); \
- typecheck_fn(void (*)(atomic_t *), fail_fn); \
- \
- asm volatile(LOCK_PREFIX " incl (%%rdi)\n" \
- " jg 1f\n" \
- " call " #fail_fn "\n" \
- "1:" \
- : "=D" (dummy) \
- : "D" (v) \
- : "rax", "rsi", "rdx", "rcx", \
- "r8", "r9", "r10", "r11", "memory"); \
-} while (0)
-
-#define __mutex_slowpath_needs_to_unlock() 1
-
-/**
- * __mutex_fastpath_trylock - try to acquire the mutex, without waiting
- *
- * @count: pointer of type atomic_t
- * @fail_fn: fallback function
- *
- * Change the count from 1 to 0 and return 1 (success), or return 0 (failure)
- * if it wasn't 1 originally. [the fallback function is never used on
- * x86_64, because all x86_64 CPUs have a CMPXCHG instruction.]
- */
-static inline int __mutex_fastpath_trylock(atomic_t *count,
- int (*fail_fn)(atomic_t *))
-{
- if (likely(atomic_cmpxchg(count, 1, 0) == 1))
- return 1;
- else
- return 0;
-}
-
-#endif /* _ASM_X86_MUTEX_64_H */
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
new file mode 100644
index 000000000000..e4815e15dc9a
--- /dev/null
+++ b/arch/x86/include/asm/mwait.h
@@ -0,0 +1,150 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_MWAIT_H
+#define _ASM_X86_MWAIT_H
+
+#include <linux/sched.h>
+#include <linux/sched/idle.h>
+
+#include <asm/cpufeature.h>
+#include <asm/nospec-branch.h>
+
+#define MWAIT_SUBSTATE_MASK 0xf
+#define MWAIT_CSTATE_MASK 0xf
+#define MWAIT_SUBSTATE_SIZE 4
+#define MWAIT_HINT2CSTATE(hint) (((hint) >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK)
+#define MWAIT_HINT2SUBSTATE(hint) ((hint) & MWAIT_CSTATE_MASK)
+#define MWAIT_C1_SUBSTATE_MASK 0xf0
+
+#define CPUID5_ECX_EXTENSIONS_SUPPORTED 0x1
+#define CPUID5_ECX_INTERRUPT_BREAK 0x2
+
+#define MWAIT_ECX_INTERRUPT_BREAK 0x1
+#define MWAITX_ECX_TIMER_ENABLE BIT(1)
+#define MWAITX_MAX_WAIT_CYCLES UINT_MAX
+#define MWAITX_DISABLE_CSTATES 0xf0
+#define TPAUSE_C01_STATE 1
+#define TPAUSE_C02_STATE 0
+
+static __always_inline void __monitor(const void *eax, u32 ecx, u32 edx)
+{
+ /*
+ * Use the instruction mnemonic with implicit operands, as the LLVM
+ * assembler fails to assemble the mnemonic with explicit operands:
+ */
+ asm volatile("monitor" :: "a" (eax), "c" (ecx), "d" (edx));
+}
+
+static __always_inline void __monitorx(const void *eax, u32 ecx, u32 edx)
+{
+ asm volatile("monitorx" :: "a" (eax), "c" (ecx), "d"(edx));
+}
+
+static __always_inline void __mwait(u32 eax, u32 ecx)
+{
+ /*
+ * Use the instruction mnemonic with implicit operands, as the LLVM
+ * assembler fails to assemble the mnemonic with explicit operands:
+ */
+ asm volatile("mwait" :: "a" (eax), "c" (ecx));
+}
+
+/*
+ * MWAITX allows for a timer expiration to get the core out of a wait state in
+ * addition to the default MWAIT exit condition of a store appearing at a
+ * monitored virtual address.
+ *
+ * Registers:
+ *
+ * MWAITX ECX[1]: enable timer if set
+ * MWAITX EBX[31:0]: max wait time expressed in SW P0 clocks. The software P0
+ * frequency is the same as the TSC frequency.
+ *
+ * Below is a comparison between MWAIT and MWAITX on AMD processors:
+ *
+ * MWAIT MWAITX
+ * opcode 0f 01 c9 | 0f 01 fb
+ * ECX[0] value of RFLAGS.IF seen by instruction
+ * ECX[1] unused/#GP if set | enable timer if set
+ * ECX[31:2] unused/#GP if set
+ * EAX unused (reserve for hint)
+ * EBX[31:0] unused | max wait time (P0 clocks)
+ *
+ * MONITOR MONITORX
+ * opcode 0f 01 c8 | 0f 01 fa
+ * EAX (logical) address to monitor
+ * ECX #GP if not zero
+ */
+static __always_inline void __mwaitx(u32 eax, u32 ebx, u32 ecx)
+{
+ /* No need for TSA buffer clearing on AMD */
+
+ asm volatile("mwaitx" :: "a" (eax), "b" (ebx), "c" (ecx));
+}
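A hedged sketch of the MONITORX/MWAITX pairing described in the table above, using the timer-enable bit for a bounded wait; the monitored address and helper name are illustrative:

    static void mwaitx_bounded_wait(const void *addr, u32 tsc_cycles)
    {
        __monitorx(addr, 0, 0);         /* arm the monitor on 'addr' */

        /*
         * EAX = 0xf0 disables C-state requests, EBX = max wait in SW P0
         * (TSC) clocks, ECX[1] enables the timer-based exit.
         */
        __mwaitx(MWAITX_DISABLE_CSTATES, tsc_cycles, MWAITX_ECX_TIMER_ENABLE);
    }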
+
+/*
+ * Re-enable interrupts right upon calling mwait in such a way that
+ * no interrupt can fire _before_ the execution of mwait, ie: no
+ * instruction must be placed between "sti" and "mwait".
+ *
+ * This is necessary because if an interrupt queues a timer before
+ * executing mwait, it would otherwise go unnoticed and the next tick
+ * would not be reprogrammed accordingly before mwait ever wakes up.
+ */
+static __always_inline void __sti_mwait(u32 eax, u32 ecx)
+{
+
+ asm volatile("sti; mwait" :: "a" (eax), "c" (ecx));
+}
+
+/*
+ * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
+ * which can obviate the IPI otherwise needed to trigger checking of need_resched.
+ * We execute MONITOR against need_resched and enter optimized wait state
+ * through MWAIT. Whenever someone changes need_resched, we would be woken
+ * up from MWAIT (without an IPI).
+ *
+ * New with Core Duo processors, MWAIT can take some hints based on CPU
+ * capability.
+ */
+static __always_inline void mwait_idle_with_hints(u32 eax, u32 ecx)
+{
+ if (need_resched())
+ return;
+
+ x86_idle_clear_cpu_buffers();
+
+ if (static_cpu_has_bug(X86_BUG_MONITOR) || !current_set_polling_and_test()) {
+ const void *addr = &current_thread_info()->flags;
+
+ alternative_input("", "clflush (%[addr])", X86_BUG_CLFLUSH_MONITOR, [addr] "a" (addr));
+ __monitor(addr, 0, 0);
+
+ if (need_resched())
+ goto out;
+
+ if (ecx & 1) {
+ __mwait(eax, ecx);
+ } else {
+ __sti_mwait(eax, ecx);
+ raw_local_irq_disable();
+ }
+ }
+
+out:
+ current_clr_polling();
+}
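The EAX hint follows the MONITOR/MWAIT extension encoding that MWAIT_HINT2CSTATE()/MWAIT_HINT2SUBSTATE() above decode: target C-state in bits 7:4, sub-state in bits 3:0. A hedged sketch of how an idle driver might call in (the hint value is illustrative):

    u32 hint = 1 << MWAIT_SUBSTATE_SIZE;    /* illustrative hint: bits 7:4 pick the C-state */
    u32 ecx  = MWAIT_ECX_INTERRUPT_BREAK;   /* wake on interrupt even with IF cleared */

    mwait_idle_with_hints(hint, ecx);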
+
+/*
+ * Caller can specify whether to enter C0.1 (low latency, less
+ * power saving) or C0.2 state (saves more power, but longer wakeup
+ * latency). This may be overridden by the IA32_UMWAIT_CONTROL MSR
+ * which can force requests for C0.2 to be downgraded to C0.1.
+ */
+static inline void __tpause(u32 ecx, u32 edx, u32 eax)
+{
+ /* "tpause %ecx" */
+ asm volatile(".byte 0x66, 0x0f, 0xae, 0xf1"
+ :: "c" (ecx), "d" (edx), "a" (eax));
+}
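TPAUSE takes an absolute TSC deadline in EDX:EAX; a hedged sketch of how a delay loop might use the helper above (the wrapper name is illustrative):

    static void tpause_wait_cycles(u64 cycles)
    {
        u64 until = rdtsc() + cycles;

        /* C0.2 saves more power; the MSR may demote the request to C0.1. */
        __tpause(TPAUSE_C02_STATE, upper_32_bits(until), lower_32_bits(until));
    }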
+
+#endif /* _ASM_X86_MWAIT_H */
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index c86e5ed4af51..79d88d12c8fb 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -1,80 +1,108 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_NMI_H
#define _ASM_X86_NMI_H
+#include <linux/irq_work.h>
#include <linux/pm.h>
#include <asm/irq.h>
#include <asm/io.h>
-#ifdef ARCH_HAS_NMI_WATCHDOG
+#ifdef CONFIG_X86_LOCAL_APIC
-/**
- * do_nmi_callback
- *
- * Check to see if a callback exists and execute it. Return 1
- * if the handler exists and was handled successfully.
- */
-int do_nmi_callback(struct pt_regs *regs, int cpu);
-
-extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
-extern int check_nmi_watchdog(void);
-extern int nmi_watchdog_enabled;
-extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
-extern int avail_to_resrv_perfctr_nmi(unsigned int);
extern int reserve_perfctr_nmi(unsigned int);
extern void release_perfctr_nmi(unsigned int);
extern int reserve_evntsel_nmi(unsigned int);
extern void release_evntsel_nmi(unsigned int);
-extern void setup_apic_nmi_watchdog(void *);
-extern void stop_apic_nmi_watchdog(void *);
-extern void disable_timer_nmi_watchdog(void);
-extern void enable_timer_nmi_watchdog(void);
-extern int nmi_watchdog_tick(struct pt_regs *regs, unsigned reason);
-extern void cpu_nmi_set_wd_enabled(void);
-
-extern atomic_t nmi_active;
-extern unsigned int nmi_watchdog;
-#define NMI_NONE 0
-#define NMI_IO_APIC 1
-#define NMI_LOCAL_APIC 2
-#define NMI_INVALID 3
-
-struct ctl_table;
-struct file;
-extern int proc_nmi_enabled(struct ctl_table *, int , struct file *,
- void __user *, size_t *, loff_t *);
+#endif /* CONFIG_X86_LOCAL_APIC */
+
extern int unknown_nmi_panic;
+extern int panic_on_unrecovered_nmi;
+extern int panic_on_io_nmi;
+
+/* NMI handler flags */
+#define NMI_FLAG_FIRST 1
+
+/**
+ * enum - NMI types.
+ * @NMI_LOCAL: Local NMI, CPU-specific NMI generated by the Local APIC.
+ * @NMI_UNKNOWN: Unknown NMI, the source of the NMI may not be identified.
+ * @NMI_SERR: System Error NMI, typically triggered by PCI errors.
+ * @NMI_IO_CHECK: I/O Check NMI, related to I/O errors.
+ * @NMI_MAX: Maximum value for NMI types.
+ *
+ * NMI types are used to categorize NMIs and to dispatch them to the
+ * appropriate handler.
+ */
+enum {
+ NMI_LOCAL=0,
+ NMI_UNKNOWN,
+ NMI_SERR,
+ NMI_IO_CHECK,
+ NMI_MAX
+};
+
+/* NMI handler return values */
+#define NMI_DONE 0
+#define NMI_HANDLED 1
+
+typedef int (*nmi_handler_t)(unsigned int, struct pt_regs *);
+
+struct nmiaction {
+ struct list_head list;
+ nmi_handler_t handler;
+ u64 max_duration;
+ unsigned long flags;
+ const char *name;
+};
+
+/**
+ * register_nmi_handler - Register a handler for a specific NMI type
+ * @t: NMI type (e.g. NMI_LOCAL)
+ * @fn: The NMI handler
+ * @fg: Flags associated with the NMI handler
+ * @n: Name of the NMI handler
+ * @init: Optional __init* attributes for struct nmiaction
+ *
+ * Adds the provided handler to the list of handlers for the specified
+ * NMI type. Handlers flagged with NMI_FLAG_FIRST are executed first.
+ *
+ * Sometimes the source of an NMI can't be reliably determined, in which
+ * case the NMI is tagged as "unknown". Register an additional handler
+ * using the NMI_UNKNOWN type to handle such cases; the caller then gets
+ * one last chance to assume responsibility for the NMI.
+ *
+ * Return: 0 on success, or an error code on failure.
+ */
+#define register_nmi_handler(t, fn, fg, n, init...) \
+({ \
+ static struct nmiaction init fn##_na = { \
+ .list = LIST_HEAD_INIT(fn##_na.list), \
+ .handler = (fn), \
+ .name = (n), \
+ .flags = (fg), \
+ }; \
+ __register_nmi_handler((t), &fn##_na); \
+})
+
+int __register_nmi_handler(unsigned int, struct nmiaction *);
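A hedged sketch of the registration flow documented above; the handler body, helper functions, and name are illustrative:

    static int my_nmi_handler(unsigned int type, struct pt_regs *regs)
    {
        if (!my_device_raised_nmi())    /* hypothetical ownership check */
            return NMI_DONE;            /* let the next handler look at it */

        my_device_ack_nmi();            /* hypothetical acknowledgement */
        return NMI_HANDLED;
    }

    /* At probe time: */
    register_nmi_handler(NMI_LOCAL, my_nmi_handler, 0, "my_driver");

    /* At teardown, looked up by name: */
    unregister_nmi_handler(NMI_LOCAL, "my_driver");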
+
+/**
+ * unregister_nmi_handler - Unregister a handler for a specific NMI type
+ * @type: NMI type (e.g. NMI_LOCAL)
+ * @name: Name of the NMI handler used during registration
+ *
+ * Removes the handler associated with the specified NMI type from the
+ * NMI handler list. The "name" is used as a lookup key to identify the
+ * handler.
+ */
+void unregister_nmi_handler(unsigned int type, const char *name);
+
+void set_emergency_nmi_handler(unsigned int type, nmi_handler_t handler);
-void __trigger_all_cpu_backtrace(void);
-#define trigger_all_cpu_backtrace() __trigger_all_cpu_backtrace()
-
-static inline void localise_nmi_watchdog(void)
-{
- if (nmi_watchdog == NMI_IO_APIC)
- nmi_watchdog = NMI_LOCAL_APIC;
-}
-
-/* check if nmi_watchdog is active (ie was specified at boot) */
-static inline int nmi_watchdog_active(void)
-{
- /*
- * actually it should be:
- * return (nmi_watchdog == NMI_LOCAL_APIC ||
- * nmi_watchdog == NMI_IO_APIC)
- * but since they are power of two we could use a
- * cheaper way --cvg
- */
- return nmi_watchdog & (NMI_LOCAL_APIC | NMI_IO_APIC);
-}
-#endif
-
-void lapic_watchdog_stop(void);
-int lapic_watchdog_init(unsigned nmi_hz);
-int lapic_wd_event(unsigned nmi_hz);
-unsigned lapic_adjust_nmi_hz(unsigned hz);
-void disable_lapic_nmi_watchdog(void);
-void enable_lapic_nmi_watchdog(void);
void stop_nmi(void);
void restart_nmi(void);
+void local_touch_nmi(void);
#endif /* _ASM_X86_NMI_H */
diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h
index ad2668ee1aa7..cd94221d8335 100644
--- a/arch/x86/include/asm/nops.h
+++ b/arch/x86/include/asm/nops.h
@@ -1,118 +1,89 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_NOPS_H
#define _ASM_X86_NOPS_H
-/* Define nops for use with alternative() */
+#include <asm/asm.h>
-/* generic versions from gas
- 1: nop
- the following instructions are NOT nops in 64-bit mode,
- for 64-bit mode use K8 or P6 nops instead
- 2: movl %esi,%esi
- 3: leal 0x00(%esi),%esi
- 4: leal 0x00(,%esi,1),%esi
- 6: leal 0x00000000(%esi),%esi
- 7: leal 0x00000000(,%esi,1),%esi
-*/
-#define GENERIC_NOP1 ".byte 0x90\n"
-#define GENERIC_NOP2 ".byte 0x89,0xf6\n"
-#define GENERIC_NOP3 ".byte 0x8d,0x76,0x00\n"
-#define GENERIC_NOP4 ".byte 0x8d,0x74,0x26,0x00\n"
-#define GENERIC_NOP5 GENERIC_NOP1 GENERIC_NOP4
-#define GENERIC_NOP6 ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n"
-#define GENERIC_NOP7 ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n"
-#define GENERIC_NOP8 GENERIC_NOP1 GENERIC_NOP7
+/*
+ * Define nops for use with alternative() and for tracing.
+ */
-/* Opteron 64bit nops
- 1: nop
- 2: osp nop
- 3: osp osp nop
- 4: osp osp osp nop
-*/
-#define K8_NOP1 GENERIC_NOP1
-#define K8_NOP2 ".byte 0x66,0x90\n"
-#define K8_NOP3 ".byte 0x66,0x66,0x90\n"
-#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n"
-#define K8_NOP5 K8_NOP3 K8_NOP2
-#define K8_NOP6 K8_NOP3 K8_NOP3
-#define K8_NOP7 K8_NOP4 K8_NOP3
-#define K8_NOP8 K8_NOP4 K8_NOP4
+#ifndef CONFIG_64BIT
-/* K7 nops
- uses eax dependencies (arbitary choice)
- 1: nop
- 2: movl %eax,%eax
- 3: leal (,%eax,1),%eax
- 4: leal 0x00(,%eax,1),%eax
- 6: leal 0x00000000(%eax),%eax
- 7: leal 0x00000000(,%eax,1),%eax
-*/
-#define K7_NOP1 GENERIC_NOP1
-#define K7_NOP2 ".byte 0x8b,0xc0\n"
-#define K7_NOP3 ".byte 0x8d,0x04,0x20\n"
-#define K7_NOP4 ".byte 0x8d,0x44,0x20,0x00\n"
-#define K7_NOP5 K7_NOP4 ASM_NOP1
-#define K7_NOP6 ".byte 0x8d,0x80,0,0,0,0\n"
-#define K7_NOP7 ".byte 0x8D,0x04,0x05,0,0,0,0\n"
-#define K7_NOP8 K7_NOP7 ASM_NOP1
+/*
+ * Generic 32bit nops from GAS:
+ *
+ * 1: nop
+ * 2: movl %esi,%esi
+ * 3: leal 0x0(%esi),%esi
+ * 4: leal 0x0(%esi,%eiz,1),%esi
+ * 5: leal %ds:0x0(%esi,%eiz,1),%esi
+ * 6: leal 0x0(%esi),%esi
+ * 7: leal 0x0(%esi,%eiz,1),%esi
+ * 8: leal %ds:0x0(%esi,%eiz,1),%esi
+ *
+ * Except 5 and 8, which are DS-prefixed 4 and 7 respectively, where GAS would emit 2
+ * nop instructions.
+ */
+#define BYTES_NOP1 0x90
+#define BYTES_NOP2 0x89,0xf6
+#define BYTES_NOP3 0x8d,0x76,0x00
+#define BYTES_NOP4 0x8d,0x74,0x26,0x00
+#define BYTES_NOP5 0x3e,BYTES_NOP4
+#define BYTES_NOP6 0x8d,0xb6,0x00,0x00,0x00,0x00
+#define BYTES_NOP7 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00
+#define BYTES_NOP8 0x3e,BYTES_NOP7
-/* P6 nops
- uses eax dependencies (Intel-recommended choice)
- 1: nop
- 2: osp nop
- 3: nopl (%eax)
- 4: nopl 0x00(%eax)
- 5: nopl 0x00(%eax,%eax,1)
- 6: osp nopl 0x00(%eax,%eax,1)
- 7: nopl 0x00000000(%eax)
- 8: nopl 0x00000000(%eax,%eax,1)
-*/
-#define P6_NOP1 GENERIC_NOP1
-#define P6_NOP2 ".byte 0x66,0x90\n"
-#define P6_NOP3 ".byte 0x0f,0x1f,0x00\n"
-#define P6_NOP4 ".byte 0x0f,0x1f,0x40,0\n"
-#define P6_NOP5 ".byte 0x0f,0x1f,0x44,0x00,0\n"
-#define P6_NOP6 ".byte 0x66,0x0f,0x1f,0x44,0x00,0\n"
-#define P6_NOP7 ".byte 0x0f,0x1f,0x80,0,0,0,0\n"
-#define P6_NOP8 ".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n"
+#define ASM_NOP_MAX 8
-#if defined(CONFIG_MK7)
-#define ASM_NOP1 K7_NOP1
-#define ASM_NOP2 K7_NOP2
-#define ASM_NOP3 K7_NOP3
-#define ASM_NOP4 K7_NOP4
-#define ASM_NOP5 K7_NOP5
-#define ASM_NOP6 K7_NOP6
-#define ASM_NOP7 K7_NOP7
-#define ASM_NOP8 K7_NOP8
-#elif defined(CONFIG_X86_P6_NOP)
-#define ASM_NOP1 P6_NOP1
-#define ASM_NOP2 P6_NOP2
-#define ASM_NOP3 P6_NOP3
-#define ASM_NOP4 P6_NOP4
-#define ASM_NOP5 P6_NOP5
-#define ASM_NOP6 P6_NOP6
-#define ASM_NOP7 P6_NOP7
-#define ASM_NOP8 P6_NOP8
-#elif defined(CONFIG_X86_64)
-#define ASM_NOP1 K8_NOP1
-#define ASM_NOP2 K8_NOP2
-#define ASM_NOP3 K8_NOP3
-#define ASM_NOP4 K8_NOP4
-#define ASM_NOP5 K8_NOP5
-#define ASM_NOP6 K8_NOP6
-#define ASM_NOP7 K8_NOP7
-#define ASM_NOP8 K8_NOP8
#else
-#define ASM_NOP1 GENERIC_NOP1
-#define ASM_NOP2 GENERIC_NOP2
-#define ASM_NOP3 GENERIC_NOP3
-#define ASM_NOP4 GENERIC_NOP4
-#define ASM_NOP5 GENERIC_NOP5
-#define ASM_NOP6 GENERIC_NOP6
-#define ASM_NOP7 GENERIC_NOP7
-#define ASM_NOP8 GENERIC_NOP8
-#endif
-#define ASM_NOP_MAX 8
+/*
+ * Generic 64bit nops from GAS:
+ *
+ * 1: nop
+ * 2: osp nop
+ * 3: nopl (%eax)
+ * 4: nopl 0x00(%eax)
+ * 5: nopl 0x00(%eax,%eax,1)
+ * 6: osp nopl 0x00(%eax,%eax,1)
+ * 7: nopl 0x00000000(%eax)
+ * 8: nopl 0x00000000(%eax,%eax,1)
+ * 9: cs nopl 0x00000000(%eax,%eax,1)
+ * 10: osp cs nopl 0x00000000(%eax,%eax,1)
+ * 11: osp osp cs nopl 0x00000000(%eax,%eax,1)
+ */
+#define BYTES_NOP1 0x90
+#define BYTES_NOP2 0x66,BYTES_NOP1
+#define BYTES_NOP3 0x0f,0x1f,0x00
+#define BYTES_NOP4 0x0f,0x1f,0x40,0x00
+#define BYTES_NOP5 0x0f,0x1f,0x44,0x00,0x00
+#define BYTES_NOP6 0x66,BYTES_NOP5
+#define BYTES_NOP7 0x0f,0x1f,0x80,0x00,0x00,0x00,0x00
+#define BYTES_NOP8 0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
+#define BYTES_NOP9 0x2e,BYTES_NOP8
+#define BYTES_NOP10 0x66,BYTES_NOP9
+#define BYTES_NOP11 0x66,BYTES_NOP10
+
+#define ASM_NOP9 _ASM_BYTES(BYTES_NOP9)
+#define ASM_NOP10 _ASM_BYTES(BYTES_NOP10)
+#define ASM_NOP11 _ASM_BYTES(BYTES_NOP11)
+
+#define ASM_NOP_MAX 11
+
+#endif /* CONFIG_64BIT */
+
+#define ASM_NOP1 _ASM_BYTES(BYTES_NOP1)
+#define ASM_NOP2 _ASM_BYTES(BYTES_NOP2)
+#define ASM_NOP3 _ASM_BYTES(BYTES_NOP3)
+#define ASM_NOP4 _ASM_BYTES(BYTES_NOP4)
+#define ASM_NOP5 _ASM_BYTES(BYTES_NOP5)
+#define ASM_NOP6 _ASM_BYTES(BYTES_NOP6)
+#define ASM_NOP7 _ASM_BYTES(BYTES_NOP7)
+#define ASM_NOP8 _ASM_BYTES(BYTES_NOP8)
+
+#ifndef __ASSEMBLER__
+extern const unsigned char * const x86_nops[];
+#endif
#endif /* _ASM_X86_NOPS_H */
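For C code, x86_nops[] is indexed by length: x86_nops[i] points at an i-byte NOP for i up to ASM_NOP_MAX. A hedged sketch, modelled on how instruction patching might pad a region (the helper name is illustrative):

    static void pad_with_nops(u8 *insn, unsigned int len)
    {
        while (len) {
            unsigned int n = min_t(unsigned int, len, ASM_NOP_MAX);

            memcpy(insn, x86_nops[n], n);
            insn += n;
            len  -= n;
        }
    }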
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
new file mode 100644
index 000000000000..4f4b5e8a1574
--- /dev/null
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -0,0 +1,626 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _ASM_X86_NOSPEC_BRANCH_H_
+#define _ASM_X86_NOSPEC_BRANCH_H_
+
+#include <linux/static_key.h>
+#include <linux/objtool.h>
+#include <linux/linkage.h>
+
+#include <asm/alternative.h>
+#include <asm/cpufeatures.h>
+#include <asm/msr-index.h>
+#include <asm/unwind_hints.h>
+#include <asm/percpu.h>
+
+/*
+ * Call depth tracking for Intel SKL CPUs to address the RSB underflow
+ * issue in software.
+ *
+ * The tracking does not use a counter. It uses arithmetic shift
+ * right on call entry and logical shift left on return.
+ *
+ * The depth tracking variable is initialized to 0x8000.... when the call
+ * depth is zero. The arithmetic shift right sign extends the MSB and
+ * saturates after the 12th call. The shift count is 5 for both directions
+ * so the tracking covers 12 nested calls.
+ *
+ * Call
+ * 0: 0x8000000000000000 0x0000000000000000
+ * 1: 0xfc00000000000000 0xf000000000000000
+ * ...
+ * 11: 0xfffffffffffffff8 0xfffffffffffffc00
+ * 12: 0xffffffffffffffff 0xffffffffffffffe0
+ *
+ * After a return buffer fill the depth is credited 12 calls before the
+ * next stuffing has to take place.
+ *
+ * There is an inaccuracy for situations like this:
+ *
+ * 10 calls
+ * 5 returns
+ * 3 calls
+ * 4 returns
+ * 3 calls
+ * ....
+ *
+ * The shift count might cause this to be off by one in either direction,
+ * but there is still a cushion vs. the RSB depth. The algorithm does not
+ * claim to be perfect and it can be speculated around by the CPU, but it
+ * is considered to obfuscate the problem enough to make exploitation
+ * extremely difficult.
+ */
+#define RET_DEPTH_SHIFT 5
+#define RSB_RET_STUFF_LOOPS 16
+#define RET_DEPTH_INIT 0x8000000000000000ULL
+#define RET_DEPTH_INIT_FROM_CALL 0xfc00000000000000ULL
+#define RET_DEPTH_CREDIT 0xffffffffffffffffULL
+
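A hedged, user-space style sketch of the saturating shift arithmetic the comment above walks through; it is not kernel code, just the same operations applied to a u64:

    u64 depth = RET_DEPTH_INIT;                     /* depth 0: 0x8000000000000000 */

    depth = (u64)((s64)depth >> RET_DEPTH_SHIFT);   /* a call: sign-extending shift right */
    depth <<= RET_DEPTH_SHIFT;                      /* a return: logical shift left */

    /* Repeated calls saturate the value toward RET_DEPTH_CREDIT (all ones). */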
+#ifdef CONFIG_CALL_THUNKS_DEBUG
+# define CALL_THUNKS_DEBUG_INC_CALLS \
+ incq PER_CPU_VAR(__x86_call_count);
+# define CALL_THUNKS_DEBUG_INC_RETS \
+ incq PER_CPU_VAR(__x86_ret_count);
+# define CALL_THUNKS_DEBUG_INC_STUFFS \
+ incq PER_CPU_VAR(__x86_stuffs_count);
+# define CALL_THUNKS_DEBUG_INC_CTXSW \
+ incq PER_CPU_VAR(__x86_ctxsw_count);
+#else
+# define CALL_THUNKS_DEBUG_INC_CALLS
+# define CALL_THUNKS_DEBUG_INC_RETS
+# define CALL_THUNKS_DEBUG_INC_STUFFS
+# define CALL_THUNKS_DEBUG_INC_CTXSW
+#endif
+
+#if defined(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) && !defined(COMPILE_OFFSETS)
+
+#include <asm/asm-offsets.h>
+
+#define CREDIT_CALL_DEPTH \
+ movq $-1, PER_CPU_VAR(__x86_call_depth);
+
+#define RESET_CALL_DEPTH \
+ xor %eax, %eax; \
+ bts $63, %rax; \
+ movq %rax, PER_CPU_VAR(__x86_call_depth);
+
+#define RESET_CALL_DEPTH_FROM_CALL \
+ movb $0xfc, %al; \
+ shl $56, %rax; \
+ movq %rax, PER_CPU_VAR(__x86_call_depth); \
+ CALL_THUNKS_DEBUG_INC_CALLS
+
+#define INCREMENT_CALL_DEPTH \
+ sarq $5, PER_CPU_VAR(__x86_call_depth); \
+ CALL_THUNKS_DEBUG_INC_CALLS
+
+#else
+#define CREDIT_CALL_DEPTH
+#define RESET_CALL_DEPTH
+#define RESET_CALL_DEPTH_FROM_CALL
+#define INCREMENT_CALL_DEPTH
+#endif
+
+/*
+ * Fill the CPU return stack buffer.
+ *
+ * Each entry in the RSB, if used for a speculative 'ret', contains an
+ * infinite 'pause; lfence; jmp' loop to capture speculative execution.
+ *
+ * This is required in various cases for retpoline and IBRS-based
+ * mitigations for the Spectre variant 2 vulnerability. Sometimes to
+ * eliminate potentially bogus entries from the RSB, and sometimes
+ * purely to ensure that it doesn't get empty, which on some CPUs would
+ * allow predictions from other (unwanted!) sources to be used.
+ *
+ * We define a CPP macro such that it can be used from both .S files and
+ * inline assembly. It's possible to do a .macro and then include that
+ * from C via asm(".include <asm/nospec-branch.h>") but let's not go there.
+ */
+
+#define RETPOLINE_THUNK_SIZE 32
+#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */
+
+/*
+ * Common helper for __FILL_RETURN_BUFFER and __FILL_ONE_RETURN.
+ */
+#define __FILL_RETURN_SLOT \
+ ANNOTATE_INTRA_FUNCTION_CALL; \
+ call 772f; \
+ int3; \
+772:
+
+/*
+ * Stuff the entire RSB.
+ *
+ * Google experimented with loop-unrolling and this turned out to be
+ * the optimal version - two calls, each with their own speculation
+ * trap should their return address end up getting used, in a loop.
+ */
+#ifdef CONFIG_X86_64
+#define __FILL_RETURN_BUFFER(reg, nr) \
+ mov $(nr/2), reg; \
+771: \
+ __FILL_RETURN_SLOT \
+ __FILL_RETURN_SLOT \
+ add $(BITS_PER_LONG/8) * 2, %_ASM_SP; \
+ dec reg; \
+ jnz 771b; \
+ /* barrier for jnz misprediction */ \
+ lfence; \
+ CREDIT_CALL_DEPTH \
+ CALL_THUNKS_DEBUG_INC_CTXSW
+#else
+/*
+ * i386 doesn't unconditionally have LFENCE, as such it can't
+ * do a loop.
+ */
+#define __FILL_RETURN_BUFFER(reg, nr) \
+ .rept nr; \
+ __FILL_RETURN_SLOT; \
+ .endr; \
+ add $(BITS_PER_LONG/8) * nr, %_ASM_SP;
+#endif
+
+/*
+ * Stuff a single RSB slot.
+ *
+ * To mitigate Post-Barrier RSB speculation, one CALL instruction must be
+ * forced to retire before letting a RET instruction execute.
+ *
+ * On PBRSB-vulnerable CPUs, it is not safe for a RET to be executed
+ * before this point.
+ */
+#define __FILL_ONE_RETURN \
+ __FILL_RETURN_SLOT \
+ add $(BITS_PER_LONG/8), %_ASM_SP; \
+ lfence;
+
+#ifdef __ASSEMBLER__
+
+/*
+ * (ab)use RETPOLINE_SAFE on RET to annotate away 'bare' RET instructions
+ * vs RETBleed validation.
+ */
+#define ANNOTATE_UNRET_SAFE ANNOTATE_RETPOLINE_SAFE
+
+/*
+ * Abuse ANNOTATE_RETPOLINE_SAFE on a NOP to indicate UNRET_END, should
+ * eventually turn into its own annotation.
+ */
+.macro VALIDATE_UNRET_END
+#if defined(CONFIG_NOINSTR_VALIDATION) && \
+ (defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO))
+ ANNOTATE_RETPOLINE_SAFE
+ nop
+#endif
+.endm
+
+/*
+ * Emits a conditional CS prefix that is compatible with
+ * -mindirect-branch-cs-prefix.
+ */
+.macro __CS_PREFIX reg:req
+ .irp rs,r8,r9,r10,r11,r12,r13,r14,r15
+ .ifc \reg,\rs
+ .byte 0x2e
+ .endif
+ .endr
+.endm
+
+/*
+ * JMP_NOSPEC and CALL_NOSPEC macros can be used instead of a simple
+ * indirect jmp/call which may be susceptible to the Spectre variant 2
+ * attack.
+ *
+ * NOTE: these do not take kCFI into account and are thus not comparable to C
+ * indirect calls, take care when using. The target of these should be an ENDBR
+ * instruction irrespective of kCFI.
+ */
+.macro JMP_NOSPEC reg:req
+#ifdef CONFIG_MITIGATION_RETPOLINE
+ __CS_PREFIX \reg
+ jmp __x86_indirect_thunk_\reg
+#else
+ jmp *%\reg
+ int3
+#endif
+.endm
+
+.macro CALL_NOSPEC reg:req
+#ifdef CONFIG_MITIGATION_RETPOLINE
+ __CS_PREFIX \reg
+ call __x86_indirect_thunk_\reg
+#else
+ call *%\reg
+#endif
+.endm
+
+ /*
+ * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP
+ * monstrosity above manually.
+ */
+.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req ftr2=ALT_NOT(X86_FEATURE_ALWAYS)
+ ALTERNATIVE_2 "jmp .Lskip_rsb_\@", \
+ __stringify(__FILL_RETURN_BUFFER(\reg,\nr)), \ftr, \
+ __stringify(nop;nop;__FILL_ONE_RETURN), \ftr2
+
+.Lskip_rsb_\@:
+.endm
+
+/*
+ * The CALL to srso_alias_untrain_ret() must be patched in directly at
+ * the spot where untraining must be done, ie., srso_alias_untrain_ret()
+ * must be the target of a CALL instruction instead of indirectly
+ * jumping to a wrapper which then calls it. Therefore, this macro is
+ * called outside of __UNTRAIN_RET below, for the time being, before the
+ * kernel can support nested alternatives with arbitrary nesting.
+ */
+.macro CALL_UNTRAIN_RET
+#if defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO)
+ ALTERNATIVE_2 "", "call entry_untrain_ret", X86_FEATURE_UNRET, \
+ "call srso_alias_untrain_ret", X86_FEATURE_SRSO_ALIAS
+#endif
+.endm
+
+/*
+ * Mitigate RETBleed for AMD/Hygon Zen uarch. Requires KERNEL CR3 because the
+ * return thunk isn't mapped into the userspace tables (then again, AMD
+ * typically has NO_MELTDOWN).
+ *
+ * While retbleed_untrain_ret() doesn't clobber anything but requires stack,
+ * write_ibpb() will clobber AX, CX, DX.
+ *
+ * As such, this must be placed after every *SWITCH_TO_KERNEL_CR3 at a point
+ * where we have a stack but before any RET instruction.
+ */
+.macro __UNTRAIN_RET ibpb_feature, call_depth_insns
+#if defined(CONFIG_MITIGATION_RETHUNK) || defined(CONFIG_MITIGATION_IBPB_ENTRY)
+ VALIDATE_UNRET_END
+ CALL_UNTRAIN_RET
+ ALTERNATIVE_2 "", \
+ "call write_ibpb", \ibpb_feature, \
+ __stringify(\call_depth_insns), X86_FEATURE_CALL_DEPTH
+#endif
+.endm
+
+#define UNTRAIN_RET \
+ __UNTRAIN_RET X86_FEATURE_ENTRY_IBPB, __stringify(RESET_CALL_DEPTH)
+
+#define UNTRAIN_RET_VM \
+ __UNTRAIN_RET X86_FEATURE_IBPB_ON_VMEXIT, __stringify(RESET_CALL_DEPTH)
+
+#define UNTRAIN_RET_FROM_CALL \
+ __UNTRAIN_RET X86_FEATURE_ENTRY_IBPB, __stringify(RESET_CALL_DEPTH_FROM_CALL)
+
+
+.macro CALL_DEPTH_ACCOUNT
+#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING
+ ALTERNATIVE "", \
+ __stringify(INCREMENT_CALL_DEPTH), X86_FEATURE_CALL_DEPTH
+#endif
+.endm
+
+/*
+ * Macro to execute VERW insns that mitigate transient data sampling
+ * attacks such as MDS or TSA. On affected systems a microcode update
+ * overloaded VERW insns to also clear the CPU buffers. VERW clobbers
+ * CFLAGS.ZF.
+ * Note: Only the memory operand variant of VERW clears the CPU buffers.
+ */
+#ifdef CONFIG_X86_64
+#define VERW verw x86_verw_sel(%rip)
+#else
+/*
+ * In 32bit mode, the memory operand must be a %cs reference. The data segments
+ * may not be usable (vm86 mode), and the stack segment may not be flat (ESPFIX32).
+ */
+#define VERW verw %cs:x86_verw_sel
+#endif
+
+/*
+ * Provide a stringified VERW macro for simple usage, and a non-stringified
+ * VERW macro for use in more elaborate sequences, e.g. to encode a conditional
+ * VERW within an ALTERNATIVE.
+ */
+#define __CLEAR_CPU_BUFFERS __stringify(VERW)
+
+/* If necessary, emit VERW on exit-to-userspace to clear CPU buffers. */
+#define CLEAR_CPU_BUFFERS \
+ ALTERNATIVE "", __CLEAR_CPU_BUFFERS, X86_FEATURE_CLEAR_CPU_BUF
+
+#ifdef CONFIG_X86_64
+.macro CLEAR_BRANCH_HISTORY
+ ALTERNATIVE "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_LOOP
+.endm
+
+.macro CLEAR_BRANCH_HISTORY_VMEXIT
+ ALTERNATIVE "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_VMEXIT
+.endm
+#else
+#define CLEAR_BRANCH_HISTORY
+#define CLEAR_BRANCH_HISTORY_VMEXIT
+#endif
+
+#else /* __ASSEMBLER__ */
+
+#define ITS_THUNK_SIZE 64
+
+typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE];
+typedef u8 its_thunk_t[ITS_THUNK_SIZE];
+extern retpoline_thunk_t __x86_indirect_thunk_array[];
+extern retpoline_thunk_t __x86_indirect_call_thunk_array[];
+extern retpoline_thunk_t __x86_indirect_jump_thunk_array[];
+extern its_thunk_t __x86_indirect_its_thunk_array[];
+
+#ifdef CONFIG_MITIGATION_RETHUNK
+extern void __x86_return_thunk(void);
+#else
+static inline void __x86_return_thunk(void) {}
+#endif
+
+#ifdef CONFIG_MITIGATION_UNRET_ENTRY
+extern void retbleed_return_thunk(void);
+#else
+static inline void retbleed_return_thunk(void) {}
+#endif
+
+extern void srso_alias_untrain_ret(void);
+
+#ifdef CONFIG_MITIGATION_SRSO
+extern void srso_return_thunk(void);
+extern void srso_alias_return_thunk(void);
+#else
+static inline void srso_return_thunk(void) {}
+static inline void srso_alias_return_thunk(void) {}
+#endif
+
+#ifdef CONFIG_MITIGATION_ITS
+extern void its_return_thunk(void);
+#else
+static inline void its_return_thunk(void) {}
+#endif
+
+extern void retbleed_return_thunk(void);
+extern void srso_return_thunk(void);
+extern void srso_alias_return_thunk(void);
+
+extern void entry_untrain_ret(void);
+extern void write_ibpb(void);
+
+#ifdef CONFIG_X86_64
+extern void clear_bhb_loop(void);
+#endif
+
+extern void (*x86_return_thunk)(void);
+
+extern void __warn_thunk(void);
+
+#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING
+extern void call_depth_return_thunk(void);
+
+#define CALL_DEPTH_ACCOUNT \
+ ALTERNATIVE("", \
+ __stringify(INCREMENT_CALL_DEPTH), \
+ X86_FEATURE_CALL_DEPTH)
+
+DECLARE_PER_CPU_CACHE_HOT(u64, __x86_call_depth);
+
+#ifdef CONFIG_CALL_THUNKS_DEBUG
+DECLARE_PER_CPU(u64, __x86_call_count);
+DECLARE_PER_CPU(u64, __x86_ret_count);
+DECLARE_PER_CPU(u64, __x86_stuffs_count);
+DECLARE_PER_CPU(u64, __x86_ctxsw_count);
+#endif
+#else /* !CONFIG_MITIGATION_CALL_DEPTH_TRACKING */
+
+static inline void call_depth_return_thunk(void) {}
+#define CALL_DEPTH_ACCOUNT ""
+
+#endif /* CONFIG_MITIGATION_CALL_DEPTH_TRACKING */
+
+#ifdef CONFIG_MITIGATION_RETPOLINE
+
+#define GEN(reg) \
+ extern retpoline_thunk_t __x86_indirect_thunk_ ## reg;
+#include <asm/GEN-for-each-reg.h>
+#undef GEN
+
+#define GEN(reg) \
+ extern retpoline_thunk_t __x86_indirect_call_thunk_ ## reg;
+#include <asm/GEN-for-each-reg.h>
+#undef GEN
+
+#define GEN(reg) \
+ extern retpoline_thunk_t __x86_indirect_jump_thunk_ ## reg;
+#include <asm/GEN-for-each-reg.h>
+#undef GEN
+
+#ifdef CONFIG_X86_64
+
+/*
+ * Emits a conditional CS prefix that is compatible with
+ * -mindirect-branch-cs-prefix.
+ */
+#define __CS_PREFIX(reg) \
+ ".irp rs,r8,r9,r10,r11,r12,r13,r14,r15\n" \
+ ".ifc \\rs," reg "\n" \
+ ".byte 0x2e\n" \
+ ".endif\n" \
+ ".endr\n"
+
+/*
+ * Inline asm uses the %V modifier which is only in newer GCC
+ * which is ensured when CONFIG_MITIGATION_RETPOLINE is defined.
+ */
+#define CALL_NOSPEC __CS_PREFIX("%V[thunk_target]") \
+ "call __x86_indirect_thunk_%V[thunk_target]\n"
+
+# define THUNK_TARGET(addr) [thunk_target] "r" (addr)
+
+#else /* CONFIG_X86_32 */
+/*
+ * For i386 we use the original ret-equivalent retpoline, because
+ * otherwise we'll run out of registers. We don't care about CET
+ * here, anyway.
+ */
+# define CALL_NOSPEC \
+ ALTERNATIVE_2( \
+ ANNOTATE_RETPOLINE_SAFE "\n" \
+ "call *%[thunk_target]\n", \
+ " jmp 904f;\n" \
+ " .align 16\n" \
+ "901: call 903f;\n" \
+ "902: pause;\n" \
+ " lfence;\n" \
+ " jmp 902b;\n" \
+ " .align 16\n" \
+ "903: lea 4(%%esp), %%esp;\n" \
+ " pushl %[thunk_target];\n" \
+ " ret;\n" \
+ " .align 16\n" \
+ "904: call 901b;\n", \
+ X86_FEATURE_RETPOLINE, \
+ "lfence;\n" \
+ ANNOTATE_RETPOLINE_SAFE "\n" \
+ "call *%[thunk_target]\n", \
+ X86_FEATURE_RETPOLINE_LFENCE)
+
+# define THUNK_TARGET(addr) [thunk_target] "rm" (addr)
+#endif
+#else /* No retpoline for C / inline asm */
+# define CALL_NOSPEC "call *%[thunk_target]\n"
+# define THUNK_TARGET(addr) [thunk_target] "rm" (addr)
+#endif
+
+/* The Spectre V2 mitigation variants */
+enum spectre_v2_mitigation {
+ SPECTRE_V2_NONE,
+ SPECTRE_V2_RETPOLINE,
+ SPECTRE_V2_LFENCE,
+ SPECTRE_V2_EIBRS,
+ SPECTRE_V2_EIBRS_RETPOLINE,
+ SPECTRE_V2_EIBRS_LFENCE,
+ SPECTRE_V2_IBRS,
+};
+
+/* The indirect branch speculation control variants */
+enum spectre_v2_user_mitigation {
+ SPECTRE_V2_USER_NONE,
+ SPECTRE_V2_USER_STRICT,
+ SPECTRE_V2_USER_STRICT_PREFERRED,
+ SPECTRE_V2_USER_PRCTL,
+ SPECTRE_V2_USER_SECCOMP,
+};
+
+/* The Speculative Store Bypass disable variants */
+enum ssb_mitigation {
+ SPEC_STORE_BYPASS_NONE,
+ SPEC_STORE_BYPASS_AUTO,
+ SPEC_STORE_BYPASS_DISABLE,
+ SPEC_STORE_BYPASS_PRCTL,
+ SPEC_STORE_BYPASS_SECCOMP,
+};
+
+static __always_inline
+void alternative_msr_write(unsigned int msr, u64 val, unsigned int feature)
+{
+ asm volatile(ALTERNATIVE("", "wrmsr", %c[feature])
+ : : "c" (msr),
+ "a" ((u32)val),
+ "d" ((u32)(val >> 32)),
+ [feature] "i" (feature)
+ : "memory");
+}
+
+DECLARE_PER_CPU(bool, x86_ibpb_exit_to_user);
+
+static inline void indirect_branch_prediction_barrier(void)
+{
+ asm_inline volatile(ALTERNATIVE("", "call write_ibpb", X86_FEATURE_IBPB)
+ : ASM_CALL_CONSTRAINT
+ :: "rax", "rcx", "rdx", "memory");
+}
+
+/* The Intel SPEC CTRL MSR base value cache */
+extern u64 x86_spec_ctrl_base;
+DECLARE_PER_CPU(u64, x86_spec_ctrl_current);
+extern void update_spec_ctrl_cond(u64 val);
+extern u64 spec_ctrl_current(void);
+
+/*
+ * With retpoline, we must use IBRS to restrict branch prediction
+ * before calling into firmware.
+ *
+ * (Implemented as CPP macros due to header hell.)
+ */
+#define firmware_restrict_branch_speculation_start() \
+do { \
+ preempt_disable(); \
+ alternative_msr_write(MSR_IA32_SPEC_CTRL, \
+ spec_ctrl_current() | SPEC_CTRL_IBRS, \
+ X86_FEATURE_USE_IBRS_FW); \
+ alternative_msr_write(MSR_IA32_PRED_CMD, PRED_CMD_IBPB, \
+ X86_FEATURE_USE_IBPB_FW); \
+} while (0)
+
+#define firmware_restrict_branch_speculation_end() \
+do { \
+ alternative_msr_write(MSR_IA32_SPEC_CTRL, \
+ spec_ctrl_current(), \
+ X86_FEATURE_USE_IBRS_FW); \
+ preempt_enable(); \
+} while (0)
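A hedged sketch of the intended bracketing around a firmware entry point; the callee is hypothetical, and note that preemption stays disabled across the call because the start macro disables it:

    firmware_restrict_branch_speculation_start();
    call_firmware_service();        /* hypothetical EFI/APM runtime call */
    firmware_restrict_branch_speculation_end();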
+
+DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp);
+DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
+DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
+
+DECLARE_STATIC_KEY_FALSE(switch_vcpu_ibpb);
+
+DECLARE_STATIC_KEY_FALSE(cpu_buf_idle_clear);
+
+DECLARE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush);
+
+extern u16 x86_verw_sel;
+
+#include <asm/segment.h>
+
+/**
+ * x86_clear_cpu_buffers - Buffer clearing support for different x86 CPU vulns
+ *
+ * This uses the otherwise unused and obsolete VERW instruction in
+ * combination with microcode which triggers a CPU buffer flush when the
+ * instruction is executed.
+ */
+static __always_inline void x86_clear_cpu_buffers(void)
+{
+ static const u16 ds = __KERNEL_DS;
+
+ /*
+ * Has to be the memory-operand variant because only that
+ * guarantees the CPU buffer flush functionality according to
+ * documentation. The register-operand variant does not.
+ * Works with any segment selector, but a valid writable
+ * data segment is the fastest variant.
+ *
+ * "cc" clobber is required because VERW modifies ZF.
+ */
+ asm volatile("verw %[ds]" : : [ds] "m" (ds) : "cc");
+}
+
+/**
+ * x86_idle_clear_cpu_buffers - Buffer clearing support in idle for the MDS
+ * and TSA vulnerabilities.
+ *
+ * Clear CPU buffers if the corresponding static key is enabled
+ */
+static __always_inline void x86_idle_clear_cpu_buffers(void)
+{
+ if (static_branch_likely(&cpu_buf_idle_clear))
+ x86_clear_cpu_buffers();
+}
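
As a hedged illustration of where the idle variant sits, the sketch below clears the buffers immediately before halting, mirroring the pattern the kernel uses around HLT/MWAIT in its idle routines. example_idle() is invented, and native_safe_halt() is just one way to enter a low-power state:

	static void example_idle(void)
	{
		/* VERW only runs if the cpu_buf_idle_clear static key is set */
		x86_idle_clear_cpu_buffers();
		native_safe_halt();		/* sti; hlt */
	}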
+
+#endif /* __ASSEMBLER__ */
+
+#endif /* _ASM_X86_NOSPEC_BRANCH_H_ */
diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index 27da400d3138..53ba39ce010c 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -1,5 +1,64 @@
-#ifdef CONFIG_X86_32
-# include "numa_32.h"
-#else
-# include "numa_64.h"
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_NUMA_H
+#define _ASM_X86_NUMA_H
+
+#include <linux/nodemask.h>
+#include <linux/errno.h>
+
+#include <asm/topology.h>
+#include <asm/apicdef.h>
+
+#ifdef CONFIG_NUMA
+
+extern int numa_off;
+
+/*
+ * __apicid_to_node[] stores the raw mapping between physical apicid and
+ * node and is used to initialize cpu_to_node mapping.
+ *
+ * The mapping may be overridden by apic->numa_cpu_node() on 32bit and thus
+ * should be accessed by the accessors - set_apicid_to_node() and
+ * numa_cpu_node().
+ */
+extern s16 __apicid_to_node[MAX_LOCAL_APIC];
+extern nodemask_t numa_nodes_parsed __initdata;
+
+static inline void set_apicid_to_node(int apicid, s16 node)
+{
+ __apicid_to_node[apicid] = node;
+}
+
+extern int numa_cpu_node(int cpu);
+
+#else /* CONFIG_NUMA */
+static inline void set_apicid_to_node(int apicid, s16 node)
+{
+}
+
+static inline int numa_cpu_node(int cpu)
+{
+ return NUMA_NO_NODE;
+}
+#endif /* CONFIG_NUMA */
+
+#ifdef CONFIG_NUMA
+extern void numa_set_node(int cpu, int node);
+extern void numa_clear_node(int cpu);
+extern void __init init_cpu_to_node(void);
+extern void numa_add_cpu(unsigned int cpu);
+extern void numa_remove_cpu(unsigned int cpu);
+extern void init_gi_nodes(void);
+#else /* CONFIG_NUMA */
+static inline void numa_set_node(int cpu, int node) { }
+static inline void numa_clear_node(int cpu) { }
+static inline void init_cpu_to_node(void) { }
+static inline void numa_add_cpu(unsigned int cpu) { }
+static inline void numa_remove_cpu(unsigned int cpu) { }
+static inline void init_gi_nodes(void) { }
+#endif /* CONFIG_NUMA */
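
To illustrate how the accessors above are meant to be used (rather than touching __apicid_to_node[] directly), here is a rough sketch of early boot code recording a firmware-reported mapping, assuming CONFIG_NUMA. The function name and the identity PXM-to-node mapping are simplifications for illustration:

	static void __init example_record_srat_entry(int apicid, int pxm)
	{
		int node = pxm;	/* real code maps the PXM through a translation table */

		set_apicid_to_node(apicid, node);
		node_set(node, numa_nodes_parsed);
	}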
+
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+void debug_cpumask_set_cpu(unsigned int cpu, int node, bool enable);
#endif
+
+#endif /* _ASM_X86_NUMA_H */
diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h
deleted file mode 100644
index a37229011b56..000000000000
--- a/arch/x86/include/asm/numa_32.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#ifndef _ASM_X86_NUMA_32_H
-#define _ASM_X86_NUMA_32_H
-
-extern int pxm_to_nid(int pxm);
-extern void numa_remove_cpu(int cpu);
-
-#ifdef CONFIG_HIGHMEM
-extern void set_highmem_pages_init(void);
-#else
-static inline void set_highmem_pages_init(void)
-{
-}
-#endif
-
-#endif /* _ASM_X86_NUMA_32_H */
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
deleted file mode 100644
index c4ae822e415f..000000000000
--- a/arch/x86/include/asm/numa_64.h
+++ /dev/null
@@ -1,47 +0,0 @@
-#ifndef _ASM_X86_NUMA_64_H
-#define _ASM_X86_NUMA_64_H
-
-#include <linux/nodemask.h>
-#include <asm/apicdef.h>
-
-struct bootnode {
- u64 start;
- u64 end;
-};
-
-extern int compute_hash_shift(struct bootnode *nodes, int numblks,
- int *nodeids);
-
-#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
-
-extern void numa_init_array(void);
-extern int numa_off;
-
-extern s16 apicid_to_node[MAX_LOCAL_APIC];
-
-extern unsigned long numa_free_all_bootmem(void);
-extern void setup_node_bootmem(int nodeid, unsigned long start,
- unsigned long end);
-
-#ifdef CONFIG_NUMA
-/*
- * Too small node sizes may confuse the VM badly. Usually they
- * result from BIOS bugs. So dont recognize nodes as standalone
- * NUMA entities that have less than this amount of RAM listed:
- */
-#define NODE_MIN_SIZE (4*1024*1024)
-
-extern void __init init_cpu_to_node(void);
-extern void __cpuinit numa_set_node(int cpu, int node);
-extern void __cpuinit numa_clear_node(int cpu);
-extern void __cpuinit numa_add_cpu(int cpu);
-extern void __cpuinit numa_remove_cpu(int cpu);
-#else
-static inline void init_cpu_to_node(void) { }
-static inline void numa_set_node(int cpu, int node) { }
-static inline void numa_clear_node(int cpu) { }
-static inline void numa_add_cpu(int cpu, int node) { }
-static inline void numa_remove_cpu(int cpu) { }
-#endif
-
-#endif /* _ASM_X86_NUMA_64_H */
diff --git a/arch/x86/include/asm/numachip/numachip.h b/arch/x86/include/asm/numachip/numachip.h
new file mode 100644
index 000000000000..c64373a2d731
--- /dev/null
+++ b/arch/x86/include/asm/numachip/numachip.h
@@ -0,0 +1,20 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Numascale NumaConnect-specific header file
+ *
+ * Copyright (C) 2012 Numascale AS. All rights reserved.
+ *
+ * Send feedback to <support@numascale.com>
+ *
+ */
+
+#ifndef _ASM_X86_NUMACHIP_NUMACHIP_H
+#define _ASM_X86_NUMACHIP_NUMACHIP_H
+
+extern u8 numachip_system;
+extern int __init pci_numachip_init(void);
+
+#endif /* _ASM_X86_NUMACHIP_NUMACHIP_H */
diff --git a/arch/x86/include/asm/numachip/numachip_csr.h b/arch/x86/include/asm/numachip/numachip_csr.h
new file mode 100644
index 000000000000..29719eecdc2e
--- /dev/null
+++ b/arch/x86/include/asm/numachip/numachip_csr.h
@@ -0,0 +1,98 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Numascale NumaConnect-Specific Header file
+ *
+ * Copyright (C) 2011 Numascale AS. All rights reserved.
+ *
+ * Send feedback to <support@numascale.com>
+ *
+ */
+
+#ifndef _ASM_X86_NUMACHIP_NUMACHIP_CSR_H
+#define _ASM_X86_NUMACHIP_NUMACHIP_CSR_H
+
+#include <linux/smp.h>
+#include <linux/io.h>
+
+#define CSR_NODE_SHIFT 16
+#define CSR_NODE_BITS(p) (((unsigned long)(p)) << CSR_NODE_SHIFT)
+#define CSR_NODE_MASK 0x0fff /* 4K nodes */
+
+/* 32K CSR space, b15 indicates geo/non-geo */
+#define CSR_OFFSET_MASK 0x7fffUL
+#define CSR_G0_NODE_IDS (0x008 + (0 << 12))
+#define CSR_G3_EXT_IRQ_GEN (0x030 + (3 << 12))
+
+/*
+ * Local CSR space starts in global CSR space with "nodeid" = 0xfff0. However,
+ * when using the direct mapping on x86_64, both start and size need to be
+ * aligned with PMD_SIZE, which is 2M.
+ */
+#define NUMACHIP_LCSR_BASE 0x3ffffe000000ULL
+#define NUMACHIP_LCSR_LIM 0x3fffffffffffULL
+#define NUMACHIP_LCSR_SIZE (NUMACHIP_LCSR_LIM - NUMACHIP_LCSR_BASE + 1)
+#define NUMACHIP_LAPIC_BITS 8
+
+static inline void *lcsr_address(unsigned long offset)
+{
+ return __va(NUMACHIP_LCSR_BASE | (1UL << 15) |
+ CSR_NODE_BITS(0xfff0) | (offset & CSR_OFFSET_MASK));
+}
+
+static inline unsigned int read_lcsr(unsigned long offset)
+{
+ return swab32(readl(lcsr_address(offset)));
+}
+
+static inline void write_lcsr(unsigned long offset, unsigned int val)
+{
+ writel(swab32(val), lcsr_address(offset));
+}
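
A small, hedged example of the local CSR accessors: reading the G0 node-ID register through the 0xfff0 local mapping. example_read_node_ids() is invented for illustration; note that read_lcsr() already byte-swaps the big-endian CSR value, so the caller sees it in native order:

	static unsigned int example_read_node_ids(void)
	{
		return read_lcsr(CSR_G0_NODE_IDS);
	}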
+
+/*
+ * On NumaChip2, local CSR space is 16MB and starts at fixed offset below 4G
+ */
+
+#define NUMACHIP2_LCSR_BASE 0xf0000000UL
+#define NUMACHIP2_LCSR_SIZE 0x1000000UL
+#define NUMACHIP2_APIC_ICR 0x100000
+#define NUMACHIP2_TIMER_DEADLINE 0x200000
+#define NUMACHIP2_TIMER_INT 0x200008
+#define NUMACHIP2_TIMER_NOW 0x200018
+#define NUMACHIP2_TIMER_RESET 0x200020
+
+static inline void __iomem *numachip2_lcsr_address(unsigned long offset)
+{
+ return (void __iomem *)__va(NUMACHIP2_LCSR_BASE |
+ (offset & (NUMACHIP2_LCSR_SIZE - 1)));
+}
+
+static inline u32 numachip2_read32_lcsr(unsigned long offset)
+{
+ return readl(numachip2_lcsr_address(offset));
+}
+
+static inline u64 numachip2_read64_lcsr(unsigned long offset)
+{
+ return readq(numachip2_lcsr_address(offset));
+}
+
+static inline void numachip2_write32_lcsr(unsigned long offset, u32 val)
+{
+ writel(val, numachip2_lcsr_address(offset));
+}
+
+static inline void numachip2_write64_lcsr(unsigned long offset, u64 val)
+{
+ writeq(val, numachip2_lcsr_address(offset));
+}
+
+static inline unsigned int numachip2_timer(void)
+{
+ return (smp_processor_id() % 48) << 6;
+}
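
numachip2_timer() yields the byte offset of the calling CPU's private timer register block (the "% 48" and "<< 6" suggest a 64-byte window per local CPU). A hedged sketch of arming the per-CPU deadline timer follows; the deadline units are whatever the hardware expects and are not specified by this header:

	static void example_arm_numachip2_timer(u64 deadline)
	{
		unsigned long base = numachip2_timer();	/* per-CPU register window */

		numachip2_write64_lcsr(base + NUMACHIP2_TIMER_DEADLINE, deadline);
	}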
+
+#endif /* _ASM_X86_NUMACHIP_NUMACHIP_CSR_H */
diff --git a/arch/x86/include/asm/numaq.h b/arch/x86/include/asm/numaq.h
deleted file mode 100644
index 9f0a5f5d29ec..000000000000
--- a/arch/x86/include/asm/numaq.h
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
- * Written by: Patricia Gaughen, IBM Corporation
- *
- * Copyright (C) 2002, IBM Corp.
- *
- * All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to <gone@us.ibm.com>
- */
-
-#ifndef _ASM_X86_NUMAQ_H
-#define _ASM_X86_NUMAQ_H
-
-#ifdef CONFIG_X86_NUMAQ
-
-extern int found_numaq;
-extern int get_memcfg_numaq(void);
-
-extern void *xquad_portio;
-
-/*
- * SYS_CFG_DATA_PRIV_ADDR, struct eachquadmem, and struct sys_cfg_data are the
- */
-#define SYS_CFG_DATA_PRIV_ADDR 0x0009d000 /* place for scd in private
- quad space */
-
-/*
- * Communication area for each processor on lynxer-processor tests.
- *
- * NOTE: If you change the size of this eachproc structure you need
- * to change the definition for EACH_QUAD_SIZE.
- */
-struct eachquadmem {
- unsigned int priv_mem_start; /* Starting address of this */
- /* quad's private memory. */
- /* This is always 0. */
- /* In MB. */
- unsigned int priv_mem_size; /* Size of this quad's */
- /* private memory. */
- /* In MB. */
- unsigned int low_shrd_mem_strp_start;/* Starting address of this */
- /* quad's low shared block */
- /* (untranslated). */
- /* In MB. */
- unsigned int low_shrd_mem_start; /* Starting address of this */
- /* quad's low shared memory */
- /* (untranslated). */
- /* In MB. */
- unsigned int low_shrd_mem_size; /* Size of this quad's low */
- /* shared memory. */
- /* In MB. */
- unsigned int lmmio_copb_start; /* Starting address of this */
- /* quad's local memory */
- /* mapped I/O in the */
- /* compatibility OPB. */
- /* In MB. */
- unsigned int lmmio_copb_size; /* Size of this quad's local */
- /* memory mapped I/O in the */
- /* compatibility OPB. */
- /* In MB. */
- unsigned int lmmio_nopb_start; /* Starting address of this */
- /* quad's local memory */
- /* mapped I/O in the */
- /* non-compatibility OPB. */
- /* In MB. */
- unsigned int lmmio_nopb_size; /* Size of this quad's local */
- /* memory mapped I/O in the */
- /* non-compatibility OPB. */
- /* In MB. */
- unsigned int io_apic_0_start; /* Starting address of I/O */
- /* APIC 0. */
- unsigned int io_apic_0_sz; /* Size I/O APIC 0. */
- unsigned int io_apic_1_start; /* Starting address of I/O */
- /* APIC 1. */
- unsigned int io_apic_1_sz; /* Size I/O APIC 1. */
- unsigned int hi_shrd_mem_start; /* Starting address of this */
- /* quad's high shared memory.*/
- /* In MB. */
- unsigned int hi_shrd_mem_size; /* Size of this quad's high */
- /* shared memory. */
- /* In MB. */
- unsigned int mps_table_addr; /* Address of this quad's */
- /* MPS tables from BIOS, */
- /* in system space.*/
- unsigned int lcl_MDC_pio_addr; /* Port-I/O address for */
- /* local access of MDC. */
- unsigned int rmt_MDC_mmpio_addr; /* MM-Port-I/O address for */
- /* remote access of MDC. */
- unsigned int mm_port_io_start; /* Starting address of this */
- /* quad's memory mapped Port */
- /* I/O space. */
- unsigned int mm_port_io_size; /* Size of this quad's memory*/
- /* mapped Port I/O space. */
- unsigned int mm_rmt_io_apic_start; /* Starting address of this */
- /* quad's memory mapped */
- /* remote I/O APIC space. */
- unsigned int mm_rmt_io_apic_size; /* Size of this quad's memory*/
- /* mapped remote I/O APIC */
- /* space. */
- unsigned int mm_isa_start; /* Starting address of this */
- /* quad's memory mapped ISA */
- /* space (contains MDC */
- /* memory space). */
- unsigned int mm_isa_size; /* Size of this quad's memory*/
- /* mapped ISA space (contains*/
- /* MDC memory space). */
- unsigned int rmt_qmi_addr; /* Remote addr to access QMI.*/
- unsigned int lcl_qmi_addr; /* Local addr to access QMI. */
-};
-
-/*
- * Note: This structure must be NOT be changed unless the multiproc and
- * OS are changed to reflect the new structure.
- */
-struct sys_cfg_data {
- unsigned int quad_id;
- unsigned int bsp_proc_id; /* Boot Strap Processor in this quad. */
- unsigned int scd_version; /* Version number of this table. */
- unsigned int first_quad_id;
- unsigned int quads_present31_0; /* 1 bit for each quad */
- unsigned int quads_present63_32; /* 1 bit for each quad */
- unsigned int config_flags;
- unsigned int boot_flags;
- unsigned int csr_start_addr; /* Absolute value (not in MB) */
- unsigned int csr_size; /* Absolute value (not in MB) */
- unsigned int lcl_apic_start_addr; /* Absolute value (not in MB) */
- unsigned int lcl_apic_size; /* Absolute value (not in MB) */
- unsigned int low_shrd_mem_base; /* 0 or 512MB or 1GB */
- unsigned int low_shrd_mem_quad_offset; /* 0,128M,256M,512M,1G */
- /* may not be totally populated */
- unsigned int split_mem_enbl; /* 0 for no low shared memory */
- unsigned int mmio_sz; /* Size of total system memory mapped I/O */
- /* (in MB). */
- unsigned int quad_spin_lock; /* Spare location used for quad */
- /* bringup. */
- unsigned int nonzero55; /* For checksumming. */
- unsigned int nonzeroaa; /* For checksumming. */
- unsigned int scd_magic_number;
- unsigned int system_type;
- unsigned int checksum;
- /*
- * memory configuration area for each quad
- */
- struct eachquadmem eq[MAX_NUMNODES]; /* indexed by quad id */
-};
-
-void numaq_tsc_disable(void);
-
-#else
-static inline int get_memcfg_numaq(void)
-{
- return 0;
-}
-#endif /* CONFIG_X86_NUMAQ */
-#endif /* _ASM_X86_NUMAQ_H */
-
diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h
index 834a30295fab..6fe76282aceb 100644
--- a/arch/x86/include/asm/olpc.h
+++ b/arch/x86/include/asm/olpc.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/* OLPC machine specific definitions */
#ifndef _ASM_X86_OLPC_H
@@ -8,12 +9,10 @@
struct olpc_platform_t {
int flags;
uint32_t boardrev;
- int ecver;
};
#define OLPC_F_PRESENT 0x01
#define OLPC_F_DCON 0x02
-#define OLPC_F_VSA 0x04
#ifdef CONFIG_OLPC
@@ -21,7 +20,7 @@ extern struct olpc_platform_t olpc_platform_info;
/*
* OLPC board IDs contain the major build number within the mask 0x0ff0,
- * and the minor build number withing 0x000f. Pre-builds have a minor
+ * and the minor build number within 0x000f. Pre-builds have a minor
* number less than 8, and normal builds start at 8. For example, 0x0B10
* is a PreB1, and 0x0C18 is a C1.
*/
@@ -51,18 +50,6 @@ static inline int olpc_has_dcon(void)
}
/*
- * The VSA is software from AMD that typical Geode bioses will include.
- * It is used to emulate the PCI bus, VGA, etc. OLPC's Open Firmware does
- * not include the VSA; instead, PCI is emulated by the kernel.
- *
- * The VSA is described further in arch/x86/pci/olpc.c.
- */
-static inline int olpc_has_vsa(void)
-{
- return (olpc_platform_info.flags & OLPC_F_VSA) ? 1 : 0;
-}
-
-/*
* The "Mass Production" version of OLPC's XO is identified as being model
* C2. During the prototype phase, the following models (in chronological
* order) were created: A1, B1, B2, B3, B4, C1. The A1 through B2 models
@@ -87,46 +74,29 @@ static inline int olpc_has_dcon(void)
return 0;
}
-static inline int olpc_has_vsa(void)
-{
- return 0;
-}
-
#endif
-/* EC related functions */
-
-extern int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen,
- unsigned char *outbuf, size_t outlen);
-
-extern int olpc_ec_mask_set(uint8_t bits);
-extern int olpc_ec_mask_unset(uint8_t bits);
-
-/* EC commands */
-
-#define EC_FIRMWARE_REV 0x08
-
-/* SCI source values */
+#ifdef CONFIG_OLPC_XO1_PM
+extern void do_olpc_suspend_lowlevel(void);
+extern void olpc_xo1_pm_wakeup_set(u16 value);
+extern void olpc_xo1_pm_wakeup_clear(u16 value);
+#endif
-#define EC_SCI_SRC_EMPTY 0x00
-#define EC_SCI_SRC_GAME 0x01
-#define EC_SCI_SRC_BATTERY 0x02
-#define EC_SCI_SRC_BATSOC 0x04
-#define EC_SCI_SRC_BATERR 0x08
-#define EC_SCI_SRC_EBOOK 0x10
-#define EC_SCI_SRC_WLAN 0x20
-#define EC_SCI_SRC_ACPWR 0x40
-#define EC_SCI_SRC_ALL 0x7F
+extern int pci_olpc_init(void);
/* GPIO assignments */
-#define OLPC_GPIO_MIC_AC geode_gpio(1)
-#define OLPC_GPIO_DCON_IRQ geode_gpio(7)
+#define OLPC_GPIO_MIC_AC 1
+#define OLPC_GPIO_DCON_STAT0 5
+#define OLPC_GPIO_DCON_STAT1 6
+#define OLPC_GPIO_DCON_IRQ 7
#define OLPC_GPIO_THRM_ALRM geode_gpio(10)
-#define OLPC_GPIO_SMB_CLK geode_gpio(14)
-#define OLPC_GPIO_SMB_DATA geode_gpio(15)
+#define OLPC_GPIO_DCON_LOAD 11
+#define OLPC_GPIO_DCON_BLANK 12
+#define OLPC_GPIO_SMB_CLK 14
+#define OLPC_GPIO_SMB_DATA 15
#define OLPC_GPIO_WORKAUX geode_gpio(24)
-#define OLPC_GPIO_LID geode_gpio(26)
-#define OLPC_GPIO_ECSCI geode_gpio(27)
+#define OLPC_GPIO_LID 26
+#define OLPC_GPIO_ECSCI 27
#endif /* _ASM_X86_OLPC_H */
diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h
new file mode 100644
index 000000000000..8c2a1daf7f72
--- /dev/null
+++ b/arch/x86/include/asm/olpc_ofw.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_OLPC_OFW_H
+#define _ASM_X86_OLPC_OFW_H
+
+/* index into the page table containing the entry OFW occupies */
+#define OLPC_OFW_PDE_NR 1022
+
+#define OLPC_OFW_SIG 0x2057464F /* aka "OFW " */
+
+#ifdef CONFIG_OLPC
+
+extern bool olpc_ofw_is_installed(void);
+
+/* run an OFW command by calling into the firmware */
+#define olpc_ofw(name, args, res) \
+ __olpc_ofw((name), ARRAY_SIZE(args), args, ARRAY_SIZE(res), res)
+
+extern int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res,
+ void **res);
+
+/* determine whether OFW is available and lives in the proper memory */
+extern void olpc_ofw_detect(void);
+
+/* install OFW's pde permanently into the kernel's pgtable */
+extern void setup_olpc_ofw_pgd(void);
+
+/* check if OFW was detected during boot */
+extern bool olpc_ofw_present(void);
+
+extern void olpc_dt_build_devicetree(void);
+
+#else /* !CONFIG_OLPC */
+static inline void olpc_ofw_detect(void) { }
+static inline void setup_olpc_ofw_pgd(void) { }
+static inline void olpc_dt_build_devicetree(void) { }
+#endif /* !CONFIG_OLPC */
+
+#endif /* _ASM_X86_OLPC_OFW_H */
diff --git a/arch/x86/include/asm/orc_header.h b/arch/x86/include/asm/orc_header.h
new file mode 100644
index 000000000000..07bacf3e160e
--- /dev/null
+++ b/arch/x86/include/asm/orc_header.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Copyright (c) Meta Platforms, Inc. and affiliates. */
+
+#ifndef _ORC_HEADER_H
+#define _ORC_HEADER_H
+
+#include <linux/types.h>
+#include <linux/compiler.h>
+#include <asm/orc_hash.h>
+
+/*
+ * The header is currently a 20-byte hash of the ORC entry definition; see
+ * scripts/orc_hash.sh.
+ */
+#define ORC_HEADER \
+ __used __section(".orc_header") __aligned(4) \
+ static const u8 orc_header[] = { ORC_HASH }
+
+#endif /* _ORC_HEADER_H */
diff --git a/arch/x86/include/asm/orc_lookup.h b/arch/x86/include/asm/orc_lookup.h
new file mode 100644
index 000000000000..241631282e43
--- /dev/null
+++ b/arch/x86/include/asm/orc_lookup.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com>
+ */
+#ifndef _ORC_LOOKUP_H
+#define _ORC_LOOKUP_H
+
+/*
+ * This is a lookup table for speeding up access to the .orc_unwind table.
+ * Given an input address offset, the corresponding lookup table entry
+ * specifies a subset of the .orc_unwind table to search.
+ *
+ * Each block represents the end of the previous range and the start of the
+ * next range. An extra block is added to give the last range an end.
+ *
+ * The block size should be a power of 2 to avoid a costly 'div' instruction.
+ *
+ * A block size of 256 was chosen because it roughly doubles unwinder
+ * performance while only adding ~5% to the ORC data footprint.
+ */
+#define LOOKUP_BLOCK_ORDER 8
+#define LOOKUP_BLOCK_SIZE (1 << LOOKUP_BLOCK_ORDER)
+
+#ifndef LINKER_SCRIPT
+
+extern unsigned int orc_lookup[];
+extern unsigned int orc_lookup_end[];
+
+#define LOOKUP_START_IP (unsigned long)_stext
+#define LOOKUP_STOP_IP (unsigned long)_etext
+
+#endif /* LINKER_SCRIPT */
+
+#endif /* _ORC_LOOKUP_H */
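
To make the block arithmetic concrete, here is a hedged sketch of how an unwinder narrows its search with this table; the exact end-bound handling in the real unwinder is simplified, and example_orc_lookup_bounds() is not a real kernel function:

	static bool example_orc_lookup_bounds(unsigned long ip,
					      unsigned int *start, unsigned int *stop)
	{
		unsigned int idx;

		if (ip < LOOKUP_START_IP || ip >= LOOKUP_STOP_IP)
			return false;		/* outside kernel text */

		idx = (ip - LOOKUP_START_IP) / LOOKUP_BLOCK_SIZE;
		*start = orc_lookup[idx];	/* first candidate .orc_unwind entry */
		*stop  = orc_lookup[idx + 1];	/* end of candidate range (simplified) */
		return true;
	}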
diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h
new file mode 100644
index 000000000000..e0125afa53fb
--- /dev/null
+++ b/arch/x86/include/asm/orc_types.h
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com>
+ */
+
+#ifndef _ORC_TYPES_H
+#define _ORC_TYPES_H
+
+#include <linux/types.h>
+#include <linux/compiler.h>
+
+/*
+ * The ORC_REG_* registers are base registers which are used to find other
+ * registers on the stack.
+ *
+ * ORC_REG_PREV_SP, also known as DWARF Call Frame Address (CFA), is the
+ * address of the previous frame: the caller's SP before it called the current
+ * function.
+ *
+ * ORC_REG_UNDEFINED means the corresponding register's value didn't change in
+ * the current frame.
+ *
+ * The most commonly used base registers are SP and BP -- which the previous SP
+ * is usually based on -- and PREV_SP and UNDEFINED -- which the previous BP is
+ * usually based on.
+ *
+ * The rest of the base registers are needed for special cases like entry code
+ * and GCC realigned stacks.
+ */
+#define ORC_REG_UNDEFINED 0
+#define ORC_REG_PREV_SP 1
+#define ORC_REG_DX 2
+#define ORC_REG_DI 3
+#define ORC_REG_BP 4
+#define ORC_REG_SP 5
+#define ORC_REG_R10 6
+#define ORC_REG_R13 7
+#define ORC_REG_BP_INDIRECT 8
+#define ORC_REG_SP_INDIRECT 9
+#define ORC_REG_MAX 15
+
+#define ORC_TYPE_UNDEFINED 0
+#define ORC_TYPE_END_OF_STACK 1
+#define ORC_TYPE_CALL 2
+#define ORC_TYPE_REGS 3
+#define ORC_TYPE_REGS_PARTIAL 4
+
+#ifndef __ASSEMBLER__
+#include <asm/byteorder.h>
+
+/*
+ * This struct is more or less a vastly simplified version of the DWARF Call
+ * Frame Information standard. It contains only the necessary parts of DWARF
+ * CFI, simplified for ease of access by the in-kernel unwinder. It tells the
+ * unwinder how to find the previous SP and BP (and sometimes entry regs) on
+ * the stack for a given code address. Each instance of the struct corresponds
+ * to one or more code locations.
+ */
+struct orc_entry {
+ s16 sp_offset;
+ s16 bp_offset;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ unsigned sp_reg:4;
+ unsigned bp_reg:4;
+ unsigned type:3;
+ unsigned signal:1;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+ unsigned bp_reg:4;
+ unsigned sp_reg:4;
+ unsigned unused:4;
+ unsigned signal:1;
+ unsigned type:3;
+#endif
+} __packed;
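
As a hedged illustration of how sp_reg and sp_offset combine, the sketch below recovers the previous stack pointer for the two most common base registers; handling of the other ORC_REG_* bases, the indirect variants, and the REGS types is intentionally omitted:

	static unsigned long example_prev_sp(const struct orc_entry *orc,
					     unsigned long sp, unsigned long bp)
	{
		switch (orc->sp_reg) {
		case ORC_REG_SP:
			return sp + orc->sp_offset;
		case ORC_REG_BP:
			return bp + orc->sp_offset;
		default:
			return 0;	/* not handled in this sketch */
		}
	}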
+
+#endif /* __ASSEMBLER__ */
+
+#endif /* _ORC_TYPES_H */
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index 625c3f0e741a..9265f2fca99a 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PAGE_H
#define _ASM_X86_PAGE_H
@@ -13,10 +14,14 @@
#include <asm/page_32.h>
#endif /* CONFIG_X86_64 */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
struct page;
+#include <linux/range.h>
+extern struct range pfn_mapped[];
+extern int nr_pfn_mapped;
+
static inline void clear_user_page(void *page, unsigned long vaddr,
struct page *pg)
{
@@ -29,17 +34,29 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
copy_page(to, from);
}
-#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
- alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
-#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+#define vma_alloc_zeroed_movable_folio(vma, vaddr) \
+ vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr)
+#ifndef __pa
#define __pa(x) __phys_addr((unsigned long)(x))
+#endif
+
#define __pa_nodebug(x) __phys_addr_nodebug((unsigned long)(x))
/* __pa_symbol should be used for C visible symbols.
This seems to be the official gcc blessed way to do such arithmetic. */
-#define __pa_symbol(x) __pa(__phys_reloc_hide((unsigned long)(x)))
+/*
+ * We need __phys_reloc_hide() here because gcc may assume that there is no
+ * overflow during __pa() calculation and can optimize it unexpectedly.
+ * Newer versions of gcc provide -fno-strict-overflow switch to handle this
+ * case properly. Once all supported versions of gcc understand it, we can
+ * remove this Voodoo magic stuff. (i.e. once gcc3.x is deprecated)
+ */
+#define __pa_symbol(x) \
+ __phys_addr_symbol(__phys_reloc_hide((unsigned long)(x)))
+#ifndef __va
#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
+#endif
#define __boot_va(x) __va(x)
#define __boot_pa(x) __pa(x)
@@ -49,16 +66,30 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
* virt_addr_valid(kaddr) returns true.
*/
#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
-#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
extern bool __virt_addr_valid(unsigned long kaddr);
#define virt_addr_valid(kaddr) __virt_addr_valid((unsigned long) (kaddr))
-#endif /* __ASSEMBLY__ */
+static __always_inline void *pfn_to_kaddr(unsigned long pfn)
+{
+ return __va(pfn << PAGE_SHIFT);
+}
+
+static __always_inline u64 __canonical_address(u64 vaddr, u8 vaddr_bits)
+{
+ return ((s64)vaddr << (64 - vaddr_bits)) >> (64 - vaddr_bits);
+}
+
+static __always_inline u64 __is_canonical_address(u64 vaddr, u8 vaddr_bits)
+{
+ return __canonical_address(vaddr, vaddr_bits) == vaddr;
+}
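
A worked example (not part of the patch): with 48 valid virtual-address bits, __canonical_address() sign-extends bit 47 into bits 63:48, so 0x00007fffffffffff and 0xffff800000000000 are canonical while 0x0000800000000000 is not. The helper below merely fixes vaddr_bits for illustration:

	static bool example_canonical_48(u64 vaddr)
	{
		return __is_canonical_address(vaddr, 48);
	}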
+
+#endif /* __ASSEMBLER__ */
#include <asm-generic/memory_model.h>
#include <asm-generic/getorder.h>
-#define __HAVE_ARCH_GATE_AREA 1
+#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
#endif /* __KERNEL__ */
#endif /* _ASM_X86_PAGE_H */
diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h
index da4e762406f7..0c623706cb7e 100644
--- a/arch/x86/include/asm/page_32.h
+++ b/arch/x86/include/asm/page_32.h
@@ -1,13 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PAGE_32_H
#define _ASM_X86_PAGE_32_H
#include <asm/page_32_types.h>
-#ifndef __ASSEMBLY__
-
-#ifdef CONFIG_HUGETLB_PAGE
-#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
-#endif
+#ifndef __ASSEMBLER__
#define __phys_addr_nodebug(x) ((x) - PAGE_OFFSET)
#ifdef CONFIG_DEBUG_VIRTUAL
@@ -15,25 +12,9 @@ extern unsigned long __phys_addr(unsigned long);
#else
#define __phys_addr(x) __phys_addr_nodebug(x)
#endif
+#define __phys_addr_symbol(x) __phys_addr(x)
#define __phys_reloc_hide(x) RELOC_HIDE((x), 0)
-#ifdef CONFIG_FLATMEM
-#define pfn_valid(pfn) ((pfn) < max_mapnr)
-#endif /* CONFIG_FLATMEM */
-
-#ifdef CONFIG_X86_USE_3DNOW
-#include <asm/mmx.h>
-
-static inline void clear_page(void *page)
-{
- mmx_clear_page(page);
-}
-
-static inline void copy_page(void *to, void *from)
-{
- mmx_copy_page(to, from);
-}
-#else /* !CONFIG_X86_USE_3DNOW */
#include <linux/string.h>
static inline void clear_page(void *page)
@@ -45,7 +26,6 @@ static inline void copy_page(void *to, void *from)
{
memcpy(to, from, PAGE_SIZE);
}
-#endif /* CONFIG_X86_3DNOW */
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* _ASM_X86_PAGE_32_H */
diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h
index 6f1b7331313f..623f1e9f493e 100644
--- a/arch/x86/include/asm/page_32_types.h
+++ b/arch/x86/include/asm/page_32_types.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PAGE_32_DEFS_H
#define _ASM_X86_PAGE_32_DEFS_H
@@ -10,28 +11,29 @@
* a virtual address space of one gigabyte, which limits the
* amount of physical memory you can use to about 950MB.
*
- * If you want more physical memory than this then see the CONFIG_HIGHMEM4G
- * and CONFIG_HIGHMEM64G options in the kernel configuration.
+ * If you want more physical memory than this then see the CONFIG_VMSPLIT_2G
+ * and CONFIG_HIGHMEM4G options in the kernel configuration.
*/
-#define __PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL)
-
-#ifdef CONFIG_4KSTACKS
-#define THREAD_ORDER 0
-#else
-#define THREAD_ORDER 1
-#endif
-#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
-
-#define STACKFAULT_STACK 0
-#define DOUBLEFAULT_STACK 1
-#define NMI_STACK 0
-#define DEBUG_STACK 0
-#define MCE_STACK 0
-#define N_EXCEPTION_STACKS 1
+#define __PAGE_OFFSET_BASE _AC(CONFIG_PAGE_OFFSET, UL)
+#define __PAGE_OFFSET __PAGE_OFFSET_BASE
+
+#define __START_KERNEL_map __PAGE_OFFSET
+
+#define THREAD_SIZE_ORDER 1
+#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER)
+
+#define IRQ_STACK_SIZE THREAD_SIZE
+
+#define N_EXCEPTION_STACKS 1
#ifdef CONFIG_X86_PAE
-/* 44=32+12, the limit we can fit into an unsigned long pfn */
-#define __PHYSICAL_MASK_SHIFT 44
+/*
+ * This is beyond the 44 bit limit imposed by the 32bit long pfns,
+ * but we need the full mask to make sure inverted PROT_NONE
+ * entries have all the host bits set in a guest.
+ * The real limit is still 44 bits.
+ */
+#define __PHYSICAL_MASK_SHIFT 52
#define __VIRTUAL_MASK_SHIFT 32
#else /* !CONFIG_X86_PAE */
@@ -40,11 +42,28 @@
#endif /* CONFIG_X86_PAE */
/*
- * Kernel image size is limited to 512 MB (see in arch/x86/kernel/head_32.S)
+ * User space process size: 3GB (default).
+ */
+#define IA32_PAGE_OFFSET __PAGE_OFFSET
+#define TASK_SIZE __PAGE_OFFSET
+#define TASK_SIZE_LOW TASK_SIZE
+#define TASK_SIZE_MAX TASK_SIZE
+#define DEFAULT_MAP_WINDOW TASK_SIZE
+#define STACK_TOP TASK_SIZE
+#define STACK_TOP_MAX STACK_TOP
+
+/*
+ * In spite of the name, KERNEL_IMAGE_SIZE is a limit on the maximum virtual
+ * address for the kernel image, rather than the limit on the size itself. On
+ * 32-bit, this is not a strict limit, but this value is used to limit the
+ * link-time virtual address range of the kernel, and by KASLR to limit the
+ * randomized address from which the kernel is executed. A relocatable kernel
+ * can be loaded somewhat higher than KERNEL_IMAGE_SIZE as long as enough space
+ * remains for the vmalloc area.
*/
#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/*
* This much address space is reserved for vmalloc() and iomap()
@@ -54,8 +73,7 @@ extern unsigned int __VMALLOC_RESERVE;
extern int sysctl_legacy_va_layout;
extern void find_low_pfn_range(void);
-extern void setup_bootmem_allocator(void);
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* _ASM_X86_PAGE_32_DEFS_H */
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 072694ed81a5..2f0e47be79a4 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -1,6 +1,113 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PAGE_64_H
#define _ASM_X86_PAGE_64_H
#include <asm/page_64_types.h>
+#ifndef __ASSEMBLER__
+#include <asm/cpufeatures.h>
+#include <asm/alternative.h>
+
+#include <linux/kmsan-checks.h>
+#include <linux/mmdebug.h>
+
+/* duplicated to the one in bootmem.h */
+extern unsigned long max_pfn;
+extern unsigned long phys_base;
+
+extern unsigned long page_offset_base;
+extern unsigned long vmalloc_base;
+extern unsigned long vmemmap_base;
+extern unsigned long direct_map_physmem_end;
+
+static __always_inline unsigned long __phys_addr_nodebug(unsigned long x)
+{
+ unsigned long y = x - __START_KERNEL_map;
+
+ /* use the carry flag to determine if x was < __START_KERNEL_map */
+ x = y + ((x > y) ? phys_base : (__START_KERNEL_map - PAGE_OFFSET));
+
+ return x;
+}
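
For readers puzzling over the carry-flag trick, an equivalent (but branchy) reference form is sketched below; it is not part of the patch and assumes x is a valid direct-map or kernel-text virtual address:

	static inline unsigned long example_phys_addr_reference(unsigned long x)
	{
		if (x >= __START_KERNEL_map)		/* kernel text mapping */
			return x - __START_KERNEL_map + phys_base;
		return x - PAGE_OFFSET;			/* direct mapping */
	}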
+
+#ifdef CONFIG_DEBUG_VIRTUAL
+extern unsigned long __phys_addr(unsigned long);
+#else
+#define __phys_addr(x) __phys_addr_nodebug(x)
+#endif
+
+static inline unsigned long __phys_addr_symbol(unsigned long x)
+{
+ unsigned long y = x - __START_KERNEL_map;
+
+ /* only check upper bounds since lower bounds will trigger carry */
+ VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE);
+
+ return y + phys_base;
+}
+
+#define __phys_reloc_hide(x) (x)
+
+void clear_page_orig(void *page);
+void clear_page_rep(void *page);
+void clear_page_erms(void *page);
+KCFI_REFERENCE(clear_page_orig);
+KCFI_REFERENCE(clear_page_rep);
+KCFI_REFERENCE(clear_page_erms);
+
+static inline void clear_page(void *page)
+{
+ /*
+ * Clean up KMSAN metadata for the page being cleared. The assembly call
+ * below clobbers @page, so we perform unpoisoning before it.
+ */
+ kmsan_unpoison_memory(page, PAGE_SIZE);
+ alternative_call_2(clear_page_orig,
+ clear_page_rep, X86_FEATURE_REP_GOOD,
+ clear_page_erms, X86_FEATURE_ERMS,
+ "=D" (page),
+ "D" (page),
+ "cc", "memory", "rax", "rcx");
+}
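
A small usage sketch, assuming a regular kernel context (needs <linux/gfp.h> and <linux/mm.h>): zero a freshly allocated page through clear_page(), which the alternatives above patch to the REP_GOOD or ERMS variant at boot. example_get_zeroed_page() is invented for illustration:

	static void *example_get_zeroed_page(void)
	{
		struct page *page = alloc_page(GFP_KERNEL);

		if (!page)
			return NULL;

		clear_page(page_address(page));
		return page_address(page);
	}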
+
+void copy_page(void *to, void *from);
+KCFI_REFERENCE(copy_page);
+
+/*
+ * User space process size. This is the first address outside the user range.
+ * There are a few constraints that determine this:
+ *
+ * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
+ * address, then that syscall will enter the kernel with a
+ * non-canonical return address, and SYSRET will explode dangerously.
+ * We avoid this particular problem by preventing anything
+ * from being mapped at the maximum canonical address.
+ *
+ * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
+ * CPUs malfunction if they execute code from the highest canonical page.
+ * They'll speculate right off the end of the canonical space, and
+ * bad things happen. This is worked around in the same way as the
+ * Intel problem.
+ *
+ * With page table isolation enabled, we map the LDT in ... [stay tuned]
+ */
+static __always_inline unsigned long task_size_max(void)
+{
+ unsigned long ret;
+
+ alternative_io("movq %[small],%0","movq %[large],%0",
+ X86_FEATURE_LA57,
+ "=r" (ret),
+ [small] "i" ((1ul << 47)-PAGE_SIZE),
+ [large] "i" ((1ul << 56)-PAGE_SIZE));
+
+ return ret;
+}
+
+#endif /* !__ASSEMBLER__ */
+
+#ifdef CONFIG_X86_VSYSCALL_EMULATION
+# define __HAVE_ARCH_GATE_AREA 1
+#endif
+
#endif /* _ASM_X86_PAGE_64_H */
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 7639dbf5d223..7400dab373fe 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -1,75 +1,90 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PAGE_64_DEFS_H
#define _ASM_X86_PAGE_64_DEFS_H
-#define THREAD_ORDER 1
-#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
-#define CURRENT_MASK (~(THREAD_SIZE - 1))
+#ifndef __ASSEMBLER__
+#include <asm/kaslr.h>
+#endif
-#define EXCEPTION_STACK_ORDER 0
-#define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER)
+#ifdef CONFIG_KASAN
+#define KASAN_STACK_ORDER 1
+#else
+#define KASAN_STACK_ORDER 0
+#endif
-#define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1)
-#define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER)
+#define THREAD_SIZE_ORDER (2 + KASAN_STACK_ORDER)
+#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER)
-#define IRQ_STACK_ORDER 2
-#define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER)
+#define EXCEPTION_STACK_ORDER (1 + KASAN_STACK_ORDER)
+#define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER)
-#define STACKFAULT_STACK 1
-#define DOUBLEFAULT_STACK 2
-#define NMI_STACK 3
-#define DEBUG_STACK 4
-#define MCE_STACK 5
-#define N_EXCEPTION_STACKS 5 /* hw limit: 7 */
+#define IRQ_STACK_ORDER (2 + KASAN_STACK_ORDER)
+#define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER)
-#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT)
-#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1))
+/*
+ * The index for the tss.ist[] array. The hardware limit is 7 entries.
+ */
+#define IST_INDEX_DF 0
+#define IST_INDEX_NMI 1
+#define IST_INDEX_DB 2
+#define IST_INDEX_MCE 3
+#define IST_INDEX_VC 4
/*
* Set __PAGE_OFFSET to the most negative possible address +
- * PGDIR_SIZE*16 (pgd slot 272). The gap is to allow a space for a
- * hypervisor to fit. Choosing 16 slots here is arbitrary, but it's
- * what Xen requires.
+ * PGDIR_SIZE*17 (pgd slot 273).
+ *
+ * The gap is to allow a space for LDT remap for PTI (1 pgd slot) and space for
+ * a hypervisor (16 slots). Choosing 16 slots for a hypervisor is arbitrary,
+ * but it's what Xen requires.
*/
-#define __PAGE_OFFSET _AC(0xffff880000000000, UL)
+#define __PAGE_OFFSET_BASE_L5 _AC(0xff11000000000000, UL)
+#define __PAGE_OFFSET_BASE_L4 _AC(0xffff888000000000, UL)
-#define __PHYSICAL_START ((CONFIG_PHYSICAL_START + \
- (CONFIG_PHYSICAL_ALIGN - 1)) & \
- ~(CONFIG_PHYSICAL_ALIGN - 1))
+#define __PAGE_OFFSET page_offset_base
-#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
#define __START_KERNEL_map _AC(0xffffffff80000000, UL)
-/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
-#define __PHYSICAL_MASK_SHIFT 46
-#define __VIRTUAL_MASK_SHIFT 47
+/* See Documentation/arch/x86/x86_64/mm.rst for a description of the memory map. */
-/*
- * Kernel image size is limited to 512 MB (see level2_kernel_pgt in
- * arch/x86/kernel/head_64.S), and it is mapped here:
- */
-#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
-#define KERNEL_IMAGE_START _AC(0xffffffff80000000, UL)
-
-#ifndef __ASSEMBLY__
-void clear_page(void *page);
-void copy_page(void *to, void *from);
-
-/* duplicated to the one in bootmem.h */
-extern unsigned long max_pfn;
-extern unsigned long phys_base;
+#define __PHYSICAL_MASK_SHIFT 52
+#define __VIRTUAL_MASK_SHIFT (pgtable_l5_enabled() ? 56 : 47)
-extern unsigned long __phys_addr(unsigned long);
-#define __phys_reloc_hide(x) (x)
+#define TASK_SIZE_MAX task_size_max()
+#define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE)
-#define vmemmap ((struct page *)VMEMMAP_START)
+/* This decides where the kernel will search for a free chunk of vm
+ * space during mmap's.
+ */
+#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \
+ 0xc0000000 : 0xFFFFe000)
-extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
-extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
+#define TASK_SIZE_LOW (test_thread_flag(TIF_ADDR32) ? \
+ IA32_PAGE_OFFSET : DEFAULT_MAP_WINDOW)
+#define TASK_SIZE (test_thread_flag(TIF_ADDR32) ? \
+ IA32_PAGE_OFFSET : TASK_SIZE_MAX)
+#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_ADDR32)) ? \
+ IA32_PAGE_OFFSET : TASK_SIZE_MAX)
-#endif /* !__ASSEMBLY__ */
+#define STACK_TOP TASK_SIZE_LOW
+#define STACK_TOP_MAX TASK_SIZE_MAX
-#ifdef CONFIG_FLATMEM
-#define pfn_valid(pfn) ((pfn) < max_pfn)
+/*
+ * In spite of the name, KERNEL_IMAGE_SIZE is a limit on the maximum virtual
+ * address for the kernel image, rather than the limit on the size itself.
+ * This can be at most 1 GiB, due to the fixmap living in the next 1 GiB (see
+ * level2_kernel_pgt in arch/x86/kernel/head_64.S).
+ *
+ * On KASLR use 1 GiB by default, leaving 1 GiB for modules once the
+ * page tables are fully set up.
+ *
+ * If KASLR is disabled we can shrink it to 0.5 GiB and increase the size
+ * of the modules area to 1.5 GiB.
+ */
+#ifdef CONFIG_RANDOMIZE_BASE
+#define KERNEL_IMAGE_SIZE (1024 * 1024 * 1024)
+#else
+#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
#endif
#endif /* _ASM_X86_PAGE_64_DEFS_H */
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 6473f5ccff85..018a8d906ca3 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -1,23 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PAGE_DEFS_H
#define _ASM_X86_PAGE_DEFS_H
#include <linux/const.h>
+#include <linux/types.h>
+#include <linux/mem_encrypt.h>
-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT 12
-#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>
-#define __PHYSICAL_MASK ((phys_addr_t)(1ULL << __PHYSICAL_MASK_SHIFT) - 1)
#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
-/* Cast PAGE_MASK to a signed type so that it is sign-extended if
+/* Cast P*D_MASK to a signed type so that it is sign-extended if
virtual addresses are 32-bits but physical addresses are larger
(ie, 32-bit PAE). */
#define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK)
-
-#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT)
-#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1))
+#define PHYSICAL_PMD_PAGE_MASK (((signed long)PMD_MASK) & __PHYSICAL_MASK)
+#define PHYSICAL_PUD_PAGE_MASK (((signed long)PUD_MASK) & __PHYSICAL_MASK)
#define HPAGE_SHIFT PMD_SHIFT
#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT)
@@ -28,30 +26,44 @@
#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET)
-#define VM_DATA_DEFAULT_FLAGS \
- (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
- VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC
+
+/* Physical address where kernel should be loaded. */
+#define LOAD_PHYSICAL_ADDR __ALIGN_KERNEL_MASK(CONFIG_PHYSICAL_START, CONFIG_PHYSICAL_ALIGN - 1)
+
+#define __START_KERNEL (__START_KERNEL_map + LOAD_PHYSICAL_ADDR)
#ifdef CONFIG_X86_64
#include <asm/page_64_types.h>
+#define IOREMAP_MAX_ORDER (PUD_SHIFT)
#else
#include <asm/page_32_types.h>
+#define IOREMAP_MAX_ORDER (PMD_SHIFT)
#endif /* CONFIG_X86_64 */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
+
+#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
+extern phys_addr_t physical_mask;
+#define __PHYSICAL_MASK physical_mask
+#else
+#define __PHYSICAL_MASK ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1))
+#endif
-extern int page_is_ram(unsigned long pagenr);
extern int devmem_is_allowed(unsigned long pagenr);
extern unsigned long max_low_pfn_mapped;
extern unsigned long max_pfn_mapped;
-extern unsigned long init_memory_mapping(unsigned long start,
- unsigned long end);
+static inline phys_addr_t get_max_mapped(void)
+{
+ return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT;
+}
+
+bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn);
-extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
-extern void free_initmem(void);
+extern void initmem_init(void);
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* _ASM_X86_PAGE_DEFS_H */
diff --git a/arch/x86/include/asm/param.h b/arch/x86/include/asm/param.h
deleted file mode 100644
index 6f0d0422f4ca..000000000000
--- a/arch/x86/include/asm/param.h
+++ /dev/null
@@ -1,22 +0,0 @@
-#ifndef _ASM_X86_PARAM_H
-#define _ASM_X86_PARAM_H
-
-#ifdef __KERNEL__
-# define HZ CONFIG_HZ /* Internal kernel timer frequency */
-# define USER_HZ 100 /* some user interfaces are */
-# define CLOCKS_PER_SEC (USER_HZ) /* in "ticks" like times() */
-#endif
-
-#ifndef HZ
-#define HZ 100
-#endif
-
-#define EXEC_PAGESIZE 4096
-
-#ifndef NOGROUP
-#define NOGROUP (-1)
-#endif
-
-#define MAXHOSTNAMELEN 64 /* max length of hostname */
-
-#endif /* _ASM_X86_PARAM_H */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 4fb37c8a0832..b5e59a7ba0d0 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -1,1381 +1,504 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PARAVIRT_H
#define _ASM_X86_PARAVIRT_H
/* Various instructions on x86 need to be replaced for
* para-virtualization: those hooks are defined here. */
+#include <asm/paravirt_types.h>
+
+#ifndef __ASSEMBLER__
+struct mm_struct;
+#endif
+
#ifdef CONFIG_PARAVIRT
#include <asm/pgtable_types.h>
#include <asm/asm.h>
+#include <asm/nospec-branch.h>
-/* Bitmask of what can be clobbered: usually at least eax. */
-#define CLBR_NONE 0
-#define CLBR_EAX (1 << 0)
-#define CLBR_ECX (1 << 1)
-#define CLBR_EDX (1 << 2)
-#define CLBR_EDI (1 << 3)
-
-#ifdef CONFIG_X86_32
-/* CLBR_ANY should match all regs platform has. For i386, that's just it */
-#define CLBR_ANY ((1 << 4) - 1)
-
-#define CLBR_ARG_REGS (CLBR_EAX | CLBR_EDX | CLBR_ECX)
-#define CLBR_RET_REG (CLBR_EAX | CLBR_EDX)
-#define CLBR_SCRATCH (0)
-#else
-#define CLBR_RAX CLBR_EAX
-#define CLBR_RCX CLBR_ECX
-#define CLBR_RDX CLBR_EDX
-#define CLBR_RDI CLBR_EDI
-#define CLBR_RSI (1 << 4)
-#define CLBR_R8 (1 << 5)
-#define CLBR_R9 (1 << 6)
-#define CLBR_R10 (1 << 7)
-#define CLBR_R11 (1 << 8)
-
-#define CLBR_ANY ((1 << 9) - 1)
-
-#define CLBR_ARG_REGS (CLBR_RDI | CLBR_RSI | CLBR_RDX | \
- CLBR_RCX | CLBR_R8 | CLBR_R9)
-#define CLBR_RET_REG (CLBR_RAX)
-#define CLBR_SCRATCH (CLBR_R10 | CLBR_R11)
-
-#include <asm/desc_defs.h>
-#endif /* X86_64 */
-
-#define CLBR_CALLEE_SAVE ((CLBR_ARG_REGS | CLBR_SCRATCH) & ~CLBR_RET_REG)
-
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
+#include <linux/bug.h>
#include <linux/types.h>
#include <linux/cpumask.h>
-#include <asm/kmap_types.h>
-#include <asm/desc_defs.h>
+#include <linux/static_call_types.h>
+#include <asm/frame.h>
-struct page;
-struct thread_struct;
-struct desc_ptr;
-struct tss_struct;
-struct mm_struct;
-struct desc_struct;
-struct task_struct;
+u64 dummy_steal_clock(int cpu);
+u64 dummy_sched_clock(void);
-/*
- * Wrapper type for pointers to code which uses the non-standard
- * calling convention. See PV_CALL_SAVE_REGS_THUNK below.
- */
-struct paravirt_callee_save {
- void *func;
-};
-
-/* general info */
-struct pv_info {
- unsigned int kernel_rpl;
- int shared_kernel_pmd;
- int paravirt_enabled;
- const char *name;
-};
-
-struct pv_init_ops {
- /*
- * Patch may replace one of the defined code sequences with
- * arbitrary code, subject to the same register constraints.
- * This generally means the code is not free to clobber any
- * registers other than EAX. The patch function should return
- * the number of bytes of code generated, as we nop pad the
- * rest in generic code.
- */
- unsigned (*patch)(u8 type, u16 clobber, void *insnbuf,
- unsigned long addr, unsigned len);
-
- /* Basic arch-specific setup */
- void (*arch_setup)(void);
- char *(*memory_setup)(void);
- void (*post_allocator_init)(void);
-
- /* Print a banner to identify the environment */
- void (*banner)(void);
-};
-
-
-struct pv_lazy_ops {
- /* Set deferred update mode, used for batching operations. */
- void (*enter)(void);
- void (*leave)(void);
-};
-
-struct pv_time_ops {
- void (*time_init)(void);
-
- /* Set and set time of day */
- unsigned long (*get_wallclock)(void);
- int (*set_wallclock)(unsigned long);
-
- unsigned long long (*sched_clock)(void);
- unsigned long (*get_tsc_khz)(void);
-};
-
-struct pv_cpu_ops {
- /* hooks for various privileged instructions */
- unsigned long (*get_debugreg)(int regno);
- void (*set_debugreg)(int regno, unsigned long value);
-
- void (*clts)(void);
-
- unsigned long (*read_cr0)(void);
- void (*write_cr0)(unsigned long);
-
- unsigned long (*read_cr4_safe)(void);
- unsigned long (*read_cr4)(void);
- void (*write_cr4)(unsigned long);
+DECLARE_STATIC_CALL(pv_steal_clock, dummy_steal_clock);
+DECLARE_STATIC_CALL(pv_sched_clock, dummy_sched_clock);
-#ifdef CONFIG_X86_64
- unsigned long (*read_cr8)(void);
- void (*write_cr8)(unsigned long);
-#endif
+void paravirt_set_sched_clock(u64 (*func)(void));
- /* Segment descriptor handling */
- void (*load_tr_desc)(void);
- void (*load_gdt)(const struct desc_ptr *);
- void (*load_idt)(const struct desc_ptr *);
- void (*store_gdt)(struct desc_ptr *);
- void (*store_idt)(struct desc_ptr *);
- void (*set_ldt)(const void *desc, unsigned entries);
- unsigned long (*store_tr)(void);
- void (*load_tls)(struct thread_struct *t, unsigned int cpu);
-#ifdef CONFIG_X86_64
- void (*load_gs_index)(unsigned int idx);
-#endif
- void (*write_ldt_entry)(struct desc_struct *ldt, int entrynum,
- const void *desc);
- void (*write_gdt_entry)(struct desc_struct *,
- int entrynum, const void *desc, int size);
- void (*write_idt_entry)(gate_desc *,
- int entrynum, const gate_desc *gate);
- void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries);
- void (*free_ldt)(struct desc_struct *ldt, unsigned entries);
-
- void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t);
-
- void (*set_iopl_mask)(unsigned mask);
-
- void (*wbinvd)(void);
- void (*io_delay)(void);
-
- /* cpuid emulation, mostly so that caps bits can be disabled */
- void (*cpuid)(unsigned int *eax, unsigned int *ebx,
- unsigned int *ecx, unsigned int *edx);
-
- /* MSR, PMC and TSR operations.
- err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */
- u64 (*read_msr_amd)(unsigned int msr, int *err);
- u64 (*read_msr)(unsigned int msr, int *err);
- int (*write_msr)(unsigned int msr, unsigned low, unsigned high);
-
- u64 (*read_tsc)(void);
- u64 (*read_pmc)(int counter);
- unsigned long long (*read_tscp)(unsigned int *aux);
-
- /*
- * Atomically enable interrupts and return to userspace. This
- * is only ever used to return to 32-bit processes; in a
- * 64-bit kernel, it's used for 32-on-64 compat processes, but
- * never native 64-bit processes. (Jump, not call.)
- */
- void (*irq_enable_sysexit)(void);
-
- /*
- * Switch to usermode gs and return to 64-bit usermode using
- * sysret. Only used in 64-bit kernels to return to 64-bit
- * processes. Usermode register state, including %rsp, must
- * already be restored.
- */
- void (*usergs_sysret64)(void);
-
- /*
- * Switch to usermode gs and return to 32-bit usermode using
- * sysret. Used to return to 32-on-64 compat processes.
- * Other usermode register state, including %esp, must already
- * be restored.
- */
- void (*usergs_sysret32)(void);
-
- /* Normal iret. Jump to this with the standard iret stack
- frame set up. */
- void (*iret)(void);
-
- void (*swapgs)(void);
-
- void (*start_context_switch)(struct task_struct *prev);
- void (*end_context_switch)(struct task_struct *next);
-};
-
-struct pv_irq_ops {
- void (*init_IRQ)(void);
-
- /*
- * Get/set interrupt state. save_fl and restore_fl are only
- * expected to use X86_EFLAGS_IF; all other bits
- * returned from save_fl are undefined, and may be ignored by
- * restore_fl.
- *
- * NOTE: These functions callers expect the callee to preserve
- * more registers than the standard C calling convention.
- */
- struct paravirt_callee_save save_fl;
- struct paravirt_callee_save restore_fl;
- struct paravirt_callee_save irq_disable;
- struct paravirt_callee_save irq_enable;
-
- void (*safe_halt)(void);
- void (*halt)(void);
+static __always_inline u64 paravirt_sched_clock(void)
+{
+ return static_call(pv_sched_clock)();
+}
-#ifdef CONFIG_X86_64
- void (*adjust_exception_frame)(void);
-#endif
-};
+struct static_key;
+extern struct static_key paravirt_steal_enabled;
+extern struct static_key paravirt_steal_rq_enabled;
-struct pv_apic_ops {
-#ifdef CONFIG_X86_LOCAL_APIC
- void (*setup_boot_clock)(void);
- void (*setup_secondary_clock)(void);
+__visible void __native_queued_spin_unlock(struct qspinlock *lock);
+bool pv_is_native_spin_unlock(void);
+__visible bool __native_vcpu_is_preempted(long cpu);
+bool pv_is_native_vcpu_is_preempted(void);
- void (*startup_ipi_hook)(int phys_apicid,
- unsigned long start_eip,
- unsigned long start_esp);
+static inline u64 paravirt_steal_clock(int cpu)
+{
+ return static_call(pv_steal_clock)(cpu);
+}
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+void __init paravirt_set_cap(void);
#endif
-};
-
-struct pv_mmu_ops {
- /*
- * Called before/after init_mm pagetable setup. setup_start
- * may reset %cr3, and may pre-install parts of the pagetable;
- * pagetable setup is expected to preserve any existing
- * mapping.
- */
- void (*pagetable_setup_start)(pgd_t *pgd_base);
- void (*pagetable_setup_done)(pgd_t *pgd_base);
-
- unsigned long (*read_cr2)(void);
- void (*write_cr2)(unsigned long);
-
- unsigned long (*read_cr3)(void);
- void (*write_cr3)(unsigned long);
-
- /*
- * Hooks for intercepting the creation/use/destruction of an
- * mm_struct.
- */
- void (*activate_mm)(struct mm_struct *prev,
- struct mm_struct *next);
- void (*dup_mmap)(struct mm_struct *oldmm,
- struct mm_struct *mm);
- void (*exit_mmap)(struct mm_struct *mm);
-
-
- /* TLB operations */
- void (*flush_tlb_user)(void);
- void (*flush_tlb_kernel)(void);
- void (*flush_tlb_single)(unsigned long addr);
- void (*flush_tlb_others)(const struct cpumask *cpus,
- struct mm_struct *mm,
- unsigned long va);
-
- /* Hooks for allocating and freeing a pagetable top-level */
- int (*pgd_alloc)(struct mm_struct *mm);
- void (*pgd_free)(struct mm_struct *mm, pgd_t *pgd);
-
- /*
- * Hooks for allocating/releasing pagetable pages when they're
- * attached to a pagetable
- */
- void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn);
- void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn);
- void (*alloc_pmd_clone)(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count);
- void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn);
- void (*release_pte)(unsigned long pfn);
- void (*release_pmd)(unsigned long pfn);
- void (*release_pud)(unsigned long pfn);
-
- /* Pagetable manipulation functions */
- void (*set_pte)(pte_t *ptep, pte_t pteval);
- void (*set_pte_at)(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pteval);
- void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval);
- void (*pte_update)(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep);
- void (*pte_update_defer)(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep);
-
- pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep);
- void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pte);
-
- struct paravirt_callee_save pte_val;
- struct paravirt_callee_save make_pte;
-
- struct paravirt_callee_save pgd_val;
- struct paravirt_callee_save make_pgd;
-
-#if PAGETABLE_LEVELS >= 3
-#ifdef CONFIG_X86_PAE
- void (*set_pte_atomic)(pte_t *ptep, pte_t pteval);
- void (*pte_clear)(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep);
- void (*pmd_clear)(pmd_t *pmdp);
-
-#endif /* CONFIG_X86_PAE */
-
- void (*set_pud)(pud_t *pudp, pud_t pudval);
-
- struct paravirt_callee_save pmd_val;
- struct paravirt_callee_save make_pmd;
-
-#if PAGETABLE_LEVELS == 4
- struct paravirt_callee_save pud_val;
- struct paravirt_callee_save make_pud;
-
- void (*set_pgd)(pgd_t *pudp, pgd_t pgdval);
-#endif /* PAGETABLE_LEVELS == 4 */
-#endif /* PAGETABLE_LEVELS >= 3 */
-
-#ifdef CONFIG_HIGHPTE
- void *(*kmap_atomic_pte)(struct page *page, enum km_type type);
+
+/* The paravirtualized I/O functions */
+static inline void slow_down_io(void)
+{
+ PVOP_VCALL0(cpu.io_delay);
+#ifdef REALLY_SLOW_IO
+ PVOP_VCALL0(cpu.io_delay);
+ PVOP_VCALL0(cpu.io_delay);
+ PVOP_VCALL0(cpu.io_delay);
#endif
+}
- struct pv_lazy_ops lazy_mode;
-
- /* dom0 ops */
-
- /* Sometimes the physical address is a pfn, and sometimes its
- an mfn. We can tell which is which from the index. */
- void (*set_fixmap)(unsigned /* enum fixed_addresses */ idx,
- phys_addr_t phys, pgprot_t flags);
-};
-
-struct raw_spinlock;
-struct pv_lock_ops {
- int (*spin_is_locked)(struct raw_spinlock *lock);
- int (*spin_is_contended)(struct raw_spinlock *lock);
- void (*spin_lock)(struct raw_spinlock *lock);
- void (*spin_lock_flags)(struct raw_spinlock *lock, unsigned long flags);
- int (*spin_trylock)(struct raw_spinlock *lock);
- void (*spin_unlock)(struct raw_spinlock *lock);
-};
-
-/* This contains all the paravirt structures: we get a convenient
- * number for each function using the offset which we use to indicate
- * what to patch. */
-struct paravirt_patch_template {
- struct pv_init_ops pv_init_ops;
- struct pv_time_ops pv_time_ops;
- struct pv_cpu_ops pv_cpu_ops;
- struct pv_irq_ops pv_irq_ops;
- struct pv_apic_ops pv_apic_ops;
- struct pv_mmu_ops pv_mmu_ops;
- struct pv_lock_ops pv_lock_ops;
-};
-
-extern struct pv_info pv_info;
-extern struct pv_init_ops pv_init_ops;
-extern struct pv_time_ops pv_time_ops;
-extern struct pv_cpu_ops pv_cpu_ops;
-extern struct pv_irq_ops pv_irq_ops;
-extern struct pv_apic_ops pv_apic_ops;
-extern struct pv_mmu_ops pv_mmu_ops;
-extern struct pv_lock_ops pv_lock_ops;
-
-#define PARAVIRT_PATCH(x) \
- (offsetof(struct paravirt_patch_template, x) / sizeof(void *))
-
-#define paravirt_type(op) \
- [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \
- [paravirt_opptr] "i" (&(op))
-#define paravirt_clobber(clobber) \
- [paravirt_clobber] "i" (clobber)
+void native_flush_tlb_local(void);
+void native_flush_tlb_global(void);
+void native_flush_tlb_one_user(unsigned long addr);
+void native_flush_tlb_multi(const struct cpumask *cpumask,
+ const struct flush_tlb_info *info);
-/*
- * Generate some code, and mark it as patchable by the
- * apply_paravirt() alternate instruction patcher.
- */
-#define _paravirt_alt(insn_string, type, clobber) \
- "771:\n\t" insn_string "\n" "772:\n" \
- ".pushsection .parainstructions,\"a\"\n" \
- _ASM_ALIGN "\n" \
- _ASM_PTR " 771b\n" \
- " .byte " type "\n" \
- " .byte 772b-771b\n" \
- " .short " clobber "\n" \
- ".popsection\n"
-
-/* Generate patchable code, with the default asm parameters. */
-#define paravirt_alt(insn_string) \
- _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]")
-
-/* Simple instruction patching code. */
-#define DEF_NATIVE(ops, name, code) \
- extern const char start_##ops##_##name[], end_##ops##_##name[]; \
- asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":")
-
-unsigned paravirt_patch_nop(void);
-unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len);
-unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len);
-unsigned paravirt_patch_ignore(unsigned len);
-unsigned paravirt_patch_call(void *insnbuf,
- const void *target, u16 tgt_clobbers,
- unsigned long addr, u16 site_clobbers,
- unsigned len);
-unsigned paravirt_patch_jmp(void *insnbuf, const void *target,
- unsigned long addr, unsigned len);
-unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
- unsigned long addr, unsigned len);
-
-unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
- const char *start, const char *end);
-
-unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
- unsigned long addr, unsigned len);
-
-int paravirt_disable_iospace(void);
+static inline void __flush_tlb_local(void)
+{
+ PVOP_VCALL0(mmu.flush_tlb_user);
+}
-/*
- * This generates an indirect call based on the operation type number.
- * The type number, computed in PARAVIRT_PATCH, is derived from the
- * offset into the paravirt_patch_template structure, and can therefore be
- * freely converted back into a structure offset.
- */
-#define PARAVIRT_CALL "call *%c[paravirt_opptr];"
+static inline void __flush_tlb_global(void)
+{
+ PVOP_VCALL0(mmu.flush_tlb_kernel);
+}
-/*
- * These macros are intended to wrap calls through one of the paravirt
- * ops structs, so that they can be later identified and patched at
- * runtime.
- *
- * Normally, a call to a pv_op function is a simple indirect call:
- * (pv_op_struct.operations)(args...).
- *
- * Unfortunately, this is a relatively slow operation for modern CPUs,
- * because it cannot necessarily determine what the destination
- * address is. In this case, the address is a runtime constant, so at
- * the very least we can patch the call to e a simple direct call, or
- * ideally, patch an inline implementation into the callsite. (Direct
- * calls are essentially free, because the call and return addresses
- * are completely predictable.)
- *
- * For i386, these macros rely on the standard gcc "regparm(3)" calling
- * convention, in which the first three arguments are placed in %eax,
- * %edx, %ecx (in that order), and the remaining arguments are placed
- * on the stack. All caller-save registers (eax,edx,ecx) are expected
- * to be modified (either clobbered or used for return values).
- * X86_64, on the other hand, already specifies a register-based calling
- * conventions, returning at %rax, with parameteres going on %rdi, %rsi,
- * %rdx, and %rcx. Note that for this reason, x86_64 does not need any
- * special handling for dealing with 4 arguments, unlike i386.
- * However, x86_64 also have to clobber all caller saved registers, which
- * unfortunately, are quite a bit (r8 - r11)
- *
- * The call instruction itself is marked by placing its start address
- * and size into the .parainstructions section, so that
- * apply_paravirt() in arch/i386/kernel/alternative.c can do the
- * appropriate patching under the control of the backend pv_init_ops
- * implementation.
- *
- * Unfortunately there's no way to get gcc to generate the args setup
- * for the call, and then allow the call itself to be generated by an
- * inline asm. Because of this, we must do the complete arg setup and
- * return value handling from within these macros. This is fairly
- * cumbersome.
- *
- * There are 5 sets of PVOP_* macros for dealing with 0-4 arguments.
- * It could be extended to more arguments, but there would be little
- * to be gained from that. For each number of arguments, there are
- * the two VCALL and CALL variants for void and non-void functions.
- *
- * When there is a return value, the invoker of the macro must specify
- * the return type. The macro then uses sizeof() on that type to
- * determine whether its a 32 or 64 bit value, and places the return
- * in the right register(s) (just %eax for 32-bit, and %edx:%eax for
- * 64-bit). For x86_64 machines, it just returns at %rax regardless of
- * the return value size.
- *
- * 64-bit arguments are passed as a pair of adjacent 32-bit arguments
- * i386 also passes 64-bit arguments as a pair of adjacent 32-bit arguments
- * in low,high order
- *
- * Small structures are passed and returned in registers. The macro
- * calling convention can't directly deal with this, so the wrapper
- * functions must do this.
- *
- * These PVOP_* macros are only defined within this header. This
- * means that all uses must be wrapped in inline functions. This also
- * makes sure the incoming and outgoing types are always correct.
- */
-#ifdef CONFIG_X86_32
-#define PVOP_VCALL_ARGS \
- unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx
-#define PVOP_CALL_ARGS PVOP_VCALL_ARGS
-
-#define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x))
-#define PVOP_CALL_ARG2(x) "d" ((unsigned long)(x))
-#define PVOP_CALL_ARG3(x) "c" ((unsigned long)(x))
-
-#define PVOP_VCALL_CLOBBERS "=a" (__eax), "=d" (__edx), \
- "=c" (__ecx)
-#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS
-
-#define PVOP_VCALLEE_CLOBBERS "=a" (__eax), "=d" (__edx)
-#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS
-
-#define EXTRA_CLOBBERS
-#define VEXTRA_CLOBBERS
-#else /* CONFIG_X86_64 */
-#define PVOP_VCALL_ARGS \
- unsigned long __edi = __edi, __esi = __esi, \
- __edx = __edx, __ecx = __ecx
-#define PVOP_CALL_ARGS PVOP_VCALL_ARGS, __eax
-
-#define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x))
-#define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x))
-#define PVOP_CALL_ARG3(x) "d" ((unsigned long)(x))
-#define PVOP_CALL_ARG4(x) "c" ((unsigned long)(x))
-
-#define PVOP_VCALL_CLOBBERS "=D" (__edi), \
- "=S" (__esi), "=d" (__edx), \
- "=c" (__ecx)
-#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax)
-
-#define PVOP_VCALLEE_CLOBBERS "=a" (__eax)
-#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS
-
-#define EXTRA_CLOBBERS , "r8", "r9", "r10", "r11"
-#define VEXTRA_CLOBBERS , "rax", "r8", "r9", "r10", "r11"
-#endif /* CONFIG_X86_32 */
-
-#ifdef CONFIG_PARAVIRT_DEBUG
-#define PVOP_TEST_NULL(op) BUG_ON(op == NULL)
-#else
-#define PVOP_TEST_NULL(op) ((void)op)
-#endif
+static inline void __flush_tlb_one_user(unsigned long addr)
+{
+ PVOP_VCALL1(mmu.flush_tlb_one_user, addr);
+}
-#define ____PVOP_CALL(rettype, op, clbr, call_clbr, extra_clbr, \
- pre, post, ...) \
- ({ \
- rettype __ret; \
- PVOP_CALL_ARGS; \
- PVOP_TEST_NULL(op); \
- /* This is 32-bit specific, but is okay in 64-bit */ \
- /* since this condition will never hold */ \
- if (sizeof(rettype) > sizeof(unsigned long)) { \
- asm volatile(pre \
- paravirt_alt(PARAVIRT_CALL) \
- post \
- : call_clbr \
- : paravirt_type(op), \
- paravirt_clobber(clbr), \
- ##__VA_ARGS__ \
- : "memory", "cc" extra_clbr); \
- __ret = (rettype)((((u64)__edx) << 32) | __eax); \
- } else { \
- asm volatile(pre \
- paravirt_alt(PARAVIRT_CALL) \
- post \
- : call_clbr \
- : paravirt_type(op), \
- paravirt_clobber(clbr), \
- ##__VA_ARGS__ \
- : "memory", "cc" extra_clbr); \
- __ret = (rettype)__eax; \
- } \
- __ret; \
- })
-
-#define __PVOP_CALL(rettype, op, pre, post, ...) \
- ____PVOP_CALL(rettype, op, CLBR_ANY, PVOP_CALL_CLOBBERS, \
- EXTRA_CLOBBERS, pre, post, ##__VA_ARGS__)
-
-#define __PVOP_CALLEESAVE(rettype, op, pre, post, ...) \
- ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \
- PVOP_CALLEE_CLOBBERS, , \
- pre, post, ##__VA_ARGS__)
-
-
-#define ____PVOP_VCALL(op, clbr, call_clbr, extra_clbr, pre, post, ...) \
- ({ \
- PVOP_VCALL_ARGS; \
- PVOP_TEST_NULL(op); \
- asm volatile(pre \
- paravirt_alt(PARAVIRT_CALL) \
- post \
- : call_clbr \
- : paravirt_type(op), \
- paravirt_clobber(clbr), \
- ##__VA_ARGS__ \
- : "memory", "cc" extra_clbr); \
- })
-
-#define __PVOP_VCALL(op, pre, post, ...) \
- ____PVOP_VCALL(op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \
- VEXTRA_CLOBBERS, \
- pre, post, ##__VA_ARGS__)
-
-#define __PVOP_VCALLEESAVE(rettype, op, pre, post, ...) \
- ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \
- PVOP_VCALLEE_CLOBBERS, , \
- pre, post, ##__VA_ARGS__)
-
-
-
-#define PVOP_CALL0(rettype, op) \
- __PVOP_CALL(rettype, op, "", "")
-#define PVOP_VCALL0(op) \
- __PVOP_VCALL(op, "", "")
-
-#define PVOP_CALLEE0(rettype, op) \
- __PVOP_CALLEESAVE(rettype, op, "", "")
-#define PVOP_VCALLEE0(op) \
- __PVOP_VCALLEESAVE(op, "", "")
-
-
-#define PVOP_CALL1(rettype, op, arg1) \
- __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1))
-#define PVOP_VCALL1(op, arg1) \
- __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1))
-
-#define PVOP_CALLEE1(rettype, op, arg1) \
- __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1))
-#define PVOP_VCALLEE1(op, arg1) \
- __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1))
-
-
-#define PVOP_CALL2(rettype, op, arg1, arg2) \
- __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \
- PVOP_CALL_ARG2(arg2))
-#define PVOP_VCALL2(op, arg1, arg2) \
- __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \
- PVOP_CALL_ARG2(arg2))
-
-#define PVOP_CALLEE2(rettype, op, arg1, arg2) \
- __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \
- PVOP_CALL_ARG2(arg2))
-#define PVOP_VCALLEE2(op, arg1, arg2) \
- __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1), \
- PVOP_CALL_ARG2(arg2))
-
-
-#define PVOP_CALL3(rettype, op, arg1, arg2, arg3) \
- __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \
- PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3))
-#define PVOP_VCALL3(op, arg1, arg2, arg3) \
- __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \
- PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3))
-
-/* This is the only difference in x86_64. We can make it much simpler */
-#ifdef CONFIG_X86_32
-#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \
- __PVOP_CALL(rettype, op, \
- "push %[_arg4];", "lea 4(%%esp),%%esp;", \
- PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
- PVOP_CALL_ARG3(arg3), [_arg4] "mr" ((u32)(arg4)))
-#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \
- __PVOP_VCALL(op, \
- "push %[_arg4];", "lea 4(%%esp),%%esp;", \
- "0" ((u32)(arg1)), "1" ((u32)(arg2)), \
- "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4)))
-#else
-#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \
- __PVOP_CALL(rettype, op, "", "", \
- PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
- PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4))
-#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \
- __PVOP_VCALL(op, "", "", \
- PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
- PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4))
-#endif
+static inline void __flush_tlb_multi(const struct cpumask *cpumask,
+ const struct flush_tlb_info *info)
+{
+ PVOP_VCALL2(mmu.flush_tlb_multi, cpumask, info);
+}
-static inline int paravirt_enabled(void)
+static inline void paravirt_arch_exit_mmap(struct mm_struct *mm)
{
- return pv_info.paravirt_enabled;
+ PVOP_VCALL1(mmu.exit_mmap, mm);
}
-static inline void load_sp0(struct tss_struct *tss,
- struct thread_struct *thread)
+static inline void notify_page_enc_status_changed(unsigned long pfn,
+ int npages, bool enc)
{
- PVOP_VCALL2(pv_cpu_ops.load_sp0, tss, thread);
+ PVOP_VCALL3(mmu.notify_page_enc_status_changed, pfn, npages, enc);
}
-#define ARCH_SETUP pv_init_ops.arch_setup();
-static inline unsigned long get_wallclock(void)
+static __always_inline void arch_safe_halt(void)
{
- return PVOP_CALL0(unsigned long, pv_time_ops.get_wallclock);
+ PVOP_VCALL0(irq.safe_halt);
}
-static inline int set_wallclock(unsigned long nowtime)
+static inline void halt(void)
{
- return PVOP_CALL1(int, pv_time_ops.set_wallclock, nowtime);
+ PVOP_VCALL0(irq.halt);
}
-static inline void (*choose_time_init(void))(void)
+#ifdef CONFIG_PARAVIRT_XXL
+static inline void load_sp0(unsigned long sp0)
{
- return pv_time_ops.time_init;
+ PVOP_VCALL1(cpu.load_sp0, sp0);
}
/* The paravirtualized CPUID instruction. */
static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
unsigned int *ecx, unsigned int *edx)
{
- PVOP_VCALL4(pv_cpu_ops.cpuid, eax, ebx, ecx, edx);
+ PVOP_VCALL4(cpu.cpuid, eax, ebx, ecx, edx);
}
/*
* These special macros can be used to get or set a debugging register
*/
-static inline unsigned long paravirt_get_debugreg(int reg)
+static __always_inline unsigned long paravirt_get_debugreg(int reg)
{
- return PVOP_CALL1(unsigned long, pv_cpu_ops.get_debugreg, reg);
+ return PVOP_CALL1(unsigned long, cpu.get_debugreg, reg);
}
#define get_debugreg(var, reg) var = paravirt_get_debugreg(reg)
-static inline void set_debugreg(unsigned long val, int reg)
-{
- PVOP_VCALL2(pv_cpu_ops.set_debugreg, reg, val);
-}
-
-static inline void clts(void)
+static __always_inline void set_debugreg(unsigned long val, int reg)
{
- PVOP_VCALL0(pv_cpu_ops.clts);
+ PVOP_VCALL2(cpu.set_debugreg, reg, val);
}
static inline unsigned long read_cr0(void)
{
- return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr0);
+ return PVOP_CALL0(unsigned long, cpu.read_cr0);
}
static inline void write_cr0(unsigned long x)
{
- PVOP_VCALL1(pv_cpu_ops.write_cr0, x);
+ PVOP_VCALL1(cpu.write_cr0, x);
}
-static inline unsigned long read_cr2(void)
+static __always_inline unsigned long read_cr2(void)
{
- return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr2);
+ return PVOP_ALT_CALLEE0(unsigned long, mmu.read_cr2,
+ "mov %%cr2, %%rax;", ALT_NOT_XEN);
}
-static inline void write_cr2(unsigned long x)
+static __always_inline void write_cr2(unsigned long x)
{
- PVOP_VCALL1(pv_mmu_ops.write_cr2, x);
+ PVOP_VCALL1(mmu.write_cr2, x);
}
-static inline unsigned long read_cr3(void)
+static inline unsigned long __read_cr3(void)
{
- return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3);
+ return PVOP_ALT_CALL0(unsigned long, mmu.read_cr3,
+ "mov %%cr3, %%rax;", ALT_NOT_XEN);
}
static inline void write_cr3(unsigned long x)
{
- PVOP_VCALL1(pv_mmu_ops.write_cr3, x);
-}
-
-static inline unsigned long read_cr4(void)
-{
- return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4);
-}
-static inline unsigned long read_cr4_safe(void)
-{
- return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4_safe);
-}
-
-static inline void write_cr4(unsigned long x)
-{
- PVOP_VCALL1(pv_cpu_ops.write_cr4, x);
-}
-
-#ifdef CONFIG_X86_64
-static inline unsigned long read_cr8(void)
-{
- return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr8);
+ PVOP_ALT_VCALL1(mmu.write_cr3, x, "mov %%rdi, %%cr3", ALT_NOT_XEN);
}
-static inline void write_cr8(unsigned long x)
+static inline void __write_cr4(unsigned long x)
{
- PVOP_VCALL1(pv_cpu_ops.write_cr8, x);
+ PVOP_VCALL1(cpu.write_cr4, x);
}
-#endif
-static inline void raw_safe_halt(void)
+static inline u64 paravirt_read_msr(u32 msr)
{
- PVOP_VCALL0(pv_irq_ops.safe_halt);
+ return PVOP_CALL1(u64, cpu.read_msr, msr);
}
-static inline void halt(void)
+static inline void paravirt_write_msr(u32 msr, u64 val)
{
- PVOP_VCALL0(pv_irq_ops.safe_halt);
+ PVOP_VCALL2(cpu.write_msr, msr, val);
}
-static inline void wbinvd(void)
+static inline int paravirt_read_msr_safe(u32 msr, u64 *val)
{
- PVOP_VCALL0(pv_cpu_ops.wbinvd);
+ return PVOP_CALL2(int, cpu.read_msr_safe, msr, val);
}
-#define get_kernel_rpl() (pv_info.kernel_rpl)
-
-static inline u64 paravirt_read_msr(unsigned msr, int *err)
-{
- return PVOP_CALL2(u64, pv_cpu_ops.read_msr, msr, err);
-}
-static inline u64 paravirt_read_msr_amd(unsigned msr, int *err)
-{
- return PVOP_CALL2(u64, pv_cpu_ops.read_msr_amd, msr, err);
-}
-static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high)
+static inline int paravirt_write_msr_safe(u32 msr, u64 val)
{
- return PVOP_CALL3(int, pv_cpu_ops.write_msr, msr, low, high);
+ return PVOP_CALL2(int, cpu.write_msr_safe, msr, val);
}
-/* These should all do BUG_ON(_err), but our headers are too tangled. */
#define rdmsr(msr, val1, val2) \
do { \
- int _err; \
- u64 _l = paravirt_read_msr(msr, &_err); \
+ u64 _l = paravirt_read_msr(msr); \
val1 = (u32)_l; \
val2 = _l >> 32; \
} while (0)
-#define wrmsr(msr, val1, val2) \
-do { \
- paravirt_write_msr(msr, val1, val2); \
-} while (0)
-
-#define rdmsrl(msr, val) \
-do { \
- int _err; \
- val = paravirt_read_msr(msr, &_err); \
-} while (0)
-
-#define wrmsrl(msr, val) wrmsr(msr, (u32)((u64)(val)), ((u64)(val))>>32)
-#define wrmsr_safe(msr, a, b) paravirt_write_msr(msr, a, b)
-
-/* rdmsr with exception handling */
-#define rdmsr_safe(msr, a, b) \
-({ \
- int _err; \
- u64 _l = paravirt_read_msr(msr, &_err); \
- (*a) = (u32)_l; \
- (*b) = _l >> 32; \
- _err; \
-})
-
-static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
-{
- int err;
-
- *p = paravirt_read_msr(msr, &err);
- return err;
-}
-static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
-{
- int err;
-
- *p = paravirt_read_msr_amd(msr, &err);
- return err;
-}
-
-static inline u64 paravirt_read_tsc(void)
+static __always_inline void wrmsr(u32 msr, u32 low, u32 high)
{
- return PVOP_CALL0(u64, pv_cpu_ops.read_tsc);
+ paravirt_write_msr(msr, (u64)high << 32 | low);
}
-#define rdtscl(low) \
+#define rdmsrq(msr, val) \
do { \
- u64 _l = paravirt_read_tsc(); \
- low = (int)_l; \
+ val = paravirt_read_msr(msr); \
} while (0)
-#define rdtscll(val) (val = paravirt_read_tsc())
-
-static inline unsigned long long paravirt_sched_clock(void)
+static inline void wrmsrq(u32 msr, u64 val)
{
- return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock);
+ paravirt_write_msr(msr, val);
}
-#define calibrate_tsc() (pv_time_ops.get_tsc_khz())
-static inline unsigned long long paravirt_read_pmc(int counter)
+static inline int wrmsrq_safe(u32 msr, u64 val)
{
- return PVOP_CALL1(u64, pv_cpu_ops.read_pmc, counter);
+ return paravirt_write_msr_safe(msr, val);
}
-#define rdpmc(counter, low, high) \
-do { \
- u64 _l = paravirt_read_pmc(counter); \
- low = (u32)_l; \
- high = _l >> 32; \
-} while (0)
+/* rdmsr with exception handling */
+#define rdmsr_safe(msr, a, b) \
+({ \
+ u64 _l; \
+ int _err = paravirt_read_msr_safe((msr), &_l); \
+ (*a) = (u32)_l; \
+ (*b) = (u32)(_l >> 32); \
+ _err; \
+})
-static inline unsigned long long paravirt_rdtscp(unsigned int *aux)
+static __always_inline int rdmsrq_safe(u32 msr, u64 *p)
{
- return PVOP_CALL1(u64, pv_cpu_ops.read_tscp, aux);
+ return paravirt_read_msr_safe(msr, p);
}
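A short usage sketch of the MSR wrappers above; MSR_EFER is only an example register and the fallback value is illustrative:

/* Illustrative only: prefer the _safe variant when the MSR may be missing,
 * since it reports the #GP as an error instead of warning or panicking. */
static u64 example_read_efer(void)
{
	u64 val;

	if (rdmsrq_safe(MSR_EFER, &val))
		return 0;	/* MSR not implemented on this CPU */

	return val;
}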
-#define rdtscp(low, high, aux) \
-do { \
- int __aux; \
- unsigned long __val = paravirt_rdtscp(&__aux); \
- (low) = (u32)__val; \
- (high) = (u32)(__val >> 32); \
- (aux) = __aux; \
-} while (0)
-
-#define rdtscpll(val, aux) \
-do { \
- unsigned long __aux; \
- val = paravirt_rdtscp(&__aux); \
- (aux) = __aux; \
-} while (0)
+static __always_inline u64 rdpmc(int counter)
+{
+ return PVOP_CALL1(u64, cpu.read_pmc, counter);
+}
static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
{
- PVOP_VCALL2(pv_cpu_ops.alloc_ldt, ldt, entries);
+ PVOP_VCALL2(cpu.alloc_ldt, ldt, entries);
}
static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries)
{
- PVOP_VCALL2(pv_cpu_ops.free_ldt, ldt, entries);
+ PVOP_VCALL2(cpu.free_ldt, ldt, entries);
}
static inline void load_TR_desc(void)
{
- PVOP_VCALL0(pv_cpu_ops.load_tr_desc);
+ PVOP_VCALL0(cpu.load_tr_desc);
}
static inline void load_gdt(const struct desc_ptr *dtr)
{
- PVOP_VCALL1(pv_cpu_ops.load_gdt, dtr);
+ PVOP_VCALL1(cpu.load_gdt, dtr);
}
static inline void load_idt(const struct desc_ptr *dtr)
{
- PVOP_VCALL1(pv_cpu_ops.load_idt, dtr);
+ PVOP_VCALL1(cpu.load_idt, dtr);
}
static inline void set_ldt(const void *addr, unsigned entries)
{
- PVOP_VCALL2(pv_cpu_ops.set_ldt, addr, entries);
-}
-static inline void store_gdt(struct desc_ptr *dtr)
-{
- PVOP_VCALL1(pv_cpu_ops.store_gdt, dtr);
-}
-static inline void store_idt(struct desc_ptr *dtr)
-{
- PVOP_VCALL1(pv_cpu_ops.store_idt, dtr);
+ PVOP_VCALL2(cpu.set_ldt, addr, entries);
}
static inline unsigned long paravirt_store_tr(void)
{
- return PVOP_CALL0(unsigned long, pv_cpu_ops.store_tr);
+ return PVOP_CALL0(unsigned long, cpu.store_tr);
}
+
#define store_tr(tr) ((tr) = paravirt_store_tr())
static inline void load_TLS(struct thread_struct *t, unsigned cpu)
{
- PVOP_VCALL2(pv_cpu_ops.load_tls, t, cpu);
+ PVOP_VCALL2(cpu.load_tls, t, cpu);
}
-#ifdef CONFIG_X86_64
static inline void load_gs_index(unsigned int gs)
{
- PVOP_VCALL1(pv_cpu_ops.load_gs_index, gs);
+ PVOP_VCALL1(cpu.load_gs_index, gs);
}
-#endif
static inline void write_ldt_entry(struct desc_struct *dt, int entry,
const void *desc)
{
- PVOP_VCALL3(pv_cpu_ops.write_ldt_entry, dt, entry, desc);
+ PVOP_VCALL3(cpu.write_ldt_entry, dt, entry, desc);
}
static inline void write_gdt_entry(struct desc_struct *dt, int entry,
void *desc, int type)
{
- PVOP_VCALL4(pv_cpu_ops.write_gdt_entry, dt, entry, desc, type);
+ PVOP_VCALL4(cpu.write_gdt_entry, dt, entry, desc, type);
}
static inline void write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
{
- PVOP_VCALL3(pv_cpu_ops.write_idt_entry, dt, entry, g);
-}
-static inline void set_iopl_mask(unsigned mask)
-{
- PVOP_VCALL1(pv_cpu_ops.set_iopl_mask, mask);
-}
-
-/* The paravirtualized I/O functions */
-static inline void slow_down_io(void)
-{
- pv_cpu_ops.io_delay();
-#ifdef REALLY_SLOW_IO
- pv_cpu_ops.io_delay();
- pv_cpu_ops.io_delay();
- pv_cpu_ops.io_delay();
-#endif
-}
-
-#ifdef CONFIG_X86_LOCAL_APIC
-static inline void setup_boot_clock(void)
-{
- PVOP_VCALL0(pv_apic_ops.setup_boot_clock);
-}
-
-static inline void setup_secondary_clock(void)
-{
- PVOP_VCALL0(pv_apic_ops.setup_secondary_clock);
-}
-#endif
-
-static inline void paravirt_post_allocator_init(void)
-{
- if (pv_init_ops.post_allocator_init)
- (*pv_init_ops.post_allocator_init)();
+ PVOP_VCALL3(cpu.write_idt_entry, dt, entry, g);
}
-static inline void paravirt_pagetable_setup_start(pgd_t *base)
+#ifdef CONFIG_X86_IOPL_IOPERM
+static inline void tss_invalidate_io_bitmap(void)
{
- (*pv_mmu_ops.pagetable_setup_start)(base);
+ PVOP_VCALL0(cpu.invalidate_io_bitmap);
}
-static inline void paravirt_pagetable_setup_done(pgd_t *base)
+static inline void tss_update_io_bitmap(void)
{
- (*pv_mmu_ops.pagetable_setup_done)(base);
-}
-
-#ifdef CONFIG_SMP
-static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip,
- unsigned long start_esp)
-{
- PVOP_VCALL3(pv_apic_ops.startup_ipi_hook,
- phys_apicid, start_eip, start_esp);
+ PVOP_VCALL0(cpu.update_io_bitmap);
}
#endif
-static inline void paravirt_activate_mm(struct mm_struct *prev,
- struct mm_struct *next)
-{
- PVOP_VCALL2(pv_mmu_ops.activate_mm, prev, next);
-}
-
-static inline void arch_dup_mmap(struct mm_struct *oldmm,
- struct mm_struct *mm)
-{
- PVOP_VCALL2(pv_mmu_ops.dup_mmap, oldmm, mm);
-}
-
-static inline void arch_exit_mmap(struct mm_struct *mm)
-{
- PVOP_VCALL1(pv_mmu_ops.exit_mmap, mm);
-}
-
-static inline void __flush_tlb(void)
-{
- PVOP_VCALL0(pv_mmu_ops.flush_tlb_user);
-}
-static inline void __flush_tlb_global(void)
-{
- PVOP_VCALL0(pv_mmu_ops.flush_tlb_kernel);
-}
-static inline void __flush_tlb_single(unsigned long addr)
-{
- PVOP_VCALL1(pv_mmu_ops.flush_tlb_single, addr);
-}
-
-static inline void flush_tlb_others(const struct cpumask *cpumask,
- struct mm_struct *mm,
- unsigned long va)
+static inline void paravirt_enter_mmap(struct mm_struct *next)
{
- PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, cpumask, mm, va);
+ PVOP_VCALL1(mmu.enter_mmap, next);
}
static inline int paravirt_pgd_alloc(struct mm_struct *mm)
{
- return PVOP_CALL1(int, pv_mmu_ops.pgd_alloc, mm);
+ return PVOP_CALL1(int, mmu.pgd_alloc, mm);
}
static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
- PVOP_VCALL2(pv_mmu_ops.pgd_free, mm, pgd);
+ PVOP_VCALL2(mmu.pgd_free, mm, pgd);
}
static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn)
{
- PVOP_VCALL2(pv_mmu_ops.alloc_pte, mm, pfn);
+ PVOP_VCALL2(mmu.alloc_pte, mm, pfn);
}
static inline void paravirt_release_pte(unsigned long pfn)
{
- PVOP_VCALL1(pv_mmu_ops.release_pte, pfn);
+ PVOP_VCALL1(mmu.release_pte, pfn);
}
static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
{
- PVOP_VCALL2(pv_mmu_ops.alloc_pmd, mm, pfn);
+ PVOP_VCALL2(mmu.alloc_pmd, mm, pfn);
}
-static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
- unsigned long start, unsigned long count)
-{
- PVOP_VCALL4(pv_mmu_ops.alloc_pmd_clone, pfn, clonepfn, start, count);
-}
static inline void paravirt_release_pmd(unsigned long pfn)
{
- PVOP_VCALL1(pv_mmu_ops.release_pmd, pfn);
+ PVOP_VCALL1(mmu.release_pmd, pfn);
}
static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn)
{
- PVOP_VCALL2(pv_mmu_ops.alloc_pud, mm, pfn);
+ PVOP_VCALL2(mmu.alloc_pud, mm, pfn);
}
static inline void paravirt_release_pud(unsigned long pfn)
{
- PVOP_VCALL1(pv_mmu_ops.release_pud, pfn);
-}
-
-#ifdef CONFIG_HIGHPTE
-static inline void *kmap_atomic_pte(struct page *page, enum km_type type)
-{
- unsigned long ret;
- ret = PVOP_CALL2(unsigned long, pv_mmu_ops.kmap_atomic_pte, page, type);
- return (void *)ret;
+ PVOP_VCALL1(mmu.release_pud, pfn);
}
-#endif
-static inline void pte_update(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep)
+static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn)
{
- PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep);
+ PVOP_VCALL2(mmu.alloc_p4d, mm, pfn);
}
-static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep)
+static inline void paravirt_release_p4d(unsigned long pfn)
{
- PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep);
+ PVOP_VCALL1(mmu.release_p4d, pfn);
}
static inline pte_t __pte(pteval_t val)
{
- pteval_t ret;
-
- if (sizeof(pteval_t) > sizeof(long))
- ret = PVOP_CALLEE2(pteval_t,
- pv_mmu_ops.make_pte,
- val, (u64)val >> 32);
- else
- ret = PVOP_CALLEE1(pteval_t,
- pv_mmu_ops.make_pte,
- val);
-
- return (pte_t) { .pte = ret };
+ return (pte_t) { PVOP_ALT_CALLEE1(pteval_t, mmu.make_pte, val,
+ "mov %%rdi, %%rax", ALT_NOT_XEN) };
}
static inline pteval_t pte_val(pte_t pte)
{
- pteval_t ret;
-
- if (sizeof(pteval_t) > sizeof(long))
- ret = PVOP_CALLEE2(pteval_t, pv_mmu_ops.pte_val,
- pte.pte, (u64)pte.pte >> 32);
- else
- ret = PVOP_CALLEE1(pteval_t, pv_mmu_ops.pte_val,
- pte.pte);
-
- return ret;
+ return PVOP_ALT_CALLEE1(pteval_t, mmu.pte_val, pte.pte,
+ "mov %%rdi, %%rax", ALT_NOT_XEN);
}
static inline pgd_t __pgd(pgdval_t val)
{
- pgdval_t ret;
-
- if (sizeof(pgdval_t) > sizeof(long))
- ret = PVOP_CALLEE2(pgdval_t, pv_mmu_ops.make_pgd,
- val, (u64)val >> 32);
- else
- ret = PVOP_CALLEE1(pgdval_t, pv_mmu_ops.make_pgd,
- val);
-
- return (pgd_t) { ret };
+ return (pgd_t) { PVOP_ALT_CALLEE1(pgdval_t, mmu.make_pgd, val,
+ "mov %%rdi, %%rax", ALT_NOT_XEN) };
}
static inline pgdval_t pgd_val(pgd_t pgd)
{
- pgdval_t ret;
-
- if (sizeof(pgdval_t) > sizeof(long))
- ret = PVOP_CALLEE2(pgdval_t, pv_mmu_ops.pgd_val,
- pgd.pgd, (u64)pgd.pgd >> 32);
- else
- ret = PVOP_CALLEE1(pgdval_t, pv_mmu_ops.pgd_val,
- pgd.pgd);
-
- return ret;
+ return PVOP_ALT_CALLEE1(pgdval_t, mmu.pgd_val, pgd.pgd,
+ "mov %%rdi, %%rax", ALT_NOT_XEN);
}
#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
-static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr,
+static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
pte_t *ptep)
{
pteval_t ret;
- ret = PVOP_CALL3(pteval_t, pv_mmu_ops.ptep_modify_prot_start,
- mm, addr, ptep);
+ ret = PVOP_CALL3(pteval_t, mmu.ptep_modify_prot_start, vma, addr, ptep);
return (pte_t) { .pte = ret };
}
-static inline void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pte)
+static inline void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, pte_t old_pte, pte_t pte)
{
- if (sizeof(pteval_t) > sizeof(long))
- /* 5 arg words */
- pv_mmu_ops.ptep_modify_prot_commit(mm, addr, ptep, pte);
- else
- PVOP_VCALL4(pv_mmu_ops.ptep_modify_prot_commit,
- mm, addr, ptep, pte.pte);
-}
-static inline void set_pte(pte_t *ptep, pte_t pte)
-{
- if (sizeof(pteval_t) > sizeof(long))
- PVOP_VCALL3(pv_mmu_ops.set_pte, ptep,
- pte.pte, (u64)pte.pte >> 32);
- else
- PVOP_VCALL2(pv_mmu_ops.set_pte, ptep,
- pte.pte);
+ PVOP_VCALL4(mmu.ptep_modify_prot_commit, vma, addr, ptep, pte.pte);
}
-static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pte)
+static inline void set_pte(pte_t *ptep, pte_t pte)
{
- if (sizeof(pteval_t) > sizeof(long))
- /* 5 arg words */
- pv_mmu_ops.set_pte_at(mm, addr, ptep, pte);
- else
- PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte);
+ PVOP_VCALL2(mmu.set_pte, ptep, pte.pte);
}
static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
{
- pmdval_t val = native_pmd_val(pmd);
-
- if (sizeof(pmdval_t) > sizeof(long))
- PVOP_VCALL3(pv_mmu_ops.set_pmd, pmdp, val, (u64)val >> 32);
- else
- PVOP_VCALL2(pv_mmu_ops.set_pmd, pmdp, val);
+ PVOP_VCALL2(mmu.set_pmd, pmdp, native_pmd_val(pmd));
}
-#if PAGETABLE_LEVELS >= 3
static inline pmd_t __pmd(pmdval_t val)
{
- pmdval_t ret;
-
- if (sizeof(pmdval_t) > sizeof(long))
- ret = PVOP_CALLEE2(pmdval_t, pv_mmu_ops.make_pmd,
- val, (u64)val >> 32);
- else
- ret = PVOP_CALLEE1(pmdval_t, pv_mmu_ops.make_pmd,
- val);
-
- return (pmd_t) { ret };
+ return (pmd_t) { PVOP_ALT_CALLEE1(pmdval_t, mmu.make_pmd, val,
+ "mov %%rdi, %%rax", ALT_NOT_XEN) };
}
static inline pmdval_t pmd_val(pmd_t pmd)
{
- pmdval_t ret;
-
- if (sizeof(pmdval_t) > sizeof(long))
- ret = PVOP_CALLEE2(pmdval_t, pv_mmu_ops.pmd_val,
- pmd.pmd, (u64)pmd.pmd >> 32);
- else
- ret = PVOP_CALLEE1(pmdval_t, pv_mmu_ops.pmd_val,
- pmd.pmd);
-
- return ret;
+ return PVOP_ALT_CALLEE1(pmdval_t, mmu.pmd_val, pmd.pmd,
+ "mov %%rdi, %%rax", ALT_NOT_XEN);
}
static inline void set_pud(pud_t *pudp, pud_t pud)
{
- pudval_t val = native_pud_val(pud);
-
- if (sizeof(pudval_t) > sizeof(long))
- PVOP_VCALL3(pv_mmu_ops.set_pud, pudp,
- val, (u64)val >> 32);
- else
- PVOP_VCALL2(pv_mmu_ops.set_pud, pudp,
- val);
+ PVOP_VCALL2(mmu.set_pud, pudp, native_pud_val(pud));
}
-#if PAGETABLE_LEVELS == 4
+
static inline pud_t __pud(pudval_t val)
{
pudval_t ret;
- if (sizeof(pudval_t) > sizeof(long))
- ret = PVOP_CALLEE2(pudval_t, pv_mmu_ops.make_pud,
- val, (u64)val >> 32);
- else
- ret = PVOP_CALLEE1(pudval_t, pv_mmu_ops.make_pud,
- val);
+ ret = PVOP_ALT_CALLEE1(pudval_t, mmu.make_pud, val,
+ "mov %%rdi, %%rax", ALT_NOT_XEN);
return (pud_t) { ret };
}
static inline pudval_t pud_val(pud_t pud)
{
- pudval_t ret;
-
- if (sizeof(pudval_t) > sizeof(long))
- ret = PVOP_CALLEE2(pudval_t, pv_mmu_ops.pud_val,
- pud.pud, (u64)pud.pud >> 32);
- else
- ret = PVOP_CALLEE1(pudval_t, pv_mmu_ops.pud_val,
- pud.pud);
-
- return ret;
+ return PVOP_ALT_CALLEE1(pudval_t, mmu.pud_val, pud.pud,
+ "mov %%rdi, %%rax", ALT_NOT_XEN);
}
-static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
+static inline void pud_clear(pud_t *pudp)
{
- pgdval_t val = native_pgd_val(pgd);
-
- if (sizeof(pgdval_t) > sizeof(long))
- PVOP_VCALL3(pv_mmu_ops.set_pgd, pgdp,
- val, (u64)val >> 32);
- else
- PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp,
- val);
+ set_pud(pudp, native_make_pud(0));
}
-static inline void pgd_clear(pgd_t *pgdp)
+static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
{
- set_pgd(pgdp, __pgd(0));
-}
+ p4dval_t val = native_p4d_val(p4d);
-static inline void pud_clear(pud_t *pudp)
-{
- set_pud(pudp, __pud(0));
+ PVOP_VCALL2(mmu.set_p4d, p4dp, val);
}
-#endif /* PAGETABLE_LEVELS == 4 */
+static inline p4d_t __p4d(p4dval_t val)
+{
+ p4dval_t ret = PVOP_ALT_CALLEE1(p4dval_t, mmu.make_p4d, val,
+ "mov %%rdi, %%rax", ALT_NOT_XEN);
-#endif /* PAGETABLE_LEVELS >= 3 */
+ return (p4d_t) { ret };
+}
-#ifdef CONFIG_X86_PAE
-/* Special-case pte-setting operations for PAE, which can't update a
- 64-bit pte atomically */
-static inline void set_pte_atomic(pte_t *ptep, pte_t pte)
+static inline p4dval_t p4d_val(p4d_t p4d)
{
- PVOP_VCALL3(pv_mmu_ops.set_pte_atomic, ptep,
- pte.pte, pte.pte >> 32);
+ return PVOP_ALT_CALLEE1(p4dval_t, mmu.p4d_val, p4d.p4d,
+ "mov %%rdi, %%rax", ALT_NOT_XEN);
}
-static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep)
+static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd)
{
- PVOP_VCALL3(pv_mmu_ops.pte_clear, mm, addr, ptep);
+ PVOP_VCALL2(mmu.set_pgd, pgdp, native_pgd_val(pgd));
}
-static inline void pmd_clear(pmd_t *pmdp)
+#define set_pgd(pgdp, pgdval) do { \
+ if (pgtable_l5_enabled()) \
+ __set_pgd(pgdp, pgdval); \
+ else \
+ set_p4d((p4d_t *)(pgdp), (p4d_t) { (pgdval).pgd }); \
+} while (0)
+
+#define pgd_clear(pgdp) do { \
+ if (pgtable_l5_enabled()) \
+ set_pgd(pgdp, native_make_pgd(0)); \
+} while (0)
+
+static inline void p4d_clear(p4d_t *p4dp)
{
- PVOP_VCALL1(pv_mmu_ops.pmd_clear, pmdp);
+ set_p4d(p4dp, native_make_p4d(0));
}
-#else /* !CONFIG_X86_PAE */
+
static inline void set_pte_atomic(pte_t *ptep, pte_t pte)
{
set_pte(ptep, pte);
@@ -1384,123 +507,89 @@ static inline void set_pte_atomic(pte_t *ptep, pte_t pte)
static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
- set_pte_at(mm, addr, ptep, __pte(0));
+ set_pte(ptep, native_make_pte(0));
}
static inline void pmd_clear(pmd_t *pmdp)
{
- set_pmd(pmdp, __pmd(0));
+ set_pmd(pmdp, native_make_pmd(0));
}
-#endif /* CONFIG_X86_PAE */
-
-/* Lazy mode for batching updates / context switch */
-enum paravirt_lazy_mode {
- PARAVIRT_LAZY_NONE,
- PARAVIRT_LAZY_MMU,
- PARAVIRT_LAZY_CPU,
-};
-
-enum paravirt_lazy_mode paravirt_get_lazy_mode(void);
-void paravirt_start_context_switch(struct task_struct *prev);
-void paravirt_end_context_switch(struct task_struct *next);
-
-void paravirt_enter_lazy_mmu(void);
-void paravirt_leave_lazy_mmu(void);
#define __HAVE_ARCH_START_CONTEXT_SWITCH
static inline void arch_start_context_switch(struct task_struct *prev)
{
- PVOP_VCALL1(pv_cpu_ops.start_context_switch, prev);
+ PVOP_VCALL1(cpu.start_context_switch, prev);
}
static inline void arch_end_context_switch(struct task_struct *next)
{
- PVOP_VCALL1(pv_cpu_ops.end_context_switch, next);
+ PVOP_VCALL1(cpu.end_context_switch, next);
}
#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
static inline void arch_enter_lazy_mmu_mode(void)
{
- PVOP_VCALL0(pv_mmu_ops.lazy_mode.enter);
+ PVOP_VCALL0(mmu.lazy_mode.enter);
}
static inline void arch_leave_lazy_mmu_mode(void)
{
- PVOP_VCALL0(pv_mmu_ops.lazy_mode.leave);
+ PVOP_VCALL0(mmu.lazy_mode.leave);
}
-void arch_flush_lazy_mmu_mode(void);
+static inline void arch_flush_lazy_mmu_mode(void)
+{
+ PVOP_VCALL0(mmu.lazy_mode.flush);
+}
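For context, the lazy-MMU hooks above bracket a batch of page-table updates so a backend can queue them and flush once on leave; a hedged sketch of the calling pattern (helper name and loop are purely illustrative):

/* Illustrative only: set a run of PTEs under lazy MMU mode so a paravirt
 * backend may coalesce the individual updates into one hypercall. */
static void example_set_pte_range(pte_t *ptep, pte_t pte, unsigned int nr)
{
	unsigned int i;

	arch_enter_lazy_mmu_mode();
	for (i = 0; i < nr; i++)
		set_pte(ptep + i, pte);
	arch_leave_lazy_mmu_mode();
}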
static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
phys_addr_t phys, pgprot_t flags)
{
- pv_mmu_ops.set_fixmap(idx, phys, flags);
+ pv_ops.mmu.set_fixmap(idx, phys, flags);
}
-
-void _paravirt_nop(void);
-u32 _paravirt_ident_32(u32);
-u64 _paravirt_ident_64(u64);
-
-#define paravirt_nop ((void *)_paravirt_nop)
+#endif
#if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS)
-static inline int __raw_spin_is_locked(struct raw_spinlock *lock)
-{
- return PVOP_CALL1(int, pv_lock_ops.spin_is_locked, lock);
-}
-
-static inline int __raw_spin_is_contended(struct raw_spinlock *lock)
+static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock,
+ u32 val)
{
- return PVOP_CALL1(int, pv_lock_ops.spin_is_contended, lock);
+ PVOP_VCALL2(lock.queued_spin_lock_slowpath, lock, val);
}
-#define __raw_spin_is_contended __raw_spin_is_contended
-static __always_inline void __raw_spin_lock(struct raw_spinlock *lock)
+static __always_inline void pv_queued_spin_unlock(struct qspinlock *lock)
{
- PVOP_VCALL1(pv_lock_ops.spin_lock, lock);
+ PVOP_ALT_VCALLEE1(lock.queued_spin_unlock, lock,
+ "movb $0, (%%" _ASM_ARG1 ");",
+ ALT_NOT(X86_FEATURE_PVUNLOCK));
}
-static __always_inline void __raw_spin_lock_flags(struct raw_spinlock *lock,
- unsigned long flags)
+static __always_inline void pv_wait(u8 *ptr, u8 val)
{
- PVOP_VCALL2(pv_lock_ops.spin_lock_flags, lock, flags);
+ PVOP_VCALL2(lock.wait, ptr, val);
}
-static __always_inline int __raw_spin_trylock(struct raw_spinlock *lock)
+static __always_inline void pv_kick(int cpu)
{
- return PVOP_CALL1(int, pv_lock_ops.spin_trylock, lock);
+ PVOP_VCALL1(lock.kick, cpu);
}
-static __always_inline void __raw_spin_unlock(struct raw_spinlock *lock)
+static __always_inline bool pv_vcpu_is_preempted(long cpu)
{
- PVOP_VCALL1(pv_lock_ops.spin_unlock, lock);
+ return PVOP_ALT_CALLEE1(bool, lock.vcpu_is_preempted, cpu,
+ "xor %%" _ASM_AX ", %%" _ASM_AX ";",
+ ALT_NOT(X86_FEATURE_VCPUPREEMPT));
}
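To show how the wait/kick pair above is meant to be used, here is a deliberately simplified sketch; it is not the real qspinlock slow path, and all names are hypothetical:

/* Waiter side: ask the hypervisor to halt this vCPU while *ptr still holds
 * the locked value; pv_wait() may return spuriously, hence the loop. */
static void example_wait(u8 *ptr, u8 locked_val)
{
	while (READ_ONCE(*ptr) == locked_val)
		pv_wait(ptr, locked_val);
}

/* Releaser side: publish the new value, then kick the waiting vCPU. */
static void example_release(u8 *ptr, int waiter_cpu)
{
	smp_store_release(ptr, 0);
	pv_kick(waiter_cpu);
}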
-#endif
-
-/* These all sit in the .parainstructions section to tell us what to patch. */
-struct paravirt_patch_site {
- u8 *instr; /* original instructions */
- u8 instrtype; /* type of this instruction */
- u8 len; /* length of original instruction */
- u16 clobbers; /* what registers you may clobber */
-};
+void __raw_callee_save___native_queued_spin_unlock(struct qspinlock *lock);
+bool __raw_callee_save___native_vcpu_is_preempted(long cpu);
-extern struct paravirt_patch_site __parainstructions[],
- __parainstructions_end[];
+#endif /* SMP && PARAVIRT_SPINLOCKS */
#ifdef CONFIG_X86_32
-#define PV_SAVE_REGS "pushl %ecx; pushl %edx;"
-#define PV_RESTORE_REGS "popl %edx; popl %ecx;"
-
/* save and restore all caller-save registers, except return value */
#define PV_SAVE_ALL_CALLER_REGS "pushl %ecx;"
#define PV_RESTORE_ALL_CALLER_REGS "popl %ecx;"
-
-#define PV_FLAGS_ARG "0"
-#define PV_EXTRA_CLOBBERS
-#define PV_VEXTRA_CLOBBERS
#else
/* save and restore all caller-save registers, except return value */
#define PV_SAVE_ALL_CALLER_REGS \
@@ -1521,14 +610,6 @@ extern struct paravirt_patch_site __parainstructions[],
"pop %rsi;" \
"pop %rdx;" \
"pop %rcx;"
-
-/* We save some registers, but all of them, that's too much. We clobber all
- * caller saved registers but the argument parameter */
-#define PV_SAVE_REGS "pushq %%rdi;"
-#define PV_RESTORE_REGS "popq %%rdi;"
-#define PV_EXTRA_CLOBBERS EXTRA_CLOBBERS, "rcx" , "rdx", "rsi"
-#define PV_VEXTRA_CLOBBERS EXTRA_CLOBBERS, "rdi", "rcx" , "rdx", "rsi"
-#define PV_FLAGS_ARG "D"
#endif
/*
@@ -1543,18 +624,28 @@ extern struct paravirt_patch_site __parainstructions[],
* call. The return value in rax/eax will not be saved, even for void
* functions.
*/
-#define PV_CALLEE_SAVE_REGS_THUNK(func) \
+#define PV_THUNK_NAME(func) "__raw_callee_save_" #func
+#define __PV_CALLEE_SAVE_REGS_THUNK(func, section) \
extern typeof(func) __raw_callee_save_##func; \
- static void *__##func##__ __used = func; \
\
- asm(".pushsection .text;" \
- "__raw_callee_save_" #func ": " \
+ asm(".pushsection " section ", \"ax\";" \
+ ".globl " PV_THUNK_NAME(func) ";" \
+ ".type " PV_THUNK_NAME(func) ", @function;" \
+ ASM_FUNC_ALIGN \
+ PV_THUNK_NAME(func) ":" \
+ ASM_ENDBR \
+ FRAME_BEGIN \
PV_SAVE_ALL_CALLER_REGS \
"call " #func ";" \
PV_RESTORE_ALL_CALLER_REGS \
- "ret;" \
+ FRAME_END \
+ ASM_RET \
+ ".size " PV_THUNK_NAME(func) ", .-" PV_THUNK_NAME(func) ";" \
".popsection")
+#define PV_CALLEE_SAVE_REGS_THUNK(func) \
+ __PV_CALLEE_SAVE_REGS_THUNK(func, ".text")
+
/* Get a reference to a callee-save function */
#define PV_CALLEE_SAVE(func) \
((struct paravirt_callee_save) { __raw_callee_save_##func })
@@ -1563,54 +654,32 @@ extern struct paravirt_patch_site __parainstructions[],
#define __PV_IS_CALLEE_SAVE(func) \
((struct paravirt_callee_save) { func })
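A hedged sketch of how the thunk and wrapper macros above combine when filling a struct paravirt_callee_save slot; example_save_fl is a hypothetical stand-in for something like native_save_fl:

/* Illustrative only: a C implementation of the op with the normal ABI. */
unsigned long example_save_fl(void)
{
	unsigned long flags;

	asm volatile("pushf; pop %0" : "=rm" (flags) : : "memory");
	return flags;
}
/* Emits __raw_callee_save_example_save_fl, which saves and restores the
 * extra caller-save registers around the call. */
PV_CALLEE_SAVE_REGS_THUNK(example_save_fl);

/* A backend would then install it along the lines of (setup code assumed):
 *	pv_ops.irq.save_fl = PV_CALLEE_SAVE(example_save_fl);
 */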
-static inline unsigned long __raw_local_save_flags(void)
+#ifdef CONFIG_PARAVIRT_XXL
+static __always_inline unsigned long arch_local_save_flags(void)
{
- unsigned long f;
-
- asm volatile(paravirt_alt(PARAVIRT_CALL)
- : "=a"(f)
- : paravirt_type(pv_irq_ops.save_fl),
- paravirt_clobber(CLBR_EAX)
- : "memory", "cc");
- return f;
-}
-
-static inline void raw_local_irq_restore(unsigned long f)
-{
- asm volatile(paravirt_alt(PARAVIRT_CALL)
- : "=a"(f)
- : PV_FLAGS_ARG(f),
- paravirt_type(pv_irq_ops.restore_fl),
- paravirt_clobber(CLBR_EAX)
- : "memory", "cc");
+ return PVOP_ALT_CALLEE0(unsigned long, irq.save_fl, "pushf; pop %%rax;",
+ ALT_NOT_XEN);
}
-static inline void raw_local_irq_disable(void)
+static __always_inline void arch_local_irq_disable(void)
{
- asm volatile(paravirt_alt(PARAVIRT_CALL)
- :
- : paravirt_type(pv_irq_ops.irq_disable),
- paravirt_clobber(CLBR_EAX)
- : "memory", "eax", "cc");
+ PVOP_ALT_VCALLEE0(irq.irq_disable, "cli;", ALT_NOT_XEN);
}
-static inline void raw_local_irq_enable(void)
+static __always_inline void arch_local_irq_enable(void)
{
- asm volatile(paravirt_alt(PARAVIRT_CALL)
- :
- : paravirt_type(pv_irq_ops.irq_enable),
- paravirt_clobber(CLBR_EAX)
- : "memory", "eax", "cc");
+ PVOP_ALT_VCALLEE0(irq.irq_enable, "sti;", ALT_NOT_XEN);
}
-static inline unsigned long __raw_local_irq_save(void)
+static __always_inline unsigned long arch_local_irq_save(void)
{
unsigned long f;
- f = __raw_local_save_flags();
- raw_local_irq_disable();
+ f = arch_local_save_flags();
+ arch_local_irq_disable();
return f;
}
+#endif
/* Make sure as little as possible of this mess escapes. */
@@ -1628,145 +697,57 @@ static inline unsigned long __raw_local_irq_save(void)
#undef PVOP_VCALL4
#undef PVOP_CALL4
-#else /* __ASSEMBLY__ */
-
-#define _PVSITE(ptype, clobbers, ops, word, algn) \
-771:; \
- ops; \
-772:; \
- .pushsection .parainstructions,"a"; \
- .align algn; \
- word 771b; \
- .byte ptype; \
- .byte 772b-771b; \
- .short clobbers; \
- .popsection
+extern void default_banner(void);
+void native_pv_lock_init(void) __init;
-
-#define COND_PUSH(set, mask, reg) \
- .if ((~(set)) & mask); push %reg; .endif
-#define COND_POP(set, mask, reg) \
- .if ((~(set)) & mask); pop %reg; .endif
+#else /* __ASSEMBLER__ */
#ifdef CONFIG_X86_64
+#ifdef CONFIG_PARAVIRT_XXL
+#ifdef CONFIG_DEBUG_ENTRY
-#define PV_SAVE_REGS(set) \
- COND_PUSH(set, CLBR_RAX, rax); \
- COND_PUSH(set, CLBR_RCX, rcx); \
- COND_PUSH(set, CLBR_RDX, rdx); \
- COND_PUSH(set, CLBR_RSI, rsi); \
- COND_PUSH(set, CLBR_RDI, rdi); \
- COND_PUSH(set, CLBR_R8, r8); \
- COND_PUSH(set, CLBR_R9, r9); \
- COND_PUSH(set, CLBR_R10, r10); \
- COND_PUSH(set, CLBR_R11, r11)
-#define PV_RESTORE_REGS(set) \
- COND_POP(set, CLBR_R11, r11); \
- COND_POP(set, CLBR_R10, r10); \
- COND_POP(set, CLBR_R9, r9); \
- COND_POP(set, CLBR_R8, r8); \
- COND_POP(set, CLBR_RDI, rdi); \
- COND_POP(set, CLBR_RSI, rsi); \
- COND_POP(set, CLBR_RDX, rdx); \
- COND_POP(set, CLBR_RCX, rcx); \
- COND_POP(set, CLBR_RAX, rax)
-
-#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 8)
-#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .quad, 8)
#define PARA_INDIRECT(addr) *addr(%rip)
-#else
-#define PV_SAVE_REGS(set) \
- COND_PUSH(set, CLBR_EAX, eax); \
- COND_PUSH(set, CLBR_EDI, edi); \
- COND_PUSH(set, CLBR_ECX, ecx); \
- COND_PUSH(set, CLBR_EDX, edx)
-#define PV_RESTORE_REGS(set) \
- COND_POP(set, CLBR_EDX, edx); \
- COND_POP(set, CLBR_ECX, ecx); \
- COND_POP(set, CLBR_EDI, edi); \
- COND_POP(set, CLBR_EAX, eax)
-
-#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 4)
-#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .long, 4)
-#define PARA_INDIRECT(addr) *%cs:addr
-#endif
-
-#define INTERRUPT_RETURN \
- PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_iret), CLBR_NONE, \
- jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_iret))
-#define DISABLE_INTERRUPTS(clobbers) \
- PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \
- PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \
- call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_disable); \
- PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
+.macro PARA_IRQ_save_fl
+ ANNOTATE_RETPOLINE_SAFE;
+ call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl);
+.endm
-#define ENABLE_INTERRUPTS(clobbers) \
- PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers, \
- PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \
- call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable); \
- PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
-
-#define USERGS_SYSRET32 \
- PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret32), \
- CLBR_NONE, \
- jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret32))
-
-#ifdef CONFIG_X86_32
-#define GET_CR0_INTO_EAX \
- push %ecx; push %edx; \
- call PARA_INDIRECT(pv_cpu_ops+PV_CPU_read_cr0); \
- pop %edx; pop %ecx
+#define SAVE_FLAGS ALTERNATIVE_2 "PARA_IRQ_save_fl;", \
+ "ALT_CALL_INSTR;", ALT_CALL_ALWAYS, \
+ "pushf; pop %rax;", ALT_NOT_XEN
+#endif
+#endif /* CONFIG_PARAVIRT_XXL */
+#endif /* CONFIG_X86_64 */
-#define ENABLE_INTERRUPTS_SYSEXIT \
- PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), \
- CLBR_NONE, \
- jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit))
+#endif /* __ASSEMBLER__ */
+#else /* CONFIG_PARAVIRT */
+# define default_banner x86_init_noop
+#ifndef __ASSEMBLER__
+static inline void native_pv_lock_init(void)
+{
+}
+#endif
+#endif /* !CONFIG_PARAVIRT */
-#else /* !CONFIG_X86_32 */
+#ifndef __ASSEMBLER__
+#ifndef CONFIG_PARAVIRT_XXL
+static inline void paravirt_enter_mmap(struct mm_struct *mm)
+{
+}
+#endif
-/*
- * If swapgs is used while the userspace stack is still current,
- * there's no way to call a pvop. The PV replacement *must* be
- * inlined, or the swapgs instruction must be trapped and emulated.
- */
-#define SWAPGS_UNSAFE_STACK \
- PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \
- swapgs)
+#ifndef CONFIG_PARAVIRT
+static inline void paravirt_arch_exit_mmap(struct mm_struct *mm)
+{
+}
+#endif
-/*
- * Note: swapgs is very special, and in practise is either going to be
- * implemented with a single "swapgs" instruction or something very
- * special. Either way, we don't need to save any registers for
- * it.
- */
-#define SWAPGS \
- PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \
- call PARA_INDIRECT(pv_cpu_ops+PV_CPU_swapgs) \
- )
-
-#define GET_CR2_INTO_RCX \
- call PARA_INDIRECT(pv_mmu_ops+PV_MMU_read_cr2); \
- movq %rax, %rcx; \
- xorq %rax, %rax;
-
-#define PARAVIRT_ADJUST_EXCEPTION_FRAME \
- PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_adjust_exception_frame), \
- CLBR_NONE, \
- call PARA_INDIRECT(pv_irq_ops+PV_IRQ_adjust_exception_frame))
-
-#define USERGS_SYSRET64 \
- PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \
- CLBR_NONE, \
- jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64))
-
-#define ENABLE_INTERRUPTS_SYSEXIT32 \
- PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), \
- CLBR_NONE, \
- jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit))
-#endif /* CONFIG_X86_32 */
-
-#endif /* __ASSEMBLY__ */
-#endif /* CONFIG_PARAVIRT */
+#ifndef CONFIG_PARAVIRT_SPINLOCKS
+static inline void paravirt_set_cap(void)
+{
+}
+#endif
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_X86_PARAVIRT_H */
diff --git a/arch/x86/include/asm/paravirt_api_clock.h b/arch/x86/include/asm/paravirt_api_clock.h
new file mode 100644
index 000000000000..65ac7cee0dad
--- /dev/null
+++ b/arch/x86/include/asm/paravirt_api_clock.h
@@ -0,0 +1 @@
+#include <asm/paravirt.h>
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
new file mode 100644
index 000000000000..3502939415ad
--- /dev/null
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -0,0 +1,530 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PARAVIRT_TYPES_H
+#define _ASM_X86_PARAVIRT_TYPES_H
+
+#ifdef CONFIG_PARAVIRT
+
+#ifndef __ASSEMBLER__
+#include <linux/types.h>
+
+#include <asm/desc_defs.h>
+#include <asm/pgtable_types.h>
+#include <asm/nospec-branch.h>
+
+struct page;
+struct thread_struct;
+struct desc_ptr;
+struct tss_struct;
+struct mm_struct;
+struct desc_struct;
+struct task_struct;
+struct cpumask;
+struct flush_tlb_info;
+struct mmu_gather;
+struct vm_area_struct;
+
+/*
+ * Wrapper type for pointers to code which uses the non-standard
+ * calling convention. See PV_CALLEE_SAVE_REGS_THUNK in <asm/paravirt.h>.
+ */
+struct paravirt_callee_save {
+ void *func;
+};
+
+/* general info */
+struct pv_info {
+#ifdef CONFIG_PARAVIRT_XXL
+ u16 extra_user_64bit_cs; /* __USER_CS if none */
+#endif
+
+ const char *name;
+};
+
+#ifdef CONFIG_PARAVIRT_XXL
+struct pv_lazy_ops {
+ /* Set deferred update mode, used for batching operations. */
+ void (*enter)(void);
+ void (*leave)(void);
+ void (*flush)(void);
+} __no_randomize_layout;
+#endif
+
+struct pv_cpu_ops {
+ /* hooks for various privileged instructions */
+ void (*io_delay)(void);
+
+#ifdef CONFIG_PARAVIRT_XXL
+ unsigned long (*get_debugreg)(int regno);
+ void (*set_debugreg)(int regno, unsigned long value);
+
+ unsigned long (*read_cr0)(void);
+ void (*write_cr0)(unsigned long);
+
+ void (*write_cr4)(unsigned long);
+
+ /* Segment descriptor handling */
+ void (*load_tr_desc)(void);
+ void (*load_gdt)(const struct desc_ptr *);
+ void (*load_idt)(const struct desc_ptr *);
+ void (*set_ldt)(const void *desc, unsigned entries);
+ unsigned long (*store_tr)(void);
+ void (*load_tls)(struct thread_struct *t, unsigned int cpu);
+ void (*load_gs_index)(unsigned int idx);
+ void (*write_ldt_entry)(struct desc_struct *ldt, int entrynum,
+ const void *desc);
+ void (*write_gdt_entry)(struct desc_struct *,
+ int entrynum, const void *desc, int size);
+ void (*write_idt_entry)(gate_desc *,
+ int entrynum, const gate_desc *gate);
+ void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries);
+ void (*free_ldt)(struct desc_struct *ldt, unsigned entries);
+
+ void (*load_sp0)(unsigned long sp0);
+
+#ifdef CONFIG_X86_IOPL_IOPERM
+ void (*invalidate_io_bitmap)(void);
+ void (*update_io_bitmap)(void);
+#endif
+
+ /* cpuid emulation, mostly so that caps bits can be disabled */
+ void (*cpuid)(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx);
+
+ /* Unsafe MSR operations. These will warn or panic on failure. */
+ u64 (*read_msr)(u32 msr);
+ void (*write_msr)(u32 msr, u64 val);
+
+ /*
+ * Safe MSR operations.
+ * Returns 0 or -EIO.
+ */
+ int (*read_msr_safe)(u32 msr, u64 *val);
+ int (*write_msr_safe)(u32 msr, u64 val);
+
+ u64 (*read_pmc)(int counter);
+
+ void (*start_context_switch)(struct task_struct *prev);
+ void (*end_context_switch)(struct task_struct *next);
+#endif
+} __no_randomize_layout;
+
+struct pv_irq_ops {
+#ifdef CONFIG_PARAVIRT_XXL
+ /*
+ * Get/set interrupt state. save_fl is expected to use X86_EFLAGS_IF;
+ * all other bits returned from save_fl are undefined.
+ *
+	 * NOTE: Callers of these functions expect the callee to preserve
+ * more registers than the standard C calling convention.
+ */
+ struct paravirt_callee_save save_fl;
+ struct paravirt_callee_save irq_disable;
+ struct paravirt_callee_save irq_enable;
+#endif
+ void (*safe_halt)(void);
+ void (*halt)(void);
+} __no_randomize_layout;
+
+struct pv_mmu_ops {
+ /* TLB operations */
+ void (*flush_tlb_user)(void);
+ void (*flush_tlb_kernel)(void);
+ void (*flush_tlb_one_user)(unsigned long addr);
+ void (*flush_tlb_multi)(const struct cpumask *cpus,
+ const struct flush_tlb_info *info);
+
+ /* Hook for intercepting the destruction of an mm_struct. */
+ void (*exit_mmap)(struct mm_struct *mm);
+ void (*notify_page_enc_status_changed)(unsigned long pfn, int npages, bool enc);
+
+#ifdef CONFIG_PARAVIRT_XXL
+ struct paravirt_callee_save read_cr2;
+ void (*write_cr2)(unsigned long);
+
+ unsigned long (*read_cr3)(void);
+ void (*write_cr3)(unsigned long);
+
+ /* Hook for intercepting the creation/use of an mm_struct. */
+ void (*enter_mmap)(struct mm_struct *mm);
+
+ /* Hooks for allocating and freeing a pagetable top-level */
+ int (*pgd_alloc)(struct mm_struct *mm);
+ void (*pgd_free)(struct mm_struct *mm, pgd_t *pgd);
+
+ /*
+ * Hooks for allocating/releasing pagetable pages when they're
+ * attached to a pagetable
+ */
+ void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn);
+ void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn);
+ void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn);
+ void (*alloc_p4d)(struct mm_struct *mm, unsigned long pfn);
+ void (*release_pte)(unsigned long pfn);
+ void (*release_pmd)(unsigned long pfn);
+ void (*release_pud)(unsigned long pfn);
+ void (*release_p4d)(unsigned long pfn);
+
+ /* Pagetable manipulation functions */
+ void (*set_pte)(pte_t *ptep, pte_t pteval);
+ void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval);
+
+ pte_t (*ptep_modify_prot_start)(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep);
+ void (*ptep_modify_prot_commit)(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, pte_t pte);
+
+ struct paravirt_callee_save pte_val;
+ struct paravirt_callee_save make_pte;
+
+ struct paravirt_callee_save pgd_val;
+ struct paravirt_callee_save make_pgd;
+
+ void (*set_pud)(pud_t *pudp, pud_t pudval);
+
+ struct paravirt_callee_save pmd_val;
+ struct paravirt_callee_save make_pmd;
+
+ struct paravirt_callee_save pud_val;
+ struct paravirt_callee_save make_pud;
+
+ void (*set_p4d)(p4d_t *p4dp, p4d_t p4dval);
+
+ struct paravirt_callee_save p4d_val;
+ struct paravirt_callee_save make_p4d;
+
+ void (*set_pgd)(pgd_t *pgdp, pgd_t pgdval);
+
+ struct pv_lazy_ops lazy_mode;
+
+ /* dom0 ops */
+
+	/* Sometimes the physical address is a pfn, and sometimes it's
+ an mfn. We can tell which is which from the index. */
+ void (*set_fixmap)(unsigned /* enum fixed_addresses */ idx,
+ phys_addr_t phys, pgprot_t flags);
+#endif
+} __no_randomize_layout;
+
+struct arch_spinlock;
+#ifdef CONFIG_SMP
+#include <asm/spinlock_types.h>
+#endif
+
+struct qspinlock;
+
+struct pv_lock_ops {
+ void (*queued_spin_lock_slowpath)(struct qspinlock *lock, u32 val);
+ struct paravirt_callee_save queued_spin_unlock;
+
+ void (*wait)(u8 *ptr, u8 val);
+ void (*kick)(int cpu);
+
+ struct paravirt_callee_save vcpu_is_preempted;
+} __no_randomize_layout;
+
+/* This contains all the paravirt structures: we get a convenient
+ * number for each function using the offset which we use to indicate
+ * what to patch. */
+struct paravirt_patch_template {
+ struct pv_cpu_ops cpu;
+ struct pv_irq_ops irq;
+ struct pv_mmu_ops mmu;
+ struct pv_lock_ops lock;
+} __no_randomize_layout;
+
+extern struct pv_info pv_info;
+extern struct paravirt_patch_template pv_ops;
+
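Since pv_ops is a single writable instance shared by the whole kernel, a guest backend installs its implementations by assigning the individual slots during early boot; a minimal hedged sketch (all example_* names are hypothetical):

/* Illustrative only: override one cpu op and one mmu op for a guest type. */
static void example_io_delay(void)
{
	/* No port 0x80 delay is needed inside this hypothetical guest. */
}

static void example_flush_tlb_user(void)
{
	/* Would issue the hypervisor's local TLB-flush hypercall here. */
}

static void __init example_guest_init(void)
{
	pv_ops.cpu.io_delay       = example_io_delay;
	pv_ops.mmu.flush_tlb_user = example_flush_tlb_user;
}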
+#define paravirt_ptr(op) [paravirt_opptr] "m" (pv_ops.op)
+
+/*
+ * This generates an indirect call based on the operation type number.
+ *
+ * Since alternatives run after enabling CET/IBT -- the latter setting/clearing
+ * capabilities and the former requiring all capabilities being finalized --
+ * these indirect calls are subject to IBT and the paravirt stubs should have
+ * ENDBR on.
+ *
+ * OTOH since this is effectively a __nocfi indirect call, the paravirt stubs
+ * don't need to bother with CFI prefixes.
+ */
+#define PARAVIRT_CALL \
+ ANNOTATE_RETPOLINE_SAFE "\n\t" \
+ "call *%[paravirt_opptr];"
+
+/*
+ * These macros are intended to wrap calls through one of the paravirt
+ * ops structs, so that they can be later identified and patched at
+ * runtime.
+ *
+ * Normally, a call to a pv_op function is a simple indirect call:
+ * (pv_op_struct.operations)(args...).
+ *
+ * Unfortunately, this is a relatively slow operation for modern CPUs,
+ * because it cannot necessarily determine what the destination
+ * address is. In this case, the address is a runtime constant, so at
+ * the very least we can patch the call to a simple direct call, or,
+ * ideally, patch an inline implementation into the callsite. (Direct
+ * calls are essentially free, because the call and return addresses
+ * are completely predictable.)
+ *
+ * For i386, these macros rely on the standard gcc "regparm(3)" calling
+ * convention, in which the first three arguments are placed in %eax,
+ * %edx, %ecx (in that order), and the remaining arguments are placed
+ * on the stack. All caller-save registers (eax,edx,ecx) are expected
+ * to be modified (either clobbered or used for return values).
+ * X86_64, on the other hand, already specifies a register-based calling
+ * convention, returning in %rax, with parameters going in %rdi, %rsi,
+ * %rdx, and %rcx. Note that for this reason, x86_64 does not need any
+ * special handling for dealing with 4 arguments, unlike i386.
+ * However, x86_64 also has to clobber all caller saved registers, which
+ * unfortunately, are quite a bit (r8 - r11)
+ *
+ * Unfortunately there's no way to get gcc to generate the args setup
+ * for the call, and then allow the call itself to be generated by an
+ * inline asm. Because of this, we must do the complete arg setup and
+ * return value handling from within these macros. This is fairly
+ * cumbersome.
+ *
+ * There are 5 sets of PVOP_* macros for dealing with 0-4 arguments.
+ * It could be extended to more arguments, but there would be little
+ * to be gained from that. For each number of arguments, there are
+ * two VCALL and CALL variants for void and non-void functions.
+ *
+ * When there is a return value, the invoker of the macro must specify
+ * the return type. The macro then uses sizeof() on that type to
+ * determine whether it's a 32 or 64 bit value and places the return
+ * in the right register(s) (just %eax for 32-bit, and %edx:%eax for
+ * 64-bit). For x86_64 machines, it just returns in %rax regardless of
+ * the return value size.
+ *
+ * 64-bit arguments are passed as a pair of adjacent 32-bit arguments;
+ * on i386 they are passed in low,high order.
+ *
+ * Small structures are passed and returned in registers. The macro
+ * calling convention can't directly deal with this, so the wrapper
+ * functions must do it.
+ *
+ * These PVOP_* macros are only defined within this header. This
+ * means that all uses must be wrapped in inline functions. This also
+ * makes sure the incoming and outgoing types are always correct.
+ */
+#ifdef CONFIG_X86_32
+#define PVOP_CALL_ARGS \
+ unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx;
+
+#define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x))
+#define PVOP_CALL_ARG2(x) "d" ((unsigned long)(x))
+#define PVOP_CALL_ARG3(x) "c" ((unsigned long)(x))
+
+#define PVOP_VCALL_CLOBBERS "=a" (__eax), "=d" (__edx), \
+ "=c" (__ecx)
+#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS
+
+#define PVOP_VCALLEE_CLOBBERS "=a" (__eax), "=d" (__edx)
+#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS
+
+#define EXTRA_CLOBBERS
+#define VEXTRA_CLOBBERS
+#else /* CONFIG_X86_64 */
+/* [re]ax isn't an arg, but the return val */
+#define PVOP_CALL_ARGS \
+ unsigned long __edi = __edi, __esi = __esi, \
+ __edx = __edx, __ecx = __ecx, __eax = __eax;
+
+#define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x))
+#define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x))
+#define PVOP_CALL_ARG3(x) "d" ((unsigned long)(x))
+#define PVOP_CALL_ARG4(x) "c" ((unsigned long)(x))
+
+#define PVOP_VCALL_CLOBBERS "=D" (__edi), \
+ "=S" (__esi), "=d" (__edx), \
+ "=c" (__ecx)
+#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax)
+
+/*
+ * void functions are still allowed [re]ax for scratch.
+ *
+ * The ZERO_CALL_USED_REGS feature may end up zeroing out callee-saved
+ * registers. Make sure we model this with the appropriate clobbers.
+ */
+#ifdef CONFIG_ZERO_CALL_USED_REGS
+#define PVOP_VCALLEE_CLOBBERS "=a" (__eax), PVOP_VCALL_CLOBBERS
+#else
+#define PVOP_VCALLEE_CLOBBERS "=a" (__eax)
+#endif
+#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS
+
+#define EXTRA_CLOBBERS , "r8", "r9", "r10", "r11"
+#define VEXTRA_CLOBBERS , "rax", "r8", "r9", "r10", "r11"
+#endif /* CONFIG_X86_32 */
+
+#ifdef CONFIG_PARAVIRT_DEBUG
+#define PVOP_TEST_NULL(op) BUG_ON(pv_ops.op == NULL)
+#else
+#define PVOP_TEST_NULL(op) ((void)pv_ops.op)
+#endif
+
+#define PVOP_RETVAL(rettype) \
+ ({ unsigned long __mask = ~0UL; \
+ BUILD_BUG_ON(sizeof(rettype) > sizeof(unsigned long)); \
+ switch (sizeof(rettype)) { \
+ case 1: __mask = 0xffUL; break; \
+ case 2: __mask = 0xffffUL; break; \
+ case 4: __mask = 0xffffffffUL; break; \
+ default: break; \
+ } \
+ __mask & __eax; \
+ })
+
+/*
+ * Use alternative patching for paravirt calls:
+ * - For replacing an indirect call with a direct one, use the "normal"
+ * ALTERNATIVE() macro with the indirect call as the initial code sequence,
+ * which will be replaced with the related direct call by using the
+ * ALT_FLAG_DIRECT_CALL special case and the "always on" feature.
+ * - In case the replacement is either a direct call or a short code sequence
+ *   depending on a feature bit, the ALTERNATIVE_2() macro is used.
+ * The indirect call is the initial code sequence again, while the special
+ * code sequence is selected with the specified feature bit. In case the
+ * feature is not active, the direct call is used as above via the
+ * ALT_FLAG_DIRECT_CALL special case and the "always on" feature.
+ */
+#define ____PVOP_CALL(ret, op, call_clbr, extra_clbr, ...) \
+ ({ \
+ PVOP_CALL_ARGS; \
+ PVOP_TEST_NULL(op); \
+ asm volatile(ALTERNATIVE(PARAVIRT_CALL, ALT_CALL_INSTR, \
+ ALT_CALL_ALWAYS) \
+ : call_clbr, ASM_CALL_CONSTRAINT \
+ : paravirt_ptr(op), \
+ ##__VA_ARGS__ \
+ : "memory", "cc" extra_clbr); \
+ ret; \
+ })
+
+#define ____PVOP_ALT_CALL(ret, op, alt, cond, call_clbr, \
+ extra_clbr, ...) \
+ ({ \
+ PVOP_CALL_ARGS; \
+ PVOP_TEST_NULL(op); \
+ asm volatile(ALTERNATIVE_2(PARAVIRT_CALL, \
+ ALT_CALL_INSTR, ALT_CALL_ALWAYS, \
+ alt, cond) \
+ : call_clbr, ASM_CALL_CONSTRAINT \
+ : paravirt_ptr(op), \
+ ##__VA_ARGS__ \
+ : "memory", "cc" extra_clbr); \
+ ret; \
+ })
+
+#define __PVOP_CALL(rettype, op, ...) \
+ ____PVOP_CALL(PVOP_RETVAL(rettype), op, \
+ PVOP_CALL_CLOBBERS, EXTRA_CLOBBERS, ##__VA_ARGS__)
+
+#define __PVOP_ALT_CALL(rettype, op, alt, cond, ...) \
+ ____PVOP_ALT_CALL(PVOP_RETVAL(rettype), op, alt, cond, \
+ PVOP_CALL_CLOBBERS, EXTRA_CLOBBERS, \
+ ##__VA_ARGS__)
+
+#define __PVOP_CALLEESAVE(rettype, op, ...) \
+ ____PVOP_CALL(PVOP_RETVAL(rettype), op.func, \
+ PVOP_CALLEE_CLOBBERS, , ##__VA_ARGS__)
+
+#define __PVOP_ALT_CALLEESAVE(rettype, op, alt, cond, ...) \
+ ____PVOP_ALT_CALL(PVOP_RETVAL(rettype), op.func, alt, cond, \
+ PVOP_CALLEE_CLOBBERS, , ##__VA_ARGS__)
+
+
+#define __PVOP_VCALL(op, ...) \
+ (void)____PVOP_CALL(, op, PVOP_VCALL_CLOBBERS, \
+ VEXTRA_CLOBBERS, ##__VA_ARGS__)
+
+#define __PVOP_ALT_VCALL(op, alt, cond, ...) \
+ (void)____PVOP_ALT_CALL(, op, alt, cond, \
+ PVOP_VCALL_CLOBBERS, VEXTRA_CLOBBERS, \
+ ##__VA_ARGS__)
+
+#define __PVOP_VCALLEESAVE(op, ...) \
+ (void)____PVOP_CALL(, op.func, \
+ PVOP_VCALLEE_CLOBBERS, , ##__VA_ARGS__)
+
+#define __PVOP_ALT_VCALLEESAVE(op, alt, cond, ...) \
+ (void)____PVOP_ALT_CALL(, op.func, alt, cond, \
+ PVOP_VCALLEE_CLOBBERS, , ##__VA_ARGS__)
+
+
+#define PVOP_CALL0(rettype, op) \
+ __PVOP_CALL(rettype, op)
+#define PVOP_VCALL0(op) \
+ __PVOP_VCALL(op)
+#define PVOP_ALT_CALL0(rettype, op, alt, cond) \
+ __PVOP_ALT_CALL(rettype, op, alt, cond)
+#define PVOP_ALT_VCALL0(op, alt, cond) \
+ __PVOP_ALT_VCALL(op, alt, cond)
+
+#define PVOP_CALLEE0(rettype, op) \
+ __PVOP_CALLEESAVE(rettype, op)
+#define PVOP_VCALLEE0(op) \
+ __PVOP_VCALLEESAVE(op)
+#define PVOP_ALT_CALLEE0(rettype, op, alt, cond) \
+ __PVOP_ALT_CALLEESAVE(rettype, op, alt, cond)
+#define PVOP_ALT_VCALLEE0(op, alt, cond) \
+ __PVOP_ALT_VCALLEESAVE(op, alt, cond)
+
+
+#define PVOP_CALL1(rettype, op, arg1) \
+ __PVOP_CALL(rettype, op, PVOP_CALL_ARG1(arg1))
+#define PVOP_VCALL1(op, arg1) \
+ __PVOP_VCALL(op, PVOP_CALL_ARG1(arg1))
+#define PVOP_ALT_VCALL1(op, arg1, alt, cond) \
+ __PVOP_ALT_VCALL(op, alt, cond, PVOP_CALL_ARG1(arg1))
+
+#define PVOP_CALLEE1(rettype, op, arg1) \
+ __PVOP_CALLEESAVE(rettype, op, PVOP_CALL_ARG1(arg1))
+#define PVOP_VCALLEE1(op, arg1) \
+ __PVOP_VCALLEESAVE(op, PVOP_CALL_ARG1(arg1))
+#define PVOP_ALT_CALLEE1(rettype, op, arg1, alt, cond) \
+ __PVOP_ALT_CALLEESAVE(rettype, op, alt, cond, PVOP_CALL_ARG1(arg1))
+#define PVOP_ALT_VCALLEE1(op, arg1, alt, cond) \
+ __PVOP_ALT_VCALLEESAVE(op, alt, cond, PVOP_CALL_ARG1(arg1))
+
+
+#define PVOP_CALL2(rettype, op, arg1, arg2) \
+ __PVOP_CALL(rettype, op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2))
+#define PVOP_VCALL2(op, arg1, arg2) \
+ __PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2))
+
+#define PVOP_CALL3(rettype, op, arg1, arg2, arg3) \
+ __PVOP_CALL(rettype, op, PVOP_CALL_ARG1(arg1), \
+ PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3))
+#define PVOP_VCALL3(op, arg1, arg2, arg3) \
+ __PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), \
+ PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3))
+
+#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \
+ __PVOP_CALL(rettype, op, \
+ PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
+ PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4))
+#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \
+ __PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
+ PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4))
+
+unsigned long paravirt_ret0(void);
+#ifdef CONFIG_PARAVIRT_XXL
+u64 _paravirt_ident_64(u64);
+unsigned long pv_native_save_fl(void);
+void pv_native_irq_disable(void);
+void pv_native_irq_enable(void);
+unsigned long pv_native_read_cr2(void);
+#endif
+
+#define paravirt_nop ((void *)nop_func)
+
+#endif /* __ASSEMBLER__ */
+
+#define ALT_NOT_XEN ALT_NOT(X86_FEATURE_XENPV)
+
+#endif /* CONFIG_PARAVIRT */
+#endif /* _ASM_X86_PARAVIRT_TYPES_H */
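Editor's aside: the PVOP_* machinery above is only ever reached through small inline wrappers, as the long comment in paravirt_types.h explains. Below is a minimal sketch of two such wrappers; it assumes a 64-bit build, the example_ names are hypothetical, and it only illustrates the calling convention rather than reproducing code from this patch.

/*
 * Hedged sketch: a void op taking two arguments and a callee-save op
 * returning a value, wrapped so callers never expand PVOP_* directly.
 */
static inline void example_set_pud(pud_t *pudp, pud_t pud)
{
	/* args land in %rdi/%rsi; the call site is patchable via alternatives */
	PVOP_VCALL2(mmu.set_pud, pudp, native_pud_val(pud));
}

static inline pudval_t example_pud_val(pud_t pud)
{
	/*
	 * Callee-save variant: the stub is expected to preserve everything
	 * except the return register, so only minimal clobbers are modeled.
	 */
	return PVOP_CALLEE1(pudval_t, mmu.pud_val, pud.pud);
}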
diff --git a/arch/x86/include/asm/parport.h b/arch/x86/include/asm/parport.h
index 3c4ffeb467e9..163f78259a7e 100644
--- a/arch/x86/include/asm/parport.h
+++ b/arch/x86/include/asm/parport.h
@@ -1,8 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PARPORT_H
#define _ASM_X86_PARPORT_H
-static int __devinit parport_pc_find_isa_ports(int autoirq, int autodma);
-static int __devinit parport_pc_find_nonpci_ports(int autoirq, int autodma)
+static int parport_pc_find_isa_ports(int autoirq, int autodma);
+static int parport_pc_find_nonpci_ports(int autoirq, int autodma)
{
return parport_pc_find_isa_ports(autoirq, autodma);
}
diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h
deleted file mode 100644
index 7af14e512f97..000000000000
--- a/arch/x86/include/asm/pat.h
+++ /dev/null
@@ -1,22 +0,0 @@
-#ifndef _ASM_X86_PAT_H
-#define _ASM_X86_PAT_H
-
-#include <linux/types.h>
-#include <asm/pgtable_types.h>
-
-#ifdef CONFIG_X86_PAT
-extern int pat_enabled;
-#else
-static const int pat_enabled;
-#endif
-
-extern void pat_init(void);
-
-extern int reserve_memtype(u64 start, u64 end,
- unsigned long req_type, unsigned long *ret_type);
-extern int free_memtype(u64 start, u64 end);
-
-extern int kernel_map_sync_memtype(u64 base, unsigned long size,
- unsigned long flag);
-
-#endif /* _ASM_X86_PAT_H */
diff --git a/arch/x86/include/asm/pc-conf-reg.h b/arch/x86/include/asm/pc-conf-reg.h
new file mode 100644
index 000000000000..56bceceacf5f
--- /dev/null
+++ b/arch/x86/include/asm/pc-conf-reg.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Support for the configuration register space at port I/O locations
+ * 0x22 and 0x23 variously used by PC architectures, e.g. the MP Spec,
+ * Cyrix CPUs, numerous chipsets.
+ */
+#ifndef _ASM_X86_PC_CONF_REG_H
+#define _ASM_X86_PC_CONF_REG_H
+
+#include <linux/io.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+
+#define PC_CONF_INDEX 0x22
+#define PC_CONF_DATA 0x23
+
+#define PC_CONF_MPS_IMCR 0x70
+
+extern raw_spinlock_t pc_conf_lock;
+
+static inline u8 pc_conf_get(u8 reg)
+{
+ outb(reg, PC_CONF_INDEX);
+ return inb(PC_CONF_DATA);
+}
+
+static inline void pc_conf_set(u8 reg, u8 data)
+{
+ outb(reg, PC_CONF_INDEX);
+ outb(data, PC_CONF_DATA);
+}
+
+#endif /* _ASM_X86_PC_CONF_REG_H */
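Editor's aside: because the 0x22/0x23 pair described in the header comment is an index/data interface, a read-modify-write is inherently non-atomic and has to be done under pc_conf_lock. A hedged usage sketch follows; the example_ function name is hypothetical and not part of this patch.

static void example_imcr_set_bits(u8 bits)
{
	unsigned long flags;

	/* the index write and the data access must not be interleaved */
	raw_spin_lock_irqsave(&pc_conf_lock, flags);
	pc_conf_set(PC_CONF_MPS_IMCR,
		    pc_conf_get(PC_CONF_MPS_IMCR) | bits);
	raw_spin_unlock_irqrestore(&pc_conf_lock, flags);
}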
diff --git a/arch/x86/include/asm/pci-direct.h b/arch/x86/include/asm/pci-direct.h
index b1e7a45d868a..94597a3cf3d0 100644
--- a/arch/x86/include/asm/pci-direct.h
+++ b/arch/x86/include/asm/pci-direct.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PCI_DIRECT_H
#define _ASM_X86_PCI_DIRECT_H
@@ -14,8 +15,4 @@ extern void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val);
extern void write_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset, u16 val);
extern int early_pci_allowed(void);
-
-extern unsigned int pci_early_dump_regs;
-extern void early_dump_pci_device(u8 bus, u8 slot, u8 func);
-extern void early_dump_pci_devices(void);
#endif /* _ASM_X86_PCI_DIRECT_H */
diff --git a/arch/x86/include/asm/pci-functions.h b/arch/x86/include/asm/pci-functions.h
index ed0bab427354..1bbc10812ae1 100644
--- a/arch/x86/include/asm/pci-functions.h
+++ b/arch/x86/include/asm/pci-functions.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* PCI BIOS function numbering for conventional PCI BIOS
* systems
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 1ff685ca221c..b3ab80a03365 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PCI_H
#define _ASM_X86_PCI_H
@@ -5,50 +6,78 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/string.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
+#include <linux/numa.h>
#include <asm/io.h>
-
-#ifdef __KERNEL__
+#include <asm/memtype.h>
struct pci_sysdata {
int domain; /* PCI domain */
int node; /* NUMA node */
+#ifdef CONFIG_ACPI
+ struct acpi_device *companion; /* ACPI companion device */
+#endif
#ifdef CONFIG_X86_64
void *iommu; /* IOMMU private data */
#endif
+#ifdef CONFIG_PCI_MSI
+ void *fwnode; /* IRQ domain for MSI assignment */
+#endif
+#if IS_ENABLED(CONFIG_VMD)
+ struct pci_dev *vmd_dev; /* VMD Device if in Intel VMD domain */
+#endif
};
extern int pci_routeirq;
extern int noioapicquirk;
extern int noioapicreroute;
-/* scan a bus after allocating a pci_sysdata for it */
-extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops,
- int node);
-extern struct pci_bus *pci_scan_bus_with_sysdata(int busno);
+static inline struct pci_sysdata *to_pci_sysdata(const struct pci_bus *bus)
+{
+ return bus->sysdata;
+}
+
+#ifdef CONFIG_PCI
+#ifdef CONFIG_PCI_DOMAINS
static inline int pci_domain_nr(struct pci_bus *bus)
{
- struct pci_sysdata *sd = bus->sysdata;
- return sd->domain;
+ return to_pci_sysdata(bus)->domain;
}
static inline int pci_proc_domain(struct pci_bus *bus)
{
return pci_domain_nr(bus);
}
+#endif
+#ifdef CONFIG_PCI_MSI
+static inline void *_pci_root_bus_fwnode(struct pci_bus *bus)
+{
+ return to_pci_sysdata(bus)->fwnode;
+}
+
+#define pci_root_bus_fwnode _pci_root_bus_fwnode
+#endif
+
+#if IS_ENABLED(CONFIG_VMD)
+static inline bool is_vmd(struct pci_bus *bus)
+{
+ return to_pci_sysdata(bus)->vmd_dev != NULL;
+}
+#else
+#define is_vmd(bus) false
+#endif /* CONFIG_VMD */
/* Can be used to override the logic in pci_scan_bus for skipping
already-configured bus numbers - to be used for buggy BIOSes
or architectures with incomplete PCI setup by the loader */
-#ifdef CONFIG_PCI
extern unsigned int pcibios_assign_all_busses(void);
+extern int pci_legacy_init(void);
#else
-#define pcibios_assign_all_busses() 0
+static inline int pcibios_assign_all_busses(void) { return 0; }
#endif
-#define pcibios_scan_all_fns(a, b) 0
extern unsigned long pci_mem_start;
#define PCIBIOS_MIN_IO 0x1000
@@ -56,95 +85,41 @@ extern unsigned long pci_mem_start;
#define PCIBIOS_MIN_CARDBUS_IO 0x4000
-void pcibios_config_init(void);
-struct pci_bus *pcibios_scan_root(int bus);
+extern int pcibios_enabled;
+void pcibios_scan_root(int bus);
-void pcibios_set_master(struct pci_dev *dev);
-void pcibios_penalize_isa_irq(int irq, int active);
struct irq_routing_table *pcibios_get_irq_routing_table(void);
int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
+bool pci_dev_has_default_msi_parent_domain(struct pci_dev *dev);
#define HAVE_PCI_MMAP
-extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
- enum pci_mmap_state mmap_state,
- int write_combine);
-
+#define arch_can_pci_mmap_wc() pat_enabled()
+#define ARCH_GENERIC_PCI_MMAP_RESOURCE
#ifdef CONFIG_PCI
extern void early_quirks(void);
-static inline void pci_dma_burst_advice(struct pci_dev *pdev,
- enum pci_dma_burst_strategy *strat,
- unsigned long *strategy_parameter)
-{
- *strat = PCI_DMA_BURST_INFINITY;
- *strategy_parameter = ~0UL;
-}
#else
static inline void early_quirks(void) { }
#endif
extern void pci_iommu_alloc(void);
-/* MSI arch hook */
-#define arch_setup_msi_irqs arch_setup_msi_irqs
-
-#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
-
-#if defined(CONFIG_X86_64) || defined(CONFIG_DMAR) || defined(CONFIG_DMA_API_DEBUG)
-
-#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
- dma_addr_t ADDR_NAME;
-#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \
- __u32 LEN_NAME;
-#define pci_unmap_addr(PTR, ADDR_NAME) \
- ((PTR)->ADDR_NAME)
-#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
- (((PTR)->ADDR_NAME) = (VAL))
-#define pci_unmap_len(PTR, LEN_NAME) \
- ((PTR)->LEN_NAME)
-#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
- (((PTR)->LEN_NAME) = (VAL))
-
-#else
-
-#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME[0];
-#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) unsigned LEN_NAME[0];
-#define pci_unmap_addr(PTR, ADDR_NAME) sizeof((PTR)->ADDR_NAME)
-#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
- do { break; } while (pci_unmap_addr(PTR, ADDR_NAME))
-#define pci_unmap_len(PTR, LEN_NAME) sizeof((PTR)->LEN_NAME)
-#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
- do { break; } while (pci_unmap_len(PTR, LEN_NAME))
-
-#endif
-
-#endif /* __KERNEL__ */
-
-#ifdef CONFIG_X86_64
-#include "pci_64.h"
-#endif
-
-/* implement the pci_ DMA API in terms of the generic device dma_ one */
-#include <asm-generic/pci-dma-compat.h>
-
-/* generic pci stuff */
-#include <asm-generic/pci.h>
-#define PCIBIOS_MAX_MEM_32 0xffffffff
-
#ifdef CONFIG_NUMA
/* Returns the node based on pci bus */
static inline int __pcibus_to_node(const struct pci_bus *bus)
{
- const struct pci_sysdata *sd = bus->sysdata;
-
- return sd->node;
+ return to_pci_sysdata(bus)->node;
}
static inline const struct cpumask *
cpumask_of_pcibus(const struct pci_bus *bus)
{
- return cpumask_of_node(__pcibus_to_node(bus));
+ int node;
+
+ node = __pcibus_to_node(bus);
+ return (node == NUMA_NO_NODE) ? cpu_online_mask :
+ cpumask_of_node(node);
}
#endif
diff --git a/arch/x86/include/asm/pci_64.h b/arch/x86/include/asm/pci_64.h
deleted file mode 100644
index ae5e40f67daf..000000000000
--- a/arch/x86/include/asm/pci_64.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef _ASM_X86_PCI_64_H
-#define _ASM_X86_PCI_64_H
-
-#ifdef __KERNEL__
-
-#ifdef CONFIG_CALGARY_IOMMU
-static inline void *pci_iommu(struct pci_bus *bus)
-{
- struct pci_sysdata *sd = bus->sysdata;
- return sd->iommu;
-}
-
-static inline void set_pci_iommu(struct pci_bus *bus, void *val)
-{
- struct pci_sysdata *sd = bus->sysdata;
- sd->iommu = val;
-}
-#endif /* CONFIG_CALGARY_IOMMU */
-
-extern int (*pci_config_read)(int seg, int bus, int dev, int fn,
- int reg, int len, u32 *value);
-extern int (*pci_config_write)(int seg, int bus, int dev, int fn,
- int reg, int len, u32 value);
-
-extern void dma32_reserve_bootmem(void);
-
-#endif /* __KERNEL__ */
-
-#endif /* _ASM_X86_PCI_64_H */
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index b399988eee3a..70533fdcbf02 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -1,15 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Low-Level PCI Access for i386 machines.
*
* (c) 1999 Martin Mares <mj@ucw.cz>
*/
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/ioport.h>
+#include <linux/spinlock.h>
+
#undef DEBUG
#ifdef DEBUG
-#define DBG(x...) printk(x)
+#define DBG(fmt, ...) printk(fmt, ##__VA_ARGS__)
#else
-#define DBG(x...)
+#define DBG(fmt, ...) \
+do { \
+ if (0) \
+ printk(fmt, ##__VA_ARGS__); \
+} while (0)
#endif
#define PCI_PROBE_BIOS 0x0001
@@ -29,6 +39,11 @@
#define PCI_CHECK_ENABLE_AMD_MMCONF 0x20000
#define PCI_HAS_IO_ECS 0x40000
#define PCI_NOASSIGN_ROMS 0x80000
+#define PCI_ROOT_NO_CRS 0x100000
+#define PCI_NOASSIGN_BARS 0x200000
+#define PCI_BIG_ROOT_WINDOW 0x400000
+#define PCI_USE_E820 0x800000
+#define PCI_NO_E820 0x1000000
extern unsigned int pci_probe;
extern unsigned long pirq_table_addr;
@@ -42,18 +57,20 @@ enum pci_bf_sort_state {
/* pci-i386.c */
-extern unsigned int pcibios_max_latency;
-
void pcibios_resource_survey(void);
+void pcibios_set_cache_line_size(void);
/* pci-pc.c */
extern int pcibios_last_bus;
-extern struct pci_bus *pci_root_bus;
extern struct pci_ops pci_root_ops;
+void pcibios_scan_specific_bus(int busn);
+
/* pci-irq.c */
+struct pci_dev;
+
struct irq_info {
u8 bus, devfn; /* Bus, device and function */
struct {
@@ -77,17 +94,27 @@ struct irq_routing_table {
u32 miniport_data; /* Crap */
u8 rfu[11];
u8 checksum; /* Modulo 256 checksum must give 0 */
- struct irq_info slots[0];
+ struct irq_info slots[];
+} __attribute__((packed));
+
+struct irt_routing_table {
+ u32 signature; /* IRT_SIGNATURE should be here */
+ u8 size; /* Number of entries provided */
+ u8 used; /* Number of entries actually used */
+ u16 exclusive_irqs; /* IRQs devoted exclusively to
+ PCI usage */
+ struct irq_info slots[];
} __attribute__((packed));
extern unsigned int pcibios_irq_mask;
-extern int pcibios_scanned;
-extern spinlock_t pci_config_lock;
+extern raw_spinlock_t pci_config_lock;
extern int (*pcibios_enable_irq)(struct pci_dev *dev);
extern void (*pcibios_disable_irq)(struct pci_dev *dev);
+extern bool mp_should_keep_irq(struct device *dev);
+
struct pci_raw_ops {
int (*read)(unsigned int domain, unsigned int bus, unsigned int devfn,
int reg, int len, u32 *val);
@@ -95,41 +122,82 @@ struct pci_raw_ops {
int reg, int len, u32 val);
};
-extern struct pci_raw_ops *raw_pci_ops;
-extern struct pci_raw_ops *raw_pci_ext_ops;
+extern const struct pci_raw_ops *raw_pci_ops;
+extern const struct pci_raw_ops *raw_pci_ext_ops;
-extern struct pci_raw_ops pci_direct_conf1;
+extern const struct pci_raw_ops pci_mmcfg;
+extern const struct pci_raw_ops pci_direct_conf1;
extern bool port_cf9_safe;
/* arch_initcall level */
+#ifdef CONFIG_PCI_DIRECT
extern int pci_direct_probe(void);
extern void pci_direct_init(int type);
+#else
+static inline int pci_direct_probe(void) { return -1; }
+static inline void pci_direct_init(int type) { }
+#endif
+
+#ifdef CONFIG_PCI_BIOS
extern void pci_pcbios_init(void);
-extern int pci_olpc_init(void);
+#else
+static inline void pci_pcbios_init(void) { }
+#endif
+
extern void __init dmi_check_pciprobe(void);
extern void __init dmi_check_skip_isa_align(void);
/* some common used subsys_initcalls */
+#ifdef CONFIG_PCI
extern int __init pci_acpi_init(void);
-extern int __init pcibios_irq_init(void);
-extern int __init pci_visws_init(void);
-extern int __init pci_numaq_init(void);
+#else
+static inline int __init pci_acpi_init(void)
+{
+ return -EINVAL;
+}
+#endif
+extern void __init pcibios_irq_init(void);
extern int __init pcibios_init(void);
+extern int pci_legacy_init(void);
+extern void pcibios_fixup_irqs(void);
/* pci-mmconfig.c */
+/* "PCI MMCONFIG %04x [bus %02x-%02x]" */
+#define PCI_MMCFG_RESOURCE_NAME_LEN (22 + 4 + 2 + 2)
+
+struct pci_mmcfg_region {
+ struct list_head list;
+ struct resource res;
+ u64 address;
+ char __iomem *virt;
+ u16 segment;
+ u8 start_bus;
+ u8 end_bus;
+ char name[PCI_MMCFG_RESOURCE_NAME_LEN];
+};
+
extern int __init pci_mmcfg_arch_init(void);
extern void __init pci_mmcfg_arch_free(void);
+extern int pci_mmcfg_arch_map(struct pci_mmcfg_region *cfg);
+extern void pci_mmcfg_arch_unmap(struct pci_mmcfg_region *cfg);
+extern int pci_mmconfig_insert(struct device *dev, u16 seg, u8 start, u8 end,
+ phys_addr_t addr);
+extern int pci_mmconfig_delete(u16 seg, u8 start, u8 end);
+extern struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus);
+extern struct pci_mmcfg_region *__init pci_mmconfig_add(int segment, int start,
+ int end, u64 addr);
-extern struct acpi_mcfg_allocation *pci_mmcfg_config;
-extern int pci_mmcfg_config_num;
+extern struct list_head pci_mmcfg_list;
+
+#define PCI_MMCFG_BUS_OFFSET(bus) ((bus) << 20)
/*
- * AMD Fam10h CPUs are buggy, and cannot access MMIO config space
- * on their northbrige except through the * %eax register. As such, you MUST
- * NOT use normal IOMEM accesses, you need to only use the magic mmio-config
- * accessor functions.
- * In fact just use pci_config_*, nothing else please.
+ * On AMD Fam10h CPUs, all PCI MMIO configuration space accesses must use
+ * %eax. No other source or target registers may be used. The following
+ * mmio_config_* accessors enforce this. See "BIOS and Kernel Developer's
+ * Guide (BKDG) For AMD Family 10h Processors", rev. 3.48, sec 2.11.1,
+ * "MMIO Configuration Coding Requirements".
*/
static inline unsigned char mmio_config_readb(void __iomem *pos)
{
@@ -166,3 +234,23 @@ static inline void mmio_config_writel(void __iomem *pos, u32 val)
{
asm volatile("movl %%eax,(%1)" : : "a" (val), "r" (pos) : "memory");
}
+
+#ifdef CONFIG_PCI
+# ifdef CONFIG_ACPI
+# define x86_default_pci_init pci_acpi_init
+# else
+# define x86_default_pci_init pci_legacy_init
+# endif
+# define x86_default_pci_init_irq pcibios_irq_init
+# define x86_default_pci_fixup_irqs pcibios_fixup_irqs
+#else
+# define x86_default_pci_init NULL
+# define x86_default_pci_init_irq NULL
+# define x86_default_pci_fixup_irqs NULL
+#endif
+
+#if defined(CONFIG_PCI) && defined(CONFIG_ACPI)
+extern bool pci_use_e820;
+#else
+#define pci_use_e820 false
+#endif
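Editor's aside: struct pci_mmcfg_region plus the mmio_config_*() accessors are what the MMCONFIG read/write paths are built from. The sketch below shows a 32-bit config read; the example_ function is hypothetical, and the assumption that cfg->virt is mapped so that PCI_MMCFG_BUS_OFFSET(bus) indexes directly into it belongs to this sketch, not to the patch.

static u32 example_mmcfg_read32(struct pci_mmcfg_region *cfg,
				unsigned int bus, unsigned int devfn,
				unsigned int reg)
{
	void __iomem *pos = cfg->virt + PCI_MMCFG_BUS_OFFSET(bus) +
			    (devfn << 12) + reg;

	/* %eax-only accessor, as required on AMD Fam10h (see above) */
	return mmio_config_readl(pos);
}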
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 103f1ddb0d85..725d0eff7acd 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -1,171 +1,606 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PERCPU_H
#define _ASM_X86_PERCPU_H
#ifdef CONFIG_X86_64
-#define __percpu_seg gs
-#define __percpu_mov_op movq
+# define __percpu_seg gs
+# define __percpu_rel (%rip)
#else
-#define __percpu_seg fs
-#define __percpu_mov_op movl
+# define __percpu_seg fs
+# define __percpu_rel
#endif
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
-/*
- * PER_CPU finds an address of a per-cpu variable.
- *
- * Args:
- * var - variable name
- * reg - 32bit register
- *
- * The resulting address is stored in the "reg" argument.
- *
- * Example:
- * PER_CPU(cpu_gdt_descr, %ebx)
- */
#ifdef CONFIG_SMP
-#define PER_CPU(var, reg) \
- __percpu_mov_op %__percpu_seg:per_cpu__this_cpu_off, reg; \
- lea per_cpu__##var(reg), reg
-#define PER_CPU_VAR(var) %__percpu_seg:per_cpu__##var
-#else /* ! SMP */
-#define PER_CPU(var, reg) \
- __percpu_mov_op $per_cpu__##var, reg
-#define PER_CPU_VAR(var) per_cpu__##var
-#endif /* SMP */
-
-#ifdef CONFIG_X86_64_SMP
-#define INIT_PER_CPU_VAR(var) init_per_cpu__##var
+# define __percpu %__percpu_seg:
#else
-#define INIT_PER_CPU_VAR(var) per_cpu__##var
+# define __percpu
#endif
-#else /* ...!ASSEMBLY */
+#define PER_CPU_VAR(var) __percpu(var)__percpu_rel
+
+#else /* !__ASSEMBLER__: */
-#include <linux/kernel.h>
+#include <linux/args.h>
+#include <linux/bits.h>
+#include <linux/build_bug.h>
#include <linux/stringify.h>
+#include <asm/asm.h>
#ifdef CONFIG_SMP
-#define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x
-#define __my_cpu_offset percpu_read(this_cpu_off)
-#else
-#define __percpu_arg(x) "%" #x
+
+#define __force_percpu_prefix "%%"__stringify(__percpu_seg)":"
+
+#ifdef CONFIG_CC_HAS_NAMED_AS
+
+#ifdef __CHECKER__
+# define __seg_gs __attribute__((address_space(__seg_gs)))
+# define __seg_fs __attribute__((address_space(__seg_fs)))
#endif
+#define __percpu_prefix
+#define __percpu_seg_override CONCATENATE(__seg_, __percpu_seg)
+
+#else /* !CONFIG_CC_HAS_NAMED_AS: */
+
+#define __percpu_prefix __force_percpu_prefix
+#define __percpu_seg_override
+
+#endif /* CONFIG_CC_HAS_NAMED_AS */
+
/*
- * Initialized pointers to per-cpu variables needed for the boot
- * processor need to use these macros to get the proper address
- * offset from __per_cpu_load on SMP.
- *
- * There also must be an entry in vmlinux_64.lds.S
+ * Compared to the generic __my_cpu_offset version, the following
+ * saves one instruction and avoids clobbering a temp register.
*/
-#define DECLARE_INIT_PER_CPU(var) \
- extern typeof(per_cpu_var(var)) init_per_cpu_var(var)
+#define __my_cpu_offset this_cpu_read(this_cpu_off)
+
+/*
+ * arch_raw_cpu_ptr should not be used in 32-bit VDSO for a 64-bit
+ * kernel, because games are played with CONFIG_X86_64 there and
+ * sizeof(this_cpu_off) becomes 4.
+ */
+#ifndef BUILD_VDSO32_64
+#define arch_raw_cpu_ptr(_ptr) \
+({ \
+ unsigned long tcp_ptr__ = raw_cpu_read_long(this_cpu_off); \
+ \
+ tcp_ptr__ += (__force unsigned long)(_ptr); \
+ (TYPEOF_UNQUAL(*(_ptr)) __force __kernel *)tcp_ptr__; \
+})
+#else
+#define arch_raw_cpu_ptr(_ptr) \
+({ \
+ BUILD_BUG(); \
+ (TYPEOF_UNQUAL(*(_ptr)) __force __kernel *)0; \
+})
+#endif
+
+#define PER_CPU_VAR(var) %__percpu_seg:(var)__percpu_rel
+
+#else /* !CONFIG_SMP: */
+
+#define __force_percpu_prefix
+#define __percpu_prefix
+#define __percpu_seg_override
+
+#define PER_CPU_VAR(var) (var)__percpu_rel
-#ifdef CONFIG_X86_64_SMP
-#define init_per_cpu_var(var) init_per_cpu__##var
+#endif /* CONFIG_SMP */
+
+#if defined(CONFIG_USE_X86_SEG_SUPPORT) && defined(USE_TYPEOF_UNQUAL)
+# define __my_cpu_type(var) typeof(var)
+# define __my_cpu_ptr(ptr) (ptr)
+# define __my_cpu_var(var) (var)
+
+# define __percpu_qual __percpu_seg_override
#else
-#define init_per_cpu_var(var) per_cpu_var(var)
+# define __my_cpu_type(var) typeof(var) __percpu_seg_override
+# define __my_cpu_ptr(ptr) (__my_cpu_type(*(ptr))*)(__force uintptr_t)(ptr)
+# define __my_cpu_var(var) (*__my_cpu_ptr(&(var)))
#endif
-/* For arch-specific code, we can use direct single-insn ops (they
- * don't give an lvalue though). */
-extern void __bad_percpu_size(void);
-
-#define percpu_to_op(op, var, val) \
-do { \
- typedef typeof(var) T__; \
- if (0) { \
- T__ tmp__; \
- tmp__ = (val); \
- } \
- switch (sizeof(var)) { \
- case 1: \
- asm(op "b %1,"__percpu_arg(0) \
- : "+m" (var) \
- : "qi" ((T__)(val))); \
- break; \
- case 2: \
- asm(op "w %1,"__percpu_arg(0) \
- : "+m" (var) \
- : "ri" ((T__)(val))); \
- break; \
- case 4: \
- asm(op "l %1,"__percpu_arg(0) \
- : "+m" (var) \
- : "ri" ((T__)(val))); \
- break; \
- case 8: \
- asm(op "q %1,"__percpu_arg(0) \
- : "+m" (var) \
- : "re" ((T__)(val))); \
- break; \
- default: __bad_percpu_size(); \
- } \
+#define __force_percpu_arg(x) __force_percpu_prefix "%" #x
+#define __percpu_arg(x) __percpu_prefix "%" #x
+
+/*
+ * For arch-specific code, we can use direct single-insn ops (they
+ * don't give an lvalue though).
+ */
+
+#define __pcpu_type_1 u8
+#define __pcpu_type_2 u16
+#define __pcpu_type_4 u32
+#define __pcpu_type_8 u64
+
+#define __pcpu_cast_1(val) ((u8)(((unsigned long) val) & 0xff))
+#define __pcpu_cast_2(val) ((u16)(((unsigned long) val) & 0xffff))
+#define __pcpu_cast_4(val) ((u32)(((unsigned long) val) & 0xffffffff))
+#define __pcpu_cast_8(val) ((u64)(val))
+
+#define __pcpu_op_1(op) op "b "
+#define __pcpu_op_2(op) op "w "
+#define __pcpu_op_4(op) op "l "
+#define __pcpu_op_8(op) op "q "
+
+#define __pcpu_reg_1(mod, x) mod "q" (x)
+#define __pcpu_reg_2(mod, x) mod "r" (x)
+#define __pcpu_reg_4(mod, x) mod "r" (x)
+#define __pcpu_reg_8(mod, x) mod "r" (x)
+
+#define __pcpu_reg_imm_1(x) "qi" (x)
+#define __pcpu_reg_imm_2(x) "ri" (x)
+#define __pcpu_reg_imm_4(x) "ri" (x)
+#define __pcpu_reg_imm_8(x) "re" (x)
+
+#ifdef CONFIG_USE_X86_SEG_SUPPORT
+
+#define __raw_cpu_read(size, qual, pcp) \
+({ \
+ *(qual __my_cpu_type(pcp) *)__my_cpu_ptr(&(pcp)); \
+})
+
+#define __raw_cpu_write(size, qual, pcp, val) \
+do { \
+ *(qual __my_cpu_type(pcp) *)__my_cpu_ptr(&(pcp)) = (val); \
+} while (0)
+
+#define __raw_cpu_read_const(pcp) __raw_cpu_read(, , pcp)
+
+#else /* !CONFIG_USE_X86_SEG_SUPPORT: */
+
+#define __raw_cpu_read(size, qual, _var) \
+({ \
+ __pcpu_type_##size pfo_val__; \
+ \
+ asm qual (__pcpu_op_##size("mov") \
+ __percpu_arg([var]) ", %[val]" \
+ : [val] __pcpu_reg_##size("=", pfo_val__) \
+ : [var] "m" (__my_cpu_var(_var))); \
+ \
+ (typeof(_var))(unsigned long) pfo_val__; \
+})
+
+#define __raw_cpu_write(size, qual, _var, _val) \
+do { \
+ __pcpu_type_##size pto_val__ = __pcpu_cast_##size(_val); \
+ \
+ if (0) { \
+ TYPEOF_UNQUAL(_var) pto_tmp__; \
+ pto_tmp__ = (_val); \
+ (void)pto_tmp__; \
+ } \
+ asm qual (__pcpu_op_##size("mov") "%[val], " \
+ __percpu_arg([var]) \
+ : [var] "=m" (__my_cpu_var(_var)) \
+ : [val] __pcpu_reg_imm_##size(pto_val__)); \
+} while (0)
+
+/*
+ * The generic per-CPU infrastructure is not suitable for
+ * reading const-qualified variables.
+ */
+#define __raw_cpu_read_const(pcp) ({ BUILD_BUG(); (typeof(pcp))0; })
+
+#endif /* CONFIG_USE_X86_SEG_SUPPORT */
+
+#define __raw_cpu_read_stable(size, _var) \
+({ \
+ __pcpu_type_##size pfo_val__; \
+ \
+ asm(__pcpu_op_##size("mov") \
+ __force_percpu_arg(a[var]) ", %[val]" \
+ : [val] __pcpu_reg_##size("=", pfo_val__) \
+ : [var] "i" (&(_var))); \
+ \
+ (typeof(_var))(unsigned long) pfo_val__; \
+})
+
+#define percpu_unary_op(size, qual, op, _var) \
+({ \
+ asm qual (__pcpu_op_##size(op) __percpu_arg([var]) \
+ : [var] "+m" (__my_cpu_var(_var))); \
+})
+
+#define percpu_binary_op(size, qual, op, _var, _val) \
+do { \
+ __pcpu_type_##size pto_val__ = __pcpu_cast_##size(_val); \
+ \
+ if (0) { \
+ TYPEOF_UNQUAL(_var) pto_tmp__; \
+ pto_tmp__ = (_val); \
+ (void)pto_tmp__; \
+ } \
+ asm qual (__pcpu_op_##size(op) "%[val], " __percpu_arg([var]) \
+ : [var] "+m" (__my_cpu_var(_var)) \
+ : [val] __pcpu_reg_imm_##size(pto_val__)); \
+} while (0)
+
+/*
+ * Generate a per-CPU add to memory instruction and optimize code
+ * if one is added or subtracted.
+ */
+#define percpu_add_op(size, qual, var, val) \
+do { \
+ const int pao_ID__ = \
+ (__builtin_constant_p(val) && \
+ ((val) == 1 || \
+ (val) == (typeof(val))-1)) ? (int)(val) : 0; \
+ \
+ if (0) { \
+ TYPEOF_UNQUAL(var) pao_tmp__; \
+ pao_tmp__ = (val); \
+ (void)pao_tmp__; \
+ } \
+ if (pao_ID__ == 1) \
+ percpu_unary_op(size, qual, "inc", var); \
+ else if (pao_ID__ == -1) \
+ percpu_unary_op(size, qual, "dec", var); \
+ else \
+ percpu_binary_op(size, qual, "add", var, val); \
} while (0)
-#define percpu_from_op(op, var) \
-({ \
- typeof(var) ret__; \
- switch (sizeof(var)) { \
- case 1: \
- asm(op "b "__percpu_arg(1)",%0" \
- : "=q" (ret__) \
- : "m" (var)); \
- break; \
- case 2: \
- asm(op "w "__percpu_arg(1)",%0" \
- : "=r" (ret__) \
- : "m" (var)); \
- break; \
- case 4: \
- asm(op "l "__percpu_arg(1)",%0" \
- : "=r" (ret__) \
- : "m" (var)); \
- break; \
- case 8: \
- asm(op "q "__percpu_arg(1)",%0" \
- : "=r" (ret__) \
- : "m" (var)); \
- break; \
- default: __bad_percpu_size(); \
- } \
- ret__; \
-})
-
-#define percpu_read(var) percpu_from_op("mov", per_cpu__##var)
-#define percpu_write(var, val) percpu_to_op("mov", per_cpu__##var, val)
-#define percpu_add(var, val) percpu_to_op("add", per_cpu__##var, val)
-#define percpu_sub(var, val) percpu_to_op("sub", per_cpu__##var, val)
-#define percpu_and(var, val) percpu_to_op("and", per_cpu__##var, val)
-#define percpu_or(var, val) percpu_to_op("or", per_cpu__##var, val)
-#define percpu_xor(var, val) percpu_to_op("xor", per_cpu__##var, val)
-
-/* This is not atomic against other CPUs -- CPU preemption needs to be off */
-#define x86_test_and_clear_bit_percpu(bit, var) \
-({ \
- int old__; \
- asm volatile("btr %2,"__percpu_arg(1)"\n\tsbbl %0,%0" \
- : "=r" (old__), "+m" (per_cpu__##var) \
- : "dIr" (bit)); \
- old__; \
+/*
+ * Add return operation
+ */
+#define percpu_add_return_op(size, qual, _var, _val) \
+({ \
+ __pcpu_type_##size paro_tmp__ = __pcpu_cast_##size(_val); \
+ \
+ asm qual (__pcpu_op_##size("xadd") "%[tmp], " \
+ __percpu_arg([var]) \
+ : [tmp] __pcpu_reg_##size("+", paro_tmp__), \
+ [var] "+m" (__my_cpu_var(_var)) \
+ : : "memory"); \
+ (typeof(_var))(unsigned long) (paro_tmp__ + _val); \
})
+/*
+ * raw_cpu_xchg() can use a load-store since
+ * it is not required to be IRQ-safe.
+ */
+#define raw_percpu_xchg_op(_var, _nval) \
+({ \
+ TYPEOF_UNQUAL(_var) pxo_old__ = raw_cpu_read(_var); \
+ \
+ raw_cpu_write(_var, _nval); \
+ \
+ pxo_old__; \
+})
+
+/*
+ * this_cpu_xchg() is implemented using CMPXCHG without a LOCK prefix.
+ * XCHG is expensive due to the implied LOCK prefix. The processor
+ * cannot prefetch cachelines if XCHG is used.
+ */
+#define this_percpu_xchg_op(_var, _nval) \
+({ \
+ TYPEOF_UNQUAL(_var) pxo_old__ = this_cpu_read(_var); \
+ \
+ do { } while (!this_cpu_try_cmpxchg(_var, &pxo_old__, _nval)); \
+ \
+ pxo_old__; \
+})
+
+/*
+ * CMPXCHG has no such implied lock semantics; as a result it is much
+ * more efficient for CPU-local operations.
+ */
+#define percpu_cmpxchg_op(size, qual, _var, _oval, _nval) \
+({ \
+ __pcpu_type_##size pco_old__ = __pcpu_cast_##size(_oval); \
+ __pcpu_type_##size pco_new__ = __pcpu_cast_##size(_nval); \
+ \
+ asm qual (__pcpu_op_##size("cmpxchg") "%[nval], " \
+ __percpu_arg([var]) \
+ : [oval] "+a" (pco_old__), \
+ [var] "+m" (__my_cpu_var(_var)) \
+ : [nval] __pcpu_reg_##size(, pco_new__) \
+ : "memory"); \
+ \
+ (typeof(_var))(unsigned long) pco_old__; \
+})
+
+#define percpu_try_cmpxchg_op(size, qual, _var, _ovalp, _nval) \
+({ \
+ bool success; \
+ __pcpu_type_##size *pco_oval__ = (__pcpu_type_##size *)(_ovalp); \
+ __pcpu_type_##size pco_old__ = *pco_oval__; \
+ __pcpu_type_##size pco_new__ = __pcpu_cast_##size(_nval); \
+ \
+ asm qual (__pcpu_op_##size("cmpxchg") "%[nval], " \
+ __percpu_arg([var]) \
+ : "=@ccz" (success), \
+ [oval] "+a" (pco_old__), \
+ [var] "+m" (__my_cpu_var(_var)) \
+ : [nval] __pcpu_reg_##size(, pco_new__) \
+ : "memory"); \
+ if (unlikely(!success)) \
+ *pco_oval__ = pco_old__; \
+ \
+ likely(success); \
+})
+
+#if defined(CONFIG_X86_32) && !defined(CONFIG_UML)
+
+#define percpu_cmpxchg64_op(size, qual, _var, _oval, _nval) \
+({ \
+ union { \
+ u64 var; \
+ struct { \
+ u32 low, high; \
+ }; \
+ } old__, new__; \
+ \
+ old__.var = _oval; \
+ new__.var = _nval; \
+ \
+ asm_inline qual ( \
+ ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \
+ "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \
+ : ALT_OUTPUT_SP([var] "+m" (__my_cpu_var(_var)), \
+ "+a" (old__.low), "+d" (old__.high)) \
+ : "b" (new__.low), "c" (new__.high), \
+ "S" (&(_var)) \
+ : "memory"); \
+ \
+ old__.var; \
+})
+
+#define raw_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg64_op(8, , pcp, oval, nval)
+#define this_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg64_op(8, volatile, pcp, oval, nval)
+
+#define percpu_try_cmpxchg64_op(size, qual, _var, _ovalp, _nval) \
+({ \
+ bool success; \
+ u64 *_oval = (u64 *)(_ovalp); \
+ union { \
+ u64 var; \
+ struct { \
+ u32 low, high; \
+ }; \
+ } old__, new__; \
+ \
+ old__.var = *_oval; \
+ new__.var = _nval; \
+ \
+ asm_inline qual ( \
+ ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \
+ "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \
+ : ALT_OUTPUT_SP("=@ccz" (success), \
+ [var] "+m" (__my_cpu_var(_var)), \
+ "+a" (old__.low), "+d" (old__.high)) \
+ : "b" (new__.low), "c" (new__.high), \
+ "S" (&(_var)) \
+ : "memory"); \
+ if (unlikely(!success)) \
+ *_oval = old__.var; \
+ \
+ likely(success); \
+})
+
+#define raw_cpu_try_cmpxchg64(pcp, ovalp, nval) percpu_try_cmpxchg64_op(8, , pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg64(pcp, ovalp, nval) percpu_try_cmpxchg64_op(8, volatile, pcp, ovalp, nval)
+
+#endif /* defined(CONFIG_X86_32) && !defined(CONFIG_UML) */
+
+#ifdef CONFIG_X86_64
+#define raw_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg_op(8, , pcp, oval, nval);
+#define this_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg_op(8, volatile, pcp, oval, nval);
+
+#define raw_cpu_try_cmpxchg64(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, , pcp, ovalp, nval);
+#define this_cpu_try_cmpxchg64(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval);
+
+#define percpu_cmpxchg128_op(size, qual, _var, _oval, _nval) \
+({ \
+ union { \
+ u128 var; \
+ struct { \
+ u64 low, high; \
+ }; \
+ } old__, new__; \
+ \
+ old__.var = _oval; \
+ new__.var = _nval; \
+ \
+ asm_inline qual ( \
+ ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \
+ "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \
+ : ALT_OUTPUT_SP([var] "+m" (__my_cpu_var(_var)), \
+ "+a" (old__.low), "+d" (old__.high)) \
+ : "b" (new__.low), "c" (new__.high), \
+ "S" (&(_var)) \
+ : "memory"); \
+ \
+ old__.var; \
+})
+
+#define raw_cpu_cmpxchg128(pcp, oval, nval) percpu_cmpxchg128_op(16, , pcp, oval, nval)
+#define this_cpu_cmpxchg128(pcp, oval, nval) percpu_cmpxchg128_op(16, volatile, pcp, oval, nval)
+
+#define percpu_try_cmpxchg128_op(size, qual, _var, _ovalp, _nval) \
+({ \
+ bool success; \
+ u128 *_oval = (u128 *)(_ovalp); \
+ union { \
+ u128 var; \
+ struct { \
+ u64 low, high; \
+ }; \
+ } old__, new__; \
+ \
+ old__.var = *_oval; \
+ new__.var = _nval; \
+ \
+ asm_inline qual ( \
+ ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \
+ "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \
+ : ALT_OUTPUT_SP("=@ccz" (success), \
+ [var] "+m" (__my_cpu_var(_var)), \
+ "+a" (old__.low), "+d" (old__.high)) \
+ : "b" (new__.low), "c" (new__.high), \
+ "S" (&(_var)) \
+ : "memory"); \
+ if (unlikely(!success)) \
+ *_oval = old__.var; \
+ \
+ likely(success); \
+})
+
+#define raw_cpu_try_cmpxchg128(pcp, ovalp, nval) percpu_try_cmpxchg128_op(16, , pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg128(pcp, ovalp, nval) percpu_try_cmpxchg128_op(16, volatile, pcp, ovalp, nval)
+
+#endif /* CONFIG_X86_64 */
+
+#define raw_cpu_read_1(pcp) __raw_cpu_read(1, , pcp)
+#define raw_cpu_read_2(pcp) __raw_cpu_read(2, , pcp)
+#define raw_cpu_read_4(pcp) __raw_cpu_read(4, , pcp)
+#define raw_cpu_write_1(pcp, val) __raw_cpu_write(1, , pcp, val)
+#define raw_cpu_write_2(pcp, val) __raw_cpu_write(2, , pcp, val)
+#define raw_cpu_write_4(pcp, val) __raw_cpu_write(4, , pcp, val)
+
+#define this_cpu_read_1(pcp) __raw_cpu_read(1, volatile, pcp)
+#define this_cpu_read_2(pcp) __raw_cpu_read(2, volatile, pcp)
+#define this_cpu_read_4(pcp) __raw_cpu_read(4, volatile, pcp)
+#define this_cpu_write_1(pcp, val) __raw_cpu_write(1, volatile, pcp, val)
+#define this_cpu_write_2(pcp, val) __raw_cpu_write(2, volatile, pcp, val)
+#define this_cpu_write_4(pcp, val) __raw_cpu_write(4, volatile, pcp, val)
+
+#define this_cpu_read_stable_1(pcp) __raw_cpu_read_stable(1, pcp)
+#define this_cpu_read_stable_2(pcp) __raw_cpu_read_stable(2, pcp)
+#define this_cpu_read_stable_4(pcp) __raw_cpu_read_stable(4, pcp)
+
+#define raw_cpu_add_1(pcp, val) percpu_add_op(1, , (pcp), val)
+#define raw_cpu_add_2(pcp, val) percpu_add_op(2, , (pcp), val)
+#define raw_cpu_add_4(pcp, val) percpu_add_op(4, , (pcp), val)
+#define raw_cpu_and_1(pcp, val) percpu_binary_op(1, , "and", (pcp), val)
+#define raw_cpu_and_2(pcp, val) percpu_binary_op(2, , "and", (pcp), val)
+#define raw_cpu_and_4(pcp, val) percpu_binary_op(4, , "and", (pcp), val)
+#define raw_cpu_or_1(pcp, val) percpu_binary_op(1, , "or", (pcp), val)
+#define raw_cpu_or_2(pcp, val) percpu_binary_op(2, , "or", (pcp), val)
+#define raw_cpu_or_4(pcp, val) percpu_binary_op(4, , "or", (pcp), val)
+#define raw_cpu_xchg_1(pcp, val) raw_percpu_xchg_op(pcp, val)
+#define raw_cpu_xchg_2(pcp, val) raw_percpu_xchg_op(pcp, val)
+#define raw_cpu_xchg_4(pcp, val) raw_percpu_xchg_op(pcp, val)
+
+#define this_cpu_add_1(pcp, val) percpu_add_op(1, volatile, (pcp), val)
+#define this_cpu_add_2(pcp, val) percpu_add_op(2, volatile, (pcp), val)
+#define this_cpu_add_4(pcp, val) percpu_add_op(4, volatile, (pcp), val)
+#define this_cpu_and_1(pcp, val) percpu_binary_op(1, volatile, "and", (pcp), val)
+#define this_cpu_and_2(pcp, val) percpu_binary_op(2, volatile, "and", (pcp), val)
+#define this_cpu_and_4(pcp, val) percpu_binary_op(4, volatile, "and", (pcp), val)
+#define this_cpu_or_1(pcp, val) percpu_binary_op(1, volatile, "or", (pcp), val)
+#define this_cpu_or_2(pcp, val) percpu_binary_op(2, volatile, "or", (pcp), val)
+#define this_cpu_or_4(pcp, val) percpu_binary_op(4, volatile, "or", (pcp), val)
+#define this_cpu_xchg_1(pcp, nval) this_percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_2(pcp, nval) this_percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_4(pcp, nval) this_percpu_xchg_op(pcp, nval)
+
+#define raw_cpu_add_return_1(pcp, val) percpu_add_return_op(1, , pcp, val)
+#define raw_cpu_add_return_2(pcp, val) percpu_add_return_op(2, , pcp, val)
+#define raw_cpu_add_return_4(pcp, val) percpu_add_return_op(4, , pcp, val)
+#define raw_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(1, , pcp, oval, nval)
+#define raw_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(2, , pcp, oval, nval)
+#define raw_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(4, , pcp, oval, nval)
+#define raw_cpu_try_cmpxchg_1(pcp, ovalp, nval) percpu_try_cmpxchg_op(1, , pcp, ovalp, nval)
+#define raw_cpu_try_cmpxchg_2(pcp, ovalp, nval) percpu_try_cmpxchg_op(2, , pcp, ovalp, nval)
+#define raw_cpu_try_cmpxchg_4(pcp, ovalp, nval) percpu_try_cmpxchg_op(4, , pcp, ovalp, nval)
+
+#define this_cpu_add_return_1(pcp, val) percpu_add_return_op(1, volatile, pcp, val)
+#define this_cpu_add_return_2(pcp, val) percpu_add_return_op(2, volatile, pcp, val)
+#define this_cpu_add_return_4(pcp, val) percpu_add_return_op(4, volatile, pcp, val)
+#define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(1, volatile, pcp, oval, nval)
+#define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(2, volatile, pcp, oval, nval)
+#define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(4, volatile, pcp, oval, nval)
+#define this_cpu_try_cmpxchg_1(pcp, ovalp, nval) percpu_try_cmpxchg_op(1, volatile, pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg_2(pcp, ovalp, nval) percpu_try_cmpxchg_op(2, volatile, pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg_4(pcp, ovalp, nval) percpu_try_cmpxchg_op(4, volatile, pcp, ovalp, nval)
+
+/*
+ * Per-CPU atomic 64-bit operations are only available under 64-bit kernels.
+ * 32-bit kernels must fall back to generic operations.
+ */
+#ifdef CONFIG_X86_64
+
+#define raw_cpu_read_8(pcp) __raw_cpu_read(8, , pcp)
+#define raw_cpu_write_8(pcp, val) __raw_cpu_write(8, , pcp, val)
+
+#define this_cpu_read_8(pcp) __raw_cpu_read(8, volatile, pcp)
+#define this_cpu_write_8(pcp, val) __raw_cpu_write(8, volatile, pcp, val)
+
+#define this_cpu_read_stable_8(pcp) __raw_cpu_read_stable(8, pcp)
+
+#define raw_cpu_add_8(pcp, val) percpu_add_op(8, , (pcp), val)
+#define raw_cpu_and_8(pcp, val) percpu_binary_op(8, , "and", (pcp), val)
+#define raw_cpu_or_8(pcp, val) percpu_binary_op(8, , "or", (pcp), val)
+#define raw_cpu_add_return_8(pcp, val) percpu_add_return_op(8, , pcp, val)
+#define raw_cpu_xchg_8(pcp, nval) raw_percpu_xchg_op(pcp, nval)
+#define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, , pcp, oval, nval)
+#define raw_cpu_try_cmpxchg_8(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, , pcp, ovalp, nval)
+
+#define this_cpu_add_8(pcp, val) percpu_add_op(8, volatile, (pcp), val)
+#define this_cpu_and_8(pcp, val) percpu_binary_op(8, volatile, "and", (pcp), val)
+#define this_cpu_or_8(pcp, val) percpu_binary_op(8, volatile, "or", (pcp), val)
+#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(8, volatile, pcp, val)
+#define this_cpu_xchg_8(pcp, nval) this_percpu_xchg_op(pcp, nval)
+#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, volatile, pcp, oval, nval)
+#define this_cpu_try_cmpxchg_8(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval)
+
+#define raw_cpu_read_long(pcp) raw_cpu_read_8(pcp)
+
+#else /* !CONFIG_X86_64: */
+
+/* There is no generic 64-bit read stable operation for 32-bit targets. */
+#define this_cpu_read_stable_8(pcp) ({ BUILD_BUG(); (typeof(pcp))0; })
+
+#define raw_cpu_read_long(pcp) raw_cpu_read_4(pcp)
+
+#endif /* CONFIG_X86_64 */
+
+#define this_cpu_read_const(pcp) __raw_cpu_read_const(pcp)
+
+/*
+ * this_cpu_read() makes the compiler load the per-CPU variable every time
+ * it is accessed while this_cpu_read_stable() allows the value to be cached.
+ * this_cpu_read_stable() is more efficient and can be used if its value
+ * is guaranteed to be valid across CPUs. The current users include
+ * current_task and cpu_current_top_of_stack, both of which are
+ * actually per-thread variables implemented as per-CPU variables and
+ * thus stable for the duration of the respective task.
+ */
+#define this_cpu_read_stable(pcp) __pcpu_size_call_return(this_cpu_read_stable_, pcp)
+
+#define x86_this_cpu_constant_test_bit(_nr, _var) \
+({ \
+ unsigned long __percpu *addr__ = \
+ (unsigned long __percpu *)&(_var) + BIT_WORD(_nr); \
+ \
+ !!(BIT_MASK(_nr) & raw_cpu_read(*addr__)); \
+})
+
+#define x86_this_cpu_variable_test_bit(_nr, _var) \
+({ \
+ bool oldbit; \
+ \
+ asm volatile("btl %[nr], " __percpu_arg([var]) \
+ : "=@ccc" (oldbit) \
+ : [var] "m" (__my_cpu_var(_var)), \
+ [nr] "rI" (_nr)); \
+ oldbit; \
+})
+
+#define x86_this_cpu_test_bit(_nr, _var) \
+ (__builtin_constant_p(_nr) \
+ ? x86_this_cpu_constant_test_bit(_nr, _var) \
+ : x86_this_cpu_variable_test_bit(_nr, _var))
+
+
#include <asm-generic/percpu.h>
/* We can use this directly for local CPU (faster). */
-DECLARE_PER_CPU(unsigned long, this_cpu_off);
+DECLARE_PER_CPU_CACHE_HOT(unsigned long, this_cpu_off);
-#ifdef CONFIG_NEED_MULTIPLE_NODES
-void *pcpu_lpage_remapped(void *kaddr);
-#else
-static inline void *pcpu_lpage_remapped(void *kaddr)
-{
- return NULL;
-}
-#endif
-
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#ifdef CONFIG_SMP
@@ -181,35 +616,53 @@ static inline void *pcpu_lpage_remapped(void *kaddr)
{ [0 ... NR_CPUS-1] = _initvalue }; \
__typeof__(_type) *_name##_early_ptr __refdata = _name##_early_map
-#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \
+#define DEFINE_EARLY_PER_CPU_READ_MOSTLY(_type, _name, _initvalue) \
+ DEFINE_PER_CPU_READ_MOSTLY(_type, _name) = _initvalue; \
+ __typeof__(_type) _name##_early_map[NR_CPUS] __initdata = \
+ { [0 ... NR_CPUS-1] = _initvalue }; \
+ __typeof__(_type) *_name##_early_ptr __refdata = _name##_early_map
+
+#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \
EXPORT_PER_CPU_SYMBOL(_name)
-#define DECLARE_EARLY_PER_CPU(_type, _name) \
- DECLARE_PER_CPU(_type, _name); \
- extern __typeof__(_type) *_name##_early_ptr; \
+#define DECLARE_EARLY_PER_CPU(_type, _name) \
+ DECLARE_PER_CPU(_type, _name); \
+ extern __typeof__(_type) *_name##_early_ptr; \
+ extern __typeof__(_type) _name##_early_map[]
+
+#define DECLARE_EARLY_PER_CPU_READ_MOSTLY(_type, _name) \
+ DECLARE_PER_CPU_READ_MOSTLY(_type, _name); \
+ extern __typeof__(_type) *_name##_early_ptr; \
extern __typeof__(_type) _name##_early_map[]
-#define early_per_cpu_ptr(_name) (_name##_early_ptr)
-#define early_per_cpu_map(_name, _idx) (_name##_early_map[_idx])
-#define early_per_cpu(_name, _cpu) \
- *(early_per_cpu_ptr(_name) ? \
- &early_per_cpu_ptr(_name)[_cpu] : \
+#define early_per_cpu_ptr(_name) (_name##_early_ptr)
+#define early_per_cpu_map(_name, _idx) (_name##_early_map[_idx])
+
+#define early_per_cpu(_name, _cpu) \
+ *(early_per_cpu_ptr(_name) ? \
+ &early_per_cpu_ptr(_name)[_cpu] : \
&per_cpu(_name, _cpu))
-#else /* !CONFIG_SMP */
-#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \
+#else /* !CONFIG_SMP: */
+#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \
DEFINE_PER_CPU(_type, _name) = _initvalue
-#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \
+#define DEFINE_EARLY_PER_CPU_READ_MOSTLY(_type, _name, _initvalue) \
+ DEFINE_PER_CPU_READ_MOSTLY(_type, _name) = _initvalue
+
+#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \
EXPORT_PER_CPU_SYMBOL(_name)
-#define DECLARE_EARLY_PER_CPU(_type, _name) \
+#define DECLARE_EARLY_PER_CPU(_type, _name) \
DECLARE_PER_CPU(_type, _name)
-#define early_per_cpu(_name, _cpu) per_cpu(_name, _cpu)
-#define early_per_cpu_ptr(_name) NULL
+#define DECLARE_EARLY_PER_CPU_READ_MOSTLY(_type, _name) \
+ DECLARE_PER_CPU_READ_MOSTLY(_type, _name)
+
+#define early_per_cpu(_name, _cpu) per_cpu(_name, _cpu)
+#define early_per_cpu_ptr(_name) NULL
/* no early_per_cpu_map() */
-#endif /* !CONFIG_SMP */
+#endif /* !CONFIG_SMP */
#endif /* _ASM_X86_PERCPU_H */
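Editor's aside: the size-suffixed raw_cpu_*_N / this_cpu_*_N definitions above are not called directly; the generic this_cpu_*() wrappers from <linux/percpu-defs.h> dispatch onto them by sizeof(). A hedged sketch of typical usage follows; the variable and function names are hypothetical.

#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, example_hits);

static void example_count_hit(void)
{
	/* becomes a single add-to-memory with the %gs/%fs segment prefix */
	this_cpu_inc(example_hits);
}

static bool example_claim_slot(void)
{
	/*
	 * CPU-local data needs no LOCK prefix, which is why the cmpxchg
	 * based this_cpu_xchg()/this_cpu_cmpxchg() family above is cheap.
	 */
	return this_cpu_cmpxchg(example_hits, 0, 1) == 0;
}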
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
deleted file mode 100644
index fa64e401589d..000000000000
--- a/arch/x86/include/asm/perf_counter.h
+++ /dev/null
@@ -1,98 +0,0 @@
-#ifndef _ASM_X86_PERF_COUNTER_H
-#define _ASM_X86_PERF_COUNTER_H
-
-/*
- * Performance counter hw details:
- */
-
-#define X86_PMC_MAX_GENERIC 8
-#define X86_PMC_MAX_FIXED 3
-
-#define X86_PMC_IDX_GENERIC 0
-#define X86_PMC_IDX_FIXED 32
-#define X86_PMC_IDX_MAX 64
-
-#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
-#define MSR_ARCH_PERFMON_PERFCTR1 0xc2
-
-#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
-#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
-
-#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22)
-#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
-#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
-#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
-
-/*
- * Includes eventsel and unit mask as well:
- */
-#define ARCH_PERFMON_EVENT_MASK 0xffff
-
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
- (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
-
-#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6
-
-/*
- * Intel "Architectural Performance Monitoring" CPUID
- * detection/enumeration details:
- */
-union cpuid10_eax {
- struct {
- unsigned int version_id:8;
- unsigned int num_counters:8;
- unsigned int bit_width:8;
- unsigned int mask_length:8;
- } split;
- unsigned int full;
-};
-
-union cpuid10_edx {
- struct {
- unsigned int num_counters_fixed:4;
- unsigned int reserved:28;
- } split;
- unsigned int full;
-};
-
-
-/*
- * Fixed-purpose performance counters:
- */
-
-/*
- * All 3 fixed-mode PMCs are configured via this single MSR:
- */
-#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d
-
-/*
- * The counts are available in three separate MSRs:
- */
-
-/* Instr_Retired.Any: */
-#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309
-#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0)
-
-/* CPU_CLK_Unhalted.Core: */
-#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a
-#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1)
-
-/* CPU_CLK_Unhalted.Ref: */
-#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b
-#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2)
-
-#ifdef CONFIG_PERF_COUNTERS
-extern void init_hw_perf_counters(void);
-extern void perf_counters_lapic_init(void);
-
-#define PERF_COUNTER_INDEX_OFFSET 0
-
-#else
-static inline void init_hw_perf_counters(void) { }
-static inline void perf_counters_lapic_init(void) { }
-#endif
-
-#endif /* _ASM_X86_PERF_COUNTER_H */
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
new file mode 100644
index 000000000000..7276ba70c88a
--- /dev/null
+++ b/arch/x86/include/asm/perf_event.h
@@ -0,0 +1,813 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PERF_EVENT_H
+#define _ASM_X86_PERF_EVENT_H
+
+#include <linux/static_call.h>
+
+/*
+ * Performance event hw details:
+ */
+
+#define INTEL_PMC_MAX_GENERIC 32
+#define INTEL_PMC_MAX_FIXED 16
+#define INTEL_PMC_IDX_FIXED 32
+
+#define X86_PMC_IDX_MAX 64
+
+#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
+#define MSR_ARCH_PERFMON_PERFCTR1 0xc2
+
+#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
+#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
+
+#define ARCH_PERFMON_EVENTSEL_EVENT 0x000000FFULL
+#define ARCH_PERFMON_EVENTSEL_UMASK 0x0000FF00ULL
+#define ARCH_PERFMON_EVENTSEL_USR (1ULL << 16)
+#define ARCH_PERFMON_EVENTSEL_OS (1ULL << 17)
+#define ARCH_PERFMON_EVENTSEL_EDGE (1ULL << 18)
+#define ARCH_PERFMON_EVENTSEL_PIN_CONTROL (1ULL << 19)
+#define ARCH_PERFMON_EVENTSEL_INT (1ULL << 20)
+#define ARCH_PERFMON_EVENTSEL_ANY (1ULL << 21)
+#define ARCH_PERFMON_EVENTSEL_ENABLE (1ULL << 22)
+#define ARCH_PERFMON_EVENTSEL_INV (1ULL << 23)
+#define ARCH_PERFMON_EVENTSEL_CMASK 0xFF000000ULL
+#define ARCH_PERFMON_EVENTSEL_BR_CNTR (1ULL << 35)
+#define ARCH_PERFMON_EVENTSEL_EQ (1ULL << 36)
+#define ARCH_PERFMON_EVENTSEL_UMASK2 (0xFFULL << 40)
+
+#define INTEL_FIXED_BITS_STRIDE 4
+#define INTEL_FIXED_0_KERNEL (1ULL << 0)
+#define INTEL_FIXED_0_USER (1ULL << 1)
+#define INTEL_FIXED_0_ANYTHREAD (1ULL << 2)
+#define INTEL_FIXED_0_ENABLE_PMI (1ULL << 3)
+#define INTEL_FIXED_3_METRICS_CLEAR (1ULL << 2)
+
+#define HSW_IN_TX (1ULL << 32)
+#define HSW_IN_TX_CHECKPOINTED (1ULL << 33)
+#define ICL_EVENTSEL_ADAPTIVE (1ULL << 34)
+#define ICL_FIXED_0_ADAPTIVE (1ULL << 32)
+
+#define INTEL_FIXED_BITS_MASK \
+ (INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER | \
+ INTEL_FIXED_0_ANYTHREAD | INTEL_FIXED_0_ENABLE_PMI | \
+ ICL_FIXED_0_ADAPTIVE)
+
+#define intel_fixed_bits_by_idx(_idx, _bits) \
+ ((_bits) << ((_idx) * INTEL_FIXED_BITS_STRIDE))
+
+#define AMD64_EVENTSEL_INT_CORE_ENABLE (1ULL << 36)
+#define AMD64_EVENTSEL_GUESTONLY (1ULL << 40)
+#define AMD64_EVENTSEL_HOSTONLY (1ULL << 41)
+
+#define AMD64_EVENTSEL_INT_CORE_SEL_SHIFT 37
+#define AMD64_EVENTSEL_INT_CORE_SEL_MASK \
+ (0xFULL << AMD64_EVENTSEL_INT_CORE_SEL_SHIFT)
+
+#define AMD64_EVENTSEL_EVENT \
+ (ARCH_PERFMON_EVENTSEL_EVENT | (0x0FULL << 32))
+#define INTEL_ARCH_EVENT_MASK \
+ (ARCH_PERFMON_EVENTSEL_UMASK | ARCH_PERFMON_EVENTSEL_EVENT)
+
+#define AMD64_L3_SLICE_SHIFT 48
+#define AMD64_L3_SLICE_MASK \
+ (0xFULL << AMD64_L3_SLICE_SHIFT)
+#define AMD64_L3_SLICEID_MASK \
+ (0x7ULL << AMD64_L3_SLICE_SHIFT)
+
+#define AMD64_L3_THREAD_SHIFT 56
+#define AMD64_L3_THREAD_MASK \
+ (0xFFULL << AMD64_L3_THREAD_SHIFT)
+#define AMD64_L3_F19H_THREAD_MASK \
+ (0x3ULL << AMD64_L3_THREAD_SHIFT)
+
+#define AMD64_L3_EN_ALL_CORES BIT_ULL(47)
+#define AMD64_L3_EN_ALL_SLICES BIT_ULL(46)
+
+#define AMD64_L3_COREID_SHIFT 42
+#define AMD64_L3_COREID_MASK \
+ (0x7ULL << AMD64_L3_COREID_SHIFT)
+
+#define X86_RAW_EVENT_MASK \
+ (ARCH_PERFMON_EVENTSEL_EVENT | \
+ ARCH_PERFMON_EVENTSEL_UMASK | \
+ ARCH_PERFMON_EVENTSEL_EDGE | \
+ ARCH_PERFMON_EVENTSEL_INV | \
+ ARCH_PERFMON_EVENTSEL_CMASK)
+#define X86_ALL_EVENT_FLAGS \
+ (ARCH_PERFMON_EVENTSEL_EDGE | \
+ ARCH_PERFMON_EVENTSEL_INV | \
+ ARCH_PERFMON_EVENTSEL_CMASK | \
+ ARCH_PERFMON_EVENTSEL_ANY | \
+ ARCH_PERFMON_EVENTSEL_PIN_CONTROL | \
+ HSW_IN_TX | \
+ HSW_IN_TX_CHECKPOINTED)
+#define AMD64_RAW_EVENT_MASK \
+ (X86_RAW_EVENT_MASK | \
+ AMD64_EVENTSEL_EVENT)
+#define AMD64_RAW_EVENT_MASK_NB \
+ (AMD64_EVENTSEL_EVENT | \
+ ARCH_PERFMON_EVENTSEL_UMASK)
+
+#define AMD64_PERFMON_V2_EVENTSEL_EVENT_NB \
+ (AMD64_EVENTSEL_EVENT | \
+ GENMASK_ULL(37, 36))
+
+#define AMD64_PERFMON_V2_EVENTSEL_UMASK_NB \
+ (ARCH_PERFMON_EVENTSEL_UMASK | \
+ GENMASK_ULL(27, 24))
+
+#define AMD64_PERFMON_V2_RAW_EVENT_MASK_NB \
+ (AMD64_PERFMON_V2_EVENTSEL_EVENT_NB | \
+ AMD64_PERFMON_V2_EVENTSEL_UMASK_NB)
+
+#define AMD64_PERFMON_V2_ENABLE_UMC BIT_ULL(31)
+#define AMD64_PERFMON_V2_EVENTSEL_EVENT_UMC GENMASK_ULL(7, 0)
+#define AMD64_PERFMON_V2_EVENTSEL_RDWRMASK_UMC GENMASK_ULL(9, 8)
+#define AMD64_PERFMON_V2_RAW_EVENT_MASK_UMC \
+ (AMD64_PERFMON_V2_EVENTSEL_EVENT_UMC | \
+ AMD64_PERFMON_V2_EVENTSEL_RDWRMASK_UMC)
+
+#define AMD64_NUM_COUNTERS 4
+#define AMD64_NUM_COUNTERS_CORE 6
+#define AMD64_NUM_COUNTERS_NB 4
+
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
+ (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
+
+#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6
+#define ARCH_PERFMON_EVENTS_COUNT 7
+
+#define PEBS_DATACFG_MEMINFO BIT_ULL(0)
+#define PEBS_DATACFG_GP BIT_ULL(1)
+#define PEBS_DATACFG_XMMS BIT_ULL(2)
+#define PEBS_DATACFG_LBRS BIT_ULL(3)
+#define PEBS_DATACFG_CNTR BIT_ULL(4)
+#define PEBS_DATACFG_METRICS BIT_ULL(5)
+#define PEBS_DATACFG_LBR_SHIFT 24
+#define PEBS_DATACFG_CNTR_SHIFT 32
+#define PEBS_DATACFG_CNTR_MASK GENMASK_ULL(15, 0)
+#define PEBS_DATACFG_FIX_SHIFT 48
+#define PEBS_DATACFG_FIX_MASK GENMASK_ULL(7, 0)
+
+/* Steal the highest bit of pebs_data_cfg for SW usage */
+#define PEBS_UPDATE_DS_SW BIT_ULL(63)
+
+/*
+ * Intel "Architectural Performance Monitoring" CPUID
+ * detection/enumeration details:
+ */
+union cpuid10_eax {
+ struct {
+ unsigned int version_id:8;
+ unsigned int num_counters:8;
+ unsigned int bit_width:8;
+ unsigned int mask_length:8;
+ } split;
+ unsigned int full;
+};
+
+union cpuid10_ebx {
+ struct {
+ unsigned int no_unhalted_core_cycles:1;
+ unsigned int no_instructions_retired:1;
+ unsigned int no_unhalted_reference_cycles:1;
+ unsigned int no_llc_reference:1;
+ unsigned int no_llc_misses:1;
+ unsigned int no_branch_instruction_retired:1;
+ unsigned int no_branch_misses_retired:1;
+ } split;
+ unsigned int full;
+};
+
+union cpuid10_edx {
+ struct {
+ unsigned int num_counters_fixed:5;
+ unsigned int bit_width_fixed:8;
+ unsigned int reserved1:2;
+ unsigned int anythread_deprecated:1;
+ unsigned int reserved2:16;
+ } split;
+ unsigned int full;
+};
+
+/*
+ * Intel "Architectural Performance Monitoring extension" CPUID
+ * detection/enumeration details:
+ */
+#define ARCH_PERFMON_EXT_LEAF 0x00000023
+#define ARCH_PERFMON_NUM_COUNTER_LEAF 0x1
+#define ARCH_PERFMON_ACR_LEAF 0x2
+#define ARCH_PERFMON_PEBS_CAP_LEAF 0x4
+#define ARCH_PERFMON_PEBS_COUNTER_LEAF 0x5
+
+union cpuid35_eax {
+ struct {
+ unsigned int leaf0:1;
+ /* Counters Sub-Leaf */
+ unsigned int cntr_subleaf:1;
+ /* Auto Counter Reload Sub-Leaf */
+ unsigned int acr_subleaf:1;
+ /* Events Sub-Leaf */
+ unsigned int events_subleaf:1;
+ /* arch-PEBS Sub-Leaves */
+ unsigned int pebs_caps_subleaf:1;
+ unsigned int pebs_cnts_subleaf:1;
+ unsigned int reserved:26;
+ } split;
+ unsigned int full;
+};
+
+union cpuid35_ebx {
+ struct {
+ /* UnitMask2 Supported */
+ unsigned int umask2:1;
+ /* EQ-bit Supported */
+ unsigned int eq:1;
+ unsigned int reserved:30;
+ } split;
+ unsigned int full;
+};
+
+/*
+ * Intel Architectural LBR CPUID detection/enumeration details:
+ */
+union cpuid28_eax {
+ struct {
+ /* Supported LBR depth values */
+ unsigned int lbr_depth_mask:8;
+ unsigned int reserved:22;
+ /* Deep C-state Reset */
+ unsigned int lbr_deep_c_reset:1;
+ /* IP values contain LIP */
+ unsigned int lbr_lip:1;
+ } split;
+ unsigned int full;
+};
+
+union cpuid28_ebx {
+ struct {
+ /* CPL Filtering Supported */
+ unsigned int lbr_cpl:1;
+ /* Branch Filtering Supported */
+ unsigned int lbr_filter:1;
+ /* Call-stack Mode Supported */
+ unsigned int lbr_call_stack:1;
+ } split;
+ unsigned int full;
+};
+
+union cpuid28_ecx {
+ struct {
+ /* Mispredict Bit Supported */
+ unsigned int lbr_mispred:1;
+ /* Timed LBRs Supported */
+ unsigned int lbr_timed_lbr:1;
+ /* Branch Type Field Supported */
+ unsigned int lbr_br_type:1;
+ unsigned int reserved:13;
+ /* Branch counters (Event Logging) Supported */
+ unsigned int lbr_counters:4;
+ } split;
+ unsigned int full;
+};
+
+/*
+ * AMD "Extended Performance Monitoring and Debug" CPUID
+ * detection/enumeration details:
+ */
+union cpuid_0x80000022_ebx {
+ struct {
+ /* Number of Core Performance Counters */
+ unsigned int num_core_pmc:4;
+ /* Number of available LBR Stack Entries */
+ unsigned int lbr_v2_stack_sz:6;
+ /* Number of Data Fabric Counters */
+ unsigned int num_df_pmc:6;
+ /* Number of Unified Memory Controller Counters */
+ unsigned int num_umc_pmc:6;
+ } split;
+ unsigned int full;
+};
+
+struct x86_pmu_capability {
+ int version;
+ int num_counters_gp;
+ int num_counters_fixed;
+ int bit_width_gp;
+ int bit_width_fixed;
+ unsigned int events_mask;
+ int events_mask_len;
+ unsigned int pebs_ept :1;
+};
+
+/*
+ * Fixed-purpose performance events:
+ */
+
+/* RDPMC offset for Fixed PMCs */
+#define INTEL_PMC_FIXED_RDPMC_BASE (1 << 30)
+#define INTEL_PMC_FIXED_RDPMC_METRICS (1 << 29)
+
+/*
+ * All the fixed-mode PMCs are configured via this single MSR:
+ */
+#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d
+
+/*
+ * There is no event-code assigned to the fixed-mode PMCs.
+ *
+ * For a fixed-mode PMC that has an equivalent event on a general-purpose
+ * PMC, the event-code of the equivalent event is used for the fixed-mode PMC,
+ * e.g., Instr_Retired.Any and CPU_CLK_Unhalted.Core.
+ *
+ * For a fixed-mode PMC that doesn't have an equivalent event, a
+ * pseudo-encoding is used, e.g., CPU_CLK_Unhalted.Ref and TOPDOWN.SLOTS.
+ * The pseudo event-code for a fixed-mode PMC must be 0x00.
+ * The pseudo umask-code is 0xX, where X equals the index of the fixed
+ * counter + 1, e.g., fixed counter 2 has the pseudo-encoding 0x0300.
+ *
+ * The counts are available in separate MSRs:
+ */
+
+/* Instr_Retired.Any: */
+#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309
+#define INTEL_PMC_IDX_FIXED_INSTRUCTIONS (INTEL_PMC_IDX_FIXED + 0)
+
+/* CPU_CLK_Unhalted.Core: */
+#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a
+#define INTEL_PMC_IDX_FIXED_CPU_CYCLES (INTEL_PMC_IDX_FIXED + 1)
+
+/* CPU_CLK_Unhalted.Ref: event=0x00,umask=0x3 (pseudo-encoding) */
+#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b
+#define INTEL_PMC_IDX_FIXED_REF_CYCLES (INTEL_PMC_IDX_FIXED + 2)
+#define INTEL_PMC_MSK_FIXED_REF_CYCLES (1ULL << INTEL_PMC_IDX_FIXED_REF_CYCLES)
+
+/* TOPDOWN.SLOTS: event=0x00,umask=0x4 (pseudo-encoding) */
+#define MSR_ARCH_PERFMON_FIXED_CTR3 0x30c
+#define INTEL_PMC_IDX_FIXED_SLOTS (INTEL_PMC_IDX_FIXED + 3)
+#define INTEL_PMC_MSK_FIXED_SLOTS (1ULL << INTEL_PMC_IDX_FIXED_SLOTS)
+
+/* TOPDOWN_BAD_SPECULATION.ALL: fixed counter 4 (Atom only) */
+/* TOPDOWN_FE_BOUND.ALL: fixed counter 5 (Atom only) */
+/* TOPDOWN_RETIRING.ALL: fixed counter 6 (Atom only) */
+
+static inline bool use_fixed_pseudo_encoding(u64 code)
+{
+ return !(code & 0xff);
+}
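+
+/*
+ * Illustrative sketch of the pseudo-encoding rule above, using only
+ * values already defined in this file:
+ *
+ *   use_fixed_pseudo_encoding(0x0300) is true:  event 0x00, umask 0x3,
+ *                                               i.e. fixed counter 2
+ *                                               (CPU_CLK_Unhalted.Ref)
+ *   use_fixed_pseudo_encoding(0x003c) is false: event-code 0x3c is a
+ *                                               general-purpose encoding
+ */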
+
+/*
+ * We model BTS tracing as another fixed-mode PMC.
+ *
+ * We choose the value 47 for the fixed index of BTS, since lower
+ * values are used by actual fixed events and higher values are used
+ * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr.
+ */
+#define INTEL_PMC_IDX_FIXED_BTS (INTEL_PMC_IDX_FIXED + 15)
+
+/*
+ * The PERF_METRICS MSR is modeled as several magic fixed-mode PMCs, one for
+ * each TopDown metric event.
+ *
+ * Internally the TopDown metric events are mapped to the FxCtr 3 (SLOTS).
+ */
+#define INTEL_PMC_IDX_METRIC_BASE (INTEL_PMC_IDX_FIXED + 16)
+#define INTEL_PMC_IDX_TD_RETIRING (INTEL_PMC_IDX_METRIC_BASE + 0)
+#define INTEL_PMC_IDX_TD_BAD_SPEC (INTEL_PMC_IDX_METRIC_BASE + 1)
+#define INTEL_PMC_IDX_TD_FE_BOUND (INTEL_PMC_IDX_METRIC_BASE + 2)
+#define INTEL_PMC_IDX_TD_BE_BOUND (INTEL_PMC_IDX_METRIC_BASE + 3)
+#define INTEL_PMC_IDX_TD_HEAVY_OPS (INTEL_PMC_IDX_METRIC_BASE + 4)
+#define INTEL_PMC_IDX_TD_BR_MISPREDICT (INTEL_PMC_IDX_METRIC_BASE + 5)
+#define INTEL_PMC_IDX_TD_FETCH_LAT (INTEL_PMC_IDX_METRIC_BASE + 6)
+#define INTEL_PMC_IDX_TD_MEM_BOUND (INTEL_PMC_IDX_METRIC_BASE + 7)
+#define INTEL_PMC_IDX_METRIC_END INTEL_PMC_IDX_TD_MEM_BOUND
+#define INTEL_PMC_MSK_TOPDOWN ((0xffull << INTEL_PMC_IDX_METRIC_BASE) | \
+ INTEL_PMC_MSK_FIXED_SLOTS)
+
+/*
+ * There is no event-code assigned to the TopDown events.
+ *
+ * For the slots event, use the pseudo code of the fixed counter 3.
+ *
+ * For the metric events, the pseudo event-code is 0x00.
+ * The pseudo umask-code starts from the middle of the pseudo event
+ * space, 0x80.
+ */
+#define INTEL_TD_SLOTS 0x0400 /* TOPDOWN.SLOTS */
+/* Level 1 metrics */
+#define INTEL_TD_METRIC_RETIRING 0x8000 /* Retiring metric */
+#define INTEL_TD_METRIC_BAD_SPEC 0x8100 /* Bad speculation metric */
+#define INTEL_TD_METRIC_FE_BOUND 0x8200 /* FE bound metric */
+#define INTEL_TD_METRIC_BE_BOUND 0x8300 /* BE bound metric */
+/* Level 2 metrics */
+#define INTEL_TD_METRIC_HEAVY_OPS 0x8400 /* Heavy Operations metric */
+#define INTEL_TD_METRIC_BR_MISPREDICT 0x8500 /* Branch Mispredict metric */
+#define INTEL_TD_METRIC_FETCH_LAT 0x8600 /* Fetch Latency metric */
+#define INTEL_TD_METRIC_MEM_BOUND 0x8700 /* Memory bound metric */
+
+#define INTEL_TD_METRIC_MAX INTEL_TD_METRIC_MEM_BOUND
+#define INTEL_TD_METRIC_NUM 8
+
+#define INTEL_TD_CFG_METRIC_CLEAR_BIT 0
+#define INTEL_TD_CFG_METRIC_CLEAR BIT_ULL(INTEL_TD_CFG_METRIC_CLEAR_BIT)
+
+static inline bool is_metric_idx(int idx)
+{
+ return (unsigned)(idx - INTEL_PMC_IDX_METRIC_BASE) < INTEL_TD_METRIC_NUM;
+}
+
+static inline bool is_topdown_idx(int idx)
+{
+ return is_metric_idx(idx) || idx == INTEL_PMC_IDX_FIXED_SLOTS;
+}
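+
+/*
+ * Illustrative sketch, derived from the defines above: with
+ * INTEL_PMC_IDX_FIXED == 32, the metric indices span 48..55 and the
+ * SLOTS fixed counter sits at index 35, so:
+ *
+ *   is_metric_idx(INTEL_PMC_IDX_TD_RETIRING)  - true  (idx 48)
+ *   is_metric_idx(INTEL_PMC_IDX_FIXED_SLOTS)  - false (idx 35)
+ *   is_topdown_idx(INTEL_PMC_IDX_FIXED_SLOTS) - true  (SLOTS is TopDown too)
+ */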
+
+#define INTEL_PMC_OTHER_TOPDOWN_BITS(bit) \
+ (~(0x1ull << bit) & INTEL_PMC_MSK_TOPDOWN)
+
+#define GLOBAL_STATUS_COND_CHG BIT_ULL(63)
+#define GLOBAL_STATUS_BUFFER_OVF_BIT 62
+#define GLOBAL_STATUS_BUFFER_OVF BIT_ULL(GLOBAL_STATUS_BUFFER_OVF_BIT)
+#define GLOBAL_STATUS_UNC_OVF BIT_ULL(61)
+#define GLOBAL_STATUS_ASIF BIT_ULL(60)
+#define GLOBAL_STATUS_COUNTERS_FROZEN BIT_ULL(59)
+#define GLOBAL_STATUS_LBRS_FROZEN_BIT 58
+#define GLOBAL_STATUS_LBRS_FROZEN BIT_ULL(GLOBAL_STATUS_LBRS_FROZEN_BIT)
+#define GLOBAL_STATUS_TRACE_TOPAPMI_BIT 55
+#define GLOBAL_STATUS_TRACE_TOPAPMI BIT_ULL(GLOBAL_STATUS_TRACE_TOPAPMI_BIT)
+#define GLOBAL_STATUS_ARCH_PEBS_THRESHOLD_BIT 54
+#define GLOBAL_STATUS_ARCH_PEBS_THRESHOLD BIT_ULL(GLOBAL_STATUS_ARCH_PEBS_THRESHOLD_BIT)
+#define GLOBAL_STATUS_PERF_METRICS_OVF_BIT 48
+
+#define GLOBAL_CTRL_EN_PERF_METRICS BIT_ULL(48)
+/*
+ * We model guest LBR event tracing as another fixed-mode PMC like BTS.
+ *
+ * We choose bit 58 because it's used to indicate the LBR stack frozen state
+ * for architectural perfmon v4, and we unconditionally mask that bit in
+ * handle_pmi_common(), so it'll never be set in the overflow handling.
+ *
+ * With this fake counter assigned, the guest LBR event user (such as KVM)
+ * can program the LBR registers on its own, and we don't actually do anything
+ * with them in the host context.
+ */
+#define INTEL_PMC_IDX_FIXED_VLBR (GLOBAL_STATUS_LBRS_FROZEN_BIT)
+
+/*
+ * Pseudo-encoding the guest LBR event as event=0x00,umask=0x1b,
+ * since it would claim bit 58 which is effectively Fixed26.
+ */
+#define INTEL_FIXED_VLBR_EVENT 0x1b00
+
+/*
+ * Adaptive PEBS v4
+ */
+
+struct pebs_basic {
+ u64 format_group:32,
+ retire_latency:16,
+ format_size:16;
+ u64 ip;
+ u64 applicable_counters;
+ u64 tsc;
+};
+
+struct pebs_meminfo {
+ u64 address;
+ u64 aux;
+ union {
+ /* pre Alder Lake */
+ u64 mem_latency;
+ /* Alder Lake and later */
+ struct {
+ u64 instr_latency:16;
+ u64 pad2:16;
+ u64 cache_latency:16;
+ u64 pad3:16;
+ };
+ };
+ u64 tsx_tuning;
+};
+
+struct pebs_gprs {
+ u64 flags, ip, ax, cx, dx, bx, sp, bp, si, di;
+ u64 r8, r9, r10, r11, r12, r13, r14, r15;
+};
+
+struct pebs_xmm {
+ u64 xmm[16*2]; /* two entries for each register */
+};
+
+struct pebs_cntr_header {
+ u32 cntr;
+ u32 fixed;
+ u32 metrics;
+ u32 reserved;
+};
+
+#define INTEL_CNTR_METRICS 0x3
+
+/*
+ * Arch PEBS
+ */
+union arch_pebs_index {
+ struct {
+ u64 rsvd:4,
+ wr:23,
+ rsvd2:4,
+ full:1,
+ en:1,
+ rsvd3:3,
+ thresh:23,
+ rsvd4:5;
+ };
+ u64 whole;
+};
+
+struct arch_pebs_header {
+ union {
+ u64 format;
+ struct {
+ u64 size:16, /* Record size */
+ rsvd:14,
+ mode:1, /* 64BIT_MODE */
+ cont:1,
+ rsvd2:3,
+ cntr:5,
+ lbr:2,
+ rsvd3:7,
+ xmm:1,
+ ymmh:1,
+ rsvd4:2,
+ opmask:1,
+ zmmh:1,
+ h16zmm:1,
+ rsvd5:5,
+ gpr:1,
+ aux:1,
+ basic:1;
+ };
+ };
+ u64 rsvd6;
+};
+
+struct arch_pebs_basic {
+ u64 ip;
+ u64 applicable_counters;
+ u64 tsc;
+ u64 retire :16, /* Retire Latency */
+ valid :1,
+ rsvd :47;
+ u64 rsvd2;
+ u64 rsvd3;
+};
+
+struct arch_pebs_aux {
+ u64 address;
+ u64 rsvd;
+ u64 rsvd2;
+ u64 rsvd3;
+ u64 rsvd4;
+ u64 aux;
+ u64 instr_latency :16,
+ pad2 :16,
+ cache_latency :16,
+ pad3 :16;
+ u64 tsx_tuning;
+};
+
+struct arch_pebs_gprs {
+ u64 flags, ip, ax, cx, dx, bx, sp, bp, si, di;
+ u64 r8, r9, r10, r11, r12, r13, r14, r15, ssp;
+ u64 rsvd;
+};
+
+struct arch_pebs_xer_header {
+ u64 xstate;
+ u64 rsvd;
+};
+
+#define ARCH_PEBS_LBR_NAN 0x0
+#define ARCH_PEBS_LBR_NUM_8 0x1
+#define ARCH_PEBS_LBR_NUM_16 0x2
+#define ARCH_PEBS_LBR_NUM_VAR 0x3
+#define ARCH_PEBS_BASE_LBR_ENTRIES 8
+struct arch_pebs_lbr_header {
+ u64 rsvd;
+ u64 ctl;
+ u64 depth;
+ u64 ler_from;
+ u64 ler_to;
+ u64 ler_info;
+};
+
+struct arch_pebs_cntr_header {
+ u32 cntr;
+ u32 fixed;
+ u32 metrics;
+ u32 reserved;
+};
+
+/*
+ * AMD Extended Performance Monitoring and Debug cpuid feature detection
+ */
+#define EXT_PERFMON_DEBUG_FEATURES 0x80000022
+
+/*
+ * IBS cpuid feature detection
+ */
+
+#define IBS_CPUID_FEATURES 0x8000001b
+
+/*
+ * Same bit mask as for IBS cpuid feature flags (Fn8000_001B_EAX), but
+ * bit 0 is used to indicate the existence of IBS.
+ */
+#define IBS_CAPS_AVAIL (1U<<0)
+#define IBS_CAPS_FETCHSAM (1U<<1)
+#define IBS_CAPS_OPSAM (1U<<2)
+#define IBS_CAPS_RDWROPCNT (1U<<3)
+#define IBS_CAPS_OPCNT (1U<<4)
+#define IBS_CAPS_BRNTRGT (1U<<5)
+#define IBS_CAPS_OPCNTEXT (1U<<6)
+#define IBS_CAPS_RIPINVALIDCHK (1U<<7)
+#define IBS_CAPS_OPBRNFUSE (1U<<8)
+#define IBS_CAPS_FETCHCTLEXTD (1U<<9)
+#define IBS_CAPS_OPDATA4 (1U<<10)
+#define IBS_CAPS_ZEN4 (1U<<11)
+#define IBS_CAPS_OPLDLAT (1U<<12)
+#define IBS_CAPS_OPDTLBPGSIZE (1U<<19)
+
+#define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \
+ | IBS_CAPS_FETCHSAM \
+ | IBS_CAPS_OPSAM)
+
+/*
+ * IBS APIC setup
+ */
+#define IBSCTL 0x1cc
+#define IBSCTL_LVT_OFFSET_VALID (1ULL<<8)
+#define IBSCTL_LVT_OFFSET_MASK 0x0F
+
+/* IBS fetch bits/masks */
+#define IBS_FETCH_L3MISSONLY (1ULL<<59)
+#define IBS_FETCH_RAND_EN (1ULL<<57)
+#define IBS_FETCH_VAL (1ULL<<49)
+#define IBS_FETCH_ENABLE (1ULL<<48)
+#define IBS_FETCH_CNT 0xFFFF0000ULL
+#define IBS_FETCH_MAX_CNT 0x0000FFFFULL
+
+/*
+ * IBS op bits/masks
+ * The lower 7 bits of the current count are random bits
+ * preloaded by hardware and ignored in software
+ */
+#define IBS_OP_LDLAT_EN (1ULL<<63)
+#define IBS_OP_LDLAT_THRSH (0xFULL<<59)
+#define IBS_OP_CUR_CNT (0xFFF80ULL<<32)
+#define IBS_OP_CUR_CNT_RAND (0x0007FULL<<32)
+#define IBS_OP_CUR_CNT_EXT_MASK (0x7FULL<<52)
+#define IBS_OP_CNT_CTL (1ULL<<19)
+#define IBS_OP_VAL (1ULL<<18)
+#define IBS_OP_ENABLE (1ULL<<17)
+#define IBS_OP_L3MISSONLY (1ULL<<16)
+#define IBS_OP_MAX_CNT 0x0000FFFFULL
+#define IBS_OP_MAX_CNT_EXT 0x007FFFFFULL /* not a register bit mask */
+#define IBS_OP_MAX_CNT_EXT_MASK (0x7FULL<<20) /* separate upper 7 bits */
+#define IBS_RIP_INVALID (1ULL<<38)
+
+#ifdef CONFIG_X86_LOCAL_APIC
+extern u32 get_ibs_caps(void);
+extern int forward_event_to_ibs(struct perf_event *event);
+#else
+static inline u32 get_ibs_caps(void) { return 0; }
+static inline int forward_event_to_ibs(struct perf_event *event) { return -ENOENT; }
+#endif
+
+#ifdef CONFIG_PERF_EVENTS
+extern void perf_events_lapic_init(void);
+
+/*
+ * Abuse bits {3,5} of the cpu eflags register. These flags are otherwise
+ * unused and ABI specified to be 0, so nobody should care what we do with
+ * them.
+ *
+ * EXACT - the IP points to the exact instruction that triggered the
+ * event (HW bugs exempt).
+ * VM - original X86_VM_MASK; see set_linear_ip().
+ */
+#define PERF_EFLAGS_EXACT (1UL << 3)
+#define PERF_EFLAGS_VM (1UL << 5)
+
+struct pt_regs;
+struct x86_perf_regs {
+ struct pt_regs regs;
+ u64 *xmm_regs;
+};
+
+extern unsigned long perf_arch_instruction_pointer(struct pt_regs *regs);
+extern unsigned long perf_arch_misc_flags(struct pt_regs *regs);
+extern unsigned long perf_arch_guest_misc_flags(struct pt_regs *regs);
+#define perf_arch_misc_flags(regs) perf_arch_misc_flags(regs)
+#define perf_arch_guest_misc_flags(regs) perf_arch_guest_misc_flags(regs)
+
+#include <asm/stacktrace.h>
+
+/*
+ * We abuse bit 3 from flags to pass exact information, see
+ * perf_arch_misc_flags() and the comment with PERF_EFLAGS_EXACT.
+ */
+#define perf_arch_fetch_caller_regs(regs, __ip) { \
+ (regs)->ip = (__ip); \
+ (regs)->sp = (unsigned long)__builtin_frame_address(0); \
+ (regs)->cs = __KERNEL_CS; \
+ regs->flags = 0; \
+}
+
+struct perf_guest_switch_msr {
+ unsigned msr;
+ u64 host, guest;
+};
+
+struct x86_pmu_lbr {
+ unsigned int nr;
+ unsigned int from;
+ unsigned int to;
+ unsigned int info;
+ bool has_callstack;
+};
+
+extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap);
+extern u64 perf_get_hw_event_config(int hw_event);
+extern void perf_check_microcode(void);
+extern void perf_clear_dirty_counters(void);
+extern int x86_perf_rdpmc_index(struct perf_event *event);
+#else
+static inline void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
+{
+ memset(cap, 0, sizeof(*cap));
+}
+
+static inline u64 perf_get_hw_event_config(int hw_event)
+{
+ return 0;
+}
+
+static inline void perf_events_lapic_init(void) { }
+static inline void perf_check_microcode(void) { }
+#endif
+
+#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
+extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data);
+extern void x86_perf_get_lbr(struct x86_pmu_lbr *lbr);
+#else
+struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data);
+static inline void x86_perf_get_lbr(struct x86_pmu_lbr *lbr)
+{
+ memset(lbr, 0, sizeof(*lbr));
+}
+#endif
+
+#ifdef CONFIG_CPU_SUP_INTEL
+ extern void intel_pt_handle_vmx(int on);
+#else
+static inline void intel_pt_handle_vmx(int on)
+{
+
+}
+#endif
+
+#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
+ extern void amd_pmu_enable_virt(void);
+ extern void amd_pmu_disable_virt(void);
+
+#if defined(CONFIG_PERF_EVENTS_AMD_BRS)
+
+#define PERF_NEEDS_LOPWR_CB 1
+
+/*
+ * architectural low power callback impacts
+ * drivers/acpi/processor_idle.c
+ * drivers/acpi/acpi_pad.c
+ */
+extern void perf_amd_brs_lopwr_cb(bool lopwr_in);
+
+DECLARE_STATIC_CALL(perf_lopwr_cb, perf_amd_brs_lopwr_cb);
+
+static __always_inline void perf_lopwr_cb(bool lopwr_in)
+{
+ static_call_mod(perf_lopwr_cb)(lopwr_in);
+}
+
+#endif /* CONFIG_PERF_EVENTS_AMD_BRS */
+
+#else
+ static inline void amd_pmu_enable_virt(void) { }
+ static inline void amd_pmu_disable_virt(void) { }
+#endif
+
+#define arch_perf_out_copy_user copy_from_user_nmi
+
+#endif /* _ASM_X86_PERF_EVENT_H */
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
new file mode 100644
index 000000000000..d65e338b6a5f
--- /dev/null
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -0,0 +1,877 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Netburst Performance Events (P4, old Xeon)
+ */
+
+#ifndef PERF_EVENT_P4_H
+#define PERF_EVENT_P4_H
+
+#include <linux/cpu.h>
+#include <linux/bitops.h>
+
+/*
+ * NetBurst has performance MSRs shared between
+ * threads if HT is turned on, i.e. for both logical
+ * processors (note: on Atom with HT support, by contrast,
+ * the perf MSRs are not shared and every thread has its
+ * own set of perf MSRs)
+ */
+#define ARCH_P4_TOTAL_ESCR (46)
+#define ARCH_P4_RESERVED_ESCR (2) /* IQ_ESCR(0,1) not always present */
+#define ARCH_P4_MAX_ESCR (ARCH_P4_TOTAL_ESCR - ARCH_P4_RESERVED_ESCR)
+#define ARCH_P4_MAX_CCCR (18)
+
+#define ARCH_P4_CNTRVAL_BITS (40)
+#define ARCH_P4_CNTRVAL_MASK ((1ULL << ARCH_P4_CNTRVAL_BITS) - 1)
+#define ARCH_P4_UNFLAGGED_BIT ((1ULL) << (ARCH_P4_CNTRVAL_BITS - 1))
+
+#define P4_ESCR_EVENT_MASK 0x7e000000ULL
+#define P4_ESCR_EVENT_SHIFT 25
+#define P4_ESCR_EVENTMASK_MASK 0x01fffe00ULL
+#define P4_ESCR_EVENTMASK_SHIFT 9
+#define P4_ESCR_TAG_MASK 0x000001e0ULL
+#define P4_ESCR_TAG_SHIFT 5
+#define P4_ESCR_TAG_ENABLE 0x00000010ULL
+#define P4_ESCR_T0_OS 0x00000008ULL
+#define P4_ESCR_T0_USR 0x00000004ULL
+#define P4_ESCR_T1_OS 0x00000002ULL
+#define P4_ESCR_T1_USR 0x00000001ULL
+
+#define P4_ESCR_EVENT(v) ((v) << P4_ESCR_EVENT_SHIFT)
+#define P4_ESCR_EMASK(v) ((v) << P4_ESCR_EVENTMASK_SHIFT)
+#define P4_ESCR_TAG(v) ((v) << P4_ESCR_TAG_SHIFT)
+
+#define P4_CCCR_OVF 0x80000000ULL
+#define P4_CCCR_CASCADE 0x40000000ULL
+#define P4_CCCR_OVF_PMI_T0 0x04000000ULL
+#define P4_CCCR_OVF_PMI_T1 0x08000000ULL
+#define P4_CCCR_FORCE_OVF 0x02000000ULL
+#define P4_CCCR_EDGE 0x01000000ULL
+#define P4_CCCR_THRESHOLD_MASK 0x00f00000ULL
+#define P4_CCCR_THRESHOLD_SHIFT 20
+#define P4_CCCR_COMPLEMENT 0x00080000ULL
+#define P4_CCCR_COMPARE 0x00040000ULL
+#define P4_CCCR_ESCR_SELECT_MASK 0x0000e000ULL
+#define P4_CCCR_ESCR_SELECT_SHIFT 13
+#define P4_CCCR_ENABLE 0x00001000ULL
+#define P4_CCCR_THREAD_SINGLE 0x00010000ULL
+#define P4_CCCR_THREAD_BOTH 0x00020000ULL
+#define P4_CCCR_THREAD_ANY 0x00030000ULL
+#define P4_CCCR_RESERVED 0x00000fffULL
+
+#define P4_CCCR_THRESHOLD(v) ((v) << P4_CCCR_THRESHOLD_SHIFT)
+#define P4_CCCR_ESEL(v) ((v) << P4_CCCR_ESCR_SELECT_SHIFT)
+
+#define P4_GEN_ESCR_EMASK(class, name, bit) \
+ class##__##name = ((1ULL << bit) << P4_ESCR_EVENTMASK_SHIFT)
+#define P4_ESCR_EMASK_BIT(class, name) class##__##name
+
+/*
+ * The config field is 64 bits wide and consists of
+ * HT << 63 | ESCR << 32 | CCCR
+ * where HT is the HyperThreading bit (since ESCR
+ * has it reserved we may use it for our own purpose).
+ *
+ * Note that this is NOT the addresses of the respective
+ * ESCR and CCCR registers, but only a packed value that
+ * should be unpacked and written to the proper addresses.
+ *
+ * The basic idea is to pack as much info as possible.
+ */
+#define p4_config_pack_escr(v) (((u64)(v)) << 32)
+#define p4_config_pack_cccr(v) (((u64)(v)) & 0xffffffffULL)
+#define p4_config_unpack_escr(v) (((u64)(v)) >> 32)
+#define p4_config_unpack_cccr(v) (((u64)(v)) & 0xffffffffULL)
+
+#define p4_config_unpack_emask(v) \
+ ({ \
+ u32 t = p4_config_unpack_escr((v)); \
+ t = t & P4_ESCR_EVENTMASK_MASK; \
+ t = t >> P4_ESCR_EVENTMASK_SHIFT; \
+ t; \
+ })
+
+#define p4_config_unpack_event(v) \
+ ({ \
+ u32 t = p4_config_unpack_escr((v)); \
+ t = t & P4_ESCR_EVENT_MASK; \
+ t = t >> P4_ESCR_EVENT_SHIFT; \
+ t; \
+ })
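+
+/*
+ * Illustrative sketch of a pack/unpack round trip with the helpers above
+ * (the field values are made up for the example, not taken from any
+ * real event):
+ *
+ *	u64 escr   = P4_ESCR_EVENT(0x3) | P4_ESCR_EMASK(0x1) | P4_ESCR_T0_OS;
+ *	u64 cccr   = P4_CCCR_ESEL(0x4) | P4_CCCR_ENABLE;
+ *	u64 config = p4_config_pack_escr(escr) | p4_config_pack_cccr(cccr);
+ *
+ * then p4_config_unpack_event(config) == 0x3,
+ *      p4_config_unpack_emask(config) == 0x1 and
+ *      p4_config_unpack_cccr(config)  == cccr.
+ */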
+
+#define P4_CONFIG_HT_SHIFT 63
+#define P4_CONFIG_HT (1ULL << P4_CONFIG_HT_SHIFT)
+
+/*
+ * If an event has an alias it should be marked
+ * with a special bit. (Don't forget to check
+ * P4_PEBS_CONFIG_MASK and related bits on
+ * modification.)
+ */
+#define P4_CONFIG_ALIASABLE (1ULL << 9)
+
+/*
+ * The bits we allow to pass for RAW events
+ */
+#define P4_CONFIG_MASK_ESCR \
+ P4_ESCR_EVENT_MASK | \
+ P4_ESCR_EVENTMASK_MASK | \
+ P4_ESCR_TAG_MASK | \
+ P4_ESCR_TAG_ENABLE
+
+#define P4_CONFIG_MASK_CCCR \
+ P4_CCCR_EDGE | \
+ P4_CCCR_THRESHOLD_MASK | \
+ P4_CCCR_COMPLEMENT | \
+ P4_CCCR_COMPARE | \
+ P4_CCCR_THREAD_ANY | \
+ P4_CCCR_RESERVED
+
+/* some dangerous bits are reserved for kernel internals */
+#define P4_CONFIG_MASK \
+ (p4_config_pack_escr(P4_CONFIG_MASK_ESCR)) | \
+ (p4_config_pack_cccr(P4_CONFIG_MASK_CCCR))
+
+/*
+ * In case of event aliasing we need to preserve some
+ * caller bits, otherwise the mapping won't be complete.
+ */
+#define P4_CONFIG_EVENT_ALIAS_MASK \
+ (p4_config_pack_escr(P4_CONFIG_MASK_ESCR) | \
+ p4_config_pack_cccr(P4_CCCR_EDGE | \
+ P4_CCCR_THRESHOLD_MASK | \
+ P4_CCCR_COMPLEMENT | \
+ P4_CCCR_COMPARE))
+
+#define P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS \
+ ((P4_CONFIG_HT) | \
+ p4_config_pack_escr(P4_ESCR_T0_OS | \
+ P4_ESCR_T0_USR | \
+ P4_ESCR_T1_OS | \
+ P4_ESCR_T1_USR) | \
+ p4_config_pack_cccr(P4_CCCR_OVF | \
+ P4_CCCR_CASCADE | \
+ P4_CCCR_FORCE_OVF | \
+ P4_CCCR_THREAD_ANY | \
+ P4_CCCR_OVF_PMI_T0 | \
+ P4_CCCR_OVF_PMI_T1 | \
+ P4_CONFIG_ALIASABLE))
+
+static inline bool p4_is_event_cascaded(u64 config)
+{
+ u32 cccr = p4_config_unpack_cccr(config);
+ return !!(cccr & P4_CCCR_CASCADE);
+}
+
+static inline int p4_ht_config_thread(u64 config)
+{
+ return !!(config & P4_CONFIG_HT);
+}
+
+static inline u64 p4_set_ht_bit(u64 config)
+{
+ return config | P4_CONFIG_HT;
+}
+
+static inline u64 p4_clear_ht_bit(u64 config)
+{
+ return config & ~P4_CONFIG_HT;
+}
+
+static inline int p4_ht_active(void)
+{
+#ifdef CONFIG_SMP
+ return __max_threads_per_core > 1;
+#endif
+ return 0;
+}
+
+static inline int p4_ht_thread(int cpu)
+{
+#ifdef CONFIG_SMP
+ if (__max_threads_per_core == 2)
+ return cpu != cpumask_first(this_cpu_cpumask_var_ptr(cpu_sibling_map));
+#endif
+ return 0;
+}
+
+static inline int p4_should_swap_ts(u64 config, int cpu)
+{
+ return p4_ht_config_thread(config) ^ p4_ht_thread(cpu);
+}
+
+static inline u32 p4_default_cccr_conf(int cpu)
+{
+ /*
+ * Note that P4_CCCR_THREAD_ANY is "required" on
+ * non-HT machines (on HT machines we count TS events
+	 * regardless of the state of the second logical processor).
+ */
+ u32 cccr = P4_CCCR_THREAD_ANY;
+
+ if (!p4_ht_thread(cpu))
+ cccr |= P4_CCCR_OVF_PMI_T0;
+ else
+ cccr |= P4_CCCR_OVF_PMI_T1;
+
+ return cccr;
+}
+
+static inline u32 p4_default_escr_conf(int cpu, int exclude_os, int exclude_usr)
+{
+ u32 escr = 0;
+
+ if (!p4_ht_thread(cpu)) {
+ if (!exclude_os)
+ escr |= P4_ESCR_T0_OS;
+ if (!exclude_usr)
+ escr |= P4_ESCR_T0_USR;
+ } else {
+ if (!exclude_os)
+ escr |= P4_ESCR_T1_OS;
+ if (!exclude_usr)
+ escr |= P4_ESCR_T1_USR;
+ }
+
+ return escr;
+}
+
+/*
+ * These are the events which should be used in the "Event Select"
+ * field of the ESCR register; they are like unique keys which allow
+ * the kernel to determine which CCCR and COUNTER should be
+ * used to track an event.
+ */
+enum P4_EVENTS {
+ P4_EVENT_TC_DELIVER_MODE,
+ P4_EVENT_BPU_FETCH_REQUEST,
+ P4_EVENT_ITLB_REFERENCE,
+ P4_EVENT_MEMORY_CANCEL,
+ P4_EVENT_MEMORY_COMPLETE,
+ P4_EVENT_LOAD_PORT_REPLAY,
+ P4_EVENT_STORE_PORT_REPLAY,
+ P4_EVENT_MOB_LOAD_REPLAY,
+ P4_EVENT_PAGE_WALK_TYPE,
+ P4_EVENT_BSQ_CACHE_REFERENCE,
+ P4_EVENT_IOQ_ALLOCATION,
+ P4_EVENT_IOQ_ACTIVE_ENTRIES,
+ P4_EVENT_FSB_DATA_ACTIVITY,
+ P4_EVENT_BSQ_ALLOCATION,
+ P4_EVENT_BSQ_ACTIVE_ENTRIES,
+ P4_EVENT_SSE_INPUT_ASSIST,
+ P4_EVENT_PACKED_SP_UOP,
+ P4_EVENT_PACKED_DP_UOP,
+ P4_EVENT_SCALAR_SP_UOP,
+ P4_EVENT_SCALAR_DP_UOP,
+ P4_EVENT_64BIT_MMX_UOP,
+ P4_EVENT_128BIT_MMX_UOP,
+ P4_EVENT_X87_FP_UOP,
+ P4_EVENT_TC_MISC,
+ P4_EVENT_GLOBAL_POWER_EVENTS,
+ P4_EVENT_TC_MS_XFER,
+ P4_EVENT_UOP_QUEUE_WRITES,
+ P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE,
+ P4_EVENT_RETIRED_BRANCH_TYPE,
+ P4_EVENT_RESOURCE_STALL,
+ P4_EVENT_WC_BUFFER,
+ P4_EVENT_B2B_CYCLES,
+ P4_EVENT_BNR,
+ P4_EVENT_SNOOP,
+ P4_EVENT_RESPONSE,
+ P4_EVENT_FRONT_END_EVENT,
+ P4_EVENT_EXECUTION_EVENT,
+ P4_EVENT_REPLAY_EVENT,
+ P4_EVENT_INSTR_RETIRED,
+ P4_EVENT_UOPS_RETIRED,
+ P4_EVENT_UOP_TYPE,
+ P4_EVENT_BRANCH_RETIRED,
+ P4_EVENT_MISPRED_BRANCH_RETIRED,
+ P4_EVENT_X87_ASSIST,
+ P4_EVENT_MACHINE_CLEAR,
+ P4_EVENT_INSTR_COMPLETED,
+};
+
+#define P4_OPCODE(event) event##_OPCODE
+#define P4_OPCODE_ESEL(opcode) ((opcode & 0x00ff) >> 0)
+#define P4_OPCODE_EVNT(opcode) ((opcode & 0xff00) >> 8)
+#define P4_OPCODE_PACK(event, sel) (((event) << 8) | sel)
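+
+/*
+ * Illustrative sketch of the opcode packing above, using the
+ * ITLB_REFERENCE opcode defined further below in this file:
+ *
+ *   P4_OPCODE(P4_EVENT_ITLB_REFERENCE) == P4_OPCODE_PACK(0x18, 0x03) == 0x1803
+ *   P4_OPCODE_EVNT(0x1803)             == 0x18  (ESCR event select)
+ *   P4_OPCODE_ESEL(0x1803)             == 0x03  (CCCR ESCR-select)
+ */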
+
+/*
+ * The comments below each event describe the ESCR restrictions
+ * for that event and the counter indices per ESCR.
+ *
+ * MSR_P4_IQ_ESCR0 and MSR_P4_IQ_ESCR1 are available only on early
+ * processor builds (family 0FH, models 01H-02H). These MSRs
+ * are not available on later versions, so we don't use
+ * them at all.
+ *
+ * Also note that CCCR1 does not have the P4_CCCR_ENABLE bit working
+ * properly, so we should not use this CCCR or the respective
+ * counter as a result.
+ */
+enum P4_EVENT_OPCODES {
+ P4_OPCODE(P4_EVENT_TC_DELIVER_MODE) = P4_OPCODE_PACK(0x01, 0x01),
+ /*
+ * MSR_P4_TC_ESCR0: 4, 5
+ * MSR_P4_TC_ESCR1: 6, 7
+ */
+
+ P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST) = P4_OPCODE_PACK(0x03, 0x00),
+ /*
+ * MSR_P4_BPU_ESCR0: 0, 1
+ * MSR_P4_BPU_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_ITLB_REFERENCE) = P4_OPCODE_PACK(0x18, 0x03),
+ /*
+ * MSR_P4_ITLB_ESCR0: 0, 1
+ * MSR_P4_ITLB_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_MEMORY_CANCEL) = P4_OPCODE_PACK(0x02, 0x05),
+ /*
+ * MSR_P4_DAC_ESCR0: 8, 9
+ * MSR_P4_DAC_ESCR1: 10, 11
+ */
+
+ P4_OPCODE(P4_EVENT_MEMORY_COMPLETE) = P4_OPCODE_PACK(0x08, 0x02),
+ /*
+ * MSR_P4_SAAT_ESCR0: 8, 9
+ * MSR_P4_SAAT_ESCR1: 10, 11
+ */
+
+ P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY) = P4_OPCODE_PACK(0x04, 0x02),
+ /*
+ * MSR_P4_SAAT_ESCR0: 8, 9
+ * MSR_P4_SAAT_ESCR1: 10, 11
+ */
+
+ P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY) = P4_OPCODE_PACK(0x05, 0x02),
+ /*
+ * MSR_P4_SAAT_ESCR0: 8, 9
+ * MSR_P4_SAAT_ESCR1: 10, 11
+ */
+
+ P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY) = P4_OPCODE_PACK(0x03, 0x02),
+ /*
+ * MSR_P4_MOB_ESCR0: 0, 1
+ * MSR_P4_MOB_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE) = P4_OPCODE_PACK(0x01, 0x04),
+ /*
+ * MSR_P4_PMH_ESCR0: 0, 1
+ * MSR_P4_PMH_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE) = P4_OPCODE_PACK(0x0c, 0x07),
+ /*
+ * MSR_P4_BSU_ESCR0: 0, 1
+ * MSR_P4_BSU_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_IOQ_ALLOCATION) = P4_OPCODE_PACK(0x03, 0x06),
+ /*
+ * MSR_P4_FSB_ESCR0: 0, 1
+ * MSR_P4_FSB_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES) = P4_OPCODE_PACK(0x1a, 0x06),
+ /*
+ * MSR_P4_FSB_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY) = P4_OPCODE_PACK(0x17, 0x06),
+ /*
+ * MSR_P4_FSB_ESCR0: 0, 1
+ * MSR_P4_FSB_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_BSQ_ALLOCATION) = P4_OPCODE_PACK(0x05, 0x07),
+ /*
+ * MSR_P4_BSU_ESCR0: 0, 1
+ */
+
+ P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES) = P4_OPCODE_PACK(0x06, 0x07),
+ /*
+ * NOTE: no ESCR name in docs, it's guessed
+ * MSR_P4_BSU_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST) = P4_OPCODE_PACK(0x34, 0x01),
+ /*
+ * MSR_P4_FIRM_ESCR0: 8, 9
+ * MSR_P4_FIRM_ESCR1: 10, 11
+ */
+
+ P4_OPCODE(P4_EVENT_PACKED_SP_UOP) = P4_OPCODE_PACK(0x08, 0x01),
+ /*
+ * MSR_P4_FIRM_ESCR0: 8, 9
+ * MSR_P4_FIRM_ESCR1: 10, 11
+ */
+
+ P4_OPCODE(P4_EVENT_PACKED_DP_UOP) = P4_OPCODE_PACK(0x0c, 0x01),
+ /*
+ * MSR_P4_FIRM_ESCR0: 8, 9
+ * MSR_P4_FIRM_ESCR1: 10, 11
+ */
+
+ P4_OPCODE(P4_EVENT_SCALAR_SP_UOP) = P4_OPCODE_PACK(0x0a, 0x01),
+ /*
+ * MSR_P4_FIRM_ESCR0: 8, 9
+ * MSR_P4_FIRM_ESCR1: 10, 11
+ */
+
+ P4_OPCODE(P4_EVENT_SCALAR_DP_UOP) = P4_OPCODE_PACK(0x0e, 0x01),
+ /*
+ * MSR_P4_FIRM_ESCR0: 8, 9
+ * MSR_P4_FIRM_ESCR1: 10, 11
+ */
+
+ P4_OPCODE(P4_EVENT_64BIT_MMX_UOP) = P4_OPCODE_PACK(0x02, 0x01),
+ /*
+ * MSR_P4_FIRM_ESCR0: 8, 9
+ * MSR_P4_FIRM_ESCR1: 10, 11
+ */
+
+ P4_OPCODE(P4_EVENT_128BIT_MMX_UOP) = P4_OPCODE_PACK(0x1a, 0x01),
+ /*
+ * MSR_P4_FIRM_ESCR0: 8, 9
+ * MSR_P4_FIRM_ESCR1: 10, 11
+ */
+
+ P4_OPCODE(P4_EVENT_X87_FP_UOP) = P4_OPCODE_PACK(0x04, 0x01),
+ /*
+ * MSR_P4_FIRM_ESCR0: 8, 9
+ * MSR_P4_FIRM_ESCR1: 10, 11
+ */
+
+ P4_OPCODE(P4_EVENT_TC_MISC) = P4_OPCODE_PACK(0x06, 0x01),
+ /*
+ * MSR_P4_TC_ESCR0: 4, 5
+ * MSR_P4_TC_ESCR1: 6, 7
+ */
+
+ P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS) = P4_OPCODE_PACK(0x13, 0x06),
+ /*
+ * MSR_P4_FSB_ESCR0: 0, 1
+ * MSR_P4_FSB_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_TC_MS_XFER) = P4_OPCODE_PACK(0x05, 0x00),
+ /*
+ * MSR_P4_MS_ESCR0: 4, 5
+ * MSR_P4_MS_ESCR1: 6, 7
+ */
+
+ P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES) = P4_OPCODE_PACK(0x09, 0x00),
+ /*
+ * MSR_P4_MS_ESCR0: 4, 5
+ * MSR_P4_MS_ESCR1: 6, 7
+ */
+
+ P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE) = P4_OPCODE_PACK(0x05, 0x02),
+ /*
+ * MSR_P4_TBPU_ESCR0: 4, 5
+ * MSR_P4_TBPU_ESCR1: 6, 7
+ */
+
+ P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE) = P4_OPCODE_PACK(0x04, 0x02),
+ /*
+ * MSR_P4_TBPU_ESCR0: 4, 5
+ * MSR_P4_TBPU_ESCR1: 6, 7
+ */
+
+ P4_OPCODE(P4_EVENT_RESOURCE_STALL) = P4_OPCODE_PACK(0x01, 0x01),
+ /*
+ * MSR_P4_ALF_ESCR0: 12, 13, 16
+ * MSR_P4_ALF_ESCR1: 14, 15, 17
+ */
+
+ P4_OPCODE(P4_EVENT_WC_BUFFER) = P4_OPCODE_PACK(0x05, 0x05),
+ /*
+ * MSR_P4_DAC_ESCR0: 8, 9
+ * MSR_P4_DAC_ESCR1: 10, 11
+ */
+
+ P4_OPCODE(P4_EVENT_B2B_CYCLES) = P4_OPCODE_PACK(0x16, 0x03),
+ /*
+ * MSR_P4_FSB_ESCR0: 0, 1
+ * MSR_P4_FSB_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_BNR) = P4_OPCODE_PACK(0x08, 0x03),
+ /*
+ * MSR_P4_FSB_ESCR0: 0, 1
+ * MSR_P4_FSB_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_SNOOP) = P4_OPCODE_PACK(0x06, 0x03),
+ /*
+ * MSR_P4_FSB_ESCR0: 0, 1
+ * MSR_P4_FSB_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_RESPONSE) = P4_OPCODE_PACK(0x04, 0x03),
+ /*
+ * MSR_P4_FSB_ESCR0: 0, 1
+ * MSR_P4_FSB_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_FRONT_END_EVENT) = P4_OPCODE_PACK(0x08, 0x05),
+ /*
+ * MSR_P4_CRU_ESCR2: 12, 13, 16
+ * MSR_P4_CRU_ESCR3: 14, 15, 17
+ */
+
+ P4_OPCODE(P4_EVENT_EXECUTION_EVENT) = P4_OPCODE_PACK(0x0c, 0x05),
+ /*
+ * MSR_P4_CRU_ESCR2: 12, 13, 16
+ * MSR_P4_CRU_ESCR3: 14, 15, 17
+ */
+
+ P4_OPCODE(P4_EVENT_REPLAY_EVENT) = P4_OPCODE_PACK(0x09, 0x05),
+ /*
+ * MSR_P4_CRU_ESCR2: 12, 13, 16
+ * MSR_P4_CRU_ESCR3: 14, 15, 17
+ */
+
+ P4_OPCODE(P4_EVENT_INSTR_RETIRED) = P4_OPCODE_PACK(0x02, 0x04),
+ /*
+ * MSR_P4_CRU_ESCR0: 12, 13, 16
+ * MSR_P4_CRU_ESCR1: 14, 15, 17
+ */
+
+ P4_OPCODE(P4_EVENT_UOPS_RETIRED) = P4_OPCODE_PACK(0x01, 0x04),
+ /*
+ * MSR_P4_CRU_ESCR0: 12, 13, 16
+ * MSR_P4_CRU_ESCR1: 14, 15, 17
+ */
+
+ P4_OPCODE(P4_EVENT_UOP_TYPE) = P4_OPCODE_PACK(0x02, 0x02),
+ /*
+ * MSR_P4_RAT_ESCR0: 12, 13, 16
+ * MSR_P4_RAT_ESCR1: 14, 15, 17
+ */
+
+ P4_OPCODE(P4_EVENT_BRANCH_RETIRED) = P4_OPCODE_PACK(0x06, 0x05),
+ /*
+ * MSR_P4_CRU_ESCR2: 12, 13, 16
+ * MSR_P4_CRU_ESCR3: 14, 15, 17
+ */
+
+ P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED) = P4_OPCODE_PACK(0x03, 0x04),
+ /*
+ * MSR_P4_CRU_ESCR0: 12, 13, 16
+ * MSR_P4_CRU_ESCR1: 14, 15, 17
+ */
+
+ P4_OPCODE(P4_EVENT_X87_ASSIST) = P4_OPCODE_PACK(0x03, 0x05),
+ /*
+ * MSR_P4_CRU_ESCR2: 12, 13, 16
+ * MSR_P4_CRU_ESCR3: 14, 15, 17
+ */
+
+ P4_OPCODE(P4_EVENT_MACHINE_CLEAR) = P4_OPCODE_PACK(0x02, 0x05),
+ /*
+ * MSR_P4_CRU_ESCR2: 12, 13, 16
+ * MSR_P4_CRU_ESCR3: 14, 15, 17
+ */
+
+ P4_OPCODE(P4_EVENT_INSTR_COMPLETED) = P4_OPCODE_PACK(0x07, 0x04),
+ /*
+ * MSR_P4_CRU_ESCR0: 12, 13, 16
+ * MSR_P4_CRU_ESCR1: 14, 15, 17
+ */
+};
+
+/*
+ * A caller should use the P4_ESCR_EMASK_BIT() helper to
+ * pick the EventMask needed, for example:
+ *
+ * P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD)
+ */
+enum P4_ESCR_EMASKS {
+ P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, DD, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, DB, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, DI, 2),
+ P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, BD, 3),
+ P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, BB, 4),
+ P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, BI, 5),
+ P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, ID, 6),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_BPU_FETCH_REQUEST, TCMISS, 0),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_ITLB_REFERENCE, HIT, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_ITLB_REFERENCE, MISS, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_ITLB_REFERENCE, HIT_UK, 2),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_MEMORY_CANCEL, ST_RB_FULL, 2),
+ P4_GEN_ESCR_EMASK(P4_EVENT_MEMORY_CANCEL, 64K_CONF, 3),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_MEMORY_COMPLETE, LSC, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_MEMORY_COMPLETE, SSC, 1),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_LOAD_PORT_REPLAY, SPLIT_LD, 1),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_STORE_PORT_REPLAY, SPLIT_ST, 1),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_MOB_LOAD_REPLAY, NO_STA, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_MOB_LOAD_REPLAY, NO_STD, 3),
+ P4_GEN_ESCR_EMASK(P4_EVENT_MOB_LOAD_REPLAY, PARTIAL_DATA, 4),
+ P4_GEN_ESCR_EMASK(P4_EVENT_MOB_LOAD_REPLAY, UNALGN_ADDR, 5),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_PAGE_WALK_TYPE, DTMISS, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_PAGE_WALK_TYPE, ITMISS, 1),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM, 2),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS, 3),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE, 4),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM, 5),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS, 8),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS, 9),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS, 10),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, DEFAULT, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, ALL_READ, 5),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, ALL_WRITE, 6),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_UC, 7),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_WC, 8),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_WT, 9),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_WP, 10),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_WB, 11),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, OWN, 13),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, OTHER, 14),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, PREFETCH, 15),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, DEFAULT, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_READ, 5),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_WRITE, 6),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_UC, 7),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WC, 8),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WT, 9),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WP, 10),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WB, 11),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, OWN, 13),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, OTHER, 14),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, PREFETCH, 15),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OTHER, 2),
+ P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_DRV, 3),
+ P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OWN, 4),
+ P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OTHER, 5),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE0, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE1, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_LEN0, 2),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_LEN1, 3),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_IO_TYPE, 5),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_LOCK_TYPE, 6),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_CACHE_TYPE, 7),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_SPLIT_TYPE, 8),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_DEM_TYPE, 9),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_ORD_TYPE, 10),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE0, 11),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE1, 12),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE2, 13),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE0, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE1, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN0, 2),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN1, 3),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE, 5),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE, 6),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE, 7),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE, 8),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE, 9),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE, 10),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE0, 11),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE1, 12),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE2, 13),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_SSE_INPUT_ASSIST, ALL, 15),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_PACKED_SP_UOP, ALL, 15),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_PACKED_DP_UOP, ALL, 15),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_SCALAR_SP_UOP, ALL, 15),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_SCALAR_DP_UOP, ALL, 15),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_64BIT_MMX_UOP, ALL, 15),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_128BIT_MMX_UOP, ALL, 15),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_X87_FP_UOP, ALL, 15),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_TC_MISC, FLUSH, 4),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING, 0),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_TC_MS_XFER, CISC, 0),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_BUILD, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_DELIVER, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_UOP_QUEUE_WRITES, FROM_ROM, 2),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CALL, 2),
+ P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, RETURN, 3),
+ P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT, 4),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_BRANCH_TYPE, CALL, 2),
+ P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN, 3),
+ P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT, 4),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_RESOURCE_STALL, SBFULL, 5),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_WC_BUFFER, WCB_EVICTS, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_WC_BUFFER, WCB_FULL_EVICTS, 1),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_FRONT_END_EVENT, NBOGUS, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_FRONT_END_EVENT, BOGUS, 1),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, NBOGUS0, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, NBOGUS1, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, NBOGUS2, 2),
+ P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, NBOGUS3, 3),
+ P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, BOGUS0, 4),
+ P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, BOGUS1, 5),
+ P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, BOGUS2, 6),
+ P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, BOGUS3, 7),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_REPLAY_EVENT, NBOGUS, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_REPLAY_EVENT, BOGUS, 1),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_RETIRED, NBOGUSTAG, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_RETIRED, BOGUSNTAG, 2),
+ P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_RETIRED, BOGUSTAG, 3),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_UOPS_RETIRED, NBOGUS, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_UOPS_RETIRED, BOGUS, 1),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_UOP_TYPE, TAGLOADS, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_UOP_TYPE, TAGSTORES, 2),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_BRANCH_RETIRED, MMNP, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BRANCH_RETIRED, MMNM, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BRANCH_RETIRED, MMTP, 2),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BRANCH_RETIRED, MMTM, 3),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS, 0),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, FPSU, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, FPSO, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, POAO, 2),
+ P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, POAU, 3),
+ P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, PREA, 4),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_MACHINE_CLEAR, CLEAR, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_MACHINE_CLEAR, MOCLEAR, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_MACHINE_CLEAR, SMCLEAR, 2),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_COMPLETED, NBOGUS, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_COMPLETED, BOGUS, 1),
+};
+
+/*
+ * Note we have the UOP and PEBS bits reserved for now,
+ * just in case we need them later.
+ */
+#define P4_PEBS_CONFIG_ENABLE (1ULL << 7)
+#define P4_PEBS_CONFIG_UOP_TAG (1ULL << 8)
+#define P4_PEBS_CONFIG_METRIC_MASK 0x3FLL
+#define P4_PEBS_CONFIG_MASK 0xFFLL
+
+/*
+ * Note: only counters MSR_IQ_COUNTER4 (16) and
+ * MSR_IQ_COUNTER5 (17) are allowed for PEBS sampling
+ */
+#define P4_PEBS_ENABLE 0x02000000ULL
+#define P4_PEBS_ENABLE_UOP_TAG 0x01000000ULL
+
+#define p4_config_unpack_metric(v) (((u64)(v)) & P4_PEBS_CONFIG_METRIC_MASK)
+#define p4_config_unpack_pebs(v) (((u64)(v)) & P4_PEBS_CONFIG_MASK)
+
+#define p4_config_pebs_has(v, mask) (p4_config_unpack_pebs(v) & (mask))
+
+enum P4_PEBS_METRIC {
+ P4_PEBS_METRIC__none,
+
+ P4_PEBS_METRIC__1stl_cache_load_miss_retired,
+ P4_PEBS_METRIC__2ndl_cache_load_miss_retired,
+ P4_PEBS_METRIC__dtlb_load_miss_retired,
+ P4_PEBS_METRIC__dtlb_store_miss_retired,
+ P4_PEBS_METRIC__dtlb_all_miss_retired,
+ P4_PEBS_METRIC__tagged_mispred_branch,
+ P4_PEBS_METRIC__mob_load_replay_retired,
+ P4_PEBS_METRIC__split_load_retired,
+ P4_PEBS_METRIC__split_store_retired,
+
+ P4_PEBS_METRIC__max
+};
+
+/*
+ * Notes on internal configuration of ESCR+CCCR tuples
+ *
+ * Since the P4 architecture of performance registers is quite
+ * different from the "architectural" ones, and we have only 64 bits
+ * to keep the configuration of a performance event, the following
+ * trick is used.
+ *
+ * 1) Since only the low 32 bits of both the ESCR and CCCR registers
+ *    are meaningful, we pack them into a single 64 bit
+ *    configuration. The low 32 bits of such a config correspond
+ *    to the low 32 bits of the CCCR register and the high 32 bits
+ *    correspond to the low 32 bits of the ESCR register.
+ *
+ * 2) The meaning of every bit of such a config field can
+ *    be found in the Intel SDM, but it should be noted that
+ *    we "borrow" some reserved bits for our own usage and
+ *    clear them or set them to a proper value when we do
+ *    a real write to the hardware registers.
+ *
+ * 3) The config bits have the following format; each field
+ *    should be either 0 or set to one of the predefined
+ *    values:
+ * Low 32 bits
+ * -----------
+ * 0-6: P4_PEBS_METRIC enum
+ * 7-11: reserved
+ * 12: reserved (Enable)
+ * 13-15: reserved (ESCR select)
+ * 16-17: Active Thread
+ * 18: Compare
+ * 19: Complement
+ * 20-23: Threshold
+ * 24: Edge
+ * 25: reserved (FORCE_OVF)
+ * 26: reserved (OVF_PMI_T0)
+ * 27: reserved (OVF_PMI_T1)
+ * 28-29: reserved
+ * 30: reserved (Cascade)
+ * 31: reserved (OVF)
+ *
+ * High 32 bits
+ * ------------
+ * 0: reserved (T1_USR)
+ * 1: reserved (T1_OS)
+ * 2: reserved (T0_USR)
+ * 3: reserved (T0_OS)
+ * 4: Tag Enable
+ * 5-8: Tag Value
+ * 9-24: Event Mask (may use P4_ESCR_EMASK_BIT helper)
+ * 25-30: enum P4_EVENTS
+ * 31: reserved (HT thread)
+ */
+
+#endif /* PERF_EVENT_P4_H */
+
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index 0e8c2a0fd922..c88691b15f3c 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PGALLOC_H
#define _ASM_X86_PGALLOC_H
@@ -5,9 +6,15 @@
#include <linux/mm.h> /* for struct page */
#include <linux/pagemap.h>
+#include <asm/cpufeature.h>
+
+#define __HAVE_ARCH_PTE_ALLOC_ONE
+#define __HAVE_ARCH_PGD_FREE
+#include <asm-generic/pgalloc.h>
+
static inline int __paravirt_pgd_alloc(struct mm_struct *mm) { return 0; }
-#ifdef CONFIG_PARAVIRT
+#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else
#define paravirt_pgd_alloc(mm) __paravirt_pgd_alloc(mm)
@@ -17,34 +24,32 @@ static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {
static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
unsigned long start, unsigned long count) {}
static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) {}
+static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn) {}
static inline void paravirt_release_pte(unsigned long pfn) {}
static inline void paravirt_release_pmd(unsigned long pfn) {}
static inline void paravirt_release_pud(unsigned long pfn) {}
+static inline void paravirt_release_p4d(unsigned long pfn) {}
#endif
/*
+ * When Page Table Isolation is active, we acquire two PGDs instead of one.
+ * Being order-1, it is both 8k in size and 8k-aligned. That lets us just
+ * flip bit 12 in a pointer to swap between the two 4k halves.
+ */
+static inline unsigned int pgd_allocation_order(void)
+{
+ if (cpu_feature_enabled(X86_FEATURE_PTI))
+ return 1;
+ return 0;
+}
+
+/*
* Allocate and free page tables.
*/
extern pgd_t *pgd_alloc(struct mm_struct *);
extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
-extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
-extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long);
-
-/* Should really implement gc for free page table pages. This could be
- done with a reference count in struct page. */
-
-static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
-{
- BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
- free_page((unsigned long)pte);
-}
-
-static inline void pte_free(struct mm_struct *mm, struct page *pte)
-{
- pgtable_page_dtor(pte);
- __free_page(pte);
-}
+extern pgtable_t pte_alloc_one(struct mm_struct *);
extern void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
@@ -61,6 +66,13 @@ static inline void pmd_populate_kernel(struct mm_struct *mm,
set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
}
+static inline void pmd_populate_kernel_safe(struct mm_struct *mm,
+ pmd_t *pmd, pte_t *pte)
+{
+ paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT);
+ set_pmd_safe(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
+}
+
static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
struct page *pte)
{
@@ -70,24 +82,11 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
set_pmd(pmd, __pmd(((pteval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
}
-#define pmd_pgtable(pmd) pmd_page(pmd)
-
-#if PAGETABLE_LEVELS > 2
-static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
-{
- return (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
-}
-
-static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
-{
- BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
- free_page((unsigned long)pmd);
-}
-
+#if CONFIG_PGTABLE_LEVELS > 2
extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
- unsigned long adddress)
+ unsigned long address)
{
___pmd_free_tlb(tlb, pmd);
}
@@ -100,24 +99,25 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
}
-#endif /* CONFIG_X86_PAE */
-#if PAGETABLE_LEVELS > 3
-static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
+static inline void pud_populate_safe(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
{
- paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT);
- set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
+ paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
+ set_pud_safe(pud, __pud(_PAGE_TABLE | __pa(pmd)));
}
+#endif /* CONFIG_X86_PAE */
-static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
+#if CONFIG_PGTABLE_LEVELS > 3
+static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
{
- return (pud_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
+ paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT);
+ set_p4d(p4d, __p4d(_PAGE_TABLE | __pa(pud)));
}
-static inline void pud_free(struct mm_struct *mm, pud_t *pud)
+static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
{
- BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
- free_page((unsigned long)pud);
+ paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT);
+ set_p4d_safe(p4d, __p4d(_PAGE_TABLE | __pa(pud)));
}
extern void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud);
@@ -128,7 +128,34 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
___pud_free_tlb(tlb, pud);
}
-#endif /* PAGETABLE_LEVELS > 3 */
-#endif /* PAGETABLE_LEVELS > 2 */
+#if CONFIG_PGTABLE_LEVELS > 4
+static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d)
+{
+ if (!pgtable_l5_enabled())
+ return;
+ paravirt_alloc_p4d(mm, __pa(p4d) >> PAGE_SHIFT);
+ set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(p4d)));
+}
+
+static inline void pgd_populate_safe(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d)
+{
+ if (!pgtable_l5_enabled())
+ return;
+ paravirt_alloc_p4d(mm, __pa(p4d) >> PAGE_SHIFT);
+ set_pgd_safe(pgd, __pgd(_PAGE_TABLE | __pa(p4d)));
+}
+
+extern void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d);
+
+static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d,
+ unsigned long address)
+{
+ if (pgtable_l5_enabled())
+ ___p4d_free_tlb(tlb, p4d);
+}
+
+#endif /* CONFIG_PGTABLE_LEVELS > 4 */
+#endif /* CONFIG_PGTABLE_LEVELS > 3 */
+#endif /* CONFIG_PGTABLE_LEVELS > 2 */
#endif /* _ASM_X86_PGALLOC_H */
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index 2334982b339e..e9482a11ac52 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -1,10 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PGTABLE_2LEVEL_H
#define _ASM_X86_PGTABLE_2LEVEL_H
#define pte_ERROR(e) \
- printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, (e).pte_low)
+ pr_err("%s:%d: bad pte %08lx\n", __FILE__, __LINE__, (e).pte_low)
#define pgd_ERROR(e) \
- printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
+ pr_err("%s:%d: bad pgd %08lx\n", __FILE__, __LINE__, pgd_val(e))
/*
* Certain architectures need to do special things when PTEs
@@ -21,6 +22,10 @@ static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
*pmdp = pmd;
}
+static inline void native_set_pud(pud_t *pudp, pud_t pud)
+{
+}
+
static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
{
native_set_pte(ptep, pte);
@@ -31,6 +36,10 @@ static inline void native_pmd_clear(pmd_t *pmdp)
native_set_pmd(pmdp, __pmd(0));
}
+static inline void native_pud_clear(pud_t *pudp)
+{
+}
+
static inline void native_pte_clear(struct mm_struct *mm,
unsigned long addr, pte_t *xp)
{
@@ -46,57 +55,77 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp)
#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
#endif
-/*
- * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken,
- * split up the 29 bits of offset into this range:
- */
-#define PTE_FILE_MAX_BITS 29
-#define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1)
-#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
-#define PTE_FILE_SHIFT2 (_PAGE_BIT_FILE + 1)
-#define PTE_FILE_SHIFT3 (_PAGE_BIT_PROTNONE + 1)
+#ifdef CONFIG_SMP
+static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
+{
+ return __pmd(xchg((pmdval_t *)xp, 0));
+}
#else
-#define PTE_FILE_SHIFT2 (_PAGE_BIT_PROTNONE + 1)
-#define PTE_FILE_SHIFT3 (_PAGE_BIT_FILE + 1)
+#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
#endif
-#define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1)
-#define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1)
-
-#define pte_to_pgoff(pte) \
- ((((pte).pte_low >> PTE_FILE_SHIFT1) \
- & ((1U << PTE_FILE_BITS1) - 1)) \
- + ((((pte).pte_low >> PTE_FILE_SHIFT2) \
- & ((1U << PTE_FILE_BITS2) - 1)) << PTE_FILE_BITS1) \
- + (((pte).pte_low >> PTE_FILE_SHIFT3) \
- << (PTE_FILE_BITS1 + PTE_FILE_BITS2)))
-
-#define pgoff_to_pte(off) \
- ((pte_t) { .pte_low = \
- (((off) & ((1U << PTE_FILE_BITS1) - 1)) << PTE_FILE_SHIFT1) \
- + ((((off) >> PTE_FILE_BITS1) & ((1U << PTE_FILE_BITS2) - 1)) \
- << PTE_FILE_SHIFT2) \
- + (((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \
- << PTE_FILE_SHIFT3) \
- + _PAGE_FILE })
-
-/* Encode and de-code a swap entry */
-#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
-#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
-#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
+
+#ifdef CONFIG_SMP
+static inline pud_t native_pudp_get_and_clear(pud_t *xp)
+{
+ return __pud(xchg((pudval_t *)xp, 0));
+}
#else
-#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1)
-#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1)
+#define native_pudp_get_and_clear(xp) native_local_pudp_get_and_clear(xp)
#endif
-#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
+/* Bit manipulation helper on pte/pgoff entry */
+static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshift,
+ unsigned long mask, unsigned int leftshift)
+{
+ return ((value >> rightshift) & mask) << leftshift;
+}
+
+/*
+ * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that
+ * are !pte_none() && !pte_present().
+ *
+ * Format of swap PTEs:
+ *
+ * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1
+ * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
+ * <----------------- offset ------------------> 0 E <- type --> 0
+ *
+ * E is the exclusive marker that is not stored in swap entries.
+ */
+#define SWP_TYPE_BITS 5
+#define _SWP_TYPE_MASK ((1U << SWP_TYPE_BITS) - 1)
+#define _SWP_TYPE_SHIFT (_PAGE_BIT_PRESENT + 1)
+#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
-#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \
- & ((1U << SWP_TYPE_BITS) - 1))
+#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5)
+
+#define __swp_type(x) (((x).val >> _SWP_TYPE_SHIFT) \
+ & _SWP_TYPE_MASK)
#define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT)
#define __swp_entry(type, offset) ((swp_entry_t) { \
- ((type) << (_PAGE_BIT_PRESENT + 1)) \
+ (((type) & _SWP_TYPE_MASK) << _SWP_TYPE_SHIFT) \
| ((offset) << SWP_OFFSET_SHIFT) })
#define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low })
#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
+/* We borrow bit 7 to store the exclusive marker in swap PTEs. */
+#define _PAGE_SWP_EXCLUSIVE _PAGE_PSE
+
+/* No inverted PFNs on 2 level page tables */
+
+static inline u64 protnone_mask(u64 val)
+{
+ return 0;
+}
+
+static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask)
+{
+ return val;
+}
+
+static inline bool __pte_needs_invert(u64 val)
+{
+ return false;
+}
+
#endif /* _ASM_X86_PGTABLE_2LEVEL_H */
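For reference, a minimal user-space sketch of the 2-level swap encoding added above. It assumes the usual i386 bit positions (_PAGE_BIT_PRESENT = 0 and _PAGE_BIT_PROTNONE = 8, so _SWP_TYPE_SHIFT = 1 and SWP_OFFSET_SHIFT = 9); the round trip shows that type and offset survive and that the present bit stays clear:

    #include <assert.h>
    #include <stdio.h>

    #define SWP_TYPE_BITS    5
    #define _SWP_TYPE_MASK   ((1U << SWP_TYPE_BITS) - 1)
    #define _SWP_TYPE_SHIFT  1   /* assumed _PAGE_BIT_PRESENT + 1 */
    #define SWP_OFFSET_SHIFT 9   /* assumed _PAGE_BIT_PROTNONE + 1 */

    int main(void)
    {
        unsigned long type = 3, offset = 0x12345;
        unsigned long pte_low = ((type & _SWP_TYPE_MASK) << _SWP_TYPE_SHIFT)
                              | (offset << SWP_OFFSET_SHIFT);

        /* __swp_type() / __swp_offset() equivalents */
        assert(((pte_low >> _SWP_TYPE_SHIFT) & _SWP_TYPE_MASK) == type);
        assert((pte_low >> SWP_OFFSET_SHIFT) == offset);
        assert((pte_low & 1) == 0); /* _PAGE_PRESENT stays clear */
        printf("2-level swap pte: %#lx\n", pte_low);
        return 0;
    }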
diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h
index daacc23e3fb9..54690bd4ddbe 100644
--- a/arch/x86/include/asm/pgtable-2level_types.h
+++ b/arch/x86/include/asm/pgtable-2level_types.h
@@ -1,12 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PGTABLE_2LEVEL_DEFS_H
#define _ASM_X86_PGTABLE_2LEVEL_DEFS_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
typedef unsigned long pteval_t;
typedef unsigned long pmdval_t;
typedef unsigned long pudval_t;
+typedef unsigned long p4dval_t;
typedef unsigned long pgdval_t;
typedef unsigned long pgprotval_t;
@@ -14,24 +16,26 @@ typedef union {
pteval_t pte;
pteval_t pte_low;
} pte_t;
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
-#define SHARED_KERNEL_PMD 0
-#define PAGETABLE_LEVELS 2
+#define ARCH_PAGE_TABLE_SYNC_MASK PGTBL_PMD_MODIFIED
/*
- * traditional i386 two-level paging structure:
+ * Traditional i386 two-level paging structure:
*/
#define PGDIR_SHIFT 22
#define PTRS_PER_PGD 1024
-
/*
- * the i386 is two-level, so we don't really have any
- * PMD directory physically.
+ * The i386 is two-level, so we don't really have any
+ * PMD directory physically:
*/
+#define PTRS_PER_PMD 1
#define PTRS_PER_PTE 1024
+/* This covers all VMSPLIT_* and VMSPLIT_*_OPT variants */
+#define PGD_KERNEL_START (CONFIG_PAGE_OFFSET >> PGDIR_SHIFT)
+
#endif /* _ASM_X86_PGTABLE_2LEVEL_DEFS_H */
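A quick back-of-the-envelope check of the constants above (a sketch, not kernel code): with PGDIR_SHIFT = 22 each of the 1024 PGD entries maps 4 MiB, and under the common VMSPLIT_3G layout (CONFIG_PAGE_OFFSET = 0xC0000000, an assumption for this demo) PGD_KERNEL_START lands at index 768:

    #include <stdio.h>

    #define PGDIR_SHIFT  22
    #define PTRS_PER_PGD 1024
    #define PAGE_OFFSET  0xC0000000UL /* assumed VMSPLIT_3G value */

    int main(void)
    {
        printf("each pgd entry maps %lu MiB\n", (1UL << PGDIR_SHIFT) >> 20); /* 4 */
        printf("PGD_KERNEL_START = %lu\n", PAGE_OFFSET >> PGDIR_SHIFT);      /* 768 */
        printf("kernel pgd entries = %lu\n",
               PTRS_PER_PGD - (PAGE_OFFSET >> PGDIR_SHIFT));                 /* 256 */
        return 0;
    }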
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 177b0165ea01..dabafba957ea 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PGTABLE_3LEVEL_H
#define _ASM_X86_PGTABLE_3LEVEL_H
@@ -9,16 +10,24 @@
*/
#define pte_ERROR(e) \
- printk("%s:%d: bad pte %p(%08lx%08lx).\n", \
+ pr_err("%s:%d: bad pte %p(%08lx%08lx)\n", \
__FILE__, __LINE__, &(e), (e).pte_high, (e).pte_low)
#define pmd_ERROR(e) \
- printk("%s:%d: bad pmd %p(%016Lx).\n", \
+ pr_err("%s:%d: bad pmd %p(%016Lx)\n", \
__FILE__, __LINE__, &(e), pmd_val(e))
#define pgd_ERROR(e) \
- printk("%s:%d: bad pgd %p(%016Lx).\n", \
+ pr_err("%s:%d: bad pgd %p(%016Lx)\n", \
__FILE__, __LINE__, &(e), pgd_val(e))
-/* Rules for using set_pte: the pte being assigned *must* be
+#define pxx_xchg64(_pxx, _ptr, _val) ({ \
+ _pxx##val_t *_p = (_pxx##val_t *)_ptr; \
+ _pxx##val_t _o = *_p; \
+ do { } while (!try_cmpxchg64(_p, &_o, (_val))); \
+ native_make_##_pxx(_o); \
+})
+
+/*
+ * Rules for using set_pte: the pte being assigned *must* be
* either not present or in a state where the hardware will
* not attempt to update the pte. In places where this is
* not possible, use pte_get_and_clear to obtain the old pte
@@ -26,24 +35,27 @@
*/
static inline void native_set_pte(pte_t *ptep, pte_t pte)
{
- ptep->pte_high = pte.pte_high;
+ WRITE_ONCE(ptep->pte_high, pte.pte_high);
smp_wmb();
- ptep->pte_low = pte.pte_low;
+ WRITE_ONCE(ptep->pte_low, pte.pte_low);
}
static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
{
- set_64bit((unsigned long long *)(ptep), native_pte_val(pte));
+ pxx_xchg64(pte, ptep, native_pte_val(pte));
}
static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
{
- set_64bit((unsigned long long *)(pmdp), native_pmd_val(pmd));
+ pxx_xchg64(pmd, pmdp, native_pmd_val(pmd));
}
static inline void native_set_pud(pud_t *pudp, pud_t pud)
{
- set_64bit((unsigned long long *)(pudp), native_pud_val(pud));
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
+ pud.p4d.pgd = pti_set_user_pgtbl(&pudp->p4d.pgd, pud.p4d.pgd);
+#endif
+ pxx_xchg64(pud, pudp, native_pud_val(pud));
}
/*
@@ -54,23 +66,24 @@ static inline void native_set_pud(pud_t *pudp, pud_t pud)
static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
- ptep->pte_low = 0;
+ WRITE_ONCE(ptep->pte_low, 0);
smp_wmb();
- ptep->pte_high = 0;
+ WRITE_ONCE(ptep->pte_high, 0);
}
-static inline void native_pmd_clear(pmd_t *pmd)
+static inline void native_pmd_clear(pmd_t *pmdp)
{
- u32 *tmp = (u32 *)pmd;
- *tmp = 0;
+ WRITE_ONCE(pmdp->pmd_low, 0);
smp_wmb();
- *(tmp + 1) = 0;
+ WRITE_ONCE(pmdp->pmd_high, 0);
}
-static inline void pud_clear(pud_t *pudp)
+static inline void native_pud_clear(pud_t *pudp)
{
- unsigned long pgd;
+}
+static inline void pud_clear(pud_t *pudp)
+{
set_pud(pudp, __pud(0));
/*
@@ -79,46 +92,118 @@ static inline void pud_clear(pud_t *pudp)
* section 8.1: in PAE mode we explicitly have to flush the
* TLB via cr3 if the top-level pgd is changed...
*
- * Make sure the pud entry we're updating is within the
- * current pgd to avoid unnecessary TLB flushes.
+ * Currently all places where pud_clear() is called either have
+ * flush_tlb_mm() followed or don't need TLB flush (x86_64 code or
+ * pud_clear_bad()), so we don't need TLB flush here.
*/
- pgd = read_cr3();
- if (__pa(pudp) >= pgd && __pa(pudp) <
- (pgd + sizeof(pgd_t)*PTRS_PER_PGD))
- write_cr3(pgd);
}
+
#ifdef CONFIG_SMP
static inline pte_t native_ptep_get_and_clear(pte_t *ptep)
{
- pte_t res;
+ return pxx_xchg64(pte, ptep, 0ULL);
+}
- /* xchg acts as a barrier before the setting of the high bits */
- res.pte_low = xchg(&ptep->pte_low, 0);
- res.pte_high = ptep->pte_high;
- ptep->pte_high = 0;
+static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp)
+{
+ return pxx_xchg64(pmd, pmdp, 0ULL);
+}
- return res;
+static inline pud_t native_pudp_get_and_clear(pud_t *pudp)
+{
+ return pxx_xchg64(pud, pudp, 0ULL);
}
#else
#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
+#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
+#define native_pudp_get_and_clear(xp) native_local_pudp_get_and_clear(xp)
#endif
+#ifndef pmdp_establish
+#define pmdp_establish pmdp_establish
+static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp, pmd_t pmd)
+{
+ pmd_t old;
+
+ /*
+ * If pmd has present bit cleared we can get away without expensive
+ * cmpxchg64: we can update pmdp half-by-half without racing with
+ * anybody.
+ */
+ if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
+		/* xchg acts as a barrier before setting the high bits */
+ old.pmd_low = xchg(&pmdp->pmd_low, pmd.pmd_low);
+ old.pmd_high = READ_ONCE(pmdp->pmd_high);
+ WRITE_ONCE(pmdp->pmd_high, pmd.pmd_high);
+
+ return old;
+ }
+
+ return pxx_xchg64(pmd, pmdp, pmd.pmd);
+}
+#endif
+
+/*
+ * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that
+ * are !pte_none() && !pte_present().
+ *
+ * Format of swap PTEs:
+ *
+ * 6 6 6 6 5 5 5 5 5 5 5 5 5 5 4 4 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3
+ * 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2
+ * < type -> <---------------------- offset ----------------------
+ *
+ * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1
+ * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
+ * --------------------------------------------> 0 E 0 0 0 0 0 0 0
+ *
+ * E is the exclusive marker that is not stored in swap entries.
+ */
+#define SWP_TYPE_BITS 5
+#define _SWP_TYPE_MASK ((1U << SWP_TYPE_BITS) - 1)
+
+#define SWP_OFFSET_FIRST_BIT (_PAGE_BIT_PROTNONE + 1)
+
+/* We always extract/encode the offset by shifting it all the way up, and then down again */
+#define SWP_OFFSET_SHIFT (SWP_OFFSET_FIRST_BIT + SWP_TYPE_BITS)
+
+#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
+#define __swp_type(x) (((x).val) & _SWP_TYPE_MASK)
+#define __swp_offset(x) ((x).val >> SWP_TYPE_BITS)
+#define __swp_entry(type, offset) ((swp_entry_t){((type) & _SWP_TYPE_MASK) \
+ | (offset) << SWP_TYPE_BITS})
+
/*
- * Bits 0, 6 and 7 are taken in the low part of the pte,
- * put the 32 bits of offset into the high part.
+ * Normally, __swp_entry() converts from arch-independent swp_entry_t to
+ * arch-dependent swp_entry_t, and __swp_entry_to_pte() just stores the result
+ * to pte. But here we have 32bit swp_entry_t and 64bit pte, and need to use the
+ * whole 64 bits. Thus, we shift the "real" arch-dependent conversion to
+ * __swp_entry_to_pte() through the following helper macro based on 64bit
+ * __swp_entry().
*/
-#define pte_to_pgoff(pte) ((pte).pte_high)
-#define pgoff_to_pte(off) \
- ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } })
-#define PTE_FILE_MAX_BITS 32
-
-/* Encode and de-code a swap entry */
-#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5)
-#define __swp_type(x) (((x).val) & 0x1f)
-#define __swp_offset(x) ((x).val >> 5)
-#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5})
-#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high })
-#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } })
+#define __swp_pteval_entry(type, offset) ((pteval_t) { \
+ (~(pteval_t)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \
+ | ((pteval_t)(type) << (64 - SWP_TYPE_BITS)) })
+
+#define __swp_entry_to_pte(x) ((pte_t){ .pte = \
+ __swp_pteval_entry(__swp_type(x), __swp_offset(x)) })
+/*
+ * Analogously, __pte_to_swp_entry() doesn't just extract the arch-dependent
+ * swp_entry_t, but also has to convert it from 64bit to the 32bit
+ * intermediate representation, using the following macros based on 64bit
+ * __swp_type() and __swp_offset().
+ */
+#define __pteval_swp_type(x) ((unsigned long)((x).pte >> (64 - SWP_TYPE_BITS)))
+#define __pteval_swp_offset(x) ((unsigned long)(~((x).pte) << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT))
+
+#define __pte_to_swp_entry(pte) (__swp_entry(__pteval_swp_type(pte), \
+ __pteval_swp_offset(pte)))
+
+/* We borrow bit 7 to store the exclusive marker in swap PTEs. */
+#define _PAGE_SWP_EXCLUSIVE _PAGE_PSE
+
+#include <asm/pgtable-invert.h>
#endif /* _ASM_X86_PGTABLE_3LEVEL_H */
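A standalone round-trip check of the 64-bit PAE swap encoding above (a sketch assuming _PAGE_BIT_PROTNONE = 8, hence SWP_OFFSET_FIRST_BIT = 9 and SWP_OFFSET_SHIFT = 14). Note the offset is stored inverted, so a non-present swap PTE never carries a valid-looking PFN:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define SWP_TYPE_BITS        5
    #define SWP_OFFSET_FIRST_BIT 9   /* assumed _PAGE_BIT_PROTNONE + 1 */
    #define SWP_OFFSET_SHIFT     (SWP_OFFSET_FIRST_BIT + SWP_TYPE_BITS)

    static uint64_t swp_pteval_entry(uint64_t type, uint64_t offset)
    {
        /* mirrors __swp_pteval_entry(): inverted offset low, type in top bits */
        return (~offset << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS)
             | (type << (64 - SWP_TYPE_BITS));
    }

    int main(void)
    {
        uint64_t type = 7, offset = 0xABCDEF;
        uint64_t pte = swp_pteval_entry(type, offset);

        assert((pte >> (64 - SWP_TYPE_BITS)) == type);                 /* __pteval_swp_type() */
        assert((~pte << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT) == offset); /* __pteval_swp_offset() */
        assert((pte & 1) == 0);                                        /* _PAGE_PRESENT clear */
        printf("PAE swap pte: %#llx\n", (unsigned long long)pte);
        return 0;
    }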
diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h
index 1bd5876c8649..580b09bf6a45 100644
--- a/arch/x86/include/asm/pgtable-3level_types.h
+++ b/arch/x86/include/asm/pgtable-3level_types.h
@@ -1,12 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PGTABLE_3LEVEL_DEFS_H
#define _ASM_X86_PGTABLE_3LEVEL_DEFS_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
typedef u64 pteval_t;
typedef u64 pmdval_t;
typedef u64 pudval_t;
+typedef u64 p4dval_t;
typedef u64 pgdval_t;
typedef u64 pgprotval_t;
@@ -16,15 +18,16 @@ typedef union {
};
pteval_t pte;
} pte_t;
-#endif /* !__ASSEMBLY__ */
-#ifdef CONFIG_PARAVIRT
-#define SHARED_KERNEL_PMD (pv_info.shared_kernel_pmd)
-#else
-#define SHARED_KERNEL_PMD 1
-#endif
+typedef union {
+ struct {
+ unsigned long pmd_low, pmd_high;
+ };
+ pmdval_t pmd;
+} pmd_t;
+#endif /* !__ASSEMBLER__ */
-#define PAGETABLE_LEVELS 3
+#define ARCH_PAGE_TABLE_SYNC_MASK PGTBL_PMD_MODIFIED
/*
* PGDIR_SHIFT determines what a top-level page table entry can map
@@ -44,5 +47,7 @@ typedef union {
*/
#define PTRS_PER_PTE 512
+#define MAX_POSSIBLE_PHYSMEM_BITS 36
+#define PGD_KERNEL_START (CONFIG_PAGE_OFFSET >> PGDIR_SHIFT)
#endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */
diff --git a/arch/x86/include/asm/pgtable-invert.h b/arch/x86/include/asm/pgtable-invert.h
new file mode 100644
index 000000000000..e12e52ae8083
--- /dev/null
+++ b/arch/x86/include/asm/pgtable-invert.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_PGTABLE_INVERT_H
+#define _ASM_PGTABLE_INVERT_H 1
+
+#ifndef __ASSEMBLER__
+
+/*
+ * A clear pte value is special, and doesn't get inverted.
+ *
+ * Note that even users that only pass a pgprot_t (rather
+ * than a full pte) won't trigger the special zero case,
+ * because even PAGE_NONE has _PAGE_PROTNONE | _PAGE_ACCESSED
+ * set. So the all zero case really is limited to just the
+ * cleared page table entry case.
+ */
+static inline bool __pte_needs_invert(u64 val)
+{
+ return val && !(val & _PAGE_PRESENT);
+}
+
+/* Get a mask to xor with the page table entry to get the correct pfn. */
+static inline u64 protnone_mask(u64 val)
+{
+ return __pte_needs_invert(val) ? ~0ull : 0;
+}
+
+static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask)
+{
+ /*
+ * When a PTE transitions from NONE to !NONE or vice-versa
+ * invert the PFN part to stop speculation.
+ * pte_pfn undoes this when needed.
+ */
+ if (__pte_needs_invert(oldval) != __pte_needs_invert(val))
+ val = (val & ~mask) | (~val & mask);
+ return val;
+}
+
+#endif /* __ASSEMBLER__ */
+
+#endif
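To see the effect of the new header, here is a minimal sketch of the PROT_NONE PFN inversion (the L1TF hardening this file implements): when the present bit goes away, flip_protnone_guard() inverts the PFN bits under the mask, and the XOR in pte_pfn() undoes it. The mask and bit values below are illustrative assumptions:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define _PAGE_PRESENT 0x1ULL
    #define PTE_PFN_MASK  0x000FFFFFFFFFF000ULL /* assumed 52-bit phys, 4K pages */

    static int pte_needs_invert(uint64_t val) { return val && !(val & _PAGE_PRESENT); }
    static uint64_t protnone_mask(uint64_t val) { return pte_needs_invert(val) ? ~0ULL : 0; }

    static uint64_t flip_protnone_guard(uint64_t oldval, uint64_t val, uint64_t mask)
    {
        if (pte_needs_invert(oldval) != pte_needs_invert(val))
            val = (val & ~mask) | (~val & mask); /* flip only the PFN bits */
        return val;
    }

    int main(void)
    {
        uint64_t pte = 0x1234000ULL | _PAGE_PRESENT; /* present, pfn 0x1234 */
        uint64_t prot_none = flip_protnone_guard(pte, pte & ~_PAGE_PRESENT,
                                                 PTE_PFN_MASK);

        assert((prot_none & PTE_PFN_MASK) != 0x1234000ULL); /* stored PFN inverted */
        uint64_t pfn = ((prot_none ^ protnone_mask(prot_none)) & PTE_PFN_MASK) >> 12;
        assert(pfn == 0x1234);                              /* pte_pfn() recovers it */
        printf("ok: inverted pfn recovered\n");
        return 0;
    }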
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 3cc06e3fceb8..e33df3da6980 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1,73 +1,109 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PGTABLE_H
#define _ASM_X86_PGTABLE_H
+#include <linux/mem_encrypt.h>
#include <asm/page.h>
-
#include <asm/pgtable_types.h>
/*
* Macro to mark a page protection value as UC-
*/
-#define pgprot_noncached(prot) \
- ((boot_cpu_data.x86 > 3) \
- ? (__pgprot(pgprot_val(prot) | _PAGE_CACHE_UC_MINUS)) \
+#define pgprot_noncached(prot) \
+ ((boot_cpu_data.x86 > 3) \
+ ? (__pgprot(pgprot_val(prot) | \
+ cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS))) \
: (prot))
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
+#include <linux/spinlock.h>
+#include <asm/x86_init.h>
+#include <asm/pkru.h>
+#include <asm/fpu/api.h>
+#include <asm/coco.h>
+#include <asm-generic/pgtable_uffd.h>
+#include <linux/page_table_check.h>
+
+extern pgd_t early_top_pgt[PTRS_PER_PGD];
+bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
+
+struct seq_file;
+void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm);
+void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm,
+ bool user);
+bool ptdump_walk_pgd_level_checkwx(void);
+#define ptdump_check_wx ptdump_walk_pgd_level_checkwx
+void ptdump_walk_user_pgd_level_checkwx(void);
+
+/*
+ * Macros to add or remove encryption attribute
+ */
+#define pgprot_encrypted(prot) __pgprot(cc_mkenc(pgprot_val(prot)))
+#define pgprot_decrypted(prot) __pgprot(cc_mkdec(pgprot_val(prot)))
+
+#ifdef CONFIG_DEBUG_WX
+#define debug_checkwx_user() ptdump_walk_user_pgd_level_checkwx()
+#else
+#define debug_checkwx_user() do { } while (0)
+#endif
/*
* ZERO_PAGE is a global shared page that is always zero: used
* for zero-mapped memory areas etc..
*/
-extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
-#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
+extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]
+ __visible;
+#define ZERO_PAGE(vaddr) ((void)(vaddr),virt_to_page(empty_zero_page))
extern spinlock_t pgd_lock;
extern struct list_head pgd_list;
-#ifdef CONFIG_PARAVIRT
+extern struct mm_struct *pgd_page_get_mm(struct page *page);
+
+extern pmdval_t early_pmd_flags;
+
+#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
-#else /* !CONFIG_PARAVIRT */
+#else /* !CONFIG_PARAVIRT_XXL */
#define set_pte(ptep, pte) native_set_pte(ptep, pte)
-#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte)
#define set_pte_atomic(ptep, pte) \
native_set_pte_atomic(ptep, pte)
#define set_pmd(pmdp, pmd) native_set_pmd(pmdp, pmd)
-#ifndef __PAGETABLE_PUD_FOLDED
+#ifndef __PAGETABLE_P4D_FOLDED
#define set_pgd(pgdp, pgd) native_set_pgd(pgdp, pgd)
-#define pgd_clear(pgd) native_pgd_clear(pgd)
+#define pgd_clear(pgd) (pgtable_l5_enabled() ? native_pgd_clear(pgd) : 0)
+#endif
+
+#ifndef set_p4d
+# define set_p4d(p4dp, p4d) native_set_p4d(p4dp, p4d)
+#endif
+
+#ifndef __PAGETABLE_PUD_FOLDED
+#define p4d_clear(p4d) native_p4d_clear(p4d)
#endif
#ifndef set_pud
# define set_pud(pudp, pud) native_set_pud(pudp, pud)
#endif
-#ifndef __PAGETABLE_PMD_FOLDED
+#ifndef __PAGETABLE_PUD_FOLDED
#define pud_clear(pud) native_pud_clear(pud)
#endif
#define pte_clear(mm, addr, ptep) native_pte_clear(mm, addr, ptep)
#define pmd_clear(pmd) native_pmd_clear(pmd)
-#define pte_update(mm, addr, ptep) do { } while (0)
-#define pte_update_defer(mm, addr, ptep) do { } while (0)
-
-static inline void __init paravirt_pagetable_setup_start(pgd_t *base)
-{
- native_pagetable_setup_start(base);
-}
-
-static inline void __init paravirt_pagetable_setup_done(pgd_t *base)
-{
- native_pagetable_setup_done(base);
-}
-
#define pgd_val(x) native_pgd_val(x)
#define __pgd(x) native_make_pgd(x)
+#ifndef __PAGETABLE_P4D_FOLDED
+#define p4d_val(x) native_p4d_val(x)
+#define __p4d(x) native_make_p4d(x)
+#endif
+
#ifndef __PAGETABLE_PUD_FOLDED
#define pud_val(x) native_pud_val(x)
#define __pud(x) native_make_pud(x)
@@ -82,16 +118,49 @@ static inline void __init paravirt_pagetable_setup_done(pgd_t *base)
#define __pte(x) native_make_pte(x)
#define arch_end_context_switch(prev) do {} while(0)
+#endif /* CONFIG_PARAVIRT_XXL */
+
+static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set)
+{
+ pmdval_t v = native_pmd_val(pmd);
+
+ return native_make_pmd(v | set);
+}
+
+static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
+{
+ pmdval_t v = native_pmd_val(pmd);
+
+ return native_make_pmd(v & ~clear);
+}
+
+static inline pud_t pud_set_flags(pud_t pud, pudval_t set)
+{
+ pudval_t v = native_pud_val(pud);
+
+ return native_make_pud(v | set);
+}
-#endif /* CONFIG_PARAVIRT */
+static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear)
+{
+ pudval_t v = native_pud_val(pud);
+
+ return native_make_pud(v & ~clear);
+}
/*
* The following only work if pte_present() is true.
* Undefined behaviour if not..
*/
-static inline int pte_dirty(pte_t pte)
+static inline bool pte_dirty(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_DIRTY_BITS;
+}
+
+static inline bool pte_shstk(pte_t pte)
{
- return pte_flags(pte) & _PAGE_DIRTY;
+ return cpu_feature_enabled(X86_FEATURE_SHSTK) &&
+ (pte_flags(pte) & (_PAGE_RW | _PAGE_DIRTY)) == _PAGE_DIRTY;
}
static inline int pte_young(pte_t pte)
@@ -99,14 +168,70 @@ static inline int pte_young(pte_t pte)
return pte_flags(pte) & _PAGE_ACCESSED;
}
+static inline bool pte_decrypted(pte_t pte)
+{
+ return cc_mkdec(pte_val(pte)) == pte_val(pte);
+}
+
+#define pmd_dirty pmd_dirty
+static inline bool pmd_dirty(pmd_t pmd)
+{
+ return pmd_flags(pmd) & _PAGE_DIRTY_BITS;
+}
+
+static inline bool pmd_shstk(pmd_t pmd)
+{
+ return cpu_feature_enabled(X86_FEATURE_SHSTK) &&
+ (pmd_flags(pmd) & (_PAGE_RW | _PAGE_DIRTY | _PAGE_PSE)) ==
+ (_PAGE_DIRTY | _PAGE_PSE);
+}
+
+#define pmd_young pmd_young
+static inline int pmd_young(pmd_t pmd)
+{
+ return pmd_flags(pmd) & _PAGE_ACCESSED;
+}
+
+static inline bool pud_dirty(pud_t pud)
+{
+ return pud_flags(pud) & _PAGE_DIRTY_BITS;
+}
+
+static inline int pud_young(pud_t pud)
+{
+ return pud_flags(pud) & _PAGE_ACCESSED;
+}
+
+static inline bool pud_shstk(pud_t pud)
+{
+ return cpu_feature_enabled(X86_FEATURE_SHSTK) &&
+ (pud_flags(pud) & (_PAGE_RW | _PAGE_DIRTY | _PAGE_PSE)) ==
+ (_PAGE_DIRTY | _PAGE_PSE);
+}
+
static inline int pte_write(pte_t pte)
{
- return pte_flags(pte) & _PAGE_RW;
+ /*
+ * Shadow stack pages are logically writable, but do not have
+ * _PAGE_RW. Check for them separately from _PAGE_RW itself.
+ */
+ return (pte_flags(pte) & _PAGE_RW) || pte_shstk(pte);
}
-static inline int pte_file(pte_t pte)
+#define pmd_write pmd_write
+static inline int pmd_write(pmd_t pmd)
{
- return pte_flags(pte) & _PAGE_FILE;
+ /*
+ * Shadow stack pages are logically writable, but do not have
+ * _PAGE_RW. Check for them separately from _PAGE_RW itself.
+ */
+ return (pmd_flags(pmd) & _PAGE_RW) || pmd_shstk(pmd);
+}
+
+#define pud_write pud_write
+static inline int pud_write(pud_t pud)
+{
+ return pud_flags(pud) & _PAGE_RW;
}
static inline int pte_huge(pte_t pte)
@@ -129,19 +254,96 @@ static inline int pte_special(pte_t pte)
return pte_flags(pte) & _PAGE_SPECIAL;
}
+/* Entries that were set to PROT_NONE are inverted */
+
+static inline u64 protnone_mask(u64 val);
+
+#define PFN_PTE_SHIFT PAGE_SHIFT
+
static inline unsigned long pte_pfn(pte_t pte)
{
- return (pte_val(pte) & PTE_PFN_MASK) >> PAGE_SHIFT;
+ phys_addr_t pfn = pte_val(pte);
+ pfn ^= protnone_mask(pfn);
+ return (pfn & PTE_PFN_MASK) >> PAGE_SHIFT;
+}
+
+static inline unsigned long pmd_pfn(pmd_t pmd)
+{
+ phys_addr_t pfn = pmd_val(pmd);
+ pfn ^= protnone_mask(pfn);
+ return (pfn & pmd_pfn_mask(pmd)) >> PAGE_SHIFT;
+}
+
+#define pud_pfn pud_pfn
+static inline unsigned long pud_pfn(pud_t pud)
+{
+ phys_addr_t pfn = pud_val(pud);
+ pfn ^= protnone_mask(pfn);
+ return (pfn & pud_pfn_mask(pud)) >> PAGE_SHIFT;
+}
+
+static inline unsigned long p4d_pfn(p4d_t p4d)
+{
+ return (p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT;
+}
+
+static inline unsigned long pgd_pfn(pgd_t pgd)
+{
+ return (pgd_val(pgd) & PTE_PFN_MASK) >> PAGE_SHIFT;
}
#define pte_page(pte) pfn_to_page(pte_pfn(pte))
-static inline int pmd_large(pmd_t pte)
+#define pmd_leaf pmd_leaf
+static inline bool pmd_leaf(pmd_t pte)
+{
+ return pmd_flags(pte) & _PAGE_PSE;
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline int pmd_trans_huge(pmd_t pmd)
+{
+ return (pmd_val(pmd) & _PAGE_PSE) == _PAGE_PSE;
+}
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static inline int pud_trans_huge(pud_t pud)
+{
+ return (pud_val(pud) & _PAGE_PSE) == _PAGE_PSE;
+}
+#endif
+
+#define has_transparent_hugepage has_transparent_hugepage
+static inline int has_transparent_hugepage(void)
+{
+ return boot_cpu_has(X86_FEATURE_PSE);
+}
+
+#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
+static inline bool pmd_special(pmd_t pmd)
{
- return (pmd_flags(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
- (_PAGE_PSE | _PAGE_PRESENT);
+ return pmd_flags(pmd) & _PAGE_SPECIAL;
}
+static inline pmd_t pmd_mkspecial(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_SPECIAL);
+}
+#endif /* CONFIG_ARCH_SUPPORTS_PMD_PFNMAP */
+
+#ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP
+static inline bool pud_special(pud_t pud)
+{
+ return pud_flags(pud) & _PAGE_SPECIAL;
+}
+
+static inline pud_t pud_mkspecial(pud_t pud)
+{
+ return pud_set_flags(pud, _PAGE_SPECIAL);
+}
+#endif /* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
static inline pte_t pte_set_flags(pte_t pte, pteval_t set)
{
pteval_t v = native_pte_val(pte);
@@ -156,19 +358,90 @@ static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear)
return native_make_pte(v & ~clear);
}
-static inline pte_t pte_mkclean(pte_t pte)
+/*
+ * Write protection operations can result in Dirty=1,Write=0 PTEs. But in the
+ * case of X86_FEATURE_USER_SHSTK, these PTEs denote shadow stack memory. So
+ * when creating dirty, write-protected memory, a software bit is used:
+ * _PAGE_BIT_SAVED_DIRTY. The following functions take a PTE and transition the
+ * Dirty bit to SavedDirty, and vice-versa.
+ *
+ * This shifting is only done if needed. In the case of shifting
+ * Dirty->SavedDirty, the condition is Write=0. In the case of
+ * shifting SavedDirty->Dirty, the condition is Write=1.
+ */
+static inline pgprotval_t mksaveddirty_shift(pgprotval_t v)
{
- return pte_clear_flags(pte, _PAGE_DIRTY);
+ pgprotval_t cond = (~v >> _PAGE_BIT_RW) & 1;
+
+ v |= ((v >> _PAGE_BIT_DIRTY) & cond) << _PAGE_BIT_SAVED_DIRTY;
+ v &= ~(cond << _PAGE_BIT_DIRTY);
+
+ return v;
}
-static inline pte_t pte_mkold(pte_t pte)
+static inline pgprotval_t clear_saveddirty_shift(pgprotval_t v)
{
- return pte_clear_flags(pte, _PAGE_ACCESSED);
+ pgprotval_t cond = (v >> _PAGE_BIT_RW) & 1;
+
+ v |= ((v >> _PAGE_BIT_SAVED_DIRTY) & cond) << _PAGE_BIT_DIRTY;
+ v &= ~(cond << _PAGE_BIT_SAVED_DIRTY);
+
+ return v;
+}
+
+static inline pte_t pte_mksaveddirty(pte_t pte)
+{
+ pteval_t v = native_pte_val(pte);
+
+ v = mksaveddirty_shift(v);
+ return native_make_pte(v);
+}
+
+static inline pte_t pte_clear_saveddirty(pte_t pte)
+{
+ pteval_t v = native_pte_val(pte);
+
+ v = clear_saveddirty_shift(v);
+ return native_make_pte(v);
}
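A branch-free trace of the shift above, with bit positions taken as assumptions for this demo (_PAGE_BIT_RW = 1, _PAGE_BIT_DIRTY = 6, and SavedDirty parked at an arbitrary free software bit): a Write=0 value has its Dirty bit moved to SavedDirty, while a Write=1 value passes through untouched:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define _PAGE_BIT_RW          1
    #define _PAGE_BIT_DIRTY       6
    #define _PAGE_BIT_SAVED_DIRTY 11 /* assumed software bit, illustration only */

    static uint64_t mksaveddirty_shift(uint64_t v)
    {
        uint64_t cond = (~v >> _PAGE_BIT_RW) & 1; /* 1 iff Write=0 */

        v |= ((v >> _PAGE_BIT_DIRTY) & cond) << _PAGE_BIT_SAVED_DIRTY;
        v &= ~(cond << _PAGE_BIT_DIRTY);
        return v;
    }

    int main(void)
    {
        uint64_t wp_dirty = 1ULL << _PAGE_BIT_DIRTY;            /* Write=0, Dirty=1 */
        uint64_t rw_dirty = wp_dirty | (1ULL << _PAGE_BIT_RW);  /* Write=1, Dirty=1 */

        assert(mksaveddirty_shift(wp_dirty) == (1ULL << _PAGE_BIT_SAVED_DIRTY));
        assert(mksaveddirty_shift(rw_dirty) == rw_dirty);       /* untouched */
        printf("ok: Dirty moved to SavedDirty only when Write=0\n");
        return 0;
    }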
static inline pte_t pte_wrprotect(pte_t pte)
{
- return pte_clear_flags(pte, _PAGE_RW);
+ pte = pte_clear_flags(pte, _PAGE_RW);
+
+ /*
+ * Blindly clearing _PAGE_RW might accidentally create
+ * a shadow stack PTE (Write=0,Dirty=1). Move the hardware
+ * dirty value to the software bit, if present.
+ */
+ return pte_mksaveddirty(pte);
+}
+
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
+static inline int pte_uffd_wp(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_UFFD_WP;
+}
+
+static inline pte_t pte_mkuffd_wp(pte_t pte)
+{
+ return pte_wrprotect(pte_set_flags(pte, _PAGE_UFFD_WP));
+}
+
+static inline pte_t pte_clear_uffd_wp(pte_t pte)
+{
+ return pte_clear_flags(pte, _PAGE_UFFD_WP);
+}
+#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
+
+static inline pte_t pte_mkclean(pte_t pte)
+{
+ return pte_clear_flags(pte, _PAGE_DIRTY_BITS);
+}
+
+static inline pte_t pte_mkold(pte_t pte)
+{
+ return pte_clear_flags(pte, _PAGE_ACCESSED);
}
static inline pte_t pte_mkexec(pte_t pte)
@@ -178,6 +451,15 @@ static inline pte_t pte_mkexec(pte_t pte)
static inline pte_t pte_mkdirty(pte_t pte)
{
+ pte = pte_set_flags(pte, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+
+ return pte_mksaveddirty(pte);
+}
+
+static inline pte_t pte_mkwrite_shstk(pte_t pte)
+{
+ pte = pte_clear_flags(pte, _PAGE_RW);
+
return pte_set_flags(pte, _PAGE_DIRTY);
}
@@ -186,11 +468,15 @@ static inline pte_t pte_mkyoung(pte_t pte)
return pte_set_flags(pte, _PAGE_ACCESSED);
}
-static inline pte_t pte_mkwrite(pte_t pte)
+static inline pte_t pte_mkwrite_novma(pte_t pte)
{
return pte_set_flags(pte, _PAGE_RW);
}
+struct vm_area_struct;
+pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma);
+#define pte_mkwrite pte_mkwrite
+
static inline pte_t pte_mkhuge(pte_t pte)
{
return pte_set_flags(pte, _PAGE_PSE);
@@ -216,6 +502,207 @@ static inline pte_t pte_mkspecial(pte_t pte)
return pte_set_flags(pte, _PAGE_SPECIAL);
}
+/* See comments above mksaveddirty_shift() */
+static inline pmd_t pmd_mksaveddirty(pmd_t pmd)
+{
+ pmdval_t v = native_pmd_val(pmd);
+
+ v = mksaveddirty_shift(v);
+ return native_make_pmd(v);
+}
+
+/* See comments above mksaveddirty_shift() */
+static inline pmd_t pmd_clear_saveddirty(pmd_t pmd)
+{
+ pmdval_t v = native_pmd_val(pmd);
+
+ v = clear_saveddirty_shift(v);
+ return native_make_pmd(v);
+}
+
+static inline pmd_t pmd_wrprotect(pmd_t pmd)
+{
+ pmd = pmd_clear_flags(pmd, _PAGE_RW);
+
+ /*
+ * Blindly clearing _PAGE_RW might accidentally create
+ * a shadow stack PMD (RW=0, Dirty=1). Move the hardware
+ * dirty value to the software bit.
+ */
+ return pmd_mksaveddirty(pmd);
+}
+
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
+static inline int pmd_uffd_wp(pmd_t pmd)
+{
+ return pmd_flags(pmd) & _PAGE_UFFD_WP;
+}
+
+static inline pmd_t pmd_mkuffd_wp(pmd_t pmd)
+{
+ return pmd_wrprotect(pmd_set_flags(pmd, _PAGE_UFFD_WP));
+}
+
+static inline pmd_t pmd_clear_uffd_wp(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_UFFD_WP);
+}
+#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
+
+static inline pmd_t pmd_mkold(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_ACCESSED);
+}
+
+static inline pmd_t pmd_mkclean(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_DIRTY_BITS);
+}
+
+static inline pmd_t pmd_mkdirty(pmd_t pmd)
+{
+ pmd = pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+
+ return pmd_mksaveddirty(pmd);
+}
+
+static inline pmd_t pmd_mkwrite_shstk(pmd_t pmd)
+{
+ pmd = pmd_clear_flags(pmd, _PAGE_RW);
+
+ return pmd_set_flags(pmd, _PAGE_DIRTY);
+}
+
+static inline pmd_t pmd_mkhuge(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_PSE);
+}
+
+static inline pmd_t pmd_mkyoung(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_ACCESSED);
+}
+
+static inline pmd_t pmd_mkwrite_novma(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_RW);
+}
+
+pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
+#define pmd_mkwrite pmd_mkwrite
+
+/* See comments above mksaveddirty_shift() */
+static inline pud_t pud_mksaveddirty(pud_t pud)
+{
+ pudval_t v = native_pud_val(pud);
+
+ v = mksaveddirty_shift(v);
+ return native_make_pud(v);
+}
+
+/* See comments above mksaveddirty_shift() */
+static inline pud_t pud_clear_saveddirty(pud_t pud)
+{
+ pudval_t v = native_pud_val(pud);
+
+ v = clear_saveddirty_shift(v);
+ return native_make_pud(v);
+}
+
+static inline pud_t pud_mkold(pud_t pud)
+{
+ return pud_clear_flags(pud, _PAGE_ACCESSED);
+}
+
+static inline pud_t pud_mkclean(pud_t pud)
+{
+ return pud_clear_flags(pud, _PAGE_DIRTY_BITS);
+}
+
+static inline pud_t pud_wrprotect(pud_t pud)
+{
+ pud = pud_clear_flags(pud, _PAGE_RW);
+
+ /*
+ * Blindly clearing _PAGE_RW might accidentally create
+ * a shadow stack PUD (RW=0, Dirty=1). Move the hardware
+ * dirty value to the software bit.
+ */
+ return pud_mksaveddirty(pud);
+}
+
+static inline pud_t pud_mkdirty(pud_t pud)
+{
+ pud = pud_set_flags(pud, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+
+ return pud_mksaveddirty(pud);
+}
+
+static inline pud_t pud_mkhuge(pud_t pud)
+{
+ return pud_set_flags(pud, _PAGE_PSE);
+}
+
+static inline pud_t pud_mkyoung(pud_t pud)
+{
+ return pud_set_flags(pud, _PAGE_ACCESSED);
+}
+
+static inline pud_t pud_mkwrite(pud_t pud)
+{
+ pud = pud_set_flags(pud, _PAGE_RW);
+
+ return pud_clear_saveddirty(pud);
+}
+
+#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
+static inline int pte_soft_dirty(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_SOFT_DIRTY;
+}
+
+static inline int pmd_soft_dirty(pmd_t pmd)
+{
+ return pmd_flags(pmd) & _PAGE_SOFT_DIRTY;
+}
+
+static inline int pud_soft_dirty(pud_t pud)
+{
+ return pud_flags(pud) & _PAGE_SOFT_DIRTY;
+}
+
+static inline pte_t pte_mksoft_dirty(pte_t pte)
+{
+ return pte_set_flags(pte, _PAGE_SOFT_DIRTY);
+}
+
+static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY);
+}
+
+static inline pud_t pud_mksoft_dirty(pud_t pud)
+{
+ return pud_set_flags(pud, _PAGE_SOFT_DIRTY);
+}
+
+static inline pte_t pte_clear_soft_dirty(pte_t pte)
+{
+ return pte_clear_flags(pte, _PAGE_SOFT_DIRTY);
+}
+
+static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY);
+}
+
+static inline pud_t pud_clear_soft_dirty(pud_t pud)
+{
+ return pud_clear_flags(pud, _PAGE_SOFT_DIRTY);
+}
+
+#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
+
/*
* Mask out unsupported bits in a present pgprot. Non-present pgprots
* can use those bits for other purposes, so leave them be.
@@ -230,58 +717,188 @@ static inline pgprotval_t massage_pgprot(pgprot_t pgprot)
return protval;
}
+static inline pgprotval_t check_pgprot(pgprot_t pgprot)
+{
+ pgprotval_t massaged_val = massage_pgprot(pgprot);
+
+	/* mmdebug.h cannot be included here because of dependencies */
+#ifdef CONFIG_DEBUG_VM
+ WARN_ONCE(pgprot_val(pgprot) != massaged_val,
+ "attempted to set unsupported pgprot: %016llx "
+ "bits: %016llx supported: %016llx\n",
+ (u64)pgprot_val(pgprot),
+ (u64)pgprot_val(pgprot) ^ massaged_val,
+ (u64)__supported_pte_mask);
+#endif
+
+ return massaged_val;
+}
+
static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
{
- return __pte(((phys_addr_t)page_nr << PAGE_SHIFT) |
- massage_pgprot(pgprot));
+ phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT;
+ /* This bit combination is used to mark shadow stacks */
+ WARN_ON_ONCE((pgprot_val(pgprot) & (_PAGE_DIRTY | _PAGE_RW)) ==
+ _PAGE_DIRTY);
+ pfn ^= protnone_mask(pgprot_val(pgprot));
+ pfn &= PTE_PFN_MASK;
+ return __pte(pfn | check_pgprot(pgprot));
}
static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
{
- return __pmd(((phys_addr_t)page_nr << PAGE_SHIFT) |
- massage_pgprot(pgprot));
+ phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT;
+ pfn ^= protnone_mask(pgprot_val(pgprot));
+ pfn &= PHYSICAL_PMD_PAGE_MASK;
+ return __pmd(pfn | check_pgprot(pgprot));
}
+static inline pud_t pfn_pud(unsigned long page_nr, pgprot_t pgprot)
+{
+ phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT;
+ pfn ^= protnone_mask(pgprot_val(pgprot));
+ pfn &= PHYSICAL_PUD_PAGE_MASK;
+ return __pud(pfn | check_pgprot(pgprot));
+}
+
+static inline pmd_t pmd_mkinvalid(pmd_t pmd)
+{
+ return pfn_pmd(pmd_pfn(pmd),
+ __pgprot(pmd_flags(pmd) & ~(_PAGE_PRESENT|_PAGE_PROTNONE)));
+}
+
+static inline pud_t pud_mkinvalid(pud_t pud)
+{
+ return pfn_pud(pud_pfn(pud),
+ __pgprot(pud_flags(pud) & ~(_PAGE_PRESENT|_PAGE_PROTNONE)));
+}
+
+static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask);
+
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
- pteval_t val = pte_val(pte);
+ pteval_t val = pte_val(pte), oldval = val;
+ pte_t pte_result;
/*
* Chop off the NX bit (if present), and add the NX portion of
* the newprot (if present):
*/
val &= _PAGE_CHG_MASK;
- val |= massage_pgprot(newprot) & ~_PAGE_CHG_MASK;
+ val |= check_pgprot(newprot) & ~_PAGE_CHG_MASK;
+ val = flip_protnone_guard(oldval, val, PTE_PFN_MASK);
+
+ pte_result = __pte(val);
+
+ /*
+ * To avoid creating Write=0,Dirty=1 PTEs, pte_modify() needs to avoid:
+ * 1. Marking Write=0 PTEs Dirty=1
+ * 2. Marking Dirty=1 PTEs Write=0
+ *
+ * The first case cannot happen because the _PAGE_CHG_MASK will filter
+ * out any Dirty bit passed in newprot. Handle the second case by
+ * going through the mksaveddirty exercise. Only do this if the old
+ * value was Write=1 to avoid doing this on Shadow Stack PTEs.
+ */
+ if (oldval & _PAGE_RW)
+ pte_result = pte_mksaveddirty(pte_result);
+ else
+ pte_result = pte_clear_saveddirty(pte_result);
+
+ return pte_result;
+}
+
+static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
+{
+ pmdval_t val = pmd_val(pmd), oldval = val;
+ pmd_t pmd_result;
+
+ val &= (_HPAGE_CHG_MASK & ~_PAGE_DIRTY);
+ val |= check_pgprot(newprot) & ~_HPAGE_CHG_MASK;
+ val = flip_protnone_guard(oldval, val, PHYSICAL_PMD_PAGE_MASK);
+
+ pmd_result = __pmd(val);
+
+ /*
+ * Avoid creating shadow stack PMD by accident. See comment in
+ * pte_modify().
+ */
+ if (oldval & _PAGE_RW)
+ pmd_result = pmd_mksaveddirty(pmd_result);
+ else
+ pmd_result = pmd_clear_saveddirty(pmd_result);
+
+ return pmd_result;
+}
+
+static inline pud_t pud_modify(pud_t pud, pgprot_t newprot)
+{
+ pudval_t val = pud_val(pud), oldval = val;
+ pud_t pud_result;
+
+ val &= _HPAGE_CHG_MASK;
+ val |= check_pgprot(newprot) & ~_HPAGE_CHG_MASK;
+ val = flip_protnone_guard(oldval, val, PHYSICAL_PUD_PAGE_MASK);
- return __pte(val);
+ pud_result = __pud(val);
+
+ /*
+ * Avoid creating shadow stack PUD by accident. See comment in
+ * pte_modify().
+ */
+ if (oldval & _PAGE_RW)
+ pud_result = pud_mksaveddirty(pud_result);
+ else
+ pud_result = pud_clear_saveddirty(pud_result);
+
+ return pud_result;
}
-/* mprotect needs to preserve PAT bits when updating vm_page_prot */
+/*
+ * mprotect needs to preserve PAT and encryption bits when updating
+ * vm_page_prot
+ */
#define pgprot_modify pgprot_modify
static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
{
pgprotval_t preservebits = pgprot_val(oldprot) & _PAGE_CHG_MASK;
- pgprotval_t addbits = pgprot_val(newprot);
+ pgprotval_t addbits = pgprot_val(newprot) & ~_PAGE_CHG_MASK;
return __pgprot(preservebits | addbits);
}
-#define pte_pgprot(x) __pgprot(pte_flags(x) & PTE_FLAGS_MASK)
+#define pte_pgprot(x) __pgprot(pte_flags(x))
+#define pmd_pgprot(x) __pgprot(pmd_flags(x))
+#define pud_pgprot(x) __pgprot(pud_flags(x))
+#define p4d_pgprot(x) __pgprot(p4d_flags(x))
#define canon_pgprot(p) __pgprot(massage_pgprot(p))
-static inline int is_new_memtype_allowed(unsigned long flags,
- unsigned long new_flags)
+static inline int is_new_memtype_allowed(u64 paddr, unsigned long size,
+ enum page_cache_mode pcm,
+ enum page_cache_mode new_pcm)
{
/*
+ * PAT type is always WB for untracked ranges, so no need to check.
+ */
+ if (x86_platform.is_untracked_pat_range(paddr, paddr + size))
+ return 1;
+
+ /*
* Certain new memtypes are not allowed with certain
* requested memtype:
* - request is uncached, return cannot be write-back
* - request is write-combine, return cannot be write-back
+ * - request is write-through, return cannot be write-back
+ * - request is write-through, return cannot be write-combine
*/
- if ((flags == _PAGE_CACHE_UC_MINUS &&
- new_flags == _PAGE_CACHE_WB) ||
- (flags == _PAGE_CACHE_WC &&
- new_flags == _PAGE_CACHE_WB)) {
+ if ((pcm == _PAGE_CACHE_MODE_UC_MINUS &&
+ new_pcm == _PAGE_CACHE_MODE_WB) ||
+ (pcm == _PAGE_CACHE_MODE_WC &&
+ new_pcm == _PAGE_CACHE_MODE_WB) ||
+ (pcm == _PAGE_CACHE_MODE_WT &&
+ new_pcm == _PAGE_CACHE_MODE_WB) ||
+ (pcm == _PAGE_CACHE_MODE_WT &&
+ new_pcm == _PAGE_CACHE_MODE_WC)) {
return 0;
}
@@ -290,20 +907,46 @@ static inline int is_new_memtype_allowed(unsigned long flags,
pmd_t *populate_extra_pmd(unsigned long vaddr);
pte_t *populate_extra_pte(unsigned long vaddr);
-#endif /* __ASSEMBLY__ */
+
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
+pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd);
+
+/*
+ * Take a PGD location (pgdp) and a pgd value that needs to be set there.
+ * Populates the user and returns the resulting PGD that must be set in
+ * the kernel copy of the page tables.
+ */
+static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd)
+{
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return pgd;
+ return __pti_set_user_pgtbl(pgdp, pgd);
+}
+#else /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */
+static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd)
+{
+ return pgd;
+}
+#endif /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */
+
+#endif /* __ASSEMBLER__ */
+
#ifdef CONFIG_X86_32
-# include "pgtable_32.h"
+# include <asm/pgtable_32.h>
#else
-# include "pgtable_64.h"
+# include <asm/pgtable_64.h>
#endif
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/mm_types.h>
+#include <linux/mmdebug.h>
+#include <linux/log2.h>
+#include <asm/fixmap.h>
static inline int pte_none(pte_t pte)
{
- return !pte.pte;
+ return !(pte.pte & ~(_PAGE_KNL_ERRATUM_MASK));
}
#define __HAVE_ARCH_PTE_SAME
@@ -312,78 +955,84 @@ static inline int pte_same(pte_t a, pte_t b)
return a.pte == b.pte;
}
+static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr)
+{
+ if (__pte_needs_invert(pte_val(pte)))
+ return __pte(pte_val(pte) - (nr << PFN_PTE_SHIFT));
+ return __pte(pte_val(pte) + (nr << PFN_PTE_SHIFT));
+}
+#define pte_advance_pfn pte_advance_pfn
+
static inline int pte_present(pte_t a)
{
return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
}
-static inline int pte_hidden(pte_t pte)
+#define pte_accessible pte_accessible
+static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
{
- return pte_flags(pte) & _PAGE_HIDDEN;
+ if (pte_flags(a) & _PAGE_PRESENT)
+ return true;
+
+ if ((pte_flags(a) & _PAGE_PROTNONE) &&
+ atomic_read(&mm->tlb_flush_pending))
+ return true;
+
+ return false;
}
static inline int pmd_present(pmd_t pmd)
{
- return pmd_flags(pmd) & _PAGE_PRESENT;
+ /*
+ * Checking for _PAGE_PSE is needed too because
+ * split_huge_page will temporarily clear the present bit (but
+ * the _PAGE_PSE flag will remain set at all times while the
+ * _PAGE_PRESENT bit is clear).
+ */
+ return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE);
}
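The _PAGE_PSE case above is easy to check in isolation. A sketch with the standard x86 bit values taken as assumptions (_PAGE_PRESENT = bit 0, _PAGE_PSE = bit 7, _PAGE_PROTNONE = bit 8): a huge PMD whose present bit has been temporarily cleared by split_huge_page() is still reported present:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define _PAGE_PRESENT  (1ULL << 0)
    #define _PAGE_PSE      (1ULL << 7)
    #define _PAGE_PROTNONE (1ULL << 8)

    static int demo_pmd_present(uint64_t flags)
    {
        return !!(flags & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE));
    }

    int main(void)
    {
        uint64_t huge = _PAGE_PRESENT | _PAGE_PSE;
        uint64_t splitting = huge & ~_PAGE_PRESENT; /* present cleared mid-split */

        assert(demo_pmd_present(splitting) == 1);   /* _PAGE_PSE keeps it "present" */
        assert((splitting & _PAGE_PRESENT) == 0);
        printf("ok: split-in-progress pmd still present\n");
        return 0;
    }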
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * These work without NUMA balancing but the kernel does not care. See the
+ * comment in include/linux/pgtable.h
+ */
+static inline int pte_protnone(pte_t pte)
+{
+ return (pte_flags(pte) & (_PAGE_PROTNONE | _PAGE_PRESENT))
+ == _PAGE_PROTNONE;
+}
+
+static inline int pmd_protnone(pmd_t pmd)
+{
+ return (pmd_flags(pmd) & (_PAGE_PROTNONE | _PAGE_PRESENT))
+ == _PAGE_PROTNONE;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
static inline int pmd_none(pmd_t pmd)
{
/* Only check low word on 32-bit platforms, since it might be
out of sync with upper half. */
- return (unsigned long)native_pmd_val(pmd) == 0;
+ unsigned long val = native_pmd_val(pmd);
+ return (val & ~_PAGE_KNL_ERRATUM_MASK) == 0;
}
static inline unsigned long pmd_page_vaddr(pmd_t pmd)
{
- return (unsigned long)__va(pmd_val(pmd) & PTE_PFN_MASK);
+ return (unsigned long)__va(pmd_val(pmd) & pmd_pfn_mask(pmd));
}
/*
* Currently stuck as a macro due to indirect forward reference to
* linux/mmzone.h's __section_mem_map_addr() definition:
*/
-#define pmd_page(pmd) pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)
-
-/*
- * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD]
- *
- * this macro returns the index of the entry in the pmd page which would
- * control the given virtual address
- */
-static inline unsigned pmd_index(unsigned long address)
-{
- return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1);
-}
-
-/*
- * Conversion functions: convert a page and protection to a page entry,
- * and a page entry and page directory to the page they refer to.
- *
- * (Currently stuck as a macro because of indirect forward reference
- * to linux/mm.h:page_to_nid())
- */
-#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
-
-/*
- * the pte page can be thought of an array like this: pte_t[PTRS_PER_PTE]
- *
- * this function returns the index of the entry in the pte page which would
- * control the given virtual address
- */
-static inline unsigned pte_index(unsigned long address)
-{
- return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
-}
-
-static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
-{
- return (pte_t *)pmd_page_vaddr(*pmd) + pte_index(address);
-}
+#define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd))
static inline int pmd_bad(pmd_t pmd)
{
- return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE;
+ return (pmd_flags(pmd) & ~(_PAGE_USER | _PAGE_ACCESSED)) !=
+ (_KERNPG_TABLE & ~_PAGE_ACCESSED);
}
static inline unsigned long pages_to_mb(unsigned long npg)
@@ -391,13 +1040,10 @@ static inline unsigned long pages_to_mb(unsigned long npg)
return npg >> (20 - PAGE_SHIFT);
}
-#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
- remap_pfn_range(vma, vaddr, pfn, size, prot)
-
-#if PAGETABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2
static inline int pud_none(pud_t pud)
{
- return native_pud_val(pud) == 0;
+ return (native_pud_val(pud) & ~(_PAGE_KNL_ERRATUM_MASK)) == 0;
}
static inline int pud_present(pud_t pud)
@@ -405,48 +1051,72 @@ static inline int pud_present(pud_t pud)
return pud_flags(pud) & _PAGE_PRESENT;
}
-static inline unsigned long pud_page_vaddr(pud_t pud)
+static inline pmd_t *pud_pgtable(pud_t pud)
{
- return (unsigned long)__va((unsigned long)pud_val(pud) & PTE_PFN_MASK);
+ return (pmd_t *)__va(pud_val(pud) & pud_pfn_mask(pud));
}
/*
* Currently stuck as a macro due to indirect forward reference to
* linux/mmzone.h's __section_mem_map_addr() definition:
*/
-#define pud_page(pud) pfn_to_page(pud_val(pud) >> PAGE_SHIFT)
+#define pud_page(pud) pfn_to_page(pud_pfn(pud))
-/* Find an entry in the second-level page table.. */
-static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
+#define pud_leaf pud_leaf
+static inline bool pud_leaf(pud_t pud)
{
- return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address);
+ return pud_val(pud) & _PAGE_PSE;
}
-static inline unsigned long pmd_pfn(pmd_t pmd)
+static inline int pud_bad(pud_t pud)
+{
+ return (pud_flags(pud) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0;
+}
+#endif /* CONFIG_PGTABLE_LEVELS > 2 */
+
+#if CONFIG_PGTABLE_LEVELS > 3
+static inline int p4d_none(p4d_t p4d)
{
- return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT;
+ return (native_p4d_val(p4d) & ~(_PAGE_KNL_ERRATUM_MASK)) == 0;
}
-static inline int pud_large(pud_t pud)
+static inline int p4d_present(p4d_t p4d)
{
- return (pud_val(pud) & (_PAGE_PSE | _PAGE_PRESENT)) ==
- (_PAGE_PSE | _PAGE_PRESENT);
+ return p4d_flags(p4d) & _PAGE_PRESENT;
}
-static inline int pud_bad(pud_t pud)
+static inline pud_t *p4d_pgtable(p4d_t p4d)
{
- return (pud_flags(pud) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0;
+ return (pud_t *)__va(p4d_val(p4d) & p4d_pfn_mask(p4d));
}
-#else
-static inline int pud_large(pud_t pud)
+
+/*
+ * Currently stuck as a macro due to indirect forward reference to
+ * linux/mmzone.h's __section_mem_map_addr() definition:
+ */
+#define p4d_page(p4d) pfn_to_page(p4d_pfn(p4d))
+
+static inline int p4d_bad(p4d_t p4d)
{
- return 0;
+ unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER;
+
+ if (IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION))
+ ignore_flags |= _PAGE_NX;
+
+ return (p4d_flags(p4d) & ~ignore_flags) != 0;
}
-#endif /* PAGETABLE_LEVELS > 2 */
+#endif /* CONFIG_PGTABLE_LEVELS > 3 */
-#if PAGETABLE_LEVELS > 3
+static inline unsigned long p4d_index(unsigned long address)
+{
+ return (address >> P4D_SHIFT) & (PTRS_PER_P4D - 1);
+}
+
+#if CONFIG_PGTABLE_LEVELS > 4
static inline int pgd_present(pgd_t pgd)
{
+ if (!pgtable_l5_enabled())
+ return 1;
return pgd_flags(pgd) & _PAGE_PRESENT;
}
@@ -459,58 +1129,60 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd)
* Currently stuck as a macro due to indirect forward reference to
* linux/mmzone.h's __section_mem_map_addr() definition:
*/
-#define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT)
+#define pgd_page(pgd) pfn_to_page(pgd_pfn(pgd))
/* to find an entry in a page-table-directory. */
-static inline unsigned pud_index(unsigned long address)
+static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
{
- return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1);
-}
-
-static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
-{
- return (pud_t *)pgd_page_vaddr(*pgd) + pud_index(address);
+ if (!pgtable_l5_enabled())
+ return (p4d_t *)pgd;
+ return (p4d_t *)pgd_page_vaddr(*pgd) + p4d_index(address);
}
static inline int pgd_bad(pgd_t pgd)
{
- return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
+ unsigned long ignore_flags = _PAGE_USER;
+
+ if (!pgtable_l5_enabled())
+ return 0;
+
+ if (IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION))
+ ignore_flags |= _PAGE_NX;
+
+ return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
}
static inline int pgd_none(pgd_t pgd)
{
+ if (!pgtable_l5_enabled())
+ return 0;
+ /*
+ * There is no need to do a workaround for the KNL stray
+ * A/D bit erratum here. PGDs only point to page tables
+ * except on 32-bit non-PAE which is not supported on
+ * KNL.
+ */
return !native_pgd_val(pgd);
}
-#endif /* PAGETABLE_LEVELS > 3 */
-
-#endif /* __ASSEMBLY__ */
-
-/*
- * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD]
- *
- * this macro returns the index of the entry in the pgd page which would
- * control the given virtual address
- */
-#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
-
-/*
- * pgd_offset() returns a (pgd_t *)
- * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
- */
-#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
-/*
- * a shortcut which implies the use of the kernel's pgd, instead
- * of a process's
- */
-#define pgd_offset_k(address) pgd_offset(&init_mm, (address))
+#endif /* CONFIG_PGTABLE_LEVELS > 4 */
+#endif /* __ASSEMBLER__ */
#define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET)
#define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
extern int direct_gbpages;
+void init_mem_mapping(void);
+void early_alloc_pgt_buf(void);
+void __init poking_init(void);
+unsigned long init_memory_mapping(unsigned long start,
+ unsigned long end, pgprot_t prot);
+
+#ifdef CONFIG_X86_64
+extern pgd_t trampoline_pgd_entry;
+#endif
/* local pte updates need not use xchg for locking */
static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
@@ -522,29 +1194,35 @@ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
return res;
}
-static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep , pte_t pte)
+static inline pmd_t native_local_pmdp_get_and_clear(pmd_t *pmdp)
{
- native_set_pte(ptep, pte);
+ pmd_t res = *pmdp;
+
+ native_pmd_clear(pmdp);
+ return res;
}
-#ifndef CONFIG_PARAVIRT
-/*
- * Rules for using pte_update - it must be called after any PTE update which
- * has not been done using the set_pte / clear_pte interfaces. It is used by
- * shadow mode hypervisors to resynchronize the shadow page tables. Kernel PTE
- * updates should either be sets, clears, or set_pte_atomic for P->P
- * transitions, which means this hook should only be called for user PTEs.
- * This hook implies a P->P protection or access change has taken place, which
- * requires a subsequent TLB flush. The notification can optionally be delayed
- * until the TLB flush event by using the pte_update_defer form of the
- * interface, but care must be taken to assure that the flush happens while
- * still holding the same page table lock so that the shadow and primary pages
- * do not become out of sync on SMP.
- */
-#define pte_update(mm, addr, ptep) do { } while (0)
-#define pte_update_defer(mm, addr, ptep) do { } while (0)
-#endif
+static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp)
+{
+ pud_t res = *pudp;
+
+ native_pud_clear(pudp);
+ return res;
+}
+
+static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, pmd_t pmd)
+{
+ page_table_check_pmd_set(mm, pmdp, pmd);
+ set_pmd(pmdp, pmd);
+}
+
+static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
+ pud_t *pudp, pud_t pud)
+{
+ page_table_check_pud_set(mm, pudp, pud);
+ native_set_pud(pudp, pud);
+}
/*
* We only update the dirty/accessed state if we set
@@ -573,7 +1251,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
pte_t pte = native_ptep_get_and_clear(ptep);
- pte_update(mm, addr, ptep);
+ page_table_check_pte_clear(mm, pte);
return pte;
}
@@ -589,6 +1267,7 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
* care about updates and native needs no locking
*/
pte = native_local_ptep_get_and_clear(ptep);
+ page_table_check_pte_clear(mm, pte);
} else {
pte = ptep_get_and_clear(mm, addr, ptep);
}
@@ -599,14 +1278,184 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
static inline void ptep_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
- clear_bit(_PAGE_BIT_RW, (unsigned long *)&ptep->pte);
- pte_update(mm, addr, ptep);
+ /*
+ * Avoid accidentally creating shadow stack PTEs
+ * (Write=0,Dirty=1). Use cmpxchg() to prevent races with
+ * the hardware setting Dirty=1.
+ */
+ pte_t old_pte, new_pte;
+
+ old_pte = READ_ONCE(*ptep);
+ do {
+ new_pte = pte_wrprotect(old_pte);
+ } while (!try_cmpxchg((long *)&ptep->pte, (long *)&old_pte, *(long *)&new_pte));
}
+#define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0)
+
+#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
+extern int pmdp_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp,
+ pmd_t entry, int dirty);
+extern int pudp_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pud_t *pudp,
+ pud_t entry, int dirty);
+
+#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
+extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp);
+extern int pudp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pud_t *pudp);
+
+#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
+extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp);
+
+
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp)
+{
+ pmd_t pmd = native_pmdp_get_and_clear(pmdp);
+
+ page_table_check_pmd_clear(mm, pmd);
+
+ return pmd;
+}
+
+#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR
+static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pud_t *pudp)
+{
+ pud_t pud = native_pudp_get_and_clear(pudp);
+
+ page_table_check_pud_clear(mm, pud);
+
+ return pud;
+}
+
+#define __HAVE_ARCH_PMDP_SET_WRPROTECT
+static inline void pmdp_set_wrprotect(struct mm_struct *mm,
+ unsigned long addr, pmd_t *pmdp)
+{
+ /*
+ * Avoid accidentally creating shadow stack PTEs
+ * (Write=0,Dirty=1). Use cmpxchg() to prevent races with
+ * the hardware setting Dirty=1.
+ */
+ pmd_t old_pmd, new_pmd;
+
+ old_pmd = READ_ONCE(*pmdp);
+ do {
+ new_pmd = pmd_wrprotect(old_pmd);
+ } while (!try_cmpxchg((long *)pmdp, (long *)&old_pmd, *(long *)&new_pmd));
+}
+
+#ifndef pmdp_establish
+#define pmdp_establish pmdp_establish
+static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp, pmd_t pmd)
+{
+ page_table_check_pmd_set(vma->vm_mm, pmdp, pmd);
+ if (IS_ENABLED(CONFIG_SMP)) {
+ return xchg(pmdp, pmd);
+ } else {
+ pmd_t old = *pmdp;
+ WRITE_ONCE(*pmdp, pmd);
+ return old;
+ }
+}
+#endif
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static inline pud_t pudp_establish(struct vm_area_struct *vma,
+ unsigned long address, pud_t *pudp, pud_t pud)
+{
+ page_table_check_pud_set(vma->vm_mm, pudp, pud);
+ if (IS_ENABLED(CONFIG_SMP)) {
+ return xchg(pudp, pud);
+ } else {
+ pud_t old = *pudp;
+ WRITE_ONCE(*pudp, pud);
+ return old;
+ }
+}
+#endif
+
+#define __HAVE_ARCH_PMDP_INVALIDATE_AD
+extern pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp);
+
+pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address,
+ pud_t *pudp);
+
+/*
+ * Page table pages are page-aligned. The lower half of the top
+ * level is used for userspace and the top half for the kernel.
+ *
+ * Returns true for parts of the PGD that map userspace and
+ * false for the parts that map the kernel.
+ */
+static inline bool pgdp_maps_userspace(void *__ptr)
+{
+ unsigned long ptr = (unsigned long)__ptr;
+
+ return (((ptr & ~PAGE_MASK) / sizeof(pgd_t)) < PGD_KERNEL_START);
+}
+
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
+/*
+ * All top-level MITIGATION_PAGE_TABLE_ISOLATION page tables are order-1 pages
+ * (8k-aligned and 8k in size). The kernel one is at the beginning 4k and
+ * the user one is in the last 4k. To switch between them, you
+ * just need to flip the 12th bit in their addresses.
+ */
+#define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT
+
+/*
+ * This generates better code than the inline assembly in
+ * __set_bit().
+ */
+static inline void *ptr_set_bit(void *ptr, int bit)
+{
+ unsigned long __ptr = (unsigned long)ptr;
+
+ __ptr |= BIT(bit);
+ return (void *)__ptr;
+}
+static inline void *ptr_clear_bit(void *ptr, int bit)
+{
+ unsigned long __ptr = (unsigned long)ptr;
+
+ __ptr &= ~BIT(bit);
+ return (void *)__ptr;
+}
+
+static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp)
+{
+ return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
+}
+
+static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp)
+{
+ return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
+}
+
+static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp)
+{
+ return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
+}
+
+static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp)
+{
+ return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
+}
+#endif /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */
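A small pointer-arithmetic sketch of the switch described above (assuming PAGE_SHIFT = 12): within the 8k-aligned, order-1 allocation, setting or clearing bit 12 of the PGD pointer hops between the kernel and user copies. The address below is hypothetical:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SHIFT             12
    #define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT

    int main(void)
    {
        /* pretend an 8k-aligned, order-1 allocation at this address */
        uintptr_t kernel_pgd = 0xffff888000044000UL;
        uintptr_t user_pgd   = kernel_pgd | (1UL << PTI_PGTABLE_SWITCH_BIT);

        assert(user_pgd - kernel_pgd == 4096); /* user copy is the second 4k page */
        assert((user_pgd & ~(1UL << PTI_PGTABLE_SWITCH_BIT)) == kernel_pgd);
        printf("kernel %#lx <-> user %#lx\n", (unsigned long)kernel_pgd,
               (unsigned long)user_pgd);
        return 0;
    }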
+
/*
* clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
*
- * dst - pointer to pgd range anwhere on a pgd page
+ * dst - pointer to pgd range anywhere on a pgd page
* src - ""
* count - the number of pgds to copy.
*
@@ -615,11 +1464,282 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
*/
static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{
- memcpy(dst, src, count * sizeof(pgd_t));
+ memcpy(dst, src, count * sizeof(pgd_t));
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return;
+ /* Clone the user space pgd as well */
+ memcpy(kernel_to_user_pgdp(dst), kernel_to_user_pgdp(src),
+ count * sizeof(pgd_t));
+#endif
+}
+
+#define PTE_SHIFT ilog2(PTRS_PER_PTE)
+static inline int page_level_shift(enum pg_level level)
+{
+ return (PAGE_SHIFT - PTE_SHIFT) + level * PTE_SHIFT;
+}
+static inline unsigned long page_level_size(enum pg_level level)
+{
+ return 1UL << page_level_shift(level);
+}
+static inline unsigned long page_level_mask(enum pg_level level)
+{
+ return ~(page_level_size(level) - 1);
+}
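
With PTRS_PER_PTE == 512, PTE_SHIFT is 9 and page_level_shift() evaluates to 3 + 9 * level, giving the familiar x86-64 values of 12, 21 and 30 for 4k, 2M and 1G. A quick standalone check (enum values assumed to match the pg_level enum later in this patch):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PTE_SHIFT  9	/* ilog2(PTRS_PER_PTE) for 512 entries */

enum pg_level { PG_LEVEL_NONE, PG_LEVEL_4K, PG_LEVEL_2M, PG_LEVEL_1G };

static int page_level_shift(enum pg_level level)
{
	return (PAGE_SHIFT - PTE_SHIFT) + level * PTE_SHIFT;
}

int main(void)
{
	printf("4K: %d, 2M: %d, 1G: %d\n",
	       page_level_shift(PG_LEVEL_4K),	/* 12 */
	       page_level_shift(PG_LEVEL_2M),	/* 21 */
	       page_level_shift(PG_LEVEL_1G));	/* 30 */
	return 0;
}
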
+
+/*
+ * The x86 doesn't have any external MMU info: the kernel page
+ * tables contain all the necessary information.
+ */
+static inline void update_mmu_cache(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+}
+static inline void update_mmu_cache_range(struct vm_fault *vmf,
+ struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, unsigned int nr)
+{
+}
+static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmd)
+{
+}
+static inline void update_mmu_cache_pud(struct vm_area_struct *vma,
+ unsigned long addr, pud_t *pud)
+{
+}
+static inline pte_t pte_swp_mkexclusive(pte_t pte)
+{
+ return pte_set_flags(pte, _PAGE_SWP_EXCLUSIVE);
+}
+
+static inline bool pte_swp_exclusive(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_SWP_EXCLUSIVE;
+}
+
+static inline pte_t pte_swp_clear_exclusive(pte_t pte)
+{
+ return pte_clear_flags(pte, _PAGE_SWP_EXCLUSIVE);
+}
+
+#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
+static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
+{
+ return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY);
+}
+
+static inline int pte_swp_soft_dirty(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY;
+}
+
+static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
+{
+ return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY);
+}
+
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_SWP_SOFT_DIRTY);
+}
+
+static inline int pmd_swp_soft_dirty(pmd_t pmd)
+{
+ return pmd_flags(pmd) & _PAGE_SWP_SOFT_DIRTY;
+}
+
+static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_SWP_SOFT_DIRTY);
+}
+#endif
+#endif
+
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
+static inline pte_t pte_swp_mkuffd_wp(pte_t pte)
+{
+ return pte_set_flags(pte, _PAGE_SWP_UFFD_WP);
+}
+
+static inline int pte_swp_uffd_wp(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_SWP_UFFD_WP;
+}
+
+static inline pte_t pte_swp_clear_uffd_wp(pte_t pte)
+{
+ return pte_clear_flags(pte, _PAGE_SWP_UFFD_WP);
+}
+
+static inline pmd_t pmd_swp_mkuffd_wp(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_SWP_UFFD_WP);
+}
+
+static inline int pmd_swp_uffd_wp(pmd_t pmd)
+{
+ return pmd_flags(pmd) & _PAGE_SWP_UFFD_WP;
+}
+
+static inline pmd_t pmd_swp_clear_uffd_wp(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_SWP_UFFD_WP);
}
+#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
+static inline u16 pte_flags_pkey(unsigned long pte_flags)
+{
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+ /* ifdef to avoid doing 59-bit shift on 32-bit values */
+ return (pte_flags & _PAGE_PKEY_MASK) >> _PAGE_BIT_PKEY_BIT0;
+#else
+ return 0;
+#endif
+}
+
+static inline bool __pkru_allows_pkey(u16 pkey, bool write)
+{
+ u32 pkru = read_pkru();
+
+ if (!__pkru_allows_read(pkru, pkey))
+ return false;
+ if (write && !__pkru_allows_write(pkru, pkey))
+ return false;
+
+ return true;
+}
+
+/*
+ * 'pteval' can come from a PTE, PMD or PUD. We only check
+ * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the
+ * same value on all 3 types.
+ */
+static inline bool __pte_access_permitted(unsigned long pteval, bool write)
+{
+ unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER;
+
+ /*
+ * Write=0,Dirty=1 PTEs are shadow stack, which the kernel
+ * shouldn't generally allow access to, but since they
+ * are already Write=0, the below logic covers both cases.
+ */
+ if (write)
+ need_pte_bits |= _PAGE_RW;
+
+ if ((pteval & need_pte_bits) != need_pte_bits)
+ return false;
-#include <asm-generic/pgtable.h>
-#endif /* __ASSEMBLY__ */
+ return __pkru_allows_pkey(pte_flags_pkey(pteval), write);
+}
+
+#define pte_access_permitted pte_access_permitted
+static inline bool pte_access_permitted(pte_t pte, bool write)
+{
+ return __pte_access_permitted(pte_val(pte), write);
+}
+
+#define pmd_access_permitted pmd_access_permitted
+static inline bool pmd_access_permitted(pmd_t pmd, bool write)
+{
+ return __pte_access_permitted(pmd_val(pmd), write);
+}
+
+#define pud_access_permitted pud_access_permitted
+static inline bool pud_access_permitted(pud_t pud, bool write)
+{
+ return __pte_access_permitted(pud_val(pud), write);
+}
+
+#define __HAVE_ARCH_PFN_MODIFY_ALLOWED 1
+extern bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot);
+
+static inline bool arch_has_pfn_modify_check(void)
+{
+ return boot_cpu_has_bug(X86_BUG_L1TF);
+}
+
+#define arch_check_zapped_pte arch_check_zapped_pte
+void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte);
+
+#define arch_check_zapped_pmd arch_check_zapped_pmd
+void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd);
+
+#define arch_check_zapped_pud arch_check_zapped_pud
+void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud);
+
+#ifdef CONFIG_XEN_PV
+#define arch_has_hw_nonleaf_pmd_young arch_has_hw_nonleaf_pmd_young
+static inline bool arch_has_hw_nonleaf_pmd_young(void)
+{
+ return !cpu_feature_enabled(X86_FEATURE_XENPV);
+}
+#endif
+
+#ifdef CONFIG_PAGE_TABLE_CHECK
+static inline bool pte_user_accessible_page(pte_t pte)
+{
+ return (pte_val(pte) & _PAGE_PRESENT) && (pte_val(pte) & _PAGE_USER);
+}
+
+static inline bool pmd_user_accessible_page(pmd_t pmd)
+{
+ return pmd_leaf(pmd) && (pmd_val(pmd) & _PAGE_PRESENT) && (pmd_val(pmd) & _PAGE_USER);
+}
+
+static inline bool pud_user_accessible_page(pud_t pud)
+{
+ return pud_leaf(pud) && (pud_val(pud) & _PAGE_PRESENT) && (pud_val(pud) & _PAGE_USER);
+}
+#endif
+
+#ifdef CONFIG_X86_SGX
+int arch_memory_failure(unsigned long pfn, int flags);
+#define arch_memory_failure arch_memory_failure
+
+bool arch_is_platform_page(u64 paddr);
+#define arch_is_platform_page arch_is_platform_page
+#endif
+
+/*
+ * Use set_p*_safe(), and elide TLB flushing, when confident that *no*
+ * TLB flush will be required as a result of the "set". For example, use
+ * in scenarios where it is known ahead of time that the routine is
+ * setting non-present entries, or re-setting an existing entry to the
+ * same value. Otherwise, use the typical "set" helpers and flush the
+ * TLB.
+ */
+#define set_pte_safe(ptep, pte) \
+({ \
+ WARN_ON_ONCE(pte_present(*ptep) && !pte_same(*ptep, pte)); \
+ set_pte(ptep, pte); \
+})
+
+#define set_pmd_safe(pmdp, pmd) \
+({ \
+ WARN_ON_ONCE(pmd_present(*pmdp) && !pmd_same(*pmdp, pmd)); \
+ set_pmd(pmdp, pmd); \
+})
+
+#define set_pud_safe(pudp, pud) \
+({ \
+ WARN_ON_ONCE(pud_present(*pudp) && !pud_same(*pudp, pud)); \
+ set_pud(pudp, pud); \
+})
+
+#define set_p4d_safe(p4dp, p4d) \
+({ \
+ WARN_ON_ONCE(p4d_present(*p4dp) && !p4d_same(*p4dp, p4d)); \
+ set_p4d(p4dp, p4d); \
+})
+
+#define set_pgd_safe(pgdp, pgd) \
+({ \
+ WARN_ON_ONCE(pgd_present(*pgdp) && !pgd_same(*pgdp, pgd)); \
+ set_pgd(pgdp, pgd); \
+})
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_X86_PGTABLE_H */
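
The *_safe() helpers above turn the caller's assumption, "this set cannot require a TLB flush", into a one-time runtime warning. A sketch of the same assertion pattern outside the kernel, with WARN_ON_ONCE() reduced to an fprintf() and the entry type reduced to a plain integer:

#include <stdint.h>
#include <stdio.h>

#define PRESENT (1ull << 0)

/*
 * Warn when a live entry is replaced by a different value: that is
 * exactly the case that would have needed a TLB flush.
 */
static void set_entry_safe(uint64_t *slot, uint64_t val)
{
	if ((*slot & PRESENT) && *slot != val)
		fprintf(stderr, "warning: replacing live entry %#llx\n",
			(unsigned long long)*slot);
	*slot = val;
}

int main(void)
{
	uint64_t slot = 0;

	set_entry_safe(&slot, 0x1000 | PRESENT);	/* ok: was not present */
	set_entry_safe(&slot, 0x1000 | PRESENT);	/* ok: same value */
	set_entry_safe(&slot, 0x2000 | PRESENT);	/* warns */
	return 0;
}
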
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index 01fd9461d323..b612cc57a4d3 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PGTABLE_32_H
#define _ASM_X86_PGTABLE_32_H
@@ -12,14 +13,12 @@
* This file contains the functions and defines necessary to modify and use
* the i386 page table tree.
*/
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm/processor.h>
-#include <asm/fixmap.h>
#include <linux/threads.h>
#include <asm/paravirt.h>
#include <linux/bitops.h>
-#include <linux/slab.h>
#include <linux/list.h>
#include <linux/spinlock.h>
@@ -27,20 +26,11 @@ struct mm_struct;
struct vm_area_struct;
extern pgd_t swapper_pg_dir[1024];
+extern pgd_t initial_page_table[1024];
+extern pmd_t initial_pg_pmd[];
-static inline void pgtable_cache_init(void) { }
-static inline void check_pgt_cache(void) { }
void paging_init(void);
-
-extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
-
-
-/*
- * Define this if things work differently on an i386 and an i486:
- * it will (on an i486) warn about kernel memory accesses that are
- * done without a 'access_ok(VERIFY_WRITE,..)'
- */
-#undef TEST_ACCESS_OK
+void sync_initial_page_table(void);
#ifdef CONFIG_X86_PAE
# include <asm/pgtable-3level.h>
@@ -48,50 +38,38 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
# include <asm/pgtable-2level.h>
#endif
-#if defined(CONFIG_HIGHPTE)
-#define __KM_PTE \
- (in_nmi() ? KM_NMI_PTE : \
- in_irq() ? KM_IRQ_PTE : \
- KM_PTE0)
-#define pte_offset_map(dir, address) \
- ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), __KM_PTE) + \
- pte_index((address)))
-#define pte_offset_map_nested(dir, address) \
- ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) + \
- pte_index((address)))
-#define pte_unmap(pte) kunmap_atomic((pte), __KM_PTE)
-#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1)
-#else
-#define pte_offset_map(dir, address) \
- ((pte_t *)page_address(pmd_page(*(dir))) + pte_index((address)))
-#define pte_offset_map_nested(dir, address) pte_offset_map((dir), (address))
-#define pte_unmap(pte) do { } while (0)
-#define pte_unmap_nested(pte) do { } while (0)
-#endif
-
/* Clear a kernel PTE and flush it from the TLB */
#define kpte_clear_flush(ptep, vaddr) \
do { \
pte_clear(&init_mm, (vaddr), (ptep)); \
- __flush_tlb_one((vaddr)); \
+ flush_tlb_one_kernel((vaddr)); \
} while (0)
-/*
- * The i386 doesn't have any external MMU info: the kernel page
- * tables contain all the necessary information.
- */
-#define update_mmu_cache(vma, address, pte) do { } while (0)
-
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
/*
- * kern_addr_valid() is (1) for FLATMEM and (0) for
- * SPARSEMEM and DISCONTIGMEM
+ * This is used to calculate the .brk reservation for initial pagetables.
+ * Enough space is reserved to allocate pagetables sufficient to cover all
+ * of LOWMEM_PAGES, which is an upper bound on the size of the direct map of
+ * lowmem.
+ *
+ * With PAE paging (PTRS_PER_PMD > 1), we allocate PTRS_PER_PGD == 4 pages for
+ * the PMDs in addition to the pages required for the last-level pagetables.
*/
-#ifdef CONFIG_FLATMEM
-#define kern_addr_valid(addr) (1)
+#if PTRS_PER_PMD > 1
+#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD)
#else
-#define kern_addr_valid(kaddr) (0)
+#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
#endif
+/*
+ * Number of possible pages in the lowmem region.
+ *
+ * We shift 2 by 31 instead of 1 by 32 to the left in order to avoid a
+ * gas warning about overflowing shift count when gas has been compiled
+ * with only a host target support using a 32-bit type for internal
+ * representation.
+ */
+#define LOWMEM_PAGES ((((_ULL(2)<<31) - __PAGE_OFFSET) >> PAGE_SHIFT))
+
#endif /* _ASM_X86_PGTABLE_32_H */
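
Plugging in the common 3G/1G split (an assumed configuration with __PAGE_OFFSET == 0xC0000000): LOWMEM_PAGES is (2^32 - 0xC0000000) >> 12 = 0x40000 pages, so the .brk reservation is 0x40000 / 1024 = 256 pagetable pages without PAE, and 0x40000 / 512 + 4 = 516 with PAE. A small program reproducing the arithmetic:

#include <stdio.h>

#define PAGE_SHIFT   12
#define PAGE_OFFSET  0xC0000000ul	/* assumed 3G/1G split */
#define LOWMEM_PAGES (((2ull << 31) - PAGE_OFFSET) >> PAGE_SHIFT)

int main(void)
{
	unsigned long long nonpae = LOWMEM_PAGES / 1024;     /* PTRS_PER_PGD */
	unsigned long long pae    = LOWMEM_PAGES / 512 + 4;  /* PTRS_PER_PMD + PTRS_PER_PGD */

	printf("lowmem pages: %#llx\n", (unsigned long long)LOWMEM_PAGES);
	printf("brk pagetable pages: non-PAE %llu, PAE %llu\n", nonpae, pae);
	return 0;
}
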
diff --git a/arch/x86/include/asm/pgtable_32_areas.h b/arch/x86/include/asm/pgtable_32_areas.h
new file mode 100644
index 000000000000..921148b42967
--- /dev/null
+++ b/arch/x86/include/asm/pgtable_32_areas.h
@@ -0,0 +1,53 @@
+#ifndef _ASM_X86_PGTABLE_32_AREAS_H
+#define _ASM_X86_PGTABLE_32_AREAS_H
+
+#include <asm/cpu_entry_area.h>
+
+/*
+ * Just any arbitrary offset to the start of the vmalloc VM area: the
+ * current 8MB value just means that there will be a 8MB "hole" after the
+ * physical memory until the kernel virtual memory starts. That means that
+ * any out-of-bounds memory accesses will hopefully be caught.
+ * The vmalloc() routines leave a hole of 4kB between each vmalloced
+ * area for the same reason. ;)
+ */
+#define VMALLOC_OFFSET (8 * 1024 * 1024)
+
+#ifndef __ASSEMBLER__
+extern bool __vmalloc_start_set; /* set once high_memory is set */
+#endif
+
+#define VMALLOC_START ((unsigned long)high_memory + VMALLOC_OFFSET)
+#ifdef CONFIG_X86_PAE
+#define LAST_PKMAP 512
+#else
+#define LAST_PKMAP 1024
+#endif
+
+#define CPU_ENTRY_AREA_PAGES (NR_CPUS * DIV_ROUND_UP(sizeof(struct cpu_entry_area), PAGE_SIZE))
+
+/* The +1 is for the readonly IDT page: */
+#define CPU_ENTRY_AREA_BASE \
+ ((FIXADDR_TOT_START - PAGE_SIZE*(CPU_ENTRY_AREA_PAGES+1)) & PMD_MASK)
+
+#define LDT_BASE_ADDR \
+ ((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK)
+
+#define LDT_END_ADDR (LDT_BASE_ADDR + PMD_SIZE)
+
+#define PKMAP_BASE \
+ ((LDT_BASE_ADDR - PAGE_SIZE) & PMD_MASK)
+
+#ifdef CONFIG_HIGHMEM
+# define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE)
+#else
+# define VMALLOC_END (LDT_BASE_ADDR - 2 * PAGE_SIZE)
+#endif
+
+#define MODULES_VADDR VMALLOC_START
+#define MODULES_END VMALLOC_END
+#define MODULES_LEN (MODULES_VADDR - MODULES_END)
+
+#define MAXMEM (VMALLOC_END - PAGE_OFFSET - __VMALLOC_RESERVE)
+
+#endif /* _ASM_X86_PGTABLE_32_AREAS_H */
diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h
index 5e67c1532314..5356a46b0373 100644
--- a/arch/x86/include/asm/pgtable_32_types.h
+++ b/arch/x86/include/asm/pgtable_32_types.h
@@ -1,5 +1,6 @@
-#ifndef _ASM_X86_PGTABLE_32_DEFS_H
-#define _ASM_X86_PGTABLE_32_DEFS_H
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PGTABLE_32_TYPES_H
+#define _ASM_X86_PGTABLE_32_TYPES_H
/*
* The Linux x86 paging architecture is 'compile-time dual-mode', it
@@ -14,42 +15,9 @@
# include <asm/pgtable-2level_types.h>
#endif
+#define pgtable_l5_enabled() 0
+
#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
#define PGDIR_MASK (~(PGDIR_SIZE - 1))
-/* Just any arbitrary offset to the start of the vmalloc VM area: the
- * current 8MB value just means that there will be a 8MB "hole" after the
- * physical memory until the kernel virtual memory starts. That means that
- * any out-of-bounds memory accesses will hopefully be caught.
- * The vmalloc() routines leaves a hole of 4kB between each vmalloced
- * area for the same reason. ;)
- */
-#define VMALLOC_OFFSET (8 * 1024 * 1024)
-
-#ifndef __ASSEMBLER__
-extern bool __vmalloc_start_set; /* set once high_memory is set */
-#endif
-
-#define VMALLOC_START ((unsigned long)high_memory + VMALLOC_OFFSET)
-#ifdef CONFIG_X86_PAE
-#define LAST_PKMAP 512
-#else
-#define LAST_PKMAP 1024
-#endif
-
-#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE * (LAST_PKMAP + 1)) \
- & PMD_MASK)
-
-#ifdef CONFIG_HIGHMEM
-# define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE)
-#else
-# define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE)
-#endif
-
-#define MODULES_VADDR VMALLOC_START
-#define MODULES_END VMALLOC_END
-#define MODULES_LEN (MODULES_VADDR - MODULES_END)
-
-#define MAXMEM (VMALLOC_END - PAGE_OFFSET - __VMALLOC_RESERVE)
-
-#endif /* _ASM_X86_PGTABLE_32_DEFS_H */
+#endif /* _ASM_X86_PGTABLE_32_TYPES_H */
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index c57a30117149..f06e5d6a2747 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -1,10 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PGTABLE_64_H
#define _ASM_X86_PGTABLE_64_H
#include <linux/const.h>
#include <asm/pgtable_64_types.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/*
* This file contains the functions and defines necessary to modify and use
@@ -13,50 +14,76 @@
#include <asm/processor.h>
#include <linux/bitops.h>
#include <linux/threads.h>
+#include <asm/fixmap.h>
+extern p4d_t level4_kernel_pgt[512];
+extern p4d_t level4_ident_pgt[512];
extern pud_t level3_kernel_pgt[512];
extern pud_t level3_ident_pgt[512];
extern pmd_t level2_kernel_pgt[512];
extern pmd_t level2_fixmap_pgt[512];
extern pmd_t level2_ident_pgt[512];
-extern pgd_t init_level4_pgt[];
+extern pte_t level1_fixmap_pgt[512 * FIXMAP_PMD_NUM];
+extern pgd_t init_top_pgt[];
-#define swapper_pg_dir init_level4_pgt
+#define swapper_pg_dir init_top_pgt
extern void paging_init(void);
+static inline void sync_initial_page_table(void) { }
#define pte_ERROR(e) \
- printk("%s:%d: bad pte %p(%016lx).\n", \
+ pr_err("%s:%d: bad pte %p(%016lx)\n", \
__FILE__, __LINE__, &(e), pte_val(e))
#define pmd_ERROR(e) \
- printk("%s:%d: bad pmd %p(%016lx).\n", \
+ pr_err("%s:%d: bad pmd %p(%016lx)\n", \
__FILE__, __LINE__, &(e), pmd_val(e))
#define pud_ERROR(e) \
- printk("%s:%d: bad pud %p(%016lx).\n", \
+ pr_err("%s:%d: bad pud %p(%016lx)\n", \
__FILE__, __LINE__, &(e), pud_val(e))
+
+#define p4d_ERROR(e) \
+ pr_err("%s:%d: bad p4d %p(%016lx)\n", \
+ __FILE__, __LINE__, &(e), p4d_val(e))
+
#define pgd_ERROR(e) \
- printk("%s:%d: bad pgd %p(%016lx).\n", \
+ pr_err("%s:%d: bad pgd %p(%016lx)\n", \
__FILE__, __LINE__, &(e), pgd_val(e))
struct mm_struct;
+#define mm_p4d_folded mm_p4d_folded
+static inline bool mm_p4d_folded(struct mm_struct *mm)
+{
+ return !pgtable_l5_enabled();
+}
+
+void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte);
void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);
+static inline void native_set_pte(pte_t *ptep, pte_t pte)
+{
+ WRITE_ONCE(*ptep, pte);
+}
static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
- *ptep = native_make_pte(0);
+ native_set_pte(ptep, native_make_pte(0));
}
-static inline void native_set_pte(pte_t *ptep, pte_t pte)
+static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
{
- *ptep = pte;
+ native_set_pte(ptep, pte);
}
-static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
+static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
{
- native_set_pte(ptep, pte);
+ WRITE_ONCE(*pmdp, pmd);
+}
+
+static inline void native_pmd_clear(pmd_t *pmd)
+{
+ native_set_pmd(pmd, native_make_pmd(0));
}
static inline pte_t native_ptep_get_and_clear(pte_t *xp)
@@ -72,19 +99,22 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp)
#endif
}
-static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
+static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
{
- *pmdp = pmd;
-}
-
-static inline void native_pmd_clear(pmd_t *pmd)
-{
- native_set_pmd(pmd, native_make_pmd(0));
+#ifdef CONFIG_SMP
+ return native_make_pmd(xchg(&xp->pmd, 0));
+#else
+ /* native_local_pmdp_get_and_clear,
+ * but duplicated because of cyclic dependency
+ */
+ pmd_t ret = *xp;
+ native_pmd_clear(xp);
+ return ret;
+#endif
}
static inline void native_set_pud(pud_t *pudp, pud_t pud)
{
- *pudp = pud;
+ WRITE_ONCE(*pudp, pud);
}
static inline void native_pud_clear(pud_t *pud)
@@ -92,9 +122,44 @@ static inline void native_pud_clear(pud_t *pud)
native_set_pud(pud, native_make_pud(0));
}
+static inline pud_t native_pudp_get_and_clear(pud_t *xp)
+{
+#ifdef CONFIG_SMP
+ return native_make_pud(xchg(&xp->pud, 0));
+#else
+ /* native_local_pudp_get_and_clear,
+ * but duplicated because of cyclic dependency
+ */
+ pud_t ret = *xp;
+
+ native_pud_clear(xp);
+ return ret;
+#endif
+}
+
+static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
+{
+ pgd_t pgd;
+
+ if (pgtable_l5_enabled() ||
+ !IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION)) {
+ WRITE_ONCE(*p4dp, p4d);
+ return;
+ }
+
+ pgd = native_make_pgd(native_p4d_val(p4d));
+ pgd = pti_set_user_pgtbl((pgd_t *)p4dp, pgd);
+ WRITE_ONCE(*p4dp, native_make_p4d(native_pgd_val(pgd)));
+}
+
+static inline void native_p4d_clear(p4d_t *p4d)
+{
+ native_set_p4d(p4d, native_make_p4d(0));
+}
+
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
- *pgdp = pgd;
+ WRITE_ONCE(*pgdp, pti_set_user_pgtbl(pgdp, pgd));
}
static inline void native_pgd_clear(pgd_t *pgd)
@@ -107,59 +172,78 @@ static inline void native_pgd_clear(pgd_t *pgd)
* and a page entry and page directory to the page they refer to.
*/
-/*
- * Level 4 access.
- */
-static inline int pgd_large(pgd_t pgd) { return 0; }
-#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
+/* PGD - Level 4 access */
-/* PUD - Level3 access */
+/* PUD - Level 3 access */
-/* PMD - Level 2 access */
-#define pte_to_pgoff(pte) ((pte_val((pte)) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
-#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | \
- _PAGE_FILE })
-#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
+/* PMD - Level 2 access */
-/* PTE - Level 1 access. */
+/* PTE - Level 1 access */
-/* x86-64 always has all page tables mapped. */
-#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
-#define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address))
-#define pte_unmap(pte) /* NOP */
-#define pte_unmap_nested(pte) /* NOP */
+/*
+ * Encode and de-code a swap entry
+ *
+ * |     ...            | 11| 10|  9|8|7|6|5| 4| 3|2| 1|0| <- bit number
+ * |     ...            |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names
+ * | TYPE (59-63) | ~OFFSET (9-58)  |0|0|X|X| X| E|F|SD|0| <- swp entry
+ *
+ * G (8) is aliased and used as a PROT_NONE indicator for
+ * !present ptes. We need to start storing swap entries above
+ * there. We also need to avoid using A and D because of an
+ * erratum where they can be incorrectly set by hardware on
+ * non-present PTEs.
+ *
+ * Bits 1-4 are not used in the non-present format and are available for
+ * special use described below:
+ *
+ * SD (1) in swp entry is used to store soft dirty bit, which helps us
+ * remember soft dirty over page migration
+ *
+ * F (2) in swp entry is used to record when a pagetable is
+ * writeprotected by userfaultfd WP support.
+ *
+ * E (3) in swp entry is used to remember PG_anon_exclusive.
+ *
+ * Bit 7 in swp entry should be 0 because pmd_present checks not only P,
+ * but also L and G.
+ *
+ * The offset is inverted by a binary not operation to make the high
+ * physical bits set.
+ */
+#define SWP_TYPE_BITS 5
-#define update_mmu_cache(vma, address, pte) do { } while (0)
+#define SWP_OFFSET_FIRST_BIT (_PAGE_BIT_PROTNONE + 1)
-/* Encode and de-code a swap entry */
-#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
-#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
-#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
-#else
-#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1)
-#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1)
-#endif
+/* We always extract/encode the offset by shifting it all the way up, and then down again */
+#define SWP_OFFSET_SHIFT (SWP_OFFSET_FIRST_BIT+SWP_TYPE_BITS)
#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
-#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \
- & ((1U << SWP_TYPE_BITS) - 1))
-#define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT)
-#define __swp_entry(type, offset) ((swp_entry_t) { \
- ((type) << (_PAGE_BIT_PRESENT + 1)) \
- | ((offset) << SWP_OFFSET_SHIFT) })
+/* Extract the high bits for type */
+#define __swp_type(x) ((x).val >> (64 - SWP_TYPE_BITS))
+
+/* Shift up (to get rid of type), then down to get value */
+#define __swp_offset(x) (~(x).val << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT)
+
+/*
+ * Shift the offset up "too far" by TYPE bits, then down again
+ * The offset is inverted by a binary not operation to make the high
+ * physical bits set.
+ */
+#define __swp_entry(type, offset) ((swp_entry_t) { \
+ (~(unsigned long)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \
+ | ((unsigned long)(type) << (64-SWP_TYPE_BITS)) })
+
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) })
-#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
+#define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val((pmd)) })
+#define __swp_entry_to_pte(x) (__pte((x).val))
+#define __swp_entry_to_pmd(x) (__pmd((x).val))
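
A userspace model of the layout above: the five type bits sit at the top of the 64-bit word, and the offset is stored bitwise-inverted in bits 9..58, so a non-present swap entry always has high "physical" bits set (useful against L1TF). The constants mirror the macros above; this is a sketch, not the kernel code:

#include <stdint.h>
#include <stdio.h>
#include <assert.h>

#define SWP_TYPE_BITS        5
#define SWP_OFFSET_FIRST_BIT 9	/* _PAGE_BIT_PROTNONE + 1 */
#define SWP_OFFSET_SHIFT     (SWP_OFFSET_FIRST_BIT + SWP_TYPE_BITS)

static uint64_t swp_entry(uint64_t type, uint64_t offset)
{
	/* Shift the inverted offset up "too far" by TYPE bits, then down. */
	return (~offset << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) |
	       (type << (64 - SWP_TYPE_BITS));
}

static uint64_t swp_type(uint64_t e)   { return e >> (64 - SWP_TYPE_BITS); }
static uint64_t swp_offset(uint64_t e) { return ~e << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT; }

int main(void)
{
	uint64_t e = swp_entry(3, 0x12345);

	assert(swp_type(e) == 3 && swp_offset(e) == 0x12345);
	/* The inverted offset makes the middle bits mostly ones: */
	printf("entry = %#llx\n", (unsigned long long)e);
	return 0;
}
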
-extern int kern_addr_valid(unsigned long addr);
extern void cleanup_highmap(void);
#define HAVE_ARCH_UNMAPPED_AREA
#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
-#define pgtable_cache_init() do { } while (0)
-#define check_pgt_cache() do { } while (0)
-
#define PAGE_AGP PAGE_KERNEL_NOCACHE
#define HAVE_PAGE_AGP 1
@@ -168,6 +252,42 @@ extern void cleanup_highmap(void);
#define kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK)
#define __HAVE_ARCH_PTE_SAME
-#endif /* !__ASSEMBLY__ */
+#define vmemmap ((struct page *)VMEMMAP_START)
+
+extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
+extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
+
+#define gup_fast_permitted gup_fast_permitted
+static inline bool gup_fast_permitted(unsigned long start, unsigned long end)
+{
+ if (end >> __VIRTUAL_MASK_SHIFT)
+ return false;
+ return true;
+}
+
+#include <asm/pgtable-invert.h>
+
+#else /* __ASSEMBLER__ */
+
+#define l4_index(x) (((x) >> 39) & 511)
+#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
+
+L4_PAGE_OFFSET = l4_index(__PAGE_OFFSET_BASE_L4)
+L4_START_KERNEL = l4_index(__START_KERNEL_map)
+
+L3_START_KERNEL = pud_index(__START_KERNEL_map)
+
+#define SYM_DATA_START_PAGE_ALIGNED(name) \
+ SYM_START(name, SYM_L_GLOBAL, .balign PAGE_SIZE)
+
+/* Automate the creation of 1 to 1 mapping pmd entries */
+#define PMDS(START, PERM, COUNT) \
+ i = 0 ; \
+ .rept (COUNT) ; \
+ .quad (START) + (i << PMD_SHIFT) + (PERM) ; \
+ i = i + 1 ; \
+ .endr
+
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_X86_PGTABLE_64_H */
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 766ea16fbbbd..7eb61ef6a185 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -1,8 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PGTABLE_64_DEFS_H
#define _ASM_X86_PGTABLE_64_DEFS_H
-#ifndef __ASSEMBLY__
+#include <asm/sparsemem.h>
+
+#ifndef __ASSEMBLER__
#include <linux/types.h>
+#include <asm/kaslr.h>
/*
* These are used to make use of C type-checking..
@@ -10,23 +14,54 @@
typedef unsigned long pteval_t;
typedef unsigned long pmdval_t;
typedef unsigned long pudval_t;
+typedef unsigned long p4dval_t;
typedef unsigned long pgdval_t;
typedef unsigned long pgprotval_t;
typedef struct { pteval_t pte; } pte_t;
+typedef struct { pmdval_t pmd; } pmd_t;
+
+extern unsigned int __pgtable_l5_enabled;
+
+#ifdef USE_EARLY_PGTABLE_L5
+/*
+ * cpu_feature_enabled() is not available in early boot code.
+ * Use variable instead.
+ */
+static inline bool pgtable_l5_enabled(void)
+{
+ return __pgtable_l5_enabled;
+}
+#else
+#define pgtable_l5_enabled() cpu_feature_enabled(X86_FEATURE_LA57)
+#endif /* USE_EARLY_PGTABLE_L5 */
+
+#define ARCH_PAGE_TABLE_SYNC_MASK \
+ (pgtable_l5_enabled() ? PGTBL_PGD_MODIFIED : PGTBL_P4D_MODIFIED)
-#endif /* !__ASSEMBLY__ */
+extern unsigned int pgdir_shift;
+extern unsigned int ptrs_per_p4d;
-#define SHARED_KERNEL_PMD 0
-#define PAGETABLE_LEVELS 4
+#endif /* !__ASSEMBLER__ */
/*
* PGDIR_SHIFT determines what a top-level page table entry can map
*/
-#define PGDIR_SHIFT 39
+#define PGDIR_SHIFT pgdir_shift
#define PTRS_PER_PGD 512
/*
+ * 4th level page in 5-level paging case
+ */
+#define P4D_SHIFT 39
+#define MAX_PTRS_PER_P4D 512
+#define PTRS_PER_P4D ptrs_per_p4d
+#define P4D_SIZE (_AC(1, UL) << P4D_SHIFT)
+#define P4D_MASK (~(P4D_SIZE - 1))
+
+#define MAX_POSSIBLE_PHYSMEM_BITS 52
+
+/*
* 3rd level page
*/
#define PUD_SHIFT 30
@@ -51,13 +86,113 @@ typedef struct { pteval_t pte; } pte_t;
#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
#define PGDIR_MASK (~(PGDIR_SIZE - 1))
-/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
-#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
-#define VMALLOC_START _AC(0xffffc90000000000, UL)
-#define VMALLOC_END _AC(0xffffe8ffffffffff, UL)
-#define VMEMMAP_START _AC(0xffffea0000000000, UL)
-#define MODULES_VADDR _AC(0xffffffffa0000000, UL)
-#define MODULES_END _AC(0xffffffffff000000, UL)
-#define MODULES_LEN (MODULES_END - MODULES_VADDR)
+/*
+ * See Documentation/arch/x86/x86_64/mm.rst for a description of the memory map.
+ *
+ * Be very careful vs. KASLR when changing anything here. The KASLR address
+ * range must not overlap with anything except the KASAN shadow area, which
+ * is correct as KASAN disables KASLR.
+ */
+#define MAXMEM (1UL << MAX_PHYSMEM_BITS)
+
+#define GUARD_HOLE_PGD_ENTRY -256UL
+#define GUARD_HOLE_SIZE (16UL << PGDIR_SHIFT)
+#define GUARD_HOLE_BASE_ADDR (GUARD_HOLE_PGD_ENTRY << PGDIR_SHIFT)
+#define GUARD_HOLE_END_ADDR (GUARD_HOLE_BASE_ADDR + GUARD_HOLE_SIZE)
+
+#define LDT_PGD_ENTRY -240UL
+#define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
+#define LDT_END_ADDR (LDT_BASE_ADDR + PGDIR_SIZE)
+
+#define __VMALLOC_BASE_L4 0xffffc90000000000UL
+#define __VMALLOC_BASE_L5 0xffa0000000000000UL
+
+#define VMALLOC_SIZE_TB_L4 32UL
+#define VMALLOC_SIZE_TB_L5 12800UL
+
+#define __VMEMMAP_BASE_L4 0xffffea0000000000UL
+#define __VMEMMAP_BASE_L5 0xffd4000000000000UL
+
+# define VMALLOC_START vmalloc_base
+# define VMALLOC_SIZE_TB (pgtable_l5_enabled() ? VMALLOC_SIZE_TB_L5 : VMALLOC_SIZE_TB_L4)
+# define VMEMMAP_START vmemmap_base
+
+#ifdef CONFIG_RANDOMIZE_MEMORY
+# define DIRECT_MAP_PHYSMEM_END direct_map_physmem_end
+#endif
+
+/*
+ * End of the region for which vmalloc page tables are pre-allocated.
+ * For non-KMSAN builds, this is the same as VMALLOC_END.
+ * For KMSAN builds, VMALLOC_START..VMEMORY_END is 4 times bigger than
+ * VMALLOC_START..VMALLOC_END (see below).
+ */
+#define VMEMORY_END (VMALLOC_START + (VMALLOC_SIZE_TB << 40) - 1)
+
+#ifndef CONFIG_KMSAN
+#define VMALLOC_END VMEMORY_END
+#else
+/*
+ * In KMSAN builds the vmalloc area is four times smaller, and the remaining 3/4
+ * are used to keep the metadata for virtual pages. The memory formerly
+ * belonging to vmalloc area is now laid out as follows:
+ *
+ * 1st quarter: VMALLOC_START to VMALLOC_END - new vmalloc area
+ * 2nd quarter: KMSAN_VMALLOC_SHADOW_START to
+ * VMALLOC_END+KMSAN_VMALLOC_SHADOW_OFFSET - vmalloc area shadow
+ * 3rd quarter: KMSAN_VMALLOC_ORIGIN_START to
+ * VMALLOC_END+KMSAN_VMALLOC_ORIGIN_OFFSET - vmalloc area origins
+ * 4th quarter: KMSAN_MODULES_SHADOW_START to KMSAN_MODULES_ORIGIN_START
+ * - shadow for modules,
+ * KMSAN_MODULES_ORIGIN_START to
+ * KMSAN_MODULES_ORIGIN_START + MODULES_LEN - origins for modules.
+ */
+#define VMALLOC_QUARTER_SIZE ((VMALLOC_SIZE_TB << 40) >> 2)
+#define VMALLOC_END (VMALLOC_START + VMALLOC_QUARTER_SIZE - 1)
+
+/*
+ * vmalloc metadata addresses are calculated by adding shadow/origin offsets
+ * to vmalloc address.
+ */
+#define KMSAN_VMALLOC_SHADOW_OFFSET VMALLOC_QUARTER_SIZE
+#define KMSAN_VMALLOC_ORIGIN_OFFSET (VMALLOC_QUARTER_SIZE << 1)
+
+#define KMSAN_VMALLOC_SHADOW_START (VMALLOC_START + KMSAN_VMALLOC_SHADOW_OFFSET)
+#define KMSAN_VMALLOC_ORIGIN_START (VMALLOC_START + KMSAN_VMALLOC_ORIGIN_OFFSET)
+
+/*
+ * The shadow/origin for modules are placed one by one in the last 1/4 of
+ * vmalloc space.
+ */
+#define KMSAN_MODULES_SHADOW_START (VMALLOC_END + KMSAN_VMALLOC_ORIGIN_OFFSET + 1)
+#define KMSAN_MODULES_ORIGIN_START (KMSAN_MODULES_SHADOW_START + MODULES_LEN)
+#endif /* CONFIG_KMSAN */
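
Worked numbers for the default 4-level, non-randomized layout (VMALLOC_SIZE_TB_L4 == 32, with VMALLOC_START assumed to equal __VMALLOC_BASE_L4): each quarter is 8 TiB, so VMALLOC_END is start + 8 TiB - 1, the shadow begins 8 TiB above the start and the origins 16 TiB above it. A quick check:

#include <stdio.h>

#define TB(x) ((unsigned long long)(x) << 40)

int main(void)
{
	unsigned long long vmalloc_start = 0xffffc90000000000ull; /* assumed, no KASLR */
	unsigned long long quarter = TB(32) >> 2;		  /* 8 TiB */

	printf("VMALLOC_END  %#llx\n", vmalloc_start + quarter - 1);
	printf("shadow start %#llx\n", vmalloc_start + quarter);
	printf("origin start %#llx\n", vmalloc_start + 2 * quarter);
	return 0;
}
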
+
+#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
+/* The module sections end at the start of the fixmap */
+#ifndef CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP
+# define MODULES_END _AC(0xffffffffff000000, UL)
+#else
+# define MODULES_END _AC(0xfffffffffe000000, UL)
+#endif
+#define MODULES_LEN (MODULES_END - MODULES_VADDR)
+
+#define ESPFIX_PGD_ENTRY _AC(-2, UL)
+#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT)
+
+#define CPU_ENTRY_AREA_PGD _AC(-4, UL)
+#define CPU_ENTRY_AREA_BASE (CPU_ENTRY_AREA_PGD << P4D_SHIFT)
+
+#define EFI_VA_START ( -4 * (_AC(1, UL) << 30))
+#define EFI_VA_END (-68 * (_AC(1, UL) << 30))
+
+#define EARLY_DYNAMIC_PAGE_TABLES 64
+
+#define PGD_KERNEL_START ((PAGE_SIZE / 2) / sizeof(pgd_t))
+
+/*
+ * We borrow bit 3 to remember PG_anon_exclusive.
+ */
+#define _PAGE_SWP_EXCLUSIVE _PAGE_PWT
#endif /* _ASM_X86_PGTABLE_64_DEFS_H */
diff --git a/arch/x86/include/asm/pgtable_areas.h b/arch/x86/include/asm/pgtable_areas.h
new file mode 100644
index 000000000000..4f056fb88174
--- /dev/null
+++ b/arch/x86/include/asm/pgtable_areas.h
@@ -0,0 +1,22 @@
+#ifndef _ASM_X86_PGTABLE_AREAS_H
+#define _ASM_X86_PGTABLE_AREAS_H
+
+#ifdef CONFIG_X86_32
+# include <asm/pgtable_32_areas.h>
+#endif
+
+/* Single page reserved for the readonly IDT mapping: */
+#define CPU_ENTRY_AREA_RO_IDT CPU_ENTRY_AREA_BASE
+#define CPU_ENTRY_AREA_PER_CPU (CPU_ENTRY_AREA_RO_IDT + PAGE_SIZE)
+
+#define CPU_ENTRY_AREA_RO_IDT_VADDR ((void *)CPU_ENTRY_AREA_RO_IDT)
+
+#ifdef CONFIG_X86_32
+#define CPU_ENTRY_AREA_MAP_SIZE (CPU_ENTRY_AREA_PER_CPU + \
+ (CPU_ENTRY_AREA_SIZE * NR_CPUS) - \
+ CPU_ENTRY_AREA_BASE)
+#else
+#define CPU_ENTRY_AREA_MAP_SIZE P4D_SIZE
+#endif
+
+#endif /* _ASM_X86_PGTABLE_AREAS_H */
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 54cb697f4900..2ec250ba467e 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -1,10 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PGTABLE_DEFS_H
#define _ASM_X86_PGTABLE_DEFS_H
#include <linux/const.h>
-#include <asm/page_types.h>
+#include <linux/mem_encrypt.h>
-#define FIRST_USER_ADDRESS 0
+#include <asm/page_types.h>
#define _PAGE_BIT_PRESENT 0 /* is present */
#define _PAGE_BIT_RW 1 /* writeable */
@@ -16,19 +17,36 @@
#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
#define _PAGE_BIT_PAT 7 /* on 4KB pages */
#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
-#define _PAGE_BIT_UNUSED1 9 /* available for programmer */
-#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */
-#define _PAGE_BIT_HIDDEN 11 /* hidden by kmemcheck */
+#define _PAGE_BIT_SOFTW1 9 /* available for programmer */
+#define _PAGE_BIT_SOFTW2 10 /* " */
+#define _PAGE_BIT_SOFTW3 11 /* " */
#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
-#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
-#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1
-#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
+#define _PAGE_BIT_SOFTW4 57 /* available for programmer */
+#define _PAGE_BIT_SOFTW5 58 /* available for programmer */
+#define _PAGE_BIT_PKEY_BIT0 59 /* Protection Keys, bit 1/4 */
+#define _PAGE_BIT_PKEY_BIT1 60 /* Protection Keys, bit 2/4 */
+#define _PAGE_BIT_PKEY_BIT2 61 /* Protection Keys, bit 3/4 */
+#define _PAGE_BIT_PKEY_BIT3 62 /* Protection Keys, bit 4/4 */
+#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
+
+#define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1
+#define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1
+#define _PAGE_BIT_UFFD_WP _PAGE_BIT_SOFTW2 /* userfaultfd wrprotected */
+#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
+#define _PAGE_BIT_KERNEL_4K _PAGE_BIT_SOFTW3 /* page must not be converted to large */
+
+#ifdef CONFIG_X86_64
+#define _PAGE_BIT_SAVED_DIRTY _PAGE_BIT_SOFTW5 /* Saved Dirty bit (leaf) */
+#define _PAGE_BIT_NOPTISHADOW _PAGE_BIT_SOFTW5 /* No PTI shadow (root PGD) */
+#else
+/* Shared with _PAGE_BIT_UFFD_WP which is not supported on 32 bit */
+#define _PAGE_BIT_SAVED_DIRTY _PAGE_BIT_SOFTW2 /* Saved Dirty bit (leaf) */
+#define _PAGE_BIT_NOPTISHADOW _PAGE_BIT_SOFTW2 /* No PTI shadow (root PGD) */
+#endif
/* If _PAGE_BIT_PRESENT is clear, we use these: */
/* - if the user mapped it with PROT_NONE; pte_present gives true */
#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
-/* - set: nonlinear file mapping, saved PTE; unset:swap */
-#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY
#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW)
@@ -39,118 +57,208 @@
#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
-#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
-#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
+#define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
+#define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
+#define _PAGE_SOFTW3 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW3)
#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
-#define __HAVE_ARCH_PTE_SPECIAL
+#define _PAGE_KERNEL_4K (_AT(pteval_t, 1) << _PAGE_BIT_KERNEL_4K)
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+#define _PAGE_PKEY_BIT0 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT0)
+#define _PAGE_PKEY_BIT1 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT1)
+#define _PAGE_PKEY_BIT2 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT2)
+#define _PAGE_PKEY_BIT3 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT3)
+#else
+#define _PAGE_PKEY_BIT0 (_AT(pteval_t, 0))
+#define _PAGE_PKEY_BIT1 (_AT(pteval_t, 0))
+#define _PAGE_PKEY_BIT2 (_AT(pteval_t, 0))
+#define _PAGE_PKEY_BIT3 (_AT(pteval_t, 0))
+#endif
-#ifdef CONFIG_KMEMCHECK
-#define _PAGE_HIDDEN (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN)
+#define _PAGE_PKEY_MASK (_PAGE_PKEY_BIT0 | \
+ _PAGE_PKEY_BIT1 | \
+ _PAGE_PKEY_BIT2 | \
+ _PAGE_PKEY_BIT3)
+
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+#define _PAGE_KNL_ERRATUM_MASK (_PAGE_DIRTY | _PAGE_ACCESSED)
+#else
+#define _PAGE_KNL_ERRATUM_MASK 0
+#endif
+
+#ifdef CONFIG_MEM_SOFT_DIRTY
+#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY)
#else
-#define _PAGE_HIDDEN (_AT(pteval_t, 0))
+#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0))
+#endif
+
+/*
+ * Tracking the soft dirty bit when a page goes to swap is tricky.
+ * We need a bit which can be stored in the pte _and_ not conflict
+ * with the swap entry format. On x86 bits 1-4 are *not* involved
+ * in swap entry computation, but bit 7 is used for THP migration,
+ * so we borrow bit 1 for soft dirty tracking.
+ *
+ * Please note that this bit must be treated as a swap dirty page
+ * mark if and only if the PTE/PMD has the present bit clear!
+ */
+#ifdef CONFIG_MEM_SOFT_DIRTY
+#define _PAGE_SWP_SOFT_DIRTY _PAGE_RW
+#else
+#define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0))
+#endif
+
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
+#define _PAGE_UFFD_WP (_AT(pteval_t, 1) << _PAGE_BIT_UFFD_WP)
+#define _PAGE_SWP_UFFD_WP _PAGE_USER
+#else
+#define _PAGE_UFFD_WP (_AT(pteval_t, 0))
+#define _PAGE_SWP_UFFD_WP (_AT(pteval_t, 0))
#endif
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
+#define _PAGE_SOFTW4 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW4)
#else
#define _PAGE_NX (_AT(pteval_t, 0))
+#define _PAGE_SOFTW4 (_AT(pteval_t, 0))
#endif
-#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
+/*
+ * The hardware requires shadow stack to be Write=0,Dirty=1. However,
+ * there are valid cases where the kernel might create read-only PTEs that
+ * are dirty (e.g., fork(), mprotect(), uffd-wp(), soft-dirty tracking). In
+ * this case, the _PAGE_SAVED_DIRTY bit is used instead of the HW-dirty bit,
+ * to avoid creating wrong "shadow stack" PTEs. Such PTEs have
+ * (Write=0,SavedDirty=1,Dirty=0) set.
+ */
+#define _PAGE_SAVED_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SAVED_DIRTY)
+
+#define _PAGE_DIRTY_BITS (_PAGE_DIRTY | _PAGE_SAVED_DIRTY)
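
Conceptually, write-protecting a dirty PTE must therefore park the Dirty bit in SavedDirty rather than merely clearing Write, or the result would alias the hardware shadow-stack format. A hedged sketch of that transformation (bit positions taken from this header; the real pte_wrprotect() only does this when shadow stacks are enabled):

#include <stdint.h>
#include <stdio.h>

#define _PAGE_RW          (1ull << 1)
#define _PAGE_DIRTY       (1ull << 6)
#define _PAGE_SAVED_DIRTY (1ull << 58)	/* _PAGE_BIT_SOFTW5 on 64-bit */

/*
 * Clear Write; if the entry was dirty, park Dirty in SavedDirty so the
 * result is never Write=0,Dirty=1, i.e. hardware shadow-stack format.
 */
static uint64_t wrprotect(uint64_t pte)
{
	if (pte & _PAGE_DIRTY)
		pte = (pte & ~_PAGE_DIRTY) | _PAGE_SAVED_DIRTY;
	return pte & ~_PAGE_RW;
}

int main(void)
{
	uint64_t pte = _PAGE_RW | _PAGE_DIRTY | 0x1000;

	printf("%#llx\n", (unsigned long long)wrprotect(pte));
	return 0;
}
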
+
#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
-#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
- _PAGE_ACCESSED | _PAGE_DIRTY)
-#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
- _PAGE_DIRTY)
-
-/* Set of bits not changed in pte_modify */
-#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
- _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
-
-#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT)
-#define _PAGE_CACHE_WB (0)
-#define _PAGE_CACHE_WC (_PAGE_PWT)
-#define _PAGE_CACHE_UC_MINUS (_PAGE_PCD)
-#define _PAGE_CACHE_UC (_PAGE_PCD | _PAGE_PWT)
-
-#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
-#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
- _PAGE_ACCESSED | _PAGE_NX)
-
-#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | \
- _PAGE_USER | _PAGE_ACCESSED)
-#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
- _PAGE_ACCESSED | _PAGE_NX)
-#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
- _PAGE_ACCESSED)
-#define PAGE_COPY PAGE_COPY_NOEXEC
-#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | \
- _PAGE_ACCESSED | _PAGE_NX)
-#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
- _PAGE_ACCESSED)
-
-#define __PAGE_KERNEL_EXEC \
- (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_GLOBAL)
-#define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX)
-
-#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
-#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
-#define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT)
-#define __PAGE_KERNEL_WC (__PAGE_KERNEL | _PAGE_CACHE_WC)
-#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
-#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD)
-#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
-#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
-#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
-#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE)
-#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
-
-#define __PAGE_KERNEL_IO (__PAGE_KERNEL | _PAGE_IOMAP)
-#define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE | _PAGE_IOMAP)
-#define __PAGE_KERNEL_IO_UC_MINUS (__PAGE_KERNEL_UC_MINUS | _PAGE_IOMAP)
-#define __PAGE_KERNEL_IO_WC (__PAGE_KERNEL_WC | _PAGE_IOMAP)
-
-#define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
-#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
-#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
-#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX)
-#define PAGE_KERNEL_WC __pgprot(__PAGE_KERNEL_WC)
-#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
-#define PAGE_KERNEL_UC_MINUS __pgprot(__PAGE_KERNEL_UC_MINUS)
-#define PAGE_KERNEL_EXEC_NOCACHE __pgprot(__PAGE_KERNEL_EXEC_NOCACHE)
-#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
-#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE)
-#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
-#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL)
-#define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE)
-
-#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO)
-#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE)
-#define PAGE_KERNEL_IO_UC_MINUS __pgprot(__PAGE_KERNEL_IO_UC_MINUS)
-#define PAGE_KERNEL_IO_WC __pgprot(__PAGE_KERNEL_IO_WC)
-
-/* xwr */
-#define __P000 PAGE_NONE
-#define __P001 PAGE_READONLY
-#define __P010 PAGE_COPY
-#define __P011 PAGE_COPY
-#define __P100 PAGE_READONLY_EXEC
-#define __P101 PAGE_READONLY_EXEC
-#define __P110 PAGE_COPY_EXEC
-#define __P111 PAGE_COPY_EXEC
-
-#define __S000 PAGE_NONE
-#define __S001 PAGE_READONLY
-#define __S010 PAGE_SHARED
-#define __S011 PAGE_SHARED
-#define __S100 PAGE_READONLY_EXEC
-#define __S101 PAGE_READONLY_EXEC
-#define __S110 PAGE_SHARED_EXEC
-#define __S111 PAGE_SHARED_EXEC
+#define _PAGE_NOPTISHADOW (_AT(pteval_t, 1) << _PAGE_BIT_NOPTISHADOW)
+
+/*
+ * Set of bits not changed in pte_modify. The pte's
+ * protection key is treated like _PAGE_RW, for
+ * instance, and is *not* included in this mask since
+ * pte_modify() does modify it.
+ */
+#define _COMMON_PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
+ _PAGE_SPECIAL | _PAGE_ACCESSED | \
+ _PAGE_DIRTY_BITS | _PAGE_SOFT_DIRTY | \
+ _PAGE_CC | _PAGE_UFFD_WP)
+#define _PAGE_CHG_MASK (_COMMON_PAGE_CHG_MASK | _PAGE_PAT)
+#define _HPAGE_CHG_MASK (_COMMON_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_PAT_LARGE)
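
_PAGE_CHG_MASK is the contract behind pte_modify(): bits inside the mask (PFN, caching, accessed/dirty, soft-dirty, uffd-wp) survive a protection change, while everything outside it is replaced by the new pgprot. Roughly, as a sketch of the shape rather than the exact kernel implementation (which also handles SavedDirty and PROT_NONE inversion):

#include <stdint.h>

/*
 * Keep the bits in chg_mask from the old entry; take everything else
 * (the protection bits) from the new pgprot value.
 */
static inline uint64_t modify_prot(uint64_t pteval, uint64_t newprot,
				   uint64_t chg_mask)
{
	return (pteval & chg_mask) | (newprot & ~chg_mask);
}
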
+
+/*
+ * The cache modes defined here are used to translate between pure SW usage
+ * and the HW defined cache mode bits and/or PAT entries.
+ *
+ * The resulting bits for PWT, PCD and PAT should be chosen in a way
+ * to have the WB mode at index 0 (all bits clear). This is the default
+ * right now and likely would break too much if changed.
+ */
+#ifndef __ASSEMBLER__
+enum page_cache_mode {
+ _PAGE_CACHE_MODE_WB = 0,
+ _PAGE_CACHE_MODE_WC = 1,
+ _PAGE_CACHE_MODE_UC_MINUS = 2,
+ _PAGE_CACHE_MODE_UC = 3,
+ _PAGE_CACHE_MODE_WT = 4,
+ _PAGE_CACHE_MODE_WP = 5,
+
+ _PAGE_CACHE_MODE_NUM = 8
+};
+#endif
+
+#define _PAGE_CC (_AT(pteval_t, cc_get_mask()))
+#define _PAGE_ENC (_AT(pteval_t, sme_me_mask))
+
+#define _PAGE_CACHE_MASK (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)
+#define _PAGE_LARGE_CACHE_MASK (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT_LARGE)
+
+#define _PAGE_NOCACHE (cachemode2protval(_PAGE_CACHE_MODE_UC))
+#define _PAGE_CACHE_WP (cachemode2protval(_PAGE_CACHE_MODE_WP))
+
+#define __PP _PAGE_PRESENT
+#define __RW _PAGE_RW
+#define _USR _PAGE_USER
+#define ___A _PAGE_ACCESSED
+#define ___D _PAGE_DIRTY
+#define ___G _PAGE_GLOBAL
+#define __NX _PAGE_NX
+
+#define _ENC _PAGE_ENC
+#define __WP _PAGE_CACHE_WP
+#define __NC _PAGE_NOCACHE
+#define _PSE _PAGE_PSE
+
+#define pgprot_val(x) ((x).pgprot)
+#define __pgprot(x) ((pgprot_t) { (x) } )
+#define __pg(x) __pgprot(x)
+
+#define PAGE_NONE __pg( 0| 0| 0|___A| 0| 0| 0|___G)
+#define PAGE_SHARED __pg(__PP|__RW|_USR|___A|__NX| 0| 0| 0)
+#define PAGE_SHARED_EXEC __pg(__PP|__RW|_USR|___A| 0| 0| 0| 0)
+#define PAGE_COPY_NOEXEC __pg(__PP| 0|_USR|___A|__NX| 0| 0| 0)
+#define PAGE_COPY_EXEC __pg(__PP| 0|_USR|___A| 0| 0| 0| 0)
+#define PAGE_COPY __pg(__PP| 0|_USR|___A|__NX| 0| 0| 0)
+#define PAGE_READONLY __pg(__PP| 0|_USR|___A|__NX| 0| 0| 0)
+#define PAGE_READONLY_EXEC __pg(__PP| 0|_USR|___A| 0| 0| 0| 0)
+
+/*
+ * Page tables need to have Write=1 in order for any lower PTEs to be
+ * writable. This includes shadow stack memory (Write=0, Dirty=1).
+ */
+#define _KERNPG_TABLE_NOENC (__PP|__RW| 0|___A| 0|___D| 0| 0)
+#define _KERNPG_TABLE (__PP|__RW| 0|___A| 0|___D| 0| 0| _ENC)
+#define _PAGE_TABLE_NOENC (__PP|__RW|_USR|___A| 0|___D| 0| 0)
+#define _PAGE_TABLE (__PP|__RW|_USR|___A| 0|___D| 0| 0| _ENC)
+
+#define __PAGE_KERNEL_RO (__PP| 0| 0|___A|__NX| 0| 0|___G)
+#define __PAGE_KERNEL_ROX (__PP| 0| 0|___A| 0| 0| 0|___G)
+#define __PAGE_KERNEL (__PP|__RW| 0|___A|__NX|___D| 0|___G)
+#define __PAGE_KERNEL_EXEC (__PP|__RW| 0|___A| 0|___D| 0|___G)
+#define __PAGE_KERNEL_NOCACHE (__PP|__RW| 0|___A|__NX|___D| 0|___G| __NC)
+#define __PAGE_KERNEL_VVAR (__PP| 0|_USR|___A|__NX| 0| 0|___G)
+#define __PAGE_KERNEL_LARGE (__PP|__RW| 0|___A|__NX|___D|_PSE|___G)
+#define __PAGE_KERNEL_LARGE_EXEC (__PP|__RW| 0|___A| 0|___D|_PSE|___G)
+#define __PAGE_KERNEL_WP (__PP|__RW| 0|___A|__NX|___D| 0|___G| __WP)
+
+
+#define __PAGE_KERNEL_IO __PAGE_KERNEL
+#define __PAGE_KERNEL_IO_NOCACHE __PAGE_KERNEL_NOCACHE
+
+
+#ifndef __ASSEMBLER__
+
+#define __PAGE_KERNEL_ENC (__PAGE_KERNEL | _ENC)
+#define __PAGE_KERNEL_ENC_WP (__PAGE_KERNEL_WP | _ENC)
+#define __PAGE_KERNEL_NOENC (__PAGE_KERNEL | 0)
+#define __PAGE_KERNEL_NOENC_WP (__PAGE_KERNEL_WP | 0)
+
+#define __pgprot_mask(x) __pgprot((x) & __default_kernel_pte_mask)
+
+#define PAGE_KERNEL __pgprot_mask(__PAGE_KERNEL | _ENC)
+#define PAGE_KERNEL_NOENC __pgprot_mask(__PAGE_KERNEL | 0)
+#define PAGE_KERNEL_RO __pgprot_mask(__PAGE_KERNEL_RO | _ENC)
+#define PAGE_KERNEL_EXEC __pgprot_mask(__PAGE_KERNEL_EXEC | _ENC)
+#define PAGE_KERNEL_EXEC_NOENC __pgprot_mask(__PAGE_KERNEL_EXEC | 0)
+#define PAGE_KERNEL_ROX __pgprot_mask(__PAGE_KERNEL_ROX | _ENC)
+#define PAGE_KERNEL_NOCACHE __pgprot_mask(__PAGE_KERNEL_NOCACHE | _ENC)
+#define PAGE_KERNEL_LARGE __pgprot_mask(__PAGE_KERNEL_LARGE | _ENC)
+#define PAGE_KERNEL_LARGE_EXEC __pgprot_mask(__PAGE_KERNEL_LARGE_EXEC | _ENC)
+#define PAGE_KERNEL_VVAR __pgprot_mask(__PAGE_KERNEL_VVAR | _ENC)
+
+#define PAGE_KERNEL_IO __pgprot_mask(__PAGE_KERNEL_IO)
+#define PAGE_KERNEL_IO_NOCACHE __pgprot_mask(__PAGE_KERNEL_IO_NOCACHE)
+
+#endif /* __ASSEMBLER__ */
/*
* early identity mapping pte attrib macros.
@@ -158,44 +266,71 @@
#ifdef CONFIG_X86_64
#define __PAGE_KERNEL_IDENT_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC
#else
-/*
- * For PDE_IDENT_ATTR include USER bit. As the PDE and PTE protection
- * bits are combined, this will alow user to access the high address mapped
- * VDSO in the presence of CONFIG_COMPAT_VDSO
- */
#define PTE_IDENT_ATTR 0x003 /* PRESENT+RW */
-#define PDE_IDENT_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */
+#define PDE_IDENT_ATTR 0x063 /* PRESENT+RW+DIRTY+ACCESSED */
#define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */
#endif
#ifdef CONFIG_X86_32
-# include "pgtable_32_types.h"
+# include <asm/pgtable_32_types.h>
#else
-# include "pgtable_64_types.h"
+# include <asm/pgtable_64_types.h>
#endif
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
-/* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */
+/* Extracts the PFN from a (pte|pmd|pud|pgd)val_t of a 4KB page */
#define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK)
-/* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */
+/*
+ * Extracts the flags from a (pte|pmd|pud|pgd)val_t
+ * This includes the protection key value.
+ */
#define PTE_FLAGS_MASK (~PTE_PFN_MASK)
typedef struct pgprot { pgprotval_t pgprot; } pgprot_t;
typedef struct { pgdval_t pgd; } pgd_t;
+static inline pgprot_t pgprot_nx(pgprot_t prot)
+{
+ return __pgprot(pgprot_val(prot) | _PAGE_NX);
+}
+#define pgprot_nx pgprot_nx
+
+#ifdef CONFIG_X86_PAE
+
+/*
+ * PHYSICAL_PAGE_MASK might be non-constant when SME is compiled in, so we can't
+ * use it here.
+ */
+
+#define PGD_PAE_PAGE_MASK ((signed long)PAGE_MASK)
+#define PGD_PAE_PHYS_MASK (((1ULL << __PHYSICAL_MASK_SHIFT)-1) & PGD_PAE_PAGE_MASK)
+
+/*
+ * PAE allows Base Address, P, PWT, PCD and AVL bits to be set in PGD entries.
+ * All other bits are Reserved MBZ
+ */
+#define PGD_ALLOWED_BITS (PGD_PAE_PHYS_MASK | _PAGE_PRESENT | \
+ _PAGE_PWT | _PAGE_PCD | \
+ _PAGE_SOFTW1 | _PAGE_SOFTW2 | _PAGE_SOFTW3)
+
+#else
+/* No need to mask any bits for !PAE */
+#define PGD_ALLOWED_BITS (~0ULL)
+#endif
+
static inline pgd_t native_make_pgd(pgdval_t val)
{
- return (pgd_t) { val };
+ return (pgd_t) { val & PGD_ALLOWED_BITS };
}
static inline pgdval_t native_pgd_val(pgd_t pgd)
{
- return pgd.pgd;
+ return pgd.pgd & PGD_ALLOWED_BITS;
}
static inline pgdval_t pgd_flags(pgd_t pgd)
@@ -203,7 +338,33 @@ static inline pgdval_t pgd_flags(pgd_t pgd)
return native_pgd_val(pgd) & PTE_FLAGS_MASK;
}
-#if PAGETABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 4
+typedef struct { p4dval_t p4d; } p4d_t;
+
+static inline p4d_t native_make_p4d(pudval_t val)
+{
+ return (p4d_t) { val };
+}
+
+static inline p4dval_t native_p4d_val(p4d_t p4d)
+{
+ return p4d.p4d;
+}
+#else
+#include <asm-generic/pgtable-nop4d.h>
+
+static inline p4d_t native_make_p4d(pudval_t val)
+{
+ return (p4d_t) { .pgd = native_make_pgd((pgdval_t)val) };
+}
+
+static inline p4dval_t native_p4d_val(p4d_t p4d)
+{
+ return native_pgd_val(p4d.pgd);
+}
+#endif
+
+#if CONFIG_PGTABLE_LEVELS > 3
typedef struct { pudval_t pud; } pud_t;
static inline pud_t native_make_pud(pmdval_t val)
@@ -218,18 +379,21 @@ static inline pudval_t native_pud_val(pud_t pud)
#else
#include <asm-generic/pgtable-nopud.h>
+static inline pud_t native_make_pud(pudval_t val)
+{
+ return (pud_t) { .p4d.pgd = native_make_pgd(val) };
+}
+
static inline pudval_t native_pud_val(pud_t pud)
{
- return native_pgd_val(pud.pgd);
+ return native_pgd_val(pud.p4d.pgd);
}
#endif
-#if PAGETABLE_LEVELS > 2
-typedef struct { pmdval_t pmd; } pmd_t;
-
+#if CONFIG_PGTABLE_LEVELS > 2
static inline pmd_t native_make_pmd(pmdval_t val)
{
- return (pmd_t) { val };
+ return (pmd_t) { .pmd = val };
}
static inline pmdval_t native_pmd_val(pmd_t pmd)
@@ -239,20 +403,67 @@ static inline pmdval_t native_pmd_val(pmd_t pmd)
#else
#include <asm-generic/pgtable-nopmd.h>
+static inline pmd_t native_make_pmd(pmdval_t val)
+{
+ return (pmd_t) { .pud.p4d.pgd = native_make_pgd(val) };
+}
+
static inline pmdval_t native_pmd_val(pmd_t pmd)
{
- return native_pgd_val(pmd.pud.pgd);
+ return native_pgd_val(pmd.pud.p4d.pgd);
}
#endif
+static inline p4dval_t p4d_pfn_mask(p4d_t p4d)
+{
+ /* No 512 GiB huge pages yet */
+ return PTE_PFN_MASK;
+}
+
+static inline p4dval_t p4d_flags_mask(p4d_t p4d)
+{
+ return ~p4d_pfn_mask(p4d);
+}
+
+static inline p4dval_t p4d_flags(p4d_t p4d)
+{
+ return native_p4d_val(p4d) & p4d_flags_mask(p4d);
+}
+
+static inline pudval_t pud_pfn_mask(pud_t pud)
+{
+ if (native_pud_val(pud) & _PAGE_PSE)
+ return PHYSICAL_PUD_PAGE_MASK;
+ else
+ return PTE_PFN_MASK;
+}
+
+static inline pudval_t pud_flags_mask(pud_t pud)
+{
+ return ~pud_pfn_mask(pud);
+}
+
static inline pudval_t pud_flags(pud_t pud)
{
- return native_pud_val(pud) & PTE_FLAGS_MASK;
+ return native_pud_val(pud) & pud_flags_mask(pud);
+}
+
+static inline pmdval_t pmd_pfn_mask(pmd_t pmd)
+{
+ if (native_pmd_val(pmd) & _PAGE_PSE)
+ return PHYSICAL_PMD_PAGE_MASK;
+ else
+ return PTE_PFN_MASK;
+}
+
+static inline pmdval_t pmd_flags_mask(pmd_t pmd)
+{
+ return ~pmd_pfn_mask(pmd);
}
static inline pmdval_t pmd_flags(pmd_t pmd)
{
- return native_pmd_val(pmd) & PTE_FLAGS_MASK;
+ return native_pmd_val(pmd) & pmd_flags_mask(pmd);
}
static inline pte_t native_make_pte(pteval_t val)
@@ -270,18 +481,49 @@ static inline pteval_t pte_flags(pte_t pte)
return native_pte_val(pte) & PTE_FLAGS_MASK;
}
-#define pgprot_val(x) ((x).pgprot)
-#define __pgprot(x) ((pgprot_t) { (x) } )
+#define __pte2cm_idx(cb) \
+ ((((cb) >> (_PAGE_BIT_PAT - 2)) & 4) | \
+ (((cb) >> (_PAGE_BIT_PCD - 1)) & 2) | \
+ (((cb) >> _PAGE_BIT_PWT) & 1))
+#define __cm_idx2pte(i) \
+ ((((i) & 4) << (_PAGE_BIT_PAT - 2)) | \
+ (((i) & 2) << (_PAGE_BIT_PCD - 1)) | \
+ (((i) & 1) << _PAGE_BIT_PWT))
+
+unsigned long cachemode2protval(enum page_cache_mode pcm);
+
+static inline pgprotval_t protval_4k_2_large(pgprotval_t val)
+{
+ return (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) |
+ ((val & _PAGE_PAT) << (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT));
+}
+static inline pgprot_t pgprot_4k_2_large(pgprot_t pgprot)
+{
+ return __pgprot(protval_4k_2_large(pgprot_val(pgprot)));
+}
+static inline pgprotval_t protval_large_2_4k(pgprotval_t val)
+{
+ return (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) |
+ ((val & _PAGE_PAT_LARGE) >>
+ (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT));
+}
+static inline pgprot_t pgprot_large_2_4k(pgprot_t pgprot)
+{
+ return __pgprot(protval_large_2_4k(pgprot_val(pgprot)));
+}
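
The PAT index of an entry is assembled from three non-adjacent page-table bits: PWT into index bit 0, PCD into bit 1, and PAT into bit 2, where the PAT bit lives at position 7 for 4k entries and 12 for large ones, hence the conversion helpers above. A small demonstration of the packing, using the bit positions from this header:

#include <stdio.h>

#define _PAGE_BIT_PWT 3
#define _PAGE_BIT_PCD 4
#define _PAGE_BIT_PAT 7

/* Same packing as __pte2cm_idx() above. */
static unsigned pte2cm_idx(unsigned long cb)
{
	return ((cb >> (_PAGE_BIT_PAT - 2)) & 4) |
	       ((cb >> (_PAGE_BIT_PCD - 1)) & 2) |
	       ((cb >> _PAGE_BIT_PWT) & 1);
}

int main(void)
{
	/* PCD|PWT set, PAT clear -> index 3 (UC in the default PAT layout) */
	unsigned long flags = (1ul << _PAGE_BIT_PCD) | (1ul << _PAGE_BIT_PWT);

	printf("PAT index: %u\n", pte2cm_idx(flags));
	return 0;
}
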
typedef struct page *pgtable_t;
extern pteval_t __supported_pte_mask;
-extern int nx_enabled;
+extern pteval_t __default_kernel_pte_mask;
#define pgprot_writecombine pgprot_writecombine
extern pgprot_t pgprot_writecombine(pgprot_t prot);
+#define pgprot_writethrough pgprot_writethrough
+extern pgprot_t pgprot_writethrough(pgprot_t prot);
+
/* Indicate that x86 has its own track and untrack pfn vma functions */
#define __HAVE_PFNMAP_TRACKING
@@ -289,28 +531,23 @@ extern pgprot_t pgprot_writecombine(pgprot_t prot);
struct file;
pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
unsigned long size, pgprot_t vma_prot);
-int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
- unsigned long size, pgprot_t *vma_prot);
/* Install a pte for a particular vaddr in kernel space. */
void set_pte_vaddr(unsigned long vaddr, pte_t pte);
#ifdef CONFIG_X86_32
-extern void native_pagetable_setup_start(pgd_t *base);
-extern void native_pagetable_setup_done(pgd_t *base);
+extern void native_pagetable_init(void);
#else
-static inline void native_pagetable_setup_start(pgd_t *base) {}
-static inline void native_pagetable_setup_done(pgd_t *base) {}
+#define native_pagetable_init paging_init
#endif
-struct seq_file;
-extern void arch_report_meminfo(struct seq_file *m);
-
-enum {
+enum pg_level {
PG_LEVEL_NONE,
PG_LEVEL_4K,
PG_LEVEL_2M,
PG_LEVEL_1G,
+ PG_LEVEL_512G,
+ PG_LEVEL_256T,
PG_LEVEL_NUM
};
@@ -327,7 +564,18 @@ static inline void update_page_count(int level, unsigned long pages) { }
* as a pte too.
*/
extern pte_t *lookup_address(unsigned long address, unsigned int *level);
-
-#endif /* !__ASSEMBLY__ */
+extern pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
+ unsigned int *level);
+pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address,
+ unsigned int *level, bool *nx, bool *rw);
+extern pmd_t *lookup_pmd_address(unsigned long address);
+extern phys_addr_t slow_virt_to_phys(void *__address);
+extern int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn,
+ unsigned long address,
+ unsigned numpages,
+ unsigned long page_flags);
+extern int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address,
+ unsigned long numpages);
+#endif /* !__ASSEMBLER__ */
#endif /* _ASM_X86_PGTABLE_DEFS_H */
diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h
new file mode 100644
index 000000000000..2e6c04d8a45b
--- /dev/null
+++ b/arch/x86/include/asm/pkeys.h
@@ -0,0 +1,126 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PKEYS_H
+#define _ASM_X86_PKEYS_H
+
+/*
+ * If more than 16 keys are ever supported, a thorough audit
+ * will be necessary to ensure that the types that store key
+ * numbers and masks have sufficient capacity.
+ */
+#define arch_max_pkey() (cpu_feature_enabled(X86_FEATURE_OSPKE) ? 16 : 1)
+
+extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
+ unsigned long init_val);
+
+static inline bool arch_pkeys_enabled(void)
+{
+ return cpu_feature_enabled(X86_FEATURE_OSPKE);
+}
+
+/*
+ * Try to dedicate one of the protection keys to be used as an
+ * execute-only protection key.
+ */
+extern int __execute_only_pkey(struct mm_struct *mm);
+static inline int execute_only_pkey(struct mm_struct *mm)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
+ return ARCH_DEFAULT_PKEY;
+
+ return __execute_only_pkey(mm);
+}
+
+extern int __arch_override_mprotect_pkey(struct vm_area_struct *vma,
+ int prot, int pkey);
+static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma,
+ int prot, int pkey)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
+ return 0;
+
+ return __arch_override_mprotect_pkey(vma, prot, pkey);
+}
+
+#define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | VM_PKEY_BIT3)
+
+#define mm_pkey_allocation_map(mm) (mm->context.pkey_allocation_map)
+#define mm_set_pkey_allocated(mm, pkey) do { \
+ mm_pkey_allocation_map(mm) |= (1U << pkey); \
+} while (0)
+#define mm_set_pkey_free(mm, pkey) do { \
+ mm_pkey_allocation_map(mm) &= ~(1U << pkey); \
+} while (0)
+
+static inline
+bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey)
+{
+ /*
+ * "Allocated" pkeys are those that have been returned
+ * from pkey_alloc() or pkey 0 which is allocated
+ * implicitly when the mm is created.
+ */
+ if (pkey < 0)
+ return false;
+ if (pkey >= arch_max_pkey())
+ return false;
+ /*
+ * The exec-only pkey is set in the allocation map, but
+ * is not available to any of the user interfaces like
+ * mprotect_pkey().
+ */
+ if (pkey == mm->context.execute_only_pkey)
+ return false;
+
+ return mm_pkey_allocation_map(mm) & (1U << pkey);
+}
+
+/*
+ * Returns a positive, 4-bit key on success, or -1 on failure.
+ */
+static inline
+int mm_pkey_alloc(struct mm_struct *mm)
+{
+ /*
+ * Note: this is the one and only place we make sure
+ * that the pkey is valid as far as the hardware is
+ * concerned. The rest of the kernel trusts that
+ * only good, valid pkeys come out of here.
+ */
+ u16 all_pkeys_mask = ((1U << arch_max_pkey()) - 1);
+ int ret;
+
+ /*
+ * Are we out of pkeys? We must handle this specially
+ * because ffz() behavior is undefined if there are no
+ * zeros.
+ */
+ if (mm_pkey_allocation_map(mm) == all_pkeys_mask)
+ return -1;
+
+ ret = ffz(mm_pkey_allocation_map(mm));
+
+ mm_set_pkey_allocated(mm, ret);
+
+ return ret;
+}
+
+static inline
+int mm_pkey_free(struct mm_struct *mm, int pkey)
+{
+ if (!mm_pkey_is_allocated(mm, pkey))
+ return -EINVAL;
+
+ mm_set_pkey_free(mm, pkey);
+
+ return 0;
+}
+
+static inline int vma_pkey(struct vm_area_struct *vma)
+{
+ unsigned long vma_pkey_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 |
+ VM_PKEY_BIT2 | VM_PKEY_BIT3;
+
+ return (vma->vm_flags & vma_pkey_mask) >> VM_PKEY_SHIFT;
+}
+
+#endif /*_ASM_X86_PKEYS_H */
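/*
 * Illustrative sketch, not part of the patch: mm_pkey_alloc() hands out the
 * lowest clear bit of the per-mm allocation map and mm_pkey_free() rejects
 * keys that were never allocated. A hypothetical helper reporting how many
 * keys remain free can be layered on the same map (hweight16() comes from
 * <linux/bitops.h>):
 */
static inline int mm_free_pkey_count(struct mm_struct *mm)
{
	u16 all_pkeys_mask = (1U << arch_max_pkey()) - 1;

	return hweight16(all_pkeys_mask & ~mm_pkey_allocation_map(mm));
}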
diff --git a/arch/x86/include/asm/pkru.h b/arch/x86/include/asm/pkru.h
new file mode 100644
index 000000000000..74f0a2d34ffd
--- /dev/null
+++ b/arch/x86/include/asm/pkru.h
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PKRU_H
+#define _ASM_X86_PKRU_H
+
+#include <asm/cpufeature.h>
+
+#define PKRU_AD_BIT 0x1u
+#define PKRU_WD_BIT 0x2u
+#define PKRU_BITS_PER_PKEY 2
+
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+extern u32 init_pkru_value;
+#define pkru_get_init_value() READ_ONCE(init_pkru_value)
+#else
+#define init_pkru_value 0
+#define pkru_get_init_value() 0
+#endif
+
+static inline bool __pkru_allows_read(u32 pkru, u16 pkey)
+{
+ int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY;
+ return !(pkru & (PKRU_AD_BIT << pkru_pkey_bits));
+}
+
+static inline bool __pkru_allows_write(u32 pkru, u16 pkey)
+{
+ int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY;
+ /*
+ * Access-disable disables writes too so we need to check
+ * both bits here.
+ */
+ return !(pkru & ((PKRU_AD_BIT|PKRU_WD_BIT) << pkru_pkey_bits));
+}
+
+static inline u32 read_pkru(void)
+{
+ if (cpu_feature_enabled(X86_FEATURE_OSPKE))
+ return rdpkru();
+ return 0;
+}
+
+static inline void write_pkru(u32 pkru)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
+ return;
+ /*
+ * WRPKRU is relatively expensive compared to RDPKRU.
+ * Avoid WRPKRU when it would not change the value.
+ */
+ if (pkru != rdpkru())
+ wrpkru(pkru);
+}
+
+static inline void pkru_write_default(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
+ return;
+
+ wrpkru(pkru_get_init_value());
+}
+
+#endif
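/*
 * Illustrative sketch, not part of the patch: each pkey owns two adjacent
 * bits in PKRU (AD = access-disable, WD = write-disable). A hypothetical
 * helper that makes one key read-only while leaving the others untouched
 * can be built on read_pkru()/write_pkru() like this:
 */
static inline void pkru_make_key_readonly(u16 pkey)
{
	u32 pkru = read_pkru();

	pkru &= ~(PKRU_AD_BIT << (pkey * PKRU_BITS_PER_PKEY));
	pkru |=   PKRU_WD_BIT << (pkey * PKRU_BITS_PER_PKEY);

	write_pkru(pkru);
}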
diff --git a/arch/x86/include/asm/platform_sst_audio.h b/arch/x86/include/asm/platform_sst_audio.h
new file mode 100644
index 000000000000..40f92270515b
--- /dev/null
+++ b/arch/x86/include/asm/platform_sst_audio.h
@@ -0,0 +1,136 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * platform_sst_audio.h: sst audio platform data header file
+ *
+ * Copyright (C) 2012-14 Intel Corporation
+ * Author: Jeeja KP <jeeja.kp@intel.com>
+ * Omair Mohammed Abdullah <omair.m.abdullah@intel.com>
+ * Vinod Koul <vinod.koul@intel.com>
+ */
+#ifndef _PLATFORM_SST_AUDIO_H_
+#define _PLATFORM_SST_AUDIO_H_
+
+#define MAX_NUM_STREAMS_MRFLD 25
+#define MAX_NUM_STREAMS MAX_NUM_STREAMS_MRFLD
+
+enum sst_audio_task_id_mrfld {
+ SST_TASK_ID_NONE = 0,
+ SST_TASK_ID_SBA = 1,
+ SST_TASK_ID_MEDIA = 3,
+ SST_TASK_ID_MAX = SST_TASK_ID_MEDIA,
+};
+
+/* Device IDs for Merrifield are Pipe IDs,
+ * ref: DSP spec v0.75 */
+enum sst_audio_device_id_mrfld {
+ /* Output pipeline IDs */
+ PIPE_ID_OUT_START = 0x0,
+ PIPE_CODEC_OUT0 = 0x2,
+ PIPE_CODEC_OUT1 = 0x3,
+ PIPE_SPROT_LOOP_OUT = 0x4,
+ PIPE_MEDIA_LOOP1_OUT = 0x5,
+ PIPE_MEDIA_LOOP2_OUT = 0x6,
+ PIPE_VOIP_OUT = 0xC,
+ PIPE_PCM0_OUT = 0xD,
+ PIPE_PCM1_OUT = 0xE,
+ PIPE_PCM2_OUT = 0xF,
+ PIPE_MEDIA0_OUT = 0x12,
+ PIPE_MEDIA1_OUT = 0x13,
+/* Input Pipeline IDs */
+ PIPE_ID_IN_START = 0x80,
+ PIPE_CODEC_IN0 = 0x82,
+ PIPE_CODEC_IN1 = 0x83,
+ PIPE_SPROT_LOOP_IN = 0x84,
+ PIPE_MEDIA_LOOP1_IN = 0x85,
+ PIPE_MEDIA_LOOP2_IN = 0x86,
+ PIPE_VOIP_IN = 0x8C,
+ PIPE_PCM0_IN = 0x8D,
+ PIPE_PCM1_IN = 0x8E,
+ PIPE_MEDIA0_IN = 0x8F,
+ PIPE_MEDIA1_IN = 0x90,
+ PIPE_MEDIA2_IN = 0x91,
+ PIPE_MEDIA3_IN = 0x9C,
+ PIPE_RSVD = 0xFF,
+};
+
+/* The stream map for each platform consists of an array of the below
+ * stream map structure.
+ */
+struct sst_dev_stream_map {
+ u8 dev_num; /* device id */
+ u8 subdev_num; /* substream */
+ u8 direction;
+ u8 device_id; /* fw id */
+ u8 task_id; /* fw task */
+ u8 status;
+};
+
+struct sst_platform_data {
+ /* Intel software platform id*/
+ struct sst_dev_stream_map *pdev_strm_map;
+ unsigned int strm_map_size;
+};
+
+struct sst_info {
+ u32 iram_start;
+ u32 iram_end;
+ bool iram_use;
+ u32 dram_start;
+ u32 dram_end;
+ bool dram_use;
+ u32 imr_start;
+ u32 imr_end;
+ bool imr_use;
+ u32 mailbox_start;
+ bool use_elf;
+ bool lpe_viewpt_rqd;
+ unsigned int max_streams;
+ u32 dma_max_len;
+ u8 num_probes;
+};
+
+struct sst_lib_dnld_info {
+ unsigned int mod_base;
+ unsigned int mod_end;
+ unsigned int mod_table_offset;
+ unsigned int mod_table_size;
+ bool mod_ddr_dnld;
+};
+
+struct sst_res_info {
+ unsigned int shim_offset;
+ unsigned int shim_size;
+ unsigned int shim_phy_addr;
+ unsigned int ssp0_offset;
+ unsigned int ssp0_size;
+ unsigned int dma0_offset;
+ unsigned int dma0_size;
+ unsigned int dma1_offset;
+ unsigned int dma1_size;
+ unsigned int iram_offset;
+ unsigned int iram_size;
+ unsigned int dram_offset;
+ unsigned int dram_size;
+ unsigned int mbox_offset;
+ unsigned int mbox_size;
+ unsigned int acpi_lpe_res_index;
+ unsigned int acpi_ddr_index;
+ unsigned int acpi_ipc_irq_index;
+};
+
+struct sst_ipc_info {
+ int ipc_offset;
+ unsigned int mbox_recv_off;
+};
+
+struct sst_platform_info {
+ const struct sst_info *probe_data;
+ const struct sst_ipc_info *ipc_info;
+ const struct sst_res_info *res_info;
+ const struct sst_lib_dnld_info *lib_info;
+ const char *platform;
+ bool streams_lost_on_suspend;
+};
+int add_sst_platform_device(void);
+#endif
+
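/*
 * Illustrative sketch, not part of the patch: a board file would describe its
 * PCM devices with an array of sst_dev_stream_map entries and hand it to the
 * SST core through sst_platform_data. The entries below are hypothetical;
 * SNDRV_PCM_STREAM_* come from the ALSA headers.
 */
static struct sst_dev_stream_map example_strm_map[] = {
	/* dev, subdev, direction, fw pipe id, fw task id, status */
	{ 0, 0, SNDRV_PCM_STREAM_PLAYBACK, PIPE_MEDIA1_OUT, SST_TASK_ID_MEDIA, 0 },
	{ 0, 0, SNDRV_PCM_STREAM_CAPTURE,  PIPE_PCM1_IN,    SST_TASK_ID_MEDIA, 0 },
};

static struct sst_platform_data example_pdata = {
	.pdev_strm_map = example_strm_map,
	.strm_map_size = ARRAY_SIZE(example_strm_map),
};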
diff --git a/arch/x86/include/asm/resume-trace.h b/arch/x86/include/asm/pm-trace.h
index 3ff1c2cb1da5..bfa32aa428e5 100644
--- a/arch/x86/include/asm/resume-trace.h
+++ b/arch/x86/include/asm/pm-trace.h
@@ -1,5 +1,6 @@
-#ifndef _ASM_X86_RESUME_TRACE_H
-#define _ASM_X86_RESUME_TRACE_H
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PM_TRACE_H
+#define _ASM_X86_PM_TRACE_H
#include <asm/asm.h>
@@ -14,8 +15,10 @@ do { \
".previous" \
:"=r" (tracedata) \
: "i" (__LINE__), "i" (__FILE__)); \
- generate_resume_trace(tracedata, user); \
+ generate_pm_trace(tracedata, user); \
} \
} while (0)
-#endif /* _ASM_X86_RESUME_TRACE_H */
+#define TRACE_SUSPEND(user) TRACE_RESUME(user)
+
+#endif /* _ASM_X86_PM_TRACE_H */
diff --git a/arch/x86/include/asm/poll.h b/arch/x86/include/asm/poll.h
deleted file mode 100644
index c98509d3149e..000000000000
--- a/arch/x86/include/asm/poll.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/poll.h>
diff --git a/arch/x86/include/asm/posix_types.h b/arch/x86/include/asm/posix_types.h
index bb7133dc155d..374336e217ff 100644
--- a/arch/x86/include/asm/posix_types.h
+++ b/arch/x86/include/asm/posix_types.h
@@ -1,13 +1,6 @@
-#ifdef __KERNEL__
+/* SPDX-License-Identifier: GPL-2.0 */
# ifdef CONFIG_X86_32
-# include "posix_types_32.h"
+# include <asm/posix_types_32.h>
# else
-# include "posix_types_64.h"
+# include <asm/posix_types_64.h>
# endif
-#else
-# ifdef __i386__
-# include "posix_types_32.h"
-# else
-# include "posix_types_64.h"
-# endif
-#endif
diff --git a/arch/x86/include/asm/posix_types_32.h b/arch/x86/include/asm/posix_types_32.h
deleted file mode 100644
index f7d9adf82e53..000000000000
--- a/arch/x86/include/asm/posix_types_32.h
+++ /dev/null
@@ -1,85 +0,0 @@
-#ifndef _ASM_X86_POSIX_TYPES_32_H
-#define _ASM_X86_POSIX_TYPES_32_H
-
-/*
- * This file is generally used by user-level software, so you need to
- * be a little careful about namespace pollution etc. Also, we cannot
- * assume GCC is being used.
- */
-
-typedef unsigned long __kernel_ino_t;
-typedef unsigned short __kernel_mode_t;
-typedef unsigned short __kernel_nlink_t;
-typedef long __kernel_off_t;
-typedef int __kernel_pid_t;
-typedef unsigned short __kernel_ipc_pid_t;
-typedef unsigned short __kernel_uid_t;
-typedef unsigned short __kernel_gid_t;
-typedef unsigned int __kernel_size_t;
-typedef int __kernel_ssize_t;
-typedef int __kernel_ptrdiff_t;
-typedef long __kernel_time_t;
-typedef long __kernel_suseconds_t;
-typedef long __kernel_clock_t;
-typedef int __kernel_timer_t;
-typedef int __kernel_clockid_t;
-typedef int __kernel_daddr_t;
-typedef char * __kernel_caddr_t;
-typedef unsigned short __kernel_uid16_t;
-typedef unsigned short __kernel_gid16_t;
-typedef unsigned int __kernel_uid32_t;
-typedef unsigned int __kernel_gid32_t;
-
-typedef unsigned short __kernel_old_uid_t;
-typedef unsigned short __kernel_old_gid_t;
-typedef unsigned short __kernel_old_dev_t;
-
-#ifdef __GNUC__
-typedef long long __kernel_loff_t;
-#endif
-
-typedef struct {
- int val[2];
-} __kernel_fsid_t;
-
-#if defined(__KERNEL__)
-
-#undef __FD_SET
-#define __FD_SET(fd,fdsetp) \
- asm volatile("btsl %1,%0": \
- "+m" (*(__kernel_fd_set *)(fdsetp)) \
- : "r" ((int)(fd)))
-
-#undef __FD_CLR
-#define __FD_CLR(fd,fdsetp) \
- asm volatile("btrl %1,%0": \
- "+m" (*(__kernel_fd_set *)(fdsetp)) \
- : "r" ((int) (fd)))
-
-#undef __FD_ISSET
-#define __FD_ISSET(fd,fdsetp) \
- (__extension__ \
- ({ \
- unsigned char __result; \
- asm volatile("btl %1,%2 ; setb %0" \
- : "=q" (__result) \
- : "r" ((int)(fd)), \
- "m" (*(__kernel_fd_set *)(fdsetp))); \
- __result; \
-}))
-
-#undef __FD_ZERO
-#define __FD_ZERO(fdsetp) \
-do { \
- int __d0, __d1; \
- asm volatile("cld ; rep ; stosl" \
- : "=m" (*(__kernel_fd_set *)(fdsetp)), \
- "=&c" (__d0), "=&D" (__d1) \
- : "a" (0), "1" (__FDSET_LONGS), \
- "2" ((__kernel_fd_set *)(fdsetp)) \
- : "memory"); \
-} while (0)
-
-#endif /* defined(__KERNEL__) */
-
-#endif /* _ASM_X86_POSIX_TYPES_32_H */
diff --git a/arch/x86/include/asm/posix_types_64.h b/arch/x86/include/asm/posix_types_64.h
deleted file mode 100644
index eb8d2d92b63e..000000000000
--- a/arch/x86/include/asm/posix_types_64.h
+++ /dev/null
@@ -1,119 +0,0 @@
-#ifndef _ASM_X86_POSIX_TYPES_64_H
-#define _ASM_X86_POSIX_TYPES_64_H
-
-/*
- * This file is generally used by user-level software, so you need to
- * be a little careful about namespace pollution etc. Also, we cannot
- * assume GCC is being used.
- */
-
-typedef unsigned long __kernel_ino_t;
-typedef unsigned int __kernel_mode_t;
-typedef unsigned long __kernel_nlink_t;
-typedef long __kernel_off_t;
-typedef int __kernel_pid_t;
-typedef int __kernel_ipc_pid_t;
-typedef unsigned int __kernel_uid_t;
-typedef unsigned int __kernel_gid_t;
-typedef unsigned long __kernel_size_t;
-typedef long __kernel_ssize_t;
-typedef long __kernel_ptrdiff_t;
-typedef long __kernel_time_t;
-typedef long __kernel_suseconds_t;
-typedef long __kernel_clock_t;
-typedef int __kernel_timer_t;
-typedef int __kernel_clockid_t;
-typedef int __kernel_daddr_t;
-typedef char * __kernel_caddr_t;
-typedef unsigned short __kernel_uid16_t;
-typedef unsigned short __kernel_gid16_t;
-
-#ifdef __GNUC__
-typedef long long __kernel_loff_t;
-#endif
-
-typedef struct {
- int val[2];
-} __kernel_fsid_t;
-
-typedef unsigned short __kernel_old_uid_t;
-typedef unsigned short __kernel_old_gid_t;
-typedef __kernel_uid_t __kernel_uid32_t;
-typedef __kernel_gid_t __kernel_gid32_t;
-
-typedef unsigned long __kernel_old_dev_t;
-
-#ifdef __KERNEL__
-
-#undef __FD_SET
-static inline void __FD_SET(unsigned long fd, __kernel_fd_set *fdsetp)
-{
- unsigned long _tmp = fd / __NFDBITS;
- unsigned long _rem = fd % __NFDBITS;
- fdsetp->fds_bits[_tmp] |= (1UL<<_rem);
-}
-
-#undef __FD_CLR
-static inline void __FD_CLR(unsigned long fd, __kernel_fd_set *fdsetp)
-{
- unsigned long _tmp = fd / __NFDBITS;
- unsigned long _rem = fd % __NFDBITS;
- fdsetp->fds_bits[_tmp] &= ~(1UL<<_rem);
-}
-
-#undef __FD_ISSET
-static inline int __FD_ISSET(unsigned long fd, __const__ __kernel_fd_set *p)
-{
- unsigned long _tmp = fd / __NFDBITS;
- unsigned long _rem = fd % __NFDBITS;
- return (p->fds_bits[_tmp] & (1UL<<_rem)) != 0;
-}
-
-/*
- * This will unroll the loop for the normal constant cases (8 or 32 longs,
- * for 256 and 1024-bit fd_sets respectively)
- */
-#undef __FD_ZERO
-static inline void __FD_ZERO(__kernel_fd_set *p)
-{
- unsigned long *tmp = p->fds_bits;
- int i;
-
- if (__builtin_constant_p(__FDSET_LONGS)) {
- switch (__FDSET_LONGS) {
- case 32:
- tmp[ 0] = 0; tmp[ 1] = 0; tmp[ 2] = 0; tmp[ 3] = 0;
- tmp[ 4] = 0; tmp[ 5] = 0; tmp[ 6] = 0; tmp[ 7] = 0;
- tmp[ 8] = 0; tmp[ 9] = 0; tmp[10] = 0; tmp[11] = 0;
- tmp[12] = 0; tmp[13] = 0; tmp[14] = 0; tmp[15] = 0;
- tmp[16] = 0; tmp[17] = 0; tmp[18] = 0; tmp[19] = 0;
- tmp[20] = 0; tmp[21] = 0; tmp[22] = 0; tmp[23] = 0;
- tmp[24] = 0; tmp[25] = 0; tmp[26] = 0; tmp[27] = 0;
- tmp[28] = 0; tmp[29] = 0; tmp[30] = 0; tmp[31] = 0;
- return;
- case 16:
- tmp[ 0] = 0; tmp[ 1] = 0; tmp[ 2] = 0; tmp[ 3] = 0;
- tmp[ 4] = 0; tmp[ 5] = 0; tmp[ 6] = 0; tmp[ 7] = 0;
- tmp[ 8] = 0; tmp[ 9] = 0; tmp[10] = 0; tmp[11] = 0;
- tmp[12] = 0; tmp[13] = 0; tmp[14] = 0; tmp[15] = 0;
- return;
- case 8:
- tmp[ 0] = 0; tmp[ 1] = 0; tmp[ 2] = 0; tmp[ 3] = 0;
- tmp[ 4] = 0; tmp[ 5] = 0; tmp[ 6] = 0; tmp[ 7] = 0;
- return;
- case 4:
- tmp[ 0] = 0; tmp[ 1] = 0; tmp[ 2] = 0; tmp[ 3] = 0;
- return;
- }
- }
- i = __FDSET_LONGS;
- while (i) {
- i--;
- *tmp = 0;
- tmp++;
- }
-}
-
-#endif /* defined(__KERNEL__) */
-
-#endif /* _ASM_X86_POSIX_TYPES_64_H */
diff --git a/arch/x86/include/asm/posted_intr.h b/arch/x86/include/asm/posted_intr.h
new file mode 100644
index 000000000000..a5f761fbf45b
--- /dev/null
+++ b/arch/x86/include/asm/posted_intr.h
@@ -0,0 +1,187 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _X86_POSTED_INTR_H
+#define _X86_POSTED_INTR_H
+
+#include <asm/cmpxchg.h>
+#include <asm/rwonce.h>
+#include <asm/irq_vectors.h>
+
+#include <linux/bitmap.h>
+
+#define POSTED_INTR_ON 0
+#define POSTED_INTR_SN 1
+
+#define PID_TABLE_ENTRY_VALID 1
+
+#define NR_PIR_VECTORS 256
+#define NR_PIR_WORDS (NR_PIR_VECTORS / BITS_PER_LONG)
+
+/* Posted-Interrupt Descriptor */
+struct pi_desc {
+ unsigned long pir[NR_PIR_WORDS]; /* Posted interrupt requested */
+ union {
+ struct {
+ u16 notifications; /* Suppress and outstanding bits */
+ u8 nv;
+ u8 rsvd_2;
+ u32 ndst;
+ };
+ u64 control;
+ };
+ u32 rsvd[6];
+} __aligned(64);
+
+/*
+ * De-multiplexing posted interrupts is on the performance path, the code
+ * below is written to optimize the cache performance based on the following
+ * considerations:
+ * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently
+ * accessed by both CPU and IOMMU.
+ * 2.During software processing of posted interrupts, the CPU needs to do
+ * natural width read and xchg for checking and clearing posted interrupt
+ * request (PIR), a 256 bit field within the PID.
+ * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache
+ * line when posting interrupts and setting control bits.
+ * 4.The CPU can access the cache line a magnitude faster than the IOMMU.
+ * 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID
+ * cache line. The cache line states after each operation are as follows,
+ *   assuming a 64-bit kernel:
+ *   CPU                 IOMMU                   PID Cache line state
+ *   ---------------------------------------------------------------
+ *   read64                                      exclusive
+ *   lock xchg64                                 modified
+ *                       post/atomic swap        invalid
+ *   ---------------------------------------------------------------
+ *
+ * To reduce L1 data cache miss, it is important to avoid contention with
+ * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used
+ * when processing posted interrupts in software, e.g. to dispatch interrupt
+ * handlers for posted MSIs, or to move interrupts from the PIR to the vIRR
+ * in KVM.
+ *
+ * In addition, the code is trying to keep the cache line state consistent
+ * as much as possible. e.g. when making a copy and clearing the PIR
+ * (assuming non-zero PIR bits are present in the entire PIR), it does:
+ * read, read, read, read, xchg, xchg, xchg, xchg
+ * instead of:
+ * read, xchg, read, xchg, read, xchg, read, xchg
+ */
+static __always_inline bool pi_harvest_pir(unsigned long *pir,
+ unsigned long *pir_vals)
+{
+ unsigned long pending = 0;
+ int i;
+
+ for (i = 0; i < NR_PIR_WORDS; i++) {
+ pir_vals[i] = READ_ONCE(pir[i]);
+ pending |= pir_vals[i];
+ }
+
+ if (!pending)
+ return false;
+
+ for (i = 0; i < NR_PIR_WORDS; i++) {
+ if (!pir_vals[i])
+ continue;
+
+ pir_vals[i] = arch_xchg(&pir[i], 0);
+ }
+
+ return true;
+}
+
+static inline bool pi_test_and_set_on(struct pi_desc *pi_desc)
+{
+ return test_and_set_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control);
+}
+
+static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc)
+{
+ return test_and_clear_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control);
+}
+
+static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc)
+{
+ return test_and_clear_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control);
+}
+
+static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
+{
+ return test_and_set_bit(vector, pi_desc->pir);
+}
+
+static inline bool pi_is_pir_empty(struct pi_desc *pi_desc)
+{
+ return bitmap_empty(pi_desc->pir, NR_VECTORS);
+}
+
+static inline void pi_set_sn(struct pi_desc *pi_desc)
+{
+ set_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control);
+}
+
+static inline void pi_set_on(struct pi_desc *pi_desc)
+{
+ set_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control);
+}
+
+static inline void pi_clear_on(struct pi_desc *pi_desc)
+{
+ clear_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control);
+}
+
+static inline void pi_clear_sn(struct pi_desc *pi_desc)
+{
+ clear_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control);
+}
+
+static inline bool pi_test_on(struct pi_desc *pi_desc)
+{
+ return test_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control);
+}
+
+static inline bool pi_test_sn(struct pi_desc *pi_desc)
+{
+ return test_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control);
+}
+
+static inline bool pi_test_pir(int vector, struct pi_desc *pi_desc)
+{
+ return test_bit(vector, (unsigned long *)pi_desc->pir);
+}
+
+/* Non-atomic helpers */
+static inline void __pi_set_sn(struct pi_desc *pi_desc)
+{
+ pi_desc->notifications |= BIT(POSTED_INTR_SN);
+}
+
+static inline void __pi_clear_sn(struct pi_desc *pi_desc)
+{
+ pi_desc->notifications &= ~BIT(POSTED_INTR_SN);
+}
+
+#ifdef CONFIG_X86_POSTED_MSI
+/*
+ * Not all external vectors are subject to interrupt remapping, e.g. IOMMU's
+ * own interrupts. Here we do not distinguish them since those vector bits in
+ * PIR will always be zero.
+ */
+static inline bool pi_pending_this_cpu(unsigned int vector)
+{
+ struct pi_desc *pid = this_cpu_ptr(&posted_msi_pi_desc);
+
+ if (WARN_ON_ONCE(vector > NR_VECTORS || vector < FIRST_EXTERNAL_VECTOR))
+ return false;
+
+ return test_bit(vector, pid->pir);
+}
+
+extern void intel_posted_msi_init(void);
+#else
+static inline bool pi_pending_this_cpu(unsigned int vector) { return false; }
+
+static inline void intel_posted_msi_init(void) {};
+#endif /* X86_POSTED_MSI */
+
+#endif /* _X86_POSTED_INTR_H */
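/*
 * Illustrative sketch, not part of the patch: a consumer of pi_harvest_pir()
 * snapshots and clears the PIR in one pass and then walks the local copy, so
 * the cache line shared with the IOMMU is touched as briefly as possible.
 * handle_one_vector() is a hypothetical callback.
 */
static inline void example_dispatch_pir(struct pi_desc *pid,
					void (*handle_one_vector)(unsigned int))
{
	unsigned long pir_copy[NR_PIR_WORDS];
	unsigned int vec;

	if (!pi_harvest_pir(pid->pir, pir_copy))
		return;		/* nothing pending */

	for_each_set_bit(vec, pir_copy, NR_PIR_VECTORS)
		handle_one_vector(vec);
}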
diff --git a/arch/x86/include/asm/prctl.h b/arch/x86/include/asm/prctl.h
deleted file mode 100644
index 3ac5032fae09..000000000000
--- a/arch/x86/include/asm/prctl.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef _ASM_X86_PRCTL_H
-#define _ASM_X86_PRCTL_H
-
-#define ARCH_SET_GS 0x1001
-#define ARCH_SET_FS 0x1002
-#define ARCH_GET_FS 0x1003
-#define ARCH_GET_GS 0x1004
-
-#endif /* _ASM_X86_PRCTL_H */
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
new file mode 100644
index 000000000000..578441db09f0
--- /dev/null
+++ b/arch/x86/include/asm/preempt.h
@@ -0,0 +1,151 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_PREEMPT_H
+#define __ASM_PREEMPT_H
+
+#include <asm/rmwcc.h>
+#include <asm/percpu.h>
+
+#include <linux/static_call_types.h>
+
+DECLARE_PER_CPU_CACHE_HOT(int, __preempt_count);
+
+/* We use the MSB mostly because it's available */
+#define PREEMPT_NEED_RESCHED 0x80000000
+
+/*
+ * We use the PREEMPT_NEED_RESCHED bit as an inverted NEED_RESCHED such
+ * that a decrement hitting 0 means we can and should reschedule.
+ */
+#define PREEMPT_ENABLED (0 + PREEMPT_NEED_RESCHED)
+
+/*
+ * We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users
+ * that think a non-zero value indicates we cannot preempt.
+ */
+static __always_inline int preempt_count(void)
+{
+ return raw_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED;
+}
+
+static __always_inline void preempt_count_set(int pc)
+{
+ int old, new;
+
+ old = raw_cpu_read_4(__preempt_count);
+ do {
+ new = (old & PREEMPT_NEED_RESCHED) |
+ (pc & ~PREEMPT_NEED_RESCHED);
+ } while (!raw_cpu_try_cmpxchg_4(__preempt_count, &old, new));
+}
+
+/*
+ * must be macros to avoid header recursion hell
+ */
+#define init_task_preempt_count(p) do { } while (0)
+
+#define init_idle_preempt_count(p, cpu) do { \
+ per_cpu(__preempt_count, (cpu)) = PREEMPT_DISABLED; \
+} while (0)
+
+/*
+ * We fold the NEED_RESCHED bit into the preempt count such that
+ * preempt_enable() can decrement and test for needing to reschedule with a
+ * single instruction.
+ *
+ * We invert the actual bit, so that when the decrement hits 0 we know we both
+ * need to resched (the bit is cleared) and can resched (no preempt count).
+ */
+
+static __always_inline void set_preempt_need_resched(void)
+{
+ raw_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED);
+}
+
+static __always_inline void clear_preempt_need_resched(void)
+{
+ raw_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED);
+}
+
+static __always_inline bool test_preempt_need_resched(void)
+{
+ return !(raw_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED);
+}
+
+/*
+ * The various preempt_count add/sub methods
+ */
+
+static __always_inline void __preempt_count_add(int val)
+{
+ raw_cpu_add_4(__preempt_count, val);
+}
+
+static __always_inline void __preempt_count_sub(int val)
+{
+ raw_cpu_add_4(__preempt_count, -val);
+}
+
+/*
+ * Because we keep PREEMPT_NEED_RESCHED set when we do _not_ need to reschedule
+ * a decrement which hits zero means we have no preempt_count and should
+ * reschedule.
+ */
+static __always_inline bool __preempt_count_dec_and_test(void)
+{
+ return GEN_UNARY_RMWcc("decl", __my_cpu_var(__preempt_count), e,
+ __percpu_arg([var]));
+}
+
+/*
+ * Returns true when we need to resched and can (barring IRQ state).
+ */
+static __always_inline bool should_resched(int preempt_offset)
+{
+ return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
+}
+
+#ifdef CONFIG_PREEMPTION
+
+extern asmlinkage void preempt_schedule(void);
+extern asmlinkage void preempt_schedule_thunk(void);
+
+#define preempt_schedule_dynamic_enabled preempt_schedule_thunk
+#define preempt_schedule_dynamic_disabled NULL
+
+extern asmlinkage void preempt_schedule_notrace(void);
+extern asmlinkage void preempt_schedule_notrace_thunk(void);
+
+#define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace_thunk
+#define preempt_schedule_notrace_dynamic_disabled NULL
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+
+DECLARE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled);
+
+#define __preempt_schedule() \
+do { \
+ __STATIC_CALL_MOD_ADDRESSABLE(preempt_schedule); \
+ asm volatile ("call " STATIC_CALL_TRAMP_STR(preempt_schedule) : ASM_CALL_CONSTRAINT); \
+} while (0)
+
+DECLARE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled);
+
+#define __preempt_schedule_notrace() \
+do { \
+ __STATIC_CALL_MOD_ADDRESSABLE(preempt_schedule_notrace); \
+ asm volatile ("call " STATIC_CALL_TRAMP_STR(preempt_schedule_notrace) : ASM_CALL_CONSTRAINT); \
+} while (0)
+
+#else /* PREEMPT_DYNAMIC */
+
+#define __preempt_schedule() \
+ asm volatile ("call preempt_schedule_thunk" : ASM_CALL_CONSTRAINT);
+
+#define __preempt_schedule_notrace() \
+ asm volatile ("call preempt_schedule_notrace_thunk" : ASM_CALL_CONSTRAINT);
+
+#endif /* PREEMPT_DYNAMIC */
+
+#endif /* PREEMPTION */
+
+#endif /* __ASM_PREEMPT_H */
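/*
 * Illustrative sketch, not part of the patch: with NEED_RESCHED folded into
 * the per-CPU count as an inverted bit, the generic preempt_enable() boils
 * down to a single decrement-and-test. Under CONFIG_PREEMPTION the open-coded
 * equivalent of what the helpers above provide would look roughly like this:
 */
static __always_inline void example_preempt_enable(void)
{
	barrier();
	if (__preempt_count_dec_and_test())	/* hit 0: no count left and resched needed */
		__preempt_schedule();
}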
diff --git a/arch/x86/include/asm/probe_roms.h b/arch/x86/include/asm/probe_roms.h
new file mode 100644
index 000000000000..1c7f3815bbd6
--- /dev/null
+++ b/arch/x86/include/asm/probe_roms.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _PROBE_ROMS_H_
+#define _PROBE_ROMS_H_
+struct pci_dev;
+
+extern void __iomem *pci_map_biosrom(struct pci_dev *pdev);
+extern void pci_unmap_biosrom(void __iomem *rom);
+extern size_t pci_biosrom_size(struct pci_dev *pdev);
+#endif
diff --git a/arch/x86/include/asm/processor-cyrix.h b/arch/x86/include/asm/processor-cyrix.h
index 1198f2a0e42c..efe3e46e454b 100644
--- a/arch/x86/include/asm/processor-cyrix.h
+++ b/arch/x86/include/asm/processor-cyrix.h
@@ -1,38 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* NSC/Cyrix CPU indexed register access. Must be inlined instead of
* macros to ensure correct access ordering
* Access order is always 0x22 (=offset), 0x23 (=value)
- *
- * When using the old macros a line like
- * setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88);
- * gets expanded to:
- * do {
- * outb((CX86_CCR2), 0x22);
- * outb((({
- * outb((CX86_CCR2), 0x22);
- * inb(0x23);
- * }) | 0x88), 0x23);
- * } while (0);
- *
- * which in fact violates the access order (= 0x22, 0x22, 0x23, 0x23).
*/
+#include <asm/pc-conf-reg.h>
+
static inline u8 getCx86(u8 reg)
{
- outb(reg, 0x22);
- return inb(0x23);
+ return pc_conf_get(reg);
}
static inline void setCx86(u8 reg, u8 data)
{
- outb(reg, 0x22);
- outb(data, 0x23);
+ pc_conf_set(reg, data);
}
-
-#define getCx86_old(reg) ({ outb((reg), 0x22); inb(0x23); })
-
-#define setCx86_old(reg, data) do { \
- outb((reg), 0x22); \
- outb((data), 0x23); \
-} while (0)
-
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 7a3e836eb2a9..e5f204b9b33d 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -1,100 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PROCESSOR_FLAGS_H
#define _ASM_X86_PROCESSOR_FLAGS_H
-/* Various flags defined: can be included from assembler. */
-/*
- * EFLAGS bits
- */
-#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */
-#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */
-#define X86_EFLAGS_AF 0x00000010 /* Auxillary carry Flag */
-#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */
-#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */
-#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */
-#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */
-#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */
-#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */
-#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */
-#define X86_EFLAGS_NT 0x00004000 /* Nested Task */
-#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */
-#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */
-#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */
-#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */
-#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
-#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */
+#include <uapi/asm/processor-flags.h>
+#include <linux/mem_encrypt.h>
-/*
- * Basic CPU control in CR0
- */
-#define X86_CR0_PE 0x00000001 /* Protection Enable */
-#define X86_CR0_MP 0x00000002 /* Monitor Coprocessor */
-#define X86_CR0_EM 0x00000004 /* Emulation */
-#define X86_CR0_TS 0x00000008 /* Task Switched */
-#define X86_CR0_ET 0x00000010 /* Extension Type */
-#define X86_CR0_NE 0x00000020 /* Numeric Error */
-#define X86_CR0_WP 0x00010000 /* Write Protect */
-#define X86_CR0_AM 0x00040000 /* Alignment Mask */
-#define X86_CR0_NW 0x20000000 /* Not Write-through */
-#define X86_CR0_CD 0x40000000 /* Cache Disable */
-#define X86_CR0_PG 0x80000000 /* Paging */
-
-/*
- * Paging options in CR3
- */
-#define X86_CR3_PWT 0x00000008 /* Page Write Through */
-#define X86_CR3_PCD 0x00000010 /* Page Cache Disable */
-
-/*
- * Intel CPU features in CR4
- */
-#define X86_CR4_VME 0x00000001 /* enable vm86 extensions */
-#define X86_CR4_PVI 0x00000002 /* virtual interrupts flag enable */
-#define X86_CR4_TSD 0x00000004 /* disable time stamp at ipl 3 */
-#define X86_CR4_DE 0x00000008 /* enable debugging extensions */
-#define X86_CR4_PSE 0x00000010 /* enable page size extensions */
-#define X86_CR4_PAE 0x00000020 /* enable physical address extensions */
-#define X86_CR4_MCE 0x00000040 /* Machine check enable */
-#define X86_CR4_PGE 0x00000080 /* enable global pages */
-#define X86_CR4_PCE 0x00000100 /* enable performance counters at ipl 3 */
-#define X86_CR4_OSFXSR 0x00000200 /* enable fast FPU save and restore */
-#define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */
-#define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */
-#define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */
-
-/*
- * x86-64 Task Priority Register, CR8
- */
-#define X86_CR8_TPR 0x0000000F /* task priority register */
+#ifdef CONFIG_VM86
+#define X86_VM_MASK X86_EFLAGS_VM
+#else
+#define X86_VM_MASK 0 /* No VM86 support */
+#endif
/*
- * AMD and Transmeta use MSRs for configuration; see <asm/msr-index.h>
+ * CR3's layout varies depending on several things.
+ *
+ * If CR4.PCIDE is set (64-bit only), then CR3[11:0] is the address space ID.
+ * If PAE is enabled, then CR3[11:5] is part of the PDPT address
+ * (i.e. it's 32-byte aligned, not page-aligned) and CR3[4:0] is ignored.
+ * Otherwise (non-PAE, non-PCID), CR3[3] is PWT, CR3[4] is PCD, and
+ * CR3[2:0] and CR3[11:5] are ignored.
+ *
+ * In all cases, Linux puts zeros in the low ignored bits and in PWT and PCD.
+ *
+ * CR3[63] is always read as zero. If CR4.PCIDE is set, then CR3[63] may be
+ * written as 1 to prevent the write to CR3 from flushing the TLB.
+ *
+ * On systems with SME, one bit (in a variable position!) is stolen to indicate
+ * that the top-level paging structure is encrypted.
+ *
+ * On systems with LAM, bits 61 and 62 are used to indicate LAM mode.
+ *
+ * All of the remaining bits indicate the physical address of the top-level
+ * paging structure.
+ *
+ * CR3_ADDR_MASK is the mask used by read_cr3_pa().
*/
+#ifdef CONFIG_X86_64
+/* Mask off the address space ID and SME encryption bits. */
+#define CR3_ADDR_MASK __sme_clr(PHYSICAL_PAGE_MASK)
+#define CR3_PCID_MASK 0xFFFull
+#define CR3_NOFLUSH BIT_ULL(63)
+#else
/*
- * NSC/Cyrix CPU configuration register indexes
+ * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
+ * a tiny bit of code size by setting all the bits.
*/
-#define CX86_PCR0 0x20
-#define CX86_GCR 0xb8
-#define CX86_CCR0 0xc0
-#define CX86_CCR1 0xc1
-#define CX86_CCR2 0xc2
-#define CX86_CCR3 0xc3
-#define CX86_CCR4 0xe8
-#define CX86_CCR5 0xe9
-#define CX86_CCR6 0xea
-#define CX86_CCR7 0xeb
-#define CX86_PCR1 0xf0
-#define CX86_DIR0 0xfe
-#define CX86_DIR1 0xff
-#define CX86_ARR_BASE 0xc4
-#define CX86_RCR_BASE 0xdc
-
-#ifdef __KERNEL__
-#ifdef CONFIG_VM86
-#define X86_VM_MASK X86_EFLAGS_VM
-#else
-#define X86_VM_MASK 0 /* No VM86 support */
+#define CR3_ADDR_MASK 0xFFFFFFFFull
+#define CR3_PCID_MASK 0ull
+#define CR3_NOFLUSH 0
#endif
+
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
+# define X86_CR3_PTI_PCID_USER_BIT 11
#endif
#endif /* _ASM_X86_PROCESSOR_FLAGS_H */
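/*
 * Illustrative sketch, not part of the patch: how the masks above combine
 * into a CR3 value. build_cr3_example() is hypothetical (the real helpers
 * live in asm/tlbflush.h), but it follows the layout documented in the
 * comment above: physical address plus SME C-bit, optional PCID, optional
 * no-flush bit. read_cr3_pa() later recovers the address with CR3_ADDR_MASK.
 */
static inline unsigned long build_cr3_example(pgd_t *pgd, u16 pcid, bool noflush)
{
	unsigned long cr3 = __sme_pa(pgd);

	cr3 |= pcid & CR3_PCID_MASK;
	if (noflush)
		cr3 |= CR3_NOFLUSH;

	return cr3;
}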
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index c7768269b1cf..a24c7805acdb 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PROCESSOR_H
#define _ASM_X86_PROCESSOR_H
@@ -6,113 +7,187 @@
/* Forward declaration, a strange C thing */
struct task_struct;
struct mm_struct;
+struct io_bitmap;
+struct vm86;
-#include <asm/vm86.h>
#include <asm/math_emu.h>
#include <asm/segment.h>
#include <asm/types.h>
-#include <asm/sigcontext.h>
+#include <uapi/asm/sigcontext.h>
#include <asm/current.h>
-#include <asm/cpufeature.h>
-#include <asm/system.h>
+#include <asm/cpufeatures.h>
+#include <asm/cpuid/api.h>
#include <asm/page.h>
#include <asm/pgtable_types.h>
#include <asm/percpu.h>
-#include <asm/msr.h>
#include <asm/desc_defs.h>
#include <asm/nops.h>
-#include <asm/ds.h>
+#include <asm/special_insns.h>
+#include <asm/fpu/types.h>
+#include <asm/unwind_hints.h>
+#include <asm/vmxfeatures.h>
+#include <asm/vdso/processor.h>
+#include <asm/shstk.h>
#include <linux/personality.h>
-#include <linux/cpumask.h>
#include <linux/cache.h>
#include <linux/threads.h>
-#include <linux/init.h>
+#include <linux/math64.h>
+#include <linux/err.h>
+#include <linux/irqflags.h>
+#include <linux/mem_encrypt.h>
/*
- * Default implementation of macro that returns current
- * instruction pointer ("program counter").
+ * We handle most unaligned accesses in hardware. On the other hand
+ * unaligned DMA can be quite expensive on some Nehalem processors.
+ *
+ * Based on this we disable the IP header alignment in network drivers.
*/
-static inline void *current_text_addr(void)
-{
- void *pc;
+#define NET_IP_ALIGN 0
- asm volatile("mov $1f, %0; 1:":"=r" (pc));
-
- return pc;
-}
+#define HBP_NUM 4
+/*
+ * These alignment constraints are for performance in the vSMP case,
+ * but in the task_struct case we must also meet hardware imposed
+ * alignment requirements of the FPU state:
+ */
#ifdef CONFIG_X86_VSMP
# define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
# define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
#else
-# define ARCH_MIN_TASKALIGN 16
+# define ARCH_MIN_TASKALIGN __alignof__(union fpregs_state)
# define ARCH_MIN_MMSTRUCT_ALIGN 0
#endif
+extern u16 __read_mostly tlb_lli_4k;
+extern u16 __read_mostly tlb_lli_2m;
+extern u16 __read_mostly tlb_lli_4m;
+extern u16 __read_mostly tlb_lld_4k;
+extern u16 __read_mostly tlb_lld_2m;
+extern u16 __read_mostly tlb_lld_4m;
+extern u16 __read_mostly tlb_lld_1g;
+
/*
- * CPU type and hardware bug flags. Kept separately for each CPU.
- * Members of this structure are referenced in head.S, so think twice
- * before touching them. [mj]
+ * CPU type and hardware bug flags. Kept separately for each CPU.
*/
+struct cpuinfo_topology {
+ // Real APIC ID read from the local APIC
+ u32 apicid;
+ // The initial APIC ID provided by CPUID
+ u32 initial_apicid;
+
+ // Physical package ID
+ u32 pkg_id;
+
+ // Physical die ID on AMD, Relative on Intel
+ u32 die_id;
+
+ // Compute unit ID - AMD specific
+ u32 cu_id;
+
+ // Core ID relative to the package
+ u32 core_id;
+
+ // Logical ID mappings
+ u32 logical_pkg_id;
+ u32 logical_die_id;
+ u32 logical_core_id;
+
+ // AMD Node ID and Nodes per Package info
+ u32 amd_node_id;
+
+ // Cache level topology IDs
+ u32 llc_id;
+ u32 l2c_id;
+
+ // Hardware defined CPU-type
+ union {
+ u32 cpu_type;
+ struct {
+ // CPUID.1A.EAX[23-0]
+ u32 intel_native_model_id :24;
+ // CPUID.1A.EAX[31-24]
+ u32 intel_type :8;
+ };
+ struct {
+ // CPUID 0x80000026.EBX
+ u32 amd_num_processors :16,
+ amd_power_eff_ranking :8,
+ amd_native_model_id :4,
+ amd_type :4;
+ };
+ };
+};
+
struct cpuinfo_x86 {
- __u8 x86; /* CPU family */
- __u8 x86_vendor; /* CPU vendor */
- __u8 x86_model;
- __u8 x86_mask;
-#ifdef CONFIG_X86_32
- char wp_works_ok; /* It doesn't on 386's */
-
- /* Problems on some 486Dx4's and old 386's: */
- char hlt_works_ok;
- char hard_math;
- char rfu;
- char fdiv_bug;
- char f00f_bug;
- char coma_bug;
- char pad0;
-#else
+ union {
+ /*
+ * The particular ordering (low-to-high) of (vendor,
+ * family, model) is done in case range of models, like
+ * it is usually done on AMD, need to be compared.
+ */
+ struct {
+ __u8 x86_model;
+ /* CPU family */
+ __u8 x86;
+ /* CPU vendor */
+ __u8 x86_vendor;
+ __u8 x86_reserved;
+ };
+ /* combined vendor, family, model */
+ __u32 x86_vfm;
+ };
+ __u8 x86_stepping;
+#ifdef CONFIG_X86_64
/* Number of 4K pages in DTLB/ITLB combined(in pages): */
int x86_tlbsize;
#endif
+#ifdef CONFIG_X86_VMX_FEATURE_NAMES
+ __u32 vmx_capability[NVMXINTS];
+#endif
__u8 x86_virt_bits;
__u8 x86_phys_bits;
- /* CPUID returned core id bits: */
- __u8 x86_coreid_bits;
/* Max extended CPUID function supported: */
__u32 extended_cpuid_level;
/* Maximum supported CPUID level, -1=no CPUID: */
int cpuid_level;
- __u32 x86_capability[NCAPINTS];
+ /*
+ * Align to size of unsigned long because the x86_capability array
+ * is passed to bitops which require the alignment. Use unnamed
+ * union to enforce the array is aligned to size of unsigned long.
+ */
+ union {
+ __u32 x86_capability[NCAPINTS + NBUGINTS];
+ unsigned long x86_capability_alignment;
+ };
char x86_vendor_id[16];
char x86_model_id[64];
+ struct cpuinfo_topology topo;
/* in KB - valid for CPUS which support this call: */
- int x86_cache_size;
+ unsigned int x86_cache_size;
int x86_cache_alignment; /* In bytes */
+ /* Cache QoS architectural values, valid only on the BSP: */
+ int x86_cache_max_rmid; /* max index */
+ int x86_cache_occ_scale; /* scale to bytes */
+ int x86_cache_mbm_width_offset;
int x86_power;
unsigned long loops_per_jiffy;
-#ifdef CONFIG_SMP
- /* cpus sharing the last level cache: */
- cpumask_var_t llc_shared_map;
-#endif
- /* cpuid returned max cores value: */
- u16 x86_max_cores;
- u16 apicid;
- u16 initial_apicid;
+ /* protected processor identification number */
+ u64 ppin;
u16 x86_clflush_size;
-#ifdef CONFIG_SMP
/* number of cores as seen by the OS: */
u16 booted_cores;
- /* Physical processor id: */
- u16 phys_proc_id;
- /* Core id: */
- u16 cpu_core_id;
/* Index into per_cpu list: */
u16 cpu_index;
-#endif
- unsigned int x86_hyper_vendor;
-} __attribute__((__aligned__(SMP_CACHE_BYTES)));
+ /* Is SMT active on this core? */
+ bool smt_active;
+ u32 microcode;
+ /* Address space bits used by the cache internally */
+ u8 x86_cache_bits;
+ unsigned initialized : 1;
+} __randomize_layout;
#define X86_VENDOR_INTEL 0
#define X86_VENDOR_CYRIX 1
@@ -121,77 +196,66 @@ struct cpuinfo_x86 {
#define X86_VENDOR_CENTAUR 5
#define X86_VENDOR_TRANSMETA 7
#define X86_VENDOR_NSC 8
-#define X86_VENDOR_NUM 9
+#define X86_VENDOR_HYGON 9
+#define X86_VENDOR_ZHAOXIN 10
+#define X86_VENDOR_VORTEX 11
+#define X86_VENDOR_NUM 12
#define X86_VENDOR_UNKNOWN 0xff
-#define X86_HYPER_VENDOR_NONE 0
-#define X86_HYPER_VENDOR_VMWARE 1
-
/*
* capabilities of CPUs
*/
extern struct cpuinfo_x86 boot_cpu_data;
extern struct cpuinfo_x86 new_cpu_data;
-extern struct tss_struct doublefault_tss;
-extern __u32 cpu_caps_cleared[NCAPINTS];
-extern __u32 cpu_caps_set[NCAPINTS];
+extern __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS];
+extern __u32 cpu_caps_set[NCAPINTS + NBUGINTS];
-#ifdef CONFIG_SMP
-DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
+DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
#define cpu_data(cpu) per_cpu(cpu_info, cpu)
-#define current_cpu_data __get_cpu_var(cpu_info)
-#else
-#define cpu_data(cpu) boot_cpu_data
-#define current_cpu_data boot_cpu_data
-#endif
extern const struct seq_operations cpuinfo_op;
-static inline int hlt_works(int cpu)
-{
-#ifdef CONFIG_X86_32
- return cpu_data(cpu).hlt_works_ok;
-#else
- return 1;
-#endif
-}
-
#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
extern void cpu_detect(struct cpuinfo_x86 *c);
-extern struct pt_regs *idle_regs(struct pt_regs *);
+static inline unsigned long long l1tf_pfn_limit(void)
+{
+ return BIT_ULL(boot_cpu_data.x86_cache_bits - 1 - PAGE_SHIFT);
+}
+void init_cpu_devs(void);
+void get_cpu_vendor(struct cpuinfo_x86 *c);
extern void early_cpu_init(void);
-extern void identify_boot_cpu(void);
-extern void identify_secondary_cpu(struct cpuinfo_x86 *);
+extern void identify_secondary_cpu(unsigned int cpu);
extern void print_cpu_info(struct cpuinfo_x86 *);
-extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
-extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
-extern unsigned short num_cache_leaves;
+void print_cpu_msr(struct cpuinfo_x86 *);
-extern void detect_extended_topology(struct cpuinfo_x86 *c);
-extern void detect_ht(struct cpuinfo_x86 *c);
+/*
+ * Friendlier CR3 helpers.
+ */
+static inline unsigned long read_cr3_pa(void)
+{
+ return __read_cr3() & CR3_ADDR_MASK;
+}
-static inline void native_cpuid(unsigned int *eax, unsigned int *ebx,
- unsigned int *ecx, unsigned int *edx)
+static inline unsigned long native_read_cr3_pa(void)
{
- /* ecx is often an input as well as an output. */
- asm("cpuid"
- : "=a" (*eax),
- "=b" (*ebx),
- "=c" (*ecx),
- "=d" (*edx)
- : "0" (*eax), "2" (*ecx));
+ return __native_read_cr3() & CR3_ADDR_MASK;
}
static inline void load_cr3(pgd_t *pgdir)
{
- write_cr3(__pa(pgdir));
+ write_cr3(__sme_pa(pgdir));
}
+/*
+ * Note that while the legacy 'TSS' name comes from 'Task State Segment',
+ * on modern x86 CPUs the TSS also holds information important to 64-bit mode,
+ * unrelated to the task-switch mechanism:
+ */
#ifdef CONFIG_X86_32
/* This is the TSS defined by the hardware. */
struct x86_hw_tss {
@@ -199,8 +263,23 @@ struct x86_hw_tss {
unsigned long sp0;
unsigned short ss0, __ss0h;
unsigned long sp1;
- /* ss1 caches MSR_IA32_SYSENTER_CS: */
- unsigned short ss1, __ss1h;
+
+ /*
+ * We don't use ring 1, so ss1 is a convenient scratch space in
+ * the same cacheline as sp0. We use ss1 to cache the value in
+ * MSR_IA32_SYSENTER_CS. When we context switch
+ * MSR_IA32_SYSENTER_CS, we first check if the new value being
+ * written matches ss1, and, if it's not, then we wrmsr the new
+ * value and update ss1.
+ *
+ * The only reason we context switch MSR_IA32_SYSENTER_CS is
+ * that we set it to zero in vm86 tasks to avoid corrupting the
+ * stack if we were to go through the sysenter path from vm86
+ * mode.
+ */
+ unsigned short ss1; /* MSR_IA32_SYSENTER_CS */
+
+ unsigned short __ss1h;
unsigned long sp2;
unsigned short ss2, __ss2h;
unsigned long __cr3;
@@ -230,7 +309,14 @@ struct x86_hw_tss {
u32 reserved1;
u64 sp0;
u64 sp1;
+
+ /*
+ * Since Linux does not use ring 2, the 'sp2' slot is unused by
+ * hardware. entry_SYSCALL_64 uses it as scratch space to stash
+ * the user RSP value.
+ */
u64 sp2;
+
u64 reserved2;
u64 ist[7];
u32 reserved3;
@@ -238,23 +324,63 @@ struct x86_hw_tss {
u16 reserved5;
u16 io_bitmap_base;
-} __attribute__((packed)) ____cacheline_aligned;
+} __attribute__((packed));
#endif
/*
* IO-bitmap sizes:
*/
#define IO_BITMAP_BITS 65536
-#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
-#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
-#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap)
-#define INVALID_IO_BITMAP_OFFSET 0x8000
+#define IO_BITMAP_BYTES (IO_BITMAP_BITS / BITS_PER_BYTE)
+#define IO_BITMAP_LONGS (IO_BITMAP_BYTES / sizeof(long))
+
+#define IO_BITMAP_OFFSET_VALID_MAP \
+ (offsetof(struct tss_struct, io_bitmap.bitmap) - \
+ offsetof(struct tss_struct, x86_tss))
+
+#define IO_BITMAP_OFFSET_VALID_ALL \
+ (offsetof(struct tss_struct, io_bitmap.mapall) - \
+ offsetof(struct tss_struct, x86_tss))
+
+#ifdef CONFIG_X86_IOPL_IOPERM
+/*
+ * sizeof(unsigned long) coming from an extra "long" at the end of the
+ * iobitmap. The limit is inclusive, i.e. the last valid byte.
+ */
+# define __KERNEL_TSS_LIMIT \
+ (IO_BITMAP_OFFSET_VALID_ALL + IO_BITMAP_BYTES + \
+ sizeof(unsigned long) - 1)
+#else
+# define __KERNEL_TSS_LIMIT \
+ (offsetof(struct tss_struct, x86_tss) + sizeof(struct x86_hw_tss) - 1)
+#endif
+
+/* Base offset outside of TSS_LIMIT so unprivileged IO causes #GP */
+#define IO_BITMAP_OFFSET_INVALID (__KERNEL_TSS_LIMIT + 1)
+
+struct entry_stack {
+ char stack[PAGE_SIZE];
+};
+
+struct entry_stack_page {
+ struct entry_stack stack;
+} __aligned(PAGE_SIZE);
+
+/*
+ * All IO bitmap related data stored in the TSS:
+ */
+struct x86_io_bitmap {
+ /* The sequence number of the last active bitmap. */
+ u64 prev_sequence;
-struct tss_struct {
/*
- * The hardware state:
+ * Store the dirty size of the last io bitmap offender. The next
+ * one will have to do the cleanup as the switch out to a non io
+ * bitmap user will just set x86_tss.io_bitmap_base to a value
+ * outside of the TSS limit. So for sane tasks there is no need to
+ * actually touch the io_bitmap at all.
*/
- struct x86_hw_tss x86_tss;
+ unsigned int prev_max;
/*
* The extra 1 is there because the CPU will access an
@@ -262,578 +388,216 @@ struct tss_struct {
* bitmap. The extra byte must be all 1 bits, and must
* be within the limit.
*/
- unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
+ unsigned long bitmap[IO_BITMAP_LONGS + 1];
/*
- * .. and then another 0x100 bytes for the emergency kernel stack:
+ * Special I/O bitmap to emulate IOPL(3). All bytes zero,
+ * except the additional byte at the end.
*/
- unsigned long stack[64];
-
-} ____cacheline_aligned;
-
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss);
-
-/*
- * Save the original ist values for checking stack pointers during debugging
- */
-struct orig_ist {
- unsigned long ist[7];
-};
-
-#define MXCSR_DEFAULT 0x1f80
-
-struct i387_fsave_struct {
- u32 cwd; /* FPU Control Word */
- u32 swd; /* FPU Status Word */
- u32 twd; /* FPU Tag Word */
- u32 fip; /* FPU IP Offset */
- u32 fcs; /* FPU IP Selector */
- u32 foo; /* FPU Operand Pointer Offset */
- u32 fos; /* FPU Operand Pointer Selector */
-
- /* 8*10 bytes for each FP-reg = 80 bytes: */
- u32 st_space[20];
-
- /* Software status information [not touched by FSAVE ]: */
- u32 status;
+ unsigned long mapall[IO_BITMAP_LONGS + 1];
};
-struct i387_fxsave_struct {
- u16 cwd; /* Control Word */
- u16 swd; /* Status Word */
- u16 twd; /* Tag Word */
- u16 fop; /* Last Instruction Opcode */
- union {
- struct {
- u64 rip; /* Instruction Pointer */
- u64 rdp; /* Data Pointer */
- };
- struct {
- u32 fip; /* FPU IP Offset */
- u32 fcs; /* FPU IP Selector */
- u32 foo; /* FPU Operand Offset */
- u32 fos; /* FPU Operand Selector */
- };
- };
- u32 mxcsr; /* MXCSR Register State */
- u32 mxcsr_mask; /* MXCSR Mask */
-
- /* 8*16 bytes for each FP-reg = 128 bytes: */
- u32 st_space[32];
-
- /* 16*16 bytes for each XMM-reg = 256 bytes: */
- u32 xmm_space[64];
-
- u32 padding[12];
+struct tss_struct {
+ /*
+ * The fixed hardware portion. This must not cross a page boundary
+ * at risk of violating the SDM's advice and potentially triggering
+ * errata.
+ */
+ struct x86_hw_tss x86_tss;
- union {
- u32 padding1[12];
- u32 sw_reserved[12];
- };
+ struct x86_io_bitmap io_bitmap;
+} __aligned(PAGE_SIZE);
-} __attribute__((aligned(16)));
-
-struct i387_soft_struct {
- u32 cwd;
- u32 swd;
- u32 twd;
- u32 fip;
- u32 fcs;
- u32 foo;
- u32 fos;
- /* 8*10 bytes for each FP-reg = 80 bytes: */
- u32 st_space[20];
- u8 ftop;
- u8 changed;
- u8 lookahead;
- u8 no_update;
- u8 rm;
- u8 alimit;
- struct math_emu_info *info;
- u32 entry_eip;
-};
+DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw);
-struct ymmh_struct {
- /* 16 * 16 bytes for each YMMH-reg = 256 bytes */
- u32 ymmh_space[64];
-};
+/* Per CPU interrupt stacks */
+struct irq_stack {
+ char stack[IRQ_STACK_SIZE];
+} __aligned(IRQ_STACK_SIZE);
-struct xsave_hdr_struct {
- u64 xstate_bv;
- u64 reserved1[2];
- u64 reserved2[5];
-} __attribute__((packed));
+DECLARE_PER_CPU_CACHE_HOT(struct irq_stack *, hardirq_stack_ptr);
+#ifdef CONFIG_X86_64
+DECLARE_PER_CPU_CACHE_HOT(bool, hardirq_stack_inuse);
+#else
+DECLARE_PER_CPU_CACHE_HOT(struct irq_stack *, softirq_stack_ptr);
+#endif
-struct xsave_struct {
- struct i387_fxsave_struct i387;
- struct xsave_hdr_struct xsave_hdr;
- struct ymmh_struct ymmh;
- /* new processor state extensions will go here */
-} __attribute__ ((packed, aligned (64)));
-
-union thread_xstate {
- struct i387_fsave_struct fsave;
- struct i387_fxsave_struct fxsave;
- struct i387_soft_struct soft;
- struct xsave_struct xsave;
-};
+DECLARE_PER_CPU_CACHE_HOT(unsigned long, cpu_current_top_of_stack);
+/* const-qualified alias provided by the linker. */
+DECLARE_PER_CPU_CACHE_HOT(const unsigned long __percpu_seg_override,
+ const_cpu_current_top_of_stack);
#ifdef CONFIG_X86_64
-DECLARE_PER_CPU(struct orig_ist, orig_ist);
-
-union irq_stack_union {
- char irq_stack[IRQ_STACK_SIZE];
- /*
- * GCC hardcodes the stack canary as %gs:40. Since the
- * irq_stack is the object at %gs:0, we reserve the bottom
- * 48 bytes of the irq stack for the canary.
- */
- struct {
- char gs_base[40];
- unsigned long stack_canary;
- };
-};
+static inline unsigned long cpu_kernelmode_gs_base(int cpu)
+{
+#ifdef CONFIG_SMP
+ return per_cpu_offset(cpu);
+#else
+ return 0;
+#endif
+}
-DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union);
-DECLARE_INIT_PER_CPU(irq_stack_union);
+extern asmlinkage void entry_SYSCALL32_ignore(void);
-DECLARE_PER_CPU(char *, irq_stack_ptr);
-DECLARE_PER_CPU(unsigned int, irq_count);
-extern unsigned long kernel_eflags;
-extern asmlinkage void ignore_sysret(void);
-#else /* X86_64 */
-#ifdef CONFIG_CC_STACKPROTECTOR
-DECLARE_PER_CPU(unsigned long, stack_canary);
-#endif
+/* Save actual FS/GS selectors and bases to current->thread */
+void current_save_fsgs(void);
#endif /* X86_64 */
-extern unsigned int xstate_size;
-extern void free_thread_xstate(struct task_struct *);
-extern struct kmem_cache *task_xstate_cachep;
+struct perf_event;
struct thread_struct {
/* Cached TLS descriptors: */
struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
+#ifdef CONFIG_X86_32
unsigned long sp0;
+#endif
unsigned long sp;
#ifdef CONFIG_X86_32
unsigned long sysenter_cs;
#else
- unsigned long usersp; /* Copy from PDA */
unsigned short es;
unsigned short ds;
unsigned short fsindex;
unsigned short gsindex;
#endif
-#ifdef CONFIG_X86_32
- unsigned long ip;
-#endif
+
#ifdef CONFIG_X86_64
- unsigned long fs;
+ unsigned long fsbase;
+ unsigned long gsbase;
+#else
+ /*
+ * XXX: this could presumably be unsigned short. Alternatively,
+ * 32-bit kernels could be taught to use fsindex instead.
+ */
+ unsigned long fs;
+ unsigned long gs;
#endif
- unsigned long gs;
- /* Hardware debugging registers: */
- unsigned long debugreg0;
- unsigned long debugreg1;
- unsigned long debugreg2;
- unsigned long debugreg3;
- unsigned long debugreg6;
- unsigned long debugreg7;
+
+ /* Save middle states of ptrace breakpoints */
+ struct perf_event *ptrace_bps[HBP_NUM];
+ /* Debug status used for traps, single steps, etc... */
+ unsigned long virtual_dr6;
+ /* Keep track of the exact dr7 value set by the user */
+ unsigned long ptrace_dr7;
/* Fault info: */
unsigned long cr2;
- unsigned long trap_no;
+ unsigned long trap_nr;
unsigned long error_code;
- /* floating point and extended processor state */
- union thread_xstate *xstate;
-#ifdef CONFIG_X86_32
+#ifdef CONFIG_VM86
/* Virtual 86 mode info */
- struct vm86_struct __user *vm86_info;
- unsigned long screen_bitmap;
- unsigned long v86flags;
- unsigned long v86mask;
- unsigned long saved_sp0;
- unsigned int saved_fs;
- unsigned int saved_gs;
+ struct vm86 *vm86;
#endif
/* IO permissions: */
- unsigned long *io_bitmap_ptr;
- unsigned long iopl;
- /* Max allowed port in the bitmap, in bytes: */
- unsigned io_bitmap_max;
-/* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. */
- unsigned long debugctlmsr;
- /* Debug Store context; see asm/ds.h */
- struct ds_context *ds_ctx;
-};
+ struct io_bitmap *io_bitmap;
-static inline unsigned long native_get_debugreg(int regno)
-{
- unsigned long val = 0; /* Damn you, gcc! */
-
- switch (regno) {
- case 0:
- asm("mov %%db0, %0" :"=r" (val));
- break;
- case 1:
- asm("mov %%db1, %0" :"=r" (val));
- break;
- case 2:
- asm("mov %%db2, %0" :"=r" (val));
- break;
- case 3:
- asm("mov %%db3, %0" :"=r" (val));
- break;
- case 6:
- asm("mov %%db6, %0" :"=r" (val));
- break;
- case 7:
- asm("mov %%db7, %0" :"=r" (val));
- break;
- default:
- BUG();
- }
- return val;
-}
+ /*
+ * IOPL. Privilege level dependent I/O permission which is
+ * emulated via the I/O bitmap to prevent user space from disabling
+ * interrupts.
+ */
+ unsigned long iopl_emul;
-static inline void native_set_debugreg(int regno, unsigned long value)
-{
- switch (regno) {
- case 0:
- asm("mov %0, %%db0" ::"r" (value));
- break;
- case 1:
- asm("mov %0, %%db1" ::"r" (value));
- break;
- case 2:
- asm("mov %0, %%db2" ::"r" (value));
- break;
- case 3:
- asm("mov %0, %%db3" ::"r" (value));
- break;
- case 6:
- asm("mov %0, %%db6" ::"r" (value));
- break;
- case 7:
- asm("mov %0, %%db7" ::"r" (value));
- break;
- default:
- BUG();
- }
-}
+ unsigned int iopl_warn:1;
-/*
- * Set IOPL bits in EFLAGS from given mask
- */
-static inline void native_set_iopl_mask(unsigned mask)
-{
-#ifdef CONFIG_X86_32
- unsigned int reg;
-
- asm volatile ("pushfl;"
- "popl %0;"
- "andl %1, %0;"
- "orl %2, %0;"
- "pushl %0;"
- "popfl"
- : "=&r" (reg)
- : "i" (~X86_EFLAGS_IOPL), "r" (mask));
-#endif
-}
+ /*
+ * Protection Keys Register for Userspace. Loaded immediately on
+ * context switch. Store it in thread_struct to avoid a lookup in
+ * the task's FPU xstate buffer. This value is only valid when a
+ * task is scheduled out. For 'current' the authoritative source of
+ * PKRU is the hardware itself.
+ */
+ u32 pkru;
-static inline void
-native_load_sp0(struct tss_struct *tss, struct thread_struct *thread)
-{
- tss->x86_tss.sp0 = thread->sp0;
-#ifdef CONFIG_X86_32
- /* Only happens when SEP is enabled, no need to test "SEP"arately: */
- if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
- tss->x86_tss.ss1 = thread->sysenter_cs;
- wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
- }
-#endif
-}
+#ifdef CONFIG_X86_USER_SHADOW_STACK
+ unsigned long features;
+ unsigned long features_locked;
-static inline void native_swapgs(void)
-{
-#ifdef CONFIG_X86_64
- asm volatile("swapgs" ::: "memory");
+ struct thread_shstk shstk;
#endif
-}
+};
-#ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
+#ifdef CONFIG_X86_DEBUG_FPU
+extern struct fpu *x86_task_fpu(struct task_struct *task);
#else
-#define __cpuid native_cpuid
-#define paravirt_enabled() 0
-
-/*
- * These special macros can be used to get or set a debugging register
- */
-#define get_debugreg(var, register) \
- (var) = native_get_debugreg(register)
-#define set_debugreg(value, register) \
- native_set_debugreg(register, value)
-
-static inline void load_sp0(struct tss_struct *tss,
- struct thread_struct *thread)
-{
- native_load_sp0(tss, thread);
-}
-
-#define set_iopl_mask native_set_iopl_mask
-#endif /* CONFIG_PARAVIRT */
-
-/*
- * Save the cr4 feature set we're using (ie
- * Pentium 4MB enable and PPro Global page
- * enable), so that any CPU's that boot up
- * after us can get the correct flags.
- */
-extern unsigned long mmu_cr4_features;
-
-static inline void set_in_cr4(unsigned long mask)
-{
- unsigned cr4;
-
- mmu_cr4_features |= mask;
- cr4 = read_cr4();
- cr4 |= mask;
- write_cr4(cr4);
-}
-
-static inline void clear_in_cr4(unsigned long mask)
-{
- unsigned cr4;
-
- mmu_cr4_features &= ~mask;
- cr4 = read_cr4();
- cr4 &= ~mask;
- write_cr4(cr4);
-}
-
-typedef struct {
- unsigned long seg;
-} mm_segment_t;
-
-
-/*
- * create a kernel thread without removing it from tasklists
- */
-extern int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
-
-/* Free all resources held by a thread. */
-extern void release_thread(struct task_struct *);
-
-/* Prepare to copy thread state - unlazy all lazy state */
-extern void prepare_to_copy(struct task_struct *tsk);
-
-unsigned long get_wchan(struct task_struct *p);
-
-/*
- * Generic CPUID function
- * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
- * resulting in stale register contents being returned.
- */
-static inline void cpuid(unsigned int op,
- unsigned int *eax, unsigned int *ebx,
- unsigned int *ecx, unsigned int *edx)
-{
- *eax = op;
- *ecx = 0;
- __cpuid(eax, ebx, ecx, edx);
-}
-
-/* Some CPUID calls want 'count' to be placed in ecx */
-static inline void cpuid_count(unsigned int op, int count,
- unsigned int *eax, unsigned int *ebx,
- unsigned int *ecx, unsigned int *edx)
-{
- *eax = op;
- *ecx = count;
- __cpuid(eax, ebx, ecx, edx);
-}
-
-/*
- * CPUID functions returning a single datum
- */
-static inline unsigned int cpuid_eax(unsigned int op)
-{
- unsigned int eax, ebx, ecx, edx;
+# define x86_task_fpu(task) ((struct fpu *)((void *)(task) + sizeof(*(task))))
+#endif
- cpuid(op, &eax, &ebx, &ecx, &edx);
+extern void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size);
- return eax;
-}
-
-static inline unsigned int cpuid_ebx(unsigned int op)
+static inline void arch_thread_struct_whitelist(unsigned long *offset,
+ unsigned long *size)
{
- unsigned int eax, ebx, ecx, edx;
-
- cpuid(op, &eax, &ebx, &ecx, &edx);
-
- return ebx;
+ fpu_thread_struct_whitelist(offset, size);
}
-static inline unsigned int cpuid_ecx(unsigned int op)
+static inline void
+native_load_sp0(unsigned long sp0)
{
- unsigned int eax, ebx, ecx, edx;
-
- cpuid(op, &eax, &ebx, &ecx, &edx);
-
- return ecx;
+ this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
}
-static inline unsigned int cpuid_edx(unsigned int op)
+static __always_inline void native_swapgs(void)
{
- unsigned int eax, ebx, ecx, edx;
-
- cpuid(op, &eax, &ebx, &ecx, &edx);
-
- return edx;
+#ifdef CONFIG_X86_64
+ asm volatile("swapgs" ::: "memory");
+#endif
}
-/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
-static inline void rep_nop(void)
+static __always_inline unsigned long current_top_of_stack(void)
{
- asm volatile("rep; nop" ::: "memory");
-}
+ /*
+ * We can't read directly from tss.sp0: sp0 on x86_32 is special in
+ * and around vm86 mode and sp0 on x86_64 is special because of the
+ * entry trampoline.
+ */
+ if (IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT))
+ return this_cpu_read_const(const_cpu_current_top_of_stack);
-static inline void cpu_relax(void)
-{
- rep_nop();
+ return this_cpu_read_stable(cpu_current_top_of_stack);
}
-/* Stop speculative execution: */
-static inline void sync_core(void)
+static __always_inline bool on_thread_stack(void)
{
- int tmp;
-
- asm volatile("cpuid" : "=a" (tmp) : "0" (1)
- : "ebx", "ecx", "edx", "memory");
+ return (unsigned long)(current_top_of_stack() -
+ current_stack_pointer) < THREAD_SIZE;
}
-static inline void __monitor(const void *eax, unsigned long ecx,
- unsigned long edx)
-{
- /* "monitor %eax, %ecx, %edx;" */
- asm volatile(".byte 0x0f, 0x01, 0xc8;"
- :: "a" (eax), "c" (ecx), "d"(edx));
-}
+#ifdef CONFIG_PARAVIRT_XXL
+#include <asm/paravirt.h>
+#else
-static inline void __mwait(unsigned long eax, unsigned long ecx)
+static inline void load_sp0(unsigned long sp0)
{
- /* "mwait %eax, %ecx;" */
- asm volatile(".byte 0x0f, 0x01, 0xc9;"
- :: "a" (eax), "c" (ecx));
+ native_load_sp0(sp0);
}
-static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
-{
- trace_hardirqs_on();
- /* "mwait %eax, %ecx;" */
- asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
- :: "a" (eax), "c" (ecx));
-}
+#endif /* CONFIG_PARAVIRT_XXL */
-extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
+unsigned long __get_wchan(struct task_struct *p);
-extern void select_idle_routine(const struct cpuinfo_x86 *c);
-extern void init_c1e_mask(void);
+extern void select_idle_routine(void);
+extern void amd_e400_c1e_apic_setup(void);
extern unsigned long boot_option_idle_override;
-extern unsigned long idle_halt;
-extern unsigned long idle_nomwait;
-/*
- * on systems with caches, caches must be flashed as the absolute
- * last instruction before going into a suspended halt. Otherwise,
- * dirty data can linger in the cache and become stale on resume,
- * leading to strange errors.
- *
- * perform a variety of operations to guarantee that the compiler
- * will not reorder instructions. wbinvd itself is serializing
- * so the processor will not reorder.
- *
- * Systems without cache can just go into halt.
- */
-static inline void wbinvd_halt(void)
-{
- mb();
- /* check for clflush to determine if wbinvd is legal */
- if (cpu_has_clflush)
- asm volatile("cli; wbinvd; 1: hlt; jmp 1b" : : : "memory");
- else
- while (1)
- halt();
-}
+enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT,
+ IDLE_POLL};
extern void enable_sep_cpu(void);
-extern int sysenter_setup(void);
+
/* Defined in head.S */
extern struct desc_ptr early_gdt_descr;
-extern void cpu_set_gdt(int);
-extern void switch_to_new_gdt(int);
-extern void load_percpu_segment(int);
+extern void switch_gdt_and_percpu_base(int);
+extern void load_direct_gdt(int);
+extern void load_fixmap_gdt(int);
extern void cpu_init(void);
+extern void cpu_init_exception_handling(bool boot_cpu);
+extern void cpu_init_replace_early_idt(void);
+extern void cr4_init(void);
-static inline unsigned long get_debugctlmsr(void)
-{
- unsigned long debugctlmsr = 0;
-
-#ifndef CONFIG_X86_DEBUGCTLMSR
- if (boot_cpu_data.x86 < 6)
- return 0;
-#endif
- rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
-
- return debugctlmsr;
-}
-
-static inline unsigned long get_debugctlmsr_on_cpu(int cpu)
-{
- u64 debugctlmsr = 0;
- u32 val1, val2;
-
-#ifndef CONFIG_X86_DEBUGCTLMSR
- if (boot_cpu_data.x86 < 6)
- return 0;
-#endif
- rdmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR, &val1, &val2);
- debugctlmsr = val1 | ((u64)val2 << 32);
-
- return debugctlmsr;
-}
-
-static inline void update_debugctlmsr(unsigned long debugctlmsr)
-{
-#ifndef CONFIG_X86_DEBUGCTLMSR
- if (boot_cpu_data.x86 < 6)
- return;
-#endif
- wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
-}
-
-static inline void update_debugctlmsr_on_cpu(int cpu,
- unsigned long debugctlmsr)
-{
-#ifndef CONFIG_X86_DEBUGCTLMSR
- if (boot_cpu_data.x86 < 6)
- return;
-#endif
- wrmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR,
- (u32)((u64)debugctlmsr),
- (u32)((u64)debugctlmsr >> 32));
-}
-
-/*
- * from system description table in BIOS. Mostly for MCA use, but
- * others may find it useful:
- */
-extern unsigned int machine_id;
-extern unsigned int machine_submodel_id;
-extern unsigned int BIOS_revision;
+extern void set_task_blockstep(struct task_struct *task, bool on);
/* Boot loader type from the setup header: */
extern int bootloader_type;
@@ -843,13 +607,12 @@ extern char ignore_fpu_irq;
#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
#define ARCH_HAS_PREFETCHW
-#define ARCH_HAS_SPINLOCK_PREFETCH
#ifdef CONFIG_X86_32
-# define BASE_PREFETCH ASM_NOP4
+# define BASE_PREFETCH ""
# define ARCH_HAS_PREFETCH
#else
-# define BASE_PREFETCH "prefetcht0 (%1)"
+# define BASE_PREFETCH "prefetcht0 %1"
#endif
/*
@@ -860,10 +623,9 @@ extern char ignore_fpu_irq;
*/
static inline void prefetch(const void *x)
{
- alternative_input(BASE_PREFETCH,
- "prefetchnta (%1)",
+ alternative_input(BASE_PREFETCH, "prefetchnta %1",
X86_FEATURE_XMM,
- "r" (x));
+ "m" (*(const char *)x));
}
/*
@@ -871,133 +633,143 @@ static inline void prefetch(const void *x)
* Useful for spinlocks to avoid one state transition in the
* cache coherency protocol:
*/
-static inline void prefetchw(const void *x)
+static __always_inline void prefetchw(const void *x)
{
- alternative_input(BASE_PREFETCH,
- "prefetchw (%1)",
- X86_FEATURE_3DNOW,
- "r" (x));
+ alternative_input(BASE_PREFETCH, "prefetchw %1",
+ X86_FEATURE_3DNOWPREFETCH,
+ "m" (*(const char *)x));
}
-static inline void spin_lock_prefetch(const void *x)
-{
- prefetchw(x);
-}
+#define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \
+ TOP_OF_KERNEL_STACK_PADDING)
-#ifdef CONFIG_X86_32
-/*
- * User space process size: 3GB (default).
- */
-#define TASK_SIZE PAGE_OFFSET
-#define TASK_SIZE_MAX TASK_SIZE
-#define STACK_TOP TASK_SIZE
-#define STACK_TOP_MAX STACK_TOP
+#define task_top_of_stack(task) ((unsigned long)(task_pt_regs(task) + 1))
+#define task_pt_regs(task) \
+({ \
+ unsigned long __ptr = (unsigned long)task_stack_page(task); \
+ __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \
+ ((struct pt_regs *)__ptr) - 1; \
+})
+
+#ifdef CONFIG_X86_32
#define INIT_THREAD { \
- .sp0 = sizeof(init_stack) + (long)&init_stack, \
- .vm86_info = NULL, \
+ .sp0 = TOP_OF_INIT_STACK, \
.sysenter_cs = __KERNEL_CS, \
- .io_bitmap_ptr = NULL, \
}
-/*
- * Note that the .io_bitmap member must be extra-big. This is because
- * the CPU will access an additional byte beyond the end of the IO
- * permission bitmap. The extra byte must be all 1 bits, and must
- * be within the limit.
- */
-#define INIT_TSS { \
- .x86_tss = { \
- .sp0 = sizeof(init_stack) + (long)&init_stack, \
- .ss0 = __KERNEL_DS, \
- .ss1 = __KERNEL_CS, \
- .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
- }, \
- .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \
+#else
+extern unsigned long __top_init_kernel_stack[];
+
+#define INIT_THREAD { \
+ .sp = (unsigned long)&__top_init_kernel_stack, \
}
-extern unsigned long thread_saved_pc(struct task_struct *tsk);
+#endif /* CONFIG_X86_64 */
-#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long))
-#define KSTK_TOP(info) \
-({ \
- unsigned long *__ptr = (unsigned long *)(info); \
- (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \
-})
+extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
+ unsigned long new_sp);
/*
- * The below -8 is to reserve 8 bytes on top of the ring0 stack.
- * This is necessary to guarantee that the entire "struct pt_regs"
- * is accessable even if the CPU haven't stored the SS/ESP registers
- * on the stack (interrupt gate does not save these registers
- * when switching to the same priv ring).
- * Therefore beware: accessing the ss/esp fields of the
- * "struct pt_regs" is possible, but they may contain the
- * completely wrong values.
+ * This decides where the kernel will search for a free chunk of vm
+ * space during mmap's.
*/
-#define task_pt_regs(task) \
-({ \
- struct pt_regs *__regs__; \
- __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
- __regs__ - 1; \
-})
+#define __TASK_UNMAPPED_BASE(task_size) (PAGE_ALIGN(task_size / 3))
+#define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE_LOW)
+#define KSTK_EIP(task) (task_pt_regs(task)->ip)
#define KSTK_ESP(task) (task_pt_regs(task)->sp)
-#else
-/*
- * User space process size. 47bits minus one guard page.
- */
-#define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE)
-
-/* This decides where the kernel will search for a free chunk of vm
- * space during mmap's.
- */
-#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \
- 0xc0000000 : 0xFFFFe000)
+/* Get/set a process' ability to use the timestamp counter instruction */
+#define GET_TSC_CTL(adr) get_tsc_mode((adr))
+#define SET_TSC_CTL(val) set_tsc_mode((val))
-#define TASK_SIZE (test_thread_flag(TIF_IA32) ? \
- IA32_PAGE_OFFSET : TASK_SIZE_MAX)
-#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? \
- IA32_PAGE_OFFSET : TASK_SIZE_MAX)
+extern int get_tsc_mode(unsigned long adr);
+extern int set_tsc_mode(unsigned int val);
-#define STACK_TOP TASK_SIZE
-#define STACK_TOP_MAX TASK_SIZE_MAX
+DECLARE_PER_CPU(u64, msr_misc_features_shadow);
-#define INIT_THREAD { \
- .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
+static inline u32 per_cpu_llc_id(unsigned int cpu)
+{
+ return per_cpu(cpu_info.topo.llc_id, cpu);
}
-#define INIT_TSS { \
- .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
+static inline u32 per_cpu_l2c_id(unsigned int cpu)
+{
+ return per_cpu(cpu_info.topo.l2c_id, cpu);
}
+#ifdef CONFIG_CPU_SUP_AMD
/*
- * Return saved PC of a blocked thread.
- * What is this good for? it will be always the scheduler or ret_from_fork.
+ * Issue a DIV 0/1 insn to clear any division data from previous DIV
+ * operations.
*/
-#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8))
+static __always_inline void amd_clear_divider(void)
+{
+ asm volatile(ALTERNATIVE("", "div %2\n\t", X86_BUG_DIV0)
+ :: "a" (0), "d" (0), "r" (1));
+}
-#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
-#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
-#endif /* CONFIG_X86_64 */
+extern void amd_check_microcode(void);
+#else
+static inline void amd_clear_divider(void) { }
+static inline void amd_check_microcode(void) { }
+#endif
-extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
- unsigned long new_sp);
+extern unsigned long arch_align_stack(unsigned long sp);
+void free_init_pages(const char *what, unsigned long begin, unsigned long end);
+extern void free_kernel_image_pages(const char *what, void *begin, void *end);
-/*
- * This decides where the kernel will search for a free chunk of vm
- * space during mmap's.
- */
-#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3))
+void default_idle(void);
+#ifdef CONFIG_XEN
+bool xen_set_default_idle(void);
+#else
+#define xen_set_default_idle 0
+#endif
-#define KSTK_EIP(task) (task_pt_regs(task)->ip)
+void __noreturn stop_this_cpu(void *dummy);
+void microcode_check(struct cpuinfo_x86 *prev_info);
+void store_cpu_caps(struct cpuinfo_x86 *info);
-/* Get/set a process' ability to use the timestamp counter instruction */
-#define GET_TSC_CTL(adr) get_tsc_mode((adr))
-#define SET_TSC_CTL(val) set_tsc_mode((val))
+DECLARE_PER_CPU(bool, cache_state_incoherent);
-extern int get_tsc_mode(unsigned long adr);
-extern int set_tsc_mode(unsigned int val);
+enum l1tf_mitigations {
+ L1TF_MITIGATION_OFF,
+ L1TF_MITIGATION_AUTO,
+ L1TF_MITIGATION_FLUSH_NOWARN,
+ L1TF_MITIGATION_FLUSH,
+ L1TF_MITIGATION_FLUSH_NOSMT,
+ L1TF_MITIGATION_FULL,
+ L1TF_MITIGATION_FULL_FORCE
+};
+
+extern enum l1tf_mitigations l1tf_mitigation;
+
+enum mds_mitigations {
+ MDS_MITIGATION_OFF,
+ MDS_MITIGATION_AUTO,
+ MDS_MITIGATION_FULL,
+ MDS_MITIGATION_VMWERV,
+};
+
+extern bool gds_ucode_mitigated(void);
+
+/*
+ * Make previous memory operations globally visible before
+ * a WRMSR.
+ *
+ * MFENCE makes writes visible, but only affects load/store
+ * instructions. WRMSR is unfortunately not a load/store
+ * instruction and is unaffected by MFENCE. The LFENCE ensures
+ * that the WRMSR is not reordered.
+ *
+ * Most WRMSRs are full serializing instructions themselves and
+ * do not require this barrier. This is only required for the
+ * IA32_TSC_DEADLINE and X2APIC MSRs.
+ */
+static inline void weak_wrmsr_fence(void)
+{
+ alternative("mfence; lfence", "", ALT_NOT(X86_FEATURE_APIC_MSRS_FENCE));
+}
#endif /* _ASM_X86_PROCESSOR_H */
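
The weak_wrmsr_fence() helper added above is meant to be called immediately before a non-serializing MSR write so that earlier stores become globally visible first. A minimal sketch of that usage pattern, assuming the long-standing wrmsrl(), APIC_BASE_MSR and APIC_ICR definitions from <asm/msr.h> and <asm/apicdef.h>; the function and the shared flag are hypothetical:

#include <asm/apicdef.h>
#include <asm/msr.h>

static void example_post_work_and_send_ipi(u64 icr_val, u32 *shared_flag)
{
	/* publish the work item; must be visible before the IPI is observed */
	WRITE_ONCE(*shared_flag, 1);

	/* MFENCE;LFENCE on parts whose WRMSR is not itself serializing */
	weak_wrmsr_fence();

	/* the x2APIC ICR is one of the non-serializing MSR writes in question */
	wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), icr_val);
}
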
diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h
new file mode 100644
index 000000000000..5d0dbab85264
--- /dev/null
+++ b/arch/x86/include/asm/prom.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Definitions for Device tree / OpenFirmware handling on X86
+ *
+ * based on arch/powerpc/include/asm/prom.h which is
+ * Copyright (C) 1996-2005 Paul Mackerras.
+ */
+
+#ifndef _ASM_X86_PROM_H
+#define _ASM_X86_PROM_H
+#ifndef __ASSEMBLER__
+
+#include <linux/of.h>
+#include <linux/types.h>
+#include <linux/pci.h>
+
+#include <asm/irq.h>
+#include <linux/atomic.h>
+#include <asm/setup.h>
+
+#ifdef CONFIG_OF
+extern int of_ioapic;
+extern u64 initial_dtb;
+extern void add_dtb(u64 data);
+void x86_of_pci_init(void);
+void x86_flattree_get_config(void);
+#else
+static inline void add_dtb(u64 data) { }
+static inline void x86_of_pci_init(void) { }
+static inline void x86_flattree_get_config(void) { }
+#define of_ioapic 0
+#endif
+
+extern char cmd_line[COMMAND_LINE_SIZE];
+
+#endif /* __ASSEMBLER__ */
+#endif
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index 621f56d73121..05224a695872 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -1,35 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PROTO_H
#define _ASM_X86_PROTO_H
#include <asm/ldt.h>
-/* misc architecture specific prototypes */
-
-extern void early_idt_handler(void);
-
-extern void system_call(void);
-extern void syscall_init(void);
+struct task_struct;
-extern void ia32_syscall(void);
-extern void ia32_cstar_target(void);
-extern void ia32_sysenter_target(void);
-
-extern void syscall32_cpu_init(void);
+/* misc architecture specific prototypes */
-extern void check_efer(void);
+void syscall_init(void);
+
+#ifdef CONFIG_X86_64
+void entry_SYSCALL_64(void);
+void entry_SYSCALL_64_safe_stack(void);
+void entry_SYSRETQ_unsafe_stack(void);
+void entry_SYSRETQ_end(void);
+#endif
+
+#ifdef CONFIG_X86_32
+void entry_INT80_32(void);
+void entry_SYSENTER_32(void);
+void __begin_SYSENTER_singlestep_region(void);
+void __end_SYSENTER_singlestep_region(void);
+#endif
+
+#ifdef CONFIG_IA32_EMULATION
+void entry_SYSENTER_compat(void);
+void __end_entry_SYSENTER_compat(void);
+void entry_SYSCALL_compat(void);
+void entry_SYSCALL_compat_safe_stack(void);
+void entry_SYSRETL_compat_unsafe_stack(void);
+void entry_SYSRETL_compat_end(void);
+#else /* !CONFIG_IA32_EMULATION */
+#define entry_SYSCALL_compat NULL
+#define entry_SYSENTER_compat NULL
+#endif
+
+void x86_configure_nx(void);
extern int reboot_force;
-long do_arch_prctl(struct task_struct *task, int code, unsigned long addr);
-
-/*
- * This looks more complex than it should be. But we need to
- * get the type for the ~ right in round_down (it needs to be
- * as wide as the result!), and we want to evaluate the macro
- * arguments just once each.
- */
-#define __round_mask(x,y) ((__typeof__(x))((y)-1))
-#define round_up(x,y) ((((x)-1) | __round_mask(x,y))+1)
-#define round_down(x,y) ((x) & ~__round_mask(x,y))
+long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2);
#endif /* _ASM_X86_PROTO_H */
diff --git a/arch/x86/include/asm/pti.h b/arch/x86/include/asm/pti.h
new file mode 100644
index 000000000000..88d0a1ab1f77
--- /dev/null
+++ b/arch/x86/include/asm/pti.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PTI_H
+#define _ASM_X86_PTI_H
+#ifndef __ASSEMBLER__
+
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
+extern void pti_init(void);
+extern void pti_check_boottime_disable(void);
+extern void pti_finalize(void);
+#else
+static inline void pti_check_boottime_disable(void) { }
+#endif
+
+#endif /* __ASSEMBLER__ */
+#endif /* _ASM_X86_PTI_H */
diff --git a/arch/x86/include/asm/ptrace-abi.h b/arch/x86/include/asm/ptrace-abi.h
deleted file mode 100644
index 86723035a515..000000000000
--- a/arch/x86/include/asm/ptrace-abi.h
+++ /dev/null
@@ -1,142 +0,0 @@
-#ifndef _ASM_X86_PTRACE_ABI_H
-#define _ASM_X86_PTRACE_ABI_H
-
-#ifdef __i386__
-
-#define EBX 0
-#define ECX 1
-#define EDX 2
-#define ESI 3
-#define EDI 4
-#define EBP 5
-#define EAX 6
-#define DS 7
-#define ES 8
-#define FS 9
-#define GS 10
-#define ORIG_EAX 11
-#define EIP 12
-#define CS 13
-#define EFL 14
-#define UESP 15
-#define SS 16
-#define FRAME_SIZE 17
-
-#else /* __i386__ */
-
-#if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS)
-#define R15 0
-#define R14 8
-#define R13 16
-#define R12 24
-#define RBP 32
-#define RBX 40
-/* arguments: interrupts/non tracing syscalls only save upto here*/
-#define R11 48
-#define R10 56
-#define R9 64
-#define R8 72
-#define RAX 80
-#define RCX 88
-#define RDX 96
-#define RSI 104
-#define RDI 112
-#define ORIG_RAX 120 /* = ERROR */
-/* end of arguments */
-/* cpu exception frame or undefined in case of fast syscall. */
-#define RIP 128
-#define CS 136
-#define EFLAGS 144
-#define RSP 152
-#define SS 160
-#define ARGOFFSET R11
-#endif /* __ASSEMBLY__ */
-
-/* top of stack page */
-#define FRAME_SIZE 168
-
-#endif /* !__i386__ */
-
-/* Arbitrarily choose the same ptrace numbers as used by the Sparc code. */
-#define PTRACE_GETREGS 12
-#define PTRACE_SETREGS 13
-#define PTRACE_GETFPREGS 14
-#define PTRACE_SETFPREGS 15
-#define PTRACE_GETFPXREGS 18
-#define PTRACE_SETFPXREGS 19
-
-#define PTRACE_OLDSETOPTIONS 21
-
-/* only useful for access 32bit programs / kernels */
-#define PTRACE_GET_THREAD_AREA 25
-#define PTRACE_SET_THREAD_AREA 26
-
-#ifdef __x86_64__
-# define PTRACE_ARCH_PRCTL 30
-#endif
-
-#define PTRACE_SYSEMU 31
-#define PTRACE_SYSEMU_SINGLESTEP 32
-
-#define PTRACE_SINGLEBLOCK 33 /* resume execution until next branch */
-
-#ifndef __ASSEMBLY__
-#include <linux/types.h>
-
-/* configuration/status structure used in PTRACE_BTS_CONFIG and
- PTRACE_BTS_STATUS commands.
-*/
-struct ptrace_bts_config {
- /* requested or actual size of BTS buffer in bytes */
- __u32 size;
- /* bitmask of below flags */
- __u32 flags;
- /* buffer overflow signal */
- __u32 signal;
- /* actual size of bts_struct in bytes */
- __u32 bts_size;
-};
-#endif /* __ASSEMBLY__ */
-
-#define PTRACE_BTS_O_TRACE 0x1 /* branch trace */
-#define PTRACE_BTS_O_SCHED 0x2 /* scheduling events w/ jiffies */
-#define PTRACE_BTS_O_SIGNAL 0x4 /* send SIG<signal> on buffer overflow
- instead of wrapping around */
-#define PTRACE_BTS_O_ALLOC 0x8 /* (re)allocate buffer */
-
-#define PTRACE_BTS_CONFIG 40
-/* Configure branch trace recording.
- ADDR points to a struct ptrace_bts_config.
- DATA gives the size of that buffer.
- A new buffer is allocated, if requested in the flags.
- An overflow signal may only be requested for new buffers.
- Returns the number of bytes read.
-*/
-#define PTRACE_BTS_STATUS 41
-/* Return the current configuration in a struct ptrace_bts_config
- pointed to by ADDR; DATA gives the size of that buffer.
- Returns the number of bytes written.
-*/
-#define PTRACE_BTS_SIZE 42
-/* Return the number of available BTS records for draining.
- DATA and ADDR are ignored.
-*/
-#define PTRACE_BTS_GET 43
-/* Get a single BTS record.
- DATA defines the index into the BTS array, where 0 is the newest
- entry, and higher indices refer to older entries.
- ADDR is pointing to struct bts_struct (see asm/ds.h).
-*/
-#define PTRACE_BTS_CLEAR 44
-/* Clear the BTS buffer.
- DATA and ADDR are ignored.
-*/
-#define PTRACE_BTS_DRAIN 45
-/* Read all available BTS records and clear the buffer.
- ADDR points to an array of struct bts_struct.
- DATA gives the size of that buffer.
- BTS records are read from oldest to newest.
- Returns number of BTS records drained.
-*/
-
-#endif /* _ASM_X86_PTRACE_ABI_H */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 0f0d908349aa..35d062a2e304 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -1,45 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PTRACE_H
#define _ASM_X86_PTRACE_H
-#include <linux/compiler.h> /* For __user */
-#include <asm/ptrace-abi.h>
-#include <asm/processor-flags.h>
-
-#ifdef __KERNEL__
#include <asm/segment.h>
-#endif
-
-#ifndef __ASSEMBLY__
+#include <asm/page_types.h>
+#include <uapi/asm/ptrace.h>
+#ifndef __ASSEMBLER__
#ifdef __i386__
-/* this struct defines the way the registers are stored on the
- stack during a system call. */
-
-#ifndef __KERNEL__
-
-struct pt_regs {
- long ebx;
- long ecx;
- long edx;
- long esi;
- long edi;
- long ebp;
- long eax;
- int xds;
- int xes;
- int xfs;
- int xgs;
- long orig_eax;
- long eip;
- int xcs;
- long eflags;
- long esp;
- int xss;
-};
-
-#else /* __KERNEL__ */
struct pt_regs {
+ /*
+	 * NB: 32-bit x86 CPUs are inconsistent as to what happens in the
+ * following cases (where %seg represents a segment register):
+ *
+ * - pushl %seg: some do a 16-bit write and leave the high
+ * bits alone
+ * - movl %seg, [mem]: some do a 16-bit write despite the movl
+ * - IDT entry: some (e.g. 486) will leave the high bits of CS
+ * and (if applicable) SS undefined.
+ *
+ * Fortunately, x86-32 doesn't read the high bits on POP or IRET,
+ * so we can just treat all of the segment registers as 16-bit
+ * values.
+ */
unsigned long bx;
unsigned long cx;
unsigned long dx;
@@ -47,62 +31,89 @@ struct pt_regs {
unsigned long di;
unsigned long bp;
unsigned long ax;
- unsigned long ds;
- unsigned long es;
- unsigned long fs;
- unsigned long gs;
+ unsigned short ds;
+ unsigned short __dsh;
+ unsigned short es;
+ unsigned short __esh;
+ unsigned short fs;
+ unsigned short __fsh;
+ /*
+ * On interrupt, gs and __gsh store the vector number. They never
+ * store gs any more.
+ */
+ unsigned short gs;
+ unsigned short __gsh;
+ /* On interrupt, this is the error code. */
unsigned long orig_ax;
unsigned long ip;
- unsigned long cs;
+ unsigned short cs;
+ unsigned short __csh;
unsigned long flags;
unsigned long sp;
- unsigned long ss;
+ unsigned short ss;
+ unsigned short __ssh;
};
-#endif /* __KERNEL__ */
-
#else /* __i386__ */
-#ifndef __KERNEL__
-
-struct pt_regs {
- unsigned long r15;
- unsigned long r14;
- unsigned long r13;
- unsigned long r12;
- unsigned long rbp;
- unsigned long rbx;
-/* arguments: non interrupts/non tracing syscalls only save upto here*/
- unsigned long r11;
- unsigned long r10;
- unsigned long r9;
- unsigned long r8;
- unsigned long rax;
- unsigned long rcx;
- unsigned long rdx;
- unsigned long rsi;
- unsigned long rdi;
- unsigned long orig_rax;
-/* end of arguments */
-/* cpu exception frame or undefined */
- unsigned long rip;
- unsigned long cs;
- unsigned long eflags;
- unsigned long rsp;
- unsigned long ss;
-/* top of stack page */
+struct fred_cs {
+ /* CS selector */
+ u64 cs : 16,
+ /* Stack level at event time */
+ sl : 2,
+ /* IBT in WAIT_FOR_ENDBRANCH state */
+ wfe : 1,
+ : 45;
};
-#else /* __KERNEL__ */
+struct fred_ss {
+ /* SS selector */
+ u64 ss : 16,
+ /* STI state */
+ sti : 1,
+ /* Set if syscall, sysenter or INT n */
+ swevent : 1,
+ /* Event is NMI type */
+ nmi : 1,
+ : 13,
+ /* Event vector */
+ vector : 8,
+ : 8,
+ /* Event type */
+ type : 4,
+ : 4,
+ /* Event was incident to enclave execution */
+ enclave : 1,
+ /* CPU was in 64-bit mode */
+ l : 1,
+ /*
+ * Nested exception during FRED delivery, not set
+ * for #DF.
+ */
+ nested : 1,
+ : 1,
+ /*
+ * The length of the instruction causing the event.
+ * Only set for INTO, INT1, INT3, INT n, SYSCALL
+ * and SYSENTER. 0 otherwise.
+ */
+ insnlen : 4;
+};
struct pt_regs {
+ /*
+ * C ABI says these regs are callee-preserved. They aren't saved on
+ * kernel entry unless syscall needs a complete, fully filled
+ * "struct pt_regs".
+ */
unsigned long r15;
unsigned long r14;
unsigned long r13;
unsigned long r12;
unsigned long bp;
unsigned long bx;
-/* arguments: non interrupts/non tracing syscalls only save upto here*/
+
+ /* These regs are callee-clobbered. Always saved on kernel entry. */
unsigned long r11;
unsigned long r10;
unsigned long r9;
@@ -112,24 +123,59 @@ struct pt_regs {
unsigned long dx;
unsigned long si;
unsigned long di;
+
+ /*
+ * orig_ax is used on entry for:
+ * - the syscall number (syscall, sysenter, int80)
+ * - error_code stored by the CPU on traps and exceptions
+ * - the interrupt number for device interrupts
+ *
+ * A FRED stack frame starts here:
+ * 1) It _always_ includes an error code;
+ *
+ * 2) The return frame for ERET[US] starts here, but
+ * the content of orig_ax is ignored.
+ */
unsigned long orig_ax;
-/* end of arguments */
-/* cpu exception frame or undefined */
+
+ /* The IRETQ return frame starts here */
unsigned long ip;
- unsigned long cs;
+
+ union {
+ /* CS selector */
+ u16 cs;
+ /* The extended 64-bit data slot containing CS */
+ u64 csx;
+ /* The FRED CS extension */
+ struct fred_cs fred_cs;
+ };
+
unsigned long flags;
unsigned long sp;
- unsigned long ss;
-/* top of stack page */
+
+ union {
+ /* SS selector */
+ u16 ss;
+ /* The extended 64-bit data slot containing SS */
+ u64 ssx;
+ /* The FRED SS extension */
+ struct fred_ss fred_ss;
+ };
+
+ /*
+ * Top of stack on IDT systems, while FRED systems have extra fields
+ * defined above for storing exception related information, e.g. CR2 or
+ * DR6.
+ */
};
-#endif /* __KERNEL__ */
#endif /* !__i386__ */
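
With the cs/ss unions above, existing IDT-oriented code that reads only regs->cs and regs->ss keeps working unchanged, while FRED-aware code can look at the extra event metadata. A small illustrative sketch (the helper names are hypothetical; the real dispatch lives in the FRED entry code):

static inline bool example_event_was_nmi(const struct pt_regs *regs)
{
	return regs->fred_ss.nmi;		/* event was of NMI type */
}

static inline unsigned int example_event_vector(const struct pt_regs *regs)
{
	return regs->fred_ss.vector;		/* hardware event vector */
}
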
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt_types.h>
+#endif
-#ifdef __KERNEL__
-
-#include <linux/init.h>
+#include <asm/proto.h>
struct cpuinfo_x86;
struct task_struct;
@@ -138,112 +184,290 @@ extern unsigned long profile_pc(struct pt_regs *regs);
extern unsigned long
convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs);
-extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
- int error_code, int si_code);
-void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
+extern void send_sigtrap(struct pt_regs *regs, int error_code, int si_code);
-extern long syscall_trace_enter(struct pt_regs *);
-extern void syscall_trace_leave(struct pt_regs *);
-static inline unsigned long regs_return_value(struct pt_regs *regs)
+static __always_inline unsigned long regs_return_value(struct pt_regs *regs)
{
return regs->ax;
}
+static __always_inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc)
+{
+ regs->ax = rc;
+}
+
/*
- * user_mode_vm(regs) determines whether a register set came from user mode.
- * This is true if V8086 mode was enabled OR if the register set was from
- * protected mode with RPL-3 CS value. This tricky test checks that with
- * one comparison. Many places in the kernel can bypass this full check
- * if they have already ruled out V8086 mode, so user_mode(regs) can be used.
+ * user_mode(regs) determines whether a register set came from user
+ * mode. On x86_32, this is true if V8086 mode was enabled OR if the
+ * register set was from protected mode with RPL-3 CS value. This
+ * tricky test checks that with one comparison.
+ *
+ * On x86_64, vm86 mode is mercifully nonexistent, and we don't need
+ * the extra check.
*/
-static inline int user_mode(struct pt_regs *regs)
+static __always_inline int user_mode(struct pt_regs *regs)
{
#ifdef CONFIG_X86_32
- return (regs->cs & SEGMENT_RPL_MASK) == USER_RPL;
+ return ((regs->cs & SEGMENT_RPL_MASK) | (regs->flags & X86_VM_MASK)) >= USER_RPL;
#else
return !!(regs->cs & 3);
#endif
}
-static inline int user_mode_vm(struct pt_regs *regs)
+static __always_inline int v8086_mode(struct pt_regs *regs)
{
#ifdef CONFIG_X86_32
- return ((regs->cs & SEGMENT_RPL_MASK) | (regs->flags & X86_VM_MASK)) >=
- USER_RPL;
+ return (regs->flags & X86_VM_MASK);
#else
- return user_mode(regs);
+ return 0; /* No V86 mode support in long mode */
#endif
}
-static inline int v8086_mode(struct pt_regs *regs)
+static inline bool user_64bit_mode(struct pt_regs *regs)
{
-#ifdef CONFIG_X86_32
- return (regs->flags & X86_VM_MASK);
+#ifdef CONFIG_X86_64
+#ifndef CONFIG_PARAVIRT_XXL
+ /*
+ * On non-paravirt systems, this is the only long mode CPL 3
+ * selector. We do not allow long mode selectors in the LDT.
+ */
+ return regs->cs == __USER_CS;
#else
- return 0; /* No V86 mode support in long mode */
+ /* Headers are too twisted for this to go in paravirt.h. */
+ return regs->cs == __USER_CS || regs->cs == pv_info.extra_user_64bit_cs;
+#endif
+#else /* !CONFIG_X86_64 */
+ return false;
#endif
}
/*
- * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode
- * when it traps. The previous stack will be directly underneath the saved
- * registers, and 'sp/ss' won't even have been saved. Thus the '&regs->sp'.
- *
- * This is valid only for kernel mode traps.
+ * Determine whether the register set came from any context that is running in
+ * 64-bit mode.
*/
-static inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
+static inline bool any_64bit_mode(struct pt_regs *regs)
{
-#ifdef CONFIG_X86_32
- return (unsigned long)(&regs->sp);
+#ifdef CONFIG_X86_64
+ return !user_mode(regs) || user_64bit_mode(regs);
#else
- return regs->sp;
+ return false;
+#endif
+}
+
+#ifdef CONFIG_X86_64
+#define current_user_stack_pointer() current_pt_regs()->sp
+#define compat_user_stack_pointer() current_pt_regs()->sp
+
+static __always_inline bool ip_within_syscall_gap(struct pt_regs *regs)
+{
+ bool ret = (regs->ip >= (unsigned long)entry_SYSCALL_64 &&
+ regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack);
+
+ ret = ret || (regs->ip >= (unsigned long)entry_SYSRETQ_unsafe_stack &&
+ regs->ip < (unsigned long)entry_SYSRETQ_end);
+#ifdef CONFIG_IA32_EMULATION
+ ret = ret || (regs->ip >= (unsigned long)entry_SYSCALL_compat &&
+ regs->ip < (unsigned long)entry_SYSCALL_compat_safe_stack);
+ ret = ret || (regs->ip >= (unsigned long)entry_SYSRETL_compat_unsafe_stack &&
+ regs->ip < (unsigned long)entry_SYSRETL_compat_end);
+#endif
+
+ return ret;
+}
#endif
+
+static __always_inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
+{
+ return regs->sp;
}
-static inline unsigned long instruction_pointer(struct pt_regs *regs)
+static __always_inline unsigned long instruction_pointer(struct pt_regs *regs)
{
return regs->ip;
}
-static inline unsigned long frame_pointer(struct pt_regs *regs)
+static __always_inline
+void instruction_pointer_set(struct pt_regs *regs, unsigned long val)
+{
+ regs->ip = val;
+}
+
+static __always_inline unsigned long frame_pointer(struct pt_regs *regs)
{
return regs->bp;
}
-static inline unsigned long user_stack_pointer(struct pt_regs *regs)
+static __always_inline unsigned long user_stack_pointer(struct pt_regs *regs)
{
return regs->sp;
}
-/*
- * These are defined as per linux/ptrace.h, which see.
+static __always_inline
+void user_stack_pointer_set(struct pt_regs *regs, unsigned long val)
+{
+ regs->sp = val;
+}
+
+static __always_inline bool regs_irqs_disabled(struct pt_regs *regs)
+{
+ return !(regs->flags & X86_EFLAGS_IF);
+}
+
+/* Query offset/name of register from its name/offset */
+extern int regs_query_register_offset(const char *name);
+extern const char *regs_query_register_name(unsigned int offset);
+#define MAX_REG_OFFSET (offsetof(struct pt_regs, ss))
+
+/**
+ * regs_get_register() - get register value from its offset
+ * @regs: pt_regs from which register value is gotten.
+ * @offset: offset number of the register.
+ *
+ * regs_get_register returns the value of a register. The @offset is the
+ * offset of the register within the struct pt_regs pointed to by @regs.
+ * If @offset is bigger than MAX_REG_OFFSET, this returns 0.
*/
-#define arch_has_single_step() (1)
-extern void user_enable_single_step(struct task_struct *);
-extern void user_disable_single_step(struct task_struct *);
+static inline unsigned long regs_get_register(struct pt_regs *regs,
+ unsigned int offset)
+{
+ if (unlikely(offset > MAX_REG_OFFSET))
+ return 0;
+#ifdef CONFIG_X86_32
+ /* The selector fields are 16-bit. */
+ if (offset == offsetof(struct pt_regs, cs) ||
+ offset == offsetof(struct pt_regs, ss) ||
+ offset == offsetof(struct pt_regs, ds) ||
+ offset == offsetof(struct pt_regs, es) ||
+ offset == offsetof(struct pt_regs, fs) ||
+ offset == offsetof(struct pt_regs, gs)) {
+ return *(u16 *)((unsigned long)regs + offset);
+
+ }
+#endif
+ return *(unsigned long *)((unsigned long)regs + offset);
+}
-extern void user_enable_block_step(struct task_struct *);
+/**
+ * regs_within_kernel_stack() - check the address in the stack
+ * @regs: pt_regs which contains kernel stack pointer.
+ * @addr: address which is checked.
+ *
+ * regs_within_kernel_stack() checks @addr is within the kernel stack page(s).
+ * If @addr is within the kernel stack, it returns true. If not, returns false.
+ */
+static inline int regs_within_kernel_stack(struct pt_regs *regs,
+ unsigned long addr)
+{
+ return ((addr & ~(THREAD_SIZE - 1)) == (regs->sp & ~(THREAD_SIZE - 1)));
+}
+
+/**
+ * regs_get_kernel_stack_nth_addr() - get the address of the Nth entry on stack
+ * @regs: pt_regs which contains kernel stack pointer.
+ * @n: stack entry number.
+ *
+ * regs_get_kernel_stack_nth_addr() returns the address of the @n th entry of the
+ * kernel stack which is specified by @regs. If the @n th entry is NOT in
+ * the kernel stack, this returns NULL.
+ */
+static inline unsigned long *regs_get_kernel_stack_nth_addr(struct pt_regs *regs, unsigned int n)
+{
+ unsigned long *addr = (unsigned long *)regs->sp;
+
+ addr += n;
+ if (regs_within_kernel_stack(regs, (unsigned long)addr))
+ return addr;
+ else
+ return NULL;
+}
+
+/* To avoid include hell, we can't include uaccess.h */
+extern long copy_from_kernel_nofault(void *dst, const void *src, size_t size);
+
+/**
+ * regs_get_kernel_stack_nth() - get Nth entry of the stack
+ * @regs: pt_regs which contains kernel stack pointer.
+ * @n: stack entry number.
+ *
+ * regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which
+ * is specified by @regs. If the @n th entry is NOT in the kernel stack
+ * this returns 0.
+ */
+static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
+ unsigned int n)
+{
+ unsigned long *addr;
+ unsigned long val;
+ long ret;
+
+ addr = regs_get_kernel_stack_nth_addr(regs, n);
+ if (addr) {
+ ret = copy_from_kernel_nofault(&val, addr, sizeof(val));
+ if (!ret)
+ return val;
+ }
+ return 0;
+}
+
+/**
+ * regs_get_kernel_argument() - get Nth function argument in kernel
+ * @regs: pt_regs of that context
+ * @n: function argument number (start from 0)
+ *
+ * regs_get_kernel_argument() returns the @n th argument of the function call.
+ * Note that this chooses the most probable assignment; in some cases
+ * it can be incorrect.
+ * This is expected to be called from kprobes or ftrace with regs
+ * where the top of stack is the return address.
+ */
+static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs,
+ unsigned int n)
+{
+ static const unsigned int argument_offs[] = {
+#ifdef __i386__
+ offsetof(struct pt_regs, ax),
+ offsetof(struct pt_regs, dx),
+ offsetof(struct pt_regs, cx),
+#define NR_REG_ARGUMENTS 3
+#else
+ offsetof(struct pt_regs, di),
+ offsetof(struct pt_regs, si),
+ offsetof(struct pt_regs, dx),
+ offsetof(struct pt_regs, cx),
+ offsetof(struct pt_regs, r8),
+ offsetof(struct pt_regs, r9),
+#define NR_REG_ARGUMENTS 6
+#endif
+ };
+
+ if (n >= NR_REG_ARGUMENTS) {
+ n -= NR_REG_ARGUMENTS - 1;
+ return regs_get_kernel_stack_nth(regs, n);
+ } else
+ return regs_get_register(regs, argument_offs[n]);
+}
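
regs_get_kernel_argument() is intended for kprobes/ftrace callers, where the top of the stack holds the return address. A minimal sketch of such a caller; the probed symbol and the handler registration are hypothetical:

#include <linux/kprobes.h>

static int example_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
	/* first register-passed argument of the probed function */
	unsigned long arg0 = regs_get_kernel_argument(regs, 0);

	pr_info("%s: arg0=%#lx\n", p->symbol_name, arg0);
	return 0;
}
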
+
+#define arch_has_single_step() (1)
#ifdef CONFIG_X86_DEBUGCTLMSR
#define arch_has_block_step() (1)
#else
#define arch_has_block_step() (boot_cpu_data.x86 >= 6)
#endif
+#define ARCH_HAS_USER_SINGLE_STEP_REPORT
+
struct user_desc;
extern int do_get_thread_area(struct task_struct *p, int idx,
struct user_desc __user *info);
extern int do_set_thread_area(struct task_struct *p, int idx,
struct user_desc __user *info, int can_allocate);
-#ifdef CONFIG_X86_PTRACE_BTS
-extern void ptrace_bts_untrace(struct task_struct *tsk);
-
-#define arch_ptrace_untrace(tsk) ptrace_bts_untrace(tsk)
-#endif /* CONFIG_X86_PTRACE_BTS */
-
-#endif /* __KERNEL__ */
-
-#endif /* !__ASSEMBLY__ */
+#ifdef CONFIG_X86_64
+# define do_set_thread_area_64(p, s, t) do_arch_prctl_64(p, s, t)
+#else
+# define do_set_thread_area_64(p, s, t) (0)
+#endif
+#endif /* !__ASSEMBLER__ */
#endif /* _ASM_X86_PTRACE_H */
diff --git a/arch/x86/include/asm/purgatory.h b/arch/x86/include/asm/purgatory.h
new file mode 100644
index 000000000000..2fee5e9f1ccc
--- /dev/null
+++ b/arch/x86/include/asm/purgatory.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PURGATORY_H
+#define _ASM_X86_PURGATORY_H
+
+#ifndef __ASSEMBLER__
+#include <linux/purgatory.h>
+
+extern void purgatory(void);
+#endif /* __ASSEMBLER__ */
+
+#endif /* _ASM_PURGATORY_H */
diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h
index 6d93508f2626..b9fece5fc96d 100644
--- a/arch/x86/include/asm/pvclock-abi.h
+++ b/arch/x86/include/asm/pvclock-abi.h
@@ -1,6 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PVCLOCK_ABI_H
#define _ASM_X86_PVCLOCK_ABI_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/*
* These structs MUST NOT be changed.
@@ -29,7 +30,8 @@ struct pvclock_vcpu_time_info {
u64 system_time;
u32 tsc_to_system_mul;
s8 tsc_shift;
- u8 pad[3];
+ u8 flags;
+ u8 pad[2];
} __attribute__((__packed__)); /* 32 bytes */
struct pvclock_wall_clock {
@@ -38,5 +40,9 @@ struct pvclock_wall_clock {
u32 nsec;
} __attribute__((__packed__));
-#endif /* __ASSEMBLY__ */
+#define PVCLOCK_TSC_STABLE_BIT (1 << 0)
+#define PVCLOCK_GUEST_STOPPED (1 << 1)
+/* PVCLOCK_COUNTS_FROM_ZERO broke ABI and can't be used anymore. */
+#define PVCLOCK_COUNTS_FROM_ZERO (1 << 2)
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_X86_PVCLOCK_ABI_H */
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 53235fd5f8ce..6e4f8fae3ce9 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -1,14 +1,108 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PVCLOCK_H
#define _ASM_X86_PVCLOCK_H
-#include <linux/clocksource.h>
+#include <asm/clocksource.h>
#include <asm/pvclock-abi.h>
+struct timespec64;
/* some helper functions for xen and kvm pv clock sources */
-cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
+u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
+u64 pvclock_clocksource_read_nowd(struct pvclock_vcpu_time_info *src);
+u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src);
+void pvclock_set_flags(u8 flags);
unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
struct pvclock_vcpu_time_info *vcpu,
- struct timespec *ts);
+ struct timespec64 *ts);
+void pvclock_resume(void);
+
+void pvclock_touch_watchdogs(void);
+
+static __always_inline
+unsigned pvclock_read_begin(const struct pvclock_vcpu_time_info *src)
+{
+ unsigned version = src->version & ~1;
+ /* Make sure that the version is read before the data. */
+ virt_rmb();
+ return version;
+}
+
+static __always_inline
+bool pvclock_read_retry(const struct pvclock_vcpu_time_info *src,
+ unsigned version)
+{
+ /* Make sure that the version is re-read after the data. */
+ virt_rmb();
+ return unlikely(version != src->version);
+}
+
+/*
+ * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
+ * yielding a 64-bit result.
+ */
+static __always_inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
+{
+ u64 product;
+#ifdef __i386__
+ u32 tmp1, tmp2;
+#else
+ ulong tmp;
+#endif
+
+ if (shift < 0)
+ delta >>= -shift;
+ else
+ delta <<= shift;
+
+#ifdef __i386__
+ __asm__ (
+ "mul %5 ; "
+ "mov %4,%%eax ; "
+ "mov %%edx,%4 ; "
+ "mul %5 ; "
+ "xor %5,%5 ; "
+ "add %4,%%eax ; "
+ "adc %5,%%edx ; "
+ : "=A" (product), "=r" (tmp1), "=r" (tmp2)
+ : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
+#elif defined(__x86_64__)
+ __asm__ (
+ "mulq %[mul_frac] ; shrd $32, %[hi], %[lo]"
+ : [lo]"=a"(product),
+ [hi]"=d"(tmp)
+ : "0"(delta),
+ [mul_frac]"rm"((u64)mul_frac));
+#else
+#error implement me!
+#endif
+
+ return product;
+}
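
The inline assembly above computes ((delta, adjusted by shift) * mul_frac) >> 32 without truncating the intermediate product. A portable reference of the same arithmetic, for illustration only (it uses GCC's unsigned __int128 and is therefore not suitable for the 32-bit build):

static inline u64 pvclock_scale_delta_ref(u64 delta, u32 mul_frac, int shift)
{
	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;

	/* widen, multiply by the 32.32 fixed-point fraction, keep the high part */
	return (u64)(((unsigned __int128)delta * mul_frac) >> 32);
}
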
+
+static __always_inline
+u64 __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, u64 tsc)
+{
+ u64 delta = tsc - src->tsc_timestamp;
+ u64 offset = pvclock_scale_delta(delta, src->tsc_to_system_mul,
+ src->tsc_shift);
+ return src->system_time + offset;
+}
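
pvclock_read_begin()/pvclock_read_retry() and __pvclock_read_cycles() together implement a seqlock-style read of the shared time info. A simplified sketch of the read loop (pvclock_clocksource_read() in the kernel additionally handles the TSC-stable flag); rdtsc_ordered() is assumed to come from <asm/msr.h>:

static u64 example_pvclock_read_ns(const struct pvclock_vcpu_time_info *src)
{
	unsigned version;
	u64 ns;

	do {
		version = pvclock_read_begin(src);	/* even version, then rmb */
		ns = __pvclock_read_cycles(src, rdtsc_ordered());
	} while (pvclock_read_retry(src, version));	/* host updated it meanwhile? */

	return ns;
}
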
+
+struct pvclock_vsyscall_time_info {
+ struct pvclock_vcpu_time_info pvti;
+} __attribute__((__aligned__(SMP_CACHE_BYTES)));
+
+#define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
+
+#ifdef CONFIG_PARAVIRT_CLOCK
+void pvclock_set_pvti_cpu0_va(struct pvclock_vsyscall_time_info *pvti);
+struct pvclock_vsyscall_time_info *pvclock_get_pvti_cpu0_va(void);
+#else
+static inline struct pvclock_vsyscall_time_info *pvclock_get_pvti_cpu0_va(void)
+{
+ return NULL;
+}
+#endif
#endif /* _ASM_X86_PVCLOCK_H */
diff --git a/arch/x86/include/asm/qrwlock.h b/arch/x86/include/asm/qrwlock.h
new file mode 100644
index 000000000000..8656b5a6e8e7
--- /dev/null
+++ b/arch/x86/include/asm/qrwlock.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_QRWLOCK_H
+#define _ASM_X86_QRWLOCK_H
+
+#include <asm-generic/qrwlock_types.h>
+#include <asm-generic/qrwlock.h>
+
+#endif /* _ASM_X86_QRWLOCK_H */
diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
new file mode 100644
index 000000000000..68da67df304d
--- /dev/null
+++ b/arch/x86/include/asm/qspinlock.h
@@ -0,0 +1,116 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_QSPINLOCK_H
+#define _ASM_X86_QSPINLOCK_H
+
+#include <linux/jump_label.h>
+#include <asm/cpufeature.h>
+#include <asm-generic/qspinlock_types.h>
+#include <asm/paravirt.h>
+#include <asm/rmwcc.h>
+
+#define _Q_PENDING_LOOPS (1 << 9)
+
+#define queued_fetch_set_pending_acquire queued_fetch_set_pending_acquire
+static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock)
+{
+ u32 val;
+
+ /*
+ * We can't use GEN_BINARY_RMWcc() inside an if() stmt because asm goto
+ * and CONFIG_PROFILE_ALL_BRANCHES=y results in a label inside a
+ * statement expression, which GCC doesn't like.
+ */
+ val = GEN_BINARY_RMWcc(LOCK_PREFIX "btsl", lock->val.counter, c,
+ "I", _Q_PENDING_OFFSET) * _Q_PENDING_VAL;
+ val |= atomic_read(&lock->val) & ~_Q_PENDING_MASK;
+
+ return val;
+}
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __pv_init_lock_hash(void);
+extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __raw_callee_save___pv_queued_spin_unlock(struct qspinlock *lock);
+extern bool nopvspin;
+
+#define queued_spin_unlock queued_spin_unlock
+/**
+ * queued_spin_unlock - release a queued spinlock
+ * @lock : Pointer to queued spinlock structure
+ *
+ * A smp_store_release() on the least-significant byte.
+ */
+static inline void native_queued_spin_unlock(struct qspinlock *lock)
+{
+ smp_store_release(&lock->locked, 0);
+}
+
+static inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+{
+ pv_queued_spin_lock_slowpath(lock, val);
+}
+
+static inline void queued_spin_unlock(struct qspinlock *lock)
+{
+ kcsan_release();
+ pv_queued_spin_unlock(lock);
+}
+
+#define vcpu_is_preempted vcpu_is_preempted
+static inline bool vcpu_is_preempted(long cpu)
+{
+ return pv_vcpu_is_preempted(cpu);
+}
+#endif
+
+#ifdef CONFIG_PARAVIRT
+/*
+ * virt_spin_lock_key - disables by default the virt_spin_lock() hijack.
+ *
+ * Native (and PV wanting native due to vCPU pinning) should keep this key
+ * disabled. Native does not touch the key.
+ *
+ * When in a guest then native_pv_lock_init() enables the key first and
+ * KVM/XEN might conditionally disable it later in the boot process again.
+ */
+DECLARE_STATIC_KEY_FALSE(virt_spin_lock_key);
+
+/*
+ * Shortcut for the queued_spin_lock_slowpath() function that allows
+ * virt to hijack it.
+ *
+ * Returns:
+ * true - lock has been negotiated, all done;
+ * false - queued_spin_lock_slowpath() will do its thing.
+ */
+#define virt_spin_lock virt_spin_lock
+static inline bool virt_spin_lock(struct qspinlock *lock)
+{
+ int val;
+
+ if (!static_branch_likely(&virt_spin_lock_key))
+ return false;
+
+ /*
+ * On hypervisors without PARAVIRT_SPINLOCKS support we fall
+ * back to a Test-and-Set spinlock, because fair locks have
+ * horrible lock 'holder' preemption issues.
+ */
+
+ __retry:
+ val = atomic_read(&lock->val);
+
+ if (val || !atomic_try_cmpxchg(&lock->val, &val, _Q_LOCKED_VAL)) {
+ cpu_relax();
+ goto __retry;
+ }
+
+ return true;
+}
+
+#endif /* CONFIG_PARAVIRT */
+
+#include <asm-generic/qspinlock.h>
+
+#endif /* _ASM_X86_QSPINLOCK_H */
diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h
new file mode 100644
index 000000000000..0a985784be9b
--- /dev/null
+++ b/arch/x86/include/asm/qspinlock_paravirt.h
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_QSPINLOCK_PARAVIRT_H
+#define __ASM_QSPINLOCK_PARAVIRT_H
+
+#include <asm/ibt.h>
+
+void __lockfunc __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked);
+
+/*
+ * For x86-64, PV_CALLEE_SAVE_REGS_THUNK() saves and restores 8 64-bit
+ * registers. For i386, however, only 1 32-bit register needs to be saved
+ * and restored. So an optimized version of __pv_queued_spin_unlock() is
+ * hand-coded for 64-bit, but it isn't worthwhile to do it for 32-bit.
+ */
+#ifdef CONFIG_64BIT
+
+__PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text");
+#define __pv_queued_spin_unlock __pv_queued_spin_unlock
+
+/*
+ * Optimized assembly version of __raw_callee_save___pv_queued_spin_unlock
+ * which combines the register-saving thunk and the body of the following
+ * C code. Note that it puts the code in the .spinlock.text section which
+ * is equivalent to adding __lockfunc in the C code:
+ *
+ * void __lockfunc __pv_queued_spin_unlock(struct qspinlock *lock)
+ * {
+ * u8 lockval = _Q_LOCKED_VAL;
+ *
+ * if (try_cmpxchg(&lock->locked, &lockval, 0))
+ * return;
+ * pv_queued_spin_unlock_slowpath(lock, lockval);
+ * }
+ *
+ * For x86-64,
+ * rdi = lock (first argument)
+ * rsi = lockval (second argument)
+ * rdx = internal variable (set to 0)
+ */
+#define PV_UNLOCK_ASM \
+ FRAME_BEGIN \
+ "push %rdx\n\t" \
+ "mov $" __stringify(_Q_LOCKED_VAL) ",%eax\n\t" \
+ "xor %edx,%edx\n\t" \
+ LOCK_PREFIX "cmpxchg %dl,(%rdi)\n\t" \
+ "jne .slowpath\n\t" \
+ "pop %rdx\n\t" \
+ FRAME_END \
+ ASM_RET \
+ ".slowpath:\n\t" \
+ "push %rsi\n\t" \
+ "movzbl %al,%esi\n\t" \
+ "call __raw_callee_save___pv_queued_spin_unlock_slowpath\n\t" \
+ "pop %rsi\n\t" \
+ "pop %rdx\n\t" \
+ FRAME_END
+
+DEFINE_ASM_FUNC(__raw_callee_save___pv_queued_spin_unlock,
+ PV_UNLOCK_ASM, .spinlock.text);
+
+#else /* CONFIG_64BIT */
+
+extern void __lockfunc __pv_queued_spin_unlock(struct qspinlock *lock);
+__PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock, ".spinlock.text");
+
+#endif /* CONFIG_64BIT */
+#endif
diff --git a/arch/x86/include/asm/rdc321x_defs.h b/arch/x86/include/asm/rdc321x_defs.h
deleted file mode 100644
index c8e9c8bed3d0..000000000000
--- a/arch/x86/include/asm/rdc321x_defs.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#define PFX "rdc321x: "
-
-/* General purpose configuration and data registers */
-#define RDC3210_CFGREG_ADDR 0x0CF8
-#define RDC3210_CFGREG_DATA 0x0CFC
-
-#define RDC321X_GPIO_CTRL_REG1 0x48
-#define RDC321X_GPIO_CTRL_REG2 0x84
-#define RDC321X_GPIO_DATA_REG1 0x4c
-#define RDC321X_GPIO_DATA_REG2 0x88
-
-#define RDC321X_MAX_GPIO 58
diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
new file mode 100644
index 000000000000..e406a1e92c63
--- /dev/null
+++ b/arch/x86/include/asm/realmode.h
@@ -0,0 +1,100 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ARCH_X86_REALMODE_H
+#define _ARCH_X86_REALMODE_H
+
+/*
+ * Flag bit definitions for use with the flags field of the trampoline header
+ * in the CONFIG_X86_64 variant.
+ */
+#define TH_FLAGS_SME_ACTIVE_BIT 0
+#define TH_FLAGS_SME_ACTIVE BIT(TH_FLAGS_SME_ACTIVE_BIT)
+
+#ifndef __ASSEMBLER__
+
+#include <linux/types.h>
+#include <asm/io.h>
+
+/* This must match data at realmode/rm/header.S */
+struct real_mode_header {
+ u32 text_start;
+ u32 ro_end;
+ /* SMP trampoline */
+ u32 trampoline_start;
+ u32 trampoline_header;
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ u32 sev_es_trampoline_start;
+#endif
+#ifdef CONFIG_X86_64
+ u32 trampoline_start64;
+ u32 trampoline_pgd;
+#endif
+ /* ACPI S3 wakeup */
+#ifdef CONFIG_ACPI_SLEEP
+ u32 wakeup_start;
+ u32 wakeup_header;
+#endif
+ /* APM/BIOS reboot */
+ u32 machine_real_restart_asm;
+#ifdef CONFIG_X86_64
+ u32 machine_real_restart_seg;
+#endif
+};
+
+/* This must match data at realmode/rm/trampoline_{32,64}.S */
+struct trampoline_header {
+#ifdef CONFIG_X86_32
+ u32 start;
+ u16 gdt_pad;
+ u16 gdt_limit;
+ u32 gdt_base;
+#else
+ u64 start;
+ u64 efer;
+ u32 cr4;
+ u32 flags;
+ u32 lock;
+#endif
+};
+
+extern struct real_mode_header *real_mode_header;
+extern unsigned char real_mode_blob_end[];
+
+extern unsigned long initial_code;
+extern unsigned long initial_stack;
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+extern unsigned long initial_vc_handler;
+#endif
+
+extern u32 *trampoline_lock;
+
+extern unsigned char real_mode_blob[];
+extern unsigned char real_mode_relocs[];
+
+#ifdef CONFIG_X86_32
+extern unsigned char startup_32_smp[];
+extern unsigned char boot_gdt[];
+#else
+extern unsigned char secondary_startup_64[];
+extern unsigned char secondary_startup_64_no_verify[];
+#endif
+
+static __always_inline size_t real_mode_size_needed(void)
+{
+ if (real_mode_header)
+ return 0; /* already allocated. */
+
+ return ALIGN(real_mode_blob_end - real_mode_blob, PAGE_SIZE);
+}
+
+static inline void set_real_mode_mem(phys_addr_t mem)
+{
+ real_mode_header = (struct real_mode_header *) __va(mem);
+}
+
+void reserve_real_mode(void);
+void load_trampoline_pgtable(void);
+void init_real_mode(void);
+
+#endif /* __ASSEMBLER__ */
+
+#endif /* _ARCH_X86_REALMODE_H */
diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h
index 562d4fd31ba8..ecd58ea9a837 100644
--- a/arch/x86/include/asm/reboot.h
+++ b/arch/x86/include/asm/reboot.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_REBOOT_H
#define _ASM_X86_REBOOT_H
@@ -15,12 +16,28 @@ struct machine_ops {
};
extern struct machine_ops machine_ops;
+extern int crashing_cpu;
void native_machine_crash_shutdown(struct pt_regs *regs);
void native_machine_shutdown(void);
-void machine_real_restart(const unsigned char *code, int length);
+void __noreturn machine_real_restart(unsigned int type);
+/* These must match dispatch in arch/x86/realmode/rm/reboot.S */
+#define MRR_BIOS 0
+#define MRR_APM 1
-typedef void (*nmi_shootdown_cb)(int, struct die_args*);
+typedef void (cpu_emergency_virt_cb)(void);
+#if IS_ENABLED(CONFIG_KVM_X86)
+void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback);
+void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback);
+void cpu_emergency_disable_virtualization(void);
+#else
+static inline void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback) {}
+static inline void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback) {}
+static inline void cpu_emergency_disable_virtualization(void) {}
+#endif /* CONFIG_KVM_X86 */
+
+typedef void (*nmi_shootdown_cb)(int, struct pt_regs*);
void nmi_shootdown_cpus(nmi_shootdown_cb callback);
+void run_crash_ipi_callback(struct pt_regs *regs);
#endif /* _ASM_X86_REBOOT_H */
diff --git a/arch/x86/include/asm/reboot_fixups.h b/arch/x86/include/asm/reboot_fixups.h
index 765debe4c54c..96515658ce12 100644
--- a/arch/x86/include/asm/reboot_fixups.h
+++ b/arch/x86/include/asm/reboot_fixups.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_REBOOT_FIXUPS_H
#define _ASM_X86_REBOOT_FIXUPS_H
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
deleted file mode 100644
index 64cf2d24fad1..000000000000
--- a/arch/x86/include/asm/required-features.h
+++ /dev/null
@@ -1,88 +0,0 @@
-#ifndef _ASM_X86_REQUIRED_FEATURES_H
-#define _ASM_X86_REQUIRED_FEATURES_H
-
-/* Define minimum CPUID feature set for kernel These bits are checked
- really early to actually display a visible error message before the
- kernel dies. Make sure to assign features to the proper mask!
-
- Some requirements that are not in CPUID yet are also in the
- CONFIG_X86_MINIMUM_CPU_FAMILY which is checked too.
-
- The real information is in arch/x86/Kconfig.cpu, this just converts
- the CONFIGs into a bitmask */
-
-#ifndef CONFIG_MATH_EMULATION
-# define NEED_FPU (1<<(X86_FEATURE_FPU & 31))
-#else
-# define NEED_FPU 0
-#endif
-
-#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
-# define NEED_PAE (1<<(X86_FEATURE_PAE & 31))
-#else
-# define NEED_PAE 0
-#endif
-
-#ifdef CONFIG_X86_CMPXCHG64
-# define NEED_CX8 (1<<(X86_FEATURE_CX8 & 31))
-#else
-# define NEED_CX8 0
-#endif
-
-#if defined(CONFIG_X86_CMOV) || defined(CONFIG_X86_64)
-# define NEED_CMOV (1<<(X86_FEATURE_CMOV & 31))
-#else
-# define NEED_CMOV 0
-#endif
-
-#ifdef CONFIG_X86_USE_3DNOW
-# define NEED_3DNOW (1<<(X86_FEATURE_3DNOW & 31))
-#else
-# define NEED_3DNOW 0
-#endif
-
-#if defined(CONFIG_X86_P6_NOP) || defined(CONFIG_X86_64)
-# define NEED_NOPL (1<<(X86_FEATURE_NOPL & 31))
-#else
-# define NEED_NOPL 0
-#endif
-
-#ifdef CONFIG_X86_64
-#ifdef CONFIG_PARAVIRT
-/* Paravirtualized systems may not have PSE or PGE available */
-#define NEED_PSE 0
-#define NEED_PGE 0
-#else
-#define NEED_PSE (1<<(X86_FEATURE_PSE) & 31)
-#define NEED_PGE (1<<(X86_FEATURE_PGE) & 31)
-#endif
-#define NEED_MSR (1<<(X86_FEATURE_MSR & 31))
-#define NEED_FXSR (1<<(X86_FEATURE_FXSR & 31))
-#define NEED_XMM (1<<(X86_FEATURE_XMM & 31))
-#define NEED_XMM2 (1<<(X86_FEATURE_XMM2 & 31))
-#define NEED_LM (1<<(X86_FEATURE_LM & 31))
-#else
-#define NEED_PSE 0
-#define NEED_MSR 0
-#define NEED_PGE 0
-#define NEED_FXSR 0
-#define NEED_XMM 0
-#define NEED_XMM2 0
-#define NEED_LM 0
-#endif
-
-#define REQUIRED_MASK0 (NEED_FPU|NEED_PSE|NEED_MSR|NEED_PAE|\
- NEED_CX8|NEED_PGE|NEED_FXSR|NEED_CMOV|\
- NEED_XMM|NEED_XMM2)
-#define SSE_MASK (NEED_XMM|NEED_XMM2)
-
-#define REQUIRED_MASK1 (NEED_LM|NEED_3DNOW)
-
-#define REQUIRED_MASK2 0
-#define REQUIRED_MASK3 (NEED_NOPL)
-#define REQUIRED_MASK4 0
-#define REQUIRED_MASK5 0
-#define REQUIRED_MASK6 0
-#define REQUIRED_MASK7 0
-
-#endif /* _ASM_X86_REQUIRED_FEATURES_H */
diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h
new file mode 100644
index 000000000000..575f8408a9e7
--- /dev/null
+++ b/arch/x86/include/asm/resctrl.h
@@ -0,0 +1,203 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_RESCTRL_H
+#define _ASM_X86_RESCTRL_H
+
+#ifdef CONFIG_X86_CPU_RESCTRL
+
+#include <linux/jump_label.h>
+#include <linux/percpu.h>
+#include <linux/resctrl_types.h>
+#include <linux/sched.h>
+
+#include <asm/msr.h>
+
+/*
+ * This value can never be a valid CLOSID, and is used when mapping a
+ * (closid, rmid) pair to an index and back. On x86 only the RMID is
+ * needed. The index is a software defined value.
+ */
+#define X86_RESCTRL_EMPTY_CLOSID ((u32)~0)
+
+/**
+ * struct resctrl_pqr_state - State cache for the PQR MSR
+ * @cur_rmid: The cached Resource Monitoring ID
+ * @cur_closid: The cached Class Of Service ID
+ * @default_rmid: The user assigned Resource Monitoring ID
+ * @default_closid: The user assigned cached Class Of Service ID
+ *
+ * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the
+ * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always
+ * contains both parts, so we need to cache them. This also
+ * stores the user configured per cpu CLOSID and RMID.
+ *
+ * The cache also helps to avoid pointless updates if the value does
+ * not change.
+ */
+struct resctrl_pqr_state {
+ u32 cur_rmid;
+ u32 cur_closid;
+ u32 default_rmid;
+ u32 default_closid;
+};
+
+DECLARE_PER_CPU(struct resctrl_pqr_state, pqr_state);
+
+extern bool rdt_alloc_capable;
+extern bool rdt_mon_capable;
+
+DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
+DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
+DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key);
+
+static inline bool resctrl_arch_alloc_capable(void)
+{
+ return rdt_alloc_capable;
+}
+
+static inline void resctrl_arch_enable_alloc(void)
+{
+ static_branch_enable_cpuslocked(&rdt_alloc_enable_key);
+ static_branch_inc_cpuslocked(&rdt_enable_key);
+}
+
+static inline void resctrl_arch_disable_alloc(void)
+{
+ static_branch_disable_cpuslocked(&rdt_alloc_enable_key);
+ static_branch_dec_cpuslocked(&rdt_enable_key);
+}
+
+static inline bool resctrl_arch_mon_capable(void)
+{
+ return rdt_mon_capable;
+}
+
+static inline void resctrl_arch_enable_mon(void)
+{
+ static_branch_enable_cpuslocked(&rdt_mon_enable_key);
+ static_branch_inc_cpuslocked(&rdt_enable_key);
+}
+
+static inline void resctrl_arch_disable_mon(void)
+{
+ static_branch_disable_cpuslocked(&rdt_mon_enable_key);
+ static_branch_dec_cpuslocked(&rdt_enable_key);
+}
+
+/*
+ * __resctrl_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR
+ *
+ * Following considerations are made so that this has minimal impact
+ * on scheduler hot path:
+ * - This will stay as no-op unless we are running on an Intel SKU
+ * which supports resource control or monitoring and we enable by
+ * mounting the resctrl file system.
+ * - Caches the per cpu CLOSid/RMID values and does the MSR write only
+ * when a task with a different CLOSid/RMID is scheduled in.
+ * - We allocate RMIDs/CLOSids globally in order to keep this as
+ * simple as possible.
+ * Must be called with preemption disabled.
+ */
+static inline void __resctrl_sched_in(struct task_struct *tsk)
+{
+ struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state);
+ u32 closid = READ_ONCE(state->default_closid);
+ u32 rmid = READ_ONCE(state->default_rmid);
+ u32 tmp;
+
+ /*
+ * If this task has a closid/rmid assigned, use it.
+ * Else use the closid/rmid assigned to this cpu.
+ */
+ if (static_branch_likely(&rdt_alloc_enable_key)) {
+ tmp = READ_ONCE(tsk->closid);
+ if (tmp)
+ closid = tmp;
+ }
+
+ if (static_branch_likely(&rdt_mon_enable_key)) {
+ tmp = READ_ONCE(tsk->rmid);
+ if (tmp)
+ rmid = tmp;
+ }
+
+ if (closid != state->cur_closid || rmid != state->cur_rmid) {
+ state->cur_closid = closid;
+ state->cur_rmid = rmid;
+ wrmsr(MSR_IA32_PQR_ASSOC, rmid, closid);
+ }
+}
+
+static inline unsigned int resctrl_arch_round_mon_val(unsigned int val)
+{
+ unsigned int scale = boot_cpu_data.x86_cache_occ_scale;
+
+ /* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */
+ val /= scale;
+ return val * scale;
+}
+
+static inline void resctrl_arch_set_cpu_default_closid_rmid(int cpu, u32 closid,
+ u32 rmid)
+{
+ WRITE_ONCE(per_cpu(pqr_state.default_closid, cpu), closid);
+ WRITE_ONCE(per_cpu(pqr_state.default_rmid, cpu), rmid);
+}
+
+static inline void resctrl_arch_set_closid_rmid(struct task_struct *tsk,
+ u32 closid, u32 rmid)
+{
+ WRITE_ONCE(tsk->closid, closid);
+ WRITE_ONCE(tsk->rmid, rmid);
+}
+
+static inline bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid)
+{
+ return READ_ONCE(tsk->closid) == closid;
+}
+
+static inline bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 ignored,
+ u32 rmid)
+{
+ return READ_ONCE(tsk->rmid) == rmid;
+}
+
+static inline void resctrl_arch_sched_in(struct task_struct *tsk)
+{
+ if (static_branch_likely(&rdt_enable_key))
+ __resctrl_sched_in(tsk);
+}
+
+static inline void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid)
+{
+ *rmid = idx;
+ *closid = X86_RESCTRL_EMPTY_CLOSID;
+}
+
+static inline u32 resctrl_arch_rmid_idx_encode(u32 ignored, u32 rmid)
+{
+ return rmid;
+}
+
+/* x86 can always read an rmid, nothing needs allocating */
+struct rdt_resource;
+static inline void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r,
+ enum resctrl_event_id evtid)
+{
+ might_sleep();
+ return NULL;
+}
+
+static inline void resctrl_arch_mon_ctx_free(struct rdt_resource *r,
+ enum resctrl_event_id evtid,
+ void *ctx) { }
+
+void resctrl_cpu_detect(struct cpuinfo_x86 *c);
+
+#else
+
+static inline void resctrl_arch_sched_in(struct task_struct *tsk) {}
+static inline void resctrl_cpu_detect(struct cpuinfo_x86 *c) {}
+
+#endif /* CONFIG_X86_CPU_RESCTRL */
+
+#endif /* _ASM_X86_RESCTRL_H */
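A sketch of the expected call site for resctrl_arch_sched_in(); the wrapper below is illustrative, the real caller being the context-switch path, where preemption is already disabled:

	/* Sketch; the surrounding helper is illustrative. */
	static inline void example_switch_to_extra(struct task_struct *next)
	{
		/* Writes MSR_IA32_PQR_ASSOC only if the CLOSID/RMID change. */
		resctrl_arch_sched_in(next);
	}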
diff --git a/arch/x86/include/asm/resource.h b/arch/x86/include/asm/resource.h
deleted file mode 100644
index 04bc4db8921b..000000000000
--- a/arch/x86/include/asm/resource.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/resource.h>
diff --git a/arch/x86/include/asm/rio.h b/arch/x86/include/asm/rio.h
deleted file mode 100644
index 97bab6388a92..000000000000
--- a/arch/x86/include/asm/rio.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Derived from include/asm-x86/mach-summit/mach_mpparse.h
- * and include/asm-x86/mach-default/bios_ebda.h
- *
- * Author: Laurent Vivier <Laurent.Vivier@bull.net>
- */
-
-#ifndef _ASM_X86_RIO_H
-#define _ASM_X86_RIO_H
-
-#define RIO_TABLE_VERSION 3
-
-struct rio_table_hdr {
- u8 version; /* Version number of this data structure */
- u8 num_scal_dev; /* # of Scalability devices */
- u8 num_rio_dev; /* # of RIO I/O devices */
-} __attribute__((packed));
-
-struct scal_detail {
- u8 node_id; /* Scalability Node ID */
- u32 CBAR; /* Address of 1MB register space */
- u8 port0node; /* Node ID port connected to: 0xFF=None */
- u8 port0port; /* Port num port connected to: 0,1,2, or */
- /* 0xFF=None */
- u8 port1node; /* Node ID port connected to: 0xFF = None */
- u8 port1port; /* Port num port connected to: 0,1,2, or */
- /* 0xFF=None */
- u8 port2node; /* Node ID port connected to: 0xFF = None */
- u8 port2port; /* Port num port connected to: 0,1,2, or */
- /* 0xFF=None */
- u8 chassis_num; /* 1 based Chassis number (1 = boot node) */
-} __attribute__((packed));
-
-struct rio_detail {
- u8 node_id; /* RIO Node ID */
- u32 BBAR; /* Address of 1MB register space */
- u8 type; /* Type of device */
- u8 owner_id; /* Node ID of Hurricane that owns this */
- /* node */
- u8 port0node; /* Node ID port connected to: 0xFF=None */
- u8 port0port; /* Port num port connected to: 0,1,2, or */
- /* 0xFF=None */
- u8 port1node; /* Node ID port connected to: 0xFF=None */
- u8 port1port; /* Port num port connected to: 0,1,2, or */
- /* 0xFF=None */
- u8 first_slot; /* Lowest slot number below this Calgary */
- u8 status; /* Bit 0 = 1 : the XAPIC is used */
- /* = 0 : the XAPIC is not used, ie: */
- /* ints fwded to another XAPIC */
- /* Bits1:7 Reserved */
- u8 WP_index; /* instance index - lower ones have */
- /* lower slot numbers/PCI bus numbers */
- u8 chassis_num; /* 1 based Chassis number */
-} __attribute__((packed));
-
-enum {
- HURR_SCALABILTY = 0, /* Hurricane Scalability info */
- HURR_RIOIB = 2, /* Hurricane RIOIB info */
- COMPAT_CALGARY = 4, /* Compatibility Calgary */
- ALT_CALGARY = 5, /* Second Planar Calgary */
-};
-
-#endif /* _ASM_X86_RIO_H */
diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h
new file mode 100644
index 000000000000..54c8fc430684
--- /dev/null
+++ b/arch/x86/include/asm/rmwcc.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_RMWcc
+#define _ASM_X86_RMWcc
+
+#include <linux/args.h>
+
+#define __CLOBBERS_MEM(clb...) "memory", ## clb
+
+#define __GEN_RMWcc(fullop, _var, cc, clobbers, ...) \
+({ \
+ bool c; \
+ asm_inline volatile (fullop \
+ : [var] "+m" (_var), "=@cc" #cc (c) \
+ : __VA_ARGS__ : clobbers); \
+ c; \
+})
+
+#define GEN_UNARY_RMWcc_4(op, var, cc, arg0) \
+ __GEN_RMWcc(op " " arg0, var, cc, __CLOBBERS_MEM())
+
+#define GEN_UNARY_RMWcc_3(op, var, cc) \
+ GEN_UNARY_RMWcc_4(op, var, cc, "%[var]")
+
+#define GEN_UNARY_RMWcc(X...) CONCATENATE(GEN_UNARY_RMWcc_, COUNT_ARGS(X))(X)
+
+#define GEN_BINARY_RMWcc_6(op, var, cc, vcon, _val, arg0) \
+ __GEN_RMWcc(op " %[val], " arg0, var, cc, \
+ __CLOBBERS_MEM(), [val] vcon (_val))
+
+#define GEN_BINARY_RMWcc_5(op, var, cc, vcon, val) \
+ GEN_BINARY_RMWcc_6(op, var, cc, vcon, val, "%[var]")
+
+#define GEN_BINARY_RMWcc(X...) CONCATENATE(GEN_BINARY_RMWcc_, COUNT_ARGS(X))(X)
+
+#define GEN_UNARY_SUFFIXED_RMWcc(op, suffix, var, cc, clobbers...) \
+ __GEN_RMWcc(op " %[var]\n\t" suffix, var, cc, \
+ __CLOBBERS_MEM(clobbers))
+
+#define GEN_BINARY_SUFFIXED_RMWcc(op, suffix, var, cc, vcon, _val, clobbers...)\
+ __GEN_RMWcc(op " %[val], %[var]\n\t" suffix, var, cc, \
+ __CLOBBERS_MEM(clobbers), [val] vcon (_val))
+
+#endif /* _ASM_X86_RMWcc */
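A sketch of a typical GEN_BINARY_RMWcc() caller: the macro emits one locked read-modify-write instruction and returns the requested condition flag as a bool (here "e" selects ZF, and the "er" constraint allows an immediate or register operand):

	/* Sketch of a typical caller, mirroring the atomic helpers. */
	static __always_inline bool example_sub_and_test(int i, atomic_t *v)
	{
		return GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, e, "er", i);
	}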
diff --git a/arch/x86/include/asm/rqspinlock.h b/arch/x86/include/asm/rqspinlock.h
new file mode 100644
index 000000000000..24a885449ee6
--- /dev/null
+++ b/arch/x86/include/asm/rqspinlock.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_RQSPINLOCK_H
+#define _ASM_X86_RQSPINLOCK_H
+
+#include <asm/paravirt.h>
+
+#ifdef CONFIG_PARAVIRT
+DECLARE_STATIC_KEY_FALSE(virt_spin_lock_key);
+
+#define resilient_virt_spin_lock_enabled resilient_virt_spin_lock_enabled
+static __always_inline bool resilient_virt_spin_lock_enabled(void)
+{
+ return static_branch_likely(&virt_spin_lock_key);
+}
+
+#ifdef CONFIG_QUEUED_SPINLOCKS
+typedef struct qspinlock rqspinlock_t;
+#else
+typedef struct rqspinlock rqspinlock_t;
+#endif
+extern int resilient_tas_spin_lock(rqspinlock_t *lock);
+
+#define resilient_virt_spin_lock resilient_virt_spin_lock
+static inline int resilient_virt_spin_lock(rqspinlock_t *lock)
+{
+ return resilient_tas_spin_lock(lock);
+}
+
+#endif /* CONFIG_PARAVIRT */
+
+#include <asm-generic/rqspinlock.h>
+
+#endif /* _ASM_X86_RQSPINLOCK_H */
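A sketch of how a resilient-spinlock slow path is expected to consult these hooks on paravirtualized guests; the queued fallback is elided:

	/* Sketch; the queued (MCS) slow path is elided. */
	static int example_res_spin_lock(rqspinlock_t *lock)
	{
		if (resilient_virt_spin_lock_enabled())
			return resilient_virt_spin_lock(lock);
		/* ... otherwise take the queued slow path ... */
		return 0;
	}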
diff --git a/arch/x86/include/asm/rtc.h b/arch/x86/include/asm/rtc.h
deleted file mode 100644
index f71c3b0ed360..000000000000
--- a/arch/x86/include/asm/rtc.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/rtc.h>
diff --git a/arch/x86/include/asm/runtime-const.h b/arch/x86/include/asm/runtime-const.h
new file mode 100644
index 000000000000..e5a13dc8816e
--- /dev/null
+++ b/arch/x86/include/asm/runtime-const.h
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_RUNTIME_CONST_H
+#define _ASM_RUNTIME_CONST_H
+
+#ifdef MODULE
+ #error "Cannot use runtime-const infrastructure from modules"
+#endif
+
+#ifdef __ASSEMBLY__
+
+.macro RUNTIME_CONST_PTR sym reg
+ movq $0x0123456789abcdef, %\reg
+ 1:
+ .pushsection runtime_ptr_\sym, "a"
+ .long 1b - 8 - .
+ .popsection
+.endm
+
+#else /* __ASSEMBLY__ */
+
+#define runtime_const_ptr(sym) ({ \
+ typeof(sym) __ret; \
+ asm_inline("mov %1,%0\n1:\n" \
+ ".pushsection runtime_ptr_" #sym ",\"a\"\n\t" \
+ ".long 1b - %c2 - .\n" \
+ ".popsection" \
+ :"=r" (__ret) \
+ :"i" ((unsigned long)0x0123456789abcdefull), \
+ "i" (sizeof(long))); \
+ __ret; })
+
+// The 'typeof' will create at _least_ a 32-bit type, but
+// will happily also take a bigger type and the 'shrl' will
+// clear the upper bits
+#define runtime_const_shift_right_32(val, sym) ({ \
+ typeof(0u+(val)) __ret = (val); \
+ asm_inline("shrl $12,%k0\n1:\n" \
+ ".pushsection runtime_shift_" #sym ",\"a\"\n\t" \
+ ".long 1b - 1 - .\n" \
+ ".popsection" \
+ :"+r" (__ret)); \
+ __ret; })
+
+#define runtime_const_init(type, sym) do { \
+ extern s32 __start_runtime_##type##_##sym[]; \
+ extern s32 __stop_runtime_##type##_##sym[]; \
+ runtime_const_fixup(__runtime_fixup_##type, \
+ (unsigned long)(sym), \
+ __start_runtime_##type##_##sym, \
+ __stop_runtime_##type##_##sym); \
+} while (0)
+
+/*
+ * The text patching is trivial - you can only do this at init time,
+ * when the text section hasn't been marked RO, and before the text
+ * has ever been executed.
+ */
+static inline void __runtime_fixup_ptr(void *where, unsigned long val)
+{
+ *(unsigned long *)where = val;
+}
+
+static inline void __runtime_fixup_shift(void *where, unsigned long val)
+{
+ *(unsigned char *)where = val;
+}
+
+static inline void runtime_const_fixup(void (*fn)(void *, unsigned long),
+ unsigned long val, s32 *start, s32 *end)
+{
+ while (start < end) {
+ fn(*start + (void *)start, val);
+ start++;
+ }
+}
+
+#endif /* __ASSEMBLY__ */
+#endif
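A sketch of the intended usage pattern: hot paths read the value through runtime_const_ptr()/runtime_const_shift_right_32(), and init code patches every use site once with runtime_const_init(). The table, shift and allocator names below are illustrative:

	/* Sketch; table, shift and allocator are illustrative. */
	static struct hlist_head *example_hashtable __ro_after_init;
	static unsigned int example_hash_shift __ro_after_init = 32;

	static inline struct hlist_head *example_hash_bucket(unsigned int hash)
	{
		return runtime_const_ptr(example_hashtable) +
		       runtime_const_shift_right_32(hash, example_hash_shift);
	}

	static void __init example_hash_init(void)
	{
		example_hashtable = example_alloc_table(&example_hash_shift); /* hypothetical */

		/* Patch all use sites now that the real values are known. */
		runtime_const_init(ptr, example_hashtable);
		runtime_const_init(shift, example_hash_shift);
	}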
diff --git a/arch/x86/include/asm/rwlock.h b/arch/x86/include/asm/rwlock.h
deleted file mode 100644
index 6a8c0d645108..000000000000
--- a/arch/x86/include/asm/rwlock.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef _ASM_X86_RWLOCK_H
-#define _ASM_X86_RWLOCK_H
-
-#define RW_LOCK_BIAS 0x01000000
-
-/* Actual code is in asm/spinlock.h or in arch/x86/lib/rwlock.S */
-
-#endif /* _ASM_X86_RWLOCK_H */
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
deleted file mode 100644
index ca7517d33776..000000000000
--- a/arch/x86/include/asm/rwsem.h
+++ /dev/null
@@ -1,265 +0,0 @@
-/* rwsem.h: R/W semaphores implemented using XADD/CMPXCHG for i486+
- *
- * Written by David Howells (dhowells@redhat.com).
- *
- * Derived from asm-x86/semaphore.h
- *
- *
- * The MSW of the count is the negated number of active writers and waiting
- * lockers, and the LSW is the total number of active locks
- *
- * The lock count is initialized to 0 (no active and no waiting lockers).
- *
- * When a writer subtracts WRITE_BIAS, it'll get 0xffff0001 for the case of an
- * uncontended lock. This can be determined because XADD returns the old value.
- * Readers increment by 1 and see a positive value when uncontended, negative
- * if there are writers (and maybe) readers waiting (in which case it goes to
- * sleep).
- *
- * The value of WAITING_BIAS supports up to 32766 waiting processes. This can
- * be extended to 65534 by manually checking the whole MSW rather than relying
- * on the S flag.
- *
- * The value of ACTIVE_BIAS supports up to 65535 active processes.
- *
- * This should be totally fair - if anything is waiting, a process that wants a
- * lock will go to the back of the queue. When the currently active lock is
- * released, if there's a writer at the front of the queue, then that and only
- * that will be woken up; if there's a bunch of consequtive readers at the
- * front, then they'll all be woken up, but no other readers will be.
- */
-
-#ifndef _ASM_X86_RWSEM_H
-#define _ASM_X86_RWSEM_H
-
-#ifndef _LINUX_RWSEM_H
-#error "please don't include asm/rwsem.h directly, use linux/rwsem.h instead"
-#endif
-
-#ifdef __KERNEL__
-
-#include <linux/list.h>
-#include <linux/spinlock.h>
-#include <linux/lockdep.h>
-
-struct rwsem_waiter;
-
-extern asmregparm struct rw_semaphore *
- rwsem_down_read_failed(struct rw_semaphore *sem);
-extern asmregparm struct rw_semaphore *
- rwsem_down_write_failed(struct rw_semaphore *sem);
-extern asmregparm struct rw_semaphore *
- rwsem_wake(struct rw_semaphore *);
-extern asmregparm struct rw_semaphore *
- rwsem_downgrade_wake(struct rw_semaphore *sem);
-
-/*
- * the semaphore definition
- */
-
-#define RWSEM_UNLOCKED_VALUE 0x00000000
-#define RWSEM_ACTIVE_BIAS 0x00000001
-#define RWSEM_ACTIVE_MASK 0x0000ffff
-#define RWSEM_WAITING_BIAS (-0x00010000)
-#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
-#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
-
-struct rw_semaphore {
- signed long count;
- spinlock_t wait_lock;
- struct list_head wait_list;
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- struct lockdep_map dep_map;
-#endif
-};
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname }
-#else
-# define __RWSEM_DEP_MAP_INIT(lockname)
-#endif
-
-
-#define __RWSEM_INITIALIZER(name) \
-{ \
- RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \
- LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) \
-}
-
-#define DECLARE_RWSEM(name) \
- struct rw_semaphore name = __RWSEM_INITIALIZER(name)
-
-extern void __init_rwsem(struct rw_semaphore *sem, const char *name,
- struct lock_class_key *key);
-
-#define init_rwsem(sem) \
-do { \
- static struct lock_class_key __key; \
- \
- __init_rwsem((sem), #sem, &__key); \
-} while (0)
-
-/*
- * lock for reading
- */
-static inline void __down_read(struct rw_semaphore *sem)
-{
- asm volatile("# beginning down_read\n\t"
- LOCK_PREFIX " incl (%%eax)\n\t"
- /* adds 0x00000001, returns the old value */
- " jns 1f\n"
- " call call_rwsem_down_read_failed\n"
- "1:\n\t"
- "# ending down_read\n\t"
- : "+m" (sem->count)
- : "a" (sem)
- : "memory", "cc");
-}
-
-/*
- * trylock for reading -- returns 1 if successful, 0 if contention
- */
-static inline int __down_read_trylock(struct rw_semaphore *sem)
-{
- __s32 result, tmp;
- asm volatile("# beginning __down_read_trylock\n\t"
- " movl %0,%1\n\t"
- "1:\n\t"
- " movl %1,%2\n\t"
- " addl %3,%2\n\t"
- " jle 2f\n\t"
- LOCK_PREFIX " cmpxchgl %2,%0\n\t"
- " jnz 1b\n\t"
- "2:\n\t"
- "# ending __down_read_trylock\n\t"
- : "+m" (sem->count), "=&a" (result), "=&r" (tmp)
- : "i" (RWSEM_ACTIVE_READ_BIAS)
- : "memory", "cc");
- return result >= 0 ? 1 : 0;
-}
-
-/*
- * lock for writing
- */
-static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
-{
- int tmp;
-
- tmp = RWSEM_ACTIVE_WRITE_BIAS;
- asm volatile("# beginning down_write\n\t"
- LOCK_PREFIX " xadd %%edx,(%%eax)\n\t"
- /* subtract 0x0000ffff, returns the old value */
- " testl %%edx,%%edx\n\t"
- /* was the count 0 before? */
- " jz 1f\n"
- " call call_rwsem_down_write_failed\n"
- "1:\n"
- "# ending down_write"
- : "+m" (sem->count), "=d" (tmp)
- : "a" (sem), "1" (tmp)
- : "memory", "cc");
-}
-
-static inline void __down_write(struct rw_semaphore *sem)
-{
- __down_write_nested(sem, 0);
-}
-
-/*
- * trylock for writing -- returns 1 if successful, 0 if contention
- */
-static inline int __down_write_trylock(struct rw_semaphore *sem)
-{
- signed long ret = cmpxchg(&sem->count,
- RWSEM_UNLOCKED_VALUE,
- RWSEM_ACTIVE_WRITE_BIAS);
- if (ret == RWSEM_UNLOCKED_VALUE)
- return 1;
- return 0;
-}
-
-/*
- * unlock after reading
- */
-static inline void __up_read(struct rw_semaphore *sem)
-{
- __s32 tmp = -RWSEM_ACTIVE_READ_BIAS;
- asm volatile("# beginning __up_read\n\t"
- LOCK_PREFIX " xadd %%edx,(%%eax)\n\t"
- /* subtracts 1, returns the old value */
- " jns 1f\n\t"
- " call call_rwsem_wake\n"
- "1:\n"
- "# ending __up_read\n"
- : "+m" (sem->count), "=d" (tmp)
- : "a" (sem), "1" (tmp)
- : "memory", "cc");
-}
-
-/*
- * unlock after writing
- */
-static inline void __up_write(struct rw_semaphore *sem)
-{
- asm volatile("# beginning __up_write\n\t"
- " movl %2,%%edx\n\t"
- LOCK_PREFIX " xaddl %%edx,(%%eax)\n\t"
- /* tries to transition
- 0xffff0001 -> 0x00000000 */
- " jz 1f\n"
- " call call_rwsem_wake\n"
- "1:\n\t"
- "# ending __up_write\n"
- : "+m" (sem->count)
- : "a" (sem), "i" (-RWSEM_ACTIVE_WRITE_BIAS)
- : "memory", "cc", "edx");
-}
-
-/*
- * downgrade write lock to read lock
- */
-static inline void __downgrade_write(struct rw_semaphore *sem)
-{
- asm volatile("# beginning __downgrade_write\n\t"
- LOCK_PREFIX " addl %2,(%%eax)\n\t"
- /* transitions 0xZZZZ0001 -> 0xYYYY0001 */
- " jns 1f\n\t"
- " call call_rwsem_downgrade_wake\n"
- "1:\n\t"
- "# ending __downgrade_write\n"
- : "+m" (sem->count)
- : "a" (sem), "i" (-RWSEM_WAITING_BIAS)
- : "memory", "cc");
-}
-
-/*
- * implement atomic add functionality
- */
-static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem)
-{
- asm volatile(LOCK_PREFIX "addl %1,%0"
- : "+m" (sem->count)
- : "ir" (delta));
-}
-
-/*
- * implement exchange and add functionality
- */
-static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem)
-{
- int tmp = delta;
-
- asm volatile(LOCK_PREFIX "xadd %0,%1"
- : "+r" (tmp), "+m" (sem->count)
- : : "memory");
-
- return tmp + delta;
-}
-
-static inline int rwsem_is_locked(struct rw_semaphore *sem)
-{
- return (sem->count != 0);
-}
-
-#endif /* __KERNEL__ */
-#endif /* _ASM_X86_RWSEM_H */
diff --git a/arch/x86/include/asm/scatterlist.h b/arch/x86/include/asm/scatterlist.h
deleted file mode 100644
index 263d397d2eef..000000000000
--- a/arch/x86/include/asm/scatterlist.h
+++ /dev/null
@@ -1,33 +0,0 @@
-#ifndef _ASM_X86_SCATTERLIST_H
-#define _ASM_X86_SCATTERLIST_H
-
-#include <asm/types.h>
-
-struct scatterlist {
-#ifdef CONFIG_DEBUG_SG
- unsigned long sg_magic;
-#endif
- unsigned long page_link;
- unsigned int offset;
- unsigned int length;
- dma_addr_t dma_address;
- unsigned int dma_length;
-};
-
-#define ARCH_HAS_SG_CHAIN
-#define ISA_DMA_THRESHOLD (0x00ffffff)
-
-/*
- * These macros should be used after a pci_map_sg call has been done
- * to get bus addresses of each of the SG entries and their lengths.
- * You should only work with the number of sg entries pci_map_sg
- * returns.
- */
-#define sg_dma_address(sg) ((sg)->dma_address)
-#ifdef CONFIG_X86_32
-# define sg_dma_len(sg) ((sg)->length)
-#else
-# define sg_dma_len(sg) ((sg)->dma_length)
-#endif
-
-#endif /* _ASM_X86_SCATTERLIST_H */
diff --git a/arch/x86/include/asm/seccomp.h b/arch/x86/include/asm/seccomp.h
index c62e58a5a90d..42bcd42d70d1 100644
--- a/arch/x86/include/asm/seccomp.h
+++ b/arch/x86/include/asm/seccomp.h
@@ -1,5 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SECCOMP_H
+#define _ASM_X86_SECCOMP_H
+
+#include <asm/unistd.h>
+
#ifdef CONFIG_X86_32
-# include "seccomp_32.h"
-#else
-# include "seccomp_64.h"
+#define __NR_seccomp_sigreturn __NR_sigreturn
#endif
+
+#ifdef CONFIG_COMPAT
+#include <asm/unistd_32_ia32.h>
+#define __NR_seccomp_read_32 __NR_ia32_read
+#define __NR_seccomp_write_32 __NR_ia32_write
+#define __NR_seccomp_exit_32 __NR_ia32_exit
+#define __NR_seccomp_sigreturn_32 __NR_ia32_sigreturn
+#endif
+
+#ifdef CONFIG_X86_64
+# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_X86_64
+# define SECCOMP_ARCH_NATIVE_NR NR_syscalls
+# define SECCOMP_ARCH_NATIVE_NAME "x86_64"
+# ifdef CONFIG_COMPAT
+# define SECCOMP_ARCH_COMPAT AUDIT_ARCH_I386
+# define SECCOMP_ARCH_COMPAT_NR IA32_NR_syscalls
+# define SECCOMP_ARCH_COMPAT_NAME "ia32"
+# endif
+/*
+ * x32 will have __X32_SYSCALL_BIT set in syscall number. We don't support
+ * caching them and they are treated as out of range syscalls, which will
+ * always pass through the BPF filter.
+ */
+#else /* !CONFIG_X86_64 */
+# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_I386
+# define SECCOMP_ARCH_NATIVE_NR NR_syscalls
+# define SECCOMP_ARCH_NATIVE_NAME "ia32"
+#endif
+
+#include <asm-generic/seccomp.h>
+
+#endif /* _ASM_X86_SECCOMP_H */
diff --git a/arch/x86/include/asm/seccomp_32.h b/arch/x86/include/asm/seccomp_32.h
deleted file mode 100644
index b811d6f5780c..000000000000
--- a/arch/x86/include/asm/seccomp_32.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef _ASM_X86_SECCOMP_32_H
-#define _ASM_X86_SECCOMP_32_H
-
-#include <linux/unistd.h>
-
-#define __NR_seccomp_read __NR_read
-#define __NR_seccomp_write __NR_write
-#define __NR_seccomp_exit __NR_exit
-#define __NR_seccomp_sigreturn __NR_sigreturn
-
-#endif /* _ASM_X86_SECCOMP_32_H */
diff --git a/arch/x86/include/asm/seccomp_64.h b/arch/x86/include/asm/seccomp_64.h
deleted file mode 100644
index 84ec1bd161a5..000000000000
--- a/arch/x86/include/asm/seccomp_64.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef _ASM_X86_SECCOMP_64_H
-#define _ASM_X86_SECCOMP_64_H
-
-#include <linux/unistd.h>
-#include <asm/ia32_unistd.h>
-
-#define __NR_seccomp_read __NR_read
-#define __NR_seccomp_write __NR_write
-#define __NR_seccomp_exit __NR_exit
-#define __NR_seccomp_sigreturn __NR_rt_sigreturn
-
-#define __NR_seccomp_read_32 __NR_ia32_read
-#define __NR_seccomp_write_32 __NR_ia32_write
-#define __NR_seccomp_exit_32 __NR_ia32_exit
-#define __NR_seccomp_sigreturn_32 __NR_ia32_sigreturn
-
-#endif /* _ASM_X86_SECCOMP_64_H */
diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h
index 1b7ee5d673c2..30e8ee7006f9 100644
--- a/arch/x86/include/asm/sections.h
+++ b/arch/x86/include/asm/sections.h
@@ -1,8 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_SECTIONS_H
#define _ASM_X86_SECTIONS_H
#include <asm-generic/sections.h>
+#include <asm/extable.h>
+extern char __relocate_kernel_start[], __relocate_kernel_end[];
extern char __brk_base[], __brk_limit[];
+extern char __end_rodata_aligned[];
+
+#if defined(CONFIG_X86_64)
+extern char __end_rodata_hpage_align[];
+#endif
+
+extern char __end_of_kernel_reserve[];
+
+extern unsigned long _brk_start, _brk_end;
#endif /* _ASM_X86_SECTIONS_H */
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index 14e0ed86a6f9..f59ae7186940 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -1,216 +1,356 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_SEGMENT_H
#define _ASM_X86_SEGMENT_H
-/* Constructor for a conventional segment GDT (or LDT) entry */
-/* This is a macro so it can be used in initializers */
+#include <linux/const.h>
+#include <asm/alternative.h>
+#include <asm/ibt.h>
+
+/*
+ * Constructor for a conventional segment GDT (or LDT) entry.
+ * This is a macro so it can be used in initializers.
+ */
#define GDT_ENTRY(flags, base, limit) \
- ((((base) & 0xff000000ULL) << (56-24)) | \
- (((flags) & 0x0000f0ffULL) << 40) | \
- (((limit) & 0x000f0000ULL) << (48-16)) | \
- (((base) & 0x00ffffffULL) << 16) | \
- (((limit) & 0x0000ffffULL)))
+ ((((base) & _AC(0xff000000,ULL)) << (56-24)) | \
+ (((flags) & _AC(0x0000f0ff,ULL)) << 40) | \
+ (((limit) & _AC(0x000f0000,ULL)) << (48-16)) | \
+ (((base) & _AC(0x00ffffff,ULL)) << 16) | \
+ (((limit) & _AC(0x0000ffff,ULL))))
-/* Simple and small GDT entries for booting only */
+/* Simple and small GDT entries for booting only: */
#define GDT_ENTRY_BOOT_CS 2
-#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8)
+#define GDT_ENTRY_BOOT_DS 3
+#define GDT_ENTRY_BOOT_TSS 4
+#define __BOOT_CS (GDT_ENTRY_BOOT_CS*8)
+#define __BOOT_DS (GDT_ENTRY_BOOT_DS*8)
+#define __BOOT_TSS (GDT_ENTRY_BOOT_TSS*8)
-#define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1)
-#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8)
+/*
+ * Bottom two bits of selector give the ring
+ * privilege level
+ */
+#define SEGMENT_RPL_MASK 0x3
-#define GDT_ENTRY_BOOT_TSS (GDT_ENTRY_BOOT_CS + 2)
-#define __BOOT_TSS (GDT_ENTRY_BOOT_TSS * 8)
+/*
+ * When running on Xen PV, the actual privilege level of the kernel is 1,
+ * not 0. Testing the Requested Privilege Level in a segment selector to
+ * determine whether the context is user mode or kernel mode with
+ * SEGMENT_RPL_MASK is wrong because the PV kernel's privilege level
+ * matches the 0x3 mask.
+ *
+ * Testing with USER_SEGMENT_RPL_MASK is valid for both native and Xen PV
+ * kernels because privilege level 2 is never used.
+ */
+#define USER_SEGMENT_RPL_MASK 0x2
-#ifdef CONFIG_X86_32
+/* User mode is privilege level 3: */
+#define USER_RPL 0x3
+
+/* Bit 2 is Table Indicator (TI): selects between LDT or GDT */
+#define SEGMENT_TI_MASK 0x4
+/* LDT segment has TI set ... */
+#define SEGMENT_LDT 0x4
+/* ... GDT has it cleared */
+#define SEGMENT_GDT 0x0
+
+#define GDT_ENTRY_INVALID_SEG 0
+
+#if defined(CONFIG_X86_32) && !defined(BUILD_VDSO32_64)
/*
* The layout of the per-CPU GDT under Linux:
*
- * 0 - null
+ * 0 - null <=== cacheline #1
* 1 - reserved
* 2 - reserved
* 3 - reserved
*
- * 4 - unused <==== new cacheline
+ * 4 - unused <=== cacheline #2
* 5 - unused
*
* ------- start of TLS (Thread-Local Storage) segments:
*
* 6 - TLS segment #1 [ glibc's TLS segment ]
* 7 - TLS segment #2 [ Wine's %fs Win32 segment ]
- * 8 - TLS segment #3
+ * 8 - TLS segment #3 <=== cacheline #3
* 9 - reserved
* 10 - reserved
* 11 - reserved
*
* ------- start of kernel segments:
*
- * 12 - kernel code segment <==== new cacheline
+ * 12 - kernel code segment <=== cacheline #4
* 13 - kernel data segment
* 14 - default user CS
* 15 - default user DS
- * 16 - TSS
+ * 16 - TSS <=== cacheline #5
* 17 - LDT
* 18 - PNPBIOS support (16->32 gate)
* 19 - PNPBIOS support
- * 20 - PNPBIOS support
+ * 20 - PNPBIOS support <=== cacheline #6
* 21 - PNPBIOS support
* 22 - PNPBIOS support
* 23 - APM BIOS support
- * 24 - APM BIOS support
+ * 24 - APM BIOS support <=== cacheline #7
* 25 - APM BIOS support
*
* 26 - ESPFIX small SS
* 27 - per-cpu [ offset to per-cpu data area ]
- * 28 - stack_canary-20 [ for stack protector ]
+ * 28 - VDSO getcpu
* 29 - unused
* 30 - unused
* 31 - TSS for double fault handler
*/
-#define GDT_ENTRY_TLS_MIN 6
-#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
+#define GDT_ENTRY_TLS_MIN 6
+#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
+#define GDT_ENTRY_KERNEL_CS 12
+#define GDT_ENTRY_KERNEL_DS 13
#define GDT_ENTRY_DEFAULT_USER_CS 14
-
#define GDT_ENTRY_DEFAULT_USER_DS 15
+#define GDT_ENTRY_TSS 16
+#define GDT_ENTRY_LDT 17
+#define GDT_ENTRY_PNPBIOS_CS32 18
+#define GDT_ENTRY_PNPBIOS_CS16 19
+#define GDT_ENTRY_PNPBIOS_DS 20
+#define GDT_ENTRY_PNPBIOS_TS1 21
+#define GDT_ENTRY_PNPBIOS_TS2 22
+#define GDT_ENTRY_APMBIOS_BASE 23
+
+#define GDT_ENTRY_ESPFIX_SS 26
+#define GDT_ENTRY_PERCPU 27
+#define GDT_ENTRY_CPUNODE 28
-#define GDT_ENTRY_KERNEL_BASE 12
+#define GDT_ENTRY_DOUBLEFAULT_TSS 31
-#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0)
+/*
+ * Number of entries in the GDT table:
+ */
+#define GDT_ENTRIES 32
-#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1)
+/*
+ * Segment selector values corresponding to the above entries:
+ */
+
+#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8)
+#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8)
+#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3)
+#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3)
+#define __USER32_CS __USER_CS
+#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS*8)
-#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4)
-#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5)
+/* segment for calling fn: */
+#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32*8)
+/* code segment for BIOS: */
+#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16*8)
-#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6)
-#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11)
+/* "Is this PNP code selector (PNP_CS32 or PNP_CS16)?" */
+#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == PNP_CS32)
-#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14)
-#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
+/* data segment for BIOS: */
+#define PNP_DS (GDT_ENTRY_PNPBIOS_DS*8)
+/* transfer data segment: */
+#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1*8)
+/* another data segment: */
+#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2*8)
-#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15)
#ifdef CONFIG_SMP
-#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8)
+# define __KERNEL_PERCPU (GDT_ENTRY_PERCPU*8)
#else
-#define __KERNEL_PERCPU 0
+# define __KERNEL_PERCPU 0
#endif
-#define GDT_ENTRY_STACK_CANARY (GDT_ENTRY_KERNEL_BASE + 16)
-#ifdef CONFIG_CC_STACKPROTECTOR
-#define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY * 8)
-#else
-#define __KERNEL_STACK_CANARY 0
-#endif
+#define __CPUNODE_SEG (GDT_ENTRY_CPUNODE*8 + 3)
-#define GDT_ENTRY_DOUBLEFAULT_TSS 31
+#else /* 64-bit: */
+
+#include <asm/cache.h>
+
+#define GDT_ENTRY_KERNEL32_CS 1
+#define GDT_ENTRY_KERNEL_CS 2
+#define GDT_ENTRY_KERNEL_DS 3
/*
- * The GDT has 32 entries
+ * We cannot use the same code segment descriptor for user and kernel mode,
+ * not even in long flat mode, because of different DPL.
+ *
+ * GDT layout to get 64-bit SYSCALL/SYSRET support right. SYSRET hardcodes
+ * selectors:
+ *
+ * if returning to 32-bit userspace: cs = STAR.SYSRET_CS,
+ * if returning to 64-bit userspace: cs = STAR.SYSRET_CS+16,
+ *
+ * ss = STAR.SYSRET_CS+8 (in either case)
+ *
+ * thus USER_DS should be between 32-bit and 64-bit code selectors:
*/
-#define GDT_ENTRIES 32
-
-/* The PnP BIOS entries in the GDT */
-#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0)
-#define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1)
-#define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2)
-#define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3)
-#define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4)
-
-/* The PnP BIOS selectors */
-#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */
-#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */
-#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */
-#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */
-#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */
-
-/* Bottom two bits of selector give the ring privilege level */
-#define SEGMENT_RPL_MASK 0x3
-/* Bit 2 is table indicator (LDT/GDT) */
-#define SEGMENT_TI_MASK 0x4
+#define GDT_ENTRY_DEFAULT_USER32_CS 4
+#define GDT_ENTRY_DEFAULT_USER_DS 5
+#define GDT_ENTRY_DEFAULT_USER_CS 6
-/* User mode is privilege level 3 */
-#define USER_RPL 0x3
-/* LDT segment has TI set, GDT has it cleared */
-#define SEGMENT_LDT 0x4
-#define SEGMENT_GDT 0x0
+/* Needs two entries */
+#define GDT_ENTRY_TSS 8
+/* Needs two entries */
+#define GDT_ENTRY_LDT 10
+
+#define GDT_ENTRY_TLS_MIN 12
+#define GDT_ENTRY_TLS_MAX 14
+
+#define GDT_ENTRY_CPUNODE 15
/*
- * Matching rules for certain types of segments.
+ * Number of entries in the GDT table:
*/
+#define GDT_ENTRIES 16
-/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */
-#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8)
+/*
+ * Segment selector values corresponding to the above entries:
+ *
+ * Note, selectors also need to have a correct RPL,
+ * expressed with the +3 value for user-space selectors:
+ */
+#define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS*8)
+#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8)
+#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8)
+#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8 + 3)
+#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3)
+#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3)
+#define __CPUNODE_SEG (GDT_ENTRY_CPUNODE*8 + 3)
+#endif
-#else
-#include <asm/cache.h>
+#define IDT_ENTRIES 256
+#define NUM_EXCEPTION_VECTORS 32
+
+/* Bitmask of exception vectors which push an error code on the stack: */
+#define EXCEPTION_ERRCODE_MASK 0x20027d00
+
+#define GDT_SIZE (GDT_ENTRIES*8)
+#define GDT_ENTRY_TLS_ENTRIES 3
+#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES* 8)
+
+/* Bit size and mask of CPU number stored in the per CPU data (and TSC_AUX) */
+#define VDSO_CPUNODE_BITS 12
+#define VDSO_CPUNODE_MASK 0xfff
+
+#ifndef __ASSEMBLER__
+
+/* Helper functions to store/load CPU and node numbers */
-#define GDT_ENTRY_KERNEL32_CS 1
-#define GDT_ENTRY_KERNEL_CS 2
-#define GDT_ENTRY_KERNEL_DS 3
+static inline unsigned long vdso_encode_cpunode(int cpu, unsigned long node)
+{
+ return (node << VDSO_CPUNODE_BITS) | cpu;
+}
-#define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS * 8)
+static inline void vdso_read_cpunode(unsigned *cpu, unsigned *node)
+{
+ unsigned long p;
+
+ /*
+ * Load CPU and node number from the GDT. LSL is faster than RDTSCP
+ * and works on all CPUs. This is volatile so that it orders
+ * correctly with respect to barrier() and to keep GCC from cleverly
+ * hoisting it out of the calling function.
+ *
+ * If RDPID is available, use it.
+ */
+ alternative_io ("lsl %[seg],%k[p]",
+ "rdpid %[p]",
+ X86_FEATURE_RDPID,
+ [p] "=r" (p), [seg] "r" (__CPUNODE_SEG));
+
+ if (cpu)
+ *cpu = (p & VDSO_CPUNODE_MASK);
+ if (node)
+ *node = (p >> VDSO_CPUNODE_BITS);
+}
+
+#endif /* !__ASSEMBLER__ */
+
+#ifdef __KERNEL__
/*
- * we cannot use the same code segment descriptor for user and kernel
- * -- not even in the long flat mode, because of different DPL /kkeil
- * The segment offset needs to contain a RPL. Grr. -AK
- * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets)
+ * early_idt_handler_array is an array of entry points referenced in the
+ * early IDT. For simplicity, it's a real array with one entry point
+ * every nine bytes. That leaves room for an optional 'push $0' if the
+ * vector has no error code (two bytes), a 'push $vector_number' (two
+ * bytes), and a jump to the common entry code (up to five bytes).
*/
-#define GDT_ENTRY_DEFAULT_USER32_CS 4
-#define GDT_ENTRY_DEFAULT_USER_DS 5
-#define GDT_ENTRY_DEFAULT_USER_CS 6
-#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS * 8 + 3)
-#define __USER32_DS __USER_DS
+#define EARLY_IDT_HANDLER_SIZE (9 + ENDBR_INSN_SIZE)
-#define GDT_ENTRY_TSS 8 /* needs two entries */
-#define GDT_ENTRY_LDT 10 /* needs two entries */
-#define GDT_ENTRY_TLS_MIN 12
-#define GDT_ENTRY_TLS_MAX 14
+/*
+ * xen_early_idt_handler_array is for Xen pv guests: for each entry in
+ * early_idt_handler_array it contains a prequel in the form of
+ * pop %rcx; pop %r11; jmp early_idt_handler_array[i]; summing up to
+ * max 8 bytes.
+ */
+#define XEN_EARLY_IDT_HANDLER_SIZE (8 + ENDBR_INSN_SIZE)
-#define GDT_ENTRY_PER_CPU 15 /* Abused to load per CPU data from limit */
-#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU * 8 + 3)
+#ifndef __ASSEMBLER__
-/* TLS indexes for 64bit - hardcoded in arch_prctl */
-#define FS_TLS 0
-#define GS_TLS 1
+extern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDLER_SIZE];
+extern void early_ignore_irq(void);
-#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3)
-#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3)
+#ifdef CONFIG_XEN_PV
+extern const char xen_early_idt_handler_array[NUM_EXCEPTION_VECTORS][XEN_EARLY_IDT_HANDLER_SIZE];
+#endif
-#define GDT_ENTRIES 16
+/*
+ * Load a segment. Fall back on loading the zero segment if something goes
+ * wrong. This variant assumes that loading zero fully clears the segment.
+ * This is always the case on Intel CPUs and, even on 64-bit AMD CPUs, any
+ * failure to fully clear the cached descriptor is only observable for
+ * FS and GS.
+ */
+#define __loadsegment_simple(seg, value) \
+do { \
+ unsigned short __val = (value); \
+ \
+ asm volatile(" \n" \
+ "1: movl %k0,%%" #seg " \n" \
+ _ASM_EXTABLE_TYPE_REG(1b, 1b, EX_TYPE_ZERO_REG, %k0)\
+ : "+r" (__val) : : "memory"); \
+} while (0)
+
+#define __loadsegment_ss(value) __loadsegment_simple(ss, (value))
+#define __loadsegment_ds(value) __loadsegment_simple(ds, (value))
+#define __loadsegment_es(value) __loadsegment_simple(es, (value))
-#endif
+#ifdef CONFIG_X86_32
-#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
-#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
-#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS* 8 + 3)
-#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS* 8 + 3)
-#ifndef CONFIG_PARAVIRT
-#define get_kernel_rpl() 0
-#endif
+/*
+ * On 32-bit systems, the hidden parts of FS and GS are unobservable if
+ * the selector is NULL, so there's no funny business here.
+ */
+#define __loadsegment_fs(value) __loadsegment_simple(fs, (value))
+#define __loadsegment_gs(value) __loadsegment_simple(gs, (value))
-/* User mode is privilege level 3 */
-#define USER_RPL 0x3
-/* LDT segment has TI set, GDT has it cleared */
-#define SEGMENT_LDT 0x4
-#define SEGMENT_GDT 0x0
+#else
-/* Bottom two bits of selector give the ring privilege level */
-#define SEGMENT_RPL_MASK 0x3
-/* Bit 2 is table indicator (LDT/GDT) */
-#define SEGMENT_TI_MASK 0x4
+static inline void __loadsegment_fs(unsigned short value)
+{
+ asm volatile(" \n"
+ "1: movw %0, %%fs \n"
+ "2: \n"
-#define IDT_ENTRIES 256
-#define NUM_EXCEPTION_VECTORS 32
-#define GDT_SIZE (GDT_ENTRIES * 8)
-#define GDT_ENTRY_TLS_ENTRIES 3
-#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_CLEAR_FS)
+
+ : : "rm" (value) : "memory");
+}
+
+/* __loadsegment_gs is intentionally undefined. Use load_gs_index instead. */
-#ifdef __KERNEL__
-#ifndef __ASSEMBLY__
-extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][10];
-#endif
#endif
+#define loadsegment(seg, value) __loadsegment_ ## seg (value)
+
+/*
+ * Save a segment register away:
+ */
+#define savesegment(seg, value) \
+ asm("mov %%" #seg ",%0":"=r" (value) : : "memory")
+
+#endif /* !__ASSEMBLER__ */
+#endif /* __KERNEL__ */
+
#endif /* _ASM_X86_SEGMENT_H */
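A sketch of a getcpu()-style helper built on vdso_read_cpunode(), which decodes the CPU/node value stored in the GDT_ENTRY_CPUNODE descriptor (or read via RDPID when available):

	/* Sketch of a getcpu()-style helper. */
	static inline unsigned int example_this_cpu_number(void)
	{
		unsigned int cpu;

		vdso_read_cpunode(&cpu, NULL);
		return cpu;
	}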
diff --git a/arch/x86/include/asm/sembuf.h b/arch/x86/include/asm/sembuf.h
deleted file mode 100644
index ee50c801f7b7..000000000000
--- a/arch/x86/include/asm/sembuf.h
+++ /dev/null
@@ -1,24 +0,0 @@
-#ifndef _ASM_X86_SEMBUF_H
-#define _ASM_X86_SEMBUF_H
-
-/*
- * The semid64_ds structure for x86 architecture.
- * Note extra padding because this structure is passed back and forth
- * between kernel and user space.
- *
- * Pad space is left for:
- * - 64-bit time_t to solve y2038 problem
- * - 2 miscellaneous 32-bit values
- */
-struct semid64_ds {
- struct ipc64_perm sem_perm; /* permissions .. see ipc.h */
- __kernel_time_t sem_otime; /* last semop time */
- unsigned long __unused1;
- __kernel_time_t sem_ctime; /* last change time */
- unsigned long __unused2;
- unsigned long sem_nsems; /* no. of semaphores in array */
- unsigned long __unused3;
- unsigned long __unused4;
-};
-
-#endif /* _ASM_X86_SEMBUF_H */
diff --git a/arch/x86/include/asm/serial.h b/arch/x86/include/asm/serial.h
index 628c801535ea..ece8299d2695 100644
--- a/arch/x86/include/asm/serial.h
+++ b/arch/x86/include/asm/serial.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_SERIAL_H
#define _ASM_X86_SERIAL_H
@@ -6,24 +7,24 @@
*
* It'd be nice if someone built a serial card with a 24.576 MHz
* clock, since the 16550A is capable of handling a top speed of 1.5
- * megabits/second; but this requires the faster clock.
+ * megabits/second; but this requires a faster clock.
*/
-#define BASE_BAUD ( 1843200 / 16 )
+#define BASE_BAUD (1843200/16)
/* Standard COM flags (except for COM4, because of the 8514 problem) */
-#ifdef CONFIG_SERIAL_DETECT_IRQ
-#define STD_COM_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_SKIP_TEST | ASYNC_AUTO_IRQ)
-#define STD_COM4_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_AUTO_IRQ)
+#ifdef CONFIG_SERIAL_8250_DETECT_IRQ
+# define STD_COMX_FLAGS (UPF_BOOT_AUTOCONF | UPF_SKIP_TEST | UPF_AUTO_IRQ)
+# define STD_COM4_FLAGS (UPF_BOOT_AUTOCONF | 0 | UPF_AUTO_IRQ)
#else
-#define STD_COM_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_SKIP_TEST)
-#define STD_COM4_FLAGS ASYNC_BOOT_AUTOCONF
+# define STD_COMX_FLAGS (UPF_BOOT_AUTOCONF | UPF_SKIP_TEST | 0 )
+# define STD_COM4_FLAGS (UPF_BOOT_AUTOCONF | 0 | 0 )
#endif
-#define SERIAL_PORT_DFNS \
- /* UART CLK PORT IRQ FLAGS */ \
- { 0, BASE_BAUD, 0x3F8, 4, STD_COM_FLAGS }, /* ttyS0 */ \
- { 0, BASE_BAUD, 0x2F8, 3, STD_COM_FLAGS }, /* ttyS1 */ \
- { 0, BASE_BAUD, 0x3E8, 4, STD_COM_FLAGS }, /* ttyS2 */ \
- { 0, BASE_BAUD, 0x2E8, 3, STD_COM4_FLAGS }, /* ttyS3 */
+#define SERIAL_PORT_DFNS \
+ /* UART CLK PORT IRQ FLAGS */ \
+ { .uart = 0, BASE_BAUD, 0x3F8, 4, STD_COMX_FLAGS }, /* ttyS0 */ \
+ { .uart = 0, BASE_BAUD, 0x2F8, 3, STD_COMX_FLAGS }, /* ttyS1 */ \
+ { .uart = 0, BASE_BAUD, 0x3E8, 4, STD_COMX_FLAGS }, /* ttyS2 */ \
+ { .uart = 0, BASE_BAUD, 0x2E8, 3, STD_COM4_FLAGS }, /* ttyS3 */
#endif /* _ASM_X86_SERIAL_H */
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
new file mode 100644
index 000000000000..61f56cdaccb5
--- /dev/null
+++ b/arch/x86/include/asm/set_memory.h
@@ -0,0 +1,97 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SET_MEMORY_H
+#define _ASM_X86_SET_MEMORY_H
+
+#include <asm/page.h>
+#include <asm-generic/set_memory.h>
+#include <asm/pgtable.h>
+
+#define set_memory_rox set_memory_rox
+int set_memory_rox(unsigned long addr, int numpages);
+
+/*
+ * The set_memory_* API can be used to change various attributes of a virtual
+ * address range. The attributes include:
+ * Cacheability : UnCached, WriteCombining, WriteThrough, WriteBack
+ * Executability : eXecutable, NoteXecutable
+ * Read/Write : ReadOnly, ReadWrite
+ * Presence : NotPresent
+ * Encryption : Encrypted, Decrypted
+ *
+ * Within a category, the attributes are mutually exclusive.
+ *
+ * The implementation of this API will take care of various aspects that
+ * are associated with changing such attributes, such as:
+ * - Flushing TLBs
+ * - Flushing CPU caches
+ * - Making sure aliases of the memory behind the mapping don't violate
+ * coherency rules as defined by the CPU in the system.
+ *
+ * What this API does not do:
+ * - Provide exclusion between various callers - including callers that
+ *   operate on other mappings of the same physical page
+ * - Restore default attributes when a page is freed
+ * - Guarantee that mappings other than the requested one are
+ * in any state, other than that these do not violate rules for
+ *   the CPU you have. Do not depend on any effects on other mappings;
+ *   CPUs other than the one you have may have more relaxed rules.
+ * The caller is required to take care of these.
+ */
+
+int __set_memory_prot(unsigned long addr, int numpages, pgprot_t prot);
+int _set_memory_uc(unsigned long addr, int numpages);
+int _set_memory_wc(unsigned long addr, int numpages);
+int _set_memory_wt(unsigned long addr, int numpages);
+int _set_memory_wb(unsigned long addr, int numpages);
+int set_memory_uc(unsigned long addr, int numpages);
+int set_memory_wc(unsigned long addr, int numpages);
+int set_memory_wb(unsigned long addr, int numpages);
+int set_memory_np(unsigned long addr, int numpages);
+int set_memory_p(unsigned long addr, int numpages);
+int set_memory_4k(unsigned long addr, int numpages);
+
+bool set_memory_enc_stop_conversion(void);
+int set_memory_encrypted(unsigned long addr, int numpages);
+int set_memory_decrypted(unsigned long addr, int numpages);
+
+int set_memory_np_noalias(unsigned long addr, int numpages);
+int set_memory_nonglobal(unsigned long addr, int numpages);
+int set_memory_global(unsigned long addr, int numpages);
+
+int set_pages_array_uc(struct page **pages, int addrinarray);
+int set_pages_array_wc(struct page **pages, int addrinarray);
+int set_pages_array_wb(struct page **pages, int addrinarray);
+
+/*
+ * For legacy compatibility with the old APIs, a few functions
+ * are provided that work on a "struct page".
+ * These functions operate ONLY on the 1:1 kernel mapping of the
+ * memory that the struct page represents, and internally just
+ * call the set_memory_* function. See the description of the
+ * set_memory_* function for more details on conventions.
+ *
+ * These APIs should be considered *deprecated* and are likely going to
+ * be removed in the future.
+ * The reason for this is the implicit operation on the 1:1 mapping only,
+ * making this not a generally useful API.
+ *
+ * Specifically, many users of the old APIs had a virtual address,
+ * called virt_to_page() or vmalloc_to_page() on that address to
+ * get a struct page* that the old API required.
+ * To convert these cases, use set_memory_*() on the original
+ * virtual address; do not use these functions.
+ */
+
+int set_pages_uc(struct page *page, int numpages);
+int set_pages_wb(struct page *page, int numpages);
+int set_pages_ro(struct page *page, int numpages);
+int set_pages_rw(struct page *page, int numpages);
+
+int set_direct_map_invalid_noflush(struct page *page);
+int set_direct_map_default_noflush(struct page *page);
+int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid);
+bool kernel_page_present(struct page *page);
+
+extern int kernel_set_to_readonly;
+
+#endif /* _ASM_X86_SET_MEMORY_H */
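A sketch of a typical set_memory_*() sequence for generated code: make the buffer read-only and executable while in use, then restore it to writable, non-executable before freeing. The helpers below are illustrative; set_memory_nx()/set_memory_rw() come from <asm-generic/set_memory.h>:

	/* Sketch; allocation helpers are illustrative. */
	static void *example_alloc_exec_page(void)
	{
		void *buf = vmalloc(PAGE_SIZE);

		if (!buf)
			return NULL;

		/* ... emit generated code into buf ... */
		set_memory_rox((unsigned long)buf, 1);	/* read-only + executable */
		return buf;
	}

	static void example_free_exec_page(void *buf)
	{
		set_memory_nx((unsigned long)buf, 1);
		set_memory_rw((unsigned long)buf, 1);
		vfree(buf);
	}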
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 4093d1ed6db2..914eb32581c7 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -1,46 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_SETUP_H
#define _ASM_X86_SETUP_H
-#ifdef __KERNEL__
+#include <uapi/asm/setup.h>
#define COMMAND_LINE_SIZE 2048
-#ifndef __ASSEMBLY__
-
-/*
- * Any setup quirks to be performed?
- */
-struct mpc_cpu;
-struct mpc_bus;
-struct mpc_oemtable;
-
-struct x86_quirks {
- int (*arch_pre_time_init)(void);
- int (*arch_time_init)(void);
- int (*arch_pre_intr_init)(void);
- int (*arch_intr_init)(void);
- int (*arch_trap_init)(void);
- char * (*arch_memory_setup)(void);
- int (*mach_get_smp_config)(unsigned int early);
- int (*mach_find_smp_config)(unsigned int reserve);
-
- int *mpc_record;
- int (*mpc_apic_id)(struct mpc_cpu *m);
- void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name);
- void (*mpc_oem_pci_bus)(struct mpc_bus *m);
- void (*smp_read_mpc_oem)(struct mpc_oemtable *oemtable,
- unsigned short oemsize);
- int (*setup_ioapic_ids)(void);
-};
-
-extern void x86_quirk_intr_init(void);
-
-extern void x86_quirk_trap_init(void);
-
-extern void x86_quirk_pre_time_init(void);
-extern void x86_quirk_time_init(void);
-
-#endif /* __ASSEMBLY__ */
+#include <linux/linkage.h>
+#include <asm/page_types.h>
+#include <asm/ibt.h>
#ifdef __i386__
@@ -59,8 +27,11 @@ extern void x86_quirk_time_init(void);
#define OLD_CL_ADDRESS 0x020 /* Relative to real mode data */
#define NEW_CL_POINTER 0x228 /* Relative to real mode data */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
+#include <linux/cache.h>
+
#include <asm/bootparam.h>
+#include <asm/x86_init.h>
/* Interrupt control for vSMPowered x86_64 systems */
#ifdef CONFIG_X86_64
@@ -69,29 +40,67 @@ void vsmp_init(void);
static inline void vsmp_init(void) { }
#endif
+struct pt_regs;
+
void setup_bios_corruption_check(void);
+void early_platform_quirks(void);
-#ifdef CONFIG_X86_VISWS
-extern void visws_early_detect(void);
-extern int is_visws_box(void);
+extern unsigned long saved_video_mode;
+extern unsigned long acpi_realmode_flags;
+
+extern void reserve_standard_io_resources(void);
+extern void i386_reserve_resources(void);
+extern unsigned long __startup_64(unsigned long p2v_offset, struct boot_params *bp);
+extern void startup_64_setup_gdt_idt(void);
+extern void startup_64_load_idt(void *vc_handler);
+extern void __pi_startup_64_load_idt(void *vc_handler);
+extern void early_setup_idt(void);
+extern void __init do_early_exception(struct pt_regs *regs, int trapnr);
+
+#ifdef CONFIG_X86_INTEL_MID
+extern void x86_intel_mid_early_setup(void);
#else
-static inline void visws_early_detect(void) { }
-static inline int is_visws_box(void) { return 0; }
+static inline void x86_intel_mid_early_setup(void) { }
#endif
-extern struct x86_quirks *x86_quirks;
-extern unsigned long saved_video_mode;
-
-#ifndef CONFIG_PARAVIRT
-#define paravirt_post_allocator_init() do {} while (0)
+#ifdef CONFIG_X86_INTEL_CE
+extern void x86_ce4100_early_setup(void);
+#else
+static inline void x86_ce4100_early_setup(void) { }
#endif
+#include <linux/kexec_handover.h>
+
#ifndef _SETUP
+#include <asm/espfix.h>
+#include <linux/kernel.h>
+
/*
* This is set up by the setup-routine at boot-time
*/
extern struct boot_params boot_params;
+extern char _text[];
+
+static inline bool kaslr_enabled(void)
+{
+ return IS_ENABLED(CONFIG_RANDOMIZE_MEMORY) &&
+ !!(boot_params.hdr.loadflags & KASLR_FLAG);
+}
+
+/*
+ * Apply no randomization if KASLR was disabled at boot or if KASAN
+ * is enabled. KASAN shadow mappings rely on regions being PGD aligned.
+ */
+static inline bool kaslr_memory_enabled(void)
+{
+ return kaslr_enabled() && !IS_ENABLED(CONFIG_KASAN);
+}
+
+static inline unsigned long kaslr_offset(void)
+{
+ return (unsigned long)&_text - __START_KERNEL;
+}
/*
* Do NOT EVER look at the BIOS memory size location.
@@ -104,47 +113,51 @@ extern unsigned long _brk_end;
void *extend_brk(size_t size, size_t align);
/*
- * Reserve space in the brk section. The name must be unique within
- * the file, and somewhat descriptive. The size is in bytes. Must be
- * used at file scope.
+ * Reserve space in the .brk section, which is a block of memory from which the
+ * caller is allowed to allocate very early (before even memblock is available)
+ * by calling extend_brk(). All allocated memory will be eventually converted
+ * to memblock. Any leftover unallocated memory will be freed.
*
- * (This uses a temp function to wrap the asm so we can pass it the
- * size parameter; otherwise we wouldn't be able to. We can't use a
- * "section" attribute on a normal variable because it always ends up
- * being @progbits, which ends up allocating space in the vmlinux
- * executable.)
+ * The size is in bytes.
*/
-#define RESERVE_BRK(name,sz) \
- static void __section(.discard) __used \
- __brk_reservation_fn_##name##__(void) { \
- asm volatile ( \
- ".pushsection .brk_reservation,\"aw\",@nobits;" \
- ".brk." #name ":" \
- " 1:.skip %c0;" \
- " .size .brk." #name ", . - 1b;" \
- " .popsection" \
- : : "i" (sz)); \
- }
+#define RESERVE_BRK(name, size) \
+ __section(".bss..brk") __aligned(1) __used \
+ static char __brk_##name[size]
+
+extern void probe_roms(void);
+
+void clear_bss(void);
#ifdef __i386__
-void __init i386_start_kernel(void);
-extern void probe_roms(void);
+asmlinkage void __init __noreturn i386_start_kernel(void);
+void __init mk_early_pgtbl_32(void);
#else
-void __init x86_64_start_kernel(char *real_mode);
-void __init x86_64_start_reservations(char *real_mode_data);
+asmlinkage void __init __noreturn x86_64_start_kernel(char *real_mode);
+asmlinkage void __init __noreturn x86_64_start_reservations(char *real_mode_data);
#endif /* __i386__ */
#endif /* _SETUP */
+
+#ifdef CONFIG_CMDLINE_BOOL
+extern bool builtin_cmdline_added __ro_after_init;
#else
-#define RESERVE_BRK(name,sz) \
- .pushsection .brk_reservation,"aw",@nobits; \
-.brk.name: \
-1: .skip sz; \
- .size .brk.name,.-1b; \
+#define builtin_cmdline_added 0
+#endif
+
+#else /* __ASSEMBLER__ */
+
+.macro __RESERVE_BRK name, size
+ .pushsection .bss..brk, "aw"
+SYM_DATA_START(__brk_\name)
+ .skip \size
+SYM_DATA_END(__brk_\name)
.popsection
-#endif /* __ASSEMBLY__ */
-#endif /* __KERNEL__ */
+.endm
+
+#define RESERVE_BRK(name, size) __RESERVE_BRK name, size
+
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_X86_SETUP_H */
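A sketch of how RESERVE_BRK() and extend_brk() fit together: a file-scope reservation sizes the .brk area, and very early boot code carves allocations out of it. The name and sizes below are illustrative:

	/* Sketch; name and sizes are illustrative. */
	RESERVE_BRK(example_early_buf, 4 * PAGE_SIZE);

	static void __init example_early_alloc(void)
	{
		/* Allowed only before the brk area is handed over to memblock. */
		void *p = extend_brk(PAGE_SIZE, PAGE_SIZE);

		memset(p, 0, PAGE_SIZE);
	}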
diff --git a/arch/x86/include/asm/setup_data.h b/arch/x86/include/asm/setup_data.h
new file mode 100644
index 000000000000..7bb16f843c93
--- /dev/null
+++ b/arch/x86/include/asm/setup_data.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SETUP_DATA_H
+#define _ASM_X86_SETUP_DATA_H
+
+#include <uapi/asm/setup_data.h>
+
+#ifndef __ASSEMBLER__
+
+struct pci_setup_rom {
+ struct setup_data data;
+ uint16_t vendor;
+ uint16_t devid;
+ uint64_t pcilen;
+ unsigned long segment;
+ unsigned long bus;
+ unsigned long device;
+ unsigned long function;
+ uint8_t romdata[];
+};
+
+/* kexec external ABI */
+struct efi_setup_data {
+ u64 fw_vendor;
+ u64 __unused;
+ u64 tables;
+ u64 smbios;
+ u64 reserved[8];
+};
+
+#endif /* __ASSEMBLER__ */
+
+#endif /* _ASM_X86_SETUP_DATA_H */
diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h
new file mode 100644
index 000000000000..01a6e4dbe423
--- /dev/null
+++ b/arch/x86/include/asm/sev-common.h
@@ -0,0 +1,261 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * AMD SEV header common between the guest and the hypervisor.
+ *
+ * Author: Brijesh Singh <brijesh.singh@amd.com>
+ */
+
+#ifndef __ASM_X86_SEV_COMMON_H
+#define __ASM_X86_SEV_COMMON_H
+
+#define GHCB_MSR_INFO_POS 0
+#define GHCB_DATA_LOW 12
+#define GHCB_MSR_INFO_MASK (BIT_ULL(GHCB_DATA_LOW) - 1)
+
+#define GHCB_DATA(v) \
+ (((unsigned long)(v) & ~GHCB_MSR_INFO_MASK) >> GHCB_DATA_LOW)
+
+/* SEV Information Request/Response */
+#define GHCB_MSR_SEV_INFO_RESP 0x001
+#define GHCB_MSR_SEV_INFO_REQ 0x002
+
+#define GHCB_MSR_SEV_INFO(_max, _min, _cbit) \
+ /* GHCBData[63:48] */ \
+ ((((_max) & 0xffff) << 48) | \
+ /* GHCBData[47:32] */ \
+ (((_min) & 0xffff) << 32) | \
+ /* GHCBData[31:24] */ \
+ (((_cbit) & 0xff) << 24) | \
+ GHCB_MSR_SEV_INFO_RESP)
+
+#define GHCB_MSR_INFO(v) ((v) & 0xfffUL)
+#define GHCB_MSR_PROTO_MAX(v) (((v) >> 48) & 0xffff)
+#define GHCB_MSR_PROTO_MIN(v) (((v) >> 32) & 0xffff)
+
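A hedged sketch of how these encodings are typically consumed when a guest negotiates the GHCB protocol over the MSR interface. The sev_es_wr_ghcb_msr()/sev_es_rd_ghcb_msr() helpers, VMGEXIT() and the GHCB_PROTOCOL_MIN/MAX limits appear later in this series; the function name is illustrative.

static bool example_negotiate_ghcb_protocol(void)
{
        u64 val;

        /* Ask the hypervisor which GHCB protocol versions it supports. */
        sev_es_wr_ghcb_msr(GHCB_MSR_SEV_INFO_REQ);
        VMGEXIT();
        val = sev_es_rd_ghcb_msr();

        if (GHCB_MSR_INFO(val) != GHCB_MSR_SEV_INFO_RESP)
                return false;

        /* Accept only if the advertised [min, max] range overlaps ours. */
        return GHCB_MSR_PROTO_MIN(val) <= GHCB_PROTOCOL_MAX &&
               GHCB_MSR_PROTO_MAX(val) >= GHCB_PROTOCOL_MIN;
}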
+/* CPUID Request/Response */
+#define GHCB_MSR_CPUID_REQ 0x004
+#define GHCB_MSR_CPUID_RESP 0x005
+#define GHCB_MSR_CPUID_FUNC_POS 32
+#define GHCB_MSR_CPUID_FUNC_MASK 0xffffffff
+#define GHCB_MSR_CPUID_VALUE_POS 32
+#define GHCB_MSR_CPUID_VALUE_MASK 0xffffffff
+#define GHCB_MSR_CPUID_REG_POS 30
+#define GHCB_MSR_CPUID_REG_MASK 0x3
+#define GHCB_CPUID_REQ_EAX 0
+#define GHCB_CPUID_REQ_EBX 1
+#define GHCB_CPUID_REQ_ECX 2
+#define GHCB_CPUID_REQ_EDX 3
+#define GHCB_CPUID_REQ(fn, reg) \
+ /* GHCBData[11:0] */ \
+ (GHCB_MSR_CPUID_REQ | \
+ /* GHCBData[31:12] */ \
+ (((unsigned long)(reg) & 0x3) << 30) | \
+ /* GHCBData[63:32] */ \
+ (((unsigned long)fn) << 32))
+
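For the CPUID encoding above, a hedged sketch of one request/response round trip over the GHCB MSR protocol; only EAX is fetched here, and a full implementation would repeat this for each register and handle retries.

static int example_cpuid_eax_msr_proto(u32 fn, u32 *eax)
{
        u64 val;

        sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EAX));
        VMGEXIT();
        val = sev_es_rd_ghcb_msr();

        if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP)
                return -EIO;

        /* The requested register value is returned in GHCBData[63:32]. */
        *eax = (val >> GHCB_MSR_CPUID_VALUE_POS) & GHCB_MSR_CPUID_VALUE_MASK;
        return 0;
}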
+/* AP Reset Hold */
+#define GHCB_MSR_AP_RESET_HOLD_REQ 0x006
+#define GHCB_MSR_AP_RESET_HOLD_RESP 0x007
+#define GHCB_MSR_AP_RESET_HOLD_RESULT_POS 12
+#define GHCB_MSR_AP_RESET_HOLD_RESULT_MASK GENMASK_ULL(51, 0)
+
+/* Preferred GHCB GPA Request */
+#define GHCB_MSR_PREF_GPA_REQ 0x010
+#define GHCB_MSR_GPA_VALUE_POS 12
+#define GHCB_MSR_GPA_VALUE_MASK GENMASK_ULL(51, 0)
+
+#define GHCB_MSR_PREF_GPA_RESP 0x011
+#define GHCB_MSR_PREF_GPA_NONE 0xfffffffffffff
+
+/* GHCB GPA Register */
+#define GHCB_MSR_REG_GPA_REQ 0x012
+#define GHCB_MSR_REG_GPA_REQ_VAL(v) \
+ /* GHCBData[63:12] */ \
+ (((u64)((v) & GENMASK_ULL(51, 0)) << 12) | \
+ /* GHCBData[11:0] */ \
+ GHCB_MSR_REG_GPA_REQ)
+
+#define GHCB_MSR_REG_GPA_RESP 0x013
+#define GHCB_MSR_REG_GPA_RESP_VAL(v) \
+ /* GHCBData[63:12] */ \
+ (((u64)(v) & GENMASK_ULL(63, 12)) >> 12)
+
+/*
+ * SNP Page State Change Operation
+ *
+ * GHCBData[55:52] - Page operation:
+ * 0x0001 Page assignment, Private
+ * 0x0002 Page assignment, Shared
+ */
+enum psc_op {
+ SNP_PAGE_STATE_PRIVATE = 1,
+ SNP_PAGE_STATE_SHARED,
+};
+
+#define GHCB_MSR_PSC_REQ 0x014
+#define GHCB_MSR_PSC_REQ_GFN(gfn, op) \
+ /* GHCBData[55:52] */ \
+ (((u64)((op) & 0xf) << 52) | \
+ /* GHCBData[51:12] */ \
+ ((u64)((gfn) & GENMASK_ULL(39, 0)) << 12) | \
+ /* GHCBData[11:0] */ \
+ GHCB_MSR_PSC_REQ)
+
+#define GHCB_MSR_PSC_REQ_TO_GFN(msr) (((msr) & GENMASK_ULL(51, 12)) >> 12)
+#define GHCB_MSR_PSC_REQ_TO_OP(msr) (((msr) & GENMASK_ULL(55, 52)) >> 52)
+
+#define GHCB_MSR_PSC_RESP 0x015
+#define GHCB_MSR_PSC_RESP_VAL(val) \
+ /* GHCBData[63:32] */ \
+ (((u64)(val) & GENMASK_ULL(63, 32)) >> 32)
+
+/* Set highest bit as a generic error response */
+#define GHCB_MSR_PSC_RESP_ERROR (BIT_ULL(63) | GHCB_MSR_PSC_RESP)
+
+/* GHCB Run at VMPL Request/Response */
+#define GHCB_MSR_VMPL_REQ 0x016
+#define GHCB_MSR_VMPL_REQ_LEVEL(v) \
+ /* GHCBData[39:32] */ \
+ ((((u64)(v) & GENMASK_ULL(7, 0)) << 32) | \
+ /* GHCBDdata[11:0] */ \
+ GHCB_MSR_VMPL_REQ)
+
+#define GHCB_MSR_VMPL_RESP 0x017
+#define GHCB_MSR_VMPL_RESP_VAL(v) \
+ /* GHCBData[63:32] */ \
+ (((u64)(v) & GENMASK_ULL(63, 32)) >> 32)
+
+/* GHCB Hypervisor Feature Request/Response */
+#define GHCB_MSR_HV_FT_REQ 0x080
+#define GHCB_MSR_HV_FT_RESP 0x081
+#define GHCB_MSR_HV_FT_POS 12
+#define GHCB_MSR_HV_FT_MASK GENMASK_ULL(51, 0)
+#define GHCB_MSR_HV_FT_RESP_VAL(v) \
+ /* GHCBData[63:12] */ \
+ (((u64)(v) & GENMASK_ULL(63, 12)) >> 12)
+
+#define GHCB_HV_FT_SNP BIT_ULL(0)
+#define GHCB_HV_FT_SNP_AP_CREATION BIT_ULL(1)
+#define GHCB_HV_FT_SNP_MULTI_VMPL BIT_ULL(5)
+
+/*
+ * SNP Page State Change NAE event
+ * The VMGEXIT_PSC_MAX_ENTRY determines the size of the PSC structure, which
+ * is a local stack variable in set_pages_state(). Do not increase this value
+ * without evaluating the impact to stack usage.
+ *
+ * Use VMGEXIT_PSC_MAX_COUNT in cases where the actual GHCB-defined max value
+ * is needed, such as when processing GHCB requests on the hypervisor side.
+ */
+#define VMGEXIT_PSC_MAX_ENTRY 64
+#define VMGEXIT_PSC_MAX_COUNT 253
+
+#define VMGEXIT_PSC_ERROR_GENERIC (0x100UL << 32)
+#define VMGEXIT_PSC_ERROR_INVALID_HDR ((1UL << 32) | 1)
+#define VMGEXIT_PSC_ERROR_INVALID_ENTRY ((1UL << 32) | 2)
+
+#define VMGEXIT_PSC_OP_PRIVATE 1
+#define VMGEXIT_PSC_OP_SHARED 2
+
+struct psc_hdr {
+ u16 cur_entry;
+ u16 end_entry;
+ u32 reserved;
+} __packed;
+
+struct psc_entry {
+ u64 cur_page : 12,
+ gfn : 40,
+ operation : 4,
+ pagesize : 1,
+ reserved : 7;
+} __packed;
+
+struct snp_psc_desc {
+ struct psc_hdr hdr;
+ struct psc_entry entries[VMGEXIT_PSC_MAX_ENTRY];
+} __packed;
+
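A hedged sketch of filling the descriptor above before handing it to the hypervisor via the Page State Change NAE event; the VMGEXIT itself and the error handling are omitted, and RMP_PG_SIZE_4K comes from <asm/sev.h>.

static void example_fill_psc_desc(struct snp_psc_desc *desc,
                                  u64 first_gfn, u16 nr_entries)
{
        u16 i;

        memset(desc, 0, sizeof(*desc));
        desc->hdr.cur_entry = 0;
        desc->hdr.end_entry = nr_entries - 1;

        for (i = 0; i < nr_entries && i < VMGEXIT_PSC_MAX_ENTRY; i++) {
                desc->entries[i].gfn       = first_gfn + i;
                desc->entries[i].operation = SNP_PAGE_STATE_PRIVATE;
                desc->entries[i].pagesize  = RMP_PG_SIZE_4K;
        }
}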
+#define GHCB_MSR_TERM_REQ 0x100
+#define GHCB_MSR_TERM_REASON_SET_POS 12
+#define GHCB_MSR_TERM_REASON_SET_MASK 0xf
+#define GHCB_MSR_TERM_REASON_POS 16
+#define GHCB_MSR_TERM_REASON_MASK 0xff
+
+#define GHCB_SEV_TERM_REASON(reason_set, reason_val) \
+ /* GHCBData[15:12] */ \
+ (((((u64)reason_set) & 0xf) << 12) | \
+ /* GHCBData[23:16] */ \
+ ((((u64)reason_val) & 0xff) << 16))
+
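The termination macros above combine as in this hedged sketch of a guest self-termination path; the MSR helpers used here are declared elsewhere in the series and the function name is illustrative.

static void __noreturn example_sev_terminate(unsigned int set, unsigned int reason)
{
        u64 val = GHCB_MSR_TERM_REQ;

        /* Tell the hypervisor which reason set and code the guest reports. */
        val |= GHCB_SEV_TERM_REASON(set, reason);

        sev_es_wr_ghcb_msr(val);
        VMGEXIT();

        while (true)
                asm volatile("hlt" : : : "memory");
}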
+/* Error codes from reason set 0 */
+#define SEV_TERM_SET_GEN 0
+#define GHCB_SEV_ES_GEN_REQ 0
+#define GHCB_SEV_ES_PROT_UNSUPPORTED 1
+#define GHCB_SNP_UNSUPPORTED 2
+
+/* Linux-specific reason codes (used with reason set 1) */
+#define SEV_TERM_SET_LINUX 1
+#define GHCB_TERM_REGISTER 0 /* GHCB GPA registration failure */
+#define GHCB_TERM_PSC 1 /* Page State Change failure */
+#define GHCB_TERM_PVALIDATE 2 /* Pvalidate failure */
+#define GHCB_TERM_NOT_VMPL0 3 /* SNP guest is not running at VMPL-0 */
+#define GHCB_TERM_CPUID 4 /* CPUID-validation failure */
+#define GHCB_TERM_CPUID_HV 5 /* CPUID failure during hypervisor fallback */
+#define GHCB_TERM_SECRETS_PAGE 6 /* Secrets page failure */
+#define GHCB_TERM_NO_SVSM 7 /* SVSM is not advertised in the secrets page */
+#define GHCB_TERM_SVSM_VMPL0 8 /* SVSM is present but has set VMPL to 0 */
+#define GHCB_TERM_SVSM_CAA 9 /* SVSM is present but CAA is not page aligned */
+#define GHCB_TERM_SECURE_TSC 10 /* Secure TSC initialization failed */
+#define GHCB_TERM_SVSM_CA_REMAP_FAIL 11 /* SVSM is present but CA could not be remapped */
+#define GHCB_TERM_SAVIC_FAIL 12 /* Secure AVIC-specific failure */
+
+#define GHCB_RESP_CODE(v) ((v) & GHCB_MSR_INFO_MASK)
+
+/*
+ * GHCB-defined return codes that are communicated back to the guest via
+ * SW_EXITINFO1.
+ */
+#define GHCB_HV_RESP_NO_ACTION 0
+#define GHCB_HV_RESP_ISSUE_EXCEPTION 1
+#define GHCB_HV_RESP_MALFORMED_INPUT 2
+
+/*
+ * GHCB-defined sub-error codes for malformed input (see above) that are
+ * communicated back to the guest via SW_EXITINFO2[31:0].
+ */
+#define GHCB_ERR_NOT_REGISTERED 1
+#define GHCB_ERR_INVALID_USAGE 2
+#define GHCB_ERR_INVALID_SCRATCH_AREA 3
+#define GHCB_ERR_MISSING_INPUT 4
+#define GHCB_ERR_INVALID_INPUT 5
+#define GHCB_ERR_INVALID_EVENT 6
+
+struct sev_config {
+ __u64 debug : 1,
+
+ /*
+ * Indicates when the per-CPU GHCB has been created and registered
+ * and thus can be used by the BSP instead of the early boot GHCB.
+ *
+ * For APs, the per-CPU GHCB is created before they are started
+ * and registered upon startup, so this flag can be used globally
+ * for the BSP and APs.
+ */
+ ghcbs_initialized : 1,
+
+ /*
+ * Indicates when the per-CPU SVSM CA is to be used instead of the
+ * boot SVSM CA.
+ *
+ * For APs, the per-CPU SVSM CA is created as part of the AP
+ * bringup, so this flag can be used globally for the BSP and APs.
+ */
+ use_cas : 1,
+
+ __reserved : 61;
+};
+
+extern struct sev_config sev_cfg;
+
+#endif
diff --git a/arch/x86/include/asm/sev-internal.h b/arch/x86/include/asm/sev-internal.h
new file mode 100644
index 000000000000..c58c47c68ab6
--- /dev/null
+++ b/arch/x86/include/asm/sev-internal.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#define DR7_RESET_VALUE 0x400
+
+extern u64 sev_hv_features;
+extern u64 sev_secrets_pa;
+
+/* #VC handler runtime per-CPU data */
+struct sev_es_runtime_data {
+ struct ghcb ghcb_page;
+
+ /*
+ * Reserve one page per CPU as backup storage for the unencrypted GHCB.
+ * It is needed when an NMI happens while the #VC handler uses the real
+ * GHCB, and the NMI handler itself is causing another #VC exception. In
+ * that case the GHCB content of the first handler needs to be backed up
+ * and restored.
+ */
+ struct ghcb backup_ghcb;
+
+ /*
+ * Mark the per-cpu GHCBs as in-use to detect nested #VC exceptions.
+ * There is no need for it to be atomic, because nothing is written to
+ * the GHCB between the read and the write of ghcb_active. So it is safe
+ * to use it when a nested #VC exception happens before the write.
+ *
+ * This is necessary for example in the #VC->NMI->#VC case when the NMI
+ * happens while the first #VC handler uses the GHCB. When the NMI code
+ * raises a second #VC handler it might overwrite the contents of the
+ * GHCB written by the first handler. To avoid this the content of the
+ * GHCB is saved and restored when the GHCB is detected to be in use
+ * already.
+ */
+ bool ghcb_active;
+ bool backup_ghcb_active;
+
+ /*
+ * Cached DR7 value - write it on DR7 writes and return it on reads.
+ * That value will never make it to the real hardware DR7 as debugging
+ * is currently unsupported in SEV-ES guests.
+ */
+ unsigned long dr7;
+};
+
+struct ghcb_state {
+ struct ghcb *ghcb;
+};
+
+extern struct svsm_ca boot_svsm_ca_page;
+
+struct ghcb *__sev_get_ghcb(struct ghcb_state *state);
+void __sev_put_ghcb(struct ghcb_state *state);
+
+DECLARE_PER_CPU(struct sev_es_runtime_data*, runtime_data);
+DECLARE_PER_CPU(struct sev_es_save_area *, sev_vmsa);
+
+void early_set_pages_state(unsigned long vaddr, unsigned long paddr,
+ unsigned long npages, const struct psc_desc *desc);
+
+DECLARE_PER_CPU(struct svsm_ca *, svsm_caa);
+DECLARE_PER_CPU(u64, svsm_caa_pa);
+
+extern u64 boot_svsm_caa_pa;
+
+enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt *ctxt);
+void vc_forward_exception(struct es_em_ctxt *ctxt);
+
+static inline u64 sev_es_rd_ghcb_msr(void)
+{
+ return native_rdmsrq(MSR_AMD64_SEV_ES_GHCB);
+}
+
+static __always_inline void sev_es_wr_ghcb_msr(u64 val)
+{
+ u32 low, high;
+
+ low = (u32)(val);
+ high = (u32)(val >> 32);
+
+ native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high);
+}
+
+enum es_result sev_es_ghcb_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt, bool write);
+
+u64 get_hv_features(void);
+
+const struct snp_cpuid_table *snp_cpuid_get_table(void);
diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
new file mode 100644
index 000000000000..0e6c0940100f
--- /dev/null
+++ b/arch/x86/include/asm/sev.h
@@ -0,0 +1,682 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * AMD Encrypted Register State Support
+ *
+ * Author: Joerg Roedel <jroedel@suse.de>
+ */
+
+#ifndef __ASM_ENCRYPTED_STATE_H
+#define __ASM_ENCRYPTED_STATE_H
+
+#include <linux/types.h>
+#include <linux/sev-guest.h>
+
+#include <asm/insn.h>
+#include <asm/sev-common.h>
+#include <asm/coco.h>
+#include <asm/set_memory.h>
+#include <asm/svm.h>
+
+#define GHCB_PROTOCOL_MIN 1ULL
+#define GHCB_PROTOCOL_MAX 2ULL
+#define GHCB_DEFAULT_USAGE 0ULL
+
+#define VMGEXIT() { asm volatile("rep; vmmcall\n\r"); }
+
+struct boot_params;
+
+enum es_result {
+ ES_OK, /* All good */
+ ES_UNSUPPORTED, /* Requested operation not supported */
+ ES_VMM_ERROR, /* Unexpected state from the VMM */
+ ES_DECODE_FAILED, /* Instruction decoding failed */
+ ES_EXCEPTION, /* Instruction caused exception */
+ ES_RETRY, /* Retry instruction emulation */
+};
+
+struct es_fault_info {
+ unsigned long vector;
+ unsigned long error_code;
+ unsigned long cr2;
+};
+
+struct pt_regs;
+
+/* ES instruction emulation context */
+struct es_em_ctxt {
+ struct pt_regs *regs;
+ struct insn insn;
+ struct es_fault_info fi;
+};
+
+/*
+ * AMD SEV Confidential computing blob structure. The structure is
+ * defined in OVMF UEFI firmware header:
+ * https://github.com/tianocore/edk2/blob/master/OvmfPkg/Include/Guid/ConfidentialComputingSevSnpBlob.h
+ */
+#define CC_BLOB_SEV_HDR_MAGIC 0x45444d41
+struct cc_blob_sev_info {
+ u32 magic;
+ u16 version;
+ u16 reserved;
+ u64 secrets_phys;
+ u32 secrets_len;
+ u32 rsvd1;
+ u64 cpuid_phys;
+ u32 cpuid_len;
+ u32 rsvd2;
+} __packed;
+
+void do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code);
+
+static inline u64 lower_bits(u64 val, unsigned int bits)
+{
+ u64 mask = (1ULL << bits) - 1;
+
+ return (val & mask);
+}
+
+struct real_mode_header;
+enum stack_type;
+
+/* Early IDT entry points for #VC handler */
+extern void vc_no_ghcb(void);
+extern void vc_boot_ghcb(void);
+extern bool handle_vc_boot_ghcb(struct pt_regs *regs);
+
+/*
+ * Individual entries of the SNP CPUID table, as defined by the SNP
+ * Firmware ABI, Revision 0.9, Section 7.1, Table 14.
+ */
+struct snp_cpuid_fn {
+ u32 eax_in;
+ u32 ecx_in;
+ u64 xcr0_in;
+ u64 xss_in;
+ u32 eax;
+ u32 ebx;
+ u32 ecx;
+ u32 edx;
+ u64 __reserved;
+} __packed;
+
+/*
+ * SNP CPUID table, as defined by the SNP Firmware ABI, Revision 0.9,
+ * Section 8.14.2.6. Also noted there is the SNP firmware-enforced limit
+ * of 64 entries per CPUID table.
+ */
+#define SNP_CPUID_COUNT_MAX 64
+
+struct snp_cpuid_table {
+ u32 count;
+ u32 __reserved1;
+ u64 __reserved2;
+ struct snp_cpuid_fn fn[SNP_CPUID_COUNT_MAX];
+} __packed;
+
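A hedged, simplified lookup over the table above; the real matching logic additionally considers XCR0/XSS state for leaf 0xD sub-leaves, so this sketch only matches on the EAX/ECX inputs.

static const struct snp_cpuid_fn *
example_find_cpuid_fn(const struct snp_cpuid_table *tbl, u32 eax_in, u32 ecx_in)
{
        u32 i;

        for (i = 0; i < tbl->count && i < SNP_CPUID_COUNT_MAX; i++) {
                if (tbl->fn[i].eax_in == eax_in && tbl->fn[i].ecx_in == ecx_in)
                        return &tbl->fn[i];
        }

        return NULL;
}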
+/* PVALIDATE return codes */
+#define PVALIDATE_FAIL_SIZEMISMATCH 6
+
+/* Software defined (when rFlags.CF = 1) */
+#define PVALIDATE_FAIL_NOUPDATE 255
+
+/* RMPUPDATE detected 4K page and 2MB page overlap. */
+#define RMPUPDATE_FAIL_OVERLAP 4
+
+/* PSMASH failed due to concurrent access by another CPU */
+#define PSMASH_FAIL_INUSE 3
+
+/* RMP page size */
+#define RMP_PG_SIZE_4K 0
+#define RMP_PG_SIZE_2M 1
+#define RMP_TO_PG_LEVEL(level) (((level) == RMP_PG_SIZE_4K) ? PG_LEVEL_4K : PG_LEVEL_2M)
+#define PG_LEVEL_TO_RMP(level) (((level) == PG_LEVEL_4K) ? RMP_PG_SIZE_4K : RMP_PG_SIZE_2M)
+
+struct rmp_state {
+ u64 gpa;
+ u8 assigned;
+ u8 pagesize;
+ u8 immutable;
+ u8 rsvd;
+ u32 asid;
+} __packed;
+
+#define RMPADJUST_VMSA_PAGE_BIT BIT(16)
+
+/* SNP Guest message request */
+struct snp_req_data {
+ unsigned long req_gpa;
+ unsigned long resp_gpa;
+ unsigned long data_gpa;
+ unsigned int data_npages;
+};
+
+#define MAX_AUTHTAG_LEN 32
+#define AUTHTAG_LEN 16
+#define AAD_LEN 48
+#define MSG_HDR_VER 1
+
+#define SNP_REQ_MAX_RETRY_DURATION (60*HZ)
+#define SNP_REQ_RETRY_DELAY (2*HZ)
+
+/* See SNP spec SNP_GUEST_REQUEST section for the structure */
+enum msg_type {
+ SNP_MSG_TYPE_INVALID = 0,
+ SNP_MSG_CPUID_REQ,
+ SNP_MSG_CPUID_RSP,
+ SNP_MSG_KEY_REQ,
+ SNP_MSG_KEY_RSP,
+ SNP_MSG_REPORT_REQ,
+ SNP_MSG_REPORT_RSP,
+ SNP_MSG_EXPORT_REQ,
+ SNP_MSG_EXPORT_RSP,
+ SNP_MSG_IMPORT_REQ,
+ SNP_MSG_IMPORT_RSP,
+ SNP_MSG_ABSORB_REQ,
+ SNP_MSG_ABSORB_RSP,
+ SNP_MSG_VMRK_REQ,
+ SNP_MSG_VMRK_RSP,
+
+ SNP_MSG_TSC_INFO_REQ = 17,
+ SNP_MSG_TSC_INFO_RSP,
+
+ SNP_MSG_TYPE_MAX
+};
+
+enum aead_algo {
+ SNP_AEAD_INVALID,
+ SNP_AEAD_AES_256_GCM,
+};
+
+struct snp_guest_msg_hdr {
+ u8 authtag[MAX_AUTHTAG_LEN];
+ u64 msg_seqno;
+ u8 rsvd1[8];
+ u8 algo;
+ u8 hdr_version;
+ u16 hdr_sz;
+ u8 msg_type;
+ u8 msg_version;
+ u16 msg_sz;
+ u32 rsvd2;
+ u8 msg_vmpck;
+ u8 rsvd3[35];
+} __packed;
+
+struct snp_guest_msg {
+ struct snp_guest_msg_hdr hdr;
+ u8 payload[PAGE_SIZE - sizeof(struct snp_guest_msg_hdr)];
+} __packed;
+
+#define SNP_TSC_INFO_REQ_SZ 128
+
+struct snp_tsc_info_req {
+ u8 rsvd[SNP_TSC_INFO_REQ_SZ];
+} __packed;
+
+struct snp_tsc_info_resp {
+ u32 status;
+ u32 rsvd1;
+ u64 tsc_scale;
+ u64 tsc_offset;
+ u32 tsc_factor;
+ u8 rsvd2[100];
+} __packed;
+
+/*
+ * Obtain the mean TSC frequency by decreasing the nominal TSC frequency with
+ * TSC_FACTOR as documented in the SNP Firmware ABI specification:
+ *
+ * GUEST_TSC_FREQ * (1 - (TSC_FACTOR * 0.00001))
+ *
+ * which is equivalent to:
+ *
+ * GUEST_TSC_FREQ -= (GUEST_TSC_FREQ * TSC_FACTOR) / 100000;
+ */
+#define SNP_SCALE_TSC_FREQ(freq, factor) ((freq) - (freq) * (factor) / 100000)
+
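A worked instance of the macro with illustrative numbers (not taken from any platform):

/*
 * SNP_SCALE_TSC_FREQ(2000000 kHz, factor = 1000, i.e. a 1% decrease):
 *
 *   2000000 - (2000000 * 1000) / 100000
 * = 2000000 - 20000
 * = 1980000 kHz
 */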
+struct snp_guest_req {
+ void *req_buf;
+ size_t req_sz;
+
+ void *resp_buf;
+ size_t resp_sz;
+
+ u64 exit_code;
+ u64 exitinfo2;
+ unsigned int vmpck_id;
+ u8 msg_version;
+ u8 msg_type;
+
+ struct snp_req_data input;
+ void *certs_data;
+};
+
+/*
+ * The secrets page contains a 96-byte reserved area that can be used by
+ * the guest OS. The guest OS uses the area to save the message sequence
+ * number for each VMPCK.
+ *
+ * See the GHCB spec section "Secret page layout" for the format of this area.
+ */
+struct secrets_os_area {
+ u32 msg_seqno_0;
+ u32 msg_seqno_1;
+ u32 msg_seqno_2;
+ u32 msg_seqno_3;
+ u64 ap_jump_table_pa;
+ u8 rsvd[40];
+ u8 guest_usage[32];
+} __packed;
+
+#define VMPCK_KEY_LEN 32
+
+/* See the SNP spec version 0.9 for secrets page format */
+struct snp_secrets_page {
+ u32 version;
+ u32 imien : 1,
+ rsvd1 : 31;
+ u32 fms;
+ u32 rsvd2;
+ u8 gosvw[16];
+ u8 vmpck0[VMPCK_KEY_LEN];
+ u8 vmpck1[VMPCK_KEY_LEN];
+ u8 vmpck2[VMPCK_KEY_LEN];
+ u8 vmpck3[VMPCK_KEY_LEN];
+ struct secrets_os_area os_area;
+
+ u8 vmsa_tweak_bitmap[64];
+
+ /* SVSM fields */
+ u64 svsm_base;
+ u64 svsm_size;
+ u64 svsm_caa;
+ u32 svsm_max_version;
+ u8 svsm_guest_vmpl;
+ u8 rsvd3[3];
+
+ /* The percentage decrease from nominal to mean TSC frequency. */
+ u32 tsc_factor;
+
+ /* Remainder of page */
+ u8 rsvd4[3740];
+} __packed;
+
+struct snp_msg_desc {
+ /* request and response are in unencrypted memory */
+ struct snp_guest_msg *request, *response;
+
+ /*
+ * Avoid information leakage by double-buffering shared messages
+ * in fields that are in regular encrypted memory.
+ */
+ struct snp_guest_msg secret_request, secret_response;
+
+ struct snp_secrets_page *secrets;
+
+ struct aesgcm_ctx *ctx;
+
+ u32 *os_area_msg_seqno;
+ u8 *vmpck;
+ int vmpck_id;
+};
+
+/*
+ * The SVSM Calling Area (CA) related structures.
+ */
+struct svsm_ca {
+ u8 call_pending;
+ u8 mem_available;
+ u8 rsvd1[6];
+
+ u8 svsm_buffer[PAGE_SIZE - 8];
+};
+
+#define SVSM_SUCCESS 0
+#define SVSM_ERR_INCOMPLETE 0x80000000
+#define SVSM_ERR_UNSUPPORTED_PROTOCOL 0x80000001
+#define SVSM_ERR_UNSUPPORTED_CALL 0x80000002
+#define SVSM_ERR_INVALID_ADDRESS 0x80000003
+#define SVSM_ERR_INVALID_FORMAT 0x80000004
+#define SVSM_ERR_INVALID_PARAMETER 0x80000005
+#define SVSM_ERR_INVALID_REQUEST 0x80000006
+#define SVSM_ERR_BUSY 0x80000007
+#define SVSM_PVALIDATE_FAIL_SIZEMISMATCH 0x80001006
+
+/*
+ * The SVSM PVALIDATE related structures
+ */
+struct svsm_pvalidate_entry {
+ u64 page_size : 2,
+ action : 1,
+ ignore_cf : 1,
+ rsvd : 8,
+ pfn : 52;
+};
+
+struct svsm_pvalidate_call {
+ u16 num_entries;
+ u16 cur_index;
+
+ u8 rsvd1[4];
+
+ struct svsm_pvalidate_entry entry[];
+};
+
+#define SVSM_PVALIDATE_MAX_COUNT ((sizeof_field(struct svsm_ca, svsm_buffer) - \
+ offsetof(struct svsm_pvalidate_call, entry)) / \
+ sizeof(struct svsm_pvalidate_entry))
+
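A hedged sketch of building a single-entry PVALIDATE request in the calling area buffer; the actual SVSM_CORE_PVALIDATE invocation and the capacity checks against SVSM_PVALIDATE_MAX_COUNT are omitted.

static void example_svsm_pvalidate_one(struct svsm_ca *ca, u64 pfn, bool validate)
{
        struct svsm_pvalidate_call *pc;

        /* The request is assembled directly inside the calling area buffer. */
        pc = (struct svsm_pvalidate_call *)ca->svsm_buffer;

        pc->num_entries = 1;
        pc->cur_index   = 0;

        pc->entry[0].page_size = RMP_PG_SIZE_4K;
        pc->entry[0].action    = validate;
        pc->entry[0].ignore_cf = 0;
        pc->entry[0].pfn       = pfn;
}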
+/*
+ * The SVSM Attestation related structures
+ */
+struct svsm_loc_entry {
+ u64 pa;
+ u32 len;
+ u8 rsvd[4];
+};
+
+struct svsm_attest_call {
+ struct svsm_loc_entry report_buf;
+ struct svsm_loc_entry nonce;
+ struct svsm_loc_entry manifest_buf;
+ struct svsm_loc_entry certificates_buf;
+
+ /* For attesting a single service */
+ u8 service_guid[16];
+ u32 service_manifest_ver;
+ u8 rsvd[4];
+};
+
+/* PTE descriptor used for the prepare_pte_enc() operations. */
+struct pte_enc_desc {
+ pte_t *kpte;
+ int pte_level;
+ bool encrypt;
+ /* pfn of the kpte above */
+ unsigned long pfn;
+ /* physical address of @pfn */
+ unsigned long pa;
+ /* virtual address of @pfn */
+ void *va;
+ /* memory covered by the pte */
+ unsigned long size;
+ pgprot_t new_pgprot;
+};
+
+/*
+ * SVSM protocol structure
+ */
+struct svsm_call {
+ struct svsm_ca *caa;
+ u64 rax;
+ u64 rcx;
+ u64 rdx;
+ u64 r8;
+ u64 r9;
+ u64 rax_out;
+ u64 rcx_out;
+ u64 rdx_out;
+ u64 r8_out;
+ u64 r9_out;
+};
+
+#define SVSM_CORE_CALL(x) ((0ULL << 32) | (x))
+#define SVSM_CORE_REMAP_CA 0
+#define SVSM_CORE_PVALIDATE 1
+#define SVSM_CORE_CREATE_VCPU 2
+#define SVSM_CORE_DELETE_VCPU 3
+
+#define SVSM_ATTEST_CALL(x) ((1ULL << 32) | (x))
+#define SVSM_ATTEST_SERVICES 0
+#define SVSM_ATTEST_SINGLE_SERVICE 1
+
+#define SVSM_VTPM_CALL(x) ((2ULL << 32) | (x))
+#define SVSM_VTPM_QUERY 0
+#define SVSM_VTPM_CMD 1
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+
+extern u8 snp_vmpl;
+
+extern void __sev_es_ist_enter(struct pt_regs *regs);
+extern void __sev_es_ist_exit(void);
+static __always_inline void sev_es_ist_enter(struct pt_regs *regs)
+{
+ if (cc_vendor == CC_VENDOR_AMD &&
+ cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
+ __sev_es_ist_enter(regs);
+}
+static __always_inline void sev_es_ist_exit(void)
+{
+ if (cc_vendor == CC_VENDOR_AMD &&
+ cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
+ __sev_es_ist_exit();
+}
+extern int sev_es_setup_ap_jump_table(struct real_mode_header *rmh);
+extern void __sev_es_nmi_complete(void);
+static __always_inline void sev_es_nmi_complete(void)
+{
+ if (cc_vendor == CC_VENDOR_AMD &&
+ cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
+ __sev_es_nmi_complete();
+}
+extern int __init sev_es_efi_map_ghcbs_cas(pgd_t *pgd);
+extern void sev_enable(struct boot_params *bp);
+
+/*
+ * RMPADJUST modifies the RMP permissions of a page of a lesser-
+ * privileged (numerically higher) VMPL.
+ *
+ * If the guest is running at a higher privilege level than the one the
+ * instruction is targeting, the instruction will succeed; otherwise, it
+ * will fail.
+ */
+static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs)
+{
+ int rc;
+
+ /* "rmpadjust" mnemonic support in binutils 2.36 and newer */
+ asm volatile(".byte 0xF3,0x0F,0x01,0xFE\n\t"
+ : "=a"(rc)
+ : "a"(vaddr), "c"(rmp_psize), "d"(attrs)
+ : "memory", "cc");
+
+ return rc;
+}
+static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate)
+{
+ bool no_rmpupdate;
+ int rc;
+
+ /* "pvalidate" mnemonic support in binutils 2.36 and newer */
+ asm volatile(".byte 0xF2, 0x0F, 0x01, 0xFF\n\t"
+ : "=@ccc"(no_rmpupdate), "=a"(rc)
+ : "a"(vaddr), "c"(rmp_psize), "d"(validate)
+ : "memory", "cc");
+
+ if (no_rmpupdate)
+ return PVALIDATE_FAIL_NOUPDATE;
+
+ return rc;
+}
+
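A hedged sketch of the common single-page pattern built on the wrapper above; the surrounding page-state-change and error-termination logic is left out.

static int example_validate_4k_page(unsigned long vaddr)
{
        int rc;

        /* Validate one 4K page that was just switched to the private state. */
        rc = pvalidate(vaddr, RMP_PG_SIZE_4K, true);
        if (rc == PVALIDATE_FAIL_NOUPDATE)
                return 0;       /* already validated, nothing changed */

        return rc;
}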
+void setup_ghcb(void);
+void snp_register_ghcb_early(unsigned long paddr);
+void early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
+ unsigned long npages);
+void early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
+ unsigned long npages);
+void snp_set_memory_shared(unsigned long vaddr, unsigned long npages);
+void snp_set_memory_private(unsigned long vaddr, unsigned long npages);
+void snp_set_wakeup_secondary_cpu(void);
+bool snp_init(struct boot_params *bp);
+void snp_dmi_setup(void);
+int snp_issue_svsm_attest_req(u64 call_id, struct svsm_call *call, struct svsm_attest_call *input);
+void snp_accept_memory(phys_addr_t start, phys_addr_t end);
+u64 snp_get_unsupported_features(u64 status);
+u64 sev_get_status(void);
+void sev_show_status(void);
+int prepare_pte_enc(struct pte_enc_desc *d);
+void set_pte_enc_mask(pte_t *kpte, unsigned long pfn, pgprot_t new_prot);
+void snp_kexec_finish(void);
+void snp_kexec_begin(void);
+
+int snp_msg_init(struct snp_msg_desc *mdesc, int vmpck_id);
+struct snp_msg_desc *snp_msg_alloc(void);
+void snp_msg_free(struct snp_msg_desc *mdesc);
+int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req);
+
+int snp_svsm_vtpm_send_command(u8 *buffer);
+
+void __init snp_secure_tsc_prepare(void);
+void __init snp_secure_tsc_init(void);
+enum es_result savic_register_gpa(u64 gpa);
+enum es_result savic_unregister_gpa(u64 *gpa);
+u64 savic_ghcb_msr_read(u32 reg);
+void savic_ghcb_msr_write(u32 reg, u64 value);
+
+static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb)
+{
+ ghcb->save.sw_exit_code = 0;
+ __builtin_memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
+}
+
+/* I/O parameters for CPUID-related helpers */
+struct cpuid_leaf {
+ u32 fn;
+ u32 subfn;
+ u32 eax;
+ u32 ebx;
+ u32 ecx;
+ u32 edx;
+};
+
+int svsm_perform_msr_protocol(struct svsm_call *call);
+int __pi_svsm_perform_msr_protocol(struct svsm_call *call);
+int snp_cpuid(void (*cpuid_fn)(void *ctx, struct cpuid_leaf *leaf),
+ void *ctx, struct cpuid_leaf *leaf);
+
+void svsm_issue_call(struct svsm_call *call, u8 *pending);
+int svsm_process_result_codes(struct svsm_call *call);
+
+void __noreturn sev_es_terminate(unsigned int set, unsigned int reason);
+enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt,
+ u64 exit_code, u64 exit_info_1,
+ u64 exit_info_2);
+
+bool sev_es_negotiate_protocol(void);
+bool sev_es_check_cpu_features(void);
+
+extern u16 ghcb_version;
+extern struct ghcb *boot_ghcb;
+extern bool sev_snp_needs_sfw;
+
+struct psc_desc {
+ enum psc_op op;
+ struct svsm_ca *ca;
+ u64 caa_pa;
+};
+
+static inline void sev_evict_cache(void *va, int npages)
+{
+ volatile u8 val __always_unused;
+ u8 *bytes = va;
+ int page_idx;
+
+ /*
+ * For SEV guests, a read from the first/last cache-lines of a 4K page
+ * using the guest key is sufficient to cause a flush of all cache-lines
+ * associated with that 4K page without incurring all the overhead of a
+ * full CLFLUSH sequence.
+ */
+ for (page_idx = 0; page_idx < npages; page_idx++) {
+ val = bytes[page_idx * PAGE_SIZE];
+ val = bytes[page_idx * PAGE_SIZE + PAGE_SIZE - 1];
+ }
+}
+
+#else /* !CONFIG_AMD_MEM_ENCRYPT */
+
+#define snp_vmpl 0
+static inline void sev_es_ist_enter(struct pt_regs *regs) { }
+static inline void sev_es_ist_exit(void) { }
+static inline int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { return 0; }
+static inline void sev_es_nmi_complete(void) { }
+static inline int sev_es_efi_map_ghcbs_cas(pgd_t *pgd) { return 0; }
+static inline void sev_enable(struct boot_params *bp) { }
+static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) { return 0; }
+static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs) { return 0; }
+static inline void setup_ghcb(void) { }
+static inline void __init
+early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, unsigned long npages) { }
+static inline void __init
+early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, unsigned long npages) { }
+static inline void snp_set_memory_shared(unsigned long vaddr, unsigned long npages) { }
+static inline void snp_set_memory_private(unsigned long vaddr, unsigned long npages) { }
+static inline void snp_set_wakeup_secondary_cpu(void) { }
+static inline bool snp_init(struct boot_params *bp) { return false; }
+static inline void snp_dmi_setup(void) { }
+static inline int snp_issue_svsm_attest_req(u64 call_id, struct svsm_call *call, struct svsm_attest_call *input)
+{
+ return -ENOTTY;
+}
+static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { }
+static inline u64 snp_get_unsupported_features(u64 status) { return 0; }
+static inline u64 sev_get_status(void) { return 0; }
+static inline void sev_show_status(void) { }
+static inline int prepare_pte_enc(struct pte_enc_desc *d) { return 0; }
+static inline void set_pte_enc_mask(pte_t *kpte, unsigned long pfn, pgprot_t new_prot) { }
+static inline void snp_kexec_finish(void) { }
+static inline void snp_kexec_begin(void) { }
+static inline int snp_msg_init(struct snp_msg_desc *mdesc, int vmpck_id) { return -1; }
+static inline struct snp_msg_desc *snp_msg_alloc(void) { return NULL; }
+static inline void snp_msg_free(struct snp_msg_desc *mdesc) { }
+static inline int snp_send_guest_request(struct snp_msg_desc *mdesc,
+ struct snp_guest_req *req) { return -ENODEV; }
+static inline int snp_svsm_vtpm_send_command(u8 *buffer) { return -ENODEV; }
+static inline void __init snp_secure_tsc_prepare(void) { }
+static inline void __init snp_secure_tsc_init(void) { }
+static inline void sev_evict_cache(void *va, int npages) {}
+static inline enum es_result savic_register_gpa(u64 gpa) { return ES_UNSUPPORTED; }
+static inline enum es_result savic_unregister_gpa(u64 *gpa) { return ES_UNSUPPORTED; }
+static inline void savic_ghcb_msr_write(u32 reg, u64 value) { }
+static inline u64 savic_ghcb_msr_read(u32 reg) { return 0; }
+
+#endif /* CONFIG_AMD_MEM_ENCRYPT */
+
+#ifdef CONFIG_KVM_AMD_SEV
+bool snp_probe_rmptable_info(void);
+int snp_rmptable_init(void);
+int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level);
+void snp_dump_hva_rmpentry(unsigned long address);
+int psmash(u64 pfn);
+int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid, bool immutable);
+int rmp_make_shared(u64 pfn, enum pg_level level);
+void __snp_leak_pages(u64 pfn, unsigned int npages, bool dump_rmp);
+void kdump_sev_callback(void);
+void snp_fixup_e820_tables(void);
+static inline void snp_leak_pages(u64 pfn, unsigned int pages)
+{
+ __snp_leak_pages(pfn, pages, true);
+}
+#else
+static inline bool snp_probe_rmptable_info(void) { return false; }
+static inline int snp_rmptable_init(void) { return -ENOSYS; }
+static inline int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level) { return -ENODEV; }
+static inline void snp_dump_hva_rmpentry(unsigned long address) {}
+static inline int psmash(u64 pfn) { return -ENODEV; }
+static inline int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid,
+ bool immutable)
+{
+ return -ENODEV;
+}
+static inline int rmp_make_shared(u64 pfn, enum pg_level level) { return -ENODEV; }
+static inline void __snp_leak_pages(u64 pfn, unsigned int npages, bool dump_rmp) {}
+static inline void snp_leak_pages(u64 pfn, unsigned int npages) {}
+static inline void kdump_sev_callback(void) { }
+static inline void snp_fixup_e820_tables(void) {}
+#endif
+
+#endif
diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h
new file mode 100644
index 000000000000..04958459a7ca
--- /dev/null
+++ b/arch/x86/include/asm/sgx.h
@@ -0,0 +1,430 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright(c) 2016-20 Intel Corporation.
+ *
+ * Intel Software Guard Extensions (SGX) support.
+ */
+#ifndef _ASM_X86_SGX_H
+#define _ASM_X86_SGX_H
+
+#include <linux/bits.h>
+#include <linux/types.h>
+
+/*
+ * This file contains both data structures defined by the SGX architecture and
+ * Linux-defined software data structures and functions. For better readability,
+ * the two are not mixed together: the architectural definitions come first.
+ */
+
+/* The SGX specific CPUID function. */
+#define SGX_CPUID 0x12
+/* EPC enumeration. */
+#define SGX_CPUID_EPC 2
+/* An invalid EPC section, i.e. the end marker. */
+#define SGX_CPUID_EPC_INVALID 0x0
+/* A valid EPC section. */
+#define SGX_CPUID_EPC_SECTION 0x1
+/* The bitmask for the EPC section type. */
+#define SGX_CPUID_EPC_MASK GENMASK(3, 0)
+
+enum sgx_encls_function {
+ ECREATE = 0x00,
+ EADD = 0x01,
+ EINIT = 0x02,
+ EREMOVE = 0x03,
+ EDGBRD = 0x04,
+ EDGBWR = 0x05,
+ EEXTEND = 0x06,
+ ELDU = 0x08,
+ EBLOCK = 0x09,
+ EPA = 0x0A,
+ EWB = 0x0B,
+ ETRACK = 0x0C,
+ EAUG = 0x0D,
+ EMODPR = 0x0E,
+ EMODT = 0x0F,
+ EUPDATESVN = 0x18,
+};
+
+/**
+ * SGX_ENCLS_FAULT_FLAG - flag signifying an ENCLS return code is a trapnr
+ *
+ * ENCLS has its own (positive value) error codes and also generates
+ * ENCLS specific #GP and #PF faults. And the ENCLS values get munged
+ * with system error codes as everything percolates back up the stack.
+ * Unfortunately (for us), we need to precisely identify each unique
+ * error code, e.g. the action taken if EWB fails varies based on the
+ * type of fault and on the exact SGX error code, i.e. we can't simply
+ * convert all faults to -EFAULT.
+ *
+ * To make all three error types coexist, we set bit 30 to identify an
+ * ENCLS fault. Bit 31 (technically bits N:31) is used to differentiate
+ * between positive (faults and SGX error codes) and negative (system
+ * error codes) values.
+ */
+#define SGX_ENCLS_FAULT_FLAG 0x40000000
+
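A hedged sketch of how the flag is meant to be consumed; the helper names are illustrative, and the kernel keeps its equivalents in private SGX headers.

static inline bool example_encls_faulted(int ret)
{
        return ret & SGX_ENCLS_FAULT_FLAG;
}

static inline int example_encls_trapnr(int ret)
{
        /* With the fault flag stripped, the remainder is the trap number. */
        return ret & ~SGX_ENCLS_FAULT_FLAG;
}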
+/**
+ * enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV
+ * @SGX_EPC_PAGE_CONFLICT: Page is being written by other ENCLS function.
+ * @SGX_NOT_TRACKED: Previous ETRACK's shootdown sequence has not
+ * been completed yet.
+ * @SGX_CHILD_PRESENT: SECS has child pages present in the EPC.
+ * @SGX_INVALID_EINITTOKEN: EINITTOKEN is invalid and enclave signer's
+ * public key does not match IA32_SGXLEPUBKEYHASH.
+ * @SGX_PAGE_NOT_MODIFIABLE: The EPC page cannot be modified because it
+ * is in the PENDING or MODIFIED state.
+ * @SGX_INSUFFICIENT_ENTROPY: Insufficient entropy in RNG.
+ * @SGX_NO_UPDATE: EUPDATESVN could not update the CPUSVN because the
+ * current SVN was not newer than CPUSVN. This is the most
+ * common error code returned by EUPDATESVN.
+ * @SGX_UNMASKED_EVENT: An unmasked event, e.g. INTR, was received
+ */
+enum sgx_return_code {
+ SGX_EPC_PAGE_CONFLICT = 7,
+ SGX_NOT_TRACKED = 11,
+ SGX_CHILD_PRESENT = 13,
+ SGX_INVALID_EINITTOKEN = 16,
+ SGX_PAGE_NOT_MODIFIABLE = 20,
+ SGX_INSUFFICIENT_ENTROPY = 29,
+ SGX_NO_UPDATE = 31,
+ SGX_UNMASKED_EVENT = 128,
+};
+
+/* The modulus size for 3072-bit RSA keys. */
+#define SGX_MODULUS_SIZE 384
+
+/**
+ * enum sgx_miscselect - additional information to an SSA frame
+ * @SGX_MISC_EXINFO: Report #PF or #GP to the SSA frame.
+ *
+ * Save State Area (SSA) is a stack inside the enclave used to store processor
+ * state when an exception or interrupt occurs. This enum defines additional
+ * information stored to an SSA frame.
+ */
+enum sgx_miscselect {
+ SGX_MISC_EXINFO = BIT(0),
+};
+
+#define SGX_MISC_RESERVED_MASK GENMASK_ULL(63, 1)
+
+#define SGX_SSA_GPRS_SIZE 184
+#define SGX_SSA_MISC_EXINFO_SIZE 16
+
+/**
+ * enum sgx_attribute - the attributes field in &struct sgx_secs
+ * @SGX_ATTR_INIT: Enclave can be entered (is initialized).
+ * @SGX_ATTR_DEBUG: Allow ENCLS(EDBGRD) and ENCLS(EDBGWR).
+ * @SGX_ATTR_MODE64BIT: Tell that this is a 64-bit enclave.
+ * @SGX_ATTR_PROVISIONKEY: Allow to use provisioning keys for remote
+ * attestation.
+ * @SGX_ATTR_KSS: Allow to use key separation and sharing (KSS).
+ * @SGX_ATTR_EINITTOKENKEY: Allow to use token signing key that is used to
+ * sign cryptographic tokens that can be passed to
+ * EINIT as an authorization to run an enclave.
+ * @SGX_ATTR_ASYNC_EXIT_NOTIFY: Allow enclaves to be notified after an
+ * asynchronous exit has occurred.
+ */
+enum sgx_attribute {
+ SGX_ATTR_INIT = BIT(0),
+ SGX_ATTR_DEBUG = BIT(1),
+ SGX_ATTR_MODE64BIT = BIT(2),
+ /* BIT(3) is reserved */
+ SGX_ATTR_PROVISIONKEY = BIT(4),
+ SGX_ATTR_EINITTOKENKEY = BIT(5),
+ /* BIT(6) is for CET */
+ SGX_ATTR_KSS = BIT(7),
+ /* BIT(8) is reserved */
+ /* BIT(9) is reserved */
+ SGX_ATTR_ASYNC_EXIT_NOTIFY = BIT(10),
+};
+
+#define SGX_ATTR_RESERVED_MASK (BIT_ULL(3) | \
+ BIT_ULL(6) | \
+ BIT_ULL(8) | \
+ BIT_ULL(9) | \
+ GENMASK_ULL(63, 11))
+
+#define SGX_ATTR_UNPRIV_MASK (SGX_ATTR_DEBUG | \
+ SGX_ATTR_MODE64BIT | \
+ SGX_ATTR_KSS | \
+ SGX_ATTR_ASYNC_EXIT_NOTIFY)
+
+#define SGX_ATTR_PRIV_MASK (SGX_ATTR_PROVISIONKEY | \
+ SGX_ATTR_EINITTOKENKEY)
+
+/**
+ * struct sgx_secs - SGX Enclave Control Structure (SECS)
+ * @size: size of the address space
+ * @base: base address of the address space
+ * @ssa_frame_size: size of an SSA frame
+ * @miscselect: additional information stored to an SSA frame
+ * @attributes: attributes for enclave
+ * @xfrm: XSave-Feature Request Mask (subset of XCR0)
+ * @mrenclave: SHA256-hash of the enclave contents
+ * @mrsigner: SHA256-hash of the public key used to sign the SIGSTRUCT
+ * @config_id: a user-defined value that is used in key derivation
+ * @isv_prod_id: a user-defined value that is used in key derivation
+ * @isv_svn: a user-defined value that is used in key derivation
+ * @config_svn: a user-defined value that is used in key derivation
+ *
+ * SGX Enclave Control Structure (SECS) is a special enclave page that is not
+ * visible in the address space. In fact, this structure defines the address
+ * range and other global attributes for the enclave and it is the first EPC
+ * page created for any enclave. It is moved from a temporary buffer to the EPC
+ * by means of the ENCLS[ECREATE] function.
+ */
+struct sgx_secs {
+ u64 size;
+ u64 base;
+ u32 ssa_frame_size;
+ u32 miscselect;
+ u8 reserved1[24];
+ u64 attributes;
+ u64 xfrm;
+ u32 mrenclave[8];
+ u8 reserved2[32];
+ u32 mrsigner[8];
+ u8 reserved3[32];
+ u32 config_id[16];
+ u16 isv_prod_id;
+ u16 isv_svn;
+ u16 config_svn;
+ u8 reserved4[3834];
+} __packed;
+
+/**
+ * enum sgx_tcs_flags - execution flags for TCS
+ * @SGX_TCS_DBGOPTIN: If enabled allows single-stepping and breakpoints
+ * inside an enclave. It is cleared by EADD but can
+ * be set later with EDBGWR.
+ */
+enum sgx_tcs_flags {
+ SGX_TCS_DBGOPTIN = 0x01,
+};
+
+#define SGX_TCS_RESERVED_MASK GENMASK_ULL(63, 1)
+#define SGX_TCS_RESERVED_SIZE 4024
+
+/**
+ * struct sgx_tcs - Thread Control Structure (TCS)
+ * @state: used to mark an entered TCS
+ * @flags: execution flags (cleared by EADD)
+ * @ssa_offset: SSA stack offset relative to the enclave base
+ * @ssa_index: the current SSA frame index (cleared by EADD)
+ * @nr_ssa_frames: the number of frames in the SSA stack
+ * @entry_offset: entry point offset relative to the enclave base
+ * @exit_addr: address outside the enclave to exit on an exception or
+ * interrupt
+ * @fs_offset: offset relative to the enclave base to become FS
+ * segment inside the enclave
+ * @gs_offset: offset relative to the enclave base to become GS
+ * segment inside the enclave
+ * @fs_limit: size to become a new FS-limit (only 32-bit enclaves)
+ * @gs_limit: size to become a new GS-limit (only 32-bit enclaves)
+ *
+ * Thread Control Structure (TCS) is an enclave page visible in its address
+ * space that defines an entry point inside the enclave. A thread enters inside
+ * an enclave by supplying address of TCS to ENCLU(EENTER). A TCS can be entered
+ * by only one thread at a time.
+ */
+struct sgx_tcs {
+ u64 state;
+ u64 flags;
+ u64 ssa_offset;
+ u32 ssa_index;
+ u32 nr_ssa_frames;
+ u64 entry_offset;
+ u64 exit_addr;
+ u64 fs_offset;
+ u64 gs_offset;
+ u32 fs_limit;
+ u32 gs_limit;
+ u8 reserved[SGX_TCS_RESERVED_SIZE];
+} __packed;
+
+/**
+ * struct sgx_pageinfo - an enclave page descriptor
+ * @addr: address of the enclave page
+ * @contents: pointer to the page contents
+ * @metadata: pointer either to a SECINFO or PCMD instance
+ * @secs: address of the SECS page
+ */
+struct sgx_pageinfo {
+ u64 addr;
+ u64 contents;
+ u64 metadata;
+ u64 secs;
+} __packed __aligned(32);
+
+
+/**
+ * enum sgx_page_type - bits in the SECINFO flags defining the page type
+ * @SGX_PAGE_TYPE_SECS: a SECS page
+ * @SGX_PAGE_TYPE_TCS: a TCS page
+ * @SGX_PAGE_TYPE_REG: a regular page
+ * @SGX_PAGE_TYPE_VA: a VA page
+ * @SGX_PAGE_TYPE_TRIM: a page in trimmed state
+ *
+ * Make sure when making changes to this enum that its values can still fit
+ * in the bitfield within &struct sgx_encl_page
+ */
+enum sgx_page_type {
+ SGX_PAGE_TYPE_SECS,
+ SGX_PAGE_TYPE_TCS,
+ SGX_PAGE_TYPE_REG,
+ SGX_PAGE_TYPE_VA,
+ SGX_PAGE_TYPE_TRIM,
+};
+
+#define SGX_NR_PAGE_TYPES 5
+#define SGX_PAGE_TYPE_MASK GENMASK(7, 0)
+
+/**
+ * enum sgx_secinfo_flags - the flags field in &struct sgx_secinfo
+ * @SGX_SECINFO_R: allow read
+ * @SGX_SECINFO_W: allow write
+ * @SGX_SECINFO_X: allow execution
+ * @SGX_SECINFO_SECS: a SECS page
+ * @SGX_SECINFO_TCS: a TCS page
+ * @SGX_SECINFO_REG: a regular page
+ * @SGX_SECINFO_VA: a VA page
+ * @SGX_SECINFO_TRIM: a page in trimmed state
+ */
+enum sgx_secinfo_flags {
+ SGX_SECINFO_R = BIT(0),
+ SGX_SECINFO_W = BIT(1),
+ SGX_SECINFO_X = BIT(2),
+ SGX_SECINFO_SECS = (SGX_PAGE_TYPE_SECS << 8),
+ SGX_SECINFO_TCS = (SGX_PAGE_TYPE_TCS << 8),
+ SGX_SECINFO_REG = (SGX_PAGE_TYPE_REG << 8),
+ SGX_SECINFO_VA = (SGX_PAGE_TYPE_VA << 8),
+ SGX_SECINFO_TRIM = (SGX_PAGE_TYPE_TRIM << 8),
+};
+
+#define SGX_SECINFO_PERMISSION_MASK GENMASK_ULL(2, 0)
+#define SGX_SECINFO_PAGE_TYPE_MASK (SGX_PAGE_TYPE_MASK << 8)
+#define SGX_SECINFO_RESERVED_MASK ~(SGX_SECINFO_PERMISSION_MASK | \
+ SGX_SECINFO_PAGE_TYPE_MASK)
+
+/**
+ * struct sgx_secinfo - describes attributes of an EPC page
+ * @flags: permissions and type
+ *
+ * Used together with ENCLS leaves that add or modify an EPC page to an
+ * enclave to define page permissions and type.
+ */
+struct sgx_secinfo {
+ u64 flags;
+ u8 reserved[56];
+} __packed __aligned(64);
+
+#define SGX_PCMD_RESERVED_SIZE 40
+
+/**
+ * struct sgx_pcmd - Paging Crypto Metadata (PCMD)
+ * @enclave_id: enclave identifier
+ * @mac: MAC over PCMD, page contents and isvsvn
+ *
+ * PCMD is stored in regular memory for every swapped page. When ELDU loads
+ * the page back it recalculates the MAC by using an isvsvn number stored in a
+ * VA page. Together these two structures bring integrity and rollback
+ * protection.
+ */
+struct sgx_pcmd {
+ struct sgx_secinfo secinfo;
+ u64 enclave_id;
+ u8 reserved[SGX_PCMD_RESERVED_SIZE];
+ u8 mac[16];
+} __packed __aligned(128);
+
+#define SGX_SIGSTRUCT_RESERVED1_SIZE 84
+#define SGX_SIGSTRUCT_RESERVED2_SIZE 20
+#define SGX_SIGSTRUCT_RESERVED3_SIZE 32
+#define SGX_SIGSTRUCT_RESERVED4_SIZE 12
+
+/**
+ * struct sgx_sigstruct_header - defines author of the enclave
+ * @header1: constant byte string
+ * @vendor: must be either 0x0000 or 0x8086
+ * @date: YYYYMMDD in BCD
+ * @header2: constant byte string
+ * @swdefined: software defined value
+ */
+struct sgx_sigstruct_header {
+ u64 header1[2];
+ u32 vendor;
+ u32 date;
+ u64 header2[2];
+ u32 swdefined;
+ u8 reserved1[84];
+} __packed;
+
+/**
+ * struct sgx_sigstruct_body - defines contents of the enclave
+ * @miscselect: additional information stored to an SSA frame
+ * @misc_mask: required miscselect in SECS
+ * @attributes: attributes for enclave
+ * @xfrm: XSave-Feature Request Mask (subset of XCR0)
+ * @attributes_mask: required attributes in SECS
+ * @xfrm_mask: required XFRM in SECS
+ * @mrenclave: SHA256-hash of the enclave contents
+ * @isvprodid: a user-defined value that is used in key derivation
+ * @isvsvn: a user-defined value that is used in key derivation
+ */
+struct sgx_sigstruct_body {
+ u32 miscselect;
+ u32 misc_mask;
+ u8 reserved2[20];
+ u64 attributes;
+ u64 xfrm;
+ u64 attributes_mask;
+ u64 xfrm_mask;
+ u8 mrenclave[32];
+ u8 reserved3[32];
+ u16 isvprodid;
+ u16 isvsvn;
+} __packed;
+
+/**
+ * struct sgx_sigstruct - an enclave signature
+ * @header: defines author of the enclave
+ * @modulus: the modulus of the public key
+ * @exponent: the exponent of the public key
+ * @signature: the signature calculated over the fields except modulus,
+ * @body: defines contents of the enclave
+ * @q1: a value used in RSA signature verification
+ * @q2: a value used in RSA signature verification
+ *
+ * Header and body are the parts that are actually signed. The remaining fields
+ * define the signature of the enclave.
+ */
+struct sgx_sigstruct {
+ struct sgx_sigstruct_header header;
+ u8 modulus[SGX_MODULUS_SIZE];
+ u32 exponent;
+ u8 signature[SGX_MODULUS_SIZE];
+ struct sgx_sigstruct_body body;
+ u8 reserved4[12];
+ u8 q1[SGX_MODULUS_SIZE];
+ u8 q2[SGX_MODULUS_SIZE];
+} __packed;
+
+#define SGX_LAUNCH_TOKEN_SIZE 304
+
+/*
+ * Do not put any hardware-defined SGX structure representations below this
+ * comment!
+ */
+
+#ifdef CONFIG_X86_SGX_KVM
+int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,
+ int *trapnr);
+int sgx_virt_einit(void __user *sigstruct, void __user *token,
+ void __user *secs, u64 *lepubkeyhash, int *trapnr);
+#endif
+
+int sgx_set_attribute(unsigned long *allowed_attributes,
+ unsigned int attribute_fd);
+
+#endif /* _ASM_X86_SGX_H */
diff --git a/arch/x86/include/asm/shared/io.h b/arch/x86/include/asm/shared/io.h
new file mode 100644
index 000000000000..8009d781c2f9
--- /dev/null
+++ b/arch/x86/include/asm/shared/io.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SHARED_IO_H
+#define _ASM_X86_SHARED_IO_H
+
+#include <linux/types.h>
+
+#define BUILDIO(bwl, bw, type) \
+static __always_inline void __out##bwl(type value, u16 port) \
+{ \
+ asm volatile("out" #bwl " %" #bw "0, %w1" \
+ : : "a"(value), "Nd"(port)); \
+} \
+ \
+static __always_inline type __in##bwl(u16 port) \
+{ \
+ type value; \
+ asm volatile("in" #bwl " %w1, %" #bw "0" \
+ : "=a"(value) : "Nd"(port)); \
+ return value; \
+}
+
+BUILDIO(b, b, u8)
+BUILDIO(w, w, u16)
+BUILDIO(l, , u32)
+#undef BUILDIO
+
+#define inb __inb
+#define inw __inw
+#define inl __inl
+#define outb __outb
+#define outw __outw
+#define outl __outl
+
+#endif
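A hedged usage sketch of the generated helpers, polling a 16550-style UART; the port base and register offsets are the conventional values and are shown only for illustration.

static void example_serial_putc(u16 port, u8 c)
{
        /* Wait for the transmit holding register to empty (LSR bit 5). */
        while (!(inb(port + 5) & 0x20))
                ;

        outb(c, port);
}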
diff --git a/arch/x86/include/asm/shared/msr.h b/arch/x86/include/asm/shared/msr.h
new file mode 100644
index 000000000000..a20b1c08c99f
--- /dev/null
+++ b/arch/x86/include/asm/shared/msr.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SHARED_MSR_H
+#define _ASM_X86_SHARED_MSR_H
+
+struct msr {
+ union {
+ struct {
+ u32 l;
+ u32 h;
+ };
+ u64 q;
+ };
+};
+
+/*
+ * The kernel proper already defines rdmsr()/wrmsr(), but they are not usable
+ * in the boot kernel since they rely on tracepoint/exception handling
+ * infrastructure that's not available here.
+ */
+static inline void raw_rdmsr(unsigned int reg, struct msr *m)
+{
+ asm volatile("rdmsr" : "=a" (m->l), "=d" (m->h) : "c" (reg));
+}
+
+static inline void raw_wrmsr(unsigned int reg, const struct msr *m)
+{
+ asm volatile("wrmsr" : : "c" (reg), "a"(m->l), "d" (m->h) : "memory");
+}
+
+#endif /* _ASM_X86_SHARED_MSR_H */
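A hedged usage sketch of the raw accessors above; the MSR index is a caller-supplied value and nothing platform-specific is assumed.

static u64 example_read_msr(unsigned int reg)
{
        struct msr m;

        raw_rdmsr(reg, &m);

        /* The l/h pair and the q member alias the same 64-bit storage. */
        return m.q;
}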
diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h
new file mode 100644
index 000000000000..8bc074c8d7c6
--- /dev/null
+++ b/arch/x86/include/asm/shared/tdx.h
@@ -0,0 +1,191 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SHARED_TDX_H
+#define _ASM_X86_SHARED_TDX_H
+
+#include <linux/bits.h>
+#include <linux/types.h>
+
+#define TDX_HYPERCALL_STANDARD 0
+
+#define TDX_CPUID_LEAF_ID 0x21
+#define TDX_IDENT "IntelTDX "
+
+/* TDX module Call Leaf IDs */
+#define TDG_VP_VMCALL 0
+#define TDG_VP_INFO 1
+#define TDG_MR_RTMR_EXTEND 2
+#define TDG_VP_VEINFO_GET 3
+#define TDG_MR_REPORT 4
+#define TDG_MEM_PAGE_ACCEPT 6
+#define TDG_VM_RD 7
+#define TDG_VM_WR 8
+
+/* TDX attributes */
+#define TDX_ATTR_DEBUG_BIT 0
+#define TDX_ATTR_DEBUG BIT_ULL(TDX_ATTR_DEBUG_BIT)
+#define TDX_ATTR_HGS_PLUS_PROF_BIT 4
+#define TDX_ATTR_HGS_PLUS_PROF BIT_ULL(TDX_ATTR_HGS_PLUS_PROF_BIT)
+#define TDX_ATTR_PERF_PROF_BIT 5
+#define TDX_ATTR_PERF_PROF BIT_ULL(TDX_ATTR_PERF_PROF_BIT)
+#define TDX_ATTR_PMT_PROF_BIT 6
+#define TDX_ATTR_PMT_PROF BIT_ULL(TDX_ATTR_PMT_PROF_BIT)
+#define TDX_ATTR_ICSSD_BIT 16
+#define TDX_ATTR_ICSSD BIT_ULL(TDX_ATTR_ICSSD_BIT)
+#define TDX_ATTR_LASS_BIT 27
+#define TDX_ATTR_LASS BIT_ULL(TDX_ATTR_LASS_BIT)
+#define TDX_ATTR_SEPT_VE_DISABLE_BIT 28
+#define TDX_ATTR_SEPT_VE_DISABLE BIT_ULL(TDX_ATTR_SEPT_VE_DISABLE_BIT)
+#define TDX_ATTR_MIGRTABLE_BIT 29
+#define TDX_ATTR_MIGRTABLE BIT_ULL(TDX_ATTR_MIGRTABLE_BIT)
+#define TDX_ATTR_PKS_BIT 30
+#define TDX_ATTR_PKS BIT_ULL(TDX_ATTR_PKS_BIT)
+#define TDX_ATTR_KL_BIT 31
+#define TDX_ATTR_KL BIT_ULL(TDX_ATTR_KL_BIT)
+#define TDX_ATTR_TPA_BIT 62
+#define TDX_ATTR_TPA BIT_ULL(TDX_ATTR_TPA_BIT)
+#define TDX_ATTR_PERFMON_BIT 63
+#define TDX_ATTR_PERFMON BIT_ULL(TDX_ATTR_PERFMON_BIT)
+
+/* TDX TD-Scope Metadata. To be used by TDG.VM.WR and TDG.VM.RD */
+#define TDCS_CONFIG_FLAGS 0x1110000300000016
+#define TDCS_TD_CTLS 0x1110000300000017
+#define TDCS_NOTIFY_ENABLES 0x9100000000000010
+#define TDCS_TOPOLOGY_ENUM_CONFIGURED 0x9100000000000019
+
+/* TDCS_CONFIG_FLAGS bits */
+#define TDCS_CONFIG_FLEXIBLE_PENDING_VE BIT_ULL(1)
+
+/* TDCS_TD_CTLS bits */
+#define TD_CTLS_PENDING_VE_DISABLE_BIT 0
+#define TD_CTLS_PENDING_VE_DISABLE BIT_ULL(TD_CTLS_PENDING_VE_DISABLE_BIT)
+#define TD_CTLS_ENUM_TOPOLOGY_BIT 1
+#define TD_CTLS_ENUM_TOPOLOGY BIT_ULL(TD_CTLS_ENUM_TOPOLOGY_BIT)
+#define TD_CTLS_VIRT_CPUID2_BIT 2
+#define TD_CTLS_VIRT_CPUID2 BIT_ULL(TD_CTLS_VIRT_CPUID2_BIT)
+#define TD_CTLS_REDUCE_VE_BIT 3
+#define TD_CTLS_REDUCE_VE BIT_ULL(TD_CTLS_REDUCE_VE_BIT)
+#define TD_CTLS_LOCK_BIT 63
+#define TD_CTLS_LOCK BIT_ULL(TD_CTLS_LOCK_BIT)
+
+/* TDX hypercall Leaf IDs */
+#define TDVMCALL_GET_TD_VM_CALL_INFO 0x10000
+#define TDVMCALL_MAP_GPA 0x10001
+#define TDVMCALL_GET_QUOTE 0x10002
+#define TDVMCALL_REPORT_FATAL_ERROR 0x10003
+#define TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT 0x10004ULL
+
+/*
+ * TDG.VP.VMCALL Status Codes (returned in R10)
+ */
+#define TDVMCALL_STATUS_SUCCESS 0x0000000000000000ULL
+#define TDVMCALL_STATUS_RETRY 0x0000000000000001ULL
+#define TDVMCALL_STATUS_INVALID_OPERAND 0x8000000000000000ULL
+#define TDVMCALL_STATUS_ALIGN_ERROR 0x8000000000000002ULL
+#define TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED 0x8000000000000003ULL
+
+/*
+ * Bitmasks of exposed registers (with VMM).
+ */
+#define TDX_RDX BIT(2)
+#define TDX_RBX BIT(3)
+#define TDX_RSI BIT(6)
+#define TDX_RDI BIT(7)
+#define TDX_R8 BIT(8)
+#define TDX_R9 BIT(9)
+#define TDX_R10 BIT(10)
+#define TDX_R11 BIT(11)
+#define TDX_R12 BIT(12)
+#define TDX_R13 BIT(13)
+#define TDX_R14 BIT(14)
+#define TDX_R15 BIT(15)
+
+/*
+ * These registers are clobbered to hold arguments for each
+ * TDVMCALL. They are safe to expose to the VMM.
+ * Each bit in this mask represents a register ID. Bit field
+ * details can be found in TDX GHCI specification, section
+ * titled "TDCALL [TDG.VP.VMCALL] leaf".
+ */
+#define TDVMCALL_EXPOSE_REGS_MASK \
+ (TDX_RDX | TDX_RBX | TDX_RSI | TDX_RDI | TDX_R8 | TDX_R9 | \
+ TDX_R10 | TDX_R11 | TDX_R12 | TDX_R13 | TDX_R14 | TDX_R15)
+
+/* TDX supported page sizes from the TDX module ABI. */
+#define TDX_PS_4K 0
+#define TDX_PS_2M 1
+#define TDX_PS_1G 2
+#define TDX_PS_NR (TDX_PS_1G + 1)
+
+#ifndef __ASSEMBLER__
+
+#include <linux/compiler_attributes.h>
+
+/*
+ * Used in __tdcall*() to gather the input/output registers' values of the
+ * TDCALL instruction when requesting services from the TDX module. This is a
+ * software only structure and not part of the TDX module/VMM ABI
+ */
+struct tdx_module_args {
+ /* callee-clobbered */
+ u64 rcx;
+ u64 rdx;
+ u64 r8;
+ u64 r9;
+ /* extra callee-clobbered */
+ u64 r10;
+ u64 r11;
+ /* callee-saved + rdi/rsi */
+ u64 r12;
+ u64 r13;
+ u64 r14;
+ u64 r15;
+ u64 rbx;
+ u64 rdi;
+ u64 rsi;
+};
+
+/* Used to communicate with the TDX module */
+u64 __tdcall(u64 fn, struct tdx_module_args *args);
+u64 __tdcall_ret(u64 fn, struct tdx_module_args *args);
+u64 __tdcall_saved_ret(u64 fn, struct tdx_module_args *args);
+
+/* Used to request services from the VMM */
+u64 __tdx_hypercall(struct tdx_module_args *args);
+
+/*
+ * Wrapper for standard use of __tdx_hypercall with no output aside from
+ * return code.
+ */
+static inline u64 _tdx_hypercall(u64 fn, u64 r12, u64 r13, u64 r14, u64 r15)
+{
+ struct tdx_module_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = fn,
+ .r12 = r12,
+ .r13 = r13,
+ .r14 = r14,
+ .r15 = r15,
+ };
+
+ return __tdx_hypercall(&args);
+}
+
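A hedged sketch of a standard TDVMCALL built on the wrapper above, using the MapGPA leaf defined earlier in this header; retry handling for TDVMCALL_STATUS_RETRY is omitted and the function name is illustrative.

static inline bool example_tdx_map_gpa(u64 start_gpa, u64 end_gpa)
{
        /* Ask the VMM to convert the [start, end) GPA range; R10 carries the status. */
        return _tdx_hypercall(TDVMCALL_MAP_GPA, start_gpa,
                              end_gpa - start_gpa, 0, 0) == TDVMCALL_STATUS_SUCCESS;
}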
+
+/* Called from __tdx_hypercall() for unrecoverable failure */
+void __noreturn __tdx_hypercall_failed(void);
+
+bool tdx_accept_memory(phys_addr_t start, phys_addr_t end);
+
+/*
+ * The TDG.VP.VMCALL-Instruction-execution sub-functions are defined
+ * independently from but are currently matched 1:1 with VMX EXIT_REASONs.
+ * Reusing the KVM EXIT_REASON macros makes it easier to connect the host and
+ * guest sides of these calls.
+ */
+static __always_inline u64 hcall_func(u64 exit_reason)
+{
+ return exit_reason;
+}
+
+#endif /* !__ASSEMBLER__ */
+#endif /* _ASM_X86_SHARED_TDX_H */
diff --git a/arch/x86/include/asm/shmbuf.h b/arch/x86/include/asm/shmbuf.h
deleted file mode 100644
index b51413b74971..000000000000
--- a/arch/x86/include/asm/shmbuf.h
+++ /dev/null
@@ -1,51 +0,0 @@
-#ifndef _ASM_X86_SHMBUF_H
-#define _ASM_X86_SHMBUF_H
-
-/*
- * The shmid64_ds structure for x86 architecture.
- * Note extra padding because this structure is passed back and forth
- * between kernel and user space.
- *
- * Pad space on 32 bit is left for:
- * - 64-bit time_t to solve y2038 problem
- * - 2 miscellaneous 32-bit values
- *
- * Pad space on 64 bit is left for:
- * - 2 miscellaneous 64-bit values
- */
-
-struct shmid64_ds {
- struct ipc64_perm shm_perm; /* operation perms */
- size_t shm_segsz; /* size of segment (bytes) */
- __kernel_time_t shm_atime; /* last attach time */
-#ifdef __i386__
- unsigned long __unused1;
-#endif
- __kernel_time_t shm_dtime; /* last detach time */
-#ifdef __i386__
- unsigned long __unused2;
-#endif
- __kernel_time_t shm_ctime; /* last change time */
-#ifdef __i386__
- unsigned long __unused3;
-#endif
- __kernel_pid_t shm_cpid; /* pid of creator */
- __kernel_pid_t shm_lpid; /* pid of last operator */
- unsigned long shm_nattch; /* no. of current attaches */
- unsigned long __unused4;
- unsigned long __unused5;
-};
-
-struct shminfo64 {
- unsigned long shmmax;
- unsigned long shmmin;
- unsigned long shmmni;
- unsigned long shmseg;
- unsigned long shmall;
- unsigned long __unused1;
- unsigned long __unused2;
- unsigned long __unused3;
- unsigned long __unused4;
-};
-
-#endif /* _ASM_X86_SHMBUF_H */
diff --git a/arch/x86/include/asm/shmparam.h b/arch/x86/include/asm/shmparam.h
index 0880cf0917b9..c4041819c3e5 100644
--- a/arch/x86/include/asm/shmparam.h
+++ b/arch/x86/include/asm/shmparam.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_SHMPARAM_H
#define _ASM_X86_SHMPARAM_H
diff --git a/arch/x86/include/asm/shstk.h b/arch/x86/include/asm/shstk.h
new file mode 100644
index 000000000000..fc7dcec58fd4
--- /dev/null
+++ b/arch/x86/include/asm/shstk.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SHSTK_H
+#define _ASM_X86_SHSTK_H
+
+#ifndef __ASSEMBLER__
+#include <linux/types.h>
+
+struct task_struct;
+struct ksignal;
+
+#ifdef CONFIG_X86_USER_SHADOW_STACK
+struct thread_shstk {
+ u64 base;
+ u64 size;
+};
+
+long shstk_prctl(struct task_struct *task, int option, unsigned long arg2);
+void reset_thread_features(void);
+unsigned long shstk_alloc_thread_stack(struct task_struct *p, u64 clone_flags,
+ unsigned long stack_size);
+void shstk_free(struct task_struct *p);
+int setup_signal_shadow_stack(struct ksignal *ksig);
+int restore_signal_shadow_stack(void);
+int shstk_update_last_frame(unsigned long val);
+bool shstk_is_enabled(void);
+int shstk_pop(u64 *val);
+int shstk_push(u64 val);
+#else
+static inline long shstk_prctl(struct task_struct *task, int option,
+ unsigned long arg2) { return -EINVAL; }
+static inline void reset_thread_features(void) {}
+static inline unsigned long shstk_alloc_thread_stack(struct task_struct *p,
+ u64 clone_flags,
+ unsigned long stack_size) { return 0; }
+static inline void shstk_free(struct task_struct *p) {}
+static inline int setup_signal_shadow_stack(struct ksignal *ksig) { return 0; }
+static inline int restore_signal_shadow_stack(void) { return 0; }
+static inline int shstk_update_last_frame(unsigned long val) { return 0; }
+static inline bool shstk_is_enabled(void) { return false; }
+static inline int shstk_pop(u64 *val) { return -ENOTSUPP; }
+static inline int shstk_push(u64 val) { return -ENOTSUPP; }
+#endif /* CONFIG_X86_USER_SHADOW_STACK */
+
+#endif /* __ASSEMBLER__ */
+
+#endif /* _ASM_X86_SHSTK_H */
diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h
index 72e5a4491661..140d890c2c98 100644
--- a/arch/x86/include/asm/sigcontext.h
+++ b/arch/x86/include/asm/sigcontext.h
@@ -1,290 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_SIGCONTEXT_H
#define _ASM_X86_SIGCONTEXT_H
-#include <linux/compiler.h>
-#include <linux/types.h>
+/* This is a legacy header - all kernel code includes <uapi/asm/sigcontext.h> directly. */
-#define FP_XSTATE_MAGIC1 0x46505853U
-#define FP_XSTATE_MAGIC2 0x46505845U
-#define FP_XSTATE_MAGIC2_SIZE sizeof(FP_XSTATE_MAGIC2)
-
-/*
- * bytes 464..511 in the current 512byte layout of fxsave/fxrstor frame
- * are reserved for SW usage. On cpu's supporting xsave/xrstor, these bytes
- * are used to extended the fpstate pointer in the sigcontext, which now
- * includes the extended state information along with fpstate information.
- *
- * Presence of FP_XSTATE_MAGIC1 at the beginning of this SW reserved
- * area and FP_XSTATE_MAGIC2 at the end of memory layout
- * (extended_size - FP_XSTATE_MAGIC2_SIZE) indicates the presence of the
- * extended state information in the memory layout pointed by the fpstate
- * pointer in sigcontext.
- */
-struct _fpx_sw_bytes {
- __u32 magic1; /* FP_XSTATE_MAGIC1 */
- __u32 extended_size; /* total size of the layout referred by
- * fpstate pointer in the sigcontext.
- */
- __u64 xstate_bv;
- /* feature bit mask (including fp/sse/extended
- * state) that is present in the memory
- * layout.
- */
- __u32 xstate_size; /* actual xsave state size, based on the
- * features saved in the layout.
- * 'extended_size' will be greater than
- * 'xstate_size'.
- */
- __u32 padding[7]; /* for future use. */
-};
-
-#ifdef __i386__
-/*
- * As documented in the iBCS2 standard..
- *
- * The first part of "struct _fpstate" is just the normal i387
- * hardware setup, the extra "status" word is used to save the
- * coprocessor status word before entering the handler.
- *
- * Pentium III FXSR, SSE support
- * Gareth Hughes <gareth@valinux.com>, May 2000
- *
- * The FPU state data structure has had to grow to accommodate the
- * extended FPU state required by the Streaming SIMD Extensions.
- * There is no documented standard to accomplish this at the moment.
- */
-struct _fpreg {
- unsigned short significand[4];
- unsigned short exponent;
-};
-
-struct _fpxreg {
- unsigned short significand[4];
- unsigned short exponent;
- unsigned short padding[3];
-};
-
-struct _xmmreg {
- unsigned long element[4];
-};
-
-struct _fpstate {
- /* Regular FPU environment */
- unsigned long cw;
- unsigned long sw;
- unsigned long tag;
- unsigned long ipoff;
- unsigned long cssel;
- unsigned long dataoff;
- unsigned long datasel;
- struct _fpreg _st[8];
- unsigned short status;
- unsigned short magic; /* 0xffff = regular FPU data only */
-
- /* FXSR FPU environment */
- unsigned long _fxsr_env[6]; /* FXSR FPU env is ignored */
- unsigned long mxcsr;
- unsigned long reserved;
- struct _fpxreg _fxsr_st[8]; /* FXSR FPU reg data is ignored */
- struct _xmmreg _xmm[8];
- unsigned long padding1[44];
-
- union {
- unsigned long padding2[12];
- struct _fpx_sw_bytes sw_reserved; /* represents the extended
- * state info */
- };
-};
-
-#define X86_FXSR_MAGIC 0x0000
-
-#ifdef __KERNEL__
-struct sigcontext {
- unsigned short gs, __gsh;
- unsigned short fs, __fsh;
- unsigned short es, __esh;
- unsigned short ds, __dsh;
- unsigned long di;
- unsigned long si;
- unsigned long bp;
- unsigned long sp;
- unsigned long bx;
- unsigned long dx;
- unsigned long cx;
- unsigned long ax;
- unsigned long trapno;
- unsigned long err;
- unsigned long ip;
- unsigned short cs, __csh;
- unsigned long flags;
- unsigned long sp_at_signal;
- unsigned short ss, __ssh;
-
- /*
- * fpstate is really (struct _fpstate *) or (struct _xstate *)
- * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved
- * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end
- * of extended memory layout. See comments at the defintion of
- * (struct _fpx_sw_bytes)
- */
- void __user *fpstate; /* zero when no FPU/extended context */
- unsigned long oldmask;
- unsigned long cr2;
-};
-#else /* __KERNEL__ */
-/*
- * User-space might still rely on the old definition:
- */
-struct sigcontext {
- unsigned short gs, __gsh;
- unsigned short fs, __fsh;
- unsigned short es, __esh;
- unsigned short ds, __dsh;
- unsigned long edi;
- unsigned long esi;
- unsigned long ebp;
- unsigned long esp;
- unsigned long ebx;
- unsigned long edx;
- unsigned long ecx;
- unsigned long eax;
- unsigned long trapno;
- unsigned long err;
- unsigned long eip;
- unsigned short cs, __csh;
- unsigned long eflags;
- unsigned long esp_at_signal;
- unsigned short ss, __ssh;
- struct _fpstate __user *fpstate;
- unsigned long oldmask;
- unsigned long cr2;
-};
-#endif /* !__KERNEL__ */
-
-#else /* __i386__ */
-
-/* FXSAVE frame */
-/* Note: reserved1/2 may someday contain valuable data. Always save/restore
- them when you change signal frames. */
-struct _fpstate {
- __u16 cwd;
- __u16 swd;
- __u16 twd; /* Note this is not the same as the
- 32bit/x87/FSAVE twd */
- __u16 fop;
- __u64 rip;
- __u64 rdp;
- __u32 mxcsr;
- __u32 mxcsr_mask;
- __u32 st_space[32]; /* 8*16 bytes for each FP-reg */
- __u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg */
- __u32 reserved2[12];
- union {
- __u32 reserved3[12];
- struct _fpx_sw_bytes sw_reserved; /* represents the extended
- * state information */
- };
-};
-
-#ifdef __KERNEL__
-struct sigcontext {
- unsigned long r8;
- unsigned long r9;
- unsigned long r10;
- unsigned long r11;
- unsigned long r12;
- unsigned long r13;
- unsigned long r14;
- unsigned long r15;
- unsigned long di;
- unsigned long si;
- unsigned long bp;
- unsigned long bx;
- unsigned long dx;
- unsigned long ax;
- unsigned long cx;
- unsigned long sp;
- unsigned long ip;
- unsigned long flags;
- unsigned short cs;
- unsigned short gs;
- unsigned short fs;
- unsigned short __pad0;
- unsigned long err;
- unsigned long trapno;
- unsigned long oldmask;
- unsigned long cr2;
-
- /*
- * fpstate is really (struct _fpstate *) or (struct _xstate *)
- * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved
- * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end
- * of extended memory layout. See comments at the defintion of
- * (struct _fpx_sw_bytes)
- */
- void __user *fpstate; /* zero when no FPU/extended context */
- unsigned long reserved1[8];
-};
-#else /* __KERNEL__ */
-/*
- * User-space might still rely on the old definition:
- */
-struct sigcontext {
- unsigned long r8;
- unsigned long r9;
- unsigned long r10;
- unsigned long r11;
- unsigned long r12;
- unsigned long r13;
- unsigned long r14;
- unsigned long r15;
- unsigned long rdi;
- unsigned long rsi;
- unsigned long rbp;
- unsigned long rbx;
- unsigned long rdx;
- unsigned long rax;
- unsigned long rcx;
- unsigned long rsp;
- unsigned long rip;
- unsigned long eflags; /* RFLAGS */
- unsigned short cs;
- unsigned short gs;
- unsigned short fs;
- unsigned short __pad0;
- unsigned long err;
- unsigned long trapno;
- unsigned long oldmask;
- unsigned long cr2;
- struct _fpstate __user *fpstate; /* zero when no FPU context */
- unsigned long reserved1[8];
-};
-#endif /* !__KERNEL__ */
-
-#endif /* !__i386__ */
-
-struct _xsave_hdr {
- __u64 xstate_bv;
- __u64 reserved1[2];
- __u64 reserved2[5];
-};
-
-struct _ymmh_state {
- /* 16 * 16 bytes for each YMMH-reg */
- __u32 ymmh_space[64];
-};
-
-/*
- * Extended state pointed by the fpstate pointer in the sigcontext.
- * In addition to the fpstate, information encoded in the xstate_hdr
- * indicates the presence of other extended state information
- * supported by the processor and OS.
- */
-struct _xstate {
- struct _fpstate fpstate;
- struct _xsave_hdr xstate_hdr;
- struct _ymmh_state ymmh;
- /* new processor state extensions go here */
-};
+#include <uapi/asm/sigcontext.h>
#endif /* _ASM_X86_SIGCONTEXT_H */
diff --git a/arch/x86/include/asm/sigcontext32.h b/arch/x86/include/asm/sigcontext32.h
deleted file mode 100644
index ad1478c4ae12..000000000000
--- a/arch/x86/include/asm/sigcontext32.h
+++ /dev/null
@@ -1,77 +0,0 @@
-#ifndef _ASM_X86_SIGCONTEXT32_H
-#define _ASM_X86_SIGCONTEXT32_H
-
-#include <linux/types.h>
-
-/* signal context for 32bit programs. */
-
-#define X86_FXSR_MAGIC 0x0000
-
-struct _fpreg {
- unsigned short significand[4];
- unsigned short exponent;
-};
-
-struct _fpxreg {
- unsigned short significand[4];
- unsigned short exponent;
- unsigned short padding[3];
-};
-
-struct _xmmreg {
- __u32 element[4];
-};
-
-/* FSAVE frame with extensions */
-struct _fpstate_ia32 {
- /* Regular FPU environment */
- __u32 cw;
- __u32 sw;
- __u32 tag; /* not compatible to 64bit twd */
- __u32 ipoff;
- __u32 cssel;
- __u32 dataoff;
- __u32 datasel;
- struct _fpreg _st[8];
- unsigned short status;
- unsigned short magic; /* 0xffff = regular FPU data only */
-
- /* FXSR FPU environment */
- __u32 _fxsr_env[6];
- __u32 mxcsr;
- __u32 reserved;
- struct _fpxreg _fxsr_st[8];
- struct _xmmreg _xmm[8]; /* It's actually 16 */
- __u32 padding[44];
- union {
- __u32 padding2[12];
- struct _fpx_sw_bytes sw_reserved;
- };
-};
-
-struct sigcontext_ia32 {
- unsigned short gs, __gsh;
- unsigned short fs, __fsh;
- unsigned short es, __esh;
- unsigned short ds, __dsh;
- unsigned int di;
- unsigned int si;
- unsigned int bp;
- unsigned int sp;
- unsigned int bx;
- unsigned int dx;
- unsigned int cx;
- unsigned int ax;
- unsigned int trapno;
- unsigned int err;
- unsigned int ip;
- unsigned short cs, __csh;
- unsigned int flags;
- unsigned int sp_at_signal;
- unsigned short ss, __ssh;
- unsigned int fpstate; /* really (struct _fpstate_ia32 *) */
- unsigned int oldmask;
- unsigned int cr2;
-};
-
-#endif /* _ASM_X86_SIGCONTEXT32_H */
diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h
index 4e0fe26d27d3..84eab2724875 100644
--- a/arch/x86/include/asm/sigframe.h
+++ b/arch/x86/include/asm/sigframe.h
@@ -1,15 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_SIGFRAME_H
#define _ASM_X86_SIGFRAME_H
-#include <asm/sigcontext.h>
+#include <uapi/asm/sigcontext.h>
#include <asm/siginfo.h>
#include <asm/ucontext.h>
+#include <linux/compat.h>
#ifdef CONFIG_X86_32
#define sigframe_ia32 sigframe
#define rt_sigframe_ia32 rt_sigframe
-#define sigcontext_ia32 sigcontext
-#define _fpstate_ia32 _fpstate
#define ucontext_ia32 ucontext
#else /* !CONFIG_X86_32 */
@@ -23,7 +23,7 @@
struct sigframe_ia32 {
u32 pretcode;
int sig;
- struct sigcontext_ia32 sc;
+ struct sigcontext_32 sc;
/*
* fpstate is unused. fpstate is moved/allocated after
* retcode[] below. This movement allows to have the FP state and the
@@ -32,12 +32,8 @@ struct sigframe_ia32 {
* the offset of extramask[] in the sigframe and thus prevent any
* legacy application accessing/modifying it.
*/
- struct _fpstate_ia32 fpstate_unused;
-#ifdef CONFIG_IA32_EMULATION
- unsigned int extramask[_COMPAT_NSIG_WORDS-1];
-#else /* !CONFIG_IA32_EMULATION */
- unsigned long extramask[_NSIG_WORDS-1];
-#endif /* CONFIG_IA32_EMULATION */
+ struct _fpstate_32 fpstate_unused;
+ unsigned int extramask[1];
char retcode[8];
/* fp state follows here */
};
@@ -59,12 +55,34 @@ struct rt_sigframe_ia32 {
#endif /* defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) */
#ifdef CONFIG_X86_64
+
struct rt_sigframe {
char __user *pretcode;
struct ucontext uc;
struct siginfo info;
/* fp state follows here */
};
+
+#ifdef CONFIG_X86_X32_ABI
+
+struct ucontext_x32 {
+ unsigned int uc_flags;
+ unsigned int uc_link;
+ compat_stack_t uc_stack;
+ unsigned int uc__pad0; /* needed for alignment */
+ struct sigcontext uc_mcontext; /* the 64-bit sigcontext type */
+ compat_sigset_t uc_sigmask; /* mask last for extensibility */
+};
+
+struct rt_sigframe_x32 {
+ u64 pretcode;
+ struct ucontext_x32 uc;
+ compat_siginfo_t info;
+ /* fp state follows here */
+};
+
+#endif /* CONFIG_X86_X32_ABI */
+
#endif /* CONFIG_X86_64 */
#endif /* _ASM_X86_SIGFRAME_H */
diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h
new file mode 100644
index 000000000000..8727c7e21dd1
--- /dev/null
+++ b/arch/x86/include/asm/sighandling.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SIGHANDLING_H
+#define _ASM_X86_SIGHANDLING_H
+
+#include <linux/compiler.h>
+#include <linux/ptrace.h>
+#include <linux/signal.h>
+
+#include <asm/processor-flags.h>
+
+#define FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \
+ X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \
+ X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \
+ X86_EFLAGS_CF | X86_EFLAGS_RF)
+
+void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
+
+void __user *
+get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size,
+ void __user **fpstate);
+
+int ia32_setup_frame(struct ksignal *ksig, struct pt_regs *regs);
+int ia32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs);
+int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs);
+int x32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs);
+
+/*
+ * To prevent immediate repeat of single step trap on return from SIGTRAP
+ * handler if the trap flag (TF) is set without an external debugger attached,
+ * clear the software event flag in the augmented SS, ensuring no single-step
+ * trap is pending upon ERETU completion.
+ *
+ * Note, this function should be called in sigreturn() before the original
+ * state is restored to make sure the TF is read from the entry frame.
+ */
+static __always_inline void prevent_single_step_upon_eretu(struct pt_regs *regs)
+{
+ /*
+ * If the trap flag (TF) is set, i.e., the sigreturn() SYSCALL instruction
+ * is being single-stepped, do not clear the software event flag in the
+ * augmented SS, thus a debugger won't skip over the following instruction.
+ */
+#ifdef CONFIG_X86_FRED
+ if (!(regs->flags & X86_EFLAGS_TF))
+ regs->fred_ss.swevent = 0;
+#endif
+}
+
+#endif /* _ASM_X86_SIGHANDLING_H */
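
prevent_single_step_upon_eretu() has to run early in the sigreturn path, while regs->flags still holds the trap flag captured at kernel entry. A hedged sketch of where such a call sits; the surrounding function is illustrative, the real implementations live in arch/x86/kernel/signal*.c:

/* Illustrative shape of a sigreturn handler using the helper above. */
static long example_sigreturn(struct pt_regs *regs)
{
	/* Must run before the saved sigcontext overwrites regs->flags. */
	prevent_single_step_upon_eretu(regs);

	/* ... restore sigmask, sigcontext and FPU state from the user frame ... */

	return regs->ax;
}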
diff --git a/arch/x86/include/asm/siginfo.h b/arch/x86/include/asm/siginfo.h
deleted file mode 100644
index fc1aa5535646..000000000000
--- a/arch/x86/include/asm/siginfo.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef _ASM_X86_SIGINFO_H
-#define _ASM_X86_SIGINFO_H
-
-#ifdef __x86_64__
-# define __ARCH_SI_PREAMBLE_SIZE (4 * sizeof(int))
-#endif
-
-#include <asm-generic/siginfo.h>
-
-#endif /* _ASM_X86_SIGINFO_H */
diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
index 598457cbd0f8..5c03aaa89014 100644
--- a/arch/x86/include/asm/signal.h
+++ b/arch/x86/include/asm/signal.h
@@ -1,15 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_SIGNAL_H
#define _ASM_X86_SIGNAL_H
-#ifndef __ASSEMBLY__
-#include <linux/types.h>
-#include <linux/time.h>
-#include <linux/compiler.h>
-
-/* Avoid too many header ordering problems. */
-struct siginfo;
-
-#ifdef __KERNEL__
+#ifndef __ASSEMBLER__
#include <linux/linkage.h>
/* Most things should be clean enough to redefine this at will, if care
@@ -31,160 +24,18 @@ typedef struct {
unsigned long sig[_NSIG_WORDS];
} sigset_t;
-#else
-/* Here we must cater to libcs that poke about in kernel headers. */
-
-#define NSIG 32
-typedef unsigned long sigset_t;
-
-#endif /* __KERNEL__ */
-#endif /* __ASSEMBLY__ */
-
-#define SIGHUP 1
-#define SIGINT 2
-#define SIGQUIT 3
-#define SIGILL 4
-#define SIGTRAP 5
-#define SIGABRT 6
-#define SIGIOT 6
-#define SIGBUS 7
-#define SIGFPE 8
-#define SIGKILL 9
-#define SIGUSR1 10
-#define SIGSEGV 11
-#define SIGUSR2 12
-#define SIGPIPE 13
-#define SIGALRM 14
-#define SIGTERM 15
-#define SIGSTKFLT 16
-#define SIGCHLD 17
-#define SIGCONT 18
-#define SIGSTOP 19
-#define SIGTSTP 20
-#define SIGTTIN 21
-#define SIGTTOU 22
-#define SIGURG 23
-#define SIGXCPU 24
-#define SIGXFSZ 25
-#define SIGVTALRM 26
-#define SIGPROF 27
-#define SIGWINCH 28
-#define SIGIO 29
-#define SIGPOLL SIGIO
-/*
-#define SIGLOST 29
-*/
-#define SIGPWR 30
-#define SIGSYS 31
-#define SIGUNUSED 31
-
-/* These should not be considered constants from userland. */
-#define SIGRTMIN 32
-#define SIGRTMAX _NSIG
-
-/*
- * SA_FLAGS values:
- *
- * SA_ONSTACK indicates that a registered stack_t will be used.
- * SA_RESTART flag to get restarting signals (which were the default long ago)
- * SA_NOCLDSTOP flag to turn off SIGCHLD when children stop.
- * SA_RESETHAND clears the handler when the signal is delivered.
- * SA_NOCLDWAIT flag on SIGCHLD to inhibit zombies.
- * SA_NODEFER prevents the current signal from being masked in the handler.
- *
- * SA_ONESHOT and SA_NOMASK are the historical Linux names for the Single
- * Unix names RESETHAND and NODEFER respectively.
- */
-#define SA_NOCLDSTOP 0x00000001u
-#define SA_NOCLDWAIT 0x00000002u
-#define SA_SIGINFO 0x00000004u
-#define SA_ONSTACK 0x08000000u
-#define SA_RESTART 0x10000000u
-#define SA_NODEFER 0x40000000u
-#define SA_RESETHAND 0x80000000u
-
-#define SA_NOMASK SA_NODEFER
-#define SA_ONESHOT SA_RESETHAND
-
-#define SA_RESTORER 0x04000000
-
-/*
- * sigaltstack controls
- */
-#define SS_ONSTACK 1
-#define SS_DISABLE 2
-
-#define MINSIGSTKSZ 2048
-#define SIGSTKSZ 8192
-
-#include <asm-generic/signal-defs.h>
-
-#ifndef __ASSEMBLY__
-
-# ifdef __KERNEL__
-extern void do_notify_resume(struct pt_regs *, void *, __u32);
-# endif /* __KERNEL__ */
-
-#ifdef __i386__
-# ifdef __KERNEL__
-struct old_sigaction {
- __sighandler_t sa_handler;
- old_sigset_t sa_mask;
- unsigned long sa_flags;
- __sigrestore_t sa_restorer;
-};
-
-struct sigaction {
- __sighandler_t sa_handler;
- unsigned long sa_flags;
- __sigrestore_t sa_restorer;
- sigset_t sa_mask; /* mask last for extensibility */
-};
-
-struct k_sigaction {
- struct sigaction sa;
-};
-
-# else /* __KERNEL__ */
-/* Here we must cater to libcs that poke about in kernel headers. */
-
-struct sigaction {
- union {
- __sighandler_t _sa_handler;
- void (*_sa_sigaction)(int, struct siginfo *, void *);
- } _u;
- sigset_t sa_mask;
- unsigned long sa_flags;
- void (*sa_restorer)(void);
-};
-
-#define sa_handler _u._sa_handler
-#define sa_sigaction _u._sa_sigaction
-
-# endif /* ! __KERNEL__ */
-#else /* __i386__ */
+/* Non-uapi, in-kernel SA_FLAGS that indicate the ABI of a signal frame: */
+#define SA_IA32_ABI 0x02000000u
+#define SA_X32_ABI 0x01000000u
-struct sigaction {
- __sighandler_t sa_handler;
- unsigned long sa_flags;
- __sigrestore_t sa_restorer;
- sigset_t sa_mask; /* mask last for extensibility */
-};
+#endif /* __ASSEMBLER__ */
+#include <uapi/asm/signal.h>
+#ifndef __ASSEMBLER__
-struct k_sigaction {
- struct sigaction sa;
-};
+#define __ARCH_HAS_SA_RESTORER
-#endif /* !__i386__ */
-
-typedef struct sigaltstack {
- void __user *ss_sp;
- int ss_flags;
- size_t ss_size;
-} stack_t;
-
-#ifdef __KERNEL__
-#include <asm/sigcontext.h>
+#include <asm/asm.h>
+#include <uapi/asm/sigcontext.h>
#ifdef __i386__
@@ -231,9 +82,8 @@ static inline int __const_sigismember(sigset_t *set, int _sig)
static inline int __gen_sigismember(sigset_t *set, int _sig)
{
- int ret;
- asm("btl %2,%1\n\tsbbl %0,%0"
- : "=r"(ret) : "m"(*set), "Ir"(_sig-1) : "cc");
+ bool ret;
+ asm("btl %2,%1" : "=@ccc"(ret) : "m"(*set), "Ir"(_sig-1));
return ret;
}
@@ -242,12 +92,6 @@ static inline int __gen_sigismember(sigset_t *set, int _sig)
? __const_sigismember((set), (sig)) \
: __gen_sigismember((set), (sig)))
-static inline int sigfindinword(unsigned long word)
-{
- asm("bsfl %1,%0" : "=r"(word) : "rm"(word) : "cc");
- return word;
-}
-
struct pt_regs;
#else /* __i386__ */
@@ -256,9 +100,5 @@ struct pt_regs;
#endif /* !__i386__ */
-#define ptrace_signal_deliver(regs, cookie) do { } while (0)
-
-#endif /* __KERNEL__ */
-#endif /* __ASSEMBLY__ */
-
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_X86_SIGNAL_H */
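
The __gen_sigismember() change above hands the carry flag produced by BTL straight to the compiler via the "=@ccc" flag-output constraint instead of materializing it with SBB. For reference, a plain-C sketch of the semantics that asm implements (illustrative; the kernel keeps the asm form for code generation):

/* Reference semantics: test bit (_sig - 1) in the sigset word array,
 * which is what BTL's bit-string addressing does on the "m" operand. */
static inline bool example_sigismember(const sigset_t *set, int _sig)
{
	unsigned int bit = _sig - 1;
	unsigned int bits_per_word = 8 * sizeof(set->sig[0]);

	return (set->sig[bit / bits_per_word] >> (bit % bits_per_word)) & 1;
}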
diff --git a/arch/x86/include/asm/simd.h b/arch/x86/include/asm/simd.h
new file mode 100644
index 000000000000..b8027b63cd7a
--- /dev/null
+++ b/arch/x86/include/asm/simd.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_SIMD_H
+#define _ASM_SIMD_H
+
+#include <asm/fpu/api.h>
+#include <linux/compiler_attributes.h>
+#include <linux/types.h>
+
+/*
+ * may_use_simd - whether it is allowable at this time to issue SIMD
+ * instructions or access the SIMD register file
+ */
+static __must_check inline bool may_use_simd(void)
+{
+ return irq_fpu_usable();
+}
+
+#endif /* _ASM_SIMD_H */
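
may_use_simd() is the gate to check before touching FPU/SIMD state from kernel code; on x86 it simply forwards to irq_fpu_usable(). A hedged usage sketch with the usual kernel_fpu_begin()/kernel_fpu_end() bracketing; the two checksum helpers are hypothetical:

u32 checksum_simd(const u8 *data, size_t len);		/* hypothetical SIMD path */
u32 checksum_scalar(const u8 *data, size_t len);	/* hypothetical fallback */

static void example_checksum(const u8 *data, size_t len, u32 *out)
{
	if (may_use_simd()) {
		kernel_fpu_begin();		/* SIMD registers may be clobbered from here */
		*out = checksum_simd(data, len);
		kernel_fpu_end();
	} else {
		*out = checksum_scalar(data, len);
	}
}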
diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h
new file mode 100644
index 000000000000..977bef14a0ab
--- /dev/null
+++ b/arch/x86/include/asm/smap.h
@@ -0,0 +1,110 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Supervisor Mode Access Prevention support
+ *
+ * Copyright (C) 2012 Intel Corporation
+ * Author: H. Peter Anvin <hpa@linux.intel.com>
+ */
+
+#ifndef _ASM_X86_SMAP_H
+#define _ASM_X86_SMAP_H
+
+#include <asm/nops.h>
+#include <asm/cpufeatures.h>
+#include <asm/alternative.h>
+
+#ifdef __ASSEMBLER__
+
+#define ASM_CLAC \
+ ALTERNATIVE "", "clac", X86_FEATURE_SMAP
+
+#define ASM_STAC \
+ ALTERNATIVE "", "stac", X86_FEATURE_SMAP
+
+#else /* __ASSEMBLER__ */
+
+/*
+ * The CLAC/STAC instructions toggle the enforcement of
+ * X86_FEATURE_SMAP along with X86_FEATURE_LASS.
+ *
+ * SMAP enforcement is based on the _PAGE_BIT_USER bit in the page
+ * tables. The kernel is not allowed to touch pages with that bit set
+ * unless the AC bit is set.
+ *
+ * Use stac()/clac() when accessing userspace (_PAGE_USER) mappings,
+ * regardless of location.
+ *
+ * Note: a barrier is implicit in alternative().
+ */
+
+static __always_inline void clac(void)
+{
+ alternative("", "clac", X86_FEATURE_SMAP);
+}
+
+static __always_inline void stac(void)
+{
+ alternative("", "stac", X86_FEATURE_SMAP);
+}
+
+/*
+ * LASS enforcement is based on bit 63 of the virtual address. The
+ * kernel is not allowed to touch memory in the lower half of the
+ * virtual address space.
+ *
+ * Use lass_stac()/lass_clac() to toggle the AC bit for kernel data
+ * accesses (!_PAGE_USER) that are blocked by LASS, but not by SMAP.
+ *
+ * Even with the AC bit set, LASS will continue to block instruction
+ * fetches from the user half of the address space. To allow those,
+ * clear CR4.LASS to disable the LASS mechanism entirely.
+ *
+ * Note: a barrier is implicit in alternative().
+ */
+
+static __always_inline void lass_clac(void)
+{
+ alternative("", "clac", X86_FEATURE_LASS);
+}
+
+static __always_inline void lass_stac(void)
+{
+ alternative("", "stac", X86_FEATURE_LASS);
+}
+
+static __always_inline unsigned long smap_save(void)
+{
+ unsigned long flags;
+
+ asm volatile ("# smap_save\n\t"
+ ALTERNATIVE(ANNOTATE_IGNORE_ALTERNATIVE "\n\t"
+ "", "pushf; pop %0; clac",
+ X86_FEATURE_SMAP)
+ : "=rm" (flags) : : "memory", "cc");
+
+ return flags;
+}
+
+static __always_inline void smap_restore(unsigned long flags)
+{
+ asm volatile ("# smap_restore\n\t"
+ ALTERNATIVE(ANNOTATE_IGNORE_ALTERNATIVE "\n\t"
+ "", "push %0; popf",
+ X86_FEATURE_SMAP)
+ : : "g" (flags) : "memory", "cc");
+}
+
+/* These macros can be used in asm() statements */
+#define ASM_CLAC \
+ ALTERNATIVE("", "clac", X86_FEATURE_SMAP)
+#define ASM_STAC \
+ ALTERNATIVE("", "stac", X86_FEATURE_SMAP)
+
+#define ASM_CLAC_UNSAFE \
+ ALTERNATIVE("", ANNOTATE_IGNORE_ALTERNATIVE "\n\t" "clac", X86_FEATURE_SMAP)
+#define ASM_STAC_UNSAFE \
+ ALTERNATIVE("", ANNOTATE_IGNORE_ALTERNATIVE "\n\t" "stac", X86_FEATURE_SMAP)
+
+#endif /* __ASSEMBLER__ */
+
+#endif /* _ASM_X86_SMAP_H */
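
stac()/clac() bracket kernel accesses to user (_PAGE_USER) mappings, while lass_stac()/lass_clac() cover kernel-address accesses that only LASS would block. A minimal sketch of the SMAP pattern, purely for illustration; real code should go through get_user()/copy_from_user(), which handle faults and SMAP internally:

/* Illustrative only: open an SMAP window around a single user-space load. */
static unsigned long example_peek_user_word(const unsigned long __user *uptr)
{
	unsigned long val;

	stac();		/* allow access to _PAGE_USER mappings */
	val = *(const unsigned long __force *)uptr;
	clac();		/* close the window again */

	return val;
}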
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 6a84ed166aec..84951572ab81 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -1,62 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_SMP_H
#define _ASM_X86_SMP_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/cpumask.h>
-#include <linux/init.h>
-#include <asm/percpu.h>
+#include <linux/thread_info.h>
-/*
- * We need the APIC definitions automatically as part of 'smp.h'
- */
-#ifdef CONFIG_X86_LOCAL_APIC
-# include <asm/mpspec.h>
-# include <asm/apic.h>
-# ifdef CONFIG_X86_IO_APIC
-# include <asm/io_apic.h>
-# endif
-#endif
-#include <asm/thread_info.h>
#include <asm/cpumask.h>
-extern int smp_num_siblings;
-extern unsigned int num_processors;
-
-DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map);
-DECLARE_PER_CPU(cpumask_var_t, cpu_core_map);
-DECLARE_PER_CPU(u16, cpu_llc_id);
-DECLARE_PER_CPU(int, cpu_number);
-
-static inline struct cpumask *cpu_sibling_mask(int cpu)
-{
- return per_cpu(cpu_sibling_map, cpu);
-}
+DECLARE_PER_CPU_CACHE_HOT(int, cpu_number);
-static inline struct cpumask *cpu_core_mask(int cpu)
-{
- return per_cpu(cpu_core_map, cpu);
-}
+DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
+DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
+DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map);
+/* cpus sharing the last level cache: */
+DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
+DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map);
-DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid);
-DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid);
+DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_apicid);
+DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid);
-/* Static state in head.S used to set up a CPU */
-extern struct {
- void *sp;
- unsigned short ss;
-} stack_start;
+struct task_struct;
struct smp_ops {
void (*smp_prepare_boot_cpu)(void);
void (*smp_prepare_cpus)(unsigned max_cpus);
void (*smp_cpus_done)(unsigned max_cpus);
- void (*smp_send_stop)(void);
+ void (*stop_other_cpus)(int wait);
+ void (*crash_stop_other_cpus)(void);
void (*smp_send_reschedule)(int cpu);
- int (*cpu_up)(unsigned cpu);
+ void (*cleanup_dead_cpu)(unsigned cpu);
+ void (*poll_sync_state)(void);
+ int (*kick_ap_alive)(unsigned cpu, struct task_struct *tidle);
int (*cpu_disable)(void);
void (*cpu_die)(unsigned int cpu);
void (*play_dead)(void);
+ void (*stop_this_cpu)(void);
void (*send_call_func_ipi)(const struct cpumask *mask);
void (*send_call_func_single_ipi)(int cpu);
@@ -66,19 +46,16 @@ struct smp_ops {
extern void set_cpu_sibling_map(int cpu);
#ifdef CONFIG_SMP
-#ifndef CONFIG_PARAVIRT
-#define startup_ipi_hook(phys_apicid, start_eip, start_esp) do { } while (0)
-#endif
extern struct smp_ops smp_ops;
static inline void smp_send_stop(void)
{
- smp_ops.smp_send_stop();
+ smp_ops.stop_other_cpus(0);
}
-static inline void smp_prepare_boot_cpu(void)
+static inline void stop_other_cpus(void)
{
- smp_ops.smp_prepare_boot_cpu();
+ smp_ops.stop_other_cpus(1);
}
static inline void smp_prepare_cpus(unsigned int max_cpus)
@@ -91,11 +68,6 @@ static inline void smp_cpus_done(unsigned int max_cpus)
smp_ops.smp_cpus_done(max_cpus);
}
-static inline int __cpu_up(unsigned int cpu)
-{
- return smp_ops.cpu_up(cpu);
-}
-
static inline int __cpu_disable(void)
{
return smp_ops.cpu_disable();
@@ -103,15 +75,17 @@ static inline int __cpu_disable(void)
static inline void __cpu_die(unsigned int cpu)
{
- smp_ops.cpu_die(cpu);
+ if (smp_ops.cpu_die)
+ smp_ops.cpu_die(cpu);
}
-static inline void play_dead(void)
+static inline void __noreturn play_dead(void)
{
smp_ops.play_dead();
+ BUG();
}
-static inline void smp_send_reschedule(int cpu)
+static inline void arch_smp_send_reschedule(int cpu)
{
smp_ops.smp_send_reschedule(cpu);
}
@@ -121,7 +95,6 @@ static inline void arch_send_call_function_single_ipi(int cpu)
smp_ops.send_call_func_single_ipi(cpu);
}
-#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask
static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask)
{
smp_ops.send_call_func_ipi(mask);
@@ -129,71 +102,98 @@ static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask)
void cpu_disable_common(void);
void native_smp_prepare_boot_cpu(void);
+void smp_prepare_cpus_common(void);
void native_smp_prepare_cpus(unsigned int max_cpus);
void native_smp_cpus_done(unsigned int max_cpus);
-int native_cpu_up(unsigned int cpunum);
+int common_cpu_up(unsigned int cpunum, struct task_struct *tidle);
+int native_kick_ap(unsigned int cpu, struct task_struct *tidle);
int native_cpu_disable(void);
-void native_cpu_die(unsigned int cpu);
-void native_play_dead(void);
+void __noreturn hlt_play_dead(void);
+void __noreturn native_play_dead(void);
void play_dead_common(void);
+void wbinvd_on_cpu(int cpu);
+void wbinvd_on_all_cpus(void);
+void wbinvd_on_cpus_mask(struct cpumask *cpus);
+void wbnoinvd_on_all_cpus(void);
+void wbnoinvd_on_cpus_mask(struct cpumask *cpus);
+void smp_kick_mwait_play_dead(void);
+void __noreturn mwait_play_dead(unsigned int eax_hint);
+
+void native_smp_send_reschedule(int cpu);
void native_send_call_func_ipi(const struct cpumask *mask);
void native_send_call_func_single_ipi(int cpu);
-void smp_store_cpu_info(int id);
-#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
-
-/* We don't mark CPUs online until __cpu_up(), so we need another measure */
-static inline int num_booting_cpus(void)
-{
- return cpumask_weight(cpu_callout_mask);
-}
-#endif /* CONFIG_SMP */
+asmlinkage __visible void smp_reboot_interrupt(void);
+__visible void smp_reschedule_interrupt(struct pt_regs *regs);
+__visible void smp_call_function_interrupt(struct pt_regs *regs);
+__visible void smp_call_function_single_interrupt(struct pt_regs *r);
-extern unsigned disabled_cpus __cpuinitdata;
+#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
+#define cpu_acpi_id(cpu) per_cpu(x86_cpu_to_acpiid, cpu)
-#ifdef CONFIG_X86_32_SMP
/*
* This function is needed by all SMP systems. It must _always_ be valid
- * from the initial startup. We map APIC_BASE very early in page_setup(),
- * so this is correct in the x86 case.
+ * from the initial startup.
*/
-#define raw_smp_processor_id() (percpu_read(cpu_number))
-extern int safe_smp_processor_id(void);
+#define raw_smp_processor_id() this_cpu_read(cpu_number)
+#define __smp_processor_id() __this_cpu_read(cpu_number)
-#elif defined(CONFIG_X86_64_SMP)
-#define raw_smp_processor_id() (percpu_read(cpu_number))
+static inline struct cpumask *cpu_llc_shared_mask(int cpu)
+{
+ return per_cpu(cpu_llc_shared_map, cpu);
+}
-#define stack_smp_processor_id() \
-({ \
- struct thread_info *ti; \
- __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
- ti->cpu; \
-})
-#define safe_smp_processor_id() smp_processor_id()
+static inline struct cpumask *cpu_l2c_shared_mask(int cpu)
+{
+ return per_cpu(cpu_l2c_shared_map, cpu);
+}
-#endif
+#else /* !CONFIG_SMP */
+#define wbinvd_on_cpu(cpu) wbinvd()
+static inline void wbinvd_on_all_cpus(void)
+{
+ wbinvd();
+}
+
+static inline void wbinvd_on_cpus_mask(struct cpumask *cpus)
+{
+ wbinvd();
+}
-#ifdef CONFIG_X86_LOCAL_APIC
+static inline void wbnoinvd_on_all_cpus(void)
+{
+ wbnoinvd();
+}
-#ifndef CONFIG_X86_64
-static inline int logical_smp_processor_id(void)
+static inline void wbnoinvd_on_cpus_mask(struct cpumask *cpus)
{
- /* we don't want to mark this access volatile - bad code generation */
- return GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
+ wbnoinvd();
}
+static inline struct cpumask *cpu_llc_shared_mask(int cpu)
+{
+ return (struct cpumask *)cpumask_of(0);
+}
+
+static inline void __noreturn mwait_play_dead(unsigned int eax_hint) { BUG(); }
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_DEBUG_NMI_SELFTEST
+extern void nmi_selftest(void);
+#else
+#define nmi_selftest() do { } while (0)
#endif
-extern int hard_smp_processor_id(void);
+extern unsigned int smpboot_control;
+extern unsigned long apic_mmio_base;
-#else /* CONFIG_X86_LOCAL_APIC */
+#endif /* !__ASSEMBLER__ */
-# ifndef CONFIG_SMP
-# define hard_smp_processor_id() 0
-# endif
+/* Control bits for startup_64 */
+#define STARTUP_READ_APICID 0x80000000
-#endif /* CONFIG_X86_LOCAL_APIC */
+/* Top 8 bits are reserved for control */
+#define STARTUP_PARALLEL_MASK 0xFF000000
-#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_SMP_H */
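
cpu_llc_shared_mask() and cpu_l2c_shared_mask() expose which CPUs share a cache level, and the !CONFIG_SMP stubs collapse them to cpumask_of(0) so callers need no #ifdef. A hedged sketch of walking the last-level-cache siblings of a CPU; the per-sibling work is a placeholder:

/* Illustrative only: visit every CPU that shares the LLC with @cpu. */
static void example_for_each_llc_sibling(int cpu)
{
	int sibling;

	for_each_cpu(sibling, cpu_llc_shared_mask(cpu))
		pr_debug("CPU%d shares the LLC with CPU%d\n", cpu, sibling);
}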
diff --git a/arch/x86/include/asm/smpboot_hooks.h b/arch/x86/include/asm/smpboot_hooks.h
deleted file mode 100644
index 1def60114906..000000000000
--- a/arch/x86/include/asm/smpboot_hooks.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/* two abstractions specific to kernel/smpboot.c, mainly to cater to visws
- * which needs to alter them. */
-
-static inline void smpboot_clear_io_apic_irqs(void)
-{
-#ifdef CONFIG_X86_IO_APIC
- io_apic_irqs = 0;
-#endif
-}
-
-static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
-{
- CMOS_WRITE(0xa, 0xf);
- local_flush_tlb();
- pr_debug("1.\n");
- *((volatile unsigned short *)phys_to_virt(apic->trampoline_phys_high)) =
- start_eip >> 4;
- pr_debug("2.\n");
- *((volatile unsigned short *)phys_to_virt(apic->trampoline_phys_low)) =
- start_eip & 0xf;
- pr_debug("3.\n");
-}
-
-static inline void smpboot_restore_warm_reset_vector(void)
-{
- /*
- * Install writable page 0 entry to set BIOS data area.
- */
- local_flush_tlb();
-
- /*
- * Paranoid: Set warm reset code and vector here back
- * to default values.
- */
- CMOS_WRITE(0, 0xf);
-
- *((volatile long *)phys_to_virt(apic->trampoline_phys_low)) = 0;
-}
-
-static inline void __init smpboot_setup_io_apic(void)
-{
-#ifdef CONFIG_X86_IO_APIC
- /*
- * Here we can be sure that there is an IO-APIC in the system. Let's
- * go and set it up:
- */
- if (!skip_ioapic_setup && nr_ioapics)
- setup_IO_APIC();
- else {
- nr_ioapics = 0;
- localise_nmi_watchdog();
- }
-#endif
-}
-
-static inline void smpboot_clear_io_apic(void)
-{
-#ifdef CONFIG_X86_IO_APIC
- nr_ioapics = 0;
-#endif
-}
diff --git a/arch/x86/include/asm/socket.h b/arch/x86/include/asm/socket.h
deleted file mode 100644
index ca8bf2cd0ba9..000000000000
--- a/arch/x86/include/asm/socket.h
+++ /dev/null
@@ -1,60 +0,0 @@
-#ifndef _ASM_X86_SOCKET_H
-#define _ASM_X86_SOCKET_H
-
-#include <asm/sockios.h>
-
-/* For setsockopt(2) */
-#define SOL_SOCKET 1
-
-#define SO_DEBUG 1
-#define SO_REUSEADDR 2
-#define SO_TYPE 3
-#define SO_ERROR 4
-#define SO_DONTROUTE 5
-#define SO_BROADCAST 6
-#define SO_SNDBUF 7
-#define SO_RCVBUF 8
-#define SO_SNDBUFFORCE 32
-#define SO_RCVBUFFORCE 33
-#define SO_KEEPALIVE 9
-#define SO_OOBINLINE 10
-#define SO_NO_CHECK 11
-#define SO_PRIORITY 12
-#define SO_LINGER 13
-#define SO_BSDCOMPAT 14
-/* To add :#define SO_REUSEPORT 15 */
-#define SO_PASSCRED 16
-#define SO_PEERCRED 17
-#define SO_RCVLOWAT 18
-#define SO_SNDLOWAT 19
-#define SO_RCVTIMEO 20
-#define SO_SNDTIMEO 21
-
-/* Security levels - as per NRL IPv6 - don't actually do anything */
-#define SO_SECURITY_AUTHENTICATION 22
-#define SO_SECURITY_ENCRYPTION_TRANSPORT 23
-#define SO_SECURITY_ENCRYPTION_NETWORK 24
-
-#define SO_BINDTODEVICE 25
-
-/* Socket filtering */
-#define SO_ATTACH_FILTER 26
-#define SO_DETACH_FILTER 27
-
-#define SO_PEERNAME 28
-#define SO_TIMESTAMP 29
-#define SCM_TIMESTAMP SO_TIMESTAMP
-
-#define SO_ACCEPTCONN 30
-
-#define SO_PEERSEC 31
-#define SO_PASSSEC 34
-#define SO_TIMESTAMPNS 35
-#define SCM_TIMESTAMPNS SO_TIMESTAMPNS
-
-#define SO_MARK 36
-
-#define SO_TIMESTAMPING 37
-#define SCM_TIMESTAMPING SO_TIMESTAMPING
-
-#endif /* _ASM_X86_SOCKET_H */
diff --git a/arch/x86/include/asm/sockios.h b/arch/x86/include/asm/sockios.h
deleted file mode 100644
index 49cc72b5d3c9..000000000000
--- a/arch/x86/include/asm/sockios.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef _ASM_X86_SOCKIOS_H
-#define _ASM_X86_SOCKIOS_H
-
-/* Socket-level I/O control calls. */
-#define FIOSETOWN 0x8901
-#define SIOCSPGRP 0x8902
-#define FIOGETOWN 0x8903
-#define SIOCGPGRP 0x8904
-#define SIOCATMARK 0x8905
-#define SIOCGSTAMP 0x8906 /* Get stamp (timeval) */
-#define SIOCGSTAMPNS 0x8907 /* Get stamp (timespec) */
-
-#endif /* _ASM_X86_SOCKIOS_H */
diff --git a/arch/x86/include/asm/softirq_stack.h b/arch/x86/include/asm/softirq_stack.h
new file mode 100644
index 000000000000..889d53d6a0e1
--- /dev/null
+++ b/arch/x86/include/asm/softirq_stack.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SOFTIRQ_STACK_H
+#define _ASM_X86_SOFTIRQ_STACK_H
+
+#ifdef CONFIG_X86_64
+# include <asm/irq_stack.h>
+#else
+# include <asm-generic/softirq_stack.h>
+#endif
+
+#endif
diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h
index 4517d6b93188..3918c7a434f5 100644
--- a/arch/x86/include/asm/sparsemem.h
+++ b/arch/x86/include/asm/sparsemem.h
@@ -1,6 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_SPARSEMEM_H
#define _ASM_X86_SPARSEMEM_H
+#include <linux/types.h>
+
#ifdef CONFIG_SPARSEMEM
/*
* generic non-linear memory support:
@@ -9,26 +12,23 @@
* field of the struct page
*
* SECTION_SIZE_BITS 2^n: size of each section
- * MAX_PHYSADDR_BITS 2^n: max size of physical address space
- * MAX_PHYSMEM_BITS 2^n: how much memory we can have in that space
+ * MAX_PHYSMEM_BITS 2^n: max size of physical address space
*
*/
#ifdef CONFIG_X86_32
# ifdef CONFIG_X86_PAE
# define SECTION_SIZE_BITS 29
-# define MAX_PHYSADDR_BITS 36
# define MAX_PHYSMEM_BITS 36
# else
# define SECTION_SIZE_BITS 26
-# define MAX_PHYSADDR_BITS 32
# define MAX_PHYSMEM_BITS 32
# endif
#else /* CONFIG_X86_32 */
# define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */
-# define MAX_PHYSADDR_BITS 44
-# define MAX_PHYSMEM_BITS 46
+# define MAX_PHYSMEM_BITS (pgtable_l5_enabled() ? 52 : 46)
#endif
#endif /* CONFIG_SPARSEMEM */
+
#endif /* _ASM_X86_SPARSEMEM_H */
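
For a sense of scale, derived only from the constants above: each section covers 2^27 bytes (128 MiB), so a 64-bit kernel allows at most 2^(46-27) = 2^19 sections with 4-level paging and 2^(52-27) = 2^25 sections once pgtable_l5_enabled() is true.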
diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h
new file mode 100644
index 000000000000..00b7e0398210
--- /dev/null
+++ b/arch/x86/include/asm/spec-ctrl.h
@@ -0,0 +1,101 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SPECCTRL_H_
+#define _ASM_X86_SPECCTRL_H_
+
+#include <linux/thread_info.h>
+#include <asm/nospec-branch.h>
+#include <asm/msr.h>
+
+/*
+ * On VMENTER we must preserve whatever view of the SPEC_CTRL MSR
+ * the guest has, while on VMEXIT we restore the host view. This
+ * would be easier if SPEC_CTRL were architecturally maskable or
+ * shadowable for guests but this is not (currently) the case.
+ * Takes the guest's version of VIRT_SPEC_CTRL (if emulated) as a parameter,
+ * plus a flag selecting whether the guest or the host view should be applied.
+ */
+extern void x86_virt_spec_ctrl(u64 guest_virt_spec_ctrl, bool guest);
+
+/**
+ * x86_spec_ctrl_set_guest - Set speculation control registers for the guest
+ * @guest_virt_spec_ctrl: The guest controlled bits of MSR_VIRT_SPEC_CTRL
+ * (may get translated to MSR_AMD64_LS_CFG bits)
+ *
+ * Avoids writing to the MSR if the content/bits are the same.
+ */
+static inline
+void x86_spec_ctrl_set_guest(u64 guest_virt_spec_ctrl)
+{
+ x86_virt_spec_ctrl(guest_virt_spec_ctrl, true);
+}
+
+/**
+ * x86_spec_ctrl_restore_host - Restore host speculation control registers
+ * @guest_virt_spec_ctrl: The guest controlled bits of MSR_VIRT_SPEC_CTRL
+ * (may get translated to MSR_AMD64_LS_CFG bits)
+ *
+ * Avoids writing to the MSR if the content/bits are the same.
+ */
+static inline
+void x86_spec_ctrl_restore_host(u64 guest_virt_spec_ctrl)
+{
+ x86_virt_spec_ctrl(guest_virt_spec_ctrl, false);
+}
+
+/* AMD specific Speculative Store Bypass MSR data */
+extern u64 x86_amd_ls_cfg_base;
+extern u64 x86_amd_ls_cfg_ssbd_mask;
+
+static inline u64 ssbd_tif_to_spec_ctrl(u64 tifn)
+{
+ BUILD_BUG_ON(TIF_SSBD < SPEC_CTRL_SSBD_SHIFT);
+ return (tifn & _TIF_SSBD) >> (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT);
+}
+
+static inline u64 stibp_tif_to_spec_ctrl(u64 tifn)
+{
+ BUILD_BUG_ON(TIF_SPEC_IB < SPEC_CTRL_STIBP_SHIFT);
+ return (tifn & _TIF_SPEC_IB) >> (TIF_SPEC_IB - SPEC_CTRL_STIBP_SHIFT);
+}
+
+static inline unsigned long ssbd_spec_ctrl_to_tif(u64 spec_ctrl)
+{
+ BUILD_BUG_ON(TIF_SSBD < SPEC_CTRL_SSBD_SHIFT);
+ return (spec_ctrl & SPEC_CTRL_SSBD) << (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT);
+}
+
+static inline unsigned long stibp_spec_ctrl_to_tif(u64 spec_ctrl)
+{
+ BUILD_BUG_ON(TIF_SPEC_IB < SPEC_CTRL_STIBP_SHIFT);
+ return (spec_ctrl & SPEC_CTRL_STIBP) << (TIF_SPEC_IB - SPEC_CTRL_STIBP_SHIFT);
+}
+
+static inline u64 ssbd_tif_to_amd_ls_cfg(u64 tifn)
+{
+ return (tifn & _TIF_SSBD) ? x86_amd_ls_cfg_ssbd_mask : 0ULL;
+}
+
+/*
+ * This can be used in noinstr functions & should only be called in bare
+ * metal context.
+ */
+static __always_inline void __update_spec_ctrl(u64 val)
+{
+ __this_cpu_write(x86_spec_ctrl_current, val);
+ native_wrmsrq(MSR_IA32_SPEC_CTRL, val);
+}
+
+#ifdef CONFIG_SMP
+extern void speculative_store_bypass_ht_init(void);
+#else
+static inline void speculative_store_bypass_ht_init(void) { }
+#endif
+
+extern void speculation_ctrl_update(unsigned long tif);
+extern void speculation_ctrl_update_current(void);
+
+extern bool itlb_multihit_kvm_mitigation;
+
+#endif
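
The ssbd/stibp helpers above translate between thread-info flag positions and MSR_IA32_SPEC_CTRL bit positions with plain shifts. A hedged sketch of how a context-switch path might combine them; the function is illustrative, the real logic lives in arch/x86/kernel/process.c and cpu/bugs.c:

/* Illustrative only: derive and install the SPEC_CTRL value implied by a
 * task's thread-info flags, reusing the conversion helpers above. */
static void example_update_spec_ctrl_from_tif(unsigned long tifn, u64 base_spec_ctrl)
{
	u64 msrval = base_spec_ctrl;

	msrval |= ssbd_tif_to_spec_ctrl(tifn);		/* TIF_SSBD    -> SPEC_CTRL_SSBD */
	msrval |= stibp_tif_to_spec_ctrl(tifn);		/* TIF_SPEC_IB -> SPEC_CTRL_STIBP */

	__update_spec_ctrl(msrval);			/* per-CPU cache + WRMSR */
}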
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
new file mode 100644
index 000000000000..46aa2c9c1bda
--- /dev/null
+++ b/arch/x86/include/asm/special_insns.h
@@ -0,0 +1,308 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SPECIAL_INSNS_H
+#define _ASM_X86_SPECIAL_INSNS_H
+
+#ifdef __KERNEL__
+#include <asm/nops.h>
+#include <asm/processor-flags.h>
+
+#include <linux/errno.h>
+#include <linux/irqflags.h>
+#include <linux/jump_label.h>
+
+void native_write_cr0(unsigned long val);
+
+static inline unsigned long native_read_cr0(void)
+{
+ unsigned long val;
+ asm volatile("mov %%cr0,%0" : "=r" (val));
+ return val;
+}
+
+static __always_inline unsigned long native_read_cr2(void)
+{
+ unsigned long val;
+ asm volatile("mov %%cr2,%0" : "=r" (val));
+ return val;
+}
+
+static __always_inline void native_write_cr2(unsigned long val)
+{
+ asm volatile("mov %0,%%cr2": : "r" (val) : "memory");
+}
+
+static __always_inline unsigned long __native_read_cr3(void)
+{
+ unsigned long val;
+ asm volatile("mov %%cr3,%0" : "=r" (val));
+ return val;
+}
+
+static __always_inline void native_write_cr3(unsigned long val)
+{
+ asm volatile("mov %0,%%cr3": : "r" (val) : "memory");
+}
+
+static inline unsigned long native_read_cr4(void)
+{
+ unsigned long val;
+#ifdef CONFIG_X86_32
+ /*
+ * This could fault if CR4 does not exist. Non-existent CR4
+ * is functionally equivalent to CR4 == 0. Keep it simple and pretend
+ * that CR4 == 0 on CPUs that don't have CR4.
+ */
+ asm volatile("1: mov %%cr4, %0\n"
+ "2:\n"
+ _ASM_EXTABLE(1b, 2b)
+ : "=r" (val) : "0" (0));
+#else
+ /* CR4 always exists on x86_64. */
+ asm volatile("mov %%cr4,%0" : "=r" (val));
+#endif
+ return val;
+}
+
+void native_write_cr4(unsigned long val);
+
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+static inline u32 rdpkru(void)
+{
+ u32 ecx = 0;
+ u32 edx, pkru;
+
+ /*
+ * "rdpkru" instruction. Places PKRU contents in to EAX,
+ * clears EDX and requires that ecx=0.
+ */
+ asm volatile("rdpkru" : "=a" (pkru), "=d" (edx) : "c" (ecx));
+ return pkru;
+}
+
+static inline void wrpkru(u32 pkru)
+{
+ u32 ecx = 0, edx = 0;
+
+ /*
+ * "wrpkru" instruction. Loads contents in EAX to PKRU,
+ * requires that ecx = edx = 0.
+ */
+ asm volatile("wrpkru" : : "a" (pkru), "c"(ecx), "d"(edx));
+}
+
+#else
+static inline u32 rdpkru(void)
+{
+ return 0;
+}
+
+static inline void wrpkru(u32 pkru)
+{
+}
+#endif
+
+/*
+ * Write back all modified lines in all levels of cache associated with this
+ * logical processor to main memory, and then invalidate all caches. Depending
+ * on the micro-architecture, WBINVD (and WBNOINVD below) may or may not affect
+ * lower level caches associated with another logical processor that shares any
+ * level of this processor's cache hierarchy.
+ */
+static __always_inline void wbinvd(void)
+{
+ asm volatile("wbinvd" : : : "memory");
+}
+
+/* Instruction encoding provided for binutils backwards compatibility. */
+#define ASM_WBNOINVD _ASM_BYTES(0xf3,0x0f,0x09)
+
+/*
+ * Write back all modified lines in all levels of cache associated with this
+ * logical processor to main memory, but do NOT explicitly invalidate caches,
+ * i.e. leave all/most cache lines in the hierarchy in non-modified state.
+ */
+static __always_inline void wbnoinvd(void)
+{
+ /*
+ * Explicitly encode WBINVD if X86_FEATURE_WBNOINVD is unavailable even
+ * though WBNOINVD is backwards compatible (it's simply WBINVD with an
+ * ignored REP prefix), to guarantee that WBNOINVD isn't used if it
+ * needs to be avoided for any reason. For all supported usage in the
+ * kernel, WBINVD is functionally a superset of WBNOINVD.
+ */
+ alternative("wbinvd", ASM_WBNOINVD, X86_FEATURE_WBNOINVD);
+}
+
+static inline unsigned long __read_cr4(void)
+{
+ return native_read_cr4();
+}
+
+#ifdef CONFIG_PARAVIRT_XXL
+#include <asm/paravirt.h>
+#else
+
+static inline unsigned long read_cr0(void)
+{
+ return native_read_cr0();
+}
+
+static inline void write_cr0(unsigned long x)
+{
+ native_write_cr0(x);
+}
+
+static __always_inline unsigned long read_cr2(void)
+{
+ return native_read_cr2();
+}
+
+static __always_inline void write_cr2(unsigned long x)
+{
+ native_write_cr2(x);
+}
+
+/*
+ * Careful! CR3 contains more than just an address. You probably want
+ * read_cr3_pa() instead.
+ */
+static inline unsigned long __read_cr3(void)
+{
+ return __native_read_cr3();
+}
+
+static inline void write_cr3(unsigned long x)
+{
+ native_write_cr3(x);
+}
+
+static inline void __write_cr4(unsigned long x)
+{
+ native_write_cr4(x);
+}
+#endif /* CONFIG_PARAVIRT_XXL */
+
+static __always_inline void clflush(volatile void *__p)
+{
+ asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p));
+}
+
+static inline void clflushopt(volatile void *__p)
+{
+ alternative_io("ds clflush %0",
+ "clflushopt %0", X86_FEATURE_CLFLUSHOPT,
+ "+m" (*(volatile char __force *)__p));
+}
+
+static inline void clwb(volatile void *__p)
+{
+ volatile struct { char x[64]; } *p = __p;
+
+ asm_inline volatile(ALTERNATIVE_2(
+ "ds clflush %0",
+ "clflushopt %0", X86_FEATURE_CLFLUSHOPT,
+ "clwb %0", X86_FEATURE_CLWB)
+ : "+m" (*p));
+}
+
+#ifdef CONFIG_X86_USER_SHADOW_STACK
+static inline int write_user_shstk_64(u64 __user *addr, u64 val)
+{
+ asm goto("1: wrussq %[val], %[addr]\n"
+ _ASM_EXTABLE(1b, %l[fail])
+ :: [addr] "m" (*addr), [val] "r" (val)
+ :: fail);
+ return 0;
+fail:
+ return -EFAULT;
+}
+#endif /* CONFIG_X86_USER_SHADOW_STACK */
+
+#define nop() asm volatile ("nop")
+
+static __always_inline void serialize(void)
+{
+ /* Instruction opcode for SERIALIZE; supported in binutils >= 2.35. */
+ asm volatile(".byte 0xf, 0x1, 0xe8" ::: "memory");
+}
+
+/* The dst parameter must be 64-bytes aligned */
+static inline void movdir64b(void *dst, const void *src)
+{
+ const struct { char _[64]; } *__src = src;
+ struct { char _[64]; } *__dst = dst;
+
+ /*
+ * MOVDIR64B %(rdx), rax.
+ *
+ * Both __src and __dst must be memory constraints in order to tell the
+ * compiler that no other memory accesses should be reordered around
+ * this one.
+ *
+ * Also, both must be supplied as lvalues because this tells the compiler
+ * what object (and of what size) the instruction accesses, i.e. not the
+ * pointers themselves but what they point to, hence the dereferencing '*'.
+ */
+ asm volatile(".byte 0x66, 0x0f, 0x38, 0xf8, 0x02"
+ : "+m" (*__dst)
+ : "m" (*__src), "a" (__dst), "d" (__src));
+}
+
+static inline void movdir64b_io(void __iomem *dst, const void *src)
+{
+ movdir64b((void __force *)dst, src);
+}
+
+/**
+ * enqcmds - Enqueue a command in supervisor (CPL0) mode
+ * @dst: destination, in MMIO space (must be 512-bit aligned)
+ * @src: 512 bits memory operand
+ *
+ * The ENQCMDS instruction allows software to write a 512-bit command to
+ * a 512-bit-aligned special MMIO region that supports the instruction.
+ * A return status is loaded into the ZF flag in the RFLAGS register.
+ * ZF = 0 equates to success, and ZF = 1 indicates retry or error.
+ *
+ * This function issues the ENQCMDS instruction to submit data from
+ * kernel space to MMIO space, in a unit of 512 bits. Order of data access
+ * is not guaranteed, nor is a memory barrier performed afterwards. It
+ * returns 0 on success and -EAGAIN on failure.
+ *
+ * Warning: Do not use this helper unless your driver has checked that the
+ * ENQCMDS instruction is supported on the platform and the device accepts
+ * ENQCMDS.
+ */
+static inline int enqcmds(void __iomem *dst, const void *src)
+{
+ const struct { char _[64]; } *__src = src;
+ struct { char _[64]; } __iomem *__dst = dst;
+ bool zf;
+
+ /*
+ * ENQCMDS %(rdx), rax
+ *
+ * See movdir64b()'s comment on operand specification.
+ */
+ asm volatile(".byte 0xf3, 0x0f, 0x38, 0xf8, 0x02, 0x66, 0x90"
+ : "=@ccz" (zf), "+m" (*__dst)
+ : "m" (*__src), "a" (__dst), "d" (__src));
+
+ /* Submission failure is indicated via EFLAGS.ZF=1 */
+ if (zf)
+ return -EAGAIN;
+
+ return 0;
+}
+
+static __always_inline void tile_release(void)
+{
+ /*
+ * Instruction opcode for TILERELEASE; supported in binutils
+ * version >= 2.36.
+ */
+ asm volatile(".byte 0xc4, 0xe2, 0x78, 0x49, 0xc0");
+}
+
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_SPECIAL_INSNS_H */
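
enqcmds() reports device acceptance through ZF, so callers usually retry on -EAGAIN. A hedged sketch of submitting one 64-byte descriptor to a device portal; the retry budget and function name are illustrative, and the portal must be a 512-bit aligned MMIO mapping of a device that accepts ENQCMDS:

/* Illustrative only: bounded-retry submission of a cache-line sized descriptor. */
static int example_submit_desc(void __iomem *portal, const void *desc)
{
	int retries = 10;	/* arbitrary budget for the example */
	int ret;

	do {
		ret = enqcmds(portal, desc);	/* 0 on success, -EAGAIN if not accepted */
		if (ret != -EAGAIN)
			return ret;
		cpu_relax();
	} while (--retries);

	return -EBUSY;
}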
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 4e77853321db..5b6bc7016c22 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -1,218 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_SPINLOCK_H
#define _ASM_X86_SPINLOCK_H
-#include <asm/atomic.h>
-#include <asm/rwlock.h>
+#include <linux/jump_label.h>
+#include <linux/atomic.h>
#include <asm/page.h>
#include <asm/processor.h>
#include <linux/compiler.h>
#include <asm/paravirt.h>
+#include <asm/bitops.h>
+
/*
* Your basic SMP spinlocks, allowing only a single CPU anywhere
*
* Simple spin lock operations. There are two variants, one clears IRQ's
* on the local processor, one does not.
*
- * These are fair FIFO ticket locks, which are currently limited to 256
- * CPUs.
+ * These are fair FIFO ticket locks, which support up to 2^16 CPUs.
*
* (the type definitions are in asm/spinlock_types.h)
*/
-#ifdef CONFIG_X86_32
-# define LOCK_PTR_REG "a"
-# define REG_PTR_MODE "k"
-#else
-# define LOCK_PTR_REG "D"
-# define REG_PTR_MODE "q"
-#endif
+/* How long a lock should spin before we consider blocking */
+#define SPIN_THRESHOLD (1 << 15)
-#if defined(CONFIG_X86_32) && \
- (defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE))
-/*
- * On PPro SMP or if we are using OOSTORE, we use a locked operation to unlock
- * (PPro errata 66, 92)
- */
-# define UNLOCK_LOCK_PREFIX LOCK_PREFIX
-#else
-# define UNLOCK_LOCK_PREFIX
-#endif
-
-/*
- * Ticket locks are conceptually two parts, one indicating the current head of
- * the queue, and the other indicating the current tail. The lock is acquired
- * by atomically noting the tail and incrementing it by one (thus adding
- * ourself to the queue and noting our position), then waiting until the head
- * becomes equal to the the initial value of the tail.
- *
- * We use an xadd covering *both* parts of the lock, to increment the tail and
- * also load the position of the head, which takes care of memory ordering
- * issues and should be optimal for the uncontended case. Note the tail must be
- * in the high part, because a wide xadd increment of the low part would carry
- * up and contaminate the high part.
- *
- * With fewer than 2^8 possible CPUs, we can use x86's partial registers to
- * save some instructions and make the code more elegant. There really isn't
- * much between them in performance though, especially as locks are out of line.
- */
-#if (NR_CPUS < 256)
-#define TICKET_SHIFT 8
-
-static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
-{
- short inc = 0x0100;
-
- asm volatile (
- LOCK_PREFIX "xaddw %w0, %1\n"
- "1:\t"
- "cmpb %h0, %b0\n\t"
- "je 2f\n\t"
- "rep ; nop\n\t"
- "movb %1, %b0\n\t"
- /* don't need lfence here, because loads are in-order */
- "jmp 1b\n"
- "2:"
- : "+Q" (inc), "+m" (lock->slock)
- :
- : "memory", "cc");
-}
-
-static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
-{
- int tmp, new;
-
- asm volatile("movzwl %2, %0\n\t"
- "cmpb %h0,%b0\n\t"
- "leal 0x100(%" REG_PTR_MODE "0), %1\n\t"
- "jne 1f\n\t"
- LOCK_PREFIX "cmpxchgw %w1,%2\n\t"
- "1:"
- "sete %b1\n\t"
- "movzbl %b1,%0\n\t"
- : "=&a" (tmp), "=&q" (new), "+m" (lock->slock)
- :
- : "memory", "cc");
-
- return tmp;
-}
-
-static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
-{
- asm volatile(UNLOCK_LOCK_PREFIX "incb %0"
- : "+m" (lock->slock)
- :
- : "memory", "cc");
-}
-#else
-#define TICKET_SHIFT 16
-
-static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
-{
- int inc = 0x00010000;
- int tmp;
-
- asm volatile(LOCK_PREFIX "xaddl %0, %1\n"
- "movzwl %w0, %2\n\t"
- "shrl $16, %0\n\t"
- "1:\t"
- "cmpl %0, %2\n\t"
- "je 2f\n\t"
- "rep ; nop\n\t"
- "movzwl %1, %2\n\t"
- /* don't need lfence here, because loads are in-order */
- "jmp 1b\n"
- "2:"
- : "+r" (inc), "+m" (lock->slock), "=&r" (tmp)
- :
- : "memory", "cc");
-}
-
-static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
-{
- int tmp;
- int new;
-
- asm volatile("movl %2,%0\n\t"
- "movl %0,%1\n\t"
- "roll $16, %0\n\t"
- "cmpl %0,%1\n\t"
- "leal 0x00010000(%" REG_PTR_MODE "0), %1\n\t"
- "jne 1f\n\t"
- LOCK_PREFIX "cmpxchgl %1,%2\n\t"
- "1:"
- "sete %b1\n\t"
- "movzbl %b1,%0\n\t"
- : "=&a" (tmp), "=&q" (new), "+m" (lock->slock)
- :
- : "memory", "cc");
-
- return tmp;
-}
-
-static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
-{
- asm volatile(UNLOCK_LOCK_PREFIX "incw %0"
- : "+m" (lock->slock)
- :
- : "memory", "cc");
-}
-#endif
-
-static inline int __ticket_spin_is_locked(raw_spinlock_t *lock)
-{
- int tmp = ACCESS_ONCE(lock->slock);
-
- return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1));
-}
-
-static inline int __ticket_spin_is_contended(raw_spinlock_t *lock)
-{
- int tmp = ACCESS_ONCE(lock->slock);
-
- return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1;
-}
-
-#ifndef CONFIG_PARAVIRT_SPINLOCKS
-
-static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
-{
- return __ticket_spin_is_locked(lock);
-}
-
-static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
-{
- return __ticket_spin_is_contended(lock);
-}
-#define __raw_spin_is_contended __raw_spin_is_contended
-
-static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
-{
- __ticket_spin_lock(lock);
-}
-
-static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
-{
- return __ticket_spin_trylock(lock);
-}
-
-static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
-{
- __ticket_spin_unlock(lock);
-}
-
-static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
- unsigned long flags)
-{
- __raw_spin_lock(lock);
-}
-
-#endif /* CONFIG_PARAVIRT_SPINLOCKS */
-
-static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
-{
- while (__raw_spin_is_locked(lock))
- cpu_relax();
-}
+#include <asm/qspinlock.h>
/*
* Read-write spinlocks, allowing multiple readers
@@ -224,86 +36,10 @@ static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
* irq-safe write-lock, but readers can get non-irqsafe
* read-locks.
*
- * On x86, we implement read-write locks as a 32-bit counter
- * with the high bit (sign) being the "contended" bit.
+ * On x86, we implement read-write locks using the generic qrwlock with
+ * x86 specific optimization.
*/
-/**
- * read_can_lock - would read_trylock() succeed?
- * @lock: the rwlock in question.
- */
-static inline int __raw_read_can_lock(raw_rwlock_t *lock)
-{
- return (int)(lock)->lock > 0;
-}
-
-/**
- * write_can_lock - would write_trylock() succeed?
- * @lock: the rwlock in question.
- */
-static inline int __raw_write_can_lock(raw_rwlock_t *lock)
-{
- return (lock)->lock == RW_LOCK_BIAS;
-}
-
-static inline void __raw_read_lock(raw_rwlock_t *rw)
-{
- asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t"
- "jns 1f\n"
- "call __read_lock_failed\n\t"
- "1:\n"
- ::LOCK_PTR_REG (rw) : "memory");
-}
-
-static inline void __raw_write_lock(raw_rwlock_t *rw)
-{
- asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t"
- "jz 1f\n"
- "call __write_lock_failed\n\t"
- "1:\n"
- ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory");
-}
-
-static inline int __raw_read_trylock(raw_rwlock_t *lock)
-{
- atomic_t *count = (atomic_t *)lock;
-
- if (atomic_dec_return(count) >= 0)
- return 1;
- atomic_inc(count);
- return 0;
-}
-
-static inline int __raw_write_trylock(raw_rwlock_t *lock)
-{
- atomic_t *count = (atomic_t *)lock;
-
- if (atomic_sub_and_test(RW_LOCK_BIAS, count))
- return 1;
- atomic_add(RW_LOCK_BIAS, count);
- return 0;
-}
-
-static inline void __raw_read_unlock(raw_rwlock_t *rw)
-{
- asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory");
-}
-
-static inline void __raw_write_unlock(raw_rwlock_t *rw)
-{
- asm volatile(LOCK_PREFIX "addl %1, %0"
- : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory");
-}
-
-#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock)
-#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock)
-
-#define _raw_spin_relax(lock) cpu_relax()
-#define _raw_read_relax(lock) cpu_relax()
-#define _raw_write_relax(lock) cpu_relax()
-
-/* The {read|write|spin}_lock() on x86 are full memory barriers. */
-static inline void smp_mb__after_lock(void) { }
-#define ARCH_HAS_SMP_MB_AFTER_LOCK
+#include <asm/qrwlock.h>
#endif /* _ASM_X86_SPINLOCK_H */
diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
index 845f81c87091..323db6c5852a 100644
--- a/arch/x86/include/asm/spinlock_types.h
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -1,20 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_SPINLOCK_TYPES_H
#define _ASM_X86_SPINLOCK_TYPES_H
-#ifndef __LINUX_SPINLOCK_TYPES_H
-# error "please don't include this file directly"
-#endif
-
-typedef struct raw_spinlock {
- unsigned int slock;
-} raw_spinlock_t;
-
-#define __RAW_SPIN_LOCK_UNLOCKED { 0 }
-
-typedef struct {
- unsigned int lock;
-} raw_rwlock_t;
-
-#define __RAW_RW_LOCK_UNLOCKED { RW_LOCK_BIAS }
+#include <linux/types.h>
+#include <asm-generic/qspinlock_types.h>
+#include <asm-generic/qrwlock_types.h>
#endif /* _ASM_X86_SPINLOCK_TYPES_H */
diff --git a/arch/x86/include/asm/srat.h b/arch/x86/include/asm/srat.h
deleted file mode 100644
index b508d639d1a7..000000000000
--- a/arch/x86/include/asm/srat.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Some of the code in this file has been gleaned from the 64 bit
- * discontigmem support code base.
- *
- * Copyright (C) 2002, IBM Corp.
- *
- * All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to Pat Gaughen <gone@us.ibm.com>
- */
-
-#ifndef _ASM_X86_SRAT_H
-#define _ASM_X86_SRAT_H
-
-#ifdef CONFIG_ACPI_NUMA
-extern int get_memcfg_from_srat(void);
-#else
-static inline int get_memcfg_from_srat(void)
-{
- return 0;
-}
-#endif
-
-#endif /* _ASM_X86_SRAT_H */
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
index c2d742c6e15f..cd761b14eb02 100644
--- a/arch/x86/include/asm/stackprotector.h
+++ b/arch/x86/include/asm/stackprotector.h
@@ -1,124 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* GCC stack protector support.
*
- * Stack protector works by putting predefined pattern at the start of
+ * Stack protector works by putting a predefined pattern at the start of
* the stack frame and verifying that it hasn't been overwritten when
- * returning from the function. The pattern is called stack canary
- * and unfortunately gcc requires it to be at a fixed offset from %gs.
- * On x86_64, the offset is 40 bytes and on x86_32 20 bytes. x86_64
- * and x86_32 use segment registers differently and thus handles this
- * requirement differently.
- *
- * On x86_64, %gs is shared by percpu area and stack canary. All
- * percpu symbols are zero based and %gs points to the base of percpu
- * area. The first occupant of the percpu area is always
- * irq_stack_union which contains stack_canary at offset 40. Userland
- * %gs is always saved and restored on kernel entry and exit using
- * swapgs, so stack protector doesn't add any complexity there.
- *
- * On x86_32, it's slightly more complicated. As in x86_64, %gs is
- * used for userland TLS. Unfortunately, some processors are much
- * slower at loading segment registers with different value when
- * entering and leaving the kernel, so the kernel uses %fs for percpu
- * area and manages %gs lazily so that %gs is switched only when
- * necessary, usually during task switch.
- *
- * As gcc requires the stack canary at %gs:20, %gs can't be managed
- * lazily if stack protector is enabled, so the kernel saves and
- * restores userland %gs on kernel entry and exit. This behavior is
- * controlled by CONFIG_X86_32_LAZY_GS and accessors are defined in
- * system.h to hide the details.
+ * returning from the function. The pattern is called the stack canary
+ * and is a unique value for each task.
*/
#ifndef _ASM_STACKPROTECTOR_H
#define _ASM_STACKPROTECTOR_H 1
-#ifdef CONFIG_CC_STACKPROTECTOR
+#ifdef CONFIG_STACKPROTECTOR
#include <asm/tsc.h>
#include <asm/processor.h>
#include <asm/percpu.h>
-#include <asm/system.h>
#include <asm/desc.h>
-#include <linux/random.h>
-/*
- * 24 byte read-only segment initializer for stack canary. Linker
- * can't handle the address bit shifting. Address will be set in
- * head_32 for boot CPU and setup_per_cpu_areas() for others.
- */
-#define GDT_STACK_CANARY_INIT \
- [GDT_ENTRY_STACK_CANARY] = { { { 0x00000018, 0x00409000 } } },
+#include <linux/sched.h>
+
+DECLARE_PER_CPU_CACHE_HOT(unsigned long, __stack_chk_guard);
/*
* Initialize the stackprotector canary value.
*
- * NOTE: this must only be called from functions that never return,
+ * NOTE: this must only be called from functions that never return
* and it must always be inlined.
+ *
+ * In addition, it should be called from a compilation unit for which
+ * stack protector is disabled. Alternatively, the caller should not end
+ * with a function call which gets tail-call optimized as that would
+ * lead to checking a modified canary value.
*/
static __always_inline void boot_init_stack_canary(void)
{
- u64 canary;
- u64 tsc;
-
-#ifdef CONFIG_X86_64
- BUILD_BUG_ON(offsetof(union irq_stack_union, stack_canary) != 40);
-#endif
- /*
- * We both use the random pool and the current TSC as a source
- * of randomness. The TSC only matters for very early init,
- * there it already has some randomness on most systems. Later
- * on during the bootup the random pool has true entropy too.
- */
- get_random_bytes(&canary, sizeof(canary));
- tsc = __native_read_tsc();
- canary += tsc + (tsc << 32UL);
+ unsigned long canary = get_random_canary();
current->stack_canary = canary;
-#ifdef CONFIG_X86_64
- percpu_write(irq_stack_union.stack_canary, canary);
-#else
- percpu_write(stack_canary, canary);
-#endif
+ this_cpu_write(__stack_chk_guard, canary);
}
-static inline void setup_stack_canary_segment(int cpu)
+static inline void cpu_init_stack_canary(int cpu, struct task_struct *idle)
{
-#ifdef CONFIG_X86_32
- unsigned long canary = (unsigned long)&per_cpu(stack_canary, cpu) - 20;
- struct desc_struct *gdt_table = get_cpu_gdt_table(cpu);
- struct desc_struct desc;
-
- desc = gdt_table[GDT_ENTRY_STACK_CANARY];
- desc.base0 = canary & 0xffff;
- desc.base1 = (canary >> 16) & 0xff;
- desc.base2 = (canary >> 24) & 0xff;
- write_gdt_entry(gdt_table, GDT_ENTRY_STACK_CANARY, &desc, DESCTYPE_S);
-#endif
-}
-
-static inline void load_stack_canary_segment(void)
-{
-#ifdef CONFIG_X86_32
- asm("mov %0, %%gs" : : "r" (__KERNEL_STACK_CANARY) : "memory");
-#endif
+ per_cpu(__stack_chk_guard, cpu) = idle->stack_canary;
}
-#else /* CC_STACKPROTECTOR */
-
-#define GDT_STACK_CANARY_INIT
+#else /* STACKPROTECTOR */
/* dummy boot_init_stack_canary() is defined in linux/stackprotector.h */
-static inline void setup_stack_canary_segment(int cpu)
+static inline void cpu_init_stack_canary(int cpu, struct task_struct *idle)
{ }
-static inline void load_stack_canary_segment(void)
-{
-#ifdef CONFIG_X86_32
- asm volatile ("mov %0, %%gs" : : "r" (0));
-#endif
-}
-
-#endif /* CC_STACKPROTECTOR */
+#endif /* STACKPROTECTOR */
#endif /* _ASM_STACKPROTECTOR_H */
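For context on the change above: the canary no longer lives at a fixed %gs offset; the compiler is instead pointed at the per-CPU variable __stack_chk_guard. The sketch below is a user-space analogue of the check GCC emits with -fstack-protector; the names are illustrative only, not kernel symbols.

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* stand-in for the per-task/per-CPU guard value (illustrative only) */
static uintptr_t demo_stack_chk_guard = 0xbadc0ffeUL;

static void demo_copy_input(const char *src)
{
        uintptr_t canary = demo_stack_chk_guard;        /* prologue: stash guard */
        char buf[32];

        strncpy(buf, src, sizeof(buf) - 1);
        buf[sizeof(buf) - 1] = '\0';

        if (canary != demo_stack_chk_guard)             /* epilogue: verify */
                abort();                                /* i.e. __stack_chk_fail() */
}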
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index cf86a5e73815..3881b5333eb8 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -1,23 +1,114 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
+ */
+
#ifndef _ASM_X86_STACKTRACE_H
#define _ASM_X86_STACKTRACE_H
-extern int kstack_depth_to_print;
+#include <linux/uaccess.h>
+#include <linux/ptrace.h>
-int x86_is_stack_id(int id, char *name);
+#include <asm/cpu_entry_area.h>
+#include <asm/switch_to.h>
-/* Generic stack tracer with callbacks */
+enum stack_type {
+ STACK_TYPE_UNKNOWN,
+ STACK_TYPE_TASK,
+ STACK_TYPE_IRQ,
+ STACK_TYPE_SOFTIRQ,
+ STACK_TYPE_ENTRY,
+ STACK_TYPE_EXCEPTION,
+ STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1,
+};
-struct stacktrace_ops {
- void (*warning)(void *data, char *msg);
- /* msg must contain %s for the symbol */
- void (*warning_symbol)(void *data, char *msg, unsigned long symbol);
- void (*address)(void *data, unsigned long address, int reliable);
- /* On negative return stop dumping */
- int (*stack)(void *data, char *name);
+struct stack_info {
+ enum stack_type type;
+ unsigned long *begin, *end, *next_sp;
};
-void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data);
+bool in_task_stack(unsigned long *stack, struct task_struct *task,
+ struct stack_info *info);
+
+bool in_entry_stack(unsigned long *stack, struct stack_info *info);
+
+int get_stack_info(unsigned long *stack, struct task_struct *task,
+ struct stack_info *info, unsigned long *visit_mask);
+bool get_stack_info_noinstr(unsigned long *stack, struct task_struct *task,
+ struct stack_info *info);
+
+static __always_inline
+bool get_stack_guard_info(unsigned long *stack, struct stack_info *info)
+{
+ /* make sure it's not in the stack proper */
+ if (get_stack_info_noinstr(stack, current, info))
+ return false;
+ /* but if it is in the page below it, we hit a guard */
+ return get_stack_info_noinstr((void *)stack + PAGE_SIZE, current, info);
+}
+
+const char *stack_type_name(enum stack_type type);
+
+static inline bool on_stack(struct stack_info *info, void *addr, size_t len)
+{
+ void *begin = info->begin;
+ void *end = info->end;
+
+ return (info->type != STACK_TYPE_UNKNOWN &&
+ addr >= begin && addr < end &&
+ addr + len > begin && addr + len <= end);
+}
+
+#ifdef CONFIG_X86_32
+#define STACKSLOTS_PER_LINE 8
+#else
+#define STACKSLOTS_PER_LINE 4
+#endif
+
+#ifdef CONFIG_FRAME_POINTER
+static inline unsigned long *
+get_frame_pointer(struct task_struct *task, struct pt_regs *regs)
+{
+ if (regs)
+ return (unsigned long *)regs->bp;
+
+ if (task == current)
+ return __builtin_frame_address(0);
+
+ return &((struct inactive_task_frame *)task->thread.sp)->bp;
+}
+#else
+static inline unsigned long *
+get_frame_pointer(struct task_struct *task, struct pt_regs *regs)
+{
+ return NULL;
+}
+#endif /* CONFIG_FRAME_POINTER */
+
+static inline unsigned long *
+get_stack_pointer(struct task_struct *task, struct pt_regs *regs)
+{
+ if (regs)
+ return (unsigned long *)regs->sp;
+
+ if (task == current)
+ return __builtin_frame_address(0);
+
+ return (unsigned long *)task->thread.sp;
+}
+
+/* The form of the top of the frame on the stack */
+struct stack_frame {
+ struct stack_frame *next_frame;
+ unsigned long return_address;
+};
+
+struct stack_frame_ia32 {
+ u32 next_frame;
+ u32 return_address;
+};
+void show_opcodes(struct pt_regs *regs, const char *loglvl);
+void show_ip(struct pt_regs *regs, const char *loglvl);
#endif /* _ASM_X86_STACKTRACE_H */
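A minimal sketch of how the pieces above fit together when CONFIG_FRAME_POINTER is set: start from get_frame_pointer(), then follow the saved frame-pointer chain of struct stack_frame entries while on_stack() confirms each frame still lies inside the region described by stack_info. The real unwinder (arch/x86/kernel/unwind_frame.c) adds far more validation; demo_walk_frames() is a made-up name.

static void demo_walk_frames(struct task_struct *task, struct pt_regs *regs,
                             struct stack_info *info)
{
        struct stack_frame *frame;

        frame = (struct stack_frame *)get_frame_pointer(task, regs);

        while (frame && on_stack(info, frame, sizeof(*frame))) {
                printk("  return address %pS\n", (void *)frame->return_address);
                frame = frame->next_frame;      /* follow the saved frame pointer */
        }
}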
diff --git a/arch/x86/include/asm/static_call.h b/arch/x86/include/asm/static_call.h
new file mode 100644
index 000000000000..4cd725a8fe91
--- /dev/null
+++ b/arch/x86/include/asm/static_call.h
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_STATIC_CALL_H
+#define _ASM_STATIC_CALL_H
+
+#include <asm/text-patching.h>
+
+/*
+ * For CONFIG_HAVE_STATIC_CALL_INLINE, this is a temporary trampoline which
+ * uses the current value of the key->func pointer to do an indirect jump to
+ * the function. This trampoline is only used during boot, before the call
+ * sites get patched by static_call_update(). The name of this trampoline has
+ * a magical aspect: objtool uses it to find static call sites so it can create
+ * the .static_call_sites section.
+ *
+ * For CONFIG_HAVE_STATIC_CALL, this is a permanent trampoline which
+ * does a direct jump to the function. The direct jump gets patched by
+ * static_call_update().
+ *
+ * Having the trampoline in a special section forces GCC to emit a JMP.d32 when
+ * it does tail-call optimization on the call; since you cannot compute the
+ * relative displacement across sections.
+ */
+
+/*
+ * The trampoline is 8 bytes and of the general form:
+ *
+ * jmp.d32 \func
+ * ud1 %esp, %ecx
+ *
+ * That trailing #UD both provides a speculation stop and serves as a unique
+ * 3 byte signature identifying static call trampolines. Also see tramp_ud[]
+ * and __static_call_fixup().
+ */
+#define __ARCH_DEFINE_STATIC_CALL_TRAMP(name, insns) \
+ asm(".pushsection .static_call.text, \"ax\" \n" \
+ ".align 4 \n" \
+ ".globl " STATIC_CALL_TRAMP_STR(name) " \n" \
+ STATIC_CALL_TRAMP_STR(name) ": \n" \
+ ANNOTATE_NOENDBR " \n" \
+ insns " \n" \
+ ".byte 0x0f, 0xb9, 0xcc \n" \
+ ".type " STATIC_CALL_TRAMP_STR(name) ", @function \n" \
+ ".size " STATIC_CALL_TRAMP_STR(name) ", . - " STATIC_CALL_TRAMP_STR(name) " \n" \
+ ".popsection \n")
+
+#define ARCH_DEFINE_STATIC_CALL_TRAMP(name, func) \
+ __ARCH_DEFINE_STATIC_CALL_TRAMP(name, ".byte 0xe9; .long " #func " - (. + 4)")
+
+#ifdef CONFIG_MITIGATION_RETHUNK
+#define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name) \
+ __ARCH_DEFINE_STATIC_CALL_TRAMP(name, "jmp __x86_return_thunk")
+#else
+#define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name) \
+ __ARCH_DEFINE_STATIC_CALL_TRAMP(name, "ret; int3; nop; nop; nop")
+#endif
+
+#define ARCH_DEFINE_STATIC_CALL_RET0_TRAMP(name) \
+ ARCH_DEFINE_STATIC_CALL_TRAMP(name, __static_call_return0)
+
+#define ARCH_ADD_TRAMP_KEY(name) \
+ asm(".pushsection .static_call_tramp_key, \"a\" \n" \
+ ".long " STATIC_CALL_TRAMP_STR(name) " - . \n" \
+ ".long " STATIC_CALL_KEY_STR(name) " - . \n" \
+ ".popsection \n")
+
+extern bool __static_call_fixup(void *tramp, u8 op, void *dest);
+
+extern void __static_call_update_early(void *tramp, void *func);
+
+#define static_call_update_early(name, _func) \
+({ \
+ typeof(&STATIC_CALL_TRAMP(name)) __F = (_func); \
+ if (static_call_initialized) { \
+ __static_call_update(&STATIC_CALL_KEY(name), \
+ STATIC_CALL_TRAMP_ADDR(name), __F);\
+ } else { \
+ WRITE_ONCE(STATIC_CALL_KEY(name).func, _func); \
+ __static_call_update_early(STATIC_CALL_TRAMP_ADDR(name),\
+ __F); \
+ } \
+})
+
+#endif /* _ASM_STATIC_CALL_H */
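The trampoline macros above back the generic static-call API in include/linux/static_call.h. A hedged usage sketch follows (the handler and key names are invented): DEFINE_STATIC_CALL() emits one of the trampolines defined here, static_call() compiles to a direct call, and static_call_update() re-patches the trampoline or the inline call sites at runtime.

#include <linux/static_call.h>

static int demo_default_handler(int cpu) { return 0; }
static int demo_fast_handler(int cpu)    { return cpu * 2; }

DEFINE_STATIC_CALL(demo_hook, demo_default_handler);    /* emits the trampoline */

static int demo_run_hook(int cpu)
{
        return static_call(demo_hook)(cpu);     /* direct call, no indirect jump */
}

static void demo_switch_handler(void)
{
        static_call_update(demo_hook, &demo_fast_handler);      /* live re-patch */
}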
diff --git a/arch/x86/include/asm/string.h b/arch/x86/include/asm/string.h
index 6dfd6d9373a0..9cb5aae7fba9 100644
--- a/arch/x86/include/asm/string.h
+++ b/arch/x86/include/asm/string.h
@@ -1,5 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_STRING_H
+#define _ASM_X86_STRING_H
+
#ifdef CONFIG_X86_32
-# include "string_32.h"
+# include <asm/string_32.h>
#else
-# include "string_64.h"
+# include <asm/string_64.h>
#endif
+
+static __always_inline void *__inline_memcpy(void *to, const void *from, size_t len)
+{
+ void *ret = to;
+
+ asm volatile("rep movsb"
+ : "+D" (to), "+S" (from), "+c" (len)
+ : : "memory");
+ return ret;
+}
+
+static __always_inline void *__inline_memset(void *s, int v, size_t n)
+{
+ void *ret = s;
+
+ asm volatile("rep stosb"
+ : "+D" (s), "+c" (n)
+ : "a" ((uint8_t)v)
+ : "memory");
+ return ret;
+}
+
+#endif /* _ASM_X86_STRING_H */
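A short note on the two helpers just added: unlike memcpy()/memset(), which the compiler may lower to a call into the out-of-line string routines, __inline_memcpy() and __inline_memset() always expand to a single inlined rep movsb / rep stosb. A hedged usage sketch (demo_clear_buffer() is a made-up name):

static void demo_clear_buffer(void *buf, size_t len)
{
        /* guaranteed to stay inline: one rep stosb, no function call emitted */
        __inline_memset(buf, 0, len);
}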
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
index c86f452256de..e9cce169bb4c 100644
--- a/arch/x86/include/asm/string_32.h
+++ b/arch/x86/include/asm/string_32.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_STRING_32_H
#define _ASM_X86_STRING_32_H
@@ -32,11 +33,11 @@ extern size_t strlen(const char *s);
static __always_inline void *__memcpy(void *to, const void *from, size_t n)
{
int d0, d1, d2;
- asm volatile("rep ; movsl\n\t"
+ asm volatile("rep movsl\n\t"
"movl %4,%%ecx\n\t"
"andl $3,%%ecx\n\t"
"jz 1f\n\t"
- "rep ; movsb\n\t"
+ "rep movsb\n\t"
"1:"
: "=&c" (d0), "=&D" (d1), "=&S" (d2)
: "0" (n / 4), "g" (n), "1" ((long)to), "2" ((long)from)
@@ -65,7 +66,6 @@ static __always_inline void *__constant_memcpy(void *to, const void *from,
case 4:
*(int *)to = *(int *)from;
return to;
-
case 3:
*(short *)to = *(short *)from;
*((char *)to + 2) = *((char *)from + 2);
@@ -89,7 +89,7 @@ static __always_inline void *__constant_memcpy(void *to, const void *from,
if (n >= 5 * 4) {
/* large block: use rep prefix */
int ecx;
- asm volatile("rep ; movsl"
+ asm volatile("rep movsl"
: "=&c" (ecx), "=&D" (edi), "=&S" (esi)
: "0" (n / 4), "1" (edi), "2" (esi)
: "memory"
@@ -143,59 +143,21 @@ static __always_inline void *__constant_memcpy(void *to, const void *from,
}
#define __HAVE_ARCH_MEMCPY
+extern void *memcpy(void *, const void *, size_t);
-#ifdef CONFIG_X86_USE_3DNOW
-
-#include <asm/mmx.h>
-
-/*
- * This CPU favours 3DNow strongly (eg AMD Athlon)
- */
-
-static inline void *__constant_memcpy3d(void *to, const void *from, size_t len)
-{
- if (len < 512)
- return __constant_memcpy(to, from, len);
- return _mmx_memcpy(to, from, len);
-}
-
-static inline void *__memcpy3d(void *to, const void *from, size_t len)
-{
- if (len < 512)
- return __memcpy(to, from, len);
- return _mmx_memcpy(to, from, len);
-}
-
-#define memcpy(t, f, n) \
- (__builtin_constant_p((n)) \
- ? __constant_memcpy3d((t), (f), (n)) \
- : __memcpy3d((t), (f), (n)))
+#ifndef CONFIG_FORTIFY_SOURCE
-#else
+#define memcpy(t, f, n) __builtin_memcpy(t, f, n)
-/*
- * No 3D Now!
- */
-
-#ifndef CONFIG_KMEMCHECK
-#define memcpy(t, f, n) \
- (__builtin_constant_p((n)) \
- ? __constant_memcpy((t), (f), (n)) \
- : __memcpy((t), (f), (n)))
-#else
-/*
- * kmemcheck becomes very happy if we use the REP instructions unconditionally,
- * because it means that we know both memory operands in advance.
- */
-#define memcpy(t, f, n) __memcpy((t), (f), (n))
-#endif
-
-#endif
+#endif /* !CONFIG_FORTIFY_SOURCE */
#define __HAVE_ARCH_MEMMOVE
void *memmove(void *dest, const void *src, size_t n);
+extern int memcmp(const void *, const void *, size_t);
+#ifndef CONFIG_FORTIFY_SOURCE
#define memcmp __builtin_memcmp
+#endif
#define __HAVE_ARCH_MEMCHR
extern void *memchr(const void *cs, int c, size_t count);
@@ -203,8 +165,7 @@ extern void *memchr(const void *cs, int c, size_t count);
static inline void *__memset_generic(void *s, char c, size_t count)
{
int d0, d1;
- asm volatile("rep\n\t"
- "stosb"
+ asm volatile("rep stosb"
: "=&c" (d0), "=&D" (d1)
: "a" (c), "1" (s), "0" (count)
: "memory");
@@ -214,29 +175,6 @@ static inline void *__memset_generic(void *s, char c, size_t count)
/* we might want to write optimized versions of these later */
#define __constant_count_memset(s, c, count) __memset_generic((s), (c), (count))
-/*
- * memset(x, 0, y) is a reasonably common thing to do, so we want to fill
- * things 32 bits at a time even when we don't know the size of the
- * area at compile-time..
- */
-static __always_inline
-void *__constant_c_memset(void *s, unsigned long c, size_t count)
-{
- int d0, d1;
- asm volatile("rep ; stosl\n\t"
- "testb $2,%b3\n\t"
- "je 1f\n\t"
- "stosw\n"
- "1:\ttestb $1,%b3\n\t"
- "je 2f\n\t"
- "stosb\n"
- "2:"
- : "=&c" (d0), "=&D" (d1)
- : "a" (c), "q" (count), "0" (count/4), "1" ((long)s)
- : "memory");
- return s;
-}
-
/* Added by Gertjan van Wingerde to make minix and sysv module work */
#define __HAVE_ARCH_STRNLEN
extern size_t strnlen(const char *s, size_t count);
@@ -245,83 +183,38 @@ extern size_t strnlen(const char *s, size_t count);
#define __HAVE_ARCH_STRSTR
extern char *strstr(const char *cs, const char *ct);
-/*
- * This looks horribly ugly, but the compiler can optimize it totally,
- * as we by now know that both pattern and count is constant..
- */
-static __always_inline
-void *__constant_c_and_count_memset(void *s, unsigned long pattern,
- size_t count)
-{
- switch (count) {
- case 0:
- return s;
- case 1:
- *(unsigned char *)s = pattern & 0xff;
- return s;
- case 2:
- *(unsigned short *)s = pattern & 0xffff;
- return s;
- case 3:
- *(unsigned short *)s = pattern & 0xffff;
- *((unsigned char *)s + 2) = pattern & 0xff;
- return s;
- case 4:
- *(unsigned long *)s = pattern;
- return s;
- }
-
-#define COMMON(x) \
- asm volatile("rep ; stosl" \
- x \
- : "=&c" (d0), "=&D" (d1) \
- : "a" (eax), "0" (count/4), "1" ((long)s) \
- : "memory")
-
- {
- int d0, d1;
-#if __GNUC__ == 4 && __GNUC_MINOR__ == 0
- /* Workaround for broken gcc 4.0 */
- register unsigned long eax asm("%eax") = pattern;
-#else
- unsigned long eax = pattern;
-#endif
-
- switch (count % 4) {
- case 0:
- COMMON("");
- return s;
- case 1:
- COMMON("\n\tstosb");
- return s;
- case 2:
- COMMON("\n\tstosw");
- return s;
- default:
- COMMON("\n\tstosw\n\tstosb");
- return s;
- }
- }
-
-#undef COMMON
-}
-
-#define __constant_c_x_memset(s, c, count) \
- (__builtin_constant_p(count) \
- ? __constant_c_and_count_memset((s), (c), (count)) \
- : __constant_c_memset((s), (c), (count)))
-
#define __memset(s, c, count) \
(__builtin_constant_p(count) \
? __constant_count_memset((s), (c), (count)) \
: __memset_generic((s), (c), (count)))
#define __HAVE_ARCH_MEMSET
-#define memset(s, c, count) \
- (__builtin_constant_p(c) \
- ? __constant_c_x_memset((s), (0x01010101UL * (unsigned char)(c)), \
- (count)) \
- : __memset((s), (c), (count)))
+extern void *memset(void *, int, size_t);
+#ifndef CONFIG_FORTIFY_SOURCE
+#define memset(s, c, count) __builtin_memset(s, c, count)
+#endif /* !CONFIG_FORTIFY_SOURCE */
+
+#define __HAVE_ARCH_MEMSET16
+static inline void *memset16(uint16_t *s, uint16_t v, size_t n)
+{
+ int d0, d1;
+ asm volatile("rep stosw"
+ : "=&c" (d0), "=&D" (d1)
+ : "a" (v), "1" (s), "0" (n)
+ : "memory");
+ return s;
+}
+
+#define __HAVE_ARCH_MEMSET32
+static inline void *memset32(uint32_t *s, uint32_t v, size_t n)
+{
+ int d0, d1;
+ asm volatile("rep stosl"
+ : "=&c" (d0), "=&D" (d1)
+ : "a" (v), "1" (s), "0" (n)
+ : "memory");
+ return s;
+}
/*
* find the first occurrence of byte 'c', or 1 past the area if none
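memset16() and memset32() above fill a buffer with a repeating 16- or 32-bit pattern and take the count in elements, not bytes. A hedged usage sketch (the buffers and patterns are illustrative):

static void demo_fill_patterns(void)
{
        u16 cells[80];
        u32 pixels[64];

        memset16(cells, 0x0720, ARRAY_SIZE(cells));         /* 80 u16 entries */
        memset32(pixels, 0x00ff00ffu, ARRAY_SIZE(pixels));  /* 64 u32 entries */
}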
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 19e2c468fc2c..4635616863f5 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -1,61 +1,77 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_STRING_64_H
#define _ASM_X86_STRING_64_H
#ifdef __KERNEL__
+#include <linux/jump_label.h>
/* Written 2002 by Andi Kleen */
-/* Only used for special circumstances. Stolen from i386/string.h */
-static __always_inline void *__inline_memcpy(void *to, const void *from, size_t n)
-{
- unsigned long d0, d1, d2;
- asm volatile("rep ; movsl\n\t"
- "testb $2,%b4\n\t"
- "je 1f\n\t"
- "movsw\n"
- "1:\ttestb $1,%b4\n\t"
- "je 2f\n\t"
- "movsb\n"
- "2:"
- : "=&c" (d0), "=&D" (d1), "=&S" (d2)
- : "0" (n / 4), "q" (n), "1" ((long)to), "2" ((long)from)
- : "memory");
- return to;
-}
-
/* Even with __builtin_ the compiler may decide to use the out of line
function. */
+#if defined(__SANITIZE_MEMORY__) && defined(__NO_FORTIFY)
+#include <linux/kmsan_string.h>
+#endif
+
#define __HAVE_ARCH_MEMCPY 1
-#ifndef CONFIG_KMEMCHECK
-#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4
extern void *memcpy(void *to, const void *from, size_t len);
-#else
extern void *__memcpy(void *to, const void *from, size_t len);
-#define memcpy(dst, src, len) \
-({ \
- size_t __len = (len); \
- void *__ret; \
- if (__builtin_constant_p(len) && __len >= 64) \
- __ret = __memcpy((dst), (src), __len); \
- else \
- __ret = __builtin_memcpy((dst), (src), __len); \
- __ret; \
-})
-#endif
-#else
-/*
- * kmemcheck becomes very happy if we use the REP instructions unconditionally,
- * because it means that we know both memory operands in advance.
- */
-#define memcpy(dst, src, len) __inline_memcpy((dst), (src), (len))
-#endif
#define __HAVE_ARCH_MEMSET
void *memset(void *s, int c, size_t n);
+void *__memset(void *s, int c, size_t n);
+KCFI_REFERENCE(__memset);
+
+/*
+ * KMSAN needs to instrument as much code as possible. Use C versions of
+ * memsetXX() from lib/string.c under KMSAN.
+ */
+#if !defined(CONFIG_KMSAN)
+#define __HAVE_ARCH_MEMSET16
+static inline void *memset16(uint16_t *s, uint16_t v, size_t n)
+{
+ const auto s0 = s;
+ asm volatile (
+ "rep stosw"
+ : "+D" (s), "+c" (n)
+ : "a" (v)
+ : "memory"
+ );
+ return s0;
+}
+
+#define __HAVE_ARCH_MEMSET32
+static inline void *memset32(uint32_t *s, uint32_t v, size_t n)
+{
+ const auto s0 = s;
+ asm volatile (
+ "rep stosl"
+ : "+D" (s), "+c" (n)
+ : "a" (v)
+ : "memory"
+ );
+ return s0;
+}
+
+#define __HAVE_ARCH_MEMSET64
+static inline void *memset64(uint64_t *s, uint64_t v, size_t n)
+{
+ const auto s0 = s;
+ asm volatile (
+ "rep stosq"
+ : "+D" (s), "+c" (n)
+ : "a" (v)
+ : "memory"
+ );
+ return s0;
+}
+#endif
#define __HAVE_ARCH_MEMMOVE
void *memmove(void *dest, const void *src, size_t count);
+void *__memmove(void *dest, const void *src, size_t count);
+KCFI_REFERENCE(__memmove);
int memcmp(const void *cs, const void *ct, size_t count);
size_t strlen(const char *s);
@@ -63,6 +79,29 @@ char *strcpy(char *dest, const char *src);
char *strcat(char *dest, const char *src);
int strcmp(const char *cs, const char *ct);
+#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
+#define __HAVE_ARCH_MEMCPY_FLUSHCACHE 1
+void __memcpy_flushcache(void *dst, const void *src, size_t cnt);
+static __always_inline void memcpy_flushcache(void *dst, const void *src, size_t cnt)
+{
+ if (__builtin_constant_p(cnt)) {
+ switch (cnt) {
+ case 4:
+ asm ("movntil %1, %0" : "=m"(*(u32 *)dst) : "r"(*(u32 *)src));
+ return;
+ case 8:
+ asm ("movntiq %1, %0" : "=m"(*(u64 *)dst) : "r"(*(u64 *)src));
+ return;
+ case 16:
+ asm ("movntiq %1, %0" : "=m"(*(u64 *)dst) : "r"(*(u64 *)src));
+ asm ("movntiq %1, %0" : "=m"(*(u64 *)(dst + 8)) : "r"(*(u64 *)(src + 8)));
+ return;
+ }
+ }
+ __memcpy_flushcache(dst, src, cnt);
+}
+#endif
+
#endif /* __KERNEL__ */
#endif /* _ASM_X86_STRING_64_H */
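memcpy_flushcache() above uses non-temporal MOVNTI stores for small constant sizes (4, 8 and 16 bytes) and falls back to __memcpy_flushcache() otherwise, so the data is pushed toward memory without lingering in the cache. A hedged usage sketch for a persistent-memory style write path (demo_copy_to_pmem() is an invented name; real users such as drivers/nvdimm pair the copy with their own ordering barrier):

static void demo_copy_to_pmem(void *pmem_dst, const void *src, size_t len)
{
        memcpy_flushcache(pmem_dst, src, len);
        wmb();  /* drain write-combining buffers before declaring the data durable */
}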
diff --git a/arch/x86/include/asm/suspend.h b/arch/x86/include/asm/suspend.h
index 9bd521fe4570..a892494ca5e4 100644
--- a/arch/x86/include/asm/suspend.h
+++ b/arch/x86/include/asm/suspend.h
@@ -1,5 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifdef CONFIG_X86_32
-# include "suspend_32.h"
+# include <asm/suspend_32.h>
#else
-# include "suspend_64.h"
+# include <asm/suspend_64.h>
#endif
+extern unsigned long restore_jump_address __visible;
+extern unsigned long jump_address_phys;
+extern unsigned long restore_cr3 __visible;
+extern unsigned long temp_pgt __visible;
+extern unsigned long relocated_restore_code __visible;
+extern int relocate_restore_code(void);
+/* Defined in hibernate_asm_32/64.S */
+extern asmlinkage __visible int restore_image(void);
diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h
index 48dcfa62ea07..e8e5aab06255 100644
--- a/arch/x86/include/asm/suspend_32.h
+++ b/arch/x86/include/asm/suspend_32.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright 2001-2002 Pavel Machek <pavel@suse.cz>
* Based on code
@@ -7,21 +8,31 @@
#define _ASM_X86_SUSPEND_32_H
#include <asm/desc.h>
-#include <asm/i387.h>
-
-static inline int arch_prepare_suspend(void) { return 0; }
+#include <asm/fpu/api.h>
+#include <asm/msr.h>
/* image of the saved processor state */
struct saved_context {
- u16 es, fs, gs, ss;
unsigned long cr0, cr2, cr3, cr4;
- struct desc_ptr gdt;
+ u64 misc_enable;
+ struct saved_msrs saved_msrs;
+ struct desc_ptr gdt_desc;
struct desc_ptr idt;
u16 ldt;
u16 tss;
unsigned long tr;
unsigned long safety;
unsigned long return_address;
+ /*
+ * On x86_32, all segment registers except gs are saved at kernel
+ * entry in pt_regs.
+ */
+ u16 gs;
+ bool misc_enable_saved;
} __attribute__((packed));
+/* routines for saving/restoring kernel state */
+extern char core_restore_code[];
+extern char restore_registers[];
+
#endif /* _ASM_X86_SUSPEND_32_H */
diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h
index 06284f42b759..b512f9665f78 100644
--- a/arch/x86/include/asm/suspend_64.h
+++ b/arch/x86/include/asm/suspend_64.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright 2001-2003 Pavel Machek <pavel@suse.cz>
* Based on code
@@ -7,46 +8,58 @@
#define _ASM_X86_SUSPEND_64_H
#include <asm/desc.h>
-#include <asm/i387.h>
-
-static inline int arch_prepare_suspend(void)
-{
- return 0;
-}
+#include <asm/fpu/api.h>
+#include <asm/msr.h>
/*
* Image of the saved processor state, used by the low level ACPI suspend to
* RAM code and by the low level hibernation code.
*
- * If you modify it, fix arch/x86/kernel/acpi/wakeup_64.S and make sure that
- * __save/__restore_processor_state(), defined in arch/x86/kernel/suspend_64.c,
- * still work as required.
+ * If you modify it, check how it is used in arch/x86/kernel/acpi/wakeup_64.S
+ * and make sure that __save/__restore_processor_state(), defined in
+ * arch/x86/power/cpu.c, still work as required.
+ *
+ * Because the structure is packed, make sure to avoid unaligned members, both
+ * for optimisation purposes and because tools like kmemleak only search for
+ * pointers that are aligned.
*/
struct saved_context {
struct pt_regs regs;
- u16 ds, es, fs, gs, ss;
- unsigned long gs_base, gs_kernel_base, fs_base;
- unsigned long cr0, cr2, cr3, cr4, cr8;
+
+ /*
+ * User CS and SS are saved in current_pt_regs(). The rest of the
+ * segment selectors need to be saved and restored here.
+ */
+ u16 ds, es, fs, gs;
+
+ /*
+ * Usermode FSBASE and GSBASE may not match the fs and gs selectors,
+ * so we save them separately. We save the kernelmode GSBASE to
+ * restore percpu access after resume.
+ */
+ unsigned long kernelmode_gs_base, usermode_gs_base, fs_base;
+
+ unsigned long cr0, cr2, cr3, cr4;
+ u64 misc_enable;
+ struct saved_msrs saved_msrs;
unsigned long efer;
- u16 gdt_pad;
- u16 gdt_limit;
- unsigned long gdt_base;
+ u16 gdt_pad; /* Unused */
+ struct desc_ptr gdt_desc;
u16 idt_pad;
- u16 idt_limit;
- unsigned long idt_base;
+ struct desc_ptr idt;
u16 ldt;
u16 tss;
unsigned long tr;
unsigned long safety;
unsigned long return_address;
+ bool misc_enable_saved;
} __attribute__((packed));
#define loaddebug(thread,register) \
set_debugreg((thread)->debugreg##register, register)
/* routines for saving/restoring kernel state */
-extern int acpi_save_state_mem(void);
-extern char core_restore_code;
-extern char restore_registers;
+extern char core_restore_code[];
+extern char restore_registers[];
#endif /* _ASM_X86_SUSPEND_64_H */
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 85574b7c1bc1..56aa99503dc4 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -1,8 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __SVM_H
#define __SVM_H
+#include <uapi/asm/svm.h>
+#include <uapi/asm/kvm.h>
+
+#include <hyperv/hvhdk.h>
+
+/*
+ * 32-bit intercept words in the VMCB Control Area, starting
+ * at Byte offset 000h.
+ */
+
+enum intercept_words {
+ INTERCEPT_CR = 0,
+ INTERCEPT_DR,
+ INTERCEPT_EXCEPTION,
+ INTERCEPT_WORD3,
+ INTERCEPT_WORD4,
+ INTERCEPT_WORD5,
+ MAX_INTERCEPT,
+};
+
enum {
- INTERCEPT_INTR,
+ /* Byte offset 000h (word 0) */
+ INTERCEPT_CR0_READ = 0,
+ INTERCEPT_CR3_READ = 3,
+ INTERCEPT_CR4_READ = 4,
+ INTERCEPT_CR8_READ = 8,
+ INTERCEPT_CR0_WRITE = 16,
+ INTERCEPT_CR3_WRITE = 16 + 3,
+ INTERCEPT_CR4_WRITE = 16 + 4,
+ INTERCEPT_CR8_WRITE = 16 + 8,
+ /* Byte offset 004h (word 1) */
+ INTERCEPT_DR0_READ = 32,
+ INTERCEPT_DR1_READ,
+ INTERCEPT_DR2_READ,
+ INTERCEPT_DR3_READ,
+ INTERCEPT_DR4_READ,
+ INTERCEPT_DR5_READ,
+ INTERCEPT_DR6_READ,
+ INTERCEPT_DR7_READ,
+ INTERCEPT_DR0_WRITE = 48,
+ INTERCEPT_DR1_WRITE,
+ INTERCEPT_DR2_WRITE,
+ INTERCEPT_DR3_WRITE,
+ INTERCEPT_DR4_WRITE,
+ INTERCEPT_DR5_WRITE,
+ INTERCEPT_DR6_WRITE,
+ INTERCEPT_DR7_WRITE,
+ /* Byte offset 008h (word 2) */
+ INTERCEPT_EXCEPTION_OFFSET = 64,
+ /* Byte offset 00Ch (word 3) */
+ INTERCEPT_INTR = 96,
INTERCEPT_NMI,
INTERCEPT_SMI,
INTERCEPT_INIT,
@@ -34,7 +84,8 @@ enum {
INTERCEPT_TASK_SWITCH,
INTERCEPT_FERR_FREEZE,
INTERCEPT_SHUTDOWN,
- INTERCEPT_VMRUN,
+ /* Byte offset 010h (word 4) */
+ INTERCEPT_VMRUN = 128,
INTERCEPT_VMMCALL,
INTERCEPT_VMLOAD,
INTERCEPT_VMSAVE,
@@ -47,17 +98,34 @@ enum {
INTERCEPT_MONITOR,
INTERCEPT_MWAIT,
INTERCEPT_MWAIT_COND,
+ INTERCEPT_XSETBV,
+ INTERCEPT_RDPRU,
+ TRAP_EFER_WRITE,
+ TRAP_CR0_WRITE,
+ TRAP_CR1_WRITE,
+ TRAP_CR2_WRITE,
+ TRAP_CR3_WRITE,
+ TRAP_CR4_WRITE,
+ TRAP_CR5_WRITE,
+ TRAP_CR6_WRITE,
+ TRAP_CR7_WRITE,
+ TRAP_CR8_WRITE,
+ /* Byte offset 014h (word 5) */
+ INTERCEPT_INVLPGB = 160,
+ INTERCEPT_INVLPGB_ILLEGAL,
+ INTERCEPT_INVPCID,
+ INTERCEPT_MCOMMIT,
+ INTERCEPT_TLBSYNC,
+ INTERCEPT_BUSLOCK,
+ INTERCEPT_IDLE_HLT = 166,
};
struct __attribute__ ((__packed__)) vmcb_control_area {
- u16 intercept_cr_read;
- u16 intercept_cr_write;
- u16 intercept_dr_read;
- u16 intercept_dr_write;
- u32 intercept_exceptions;
- u64 intercept;
- u8 reserved_1[44];
+ u32 intercepts[MAX_INTERCEPT];
+ u32 reserved_1[15 - MAX_INTERCEPT];
+ u16 pause_filter_thresh;
+ u16 pause_filter_count;
u64 iopm_base_pa;
u64 msrpm_base_pa;
u64 tsc_offset;
@@ -75,33 +143,87 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
u32 exit_int_info;
u32 exit_int_info_err;
u64 nested_ctl;
- u8 reserved_4[16];
+ u64 avic_vapic_bar;
+ u64 ghcb_gpa;
u32 event_inj;
u32 event_inj_err;
u64 nested_cr3;
- u64 lbr_ctl;
- u8 reserved_5[832];
+ u64 virt_ext;
+ u32 clean;
+ u32 reserved_5;
+ u64 next_rip;
+ u8 insn_len;
+ u8 insn_bytes[15];
+ u64 avic_backing_page; /* Offset 0xe0 */
+ u8 reserved_6[8]; /* Offset 0xe8 */
+ u64 avic_logical_id; /* Offset 0xf0 */
+ u64 avic_physical_id; /* Offset 0xf8 */
+ u8 reserved_7[8];
+ u64 vmsa_pa; /* Used for an SEV-ES guest */
+ u8 reserved_8[16];
+ u16 bus_lock_counter; /* Offset 0x120 */
+ u8 reserved_9[22];
+ u64 allowed_sev_features; /* Offset 0x138 */
+ u64 guest_sev_features; /* Offset 0x140 */
+ u8 reserved_10[664];
+ /*
+ * Offset 0x3e0, 32 bytes reserved
+ * for use by hypervisor/software.
+ */
+ union {
+ struct hv_vmcb_enlightenments hv_enlightenments;
+ u8 reserved_sw[32];
+ };
};
#define TLB_CONTROL_DO_NOTHING 0
#define TLB_CONTROL_FLUSH_ALL_ASID 1
+#define TLB_CONTROL_FLUSH_ASID 3
+#define TLB_CONTROL_FLUSH_ASID_LOCAL 7
#define V_TPR_MASK 0x0f
#define V_IRQ_SHIFT 8
#define V_IRQ_MASK (1 << V_IRQ_SHIFT)
+#define V_GIF_SHIFT 9
+#define V_GIF_MASK (1 << V_GIF_SHIFT)
+
+#define V_NMI_PENDING_SHIFT 11
+#define V_NMI_PENDING_MASK (1 << V_NMI_PENDING_SHIFT)
+
+#define V_NMI_BLOCKING_SHIFT 12
+#define V_NMI_BLOCKING_MASK (1 << V_NMI_BLOCKING_SHIFT)
+
#define V_INTR_PRIO_SHIFT 16
#define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT)
#define V_IGN_TPR_SHIFT 20
#define V_IGN_TPR_MASK (1 << V_IGN_TPR_SHIFT)
+#define V_IRQ_INJECTION_BITS_MASK (V_IRQ_MASK | V_INTR_PRIO_MASK | V_IGN_TPR_MASK)
+
#define V_INTR_MASKING_SHIFT 24
#define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT)
-#define SVM_INTERRUPT_SHADOW_MASK 1
+#define V_GIF_ENABLE_SHIFT 25
+#define V_GIF_ENABLE_MASK (1 << V_GIF_ENABLE_SHIFT)
+
+#define V_NMI_ENABLE_SHIFT 26
+#define V_NMI_ENABLE_MASK (1 << V_NMI_ENABLE_SHIFT)
+
+#define AVIC_ENABLE_SHIFT 31
+#define AVIC_ENABLE_MASK (1 << AVIC_ENABLE_SHIFT)
+
+#define X2APIC_MODE_SHIFT 30
+#define X2APIC_MODE_MASK (1 << X2APIC_MODE_SHIFT)
+
+#define LBR_CTL_ENABLE_MASK BIT_ULL(0)
+#define VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK BIT_ULL(1)
+
+#define SVM_INTERRUPT_SHADOW_MASK BIT_ULL(0)
+#define SVM_GUEST_INTERRUPT_MASK BIT_ULL(1)
#define SVM_IOIO_STR_SHIFT 2
#define SVM_IOIO_REP_SHIFT 3
@@ -114,14 +236,85 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT)
#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT)
-struct __attribute__ ((__packed__)) vmcb_seg {
+#define SVM_NESTED_CTL_NP_ENABLE BIT(0)
+#define SVM_NESTED_CTL_SEV_ENABLE BIT(1)
+#define SVM_NESTED_CTL_SEV_ES_ENABLE BIT(2)
+
+
+#define SVM_TSC_RATIO_RSVD 0xffffff0000000000ULL
+#define SVM_TSC_RATIO_MIN 0x0000000000000001ULL
+#define SVM_TSC_RATIO_MAX 0x000000ffffffffffULL
+#define SVM_TSC_RATIO_DEFAULT 0x0100000000ULL
+
+
+/* AVIC */
+#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFFULL)
+#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31
+#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31)
+
+/*
+ * GA_LOG_INTR is a synthetic flag that's never propagated to hardware-visible
+ * tables. GA_LOG_INTR is set if the vCPU needs device posted IRQs to generate
+ * GA log interrupts to wake the vCPU (because it's blocking or about to block).
+ */
+#define AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR BIT_ULL(61)
+
+#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK GENMASK_ULL(11, 0)
+#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK GENMASK_ULL(51, 12)
+#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62)
+#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63)
+#define AVIC_PHYSICAL_ID_TABLE_SIZE_MASK (0xFFULL)
+
+#define AVIC_DOORBELL_PHYSICAL_ID_MASK GENMASK_ULL(11, 0)
+
+#define AVIC_UNACCEL_ACCESS_WRITE_MASK 1
+#define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0
+#define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF
+
+enum avic_ipi_failure_cause {
+ AVIC_IPI_FAILURE_INVALID_INT_TYPE,
+ AVIC_IPI_FAILURE_TARGET_NOT_RUNNING,
+ AVIC_IPI_FAILURE_INVALID_TARGET,
+ AVIC_IPI_FAILURE_INVALID_BACKING_PAGE,
+ AVIC_IPI_FAILURE_INVALID_IPI_VECTOR,
+};
+
+#define AVIC_PHYSICAL_MAX_INDEX_MASK GENMASK_ULL(11, 0)
+
+/*
+ * For AVIC, the max index allowed for physical APIC ID table is 0xfe (254), as
+ * 0xff is a broadcast to all CPUs, i.e. can't be targeted individually.
+ */
+#define AVIC_MAX_PHYSICAL_ID 0XFEULL
+
+/*
+ * For x2AVIC, the max index allowed for physical APIC ID table is 0x1ff (511).
+ * With X86_FEATURE_X2AVIC_EXT, the max index is increased to 0xfff (4095).
+ */
+#define X2AVIC_MAX_PHYSICAL_ID 0x1FFUL
+#define X2AVIC_4K_MAX_PHYSICAL_ID 0xFFFUL
+
+static_assert((AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == AVIC_MAX_PHYSICAL_ID);
+static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_MAX_PHYSICAL_ID);
+static_assert((X2AVIC_4K_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_4K_MAX_PHYSICAL_ID);
+
+#define SVM_SEV_FEAT_SNP_ACTIVE BIT(0)
+#define SVM_SEV_FEAT_RESTRICTED_INJECTION BIT(3)
+#define SVM_SEV_FEAT_ALTERNATE_INJECTION BIT(4)
+#define SVM_SEV_FEAT_DEBUG_SWAP BIT(5)
+#define SVM_SEV_FEAT_SECURE_TSC BIT(9)
+
+#define VMCB_ALLOWED_SEV_FEATURES_VALID BIT_ULL(63)
+
+struct vmcb_seg {
u16 selector;
u16 attrib;
u32 limit;
u64 base;
-};
+} __packed;
-struct __attribute__ ((__packed__)) vmcb_save_area {
+/* Save area definition for legacy and SEV-MEM guests */
+struct vmcb_save_area {
struct vmcb_seg es;
struct vmcb_seg cs;
struct vmcb_seg ss;
@@ -132,11 +325,13 @@ struct __attribute__ ((__packed__)) vmcb_save_area {
struct vmcb_seg ldtr;
struct vmcb_seg idtr;
struct vmcb_seg tr;
- u8 reserved_1[43];
+ /* Reserved fields are named following their struct offset */
+ u8 reserved_0xa0[42];
+ u8 vmpl;
u8 cpl;
- u8 reserved_2[4];
+ u8 reserved_0xcc[4];
u64 efer;
- u8 reserved_3[112];
+ u8 reserved_0xd8[112];
u64 cr4;
u64 cr3;
u64 cr0;
@@ -144,9 +339,11 @@ struct __attribute__ ((__packed__)) vmcb_save_area {
u64 dr6;
u64 rflags;
u64 rip;
- u8 reserved_4[88];
+ u8 reserved_0x180[88];
u64 rsp;
- u8 reserved_5[24];
+ u64 s_cet;
+ u64 ssp;
+ u64 isst_addr;
u64 rax;
u64 star;
u64 lstar;
@@ -157,25 +354,251 @@ struct __attribute__ ((__packed__)) vmcb_save_area {
u64 sysenter_esp;
u64 sysenter_eip;
u64 cr2;
- u8 reserved_6[32];
+ u8 reserved_0x248[32];
u64 g_pat;
u64 dbgctl;
u64 br_from;
u64 br_to;
u64 last_excp_from;
u64 last_excp_to;
-};
+ u8 reserved_0x298[72];
+ u64 spec_ctrl; /* Guest version of SPEC_CTRL at 0x2E0 */
+} __packed;
-struct __attribute__ ((__packed__)) vmcb {
+/* Save area definition for SEV-ES and SEV-SNP guests */
+struct sev_es_save_area {
+ struct vmcb_seg es;
+ struct vmcb_seg cs;
+ struct vmcb_seg ss;
+ struct vmcb_seg ds;
+ struct vmcb_seg fs;
+ struct vmcb_seg gs;
+ struct vmcb_seg gdtr;
+ struct vmcb_seg ldtr;
+ struct vmcb_seg idtr;
+ struct vmcb_seg tr;
+ u64 pl0_ssp;
+ u64 pl1_ssp;
+ u64 pl2_ssp;
+ u64 pl3_ssp;
+ u64 u_cet;
+ u8 reserved_0xc8[2];
+ u8 vmpl;
+ u8 cpl;
+ u8 reserved_0xcc[4];
+ u64 efer;
+ u8 reserved_0xd8[104];
+ u64 xss;
+ u64 cr4;
+ u64 cr3;
+ u64 cr0;
+ u64 dr7;
+ u64 dr6;
+ u64 rflags;
+ u64 rip;
+ u64 dr0;
+ u64 dr1;
+ u64 dr2;
+ u64 dr3;
+ u64 dr0_addr_mask;
+ u64 dr1_addr_mask;
+ u64 dr2_addr_mask;
+ u64 dr3_addr_mask;
+ u8 reserved_0x1c0[24];
+ u64 rsp;
+ u64 s_cet;
+ u64 ssp;
+ u64 isst_addr;
+ u64 rax;
+ u64 star;
+ u64 lstar;
+ u64 cstar;
+ u64 sfmask;
+ u64 kernel_gs_base;
+ u64 sysenter_cs;
+ u64 sysenter_esp;
+ u64 sysenter_eip;
+ u64 cr2;
+ u8 reserved_0x248[32];
+ u64 g_pat;
+ u64 dbgctl;
+ u64 br_from;
+ u64 br_to;
+ u64 last_excp_from;
+ u64 last_excp_to;
+ u8 reserved_0x298[80];
+ u32 pkru;
+ u32 tsc_aux;
+ u64 tsc_scale;
+ u64 tsc_offset;
+ u8 reserved_0x300[8];
+ u64 rcx;
+ u64 rdx;
+ u64 rbx;
+ u64 reserved_0x320; /* rsp already available at 0x01d8 */
+ u64 rbp;
+ u64 rsi;
+ u64 rdi;
+ u64 r8;
+ u64 r9;
+ u64 r10;
+ u64 r11;
+ u64 r12;
+ u64 r13;
+ u64 r14;
+ u64 r15;
+ u8 reserved_0x380[16];
+ u64 guest_exit_info_1;
+ u64 guest_exit_info_2;
+ u64 guest_exit_int_info;
+ u64 guest_nrip;
+ u64 sev_features;
+ u64 vintr_ctrl;
+ u64 guest_exit_code;
+ u64 virtual_tom;
+ u64 tlb_id;
+ u64 pcpu_id;
+ u64 event_inj;
+ u64 xcr0;
+ u8 reserved_0x3f0[16];
+
+ /* Floating point area */
+ u64 x87_dp;
+ u32 mxcsr;
+ u16 x87_ftw;
+ u16 x87_fsw;
+ u16 x87_fcw;
+ u16 x87_fop;
+ u16 x87_ds;
+ u16 x87_cs;
+ u64 x87_rip;
+ u8 fpreg_x87[80];
+ u8 fpreg_xmm[256];
+ u8 fpreg_ymm[256];
+} __packed;
+
+struct ghcb_save_area {
+ u8 reserved_0x0[203];
+ u8 cpl;
+ u8 reserved_0xcc[116];
+ u64 xss;
+ u8 reserved_0x148[24];
+ u64 dr7;
+ u8 reserved_0x168[16];
+ u64 rip;
+ u8 reserved_0x180[88];
+ u64 rsp;
+ u8 reserved_0x1e0[24];
+ u64 rax;
+ u8 reserved_0x200[264];
+ u64 rcx;
+ u64 rdx;
+ u64 rbx;
+ u8 reserved_0x320[8];
+ u64 rbp;
+ u64 rsi;
+ u64 rdi;
+ u64 r8;
+ u64 r9;
+ u64 r10;
+ u64 r11;
+ u64 r12;
+ u64 r13;
+ u64 r14;
+ u64 r15;
+ u8 reserved_0x380[16];
+ u64 sw_exit_code;
+ u64 sw_exit_info_1;
+ u64 sw_exit_info_2;
+ u64 sw_scratch;
+ u8 reserved_0x3b0[56];
+ u64 xcr0;
+ u8 valid_bitmap[16];
+ u64 x87_state_gpa;
+} __packed;
+
+#define GHCB_SHARED_BUF_SIZE 2032
+
+struct ghcb {
+ struct ghcb_save_area save;
+ u8 reserved_save[2048 - sizeof(struct ghcb_save_area)];
+
+ u8 shared_buffer[GHCB_SHARED_BUF_SIZE];
+
+ u8 reserved_0xff0[10];
+ u16 protocol_version; /* negotiated SEV-ES/GHCB protocol version */
+ u32 ghcb_usage;
+} __packed;
+
+struct vmcb {
struct vmcb_control_area control;
- struct vmcb_save_area save;
-};
+ union {
+ struct vmcb_save_area save;
+
+ /*
+ * For SEV-ES VMs, the save area in the VMCB is used only to
+ * save/load host state. Guest state resides in a separate
+ * page, the aptly named VM Save Area (VMSA), that is encrypted
+ * with the guest's private key.
+ */
+ struct sev_es_save_area host_sev_es_save;
+ };
+} __packed;
+
+#define EXPECTED_VMCB_SAVE_AREA_SIZE 744
+#define EXPECTED_GHCB_SAVE_AREA_SIZE 1032
+#define EXPECTED_SEV_ES_SAVE_AREA_SIZE 1648
+#define EXPECTED_VMCB_CONTROL_AREA_SIZE 1024
+#define EXPECTED_GHCB_SIZE PAGE_SIZE
+
+#define BUILD_BUG_RESERVED_OFFSET(x, y) \
+ ASSERT_STRUCT_OFFSET(struct x, reserved ## _ ## y, y)
+
+static inline void __unused_size_checks(void)
+{
+ BUILD_BUG_ON(sizeof(struct vmcb_save_area) != EXPECTED_VMCB_SAVE_AREA_SIZE);
+ BUILD_BUG_ON(sizeof(struct ghcb_save_area) != EXPECTED_GHCB_SAVE_AREA_SIZE);
+ BUILD_BUG_ON(sizeof(struct sev_es_save_area) != EXPECTED_SEV_ES_SAVE_AREA_SIZE);
+ BUILD_BUG_ON(sizeof(struct vmcb_control_area) != EXPECTED_VMCB_CONTROL_AREA_SIZE);
+ BUILD_BUG_ON(offsetof(struct vmcb, save) != EXPECTED_VMCB_CONTROL_AREA_SIZE);
+ BUILD_BUG_ON(sizeof(struct ghcb) != EXPECTED_GHCB_SIZE);
+
+ /* Check offsets of reserved fields */
+
+ BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0xa0);
+ BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0xcc);
+ BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0xd8);
+ BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0x180);
+ BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0x248);
+ BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0x298);
+
+ BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0xc8);
+ BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0xcc);
+ BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0xd8);
+ BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x1c0);
+ BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x248);
+ BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x298);
+ BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x300);
+ BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x320);
+ BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x380);
+ BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x3f0);
+
+ BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x0);
+ BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0xcc);
+ BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x148);
+ BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x168);
+ BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x180);
+ BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x1e0);
+ BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x200);
+ BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x320);
+ BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x380);
+ BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x3b0);
+
+ BUILD_BUG_RESERVED_OFFSET(ghcb, 0xff0);
+}
-#define SVM_CPUID_FEATURE_SHIFT 2
#define SVM_CPUID_FUNC 0x8000000a
-#define SVM_VM_CR_SVM_DISABLE 4
-
#define SVM_SELECTOR_S_SHIFT 4
#define SVM_SELECTOR_DPL_SHIFT 5
#define SVM_SELECTOR_P_SHIFT 7
@@ -197,20 +620,6 @@ struct __attribute__ ((__packed__)) vmcb {
#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK
#define SVM_SELECTOR_CODE_MASK (1 << 3)
-#define INTERCEPT_CR0_MASK 1
-#define INTERCEPT_CR3_MASK (1 << 3)
-#define INTERCEPT_CR4_MASK (1 << 4)
-#define INTERCEPT_CR8_MASK (1 << 8)
-
-#define INTERCEPT_DR0_MASK 1
-#define INTERCEPT_DR1_MASK (1 << 1)
-#define INTERCEPT_DR2_MASK (1 << 2)
-#define INTERCEPT_DR3_MASK (1 << 3)
-#define INTERCEPT_DR4_MASK (1 << 4)
-#define INTERCEPT_DR5_MASK (1 << 5)
-#define INTERCEPT_DR6_MASK (1 << 6)
-#define INTERCEPT_DR7_MASK (1 << 7)
-
#define SVM_EVTINJ_VEC_MASK 0xff
#define SVM_EVTINJ_TYPE_SHIFT 8
@@ -237,89 +646,64 @@ struct __attribute__ ((__packed__)) vmcb {
#define SVM_EXITINFOSHIFT_TS_REASON_IRET 36
#define SVM_EXITINFOSHIFT_TS_REASON_JMP 38
-
-#define SVM_EXIT_READ_CR0 0x000
-#define SVM_EXIT_READ_CR3 0x003
-#define SVM_EXIT_READ_CR4 0x004
-#define SVM_EXIT_READ_CR8 0x008
-#define SVM_EXIT_WRITE_CR0 0x010
-#define SVM_EXIT_WRITE_CR3 0x013
-#define SVM_EXIT_WRITE_CR4 0x014
-#define SVM_EXIT_WRITE_CR8 0x018
-#define SVM_EXIT_READ_DR0 0x020
-#define SVM_EXIT_READ_DR1 0x021
-#define SVM_EXIT_READ_DR2 0x022
-#define SVM_EXIT_READ_DR3 0x023
-#define SVM_EXIT_READ_DR4 0x024
-#define SVM_EXIT_READ_DR5 0x025
-#define SVM_EXIT_READ_DR6 0x026
-#define SVM_EXIT_READ_DR7 0x027
-#define SVM_EXIT_WRITE_DR0 0x030
-#define SVM_EXIT_WRITE_DR1 0x031
-#define SVM_EXIT_WRITE_DR2 0x032
-#define SVM_EXIT_WRITE_DR3 0x033
-#define SVM_EXIT_WRITE_DR4 0x034
-#define SVM_EXIT_WRITE_DR5 0x035
-#define SVM_EXIT_WRITE_DR6 0x036
-#define SVM_EXIT_WRITE_DR7 0x037
-#define SVM_EXIT_EXCP_BASE 0x040
-#define SVM_EXIT_INTR 0x060
-#define SVM_EXIT_NMI 0x061
-#define SVM_EXIT_SMI 0x062
-#define SVM_EXIT_INIT 0x063
-#define SVM_EXIT_VINTR 0x064
-#define SVM_EXIT_CR0_SEL_WRITE 0x065
-#define SVM_EXIT_IDTR_READ 0x066
-#define SVM_EXIT_GDTR_READ 0x067
-#define SVM_EXIT_LDTR_READ 0x068
-#define SVM_EXIT_TR_READ 0x069
-#define SVM_EXIT_IDTR_WRITE 0x06a
-#define SVM_EXIT_GDTR_WRITE 0x06b
-#define SVM_EXIT_LDTR_WRITE 0x06c
-#define SVM_EXIT_TR_WRITE 0x06d
-#define SVM_EXIT_RDTSC 0x06e
-#define SVM_EXIT_RDPMC 0x06f
-#define SVM_EXIT_PUSHF 0x070
-#define SVM_EXIT_POPF 0x071
-#define SVM_EXIT_CPUID 0x072
-#define SVM_EXIT_RSM 0x073
-#define SVM_EXIT_IRET 0x074
-#define SVM_EXIT_SWINT 0x075
-#define SVM_EXIT_INVD 0x076
-#define SVM_EXIT_PAUSE 0x077
-#define SVM_EXIT_HLT 0x078
-#define SVM_EXIT_INVLPG 0x079
-#define SVM_EXIT_INVLPGA 0x07a
-#define SVM_EXIT_IOIO 0x07b
-#define SVM_EXIT_MSR 0x07c
-#define SVM_EXIT_TASK_SWITCH 0x07d
-#define SVM_EXIT_FERR_FREEZE 0x07e
-#define SVM_EXIT_SHUTDOWN 0x07f
-#define SVM_EXIT_VMRUN 0x080
-#define SVM_EXIT_VMMCALL 0x081
-#define SVM_EXIT_VMLOAD 0x082
-#define SVM_EXIT_VMSAVE 0x083
-#define SVM_EXIT_STGI 0x084
-#define SVM_EXIT_CLGI 0x085
-#define SVM_EXIT_SKINIT 0x086
-#define SVM_EXIT_RDTSCP 0x087
-#define SVM_EXIT_ICEBP 0x088
-#define SVM_EXIT_WBINVD 0x089
-#define SVM_EXIT_MONITOR 0x08a
-#define SVM_EXIT_MWAIT 0x08b
-#define SVM_EXIT_MWAIT_COND 0x08c
-#define SVM_EXIT_NPF 0x400
-
-#define SVM_EXIT_ERR -1
-
-#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */
-
-#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda"
-#define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8"
-#define SVM_VMSAVE ".byte 0x0f, 0x01, 0xdb"
-#define SVM_CLGI ".byte 0x0f, 0x01, 0xdd"
-#define SVM_STGI ".byte 0x0f, 0x01, 0xdc"
-#define SVM_INVLPGA ".byte 0x0f, 0x01, 0xdf"
+#define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44
+
+#define SVM_EXITINFO_REG_MASK 0x0F
+
+#define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP)
+
+/* GHCB Accessor functions */
+
+#define GHCB_BITMAP_IDX(field) \
+ (offsetof(struct ghcb_save_area, field) / sizeof(u64))
+
+#define DEFINE_GHCB_ACCESSORS(field) \
+ static __always_inline bool ghcb_##field##_is_valid(const struct ghcb *ghcb) \
+ { \
+ return test_bit(GHCB_BITMAP_IDX(field), \
+ (unsigned long *)&ghcb->save.valid_bitmap); \
+ } \
+ \
+ static __always_inline u64 ghcb_get_##field(struct ghcb *ghcb) \
+ { \
+ return ghcb->save.field; \
+ } \
+ \
+ static __always_inline u64 ghcb_get_##field##_if_valid(struct ghcb *ghcb) \
+ { \
+ return ghcb_##field##_is_valid(ghcb) ? ghcb->save.field : 0; \
+ } \
+ \
+ static __always_inline void ghcb_set_##field(struct ghcb *ghcb, u64 value) \
+ { \
+ __set_bit(GHCB_BITMAP_IDX(field), \
+ (unsigned long *)&ghcb->save.valid_bitmap); \
+ ghcb->save.field = value; \
+ }
+
+DEFINE_GHCB_ACCESSORS(cpl)
+DEFINE_GHCB_ACCESSORS(rip)
+DEFINE_GHCB_ACCESSORS(rsp)
+DEFINE_GHCB_ACCESSORS(rax)
+DEFINE_GHCB_ACCESSORS(rcx)
+DEFINE_GHCB_ACCESSORS(rdx)
+DEFINE_GHCB_ACCESSORS(rbx)
+DEFINE_GHCB_ACCESSORS(rbp)
+DEFINE_GHCB_ACCESSORS(rsi)
+DEFINE_GHCB_ACCESSORS(rdi)
+DEFINE_GHCB_ACCESSORS(r8)
+DEFINE_GHCB_ACCESSORS(r9)
+DEFINE_GHCB_ACCESSORS(r10)
+DEFINE_GHCB_ACCESSORS(r11)
+DEFINE_GHCB_ACCESSORS(r12)
+DEFINE_GHCB_ACCESSORS(r13)
+DEFINE_GHCB_ACCESSORS(r14)
+DEFINE_GHCB_ACCESSORS(r15)
+DEFINE_GHCB_ACCESSORS(sw_exit_code)
+DEFINE_GHCB_ACCESSORS(sw_exit_info_1)
+DEFINE_GHCB_ACCESSORS(sw_exit_info_2)
+DEFINE_GHCB_ACCESSORS(sw_scratch)
+DEFINE_GHCB_ACCESSORS(xcr0)
+DEFINE_GHCB_ACCESSORS(xss)
#endif
-
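One practical consequence of the restructuring above: the old per-class u16/u32/u64 intercept fields collapse into a single u32 intercepts[MAX_INTERCEPT] array, and the enum values become absolute bit numbers across that array (INTERCEPT_VMRUN == 128, i.e. bit 0 of word 4). A hedged sketch of a helper in the style of KVM's vmcb_set_intercept() (demo_set_intercept() is a made-up name):

static inline void demo_set_intercept(struct vmcb_control_area *control, u32 bit)
{
        WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
        __set_bit(bit, (unsigned long *)&control->intercepts);
}

/* e.g. demo_set_intercept(control, INTERCEPT_VMRUN) sets bit 0 of word 4 */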
diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h
deleted file mode 100644
index b9e4e20174fb..000000000000
--- a/arch/x86/include/asm/swiotlb.h
+++ /dev/null
@@ -1,22 +0,0 @@
-#ifndef _ASM_X86_SWIOTLB_H
-#define _ASM_X86_SWIOTLB_H
-
-#include <linux/swiotlb.h>
-
-/* SWIOTLB interface */
-
-extern int swiotlb_force;
-
-#ifdef CONFIG_SWIOTLB
-extern int swiotlb;
-extern void pci_swiotlb_init(void);
-#else
-#define swiotlb 0
-static inline void pci_swiotlb_init(void)
-{
-}
-#endif
-
-static inline void dma_mark_clean(void *addr, size_t size) {}
-
-#endif /* _ASM_X86_SWIOTLB_H */
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
new file mode 100644
index 000000000000..499b1c15cc8b
--- /dev/null
+++ b/arch/x86/include/asm/switch_to.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SWITCH_TO_H
+#define _ASM_X86_SWITCH_TO_H
+
+#include <linux/sched/task_stack.h>
+
+struct task_struct; /* one of the stranger aspects of C forward declarations */
+
+struct task_struct *__switch_to_asm(struct task_struct *prev,
+ struct task_struct *next);
+
+__visible struct task_struct *__switch_to(struct task_struct *prev,
+ struct task_struct *next);
+
+asmlinkage void ret_from_fork_asm(void);
+__visible void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
+ int (*fn)(void *), void *fn_arg);
+
+/*
+ * This is the structure pointed to by thread.sp for an inactive task. The
+ * order of the fields must match the code in __switch_to_asm().
+ */
+struct inactive_task_frame {
+#ifdef CONFIG_X86_64
+ unsigned long r15;
+ unsigned long r14;
+ unsigned long r13;
+ unsigned long r12;
+#else
+ unsigned long flags;
+ unsigned long si;
+ unsigned long di;
+#endif
+ unsigned long bx;
+
+ /*
+ * These two fields must be together. They form a stack frame header,
+ * needed by get_frame_pointer().
+ */
+ unsigned long bp;
+ unsigned long ret_addr;
+};
+
+struct fork_frame {
+ struct inactive_task_frame frame;
+ struct pt_regs regs;
+};
+
+#define switch_to(prev, next, last) \
+do { \
+ ((last) = __switch_to_asm((prev), (next))); \
+} while (0)
+
+#ifdef CONFIG_X86_32
+#include <asm/msr.h>
+
+static inline void refresh_sysenter_cs(struct thread_struct *thread)
+{
+ /* Only happens when SEP is enabled, no need to test "SEP"arately: */
+ if (unlikely(this_cpu_read(cpu_tss_rw.x86_tss.ss1) == thread->sysenter_cs))
+ return;
+
+ this_cpu_write(cpu_tss_rw.x86_tss.ss1, thread->sysenter_cs);
+ wrmsrq(MSR_IA32_SYSENTER_CS, thread->sysenter_cs);
+}
+#endif
+
+/* This is used when switching tasks or entering/exiting vm86 mode. */
+static inline void update_task_stack(struct task_struct *task)
+{
+ /* sp0 always points to the entry trampoline stack, which is constant: */
+#ifdef CONFIG_X86_32
+ this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
+#else
+ if (!cpu_feature_enabled(X86_FEATURE_FRED) && cpu_feature_enabled(X86_FEATURE_XENPV))
+ /* Xen PV enters the kernel on the thread stack. */
+ load_sp0(task_top_of_stack(task));
+#endif
+}
+
+static inline void kthread_frame_init(struct inactive_task_frame *frame,
+ int (*fun)(void *), void *arg)
+{
+ frame->bx = (unsigned long)fun;
+#ifdef CONFIG_X86_32
+ frame->di = (unsigned long)arg;
+#else
+ frame->r12 = (unsigned long)arg;
+#endif
+}
+
+#endif /* _ASM_X86_SWITCH_TO_H */
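To see how struct inactive_task_frame, struct fork_frame and kthread_frame_init() above cooperate, here is a hedged sketch of the frame a new kernel thread gets, so that the first __switch_to_asm() into it "returns" into ret_from_fork_asm() with the thread function staged in a callee-saved register; compare the real code in copy_thread() (arch/x86/kernel/process.c). demo_prepare_kthread() is an invented name.

static void demo_prepare_kthread(struct task_struct *p, int (*fn)(void *), void *arg)
{
        struct fork_frame *fork_frame = container_of(task_pt_regs(p),
                                                     struct fork_frame, regs);
        struct inactive_task_frame *frame = &fork_frame->frame;

        frame->bp = 0;                                  /* terminate the frame chain */
        frame->ret_addr = (unsigned long)ret_from_fork_asm;
        kthread_frame_init(frame, fn, arg);             /* fn -> bx, arg -> r12/di */

        p->thread.sp = (unsigned long)fork_frame;       /* __switch_to_asm pivots here */
}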
diff --git a/arch/x86/include/asm/sync_bitops.h b/arch/x86/include/asm/sync_bitops.h
index 9d09b4073b60..cd21a0405ac5 100644
--- a/arch/x86/include/asm/sync_bitops.h
+++ b/arch/x86/include/asm/sync_bitops.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_SYNC_BITOPS_H
#define _ASM_X86_SYNC_BITOPS_H
@@ -13,6 +14,8 @@
* bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
*/
+#include <asm/rmwcc.h>
+
#define ADDR (*(volatile long *)addr)
/**
@@ -26,9 +29,9 @@
* Note that @nr may be almost arbitrarily large; this function is not
* restricted to acting on a single-word quantity.
*/
-static inline void sync_set_bit(int nr, volatile unsigned long *addr)
+static inline void sync_set_bit(long nr, volatile unsigned long *addr)
{
- asm volatile("lock; btsl %1,%0"
+ asm volatile("lock " __ASM_SIZE(bts) " %1,%0"
: "+m" (ADDR)
: "Ir" (nr)
: "memory");
@@ -41,12 +44,12 @@ static inline void sync_set_bit(int nr, volatile unsigned long *addr)
*
* sync_clear_bit() is atomic and may not be reordered. However, it does
* not contain a memory barrier, so if it is used for locking purposes,
- * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
+ * you should call smp_mb__before_atomic() and/or smp_mb__after_atomic()
* in order to ensure changes are visible on other processors.
*/
-static inline void sync_clear_bit(int nr, volatile unsigned long *addr)
+static inline void sync_clear_bit(long nr, volatile unsigned long *addr)
{
- asm volatile("lock; btrl %1,%0"
+ asm volatile("lock " __ASM_SIZE(btr) " %1,%0"
: "+m" (ADDR)
: "Ir" (nr)
: "memory");
@@ -61,9 +64,9 @@ static inline void sync_clear_bit(int nr, volatile unsigned long *addr)
* Note that @nr may be almost arbitrarily large; this function is not
* restricted to acting on a single-word quantity.
*/
-static inline void sync_change_bit(int nr, volatile unsigned long *addr)
+static inline void sync_change_bit(long nr, volatile unsigned long *addr)
{
- asm volatile("lock; btcl %1,%0"
+ asm volatile("lock " __ASM_SIZE(btc) " %1,%0"
: "+m" (ADDR)
: "Ir" (nr)
: "memory");
@@ -77,14 +80,9 @@ static inline void sync_change_bit(int nr, volatile unsigned long *addr)
* This operation is atomic and cannot be reordered.
* It also implies a memory barrier.
*/
-static inline int sync_test_and_set_bit(int nr, volatile unsigned long *addr)
+static inline bool sync_test_and_set_bit(long nr, volatile unsigned long *addr)
{
- int oldbit;
-
- asm volatile("lock; btsl %2,%1\n\tsbbl %0,%0"
- : "=r" (oldbit), "+m" (ADDR)
- : "Ir" (nr) : "memory");
- return oldbit;
+ return GEN_BINARY_RMWcc("lock " __ASM_SIZE(bts), *addr, c, "Ir", nr);
}
/**
@@ -95,14 +93,9 @@ static inline int sync_test_and_set_bit(int nr, volatile unsigned long *addr)
* This operation is atomic and cannot be reordered.
* It also implies a memory barrier.
*/
-static inline int sync_test_and_clear_bit(int nr, volatile unsigned long *addr)
+static inline int sync_test_and_clear_bit(long nr, volatile unsigned long *addr)
{
- int oldbit;
-
- asm volatile("lock; btrl %2,%1\n\tsbbl %0,%0"
- : "=r" (oldbit), "+m" (ADDR)
- : "Ir" (nr) : "memory");
- return oldbit;
+ return GEN_BINARY_RMWcc("lock " __ASM_SIZE(btr), *addr, c, "Ir", nr);
}
/**
@@ -113,14 +106,9 @@ static inline int sync_test_and_clear_bit(int nr, volatile unsigned long *addr)
* This operation is atomic and cannot be reordered.
* It also implies a memory barrier.
*/
-static inline int sync_test_and_change_bit(int nr, volatile unsigned long *addr)
+static inline int sync_test_and_change_bit(long nr, volatile unsigned long *addr)
{
- int oldbit;
-
- asm volatile("lock; btcl %2,%1\n\tsbbl %0,%0"
- : "=r" (oldbit), "+m" (ADDR)
- : "Ir" (nr) : "memory");
- return oldbit;
+ return GEN_BINARY_RMWcc("lock " __ASM_SIZE(btc), *addr, c, "Ir", nr);
}
#define sync_test_bit(nr, addr) test_bit(nr, addr)
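The point of the sync_*() variants above is that the lock prefix is emitted unconditionally, even on !CONFIG_SMP kernels, because the other party touching the bitmap may be a hypervisor or a device rather than another CPU. A hedged usage sketch in the style of Xen event-channel handling (demo_raise_evtchn() is an invented name):

static bool demo_raise_evtchn(unsigned long *pending, unsigned int port)
{
        /* atomic even on UP: the hypervisor may read or clear this bit concurrently */
        return !sync_test_and_set_bit(port, pending);   /* true if we newly raised it */
}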
diff --git a/arch/x86/include/asm/sync_core.h b/arch/x86/include/asm/sync_core.h
new file mode 100644
index 000000000000..96bda43538ee
--- /dev/null
+++ b/arch/x86/include/asm/sync_core.h
@@ -0,0 +1,111 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SYNC_CORE_H
+#define _ASM_X86_SYNC_CORE_H
+
+#include <linux/preempt.h>
+#include <asm/processor.h>
+#include <asm/cpufeature.h>
+#include <asm/special_insns.h>
+
+#ifdef CONFIG_X86_32
+static __always_inline void iret_to_self(void)
+{
+ asm volatile (
+ "pushfl\n\t"
+ "pushl %%cs\n\t"
+ "pushl $1f\n\t"
+ "iret\n\t"
+ "1:"
+ : ASM_CALL_CONSTRAINT : : "memory");
+}
+#else
+static __always_inline void iret_to_self(void)
+{
+ unsigned int tmp;
+
+ asm volatile (
+ "mov %%ss, %0\n\t"
+ "pushq %q0\n\t"
+ "pushq %%rsp\n\t"
+ "addq $8, (%%rsp)\n\t"
+ "pushfq\n\t"
+ "mov %%cs, %0\n\t"
+ "pushq %q0\n\t"
+ "pushq $1f\n\t"
+ "iretq\n\t"
+ "1:"
+ : "=&r" (tmp), ASM_CALL_CONSTRAINT : : "cc", "memory");
+}
+#endif /* CONFIG_X86_32 */
+
+/*
+ * This function forces the icache and prefetched instruction stream to
+ * catch up with reality in two very specific cases:
+ *
+ * a) Text was modified using one virtual address and is about to be executed
+ * from the same physical page at a different virtual address.
+ *
+ * b) Text was modified on a different CPU, may subsequently be
+ * executed on this CPU, and you want to make sure the new version
+ * gets executed. This generally means you're calling this in an IPI.
+ *
+ * If you're calling this for a different reason, you're probably doing
+ * it wrong.
+ *
+ * Like all of Linux's memory ordering operations, this is a
+ * compiler barrier as well.
+ */
+static __always_inline void sync_core(void)
+{
+ /*
+ * The SERIALIZE instruction is the most straightforward way to
+ * do this, but it is not universally available.
+ */
+ if (static_cpu_has(X86_FEATURE_SERIALIZE)) {
+ serialize();
+ return;
+ }
+
+ /*
+ * For all other processors, there are quite a few ways to do this.
+ * IRET-to-self is nice because it works on every CPU, at any CPL
+ * (so it's compatible with paravirtualization), and it never exits
+ * to a hypervisor. The only downsides are that it's a bit slow
+ * (it seems to be a bit more than 2x slower than the fastest
+ * options) and that it unmasks NMIs. The "push %cs" is needed,
+ * because in paravirtual environments __KERNEL_CS may not be a
+ * valid CS value when we do IRET directly.
+ *
+ * In case NMI unmasking or performance ever becomes a problem,
+ * the next best option appears to be MOV-to-CR2 and an
+ * unconditional jump. That sequence also works on all CPUs,
+ * but it will fault at CPL3 (i.e. Xen PV).
+ *
+ * CPUID is the conventional way, but it's nasty: it doesn't
+ * exist on some 486-like CPUs, and it usually exits to a
+ * hypervisor.
+ */
+ iret_to_self();
+}
+
+/*
+ * Ensure that a core serializing instruction is issued before returning
+ * to user-mode. x86 implements return to user-space through sysexit,
+ * sysretl, and sysretq, which are not core serializing.
+ */
+static inline void sync_core_before_usermode(void)
+{
+ /* With PTI, we unconditionally serialize before running user code. */
+ if (static_cpu_has(X86_FEATURE_PTI))
+ return;
+
+ /*
+ * Even if we're in an interrupt, we might reschedule before returning,
+ * in which case we could switch to a different thread in the same mm
+ * and return using SYSRET or SYSEXIT. Instead of trying to keep
+ * track of our need to sync the core, just sync right away.
+ */
+ sync_core();
+}
+
+#endif /* _ASM_X86_SYNC_CORE_H */
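Case (b) in the sync_core() comment is the cross-modifying-code pattern used by the kernel's text-poking machinery: after instructions are patched, an IPI runs sync_core() on every CPU so stale prefetched instruction bytes are discarded. A sketch of such a caller (the production version lives in arch/x86/kernel/alternatives.c; the outer wrapper name here is illustrative):

#include <linux/smp.h>
#include <asm/sync_core.h>

static void do_sync_core(void *info)
{
	sync_core();		/* serialize this CPU's instruction stream */
}

/* After modifying kernel text, force every CPU to re-fetch it.
 * wait=1: do not return until all CPUs have serialized. */
static void sync_all_cpus_after_patch(void)
{
	on_each_cpu(do_sync_core, NULL, 1);
}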
diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h
deleted file mode 100644
index 72a6dcd1299b..000000000000
--- a/arch/x86/include/asm/sys_ia32.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * sys_ia32.h - Linux ia32 syscall interfaces
- *
- * Copyright (c) 2008 Jaswinder Singh Rajput
- *
- * This file is released under the GPLv2.
- * See the file COPYING for more details.
- */
-
-#ifndef _ASM_X86_SYS_IA32_H
-#define _ASM_X86_SYS_IA32_H
-
-#include <linux/compiler.h>
-#include <linux/linkage.h>
-#include <linux/types.h>
-#include <linux/signal.h>
-#include <asm/compat.h>
-#include <asm/ia32.h>
-
-/* ia32/sys_ia32.c */
-asmlinkage long sys32_truncate64(char __user *, unsigned long, unsigned long);
-asmlinkage long sys32_ftruncate64(unsigned int, unsigned long, unsigned long);
-
-asmlinkage long sys32_stat64(char __user *, struct stat64 __user *);
-asmlinkage long sys32_lstat64(char __user *, struct stat64 __user *);
-asmlinkage long sys32_fstat64(unsigned int, struct stat64 __user *);
-asmlinkage long sys32_fstatat(unsigned int, char __user *,
- struct stat64 __user *, int);
-struct mmap_arg_struct;
-asmlinkage long sys32_mmap(struct mmap_arg_struct __user *);
-asmlinkage long sys32_mprotect(unsigned long, size_t, unsigned long);
-
-asmlinkage long sys32_pipe(int __user *);
-struct sigaction32;
-struct old_sigaction32;
-asmlinkage long sys32_rt_sigaction(int, struct sigaction32 __user *,
- struct sigaction32 __user *, unsigned int);
-asmlinkage long sys32_sigaction(int, struct old_sigaction32 __user *,
- struct old_sigaction32 __user *);
-asmlinkage long sys32_rt_sigprocmask(int, compat_sigset_t __user *,
- compat_sigset_t __user *, unsigned int);
-asmlinkage long sys32_alarm(unsigned int);
-
-struct sel_arg_struct;
-asmlinkage long sys32_old_select(struct sel_arg_struct __user *);
-asmlinkage long sys32_waitpid(compat_pid_t, unsigned int *, int);
-asmlinkage long sys32_sysfs(int, u32, u32);
-
-asmlinkage long sys32_sched_rr_get_interval(compat_pid_t,
- struct compat_timespec __user *);
-asmlinkage long sys32_rt_sigpending(compat_sigset_t __user *, compat_size_t);
-asmlinkage long sys32_rt_sigqueueinfo(int, int, compat_siginfo_t __user *);
-
-#ifdef CONFIG_SYSCTL_SYSCALL
-struct sysctl_ia32;
-asmlinkage long sys32_sysctl(struct sysctl_ia32 __user *);
-#endif
-
-asmlinkage long sys32_pread(unsigned int, char __user *, u32, u32, u32);
-asmlinkage long sys32_pwrite(unsigned int, char __user *, u32, u32, u32);
-
-asmlinkage long sys32_personality(unsigned long);
-asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32);
-
-asmlinkage long sys32_mmap2(unsigned long, unsigned long, unsigned long,
- unsigned long, unsigned long, unsigned long);
-
-struct oldold_utsname;
-struct old_utsname;
-asmlinkage long sys32_olduname(struct oldold_utsname __user *);
-long sys32_uname(struct old_utsname __user *);
-
-asmlinkage long sys32_execve(char __user *, compat_uptr_t __user *,
- compat_uptr_t __user *, struct pt_regs *);
-asmlinkage long sys32_clone(unsigned int, unsigned int, struct pt_regs *);
-
-long sys32_lseek(unsigned int, int, unsigned int);
-long sys32_kill(int, int);
-long sys32_fadvise64_64(int, __u32, __u32, __u32, __u32, int);
-long sys32_vm86_warning(void);
-long sys32_lookup_dcookie(u32, u32, char __user *, size_t);
-
-asmlinkage ssize_t sys32_readahead(int, unsigned, unsigned, size_t);
-asmlinkage long sys32_sync_file_range(int, unsigned, unsigned,
- unsigned, unsigned, int);
-asmlinkage long sys32_fadvise64(int, unsigned, unsigned, size_t, int);
-asmlinkage long sys32_fallocate(int, int, unsigned,
- unsigned, unsigned, unsigned);
-
-/* ia32/ia32_signal.c */
-asmlinkage long sys32_sigsuspend(int, int, old_sigset_t);
-asmlinkage long sys32_sigaltstack(const stack_ia32_t __user *,
- stack_ia32_t __user *, struct pt_regs *);
-asmlinkage long sys32_sigreturn(struct pt_regs *);
-asmlinkage long sys32_rt_sigreturn(struct pt_regs *);
-
-/* ia32/ipc32.c */
-asmlinkage long sys32_ipc(u32, int, int, int, compat_uptr_t, u32);
-#endif /* _ASM_X86_SYS_IA32_H */
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index d82f39bb7905..c10dbb74cd00 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -1,11 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Access to user system call parameters and results
*
- * Copyright (C) 2008 Red Hat, Inc. All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License v.2.
+ * Copyright (C) 2008-2009 Red Hat, Inc. All rights reserved.
*
* See asm-generic/syscall.h for descriptions of what we must do here.
*/
@@ -13,19 +10,41 @@
#ifndef _ASM_X86_SYSCALL_H
#define _ASM_X86_SYSCALL_H
+#include <uapi/linux/audit.h>
#include <linux/sched.h>
#include <linux/err.h>
+#include <asm/thread_info.h> /* for TS_COMPAT */
+#include <asm/unistd.h>
+
+/* This is used purely for kernel/trace/trace_syscalls.c */
+typedef long (*sys_call_ptr_t)(const struct pt_regs *);
+extern const sys_call_ptr_t sys_call_table[];
+
+/*
+ * These may not exist, but we still declare the prototypes so that
+ * IS_ENABLED() can be used.
+ */
+extern long ia32_sys_call(const struct pt_regs *, unsigned int nr);
+extern long x32_sys_call(const struct pt_regs *, unsigned int nr);
+extern long x64_sys_call(const struct pt_regs *, unsigned int nr);
-static inline long syscall_get_nr(struct task_struct *task,
- struct pt_regs *regs)
+/*
+ * Only the low 32 bits of orig_ax are meaningful, so we return int.
+ * This importantly ignores the high bits on 64-bit, so comparisons
+ * sign-extend the low 32 bits.
+ */
+static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs)
{
- /*
- * We always sign-extend a -1 value being set here,
- * so this is always either -1L or a syscall number.
- */
return regs->orig_ax;
}
+static inline void syscall_set_nr(struct task_struct *task,
+ struct pt_regs *regs,
+ int nr)
+{
+ regs->orig_ax = nr;
+}
+
static inline void syscall_rollback(struct task_struct *task,
struct pt_regs *regs)
{
@@ -41,7 +60,7 @@ static inline long syscall_get_error(struct task_struct *task,
* TS_COMPAT is set for 32-bit syscall entries and then
* remains set until we return to user mode.
*/
- if (task_thread_info(task)->status & TS_COMPAT)
+ if (task->thread_info.status & (TS_COMPAT|TS_I386_REGS_POKED))
/*
* Sign-extend the value so (int)-EFOO becomes (long)-EFOO
* and will match correctly in comparisons.
@@ -68,146 +87,98 @@ static inline void syscall_set_return_value(struct task_struct *task,
static inline void syscall_get_arguments(struct task_struct *task,
struct pt_regs *regs,
- unsigned int i, unsigned int n,
unsigned long *args)
{
- BUG_ON(i + n > 6);
- memcpy(args, &regs->bx + i, n * sizeof(args[0]));
+ args[0] = regs->bx;
+ args[1] = regs->cx;
+ args[2] = regs->dx;
+ args[3] = regs->si;
+ args[4] = regs->di;
+ args[5] = regs->bp;
}
static inline void syscall_set_arguments(struct task_struct *task,
struct pt_regs *regs,
- unsigned int i, unsigned int n,
const unsigned long *args)
{
- BUG_ON(i + n > 6);
- memcpy(&regs->bx + i, args, n * sizeof(args[0]));
+ regs->bx = args[0];
+ regs->cx = args[1];
+ regs->dx = args[2];
+ regs->si = args[3];
+ regs->di = args[4];
+ regs->bp = args[5];
+}
+
+static inline int syscall_get_arch(struct task_struct *task)
+{
+ return AUDIT_ARCH_I386;
}
#else /* CONFIG_X86_64 */
static inline void syscall_get_arguments(struct task_struct *task,
struct pt_regs *regs,
- unsigned int i, unsigned int n,
unsigned long *args)
{
# ifdef CONFIG_IA32_EMULATION
- if (task_thread_info(task)->status & TS_COMPAT)
- switch (i) {
- case 0:
- if (!n--) break;
- *args++ = regs->bx;
- case 1:
- if (!n--) break;
- *args++ = regs->cx;
- case 2:
- if (!n--) break;
- *args++ = regs->dx;
- case 3:
- if (!n--) break;
- *args++ = regs->si;
- case 4:
- if (!n--) break;
- *args++ = regs->di;
- case 5:
- if (!n--) break;
- *args++ = regs->bp;
- case 6:
- if (!n--) break;
- default:
- BUG();
- break;
- }
- else
+ if (task->thread_info.status & TS_COMPAT) {
+ *args++ = regs->bx;
+ *args++ = regs->cx;
+ *args++ = regs->dx;
+ *args++ = regs->si;
+ *args++ = regs->di;
+ *args = regs->bp;
+ } else
# endif
- switch (i) {
- case 0:
- if (!n--) break;
- *args++ = regs->di;
- case 1:
- if (!n--) break;
- *args++ = regs->si;
- case 2:
- if (!n--) break;
- *args++ = regs->dx;
- case 3:
- if (!n--) break;
- *args++ = regs->r10;
- case 4:
- if (!n--) break;
- *args++ = regs->r8;
- case 5:
- if (!n--) break;
- *args++ = regs->r9;
- case 6:
- if (!n--) break;
- default:
- BUG();
- break;
- }
+ {
+ *args++ = regs->di;
+ *args++ = regs->si;
+ *args++ = regs->dx;
+ *args++ = regs->r10;
+ *args++ = regs->r8;
+ *args = regs->r9;
+ }
}
static inline void syscall_set_arguments(struct task_struct *task,
struct pt_regs *regs,
- unsigned int i, unsigned int n,
const unsigned long *args)
{
# ifdef CONFIG_IA32_EMULATION
- if (task_thread_info(task)->status & TS_COMPAT)
- switch (i) {
- case 0:
- if (!n--) break;
- regs->bx = *args++;
- case 1:
- if (!n--) break;
- regs->cx = *args++;
- case 2:
- if (!n--) break;
- regs->dx = *args++;
- case 3:
- if (!n--) break;
- regs->si = *args++;
- case 4:
- if (!n--) break;
- regs->di = *args++;
- case 5:
- if (!n--) break;
- regs->bp = *args++;
- case 6:
- if (!n--) break;
- default:
- BUG();
- break;
- }
- else
+ if (task->thread_info.status & TS_COMPAT) {
+ regs->bx = *args++;
+ regs->cx = *args++;
+ regs->dx = *args++;
+ regs->si = *args++;
+ regs->di = *args++;
+ regs->bp = *args;
+ } else
# endif
- switch (i) {
- case 0:
- if (!n--) break;
- regs->di = *args++;
- case 1:
- if (!n--) break;
- regs->si = *args++;
- case 2:
- if (!n--) break;
- regs->dx = *args++;
- case 3:
- if (!n--) break;
- regs->r10 = *args++;
- case 4:
- if (!n--) break;
- regs->r8 = *args++;
- case 5:
- if (!n--) break;
- regs->r9 = *args++;
- case 6:
- if (!n--) break;
- default:
- BUG();
- break;
- }
+ {
+ regs->di = *args++;
+ regs->si = *args++;
+ regs->dx = *args++;
+ regs->r10 = *args++;
+ regs->r8 = *args++;
+ regs->r9 = *args;
+ }
}
+static inline int syscall_get_arch(struct task_struct *task)
+{
+ /* x32 tasks should be considered AUDIT_ARCH_X86_64. */
+ return (IS_ENABLED(CONFIG_IA32_EMULATION) &&
+ task->thread_info.status & TS_COMPAT)
+ ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
+}
+
+bool do_syscall_64(struct pt_regs *regs, int nr);
+void do_int80_emulation(struct pt_regs *regs);
+
#endif /* CONFIG_X86_32 */
+void do_int80_syscall_32(struct pt_regs *regs);
+bool do_fast_syscall_32(struct pt_regs *regs);
+bool do_SYSENTER_32(struct pt_regs *regs);
+
#endif /* _ASM_X86_SYSCALL_H */
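With this rework, syscall_get_arguments() always copies all six argument slots, so callers no longer pass an (i, n) range. A hypothetical tracer hook showing the resulting calling convention (the function name and pr_info() output are illustrative, not kernel API):

#include <linux/sched.h>
#include <linux/printk.h>
#include <asm/syscall.h>

/* Dump the syscall number, audit arch and all six arguments for a
 * task that just entered the kernel. */
static void trace_syscall_entry_sketch(struct task_struct *task,
				       struct pt_regs *regs)
{
	unsigned long args[6];
	int nr = syscall_get_nr(task, regs);

	syscall_get_arguments(task, regs, args);
	pr_info("syscall %d (arch %#x): %lx %lx %lx %lx %lx %lx\n",
		nr, syscall_get_arch(task),
		args[0], args[1], args[2], args[3], args[4], args[5]);
}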
diff --git a/arch/x86/include/asm/syscall_wrapper.h b/arch/x86/include/asm/syscall_wrapper.h
new file mode 100644
index 000000000000..7e88705e907f
--- /dev/null
+++ b/arch/x86/include/asm/syscall_wrapper.h
@@ -0,0 +1,264 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * syscall_wrapper.h - x86 specific wrappers to syscall definitions
+ */
+
+#ifndef _ASM_X86_SYSCALL_WRAPPER_H
+#define _ASM_X86_SYSCALL_WRAPPER_H
+
+#include <asm/ptrace.h>
+
+extern long __x64_sys_ni_syscall(const struct pt_regs *regs);
+extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
+
+/*
+ * Instead of the generic __SYSCALL_DEFINEx() definition, the x86 version takes
+ * struct pt_regs *regs as the only argument of the syscall stub(s) named as:
+ * __x64_sys_*() - 64-bit native syscall
+ * __ia32_sys_*() - 32-bit native syscall or common compat syscall
+ * __ia32_compat_sys_*() - 32-bit compat syscall
+ * __x64_compat_sys_*() - 64-bit X32 compat syscall
+ *
+ * The registers are decoded according to the ABI:
+ * 64-bit: RDI, RSI, RDX, R10, R8, R9
+ * 32-bit: EBX, ECX, EDX, ESI, EDI, EBP
+ *
+ * The stub then passes the decoded arguments to the __se_sys_*() wrapper to
+ * perform sign-extension (omitted for zero-argument syscalls). Finally the
+ * arguments are passed to the __do_sys_*() function which is the actual
+ * syscall. These wrappers are marked as inline so the compiler can optimize
+ * the functions where appropriate.
+ *
+ * Example assembly (slightly re-ordered for better readability):
+ *
+ * <__x64_sys_recv>: <-- syscall with 4 parameters
+ * callq <__fentry__>
+ *
+ * mov 0x70(%rdi),%rdi <-- decode regs->di
+ * mov 0x68(%rdi),%rsi <-- decode regs->si
+ * mov 0x60(%rdi),%rdx <-- decode regs->dx
+ * mov 0x38(%rdi),%rcx <-- decode regs->r10
+ *
+ * xor %r9d,%r9d <-- clear %r9
+ * xor %r8d,%r8d <-- clear %r8
+ *
+ * callq __sys_recvfrom <-- do the actual work in __sys_recvfrom()
+ * which takes 6 arguments
+ *
+ * cltq <-- extend return value to 64-bit
+ * retq <-- return
+ *
+ * This approach avoids leaking random user-provided register content down
+ * the call chain.
+ */
+
+/* Mapping of registers to parameters for syscalls on x86-64 and x32 */
+#define SC_X86_64_REGS_TO_ARGS(x, ...) \
+ __MAP(x,__SC_ARGS \
+ ,,regs->di,,regs->si,,regs->dx \
+ ,,regs->r10,,regs->r8,,regs->r9) \
+
+
+/* SYSCALL_PT_ARGS is adapted from s390x */
+#define SYSCALL_PT_ARG6(m, t1, t2, t3, t4, t5, t6) \
+ SYSCALL_PT_ARG5(m, t1, t2, t3, t4, t5), m(t6, (regs->bp))
+#define SYSCALL_PT_ARG5(m, t1, t2, t3, t4, t5) \
+ SYSCALL_PT_ARG4(m, t1, t2, t3, t4), m(t5, (regs->di))
+#define SYSCALL_PT_ARG4(m, t1, t2, t3, t4) \
+ SYSCALL_PT_ARG3(m, t1, t2, t3), m(t4, (regs->si))
+#define SYSCALL_PT_ARG3(m, t1, t2, t3) \
+ SYSCALL_PT_ARG2(m, t1, t2), m(t3, (regs->dx))
+#define SYSCALL_PT_ARG2(m, t1, t2) \
+ SYSCALL_PT_ARG1(m, t1), m(t2, (regs->cx))
+#define SYSCALL_PT_ARG1(m, t1) m(t1, (regs->bx))
+#define SYSCALL_PT_ARGS(x, ...) SYSCALL_PT_ARG##x(__VA_ARGS__)
+
+#define __SC_COMPAT_CAST(t, a) \
+ (__typeof(__builtin_choose_expr(__TYPE_IS_L(t), 0, 0U))) \
+ (unsigned int)a
+
+/* Mapping of registers to parameters for syscalls on i386 */
+#define SC_IA32_REGS_TO_ARGS(x, ...) \
+ SYSCALL_PT_ARGS(x, __SC_COMPAT_CAST, \
+ __MAP(x, __SC_TYPE, __VA_ARGS__)) \
+
+#define __SYS_STUB0(abi, name) \
+ long __##abi##_##name(const struct pt_regs *regs); \
+ ALLOW_ERROR_INJECTION(__##abi##_##name, ERRNO); \
+ long __##abi##_##name(const struct pt_regs *regs) \
+ __alias(__do_##name);
+
+#define __SYS_STUBx(abi, name, ...) \
+ long __##abi##_##name(const struct pt_regs *regs); \
+ ALLOW_ERROR_INJECTION(__##abi##_##name, ERRNO); \
+ long __##abi##_##name(const struct pt_regs *regs) \
+ { \
+ return __se_##name(__VA_ARGS__); \
+ }
+
+#define __COND_SYSCALL(abi, name) \
+ __weak long __##abi##_##name(const struct pt_regs *__unused); \
+ __weak long __##abi##_##name(const struct pt_regs *__unused) \
+ { \
+ return sys_ni_syscall(); \
+ }
+
+#ifdef CONFIG_X86_64
+#define __X64_SYS_STUB0(name) \
+ __SYS_STUB0(x64, sys_##name)
+
+#define __X64_SYS_STUBx(x, name, ...) \
+ __SYS_STUBx(x64, sys##name, \
+ SC_X86_64_REGS_TO_ARGS(x, __VA_ARGS__))
+
+#define __X64_COND_SYSCALL(name) \
+ __COND_SYSCALL(x64, sys_##name)
+
+#else /* CONFIG_X86_64 */
+#define __X64_SYS_STUB0(name)
+#define __X64_SYS_STUBx(x, name, ...)
+#define __X64_COND_SYSCALL(name)
+#endif /* CONFIG_X86_64 */
+
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+#define __IA32_SYS_STUB0(name) \
+ __SYS_STUB0(ia32, sys_##name)
+
+#define __IA32_SYS_STUBx(x, name, ...) \
+ __SYS_STUBx(ia32, sys##name, \
+ SC_IA32_REGS_TO_ARGS(x, __VA_ARGS__))
+
+#define __IA32_COND_SYSCALL(name) \
+ __COND_SYSCALL(ia32, sys_##name)
+
+#else /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */
+#define __IA32_SYS_STUB0(name)
+#define __IA32_SYS_STUBx(x, name, ...)
+#define __IA32_COND_SYSCALL(name)
+#endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */
+
+#ifdef CONFIG_IA32_EMULATION
+/*
+ * For IA32 emulation, we need to handle "compat" syscalls *and* create
+ * additional wrappers (aptly named __ia32_sys_xyzzy) which decode the
+ * ia32 regs in the proper order for shared or "common" syscalls. As some
+ * syscalls may not be implemented, we need to expand COND_SYSCALL in
+ * kernel/sys_ni.c to cover this case as well.
+ */
+#define __IA32_COMPAT_SYS_STUB0(name) \
+ __SYS_STUB0(ia32, compat_sys_##name)
+
+#define __IA32_COMPAT_SYS_STUBx(x, name, ...) \
+ __SYS_STUBx(ia32, compat_sys##name, \
+ SC_IA32_REGS_TO_ARGS(x, __VA_ARGS__))
+
+#define __IA32_COMPAT_COND_SYSCALL(name) \
+ __COND_SYSCALL(ia32, compat_sys_##name)
+
+#else /* CONFIG_IA32_EMULATION */
+#define __IA32_COMPAT_SYS_STUB0(name)
+#define __IA32_COMPAT_SYS_STUBx(x, name, ...)
+#define __IA32_COMPAT_COND_SYSCALL(name)
+#endif /* CONFIG_IA32_EMULATION */
+
+
+#ifdef CONFIG_X86_X32_ABI
+/*
+ * For the x32 ABI, we need to create a stub for compat_sys_*() which is aware
+ * of the x86-64-style parameter ordering of x32 syscalls. The syscalls common
+ * with x86_64 obviously do not need such care.
+ */
+#define __X32_COMPAT_SYS_STUB0(name) \
+ __SYS_STUB0(x64, compat_sys_##name)
+
+#define __X32_COMPAT_SYS_STUBx(x, name, ...) \
+ __SYS_STUBx(x64, compat_sys##name, \
+ SC_X86_64_REGS_TO_ARGS(x, __VA_ARGS__))
+
+#define __X32_COMPAT_COND_SYSCALL(name) \
+ __COND_SYSCALL(x64, compat_sys_##name)
+
+#else /* CONFIG_X86_X32_ABI */
+#define __X32_COMPAT_SYS_STUB0(name)
+#define __X32_COMPAT_SYS_STUBx(x, name, ...)
+#define __X32_COMPAT_COND_SYSCALL(name)
+#endif /* CONFIG_X86_X32_ABI */
+
+
+#ifdef CONFIG_COMPAT
+/*
+ * Compat means IA32_EMULATION and/or X86_X32. As they use a different
+ * mapping of registers to parameters, we need to generate stubs for each
+ * of them.
+ */
+#define COMPAT_SYSCALL_DEFINE0(name) \
+ static long \
+ __do_compat_sys_##name(const struct pt_regs *__unused); \
+ __IA32_COMPAT_SYS_STUB0(name) \
+ __X32_COMPAT_SYS_STUB0(name) \
+ static long \
+ __do_compat_sys_##name(const struct pt_regs *__unused)
+
+#define COMPAT_SYSCALL_DEFINEx(x, name, ...) \
+ static long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \
+ static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\
+ __IA32_COMPAT_SYS_STUBx(x, name, __VA_ARGS__) \
+ __X32_COMPAT_SYS_STUBx(x, name, __VA_ARGS__) \
+ static long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \
+ { \
+ return __do_compat_sys##name(__MAP(x,__SC_DELOUSE,__VA_ARGS__));\
+ } \
+ static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))
+
+/*
+ * As some compat syscalls may not be implemented, we need to expand
+ * COND_SYSCALL_COMPAT in kernel/sys_ni.c to cover this case as well.
+ */
+#define COND_SYSCALL_COMPAT(name) \
+ __IA32_COMPAT_COND_SYSCALL(name) \
+ __X32_COMPAT_COND_SYSCALL(name)
+
+#endif /* CONFIG_COMPAT */
+
+#define __SYSCALL_DEFINEx(x, name, ...) \
+ static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \
+ static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\
+ __X64_SYS_STUBx(x, name, __VA_ARGS__) \
+ __IA32_SYS_STUBx(x, name, __VA_ARGS__) \
+ static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \
+ { \
+ long ret = __do_sys##name(__MAP(x,__SC_CAST,__VA_ARGS__));\
+ __MAP(x,__SC_TEST,__VA_ARGS__); \
+ __PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \
+ return ret; \
+ } \
+ static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))
+
+/*
+ * As the generic SYSCALL_DEFINE0() macro does not decode any parameters for
+ * obvious reasons, and passing struct pt_regs *regs to it in %rdi does not
+ * hurt, we only need to re-define it here to keep the naming congruent to
+ * SYSCALL_DEFINEx() -- which is essential for the COND_SYSCALL() macro
+ * to work correctly.
+ */
+#define SYSCALL_DEFINE0(sname) \
+ SYSCALL_METADATA(_##sname, 0); \
+ static long __do_sys_##sname(const struct pt_regs *__unused); \
+ __X64_SYS_STUB0(sname) \
+ __IA32_SYS_STUB0(sname) \
+ static long __do_sys_##sname(const struct pt_regs *__unused)
+
+#define COND_SYSCALL(name) \
+ __X64_COND_SYSCALL(name) \
+ __IA32_COND_SYSCALL(name)
+
+
+/*
+ * For VSYSCALLS, we need to declare these three syscalls with the new
+ * pt_regs-based calling convention for in-kernel use.
+ */
+long __x64_sys_getcpu(const struct pt_regs *regs);
+long __x64_sys_gettimeofday(const struct pt_regs *regs);
+long __x64_sys_time(const struct pt_regs *regs);
+
+#endif /* _ASM_X86_SYSCALL_WRAPPER_H */
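To make the stub chain concrete, here is an approximate expansion of SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) on x86-64. This is a sketch only: the real macros also emit syscall metadata, the ALLOW_ERROR_INJECTION annotation, and the __ia32_sys_dup2 stub, all omitted here:

/* Forward declarations for the sketch. */
static long __se_sys_dup2(long oldfd, long newfd);
static inline long __do_sys_dup2(unsigned int oldfd, unsigned int newfd);

/* Stub: decodes the x86-64 argument registers out of pt_regs. */
long __x64_sys_dup2(const struct pt_regs *regs)
{
	return __se_sys_dup2(regs->di, regs->si);
}

/* Sign-extension wrapper: arguments travel as long, then are
 * narrowed back to the declared parameter types. */
static long __se_sys_dup2(long oldfd, long newfd)
{
	return __do_sys_dup2((unsigned int)oldfd, (unsigned int)newfd);
}

/* The body written by the SYSCALL_DEFINE2() user follows. */
static inline long __do_sys_dup2(unsigned int oldfd, unsigned int newfd)
{
	/* ... actual syscall implementation ... */
	return 0;
}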
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 372b76edd63f..6714a358235d 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -1,97 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* syscalls.h - Linux syscall interfaces (arch-specific)
*
* Copyright (c) 2008 Jaswinder Singh Rajput
- *
- * This file is released under the GPLv2.
- * See the file COPYING for more details.
*/
#ifndef _ASM_X86_SYSCALLS_H
#define _ASM_X86_SYSCALLS_H
-#include <linux/compiler.h>
-#include <linux/linkage.h>
-#include <linux/signal.h>
-#include <linux/types.h>
-
/* Common in X86_32 and X86_64 */
/* kernel/ioport.c */
-asmlinkage long sys_ioperm(unsigned long, unsigned long, int);
-
-/* kernel/process.c */
-int sys_fork(struct pt_regs *);
-int sys_vfork(struct pt_regs *);
-
-/* kernel/ldt.c */
-asmlinkage int sys_modify_ldt(int, void __user *, unsigned long);
-
-/* kernel/signal.c */
-long sys_rt_sigreturn(struct pt_regs *);
-
-/* kernel/tls.c */
-asmlinkage int sys_set_thread_area(struct user_desc __user *);
-asmlinkage int sys_get_thread_area(struct user_desc __user *);
-
-/* X86_32 only */
-#ifdef CONFIG_X86_32
-/* kernel/ioport.c */
-long sys_iopl(struct pt_regs *);
-
-/* kernel/process_32.c */
-int sys_clone(struct pt_regs *);
-int sys_execve(struct pt_regs *);
-
-/* kernel/signal.c */
-asmlinkage int sys_sigsuspend(int, int, old_sigset_t);
-asmlinkage int sys_sigaction(int, const struct old_sigaction __user *,
- struct old_sigaction __user *);
-int sys_sigaltstack(struct pt_regs *);
-unsigned long sys_sigreturn(struct pt_regs *);
-
-/* kernel/sys_i386_32.c */
-struct mmap_arg_struct;
-struct sel_arg_struct;
-struct oldold_utsname;
-struct old_utsname;
-
-asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long,
- unsigned long, unsigned long, unsigned long);
-asmlinkage int old_mmap(struct mmap_arg_struct __user *);
-asmlinkage int old_select(struct sel_arg_struct __user *);
-asmlinkage int sys_ipc(uint, int, int, int, void __user *, long);
-asmlinkage int sys_uname(struct old_utsname __user *);
-asmlinkage int sys_olduname(struct oldold_utsname __user *);
-
-/* kernel/vm86_32.c */
-int sys_vm86old(struct pt_regs *);
-int sys_vm86(struct pt_regs *);
-
-#else /* CONFIG_X86_32 */
-
-/* X86_64 only */
-/* kernel/ioport.c */
-asmlinkage long sys_iopl(unsigned int, struct pt_regs *);
-
-/* kernel/process_64.c */
-asmlinkage long sys_clone(unsigned long, unsigned long,
- void __user *, void __user *,
- struct pt_regs *);
-asmlinkage long sys_execve(char __user *, char __user * __user *,
- char __user * __user *,
- struct pt_regs *);
-long sys_arch_prctl(int, unsigned long);
-
-/* kernel/signal.c */
-asmlinkage long sys_sigaltstack(const stack_t __user *, stack_t __user *,
- struct pt_regs *);
-
-/* kernel/sys_x86_64.c */
-struct new_utsname;
-
-asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long,
- unsigned long, unsigned long, unsigned long);
-asmlinkage long sys_uname(struct new_utsname __user *);
+long ksys_ioperm(unsigned long from, unsigned long num, int turn_on);
-#endif /* CONFIG_X86_32 */
#endif /* _ASM_X86_SYSCALLS_H */
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
deleted file mode 100644
index 643c59b4bc6e..000000000000
--- a/arch/x86/include/asm/system.h
+++ /dev/null
@@ -1,485 +0,0 @@
-#ifndef _ASM_X86_SYSTEM_H
-#define _ASM_X86_SYSTEM_H
-
-#include <asm/asm.h>
-#include <asm/segment.h>
-#include <asm/cpufeature.h>
-#include <asm/cmpxchg.h>
-#include <asm/nops.h>
-
-#include <linux/kernel.h>
-#include <linux/irqflags.h>
-
-/* entries in ARCH_DLINFO: */
-#ifdef CONFIG_IA32_EMULATION
-# define AT_VECTOR_SIZE_ARCH 2
-#else
-# define AT_VECTOR_SIZE_ARCH 1
-#endif
-
-struct task_struct; /* one of the stranger aspects of C forward declarations */
-struct task_struct *__switch_to(struct task_struct *prev,
- struct task_struct *next);
-struct tss_struct;
-void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
- struct tss_struct *tss);
-
-#ifdef CONFIG_X86_32
-
-#ifdef CONFIG_CC_STACKPROTECTOR
-#define __switch_canary \
- "movl %P[task_canary](%[next]), %%ebx\n\t" \
- "movl %%ebx, "__percpu_arg([stack_canary])"\n\t"
-#define __switch_canary_oparam \
- , [stack_canary] "=m" (per_cpu_var(stack_canary))
-#define __switch_canary_iparam \
- , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
-#else /* CC_STACKPROTECTOR */
-#define __switch_canary
-#define __switch_canary_oparam
-#define __switch_canary_iparam
-#endif /* CC_STACKPROTECTOR */
-
-/*
- * Saving eflags is important. It switches not only IOPL between tasks,
- * it also protects other tasks from NT leaking through sysenter etc.
- */
-#define switch_to(prev, next, last) \
-do { \
- /* \
- * Context-switching clobbers all registers, so we clobber \
- * them explicitly, via unused output variables. \
- * (EAX and EBP is not listed because EBP is saved/restored \
- * explicitly for wchan access and EAX is the return value of \
- * __switch_to()) \
- */ \
- unsigned long ebx, ecx, edx, esi, edi; \
- \
- asm volatile("pushfl\n\t" /* save flags */ \
- "pushl %%ebp\n\t" /* save EBP */ \
- "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \
- "movl %[next_sp],%%esp\n\t" /* restore ESP */ \
- "movl $1f,%[prev_ip]\n\t" /* save EIP */ \
- "pushl %[next_ip]\n\t" /* restore EIP */ \
- __switch_canary \
- "jmp __switch_to\n" /* regparm call */ \
- "1:\t" \
- "popl %%ebp\n\t" /* restore EBP */ \
- "popfl\n" /* restore flags */ \
- \
- /* output parameters */ \
- : [prev_sp] "=m" (prev->thread.sp), \
- [prev_ip] "=m" (prev->thread.ip), \
- "=a" (last), \
- \
- /* clobbered output registers: */ \
- "=b" (ebx), "=c" (ecx), "=d" (edx), \
- "=S" (esi), "=D" (edi) \
- \
- __switch_canary_oparam \
- \
- /* input parameters: */ \
- : [next_sp] "m" (next->thread.sp), \
- [next_ip] "m" (next->thread.ip), \
- \
- /* regparm parameters for __switch_to(): */ \
- [prev] "a" (prev), \
- [next] "d" (next) \
- \
- __switch_canary_iparam \
- \
- : /* reloaded segment registers */ \
- "memory"); \
-} while (0)
-
-/*
- * disable hlt during certain critical i/o operations
- */
-#define HAVE_DISABLE_HLT
-#else
-#define __SAVE(reg, offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
-#define __RESTORE(reg, offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
-
-/* frame pointer must be last for get_wchan */
-#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
-#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t"
-
-#define __EXTRA_CLOBBER \
- , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \
- "r12", "r13", "r14", "r15"
-
-#ifdef CONFIG_CC_STACKPROTECTOR
-#define __switch_canary \
- "movq %P[task_canary](%%rsi),%%r8\n\t" \
- "movq %%r8,"__percpu_arg([gs_canary])"\n\t"
-#define __switch_canary_oparam \
- , [gs_canary] "=m" (per_cpu_var(irq_stack_union.stack_canary))
-#define __switch_canary_iparam \
- , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
-#else /* CC_STACKPROTECTOR */
-#define __switch_canary
-#define __switch_canary_oparam
-#define __switch_canary_iparam
-#endif /* CC_STACKPROTECTOR */
-
-/* Save restore flags to clear handle leaking NT */
-#define switch_to(prev, next, last) \
- asm volatile(SAVE_CONTEXT \
- "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
- "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
- "call __switch_to\n\t" \
- ".globl thread_return\n" \
- "thread_return:\n\t" \
- "movq "__percpu_arg([current_task])",%%rsi\n\t" \
- __switch_canary \
- "movq %P[thread_info](%%rsi),%%r8\n\t" \
- "movq %%rax,%%rdi\n\t" \
- "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \
- "jnz ret_from_fork\n\t" \
- RESTORE_CONTEXT \
- : "=a" (last) \
- __switch_canary_oparam \
- : [next] "S" (next), [prev] "D" (prev), \
- [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \
- [ti_flags] "i" (offsetof(struct thread_info, flags)), \
- [_tif_fork] "i" (_TIF_FORK), \
- [thread_info] "i" (offsetof(struct task_struct, stack)), \
- [current_task] "m" (per_cpu_var(current_task)) \
- __switch_canary_iparam \
- : "memory", "cc" __EXTRA_CLOBBER)
-#endif
-
-#ifdef __KERNEL__
-#define _set_base(addr, base) do { unsigned long __pr; \
-__asm__ __volatile__ ("movw %%dx,%1\n\t" \
- "rorl $16,%%edx\n\t" \
- "movb %%dl,%2\n\t" \
- "movb %%dh,%3" \
- :"=&d" (__pr) \
- :"m" (*((addr)+2)), \
- "m" (*((addr)+4)), \
- "m" (*((addr)+7)), \
- "0" (base) \
- ); } while (0)
-
-#define _set_limit(addr, limit) do { unsigned long __lr; \
-__asm__ __volatile__ ("movw %%dx,%1\n\t" \
- "rorl $16,%%edx\n\t" \
- "movb %2,%%dh\n\t" \
- "andb $0xf0,%%dh\n\t" \
- "orb %%dh,%%dl\n\t" \
- "movb %%dl,%2" \
- :"=&d" (__lr) \
- :"m" (*(addr)), \
- "m" (*((addr)+6)), \
- "0" (limit) \
- ); } while (0)
-
-#define set_base(ldt, base) _set_base(((char *)&(ldt)) , (base))
-#define set_limit(ldt, limit) _set_limit(((char *)&(ldt)) , ((limit)-1))
-
-extern void native_load_gs_index(unsigned);
-
-/*
- * Load a segment. Fall back on loading the zero
- * segment if something goes wrong..
- */
-#define loadsegment(seg, value) \
- asm volatile("\n" \
- "1:\t" \
- "movl %k0,%%" #seg "\n" \
- "2:\n" \
- ".section .fixup,\"ax\"\n" \
- "3:\t" \
- "movl %k1, %%" #seg "\n\t" \
- "jmp 2b\n" \
- ".previous\n" \
- _ASM_EXTABLE(1b,3b) \
- : :"r" (value), "r" (0) : "memory")
-
-
-/*
- * Save a segment register away
- */
-#define savesegment(seg, value) \
- asm("mov %%" #seg ",%0":"=r" (value) : : "memory")
-
-/*
- * x86_32 user gs accessors.
- */
-#ifdef CONFIG_X86_32
-#ifdef CONFIG_X86_32_LAZY_GS
-#define get_user_gs(regs) (u16)({unsigned long v; savesegment(gs, v); v;})
-#define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v))
-#define task_user_gs(tsk) ((tsk)->thread.gs)
-#define lazy_save_gs(v) savesegment(gs, (v))
-#define lazy_load_gs(v) loadsegment(gs, (v))
-#else /* X86_32_LAZY_GS */
-#define get_user_gs(regs) (u16)((regs)->gs)
-#define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0)
-#define task_user_gs(tsk) (task_pt_regs(tsk)->gs)
-#define lazy_save_gs(v) do { } while (0)
-#define lazy_load_gs(v) do { } while (0)
-#endif /* X86_32_LAZY_GS */
-#endif /* X86_32 */
-
-static inline unsigned long get_limit(unsigned long segment)
-{
- unsigned long __limit;
- asm("lsll %1,%0" : "=r" (__limit) : "r" (segment));
- return __limit + 1;
-}
-
-static inline void native_clts(void)
-{
- asm volatile("clts");
-}
-
-/*
- * Volatile isn't enough to prevent the compiler from reordering the
- * read/write functions for the control registers and messing everything up.
- * A memory clobber would solve the problem, but would prevent reordering of
- * all loads stores around it, which can hurt performance. Solution is to
- * use a variable and mimic reads and writes to it to enforce serialization
- */
-static unsigned long __force_order;
-
-static inline unsigned long native_read_cr0(void)
-{
- unsigned long val;
- asm volatile("mov %%cr0,%0\n\t" : "=r" (val), "=m" (__force_order));
- return val;
-}
-
-static inline void native_write_cr0(unsigned long val)
-{
- asm volatile("mov %0,%%cr0": : "r" (val), "m" (__force_order));
-}
-
-static inline unsigned long native_read_cr2(void)
-{
- unsigned long val;
- asm volatile("mov %%cr2,%0\n\t" : "=r" (val), "=m" (__force_order));
- return val;
-}
-
-static inline void native_write_cr2(unsigned long val)
-{
- asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order));
-}
-
-static inline unsigned long native_read_cr3(void)
-{
- unsigned long val;
- asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order));
- return val;
-}
-
-static inline void native_write_cr3(unsigned long val)
-{
- asm volatile("mov %0,%%cr3": : "r" (val), "m" (__force_order));
-}
-
-static inline unsigned long native_read_cr4(void)
-{
- unsigned long val;
- asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order));
- return val;
-}
-
-static inline unsigned long native_read_cr4_safe(void)
-{
- unsigned long val;
- /* This could fault if %cr4 does not exist. In x86_64, a cr4 always
- * exists, so it will never fail. */
-#ifdef CONFIG_X86_32
- asm volatile("1: mov %%cr4, %0\n"
- "2:\n"
- _ASM_EXTABLE(1b, 2b)
- : "=r" (val), "=m" (__force_order) : "0" (0));
-#else
- val = native_read_cr4();
-#endif
- return val;
-}
-
-static inline void native_write_cr4(unsigned long val)
-{
- asm volatile("mov %0,%%cr4": : "r" (val), "m" (__force_order));
-}
-
-#ifdef CONFIG_X86_64
-static inline unsigned long native_read_cr8(void)
-{
- unsigned long cr8;
- asm volatile("movq %%cr8,%0" : "=r" (cr8));
- return cr8;
-}
-
-static inline void native_write_cr8(unsigned long val)
-{
- asm volatile("movq %0,%%cr8" :: "r" (val) : "memory");
-}
-#endif
-
-static inline void native_wbinvd(void)
-{
- asm volatile("wbinvd": : :"memory");
-}
-
-#ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
-#else
-#define read_cr0() (native_read_cr0())
-#define write_cr0(x) (native_write_cr0(x))
-#define read_cr2() (native_read_cr2())
-#define write_cr2(x) (native_write_cr2(x))
-#define read_cr3() (native_read_cr3())
-#define write_cr3(x) (native_write_cr3(x))
-#define read_cr4() (native_read_cr4())
-#define read_cr4_safe() (native_read_cr4_safe())
-#define write_cr4(x) (native_write_cr4(x))
-#define wbinvd() (native_wbinvd())
-#ifdef CONFIG_X86_64
-#define read_cr8() (native_read_cr8())
-#define write_cr8(x) (native_write_cr8(x))
-#define load_gs_index native_load_gs_index
-#endif
-
-/* Clear the 'TS' bit */
-#define clts() (native_clts())
-
-#endif/* CONFIG_PARAVIRT */
-
-#define stts() write_cr0(read_cr0() | X86_CR0_TS)
-
-#endif /* __KERNEL__ */
-
-static inline void clflush(volatile void *__p)
-{
- asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p));
-}
-
-#define nop() asm volatile ("nop")
-
-void disable_hlt(void);
-void enable_hlt(void);
-
-void cpu_idle_wait(void);
-
-extern unsigned long arch_align_stack(unsigned long sp);
-extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
-
-void default_idle(void);
-
-void stop_this_cpu(void *dummy);
-
-/*
- * Force strict CPU ordering.
- * And yes, this is required on UP too when we're talking
- * to devices.
- */
-#ifdef CONFIG_X86_32
-/*
- * Some non-Intel clones support out of order store. wmb() ceases to be a
- * nop for these.
- */
-#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
-#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
-#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
-#else
-#define mb() asm volatile("mfence":::"memory")
-#define rmb() asm volatile("lfence":::"memory")
-#define wmb() asm volatile("sfence" ::: "memory")
-#endif
-
-/**
- * read_barrier_depends - Flush all pending reads that subsequents reads
- * depend on.
- *
- * No data-dependent reads from memory-like regions are ever reordered
- * over this barrier. All reads preceding this primitive are guaranteed
- * to access memory (but not necessarily other CPUs' caches) before any
- * reads following this primitive that depend on the data return by
- * any of the preceding reads. This primitive is much lighter weight than
- * rmb() on most CPUs, and is never heavier weight than is
- * rmb().
- *
- * These ordering constraints are respected by both the local CPU
- * and the compiler.
- *
- * Ordering is not guaranteed by anything other than these primitives,
- * not even by data dependencies. See the documentation for
- * memory_barrier() for examples and URLs to more information.
- *
- * For example, the following code would force ordering (the initial
- * value of "a" is zero, "b" is one, and "p" is "&a"):
- *
- * <programlisting>
- * CPU 0 CPU 1
- *
- * b = 2;
- * memory_barrier();
- * p = &b; q = p;
- * read_barrier_depends();
- * d = *q;
- * </programlisting>
- *
- * because the read of "*q" depends on the read of "p" and these
- * two reads are separated by a read_barrier_depends(). However,
- * the following code, with the same initial values for "a" and "b":
- *
- * <programlisting>
- * CPU 0 CPU 1
- *
- * a = 2;
- * memory_barrier();
- * b = 3; y = b;
- * read_barrier_depends();
- * x = a;
- * </programlisting>
- *
- * does not enforce ordering, since there is no data dependency between
- * the read of "a" and the read of "b". Therefore, on some CPUs, such
- * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb()
- * in cases like this where there are no data dependencies.
- **/
-
-#define read_barrier_depends() do { } while (0)
-
-#ifdef CONFIG_SMP
-#define smp_mb() mb()
-#ifdef CONFIG_X86_PPRO_FENCE
-# define smp_rmb() rmb()
-#else
-# define smp_rmb() barrier()
-#endif
-#ifdef CONFIG_X86_OOSTORE
-# define smp_wmb() wmb()
-#else
-# define smp_wmb() barrier()
-#endif
-#define smp_read_barrier_depends() read_barrier_depends()
-#define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
-#else
-#define smp_mb() barrier()
-#define smp_rmb() barrier()
-#define smp_wmb() barrier()
-#define smp_read_barrier_depends() do { } while (0)
-#define set_mb(var, value) do { var = value; barrier(); } while (0)
-#endif
-
-/*
- * Stop RDTSC speculation. This is needed when you need to use RDTSC
- * (or get_cycles or vread that possibly accesses the TSC) in a defined
- * code region.
- *
- * (Could use an alternative three way for this if there was one.)
- */
-static inline void rdtsc_barrier(void)
-{
- alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
- alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
-}
-
-#endif /* _ASM_X86_SYSTEM_H */
diff --git a/arch/x86/include/asm/system_64.h b/arch/x86/include/asm/system_64.h
deleted file mode 100644
index 1159e091ad09..000000000000
--- a/arch/x86/include/asm/system_64.h
+++ /dev/null
@@ -1,22 +0,0 @@
-#ifndef _ASM_X86_SYSTEM_64_H
-#define _ASM_X86_SYSTEM_64_H
-
-#include <asm/segment.h>
-#include <asm/cmpxchg.h>
-
-
-static inline unsigned long read_cr8(void)
-{
- unsigned long cr8;
- asm volatile("movq %%cr8,%0" : "=r" (cr8));
- return cr8;
-}
-
-static inline void write_cr8(unsigned long val)
-{
- asm volatile("movq %0,%%cr8" :: "r" (val) : "memory");
-}
-
-#include <linux/irqflags.h>
-
-#endif /* _ASM_X86_SYSTEM_64_H */
diff --git a/arch/x86/include/asm/tce.h b/arch/x86/include/asm/tce.h
deleted file mode 100644
index 7a6677c1a715..000000000000
--- a/arch/x86/include/asm/tce.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * This file is derived from asm-powerpc/tce.h.
- *
- * Copyright (C) IBM Corporation, 2006
- *
- * Author: Muli Ben-Yehuda <muli@il.ibm.com>
- * Author: Jon Mason <jdmason@us.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#ifndef _ASM_X86_TCE_H
-#define _ASM_X86_TCE_H
-
-extern unsigned int specified_table_size;
-struct iommu_table;
-
-#define TCE_ENTRY_SIZE 8 /* in bytes */
-
-#define TCE_READ_SHIFT 0
-#define TCE_WRITE_SHIFT 1
-#define TCE_HUBID_SHIFT 2 /* unused */
-#define TCE_RSVD_SHIFT 8 /* unused */
-#define TCE_RPN_SHIFT 12
-#define TCE_UNUSED_SHIFT 48 /* unused */
-
-#define TCE_RPN_MASK 0x0000fffffffff000ULL
-
-extern void tce_build(struct iommu_table *tbl, unsigned long index,
- unsigned int npages, unsigned long uaddr, int direction);
-extern void tce_free(struct iommu_table *tbl, long index, unsigned int npages);
-extern void * __init alloc_tce_table(void);
-extern void __init free_tce_table(void *tbl);
-extern int __init build_tce_table(struct pci_dev *dev, void __iomem *bbar);
-
-#endif /* _ASM_X86_TCE_H */
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
new file mode 100644
index 000000000000..6b338d7f01b7
--- /dev/null
+++ b/arch/x86/include/asm/tdx.h
@@ -0,0 +1,240 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2021-2022 Intel Corporation */
+#ifndef _ASM_X86_TDX_H
+#define _ASM_X86_TDX_H
+
+#include <linux/init.h>
+#include <linux/bits.h>
+#include <linux/mmzone.h>
+
+#include <asm/errno.h>
+#include <asm/ptrace.h>
+#include <asm/trapnr.h>
+#include <asm/shared/tdx.h>
+
+/*
+ * SW-defined error codes.
+ *
+ * Bits 47:40 == 0xFF indicate a reserved status code class that is never
+ * used by the TDX module.
+ */
+#define TDX_ERROR _BITUL(63)
+#define TDX_NON_RECOVERABLE _BITUL(62)
+#define TDX_SW_ERROR (TDX_ERROR | GENMASK_ULL(47, 40))
+#define TDX_SEAMCALL_VMFAILINVALID (TDX_SW_ERROR | _UL(0xFFFF0000))
+
+#define TDX_SEAMCALL_GP (TDX_SW_ERROR | X86_TRAP_GP)
+#define TDX_SEAMCALL_UD (TDX_SW_ERROR | X86_TRAP_UD)
+
+/*
+ * TDX module SEAMCALL leaf function error codes
+ */
+#define TDX_SUCCESS 0ULL
+#define TDX_RND_NO_ENTROPY 0x8000020300000000ULL
+
+#ifndef __ASSEMBLER__
+
+#include <uapi/asm/mce.h>
+#include <asm/tdx_global_metadata.h>
+#include <linux/pgtable.h>
+
+/*
+ * Used by the #VE exception handler to gather the #VE exception
+ * info from the TDX module. This is a software only structure
+ * and not part of the TDX module/VMM ABI.
+ */
+struct ve_info {
+ u64 exit_reason;
+ u64 exit_qual;
+ /* Guest Linear (virtual) Address */
+ u64 gla;
+ /* Guest Physical Address */
+ u64 gpa;
+ u32 instr_len;
+ u32 instr_info;
+};
+
+#ifdef CONFIG_INTEL_TDX_GUEST
+
+void __init tdx_early_init(void);
+
+void tdx_get_ve_info(struct ve_info *ve);
+
+bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve);
+
+void tdx_halt(void);
+
+bool tdx_early_handle_ve(struct pt_regs *regs);
+
+int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport);
+
+int tdx_mcall_extend_rtmr(u8 index, u8 *data);
+
+u64 tdx_hcall_get_quote(u8 *buf, size_t size);
+
+void __init tdx_dump_attributes(u64 td_attr);
+void __init tdx_dump_td_ctls(u64 td_ctls);
+
+#else
+
+static inline void tdx_early_init(void) { };
+static inline void tdx_halt(void) { };
+
+static inline bool tdx_early_handle_ve(struct pt_regs *regs) { return false; }
+
+#endif /* CONFIG_INTEL_TDX_GUEST */
+
+#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_INTEL_TDX_GUEST)
+long tdx_kvm_hypercall(unsigned int nr, unsigned long p1, unsigned long p2,
+ unsigned long p3, unsigned long p4);
+#else
+static inline long tdx_kvm_hypercall(unsigned int nr, unsigned long p1,
+ unsigned long p2, unsigned long p3,
+ unsigned long p4)
+{
+ return -ENODEV;
+}
+#endif /* CONFIG_INTEL_TDX_GUEST && CONFIG_KVM_GUEST */
+
+#ifdef CONFIG_INTEL_TDX_HOST
+u64 __seamcall(u64 fn, struct tdx_module_args *args);
+u64 __seamcall_ret(u64 fn, struct tdx_module_args *args);
+u64 __seamcall_saved_ret(u64 fn, struct tdx_module_args *args);
+void tdx_init(void);
+
+#include <linux/preempt.h>
+#include <asm/archrandom.h>
+#include <asm/processor.h>
+
+typedef u64 (*sc_func_t)(u64 fn, struct tdx_module_args *args);
+
+static __always_inline u64 __seamcall_dirty_cache(sc_func_t func, u64 fn,
+ struct tdx_module_args *args)
+{
+ lockdep_assert_preemption_disabled();
+
+ /*
+ * SEAMCALLs are made to the TDX module and can generate dirty
+ * cachelines of TDX private memory. Mark cache state incoherent
+ * so that the cache can be flushed during kexec.
+ *
+ * This needs to be done before actually making the SEAMCALL,
+ * because the kexec-ing CPU could send an NMI to stop remote CPUs,
+ * in which case even disabling IRQs won't help here.
+ */
+ this_cpu_write(cache_state_incoherent, true);
+
+ return func(fn, args);
+}
+
+static __always_inline u64 sc_retry(sc_func_t func, u64 fn,
+ struct tdx_module_args *args)
+{
+ int retry = RDRAND_RETRY_LOOPS;
+ u64 ret;
+
+ do {
+ preempt_disable();
+ ret = __seamcall_dirty_cache(func, fn, args);
+ preempt_enable();
+ } while (ret == TDX_RND_NO_ENTROPY && --retry);
+
+ return ret;
+}
+
+#define seamcall(_fn, _args) sc_retry(__seamcall, (_fn), (_args))
+#define seamcall_ret(_fn, _args) sc_retry(__seamcall_ret, (_fn), (_args))
+#define seamcall_saved_ret(_fn, _args) sc_retry(__seamcall_saved_ret, (_fn), (_args))
+int tdx_cpu_enable(void);
+int tdx_enable(void);
+const char *tdx_dump_mce_info(struct mce *m);
+const struct tdx_sys_info *tdx_get_sysinfo(void);
+
+int tdx_guest_keyid_alloc(void);
+u32 tdx_get_nr_guest_keyids(void);
+void tdx_guest_keyid_free(unsigned int keyid);
+
+void tdx_quirk_reset_page(struct page *page);
+
+struct tdx_td {
+ /* TD root structure: */
+ struct page *tdr_page;
+
+ int tdcs_nr_pages;
+ /* TD control structure: */
+ struct page **tdcs_pages;
+
+ /* Size of `tdcx_pages` in struct tdx_vp */
+ int tdcx_nr_pages;
+};
+
+struct tdx_vp {
+ /* TDVP root page */
+ struct page *tdvpr_page;
+ /* precalculated page_to_phys(tdvpr_page) for use in noinstr code */
+ phys_addr_t tdvpr_pa;
+
+ /* TD vCPU control structure: */
+ struct page **tdcx_pages;
+};
+
+static inline u64 mk_keyed_paddr(u16 hkid, struct page *page)
+{
+ u64 ret;
+
+ ret = page_to_phys(page);
+ /* KeyID bits are just above the physical address bits: */
+ ret |= (u64)hkid << boot_cpu_data.x86_phys_bits;
+
+ return ret;
+}
+
+static inline int pg_level_to_tdx_sept_level(enum pg_level level)
+{
+ WARN_ON_ONCE(level == PG_LEVEL_NONE);
+ return level - 1;
+}
+
+u64 tdh_vp_enter(struct tdx_vp *vp, struct tdx_module_args *args);
+u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page);
+u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2);
+u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2);
+u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page);
+u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2);
+u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, int level, u64 *ext_err1, u64 *ext_err2);
+u64 tdh_mng_key_config(struct tdx_td *td);
+u64 tdh_mng_create(struct tdx_td *td, u16 hkid);
+u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp);
+u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data);
+u64 tdh_mr_extend(struct tdx_td *td, u64 gpa, u64 *ext_err1, u64 *ext_err2);
+u64 tdh_mr_finalize(struct tdx_td *td);
+u64 tdh_vp_flush(struct tdx_vp *vp);
+u64 tdh_mng_vpflushdone(struct tdx_td *td);
+u64 tdh_mng_key_freeid(struct tdx_td *td);
+u64 tdh_mng_init(struct tdx_td *td, u64 td_params, u64 *extended_err);
+u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid);
+u64 tdh_vp_rd(struct tdx_vp *vp, u64 field, u64 *data);
+u64 tdh_vp_wr(struct tdx_vp *vp, u64 field, u64 data, u64 mask);
+u64 tdh_phymem_page_reclaim(struct page *page, u64 *tdx_pt, u64 *tdx_owner, u64 *tdx_size);
+u64 tdh_mem_track(struct tdx_td *tdr);
+u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, u64 level, u64 *ext_err1, u64 *ext_err2);
+u64 tdh_phymem_cache_wb(bool resume);
+u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td);
+u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page);
+#else
+static inline void tdx_init(void) { }
+static inline int tdx_cpu_enable(void) { return -ENODEV; }
+static inline int tdx_enable(void) { return -ENODEV; }
+static inline u32 tdx_get_nr_guest_keyids(void) { return 0; }
+static inline const char *tdx_dump_mce_info(struct mce *m) { return NULL; }
+static inline const struct tdx_sys_info *tdx_get_sysinfo(void) { return NULL; }
+#endif /* CONFIG_INTEL_TDX_HOST */
+
+#ifdef CONFIG_KEXEC_CORE
+void tdx_cpu_flush_cache_for_kexec(void);
+#else
+static inline void tdx_cpu_flush_cache_for_kexec(void) { }
+#endif
+
+#endif /* !__ASSEMBLER__ */
+#endif /* _ASM_X86_TDX_H */
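The sc_retry() helper above retries a SEAMCALL while the TDX module reports TDX_RND_NO_ENTROPY, up to RDRAND_RETRY_LOOPS attempts. A hedged usage sketch of the host-side seamcall() macro (CONFIG_INTEL_TDX_HOST only); the leaf number TDH_EXAMPLE_LEAF is hypothetical, purely for illustration:

#include <asm/tdx.h>

#define TDH_EXAMPLE_LEAF	0x1234ULL	/* hypothetical leaf number */

static int example_seamcall(void)
{
	struct tdx_module_args args = {};
	u64 err;

	/* seamcall() wraps __seamcall in the sc_retry() entropy loop
	 * and marks the cache state incoherent for kexec first. */
	err = seamcall(TDH_EXAMPLE_LEAF, &args);
	if (err == TDX_SEAMCALL_VMFAILINVALID)
		return -ENODEV;		/* TDX module not loaded */

	return err == TDX_SUCCESS ? 0 : -EIO;
}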
diff --git a/arch/x86/include/asm/tdx_global_metadata.h b/arch/x86/include/asm/tdx_global_metadata.h
new file mode 100644
index 000000000000..060a2ad744bf
--- /dev/null
+++ b/arch/x86/include/asm/tdx_global_metadata.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Automatically generated TDX global metadata structures. */
+#ifndef _X86_VIRT_TDX_AUTO_GENERATED_TDX_GLOBAL_METADATA_H
+#define _X86_VIRT_TDX_AUTO_GENERATED_TDX_GLOBAL_METADATA_H
+
+#include <linux/types.h>
+
+struct tdx_sys_info_features {
+ u64 tdx_features0;
+};
+
+struct tdx_sys_info_tdmr {
+ u16 max_tdmrs;
+ u16 max_reserved_per_tdmr;
+ u16 pamt_4k_entry_size;
+ u16 pamt_2m_entry_size;
+ u16 pamt_1g_entry_size;
+};
+
+struct tdx_sys_info_td_ctrl {
+ u16 tdr_base_size;
+ u16 tdcs_base_size;
+ u16 tdvps_base_size;
+};
+
+struct tdx_sys_info_td_conf {
+ u64 attributes_fixed0;
+ u64 attributes_fixed1;
+ u64 xfam_fixed0;
+ u64 xfam_fixed1;
+ u16 num_cpuid_config;
+ u16 max_vcpus_per_td;
+ u64 cpuid_config_leaves[128];
+ u64 cpuid_config_values[128][2];
+};
+
+struct tdx_sys_info {
+ struct tdx_sys_info_features features;
+ struct tdx_sys_info_tdmr tdmr;
+ struct tdx_sys_info_td_ctrl td_ctrl;
+ struct tdx_sys_info_td_conf td_conf;
+};
+
+#endif
diff --git a/arch/x86/include/asm/termbits.h b/arch/x86/include/asm/termbits.h
deleted file mode 100644
index af1b70ea440f..000000000000
--- a/arch/x86/include/asm/termbits.h
+++ /dev/null
@@ -1,198 +0,0 @@
-#ifndef _ASM_X86_TERMBITS_H
-#define _ASM_X86_TERMBITS_H
-
-#include <linux/posix_types.h>
-
-typedef unsigned char cc_t;
-typedef unsigned int speed_t;
-typedef unsigned int tcflag_t;
-
-#define NCCS 19
-struct termios {
- tcflag_t c_iflag; /* input mode flags */
- tcflag_t c_oflag; /* output mode flags */
- tcflag_t c_cflag; /* control mode flags */
- tcflag_t c_lflag; /* local mode flags */
- cc_t c_line; /* line discipline */
- cc_t c_cc[NCCS]; /* control characters */
-};
-
-struct termios2 {
- tcflag_t c_iflag; /* input mode flags */
- tcflag_t c_oflag; /* output mode flags */
- tcflag_t c_cflag; /* control mode flags */
- tcflag_t c_lflag; /* local mode flags */
- cc_t c_line; /* line discipline */
- cc_t c_cc[NCCS]; /* control characters */
- speed_t c_ispeed; /* input speed */
- speed_t c_ospeed; /* output speed */
-};
-
-struct ktermios {
- tcflag_t c_iflag; /* input mode flags */
- tcflag_t c_oflag; /* output mode flags */
- tcflag_t c_cflag; /* control mode flags */
- tcflag_t c_lflag; /* local mode flags */
- cc_t c_line; /* line discipline */
- cc_t c_cc[NCCS]; /* control characters */
- speed_t c_ispeed; /* input speed */
- speed_t c_ospeed; /* output speed */
-};
-
-/* c_cc characters */
-#define VINTR 0
-#define VQUIT 1
-#define VERASE 2
-#define VKILL 3
-#define VEOF 4
-#define VTIME 5
-#define VMIN 6
-#define VSWTC 7
-#define VSTART 8
-#define VSTOP 9
-#define VSUSP 10
-#define VEOL 11
-#define VREPRINT 12
-#define VDISCARD 13
-#define VWERASE 14
-#define VLNEXT 15
-#define VEOL2 16
-
-/* c_iflag bits */
-#define IGNBRK 0000001
-#define BRKINT 0000002
-#define IGNPAR 0000004
-#define PARMRK 0000010
-#define INPCK 0000020
-#define ISTRIP 0000040
-#define INLCR 0000100
-#define IGNCR 0000200
-#define ICRNL 0000400
-#define IUCLC 0001000
-#define IXON 0002000
-#define IXANY 0004000
-#define IXOFF 0010000
-#define IMAXBEL 0020000
-#define IUTF8 0040000
-
-/* c_oflag bits */
-#define OPOST 0000001
-#define OLCUC 0000002
-#define ONLCR 0000004
-#define OCRNL 0000010
-#define ONOCR 0000020
-#define ONLRET 0000040
-#define OFILL 0000100
-#define OFDEL 0000200
-#define NLDLY 0000400
-#define NL0 0000000
-#define NL1 0000400
-#define CRDLY 0003000
-#define CR0 0000000
-#define CR1 0001000
-#define CR2 0002000
-#define CR3 0003000
-#define TABDLY 0014000
-#define TAB0 0000000
-#define TAB1 0004000
-#define TAB2 0010000
-#define TAB3 0014000
-#define XTABS 0014000
-#define BSDLY 0020000
-#define BS0 0000000
-#define BS1 0020000
-#define VTDLY 0040000
-#define VT0 0000000
-#define VT1 0040000
-#define FFDLY 0100000
-#define FF0 0000000
-#define FF1 0100000
-
-/* c_cflag bit meaning */
-#define CBAUD 0010017
-#define B0 0000000 /* hang up */
-#define B50 0000001
-#define B75 0000002
-#define B110 0000003
-#define B134 0000004
-#define B150 0000005
-#define B200 0000006
-#define B300 0000007
-#define B600 0000010
-#define B1200 0000011
-#define B1800 0000012
-#define B2400 0000013
-#define B4800 0000014
-#define B9600 0000015
-#define B19200 0000016
-#define B38400 0000017
-#define EXTA B19200
-#define EXTB B38400
-#define CSIZE 0000060
-#define CS5 0000000
-#define CS6 0000020
-#define CS7 0000040
-#define CS8 0000060
-#define CSTOPB 0000100
-#define CREAD 0000200
-#define PARENB 0000400
-#define PARODD 0001000
-#define HUPCL 0002000
-#define CLOCAL 0004000
-#define CBAUDEX 0010000
-#define BOTHER 0010000 /* non standard rate */
-#define B57600 0010001
-#define B115200 0010002
-#define B230400 0010003
-#define B460800 0010004
-#define B500000 0010005
-#define B576000 0010006
-#define B921600 0010007
-#define B1000000 0010010
-#define B1152000 0010011
-#define B1500000 0010012
-#define B2000000 0010013
-#define B2500000 0010014
-#define B3000000 0010015
-#define B3500000 0010016
-#define B4000000 0010017
-#define CIBAUD 002003600000 /* input baud rate */
-#define CMSPAR 010000000000 /* mark or space (stick) parity */
-#define CRTSCTS 020000000000 /* flow control */
-
-#define IBSHIFT 16 /* Shift from CBAUD to CIBAUD */
-
-/* c_lflag bits */
-#define ISIG 0000001
-#define ICANON 0000002
-#define XCASE 0000004
-#define ECHO 0000010
-#define ECHOE 0000020
-#define ECHOK 0000040
-#define ECHONL 0000100
-#define NOFLSH 0000200
-#define TOSTOP 0000400
-#define ECHOCTL 0001000
-#define ECHOPRT 0002000
-#define ECHOKE 0004000
-#define FLUSHO 0010000
-#define PENDIN 0040000
-#define IEXTEN 0100000
-
-/* tcflow() and TCXONC use these */
-#define TCOOFF 0
-#define TCOON 1
-#define TCIOFF 2
-#define TCION 3
-
-/* tcflush() and TCFLSH use these */
-#define TCIFLUSH 0
-#define TCOFLUSH 1
-#define TCIOFLUSH 2
-
-/* tcsetattr uses these */
-#define TCSANOW 0
-#define TCSADRAIN 1
-#define TCSAFLUSH 2
-
-#endif /* _ASM_X86_TERMBITS_H */
diff --git a/arch/x86/include/asm/termios.h b/arch/x86/include/asm/termios.h
deleted file mode 100644
index c4ee8056baca..000000000000
--- a/arch/x86/include/asm/termios.h
+++ /dev/null
@@ -1,114 +0,0 @@
-#ifndef _ASM_X86_TERMIOS_H
-#define _ASM_X86_TERMIOS_H
-
-#include <asm/termbits.h>
-#include <asm/ioctls.h>
-
-struct winsize {
- unsigned short ws_row;
- unsigned short ws_col;
- unsigned short ws_xpixel;
- unsigned short ws_ypixel;
-};
-
-#define NCC 8
-struct termio {
- unsigned short c_iflag; /* input mode flags */
- unsigned short c_oflag; /* output mode flags */
- unsigned short c_cflag; /* control mode flags */
- unsigned short c_lflag; /* local mode flags */
- unsigned char c_line; /* line discipline */
- unsigned char c_cc[NCC]; /* control characters */
-};
-
-/* modem lines */
-#define TIOCM_LE 0x001
-#define TIOCM_DTR 0x002
-#define TIOCM_RTS 0x004
-#define TIOCM_ST 0x008
-#define TIOCM_SR 0x010
-#define TIOCM_CTS 0x020
-#define TIOCM_CAR 0x040
-#define TIOCM_RNG 0x080
-#define TIOCM_DSR 0x100
-#define TIOCM_CD TIOCM_CAR
-#define TIOCM_RI TIOCM_RNG
-#define TIOCM_OUT1 0x2000
-#define TIOCM_OUT2 0x4000
-#define TIOCM_LOOP 0x8000
-
-/* ioctl (fd, TIOCSERGETLSR, &result) where result may be as below */
-
-#ifdef __KERNEL__
-
-#include <asm/uaccess.h>
-
-/* intr=^C quit=^\ erase=del kill=^U
- eof=^D vtime=\0 vmin=\1 sxtc=\0
- start=^Q stop=^S susp=^Z eol=\0
- reprint=^R discard=^U werase=^W lnext=^V
- eol2=\0
-*/
-#define INIT_C_CC "\003\034\177\025\004\0\1\0\021\023\032\0\022\017\027\026\0"
-
-/*
- * Translate a "termio" structure into a "termios". Ugh.
- */
-#define SET_LOW_TERMIOS_BITS(termios, termio, x) { \
- unsigned short __tmp; \
- get_user(__tmp,&(termio)->x); \
- *(unsigned short *) &(termios)->x = __tmp; \
-}
-
-static inline int user_termio_to_kernel_termios(struct ktermios *termios,
- struct termio __user *termio)
-{
- SET_LOW_TERMIOS_BITS(termios, termio, c_iflag);
- SET_LOW_TERMIOS_BITS(termios, termio, c_oflag);
- SET_LOW_TERMIOS_BITS(termios, termio, c_cflag);
- SET_LOW_TERMIOS_BITS(termios, termio, c_lflag);
- get_user(termios->c_line, &termio->c_line);
- return copy_from_user(termios->c_cc, termio->c_cc, NCC);
-}
-
-/*
- * Translate a "termios" structure into a "termio". Ugh.
- */
-static inline int kernel_termios_to_user_termio(struct termio __user *termio,
- struct ktermios *termios)
-{
- put_user((termios)->c_iflag, &(termio)->c_iflag);
- put_user((termios)->c_oflag, &(termio)->c_oflag);
- put_user((termios)->c_cflag, &(termio)->c_cflag);
- put_user((termios)->c_lflag, &(termio)->c_lflag);
- put_user((termios)->c_line, &(termio)->c_line);
- return copy_to_user((termio)->c_cc, (termios)->c_cc, NCC);
-}
-
-static inline int user_termios_to_kernel_termios(struct ktermios *k,
- struct termios2 __user *u)
-{
- return copy_from_user(k, u, sizeof(struct termios2));
-}
-
-static inline int kernel_termios_to_user_termios(struct termios2 __user *u,
- struct ktermios *k)
-{
- return copy_to_user(u, k, sizeof(struct termios2));
-}
-
-static inline int user_termios_to_kernel_termios_1(struct ktermios *k,
- struct termios __user *u)
-{
- return copy_from_user(k, u, sizeof(struct termios));
-}
-
-static inline int kernel_termios_to_user_termios_1(struct termios __user *u,
- struct ktermios *k)
-{
- return copy_to_user(u, k, sizeof(struct termios));
-}
-
-#endif /* __KERNEL__ */
-
-#endif /* _ASM_X86_TERMIOS_H */
diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
new file mode 100644
index 000000000000..f2d142a0a862
--- /dev/null
+++ b/arch/x86/include/asm/text-patching.h
@@ -0,0 +1,218 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_TEXT_PATCHING_H
+#define _ASM_X86_TEXT_PATCHING_H
+
+#include <linux/types.h>
+#include <linux/stddef.h>
+#include <asm/ptrace.h>
+
+/*
+ * Currently, the max observed size in the kernel code is
+ * JUMP_LABEL_NOP_SIZE/RELATIVEJUMP_SIZE, which are 5.
+ * Raise it if needed.
+ */
+#define TEXT_POKE_MAX_OPCODE_SIZE 5
+
+extern void text_poke_early(void *addr, const void *opcode, size_t len);
+
+extern void text_poke_apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len);
+
+/*
+ * Clear and restore the kernel write-protection flag on the local CPU.
+ * Allows the kernel to edit read-only pages.
+ * Side-effect: any interrupt handler running between save and restore will have
+ * the ability to write to read-only pages.
+ *
+ * Warning:
+ * Code patching in the UP case is safe if NMIs and MCE handlers are stopped and
+ * no thread can be preempted in the instructions being modified (no iret to an
+ * invalid instruction possible) or if the instructions are changed from a
+ * consistent state to another consistent state atomically.
+ * On the local CPU you need to be protected against NMI or MCE handlers seeing
+ * an inconsistent instruction while you patch.
+ */
+extern void *text_poke(void *addr, const void *opcode, size_t len);
+extern void smp_text_poke_sync_each_cpu(void);
+extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len);
+extern void *text_poke_copy(void *addr, const void *opcode, size_t len);
+#define text_poke_copy text_poke_copy
+extern void *text_poke_copy_locked(void *addr, const void *opcode, size_t len, bool core_ok);
+extern void *text_poke_set(void *addr, int c, size_t len);
+extern int smp_text_poke_int3_handler(struct pt_regs *regs);
+extern void smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate);
+
+extern void smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate);
+extern void smp_text_poke_batch_finish(void);
+
+#define INT3_INSN_SIZE 1
+#define INT3_INSN_OPCODE 0xCC
+
+#define RET_INSN_SIZE 1
+#define RET_INSN_OPCODE 0xC3
+
+#define CALL_INSN_SIZE 5
+#define CALL_INSN_OPCODE 0xE8
+
+#define JMP32_INSN_SIZE 5
+#define JMP32_INSN_OPCODE 0xE9
+
+#define JMP8_INSN_SIZE 2
+#define JMP8_INSN_OPCODE 0xEB
+
+#define DISP32_SIZE 4
+
+static __always_inline int text_opcode_size(u8 opcode)
+{
+ int size = 0;
+
+#define __CASE(insn) \
+ case insn##_INSN_OPCODE: size = insn##_INSN_SIZE; break
+
+ switch(opcode) {
+ __CASE(INT3);
+ __CASE(RET);
+ __CASE(CALL);
+ __CASE(JMP32);
+ __CASE(JMP8);
+ }
+
+#undef __CASE
+
+ return size;
+}
+
+union text_poke_insn {
+ u8 text[TEXT_POKE_MAX_OPCODE_SIZE];
+ struct {
+ u8 opcode;
+ s32 disp;
+ } __attribute__((packed));
+};
+
+static __always_inline
+void __text_gen_insn(void *buf, u8 opcode, const void *addr, const void *dest, int size)
+{
+ union text_poke_insn *insn = buf;
+
+ BUG_ON(size < text_opcode_size(opcode));
+
+ /*
+ * Hide the addresses to avoid the compiler folding in constants when
+ * referencing code; these can mess up annotations like
+ * ANNOTATE_NOENDBR.
+ */
+ OPTIMIZER_HIDE_VAR(insn);
+ OPTIMIZER_HIDE_VAR(addr);
+ OPTIMIZER_HIDE_VAR(dest);
+
+ insn->opcode = opcode;
+
+ if (size > 1) {
+ insn->disp = (long)dest - (long)(addr + size);
+ if (size == 2) {
+ /*
+ * Ensure that for JMP8 the displacement
+ * actually fits the signed byte.
+ */
+ BUG_ON((insn->disp >> 31) != (insn->disp >> 7));
+ }
+ }
+}
+
+static __always_inline
+void *text_gen_insn(u8 opcode, const void *addr, const void *dest)
+{
+ static union text_poke_insn insn; /* per instance */
+ __text_gen_insn(&insn, opcode, addr, dest, text_opcode_size(opcode));
+ return &insn.text;
+}
+
+extern int after_bootmem;
+extern __ro_after_init struct mm_struct *text_poke_mm;
+extern __ro_after_init unsigned long text_poke_mm_addr;
+
+#ifndef CONFIG_UML_X86
+static __always_inline
+void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip)
+{
+ regs->ip = ip;
+}
+
+static __always_inline
+void int3_emulate_push(struct pt_regs *regs, unsigned long val)
+{
+ /*
+ * The INT3 handler in entry_64.S adds a gap between the
+ * stack where the break point happened, and the saving of
+ * pt_regs. We can extend the original stack because of
+ * this gap. See the idtentry macro's X86_TRAP_BP logic.
+ *
+ * Similarly, entry_32.S will have a gap on the stack for
+ * (any) hardware exception and pt_regs; see the
+ * FIXUP_FRAME macro.
+ */
+ regs->sp -= sizeof(unsigned long);
+ *(unsigned long *)regs->sp = val;
+}
+
+static __always_inline
+unsigned long int3_emulate_pop(struct pt_regs *regs)
+{
+ unsigned long val = *(unsigned long *)regs->sp;
+ regs->sp += sizeof(unsigned long);
+ return val;
+}
+
+static __always_inline
+void int3_emulate_call(struct pt_regs *regs, unsigned long func)
+{
+ int3_emulate_push(regs, regs->ip - INT3_INSN_SIZE + CALL_INSN_SIZE);
+ int3_emulate_jmp(regs, func);
+}
+
+static __always_inline
+void int3_emulate_ret(struct pt_regs *regs)
+{
+ unsigned long ip = int3_emulate_pop(regs);
+ int3_emulate_jmp(regs, ip);
+}
+
+static __always_inline
+bool __emulate_cc(unsigned long flags, u8 cc)
+{
+ static const unsigned long cc_mask[6] = {
+ [0] = X86_EFLAGS_OF,
+ [1] = X86_EFLAGS_CF,
+ [2] = X86_EFLAGS_ZF,
+ [3] = X86_EFLAGS_CF | X86_EFLAGS_ZF,
+ [4] = X86_EFLAGS_SF,
+ [5] = X86_EFLAGS_PF,
+ };
+
+ bool invert = cc & 1;
+ bool match;
+
+ if (cc < 0xc) {
+ match = flags & cc_mask[cc >> 1];
+ } else {
+ match = ((flags & X86_EFLAGS_SF) >> X86_EFLAGS_SF_BIT) ^
+ ((flags & X86_EFLAGS_OF) >> X86_EFLAGS_OF_BIT);
+ if (cc >= 0xe)
+ match = match || (flags & X86_EFLAGS_ZF);
+ }
+
+ return (match && !invert) || (!match && invert);
+}
+
+static __always_inline
+void int3_emulate_jcc(struct pt_regs *regs, u8 cc, unsigned long ip, unsigned long disp)
+{
+ if (__emulate_cc(regs->flags, cc))
+ ip += disp;
+
+ int3_emulate_jmp(regs, ip);
+}
+
+#endif /* !CONFIG_UML_X86 */
+
+#endif /* _ASM_X86_TEXT_PATCHING_H */
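The displacement arithmetic in __text_gen_insn() above is plain x86 relative addressing: the 32-bit operand is the target address minus the address of the byte that follows the instruction. Below is a minimal user-space sketch of that encoding for a 5-byte CALL; the struct and constants are local stand-ins for illustration, not the kernel definitions.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Local stand-ins for CALL_INSN_OPCODE/CALL_INSN_SIZE, for the example only. */
#define CALL_OPCODE 0xE8
#define CALL_SIZE   5

struct rel_insn {
	uint8_t opcode;
	int32_t disp;
} __attribute__((packed));

/* Encode "call dest" as it would appear at address 'addr'. */
static void gen_call(uint8_t buf[CALL_SIZE], uint64_t addr, uint64_t dest)
{
	struct rel_insn insn = {
		.opcode = CALL_OPCODE,
		/* The displacement is relative to the end of the instruction. */
		.disp   = (int32_t)(dest - (addr + CALL_SIZE)),
	};
	memcpy(buf, &insn, CALL_SIZE);
}

int main(void)
{
	uint8_t buf[CALL_SIZE];

	/* A call placed at 0x1000 targeting 0x2000: disp = 0x2000 - 0x1005. */
	gen_call(buf, 0x1000, 0x2000);
	for (int i = 0; i < CALL_SIZE; i++)
		printf("%02x ", buf[i]);
	printf("\n");	/* prints: e8 fb 0f 00 00 */
	return 0;
}

Compiled with gcc, the sketch prints e8 fb 0f 00 00, i.e. a call forward by 0xffb bytes, which is the same value __text_gen_insn() would write into a patched call site.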
diff --git a/arch/x86/include/asm/thermal.h b/arch/x86/include/asm/thermal.h
new file mode 100644
index 000000000000..91a7b6687c3b
--- /dev/null
+++ b/arch/x86/include/asm/thermal.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_THERMAL_H
+#define _ASM_X86_THERMAL_H
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+void therm_lvt_init(void);
+void intel_init_thermal(struct cpuinfo_x86 *c);
+bool x86_thermal_enabled(void);
+void intel_thermal_interrupt(void);
+#else
+static inline void therm_lvt_init(void) { }
+static inline void intel_init_thermal(struct cpuinfo_x86 *c) { }
+#endif
+
+#endif /* _ASM_X86_THERMAL_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index fad7d40b75f8..e71e0e8362ed 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/* thread_info.h: low-level thread information
*
* Copyright (C) 2002 David Howells (dhowells@redhat.com)
@@ -9,225 +10,196 @@
#include <linux/compiler.h>
#include <asm/page.h>
+#include <asm/percpu.h>
#include <asm/types.h>
/*
+ * TOP_OF_KERNEL_STACK_PADDING is a number of unused bytes that we
+ * reserve at the top of the kernel stack. We do it because of a nasty
+ * 32-bit corner case. On x86_32, the hardware stack frame is
+ * variable-length. Except for vm86 mode, struct pt_regs assumes a
+ * maximum-length frame. If we enter from CPL 0, the top 8 bytes of
+ * pt_regs don't actually exist. Ordinarily this doesn't matter, but it
+ * does in at least one case:
+ *
+ * If we take an NMI early enough in SYSENTER, then we can end up with
+ * pt_regs that extends above sp0. On the way out, in the espfix code,
+ * we can read the saved SS value, but that value will be above sp0.
+ * Without this offset, that can result in a page fault. (We are
+ * careful that, in this case, the value we read doesn't matter.)
+ *
+ * In vm86 mode, the hardware frame is much longer still, so add 16
+ * bytes to make room for the real-mode segments.
+ *
+ * x86-64 has a fixed-length stack frame, but it depends on whether
+ * or not FRED is enabled. Future versions of FRED might make this
+ * dynamic, but for now it is always 2 words longer.
+ */
+#ifdef CONFIG_X86_32
+# ifdef CONFIG_VM86
+# define TOP_OF_KERNEL_STACK_PADDING 16
+# else
+# define TOP_OF_KERNEL_STACK_PADDING 8
+# endif
+#else /* x86-64 */
+# ifdef CONFIG_X86_FRED
+# define TOP_OF_KERNEL_STACK_PADDING (2 * 8)
+# else
+# define TOP_OF_KERNEL_STACK_PADDING 0
+# endif
+#endif
+
+/*
* low level task data that entry.S needs immediate access to
* - this struct should fit entirely inside of one cache line
* - this struct shares the supervisor stack pages
*/
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
struct task_struct;
-struct exec_domain;
-#include <asm/processor.h>
-#include <asm/ftrace.h>
-#include <asm/atomic.h>
+#include <asm/cpufeature.h>
+#include <linux/atomic.h>
struct thread_info {
- struct task_struct *task; /* main task structure */
- struct exec_domain *exec_domain; /* execution domain */
- __u32 flags; /* low level flags */
- __u32 status; /* thread synchronous flags */
- __u32 cpu; /* current CPU */
- int preempt_count; /* 0 => preemptable,
- <0 => BUG */
- mm_segment_t addr_limit;
- struct restart_block restart_block;
- void __user *sysenter_return;
-#ifdef CONFIG_X86_32
- unsigned long previous_esp; /* ESP of the previous stack in
- case of nested (IRQ) stacks
- */
- __u8 supervisor_stack[0];
+ unsigned long flags; /* low level flags */
+ unsigned long syscall_work; /* SYSCALL_WORK_ flags */
+ u32 status; /* thread synchronous flags */
+#ifdef CONFIG_SMP
+ u32 cpu; /* current CPU */
#endif
- int uaccess_err;
};
#define INIT_THREAD_INFO(tsk) \
{ \
- .task = &tsk, \
- .exec_domain = &default_exec_domain, \
.flags = 0, \
- .cpu = 0, \
- .preempt_count = INIT_PREEMPT_COUNT, \
- .addr_limit = KERNEL_DS, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
-#define init_thread_info (init_thread_union.thread_info)
-#define init_stack (init_thread_union.stack)
-
-#else /* !__ASSEMBLY__ */
+#else /* !__ASSEMBLER__ */
#include <asm/asm-offsets.h>
#endif
/*
- * thread information flags
- * - these are process state flags that various assembly files
- * may need to access
- * - pending work-to-be-done flags are in LSW
- * - other flags in MSW
- * Warning: layout of LSW is hardcoded in entry.S
+ * Tell the generic TIF infrastructure which bits x86 supports
*/
-#define TIF_SYSCALL_TRACE 0 /* syscall trace active */
-#define TIF_NOTIFY_RESUME 1 /* callback before returning to user */
-#define TIF_SIGPENDING 2 /* signal pending */
-#define TIF_NEED_RESCHED 3 /* rescheduling necessary */
-#define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/
-#define TIF_IRET 5 /* force IRET */
-#define TIF_SYSCALL_EMU 6 /* syscall emulation active */
-#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
-#define TIF_SECCOMP 8 /* secure computing */
-#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
-#define TIF_NOTSC 16 /* TSC is not accessible in userland */
-#define TIF_IA32 17 /* 32bit process */
-#define TIF_FORK 18 /* ret_from_fork */
-#define TIF_ABI_PENDING 19
-#define TIF_MEMDIE 20
-#define TIF_DEBUG 21 /* uses debug registers */
+#define HAVE_TIF_NEED_RESCHED_LAZY
+#define HAVE_TIF_POLLING_NRFLAG
+#define HAVE_TIF_SINGLESTEP
+
+#include <asm-generic/thread_info_tif.h>
+
+/* Architecture specific TIF space starts at 16 */
+#define TIF_SSBD 16 /* Speculative store bypass disable */
+#define TIF_SPEC_IB 17 /* Indirect branch speculation mitigation */
+#define TIF_SPEC_L1D_FLUSH 18 /* Flush L1D on mm switches (processes) */
+#define TIF_NEED_FPU_LOAD 19 /* load FPU on return to userspace */
+#define TIF_NOCPUID 20 /* CPUID is not accessible in userland */
+#define TIF_NOTSC 21 /* TSC is not accessible in userland */
#define TIF_IO_BITMAP 22 /* uses I/O bitmap */
-#define TIF_FREEZE 23 /* is freezing for suspend */
+#define TIF_SPEC_FORCE_UPDATE 23 /* Force speculation MSR update in context switch */
#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
-#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */
-#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */
+#define TIF_SINGLESTEP 25 /* reenable singlestep on user return*/
+#define TIF_BLOCKSTEP 26 /* set when we want DEBUGCTLMSR_BTF */
#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */
-#define TIF_SYSCALL_FTRACE 28 /* for ftrace syscall instrumentation */
-
-#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
-#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
-#define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
-#define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP)
-#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
-#define _TIF_IRET (1 << TIF_IRET)
-#define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU)
-#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
-#define _TIF_SECCOMP (1 << TIF_SECCOMP)
-#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY)
-#define _TIF_NOTSC (1 << TIF_NOTSC)
-#define _TIF_IA32 (1 << TIF_IA32)
-#define _TIF_FORK (1 << TIF_FORK)
-#define _TIF_ABI_PENDING (1 << TIF_ABI_PENDING)
-#define _TIF_DEBUG (1 << TIF_DEBUG)
-#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP)
-#define _TIF_FREEZE (1 << TIF_FREEZE)
-#define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
-#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR)
-#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR)
-#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES)
-#define _TIF_SYSCALL_FTRACE (1 << TIF_SYSCALL_FTRACE)
-
-/* work to do in syscall_trace_enter() */
-#define _TIF_WORK_SYSCALL_ENTRY \
- (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_FTRACE | \
- _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | _TIF_SINGLESTEP)
-
-/* work to do in syscall_trace_leave() */
-#define _TIF_WORK_SYSCALL_EXIT \
- (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP | \
- _TIF_SYSCALL_FTRACE)
-
-/* work to do on interrupt/exception return */
-#define _TIF_WORK_MASK \
- (0x0000FFFF & \
- ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT| \
- _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU))
-
-/* work to do on any return to user space */
-#define _TIF_ALLWORK_MASK ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_FTRACE)
-
-/* Only used for 64 bit */
-#define _TIF_DO_NOTIFY_MASK \
- (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME)
+#define TIF_ADDR32 28 /* 32-bit address space on 64 bits */
+
+#define _TIF_SSBD BIT(TIF_SSBD)
+#define _TIF_SPEC_IB BIT(TIF_SPEC_IB)
+#define _TIF_SPEC_L1D_FLUSH BIT(TIF_SPEC_L1D_FLUSH)
+#define _TIF_NEED_FPU_LOAD BIT(TIF_NEED_FPU_LOAD)
+#define _TIF_NOCPUID BIT(TIF_NOCPUID)
+#define _TIF_NOTSC BIT(TIF_NOTSC)
+#define _TIF_IO_BITMAP BIT(TIF_IO_BITMAP)
+#define _TIF_SPEC_FORCE_UPDATE BIT(TIF_SPEC_FORCE_UPDATE)
+#define _TIF_FORCED_TF BIT(TIF_FORCED_TF)
+#define _TIF_BLOCKSTEP BIT(TIF_BLOCKSTEP)
+#define _TIF_SINGLESTEP BIT(TIF_SINGLESTEP)
+#define _TIF_LAZY_MMU_UPDATES BIT(TIF_LAZY_MMU_UPDATES)
+#define _TIF_ADDR32 BIT(TIF_ADDR32)
/* flags to check in __switch_to() */
-#define _TIF_WORK_CTXSW \
- (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC)
-
-#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
-#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
+#define _TIF_WORK_CTXSW_BASE \
+ (_TIF_NOCPUID | _TIF_NOTSC | _TIF_BLOCKSTEP | \
+ _TIF_SSBD | _TIF_SPEC_FORCE_UPDATE)
-#define PREEMPT_ACTIVE 0x10000000
-
-/* thread information allocation */
-#ifdef CONFIG_DEBUG_STACK_USAGE
-#define THREAD_FLAGS (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
+/*
+ * Avoid calls to __switch_to_xtra() on UP as STIBP is not evaluated.
+ */
+#ifdef CONFIG_SMP
+# define _TIF_WORK_CTXSW (_TIF_WORK_CTXSW_BASE | _TIF_SPEC_IB)
#else
-#define THREAD_FLAGS (GFP_KERNEL | __GFP_NOTRACK)
+# define _TIF_WORK_CTXSW (_TIF_WORK_CTXSW_BASE)
#endif
-#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR
+#ifdef CONFIG_X86_IOPL_IOPERM
+# define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW| _TIF_USER_RETURN_NOTIFY | \
+ _TIF_IO_BITMAP)
+#else
+# define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW| _TIF_USER_RETURN_NOTIFY)
+#endif
-#define alloc_thread_info(tsk) \
- ((struct thread_info *)__get_free_pages(THREAD_FLAGS, THREAD_ORDER))
+#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
-#ifdef CONFIG_X86_32
+#define STACK_WARN (THREAD_SIZE/8)
-#define STACK_WARN (THREAD_SIZE/8)
/*
* macros/functions for gaining access to the thread information structure
*
* preempt_count needs to be 1 initially, until the scheduler is functional.
*/
-#ifndef __ASSEMBLY__
-
-
-/* how to get the current stack pointer from C */
-register unsigned long current_stack_pointer asm("esp") __used;
-
-/* how to get the thread information struct from C */
-static inline struct thread_info *current_thread_info(void)
-{
- return (struct thread_info *)
- (current_stack_pointer & ~(THREAD_SIZE - 1));
-}
-
-#else /* !__ASSEMBLY__ */
-
-/* how to get the thread information struct from ASM */
-#define GET_THREAD_INFO(reg) \
- movl $-THREAD_SIZE, reg; \
- andl %esp, reg
-
-/* use this one if reg already contains %esp */
-#define GET_THREAD_INFO_WITH_ESP(reg) \
- andl $-THREAD_SIZE, reg
-
-#endif
-
-#else /* X86_32 */
-
-#include <asm/percpu.h>
-#define KERNEL_STACK_OFFSET (5*8)
+#ifndef __ASSEMBLER__
/*
- * macros/functions for gaining access to the thread information structure
- * preempt_count needs to be 1 initially, until the scheduler is functional.
+ * Walks up the stack frames to make sure that the specified object is
+ * entirely contained by a single stack frame.
+ *
+ * Returns:
+ * GOOD_FRAME if within a frame
+ * BAD_STACK if placed across a frame boundary (or outside stack)
+ * NOT_STACK unable to determine (no frame pointers, etc)
+ *
+ * This function reads pointers from the stack and dereferences them. The
+ * pointers may not have their KMSAN shadow set up properly, which may result
+ * in false positive reports. Disable instrumentation to avoid those.
*/
-#ifndef __ASSEMBLY__
-DECLARE_PER_CPU(unsigned long, kernel_stack);
-
-static inline struct thread_info *current_thread_info(void)
+__no_kmsan_checks
+static inline int arch_within_stack_frames(const void * const stack,
+ const void * const stackend,
+ const void *obj, unsigned long len)
{
- struct thread_info *ti;
- ti = (void *)(percpu_read(kernel_stack) +
- KERNEL_STACK_OFFSET - THREAD_SIZE);
- return ti;
-}
-
-#else /* !__ASSEMBLY__ */
-
-/* how to get the thread information struct from ASM */
-#define GET_THREAD_INFO(reg) \
- movq PER_CPU_VAR(kernel_stack),reg ; \
- subq $(THREAD_SIZE-KERNEL_STACK_OFFSET),reg
-
+#if defined(CONFIG_FRAME_POINTER)
+ const void *frame = NULL;
+ const void *oldframe;
+
+ oldframe = __builtin_frame_address(1);
+ if (oldframe)
+ frame = __builtin_frame_address(2);
+ /*
+ * low ----------------------------------------------> high
+ * [saved bp][saved ip][args][local vars][saved bp][saved ip]
+ * ^----------------^
+ * allow copies only within here
+ */
+ while (stack <= frame && frame < stackend) {
+ /*
+ * If obj + len extends past the last frame, this
+ * check won't pass and the next frame will be 0,
+ * causing us to bail out and correctly report
+ * the copy as invalid.
+ */
+ if (obj + len <= frame)
+ return obj >= oldframe + 2 * sizeof(void *) ?
+ GOOD_FRAME : BAD_STACK;
+ oldframe = frame;
+ frame = *(const void * const *)frame;
+ }
+ return BAD_STACK;
+#else
+ return NOT_STACK;
#endif
+}
-#endif /* !X86_32 */
+#endif /* !__ASSEMBLER__ */
/*
* Thread-synchronous status.
@@ -236,30 +208,26 @@ static inline struct thread_info *current_thread_info(void)
* ever touches our thread-synchronous status, so we don't
* have to worry about atomic accesses.
*/
-#define TS_USEDFPU 0x0001 /* FPU was used by this task
- this quantum (SMP) */
#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/
-#define TS_POLLING 0x0004 /* true if in idle loop
- and not sleeping */
-#define TS_RESTORE_SIGMASK 0x0008 /* restore signal mask in do_signal() */
-#define TS_XSAVE 0x0010 /* Use xsave/xrstor */
-#define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING)
+#ifndef __ASSEMBLER__
+#ifdef CONFIG_COMPAT
+#define TS_I386_REGS_POKED 0x0004 /* regs poked by 32-bit ptracer */
-#ifndef __ASSEMBLY__
-#define HAVE_SET_RESTORE_SIGMASK 1
-static inline void set_restore_sigmask(void)
-{
- struct thread_info *ti = current_thread_info();
- ti->status |= TS_RESTORE_SIGMASK;
- set_bit(TIF_SIGPENDING, (unsigned long *)&ti->flags);
-}
-#endif /* !__ASSEMBLY__ */
+#define arch_set_restart_data(restart) \
+ do { restart->arch_data = current_thread_info()->status; } while (0)
-#ifndef __ASSEMBLY__
-extern void arch_task_cache_init(void);
-extern void free_thread_info(struct thread_info *ti);
-extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
-#define arch_task_cache_init arch_task_cache_init
#endif
+
+#ifdef CONFIG_X86_32
+#define in_ia32_syscall() true
+#else
+#define in_ia32_syscall() (IS_ENABLED(CONFIG_IA32_EMULATION) && \
+ current_thread_info()->status & TS_COMPAT)
+#endif
+
+extern void arch_setup_new_exec(void);
+#define arch_setup_new_exec arch_setup_new_exec
+#endif /* !__ASSEMBLER__ */
+
#endif /* _ASM_X86_THREAD_INFO_H */
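The frame walk in arch_within_stack_frames() above only accepts a copy that sits entirely between one saved bp/ip pair and the base of the next frame. A hedged stand-alone sketch of the same loop over a synthetic stack follows; the frame offsets, object sizes and result values are invented for the example, and the real walk starts from __builtin_frame_address() rather than a hand-built array.

#include <stdio.h>

/* Local stand-ins for the generic GOOD_FRAME/BAD_STACK/NOT_STACK values. */
enum { NOT_STACK = 0, GOOD_FRAME = 1, BAD_STACK = -1 };

/*
 * Same shape of walk as arch_within_stack_frames(): each "frame" begins
 * with a saved-bp pointer to the next (older) frame.
 */
static int within_frames(const void *stack, const void *stackend,
			 const void *oldframe, const void *frame,
			 const void *obj, unsigned long len)
{
	while (stack <= frame && frame < stackend) {
		if ((const char *)obj + len <= (const char *)frame)
			return (const char *)obj >=
			       (const char *)oldframe + 2 * sizeof(void *) ?
			       GOOD_FRAME : BAD_STACK;
		oldframe = frame;
		frame = *(const void * const *)frame;
	}
	return BAD_STACK;
}

int main(void)
{
	/* Fake stack: three frames with bases at slots 0, 16 and 32. */
	void *stack[48] = { 0 };

	stack[0]  = &stack[16];	/* frame 0 saved bp -> frame 1    */
	stack[16] = &stack[32];	/* frame 1 saved bp -> frame 2    */
	stack[32] = &stack[48];	/* frame 2 saved bp -> off stack  */

	/* An object wholly inside frame 0's locals: accepted (prints 1). */
	printf("%d\n", within_frames(stack, &stack[48], &stack[0], &stack[16],
				     &stack[4], 8 * sizeof(void *)));
	/* An object straddling the frame 0/frame 1 boundary: rejected (-1). */
	printf("%d\n", within_frames(stack, &stack[48], &stack[0], &stack[16],
				     &stack[14], 8 * sizeof(void *)));
	return 0;
}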
diff --git a/arch/x86/include/asm/time.h b/arch/x86/include/asm/time.h
index 50c733aac421..f360104ed172 100644
--- a/arch/x86/include/asm/time.h
+++ b/arch/x86/include/asm/time.h
@@ -1,63 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_TIME_H
#define _ASM_X86_TIME_H
-extern void hpet_time_init(void);
-
+#include <linux/clocksource.h>
#include <asm/mc146818rtc.h>
-#ifdef CONFIG_X86_32
-#include <linux/efi.h>
-
-static inline unsigned long native_get_wallclock(void)
-{
- unsigned long retval;
-
- if (efi_enabled)
- retval = efi_get_time();
- else
- retval = mach_get_cmos_time();
-
- return retval;
-}
-
-static inline int native_set_wallclock(unsigned long nowtime)
-{
- int retval;
-
- if (efi_enabled)
- retval = efi_set_rtc_mmss(nowtime);
- else
- retval = mach_set_rtc_mmss(nowtime);
-
- return retval;
-}
-#else
-extern void native_time_init_hook(void);
-
-static inline unsigned long native_get_wallclock(void)
-{
- return mach_get_cmos_time();
-}
-
-static inline int native_set_wallclock(unsigned long nowtime)
-{
- return mach_set_rtc_mmss(nowtime);
-}
-
-#endif
-
-extern void time_init(void);
-
-#ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
-#else /* !CONFIG_PARAVIRT */
-
-#define get_wallclock() native_get_wallclock()
-#define set_wallclock(x) native_set_wallclock(x)
-#define choose_time_init() hpet_time_init
-
-#endif /* CONFIG_PARAVIRT */
+extern void hpet_time_init(void);
+extern bool pit_timer_init(void);
+extern bool tsc_clocksource_watchdog_disabled(void);
-extern unsigned long __init calibrate_cpu(void);
+extern struct clock_event_device *global_clock_event;
#endif /* _ASM_X86_TIME_H */
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index 20ca9c4d4686..23baf8c9b34c 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -1,72 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_TIMER_H
#define _ASM_X86_TIMER_H
-#include <linux/init.h>
#include <linux/pm.h>
#include <linux/percpu.h>
#include <linux/interrupt.h>
-
-#define TICK_SIZE (tick_nsec / 1000)
+#include <linux/math64.h>
unsigned long long native_sched_clock(void);
-unsigned long native_calibrate_tsc(void);
-
-#ifdef CONFIG_X86_32
-extern int timer_ack;
-extern irqreturn_t timer_interrupt(int irq, void *dev_id);
-#endif /* CONFIG_X86_32 */
-extern int recalibrate_cpu_khz(void);
+extern void recalibrate_cpu_khz(void);
extern int no_timer_check;
-#ifndef CONFIG_PARAVIRT
-#define calibrate_tsc() native_calibrate_tsc()
-#endif
+extern bool using_native_sched_clock(void);
-/* Accelerators for sched_clock()
- * convert from cycles(64bits) => nanoseconds (64bits)
- * basic equation:
- * ns = cycles / (freq / ns_per_sec)
- * ns = cycles * (ns_per_sec / freq)
- * ns = cycles * (10^9 / (cpu_khz * 10^3))
- * ns = cycles * (10^6 / cpu_khz)
- *
- * Then we use scaling math (suggested by george@mvista.com) to get:
- * ns = cycles * (10^6 * SC / cpu_khz) / SC
- * ns = cycles * cyc2ns_scale / SC
+/*
+ * We use the full linear equation: f(x) = a + b*x, in order to allow
+ * a continuous function in the face of dynamic freq changes.
*
- * And since SC is a constant power of two, we can convert the div
- * into a shift.
+ * Continuity means that when our frequency changes, so does our slope (b);
+ * we want to ensure that f(t) == f'(t), which gives a + b*t == a' + b'*t.
*
- * We can use khz divisor instead of mhz to keep a better precision, since
- * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
- * (mathieu.desnoyers@polymtl.ca)
+ * Without an offset (a) the above would not be possible.
*
- * -johnstul@us.ibm.com "math is hard, lets go shopping!"
+ * See the comment near cycles_2_ns() for details on how we compute (b).
*/
-
-DECLARE_PER_CPU(unsigned long, cyc2ns);
-DECLARE_PER_CPU(unsigned long long, cyc2ns_offset);
-
-#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
-
-static inline unsigned long long __cycles_2_ns(unsigned long long cyc)
-{
- int cpu = smp_processor_id();
- unsigned long long ns = per_cpu(cyc2ns_offset, cpu);
- ns += cyc * per_cpu(cyc2ns, cpu) >> CYC2NS_SCALE_FACTOR;
- return ns;
-}
-
-static inline unsigned long long cycles_2_ns(unsigned long long cyc)
-{
- unsigned long long ns;
- unsigned long flags;
-
- local_irq_save(flags);
- ns = __cycles_2_ns(cyc);
- local_irq_restore(flags);
-
- return ns;
-}
+struct cyc2ns_data {
+ u32 cyc2ns_mul;
+ u32 cyc2ns_shift;
+ u64 cyc2ns_offset;
+}; /* 16 bytes */
+
+extern void cyc2ns_read_begin(struct cyc2ns_data *);
+extern void cyc2ns_read_end(void);
#endif /* _ASM_X86_TIMER_H */
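The cyc2ns_data fields above encode the linear f(x) = a + b*x as a fixed-point multiply and shift: ns = cyc2ns_offset + ((cycles * cyc2ns_mul) >> cyc2ns_shift). A user-space sketch of that conversion follows, assuming an illustrative 3 GHz TSC and a 32-bit shift; the kernel computes the mul/shift pair from the measured TSC frequency instead of hard-coding it.

#include <stdint.h>
#include <stdio.h>

struct cyc2ns_data {
	uint32_t cyc2ns_mul;
	uint32_t cyc2ns_shift;
	uint64_t cyc2ns_offset;
};

/* ns = offset + (cycles * mul) >> shift, i.e. f(x) = a + b*x. */
static uint64_t cycles_to_ns(const struct cyc2ns_data *d, uint64_t cycles)
{
	return d->cyc2ns_offset +
	       (((__uint128_t)cycles * d->cyc2ns_mul) >> d->cyc2ns_shift);
}

int main(void)
{
	/* Example values only: a 3,000,000 kHz TSC, 32-bit fixed-point scale. */
	uint64_t tsc_khz = 3000000;
	struct cyc2ns_data d = {
		.cyc2ns_shift  = 32,
		/* mul / 2^shift == ns per cycle: mul = 10^6 * 2^shift / tsc_khz */
		.cyc2ns_mul    = (uint32_t)(((uint64_t)1000000 << 32) / tsc_khz),
		.cyc2ns_offset = 0,
	};

	/* 3e9 cycles at 3 GHz should be roughly one second. */
	printf("%llu ns\n",
	       (unsigned long long)cycles_to_ns(&d, 3000000000ULL));
	return 0;
}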
diff --git a/arch/x86/include/asm/timex.h b/arch/x86/include/asm/timex.h
index 1375cfc93960..956e4145311b 100644
--- a/arch/x86/include/asm/timex.h
+++ b/arch/x86/include/asm/timex.h
@@ -1,9 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_TIMEX_H
#define _ASM_X86_TIMEX_H
#include <asm/processor.h>
#include <asm/tsc.h>
+static inline unsigned long random_get_entropy(void)
+{
+ if (!IS_ENABLED(CONFIG_X86_TSC) &&
+ !cpu_feature_enabled(X86_FEATURE_TSC))
+ return random_get_entropy_fallback();
+ return rdtsc();
+}
+#define random_get_entropy random_get_entropy
+
/* Assume we use the PIT time source for the clock tick */
#define CLOCK_TICK_RATE PIT_TICK_RATE
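random_get_entropy() above prefers a raw TSC read and only falls back when CONFIG_X86_TSC is disabled and the CPU also lacks the TSC feature. A hedged user-space sketch of the same selection follows; CLOCK_MONOTONIC stands in for random_get_entropy_fallback(), and the have_tsc parameter replaces the kernel's config/feature checks.

#include <stdint.h>
#include <stdio.h>
#include <time.h>
#include <x86intrin.h>

/* Illustrative fallback; the kernel derives its fallback differently. */
static unsigned long entropy_fallback(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (unsigned long)ts.tv_nsec ^ (unsigned long)ts.tv_sec;
}

/* Mirror of the selection above: prefer the TSC, otherwise fall back. */
static unsigned long get_entropy(int have_tsc)
{
	if (!have_tsc)
		return entropy_fallback();
	return (unsigned long)__rdtsc();
}

int main(void)
{
	printf("tsc:      %lu\n", get_entropy(1));
	printf("fallback: %lu\n", get_entropy(0));
	return 0;
}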
diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
index 829215fef9ee..866ea78ba156 100644
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -1,11 +1,166 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_TLB_H
#define _ASM_X86_TLB_H
-#define tlb_start_vma(tlb, vma) do { } while (0)
-#define tlb_end_vma(tlb, vma) do { } while (0)
-#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
-#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)
+#define tlb_flush tlb_flush
+static inline void tlb_flush(struct mmu_gather *tlb);
#include <asm-generic/tlb.h>
+#include <linux/kernel.h>
+#include <vdso/bits.h>
+#include <vdso/page.h>
+static inline void tlb_flush(struct mmu_gather *tlb)
+{
+ unsigned long start = 0UL, end = TLB_FLUSH_ALL;
+ unsigned int stride_shift = tlb_get_unmap_shift(tlb);
+
+ if (!tlb->fullmm && !tlb->need_flush_all) {
+ start = tlb->start;
+ end = tlb->end;
+ }
+
+ flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables);
+}
+
+static inline void invlpg(unsigned long addr)
+{
+ asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+}
+
+enum addr_stride {
+ PTE_STRIDE = 0,
+ PMD_STRIDE = 1
+};
+
+/*
+ * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination
+ * of the three. For example:
+ * - FLAG_VA | FLAG_INCLUDE_GLOBAL: invalidate all TLB entries at the address
+ * - FLAG_PCID: invalidate all TLB entries matching the PCID
+ *
+ * The first is used to invalidate (kernel) mappings at a particular
+ * address across all processes.
+ *
+ * The latter invalidates all TLB entries matching a PCID.
+ */
+#define INVLPGB_FLAG_VA BIT(0)
+#define INVLPGB_FLAG_PCID BIT(1)
+#define INVLPGB_FLAG_ASID BIT(2)
+#define INVLPGB_FLAG_INCLUDE_GLOBAL BIT(3)
+#define INVLPGB_FLAG_FINAL_ONLY BIT(4)
+#define INVLPGB_FLAG_INCLUDE_NESTED BIT(5)
+
+/* The implied mode when all bits are clear: */
+#define INVLPGB_MODE_ALL_NONGLOBALS 0UL
+
+#ifdef CONFIG_BROADCAST_TLB_FLUSH
+/*
+ * INVLPGB does broadcast TLB invalidation across all the CPUs in the system.
+ *
+ * The INVLPGB instruction is weakly ordered, and a batch of invalidations can
+ * be done in a parallel fashion.
+ *
+ * The instruction takes the number of extra pages to invalidate, beyond the
+ * first page, while __invlpgb gets the more human readable number of pages to
+ * invalidate.
+ *
+ * The bits in rax[0:2] determine respectively which components of the address
+ * (VA, PCID, ASID) get compared when flushing. If none of these bits are
+ * set, *any*
+ * address in the specified range matches.
+ *
+ * Since it is desired to only flush TLB entries for the ASID that is executing
+ * the instruction (a host/hypervisor or a guest), the ASID valid bit should
+ * always be set. On a host/hypervisor, the hardware will use the ASID value
+ * specified in EDX[15:0] (which should be 0). On a guest, the hardware will
+ * use the actual ASID value of the guest.
+ *
+ * TLBSYNC is used to ensure that pending INVLPGB invalidations initiated from
+ * this CPU have completed.
+ */
+static inline void __invlpgb(unsigned long asid, unsigned long pcid,
+ unsigned long addr, u16 nr_pages,
+ enum addr_stride stride, u8 flags)
+{
+ u64 rax = addr | flags | INVLPGB_FLAG_ASID;
+ u32 ecx = (stride << 31) | (nr_pages - 1);
+ u32 edx = (pcid << 16) | asid;
+
+ /* The low bits in rax are for flags. Verify addr is clean. */
+ VM_WARN_ON_ONCE(addr & ~PAGE_MASK);
+
+ /* INVLPGB; supported in binutils >= 2.36. */
+ asm volatile(".byte 0x0f, 0x01, 0xfe" :: "a" (rax), "c" (ecx), "d" (edx));
+}
+
+static inline void __invlpgb_all(unsigned long asid, unsigned long pcid, u8 flags)
+{
+ __invlpgb(asid, pcid, 0, 1, 0, flags);
+}
+
+static inline void __tlbsync(void)
+{
+ /*
+ * TLBSYNC waits for INVLPGB instructions originating on the same CPU
+ * to have completed. Print a warning if the task has been migrated,
+ * and might not be waiting on all the INVLPGBs issued during this TLB
+ * invalidation sequence.
+ */
+ cant_migrate();
+
+ /* TLBSYNC: supported in binutils >= 2.36. */
+ asm volatile(".byte 0x0f, 0x01, 0xff" ::: "memory");
+}
+#else
+/* Some compilers (I'm looking at you clang!) simply can't do DCE */
+static inline void __invlpgb(unsigned long asid, unsigned long pcid,
+ unsigned long addr, u16 nr_pages,
+ enum addr_stride s, u8 flags) { }
+static inline void __invlpgb_all(unsigned long asid, unsigned long pcid, u8 flags) { }
+static inline void __tlbsync(void) { }
+#endif
+
+static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid,
+ unsigned long addr,
+ u16 nr, bool stride)
+{
+ enum addr_stride str = stride ? PMD_STRIDE : PTE_STRIDE;
+ u8 flags = INVLPGB_FLAG_PCID | INVLPGB_FLAG_VA;
+
+ __invlpgb(0, pcid, addr, nr, str, flags);
+}
+
+/* Flush all mappings for a given PCID, not including globals. */
+static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid)
+{
+ __invlpgb_all(0, pcid, INVLPGB_FLAG_PCID);
+}
+
+/* Flush all mappings, including globals, for all PCIDs. */
+static inline void invlpgb_flush_all(void)
+{
+ /*
+ * TLBSYNC at the end needs to make sure all flushes done on the
+ * current CPU have been executed system-wide. Therefore, nothing
+ * may migrate in between; disabling preemption is used here because
+ * it is cheaper than disabling migration.
+ */
+ guard(preempt)();
+ __invlpgb_all(0, 0, INVLPGB_FLAG_INCLUDE_GLOBAL);
+ __tlbsync();
+}
+
+/* Flush addr, including globals, for all PCIDs. */
+static inline void invlpgb_flush_addr_nosync(unsigned long addr, u16 nr)
+{
+ __invlpgb(0, 0, addr, nr, PTE_STRIDE, INVLPGB_FLAG_INCLUDE_GLOBAL);
+}
+
+/* Flush all mappings for all PCIDs except globals. */
+static inline void invlpgb_flush_all_nonglobals(void)
+{
+ guard(preempt)();
+ __invlpgb_all(0, 0, INVLPGB_MODE_ALL_NONGLOBALS);
+ __tlbsync();
+}
#endif /* _ASM_X86_TLB_H */
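The __invlpgb() helper above packs its arguments into three registers: RAX carries the page-aligned address plus the flag bits, ECX the stride selector and the extra-page count, and EDX the PCID/ASID pair. The sketch below only builds and prints that register image; it never executes INVLPGB (which needs CPU support and CPL 0), and the flag constants are redeclared locally for the example.

#include <stdint.h>
#include <stdio.h>

/* Local copies of the flag bits used above, for illustration only. */
#define FLAG_VA   (1u << 0)
#define FLAG_PCID (1u << 1)
#define FLAG_ASID (1u << 2)

struct invlpgb_operands {
	uint64_t rax;	/* address + flag bits          */
	uint32_t ecx;	/* stride bit + (nr_pages - 1)  */
	uint32_t edx;	/* (pcid << 16) | asid          */
};

/* Build the register image __invlpgb() feeds to the instruction. */
static struct invlpgb_operands encode(uint64_t addr, uint16_t pcid,
				      uint16_t nr_pages, int pmd_stride,
				      uint8_t flags)
{
	struct invlpgb_operands op = {
		.rax = addr | flags | FLAG_ASID,
		.ecx = ((uint32_t)!!pmd_stride << 31) | (uint32_t)(nr_pages - 1),
		.edx = ((uint32_t)pcid << 16) | 0,	/* host ASID 0 */
	};
	return op;
}

int main(void)
{
	/* Flush 16 4K pages at 0x7f0000000000 for PCID 5. */
	struct invlpgb_operands op = encode(0x7f0000000000ULL, 5, 16, 0,
					    FLAG_PCID | FLAG_VA);

	printf("rax=%#llx ecx=%#x edx=%#x\n",
	       (unsigned long long)op.rax, op.ecx, op.edx);
	return 0;
}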
diff --git a/arch/x86/include/asm/tlbbatch.h b/arch/x86/include/asm/tlbbatch.h
new file mode 100644
index 000000000000..80aaf64ff25f
--- /dev/null
+++ b/arch/x86/include/asm/tlbbatch.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ARCH_X86_TLBBATCH_H
+#define _ARCH_X86_TLBBATCH_H
+
+#include <linux/cpumask.h>
+
+struct arch_tlbflush_unmap_batch {
+ /*
+ * Each bit set is a CPU that potentially has a TLB entry for one of
+ * the PFNs being flushed.
+ */
+ struct cpumask cpumask;
+ /*
+ * Set if pages were unmapped from any MM, even one that does not
+ * have active CPUs in its cpumask.
+ */
+ bool unmapped_pages;
+};
+
+#endif /* _ARCH_X86_TLBBATCH_H */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 7f3eba08e7de..00daedfefc1b 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -1,177 +1,491 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_TLBFLUSH_H
#define _ASM_X86_TLBFLUSH_H
-#include <linux/mm.h>
+#include <linux/mm_types.h>
+#include <linux/mmu_notifier.h>
#include <linux/sched.h>
+#include <asm/barrier.h>
#include <asm/processor.h>
-#include <asm/system.h>
+#include <asm/cpufeature.h>
+#include <asm/special_insns.h>
+#include <asm/smp.h>
+#include <asm/invpcid.h>
+#include <asm/pti.h>
+#include <asm/processor-flags.h>
+#include <asm/pgtable.h>
-#ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
-#else
-#define __flush_tlb() __native_flush_tlb()
-#define __flush_tlb_global() __native_flush_tlb_global()
-#define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
-#endif
+DECLARE_PER_CPU(u64, tlbstate_untag_mask);
+
+void __flush_tlb_all(void);
+
+#define TLB_FLUSH_ALL -1UL
+#define TLB_GENERATION_INVALID 0
+
+void cr4_update_irqsoff(unsigned long set, unsigned long clear);
+unsigned long cr4_read_shadow(void);
-static inline void __native_flush_tlb(void)
+/* Set in this cpu's CR4. */
+static inline void cr4_set_bits_irqsoff(unsigned long mask)
{
- native_write_cr3(native_read_cr3());
+ cr4_update_irqsoff(mask, 0);
+}
+
+/* Clear in this cpu's CR4. */
+static inline void cr4_clear_bits_irqsoff(unsigned long mask)
+{
+ cr4_update_irqsoff(0, mask);
+}
+
+/* Set in this cpu's CR4. */
+static inline void cr4_set_bits(unsigned long mask)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ cr4_set_bits_irqsoff(mask);
+ local_irq_restore(flags);
}
-static inline void __native_flush_tlb_global(void)
+/* Clear in this cpu's CR4. */
+static inline void cr4_clear_bits(unsigned long mask)
{
unsigned long flags;
+
+ local_irq_save(flags);
+ cr4_clear_bits_irqsoff(mask);
+ local_irq_restore(flags);
+}
+
+#ifndef MODULE
+/*
+ * 6 because 6 should be plenty and struct tlb_state will fit in two cache
+ * lines.
+ */
+#define TLB_NR_DYN_ASIDS 6
+
+struct tlb_context {
+ u64 ctx_id;
+ u64 tlb_gen;
+};
+
+struct tlb_state {
+ /*
+ * cpu_tlbstate.loaded_mm should match CR3 whenever interrupts
+ * are on. This means that it may not match current->active_mm,
+ * which will contain the previous user mm when we're in lazy TLB
+ * mode even if we've already switched back to swapper_pg_dir.
+ *
+ * During switch_mm_irqs_off(), loaded_mm will be set to
+ * LOADED_MM_SWITCHING during the brief interrupts-off window
+ * when CR3 and loaded_mm would otherwise be inconsistent. This
+ * is for nmi_uaccess_okay()'s benefit.
+ */
+ struct mm_struct *loaded_mm;
+
+#define LOADED_MM_SWITCHING ((struct mm_struct *)1UL)
+
+ /* Last user mm for optimizing IBPB */
+ union {
+ struct mm_struct *last_user_mm;
+ unsigned long last_user_mm_spec;
+ };
+
+ u16 loaded_mm_asid;
+ u16 next_asid;
+
+ /*
+ * If set we changed the page tables in such a way that we
+ * needed an invalidation of all contexts (aka. PCIDs / ASIDs).
+ * This tells us to go invalidate all the non-loaded ctxs[]
+ * on the next context switch.
+ *
+ * The current ctx was kept up-to-date as it ran and does not
+ * need to be invalidated.
+ */
+ bool invalidate_other;
+
+#ifdef CONFIG_ADDRESS_MASKING
+ /*
+ * Active LAM mode.
+ *
+ * X86_CR3_LAM_U57/U48 shifted right by X86_CR3_LAM_U57_BIT or 0 if LAM
+ * disabled.
+ */
+ u8 lam;
+#endif
+
+ /*
+ * Mask that contains TLB_NR_DYN_ASIDS+1 bits to indicate
+ * the corresponding user PCID needs a flush next time we
+ * switch to it; see SWITCH_TO_USER_CR3.
+ */
+ unsigned short user_pcid_flush_mask;
+
+ /*
+ * Access to this CR4 shadow and to H/W CR4 is protected by
+ * disabling interrupts when modifying either one.
+ */
unsigned long cr4;
/*
- * Read-modify-write to CR4 - protect it from preemption and
- * from interrupts. (Use the raw variant because this code can
- * be called from deep inside debugging code.)
+ * This is a list of all contexts that might exist in the TLB.
+ * There is one per ASID that we use, and the ASID (what the
+ * CPU calls PCID) is the index into ctxts.
+ *
+ * For each context, ctx_id indicates which mm the TLB's user
+ * entries came from. As an invariant, the TLB will never
+ * contain entries that are out-of-date with respect to the
+ * tlb_gen recorded in the list for that mm.
+ *
+ * To be clear, this means that it's legal for the TLB code to
+ * flush the TLB without updating tlb_gen. This can happen
+ * (for now, at least) due to paravirt remote flushes.
+ *
+ * NB: context 0 is a bit special, since it's also used by
+ * various bits of init code. This is fine -- code that
+ * isn't aware of PCID will end up harmlessly flushing
+ * context 0.
*/
- raw_local_irq_save(flags);
+ struct tlb_context ctxs[TLB_NR_DYN_ASIDS];
+};
+DECLARE_PER_CPU_ALIGNED(struct tlb_state, cpu_tlbstate);
- cr4 = native_read_cr4();
- /* clear PGE */
- native_write_cr4(cr4 & ~X86_CR4_PGE);
- /* write old PGE again and flush TLBs */
- native_write_cr4(cr4);
+struct tlb_state_shared {
+ /*
+ * We can be in one of several states:
+ *
+ * - Actively using an mm. Our CPU's bit will be set in
+ * mm_cpumask(loaded_mm) and is_lazy == false;
+ *
+ * - Not using a real mm. loaded_mm == &init_mm. Our CPU's bit
+ * will not be set in mm_cpumask(&init_mm) and is_lazy == false.
+ *
+ * - Lazily using a real mm. loaded_mm != &init_mm, our bit
+ * is set in mm_cpumask(loaded_mm), but is_lazy == true.
+ * We're heuristically guessing that the CR3 load we
+ * skipped more than makes up for the overhead added by
+ * lazy mode.
+ */
+ bool is_lazy;
+};
+DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared);
- raw_local_irq_restore(flags);
-}
+bool nmi_uaccess_okay(void);
+#define nmi_uaccess_okay nmi_uaccess_okay
-static inline void __native_flush_tlb_single(unsigned long addr)
+/* Initialize cr4 shadow for this CPU. */
+static inline void cr4_init_shadow(void)
{
- asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+ this_cpu_write(cpu_tlbstate.cr4, __read_cr4());
}
-static inline void __flush_tlb_all(void)
-{
- if (cpu_has_pge)
- __flush_tlb_global();
- else
- __flush_tlb();
-}
+extern unsigned long mmu_cr4_features;
+extern u32 *trampoline_cr4_features;
-static inline void __flush_tlb_one(unsigned long addr)
-{
- if (cpu_has_invlpg)
- __flush_tlb_single(addr);
- else
- __flush_tlb();
-}
+/* How many pages can be invalidated with one INVLPGB. */
+extern u16 invlpgb_count_max;
-#ifdef CONFIG_X86_32
-# define TLB_FLUSH_ALL 0xffffffff
-#else
-# define TLB_FLUSH_ALL -1ULL
-#endif
+extern void initialize_tlbstate_and_flush(void);
/*
* TLB flushing:
*
- * - flush_tlb() flushes the current mm struct TLBs
* - flush_tlb_all() flushes all processes TLBs
* - flush_tlb_mm(mm) flushes the specified mm context TLB's
* - flush_tlb_page(vma, vmaddr) flushes one page
* - flush_tlb_range(vma, start, end) flushes a range of pages
* - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
- * - flush_tlb_others(cpumask, mm, va) flushes TLBs on other cpus
+ * - flush_tlb_multi(cpumask, info) flushes TLBs on multiple cpus
*
* ..but the i386 has somewhat limited tlb flushing capabilities,
* and page-granular flushes are available only on i486 and up.
- *
- * x86-64 can only flush individual pages or full VMs. For a range flush
- * we always do the full VM. Might be worth trying if for a small
- * range a few INVLPGs in a row are a win.
*/
+struct flush_tlb_info {
+ /*
+ * We support several kinds of flushes.
+ *
+ * - Fully flush a single mm. .mm will be set, .end will be
+ * TLB_FLUSH_ALL, and .new_tlb_gen will be the tlb_gen to
+ * which the IPI sender is trying to catch us up.
+ *
+ * - Partially flush a single mm. .mm will be set, .start and
+ * .end will indicate the range, and .new_tlb_gen will be set
+ * such that the changes between generation .new_tlb_gen-1 and
+ * .new_tlb_gen are entirely contained in the indicated range.
+ *
+ * - Fully flush all mms whose tlb_gens have been updated. .mm
+ * will be NULL, .end will be TLB_FLUSH_ALL, and .new_tlb_gen
+ * will be zero.
+ */
+ struct mm_struct *mm;
+ unsigned long start;
+ unsigned long end;
+ u64 new_tlb_gen;
+ unsigned int initiating_cpu;
+ u8 stride_shift;
+ u8 freed_tables;
+ u8 trim_cpumask;
+};
+
+void flush_tlb_local(void);
+void flush_tlb_one_user(unsigned long addr);
+void flush_tlb_one_kernel(unsigned long addr);
+void flush_tlb_multi(const struct cpumask *cpumask,
+ const struct flush_tlb_info *info);
-#ifndef CONFIG_SMP
+static inline bool is_dyn_asid(u16 asid)
+{
+ return asid < TLB_NR_DYN_ASIDS;
+}
-#define flush_tlb() __flush_tlb()
-#define flush_tlb_all() __flush_tlb_all()
-#define local_flush_tlb() __flush_tlb()
+static inline bool is_global_asid(u16 asid)
+{
+ return !is_dyn_asid(asid);
+}
-static inline void flush_tlb_mm(struct mm_struct *mm)
+#ifdef CONFIG_BROADCAST_TLB_FLUSH
+static inline u16 mm_global_asid(struct mm_struct *mm)
{
- if (mm == current->active_mm)
- __flush_tlb();
+ u16 asid;
+
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ return 0;
+
+ asid = smp_load_acquire(&mm->context.global_asid);
+
+ /* mm->context.global_asid is either 0, or a global ASID */
+ VM_WARN_ON_ONCE(asid && is_dyn_asid(asid));
+
+ return asid;
}
-static inline void flush_tlb_page(struct vm_area_struct *vma,
- unsigned long addr)
+static inline void mm_init_global_asid(struct mm_struct *mm)
{
- if (vma->vm_mm == current->active_mm)
- __flush_tlb_one(addr);
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
+ mm->context.global_asid = 0;
+ mm->context.asid_transition = false;
+ }
}
-static inline void flush_tlb_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
+static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid)
{
- if (vma->vm_mm == current->active_mm)
- __flush_tlb();
+ /*
+ * Notably flush_tlb_mm_range() -> broadcast_tlb_flush() ->
+ * finish_asid_transition() needs to observe asid_transition = true
+ * once it observes global_asid.
+ */
+ mm->context.asid_transition = true;
+ smp_store_release(&mm->context.global_asid, asid);
}
-static inline void native_flush_tlb_others(const struct cpumask *cpumask,
- struct mm_struct *mm,
- unsigned long va)
+static inline void mm_clear_asid_transition(struct mm_struct *mm)
{
+ WRITE_ONCE(mm->context.asid_transition, false);
}
-static inline void reset_lazy_tlbstate(void)
+static inline bool mm_in_asid_transition(struct mm_struct *mm)
{
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ return false;
+
+ return mm && READ_ONCE(mm->context.asid_transition);
}
+#else
+static inline u16 mm_global_asid(struct mm_struct *mm) { return 0; }
+static inline void mm_init_global_asid(struct mm_struct *mm) { }
+static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) { }
+static inline void mm_clear_asid_transition(struct mm_struct *mm) { }
+static inline bool mm_in_asid_transition(struct mm_struct *mm) { return false; }
+#endif /* CONFIG_BROADCAST_TLB_FLUSH */
-#else /* SMP */
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#endif
-#include <asm/smp.h>
+#define flush_tlb_mm(mm) \
+ flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL, true)
-#define local_flush_tlb() __flush_tlb()
+#define flush_tlb_range(vma, start, end) \
+ flush_tlb_mm_range((vma)->vm_mm, start, end, \
+ ((vma)->vm_flags & VM_HUGETLB) \
+ ? huge_page_shift(hstate_vma(vma)) \
+ : PAGE_SHIFT, true)
extern void flush_tlb_all(void);
-extern void flush_tlb_current_task(void);
-extern void flush_tlb_mm(struct mm_struct *);
-extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
+extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end, unsigned int stride_shift,
+ bool freed_tables);
+extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
+
+static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
+{
+ flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false);
+}
+
+static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm)
+{
+ bool should_defer = false;
+
+ /* If remote CPUs need to be flushed, batch the flush and defer it */
+ if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
+ should_defer = true;
+ put_cpu();
-#define flush_tlb() flush_tlb_current_task()
+ return should_defer;
+}
-static inline void flush_tlb_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
+static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
{
- flush_tlb_mm(vma->vm_mm);
+ /*
+ * Bump the generation count. This also serves as a full barrier
+ * that synchronizes with switch_mm(): callers are required to order
+ * their read of mm_cpumask after their writes to the paging
+ * structures.
+ */
+ return atomic64_inc_return(&mm->context.tlb_gen);
}
-void native_flush_tlb_others(const struct cpumask *cpumask,
- struct mm_struct *mm, unsigned long va);
+static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
+ struct mm_struct *mm, unsigned long start, unsigned long end)
+{
+ inc_mm_tlb_gen(mm);
+ cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
+ batch->unmapped_pages = true;
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
+}
-#define TLBSTATE_OK 1
-#define TLBSTATE_LAZY 2
+extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
-struct tlb_state {
- struct mm_struct *active_mm;
- int state;
-};
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
+static inline bool pte_flags_need_flush(unsigned long oldflags,
+ unsigned long newflags,
+ bool ignore_access)
+{
+ /*
+ * Flags that require a flush when cleared but not when they are set.
+ * Only include flags that would not trigger spurious page-faults.
+ * Non-present entries are not cached. Hardware would set the
+ * dirty/access bit if needed without a fault.
+ */
+ const pteval_t flush_on_clear = _PAGE_DIRTY | _PAGE_PRESENT |
+ _PAGE_ACCESSED;
+ const pteval_t software_flags = _PAGE_SOFTW1 | _PAGE_SOFTW2 |
+ _PAGE_SOFTW3 | _PAGE_SOFTW4 |
+ _PAGE_SAVED_DIRTY;
+ const pteval_t flush_on_change = _PAGE_RW | _PAGE_USER | _PAGE_PWT |
+ _PAGE_PCD | _PAGE_PSE | _PAGE_GLOBAL | _PAGE_PAT |
+ _PAGE_PAT_LARGE | _PAGE_PKEY_BIT0 | _PAGE_PKEY_BIT1 |
+ _PAGE_PKEY_BIT2 | _PAGE_PKEY_BIT3 | _PAGE_NX;
+ unsigned long diff = oldflags ^ newflags;
+
+ BUILD_BUG_ON(flush_on_clear & software_flags);
+ BUILD_BUG_ON(flush_on_clear & flush_on_change);
+ BUILD_BUG_ON(flush_on_change & software_flags);
+
+ /* Ignore software flags */
+ diff &= ~software_flags;
+
+ if (ignore_access)
+ diff &= ~_PAGE_ACCESSED;
+
+ /*
+ * Were any of the 'flush_on_clear' flags cleared between
+ * 'oldflags' and 'newflags'?
+ */
+ if (diff & oldflags & flush_on_clear)
+ return true;
+
+ /* Flush on modified flags. */
+ if (diff & flush_on_change)
+ return true;
-static inline void reset_lazy_tlbstate(void)
+ /* Ensure there are no flags that were left behind */
+ if (IS_ENABLED(CONFIG_DEBUG_VM) &&
+ (diff & ~(flush_on_clear | software_flags | flush_on_change))) {
+ VM_WARN_ON_ONCE(1);
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * pte_needs_flush() checks whether permissions were demoted and require a
+ * flush. It should only be used for userspace PTEs.
+ */
+static inline bool pte_needs_flush(pte_t oldpte, pte_t newpte)
{
- percpu_write(cpu_tlbstate.state, 0);
- percpu_write(cpu_tlbstate.active_mm, &init_mm);
+ /* !PRESENT -> * ; no need for flush */
+ if (!(pte_flags(oldpte) & _PAGE_PRESENT))
+ return false;
+
+ /* PFN changed ; needs flush */
+ if (pte_pfn(oldpte) != pte_pfn(newpte))
+ return true;
+
+ /*
+ * check PTE flags; ignore access-bit; see comment in
+ * ptep_clear_flush_young().
+ */
+ return pte_flags_need_flush(pte_flags(oldpte), pte_flags(newpte),
+ true);
}
+#define pte_needs_flush pte_needs_flush
+
+/*
+ * huge_pmd_needs_flush() checks whether permissions were demoted and require a
+ * flush. It should only be used for userspace huge PMDs.
+ */
+static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd)
+{
+ /* !PRESENT -> * ; no need for flush */
+ if (!(pmd_flags(oldpmd) & _PAGE_PRESENT))
+ return false;
-#endif /* SMP */
+ /* PFN changed ; needs flush */
+ if (pmd_pfn(oldpmd) != pmd_pfn(newpmd))
+ return true;
-#ifndef CONFIG_PARAVIRT
-#define flush_tlb_others(mask, mm, va) native_flush_tlb_others(mask, mm, va)
-#endif
+ /*
+ * check PMD flags; do not ignore access-bit; see
+ * pmdp_clear_flush_young().
+ */
+ return pte_flags_need_flush(pmd_flags(oldpmd), pmd_flags(newpmd),
+ false);
+}
+#define huge_pmd_needs_flush huge_pmd_needs_flush
+
+#ifdef CONFIG_ADDRESS_MASKING
+static inline u64 tlbstate_lam_cr3_mask(void)
+{
+ u64 lam = this_cpu_read(cpu_tlbstate.lam);
+
+ return lam << X86_CR3_LAM_U57_BIT;
+}
+
+static inline void cpu_tlbstate_update_lam(unsigned long lam, u64 untag_mask)
+{
+ this_cpu_write(cpu_tlbstate.lam, lam >> X86_CR3_LAM_U57_BIT);
+ this_cpu_write(tlbstate_untag_mask, untag_mask);
+}
-static inline void flush_tlb_kernel_range(unsigned long start,
- unsigned long end)
+#else
+
+static inline u64 tlbstate_lam_cr3_mask(void)
{
- flush_tlb_all();
+ return 0;
}
-extern void zap_low_mappings(bool early);
+static inline void cpu_tlbstate_update_lam(unsigned long lam, u64 untag_mask)
+{
+}
+#endif
+#endif /* !MODULE */
+static inline void __native_tlb_flush_global(unsigned long cr4)
+{
+ native_write_cr4(cr4 ^ X86_CR4_PGE);
+ native_write_cr4(cr4);
+}
#endif /* _ASM_X86_TLBFLUSH_H */
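pte_flags_need_flush() above splits the PTE bits into three sets: flags that require a flush when cleared, flags that require a flush when changed in either direction, and software bits that never do. A simplified user-space sketch of that decision follows; the bit positions are invented for the example (the real _PAGE_* values come from pgtable_types.h) and the ignore_access handling is omitted.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative bit assignments, not the real _PAGE_* values. */
#define F_PRESENT  (1ul << 0)
#define F_RW       (1ul << 1)
#define F_USER     (1ul << 2)
#define F_ACCESSED (1ul << 5)
#define F_DIRTY    (1ul << 6)
#define F_SOFTW1   (1ul << 9)

/* Same shape of decision as pte_flags_need_flush() above. */
static bool flags_need_flush(unsigned long oldflags, unsigned long newflags)
{
	const unsigned long flush_on_clear  = F_DIRTY | F_PRESENT | F_ACCESSED;
	const unsigned long software_flags  = F_SOFTW1;
	const unsigned long flush_on_change = F_RW | F_USER;
	unsigned long diff = (oldflags ^ newflags) & ~software_flags;

	if (diff & oldflags & flush_on_clear)	/* a "clear" flag went away */
		return true;
	return diff & flush_on_change;		/* a "change" flag toggled  */
}

int main(void)
{
	unsigned long pte = F_PRESENT | F_RW | F_USER | F_ACCESSED | F_DIRTY;

	/* Write-protecting a mapped page demotes permissions: flush (1). */
	printf("%d\n", flags_need_flush(pte, pte & ~F_RW));
	/* Touching only a software bit: no flush needed (0). */
	printf("%d\n", flags_need_flush(pte, pte | F_SOFTW1));
	return 0;
}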
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 066ef590d7e0..1fadf0cf520c 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -25,56 +25,33 @@
#ifndef _ASM_X86_TOPOLOGY_H
#define _ASM_X86_TOPOLOGY_H
-#ifdef CONFIG_X86_32
-# ifdef CONFIG_X86_HT
-# define ENABLE_TOPO_DEFINES
-# endif
-#else
-# ifdef CONFIG_SMP
-# define ENABLE_TOPO_DEFINES
-# endif
-#endif
-
-/* Node not present */
-#define NUMA_NO_NODE (-1)
-
-#ifdef CONFIG_NUMA
+/*
+ * The NUMA_NO_NODE definition moved from here to <linux/numa.h>
+ * to preserve its visibility; it may be used independently of
+ * CONFIG_NUMA.
+ */
+#include <linux/numa.h>
#include <linux/cpumask.h>
-#include <asm/mpspec.h>
-#ifdef CONFIG_X86_32
-
-/* Mappings between logical cpu number and node number */
-extern int cpu_to_node_map[];
-
-/* Returns the number of the node containing CPU 'cpu' */
-static inline int cpu_to_node(int cpu)
-{
- return cpu_to_node_map[cpu];
-}
-#define early_cpu_to_node(cpu) cpu_to_node(cpu)
+#ifdef CONFIG_NUMA
-#else /* CONFIG_X86_64 */
+#include <asm/mpspec.h>
+#include <asm/percpu.h>
/* Mappings between logical cpu number and node number */
DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map);
-/* Returns the number of the current Node. */
-DECLARE_PER_CPU(int, node_number);
-#define numa_node_id() percpu_read(node_number)
-
#ifdef CONFIG_DEBUG_PER_CPU_MAPS
-extern int cpu_to_node(int cpu);
+/*
+ * override generic percpu implementation of cpu_to_node
+ */
+extern int __cpu_to_node(int cpu);
+#define cpu_to_node __cpu_to_node
+
extern int early_cpu_to_node(int cpu);
#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
-/* Returns the number of the node containing CPU 'cpu' */
-static inline int cpu_to_node(int cpu)
-{
- return per_cpu(x86_cpu_to_node_map, cpu);
-}
-
/* Same function but used if called before per_cpu areas are setup */
static inline int early_cpu_to_node(int cpu)
{
@@ -83,8 +60,6 @@ static inline int early_cpu_to_node(int cpu)
#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
-#endif /* CONFIG_X86_64 */
-
/* Mappings between node number and cpus on that node. */
extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
@@ -100,125 +75,256 @@ static inline const struct cpumask *cpumask_of_node(int node)
extern void setup_node_to_cpumask_map(void);
+#define pcibus_to_node(bus) __pcibus_to_node(bus)
+
+extern int __node_distance(int, int);
+#define node_distance(a, b) __node_distance(a, b)
+
+#else /* !CONFIG_NUMA */
+
+static inline int numa_node_id(void)
+{
+ return 0;
+}
/*
- * Returns the number of the node containing Node 'node'. This
- * architecture is flat, so it is a pretty simple function!
+ * indicate override:
*/
-#define parent_node(node) (node)
+#define numa_node_id numa_node_id
-#define pcibus_to_node(bus) __pcibus_to_node(bus)
+static inline int early_cpu_to_node(int cpu)
+{
+ return 0;
+}
-#ifdef CONFIG_X86_32
-extern unsigned long node_start_pfn[];
-extern unsigned long node_end_pfn[];
-extern unsigned long node_remap_size[];
-#define node_has_online_mem(nid) (node_start_pfn[nid] != node_end_pfn[nid])
+static inline void setup_node_to_cpumask_map(void) { }
-# define SD_CACHE_NICE_TRIES 1
-# define SD_IDLE_IDX 1
-# define SD_NEWIDLE_IDX 2
-# define SD_FORKEXEC_IDX 0
+#endif
-#else
+#include <asm-generic/topology.h>
-# define SD_CACHE_NICE_TRIES 2
-# define SD_IDLE_IDX 2
-# define SD_NEWIDLE_IDX 2
-# define SD_FORKEXEC_IDX 1
+/* Topology information */
+enum x86_topology_domains {
+ TOPO_SMT_DOMAIN,
+ TOPO_CORE_DOMAIN,
+ TOPO_MODULE_DOMAIN,
+ TOPO_TILE_DOMAIN,
+ TOPO_DIE_DOMAIN,
+ TOPO_DIEGRP_DOMAIN,
+ TOPO_PKG_DOMAIN,
+ TOPO_MAX_DOMAIN,
+};
+
+enum x86_topology_cpu_type {
+ TOPO_CPU_TYPE_PERFORMANCE,
+ TOPO_CPU_TYPE_EFFICIENCY,
+ TOPO_CPU_TYPE_UNKNOWN,
+};
+
+struct x86_topology_system {
+ unsigned int dom_shifts[TOPO_MAX_DOMAIN];
+ unsigned int dom_size[TOPO_MAX_DOMAIN];
+};
+
+extern struct x86_topology_system x86_topo_system;
+
+static inline unsigned int topology_get_domain_size(enum x86_topology_domains dom)
+{
+ return x86_topo_system.dom_size[dom];
+}
-#endif
+static inline unsigned int topology_get_domain_shift(enum x86_topology_domains dom)
+{
+ return dom == TOPO_SMT_DOMAIN ? 0 : x86_topo_system.dom_shifts[dom - 1];
+}
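
The two accessors above expose the shift and width of each topology domain as derived
from CPUID enumeration. As an illustration only (not part of this patch; the helper
name is invented), they could be combined to extract a CPU's index within a given
domain from its APIC ID:

/* Hypothetical helper: drop the bits owned by lower domains, then wrap
 * the remainder to the size of the requested domain. */
static inline unsigned int topo_relative_index(u32 apicid,
					       enum x86_topology_domains dom)
{
	return (apicid >> topology_get_domain_shift(dom)) %
	       topology_get_domain_size(dom);
}
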
-/* sched_domains SD_NODE_INIT for NUMA machines */
-#define SD_NODE_INIT (struct sched_domain) { \
- .min_interval = 8, \
- .max_interval = 32, \
- .busy_factor = 32, \
- .imbalance_pct = 125, \
- .cache_nice_tries = SD_CACHE_NICE_TRIES, \
- .busy_idx = 3, \
- .idle_idx = SD_IDLE_IDX, \
- .newidle_idx = SD_NEWIDLE_IDX, \
- .wake_idx = 1, \
- .forkexec_idx = SD_FORKEXEC_IDX, \
- .flags = SD_LOAD_BALANCE \
- | SD_BALANCE_EXEC \
- | SD_BALANCE_FORK \
- | SD_WAKE_AFFINE \
- | SD_WAKE_BALANCE \
- | SD_SERIALIZE, \
- .last_balance = jiffies, \
- .balance_interval = 1, \
-}
-
-#ifdef CONFIG_X86_64_ACPI_NUMA
-extern int __node_distance(int, int);
-#define node_distance(a, b) __node_distance(a, b)
-#endif
+extern const struct cpumask *cpu_coregroup_mask(int cpu);
+extern const struct cpumask *cpu_clustergroup_mask(int cpu);
-#else /* !CONFIG_NUMA */
+#define topology_logical_package_id(cpu) (cpu_data(cpu).topo.logical_pkg_id)
+#define topology_physical_package_id(cpu) (cpu_data(cpu).topo.pkg_id)
+#define topology_logical_die_id(cpu) (cpu_data(cpu).topo.logical_die_id)
+#define topology_logical_core_id(cpu) (cpu_data(cpu).topo.logical_core_id)
+#define topology_die_id(cpu) (cpu_data(cpu).topo.die_id)
+#define topology_core_id(cpu) (cpu_data(cpu).topo.core_id)
+#define topology_ppin(cpu) (cpu_data(cpu).ppin)
-static inline int numa_node_id(void)
+#define topology_amd_node_id(cpu) (cpu_data(cpu).topo.amd_node_id)
+
+extern unsigned int __max_dies_per_package;
+extern unsigned int __max_logical_packages;
+extern unsigned int __max_threads_per_core;
+extern unsigned int __num_threads_per_package;
+extern unsigned int __num_cores_per_package;
+
+const char *get_topology_cpu_type_name(struct cpuinfo_x86 *c);
+enum x86_topology_cpu_type get_topology_cpu_type(struct cpuinfo_x86 *c);
+
+static inline unsigned int topology_max_packages(void)
{
- return 0;
+ return __max_logical_packages;
}
-static inline int cpu_to_node(int cpu)
+static inline unsigned int topology_max_dies_per_package(void)
{
- return 0;
+ return __max_dies_per_package;
}
-static inline int early_cpu_to_node(int cpu)
+static inline unsigned int topology_num_cores_per_package(void)
+{
+ return __num_cores_per_package;
+}
+
+static inline unsigned int topology_num_threads_per_package(void)
+{
+ return __num_threads_per_package;
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+int topology_get_logical_id(u32 apicid, enum x86_topology_domains at_level);
+#else
+static inline int topology_get_logical_id(u32 apicid, enum x86_topology_domains at_level)
{
return 0;
}
+#endif
-static inline const struct cpumask *cpumask_of_node(int node)
+#ifdef CONFIG_SMP
+#define topology_cluster_id(cpu) (cpu_data(cpu).topo.l2c_id)
+#define topology_die_cpumask(cpu) (per_cpu(cpu_die_map, cpu))
+#define topology_cluster_cpumask(cpu) (cpu_clustergroup_mask(cpu))
+#define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu))
+#define topology_sibling_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu))
+
+
+static inline int topology_phys_to_logical_pkg(unsigned int pkg)
{
- return cpu_online_mask;
+ return topology_get_logical_id(pkg << x86_topo_system.dom_shifts[TOPO_PKG_DOMAIN],
+ TOPO_PKG_DOMAIN);
}
-static inline void setup_node_to_cpumask_map(void) { }
+extern int __max_smt_threads;
-#endif
+static inline int topology_max_smt_threads(void)
+{
+ return __max_smt_threads;
+}
-#include <asm-generic/topology.h>
+#include <linux/cpu_smt.h>
-extern const struct cpumask *cpu_coregroup_mask(int cpu);
+extern unsigned int __amd_nodes_per_pkg;
-#ifdef ENABLE_TOPO_DEFINES
-#define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id)
-#define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id)
-#define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu))
-#define topology_thread_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu))
+static inline unsigned int topology_amd_nodes_per_pkg(void)
+{
+ return __amd_nodes_per_pkg;
+}
-/* indicates that pointers to the topology cpumask_t maps are valid */
-#define arch_provides_topology_pointers yes
-#endif
+#else /* CONFIG_SMP */
+static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; }
+static inline int topology_max_smt_threads(void) { return 1; }
+static inline unsigned int topology_amd_nodes_per_pkg(void) { return 1; }
+#endif /* !CONFIG_SMP */
+
+extern struct cpumask __cpu_primary_thread_mask;
+#define cpu_primary_thread_mask ((const struct cpumask *)&__cpu_primary_thread_mask)
+
+/**
+ * topology_is_primary_thread - Check whether CPU is the primary SMT thread
+ * @cpu: CPU to check
+ */
+static inline bool topology_is_primary_thread(unsigned int cpu)
+{
+ return cpumask_test_cpu(cpu, cpu_primary_thread_mask);
+}
+#define topology_is_primary_thread topology_is_primary_thread
+
+int topology_get_primary_thread(unsigned int cpu);
+
+static inline bool topology_is_core_online(unsigned int cpu)
+{
+ int pcpu = topology_get_primary_thread(cpu);
+
+ return pcpu >= 0 ? cpu_online(pcpu) : false;
+}
+#define topology_is_core_online topology_is_core_online
static inline void arch_fix_phys_package_id(int num, u32 slot)
{
}
struct pci_bus;
-void x86_pci_root_bus_res_quirks(struct pci_bus *b);
+int x86_pci_root_bus_node(int bus);
+void x86_pci_root_bus_resources(int bus, struct list_head *resources);
-#ifdef CONFIG_SMP
-#define mc_capable() ((boot_cpu_data.x86_max_cores > 1) && \
- (cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids))
-#define smt_capable() (smp_num_siblings > 1)
-#endif
+extern bool x86_topology_update;
-#ifdef CONFIG_NUMA
-extern int get_mp_bus_to_node(int busnum);
-extern void set_mp_bus_to_node(int busnum, int node);
-#else
-static inline int get_mp_bus_to_node(int busnum)
+#ifdef CONFIG_SCHED_MC_PRIO
+#include <asm/percpu.h>
+
+DECLARE_PER_CPU_READ_MOSTLY(int, sched_core_priority);
+extern bool __read_mostly sysctl_sched_itmt_enabled;
+
+/* Interface to set priority of a cpu */
+void sched_set_itmt_core_prio(int prio, int core_cpu);
+
+/* Interface to notify scheduler that system supports ITMT */
+int sched_set_itmt_support(void);
+
+/* Interface to notify scheduler that system revokes ITMT support */
+void sched_clear_itmt_support(void);
+
+#else /* CONFIG_SCHED_MC_PRIO */
+
+#define sysctl_sched_itmt_enabled false
+static inline void sched_set_itmt_core_prio(int prio, int core_cpu)
+{
+}
+static inline int sched_set_itmt_support(void)
{
return 0;
}
-static inline void set_mp_bus_to_node(int busnum, int node)
+static inline void sched_clear_itmt_support(void)
+{
+}
+#endif /* CONFIG_SCHED_MC_PRIO */
+
+#if defined(CONFIG_SMP) && defined(CONFIG_X86_64)
+#include <asm/cpufeature.h>
+
+DECLARE_STATIC_KEY_FALSE(arch_scale_freq_key);
+
+#define arch_scale_freq_invariant() static_branch_likely(&arch_scale_freq_key)
+
+DECLARE_PER_CPU(unsigned long, arch_freq_scale);
+
+static inline long arch_scale_freq_capacity(int cpu)
{
+ return per_cpu(arch_freq_scale, cpu);
}
+#define arch_scale_freq_capacity arch_scale_freq_capacity
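
arch_scale_freq_capacity() hands the scheduler a per-CPU frequency scale factor. A
hedged sketch of how such a factor is typically applied, assuming the scheduler's
SCHED_CAPACITY_SHIFT fixed-point shift (the helper name is invented; the real
scheduler code differs):

/* Hypothetical: make a raw utilization sample frequency invariant by
 * multiplying in the per-CPU scale factor (fixed point, shift of
 * SCHED_CAPACITY_SHIFT). */
static unsigned long freq_invariant_util(unsigned long raw_util, int cpu)
{
	return (raw_util * arch_scale_freq_capacity(cpu)) >> SCHED_CAPACITY_SHIFT;
}
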
+
+bool arch_enable_hybrid_capacity_scale(void);
+void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap,
+ unsigned long cap_freq, unsigned long base_freq);
+
+unsigned long arch_scale_cpu_capacity(int cpu);
+#define arch_scale_cpu_capacity arch_scale_cpu_capacity
+
+extern void arch_set_max_freq_ratio(bool turbo_disabled);
+extern void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled);
+#else
+static inline bool arch_enable_hybrid_capacity_scale(void) { return false; }
+static inline void arch_set_cpu_capacity(int cpu, unsigned long cap,
+ unsigned long max_cap,
+ unsigned long cap_freq,
+ unsigned long base_freq) { }
+
+static inline void arch_set_max_freq_ratio(bool turbo_disabled) { }
+static inline void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled) { }
#endif
+extern void arch_scale_freq_tick(void);
+#define arch_scale_freq_tick arch_scale_freq_tick
+
+extern int arch_sched_node_distance(int from, int to);
+
#endif /* _ASM_X86_TOPOLOGY_H */
diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h
new file mode 100644
index 000000000000..721b408d9a67
--- /dev/null
+++ b/arch/x86/include/asm/trace/fpu.h
@@ -0,0 +1,79 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM x86_fpu
+
+#if !defined(_TRACE_FPU_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_FPU_H
+
+#include <linux/tracepoint.h>
+
+DECLARE_EVENT_CLASS(x86_fpu,
+ TP_PROTO(struct fpu *fpu),
+ TP_ARGS(fpu),
+
+ TP_STRUCT__entry(
+ __field(struct fpu *, fpu)
+ __field(bool, load_fpu)
+ __field(u64, xfeatures)
+ __field(u64, xcomp_bv)
+ ),
+
+ TP_fast_assign(
+ __entry->fpu = fpu;
+ __entry->load_fpu = test_thread_flag(TIF_NEED_FPU_LOAD);
+ if (boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+ __entry->xfeatures = fpu->fpstate->regs.xsave.header.xfeatures;
+ __entry->xcomp_bv = fpu->fpstate->regs.xsave.header.xcomp_bv;
+ }
+ ),
+ TP_printk("x86/fpu: %p load: %d xfeatures: %llx xcomp_bv: %llx",
+ __entry->fpu,
+ __entry->load_fpu,
+ __entry->xfeatures,
+ __entry->xcomp_bv
+ )
+);
+
+DEFINE_EVENT(x86_fpu, x86_fpu_before_save,
+ TP_PROTO(struct fpu *fpu),
+ TP_ARGS(fpu)
+);
+
+DEFINE_EVENT(x86_fpu, x86_fpu_after_save,
+ TP_PROTO(struct fpu *fpu),
+ TP_ARGS(fpu)
+);
+
+DEFINE_EVENT(x86_fpu, x86_fpu_regs_activated,
+ TP_PROTO(struct fpu *fpu),
+ TP_ARGS(fpu)
+);
+
+DEFINE_EVENT(x86_fpu, x86_fpu_regs_deactivated,
+ TP_PROTO(struct fpu *fpu),
+ TP_ARGS(fpu)
+);
+
+DEFINE_EVENT(x86_fpu, x86_fpu_dropped,
+ TP_PROTO(struct fpu *fpu),
+ TP_ARGS(fpu)
+);
+
+DEFINE_EVENT(x86_fpu, x86_fpu_copy_dst,
+ TP_PROTO(struct fpu *fpu),
+ TP_ARGS(fpu)
+);
+
+DEFINE_EVENT(x86_fpu, x86_fpu_xstate_check_failed,
+ TP_PROTO(struct fpu *fpu),
+ TP_ARGS(fpu)
+);
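
Each DEFINE_EVENT() above generates a trace_<name>() hook. A hypothetical call site
bracketing an FPU register save might look like this (illustrative only; the real
call sites are in the FPU core code):

/* Hypothetical caller: emit the paired before/after events around the
 * actual register save. */
static void fpu_save_example(struct fpu *fpu)
{
	trace_x86_fpu_before_save(fpu);
	/* ... save the FPU register state ... */
	trace_x86_fpu_after_save(fpu);
}
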
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH asm/trace/
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE fpu
+#endif /* _TRACE_FPU_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/x86/include/asm/trace/hyperv.h b/arch/x86/include/asm/trace/hyperv.h
new file mode 100644
index 000000000000..a8e5a7a2b460
--- /dev/null
+++ b/arch/x86/include/asm/trace/hyperv.h
@@ -0,0 +1,98 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hyperv
+
+#if !defined(_TRACE_HYPERV_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_HYPERV_H
+
+#include <linux/tracepoint.h>
+
+#if IS_ENABLED(CONFIG_HYPERV)
+
+TRACE_EVENT(hyperv_mmu_flush_tlb_multi,
+ TP_PROTO(const struct cpumask *cpus,
+ const struct flush_tlb_info *info),
+ TP_ARGS(cpus, info),
+ TP_STRUCT__entry(
+ __field(unsigned int, ncpus)
+ __field(struct mm_struct *, mm)
+ __field(unsigned long, addr)
+ __field(unsigned long, end)
+ ),
+ TP_fast_assign(__entry->ncpus = cpumask_weight(cpus);
+ __entry->mm = info->mm;
+ __entry->addr = info->start;
+ __entry->end = info->end;
+ ),
+ TP_printk("ncpus %d mm %p addr %lx, end %lx",
+ __entry->ncpus, __entry->mm,
+ __entry->addr, __entry->end)
+ );
+
+TRACE_EVENT(hyperv_nested_flush_guest_mapping,
+ TP_PROTO(u64 as, int ret),
+ TP_ARGS(as, ret),
+
+ TP_STRUCT__entry(
+ __field(u64, as)
+ __field(int, ret)
+ ),
+ TP_fast_assign(__entry->as = as;
+ __entry->ret = ret;
+ ),
+ TP_printk("address space %llx ret %d", __entry->as, __entry->ret)
+ );
+
+TRACE_EVENT(hyperv_nested_flush_guest_mapping_range,
+ TP_PROTO(u64 as, int ret),
+ TP_ARGS(as, ret),
+
+ TP_STRUCT__entry(
+ __field(u64, as)
+ __field(int, ret)
+ ),
+ TP_fast_assign(__entry->as = as;
+ __entry->ret = ret;
+ ),
+ TP_printk("address space %llx ret %d", __entry->as, __entry->ret)
+ );
+
+TRACE_EVENT(hyperv_send_ipi_mask,
+ TP_PROTO(const struct cpumask *cpus,
+ int vector),
+ TP_ARGS(cpus, vector),
+ TP_STRUCT__entry(
+ __field(unsigned int, ncpus)
+ __field(int, vector)
+ ),
+ TP_fast_assign(__entry->ncpus = cpumask_weight(cpus);
+ __entry->vector = vector;
+ ),
+ TP_printk("ncpus %d vector %x",
+ __entry->ncpus, __entry->vector)
+ );
+
+TRACE_EVENT(hyperv_send_ipi_one,
+ TP_PROTO(int cpu,
+ int vector),
+ TP_ARGS(cpu, vector),
+ TP_STRUCT__entry(
+ __field(int, cpu)
+ __field(int, vector)
+ ),
+ TP_fast_assign(__entry->cpu = cpu;
+ __entry->vector = vector;
+ ),
+ TP_printk("cpu %d vector %x",
+ __entry->cpu, __entry->vector)
+ );
+
+#endif /* CONFIG_HYPERV */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH asm/trace/
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE hyperv
+#endif /* _TRACE_HYPERV_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h
new file mode 100644
index 000000000000..7408bebdfde0
--- /dev/null
+++ b/arch/x86/include/asm/trace/irq_vectors.h
@@ -0,0 +1,382 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM irq_vectors
+
+#if !defined(_TRACE_IRQ_VECTORS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_IRQ_VECTORS_H
+
+#include <linux/tracepoint.h>
+
+#ifdef CONFIG_X86_LOCAL_APIC
+
+DECLARE_EVENT_CLASS(x86_irq_vector,
+
+ TP_PROTO(int vector),
+
+ TP_ARGS(vector),
+
+ TP_STRUCT__entry(
+ __field( int, vector )
+ ),
+
+ TP_fast_assign(
+ __entry->vector = vector;
+ ),
+
+ TP_printk("vector=%d", __entry->vector) );
+
+#define DEFINE_IRQ_VECTOR_EVENT(name) \
+DEFINE_EVENT_FN(x86_irq_vector, name##_entry, \
+ TP_PROTO(int vector), \
+ TP_ARGS(vector), NULL, NULL); \
+DEFINE_EVENT_FN(x86_irq_vector, name##_exit, \
+ TP_PROTO(int vector), \
+ TP_ARGS(vector), NULL, NULL);
+
+/*
+ * local_timer - called when entering/exiting a local timer interrupt
+ * vector handler
+ */
+DEFINE_IRQ_VECTOR_EVENT(local_timer);
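
Each DEFINE_IRQ_VECTOR_EVENT(name) expands to a name_entry/name_exit pair. A
hypothetical handler skeleton using the local_timer pair (illustrative only, not
the real APIC timer handler):

/* Hypothetical skeleton: bracket the interrupt work with the generated
 * entry/exit tracepoints. */
static void example_local_timer_handler(void)
{
	trace_local_timer_entry(LOCAL_TIMER_VECTOR);
	/* ... handle the local APIC timer interrupt ... */
	trace_local_timer_exit(LOCAL_TIMER_VECTOR);
}
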
+
+/*
+ * spurious_apic - called when entering/exiting a spurious apic vector handler
+ */
+DEFINE_IRQ_VECTOR_EVENT(spurious_apic);
+
+/*
+ * error_apic - called when entering/exiting an error apic vector handler
+ */
+DEFINE_IRQ_VECTOR_EVENT(error_apic);
+
+/*
+ * x86_platform_ipi - called when entering/exiting an x86 platform ipi interrupt

+ * vector handler
+ */
+DEFINE_IRQ_VECTOR_EVENT(x86_platform_ipi);
+
+#ifdef CONFIG_IRQ_WORK
+/*
+ * irq_work - called when entering/exiting an irq work interrupt
+ * vector handler
+ */
+DEFINE_IRQ_VECTOR_EVENT(irq_work);
+
+/*
+ * We must dis-allow sampling irq_work_exit() because perf event sampling
+ * itself can cause irq_work, which would lead to an infinite loop;
+ *
+ * 1) irq_work_exit happens
+ * 2) generates perf sample
+ * 3) generates irq_work
+ * 4) goto 1
+ */
+TRACE_EVENT_PERF_PERM(irq_work_exit, is_sampling_event(p_event) ? -EPERM : 0);
+#endif
+
+/*
+ * The ifdef is required because that tracepoint macro hell emits tracepoint
+ * code in files which include this header even if the tracepoint is not
+ * enabled. Brilliant stuff that.
+ */
+#ifdef CONFIG_SMP
+/*
+ * reschedule - called when entering/exiting a reschedule vector handler
+ */
+DEFINE_IRQ_VECTOR_EVENT(reschedule);
+
+/*
+ * call_function - called when entering/exiting a call function interrupt
+ * vector handler
+ */
+DEFINE_IRQ_VECTOR_EVENT(call_function);
+
+/*
+ * call_function_single - called when entering/exiting a call function
+ * single interrupt vector handler
+ */
+DEFINE_IRQ_VECTOR_EVENT(call_function_single);
+#endif
+
+#ifdef CONFIG_X86_MCE_THRESHOLD
+/*
+ * threshold_apic - called when entering/exiting a threshold apic interrupt
+ * vector handler
+ */
+DEFINE_IRQ_VECTOR_EVENT(threshold_apic);
+#endif
+
+#ifdef CONFIG_X86_MCE_AMD
+/*
+ * deferred_error_apic - called when entering/exiting a deferred apic interrupt
+ * vector handler
+ */
+DEFINE_IRQ_VECTOR_EVENT(deferred_error_apic);
+#endif
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+/*
+ * thermal_apic - called when entering/exiting a thermal apic interrupt
+ * vector handler
+ */
+DEFINE_IRQ_VECTOR_EVENT(thermal_apic);
+#endif
+
+TRACE_EVENT(vector_config,
+
+ TP_PROTO(unsigned int irq, unsigned int vector,
+ unsigned int cpu, unsigned int apicdest),
+
+ TP_ARGS(irq, vector, cpu, apicdest),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, irq )
+ __field( unsigned int, vector )
+ __field( unsigned int, cpu )
+ __field( unsigned int, apicdest )
+ ),
+
+ TP_fast_assign(
+ __entry->irq = irq;
+ __entry->vector = vector;
+ __entry->cpu = cpu;
+ __entry->apicdest = apicdest;
+ ),
+
+ TP_printk("irq=%u vector=%u cpu=%u apicdest=0x%08x",
+ __entry->irq, __entry->vector, __entry->cpu,
+ __entry->apicdest)
+);
+
+DECLARE_EVENT_CLASS(vector_mod,
+
+ TP_PROTO(unsigned int irq, unsigned int vector,
+ unsigned int cpu, unsigned int prev_vector,
+ unsigned int prev_cpu),
+
+ TP_ARGS(irq, vector, cpu, prev_vector, prev_cpu),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, irq )
+ __field( unsigned int, vector )
+ __field( unsigned int, cpu )
+ __field( unsigned int, prev_vector )
+ __field( unsigned int, prev_cpu )
+ ),
+
+ TP_fast_assign(
+ __entry->irq = irq;
+ __entry->vector = vector;
+ __entry->cpu = cpu;
+ __entry->prev_vector = prev_vector;
+ __entry->prev_cpu = prev_cpu;
+
+ ),
+
+ TP_printk("irq=%u vector=%u cpu=%u prev_vector=%u prev_cpu=%u",
+ __entry->irq, __entry->vector, __entry->cpu,
+ __entry->prev_vector, __entry->prev_cpu)
+);
+
+#define DEFINE_IRQ_VECTOR_MOD_EVENT(name) \
+DEFINE_EVENT_FN(vector_mod, name, \
+ TP_PROTO(unsigned int irq, unsigned int vector, \
+ unsigned int cpu, unsigned int prev_vector, \
+ unsigned int prev_cpu), \
+ TP_ARGS(irq, vector, cpu, prev_vector, prev_cpu), NULL, NULL); \
+
+DEFINE_IRQ_VECTOR_MOD_EVENT(vector_update);
+DEFINE_IRQ_VECTOR_MOD_EVENT(vector_clear);
+
+DECLARE_EVENT_CLASS(vector_reserve,
+
+ TP_PROTO(unsigned int irq, int ret),
+
+ TP_ARGS(irq, ret),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, irq )
+ __field( int, ret )
+ ),
+
+ TP_fast_assign(
+ __entry->irq = irq;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("irq=%u ret=%d", __entry->irq, __entry->ret)
+);
+
+#define DEFINE_IRQ_VECTOR_RESERVE_EVENT(name) \
+DEFINE_EVENT_FN(vector_reserve, name, \
+ TP_PROTO(unsigned int irq, int ret), \
+ TP_ARGS(irq, ret), NULL, NULL); \
+
+DEFINE_IRQ_VECTOR_RESERVE_EVENT(vector_reserve_managed);
+DEFINE_IRQ_VECTOR_RESERVE_EVENT(vector_reserve);
+
+TRACE_EVENT(vector_alloc,
+
+ TP_PROTO(unsigned int irq, unsigned int vector, bool reserved,
+ int ret),
+
+ TP_ARGS(irq, vector, reserved, ret),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, irq )
+ __field( unsigned int, vector )
+ __field( bool, reserved )
+ __field( int, ret )
+ ),
+
+ TP_fast_assign(
+ __entry->irq = irq;
+ __entry->vector = ret < 0 ? 0 : vector;
+ __entry->reserved = reserved;
+ __entry->ret = ret > 0 ? 0 : ret;
+ ),
+
+ TP_printk("irq=%u vector=%u reserved=%d ret=%d",
+ __entry->irq, __entry->vector,
+ __entry->reserved, __entry->ret)
+);
+
+TRACE_EVENT(vector_alloc_managed,
+
+ TP_PROTO(unsigned int irq, unsigned int vector,
+ int ret),
+
+ TP_ARGS(irq, vector, ret),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, irq )
+ __field( unsigned int, vector )
+ __field( int, ret )
+ ),
+
+ TP_fast_assign(
+ __entry->irq = irq;
+ __entry->vector = ret < 0 ? 0 : vector;
+ __entry->ret = ret > 0 ? 0 : ret;
+ ),
+
+ TP_printk("irq=%u vector=%u ret=%d",
+ __entry->irq, __entry->vector, __entry->ret)
+);
+
+DECLARE_EVENT_CLASS(vector_activate,
+
+ TP_PROTO(unsigned int irq, bool is_managed, bool can_reserve,
+ bool reserve),
+
+ TP_ARGS(irq, is_managed, can_reserve, reserve),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, irq )
+ __field( bool, is_managed )
+ __field( bool, can_reserve )
+ __field( bool, reserve )
+ ),
+
+ TP_fast_assign(
+ __entry->irq = irq;
+ __entry->is_managed = is_managed;
+ __entry->can_reserve = can_reserve;
+ __entry->reserve = reserve;
+ ),
+
+ TP_printk("irq=%u is_managed=%d can_reserve=%d reserve=%d",
+ __entry->irq, __entry->is_managed, __entry->can_reserve,
+ __entry->reserve)
+);
+
+#define DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(name) \
+DEFINE_EVENT_FN(vector_activate, name, \
+ TP_PROTO(unsigned int irq, bool is_managed, \
+ bool can_reserve, bool reserve), \
+ TP_ARGS(irq, is_managed, can_reserve, reserve), NULL, NULL); \
+
+DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(vector_activate);
+DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(vector_deactivate);
+
+TRACE_EVENT(vector_teardown,
+
+ TP_PROTO(unsigned int irq, bool is_managed, bool has_reserved),
+
+ TP_ARGS(irq, is_managed, has_reserved),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, irq )
+ __field( bool, is_managed )
+ __field( bool, has_reserved )
+ ),
+
+ TP_fast_assign(
+ __entry->irq = irq;
+ __entry->is_managed = is_managed;
+ __entry->has_reserved = has_reserved;
+ ),
+
+ TP_printk("irq=%u is_managed=%d has_reserved=%d",
+ __entry->irq, __entry->is_managed, __entry->has_reserved)
+);
+
+TRACE_EVENT(vector_setup,
+
+ TP_PROTO(unsigned int irq, bool is_legacy, int ret),
+
+ TP_ARGS(irq, is_legacy, ret),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, irq )
+ __field( bool, is_legacy )
+ __field( int, ret )
+ ),
+
+ TP_fast_assign(
+ __entry->irq = irq;
+ __entry->is_legacy = is_legacy;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("irq=%u is_legacy=%d ret=%d",
+ __entry->irq, __entry->is_legacy, __entry->ret)
+);
+
+TRACE_EVENT(vector_free_moved,
+
+ TP_PROTO(unsigned int irq, unsigned int cpu, unsigned int vector,
+ bool is_managed),
+
+ TP_ARGS(irq, cpu, vector, is_managed),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, irq )
+ __field( unsigned int, cpu )
+ __field( unsigned int, vector )
+ __field( bool, is_managed )
+ ),
+
+ TP_fast_assign(
+ __entry->irq = irq;
+ __entry->cpu = cpu;
+ __entry->vector = vector;
+ __entry->is_managed = is_managed;
+ ),
+
+ TP_printk("irq=%u cpu=%u vector=%u is_managed=%d",
+ __entry->irq, __entry->cpu, __entry->vector,
+ __entry->is_managed)
+);
+
+
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE irq_vectors
+#endif /* _TRACE_IRQ_VECTORS_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/x86/include/asm/trace_clock.h b/arch/x86/include/asm/trace_clock.h
new file mode 100644
index 000000000000..7061a5650969
--- /dev/null
+++ b/arch/x86/include/asm/trace_clock.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_TRACE_CLOCK_H
+#define _ASM_X86_TRACE_CLOCK_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+#ifdef CONFIG_X86_TSC
+
+extern u64 notrace trace_clock_x86_tsc(void);
+
+# define ARCH_TRACE_CLOCKS \
+ { trace_clock_x86_tsc, "x86-tsc", .in_ns = 0 },
+
+#else /* !CONFIG_X86_TSC */
+
+#define ARCH_TRACE_CLOCKS
+
+#endif
+
+#endif /* _ASM_X86_TRACE_CLOCK_H */
diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h
deleted file mode 100644
index 90f06c25221d..000000000000
--- a/arch/x86/include/asm/trampoline.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef _ASM_X86_TRAMPOLINE_H
-#define _ASM_X86_TRAMPOLINE_H
-
-#ifndef __ASSEMBLY__
-
-#ifdef CONFIG_X86_TRAMPOLINE
-/*
- * Trampoline 80x86 program as an array.
- */
-extern const unsigned char trampoline_data [];
-extern const unsigned char trampoline_end [];
-extern unsigned char *trampoline_base;
-
-extern unsigned long init_rsp;
-extern unsigned long initial_code;
-extern unsigned long initial_gs;
-
-#define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE)
-#define TRAMPOLINE_BASE 0x6000
-
-extern unsigned long setup_trampoline(void);
-extern void __init reserve_trampoline_memory(void);
-#else
-static inline void reserve_trampoline_memory(void) {};
-#endif /* CONFIG_X86_TRAMPOLINE */
-
-#endif /* __ASSEMBLY__ */
-
-#endif /* _ASM_X86_TRAMPOLINE_H */
diff --git a/arch/x86/include/asm/trap_pf.h b/arch/x86/include/asm/trap_pf.h
new file mode 100644
index 000000000000..a23a7b707b64
--- /dev/null
+++ b/arch/x86/include/asm/trap_pf.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_TRAP_PF_H
+#define _ASM_X86_TRAP_PF_H
+
+#include <linux/bits.h>
+
+/*
+ * Page fault error code bits:
+ *
+ * bit 0 == 0: no page found 1: protection fault
+ * bit 1 == 0: read access 1: write access
+ * bit 2 == 0: kernel-mode access 1: user-mode access
+ * bit 3 == 1: use of reserved bit detected
+ * bit 4 == 1: fault was an instruction fetch
+ * bit 5 == 1: protection keys block access
+ * bit 6 == 1: shadow stack access fault
+ * bit 15 == 1: SGX MMU page-fault
+ * bit 31 == 1: fault was due to RMP violation
+ */
+enum x86_pf_error_code {
+ X86_PF_PROT = BIT(0),
+ X86_PF_WRITE = BIT(1),
+ X86_PF_USER = BIT(2),
+ X86_PF_RSVD = BIT(3),
+ X86_PF_INSTR = BIT(4),
+ X86_PF_PK = BIT(5),
+ X86_PF_SHSTK = BIT(6),
+ X86_PF_SGX = BIT(15),
+ X86_PF_RMP = BIT(31),
+};
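
Since the bits combine, callers decode a fault by masking. An illustrative helper
(not part of this header; the name is invented):

/* Illustrative decode: true for a user-mode write that hit a present
 * page, i.e. a protection fault rather than a not-present fault. */
static inline bool is_user_write_prot_fault(unsigned long error_code)
{
	return (error_code & (X86_PF_USER | X86_PF_WRITE | X86_PF_PROT)) ==
	       (X86_PF_USER | X86_PF_WRITE | X86_PF_PROT);
}
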
+
+#endif /* _ASM_X86_TRAP_PF_H */
diff --git a/arch/x86/include/asm/trapnr.h b/arch/x86/include/asm/trapnr.h
new file mode 100644
index 000000000000..8d1154cdf787
--- /dev/null
+++ b/arch/x86/include/asm/trapnr.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_TRAPNR_H
+#define _ASM_X86_TRAPNR_H
+
+/*
+ * Event type codes used by FRED, Intel VT-x and AMD SVM
+ */
+#define EVENT_TYPE_EXTINT 0 // External interrupt
+#define EVENT_TYPE_RESERVED 1
+#define EVENT_TYPE_NMI 2 // NMI
+#define EVENT_TYPE_HWEXC 3 // Hardware originated traps, exceptions
+#define EVENT_TYPE_SWINT 4 // INT n
+#define EVENT_TYPE_PRIV_SWEXC 5 // INT1
+#define EVENT_TYPE_SWEXC 6 // INTO, INT3
+#define EVENT_TYPE_OTHER 7 // FRED SYSCALL/SYSENTER, VT-x MTF
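
A hedged sketch of pulling the 3-bit type out of a VT-x/SVM-style
interruption-information word, assuming the type field sits in bits 10:8 (the
helper name is invented):

/* Hypothetical helper: extract the 3-bit event type, assuming a
 * VT-x/SVM-style layout with the type in bits 10:8. */
static inline unsigned int event_type_from_info(u32 info)
{
	return (info >> 8) & 0x7;
}
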
+
+/* Interrupts/Exceptions */
+
+#define X86_TRAP_DE 0 /* Divide-by-zero */
+#define X86_TRAP_DB 1 /* Debug */
+#define X86_TRAP_NMI 2 /* Non-maskable Interrupt */
+#define X86_TRAP_BP 3 /* Breakpoint */
+#define X86_TRAP_OF 4 /* Overflow */
+#define X86_TRAP_BR 5 /* Bound Range Exceeded */
+#define X86_TRAP_UD 6 /* Invalid Opcode */
+#define X86_TRAP_NM 7 /* Device Not Available */
+#define X86_TRAP_DF 8 /* Double Fault */
+#define X86_TRAP_OLD_MF 9 /* Coprocessor Segment Overrun */
+#define X86_TRAP_TS 10 /* Invalid TSS */
+#define X86_TRAP_NP 11 /* Segment Not Present */
+#define X86_TRAP_SS 12 /* Stack Segment Fault */
+#define X86_TRAP_GP 13 /* General Protection Fault */
+#define X86_TRAP_PF 14 /* Page Fault */
+#define X86_TRAP_SPURIOUS 15 /* Spurious Interrupt */
+#define X86_TRAP_MF 16 /* x87 Floating-Point Exception */
+#define X86_TRAP_AC 17 /* Alignment Check */
+#define X86_TRAP_MC 18 /* Machine Check */
+#define X86_TRAP_XF 19 /* SIMD Floating-Point Exception */
+#define X86_TRAP_VE 20 /* Virtualization Exception */
+#define X86_TRAP_CP 21 /* Control Protection Exception */
+#define X86_TRAP_VC 29 /* VMM Communication Exception */
+#define X86_TRAP_IRET 32 /* IRET Exception */
+
+#endif
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index bfd74c032fca..869b88061801 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -1,70 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_TRAPS_H
#define _ASM_X86_TRAPS_H
+#include <linux/context_tracking_state.h>
+#include <linux/kprobes.h>
+
#include <asm/debugreg.h>
+#include <asm/idtentry.h>
#include <asm/siginfo.h> /* TRAP_TRACE, ... */
+#include <asm/trap_pf.h>
-#ifdef CONFIG_X86_32
-#define dotraplinkage
-#else
-#define dotraplinkage asmlinkage
-#endif
-
-asmlinkage void divide_error(void);
-asmlinkage void debug(void);
-asmlinkage void nmi(void);
-asmlinkage void int3(void);
-asmlinkage void xen_debug(void);
-asmlinkage void xen_int3(void);
-asmlinkage void xen_stack_segment(void);
-asmlinkage void overflow(void);
-asmlinkage void bounds(void);
-asmlinkage void invalid_op(void);
-asmlinkage void device_not_available(void);
#ifdef CONFIG_X86_64
-asmlinkage void double_fault(void);
+asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs);
+asmlinkage __visible notrace
+struct pt_regs *fixup_bad_iret(struct pt_regs *bad_regs);
+asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *eregs);
#endif
-asmlinkage void coprocessor_segment_overrun(void);
-asmlinkage void invalid_TSS(void);
-asmlinkage void segment_not_present(void);
-asmlinkage void stack_segment(void);
-asmlinkage void general_protection(void);
-asmlinkage void page_fault(void);
-asmlinkage void spurious_interrupt_bug(void);
-asmlinkage void coprocessor_error(void);
-asmlinkage void alignment_check(void);
-#ifdef CONFIG_X86_MCE
-asmlinkage void machine_check(void);
-#endif /* CONFIG_X86_MCE */
-asmlinkage void simd_coprocessor_error(void);
-dotraplinkage void do_divide_error(struct pt_regs *, long);
-dotraplinkage void do_debug(struct pt_regs *, long);
-dotraplinkage void do_nmi(struct pt_regs *, long);
-dotraplinkage void do_int3(struct pt_regs *, long);
-dotraplinkage void do_overflow(struct pt_regs *, long);
-dotraplinkage void do_bounds(struct pt_regs *, long);
-dotraplinkage void do_invalid_op(struct pt_regs *, long);
-dotraplinkage void do_device_not_available(struct pt_regs *, long);
-dotraplinkage void do_coprocessor_segment_overrun(struct pt_regs *, long);
-dotraplinkage void do_invalid_TSS(struct pt_regs *, long);
-dotraplinkage void do_segment_not_present(struct pt_regs *, long);
-dotraplinkage void do_stack_segment(struct pt_regs *, long);
-#ifdef CONFIG_X86_64
-dotraplinkage void do_double_fault(struct pt_regs *, long);
-asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *);
-#endif
-dotraplinkage void do_general_protection(struct pt_regs *, long);
-dotraplinkage void do_page_fault(struct pt_regs *, unsigned long);
-dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long);
-dotraplinkage void do_coprocessor_error(struct pt_regs *, long);
-dotraplinkage void do_alignment_check(struct pt_regs *, long);
-#ifdef CONFIG_X86_MCE
-dotraplinkage void do_machine_check(struct pt_regs *, long);
-#endif
-dotraplinkage void do_simd_coprocessor_error(struct pt_regs *, long);
-#ifdef CONFIG_X86_32
-dotraplinkage void do_iret_error(struct pt_regs *, long);
+extern int ibt_selftest(void);
+extern int ibt_selftest_noendbr(void);
+
+#ifdef CONFIG_X86_F00F_BUG
+/* For handling the F00F bug */
+void handle_invalid_op(struct pt_regs *regs);
#endif
static inline int get_si_code(unsigned long condition)
@@ -77,15 +35,26 @@ static inline int get_si_code(unsigned long condition)
return TRAP_BRKPT;
}
-extern int panic_on_unrecovered_nmi;
-
-void math_error(void __user *);
void math_emulate(struct math_emu_info *);
-#ifdef CONFIG_X86_32
-unsigned long patch_espfix_desc(unsigned long, unsigned long);
-#else
-asmlinkage void smp_thermal_interrupt(void);
-asmlinkage void mce_threshold_interrupt(void);
+
+bool fault_in_kernel_space(unsigned long address);
+
+#ifdef CONFIG_VMAP_STACK
+void __noreturn handle_stack_overflow(struct pt_regs *regs,
+ unsigned long fault_address,
+ struct stack_info *info);
#endif
+static inline void cond_local_irq_enable(struct pt_regs *regs)
+{
+ if (regs->flags & X86_EFLAGS_IF)
+ local_irq_enable();
+}
+
+static inline void cond_local_irq_disable(struct pt_regs *regs)
+{
+ if (regs->flags & X86_EFLAGS_IF)
+ local_irq_disable();
+}
+
#endif /* _ASM_X86_TRAPS_H */
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 38ae163cc91b..4f7f09f50552 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -1,13 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* x86 TSC related functions
*/
#ifndef _ASM_X86_TSC_H
#define _ASM_X86_TSC_H
+#include <asm/asm.h>
+#include <asm/cpufeature.h>
#include <asm/processor.h>
+#include <asm/msr.h>
-#define NS_SCALE 10 /* 2^10, carefully chosen */
-#define US_SCALE 32 /* 2^32, arbitralrily chosen */
+/**
+ * rdtsc() - returns the current TSC without ordering constraints
+ *
+ * rdtsc() returns the result of RDTSC as a 64-bit integer. The
+ * only ordering constraint it supplies is the ordering implied by
+ * "asm volatile": it will put the RDTSC in the place you expect. The
+ * CPU can and will speculatively execute that RDTSC, though, so the
+ * results can be non-monotonic if compared on different CPUs.
+ */
+static __always_inline u64 rdtsc(void)
+{
+ EAX_EDX_DECLARE_ARGS(val, low, high);
+
+ asm volatile("rdtsc" : EAX_EDX_RET(val, low, high));
+
+ return EAX_EDX_VAL(val, low, high);
+}
+
+/**
+ * rdtsc_ordered() - read the current TSC in program order
+ *
+ * rdtsc_ordered() returns the result of RDTSC as a 64-bit integer.
+ * It is ordered like a load to a global in-memory counter. It should
+ * be impossible to observe non-monotonic rdtsc_unordered() behavior
+ * across multiple CPUs as long as the TSC is synced.
+ */
+static __always_inline u64 rdtsc_ordered(void)
+{
+ EAX_EDX_DECLARE_ARGS(val, low, high);
+
+ /*
+ * The RDTSC instruction is not ordered relative to memory
+ * access. The Intel SDM and the AMD APM are both vague on this
+ * point, but empirically an RDTSC instruction can be
+ * speculatively executed before prior loads. An RDTSC
+ * immediately after an appropriate barrier appears to be
+ * ordered as a normal load, that is, it provides the same
+ * ordering guarantees as reading from a global memory location
+ * that some other imaginary CPU is updating continuously with a
+ * time stamp.
+ *
+ * Thus, use the preferred barrier on the respective CPU, aiming for
+ * RDTSCP as the default.
+ */
+ asm volatile(ALTERNATIVE_2("rdtsc",
+ "lfence; rdtsc", X86_FEATURE_LFENCE_RDTSC,
+ "rdtscp", X86_FEATURE_RDTSCP)
+ : EAX_EDX_RET(val, low, high)
+ /* RDTSCP clobbers ECX with MSR_TSC_AUX. */
+ :: "ecx");
+
+ return EAX_EDX_VAL(val, low, high);
+}
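
Given the ordering guarantee described above, rdtsc_ordered() is the variant suited
to timing a code region. A minimal, illustrative sketch (not part of this header;
converting the returned cycles to time would additionally need the TSC frequency):

/* Illustrative only: count TSC cycles spent in some_work(). */
static u64 measure_cycles(void (*some_work)(void))
{
	u64 start, end;

	start = rdtsc_ordered();
	some_work();
	end = rdtsc_ordered();

	return end - start;
}
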
/*
* Standard way to access the cycle counter.
@@ -21,42 +76,48 @@ extern void disable_TSC(void);
static inline cycles_t get_cycles(void)
{
- unsigned long long ret = 0;
-
-#ifndef CONFIG_X86_TSC
- if (!cpu_has_tsc)
+ if (!IS_ENABLED(CONFIG_X86_TSC) &&
+ !cpu_feature_enabled(X86_FEATURE_TSC))
return 0;
-#endif
- rdtscll(ret);
-
- return ret;
-}
-
-static __always_inline cycles_t vget_cycles(void)
-{
- /*
- * We only do VDSOs on TSC capable CPUs, so this shouldnt
- * access boot_cpu_data (which is not VDSO-safe):
- */
-#ifndef CONFIG_X86_TSC
- if (!cpu_has_tsc)
- return 0;
-#endif
- return (cycles_t)__native_read_tsc();
+ return rdtsc();
}
+#define get_cycles get_cycles
+extern void tsc_early_init(void);
extern void tsc_init(void);
extern void mark_tsc_unstable(char *reason);
extern int unsynchronized_tsc(void);
-int check_tsc_unstable(void);
+extern int check_tsc_unstable(void);
+extern void mark_tsc_async_resets(char *reason);
+extern unsigned long native_calibrate_cpu_early(void);
+extern unsigned long native_calibrate_tsc(void);
+extern unsigned long long native_sched_clock_from_tsc(u64 tsc);
+
+extern int tsc_clocksource_reliable;
+#ifdef CONFIG_X86_TSC
+extern bool tsc_async_resets;
+#else
+# define tsc_async_resets false
+#endif
/*
* Boot-time check whether the TSCs are synchronized across
* all CPUs/cores:
*/
-extern void check_tsc_sync_source(int cpu);
+#ifdef CONFIG_X86_TSC
+extern bool tsc_store_and_check_tsc_adjust(bool bootcpu);
+extern void tsc_verify_tsc_adjust(bool resume);
extern void check_tsc_sync_target(void);
+#else
+static inline bool tsc_store_and_check_tsc_adjust(bool bootcpu) { return false; }
+static inline void tsc_verify_tsc_adjust(bool resume) { }
+static inline void check_tsc_sync_target(void) { }
+#endif
extern int notsc_setup(char *);
+extern void tsc_save_sched_clock_state(void);
+extern void tsc_restore_sched_clock_state(void);
+
+unsigned long cpu_khz_from_msr(void);
#endif /* _ASM_X86_TSC_H */
diff --git a/arch/x86/include/asm/types.h b/arch/x86/include/asm/types.h
deleted file mode 100644
index 09b97745772f..000000000000
--- a/arch/x86/include/asm/types.h
+++ /dev/null
@@ -1,30 +0,0 @@
-#ifndef _ASM_X86_TYPES_H
-#define _ASM_X86_TYPES_H
-
-#include <asm-generic/int-ll64.h>
-
-#ifndef __ASSEMBLY__
-
-typedef unsigned short umode_t;
-
-#endif /* __ASSEMBLY__ */
-
-/*
- * These aren't exported outside the kernel to avoid name space clashes
- */
-#ifdef __KERNEL__
-
-#ifndef __ASSEMBLY__
-
-typedef u64 dma64_addr_t;
-#if defined(CONFIG_X86_64) || defined(CONFIG_HIGHMEM64G)
-/* DMA addresses come in 32-bit and 64-bit flavours. */
-typedef u64 dma_addr_t;
-#else
-typedef u32 dma_addr_t;
-#endif
-
-#endif /* __ASSEMBLY__ */
-#endif /* __KERNEL__ */
-
-#endif /* _ASM_X86_TYPES_H */
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index d2c6c930b491..367297b188c3 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -1,138 +1,119 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_UACCESS_H
#define _ASM_X86_UACCESS_H
/*
* User space memory access functions
*/
-#include <linux/errno.h>
#include <linux/compiler.h>
-#include <linux/thread_info.h>
-#include <linux/prefetch.h>
+#include <linux/instrumented.h>
+#include <linux/kasan-checks.h>
+#include <linux/mm_types.h>
#include <linux/string.h>
+#include <linux/mmap_lock.h>
#include <asm/asm.h>
#include <asm/page.h>
+#include <asm/smap.h>
+#include <asm/extable.h>
+#include <asm/tlbflush.h>
-#define VERIFY_READ 0
-#define VERIFY_WRITE 1
-
-/*
- * The fs value determines whether argument validity checking should be
- * performed or not. If get_fs() == USER_DS, checking is performed, with
- * get_fs() == KERNEL_DS, checking is bypassed.
- *
- * For historical reasons, these macros are grossly misnamed.
- */
+#ifdef CONFIG_X86_32
+# include <asm/uaccess_32.h>
+#else
+# include <asm/uaccess_64.h>
+#endif
-#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) })
+#include <asm-generic/access_ok.h>
-#define KERNEL_DS MAKE_MM_SEG(-1UL)
-#define USER_DS MAKE_MM_SEG(TASK_SIZE_MAX)
+extern int __get_user_1(void);
+extern int __get_user_2(void);
+extern int __get_user_4(void);
+extern int __get_user_8(void);
+extern int __get_user_nocheck_1(void);
+extern int __get_user_nocheck_2(void);
+extern int __get_user_nocheck_4(void);
+extern int __get_user_nocheck_8(void);
+extern int __get_user_bad(void);
-#define get_ds() (KERNEL_DS)
-#define get_fs() (current_thread_info()->addr_limit)
-#define set_fs(x) (current_thread_info()->addr_limit = (x))
+#define __uaccess_begin() stac()
+#define __uaccess_end() clac()
+#define __uaccess_begin_nospec() \
+({ \
+ stac(); \
+ barrier_nospec(); \
+})
-#define segment_eq(a, b) ((a).seg == (b).seg)
+/*
+ * This is the smallest unsigned integer type that can fit a value
+ * (up to 'long long')
+ */
+#define __inttype(x) __typeof__( \
+ __typefits(x,char, \
+ __typefits(x,short, \
+ __typefits(x,int, \
+ __typefits(x,long,0ULL)))))
-#define __addr_ok(addr) \
- ((unsigned long __force)(addr) < \
- (current_thread_info()->addr_limit.seg))
+#define __typefits(x,type,not) \
+ __builtin_choose_expr(sizeof(x)<=sizeof(type),(unsigned type)0,not)
/*
- * Test whether a block of memory is a valid user space address.
- * Returns 0 if the range is valid, nonzero otherwise.
+ * This is used for both get_user() and __get_user() to expand to
+ * the proper special function call that has odd calling conventions
+ * due to returning both a value and an error, and that depends on
+ * the size of the pointer passed in.
+ *
+ * Careful: we have to cast the result to the type of the pointer
+ * for sign reasons.
*
- * This is equivalent to the following test:
- * (u33)addr + (u33)size >= (u33)current->addr_limit.seg (u65 for x86_64)
+ * The use of _ASM_DX as the register specifier is a bit of a
+ * simplification, as gcc only cares about it as the starting point
+ * and not size: for a 64-bit value it will use %ecx:%edx on 32 bits
+ * (%ecx being the next register in gcc's x86 register sequence), and
+ * %rdx on 64 bits.
*
- * This needs 33-bit (65-bit for x86_64) arithmetic. We have a carry...
+ * Clang/LLVM cares about the size of the register, but still wants
+ * the base register for something that ends up being a pair.
*/
-
-#define __range_not_ok(addr, size) \
+#define do_get_user_call(fn,x,ptr) \
({ \
- unsigned long flag, roksum; \
- __chk_user_ptr(addr); \
- asm("add %3,%1 ; sbb %0,%0 ; cmp %1,%4 ; sbb $0,%0" \
- : "=&r" (flag), "=r" (roksum) \
- : "1" (addr), "g" ((long)(size)), \
- "rm" (current_thread_info()->addr_limit.seg)); \
- flag; \
+ int __ret_gu; \
+ register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX); \
+ __chk_user_ptr(ptr); \
+ asm volatile("call __" #fn "_%c[size]" \
+ : "=a" (__ret_gu), "=r" (__val_gu), \
+ ASM_CALL_CONSTRAINT \
+ : "0" (ptr), [size] "i" (sizeof(*(ptr)))); \
+ instrument_get_user(__val_gu); \
+ (x) = (__force __typeof__(*(ptr))) __val_gu; \
+ __builtin_expect(__ret_gu, 0); \
})
/**
- * access_ok: - Checks if a user space pointer is valid
- * @type: Type of access: %VERIFY_READ or %VERIFY_WRITE. Note that
- * %VERIFY_WRITE is a superset of %VERIFY_READ - if it is safe
- * to write to a block, it is always safe to read from it.
- * @addr: User space pointer to start of block to check
- * @size: Size of block to check
- *
- * Context: User context only. This function may sleep.
- *
- * Checks if a pointer to a block of memory in user space is valid.
- *
- * Returns true (nonzero) if the memory block may be valid, false (zero)
- * if it is definitely invalid.
+ * get_user - Get a simple variable from user space.
+ * @x: Variable to store result.
+ * @ptr: Source address, in user space.
*
- * Note that, depending on architecture, this function probably just
- * checks that the pointer is in the user space range - after calling
- * this function, memory access functions may still return -EFAULT.
- */
-#define access_ok(type, addr, size) (likely(__range_not_ok(addr, size) == 0))
-
-/*
- * The exception table consists of pairs of addresses: the first is the
- * address of an instruction that is allowed to fault, and the second is
- * the address at which the program should continue. No registers are
- * modified, so it is entirely up to the continuation code to figure out
- * what to do.
+ * Context: User context only. This function may sleep if pagefaults are
+ * enabled.
*
- * All the routines below use bits of fixup code that are out of line
- * with the main instruction path. This means when everything is well,
- * we don't even have to jump over them. Further, they do not intrude
- * on our cache or tlb entries.
- */
-
-struct exception_table_entry {
- unsigned long insn, fixup;
-};
-
-extern int fixup_exception(struct pt_regs *regs);
-
-/*
- * These are the main single-value transfer routines. They automatically
- * use the right size if we just have the right pointer type.
+ * This macro copies a single simple variable from user space to kernel
+ * space. It supports simple types like char and int, but not larger
+ * data types like structures or arrays.
*
- * This gets kind of ugly. We want to return _two_ values in "get_user()"
- * and yet we don't want to do any pointers, because that is too much
- * of a performance impact. Thus we have a few rather ugly macros here,
- * and hide all the ugliness from the user.
+ * @ptr must have pointer-to-simple-variable type, and the result of
+ * dereferencing @ptr must be assignable to @x without a cast.
*
- * The "__xxx" versions of the user access functions are versions that
- * do not verify the address space, that must have been done previously
- * with a separate "access_ok()" call (this is used when we do multiple
- * accesses to the same area of user memory).
+ * Return: zero on success, or -EFAULT on error.
+ * On error, the variable @x is set to zero.
*/
-
-extern int __get_user_1(void);
-extern int __get_user_2(void);
-extern int __get_user_4(void);
-extern int __get_user_8(void);
-extern int __get_user_bad(void);
-
-#define __get_user_x(size, ret, x, ptr) \
- asm volatile("call __get_user_" #size \
- : "=a" (ret), "=d" (x) \
- : "0" (ptr)) \
-
-/* Careful: we have to cast the result to the type of the pointer
- * for sign reasons */
+#define get_user(x,ptr) ({ might_fault(); do_get_user_call(get_user,x,ptr); })
/**
- * get_user: - Get a simple variable from user space.
+ * __get_user - Get a simple variable from user space, with less checking.
* @x: Variable to store result.
* @ptr: Source address, in user space.
*
- * Context: User context only. This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ * enabled.
*
* This macro copies a single simple variable from user space to kernel
* space. It supports simple types like char and int, but not larger
@@ -141,102 +122,79 @@ extern int __get_user_bad(void);
* @ptr must have pointer-to-simple-variable type, and the result of
* dereferencing @ptr must be assignable to @x without a cast.
*
- * Returns zero on success, or -EFAULT on error.
+ * Caller must check the pointer with access_ok() before calling this
+ * function.
+ *
+ * Return: zero on success, or -EFAULT on error.
* On error, the variable @x is set to zero.
*/
-#ifdef CONFIG_X86_32
-#define __get_user_8(__ret_gu, __val_gu, ptr) \
- __get_user_x(X, __ret_gu, __val_gu, ptr)
-#else
-#define __get_user_8(__ret_gu, __val_gu, ptr) \
- __get_user_x(8, __ret_gu, __val_gu, ptr)
-#endif
-
-#define get_user(x, ptr) \
-({ \
- int __ret_gu; \
- unsigned long __val_gu; \
- __chk_user_ptr(ptr); \
- might_fault(); \
- switch (sizeof(*(ptr))) { \
- case 1: \
- __get_user_x(1, __ret_gu, __val_gu, ptr); \
- break; \
- case 2: \
- __get_user_x(2, __ret_gu, __val_gu, ptr); \
- break; \
- case 4: \
- __get_user_x(4, __ret_gu, __val_gu, ptr); \
- break; \
- case 8: \
- __get_user_8(__ret_gu, __val_gu, ptr); \
- break; \
- default: \
- __get_user_x(X, __ret_gu, __val_gu, ptr); \
- break; \
- } \
- (x) = (__typeof__(*(ptr)))__val_gu; \
- __ret_gu; \
-})
-
-#define __put_user_x(size, x, ptr, __ret_pu) \
- asm volatile("call __put_user_" #size : "=a" (__ret_pu) \
- : "0" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx")
-
+#define __get_user(x,ptr) do_get_user_call(get_user_nocheck,x,ptr)
#ifdef CONFIG_X86_32
-#define __put_user_asm_u64(x, addr, err, errret) \
- asm volatile("1: movl %%eax,0(%2)\n" \
- "2: movl %%edx,4(%2)\n" \
- "3:\n" \
- ".section .fixup,\"ax\"\n" \
- "4: movl %3,%0\n" \
- " jmp 3b\n" \
- ".previous\n" \
- _ASM_EXTABLE(1b, 4b) \
- _ASM_EXTABLE(2b, 4b) \
- : "=r" (err) \
- : "A" (x), "r" (addr), "i" (errret), "0" (err))
-
-#define __put_user_asm_ex_u64(x, addr) \
- asm volatile("1: movl %%eax,0(%1)\n" \
- "2: movl %%edx,4(%1)\n" \
- "3:\n" \
- _ASM_EXTABLE(1b, 2b - 1b) \
- _ASM_EXTABLE(2b, 3b - 2b) \
- : : "A" (x), "r" (addr))
+#define __put_user_goto_u64(x, addr, label) \
+ asm goto("\n" \
+ "1: movl %%eax,0(%1)\n" \
+ "2: movl %%edx,4(%1)\n" \
+ _ASM_EXTABLE_UA(1b, %l2) \
+ _ASM_EXTABLE_UA(2b, %l2) \
+ : : "A" (x), "r" (addr) \
+ : : label)
-#define __put_user_x8(x, ptr, __ret_pu) \
- asm volatile("call __put_user_8" : "=a" (__ret_pu) \
- : "A" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx")
#else
-#define __put_user_asm_u64(x, ptr, retval, errret) \
- __put_user_asm(x, ptr, retval, "q", "", "er", errret)
-#define __put_user_asm_ex_u64(x, addr) \
- __put_user_asm_ex(x, addr, "q", "", "er")
-#define __put_user_x8(x, ptr, __ret_pu) __put_user_x(8, x, ptr, __ret_pu)
+#define __put_user_goto_u64(x, ptr, label) \
+ __put_user_goto(x, ptr, "q", "er", label)
#endif
extern void __put_user_bad(void);
/*
* Strange magic calling convention: pointer in %ecx,
- * value in %eax(:%edx), return value in %eax. clobbers %rbx
+ * value in %eax(:%edx), return value in %ecx. clobbers %rbx
*/
extern void __put_user_1(void);
extern void __put_user_2(void);
extern void __put_user_4(void);
extern void __put_user_8(void);
+extern void __put_user_nocheck_1(void);
+extern void __put_user_nocheck_2(void);
+extern void __put_user_nocheck_4(void);
+extern void __put_user_nocheck_8(void);
-#ifdef CONFIG_X86_WP_WORKS_OK
+/*
+ * ptr must be evaluated and assigned to the temporary __ptr_pu before
+ * the assignment of x to __val_pu, to keep any function calls
+ * involved in the ptr expression (possibly implicitly generated due
+ * to KASAN) from clobbering %ax.
+ */
+#define do_put_user_call(fn,x,ptr) \
+({ \
+ int __ret_pu; \
+ void __user *__ptr_pu; \
+ register __typeof__(*(ptr)) __val_pu asm("%"_ASM_AX); \
+ __typeof__(*(ptr)) __x = (x); /* eval x once */ \
+ __typeof__(ptr) __ptr = (ptr); /* eval ptr once */ \
+ __chk_user_ptr(__ptr); \
+ __ptr_pu = __ptr; \
+ __val_pu = __x; \
+ asm volatile("call __" #fn "_%c[size]" \
+ : "=c" (__ret_pu), \
+ ASM_CALL_CONSTRAINT \
+ : "0" (__ptr_pu), \
+ "r" (__val_pu), \
+ [size] "i" (sizeof(*(ptr))) \
+ :"ebx"); \
+ instrument_put_user(__x, __ptr, sizeof(*(ptr))); \
+ __builtin_expect(__ret_pu, 0); \
+})
/**
- * put_user: - Write a simple value into user space.
+ * put_user - Write a simple value into user space.
* @x: Value to copy to user space.
* @ptr: Destination address, in user space.
*
- * Context: User context only. This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ * enabled.
*
* This macro copies a single simple value from kernel space to user
* space. It supports simple types like char and int, but not larger
@@ -245,188 +203,268 @@ extern void __put_user_8(void);
* @ptr must have pointer-to-simple-variable type, and @x must be assignable
* to the result of dereferencing @ptr.
*
- * Returns zero on success, or -EFAULT on error.
+ * Return: zero on success, or -EFAULT on error.
*/
-#define put_user(x, ptr) \
-({ \
- int __ret_pu; \
- __typeof__(*(ptr)) __pu_val; \
- __chk_user_ptr(ptr); \
- might_fault(); \
- __pu_val = x; \
- switch (sizeof(*(ptr))) { \
- case 1: \
- __put_user_x(1, __pu_val, ptr, __ret_pu); \
- break; \
- case 2: \
- __put_user_x(2, __pu_val, ptr, __ret_pu); \
- break; \
- case 4: \
- __put_user_x(4, __pu_val, ptr, __ret_pu); \
- break; \
- case 8: \
- __put_user_x8(__pu_val, ptr, __ret_pu); \
- break; \
- default: \
- __put_user_x(X, __pu_val, ptr, __ret_pu); \
- break; \
- } \
- __ret_pu; \
-})
+#define put_user(x, ptr) ({ might_fault(); do_put_user_call(put_user,x,ptr); })
+
+/**
+ * __put_user - Write a simple value into user space, with less checking.
+ * @x: Value to copy to user space.
+ * @ptr: Destination address, in user space.
+ *
+ * Context: User context only. This function may sleep if pagefaults are
+ * enabled.
+ *
+ * This macro copies a single simple value from kernel space to user
+ * space. It supports simple types like char and int, but not larger
+ * data types like structures or arrays.
+ *
+ * @ptr must have pointer-to-simple-variable type, and @x must be assignable
+ * to the result of dereferencing @ptr.
+ *
+ * Caller must check the pointer with access_ok() before calling this
+ * function.
+ *
+ * Return: zero on success, or -EFAULT on error.
+ */
+#define __put_user(x, ptr) do_put_user_call(put_user_nocheck,x,ptr)
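
Taken together, the checked variants are used roughly as follows. A hedged sketch of
a typical caller (the function name is invented; get_user()/put_user() return
-EFAULT on failure, while the __ variants require a prior access_ok() check by the
caller):

/* Hypothetical caller: read a value from user space, transform it and
 * write it back, propagating -EFAULT on any failed access. */
static int double_user_value(int __user *uptr)
{
	int val;

	if (get_user(val, uptr))
		return -EFAULT;

	val *= 2;

	if (put_user(val, uptr))
		return -EFAULT;

	return 0;
}
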
-#define __put_user_size(x, ptr, size, retval, errret) \
+#define __put_user_size(x, ptr, size, label) \
do { \
- retval = 0; \
- __chk_user_ptr(ptr); \
+ __typeof__(*(ptr)) __x = (x); /* eval x once */ \
+ __typeof__(ptr) __ptr = (ptr); /* eval ptr once */ \
+ __chk_user_ptr(__ptr); \
switch (size) { \
case 1: \
- __put_user_asm(x, ptr, retval, "b", "b", "iq", errret); \
+ __put_user_goto(__x, __ptr, "b", "iq", label); \
break; \
case 2: \
- __put_user_asm(x, ptr, retval, "w", "w", "ir", errret); \
+ __put_user_goto(__x, __ptr, "w", "ir", label); \
break; \
case 4: \
- __put_user_asm(x, ptr, retval, "l", "k", "ir", errret); \
+ __put_user_goto(__x, __ptr, "l", "ir", label); \
break; \
case 8: \
- __put_user_asm_u64((__typeof__(*ptr))(x), ptr, retval, \
- errret); \
+ __put_user_goto_u64(__x, __ptr, label); \
break; \
default: \
__put_user_bad(); \
} \
+ instrument_put_user(__x, __ptr, size); \
} while (0)
-#define __put_user_size_ex(x, ptr, size) \
+#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+
+#ifdef CONFIG_X86_32
+#define __get_user_asm_u64(x, ptr, label) do { \
+ unsigned int __gu_low, __gu_high; \
+ const unsigned int __user *__gu_ptr; \
+ __gu_ptr = (const void __user *)(ptr); \
+ __get_user_asm(__gu_low, __gu_ptr, "l", "=r", label); \
+ __get_user_asm(__gu_high, __gu_ptr+1, "l", "=r", label); \
+ (x) = ((unsigned long long)__gu_high << 32) | __gu_low; \
+} while (0)
+#else
+#define __get_user_asm_u64(x, ptr, label) \
+ __get_user_asm(x, ptr, "q", "=r", label)
+#endif
+
+#define __get_user_size(x, ptr, size, label) \
do { \
__chk_user_ptr(ptr); \
switch (size) { \
- case 1: \
- __put_user_asm_ex(x, ptr, "b", "b", "iq"); \
+ case 1: { \
+ unsigned char x_u8__; \
+ __get_user_asm(x_u8__, ptr, "b", "=q", label); \
+ (x) = x_u8__; \
break; \
+ } \
case 2: \
- __put_user_asm_ex(x, ptr, "w", "w", "ir"); \
+ __get_user_asm(x, ptr, "w", "=r", label); \
break; \
case 4: \
- __put_user_asm_ex(x, ptr, "l", "k", "ir"); \
+ __get_user_asm(x, ptr, "l", "=r", label); \
break; \
case 8: \
- __put_user_asm_ex_u64((__typeof__(*ptr))(x), ptr); \
+ __get_user_asm_u64(x, ptr, label); \
break; \
default: \
- __put_user_bad(); \
+ (x) = __get_user_bad(); \
} \
+ instrument_get_user(x); \
} while (0)
-#else
+#define __get_user_asm(x, addr, itype, ltype, label) \
+ asm_goto_output("\n" \
+ "1: mov"itype" %[umem],%[output]\n" \
+ _ASM_EXTABLE_UA(1b, %l2) \
+ : [output] ltype(x) \
+ : [umem] "m" (__m(addr)) \
+ : : label)
-#define __put_user_size(x, ptr, size, retval, errret) \
-do { \
- __typeof__(*(ptr))__pus_tmp = x; \
- retval = 0; \
- \
- if (unlikely(__copy_to_user_ll(ptr, &__pus_tmp, size) != 0)) \
- retval = errret; \
-} while (0)
+#else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT
-#define put_user(x, ptr) \
-({ \
- int __ret_pu; \
- __typeof__(*(ptr))__pus_tmp = x; \
- __ret_pu = 0; \
- if (unlikely(__copy_to_user_ll(ptr, &__pus_tmp, \
- sizeof(*(ptr))) != 0)) \
- __ret_pu = -EFAULT; \
- __ret_pu; \
+#ifdef CONFIG_X86_32
+#define __get_user_asm_u64(x, ptr, retval) \
+({ \
+ __typeof__(ptr) __ptr = (ptr); \
+ asm volatile("\n" \
+ "1: movl %[lowbits],%%eax\n" \
+ "2: movl %[highbits],%%edx\n" \
+ "3:\n" \
+ _ASM_EXTABLE_TYPE_REG(1b, 3b, EX_TYPE_EFAULT_REG | \
+ EX_FLAG_CLEAR_AX_DX, \
+ %[errout]) \
+ _ASM_EXTABLE_TYPE_REG(2b, 3b, EX_TYPE_EFAULT_REG | \
+ EX_FLAG_CLEAR_AX_DX, \
+ %[errout]) \
+ : [errout] "=r" (retval), \
+ [output] "=&A"(x) \
+ : [lowbits] "m" (__m(__ptr)), \
+ [highbits] "m" __m(((u32 __user *)(__ptr)) + 1), \
+ "0" (retval)); \
})
-#endif
-#ifdef CONFIG_X86_32
-#define __get_user_asm_u64(x, ptr, retval, errret) (x) = __get_user_bad()
-#define __get_user_asm_ex_u64(x, ptr) (x) = __get_user_bad()
#else
-#define __get_user_asm_u64(x, ptr, retval, errret) \
- __get_user_asm(x, ptr, retval, "q", "", "=r", errret)
-#define __get_user_asm_ex_u64(x, ptr) \
- __get_user_asm_ex(x, ptr, "q", "", "=r")
+#define __get_user_asm_u64(x, ptr, retval) \
+ __get_user_asm(x, ptr, retval, "q")
#endif
-#define __get_user_size(x, ptr, size, retval, errret) \
+#define __get_user_size(x, ptr, size, retval) \
do { \
+ unsigned char x_u8__; \
+ \
retval = 0; \
__chk_user_ptr(ptr); \
switch (size) { \
case 1: \
- __get_user_asm(x, ptr, retval, "b", "b", "=q", errret); \
+ __get_user_asm(x_u8__, ptr, retval, "b"); \
+ (x) = x_u8__; \
break; \
case 2: \
- __get_user_asm(x, ptr, retval, "w", "w", "=r", errret); \
+ __get_user_asm(x, ptr, retval, "w"); \
break; \
case 4: \
- __get_user_asm(x, ptr, retval, "l", "k", "=r", errret); \
+ __get_user_asm(x, ptr, retval, "l"); \
break; \
case 8: \
- __get_user_asm_u64(x, ptr, retval, errret); \
+ __get_user_asm_u64(x, ptr, retval); \
break; \
default: \
(x) = __get_user_bad(); \
} \
} while (0)
-#define __get_user_asm(x, addr, err, itype, rtype, ltype, errret) \
- asm volatile("1: mov"itype" %2,%"rtype"1\n" \
+#define __get_user_asm(x, addr, err, itype) \
+ asm volatile("\n" \
+ "1: mov"itype" %[umem],%[output]\n" \
"2:\n" \
- ".section .fixup,\"ax\"\n" \
- "3: mov %3,%0\n" \
- " xor"itype" %"rtype"1,%"rtype"1\n" \
- " jmp 2b\n" \
- ".previous\n" \
- _ASM_EXTABLE(1b, 3b) \
- : "=r" (err), ltype(x) \
- : "m" (__m(addr)), "i" (errret), "0" (err))
-
-#define __get_user_size_ex(x, ptr, size) \
-do { \
- __chk_user_ptr(ptr); \
- switch (size) { \
- case 1: \
- __get_user_asm_ex(x, ptr, "b", "b", "=q"); \
- break; \
- case 2: \
- __get_user_asm_ex(x, ptr, "w", "w", "=r"); \
- break; \
- case 4: \
- __get_user_asm_ex(x, ptr, "l", "k", "=r"); \
- break; \
- case 8: \
- __get_user_asm_ex_u64(x, ptr); \
- break; \
- default: \
- (x) = __get_user_bad(); \
- } \
-} while (0)
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG | \
+ EX_FLAG_CLEAR_AX, \
+ %[errout]) \
+ : [errout] "=r" (err), \
+ [output] "=a" (x) \
+ : [umem] "m" (__m(addr)), \
+ "0" (err))
+
+#endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+
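
The two configurations select different error-reporting conventions for __get_user_size(): a goto label when the compiler supports asm goto with outputs, and an integer error variable otherwise. A minimal userspace sketch of the two calling styles, using hypothetical fake_get_user_* names purely for illustration (a NULL pointer stands in for a faulting access):

#include <stdio.h>

/* Label style: the last argument is a goto label, taken on "fault". */
#define fake_get_user_label(x, ptr, label) \
	do { if (!(ptr)) goto label; (x) = *(ptr); } while (0)

/* Error-code style: the last argument is an int, set to -14 (-EFAULT) on "fault". */
#define fake_get_user_err(x, ptr, err) \
	do { if (!(ptr)) { (err) = -14; (x) = 0; } else { (err) = 0; (x) = *(ptr); } } while (0)

int main(void)
{
	int v = 42, out = 0, err = 0;

	fake_get_user_label(out, &v, efault);
	printf("label style read %d\n", out);

	fake_get_user_err(out, (int *)0, err);
	printf("err style: err=%d out=%d\n", err, out);
	return 0;

efault:
	printf("faulted\n");
	return 1;
}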
+#ifdef CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT
+#define __try_cmpxchg_user_asm(itype, ltype, _ptr, _pold, _new, label) ({ \
+ bool success; \
+ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \
+ __typeof__(*(_ptr)) __old = *_old; \
+ __typeof__(*(_ptr)) __new = (_new); \
+ asm_goto_output("\n" \
+ "1: " LOCK_PREFIX "cmpxchg"itype" %[new], %[ptr]\n"\
+ _ASM_EXTABLE_UA(1b, %l[label]) \
+ : "=@ccz" (success), \
+ [ptr] "+m" (*_ptr), \
+ [old] "+a" (__old) \
+ : [new] ltype (__new) \
+ : "memory" \
+ : label); \
+ if (unlikely(!success)) \
+ *_old = __old; \
+ likely(success); })
-#define __get_user_asm_ex(x, addr, itype, rtype, ltype) \
- asm volatile("1: mov"itype" %1,%"rtype"0\n" \
+#ifdef CONFIG_X86_32
+#define __try_cmpxchg64_user_asm(_ptr, _pold, _new, label) ({ \
+ bool success; \
+ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \
+ __typeof__(*(_ptr)) __old = *_old; \
+ __typeof__(*(_ptr)) __new = (_new); \
+ asm_goto_output("\n" \
+ "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n" \
+ _ASM_EXTABLE_UA(1b, %l[label]) \
+ : "=@ccz" (success), \
+ "+A" (__old), \
+ [ptr] "+m" (*_ptr) \
+ : "b" ((u32)__new), \
+ "c" ((u32)((u64)__new >> 32)) \
+ : "memory" \
+ : label); \
+ if (unlikely(!success)) \
+ *_old = __old; \
+ likely(success); })
+#endif // CONFIG_X86_32
+#else // !CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT
+#define __try_cmpxchg_user_asm(itype, ltype, _ptr, _pold, _new, label) ({ \
+ int __err = 0; \
+ bool success; \
+ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \
+ __typeof__(*(_ptr)) __old = *_old; \
+ __typeof__(*(_ptr)) __new = (_new); \
+ asm volatile("\n" \
+ "1: " LOCK_PREFIX "cmpxchg"itype" %[new], %[ptr]\n"\
"2:\n" \
- _ASM_EXTABLE(1b, 2b - 1b) \
- : ltype(x) : "m" (__m(addr)))
-
-#define __put_user_nocheck(x, ptr, size) \
-({ \
- int __pu_err; \
- __put_user_size((x), (ptr), (size), __pu_err, -EFAULT); \
- __pu_err; \
-})
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, \
+ %[errout]) \
+ : "=@ccz" (success), \
+ [errout] "+r" (__err), \
+ [ptr] "+m" (*_ptr), \
+ [old] "+a" (__old) \
+ : [new] ltype (__new) \
+ : "memory"); \
+ if (unlikely(__err)) \
+ goto label; \
+ if (unlikely(!success)) \
+ *_old = __old; \
+ likely(success); })
-#define __get_user_nocheck(x, ptr, size) \
-({ \
- int __gu_err; \
- unsigned long __gu_val; \
- __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \
- (x) = (__force __typeof__(*(ptr)))__gu_val; \
- __gu_err; \
-})
+#ifdef CONFIG_X86_32
+/*
+ * Unlike the normal CMPXCHG, use output GPR for both success/fail and error.
+ * There are only six GPRs available and four (EAX, EBX, ECX, and EDX) are
+ * hardcoded by CMPXCHG8B, leaving only ESI and EDI. If the compiler uses
+ * both ESI and EDI for the memory operand, compilation will fail if the error
+ * is an input+output as there will be no register available for input.
+ */
+#define __try_cmpxchg64_user_asm(_ptr, _pold, _new, label) ({ \
+ int __result; \
+ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \
+ __typeof__(*(_ptr)) __old = *_old; \
+ __typeof__(*(_ptr)) __new = (_new); \
+ asm volatile("\n" \
+ "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n" \
+ "mov $0, %[result]\n\t" \
+ "setz %b[result]\n" \
+ "2:\n" \
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, \
+ %[result]) \
+ : [result] "=q" (__result), \
+ "+A" (__old), \
+ [ptr] "+m" (*_ptr) \
+ : "b" ((u32)__new), \
+ "c" ((u32)((u64)__new >> 32)) \
+ : "memory", "cc"); \
+ if (unlikely(__result < 0)) \
+ goto label; \
+ if (unlikely(!__result)) \
+ *_old = __old; \
+ likely(__result); })
+#endif // CONFIG_X86_32
+#endif // CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT
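
The 32-bit fallback above folds three outcomes into the single output register: the exception fixup writes a negative error, SETZ leaves 0 or 1 otherwise. A hedged sketch of how a caller interprets that encoding (DEMO_EFAULT is a stand-in for the kernel's EFAULT):

#include <stdio.h>

enum { DEMO_EFAULT = 14 };	/* stand-in for <asm/errno.h> EFAULT */

static const char *decode(int result)
{
	if (result < 0)
		return "fault: take the error label";
	return result ? "success" : "compare failed: *old was refreshed";
}

int main(void)
{
	printf("%s\n%s\n%s\n", decode(-DEMO_EFAULT), decode(0), decode(1));
	return 0;
}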
/* FIXME: this hack is definitely wrong -AK */
struct __large_struct { unsigned long buf[100]; };
@@ -437,142 +475,167 @@ struct __large_struct { unsigned long buf[100]; };
* we do not write to any memory gcc knows about, so there are no
* aliasing issues.
*/
-#define __put_user_asm(x, addr, err, itype, rtype, ltype, errret) \
- asm volatile("1: mov"itype" %"rtype"1,%2\n" \
- "2:\n" \
- ".section .fixup,\"ax\"\n" \
- "3: mov %3,%0\n" \
- " jmp 2b\n" \
- ".previous\n" \
- _ASM_EXTABLE(1b, 3b) \
- : "=r"(err) \
- : ltype(x), "m" (__m(addr)), "i" (errret), "0" (err))
-
-#define __put_user_asm_ex(x, addr, itype, rtype, ltype) \
- asm volatile("1: mov"itype" %"rtype"0,%1\n" \
- "2:\n" \
- _ASM_EXTABLE(1b, 2b - 1b) \
- : : ltype(x), "m" (__m(addr)))
+#define __put_user_goto(x, addr, itype, ltype, label) \
+ asm goto("\n" \
+ "1: mov"itype" %0,%1\n" \
+ _ASM_EXTABLE_UA(1b, %l2) \
+ : : ltype(x), "m" (__m(addr)) \
+ : : label)
+
+extern unsigned long
+copy_from_user_nmi(void *to, const void __user *from, unsigned long n);
+extern __must_check long
+strncpy_from_user(char *dst, const char __user *src, long count);
+
+extern __must_check long strnlen_user(const char __user *str, long n);
+
+#ifdef CONFIG_ARCH_HAS_COPY_MC
+unsigned long __must_check
+copy_mc_to_kernel(void *to, const void *from, unsigned len);
+#define copy_mc_to_kernel copy_mc_to_kernel
+
+unsigned long __must_check
+copy_mc_to_user(void __user *to, const void *from, unsigned len);
+#endif
/*
- * uaccess_try and catch
- */
-#define uaccess_try do { \
- int prev_err = current_thread_info()->uaccess_err; \
- current_thread_info()->uaccess_err = 0; \
- barrier();
-
-#define uaccess_catch(err) \
- (err) |= current_thread_info()->uaccess_err; \
- current_thread_info()->uaccess_err = prev_err; \
-} while (0)
-
-/**
- * __get_user: - Get a simple variable from user space, with less checking.
- * @x: Variable to store result.
- * @ptr: Source address, in user space.
- *
- * Context: User context only. This function may sleep.
- *
- * This macro copies a single simple variable from user space to kernel
- * space. It supports simple types like char and int, but not larger
- * data types like structures or arrays.
- *
- * @ptr must have pointer-to-simple-variable type, and the result of
- * dereferencing @ptr must be assignable to @x without a cast.
- *
- * Caller must check the pointer with access_ok() before calling this
- * function.
- *
- * Returns zero on success, or -EFAULT on error.
- * On error, the variable @x is set to zero.
- */
-
-#define __get_user(x, ptr) \
- __get_user_nocheck((x), (ptr), sizeof(*(ptr)))
-
-/**
- * __put_user: - Write a simple value into user space, with less checking.
- * @x: Value to copy to user space.
- * @ptr: Destination address, in user space.
- *
- * Context: User context only. This function may sleep.
- *
- * This macro copies a single simple value from kernel space to user
- * space. It supports simple types like char and int, but not larger
- * data types like structures or arrays.
- *
- * @ptr must have pointer-to-simple-variable type, and @x must be assignable
- * to the result of dereferencing @ptr.
- *
- * Caller must check the pointer with access_ok() before calling this
- * function.
- *
- * Returns zero on success, or -EFAULT on error.
+ * movsl can be slow when source and dest are not both 8-byte aligned
*/
+#ifdef CONFIG_X86_INTEL_USERCOPY
+extern struct movsl_mask {
+ int mask;
+} ____cacheline_aligned_in_smp movsl_mask;
+#endif
-#define __put_user(x, ptr) \
- __put_user_nocheck((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)))
-
-#define __get_user_unaligned __get_user
-#define __put_user_unaligned __put_user
+#define ARCH_HAS_NOCACHE_UACCESS 1
/*
- * {get|put}_user_try and catch
- *
- * get_user_try {
- * get_user_ex(...);
- * } get_user_catch(err)
+ * The "unsafe" user accesses aren't really "unsafe", but the naming
+ * is a big fat warning: you have to not only do the access_ok()
+ * checking before using them, but you have to surround them with the
+ * user_access_begin/end() pair.
*/
-#define get_user_try uaccess_try
-#define get_user_catch(err) uaccess_catch(err)
-
-#define get_user_ex(x, ptr) do { \
- unsigned long __gue_val; \
- __get_user_size_ex((__gue_val), (ptr), (sizeof(*(ptr)))); \
- (x) = (__force __typeof__(*(ptr)))__gue_val; \
+static __must_check __always_inline bool user_access_begin(const void __user *ptr, size_t len)
+{
+ if (unlikely(!access_ok(ptr,len)))
+ return 0;
+ __uaccess_begin_nospec();
+ return 1;
+}
+#define user_access_begin(a,b) user_access_begin(a,b)
+#define user_access_end() __uaccess_end()
+
+#define user_access_save() smap_save()
+#define user_access_restore(x) smap_restore(x)
+
+#define arch_unsafe_put_user(x, ptr, label) \
+ __put_user_size((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)), label)
+
+#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+#define arch_unsafe_get_user(x, ptr, err_label) \
+do { \
+ __inttype(*(ptr)) __gu_val; \
+ __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), err_label); \
+ (x) = (__force __typeof__(*(ptr)))__gu_val; \
} while (0)
-
-#ifdef CONFIG_X86_WP_WORKS_OK
-
-#define put_user_try uaccess_try
-#define put_user_catch(err) uaccess_catch(err)
-
-#define put_user_ex(x, ptr) \
- __put_user_size_ex((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)))
-
-#else /* !CONFIG_X86_WP_WORKS_OK */
-
-#define put_user_try do { \
- int __uaccess_err = 0;
-
-#define put_user_catch(err) \
- (err) |= __uaccess_err; \
+#else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+#define arch_unsafe_get_user(x, ptr, err_label) \
+do { \
+ int __gu_err; \
+ __inttype(*(ptr)) __gu_val; \
+ __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), __gu_err); \
+ (x) = (__force __typeof__(*(ptr)))__gu_val; \
+ if (unlikely(__gu_err)) goto err_label; \
} while (0)
+#endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT
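
For illustration, the calling pattern the comment above demands (access_ok check, begin/end bracketing, fault reported via a label) can be mocked in userspace. The mock_* names below are stand-ins for demonstration only, not the kernel implementations:

#include <stdbool.h>
#include <stdio.h>

static bool mock_user_access_begin(const void *ptr, size_t len)
{
	return ptr != NULL && len > 0;	/* stands in for access_ok() + STAC */
}
static void mock_user_access_end(void) { }	/* stands in for CLAC */

#define mock_unsafe_get_user(x, ptr, label) \
	do { if (!(ptr)) goto label; (x) = *(ptr); } while (0)

int main(void)
{
	int src = 7, dst;

	if (!mock_user_access_begin(&src, sizeof(src)))
		return 1;
	mock_unsafe_get_user(dst, &src, efault);
	mock_user_access_end();
	printf("read %d\n", dst);
	return 0;

efault:
	mock_user_access_end();	/* the end call must also happen on the fault path */
	printf("fault\n");
	return 1;
}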
-#define put_user_ex(x, ptr) do { \
- __uaccess_err |= __put_user(x, ptr); \
-} while (0)
+extern void __try_cmpxchg_user_wrong_size(void);
-#endif /* CONFIG_X86_WP_WORKS_OK */
+#ifndef CONFIG_X86_32
+#define __try_cmpxchg64_user_asm(_ptr, _oldp, _nval, _label) \
+ __try_cmpxchg_user_asm("q", "r", (_ptr), (_oldp), (_nval), _label)
+#endif
/*
- * movsl can be slow when source and dest are not both 8-byte aligned
+ * Force the pointer to u<size> to match the size expected by the asm helper.
+ * clang/LLVM compiles all cases and only discards the unused paths after
+ * processing errors, which breaks i386 if the pointer is an 8-byte value.
*/
-#ifdef CONFIG_X86_INTEL_USERCOPY
-extern struct movsl_mask {
- int mask;
-} ____cacheline_aligned_in_smp movsl_mask;
-#endif
+#define unsafe_try_cmpxchg_user(_ptr, _oldp, _nval, _label) ({ \
+ bool __ret; \
+ __chk_user_ptr(_ptr); \
+ switch (sizeof(*(_ptr))) { \
+ case 1: __ret = __try_cmpxchg_user_asm("b", "q", \
+ (__force u8 *)(_ptr), (_oldp), \
+ (_nval), _label); \
+ break; \
+ case 2: __ret = __try_cmpxchg_user_asm("w", "r", \
+ (__force u16 *)(_ptr), (_oldp), \
+ (_nval), _label); \
+ break; \
+ case 4: __ret = __try_cmpxchg_user_asm("l", "r", \
+ (__force u32 *)(_ptr), (_oldp), \
+ (_nval), _label); \
+ break; \
+ case 8: __ret = __try_cmpxchg64_user_asm((__force u64 *)(_ptr), (_oldp),\
+ (_nval), _label); \
+ break; \
+ default: __try_cmpxchg_user_wrong_size(); \
+ } \
+ __ret; })
+
+/* "Returns" 0 on success, 1 on failure, -EFAULT if the access faults. */
+#define __try_cmpxchg_user(_ptr, _oldp, _nval, _label) ({ \
+ int __ret = -EFAULT; \
+ __uaccess_begin_nospec(); \
+ __ret = !unsafe_try_cmpxchg_user(_ptr, _oldp, _nval, _label); \
+_label: \
+ __uaccess_end(); \
+ __ret; \
+ })
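
To make the retry semantics concrete, here is a hedged userspace analog built on the GCC/Clang atomic builtin rather than the kernel macros: on a failed compare the current value is written back into *old, so the caller can retry without re-reading the location.

#include <stdbool.h>
#include <stdio.h>

static bool try_cmpxchg_int(int *ptr, int *old, int new)
{
	return __atomic_compare_exchange_n(ptr, old, new, false,
					   __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}

int main(void)
{
	int val = 5, expected = 4;

	if (!try_cmpxchg_int(&val, &expected, 9))
		printf("compare failed: current value is %d\n", expected);	/* prints 5 */

	expected = 5;
	if (try_cmpxchg_int(&val, &expected, 9))
		printf("swapped in 9, val=%d\n", val);
	return 0;
}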
-#define ARCH_HAS_NOCACHE_UACCESS 1
+/*
+ * We want the unsafe accessors to always be inlined and use
+ * the error labels - thus the macro games.
+ */
+#define unsafe_copy_loop(dst, src, len, type, label) \
+ while (len >= sizeof(type)) { \
+ unsafe_put_user(*(type *)(src),(type __user *)(dst),label); \
+ dst += sizeof(type); \
+ src += sizeof(type); \
+ len -= sizeof(type); \
+ }
+
+#define unsafe_copy_to_user(_dst,_src,_len,label) \
+do { \
+ char __user *__ucu_dst = (_dst); \
+ const char *__ucu_src = (_src); \
+ size_t __ucu_len = (_len); \
+ unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u64, label); \
+ unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u32, label); \
+ unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u16, label); \
+ unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u8, label); \
+} while (0)
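
The descending chunk sizes mean at most one access of each width remains at the tail. A self-contained userspace analog of unsafe_copy_loop() (plain memcpy instead of unsafe_put_user, so it is runnable anywhere):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define copy_loop(dst, src, len, type)				\
	while ((len) >= sizeof(type)) {				\
		type tmp;					\
		memcpy(&tmp, (src), sizeof(type));		\
		memcpy((dst), &tmp, sizeof(type));		\
		(dst) += sizeof(type);				\
		(src) += sizeof(type);				\
		(len) -= sizeof(type);				\
	}

int main(void)
{
	char in[13] = "hello, world", out[13] = {0};
	char *d = out;
	const char *s = in;
	size_t len = sizeof(in);

	/* 13 bytes: one u64, one u32, no u16, one u8 */
	copy_loop(d, s, len, uint64_t);
	copy_loop(d, s, len, uint32_t);
	copy_loop(d, s, len, uint16_t);
	copy_loop(d, s, len, uint8_t);
	printf("%s\n", out);
	return 0;
}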
-#ifdef CONFIG_X86_32
-# include "uaccess_32.h"
-#else
-# define ARCH_HAS_SEARCH_EXTABLE
-# include "uaccess_64.h"
-#endif
+#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+#define arch_get_kernel_nofault(dst, src, type, err_label) \
+ __get_user_size(*((type *)(dst)), (__force type __user *)(src), \
+ sizeof(type), err_label)
+#else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+#define arch_get_kernel_nofault(dst, src, type, err_label) \
+do { \
+ int __kr_err; \
+ \
+ __get_user_size(*((type *)(dst)), (__force type __user *)(src), \
+ sizeof(type), __kr_err); \
+ if (unlikely(__kr_err)) \
+ goto err_label; \
+} while (0)
+#endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+
+#define arch_put_kernel_nofault(dst, src, type, err_label) \
+ __put_user_size(*((type *)(src)), (__force type __user *)(dst), \
+ sizeof(type), err_label)
#endif /* _ASM_X86_UACCESS_H */
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index 5e06259e90e5..40379a1adbb8 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -1,181 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_UACCESS_32_H
#define _ASM_X86_UACCESS_32_H
/*
* User space memory access functions
*/
-#include <linux/errno.h>
-#include <linux/thread_info.h>
-#include <linux/prefetch.h>
#include <linux/string.h>
#include <asm/asm.h>
#include <asm/page.h>
-unsigned long __must_check __copy_to_user_ll
- (void __user *to, const void *from, unsigned long n);
-unsigned long __must_check __copy_from_user_ll
- (void *to, const void __user *from, unsigned long n);
-unsigned long __must_check __copy_from_user_ll_nozero
- (void *to, const void __user *from, unsigned long n);
-unsigned long __must_check __copy_from_user_ll_nocache
- (void *to, const void __user *from, unsigned long n);
+unsigned long __must_check __copy_user_ll
+ (void *to, const void *from, unsigned long n);
unsigned long __must_check __copy_from_user_ll_nocache_nozero
(void *to, const void __user *from, unsigned long n);
-/**
- * __copy_to_user_inatomic: - Copy a block of data into user space, with less checking.
- * @to: Destination address, in user space.
- * @from: Source address, in kernel space.
- * @n: Number of bytes to copy.
- *
- * Context: User context only.
- *
- * Copy data from kernel space to user space. Caller must check
- * the specified block with access_ok() before calling this function.
- * The caller should also make sure he pins the user space address
- * so that the we don't result in page fault and sleep.
- *
- * Here we special-case 1, 2 and 4-byte copy_*_user invocations. On a fault
- * we return the initial request size (1, 2 or 4), as copy_*_user should do.
- * If a store crosses a page boundary and gets a fault, the x86 will not write
- * anything, so this is accurate.
- */
-
static __always_inline unsigned long __must_check
-__copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
+raw_copy_to_user(void __user *to, const void *from, unsigned long n)
{
- if (__builtin_constant_p(n)) {
- unsigned long ret;
-
- switch (n) {
- case 1:
- __put_user_size(*(u8 *)from, (u8 __user *)to,
- 1, ret, 1);
- return ret;
- case 2:
- __put_user_size(*(u16 *)from, (u16 __user *)to,
- 2, ret, 2);
- return ret;
- case 4:
- __put_user_size(*(u32 *)from, (u32 __user *)to,
- 4, ret, 4);
- return ret;
- }
- }
- return __copy_to_user_ll(to, from, n);
-}
-
-/**
- * __copy_to_user: - Copy a block of data into user space, with less checking.
- * @to: Destination address, in user space.
- * @from: Source address, in kernel space.
- * @n: Number of bytes to copy.
- *
- * Context: User context only. This function may sleep.
- *
- * Copy data from kernel space to user space. Caller must check
- * the specified block with access_ok() before calling this function.
- *
- * Returns number of bytes that could not be copied.
- * On success, this will be zero.
- */
-static __always_inline unsigned long __must_check
-__copy_to_user(void __user *to, const void *from, unsigned long n)
-{
- might_fault();
- return __copy_to_user_inatomic(to, from, n);
+ return __copy_user_ll((__force void *)to, from, n);
}
static __always_inline unsigned long
-__copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
+raw_copy_from_user(void *to, const void __user *from, unsigned long n)
{
- /* Avoid zeroing the tail if the copy fails..
- * If 'n' is constant and 1, 2, or 4, we do still zero on a failure,
- * but as the zeroing behaviour is only significant when n is not
- * constant, that shouldn't be a problem.
- */
- if (__builtin_constant_p(n)) {
- unsigned long ret;
-
- switch (n) {
- case 1:
- __get_user_size(*(u8 *)to, from, 1, ret, 1);
- return ret;
- case 2:
- __get_user_size(*(u16 *)to, from, 2, ret, 2);
- return ret;
- case 4:
- __get_user_size(*(u32 *)to, from, 4, ret, 4);
- return ret;
- }
- }
- return __copy_from_user_ll_nozero(to, from, n);
-}
-
-/**
- * __copy_from_user: - Copy a block of data from user space, with less checking.
- * @to: Destination address, in kernel space.
- * @from: Source address, in user space.
- * @n: Number of bytes to copy.
- *
- * Context: User context only. This function may sleep.
- *
- * Copy data from user space to kernel space. Caller must check
- * the specified block with access_ok() before calling this function.
- *
- * Returns number of bytes that could not be copied.
- * On success, this will be zero.
- *
- * If some data could not be copied, this function will pad the copied
- * data to the requested size using zero bytes.
- *
- * An alternate version - __copy_from_user_inatomic() - may be called from
- * atomic context and will fail rather than sleep. In this case the
- * uncopied bytes will *NOT* be padded with zeros. See fs/filemap.h
- * for explanation of why this is needed.
- */
-static __always_inline unsigned long
-__copy_from_user(void *to, const void __user *from, unsigned long n)
-{
- might_fault();
- if (__builtin_constant_p(n)) {
- unsigned long ret;
-
- switch (n) {
- case 1:
- __get_user_size(*(u8 *)to, from, 1, ret, 1);
- return ret;
- case 2:
- __get_user_size(*(u16 *)to, from, 2, ret, 2);
- return ret;
- case 4:
- __get_user_size(*(u32 *)to, from, 4, ret, 4);
- return ret;
- }
- }
- return __copy_from_user_ll(to, from, n);
-}
-
-static __always_inline unsigned long __copy_from_user_nocache(void *to,
- const void __user *from, unsigned long n)
-{
- might_fault();
- if (__builtin_constant_p(n)) {
- unsigned long ret;
-
- switch (n) {
- case 1:
- __get_user_size(*(u8 *)to, from, 1, ret, 1);
- return ret;
- case 2:
- __get_user_size(*(u16 *)to, from, 2, ret, 2);
- return ret;
- case 4:
- __get_user_size(*(u32 *)to, from, 4, ret, 4);
- return ret;
- }
- }
- return __copy_from_user_ll_nocache(to, from, n);
+ return __copy_user_ll(to, (__force const void *)from, n);
}
static __always_inline unsigned long
@@ -185,33 +33,6 @@ __copy_from_user_inatomic_nocache(void *to, const void __user *from,
return __copy_from_user_ll_nocache_nozero(to, from, n);
}
-unsigned long __must_check copy_to_user(void __user *to,
- const void *from, unsigned long n);
-unsigned long __must_check copy_from_user(void *to,
- const void __user *from,
- unsigned long n);
-long __must_check strncpy_from_user(char *dst, const char __user *src,
- long count);
-long __must_check __strncpy_from_user(char *dst,
- const char __user *src, long count);
-
-/**
- * strlen_user: - Get the size of a string in user space.
- * @str: The string to measure.
- *
- * Context: User context only. This function may sleep.
- *
- * Get the size of a NUL-terminated string in user space.
- *
- * Returns the size of the string INCLUDING the terminating NUL.
- * On exception, returns 0.
- *
- * If there is a limit on the length of a valid string, you may wish to
- * consider using strnlen_user() instead.
- */
-#define strlen_user(str) strnlen_user(str, LONG_MAX)
-
-long strnlen_user(const char __user *str, long n);
unsigned long __must_check clear_user(void __user *mem, unsigned long len);
unsigned long __must_check __clear_user(void __user *mem, unsigned long len);
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index db24b215fc50..915124011c27 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_UACCESS_64_H
#define _ASM_X86_UACCESS_64_H
@@ -5,204 +6,203 @@
* User space memory access functions
*/
#include <linux/compiler.h>
-#include <linux/errno.h>
-#include <linux/prefetch.h>
#include <linux/lockdep.h>
+#include <linux/kasan-checks.h>
+#include <asm/alternative.h>
+#include <asm/cpufeatures.h>
#include <asm/page.h>
+#include <asm/percpu.h>
+#ifdef MODULE
+ #define runtime_const_ptr(sym) (sym)
+#else
+ #include <asm/runtime-const.h>
+#endif
+extern unsigned long USER_PTR_MAX;
+
+#ifdef CONFIG_ADDRESS_MASKING
/*
- * Copy To/From Userspace
+ * Mask out tag bits from the address.
*/
+static inline unsigned long __untagged_addr(unsigned long addr)
+{
+ asm_inline (ALTERNATIVE("", "and " __percpu_arg([mask]) ", %[addr]",
+ X86_FEATURE_LAM)
+ : [addr] "+r" (addr)
+ : [mask] "m" (__my_cpu_var(tlbstate_untag_mask)));
-/* Handles exceptions in both to and from, but doesn't do access_ok */
-__must_check unsigned long
-copy_user_generic(void *to, const void *from, unsigned len);
+ return addr;
+}
-__must_check unsigned long
-copy_to_user(void __user *to, const void *from, unsigned len);
-__must_check unsigned long
-copy_from_user(void *to, const void __user *from, unsigned len);
-__must_check unsigned long
-copy_in_user(void __user *to, const void __user *from, unsigned len);
+#define untagged_addr(addr) ({ \
+ unsigned long __addr = (__force unsigned long)(addr); \
+ (__force __typeof__(addr))__untagged_addr(__addr); \
+})
-static __always_inline __must_check
-int __copy_from_user(void *dst, const void __user *src, unsigned size)
+static inline unsigned long __untagged_addr_remote(struct mm_struct *mm,
+ unsigned long addr)
{
- int ret = 0;
+ mmap_assert_locked(mm);
+ return addr & (mm)->context.untag_mask;
+}
- might_fault();
- if (!__builtin_constant_p(size))
- return copy_user_generic(dst, (__force void *)src, size);
- switch (size) {
- case 1:__get_user_asm(*(u8 *)dst, (u8 __user *)src,
- ret, "b", "b", "=q", 1);
- return ret;
- case 2:__get_user_asm(*(u16 *)dst, (u16 __user *)src,
- ret, "w", "w", "=r", 2);
- return ret;
- case 4:__get_user_asm(*(u32 *)dst, (u32 __user *)src,
- ret, "l", "k", "=r", 4);
- return ret;
- case 8:__get_user_asm(*(u64 *)dst, (u64 __user *)src,
- ret, "q", "", "=r", 8);
- return ret;
- case 10:
- __get_user_asm(*(u64 *)dst, (u64 __user *)src,
- ret, "q", "", "=r", 10);
- if (unlikely(ret))
- return ret;
- __get_user_asm(*(u16 *)(8 + (char *)dst),
- (u16 __user *)(8 + (char __user *)src),
- ret, "w", "w", "=r", 2);
- return ret;
- case 16:
- __get_user_asm(*(u64 *)dst, (u64 __user *)src,
- ret, "q", "", "=r", 16);
- if (unlikely(ret))
- return ret;
- __get_user_asm(*(u64 *)(8 + (char *)dst),
- (u64 __user *)(8 + (char __user *)src),
- ret, "q", "", "=r", 8);
- return ret;
- default:
- return copy_user_generic(dst, (__force void *)src, size);
- }
+#define untagged_addr_remote(mm, addr) ({ \
+ unsigned long __addr = (__force unsigned long)(addr); \
+ (__force __typeof__(addr))__untagged_addr_remote(mm, __addr); \
+})
+
+#endif
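
A hedged illustration of what the tag masking does, with a made-up mask value standing in for the per-CPU tlbstate_untag_mask:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Illustrative mask: keep bits 0..56 and the sign bit, clear bits 57..62 */
	uint64_t untag_mask = ~(0x3fULL << 57);
	uint64_t tagged = 0x00007f1234567890ULL | (0x2aULL << 57);

	printf("tagged   %#018llx\n", (unsigned long long)tagged);
	printf("untagged %#018llx\n", (unsigned long long)(tagged & untag_mask));
	return 0;
}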
+
+#define valid_user_address(x) \
+ likely((__force unsigned long)(x) <= runtime_const_ptr(USER_PTR_MAX))
+
+/*
+ * Masking the user address is an alternative to a conditional
+ * user_access_begin that can avoid the fencing. This only works
+ * for dense accesses starting at the address.
+ */
+static inline void __user *mask_user_address(const void __user *ptr)
+{
+ void __user *ret;
+ asm("cmp %1,%0\n\t"
+ "cmova %1,%0"
+ :"=r" (ret)
+ :"r" (runtime_const_ptr(USER_PTR_MAX)),
+ "0" (ptr));
+ return ret;
}
+#define masked_user_access_begin(x) ({ \
+ auto __masked_ptr = (x); \
+ __masked_ptr = mask_user_address(__masked_ptr); \
+ __uaccess_begin(); __masked_ptr; })
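
mask_user_address() clamps rather than branches, so a kernel address is simply redirected to the guard page and faults on access instead of requiring a conditional plus a speculation fence. A userspace analog of the same clamp (USER_MAX is an illustrative constant, not the runtime USER_PTR_MAX):

#include <stdint.h>
#include <stdio.h>

#define USER_MAX 0x00007fffffffffffULL	/* illustrative only */

static uint64_t mask_addr(uint64_t ptr)
{
	/* optimizing compilers emit cmp + cmov here, matching the asm above */
	return ptr > USER_MAX ? USER_MAX : ptr;
}

int main(void)
{
	printf("%#llx\n", (unsigned long long)mask_addr(0x1234ULL));
	printf("%#llx\n", (unsigned long long)mask_addr(0xffff888000000000ULL));
	return 0;
}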
-static __always_inline __must_check
-int __copy_to_user(void __user *dst, const void *src, unsigned size)
+/*
+ * User pointers can have tag bits on x86-64. This scheme tolerates
+ * arbitrary values in those bits rather than masking them off.
+ *
+ * Enforce two rules:
+ * 1. 'ptr' must be in the user part of the address space
+ * 2. 'ptr+size' must not overflow into kernel addresses
+ *
+ * Note that we always have at least one guard page between the
+ * max user address and the non-canonical gap, allowing us to
+ * ignore small sizes entirely.
+ *
+ * In fact, we could probably remove the size check entirely, since
+ * any kernel accesses will be in increasing address order starting
+ * at 'ptr'.
+ *
+ * That's a separate optimization, for now just handle the small
+ * constant case.
+ */
+static inline bool __access_ok(const void __user *ptr, unsigned long size)
{
- int ret = 0;
+ if (__builtin_constant_p(size <= PAGE_SIZE) && size <= PAGE_SIZE) {
+ return valid_user_address(ptr);
+ } else {
+ unsigned long sum = size + (__force unsigned long)ptr;
- might_fault();
- if (!__builtin_constant_p(size))
- return copy_user_generic((__force void *)dst, src, size);
- switch (size) {
- case 1:__put_user_asm(*(u8 *)src, (u8 __user *)dst,
- ret, "b", "b", "iq", 1);
- return ret;
- case 2:__put_user_asm(*(u16 *)src, (u16 __user *)dst,
- ret, "w", "w", "ir", 2);
- return ret;
- case 4:__put_user_asm(*(u32 *)src, (u32 __user *)dst,
- ret, "l", "k", "ir", 4);
- return ret;
- case 8:__put_user_asm(*(u64 *)src, (u64 __user *)dst,
- ret, "q", "", "er", 8);
- return ret;
- case 10:
- __put_user_asm(*(u64 *)src, (u64 __user *)dst,
- ret, "q", "", "er", 10);
- if (unlikely(ret))
- return ret;
- asm("":::"memory");
- __put_user_asm(4[(u16 *)src], 4 + (u16 __user *)dst,
- ret, "w", "w", "ir", 2);
- return ret;
- case 16:
- __put_user_asm(*(u64 *)src, (u64 __user *)dst,
- ret, "q", "", "er", 16);
- if (unlikely(ret))
- return ret;
- asm("":::"memory");
- __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst,
- ret, "q", "", "er", 8);
- return ret;
- default:
- return copy_user_generic((__force void *)dst, src, size);
+ return valid_user_address(sum) && sum >= (__force unsigned long)ptr;
}
}
+#define __access_ok __access_ok
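
A self-contained demo of the overflow-safe range check used in the non-constant branch of __access_ok() (USER_MAX is again an illustrative stand-in for USER_PTR_MAX):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define USER_MAX 0x00007fffffffffffULL	/* illustrative only */

static bool range_ok(uint64_t ptr, uint64_t size)
{
	uint64_t sum = ptr + size;

	return sum <= USER_MAX && sum >= ptr;	/* "sum >= ptr" rejects wraparound */
}

int main(void)
{
	printf("%d\n", range_ok(0x1000, 4096));		/* 1: in range */
	printf("%d\n", range_ok(USER_MAX, 16));		/* 0: crosses the limit */
	printf("%d\n", range_ok(~0ULL - 8, 64));	/* 0: wraps around */
	return 0;
}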
-static __always_inline __must_check
-int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
-{
- int ret = 0;
+/*
+ * Copy To/From Userspace
+ */
- might_fault();
- if (!__builtin_constant_p(size))
- return copy_user_generic((__force void *)dst,
- (__force void *)src, size);
- switch (size) {
- case 1: {
- u8 tmp;
- __get_user_asm(tmp, (u8 __user *)src,
- ret, "b", "b", "=q", 1);
- if (likely(!ret))
- __put_user_asm(tmp, (u8 __user *)dst,
- ret, "b", "b", "iq", 1);
- return ret;
- }
- case 2: {
- u16 tmp;
- __get_user_asm(tmp, (u16 __user *)src,
- ret, "w", "w", "=r", 2);
- if (likely(!ret))
- __put_user_asm(tmp, (u16 __user *)dst,
- ret, "w", "w", "ir", 2);
- return ret;
- }
+/* Handles exceptions in both to and from, but doesn't do access_ok */
+__must_check unsigned long
+rep_movs_alternative(void *to, const void *from, unsigned len);
- case 4: {
- u32 tmp;
- __get_user_asm(tmp, (u32 __user *)src,
- ret, "l", "k", "=r", 4);
- if (likely(!ret))
- __put_user_asm(tmp, (u32 __user *)dst,
- ret, "l", "k", "ir", 4);
- return ret;
- }
- case 8: {
- u64 tmp;
- __get_user_asm(tmp, (u64 __user *)src,
- ret, "q", "", "=r", 8);
- if (likely(!ret))
- __put_user_asm(tmp, (u64 __user *)dst,
- ret, "q", "", "er", 8);
- return ret;
- }
- default:
- return copy_user_generic((__force void *)dst,
- (__force void *)src, size);
- }
+static __always_inline __must_check unsigned long
+copy_user_generic(void *to, const void *from, unsigned long len)
+{
+ stac();
+ /*
+ * If CPU has FSRM feature, use 'rep movs'.
+ * Otherwise, use rep_movs_alternative.
+ */
+ asm volatile(
+ "1:\n\t"
+ ALTERNATIVE("rep movsb",
+ "call rep_movs_alternative", ALT_NOT(X86_FEATURE_FSRM))
+ "2:\n"
+ _ASM_EXTABLE_UA(1b, 2b)
+ :"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT
+ : : "memory", "rax");
+ clac();
+ return len;
+}
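
The fast path selected under X86_FEATURE_FSRM is a bare 'rep movsb'. A userspace sketch of that instruction follows (x86 only; it omits the STAC/CLAC and exception-table machinery the kernel version needs):

#include <stdio.h>

static void rep_movsb(void *to, const void *from, unsigned long len)
{
	asm volatile("rep movsb"
		     : "+D" (to), "+S" (from), "+c" (len)
		     : : "memory");
}

int main(void)
{
	char dst[16] = {0};

	rep_movsb(dst, "fast short copy", 16);
	printf("%s\n", dst);
	return 0;
}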
+
+static __always_inline __must_check unsigned long
+raw_copy_from_user(void *dst, const void __user *src, unsigned long size)
+{
+ return copy_user_generic(dst, (__force void *)src, size);
}
-__must_check long
-strncpy_from_user(char *dst, const char __user *src, long count);
-__must_check long
-__strncpy_from_user(char *dst, const char __user *src, long count);
-__must_check long strnlen_user(const char __user *str, long n);
-__must_check long __strnlen_user(const char __user *str, long n);
-__must_check long strlen_user(const char __user *str);
-__must_check unsigned long clear_user(void __user *mem, unsigned long len);
-__must_check unsigned long __clear_user(void __user *mem, unsigned long len);
-
-__must_check long __copy_from_user_inatomic(void *dst, const void __user *src,
- unsigned size);
-
-static __must_check __always_inline int
-__copy_to_user_inatomic(void __user *dst, const void *src, unsigned size)
+static __always_inline __must_check unsigned long
+raw_copy_to_user(void __user *dst, const void *src, unsigned long size)
{
return copy_user_generic((__force void *)dst, src, size);
}
-extern long __copy_user_nocache(void *dst, const void __user *src,
- unsigned size, int zerorest);
+extern long __copy_user_nocache(void *dst, const void __user *src, unsigned size);
+extern long __copy_user_flushcache(void *dst, const void __user *src, unsigned size);
static inline int
-__copy_from_user_nocache(void *dst, const void __user *src, unsigned size)
+__copy_from_user_inatomic_nocache(void *dst, const void __user *src,
+ unsigned size)
{
- might_sleep();
- return __copy_user_nocache(dst, src, size, 1);
+ long ret;
+ kasan_check_write(dst, size);
+ stac();
+ ret = __copy_user_nocache(dst, src, size);
+ clac();
+ return ret;
}
static inline int
-__copy_from_user_inatomic_nocache(void *dst, const void __user *src,
- unsigned size)
+__copy_from_user_flushcache(void *dst, const void __user *src, unsigned size)
{
- return __copy_user_nocache(dst, src, size, 0);
+ kasan_check_write(dst, size);
+ return __copy_user_flushcache(dst, src, size);
}
-unsigned long
-copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest);
+/*
+ * Zero Userspace.
+ */
+
+__must_check unsigned long
+rep_stos_alternative(void __user *addr, unsigned long len);
+
+static __always_inline __must_check unsigned long __clear_user(void __user *addr, unsigned long size)
+{
+ might_fault();
+ stac();
+
+ /*
+ * No memory constraint because it doesn't change any memory gcc
+ * knows about.
+ */
+ asm volatile(
+ "1:\n\t"
+ ALTERNATIVE("rep stosb",
+ "call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRS))
+ "2:\n"
+ _ASM_EXTABLE_UA(1b, 2b)
+ : "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT
+ : "a" (0));
+
+ clac();
+
+ return size;
+}
+static __always_inline unsigned long clear_user(void __user *to, unsigned long n)
+{
+ if (__access_ok(to, n))
+ return __clear_user(to, n);
+ return n;
+}
#endif /* _ASM_X86_UACCESS_64_H */
diff --git a/arch/x86/include/asm/ucontext.h b/arch/x86/include/asm/ucontext.h
deleted file mode 100644
index 87324cf439d9..000000000000
--- a/arch/x86/include/asm/ucontext.h
+++ /dev/null
@@ -1,18 +0,0 @@
-#ifndef _ASM_X86_UCONTEXT_H
-#define _ASM_X86_UCONTEXT_H
-
-#define UC_FP_XSTATE 0x1 /* indicates the presence of extended state
- * information in the memory layout pointed
- * by the fpstate pointer in the ucontext's
- * sigcontext struct (uc_mcontext).
- */
-
-struct ucontext {
- unsigned long uc_flags;
- struct ucontext *uc_link;
- stack_t uc_stack;
- struct sigcontext uc_mcontext;
- sigset_t uc_sigmask; /* mask last for extensibility */
-};
-
-#endif /* _ASM_X86_UCONTEXT_H */
diff --git a/arch/x86/include/asm/umip.h b/arch/x86/include/asm/umip.h
new file mode 100644
index 000000000000..aeed98c3c9e1
--- /dev/null
+++ b/arch/x86/include/asm/umip.h
@@ -0,0 +1,12 @@
+#ifndef _ASM_X86_UMIP_H
+#define _ASM_X86_UMIP_H
+
+#include <linux/types.h>
+#include <asm/ptrace.h>
+
+#ifdef CONFIG_X86_UMIP
+bool fixup_umip_exception(struct pt_regs *regs);
+#else
+static inline bool fixup_umip_exception(struct pt_regs *regs) { return false; }
+#endif /* CONFIG_X86_UMIP */
+#endif /* _ASM_X86_UMIP_H */
diff --git a/arch/x86/include/asm/unaccepted_memory.h b/arch/x86/include/asm/unaccepted_memory.h
new file mode 100644
index 000000000000..f5937e9866ac
--- /dev/null
+++ b/arch/x86/include/asm/unaccepted_memory.h
@@ -0,0 +1,27 @@
+#ifndef _ASM_X86_UNACCEPTED_MEMORY_H
+#define _ASM_X86_UNACCEPTED_MEMORY_H
+
+#include <linux/efi.h>
+#include <asm/tdx.h>
+#include <asm/sev.h>
+
+static inline void arch_accept_memory(phys_addr_t start, phys_addr_t end)
+{
+ /* Platform-specific memory-acceptance call goes here */
+ if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) {
+ if (!tdx_accept_memory(start, end))
+ panic("TDX: Failed to accept memory\n");
+ } else if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) {
+ snp_accept_memory(start, end);
+ } else {
+ panic("Cannot accept memory: unknown platform\n");
+ }
+}
+
+static inline struct efi_unaccepted_memory *efi_get_unaccepted_table(void)
+{
+ if (efi.unaccepted == EFI_INVALID_TABLE_ADDR)
+ return NULL;
+ return __va(efi.unaccepted);
+}
+#endif
diff --git a/arch/x86/include/asm/unaligned.h b/arch/x86/include/asm/unaligned.h
deleted file mode 100644
index a7bd416b4763..000000000000
--- a/arch/x86/include/asm/unaligned.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef _ASM_X86_UNALIGNED_H
-#define _ASM_X86_UNALIGNED_H
-
-/*
- * The x86 can do unaligned accesses itself.
- */
-
-#include <linux/unaligned/access_ok.h>
-#include <linux/unaligned/generic.h>
-
-#define get_unaligned __get_unaligned_le
-#define put_unaligned __put_unaligned_le
-
-#endif /* _ASM_X86_UNALIGNED_H */
diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h
index 2a58ed3e51d8..6c9e5bdd3916 100644
--- a/arch/x86/include/asm/unistd.h
+++ b/arch/x86/include/asm/unistd.h
@@ -1,13 +1,60 @@
-#ifdef __KERNEL__
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_UNISTD_H
+#define _ASM_X86_UNISTD_H 1
+
+#include <uapi/asm/unistd.h>
+
+
# ifdef CONFIG_X86_32
-# include "unistd_32.h"
+
+# include <asm/unistd_32.h>
+# define __ARCH_WANT_STAT64
+# define __ARCH_WANT_SYS_IPC
+# define __ARCH_WANT_SYS_OLD_MMAP
+# define __ARCH_WANT_SYS_OLD_SELECT
+
+# define IA32_NR_syscalls (__NR_syscalls)
+
# else
-# include "unistd_64.h"
+
+# include <asm/unistd_64.h>
+# include <asm/unistd_64_x32.h>
+# include <asm/unistd_32_ia32.h>
+# define __ARCH_WANT_SYS_TIME
+# define __ARCH_WANT_SYS_UTIME
+# define __ARCH_WANT_COMPAT_STAT
+# define __ARCH_WANT_COMPAT_SYS_PREADV64
+# define __ARCH_WANT_COMPAT_SYS_PWRITEV64
+# define __ARCH_WANT_COMPAT_SYS_PREADV64V2
+# define __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
+# define X32_NR_syscalls (__NR_x32_syscalls)
+# define IA32_NR_syscalls (__NR_ia32_syscalls)
+
# endif
-#else
-# ifdef __i386__
-# include "unistd_32.h"
-# else
-# include "unistd_64.h"
-# endif
-#endif
+
+# define NR_syscalls (__NR_syscalls)
+
+# define __ARCH_WANT_NEW_STAT
+# define __ARCH_WANT_OLD_READDIR
+# define __ARCH_WANT_OLD_STAT
+# define __ARCH_WANT_SYS_ALARM
+# define __ARCH_WANT_SYS_FADVISE64
+# define __ARCH_WANT_SYS_GETHOSTNAME
+# define __ARCH_WANT_SYS_GETPGRP
+# define __ARCH_WANT_SYS_NICE
+# define __ARCH_WANT_SYS_OLDUMOUNT
+# define __ARCH_WANT_SYS_OLD_GETRLIMIT
+# define __ARCH_WANT_SYS_OLD_UNAME
+# define __ARCH_WANT_SYS_PAUSE
+# define __ARCH_WANT_SYS_SIGNAL
+# define __ARCH_WANT_SYS_SIGPENDING
+# define __ARCH_WANT_SYS_SIGPROCMASK
+# define __ARCH_WANT_SYS_SOCKETCALL
+# define __ARCH_WANT_SYS_TIME32
+# define __ARCH_WANT_SYS_UTIME32
+# define __ARCH_WANT_SYS_WAITPID
+# define __ARCH_WANT_SYS_FORK
+# define __ARCH_WANT_SYS_VFORK
+# define __ARCH_WANT_SYS_CLONE
+
+#endif /* _ASM_X86_UNISTD_H */
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
deleted file mode 100644
index 732a30706153..000000000000
--- a/arch/x86/include/asm/unistd_32.h
+++ /dev/null
@@ -1,383 +0,0 @@
-#ifndef _ASM_X86_UNISTD_32_H
-#define _ASM_X86_UNISTD_32_H
-
-/*
- * This file contains the system call numbers.
- */
-
-#define __NR_restart_syscall 0
-#define __NR_exit 1
-#define __NR_fork 2
-#define __NR_read 3
-#define __NR_write 4
-#define __NR_open 5
-#define __NR_close 6
-#define __NR_waitpid 7
-#define __NR_creat 8
-#define __NR_link 9
-#define __NR_unlink 10
-#define __NR_execve 11
-#define __NR_chdir 12
-#define __NR_time 13
-#define __NR_mknod 14
-#define __NR_chmod 15
-#define __NR_lchown 16
-#define __NR_break 17
-#define __NR_oldstat 18
-#define __NR_lseek 19
-#define __NR_getpid 20
-#define __NR_mount 21
-#define __NR_umount 22
-#define __NR_setuid 23
-#define __NR_getuid 24
-#define __NR_stime 25
-#define __NR_ptrace 26
-#define __NR_alarm 27
-#define __NR_oldfstat 28
-#define __NR_pause 29
-#define __NR_utime 30
-#define __NR_stty 31
-#define __NR_gtty 32
-#define __NR_access 33
-#define __NR_nice 34
-#define __NR_ftime 35
-#define __NR_sync 36
-#define __NR_kill 37
-#define __NR_rename 38
-#define __NR_mkdir 39
-#define __NR_rmdir 40
-#define __NR_dup 41
-#define __NR_pipe 42
-#define __NR_times 43
-#define __NR_prof 44
-#define __NR_brk 45
-#define __NR_setgid 46
-#define __NR_getgid 47
-#define __NR_signal 48
-#define __NR_geteuid 49
-#define __NR_getegid 50
-#define __NR_acct 51
-#define __NR_umount2 52
-#define __NR_lock 53
-#define __NR_ioctl 54
-#define __NR_fcntl 55
-#define __NR_mpx 56
-#define __NR_setpgid 57
-#define __NR_ulimit 58
-#define __NR_oldolduname 59
-#define __NR_umask 60
-#define __NR_chroot 61
-#define __NR_ustat 62
-#define __NR_dup2 63
-#define __NR_getppid 64
-#define __NR_getpgrp 65
-#define __NR_setsid 66
-#define __NR_sigaction 67
-#define __NR_sgetmask 68
-#define __NR_ssetmask 69
-#define __NR_setreuid 70
-#define __NR_setregid 71
-#define __NR_sigsuspend 72
-#define __NR_sigpending 73
-#define __NR_sethostname 74
-#define __NR_setrlimit 75
-#define __NR_getrlimit 76 /* Back compatible 2Gig limited rlimit */
-#define __NR_getrusage 77
-#define __NR_gettimeofday 78
-#define __NR_settimeofday 79
-#define __NR_getgroups 80
-#define __NR_setgroups 81
-#define __NR_select 82
-#define __NR_symlink 83
-#define __NR_oldlstat 84
-#define __NR_readlink 85
-#define __NR_uselib 86
-#define __NR_swapon 87
-#define __NR_reboot 88
-#define __NR_readdir 89
-#define __NR_mmap 90
-#define __NR_munmap 91
-#define __NR_truncate 92
-#define __NR_ftruncate 93
-#define __NR_fchmod 94
-#define __NR_fchown 95
-#define __NR_getpriority 96
-#define __NR_setpriority 97
-#define __NR_profil 98
-#define __NR_statfs 99
-#define __NR_fstatfs 100
-#define __NR_ioperm 101
-#define __NR_socketcall 102
-#define __NR_syslog 103
-#define __NR_setitimer 104
-#define __NR_getitimer 105
-#define __NR_stat 106
-#define __NR_lstat 107
-#define __NR_fstat 108
-#define __NR_olduname 109
-#define __NR_iopl 110
-#define __NR_vhangup 111
-#define __NR_idle 112
-#define __NR_vm86old 113
-#define __NR_wait4 114
-#define __NR_swapoff 115
-#define __NR_sysinfo 116
-#define __NR_ipc 117
-#define __NR_fsync 118
-#define __NR_sigreturn 119
-#define __NR_clone 120
-#define __NR_setdomainname 121
-#define __NR_uname 122
-#define __NR_modify_ldt 123
-#define __NR_adjtimex 124
-#define __NR_mprotect 125
-#define __NR_sigprocmask 126
-#define __NR_create_module 127
-#define __NR_init_module 128
-#define __NR_delete_module 129
-#define __NR_get_kernel_syms 130
-#define __NR_quotactl 131
-#define __NR_getpgid 132
-#define __NR_fchdir 133
-#define __NR_bdflush 134
-#define __NR_sysfs 135
-#define __NR_personality 136
-#define __NR_afs_syscall 137 /* Syscall for Andrew File System */
-#define __NR_setfsuid 138
-#define __NR_setfsgid 139
-#define __NR__llseek 140
-#define __NR_getdents 141
-#define __NR__newselect 142
-#define __NR_flock 143
-#define __NR_msync 144
-#define __NR_readv 145
-#define __NR_writev 146
-#define __NR_getsid 147
-#define __NR_fdatasync 148
-#define __NR__sysctl 149
-#define __NR_mlock 150
-#define __NR_munlock 151
-#define __NR_mlockall 152
-#define __NR_munlockall 153
-#define __NR_sched_setparam 154
-#define __NR_sched_getparam 155
-#define __NR_sched_setscheduler 156
-#define __NR_sched_getscheduler 157
-#define __NR_sched_yield 158
-#define __NR_sched_get_priority_max 159
-#define __NR_sched_get_priority_min 160
-#define __NR_sched_rr_get_interval 161
-#define __NR_nanosleep 162
-#define __NR_mremap 163
-#define __NR_setresuid 164
-#define __NR_getresuid 165
-#define __NR_vm86 166
-#define __NR_query_module 167
-#define __NR_poll 168
-#define __NR_nfsservctl 169
-#define __NR_setresgid 170
-#define __NR_getresgid 171
-#define __NR_prctl 172
-#define __NR_rt_sigreturn 173
-#define __NR_rt_sigaction 174
-#define __NR_rt_sigprocmask 175
-#define __NR_rt_sigpending 176
-#define __NR_rt_sigtimedwait 177
-#define __NR_rt_sigqueueinfo 178
-#define __NR_rt_sigsuspend 179
-#define __NR_pread64 180
-#define __NR_pwrite64 181
-#define __NR_chown 182
-#define __NR_getcwd 183
-#define __NR_capget 184
-#define __NR_capset 185
-#define __NR_sigaltstack 186
-#define __NR_sendfile 187
-#define __NR_getpmsg 188 /* some people actually want streams */
-#define __NR_putpmsg 189 /* some people actually want streams */
-#define __NR_vfork 190
-#define __NR_ugetrlimit 191 /* SuS compliant getrlimit */
-#define __NR_mmap2 192
-#define __NR_truncate64 193
-#define __NR_ftruncate64 194
-#define __NR_stat64 195
-#define __NR_lstat64 196
-#define __NR_fstat64 197
-#define __NR_lchown32 198
-#define __NR_getuid32 199
-#define __NR_getgid32 200
-#define __NR_geteuid32 201
-#define __NR_getegid32 202
-#define __NR_setreuid32 203
-#define __NR_setregid32 204
-#define __NR_getgroups32 205
-#define __NR_setgroups32 206
-#define __NR_fchown32 207
-#define __NR_setresuid32 208
-#define __NR_getresuid32 209
-#define __NR_setresgid32 210
-#define __NR_getresgid32 211
-#define __NR_chown32 212
-#define __NR_setuid32 213
-#define __NR_setgid32 214
-#define __NR_setfsuid32 215
-#define __NR_setfsgid32 216
-#define __NR_pivot_root 217
-#define __NR_mincore 218
-#define __NR_madvise 219
-#define __NR_madvise1 219 /* delete when C lib stub is removed */
-#define __NR_getdents64 220
-#define __NR_fcntl64 221
-/* 223 is unused */
-#define __NR_gettid 224
-#define __NR_readahead 225
-#define __NR_setxattr 226
-#define __NR_lsetxattr 227
-#define __NR_fsetxattr 228
-#define __NR_getxattr 229
-#define __NR_lgetxattr 230
-#define __NR_fgetxattr 231
-#define __NR_listxattr 232
-#define __NR_llistxattr 233
-#define __NR_flistxattr 234
-#define __NR_removexattr 235
-#define __NR_lremovexattr 236
-#define __NR_fremovexattr 237
-#define __NR_tkill 238
-#define __NR_sendfile64 239
-#define __NR_futex 240
-#define __NR_sched_setaffinity 241
-#define __NR_sched_getaffinity 242
-#define __NR_set_thread_area 243
-#define __NR_get_thread_area 244
-#define __NR_io_setup 245
-#define __NR_io_destroy 246
-#define __NR_io_getevents 247
-#define __NR_io_submit 248
-#define __NR_io_cancel 249
-#define __NR_fadvise64 250
-/* 251 is available for reuse (was briefly sys_set_zone_reclaim) */
-#define __NR_exit_group 252
-#define __NR_lookup_dcookie 253
-#define __NR_epoll_create 254
-#define __NR_epoll_ctl 255
-#define __NR_epoll_wait 256
-#define __NR_remap_file_pages 257
-#define __NR_set_tid_address 258
-#define __NR_timer_create 259
-#define __NR_timer_settime (__NR_timer_create+1)
-#define __NR_timer_gettime (__NR_timer_create+2)
-#define __NR_timer_getoverrun (__NR_timer_create+3)
-#define __NR_timer_delete (__NR_timer_create+4)
-#define __NR_clock_settime (__NR_timer_create+5)
-#define __NR_clock_gettime (__NR_timer_create+6)
-#define __NR_clock_getres (__NR_timer_create+7)
-#define __NR_clock_nanosleep (__NR_timer_create+8)
-#define __NR_statfs64 268
-#define __NR_fstatfs64 269
-#define __NR_tgkill 270
-#define __NR_utimes 271
-#define __NR_fadvise64_64 272
-#define __NR_vserver 273
-#define __NR_mbind 274
-#define __NR_get_mempolicy 275
-#define __NR_set_mempolicy 276
-#define __NR_mq_open 277
-#define __NR_mq_unlink (__NR_mq_open+1)
-#define __NR_mq_timedsend (__NR_mq_open+2)
-#define __NR_mq_timedreceive (__NR_mq_open+3)
-#define __NR_mq_notify (__NR_mq_open+4)
-#define __NR_mq_getsetattr (__NR_mq_open+5)
-#define __NR_kexec_load 283
-#define __NR_waitid 284
-/* #define __NR_sys_setaltroot 285 */
-#define __NR_add_key 286
-#define __NR_request_key 287
-#define __NR_keyctl 288
-#define __NR_ioprio_set 289
-#define __NR_ioprio_get 290
-#define __NR_inotify_init 291
-#define __NR_inotify_add_watch 292
-#define __NR_inotify_rm_watch 293
-#define __NR_migrate_pages 294
-#define __NR_openat 295
-#define __NR_mkdirat 296
-#define __NR_mknodat 297
-#define __NR_fchownat 298
-#define __NR_futimesat 299
-#define __NR_fstatat64 300
-#define __NR_unlinkat 301
-#define __NR_renameat 302
-#define __NR_linkat 303
-#define __NR_symlinkat 304
-#define __NR_readlinkat 305
-#define __NR_fchmodat 306
-#define __NR_faccessat 307
-#define __NR_pselect6 308
-#define __NR_ppoll 309
-#define __NR_unshare 310
-#define __NR_set_robust_list 311
-#define __NR_get_robust_list 312
-#define __NR_splice 313
-#define __NR_sync_file_range 314
-#define __NR_tee 315
-#define __NR_vmsplice 316
-#define __NR_move_pages 317
-#define __NR_getcpu 318
-#define __NR_epoll_pwait 319
-#define __NR_utimensat 320
-#define __NR_signalfd 321
-#define __NR_timerfd_create 322
-#define __NR_eventfd 323
-#define __NR_fallocate 324
-#define __NR_timerfd_settime 325
-#define __NR_timerfd_gettime 326
-#define __NR_signalfd4 327
-#define __NR_eventfd2 328
-#define __NR_epoll_create1 329
-#define __NR_dup3 330
-#define __NR_pipe2 331
-#define __NR_inotify_init1 332
-#define __NR_preadv 333
-#define __NR_pwritev 334
-#define __NR_rt_tgsigqueueinfo 335
-#define __NR_perf_counter_open 336
-
-#ifdef __KERNEL__
-
-#define __ARCH_WANT_IPC_PARSE_VERSION
-#define __ARCH_WANT_OLD_READDIR
-#define __ARCH_WANT_OLD_STAT
-#define __ARCH_WANT_STAT64
-#define __ARCH_WANT_SYS_ALARM
-#define __ARCH_WANT_SYS_GETHOSTNAME
-#define __ARCH_WANT_SYS_PAUSE
-#define __ARCH_WANT_SYS_SGETMASK
-#define __ARCH_WANT_SYS_SIGNAL
-#define __ARCH_WANT_SYS_TIME
-#define __ARCH_WANT_SYS_UTIME
-#define __ARCH_WANT_SYS_WAITPID
-#define __ARCH_WANT_SYS_SOCKETCALL
-#define __ARCH_WANT_SYS_FADVISE64
-#define __ARCH_WANT_SYS_GETPGRP
-#define __ARCH_WANT_SYS_LLSEEK
-#define __ARCH_WANT_SYS_NICE
-#define __ARCH_WANT_SYS_OLD_GETRLIMIT
-#define __ARCH_WANT_SYS_OLDUMOUNT
-#define __ARCH_WANT_SYS_SIGPENDING
-#define __ARCH_WANT_SYS_SIGPROCMASK
-#define __ARCH_WANT_SYS_RT_SIGACTION
-#define __ARCH_WANT_SYS_RT_SIGSUSPEND
-
-/*
- * "Conditional" syscalls
- *
- * What we want is __attribute__((weak,alias("sys_ni_syscall"))),
- * but it doesn't work on all toolchains, so we just do it by hand
- */
-#ifndef cond_syscall
-#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall")
-#endif
-
-#endif /* __KERNEL__ */
-#endif /* _ASM_X86_UNISTD_32_H */
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
deleted file mode 100644
index 900e1617e672..000000000000
--- a/arch/x86/include/asm/unistd_64.h
+++ /dev/null
@@ -1,700 +0,0 @@
-#ifndef _ASM_X86_UNISTD_64_H
-#define _ASM_X86_UNISTD_64_H
-
-#ifndef __SYSCALL
-#define __SYSCALL(a, b)
-#endif
-
-/*
- * This file contains the system call numbers.
- *
- * Note: holes are not allowed.
- */
-
-/* at least 8 syscall per cacheline */
-#define __NR_read 0
-__SYSCALL(__NR_read, sys_read)
-#define __NR_write 1
-__SYSCALL(__NR_write, sys_write)
-#define __NR_open 2
-__SYSCALL(__NR_open, sys_open)
-#define __NR_close 3
-__SYSCALL(__NR_close, sys_close)
-#define __NR_stat 4
-__SYSCALL(__NR_stat, sys_newstat)
-#define __NR_fstat 5
-__SYSCALL(__NR_fstat, sys_newfstat)
-#define __NR_lstat 6
-__SYSCALL(__NR_lstat, sys_newlstat)
-#define __NR_poll 7
-__SYSCALL(__NR_poll, sys_poll)
-
-#define __NR_lseek 8
-__SYSCALL(__NR_lseek, sys_lseek)
-#define __NR_mmap 9
-__SYSCALL(__NR_mmap, sys_mmap)
-#define __NR_mprotect 10
-__SYSCALL(__NR_mprotect, sys_mprotect)
-#define __NR_munmap 11
-__SYSCALL(__NR_munmap, sys_munmap)
-#define __NR_brk 12
-__SYSCALL(__NR_brk, sys_brk)
-#define __NR_rt_sigaction 13
-__SYSCALL(__NR_rt_sigaction, sys_rt_sigaction)
-#define __NR_rt_sigprocmask 14
-__SYSCALL(__NR_rt_sigprocmask, sys_rt_sigprocmask)
-#define __NR_rt_sigreturn 15
-__SYSCALL(__NR_rt_sigreturn, stub_rt_sigreturn)
-
-#define __NR_ioctl 16
-__SYSCALL(__NR_ioctl, sys_ioctl)
-#define __NR_pread64 17
-__SYSCALL(__NR_pread64, sys_pread64)
-#define __NR_pwrite64 18
-__SYSCALL(__NR_pwrite64, sys_pwrite64)
-#define __NR_readv 19
-__SYSCALL(__NR_readv, sys_readv)
-#define __NR_writev 20
-__SYSCALL(__NR_writev, sys_writev)
-#define __NR_access 21
-__SYSCALL(__NR_access, sys_access)
-#define __NR_pipe 22
-__SYSCALL(__NR_pipe, sys_pipe)
-#define __NR_select 23
-__SYSCALL(__NR_select, sys_select)
-
-#define __NR_sched_yield 24
-__SYSCALL(__NR_sched_yield, sys_sched_yield)
-#define __NR_mremap 25
-__SYSCALL(__NR_mremap, sys_mremap)
-#define __NR_msync 26
-__SYSCALL(__NR_msync, sys_msync)
-#define __NR_mincore 27
-__SYSCALL(__NR_mincore, sys_mincore)
-#define __NR_madvise 28
-__SYSCALL(__NR_madvise, sys_madvise)
-#define __NR_shmget 29
-__SYSCALL(__NR_shmget, sys_shmget)
-#define __NR_shmat 30
-__SYSCALL(__NR_shmat, sys_shmat)
-#define __NR_shmctl 31
-__SYSCALL(__NR_shmctl, sys_shmctl)
-
-#define __NR_dup 32
-__SYSCALL(__NR_dup, sys_dup)
-#define __NR_dup2 33
-__SYSCALL(__NR_dup2, sys_dup2)
-#define __NR_pause 34
-__SYSCALL(__NR_pause, sys_pause)
-#define __NR_nanosleep 35
-__SYSCALL(__NR_nanosleep, sys_nanosleep)
-#define __NR_getitimer 36
-__SYSCALL(__NR_getitimer, sys_getitimer)
-#define __NR_alarm 37
-__SYSCALL(__NR_alarm, sys_alarm)
-#define __NR_setitimer 38
-__SYSCALL(__NR_setitimer, sys_setitimer)
-#define __NR_getpid 39
-__SYSCALL(__NR_getpid, sys_getpid)
-
-#define __NR_sendfile 40
-__SYSCALL(__NR_sendfile, sys_sendfile64)
-#define __NR_socket 41
-__SYSCALL(__NR_socket, sys_socket)
-#define __NR_connect 42
-__SYSCALL(__NR_connect, sys_connect)
-#define __NR_accept 43
-__SYSCALL(__NR_accept, sys_accept)
-#define __NR_sendto 44
-__SYSCALL(__NR_sendto, sys_sendto)
-#define __NR_recvfrom 45
-__SYSCALL(__NR_recvfrom, sys_recvfrom)
-#define __NR_sendmsg 46
-__SYSCALL(__NR_sendmsg, sys_sendmsg)
-#define __NR_recvmsg 47
-__SYSCALL(__NR_recvmsg, sys_recvmsg)
-
-#define __NR_shutdown 48
-__SYSCALL(__NR_shutdown, sys_shutdown)
-#define __NR_bind 49
-__SYSCALL(__NR_bind, sys_bind)
-#define __NR_listen 50
-__SYSCALL(__NR_listen, sys_listen)
-#define __NR_getsockname 51
-__SYSCALL(__NR_getsockname, sys_getsockname)
-#define __NR_getpeername 52
-__SYSCALL(__NR_getpeername, sys_getpeername)
-#define __NR_socketpair 53
-__SYSCALL(__NR_socketpair, sys_socketpair)
-#define __NR_setsockopt 54
-__SYSCALL(__NR_setsockopt, sys_setsockopt)
-#define __NR_getsockopt 55
-__SYSCALL(__NR_getsockopt, sys_getsockopt)
-
-#define __NR_clone 56
-__SYSCALL(__NR_clone, stub_clone)
-#define __NR_fork 57
-__SYSCALL(__NR_fork, stub_fork)
-#define __NR_vfork 58
-__SYSCALL(__NR_vfork, stub_vfork)
-#define __NR_execve 59
-__SYSCALL(__NR_execve, stub_execve)
-#define __NR_exit 60
-__SYSCALL(__NR_exit, sys_exit)
-#define __NR_wait4 61
-__SYSCALL(__NR_wait4, sys_wait4)
-#define __NR_kill 62
-__SYSCALL(__NR_kill, sys_kill)
-#define __NR_uname 63
-__SYSCALL(__NR_uname, sys_uname)
-
-#define __NR_semget 64
-__SYSCALL(__NR_semget, sys_semget)
-#define __NR_semop 65
-__SYSCALL(__NR_semop, sys_semop)
-#define __NR_semctl 66
-__SYSCALL(__NR_semctl, sys_semctl)
-#define __NR_shmdt 67
-__SYSCALL(__NR_shmdt, sys_shmdt)
-#define __NR_msgget 68
-__SYSCALL(__NR_msgget, sys_msgget)
-#define __NR_msgsnd 69
-__SYSCALL(__NR_msgsnd, sys_msgsnd)
-#define __NR_msgrcv 70
-__SYSCALL(__NR_msgrcv, sys_msgrcv)
-#define __NR_msgctl 71
-__SYSCALL(__NR_msgctl, sys_msgctl)
-
-#define __NR_fcntl 72
-__SYSCALL(__NR_fcntl, sys_fcntl)
-#define __NR_flock 73
-__SYSCALL(__NR_flock, sys_flock)
-#define __NR_fsync 74
-__SYSCALL(__NR_fsync, sys_fsync)
-#define __NR_fdatasync 75
-__SYSCALL(__NR_fdatasync, sys_fdatasync)
-#define __NR_truncate 76
-__SYSCALL(__NR_truncate, sys_truncate)
-#define __NR_ftruncate 77
-__SYSCALL(__NR_ftruncate, sys_ftruncate)
-#define __NR_getdents 78
-__SYSCALL(__NR_getdents, sys_getdents)
-#define __NR_getcwd 79
-__SYSCALL(__NR_getcwd, sys_getcwd)
-
-#define __NR_chdir 80
-__SYSCALL(__NR_chdir, sys_chdir)
-#define __NR_fchdir 81
-__SYSCALL(__NR_fchdir, sys_fchdir)
-#define __NR_rename 82
-__SYSCALL(__NR_rename, sys_rename)
-#define __NR_mkdir 83
-__SYSCALL(__NR_mkdir, sys_mkdir)
-#define __NR_rmdir 84
-__SYSCALL(__NR_rmdir, sys_rmdir)
-#define __NR_creat 85
-__SYSCALL(__NR_creat, sys_creat)
-#define __NR_link 86
-__SYSCALL(__NR_link, sys_link)
-#define __NR_unlink 87
-__SYSCALL(__NR_unlink, sys_unlink)
-
-#define __NR_symlink 88
-__SYSCALL(__NR_symlink, sys_symlink)
-#define __NR_readlink 89
-__SYSCALL(__NR_readlink, sys_readlink)
-#define __NR_chmod 90
-__SYSCALL(__NR_chmod, sys_chmod)
-#define __NR_fchmod 91
-__SYSCALL(__NR_fchmod, sys_fchmod)
-#define __NR_chown 92
-__SYSCALL(__NR_chown, sys_chown)
-#define __NR_fchown 93
-__SYSCALL(__NR_fchown, sys_fchown)
-#define __NR_lchown 94
-__SYSCALL(__NR_lchown, sys_lchown)
-#define __NR_umask 95
-__SYSCALL(__NR_umask, sys_umask)
-
-#define __NR_gettimeofday 96
-__SYSCALL(__NR_gettimeofday, sys_gettimeofday)
-#define __NR_getrlimit 97
-__SYSCALL(__NR_getrlimit, sys_getrlimit)
-#define __NR_getrusage 98
-__SYSCALL(__NR_getrusage, sys_getrusage)
-#define __NR_sysinfo 99
-__SYSCALL(__NR_sysinfo, sys_sysinfo)
-#define __NR_times 100
-__SYSCALL(__NR_times, sys_times)
-#define __NR_ptrace 101
-__SYSCALL(__NR_ptrace, sys_ptrace)
-#define __NR_getuid 102
-__SYSCALL(__NR_getuid, sys_getuid)
-#define __NR_syslog 103
-__SYSCALL(__NR_syslog, sys_syslog)
-
-/* at the very end the stuff that never runs during the benchmarks */
-#define __NR_getgid 104
-__SYSCALL(__NR_getgid, sys_getgid)
-#define __NR_setuid 105
-__SYSCALL(__NR_setuid, sys_setuid)
-#define __NR_setgid 106
-__SYSCALL(__NR_setgid, sys_setgid)
-#define __NR_geteuid 107
-__SYSCALL(__NR_geteuid, sys_geteuid)
-#define __NR_getegid 108
-__SYSCALL(__NR_getegid, sys_getegid)
-#define __NR_setpgid 109
-__SYSCALL(__NR_setpgid, sys_setpgid)
-#define __NR_getppid 110
-__SYSCALL(__NR_getppid, sys_getppid)
-#define __NR_getpgrp 111
-__SYSCALL(__NR_getpgrp, sys_getpgrp)
-
-#define __NR_setsid 112
-__SYSCALL(__NR_setsid, sys_setsid)
-#define __NR_setreuid 113
-__SYSCALL(__NR_setreuid, sys_setreuid)
-#define __NR_setregid 114
-__SYSCALL(__NR_setregid, sys_setregid)
-#define __NR_getgroups 115
-__SYSCALL(__NR_getgroups, sys_getgroups)
-#define __NR_setgroups 116
-__SYSCALL(__NR_setgroups, sys_setgroups)
-#define __NR_setresuid 117
-__SYSCALL(__NR_setresuid, sys_setresuid)
-#define __NR_getresuid 118
-__SYSCALL(__NR_getresuid, sys_getresuid)
-#define __NR_setresgid 119
-__SYSCALL(__NR_setresgid, sys_setresgid)
-
-#define __NR_getresgid 120
-__SYSCALL(__NR_getresgid, sys_getresgid)
-#define __NR_getpgid 121
-__SYSCALL(__NR_getpgid, sys_getpgid)
-#define __NR_setfsuid 122
-__SYSCALL(__NR_setfsuid, sys_setfsuid)
-#define __NR_setfsgid 123
-__SYSCALL(__NR_setfsgid, sys_setfsgid)
-#define __NR_getsid 124
-__SYSCALL(__NR_getsid, sys_getsid)
-#define __NR_capget 125
-__SYSCALL(__NR_capget, sys_capget)
-#define __NR_capset 126
-__SYSCALL(__NR_capset, sys_capset)
-
-#define __NR_rt_sigpending 127
-__SYSCALL(__NR_rt_sigpending, sys_rt_sigpending)
-#define __NR_rt_sigtimedwait 128
-__SYSCALL(__NR_rt_sigtimedwait, sys_rt_sigtimedwait)
-#define __NR_rt_sigqueueinfo 129
-__SYSCALL(__NR_rt_sigqueueinfo, sys_rt_sigqueueinfo)
-#define __NR_rt_sigsuspend 130
-__SYSCALL(__NR_rt_sigsuspend, sys_rt_sigsuspend)
-#define __NR_sigaltstack 131
-__SYSCALL(__NR_sigaltstack, stub_sigaltstack)
-#define __NR_utime 132
-__SYSCALL(__NR_utime, sys_utime)
-#define __NR_mknod 133
-__SYSCALL(__NR_mknod, sys_mknod)
-
-/* Only needed for a.out */
-#define __NR_uselib 134
-__SYSCALL(__NR_uselib, sys_ni_syscall)
-#define __NR_personality 135
-__SYSCALL(__NR_personality, sys_personality)
-
-#define __NR_ustat 136
-__SYSCALL(__NR_ustat, sys_ustat)
-#define __NR_statfs 137
-__SYSCALL(__NR_statfs, sys_statfs)
-#define __NR_fstatfs 138
-__SYSCALL(__NR_fstatfs, sys_fstatfs)
-#define __NR_sysfs 139
-__SYSCALL(__NR_sysfs, sys_sysfs)
-
-#define __NR_getpriority 140
-__SYSCALL(__NR_getpriority, sys_getpriority)
-#define __NR_setpriority 141
-__SYSCALL(__NR_setpriority, sys_setpriority)
-#define __NR_sched_setparam 142
-__SYSCALL(__NR_sched_setparam, sys_sched_setparam)
-#define __NR_sched_getparam 143
-__SYSCALL(__NR_sched_getparam, sys_sched_getparam)
-#define __NR_sched_setscheduler 144
-__SYSCALL(__NR_sched_setscheduler, sys_sched_setscheduler)
-#define __NR_sched_getscheduler 145
-__SYSCALL(__NR_sched_getscheduler, sys_sched_getscheduler)
-#define __NR_sched_get_priority_max 146
-__SYSCALL(__NR_sched_get_priority_max, sys_sched_get_priority_max)
-#define __NR_sched_get_priority_min 147
-__SYSCALL(__NR_sched_get_priority_min, sys_sched_get_priority_min)
-#define __NR_sched_rr_get_interval 148
-__SYSCALL(__NR_sched_rr_get_interval, sys_sched_rr_get_interval)
-
-#define __NR_mlock 149
-__SYSCALL(__NR_mlock, sys_mlock)
-#define __NR_munlock 150
-__SYSCALL(__NR_munlock, sys_munlock)
-#define __NR_mlockall 151
-__SYSCALL(__NR_mlockall, sys_mlockall)
-#define __NR_munlockall 152
-__SYSCALL(__NR_munlockall, sys_munlockall)
-
-#define __NR_vhangup 153
-__SYSCALL(__NR_vhangup, sys_vhangup)
-
-#define __NR_modify_ldt 154
-__SYSCALL(__NR_modify_ldt, sys_modify_ldt)
-
-#define __NR_pivot_root 155
-__SYSCALL(__NR_pivot_root, sys_pivot_root)
-
-#define __NR__sysctl 156
-__SYSCALL(__NR__sysctl, sys_sysctl)
-
-#define __NR_prctl 157
-__SYSCALL(__NR_prctl, sys_prctl)
-#define __NR_arch_prctl 158
-__SYSCALL(__NR_arch_prctl, sys_arch_prctl)
-
-#define __NR_adjtimex 159
-__SYSCALL(__NR_adjtimex, sys_adjtimex)
-
-#define __NR_setrlimit 160
-__SYSCALL(__NR_setrlimit, sys_setrlimit)
-
-#define __NR_chroot 161
-__SYSCALL(__NR_chroot, sys_chroot)
-
-#define __NR_sync 162
-__SYSCALL(__NR_sync, sys_sync)
-
-#define __NR_acct 163
-__SYSCALL(__NR_acct, sys_acct)
-
-#define __NR_settimeofday 164
-__SYSCALL(__NR_settimeofday, sys_settimeofday)
-
-#define __NR_mount 165
-__SYSCALL(__NR_mount, sys_mount)
-#define __NR_umount2 166
-__SYSCALL(__NR_umount2, sys_umount)
-
-#define __NR_swapon 167
-__SYSCALL(__NR_swapon, sys_swapon)
-#define __NR_swapoff 168
-__SYSCALL(__NR_swapoff, sys_swapoff)
-
-#define __NR_reboot 169
-__SYSCALL(__NR_reboot, sys_reboot)
-
-#define __NR_sethostname 170
-__SYSCALL(__NR_sethostname, sys_sethostname)
-#define __NR_setdomainname 171
-__SYSCALL(__NR_setdomainname, sys_setdomainname)
-
-#define __NR_iopl 172
-__SYSCALL(__NR_iopl, stub_iopl)
-#define __NR_ioperm 173
-__SYSCALL(__NR_ioperm, sys_ioperm)
-
-#define __NR_create_module 174
-__SYSCALL(__NR_create_module, sys_ni_syscall)
-#define __NR_init_module 175
-__SYSCALL(__NR_init_module, sys_init_module)
-#define __NR_delete_module 176
-__SYSCALL(__NR_delete_module, sys_delete_module)
-#define __NR_get_kernel_syms 177
-__SYSCALL(__NR_get_kernel_syms, sys_ni_syscall)
-#define __NR_query_module 178
-__SYSCALL(__NR_query_module, sys_ni_syscall)
-
-#define __NR_quotactl 179
-__SYSCALL(__NR_quotactl, sys_quotactl)
-
-#define __NR_nfsservctl 180
-__SYSCALL(__NR_nfsservctl, sys_nfsservctl)
-
-/* reserved for LiS/STREAMS */
-#define __NR_getpmsg 181
-__SYSCALL(__NR_getpmsg, sys_ni_syscall)
-#define __NR_putpmsg 182
-__SYSCALL(__NR_putpmsg, sys_ni_syscall)
-
-/* reserved for AFS */
-#define __NR_afs_syscall 183
-__SYSCALL(__NR_afs_syscall, sys_ni_syscall)
-
-/* reserved for tux */
-#define __NR_tuxcall 184
-__SYSCALL(__NR_tuxcall, sys_ni_syscall)
-
-#define __NR_security 185
-__SYSCALL(__NR_security, sys_ni_syscall)
-
-#define __NR_gettid 186
-__SYSCALL(__NR_gettid, sys_gettid)
-
-#define __NR_readahead 187
-__SYSCALL(__NR_readahead, sys_readahead)
-#define __NR_setxattr 188
-__SYSCALL(__NR_setxattr, sys_setxattr)
-#define __NR_lsetxattr 189
-__SYSCALL(__NR_lsetxattr, sys_lsetxattr)
-#define __NR_fsetxattr 190
-__SYSCALL(__NR_fsetxattr, sys_fsetxattr)
-#define __NR_getxattr 191
-__SYSCALL(__NR_getxattr, sys_getxattr)
-#define __NR_lgetxattr 192
-__SYSCALL(__NR_lgetxattr, sys_lgetxattr)
-#define __NR_fgetxattr 193
-__SYSCALL(__NR_fgetxattr, sys_fgetxattr)
-#define __NR_listxattr 194
-__SYSCALL(__NR_listxattr, sys_listxattr)
-#define __NR_llistxattr 195
-__SYSCALL(__NR_llistxattr, sys_llistxattr)
-#define __NR_flistxattr 196
-__SYSCALL(__NR_flistxattr, sys_flistxattr)
-#define __NR_removexattr 197
-__SYSCALL(__NR_removexattr, sys_removexattr)
-#define __NR_lremovexattr 198
-__SYSCALL(__NR_lremovexattr, sys_lremovexattr)
-#define __NR_fremovexattr 199
-__SYSCALL(__NR_fremovexattr, sys_fremovexattr)
-#define __NR_tkill 200
-__SYSCALL(__NR_tkill, sys_tkill)
-#define __NR_time 201
-__SYSCALL(__NR_time, sys_time)
-#define __NR_futex 202
-__SYSCALL(__NR_futex, sys_futex)
-#define __NR_sched_setaffinity 203
-__SYSCALL(__NR_sched_setaffinity, sys_sched_setaffinity)
-#define __NR_sched_getaffinity 204
-__SYSCALL(__NR_sched_getaffinity, sys_sched_getaffinity)
-#define __NR_set_thread_area 205
-__SYSCALL(__NR_set_thread_area, sys_ni_syscall) /* use arch_prctl */
-#define __NR_io_setup 206
-__SYSCALL(__NR_io_setup, sys_io_setup)
-#define __NR_io_destroy 207
-__SYSCALL(__NR_io_destroy, sys_io_destroy)
-#define __NR_io_getevents 208
-__SYSCALL(__NR_io_getevents, sys_io_getevents)
-#define __NR_io_submit 209
-__SYSCALL(__NR_io_submit, sys_io_submit)
-#define __NR_io_cancel 210
-__SYSCALL(__NR_io_cancel, sys_io_cancel)
-#define __NR_get_thread_area 211
-__SYSCALL(__NR_get_thread_area, sys_ni_syscall) /* use arch_prctl */
-#define __NR_lookup_dcookie 212
-__SYSCALL(__NR_lookup_dcookie, sys_lookup_dcookie)
-#define __NR_epoll_create 213
-__SYSCALL(__NR_epoll_create, sys_epoll_create)
-#define __NR_epoll_ctl_old 214
-__SYSCALL(__NR_epoll_ctl_old, sys_ni_syscall)
-#define __NR_epoll_wait_old 215
-__SYSCALL(__NR_epoll_wait_old, sys_ni_syscall)
-#define __NR_remap_file_pages 216
-__SYSCALL(__NR_remap_file_pages, sys_remap_file_pages)
-#define __NR_getdents64 217
-__SYSCALL(__NR_getdents64, sys_getdents64)
-#define __NR_set_tid_address 218
-__SYSCALL(__NR_set_tid_address, sys_set_tid_address)
-#define __NR_restart_syscall 219
-__SYSCALL(__NR_restart_syscall, sys_restart_syscall)
-#define __NR_semtimedop 220
-__SYSCALL(__NR_semtimedop, sys_semtimedop)
-#define __NR_fadvise64 221
-__SYSCALL(__NR_fadvise64, sys_fadvise64)
-#define __NR_timer_create 222
-__SYSCALL(__NR_timer_create, sys_timer_create)
-#define __NR_timer_settime 223
-__SYSCALL(__NR_timer_settime, sys_timer_settime)
-#define __NR_timer_gettime 224
-__SYSCALL(__NR_timer_gettime, sys_timer_gettime)
-#define __NR_timer_getoverrun 225
-__SYSCALL(__NR_timer_getoverrun, sys_timer_getoverrun)
-#define __NR_timer_delete 226
-__SYSCALL(__NR_timer_delete, sys_timer_delete)
-#define __NR_clock_settime 227
-__SYSCALL(__NR_clock_settime, sys_clock_settime)
-#define __NR_clock_gettime 228
-__SYSCALL(__NR_clock_gettime, sys_clock_gettime)
-#define __NR_clock_getres 229
-__SYSCALL(__NR_clock_getres, sys_clock_getres)
-#define __NR_clock_nanosleep 230
-__SYSCALL(__NR_clock_nanosleep, sys_clock_nanosleep)
-#define __NR_exit_group 231
-__SYSCALL(__NR_exit_group, sys_exit_group)
-#define __NR_epoll_wait 232
-__SYSCALL(__NR_epoll_wait, sys_epoll_wait)
-#define __NR_epoll_ctl 233
-__SYSCALL(__NR_epoll_ctl, sys_epoll_ctl)
-#define __NR_tgkill 234
-__SYSCALL(__NR_tgkill, sys_tgkill)
-#define __NR_utimes 235
-__SYSCALL(__NR_utimes, sys_utimes)
-#define __NR_vserver 236
-__SYSCALL(__NR_vserver, sys_ni_syscall)
-#define __NR_mbind 237
-__SYSCALL(__NR_mbind, sys_mbind)
-#define __NR_set_mempolicy 238
-__SYSCALL(__NR_set_mempolicy, sys_set_mempolicy)
-#define __NR_get_mempolicy 239
-__SYSCALL(__NR_get_mempolicy, sys_get_mempolicy)
-#define __NR_mq_open 240
-__SYSCALL(__NR_mq_open, sys_mq_open)
-#define __NR_mq_unlink 241
-__SYSCALL(__NR_mq_unlink, sys_mq_unlink)
-#define __NR_mq_timedsend 242
-__SYSCALL(__NR_mq_timedsend, sys_mq_timedsend)
-#define __NR_mq_timedreceive 243
-__SYSCALL(__NR_mq_timedreceive, sys_mq_timedreceive)
-#define __NR_mq_notify 244
-__SYSCALL(__NR_mq_notify, sys_mq_notify)
-#define __NR_mq_getsetattr 245
-__SYSCALL(__NR_mq_getsetattr, sys_mq_getsetattr)
-#define __NR_kexec_load 246
-__SYSCALL(__NR_kexec_load, sys_kexec_load)
-#define __NR_waitid 247
-__SYSCALL(__NR_waitid, sys_waitid)
-#define __NR_add_key 248
-__SYSCALL(__NR_add_key, sys_add_key)
-#define __NR_request_key 249
-__SYSCALL(__NR_request_key, sys_request_key)
-#define __NR_keyctl 250
-__SYSCALL(__NR_keyctl, sys_keyctl)
-#define __NR_ioprio_set 251
-__SYSCALL(__NR_ioprio_set, sys_ioprio_set)
-#define __NR_ioprio_get 252
-__SYSCALL(__NR_ioprio_get, sys_ioprio_get)
-#define __NR_inotify_init 253
-__SYSCALL(__NR_inotify_init, sys_inotify_init)
-#define __NR_inotify_add_watch 254
-__SYSCALL(__NR_inotify_add_watch, sys_inotify_add_watch)
-#define __NR_inotify_rm_watch 255
-__SYSCALL(__NR_inotify_rm_watch, sys_inotify_rm_watch)
-#define __NR_migrate_pages 256
-__SYSCALL(__NR_migrate_pages, sys_migrate_pages)
-#define __NR_openat 257
-__SYSCALL(__NR_openat, sys_openat)
-#define __NR_mkdirat 258
-__SYSCALL(__NR_mkdirat, sys_mkdirat)
-#define __NR_mknodat 259
-__SYSCALL(__NR_mknodat, sys_mknodat)
-#define __NR_fchownat 260
-__SYSCALL(__NR_fchownat, sys_fchownat)
-#define __NR_futimesat 261
-__SYSCALL(__NR_futimesat, sys_futimesat)
-#define __NR_newfstatat 262
-__SYSCALL(__NR_newfstatat, sys_newfstatat)
-#define __NR_unlinkat 263
-__SYSCALL(__NR_unlinkat, sys_unlinkat)
-#define __NR_renameat 264
-__SYSCALL(__NR_renameat, sys_renameat)
-#define __NR_linkat 265
-__SYSCALL(__NR_linkat, sys_linkat)
-#define __NR_symlinkat 266
-__SYSCALL(__NR_symlinkat, sys_symlinkat)
-#define __NR_readlinkat 267
-__SYSCALL(__NR_readlinkat, sys_readlinkat)
-#define __NR_fchmodat 268
-__SYSCALL(__NR_fchmodat, sys_fchmodat)
-#define __NR_faccessat 269
-__SYSCALL(__NR_faccessat, sys_faccessat)
-#define __NR_pselect6 270
-__SYSCALL(__NR_pselect6, sys_pselect6)
-#define __NR_ppoll 271
-__SYSCALL(__NR_ppoll, sys_ppoll)
-#define __NR_unshare 272
-__SYSCALL(__NR_unshare, sys_unshare)
-#define __NR_set_robust_list 273
-__SYSCALL(__NR_set_robust_list, sys_set_robust_list)
-#define __NR_get_robust_list 274
-__SYSCALL(__NR_get_robust_list, sys_get_robust_list)
-#define __NR_splice 275
-__SYSCALL(__NR_splice, sys_splice)
-#define __NR_tee 276
-__SYSCALL(__NR_tee, sys_tee)
-#define __NR_sync_file_range 277
-__SYSCALL(__NR_sync_file_range, sys_sync_file_range)
-#define __NR_vmsplice 278
-__SYSCALL(__NR_vmsplice, sys_vmsplice)
-#define __NR_move_pages 279
-__SYSCALL(__NR_move_pages, sys_move_pages)
-#define __NR_utimensat 280
-__SYSCALL(__NR_utimensat, sys_utimensat)
-#define __IGNORE_getcpu /* implemented as a vsyscall */
-#define __NR_epoll_pwait 281
-__SYSCALL(__NR_epoll_pwait, sys_epoll_pwait)
-#define __NR_signalfd 282
-__SYSCALL(__NR_signalfd, sys_signalfd)
-#define __NR_timerfd_create 283
-__SYSCALL(__NR_timerfd_create, sys_timerfd_create)
-#define __NR_eventfd 284
-__SYSCALL(__NR_eventfd, sys_eventfd)
-#define __NR_fallocate 285
-__SYSCALL(__NR_fallocate, sys_fallocate)
-#define __NR_timerfd_settime 286
-__SYSCALL(__NR_timerfd_settime, sys_timerfd_settime)
-#define __NR_timerfd_gettime 287
-__SYSCALL(__NR_timerfd_gettime, sys_timerfd_gettime)
-#define __NR_accept4 288
-__SYSCALL(__NR_accept4, sys_accept4)
-#define __NR_signalfd4 289
-__SYSCALL(__NR_signalfd4, sys_signalfd4)
-#define __NR_eventfd2 290
-__SYSCALL(__NR_eventfd2, sys_eventfd2)
-#define __NR_epoll_create1 291
-__SYSCALL(__NR_epoll_create1, sys_epoll_create1)
-#define __NR_dup3 292
-__SYSCALL(__NR_dup3, sys_dup3)
-#define __NR_pipe2 293
-__SYSCALL(__NR_pipe2, sys_pipe2)
-#define __NR_inotify_init1 294
-__SYSCALL(__NR_inotify_init1, sys_inotify_init1)
-#define __NR_preadv 295
-__SYSCALL(__NR_preadv, sys_preadv)
-#define __NR_pwritev 296
-__SYSCALL(__NR_pwritev, sys_pwritev)
-#define __NR_rt_tgsigqueueinfo 297
-__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
-#define __NR_perf_counter_open 298
-__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)
-
-#ifndef __NO_STUBS
-#define __ARCH_WANT_OLD_READDIR
-#define __ARCH_WANT_OLD_STAT
-#define __ARCH_WANT_SYS_ALARM
-#define __ARCH_WANT_SYS_GETHOSTNAME
-#define __ARCH_WANT_SYS_PAUSE
-#define __ARCH_WANT_SYS_SGETMASK
-#define __ARCH_WANT_SYS_SIGNAL
-#define __ARCH_WANT_SYS_UTIME
-#define __ARCH_WANT_SYS_WAITPID
-#define __ARCH_WANT_SYS_SOCKETCALL
-#define __ARCH_WANT_SYS_FADVISE64
-#define __ARCH_WANT_SYS_GETPGRP
-#define __ARCH_WANT_SYS_LLSEEK
-#define __ARCH_WANT_SYS_NICE
-#define __ARCH_WANT_SYS_OLD_GETRLIMIT
-#define __ARCH_WANT_SYS_OLDUMOUNT
-#define __ARCH_WANT_SYS_SIGPENDING
-#define __ARCH_WANT_SYS_SIGPROCMASK
-#define __ARCH_WANT_SYS_RT_SIGACTION
-#define __ARCH_WANT_SYS_RT_SIGSUSPEND
-#define __ARCH_WANT_SYS_TIME
-#define __ARCH_WANT_COMPAT_SYS_TIME
-#endif /* __NO_STUBS */
-
-#ifdef __KERNEL__
-/*
- * "Conditional" syscalls
- *
- * What we want is __attribute__((weak,alias("sys_ni_syscall"))),
- * but it doesn't work on all toolchains, so we just do it by hand
- */
-#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall")
-#endif /* __KERNEL__ */
-
-#endif /* _ASM_X86_UNISTD_64_H */
diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
new file mode 100644
index 000000000000..7cede4dc21f0
--- /dev/null
+++ b/arch/x86/include/asm/unwind.h
@@ -0,0 +1,154 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_UNWIND_H
+#define _ASM_X86_UNWIND_H
+
+#include <linux/sched.h>
+#include <linux/ftrace.h>
+#include <linux/rethook.h>
+#include <asm/ptrace.h>
+#include <asm/stacktrace.h>
+
+#define IRET_FRAME_OFFSET (offsetof(struct pt_regs, ip))
+#define IRET_FRAME_SIZE (sizeof(struct pt_regs) - IRET_FRAME_OFFSET)
+
+struct unwind_state {
+ struct stack_info stack_info;
+ unsigned long stack_mask;
+ struct task_struct *task;
+ int graph_idx;
+#if defined(CONFIG_RETHOOK)
+ struct llist_node *kr_cur;
+#endif
+ bool error;
+#if defined(CONFIG_UNWINDER_ORC)
+ bool signal, full_regs;
+ unsigned long sp, bp, ip;
+ struct pt_regs *regs, *prev_regs;
+#elif defined(CONFIG_UNWINDER_FRAME_POINTER)
+ bool got_irq;
+ unsigned long *bp, *orig_sp, ip;
+ /*
+ * If non-NULL: The current frame is incomplete and doesn't contain a
+ * valid BP. When looking for the next frame, use this instead of the
+ * non-existent saved BP.
+ */
+ unsigned long *next_bp;
+ struct pt_regs *regs;
+#else
+ unsigned long *sp;
+#endif
+};
+
+void __unwind_start(struct unwind_state *state, struct task_struct *task,
+ struct pt_regs *regs, unsigned long *first_frame);
+bool unwind_next_frame(struct unwind_state *state);
+unsigned long unwind_get_return_address(struct unwind_state *state);
+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state);
+
+static inline bool unwind_done(struct unwind_state *state)
+{
+ return state->stack_info.type == STACK_TYPE_UNKNOWN;
+}
+
+static inline bool unwind_error(struct unwind_state *state)
+{
+ return state->error;
+}
+
+static inline
+void unwind_start(struct unwind_state *state, struct task_struct *task,
+ struct pt_regs *regs, unsigned long *first_frame)
+{
+ first_frame = first_frame ? : get_stack_pointer(task, regs);
+
+ __unwind_start(state, task, regs, first_frame);
+}
+
+#if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER)
+/*
+ * If 'partial' returns true, only the iret frame registers are valid.
+ */
+static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state,
+ bool *partial)
+{
+ if (unwind_done(state))
+ return NULL;
+
+ if (partial) {
+#ifdef CONFIG_UNWINDER_ORC
+ *partial = !state->full_regs;
+#else
+ *partial = false;
+#endif
+ }
+
+ return state->regs;
+}
+#else
+static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state,
+ bool *partial)
+{
+ return NULL;
+}
+#endif
+
+#ifdef CONFIG_UNWINDER_ORC
+void unwind_init(void);
+void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size,
+ void *orc, size_t orc_size);
+#else
+static inline void unwind_init(void) {}
+static inline
+void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size,
+ void *orc, size_t orc_size) {}
+#endif
+
+static inline
+unsigned long unwind_recover_rethook(struct unwind_state *state,
+ unsigned long addr, unsigned long *addr_p)
+{
+#ifdef CONFIG_RETHOOK
+ if (is_rethook_trampoline(addr))
+ return rethook_find_ret_addr(state->task, (unsigned long)addr_p,
+ &state->kr_cur);
+#endif
+ return addr;
+}
+
+/* Recover the return address modified by rethook and ftrace_graph. */
+static inline
+unsigned long unwind_recover_ret_addr(struct unwind_state *state,
+ unsigned long addr, unsigned long *addr_p)
+{
+ unsigned long ret;
+
+ ret = ftrace_graph_ret_addr(state->task, &state->graph_idx,
+ addr, addr_p);
+ return unwind_recover_rethook(state, ret, addr_p);
+}
+
+/*
+ * This disables KASAN checking when reading a value from another task's stack,
+ * since the other task could be running on another CPU and could have poisoned
+ * the stack in the meantime.
+ */
+#define READ_ONCE_TASK_STACK(task, x) \
+({ \
+ unsigned long val; \
+ if (task == current) \
+ val = READ_ONCE(x); \
+ else \
+ val = READ_ONCE_NOCHECK(x); \
+ val; \
+})
+
+static inline bool task_on_another_cpu(struct task_struct *task)
+{
+#ifdef CONFIG_SMP
+ return task != current && task->on_cpu;
+#else
+ return false;
+#endif
+}
+
+#endif /* _ASM_X86_UNWIND_H */
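
Illustrative sketch (not part of the patch): how a caller typically drives the unwind API declared in unwind.h above. Only the unwind_*() calls come from the header; the wrapper function name and the consume callback are assumptions used for illustration.

static void example_walk_stack(struct task_struct *task, struct pt_regs *regs,
			       bool (*consume)(unsigned long addr, void *cookie),
			       void *cookie)
{
	struct unwind_state state;
	unsigned long addr;

	/* A NULL first_frame makes unwind_start() fall back to the stack pointer. */
	for (unwind_start(&state, task, regs, NULL); !unwind_done(&state);
	     unwind_next_frame(&state)) {
		addr = unwind_get_return_address(&state);
		if (!addr || !consume(addr, cookie))
			break;
	}
}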
diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h
new file mode 100644
index 000000000000..8f4579c5a6f8
--- /dev/null
+++ b/arch/x86/include/asm/unwind_hints.h
@@ -0,0 +1,93 @@
+#ifndef _ASM_X86_UNWIND_HINTS_H
+#define _ASM_X86_UNWIND_HINTS_H
+
+#include <linux/objtool.h>
+
+#include "orc_types.h"
+
+#ifdef __ASSEMBLER__
+
+.macro UNWIND_HINT_END_OF_STACK
+ UNWIND_HINT type=UNWIND_HINT_TYPE_END_OF_STACK
+.endm
+
+.macro UNWIND_HINT_UNDEFINED
+ UNWIND_HINT type=UNWIND_HINT_TYPE_UNDEFINED
+.endm
+
+.macro UNWIND_HINT_ENTRY
+ VALIDATE_UNRET_BEGIN
+ UNWIND_HINT_END_OF_STACK
+.endm
+
+.macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 partial=0 signal=1
+ .if \base == %rsp
+ .if \indirect
+ .set sp_reg, ORC_REG_SP_INDIRECT
+ .else
+ .set sp_reg, ORC_REG_SP
+ .endif
+ .elseif \base == %rbp
+ .set sp_reg, ORC_REG_BP
+ .elseif \base == %rdi
+ .set sp_reg, ORC_REG_DI
+ .elseif \base == %rdx
+ .set sp_reg, ORC_REG_DX
+ .elseif \base == %r10
+ .set sp_reg, ORC_REG_R10
+ .else
+ .error "UNWIND_HINT_REGS: bad base register"
+ .endif
+
+ .set sp_offset, \offset
+
+ .if \partial
+ .set type, UNWIND_HINT_TYPE_REGS_PARTIAL
+ .elseif \extra == 0
+ .set type, UNWIND_HINT_TYPE_REGS_PARTIAL
+ .set sp_offset, \offset + (16*8)
+ .else
+ .set type, UNWIND_HINT_TYPE_REGS
+ .endif
+
+ UNWIND_HINT sp_reg=sp_reg sp_offset=sp_offset type=type signal=\signal
+.endm
+
+.macro UNWIND_HINT_IRET_REGS base=%rsp offset=0 signal=1
+ UNWIND_HINT_REGS base=\base offset=\offset partial=1 signal=\signal
+.endm
+
+.macro UNWIND_HINT_IRET_ENTRY base=%rsp offset=0 signal=1
+ VALIDATE_UNRET_BEGIN
+ UNWIND_HINT_IRET_REGS base=\base offset=\offset signal=\signal
+.endm
+
+.macro UNWIND_HINT_FUNC
+ UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=8 type=UNWIND_HINT_TYPE_FUNC
+.endm
+
+.macro UNWIND_HINT_SAVE
+ UNWIND_HINT type=UNWIND_HINT_TYPE_SAVE
+.endm
+
+.macro UNWIND_HINT_RESTORE
+ UNWIND_HINT type=UNWIND_HINT_TYPE_RESTORE
+.endm
+
+#else
+
+#define UNWIND_HINT_UNDEFINED \
+ UNWIND_HINT(UNWIND_HINT_TYPE_UNDEFINED, 0, 0, 0)
+
+#define UNWIND_HINT_FUNC \
+ UNWIND_HINT(UNWIND_HINT_TYPE_FUNC, ORC_REG_SP, 8, 0)
+
+#define UNWIND_HINT_SAVE \
+ UNWIND_HINT(UNWIND_HINT_TYPE_SAVE, 0, 0, 0)
+
+#define UNWIND_HINT_RESTORE \
+ UNWIND_HINT(UNWIND_HINT_TYPE_RESTORE, 0, 0, 0)
+
+#endif /* __ASSEMBLER__ */
+
+#endif /* _ASM_X86_UNWIND_HINTS_H */
diff --git a/arch/x86/include/asm/unwind_user.h b/arch/x86/include/asm/unwind_user.h
new file mode 100644
index 000000000000..12064284bc4e
--- /dev/null
+++ b/arch/x86/include/asm/unwind_user.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_UNWIND_USER_H
+#define _ASM_X86_UNWIND_USER_H
+
+#ifdef CONFIG_HAVE_UNWIND_USER_FP
+
+#include <asm/ptrace.h>
+#include <asm/uprobes.h>
+
+#define ARCH_INIT_USER_FP_FRAME(ws) \
+ .cfa_off = 2*(ws), \
+ .ra_off = -1*(ws), \
+ .fp_off = -2*(ws), \
+ .use_fp = true,
+
+#define ARCH_INIT_USER_FP_ENTRY_FRAME(ws) \
+ .cfa_off = 1*(ws), \
+ .ra_off = -1*(ws), \
+ .fp_off = 0, \
+ .use_fp = false,
+
+static inline int unwind_user_word_size(struct pt_regs *regs)
+{
+ /* We can't unwind VM86 stacks */
+ if (regs->flags & X86_VM_MASK)
+ return 0;
+#ifdef CONFIG_X86_64
+ if (!user_64bit_mode(regs))
+ return sizeof(int);
+#endif
+ return sizeof(long);
+}
+
+static inline bool unwind_user_at_function_start(struct pt_regs *regs)
+{
+ return is_uprobe_at_func_entry(regs);
+}
+
+#endif /* CONFIG_HAVE_UNWIND_USER_FP */
+
+#endif /* _ASM_X86_UNWIND_USER_H */
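
A minimal sketch of how the ARCH_INIT_USER_FP_FRAME() offsets above might be consumed by a generic frame-pointer unwinder. struct unwind_user_frame, example_fp_frame(), and the field meanings noted in the comments are the usual FP-unwind interpretation and are assumptions, not taken from this patch.

struct unwind_user_frame {
	int cfa_off;	/* next CFA, relative to the current frame pointer */
	int ra_off;	/* return address, relative to the CFA */
	int fp_off;	/* saved frame pointer, relative to the CFA */
	bool use_fp;	/* walk via the frame pointer rather than the stack pointer */
};

static struct unwind_user_frame example_fp_frame(struct pt_regs *regs)
{
	int ws = unwind_user_word_size(regs);	/* word size of the task, see header above */
	struct unwind_user_frame frame = { ARCH_INIT_USER_FP_FRAME(ws) };

	return frame;
}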
diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h
new file mode 100644
index 000000000000..362210c79998
--- /dev/null
+++ b/arch/x86/include/asm/uprobes.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _ASM_UPROBES_H
+#define _ASM_UPROBES_H
+/*
+ * User-space Probes (UProbes) for x86
+ *
+ * Copyright (C) IBM Corporation, 2008-2011
+ * Authors:
+ * Srikar Dronamraju
+ * Jim Keniston
+ */
+
+#include <linux/notifier.h>
+
+typedef u8 uprobe_opcode_t;
+
+#define MAX_UINSN_BYTES 16
+#define UPROBE_XOL_SLOT_BYTES 128 /* to keep it cache aligned */
+
+#define UPROBE_SWBP_INSN 0xcc
+#define UPROBE_SWBP_INSN_SIZE 1
+
+enum {
+ ARCH_UPROBE_FLAG_CAN_OPTIMIZE = 0,
+ ARCH_UPROBE_FLAG_OPTIMIZE_FAIL = 1,
+};
+
+struct uprobe_xol_ops;
+
+struct arch_uprobe {
+ union {
+ u8 insn[MAX_UINSN_BYTES];
+ u8 ixol[MAX_UINSN_BYTES];
+ };
+
+ const struct uprobe_xol_ops *ops;
+
+ union {
+ struct {
+ s32 offs;
+ u8 ilen;
+ u8 opc1;
+ } branch;
+ struct {
+ u8 fixups;
+ u8 ilen;
+ } defparam;
+ struct {
+ u8 reg_offset; /* to the start of pt_regs */
+ u8 ilen;
+ } push;
+ };
+
+ unsigned long flags;
+};
+
+struct arch_uprobe_task {
+#ifdef CONFIG_X86_64
+ unsigned long saved_scratch_register;
+#endif
+ unsigned int saved_trap_nr;
+ unsigned int saved_tf;
+};
+
+#ifdef CONFIG_UPROBES
+extern bool is_uprobe_at_func_entry(struct pt_regs *regs);
+#else
+static inline bool is_uprobe_at_func_entry(struct pt_regs *regs)
+{
+ return false;
+}
+#endif /* CONFIG_UPROBES */
+
+#endif /* _ASM_UPROBES_H */
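
Hedged sketch of what the UPROBE_SWBP_INSN constants above describe: planting the one-byte int3 breakpoint while keeping a copy of the original instruction for out-of-line single-stepping. The helper name and the direct memcpy writes are illustrative only; the real write path goes through the generic uprobes code.

static void example_plant_breakpoint(struct arch_uprobe *auprobe, void *copy)
{
	uprobe_opcode_t bkpt = UPROBE_SWBP_INSN;	/* 0xcc == int3 */

	/* Keep the original bytes for single-step out-of-line execution. */
	memcpy(auprobe->ixol, copy, MAX_UINSN_BYTES);
	/* Replace the first byte of the copy with the breakpoint opcode. */
	memcpy(copy, &bkpt, UPROBE_SWBP_INSN_SIZE);
}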
diff --git a/arch/x86/include/asm/user.h b/arch/x86/include/asm/user.h
index 999873b22e7f..413c91746b27 100644
--- a/arch/x86/include/asm/user.h
+++ b/arch/x86/include/asm/user.h
@@ -1,5 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_USER_H
+#define _ASM_X86_USER_H
+
#ifdef CONFIG_X86_32
-# include "user_32.h"
+# include <asm/user_32.h>
#else
-# include "user_64.h"
+# include <asm/user_64.h>
#endif
+
+#include <asm/types.h>
+
+struct user_ymmh_regs {
+ /* 16 * 16 bytes for each YMMH-reg */
+ __u32 ymmh_space[64];
+};
+
+struct user_xstate_header {
+ __u64 xfeatures;
+ __u64 reserved1[2];
+ __u64 reserved2[5];
+};
+
+/*
+ * The structure layout of user_xstateregs, used for exporting the
+ * extended register state through ptrace and core-dump (NT_X86_XSTATE note)
+ * interfaces will be the same as the memory layout of xsave used by the processor
+ * (except for the bytes 464..511, which can be used by the software) and hence
+ * the size of this structure varies depending on the features supported by the
+ * processor and OS. The size of the structure that users need to use can be
+ * obtained by doing:
+ * cpuid_count(0xd, 0, &eax, &ptrace_xstateregs_struct_size, &ecx, &edx);
+ * i.e., cpuid.(eax=0xd,ecx=0).ebx will be the size that users (debuggers, etc.)
+ * need to use.
+ *
+ * For now, only the first 8 bytes of the software usable bytes[464..471] will
+ * be used and will be set to the OS-enabled xstate mask (which is the same as
+ * the 64-bit mask returned by xgetbv's XCR0). Users (analyzing a core dump
+ * remotely, etc.) can use this mask as well as the mask saved in the
+ * xstate_hdr bytes and interpret what states the processor/OS supports
+ * and what states are in modified/initialized conditions for the
+ * particular process/thread.
+ *
+ * Also when the user modifies certain state FP/SSE/etc through the
+ * ptrace interface, they must ensure that the header.xfeatures
+ * bytes[512..519] of the memory layout are updated correspondingly.
+ * i.e., for example when FP state is modified to a non-init state,
+ * header.xfeatures's bit 0 must be set to '1', when SSE is modified to
+ * non-init state, header.xfeatures's bit 1 must be set to '1', etc.
+ */
+#define USER_XSTATE_FX_SW_WORDS 6
+#define USER_XSTATE_XCR0_WORD 0
+
+struct user_xstateregs {
+ struct {
+ __u64 fpx_space[58];
+ __u64 xstate_fx_sw[USER_XSTATE_FX_SW_WORDS];
+ } i387;
+ struct user_xstate_header header;
+ struct user_ymmh_regs ymmh;
+ /* further processor state extensions go here */
+};
+
+#endif /* _ASM_X86_USER_H */
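
The comment block above tells debuggers to size the NT_X86_XSTATE note from CPUID leaf 0xD. A small user-space sketch of that query, assuming a GCC/Clang toolchain; <cpuid.h> and __get_cpuid_count() are toolchain helpers, not part of this header.

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* CPUID.(EAX=0xD, ECX=0):EBX = size of the XSAVE area for enabled features */
	if (!__get_cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx))
		return 1;
	printf("xstate buffer size: %u bytes\n", ebx);
	return 0;
}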
diff --git a/arch/x86/include/asm/user32.h b/arch/x86/include/asm/user32.h
index 14cbb73ebcba..fa577312f63a 100644
--- a/arch/x86/include/asm/user32.h
+++ b/arch/x86/include/asm/user32.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_USER32_H
#define _ASM_X86_USER32_H
diff --git a/arch/x86/include/asm/user_32.h b/arch/x86/include/asm/user_32.h
index bebfd8644016..8963915e533f 100644
--- a/arch/x86/include/asm/user_32.h
+++ b/arch/x86/include/asm/user_32.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_USER_32_H
#define _ASM_X86_USER_32_H
@@ -123,9 +124,5 @@ struct user{
char u_comm[32]; /* User command that was responsible */
int u_debugreg[8];
};
-#define NBPG PAGE_SIZE
-#define UPAGES 1
-#define HOST_TEXT_START_ADDR (u.start_code)
-#define HOST_STACK_END_ADDR (u.start_stack + u.u_ssize * NBPG)
#endif /* _ASM_X86_USER_32_H */
diff --git a/arch/x86/include/asm/user_64.h b/arch/x86/include/asm/user_64.h
index faf2cd3e0d76..1dd10f07ccd6 100644
--- a/arch/x86/include/asm/user_64.h
+++ b/arch/x86/include/asm/user_64.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_USER_64_H
#define _ASM_X86_USER_64_H
@@ -129,9 +130,5 @@ struct user {
unsigned long error_code; /* CPU error code or 0 */
unsigned long fault_address; /* CR3 or 0 */
};
-#define NBPG PAGE_SIZE
-#define UPAGES 1
-#define HOST_TEXT_START_ADDR (u.start_code)
-#define HOST_STACK_END_ADDR (u.start_stack + u.u_ssize * NBPG)
#endif /* _ASM_X86_USER_64_H */
diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h
index 7ed17ff502b9..6989b824fd32 100644
--- a/arch/x86/include/asm/uv/bios.h
+++ b/arch/x86/include/asm/uv/bios.h
@@ -1,27 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _ASM_X86_UV_BIOS_H
#define _ASM_X86_UV_BIOS_H
/*
* UV BIOS layer definitions.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
- * Copyright (c) Russ Anderson
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
+ * Copyright (C) 2007-2017 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (c) Russ Anderson <rja@sgi.com>
*/
+#include <linux/efi.h>
#include <linux/rtc.h>
/*
@@ -36,9 +25,24 @@ enum uv_bios_cmd {
UV_BIOS_WATCHLIST_ALLOC,
UV_BIOS_WATCHLIST_FREE,
UV_BIOS_MEMPROTECT,
- UV_BIOS_GET_PARTITION_ADDR
+ UV_BIOS_GET_PARTITION_ADDR,
+ UV_BIOS_SET_LEGACY_VGA_TARGET
};
+#define UV_BIOS_EXTRA 0x10000
+#define UV_BIOS_GET_PCI_TOPOLOGY 0x10001
+#define UV_BIOS_GET_GEOINFO 0x10003
+
+#define UV_BIOS_EXTRA_OP_MEM_COPYIN 0x1000
+#define UV_BIOS_EXTRA_OP_MEM_COPYOUT 0x2000
+#define UV_BIOS_EXTRA_OP_MASK 0x0fff
+#define UV_BIOS_EXTRA_GET_HEAPSIZE 1
+#define UV_BIOS_EXTRA_INSTALL_HEAP 2
+#define UV_BIOS_EXTRA_MASTER_NASID 3
+#define UV_BIOS_EXTRA_OBJECT_COUNT (10|UV_BIOS_EXTRA_OP_MEM_COPYOUT)
+#define UV_BIOS_EXTRA_ENUM_OBJECTS (12|UV_BIOS_EXTRA_OP_MEM_COPYOUT)
+#define UV_BIOS_EXTRA_ENUM_PORTS (13|UV_BIOS_EXTRA_OP_MEM_COPYOUT)
+
/*
* Status values returned from a BIOS call.
*/
@@ -47,18 +51,107 @@ enum {
BIOS_STATUS_SUCCESS = 0,
BIOS_STATUS_UNIMPLEMENTED = -ENOSYS,
BIOS_STATUS_EINVAL = -EINVAL,
- BIOS_STATUS_UNAVAIL = -EBUSY
+ BIOS_STATUS_UNAVAIL = -EBUSY,
+ BIOS_STATUS_ABORT = -EINTR,
};
+/* Address map parameters */
+struct uv_gam_parameters {
+ u64 mmr_base;
+ u64 gru_base;
+ u8 mmr_shift; /* Convert PNode to MMR space offset */
+ u8 gru_shift; /* Convert PNode to GRU space offset */
+ u8 gpa_shift; /* Size of offset field in GRU phys addr */
+ u8 unused1;
+};
+
+/* UV_TABLE_GAM_RANGE_ENTRY values */
+#define UV_GAM_RANGE_TYPE_UNUSED 0 /* End of table */
+#define UV_GAM_RANGE_TYPE_RAM 1 /* Normal RAM */
+#define UV_GAM_RANGE_TYPE_NVRAM 2 /* Non-volatile memory */
+#define UV_GAM_RANGE_TYPE_NV_WINDOW 3 /* NVMDIMM block window */
+#define UV_GAM_RANGE_TYPE_NV_MAILBOX 4 /* NVMDIMM mailbox */
+#define UV_GAM_RANGE_TYPE_HOLE 5 /* Unused address range */
+#define UV_GAM_RANGE_TYPE_MAX 6
+
+/* The structure stores PA bits 56:26, for 64MB granularity */
+#define UV_GAM_RANGE_SHFT 26 /* 64MB */
+
+struct uv_gam_range_entry {
+ char type; /* Entry type: GAM_RANGE_TYPE_UNUSED, etc. */
+ char unused1;
+ u16 nasid; /* HNasid */
+ u16 sockid; /* Socket ID, high bits of APIC ID */
+ u16 pnode; /* Index to MMR and GRU spaces */
+ u32 unused2;
+ u32 limit; /* PA bits 56:26 (UV_GAM_RANGE_SHFT) */
+};
+
+#define UV_AT_SIZE 8 /* 7 character arch type + NULL char */
+struct uv_arch_type_entry {
+ char archtype[UV_AT_SIZE];
+};
+
+#define UV_SYSTAB_SIG "UVST"
+#define UV_SYSTAB_VERSION_1 1 /* UV2/3 BIOS version */
+#define UV_SYSTAB_VERSION_UV4 0x400 /* UV4 BIOS base version */
+#define UV_SYSTAB_VERSION_UV4_1 0x401 /* + gpa_shift */
+#define UV_SYSTAB_VERSION_UV4_2 0x402 /* + TYPE_NVRAM/WINDOW/MBOX */
+#define UV_SYSTAB_VERSION_UV4_3 0x403 /* - GAM Range PXM Value */
+#define UV_SYSTAB_VERSION_UV4_LATEST UV_SYSTAB_VERSION_UV4_3
+
+#define UV_SYSTAB_VERSION_UV5 0x500 /* UV5 GAM base version */
+#define UV_SYSTAB_VERSION_UV5_LATEST UV_SYSTAB_VERSION_UV5
+
+#define UV_SYSTAB_TYPE_UNUSED 0 /* End of table (offset == 0) */
+#define UV_SYSTAB_TYPE_GAM_PARAMS 1 /* GAM PARAM conversions */
+#define UV_SYSTAB_TYPE_GAM_RNG_TBL 2 /* GAM entry table */
+#define UV_SYSTAB_TYPE_ARCH_TYPE 3 /* UV arch type */
+#define UV_SYSTAB_TYPE_MAX 4
+
/*
* The UV system table describes specific firmware
* capabilities available to the Linux kernel at runtime.
*/
struct uv_systab {
- char signature[4]; /* must be "UVST" */
+ char signature[4]; /* must be UV_SYSTAB_SIG */
u32 revision; /* distinguish different firmware revs */
- u64 function; /* BIOS runtime callback function ptr */
+ u64 (__efiapi *function)(enum uv_bios_cmd, ...);
+ /* BIOS runtime callback function ptr */
+ u32 size; /* systab size (starting with _VERSION_UV4) */
+ struct {
+ u32 type:8; /* type of entry */
+ u32 offset:24; /* byte offset from struct start to entry */
+ } entry[1]; /* additional entries follow */
};
+extern struct uv_systab *uv_systab;
+
+#define UV_BIOS_MAXSTRING 128
+struct uv_bios_hub_info {
+ unsigned int id;
+ union {
+ struct {
+ unsigned long long this_part:1;
+ unsigned long long is_shared:1;
+ unsigned long long is_disabled:1;
+ } fields;
+ struct {
+ unsigned long long flags;
+ unsigned long long reserved;
+ } b;
+ } f;
+ char name[UV_BIOS_MAXSTRING];
+ char location[UV_BIOS_MAXSTRING];
+ unsigned int ports;
+};
+
+struct uv_bios_port_info {
+ unsigned int port;
+ unsigned int conn_id;
+ unsigned int conn_port;
+};
+
+/* (... end of definitions from UV BIOS ...) */
enum {
BIOS_FREQ_BASE_PLATFORM = 0,
@@ -76,45 +169,47 @@ union partition_info_u {
};
};
-union uv_watchlist_u {
- u64 val;
- struct {
- u64 blade : 16,
- size : 32,
- filler : 16;
- };
-};
-
enum uv_memprotect {
UV_MEMPROT_RESTRICT_ACCESS,
UV_MEMPROT_ALLOW_AMO,
UV_MEMPROT_ALLOW_RW
};
-/*
- * bios calls have 6 parameters
- */
-extern s64 uv_bios_call(enum uv_bios_cmd, u64, u64, u64, u64, u64);
-extern s64 uv_bios_call_irqsave(enum uv_bios_cmd, u64, u64, u64, u64, u64);
-extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64);
-
-extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *);
+extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *, long *);
extern s64 uv_bios_freq_base(u64, u64 *);
-extern int uv_bios_mq_watchlist_alloc(int, unsigned long, unsigned int,
+extern int uv_bios_mq_watchlist_alloc(unsigned long, unsigned int,
unsigned long *);
extern int uv_bios_mq_watchlist_free(int, int);
extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect);
extern s64 uv_bios_reserved_page_pa(u64, u64 *, u64 *, u64 *);
+extern int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus);
-extern void uv_bios_init(void);
+extern s64 uv_bios_get_master_nasid(u64 sz, u64 *nasid);
+extern s64 uv_bios_get_heapsize(u64 nasid, u64 sz, u64 *heap_sz);
+extern s64 uv_bios_install_heap(u64 nasid, u64 sz, u64 *heap);
+extern s64 uv_bios_obj_count(u64 nasid, u64 sz, u64 *objcnt);
+extern s64 uv_bios_enum_objs(u64 nasid, u64 sz, u64 *objbuf);
+extern s64 uv_bios_enum_ports(u64 nasid, u64 obj_id, u64 sz, u64 *portbuf);
+extern s64 uv_bios_get_geoinfo(u64 nasid, u64 sz, u64 *geo);
+extern s64 uv_bios_get_pci_topology(u64 sz, u64 *buf);
+
+extern int uv_bios_init(void);
+extern unsigned long get_uv_systab_phys(bool msg);
extern unsigned long sn_rtc_cycles_per_second;
extern int uv_type;
extern long sn_partition_id;
extern long sn_coherency_id;
extern long sn_region_size;
-#define partition_coherence_id() (sn_coherency_id)
+extern long system_serial_number;
+extern ssize_t uv_get_archtype(char *buf, int len);
+extern int uv_get_hubless_system(void);
extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */
+/*
+ * EFI runtime lock; cf. firmware/efi/runtime-wrappers.c for details
+ */
+extern struct semaphore __efi_uv_runtime_lock;
+
#endif /* _ASM_X86_UV_BIOS_H */
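
Sketch of a typical caller of the wrappers declared above, using uv_bios_freq_base() with the BIOS_FREQ_BASE_PLATFORM selector shown in the enum; the function name and error handling are illustrative, not part of the patch.

static u64 example_platform_freq_base(void)
{
	u64 freq = 0;
	s64 status = uv_bios_freq_base(BIOS_FREQ_BASE_PLATFORM, &freq);

	if (status != BIOS_STATUS_SUCCESS)
		return 0;	/* e.g. BIOS_STATUS_UNIMPLEMENTED on non-UV firmware */
	return freq;
}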
diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h
index c0a01b5d985b..648eb23fe7f0 100644
--- a/arch/x86/include/asm/uv/uv.h
+++ b/arch/x86/include/asm/uv/uv.h
@@ -1,32 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_UV_UV_H
#define _ASM_X86_UV_UV_H
-enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
-
-struct cpumask;
-struct mm_struct;
+enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC};
#ifdef CONFIG_X86_UV
+#include <linux/efi.h>
+
+#define UV_PROC_NODE "sgi_uv"
+
+static inline int uv(int uvtype)
+{
+ /* uv(0) is "any" */
+ if (uvtype >= 0 && uvtype <= 30)
+ return 1 << uvtype;
+ return 1;
+}
+
+extern unsigned long uv_systab_phys;
extern enum uv_system_type get_uv_system_type(void);
+static inline bool is_early_uv_system(void)
+{
+ return uv_systab_phys && uv_systab_phys != EFI_INVALID_TABLE_ADDR;
+}
extern int is_uv_system(void);
+extern int is_uv_hubbed(int uvtype);
extern void uv_cpu_init(void);
+extern void uv_nmi_init(void);
extern void uv_system_init(void);
-extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
- struct mm_struct *mm,
- unsigned long va,
- unsigned int cpu);
-#else /* X86_UV */
+#else /* !X86_UV */
static inline enum uv_system_type get_uv_system_type(void) { return UV_NONE; }
+static inline bool is_early_uv_system(void) { return 0; }
static inline int is_uv_system(void) { return 0; }
+static inline int is_uv_hubbed(int uv) { return 0; }
static inline void uv_cpu_init(void) { }
static inline void uv_system_init(void) { }
-static inline const struct cpumask *
-uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm,
- unsigned long va, unsigned int cpu)
-{ return cpumask; }
#endif /* X86_UV */
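
A minimal sketch of how code typically gates UV-specific setup on the predicates above; the init function and the chosen calls are illustrative only. Note that is_uv_system() is a stub returning 0 when CONFIG_X86_UV is not set, so the whole body falls away there.

static __init int example_uv_init(void)
{
	if (!is_uv_system())
		return -ENODEV;

	uv_nmi_init();
	uv_cpu_init();
	return 0;
}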
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
deleted file mode 100644
index bddd44f2f0ab..000000000000
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ /dev/null
@@ -1,331 +0,0 @@
-/*
- * This file is subject to the terms and conditions of the GNU General Public
- * License. See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * SGI UV Broadcast Assist Unit definitions
- *
- * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved.
- */
-
-#ifndef _ASM_X86_UV_UV_BAU_H
-#define _ASM_X86_UV_UV_BAU_H
-
-#include <linux/bitmap.h>
-#define BITSPERBYTE 8
-
-/*
- * Broadcast Assist Unit messaging structures
- *
- * Selective Broadcast activations are induced by software action
- * specifying a particular 8-descriptor "set" via a 6-bit index written
- * to an MMR.
- * Thus there are 64 unique 512-byte sets of SB descriptors - one set for
- * each 6-bit index value. These descriptor sets are mapped in sequence
- * starting with set 0 located at the address specified in the
- * BAU_SB_DESCRIPTOR_BASE register, set 1 is located at BASE + 512,
- * set 2 is at BASE + 2*512, set 3 at BASE + 3*512, and so on.
- *
- * We will use 31 sets, one for sending BAU messages from each of the 32
- * cpu's on the node.
- *
- * TLB shootdown will use the first of the 8 descriptors of each set.
- * Each of the descriptors is 64 bytes in size (8*64 = 512 bytes in a set).
- */
-
-#define UV_ITEMS_PER_DESCRIPTOR 8
-#define UV_CPUS_PER_ACT_STATUS 32
-#define UV_ACT_STATUS_MASK 0x3
-#define UV_ACT_STATUS_SIZE 2
-#define UV_ADP_SIZE 32
-#define UV_DISTRIBUTION_SIZE 256
-#define UV_SW_ACK_NPENDING 8
-#define UV_NET_ENDPOINT_INTD 0x38
-#define UV_DESC_BASE_PNODE_SHIFT 49
-#define UV_PAYLOADQ_PNODE_SHIFT 49
-#define UV_PTC_BASENAME "sgi_uv/ptc_statistics"
-#define uv_physnodeaddr(x) ((__pa((unsigned long)(x)) & uv_mmask))
-
-/*
- * bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1
- */
-#define DESC_STATUS_IDLE 0
-#define DESC_STATUS_ACTIVE 1
-#define DESC_STATUS_DESTINATION_TIMEOUT 2
-#define DESC_STATUS_SOURCE_TIMEOUT 3
-
-/*
- * source side thresholds at which message retries print a warning
- */
-#define SOURCE_TIMEOUT_LIMIT 20
-#define DESTINATION_TIMEOUT_LIMIT 20
-
-/*
- * number of entries in the destination side payload queue
- */
-#define DEST_Q_SIZE 17
-/*
- * number of destination side software ack resources
- */
-#define DEST_NUM_RESOURCES 8
-#define MAX_CPUS_PER_NODE 32
-/*
- * completion statuses for sending a TLB flush message
- */
-#define FLUSH_RETRY 1
-#define FLUSH_GIVEUP 2
-#define FLUSH_COMPLETE 3
-
-/*
- * Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor)
- * If the 'multilevel' flag in the header portion of the descriptor
- * has been set to 0, then endpoint multi-unicast mode is selected.
- * The distribution specification (32 bytes) is interpreted as a 256-bit
- * distribution vector. Adjacent bits correspond to consecutive even numbered
- * nodeIDs. The result of adding the index of a given bit to the 15-bit
- * 'base_dest_nodeid' field of the header corresponds to the
- * destination nodeID associated with that specified bit.
- */
-struct bau_target_nodemask {
- unsigned long bits[BITS_TO_LONGS(256)];
-};
-
-/*
- * mask of cpu's on a node
- * (during initialization we need to check that unsigned long has
- * enough bits for max. cpu's per node)
- */
-struct bau_local_cpumask {
- unsigned long bits;
-};
-
-/*
- * Payload: 16 bytes (128 bits) (bytes 0x20-0x2f of descriptor)
- * only 12 bytes (96 bits) of the payload area are usable.
- * An additional 3 bytes (bits 27:4) of the header address are carried
- * to the next bytes of the destination payload queue.
- * And an additional 2 bytes of the header Suppl_A field are also
- * carried to the destination payload queue.
- * But the first byte of the Suppl_A becomes bits 127:120 (the 16th byte)
- * of the destination payload queue, which is written by the hardware
- * with the s/w ack resource bit vector.
- * [ effective message contents (16 bytes (128 bits) maximum), not counting
- * the s/w ack bit vector ]
- */
-
-/*
- * The payload is software-defined for INTD transactions
- */
-struct bau_msg_payload {
- unsigned long address; /* signifies a page or all TLB's
- of the cpu */
- /* 64 bits */
- unsigned short sending_cpu; /* filled in by sender */
- /* 16 bits */
- unsigned short acknowledge_count;/* filled in by destination */
- /* 16 bits */
- unsigned int reserved1:32; /* not usable */
-};
-
-
-/*
- * Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor)
- * see table 4.2.3.0.1 in broacast_assist spec.
- */
-struct bau_msg_header {
- unsigned int dest_subnodeid:6; /* must be zero */
- /* bits 5:0 */
- unsigned int base_dest_nodeid:15; /* nasid>>1 (pnode) of */
- /* bits 20:6 */ /* first bit in node_map */
- unsigned int command:8; /* message type */
- /* bits 28:21 */
- /* 0x38: SN3net EndPoint Message */
- unsigned int rsvd_1:3; /* must be zero */
- /* bits 31:29 */
- /* int will align on 32 bits */
- unsigned int rsvd_2:9; /* must be zero */
- /* bits 40:32 */
- /* Suppl_A is 56-41 */
- unsigned int payload_2a:8;/* becomes byte 16 of msg */
- /* bits 48:41 */ /* not currently using */
- unsigned int payload_2b:8;/* becomes byte 17 of msg */
- /* bits 56:49 */ /* not currently using */
- /* Address field (96:57) is never used as an
- address (these are address bits 42:3) */
- unsigned int rsvd_3:1; /* must be zero */
- /* bit 57 */
- /* address bits 27:4 are payload */
- /* these 24 bits become bytes 12-14 of msg */
- unsigned int replied_to:1;/* sent as 0 by the source to byte 12 */
- /* bit 58 */
-
- unsigned int payload_1a:5;/* not currently used */
- /* bits 63:59 */
- unsigned int payload_1b:8;/* not currently used */
- /* bits 71:64 */
- unsigned int payload_1c:8;/* not currently used */
- /* bits 79:72 */
- unsigned int payload_1d:2;/* not currently used */
- /* bits 81:80 */
-
- unsigned int rsvd_4:7; /* must be zero */
- /* bits 88:82 */
- unsigned int sw_ack_flag:1;/* software acknowledge flag */
- /* bit 89 */
- /* INTD trasactions at destination are to
- wait for software acknowledge */
- unsigned int rsvd_5:6; /* must be zero */
- /* bits 95:90 */
- unsigned int rsvd_6:5; /* must be zero */
- /* bits 100:96 */
- unsigned int int_both:1;/* if 1, interrupt both sockets on the blade */
- /* bit 101*/
- unsigned int fairness:3;/* usually zero */
- /* bits 104:102 */
- unsigned int multilevel:1; /* multi-level multicast format */
- /* bit 105 */
- /* 0 for TLB: endpoint multi-unicast messages */
- unsigned int chaining:1;/* next descriptor is part of this activation*/
- /* bit 106 */
- unsigned int rsvd_7:21; /* must be zero */
- /* bits 127:107 */
-};
-
-/*
- * The activation descriptor:
- * The format of the message to send, plus all accompanying control
- * Should be 64 bytes
- */
-struct bau_desc {
- struct bau_target_nodemask distribution;
- /*
- * message template, consisting of header and payload:
- */
- struct bau_msg_header header;
- struct bau_msg_payload payload;
-};
-/*
- * -payload-- ---------header------
- * bytes 0-11 bits 41-56 bits 58-81
- * A B (2) C (3)
- *
- * A/B/C are moved to:
- * A C B
- * bytes 0-11 bytes 12-14 bytes 16-17 (byte 15 filled in by hw as vector)
- * ------------payload queue-----------
- */
-
-/*
- * The payload queue on the destination side is an array of these.
- * With BAU_MISC_CONTROL set for software acknowledge mode, the messages
- * are 32 bytes (2 micropackets) (256 bits) in length, but contain only 17
- * bytes of usable data, including the sw ack vector in byte 15 (bits 127:120)
- * (12 bytes come from bau_msg_payload, 3 from payload_1, 2 from
- * sw_ack_vector and payload_2)
- * "Enabling Software Acknowledgment mode (see Section 4.3.3 Software
- * Acknowledge Processing) also selects 32 byte (17 bytes usable) payload
- * operation."
- */
-struct bau_payload_queue_entry {
- unsigned long address; /* signifies a page or all TLB's
- of the cpu */
- /* 64 bits, bytes 0-7 */
-
- unsigned short sending_cpu; /* cpu that sent the message */
- /* 16 bits, bytes 8-9 */
-
- unsigned short acknowledge_count; /* filled in by destination */
- /* 16 bits, bytes 10-11 */
-
- unsigned short replied_to:1; /* sent as 0 by the source */
- /* 1 bit */
- unsigned short unused1:7; /* not currently using */
- /* 7 bits: byte 12) */
-
- unsigned char unused2[2]; /* not currently using */
- /* bytes 13-14 */
-
- unsigned char sw_ack_vector; /* filled in by the hardware */
- /* byte 15 (bits 127:120) */
-
- unsigned char unused4[3]; /* not currently using bytes 17-19 */
- /* bytes 17-19 */
-
- int number_of_cpus; /* filled in at destination */
- /* 32 bits, bytes 20-23 (aligned) */
-
- unsigned char unused5[8]; /* not using */
- /* bytes 24-31 */
-};
-
-/*
- * one for every slot in the destination payload queue
- */
-struct bau_msg_status {
- struct bau_local_cpumask seen_by; /* map of cpu's */
-};
-
-/*
- * one for every slot in the destination software ack resources
- */
-struct bau_sw_ack_status {
- struct bau_payload_queue_entry *msg; /* associated message */
- int watcher; /* cpu monitoring, or -1 */
-};
-
-/*
- * one on every node and per-cpu; to locate the software tables
- */
-struct bau_control {
- struct bau_desc *descriptor_base;
- struct bau_payload_queue_entry *bau_msg_head;
- struct bau_payload_queue_entry *va_queue_first;
- struct bau_payload_queue_entry *va_queue_last;
- struct bau_msg_status *msg_statuses;
- int *watching; /* pointer to array */
-};
-
-/*
- * This structure is allocated per_cpu for UV TLB shootdown statistics.
- */
-struct ptc_stats {
- unsigned long ptc_i; /* number of IPI-style flushes */
- unsigned long requestor; /* number of nodes this cpu sent to */
- unsigned long requestee; /* times cpu was remotely requested */
- unsigned long alltlb; /* times all tlb's on this cpu were flushed */
- unsigned long onetlb; /* times just one tlb on this cpu was flushed */
- unsigned long s_retry; /* retries on source side timeouts */
- unsigned long d_retry; /* retries on destination side timeouts */
- unsigned long sflush; /* cycles spent in uv_flush_tlb_others */
- unsigned long dflush; /* cycles spent on destination side */
- unsigned long retriesok; /* successes on retries */
- unsigned long nomsg; /* interrupts with no message */
- unsigned long multmsg; /* interrupts with multiple messages */
- unsigned long ntargeted;/* nodes targeted */
-};
-
-static inline int bau_node_isset(int node, struct bau_target_nodemask *dstp)
-{
- return constant_test_bit(node, &dstp->bits[0]);
-}
-static inline void bau_node_set(int node, struct bau_target_nodemask *dstp)
-{
- __set_bit(node, &dstp->bits[0]);
-}
-static inline void bau_nodes_clear(struct bau_target_nodemask *dstp, int nbits)
-{
- bitmap_zero(&dstp->bits[0], nbits);
-}
-
-static inline void bau_cpubits_clear(struct bau_local_cpumask *dstp, int nbits)
-{
- bitmap_zero(&dstp->bits, nbits);
-}
-
-#define cpubit_isset(cpu, bau_local_cpumask) \
- test_bit((cpu), (bau_local_cpumask).bits)
-
-extern void uv_bau_message_intr1(void);
-extern void uv_bau_timeout_intr1(void);
-
-#endif /* _ASM_X86_UV_UV_BAU_H */
diff --git a/arch/x86/include/asm/uv/uv_geo.h b/arch/x86/include/asm/uv/uv_geo.h
new file mode 100644
index 000000000000..027a9258dbca
--- /dev/null
+++ b/arch/x86/include/asm/uv/uv_geo.h
@@ -0,0 +1,103 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 2020 Hewlett Packard Enterprise Development LP. All rights reserved.
+ */
+
+#ifndef _ASM_UV_GEO_H
+#define _ASM_UV_GEO_H
+
+/* Type declarations */
+
+/* Size of a geoid_s structure (must be before decl. of geoid_u) */
+#define GEOID_SIZE 8
+
+/* Fields common to all substructures */
+struct geo_common_s {
+ unsigned char type; /* What type of h/w is named by this geoid_s */
+ unsigned char blade;
+ unsigned char slot; /* slot is IRU */
+ unsigned char upos;
+ unsigned char rack;
+};
+
+/* Additional fields for particular types of hardware */
+struct geo_node_s {
+ struct geo_common_s common; /* No additional fields needed */
+};
+
+struct geo_rtr_s {
+ struct geo_common_s common; /* No additional fields needed */
+};
+
+struct geo_iocntl_s {
+ struct geo_common_s common; /* No additional fields needed */
+};
+
+struct geo_pcicard_s {
+ struct geo_iocntl_s common;
+ char bus; /* Bus/widget number */
+ char slot; /* PCI slot number */
+};
+
+/* Subcomponents of a node */
+struct geo_cpu_s {
+ struct geo_node_s node;
+ unsigned char socket:4, /* Which CPU on the node */
+ thread:4;
+ unsigned char core;
+};
+
+struct geo_mem_s {
+ struct geo_node_s node;
+ char membus; /* The memory bus on the node */
+ char memslot; /* The memory slot on the bus */
+};
+
+union geoid_u {
+ struct geo_common_s common;
+ struct geo_node_s node;
+ struct geo_iocntl_s iocntl;
+ struct geo_pcicard_s pcicard;
+ struct geo_rtr_s rtr;
+ struct geo_cpu_s cpu;
+ struct geo_mem_s mem;
+ char padsize[GEOID_SIZE];
+};
+
+/* Defined constants */
+
+#define GEO_MAX_LEN 48
+
+#define GEO_TYPE_INVALID 0
+#define GEO_TYPE_MODULE 1
+#define GEO_TYPE_NODE 2
+#define GEO_TYPE_RTR 3
+#define GEO_TYPE_IOCNTL 4
+#define GEO_TYPE_IOCARD 5
+#define GEO_TYPE_CPU 6
+#define GEO_TYPE_MEM 7
+#define GEO_TYPE_MAX (GEO_TYPE_MEM+1)
+
+static inline int geo_rack(union geoid_u g)
+{
+ return (g.common.type == GEO_TYPE_INVALID) ?
+ -1 : g.common.rack;
+}
+
+static inline int geo_slot(union geoid_u g)
+{
+ return (g.common.type == GEO_TYPE_INVALID) ?
+ -1 : g.common.upos;
+}
+
+static inline int geo_blade(union geoid_u g)
+{
+ return (g.common.type == GEO_TYPE_INVALID) ?
+ -1 : g.common.blade * 2 + g.common.slot;
+}
+
+#endif /* _ASM_UV_GEO_H */
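
Sketch of decoding a geoid (e.g. one returned through the UV BIOS geo queries) with the accessors above; the printing helper is illustrative only.

static void example_print_location(union geoid_u geoid)
{
	/* Each accessor returns -1 when the geoid type is GEO_TYPE_INVALID. */
	pr_info("rack %d, slot %d, blade %d\n",
		geo_rack(geoid), geo_slot(geoid), geo_blade(geoid));
}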
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 77a68505419a..ea877fd83114 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -5,7 +5,8 @@
*
* SGI UV architectural definitions
*
- * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved.
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
+ * Copyright (C) 2007-2014 Silicon Graphics, Inc. All rights reserved.
*/
#ifndef _ASM_X86_UV_UV_HUB_H
@@ -15,9 +16,15 @@
#include <linux/numa.h>
#include <linux/percpu.h>
#include <linux/timer.h>
+#include <linux/io.h>
+#include <linux/topology.h>
#include <asm/types.h>
#include <asm/percpu.h>
+#include <asm/uv/uv.h>
#include <asm/uv/uv_mmrs.h>
+#include <asm/uv/bios.h>
+#include <asm/irq_vectors.h>
+#include <asm/io_apic.h>
/*
@@ -28,20 +35,27 @@
* contiguous (although various IO spaces may punch holes in
* it)..
*
- * N - Number of bits in the node portion of a socket physical
- * address.
+ * N - Number of bits in the node portion of a socket physical
+ * address.
*
- * NASID - network ID of a router, Mbrick or Cbrick. Nasid values of
- * routers always have low bit of 1, C/MBricks have low bit
- * equal to 0. Most addressing macros that target UV hub chips
- * right shift the NASID by 1 to exclude the always-zero bit.
- * NASIDs contain up to 15 bits.
+ * NASID - network ID of a router, Mbrick or Cbrick. Nasid values of
+ * routers always have low bit of 1, C/MBricks have low bit
+ * equal to 0. Most addressing macros that target UV hub chips
+ * right shift the NASID by 1 to exclude the always-zero bit.
+ * NASIDs contain up to 15 bits.
*
* GNODE - NASID right shifted by 1 bit. Most mmrs contain gnodes instead
* of nasids.
*
- * PNODE - the low N bits of the GNODE. The PNODE is the most useful variant
- * of the nasid for socket usage.
+ * PNODE - the low N bits of the GNODE. The PNODE is the most useful variant
+ * of the nasid for socket usage.
+ *
+ * GPA - (global physical address) a socket physical address converted
+ * so that it can be used by the GRU as a global address. Socket
+ * physical addresses 1) need additional NASID (node) bits added
+ * to the high end of the address, and 2) unaliased if the
+ * partition does not have a physical address 0. In addition, on
+ * UV2 rev 1, GPAs need the gnode left shifted to bits 39 or 40.
*
*
* NumaLink Global Physical Address Format:
@@ -68,13 +82,15 @@
*
*
* APICID format
- * NOTE!!!!!! This is the current format of the APICID. However, code
- * should assume that this will change in the future. Use functions
- * in this file for all APICID bit manipulations and conversion.
+ * NOTE!!!!!! This is the current format of the APICID. However, code
+ * should assume that this will change in the future. Use functions
+ * in this file for all APICID bit manipulations and conversion.
*
- * 1111110000000000
- * 5432109876543210
- * pppppppppplc0cch
+ * 1111110000000000
+ * 5432109876543210
+ * pppppppppplc0cch Nehalem-EX (12 bits in hdw reg)
+ * ppppppppplcc0cch Westmere-EX (12 bits in hdw reg)
+ * pppppppppppcccch SandyBridge (15 bits in hdw reg)
* sssssssssss
*
* p = pnode bits
@@ -83,15 +99,14 @@
* h = hyperthread
* s = bits that are in the SOCKET_ID CSR
*
- * Note: Processor only supports 12 bits in the APICID register. The ACPI
+ * Note: Processor may support fewer bits in the APICID register. The ACPI
* tables hold all 16 bits. Software needs to be aware of this.
*
- * Unless otherwise specified, all references to APICID refer to
- * the FULL value contained in ACPI tables, not the subset in the
- * processor APICID register.
+ * Unless otherwise specified, all references to APICID refer to
+ * the FULL value contained in ACPI tables, not the subset in the
+ * processor APICID register.
*/
-
/*
* Maximum number of bricks in all partitions and in all coherency domains.
* This is the total number of bricks accessible in the numalink fabric. It
@@ -113,70 +128,221 @@
/*
* The largest possible NASID of a C or M brick (+ 2)
*/
-#define UV_MAX_NASID_VALUE (UV_MAX_NUMALINK_NODES * 2)
-
-struct uv_scir_s {
- struct timer_list timer;
- unsigned long offset;
- unsigned long last;
- unsigned long idle_on;
- unsigned long idle_off;
- unsigned char state;
- unsigned char enabled;
+#define UV_MAX_NASID_VALUE (UV_MAX_NUMALINK_BLADES * 2)
+
+/* GAM (globally addressed memory) range table */
+struct uv_gam_range_s {
+ u32 limit; /* PA bits 56:26 (GAM_RANGE_SHFT) */
+ u16 nasid; /* node's global physical address */
+ s8 base; /* entry index of node's base addr */
+ u8 reserved;
};
/*
* The following defines attributes of the HUB chip. These attributes are
- * frequently referenced and are kept in the per-cpu data areas of each cpu.
- * They are kept together in a struct to minimize cache misses.
+ * frequently referenced and are kept in a common per hub struct.
+ * After setup, the struct is read only, so it should be readily
+ * available in the L3 cache on the cpu socket for the node.
*/
struct uv_hub_info_s {
+ unsigned int hub_type;
+ unsigned char hub_revision;
unsigned long global_mmr_base;
+ unsigned long global_mmr_shift;
unsigned long gpa_mask;
+ unsigned short *socket_to_node;
+ unsigned short *socket_to_pnode;
+ unsigned short *pnode_to_socket;
+ struct uv_gam_range_s *gr_table;
+ unsigned short min_socket;
+ unsigned short min_pnode;
+ unsigned char m_val;
+ unsigned char n_val;
+ unsigned char gr_table_len;
+ unsigned char apic_pnode_shift;
+ unsigned char gpa_shift;
+ unsigned char nasid_shift;
+ unsigned char m_shift;
+ unsigned char n_lshift;
unsigned int gnode_extra;
unsigned long gnode_upper;
unsigned long lowmem_remap_top;
unsigned long lowmem_remap_base;
+ unsigned long global_gru_base;
+ unsigned long global_gru_shift;
unsigned short pnode;
unsigned short pnode_mask;
unsigned short coherency_domain_number;
unsigned short numa_blade_id;
- unsigned char blade_processor_id;
- unsigned char m_val;
- unsigned char n_val;
- struct uv_scir_s scir;
+ unsigned short nr_possible_cpus;
+ unsigned short nr_online_cpus;
+ short memory_nid;
+ unsigned short *node_to_socket;
+};
+
+/* CPU specific info with a pointer to the hub common info struct */
+struct uv_cpu_info_s {
+ void *p_uv_hub_info;
+ unsigned char blade_cpu_id;
+ void *reserved;
};
+DECLARE_PER_CPU(struct uv_cpu_info_s, __uv_cpu_info);
+
+#define uv_cpu_info this_cpu_ptr(&__uv_cpu_info)
+#define uv_cpu_info_per(cpu) (&per_cpu(__uv_cpu_info, cpu))
-DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
-#define uv_hub_info (&__get_cpu_var(__uv_hub_info))
-#define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu))
+/* Node specific hub common info struct */
+extern void **__uv_hub_info_list;
+static inline struct uv_hub_info_s *uv_hub_info_list(int node)
+{
+ return (struct uv_hub_info_s *)__uv_hub_info_list[node];
+}
+
+static inline struct uv_hub_info_s *_uv_hub_info(void)
+{
+ return (struct uv_hub_info_s *)uv_cpu_info->p_uv_hub_info;
+}
+#define uv_hub_info _uv_hub_info()
+
+static inline struct uv_hub_info_s *uv_cpu_hub_info(int cpu)
+{
+ return (struct uv_hub_info_s *)uv_cpu_info_per(cpu)->p_uv_hub_info;
+}
+
+static inline int uv_hub_type(void)
+{
+ return uv_hub_info->hub_type;
+}
+
+static inline __init void uv_hub_type_set(int uvmask)
+{
+ uv_hub_info->hub_type = uvmask;
+}
+
+
+/*
+ * HUB revision ranges for each UV HUB architecture.
+ * This is a software convention - NOT the hardware revision numbers in
+ * the hub chip.
+ */
+#define UV2_HUB_REVISION_BASE 3
+#define UV3_HUB_REVISION_BASE 5
+#define UV4_HUB_REVISION_BASE 7
+#define UV4A_HUB_REVISION_BASE 8 /* UV4 (fixed) rev 2 */
+#define UV5_HUB_REVISION_BASE 9
+
+static inline int is_uv(int uvmask) { return uv_hub_type() & uvmask; }
+static inline int is_uv1_hub(void) { return 0; }
+static inline int is_uv2_hub(void) { return is_uv(UV2); }
+static inline int is_uv3_hub(void) { return is_uv(UV3); }
+static inline int is_uv4a_hub(void) { return is_uv(UV4A); }
+static inline int is_uv4_hub(void) { return is_uv(UV4); }
+static inline int is_uv5_hub(void) { return is_uv(UV5); }
+
+/*
+ * UV4A is a revision of UV4. So on UV4A, both is_uv4_hub() and
+ * is_uv4a_hub() return true, while on UV4 only is_uv4_hub()
+ * returns true. So to get correct results, test for UV4A first,
+ * then test for UV4.
+ */
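For instance, a minimal sketch of that test order (uv4a_setup() and uv4_setup() are hypothetical helpers, used only to show the ordering):

	if (is_uv4a_hub())
		uv4a_setup();		/* most specific type first */
	else if (is_uv4_hub())
		uv4_setup();		/* plain UV4, i.e. not UV4A */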
+
+/* UVX class: UV2,3,4 */
+static inline int is_uvx_hub(void) { return is_uv(UVX); }
+
+/* UVY class: UV5,..? */
+static inline int is_uvy_hub(void) { return is_uv(UVY); }
+
+/* Any UV Hubbed System */
+static inline int is_uv_hub(void) { return is_uv(UV_ANY); }
+
+union uvh_apicid {
+ unsigned long v;
+ struct uvh_apicid_s {
+ unsigned long local_apic_mask : 24;
+ unsigned long local_apic_shift : 5;
+ unsigned long unused1 : 3;
+ unsigned long pnode_mask : 24;
+ unsigned long pnode_shift : 5;
+ unsigned long unused2 : 3;
+ } s;
+};
/*
* Local & Global MMR space macros.
- * Note: macros are intended to be used ONLY by inline functions
- * in this file - not by other kernel code.
- * n - NASID (full 15-bit global nasid)
- * g - GNODE (full 15-bit global nasid, right shifted 1)
- * p - PNODE (local part of nsids, right shifted 1)
+ * Note: macros are intended to be used ONLY by inline functions
+ * in this file - not by other kernel code.
+ * n - NASID (full 15-bit global nasid)
+ * g - GNODE (full 15-bit global nasid, right shifted 1)
+ * p - PNODE (local part of nsids, right shifted 1)
*/
-#define UV_NASID_TO_PNODE(n) (((n) >> 1) & uv_hub_info->pnode_mask)
+#define UV_NASID_TO_PNODE(n) \
+ (((n) >> uv_hub_info->nasid_shift) & uv_hub_info->pnode_mask)
#define UV_PNODE_TO_GNODE(p) ((p) |uv_hub_info->gnode_extra)
-#define UV_PNODE_TO_NASID(p) (UV_PNODE_TO_GNODE(p) << 1)
+#define UV_PNODE_TO_NASID(p) \
+ (UV_PNODE_TO_GNODE(p) << uv_hub_info->nasid_shift)
+
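A worked round trip, assuming purely for illustration that nasid_shift is 1, gnode_extra is 0, and pnode_mask is wide enough to pass the value through:

	int pnode = UV_NASID_TO_PNODE(6);	/* (6 >> 1) & mask  -> 3 */
	int nasid = UV_PNODE_TO_NASID(pnode);	/* (3 | 0) << 1     -> 6 */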
+#define UV2_LOCAL_MMR_BASE 0xfa000000UL
+#define UV2_GLOBAL_MMR32_BASE 0xfc000000UL
+#define UV2_LOCAL_MMR_SIZE (32UL * 1024 * 1024)
+#define UV2_GLOBAL_MMR32_SIZE (32UL * 1024 * 1024)
+
+#define UV3_LOCAL_MMR_BASE 0xfa000000UL
+#define UV3_GLOBAL_MMR32_BASE 0xfc000000UL
+#define UV3_LOCAL_MMR_SIZE (32UL * 1024 * 1024)
+#define UV3_GLOBAL_MMR32_SIZE (32UL * 1024 * 1024)
+
+#define UV4_LOCAL_MMR_BASE 0xfa000000UL
+#define UV4_GLOBAL_MMR32_BASE 0
+#define UV4_LOCAL_MMR_SIZE (32UL * 1024 * 1024)
+#define UV4_GLOBAL_MMR32_SIZE 0
+
+#define UV5_LOCAL_MMR_BASE 0xfa000000UL
+#define UV5_GLOBAL_MMR32_BASE 0
+#define UV5_LOCAL_MMR_SIZE (32UL * 1024 * 1024)
+#define UV5_GLOBAL_MMR32_SIZE 0
+
+#define UV_LOCAL_MMR_BASE ( \
+ is_uv(UV2) ? UV2_LOCAL_MMR_BASE : \
+ is_uv(UV3) ? UV3_LOCAL_MMR_BASE : \
+ is_uv(UV4) ? UV4_LOCAL_MMR_BASE : \
+ is_uv(UV5) ? UV5_LOCAL_MMR_BASE : \
+ 0)
+
+#define UV_GLOBAL_MMR32_BASE ( \
+ is_uv(UV2) ? UV2_GLOBAL_MMR32_BASE : \
+ is_uv(UV3) ? UV3_GLOBAL_MMR32_BASE : \
+ is_uv(UV4) ? UV4_GLOBAL_MMR32_BASE : \
+ is_uv(UV5) ? UV5_GLOBAL_MMR32_BASE : \
+ 0)
+
+#define UV_LOCAL_MMR_SIZE ( \
+ is_uv(UV2) ? UV2_LOCAL_MMR_SIZE : \
+ is_uv(UV3) ? UV3_LOCAL_MMR_SIZE : \
+ is_uv(UV4) ? UV4_LOCAL_MMR_SIZE : \
+ is_uv(UV5) ? UV5_LOCAL_MMR_SIZE : \
+ 0)
+
+#define UV_GLOBAL_MMR32_SIZE ( \
+ is_uv(UV2) ? UV2_GLOBAL_MMR32_SIZE : \
+ is_uv(UV3) ? UV3_GLOBAL_MMR32_SIZE : \
+ is_uv(UV4) ? UV4_GLOBAL_MMR32_SIZE : \
+ is_uv(UV5) ? UV5_GLOBAL_MMR32_SIZE : \
+ 0)
-#define UV_LOCAL_MMR_BASE 0xf4000000UL
-#define UV_GLOBAL_MMR32_BASE 0xf8000000UL
#define UV_GLOBAL_MMR64_BASE (uv_hub_info->global_mmr_base)
-#define UV_LOCAL_MMR_SIZE (64UL * 1024 * 1024)
-#define UV_GLOBAL_MMR32_SIZE (64UL * 1024 * 1024)
+
+#define UV_GLOBAL_GRU_MMR_BASE 0x4000000
#define UV_GLOBAL_MMR32_PNODE_SHIFT 15
-#define UV_GLOBAL_MMR64_PNODE_SHIFT 26
+#define _UV_GLOBAL_MMR64_PNODE_SHIFT 26
+#define UV_GLOBAL_MMR64_PNODE_SHIFT (uv_hub_info->global_mmr_shift)
#define UV_GLOBAL_MMR32_PNODE_BITS(p) ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT))
#define UV_GLOBAL_MMR64_PNODE_BITS(p) \
(((unsigned long)(p)) << UV_GLOBAL_MMR64_PNODE_SHIFT)
+#define UVH_APICID 0x002D0E00L
#define UV_APIC_PNODE_SHIFT 6
/* Local Bus from cpu's perspective */
@@ -188,7 +354,7 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
*
* Note there are NO leds on a UV system. This register is only
* used by the system controller to monitor system-wide operation.
- * There are 64 regs per node. With Nahelem cpus (2 cores per node,
+ * There are 64 regs per node. With Nehalem cpus (2 cores per node,
* 8 cpus per core, 2 threads per cpu) there are 32 cpu threads on
* a node.
*
@@ -210,18 +376,79 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
/*
* Macros for converting between kernel virtual addresses, socket local physical
* addresses, and UV global physical addresses.
- * Note: use the standard __pa() & __va() macros for converting
- * between socket virtual and socket physical addresses.
+ * Note: use the standard __pa() & __va() macros for converting
+ * between socket virtual and socket physical addresses.
*/
+/* global bits offset - number of local address bits in gpa for this UV arch */
+static inline unsigned int uv_gpa_shift(void)
+{
+ return uv_hub_info->gpa_shift;
+}
+#define _uv_gpa_shift
+
+/* Find node that has the address range that contains global address */
+static inline struct uv_gam_range_s *uv_gam_range(unsigned long pa)
+{
+ struct uv_gam_range_s *gr = uv_hub_info->gr_table;
+ unsigned long pal = (pa & uv_hub_info->gpa_mask) >> UV_GAM_RANGE_SHFT;
+ int i, num = uv_hub_info->gr_table_len;
+
+ if (gr) {
+ for (i = 0; i < num; i++, gr++) {
+ if (pal < gr->limit)
+ return gr;
+ }
+ }
+ pr_crit("UV: GAM Range for 0x%lx not found at %p!\n", pa, gr);
+ BUG();
+}
+
+/* Return base address of node that contains global address */
+static inline unsigned long uv_gam_range_base(unsigned long pa)
+{
+ struct uv_gam_range_s *gr = uv_gam_range(pa);
+ int base = gr->base;
+
+ if (base < 0)
+ return 0UL;
+
+ return uv_hub_info->gr_table[base].limit;
+}
+
+/* socket phys RAM --> UV global NASID (UV4+) */
+static inline unsigned long uv_soc_phys_ram_to_nasid(unsigned long paddr)
+{
+ return uv_gam_range(paddr)->nasid;
+}
+#define _uv_soc_phys_ram_to_nasid
+
+/* socket virtual --> UV global NASID (UV4+) */
+static inline unsigned long uv_gpa_nasid(void *v)
+{
+ return uv_soc_phys_ram_to_nasid(__pa(v));
+}
+
/* socket phys RAM --> UV global physical address */
static inline unsigned long uv_soc_phys_ram_to_gpa(unsigned long paddr)
{
+ unsigned int m_val = uv_hub_info->m_val;
+
if (paddr < uv_hub_info->lowmem_remap_top)
paddr |= uv_hub_info->lowmem_remap_base;
- return paddr | uv_hub_info->gnode_upper;
-}
+ if (m_val) {
+ paddr |= uv_hub_info->gnode_upper;
+ paddr = ((paddr << uv_hub_info->m_shift)
+ >> uv_hub_info->m_shift) |
+ ((paddr >> uv_hub_info->m_val)
+ << uv_hub_info->n_lshift);
+ } else {
+ paddr |= uv_soc_phys_ram_to_nasid(paddr)
+ << uv_hub_info->gpa_shift;
+ }
+ return paddr;
+}
/* socket virtual --> UV global physical address */
static inline unsigned long uv_gpa(void *v)
@@ -229,65 +456,154 @@ static inline unsigned long uv_gpa(void *v)
return uv_soc_phys_ram_to_gpa(__pa(v));
}
+/* Top two bits indicate the requested address is in MMR space. */
+static inline int
+uv_gpa_in_mmr_space(unsigned long gpa)
+{
+ return (gpa >> 62) == 0x3UL;
+}
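A quick sketch of what this predicate accepts and rejects (the constants are arbitrary sample addresses, not values taken from real hardware):

	bool mmr = uv_gpa_in_mmr_space(0xc000000000000000UL);	/* top two bits set -> true  */
	bool ram = uv_gpa_in_mmr_space(0x0000000080000000UL);	/* ordinary RAM gpa -> false */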
+
+/* UV global physical address --> socket phys RAM */
+static inline unsigned long uv_gpa_to_soc_phys_ram(unsigned long gpa)
+{
+ unsigned long paddr;
+ unsigned long remap_base = uv_hub_info->lowmem_remap_base;
+ unsigned long remap_top = uv_hub_info->lowmem_remap_top;
+ unsigned int m_val = uv_hub_info->m_val;
+
+ if (m_val)
+ gpa = ((gpa << uv_hub_info->m_shift) >> uv_hub_info->m_shift) |
+ ((gpa >> uv_hub_info->n_lshift) << uv_hub_info->m_val);
+
+ paddr = gpa & uv_hub_info->gpa_mask;
+ if (paddr >= remap_base && paddr < remap_base + remap_top)
+ paddr -= remap_base;
+ return paddr;
+}
+
+/* gpa -> gnode */
+static inline unsigned long uv_gpa_to_gnode(unsigned long gpa)
+{
+ unsigned int n_lshift = uv_hub_info->n_lshift;
+
+ if (n_lshift)
+ return gpa >> n_lshift;
+
+ return uv_gam_range(gpa)->nasid >> 1;
+}
+
+/* gpa -> pnode */
+static inline int uv_gpa_to_pnode(unsigned long gpa)
+{
+ return uv_gpa_to_gnode(gpa) & uv_hub_info->pnode_mask;
+}
+
+/* gpa -> node offset */
+static inline unsigned long uv_gpa_to_offset(unsigned long gpa)
+{
+ unsigned int m_shift = uv_hub_info->m_shift;
+
+ if (m_shift)
+ return (gpa << m_shift) >> m_shift;
+
+ return (gpa & uv_hub_info->gpa_mask) - uv_gam_range_base(gpa);
+}
+
+/* Convert socket to node */
+static inline int _uv_socket_to_node(int socket, unsigned short *s2nid)
+{
+ return s2nid ? s2nid[socket - uv_hub_info->min_socket] : socket;
+}
+
+static inline int uv_socket_to_node(int socket)
+{
+ return _uv_socket_to_node(socket, uv_hub_info->socket_to_node);
+}
+
+static inline int uv_pnode_to_socket(int pnode)
+{
+ unsigned short *p2s = uv_hub_info->pnode_to_socket;
+
+ return p2s ? p2s[pnode - uv_hub_info->min_pnode] : pnode;
+}
+
/* pnode, offset --> socket virtual */
static inline void *uv_pnode_offset_to_vaddr(int pnode, unsigned long offset)
{
- return __va(((unsigned long)pnode << uv_hub_info->m_val) | offset);
-}
+ unsigned int m_val = uv_hub_info->m_val;
+ unsigned long base;
+ unsigned short sockid;
+ if (m_val)
+ return __va(((unsigned long)pnode << m_val) | offset);
-/*
- * Extract a PNODE from an APICID (full apicid, not processor subset)
- */
+ sockid = uv_pnode_to_socket(pnode);
+
+ /* limit address of previous socket is our base, except node 0 is 0 */
+ if (sockid == 0)
+ return __va((unsigned long)offset);
+
+ base = (unsigned long)(uv_hub_info->gr_table[sockid - 1].limit);
+ return __va(base << UV_GAM_RANGE_SHFT | offset);
+}
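For the m_val path, a minimal worked example (assuming m_val = 26; the conversion is simply the kernel direct-map address of that socket-local physical offset):

	void *p = uv_pnode_offset_to_vaddr(2, 0x1000);
	/* (2 << 26) | 0x1000 == 0x8001000, so p == __va(0x8001000) */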
+
+/* Extract/Convert a PNODE from an APICID (full apicid, not processor subset) */
static inline int uv_apicid_to_pnode(int apicid)
{
- return (apicid >> UV_APIC_PNODE_SHIFT);
+ int pnode = apicid >> uv_hub_info->apic_pnode_shift;
+ unsigned short *s2pn = uv_hub_info->socket_to_pnode;
+
+ return s2pn ? s2pn[pnode - uv_hub_info->min_socket] : pnode;
}
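A hedged example of the shift step (assuming apic_pnode_shift is 6 and no socket_to_pnode table is present, so the raw shifted value is returned):

	int pnode = uv_apicid_to_pnode(0x100);	/* 0x100 >> 6 == 4 */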
/*
* Access global MMRs using the low memory MMR32 space. This region supports
* faster MMR access but not all MMRs are accessible in this space.
*/
-static inline unsigned long *uv_global_mmr32_address(int pnode,
- unsigned long offset)
+static inline unsigned long *uv_global_mmr32_address(int pnode, unsigned long offset)
{
return __va(UV_GLOBAL_MMR32_BASE |
UV_GLOBAL_MMR32_PNODE_BITS(pnode) | offset);
}
-static inline void uv_write_global_mmr32(int pnode, unsigned long offset,
- unsigned long val)
+static inline void uv_write_global_mmr32(int pnode, unsigned long offset, unsigned long val)
{
- *uv_global_mmr32_address(pnode, offset) = val;
+ writeq(val, uv_global_mmr32_address(pnode, offset));
}
-static inline unsigned long uv_read_global_mmr32(int pnode,
- unsigned long offset)
+static inline unsigned long uv_read_global_mmr32(int pnode, unsigned long offset)
{
- return *uv_global_mmr32_address(pnode, offset);
+ return readq(uv_global_mmr32_address(pnode, offset));
}
/*
* Access Global MMR space using the MMR space located at the top of physical
* memory.
*/
-static inline unsigned long *uv_global_mmr64_address(int pnode,
- unsigned long offset)
+static inline volatile void __iomem *uv_global_mmr64_address(int pnode, unsigned long offset)
{
return __va(UV_GLOBAL_MMR64_BASE |
UV_GLOBAL_MMR64_PNODE_BITS(pnode) | offset);
}
-static inline void uv_write_global_mmr64(int pnode, unsigned long offset,
- unsigned long val)
+static inline void uv_write_global_mmr64(int pnode, unsigned long offset, unsigned long val)
+{
+ writeq(val, uv_global_mmr64_address(pnode, offset));
+}
+
+static inline unsigned long uv_read_global_mmr64(int pnode, unsigned long offset)
{
- *uv_global_mmr64_address(pnode, offset) = val;
+ return readq(uv_global_mmr64_address(pnode, offset));
}
-static inline unsigned long uv_read_global_mmr64(int pnode,
- unsigned long offset)
+static inline void uv_write_global_mmr8(int pnode, unsigned long offset, unsigned char val)
{
- return *uv_global_mmr64_address(pnode, offset);
+ writeb(val, uv_global_mmr64_address(pnode, offset));
+}
+
+static inline unsigned char uv_read_global_mmr8(int pnode, unsigned long offset)
+{
+ return readb(uv_global_mmr64_address(pnode, offset));
}
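A usage sketch of the global MMR accessors; pnode and offset stand in for whatever register the caller is targeting, so no specific MMR is implied:

	unsigned long v = uv_read_global_mmr64(pnode, offset);
	uv_write_global_mmr64(pnode, offset, v | UV_MMR_ENABLE);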
/*
@@ -301,130 +617,172 @@ static inline unsigned long *uv_local_mmr_address(unsigned long offset)
static inline unsigned long uv_read_local_mmr(unsigned long offset)
{
- return *uv_local_mmr_address(offset);
+ return readq(uv_local_mmr_address(offset));
}
static inline void uv_write_local_mmr(unsigned long offset, unsigned long val)
{
- *uv_local_mmr_address(offset) = val;
+ writeq(val, uv_local_mmr_address(offset));
}
static inline unsigned char uv_read_local_mmr8(unsigned long offset)
{
- return *((unsigned char *)uv_local_mmr_address(offset));
+ return readb(uv_local_mmr_address(offset));
}
static inline void uv_write_local_mmr8(unsigned long offset, unsigned char val)
{
- *((unsigned char *)uv_local_mmr_address(offset)) = val;
+ writeb(val, uv_local_mmr_address(offset));
}
-/*
- * Structures and definitions for converting between cpu, node, pnode, and blade
- * numbers.
- */
-struct uv_blade_info {
- unsigned short nr_possible_cpus;
- unsigned short nr_online_cpus;
- unsigned short pnode;
- short memory_nid;
-};
-extern struct uv_blade_info *uv_blade_info;
-extern short *uv_node_to_blade;
-extern short *uv_cpu_to_blade;
-extern short uv_possible_blades;
-
/* Blade-local cpu number of current cpu. Numbered 0 .. <# cpus on the blade> */
static inline int uv_blade_processor_id(void)
{
- return uv_hub_info->blade_processor_id;
+ return uv_cpu_info->blade_cpu_id;
}
-/* Blade number of current cpu. Numnbered 0 .. <#blades -1> */
+/* Blade-local cpu number of cpu N. Numbered 0 .. <# cpus on the blade> */
+static inline int uv_cpu_blade_processor_id(int cpu)
+{
+ return uv_cpu_info_per(cpu)->blade_cpu_id;
+}
+
+/* Blade number to Node number (UV2..UV4 is 1:1) */
+static inline int uv_blade_to_node(int blade)
+{
+ return uv_socket_to_node(blade);
+}
+
+/* Blade number of current cpu. Numbered 0 .. <#blades -1> */
static inline int uv_numa_blade_id(void)
{
return uv_hub_info->numa_blade_id;
}
-/* Convert a cpu number to the the UV blade number */
-static inline int uv_cpu_to_blade_id(int cpu)
+/*
+ * Convert linux node number to the UV blade number.
+ * .. Currently for UV2 thru UV4 the node and the blade are identical.
+ * .. UV5 needs conversion when sub-numa clustering is enabled.
+ */
+static inline int uv_node_to_blade_id(int nid)
{
- return uv_cpu_to_blade[cpu];
+ unsigned short *n2s = uv_hub_info->node_to_socket;
+
+ return n2s ? n2s[nid] : nid;
}
-/* Convert linux node number to the UV blade number */
-static inline int uv_node_to_blade_id(int nid)
+/* Convert a CPU number to the UV blade number */
+static inline int uv_cpu_to_blade_id(int cpu)
{
- return uv_node_to_blade[nid];
+ return uv_cpu_hub_info(cpu)->numa_blade_id;
}
/* Convert a blade id to the PNODE of the blade */
static inline int uv_blade_to_pnode(int bid)
{
- return uv_blade_info[bid].pnode;
+ unsigned short *s2p = uv_hub_info->socket_to_pnode;
+
+ return s2p ? s2p[bid] : bid;
}
/* Nid of memory node on blade. -1 if no blade-local memory */
static inline int uv_blade_to_memory_nid(int bid)
{
- return uv_blade_info[bid].memory_nid;
+ return uv_hub_info_list(uv_blade_to_node(bid))->memory_nid;
}
/* Determine the number of possible cpus on a blade */
static inline int uv_blade_nr_possible_cpus(int bid)
{
- return uv_blade_info[bid].nr_possible_cpus;
+ return uv_hub_info_list(uv_blade_to_node(bid))->nr_possible_cpus;
}
/* Determine the number of online cpus on a blade */
static inline int uv_blade_nr_online_cpus(int bid)
{
- return uv_blade_info[bid].nr_online_cpus;
+ return uv_hub_info_list(uv_blade_to_node(bid))->nr_online_cpus;
}
/* Convert a cpu id to the PNODE of the blade containing the cpu */
static inline int uv_cpu_to_pnode(int cpu)
{
- return uv_blade_info[uv_cpu_to_blade_id(cpu)].pnode;
+ return uv_cpu_hub_info(cpu)->pnode;
}
/* Convert a linux node number to the PNODE of the blade */
static inline int uv_node_to_pnode(int nid)
{
- return uv_blade_info[uv_node_to_blade_id(nid)].pnode;
+ return uv_hub_info_list(nid)->pnode;
}
/* Maximum possible number of blades */
+extern short uv_possible_blades;
static inline int uv_num_possible_blades(void)
{
return uv_possible_blades;
}
-/* Update SCIR state */
-static inline void uv_set_scir_bits(unsigned char value)
-{
- if (uv_hub_info->scir.state != value) {
- uv_hub_info->scir.state = value;
- uv_write_local_mmr8(uv_hub_info->scir.offset, value);
- }
-}
+/* Per Hub NMI support */
+extern void uv_nmi_setup(void);
+extern void uv_nmi_setup_hubless(void);
+
+/* BIOS/Kernel flags exchange MMR */
+#define UVH_BIOS_KERNEL_MMR UVH_SCRATCH5
+#define UVH_BIOS_KERNEL_MMR_ALIAS UVH_SCRATCH5_ALIAS
+#define UVH_BIOS_KERNEL_MMR_ALIAS_2 UVH_SCRATCH5_ALIAS_2
+
+/* TSC sync valid, set by BIOS */
+#define UVH_TSC_SYNC_MMR UVH_BIOS_KERNEL_MMR
+#define UVH_TSC_SYNC_SHIFT 10
+#define UVH_TSC_SYNC_SHIFT_UV2K 16 /* UV2/3k have different bits */
+#define UVH_TSC_SYNC_MASK 3 /* 0011 */
+#define UVH_TSC_SYNC_VALID 3 /* 0011 */
+#define UVH_TSC_SYNC_UNKNOWN 0 /* 0000 */
+
+/* The BMC sets a bit in this MMR to non-zero before sending an NMI */
+#define UVH_NMI_MMR UVH_BIOS_KERNEL_MMR
+#define UVH_NMI_MMR_CLEAR UVH_BIOS_KERNEL_MMR_ALIAS
+#define UVH_NMI_MMR_SHIFT 63
+#define UVH_NMI_MMR_TYPE "SCRATCH5"
+
+struct uv_hub_nmi_s {
+ raw_spinlock_t nmi_lock;
+ atomic_t in_nmi; /* flag this node in UV NMI IRQ */
+ atomic_t cpu_owner; /* last locker of this struct */
+ atomic_t read_mmr_count; /* count of MMR reads */
+ atomic_t nmi_count; /* count of true UV NMIs */
+ unsigned long nmi_value; /* last value read from NMI MMR */
+ bool hub_present; /* false means UV hubless system */
+ bool pch_owner; /* indicates this hub owns PCH */
+};
-static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value)
-{
- if (uv_cpu_hub_info(cpu)->scir.state != value) {
- uv_cpu_hub_info(cpu)->scir.state = value;
- uv_write_local_mmr8(uv_cpu_hub_info(cpu)->scir.offset, value);
- }
-}
+struct uv_cpu_nmi_s {
+ struct uv_hub_nmi_s *hub;
+ int state;
+ int pinging;
+ int queries;
+ int pings;
+};
-static inline void uv_hub_send_ipi(int pnode, int apicid, int vector)
-{
- unsigned long val;
+DECLARE_PER_CPU(struct uv_cpu_nmi_s, uv_cpu_nmi);
+
+#define uv_hub_nmi this_cpu_read(uv_cpu_nmi.hub)
+#define uv_cpu_nmi_per(cpu) (per_cpu(uv_cpu_nmi, cpu))
+#define uv_hub_nmi_per(cpu) (uv_cpu_nmi_per(cpu).hub)
- val = (1UL << UVH_IPI_INT_SEND_SHFT) |
- ((apicid & 0x3f) << UVH_IPI_INT_APIC_ID_SHFT) |
- (vector << UVH_IPI_INT_VECTOR_SHFT);
- uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
+/* uv_cpu_nmi_states */
+#define UV_NMI_STATE_OUT 0
+#define UV_NMI_STATE_IN 1
+#define UV_NMI_STATE_DUMP 2
+#define UV_NMI_STATE_DUMP_DONE 3
+
+/*
+ * Get the minimum revision number of the hub chips within the partition.
+ * (See UVx_HUB_REVISION_BASE above for specific values.)
+ */
+static inline int uv_get_min_hub_revision_id(void)
+{
+ return uv_hub_info->hub_revision;
}
#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/include/asm/uv/uv_irq.h b/arch/x86/include/asm/uv/uv_irq.h
index 9613c8c0b647..1876b5edd142 100644
--- a/arch/x86/include/asm/uv/uv_irq.h
+++ b/arch/x86/include/asm/uv/uv_irq.h
@@ -25,12 +25,13 @@ struct uv_IO_APIC_route_entry {
dest : 32;
};
-extern struct irq_chip uv_irq_chip;
-
-extern int arch_enable_uv_irq(char *, unsigned int, int, int, unsigned long);
-extern void arch_disable_uv_irq(int, unsigned long);
+enum {
+ UV_AFFINITY_ALL,
+ UV_AFFINITY_NODE,
+ UV_AFFINITY_CPU
+};
-extern int uv_setup_irq(char *, int, int, unsigned long);
-extern void uv_teardown_irq(unsigned int, int, unsigned long);
+extern int uv_setup_irq(char *, int, int, unsigned long, int);
+extern void uv_teardown_irq(unsigned int);
#endif /* _ASM_X86_UV_UV_IRQ_H */
diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h
index 2cae46c7c8a2..bb45812889dd 100644
--- a/arch/x86/include/asm/uv/uv_mmrs.h
+++ b/arch/x86/include/asm/uv/uv_mmrs.h
@@ -1,403 +1,2659 @@
-
/*
* This file is subject to the terms and conditions of the GNU General Public
* License. See the file "COPYING" in the main directory of this archive
* for more details.
*
- * SGI UV MMR definitions
+ * HPE UV MMR definitions
*
- * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved.
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
+ * Copyright (C) 2007-2016 Silicon Graphics, Inc. All rights reserved.
*/
#ifndef _ASM_X86_UV_UV_MMRS_H
#define _ASM_X86_UV_UV_MMRS_H
+/*
+ * This file contains MMR definitions for all UV hubs types.
+ *
+ * To minimize coding differences between hub types, the symbols are
+ * grouped by architecture types.
+ *
+ * UVH - definitions common to all UV hub types.
+ * UVXH - definitions common to UVX class (2, 3, 4).
+ * UVYH - definitions common to UVY class (5).
+ * UV5H - definitions specific to UV type 5 hub.
+ * UV4AH - definitions specific to UV type 4A hub.
+ * UV4H - definitions specific to UV type 4 hub.
+ * UV3H - definitions specific to UV type 3 hub.
+ * UV2H - definitions specific to UV type 2 hub.
+ *
+ * If the MMR exists on all hub types but has different addresses,
+ * use a conditional operator to define the value at runtime. Any
+ * symbols that are not defined for a given hub type are left blank.
+ * (UV4A variations only generated if different from uv4)
+ * #define UVHxxx (
+ * is_uv(UV5) ? UV5Hxxx value :
+ * is_uv(UV4A) ? UV4AHxxx value :
+ * is_uv(UV4) ? UV4Hxxx value :
+ * is_uv(UV3) ? UV3Hxxx value :
+ * is_uv(UV2) ? UV2Hxxx value :
+ * <ucv> or <undef value>)
+ *
+ * Class UVX has UVs (2|3|4|4A).
+ * Class UVY has UVs (5).
+ *
+ * union uvh_xxx {
+ * unsigned long v;
+ * struct uvh_xxx_s { # Common fields only
+ * } s;
+ * struct uv5h_xxx_s { # Full UV5 definition (*)
+ * } s5;
+ * struct uv4ah_xxx_s { # Full UV4A definition (*)
+ * } s4a;
+ * struct uv4h_xxx_s { # Full UV4 definition (*)
+ * } s4;
+ * struct uv3h_xxx_s { # Full UV3 definition (*)
+ * } s3;
+ * struct uv2h_xxx_s { # Full UV2 definition (*)
+ * } s2;
+ * };
+ * (* - if present and different than the common struct)
+ *
+ * Only essential differences are enumerated. For example, if the address is
+ * the same for all UV's, only a single #define is generated. Likewise,
+ * if the contents are the same for all hubs, only the "s" structure is
+ * generated.
+ *
+ * (GEN Flags: undefs=function)
+ */
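Concretely, a hypothetical register following the pattern above would be emitted like this (UVH_FOO and its per-hub addresses are invented purely for illustration):

	#define UVH_FOO (				\
		is_uv(UV5) ? 0x470000UL :		\
		is_uv(UV4) ? 0x170000UL :		\
		is_uv(UV3) ? 0x320000UL :		\
		is_uv(UV2) ? 0x320000UL :		\
		uv_undefined("UVH_FOO"))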
+
+ /* UV bit masks */
+#define UV2 (1 << 0)
+#define UV3 (1 << 1)
+#define UV4 (1 << 2)
+#define UV4A (1 << 3)
+#define UV5 (1 << 4)
+#define UVX (UV2|UV3|UV4)
+#define UVY (UV5)
+#define UV_ANY (~0)
+
+
+
+
#define UV_MMR_ENABLE (1UL << 63)
-/* ========================================================================= */
-/* UVH_BAU_DATA_CONFIG */
-/* ========================================================================= */
-#define UVH_LB_BAU_MISC_CONTROL 0x320170UL
-#define UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT 15
-#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT 16
-#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL
-/* 1011 timebase 7 (168millisec) * 3 ticks -> 500ms */
-#define UVH_BAU_DATA_CONFIG 0x61680UL
-#define UVH_BAU_DATA_CONFIG_32 0x0438
-
-#define UVH_BAU_DATA_CONFIG_VECTOR_SHFT 0
-#define UVH_BAU_DATA_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_BAU_DATA_CONFIG_DM_SHFT 8
-#define UVH_BAU_DATA_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_BAU_DATA_CONFIG_DESTMODE_SHFT 11
-#define UVH_BAU_DATA_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_BAU_DATA_CONFIG_STATUS_SHFT 12
-#define UVH_BAU_DATA_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_BAU_DATA_CONFIG_P_SHFT 13
-#define UVH_BAU_DATA_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_BAU_DATA_CONFIG_T_SHFT 15
-#define UVH_BAU_DATA_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_BAU_DATA_CONFIG_M_SHFT 16
-#define UVH_BAU_DATA_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_BAU_DATA_CONFIG_APIC_ID_SHFT 32
-#define UVH_BAU_DATA_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
-
-union uvh_bau_data_config_u {
- unsigned long v;
- struct uvh_bau_data_config_s {
- unsigned long vector_ : 8; /* RW */
- unsigned long dm : 3; /* RW */
- unsigned long destmode : 1; /* RW */
- unsigned long status : 1; /* RO */
- unsigned long p : 1; /* RO */
- unsigned long rsvd_14 : 1; /* */
- unsigned long t : 1; /* RO */
- unsigned long m : 1; /* RW */
- unsigned long rsvd_17_31: 15; /* */
- unsigned long apic_id : 32; /* RW */
- } s;
-};
+#define UV1_HUB_PART_NUMBER 0x88a5
+#define UV2_HUB_PART_NUMBER 0x8eb8
+#define UV2_HUB_PART_NUMBER_X 0x1111
+#define UV3_HUB_PART_NUMBER 0x9578
+#define UV3_HUB_PART_NUMBER_X 0x4321
+#define UV4_HUB_PART_NUMBER 0x99a1
+#define UV5_HUB_PART_NUMBER 0xa171
+
+/* Error function to catch undefined references */
+extern unsigned long uv_undefined(char *str);
/* ========================================================================= */
/* UVH_EVENT_OCCURRED0 */
/* ========================================================================= */
#define UVH_EVENT_OCCURRED0 0x70000UL
-#define UVH_EVENT_OCCURRED0_32 0x005e8
-
-#define UVH_EVENT_OCCURRED0_LB_HCERR_SHFT 0
-#define UVH_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL
-#define UVH_EVENT_OCCURRED0_GR0_HCERR_SHFT 1
-#define UVH_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000002UL
-#define UVH_EVENT_OCCURRED0_GR1_HCERR_SHFT 2
-#define UVH_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000004UL
-#define UVH_EVENT_OCCURRED0_LH_HCERR_SHFT 3
-#define UVH_EVENT_OCCURRED0_LH_HCERR_MASK 0x0000000000000008UL
-#define UVH_EVENT_OCCURRED0_RH_HCERR_SHFT 4
-#define UVH_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000010UL
-#define UVH_EVENT_OCCURRED0_XN_HCERR_SHFT 5
-#define UVH_EVENT_OCCURRED0_XN_HCERR_MASK 0x0000000000000020UL
-#define UVH_EVENT_OCCURRED0_SI_HCERR_SHFT 6
-#define UVH_EVENT_OCCURRED0_SI_HCERR_MASK 0x0000000000000040UL
-#define UVH_EVENT_OCCURRED0_LB_AOERR0_SHFT 7
-#define UVH_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000080UL
-#define UVH_EVENT_OCCURRED0_GR0_AOERR0_SHFT 8
-#define UVH_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000000100UL
-#define UVH_EVENT_OCCURRED0_GR1_AOERR0_SHFT 9
-#define UVH_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000000200UL
-#define UVH_EVENT_OCCURRED0_LH_AOERR0_SHFT 10
-#define UVH_EVENT_OCCURRED0_LH_AOERR0_MASK 0x0000000000000400UL
-#define UVH_EVENT_OCCURRED0_RH_AOERR0_SHFT 11
-#define UVH_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL
-#define UVH_EVENT_OCCURRED0_XN_AOERR0_SHFT 12
-#define UVH_EVENT_OCCURRED0_XN_AOERR0_MASK 0x0000000000001000UL
-#define UVH_EVENT_OCCURRED0_SI_AOERR0_SHFT 13
-#define UVH_EVENT_OCCURRED0_SI_AOERR0_MASK 0x0000000000002000UL
-#define UVH_EVENT_OCCURRED0_LB_AOERR1_SHFT 14
-#define UVH_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000004000UL
-#define UVH_EVENT_OCCURRED0_GR0_AOERR1_SHFT 15
-#define UVH_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000000008000UL
-#define UVH_EVENT_OCCURRED0_GR1_AOERR1_SHFT 16
-#define UVH_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000000010000UL
-#define UVH_EVENT_OCCURRED0_LH_AOERR1_SHFT 17
-#define UVH_EVENT_OCCURRED0_LH_AOERR1_MASK 0x0000000000020000UL
-#define UVH_EVENT_OCCURRED0_RH_AOERR1_SHFT 18
-#define UVH_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000040000UL
-#define UVH_EVENT_OCCURRED0_XN_AOERR1_SHFT 19
-#define UVH_EVENT_OCCURRED0_XN_AOERR1_MASK 0x0000000000080000UL
-#define UVH_EVENT_OCCURRED0_SI_AOERR1_SHFT 20
-#define UVH_EVENT_OCCURRED0_SI_AOERR1_MASK 0x0000000000100000UL
-#define UVH_EVENT_OCCURRED0_RH_VPI_INT_SHFT 21
-#define UVH_EVENT_OCCURRED0_RH_VPI_INT_MASK 0x0000000000200000UL
-#define UVH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 22
-#define UVH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 23
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000000800000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 24
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000001000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 25
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000002000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 26
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000004000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 27
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000000008000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 28
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000000010000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 29
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000000020000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 30
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000000040000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 31
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000000080000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 32
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000000100000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 33
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000000200000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 34
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000000400000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 35
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000000800000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 36
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000001000000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 37
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000002000000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 38
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000004000000000UL
-#define UVH_EVENT_OCCURRED0_L1_NMI_INT_SHFT 39
-#define UVH_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0000008000000000UL
-#define UVH_EVENT_OCCURRED0_STOP_CLOCK_SHFT 40
-#define UVH_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0000010000000000UL
-#define UVH_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 41
-#define UVH_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0000020000000000UL
-#define UVH_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 42
-#define UVH_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0000040000000000UL
-#define UVH_EVENT_OCCURRED0_LTC_INT_SHFT 43
-#define UVH_EVENT_OCCURRED0_LTC_INT_MASK 0x0000080000000000UL
-#define UVH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 44
-#define UVH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL
-#define UVH_EVENT_OCCURRED0_IPI_INT_SHFT 45
-#define UVH_EVENT_OCCURRED0_IPI_INT_MASK 0x0000200000000000UL
-#define UVH_EVENT_OCCURRED0_EXTIO_INT0_SHFT 46
-#define UVH_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0000400000000000UL
-#define UVH_EVENT_OCCURRED0_EXTIO_INT1_SHFT 47
-#define UVH_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0000800000000000UL
-#define UVH_EVENT_OCCURRED0_EXTIO_INT2_SHFT 48
-#define UVH_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0001000000000000UL
-#define UVH_EVENT_OCCURRED0_EXTIO_INT3_SHFT 49
-#define UVH_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0002000000000000UL
-#define UVH_EVENT_OCCURRED0_PROFILE_INT_SHFT 50
-#define UVH_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0004000000000000UL
-#define UVH_EVENT_OCCURRED0_RTC0_SHFT 51
-#define UVH_EVENT_OCCURRED0_RTC0_MASK 0x0008000000000000UL
-#define UVH_EVENT_OCCURRED0_RTC1_SHFT 52
-#define UVH_EVENT_OCCURRED0_RTC1_MASK 0x0010000000000000UL
-#define UVH_EVENT_OCCURRED0_RTC2_SHFT 53
-#define UVH_EVENT_OCCURRED0_RTC2_MASK 0x0020000000000000UL
-#define UVH_EVENT_OCCURRED0_RTC3_SHFT 54
-#define UVH_EVENT_OCCURRED0_RTC3_MASK 0x0040000000000000UL
-#define UVH_EVENT_OCCURRED0_BAU_DATA_SHFT 55
-#define UVH_EVENT_OCCURRED0_BAU_DATA_MASK 0x0080000000000000UL
-#define UVH_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_SHFT 56
-#define UVH_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_MASK 0x0100000000000000UL
+
+/* UVH common defines*/
+#define UVH_EVENT_OCCURRED0_LB_HCERR_SHFT 0
+#define UVH_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL
+
+/* UVXH common defines */
+#define UVXH_EVENT_OCCURRED0_RH_HCERR_SHFT 2
+#define UVXH_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000004UL
+#define UVXH_EVENT_OCCURRED0_LH0_HCERR_SHFT 3
+#define UVXH_EVENT_OCCURRED0_LH0_HCERR_MASK 0x0000000000000008UL
+#define UVXH_EVENT_OCCURRED0_LH1_HCERR_SHFT 4
+#define UVXH_EVENT_OCCURRED0_LH1_HCERR_MASK 0x0000000000000010UL
+#define UVXH_EVENT_OCCURRED0_GR0_HCERR_SHFT 5
+#define UVXH_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000020UL
+#define UVXH_EVENT_OCCURRED0_GR1_HCERR_SHFT 6
+#define UVXH_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000040UL
+#define UVXH_EVENT_OCCURRED0_NI0_HCERR_SHFT 7
+#define UVXH_EVENT_OCCURRED0_NI0_HCERR_MASK 0x0000000000000080UL
+#define UVXH_EVENT_OCCURRED0_NI1_HCERR_SHFT 8
+#define UVXH_EVENT_OCCURRED0_NI1_HCERR_MASK 0x0000000000000100UL
+#define UVXH_EVENT_OCCURRED0_LB_AOERR0_SHFT 9
+#define UVXH_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000200UL
+#define UVXH_EVENT_OCCURRED0_RH_AOERR0_SHFT 11
+#define UVXH_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL
+#define UVXH_EVENT_OCCURRED0_LH0_AOERR0_SHFT 12
+#define UVXH_EVENT_OCCURRED0_LH0_AOERR0_MASK 0x0000000000001000UL
+#define UVXH_EVENT_OCCURRED0_LH1_AOERR0_SHFT 13
+#define UVXH_EVENT_OCCURRED0_LH1_AOERR0_MASK 0x0000000000002000UL
+#define UVXH_EVENT_OCCURRED0_GR0_AOERR0_SHFT 14
+#define UVXH_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000004000UL
+#define UVXH_EVENT_OCCURRED0_GR1_AOERR0_SHFT 15
+#define UVXH_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000008000UL
+#define UVXH_EVENT_OCCURRED0_XB_AOERR0_SHFT 16
+#define UVXH_EVENT_OCCURRED0_XB_AOERR0_MASK 0x0000000000010000UL
+
+/* UVYH common defines */
+#define UVYH_EVENT_OCCURRED0_KT_HCERR_SHFT 1
+#define UVYH_EVENT_OCCURRED0_KT_HCERR_MASK 0x0000000000000002UL
+#define UVYH_EVENT_OCCURRED0_RH0_HCERR_SHFT 2
+#define UVYH_EVENT_OCCURRED0_RH0_HCERR_MASK 0x0000000000000004UL
+#define UVYH_EVENT_OCCURRED0_RH1_HCERR_SHFT 3
+#define UVYH_EVENT_OCCURRED0_RH1_HCERR_MASK 0x0000000000000008UL
+#define UVYH_EVENT_OCCURRED0_LH0_HCERR_SHFT 4
+#define UVYH_EVENT_OCCURRED0_LH0_HCERR_MASK 0x0000000000000010UL
+#define UVYH_EVENT_OCCURRED0_LH1_HCERR_SHFT 5
+#define UVYH_EVENT_OCCURRED0_LH1_HCERR_MASK 0x0000000000000020UL
+#define UVYH_EVENT_OCCURRED0_LH2_HCERR_SHFT 6
+#define UVYH_EVENT_OCCURRED0_LH2_HCERR_MASK 0x0000000000000040UL
+#define UVYH_EVENT_OCCURRED0_LH3_HCERR_SHFT 7
+#define UVYH_EVENT_OCCURRED0_LH3_HCERR_MASK 0x0000000000000080UL
+#define UVYH_EVENT_OCCURRED0_XB_HCERR_SHFT 8
+#define UVYH_EVENT_OCCURRED0_XB_HCERR_MASK 0x0000000000000100UL
+#define UVYH_EVENT_OCCURRED0_RDM_HCERR_SHFT 9
+#define UVYH_EVENT_OCCURRED0_RDM_HCERR_MASK 0x0000000000000200UL
+#define UVYH_EVENT_OCCURRED0_NI0_HCERR_SHFT 10
+#define UVYH_EVENT_OCCURRED0_NI0_HCERR_MASK 0x0000000000000400UL
+#define UVYH_EVENT_OCCURRED0_NI1_HCERR_SHFT 11
+#define UVYH_EVENT_OCCURRED0_NI1_HCERR_MASK 0x0000000000000800UL
+#define UVYH_EVENT_OCCURRED0_LB_AOERR0_SHFT 12
+#define UVYH_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000001000UL
+#define UVYH_EVENT_OCCURRED0_KT_AOERR0_SHFT 13
+#define UVYH_EVENT_OCCURRED0_KT_AOERR0_MASK 0x0000000000002000UL
+#define UVYH_EVENT_OCCURRED0_RH0_AOERR0_SHFT 14
+#define UVYH_EVENT_OCCURRED0_RH0_AOERR0_MASK 0x0000000000004000UL
+#define UVYH_EVENT_OCCURRED0_RH1_AOERR0_SHFT 15
+#define UVYH_EVENT_OCCURRED0_RH1_AOERR0_MASK 0x0000000000008000UL
+#define UVYH_EVENT_OCCURRED0_LH0_AOERR0_SHFT 16
+#define UVYH_EVENT_OCCURRED0_LH0_AOERR0_MASK 0x0000000000010000UL
+#define UVYH_EVENT_OCCURRED0_LH1_AOERR0_SHFT 17
+#define UVYH_EVENT_OCCURRED0_LH1_AOERR0_MASK 0x0000000000020000UL
+#define UVYH_EVENT_OCCURRED0_LH2_AOERR0_SHFT 18
+#define UVYH_EVENT_OCCURRED0_LH2_AOERR0_MASK 0x0000000000040000UL
+#define UVYH_EVENT_OCCURRED0_LH3_AOERR0_SHFT 19
+#define UVYH_EVENT_OCCURRED0_LH3_AOERR0_MASK 0x0000000000080000UL
+#define UVYH_EVENT_OCCURRED0_XB_AOERR0_SHFT 20
+#define UVYH_EVENT_OCCURRED0_XB_AOERR0_MASK 0x0000000000100000UL
+#define UVYH_EVENT_OCCURRED0_RDM_AOERR0_SHFT 21
+#define UVYH_EVENT_OCCURRED0_RDM_AOERR0_MASK 0x0000000000200000UL
+#define UVYH_EVENT_OCCURRED0_RT0_AOERR0_SHFT 22
+#define UVYH_EVENT_OCCURRED0_RT0_AOERR0_MASK 0x0000000000400000UL
+#define UVYH_EVENT_OCCURRED0_RT1_AOERR0_SHFT 23
+#define UVYH_EVENT_OCCURRED0_RT1_AOERR0_MASK 0x0000000000800000UL
+#define UVYH_EVENT_OCCURRED0_NI0_AOERR0_SHFT 24
+#define UVYH_EVENT_OCCURRED0_NI0_AOERR0_MASK 0x0000000001000000UL
+#define UVYH_EVENT_OCCURRED0_NI1_AOERR0_SHFT 25
+#define UVYH_EVENT_OCCURRED0_NI1_AOERR0_MASK 0x0000000002000000UL
+#define UVYH_EVENT_OCCURRED0_LB_AOERR1_SHFT 26
+#define UVYH_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000004000000UL
+#define UVYH_EVENT_OCCURRED0_KT_AOERR1_SHFT 27
+#define UVYH_EVENT_OCCURRED0_KT_AOERR1_MASK 0x0000000008000000UL
+#define UVYH_EVENT_OCCURRED0_RH0_AOERR1_SHFT 28
+#define UVYH_EVENT_OCCURRED0_RH0_AOERR1_MASK 0x0000000010000000UL
+#define UVYH_EVENT_OCCURRED0_RH1_AOERR1_SHFT 29
+#define UVYH_EVENT_OCCURRED0_RH1_AOERR1_MASK 0x0000000020000000UL
+#define UVYH_EVENT_OCCURRED0_LH0_AOERR1_SHFT 30
+#define UVYH_EVENT_OCCURRED0_LH0_AOERR1_MASK 0x0000000040000000UL
+#define UVYH_EVENT_OCCURRED0_LH1_AOERR1_SHFT 31
+#define UVYH_EVENT_OCCURRED0_LH1_AOERR1_MASK 0x0000000080000000UL
+#define UVYH_EVENT_OCCURRED0_LH2_AOERR1_SHFT 32
+#define UVYH_EVENT_OCCURRED0_LH2_AOERR1_MASK 0x0000000100000000UL
+#define UVYH_EVENT_OCCURRED0_LH3_AOERR1_SHFT 33
+#define UVYH_EVENT_OCCURRED0_LH3_AOERR1_MASK 0x0000000200000000UL
+#define UVYH_EVENT_OCCURRED0_XB_AOERR1_SHFT 34
+#define UVYH_EVENT_OCCURRED0_XB_AOERR1_MASK 0x0000000400000000UL
+#define UVYH_EVENT_OCCURRED0_RDM_AOERR1_SHFT 35
+#define UVYH_EVENT_OCCURRED0_RDM_AOERR1_MASK 0x0000000800000000UL
+#define UVYH_EVENT_OCCURRED0_RT0_AOERR1_SHFT 36
+#define UVYH_EVENT_OCCURRED0_RT0_AOERR1_MASK 0x0000001000000000UL
+#define UVYH_EVENT_OCCURRED0_RT1_AOERR1_SHFT 37
+#define UVYH_EVENT_OCCURRED0_RT1_AOERR1_MASK 0x0000002000000000UL
+#define UVYH_EVENT_OCCURRED0_NI0_AOERR1_SHFT 38
+#define UVYH_EVENT_OCCURRED0_NI0_AOERR1_MASK 0x0000004000000000UL
+#define UVYH_EVENT_OCCURRED0_NI1_AOERR1_SHFT 39
+#define UVYH_EVENT_OCCURRED0_NI1_AOERR1_MASK 0x0000008000000000UL
+#define UVYH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 40
+#define UVYH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000010000000000UL
+#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 41
+#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000020000000000UL
+#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 42
+#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000040000000000UL
+#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 43
+#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000080000000000UL
+#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 44
+#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000100000000000UL
+#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 45
+#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000200000000000UL
+#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 46
+#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000400000000000UL
+#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 47
+#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000800000000000UL
+#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 48
+#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0001000000000000UL
+#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 49
+#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0002000000000000UL
+#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 50
+#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0004000000000000UL
+#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 51
+#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0008000000000000UL
+#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 52
+#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0010000000000000UL
+#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 53
+#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0020000000000000UL
+#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 54
+#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0040000000000000UL
+#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 55
+#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0080000000000000UL
+#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 56
+#define UVYH_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0100000000000000UL
+#define UVYH_EVENT_OCCURRED0_L1_NMI_INT_SHFT 57
+#define UVYH_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0200000000000000UL
+#define UVYH_EVENT_OCCURRED0_STOP_CLOCK_SHFT 58
+#define UVYH_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0400000000000000UL
+#define UVYH_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 59
+#define UVYH_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0800000000000000UL
+#define UVYH_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 60
+#define UVYH_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x1000000000000000UL
+#define UVYH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 61
+#define UVYH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x2000000000000000UL
+
+/* UV4 unique defines */
+#define UV4H_EVENT_OCCURRED0_KT_HCERR_SHFT 1
+#define UV4H_EVENT_OCCURRED0_KT_HCERR_MASK 0x0000000000000002UL
+#define UV4H_EVENT_OCCURRED0_KT_AOERR0_SHFT 10
+#define UV4H_EVENT_OCCURRED0_KT_AOERR0_MASK 0x0000000000000400UL
+#define UV4H_EVENT_OCCURRED0_RTQ0_AOERR0_SHFT 17
+#define UV4H_EVENT_OCCURRED0_RTQ0_AOERR0_MASK 0x0000000000020000UL
+#define UV4H_EVENT_OCCURRED0_RTQ1_AOERR0_SHFT 18
+#define UV4H_EVENT_OCCURRED0_RTQ1_AOERR0_MASK 0x0000000000040000UL
+#define UV4H_EVENT_OCCURRED0_RTQ2_AOERR0_SHFT 19
+#define UV4H_EVENT_OCCURRED0_RTQ2_AOERR0_MASK 0x0000000000080000UL
+#define UV4H_EVENT_OCCURRED0_RTQ3_AOERR0_SHFT 20
+#define UV4H_EVENT_OCCURRED0_RTQ3_AOERR0_MASK 0x0000000000100000UL
+#define UV4H_EVENT_OCCURRED0_NI0_AOERR0_SHFT 21
+#define UV4H_EVENT_OCCURRED0_NI0_AOERR0_MASK 0x0000000000200000UL
+#define UV4H_EVENT_OCCURRED0_NI1_AOERR0_SHFT 22
+#define UV4H_EVENT_OCCURRED0_NI1_AOERR0_MASK 0x0000000000400000UL
+#define UV4H_EVENT_OCCURRED0_LB_AOERR1_SHFT 23
+#define UV4H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000800000UL
+#define UV4H_EVENT_OCCURRED0_KT_AOERR1_SHFT 24
+#define UV4H_EVENT_OCCURRED0_KT_AOERR1_MASK 0x0000000001000000UL
+#define UV4H_EVENT_OCCURRED0_RH_AOERR1_SHFT 25
+#define UV4H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000002000000UL
+#define UV4H_EVENT_OCCURRED0_LH0_AOERR1_SHFT 26
+#define UV4H_EVENT_OCCURRED0_LH0_AOERR1_MASK 0x0000000004000000UL
+#define UV4H_EVENT_OCCURRED0_LH1_AOERR1_SHFT 27
+#define UV4H_EVENT_OCCURRED0_LH1_AOERR1_MASK 0x0000000008000000UL
+#define UV4H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 28
+#define UV4H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000010000000UL
+#define UV4H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 29
+#define UV4H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000020000000UL
+#define UV4H_EVENT_OCCURRED0_XB_AOERR1_SHFT 30
+#define UV4H_EVENT_OCCURRED0_XB_AOERR1_MASK 0x0000000040000000UL
+#define UV4H_EVENT_OCCURRED0_RTQ0_AOERR1_SHFT 31
+#define UV4H_EVENT_OCCURRED0_RTQ0_AOERR1_MASK 0x0000000080000000UL
+#define UV4H_EVENT_OCCURRED0_RTQ1_AOERR1_SHFT 32
+#define UV4H_EVENT_OCCURRED0_RTQ1_AOERR1_MASK 0x0000000100000000UL
+#define UV4H_EVENT_OCCURRED0_RTQ2_AOERR1_SHFT 33
+#define UV4H_EVENT_OCCURRED0_RTQ2_AOERR1_MASK 0x0000000200000000UL
+#define UV4H_EVENT_OCCURRED0_RTQ3_AOERR1_SHFT 34
+#define UV4H_EVENT_OCCURRED0_RTQ3_AOERR1_MASK 0x0000000400000000UL
+#define UV4H_EVENT_OCCURRED0_NI0_AOERR1_SHFT 35
+#define UV4H_EVENT_OCCURRED0_NI0_AOERR1_MASK 0x0000000800000000UL
+#define UV4H_EVENT_OCCURRED0_NI1_AOERR1_SHFT 36
+#define UV4H_EVENT_OCCURRED0_NI1_AOERR1_MASK 0x0000001000000000UL
+#define UV4H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 37
+#define UV4H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000002000000000UL
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 38
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000004000000000UL
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 39
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000008000000000UL
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 40
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000010000000000UL
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 41
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000020000000000UL
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 42
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000040000000000UL
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 43
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000080000000000UL
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 44
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000100000000000UL
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 45
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000200000000000UL
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 46
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000400000000000UL
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 47
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000800000000000UL
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 48
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0001000000000000UL
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 49
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0002000000000000UL
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 50
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0004000000000000UL
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 51
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0008000000000000UL
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 52
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0010000000000000UL
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 53
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0020000000000000UL
+#define UV4H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 54
+#define UV4H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0040000000000000UL
+#define UV4H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 55
+#define UV4H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0080000000000000UL
+#define UV4H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 56
+#define UV4H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0100000000000000UL
+#define UV4H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 57
+#define UV4H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0200000000000000UL
+#define UV4H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 58
+#define UV4H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0400000000000000UL
+#define UV4H_EVENT_OCCURRED0_IPI_INT_SHFT 59
+#define UV4H_EVENT_OCCURRED0_IPI_INT_MASK 0x0800000000000000UL
+#define UV4H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 60
+#define UV4H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x1000000000000000UL
+#define UV4H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 61
+#define UV4H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x2000000000000000UL
+#define UV4H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 62
+#define UV4H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x4000000000000000UL
+#define UV4H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 63
+#define UV4H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x8000000000000000UL
+
+/* UV3 unique defines */
+#define UV3H_EVENT_OCCURRED0_QP_HCERR_SHFT 1
+#define UV3H_EVENT_OCCURRED0_QP_HCERR_MASK 0x0000000000000002UL
+#define UV3H_EVENT_OCCURRED0_QP_AOERR0_SHFT 10
+#define UV3H_EVENT_OCCURRED0_QP_AOERR0_MASK 0x0000000000000400UL
+#define UV3H_EVENT_OCCURRED0_RT_AOERR0_SHFT 17
+#define UV3H_EVENT_OCCURRED0_RT_AOERR0_MASK 0x0000000000020000UL
+#define UV3H_EVENT_OCCURRED0_NI0_AOERR0_SHFT 18
+#define UV3H_EVENT_OCCURRED0_NI0_AOERR0_MASK 0x0000000000040000UL
+#define UV3H_EVENT_OCCURRED0_NI1_AOERR0_SHFT 19
+#define UV3H_EVENT_OCCURRED0_NI1_AOERR0_MASK 0x0000000000080000UL
+#define UV3H_EVENT_OCCURRED0_LB_AOERR1_SHFT 20
+#define UV3H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000100000UL
+#define UV3H_EVENT_OCCURRED0_QP_AOERR1_SHFT 21
+#define UV3H_EVENT_OCCURRED0_QP_AOERR1_MASK 0x0000000000200000UL
+#define UV3H_EVENT_OCCURRED0_RH_AOERR1_SHFT 22
+#define UV3H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000400000UL
+#define UV3H_EVENT_OCCURRED0_LH0_AOERR1_SHFT 23
+#define UV3H_EVENT_OCCURRED0_LH0_AOERR1_MASK 0x0000000000800000UL
+#define UV3H_EVENT_OCCURRED0_LH1_AOERR1_SHFT 24
+#define UV3H_EVENT_OCCURRED0_LH1_AOERR1_MASK 0x0000000001000000UL
+#define UV3H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 25
+#define UV3H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000002000000UL
+#define UV3H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 26
+#define UV3H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000004000000UL
+#define UV3H_EVENT_OCCURRED0_XB_AOERR1_SHFT 27
+#define UV3H_EVENT_OCCURRED0_XB_AOERR1_MASK 0x0000000008000000UL
+#define UV3H_EVENT_OCCURRED0_RT_AOERR1_SHFT 28
+#define UV3H_EVENT_OCCURRED0_RT_AOERR1_MASK 0x0000000010000000UL
+#define UV3H_EVENT_OCCURRED0_NI0_AOERR1_SHFT 29
+#define UV3H_EVENT_OCCURRED0_NI0_AOERR1_MASK 0x0000000020000000UL
+#define UV3H_EVENT_OCCURRED0_NI1_AOERR1_SHFT 30
+#define UV3H_EVENT_OCCURRED0_NI1_AOERR1_MASK 0x0000000040000000UL
+#define UV3H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 31
+#define UV3H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000080000000UL
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 32
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000100000000UL
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 33
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000200000000UL
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 34
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000400000000UL
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 35
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000800000000UL
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 36
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000001000000000UL
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 37
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000002000000000UL
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 38
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000004000000000UL
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 39
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000008000000000UL
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 40
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000010000000000UL
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 41
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000020000000000UL
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 42
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000040000000000UL
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 43
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000080000000000UL
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 44
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000100000000000UL
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 45
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000200000000000UL
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 46
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000400000000000UL
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 47
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000800000000000UL
+#define UV3H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 48
+#define UV3H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0001000000000000UL
+#define UV3H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 49
+#define UV3H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0002000000000000UL
+#define UV3H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 50
+#define UV3H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0004000000000000UL
+#define UV3H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 51
+#define UV3H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0008000000000000UL
+#define UV3H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 52
+#define UV3H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0010000000000000UL
+#define UV3H_EVENT_OCCURRED0_IPI_INT_SHFT 53
+#define UV3H_EVENT_OCCURRED0_IPI_INT_MASK 0x0020000000000000UL
+#define UV3H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 54
+#define UV3H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0040000000000000UL
+#define UV3H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 55
+#define UV3H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0080000000000000UL
+#define UV3H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 56
+#define UV3H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0100000000000000UL
+#define UV3H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 57
+#define UV3H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0200000000000000UL
+#define UV3H_EVENT_OCCURRED0_PROFILE_INT_SHFT 58
+#define UV3H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0400000000000000UL
+
+/* UV2 unique defines */
+#define UV2H_EVENT_OCCURRED0_QP_HCERR_SHFT 1
+#define UV2H_EVENT_OCCURRED0_QP_HCERR_MASK 0x0000000000000002UL
+#define UV2H_EVENT_OCCURRED0_QP_AOERR0_SHFT 10
+#define UV2H_EVENT_OCCURRED0_QP_AOERR0_MASK 0x0000000000000400UL
+#define UV2H_EVENT_OCCURRED0_RT_AOERR0_SHFT 17
+#define UV2H_EVENT_OCCURRED0_RT_AOERR0_MASK 0x0000000000020000UL
+#define UV2H_EVENT_OCCURRED0_NI0_AOERR0_SHFT 18
+#define UV2H_EVENT_OCCURRED0_NI0_AOERR0_MASK 0x0000000000040000UL
+#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_SHFT 19
+#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_MASK 0x0000000000080000UL
+#define UV2H_EVENT_OCCURRED0_LB_AOERR1_SHFT 20
+#define UV2H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000100000UL
+#define UV2H_EVENT_OCCURRED0_QP_AOERR1_SHFT 21
+#define UV2H_EVENT_OCCURRED0_QP_AOERR1_MASK 0x0000000000200000UL
+#define UV2H_EVENT_OCCURRED0_RH_AOERR1_SHFT 22
+#define UV2H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000400000UL
+#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_SHFT 23
+#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_MASK 0x0000000000800000UL
+#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_SHFT 24
+#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_MASK 0x0000000001000000UL
+#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 25
+#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000002000000UL
+#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 26
+#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000004000000UL
+#define UV2H_EVENT_OCCURRED0_XB_AOERR1_SHFT 27
+#define UV2H_EVENT_OCCURRED0_XB_AOERR1_MASK 0x0000000008000000UL
+#define UV2H_EVENT_OCCURRED0_RT_AOERR1_SHFT 28
+#define UV2H_EVENT_OCCURRED0_RT_AOERR1_MASK 0x0000000010000000UL
+#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_SHFT 29
+#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_MASK 0x0000000020000000UL
+#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_SHFT 30
+#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_MASK 0x0000000040000000UL
+#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 31
+#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000080000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 32
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000100000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 33
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000200000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 34
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000400000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 35
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000800000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 36
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000001000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 37
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000002000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 38
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000004000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 39
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000008000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 40
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000010000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 41
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000020000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 42
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000040000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 43
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000080000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 44
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000100000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 45
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000200000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 46
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000400000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 47
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000800000000000UL
+#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 48
+#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0001000000000000UL
+#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 49
+#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0002000000000000UL
+#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 50
+#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0004000000000000UL
+#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 51
+#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0008000000000000UL
+#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 52
+#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0010000000000000UL
+#define UV2H_EVENT_OCCURRED0_IPI_INT_SHFT 53
+#define UV2H_EVENT_OCCURRED0_IPI_INT_MASK 0x0020000000000000UL
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 54
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0040000000000000UL
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 55
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0080000000000000UL
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 56
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0100000000000000UL
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 57
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0200000000000000UL
+#define UV2H_EVENT_OCCURRED0_PROFILE_INT_SHFT 58
+#define UV2H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0400000000000000UL
+
+#define UVH_EVENT_OCCURRED0_EXTIO_INT0_MASK ( \
+ is_uv(UV4) ? 0x1000000000000000UL : \
+ is_uv(UV3) ? 0x0040000000000000UL : \
+ is_uv(UV2) ? 0x0040000000000000UL : \
+ 0)
+#define UVH_EVENT_OCCURRED0_EXTIO_INT0_SHFT ( \
+ is_uv(UV4) ? 60 : \
+ is_uv(UV3) ? 54 : \
+ is_uv(UV2) ? 54 : \
+ -1)
+
union uvh_event_occurred0_u {
- unsigned long v;
- struct uvh_event_occurred0_s {
- unsigned long lb_hcerr : 1; /* RW, W1C */
- unsigned long gr0_hcerr : 1; /* RW, W1C */
- unsigned long gr1_hcerr : 1; /* RW, W1C */
- unsigned long lh_hcerr : 1; /* RW, W1C */
- unsigned long rh_hcerr : 1; /* RW, W1C */
- unsigned long xn_hcerr : 1; /* RW, W1C */
- unsigned long si_hcerr : 1; /* RW, W1C */
- unsigned long lb_aoerr0 : 1; /* RW, W1C */
- unsigned long gr0_aoerr0 : 1; /* RW, W1C */
- unsigned long gr1_aoerr0 : 1; /* RW, W1C */
- unsigned long lh_aoerr0 : 1; /* RW, W1C */
- unsigned long rh_aoerr0 : 1; /* RW, W1C */
- unsigned long xn_aoerr0 : 1; /* RW, W1C */
- unsigned long si_aoerr0 : 1; /* RW, W1C */
- unsigned long lb_aoerr1 : 1; /* RW, W1C */
- unsigned long gr0_aoerr1 : 1; /* RW, W1C */
- unsigned long gr1_aoerr1 : 1; /* RW, W1C */
- unsigned long lh_aoerr1 : 1; /* RW, W1C */
- unsigned long rh_aoerr1 : 1; /* RW, W1C */
- unsigned long xn_aoerr1 : 1; /* RW, W1C */
- unsigned long si_aoerr1 : 1; /* RW, W1C */
- unsigned long rh_vpi_int : 1; /* RW, W1C */
- unsigned long system_shutdown_int : 1; /* RW, W1C */
- unsigned long lb_irq_int_0 : 1; /* RW, W1C */
- unsigned long lb_irq_int_1 : 1; /* RW, W1C */
- unsigned long lb_irq_int_2 : 1; /* RW, W1C */
- unsigned long lb_irq_int_3 : 1; /* RW, W1C */
- unsigned long lb_irq_int_4 : 1; /* RW, W1C */
- unsigned long lb_irq_int_5 : 1; /* RW, W1C */
- unsigned long lb_irq_int_6 : 1; /* RW, W1C */
- unsigned long lb_irq_int_7 : 1; /* RW, W1C */
- unsigned long lb_irq_int_8 : 1; /* RW, W1C */
- unsigned long lb_irq_int_9 : 1; /* RW, W1C */
- unsigned long lb_irq_int_10 : 1; /* RW, W1C */
- unsigned long lb_irq_int_11 : 1; /* RW, W1C */
- unsigned long lb_irq_int_12 : 1; /* RW, W1C */
- unsigned long lb_irq_int_13 : 1; /* RW, W1C */
- unsigned long lb_irq_int_14 : 1; /* RW, W1C */
- unsigned long lb_irq_int_15 : 1; /* RW, W1C */
- unsigned long l1_nmi_int : 1; /* RW, W1C */
- unsigned long stop_clock : 1; /* RW, W1C */
- unsigned long asic_to_l1 : 1; /* RW, W1C */
- unsigned long l1_to_asic : 1; /* RW, W1C */
- unsigned long ltc_int : 1; /* RW, W1C */
- unsigned long la_seq_trigger : 1; /* RW, W1C */
- unsigned long ipi_int : 1; /* RW, W1C */
- unsigned long extio_int0 : 1; /* RW, W1C */
- unsigned long extio_int1 : 1; /* RW, W1C */
- unsigned long extio_int2 : 1; /* RW, W1C */
- unsigned long extio_int3 : 1; /* RW, W1C */
- unsigned long profile_int : 1; /* RW, W1C */
- unsigned long rtc0 : 1; /* RW, W1C */
- unsigned long rtc1 : 1; /* RW, W1C */
- unsigned long rtc2 : 1; /* RW, W1C */
- unsigned long rtc3 : 1; /* RW, W1C */
- unsigned long bau_data : 1; /* RW, W1C */
- unsigned long power_management_req : 1; /* RW, W1C */
- unsigned long rsvd_57_63 : 7; /* */
- } s;
+ unsigned long v;
+
+ /* UVH common struct */
+ struct uvh_event_occurred0_s {
+ unsigned long lb_hcerr:1; /* RW */
+ unsigned long rsvd_1_63:63;
+ } s;
+
+ /* UVXH common struct */
+ struct uvxh_event_occurred0_s {
+ unsigned long lb_hcerr:1; /* RW */
+ unsigned long rsvd_1:1;
+ unsigned long rh_hcerr:1; /* RW */
+ unsigned long lh0_hcerr:1; /* RW */
+ unsigned long lh1_hcerr:1; /* RW */
+ unsigned long gr0_hcerr:1; /* RW */
+ unsigned long gr1_hcerr:1; /* RW */
+ unsigned long ni0_hcerr:1; /* RW */
+ unsigned long ni1_hcerr:1; /* RW */
+ unsigned long lb_aoerr0:1; /* RW */
+ unsigned long rsvd_10:1;
+ unsigned long rh_aoerr0:1; /* RW */
+ unsigned long lh0_aoerr0:1; /* RW */
+ unsigned long lh1_aoerr0:1; /* RW */
+ unsigned long gr0_aoerr0:1; /* RW */
+ unsigned long gr1_aoerr0:1; /* RW */
+ unsigned long xb_aoerr0:1; /* RW */
+ unsigned long rsvd_17_63:47;
+ } sx;
+
+ /* UVYH common struct */
+ struct uvyh_event_occurred0_s {
+ unsigned long lb_hcerr:1; /* RW */
+ unsigned long kt_hcerr:1; /* RW */
+ unsigned long rh0_hcerr:1; /* RW */
+ unsigned long rh1_hcerr:1; /* RW */
+ unsigned long lh0_hcerr:1; /* RW */
+ unsigned long lh1_hcerr:1; /* RW */
+ unsigned long lh2_hcerr:1; /* RW */
+ unsigned long lh3_hcerr:1; /* RW */
+ unsigned long xb_hcerr:1; /* RW */
+ unsigned long rdm_hcerr:1; /* RW */
+ unsigned long ni0_hcerr:1; /* RW */
+ unsigned long ni1_hcerr:1; /* RW */
+ unsigned long lb_aoerr0:1; /* RW */
+ unsigned long kt_aoerr0:1; /* RW */
+ unsigned long rh0_aoerr0:1; /* RW */
+ unsigned long rh1_aoerr0:1; /* RW */
+ unsigned long lh0_aoerr0:1; /* RW */
+ unsigned long lh1_aoerr0:1; /* RW */
+ unsigned long lh2_aoerr0:1; /* RW */
+ unsigned long lh3_aoerr0:1; /* RW */
+ unsigned long xb_aoerr0:1; /* RW */
+ unsigned long rdm_aoerr0:1; /* RW */
+ unsigned long rt0_aoerr0:1; /* RW */
+ unsigned long rt1_aoerr0:1; /* RW */
+ unsigned long ni0_aoerr0:1; /* RW */
+ unsigned long ni1_aoerr0:1; /* RW */
+ unsigned long lb_aoerr1:1; /* RW */
+ unsigned long kt_aoerr1:1; /* RW */
+ unsigned long rh0_aoerr1:1; /* RW */
+ unsigned long rh1_aoerr1:1; /* RW */
+ unsigned long lh0_aoerr1:1; /* RW */
+ unsigned long lh1_aoerr1:1; /* RW */
+ unsigned long lh2_aoerr1:1; /* RW */
+ unsigned long lh3_aoerr1:1; /* RW */
+ unsigned long xb_aoerr1:1; /* RW */
+ unsigned long rdm_aoerr1:1; /* RW */
+ unsigned long rt0_aoerr1:1; /* RW */
+ unsigned long rt1_aoerr1:1; /* RW */
+ unsigned long ni0_aoerr1:1; /* RW */
+ unsigned long ni1_aoerr1:1; /* RW */
+ unsigned long system_shutdown_int:1; /* RW */
+ unsigned long lb_irq_int_0:1; /* RW */
+ unsigned long lb_irq_int_1:1; /* RW */
+ unsigned long lb_irq_int_2:1; /* RW */
+ unsigned long lb_irq_int_3:1; /* RW */
+ unsigned long lb_irq_int_4:1; /* RW */
+ unsigned long lb_irq_int_5:1; /* RW */
+ unsigned long lb_irq_int_6:1; /* RW */
+ unsigned long lb_irq_int_7:1; /* RW */
+ unsigned long lb_irq_int_8:1; /* RW */
+ unsigned long lb_irq_int_9:1; /* RW */
+ unsigned long lb_irq_int_10:1; /* RW */
+ unsigned long lb_irq_int_11:1; /* RW */
+ unsigned long lb_irq_int_12:1; /* RW */
+ unsigned long lb_irq_int_13:1; /* RW */
+ unsigned long lb_irq_int_14:1; /* RW */
+ unsigned long lb_irq_int_15:1; /* RW */
+ unsigned long l1_nmi_int:1; /* RW */
+ unsigned long stop_clock:1; /* RW */
+ unsigned long asic_to_l1:1; /* RW */
+ unsigned long l1_to_asic:1; /* RW */
+ unsigned long la_seq_trigger:1; /* RW */
+ unsigned long rsvd_62_63:2;
+ } sy;
+
+ /* UV5 unique struct */
+ struct uv5h_event_occurred0_s {
+ unsigned long lb_hcerr:1; /* RW */
+ unsigned long kt_hcerr:1; /* RW */
+ unsigned long rh0_hcerr:1; /* RW */
+ unsigned long rh1_hcerr:1; /* RW */
+ unsigned long lh0_hcerr:1; /* RW */
+ unsigned long lh1_hcerr:1; /* RW */
+ unsigned long lh2_hcerr:1; /* RW */
+ unsigned long lh3_hcerr:1; /* RW */
+ unsigned long xb_hcerr:1; /* RW */
+ unsigned long rdm_hcerr:1; /* RW */
+ unsigned long ni0_hcerr:1; /* RW */
+ unsigned long ni1_hcerr:1; /* RW */
+ unsigned long lb_aoerr0:1; /* RW */
+ unsigned long kt_aoerr0:1; /* RW */
+ unsigned long rh0_aoerr0:1; /* RW */
+ unsigned long rh1_aoerr0:1; /* RW */
+ unsigned long lh0_aoerr0:1; /* RW */
+ unsigned long lh1_aoerr0:1; /* RW */
+ unsigned long lh2_aoerr0:1; /* RW */
+ unsigned long lh3_aoerr0:1; /* RW */
+ unsigned long xb_aoerr0:1; /* RW */
+ unsigned long rdm_aoerr0:1; /* RW */
+ unsigned long rt0_aoerr0:1; /* RW */
+ unsigned long rt1_aoerr0:1; /* RW */
+ unsigned long ni0_aoerr0:1; /* RW */
+ unsigned long ni1_aoerr0:1; /* RW */
+ unsigned long lb_aoerr1:1; /* RW */
+ unsigned long kt_aoerr1:1; /* RW */
+ unsigned long rh0_aoerr1:1; /* RW */
+ unsigned long rh1_aoerr1:1; /* RW */
+ unsigned long lh0_aoerr1:1; /* RW */
+ unsigned long lh1_aoerr1:1; /* RW */
+ unsigned long lh2_aoerr1:1; /* RW */
+ unsigned long lh3_aoerr1:1; /* RW */
+ unsigned long xb_aoerr1:1; /* RW */
+ unsigned long rdm_aoerr1:1; /* RW */
+ unsigned long rt0_aoerr1:1; /* RW */
+ unsigned long rt1_aoerr1:1; /* RW */
+ unsigned long ni0_aoerr1:1; /* RW */
+ unsigned long ni1_aoerr1:1; /* RW */
+ unsigned long system_shutdown_int:1; /* RW */
+ unsigned long lb_irq_int_0:1; /* RW */
+ unsigned long lb_irq_int_1:1; /* RW */
+ unsigned long lb_irq_int_2:1; /* RW */
+ unsigned long lb_irq_int_3:1; /* RW */
+ unsigned long lb_irq_int_4:1; /* RW */
+ unsigned long lb_irq_int_5:1; /* RW */
+ unsigned long lb_irq_int_6:1; /* RW */
+ unsigned long lb_irq_int_7:1; /* RW */
+ unsigned long lb_irq_int_8:1; /* RW */
+ unsigned long lb_irq_int_9:1; /* RW */
+ unsigned long lb_irq_int_10:1; /* RW */
+ unsigned long lb_irq_int_11:1; /* RW */
+ unsigned long lb_irq_int_12:1; /* RW */
+ unsigned long lb_irq_int_13:1; /* RW */
+ unsigned long lb_irq_int_14:1; /* RW */
+ unsigned long lb_irq_int_15:1; /* RW */
+ unsigned long l1_nmi_int:1; /* RW */
+ unsigned long stop_clock:1; /* RW */
+ unsigned long asic_to_l1:1; /* RW */
+ unsigned long l1_to_asic:1; /* RW */
+ unsigned long la_seq_trigger:1; /* RW */
+ unsigned long rsvd_62_63:2;
+ } s5;
+
+ /* UV4 unique struct */
+ struct uv4h_event_occurred0_s {
+ unsigned long lb_hcerr:1; /* RW */
+ unsigned long kt_hcerr:1; /* RW */
+ unsigned long rh_hcerr:1; /* RW */
+ unsigned long lh0_hcerr:1; /* RW */
+ unsigned long lh1_hcerr:1; /* RW */
+ unsigned long gr0_hcerr:1; /* RW */
+ unsigned long gr1_hcerr:1; /* RW */
+ unsigned long ni0_hcerr:1; /* RW */
+ unsigned long ni1_hcerr:1; /* RW */
+ unsigned long lb_aoerr0:1; /* RW */
+ unsigned long kt_aoerr0:1; /* RW */
+ unsigned long rh_aoerr0:1; /* RW */
+ unsigned long lh0_aoerr0:1; /* RW */
+ unsigned long lh1_aoerr0:1; /* RW */
+ unsigned long gr0_aoerr0:1; /* RW */
+ unsigned long gr1_aoerr0:1; /* RW */
+ unsigned long xb_aoerr0:1; /* RW */
+ unsigned long rtq0_aoerr0:1; /* RW */
+ unsigned long rtq1_aoerr0:1; /* RW */
+ unsigned long rtq2_aoerr0:1; /* RW */
+ unsigned long rtq3_aoerr0:1; /* RW */
+ unsigned long ni0_aoerr0:1; /* RW */
+ unsigned long ni1_aoerr0:1; /* RW */
+ unsigned long lb_aoerr1:1; /* RW */
+ unsigned long kt_aoerr1:1; /* RW */
+ unsigned long rh_aoerr1:1; /* RW */
+ unsigned long lh0_aoerr1:1; /* RW */
+ unsigned long lh1_aoerr1:1; /* RW */
+ unsigned long gr0_aoerr1:1; /* RW */
+ unsigned long gr1_aoerr1:1; /* RW */
+ unsigned long xb_aoerr1:1; /* RW */
+ unsigned long rtq0_aoerr1:1; /* RW */
+ unsigned long rtq1_aoerr1:1; /* RW */
+ unsigned long rtq2_aoerr1:1; /* RW */
+ unsigned long rtq3_aoerr1:1; /* RW */
+ unsigned long ni0_aoerr1:1; /* RW */
+ unsigned long ni1_aoerr1:1; /* RW */
+ unsigned long system_shutdown_int:1; /* RW */
+ unsigned long lb_irq_int_0:1; /* RW */
+ unsigned long lb_irq_int_1:1; /* RW */
+ unsigned long lb_irq_int_2:1; /* RW */
+ unsigned long lb_irq_int_3:1; /* RW */
+ unsigned long lb_irq_int_4:1; /* RW */
+ unsigned long lb_irq_int_5:1; /* RW */
+ unsigned long lb_irq_int_6:1; /* RW */
+ unsigned long lb_irq_int_7:1; /* RW */
+ unsigned long lb_irq_int_8:1; /* RW */
+ unsigned long lb_irq_int_9:1; /* RW */
+ unsigned long lb_irq_int_10:1; /* RW */
+ unsigned long lb_irq_int_11:1; /* RW */
+ unsigned long lb_irq_int_12:1; /* RW */
+ unsigned long lb_irq_int_13:1; /* RW */
+ unsigned long lb_irq_int_14:1; /* RW */
+ unsigned long lb_irq_int_15:1; /* RW */
+ unsigned long l1_nmi_int:1; /* RW */
+ unsigned long stop_clock:1; /* RW */
+ unsigned long asic_to_l1:1; /* RW */
+ unsigned long l1_to_asic:1; /* RW */
+ unsigned long la_seq_trigger:1; /* RW */
+ unsigned long ipi_int:1; /* RW */
+ unsigned long extio_int0:1; /* RW */
+ unsigned long extio_int1:1; /* RW */
+ unsigned long extio_int2:1; /* RW */
+ unsigned long extio_int3:1; /* RW */
+ } s4;
+
+ /* UV3 unique struct */
+ struct uv3h_event_occurred0_s {
+ unsigned long lb_hcerr:1; /* RW */
+ unsigned long qp_hcerr:1; /* RW */
+ unsigned long rh_hcerr:1; /* RW */
+ unsigned long lh0_hcerr:1; /* RW */
+ unsigned long lh1_hcerr:1; /* RW */
+ unsigned long gr0_hcerr:1; /* RW */
+ unsigned long gr1_hcerr:1; /* RW */
+ unsigned long ni0_hcerr:1; /* RW */
+ unsigned long ni1_hcerr:1; /* RW */
+ unsigned long lb_aoerr0:1; /* RW */
+ unsigned long qp_aoerr0:1; /* RW */
+ unsigned long rh_aoerr0:1; /* RW */
+ unsigned long lh0_aoerr0:1; /* RW */
+ unsigned long lh1_aoerr0:1; /* RW */
+ unsigned long gr0_aoerr0:1; /* RW */
+ unsigned long gr1_aoerr0:1; /* RW */
+ unsigned long xb_aoerr0:1; /* RW */
+ unsigned long rt_aoerr0:1; /* RW */
+ unsigned long ni0_aoerr0:1; /* RW */
+ unsigned long ni1_aoerr0:1; /* RW */
+ unsigned long lb_aoerr1:1; /* RW */
+ unsigned long qp_aoerr1:1; /* RW */
+ unsigned long rh_aoerr1:1; /* RW */
+ unsigned long lh0_aoerr1:1; /* RW */
+ unsigned long lh1_aoerr1:1; /* RW */
+ unsigned long gr0_aoerr1:1; /* RW */
+ unsigned long gr1_aoerr1:1; /* RW */
+ unsigned long xb_aoerr1:1; /* RW */
+ unsigned long rt_aoerr1:1; /* RW */
+ unsigned long ni0_aoerr1:1; /* RW */
+ unsigned long ni1_aoerr1:1; /* RW */
+ unsigned long system_shutdown_int:1; /* RW */
+ unsigned long lb_irq_int_0:1; /* RW */
+ unsigned long lb_irq_int_1:1; /* RW */
+ unsigned long lb_irq_int_2:1; /* RW */
+ unsigned long lb_irq_int_3:1; /* RW */
+ unsigned long lb_irq_int_4:1; /* RW */
+ unsigned long lb_irq_int_5:1; /* RW */
+ unsigned long lb_irq_int_6:1; /* RW */
+ unsigned long lb_irq_int_7:1; /* RW */
+ unsigned long lb_irq_int_8:1; /* RW */
+ unsigned long lb_irq_int_9:1; /* RW */
+ unsigned long lb_irq_int_10:1; /* RW */
+ unsigned long lb_irq_int_11:1; /* RW */
+ unsigned long lb_irq_int_12:1; /* RW */
+ unsigned long lb_irq_int_13:1; /* RW */
+ unsigned long lb_irq_int_14:1; /* RW */
+ unsigned long lb_irq_int_15:1; /* RW */
+ unsigned long l1_nmi_int:1; /* RW */
+ unsigned long stop_clock:1; /* RW */
+ unsigned long asic_to_l1:1; /* RW */
+ unsigned long l1_to_asic:1; /* RW */
+ unsigned long la_seq_trigger:1; /* RW */
+ unsigned long ipi_int:1; /* RW */
+ unsigned long extio_int0:1; /* RW */
+ unsigned long extio_int1:1; /* RW */
+ unsigned long extio_int2:1; /* RW */
+ unsigned long extio_int3:1; /* RW */
+ unsigned long profile_int:1; /* RW */
+ unsigned long rsvd_59_63:5;
+ } s3;
+
+ /* UV2 unique struct */
+ struct uv2h_event_occurred0_s {
+ unsigned long lb_hcerr:1; /* RW */
+ unsigned long qp_hcerr:1; /* RW */
+ unsigned long rh_hcerr:1; /* RW */
+ unsigned long lh0_hcerr:1; /* RW */
+ unsigned long lh1_hcerr:1; /* RW */
+ unsigned long gr0_hcerr:1; /* RW */
+ unsigned long gr1_hcerr:1; /* RW */
+ unsigned long ni0_hcerr:1; /* RW */
+ unsigned long ni1_hcerr:1; /* RW */
+ unsigned long lb_aoerr0:1; /* RW */
+ unsigned long qp_aoerr0:1; /* RW */
+ unsigned long rh_aoerr0:1; /* RW */
+ unsigned long lh0_aoerr0:1; /* RW */
+ unsigned long lh1_aoerr0:1; /* RW */
+ unsigned long gr0_aoerr0:1; /* RW */
+ unsigned long gr1_aoerr0:1; /* RW */
+ unsigned long xb_aoerr0:1; /* RW */
+ unsigned long rt_aoerr0:1; /* RW */
+ unsigned long ni0_aoerr0:1; /* RW */
+ unsigned long ni1_aoerr0:1; /* RW */
+ unsigned long lb_aoerr1:1; /* RW */
+ unsigned long qp_aoerr1:1; /* RW */
+ unsigned long rh_aoerr1:1; /* RW */
+ unsigned long lh0_aoerr1:1; /* RW */
+ unsigned long lh1_aoerr1:1; /* RW */
+ unsigned long gr0_aoerr1:1; /* RW */
+ unsigned long gr1_aoerr1:1; /* RW */
+ unsigned long xb_aoerr1:1; /* RW */
+ unsigned long rt_aoerr1:1; /* RW */
+ unsigned long ni0_aoerr1:1; /* RW */
+ unsigned long ni1_aoerr1:1; /* RW */
+ unsigned long system_shutdown_int:1; /* RW */
+ unsigned long lb_irq_int_0:1; /* RW */
+ unsigned long lb_irq_int_1:1; /* RW */
+ unsigned long lb_irq_int_2:1; /* RW */
+ unsigned long lb_irq_int_3:1; /* RW */
+ unsigned long lb_irq_int_4:1; /* RW */
+ unsigned long lb_irq_int_5:1; /* RW */
+ unsigned long lb_irq_int_6:1; /* RW */
+ unsigned long lb_irq_int_7:1; /* RW */
+ unsigned long lb_irq_int_8:1; /* RW */
+ unsigned long lb_irq_int_9:1; /* RW */
+ unsigned long lb_irq_int_10:1; /* RW */
+ unsigned long lb_irq_int_11:1; /* RW */
+ unsigned long lb_irq_int_12:1; /* RW */
+ unsigned long lb_irq_int_13:1; /* RW */
+ unsigned long lb_irq_int_14:1; /* RW */
+ unsigned long lb_irq_int_15:1; /* RW */
+ unsigned long l1_nmi_int:1; /* RW */
+ unsigned long stop_clock:1; /* RW */
+ unsigned long asic_to_l1:1; /* RW */
+ unsigned long l1_to_asic:1; /* RW */
+ unsigned long la_seq_trigger:1; /* RW */
+ unsigned long ipi_int:1; /* RW */
+ unsigned long extio_int0:1; /* RW */
+ unsigned long extio_int1:1; /* RW */
+ unsigned long extio_int2:1; /* RW */
+ unsigned long extio_int3:1; /* RW */
+ unsigned long profile_int:1; /* RW */
+ unsigned long rsvd_59_63:5;
+ } s2;
};
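(Editor's note, not part of the patch.) The is_uv()-dispatched UVH_* wrappers above let generic code test the same logical event without caring which hub revision placed the bit where; on revisions that lack the event the mask collapses to 0 and the shift to -1. A minimal hedged sketch of that pattern, assuming uv_read_local_mmr() from the UV hub support code and the UVH_EVENT_OCCURRED0 offset defined earlier in this header:

	/* Hedged usage sketch: test EXTIO_INT0 in EVENT_OCCURRED0. */
	static inline int uv_extio_int0_pending(void)
	{
		union uvh_event_occurred0_u eo;

		eo.v = uv_read_local_mmr(UVH_EVENT_OCCURRED0);

		/* The dispatch macro picks the UV2/UV3/UV4 bit at run time
		 * and evaluates to 0 on revisions without this event. */
		return (eo.v & UVH_EVENT_OCCURRED0_EXTIO_INT0_MASK) != 0;
	}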
/* ========================================================================= */
/* UVH_EVENT_OCCURRED0_ALIAS */
/* ========================================================================= */
-#define UVH_EVENT_OCCURRED0_ALIAS 0x0000000000070008UL
-#define UVH_EVENT_OCCURRED0_ALIAS_32 0x005f0
+#define UVH_EVENT_OCCURRED0_ALIAS 0x70008UL
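(Editor's note, not part of the patch.) The _ALIAS offset is conventionally the write-to-clear shadow of EVENT_OCCURRED0: pending bits are acknowledged by writing their mask to the alias rather than read-modify-writing the register itself. A hedged sketch, assuming uv_write_local_mmr() from the UV hub support code:

	/* Hedged sketch: acknowledge (clear) a pending EXTIO_INT0 event. */
	static inline void uv_clear_extio_int0(void)
	{
		/* Writing 1 to a bit in the alias clears it in EVENT_OCCURRED0. */
		uv_write_local_mmr(UVH_EVENT_OCCURRED0_ALIAS,
				   UVH_EVENT_OCCURRED0_EXTIO_INT0_MASK);
	}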
+
+
+/* ========================================================================= */
+/* UVH_EVENT_OCCURRED1 */
+/* ========================================================================= */
+#define UVH_EVENT_OCCURRED1 0x70080UL
+
+
+
+/* UVYH common defines */
+#define UVYH_EVENT_OCCURRED1_IPI_INT_SHFT 0
+#define UVYH_EVENT_OCCURRED1_IPI_INT_MASK 0x0000000000000001UL
+#define UVYH_EVENT_OCCURRED1_EXTIO_INT0_SHFT 1
+#define UVYH_EVENT_OCCURRED1_EXTIO_INT0_MASK 0x0000000000000002UL
+#define UVYH_EVENT_OCCURRED1_EXTIO_INT1_SHFT 2
+#define UVYH_EVENT_OCCURRED1_EXTIO_INT1_MASK 0x0000000000000004UL
+#define UVYH_EVENT_OCCURRED1_EXTIO_INT2_SHFT 3
+#define UVYH_EVENT_OCCURRED1_EXTIO_INT2_MASK 0x0000000000000008UL
+#define UVYH_EVENT_OCCURRED1_EXTIO_INT3_SHFT 4
+#define UVYH_EVENT_OCCURRED1_EXTIO_INT3_MASK 0x0000000000000010UL
+#define UVYH_EVENT_OCCURRED1_PROFILE_INT_SHFT 5
+#define UVYH_EVENT_OCCURRED1_PROFILE_INT_MASK 0x0000000000000020UL
+#define UVYH_EVENT_OCCURRED1_BAU_DATA_SHFT 6
+#define UVYH_EVENT_OCCURRED1_BAU_DATA_MASK 0x0000000000000040UL
+#define UVYH_EVENT_OCCURRED1_PROC_GENERAL_SHFT 7
+#define UVYH_EVENT_OCCURRED1_PROC_GENERAL_MASK 0x0000000000000080UL
+#define UVYH_EVENT_OCCURRED1_XH_TLB_INT0_SHFT 8
+#define UVYH_EVENT_OCCURRED1_XH_TLB_INT0_MASK 0x0000000000000100UL
+#define UVYH_EVENT_OCCURRED1_XH_TLB_INT1_SHFT 9
+#define UVYH_EVENT_OCCURRED1_XH_TLB_INT1_MASK 0x0000000000000200UL
+#define UVYH_EVENT_OCCURRED1_XH_TLB_INT2_SHFT 10
+#define UVYH_EVENT_OCCURRED1_XH_TLB_INT2_MASK 0x0000000000000400UL
+#define UVYH_EVENT_OCCURRED1_XH_TLB_INT3_SHFT 11
+#define UVYH_EVENT_OCCURRED1_XH_TLB_INT3_MASK 0x0000000000000800UL
+#define UVYH_EVENT_OCCURRED1_XH_TLB_INT4_SHFT 12
+#define UVYH_EVENT_OCCURRED1_XH_TLB_INT4_MASK 0x0000000000001000UL
+#define UVYH_EVENT_OCCURRED1_XH_TLB_INT5_SHFT 13
+#define UVYH_EVENT_OCCURRED1_XH_TLB_INT5_MASK 0x0000000000002000UL
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT0_SHFT 14
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT0_MASK 0x0000000000004000UL
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT1_SHFT 15
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT1_MASK 0x0000000000008000UL
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT2_SHFT 16
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT2_MASK 0x0000000000010000UL
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT3_SHFT 17
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT3_MASK 0x0000000000020000UL
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT4_SHFT 18
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT4_MASK 0x0000000000040000UL
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT5_SHFT 19
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT5_MASK 0x0000000000080000UL
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT6_SHFT 20
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT6_MASK 0x0000000000100000UL
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT7_SHFT 21
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT7_MASK 0x0000000000200000UL
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT8_SHFT 22
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT8_MASK 0x0000000000400000UL
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT9_SHFT 23
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT9_MASK 0x0000000000800000UL
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT10_SHFT 24
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT10_MASK 0x0000000001000000UL
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT11_SHFT 25
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT11_MASK 0x0000000002000000UL
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT12_SHFT 26
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT12_MASK 0x0000000004000000UL
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT13_SHFT 27
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT13_MASK 0x0000000008000000UL
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT14_SHFT 28
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT14_MASK 0x0000000010000000UL
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT15_SHFT 29
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT15_MASK 0x0000000020000000UL
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT16_SHFT 30
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT16_MASK 0x0000000040000000UL
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT17_SHFT 31
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT17_MASK 0x0000000080000000UL
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT18_SHFT 32
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT18_MASK 0x0000000100000000UL
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT19_SHFT 33
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT19_MASK 0x0000000200000000UL
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT20_SHFT 34
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT20_MASK 0x0000000400000000UL
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT21_SHFT 35
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT21_MASK 0x0000000800000000UL
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT22_SHFT 36
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT22_MASK 0x0000001000000000UL
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT23_SHFT 37
+#define UVYH_EVENT_OCCURRED1_RDM_TLB_INT23_MASK 0x0000002000000000UL
+
+/* UV4 unique defines */
+#define UV4H_EVENT_OCCURRED1_PROFILE_INT_SHFT 0
+#define UV4H_EVENT_OCCURRED1_PROFILE_INT_MASK 0x0000000000000001UL
+#define UV4H_EVENT_OCCURRED1_BAU_DATA_SHFT 1
+#define UV4H_EVENT_OCCURRED1_BAU_DATA_MASK 0x0000000000000002UL
+#define UV4H_EVENT_OCCURRED1_PROC_GENERAL_SHFT 2
+#define UV4H_EVENT_OCCURRED1_PROC_GENERAL_MASK 0x0000000000000004UL
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT0_SHFT 3
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT0_MASK 0x0000000000000008UL
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT1_SHFT 4
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT1_MASK 0x0000000000000010UL
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT2_SHFT 5
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT2_MASK 0x0000000000000020UL
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT3_SHFT 6
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT3_MASK 0x0000000000000040UL
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT4_SHFT 7
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT4_MASK 0x0000000000000080UL
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT5_SHFT 8
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT5_MASK 0x0000000000000100UL
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT6_SHFT 9
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT6_MASK 0x0000000000000200UL
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT7_SHFT 10
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT7_MASK 0x0000000000000400UL
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT8_SHFT 11
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT8_MASK 0x0000000000000800UL
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT9_SHFT 12
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT9_MASK 0x0000000000001000UL
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT10_SHFT 13
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT10_MASK 0x0000000000002000UL
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT11_SHFT 14
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT11_MASK 0x0000000000004000UL
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT12_SHFT 15
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT12_MASK 0x0000000000008000UL
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT13_SHFT 16
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT13_MASK 0x0000000000010000UL
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT14_SHFT 17
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT14_MASK 0x0000000000020000UL
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT15_SHFT 18
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT15_MASK 0x0000000000040000UL
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT16_SHFT 19
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT16_MASK 0x0000000000080000UL
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT17_SHFT 20
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT17_MASK 0x0000000000100000UL
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT18_SHFT 21
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT18_MASK 0x0000000000200000UL
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT19_SHFT 22
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT19_MASK 0x0000000000400000UL
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT20_SHFT 23
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT20_MASK 0x0000000000800000UL
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT21_SHFT 24
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT21_MASK 0x0000000001000000UL
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT22_SHFT 25
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT22_MASK 0x0000000002000000UL
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT23_SHFT 26
+#define UV4H_EVENT_OCCURRED1_GR0_TLB_INT23_MASK 0x0000000004000000UL
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT0_SHFT 27
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT0_MASK 0x0000000008000000UL
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT1_SHFT 28
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT1_MASK 0x0000000010000000UL
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT2_SHFT 29
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT2_MASK 0x0000000020000000UL
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT3_SHFT 30
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT3_MASK 0x0000000040000000UL
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT4_SHFT 31
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT4_MASK 0x0000000080000000UL
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT5_SHFT 32
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT5_MASK 0x0000000100000000UL
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT6_SHFT 33
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT6_MASK 0x0000000200000000UL
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT7_SHFT 34
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT7_MASK 0x0000000400000000UL
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT8_SHFT 35
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT8_MASK 0x0000000800000000UL
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT9_SHFT 36
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT9_MASK 0x0000001000000000UL
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT10_SHFT 37
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT10_MASK 0x0000002000000000UL
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT11_SHFT 38
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT11_MASK 0x0000004000000000UL
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT12_SHFT 39
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT12_MASK 0x0000008000000000UL
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT13_SHFT 40
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT13_MASK 0x0000010000000000UL
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT14_SHFT 41
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT14_MASK 0x0000020000000000UL
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT15_SHFT 42
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT15_MASK 0x0000040000000000UL
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT16_SHFT 43
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT16_MASK 0x0000080000000000UL
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT17_SHFT 44
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT17_MASK 0x0000100000000000UL
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT18_SHFT 45
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT18_MASK 0x0000200000000000UL
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT19_SHFT 46
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT19_MASK 0x0000400000000000UL
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT20_SHFT 47
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT20_MASK 0x0000800000000000UL
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT21_SHFT 48
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT21_MASK 0x0001000000000000UL
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT22_SHFT 49
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT22_MASK 0x0002000000000000UL
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT23_SHFT 50
+#define UV4H_EVENT_OCCURRED1_GR1_TLB_INT23_MASK 0x0004000000000000UL
+
+/* UV3 unique defines */
+#define UV3H_EVENT_OCCURRED1_BAU_DATA_SHFT 0
+#define UV3H_EVENT_OCCURRED1_BAU_DATA_MASK 0x0000000000000001UL
+#define UV3H_EVENT_OCCURRED1_POWER_MANAGEMENT_REQ_SHFT 1
+#define UV3H_EVENT_OCCURRED1_POWER_MANAGEMENT_REQ_MASK 0x0000000000000002UL
+#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT0_SHFT 2
+#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT0_MASK 0x0000000000000004UL
+#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT1_SHFT 3
+#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT1_MASK 0x0000000000000008UL
+#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT2_SHFT 4
+#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT2_MASK 0x0000000000000010UL
+#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT3_SHFT 5
+#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT3_MASK 0x0000000000000020UL
+#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT4_SHFT 6
+#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT4_MASK 0x0000000000000040UL
+#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT5_SHFT 7
+#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT5_MASK 0x0000000000000080UL
+#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT6_SHFT 8
+#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT6_MASK 0x0000000000000100UL
+#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT7_SHFT 9
+#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT7_MASK 0x0000000000000200UL
+#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT8_SHFT 10
+#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT8_MASK 0x0000000000000400UL
+#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT9_SHFT 11
+#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT9_MASK 0x0000000000000800UL
+#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT10_SHFT 12
+#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT10_MASK 0x0000000000001000UL
+#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT11_SHFT 13
+#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT11_MASK 0x0000000000002000UL
+#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT12_SHFT 14
+#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT12_MASK 0x0000000000004000UL
+#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT13_SHFT 15
+#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT13_MASK 0x0000000000008000UL
+#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT14_SHFT 16
+#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT14_MASK 0x0000000000010000UL
+#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT15_SHFT 17
+#define UV3H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT15_MASK 0x0000000000020000UL
+#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT0_SHFT 18
+#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT0_MASK 0x0000000000040000UL
+#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT1_SHFT 19
+#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT1_MASK 0x0000000000080000UL
+#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT2_SHFT 20
+#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT2_MASK 0x0000000000100000UL
+#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT3_SHFT 21
+#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT3_MASK 0x0000000000200000UL
+#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT4_SHFT 22
+#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT4_MASK 0x0000000000400000UL
+#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT5_SHFT 23
+#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT5_MASK 0x0000000000800000UL
+#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT6_SHFT 24
+#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT6_MASK 0x0000000001000000UL
+#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT7_SHFT 25
+#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT7_MASK 0x0000000002000000UL
+#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT8_SHFT 26
+#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT8_MASK 0x0000000004000000UL
+#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT9_SHFT 27
+#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT9_MASK 0x0000000008000000UL
+#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT10_SHFT 28
+#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT10_MASK 0x0000000010000000UL
+#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT11_SHFT 29
+#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT11_MASK 0x0000000020000000UL
+#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT12_SHFT 30
+#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT12_MASK 0x0000000040000000UL
+#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT13_SHFT 31
+#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT13_MASK 0x0000000080000000UL
+#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT14_SHFT 32
+#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT14_MASK 0x0000000100000000UL
+#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT15_SHFT 33
+#define UV3H_EVENT_OCCURRED1_GR0_TLB_INT15_MASK 0x0000000200000000UL
+#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT0_SHFT 34
+#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT0_MASK 0x0000000400000000UL
+#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT1_SHFT 35
+#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT1_MASK 0x0000000800000000UL
+#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT2_SHFT 36
+#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT2_MASK 0x0000001000000000UL
+#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT3_SHFT 37
+#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT3_MASK 0x0000002000000000UL
+#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT4_SHFT 38
+#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT4_MASK 0x0000004000000000UL
+#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT5_SHFT 39
+#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT5_MASK 0x0000008000000000UL
+#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT6_SHFT 40
+#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT6_MASK 0x0000010000000000UL
+#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT7_SHFT 41
+#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT7_MASK 0x0000020000000000UL
+#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT8_SHFT 42
+#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT8_MASK 0x0000040000000000UL
+#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT9_SHFT 43
+#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT9_MASK 0x0000080000000000UL
+#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT10_SHFT 44
+#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT10_MASK 0x0000100000000000UL
+#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT11_SHFT 45
+#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT11_MASK 0x0000200000000000UL
+#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT12_SHFT 46
+#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT12_MASK 0x0000400000000000UL
+#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT13_SHFT 47
+#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT13_MASK 0x0000800000000000UL
+#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT14_SHFT 48
+#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT14_MASK 0x0001000000000000UL
+#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT15_SHFT 49
+#define UV3H_EVENT_OCCURRED1_GR1_TLB_INT15_MASK 0x0002000000000000UL
+#define UV3H_EVENT_OCCURRED1_RTC_INTERVAL_INT_SHFT 50
+#define UV3H_EVENT_OCCURRED1_RTC_INTERVAL_INT_MASK 0x0004000000000000UL
+#define UV3H_EVENT_OCCURRED1_BAU_DASHBOARD_INT_SHFT 51
+#define UV3H_EVENT_OCCURRED1_BAU_DASHBOARD_INT_MASK 0x0008000000000000UL
+
+/* UV2 unique defines */
+#define UV2H_EVENT_OCCURRED1_BAU_DATA_SHFT 0
+#define UV2H_EVENT_OCCURRED1_BAU_DATA_MASK 0x0000000000000001UL
+#define UV2H_EVENT_OCCURRED1_POWER_MANAGEMENT_REQ_SHFT 1
+#define UV2H_EVENT_OCCURRED1_POWER_MANAGEMENT_REQ_MASK 0x0000000000000002UL
+#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT0_SHFT 2
+#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT0_MASK 0x0000000000000004UL
+#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT1_SHFT 3
+#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT1_MASK 0x0000000000000008UL
+#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT2_SHFT 4
+#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT2_MASK 0x0000000000000010UL
+#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT3_SHFT 5
+#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT3_MASK 0x0000000000000020UL
+#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT4_SHFT 6
+#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT4_MASK 0x0000000000000040UL
+#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT5_SHFT 7
+#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT5_MASK 0x0000000000000080UL
+#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT6_SHFT 8
+#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT6_MASK 0x0000000000000100UL
+#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT7_SHFT 9
+#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT7_MASK 0x0000000000000200UL
+#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT8_SHFT 10
+#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT8_MASK 0x0000000000000400UL
+#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT9_SHFT 11
+#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT9_MASK 0x0000000000000800UL
+#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT10_SHFT 12
+#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT10_MASK 0x0000000000001000UL
+#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT11_SHFT 13
+#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT11_MASK 0x0000000000002000UL
+#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT12_SHFT 14
+#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT12_MASK 0x0000000000004000UL
+#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT13_SHFT 15
+#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT13_MASK 0x0000000000008000UL
+#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT14_SHFT 16
+#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT14_MASK 0x0000000000010000UL
+#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT15_SHFT 17
+#define UV2H_EVENT_OCCURRED1_MESSAGE_ACCELERATOR_INT15_MASK 0x0000000000020000UL
+#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT0_SHFT 18
+#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT0_MASK 0x0000000000040000UL
+#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT1_SHFT 19
+#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT1_MASK 0x0000000000080000UL
+#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT2_SHFT 20
+#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT2_MASK 0x0000000000100000UL
+#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT3_SHFT 21
+#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT3_MASK 0x0000000000200000UL
+#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT4_SHFT 22
+#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT4_MASK 0x0000000000400000UL
+#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT5_SHFT 23
+#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT5_MASK 0x0000000000800000UL
+#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT6_SHFT 24
+#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT6_MASK 0x0000000001000000UL
+#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT7_SHFT 25
+#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT7_MASK 0x0000000002000000UL
+#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT8_SHFT 26
+#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT8_MASK 0x0000000004000000UL
+#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT9_SHFT 27
+#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT9_MASK 0x0000000008000000UL
+#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT10_SHFT 28
+#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT10_MASK 0x0000000010000000UL
+#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT11_SHFT 29
+#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT11_MASK 0x0000000020000000UL
+#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT12_SHFT 30
+#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT12_MASK 0x0000000040000000UL
+#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT13_SHFT 31
+#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT13_MASK 0x0000000080000000UL
+#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT14_SHFT 32
+#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT14_MASK 0x0000000100000000UL
+#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT15_SHFT 33
+#define UV2H_EVENT_OCCURRED1_GR0_TLB_INT15_MASK 0x0000000200000000UL
+#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT0_SHFT 34
+#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT0_MASK 0x0000000400000000UL
+#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT1_SHFT 35
+#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT1_MASK 0x0000000800000000UL
+#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT2_SHFT 36
+#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT2_MASK 0x0000001000000000UL
+#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT3_SHFT 37
+#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT3_MASK 0x0000002000000000UL
+#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT4_SHFT 38
+#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT4_MASK 0x0000004000000000UL
+#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT5_SHFT 39
+#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT5_MASK 0x0000008000000000UL
+#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT6_SHFT 40
+#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT6_MASK 0x0000010000000000UL
+#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT7_SHFT 41
+#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT7_MASK 0x0000020000000000UL
+#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT8_SHFT 42
+#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT8_MASK 0x0000040000000000UL
+#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT9_SHFT 43
+#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT9_MASK 0x0000080000000000UL
+#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT10_SHFT 44
+#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT10_MASK 0x0000100000000000UL
+#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT11_SHFT 45
+#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT11_MASK 0x0000200000000000UL
+#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT12_SHFT 46
+#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT12_MASK 0x0000400000000000UL
+#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT13_SHFT 47
+#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT13_MASK 0x0000800000000000UL
+#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT14_SHFT 48
+#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT14_MASK 0x0001000000000000UL
+#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT15_SHFT 49
+#define UV2H_EVENT_OCCURRED1_GR1_TLB_INT15_MASK 0x0002000000000000UL
+#define UV2H_EVENT_OCCURRED1_RTC_INTERVAL_INT_SHFT 50
+#define UV2H_EVENT_OCCURRED1_RTC_INTERVAL_INT_MASK 0x0004000000000000UL
+#define UV2H_EVENT_OCCURRED1_BAU_DASHBOARD_INT_SHFT 51
+#define UV2H_EVENT_OCCURRED1_BAU_DASHBOARD_INT_MASK 0x0008000000000000UL
+
+#define UVH_EVENT_OCCURRED1_EXTIO_INT0_MASK ( \
+ is_uv(UV5) ? 0x0000000000000002UL : \
+ 0)
+#define UVH_EVENT_OCCURRED1_EXTIO_INT0_SHFT ( \
+ is_uv(UV5) ? 1 : \
+ -1)
+
+union uvyh_event_occurred1_u {
+ unsigned long v;
+
+ /* UVYH common struct */
+ struct uvyh_event_occurred1_s {
+ unsigned long ipi_int:1; /* RW */
+ unsigned long extio_int0:1; /* RW */
+ unsigned long extio_int1:1; /* RW */
+ unsigned long extio_int2:1; /* RW */
+ unsigned long extio_int3:1; /* RW */
+ unsigned long profile_int:1; /* RW */
+ unsigned long bau_data:1; /* RW */
+ unsigned long proc_general:1; /* RW */
+ unsigned long xh_tlb_int0:1; /* RW */
+ unsigned long xh_tlb_int1:1; /* RW */
+ unsigned long xh_tlb_int2:1; /* RW */
+ unsigned long xh_tlb_int3:1; /* RW */
+ unsigned long xh_tlb_int4:1; /* RW */
+ unsigned long xh_tlb_int5:1; /* RW */
+ unsigned long rdm_tlb_int0:1; /* RW */
+ unsigned long rdm_tlb_int1:1; /* RW */
+ unsigned long rdm_tlb_int2:1; /* RW */
+ unsigned long rdm_tlb_int3:1; /* RW */
+ unsigned long rdm_tlb_int4:1; /* RW */
+ unsigned long rdm_tlb_int5:1; /* RW */
+ unsigned long rdm_tlb_int6:1; /* RW */
+ unsigned long rdm_tlb_int7:1; /* RW */
+ unsigned long rdm_tlb_int8:1; /* RW */
+ unsigned long rdm_tlb_int9:1; /* RW */
+ unsigned long rdm_tlb_int10:1; /* RW */
+ unsigned long rdm_tlb_int11:1; /* RW */
+ unsigned long rdm_tlb_int12:1; /* RW */
+ unsigned long rdm_tlb_int13:1; /* RW */
+ unsigned long rdm_tlb_int14:1; /* RW */
+ unsigned long rdm_tlb_int15:1; /* RW */
+ unsigned long rdm_tlb_int16:1; /* RW */
+ unsigned long rdm_tlb_int17:1; /* RW */
+ unsigned long rdm_tlb_int18:1; /* RW */
+ unsigned long rdm_tlb_int19:1; /* RW */
+ unsigned long rdm_tlb_int20:1; /* RW */
+ unsigned long rdm_tlb_int21:1; /* RW */
+ unsigned long rdm_tlb_int22:1; /* RW */
+ unsigned long rdm_tlb_int23:1; /* RW */
+ unsigned long rsvd_38_63:26;
+ } sy;
+
+ /* UV5 unique struct */
+ struct uv5h_event_occurred1_s {
+ unsigned long ipi_int:1; /* RW */
+ unsigned long extio_int0:1; /* RW */
+ unsigned long extio_int1:1; /* RW */
+ unsigned long extio_int2:1; /* RW */
+ unsigned long extio_int3:1; /* RW */
+ unsigned long profile_int:1; /* RW */
+ unsigned long bau_data:1; /* RW */
+ unsigned long proc_general:1; /* RW */
+ unsigned long xh_tlb_int0:1; /* RW */
+ unsigned long xh_tlb_int1:1; /* RW */
+ unsigned long xh_tlb_int2:1; /* RW */
+ unsigned long xh_tlb_int3:1; /* RW */
+ unsigned long xh_tlb_int4:1; /* RW */
+ unsigned long xh_tlb_int5:1; /* RW */
+ unsigned long rdm_tlb_int0:1; /* RW */
+ unsigned long rdm_tlb_int1:1; /* RW */
+ unsigned long rdm_tlb_int2:1; /* RW */
+ unsigned long rdm_tlb_int3:1; /* RW */
+ unsigned long rdm_tlb_int4:1; /* RW */
+ unsigned long rdm_tlb_int5:1; /* RW */
+ unsigned long rdm_tlb_int6:1; /* RW */
+ unsigned long rdm_tlb_int7:1; /* RW */
+ unsigned long rdm_tlb_int8:1; /* RW */
+ unsigned long rdm_tlb_int9:1; /* RW */
+ unsigned long rdm_tlb_int10:1; /* RW */
+ unsigned long rdm_tlb_int11:1; /* RW */
+ unsigned long rdm_tlb_int12:1; /* RW */
+ unsigned long rdm_tlb_int13:1; /* RW */
+ unsigned long rdm_tlb_int14:1; /* RW */
+ unsigned long rdm_tlb_int15:1; /* RW */
+ unsigned long rdm_tlb_int16:1; /* RW */
+ unsigned long rdm_tlb_int17:1; /* RW */
+ unsigned long rdm_tlb_int18:1; /* RW */
+ unsigned long rdm_tlb_int19:1; /* RW */
+ unsigned long rdm_tlb_int20:1; /* RW */
+ unsigned long rdm_tlb_int21:1; /* RW */
+ unsigned long rdm_tlb_int22:1; /* RW */
+ unsigned long rdm_tlb_int23:1; /* RW */
+ unsigned long rsvd_38_63:26;
+ } s5;
+
+ /* UV4 unique struct */
+ struct uv4h_event_occurred1_s {
+ unsigned long profile_int:1; /* RW */
+ unsigned long bau_data:1; /* RW */
+ unsigned long proc_general:1; /* RW */
+ unsigned long gr0_tlb_int0:1; /* RW */
+ unsigned long gr0_tlb_int1:1; /* RW */
+ unsigned long gr0_tlb_int2:1; /* RW */
+ unsigned long gr0_tlb_int3:1; /* RW */
+ unsigned long gr0_tlb_int4:1; /* RW */
+ unsigned long gr0_tlb_int5:1; /* RW */
+ unsigned long gr0_tlb_int6:1; /* RW */
+ unsigned long gr0_tlb_int7:1; /* RW */
+ unsigned long gr0_tlb_int8:1; /* RW */
+ unsigned long gr0_tlb_int9:1; /* RW */
+ unsigned long gr0_tlb_int10:1; /* RW */
+ unsigned long gr0_tlb_int11:1; /* RW */
+ unsigned long gr0_tlb_int12:1; /* RW */
+ unsigned long gr0_tlb_int13:1; /* RW */
+ unsigned long gr0_tlb_int14:1; /* RW */
+ unsigned long gr0_tlb_int15:1; /* RW */
+ unsigned long gr0_tlb_int16:1; /* RW */
+ unsigned long gr0_tlb_int17:1; /* RW */
+ unsigned long gr0_tlb_int18:1; /* RW */
+ unsigned long gr0_tlb_int19:1; /* RW */
+ unsigned long gr0_tlb_int20:1; /* RW */
+ unsigned long gr0_tlb_int21:1; /* RW */
+ unsigned long gr0_tlb_int22:1; /* RW */
+ unsigned long gr0_tlb_int23:1; /* RW */
+ unsigned long gr1_tlb_int0:1; /* RW */
+ unsigned long gr1_tlb_int1:1; /* RW */
+ unsigned long gr1_tlb_int2:1; /* RW */
+ unsigned long gr1_tlb_int3:1; /* RW */
+ unsigned long gr1_tlb_int4:1; /* RW */
+ unsigned long gr1_tlb_int5:1; /* RW */
+ unsigned long gr1_tlb_int6:1; /* RW */
+ unsigned long gr1_tlb_int7:1; /* RW */
+ unsigned long gr1_tlb_int8:1; /* RW */
+ unsigned long gr1_tlb_int9:1; /* RW */
+ unsigned long gr1_tlb_int10:1; /* RW */
+ unsigned long gr1_tlb_int11:1; /* RW */
+ unsigned long gr1_tlb_int12:1; /* RW */
+ unsigned long gr1_tlb_int13:1; /* RW */
+ unsigned long gr1_tlb_int14:1; /* RW */
+ unsigned long gr1_tlb_int15:1; /* RW */
+ unsigned long gr1_tlb_int16:1; /* RW */
+ unsigned long gr1_tlb_int17:1; /* RW */
+ unsigned long gr1_tlb_int18:1; /* RW */
+ unsigned long gr1_tlb_int19:1; /* RW */
+ unsigned long gr1_tlb_int20:1; /* RW */
+ unsigned long gr1_tlb_int21:1; /* RW */
+ unsigned long gr1_tlb_int22:1; /* RW */
+ unsigned long gr1_tlb_int23:1; /* RW */
+ unsigned long rsvd_51_63:13;
+ } s4;
+
+ /* UV3 unique struct */
+ struct uv3h_event_occurred1_s {
+ unsigned long bau_data:1; /* RW */
+ unsigned long power_management_req:1; /* RW */
+ unsigned long message_accelerator_int0:1; /* RW */
+ unsigned long message_accelerator_int1:1; /* RW */
+ unsigned long message_accelerator_int2:1; /* RW */
+ unsigned long message_accelerator_int3:1; /* RW */
+ unsigned long message_accelerator_int4:1; /* RW */
+ unsigned long message_accelerator_int5:1; /* RW */
+ unsigned long message_accelerator_int6:1; /* RW */
+ unsigned long message_accelerator_int7:1; /* RW */
+ unsigned long message_accelerator_int8:1; /* RW */
+ unsigned long message_accelerator_int9:1; /* RW */
+ unsigned long message_accelerator_int10:1; /* RW */
+ unsigned long message_accelerator_int11:1; /* RW */
+ unsigned long message_accelerator_int12:1; /* RW */
+ unsigned long message_accelerator_int13:1; /* RW */
+ unsigned long message_accelerator_int14:1; /* RW */
+ unsigned long message_accelerator_int15:1; /* RW */
+ unsigned long gr0_tlb_int0:1; /* RW */
+ unsigned long gr0_tlb_int1:1; /* RW */
+ unsigned long gr0_tlb_int2:1; /* RW */
+ unsigned long gr0_tlb_int3:1; /* RW */
+ unsigned long gr0_tlb_int4:1; /* RW */
+ unsigned long gr0_tlb_int5:1; /* RW */
+ unsigned long gr0_tlb_int6:1; /* RW */
+ unsigned long gr0_tlb_int7:1; /* RW */
+ unsigned long gr0_tlb_int8:1; /* RW */
+ unsigned long gr0_tlb_int9:1; /* RW */
+ unsigned long gr0_tlb_int10:1; /* RW */
+ unsigned long gr0_tlb_int11:1; /* RW */
+ unsigned long gr0_tlb_int12:1; /* RW */
+ unsigned long gr0_tlb_int13:1; /* RW */
+ unsigned long gr0_tlb_int14:1; /* RW */
+ unsigned long gr0_tlb_int15:1; /* RW */
+ unsigned long gr1_tlb_int0:1; /* RW */
+ unsigned long gr1_tlb_int1:1; /* RW */
+ unsigned long gr1_tlb_int2:1; /* RW */
+ unsigned long gr1_tlb_int3:1; /* RW */
+ unsigned long gr1_tlb_int4:1; /* RW */
+ unsigned long gr1_tlb_int5:1; /* RW */
+ unsigned long gr1_tlb_int6:1; /* RW */
+ unsigned long gr1_tlb_int7:1; /* RW */
+ unsigned long gr1_tlb_int8:1; /* RW */
+ unsigned long gr1_tlb_int9:1; /* RW */
+ unsigned long gr1_tlb_int10:1; /* RW */
+ unsigned long gr1_tlb_int11:1; /* RW */
+ unsigned long gr1_tlb_int12:1; /* RW */
+ unsigned long gr1_tlb_int13:1; /* RW */
+ unsigned long gr1_tlb_int14:1; /* RW */
+ unsigned long gr1_tlb_int15:1; /* RW */
+ unsigned long rtc_interval_int:1; /* RW */
+ unsigned long bau_dashboard_int:1; /* RW */
+ unsigned long rsvd_52_63:12;
+ } s3;
+
+ /* UV2 unique struct */
+ struct uv2h_event_occurred1_s {
+ unsigned long bau_data:1; /* RW */
+ unsigned long power_management_req:1; /* RW */
+ unsigned long message_accelerator_int0:1; /* RW */
+ unsigned long message_accelerator_int1:1; /* RW */
+ unsigned long message_accelerator_int2:1; /* RW */
+ unsigned long message_accelerator_int3:1; /* RW */
+ unsigned long message_accelerator_int4:1; /* RW */
+ unsigned long message_accelerator_int5:1; /* RW */
+ unsigned long message_accelerator_int6:1; /* RW */
+ unsigned long message_accelerator_int7:1; /* RW */
+ unsigned long message_accelerator_int8:1; /* RW */
+ unsigned long message_accelerator_int9:1; /* RW */
+ unsigned long message_accelerator_int10:1; /* RW */
+ unsigned long message_accelerator_int11:1; /* RW */
+ unsigned long message_accelerator_int12:1; /* RW */
+ unsigned long message_accelerator_int13:1; /* RW */
+ unsigned long message_accelerator_int14:1; /* RW */
+ unsigned long message_accelerator_int15:1; /* RW */
+ unsigned long gr0_tlb_int0:1; /* RW */
+ unsigned long gr0_tlb_int1:1; /* RW */
+ unsigned long gr0_tlb_int2:1; /* RW */
+ unsigned long gr0_tlb_int3:1; /* RW */
+ unsigned long gr0_tlb_int4:1; /* RW */
+ unsigned long gr0_tlb_int5:1; /* RW */
+ unsigned long gr0_tlb_int6:1; /* RW */
+ unsigned long gr0_tlb_int7:1; /* RW */
+ unsigned long gr0_tlb_int8:1; /* RW */
+ unsigned long gr0_tlb_int9:1; /* RW */
+ unsigned long gr0_tlb_int10:1; /* RW */
+ unsigned long gr0_tlb_int11:1; /* RW */
+ unsigned long gr0_tlb_int12:1; /* RW */
+ unsigned long gr0_tlb_int13:1; /* RW */
+ unsigned long gr0_tlb_int14:1; /* RW */
+ unsigned long gr0_tlb_int15:1; /* RW */
+ unsigned long gr1_tlb_int0:1; /* RW */
+ unsigned long gr1_tlb_int1:1; /* RW */
+ unsigned long gr1_tlb_int2:1; /* RW */
+ unsigned long gr1_tlb_int3:1; /* RW */
+ unsigned long gr1_tlb_int4:1; /* RW */
+ unsigned long gr1_tlb_int5:1; /* RW */
+ unsigned long gr1_tlb_int6:1; /* RW */
+ unsigned long gr1_tlb_int7:1; /* RW */
+ unsigned long gr1_tlb_int8:1; /* RW */
+ unsigned long gr1_tlb_int9:1; /* RW */
+ unsigned long gr1_tlb_int10:1; /* RW */
+ unsigned long gr1_tlb_int11:1; /* RW */
+ unsigned long gr1_tlb_int12:1; /* RW */
+ unsigned long gr1_tlb_int13:1; /* RW */
+ unsigned long gr1_tlb_int14:1; /* RW */
+ unsigned long gr1_tlb_int15:1; /* RW */
+ unsigned long rtc_interval_int:1; /* RW */
+ unsigned long bau_dashboard_int:1; /* RW */
+ unsigned long rsvd_52_63:12;
+ } s2;
+};
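(Editor's note, not part of the patch.) EVENT_OCCURRED1 has no bit layout shared across hub revisions, so the union only offers per-revision views (sy/s5/s4/s3/s2) and callers must select one at run time. A hedged sketch of that selection, using the is_uv() checks already used by the dispatch macros above; the bau_data event is chosen purely as an illustration since it sits at a different bit on each revision:

	/* Hedged sketch: pick the per-revision view of EVENT_OCCURRED1. */
	static inline int uv_bau_data_pending(unsigned long v)
	{
		union uvyh_event_occurred1_u eo1 = { .v = v };

		if (is_uv(UV5))
			return eo1.s5.bau_data;
		if (is_uv(UV4))
			return eo1.s4.bau_data;
		if (is_uv(UV3))
			return eo1.s3.bau_data;
		if (is_uv(UV2))
			return eo1.s2.bau_data;
		return 0;
	}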
+
+/* ========================================================================= */
+/* UVH_EVENT_OCCURRED1_ALIAS */
+/* ========================================================================= */
+#define UVH_EVENT_OCCURRED1_ALIAS 0x70088UL
+
+
+/* ========================================================================= */
+/* UVH_EVENT_OCCURRED2 */
+/* ========================================================================= */
+#define UVH_EVENT_OCCURRED2 0x70100UL
+
+
+
+/* UVYH common defines */
+#define UVYH_EVENT_OCCURRED2_RTC_INTERVAL_INT_SHFT 0
+#define UVYH_EVENT_OCCURRED2_RTC_INTERVAL_INT_MASK 0x0000000000000001UL
+#define UVYH_EVENT_OCCURRED2_BAU_DASHBOARD_INT_SHFT 1
+#define UVYH_EVENT_OCCURRED2_BAU_DASHBOARD_INT_MASK 0x0000000000000002UL
+#define UVYH_EVENT_OCCURRED2_RTC_0_SHFT 2
+#define UVYH_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000000004UL
+#define UVYH_EVENT_OCCURRED2_RTC_1_SHFT 3
+#define UVYH_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000000008UL
+#define UVYH_EVENT_OCCURRED2_RTC_2_SHFT 4
+#define UVYH_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000000010UL
+#define UVYH_EVENT_OCCURRED2_RTC_3_SHFT 5
+#define UVYH_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000000020UL
+#define UVYH_EVENT_OCCURRED2_RTC_4_SHFT 6
+#define UVYH_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000000040UL
+#define UVYH_EVENT_OCCURRED2_RTC_5_SHFT 7
+#define UVYH_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000000080UL
+#define UVYH_EVENT_OCCURRED2_RTC_6_SHFT 8
+#define UVYH_EVENT_OCCURRED2_RTC_6_MASK 0x0000000000000100UL
+#define UVYH_EVENT_OCCURRED2_RTC_7_SHFT 9
+#define UVYH_EVENT_OCCURRED2_RTC_7_MASK 0x0000000000000200UL
+#define UVYH_EVENT_OCCURRED2_RTC_8_SHFT 10
+#define UVYH_EVENT_OCCURRED2_RTC_8_MASK 0x0000000000000400UL
+#define UVYH_EVENT_OCCURRED2_RTC_9_SHFT 11
+#define UVYH_EVENT_OCCURRED2_RTC_9_MASK 0x0000000000000800UL
+#define UVYH_EVENT_OCCURRED2_RTC_10_SHFT 12
+#define UVYH_EVENT_OCCURRED2_RTC_10_MASK 0x0000000000001000UL
+#define UVYH_EVENT_OCCURRED2_RTC_11_SHFT 13
+#define UVYH_EVENT_OCCURRED2_RTC_11_MASK 0x0000000000002000UL
+#define UVYH_EVENT_OCCURRED2_RTC_12_SHFT 14
+#define UVYH_EVENT_OCCURRED2_RTC_12_MASK 0x0000000000004000UL
+#define UVYH_EVENT_OCCURRED2_RTC_13_SHFT 15
+#define UVYH_EVENT_OCCURRED2_RTC_13_MASK 0x0000000000008000UL
+#define UVYH_EVENT_OCCURRED2_RTC_14_SHFT 16
+#define UVYH_EVENT_OCCURRED2_RTC_14_MASK 0x0000000000010000UL
+#define UVYH_EVENT_OCCURRED2_RTC_15_SHFT 17
+#define UVYH_EVENT_OCCURRED2_RTC_15_MASK 0x0000000000020000UL
+#define UVYH_EVENT_OCCURRED2_RTC_16_SHFT 18
+#define UVYH_EVENT_OCCURRED2_RTC_16_MASK 0x0000000000040000UL
+#define UVYH_EVENT_OCCURRED2_RTC_17_SHFT 19
+#define UVYH_EVENT_OCCURRED2_RTC_17_MASK 0x0000000000080000UL
+#define UVYH_EVENT_OCCURRED2_RTC_18_SHFT 20
+#define UVYH_EVENT_OCCURRED2_RTC_18_MASK 0x0000000000100000UL
+#define UVYH_EVENT_OCCURRED2_RTC_19_SHFT 21
+#define UVYH_EVENT_OCCURRED2_RTC_19_MASK 0x0000000000200000UL
+#define UVYH_EVENT_OCCURRED2_RTC_20_SHFT 22
+#define UVYH_EVENT_OCCURRED2_RTC_20_MASK 0x0000000000400000UL
+#define UVYH_EVENT_OCCURRED2_RTC_21_SHFT 23
+#define UVYH_EVENT_OCCURRED2_RTC_21_MASK 0x0000000000800000UL
+#define UVYH_EVENT_OCCURRED2_RTC_22_SHFT 24
+#define UVYH_EVENT_OCCURRED2_RTC_22_MASK 0x0000000001000000UL
+#define UVYH_EVENT_OCCURRED2_RTC_23_SHFT 25
+#define UVYH_EVENT_OCCURRED2_RTC_23_MASK 0x0000000002000000UL
+#define UVYH_EVENT_OCCURRED2_RTC_24_SHFT 26
+#define UVYH_EVENT_OCCURRED2_RTC_24_MASK 0x0000000004000000UL
+#define UVYH_EVENT_OCCURRED2_RTC_25_SHFT 27
+#define UVYH_EVENT_OCCURRED2_RTC_25_MASK 0x0000000008000000UL
+#define UVYH_EVENT_OCCURRED2_RTC_26_SHFT 28
+#define UVYH_EVENT_OCCURRED2_RTC_26_MASK 0x0000000010000000UL
+#define UVYH_EVENT_OCCURRED2_RTC_27_SHFT 29
+#define UVYH_EVENT_OCCURRED2_RTC_27_MASK 0x0000000020000000UL
+#define UVYH_EVENT_OCCURRED2_RTC_28_SHFT 30
+#define UVYH_EVENT_OCCURRED2_RTC_28_MASK 0x0000000040000000UL
+#define UVYH_EVENT_OCCURRED2_RTC_29_SHFT 31
+#define UVYH_EVENT_OCCURRED2_RTC_29_MASK 0x0000000080000000UL
+#define UVYH_EVENT_OCCURRED2_RTC_30_SHFT 32
+#define UVYH_EVENT_OCCURRED2_RTC_30_MASK 0x0000000100000000UL
+#define UVYH_EVENT_OCCURRED2_RTC_31_SHFT 33
+#define UVYH_EVENT_OCCURRED2_RTC_31_MASK 0x0000000200000000UL
+
+/* UV4 unique defines */
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT0_SHFT 0
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT0_MASK 0x0000000000000001UL
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT1_SHFT 1
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT1_MASK 0x0000000000000002UL
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT2_SHFT 2
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT2_MASK 0x0000000000000004UL
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT3_SHFT 3
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT3_MASK 0x0000000000000008UL
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT4_SHFT 4
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT4_MASK 0x0000000000000010UL
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT5_SHFT 5
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT5_MASK 0x0000000000000020UL
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT6_SHFT 6
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT6_MASK 0x0000000000000040UL
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT7_SHFT 7
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT7_MASK 0x0000000000000080UL
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT8_SHFT 8
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT8_MASK 0x0000000000000100UL
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT9_SHFT 9
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT9_MASK 0x0000000000000200UL
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT10_SHFT 10
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT10_MASK 0x0000000000000400UL
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT11_SHFT 11
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT11_MASK 0x0000000000000800UL
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT12_SHFT 12
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT12_MASK 0x0000000000001000UL
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT13_SHFT 13
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT13_MASK 0x0000000000002000UL
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT14_SHFT 14
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT14_MASK 0x0000000000004000UL
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT15_SHFT 15
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT15_MASK 0x0000000000008000UL
+#define UV4H_EVENT_OCCURRED2_RTC_INTERVAL_INT_SHFT 16
+#define UV4H_EVENT_OCCURRED2_RTC_INTERVAL_INT_MASK 0x0000000000010000UL
+#define UV4H_EVENT_OCCURRED2_BAU_DASHBOARD_INT_SHFT 17
+#define UV4H_EVENT_OCCURRED2_BAU_DASHBOARD_INT_MASK 0x0000000000020000UL
+#define UV4H_EVENT_OCCURRED2_RTC_0_SHFT 18
+#define UV4H_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000040000UL
+#define UV4H_EVENT_OCCURRED2_RTC_1_SHFT 19
+#define UV4H_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000080000UL
+#define UV4H_EVENT_OCCURRED2_RTC_2_SHFT 20
+#define UV4H_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000100000UL
+#define UV4H_EVENT_OCCURRED2_RTC_3_SHFT 21
+#define UV4H_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000200000UL
+#define UV4H_EVENT_OCCURRED2_RTC_4_SHFT 22
+#define UV4H_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000400000UL
+#define UV4H_EVENT_OCCURRED2_RTC_5_SHFT 23
+#define UV4H_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000800000UL
+#define UV4H_EVENT_OCCURRED2_RTC_6_SHFT 24
+#define UV4H_EVENT_OCCURRED2_RTC_6_MASK 0x0000000001000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_7_SHFT 25
+#define UV4H_EVENT_OCCURRED2_RTC_7_MASK 0x0000000002000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_8_SHFT 26
+#define UV4H_EVENT_OCCURRED2_RTC_8_MASK 0x0000000004000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_9_SHFT 27
+#define UV4H_EVENT_OCCURRED2_RTC_9_MASK 0x0000000008000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_10_SHFT 28
+#define UV4H_EVENT_OCCURRED2_RTC_10_MASK 0x0000000010000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_11_SHFT 29
+#define UV4H_EVENT_OCCURRED2_RTC_11_MASK 0x0000000020000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_12_SHFT 30
+#define UV4H_EVENT_OCCURRED2_RTC_12_MASK 0x0000000040000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_13_SHFT 31
+#define UV4H_EVENT_OCCURRED2_RTC_13_MASK 0x0000000080000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_14_SHFT 32
+#define UV4H_EVENT_OCCURRED2_RTC_14_MASK 0x0000000100000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_15_SHFT 33
+#define UV4H_EVENT_OCCURRED2_RTC_15_MASK 0x0000000200000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_16_SHFT 34
+#define UV4H_EVENT_OCCURRED2_RTC_16_MASK 0x0000000400000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_17_SHFT 35
+#define UV4H_EVENT_OCCURRED2_RTC_17_MASK 0x0000000800000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_18_SHFT 36
+#define UV4H_EVENT_OCCURRED2_RTC_18_MASK 0x0000001000000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_19_SHFT 37
+#define UV4H_EVENT_OCCURRED2_RTC_19_MASK 0x0000002000000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_20_SHFT 38
+#define UV4H_EVENT_OCCURRED2_RTC_20_MASK 0x0000004000000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_21_SHFT 39
+#define UV4H_EVENT_OCCURRED2_RTC_21_MASK 0x0000008000000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_22_SHFT 40
+#define UV4H_EVENT_OCCURRED2_RTC_22_MASK 0x0000010000000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_23_SHFT 41
+#define UV4H_EVENT_OCCURRED2_RTC_23_MASK 0x0000020000000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_24_SHFT 42
+#define UV4H_EVENT_OCCURRED2_RTC_24_MASK 0x0000040000000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_25_SHFT 43
+#define UV4H_EVENT_OCCURRED2_RTC_25_MASK 0x0000080000000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_26_SHFT 44
+#define UV4H_EVENT_OCCURRED2_RTC_26_MASK 0x0000100000000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_27_SHFT 45
+#define UV4H_EVENT_OCCURRED2_RTC_27_MASK 0x0000200000000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_28_SHFT 46
+#define UV4H_EVENT_OCCURRED2_RTC_28_MASK 0x0000400000000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_29_SHFT 47
+#define UV4H_EVENT_OCCURRED2_RTC_29_MASK 0x0000800000000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_30_SHFT 48
+#define UV4H_EVENT_OCCURRED2_RTC_30_MASK 0x0001000000000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_31_SHFT 49
+#define UV4H_EVENT_OCCURRED2_RTC_31_MASK 0x0002000000000000UL
+
+/* UV3 unique defines */
+#define UV3H_EVENT_OCCURRED2_RTC_0_SHFT 0
+#define UV3H_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000000001UL
+#define UV3H_EVENT_OCCURRED2_RTC_1_SHFT 1
+#define UV3H_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000000002UL
+#define UV3H_EVENT_OCCURRED2_RTC_2_SHFT 2
+#define UV3H_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000000004UL
+#define UV3H_EVENT_OCCURRED2_RTC_3_SHFT 3
+#define UV3H_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000000008UL
+#define UV3H_EVENT_OCCURRED2_RTC_4_SHFT 4
+#define UV3H_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000000010UL
+#define UV3H_EVENT_OCCURRED2_RTC_5_SHFT 5
+#define UV3H_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000000020UL
+#define UV3H_EVENT_OCCURRED2_RTC_6_SHFT 6
+#define UV3H_EVENT_OCCURRED2_RTC_6_MASK 0x0000000000000040UL
+#define UV3H_EVENT_OCCURRED2_RTC_7_SHFT 7
+#define UV3H_EVENT_OCCURRED2_RTC_7_MASK 0x0000000000000080UL
+#define UV3H_EVENT_OCCURRED2_RTC_8_SHFT 8
+#define UV3H_EVENT_OCCURRED2_RTC_8_MASK 0x0000000000000100UL
+#define UV3H_EVENT_OCCURRED2_RTC_9_SHFT 9
+#define UV3H_EVENT_OCCURRED2_RTC_9_MASK 0x0000000000000200UL
+#define UV3H_EVENT_OCCURRED2_RTC_10_SHFT 10
+#define UV3H_EVENT_OCCURRED2_RTC_10_MASK 0x0000000000000400UL
+#define UV3H_EVENT_OCCURRED2_RTC_11_SHFT 11
+#define UV3H_EVENT_OCCURRED2_RTC_11_MASK 0x0000000000000800UL
+#define UV3H_EVENT_OCCURRED2_RTC_12_SHFT 12
+#define UV3H_EVENT_OCCURRED2_RTC_12_MASK 0x0000000000001000UL
+#define UV3H_EVENT_OCCURRED2_RTC_13_SHFT 13
+#define UV3H_EVENT_OCCURRED2_RTC_13_MASK 0x0000000000002000UL
+#define UV3H_EVENT_OCCURRED2_RTC_14_SHFT 14
+#define UV3H_EVENT_OCCURRED2_RTC_14_MASK 0x0000000000004000UL
+#define UV3H_EVENT_OCCURRED2_RTC_15_SHFT 15
+#define UV3H_EVENT_OCCURRED2_RTC_15_MASK 0x0000000000008000UL
+#define UV3H_EVENT_OCCURRED2_RTC_16_SHFT 16
+#define UV3H_EVENT_OCCURRED2_RTC_16_MASK 0x0000000000010000UL
+#define UV3H_EVENT_OCCURRED2_RTC_17_SHFT 17
+#define UV3H_EVENT_OCCURRED2_RTC_17_MASK 0x0000000000020000UL
+#define UV3H_EVENT_OCCURRED2_RTC_18_SHFT 18
+#define UV3H_EVENT_OCCURRED2_RTC_18_MASK 0x0000000000040000UL
+#define UV3H_EVENT_OCCURRED2_RTC_19_SHFT 19
+#define UV3H_EVENT_OCCURRED2_RTC_19_MASK 0x0000000000080000UL
+#define UV3H_EVENT_OCCURRED2_RTC_20_SHFT 20
+#define UV3H_EVENT_OCCURRED2_RTC_20_MASK 0x0000000000100000UL
+#define UV3H_EVENT_OCCURRED2_RTC_21_SHFT 21
+#define UV3H_EVENT_OCCURRED2_RTC_21_MASK 0x0000000000200000UL
+#define UV3H_EVENT_OCCURRED2_RTC_22_SHFT 22
+#define UV3H_EVENT_OCCURRED2_RTC_22_MASK 0x0000000000400000UL
+#define UV3H_EVENT_OCCURRED2_RTC_23_SHFT 23
+#define UV3H_EVENT_OCCURRED2_RTC_23_MASK 0x0000000000800000UL
+#define UV3H_EVENT_OCCURRED2_RTC_24_SHFT 24
+#define UV3H_EVENT_OCCURRED2_RTC_24_MASK 0x0000000001000000UL
+#define UV3H_EVENT_OCCURRED2_RTC_25_SHFT 25
+#define UV3H_EVENT_OCCURRED2_RTC_25_MASK 0x0000000002000000UL
+#define UV3H_EVENT_OCCURRED2_RTC_26_SHFT 26
+#define UV3H_EVENT_OCCURRED2_RTC_26_MASK 0x0000000004000000UL
+#define UV3H_EVENT_OCCURRED2_RTC_27_SHFT 27
+#define UV3H_EVENT_OCCURRED2_RTC_27_MASK 0x0000000008000000UL
+#define UV3H_EVENT_OCCURRED2_RTC_28_SHFT 28
+#define UV3H_EVENT_OCCURRED2_RTC_28_MASK 0x0000000010000000UL
+#define UV3H_EVENT_OCCURRED2_RTC_29_SHFT 29
+#define UV3H_EVENT_OCCURRED2_RTC_29_MASK 0x0000000020000000UL
+#define UV3H_EVENT_OCCURRED2_RTC_30_SHFT 30
+#define UV3H_EVENT_OCCURRED2_RTC_30_MASK 0x0000000040000000UL
+#define UV3H_EVENT_OCCURRED2_RTC_31_SHFT 31
+#define UV3H_EVENT_OCCURRED2_RTC_31_MASK 0x0000000080000000UL
+
+/* UV2 unique defines */
+#define UV2H_EVENT_OCCURRED2_RTC_0_SHFT 0
+#define UV2H_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000000001UL
+#define UV2H_EVENT_OCCURRED2_RTC_1_SHFT 1
+#define UV2H_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000000002UL
+#define UV2H_EVENT_OCCURRED2_RTC_2_SHFT 2
+#define UV2H_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000000004UL
+#define UV2H_EVENT_OCCURRED2_RTC_3_SHFT 3
+#define UV2H_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000000008UL
+#define UV2H_EVENT_OCCURRED2_RTC_4_SHFT 4
+#define UV2H_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000000010UL
+#define UV2H_EVENT_OCCURRED2_RTC_5_SHFT 5
+#define UV2H_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000000020UL
+#define UV2H_EVENT_OCCURRED2_RTC_6_SHFT 6
+#define UV2H_EVENT_OCCURRED2_RTC_6_MASK 0x0000000000000040UL
+#define UV2H_EVENT_OCCURRED2_RTC_7_SHFT 7
+#define UV2H_EVENT_OCCURRED2_RTC_7_MASK 0x0000000000000080UL
+#define UV2H_EVENT_OCCURRED2_RTC_8_SHFT 8
+#define UV2H_EVENT_OCCURRED2_RTC_8_MASK 0x0000000000000100UL
+#define UV2H_EVENT_OCCURRED2_RTC_9_SHFT 9
+#define UV2H_EVENT_OCCURRED2_RTC_9_MASK 0x0000000000000200UL
+#define UV2H_EVENT_OCCURRED2_RTC_10_SHFT 10
+#define UV2H_EVENT_OCCURRED2_RTC_10_MASK 0x0000000000000400UL
+#define UV2H_EVENT_OCCURRED2_RTC_11_SHFT 11
+#define UV2H_EVENT_OCCURRED2_RTC_11_MASK 0x0000000000000800UL
+#define UV2H_EVENT_OCCURRED2_RTC_12_SHFT 12
+#define UV2H_EVENT_OCCURRED2_RTC_12_MASK 0x0000000000001000UL
+#define UV2H_EVENT_OCCURRED2_RTC_13_SHFT 13
+#define UV2H_EVENT_OCCURRED2_RTC_13_MASK 0x0000000000002000UL
+#define UV2H_EVENT_OCCURRED2_RTC_14_SHFT 14
+#define UV2H_EVENT_OCCURRED2_RTC_14_MASK 0x0000000000004000UL
+#define UV2H_EVENT_OCCURRED2_RTC_15_SHFT 15
+#define UV2H_EVENT_OCCURRED2_RTC_15_MASK 0x0000000000008000UL
+#define UV2H_EVENT_OCCURRED2_RTC_16_SHFT 16
+#define UV2H_EVENT_OCCURRED2_RTC_16_MASK 0x0000000000010000UL
+#define UV2H_EVENT_OCCURRED2_RTC_17_SHFT 17
+#define UV2H_EVENT_OCCURRED2_RTC_17_MASK 0x0000000000020000UL
+#define UV2H_EVENT_OCCURRED2_RTC_18_SHFT 18
+#define UV2H_EVENT_OCCURRED2_RTC_18_MASK 0x0000000000040000UL
+#define UV2H_EVENT_OCCURRED2_RTC_19_SHFT 19
+#define UV2H_EVENT_OCCURRED2_RTC_19_MASK 0x0000000000080000UL
+#define UV2H_EVENT_OCCURRED2_RTC_20_SHFT 20
+#define UV2H_EVENT_OCCURRED2_RTC_20_MASK 0x0000000000100000UL
+#define UV2H_EVENT_OCCURRED2_RTC_21_SHFT 21
+#define UV2H_EVENT_OCCURRED2_RTC_21_MASK 0x0000000000200000UL
+#define UV2H_EVENT_OCCURRED2_RTC_22_SHFT 22
+#define UV2H_EVENT_OCCURRED2_RTC_22_MASK 0x0000000000400000UL
+#define UV2H_EVENT_OCCURRED2_RTC_23_SHFT 23
+#define UV2H_EVENT_OCCURRED2_RTC_23_MASK 0x0000000000800000UL
+#define UV2H_EVENT_OCCURRED2_RTC_24_SHFT 24
+#define UV2H_EVENT_OCCURRED2_RTC_24_MASK 0x0000000001000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_25_SHFT 25
+#define UV2H_EVENT_OCCURRED2_RTC_25_MASK 0x0000000002000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_26_SHFT 26
+#define UV2H_EVENT_OCCURRED2_RTC_26_MASK 0x0000000004000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_27_SHFT 27
+#define UV2H_EVENT_OCCURRED2_RTC_27_MASK 0x0000000008000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_28_SHFT 28
+#define UV2H_EVENT_OCCURRED2_RTC_28_MASK 0x0000000010000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_29_SHFT 29
+#define UV2H_EVENT_OCCURRED2_RTC_29_MASK 0x0000000020000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_30_SHFT 30
+#define UV2H_EVENT_OCCURRED2_RTC_30_MASK 0x0000000040000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_31_SHFT 31
+#define UV2H_EVENT_OCCURRED2_RTC_31_MASK 0x0000000080000000UL
+
+#define UVH_EVENT_OCCURRED2_RTC_1_MASK ( \
+ is_uv(UV5) ? 0x0000000000000008UL : \
+ is_uv(UV4) ? 0x0000000000080000UL : \
+ is_uv(UV3) ? 0x0000000000000002UL : \
+ is_uv(UV2) ? 0x0000000000000002UL : \
+ 0)
+#define UVH_EVENT_OCCURRED2_RTC_1_SHFT ( \
+ is_uv(UV5) ? 3 : \
+ is_uv(UV4) ? 19 : \
+ is_uv(UV3) ? 1 : \
+ is_uv(UV2) ? 1 : \
+ -1)
+
+union uvyh_event_occurred2_u {
+ unsigned long v;
+
+ /* UVYH common struct */
+ struct uvyh_event_occurred2_s {
+ unsigned long rtc_interval_int:1; /* RW */
+ unsigned long bau_dashboard_int:1; /* RW */
+ unsigned long rtc_0:1; /* RW */
+ unsigned long rtc_1:1; /* RW */
+ unsigned long rtc_2:1; /* RW */
+ unsigned long rtc_3:1; /* RW */
+ unsigned long rtc_4:1; /* RW */
+ unsigned long rtc_5:1; /* RW */
+ unsigned long rtc_6:1; /* RW */
+ unsigned long rtc_7:1; /* RW */
+ unsigned long rtc_8:1; /* RW */
+ unsigned long rtc_9:1; /* RW */
+ unsigned long rtc_10:1; /* RW */
+ unsigned long rtc_11:1; /* RW */
+ unsigned long rtc_12:1; /* RW */
+ unsigned long rtc_13:1; /* RW */
+ unsigned long rtc_14:1; /* RW */
+ unsigned long rtc_15:1; /* RW */
+ unsigned long rtc_16:1; /* RW */
+ unsigned long rtc_17:1; /* RW */
+ unsigned long rtc_18:1; /* RW */
+ unsigned long rtc_19:1; /* RW */
+ unsigned long rtc_20:1; /* RW */
+ unsigned long rtc_21:1; /* RW */
+ unsigned long rtc_22:1; /* RW */
+ unsigned long rtc_23:1; /* RW */
+ unsigned long rtc_24:1; /* RW */
+ unsigned long rtc_25:1; /* RW */
+ unsigned long rtc_26:1; /* RW */
+ unsigned long rtc_27:1; /* RW */
+ unsigned long rtc_28:1; /* RW */
+ unsigned long rtc_29:1; /* RW */
+ unsigned long rtc_30:1; /* RW */
+ unsigned long rtc_31:1; /* RW */
+ unsigned long rsvd_34_63:30;
+ } sy;
+
+ /* UV5 unique struct */
+ struct uv5h_event_occurred2_s {
+ unsigned long rtc_interval_int:1; /* RW */
+ unsigned long bau_dashboard_int:1; /* RW */
+ unsigned long rtc_0:1; /* RW */
+ unsigned long rtc_1:1; /* RW */
+ unsigned long rtc_2:1; /* RW */
+ unsigned long rtc_3:1; /* RW */
+ unsigned long rtc_4:1; /* RW */
+ unsigned long rtc_5:1; /* RW */
+ unsigned long rtc_6:1; /* RW */
+ unsigned long rtc_7:1; /* RW */
+ unsigned long rtc_8:1; /* RW */
+ unsigned long rtc_9:1; /* RW */
+ unsigned long rtc_10:1; /* RW */
+ unsigned long rtc_11:1; /* RW */
+ unsigned long rtc_12:1; /* RW */
+ unsigned long rtc_13:1; /* RW */
+ unsigned long rtc_14:1; /* RW */
+ unsigned long rtc_15:1; /* RW */
+ unsigned long rtc_16:1; /* RW */
+ unsigned long rtc_17:1; /* RW */
+ unsigned long rtc_18:1; /* RW */
+ unsigned long rtc_19:1; /* RW */
+ unsigned long rtc_20:1; /* RW */
+ unsigned long rtc_21:1; /* RW */
+ unsigned long rtc_22:1; /* RW */
+ unsigned long rtc_23:1; /* RW */
+ unsigned long rtc_24:1; /* RW */
+ unsigned long rtc_25:1; /* RW */
+ unsigned long rtc_26:1; /* RW */
+ unsigned long rtc_27:1; /* RW */
+ unsigned long rtc_28:1; /* RW */
+ unsigned long rtc_29:1; /* RW */
+ unsigned long rtc_30:1; /* RW */
+ unsigned long rtc_31:1; /* RW */
+ unsigned long rsvd_34_63:30;
+ } s5;
+
+ /* UV4 unique struct */
+ struct uv4h_event_occurred2_s {
+ unsigned long message_accelerator_int0:1; /* RW */
+ unsigned long message_accelerator_int1:1; /* RW */
+ unsigned long message_accelerator_int2:1; /* RW */
+ unsigned long message_accelerator_int3:1; /* RW */
+ unsigned long message_accelerator_int4:1; /* RW */
+ unsigned long message_accelerator_int5:1; /* RW */
+ unsigned long message_accelerator_int6:1; /* RW */
+ unsigned long message_accelerator_int7:1; /* RW */
+ unsigned long message_accelerator_int8:1; /* RW */
+ unsigned long message_accelerator_int9:1; /* RW */
+ unsigned long message_accelerator_int10:1; /* RW */
+ unsigned long message_accelerator_int11:1; /* RW */
+ unsigned long message_accelerator_int12:1; /* RW */
+ unsigned long message_accelerator_int13:1; /* RW */
+ unsigned long message_accelerator_int14:1; /* RW */
+ unsigned long message_accelerator_int15:1; /* RW */
+ unsigned long rtc_interval_int:1; /* RW */
+ unsigned long bau_dashboard_int:1; /* RW */
+ unsigned long rtc_0:1; /* RW */
+ unsigned long rtc_1:1; /* RW */
+ unsigned long rtc_2:1; /* RW */
+ unsigned long rtc_3:1; /* RW */
+ unsigned long rtc_4:1; /* RW */
+ unsigned long rtc_5:1; /* RW */
+ unsigned long rtc_6:1; /* RW */
+ unsigned long rtc_7:1; /* RW */
+ unsigned long rtc_8:1; /* RW */
+ unsigned long rtc_9:1; /* RW */
+ unsigned long rtc_10:1; /* RW */
+ unsigned long rtc_11:1; /* RW */
+ unsigned long rtc_12:1; /* RW */
+ unsigned long rtc_13:1; /* RW */
+ unsigned long rtc_14:1; /* RW */
+ unsigned long rtc_15:1; /* RW */
+ unsigned long rtc_16:1; /* RW */
+ unsigned long rtc_17:1; /* RW */
+ unsigned long rtc_18:1; /* RW */
+ unsigned long rtc_19:1; /* RW */
+ unsigned long rtc_20:1; /* RW */
+ unsigned long rtc_21:1; /* RW */
+ unsigned long rtc_22:1; /* RW */
+ unsigned long rtc_23:1; /* RW */
+ unsigned long rtc_24:1; /* RW */
+ unsigned long rtc_25:1; /* RW */
+ unsigned long rtc_26:1; /* RW */
+ unsigned long rtc_27:1; /* RW */
+ unsigned long rtc_28:1; /* RW */
+ unsigned long rtc_29:1; /* RW */
+ unsigned long rtc_30:1; /* RW */
+ unsigned long rtc_31:1; /* RW */
+ unsigned long rsvd_50_63:14;
+ } s4;
+
+ /* UV3 unique struct */
+ struct uv3h_event_occurred2_s {
+ unsigned long rtc_0:1; /* RW */
+ unsigned long rtc_1:1; /* RW */
+ unsigned long rtc_2:1; /* RW */
+ unsigned long rtc_3:1; /* RW */
+ unsigned long rtc_4:1; /* RW */
+ unsigned long rtc_5:1; /* RW */
+ unsigned long rtc_6:1; /* RW */
+ unsigned long rtc_7:1; /* RW */
+ unsigned long rtc_8:1; /* RW */
+ unsigned long rtc_9:1; /* RW */
+ unsigned long rtc_10:1; /* RW */
+ unsigned long rtc_11:1; /* RW */
+ unsigned long rtc_12:1; /* RW */
+ unsigned long rtc_13:1; /* RW */
+ unsigned long rtc_14:1; /* RW */
+ unsigned long rtc_15:1; /* RW */
+ unsigned long rtc_16:1; /* RW */
+ unsigned long rtc_17:1; /* RW */
+ unsigned long rtc_18:1; /* RW */
+ unsigned long rtc_19:1; /* RW */
+ unsigned long rtc_20:1; /* RW */
+ unsigned long rtc_21:1; /* RW */
+ unsigned long rtc_22:1; /* RW */
+ unsigned long rtc_23:1; /* RW */
+ unsigned long rtc_24:1; /* RW */
+ unsigned long rtc_25:1; /* RW */
+ unsigned long rtc_26:1; /* RW */
+ unsigned long rtc_27:1; /* RW */
+ unsigned long rtc_28:1; /* RW */
+ unsigned long rtc_29:1; /* RW */
+ unsigned long rtc_30:1; /* RW */
+ unsigned long rtc_31:1; /* RW */
+ unsigned long rsvd_32_63:32;
+ } s3;
+
+ /* UV2 unique struct */
+ struct uv2h_event_occurred2_s {
+ unsigned long rtc_0:1; /* RW */
+ unsigned long rtc_1:1; /* RW */
+ unsigned long rtc_2:1; /* RW */
+ unsigned long rtc_3:1; /* RW */
+ unsigned long rtc_4:1; /* RW */
+ unsigned long rtc_5:1; /* RW */
+ unsigned long rtc_6:1; /* RW */
+ unsigned long rtc_7:1; /* RW */
+ unsigned long rtc_8:1; /* RW */
+ unsigned long rtc_9:1; /* RW */
+ unsigned long rtc_10:1; /* RW */
+ unsigned long rtc_11:1; /* RW */
+ unsigned long rtc_12:1; /* RW */
+ unsigned long rtc_13:1; /* RW */
+ unsigned long rtc_14:1; /* RW */
+ unsigned long rtc_15:1; /* RW */
+ unsigned long rtc_16:1; /* RW */
+ unsigned long rtc_17:1; /* RW */
+ unsigned long rtc_18:1; /* RW */
+ unsigned long rtc_19:1; /* RW */
+ unsigned long rtc_20:1; /* RW */
+ unsigned long rtc_21:1; /* RW */
+ unsigned long rtc_22:1; /* RW */
+ unsigned long rtc_23:1; /* RW */
+ unsigned long rtc_24:1; /* RW */
+ unsigned long rtc_25:1; /* RW */
+ unsigned long rtc_26:1; /* RW */
+ unsigned long rtc_27:1; /* RW */
+ unsigned long rtc_28:1; /* RW */
+ unsigned long rtc_29:1; /* RW */
+ unsigned long rtc_30:1; /* RW */
+ unsigned long rtc_31:1; /* RW */
+ unsigned long rsvd_32_63:32;
+ } s2;
+};
+
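/*
 * Illustrative sketch, not part of the patch: how the per-generation
 * UVH_EVENT_OCCURRED2_RTC_1_MASK selector and the event_occurred2 union
 * above might be used together.  Assumes uv_read_local_mmr() from
 * <asm/uv/uv_hub.h> and the UVH_EVENT_OCCURRED2 offset defined earlier
 * in this header.
 */
static inline int uv_rtc1_event_pending_example(void)
{
	union uvyh_event_occurred2_u e;

	e.v = uv_read_local_mmr(UVH_EVENT_OCCURRED2);

	/* The mask macro resolves to the right bit for UV2..UV5 at run time */
	return (e.v & UVH_EVENT_OCCURRED2_RTC_1_MASK) != 0;
}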
+/* ========================================================================= */
+/* UVH_EVENT_OCCURRED2_ALIAS */
+/* ========================================================================= */
+#define UVH_EVENT_OCCURRED2_ALIAS 0x70108UL
+
+
+/* ========================================================================= */
+/* UVH_EXTIO_INT0_BROADCAST */
+/* ========================================================================= */
+#define UVH_EXTIO_INT0_BROADCAST 0x61448UL
+
+/* UVH common defines*/
+#define UVH_EXTIO_INT0_BROADCAST_ENABLE_SHFT 0
+#define UVH_EXTIO_INT0_BROADCAST_ENABLE_MASK 0x0000000000000001UL
+
+
+union uvh_extio_int0_broadcast_u {
+ unsigned long v;
+
+ /* UVH common struct */
+ struct uvh_extio_int0_broadcast_s {
+ unsigned long enable:1; /* RW */
+ unsigned long rsvd_1_63:63;
+ } s;
+
+ /* UV5 unique struct */
+ struct uv5h_extio_int0_broadcast_s {
+ unsigned long enable:1; /* RW */
+ unsigned long rsvd_1_63:63;
+ } s5;
+
+ /* UV4 unique struct */
+ struct uv4h_extio_int0_broadcast_s {
+ unsigned long enable:1; /* RW */
+ unsigned long rsvd_1_63:63;
+ } s4;
+
+ /* UV3 unique struct */
+ struct uv3h_extio_int0_broadcast_s {
+ unsigned long enable:1; /* RW */
+ unsigned long rsvd_1_63:63;
+ } s3;
+
+ /* UV2 unique struct */
+ struct uv2h_extio_int0_broadcast_s {
+ unsigned long enable:1; /* RW */
+ unsigned long rsvd_1_63:63;
+ } s2;
+};
+
+/* ========================================================================= */
+/* UVH_GR0_GAM_GR_CONFIG */
+/* ========================================================================= */
+#define UVH_GR0_GAM_GR_CONFIG ( \
+ is_uv(UV5) ? 0x600028UL : \
+ is_uv(UV4) ? 0x600028UL : \
+ is_uv(UV3) ? 0xc00028UL : \
+ is_uv(UV2) ? 0xc00028UL : \
+ 0)
+
+
+
+/* UVYH common defines */
+#define UVYH_GR0_GAM_GR_CONFIG_SUBSPACE_SHFT 10
+#define UVYH_GR0_GAM_GR_CONFIG_SUBSPACE_MASK 0x0000000000000400UL
+
+/* UV4 unique defines */
+#define UV4H_GR0_GAM_GR_CONFIG_SUBSPACE_SHFT 10
+#define UV4H_GR0_GAM_GR_CONFIG_SUBSPACE_MASK 0x0000000000000400UL
+
+/* UV3 unique defines */
+#define UV3H_GR0_GAM_GR_CONFIG_M_SKT_SHFT 0
+#define UV3H_GR0_GAM_GR_CONFIG_M_SKT_MASK 0x000000000000003fUL
+#define UV3H_GR0_GAM_GR_CONFIG_SUBSPACE_SHFT 10
+#define UV3H_GR0_GAM_GR_CONFIG_SUBSPACE_MASK 0x0000000000000400UL
+
+/* UV2 unique defines */
+#define UV2H_GR0_GAM_GR_CONFIG_N_GR_SHFT 0
+#define UV2H_GR0_GAM_GR_CONFIG_N_GR_MASK 0x000000000000000fUL
+
+
+union uvyh_gr0_gam_gr_config_u {
+ unsigned long v;
+
+ /* UVYH common struct */
+ struct uvyh_gr0_gam_gr_config_s {
+ unsigned long rsvd_0_9:10;
+ unsigned long subspace:1; /* RW */
+ unsigned long rsvd_11_63:53;
+ } sy;
+
+ /* UV5 unique struct */
+ struct uv5h_gr0_gam_gr_config_s {
+ unsigned long rsvd_0_9:10;
+ unsigned long subspace:1; /* RW */
+ unsigned long rsvd_11_63:53;
+ } s5;
+
+ /* UV4 unique struct */
+ struct uv4h_gr0_gam_gr_config_s {
+ unsigned long rsvd_0_9:10;
+ unsigned long subspace:1; /* RW */
+ unsigned long rsvd_11_63:53;
+ } s4;
+
+ /* UV3 unique struct */
+ struct uv3h_gr0_gam_gr_config_s {
+ unsigned long m_skt:6; /* RW */
+ unsigned long undef_6_9:4; /* Undefined */
+ unsigned long subspace:1; /* RW */
+ unsigned long reserved:53;
+ } s3;
+
+ /* UV2 unique struct */
+ struct uv2h_gr0_gam_gr_config_s {
+ unsigned long n_gr:4; /* RW */
+ unsigned long reserved:60;
+ } s2;
+};
/* ========================================================================= */
/* UVH_GR0_TLB_INT0_CONFIG */
/* ========================================================================= */
-#define UVH_GR0_TLB_INT0_CONFIG 0x61b00UL
-
-#define UVH_GR0_TLB_INT0_CONFIG_VECTOR_SHFT 0
-#define UVH_GR0_TLB_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_GR0_TLB_INT0_CONFIG_DM_SHFT 8
-#define UVH_GR0_TLB_INT0_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_GR0_TLB_INT0_CONFIG_DESTMODE_SHFT 11
-#define UVH_GR0_TLB_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_GR0_TLB_INT0_CONFIG_STATUS_SHFT 12
-#define UVH_GR0_TLB_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_GR0_TLB_INT0_CONFIG_P_SHFT 13
-#define UVH_GR0_TLB_INT0_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_GR0_TLB_INT0_CONFIG_T_SHFT 15
-#define UVH_GR0_TLB_INT0_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_GR0_TLB_INT0_CONFIG_M_SHFT 16
-#define UVH_GR0_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_GR0_TLB_INT0_CONFIG_APIC_ID_SHFT 32
-#define UVH_GR0_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+#define UVH_GR0_TLB_INT0_CONFIG ( \
+ is_uv(UV4) ? 0x61b00UL : \
+ is_uv(UV3) ? 0x61b00UL : \
+ is_uv(UV2) ? 0x61b00UL : \
+ uv_undefined("UVH_GR0_TLB_INT0_CONFIG"))
+
+
+/* UVXH common defines */
+#define UVXH_GR0_TLB_INT0_CONFIG_VECTOR_SHFT 0
+#define UVXH_GR0_TLB_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL
+#define UVXH_GR0_TLB_INT0_CONFIG_DM_SHFT 8
+#define UVXH_GR0_TLB_INT0_CONFIG_DM_MASK 0x0000000000000700UL
+#define UVXH_GR0_TLB_INT0_CONFIG_DESTMODE_SHFT 11
+#define UVXH_GR0_TLB_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL
+#define UVXH_GR0_TLB_INT0_CONFIG_STATUS_SHFT 12
+#define UVXH_GR0_TLB_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL
+#define UVXH_GR0_TLB_INT0_CONFIG_P_SHFT 13
+#define UVXH_GR0_TLB_INT0_CONFIG_P_MASK 0x0000000000002000UL
+#define UVXH_GR0_TLB_INT0_CONFIG_T_SHFT 15
+#define UVXH_GR0_TLB_INT0_CONFIG_T_MASK 0x0000000000008000UL
+#define UVXH_GR0_TLB_INT0_CONFIG_M_SHFT 16
+#define UVXH_GR0_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL
+#define UVXH_GR0_TLB_INT0_CONFIG_APIC_ID_SHFT 32
+#define UVXH_GR0_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+
union uvh_gr0_tlb_int0_config_u {
- unsigned long v;
- struct uvh_gr0_tlb_int0_config_s {
- unsigned long vector_ : 8; /* RW */
- unsigned long dm : 3; /* RW */
- unsigned long destmode : 1; /* RW */
- unsigned long status : 1; /* RO */
- unsigned long p : 1; /* RO */
- unsigned long rsvd_14 : 1; /* */
- unsigned long t : 1; /* RO */
- unsigned long m : 1; /* RW */
- unsigned long rsvd_17_31: 15; /* */
- unsigned long apic_id : 32; /* RW */
- } s;
+ unsigned long v;
+
+ /* UVH common struct */
+ struct uvh_gr0_tlb_int0_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } s;
+
+ /* UVXH common struct */
+ struct uvxh_gr0_tlb_int0_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } sx;
+
+ /* UV4 unique struct */
+ struct uv4h_gr0_tlb_int0_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } s4;
+
+ /* UV3 unique struct */
+ struct uv3h_gr0_tlb_int0_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } s3;
+
+ /* UV2 unique struct */
+ struct uv2h_gr0_tlb_int0_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } s2;
};
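/*
 * Illustrative sketch, not part of the patch: programming GR0 TLB INT0
 * through the union above.  UVH_GR0_TLB_INT0_CONFIG expands to the
 * UV2/UV3/UV4 offset and to uv_undefined() on hubs without the register,
 * so callers are expected to guard with is_uv() first.  The accessor is
 * the standard uv_write_global_mmr64() from <asm/uv/uv_hub.h>; the
 * vector and apicid arguments are placeholders.
 */
static void uv_setup_gr0_tlb_int0_example(int pnode, int vector,
					  unsigned long apicid)
{
	union uvh_gr0_tlb_int0_config_u cfg;

	cfg.v = 0;
	cfg.s.vector_ = vector;		/* interrupt vector */
	cfg.s.dm = 0;			/* fixed delivery mode */
	cfg.s.m = 0;			/* not masked */
	cfg.s.apic_id = apicid;		/* destination APIC id */

	uv_write_global_mmr64(pnode, UVH_GR0_TLB_INT0_CONFIG, cfg.v);
}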
/* ========================================================================= */
/* UVH_GR0_TLB_INT1_CONFIG */
/* ========================================================================= */
-#define UVH_GR0_TLB_INT1_CONFIG 0x61b40UL
-
-#define UVH_GR0_TLB_INT1_CONFIG_VECTOR_SHFT 0
-#define UVH_GR0_TLB_INT1_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_GR0_TLB_INT1_CONFIG_DM_SHFT 8
-#define UVH_GR0_TLB_INT1_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_GR0_TLB_INT1_CONFIG_DESTMODE_SHFT 11
-#define UVH_GR0_TLB_INT1_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_GR0_TLB_INT1_CONFIG_STATUS_SHFT 12
-#define UVH_GR0_TLB_INT1_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_GR0_TLB_INT1_CONFIG_P_SHFT 13
-#define UVH_GR0_TLB_INT1_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_GR0_TLB_INT1_CONFIG_T_SHFT 15
-#define UVH_GR0_TLB_INT1_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_GR0_TLB_INT1_CONFIG_M_SHFT 16
-#define UVH_GR0_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_GR0_TLB_INT1_CONFIG_APIC_ID_SHFT 32
-#define UVH_GR0_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+#define UVH_GR0_TLB_INT1_CONFIG ( \
+ is_uv(UV4) ? 0x61b40UL : \
+ is_uv(UV3) ? 0x61b40UL : \
+ is_uv(UV2) ? 0x61b40UL : \
+ uv_undefined("UVH_GR0_TLB_INT1_CONFIG"))
+
+
+/* UVXH common defines */
+#define UVXH_GR0_TLB_INT1_CONFIG_VECTOR_SHFT 0
+#define UVXH_GR0_TLB_INT1_CONFIG_VECTOR_MASK 0x00000000000000ffUL
+#define UVXH_GR0_TLB_INT1_CONFIG_DM_SHFT 8
+#define UVXH_GR0_TLB_INT1_CONFIG_DM_MASK 0x0000000000000700UL
+#define UVXH_GR0_TLB_INT1_CONFIG_DESTMODE_SHFT 11
+#define UVXH_GR0_TLB_INT1_CONFIG_DESTMODE_MASK 0x0000000000000800UL
+#define UVXH_GR0_TLB_INT1_CONFIG_STATUS_SHFT 12
+#define UVXH_GR0_TLB_INT1_CONFIG_STATUS_MASK 0x0000000000001000UL
+#define UVXH_GR0_TLB_INT1_CONFIG_P_SHFT 13
+#define UVXH_GR0_TLB_INT1_CONFIG_P_MASK 0x0000000000002000UL
+#define UVXH_GR0_TLB_INT1_CONFIG_T_SHFT 15
+#define UVXH_GR0_TLB_INT1_CONFIG_T_MASK 0x0000000000008000UL
+#define UVXH_GR0_TLB_INT1_CONFIG_M_SHFT 16
+#define UVXH_GR0_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL
+#define UVXH_GR0_TLB_INT1_CONFIG_APIC_ID_SHFT 32
+#define UVXH_GR0_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+
union uvh_gr0_tlb_int1_config_u {
- unsigned long v;
- struct uvh_gr0_tlb_int1_config_s {
- unsigned long vector_ : 8; /* RW */
- unsigned long dm : 3; /* RW */
- unsigned long destmode : 1; /* RW */
- unsigned long status : 1; /* RO */
- unsigned long p : 1; /* RO */
- unsigned long rsvd_14 : 1; /* */
- unsigned long t : 1; /* RO */
- unsigned long m : 1; /* RW */
- unsigned long rsvd_17_31: 15; /* */
- unsigned long apic_id : 32; /* RW */
- } s;
+ unsigned long v;
+
+ /* UVH common struct */
+ struct uvh_gr0_tlb_int1_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } s;
+
+ /* UVXH common struct */
+ struct uvxh_gr0_tlb_int1_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } sx;
+
+ /* UV4 unique struct */
+ struct uv4h_gr0_tlb_int1_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } s4;
+
+ /* UV3 unique struct */
+ struct uv3h_gr0_tlb_int1_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } s3;
+
+ /* UV2 unique struct */
+ struct uv2h_gr0_tlb_int1_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } s2;
};
/* ========================================================================= */
/* UVH_GR1_TLB_INT0_CONFIG */
/* ========================================================================= */
-#define UVH_GR1_TLB_INT0_CONFIG 0x61f00UL
-
-#define UVH_GR1_TLB_INT0_CONFIG_VECTOR_SHFT 0
-#define UVH_GR1_TLB_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_GR1_TLB_INT0_CONFIG_DM_SHFT 8
-#define UVH_GR1_TLB_INT0_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_GR1_TLB_INT0_CONFIG_DESTMODE_SHFT 11
-#define UVH_GR1_TLB_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_GR1_TLB_INT0_CONFIG_STATUS_SHFT 12
-#define UVH_GR1_TLB_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_GR1_TLB_INT0_CONFIG_P_SHFT 13
-#define UVH_GR1_TLB_INT0_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_GR1_TLB_INT0_CONFIG_T_SHFT 15
-#define UVH_GR1_TLB_INT0_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_GR1_TLB_INT0_CONFIG_M_SHFT 16
-#define UVH_GR1_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_GR1_TLB_INT0_CONFIG_APIC_ID_SHFT 32
-#define UVH_GR1_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+#define UVH_GR1_TLB_INT0_CONFIG ( \
+ is_uv(UV4) ? 0x62100UL : \
+ is_uv(UV3) ? 0x61f00UL : \
+ is_uv(UV2) ? 0x61f00UL : \
+ uv_undefined("UVH_GR1_TLB_INT0_CONFIG"))
+
+
+/* UVXH common defines */
+#define UVXH_GR1_TLB_INT0_CONFIG_VECTOR_SHFT 0
+#define UVXH_GR1_TLB_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL
+#define UVXH_GR1_TLB_INT0_CONFIG_DM_SHFT 8
+#define UVXH_GR1_TLB_INT0_CONFIG_DM_MASK 0x0000000000000700UL
+#define UVXH_GR1_TLB_INT0_CONFIG_DESTMODE_SHFT 11
+#define UVXH_GR1_TLB_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL
+#define UVXH_GR1_TLB_INT0_CONFIG_STATUS_SHFT 12
+#define UVXH_GR1_TLB_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL
+#define UVXH_GR1_TLB_INT0_CONFIG_P_SHFT 13
+#define UVXH_GR1_TLB_INT0_CONFIG_P_MASK 0x0000000000002000UL
+#define UVXH_GR1_TLB_INT0_CONFIG_T_SHFT 15
+#define UVXH_GR1_TLB_INT0_CONFIG_T_MASK 0x0000000000008000UL
+#define UVXH_GR1_TLB_INT0_CONFIG_M_SHFT 16
+#define UVXH_GR1_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL
+#define UVXH_GR1_TLB_INT0_CONFIG_APIC_ID_SHFT 32
+#define UVXH_GR1_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+
union uvh_gr1_tlb_int0_config_u {
- unsigned long v;
- struct uvh_gr1_tlb_int0_config_s {
- unsigned long vector_ : 8; /* RW */
- unsigned long dm : 3; /* RW */
- unsigned long destmode : 1; /* RW */
- unsigned long status : 1; /* RO */
- unsigned long p : 1; /* RO */
- unsigned long rsvd_14 : 1; /* */
- unsigned long t : 1; /* RO */
- unsigned long m : 1; /* RW */
- unsigned long rsvd_17_31: 15; /* */
- unsigned long apic_id : 32; /* RW */
- } s;
+ unsigned long v;
+
+ /* UVH common struct */
+ struct uvh_gr1_tlb_int0_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } s;
+
+ /* UVXH common struct */
+ struct uvxh_gr1_tlb_int0_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } sx;
+
+ /* UV4 unique struct */
+ struct uv4h_gr1_tlb_int0_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } s4;
+
+ /* UV3 unique struct */
+ struct uv3h_gr1_tlb_int0_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } s3;
+
+ /* UV2 unique struct */
+ struct uv2h_gr1_tlb_int0_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } s2;
};
/* ========================================================================= */
/* UVH_GR1_TLB_INT1_CONFIG */
/* ========================================================================= */
-#define UVH_GR1_TLB_INT1_CONFIG 0x61f40UL
-
-#define UVH_GR1_TLB_INT1_CONFIG_VECTOR_SHFT 0
-#define UVH_GR1_TLB_INT1_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_GR1_TLB_INT1_CONFIG_DM_SHFT 8
-#define UVH_GR1_TLB_INT1_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_GR1_TLB_INT1_CONFIG_DESTMODE_SHFT 11
-#define UVH_GR1_TLB_INT1_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_GR1_TLB_INT1_CONFIG_STATUS_SHFT 12
-#define UVH_GR1_TLB_INT1_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_GR1_TLB_INT1_CONFIG_P_SHFT 13
-#define UVH_GR1_TLB_INT1_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_GR1_TLB_INT1_CONFIG_T_SHFT 15
-#define UVH_GR1_TLB_INT1_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_GR1_TLB_INT1_CONFIG_M_SHFT 16
-#define UVH_GR1_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_GR1_TLB_INT1_CONFIG_APIC_ID_SHFT 32
-#define UVH_GR1_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+#define UVH_GR1_TLB_INT1_CONFIG ( \
+ is_uv(UV4) ? 0x62140UL : \
+ is_uv(UV3) ? 0x61f40UL : \
+ is_uv(UV2) ? 0x61f40UL : \
+ uv_undefined("UVH_GR1_TLB_INT1_CONFIG"))
+
+
+/* UVXH common defines */
+#define UVXH_GR1_TLB_INT1_CONFIG_VECTOR_SHFT 0
+#define UVXH_GR1_TLB_INT1_CONFIG_VECTOR_MASK 0x00000000000000ffUL
+#define UVXH_GR1_TLB_INT1_CONFIG_DM_SHFT 8
+#define UVXH_GR1_TLB_INT1_CONFIG_DM_MASK 0x0000000000000700UL
+#define UVXH_GR1_TLB_INT1_CONFIG_DESTMODE_SHFT 11
+#define UVXH_GR1_TLB_INT1_CONFIG_DESTMODE_MASK 0x0000000000000800UL
+#define UVXH_GR1_TLB_INT1_CONFIG_STATUS_SHFT 12
+#define UVXH_GR1_TLB_INT1_CONFIG_STATUS_MASK 0x0000000000001000UL
+#define UVXH_GR1_TLB_INT1_CONFIG_P_SHFT 13
+#define UVXH_GR1_TLB_INT1_CONFIG_P_MASK 0x0000000000002000UL
+#define UVXH_GR1_TLB_INT1_CONFIG_T_SHFT 15
+#define UVXH_GR1_TLB_INT1_CONFIG_T_MASK 0x0000000000008000UL
+#define UVXH_GR1_TLB_INT1_CONFIG_M_SHFT 16
+#define UVXH_GR1_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL
+#define UVXH_GR1_TLB_INT1_CONFIG_APIC_ID_SHFT 32
+#define UVXH_GR1_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+
union uvh_gr1_tlb_int1_config_u {
- unsigned long v;
- struct uvh_gr1_tlb_int1_config_s {
- unsigned long vector_ : 8; /* RW */
- unsigned long dm : 3; /* RW */
- unsigned long destmode : 1; /* RW */
- unsigned long status : 1; /* RO */
- unsigned long p : 1; /* RO */
- unsigned long rsvd_14 : 1; /* */
- unsigned long t : 1; /* RO */
- unsigned long m : 1; /* RW */
- unsigned long rsvd_17_31: 15; /* */
- unsigned long apic_id : 32; /* RW */
- } s;
+ unsigned long v;
+
+ /* UVH common struct */
+ struct uvh_gr1_tlb_int1_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } s;
+
+ /* UVXH common struct */
+ struct uvxh_gr1_tlb_int1_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } sx;
+
+ /* UV4 unique struct */
+ struct uv4h_gr1_tlb_int1_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } s4;
+
+ /* UV3 unique struct */
+ struct uv3h_gr1_tlb_int1_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } s3;
+
+ /* UV2 unique struct */
+ struct uv2h_gr1_tlb_int1_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } s2;
};
/* ========================================================================= */
@@ -405,1049 +2661,1991 @@ union uvh_gr1_tlb_int1_config_u {
/* ========================================================================= */
#define UVH_INT_CMPB 0x22080UL
-#define UVH_INT_CMPB_REAL_TIME_CMPB_SHFT 0
-#define UVH_INT_CMPB_REAL_TIME_CMPB_MASK 0x00ffffffffffffffUL
+/* UVH common defines*/
+#define UVH_INT_CMPB_REAL_TIME_CMPB_SHFT 0
+#define UVH_INT_CMPB_REAL_TIME_CMPB_MASK 0x00ffffffffffffffUL
-union uvh_int_cmpb_u {
- unsigned long v;
- struct uvh_int_cmpb_s {
- unsigned long real_time_cmpb : 56; /* RW */
- unsigned long rsvd_56_63 : 8; /* */
- } s;
-};
-/* ========================================================================= */
-/* UVH_INT_CMPC */
-/* ========================================================================= */
-#define UVH_INT_CMPC 0x22100UL
+union uvh_int_cmpb_u {
+ unsigned long v;
-#define UVH_INT_CMPC_REAL_TIME_CMPC_SHFT 0
-#define UVH_INT_CMPC_REAL_TIME_CMPC_MASK 0x00ffffffffffffffUL
+ /* UVH common struct */
+ struct uvh_int_cmpb_s {
+ unsigned long real_time_cmpb:56; /* RW */
+ unsigned long rsvd_56_63:8;
+ } s;
-union uvh_int_cmpc_u {
- unsigned long v;
- struct uvh_int_cmpc_s {
- unsigned long real_time_cmpc : 56; /* RW */
- unsigned long rsvd_56_63 : 8; /* */
- } s;
-};
+ /* UV5 unique struct */
+ struct uv5h_int_cmpb_s {
+ unsigned long real_time_cmpb:56; /* RW */
+ unsigned long rsvd_56_63:8;
+ } s5;
-/* ========================================================================= */
-/* UVH_INT_CMPD */
-/* ========================================================================= */
-#define UVH_INT_CMPD 0x22180UL
+ /* UV4 unique struct */
+ struct uv4h_int_cmpb_s {
+ unsigned long real_time_cmpb:56; /* RW */
+ unsigned long rsvd_56_63:8;
+ } s4;
-#define UVH_INT_CMPD_REAL_TIME_CMPD_SHFT 0
-#define UVH_INT_CMPD_REAL_TIME_CMPD_MASK 0x00ffffffffffffffUL
+ /* UV3 unique struct */
+ struct uv3h_int_cmpb_s {
+ unsigned long real_time_cmpb:56; /* RW */
+ unsigned long rsvd_56_63:8;
+ } s3;
-union uvh_int_cmpd_u {
- unsigned long v;
- struct uvh_int_cmpd_s {
- unsigned long real_time_cmpd : 56; /* RW */
- unsigned long rsvd_56_63 : 8; /* */
- } s;
+ /* UV2 unique struct */
+ struct uv2h_int_cmpb_s {
+ unsigned long real_time_cmpb:56; /* RW */
+ unsigned long rsvd_56_63:8;
+ } s2;
};
/* ========================================================================= */
/* UVH_IPI_INT */
/* ========================================================================= */
#define UVH_IPI_INT 0x60500UL
-#define UVH_IPI_INT_32 0x0348
-
-#define UVH_IPI_INT_VECTOR_SHFT 0
-#define UVH_IPI_INT_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_IPI_INT_DELIVERY_MODE_SHFT 8
-#define UVH_IPI_INT_DELIVERY_MODE_MASK 0x0000000000000700UL
-#define UVH_IPI_INT_DESTMODE_SHFT 11
-#define UVH_IPI_INT_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_IPI_INT_APIC_ID_SHFT 16
-#define UVH_IPI_INT_APIC_ID_MASK 0x0000ffffffff0000UL
-#define UVH_IPI_INT_SEND_SHFT 63
-#define UVH_IPI_INT_SEND_MASK 0x8000000000000000UL
+
+/* UVH common defines*/
+#define UVH_IPI_INT_VECTOR_SHFT 0
+#define UVH_IPI_INT_VECTOR_MASK 0x00000000000000ffUL
+#define UVH_IPI_INT_DELIVERY_MODE_SHFT 8
+#define UVH_IPI_INT_DELIVERY_MODE_MASK 0x0000000000000700UL
+#define UVH_IPI_INT_DESTMODE_SHFT 11
+#define UVH_IPI_INT_DESTMODE_MASK 0x0000000000000800UL
+#define UVH_IPI_INT_APIC_ID_SHFT 16
+#define UVH_IPI_INT_APIC_ID_MASK 0x0000ffffffff0000UL
+#define UVH_IPI_INT_SEND_SHFT 63
+#define UVH_IPI_INT_SEND_MASK 0x8000000000000000UL
+
union uvh_ipi_int_u {
- unsigned long v;
- struct uvh_ipi_int_s {
- unsigned long vector_ : 8; /* RW */
- unsigned long delivery_mode : 3; /* RW */
- unsigned long destmode : 1; /* RW */
- unsigned long rsvd_12_15 : 4; /* */
- unsigned long apic_id : 32; /* RW */
- unsigned long rsvd_48_62 : 15; /* */
- unsigned long send : 1; /* WP */
- } s;
+ unsigned long v;
+
+ /* UVH common struct */
+ struct uvh_ipi_int_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long delivery_mode:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long rsvd_12_15:4;
+ unsigned long apic_id:32; /* RW */
+ unsigned long rsvd_48_62:15;
+ unsigned long send:1; /* WP */
+ } s;
+
+ /* UV5 unique struct */
+ struct uv5h_ipi_int_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long delivery_mode:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long rsvd_12_15:4;
+ unsigned long apic_id:32; /* RW */
+ unsigned long rsvd_48_62:15;
+ unsigned long send:1; /* WP */
+ } s5;
+
+ /* UV4 unique struct */
+ struct uv4h_ipi_int_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long delivery_mode:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long rsvd_12_15:4;
+ unsigned long apic_id:32; /* RW */
+ unsigned long rsvd_48_62:15;
+ unsigned long send:1; /* WP */
+ } s4;
+
+ /* UV3 unique struct */
+ struct uv3h_ipi_int_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long delivery_mode:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long rsvd_12_15:4;
+ unsigned long apic_id:32; /* RW */
+ unsigned long rsvd_48_62:15;
+ unsigned long send:1; /* WP */
+ } s3;
+
+ /* UV2 unique struct */
+ struct uv2h_ipi_int_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long delivery_mode:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long rsvd_12_15:4;
+ unsigned long apic_id:32; /* RW */
+ unsigned long rsvd_48_62:15;
+ unsigned long send:1; /* WP */
+ } s2;
};
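/*
 * Illustrative sketch, not part of the patch: composing an IPI through the
 * uvh_ipi_int_u bitfields instead of open-coded shifts.  Delivery mode,
 * destination mode and the pnode/apicid arguments are placeholder values;
 * uv_write_global_mmr64() is the usual <asm/uv/uv_hub.h> accessor.
 */
static void uv_send_ipi_example(int pnode, unsigned long apicid, int vector)
{
	union uvh_ipi_int_u ipi;

	ipi.v = 0;
	ipi.s.vector_ = vector;
	ipi.s.delivery_mode = 0;	/* fixed */
	ipi.s.destmode = 0;		/* physical */
	ipi.s.apic_id = apicid;
	ipi.s.send = 1;			/* writing 1 triggers the IPI */

	uv_write_global_mmr64(pnode, UVH_IPI_INT, ipi.v);
}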
/* ========================================================================= */
-/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST */
+/* UVH_NODE_ID */
/* ========================================================================= */
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_32 0x009c0
+#define UVH_NODE_ID 0x0UL
+
+/* UVH common defines*/
+#define UVH_NODE_ID_FORCE1_SHFT 0
+#define UVH_NODE_ID_FORCE1_MASK 0x0000000000000001UL
+#define UVH_NODE_ID_MANUFACTURER_SHFT 1
+#define UVH_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL
+#define UVH_NODE_ID_PART_NUMBER_SHFT 12
+#define UVH_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL
+#define UVH_NODE_ID_REVISION_SHFT 28
+#define UVH_NODE_ID_REVISION_MASK 0x00000000f0000000UL
+#define UVH_NODE_ID_NODE_ID_SHFT 32
+#define UVH_NODE_ID_NI_PORT_SHFT 57
+
+/* UVXH common defines */
+#define UVXH_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL
+#define UVXH_NODE_ID_NODES_PER_BIT_SHFT 50
+#define UVXH_NODE_ID_NODES_PER_BIT_MASK 0x01fc000000000000UL
+#define UVXH_NODE_ID_NI_PORT_MASK 0x3e00000000000000UL
+
+/* UVYH common defines */
+#define UVYH_NODE_ID_NODE_ID_MASK 0x0000007f00000000UL
+#define UVYH_NODE_ID_NI_PORT_MASK 0x7e00000000000000UL
+
+/* UV4 unique defines */
+#define UV4H_NODE_ID_ROUTER_SELECT_SHFT 48
+#define UV4H_NODE_ID_ROUTER_SELECT_MASK 0x0001000000000000UL
+#define UV4H_NODE_ID_RESERVED_2_SHFT 49
+#define UV4H_NODE_ID_RESERVED_2_MASK 0x0002000000000000UL
+
+/* UV3 unique defines */
+#define UV3H_NODE_ID_ROUTER_SELECT_SHFT 48
+#define UV3H_NODE_ID_ROUTER_SELECT_MASK 0x0001000000000000UL
+#define UV3H_NODE_ID_RESERVED_2_SHFT 49
+#define UV3H_NODE_ID_RESERVED_2_MASK 0x0002000000000000UL
+
+
+union uvh_node_id_u {
+ unsigned long v;
+
+ /* UVH common struct */
+ struct uvh_node_id_s {
+ unsigned long force1:1; /* RO */
+ unsigned long manufacturer:11; /* RO */
+ unsigned long part_number:16; /* RO */
+ unsigned long revision:4; /* RO */
+ unsigned long rsvd_32_63:32;
+ } s;
+
+ /* UVXH common struct */
+ struct uvxh_node_id_s {
+ unsigned long force1:1; /* RO */
+ unsigned long manufacturer:11; /* RO */
+ unsigned long part_number:16; /* RO */
+ unsigned long revision:4; /* RO */
+ unsigned long node_id:15; /* RW */
+ unsigned long rsvd_47_49:3;
+ unsigned long nodes_per_bit:7; /* RO */
+ unsigned long ni_port:5; /* RO */
+ unsigned long rsvd_62_63:2;
+ } sx;
+
+ /* UVYH common struct */
+ struct uvyh_node_id_s {
+ unsigned long force1:1; /* RO */
+ unsigned long manufacturer:11; /* RO */
+ unsigned long part_number:16; /* RO */
+ unsigned long revision:4; /* RO */
+ unsigned long node_id:7; /* RW */
+ unsigned long rsvd_39_56:18;
+ unsigned long ni_port:6; /* RO */
+ unsigned long rsvd_63:1;
+ } sy;
+
+ /* UV5 unique struct */
+ struct uv5h_node_id_s {
+ unsigned long force1:1; /* RO */
+ unsigned long manufacturer:11; /* RO */
+ unsigned long part_number:16; /* RO */
+ unsigned long revision:4; /* RO */
+ unsigned long node_id:7; /* RW */
+ unsigned long rsvd_39_56:18;
+ unsigned long ni_port:6; /* RO */
+ unsigned long rsvd_63:1;
+ } s5;
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT 4
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_SHFT 49
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_MASK 0x7ffe000000000000UL
+ /* UV4 unique struct */
+ struct uv4h_node_id_s {
+ unsigned long force1:1; /* RO */
+ unsigned long manufacturer:11; /* RO */
+ unsigned long part_number:16; /* RO */
+ unsigned long revision:4; /* RO */
+ unsigned long node_id:15; /* RW */
+ unsigned long rsvd_47:1;
+ unsigned long router_select:1; /* RO */
+ unsigned long rsvd_49:1;
+ unsigned long nodes_per_bit:7; /* RO */
+ unsigned long ni_port:5; /* RO */
+ unsigned long rsvd_62_63:2;
+ } s4;
-union uvh_lb_bau_intd_payload_queue_first_u {
- unsigned long v;
- struct uvh_lb_bau_intd_payload_queue_first_s {
- unsigned long rsvd_0_3: 4; /* */
- unsigned long address : 39; /* RW */
- unsigned long rsvd_43_48: 6; /* */
- unsigned long node_id : 14; /* RW */
- unsigned long rsvd_63 : 1; /* */
- } s;
+ /* UV3 unique struct */
+ struct uv3h_node_id_s {
+ unsigned long force1:1; /* RO */
+ unsigned long manufacturer:11; /* RO */
+ unsigned long part_number:16; /* RO */
+ unsigned long revision:4; /* RO */
+ unsigned long node_id:15; /* RW */
+ unsigned long rsvd_47:1;
+ unsigned long router_select:1; /* RO */
+ unsigned long rsvd_49:1;
+ unsigned long nodes_per_bit:7; /* RO */
+ unsigned long ni_port:5; /* RO */
+ unsigned long rsvd_62_63:2;
+ } s3;
+
+ /* UV2 unique struct */
+ struct uv2h_node_id_s {
+ unsigned long force1:1; /* RO */
+ unsigned long manufacturer:11; /* RO */
+ unsigned long part_number:16; /* RO */
+ unsigned long revision:4; /* RO */
+ unsigned long node_id:15; /* RW */
+ unsigned long rsvd_47_49:3;
+ unsigned long nodes_per_bit:7; /* RO */
+ unsigned long ni_port:5; /* RO */
+ unsigned long rsvd_62_63:2;
+ } s2;
};
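/*
 * Illustrative sketch, not part of the patch: the node_id field is 15 bits
 * wide on UV2..UV4 (sx view) but only 7 bits on UV5 (sy view), so the view
 * is chosen at run time with is_uv().  Assumes uv_read_local_mmr() from
 * <asm/uv/uv_hub.h>.
 */
static inline int uv_node_id_example(void)
{
	union uvh_node_id_u node;

	node.v = uv_read_local_mmr(UVH_NODE_ID);
	return is_uv(UV5) ? node.sy.node_id : node.sx.node_id;
}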
/* ========================================================================= */
-/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST */
+/* UVH_NODE_PRESENT_0 */
/* ========================================================================= */
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_32 0x009c8
+#define UVH_NODE_PRESENT_0 ( \
+ is_uv(UV5) ? 0x1400UL : \
+ 0)
+
+
+/* UVYH common defines */
+#define UVYH_NODE_PRESENT_0_NODES_SHFT 0
+#define UVYH_NODE_PRESENT_0_NODES_MASK 0xffffffffffffffffUL
+
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL
+union uvh_node_present_0_u {
+ unsigned long v;
-union uvh_lb_bau_intd_payload_queue_last_u {
- unsigned long v;
- struct uvh_lb_bau_intd_payload_queue_last_s {
- unsigned long rsvd_0_3: 4; /* */
- unsigned long address : 39; /* RW */
- unsigned long rsvd_43_63: 21; /* */
- } s;
+ /* UVH common struct */
+ struct uvh_node_present_0_s {
+ unsigned long nodes:64; /* RW */
+ } s;
+
+ /* UVYH common struct */
+ struct uvyh_node_present_0_s {
+ unsigned long nodes:64; /* RW */
+ } sy;
+
+ /* UV5 unique struct */
+ struct uv5h_node_present_0_s {
+ unsigned long nodes:64; /* RW */
+ } s5;
};
/* ========================================================================= */
-/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL */
+/* UVH_NODE_PRESENT_1 */
/* ========================================================================= */
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_32 0x009d0
+#define UVH_NODE_PRESENT_1 ( \
+ is_uv(UV5) ? 0x1408UL : \
+ 0)
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL
-union uvh_lb_bau_intd_payload_queue_tail_u {
- unsigned long v;
- struct uvh_lb_bau_intd_payload_queue_tail_s {
- unsigned long rsvd_0_3: 4; /* */
- unsigned long address : 39; /* RW */
- unsigned long rsvd_43_63: 21; /* */
- } s;
-};
+/* UVYH common defines */
+#define UVYH_NODE_PRESENT_1_NODES_SHFT 0
+#define UVYH_NODE_PRESENT_1_NODES_MASK 0xffffffffffffffffUL
-/* ========================================================================= */
-/* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE */
-/* ========================================================================= */
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_32 0x0a68
-
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_SHFT 1
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_MASK 0x0000000000000002UL
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_SHFT 2
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_MASK 0x0000000000000004UL
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_SHFT 3
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_MASK 0x0000000000000008UL
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_SHFT 4
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_MASK 0x0000000000000010UL
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_SHFT 5
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_MASK 0x0000000000000020UL
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_SHFT 6
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_MASK 0x0000000000000040UL
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_SHFT 7
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_MASK 0x0000000000000080UL
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_SHFT 8
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_MASK 0x0000000000000100UL
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_SHFT 9
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_MASK 0x0000000000000200UL
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_SHFT 10
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_MASK 0x0000000000000400UL
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_SHFT 11
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_MASK 0x0000000000000800UL
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_SHFT 12
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_MASK 0x0000000000001000UL
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_SHFT 13
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_MASK 0x0000000000002000UL
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_SHFT 14
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_SHFT 15
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_MASK 0x0000000000008000UL
-union uvh_lb_bau_intd_software_acknowledge_u {
- unsigned long v;
- struct uvh_lb_bau_intd_software_acknowledge_s {
- unsigned long pending_0 : 1; /* RW, W1C */
- unsigned long pending_1 : 1; /* RW, W1C */
- unsigned long pending_2 : 1; /* RW, W1C */
- unsigned long pending_3 : 1; /* RW, W1C */
- unsigned long pending_4 : 1; /* RW, W1C */
- unsigned long pending_5 : 1; /* RW, W1C */
- unsigned long pending_6 : 1; /* RW, W1C */
- unsigned long pending_7 : 1; /* RW, W1C */
- unsigned long timeout_0 : 1; /* RW, W1C */
- unsigned long timeout_1 : 1; /* RW, W1C */
- unsigned long timeout_2 : 1; /* RW, W1C */
- unsigned long timeout_3 : 1; /* RW, W1C */
- unsigned long timeout_4 : 1; /* RW, W1C */
- unsigned long timeout_5 : 1; /* RW, W1C */
- unsigned long timeout_6 : 1; /* RW, W1C */
- unsigned long timeout_7 : 1; /* RW, W1C */
- unsigned long rsvd_16_63: 48; /* */
- } s;
+
+union uvh_node_present_1_u {
+ unsigned long v;
+
+ /* UVH common struct */
+ struct uvh_node_present_1_s {
+ unsigned long nodes:64; /* RW */
+ } s;
+
+ /* UVYH common struct */
+ struct uvyh_node_present_1_s {
+ unsigned long nodes:64; /* RW */
+ } sy;
+
+ /* UV5 unique struct */
+ struct uv5h_node_present_1_s {
+ unsigned long nodes:64; /* RW */
+ } s5;
};
/* ========================================================================= */
-/* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS */
+/* UVH_NODE_PRESENT_TABLE */
/* ========================================================================= */
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x0000000000320088UL
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0x0a70
+#define UVH_NODE_PRESENT_TABLE ( \
+ is_uv(UV4) ? 0x1400UL : \
+ is_uv(UV3) ? 0x1400UL : \
+ is_uv(UV2) ? 0x1400UL : \
+ 0)
+
+#define UVH_NODE_PRESENT_TABLE_DEPTH ( \
+ is_uv(UV4) ? 4 : \
+ is_uv(UV3) ? 16 : \
+ is_uv(UV2) ? 16 : \
+ 0)
+
+
+/* UVXH common defines */
+#define UVXH_NODE_PRESENT_TABLE_NODES_SHFT 0
+#define UVXH_NODE_PRESENT_TABLE_NODES_MASK 0xffffffffffffffffUL
+
+
+union uvh_node_present_table_u {
+ unsigned long v;
+
+ /* UVH common struct */
+ struct uvh_node_present_table_s {
+ unsigned long nodes:64; /* RW */
+ } s;
+
+ /* UVXH common struct */
+ struct uvxh_node_present_table_s {
+ unsigned long nodes:64; /* RW */
+ } sx;
+
+ /* UV4 unique struct */
+ struct uv4h_node_present_table_s {
+ unsigned long nodes:64; /* RW */
+ } s4;
+
+ /* UV3 unique struct */
+ struct uv3h_node_present_table_s {
+ unsigned long nodes:64; /* RW */
+ } s3;
+
+ /* UV2 unique struct */
+ struct uv2h_node_present_table_s {
+ unsigned long nodes:64; /* RW */
+ } s2;
+};
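Editorial aside, not part of the patch: the NODE_PRESENT registers above are plain 64-bit bitmaps, read one entry at a time up to the *_DEPTH count. A minimal sketch of how one entry decodes through the union-of-bitfields view, assuming a local copy of the union, a mock register value, and a standalone user-space build (in-kernel code would go through the UV hub MMR accessors instead):

/* Standalone sketch; the value 0x23 is a made-up MMR reading. */
#include <stdio.h>

union uvh_node_present_table_u {        /* local copy of the union above */
	unsigned long v;
	struct uvh_node_present_table_s {
		unsigned long nodes:64; /* RW */
	} s;
};

int main(void)
{
	union uvh_node_present_table_u entry = { .v = 0x23UL };
	int present = 0;

	/* Each set bit marks one present node in this table entry. */
	for (int bit = 0; bit < 64; bit++)
		if (entry.s.nodes & (1UL << bit))
			present++;

	printf("%d nodes present in this entry\n", present);
	return 0;
}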
/* ========================================================================= */
-/* UVH_LB_BAU_SB_ACTIVATION_CONTROL */
+/* UVH_RH10_GAM_ADDR_MAP_CONFIG */
/* ========================================================================= */
-#define UVH_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL
-#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_32 0x009a8
+#define UVH_RH10_GAM_ADDR_MAP_CONFIG ( \
+ is_uv(UV5) ? 0x470000UL : \
+ 0)
+
-#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_SHFT 0
-#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_MASK 0x000000000000003fUL
-#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT 62
-#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_MASK 0x4000000000000000UL
-#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_SHFT 63
-#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_MASK 0x8000000000000000UL
+/* UVYH common defines */
+#define UVYH_RH10_GAM_ADDR_MAP_CONFIG_N_SKT_SHFT 6
+#define UVYH_RH10_GAM_ADDR_MAP_CONFIG_N_SKT_MASK 0x00000000000001c0UL
+#define UVYH_RH10_GAM_ADDR_MAP_CONFIG_LS_ENABLE_SHFT 12
+#define UVYH_RH10_GAM_ADDR_MAP_CONFIG_LS_ENABLE_MASK 0x0000000000001000UL
+#define UVYH_RH10_GAM_ADDR_MAP_CONFIG_MK_TME_KEYID_BITS_SHFT 16
+#define UVYH_RH10_GAM_ADDR_MAP_CONFIG_MK_TME_KEYID_BITS_MASK 0x00000000000f0000UL
-union uvh_lb_bau_sb_activation_control_u {
- unsigned long v;
- struct uvh_lb_bau_sb_activation_control_s {
- unsigned long index : 6; /* RW */
- unsigned long rsvd_6_61: 56; /* */
- unsigned long push : 1; /* WP */
- unsigned long init : 1; /* WP */
- } s;
+
+union uvh_rh10_gam_addr_map_config_u {
+ unsigned long v;
+
+ /* UVH common struct */
+ struct uvh_rh10_gam_addr_map_config_s {
+ unsigned long undef_0_5:6; /* Undefined */
+ unsigned long n_skt:3; /* RW */
+ unsigned long undef_9_11:3; /* Undefined */
+ unsigned long ls_enable:1; /* RW */
+ unsigned long undef_13_15:3; /* Undefined */
+ unsigned long mk_tme_keyid_bits:4; /* RW */
+ unsigned long rsvd_20_63:44;
+ } s;
+
+ /* UVYH common struct */
+ struct uvyh_rh10_gam_addr_map_config_s {
+ unsigned long undef_0_5:6; /* Undefined */
+ unsigned long n_skt:3; /* RW */
+ unsigned long undef_9_11:3; /* Undefined */
+ unsigned long ls_enable:1; /* RW */
+ unsigned long undef_13_15:3; /* Undefined */
+ unsigned long mk_tme_keyid_bits:4; /* RW */
+ unsigned long rsvd_20_63:44;
+ } sy;
+
+ /* UV5 unique struct */
+ struct uv5h_rh10_gam_addr_map_config_s {
+ unsigned long undef_0_5:6; /* Undefined */
+ unsigned long n_skt:3; /* RW */
+ unsigned long undef_9_11:3; /* Undefined */
+ unsigned long ls_enable:1; /* RW */
+ unsigned long undef_13_15:3; /* Undefined */
+ unsigned long mk_tme_keyid_bits:4; /* RW */
+ } s5;
};
/* ========================================================================= */
-/* UVH_LB_BAU_SB_ACTIVATION_STATUS_0 */
+/* UVH_RH10_GAM_GRU_OVERLAY_CONFIG */
/* ========================================================================= */
-#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL
-#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x009b0
+#define UVH_RH10_GAM_GRU_OVERLAY_CONFIG ( \
+ is_uv(UV5) ? 0x4700b0UL : \
+ 0)
+
+
+/* UVYH common defines */
+#define UVYH_RH10_GAM_GRU_OVERLAY_CONFIG_BASE_SHFT 25
+#define UVYH_RH10_GAM_GRU_OVERLAY_CONFIG_BASE_MASK 0x000ffffffe000000UL
+#define UVYH_RH10_GAM_GRU_OVERLAY_CONFIG_N_GRU_SHFT 52
+#define UVYH_RH10_GAM_GRU_OVERLAY_CONFIG_N_GRU_MASK 0x0070000000000000UL
+#define UVYH_RH10_GAM_GRU_OVERLAY_CONFIG_ENABLE_SHFT 63
+#define UVYH_RH10_GAM_GRU_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
-#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_SHFT 0
-#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_MASK 0xffffffffffffffffUL
+#define UVH_RH10_GAM_GRU_OVERLAY_CONFIG_BASE_MASK ( \
+ is_uv(UV5) ? 0x000ffffffe000000UL : \
+ 0)
+#define UVH_RH10_GAM_GRU_OVERLAY_CONFIG_BASE_SHFT ( \
+ is_uv(UV5) ? 25 : \
+ -1)
-union uvh_lb_bau_sb_activation_status_0_u {
- unsigned long v;
- struct uvh_lb_bau_sb_activation_status_0_s {
- unsigned long status : 64; /* RW */
- } s;
+union uvh_rh10_gam_gru_overlay_config_u {
+ unsigned long v;
+
+ /* UVH common struct */
+ struct uvh_rh10_gam_gru_overlay_config_s {
+ unsigned long undef_0_24:25; /* Undefined */
+ unsigned long base:27; /* RW */
+ unsigned long n_gru:3; /* RW */
+ unsigned long undef_55_62:8; /* Undefined */
+ unsigned long enable:1; /* RW */
+ } s;
+
+ /* UVYH common struct */
+ struct uvyh_rh10_gam_gru_overlay_config_s {
+ unsigned long undef_0_24:25; /* Undefined */
+ unsigned long base:27; /* RW */
+ unsigned long n_gru:3; /* RW */
+ unsigned long undef_55_62:8; /* Undefined */
+ unsigned long enable:1; /* RW */
+ } sy;
+
+ /* UV5 unique struct */
+ struct uv5h_rh10_gam_gru_overlay_config_s {
+ unsigned long undef_0_24:25; /* Undefined */
+ unsigned long base:27; /* RW */
+ unsigned long n_gru:3; /* RW */
+ unsigned long undef_55_62:8; /* Undefined */
+ unsigned long enable:1; /* RW */
+ } s5;
};
/* ========================================================================= */
-/* UVH_LB_BAU_SB_ACTIVATION_STATUS_1 */
+/* UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG0 */
/* ========================================================================= */
-#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL
-#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x009b8
+#define UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG0 ( \
+ is_uv(UV5) ? 0x473000UL : \
+ 0)
+
+
+/* UVYH common defines */
+#define UVYH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_BASE_SHFT 26
+#define UVYH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_BASE_MASK 0x000ffffffc000000UL
+#define UVYH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_M_IO_SHFT 52
+#define UVYH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_M_IO_MASK 0x03f0000000000000UL
+#define UVYH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_ENABLE_SHFT 63
+#define UVYH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_ENABLE_MASK 0x8000000000000000UL
+
+#define UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_BASE_MASK ( \
+ is_uv(UV5) ? 0x000ffffffc000000UL : \
+ 0)
+#define UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_BASE_SHFT ( \
+ is_uv(UV5) ? 26 : \
+ -1)
-#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_SHFT 0
-#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_MASK 0xffffffffffffffffUL
+union uvh_rh10_gam_mmioh_overlay_config0_u {
+ unsigned long v;
-union uvh_lb_bau_sb_activation_status_1_u {
- unsigned long v;
- struct uvh_lb_bau_sb_activation_status_1_s {
- unsigned long status : 64; /* RW */
- } s;
+ /* UVH common struct */
+ struct uvh_rh10_gam_mmioh_overlay_config0_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:26; /* RW */
+ unsigned long m_io:6; /* RW */
+ unsigned long n_io:4;
+ unsigned long undef_62:1; /* Undefined */
+ unsigned long enable:1; /* RW */
+ } s;
+
+ /* UVYH common struct */
+ struct uvyh_rh10_gam_mmioh_overlay_config0_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:26; /* RW */
+ unsigned long m_io:6; /* RW */
+ unsigned long n_io:4;
+ unsigned long undef_62:1; /* Undefined */
+ unsigned long enable:1; /* RW */
+ } sy;
+
+ /* UV5 unique struct */
+ struct uv5h_rh10_gam_mmioh_overlay_config0_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:26; /* RW */
+ unsigned long m_io:6; /* RW */
+ unsigned long n_io:4;
+ unsigned long undef_62:1; /* Undefined */
+ unsigned long enable:1; /* RW */
+ } s5;
};
/* ========================================================================= */
-/* UVH_LB_BAU_SB_DESCRIPTOR_BASE */
+/* UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG1 */
/* ========================================================================= */
-#define UVH_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL
-#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_32 0x009a0
+#define UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG1 ( \
+ is_uv(UV5) ? 0x474000UL : \
+ 0)
+
+
+/* UVYH common defines */
+#define UVYH_RH10_GAM_MMIOH_OVERLAY_CONFIG1_BASE_SHFT 26
+#define UVYH_RH10_GAM_MMIOH_OVERLAY_CONFIG1_BASE_MASK 0x000ffffffc000000UL
+#define UVYH_RH10_GAM_MMIOH_OVERLAY_CONFIG1_M_IO_SHFT 52
+#define UVYH_RH10_GAM_MMIOH_OVERLAY_CONFIG1_M_IO_MASK 0x03f0000000000000UL
+#define UVYH_RH10_GAM_MMIOH_OVERLAY_CONFIG1_ENABLE_SHFT 63
+#define UVYH_RH10_GAM_MMIOH_OVERLAY_CONFIG1_ENABLE_MASK 0x8000000000000000UL
+
+#define UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG1_BASE_MASK ( \
+ is_uv(UV5) ? 0x000ffffffc000000UL : \
+ 0)
+#define UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG1_BASE_SHFT ( \
+ is_uv(UV5) ? 26 : \
+ -1)
+
+union uvh_rh10_gam_mmioh_overlay_config1_u {
+ unsigned long v;
+
+ /* UVH common struct */
+ struct uvh_rh10_gam_mmioh_overlay_config1_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:26; /* RW */
+ unsigned long m_io:6; /* RW */
+ unsigned long n_io:4;
+ unsigned long undef_62:1; /* Undefined */
+ unsigned long enable:1; /* RW */
+ } s;
-#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_SHFT 12
-#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL
-#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT 49
-#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK 0x7ffe000000000000UL
+ /* UVYH common struct */
+ struct uvyh_rh10_gam_mmioh_overlay_config1_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:26; /* RW */
+ unsigned long m_io:6; /* RW */
+ unsigned long n_io:4;
+ unsigned long undef_62:1; /* Undefined */
+ unsigned long enable:1; /* RW */
+ } sy;
-union uvh_lb_bau_sb_descriptor_base_u {
- unsigned long v;
- struct uvh_lb_bau_sb_descriptor_base_s {
- unsigned long rsvd_0_11 : 12; /* */
- unsigned long page_address : 31; /* RW */
- unsigned long rsvd_43_48 : 6; /* */
- unsigned long node_id : 14; /* RW */
- unsigned long rsvd_63 : 1; /* */
- } s;
+ /* UV5 unique struct */
+ struct uv5h_rh10_gam_mmioh_overlay_config1_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:26; /* RW */
+ unsigned long m_io:6; /* RW */
+ unsigned long n_io:4;
+ unsigned long undef_62:1; /* Undefined */
+ unsigned long enable:1; /* RW */
+ } s5;
};
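Editorial aside, not part of the patch: the *_SHFT/*_MASK defines are the mask-and-shift alternative to the bitfield views in the unions. A short sketch, assuming a standalone build and a mock register value, using the UVYH_RH10_GAM_MMIOH_OVERLAY_CONFIG0 values copied from the hunk above:

#include <stdio.h>

/* Copied from the defines added above. */
#define UVYH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_BASE_SHFT   26
#define UVYH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_BASE_MASK   0x000ffffffc000000UL
#define UVYH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_M_IO_SHFT   52
#define UVYH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_M_IO_MASK   0x03f0000000000000UL
#define UVYH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_ENABLE_MASK 0x8000000000000000UL

int main(void)
{
	/* Mock register contents: enable set, m_io = 0x1a, base = 0x100. */
	unsigned long v = UVYH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_ENABLE_MASK |
			  (0x1aUL << UVYH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_M_IO_SHFT) |
			  (0x100UL << UVYH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_BASE_SHFT);

	/* Field extraction is always (value & MASK) >> SHFT. */
	unsigned long base = (v & UVYH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_BASE_MASK) >>
			     UVYH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_BASE_SHFT;
	unsigned long m_io = (v & UVYH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_M_IO_MASK) >>
			     UVYH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_M_IO_SHFT;

	printf("base=0x%lx m_io=0x%lx\n", base, m_io);
	return 0;
}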
/* ========================================================================= */
-/* UVH_LB_MCAST_AOERR0_RPT_ENABLE */
-/* ========================================================================= */
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE 0x50b20UL
-
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_OBESE_MSG_SHFT 0
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_OBESE_MSG_MASK 0x0000000000000001UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_DATA_SB_ERR_SHFT 1
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_DATA_SB_ERR_MASK 0x0000000000000002UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_NACK_BUFF_PARITY_SHFT 2
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_NACK_BUFF_PARITY_MASK 0x0000000000000004UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_TIMEOUT_SHFT 3
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_TIMEOUT_MASK 0x0000000000000008UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_INACTIVE_REPLY_SHFT 4
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_INACTIVE_REPLY_MASK 0x0000000000000010UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_UPGRADE_ERROR_SHFT 5
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_UPGRADE_ERROR_MASK 0x0000000000000020UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REG_COUNT_UNDERFLOW_SHFT 6
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REG_COUNT_UNDERFLOW_MASK 0x0000000000000040UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REP_OBESE_MSG_SHFT 7
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REP_OBESE_MSG_MASK 0x0000000000000080UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_RUNT_MSG_SHFT 8
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_RUNT_MSG_MASK 0x0000000000000100UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_OBESE_MSG_SHFT 9
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_OBESE_MSG_MASK 0x0000000000000200UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_DATA_SB_ERR_SHFT 10
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_DATA_SB_ERR_MASK 0x0000000000000400UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_RUNT_MSG_SHFT 11
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_RUNT_MSG_MASK 0x0000000000000800UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_OBESE_MSG_SHFT 12
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_OBESE_MSG_MASK 0x0000000000001000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_DATA_SB_ERR_SHFT 13
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_DATA_SB_ERR_MASK 0x0000000000002000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_COMMAND_ERR_SHFT 14
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_COMMAND_ERR_MASK 0x0000000000004000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_PEND_TIMEOUT_SHFT 15
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_PEND_TIMEOUT_MASK 0x0000000000008000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_RUNT_MSG_SHFT 16
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_RUNT_MSG_MASK 0x0000000000010000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_OBESE_MSG_SHFT 17
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_OBESE_MSG_MASK 0x0000000000020000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_DATA_SB_ERR_SHFT 18
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_DATA_SB_ERR_MASK 0x0000000000040000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_RUNT_MSG_SHFT 19
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_RUNT_MSG_MASK 0x0000000000080000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_OBESE_MSG_SHFT 20
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_OBESE_MSG_MASK 0x0000000000100000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_DATA_SB_ERR_SHFT 21
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_DATA_SB_ERR_MASK 0x0000000000200000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_AMO_TIMEOUT_SHFT 22
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_AMO_TIMEOUT_MASK 0x0000000000400000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_PUT_TIMEOUT_SHFT 23
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_PUT_TIMEOUT_MASK 0x0000000000800000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_SPURIOUS_EVENT_SHFT 24
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_SPURIOUS_EVENT_MASK 0x0000000001000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IOH_DESTINATION_TABLE_PARITY_SHFT 25
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IOH_DESTINATION_TABLE_PARITY_MASK 0x0000000002000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_HAD_ERROR_REPLY_SHFT 26
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_HAD_ERROR_REPLY_MASK 0x0000000004000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_TIMEOUT_SHFT 27
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_TIMEOUT_MASK 0x0000000008000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_LOCK_MANAGER_HAD_ERROR_REPLY_SHFT 28
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_LOCK_MANAGER_HAD_ERROR_REPLY_MASK 0x0000000010000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_HAD_ERROR_REPLY_SHFT 29
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_HAD_ERROR_REPLY_MASK 0x0000000020000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_TIMEOUT_SHFT 30
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_TIMEOUT_MASK 0x0000000040000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SB_ACTIVATION_OVERRUN_SHFT 31
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SB_ACTIVATION_OVERRUN_MASK 0x0000000080000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_HAD_ERROR_REPLY_SHFT 32
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_HAD_ERROR_REPLY_MASK 0x0000000100000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_TIMEOUT_SHFT 33
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_TIMEOUT_MASK 0x0000000200000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_0_PARITY_SHFT 34
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_0_PARITY_MASK 0x0000000400000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_1_PARITY_SHFT 35
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_1_PARITY_MASK 0x0000000800000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SOCKET_DESTINATION_TABLE_PARITY_SHFT 36
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SOCKET_DESTINATION_TABLE_PARITY_MASK 0x0000001000000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_BAU_REPLY_PAYLOAD_CORRUPTION_SHFT 37
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_BAU_REPLY_PAYLOAD_CORRUPTION_MASK 0x0000002000000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IO_PORT_DESTINATION_TABLE_PARITY_SHFT 38
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IO_PORT_DESTINATION_TABLE_PARITY_MASK 0x0000004000000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INTD_SOFT_ACK_TIMEOUT_SHFT 39
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INTD_SOFT_ACK_TIMEOUT_MASK 0x0000008000000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_OBESE_MSG_SHFT 40
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_OBESE_MSG_MASK 0x0000010000000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_COMMAND_ERR_SHFT 41
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_COMMAND_ERR_MASK 0x0000020000000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_TIMEOUT_SHFT 42
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_TIMEOUT_MASK 0x0000040000000000UL
-
-union uvh_lb_mcast_aoerr0_rpt_enable_u {
- unsigned long v;
- struct uvh_lb_mcast_aoerr0_rpt_enable_s {
- unsigned long mcast_obese_msg : 1; /* RW */
- unsigned long mcast_data_sb_err : 1; /* RW */
- unsigned long mcast_nack_buff_parity : 1; /* RW */
- unsigned long mcast_timeout : 1; /* RW */
- unsigned long mcast_inactive_reply : 1; /* RW */
- unsigned long mcast_upgrade_error : 1; /* RW */
- unsigned long mcast_reg_count_underflow : 1; /* RW */
- unsigned long mcast_rep_obese_msg : 1; /* RW */
- unsigned long ucache_req_runt_msg : 1; /* RW */
- unsigned long ucache_req_obese_msg : 1; /* RW */
- unsigned long ucache_req_data_sb_err : 1; /* RW */
- unsigned long ucache_rep_runt_msg : 1; /* RW */
- unsigned long ucache_rep_obese_msg : 1; /* RW */
- unsigned long ucache_rep_data_sb_err : 1; /* RW */
- unsigned long ucache_rep_command_err : 1; /* RW */
- unsigned long ucache_pend_timeout : 1; /* RW */
- unsigned long macc_req_runt_msg : 1; /* RW */
- unsigned long macc_req_obese_msg : 1; /* RW */
- unsigned long macc_req_data_sb_err : 1; /* RW */
- unsigned long macc_rep_runt_msg : 1; /* RW */
- unsigned long macc_rep_obese_msg : 1; /* RW */
- unsigned long macc_rep_data_sb_err : 1; /* RW */
- unsigned long macc_amo_timeout : 1; /* RW */
- unsigned long macc_put_timeout : 1; /* RW */
- unsigned long macc_spurious_event : 1; /* RW */
- unsigned long ioh_destination_table_parity : 1; /* RW */
- unsigned long get_had_error_reply : 1; /* RW */
- unsigned long get_timeout : 1; /* RW */
- unsigned long lock_manager_had_error_reply : 1; /* RW */
- unsigned long put_had_error_reply : 1; /* RW */
- unsigned long put_timeout : 1; /* RW */
- unsigned long sb_activation_overrun : 1; /* RW */
- unsigned long completed_gb_activation_had_error_reply : 1; /* RW */
- unsigned long completed_gb_activation_timeout : 1; /* RW */
- unsigned long descriptor_buffer_0_parity : 1; /* RW */
- unsigned long descriptor_buffer_1_parity : 1; /* RW */
- unsigned long socket_destination_table_parity : 1; /* RW */
- unsigned long bau_reply_payload_corruption : 1; /* RW */
- unsigned long io_port_destination_table_parity : 1; /* RW */
- unsigned long intd_soft_ack_timeout : 1; /* RW */
- unsigned long int_rep_obese_msg : 1; /* RW */
- unsigned long int_rep_command_err : 1; /* RW */
- unsigned long int_timeout : 1; /* RW */
- unsigned long rsvd_43_63 : 21; /* */
- } s;
+/* UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG0 */
+/* ========================================================================= */
+#define UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG0 ( \
+ is_uv(UV5) ? 0x473800UL : \
+ 0)
+
+#define UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG0_DEPTH ( \
+ is_uv(UV5) ? 128 : \
+ 0)
+
+
+/* UVYH common defines */
+#define UVYH_RH10_GAM_MMIOH_REDIRECT_CONFIG0_NASID_SHFT 0
+#define UVYH_RH10_GAM_MMIOH_REDIRECT_CONFIG0_NASID_MASK 0x000000000000007fUL
+
+
+union uvh_rh10_gam_mmioh_redirect_config0_u {
+ unsigned long v;
+
+ /* UVH common struct */
+ struct uvh_rh10_gam_mmioh_redirect_config0_s {
+ unsigned long nasid:7; /* RW */
+ unsigned long rsvd_7_63:57;
+ } s;
+
+ /* UVYH common struct */
+ struct uvyh_rh10_gam_mmioh_redirect_config0_s {
+ unsigned long nasid:7; /* RW */
+ unsigned long rsvd_7_63:57;
+ } sy;
+
+ /* UV5 unique struct */
+ struct uv5h_rh10_gam_mmioh_redirect_config0_s {
+ unsigned long nasid:7; /* RW */
+ unsigned long rsvd_7_63:57;
+ } s5;
};
/* ========================================================================= */
-/* UVH_LOCAL_INT0_CONFIG */
-/* ========================================================================= */
-#define UVH_LOCAL_INT0_CONFIG 0x61000UL
-
-#define UVH_LOCAL_INT0_CONFIG_VECTOR_SHFT 0
-#define UVH_LOCAL_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_LOCAL_INT0_CONFIG_DM_SHFT 8
-#define UVH_LOCAL_INT0_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_LOCAL_INT0_CONFIG_DESTMODE_SHFT 11
-#define UVH_LOCAL_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_LOCAL_INT0_CONFIG_STATUS_SHFT 12
-#define UVH_LOCAL_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_LOCAL_INT0_CONFIG_P_SHFT 13
-#define UVH_LOCAL_INT0_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_LOCAL_INT0_CONFIG_T_SHFT 15
-#define UVH_LOCAL_INT0_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_LOCAL_INT0_CONFIG_M_SHFT 16
-#define UVH_LOCAL_INT0_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_LOCAL_INT0_CONFIG_APIC_ID_SHFT 32
-#define UVH_LOCAL_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
-
-union uvh_local_int0_config_u {
- unsigned long v;
- struct uvh_local_int0_config_s {
- unsigned long vector_ : 8; /* RW */
- unsigned long dm : 3; /* RW */
- unsigned long destmode : 1; /* RW */
- unsigned long status : 1; /* RO */
- unsigned long p : 1; /* RO */
- unsigned long rsvd_14 : 1; /* */
- unsigned long t : 1; /* RO */
- unsigned long m : 1; /* RW */
- unsigned long rsvd_17_31: 15; /* */
- unsigned long apic_id : 32; /* RW */
- } s;
+/* UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG1 */
+/* ========================================================================= */
+#define UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG1 ( \
+ is_uv(UV5) ? 0x474800UL : \
+ 0)
+
+#define UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG1_DEPTH ( \
+ is_uv(UV5) ? 128 : \
+ 0)
+
+
+/* UVYH common defines */
+#define UVYH_RH10_GAM_MMIOH_REDIRECT_CONFIG1_NASID_SHFT 0
+#define UVYH_RH10_GAM_MMIOH_REDIRECT_CONFIG1_NASID_MASK 0x000000000000007fUL
+
+
+union uvh_rh10_gam_mmioh_redirect_config1_u {
+ unsigned long v;
+
+ /* UVH common struct */
+ struct uvh_rh10_gam_mmioh_redirect_config1_s {
+ unsigned long nasid:7; /* RW */
+ unsigned long rsvd_7_63:57;
+ } s;
+
+ /* UVYH common struct */
+ struct uvyh_rh10_gam_mmioh_redirect_config1_s {
+ unsigned long nasid:7; /* RW */
+ unsigned long rsvd_7_63:57;
+ } sy;
+
+ /* UV5 unique struct */
+ struct uv5h_rh10_gam_mmioh_redirect_config1_s {
+ unsigned long nasid:7; /* RW */
+ unsigned long rsvd_7_63:57;
+ } s5;
};
/* ========================================================================= */
-/* UVH_LOCAL_INT0_ENABLE */
-/* ========================================================================= */
-#define UVH_LOCAL_INT0_ENABLE 0x65000UL
-
-#define UVH_LOCAL_INT0_ENABLE_LB_HCERR_SHFT 0
-#define UVH_LOCAL_INT0_ENABLE_LB_HCERR_MASK 0x0000000000000001UL
-#define UVH_LOCAL_INT0_ENABLE_GR0_HCERR_SHFT 1
-#define UVH_LOCAL_INT0_ENABLE_GR0_HCERR_MASK 0x0000000000000002UL
-#define UVH_LOCAL_INT0_ENABLE_GR1_HCERR_SHFT 2
-#define UVH_LOCAL_INT0_ENABLE_GR1_HCERR_MASK 0x0000000000000004UL
-#define UVH_LOCAL_INT0_ENABLE_LH_HCERR_SHFT 3
-#define UVH_LOCAL_INT0_ENABLE_LH_HCERR_MASK 0x0000000000000008UL
-#define UVH_LOCAL_INT0_ENABLE_RH_HCERR_SHFT 4
-#define UVH_LOCAL_INT0_ENABLE_RH_HCERR_MASK 0x0000000000000010UL
-#define UVH_LOCAL_INT0_ENABLE_XN_HCERR_SHFT 5
-#define UVH_LOCAL_INT0_ENABLE_XN_HCERR_MASK 0x0000000000000020UL
-#define UVH_LOCAL_INT0_ENABLE_SI_HCERR_SHFT 6
-#define UVH_LOCAL_INT0_ENABLE_SI_HCERR_MASK 0x0000000000000040UL
-#define UVH_LOCAL_INT0_ENABLE_LB_AOERR0_SHFT 7
-#define UVH_LOCAL_INT0_ENABLE_LB_AOERR0_MASK 0x0000000000000080UL
-#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR0_SHFT 8
-#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR0_MASK 0x0000000000000100UL
-#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR0_SHFT 9
-#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR0_MASK 0x0000000000000200UL
-#define UVH_LOCAL_INT0_ENABLE_LH_AOERR0_SHFT 10
-#define UVH_LOCAL_INT0_ENABLE_LH_AOERR0_MASK 0x0000000000000400UL
-#define UVH_LOCAL_INT0_ENABLE_RH_AOERR0_SHFT 11
-#define UVH_LOCAL_INT0_ENABLE_RH_AOERR0_MASK 0x0000000000000800UL
-#define UVH_LOCAL_INT0_ENABLE_XN_AOERR0_SHFT 12
-#define UVH_LOCAL_INT0_ENABLE_XN_AOERR0_MASK 0x0000000000001000UL
-#define UVH_LOCAL_INT0_ENABLE_SI_AOERR0_SHFT 13
-#define UVH_LOCAL_INT0_ENABLE_SI_AOERR0_MASK 0x0000000000002000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_AOERR1_SHFT 14
-#define UVH_LOCAL_INT0_ENABLE_LB_AOERR1_MASK 0x0000000000004000UL
-#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR1_SHFT 15
-#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR1_MASK 0x0000000000008000UL
-#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR1_SHFT 16
-#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR1_MASK 0x0000000000010000UL
-#define UVH_LOCAL_INT0_ENABLE_LH_AOERR1_SHFT 17
-#define UVH_LOCAL_INT0_ENABLE_LH_AOERR1_MASK 0x0000000000020000UL
-#define UVH_LOCAL_INT0_ENABLE_RH_AOERR1_SHFT 18
-#define UVH_LOCAL_INT0_ENABLE_RH_AOERR1_MASK 0x0000000000040000UL
-#define UVH_LOCAL_INT0_ENABLE_XN_AOERR1_SHFT 19
-#define UVH_LOCAL_INT0_ENABLE_XN_AOERR1_MASK 0x0000000000080000UL
-#define UVH_LOCAL_INT0_ENABLE_SI_AOERR1_SHFT 20
-#define UVH_LOCAL_INT0_ENABLE_SI_AOERR1_MASK 0x0000000000100000UL
-#define UVH_LOCAL_INT0_ENABLE_RH_VPI_INT_SHFT 21
-#define UVH_LOCAL_INT0_ENABLE_RH_VPI_INT_MASK 0x0000000000200000UL
-#define UVH_LOCAL_INT0_ENABLE_SYSTEM_SHUTDOWN_INT_SHFT 22
-#define UVH_LOCAL_INT0_ENABLE_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_0_SHFT 23
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_0_MASK 0x0000000000800000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_1_SHFT 24
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_1_MASK 0x0000000001000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_2_SHFT 25
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_2_MASK 0x0000000002000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_3_SHFT 26
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_3_MASK 0x0000000004000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_4_SHFT 27
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_4_MASK 0x0000000008000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_5_SHFT 28
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_5_MASK 0x0000000010000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_6_SHFT 29
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_6_MASK 0x0000000020000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_7_SHFT 30
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_7_MASK 0x0000000040000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_8_SHFT 31
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_8_MASK 0x0000000080000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_9_SHFT 32
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_9_MASK 0x0000000100000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_10_SHFT 33
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_10_MASK 0x0000000200000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_11_SHFT 34
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_11_MASK 0x0000000400000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_12_SHFT 35
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_12_MASK 0x0000000800000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_13_SHFT 36
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_13_MASK 0x0000001000000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_14_SHFT 37
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_14_MASK 0x0000002000000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_15_SHFT 38
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_15_MASK 0x0000004000000000UL
-#define UVH_LOCAL_INT0_ENABLE_L1_NMI_INT_SHFT 39
-#define UVH_LOCAL_INT0_ENABLE_L1_NMI_INT_MASK 0x0000008000000000UL
-#define UVH_LOCAL_INT0_ENABLE_STOP_CLOCK_SHFT 40
-#define UVH_LOCAL_INT0_ENABLE_STOP_CLOCK_MASK 0x0000010000000000UL
-#define UVH_LOCAL_INT0_ENABLE_ASIC_TO_L1_SHFT 41
-#define UVH_LOCAL_INT0_ENABLE_ASIC_TO_L1_MASK 0x0000020000000000UL
-#define UVH_LOCAL_INT0_ENABLE_L1_TO_ASIC_SHFT 42
-#define UVH_LOCAL_INT0_ENABLE_L1_TO_ASIC_MASK 0x0000040000000000UL
-#define UVH_LOCAL_INT0_ENABLE_LTC_INT_SHFT 43
-#define UVH_LOCAL_INT0_ENABLE_LTC_INT_MASK 0x0000080000000000UL
-#define UVH_LOCAL_INT0_ENABLE_LA_SEQ_TRIGGER_SHFT 44
-#define UVH_LOCAL_INT0_ENABLE_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL
-
-union uvh_local_int0_enable_u {
- unsigned long v;
- struct uvh_local_int0_enable_s {
- unsigned long lb_hcerr : 1; /* RW */
- unsigned long gr0_hcerr : 1; /* RW */
- unsigned long gr1_hcerr : 1; /* RW */
- unsigned long lh_hcerr : 1; /* RW */
- unsigned long rh_hcerr : 1; /* RW */
- unsigned long xn_hcerr : 1; /* RW */
- unsigned long si_hcerr : 1; /* RW */
- unsigned long lb_aoerr0 : 1; /* RW */
- unsigned long gr0_aoerr0 : 1; /* RW */
- unsigned long gr1_aoerr0 : 1; /* RW */
- unsigned long lh_aoerr0 : 1; /* RW */
- unsigned long rh_aoerr0 : 1; /* RW */
- unsigned long xn_aoerr0 : 1; /* RW */
- unsigned long si_aoerr0 : 1; /* RW */
- unsigned long lb_aoerr1 : 1; /* RW */
- unsigned long gr0_aoerr1 : 1; /* RW */
- unsigned long gr1_aoerr1 : 1; /* RW */
- unsigned long lh_aoerr1 : 1; /* RW */
- unsigned long rh_aoerr1 : 1; /* RW */
- unsigned long xn_aoerr1 : 1; /* RW */
- unsigned long si_aoerr1 : 1; /* RW */
- unsigned long rh_vpi_int : 1; /* RW */
- unsigned long system_shutdown_int : 1; /* RW */
- unsigned long lb_irq_int_0 : 1; /* RW */
- unsigned long lb_irq_int_1 : 1; /* RW */
- unsigned long lb_irq_int_2 : 1; /* RW */
- unsigned long lb_irq_int_3 : 1; /* RW */
- unsigned long lb_irq_int_4 : 1; /* RW */
- unsigned long lb_irq_int_5 : 1; /* RW */
- unsigned long lb_irq_int_6 : 1; /* RW */
- unsigned long lb_irq_int_7 : 1; /* RW */
- unsigned long lb_irq_int_8 : 1; /* RW */
- unsigned long lb_irq_int_9 : 1; /* RW */
- unsigned long lb_irq_int_10 : 1; /* RW */
- unsigned long lb_irq_int_11 : 1; /* RW */
- unsigned long lb_irq_int_12 : 1; /* RW */
- unsigned long lb_irq_int_13 : 1; /* RW */
- unsigned long lb_irq_int_14 : 1; /* RW */
- unsigned long lb_irq_int_15 : 1; /* RW */
- unsigned long l1_nmi_int : 1; /* RW */
- unsigned long stop_clock : 1; /* RW */
- unsigned long asic_to_l1 : 1; /* RW */
- unsigned long l1_to_asic : 1; /* RW */
- unsigned long ltc_int : 1; /* RW */
- unsigned long la_seq_trigger : 1; /* RW */
- unsigned long rsvd_45_63 : 19; /* */
- } s;
+/* UVH_RH10_GAM_MMR_OVERLAY_CONFIG */
+/* ========================================================================= */
+#define UVH_RH10_GAM_MMR_OVERLAY_CONFIG ( \
+ is_uv(UV5) ? 0x470090UL : \
+ 0)
+
+
+/* UVYH common defines */
+#define UVYH_RH10_GAM_MMR_OVERLAY_CONFIG_BASE_SHFT 25
+#define UVYH_RH10_GAM_MMR_OVERLAY_CONFIG_BASE_MASK 0x000ffffffe000000UL
+#define UVYH_RH10_GAM_MMR_OVERLAY_CONFIG_ENABLE_SHFT 63
+#define UVYH_RH10_GAM_MMR_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
+
+#define UVH_RH10_GAM_MMR_OVERLAY_CONFIG_BASE_MASK ( \
+ is_uv(UV5) ? 0x000ffffffe000000UL : \
+ 0)
+#define UVH_RH10_GAM_MMR_OVERLAY_CONFIG_BASE_SHFT ( \
+ is_uv(UV5) ? 25 : \
+ -1)
+
+union uvh_rh10_gam_mmr_overlay_config_u {
+ unsigned long v;
+
+ /* UVH common struct */
+ struct uvh_rh10_gam_mmr_overlay_config_s {
+ unsigned long undef_0_24:25; /* Undefined */
+ unsigned long base:27; /* RW */
+ unsigned long undef_52_62:11; /* Undefined */
+ unsigned long enable:1; /* RW */
+ } s;
+
+ /* UVYH common struct */
+ struct uvyh_rh10_gam_mmr_overlay_config_s {
+ unsigned long undef_0_24:25; /* Undefined */
+ unsigned long base:27; /* RW */
+ unsigned long undef_52_62:11; /* Undefined */
+ unsigned long enable:1; /* RW */
+ } sy;
+
+ /* UV5 unique struct */
+ struct uv5h_rh10_gam_mmr_overlay_config_s {
+ unsigned long undef_0_24:25; /* Undefined */
+ unsigned long base:27; /* RW */
+ unsigned long undef_52_62:11; /* Undefined */
+ unsigned long enable:1; /* RW */
+ } s5;
};
/* ========================================================================= */
-/* UVH_NODE_ID */
+/* UVH_RH_GAM_ADDR_MAP_CONFIG */
/* ========================================================================= */
-#define UVH_NODE_ID 0x0UL
+#define UVH_RH_GAM_ADDR_MAP_CONFIG ( \
+ is_uv(UV4) ? 0x480000UL : \
+ is_uv(UV3) ? 0x1600000UL : \
+ is_uv(UV2) ? 0x1600000UL : \
+ 0)
-#define UVH_NODE_ID_FORCE1_SHFT 0
-#define UVH_NODE_ID_FORCE1_MASK 0x0000000000000001UL
-#define UVH_NODE_ID_MANUFACTURER_SHFT 1
-#define UVH_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL
-#define UVH_NODE_ID_PART_NUMBER_SHFT 12
-#define UVH_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL
-#define UVH_NODE_ID_REVISION_SHFT 28
-#define UVH_NODE_ID_REVISION_MASK 0x00000000f0000000UL
-#define UVH_NODE_ID_NODE_ID_SHFT 32
-#define UVH_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL
-#define UVH_NODE_ID_NODES_PER_BIT_SHFT 48
-#define UVH_NODE_ID_NODES_PER_BIT_MASK 0x007f000000000000UL
-#define UVH_NODE_ID_NI_PORT_SHFT 56
-#define UVH_NODE_ID_NI_PORT_MASK 0x0f00000000000000UL
-union uvh_node_id_u {
- unsigned long v;
- struct uvh_node_id_s {
- unsigned long force1 : 1; /* RO */
- unsigned long manufacturer : 11; /* RO */
- unsigned long part_number : 16; /* RO */
- unsigned long revision : 4; /* RO */
- unsigned long node_id : 15; /* RW */
- unsigned long rsvd_47 : 1; /* */
- unsigned long nodes_per_bit : 7; /* RW */
- unsigned long rsvd_55 : 1; /* */
- unsigned long ni_port : 4; /* RO */
- unsigned long rsvd_60_63 : 4; /* */
- } s;
+/* UVXH common defines */
+#define UVXH_RH_GAM_ADDR_MAP_CONFIG_N_SKT_SHFT 6
+#define UVXH_RH_GAM_ADDR_MAP_CONFIG_N_SKT_MASK 0x00000000000003c0UL
+
+/* UV3 unique defines */
+#define UV3H_RH_GAM_ADDR_MAP_CONFIG_M_SKT_SHFT 0
+#define UV3H_RH_GAM_ADDR_MAP_CONFIG_M_SKT_MASK 0x000000000000003fUL
+
+/* UV2 unique defines */
+#define UV2H_RH_GAM_ADDR_MAP_CONFIG_M_SKT_SHFT 0
+#define UV2H_RH_GAM_ADDR_MAP_CONFIG_M_SKT_MASK 0x000000000000003fUL
+
+
+union uvh_rh_gam_addr_map_config_u {
+ unsigned long v;
+
+ /* UVH common struct */
+ struct uvh_rh_gam_addr_map_config_s {
+ unsigned long rsvd_0_5:6;
+ unsigned long n_skt:4; /* RW */
+ unsigned long rsvd_10_63:54;
+ } s;
+
+ /* UVXH common struct */
+ struct uvxh_rh_gam_addr_map_config_s {
+ unsigned long rsvd_0_5:6;
+ unsigned long n_skt:4; /* RW */
+ unsigned long rsvd_10_63:54;
+ } sx;
+
+ /* UV4 unique struct */
+ struct uv4h_rh_gam_addr_map_config_s {
+ unsigned long rsvd_0_5:6;
+ unsigned long n_skt:4; /* RW */
+ unsigned long rsvd_10_63:54;
+ } s4;
+
+ /* UV3 unique struct */
+ struct uv3h_rh_gam_addr_map_config_s {
+ unsigned long m_skt:6; /* RW */
+ unsigned long n_skt:4; /* RW */
+ unsigned long rsvd_10_63:54;
+ } s3;
+
+ /* UV2 unique struct */
+ struct uv2h_rh_gam_addr_map_config_s {
+ unsigned long m_skt:6; /* RW */
+ unsigned long n_skt:4; /* RW */
+ unsigned long rsvd_10_63:54;
+ } s2;
};
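Editorial aside, not part of the patch: the is_uv() ternary chains let one symbolic register name resolve to the correct per-generation MMR offset at run time. A minimal sketch of that selector pattern, with is_uv() and the hub type mocked for a standalone build (the real test lives in the UV hub headers; everything below except the offsets is an assumption for illustration):

#include <stdio.h>

/* Mock of the hub-generation test, only for this sketch. */
enum { UV2 = 1 << 0, UV3 = 1 << 1, UV4 = 1 << 2, UV5 = 1 << 3 };
static int mock_hub_type = UV4;		/* pretend we booted on UV4 */
#define is_uv(t)	(mock_hub_type & (t))

/* Selector copied from the hunk above: one name, per-generation offset. */
#define UVH_RH_GAM_ADDR_MAP_CONFIG (	\
	is_uv(UV4) ? 0x480000UL :	\
	is_uv(UV3) ? 0x1600000UL :	\
	is_uv(UV2) ? 0x1600000UL :	\
	0)

int main(void)
{
	printf("ADDR_MAP_CONFIG offset on this (mock) hub: 0x%lx\n",
	       UVH_RH_GAM_ADDR_MAP_CONFIG);
	return 0;
}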
/* ========================================================================= */
-/* UVH_NODE_PRESENT_TABLE */
+/* UVH_RH_GAM_ALIAS_0_OVERLAY_CONFIG */
/* ========================================================================= */
-#define UVH_NODE_PRESENT_TABLE 0x1400UL
-#define UVH_NODE_PRESENT_TABLE_DEPTH 16
+#define UVH_RH_GAM_ALIAS_0_OVERLAY_CONFIG ( \
+ is_uv(UV4) ? 0x4800c8UL : \
+ is_uv(UV3) ? 0x16000c8UL : \
+ is_uv(UV2) ? 0x16000c8UL : \
+ 0)
-#define UVH_NODE_PRESENT_TABLE_NODES_SHFT 0
-#define UVH_NODE_PRESENT_TABLE_NODES_MASK 0xffffffffffffffffUL
-union uvh_node_present_table_u {
- unsigned long v;
- struct uvh_node_present_table_s {
- unsigned long nodes : 64; /* RW */
- } s;
+/* UVXH common defines */
+#define UVXH_RH_GAM_ALIAS_0_OVERLAY_CONFIG_BASE_SHFT 24
+#define UVXH_RH_GAM_ALIAS_0_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL
+#define UVXH_RH_GAM_ALIAS_0_OVERLAY_CONFIG_M_ALIAS_SHFT 48
+#define UVXH_RH_GAM_ALIAS_0_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL
+#define UVXH_RH_GAM_ALIAS_0_OVERLAY_CONFIG_ENABLE_SHFT 63
+#define UVXH_RH_GAM_ALIAS_0_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
+
+
+union uvh_rh_gam_alias_0_overlay_config_u {
+ unsigned long v;
+
+ /* UVH common struct */
+ struct uvh_rh_gam_alias_0_overlay_config_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long base:8; /* RW */
+ unsigned long rsvd_32_47:16;
+ unsigned long m_alias:5; /* RW */
+ unsigned long rsvd_53_62:10;
+ unsigned long enable:1; /* RW */
+ } s;
+
+ /* UVXH common struct */
+ struct uvxh_rh_gam_alias_0_overlay_config_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long base:8; /* RW */
+ unsigned long rsvd_32_47:16;
+ unsigned long m_alias:5; /* RW */
+ unsigned long rsvd_53_62:10;
+ unsigned long enable:1; /* RW */
+ } sx;
+
+ /* UV4 unique struct */
+ struct uv4h_rh_gam_alias_0_overlay_config_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long base:8; /* RW */
+ unsigned long rsvd_32_47:16;
+ unsigned long m_alias:5; /* RW */
+ unsigned long rsvd_53_62:10;
+ unsigned long enable:1; /* RW */
+ } s4;
+
+ /* UV3 unique struct */
+ struct uv3h_rh_gam_alias_0_overlay_config_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long base:8; /* RW */
+ unsigned long rsvd_32_47:16;
+ unsigned long m_alias:5; /* RW */
+ unsigned long rsvd_53_62:10;
+ unsigned long enable:1; /* RW */
+ } s3;
+
+ /* UV2 unique struct */
+ struct uv2h_rh_gam_alias_0_overlay_config_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long base:8; /* RW */
+ unsigned long rsvd_32_47:16;
+ unsigned long m_alias:5; /* RW */
+ unsigned long rsvd_53_62:10;
+ unsigned long enable:1; /* RW */
+ } s2;
};
/* ========================================================================= */
-/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR */
+/* UVH_RH_GAM_ALIAS_0_REDIRECT_CONFIG */
/* ========================================================================= */
-#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL
+#define UVH_RH_GAM_ALIAS_0_REDIRECT_CONFIG ( \
+ is_uv(UV4) ? 0x4800d0UL : \
+ is_uv(UV3) ? 0x16000d0UL : \
+ is_uv(UV2) ? 0x16000d0UL : \
+ 0)
+
+
+/* UVXH common defines */
+#define UVXH_RH_GAM_ALIAS_0_REDIRECT_CONFIG_DEST_BASE_SHFT 24
+#define UVXH_RH_GAM_ALIAS_0_REDIRECT_CONFIG_DEST_BASE_MASK 0x00003fffff000000UL
+
+
+union uvh_rh_gam_alias_0_redirect_config_u {
+ unsigned long v;
-#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24
-#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL
+ /* UVH common struct */
+ struct uvh_rh_gam_alias_0_redirect_config_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long dest_base:22; /* RW */
+ unsigned long rsvd_46_63:18;
+ } s;
-union uvh_rh_gam_alias210_redirect_config_0_mmr_u {
- unsigned long v;
- struct uvh_rh_gam_alias210_redirect_config_0_mmr_s {
- unsigned long rsvd_0_23 : 24; /* */
- unsigned long dest_base : 22; /* RW */
- unsigned long rsvd_46_63: 18; /* */
- } s;
+ /* UVXH common struct */
+ struct uvxh_rh_gam_alias_0_redirect_config_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long dest_base:22; /* RW */
+ unsigned long rsvd_46_63:18;
+ } sx;
+
+ /* UV4 unique struct */
+ struct uv4h_rh_gam_alias_0_redirect_config_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long dest_base:22; /* RW */
+ unsigned long rsvd_46_63:18;
+ } s4;
+
+ /* UV3 unique struct */
+ struct uv3h_rh_gam_alias_0_redirect_config_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long dest_base:22; /* RW */
+ unsigned long rsvd_46_63:18;
+ } s3;
+
+ /* UV2 unique struct */
+ struct uv2h_rh_gam_alias_0_redirect_config_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long dest_base:22; /* RW */
+ unsigned long rsvd_46_63:18;
+ } s2;
};
/* ========================================================================= */
-/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR */
+/* UVH_RH_GAM_ALIAS_1_OVERLAY_CONFIG */
/* ========================================================================= */
-#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL
+#define UVH_RH_GAM_ALIAS_1_OVERLAY_CONFIG ( \
+ is_uv(UV4) ? 0x4800d8UL : \
+ is_uv(UV3) ? 0x16000d8UL : \
+ is_uv(UV2) ? 0x16000d8UL : \
+ 0)
+
+
+/* UVXH common defines */
+#define UVXH_RH_GAM_ALIAS_1_OVERLAY_CONFIG_BASE_SHFT 24
+#define UVXH_RH_GAM_ALIAS_1_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL
+#define UVXH_RH_GAM_ALIAS_1_OVERLAY_CONFIG_M_ALIAS_SHFT 48
+#define UVXH_RH_GAM_ALIAS_1_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL
+#define UVXH_RH_GAM_ALIAS_1_OVERLAY_CONFIG_ENABLE_SHFT 63
+#define UVXH_RH_GAM_ALIAS_1_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
+
-#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24
-#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL
+union uvh_rh_gam_alias_1_overlay_config_u {
+ unsigned long v;
-union uvh_rh_gam_alias210_redirect_config_1_mmr_u {
- unsigned long v;
- struct uvh_rh_gam_alias210_redirect_config_1_mmr_s {
- unsigned long rsvd_0_23 : 24; /* */
- unsigned long dest_base : 22; /* RW */
- unsigned long rsvd_46_63: 18; /* */
- } s;
+ /* UVH common struct */
+ struct uvh_rh_gam_alias_1_overlay_config_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long base:8; /* RW */
+ unsigned long rsvd_32_47:16;
+ unsigned long m_alias:5; /* RW */
+ unsigned long rsvd_53_62:10;
+ unsigned long enable:1; /* RW */
+ } s;
+
+ /* UVXH common struct */
+ struct uvxh_rh_gam_alias_1_overlay_config_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long base:8; /* RW */
+ unsigned long rsvd_32_47:16;
+ unsigned long m_alias:5; /* RW */
+ unsigned long rsvd_53_62:10;
+ unsigned long enable:1; /* RW */
+ } sx;
+
+ /* UV4 unique struct */
+ struct uv4h_rh_gam_alias_1_overlay_config_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long base:8; /* RW */
+ unsigned long rsvd_32_47:16;
+ unsigned long m_alias:5; /* RW */
+ unsigned long rsvd_53_62:10;
+ unsigned long enable:1; /* RW */
+ } s4;
+
+ /* UV3 unique struct */
+ struct uv3h_rh_gam_alias_1_overlay_config_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long base:8; /* RW */
+ unsigned long rsvd_32_47:16;
+ unsigned long m_alias:5; /* RW */
+ unsigned long rsvd_53_62:10;
+ unsigned long enable:1; /* RW */
+ } s3;
+
+ /* UV2 unique struct */
+ struct uv2h_rh_gam_alias_1_overlay_config_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long base:8; /* RW */
+ unsigned long rsvd_32_47:16;
+ unsigned long m_alias:5; /* RW */
+ unsigned long rsvd_53_62:10;
+ unsigned long enable:1; /* RW */
+ } s2;
};
/* ========================================================================= */
-/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR */
+/* UVH_RH_GAM_ALIAS_1_REDIRECT_CONFIG */
/* ========================================================================= */
-#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL
+#define UVH_RH_GAM_ALIAS_1_REDIRECT_CONFIG ( \
+ is_uv(UV4) ? 0x4800e0UL : \
+ is_uv(UV3) ? 0x16000e0UL : \
+ is_uv(UV2) ? 0x16000e0UL : \
+ 0)
+
+
+/* UVXH common defines */
+#define UVXH_RH_GAM_ALIAS_1_REDIRECT_CONFIG_DEST_BASE_SHFT 24
+#define UVXH_RH_GAM_ALIAS_1_REDIRECT_CONFIG_DEST_BASE_MASK 0x00003fffff000000UL
+
-#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24
-#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL
+union uvh_rh_gam_alias_1_redirect_config_u {
+ unsigned long v;
-union uvh_rh_gam_alias210_redirect_config_2_mmr_u {
- unsigned long v;
- struct uvh_rh_gam_alias210_redirect_config_2_mmr_s {
- unsigned long rsvd_0_23 : 24; /* */
- unsigned long dest_base : 22; /* RW */
- unsigned long rsvd_46_63: 18; /* */
- } s;
+ /* UVH common struct */
+ struct uvh_rh_gam_alias_1_redirect_config_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long dest_base:22; /* RW */
+ unsigned long rsvd_46_63:18;
+ } s;
+
+ /* UVXH common struct */
+ struct uvxh_rh_gam_alias_1_redirect_config_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long dest_base:22; /* RW */
+ unsigned long rsvd_46_63:18;
+ } sx;
+
+ /* UV4 unique struct */
+ struct uv4h_rh_gam_alias_1_redirect_config_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long dest_base:22; /* RW */
+ unsigned long rsvd_46_63:18;
+ } s4;
+
+ /* UV3 unique struct */
+ struct uv3h_rh_gam_alias_1_redirect_config_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long dest_base:22; /* RW */
+ unsigned long rsvd_46_63:18;
+ } s3;
+
+ /* UV2 unique struct */
+ struct uv2h_rh_gam_alias_1_redirect_config_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long dest_base:22; /* RW */
+ unsigned long rsvd_46_63:18;
+ } s2;
};
/* ========================================================================= */
-/* UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR */
+/* UVH_RH_GAM_ALIAS_2_OVERLAY_CONFIG */
/* ========================================================================= */
-#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR 0x1600020UL
+#define UVH_RH_GAM_ALIAS_2_OVERLAY_CONFIG ( \
+ is_uv(UV4) ? 0x4800e8UL : \
+ is_uv(UV3) ? 0x16000e8UL : \
+ is_uv(UV2) ? 0x16000e8UL : \
+ 0)
+
+
+/* UVXH common defines */
+#define UVXH_RH_GAM_ALIAS_2_OVERLAY_CONFIG_BASE_SHFT 24
+#define UVXH_RH_GAM_ALIAS_2_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL
+#define UVXH_RH_GAM_ALIAS_2_OVERLAY_CONFIG_M_ALIAS_SHFT 48
+#define UVXH_RH_GAM_ALIAS_2_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL
+#define UVXH_RH_GAM_ALIAS_2_OVERLAY_CONFIG_ENABLE_SHFT 63
+#define UVXH_RH_GAM_ALIAS_2_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
-#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_SHFT 26
-#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
-#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
-#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
-union uvh_rh_gam_cfg_overlay_config_mmr_u {
- unsigned long v;
- struct uvh_rh_gam_cfg_overlay_config_mmr_s {
- unsigned long rsvd_0_25: 26; /* */
- unsigned long base : 20; /* RW */
- unsigned long rsvd_46_62: 17; /* */
- unsigned long enable : 1; /* RW */
- } s;
+union uvh_rh_gam_alias_2_overlay_config_u {
+ unsigned long v;
+
+ /* UVH common struct */
+ struct uvh_rh_gam_alias_2_overlay_config_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long base:8; /* RW */
+ unsigned long rsvd_32_47:16;
+ unsigned long m_alias:5; /* RW */
+ unsigned long rsvd_53_62:10;
+ unsigned long enable:1; /* RW */
+ } s;
+
+ /* UVXH common struct */
+ struct uvxh_rh_gam_alias_2_overlay_config_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long base:8; /* RW */
+ unsigned long rsvd_32_47:16;
+ unsigned long m_alias:5; /* RW */
+ unsigned long rsvd_53_62:10;
+ unsigned long enable:1; /* RW */
+ } sx;
+
+ /* UV4 unique struct */
+ struct uv4h_rh_gam_alias_2_overlay_config_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long base:8; /* RW */
+ unsigned long rsvd_32_47:16;
+ unsigned long m_alias:5; /* RW */
+ unsigned long rsvd_53_62:10;
+ unsigned long enable:1; /* RW */
+ } s4;
+
+ /* UV3 unique struct */
+ struct uv3h_rh_gam_alias_2_overlay_config_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long base:8; /* RW */
+ unsigned long rsvd_32_47:16;
+ unsigned long m_alias:5; /* RW */
+ unsigned long rsvd_53_62:10;
+ unsigned long enable:1; /* RW */
+ } s3;
+
+ /* UV2 unique struct */
+ struct uv2h_rh_gam_alias_2_overlay_config_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long base:8; /* RW */
+ unsigned long rsvd_32_47:16;
+ unsigned long m_alias:5; /* RW */
+ unsigned long rsvd_53_62:10;
+ unsigned long enable:1; /* RW */
+ } s2;
};
/* ========================================================================= */
-/* UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR */
-/* ========================================================================= */
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL
-
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_SHFT 48
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_MASK 0x0001000000000000UL
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
-
-union uvh_rh_gam_gru_overlay_config_mmr_u {
- unsigned long v;
- struct uvh_rh_gam_gru_overlay_config_mmr_s {
- unsigned long rsvd_0_27: 28; /* */
- unsigned long base : 18; /* RW */
- unsigned long rsvd_46_47: 2; /* */
- unsigned long gr4 : 1; /* RW */
- unsigned long rsvd_49_51: 3; /* */
- unsigned long n_gru : 4; /* RW */
- unsigned long rsvd_56_62: 7; /* */
- unsigned long enable : 1; /* RW */
- } s;
+/* UVH_RH_GAM_ALIAS_2_REDIRECT_CONFIG */
+/* ========================================================================= */
+#define UVH_RH_GAM_ALIAS_2_REDIRECT_CONFIG ( \
+ is_uv(UV4) ? 0x4800f0UL : \
+ is_uv(UV3) ? 0x16000f0UL : \
+ is_uv(UV2) ? 0x16000f0UL : \
+ 0)
+
+
+/* UVXH common defines */
+#define UVXH_RH_GAM_ALIAS_2_REDIRECT_CONFIG_DEST_BASE_SHFT 24
+#define UVXH_RH_GAM_ALIAS_2_REDIRECT_CONFIG_DEST_BASE_MASK 0x00003fffff000000UL
+
+
+union uvh_rh_gam_alias_2_redirect_config_u {
+ unsigned long v;
+
+ /* UVH common struct */
+ struct uvh_rh_gam_alias_2_redirect_config_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long dest_base:22; /* RW */
+ unsigned long rsvd_46_63:18;
+ } s;
+
+ /* UVXH common struct */
+ struct uvxh_rh_gam_alias_2_redirect_config_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long dest_base:22; /* RW */
+ unsigned long rsvd_46_63:18;
+ } sx;
+
+ /* UV4 unique struct */
+ struct uv4h_rh_gam_alias_2_redirect_config_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long dest_base:22; /* RW */
+ unsigned long rsvd_46_63:18;
+ } s4;
+
+ /* UV3 unique struct */
+ struct uv3h_rh_gam_alias_2_redirect_config_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long dest_base:22; /* RW */
+ unsigned long rsvd_46_63:18;
+ } s3;
+
+ /* UV2 unique struct */
+ struct uv2h_rh_gam_alias_2_redirect_config_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long dest_base:22; /* RW */
+ unsigned long rsvd_46_63:18;
+ } s2;
};
/* ========================================================================= */
-/* UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR */
-/* ========================================================================= */
-#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL
-
-#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 30
-#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003fffc0000000UL
-#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46
-#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL
-#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52
-#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL
-#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
-#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
-
-union uvh_rh_gam_mmioh_overlay_config_mmr_u {
- unsigned long v;
- struct uvh_rh_gam_mmioh_overlay_config_mmr_s {
- unsigned long rsvd_0_29: 30; /* */
- unsigned long base : 16; /* RW */
- unsigned long m_io : 6; /* RW */
- unsigned long n_io : 4; /* RW */
- unsigned long rsvd_56_62: 7; /* */
- unsigned long enable : 1; /* RW */
- } s;
+/* UVH_RH_GAM_GRU_OVERLAY_CONFIG */
+/* ========================================================================= */
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG ( \
+ is_uv(UV4) ? 0x480010UL : \
+ is_uv(UV3) ? 0x1600010UL : \
+ is_uv(UV2) ? 0x1600010UL : \
+ 0)
+
+
+/* UVXH common defines */
+#define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_N_GRU_SHFT 52
+#define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_N_GRU_MASK 0x00f0000000000000UL
+#define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_ENABLE_SHFT 63
+#define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
+
+/* UV4A unique defines */
+#define UV4AH_RH_GAM_GRU_OVERLAY_CONFIG_BASE_SHFT 26
+#define UV4AH_RH_GAM_GRU_OVERLAY_CONFIG_BASE_MASK 0x000ffffffc000000UL
+
+/* UV4 unique defines */
+#define UV4H_RH_GAM_GRU_OVERLAY_CONFIG_BASE_SHFT 26
+#define UV4H_RH_GAM_GRU_OVERLAY_CONFIG_BASE_MASK 0x00003ffffc000000UL
+
+/* UV3 unique defines */
+#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_BASE_SHFT 28
+#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_BASE_MASK 0x00003ffff0000000UL
+#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MODE_SHFT 62
+#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MODE_MASK 0x4000000000000000UL
+
+/* UV2 unique defines */
+#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_BASE_SHFT 28
+#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_BASE_MASK 0x00003ffff0000000UL
+
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_BASE_MASK ( \
+ is_uv(UV4A) ? 0x000ffffffc000000UL : \
+ is_uv(UV4) ? 0x00003ffffc000000UL : \
+ is_uv(UV3) ? 0x00003ffff0000000UL : \
+ is_uv(UV2) ? 0x00003ffff0000000UL : \
+ 0)
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_BASE_SHFT ( \
+ is_uv(UV4) ? 26 : \
+ is_uv(UV3) ? 28 : \
+ is_uv(UV2) ? 28 : \
+ -1)
+
+union uvh_rh_gam_gru_overlay_config_u {
+ unsigned long v;
+
+ /* UVH common struct */
+ struct uvh_rh_gam_gru_overlay_config_s {
+ unsigned long rsvd_0_45:46;
+ unsigned long rsvd_46_51:6;
+ unsigned long n_gru:4; /* RW */
+ unsigned long rsvd_56_62:7;
+ unsigned long enable:1; /* RW */
+ } s;
+
+ /* UVXH common struct */
+ struct uvxh_rh_gam_gru_overlay_config_s {
+ unsigned long rsvd_0_45:46;
+ unsigned long rsvd_46_51:6;
+ unsigned long n_gru:4; /* RW */
+ unsigned long rsvd_56_62:7;
+ unsigned long enable:1; /* RW */
+ } sx;
+
+ /* UV4A unique struct */
+ struct uv4ah_rh_gam_gru_overlay_config_s {
+ unsigned long rsvd_0_24:25;
+ unsigned long undef_25:1; /* Undefined */
+ unsigned long base:26; /* RW */
+ unsigned long n_gru:4; /* RW */
+ unsigned long rsvd_56_62:7;
+ unsigned long enable:1; /* RW */
+ } s4a;
+
+ /* UV4 unique struct */
+ struct uv4h_rh_gam_gru_overlay_config_s {
+ unsigned long rsvd_0_24:25;
+ unsigned long undef_25:1; /* Undefined */
+ unsigned long base:20; /* RW */
+ unsigned long rsvd_46_51:6;
+ unsigned long n_gru:4; /* RW */
+ unsigned long rsvd_56_62:7;
+ unsigned long enable:1; /* RW */
+ } s4;
+
+ /* UV3 unique struct */
+ struct uv3h_rh_gam_gru_overlay_config_s {
+ unsigned long rsvd_0_27:28;
+ unsigned long base:18; /* RW */
+ unsigned long rsvd_46_51:6;
+ unsigned long n_gru:4; /* RW */
+ unsigned long rsvd_56_61:6;
+ unsigned long mode:1; /* RW */
+ unsigned long enable:1; /* RW */
+ } s3;
+
+ /* UV2 unique struct */
+ struct uv2h_rh_gam_gru_overlay_config_s {
+ unsigned long rsvd_0_27:28;
+ unsigned long base:18; /* RW */
+ unsigned long rsvd_46_51:6;
+ unsigned long n_gru:4; /* RW */
+ unsigned long rsvd_56_62:7;
+ unsigned long enable:1; /* RW */
+ } s2;
};
/* ========================================================================= */
-/* UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR */
+/* UVH_RH_GAM_MMIOH_OVERLAY_CONFIG */
/* ========================================================================= */
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG ( \
+ is_uv(UV2) ? 0x1600030UL : \
+ 0)
+
+
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_SHFT 46
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_MASK 0x0000400000000000UL
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+/* UV2 unique defines */
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_BASE_SHFT 27
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_BASE_MASK 0x00003ffff8000000UL
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_M_IO_SHFT 46
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_M_IO_MASK 0x000fc00000000000UL
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_N_IO_SHFT 52
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_N_IO_MASK 0x00f0000000000000UL
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_ENABLE_SHFT 63
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
-union uvh_rh_gam_mmr_overlay_config_mmr_u {
- unsigned long v;
- struct uvh_rh_gam_mmr_overlay_config_mmr_s {
- unsigned long rsvd_0_25: 26; /* */
- unsigned long base : 20; /* RW */
- unsigned long dual_hub : 1; /* RW */
- unsigned long rsvd_47_62: 16; /* */
- unsigned long enable : 1; /* RW */
- } s;
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_BASE_SHFT ( \
+ is_uv(UV2) ? 27 : \
+ uv_undefined("UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_BASE_SHFT"))
+
+union uvh_rh_gam_mmioh_overlay_config_u {
+ unsigned long v;
+
+ /* UVH common struct */
+ struct uvh_rh_gam_mmioh_overlay_config_s {
+ unsigned long rsvd_0_26:27;
+ unsigned long base:19; /* RW */
+ unsigned long m_io:6; /* RW */
+ unsigned long n_io:4; /* RW */
+ unsigned long rsvd_56_62:7;
+ unsigned long enable:1; /* RW */
+ } s;
+
+ /* UVXH common struct */
+ struct uvxh_rh_gam_mmioh_overlay_config_s {
+ unsigned long rsvd_0_26:27;
+ unsigned long base:19; /* RW */
+ unsigned long m_io:6; /* RW */
+ unsigned long n_io:4; /* RW */
+ unsigned long rsvd_56_62:7;
+ unsigned long enable:1; /* RW */
+ } sx;
+
+ /* UV2 unique struct */
+ struct uv2h_rh_gam_mmioh_overlay_config_s {
+ unsigned long rsvd_0_26:27;
+ unsigned long base:19; /* RW */
+ unsigned long m_io:6; /* RW */
+ unsigned long n_io:4; /* RW */
+ unsigned long rsvd_56_62:7;
+ unsigned long enable:1; /* RW */
+ } s2;
};
/* ========================================================================= */
-/* UVH_RTC */
+/* UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0 */
/* ========================================================================= */
-#define UVH_RTC 0x340000UL
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0 ( \
+ is_uv(UV4) ? 0x483000UL : \
+ is_uv(UV3) ? 0x1603000UL : \
+ 0)
-#define UVH_RTC_REAL_TIME_CLOCK_SHFT 0
-#define UVH_RTC_REAL_TIME_CLOCK_MASK 0x00ffffffffffffffUL
+/* UV4A unique defines */
+#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG0_BASE_SHFT 26
+#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG0_BASE_MASK 0x000ffffffc000000UL
+#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG0_M_IO_SHFT 52
+#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG0_M_IO_MASK 0x03f0000000000000UL
+#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG0_ENABLE_SHFT 63
+#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG0_ENABLE_MASK 0x8000000000000000UL
-union uvh_rtc_u {
- unsigned long v;
- struct uvh_rtc_s {
- unsigned long real_time_clock : 56; /* RW */
- unsigned long rsvd_56_63 : 8; /* */
- } s;
+/* UV4 unique defines */
+#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_BASE_SHFT 26
+#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_BASE_MASK 0x00003ffffc000000UL
+#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_M_IO_SHFT 46
+#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_M_IO_MASK 0x000fc00000000000UL
+#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_ENABLE_SHFT 63
+#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_ENABLE_MASK 0x8000000000000000UL
+
+/* UV3 unique defines */
+#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_BASE_SHFT 26
+#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_BASE_MASK 0x00003ffffc000000UL
+#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_M_IO_SHFT 46
+#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_M_IO_MASK 0x000fc00000000000UL
+#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_ENABLE_SHFT 63
+#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_ENABLE_MASK 0x8000000000000000UL
+
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_BASE_MASK ( \
+ is_uv(UV4A) ? 0x000ffffffc000000UL : \
+ is_uv(UV4) ? 0x00003ffffc000000UL : \
+ is_uv(UV3) ? 0x00003ffffc000000UL : \
+ 0)
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_BASE_SHFT ( \
+ is_uv(UV4) ? 26 : \
+ is_uv(UV3) ? 26 : \
+ -1)
+
+union uvh_rh_gam_mmioh_overlay_config0_u {
+ unsigned long v;
+
+ /* UVH common struct */
+ struct uvh_rh_gam_mmioh_overlay_config0_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:20; /* RW */
+ unsigned long m_io:6; /* RW */
+ unsigned long n_io:4;
+ unsigned long rsvd_56_62:7;
+ unsigned long enable:1; /* RW */
+ } s;
+
+ /* UVXH common struct */
+ struct uvxh_rh_gam_mmioh_overlay_config0_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:20; /* RW */
+ unsigned long m_io:6; /* RW */
+ unsigned long n_io:4;
+ unsigned long rsvd_56_62:7;
+ unsigned long enable:1; /* RW */
+ } sx;
+
+ /* UV4A unique struct */
+ struct uv4ah_rh_gam_mmioh_overlay_config0_mmr_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:26; /* RW */
+ unsigned long m_io:6; /* RW */
+ unsigned long n_io:4;
+ unsigned long undef_62:1; /* Undefined */
+ unsigned long enable:1; /* RW */
+ } s4a;
+
+ /* UV4 unique struct */
+ struct uv4h_rh_gam_mmioh_overlay_config0_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:20; /* RW */
+ unsigned long m_io:6; /* RW */
+ unsigned long n_io:4;
+ unsigned long rsvd_56_62:7;
+ unsigned long enable:1; /* RW */
+ } s4;
+
+ /* UV3 unique struct */
+ struct uv3h_rh_gam_mmioh_overlay_config0_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:20; /* RW */
+ unsigned long m_io:6; /* RW */
+ unsigned long n_io:4;
+ unsigned long rsvd_56_62:7;
+ unsigned long enable:1; /* RW */
+ } s3;
};
/* ========================================================================= */
-/* UVH_RTC1_INT_CONFIG */
+/* UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1 */
/* ========================================================================= */
-#define UVH_RTC1_INT_CONFIG 0x615c0UL
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1 ( \
+ is_uv(UV4) ? 0x484000UL : \
+ is_uv(UV3) ? 0x1604000UL : \
+ 0)
-#define UVH_RTC1_INT_CONFIG_VECTOR_SHFT 0
-#define UVH_RTC1_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_RTC1_INT_CONFIG_DM_SHFT 8
-#define UVH_RTC1_INT_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_RTC1_INT_CONFIG_DESTMODE_SHFT 11
-#define UVH_RTC1_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_RTC1_INT_CONFIG_STATUS_SHFT 12
-#define UVH_RTC1_INT_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_RTC1_INT_CONFIG_P_SHFT 13
-#define UVH_RTC1_INT_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_RTC1_INT_CONFIG_T_SHFT 15
-#define UVH_RTC1_INT_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_RTC1_INT_CONFIG_M_SHFT 16
-#define UVH_RTC1_INT_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_RTC1_INT_CONFIG_APIC_ID_SHFT 32
-#define UVH_RTC1_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+/* UV4A unique defines */
+#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG1_BASE_SHFT 26
+#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG1_BASE_MASK 0x000ffffffc000000UL
+#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG1_M_IO_SHFT 52
+#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG1_M_IO_MASK 0x03f0000000000000UL
+#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG1_ENABLE_SHFT 63
+#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG1_ENABLE_MASK 0x8000000000000000UL
-union uvh_rtc1_int_config_u {
- unsigned long v;
- struct uvh_rtc1_int_config_s {
- unsigned long vector_ : 8; /* RW */
- unsigned long dm : 3; /* RW */
- unsigned long destmode : 1; /* RW */
- unsigned long status : 1; /* RO */
- unsigned long p : 1; /* RO */
- unsigned long rsvd_14 : 1; /* */
- unsigned long t : 1; /* RO */
- unsigned long m : 1; /* RW */
- unsigned long rsvd_17_31: 15; /* */
- unsigned long apic_id : 32; /* RW */
- } s;
+/* UV4 unique defines */
+#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_BASE_SHFT 26
+#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_BASE_MASK 0x00003ffffc000000UL
+#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_M_IO_SHFT 46
+#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_M_IO_MASK 0x000fc00000000000UL
+#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_ENABLE_SHFT 63
+#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_ENABLE_MASK 0x8000000000000000UL
+
+/* UV3 unique defines */
+#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_BASE_SHFT 26
+#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_BASE_MASK 0x00003ffffc000000UL
+#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_M_IO_SHFT 46
+#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_M_IO_MASK 0x000fc00000000000UL
+#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_ENABLE_SHFT 63
+#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_ENABLE_MASK 0x8000000000000000UL
+
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_BASE_MASK ( \
+ is_uv(UV4A) ? 0x000ffffffc000000UL : \
+ is_uv(UV4) ? 0x00003ffffc000000UL : \
+ is_uv(UV3) ? 0x00003ffffc000000UL : \
+ 0)
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_BASE_SHFT ( \
+ is_uv(UV4) ? 26 : \
+ is_uv(UV3) ? 26 : \
+ -1)
+
+union uvh_rh_gam_mmioh_overlay_config1_u {
+ unsigned long v;
+
+ /* UVH common struct */
+ struct uvh_rh_gam_mmioh_overlay_config1_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:20; /* RW */
+ unsigned long m_io:6; /* RW */
+ unsigned long n_io:4;
+ unsigned long rsvd_56_62:7;
+ unsigned long enable:1; /* RW */
+ } s;
+
+ /* UVXH common struct */
+ struct uvxh_rh_gam_mmioh_overlay_config1_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:20; /* RW */
+ unsigned long m_io:6; /* RW */
+ unsigned long n_io:4;
+ unsigned long rsvd_56_62:7;
+ unsigned long enable:1; /* RW */
+ } sx;
+
+ /* UV4A unique struct */
+ struct uv4ah_rh_gam_mmioh_overlay_config1_mmr_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:26; /* RW */
+ unsigned long m_io:6; /* RW */
+ unsigned long n_io:4;
+ unsigned long undef_62:1; /* Undefined */
+ unsigned long enable:1; /* RW */
+ } s4a;
+
+ /* UV4 unique struct */
+ struct uv4h_rh_gam_mmioh_overlay_config1_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:20; /* RW */
+ unsigned long m_io:6; /* RW */
+ unsigned long n_io:4;
+ unsigned long rsvd_56_62:7;
+ unsigned long enable:1; /* RW */
+ } s4;
+
+ /* UV3 unique struct */
+ struct uv3h_rh_gam_mmioh_overlay_config1_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:20; /* RW */
+ unsigned long m_io:6; /* RW */
+ unsigned long n_io:4;
+ unsigned long rsvd_56_62:7;
+ unsigned long enable:1; /* RW */
+ } s3;
};
/* ========================================================================= */
-/* UVH_RTC2_INT_CONFIG */
-/* ========================================================================= */
-#define UVH_RTC2_INT_CONFIG 0x61600UL
-
-#define UVH_RTC2_INT_CONFIG_VECTOR_SHFT 0
-#define UVH_RTC2_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_RTC2_INT_CONFIG_DM_SHFT 8
-#define UVH_RTC2_INT_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_RTC2_INT_CONFIG_DESTMODE_SHFT 11
-#define UVH_RTC2_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_RTC2_INT_CONFIG_STATUS_SHFT 12
-#define UVH_RTC2_INT_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_RTC2_INT_CONFIG_P_SHFT 13
-#define UVH_RTC2_INT_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_RTC2_INT_CONFIG_T_SHFT 15
-#define UVH_RTC2_INT_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_RTC2_INT_CONFIG_M_SHFT 16
-#define UVH_RTC2_INT_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_RTC2_INT_CONFIG_APIC_ID_SHFT 32
-#define UVH_RTC2_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
-
-union uvh_rtc2_int_config_u {
- unsigned long v;
- struct uvh_rtc2_int_config_s {
- unsigned long vector_ : 8; /* RW */
- unsigned long dm : 3; /* RW */
- unsigned long destmode : 1; /* RW */
- unsigned long status : 1; /* RO */
- unsigned long p : 1; /* RO */
- unsigned long rsvd_14 : 1; /* */
- unsigned long t : 1; /* RO */
- unsigned long m : 1; /* RW */
- unsigned long rsvd_17_31: 15; /* */
- unsigned long apic_id : 32; /* RW */
- } s;
+/* UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0 */
+/* ========================================================================= */
+#define UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0 ( \
+ is_uv(UV4) ? 0x483800UL : \
+ is_uv(UV3) ? 0x1603800UL : \
+ 0)
+
+#define UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_DEPTH ( \
+ is_uv(UV4) ? 128 : \
+ is_uv(UV3) ? 128 : \
+ 0)
+
+/* UV4A unique defines */
+#define UV4AH_RH_GAM_MMIOH_REDIRECT_CONFIG0_NASID_SHFT 0
+#define UV4AH_RH_GAM_MMIOH_REDIRECT_CONFIG0_NASID_MASK 0x0000000000000fffUL
+
+/* UV4 unique defines */
+#define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG0_NASID_SHFT 0
+#define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG0_NASID_MASK 0x0000000000007fffUL
+
+/* UV3 unique defines */
+#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_NASID_SHFT 0
+#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_NASID_MASK 0x0000000000007fffUL
+
+/* UVH common defines */
+#define UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_NASID_MASK ( \
+ is_uv(UV4A) ? UV4AH_RH_GAM_MMIOH_REDIRECT_CONFIG0_NASID_MASK : \
+ is_uv(UV4) ? UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG0_NASID_MASK : \
+ is_uv(UV3) ? UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_NASID_MASK : \
+ 0)
+
+
+union uvh_rh_gam_mmioh_redirect_config0_u {
+ unsigned long v;
+
+ /* UVH common struct */
+ struct uvh_rh_gam_mmioh_redirect_config0_s {
+ unsigned long nasid:15; /* RW */
+ unsigned long rsvd_15_63:49;
+ } s;
+
+ /* UVXH common struct */
+ struct uvxh_rh_gam_mmioh_redirect_config0_s {
+ unsigned long nasid:15; /* RW */
+ unsigned long rsvd_15_63:49;
+ } sx;
+
+ struct uv4ah_rh_gam_mmioh_redirect_config0_s {
+ unsigned long nasid:12; /* RW */
+ unsigned long rsvd_12_63:52;
+ } s4a;
+
+ /* UV4 unique struct */
+ struct uv4h_rh_gam_mmioh_redirect_config0_s {
+ unsigned long nasid:15; /* RW */
+ unsigned long rsvd_15_63:49;
+ } s4;
+
+ /* UV3 unique struct */
+ struct uv3h_rh_gam_mmioh_redirect_config0_s {
+ unsigned long nasid:15; /* RW */
+ unsigned long rsvd_15_63:49;
+ } s3;
};
/* ========================================================================= */
-/* UVH_RTC3_INT_CONFIG */
-/* ========================================================================= */
-#define UVH_RTC3_INT_CONFIG 0x61640UL
-
-#define UVH_RTC3_INT_CONFIG_VECTOR_SHFT 0
-#define UVH_RTC3_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_RTC3_INT_CONFIG_DM_SHFT 8
-#define UVH_RTC3_INT_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_RTC3_INT_CONFIG_DESTMODE_SHFT 11
-#define UVH_RTC3_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_RTC3_INT_CONFIG_STATUS_SHFT 12
-#define UVH_RTC3_INT_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_RTC3_INT_CONFIG_P_SHFT 13
-#define UVH_RTC3_INT_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_RTC3_INT_CONFIG_T_SHFT 15
-#define UVH_RTC3_INT_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_RTC3_INT_CONFIG_M_SHFT 16
-#define UVH_RTC3_INT_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_RTC3_INT_CONFIG_APIC_ID_SHFT 32
-#define UVH_RTC3_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
-
-union uvh_rtc3_int_config_u {
- unsigned long v;
- struct uvh_rtc3_int_config_s {
- unsigned long vector_ : 8; /* RW */
- unsigned long dm : 3; /* RW */
- unsigned long destmode : 1; /* RW */
- unsigned long status : 1; /* RO */
- unsigned long p : 1; /* RO */
- unsigned long rsvd_14 : 1; /* */
- unsigned long t : 1; /* RO */
- unsigned long m : 1; /* RW */
- unsigned long rsvd_17_31: 15; /* */
- unsigned long apic_id : 32; /* RW */
- } s;
+/* UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1 */
+/* ========================================================================= */
+#define UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1 ( \
+ is_uv(UV4) ? 0x484800UL : \
+ is_uv(UV3) ? 0x1604800UL : \
+ 0)
+
+#define UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_DEPTH ( \
+ is_uv(UV4) ? 128 : \
+ is_uv(UV3) ? 128 : \
+ 0)
+
+/* UV4A unique defines */
+#define UV4AH_RH_GAM_MMIOH_REDIRECT_CONFIG1_NASID_SHFT 0
+#define UV4AH_RH_GAM_MMIOH_REDIRECT_CONFIG1_NASID_MASK 0x0000000000000fffUL
+
+/* UV4 unique defines */
+#define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG1_NASID_SHFT 0
+#define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG1_NASID_MASK 0x0000000000007fffUL
+
+/* UV3 unique defines */
+#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_NASID_SHFT 0
+#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_NASID_MASK 0x0000000000007fffUL
+
+/* UVH common defines */
+#define UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_NASID_MASK ( \
+ is_uv(UV4A) ? UV4AH_RH_GAM_MMIOH_REDIRECT_CONFIG1_NASID_MASK : \
+ is_uv(UV4) ? UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG1_NASID_MASK : \
+ is_uv(UV3) ? UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_NASID_MASK : \
+ 0)
+
+
+union uvh_rh_gam_mmioh_redirect_config1_u {
+ unsigned long v;
+
+ /* UVH common struct */
+ struct uvh_rh_gam_mmioh_redirect_config1_s {
+ unsigned long nasid:15; /* RW */
+ unsigned long rsvd_15_63:49;
+ } s;
+
+ /* UVXH common struct */
+ struct uvxh_rh_gam_mmioh_redirect_config1_s {
+ unsigned long nasid:15; /* RW */
+ unsigned long rsvd_15_63:49;
+ } sx;
+
+ struct uv4ah_rh_gam_mmioh_redirect_config1_s {
+ unsigned long nasid:12; /* RW */
+ unsigned long rsvd_12_63:52;
+ } s4a;
+
+ /* UV4 unique struct */
+ struct uv4h_rh_gam_mmioh_redirect_config1_s {
+ unsigned long nasid:15; /* RW */
+ unsigned long rsvd_15_63:49;
+ } s4;
+
+ /* UV3 unique struct */
+ struct uv3h_rh_gam_mmioh_redirect_config1_s {
+ unsigned long nasid:15; /* RW */
+ unsigned long rsvd_15_63:49;
+ } s3;
};
/* ========================================================================= */
-/* UVH_RTC_INC_RATIO */
+/* UVH_RH_GAM_MMR_OVERLAY_CONFIG */
/* ========================================================================= */
-#define UVH_RTC_INC_RATIO 0x350000UL
+#define UVH_RH_GAM_MMR_OVERLAY_CONFIG ( \
+ is_uv(UV4) ? 0x480028UL : \
+ is_uv(UV3) ? 0x1600028UL : \
+ is_uv(UV2) ? 0x1600028UL : \
+ 0)
+
+
+/* UVXH common defines */
+#define UVXH_RH_GAM_MMR_OVERLAY_CONFIG_BASE_SHFT 26
+#define UVXH_RH_GAM_MMR_OVERLAY_CONFIG_BASE_MASK ( \
+ is_uv(UV4A) ? 0x000ffffffc000000UL : \
+ is_uv(UV4) ? 0x00003ffffc000000UL : \
+ is_uv(UV3) ? 0x00003ffffc000000UL : \
+ is_uv(UV2) ? 0x00003ffffc000000UL : \
+ 0)
+#define UVXH_RH_GAM_MMR_OVERLAY_CONFIG_ENABLE_SHFT 63
+#define UVXH_RH_GAM_MMR_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
+
+/* UV4A unique defines */
+#define UV4AH_RH_GAM_GRU_OVERLAY_CONFIG_BASE_SHFT 26
+#define UV4AH_RH_GAM_GRU_OVERLAY_CONFIG_BASE_MASK 0x000ffffffc000000UL
+
+#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_BASE_MASK ( \
+ is_uv(UV4A) ? 0x000ffffffc000000UL : \
+ is_uv(UV4) ? 0x00003ffffc000000UL : \
+ is_uv(UV3) ? 0x00003ffffc000000UL : \
+ is_uv(UV2) ? 0x00003ffffc000000UL : \
+ 0)
+
+#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_BASE_SHFT ( \
+ is_uv(UV4) ? 26 : \
+ is_uv(UV3) ? 26 : \
+ is_uv(UV2) ? 26 : \
+ -1)
+
+union uvh_rh_gam_mmr_overlay_config_u {
+ unsigned long v;
-#define UVH_RTC_INC_RATIO_FRACTION_SHFT 0
-#define UVH_RTC_INC_RATIO_FRACTION_MASK 0x00000000000fffffUL
-#define UVH_RTC_INC_RATIO_RATIO_SHFT 20
-#define UVH_RTC_INC_RATIO_RATIO_MASK 0x0000000000700000UL
+ /* UVH common struct */
+ struct uvh_rh_gam_mmr_overlay_config_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:20; /* RW */
+ unsigned long rsvd_46_62:17;
+ unsigned long enable:1; /* RW */
+ } s;
-union uvh_rtc_inc_ratio_u {
- unsigned long v;
- struct uvh_rtc_inc_ratio_s {
- unsigned long fraction : 20; /* RW */
- unsigned long ratio : 3; /* RW */
- unsigned long rsvd_23_63: 41; /* */
- } s;
+ /* UVXH common struct */
+ struct uvxh_rh_gam_mmr_overlay_config_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:20; /* RW */
+ unsigned long rsvd_46_62:17;
+ unsigned long enable:1; /* RW */
+ } sx;
+
+ /* UV4 unique struct */
+ struct uv4h_rh_gam_mmr_overlay_config_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:20; /* RW */
+ unsigned long rsvd_46_62:17;
+ unsigned long enable:1; /* RW */
+ } s4;
+
+ /* UV3 unique struct */
+ struct uv3h_rh_gam_mmr_overlay_config_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:20; /* RW */
+ unsigned long rsvd_46_62:17;
+ unsigned long enable:1; /* RW */
+ } s3;
+
+ /* UV2 unique struct */
+ struct uv2h_rh_gam_mmr_overlay_config_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:20; /* RW */
+ unsigned long rsvd_46_62:17;
+ unsigned long enable:1; /* RW */
+ } s2;
};
/* ========================================================================= */
-/* UVH_SI_ADDR_MAP_CONFIG */
+/* UVH_RTC */
/* ========================================================================= */
-#define UVH_SI_ADDR_MAP_CONFIG 0xc80000UL
+#define UVH_RTC ( \
+ is_uv(UV5) ? 0xe0000UL : \
+ is_uv(UV4) ? 0xe0000UL : \
+ is_uv(UV3) ? 0x340000UL : \
+ is_uv(UV2) ? 0x340000UL : \
+ 0)
-#define UVH_SI_ADDR_MAP_CONFIG_M_SKT_SHFT 0
-#define UVH_SI_ADDR_MAP_CONFIG_M_SKT_MASK 0x000000000000003fUL
-#define UVH_SI_ADDR_MAP_CONFIG_N_SKT_SHFT 8
-#define UVH_SI_ADDR_MAP_CONFIG_N_SKT_MASK 0x0000000000000f00UL
+/* UVH common defines */
+#define UVH_RTC_REAL_TIME_CLOCK_SHFT 0
+#define UVH_RTC_REAL_TIME_CLOCK_MASK 0x00ffffffffffffffUL
+
+
+union uvh_rtc_u {
+ unsigned long v;
-union uvh_si_addr_map_config_u {
- unsigned long v;
- struct uvh_si_addr_map_config_s {
- unsigned long m_skt : 6; /* RW */
- unsigned long rsvd_6_7: 2; /* */
- unsigned long n_skt : 4; /* RW */
- unsigned long rsvd_12_63: 52; /* */
- } s;
+ /* UVH common struct */
+ struct uvh_rtc_s {
+ unsigned long real_time_clock:56; /* RW */
+ unsigned long rsvd_56_63:8;
+ } s;
+
+ /* UV5 unique struct */
+ struct uv5h_rtc_s {
+ unsigned long real_time_clock:56; /* RW */
+ unsigned long rsvd_56_63:8;
+ } s5;
+
+ /* UV4 unique struct */
+ struct uv4h_rtc_s {
+ unsigned long real_time_clock:56; /* RW */
+ unsigned long rsvd_56_63:8;
+ } s4;
+
+ /* UV3 unique struct */
+ struct uv3h_rtc_s {
+ unsigned long real_time_clock:56; /* RW */
+ unsigned long rsvd_56_63:8;
+ } s3;
+
+ /* UV2 unique struct */
+ struct uv2h_rtc_s {
+ unsigned long real_time_clock:56; /* RW */
+ unsigned long rsvd_56_63:8;
+ } s2;
};
/* ========================================================================= */
-/* UVH_SI_ALIAS0_OVERLAY_CONFIG */
+/* UVH_RTC1_INT_CONFIG */
/* ========================================================================= */
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG 0xc80008UL
+#define UVH_RTC1_INT_CONFIG 0x615c0UL
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG_BASE_SHFT 24
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG_M_ALIAS_SHFT 48
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG_ENABLE_SHFT 63
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
+/* UVH common defines */
+#define UVH_RTC1_INT_CONFIG_VECTOR_SHFT 0
+#define UVH_RTC1_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL
+#define UVH_RTC1_INT_CONFIG_DM_SHFT 8
+#define UVH_RTC1_INT_CONFIG_DM_MASK 0x0000000000000700UL
+#define UVH_RTC1_INT_CONFIG_DESTMODE_SHFT 11
+#define UVH_RTC1_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL
+#define UVH_RTC1_INT_CONFIG_STATUS_SHFT 12
+#define UVH_RTC1_INT_CONFIG_STATUS_MASK 0x0000000000001000UL
+#define UVH_RTC1_INT_CONFIG_P_SHFT 13
+#define UVH_RTC1_INT_CONFIG_P_MASK 0x0000000000002000UL
+#define UVH_RTC1_INT_CONFIG_T_SHFT 15
+#define UVH_RTC1_INT_CONFIG_T_MASK 0x0000000000008000UL
+#define UVH_RTC1_INT_CONFIG_M_SHFT 16
+#define UVH_RTC1_INT_CONFIG_M_MASK 0x0000000000010000UL
+#define UVH_RTC1_INT_CONFIG_APIC_ID_SHFT 32
+#define UVH_RTC1_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
-union uvh_si_alias0_overlay_config_u {
- unsigned long v;
- struct uvh_si_alias0_overlay_config_s {
- unsigned long rsvd_0_23: 24; /* */
- unsigned long base : 8; /* RW */
- unsigned long rsvd_32_47: 16; /* */
- unsigned long m_alias : 5; /* RW */
- unsigned long rsvd_53_62: 10; /* */
- unsigned long enable : 1; /* RW */
- } s;
+
+union uvh_rtc1_int_config_u {
+ unsigned long v;
+
+ /* UVH common struct */
+ struct uvh_rtc1_int_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } s;
+
+ /* UV5 unique struct */
+ struct uv5h_rtc1_int_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } s5;
+
+ /* UV4 unique struct */
+ struct uv4h_rtc1_int_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } s4;
+
+ /* UV3 unique struct */
+ struct uv3h_rtc1_int_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } s3;
+
+ /* UV2 unique struct */
+ struct uv2h_rtc1_int_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } s2;
};
/* ========================================================================= */
-/* UVH_SI_ALIAS1_OVERLAY_CONFIG */
+/* UVH_SCRATCH5 */
/* ========================================================================= */
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG 0xc80010UL
+#define UVH_SCRATCH5 ( \
+ is_uv(UV5) ? 0xb0200UL : \
+ is_uv(UV4) ? 0xb0200UL : \
+ is_uv(UV3) ? 0x2d0200UL : \
+ is_uv(UV2) ? 0x2d0200UL : \
+ 0)
+#define UV5H_SCRATCH5 0xb0200UL
+#define UV4H_SCRATCH5 0xb0200UL
+#define UV3H_SCRATCH5 0x2d0200UL
+#define UV2H_SCRATCH5 0x2d0200UL
+
+/* UVH common defines */
+#define UVH_SCRATCH5_SCRATCH5_SHFT 0
+#define UVH_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL
+
+/* UVXH common defines */
+#define UVXH_SCRATCH5_SCRATCH5_SHFT 0
+#define UVXH_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL
+
+/* UVYH common defines */
+#define UVYH_SCRATCH5_SCRATCH5_SHFT 0
+#define UVYH_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL
+
+/* UV5 unique defines */
+#define UV5H_SCRATCH5_SCRATCH5_SHFT 0
+#define UV5H_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL
+
+/* UV4 unique defines */
+#define UV4H_SCRATCH5_SCRATCH5_SHFT 0
+#define UV4H_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL
+
+/* UV3 unique defines */
+#define UV3H_SCRATCH5_SCRATCH5_SHFT 0
+#define UV3H_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG_BASE_SHFT 24
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG_M_ALIAS_SHFT 48
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG_ENABLE_SHFT 63
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
+/* UV2 unique defines */
+#define UV2H_SCRATCH5_SCRATCH5_SHFT 0
+#define UV2H_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL
-union uvh_si_alias1_overlay_config_u {
- unsigned long v;
- struct uvh_si_alias1_overlay_config_s {
- unsigned long rsvd_0_23: 24; /* */
- unsigned long base : 8; /* RW */
- unsigned long rsvd_32_47: 16; /* */
- unsigned long m_alias : 5; /* RW */
- unsigned long rsvd_53_62: 10; /* */
- unsigned long enable : 1; /* RW */
- } s;
+
+union uvh_scratch5_u {
+ unsigned long v;
+
+ /* UVH common struct */
+ struct uvh_scratch5_s {
+ unsigned long scratch5:64; /* RW */
+ } s;
+
+ /* UVXH common struct */
+ struct uvxh_scratch5_s {
+ unsigned long scratch5:64; /* RW */
+ } sx;
+
+ /* UVYH common struct */
+ struct uvyh_scratch5_s {
+ unsigned long scratch5:64; /* RW */
+ } sy;
+
+ /* UV5 unique struct */
+ struct uv5h_scratch5_s {
+ unsigned long scratch5:64; /* RW */
+ } s5;
+
+ /* UV4 unique struct */
+ struct uv4h_scratch5_s {
+ unsigned long scratch5:64; /* RW */
+ } s4;
+
+ /* UV3 unique struct */
+ struct uv3h_scratch5_s {
+ unsigned long scratch5:64; /* RW */
+ } s3;
+
+ /* UV2 unique struct */
+ struct uv2h_scratch5_s {
+ unsigned long scratch5:64; /* RW */
+ } s2;
};
/* ========================================================================= */
-/* UVH_SI_ALIAS2_OVERLAY_CONFIG */
+/* UVH_SCRATCH5_ALIAS */
/* ========================================================================= */
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG 0xc80018UL
+#define UVH_SCRATCH5_ALIAS ( \
+ is_uv(UV5) ? 0xb0208UL : \
+ is_uv(UV4) ? 0xb0208UL : \
+ is_uv(UV3) ? 0x2d0208UL : \
+ is_uv(UV2) ? 0x2d0208UL : \
+ 0)
+#define UV5H_SCRATCH5_ALIAS 0xb0208UL
+#define UV4H_SCRATCH5_ALIAS 0xb0208UL
+#define UV3H_SCRATCH5_ALIAS 0x2d0208UL
+#define UV2H_SCRATCH5_ALIAS 0x2d0208UL
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG_BASE_SHFT 24
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG_M_ALIAS_SHFT 48
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG_ENABLE_SHFT 63
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
-union uvh_si_alias2_overlay_config_u {
- unsigned long v;
- struct uvh_si_alias2_overlay_config_s {
- unsigned long rsvd_0_23: 24; /* */
- unsigned long base : 8; /* RW */
- unsigned long rsvd_32_47: 16; /* */
- unsigned long m_alias : 5; /* RW */
- unsigned long rsvd_53_62: 10; /* */
- unsigned long enable : 1; /* RW */
- } s;
-};
+/* ========================================================================= */
+/* UVH_SCRATCH5_ALIAS_2 */
+/* ========================================================================= */
+#define UVH_SCRATCH5_ALIAS_2 ( \
+ is_uv(UV5) ? 0xb0210UL : \
+ is_uv(UV4) ? 0xb0210UL : \
+ is_uv(UV3) ? 0x2d0210UL : \
+ is_uv(UV2) ? 0x2d0210UL : \
+ 0)
+#define UV5H_SCRATCH5_ALIAS_2 0xb0210UL
+#define UV4H_SCRATCH5_ALIAS_2 0xb0210UL
+#define UV3H_SCRATCH5_ALIAS_2 0x2d0210UL
+#define UV2H_SCRATCH5_ALIAS_2 0x2d0210UL
+
#endif /* _ASM_X86_UV_UV_MMRS_H */
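The UVH_RH_GAM_* selectors introduced above resolve register offsets, masks and shifts per UV generation at runtime. A minimal sketch of how a caller might combine them, assuming the uv_read_local_mmr() accessor from <asm/uv/uv_hub.h> (the helper name below is illustrative only):

#include <asm/uv/uv_hub.h>
#include <asm/uv/uv_mmrs.h>

/* Sketch: read the first MMIOH overlay window and return its physical base. */
static unsigned long uv_mmioh0_base(void)
{
	union uvh_rh_gam_mmioh_overlay_config0_u cfg;

	cfg.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0);
	if (!cfg.s.enable)
		return 0;

	/* The BASE_MASK bits hold the window's physical base address in place. */
	return cfg.v & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_BASE_MASK;
}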
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index 9064052b73de..b7253ef3205a 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -1,47 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_VDSO_H
#define _ASM_X86_VDSO_H
-#ifdef CONFIG_X86_64
-extern const char VDSO64_PRELINK[];
-
-/*
- * Given a pointer to the vDSO image, find the pointer to VDSO64_name
- * as that symbol is defined in the vDSO sources or linker script.
- */
-#define VDSO64_SYMBOL(base, name) \
-({ \
- extern const char VDSO64_##name[]; \
- (void *)(VDSO64_##name - VDSO64_PRELINK + (unsigned long)(base)); \
-})
-#endif
-
-#if defined CONFIG_X86_32 || defined CONFIG_COMPAT
-extern const char VDSO32_PRELINK[];
-
-/*
- * Given a pointer to the vDSO image, find the pointer to VDSO32_name
- * as that symbol is defined in the vDSO sources or linker script.
- */
-#define VDSO32_SYMBOL(base, name) \
-({ \
- extern const char VDSO32_##name[]; \
- (void *)(VDSO32_##name - VDSO32_PRELINK + (unsigned long)(base)); \
-})
-#endif
-
-/*
- * These symbols are defined with the addresses in the vsyscall page.
- * See vsyscall-sigreturn.S.
- */
-extern void __user __kernel_sigreturn;
-extern void __user __kernel_rt_sigreturn;
-
-/*
- * These symbols are defined by vdso32.S to mark the bounds
- * of the ELF DSO images included therein.
- */
-extern const char vdso32_int80_start, vdso32_int80_end;
-extern const char vdso32_syscall_start, vdso32_syscall_end;
-extern const char vdso32_sysenter_start, vdso32_sysenter_end;
+#include <asm/page_types.h>
+#include <linux/linkage.h>
+#include <linux/init.h>
+
+#ifndef __ASSEMBLER__
+
+#include <linux/mm_types.h>
+
+struct vdso_image {
+ void *data;
+ unsigned long size; /* Always a multiple of PAGE_SIZE */
+
+ unsigned long alt, alt_len;
+ unsigned long extable_base, extable_len;
+ const void *extable;
+
+ long sym_VDSO32_NOTE_MASK;
+ long sym___kernel_sigreturn;
+ long sym___kernel_rt_sigreturn;
+ long sym___kernel_vsyscall;
+ long sym_int80_landing_pad;
+ long sym_vdso32_sigreturn_landing_pad;
+ long sym_vdso32_rt_sigreturn_landing_pad;
+};
+
+extern const struct vdso_image vdso_image_64;
+extern const struct vdso_image vdso_image_x32;
+extern const struct vdso_image vdso_image_32;
+
+extern int __init init_vdso_image(const struct vdso_image *image);
+
+extern int map_vdso_once(const struct vdso_image *image, unsigned long addr);
+
+extern bool fixup_vdso_exception(struct pt_regs *regs, int trapnr,
+ unsigned long error_code,
+ unsigned long fault_addr);
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_X86_VDSO_H */
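With the prelink-based VDSO64_SYMBOL()/VDSO32_SYMBOL() lookups gone, symbol addresses come from the per-image sym_* byte offsets. A minimal sketch, where the mapping base is a stand-in for wherever the image was mapped (e.g. mm->context.vdso):

#include <asm/vdso.h>

/* Sketch: locate __kernel_vsyscall inside an already-mapped 32-bit vDSO. */
static void *vdso32_vsyscall_addr(unsigned long vdso_base)
{
	const struct vdso_image *image = &vdso_image_32;

	/* sym_* members are byte offsets from the start of the mapping. */
	return (void *)(vdso_base + image->sym___kernel_vsyscall);
}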
diff --git a/arch/x86/include/asm/vdso/clocksource.h b/arch/x86/include/asm/vdso/clocksource.h
new file mode 100644
index 000000000000..136e5e57cfe1
--- /dev/null
+++ b/arch/x86/include/asm/vdso/clocksource.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_VDSO_CLOCKSOURCE_H
+#define __ASM_VDSO_CLOCKSOURCE_H
+
+#define VDSO_ARCH_CLOCKMODES \
+ VDSO_CLOCKMODE_TSC, \
+ VDSO_CLOCKMODE_PVCLOCK, \
+ VDSO_CLOCKMODE_HVCLOCK
+
+#define HAVE_VDSO_CLOCKMODE_HVCLOCK
+
+#endif /* __ASM_VDSO_CLOCKSOURCE_H */
diff --git a/arch/x86/include/asm/vdso/getrandom.h b/arch/x86/include/asm/vdso/getrandom.h
new file mode 100644
index 000000000000..ff1c11b9fa27
--- /dev/null
+++ b/arch/x86/include/asm/vdso/getrandom.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+#ifndef __ASM_VDSO_GETRANDOM_H
+#define __ASM_VDSO_GETRANDOM_H
+
+#ifndef __ASSEMBLER__
+
+#include <asm/unistd.h>
+
+/**
+ * getrandom_syscall - Invoke the getrandom() syscall.
+ * @buffer: Destination buffer to fill with random bytes.
+ * @len: Size of @buffer in bytes.
+ * @flags: Zero or more GRND_* flags.
+ * Returns: The number of random bytes written to @buffer, or a negative value indicating an error.
+ */
+static __always_inline ssize_t getrandom_syscall(void *buffer, size_t len, unsigned int flags)
+{
+ long ret;
+
+ asm ("syscall" : "=a" (ret) :
+ "0" (__NR_getrandom), "D" (buffer), "S" (len), "d" (flags) :
+ "rcx", "r11", "memory");
+
+ return ret;
+}
+
+#endif /* !__ASSEMBLER__ */
+
+#endif /* __ASM_VDSO_GETRANDOM_H */
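getrandom_syscall() is the slow-path helper the vDSO getrandom code can fall back to. A small sketch of a caller that tolerates short returns (the fill_with_random() name is illustrative only):

/* Sketch: fill a buffer, retrying on short returns from the syscall. */
static int fill_with_random(unsigned char *buf, size_t len)
{
	while (len) {
		ssize_t ret = getrandom_syscall(buf, len, 0);

		if (ret < 0)
			return ret;	/* negative -errno straight from the syscall */
		buf += ret;
		len -= ret;
	}
	return 0;
}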
diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h
new file mode 100644
index 000000000000..73b2e7ee8f0f
--- /dev/null
+++ b/arch/x86/include/asm/vdso/gettimeofday.h
@@ -0,0 +1,336 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Fast user context implementation of clock_gettime, gettimeofday, and time.
+ *
+ * Copyright (C) 2019 ARM Limited.
+ * Copyright 2006 Andi Kleen, SUSE Labs.
+ * 32 Bit compat layer by Stefani Seibold <stefani@seibold.net>
+ * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany
+ */
+#ifndef __ASM_VDSO_GETTIMEOFDAY_H
+#define __ASM_VDSO_GETTIMEOFDAY_H
+
+#ifndef __ASSEMBLER__
+
+#include <uapi/linux/time.h>
+#include <asm/vgtod.h>
+#include <asm/unistd.h>
+#include <asm/msr.h>
+#include <asm/pvclock.h>
+#include <clocksource/hyperv_timer.h>
+
+#define VDSO_HAS_TIME 1
+
+#define VDSO_HAS_CLOCK_GETRES 1
+
+/*
+ * Declare the memory-mapped vclock data pages. These come from hypervisors.
+ * If we ever reintroduce something like direct access to an MMIO clock like
+ * the HPET again, it will go here as well.
+ *
+ * A load from any of these pages will segfault if the clock in question is
+ * disabled, so appropriate compiler barriers and checks need to be used
+ * to prevent stray loads.
+ *
+ * These declarations MUST NOT be const. The compiler will assume that
+ * an extern const variable has genuinely constant contents, and the
+ * resulting code won't work, since the whole point is that these pages
+ * change over time, possibly while we're accessing them.
+ */
+
+#ifdef CONFIG_PARAVIRT_CLOCK
+/*
+ * This is the vCPU 0 pvclock page. We only use pvclock from the vDSO
+ * if the hypervisor tells us that all vCPUs can get valid data from the
+ * vCPU 0 page.
+ */
+extern struct pvclock_vsyscall_time_info pvclock_page
+ __attribute__((visibility("hidden")));
+#endif
+
+#ifdef CONFIG_HYPERV_TIMER
+extern struct ms_hyperv_tsc_page hvclock_page
+ __attribute__((visibility("hidden")));
+#endif
+
+#ifndef BUILD_VDSO32
+
+static __always_inline
+long clock_gettime_fallback(clockid_t _clkid, struct __kernel_timespec *_ts)
+{
+ long ret;
+
+ asm ("syscall" : "=a" (ret), "=m" (*_ts) :
+ "0" (__NR_clock_gettime), "D" (_clkid), "S" (_ts) :
+ "rcx", "r11");
+
+ return ret;
+}
+
+static __always_inline
+long gettimeofday_fallback(struct __kernel_old_timeval *_tv,
+ struct timezone *_tz)
+{
+ long ret;
+
+ asm("syscall" : "=a" (ret) :
+ "0" (__NR_gettimeofday), "D" (_tv), "S" (_tz) : "memory");
+
+ return ret;
+}
+
+static __always_inline
+long clock_getres_fallback(clockid_t _clkid, struct __kernel_timespec *_ts)
+{
+ long ret;
+
+ asm ("syscall" : "=a" (ret), "=m" (*_ts) :
+ "0" (__NR_clock_getres), "D" (_clkid), "S" (_ts) :
+ "rcx", "r11");
+
+ return ret;
+}
+
+#else
+
+static __always_inline
+long clock_gettime_fallback(clockid_t _clkid, struct __kernel_timespec *_ts)
+{
+ long ret;
+
+ asm (
+ "mov %%ebx, %%edx \n"
+ "mov %[clock], %%ebx \n"
+ "call __kernel_vsyscall \n"
+ "mov %%edx, %%ebx \n"
+ : "=a" (ret), "=m" (*_ts)
+ : "0" (__NR_clock_gettime64), [clock] "g" (_clkid), "c" (_ts)
+ : "edx");
+
+ return ret;
+}
+
+static __always_inline
+long clock_gettime32_fallback(clockid_t _clkid, struct old_timespec32 *_ts)
+{
+ long ret;
+
+ asm (
+ "mov %%ebx, %%edx \n"
+ "mov %[clock], %%ebx \n"
+ "call __kernel_vsyscall \n"
+ "mov %%edx, %%ebx \n"
+ : "=a" (ret), "=m" (*_ts)
+ : "0" (__NR_clock_gettime), [clock] "g" (_clkid), "c" (_ts)
+ : "edx");
+
+ return ret;
+}
+
+static __always_inline
+long gettimeofday_fallback(struct __kernel_old_timeval *_tv,
+ struct timezone *_tz)
+{
+ long ret;
+
+ asm(
+ "mov %%ebx, %%edx \n"
+ "mov %2, %%ebx \n"
+ "call __kernel_vsyscall \n"
+ "mov %%edx, %%ebx \n"
+ : "=a" (ret)
+ : "0" (__NR_gettimeofday), "g" (_tv), "c" (_tz)
+ : "memory", "edx");
+
+ return ret;
+}
+
+static __always_inline long
+clock_getres_fallback(clockid_t _clkid, struct __kernel_timespec *_ts)
+{
+ long ret;
+
+ asm (
+ "mov %%ebx, %%edx \n"
+ "mov %[clock], %%ebx \n"
+ "call __kernel_vsyscall \n"
+ "mov %%edx, %%ebx \n"
+ : "=a" (ret), "=m" (*_ts)
+ : "0" (__NR_clock_getres_time64), [clock] "g" (_clkid), "c" (_ts)
+ : "edx");
+
+ return ret;
+}
+
+static __always_inline
+long clock_getres32_fallback(clockid_t _clkid, struct old_timespec32 *_ts)
+{
+ long ret;
+
+ asm (
+ "mov %%ebx, %%edx \n"
+ "mov %[clock], %%ebx \n"
+ "call __kernel_vsyscall \n"
+ "mov %%edx, %%ebx \n"
+ : "=a" (ret), "=m" (*_ts)
+ : "0" (__NR_clock_getres), [clock] "g" (_clkid), "c" (_ts)
+ : "edx");
+
+ return ret;
+}
+
+#endif
+
+#ifdef CONFIG_PARAVIRT_CLOCK
+static u64 vread_pvclock(void)
+{
+ const struct pvclock_vcpu_time_info *pvti = &pvclock_page.pvti;
+ u32 version;
+ u64 ret;
+
+ /*
+ * Note: The kernel and hypervisor must guarantee that cpu ID
+ * number maps 1:1 to per-CPU pvclock time info.
+ *
+ * Because the hypervisor is entirely unaware of guest userspace
+ * preemption, it cannot guarantee that per-CPU pvclock time
+ * info is updated if the underlying CPU changes or that the
+ * version is increased whenever the underlying CPU changes.
+ *
+ * On KVM, we are guaranteed that pvti updates for any vCPU are
+ * atomic as seen by *all* vCPUs. This is an even stronger
+ * guarantee than we get with a normal seqlock.
+ *
+ * On Xen, we don't appear to have that guarantee, but Xen still
+ * supplies a valid seqlock using the version field.
+ *
+ * We only do pvclock vdso timing at all if
+ * PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to
+ * mean that all vCPUs have matching pvti and that the TSC is
+ * synced, so we can just look at vCPU 0's pvti.
+ */
+
+ do {
+ version = pvclock_read_begin(pvti);
+
+ if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT)))
+ return U64_MAX;
+
+ ret = __pvclock_read_cycles(pvti, rdtsc_ordered());
+ } while (pvclock_read_retry(pvti, version));
+
+ return ret & S64_MAX;
+}
+#endif
+
+#ifdef CONFIG_HYPERV_TIMER
+static u64 vread_hvclock(void)
+{
+ u64 tsc, time;
+
+ if (hv_read_tsc_page_tsc(&hvclock_page, &tsc, &time))
+ return time & S64_MAX;
+
+ return U64_MAX;
+}
+#endif
+
+static inline u64 __arch_get_hw_counter(s32 clock_mode,
+ const struct vdso_time_data *vd)
+{
+ if (likely(clock_mode == VDSO_CLOCKMODE_TSC))
+ return (u64)rdtsc_ordered() & S64_MAX;
+ /*
+ * For any memory-mapped vclock type, we need to make sure that gcc
+ * doesn't cleverly hoist a load before the mode check. Otherwise we
+ * might end up touching the memory-mapped page even if the vclock in
+ * question isn't enabled, which will segfault. Hence the barriers.
+ */
+#ifdef CONFIG_PARAVIRT_CLOCK
+ if (clock_mode == VDSO_CLOCKMODE_PVCLOCK) {
+ barrier();
+ return vread_pvclock();
+ }
+#endif
+#ifdef CONFIG_HYPERV_TIMER
+ if (clock_mode == VDSO_CLOCKMODE_HVCLOCK) {
+ barrier();
+ return vread_hvclock();
+ }
+#endif
+ return U64_MAX;
+}
+
+static inline bool arch_vdso_clocksource_ok(const struct vdso_clock *vc)
+{
+ return true;
+}
+#define vdso_clocksource_ok arch_vdso_clocksource_ok
+
+/*
+ * Clocksource read value validation to handle PV and HyperV clocksources
+ * which can be invalidated asynchronously and indicate invalidation by
+ * returning U64_MAX, which can be effectively tested by checking for a
+ * negative value after casting it to s64.
+ *
+ * This effectively forces a S64_MAX mask on the calculations, unlike the
+ * U64_MAX mask normally used by x86 clocksources.
+ */
+static inline bool arch_vdso_cycles_ok(u64 cycles)
+{
+ return (s64)cycles >= 0;
+}
+#define vdso_cycles_ok arch_vdso_cycles_ok
+
+/*
+ * x86 specific calculation of nanoseconds for the current cycle count
+ *
+ * The regular implementation assumes that clocksource reads are globally
+ * monotonic. The TSC can be slightly off across sockets which can cause
+ * the regular delta calculation (@cycles - @last) to return a huge time
+ * jump.
+ *
+ * Therefore it needs to be verified that @cycles is greater than
+ * @vc->cycle_last. If not then use @vc->cycle_last, which is the base
+ * time of the current conversion period.
+ *
+ * This variant also uses a custom mask because while the clocksource mask of
+ * all the VDSO capable clocksources on x86 is U64_MAX, the above code uses
+ * U64_MAX as an exception value; additionally, arch_vdso_cycles_ok() above
+ * declares everything with the MSB/Sign-bit set as invalid. Therefore the
+ * effective mask is S64_MAX.
+ */
+static __always_inline u64 vdso_calc_ns(const struct vdso_clock *vc, u64 cycles, u64 base)
+{
+ u64 delta = cycles - vc->cycle_last;
+
+ /*
+ * Negative motion and deltas which can cause multiplication
+ * overflow require special treatment. This check covers both as
+ * negative motion is guaranteed to be greater than @vc::max_cycles
+ * due to unsigned comparison.
+ *
+ * Due to the MSB/Sign-bit being used as invalid marker (see
+ * arch_vdso_cycles_ok() above), the effective mask is S64_MAX, but that
+ * case is also unlikely and will also take the unlikely path here.
+ */
+ if (unlikely(delta > vc->max_cycles)) {
+ /*
+ * Due to the above mentioned TSC wobbles, filter out
+ * negative motion. Per the above masking, the effective
+ * sign bit is now bit 62.
+ */
+ if (delta & (1ULL << 62))
+ return base >> vc->shift;
+
+ /* Handle multiplication overflow gracefully */
+ return mul_u64_u32_add_u64_shr(delta & S64_MAX, vc->mult, base, vc->shift);
+ }
+
+ return ((delta * vc->mult) + base) >> vc->shift;
+}
+#define vdso_calc_ns vdso_calc_ns
+
+#endif /* !__ASSEMBLER__ */
+
+#endif /* __ASM_VDSO_GETTIMEOFDAY_H */
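The fast path of vdso_calc_ns() is plain fixed-point arithmetic; a worked example with made-up values:

/*
 * Worked example (illustrative values only): with vc->mult = 5,
 * vc->shift = 1, vc->cycle_last = 1000 and a pre-scaled base of 20,
 * a readout of cycles = 1010 gives
 *
 *	delta = 1010 - 1000 = 10
 *	ns    = (10 * 5 + 20) >> 1 = 35
 *
 * A readout below cycle_last (cross-socket TSC wobble) makes the unsigned
 * delta wrap huge, trips the delta > vc->max_cycles check and, with bit 62
 * set, returns base >> vc->shift instead of a negative time.
 */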
diff --git a/arch/x86/include/asm/vdso/processor.h b/arch/x86/include/asm/vdso/processor.h
new file mode 100644
index 000000000000..7000aeb59aa2
--- /dev/null
+++ b/arch/x86/include/asm/vdso/processor.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2020 ARM Ltd.
+ */
+#ifndef __ASM_VDSO_PROCESSOR_H
+#define __ASM_VDSO_PROCESSOR_H
+
+#ifndef __ASSEMBLER__
+
+/* PAUSE is a good thing to insert into busy-wait loops. */
+static __always_inline void native_pause(void)
+{
+ asm volatile("pause" ::: "memory");
+}
+
+static __always_inline void cpu_relax(void)
+{
+ native_pause();
+}
+
+struct getcpu_cache;
+
+notrace long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused);
+
+#endif /* __ASSEMBLER__ */
+
+#endif /* __ASM_VDSO_PROCESSOR_H */
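cpu_relax() is meant to sit in the body of a polling loop; a minimal sketch (the flag being polled is a placeholder):

/* Sketch: spin until another CPU sets *flag, pausing between reads. */
static inline void wait_for_flag(volatile int *flag)
{
	while (!*flag)
		cpu_relax();	/* PAUSE: be polite to the SMT sibling */
}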
diff --git a/arch/x86/include/asm/vdso/vsyscall.h b/arch/x86/include/asm/vdso/vsyscall.h
new file mode 100644
index 000000000000..4aa311a923f2
--- /dev/null
+++ b/arch/x86/include/asm/vdso/vsyscall.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_VDSO_VSYSCALL_H
+#define __ASM_VDSO_VSYSCALL_H
+
+#define __VDSO_PAGES 6
+
+#define VDSO_NR_VCLOCK_PAGES 2
+#define VDSO_VCLOCK_PAGES_START(_b) ((_b) + (__VDSO_PAGES - VDSO_NR_VCLOCK_PAGES) * PAGE_SIZE)
+#define VDSO_PAGE_PVCLOCK_OFFSET 0
+#define VDSO_PAGE_HVCLOCK_OFFSET 1
+
+#ifndef __ASSEMBLER__
+
+#include <vdso/datapage.h>
+#include <asm/vgtod.h>
+
+/* The asm-generic header needs to be included after the definitions above */
+#include <asm-generic/vdso/vsyscall.h>
+
+#endif /* !__ASSEMBLER__ */
+
+#endif /* __ASM_VDSO_VSYSCALL_H */
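The page-layout macros above fix where the vclock pages land inside the vDSO data mapping; a small worked example:

/*
 * Worked example (illustrative): with __VDSO_PAGES = 6 and
 * VDSO_NR_VCLOCK_PAGES = 2, VDSO_VCLOCK_PAGES_START(base) is
 * base + 4 * PAGE_SIZE; VDSO_PAGE_PVCLOCK_OFFSET (0) and
 * VDSO_PAGE_HVCLOCK_OFFSET (1) index pages inside that two-page range.
 */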
diff --git a/arch/x86/include/asm/vermagic.h b/arch/x86/include/asm/vermagic.h
new file mode 100644
index 000000000000..5d471253c755
--- /dev/null
+++ b/arch/x86/include/asm/vermagic.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _ASM_VERMAGIC_H
+#define _ASM_VERMAGIC_H
+
+#ifdef CONFIG_X86_64
+/* X86_64 does not define MODULE_PROC_FAMILY */
+#elif defined CONFIG_M486SX
+#define MODULE_PROC_FAMILY "486SX "
+#elif defined CONFIG_M486
+#define MODULE_PROC_FAMILY "486 "
+#elif defined CONFIG_M586
+#define MODULE_PROC_FAMILY "586 "
+#elif defined CONFIG_M586TSC
+#define MODULE_PROC_FAMILY "586TSC "
+#elif defined CONFIG_M586MMX
+#define MODULE_PROC_FAMILY "586MMX "
+#elif defined CONFIG_MATOM
+#define MODULE_PROC_FAMILY "ATOM "
+#elif defined CONFIG_M686
+#define MODULE_PROC_FAMILY "686 "
+#elif defined CONFIG_MPENTIUMII
+#define MODULE_PROC_FAMILY "PENTIUMII "
+#elif defined CONFIG_MPENTIUMIII
+#define MODULE_PROC_FAMILY "PENTIUMIII "
+#elif defined CONFIG_MPENTIUMM
+#define MODULE_PROC_FAMILY "PENTIUMM "
+#elif defined CONFIG_MPENTIUM4
+#define MODULE_PROC_FAMILY "PENTIUM4 "
+#elif defined CONFIG_MK6
+#define MODULE_PROC_FAMILY "K6 "
+#elif defined CONFIG_MK7
+#define MODULE_PROC_FAMILY "K7 "
+#elif defined CONFIG_MELAN
+#define MODULE_PROC_FAMILY "ELAN "
+#elif defined CONFIG_MCRUSOE
+#define MODULE_PROC_FAMILY "CRUSOE "
+#elif defined CONFIG_MEFFICEON
+#define MODULE_PROC_FAMILY "EFFICEON "
+#elif defined CONFIG_MWINCHIPC6
+#define MODULE_PROC_FAMILY "WINCHIPC6 "
+#elif defined CONFIG_MWINCHIP3D
+#define MODULE_PROC_FAMILY "WINCHIP3D "
+#elif defined CONFIG_MCYRIXIII
+#define MODULE_PROC_FAMILY "CYRIXIII "
+#elif defined CONFIG_MVIAC3_2
+#define MODULE_PROC_FAMILY "VIAC3-2 "
+#elif defined CONFIG_MVIAC7
+#define MODULE_PROC_FAMILY "VIAC7 "
+#elif defined CONFIG_MGEODEGX1
+#define MODULE_PROC_FAMILY "GEODEGX1 "
+#elif defined CONFIG_MGEODE_LX
+#define MODULE_PROC_FAMILY "GEODE "
+#else
+#error unknown processor family
+#endif
+
+#ifdef CONFIG_X86_32
+# define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY
+#else
+# define MODULE_ARCH_VERMAGIC ""
+#endif
+
+#endif /* _ASM_VERMAGIC_H */
diff --git a/arch/x86/include/asm/vga.h b/arch/x86/include/asm/vga.h
index c4b9dc2f67c5..46f9b2deab4d 100644
--- a/arch/x86/include/asm/vga.h
+++ b/arch/x86/include/asm/vga.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Access to VGA videoram
*
@@ -7,12 +8,24 @@
#ifndef _ASM_X86_VGA_H
#define _ASM_X86_VGA_H
+#include <asm/set_memory.h>
+
/*
* On the PC, we can just recalculate addresses and then
* access the videoram directly without any black magic.
+ * To support memory encryption, however, we need to access
+ * the videoram as decrypted memory.
*/
-#define VGA_MAP_MEM(x, s) (unsigned long)phys_to_virt(x)
+#define VGA_MAP_MEM(x, s) \
+({ \
+ unsigned long start = (unsigned long)phys_to_virt(x); \
+ \
+ if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) \
+ set_memory_decrypted(start, (s) >> PAGE_SHIFT); \
+ \
+ start; \
+})
#define vga_readb(x) (*(x))
#define vga_writeb(x, y) (*(y) = (x))
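With SME in play, VGA_MAP_MEM() now marks the range decrypted before handing back a virtual address. A minimal sketch of a caller, using the usual legacy text-buffer address and size purely for illustration:

#include <asm/vga.h>

/* Sketch: map the legacy VGA text buffer and write one byte into it. */
static void vga_poke_first_byte(void)
{
	unsigned long base = VGA_MAP_MEM(0xb8000, 0x8000);

	vga_writeb('A', (unsigned char *)base);	/* vga_writeb(value, pointer) */
}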
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index dc27a69e5d2a..a0ce291abcae 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -1,29 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_VGTOD_H
#define _ASM_X86_VGTOD_H
-#include <asm/vsyscall.h>
-#include <linux/clocksource.h>
+/*
+ * This check is required to prevent ARCH=um builds from including
+ * unwanted headers.
+ */
+#ifdef CONFIG_GENERIC_GETTIMEOFDAY
+#include <linux/compiler.h>
+#include <asm/clocksource.h>
+#include <vdso/datapage.h>
+#include <vdso/helpers.h>
-struct vsyscall_gtod_data {
- seqlock_t lock;
+#include <uapi/linux/time.h>
- /* open coded 'struct timespec' */
- time_t wall_time_sec;
- u32 wall_time_nsec;
-
- int sysctl_enabled;
- struct timezone sys_tz;
- struct { /* extract of a clocksource struct */
- cycle_t (*vread)(void);
- cycle_t cycle_last;
- cycle_t mask;
- u32 mult;
- u32 shift;
- } clock;
- struct timespec wall_to_monotonic;
-};
-extern struct vsyscall_gtod_data __vsyscall_gtod_data
-__section_vsyscall_gtod_data;
-extern struct vsyscall_gtod_data vsyscall_gtod_data;
+#endif /* CONFIG_GENERIC_GETTIMEOFDAY */
#endif /* _ASM_X86_VGTOD_H */
diff --git a/arch/x86/include/asm/video.h b/arch/x86/include/asm/video.h
new file mode 100644
index 000000000000..08ec328203ef
--- /dev/null
+++ b/arch/x86/include/asm/video.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_VIDEO_H
+#define _ASM_X86_VIDEO_H
+
+#include <linux/types.h>
+
+#include <asm/page.h>
+
+struct device;
+
+pgprot_t pgprot_framebuffer(pgprot_t prot,
+ unsigned long vm_start, unsigned long vm_end,
+ unsigned long offset);
+#define pgprot_framebuffer pgprot_framebuffer
+
+#ifdef CONFIG_VIDEO
+bool video_is_primary_device(struct device *dev);
+#define video_is_primary_device video_is_primary_device
+#endif
+
+#include <asm-generic/video.h>
+
+#endif /* _ASM_X86_VIDEO_H */
diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
deleted file mode 100644
index e0f9aa16358b..000000000000
--- a/arch/x86/include/asm/virtext.h
+++ /dev/null
@@ -1,132 +0,0 @@
-/* CPU virtualization extensions handling
- *
- * This should carry the code for handling CPU virtualization extensions
- * that needs to live in the kernel core.
- *
- * Author: Eduardo Habkost <ehabkost@redhat.com>
- *
- * Copyright (C) 2008, Red Hat Inc.
- *
- * Contains code from KVM, Copyright (C) 2006 Qumranet, Inc.
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- */
-#ifndef _ASM_X86_VIRTEX_H
-#define _ASM_X86_VIRTEX_H
-
-#include <asm/processor.h>
-#include <asm/system.h>
-
-#include <asm/vmx.h>
-#include <asm/svm.h>
-
-/*
- * VMX functions:
- */
-
-static inline int cpu_has_vmx(void)
-{
- unsigned long ecx = cpuid_ecx(1);
- return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
-}
-
-
-/** Disable VMX on the current CPU
- *
- * vmxoff causes a undefined-opcode exception if vmxon was not run
- * on the CPU previously. Only call this function if you know VMX
- * is enabled.
- */
-static inline void cpu_vmxoff(void)
-{
- asm volatile (ASM_VMX_VMXOFF : : : "cc");
- write_cr4(read_cr4() & ~X86_CR4_VMXE);
-}
-
-static inline int cpu_vmx_enabled(void)
-{
- return read_cr4() & X86_CR4_VMXE;
-}
-
-/** Disable VMX if it is enabled on the current CPU
- *
- * You shouldn't call this if cpu_has_vmx() returns 0.
- */
-static inline void __cpu_emergency_vmxoff(void)
-{
- if (cpu_vmx_enabled())
- cpu_vmxoff();
-}
-
-/** Disable VMX if it is supported and enabled on the current CPU
- */
-static inline void cpu_emergency_vmxoff(void)
-{
- if (cpu_has_vmx())
- __cpu_emergency_vmxoff();
-}
-
-
-
-
-/*
- * SVM functions:
- */
-
-/** Check if the CPU has SVM support
- *
- * You can use the 'msg' arg to get a message describing the problem,
- * if the function returns zero. Simply pass NULL if you are not interested
- * on the messages; gcc should take care of not generating code for
- * the messages on this case.
- */
-static inline int cpu_has_svm(const char **msg)
-{
- uint32_t eax, ebx, ecx, edx;
-
- if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
- if (msg)
- *msg = "not amd";
- return 0;
- }
-
- cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
- if (eax < SVM_CPUID_FUNC) {
- if (msg)
- *msg = "can't execute cpuid_8000000a";
- return 0;
- }
-
- cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
- if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) {
- if (msg)
- *msg = "svm not available";
- return 0;
- }
- return 1;
-}
-
-
-/** Disable SVM on the current CPU
- *
- * You should call this only if cpu_has_svm() returned true.
- */
-static inline void cpu_svm_disable(void)
-{
- uint64_t efer;
-
- wrmsrl(MSR_VM_HSAVE_PA, 0);
- rdmsrl(MSR_EFER, efer);
- wrmsrl(MSR_EFER, efer & ~EFER_SVME);
-}
-
-/** Makes sure SVM is disabled, if it is supported on the CPU
- */
-static inline void cpu_emergency_svm_disable(void)
-{
- if (cpu_has_svm(NULL))
- cpu_svm_disable();
-}
-
-#endif /* _ASM_X86_VIRTEX_H */
diff --git a/arch/x86/include/asm/visws/cobalt.h b/arch/x86/include/asm/visws/cobalt.h
deleted file mode 100644
index 166adf61e770..000000000000
--- a/arch/x86/include/asm/visws/cobalt.h
+++ /dev/null
@@ -1,125 +0,0 @@
-#ifndef _ASM_X86_VISWS_COBALT_H
-#define _ASM_X86_VISWS_COBALT_H
-
-#include <asm/fixmap.h>
-
-/*
- * Cobalt SGI Visual Workstation system ASIC
- */
-
-#define CO_CPU_NUM_PHYS 0x1e00
-#define CO_CPU_TAB_PHYS (CO_CPU_NUM_PHYS + 2)
-
-#define CO_CPU_MAX 4
-
-#define CO_CPU_PHYS 0xc2000000
-#define CO_APIC_PHYS 0xc4000000
-
-/* see set_fixmap() and asm/fixmap.h */
-#define CO_CPU_VADDR (fix_to_virt(FIX_CO_CPU))
-#define CO_APIC_VADDR (fix_to_virt(FIX_CO_APIC))
-
-/* Cobalt CPU registers -- relative to CO_CPU_VADDR, use co_cpu_*() */
-#define CO_CPU_REV 0x08
-#define CO_CPU_CTRL 0x10
-#define CO_CPU_STAT 0x20
-#define CO_CPU_TIMEVAL 0x30
-
-/* CO_CPU_CTRL bits */
-#define CO_CTRL_TIMERUN 0x04 /* 0 == disabled */
-#define CO_CTRL_TIMEMASK 0x08 /* 0 == unmasked */
-
-/* CO_CPU_STATUS bits */
-#define CO_STAT_TIMEINTR 0x02 /* (r) 1 == int pend, (w) 0 == clear */
-
-/* CO_CPU_TIMEVAL value */
-#define CO_TIME_HZ 100000000 /* Cobalt core rate */
-
-/* Cobalt APIC registers -- relative to CO_APIC_VADDR, use co_apic_*() */
-#define CO_APIC_HI(n) (((n) * 0x10) + 4)
-#define CO_APIC_LO(n) ((n) * 0x10)
-#define CO_APIC_ID 0x0ffc
-
-/* CO_APIC_ID bits */
-#define CO_APIC_ENABLE 0x00000100
-
-/* CO_APIC_LO bits */
-#define CO_APIC_MASK 0x00010000 /* 0 = enabled */
-#define CO_APIC_LEVEL 0x00008000 /* 0 = edge */
-
-/*
- * Where things are physically wired to Cobalt
- * #defines with no board _<type>_<rev>_ are common to all (thus far)
- */
-#define CO_APIC_IDE0 4
-#define CO_APIC_IDE1 2 /* Only on 320 */
-
-#define CO_APIC_8259 12 /* serial, floppy, par-l-l */
-
-/* Lithium PCI Bridge A -- "the one with 82557 Ethernet" */
-#define CO_APIC_PCIA_BASE0 0 /* and 1 */ /* slot 0, line 0 */
-#define CO_APIC_PCIA_BASE123 5 /* and 6 */ /* slot 0, line 1 */
-
-#define CO_APIC_PIIX4_USB 7 /* this one is weird */
-
-/* Lithium PCI Bridge B -- "the one with PIIX4" */
-#define CO_APIC_PCIB_BASE0 8 /* and 9-12 *//* slot 0, line 0 */
-#define CO_APIC_PCIB_BASE123 13 /* 14.15 */ /* slot 0, line 1 */
-
-#define CO_APIC_VIDOUT0 16
-#define CO_APIC_VIDOUT1 17
-#define CO_APIC_VIDIN0 18
-#define CO_APIC_VIDIN1 19
-
-#define CO_APIC_LI_AUDIO 22
-
-#define CO_APIC_AS 24
-#define CO_APIC_RE 25
-
-#define CO_APIC_CPU 28 /* Timer and Cache interrupt */
-#define CO_APIC_NMI 29
-#define CO_APIC_LAST CO_APIC_NMI
-
-/*
- * This is how irqs are assigned on the Visual Workstation.
- * Legacy devices get irq's 1-15 (system clock is 0 and is CO_APIC_CPU).
- * All other devices (including PCI) go to Cobalt and are irq's 16 on up.
- */
-#define CO_IRQ_APIC0 16 /* irq of apic entry 0 */
-#define IS_CO_APIC(irq) ((irq) >= CO_IRQ_APIC0)
-#define CO_IRQ(apic) (CO_IRQ_APIC0 + (apic)) /* apic ent to irq */
-#define CO_APIC(irq) ((irq) - CO_IRQ_APIC0) /* irq to apic ent */
-#define CO_IRQ_IDE0 14 /* knowledge of... */
-#define CO_IRQ_IDE1 15 /* ... ide driver defaults! */
-#define CO_IRQ_8259 CO_IRQ(CO_APIC_8259)
-
-#ifdef CONFIG_X86_VISWS_APIC
-static inline void co_cpu_write(unsigned long reg, unsigned long v)
-{
- *((volatile unsigned long *)(CO_CPU_VADDR+reg))=v;
-}
-
-static inline unsigned long co_cpu_read(unsigned long reg)
-{
- return *((volatile unsigned long *)(CO_CPU_VADDR+reg));
-}
-
-static inline void co_apic_write(unsigned long reg, unsigned long v)
-{
- *((volatile unsigned long *)(CO_APIC_VADDR+reg))=v;
-}
-
-static inline unsigned long co_apic_read(unsigned long reg)
-{
- return *((volatile unsigned long *)(CO_APIC_VADDR+reg));
-}
-#endif
-
-extern char visws_board_type;
-
-#define VISWS_320 0
-#define VISWS_540 1
-
-extern char visws_board_rev;
-
-#endif /* _ASM_X86_VISWS_COBALT_H */
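The cobalt.h removal above drops the Visual Workstation's APIC-entry/IRQ translation macros. As a hedged illustration of how they related, here is a minimal standalone sketch; it is not part of the patch, and the constants are copied from the deleted lines:

#include <stdio.h>

#define CO_IRQ_APIC0	16			/* irq of apic entry 0 */
#define CO_IRQ(apic)	(CO_IRQ_APIC0 + (apic))	/* apic entry to irq */
#define CO_APIC(irq)	((irq) - CO_IRQ_APIC0)	/* irq to apic entry */
#define IS_CO_APIC(irq)	((irq) >= CO_IRQ_APIC0)

int main(void)
{
	int irq = CO_IRQ(18);	/* CO_APIC_VIDIN0 (entry 18) -> irq 34 */

	printf("apic entry 18 -> irq %d, back to entry %d, routed via Cobalt: %d\n",
	       irq, CO_APIC(irq), IS_CO_APIC(irq));
	return 0;
}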
diff --git a/arch/x86/include/asm/visws/lithium.h b/arch/x86/include/asm/visws/lithium.h
deleted file mode 100644
index a10d89bc1270..000000000000
--- a/arch/x86/include/asm/visws/lithium.h
+++ /dev/null
@@ -1,53 +0,0 @@
-#ifndef _ASM_X86_VISWS_LITHIUM_H
-#define _ASM_X86_VISWS_LITHIUM_H
-
-#include <asm/fixmap.h>
-
-/*
- * Lithium is the SGI Visual Workstation I/O ASIC
- */
-
-#define LI_PCI_A_PHYS 0xfc000000 /* Enet is dev 3 */
-#define LI_PCI_B_PHYS 0xfd000000 /* PIIX4 is here */
-
-/* see set_fixmap() and asm/fixmap.h */
-#define LI_PCIA_VADDR (fix_to_virt(FIX_LI_PCIA))
-#define LI_PCIB_VADDR (fix_to_virt(FIX_LI_PCIB))
-
-/* Not a standard PCI? (not in linux/pci.h) */
-#define LI_PCI_BUSNUM 0x44 /* lo8: primary, hi8: sub */
-#define LI_PCI_INTEN 0x46
-
-/* LI_PCI_INTENT bits */
-#define LI_INTA_0 0x0001
-#define LI_INTA_1 0x0002
-#define LI_INTA_2 0x0004
-#define LI_INTA_3 0x0008
-#define LI_INTA_4 0x0010
-#define LI_INTB 0x0020
-#define LI_INTC 0x0040
-#define LI_INTD 0x0080
-
-/* More special purpose macros... */
-static inline void li_pcia_write16(unsigned long reg, unsigned short v)
-{
- *((volatile unsigned short *)(LI_PCIA_VADDR+reg))=v;
-}
-
-static inline unsigned short li_pcia_read16(unsigned long reg)
-{
- return *((volatile unsigned short *)(LI_PCIA_VADDR+reg));
-}
-
-static inline void li_pcib_write16(unsigned long reg, unsigned short v)
-{
- *((volatile unsigned short *)(LI_PCIB_VADDR+reg))=v;
-}
-
-static inline unsigned short li_pcib_read16(unsigned long reg)
-{
- return *((volatile unsigned short *)(LI_PCIB_VADDR+reg));
-}
-
-#endif /* _ASM_X86_VISWS_LITHIUM_H */
-
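The lithium.h removal above also drops the 16-bit bridge accessors. A hedged sketch of how the LI_PCI_BUSNUM register would be decoded with them, following the "lo8: primary, hi8: sub" note in the deleted lines (illustrative only; the helper name is an assumption):

static inline void li_pcia_bus_numbers(unsigned int *primary, unsigned int *subordinate)
{
	/* lo8: primary bus number, hi8: subordinate bus number */
	unsigned short busnum = li_pcia_read16(LI_PCI_BUSNUM);

	*primary = busnum & 0xff;
	*subordinate = busnum >> 8;
}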
diff --git a/arch/x86/include/asm/visws/piix4.h b/arch/x86/include/asm/visws/piix4.h
deleted file mode 100644
index d0af4d338e7f..000000000000
--- a/arch/x86/include/asm/visws/piix4.h
+++ /dev/null
@@ -1,107 +0,0 @@
-#ifndef _ASM_X86_VISWS_PIIX4_H
-#define _ASM_X86_VISWS_PIIX4_H
-
-/*
- * PIIX4 as used on SGI Visual Workstations
- */
-
-#define PIIX_PM_START 0x0F80
-
-#define SIO_GPIO_START 0x0FC0
-
-#define SIO_PM_START 0x0FC8
-
-#define PMBASE PIIX_PM_START
-#define GPIREG0 (PMBASE+0x30)
-#define GPIREG(x) (GPIREG0+((x)/8))
-#define GPIBIT(x) (1 << ((x)%8))
-
-#define PIIX_GPI_BD_ID1 18
-#define PIIX_GPI_BD_ID2 19
-#define PIIX_GPI_BD_ID3 20
-#define PIIX_GPI_BD_ID4 21
-#define PIIX_GPI_BD_REG GPIREG(PIIX_GPI_BD_ID1)
-#define PIIX_GPI_BD_MASK (GPIBIT(PIIX_GPI_BD_ID1) | \
- GPIBIT(PIIX_GPI_BD_ID2) | \
- GPIBIT(PIIX_GPI_BD_ID3) | \
- GPIBIT(PIIX_GPI_BD_ID4) )
-
-#define PIIX_GPI_BD_SHIFT (PIIX_GPI_BD_ID1 % 8)
-
-#define SIO_INDEX 0x2e
-#define SIO_DATA 0x2f
-
-#define SIO_DEV_SEL 0x7
-#define SIO_DEV_ENB 0x30
-#define SIO_DEV_MSB 0x60
-#define SIO_DEV_LSB 0x61
-
-#define SIO_GP_DEV 0x7
-
-#define SIO_GP_BASE SIO_GPIO_START
-#define SIO_GP_MSB (SIO_GP_BASE>>8)
-#define SIO_GP_LSB (SIO_GP_BASE&0xff)
-
-#define SIO_GP_DATA1 (SIO_GP_BASE+0)
-
-#define SIO_PM_DEV 0x8
-
-#define SIO_PM_BASE SIO_PM_START
-#define SIO_PM_MSB (SIO_PM_BASE>>8)
-#define SIO_PM_LSB (SIO_PM_BASE&0xff)
-#define SIO_PM_INDEX (SIO_PM_BASE+0)
-#define SIO_PM_DATA (SIO_PM_BASE+1)
-
-#define SIO_PM_FER2 0x1
-
-#define SIO_PM_GP_EN 0x80
-
-
-
-/*
- * This is the dev/reg where generating a config cycle will
- * result in a PCI special cycle.
- */
-#define SPECIAL_DEV 0xff
-#define SPECIAL_REG 0x00
-
-/*
- * PIIX4 needs to see a special cycle with the following data
- * to be convinced the processor has gone into the stop grant
- * state. PIIX4 insists on seeing this before it will power
- * down a system.
- */
-#define PIIX_SPECIAL_STOP 0x00120002
-
-#define PIIX4_RESET_PORT 0xcf9
-#define PIIX4_RESET_VAL 0x6
-
-#define PMSTS_PORT 0xf80 // 2 bytes PM Status
-#define PMEN_PORT 0xf82 // 2 bytes PM Enable
-#define PMCNTRL_PORT 0xf84 // 2 bytes PM Control
-
-#define PM_SUSPEND_ENABLE 0x2000 // start sequence to suspend state
-
-/*
- * PMSTS and PMEN I/O bit definitions.
- * (Bits are the same in both registers)
- */
-#define PM_STS_RSM (1<<15) // Resume Status
-#define PM_STS_PWRBTNOR (1<<11) // Power Button Override
-#define PM_STS_RTC (1<<10) // RTC status
-#define PM_STS_PWRBTN (1<<8) // Power Button Pressed?
-#define PM_STS_GBL (1<<5) // Global Status
-#define PM_STS_BM (1<<4) // Bus Master Status
-#define PM_STS_TMROF (1<<0) // Timer Overflow Status.
-
-/*
- * Stop clock GPI register
- */
-#define PIIX_GPIREG0 (0xf80 + 0x30)
-
-/*
- * Stop clock GPI bit in GPIREG0
- */
-#define PIIX_GPI_STPCLK 0x4 // STPCLK signal routed back in
-
-#endif /* _ASM_X86_VISWS_PIIX4_H */
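The piix4.h removal drops the board-ID GPI plumbing. A hedged sketch of how PIIX_GPI_BD_REG, PIIX_GPI_BD_MASK and PIIX_GPI_BD_SHIFT combine to yield the 4-bit board identifier (illustrative only; the helper name and the plain inb() accessor are assumptions):

static inline unsigned int visws_read_board_id(void)
{
	/* GPIREG(PIIX_GPI_BD_ID1): GPI bits 18..21 land in bits 2..5 of this port */
	unsigned int gpi = inb(PIIX_GPI_BD_REG);

	return (gpi & PIIX_GPI_BD_MASK) >> PIIX_GPI_BD_SHIFT;
}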
diff --git a/arch/x86/include/asm/visws/sgivw.h b/arch/x86/include/asm/visws/sgivw.h
deleted file mode 100644
index 5fbf63e1003c..000000000000
--- a/arch/x86/include/asm/visws/sgivw.h
+++ /dev/null
@@ -1,5 +0,0 @@
-/*
- * Frame buffer position and size:
- */
-extern unsigned long sgivwfb_mem_phys;
-extern unsigned long sgivwfb_mem_size;
diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h
index f9303602fbc0..62ee19909903 100644
--- a/arch/x86/include/asm/vm86.h
+++ b/arch/x86/include/asm/vm86.h
@@ -1,133 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_VM86_H
#define _ASM_X86_VM86_H
-/*
- * I'm guessing at the VIF/VIP flag usage, but hope that this is how
- * the Pentium uses them. Linux will return from vm86 mode when both
- * VIF and VIP is set.
- *
- * On a Pentium, we could probably optimize the virtual flags directly
- * in the eflags register instead of doing it "by hand" in vflags...
- *
- * Linus
- */
-
-#include <asm/processor-flags.h>
-
-#define BIOSSEG 0x0f000
-
-#define CPU_086 0
-#define CPU_186 1
-#define CPU_286 2
-#define CPU_386 3
-#define CPU_486 4
-#define CPU_586 5
-
-/*
- * Return values for the 'vm86()' system call
- */
-#define VM86_TYPE(retval) ((retval) & 0xff)
-#define VM86_ARG(retval) ((retval) >> 8)
-
-#define VM86_SIGNAL 0 /* return due to signal */
-#define VM86_UNKNOWN 1 /* unhandled GP fault
- - IO-instruction or similar */
-#define VM86_INTx 2 /* int3/int x instruction (ARG = x) */
-#define VM86_STI 3 /* sti/popf/iret instruction enabled
- virtual interrupts */
-
-/*
- * Additional return values when invoking new vm86()
- */
-#define VM86_PICRETURN 4 /* return due to pending PIC request */
-#define VM86_TRAP 6 /* return due to DOS-debugger request */
-
-/*
- * function codes when invoking new vm86()
- */
-#define VM86_PLUS_INSTALL_CHECK 0
-#define VM86_ENTER 1
-#define VM86_ENTER_NO_BYPASS 2
-#define VM86_REQUEST_IRQ 3
-#define VM86_FREE_IRQ 4
-#define VM86_GET_IRQ_BITS 5
-#define VM86_GET_AND_RESET_IRQ 6
-
-/*
- * This is the stack-layout seen by the user space program when we have
- * done a translation of "SAVE_ALL" from vm86 mode. The real kernel layout
- * is 'kernel_vm86_regs' (see below).
- */
-
-struct vm86_regs {
-/*
- * normal regs, with special meaning for the segment descriptors..
- */
- long ebx;
- long ecx;
- long edx;
- long esi;
- long edi;
- long ebp;
- long eax;
- long __null_ds;
- long __null_es;
- long __null_fs;
- long __null_gs;
- long orig_eax;
- long eip;
- unsigned short cs, __csh;
- long eflags;
- long esp;
- unsigned short ss, __ssh;
-/*
- * these are specific to v86 mode:
- */
- unsigned short es, __esh;
- unsigned short ds, __dsh;
- unsigned short fs, __fsh;
- unsigned short gs, __gsh;
-};
-
-struct revectored_struct {
- unsigned long __map[8]; /* 256 bits */
-};
-
-struct vm86_struct {
- struct vm86_regs regs;
- unsigned long flags;
- unsigned long screen_bitmap;
- unsigned long cpu_type;
- struct revectored_struct int_revectored;
- struct revectored_struct int21_revectored;
-};
-
-/*
- * flags masks
- */
-#define VM86_SCREEN_BITMAP 0x0001
-
-struct vm86plus_info_struct {
- unsigned long force_return_for_pic:1;
- unsigned long vm86dbg_active:1; /* for debugger */
- unsigned long vm86dbg_TFpendig:1; /* for debugger */
- unsigned long unused:28;
- unsigned long is_vm86pus:1; /* for vm86 internal use */
- unsigned char vm86dbg_intxxtab[32]; /* for debugger */
-};
-struct vm86plus_struct {
- struct vm86_regs regs;
- unsigned long flags;
- unsigned long screen_bitmap;
- unsigned long cpu_type;
- struct revectored_struct int_revectored;
- struct revectored_struct int21_revectored;
- struct vm86plus_info_struct vm86plus;
-};
-
-#ifdef __KERNEL__
-
#include <asm/ptrace.h>
+#include <uapi/asm/vm86.h>
/*
* This is the (kernel) stack-layout when we have done a "SAVE_ALL" from vm86
@@ -152,43 +28,48 @@ struct kernel_vm86_regs {
unsigned short gs, __gsh;
};
-struct kernel_vm86_struct {
- struct kernel_vm86_regs regs;
-/*
- * the below part remains on the kernel stack while we are in VM86 mode.
- * 'tss.esp0' then contains the address of VM86_TSS_ESP0 below, and when we
- * get forced back from VM86, the CPU and "SAVE_ALL" will restore the above
- * 'struct kernel_vm86_regs' with the then actual values.
- * Therefore, pt_regs in fact points to a complete 'kernel_vm86_struct'
- * in kernelspace, hence we need not reget the data from userspace.
- */
-#define VM86_TSS_ESP0 flags
+struct vm86 {
+ struct vm86plus_struct __user *user_vm86;
+ struct pt_regs regs32;
+ unsigned long veflags;
+ unsigned long veflags_mask;
+ unsigned long saved_sp0;
+
unsigned long flags;
- unsigned long screen_bitmap;
unsigned long cpu_type;
struct revectored_struct int_revectored;
struct revectored_struct int21_revectored;
struct vm86plus_info_struct vm86plus;
- struct pt_regs *regs32; /* here we save the pointer to the old regs */
-/*
- * The below is not part of the structure, but the stack layout continues
- * this way. In front of 'return-eip' may be some data, depending on
- * compilation, so we don't rely on this and save the pointer to 'oldregs'
- * in 'regs32' above.
- * However, with GCC-2.7.2 and the current CFLAGS you see exactly this:
-
- long return-eip; from call to vm86()
- struct pt_regs oldregs; user space registers as saved by syscall
- */
};
#ifdef CONFIG_VM86
void handle_vm86_fault(struct kernel_vm86_regs *, long);
int handle_vm86_trap(struct kernel_vm86_regs *, long, int);
-struct pt_regs *save_v86_state(struct kernel_vm86_regs *);
+void save_v86_state(struct kernel_vm86_regs *, int);
struct task_struct;
+
+#define free_vm86(t) do { \
+ struct thread_struct *__t = (t); \
+ if (__t->vm86 != NULL) { \
+ kfree(__t->vm86); \
+ __t->vm86 = NULL; \
+ } \
+} while (0)
+
+/*
+ * Support for VM86 programs to request interrupts for
+ * real mode hardware drivers:
+ */
+#define FIRST_VM86_IRQ 3
+#define LAST_VM86_IRQ 15
+
+static inline int invalid_vm86_irq(int irq)
+{
+ return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ;
+}
+
void release_vm86_irqs(struct task_struct *);
#else
@@ -201,8 +82,10 @@ static inline int handle_vm86_trap(struct kernel_vm86_regs *a, long b, int c)
return 0;
}
-#endif /* CONFIG_VM86 */
+static inline void save_v86_state(struct kernel_vm86_regs *a, int b) { }
-#endif /* __KERNEL__ */
+#define free_vm86(task) do { (void)(task); } while(0)
+
+#endif /* CONFIG_VM86 */
#endif /* _ASM_X86_VM86_H */
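The reworked vm86.h above adds invalid_vm86_irq() and a free_vm86() cleanup macro. A hedged sketch of their intended use (the two function names below are assumptions for illustration, not code from this patch):

static int vm86_request_irq_checked(int irq)
{
	/* only IRQs in [FIRST_VM86_IRQ, LAST_VM86_IRQ] may go to a vm86 program */
	if (invalid_vm86_irq(irq))
		return -EPERM;
	/* ... register the IRQ on behalf of the vm86 program ... */
	return 0;
}

static void vm86_exit_thread(struct task_struct *tsk)
{
	release_vm86_irqs(tsk);
	free_vm86(&tsk->thread);	/* kfree()s and clears thread->vm86 */
}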
diff --git a/arch/x86/include/asm/vmalloc.h b/arch/x86/include/asm/vmalloc.h
new file mode 100644
index 000000000000..49ce331f3ac6
--- /dev/null
+++ b/arch/x86/include/asm/vmalloc.h
@@ -0,0 +1,26 @@
+#ifndef _ASM_X86_VMALLOC_H
+#define _ASM_X86_VMALLOC_H
+
+#include <asm/cpufeature.h>
+#include <asm/page.h>
+#include <asm/pgtable_areas.h>
+
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+
+#ifdef CONFIG_X86_64
+#define arch_vmap_pud_supported arch_vmap_pud_supported
+static inline bool arch_vmap_pud_supported(pgprot_t prot)
+{
+ return boot_cpu_has(X86_FEATURE_GBPAGES);
+}
+#endif
+
+#define arch_vmap_pmd_supported arch_vmap_pmd_supported
+static inline bool arch_vmap_pmd_supported(pgprot_t prot)
+{
+ return boot_cpu_has(X86_FEATURE_PSE);
+}
+
+#endif
+
+#endif /* _ASM_X86_VMALLOC_H */
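The new vmalloc.h wires huge-page vmap support to the GBPAGES and PSE CPU features. A hedged sketch of how a generic caller might pick a mapping granularity from these hooks (the helper and its return convention are assumptions for illustration):

static unsigned int choose_vmap_level(unsigned long size, pgprot_t prot)
{
	if (IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP)) {
		if (size >= PUD_SIZE && arch_vmap_pud_supported(prot))
			return 1;	/* 1 GiB mappings, needs X86_FEATURE_GBPAGES */
		if (size >= PMD_SIZE && arch_vmap_pmd_supported(prot))
			return 2;	/* 2 MiB mappings, needs X86_FEATURE_PSE */
	}
	return 3;			/* fall back to 4 KiB PTE mappings */
}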
diff --git a/arch/x86/include/asm/vmi.h b/arch/x86/include/asm/vmi.h
deleted file mode 100644
index 61e08c0a2907..000000000000
--- a/arch/x86/include/asm/vmi.h
+++ /dev/null
@@ -1,269 +0,0 @@
-/*
- * VMI interface definition
- *
- * Copyright (C) 2005, VMware, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Maintained by: Zachary Amsden zach@vmware.com
- *
- */
-#include <linux/types.h>
-
-/*
- *---------------------------------------------------------------------
- *
- * VMI Option ROM API
- *
- *---------------------------------------------------------------------
- */
-#define VMI_SIGNATURE 0x696d5663 /* "cVmi" */
-
-#define PCI_VENDOR_ID_VMWARE 0x15AD
-#define PCI_DEVICE_ID_VMWARE_VMI 0x0801
-
-/*
- * We use two version numbers for compatibility, with the major
- * number signifying interface breakages, and the minor number
- * interface extensions.
- */
-#define VMI_API_REV_MAJOR 3
-#define VMI_API_REV_MINOR 0
-
-#define VMI_CALL_CPUID 0
-#define VMI_CALL_WRMSR 1
-#define VMI_CALL_RDMSR 2
-#define VMI_CALL_SetGDT 3
-#define VMI_CALL_SetLDT 4
-#define VMI_CALL_SetIDT 5
-#define VMI_CALL_SetTR 6
-#define VMI_CALL_GetGDT 7
-#define VMI_CALL_GetLDT 8
-#define VMI_CALL_GetIDT 9
-#define VMI_CALL_GetTR 10
-#define VMI_CALL_WriteGDTEntry 11
-#define VMI_CALL_WriteLDTEntry 12
-#define VMI_CALL_WriteIDTEntry 13
-#define VMI_CALL_UpdateKernelStack 14
-#define VMI_CALL_SetCR0 15
-#define VMI_CALL_SetCR2 16
-#define VMI_CALL_SetCR3 17
-#define VMI_CALL_SetCR4 18
-#define VMI_CALL_GetCR0 19
-#define VMI_CALL_GetCR2 20
-#define VMI_CALL_GetCR3 21
-#define VMI_CALL_GetCR4 22
-#define VMI_CALL_WBINVD 23
-#define VMI_CALL_SetDR 24
-#define VMI_CALL_GetDR 25
-#define VMI_CALL_RDPMC 26
-#define VMI_CALL_RDTSC 27
-#define VMI_CALL_CLTS 28
-#define VMI_CALL_EnableInterrupts 29
-#define VMI_CALL_DisableInterrupts 30
-#define VMI_CALL_GetInterruptMask 31
-#define VMI_CALL_SetInterruptMask 32
-#define VMI_CALL_IRET 33
-#define VMI_CALL_SYSEXIT 34
-#define VMI_CALL_Halt 35
-#define VMI_CALL_Reboot 36
-#define VMI_CALL_Shutdown 37
-#define VMI_CALL_SetPxE 38
-#define VMI_CALL_SetPxELong 39
-#define VMI_CALL_UpdatePxE 40
-#define VMI_CALL_UpdatePxELong 41
-#define VMI_CALL_MachineToPhysical 42
-#define VMI_CALL_PhysicalToMachine 43
-#define VMI_CALL_AllocatePage 44
-#define VMI_CALL_ReleasePage 45
-#define VMI_CALL_InvalPage 46
-#define VMI_CALL_FlushTLB 47
-#define VMI_CALL_SetLinearMapping 48
-
-#define VMI_CALL_SetIOPLMask 61
-#define VMI_CALL_SetInitialAPState 62
-#define VMI_CALL_APICWrite 63
-#define VMI_CALL_APICRead 64
-#define VMI_CALL_IODelay 65
-#define VMI_CALL_SetLazyMode 73
-
-/*
- *---------------------------------------------------------------------
- *
- * MMU operation flags
- *
- *---------------------------------------------------------------------
- */
-
-/* Flags used by VMI_{Allocate|Release}Page call */
-#define VMI_PAGE_PAE 0x10 /* Allocate PAE shadow */
-#define VMI_PAGE_CLONE 0x20 /* Clone from another shadow */
-#define VMI_PAGE_ZEROED 0x40 /* Page is pre-zeroed */
-
-
-/* Flags shared by Allocate|Release Page and PTE updates */
-#define VMI_PAGE_PT 0x01
-#define VMI_PAGE_PD 0x02
-#define VMI_PAGE_PDP 0x04
-#define VMI_PAGE_PML4 0x08
-
-#define VMI_PAGE_NORMAL 0x00 /* for debugging */
-
-/* Flags used by PTE updates */
-#define VMI_PAGE_CURRENT_AS 0x10 /* implies VMI_PAGE_VA_MASK is valid */
-#define VMI_PAGE_DEFER 0x20 /* may queue update until TLB inval */
-#define VMI_PAGE_VA_MASK 0xfffff000
-
-#ifdef CONFIG_X86_PAE
-#define VMI_PAGE_L1 (VMI_PAGE_PT | VMI_PAGE_PAE | VMI_PAGE_ZEROED)
-#define VMI_PAGE_L2 (VMI_PAGE_PD | VMI_PAGE_PAE | VMI_PAGE_ZEROED)
-#else
-#define VMI_PAGE_L1 (VMI_PAGE_PT | VMI_PAGE_ZEROED)
-#define VMI_PAGE_L2 (VMI_PAGE_PD | VMI_PAGE_ZEROED)
-#endif
-
-/* Flags used by VMI_FlushTLB call */
-#define VMI_FLUSH_TLB 0x01
-#define VMI_FLUSH_GLOBAL 0x02
-
-/*
- *---------------------------------------------------------------------
- *
- * VMI relocation definitions for ROM call get_reloc
- *
- *---------------------------------------------------------------------
- */
-
-/* VMI Relocation types */
-#define VMI_RELOCATION_NONE 0
-#define VMI_RELOCATION_CALL_REL 1
-#define VMI_RELOCATION_JUMP_REL 2
-#define VMI_RELOCATION_NOP 3
-
-#ifndef __ASSEMBLY__
-struct vmi_relocation_info {
- unsigned char *eip;
- unsigned char type;
- unsigned char reserved[3];
-};
-#endif
-
-
-/*
- *---------------------------------------------------------------------
- *
- * Generic ROM structures and definitions
- *
- *---------------------------------------------------------------------
- */
-
-#ifndef __ASSEMBLY__
-
-struct vrom_header {
- u16 rom_signature; /* option ROM signature */
- u8 rom_length; /* ROM length in 512 byte chunks */
- u8 rom_entry[4]; /* 16-bit code entry point */
- u8 rom_pad0; /* 4-byte align pad */
- u32 vrom_signature; /* VROM identification signature */
- u8 api_version_min;/* Minor version of API */
- u8 api_version_maj;/* Major version of API */
- u8 jump_slots; /* Number of jump slots */
- u8 reserved1; /* Reserved for expansion */
- u32 virtual_top; /* Hypervisor virtual address start */
- u16 reserved2; /* Reserved for expansion */
- u16 license_offs; /* Offset to License string */
- u16 pci_header_offs;/* Offset to PCI OPROM header */
- u16 pnp_header_offs;/* Offset to PnP OPROM header */
- u32 rom_pad3; /* PnP reserverd / VMI reserved */
- u8 reserved[96]; /* Reserved for headers */
- char vmi_init[8]; /* VMI_Init jump point */
- char get_reloc[8]; /* VMI_GetRelocationInfo jump point */
-} __attribute__((packed));
-
-struct pnp_header {
- char sig[4];
- char rev;
- char size;
- short next;
- short res;
- long devID;
- unsigned short manufacturer_offset;
- unsigned short product_offset;
-} __attribute__((packed));
-
-struct pci_header {
- char sig[4];
- short vendorID;
- short deviceID;
- short vpdData;
- short size;
- char rev;
- char class;
- char subclass;
- char interface;
- short chunks;
- char rom_version_min;
- char rom_version_maj;
- char codetype;
- char lastRom;
- short reserved;
-} __attribute__((packed));
-
-/* Function prototypes for bootstrapping */
-#ifdef CONFIG_VMI
-extern void vmi_init(void);
-extern void vmi_activate(void);
-extern void vmi_bringup(void);
-#else
-static inline void vmi_init(void) {}
-static inline void vmi_activate(void) {}
-static inline void vmi_bringup(void) {}
-#endif
-
-/* State needed to start an application processor in an SMP system. */
-struct vmi_ap_state {
- u32 cr0;
- u32 cr2;
- u32 cr3;
- u32 cr4;
-
- u64 efer;
-
- u32 eip;
- u32 eflags;
- u32 eax;
- u32 ebx;
- u32 ecx;
- u32 edx;
- u32 esp;
- u32 ebp;
- u32 esi;
- u32 edi;
- u16 cs;
- u16 ss;
- u16 ds;
- u16 es;
- u16 fs;
- u16 gs;
- u16 ldtr;
-
- u16 gdtr_limit;
- u32 gdtr_base;
- u32 idtr_base;
- u16 idtr_limit;
-};
-
-#endif
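Before its removal, vmi.h described the VMI option-ROM layout (vrom_header, VMI_SIGNATURE, API revision). A hedged sketch of the detection those structures implied (the scan helper, its arguments and the 2 KiB stride are assumptions):

static struct vrom_header *vmi_find_rom(void *base, unsigned long len)
{
	unsigned long off;

	for (off = 0; off + sizeof(struct vrom_header) <= len; off += 2048) {
		struct vrom_header *rom = base + off;

		if (rom->rom_signature == 0xaa55 &&		/* option ROM marker */
		    rom->vrom_signature == VMI_SIGNATURE &&	/* "cVmi" */
		    rom->api_version_maj == VMI_API_REV_MAJOR)
			return rom;
	}
	return NULL;
}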
diff --git a/arch/x86/include/asm/vmi_time.h b/arch/x86/include/asm/vmi_time.h
deleted file mode 100644
index c6e0bee93e3c..000000000000
--- a/arch/x86/include/asm/vmi_time.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * VMI Time wrappers
- *
- * Copyright (C) 2006, VMware, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to dhecht@vmware.com
- *
- */
-
-#ifndef _ASM_X86_VMI_TIME_H
-#define _ASM_X86_VMI_TIME_H
-
-/*
- * Raw VMI call indices for timer functions
- */
-#define VMI_CALL_GetCycleFrequency 66
-#define VMI_CALL_GetCycleCounter 67
-#define VMI_CALL_SetAlarm 68
-#define VMI_CALL_CancelAlarm 69
-#define VMI_CALL_GetWallclockTime 70
-#define VMI_CALL_WallclockUpdated 71
-
-/* Cached VMI timer operations */
-extern struct vmi_timer_ops {
- u64 (*get_cycle_frequency)(void);
- u64 (*get_cycle_counter)(int);
- u64 (*get_wallclock)(void);
- int (*wallclock_updated)(void);
- void (*set_alarm)(u32 flags, u64 expiry, u64 period);
- void (*cancel_alarm)(u32 flags);
-} vmi_timer_ops;
-
-/* Prototypes */
-extern void __init vmi_time_init(void);
-extern unsigned long vmi_get_wallclock(void);
-extern int vmi_set_wallclock(unsigned long now);
-extern unsigned long long vmi_sched_clock(void);
-extern unsigned long vmi_tsc_khz(void);
-
-#ifdef CONFIG_X86_LOCAL_APIC
-extern void __devinit vmi_time_bsp_init(void);
-extern void __devinit vmi_time_ap_init(void);
-#endif
-
-/*
- * When run under a hypervisor, a vcpu is always in one of three states:
- * running, halted, or ready. The vcpu is in the 'running' state if it
- * is executing. When the vcpu executes the halt interface, the vcpu
- * enters the 'halted' state and remains halted until there is some work
- * pending for the vcpu (e.g. an alarm expires, host I/O completes on
- * behalf of virtual I/O). At this point, the vcpu enters the 'ready'
- * state (waiting for the hypervisor to reschedule it). Finally, at any
- * time when the vcpu is not in the 'running' state nor the 'halted'
- * state, it is in the 'ready' state.
- *
- * Real time is advances while the vcpu is 'running', 'ready', or
- * 'halted'. Stolen time is the time in which the vcpu is in the
- * 'ready' state. Available time is the remaining time -- the vcpu is
- * either 'running' or 'halted'.
- *
- * All three views of time are accessible through the VMI cycle
- * counters.
- */
-
-/* The cycle counters. */
-#define VMI_CYCLES_REAL 0
-#define VMI_CYCLES_AVAILABLE 1
-#define VMI_CYCLES_STOLEN 2
-
-/* The alarm interface 'flags' bits */
-#define VMI_ALARM_COUNTERS 2
-
-#define VMI_ALARM_COUNTER_MASK 0x000000ff
-
-#define VMI_ALARM_WIRED_IRQ0 0x00000000
-#define VMI_ALARM_WIRED_LVTT 0x00010000
-
-#define VMI_ALARM_IS_ONESHOT 0x00000000
-#define VMI_ALARM_IS_PERIODIC 0x00000100
-
-#define CONFIG_VMI_ALARM_HZ 100
-
-#endif /* _ASM_X86_VMI_TIME_H */
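The vcpu-state comment deleted above implies a simple accounting identity: real time accrues in all three states, available time only while running or halted, and stolen time only while ready, so real = available + stolen. A one-line hedged sketch of deriving the stolen component from the two cycle counters:

static inline u64 vmi_stolen_cycles(u64 real_cycles, u64 available_cycles)
{
	/* ready-state ("stolen") cycles are whatever part of real time was not available */
	return real_cycles - available_cycles;
}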
diff --git a/arch/x86/include/asm/vmware.h b/arch/x86/include/asm/vmware.h
index c11b7e100d83..c9cf43d5ef23 100644
--- a/arch/x86/include/asm/vmware.h
+++ b/arch/x86/include/asm/vmware.h
@@ -1,27 +1,327 @@
+/* SPDX-License-Identifier: GPL-2.0 or MIT */
+#ifndef _ASM_X86_VMWARE_H
+#define _ASM_X86_VMWARE_H
+
+#include <asm/cpufeatures.h>
+#include <asm/alternative.h>
+#include <linux/stringify.h>
+
/*
- * Copyright (C) 2008, VMware, Inc.
+ * VMware hypercall ABI.
+ *
+ * - Low bandwidth (LB) hypercalls (I/O port based, vmcall and vmmcall)
+ * have up to 6 input and 6 output arguments passed and returned using
+ * registers: %eax (arg0), %ebx (arg1), %ecx (arg2), %edx (arg3),
+ * %esi (arg4), %edi (arg5).
+ * The following input arguments must be initialized by the caller:
+ * arg0 - VMWARE_HYPERVISOR_MAGIC
+ * arg2 - Hypercall command
+ * arg3 bits [15:0] - Port number, LB and direction flags
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
+ * - Low bandwidth TDX hypercalls (x86_64 only) are similar to LB
+ *   hypercalls. They also have up to 6 input and 6 output register
+ *   arguments, with a different argument-to-register mapping:
+ * %r12 (arg0), %rbx (arg1), %r13 (arg2), %rdx (arg3),
+ * %rsi (arg4), %rdi (arg5).
*
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
+ * - High bandwidth (HB) hypercalls are I/O port based only. They have
+ * up to 7 input and 7 output arguments passed and returned using
+ * registers: %eax (arg0), %ebx (arg1), %ecx (arg2), %edx (arg3),
+ * %esi (arg4), %edi (arg5), %ebp (arg6).
+ * The following input arguments must be initialized by the caller:
+ * arg0 - VMWARE_HYPERVISOR_MAGIC
+ * arg1 - Hypercall command
+ * arg3 bits [15:0] - Port number, HB and direction flags
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ * For compatibility purposes, x86_64 systems use only the lower 32 bits
+ * of the input and output arguments.
*
+ * The hypercall definitions differ in the low word of the %edx (arg3)
+ * in the following way: the old I/O port based interface uses the port
+ * number to distinguish between high- and low bandwidth versions, and
+ * uses IN/OUT instructions to define transfer direction.
+ *
+ * The new vmcall interface instead uses a set of flags to select
+ * bandwidth mode and transfer direction. The flags should be loaded
+ * into arg3 by the caller and are automatically replaced by the port
+ * number if the I/O port method is used.
+ */
+
+#define VMWARE_HYPERVISOR_HB BIT(0)
+#define VMWARE_HYPERVISOR_OUT BIT(1)
+
+#define VMWARE_HYPERVISOR_PORT 0x5658
+#define VMWARE_HYPERVISOR_PORT_HB (VMWARE_HYPERVISOR_PORT | \
+ VMWARE_HYPERVISOR_HB)
+
+#define VMWARE_HYPERVISOR_MAGIC 0x564d5868U
+
+#define VMWARE_CMD_GETVERSION 10
+#define VMWARE_CMD_GETHZ 45
+#define VMWARE_CMD_GETVCPU_INFO 68
+#define VMWARE_CMD_STEALCLOCK 91
+/*
+ * Hypercall command mask:
+ * bits [6:0] command, range [0, 127]
+ * bits [19:16] sub-command, range [0, 15]
+ */
+#define VMWARE_CMD_MASK 0xf007fU
+
+#define CPUID_VMWARE_FEATURES_ECX_VMMCALL BIT(0)
+#define CPUID_VMWARE_FEATURES_ECX_VMCALL BIT(1)
+
+extern unsigned long vmware_hypercall_slow(unsigned long cmd,
+ unsigned long in1, unsigned long in3,
+ unsigned long in4, unsigned long in5,
+ u32 *out1, u32 *out2, u32 *out3,
+ u32 *out4, u32 *out5);
+
+#define VMWARE_TDX_VENDOR_LEAF 0x1af7e4909ULL
+#define VMWARE_TDX_HCALL_FUNC 1
+
+extern unsigned long vmware_tdx_hypercall(unsigned long cmd,
+ unsigned long in1, unsigned long in3,
+ unsigned long in4, unsigned long in5,
+ u32 *out1, u32 *out2, u32 *out3,
+ u32 *out4, u32 *out5);
+
+/*
+ * The low bandwidth call. The low word of %edx is presumed to have the OUT
+ * bit set. The high word of %edx may contain input data from the caller.
+ */
+#define VMWARE_HYPERCALL \
+ ALTERNATIVE_2("movw %[port], %%dx\n\t" \
+ "inl (%%dx), %%eax", \
+ "vmcall", X86_FEATURE_VMCALL, \
+ "vmmcall", X86_FEATURE_VMW_VMMCALL)
+
+static inline
+unsigned long vmware_hypercall1(unsigned long cmd, unsigned long in1)
+{
+ unsigned long out0;
+
+ if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST))
+ return vmware_tdx_hypercall(cmd, in1, 0, 0, 0,
+ NULL, NULL, NULL, NULL, NULL);
+
+ if (unlikely(!alternatives_patched) && !__is_defined(MODULE))
+ return vmware_hypercall_slow(cmd, in1, 0, 0, 0,
+ NULL, NULL, NULL, NULL, NULL);
+
+ asm_inline volatile (VMWARE_HYPERCALL
+ : "=a" (out0)
+ : [port] "i" (VMWARE_HYPERVISOR_PORT),
+ "a" (VMWARE_HYPERVISOR_MAGIC),
+ "b" (in1),
+ "c" (cmd),
+ "d" (0)
+ : "cc", "memory");
+ return out0;
+}
+
+static inline
+unsigned long vmware_hypercall3(unsigned long cmd, unsigned long in1,
+ u32 *out1, u32 *out2)
+{
+ unsigned long out0;
+
+ if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST))
+ return vmware_tdx_hypercall(cmd, in1, 0, 0, 0,
+ out1, out2, NULL, NULL, NULL);
+
+ if (unlikely(!alternatives_patched) && !__is_defined(MODULE))
+ return vmware_hypercall_slow(cmd, in1, 0, 0, 0,
+ out1, out2, NULL, NULL, NULL);
+
+ asm_inline volatile (VMWARE_HYPERCALL
+ : "=a" (out0), "=b" (*out1), "=c" (*out2)
+ : [port] "i" (VMWARE_HYPERVISOR_PORT),
+ "a" (VMWARE_HYPERVISOR_MAGIC),
+ "b" (in1),
+ "c" (cmd),
+ "d" (0)
+ : "cc", "memory");
+ return out0;
+}
+
+static inline
+unsigned long vmware_hypercall4(unsigned long cmd, unsigned long in1,
+ u32 *out1, u32 *out2, u32 *out3)
+{
+ unsigned long out0;
+
+ if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST))
+ return vmware_tdx_hypercall(cmd, in1, 0, 0, 0,
+ out1, out2, out3, NULL, NULL);
+
+ if (unlikely(!alternatives_patched) && !__is_defined(MODULE))
+ return vmware_hypercall_slow(cmd, in1, 0, 0, 0,
+ out1, out2, out3, NULL, NULL);
+
+ asm_inline volatile (VMWARE_HYPERCALL
+ : "=a" (out0), "=b" (*out1), "=c" (*out2), "=d" (*out3)
+ : [port] "i" (VMWARE_HYPERVISOR_PORT),
+ "a" (VMWARE_HYPERVISOR_MAGIC),
+ "b" (in1),
+ "c" (cmd),
+ "d" (0)
+ : "cc", "memory");
+ return out0;
+}
+
+static inline
+unsigned long vmware_hypercall5(unsigned long cmd, unsigned long in1,
+ unsigned long in3, unsigned long in4,
+ unsigned long in5, u32 *out2)
+{
+ unsigned long out0;
+
+ if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST))
+ return vmware_tdx_hypercall(cmd, in1, in3, in4, in5,
+ NULL, out2, NULL, NULL, NULL);
+
+ if (unlikely(!alternatives_patched) && !__is_defined(MODULE))
+ return vmware_hypercall_slow(cmd, in1, in3, in4, in5,
+ NULL, out2, NULL, NULL, NULL);
+
+ asm_inline volatile (VMWARE_HYPERCALL
+ : "=a" (out0), "=c" (*out2)
+ : [port] "i" (VMWARE_HYPERVISOR_PORT),
+ "a" (VMWARE_HYPERVISOR_MAGIC),
+ "b" (in1),
+ "c" (cmd),
+ "d" (in3),
+ "S" (in4),
+ "D" (in5)
+ : "cc", "memory");
+ return out0;
+}
+
+static inline
+unsigned long vmware_hypercall6(unsigned long cmd, unsigned long in1,
+ unsigned long in3, u32 *out2,
+ u32 *out3, u32 *out4, u32 *out5)
+{
+ unsigned long out0;
+
+ if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST))
+ return vmware_tdx_hypercall(cmd, in1, in3, 0, 0,
+ NULL, out2, out3, out4, out5);
+
+ if (unlikely(!alternatives_patched) && !__is_defined(MODULE))
+ return vmware_hypercall_slow(cmd, in1, in3, 0, 0,
+ NULL, out2, out3, out4, out5);
+
+ asm_inline volatile (VMWARE_HYPERCALL
+ : "=a" (out0), "=c" (*out2), "=d" (*out3), "=S" (*out4),
+ "=D" (*out5)
+ : [port] "i" (VMWARE_HYPERVISOR_PORT),
+ "a" (VMWARE_HYPERVISOR_MAGIC),
+ "b" (in1),
+ "c" (cmd),
+ "d" (in3)
+ : "cc", "memory");
+ return out0;
+}
+
+static inline
+unsigned long vmware_hypercall7(unsigned long cmd, unsigned long in1,
+ unsigned long in3, unsigned long in4,
+ unsigned long in5, u32 *out1,
+ u32 *out2, u32 *out3)
+{
+ unsigned long out0;
+
+ if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST))
+ return vmware_tdx_hypercall(cmd, in1, in3, in4, in5,
+ out1, out2, out3, NULL, NULL);
+
+ if (unlikely(!alternatives_patched) && !__is_defined(MODULE))
+ return vmware_hypercall_slow(cmd, in1, in3, in4, in5,
+ out1, out2, out3, NULL, NULL);
+
+ asm_inline volatile (VMWARE_HYPERCALL
+ : "=a" (out0), "=b" (*out1), "=c" (*out2), "=d" (*out3)
+ : [port] "i" (VMWARE_HYPERVISOR_PORT),
+ "a" (VMWARE_HYPERVISOR_MAGIC),
+ "b" (in1),
+ "c" (cmd),
+ "d" (in3),
+ "S" (in4),
+ "D" (in5)
+ : "cc", "memory");
+ return out0;
+}
+
+#ifdef CONFIG_X86_64
+#define VMW_BP_CONSTRAINT "r"
+#else
+#define VMW_BP_CONSTRAINT "m"
+#endif
+
+/*
+ * High bandwidth calls are not supported on encrypted memory guests.
+ * The caller should check cc_platform_has(CC_ATTR_MEM_ENCRYPT) and use
+ * the low bandwidth hypercall if memory encryption is set.
+ * This assumption simplifies the HB hypercall implementation to a plain
+ * I/O port based approach without alternative patching.
*/
-#ifndef ASM_X86__VMWARE_H
-#define ASM_X86__VMWARE_H
+static inline
+unsigned long vmware_hypercall_hb_out(unsigned long cmd, unsigned long in2,
+ unsigned long in3, unsigned long in4,
+ unsigned long in5, unsigned long in6,
+ u32 *out1)
+{
+ unsigned long out0;
+
+ asm_inline volatile (
+ UNWIND_HINT_SAVE
+ "push %%" _ASM_BP "\n\t"
+ UNWIND_HINT_UNDEFINED
+ "mov %[in6], %%" _ASM_BP "\n\t"
+ "rep outsb\n\t"
+ "pop %%" _ASM_BP "\n\t"
+ UNWIND_HINT_RESTORE
+ : "=a" (out0), "=b" (*out1)
+ : "a" (VMWARE_HYPERVISOR_MAGIC),
+ "b" (cmd),
+ "c" (in2),
+ "d" (in3 | VMWARE_HYPERVISOR_PORT_HB),
+ "S" (in4),
+ "D" (in5),
+ [in6] VMW_BP_CONSTRAINT (in6)
+ : "cc", "memory");
+ return out0;
+}
+
+static inline
+unsigned long vmware_hypercall_hb_in(unsigned long cmd, unsigned long in2,
+ unsigned long in3, unsigned long in4,
+ unsigned long in5, unsigned long in6,
+ u32 *out1)
+{
+ unsigned long out0;
-extern unsigned long vmware_get_tsc_khz(void);
-extern int vmware_platform(void);
-extern void vmware_set_feature_bits(struct cpuinfo_x86 *c);
+ asm_inline volatile (
+ UNWIND_HINT_SAVE
+ "push %%" _ASM_BP "\n\t"
+ UNWIND_HINT_UNDEFINED
+ "mov %[in6], %%" _ASM_BP "\n\t"
+ "rep insb\n\t"
+ "pop %%" _ASM_BP "\n\t"
+ UNWIND_HINT_RESTORE
+ : "=a" (out0), "=b" (*out1)
+ : "a" (VMWARE_HYPERVISOR_MAGIC),
+ "b" (cmd),
+ "c" (in2),
+ "d" (in3 | VMWARE_HYPERVISOR_PORT_HB),
+ "S" (in4),
+ "D" (in5),
+ [in6] VMW_BP_CONSTRAINT (in6)
+ : "cc", "memory");
+ return out0;
+}
+#undef VMW_BP_CONSTRAINT
+#undef VMWARE_HYPERCALL
#endif
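The rewritten vmware.h replaces the old extern declarations with inline hypercall wrappers that hide the TDX, slow-path and ALTERNATIVE plumbing. As a hedged usage sketch (the probe function name and the 0xffffffff failure check are assumptions, not part of this patch):

static bool vmware_probe_version(u32 *version)
{
	u32 ebx, ecx;
	unsigned long eax;

	/* arg0 (magic) and the port/flags in arg3 are filled in by the wrapper */
	eax = vmware_hypercall3(VMWARE_CMD_GETVERSION, 0, &ebx, &ecx);
	if ((u32)eax == 0xffffffffU || ebx != VMWARE_HYPERVISOR_MAGIC)
		return false;	/* not running under a VMware hypervisor */

	*version = (u32)eax;
	return true;
}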
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 11be5ad2e0e9..c85c50019523 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -1,79 +1,197 @@
-#ifndef VMX_H
-#define VMX_H
-
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* vmx.h: VMX Architecture related definitions
* Copyright (c) 2004, Intel Corporation.
*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- *
* A few random additions are:
* Copyright (C) 2006 Qumranet
* Avi Kivity <avi@qumranet.com>
* Yaniv Kamay <yaniv@qumranet.com>
- *
*/
+#ifndef VMX_H
+#define VMX_H
+
+
+#include <linux/bitops.h>
+#include <linux/bug.h>
+#include <linux/types.h>
+
+#include <uapi/asm/vmx.h>
+#include <asm/trapnr.h>
+#include <asm/vmxfeatures.h>
+
+#define VMCS_CONTROL_BIT(x) BIT(VMX_FEATURE_##x & 0x1f)
/*
* Definitions of Primary Processor-Based VM-Execution Controls.
*/
-#define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004
-#define CPU_BASED_USE_TSC_OFFSETING 0x00000008
-#define CPU_BASED_HLT_EXITING 0x00000080
-#define CPU_BASED_INVLPG_EXITING 0x00000200
-#define CPU_BASED_MWAIT_EXITING 0x00000400
-#define CPU_BASED_RDPMC_EXITING 0x00000800
-#define CPU_BASED_RDTSC_EXITING 0x00001000
-#define CPU_BASED_CR3_LOAD_EXITING 0x00008000
-#define CPU_BASED_CR3_STORE_EXITING 0x00010000
-#define CPU_BASED_CR8_LOAD_EXITING 0x00080000
-#define CPU_BASED_CR8_STORE_EXITING 0x00100000
-#define CPU_BASED_TPR_SHADOW 0x00200000
-#define CPU_BASED_VIRTUAL_NMI_PENDING 0x00400000
-#define CPU_BASED_MOV_DR_EXITING 0x00800000
-#define CPU_BASED_UNCOND_IO_EXITING 0x01000000
-#define CPU_BASED_USE_IO_BITMAPS 0x02000000
-#define CPU_BASED_USE_MSR_BITMAPS 0x10000000
-#define CPU_BASED_MONITOR_EXITING 0x20000000
-#define CPU_BASED_PAUSE_EXITING 0x40000000
-#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000
+#define CPU_BASED_INTR_WINDOW_EXITING VMCS_CONTROL_BIT(INTR_WINDOW_EXITING)
+#define CPU_BASED_USE_TSC_OFFSETTING VMCS_CONTROL_BIT(USE_TSC_OFFSETTING)
+#define CPU_BASED_HLT_EXITING VMCS_CONTROL_BIT(HLT_EXITING)
+#define CPU_BASED_INVLPG_EXITING VMCS_CONTROL_BIT(INVLPG_EXITING)
+#define CPU_BASED_MWAIT_EXITING VMCS_CONTROL_BIT(MWAIT_EXITING)
+#define CPU_BASED_RDPMC_EXITING VMCS_CONTROL_BIT(RDPMC_EXITING)
+#define CPU_BASED_RDTSC_EXITING VMCS_CONTROL_BIT(RDTSC_EXITING)
+#define CPU_BASED_CR3_LOAD_EXITING VMCS_CONTROL_BIT(CR3_LOAD_EXITING)
+#define CPU_BASED_CR3_STORE_EXITING VMCS_CONTROL_BIT(CR3_STORE_EXITING)
+#define CPU_BASED_ACTIVATE_TERTIARY_CONTROLS VMCS_CONTROL_BIT(TERTIARY_CONTROLS)
+#define CPU_BASED_CR8_LOAD_EXITING VMCS_CONTROL_BIT(CR8_LOAD_EXITING)
+#define CPU_BASED_CR8_STORE_EXITING VMCS_CONTROL_BIT(CR8_STORE_EXITING)
+#define CPU_BASED_TPR_SHADOW VMCS_CONTROL_BIT(VIRTUAL_TPR)
+#define CPU_BASED_NMI_WINDOW_EXITING VMCS_CONTROL_BIT(NMI_WINDOW_EXITING)
+#define CPU_BASED_MOV_DR_EXITING VMCS_CONTROL_BIT(MOV_DR_EXITING)
+#define CPU_BASED_UNCOND_IO_EXITING VMCS_CONTROL_BIT(UNCOND_IO_EXITING)
+#define CPU_BASED_USE_IO_BITMAPS VMCS_CONTROL_BIT(USE_IO_BITMAPS)
+#define CPU_BASED_MONITOR_TRAP_FLAG VMCS_CONTROL_BIT(MONITOR_TRAP_FLAG)
+#define CPU_BASED_USE_MSR_BITMAPS VMCS_CONTROL_BIT(USE_MSR_BITMAPS)
+#define CPU_BASED_MONITOR_EXITING VMCS_CONTROL_BIT(MONITOR_EXITING)
+#define CPU_BASED_PAUSE_EXITING VMCS_CONTROL_BIT(PAUSE_EXITING)
+#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS VMCS_CONTROL_BIT(SEC_CONTROLS)
+
+#define CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR 0x0401e172
+
/*
* Definitions of Secondary Processor-Based VM-Execution Controls.
*/
-#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
-#define SECONDARY_EXEC_ENABLE_EPT 0x00000002
-#define SECONDARY_EXEC_ENABLE_VPID 0x00000020
-#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
+#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES VMCS_CONTROL_BIT(VIRT_APIC_ACCESSES)
+#define SECONDARY_EXEC_ENABLE_EPT VMCS_CONTROL_BIT(EPT)
+#define SECONDARY_EXEC_DESC VMCS_CONTROL_BIT(DESC_EXITING)
+#define SECONDARY_EXEC_ENABLE_RDTSCP VMCS_CONTROL_BIT(RDTSCP)
+#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE VMCS_CONTROL_BIT(VIRTUAL_X2APIC)
+#define SECONDARY_EXEC_ENABLE_VPID VMCS_CONTROL_BIT(VPID)
+#define SECONDARY_EXEC_WBINVD_EXITING VMCS_CONTROL_BIT(WBINVD_EXITING)
+#define SECONDARY_EXEC_UNRESTRICTED_GUEST VMCS_CONTROL_BIT(UNRESTRICTED_GUEST)
+#define SECONDARY_EXEC_APIC_REGISTER_VIRT VMCS_CONTROL_BIT(APIC_REGISTER_VIRT)
+#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY VMCS_CONTROL_BIT(VIRT_INTR_DELIVERY)
+#define SECONDARY_EXEC_PAUSE_LOOP_EXITING VMCS_CONTROL_BIT(PAUSE_LOOP_EXITING)
+#define SECONDARY_EXEC_RDRAND_EXITING VMCS_CONTROL_BIT(RDRAND_EXITING)
+#define SECONDARY_EXEC_ENABLE_INVPCID VMCS_CONTROL_BIT(INVPCID)
+#define SECONDARY_EXEC_ENABLE_VMFUNC VMCS_CONTROL_BIT(VMFUNC)
+#define SECONDARY_EXEC_SHADOW_VMCS VMCS_CONTROL_BIT(SHADOW_VMCS)
+#define SECONDARY_EXEC_ENCLS_EXITING VMCS_CONTROL_BIT(ENCLS_EXITING)
+#define SECONDARY_EXEC_RDSEED_EXITING VMCS_CONTROL_BIT(RDSEED_EXITING)
+#define SECONDARY_EXEC_ENABLE_PML VMCS_CONTROL_BIT(PAGE_MOD_LOGGING)
+#define SECONDARY_EXEC_EPT_VIOLATION_VE VMCS_CONTROL_BIT(EPT_VIOLATION_VE)
+#define SECONDARY_EXEC_PT_CONCEAL_VMX VMCS_CONTROL_BIT(PT_CONCEAL_VMX)
+#define SECONDARY_EXEC_ENABLE_XSAVES VMCS_CONTROL_BIT(XSAVES)
+#define SECONDARY_EXEC_MODE_BASED_EPT_EXEC VMCS_CONTROL_BIT(MODE_BASED_EPT_EXEC)
+#define SECONDARY_EXEC_PT_USE_GPA VMCS_CONTROL_BIT(PT_USE_GPA)
+#define SECONDARY_EXEC_TSC_SCALING VMCS_CONTROL_BIT(TSC_SCALING)
+#define SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE VMCS_CONTROL_BIT(USR_WAIT_PAUSE)
+#define SECONDARY_EXEC_BUS_LOCK_DETECTION VMCS_CONTROL_BIT(BUS_LOCK_DETECTION)
+#define SECONDARY_EXEC_NOTIFY_VM_EXITING VMCS_CONTROL_BIT(NOTIFY_VM_EXITING)
+/*
+ * Definitions of Tertiary Processor-Based VM-Execution Controls.
+ */
+#define TERTIARY_EXEC_IPI_VIRT VMCS_CONTROL_BIT(IPI_VIRT)
-#define PIN_BASED_EXT_INTR_MASK 0x00000001
-#define PIN_BASED_NMI_EXITING 0x00000008
-#define PIN_BASED_VIRTUAL_NMIS 0x00000020
+#define PIN_BASED_EXT_INTR_MASK VMCS_CONTROL_BIT(INTR_EXITING)
+#define PIN_BASED_NMI_EXITING VMCS_CONTROL_BIT(NMI_EXITING)
+#define PIN_BASED_VIRTUAL_NMIS VMCS_CONTROL_BIT(VIRTUAL_NMIS)
+#define PIN_BASED_VMX_PREEMPTION_TIMER VMCS_CONTROL_BIT(PREEMPTION_TIMER)
+#define PIN_BASED_POSTED_INTR VMCS_CONTROL_BIT(POSTED_INTR)
+#define PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR 0x00000016
+
+#define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000004
#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200
+#define VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL 0x00001000
#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000
#define VM_EXIT_SAVE_IA32_PAT 0x00040000
#define VM_EXIT_LOAD_IA32_PAT 0x00080000
+#define VM_EXIT_SAVE_IA32_EFER 0x00100000
+#define VM_EXIT_LOAD_IA32_EFER 0x00200000
+#define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000
+#define VM_EXIT_CLEAR_BNDCFGS 0x00800000
+#define VM_EXIT_PT_CONCEAL_PIP 0x01000000
+#define VM_EXIT_CLEAR_IA32_RTIT_CTL 0x02000000
+#define VM_EXIT_LOAD_CET_STATE 0x10000000
+
+#define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff
+#define VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000004
#define VM_ENTRY_IA32E_MODE 0x00000200
#define VM_ENTRY_SMM 0x00000400
#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800
+#define VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL 0x00002000
#define VM_ENTRY_LOAD_IA32_PAT 0x00004000
+#define VM_ENTRY_LOAD_IA32_EFER 0x00008000
+#define VM_ENTRY_LOAD_BNDCFGS 0x00010000
+#define VM_ENTRY_PT_CONCEAL_PIP 0x00020000
+#define VM_ENTRY_LOAD_IA32_RTIT_CTL 0x00040000
+#define VM_ENTRY_LOAD_CET_STATE 0x00100000
+
+#define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x000011ff
+
+/* VMFUNC functions */
+#define VMFUNC_CONTROL_BIT(x) BIT((VMX_FEATURE_##x & 0x1f) - 28)
+
+#define VMX_VMFUNC_EPTP_SWITCHING VMFUNC_CONTROL_BIT(EPTP_SWITCHING)
+#define VMFUNC_EPTP_ENTRIES 512
+
+#define VMX_BASIC_32BIT_PHYS_ADDR_ONLY BIT_ULL(48)
+#define VMX_BASIC_DUAL_MONITOR_TREATMENT BIT_ULL(49)
+#define VMX_BASIC_INOUT BIT_ULL(54)
+#define VMX_BASIC_TRUE_CTLS BIT_ULL(55)
+#define VMX_BASIC_NO_HW_ERROR_CODE_CC BIT_ULL(56)
+
+static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic)
+{
+ return vmx_basic & GENMASK_ULL(30, 0);
+}
+
+static inline u32 vmx_basic_vmcs_size(u64 vmx_basic)
+{
+ return (vmx_basic & GENMASK_ULL(44, 32)) >> 32;
+}
+
+static inline u32 vmx_basic_vmcs_mem_type(u64 vmx_basic)
+{
+ return (vmx_basic & GENMASK_ULL(53, 50)) >> 50;
+}
+
+static inline u64 vmx_basic_encode_vmcs_info(u32 revision, u16 size, u8 memtype)
+{
+ return revision | ((u64)size << 32) | ((u64)memtype << 50);
+}
+
+#define VMX_MISC_SAVE_EFER_LMA BIT_ULL(5)
+#define VMX_MISC_ACTIVITY_HLT BIT_ULL(6)
+#define VMX_MISC_ACTIVITY_SHUTDOWN BIT_ULL(7)
+#define VMX_MISC_ACTIVITY_WAIT_SIPI BIT_ULL(8)
+#define VMX_MISC_INTEL_PT BIT_ULL(14)
+#define VMX_MISC_RDMSR_IN_SMM BIT_ULL(15)
+#define VMX_MISC_VMXOFF_BLOCK_SMI BIT_ULL(28)
+#define VMX_MISC_VMWRITE_SHADOW_RO_FIELDS BIT_ULL(29)
+#define VMX_MISC_ZERO_LEN_INS BIT_ULL(30)
+#define VMX_MISC_MSR_LIST_MULTIPLIER 512
+
+static inline int vmx_misc_preemption_timer_rate(u64 vmx_misc)
+{
+ return vmx_misc & GENMASK_ULL(4, 0);
+}
+
+static inline int vmx_misc_cr3_count(u64 vmx_misc)
+{
+ return (vmx_misc & GENMASK_ULL(24, 16)) >> 16;
+}
+
+static inline int vmx_misc_max_msr(u64 vmx_misc)
+{
+ return (vmx_misc & GENMASK_ULL(27, 25)) >> 25;
+}
+
+static inline int vmx_misc_mseg_revid(u64 vmx_misc)
+{
+ return (vmx_misc & GENMASK_ULL(63, 32)) >> 32;
+}
/* VMCS Encodings */
enum vmcs_field {
VIRTUAL_PROCESSOR_ID = 0x00000000,
+ POSTED_INTR_NV = 0x00000002,
+ LAST_PID_POINTER_INDEX = 0x00000008,
GUEST_ES_SELECTOR = 0x00000800,
GUEST_CS_SELECTOR = 0x00000802,
GUEST_SS_SELECTOR = 0x00000804,
@@ -82,6 +200,8 @@ enum vmcs_field {
GUEST_GS_SELECTOR = 0x0000080a,
GUEST_LDTR_SELECTOR = 0x0000080c,
GUEST_TR_SELECTOR = 0x0000080e,
+ GUEST_INTR_STATUS = 0x00000810,
+ GUEST_PML_INDEX = 0x00000812,
HOST_ES_SELECTOR = 0x00000c00,
HOST_CS_SELECTOR = 0x00000c02,
HOST_SS_SELECTOR = 0x00000c04,
@@ -101,14 +221,47 @@ enum vmcs_field {
VM_EXIT_MSR_LOAD_ADDR_HIGH = 0x00002009,
VM_ENTRY_MSR_LOAD_ADDR = 0x0000200a,
VM_ENTRY_MSR_LOAD_ADDR_HIGH = 0x0000200b,
+ PML_ADDRESS = 0x0000200e,
+ PML_ADDRESS_HIGH = 0x0000200f,
TSC_OFFSET = 0x00002010,
TSC_OFFSET_HIGH = 0x00002011,
VIRTUAL_APIC_PAGE_ADDR = 0x00002012,
VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013,
APIC_ACCESS_ADDR = 0x00002014,
APIC_ACCESS_ADDR_HIGH = 0x00002015,
+ POSTED_INTR_DESC_ADDR = 0x00002016,
+ POSTED_INTR_DESC_ADDR_HIGH = 0x00002017,
+ VM_FUNCTION_CONTROL = 0x00002018,
+ VM_FUNCTION_CONTROL_HIGH = 0x00002019,
EPT_POINTER = 0x0000201a,
EPT_POINTER_HIGH = 0x0000201b,
+ EOI_EXIT_BITMAP0 = 0x0000201c,
+ EOI_EXIT_BITMAP0_HIGH = 0x0000201d,
+ EOI_EXIT_BITMAP1 = 0x0000201e,
+ EOI_EXIT_BITMAP1_HIGH = 0x0000201f,
+ EOI_EXIT_BITMAP2 = 0x00002020,
+ EOI_EXIT_BITMAP2_HIGH = 0x00002021,
+ EOI_EXIT_BITMAP3 = 0x00002022,
+ EOI_EXIT_BITMAP3_HIGH = 0x00002023,
+ EPTP_LIST_ADDRESS = 0x00002024,
+ EPTP_LIST_ADDRESS_HIGH = 0x00002025,
+ VMREAD_BITMAP = 0x00002026,
+ VMREAD_BITMAP_HIGH = 0x00002027,
+ VMWRITE_BITMAP = 0x00002028,
+ VMWRITE_BITMAP_HIGH = 0x00002029,
+ VE_INFORMATION_ADDRESS = 0x0000202A,
+ VE_INFORMATION_ADDRESS_HIGH = 0x0000202B,
+ XSS_EXIT_BITMAP = 0x0000202C,
+ XSS_EXIT_BITMAP_HIGH = 0x0000202D,
+ ENCLS_EXITING_BITMAP = 0x0000202E,
+ ENCLS_EXITING_BITMAP_HIGH = 0x0000202F,
+ TSC_MULTIPLIER = 0x00002032,
+ TSC_MULTIPLIER_HIGH = 0x00002033,
+ TERTIARY_VM_EXEC_CONTROL = 0x00002034,
+ TERTIARY_VM_EXEC_CONTROL_HIGH = 0x00002035,
+ SHARED_EPT_POINTER = 0x0000203C,
+ PID_POINTER_TABLE = 0x00002042,
+ PID_POINTER_TABLE_HIGH = 0x00002043,
GUEST_PHYSICAL_ADDRESS = 0x00002400,
GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401,
VMCS_LINK_POINTER = 0x00002800,
@@ -117,6 +270,10 @@ enum vmcs_field {
GUEST_IA32_DEBUGCTL_HIGH = 0x00002803,
GUEST_IA32_PAT = 0x00002804,
GUEST_IA32_PAT_HIGH = 0x00002805,
+ GUEST_IA32_EFER = 0x00002806,
+ GUEST_IA32_EFER_HIGH = 0x00002807,
+ GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808,
+ GUEST_IA32_PERF_GLOBAL_CTRL_HIGH= 0x00002809,
GUEST_PDPTR0 = 0x0000280a,
GUEST_PDPTR0_HIGH = 0x0000280b,
GUEST_PDPTR1 = 0x0000280c,
@@ -125,8 +282,16 @@ enum vmcs_field {
GUEST_PDPTR2_HIGH = 0x0000280f,
GUEST_PDPTR3 = 0x00002810,
GUEST_PDPTR3_HIGH = 0x00002811,
+ GUEST_BNDCFGS = 0x00002812,
+ GUEST_BNDCFGS_HIGH = 0x00002813,
+ GUEST_IA32_RTIT_CTL = 0x00002814,
+ GUEST_IA32_RTIT_CTL_HIGH = 0x00002815,
HOST_IA32_PAT = 0x00002c00,
HOST_IA32_PAT_HIGH = 0x00002c01,
+ HOST_IA32_EFER = 0x00002c02,
+ HOST_IA32_EFER_HIGH = 0x00002c03,
+ HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04,
+ HOST_IA32_PERF_GLOBAL_CTRL_HIGH = 0x00002c05,
PIN_BASED_VM_EXEC_CONTROL = 0x00004000,
CPU_BASED_VM_EXEC_CONTROL = 0x00004002,
EXCEPTION_BITMAP = 0x00004004,
@@ -143,6 +308,9 @@ enum vmcs_field {
VM_ENTRY_INSTRUCTION_LEN = 0x0000401a,
TPR_THRESHOLD = 0x0000401c,
SECONDARY_VM_EXEC_CONTROL = 0x0000401e,
+ PLE_GAP = 0x00004020,
+ PLE_WINDOW = 0x00004022,
+ NOTIFY_WINDOW = 0x00004024,
VM_INSTRUCTION_ERROR = 0x00004400,
VM_EXIT_REASON = 0x00004402,
VM_EXIT_INTR_INFO = 0x00004404,
@@ -170,8 +338,9 @@ enum vmcs_field {
GUEST_LDTR_AR_BYTES = 0x00004820,
GUEST_TR_AR_BYTES = 0x00004822,
GUEST_INTERRUPTIBILITY_INFO = 0x00004824,
- GUEST_ACTIVITY_STATE = 0X00004826,
+ GUEST_ACTIVITY_STATE = 0x00004826,
GUEST_SYSENTER_CS = 0x0000482A,
+ VMX_PREEMPTION_TIMER_VALUE = 0x0000482E,
HOST_IA32_SYSENTER_CS = 0x00004c00,
CR0_GUEST_HOST_MASK = 0x00006000,
CR4_GUEST_HOST_MASK = 0x00006002,
@@ -203,6 +372,9 @@ enum vmcs_field {
GUEST_PENDING_DBG_EXCEPTIONS = 0x00006822,
GUEST_SYSENTER_ESP = 0x00006824,
GUEST_SYSENTER_EIP = 0x00006826,
+ GUEST_S_CET = 0x00006828,
+ GUEST_SSP = 0x0000682a,
+ GUEST_INTR_SSP_TABLE = 0x0000682c,
HOST_CR0 = 0x00006c00,
HOST_CR3 = 0x00006c02,
HOST_CR4 = 0x00006c04,
@@ -215,45 +387,11 @@ enum vmcs_field {
HOST_IA32_SYSENTER_EIP = 0x00006c12,
HOST_RSP = 0x00006c14,
HOST_RIP = 0x00006c16,
+ HOST_S_CET = 0x00006c18,
+ HOST_SSP = 0x00006c1a,
+ HOST_INTR_SSP_TABLE = 0x00006c1c
};
-#define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000
-
-#define EXIT_REASON_EXCEPTION_NMI 0
-#define EXIT_REASON_EXTERNAL_INTERRUPT 1
-#define EXIT_REASON_TRIPLE_FAULT 2
-
-#define EXIT_REASON_PENDING_INTERRUPT 7
-#define EXIT_REASON_NMI_WINDOW 8
-#define EXIT_REASON_TASK_SWITCH 9
-#define EXIT_REASON_CPUID 10
-#define EXIT_REASON_HLT 12
-#define EXIT_REASON_INVLPG 14
-#define EXIT_REASON_RDPMC 15
-#define EXIT_REASON_RDTSC 16
-#define EXIT_REASON_VMCALL 18
-#define EXIT_REASON_VMCLEAR 19
-#define EXIT_REASON_VMLAUNCH 20
-#define EXIT_REASON_VMPTRLD 21
-#define EXIT_REASON_VMPTRST 22
-#define EXIT_REASON_VMREAD 23
-#define EXIT_REASON_VMRESUME 24
-#define EXIT_REASON_VMWRITE 25
-#define EXIT_REASON_VMOFF 26
-#define EXIT_REASON_VMON 27
-#define EXIT_REASON_CR_ACCESS 28
-#define EXIT_REASON_DR_ACCESS 29
-#define EXIT_REASON_IO_INSTRUCTION 30
-#define EXIT_REASON_MSR_READ 31
-#define EXIT_REASON_MSR_WRITE 32
-#define EXIT_REASON_MWAIT_INSTRUCTION 36
-#define EXIT_REASON_MCE_DURING_VMENTRY 41
-#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
-#define EXIT_REASON_APIC_ACCESS 44
-#define EXIT_REASON_EPT_VIOLATION 48
-#define EXIT_REASON_EPT_MISCONFIG 49
-#define EXIT_REASON_WBINVD 54
-
/*
* Interruption-information format
*/
@@ -269,17 +407,27 @@ enum vmcs_field {
#define VECTORING_INFO_DELIVER_CODE_MASK INTR_INFO_DELIVER_CODE_MASK
#define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK
-#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */
-#define INTR_TYPE_NMI_INTR (2 << 8) /* NMI */
-#define INTR_TYPE_HARD_EXCEPTION (3 << 8) /* processor exception */
-#define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */
-#define INTR_TYPE_SOFT_EXCEPTION (6 << 8) /* software exception */
+#define INTR_TYPE_EXT_INTR (EVENT_TYPE_EXTINT << 8) /* external interrupt */
+#define INTR_TYPE_RESERVED (EVENT_TYPE_RESERVED << 8) /* reserved */
+#define INTR_TYPE_NMI_INTR (EVENT_TYPE_NMI << 8) /* NMI */
+#define INTR_TYPE_HARD_EXCEPTION (EVENT_TYPE_HWEXC << 8) /* processor exception */
+#define INTR_TYPE_SOFT_INTR (EVENT_TYPE_SWINT << 8) /* software interrupt */
+#define INTR_TYPE_PRIV_SW_EXCEPTION (EVENT_TYPE_PRIV_SWEXC << 8) /* ICE breakpoint */
+#define INTR_TYPE_SOFT_EXCEPTION (EVENT_TYPE_SWEXC << 8) /* software exception */
+#define INTR_TYPE_OTHER_EVENT (EVENT_TYPE_OTHER << 8) /* other event */
/* GUEST_INTERRUPTIBILITY_INFO flags. */
#define GUEST_INTR_STATE_STI 0x00000001
#define GUEST_INTR_STATE_MOV_SS 0x00000002
#define GUEST_INTR_STATE_SMI 0x00000004
#define GUEST_INTR_STATE_NMI 0x00000008
+#define GUEST_INTR_STATE_ENCLAVE_INTR 0x00000010
+
+/* GUEST_ACTIVITY_STATE flags */
+#define GUEST_ACTIVITY_ACTIVE 0
+#define GUEST_ACTIVITY_HLT 1
+#define GUEST_ACTIVITY_SHUTDOWN 2
+#define GUEST_ACTIVITY_WAIT_SIPI 3
/*
* Exit Qualifications for MOV for Control Register Access
@@ -316,69 +464,218 @@ enum vmcs_field {
#define DEBUG_REG_ACCESS_REG(eq) (((eq) >> 8) & 0xf) /* 11:8, general purpose reg. */
-/* segment AR */
-#define SEGMENT_AR_L_MASK (1 << 13)
+/*
+ * Exit Qualifications for APIC-Access
+ */
+#define APIC_ACCESS_OFFSET 0xfff /* 11:0, offset within the APIC page */
+#define APIC_ACCESS_TYPE 0xf000 /* 15:12, access type */
+#define TYPE_LINEAR_APIC_INST_READ (0 << 12)
+#define TYPE_LINEAR_APIC_INST_WRITE (1 << 12)
+#define TYPE_LINEAR_APIC_INST_FETCH (2 << 12)
+#define TYPE_LINEAR_APIC_EVENT (3 << 12)
+#define TYPE_PHYSICAL_APIC_EVENT (10 << 12)
+#define TYPE_PHYSICAL_APIC_INST (15 << 12)
+
+/* segment AR in VMCS -- these are different from what LAR reports */
+#define VMX_SEGMENT_AR_L_MASK (1 << 13)
-#define AR_TYPE_ACCESSES_MASK 1
-#define AR_TYPE_READABLE_MASK (1 << 1)
-#define AR_TYPE_WRITEABLE_MASK (1 << 2)
-#define AR_TYPE_CODE_MASK (1 << 3)
-#define AR_TYPE_MASK 0x0f
-#define AR_TYPE_BUSY_64_TSS 11
-#define AR_TYPE_BUSY_32_TSS 11
-#define AR_TYPE_BUSY_16_TSS 3
-#define AR_TYPE_LDT 2
+#define VMX_AR_TYPE_ACCESSES_MASK 1
+#define VMX_AR_TYPE_READABLE_MASK (1 << 1)
+#define VMX_AR_TYPE_WRITEABLE_MASK (1 << 2)
+#define VMX_AR_TYPE_CODE_MASK (1 << 3)
+#define VMX_AR_TYPE_MASK 0x0f
+#define VMX_AR_TYPE_BUSY_64_TSS 11
+#define VMX_AR_TYPE_BUSY_32_TSS 11
+#define VMX_AR_TYPE_BUSY_16_TSS 3
+#define VMX_AR_TYPE_LDT 2
-#define AR_UNUSABLE_MASK (1 << 16)
-#define AR_S_MASK (1 << 4)
-#define AR_P_MASK (1 << 7)
-#define AR_L_MASK (1 << 13)
-#define AR_DB_MASK (1 << 14)
-#define AR_G_MASK (1 << 15)
-#define AR_DPL_SHIFT 5
-#define AR_DPL(ar) (((ar) >> AR_DPL_SHIFT) & 3)
+#define VMX_AR_UNUSABLE_MASK (1 << 16)
+#define VMX_AR_S_MASK (1 << 4)
+#define VMX_AR_P_MASK (1 << 7)
+#define VMX_AR_L_MASK (1 << 13)
+#define VMX_AR_DB_MASK (1 << 14)
+#define VMX_AR_G_MASK (1 << 15)
+#define VMX_AR_DPL_SHIFT 5
+#define VMX_AR_DPL(ar) (((ar) >> VMX_AR_DPL_SHIFT) & 3)
-#define AR_RESERVD_MASK 0xfffe0f00
+#define VMX_AR_RESERVD_MASK 0xfffe0f00
-#define TSS_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 0)
-#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 1)
-#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 2)
+#define TSS_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 0)
+#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 1)
+#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 2)
#define VMX_NR_VPIDS (1 << 16)
+#define VMX_VPID_EXTENT_INDIVIDUAL_ADDR 0
#define VMX_VPID_EXTENT_SINGLE_CONTEXT 1
#define VMX_VPID_EXTENT_ALL_CONTEXT 2
+#define VMX_VPID_EXTENT_SINGLE_NON_GLOBAL 3
-#define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0
#define VMX_EPT_EXTENT_CONTEXT 1
#define VMX_EPT_EXTENT_GLOBAL 2
-#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24)
+#define VMX_EPT_EXTENT_SHIFT 24
+
+#define VMX_EPT_EXECUTE_ONLY_BIT (1ull)
+#define VMX_EPT_PAGE_WALK_4_BIT (1ull << 6)
+#define VMX_EPT_PAGE_WALK_5_BIT (1ull << 7)
+#define VMX_EPTP_UC_BIT (1ull << 8)
+#define VMX_EPTP_WB_BIT (1ull << 14)
+#define VMX_EPT_2MB_PAGE_BIT (1ull << 16)
+#define VMX_EPT_1GB_PAGE_BIT (1ull << 17)
+#define VMX_EPT_INVEPT_BIT (1ull << 20)
+#define VMX_EPT_AD_BIT (1ull << 21)
#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
-#define VMX_EPT_DEFAULT_GAW 3
-#define VMX_EPT_MAX_GAW 0x4
+
+#define VMX_VPID_INVVPID_BIT (1ull << 0) /* (32 - 32) */
+#define VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT (1ull << 8) /* (40 - 32) */
+#define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */
+#define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */
+#define VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT (1ull << 11) /* (43 - 32) */
+
#define VMX_EPT_MT_EPTE_SHIFT 3
-#define VMX_EPT_GAW_EPTP_SHIFT 3
-#define VMX_EPT_DEFAULT_MT 0x6ull
+#define VMX_EPTP_PWL_MASK 0x38ull
+#define VMX_EPTP_PWL_4 0x18ull
+#define VMX_EPTP_PWL_5 0x20ull
+#define VMX_EPTP_AD_ENABLE_BIT (1ull << 6)
+/* The EPTP memtype is encoded in bits 2:0, i.e. doesn't need to be shifted. */
+#define VMX_EPTP_MT_MASK 0x7ull
+#define VMX_EPTP_MT_WB X86_MEMTYPE_WB
+#define VMX_EPTP_MT_UC X86_MEMTYPE_UC
#define VMX_EPT_READABLE_MASK 0x1ull
#define VMX_EPT_WRITABLE_MASK 0x2ull
#define VMX_EPT_EXECUTABLE_MASK 0x4ull
-#define VMX_EPT_IGMT_BIT (1ull << 6)
+#define VMX_EPT_IPAT_BIT (1ull << 6)
+#define VMX_EPT_ACCESS_BIT (1ull << 8)
+#define VMX_EPT_DIRTY_BIT (1ull << 9)
+#define VMX_EPT_SUPPRESS_VE_BIT (1ull << 63)
+#define VMX_EPT_RWX_MASK (VMX_EPT_READABLE_MASK | \
+ VMX_EPT_WRITABLE_MASK | \
+ VMX_EPT_EXECUTABLE_MASK)
+#define VMX_EPT_MT_MASK (7ull << VMX_EPT_MT_EPTE_SHIFT)
+
+static inline u8 vmx_eptp_page_walk_level(u64 eptp)
+{
+ u64 encoded_level = eptp & VMX_EPTP_PWL_MASK;
+
+ if (encoded_level == VMX_EPTP_PWL_5)
+ return 5;
+
+ /* @eptp must be pre-validated by the caller. */
+ WARN_ON_ONCE(encoded_level != VMX_EPTP_PWL_4);
+ return 4;
+}
+
+/* The mask to use to trigger an EPT Misconfiguration in order to track MMIO */
+#define VMX_EPT_MISCONFIG_WX_VALUE (VMX_EPT_WRITABLE_MASK | \
+ VMX_EPT_EXECUTABLE_MASK)
#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul
+struct vmx_msr_entry {
+ u32 index;
+ u32 reserved;
+ u64 value;
+} __aligned(16);
+
+/*
+ * Exit Qualifications for entry failure during or after loading guest state
+ */
+enum vm_entry_failure_code {
+ ENTRY_FAIL_DEFAULT = 0,
+ ENTRY_FAIL_PDPTE = 2,
+ ENTRY_FAIL_NMI = 3,
+ ENTRY_FAIL_VMCS_LINK_PTR = 4,
+};
+
+/*
+ * Exit Qualifications for EPT Violations
+ */
+#define EPT_VIOLATION_ACC_READ BIT(0)
+#define EPT_VIOLATION_ACC_WRITE BIT(1)
+#define EPT_VIOLATION_ACC_INSTR BIT(2)
+#define EPT_VIOLATION_PROT_READ BIT(3)
+#define EPT_VIOLATION_PROT_WRITE BIT(4)
+#define EPT_VIOLATION_PROT_EXEC BIT(5)
+#define EPT_VIOLATION_EXEC_FOR_RING3_LIN BIT(6)
+#define EPT_VIOLATION_PROT_MASK (EPT_VIOLATION_PROT_READ | \
+ EPT_VIOLATION_PROT_WRITE | \
+ EPT_VIOLATION_PROT_EXEC)
+#define EPT_VIOLATION_GVA_IS_VALID BIT(7)
+#define EPT_VIOLATION_GVA_TRANSLATED BIT(8)
+
+#define EPT_VIOLATION_RWX_TO_PROT(__epte) (((__epte) & VMX_EPT_RWX_MASK) << 3)
+
+static_assert(EPT_VIOLATION_RWX_TO_PROT(VMX_EPT_RWX_MASK) ==
+ (EPT_VIOLATION_PROT_READ | EPT_VIOLATION_PROT_WRITE | EPT_VIOLATION_PROT_EXEC));
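
A hedged illustration (helper names invented here) of how these exit-qualification bits are typically consumed by an EPT-violation handler:

/* Hypothetical sketch: derive the access type from an EPT-violation qualification. */
static inline unsigned int example_ept_violation_access(u64 exit_qual)
{
	if (exit_qual & EPT_VIOLATION_ACC_INSTR)
		return 2;			/* instruction fetch */
	if (exit_qual & EPT_VIOLATION_ACC_WRITE)
		return 1;			/* write */
	return 0;				/* read */
}

/*
 * Hypothetical check: true if the GPA was mapped but the access exceeded the
 * permissions reported in bits 5:3 of the qualification.
 */
static inline bool example_ept_violation_is_permission_fault(u64 exit_qual)
{
	return exit_qual & EPT_VIOLATION_PROT_MASK;
}
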
+
+/*
+ * Exit Qualifications for NOTIFY VM EXIT
+ */
+#define NOTIFY_VM_CONTEXT_INVALID BIT(0)
+
+/*
+ * VM-instruction error numbers
+ */
+enum vm_instruction_error_number {
+ VMXERR_VMCALL_IN_VMX_ROOT_OPERATION = 1,
+ VMXERR_VMCLEAR_INVALID_ADDRESS = 2,
+ VMXERR_VMCLEAR_VMXON_POINTER = 3,
+ VMXERR_VMLAUNCH_NONCLEAR_VMCS = 4,
+ VMXERR_VMRESUME_NONLAUNCHED_VMCS = 5,
+ VMXERR_VMRESUME_AFTER_VMXOFF = 6,
+ VMXERR_ENTRY_INVALID_CONTROL_FIELD = 7,
+ VMXERR_ENTRY_INVALID_HOST_STATE_FIELD = 8,
+ VMXERR_VMPTRLD_INVALID_ADDRESS = 9,
+ VMXERR_VMPTRLD_VMXON_POINTER = 10,
+ VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID = 11,
+ VMXERR_UNSUPPORTED_VMCS_COMPONENT = 12,
+ VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT = 13,
+ VMXERR_VMXON_IN_VMX_ROOT_OPERATION = 15,
+ VMXERR_ENTRY_INVALID_EXECUTIVE_VMCS_POINTER = 16,
+ VMXERR_ENTRY_NONLAUNCHED_EXECUTIVE_VMCS = 17,
+ VMXERR_ENTRY_EXECUTIVE_VMCS_POINTER_NOT_VMXON_POINTER = 18,
+ VMXERR_VMCALL_NONCLEAR_VMCS = 19,
+ VMXERR_VMCALL_INVALID_VM_EXIT_CONTROL_FIELDS = 20,
+ VMXERR_VMCALL_INCORRECT_MSEG_REVISION_ID = 22,
+ VMXERR_VMXOFF_UNDER_DUAL_MONITOR_TREATMENT_OF_SMIS_AND_SMM = 23,
+ VMXERR_VMCALL_INVALID_SMM_MONITOR_FEATURES = 24,
+ VMXERR_ENTRY_INVALID_VM_EXECUTION_CONTROL_FIELDS_IN_EXECUTIVE_VMCS = 25,
+ VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS = 26,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID = 28,
+};
+
+/*
+ * VM-instruction errors that can be encountered on VM-Enter, used to trace
+ * nested VM-Enter failures reported by hardware. Errors unique to VM-Enter
+ * from a SMI Transfer Monitor are not included as things have gone seriously
+ * sideways if we get one of those...
+ */
+#define VMX_VMENTER_INSTRUCTION_ERRORS \
+ { VMXERR_VMLAUNCH_NONCLEAR_VMCS, "VMLAUNCH_NONCLEAR_VMCS" }, \
+ { VMXERR_VMRESUME_NONLAUNCHED_VMCS, "VMRESUME_NONLAUNCHED_VMCS" }, \
+ { VMXERR_VMRESUME_AFTER_VMXOFF, "VMRESUME_AFTER_VMXOFF" }, \
+ { VMXERR_ENTRY_INVALID_CONTROL_FIELD, "VMENTRY_INVALID_CONTROL_FIELD" }, \
+ { VMXERR_ENTRY_INVALID_HOST_STATE_FIELD, "VMENTRY_INVALID_HOST_STATE_FIELD" }, \
+ { VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS, "VMENTRY_EVENTS_BLOCKED_BY_MOV_SS" }
-#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30"
-#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2"
-#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3"
-#define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30"
-#define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0"
-#define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0"
-#define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4"
-#define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4"
-#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30"
-#define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08"
-#define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08"
+enum vmx_l1d_flush_state {
+ VMENTER_L1D_FLUSH_AUTO,
+ VMENTER_L1D_FLUSH_NEVER,
+ VMENTER_L1D_FLUSH_COND,
+ VMENTER_L1D_FLUSH_ALWAYS,
+ VMENTER_L1D_FLUSH_EPT_DISABLED,
+ VMENTER_L1D_FLUSH_NOT_REQUIRED,
+};
+extern enum vmx_l1d_flush_state l1tf_vmx_mitigation;
+struct vmx_ve_information {
+ u32 exit_reason;
+ u32 delivery;
+ u64 exit_qualification;
+ u64 guest_linear_address;
+ u64 guest_physical_address;
+ u16 eptp_index;
+};
#endif
diff --git a/arch/x86/include/asm/vmxfeatures.h b/arch/x86/include/asm/vmxfeatures.h
new file mode 100644
index 000000000000..09b1d7e607c1
--- /dev/null
+++ b/arch/x86/include/asm/vmxfeatures.h
@@ -0,0 +1,93 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_VMXFEATURES_H
+#define _ASM_X86_VMXFEATURES_H
+
+/*
+ * Defines VMX CPU feature bits
+ */
+#define NVMXINTS 5 /* N 32-bit words worth of info */
+
+/*
+ * Note: If the comment begins with a quoted string, that string is used
+ * in /proc/cpuinfo instead of the macro name. Otherwise, this feature bit
+ * is not displayed in /proc/cpuinfo at all.
+ */
+
+/* Pin-Based VM-Execution Controls, EPT/VPID, APIC and VM-Functions, word 0 */
+#define VMX_FEATURE_INTR_EXITING ( 0*32+ 0) /* VM-Exit on vectored interrupts */
+#define VMX_FEATURE_NMI_EXITING ( 0*32+ 3) /* VM-Exit on NMIs */
+#define VMX_FEATURE_VIRTUAL_NMIS ( 0*32+ 5) /* "vnmi" NMI virtualization */
+#define VMX_FEATURE_PREEMPTION_TIMER ( 0*32+ 6) /* "preemption_timer" VMX Preemption Timer */
+#define VMX_FEATURE_POSTED_INTR ( 0*32+ 7) /* "posted_intr" Posted Interrupts */
+
+/* EPT/VPID features, scattered to bits 16-23 */
+#define VMX_FEATURE_INVVPID ( 0*32+ 16) /* "invvpid" INVVPID is supported */
+#define VMX_FEATURE_EPT_EXECUTE_ONLY ( 0*32+ 17) /* "ept_x_only" EPT entries can be execute only */
+#define VMX_FEATURE_EPT_AD ( 0*32+ 18) /* "ept_ad" EPT Accessed/Dirty bits */
+#define VMX_FEATURE_EPT_1GB ( 0*32+ 19) /* "ept_1gb" 1GB EPT pages */
+#define VMX_FEATURE_EPT_5LEVEL ( 0*32+ 20) /* "ept_5level" 5-level EPT paging */
+
+/* Aggregated APIC features 24-27 */
+#define VMX_FEATURE_FLEXPRIORITY ( 0*32+ 24) /* "flexpriority" TPR shadow + virt APIC */
+#define VMX_FEATURE_APICV ( 0*32+ 25) /* "apicv" TPR shadow + APIC reg virt + virt intr delivery + posted interrupts */
+
+/* VM-Functions, shifted to bits 28-31 */
+#define VMX_FEATURE_EPTP_SWITCHING ( 0*32+ 28) /* "eptp_switching" EPTP switching (in guest) */
+
+/* Primary Processor-Based VM-Execution Controls, word 1 */
+#define VMX_FEATURE_INTR_WINDOW_EXITING ( 1*32+ 2) /* VM-Exit if INTRs are unblocked in guest */
+#define VMX_FEATURE_USE_TSC_OFFSETTING ( 1*32+ 3) /* "tsc_offset" Offset hardware TSC when read in guest */
+#define VMX_FEATURE_HLT_EXITING ( 1*32+ 7) /* VM-Exit on HLT */
+#define VMX_FEATURE_INVLPG_EXITING ( 1*32+ 9) /* VM-Exit on INVLPG */
+#define VMX_FEATURE_MWAIT_EXITING ( 1*32+ 10) /* VM-Exit on MWAIT */
+#define VMX_FEATURE_RDPMC_EXITING ( 1*32+ 11) /* VM-Exit on RDPMC */
+#define VMX_FEATURE_RDTSC_EXITING ( 1*32+ 12) /* VM-Exit on RDTSC */
+#define VMX_FEATURE_CR3_LOAD_EXITING ( 1*32+ 15) /* VM-Exit on writes to CR3 */
+#define VMX_FEATURE_CR3_STORE_EXITING ( 1*32+ 16) /* VM-Exit on reads from CR3 */
+#define VMX_FEATURE_TERTIARY_CONTROLS ( 1*32+ 17) /* Enable Tertiary VM-Execution Controls */
+#define VMX_FEATURE_CR8_LOAD_EXITING ( 1*32+ 19) /* VM-Exit on writes to CR8 */
+#define VMX_FEATURE_CR8_STORE_EXITING ( 1*32+ 20) /* VM-Exit on reads from CR8 */
+#define VMX_FEATURE_VIRTUAL_TPR ( 1*32+ 21) /* "vtpr" TPR virtualization, a.k.a. TPR shadow */
+#define VMX_FEATURE_NMI_WINDOW_EXITING ( 1*32+ 22) /* VM-Exit if NMIs are unblocked in guest */
+#define VMX_FEATURE_MOV_DR_EXITING ( 1*32+ 23) /* VM-Exit on accesses to debug registers */
+#define VMX_FEATURE_UNCOND_IO_EXITING ( 1*32+ 24) /* VM-Exit on *all* IN{S} and OUT{S} */

+#define VMX_FEATURE_USE_IO_BITMAPS ( 1*32+ 25) /* VM-Exit based on I/O port */
+#define VMX_FEATURE_MONITOR_TRAP_FLAG ( 1*32+ 27) /* "mtf" VMX single-step VM-Exits */
+#define VMX_FEATURE_USE_MSR_BITMAPS ( 1*32+ 28) /* VM-Exit based on MSR index */
+#define VMX_FEATURE_MONITOR_EXITING ( 1*32+ 29) /* VM-Exit on MONITOR (MWAIT's accomplice) */
+#define VMX_FEATURE_PAUSE_EXITING ( 1*32+ 30) /* VM-Exit on PAUSE (unconditionally) */
+#define VMX_FEATURE_SEC_CONTROLS ( 1*32+ 31) /* Enable Secondary VM-Execution Controls */
+
+/* Secondary Processor-Based VM-Execution Controls, word 2 */
+#define VMX_FEATURE_VIRT_APIC_ACCESSES ( 2*32+ 0) /* "vapic" Virtualize memory mapped APIC accesses */
+#define VMX_FEATURE_EPT ( 2*32+ 1) /* "ept" Extended Page Tables, a.k.a. Two-Dimensional Paging */
+#define VMX_FEATURE_DESC_EXITING ( 2*32+ 2) /* VM-Exit on {S,L}*DT instructions */
+#define VMX_FEATURE_RDTSCP ( 2*32+ 3) /* Enable RDTSCP in guest */
+#define VMX_FEATURE_VIRTUAL_X2APIC ( 2*32+ 4) /* Virtualize X2APIC for the guest */
+#define VMX_FEATURE_VPID ( 2*32+ 5) /* "vpid" Virtual Processor ID (TLB ASID modifier) */
+#define VMX_FEATURE_WBINVD_EXITING ( 2*32+ 6) /* VM-Exit on WBINVD */
+#define VMX_FEATURE_UNRESTRICTED_GUEST ( 2*32+ 7) /* "unrestricted_guest" Allow Big Real Mode and other "invalid" states */
+#define VMX_FEATURE_APIC_REGISTER_VIRT ( 2*32+ 8) /* "vapic_reg" Hardware emulation of reads to the virtual-APIC */
+#define VMX_FEATURE_VIRT_INTR_DELIVERY ( 2*32+ 9) /* "vid" Evaluation and delivery of pending virtual interrupts */
+#define VMX_FEATURE_PAUSE_LOOP_EXITING ( 2*32+ 10) /* "ple" Conditionally VM-Exit on PAUSE at CPL0 */
+#define VMX_FEATURE_RDRAND_EXITING ( 2*32+ 11) /* VM-Exit on RDRAND */
+#define VMX_FEATURE_INVPCID ( 2*32+ 12) /* Enable INVPCID in guest */
+#define VMX_FEATURE_VMFUNC ( 2*32+ 13) /* Enable VM-Functions (leaf dependent) */
+#define VMX_FEATURE_SHADOW_VMCS ( 2*32+ 14) /* "shadow_vmcs" VMREAD/VMWRITE in guest can access shadow VMCS */
+#define VMX_FEATURE_ENCLS_EXITING ( 2*32+ 15) /* VM-Exit on ENCLS (leaf dependent) */
+#define VMX_FEATURE_RDSEED_EXITING ( 2*32+ 16) /* VM-Exit on RDSEED */
+#define VMX_FEATURE_PAGE_MOD_LOGGING ( 2*32+ 17) /* "pml" Log dirty pages into buffer */
+#define VMX_FEATURE_EPT_VIOLATION_VE ( 2*32+ 18) /* "ept_violation_ve" Conditionally reflect EPT violations as #VE exceptions */
+#define VMX_FEATURE_PT_CONCEAL_VMX ( 2*32+ 19) /* Suppress VMX indicators in Processor Trace */
+#define VMX_FEATURE_XSAVES ( 2*32+ 20) /* Enable XSAVES and XRSTORS in guest */
+#define VMX_FEATURE_MODE_BASED_EPT_EXEC ( 2*32+ 22) /* "ept_mode_based_exec" Enable separate EPT EXEC bits for supervisor vs. user */
+#define VMX_FEATURE_PT_USE_GPA ( 2*32+ 24) /* Processor Trace logs GPAs */
+#define VMX_FEATURE_TSC_SCALING ( 2*32+ 25) /* "tsc_scaling" Scale hardware TSC when read in guest */
+#define VMX_FEATURE_USR_WAIT_PAUSE ( 2*32+ 26) /* "usr_wait_pause" Enable TPAUSE, UMONITOR, UMWAIT in guest */
+#define VMX_FEATURE_ENCLV_EXITING ( 2*32+ 28) /* VM-Exit on ENCLV (leaf dependent) */
+#define VMX_FEATURE_BUS_LOCK_DETECTION ( 2*32+ 30) /* VM-Exit when bus lock caused */
+#define VMX_FEATURE_NOTIFY_VM_EXITING ( 2*32+ 31) /* "notify_vm_exiting" VM-Exit when no event windows after notify window */
+
+/* Tertiary Processor-Based VM-Execution Controls, word 3 */
+#define VMX_FEATURE_IPI_VIRT ( 3*32+ 4) /* "ipi_virt" Enable IPI virtualization */
+#endif /* _ASM_X86_VMXFEATURES_H */
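
The VMX_FEATURE_* numbers above encode a 32-bit word index plus a bit position, in the same style as the cpufeatures bits. A minimal, hypothetical lookup against an array of NVMXINTS capability words (the function is illustrative and assumes the kernel's BIT() helper) might look like:

/* Hypothetical sketch: test a VMX_FEATURE_* bit against per-CPU VMX capability words. */
static inline bool example_vmx_cap_has(const u32 *vmx_capability, unsigned int feature)
{
	unsigned int word = feature / 32;	/* e.g. VMX_FEATURE_EPT -> word 2 */
	unsigned int bit  = feature % 32;	/* e.g. VMX_FEATURE_EPT -> bit 1  */

	return word < NVMXINTS && (vmx_capability[word] & BIT(bit));
}
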
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
index d0983d255fbd..472f0263dbc6 100644
--- a/arch/x86/include/asm/vsyscall.h
+++ b/arch/x86/include/asm/vsyscall.h
@@ -1,44 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_VSYSCALL_H
#define _ASM_X86_VSYSCALL_H
-enum vsyscall_num {
- __NR_vgettimeofday,
- __NR_vtime,
- __NR_vgetcpu,
-};
-
-#define VSYSCALL_START (-10UL << 20)
-#define VSYSCALL_SIZE 1024
-#define VSYSCALL_END (-2UL << 20)
-#define VSYSCALL_MAPPED_PAGES 1
-#define VSYSCALL_ADDR(vsyscall_nr) (VSYSCALL_START+VSYSCALL_SIZE*(vsyscall_nr))
-
-#ifdef __KERNEL__
#include <linux/seqlock.h>
+#include <uapi/asm/vsyscall.h>
+#include <asm/page_types.h>
-#define __section_vgetcpu_mode __attribute__ ((unused, __section__ (".vgetcpu_mode"), aligned(16)))
-#define __section_jiffies __attribute__ ((unused, __section__ (".jiffies"), aligned(16)))
-
-/* Definitions for CONFIG_GENERIC_TIME definitions */
-#define __section_vsyscall_gtod_data __attribute__ \
- ((unused, __section__ (".vsyscall_gtod_data"),aligned(16)))
-#define __section_vsyscall_clock __attribute__ \
- ((unused, __section__ (".vsyscall_clock"),aligned(16)))
-#define __vsyscall_fn \
- __attribute__ ((unused, __section__(".vsyscall_fn"))) notrace
-
-#define VGETCPU_RDTSCP 1
-#define VGETCPU_LSL 2
-
-extern int __vgetcpu_mode;
-extern volatile unsigned long __jiffies;
-
-/* kernel space (writeable) */
-extern int vgetcpu_mode;
-extern struct timezone sys_tz;
-
+#ifdef CONFIG_X86_VSYSCALL_EMULATION
extern void map_vsyscall(void);
-
-#endif /* __KERNEL__ */
+extern void set_vsyscall_pgtable_user_bits(pgd_t *root);
+
+/*
+ * Called on instruction fetch fault in vsyscall page.
+ * Returns true if handled.
+ */
+extern bool emulate_vsyscall(unsigned long error_code,
+ struct pt_regs *regs, unsigned long address);
+#else
+static inline void map_vsyscall(void) {}
+static inline bool emulate_vsyscall(unsigned long error_code,
+ struct pt_regs *regs, unsigned long address)
+{
+ return false;
+}
+#endif
+
+/*
+ * The (legacy) vsyscall page is the lone page in the kernel portion
+ * of the address space that has user-accessible permissions.
+ */
+static inline bool is_vsyscall_vaddr(unsigned long vaddr)
+{
+ return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR);
+}
#endif /* _ASM_X86_VSYSCALL_H */
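
A hedged sketch of how a page-fault path would use the two helpers above (the caller name is illustrative, not from this patch):

/* Hypothetical caller: try vsyscall emulation before treating the fault as fatal. */
static bool example_try_vsyscall_fault(struct pt_regs *regs, unsigned long error_code,
				       unsigned long address)
{
	if (!is_vsyscall_vaddr(address))
		return false;

	/* Returns true if the faulting access was emulated successfully. */
	return emulate_vsyscall(error_code, regs, address);
}
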
diff --git a/arch/x86/include/asm/word-at-a-time.h b/arch/x86/include/asm/word-at-a-time.h
new file mode 100644
index 000000000000..422a47746657
--- /dev/null
+++ b/arch/x86/include/asm/word-at-a-time.h
@@ -0,0 +1,84 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_WORD_AT_A_TIME_H
+#define _ASM_WORD_AT_A_TIME_H
+
+#include <linux/bitops.h>
+#include <linux/wordpart.h>
+
+struct word_at_a_time {
+ const unsigned long one_bits, high_bits;
+};
+
+#define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x01), REPEAT_BYTE(0x80) }
+
+/* Return nonzero if it has a zero */
+static inline unsigned long has_zero(unsigned long a, unsigned long *bits, const struct word_at_a_time *c)
+{
+ unsigned long mask = ((a - c->one_bits) & ~a) & c->high_bits;
+ *bits = mask;
+ return mask;
+}
+
+static inline unsigned long prep_zero_mask(unsigned long a, unsigned long bits, const struct word_at_a_time *c)
+{
+ return bits;
+}
+
+#ifdef CONFIG_64BIT
+
+/* Keep the initial has_zero() value for both bitmask and size calc */
+#define create_zero_mask(bits) (bits)
+
+static inline unsigned long zero_bytemask(unsigned long bits)
+{
+ bits = (bits - 1) & ~bits;
+ return bits >> 7;
+}
+
+#define find_zero(bits) (__ffs(bits) >> 3)
+
+#else
+
+/* Create the final mask for both bytemask and size */
+static inline unsigned long create_zero_mask(unsigned long bits)
+{
+ bits = (bits - 1) & ~bits;
+ return bits >> 7;
+}
+
+/* The mask we created is directly usable as a bytemask */
+#define zero_bytemask(mask) (mask)
+
+/* Carl Chatfield / Jan Achrenius G+ version for 32-bit */
+static inline unsigned long find_zero(unsigned long mask)
+{
+ /* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */
+ long a = (0x0ff0001+mask) >> 23;
+ /* Fix the 1 for 00 case */
+ return a & mask;
+}
+
+#endif
+
+/*
+ * Load an unaligned word from kernel space.
+ *
+ * In the (very unlikely) case of the word being a page-crosser
+ * and the next page not being mapped, take the exception and
+ * return zeroes in the non-existing part.
+ */
+static inline unsigned long load_unaligned_zeropad(const void *addr)
+{
+ unsigned long ret;
+
+ asm volatile(
+ "1: mov %[mem], %[ret]\n"
+ "2:\n"
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_ZEROPAD)
+ : [ret] "=r" (ret)
+ : [mem] "m" (*(unsigned long *)addr));
+
+ return ret;
+}
+
+#endif /* _ASM_WORD_AT_A_TIME_H */
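
To show how the pieces fit together, here is a minimal sketch of a caller (not part of this patch; real users typically align the starting pointer first) that scans for a terminating NUL one word at a time:

/* Hypothetical sketch: word-at-a-time string length using the helpers above. */
static inline size_t example_wordwise_strlen(const char *s)
{
	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
	size_t len = 0;
	unsigned long data, mask;

	for (;;) {
		/* Safe even if the word crosses into an unmapped page. */
		data = load_unaligned_zeropad(s + len);
		if (has_zero(data, &mask, &constants)) {
			mask = prep_zero_mask(data, mask, &constants);
			mask = create_zero_mask(mask);
			return len + find_zero(mask);
		}
		len += sizeof(unsigned long);
	}
}
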
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
new file mode 100644
index 000000000000..6c8a6ead84f6
--- /dev/null
+++ b/arch/x86/include/asm/x86_init.h
@@ -0,0 +1,355 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PLATFORM_H
+#define _ASM_X86_PLATFORM_H
+
+struct ghcb;
+struct mpc_bus;
+struct mpc_cpu;
+struct pt_regs;
+struct mpc_table;
+struct cpuinfo_x86;
+struct irq_domain;
+
+/**
+ * struct x86_init_mpparse - platform specific mpparse ops
+ * @setup_ioapic_ids: platform specific ioapic id override
+ * @find_mptable: Find MPTABLE early to reserve the memory region
+ * @early_parse_smp_cfg: Parse the SMP configuration data early before initmem_init()
+ * @parse_smp_cfg: Parse the SMP configuration data
+ */
+struct x86_init_mpparse {
+ void (*setup_ioapic_ids)(void);
+ void (*find_mptable)(void);
+ void (*early_parse_smp_cfg)(void);
+ void (*parse_smp_cfg)(void);
+};
+
+/**
+ * struct x86_init_resources - platform specific resource related ops
+ * @probe_roms: probe BIOS roms
+ * @reserve_resources: reserve the standard resources for the
+ * platform
+ * @memory_setup: platform specific memory setup
+ * @dmi_setup: platform specific DMI setup
+ */
+struct x86_init_resources {
+ void (*probe_roms)(void);
+ void (*reserve_resources)(void);
+ char *(*memory_setup)(void);
+ void (*dmi_setup)(void);
+};
+
+/**
+ * struct x86_init_irqs - platform specific interrupt setup
+ * @pre_vector_init: init code to run before interrupt vectors
+ * are set up.
+ * @intr_init: interrupt init code
+ * @intr_mode_select: interrupt delivery mode selection
+ * @intr_mode_init: interrupt delivery mode setup
+ * @create_pci_msi_domain: Create the PCI/MSI interrupt domain
+ */
+struct x86_init_irqs {
+ void (*pre_vector_init)(void);
+ void (*intr_init)(void);
+ void (*intr_mode_select)(void);
+ void (*intr_mode_init)(void);
+ struct irq_domain *(*create_pci_msi_domain)(void);
+};
+
+/**
+ * struct x86_init_oem - oem platform specific customizing functions
+ * @arch_setup: platform specific architecture setup
+ * @banner: print a platform specific banner
+ */
+struct x86_init_oem {
+ void (*arch_setup)(void);
+ void (*banner)(void);
+};
+
+/**
+ * struct x86_init_paging - platform specific paging functions
+ * @pagetable_init: platform specific paging initialization call to setup
+ * the kernel pagetables and prepare accessors functions.
+ * Callback must call paging_init(). Called once after the
+ * direct mapping for phys memory is available.
+ */
+struct x86_init_paging {
+ void (*pagetable_init)(void);
+};
+
+/**
+ * struct x86_init_timers - platform specific timer setup
+ * @setup_percpu_clockev: set up the per cpu clock event device for the
+ * boot cpu
+ * @timer_init: initialize the platform timer (default PIT/HPET)
+ * @wallclock_init: init the wallclock device
+ */
+struct x86_init_timers {
+ void (*setup_percpu_clockev)(void);
+ void (*timer_init)(void);
+ void (*wallclock_init)(void);
+};
+
+/**
+ * struct x86_init_iommu - platform specific iommu setup
+ * @iommu_init: platform specific iommu setup
+ */
+struct x86_init_iommu {
+ int (*iommu_init)(void);
+};
+
+/**
+ * struct x86_init_pci - platform specific pci init functions
+ * @arch_init: platform specific pci arch init call
+ * @init: platform specific pci subsystem init
+ * @init_irq: platform specific pci irq init
+ * @fixup_irqs: platform specific pci irq fixup
+ */
+struct x86_init_pci {
+ int (*arch_init)(void);
+ int (*init)(void);
+ void (*init_irq)(void);
+ void (*fixup_irqs)(void);
+};
+
+/**
+ * struct x86_hyper_init - x86 hypervisor init functions
+ * @init_platform: platform setup
+ * @guest_late_init: guest late init
+ * @x2apic_available: X2APIC detection
+ * @msi_ext_dest_id: MSI supports 15-bit APIC IDs
+ * @init_mem_mapping: setup early mappings during init_mem_mapping()
+ * @init_after_bootmem: guest init after boot allocator is finished
+ */
+struct x86_hyper_init {
+ void (*init_platform)(void);
+ void (*guest_late_init)(void);
+ bool (*x2apic_available)(void);
+ bool (*msi_ext_dest_id)(void);
+ void (*init_mem_mapping)(void);
+ void (*init_after_bootmem)(void);
+};
+
+/**
+ * struct x86_init_acpi - x86 ACPI init functions
+ * @set_root_pointer: set RSDP address
+ * @get_root_pointer: get RSDP address
+ * @reduced_hw_early_init: hardware reduced platform early init
+ */
+struct x86_init_acpi {
+ void (*set_root_pointer)(u64 addr);
+ u64 (*get_root_pointer)(void);
+ void (*reduced_hw_early_init)(void);
+};
+
+/**
+ * struct x86_guest - Functions used by misc guest incarnations like SEV, TDX, etc.
+ *
+ * @enc_status_change_prepare: Notify HV before the encryption status of a range is changed
+ * @enc_status_change_finish: Notify HV after the encryption status of a range is changed
+ * @enc_tlb_flush_required: Returns true if a TLB flush is needed before changing page encryption status
+ * @enc_cache_flush_required: Returns true if a cache flush is needed before changing page encryption status
+ * @enc_kexec_begin: Begin the two-step process of converting shared memory back
+ * to private. It stops new conversions from being started
+ * and waits for in-flight conversions to finish, if possible.
+ * @enc_kexec_finish: Finish the two-step process of converting shared memory to
+ * private. All memory is private after the call when
+ * the function returns.
+ * It is called on only one CPU while the others are shut down
+ * and with interrupts disabled.
+ */
+struct x86_guest {
+ int (*enc_status_change_prepare)(unsigned long vaddr, int npages, bool enc);
+ int (*enc_status_change_finish)(unsigned long vaddr, int npages, bool enc);
+ bool (*enc_tlb_flush_required)(bool enc);
+ bool (*enc_cache_flush_required)(void);
+ void (*enc_kexec_begin)(void);
+ void (*enc_kexec_finish)(void);
+};
+
+/**
+ * struct x86_init_ops - functions for platform specific setup
+ *
+ */
+struct x86_init_ops {
+ struct x86_init_resources resources;
+ struct x86_init_mpparse mpparse;
+ struct x86_init_irqs irqs;
+ struct x86_init_oem oem;
+ struct x86_init_paging paging;
+ struct x86_init_timers timers;
+ struct x86_init_iommu iommu;
+ struct x86_init_pci pci;
+ struct x86_hyper_init hyper;
+ struct x86_init_acpi acpi;
+};
+
+/**
+ * struct x86_cpuinit_ops - platform specific cpu hotplug setups
+ * @setup_percpu_clockev: set up the per cpu clock event device
+ * @early_percpu_clock_init: early init of the per cpu clock event device
+ * @fixup_cpu_id: fixup function for cpuinfo_x86::topo.pkg_id
+ * @parallel_bringup: Parallel bringup control
+ */
+struct x86_cpuinit_ops {
+ void (*setup_percpu_clockev)(void);
+ void (*early_percpu_clock_init)(void);
+ void (*fixup_cpu_id)(struct cpuinfo_x86 *c, int node);
+ bool parallel_bringup;
+};
+
+struct timespec64;
+
+/**
+ * struct x86_legacy_devices - legacy x86 devices
+ *
+ * @pnpbios: this platform can have a PNPBIOS. If this is disabled the platform
+ * is known to never have a PNPBIOS.
+ *
+ * These are devices known to require LPC or ISA bus. The definition of legacy
+ * devices adheres to the ACPI 5.2.9.3 IA-PC Boot Architecture flag
+ * ACPI_FADT_LEGACY_DEVICES. These devices consist of user visible devices on
+ * the LPC or ISA bus. User visible devices are devices that have end-user
+ * accessible connectors (for example, LPT parallel port). Legacy devices on
+ * the LPC bus consist for example of serial and parallel ports, PS/2 keyboard
+ * / mouse, and the floppy disk controller. A system that lacks all known
+ * legacy devices can assume all devices can be detected exclusively via
+ * standard device enumeration mechanisms including the ACPI namespace.
+ *
+ * A system which does not have ACPI_FADT_LEGACY_DEVICES enabled must not
+ * have any of the legacy devices enumerated below present.
+ */
+struct x86_legacy_devices {
+ int pnpbios;
+};
+
+/**
+ * enum x86_legacy_i8042_state - i8042 keyboard controller state
+ * @X86_LEGACY_I8042_PLATFORM_ABSENT: the controller is always absent on
+ * given platform/subarch.
+ * @X86_LEGACY_I8042_FIRMWARE_ABSENT: firmware reports that the controller
+ * is absent.
+ * @X86_LEGACY_I8042_EXPECTED_PRESENT: the controller is likely to be
+ * present, the i8042 driver should probe for controller existence.
+ */
+enum x86_legacy_i8042_state {
+ X86_LEGACY_I8042_PLATFORM_ABSENT,
+ X86_LEGACY_I8042_FIRMWARE_ABSENT,
+ X86_LEGACY_I8042_EXPECTED_PRESENT,
+};
+
+/**
+ * struct x86_legacy_features - legacy x86 features
+ *
+ * @i8042: indicates if we expect the device to have an i8042 controller
+ * present.
+ * @rtc: this device has a CMOS real-time clock present
+ * @warm_reset: 1 if platform allows warm reset, else 0
+ * @no_vga: 1 if (FADT.boot_flags & ACPI_FADT_NO_VGA) is set, else 0
+ * @reserve_bios_regions: boot code will search for the EBDA address and the
+ * start of the 640k - 1M BIOS region. If false, the platform must
+ * ensure that its memory map correctly reserves sub-1MB regions as needed.
+ * @devices: legacy x86 devices, refer to struct x86_legacy_devices
+ * documentation for further details.
+ */
+struct x86_legacy_features {
+ enum x86_legacy_i8042_state i8042;
+ int rtc;
+ int warm_reset;
+ int no_vga;
+ int reserve_bios_regions;
+ struct x86_legacy_devices devices;
+};
+
+/**
+ * struct x86_hyper_runtime - x86 hypervisor specific runtime callbacks
+ *
+ * @pin_vcpu: pin current vcpu to specified physical
+ * cpu (run rarely)
+ * @sev_es_hcall_prepare: Load additional hypervisor-specific
+ * state into the GHCB when doing a VMMCALL under
+ * SEV-ES. Called from the #VC exception handler.
+ * @sev_es_hcall_finish: Copies state from the GHCB back into the
+ * processor (or pt_regs). Also runs checks on the
+ * state returned from the hypervisor after a
+ * VMMCALL under SEV-ES. Needs to return 'false'
+ * if the checks fail. Called from the #VC
+ * exception handler.
+ * @is_private_mmio: For CoCo VMs, must map MMIO address as private.
+ * Used when device is emulated by a paravisor
+ * layer in the VM context.
+ */
+struct x86_hyper_runtime {
+ void (*pin_vcpu)(int cpu);
+ void (*sev_es_hcall_prepare)(struct ghcb *ghcb, struct pt_regs *regs);
+ bool (*sev_es_hcall_finish)(struct ghcb *ghcb, struct pt_regs *regs);
+ bool (*is_private_mmio)(u64 addr);
+};
+
+/**
+ * struct x86_platform_ops - platform specific runtime functions
+ * @calibrate_cpu: calibrate CPU
+ * @calibrate_tsc: calibrate TSC, if different from CPU
+ * @get_wallclock: get time from HW clock like RTC etc.
+ * @set_wallclock: set time back to HW clock
+ * @iommu_shutdown: set by an IOMMU driver for shutdown if necessary
+ * @is_untracked_pat_range: exclude from PAT logic
+ * @nmi_init: enable NMI on cpus
+ * @get_nmi_reason: get the reason an NMI was received
+ * @save_sched_clock_state: save state for sched_clock() on suspend
+ * @restore_sched_clock_state: restore state for sched_clock() on resume
+ * @apic_post_init: adjust apic if needed
+ * @legacy: legacy features
+ * @set_legacy_features: override legacy features. Use of this callback
+ * is highly discouraged. You should only need
+ * this if your hardware platform requires further
+ * custom fine tuning far beyond what may be
+ * possible in x86_early_init_platform_quirks() by
+ * only using the current x86_hardware_subarch
+ * semantics.
+ * @realmode_reserve: reserve memory for realmode trampoline
+ * @realmode_init: initialize realmode trampoline
+ * @hyper: x86 hypervisor specific runtime callbacks
+ * @guest: guest incarnations callbacks
+ */
+struct x86_platform_ops {
+ unsigned long (*calibrate_cpu)(void);
+ unsigned long (*calibrate_tsc)(void);
+ void (*get_wallclock)(struct timespec64 *ts);
+ int (*set_wallclock)(const struct timespec64 *ts);
+ void (*iommu_shutdown)(void);
+ bool (*is_untracked_pat_range)(u64 start, u64 end);
+ void (*nmi_init)(void);
+ unsigned char (*get_nmi_reason)(void);
+ void (*save_sched_clock_state)(void);
+ void (*restore_sched_clock_state)(void);
+ void (*apic_post_init)(void);
+ struct x86_legacy_features legacy;
+ void (*set_legacy_features)(void);
+ void (*realmode_reserve)(void);
+ void (*realmode_init)(void);
+ struct x86_hyper_runtime hyper;
+ struct x86_guest guest;
+};
+
+struct x86_apic_ops {
+ unsigned int (*io_apic_read) (unsigned int apic, unsigned int reg);
+ void (*restore)(void);
+};
+
+extern struct x86_init_ops x86_init;
+extern struct x86_cpuinit_ops x86_cpuinit;
+extern struct x86_platform_ops x86_platform;
+extern struct x86_msi_ops x86_msi;
+extern struct x86_apic_ops x86_apic_ops;
+
+extern void x86_early_init_platform_quirks(void);
+extern void x86_init_noop(void);
+extern void x86_init_uint_noop(unsigned int unused);
+extern bool bool_x86_init_noop(void);
+extern void x86_op_int_noop(int cpu);
+extern bool x86_pnpbios_disabled(void);
+extern int set_rtc_noop(const struct timespec64 *now);
+extern void get_rtc_noop(struct timespec64 *now);
+
+#endif
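
A hedged illustration of how a platform or hypervisor typically hooks these ops during early boot; everything named example_* below is a placeholder, while x86_init_noop() is the stock no-op hook declared above:

/* Hypothetical TSC frequency callback (kHz); the value is made up for illustration. */
static unsigned long __init example_calibrate_tsc(void)
{
	return 1000000;		/* pretend the TSC runs at 1 GHz */
}

/* Hypothetical early-boot setup: override selected x86_init/x86_platform ops. */
static void __init example_platform_setup(void)
{
	x86_init.timers.timer_init    = x86_init_noop;	/* no PIT/HPET on this platform */
	x86_init.irqs.pre_vector_init = x86_init_noop;
	x86_platform.calibrate_tsc    = example_calibrate_tsc;
}
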
diff --git a/arch/x86/include/asm/xcr.h b/arch/x86/include/asm/xcr.h
deleted file mode 100644
index f2cba4e79a23..000000000000
--- a/arch/x86/include/asm/xcr.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* -*- linux-c -*- ------------------------------------------------------- *
- *
- * Copyright 2008 rPath, Inc. - All Rights Reserved
- *
- * This file is part of the Linux kernel, and is made available under
- * the terms of the GNU General Public License version 2 or (at your
- * option) any later version; incorporated herein by reference.
- *
- * ----------------------------------------------------------------------- */
-
-/*
- * asm-x86/xcr.h
- *
- * Definitions for the eXtended Control Register instructions
- */
-
-#ifndef _ASM_X86_XCR_H
-#define _ASM_X86_XCR_H
-
-#define XCR_XFEATURE_ENABLED_MASK 0x00000000
-
-#ifdef __KERNEL__
-# ifndef __ASSEMBLY__
-
-#include <linux/types.h>
-
-static inline u64 xgetbv(u32 index)
-{
- u32 eax, edx;
-
- asm volatile(".byte 0x0f,0x01,0xd0" /* xgetbv */
- : "=a" (eax), "=d" (edx)
- : "c" (index));
- return eax + ((u64)edx << 32);
-}
-
-static inline void xsetbv(u32 index, u64 value)
-{
- u32 eax = value;
- u32 edx = value >> 32;
-
- asm volatile(".byte 0x0f,0x01,0xd1" /* xsetbv */
- : : "a" (eax), "d" (edx), "c" (index));
-}
-
-# endif /* __ASSEMBLY__ */
-#endif /* __KERNEL__ */
-
-#endif /* _ASM_X86_XCR_H */
diff --git a/arch/x86/include/asm/xen/cpuid.h b/arch/x86/include/asm/xen/cpuid.h
new file mode 100644
index 000000000000..a3c29b1496c8
--- /dev/null
+++ b/arch/x86/include/asm/xen/cpuid.h
@@ -0,0 +1,139 @@
+/******************************************************************************
+ * arch-x86/cpuid.h
+ *
+ * CPUID interface to Xen.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2007 Citrix Systems, Inc.
+ *
+ * Authors:
+ * Keir Fraser <keir@xen.org>
+ */
+
+#ifndef __XEN_PUBLIC_ARCH_X86_CPUID_H__
+#define __XEN_PUBLIC_ARCH_X86_CPUID_H__
+
+/*
+ * For compatibility with other hypervisor interfaces, the Xen cpuid leaves
+ * can be found at the first otherwise unused 0x100 aligned boundary starting
+ * from 0x40000000.
+ *
+ * e.g. If viridian extensions are enabled for an HVM domain, the Xen cpuid
+ * leaves will start at 0x40000100
+ */
+
+#define XEN_CPUID_FIRST_LEAF 0x40000000
+#define XEN_CPUID_LEAF(i) (XEN_CPUID_FIRST_LEAF + (i))
+
+/*
+ * Leaf 1 (0x40000x00)
+ * EAX: Largest Xen-information leaf. All leaves up to and including @EAX
+ * are supported by the Xen host.
+ * EBX-EDX: "XenVMMXenVMM" signature, allowing positive identification
+ * of a Xen host.
+ */
+#define XEN_CPUID_SIGNATURE_EBX 0x566e6558 /* "XenV" */
+#define XEN_CPUID_SIGNATURE_ECX 0x65584d4d /* "MMXe" */
+#define XEN_CPUID_SIGNATURE_EDX 0x4d4d566e /* "nVMM" */
+
+/*
+ * Leaf 2 (0x40000x01)
+ * EAX[31:16]: Xen major version.
+ * EAX[15: 0]: Xen minor version.
+ * EBX-EDX: Reserved (currently all zeroes).
+ */
+
+/*
+ * Leaf 3 (0x40000x02)
+ * EAX: Number of hypercall transfer pages. This register is always guaranteed
+ * to specify one hypercall page.
+ * EBX: Base address of Xen-specific MSRs.
+ * ECX: Features 1. Unused bits are set to zero.
+ * EDX: Features 2. Unused bits are set to zero.
+ */
+
+/* Does the host support MMU_PT_UPDATE_PRESERVE_AD for this guest? */
+#define _XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD 0
+#define XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD (1u<<0)
+
+/*
+ * Leaf 4 (0x40000x03)
+ * Sub-leaf 0: EAX: bit 0: emulated tsc
+ * bit 1: host tsc is known to be reliable
+ * bit 2: RDTSCP instruction available
+ * EBX: tsc_mode: 0=default (emulate if necessary), 1=emulate,
+ * 2=no emulation, 3=no emulation + TSC_AUX support
+ * ECX: guest tsc frequency in kHz
+ * EDX: guest tsc incarnation (migration count)
+ * Sub-leaf 1: EAX: tsc offset low part
+ * EBX: tsc offset high part
+ * ECX: multiplicator for tsc->ns conversion
+ * EDX: shift amount for tsc->ns conversion
+ * Sub-leaf 2: EAX: host tsc frequency in kHz
+ */
+
+#define XEN_CPUID_TSC_EMULATED (1u << 0)
+#define XEN_CPUID_HOST_TSC_RELIABLE (1u << 1)
+#define XEN_CPUID_RDTSCP_INSTR_AVAIL (1u << 2)
+
+#define XEN_CPUID_TSC_MODE_DEFAULT (0)
+#define XEN_CPUID_TSC_MODE_ALWAYS_EMULATE (1u)
+#define XEN_CPUID_TSC_MODE_NEVER_EMULATE (2u)
+#define XEN_CPUID_TSC_MODE_PVRDTSCP (3u)
+
+/*
+ * Leaf 5 (0x40000x04)
+ * HVM-specific features
+ * Sub-leaf 0: EAX: Features
+ * Sub-leaf 0: EBX: vcpu id (iff EAX has XEN_HVM_CPUID_VCPU_ID_PRESENT flag)
+ * Sub-leaf 0: ECX: domain id (iff EAX has XEN_HVM_CPUID_DOMID_PRESENT flag)
+ */
+#define XEN_HVM_CPUID_APIC_ACCESS_VIRT (1u << 0) /* Virtualized APIC registers */
+#define XEN_HVM_CPUID_X2APIC_VIRT (1u << 1) /* Virtualized x2APIC accesses */
+/* Memory mapped from other domains has valid IOMMU entries */
+#define XEN_HVM_CPUID_IOMMU_MAPPINGS (1u << 2)
+#define XEN_HVM_CPUID_VCPU_ID_PRESENT (1u << 3) /* vcpu id is present in EBX */
+#define XEN_HVM_CPUID_DOMID_PRESENT (1u << 4) /* domid is present in ECX */
+/*
+ * With interrupt format set to 0 (non-remappable) bits 55:49 from the
+ * IO-APIC RTE and bits 11:5 from the MSI address can be used to store
+ * high bits for the Destination ID. This expands the Destination ID
+ * field from 8 to 15 bits, allowing APIC IDs up to 32768 to be targeted.
+ */
+#define XEN_HVM_CPUID_EXT_DEST_ID (1u << 5)
+/*
+ * Per-vCPU event channel upcalls work correctly with physical IRQs
+ * bound to event channels.
+ */
+#define XEN_HVM_CPUID_UPCALL_VECTOR (1u << 6)
+
+/*
+ * Leaf 6 (0x40000x05)
+ * PV-specific parameters
+ * Sub-leaf 0: EAX: max available sub-leaf
+ * Sub-leaf 0: EBX: bits 0-7: max machine address width
+ */
+
+/* Max. address width in bits taking memory hotplug into account. */
+#define XEN_CPUID_MACHINE_ADDRESS_WIDTH_MASK (0xffu << 0)
+
+#define XEN_CPUID_MAX_NUM_LEAVES 5
+
+#endif /* __XEN_PUBLIC_ARCH_X86_CPUID_H__ */
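
A minimal sketch of the detection described at the top of this header: probe each 0x100-aligned base from 0x40000000 and compare the signature registers. It assumes the kernel's cpuid() helper from <asm/processor.h>; the function name is illustrative only.

/* Hypothetical sketch: locate the Xen CPUID leaf base described above. */
static uint32_t example_xen_cpuid_base(void)
{
	uint32_t base, eax, ebx, ecx, edx;

	for (base = XEN_CPUID_FIRST_LEAF; base < XEN_CPUID_FIRST_LEAF + 0x10000;
	     base += 0x100) {
		cpuid(base, &eax, &ebx, &ecx, &edx);
		if (ebx == XEN_CPUID_SIGNATURE_EBX &&
		    ecx == XEN_CPUID_SIGNATURE_ECX &&
		    edx == XEN_CPUID_SIGNATURE_EDX &&
		    eax >= base + 2)		/* at least the version leaf */
			return base;
	}
	return 0;
}
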
diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h
index 1df35417c412..62bdceb594f1 100644
--- a/arch/x86/include/asm/xen/events.h
+++ b/arch/x86/include/asm/xen/events.h
@@ -1,11 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_XEN_EVENTS_H
#define _ASM_X86_XEN_EVENTS_H
+#include <xen/xen.h>
+
enum ipi_vector {
XEN_RESCHEDULE_VECTOR,
XEN_CALL_FUNCTION_VECTOR,
XEN_CALL_FUNCTION_SINGLE_VECTOR,
XEN_SPIN_UNLOCK_VECTOR,
+ XEN_IRQ_WORK_VECTOR,
+ XEN_NMI_VECTOR,
XEN_NR_IPIS,
};
@@ -15,4 +20,19 @@ static inline int xen_irqs_disabled(struct pt_regs *regs)
return raw_irqs_disabled_flags(regs->flags);
}
+/* No need for a barrier -- XCHG is a barrier on x86. */
+#define xchg_xen_ulong(ptr, val) xchg((ptr), (val))
+
+extern bool xen_have_vector_callback;
+
+/*
+ * Events delivered via platform PCI interrupts are always
+ * routed to vcpu 0 and hence cannot be rebound.
+ */
+static inline bool xen_support_evtchn_rebind(void)
+{
+ return (!xen_hvm_domain() || xen_have_vector_callback);
+}
+
+extern bool xen_percpu_upcall;
#endif /* _ASM_X86_XEN_EVENTS_H */
diff --git a/arch/x86/include/asm/xen/grant_table.h b/arch/x86/include/asm/xen/grant_table.h
deleted file mode 100644
index fdbbb45767a6..000000000000
--- a/arch/x86/include/asm/xen/grant_table.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef _ASM_X86_XEN_GRANT_TABLE_H
-#define _ASM_X86_XEN_GRANT_TABLE_H
-
-#define xen_alloc_vm_area(size) alloc_vm_area(size)
-#define xen_free_vm_area(area) free_vm_area(area)
-
-#endif /* _ASM_X86_XEN_GRANT_TABLE_H */
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 9c371e4a9fa6..a16d4631547c 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -38,13 +38,23 @@
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
+#include <linux/pgtable.h>
+#include <linux/instrumentation.h>
+#include <trace/events/xen.h>
+
+#include <asm/alternative.h>
#include <asm/page.h>
-#include <asm/pgtable.h>
+#include <asm/smap.h>
+#include <asm/nospec-branch.h>
#include <xen/interface/xen.h>
#include <xen/interface/sched.h>
#include <xen/interface/physdev.h>
+#include <xen/interface/platform.h>
+#include <xen/interface/xen-mca.h>
+
+struct xen_dm_op_buf;
/*
* The hypercall asms have to meet several constraints:
@@ -74,15 +84,25 @@
* - clobber the rest
*
* The result certainly isn't pretty, and it really shows up cpp's
- * weakness as as macro language. Sorry. (But let's just give thanks
+ * weakness as a macro language. Sorry. (But let's just give thanks
* there aren't more than 5 arguments...)
*/
-extern struct { char _entry[32]; } hypercall_page[];
+void xen_hypercall_func(void);
+DECLARE_STATIC_CALL(xen_hypercall, xen_hypercall_func);
-#define __HYPERCALL "call hypercall_page+%c[offset]"
-#define __HYPERCALL_ENTRY(x) \
- [offset] "i" (__HYPERVISOR_##x * sizeof(hypercall_page[0]))
+#ifdef MODULE
+#define __ADDRESSABLE_xen_hypercall
+#else
+#define __ADDRESSABLE_xen_hypercall \
+ __stringify(.global STATIC_CALL_KEY(xen_hypercall);)
+#endif
+
+#define __HYPERCALL \
+ __ADDRESSABLE_xen_hypercall \
+ __stringify(call STATIC_CALL_TRAMP(xen_hypercall))
+
+#define __HYPERCALL_ENTRY(x) "a" (x)
#ifdef CONFIG_X86_32
#define __HYPERCALL_RETREG "eax"
@@ -108,7 +128,7 @@ extern struct { char _entry[32]; } hypercall_page[];
register unsigned long __arg4 asm(__HYPERCALL_ARG4REG) = __arg4; \
register unsigned long __arg5 asm(__HYPERCALL_ARG5REG) = __arg5;
-#define __HYPERCALL_0PARAM "=r" (__res)
+#define __HYPERCALL_0PARAM "=r" (__res), ASM_CALL_CONSTRAINT
#define __HYPERCALL_1PARAM __HYPERCALL_0PARAM, "+r" (__arg1)
#define __HYPERCALL_2PARAM __HYPERCALL_1PARAM, "+r" (__arg2)
#define __HYPERCALL_3PARAM __HYPERCALL_2PARAM, "+r" (__arg3)
@@ -140,7 +160,7 @@ extern struct { char _entry[32]; } hypercall_page[];
__HYPERCALL_0ARG(); \
asm volatile (__HYPERCALL \
: __HYPERCALL_0PARAM \
- : __HYPERCALL_ENTRY(name) \
+ : __HYPERCALL_ENTRY(__HYPERVISOR_ ## name) \
: __HYPERCALL_CLOBBER0); \
(type)__res; \
})
@@ -151,7 +171,7 @@ extern struct { char _entry[32]; } hypercall_page[];
__HYPERCALL_1ARG(a1); \
asm volatile (__HYPERCALL \
: __HYPERCALL_1PARAM \
- : __HYPERCALL_ENTRY(name) \
+ : __HYPERCALL_ENTRY(__HYPERVISOR_ ## name) \
: __HYPERCALL_CLOBBER1); \
(type)__res; \
})
@@ -162,7 +182,7 @@ extern struct { char _entry[32]; } hypercall_page[];
__HYPERCALL_2ARG(a1, a2); \
asm volatile (__HYPERCALL \
: __HYPERCALL_2PARAM \
- : __HYPERCALL_ENTRY(name) \
+ : __HYPERCALL_ENTRY(__HYPERVISOR_ ## name) \
: __HYPERCALL_CLOBBER2); \
(type)__res; \
})
@@ -173,7 +193,7 @@ extern struct { char _entry[32]; } hypercall_page[];
__HYPERCALL_3ARG(a1, a2, a3); \
asm volatile (__HYPERCALL \
: __HYPERCALL_3PARAM \
- : __HYPERCALL_ENTRY(name) \
+ : __HYPERCALL_ENTRY(__HYPERVISOR_ ## name) \
: __HYPERCALL_CLOBBER3); \
(type)__res; \
})
@@ -184,22 +204,58 @@ extern struct { char _entry[32]; } hypercall_page[];
__HYPERCALL_4ARG(a1, a2, a3, a4); \
asm volatile (__HYPERCALL \
: __HYPERCALL_4PARAM \
- : __HYPERCALL_ENTRY(name) \
+ : __HYPERCALL_ENTRY(__HYPERVISOR_ ## name) \
: __HYPERCALL_CLOBBER4); \
(type)__res; \
})
-#define _hypercall5(type, name, a1, a2, a3, a4, a5) \
-({ \
- __HYPERCALL_DECLS; \
- __HYPERCALL_5ARG(a1, a2, a3, a4, a5); \
- asm volatile (__HYPERCALL \
- : __HYPERCALL_5PARAM \
- : __HYPERCALL_ENTRY(name) \
- : __HYPERCALL_CLOBBER5); \
- (type)__res; \
-})
+static inline long
+xen_single_call(unsigned int call,
+ unsigned long a1, unsigned long a2,
+ unsigned long a3, unsigned long a4,
+ unsigned long a5)
+{
+ __HYPERCALL_DECLS;
+ __HYPERCALL_5ARG(a1, a2, a3, a4, a5);
+
+ asm volatile(__HYPERCALL
+ : __HYPERCALL_5PARAM
+ : __HYPERCALL_ENTRY(call)
+ : __HYPERCALL_CLOBBER5);
+
+ return (long)__res;
+}
+
+static __always_inline void __xen_stac(void)
+{
+ /*
+ * Suppress objtool seeing the STAC/CLAC and getting confused about it
+ * calling random code with AC=1.
+ */
+ asm volatile(ASM_STAC_UNSAFE ::: "memory", "flags");
+}
+
+static __always_inline void __xen_clac(void)
+{
+ asm volatile(ASM_CLAC_UNSAFE ::: "memory", "flags");
+}
+
+static inline long
+privcmd_call(unsigned int call,
+ unsigned long a1, unsigned long a2,
+ unsigned long a3, unsigned long a4,
+ unsigned long a5)
+{
+ long res;
+
+ __xen_stac();
+ res = xen_single_call(call, a1, a2, a3, a4, a5);
+ __xen_clac();
+ return res;
+}
+
+#ifdef CONFIG_XEN_PV
static inline int
HYPERVISOR_set_trap_table(struct trap_info *table)
{
@@ -227,50 +283,116 @@ HYPERVISOR_set_gdt(unsigned long *frame_list, int entries)
}
static inline int
-HYPERVISOR_stack_switch(unsigned long ss, unsigned long esp)
+HYPERVISOR_callback_op(int cmd, void *arg)
{
- return _hypercall2(int, stack_switch, ss, esp);
+ return _hypercall2(int, callback_op, cmd, arg);
}
-#ifdef CONFIG_X86_32
-static inline int
-HYPERVISOR_set_callbacks(unsigned long event_selector,
- unsigned long event_address,
- unsigned long failsafe_selector,
- unsigned long failsafe_address)
+static __always_inline int
+HYPERVISOR_set_debugreg(int reg, unsigned long value)
{
- return _hypercall4(int, set_callbacks,
- event_selector, event_address,
- failsafe_selector, failsafe_address);
+ return _hypercall2(int, set_debugreg, reg, value);
}
-#else /* CONFIG_X86_64 */
-static inline int
-HYPERVISOR_set_callbacks(unsigned long event_address,
- unsigned long failsafe_address,
- unsigned long syscall_address)
+
+static __always_inline unsigned long
+HYPERVISOR_get_debugreg(int reg)
{
- return _hypercall3(int, set_callbacks,
- event_address, failsafe_address,
- syscall_address);
+ return _hypercall1(unsigned long, get_debugreg, reg);
}
-#endif /* CONFIG_X86_{32,64} */
static inline int
-HYPERVISOR_callback_op(int cmd, void *arg)
+HYPERVISOR_update_descriptor(u64 ma, u64 desc)
{
- return _hypercall2(int, callback_op, cmd, arg);
+ return _hypercall2(int, update_descriptor, ma, desc);
}
static inline int
-HYPERVISOR_fpu_taskswitch(int set)
+HYPERVISOR_update_va_mapping(unsigned long va, pte_t new_val,
+ unsigned long flags)
{
- return _hypercall1(int, fpu_taskswitch, set);
+ return _hypercall3(int, update_va_mapping, va, new_val.pte, flags);
}
static inline int
+HYPERVISOR_set_segment_base(int reg, unsigned long value)
+{
+ return _hypercall2(int, set_segment_base, reg, value);
+}
+
+static inline void
+MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
+{
+ mcl->op = __HYPERVISOR_fpu_taskswitch;
+ mcl->args[0] = set;
+
+ trace_xen_mc_entry(mcl, 1);
+}
+
+static inline void
+MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va,
+ pte_t new_val, unsigned long flags)
+{
+ mcl->op = __HYPERVISOR_update_va_mapping;
+ mcl->args[0] = va;
+ mcl->args[1] = new_val.pte;
+ mcl->args[2] = flags;
+
+ trace_xen_mc_entry(mcl, 3);
+}
+
+static inline void
+MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr,
+ struct desc_struct desc)
+{
+ mcl->op = __HYPERVISOR_update_descriptor;
+ mcl->args[0] = maddr;
+ mcl->args[1] = *(unsigned long *)&desc;
+
+ trace_xen_mc_entry(mcl, 2);
+}
+
+static inline void
+MULTI_mmu_update(struct multicall_entry *mcl, struct mmu_update *req,
+ int count, int *success_count, domid_t domid)
+{
+ mcl->op = __HYPERVISOR_mmu_update;
+ mcl->args[0] = (unsigned long)req;
+ mcl->args[1] = count;
+ mcl->args[2] = (unsigned long)success_count;
+ mcl->args[3] = domid;
+
+ trace_xen_mc_entry(mcl, 4);
+}
+
+static inline void
+MULTI_mmuext_op(struct multicall_entry *mcl, struct mmuext_op *op, int count,
+ int *success_count, domid_t domid)
+{
+ mcl->op = __HYPERVISOR_mmuext_op;
+ mcl->args[0] = (unsigned long)op;
+ mcl->args[1] = count;
+ mcl->args[2] = (unsigned long)success_count;
+ mcl->args[3] = domid;
+
+ trace_xen_mc_entry(mcl, 4);
+}
+
+static inline void
+MULTI_stack_switch(struct multicall_entry *mcl,
+ unsigned long ss, unsigned long esp)
+{
+ mcl->op = __HYPERVISOR_stack_switch;
+ mcl->args[0] = ss;
+ mcl->args[1] = esp;
+
+ trace_xen_mc_entry(mcl, 2);
+}
+#endif
+
+static __always_inline int
HYPERVISOR_sched_op(int cmd, void *arg)
{
- return _hypercall2(int, sched_op_new, cmd, arg);
+ return _hypercall2(int, sched_op, cmd, arg);
}
static inline long
@@ -282,64 +404,38 @@ HYPERVISOR_set_timer_op(u64 timeout)
}
static inline int
-HYPERVISOR_set_debugreg(int reg, unsigned long value)
+HYPERVISOR_mca(struct xen_mc *mc_op)
{
- return _hypercall2(int, set_debugreg, reg, value);
-}
-
-static inline unsigned long
-HYPERVISOR_get_debugreg(int reg)
-{
- return _hypercall1(unsigned long, get_debugreg, reg);
+ mc_op->interface_version = XEN_MCA_INTERFACE_VERSION;
+ return _hypercall1(int, mca, mc_op);
}
static inline int
-HYPERVISOR_update_descriptor(u64 ma, u64 desc)
+HYPERVISOR_platform_op(struct xen_platform_op *op)
{
- if (sizeof(u64) == sizeof(long))
- return _hypercall2(int, update_descriptor, ma, desc);
- return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32);
+ op->interface_version = XENPF_INTERFACE_VERSION;
+ return _hypercall1(int, platform_op, op);
}
-static inline int
+static inline long
HYPERVISOR_memory_op(unsigned int cmd, void *arg)
{
- return _hypercall2(int, memory_op, cmd, arg);
+ return _hypercall2(long, memory_op, cmd, arg);
}
static inline int
-HYPERVISOR_multicall(void *call_list, int nr_calls)
+HYPERVISOR_multicall(void *call_list, uint32_t nr_calls)
{
return _hypercall2(int, multicall, call_list, nr_calls);
}
static inline int
-HYPERVISOR_update_va_mapping(unsigned long va, pte_t new_val,
- unsigned long flags)
-{
- if (sizeof(new_val) == sizeof(long))
- return _hypercall3(int, update_va_mapping, va,
- new_val.pte, flags);
- else
- return _hypercall4(int, update_va_mapping, va,
- new_val.pte, new_val.pte >> 32, flags);
-}
-
-static inline int
HYPERVISOR_event_channel_op(int cmd, void *arg)
{
- int rc = _hypercall2(int, event_channel_op, cmd, arg);
- if (unlikely(rc == -ENOSYS)) {
- struct evtchn_op op;
- op.cmd = cmd;
- memcpy(&op.u, arg, sizeof(op.u));
- rc = _hypercall1(int, event_channel_op_compat, &op);
- memcpy(arg, &op.u, sizeof(op.u));
- }
- return rc;
+ return _hypercall2(int, event_channel_op, cmd, arg);
}
-static inline int
+static __always_inline int
HYPERVISOR_xen_version(int cmd, void *arg)
{
return _hypercall2(int, xen_version, cmd, arg);
@@ -354,15 +450,7 @@ HYPERVISOR_console_io(int cmd, int count, char *str)
static inline int
HYPERVISOR_physdev_op(int cmd, void *arg)
{
- int rc = _hypercall2(int, physdev_op, cmd, arg);
- if (unlikely(rc == -ENOSYS)) {
- struct physdev_op op;
- op.cmd = cmd;
- memcpy(&op.u, arg, sizeof(op.u));
- rc = _hypercall1(int, physdev_op_compat, &op);
- memcpy(arg, &op.u, sizeof(op.u));
- }
- return rc;
+ return _hypercall2(int, physdev_op, cmd, arg);
}
static inline int
@@ -372,19 +460,6 @@ HYPERVISOR_grant_table_op(unsigned int cmd, void *uop, unsigned int count)
}
static inline int
-HYPERVISOR_update_va_mapping_otherdomain(unsigned long va, pte_t new_val,
- unsigned long flags, domid_t domid)
-{
- if (sizeof(new_val) == sizeof(long))
- return _hypercall4(int, update_va_mapping_otherdomain, va,
- new_val.pte, flags, domid);
- else
- return _hypercall5(int, update_va_mapping_otherdomain, va,
- new_val.pte, new_val.pte >> 32,
- flags, domid);
-}
-
-static inline int
HYPERVISOR_vm_assist(unsigned int cmd, unsigned int type)
{
return _hypercall2(int, vm_assist, cmd, type);
@@ -396,140 +471,41 @@ HYPERVISOR_vcpu_op(int cmd, int vcpuid, void *extra_args)
return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
}
-#ifdef CONFIG_X86_64
static inline int
-HYPERVISOR_set_segment_base(int reg, unsigned long value)
+HYPERVISOR_suspend(unsigned long start_info_mfn)
{
- return _hypercall2(int, set_segment_base, reg, value);
-}
-#endif
+ struct sched_shutdown r = { .reason = SHUTDOWN_suspend };
-static inline int
-HYPERVISOR_suspend(unsigned long srec)
-{
- return _hypercall3(int, sched_op, SCHEDOP_shutdown,
- SHUTDOWN_suspend, srec);
+ /*
+ * For a PV guest the tools require that the start_info mfn be
+ * present in rdx/edx when the hypercall is made. Per the
+ * hypercall calling convention this is the third hypercall
+ * argument, which is start_info_mfn here.
+ */
+ return _hypercall3(int, sched_op, SCHEDOP_shutdown, &r, start_info_mfn);
}
-static inline int
-HYPERVISOR_nmi_op(unsigned long op, unsigned long arg)
+static inline unsigned long __must_check
+HYPERVISOR_hvm_op(int op, void *arg)
{
- return _hypercall2(int, nmi_op, op, arg);
+ return _hypercall2(unsigned long, hvm_op, op, arg);
}
-static inline void
-MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
-{
- mcl->op = __HYPERVISOR_fpu_taskswitch;
- mcl->args[0] = set;
-}
-
-static inline void
-MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va,
- pte_t new_val, unsigned long flags)
-{
- mcl->op = __HYPERVISOR_update_va_mapping;
- mcl->args[0] = va;
- if (sizeof(new_val) == sizeof(long)) {
- mcl->args[1] = new_val.pte;
- mcl->args[2] = flags;
- } else {
- mcl->args[1] = new_val.pte;
- mcl->args[2] = new_val.pte >> 32;
- mcl->args[3] = flags;
- }
-}
-
-static inline void
-MULTI_grant_table_op(struct multicall_entry *mcl, unsigned int cmd,
- void *uop, unsigned int count)
-{
- mcl->op = __HYPERVISOR_grant_table_op;
- mcl->args[0] = cmd;
- mcl->args[1] = (unsigned long)uop;
- mcl->args[2] = count;
-}
-
-static inline void
-MULTI_update_va_mapping_otherdomain(struct multicall_entry *mcl, unsigned long va,
- pte_t new_val, unsigned long flags,
- domid_t domid)
-{
- mcl->op = __HYPERVISOR_update_va_mapping_otherdomain;
- mcl->args[0] = va;
- if (sizeof(new_val) == sizeof(long)) {
- mcl->args[1] = new_val.pte;
- mcl->args[2] = flags;
- mcl->args[3] = domid;
- } else {
- mcl->args[1] = new_val.pte;
- mcl->args[2] = new_val.pte >> 32;
- mcl->args[3] = flags;
- mcl->args[4] = domid;
- }
-}
-
-static inline void
-MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr,
- struct desc_struct desc)
-{
- mcl->op = __HYPERVISOR_update_descriptor;
- if (sizeof(maddr) == sizeof(long)) {
- mcl->args[0] = maddr;
- mcl->args[1] = *(unsigned long *)&desc;
- } else {
- mcl->args[0] = maddr;
- mcl->args[1] = maddr >> 32;
- mcl->args[2] = desc.a;
- mcl->args[3] = desc.b;
- }
-}
-
-static inline void
-MULTI_memory_op(struct multicall_entry *mcl, unsigned int cmd, void *arg)
-{
- mcl->op = __HYPERVISOR_memory_op;
- mcl->args[0] = cmd;
- mcl->args[1] = (unsigned long)arg;
-}
-
-static inline void
-MULTI_mmu_update(struct multicall_entry *mcl, struct mmu_update *req,
- int count, int *success_count, domid_t domid)
-{
- mcl->op = __HYPERVISOR_mmu_update;
- mcl->args[0] = (unsigned long)req;
- mcl->args[1] = count;
- mcl->args[2] = (unsigned long)success_count;
- mcl->args[3] = domid;
-}
-
-static inline void
-MULTI_mmuext_op(struct multicall_entry *mcl, struct mmuext_op *op, int count,
- int *success_count, domid_t domid)
-{
- mcl->op = __HYPERVISOR_mmuext_op;
- mcl->args[0] = (unsigned long)op;
- mcl->args[1] = count;
- mcl->args[2] = (unsigned long)success_count;
- mcl->args[3] = domid;
-}
-
-static inline void
-MULTI_set_gdt(struct multicall_entry *mcl, unsigned long *frames, int entries)
+static inline int
+HYPERVISOR_xenpmu_op(unsigned int op, void *arg)
{
- mcl->op = __HYPERVISOR_set_gdt;
- mcl->args[0] = (unsigned long)frames;
- mcl->args[1] = entries;
+ return _hypercall2(int, xenpmu_op, op, arg);
}
-static inline void
-MULTI_stack_switch(struct multicall_entry *mcl,
- unsigned long ss, unsigned long esp)
-{
- mcl->op = __HYPERVISOR_stack_switch;
- mcl->args[0] = ss;
- mcl->args[1] = esp;
+static inline int
+HYPERVISOR_dm_op(
+ domid_t dom, unsigned int nr_bufs, struct xen_dm_op_buf *bufs)
+{
+ int ret;
+ __xen_stac();
+ ret = _hypercall3(int, dm_op, dom, nr_bufs, bufs);
+ __xen_clac();
+ return ret;
}
#endif /* _ASM_X86_XEN_HYPERCALL_H */
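
For orientation, a hedged example of how the wrappers above are consumed by callers; the function name is invented, but SCHEDOP_yield comes from xen/interface/sched.h, which this header already includes:

/* Hypothetical caller: voluntarily give up the CPU to the hypervisor. */
static inline void example_xen_yield(void)
{
	/* SCHEDOP_yield takes no argument structure. */
	HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
}
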
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index d5b7e90c0edf..c2fc7869b996 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -33,35 +33,69 @@
#ifndef _ASM_X86_XEN_HYPERVISOR_H
#define _ASM_X86_XEN_HYPERVISOR_H
-/* arch/i386/kernel/setup.c */
extern struct shared_info *HYPERVISOR_shared_info;
extern struct start_info *xen_start_info;
-enum xen_domain_type {
- XEN_NATIVE, /* running on bare hardware */
- XEN_PV_DOMAIN, /* running in a PV domain */
- XEN_HVM_DOMAIN, /* running in a Xen hvm domain */
-};
+#include <asm/bug.h>
+#include <asm/processor.h>
+
+#define XEN_SIGNATURE "XenVMMXenVMM"
+
+static inline uint32_t xen_cpuid_base(void)
+{
+ return cpuid_base_hypervisor(XEN_SIGNATURE, 2);
+}
+
+struct pci_dev;
-#ifdef CONFIG_XEN
-extern enum xen_domain_type xen_domain_type;
+#ifdef CONFIG_XEN_PV_DOM0
+bool xen_initdom_restore_msi(struct pci_dev *dev);
#else
-#define xen_domain_type XEN_NATIVE
+static inline bool xen_initdom_restore_msi(struct pci_dev *dev) { return true; }
+#endif
+
+#ifdef CONFIG_HOTPLUG_CPU
+void xen_arch_register_cpu(int num);
+void xen_arch_unregister_cpu(int num);
+#endif
+
+#ifdef CONFIG_PVH
+void __init xen_pvh_init(struct boot_params *boot_params);
+void __init mem_map_via_hcall(struct boot_params *boot_params_p);
#endif
-#define xen_domain() (xen_domain_type != XEN_NATIVE)
-#define xen_pv_domain() (xen_domain() && \
- xen_domain_type == XEN_PV_DOMAIN)
-#define xen_hvm_domain() (xen_domain() && \
- xen_domain_type == XEN_HVM_DOMAIN)
+/* Lazy mode for batching updates / context switch */
+enum xen_lazy_mode {
+ XEN_LAZY_NONE,
+ XEN_LAZY_MMU,
+ XEN_LAZY_CPU,
+};
+
+DECLARE_PER_CPU(enum xen_lazy_mode, xen_lazy_mode);
+
+static inline void enter_lazy(enum xen_lazy_mode mode)
+{
+ BUG_ON(this_cpu_read(xen_lazy_mode) != XEN_LAZY_NONE);
+
+ this_cpu_write(xen_lazy_mode, mode);
+}
-#ifdef CONFIG_XEN_DOM0
-#include <xen/interface/xen.h>
+static inline void leave_lazy(enum xen_lazy_mode mode)
+{
+ BUG_ON(this_cpu_read(xen_lazy_mode) != mode);
-#define xen_initial_domain() (xen_pv_domain() && \
- xen_start_info->flags & SIF_INITDOMAIN)
-#else /* !CONFIG_XEN_DOM0 */
-#define xen_initial_domain() (0)
-#endif /* CONFIG_XEN_DOM0 */
+ this_cpu_write(xen_lazy_mode, XEN_LAZY_NONE);
+}
+
+enum xen_lazy_mode xen_get_lazy_mode(void);
+
+#if defined(CONFIG_XEN_DOM0) && defined(CONFIG_ACPI)
+void xen_sanitize_proc_cap_bits(uint32_t *buf);
+#else
+static inline void xen_sanitize_proc_cap_bits(uint32_t *buf)
+{
+ BUG();
+}
+#endif
#endif /* _ASM_X86_XEN_HYPERVISOR_H */
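
A hedged sketch of the intended enter/leave pairing around a batch of PV updates; the body is illustrative and the function name is a placeholder:

/* Hypothetical sketch: bracket a batch of PV MMU updates with lazy mode. */
static void example_batched_mmu_updates(void)
{
	enter_lazy(XEN_LAZY_MMU);

	/* ... queue multicall entries, e.g. via MULTI_mmu_update() ... */

	leave_lazy(XEN_LAZY_MMU);
}
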
diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h
index e8506c1f0c55..a078a2b0f032 100644
--- a/arch/x86/include/asm/xen/interface.h
+++ b/arch/x86/include/asm/xen/interface.h
@@ -3,12 +3,38 @@
*
* Guest OS interface to x86 Xen.
*
- * Copyright (c) 2004, K A Fraser
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2004-2006, K A Fraser
*/
#ifndef _ASM_X86_XEN_INTERFACE_H
#define _ASM_X86_XEN_INTERFACE_H
+/*
+ * XEN_GUEST_HANDLE represents a guest pointer, when passed as a field
+ * in a struct in memory.
+ * XEN_GUEST_HANDLE_PARAM represents a guest pointer, when passed as a
+ * hypercall argument.
+ * XEN_GUEST_HANDLE_PARAM and XEN_GUEST_HANDLE are the same on X86 but
+ * they might not be on other architectures.
+ */
#ifdef __XEN__
#define __DEFINE_GUEST_HANDLE(name, type) \
typedef struct { type *p; } __guest_handle_ ## name
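To make the macro concrete: it wraps the pointee type in a single-member struct so a guest handle cannot be mixed up with an ordinary pointer by accident. In the __XEN__ branch shown here, __DEFINE_GUEST_HANDLE(uchar, unsigned char) expands to roughly:

/* Illustrative expansion of the __XEN__ branch of the macro above. */
typedef struct { unsigned char *p; } __guest_handle_uchar;

A XEN_GUEST_HANDLE(uchar) field in a hypercall argument structure then carries this wrapper rather than a bare pointer.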
@@ -46,24 +72,36 @@
#endif
#endif
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
+/* Explicitly size integers that represent pfns in the public interface
+ * with Xen so that on ARM we can have one ABI that works for 32 and 64
+ * bit guests. */
+typedef unsigned long xen_pfn_t;
+#define PRI_xen_pfn "lx"
+typedef unsigned long xen_ulong_t;
+#define PRI_xen_ulong "lx"
+typedef long xen_long_t;
+#define PRI_xen_long "lx"
+
/* Guest handles for primitive C types. */
__DEFINE_GUEST_HANDLE(uchar, unsigned char);
__DEFINE_GUEST_HANDLE(uint, unsigned int);
-__DEFINE_GUEST_HANDLE(ulong, unsigned long);
DEFINE_GUEST_HANDLE(char);
DEFINE_GUEST_HANDLE(int);
-DEFINE_GUEST_HANDLE(long);
DEFINE_GUEST_HANDLE(void);
+DEFINE_GUEST_HANDLE(uint64_t);
+DEFINE_GUEST_HANDLE(uint32_t);
+DEFINE_GUEST_HANDLE(xen_pfn_t);
+DEFINE_GUEST_HANDLE(xen_ulong_t);
#endif
#ifndef HYPERVISOR_VIRT_START
#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
#endif
-#ifndef machine_to_phys_mapping
-#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
-#endif
+#define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START)
+#define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END)
+#define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>__MACH2PHYS_SHIFT)
/* Maximum number of virtual CPUs in multi-processor guests. */
#define MAX_VIRT_CPUS 32
@@ -76,17 +114,20 @@ DEFINE_GUEST_HANDLE(void);
* start of the GDT because some stupid OSes export hard-coded selector values
* in their ABI. These hard-coded values are always near the start of the GDT,
* so Xen places itself out of the way, at the far end of the GDT.
+ *
+ * NB The LDT is set using the MMUEXT_SET_LDT op of HYPERVISOR_mmuext_op
*/
#define FIRST_RESERVED_GDT_PAGE 14
#define FIRST_RESERVED_GDT_BYTE (FIRST_RESERVED_GDT_PAGE * 4096)
#define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8)
/*
- * Send an array of these to HYPERVISOR_set_trap_table()
+ * Send an array of these to HYPERVISOR_set_trap_table().
+ * Terminate the array with a sentinel entry, with traps[].address==0.
* The privilege level specifies which modes may enter a trap via a software
* interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate
* privilege levels as follows:
- * Level == 0: Noone may enter
+ * Level == 0: No one may enter
* Level == 1: Kernel may enter
* Level == 2: Kernel may enter
* Level == 3: Everyone may enter
@@ -96,7 +137,7 @@ DEFINE_GUEST_HANDLE(void);
#define TI_SET_DPL(_ti, _dpl) ((_ti)->flags |= (_dpl))
#define TI_SET_IF(_ti, _if) ((_ti)->flags |= ((!!(_if))<<2))
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
struct trap_info {
uint8_t vector; /* exception vector */
uint8_t flags; /* 0-3: privilege level; 4: clear event enable? */
@@ -106,30 +147,84 @@ struct trap_info {
DEFINE_GUEST_HANDLE_STRUCT(trap_info);
struct arch_shared_info {
- unsigned long max_pfn; /* max pfn that appears in table */
- /* Frame containing list of mfns containing list of mfns containing p2m. */
- unsigned long pfn_to_mfn_frame_list_list;
- unsigned long nmi_reason;
+ /*
+ * Number of valid entries in the p2m table(s) anchored at
+ * pfn_to_mfn_frame_list_list and/or p2m_vaddr.
+ */
+ unsigned long max_pfn;
+ /*
+ * Frame containing list of mfns containing list of mfns containing p2m.
+ * A value of 0 indicates it has not yet been set up, ~0 indicates it
+ * has been set to invalid, e.g. because the p2m is too large for the
+ * 3-level p2m tree. In this case the linearly mapped p2m list anchored
+ * at p2m_vaddr is to be used.
+ */
+ xen_pfn_t pfn_to_mfn_frame_list_list;
+ unsigned long nmi_reason;
+ /*
+ * Following three fields are valid if p2m_cr3 contains a value
+ * different from 0.
+ * p2m_cr3 is the root of the address space where p2m_vaddr is valid.
+ * p2m_cr3 is in the same format as a cr3 value in the vcpu register
+ * state and holds the folded machine frame number (via xen_pfn_to_cr3)
+ * of a L3 or L4 page table.
+ * p2m_vaddr holds the virtual address of the linear p2m list. All
+ * entries in the range [0...max_pfn[ are accessible via this pointer.
+ * p2m_generation will be incremented by the guest before and after each
+ * change of the mappings of the p2m list. p2m_generation starts at 0
+ * and a value with the least significant bit set indicates that a
+ * mapping update is in progress. This allows guest external software
+ * (e.g. in Dom0) to verify that read mappings are consistent and
+ * whether they have changed since the last check.
+ * Modifying a p2m element in the linear p2m list is allowed via an
+ * atomic write only.
+ */
+ unsigned long p2m_cr3; /* cr3 value of the p2m address space */
+ unsigned long p2m_vaddr; /* virtual address of the p2m list */
+ unsigned long p2m_generation; /* generation count of p2m mapping */
+#ifdef CONFIG_X86_32
+ uint32_t wc_sec_hi;
+#endif
};
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#ifdef CONFIG_X86_32
-#include "interface_32.h"
+#include <asm/xen/interface_32.h>
#else
-#include "interface_64.h"
+#include <asm/xen/interface_64.h>
#endif
-#ifndef __ASSEMBLY__
+#include <asm/pvclock-abi.h>
+
+#ifndef __ASSEMBLER__
/*
* The following is all CPU context. Note that the fpu_ctxt block is filled
* in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
+ *
+ * Also note that when calling DOMCTL_setvcpucontext and VCPU_initialise
+ * for HVM and PVH guests, not all information in this structure is updated:
+ *
+ * - For HVM guests, the structures read include: fpu_ctxt (if
+ * VGCF_I387_VALID is set), flags, user_regs, debugreg[*]
+ *
+ * - PVH guests are the same as HVM guests, but additionally use ctrlreg[3] to
+ * set cr3. All unused fields should be set to 0.
*/
struct vcpu_guest_context {
/* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
struct { char x[512]; } fpu_ctxt; /* User-level FPU registers */
-#define VGCF_I387_VALID (1<<0)
-#define VGCF_HVM_GUEST (1<<1)
-#define VGCF_IN_KERNEL (1<<2)
+#define VGCF_I387_VALID (1<<0)
+#define VGCF_IN_KERNEL (1<<2)
+#define _VGCF_i387_valid 0
+#define VGCF_i387_valid (1<<_VGCF_i387_valid)
+#define _VGCF_in_kernel 2
+#define VGCF_in_kernel (1<<_VGCF_in_kernel)
+#define _VGCF_failsafe_disables_events 3
+#define VGCF_failsafe_disables_events (1<<_VGCF_failsafe_disables_events)
+#define _VGCF_syscall_disables_events 4
+#define VGCF_syscall_disables_events (1<<_VGCF_syscall_disables_events)
+#define _VGCF_online 5
+#define VGCF_online (1<<_VGCF_online)
unsigned long flags; /* VGCF_* flags */
struct cpu_user_regs user_regs; /* User-level CPU registers */
struct trap_info trap_ctxt[256]; /* Virtual IDT */
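The p2m_generation field documented in the hunk above follows the usual seqcount pattern: an odd value means an update of the p2m mappings is in progress, and a change between two reads means the entry may be stale. A hedged sketch of how external software in Dom0 might read an entry consistently; read_p2m_generation() and read_p2m_entry() are hypothetical accessors into the mapped guest memory, not part of this header:

/* Illustrative seqcount-style consistent read of one p2m entry. */
static unsigned long read_consistent_p2m(unsigned long pfn)
{
	unsigned long gen, entry;

	do {
		gen = read_p2m_generation();
		if (gen & 1) {			/* update in progress, retry */
			cpu_relax();
			continue;
		}
		entry = read_p2m_entry(pfn);
	} while (gen != read_p2m_generation());

	return entry;
}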
@@ -158,18 +253,138 @@ struct vcpu_guest_context {
#endif
};
DEFINE_GUEST_HANDLE_STRUCT(vcpu_guest_context);
-#endif /* !__ASSEMBLY__ */
+
+/* AMD PMU registers and structures */
+struct xen_pmu_amd_ctxt {
+ /*
+ * Offsets to counter and control MSRs (relative to xen_pmu_arch.c.amd).
+ * For PV(H) guests these fields are RO.
+ */
+ uint32_t counters;
+ uint32_t ctrls;
+
+ /* Counter MSRs */
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+ uint64_t regs[];
+#elif defined(__GNUC__)
+ uint64_t regs[0];
+#endif
+};
+
+/* Intel PMU registers and structures */
+struct xen_pmu_cntr_pair {
+ uint64_t counter;
+ uint64_t control;
+};
+
+struct xen_pmu_intel_ctxt {
+ /*
+ * Offsets to fixed and architectural counter MSRs (relative to
+ * xen_pmu_arch.c.intel).
+ * For PV(H) guests these fields are RO.
+ */
+ uint32_t fixed_counters;
+ uint32_t arch_counters;
+
+ /* PMU registers */
+ uint64_t global_ctrl;
+ uint64_t global_ovf_ctrl;
+ uint64_t global_status;
+ uint64_t fixed_ctrl;
+ uint64_t ds_area;
+ uint64_t pebs_enable;
+ uint64_t debugctl;
+
+ /* Fixed and architectural counter MSRs */
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+ uint64_t regs[];
+#elif defined(__GNUC__)
+ uint64_t regs[0];
+#endif
+};
+
+/* Sampled domain's registers */
+struct xen_pmu_regs {
+ uint64_t ip;
+ uint64_t sp;
+ uint64_t flags;
+ uint16_t cs;
+ uint16_t ss;
+ uint8_t cpl;
+ uint8_t pad[3];
+};
+
+/* PMU flags */
+#define PMU_CACHED (1<<0) /* PMU MSRs are cached in the context */
+#define PMU_SAMPLE_USER (1<<1) /* Sample is from user or kernel mode */
+#define PMU_SAMPLE_REAL (1<<2) /* Sample is from realmode */
+#define PMU_SAMPLE_PV (1<<3) /* Sample from a PV guest */
+
+/*
+ * Architecture-specific information describing state of the processor at
+ * the time of PMU interrupt.
+ * Fields of this structure marked as RW for the guest should only be written
+ * by the guest when the PMU_CACHED bit in pmu_flags is set (which is done by
+ * the hypervisor during a PMU interrupt). The hypervisor will read the updated
+ * data in the XENPMU_flush hypercall and clear the PMU_CACHED bit.
+ */
+struct xen_pmu_arch {
+ union {
+ /*
+ * Processor's registers at the time of interrupt.
+ * WO for hypervisor, RO for guests.
+ */
+ struct xen_pmu_regs regs;
+ /*
+ * Padding for adding new registers to xen_pmu_regs in
+ * the future
+ */
+#define XENPMU_REGS_PAD_SZ 64
+ uint8_t pad[XENPMU_REGS_PAD_SZ];
+ } r;
+
+ /* WO for hypervisor, RO for guest */
+ uint64_t pmu_flags;
+
+ /*
+ * APIC LVTPC register.
+ * RW for both hypervisor and guest.
+ * Only APIC_LVT_MASKED bit is loaded by the hypervisor into hardware
+ * during XENPMU_flush or XENPMU_lvtpc_set.
+ */
+ union {
+ uint32_t lapic_lvtpc;
+ uint64_t pad;
+ } l;
+
+ /*
+ * Vendor-specific PMU registers.
+ * RW for both hypervisor and guest (see exceptions above).
+ * Guest's updates to this field are verified and then loaded by the
+ * hypervisor into hardware during XENPMU_flush
+ */
+ union {
+ struct xen_pmu_amd_ctxt amd;
+ struct xen_pmu_intel_ctxt intel;
+
+ /*
+ * Padding for contexts (fixed parts only, does not include
+ * MSR banks that are specified by offsets)
+ */
+#define XENPMU_CTXT_PAD_SZ 128
+ uint8_t pad[XENPMU_CTXT_PAD_SZ];
+ } c;
+};
+
+#endif /* !__ASSEMBLER__ */
/*
* Prefix forces emulation of some non-trapping instructions.
* Currently only CPUID.
*/
-#ifdef __ASSEMBLY__
-#define XEN_EMULATE_PREFIX .byte 0x0f,0x0b,0x78,0x65,0x6e ;
-#define XEN_CPUID XEN_EMULATE_PREFIX cpuid
-#else
-#define XEN_EMULATE_PREFIX ".byte 0x0f,0x0b,0x78,0x65,0x6e ; "
-#define XEN_CPUID XEN_EMULATE_PREFIX "cpuid"
-#endif
+#include <asm/emulate_prefix.h>
+
+#define XEN_EMULATE_PREFIX __ASM_FORM(.byte __XEN_EMULATE_PREFIX ;)
+#define XEN_CPUID XEN_EMULATE_PREFIX __ASM_FORM(cpuid)
#endif /* _ASM_X86_XEN_INTERFACE_H */
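The access rules spelled out for struct xen_pmu_arch amount to a handshake: the hypervisor sets PMU_CACHED in pmu_flags when it forwards a PMU interrupt, the guest may then touch its RW fields, and the XENPMU_flush hypercall hands the updates back and clears the bit. A sketch of the guest side under those assumptions; xenpmu_flush() is a hypothetical wrapper around the XENPMU_flush hypercall:

/* Illustrative guest-side PMU interrupt handling. */
static void handle_guest_pmu_irq(struct xen_pmu_arch *pmu)
{
	if (!(pmu->pmu_flags & PMU_CACHED))
		return;				/* nothing was cached for this vCPU */

	/* RW fields may only be written while PMU_CACHED is set. */
	pmu->l.lapic_lvtpc &= ~APIC_LVT_MASKED;

	/* Hypervisor verifies and loads the updates, then clears PMU_CACHED. */
	xenpmu_flush();
}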
diff --git a/arch/x86/include/asm/xen/interface_32.h b/arch/x86/include/asm/xen/interface_32.h
index 42a7e004ae5c..74d9768a9cf7 100644
--- a/arch/x86/include/asm/xen/interface_32.h
+++ b/arch/x86/include/asm/xen/interface_32.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/******************************************************************************
* arch-x86_32.h
*
@@ -32,13 +33,18 @@
/* And the trap vector is... */
#define TRAP_INSTR "int $0x82"
+#define __MACH2PHYS_VIRT_START 0xF5800000
+#define __MACH2PHYS_VIRT_END 0xF6800000
+
+#define __MACH2PHYS_SHIFT 2
+
/*
* Virtual addresses beyond this are not modifiable by guest OSes. The
* machine->physical mapping table starts at this address, read-only.
*/
#define __HYPERVISOR_VIRT_START 0xF5800000
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
struct cpu_user_regs {
uint32_t ebx;
@@ -79,7 +85,7 @@ typedef struct xen_callback xen_callback_t;
#define XEN_CALLBACK(__cs, __eip) \
((struct xen_callback){ .cs = (__cs), .eip = (unsigned long)(__eip) })
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
/*
diff --git a/arch/x86/include/asm/xen/interface_64.h b/arch/x86/include/asm/xen/interface_64.h
index 100d2662b97c..38a19edb81a3 100644
--- a/arch/x86/include/asm/xen/interface_64.h
+++ b/arch/x86/include/asm/xen/interface_64.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_XEN_INTERFACE_64_H
#define _ASM_X86_XEN_INTERFACE_64_H
@@ -39,18 +40,7 @@
#define __HYPERVISOR_VIRT_END 0xFFFF880000000000
#define __MACH2PHYS_VIRT_START 0xFFFF800000000000
#define __MACH2PHYS_VIRT_END 0xFFFF804000000000
-
-#ifndef HYPERVISOR_VIRT_START
-#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
-#define HYPERVISOR_VIRT_END mk_unsigned_long(__HYPERVISOR_VIRT_END)
-#endif
-
-#define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START)
-#define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END)
-#define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>3)
-#ifndef machine_to_phys_mapping
-#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
-#endif
+#define __MACH2PHYS_SHIFT 3
/*
* int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base)
@@ -71,7 +61,7 @@
* RING1 -> RING3 kernel mode.
* RING2 -> RING3 kernel mode.
* RING3 -> RING3 user mode.
- * However RING0 indicates that the guest kernel should return to iteself
+ * However RING0 indicates that the guest kernel should return to itself
* directly with
* orb $3,1*8(%rsp)
* iretq
@@ -87,7 +77,7 @@
#define VGCF_in_syscall (1<<_VGCF_in_syscall)
#define VGCF_IN_SYSCALL VGCF_in_syscall
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
struct iret_context {
/* Top of stack (%rsp at point of hypercall). */
@@ -153,7 +143,7 @@ typedef unsigned long xen_callback_t;
#define XEN_CALLBACK(__cs, __rip) \
((unsigned long)(__rip))
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* _ASM_X86_XEN_INTERFACE_64_H */
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 018a0a400799..59f642a94b9d 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_XEN_PAGE_H
#define _ASM_X86_XEN_PAGE_H
@@ -5,13 +6,15 @@
#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/pfn.h>
+#include <linux/mm.h>
+#include <linux/device.h>
-#include <asm/uaccess.h>
+#include <asm/extable.h>
#include <asm/page.h>
-#include <asm/pgtable.h>
+#include <xen/xen.h>
#include <xen/interface/xen.h>
-#include <xen/features.h>
+#include <xen/interface/grant_table.h>
/* Xen machine address */
typedef struct xmaddr {
@@ -23,57 +26,203 @@ typedef struct xpaddr {
phys_addr_t paddr;
} xpaddr_t;
+#ifdef CONFIG_X86_64
+#define XEN_PHYSICAL_MASK __sme_clr((1UL << 52) - 1)
+#else
+#define XEN_PHYSICAL_MASK __PHYSICAL_MASK
+#endif
+
+#define XEN_PTE_MFN_MASK ((pteval_t)(((signed long)PAGE_MASK) & \
+ XEN_PHYSICAL_MASK))
+
#define XMADDR(x) ((xmaddr_t) { .maddr = (x) })
#define XPADDR(x) ((xpaddr_t) { .paddr = (x) })
/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
#define INVALID_P2M_ENTRY (~0UL)
-#define FOREIGN_FRAME_BIT (1UL<<31)
+#define FOREIGN_FRAME_BIT (1UL<<(BITS_PER_LONG-1))
+#define IDENTITY_FRAME_BIT (1UL<<(BITS_PER_LONG-2))
#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT)
+#define IDENTITY_FRAME(m) ((m) | IDENTITY_FRAME_BIT)
+
+#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
-/* Maximum amount of memory we can handle in a domain in pages */
-#define MAX_DOMAIN_PAGES \
- ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE))
+extern unsigned long *machine_to_phys_mapping;
+extern unsigned long machine_to_phys_nr;
+extern unsigned long *xen_p2m_addr;
+extern unsigned long xen_p2m_size;
+extern unsigned long xen_max_p2m_pfn;
+extern int xen_alloc_p2m_entry(unsigned long pfn);
extern unsigned long get_phys_to_machine(unsigned long pfn);
-extern void set_phys_to_machine(unsigned long pfn, unsigned long mfn);
+extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
+extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
+extern unsigned long __init set_phys_range_identity(unsigned long pfn_s,
+ unsigned long pfn_e);
+
+#ifdef CONFIG_XEN_PV
+extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
+ struct gnttab_map_grant_ref *kmap_ops,
+ struct page **pages, unsigned int count);
+extern int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
+ struct gnttab_unmap_grant_ref *kunmap_ops,
+ struct page **pages, unsigned int count);
+#else
+static inline int
+set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
+ struct gnttab_map_grant_ref *kmap_ops,
+ struct page **pages, unsigned int count)
+{
+ return 0;
+}
+
+static inline int
+clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
+ struct gnttab_unmap_grant_ref *kunmap_ops,
+ struct page **pages, unsigned int count)
+{
+ return 0;
+}
+#endif
+
+/*
+ * Helper functions to write or read unsigned long values to/from
+ * memory, when the access may fault.
+ */
+static inline int xen_safe_write_ulong(unsigned long *addr, unsigned long val)
+{
+ int ret = 0;
+
+ asm volatile("1: mov %[val], %[ptr]\n"
+ "2:\n"
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %[ret])
+ : [ret] "+r" (ret), [ptr] "=m" (*addr)
+ : [val] "r" (val));
+
+ return ret;
+}
+
+static inline int xen_safe_read_ulong(const unsigned long *addr,
+ unsigned long *val)
+{
+ unsigned long rval = ~0ul;
+ int ret = 0;
+
+ asm volatile("1: mov %[ptr], %[rval]\n"
+ "2:\n"
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %[ret])
+ : [ret] "+r" (ret), [rval] "+r" (rval)
+ : [ptr] "m" (*addr));
+ *val = rval;
+
+ return ret;
+}
+
+#ifdef CONFIG_XEN_PV
+/*
+ * When to use pfn_to_mfn(), __pfn_to_mfn() or get_phys_to_machine():
+ * - pfn_to_mfn() returns either INVALID_P2M_ENTRY or the mfn. No indicator
+ * bits (identity or foreign) are set.
+ * - __pfn_to_mfn() returns the found entry of the p2m table. Any identity
+ * or foreign indicator bit is left set. __pfn_to_mfn() encapsulates
+ * get_phys_to_machine(), which is called in special cases only.
+ * - get_phys_to_machine() is to be called by __pfn_to_mfn() only in special
+ * cases needing extended handling.
+ */
+static inline unsigned long __pfn_to_mfn(unsigned long pfn)
+{
+ unsigned long mfn;
+
+ if (pfn < xen_p2m_size)
+ mfn = xen_p2m_addr[pfn];
+ else if (unlikely(pfn < xen_max_p2m_pfn))
+ return get_phys_to_machine(pfn);
+ else
+ return IDENTITY_FRAME(pfn);
+
+ if (unlikely(mfn == INVALID_P2M_ENTRY))
+ return get_phys_to_machine(pfn);
+
+ return mfn;
+}
+#else
+static inline unsigned long __pfn_to_mfn(unsigned long pfn)
+{
+ return pfn;
+}
+#endif
static inline unsigned long pfn_to_mfn(unsigned long pfn)
{
- if (xen_feature(XENFEAT_auto_translated_physmap))
+ unsigned long mfn;
+
+ /*
+ * Some x86 code is still using pfn_to_mfn instead of
+ * pfn_to_gfn. This will have to be removed once we figure
+ * out which callers need converting.
+ */
+ if (!xen_pv_domain())
return pfn;
- return get_phys_to_machine(pfn) & ~FOREIGN_FRAME_BIT;
+ mfn = __pfn_to_mfn(pfn);
+
+ if (mfn != INVALID_P2M_ENTRY)
+ mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT);
+
+ return mfn;
}
static inline int phys_to_machine_mapping_valid(unsigned long pfn)
{
- if (xen_feature(XENFEAT_auto_translated_physmap))
+ if (!xen_pv_domain())
return 1;
- return get_phys_to_machine(pfn) != INVALID_P2M_ENTRY;
+ return __pfn_to_mfn(pfn) != INVALID_P2M_ENTRY;
}
-static inline unsigned long mfn_to_pfn(unsigned long mfn)
+static inline unsigned long mfn_to_pfn_no_overrides(unsigned long mfn)
{
unsigned long pfn;
+ int ret;
- if (xen_feature(XENFEAT_auto_translated_physmap))
- return mfn;
-
-#if 0
- if (unlikely((mfn >> machine_to_phys_order) != 0))
- return max_mapnr;
-#endif
+ if (unlikely(mfn >= machine_to_phys_nr))
+ return ~0;
- pfn = 0;
/*
* The array access can fail (e.g., device space beyond end of RAM).
* In such cases it doesn't matter what we return (we return garbage),
* but we must handle the fault without crashing!
*/
- __get_user(pfn, &machine_to_phys_mapping[mfn]);
+ ret = xen_safe_read_ulong(&machine_to_phys_mapping[mfn], &pfn);
+ if (ret < 0)
+ return ~0;
+
+ return pfn;
+}
+
+static inline unsigned long mfn_to_pfn(unsigned long mfn)
+{
+ unsigned long pfn;
+
+ /*
+ * Some x86 code is still using mfn_to_pfn instead of
+ * gfn_to_pfn. This will have to be removed once we figure
+ * out which callers need converting.
+ */
+ if (!xen_pv_domain())
+ return mfn;
+
+ pfn = mfn_to_pfn_no_overrides(mfn);
+ if (__pfn_to_mfn(pfn) != mfn)
+ pfn = ~0;
+
+ /*
+ * pfn is ~0 if there are no entries in the m2p for mfn or the
+ * entry doesn't map back to the mfn.
+ */
+ if (pfn == ~0 && __pfn_to_mfn(mfn) == IDENTITY_FRAME(mfn))
+ pfn = mfn;
return pfn;
}
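Putting the comment block above into code: pfn_to_mfn() is what ordinary callers want (indicator bits already masked), __pfn_to_mfn() keeps the identity/foreign bits, and get_phys_to_machine() is only the slow path behind them. A small illustrative helper, using only functions declared in this header, that checks a pfn both translates and round-trips through the m2p table:

/* Illustrative: true if 'pfn' has a machine frame and the m2p maps back. */
static bool pfn_round_trips(unsigned long pfn)
{
	unsigned long mfn = pfn_to_mfn(pfn);	/* indicator bits masked */

	if (mfn == INVALID_P2M_ENTRY)
		return false;
	return mfn_to_pfn(mfn) == pfn;		/* m2p entry must map back */
}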
@@ -90,6 +239,27 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine)
return XPADDR(PFN_PHYS(mfn_to_pfn(PFN_DOWN(machine.maddr))) | offset);
}
+/* Pseudo-physical <-> Guest conversion */
+static inline unsigned long pfn_to_gfn(unsigned long pfn)
+{
+ if (!xen_pv_domain())
+ return pfn;
+ else
+ return pfn_to_mfn(pfn);
+}
+
+static inline unsigned long gfn_to_pfn(unsigned long gfn)
+{
+ if (!xen_pv_domain())
+ return gfn;
+ else
+ return mfn_to_pfn(gfn);
+}
+
+/* Pseudo-physical <-> Bus conversion */
+#define pfn_to_bfn(pfn) pfn_to_gfn(pfn)
+#define bfn_to_pfn(bfn) gfn_to_pfn(bfn)
+
/*
* We detect special mappings in one of two ways:
* 1. If the MFN is an I/O page then Xen will set the m2p entry
@@ -110,27 +280,35 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine)
* require. In all the cases we care about, the FOREIGN_FRAME bit is
* masked (e.g., pfn_to_mfn()) so behaviour there is correct.
*/
-static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
-{
- extern unsigned long max_mapnr;
- unsigned long pfn = mfn_to_pfn(mfn);
- if ((pfn < max_mapnr)
- && !xen_feature(XENFEAT_auto_translated_physmap)
- && (get_phys_to_machine(pfn) != mfn))
- return max_mapnr; /* force !pfn_valid() */
- /* XXX fixme; not true with sparsemem */
+static inline unsigned long bfn_to_local_pfn(unsigned long mfn)
+{
+ unsigned long pfn;
+
+ if (!xen_pv_domain())
+ return mfn;
+
+ pfn = mfn_to_pfn(mfn);
+ if (__pfn_to_mfn(pfn) != mfn)
+ return -1; /* force !pfn_valid() */
return pfn;
}
/* VIRT <-> MACHINE conversion */
#define virt_to_machine(v) (phys_to_machine(XPADDR(__pa(v))))
-#define virt_to_pfn(v) (PFN_DOWN(__pa(v)))
+static inline unsigned long virt_to_pfn(const void *v)
+{
+ return PFN_DOWN(__pa(v));
+}
#define virt_to_mfn(v) (pfn_to_mfn(virt_to_pfn(v)))
#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT))
+/* VIRT <-> GUEST conversion */
+#define virt_to_gfn(v) (pfn_to_gfn(virt_to_pfn(v)))
+#define gfn_to_virt(g) (__va(gfn_to_pfn(g) << PAGE_SHIFT))
+
static inline unsigned long pte_mfn(pte_t pte)
{
- return (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT;
+ return (pte.pte & XEN_PTE_MFN_MASK) >> PAGE_SHIFT;
}
static inline pte_t mfn_pte(unsigned long page_nr, pgprot_t pgprot)
@@ -155,18 +333,28 @@ static inline pte_t __pte_ma(pteval_t x)
#define pmd_val_ma(v) ((v).pmd)
#ifdef __PAGETABLE_PUD_FOLDED
-#define pud_val_ma(v) ((v).pgd.pgd)
+#define pud_val_ma(v) ((v).p4d.pgd.pgd)
#else
#define pud_val_ma(v) ((v).pud)
#endif
#define __pmd_ma(x) ((pmd_t) { (x) } )
-#define pgd_val_ma(x) ((x).pgd)
-
+#ifdef __PAGETABLE_P4D_FOLDED
+#define p4d_val_ma(x) ((x).pgd.pgd)
+#else
+#define p4d_val_ma(x) ((x).p4d)
+#endif
xmaddr_t arbitrary_virt_to_machine(void *address);
unsigned long arbitrary_virt_to_mfn(void *vaddr);
void make_lowmem_page_readonly(void *vaddr);
void make_lowmem_page_readwrite(void *vaddr);
+static inline bool xen_arch_need_swiotlb(struct device *dev,
+ phys_addr_t phys,
+ dma_addr_t dev_addr)
+{
+ return false;
+}
+
#endif /* _ASM_X86_XEN_PAGE_H */
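virt_to_gfn() is the conversion most drivers want when exporting a page to another domain: it yields the mfn on PV and the pfn on HVM/PVH, matching the pfn_to_gfn() helper above. A hedged sketch of allocating a page and obtaining the frame number a backend expects, assuming kernel context:

/* Illustrative: allocate a page to be shared and return its gfn (0 on failure). */
static unsigned long alloc_shared_page_gfn(void)
{
	void *page = (void *)get_zeroed_page(GFP_KERNEL);

	if (!page)
		return 0;
	return virt_to_gfn(page);	/* mfn on PV, pfn on HVM/PVH */
}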
diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h
new file mode 100644
index 000000000000..9015b888edd6
--- /dev/null
+++ b/arch/x86/include/asm/xen/pci.h
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_XEN_PCI_H
+#define _ASM_X86_XEN_PCI_H
+
+#if defined(CONFIG_PCI_XEN)
+extern int __init pci_xen_init(void);
+extern int __init pci_xen_hvm_init(void);
+#define pci_xen 1
+#else
+#define pci_xen 0
+#define pci_xen_init (0)
+static inline int pci_xen_hvm_init(void)
+{
+ return -1;
+}
+#endif
+#ifdef CONFIG_XEN_PV_DOM0
+int __init pci_xen_initial_domain(void);
+#else
+static inline int __init pci_xen_initial_domain(void)
+{
+ return -1;
+}
+#endif
+
+#if defined(CONFIG_PCI_MSI)
+#if defined(CONFIG_PCI_XEN)
+/* drivers/pci/xen-pcifront.c fills this structure with
+ * its own functions.
+ */
+struct xen_pci_frontend_ops {
+ int (*enable_msi)(struct pci_dev *dev, int vectors[]);
+ void (*disable_msi)(struct pci_dev *dev);
+ int (*enable_msix)(struct pci_dev *dev, int vectors[], int nvec);
+ void (*disable_msix)(struct pci_dev *dev);
+};
+
+extern struct xen_pci_frontend_ops *xen_pci_frontend;
+
+static inline int xen_pci_frontend_enable_msi(struct pci_dev *dev,
+ int vectors[])
+{
+ if (xen_pci_frontend && xen_pci_frontend->enable_msi)
+ return xen_pci_frontend->enable_msi(dev, vectors);
+ return -ENOSYS;
+}
+static inline void xen_pci_frontend_disable_msi(struct pci_dev *dev)
+{
+ if (xen_pci_frontend && xen_pci_frontend->disable_msi)
+ xen_pci_frontend->disable_msi(dev);
+}
+static inline int xen_pci_frontend_enable_msix(struct pci_dev *dev,
+ int vectors[], int nvec)
+{
+ if (xen_pci_frontend && xen_pci_frontend->enable_msix)
+ return xen_pci_frontend->enable_msix(dev, vectors, nvec);
+ return -ENOSYS;
+}
+static inline void xen_pci_frontend_disable_msix(struct pci_dev *dev)
+{
+ if (xen_pci_frontend && xen_pci_frontend->disable_msix)
+ xen_pci_frontend->disable_msix(dev);
+}
+#endif /* CONFIG_PCI_XEN */
+#endif /* CONFIG_PCI_MSI */
+
+#endif /* _ASM_X86_XEN_PCI_H */
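The xen_pci_frontend pointer above is a classic ops-table hook: the inline wrappers return -ENOSYS until a frontend registers itself, and drivers/pci/xen-pcifront.c installs its own implementation at init time. A hedged sketch of the registering side; the pcifront_*() handlers are hypothetical stand-ins for the frontend's real functions:

/* Illustrative registration of the MSI frontend ops. */
static struct xen_pci_frontend_ops pcifront_msi_ops = {
	.enable_msi   = pcifront_enable_msi,
	.disable_msi  = pcifront_disable_msi,
	.enable_msix  = pcifront_enable_msix,
	.disable_msix = pcifront_disable_msix,
};

static int __init pcifront_register_msi_ops(void)
{
	xen_pci_frontend = &pcifront_msi_ops;
	return 0;
}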
diff --git a/arch/x86/include/asm/xen/swiotlb-xen.h b/arch/x86/include/asm/xen/swiotlb-xen.h
new file mode 100644
index 000000000000..abde0f44df57
--- /dev/null
+++ b/arch/x86/include/asm/xen/swiotlb-xen.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SWIOTLB_XEN_H
+#define _ASM_X86_SWIOTLB_XEN_H
+
+int xen_swiotlb_fixup(void *buf, unsigned long nslabs);
+int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
+ unsigned int address_bits,
+ dma_addr_t *dma_handle);
+void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order);
+
+#endif /* _ASM_X86_SWIOTLB_XEN_H */
diff --git a/arch/x86/include/asm/xen/trace_types.h b/arch/x86/include/asm/xen/trace_types.h
new file mode 100644
index 000000000000..2aad0abd68e2
--- /dev/null
+++ b/arch/x86/include/asm/xen/trace_types.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_XEN_TRACE_TYPES_H
+#define _ASM_XEN_TRACE_TYPES_H
+
+enum xen_mc_flush_reason {
+ XEN_MC_FL_NONE, /* explicit flush */
+ XEN_MC_FL_BATCH, /* out of hypercall space */
+ XEN_MC_FL_ARGS, /* out of argument space */
+ XEN_MC_FL_CALLBACK, /* out of callback space */
+};
+
+enum xen_mc_extend_args {
+ XEN_MC_XE_OK,
+ XEN_MC_XE_BAD_OP,
+ XEN_MC_XE_NO_SPACE
+};
+typedef void (*xen_mc_callback_fn_t)(void *);
+
+#endif /* _ASM_XEN_TRACE_TYPES_H */
diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h
index 7fcf6f3dbcc3..7b0307acc410 100644
--- a/arch/x86/include/asm/xor.h
+++ b/arch/x86/include/asm/xor.h
@@ -1,10 +1,502 @@
-#ifdef CONFIG_KMEMCHECK
-/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */
-# include <asm-generic/xor.h>
-#else
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _ASM_X86_XOR_H
+#define _ASM_X86_XOR_H
+
+/*
+ * Optimized RAID-5 checksumming functions for SSE.
+ */
+
+/*
+ * Cache avoiding checksumming functions utilizing KNI instructions
+ * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
+ */
+
+/*
+ * Based on
+ * High-speed RAID5 checksumming functions utilizing SSE instructions.
+ * Copyright (C) 1998 Ingo Molnar.
+ */
+
+/*
+ * x86-64 changes / gcc fixes from Andi Kleen.
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ *
+ * This hasn't been optimized for the Hammer yet, but there are likely
+ * no advantages to be gained from x86-64 here anyway.
+ */
+
+#include <asm/fpu/api.h>
+
#ifdef CONFIG_X86_32
-# include "xor_32.h"
+/* reduce register pressure */
+# define XOR_CONSTANT_CONSTRAINT "i"
#else
-# include "xor_64.h"
+# define XOR_CONSTANT_CONSTRAINT "re"
#endif
+
+#define OFFS(x) "16*("#x")"
+#define PF_OFFS(x) "256+16*("#x")"
+#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
+#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
+#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
+#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
+#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
+#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
+#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
+#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
+#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
+#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
+#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
+#define NOP(x)
+
+#define BLK64(pf, op, i) \
+ pf(i) \
+ op(i, 0) \
+ op(i + 1, 1) \
+ op(i + 2, 2) \
+ op(i + 3, 3)
+
+static void
+xor_sse_2(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2)
+{
+ unsigned long lines = bytes >> 8;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
+ PF1(i) \
+ PF1(i + 2) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ PF0(i + 4) \
+ PF0(i + 6) \
+ XO1(i, 0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ ST(i, 0) \
+ ST(i + 1, 1) \
+ ST(i + 2, 2) \
+ ST(i + 3, 3) \
+
+
+ PF0(0)
+ PF0(2)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " add %[inc], %[p1] ;\n"
+ " add %[inc], %[p2] ;\n"
+ " dec %[cnt] ;\n"
+ " jnz 1b ;\n"
+ : [cnt] "+r" (lines),
+ [p1] "+r" (p1), [p2] "+r" (p2)
+ : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+static void
+xor_sse_2_pf64(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2)
+{
+ unsigned long lines = bytes >> 8;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ BLK64(PF0, LD, i) \
+ BLK64(PF1, XO1, i) \
+ BLK64(NOP, ST, i) \
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " add %[inc], %[p1] ;\n"
+ " add %[inc], %[p2] ;\n"
+ " dec %[cnt] ;\n"
+ " jnz 1b ;\n"
+ : [cnt] "+r" (lines),
+ [p1] "+r" (p1), [p2] "+r" (p2)
+ : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+static void
+xor_sse_3(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3)
+{
+ unsigned long lines = bytes >> 8;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ PF1(i) \
+ PF1(i + 2) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ PF2(i) \
+ PF2(i + 2) \
+ PF0(i + 4) \
+ PF0(i + 6) \
+ XO1(i, 0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ XO2(i, 0) \
+ XO2(i + 1, 1) \
+ XO2(i + 2, 2) \
+ XO2(i + 3, 3) \
+ ST(i, 0) \
+ ST(i + 1, 1) \
+ ST(i + 2, 2) \
+ ST(i + 3, 3) \
+
+
+ PF0(0)
+ PF0(2)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " add %[inc], %[p1] ;\n"
+ " add %[inc], %[p2] ;\n"
+ " add %[inc], %[p3] ;\n"
+ " dec %[cnt] ;\n"
+ " jnz 1b ;\n"
+ : [cnt] "+r" (lines),
+ [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
+ : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+static void
+xor_sse_3_pf64(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3)
+{
+ unsigned long lines = bytes >> 8;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ BLK64(PF0, LD, i) \
+ BLK64(PF1, XO1, i) \
+ BLK64(PF2, XO2, i) \
+ BLK64(NOP, ST, i) \
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " add %[inc], %[p1] ;\n"
+ " add %[inc], %[p2] ;\n"
+ " add %[inc], %[p3] ;\n"
+ " dec %[cnt] ;\n"
+ " jnz 1b ;\n"
+ : [cnt] "+r" (lines),
+ [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
+ : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+static void
+xor_sse_4(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3,
+ const unsigned long * __restrict p4)
+{
+ unsigned long lines = bytes >> 8;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ PF1(i) \
+ PF1(i + 2) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ PF2(i) \
+ PF2(i + 2) \
+ XO1(i, 0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ PF3(i) \
+ PF3(i + 2) \
+ PF0(i + 4) \
+ PF0(i + 6) \
+ XO2(i, 0) \
+ XO2(i + 1, 1) \
+ XO2(i + 2, 2) \
+ XO2(i + 3, 3) \
+ XO3(i, 0) \
+ XO3(i + 1, 1) \
+ XO3(i + 2, 2) \
+ XO3(i + 3, 3) \
+ ST(i, 0) \
+ ST(i + 1, 1) \
+ ST(i + 2, 2) \
+ ST(i + 3, 3) \
+
+
+ PF0(0)
+ PF0(2)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " add %[inc], %[p1] ;\n"
+ " add %[inc], %[p2] ;\n"
+ " add %[inc], %[p3] ;\n"
+ " add %[inc], %[p4] ;\n"
+ " dec %[cnt] ;\n"
+ " jnz 1b ;\n"
+ : [cnt] "+r" (lines), [p1] "+r" (p1),
+ [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
+ : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+static void
+xor_sse_4_pf64(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3,
+ const unsigned long * __restrict p4)
+{
+ unsigned long lines = bytes >> 8;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ BLK64(PF0, LD, i) \
+ BLK64(PF1, XO1, i) \
+ BLK64(PF2, XO2, i) \
+ BLK64(PF3, XO3, i) \
+ BLK64(NOP, ST, i) \
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " add %[inc], %[p1] ;\n"
+ " add %[inc], %[p2] ;\n"
+ " add %[inc], %[p3] ;\n"
+ " add %[inc], %[p4] ;\n"
+ " dec %[cnt] ;\n"
+ " jnz 1b ;\n"
+ : [cnt] "+r" (lines), [p1] "+r" (p1),
+ [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
+ : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+static void
+xor_sse_5(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3,
+ const unsigned long * __restrict p4,
+ const unsigned long * __restrict p5)
+{
+ unsigned long lines = bytes >> 8;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ PF1(i) \
+ PF1(i + 2) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ PF2(i) \
+ PF2(i + 2) \
+ XO1(i, 0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ PF3(i) \
+ PF3(i + 2) \
+ XO2(i, 0) \
+ XO2(i + 1, 1) \
+ XO2(i + 2, 2) \
+ XO2(i + 3, 3) \
+ PF4(i) \
+ PF4(i + 2) \
+ PF0(i + 4) \
+ PF0(i + 6) \
+ XO3(i, 0) \
+ XO3(i + 1, 1) \
+ XO3(i + 2, 2) \
+ XO3(i + 3, 3) \
+ XO4(i, 0) \
+ XO4(i + 1, 1) \
+ XO4(i + 2, 2) \
+ XO4(i + 3, 3) \
+ ST(i, 0) \
+ ST(i + 1, 1) \
+ ST(i + 2, 2) \
+ ST(i + 3, 3) \
+
+
+ PF0(0)
+ PF0(2)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " add %[inc], %[p1] ;\n"
+ " add %[inc], %[p2] ;\n"
+ " add %[inc], %[p3] ;\n"
+ " add %[inc], %[p4] ;\n"
+ " add %[inc], %[p5] ;\n"
+ " dec %[cnt] ;\n"
+ " jnz 1b ;\n"
+ : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
+ [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
+ : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+static void
+xor_sse_5_pf64(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3,
+ const unsigned long * __restrict p4,
+ const unsigned long * __restrict p5)
+{
+ unsigned long lines = bytes >> 8;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ BLK64(PF0, LD, i) \
+ BLK64(PF1, XO1, i) \
+ BLK64(PF2, XO2, i) \
+ BLK64(PF3, XO3, i) \
+ BLK64(PF4, XO4, i) \
+ BLK64(NOP, ST, i) \
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " add %[inc], %[p1] ;\n"
+ " add %[inc], %[p2] ;\n"
+ " add %[inc], %[p3] ;\n"
+ " add %[inc], %[p4] ;\n"
+ " add %[inc], %[p5] ;\n"
+ " dec %[cnt] ;\n"
+ " jnz 1b ;\n"
+ : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
+ [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
+ : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+static struct xor_block_template xor_block_sse_pf64 = {
+ .name = "prefetch64-sse",
+ .do_2 = xor_sse_2_pf64,
+ .do_3 = xor_sse_3_pf64,
+ .do_4 = xor_sse_4_pf64,
+ .do_5 = xor_sse_5_pf64,
+};
+
+#undef LD
+#undef XO1
+#undef XO2
+#undef XO3
+#undef XO4
+#undef ST
+#undef NOP
+#undef BLK64
+#undef BLOCK
+
+#undef XOR_CONSTANT_CONSTRAINT
+
+#ifdef CONFIG_X86_32
+# include <asm/xor_32.h>
+#else
+# include <asm/xor_64.h>
#endif
+
+#define XOR_SELECT_TEMPLATE(FASTEST) \
+ AVX_SELECT(FASTEST)
+
+#endif /* _ASM_X86_XOR_H */
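For readers who do not want to decode the inline assembly above: every xor_sse_N() routine XORs N-1 source buffers into p1, working on 256-byte blocks (sixteen 16-byte xmm chunks per iteration) and issuing non-temporal prefetches so the streams do not evict useful cache lines; the byte count is assumed to be a multiple of 256. A plain-C statement of what xor_sse_2() computes, assuming nothing beyond the standard library:

#include <stddef.h>

/* Reference semantics of xor_sse_2(): p1[i] ^= p2[i] over 'bytes' bytes.
 * The real routine does the same work in 256-byte SSE blocks with
 * prefetchnta hints, inside kernel_fpu_begin()/kernel_fpu_end(). */
static void xor_2_reference(unsigned long bytes,
			    unsigned long *p1, const unsigned long *p2)
{
	size_t i, words = bytes / sizeof(unsigned long);

	for (i = 0; i < words; i++)
		p1[i] ^= p2[i];
}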
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
index 133b40a0f495..7a6b9474591e 100644
--- a/arch/x86/include/asm/xor_32.h
+++ b/arch/x86/include/asm/xor_32.h
@@ -1,17 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _ASM_X86_XOR_32_H
#define _ASM_X86_XOR_32_H
/*
- * Optimized RAID-5 checksumming functions for MMX and SSE.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * You should have received a copy of the GNU General Public License
- * (for example /usr/src/linux/COPYING); if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * Optimized RAID-5 checksumming functions for MMX.
*/
/*
@@ -26,10 +18,11 @@
#define XO3(x, y) " pxor 8*("#x")(%4), %%mm"#y" ;\n"
#define XO4(x, y) " pxor 8*("#x")(%5), %%mm"#y" ;\n"
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
static void
-xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+xor_pII_mmx_2(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2)
{
unsigned long lines = bytes >> 7;
@@ -72,8 +65,9 @@ xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
}
static void
-xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3)
+xor_pII_mmx_3(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3)
{
unsigned long lines = bytes >> 7;
@@ -121,8 +115,10 @@ xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
-xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3, unsigned long *p4)
+xor_pII_mmx_4(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3,
+ const unsigned long * __restrict p4)
{
unsigned long lines = bytes >> 7;
@@ -176,8 +172,11 @@ xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
static void
-xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3, unsigned long *p4, unsigned long *p5)
+xor_pII_mmx_5(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3,
+ const unsigned long * __restrict p4,
+ const unsigned long * __restrict p5)
{
unsigned long lines = bytes >> 7;
@@ -256,7 +255,8 @@ xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
#undef BLOCK
static void
-xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+xor_p5_mmx_2(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2)
{
unsigned long lines = bytes >> 6;
@@ -303,8 +303,9 @@ xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
}
static void
-xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3)
+xor_p5_mmx_3(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3)
{
unsigned long lines = bytes >> 6;
@@ -360,8 +361,10 @@ xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
-xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3, unsigned long *p4)
+xor_p5_mmx_4(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3,
+ const unsigned long * __restrict p4)
{
unsigned long lines = bytes >> 6;
@@ -426,8 +429,11 @@ xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
}
static void
-xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3, unsigned long *p4, unsigned long *p5)
+xor_p5_mmx_5(unsigned long bytes, unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3,
+ const unsigned long * __restrict p4,
+ const unsigned long * __restrict p5)
{
unsigned long lines = bytes >> 6;
@@ -529,330 +535,6 @@ static struct xor_block_template xor_block_p5_mmx = {
.do_5 = xor_p5_mmx_5,
};
-/*
- * Cache avoiding checksumming functions utilizing KNI instructions
- * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
- */
-
-#define XMMS_SAVE \
-do { \
- preempt_disable(); \
- cr0 = read_cr0(); \
- clts(); \
- asm volatile( \
- "movups %%xmm0,(%0) ;\n\t" \
- "movups %%xmm1,0x10(%0) ;\n\t" \
- "movups %%xmm2,0x20(%0) ;\n\t" \
- "movups %%xmm3,0x30(%0) ;\n\t" \
- : \
- : "r" (xmm_save) \
- : "memory"); \
-} while (0)
-
-#define XMMS_RESTORE \
-do { \
- asm volatile( \
- "sfence ;\n\t" \
- "movups (%0),%%xmm0 ;\n\t" \
- "movups 0x10(%0),%%xmm1 ;\n\t" \
- "movups 0x20(%0),%%xmm2 ;\n\t" \
- "movups 0x30(%0),%%xmm3 ;\n\t" \
- : \
- : "r" (xmm_save) \
- : "memory"); \
- write_cr0(cr0); \
- preempt_enable(); \
-} while (0)
-
-#define ALIGN16 __attribute__((aligned(16)))
-
-#define OFFS(x) "16*("#x")"
-#define PF_OFFS(x) "256+16*("#x")"
-#define PF0(x) " prefetchnta "PF_OFFS(x)"(%1) ;\n"
-#define LD(x, y) " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n"
-#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n"
-#define PF1(x) " prefetchnta "PF_OFFS(x)"(%2) ;\n"
-#define PF2(x) " prefetchnta "PF_OFFS(x)"(%3) ;\n"
-#define PF3(x) " prefetchnta "PF_OFFS(x)"(%4) ;\n"
-#define PF4(x) " prefetchnta "PF_OFFS(x)"(%5) ;\n"
-#define PF5(x) " prefetchnta "PF_OFFS(x)"(%6) ;\n"
-#define XO1(x, y) " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n"
-#define XO2(x, y) " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n"
-#define XO3(x, y) " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n"
-#define XO4(x, y) " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n"
-#define XO5(x, y) " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n"
-
-
-static void
-xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
-{
- unsigned long lines = bytes >> 8;
- char xmm_save[16*4] ALIGN16;
- int cr0;
-
- XMMS_SAVE;
-
- asm volatile(
-#undef BLOCK
-#define BLOCK(i) \
- LD(i, 0) \
- LD(i + 1, 1) \
- PF1(i) \
- PF1(i + 2) \
- LD(i + 2, 2) \
- LD(i + 3, 3) \
- PF0(i + 4) \
- PF0(i + 6) \
- XO1(i, 0) \
- XO1(i + 1, 1) \
- XO1(i + 2, 2) \
- XO1(i + 3, 3) \
- ST(i, 0) \
- ST(i + 1, 1) \
- ST(i + 2, 2) \
- ST(i + 3, 3) \
-
-
- PF0(0)
- PF0(2)
-
- " .align 32 ;\n"
- " 1: ;\n"
-
- BLOCK(0)
- BLOCK(4)
- BLOCK(8)
- BLOCK(12)
-
- " addl $256, %1 ;\n"
- " addl $256, %2 ;\n"
- " decl %0 ;\n"
- " jnz 1b ;\n"
- : "+r" (lines),
- "+r" (p1), "+r" (p2)
- :
- : "memory");
-
- XMMS_RESTORE;
-}
-
-static void
-xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3)
-{
- unsigned long lines = bytes >> 8;
- char xmm_save[16*4] ALIGN16;
- int cr0;
-
- XMMS_SAVE;
-
- asm volatile(
-#undef BLOCK
-#define BLOCK(i) \
- PF1(i) \
- PF1(i + 2) \
- LD(i,0) \
- LD(i + 1, 1) \
- LD(i + 2, 2) \
- LD(i + 3, 3) \
- PF2(i) \
- PF2(i + 2) \
- PF0(i + 4) \
- PF0(i + 6) \
- XO1(i,0) \
- XO1(i + 1, 1) \
- XO1(i + 2, 2) \
- XO1(i + 3, 3) \
- XO2(i,0) \
- XO2(i + 1, 1) \
- XO2(i + 2, 2) \
- XO2(i + 3, 3) \
- ST(i,0) \
- ST(i + 1, 1) \
- ST(i + 2, 2) \
- ST(i + 3, 3) \
-
-
- PF0(0)
- PF0(2)
-
- " .align 32 ;\n"
- " 1: ;\n"
-
- BLOCK(0)
- BLOCK(4)
- BLOCK(8)
- BLOCK(12)
-
- " addl $256, %1 ;\n"
- " addl $256, %2 ;\n"
- " addl $256, %3 ;\n"
- " decl %0 ;\n"
- " jnz 1b ;\n"
- : "+r" (lines),
- "+r" (p1), "+r"(p2), "+r"(p3)
- :
- : "memory" );
-
- XMMS_RESTORE;
-}
-
-static void
-xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3, unsigned long *p4)
-{
- unsigned long lines = bytes >> 8;
- char xmm_save[16*4] ALIGN16;
- int cr0;
-
- XMMS_SAVE;
-
- asm volatile(
-#undef BLOCK
-#define BLOCK(i) \
- PF1(i) \
- PF1(i + 2) \
- LD(i,0) \
- LD(i + 1, 1) \
- LD(i + 2, 2) \
- LD(i + 3, 3) \
- PF2(i) \
- PF2(i + 2) \
- XO1(i,0) \
- XO1(i + 1, 1) \
- XO1(i + 2, 2) \
- XO1(i + 3, 3) \
- PF3(i) \
- PF3(i + 2) \
- PF0(i + 4) \
- PF0(i + 6) \
- XO2(i,0) \
- XO2(i + 1, 1) \
- XO2(i + 2, 2) \
- XO2(i + 3, 3) \
- XO3(i,0) \
- XO3(i + 1, 1) \
- XO3(i + 2, 2) \
- XO3(i + 3, 3) \
- ST(i,0) \
- ST(i + 1, 1) \
- ST(i + 2, 2) \
- ST(i + 3, 3) \
-
-
- PF0(0)
- PF0(2)
-
- " .align 32 ;\n"
- " 1: ;\n"
-
- BLOCK(0)
- BLOCK(4)
- BLOCK(8)
- BLOCK(12)
-
- " addl $256, %1 ;\n"
- " addl $256, %2 ;\n"
- " addl $256, %3 ;\n"
- " addl $256, %4 ;\n"
- " decl %0 ;\n"
- " jnz 1b ;\n"
- : "+r" (lines),
- "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
- :
- : "memory" );
-
- XMMS_RESTORE;
-}
-
-static void
-xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3, unsigned long *p4, unsigned long *p5)
-{
- unsigned long lines = bytes >> 8;
- char xmm_save[16*4] ALIGN16;
- int cr0;
-
- XMMS_SAVE;
-
- /* Make sure GCC forgets anything it knows about p4 or p5,
- such that it won't pass to the asm volatile below a
- register that is shared with any other variable. That's
- because we modify p4 and p5 there, but we can't mark them
- as read/write, otherwise we'd overflow the 10-asm-operands
- limit of GCC < 3.1. */
- asm("" : "+r" (p4), "+r" (p5));
-
- asm volatile(
-#undef BLOCK
-#define BLOCK(i) \
- PF1(i) \
- PF1(i + 2) \
- LD(i,0) \
- LD(i + 1, 1) \
- LD(i + 2, 2) \
- LD(i + 3, 3) \
- PF2(i) \
- PF2(i + 2) \
- XO1(i,0) \
- XO1(i + 1, 1) \
- XO1(i + 2, 2) \
- XO1(i + 3, 3) \
- PF3(i) \
- PF3(i + 2) \
- XO2(i,0) \
- XO2(i + 1, 1) \
- XO2(i + 2, 2) \
- XO2(i + 3, 3) \
- PF4(i) \
- PF4(i + 2) \
- PF0(i + 4) \
- PF0(i + 6) \
- XO3(i,0) \
- XO3(i + 1, 1) \
- XO3(i + 2, 2) \
- XO3(i + 3, 3) \
- XO4(i,0) \
- XO4(i + 1, 1) \
- XO4(i + 2, 2) \
- XO4(i + 3, 3) \
- ST(i,0) \
- ST(i + 1, 1) \
- ST(i + 2, 2) \
- ST(i + 3, 3) \
-
-
- PF0(0)
- PF0(2)
-
- " .align 32 ;\n"
- " 1: ;\n"
-
- BLOCK(0)
- BLOCK(4)
- BLOCK(8)
- BLOCK(12)
-
- " addl $256, %1 ;\n"
- " addl $256, %2 ;\n"
- " addl $256, %3 ;\n"
- " addl $256, %4 ;\n"
- " addl $256, %5 ;\n"
- " decl %0 ;\n"
- " jnz 1b ;\n"
- : "+r" (lines),
- "+r" (p1), "+r" (p2), "+r" (p3)
- : "r" (p4), "r" (p5)
- : "memory");
-
- /* p4 and p5 were modified, and now the variables are dead.
- Clobber them just to be sure nobody does something stupid
- like assuming they have some legal value. */
- asm("" : "=r" (p4), "=r" (p5));
-
- XMMS_RESTORE;
-}
-
static struct xor_block_template xor_block_pIII_sse = {
.name = "pIII_sse",
.do_2 = xor_sse_2,
@@ -861,28 +543,31 @@ static struct xor_block_template xor_block_pIII_sse = {
.do_5 = xor_sse_5,
};
+/* Also try the AVX routines */
+#include <asm/xor_avx.h>
+
/* Also try the generic routines. */
#include <asm-generic/xor.h>
+/* We force the use of the SSE xor block because it can write around L2.
+ We may also be able to load only into the L1, depending on how the CPU
+ deals with a load to a line that is being prefetched. */
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES \
do { \
- xor_speed(&xor_block_8regs); \
- xor_speed(&xor_block_8regs_p); \
- xor_speed(&xor_block_32regs); \
- xor_speed(&xor_block_32regs_p); \
- if (cpu_has_xmm) \
+ AVX_XOR_SPEED; \
+ if (boot_cpu_has(X86_FEATURE_XMM)) { \
xor_speed(&xor_block_pIII_sse); \
- if (cpu_has_mmx) { \
+ xor_speed(&xor_block_sse_pf64); \
+ } else if (boot_cpu_has(X86_FEATURE_MMX)) { \
xor_speed(&xor_block_pII_mmx); \
xor_speed(&xor_block_p5_mmx); \
+ } else { \
+ xor_speed(&xor_block_8regs); \
+ xor_speed(&xor_block_8regs_p); \
+ xor_speed(&xor_block_32regs); \
+ xor_speed(&xor_block_32regs_p); \
} \
} while (0)
-/* We force the use of the SSE xor block because it can write around L2.
- We may also be able to load into the L1 only depending on how the cpu
- deals with a load to a line that is being prefetched. */
-#define XOR_SELECT_TEMPLATE(FASTEST) \
- (cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
-
#endif /* _ASM_X86_XOR_32_H */
diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h
index 1549b5e261f6..0307e4ec5044 100644
--- a/arch/x86/include/asm/xor_64.h
+++ b/arch/x86/include/asm/xor_64.h
@@ -1,344 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_XOR_64_H
#define _ASM_X86_XOR_64_H
-/*
- * Optimized RAID-5 checksumming functions for MMX and SSE.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * You should have received a copy of the GNU General Public License
- * (for example /usr/src/linux/COPYING); if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-
-/*
- * Cache avoiding checksumming functions utilizing KNI instructions
- * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
- */
-
-/*
- * Based on
- * High-speed RAID5 checksumming functions utilizing SSE instructions.
- * Copyright (C) 1998 Ingo Molnar.
- */
-
-/*
- * x86-64 changes / gcc fixes from Andi Kleen.
- * Copyright 2002 Andi Kleen, SuSE Labs.
- *
- * This hasn't been optimized for the hammer yet, but there are likely
- * no advantages to be gotten from x86-64 here anyways.
- */
-
-typedef struct {
- unsigned long a, b;
-} __attribute__((aligned(16))) xmm_store_t;
-
-/* Doesn't use gcc to save the XMM registers, because there is no easy way to
- tell it to do a clts before the register saving. */
-#define XMMS_SAVE \
-do { \
- preempt_disable(); \
- asm volatile( \
- "movq %%cr0,%0 ;\n\t" \
- "clts ;\n\t" \
- "movups %%xmm0,(%1) ;\n\t" \
- "movups %%xmm1,0x10(%1) ;\n\t" \
- "movups %%xmm2,0x20(%1) ;\n\t" \
- "movups %%xmm3,0x30(%1) ;\n\t" \
- : "=&r" (cr0) \
- : "r" (xmm_save) \
- : "memory"); \
-} while (0)
-
-#define XMMS_RESTORE \
-do { \
- asm volatile( \
- "sfence ;\n\t" \
- "movups (%1),%%xmm0 ;\n\t" \
- "movups 0x10(%1),%%xmm1 ;\n\t" \
- "movups 0x20(%1),%%xmm2 ;\n\t" \
- "movups 0x30(%1),%%xmm3 ;\n\t" \
- "movq %0,%%cr0 ;\n\t" \
- : \
- : "r" (cr0), "r" (xmm_save) \
- : "memory"); \
- preempt_enable(); \
-} while (0)
-
-#define OFFS(x) "16*("#x")"
-#define PF_OFFS(x) "256+16*("#x")"
-#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
-#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
-#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
-#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
-#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
-#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
-#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
-#define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n"
-#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
-#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
-#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
-#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
-#define XO5(x, y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n"
-
-
-static void
-xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
-{
- unsigned int lines = bytes >> 8;
- unsigned long cr0;
- xmm_store_t xmm_save[4];
-
- XMMS_SAVE;
-
- asm volatile(
-#undef BLOCK
-#define BLOCK(i) \
- LD(i, 0) \
- LD(i + 1, 1) \
- PF1(i) \
- PF1(i + 2) \
- LD(i + 2, 2) \
- LD(i + 3, 3) \
- PF0(i + 4) \
- PF0(i + 6) \
- XO1(i, 0) \
- XO1(i + 1, 1) \
- XO1(i + 2, 2) \
- XO1(i + 3, 3) \
- ST(i, 0) \
- ST(i + 1, 1) \
- ST(i + 2, 2) \
- ST(i + 3, 3) \
-
-
- PF0(0)
- PF0(2)
-
- " .align 32 ;\n"
- " 1: ;\n"
-
- BLOCK(0)
- BLOCK(4)
- BLOCK(8)
- BLOCK(12)
-
- " addq %[inc], %[p1] ;\n"
- " addq %[inc], %[p2] ;\n"
- " decl %[cnt] ; jnz 1b"
- : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
- : [inc] "r" (256UL)
- : "memory");
-
- XMMS_RESTORE;
-}
-
-static void
-xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3)
-{
- unsigned int lines = bytes >> 8;
- xmm_store_t xmm_save[4];
- unsigned long cr0;
-
- XMMS_SAVE;
-
- asm volatile(
-#undef BLOCK
-#define BLOCK(i) \
- PF1(i) \
- PF1(i + 2) \
- LD(i, 0) \
- LD(i + 1, 1) \
- LD(i + 2, 2) \
- LD(i + 3, 3) \
- PF2(i) \
- PF2(i + 2) \
- PF0(i + 4) \
- PF0(i + 6) \
- XO1(i, 0) \
- XO1(i + 1, 1) \
- XO1(i + 2, 2) \
- XO1(i + 3, 3) \
- XO2(i, 0) \
- XO2(i + 1, 1) \
- XO2(i + 2, 2) \
- XO2(i + 3, 3) \
- ST(i, 0) \
- ST(i + 1, 1) \
- ST(i + 2, 2) \
- ST(i + 3, 3) \
-
-
- PF0(0)
- PF0(2)
-
- " .align 32 ;\n"
- " 1: ;\n"
-
- BLOCK(0)
- BLOCK(4)
- BLOCK(8)
- BLOCK(12)
-
- " addq %[inc], %[p1] ;\n"
- " addq %[inc], %[p2] ;\n"
- " addq %[inc], %[p3] ;\n"
- " decl %[cnt] ; jnz 1b"
- : [cnt] "+r" (lines),
- [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
- : [inc] "r" (256UL)
- : "memory");
- XMMS_RESTORE;
-}
-
-static void
-xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3, unsigned long *p4)
-{
- unsigned int lines = bytes >> 8;
- xmm_store_t xmm_save[4];
- unsigned long cr0;
-
- XMMS_SAVE;
-
- asm volatile(
-#undef BLOCK
-#define BLOCK(i) \
- PF1(i) \
- PF1(i + 2) \
- LD(i, 0) \
- LD(i + 1, 1) \
- LD(i + 2, 2) \
- LD(i + 3, 3) \
- PF2(i) \
- PF2(i + 2) \
- XO1(i, 0) \
- XO1(i + 1, 1) \
- XO1(i + 2, 2) \
- XO1(i + 3, 3) \
- PF3(i) \
- PF3(i + 2) \
- PF0(i + 4) \
- PF0(i + 6) \
- XO2(i, 0) \
- XO2(i + 1, 1) \
- XO2(i + 2, 2) \
- XO2(i + 3, 3) \
- XO3(i, 0) \
- XO3(i + 1, 1) \
- XO3(i + 2, 2) \
- XO3(i + 3, 3) \
- ST(i, 0) \
- ST(i + 1, 1) \
- ST(i + 2, 2) \
- ST(i + 3, 3) \
-
-
- PF0(0)
- PF0(2)
-
- " .align 32 ;\n"
- " 1: ;\n"
-
- BLOCK(0)
- BLOCK(4)
- BLOCK(8)
- BLOCK(12)
-
- " addq %[inc], %[p1] ;\n"
- " addq %[inc], %[p2] ;\n"
- " addq %[inc], %[p3] ;\n"
- " addq %[inc], %[p4] ;\n"
- " decl %[cnt] ; jnz 1b"
- : [cnt] "+c" (lines),
- [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
- : [inc] "r" (256UL)
- : "memory" );
-
- XMMS_RESTORE;
-}
-
-static void
-xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
- unsigned long *p3, unsigned long *p4, unsigned long *p5)
-{
- unsigned int lines = bytes >> 8;
- xmm_store_t xmm_save[4];
- unsigned long cr0;
-
- XMMS_SAVE;
-
- asm volatile(
-#undef BLOCK
-#define BLOCK(i) \
- PF1(i) \
- PF1(i + 2) \
- LD(i, 0) \
- LD(i + 1, 1) \
- LD(i + 2, 2) \
- LD(i + 3, 3) \
- PF2(i) \
- PF2(i + 2) \
- XO1(i, 0) \
- XO1(i + 1, 1) \
- XO1(i + 2, 2) \
- XO1(i + 3, 3) \
- PF3(i) \
- PF3(i + 2) \
- XO2(i, 0) \
- XO2(i + 1, 1) \
- XO2(i + 2, 2) \
- XO2(i + 3, 3) \
- PF4(i) \
- PF4(i + 2) \
- PF0(i + 4) \
- PF0(i + 6) \
- XO3(i, 0) \
- XO3(i + 1, 1) \
- XO3(i + 2, 2) \
- XO3(i + 3, 3) \
- XO4(i, 0) \
- XO4(i + 1, 1) \
- XO4(i + 2, 2) \
- XO4(i + 3, 3) \
- ST(i, 0) \
- ST(i + 1, 1) \
- ST(i + 2, 2) \
- ST(i + 3, 3) \
-
-
- PF0(0)
- PF0(2)
-
- " .align 32 ;\n"
- " 1: ;\n"
-
- BLOCK(0)
- BLOCK(4)
- BLOCK(8)
- BLOCK(12)
-
- " addq %[inc], %[p1] ;\n"
- " addq %[inc], %[p2] ;\n"
- " addq %[inc], %[p3] ;\n"
- " addq %[inc], %[p4] ;\n"
- " addq %[inc], %[p5] ;\n"
- " decl %[cnt] ; jnz 1b"
- : [cnt] "+c" (lines),
- [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
- [p5] "+r" (p5)
- : [inc] "r" (256UL)
- : "memory");
-
- XMMS_RESTORE;
-}
-
static struct xor_block_template xor_block_sse = {
.name = "generic_sse",
.do_2 = xor_sse_2,
@@ -347,15 +10,19 @@ static struct xor_block_template xor_block_sse = {
.do_5 = xor_sse_5,
};
+
+/* Also try the AVX routines */
+#include <asm/xor_avx.h>
+
+/* We force the use of the SSE xor block because it can write around L2.
+ We may also be able to load only into the L1, depending on how the CPU
+ deals with a load to a line that is being prefetched. */
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES \
do { \
+ AVX_XOR_SPEED; \
+ xor_speed(&xor_block_sse_pf64); \
xor_speed(&xor_block_sse); \
} while (0)
-/* We force the use of the SSE xor block because it can write around L2.
- We may also be able to load into the L1 only depending on how the cpu
- deals with a load to a line that is being prefetched. */
-#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
-
#endif /* _ASM_X86_XOR_64_H */
diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h
new file mode 100644
index 000000000000..7f81dd5897f4
--- /dev/null
+++ b/arch/x86/include/asm/xor_avx.h
@@ -0,0 +1,178 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _ASM_X86_XOR_AVX_H
+#define _ASM_X86_XOR_AVX_H
+
+/*
+ * Optimized RAID-5 checksumming functions for AVX
+ *
+ * Copyright (C) 2012 Intel Corporation
+ * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
+ *
+ * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
+ */
+
+#include <linux/compiler.h>
+#include <asm/fpu/api.h>
+
+#define BLOCK4(i) \
+ BLOCK(32 * i, 0) \
+ BLOCK(32 * (i + 1), 1) \
+ BLOCK(32 * (i + 2), 2) \
+ BLOCK(32 * (i + 3), 3)
+
+#define BLOCK16() \
+ BLOCK4(0) \
+ BLOCK4(4) \
+ BLOCK4(8) \
+ BLOCK4(12)
+
+static void xor_avx_2(unsigned long bytes, unsigned long * __restrict p0,
+ const unsigned long * __restrict p1)
+{
+ unsigned long lines = bytes >> 9;
+
+ kernel_fpu_begin();
+
+ while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+ asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p0[i / sizeof(*p0)])); \
+ asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+ "=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+ BLOCK16()
+
+ p0 = (unsigned long *)((uintptr_t)p0 + 512);
+ p1 = (unsigned long *)((uintptr_t)p1 + 512);
+ }
+
+ kernel_fpu_end();
+}
+
+static void xor_avx_3(unsigned long bytes, unsigned long * __restrict p0,
+ const unsigned long * __restrict p1,
+ const unsigned long * __restrict p2)
+{
+ unsigned long lines = bytes >> 9;
+
+ kernel_fpu_begin();
+
+ while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+ asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p1[i / sizeof(*p1)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p0[i / sizeof(*p0)])); \
+ asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+ "=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+ BLOCK16()
+
+ p0 = (unsigned long *)((uintptr_t)p0 + 512);
+ p1 = (unsigned long *)((uintptr_t)p1 + 512);
+ p2 = (unsigned long *)((uintptr_t)p2 + 512);
+ }
+
+ kernel_fpu_end();
+}
+
+static void xor_avx_4(unsigned long bytes, unsigned long * __restrict p0,
+ const unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3)
+{
+ unsigned long lines = bytes >> 9;
+
+ kernel_fpu_begin();
+
+ while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+ asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p2[i / sizeof(*p2)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p1[i / sizeof(*p1)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p0[i / sizeof(*p0)])); \
+ asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+ "=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+ BLOCK16();
+
+ p0 = (unsigned long *)((uintptr_t)p0 + 512);
+ p1 = (unsigned long *)((uintptr_t)p1 + 512);
+ p2 = (unsigned long *)((uintptr_t)p2 + 512);
+ p3 = (unsigned long *)((uintptr_t)p3 + 512);
+ }
+
+ kernel_fpu_end();
+}
+
+static void xor_avx_5(unsigned long bytes, unsigned long * __restrict p0,
+ const unsigned long * __restrict p1,
+ const unsigned long * __restrict p2,
+ const unsigned long * __restrict p3,
+ const unsigned long * __restrict p4)
+{
+ unsigned long lines = bytes >> 9;
+
+ kernel_fpu_begin();
+
+ while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+ asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p3[i / sizeof(*p3)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p2[i / sizeof(*p2)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p1[i / sizeof(*p1)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p0[i / sizeof(*p0)])); \
+ asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+ "=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+ BLOCK16()
+
+ p0 = (unsigned long *)((uintptr_t)p0 + 512);
+ p1 = (unsigned long *)((uintptr_t)p1 + 512);
+ p2 = (unsigned long *)((uintptr_t)p2 + 512);
+ p3 = (unsigned long *)((uintptr_t)p3 + 512);
+ p4 = (unsigned long *)((uintptr_t)p4 + 512);
+ }
+
+ kernel_fpu_end();
+}
+
+static struct xor_block_template xor_block_avx = {
+ .name = "avx",
+ .do_2 = xor_avx_2,
+ .do_3 = xor_avx_3,
+ .do_4 = xor_avx_4,
+ .do_5 = xor_avx_5,
+};
+
+#define AVX_XOR_SPEED \
+do { \
+ if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE)) \
+ xor_speed(&xor_block_avx); \
+} while (0)
+
+#define AVX_SELECT(FASTEST) \
+ (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE) ? &xor_block_avx : FASTEST)
+
+#endif
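
AVX_XOR_SPEED and AVX_SELECT are intended to be dropped into an architecture's xor header: the first benchmarks the AVX template next to the others, the second overrides the benchmark winner whenever AVX and OSXSAVE are available. A minimal sketch of a consumer, assuming the usual xor.h machinery (xor_speed(), struct xor_block_template) is in scope; xor_block_example is a placeholder name, not something defined by this patch:

#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES			\
do {						\
	AVX_XOR_SPEED;				\
	xor_speed(&xor_block_example);		\
} while (0)

/* Prefer AVX when usable, otherwise keep whatever benchmarked fastest. */
#define XOR_SELECT_TEMPLATE(FASTEST) AVX_SELECT(FASTEST)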
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
deleted file mode 100644
index 727acc152344..000000000000
--- a/arch/x86/include/asm/xsave.h
+++ /dev/null
@@ -1,119 +0,0 @@
-#ifndef __ASM_X86_XSAVE_H
-#define __ASM_X86_XSAVE_H
-
-#include <linux/types.h>
-#include <asm/processor.h>
-#include <asm/i387.h>
-
-#define XSTATE_FP 0x1
-#define XSTATE_SSE 0x2
-#define XSTATE_YMM 0x4
-
-#define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE)
-
-#define FXSAVE_SIZE 512
-
-/*
- * These are the features that the OS can handle currently.
- */
-#define XCNTXT_MASK (XSTATE_FP | XSTATE_SSE | XSTATE_YMM)
-
-#ifdef CONFIG_X86_64
-#define REX_PREFIX "0x48, "
-#else
-#define REX_PREFIX
-#endif
-
-extern unsigned int xstate_size;
-extern u64 pcntxt_mask;
-extern struct xsave_struct *init_xstate_buf;
-
-extern void xsave_cntxt_init(void);
-extern void xsave_init(void);
-extern int init_fpu(struct task_struct *child);
-extern int check_for_xstate(struct i387_fxsave_struct __user *buf,
- void __user *fpstate,
- struct _fpx_sw_bytes *sw);
-
-static inline int xrstor_checking(struct xsave_struct *fx)
-{
- int err;
-
- asm volatile("1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n\t"
- "2:\n"
- ".section .fixup,\"ax\"\n"
- "3: movl $-1,%[err]\n"
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b, 3b)
- : [err] "=r" (err)
- : "D" (fx), "m" (*fx), "a" (-1), "d" (-1), "0" (0)
- : "memory");
-
- return err;
-}
-
-static inline int xsave_user(struct xsave_struct __user *buf)
-{
- int err;
- __asm__ __volatile__("1: .byte " REX_PREFIX "0x0f,0xae,0x27\n"
- "2:\n"
- ".section .fixup,\"ax\"\n"
- "3: movl $-1,%[err]\n"
- " jmp 2b\n"
- ".previous\n"
- ".section __ex_table,\"a\"\n"
- _ASM_ALIGN "\n"
- _ASM_PTR "1b,3b\n"
- ".previous"
- : [err] "=r" (err)
- : "D" (buf), "a" (-1), "d" (-1), "0" (0)
- : "memory");
- if (unlikely(err) && __clear_user(buf, xstate_size))
- err = -EFAULT;
- /* No need to clear here because the caller clears USED_MATH */
- return err;
-}
-
-static inline int xrestore_user(struct xsave_struct __user *buf, u64 mask)
-{
- int err;
- struct xsave_struct *xstate = ((__force struct xsave_struct *)buf);
- u32 lmask = mask;
- u32 hmask = mask >> 32;
-
- __asm__ __volatile__("1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n"
- "2:\n"
- ".section .fixup,\"ax\"\n"
- "3: movl $-1,%[err]\n"
- " jmp 2b\n"
- ".previous\n"
- ".section __ex_table,\"a\"\n"
- _ASM_ALIGN "\n"
- _ASM_PTR "1b,3b\n"
- ".previous"
- : [err] "=r" (err)
- : "D" (xstate), "a" (lmask), "d" (hmask), "0" (0)
- : "memory"); /* memory required? */
- return err;
-}
-
-static inline void xrstor_state(struct xsave_struct *fx, u64 mask)
-{
- u32 lmask = mask;
- u32 hmask = mask >> 32;
-
- asm volatile(".byte " REX_PREFIX "0x0f,0xae,0x2f\n\t"
- : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
- : "memory");
-}
-
-static inline void xsave(struct task_struct *tsk)
-{
- /* This, however, we can work around by forcing the compiler to select
- an addressing mode that doesn't require extended registers. */
- __asm__ __volatile__(".byte " REX_PREFIX "0x0f,0xae,0x27"
- : : "D" (&(tsk->thread.xstate->xsave)),
- "a" (-1), "d"(-1) : "memory");
-}
-#endif
diff --git a/arch/x86/include/uapi/asm/Kbuild b/arch/x86/include/uapi/asm/Kbuild
new file mode 100644
index 000000000000..39606a856d3b
--- /dev/null
+++ b/arch/x86/include/uapi/asm/Kbuild
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
+generated-y += unistd_32.h
+generated-y += unistd_64.h
+generated-y += unistd_x32.h
diff --git a/arch/x86/include/asm/a.out.h b/arch/x86/include/uapi/asm/a.out.h
index 4684f97a5bbd..094c49d8ea8e 100644
--- a/arch/x86/include/asm/a.out.h
+++ b/arch/x86/include/uapi/asm/a.out.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
#ifndef _ASM_X86_A_OUT_H
#define _ASM_X86_A_OUT_H
diff --git a/arch/x86/include/uapi/asm/amd_hsmp.h b/arch/x86/include/uapi/asm/amd_hsmp.h
new file mode 100644
index 000000000000..92d8f256d096
--- /dev/null
+++ b/arch/x86/include/uapi/asm/amd_hsmp.h
@@ -0,0 +1,477 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+
+#ifndef _UAPI_ASM_X86_AMD_HSMP_H_
+#define _UAPI_ASM_X86_AMD_HSMP_H_
+
+#include <linux/types.h>
+
+#pragma pack(4)
+
+#define HSMP_MAX_MSG_LEN 8
+
+/*
+ * HSMP Messages supported
+ */
+enum hsmp_message_ids {
+ HSMP_TEST = 1, /* 01h Increments input value by 1 */
+ HSMP_GET_SMU_VER, /* 02h SMU FW version */
+ HSMP_GET_PROTO_VER, /* 03h HSMP interface version */
+ HSMP_GET_SOCKET_POWER, /* 04h average package power consumption */
+ HSMP_SET_SOCKET_POWER_LIMIT, /* 05h Set the socket power limit */
+ HSMP_GET_SOCKET_POWER_LIMIT, /* 06h Get current socket power limit */
+ HSMP_GET_SOCKET_POWER_LIMIT_MAX,/* 07h Get maximum socket power value */
+ HSMP_SET_BOOST_LIMIT, /* 08h Set a core maximum frequency limit */
+ HSMP_SET_BOOST_LIMIT_SOCKET, /* 09h Set socket maximum frequency level */
+ HSMP_GET_BOOST_LIMIT, /* 0Ah Get current frequency limit */
+ HSMP_GET_PROC_HOT, /* 0Bh Get PROCHOT status */
+ HSMP_SET_XGMI_LINK_WIDTH, /* 0Ch Set max and min width of xGMI Link */
+ HSMP_SET_DF_PSTATE, /* 0Dh Alter APEnable/Disable messages behavior */
+ HSMP_SET_AUTO_DF_PSTATE, /* 0Eh Enable DF P-State Performance Boost algorithm */
+ HSMP_GET_FCLK_MCLK, /* 0Fh Get FCLK and MEMCLK for current socket */
+ HSMP_GET_CCLK_THROTTLE_LIMIT, /* 10h Get CCLK frequency limit in socket */
+ HSMP_GET_C0_PERCENT, /* 11h Get average C0 residency in socket */
+ HSMP_SET_NBIO_DPM_LEVEL, /* 12h Set max/min LCLK DPM Level for a given NBIO */
+ HSMP_GET_NBIO_DPM_LEVEL, /* 13h Get LCLK DPM level min and max for a given NBIO */
+ HSMP_GET_DDR_BANDWIDTH, /* 14h Get theoretical maximum and current DDR Bandwidth */
+ HSMP_GET_TEMP_MONITOR, /* 15h Get socket temperature */
+ HSMP_GET_DIMM_TEMP_RANGE, /* 16h Get per-DIMM temperature range and refresh rate */
+ HSMP_GET_DIMM_POWER, /* 17h Get per-DIMM power consumption */
+ HSMP_GET_DIMM_THERMAL, /* 18h Get per-DIMM thermal sensors */
+ HSMP_GET_SOCKET_FREQ_LIMIT, /* 19h Get current active frequency per socket */
+ HSMP_GET_CCLK_CORE_LIMIT, /* 1Ah Get CCLK frequency limit per core */
+ HSMP_GET_RAILS_SVI, /* 1Bh Get SVI-based Telemetry for all rails */
+ HSMP_GET_SOCKET_FMAX_FMIN, /* 1Ch Get Fmax and Fmin per socket */
+ HSMP_GET_IOLINK_BANDWITH, /* 1Dh Get current bandwidth on IO Link */
+ HSMP_GET_XGMI_BANDWITH, /* 1Eh Get current bandwidth on xGMI Link */
+ HSMP_SET_GMI3_WIDTH, /* 1Fh Set max and min GMI3 Link width */
+ HSMP_SET_PCI_RATE, /* 20h Control link rate on PCIe devices */
+ HSMP_SET_POWER_MODE, /* 21h Select power efficiency profile policy */
+ HSMP_SET_PSTATE_MAX_MIN, /* 22h Set the max and min DF P-State */
+ HSMP_GET_METRIC_TABLE_VER, /* 23h Get metrics table version */
+ HSMP_GET_METRIC_TABLE, /* 24h Get metrics table */
+ HSMP_GET_METRIC_TABLE_DRAM_ADDR,/* 25h Get metrics table dram address */
+ HSMP_SET_XGMI_PSTATE_RANGE, /* 26h Set xGMI P-state range */
+ HSMP_CPU_RAIL_ISO_FREQ_POLICY, /* 27h Get/Set Cpu Iso frequency policy */
+ HSMP_DFC_ENABLE_CTRL, /* 28h Enable/Disable DF C-state */
+ HSMP_GET_RAPL_UNITS = 0x30, /* 30h Get scaling factor for energy */
+ HSMP_GET_RAPL_CORE_COUNTER, /* 31h Get core energy counter value */
+ HSMP_GET_RAPL_PACKAGE_COUNTER, /* 32h Get package energy counter value */
+ HSMP_MSG_ID_MAX,
+};
+
+struct hsmp_message {
+ __u32 msg_id; /* Message ID */
+ __u16 num_args; /* Number of input argument words in message */
+ __u16 response_sz; /* Number of expected output/response words */
+ __u32 args[HSMP_MAX_MSG_LEN]; /* argument/response buffer */
+ __u16 sock_ind; /* socket number */
+};
+
+enum hsmp_msg_type {
+ HSMP_RSVD = -1,
+ HSMP_SET = 0,
+ HSMP_GET = 1,
+ HSMP_SET_GET = 2,
+};
+
+enum hsmp_proto_versions {
+ HSMP_PROTO_VER2 = 2,
+ HSMP_PROTO_VER3,
+ HSMP_PROTO_VER4,
+ HSMP_PROTO_VER5,
+ HSMP_PROTO_VER6,
+ HSMP_PROTO_VER7
+};
+
+struct hsmp_msg_desc {
+ int num_args;
+ int response_sz;
+ enum hsmp_msg_type type;
+};
+
+/*
+ * These comments may be used as a reference; the supported list of
+ * messages and their definitions is in the HSMP chapter of the
+ * respective family/model PPR.
+ *
+ * Unsupported messages return -ENOMSG.
+ */
+static const struct hsmp_msg_desc hsmp_msg_desc_table[]
+ __attribute__((unused)) = {
+ /* RESERVED */
+ {0, 0, HSMP_RSVD},
+
+ /*
+ * HSMP_TEST, num_args = 1, response_sz = 1
+ * input: args[0] = xx
+ * output: args[0] = xx + 1
+ */
+ {1, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_SMU_VER, num_args = 0, response_sz = 1
+ * output: args[0] = smu fw ver
+ */
+ {0, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_PROTO_VER, num_args = 0, response_sz = 1
+ * output: args[0] = proto version
+ */
+ {0, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_SOCKET_POWER, num_args = 0, response_sz = 1
+ * output: args[0] = socket power in mWatts
+ */
+ {0, 1, HSMP_GET},
+
+ /*
+ * HSMP_SET_SOCKET_POWER_LIMIT, num_args = 1, response_sz = 0
+ * input: args[0] = power limit value in mWatts
+ */
+ {1, 0, HSMP_SET},
+
+ /*
+ * HSMP_GET_SOCKET_POWER_LIMIT, num_args = 0, response_sz = 1
+ * output: args[0] = socket power limit value in mWatts
+ */
+ {0, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_SOCKET_POWER_LIMIT_MAX, num_args = 0, response_sz = 1
+ * output: args[0] = maximum socket power limit in mWatts
+ */
+ {0, 1, HSMP_GET},
+
+ /*
+ * HSMP_SET_BOOST_LIMIT, num_args = 1, response_sz = 0
+ * input: args[0] = apic id[31:16] + boost limit value in MHz[15:0]
+ */
+ {1, 0, HSMP_SET},
+
+ /*
+ * HSMP_SET_BOOST_LIMIT_SOCKET, num_args = 1, response_sz = 0
+ * input: args[0] = boost limit value in MHz
+ */
+ {1, 0, HSMP_SET},
+
+ /*
+ * HSMP_GET_BOOST_LIMIT, num_args = 1, response_sz = 1
+ * input: args[0] = apic id
+ * output: args[0] = boost limit value in MHz
+ */
+ {1, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_PROC_HOT, num_args = 0, response_sz = 1
+ * output: args[0] = proc hot status
+ */
+ {0, 1, HSMP_GET},
+
+ /*
+ * HSMP_SET_XGMI_LINK_WIDTH, num_args = 1, response_sz = 0
+ * input: args[0] = min link width[15:8] + max link width[7:0]
+ */
+ {1, 0, HSMP_SET},
+
+ /*
+ * HSMP_SET_DF_PSTATE, num_args = 1, response_sz = 0
+ * input: args[0] = df pstate[7:0]
+ */
+ {1, 0, HSMP_SET},
+
+ /* HSMP_SET_AUTO_DF_PSTATE, num_args = 0, response_sz = 0 */
+ {0, 0, HSMP_SET},
+
+ /*
+ * HSMP_GET_FCLK_MCLK, num_args = 0, response_sz = 2
+ * output: args[0] = fclk in MHz, args[1] = mclk in MHz
+ */
+ {0, 2, HSMP_GET},
+
+ /*
+ * HSMP_GET_CCLK_THROTTLE_LIMIT, num_args = 0, response_sz = 1
+ * output: args[0] = core clock in MHz
+ */
+ {0, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_C0_PERCENT, num_args = 0, response_sz = 1
+ * output: args[0] = average c0 residency
+ */
+ {0, 1, HSMP_GET},
+
+ /*
+ * HSMP_SET_NBIO_DPM_LEVEL, num_args = 1, response_sz = 0
+ * input: args[0] = nbioid[23:16] + max dpm level[15:8] + min dpm level[7:0]
+ */
+ {1, 0, HSMP_SET},
+
+ /*
+ * HSMP_GET_NBIO_DPM_LEVEL, num_args = 1, response_sz = 1
+ * input: args[0] = nbioid[23:16]
+ * output: args[0] = max dpm level[15:8] + min dpm level[7:0]
+ */
+ {1, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_DDR_BANDWIDTH, num_args = 0, response_sz = 1
+ * output: args[0] = max bw in Gbps[31:20] + utilised bw in Gbps[19:8] +
+ * bw in percentage[7:0]
+ */
+ {0, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_TEMP_MONITOR, num_args = 0, response_sz = 1
+ * output: args[0] = temperature in degree celsius. [15:8] integer part +
+ * [7:5] fractional part
+ */
+ {0, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_DIMM_TEMP_RANGE, num_args = 1, response_sz = 1
+ * input: args[0] = DIMM address[7:0]
+ * output: args[0] = refresh rate[3] + temperature range[2:0]
+ */
+ {1, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_DIMM_POWER, num_args = 1, response_sz = 1
+ * input: args[0] = DIMM address[7:0]
+ * output: args[0] = DIMM power in mW[31:17] + update rate in ms[16:8] +
+ * DIMM address[7:0]
+ */
+ {1, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_DIMM_THERMAL, num_args = 1, response_sz = 1
+ * input: args[0] = DIMM address[7:0]
+ * output: args[0] = temperature in degree celsius[31:21] + update rate in ms[16:8] +
+ * DIMM address[7:0]
+ */
+ {1, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_SOCKET_FREQ_LIMIT, num_args = 0, response_sz = 1
+ * output: args[0] = frequency in MHz[31:16] + frequency source[15:0]
+ */
+ {0, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_CCLK_CORE_LIMIT, num_args = 1, response_sz = 1
+ * input: args[0] = apic id [31:0]
+ * output: args[0] = frequency in MHz[31:0]
+ */
+ {1, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_RAILS_SVI, num_args = 0, response_sz = 1
+ * output: args[0] = power in mW[31:0]
+ */
+ {0, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_SOCKET_FMAX_FMIN, num_args = 0, response_sz = 1
+ * output: args[0] = fmax in MHz[31:16] + fmin in MHz[15:0]
+ */
+ {0, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_IOLINK_BANDWITH, num_args = 1, response_sz = 1
+ * input: args[0] = link id[15:8] + bw type[2:0]
+ * output: args[0] = io bandwidth in Mbps[31:0]
+ */
+ {1, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_XGMI_BANDWITH, num_args = 1, response_sz = 1
+ * input: args[0] = link id[15:8] + bw type[2:0]
+ * output: args[0] = xgmi bandwidth in Mbps[31:0]
+ */
+ {1, 1, HSMP_GET},
+
+ /*
+ * HSMP_SET_GMI3_WIDTH, num_args = 1, response_sz = 0
+ * input: args[0] = min link width[15:8] + max link width[7:0]
+ */
+ {1, 0, HSMP_SET},
+
+ /*
+ * HSMP_SET_PCI_RATE, num_args = 1, response_sz = 1
+ * input: args[0] = link rate control value
+ * output: args[0] = previous link rate control value
+ */
+ {1, 1, HSMP_SET},
+
+ /*
+ * HSMP_SET_POWER_MODE, num_args = 1, response_sz = 0
+ * input: args[0] = power efficiency mode[2:0]
+ */
+ {1, 1, HSMP_SET_GET},
+
+ /*
+ * HSMP_SET_PSTATE_MAX_MIN, num_args = 1, response_sz = 0
+ * input: args[0] = min df pstate[15:8] + max df pstate[7:0]
+ */
+ {1, 0, HSMP_SET},
+
+ /*
+ * HSMP_GET_METRIC_TABLE_VER, num_args = 0, response_sz = 1
+ * output: args[0] = metrics table version
+ */
+ {0, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_METRIC_TABLE, num_args = 0, response_sz = 0
+ */
+ {0, 0, HSMP_GET},
+
+ /*
+ * HSMP_GET_METRIC_TABLE_DRAM_ADDR, num_args = 0, response_sz = 2
+ * output: args[0] = lower 32 bits of the address
+ * output: args[1] = upper 32 bits of the address
+ */
+ {0, 2, HSMP_GET},
+
+ /*
+ * HSMP_SET_XGMI_PSTATE_RANGE, num_args = 1, response_sz = 0
+ * input: args[0] = min xGMI p-state[15:8] + max xGMI p-state[7:0]
+ */
+ {1, 0, HSMP_SET},
+
+ /*
+ * HSMP_CPU_RAIL_ISO_FREQ_POLICY, num_args = 1, response_sz = 1
+ * input: args[0] = set/get policy[31] +
+ * disable/enable independent control[0]
+ * output: args[0] = current policy[0]
+ */
+ {1, 1, HSMP_SET_GET},
+
+ /*
+ * HSMP_DFC_ENABLE_CTRL, num_args = 1, response_sz = 1
+ * input: args[0] = set/get policy[31] + enable/disable DFC[0]
+ * output: args[0] = current policy[0]
+ */
+ {1, 1, HSMP_SET_GET},
+
+ /* RESERVED(0x29-0x2f) */
+ {0, 0, HSMP_RSVD},
+ {0, 0, HSMP_RSVD},
+ {0, 0, HSMP_RSVD},
+ {0, 0, HSMP_RSVD},
+ {0, 0, HSMP_RSVD},
+ {0, 0, HSMP_RSVD},
+ {0, 0, HSMP_RSVD},
+
+ /*
+ * HSMP_GET_RAPL_UNITS, num_args = 0, response_sz = 1
+ * output: args[0] = tu value[19:16] + esu value[12:8]
+ */
+ {0, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_RAPL_CORE_COUNTER, num_args = 1, response_sz = 2
+ * input: args[0] = apic id[15:0]
+ * output: args[0] = lower 32 bits of energy
+ * output: args[1] = upper 32 bits of energy
+ */
+ {1, 2, HSMP_GET},
+
+ /*
+ * HSMP_GET_RAPL_PACKAGE_COUNTER, num_args = 0, response_sz = 2
+ * output: args[0] = lower 32 bits of energy
+ * output: args[1] = upper 32 bits of energy
+ */
+ {0, 2, HSMP_GET},
+
+};
+
+/* Metrics table (supported only with proto version 6) */
+struct hsmp_metric_table {
+ __u32 accumulation_counter;
+
+ /* TEMPERATURE */
+ __u32 max_socket_temperature;
+ __u32 max_vr_temperature;
+ __u32 max_hbm_temperature;
+ __u64 max_socket_temperature_acc;
+ __u64 max_vr_temperature_acc;
+ __u64 max_hbm_temperature_acc;
+
+ /* POWER */
+ __u32 socket_power_limit;
+ __u32 max_socket_power_limit;
+ __u32 socket_power;
+
+ /* ENERGY */
+ __u64 timestamp;
+ __u64 socket_energy_acc;
+ __u64 ccd_energy_acc;
+ __u64 xcd_energy_acc;
+ __u64 aid_energy_acc;
+ __u64 hbm_energy_acc;
+
+ /* FREQUENCY */
+ __u32 cclk_frequency_limit;
+ __u32 gfxclk_frequency_limit;
+ __u32 fclk_frequency;
+ __u32 uclk_frequency;
+ __u32 socclk_frequency[4];
+ __u32 vclk_frequency[4];
+ __u32 dclk_frequency[4];
+ __u32 lclk_frequency[4];
+ __u64 gfxclk_frequency_acc[8];
+ __u64 cclk_frequency_acc[96];
+
+ /* FREQUENCY RANGE */
+ __u32 max_cclk_frequency;
+ __u32 min_cclk_frequency;
+ __u32 max_gfxclk_frequency;
+ __u32 min_gfxclk_frequency;
+ __u32 fclk_frequency_table[4];
+ __u32 uclk_frequency_table[4];
+ __u32 socclk_frequency_table[4];
+ __u32 vclk_frequency_table[4];
+ __u32 dclk_frequency_table[4];
+ __u32 lclk_frequency_table[4];
+ __u32 max_lclk_dpm_range;
+ __u32 min_lclk_dpm_range;
+
+ /* XGMI */
+ __u32 xgmi_width;
+ __u32 xgmi_bitrate;
+ __u64 xgmi_read_bandwidth_acc[8];
+ __u64 xgmi_write_bandwidth_acc[8];
+
+ /* ACTIVITY */
+ __u32 socket_c0_residency;
+ __u32 socket_gfx_busy;
+ __u32 dram_bandwidth_utilization;
+ __u64 socket_c0_residency_acc;
+ __u64 socket_gfx_busy_acc;
+ __u64 dram_bandwidth_acc;
+ __u32 max_dram_bandwidth;
+ __u64 dram_bandwidth_utilization_acc;
+ __u64 pcie_bandwidth_acc[4];
+
+ /* THROTTLERS */
+ __u32 prochot_residency_acc;
+ __u32 ppt_residency_acc;
+ __u32 socket_thm_residency_acc;
+ __u32 vr_thm_residency_acc;
+ __u32 hbm_thm_residency_acc;
+ __u32 spare;
+
+ /* New items at the end to maintain driver compatibility */
+ __u32 gfxclk_frequency[8];
+};
+
+/* Reset to default packing */
+#pragma pack()
+
+/* Define unique ioctl command for hsmp msgs using generic _IOWR */
+#define HSMP_BASE_IOCTL_NR 0xF8
+#define HSMP_IOCTL_CMD _IOWR(HSMP_BASE_IOCTL_NR, 0, struct hsmp_message)
+
+#endif /* _UAPI_ASM_X86_AMD_HSMP_H_ */
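
User space drives this header through the HSMP_IOCTL_CMD ioctl. A hedged sketch of issuing HSMP_TEST, assuming the amd_hsmp driver exposes a /dev/hsmp character device (the device path is an assumption, not part of this header):

/* Hedged sketch: send HSMP_TEST on socket 0 and read back args[0] + 1. */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <asm/amd_hsmp.h>

int main(void)
{
	struct hsmp_message msg;
	int fd = open("/dev/hsmp", O_RDWR);	/* assumed device node */

	if (fd < 0)
		return 1;

	memset(&msg, 0, sizeof(msg));
	msg.msg_id = HSMP_TEST;
	msg.num_args = 1;		/* per the table entry {1, 1, HSMP_GET} */
	msg.response_sz = 1;
	msg.args[0] = 41;
	msg.sock_ind = 0;

	if (ioctl(fd, HSMP_IOCTL_CMD, &msg) == 0)
		printf("HSMP_TEST returned %u\n", msg.args[0]);	/* expect 42 */

	close(fd);
	return 0;
}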
diff --git a/arch/x86/include/asm/auxvec.h b/arch/x86/include/uapi/asm/auxvec.h
index 1316b4c35425..6beb55bbefa4 100644
--- a/arch/x86/include/asm/auxvec.h
+++ b/arch/x86/include/uapi/asm/auxvec.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
#ifndef _ASM_X86_AUXVEC_H
#define _ASM_X86_AUXVEC_H
/*
@@ -9,4 +10,11 @@
#endif
#define AT_SYSINFO_EHDR 33
+/* entries in ARCH_DLINFO: */
+#if defined(CONFIG_IA32_EMULATION) || !defined(CONFIG_X86_64)
+# define AT_VECTOR_SIZE_ARCH 3
+#else /* else it's non-compat x86-64 */
+# define AT_VECTOR_SIZE_ARCH 2
+#endif
+
#endif /* _ASM_X86_AUXVEC_H */
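
AT_SYSINFO_EHDR ends up in every process's auxiliary vector and points at the vDSO's ELF header. A short sketch of reading it with glibc's getauxval(); the function and the AT_SYSINFO_EHDR symbol come from <sys/auxv.h> and <elf.h>, not from this header:

#include <stdio.h>
#include <sys/auxv.h>

int main(void)
{
	/* Base of the vDSO mapping, or 0 if the kernel did not provide one. */
	unsigned long vdso = getauxval(AT_SYSINFO_EHDR);

	printf("vDSO mapped at %#lx\n", vdso);
	return 0;
}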
diff --git a/arch/x86/include/asm/bitsperlong.h b/arch/x86/include/uapi/asm/bitsperlong.h
index b0ae1c4dc791..5d72c8458838 100644
--- a/arch/x86/include/asm/bitsperlong.h
+++ b/arch/x86/include/uapi/asm/bitsperlong.h
@@ -1,7 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
#ifndef __ASM_X86_BITSPERLONG_H
#define __ASM_X86_BITSPERLONG_H
-#ifdef __x86_64__
+#if defined(__x86_64__) && !defined(__ILP32__)
# define __BITS_PER_LONG 64
#else
# define __BITS_PER_LONG 32
diff --git a/arch/x86/include/uapi/asm/boot.h b/arch/x86/include/uapi/asm/boot.h
new file mode 100644
index 000000000000..88ffc5aee087
--- /dev/null
+++ b/arch/x86/include/uapi/asm/boot.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_ASM_X86_BOOT_H
+#define _UAPI_ASM_X86_BOOT_H
+
+/* Internal svga startup constants */
+#define NORMAL_VGA 0xffff /* 80x25 mode */
+#define EXTENDED_VGA 0xfffe /* 80x50 mode */
+#define ASK_VGA 0xfffd /* ask for it at bootup */
+
+
+#endif /* _UAPI_ASM_X86_BOOT_H */
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h
new file mode 100644
index 000000000000..dafbf581c515
--- /dev/null
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -0,0 +1,215 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_BOOTPARAM_H
+#define _ASM_X86_BOOTPARAM_H
+
+#include <asm/setup_data.h>
+
+/* ram_size flags */
+#define RAMDISK_IMAGE_START_MASK 0x07FF
+#define RAMDISK_PROMPT_FLAG 0x8000
+#define RAMDISK_LOAD_FLAG 0x4000
+
+/* loadflags */
+#define LOADED_HIGH (1<<0)
+#define KASLR_FLAG (1<<1)
+#define QUIET_FLAG (1<<5)
+#define KEEP_SEGMENTS (1<<6)
+#define CAN_USE_HEAP (1<<7)
+
+/* xloadflags */
+#define XLF_KERNEL_64 (1<<0)
+#define XLF_CAN_BE_LOADED_ABOVE_4G (1<<1)
+#define XLF_EFI_HANDOVER_32 (1<<2)
+#define XLF_EFI_HANDOVER_64 (1<<3)
+#define XLF_EFI_KEXEC (1<<4)
+#define XLF_5LEVEL (1<<5)
+#define XLF_5LEVEL_ENABLED (1<<6)
+#define XLF_MEM_ENCRYPTION (1<<7)
+
+#ifndef __ASSEMBLER__
+
+#include <linux/types.h>
+#include <linux/screen_info.h>
+#include <linux/apm_bios.h>
+#include <linux/edd.h>
+#include <asm/ist.h>
+#include <video/edid.h>
+
+struct setup_header {
+ __u8 setup_sects;
+ __u16 root_flags;
+ __u32 syssize;
+ __u16 ram_size;
+ __u16 vid_mode;
+ __u16 root_dev;
+ __u16 boot_flag;
+ __u16 jump;
+ __u32 header;
+ __u16 version;
+ __u32 realmode_swtch;
+ __u16 start_sys_seg;
+ __u16 kernel_version;
+ __u8 type_of_loader;
+ __u8 loadflags;
+ __u16 setup_move_size;
+ __u32 code32_start;
+ __u32 ramdisk_image;
+ __u32 ramdisk_size;
+ __u32 bootsect_kludge;
+ __u16 heap_end_ptr;
+ __u8 ext_loader_ver;
+ __u8 ext_loader_type;
+ __u32 cmd_line_ptr;
+ __u32 initrd_addr_max;
+ __u32 kernel_alignment;
+ __u8 relocatable_kernel;
+ __u8 min_alignment;
+ __u16 xloadflags;
+ __u32 cmdline_size;
+ __u32 hardware_subarch;
+ __u64 hardware_subarch_data;
+ __u32 payload_offset;
+ __u32 payload_length;
+ __u64 setup_data;
+ __u64 pref_address;
+ __u32 init_size;
+ __u32 handover_offset;
+ __u32 kernel_info_offset;
+} __attribute__((packed));
+
+struct sys_desc_table {
+ __u16 length;
+ __u8 table[14];
+};
+
+/* Gleaned from OFW's set-parameters in cpu/x86/pc/linux.fth */
+struct olpc_ofw_header {
+ __u32 ofw_magic; /* OFW signature */
+ __u32 ofw_version;
+ __u32 cif_handler; /* callback into OFW */
+ __u32 irq_desc_table;
+} __attribute__((packed));
+
+struct efi_info {
+ __u32 efi_loader_signature;
+ __u32 efi_systab;
+ __u32 efi_memdesc_size;
+ __u32 efi_memdesc_version;
+ __u32 efi_memmap;
+ __u32 efi_memmap_size;
+ __u32 efi_systab_hi;
+ __u32 efi_memmap_hi;
+};
+
+/*
+ * This is the maximum number of entries in struct boot_params::e820_table
+ * (the zeropage), which is part of the x86 boot protocol ABI:
+ */
+#define E820_MAX_ENTRIES_ZEROPAGE 128
+
+/*
+ * Smallest compatible version of jailhouse_setup_data required by this kernel.
+ */
+#define JAILHOUSE_SETUP_REQUIRED_VERSION 1
+
+/* The so-called "zeropage" */
+struct boot_params {
+ struct screen_info screen_info; /* 0x000 */
+ struct apm_bios_info apm_bios_info; /* 0x040 */
+ __u8 _pad2[4]; /* 0x054 */
+ __u64 tboot_addr; /* 0x058 */
+ struct ist_info ist_info; /* 0x060 */
+ __u64 acpi_rsdp_addr; /* 0x070 */
+ __u8 _pad3[8]; /* 0x078 */
+ __u8 hd0_info[16]; /* obsolete! */ /* 0x080 */
+ __u8 hd1_info[16]; /* obsolete! */ /* 0x090 */
+ struct sys_desc_table sys_desc_table; /* obsolete! */ /* 0x0a0 */
+ struct olpc_ofw_header olpc_ofw_header; /* 0x0b0 */
+ __u32 ext_ramdisk_image; /* 0x0c0 */
+ __u32 ext_ramdisk_size; /* 0x0c4 */
+ __u32 ext_cmd_line_ptr; /* 0x0c8 */
+ __u8 _pad4[112]; /* 0x0cc */
+ __u32 cc_blob_address; /* 0x13c */
+ struct edid_info edid_info; /* 0x140 */
+ struct efi_info efi_info; /* 0x1c0 */
+ __u32 alt_mem_k; /* 0x1e0 */
+ __u32 scratch; /* Scratch field! */ /* 0x1e4 */
+ __u8 e820_entries; /* 0x1e8 */
+ __u8 eddbuf_entries; /* 0x1e9 */
+ __u8 edd_mbr_sig_buf_entries; /* 0x1ea */
+ __u8 kbd_status; /* 0x1eb */
+ __u8 secure_boot; /* 0x1ec */
+ __u8 _pad5[2]; /* 0x1ed */
+ /*
+ * The sentinel is set to a nonzero value (0xff) in header.S.
+ *
+ * A bootloader is supposed to only take setup_header and put
+ * it into a clean boot_params buffer. If it turns out that
+ * it is clumsy or too generous with the buffer, it most
+ * probably will pick up the sentinel variable too. The fact
+ * that this variable then is still 0xff will let kernel
+ * know that some variables in boot_params are invalid and
+ * kernel should zero out certain portions of boot_params.
+ */
+ __u8 sentinel; /* 0x1ef */
+ __u8 _pad6[1]; /* 0x1f0 */
+ struct setup_header hdr; /* setup header */ /* 0x1f1 */
+ __u8 _pad7[0x290-0x1f1-sizeof(struct setup_header)];
+ __u32 edd_mbr_sig_buffer[EDD_MBR_SIG_MAX]; /* 0x290 */
+ struct boot_e820_entry e820_table[E820_MAX_ENTRIES_ZEROPAGE]; /* 0x2d0 */
+ __u8 _pad8[48]; /* 0xcd0 */
+ struct edd_info eddbuf[EDDMAXNR]; /* 0xd00 */
+ __u8 _pad9[276]; /* 0xeec */
+} __attribute__((packed));
+
+/**
+ * enum x86_hardware_subarch - x86 hardware subarchitecture
+ *
+ * The x86 hardware_subarch and hardware_subarch_data were added as of the x86
+ * boot protocol 2.07 to help distinguish and support custom x86 boot
+ * sequences. This enum represents accepted values for the x86
+ * hardware_subarch. Custom x86 boot sequences (not X86_SUBARCH_PC) do not
+ * have or simply *cannot* make use of natural stubs like BIOS or EFI, so the
+ * hardware_subarch can be used on the Linux entry path to revector to a
+ * subarchitecture stub when needed. This subarchitecture stub can be used to
+ * set up Linux boot parameters or for special care to account for nonstandard
+ * handling of page tables.
+ *
+ * These enums should only ever be used by x86 code, and the code that uses
+ * it should be well contained and compartmentalized.
+ *
+ * KVM and Xen HVM do not have a subarch as these are expected to follow
+ * standard x86 boot entries. If there is a genuine need for a "hypervisor"
+ * type, it should be considered separately in the future. Future guest types
+ * should seriously consider working with standard x86 boot stubs such as
+ * the BIOS or EFI boot stubs.
+ *
+ * WARNING: this enum is only used for legacy hacks, for platform features that
+ * are not easily enumerated or discoverable. You should not ever use
+ * this for new features.
+ *
+ * @X86_SUBARCH_PC: Should be used if the hardware is enumerable using standard
+ * PC mechanisms (PCI, ACPI) and doesn't need a special boot flow.
+ * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest, deprecated
+ * @X86_SUBARCH_XEN: Used for Xen guest types which follow the PV boot path,
+ * which start at asm startup_xen() entry point and later jump to the C
+ * xen_start_kernel() entry point. Both domU and dom0 type of guests are
+ * currently supported through this PV boot path.
+ * @X86_SUBARCH_INTEL_MID: Used for Intel MID (Mobile Internet Device) platform
+ * systems which do not have the PCI legacy interfaces.
+ * @X86_SUBARCH_CE4100: Used for Intel CE media processor (CE4100) SoC
+ * for settop boxes and media devices, the use of a subarch for CE4100
+ * is more of a hack...
+ */
+enum x86_hardware_subarch {
+ X86_SUBARCH_PC = 0,
+ X86_SUBARCH_LGUEST,
+ X86_SUBARCH_XEN,
+ X86_SUBARCH_INTEL_MID,
+ X86_SUBARCH_CE4100,
+ X86_NR_SUBARCHS,
+};
+
+#endif /* __ASSEMBLER__ */
+
+#endif /* _ASM_X86_BOOTPARAM_H */
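
The sentinel comment above amounts to a concrete contract for boot loaders: start from an all-zero boot_params, copy only the setup header out of the bzImage, and leave everything else (including the sentinel) zero so the kernel trusts the remaining fields. A hedged sketch of that contract; the 0x1f1 offset is the documented location of the setup header within the image, and kernel_image is assumed to point at a fully loaded bzImage:

#include <string.h>
#include <asm/bootparam.h>

/* Build a clean zeropage from a loaded bzImage, per the sentinel rules. */
static void build_boot_params(struct boot_params *bp, const void *kernel_image)
{
	/* Zero everything first so stale fields (and the sentinel) read as 0. */
	memset(bp, 0, sizeof(*bp));

	/* Copy only the setup header, which starts at offset 0x1f1. */
	memcpy(&bp->hdr, (const char *)kernel_image + 0x1f1,
	       sizeof(struct setup_header));
}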
diff --git a/arch/x86/include/asm/byteorder.h b/arch/x86/include/uapi/asm/byteorder.h
index b13a7a88f3eb..149143cab9ff 100644
--- a/arch/x86/include/asm/byteorder.h
+++ b/arch/x86/include/uapi/asm/byteorder.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
#ifndef _ASM_X86_BYTEORDER_H
#define _ASM_X86_BYTEORDER_H
diff --git a/arch/x86/include/uapi/asm/debugreg.h b/arch/x86/include/uapi/asm/debugreg.h
new file mode 100644
index 000000000000..41da492dfb01
--- /dev/null
+++ b/arch/x86/include/uapi/asm/debugreg.h
@@ -0,0 +1,101 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_ASM_X86_DEBUGREG_H
+#define _UAPI_ASM_X86_DEBUGREG_H
+
+
+/* Indicate the register numbers for a number of the specific
+ debug registers. Registers 0-3 contain the addresses we wish to trap on */
+#define DR_FIRSTADDR 0 /* u_debugreg[DR_FIRSTADDR] */
+#define DR_LASTADDR 3 /* u_debugreg[DR_LASTADDR] */
+
+#define DR_STATUS 6 /* u_debugreg[DR_STATUS] */
+#define DR_CONTROL 7 /* u_debugreg[DR_CONTROL] */
+
+/* Define a few things for the status register. We can use this to determine
+ which debugging register was responsible for the trap. The other bits
+ are either reserved or not of interest to us. */
+
+/*
+ * Define bits in DR6 which are set to 1 by default.
+ *
+ * This is also the DR6 architectural value following Power-up, Reset or INIT.
+ *
+ * Note, with the introduction of Bus Lock Detection (BLD) and Restricted
+ * Transactional Memory (RTM), the DR6 register has been modified:
+ *
+ * 1) BLD flag (bit 11) is no longer reserved to 1 if the CPU supports
+ * Bus Lock Detection. The assertion of a bus lock could clear it.
+ *
+ * 2) RTM flag (bit 16) is no longer reserved to 1 if the CPU supports
+ * restricted transactional memory. #DB occurred inside an RTM region
+ * could clear it.
+ *
+ * Apparently, DR6.BLD and DR6.RTM are active low bits.
+ *
+ * As a result, DR6_RESERVED is an incorrect name now, but it is kept for
+ * compatibility.
+ */
+#define DR6_RESERVED (0xFFFF0FF0)
+
+#define DR_TRAP0 (0x1) /* db0 */
+#define DR_TRAP1 (0x2) /* db1 */
+#define DR_TRAP2 (0x4) /* db2 */
+#define DR_TRAP3 (0x8) /* db3 */
+#define DR_TRAP_BITS (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)
+
+#define DR_BUS_LOCK (0x800) /* bus_lock */
+#define DR_STEP (0x4000) /* single-step */
+#define DR_SWITCH (0x8000) /* task switch */
+
+/* Now define a bunch of things for manipulating the control register.
+ The top two bytes of the control register consist of 4 fields of 4
+ bits - each field corresponds to one of the four debug registers,
+ and indicates what types of access we trap on, and how large the data
+ field is that we are looking at */
+
+#define DR_CONTROL_SHIFT 16 /* Skip this many bits in ctl register */
+#define DR_CONTROL_SIZE 4 /* 4 control bits per register */
+
+#define DR_RW_EXECUTE (0x0) /* Settings for the access types to trap on */
+#define DR_RW_WRITE (0x1)
+#define DR_RW_READ (0x3)
+
+#define DR_LEN_1 (0x0) /* Settings for data length to trap on */
+#define DR_LEN_2 (0x4)
+#define DR_LEN_4 (0xC)
+#define DR_LEN_8 (0x8)
+
+/* The low byte of the control register determines which registers are
+   enabled.  There are 4 fields of two bits.  One bit is "local", meaning
+   that the processor will reset the bit after a task switch, and the other
+   is global, meaning that we have to explicitly reset the bit.  With Linux,
+   you can use either one, since we explicitly zero the register when we
+   enter kernel mode. */
+
+#define DR_LOCAL_ENABLE_SHIFT 0 /* Extra shift to the local enable bit */
+#define DR_GLOBAL_ENABLE_SHIFT 1 /* Extra shift to the global enable bit */
+#define DR_LOCAL_ENABLE (0x1) /* Local enable for reg 0 */
+#define DR_GLOBAL_ENABLE (0x2) /* Global enable for reg 0 */
+#define DR_ENABLE_SIZE 2 /* 2 enable bits per register */
+
+#define DR_LOCAL_ENABLE_MASK (0x55) /* Set local bits for all 4 regs */
+#define DR_GLOBAL_ENABLE_MASK (0xAA) /* Set global bits for all 4 regs */
+
+/* The second byte of the control register has a few special things.
+ We can slow the instruction pipeline for instructions coming via the
+ gdt or the ldt if we want to. I am not sure why this is an advantage */
+
+#ifdef __i386__
+#define DR_CONTROL_RESERVED (0xFC00) /* Reserved by Intel */
+#else
+#define DR_CONTROL_RESERVED (0xFFFFFFFF0000FC00UL) /* Reserved */
+#endif
+
+#define DR_LOCAL_SLOWDOWN (0x100) /* Local slow the pipeline */
+#define DR_GLOBAL_SLOWDOWN (0x200) /* Global slow the pipeline */
+
+/*
+ * HW breakpoint additions
+ */
+
+#endif /* _UAPI_ASM_X86_DEBUGREG_H */
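
The DR_* macros compose into a DR7 value: each of the four debug registers owns a two-bit enable pair in the low byte and a four-bit access-type/length field starting at bit DR_CONTROL_SHIFT. A small sketch of assembling the DR7 bits for a local, 4-byte, write-access watchpoint in debug register 0 (illustrative only; actually installing it requires ptrace or perf, which is outside this header):

#include <asm/debugreg.h>

/* DR7 bits for a local, 4-byte, write-access breakpoint in DR0. */
static unsigned long dr7_for_dr0_write4(void)
{
	unsigned long dr7 = 0;

	/* Low byte: local enable bit for register 0. */
	dr7 |= DR_LOCAL_ENABLE << (0 * DR_ENABLE_SIZE);

	/* Type/length field: 4 control bits per register, starting at bit 16. */
	dr7 |= (unsigned long)(DR_RW_WRITE | DR_LEN_4)
		<< (DR_CONTROL_SHIFT + 0 * DR_CONTROL_SIZE);

	return dr7;
}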
diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h
new file mode 100644
index 000000000000..55bc66867156
--- /dev/null
+++ b/arch/x86/include/uapi/asm/e820.h
@@ -0,0 +1,82 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_ASM_X86_E820_H
+#define _UAPI_ASM_X86_E820_H
+#define E820MAP 0x2d0 /* our map */
+#define E820MAX 128 /* number of entries in E820MAP */
+
+/*
+ * Legacy E820 BIOS limits us to 128 (E820MAX) nodes due to the
+ * constrained space in the zeropage. If we have more nodes than
+ * that, and if we've booted off EFI firmware, then the EFI tables
+ * passed us from the EFI firmware can list more nodes. Size our
+ * internal memory map tables to have room for these additional
+ * nodes, based on up to three entries per node for which the
+ * kernel was built: MAX_NUMNODES == (1 << CONFIG_NODES_SHIFT),
+ * plus E820MAX, allowing space for the possible duplicate E820
+ * entries that might need room in the same arrays, prior to the
+ * call to sanitize_e820_map() to remove duplicates. The allowance
+ * of three memory map entries per node is "enough" entries for
+ * the initial hardware platform motivating this mechanism to make
+ * use of additional EFI map entries. Future platforms may want
+ * to allow more than three entries per node or otherwise refine
+ * this size.
+ */
+
+#ifndef __KERNEL__
+#define E820_X_MAX E820MAX
+#endif
+
+#define E820NR 0x1e8 /* # entries in E820MAP */
+
+#define E820_RAM 1
+#define E820_RESERVED 2
+#define E820_ACPI 3
+#define E820_NVS 4
+#define E820_UNUSABLE 5
+#define E820_PMEM 7
+
+/*
+ * This is a non-standardized way to represent ADR or NVDIMM regions that
+ * persist over a reboot. The kernel will ignore their special capabilities
+ * unless the CONFIG_X86_PMEM_LEGACY option is set.
+ *
+ * ( Note that older platforms also used 6 for the same type of memory,
+ * but newer versions switched to 12 as 6 was assigned differently. Some
+ * time they will learn... )
+ */
+#define E820_PRAM 12
+
+/*
+ * reserved RAM used by kernel itself
+ * if CONFIG_INTEL_TXT is enabled, memory of this type will be
+ * included in the S3 integrity calculation and so should not include
+ * any memory that BIOS might alter over the S3 transition
+ */
+#define E820_RESERVED_KERN 128
+
+#ifndef __ASSEMBLER__
+#include <linux/types.h>
+struct e820entry {
+ __u64 addr; /* start of memory segment */
+ __u64 size; /* size of memory segment */
+ __u32 type; /* type of memory segment */
+} __attribute__((packed));
+
+struct e820map {
+ __u32 nr_map;
+ struct e820entry map[E820_X_MAX];
+};
+
+#define ISA_START_ADDRESS 0xa0000
+#define ISA_END_ADDRESS 0x100000
+
+#define BIOS_BEGIN 0x000a0000
+#define BIOS_END 0x00100000
+
+#define BIOS_ROM_BASE 0xffe00000
+#define BIOS_ROM_END 0xffffffff
+
+#endif /* __ASSEMBLER__ */
+
+
+#endif /* _UAPI_ASM_X86_E820_H */
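
The e820entry/e820map pair is everything needed to walk a legacy E820 map. A small sketch that totals usable RAM, assuming the map was obtained elsewhere (for example from the zeropage):

#include <asm/e820.h>

/* Return the number of bytes covered by E820_RAM entries. */
static __u64 e820_total_ram(const struct e820map *map)
{
	__u64 total = 0;
	__u32 i;

	for (i = 0; i < map->nr_map && i < E820_X_MAX; i++)
		if (map->map[i].type == E820_RAM)
			total += map->map[i].size;

	return total;
}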
diff --git a/arch/x86/include/uapi/asm/elf.h b/arch/x86/include/uapi/asm/elf.h
new file mode 100644
index 000000000000..468e135fa285
--- /dev/null
+++ b/arch/x86/include/uapi/asm/elf.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_ASM_X86_ELF_H
+#define _UAPI_ASM_X86_ELF_H
+
+#include <linux/types.h>
+
+struct x86_xfeat_component {
+ __u32 type;
+ __u32 size;
+ __u32 offset;
+ __u32 flags;
+} __packed;
+
+_Static_assert(sizeof(struct x86_xfeat_component) % 4 == 0, "x86_xfeat_component is not aligned");
+
+#endif /* _UAPI_ASM_X86_ELF_H */
diff --git a/arch/x86/include/uapi/asm/hw_breakpoint.h b/arch/x86/include/uapi/asm/hw_breakpoint.h
new file mode 100644
index 000000000000..6789884c7701
--- /dev/null
+++ b/arch/x86/include/uapi/asm/hw_breakpoint.h
@@ -0,0 +1,2 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* */
diff --git a/arch/x86/include/uapi/asm/hwcap2.h b/arch/x86/include/uapi/asm/hwcap2.h
new file mode 100644
index 000000000000..054604aba9f0
--- /dev/null
+++ b/arch/x86/include/uapi/asm/hwcap2.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_HWCAP2_H
+#define _ASM_X86_HWCAP2_H
+
+#include <linux/const.h>
+
+/* MONITOR/MWAIT enabled in Ring 3 */
+#define HWCAP2_RING3MWAIT _BITUL(0)
+
+/* Kernel allows FSGSBASE instructions available in Ring 3 */
+#define HWCAP2_FSGSBASE _BITUL(1)
+
+#endif
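
These bits are reported to user space through AT_HWCAP2. A short sketch of checking them with glibc's getauxval(); the only assumption is that <sys/auxv.h> is available:

#include <stdio.h>
#include <sys/auxv.h>
#include <asm/hwcap2.h>

int main(void)
{
	unsigned long hwcap2 = getauxval(AT_HWCAP2);

	if (hwcap2 & HWCAP2_RING3MWAIT)
		puts("MONITOR/MWAIT usable from ring 3");
	if (hwcap2 & HWCAP2_FSGSBASE)
		puts("FSGSBASE instructions usable from ring 3");

	return 0;
}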
diff --git a/arch/x86/include/uapi/asm/ist.h b/arch/x86/include/uapi/asm/ist.h
new file mode 100644
index 000000000000..eac5b207939d
--- /dev/null
+++ b/arch/x86/include/uapi/asm/ist.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+/*
+ * Include file for the interface to IST BIOS
+ * Copyright 2002 Andy Grover <andrew.grover@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#ifndef _UAPI_ASM_X86_IST_H
+#define _UAPI_ASM_X86_IST_H
+
+
+
+#include <linux/types.h>
+
+struct ist_info {
+ __u32 signature;
+ __u32 command;
+ __u32 event;
+ __u32 perf_level;
+};
+
+#endif /* _UAPI_ASM_X86_IST_H */
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
new file mode 100644
index 000000000000..7ceff6583652
--- /dev/null
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -0,0 +1,1046 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_KVM_H
+#define _ASM_X86_KVM_H
+
+/*
+ * KVM x86 specific structures and definitions
+ *
+ */
+
+#include <linux/const.h>
+#include <linux/bits.h>
+#include <linux/types.h>
+#include <linux/ioctl.h>
+#include <linux/stddef.h>
+
+#define KVM_PIO_PAGE_OFFSET 1
+#define KVM_COALESCED_MMIO_PAGE_OFFSET 2
+#define KVM_DIRTY_LOG_PAGE_OFFSET 64
+
+#define DE_VECTOR 0
+#define DB_VECTOR 1
+#define BP_VECTOR 3
+#define OF_VECTOR 4
+#define BR_VECTOR 5
+#define UD_VECTOR 6
+#define NM_VECTOR 7
+#define DF_VECTOR 8
+#define TS_VECTOR 10
+#define NP_VECTOR 11
+#define SS_VECTOR 12
+#define GP_VECTOR 13
+#define PF_VECTOR 14
+#define MF_VECTOR 16
+#define AC_VECTOR 17
+#define MC_VECTOR 18
+#define XM_VECTOR 19
+#define VE_VECTOR 20
+#define CP_VECTOR 21
+
+#define HV_VECTOR 28
+#define VC_VECTOR 29
+#define SX_VECTOR 30
+
+/* Select x86 specific features in <linux/kvm.h> */
+#define __KVM_HAVE_PIT
+#define __KVM_HAVE_IOAPIC
+#define __KVM_HAVE_IRQ_LINE
+#define __KVM_HAVE_MSI
+#define __KVM_HAVE_USER_NMI
+#define __KVM_HAVE_MSIX
+#define __KVM_HAVE_MCE
+#define __KVM_HAVE_PIT_STATE2
+#define __KVM_HAVE_XEN_HVM
+#define __KVM_HAVE_VCPU_EVENTS
+#define __KVM_HAVE_DEBUGREGS
+#define __KVM_HAVE_XSAVE
+#define __KVM_HAVE_XCRS
+
+/* Architectural interrupt line count. */
+#define KVM_NR_INTERRUPTS 256
+
+/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */
+struct kvm_pic_state {
+ __u8 last_irr; /* edge detection */
+ __u8 irr; /* interrupt request register */
+ __u8 imr; /* interrupt mask register */
+ __u8 isr; /* interrupt service register */
+ __u8 priority_add; /* highest irq priority */
+ __u8 irq_base;
+ __u8 read_reg_select;
+ __u8 poll;
+ __u8 special_mask;
+ __u8 init_state;
+ __u8 auto_eoi;
+ __u8 rotate_on_auto_eoi;
+ __u8 special_fully_nested_mode;
+ __u8 init4; /* true if 4 byte init */
+ __u8 elcr; /* PIIX edge/trigger selection */
+ __u8 elcr_mask;
+};
+
+#define KVM_IOAPIC_NUM_PINS 24
+struct kvm_ioapic_state {
+ __u64 base_address;
+ __u32 ioregsel;
+ __u32 id;
+ __u32 irr;
+ __u32 pad;
+ union {
+ __u64 bits;
+ struct {
+ __u8 vector;
+ __u8 delivery_mode:3;
+ __u8 dest_mode:1;
+ __u8 delivery_status:1;
+ __u8 polarity:1;
+ __u8 remote_irr:1;
+ __u8 trig_mode:1;
+ __u8 mask:1;
+ __u8 reserve:7;
+ __u8 reserved[4];
+ __u8 dest_id;
+ } fields;
+ } redirtbl[KVM_IOAPIC_NUM_PINS];
+};
+
+#define KVM_IRQCHIP_PIC_MASTER 0
+#define KVM_IRQCHIP_PIC_SLAVE 1
+#define KVM_IRQCHIP_IOAPIC 2
+#define KVM_NR_IRQCHIPS 3
+
+#define KVM_RUN_X86_SMM (1 << 0)
+#define KVM_RUN_X86_BUS_LOCK (1 << 1)
+#define KVM_RUN_X86_GUEST_MODE (1 << 2)
+
+/* for KVM_GET_REGS and KVM_SET_REGS */
+struct kvm_regs {
+ /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
+ __u64 rax, rbx, rcx, rdx;
+ __u64 rsi, rdi, rsp, rbp;
+ __u64 r8, r9, r10, r11;
+ __u64 r12, r13, r14, r15;
+ __u64 rip, rflags;
+};
+
+/* for KVM_GET_LAPIC and KVM_SET_LAPIC */
+#define KVM_APIC_REG_SIZE 0x400
+struct kvm_lapic_state {
+ char regs[KVM_APIC_REG_SIZE];
+};
+
+struct kvm_segment {
+ __u64 base;
+ __u32 limit;
+ __u16 selector;
+ __u8 type;
+ __u8 present, dpl, db, s, l, g, avl;
+ __u8 unusable;
+ __u8 padding;
+};
+
+struct kvm_dtable {
+ __u64 base;
+ __u16 limit;
+ __u16 padding[3];
+};
+
+
+/* for KVM_GET_SREGS and KVM_SET_SREGS */
+struct kvm_sregs {
+ /* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */
+ struct kvm_segment cs, ds, es, fs, gs, ss;
+ struct kvm_segment tr, ldt;
+ struct kvm_dtable gdt, idt;
+ __u64 cr0, cr2, cr3, cr4, cr8;
+ __u64 efer;
+ __u64 apic_base;
+ __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
+};
+
+struct kvm_sregs2 {
+ /* out (KVM_GET_SREGS2) / in (KVM_SET_SREGS2) */
+ struct kvm_segment cs, ds, es, fs, gs, ss;
+ struct kvm_segment tr, ldt;
+ struct kvm_dtable gdt, idt;
+ __u64 cr0, cr2, cr3, cr4, cr8;
+ __u64 efer;
+ __u64 apic_base;
+ __u64 flags;
+ __u64 pdptrs[4];
+};
+#define KVM_SREGS2_FLAGS_PDPTRS_VALID 1
+
+/* for KVM_GET_FPU and KVM_SET_FPU */
+struct kvm_fpu {
+ __u8 fpr[8][16];
+ __u16 fcw;
+ __u16 fsw;
+ __u8 ftwx; /* in fxsave format */
+ __u8 pad1;
+ __u16 last_opcode;
+ __u64 last_ip;
+ __u64 last_dp;
+ __u8 xmm[16][16];
+ __u32 mxcsr;
+ __u32 pad2;
+};
+
+struct kvm_msr_entry {
+ __u32 index;
+ __u32 reserved;
+ __u64 data;
+};
+
+/* for KVM_GET_MSRS and KVM_SET_MSRS */
+struct kvm_msrs {
+ __u32 nmsrs; /* number of msrs in entries */
+ __u32 pad;
+
+ struct kvm_msr_entry entries[];
+};
+
+/* for KVM_GET_MSR_INDEX_LIST */
+struct kvm_msr_list {
+ __u32 nmsrs; /* number of msrs in entries */
+ __u32 indices[];
+};
+
+/* Maximum size of any access bitmap in bytes */
+#define KVM_MSR_FILTER_MAX_BITMAP_SIZE 0x600
+
+/* for KVM_X86_SET_MSR_FILTER */
+struct kvm_msr_filter_range {
+#define KVM_MSR_FILTER_READ (1 << 0)
+#define KVM_MSR_FILTER_WRITE (1 << 1)
+#define KVM_MSR_FILTER_RANGE_VALID_MASK (KVM_MSR_FILTER_READ | \
+ KVM_MSR_FILTER_WRITE)
+ __u32 flags;
+ __u32 nmsrs; /* number of msrs in bitmap */
+ __u32 base; /* MSR index the bitmap starts at */
+ __u8 *bitmap; /* a 1 bit allows the operations in flags, 0 denies */
+};
+
+#define KVM_MSR_FILTER_MAX_RANGES 16
+struct kvm_msr_filter {
+#ifndef __KERNEL__
+#define KVM_MSR_FILTER_DEFAULT_ALLOW (0 << 0)
+#endif
+#define KVM_MSR_FILTER_DEFAULT_DENY (1 << 0)
+#define KVM_MSR_FILTER_VALID_MASK (KVM_MSR_FILTER_DEFAULT_DENY)
+ __u32 flags;
+ struct kvm_msr_filter_range ranges[KVM_MSR_FILTER_MAX_RANGES];
+};
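
kvm_msr_filter is consumed by the KVM_X86_SET_MSR_FILTER vm ioctl declared in <linux/kvm.h>. A hedged sketch that denies guest MSR accesses by default but allows reads of a single index; the chosen index (0xC0000080, EFER) is purely illustrative:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Deny all guest MSR accesses except reads of one illustrative MSR. */
static int set_single_msr_read_filter(int vm_fd)
{
	static __u8 bitmap[1] = { 0x01 };	/* bit 0 allows the base MSR */
	struct kvm_msr_filter filter;

	memset(&filter, 0, sizeof(filter));
	filter.flags = KVM_MSR_FILTER_DEFAULT_DENY;

	filter.ranges[0].flags = KVM_MSR_FILTER_READ;
	filter.ranges[0].nmsrs = 1;
	filter.ranges[0].base = 0xC0000080;	/* EFER, illustrative */
	filter.ranges[0].bitmap = bitmap;

	return ioctl(vm_fd, KVM_X86_SET_MSR_FILTER, &filter);
}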
+
+struct kvm_cpuid_entry {
+ __u32 function;
+ __u32 eax;
+ __u32 ebx;
+ __u32 ecx;
+ __u32 edx;
+ __u32 padding;
+};
+
+/* for KVM_SET_CPUID */
+struct kvm_cpuid {
+ __u32 nent;
+ __u32 padding;
+ struct kvm_cpuid_entry entries[];
+};
+
+struct kvm_cpuid_entry2 {
+ __u32 function;
+ __u32 index;
+ __u32 flags;
+ __u32 eax;
+ __u32 ebx;
+ __u32 ecx;
+ __u32 edx;
+ __u32 padding[3];
+};
+
+#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX (1 << 0)
+#define KVM_CPUID_FLAG_STATEFUL_FUNC (1 << 1)
+#define KVM_CPUID_FLAG_STATE_READ_NEXT (1 << 2)
+
+/* for KVM_SET_CPUID2 */
+struct kvm_cpuid2 {
+ __u32 nent;
+ __u32 padding;
+ struct kvm_cpuid_entry2 entries[];
+};
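
Because kvm_cpuid2 ends in a flexible array, callers size the allocation for nent entries up front, then hand it to KVM_GET_SUPPORTED_CPUID (a /dev/kvm system ioctl from <linux/kvm.h>) or KVM_SET_CPUID2. A hedged sketch of the query side:

#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Query up to max_ent supported CPUID entries from the /dev/kvm fd. */
static struct kvm_cpuid2 *get_supported_cpuid(int kvm_fd, __u32 max_ent)
{
	struct kvm_cpuid2 *cpuid;

	cpuid = calloc(1, sizeof(*cpuid) + max_ent * sizeof(cpuid->entries[0]));
	if (!cpuid)
		return NULL;

	cpuid->nent = max_ent;
	if (ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid) < 0) {
		free(cpuid);
		return NULL;
	}

	return cpuid;	/* nent now holds the number of entries filled in */
}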
+
+/* for KVM_GET_PIT and KVM_SET_PIT */
+struct kvm_pit_channel_state {
+ __u32 count; /* can be 65536 */
+ __u16 latched_count;
+ __u8 count_latched;
+ __u8 status_latched;
+ __u8 status;
+ __u8 read_state;
+ __u8 write_state;
+ __u8 write_latch;
+ __u8 rw_mode;
+ __u8 mode;
+ __u8 bcd;
+ __u8 gate;
+ __s64 count_load_time;
+};
+
+struct kvm_debug_exit_arch {
+ __u32 exception;
+ __u32 pad;
+ __u64 pc;
+ __u64 dr6;
+ __u64 dr7;
+};
+
+#define KVM_GUESTDBG_USE_SW_BP 0x00010000
+#define KVM_GUESTDBG_USE_HW_BP 0x00020000
+#define KVM_GUESTDBG_INJECT_DB 0x00040000
+#define KVM_GUESTDBG_INJECT_BP 0x00080000
+#define KVM_GUESTDBG_BLOCKIRQ 0x00100000
+
+/* for KVM_SET_GUEST_DEBUG */
+struct kvm_guest_debug_arch {
+ __u64 debugreg[8];
+};
+
+struct kvm_pit_state {
+ struct kvm_pit_channel_state channels[3];
+};
+
+#define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001
+#define KVM_PIT_FLAGS_SPEAKER_DATA_ON 0x00000002
+
+struct kvm_pit_state2 {
+ struct kvm_pit_channel_state channels[3];
+ __u32 flags;
+ __u32 reserved[9];
+};
+
+struct kvm_reinject_control {
+ __u8 pit_reinject;
+ __u8 reserved[31];
+};
+
+/* When set in flags, include corresponding fields on KVM_SET_VCPU_EVENTS */
+#define KVM_VCPUEVENT_VALID_NMI_PENDING 0x00000001
+#define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002
+#define KVM_VCPUEVENT_VALID_SHADOW 0x00000004
+#define KVM_VCPUEVENT_VALID_SMM 0x00000008
+#define KVM_VCPUEVENT_VALID_PAYLOAD 0x00000010
+#define KVM_VCPUEVENT_VALID_TRIPLE_FAULT 0x00000020
+
+/* Interrupt shadow states */
+#define KVM_X86_SHADOW_INT_MOV_SS 0x01
+#define KVM_X86_SHADOW_INT_STI 0x02
+
+/* for KVM_GET/SET_VCPU_EVENTS */
+struct kvm_vcpu_events {
+ struct {
+ __u8 injected;
+ __u8 nr;
+ __u8 has_error_code;
+ __u8 pending;
+ __u32 error_code;
+ } exception;
+ struct {
+ __u8 injected;
+ __u8 nr;
+ __u8 soft;
+ __u8 shadow;
+ } interrupt;
+ struct {
+ __u8 injected;
+ __u8 pending;
+ __u8 masked;
+ __u8 pad;
+ } nmi;
+ __u32 sipi_vector;
+ __u32 flags;
+ struct {
+ __u8 smm;
+ __u8 pending;
+ __u8 smm_inside_nmi;
+ __u8 latched_init;
+ } smi;
+ struct {
+ __u8 pending;
+ } triple_fault;
+ __u8 reserved[26];
+ __u8 exception_has_payload;
+ __u64 exception_payload;
+};
+
+/* for KVM_GET/SET_DEBUGREGS */
+struct kvm_debugregs {
+ __u64 db[4];
+ __u64 dr6;
+ __u64 dr7;
+ __u64 flags;
+ __u64 reserved[9];
+};
+
+/* for KVM_CAP_XSAVE and KVM_CAP_XSAVE2 */
+struct kvm_xsave {
+ /*
+ * KVM_GET_XSAVE2 and KVM_SET_XSAVE write and read as many bytes
+ * as are returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2)
+ * respectively, when invoked on the vm file descriptor.
+ *
+ * The size value returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2)
+ * will always be at least 4096. Currently, it is only greater
+ * than 4096 if a dynamic feature has been enabled with
+ * ``arch_prctl()``, but this may change in the future.
+ *
+ * The offsets of the state save areas in struct kvm_xsave follow
+ * the contents of CPUID leaf 0xD on the host.
+ */
+ __u32 region[1024];
+ __u32 extra[];
+};
+
+#define KVM_MAX_XCRS 16
+
+struct kvm_xcr {
+ __u32 xcr;
+ __u32 reserved;
+ __u64 value;
+};
+
+struct kvm_xcrs {
+ __u32 nr_xcrs;
+ __u32 flags;
+ struct kvm_xcr xcrs[KVM_MAX_XCRS];
+ __u64 padding[16];
+};
+
+#define KVM_X86_REG_TYPE_MSR 2
+#define KVM_X86_REG_TYPE_KVM 3
+
+#define KVM_X86_KVM_REG_SIZE(reg) \
+({ \
+ reg == KVM_REG_GUEST_SSP ? KVM_REG_SIZE_U64 : 0; \
+})
+
+#define KVM_X86_REG_TYPE_SIZE(type, reg) \
+({ \
+ __u64 type_size = (__u64)type << 32; \
+ \
+ type_size |= type == KVM_X86_REG_TYPE_MSR ? KVM_REG_SIZE_U64 : \
+ type == KVM_X86_REG_TYPE_KVM ? KVM_X86_KVM_REG_SIZE(reg) : \
+ 0; \
+ type_size; \
+})
+
+#define KVM_X86_REG_ID(type, index) \
+ (KVM_REG_X86 | KVM_X86_REG_TYPE_SIZE(type, index) | index)
+
+#define KVM_X86_REG_MSR(index) \
+ KVM_X86_REG_ID(KVM_X86_REG_TYPE_MSR, index)
+#define KVM_X86_REG_KVM(index) \
+ KVM_X86_REG_ID(KVM_X86_REG_TYPE_KVM, index)
+
+/* KVM-defined registers starting from 0 */
+#define KVM_REG_GUEST_SSP 0
+
+#define KVM_SYNC_X86_REGS (1UL << 0)
+#define KVM_SYNC_X86_SREGS (1UL << 1)
+#define KVM_SYNC_X86_EVENTS (1UL << 2)
+
+#define KVM_SYNC_X86_VALID_FIELDS \
+ (KVM_SYNC_X86_REGS| \
+ KVM_SYNC_X86_SREGS| \
+ KVM_SYNC_X86_EVENTS)
+
+/* kvm_sync_regs struct included by kvm_run struct */
+struct kvm_sync_regs {
+ /* Members of this structure are potentially malicious.
+ * Care must be taken by code reading, esp. interpreting,
+ * data fields from them inside KVM to prevent TOCTOU and
+ * double-fetch types of vulnerabilities.
+ */
+ struct kvm_regs regs;
+ struct kvm_sregs sregs;
+ struct kvm_vcpu_events events;
+};
+
+#define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0)
+#define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1)
+#define KVM_X86_QUIRK_LAPIC_MMIO_HOLE (1 << 2)
+#define KVM_X86_QUIRK_OUT_7E_INC_RIP (1 << 3)
+#define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4)
+#define KVM_X86_QUIRK_FIX_HYPERCALL_INSN (1 << 5)
+#define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS (1 << 6)
+#define KVM_X86_QUIRK_SLOT_ZAP_ALL (1 << 7)
+#define KVM_X86_QUIRK_STUFF_FEATURE_MSRS (1 << 8)
+#define KVM_X86_QUIRK_IGNORE_GUEST_PAT (1 << 9)
+
+#define KVM_STATE_NESTED_FORMAT_VMX 0
+#define KVM_STATE_NESTED_FORMAT_SVM 1
+
+#define KVM_STATE_NESTED_GUEST_MODE 0x00000001
+#define KVM_STATE_NESTED_RUN_PENDING 0x00000002
+#define KVM_STATE_NESTED_EVMCS 0x00000004
+#define KVM_STATE_NESTED_MTF_PENDING 0x00000008
+#define KVM_STATE_NESTED_GIF_SET 0x00000100
+
+#define KVM_STATE_NESTED_SMM_GUEST_MODE 0x00000001
+#define KVM_STATE_NESTED_SMM_VMXON 0x00000002
+
+#define KVM_STATE_NESTED_VMX_VMCS_SIZE 0x1000
+
+#define KVM_STATE_NESTED_SVM_VMCB_SIZE 0x1000
+
+#define KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE 0x00000001
+
+/* vendor-independent attributes for system fd (group 0) */
+#define KVM_X86_GRP_SYSTEM 0
+# define KVM_X86_XCOMP_GUEST_SUPP 0
+
+/* vendor-specific groups and attributes for system fd */
+#define KVM_X86_GRP_SEV 1
+# define KVM_X86_SEV_VMSA_FEATURES 0
+# define KVM_X86_SNP_POLICY_BITS 1
+
+struct kvm_vmx_nested_state_data {
+ __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
+ __u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
+};
+
+struct kvm_vmx_nested_state_hdr {
+ __u64 vmxon_pa;
+ __u64 vmcs12_pa;
+
+ struct {
+ __u16 flags;
+ } smm;
+
+ __u16 pad;
+
+ __u32 flags;
+ __u64 preemption_timer_deadline;
+};
+
+struct kvm_svm_nested_state_data {
+ /* Save area only used if KVM_STATE_NESTED_RUN_PENDING. */
+ __u8 vmcb12[KVM_STATE_NESTED_SVM_VMCB_SIZE];
+};
+
+struct kvm_svm_nested_state_hdr {
+ __u64 vmcb_pa;
+};
+
+/* for KVM_CAP_NESTED_STATE */
+struct kvm_nested_state {
+ __u16 flags;
+ __u16 format;
+ __u32 size;
+
+ union {
+ struct kvm_vmx_nested_state_hdr vmx;
+ struct kvm_svm_nested_state_hdr svm;
+
+ /* Pad the header to 128 bytes. */
+ __u8 pad[120];
+ } hdr;
+
+ /*
+	 * Define data region as 0 bytes to preserve backwards compatibility
+	 * with the old definition of kvm_nested_state in order to avoid changing
+ * KVM_{GET,PUT}_NESTED_STATE ioctl values.
+ */
+ union {
+ __DECLARE_FLEX_ARRAY(struct kvm_vmx_nested_state_data, vmx);
+ __DECLARE_FLEX_ARRAY(struct kvm_svm_nested_state_data, svm);
+ } data;
+};
+
+/* for KVM_CAP_PMU_EVENT_FILTER */
+struct kvm_pmu_event_filter {
+ __u32 action;
+ __u32 nevents;
+ __u32 fixed_counter_bitmap;
+ __u32 flags;
+ __u32 pad[4];
+ __u64 events[];
+};
+
+#define KVM_PMU_EVENT_ALLOW 0
+#define KVM_PMU_EVENT_DENY 1
+
+#define KVM_PMU_EVENT_FLAG_MASKED_EVENTS _BITUL(0)
+#define KVM_PMU_EVENT_FLAGS_VALID_MASK (KVM_PMU_EVENT_FLAG_MASKED_EVENTS)
+
+/* for KVM_CAP_MCE */
+struct kvm_x86_mce {
+ __u64 status;
+ __u64 addr;
+ __u64 misc;
+ __u64 mcg_status;
+ __u8 bank;
+ __u8 pad1[7];
+ __u64 pad2[3];
+};
+
+/* for KVM_CAP_XEN_HVM */
+#define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR (1 << 0)
+#define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1)
+#define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2)
+#define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3)
+#define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4)
+#define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5)
+#define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG (1 << 6)
+#define KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE (1 << 7)
+#define KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA (1 << 8)
+
+#define KVM_XEN_MSR_MIN_INDEX 0x40000000u
+#define KVM_XEN_MSR_MAX_INDEX 0x4fffffffu
+
+struct kvm_xen_hvm_config {
+ __u32 flags;
+ __u32 msr;
+ __u64 blob_addr_32;
+ __u64 blob_addr_64;
+ __u8 blob_size_32;
+ __u8 blob_size_64;
+ __u8 pad2[30];
+};
+
+struct kvm_xen_hvm_attr {
+ __u16 type;
+ __u16 pad[3];
+ union {
+ __u8 long_mode;
+ __u8 vector;
+ __u8 runstate_update_flag;
+ union {
+ __u64 gfn;
+#define KVM_XEN_INVALID_GFN ((__u64)-1)
+ __u64 hva;
+ } shared_info;
+ struct {
+ __u32 send_port;
+ __u32 type; /* EVTCHNSTAT_ipi / EVTCHNSTAT_interdomain */
+ __u32 flags;
+#define KVM_XEN_EVTCHN_DEASSIGN (1 << 0)
+#define KVM_XEN_EVTCHN_UPDATE (1 << 1)
+#define KVM_XEN_EVTCHN_RESET (1 << 2)
+ /*
+ * Events sent by the guest are either looped back to
+ * the guest itself (potentially on a different port#)
+ * or signalled via an eventfd.
+ */
+ union {
+ struct {
+ __u32 port;
+ __u32 vcpu;
+ __u32 priority;
+ } port;
+ struct {
+ __u32 port; /* Zero for eventfd */
+ __s32 fd;
+ } eventfd;
+ __u32 padding[4];
+ } deliver;
+ } evtchn;
+ __u32 xen_version;
+ __u64 pad[8];
+ } u;
+};
+
+
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */
+#define KVM_XEN_ATTR_TYPE_LONG_MODE 0x0
+#define KVM_XEN_ATTR_TYPE_SHARED_INFO 0x1
+#define KVM_XEN_ATTR_TYPE_UPCALL_VECTOR 0x2
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */
+#define KVM_XEN_ATTR_TYPE_EVTCHN 0x3
+#define KVM_XEN_ATTR_TYPE_XEN_VERSION 0x4
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG */
+#define KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG 0x5
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA */
+#define KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA 0x6
+
+struct kvm_xen_vcpu_attr {
+ __u16 type;
+ __u16 pad[3];
+ union {
+ __u64 gpa;
+#define KVM_XEN_INVALID_GPA ((__u64)-1)
+ __u64 hva;
+ __u64 pad[8];
+ struct {
+ __u64 state;
+ __u64 state_entry_time;
+ __u64 time_running;
+ __u64 time_runnable;
+ __u64 time_blocked;
+ __u64 time_offline;
+ } runstate;
+ __u32 vcpu_id;
+ struct {
+ __u32 port;
+ __u32 priority;
+ __u64 expires_ns;
+ } timer;
+ __u8 vector;
+ } u;
+};
+
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */
+#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO 0x0
+#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO 0x1
+#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR 0x2
+#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT 0x3
+#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA 0x4
+#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST 0x5
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */
+#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID 0x6
+#define KVM_XEN_VCPU_ATTR_TYPE_TIMER 0x7
+#define KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR 0x8
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA */
+#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA 0x9
+
+/* Secure Encrypted Virtualization command */
+enum sev_cmd_id {
+ /* Guest initialization commands */
+ KVM_SEV_INIT = 0,
+ KVM_SEV_ES_INIT,
+ /* Guest launch commands */
+ KVM_SEV_LAUNCH_START,
+ KVM_SEV_LAUNCH_UPDATE_DATA,
+ KVM_SEV_LAUNCH_UPDATE_VMSA,
+ KVM_SEV_LAUNCH_SECRET,
+ KVM_SEV_LAUNCH_MEASURE,
+ KVM_SEV_LAUNCH_FINISH,
+ /* Guest migration commands (outgoing) */
+ KVM_SEV_SEND_START,
+ KVM_SEV_SEND_UPDATE_DATA,
+ KVM_SEV_SEND_UPDATE_VMSA,
+ KVM_SEV_SEND_FINISH,
+ /* Guest migration commands (incoming) */
+ KVM_SEV_RECEIVE_START,
+ KVM_SEV_RECEIVE_UPDATE_DATA,
+ KVM_SEV_RECEIVE_UPDATE_VMSA,
+ KVM_SEV_RECEIVE_FINISH,
+ /* Guest status and debug commands */
+ KVM_SEV_GUEST_STATUS,
+ KVM_SEV_DBG_DECRYPT,
+ KVM_SEV_DBG_ENCRYPT,
+ /* Guest certificates commands */
+ KVM_SEV_CERT_EXPORT,
+ /* Attestation report */
+ KVM_SEV_GET_ATTESTATION_REPORT,
+ /* Guest Migration Extension */
+ KVM_SEV_SEND_CANCEL,
+
+ /* Second time is the charm; improved versions of the above ioctls. */
+ KVM_SEV_INIT2,
+
+ /* SNP-specific commands */
+ KVM_SEV_SNP_LAUNCH_START = 100,
+ KVM_SEV_SNP_LAUNCH_UPDATE,
+ KVM_SEV_SNP_LAUNCH_FINISH,
+
+ KVM_SEV_NR_MAX,
+};
+
+struct kvm_sev_cmd {
+ __u32 id;
+ __u32 pad0;
+ __u64 data;
+ __u32 error;
+ __u32 sev_fd;
+};
+
+struct kvm_sev_init {
+ __u64 vmsa_features;
+ __u32 flags;
+ __u16 ghcb_version;
+ __u16 pad1;
+ __u32 pad2[8];
+};
+
+struct kvm_sev_launch_start {
+ __u32 handle;
+ __u32 policy;
+ __u64 dh_uaddr;
+ __u32 dh_len;
+ __u32 pad0;
+ __u64 session_uaddr;
+ __u32 session_len;
+ __u32 pad1;
+};
+
+struct kvm_sev_launch_update_data {
+ __u64 uaddr;
+ __u32 len;
+ __u32 pad0;
+};
+
+
+struct kvm_sev_launch_secret {
+ __u64 hdr_uaddr;
+ __u32 hdr_len;
+ __u32 pad0;
+ __u64 guest_uaddr;
+ __u32 guest_len;
+ __u32 pad1;
+ __u64 trans_uaddr;
+ __u32 trans_len;
+ __u32 pad2;
+};
+
+struct kvm_sev_launch_measure {
+ __u64 uaddr;
+ __u32 len;
+ __u32 pad0;
+};
+
+struct kvm_sev_guest_status {
+ __u32 handle;
+ __u32 policy;
+ __u32 state;
+};
+
+struct kvm_sev_dbg {
+ __u64 src_uaddr;
+ __u64 dst_uaddr;
+ __u32 len;
+ __u32 pad0;
+};
+
+struct kvm_sev_attestation_report {
+ __u8 mnonce[16];
+ __u64 uaddr;
+ __u32 len;
+ __u32 pad0;
+};
+
+struct kvm_sev_send_start {
+ __u32 policy;
+ __u32 pad0;
+ __u64 pdh_cert_uaddr;
+ __u32 pdh_cert_len;
+ __u32 pad1;
+ __u64 plat_certs_uaddr;
+ __u32 plat_certs_len;
+ __u32 pad2;
+ __u64 amd_certs_uaddr;
+ __u32 amd_certs_len;
+ __u32 pad3;
+ __u64 session_uaddr;
+ __u32 session_len;
+ __u32 pad4;
+};
+
+struct kvm_sev_send_update_data {
+ __u64 hdr_uaddr;
+ __u32 hdr_len;
+ __u32 pad0;
+ __u64 guest_uaddr;
+ __u32 guest_len;
+ __u32 pad1;
+ __u64 trans_uaddr;
+ __u32 trans_len;
+ __u32 pad2;
+};
+
+struct kvm_sev_receive_start {
+ __u32 handle;
+ __u32 policy;
+ __u64 pdh_uaddr;
+ __u32 pdh_len;
+ __u32 pad0;
+ __u64 session_uaddr;
+ __u32 session_len;
+ __u32 pad1;
+};
+
+struct kvm_sev_receive_update_data {
+ __u64 hdr_uaddr;
+ __u32 hdr_len;
+ __u32 pad0;
+ __u64 guest_uaddr;
+ __u32 guest_len;
+ __u32 pad1;
+ __u64 trans_uaddr;
+ __u32 trans_len;
+ __u32 pad2;
+};
+
+struct kvm_sev_snp_launch_start {
+ __u64 policy;
+ __u8 gosvw[16];
+ __u16 flags;
+ __u8 pad0[6];
+ __u64 pad1[4];
+};
+
+/* Kept in sync with firmware values for simplicity. */
+#define KVM_SEV_PAGE_TYPE_INVALID 0x0
+#define KVM_SEV_SNP_PAGE_TYPE_NORMAL 0x1
+#define KVM_SEV_SNP_PAGE_TYPE_ZERO 0x3
+#define KVM_SEV_SNP_PAGE_TYPE_UNMEASURED 0x4
+#define KVM_SEV_SNP_PAGE_TYPE_SECRETS 0x5
+#define KVM_SEV_SNP_PAGE_TYPE_CPUID 0x6
+
+struct kvm_sev_snp_launch_update {
+ __u64 gfn_start;
+ __u64 uaddr;
+ __u64 len;
+ __u8 type;
+ __u8 pad0;
+ __u16 flags;
+ __u32 pad1;
+ __u64 pad2[4];
+};
+
+#define KVM_SEV_SNP_ID_BLOCK_SIZE 96
+#define KVM_SEV_SNP_ID_AUTH_SIZE 4096
+#define KVM_SEV_SNP_FINISH_DATA_SIZE 32
+
+struct kvm_sev_snp_launch_finish {
+ __u64 id_block_uaddr;
+ __u64 id_auth_uaddr;
+ __u8 id_block_en;
+ __u8 auth_key_en;
+ __u8 vcek_disabled;
+ __u8 host_data[KVM_SEV_SNP_FINISH_DATA_SIZE];
+ __u8 pad0[3];
+ __u16 flags;
+ __u64 pad1[4];
+};
+
+#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0)
+#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1)
+
+struct kvm_hyperv_eventfd {
+ __u32 conn_id;
+ __s32 fd;
+ __u32 flags;
+ __u32 padding[3];
+};
+
+#define KVM_HYPERV_CONN_ID_MASK 0x00ffffff
+#define KVM_HYPERV_EVENTFD_DEASSIGN (1 << 0)
+
+/*
+ * Masked event layout.
+ * Bits Description
+ * ---- -----------
+ * 7:0 event select (low bits)
+ * 15:8 umask match
+ * 31:16 unused
+ * 35:32 event select (high bits)
+ * 54:36 unused
+ * 55 exclude bit
+ * 63:56 umask mask
+ */
+
+#define KVM_PMU_ENCODE_MASKED_ENTRY(event_select, mask, match, exclude) \
+ (((event_select) & 0xFFULL) | (((event_select) & 0XF00ULL) << 24) | \
+ (((mask) & 0xFFULL) << 56) | \
+ (((match) & 0xFFULL) << 8) | \
+ ((__u64)(!!(exclude)) << 55))
+
+#define KVM_PMU_MASKED_ENTRY_EVENT_SELECT \
+ (__GENMASK_ULL(7, 0) | __GENMASK_ULL(35, 32))
+#define KVM_PMU_MASKED_ENTRY_UMASK_MASK (__GENMASK_ULL(63, 56))
+#define KVM_PMU_MASKED_ENTRY_UMASK_MATCH (__GENMASK_ULL(15, 8))
+#define KVM_PMU_MASKED_ENTRY_EXCLUDE (_BITULL(55))
+#define KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT (56)
+
+/* for KVM_{GET,SET,HAS}_DEVICE_ATTR */
+#define KVM_VCPU_TSC_CTRL 0 /* control group for the timestamp counter (TSC) */
+#define KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */
+
+/* x86-specific KVM_EXIT_HYPERCALL flags. */
+#define KVM_EXIT_HYPERCALL_LONG_MODE _BITULL(0)
+
+#define KVM_X86_DEFAULT_VM 0
+#define KVM_X86_SW_PROTECTED_VM 1
+#define KVM_X86_SEV_VM 2
+#define KVM_X86_SEV_ES_VM 3
+#define KVM_X86_SNP_VM 4
+#define KVM_X86_TDX_VM 5
+
+/* Trust Domain eXtension sub-ioctl() commands. */
+enum kvm_tdx_cmd_id {
+ KVM_TDX_CAPABILITIES = 0,
+ KVM_TDX_INIT_VM,
+ KVM_TDX_INIT_VCPU,
+ KVM_TDX_INIT_MEM_REGION,
+ KVM_TDX_FINALIZE_VM,
+ KVM_TDX_GET_CPUID,
+
+ KVM_TDX_CMD_NR_MAX,
+};
+
+struct kvm_tdx_cmd {
+ /* enum kvm_tdx_cmd_id */
+ __u32 id;
+ /* flags for sub-command. If sub-command doesn't use this, set zero. */
+ __u32 flags;
+ /*
+ * data for each sub-command. An immediate value or a pointer to the
+ * actual data in the process's virtual address space. If the
+ * sub-command doesn't use it, set zero.
+ */
+ __u64 data;
+ /*
+ * Auxiliary error code. The sub-command may return TDX SEAMCALL
+ * status code in addition to -Exxx.
+ */
+ __u64 hw_error;
+};
+
+struct kvm_tdx_capabilities {
+ __u64 supported_attrs;
+ __u64 supported_xfam;
+
+ __u64 kernel_tdvmcallinfo_1_r11;
+ __u64 user_tdvmcallinfo_1_r11;
+ __u64 kernel_tdvmcallinfo_1_r12;
+ __u64 user_tdvmcallinfo_1_r12;
+
+ __u64 reserved[250];
+
+ /* Configurable CPUID bits for userspace */
+ struct kvm_cpuid2 cpuid;
+};
+
+struct kvm_tdx_init_vm {
+ __u64 attributes;
+ __u64 xfam;
+ __u64 mrconfigid[6]; /* sha384 digest */
+ __u64 mrowner[6]; /* sha384 digest */
+ __u64 mrownerconfig[6]; /* sha384 digest */
+
+ /* The total space for TD_PARAMS before the CPUIDs is 256 bytes */
+ __u64 reserved[12];
+
+ /*
+ * Call KVM_TDX_INIT_VM before vcpu creation, thus before
+ * KVM_SET_CPUID2.
+ * This configuration supersedes KVM_SET_CPUID2 for the vCPUs because the
+ * TDX module directly virtualizes those CPUIDs without the VMM. The user
+ * space VMM, e.g. qemu, should make KVM_SET_CPUID2 consistent with
+ * those values. If it doesn't, KVM may have a wrong idea of the guest's
+ * CPUID values, and may wrongly emulate CPUIDs or MSRs that the TDX
+ * module doesn't virtualize.
+ */
+ struct kvm_cpuid2 cpuid;
+};
+
+#define KVM_TDX_MEASURE_MEMORY_REGION _BITULL(0)
+
+struct kvm_tdx_init_mem_region {
+ __u64 source_addr;
+ __u64 gpa;
+ __u64 nr_pages;
+};
+
+#endif /* _ASM_X86_KVM_H */
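
The masked-event encoding and the kvm_pmu_event_filter layout defined above can be combined from userspace. A minimal sketch, not part of the diff: it builds a one-entry allow filter with KVM_PMU_ENCODE_MASKED_ENTRY and hands it to the VM via KVM_SET_PMU_EVENT_FILTER, which lives in <linux/kvm.h> rather than in this header; the event select 0xc0 and the zero umask mask/match (match any unit mask) are illustrative values only.

    #include <linux/kvm.h>          /* pulls in the <asm/kvm.h> added above */
    #include <stdlib.h>
    #include <sys/ioctl.h>

    /* Hypothetical helper: vm_fd is an already-created KVM VM file descriptor. */
    static int install_pmu_filter(int vm_fd)
    {
            size_t sz = sizeof(struct kvm_pmu_event_filter) + sizeof(__u64);
            struct kvm_pmu_event_filter *f = calloc(1, sz);
            int ret;

            if (!f)
                    return -1;
            f->action  = KVM_PMU_EVENT_ALLOW;
            f->flags   = KVM_PMU_EVENT_FLAG_MASKED_EVENTS;
            f->nevents = 1;
            /* Event select 0xc0, umask mask/match 0 => any unit mask matches. */
            f->events[0] = KVM_PMU_ENCODE_MASKED_ENTRY(0xc0, 0x00, 0x00, 0);
            ret = ioctl(vm_fd, KVM_SET_PMU_EVENT_FILTER, f);
            free(f);
            return ret;
    }
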
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
new file mode 100644
index 000000000000..a1efa7907a0b
--- /dev/null
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -0,0 +1,152 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_ASM_X86_KVM_PARA_H
+#define _UAPI_ASM_X86_KVM_PARA_H
+
+#include <linux/types.h>
+
+/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It
+ * should be used to determine that a VM is running under KVM.
+ */
+#define KVM_CPUID_SIGNATURE 0x40000000
+#define KVM_SIGNATURE "KVMKVMKVM\0\0\0"
+
+/* This CPUID returns two feature bitmaps in eax, edx. Before enabling
+ * a particular paravirtualization, the appropriate feature bit should
+ * be checked in eax. The performance hint feature bit should be checked
+ * in edx.
+ */
+#define KVM_CPUID_FEATURES 0x40000001
+#define KVM_FEATURE_CLOCKSOURCE 0
+#define KVM_FEATURE_NOP_IO_DELAY 1
+#define KVM_FEATURE_MMU_OP 2
+/* This indicates that the new set of kvmclock msrs
+ * are available. The use of 0x11 and 0x12 is deprecated
+ */
+#define KVM_FEATURE_CLOCKSOURCE2 3
+#define KVM_FEATURE_ASYNC_PF 4
+#define KVM_FEATURE_STEAL_TIME 5
+#define KVM_FEATURE_PV_EOI 6
+#define KVM_FEATURE_PV_UNHALT 7
+#define KVM_FEATURE_PV_TLB_FLUSH 9
+#define KVM_FEATURE_ASYNC_PF_VMEXIT 10
+#define KVM_FEATURE_PV_SEND_IPI 11
+#define KVM_FEATURE_POLL_CONTROL 12
+#define KVM_FEATURE_PV_SCHED_YIELD 13
+#define KVM_FEATURE_ASYNC_PF_INT 14
+#define KVM_FEATURE_MSI_EXT_DEST_ID 15
+#define KVM_FEATURE_HC_MAP_GPA_RANGE 16
+#define KVM_FEATURE_MIGRATION_CONTROL 17
+
+#define KVM_HINTS_REALTIME 0
+
+/* The last 8 bits are used to indicate how to interpret the flags field
+ * in pvclock structure. If no bits are set, all flags are ignored.
+ */
+#define KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 24
+
+#define MSR_KVM_WALL_CLOCK 0x11
+#define MSR_KVM_SYSTEM_TIME 0x12
+
+#define KVM_MSR_ENABLED 1
+/* Custom MSRs fall in the range 0x4b564d00-0x4b564dff */
+#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00
+#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
+#define MSR_KVM_ASYNC_PF_EN 0x4b564d02
+#define MSR_KVM_STEAL_TIME 0x4b564d03
+#define MSR_KVM_PV_EOI_EN 0x4b564d04
+#define MSR_KVM_POLL_CONTROL 0x4b564d05
+#define MSR_KVM_ASYNC_PF_INT 0x4b564d06
+#define MSR_KVM_ASYNC_PF_ACK 0x4b564d07
+#define MSR_KVM_MIGRATION_CONTROL 0x4b564d08
+
+struct kvm_steal_time {
+ __u64 steal;
+ __u32 version;
+ __u32 flags;
+ __u8 preempted;
+ __u8 u8_pad[3];
+ __u32 pad[11];
+};
+
+#define KVM_VCPU_PREEMPTED (1 << 0)
+#define KVM_VCPU_FLUSH_TLB (1 << 1)
+
+#define KVM_CLOCK_PAIRING_WALLCLOCK 0
+struct kvm_clock_pairing {
+ __s64 sec;
+ __s64 nsec;
+ __u64 tsc;
+ __u32 flags;
+ __u32 pad[9];
+};
+
+#define KVM_STEAL_ALIGNMENT_BITS 5
+#define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1)))
+#define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1)
+
+#define KVM_MAX_MMU_OP_BATCH 32
+
+#define KVM_ASYNC_PF_ENABLED (1 << 0)
+#define KVM_ASYNC_PF_SEND_ALWAYS (1 << 1)
+#define KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT (1 << 2)
+#define KVM_ASYNC_PF_DELIVERY_AS_INT (1 << 3)
+
+/* MSR_KVM_ASYNC_PF_INT */
+#define KVM_ASYNC_PF_VEC_MASK __GENMASK(7, 0)
+
+/* MSR_KVM_MIGRATION_CONTROL */
+#define KVM_MIGRATION_READY (1 << 0)
+
+/* KVM_HC_MAP_GPA_RANGE */
+#define KVM_MAP_GPA_RANGE_PAGE_SZ_4K 0
+#define KVM_MAP_GPA_RANGE_PAGE_SZ_2M (1 << 0)
+#define KVM_MAP_GPA_RANGE_PAGE_SZ_1G (1 << 1)
+#define KVM_MAP_GPA_RANGE_ENC_STAT(n) (n << 4)
+#define KVM_MAP_GPA_RANGE_ENCRYPTED KVM_MAP_GPA_RANGE_ENC_STAT(1)
+#define KVM_MAP_GPA_RANGE_DECRYPTED KVM_MAP_GPA_RANGE_ENC_STAT(0)
+
+/* Operations for KVM_HC_MMU_OP */
+#define KVM_MMU_OP_WRITE_PTE 1
+#define KVM_MMU_OP_FLUSH_TLB 2
+#define KVM_MMU_OP_RELEASE_PT 3
+
+/* Payload for KVM_HC_MMU_OP */
+struct kvm_mmu_op_header {
+ __u32 op;
+ __u32 pad;
+};
+
+struct kvm_mmu_op_write_pte {
+ struct kvm_mmu_op_header header;
+ __u64 pte_phys;
+ __u64 pte_val;
+};
+
+struct kvm_mmu_op_flush_tlb {
+ struct kvm_mmu_op_header header;
+};
+
+struct kvm_mmu_op_release_pt {
+ struct kvm_mmu_op_header header;
+ __u64 pt_phys;
+};
+
+#define KVM_PV_REASON_PAGE_NOT_PRESENT 1
+#define KVM_PV_REASON_PAGE_READY 2
+
+struct kvm_vcpu_pv_apf_data {
+ /* Used for 'page not present' events delivered via #PF */
+ __u32 flags;
+
+ /* Used for 'page ready' events delivered via interrupt notification */
+ __u32 token;
+
+ __u8 pad[56];
+};
+
+#define KVM_PV_EOI_BIT 0
+#define KVM_PV_EOI_MASK (0x1 << KVM_PV_EOI_BIT)
+#define KVM_PV_EOI_ENABLED KVM_PV_EOI_MASK
+#define KVM_PV_EOI_DISABLED 0x0
+
+#endif /* _UAPI_ASM_X86_KVM_PARA_H */
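
As the KVM_CPUID_SIGNATURE comment above describes, a guest detects KVM by reading CPUID leaf 0x40000000 and comparing EBX/ECX/EDX against KVM_SIGNATURE. A minimal sketch, not part of the diff, for GCC/Clang on x86; production code should first confirm the hypervisor bit (CPUID.1:ECX[31]) before touching hypervisor leaves.

    #include <cpuid.h>              /* __cpuid() built-in wrapper */
    #include <string.h>
    #include <asm/kvm_para.h>       /* KVM_CPUID_SIGNATURE, KVM_SIGNATURE */

    static int running_on_kvm(void)
    {
            unsigned int eax, ebx, ecx, edx;
            char sig[13];

            __cpuid(KVM_CPUID_SIGNATURE, eax, ebx, ecx, edx);
            memcpy(sig + 0, &ebx, 4);
            memcpy(sig + 4, &ecx, 4);
            memcpy(sig + 8, &edx, 4);
            sig[12] = '\0';
            return memcmp(sig, KVM_SIGNATURE, 12) == 0;
    }
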
diff --git a/arch/x86/include/uapi/asm/kvm_perf.h b/arch/x86/include/uapi/asm/kvm_perf.h
new file mode 100644
index 000000000000..125cf5cdf6c5
--- /dev/null
+++ b/arch/x86/include/uapi/asm/kvm_perf.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_KVM_PERF_H
+#define _ASM_X86_KVM_PERF_H
+
+#include <asm/svm.h>
+#include <asm/vmx.h>
+#include <asm/kvm.h>
+
+#define DECODE_STR_LEN 20
+
+#define VCPU_ID "vcpu_id"
+
+#define KVM_ENTRY_TRACE "kvm:kvm_entry"
+#define KVM_EXIT_TRACE "kvm:kvm_exit"
+#define KVM_EXIT_REASON "exit_reason"
+
+#endif /* _ASM_X86_KVM_PERF_H */
diff --git a/arch/x86/include/asm/ldt.h b/arch/x86/include/uapi/asm/ldt.h
index 46727eb37bfe..a82c039d8e6a 100644
--- a/arch/x86/include/asm/ldt.h
+++ b/arch/x86/include/uapi/asm/ldt.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
/*
* ldt.h
*
@@ -11,7 +12,7 @@
/* The size of each LDT entry. */
#define LDT_ENTRY_SIZE 8
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/*
* Note on 64bit base and limit is ignored and you cannot set DS/ES/CS
* not to the default values if you still want to do syscalls. This
@@ -28,6 +29,13 @@ struct user_desc {
unsigned int seg_not_present:1;
unsigned int useable:1;
#ifdef __x86_64__
+ /*
+ * Because this bit is not present in 32-bit user code, user
+ * programs can pass uninitialized values here. Therefore, in
+ * any context in which a user_desc comes from a 32-bit program,
+ * the kernel must act as though lm == 0, regardless of the
+ * actual value.
+ */
unsigned int lm:1;
#endif
};
@@ -36,5 +44,5 @@ struct user_desc {
#define MODIFY_LDT_CONTENTS_STACK 1
#define MODIFY_LDT_CONTENTS_CODE 2
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* _ASM_X86_LDT_H */
diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h
new file mode 100644
index 000000000000..cb6b48a7c22b
--- /dev/null
+++ b/arch/x86/include/uapi/asm/mce.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_ASM_X86_MCE_H
+#define _UAPI_ASM_X86_MCE_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+/*
+ * Fields are zero when not available. Also, this struct is shared with
+ * userspace mcelog and thus must keep existing fields at current offsets.
+ * Only add new, shared fields to the end of the structure.
+ * Do not add vendor-specific fields.
+ */
+struct mce {
+ __u64 status; /* Bank's MCi_STATUS MSR */
+ __u64 misc; /* Bank's MCi_MISC MSR */
+ __u64 addr; /* Bank's MCi_ADDR MSR */
+ __u64 mcgstatus; /* Machine Check Global Status MSR */
+ __u64 ip; /* Instruction Pointer when the error happened */
+ __u64 tsc; /* CPU time stamp counter */
+ __u64 time; /* Wall time_t when error was detected */
+ __u8 cpuvendor; /* Kernel's X86_VENDOR enum */
+ __u8 inject_flags; /* Software inject flags */
+ __u8 severity; /* Error severity */
+ __u8 pad;
+ __u32 cpuid; /* CPUID 1 EAX */
+ __u8 cs; /* Code segment */
+ __u8 bank; /* Machine check bank reporting the error */
+ __u8 cpu; /* CPU number; obsoleted by extcpu */
+ __u8 finished; /* Entry is valid */
+ __u32 extcpu; /* Linux CPU number that detected the error */
+ __u32 socketid; /* CPU socket ID */
+ __u32 apicid; /* CPU initial APIC ID */
+ __u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */
+ __u64 synd; /* MCA_SYND MSR: only valid on SMCA systems */
+ __u64 ipid; /* MCA_IPID MSR: only valid on SMCA systems */
+ __u64 ppin; /* Protected Processor Inventory Number */
+ __u32 microcode; /* Microcode revision */
+ __u64 kflags; /* Internal kernel use */
+};
+
+#define MCE_GET_RECORD_LEN _IOR('M', 1, int)
+#define MCE_GET_LOG_LEN _IOR('M', 2, int)
+#define MCE_GETCLEAR_FLAGS _IOR('M', 3, int)
+
+#endif /* _UAPI_ASM_X86_MCE_H */
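
The three ioctls above are served by the /dev/mcelog character device. A hedged sketch, not part of the diff, of how an mcelog-style reader could query the record and buffer sizes before reading events; error handling is trimmed and root privileges are assumed.

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <asm/mce.h>            /* struct mce and the MCE_* ioctls above */

    int main(void)
    {
            int record_len = 0, log_len = 0;
            int fd = open("/dev/mcelog", O_RDONLY);

            if (fd < 0)
                    return 1;
            /* Both ioctls report their result through the int argument. */
            ioctl(fd, MCE_GET_RECORD_LEN, &record_len);
            ioctl(fd, MCE_GET_LOG_LEN, &log_len);
            printf("record size %d bytes, buffer holds %d records\n",
                   record_len, log_len);
            close(fd);
            return 0;
    }
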
diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h
new file mode 100644
index 000000000000..ac1e6277212b
--- /dev/null
+++ b/arch/x86/include/uapi/asm/mman.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_MMAN_H
+#define _ASM_X86_MMAN_H
+
+#define MAP_32BIT 0x40 /* only give out 32bit addresses */
+#define MAP_ABOVE4G 0x80 /* only map above 4GB */
+
+#include <asm-generic/mman.h>
+
+#endif /* _ASM_X86_MMAN_H */
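
MAP_32BIT (and, on kernels that support it, MAP_ABOVE4G) is simply or-ed into the mmap() flags argument. A quick sketch, not part of the diff, requesting an anonymous mapping that fits in the low 4 GiB:

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <sys/mman.h>           /* exposes the x86 MAP_32BIT flag */

    int main(void)
    {
            void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS | MAP_32BIT, -1, 0);

            if (p == MAP_FAILED)
                    return 1;
            printf("mapped at %p\n", p);    /* address fits in 32 bits */
            munmap(p, 4096);
            return 0;
    }
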
diff --git a/arch/x86/include/uapi/asm/msgbuf.h b/arch/x86/include/uapi/asm/msgbuf.h
new file mode 100644
index 000000000000..ac83e25bbf37
--- /dev/null
+++ b/arch/x86/include/uapi/asm/msgbuf.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef __ASM_X64_MSGBUF_H
+#define __ASM_X64_MSGBUF_H
+
+#if !defined(__x86_64__) || !defined(__ILP32__)
+#include <asm-generic/msgbuf.h>
+#else
+
+#include <asm/ipcbuf.h>
+
+/*
+ * The msqid64_ds structure for x86 architecture with x32 ABI.
+ *
+ * On x86-32 and x86-64 we can just use the generic definition, but
+ * x32 uses the same binary layout as x86_64, which is different
+ * from other 32-bit architectures.
+ */
+
+struct msqid64_ds {
+ struct ipc64_perm msg_perm;
+ __kernel_long_t msg_stime; /* last msgsnd time */
+ __kernel_long_t msg_rtime; /* last msgrcv time */
+ __kernel_long_t msg_ctime; /* last change time */
+ __kernel_ulong_t msg_cbytes; /* current number of bytes on queue */
+ __kernel_ulong_t msg_qnum; /* number of messages in queue */
+ __kernel_ulong_t msg_qbytes; /* max number of bytes on queue */
+ __kernel_pid_t msg_lspid; /* pid of last msgsnd */
+ __kernel_pid_t msg_lrpid; /* last receive pid */
+ __kernel_ulong_t __unused4;
+ __kernel_ulong_t __unused5;
+};
+
+#endif
+
+#endif /* __ASM_X64_MSGBUF_H */
diff --git a/arch/x86/include/uapi/asm/msr.h b/arch/x86/include/uapi/asm/msr.h
new file mode 100644
index 000000000000..4b8917ca28fe
--- /dev/null
+++ b/arch/x86/include/uapi/asm/msr.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_ASM_X86_MSR_H
+#define _UAPI_ASM_X86_MSR_H
+
+#ifndef __ASSEMBLER__
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#define X86_IOC_RDMSR_REGS _IOWR('c', 0xA0, __u32[8])
+#define X86_IOC_WRMSR_REGS _IOWR('c', 0xA1, __u32[8])
+
+#endif /* __ASSEMBLER__ */
+#endif /* _UAPI_ASM_X86_MSR_H */
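
These ioctls operate on the per-CPU /dev/cpu/N/msr device nodes provided by the msr driver. A sketch, not part of the diff, with one explicit assumption: the __u32[8] array is taken to follow the EAX, ECX, EDX, EBX, ESP, EBP, ESI, EDI ordering used by the kernel's rdmsr_safe_regs() helper, so regs[1] carries the MSR index and regs[0]/regs[2] return the low/high halves.

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <asm/msr.h>            /* X86_IOC_RDMSR_REGS from the header above */

    int main(void)
    {
            __u32 regs[8] = { 0 };
            int fd = open("/dev/cpu/0/msr", O_RDONLY);

            if (fd < 0)
                    return 1;
            regs[1] = 0x10;         /* assumed ECX slot: MSR index (illustrative) */
            if (ioctl(fd, X86_IOC_RDMSR_REGS, regs) == 0)
                    printf("MSR 0x10 = %#llx\n",
                           ((unsigned long long)regs[2] << 32) | regs[0]);
            close(fd);
            return 0;
    }
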
diff --git a/arch/x86/include/uapi/asm/mtrr.h b/arch/x86/include/uapi/asm/mtrr.h
new file mode 100644
index 000000000000..3a8a8eb8ac3a
--- /dev/null
+++ b/arch/x86/include/uapi/asm/mtrr.h
@@ -0,0 +1,116 @@
+/* SPDX-License-Identifier: LGPL-2.0+ WITH Linux-syscall-note */
+/* Generic MTRR (Memory Type Range Register) ioctls.
+
+ Copyright (C) 1997-1999 Richard Gooch
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Library General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Library General Public License for more details.
+
+ You should have received a copy of the GNU Library General Public
+ License along with this library; if not, write to the Free
+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ Richard Gooch may be reached by email at rgooch@atnf.csiro.au
+ The postal address is:
+ Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia.
+*/
+#ifndef _UAPI_ASM_X86_MTRR_H
+#define _UAPI_ASM_X86_MTRR_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+#include <linux/errno.h>
+
+#define MTRR_IOCTL_BASE 'M'
+
+/* Warning: the field order of this structure differs between i386
+ and x86-64. The 32-bit emulation code takes care of that, but you
+ need to use this layout for 64-bit, otherwise your X server
+ will break. */
+
+#ifdef __i386__
+struct mtrr_sentry {
+ unsigned long base; /* Base address */
+ unsigned int size; /* Size of region */
+ unsigned int type; /* Type of region */
+};
+
+struct mtrr_gentry {
+ unsigned int regnum; /* Register number */
+ unsigned long base; /* Base address */
+ unsigned int size; /* Size of region */
+ unsigned int type; /* Type of region */
+};
+
+#else /* __i386__ */
+
+struct mtrr_sentry {
+ __u64 base; /* Base address */
+ __u32 size; /* Size of region */
+ __u32 type; /* Type of region */
+};
+
+struct mtrr_gentry {
+ __u64 base; /* Base address */
+ __u32 size; /* Size of region */
+ __u32 regnum; /* Register number */
+ __u32 type; /* Type of region */
+ __u32 _pad; /* Unused */
+};
+
+#endif /* !__i386__ */
+
+struct mtrr_var_range {
+ __u32 base_lo;
+ __u32 base_hi;
+ __u32 mask_lo;
+ __u32 mask_hi;
+};
+
+/* In the Intel processor's MTRR interface, the MTRR type is always held in
+ an 8 bit field: */
+typedef __u8 mtrr_type;
+
+#define MTRR_NUM_FIXED_RANGES 88
+#define MTRR_MAX_VAR_RANGES 256
+
+#define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg))
+#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1)
+
+/* These are the various ioctls */
+#define MTRRIOC_ADD_ENTRY _IOW(MTRR_IOCTL_BASE, 0, struct mtrr_sentry)
+#define MTRRIOC_SET_ENTRY _IOW(MTRR_IOCTL_BASE, 1, struct mtrr_sentry)
+#define MTRRIOC_DEL_ENTRY _IOW(MTRR_IOCTL_BASE, 2, struct mtrr_sentry)
+#define MTRRIOC_GET_ENTRY _IOWR(MTRR_IOCTL_BASE, 3, struct mtrr_gentry)
+#define MTRRIOC_KILL_ENTRY _IOW(MTRR_IOCTL_BASE, 4, struct mtrr_sentry)
+#define MTRRIOC_ADD_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 5, struct mtrr_sentry)
+#define MTRRIOC_SET_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 6, struct mtrr_sentry)
+#define MTRRIOC_DEL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 7, struct mtrr_sentry)
+#define MTRRIOC_GET_PAGE_ENTRY _IOWR(MTRR_IOCTL_BASE, 8, struct mtrr_gentry)
+#define MTRRIOC_KILL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 9, struct mtrr_sentry)
+
+/* MTRR memory types, which are defined in SDM */
+#define MTRR_TYPE_UNCACHABLE 0
+#define MTRR_TYPE_WRCOMB 1
+/*#define MTRR_TYPE_ 2*/
+/*#define MTRR_TYPE_ 3*/
+#define MTRR_TYPE_WRTHROUGH 4
+#define MTRR_TYPE_WRPROT 5
+#define MTRR_TYPE_WRBACK 6
+#define MTRR_NUM_TYPES 7
+
+/*
+ * Invalid MTRR memory type. No longer used outside of MTRR code.
+ * Note, this value is allocated from the reserved values (0x7-0xff) of
+ * the MTRR memory types.
+ */
+#define MTRR_TYPE_INVALID 0xff
+
+#endif /* _UAPI_ASM_X86_MTRR_H */
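
These ioctls are issued against /proc/mtrr. A minimal sketch, not part of the diff, that walks the variable ranges with MTRRIOC_GET_ENTRY; entries whose size is zero are unused, and the loop stops once the kernel rejects an out-of-range register number.

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <asm/mtrr.h>           /* struct mtrr_gentry, MTRRIOC_* from above */

    int main(void)
    {
            struct mtrr_gentry g = { 0 };
            int fd = open("/proc/mtrr", O_RDONLY);

            if (fd < 0)
                    return 1;
            for (g.regnum = 0; ioctl(fd, MTRRIOC_GET_ENTRY, &g) == 0; g.regnum++) {
                    if (g.size)
                            printf("reg %02u: base=%#llx size=%#x type=%u\n",
                                   g.regnum, (unsigned long long)g.base,
                                   g.size, g.type);
            }
            close(fd);
            return 0;
    }
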
diff --git a/arch/x86/include/uapi/asm/perf_regs.h b/arch/x86/include/uapi/asm/perf_regs.h
new file mode 100644
index 000000000000..7c9d2bb3833b
--- /dev/null
+++ b/arch/x86/include/uapi/asm/perf_regs.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_PERF_REGS_H
+#define _ASM_X86_PERF_REGS_H
+
+enum perf_event_x86_regs {
+ PERF_REG_X86_AX,
+ PERF_REG_X86_BX,
+ PERF_REG_X86_CX,
+ PERF_REG_X86_DX,
+ PERF_REG_X86_SI,
+ PERF_REG_X86_DI,
+ PERF_REG_X86_BP,
+ PERF_REG_X86_SP,
+ PERF_REG_X86_IP,
+ PERF_REG_X86_FLAGS,
+ PERF_REG_X86_CS,
+ PERF_REG_X86_SS,
+ PERF_REG_X86_DS,
+ PERF_REG_X86_ES,
+ PERF_REG_X86_FS,
+ PERF_REG_X86_GS,
+ PERF_REG_X86_R8,
+ PERF_REG_X86_R9,
+ PERF_REG_X86_R10,
+ PERF_REG_X86_R11,
+ PERF_REG_X86_R12,
+ PERF_REG_X86_R13,
+ PERF_REG_X86_R14,
+ PERF_REG_X86_R15,
+ /* These are the limits for the GPRs. */
+ PERF_REG_X86_32_MAX = PERF_REG_X86_GS + 1,
+ PERF_REG_X86_64_MAX = PERF_REG_X86_R15 + 1,
+
+ /* These all need two bits set because they are 128bit */
+ PERF_REG_X86_XMM0 = 32,
+ PERF_REG_X86_XMM1 = 34,
+ PERF_REG_X86_XMM2 = 36,
+ PERF_REG_X86_XMM3 = 38,
+ PERF_REG_X86_XMM4 = 40,
+ PERF_REG_X86_XMM5 = 42,
+ PERF_REG_X86_XMM6 = 44,
+ PERF_REG_X86_XMM7 = 46,
+ PERF_REG_X86_XMM8 = 48,
+ PERF_REG_X86_XMM9 = 50,
+ PERF_REG_X86_XMM10 = 52,
+ PERF_REG_X86_XMM11 = 54,
+ PERF_REG_X86_XMM12 = 56,
+ PERF_REG_X86_XMM13 = 58,
+ PERF_REG_X86_XMM14 = 60,
+ PERF_REG_X86_XMM15 = 62,
+
+ /* These include both GPRs and XMM registers */
+ PERF_REG_X86_XMM_MAX = PERF_REG_X86_XMM15 + 2,
+};
+
+#define PERF_REG_EXTENDED_MASK (~((1ULL << PERF_REG_X86_XMM0) - 1))
+
+#endif /* _ASM_X86_PERF_REGS_H */
diff --git a/arch/x86/include/uapi/asm/posix_types.h b/arch/x86/include/uapi/asm/posix_types.h
new file mode 100644
index 000000000000..c661e95f0134
--- /dev/null
+++ b/arch/x86/include/uapi/asm/posix_types.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef __KERNEL__
+# ifdef __i386__
+# include <asm/posix_types_32.h>
+# elif defined(__ILP32__)
+# include <asm/posix_types_x32.h>
+# else
+# include <asm/posix_types_64.h>
+# endif
+#endif
diff --git a/arch/x86/include/uapi/asm/posix_types_32.h b/arch/x86/include/uapi/asm/posix_types_32.h
new file mode 100644
index 000000000000..840659f4b96f
--- /dev/null
+++ b/arch/x86/include/uapi/asm/posix_types_32.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_POSIX_TYPES_32_H
+#define _ASM_X86_POSIX_TYPES_32_H
+
+/*
+ * This file is generally used by user-level software, so you need to
+ * be a little careful about namespace pollution etc. Also, we cannot
+ * assume GCC is being used.
+ */
+
+typedef unsigned short __kernel_mode_t;
+#define __kernel_mode_t __kernel_mode_t
+
+typedef unsigned short __kernel_ipc_pid_t;
+#define __kernel_ipc_pid_t __kernel_ipc_pid_t
+
+typedef unsigned short __kernel_uid_t;
+typedef unsigned short __kernel_gid_t;
+#define __kernel_uid_t __kernel_uid_t
+
+typedef unsigned short __kernel_old_dev_t;
+#define __kernel_old_dev_t __kernel_old_dev_t
+
+#include <asm-generic/posix_types.h>
+
+#endif /* _ASM_X86_POSIX_TYPES_32_H */
diff --git a/arch/x86/include/uapi/asm/posix_types_64.h b/arch/x86/include/uapi/asm/posix_types_64.h
new file mode 100644
index 000000000000..515afb8059ce
--- /dev/null
+++ b/arch/x86/include/uapi/asm/posix_types_64.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_POSIX_TYPES_64_H
+#define _ASM_X86_POSIX_TYPES_64_H
+
+/*
+ * This file is generally used by user-level software, so you need to
+ * be a little careful about namespace pollution etc. Also, we cannot
+ * assume GCC is being used.
+ */
+
+typedef unsigned short __kernel_old_uid_t;
+typedef unsigned short __kernel_old_gid_t;
+#define __kernel_old_uid_t __kernel_old_uid_t
+
+typedef unsigned long __kernel_old_dev_t;
+#define __kernel_old_dev_t __kernel_old_dev_t
+
+#include <asm-generic/posix_types.h>
+
+#endif /* _ASM_X86_POSIX_TYPES_64_H */
diff --git a/arch/x86/include/uapi/asm/posix_types_x32.h b/arch/x86/include/uapi/asm/posix_types_x32.h
new file mode 100644
index 000000000000..f60479b07fc8
--- /dev/null
+++ b/arch/x86/include/uapi/asm/posix_types_x32.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_POSIX_TYPES_X32_H
+#define _ASM_X86_POSIX_TYPES_X32_H
+
+/*
+ * This file is only used by user-level software, so you need to
+ * be a little careful about namespace pollution etc. Also, we cannot
+ * assume GCC is being used.
+ *
+ * These types should generally match the ones used by the 64-bit
+ * kernel.
+ */
+
+typedef long long __kernel_long_t;
+typedef unsigned long long __kernel_ulong_t;
+#define __kernel_long_t __kernel_long_t
+
+#include <asm/posix_types_64.h>
+
+#endif /* _ASM_X86_POSIX_TYPES_X32_H */
diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h
new file mode 100644
index 000000000000..384e2cc6ac19
--- /dev/null
+++ b/arch/x86/include/uapi/asm/prctl.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_PRCTL_H
+#define _ASM_X86_PRCTL_H
+
+#define ARCH_SET_GS 0x1001
+#define ARCH_SET_FS 0x1002
+#define ARCH_GET_FS 0x1003
+#define ARCH_GET_GS 0x1004
+
+#define ARCH_GET_CPUID 0x1011
+#define ARCH_SET_CPUID 0x1012
+
+#define ARCH_GET_XCOMP_SUPP 0x1021
+#define ARCH_GET_XCOMP_PERM 0x1022
+#define ARCH_REQ_XCOMP_PERM 0x1023
+#define ARCH_GET_XCOMP_GUEST_PERM 0x1024
+#define ARCH_REQ_XCOMP_GUEST_PERM 0x1025
+
+#define ARCH_XCOMP_TILECFG 17
+#define ARCH_XCOMP_TILEDATA 18
+
+#define ARCH_MAP_VDSO_X32 0x2001
+#define ARCH_MAP_VDSO_32 0x2002
+#define ARCH_MAP_VDSO_64 0x2003
+
+/* Don't use 0x3001-0x3004 because of old glibcs */
+
+#define ARCH_GET_UNTAG_MASK 0x4001
+#define ARCH_ENABLE_TAGGED_ADDR 0x4002
+#define ARCH_GET_MAX_TAG_BITS 0x4003
+#define ARCH_FORCE_TAGGED_SVA 0x4004
+
+#define ARCH_SHSTK_ENABLE 0x5001
+#define ARCH_SHSTK_DISABLE 0x5002
+#define ARCH_SHSTK_LOCK 0x5003
+#define ARCH_SHSTK_UNLOCK 0x5004
+#define ARCH_SHSTK_STATUS 0x5005
+
+/* ARCH_SHSTK_ features bits */
+#define ARCH_SHSTK_SHSTK (1ULL << 0)
+#define ARCH_SHSTK_WRSS (1ULL << 1)
+
+#endif /* _ASM_X86_PRCTL_H */
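
These ARCH_* codes are the first argument to the arch_prctl(2) syscall. glibc has historically not provided a wrapper for it, so a raw syscall(2) sketch (not part of the diff) reading the current FS base on x86-64 looks like this:

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <asm/prctl.h>          /* ARCH_GET_FS and friends from above */

    int main(void)
    {
            unsigned long fsbase = 0;

            if (syscall(SYS_arch_prctl, ARCH_GET_FS, &fsbase) == 0)
                    printf("FS base: %#lx\n", fsbase);
            return 0;
    }
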
diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
new file mode 100644
index 000000000000..81d0c8bf1137
--- /dev/null
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -0,0 +1,183 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_ASM_X86_PROCESSOR_FLAGS_H
+#define _UAPI_ASM_X86_PROCESSOR_FLAGS_H
+/* Various flags defined: can be included from assembler. */
+
+#include <linux/const.h>
+
+/*
+ * EFLAGS bits
+ */
+#define X86_EFLAGS_CF_BIT 0 /* Carry Flag */
+#define X86_EFLAGS_CF _BITUL(X86_EFLAGS_CF_BIT)
+#define X86_EFLAGS_FIXED_BIT 1 /* Bit 1 - always on */
+#define X86_EFLAGS_FIXED _BITUL(X86_EFLAGS_FIXED_BIT)
+#define X86_EFLAGS_PF_BIT 2 /* Parity Flag */
+#define X86_EFLAGS_PF _BITUL(X86_EFLAGS_PF_BIT)
+#define X86_EFLAGS_AF_BIT 4 /* Auxiliary carry Flag */
+#define X86_EFLAGS_AF _BITUL(X86_EFLAGS_AF_BIT)
+#define X86_EFLAGS_ZF_BIT 6 /* Zero Flag */
+#define X86_EFLAGS_ZF _BITUL(X86_EFLAGS_ZF_BIT)
+#define X86_EFLAGS_SF_BIT 7 /* Sign Flag */
+#define X86_EFLAGS_SF _BITUL(X86_EFLAGS_SF_BIT)
+#define X86_EFLAGS_TF_BIT 8 /* Trap Flag */
+#define X86_EFLAGS_TF _BITUL(X86_EFLAGS_TF_BIT)
+#define X86_EFLAGS_IF_BIT 9 /* Interrupt Flag */
+#define X86_EFLAGS_IF _BITUL(X86_EFLAGS_IF_BIT)
+#define X86_EFLAGS_DF_BIT 10 /* Direction Flag */
+#define X86_EFLAGS_DF _BITUL(X86_EFLAGS_DF_BIT)
+#define X86_EFLAGS_OF_BIT 11 /* Overflow Flag */
+#define X86_EFLAGS_OF _BITUL(X86_EFLAGS_OF_BIT)
+#define X86_EFLAGS_IOPL_BIT 12 /* I/O Privilege Level (2 bits) */
+#define X86_EFLAGS_IOPL (_AC(3,UL) << X86_EFLAGS_IOPL_BIT)
+#define X86_EFLAGS_NT_BIT 14 /* Nested Task */
+#define X86_EFLAGS_NT _BITUL(X86_EFLAGS_NT_BIT)
+#define X86_EFLAGS_RF_BIT 16 /* Resume Flag */
+#define X86_EFLAGS_RF _BITUL(X86_EFLAGS_RF_BIT)
+#define X86_EFLAGS_VM_BIT 17 /* Virtual Mode */
+#define X86_EFLAGS_VM _BITUL(X86_EFLAGS_VM_BIT)
+#define X86_EFLAGS_AC_BIT 18 /* Alignment Check/Access Control */
+#define X86_EFLAGS_AC _BITUL(X86_EFLAGS_AC_BIT)
+#define X86_EFLAGS_VIF_BIT 19 /* Virtual Interrupt Flag */
+#define X86_EFLAGS_VIF _BITUL(X86_EFLAGS_VIF_BIT)
+#define X86_EFLAGS_VIP_BIT 20 /* Virtual Interrupt Pending */
+#define X86_EFLAGS_VIP _BITUL(X86_EFLAGS_VIP_BIT)
+#define X86_EFLAGS_ID_BIT 21 /* CPUID detection */
+#define X86_EFLAGS_ID _BITUL(X86_EFLAGS_ID_BIT)
+
+/*
+ * Basic CPU control in CR0
+ */
+#define X86_CR0_PE_BIT 0 /* Protection Enable */
+#define X86_CR0_PE _BITUL(X86_CR0_PE_BIT)
+#define X86_CR0_MP_BIT 1 /* Monitor Coprocessor */
+#define X86_CR0_MP _BITUL(X86_CR0_MP_BIT)
+#define X86_CR0_EM_BIT 2 /* Emulation */
+#define X86_CR0_EM _BITUL(X86_CR0_EM_BIT)
+#define X86_CR0_TS_BIT 3 /* Task Switched */
+#define X86_CR0_TS _BITUL(X86_CR0_TS_BIT)
+#define X86_CR0_ET_BIT 4 /* Extension Type */
+#define X86_CR0_ET _BITUL(X86_CR0_ET_BIT)
+#define X86_CR0_NE_BIT 5 /* Numeric Error */
+#define X86_CR0_NE _BITUL(X86_CR0_NE_BIT)
+#define X86_CR0_WP_BIT 16 /* Write Protect */
+#define X86_CR0_WP _BITUL(X86_CR0_WP_BIT)
+#define X86_CR0_AM_BIT 18 /* Alignment Mask */
+#define X86_CR0_AM _BITUL(X86_CR0_AM_BIT)
+#define X86_CR0_NW_BIT 29 /* Not Write-through */
+#define X86_CR0_NW _BITUL(X86_CR0_NW_BIT)
+#define X86_CR0_CD_BIT 30 /* Cache Disable */
+#define X86_CR0_CD _BITUL(X86_CR0_CD_BIT)
+#define X86_CR0_PG_BIT 31 /* Paging */
+#define X86_CR0_PG _BITUL(X86_CR0_PG_BIT)
+
+/*
+ * Paging options in CR3
+ */
+#define X86_CR3_PWT_BIT 3 /* Page Write Through */
+#define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT)
+#define X86_CR3_PCD_BIT 4 /* Page Cache Disable */
+#define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT)
+
+#define X86_CR3_PCID_BITS 12
+#define X86_CR3_PCID_MASK (_AC((1UL << X86_CR3_PCID_BITS) - 1, UL))
+
+#define X86_CR3_LAM_U57_BIT 61 /* Activate LAM for userspace, 62:57 bits masked */
+#define X86_CR3_LAM_U57 _BITULL(X86_CR3_LAM_U57_BIT)
+#define X86_CR3_LAM_U48_BIT 62 /* Activate LAM for userspace, 62:48 bits masked */
+#define X86_CR3_LAM_U48 _BITULL(X86_CR3_LAM_U48_BIT)
+#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
+#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT)
+
+/*
+ * Intel CPU features in CR4
+ */
+#define X86_CR4_VME_BIT 0 /* enable vm86 extensions */
+#define X86_CR4_VME _BITUL(X86_CR4_VME_BIT)
+#define X86_CR4_PVI_BIT 1 /* virtual interrupts flag enable */
+#define X86_CR4_PVI _BITUL(X86_CR4_PVI_BIT)
+#define X86_CR4_TSD_BIT 2 /* disable time stamp at ipl 3 */
+#define X86_CR4_TSD _BITUL(X86_CR4_TSD_BIT)
+#define X86_CR4_DE_BIT 3 /* enable debugging extensions */
+#define X86_CR4_DE _BITUL(X86_CR4_DE_BIT)
+#define X86_CR4_PSE_BIT 4 /* enable page size extensions */
+#define X86_CR4_PSE _BITUL(X86_CR4_PSE_BIT)
+#define X86_CR4_PAE_BIT 5 /* enable physical address extensions */
+#define X86_CR4_PAE _BITUL(X86_CR4_PAE_BIT)
+#define X86_CR4_MCE_BIT 6 /* Machine check enable */
+#define X86_CR4_MCE _BITUL(X86_CR4_MCE_BIT)
+#define X86_CR4_PGE_BIT 7 /* enable global pages */
+#define X86_CR4_PGE _BITUL(X86_CR4_PGE_BIT)
+#define X86_CR4_PCE_BIT 8 /* enable performance counters at ipl 3 */
+#define X86_CR4_PCE _BITUL(X86_CR4_PCE_BIT)
+#define X86_CR4_OSFXSR_BIT 9 /* enable fast FPU save and restore */
+#define X86_CR4_OSFXSR _BITUL(X86_CR4_OSFXSR_BIT)
+#define X86_CR4_OSXMMEXCPT_BIT 10 /* enable unmasked SSE exceptions */
+#define X86_CR4_OSXMMEXCPT _BITUL(X86_CR4_OSXMMEXCPT_BIT)
+#define X86_CR4_UMIP_BIT 11 /* enable UMIP support */
+#define X86_CR4_UMIP _BITUL(X86_CR4_UMIP_BIT)
+#define X86_CR4_LA57_BIT 12 /* enable 5-level page tables */
+#define X86_CR4_LA57 _BITUL(X86_CR4_LA57_BIT)
+#define X86_CR4_VMXE_BIT 13 /* enable VMX virtualization */
+#define X86_CR4_VMXE _BITUL(X86_CR4_VMXE_BIT)
+#define X86_CR4_SMXE_BIT 14 /* enable safer mode (TXT) */
+#define X86_CR4_SMXE _BITUL(X86_CR4_SMXE_BIT)
+#define X86_CR4_FSGSBASE_BIT 16 /* enable RDWRFSGS support */
+#define X86_CR4_FSGSBASE _BITUL(X86_CR4_FSGSBASE_BIT)
+#define X86_CR4_PCIDE_BIT 17 /* enable PCID support */
+#define X86_CR4_PCIDE _BITUL(X86_CR4_PCIDE_BIT)
+#define X86_CR4_OSXSAVE_BIT 18 /* enable xsave and xrestore */
+#define X86_CR4_OSXSAVE _BITUL(X86_CR4_OSXSAVE_BIT)
+#define X86_CR4_SMEP_BIT 20 /* enable SMEP support */
+#define X86_CR4_SMEP _BITUL(X86_CR4_SMEP_BIT)
+#define X86_CR4_SMAP_BIT 21 /* enable SMAP support */
+#define X86_CR4_SMAP _BITUL(X86_CR4_SMAP_BIT)
+#define X86_CR4_PKE_BIT 22 /* enable Protection Keys support */
+#define X86_CR4_PKE _BITUL(X86_CR4_PKE_BIT)
+#define X86_CR4_CET_BIT 23 /* enable Control-flow Enforcement Technology */
+#define X86_CR4_CET _BITUL(X86_CR4_CET_BIT)
+#define X86_CR4_LASS_BIT 27 /* enable Linear Address Space Separation support */
+#define X86_CR4_LASS _BITUL(X86_CR4_LASS_BIT)
+#define X86_CR4_LAM_SUP_BIT 28 /* LAM for supervisor pointers */
+#define X86_CR4_LAM_SUP _BITUL(X86_CR4_LAM_SUP_BIT)
+
+#ifdef __x86_64__
+#define X86_CR4_FRED_BIT 32 /* enable FRED kernel entry */
+#define X86_CR4_FRED _BITUL(X86_CR4_FRED_BIT)
+#else
+#define X86_CR4_FRED (0)
+#endif
+
+/*
+ * x86-64 Task Priority Register, CR8
+ */
+#define X86_CR8_TPR _AC(0x0000000f,UL) /* task priority register */
+
+/*
+ * AMD and Transmeta use MSRs for configuration; see <asm/msr-index.h>
+ */
+
+/*
+ * NSC/Cyrix CPU configuration register indexes
+ */
+#define CX86_PCR0 0x20
+#define CX86_GCR 0xb8
+#define CX86_CCR0 0xc0
+#define CX86_CCR1 0xc1
+#define CX86_CCR2 0xc2
+#define CX86_CCR3 0xc3
+#define CX86_CCR4 0xe8
+#define CX86_CCR5 0xe9
+#define CX86_CCR6 0xea
+#define CX86_CCR7 0xeb
+#define CX86_PCR1 0xf0
+#define CX86_DIR0 0xfe
+#define CX86_DIR1 0xff
+#define CX86_ARR_BASE 0xc4
+#define CX86_RCR_BASE 0xdc
+
+#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
+ X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
+ X86_CR0_PG)
+
+#endif /* _UAPI_ASM_X86_PROCESSOR_FLAGS_H */
diff --git a/arch/x86/include/uapi/asm/ptrace-abi.h b/arch/x86/include/uapi/asm/ptrace-abi.h
new file mode 100644
index 000000000000..5823584dea13
--- /dev/null
+++ b/arch/x86/include/uapi/asm/ptrace-abi.h
@@ -0,0 +1,94 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_PTRACE_ABI_H
+#define _ASM_X86_PTRACE_ABI_H
+
+#ifdef __i386__
+
+#define EBX 0
+#define ECX 1
+#define EDX 2
+#define ESI 3
+#define EDI 4
+#define EBP 5
+#define EAX 6
+#define DS 7
+#define ES 8
+#define FS 9
+#define GS 10
+#define ORIG_EAX 11
+#define EIP 12
+#define CS 13
+#define EFL 14
+#define UESP 15
+#define SS 16
+#define FRAME_SIZE 17
+
+#else /* __i386__ */
+
+#if defined(__ASSEMBLER__) || defined(__FRAME_OFFSETS)
+/*
+ * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
+ * unless syscall needs a complete, fully filled "struct pt_regs".
+ */
+#define R15 0
+#define R14 8
+#define R13 16
+#define R12 24
+#define RBP 32
+#define RBX 40
+/* These regs are callee-clobbered. Always saved on kernel entry. */
+#define R11 48
+#define R10 56
+#define R9 64
+#define R8 72
+#define RAX 80
+#define RCX 88
+#define RDX 96
+#define RSI 104
+#define RDI 112
+/*
+ * On syscall entry, this is syscall#. On CPU exception, this is error code.
+ * On hw interrupt, it's IRQ number:
+ */
+#define ORIG_RAX 120
+/* Return frame for iretq */
+#define RIP 128
+#define CS 136
+#define EFLAGS 144
+#define RSP 152
+#define SS 160
+#endif /* __ASSEMBLER__ */
+
+/* top of stack page */
+#define FRAME_SIZE 168
+
+#endif /* !__i386__ */
+
+/* Arbitrarily choose the same ptrace numbers as used by the Sparc code. */
+#define PTRACE_GETREGS 12
+#define PTRACE_SETREGS 13
+#define PTRACE_GETFPREGS 14
+#define PTRACE_SETFPREGS 15
+#define PTRACE_GETFPXREGS 18
+#define PTRACE_SETFPXREGS 19
+
+#define PTRACE_OLDSETOPTIONS 21
+
+/* only useful for accessing 32-bit programs / kernels */
+#define PTRACE_GET_THREAD_AREA 25
+#define PTRACE_SET_THREAD_AREA 26
+
+#ifdef __x86_64__
+# define PTRACE_ARCH_PRCTL 30
+#endif
+
+#define PTRACE_SYSEMU 31
+#define PTRACE_SYSEMU_SINGLESTEP 32
+
+#define PTRACE_SINGLEBLOCK 33 /* resume execution until next branch */
+
+#ifndef __ASSEMBLER__
+#include <linux/types.h>
+#endif
+
+#endif /* _ASM_X86_PTRACE_ABI_H */
diff --git a/arch/x86/include/uapi/asm/ptrace.h b/arch/x86/include/uapi/asm/ptrace.h
new file mode 100644
index 000000000000..e0b5b4f6226b
--- /dev/null
+++ b/arch/x86/include/uapi/asm/ptrace.h
@@ -0,0 +1,86 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_ASM_X86_PTRACE_H
+#define _UAPI_ASM_X86_PTRACE_H
+
+#include <linux/compiler.h> /* For __user */
+#include <asm/ptrace-abi.h>
+#include <asm/processor-flags.h>
+
+
+#ifndef __ASSEMBLER__
+
+#ifdef __i386__
+/* this struct defines the way the registers are stored on the
+ stack during a system call. */
+
+#ifndef __KERNEL__
+
+struct pt_regs {
+ long ebx;
+ long ecx;
+ long edx;
+ long esi;
+ long edi;
+ long ebp;
+ long eax;
+ int xds;
+ int xes;
+ int xfs;
+ int xgs;
+ long orig_eax;
+ long eip;
+ int xcs;
+ long eflags;
+ long esp;
+ int xss;
+};
+
+#endif /* __KERNEL__ */
+
+#else /* __i386__ */
+
+#ifndef __KERNEL__
+
+struct pt_regs {
+/*
+ * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
+ * unless syscall needs a complete, fully filled "struct pt_regs".
+ */
+ unsigned long r15;
+ unsigned long r14;
+ unsigned long r13;
+ unsigned long r12;
+ unsigned long rbp;
+ unsigned long rbx;
+/* These regs are callee-clobbered. Always saved on kernel entry. */
+ unsigned long r11;
+ unsigned long r10;
+ unsigned long r9;
+ unsigned long r8;
+ unsigned long rax;
+ unsigned long rcx;
+ unsigned long rdx;
+ unsigned long rsi;
+ unsigned long rdi;
+/*
+ * On syscall entry, this is syscall#. On CPU exception, this is error code.
+ * On hw interrupt, it's IRQ number:
+ */
+ unsigned long orig_rax;
+/* Return frame for iretq */
+ unsigned long rip;
+ unsigned long cs;
+ unsigned long eflags;
+ unsigned long rsp;
+ unsigned long ss;
+/* top of stack page */
+};
+
+#endif /* __KERNEL__ */
+#endif /* !__i386__ */
+
+
+
+#endif /* !__ASSEMBLER__ */
+
+#endif /* _UAPI_ASM_X86_PTRACE_H */
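
The userspace struct pt_regs above mirrors what the PTRACE_GETREGS request (see ptrace-abi.h earlier in this patch) fills in; most tracers use the equivalent struct user_regs_struct from <sys/user.h>. A small tracer sketch, not part of the diff:

    #include <stdio.h>
    #include <sys/ptrace.h>
    #include <sys/types.h>
    #include <sys/user.h>
    #include <sys/wait.h>
    #include <unistd.h>

    int main(void)
    {
            pid_t child = fork();

            if (child == 0) {
                    ptrace(PTRACE_TRACEME, 0, NULL, NULL);
                    execl("/bin/true", "true", (char *)NULL);
                    _exit(127);
            }
            waitpid(child, NULL, 0);        /* child stops on execve */
            struct user_regs_struct regs;
            ptrace(PTRACE_GETREGS, child, NULL, &regs);
    #ifdef __x86_64__
            printf("child RIP = %#llx\n", (unsigned long long)regs.rip);
    #endif
            ptrace(PTRACE_CONT, child, NULL, NULL);
            waitpid(child, NULL, 0);
            return 0;
    }
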
diff --git a/arch/x86/include/uapi/asm/sembuf.h b/arch/x86/include/uapi/asm/sembuf.h
new file mode 100644
index 000000000000..71205b02a191
--- /dev/null
+++ b/arch/x86/include/uapi/asm/sembuf.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_SEMBUF_H
+#define _ASM_X86_SEMBUF_H
+
+#include <asm/ipcbuf.h>
+
+/*
+ * The semid64_ds structure for x86 architecture.
+ * Note extra padding because this structure is passed back and forth
+ * between kernel and user space.
+ *
+ * Pad space is left for:
+ * - 2 miscellaneous 32-bit values
+ *
+ * x86_64 and x32 incorrectly added padding here, so the structures
+ * are still incompatible with the padding on x86.
+ */
+struct semid64_ds {
+ struct ipc64_perm sem_perm; /* permissions .. see ipc.h */
+#ifdef __i386__
+ unsigned long sem_otime; /* last semop time */
+ unsigned long sem_otime_high;
+ unsigned long sem_ctime; /* last change time */
+ unsigned long sem_ctime_high;
+#else
+ __kernel_long_t sem_otime; /* last semop time */
+ __kernel_ulong_t __unused1;
+ __kernel_long_t sem_ctime; /* last change time */
+ __kernel_ulong_t __unused2;
+#endif
+ __kernel_ulong_t sem_nsems; /* no. of semaphores in array */
+ __kernel_ulong_t __unused3;
+ __kernel_ulong_t __unused4;
+};
+
+#endif /* _ASM_X86_SEMBUF_H */
diff --git a/arch/x86/include/uapi/asm/setup.h b/arch/x86/include/uapi/asm/setup.h
new file mode 100644
index 000000000000..79a9626b5500
--- /dev/null
+++ b/arch/x86/include/uapi/asm/setup.h
@@ -0,0 +1 @@
+/* */
diff --git a/arch/x86/include/uapi/asm/setup_data.h b/arch/x86/include/uapi/asm/setup_data.h
new file mode 100644
index 000000000000..2671c4e1b3a0
--- /dev/null
+++ b/arch/x86/include/uapi/asm/setup_data.h
@@ -0,0 +1,94 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_ASM_X86_SETUP_DATA_H
+#define _UAPI_ASM_X86_SETUP_DATA_H
+
+/* setup_data/setup_indirect types */
+#define SETUP_NONE 0
+#define SETUP_E820_EXT 1
+#define SETUP_DTB 2
+#define SETUP_PCI 3
+#define SETUP_EFI 4
+#define SETUP_APPLE_PROPERTIES 5
+#define SETUP_JAILHOUSE 6
+#define SETUP_CC_BLOB 7
+#define SETUP_IMA 8
+#define SETUP_RNG_SEED 9
+#define SETUP_KEXEC_KHO 10
+#define SETUP_ENUM_MAX SETUP_KEXEC_KHO
+
+#define SETUP_INDIRECT (1<<31)
+#define SETUP_TYPE_MAX (SETUP_ENUM_MAX | SETUP_INDIRECT)
+
+#ifndef __ASSEMBLER__
+
+#include <linux/types.h>
+
+/* extensible setup data list node */
+struct setup_data {
+ __u64 next;
+ __u32 type;
+ __u32 len;
+ __u8 data[];
+};
+
+/* extensible setup indirect data node */
+struct setup_indirect {
+ __u32 type;
+ __u32 reserved; /* Reserved, must be set to zero. */
+ __u64 len;
+ __u64 addr;
+};
+
+/*
+ * The E820 memory region entry of the boot protocol ABI:
+ */
+struct boot_e820_entry {
+ __u64 addr;
+ __u64 size;
+ __u32 type;
+} __attribute__((packed));
+
+/*
+ * The boot loader passes platform information via this Jailhouse-specific
+ * setup data structure.
+ */
+struct jailhouse_setup_data {
+ struct {
+ __u16 version;
+ __u16 compatible_version;
+ } __attribute__((packed)) hdr;
+ struct {
+ __u16 pm_timer_address;
+ __u16 num_cpus;
+ __u64 pci_mmconfig_base;
+ __u32 tsc_khz;
+ __u32 apic_khz;
+ __u8 standard_ioapic;
+ __u8 cpu_ids[255];
+ } __attribute__((packed)) v1;
+ struct {
+ __u32 flags;
+ } __attribute__((packed)) v2;
+} __attribute__((packed));
+
+/*
+ * IMA buffer setup data information from the previous kernel during kexec
+ */
+struct ima_setup_data {
+ __u64 addr;
+ __u64 size;
+} __attribute__((packed));
+
+/*
+ * Locations of kexec handover metadata
+ */
+struct kho_data {
+ __u64 fdt_addr;
+ __u64 fdt_size;
+ __u64 scratch_addr;
+ __u64 scratch_size;
+} __attribute__((packed));
+
+#endif /* __ASSEMBLER__ */
+
+#endif /* _UAPI_ASM_X86_SETUP_DATA_H */
diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h
new file mode 100644
index 000000000000..3c4d52072189
--- /dev/null
+++ b/arch/x86/include/uapi/asm/sgx.h
@@ -0,0 +1,238 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Copyright(c) 2016-20 Intel Corporation.
+ */
+#ifndef _UAPI_ASM_X86_SGX_H
+#define _UAPI_ASM_X86_SGX_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+/**
+ * enum sgx_page_flags - page control flags
+ * @SGX_PAGE_MEASURE: Measure the page contents with a sequence of
+ * ENCLS[EEXTEND] operations.
+ */
+enum sgx_page_flags {
+ SGX_PAGE_MEASURE = 0x01,
+};
+
+#define SGX_MAGIC 0xA4
+
+#define SGX_IOC_ENCLAVE_CREATE \
+ _IOW(SGX_MAGIC, 0x00, struct sgx_enclave_create)
+#define SGX_IOC_ENCLAVE_ADD_PAGES \
+ _IOWR(SGX_MAGIC, 0x01, struct sgx_enclave_add_pages)
+#define SGX_IOC_ENCLAVE_INIT \
+ _IOW(SGX_MAGIC, 0x02, struct sgx_enclave_init)
+#define SGX_IOC_ENCLAVE_PROVISION \
+ _IOW(SGX_MAGIC, 0x03, struct sgx_enclave_provision)
+#define SGX_IOC_VEPC_REMOVE_ALL \
+ _IO(SGX_MAGIC, 0x04)
+#define SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS \
+ _IOWR(SGX_MAGIC, 0x05, struct sgx_enclave_restrict_permissions)
+#define SGX_IOC_ENCLAVE_MODIFY_TYPES \
+ _IOWR(SGX_MAGIC, 0x06, struct sgx_enclave_modify_types)
+#define SGX_IOC_ENCLAVE_REMOVE_PAGES \
+ _IOWR(SGX_MAGIC, 0x07, struct sgx_enclave_remove_pages)
+
+/**
+ * struct sgx_enclave_create - parameter structure for the
+ * %SGX_IOC_ENCLAVE_CREATE ioctl
+ * @src: address for the SECS page data
+ */
+struct sgx_enclave_create {
+ __u64 src;
+};
+
+/**
+ * struct sgx_enclave_add_pages - parameter structure for the
+ * %SGX_IOC_ENCLAVE_ADD_PAGE ioctl
+ * @src: start address for the page data
+ * @offset: starting page offset
+ * @length: length of the data (multiple of the page size)
+ * @secinfo: address for the SECINFO data
+ * @flags: page control flags
+ * @count: number of bytes added (multiple of the page size)
+ */
+struct sgx_enclave_add_pages {
+ __u64 src;
+ __u64 offset;
+ __u64 length;
+ __u64 secinfo;
+ __u64 flags;
+ __u64 count;
+};
+
+/**
+ * struct sgx_enclave_init - parameter structure for the
+ * %SGX_IOC_ENCLAVE_INIT ioctl
+ * @sigstruct: address for the SIGSTRUCT data
+ */
+struct sgx_enclave_init {
+ __u64 sigstruct;
+};
+
+/**
+ * struct sgx_enclave_provision - parameter structure for the
+ * %SGX_IOC_ENCLAVE_PROVISION ioctl
+ * @fd: file handle of /dev/sgx_provision
+ */
+struct sgx_enclave_provision {
+ __u64 fd;
+};
+
+/**
+ * struct sgx_enclave_restrict_permissions - parameters for ioctl
+ * %SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS
+ * @offset: starting page offset (page aligned relative to enclave base
+ * address defined in SECS)
+ * @length: length of memory (multiple of the page size)
+ * @permissions: new permission bits for pages in range described by @offset
+ * and @length
+ * @result: (output) SGX result code of ENCLS[EMODPR] function
+ * @count: (output) bytes successfully changed (multiple of page size)
+ */
+struct sgx_enclave_restrict_permissions {
+ __u64 offset;
+ __u64 length;
+ __u64 permissions;
+ __u64 result;
+ __u64 count;
+};
+
+/**
+ * struct sgx_enclave_modify_types - parameters for ioctl
+ * %SGX_IOC_ENCLAVE_MODIFY_TYPES
+ * @offset: starting page offset (page aligned relative to enclave base
+ * address defined in SECS)
+ * @length: length of memory (multiple of the page size)
+ * @page_type: new type for pages in range described by @offset and @length
+ * @result: (output) SGX result code of ENCLS[EMODT] function
+ * @count: (output) bytes successfully changed (multiple of page size)
+ */
+struct sgx_enclave_modify_types {
+ __u64 offset;
+ __u64 length;
+ __u64 page_type;
+ __u64 result;
+ __u64 count;
+};
+
+/**
+ * struct sgx_enclave_remove_pages - %SGX_IOC_ENCLAVE_REMOVE_PAGES parameters
+ * @offset: starting page offset (page aligned relative to enclave base
+ * address defined in SECS)
+ * @length: length of memory (multiple of the page size)
+ * @count: (output) bytes successfully changed (multiple of page size)
+ *
+ * Regular (PT_REG) or TCS (PT_TCS) pages can be removed from an initialized
+ * enclave if the system supports SGX2. First, the %SGX_IOC_ENCLAVE_MODIFY_TYPES
+ * ioctl() should be used to change the page type to PT_TRIM. After that
+ * succeeds ENCLU[EACCEPT] should be run from within the enclave and then
+ * %SGX_IOC_ENCLAVE_REMOVE_PAGES can be used to complete the page removal.
+ */
+struct sgx_enclave_remove_pages {
+ __u64 offset;
+ __u64 length;
+ __u64 count;
+};
+
+struct sgx_enclave_run;
+
+/**
+ * typedef sgx_enclave_user_handler_t - Exit handler function accepted by
+ * __vdso_sgx_enter_enclave()
+ * @rdi: RDI at the time of EEXIT, undefined on AEX
+ * @rsi: RSI at the time of EEXIT, undefined on AEX
+ * @rdx: RDX at the time of EEXIT, undefined on AEX
+ * @rsp: RSP (untrusted) at the time of EEXIT or AEX
+ * @r8: R8 at the time of EEXIT, undefined on AEX
+ * @r9: R9 at the time of EEXIT, undefined on AEX
+ * @run: The run instance given by the caller
+ *
+ * The register parameters contain the snapshot of their values at enclave
+ * exit. An invalid ENCLU function number will cause -EINVAL to be returned
+ * to the caller.
+ *
+ * Return:
+ * - <= 0: The given value is returned back to the caller.
+ * - > 0: ENCLU function to invoke, either EENTER or ERESUME.
+ */
+typedef int (*sgx_enclave_user_handler_t)(long rdi, long rsi, long rdx,
+ long rsp, long r8, long r9,
+ struct sgx_enclave_run *run);
+
+/**
+ * struct sgx_enclave_run - the execution context of __vdso_sgx_enter_enclave()
+ * @tcs: TCS used to enter the enclave
+ * @function: The last seen ENCLU function (EENTER, ERESUME or EEXIT)
+ * @exception_vector: The interrupt vector of the exception
+ * @exception_error_code: The exception error code pulled out of the stack
+ * @exception_addr: The address that triggered the exception
+ * @user_handler: User provided callback run on exception
+ * @user_data: Data passed to the user handler
+ * @reserved: Reserved for future extensions
+ *
+ * If @user_handler is provided, the handler will be invoked on all return paths
+ * of the normal flow. The user handler may transfer control, e.g. via a
+ * longjmp() call or a C++ exception, without returning to
+ * __vdso_sgx_enter_enclave().
+ */
+struct sgx_enclave_run {
+ __u64 tcs;
+ __u32 function;
+ __u16 exception_vector;
+ __u16 exception_error_code;
+ __u64 exception_addr;
+ __u64 user_handler;
+ __u64 user_data;
+ __u8 reserved[216];
+};
+
+/**
+ * typedef vdso_sgx_enter_enclave_t - Prototype for __vdso_sgx_enter_enclave(),
+ * a vDSO function to enter an SGX enclave.
+ * @rdi: Pass-through value for RDI
+ * @rsi: Pass-through value for RSI
+ * @rdx: Pass-through value for RDX
+ * @function: ENCLU function, must be EENTER or ERESUME
+ * @r8: Pass-through value for R8
+ * @r9: Pass-through value for R9
+ * @run: struct sgx_enclave_run, must be non-NULL
+ *
+ * NOTE: __vdso_sgx_enter_enclave() does not ensure full compliance with the
+ * x86-64 ABI, e.g. doesn't handle XSAVE state. Except for non-volatile
+ * general purpose registers, EFLAGS.DF, and RSP alignment, preserving/setting
+ * state in accordance with the x86-64 ABI is the responsibility of the enclave
+ * and its runtime, i.e. __vdso_sgx_enter_enclave() cannot be called from C
+ * code without careful consideration by both the enclave and its runtime.
+ *
+ * All general purpose registers except RAX, RBX and RCX are passed as-is to the
+ * enclave. RAX, RBX and RCX are consumed by EENTER and ERESUME and are loaded
+ * with @function, asynchronous exit pointer, and @run.tcs respectively.
+ *
+ * RBP and the stack are used to anchor __vdso_sgx_enter_enclave() to the
+ * pre-enclave state, e.g. to retrieve @run.exception and @run.user_handler
+ * after an enclave exit. All other registers are available for use by the
+ * enclave and its runtime, e.g. an enclave can push additional data onto the
+ * stack (and modify RSP) to pass information to the optional user handler (see
+ * below).
+ *
+ * Most exceptions reported on ENCLU, including those that occur within the
+ * enclave, are fixed up and reported synchronously instead of being delivered
+ * via a standard signal. Debug Exceptions (#DB) and Breakpoints (#BP) are
+ * never fixed up and are always delivered via standard signals. On synchronously
+ * reported exceptions, -EFAULT is returned and details about the exception are
+ * recorded in @run.exception, the optional sgx_enclave_exception struct.
+ *
+ * Return:
+ * - 0: ENCLU function was successfully executed.
+ * - -EINVAL: Invalid ENCL number (neither EENTER nor ERESUME).
+ */
+typedef int (*vdso_sgx_enter_enclave_t)(unsigned long rdi, unsigned long rsi,
+ unsigned long rdx, unsigned int function,
+ unsigned long r8, unsigned long r9,
+ struct sgx_enclave_run *run);
+
+#endif /* _UAPI_ASM_X86_SGX_H */
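
A heavily abridged sketch, not part of the diff, of the first enclave-build step using SGX_IOC_ENCLAVE_CREATE. The SECS page layout is not defined in this header, so it is stood in for by an opaque page-aligned buffer the caller is assumed to have filled per the SDM; a real runtime would follow up with SGX_IOC_ENCLAVE_ADD_PAGES and SGX_IOC_ENCLAVE_INIT.

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <asm/sgx.h>            /* SGX_IOC_* and parameter structs from above */

    int main(void)
    {
            void *secs = NULL;
            int fd = open("/dev/sgx_enclave", O_RDWR);

            if (fd < 0)
                    return 1;
            if (posix_memalign(&secs, 4096, 4096))
                    return 1;
            memset(secs, 0, 4096);          /* hypothetical: fill with a valid SECS */

            struct sgx_enclave_create create = { .src = (__u64)(uintptr_t)secs };
            if (ioctl(fd, SGX_IOC_ENCLAVE_CREATE, &create))
                    perror("SGX_IOC_ENCLAVE_CREATE");
            close(fd);
            return 0;
    }
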
diff --git a/arch/x86/include/uapi/asm/shmbuf.h b/arch/x86/include/uapi/asm/shmbuf.h
new file mode 100644
index 000000000000..13775bfdfee2
--- /dev/null
+++ b/arch/x86/include/uapi/asm/shmbuf.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef __ASM_X86_SHMBUF_H
+#define __ASM_X86_SHMBUF_H
+
+#if !defined(__x86_64__) || !defined(__ILP32__)
+#include <asm-generic/shmbuf.h>
+#else
+
+#include <asm/ipcbuf.h>
+#include <asm/posix_types.h>
+
+/*
+ * The shmid64_ds structure for x86 architecture with x32 ABI.
+ *
+ * On x86-32 and x86-64 we can just use the generic definition, but
+ * x32 uses the same binary layout as x86_64, which is different
+ * from other 32-bit architectures.
+ */
+
+struct shmid64_ds {
+ struct ipc64_perm shm_perm; /* operation perms */
+ __kernel_size_t shm_segsz; /* size of segment (bytes) */
+ __kernel_long_t shm_atime; /* last attach time */
+ __kernel_long_t shm_dtime; /* last detach time */
+ __kernel_long_t shm_ctime; /* last change time */
+ __kernel_pid_t shm_cpid; /* pid of creator */
+ __kernel_pid_t shm_lpid; /* pid of last operator */
+ __kernel_ulong_t shm_nattch; /* no. of current attaches */
+ __kernel_ulong_t __unused4;
+ __kernel_ulong_t __unused5;
+};
+
+struct shminfo64 {
+ __kernel_ulong_t shmmax;
+ __kernel_ulong_t shmmin;
+ __kernel_ulong_t shmmni;
+ __kernel_ulong_t shmseg;
+ __kernel_ulong_t shmall;
+ __kernel_ulong_t __unused1;
+ __kernel_ulong_t __unused2;
+ __kernel_ulong_t __unused3;
+ __kernel_ulong_t __unused4;
+};
+
+#endif
+
+#endif /* __ASM_X86_SHMBUF_H */
diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h
new file mode 100644
index 000000000000..d0d9b331d3a1
--- /dev/null
+++ b/arch/x86/include/uapi/asm/sigcontext.h
@@ -0,0 +1,389 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_ASM_X86_SIGCONTEXT_H
+#define _UAPI_ASM_X86_SIGCONTEXT_H
+
+/*
+ * Linux signal context definitions. The sigcontext includes a complex
+ * hierarchy of CPU and FPU state, available to user-space (on the stack) when
+ * a signal handler is executed.
+ *
+ * As over the years this ABI grew from its very simple roots towards
+ * supporting more and more CPU state organically, some of the details (which
+ * were rather clever hacks back in the day) have become a bit quirky by now.
+ *
+ * The current ABI includes flexible provisions for future extensions, so we
+ * won't have to grow new quirks for quite some time. Promise!
+ */
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+#define FP_XSTATE_MAGIC1 0x46505853U
+#define FP_XSTATE_MAGIC2 0x46505845U
+#define FP_XSTATE_MAGIC2_SIZE sizeof(FP_XSTATE_MAGIC2)
+
+/*
+ * Bytes 464..511 in the current 512-byte layout of the FXSAVE/FXRSTOR frame
+ * are reserved for SW usage. On CPUs supporting XSAVE/XRSTOR, these bytes are
+ * used to extend the fpstate pointer in the sigcontext, which now includes the
+ * extended state information along with fpstate information.
+ *
+ * If sw_reserved.magic1 == FP_XSTATE_MAGIC1 then an extended context area of
+ * sw_reserved.extended_size bytes is present. (The
+ * last 32-bit word of this extended area (at the
+ * fpstate+extended_size-FP_XSTATE_MAGIC2_SIZE address) is set to
+ * FP_XSTATE_MAGIC2 so that you can sanity check your size calculations.)
+ *
+ * This extended area typically grows with newer CPUs that have larger and
+ * larger XSAVE areas.
+ */
+struct _fpx_sw_bytes {
+ /*
+ * If set to FP_XSTATE_MAGIC1 then this is an xstate context.
+ * 0 if a legacy frame.
+ */
+ __u32 magic1;
+
+ /*
+ * Total size of the fpstate area:
+ *
+ * - if magic1 == 0 then it's sizeof(struct _fpstate)
+ * - if magic1 == FP_XSTATE_MAGIC1 then it's sizeof(struct _xstate)
+ * plus extensions (if any)
+ */
+ __u32 extended_size;
+
+ /*
+ * Feature bit mask (including FP/SSE/extended state) that is present
+ * in the memory layout:
+ */
+ __u64 xfeatures;
+
+ /*
+ * Actual XSAVE state size, based on the xfeatures saved in the layout.
+ * 'extended_size' is greater than 'xstate_size':
+ */
+ __u32 xstate_size;
+
+ /* For future use: */
+ __u32 padding[7];
+};
+
+/*
+ * As documented in the iBCS2 standard:
+ *
+ * The first part of "struct _fpstate" is just the normal i387 hardware setup;
+ * the extra "status" word is used to save the coprocessor status word before
+ * entering the handler.
+ *
+ * The FPU state data structure has had to grow to accommodate the extended FPU
+ * state required by the Streaming SIMD Extensions. There is no documented
+ * standard to accomplish this at the moment.
+ */
+
+/* 10-byte legacy floating point register: */
+struct _fpreg {
+ __u16 significand[4];
+ __u16 exponent;
+};
+
+/* 16-byte floating point register: */
+struct _fpxreg {
+ __u16 significand[4];
+ __u16 exponent;
+ __u16 padding[3];
+};
+
+/* 16-byte XMM register: */
+struct _xmmreg {
+ __u32 element[4];
+};
+
+#define X86_FXSR_MAGIC 0x0000
+
+/*
+ * The 32-bit FPU frame:
+ */
+struct _fpstate_32 {
+ /* Legacy FPU environment: */
+ __u32 cw;
+ __u32 sw;
+ __u32 tag;
+ __u32 ipoff;
+ __u32 cssel;
+ __u32 dataoff;
+ __u32 datasel;
+ struct _fpreg _st[8];
+ __u16 status;
+ __u16 magic; /* 0xffff: regular FPU data only */
+ /* 0x0000: FXSR FPU data */
+
+ /* FXSR FPU environment */
+ __u32 _fxsr_env[6]; /* FXSR FPU env is ignored */
+ __u32 mxcsr;
+ __u32 reserved;
+ struct _fpxreg _fxsr_st[8]; /* FXSR FPU reg data is ignored */
+ struct _xmmreg _xmm[8]; /* First 8 XMM registers */
+ union {
+ __u32 padding1[44]; /* Second 8 XMM registers plus padding */
+ __u32 padding[44]; /* Alias name for old user-space */
+ };
+
+ union {
+ __u32 padding2[12];
+ struct _fpx_sw_bytes sw_reserved; /* Potential extended state is encoded here */
+ };
+};
+
+/*
+ * The 64-bit FPU frame. (FXSAVE format and later)
+ *
+ * Note1: If sw_reserved.magic1 == FP_XSTATE_MAGIC1 then the structure is
+ * larger: 'struct _xstate'. Note that 'struct _xstate' embeds
+ * 'struct _fpstate', so the _fpstate portion is always present and the
+ * magic value can always be checked.
+ *
+ * Note2: Reserved fields may someday contain valuable data. Always
+ * save/restore them when you change signal frames.
+ */
+struct _fpstate_64 {
+ __u16 cwd;
+ __u16 swd;
+ /* Note this is not the same as the 32-bit/x87/FSAVE twd: */
+ __u16 twd;
+ __u16 fop;
+ __u64 rip;
+ __u64 rdp;
+ __u32 mxcsr;
+ __u32 mxcsr_mask;
+ __u32 st_space[32]; /* 8x FP registers, 16 bytes each */
+ __u32 xmm_space[64]; /* 16x XMM registers, 16 bytes each */
+ __u32 reserved2[12];
+ union {
+ __u32 reserved3[12];
+ struct _fpx_sw_bytes sw_reserved; /* Potential extended state is encoded here */
+ };
+};
+
+#ifdef __i386__
+# define _fpstate _fpstate_32
+#else
+# define _fpstate _fpstate_64
+#endif
+
+struct _header {
+ __u64 xfeatures;
+ __u64 reserved1[2];
+ __u64 reserved2[5];
+};
+
+struct _ymmh_state {
+ /* 16x YMM registers, 16 bytes each: */
+ __u32 ymmh_space[64];
+};
+
+/*
+ * Extended state pointed to by sigcontext::fpstate.
+ *
+ * In addition to the fpstate, information encoded in _xstate::xstate_hdr
+ * indicates the presence of other extended state information supported
+ * by the CPU and kernel:
+ */
+struct _xstate {
+ struct _fpstate fpstate;
+ struct _header xstate_hdr;
+ struct _ymmh_state ymmh;
+ /* New processor state extensions go here: */
+};
+
+/*
+ * The 32-bit signal frame:
+ */
+struct sigcontext_32 {
+ __u16 gs, __gsh;
+ __u16 fs, __fsh;
+ __u16 es, __esh;
+ __u16 ds, __dsh;
+ __u32 di;
+ __u32 si;
+ __u32 bp;
+ __u32 sp;
+ __u32 bx;
+ __u32 dx;
+ __u32 cx;
+ __u32 ax;
+ __u32 trapno;
+ __u32 err;
+ __u32 ip;
+ __u16 cs, __csh;
+ __u32 flags;
+ __u32 sp_at_signal;
+ __u16 ss, __ssh;
+
+ /*
+ * fpstate is really (struct _fpstate *) or (struct _xstate *)
+ * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved
+ * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end
+ * of extended memory layout. See comments at the definition of
+ * (struct _fpx_sw_bytes)
+ */
+ __u32 fpstate; /* Zero when no FPU/extended context */
+ __u32 oldmask;
+ __u32 cr2;
+};
+
+/*
+ * The 64-bit signal frame:
+ */
+struct sigcontext_64 {
+ __u64 r8;
+ __u64 r9;
+ __u64 r10;
+ __u64 r11;
+ __u64 r12;
+ __u64 r13;
+ __u64 r14;
+ __u64 r15;
+ __u64 di;
+ __u64 si;
+ __u64 bp;
+ __u64 bx;
+ __u64 dx;
+ __u64 ax;
+ __u64 cx;
+ __u64 sp;
+ __u64 ip;
+ __u64 flags;
+ __u16 cs;
+ __u16 gs;
+ __u16 fs;
+ __u16 ss;
+ __u64 err;
+ __u64 trapno;
+ __u64 oldmask;
+ __u64 cr2;
+
+ /*
+ * fpstate is really (struct _fpstate *) or (struct _xstate *)
+ * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved
+ * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end
+ * of extended memory layout. See comments at the definition of
+ * (struct _fpx_sw_bytes)
+ */
+ __u64 fpstate; /* Zero when no FPU/extended context */
+ __u64 reserved1[8];
+};
+
+/*
+ * Create the real 'struct sigcontext' type:
+ */
+#ifdef __KERNEL__
+# ifdef __i386__
+# define sigcontext sigcontext_32
+# else
+# define sigcontext sigcontext_64
+# endif
+#endif
+
+/*
+ * The old user-space sigcontext definition, just in case user-space still
+ * relies on it. The kernel definition (in asm/sigcontext.h) has unified
+ * field names but otherwise the same layout.
+ */
+#ifndef __KERNEL__
+
+#define _fpstate_ia32 _fpstate_32
+#define sigcontext_ia32 sigcontext_32
+
+
+# ifdef __i386__
+struct sigcontext {
+ __u16 gs, __gsh;
+ __u16 fs, __fsh;
+ __u16 es, __esh;
+ __u16 ds, __dsh;
+ __u32 edi;
+ __u32 esi;
+ __u32 ebp;
+ __u32 esp;
+ __u32 ebx;
+ __u32 edx;
+ __u32 ecx;
+ __u32 eax;
+ __u32 trapno;
+ __u32 err;
+ __u32 eip;
+ __u16 cs, __csh;
+ __u32 eflags;
+ __u32 esp_at_signal;
+ __u16 ss, __ssh;
+ struct _fpstate __user *fpstate;
+ __u32 oldmask;
+ __u32 cr2;
+};
+# else /* __x86_64__: */
+struct sigcontext {
+ __u64 r8;
+ __u64 r9;
+ __u64 r10;
+ __u64 r11;
+ __u64 r12;
+ __u64 r13;
+ __u64 r14;
+ __u64 r15;
+ __u64 rdi;
+ __u64 rsi;
+ __u64 rbp;
+ __u64 rbx;
+ __u64 rdx;
+ __u64 rax;
+ __u64 rcx;
+ __u64 rsp;
+ __u64 rip;
+ __u64 eflags; /* RFLAGS */
+ __u16 cs;
+
+ /*
+ * Prior to 2.5.64 ("[PATCH] x86-64 updates for 2.5.64-bk3"),
+ * Linux saved and restored fs and gs in these slots. This
+ * was counterproductive, as fsbase and gsbase were never
+ * saved, so arch_prctl was presumably unreliable.
+ *
+ * These slots should never be reused without extreme caution:
+ *
+ * - Some DOSEMU versions stash fs and gs in these slots manually,
+ * thus overwriting anything the kernel expects to be preserved
+ * in these slots.
+ *
+ * - If these slots are ever needed for any other purpose,
+ * there is some risk that very old 64-bit binaries could get
+ * confused. I doubt that many such binaries still work,
+ * though, since the same patch in 2.5.64 also removed the
+ * 64-bit set_thread_area syscall, so it appears that there
+ * is no TLS API beyond modify_ldt that works in both pre-
+ * and post-2.5.64 kernels.
+ *
+ * If the kernel ever adds explicit fs, gs, fsbase, and gsbase
+ * save/restore, it will most likely need to be opt-in and use
+ * different context slots.
+ */
+ __u16 gs;
+ __u16 fs;
+ union {
+ __u16 ss; /* If UC_SIGCONTEXT_SS */
+ __u16 __pad0; /* Alias name for old (!UC_SIGCONTEXT_SS) user-space */
+ };
+ __u64 err;
+ __u64 trapno;
+ __u64 oldmask;
+ __u64 cr2;
+ struct _fpstate __user *fpstate; /* Zero when no FPU context */
+# ifdef __ILP32__
+ __u32 __fpstate_pad;
+# endif
+ __u64 reserved1[8];
+};
+# endif /* __x86_64__ */
+#endif /* !__KERNEL__ */
+
+#endif /* _UAPI_ASM_X86_SIGCONTEXT_H */
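The _fpx_sw_bytes comment above effectively defines a two-magic protocol for detecting extended state in a signal frame. A hedged sketch of that check for the 64-bit frame follows; in a real SA_SIGINFO handler the fpstate pointer would come from the mcontext, and the function name here is purely illustrative.

    #include <stdint.h>
    #include <string.h>
    #include <asm/sigcontext.h>

    /* Returns 1 when the frame carries extended (XSAVE) state. */
    static int fpstate_has_xstate(const struct _fpstate_64 *fp)
    {
            const uint8_t *base = (const uint8_t *)fp;
            uint32_t magic2;

            if (!fp || fp->sw_reserved.magic1 != FP_XSTATE_MAGIC1)
                    return 0;

            /* FP_XSTATE_MAGIC2 sits in the last 32-bit word of the extended area. */
            memcpy(&magic2,
                   base + fp->sw_reserved.extended_size - FP_XSTATE_MAGIC2_SIZE,
                   sizeof(magic2));
            return magic2 == FP_XSTATE_MAGIC2;
    }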
diff --git a/arch/x86/include/uapi/asm/sigcontext32.h b/arch/x86/include/uapi/asm/sigcontext32.h
new file mode 100644
index 000000000000..7114801d0499
--- /dev/null
+++ b/arch/x86/include/uapi/asm/sigcontext32.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_SIGCONTEXT32_H
+#define _ASM_X86_SIGCONTEXT32_H
+
+/* This is a legacy file - all the type definitions are in sigcontext.h: */
+
+#include <asm/sigcontext.h>
+
+#endif /* _ASM_X86_SIGCONTEXT32_H */
diff --git a/arch/x86/include/uapi/asm/siginfo.h b/arch/x86/include/uapi/asm/siginfo.h
new file mode 100644
index 000000000000..6642d8be40c4
--- /dev/null
+++ b/arch/x86/include/uapi/asm/siginfo.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_SIGINFO_H
+#define _ASM_X86_SIGINFO_H
+
+#ifdef __x86_64__
+# ifdef __ILP32__ /* x32 */
+typedef long long __kernel_si_clock_t __attribute__((aligned(4)));
+# define __ARCH_SI_CLOCK_T __kernel_si_clock_t
+# define __ARCH_SI_ATTRIBUTES __attribute__((aligned(8)))
+# endif
+#endif
+
+#include <asm-generic/siginfo.h>
+
+#endif /* _ASM_X86_SIGINFO_H */
diff --git a/arch/x86/include/uapi/asm/signal.h b/arch/x86/include/uapi/asm/signal.h
new file mode 100644
index 000000000000..1067efabf18b
--- /dev/null
+++ b/arch/x86/include/uapi/asm/signal.h
@@ -0,0 +1,111 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_ASM_X86_SIGNAL_H
+#define _UAPI_ASM_X86_SIGNAL_H
+
+#ifndef __ASSEMBLER__
+#include <linux/types.h>
+#include <linux/compiler.h>
+
+/* Avoid too many header ordering problems. */
+struct siginfo;
+
+#ifndef __KERNEL__
+/* Here we must cater to libcs that poke about in kernel headers. */
+
+#define NSIG 32
+typedef unsigned long sigset_t;
+
+#endif /* __KERNEL__ */
+#endif /* __ASSEMBLER__ */
+
+
+#define SIGHUP 1
+#define SIGINT 2
+#define SIGQUIT 3
+#define SIGILL 4
+#define SIGTRAP 5
+#define SIGABRT 6
+#define SIGIOT 6
+#define SIGBUS 7
+#define SIGFPE 8
+#define SIGKILL 9
+#define SIGUSR1 10
+#define SIGSEGV 11
+#define SIGUSR2 12
+#define SIGPIPE 13
+#define SIGALRM 14
+#define SIGTERM 15
+#define SIGSTKFLT 16
+#define SIGCHLD 17
+#define SIGCONT 18
+#define SIGSTOP 19
+#define SIGTSTP 20
+#define SIGTTIN 21
+#define SIGTTOU 22
+#define SIGURG 23
+#define SIGXCPU 24
+#define SIGXFSZ 25
+#define SIGVTALRM 26
+#define SIGPROF 27
+#define SIGWINCH 28
+#define SIGIO 29
+#define SIGPOLL SIGIO
+/*
+#define SIGLOST 29
+*/
+#define SIGPWR 30
+#define SIGSYS 31
+#define SIGUNUSED 31
+
+/* These should not be considered constants from userland. */
+#define SIGRTMIN 32
+#define SIGRTMAX _NSIG
+
+#define SA_RESTORER 0x04000000
+
+#define MINSIGSTKSZ 2048
+#define SIGSTKSZ 8192
+
+#include <asm-generic/signal-defs.h>
+
+#ifndef __ASSEMBLER__
+
+
+# ifndef __KERNEL__
+/* Here we must cater to libcs that poke about in kernel headers. */
+#ifdef __i386__
+
+struct sigaction {
+ union {
+ __sighandler_t _sa_handler;
+ void (*_sa_sigaction)(int, struct siginfo *, void *);
+ } _u;
+ sigset_t sa_mask;
+ unsigned long sa_flags;
+ void (*sa_restorer)(void);
+};
+
+#define sa_handler _u._sa_handler
+#define sa_sigaction _u._sa_sigaction
+
+#else /* __i386__ */
+
+struct sigaction {
+ __sighandler_t sa_handler;
+ unsigned long sa_flags;
+ __sigrestore_t sa_restorer;
+ sigset_t sa_mask; /* mask last for extensibility */
+};
+
+#endif /* !__i386__ */
+# endif /* ! __KERNEL__ */
+
+typedef struct sigaltstack {
+ void __user *ss_sp;
+ int ss_flags;
+ __kernel_size_t ss_size;
+} stack_t;
+
+#endif /* __ASSEMBLER__ */
+
+#endif /* _UAPI_ASM_X86_SIGNAL_H */
diff --git a/arch/x86/include/asm/stat.h b/arch/x86/include/uapi/asm/stat.h
index e0b1d9bbcbc6..9e3982d95d0f 100644
--- a/arch/x86/include/asm/stat.h
+++ b/arch/x86/include/uapi/asm/stat.h
@@ -1,6 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
#ifndef _ASM_X86_STAT_H
#define _ASM_X86_STAT_H
+#include <asm/posix_types.h>
+
#define STAT_HAVE_NSEC 1
#ifdef __i386__
@@ -25,6 +28,12 @@ struct stat {
unsigned long __unused5;
};
+/* We don't need to memset the whole thing just to initialize the padding */
+#define INIT_STRUCT_STAT_PADDING(st) do { \
+ st.__unused4 = 0; \
+ st.__unused5 = 0; \
+} while (0)
+
#define STAT64_HAS_BROKEN_ST_INO 1
/* This matches struct stat64 in glibc2.1, hence the absolutely
@@ -63,30 +72,45 @@ struct stat64 {
unsigned long long st_ino;
};
+/* We don't need to memset the whole thing just to initialize the padding */
+#define INIT_STRUCT_STAT64_PADDING(st) do { \
+ memset(&st.__pad0, 0, sizeof(st.__pad0)); \
+ memset(&st.__pad3, 0, sizeof(st.__pad3)); \
+} while (0)
+
#else /* __i386__ */
struct stat {
- unsigned long st_dev;
- unsigned long st_ino;
- unsigned long st_nlink;
+ __kernel_ulong_t st_dev;
+ __kernel_ulong_t st_ino;
+ __kernel_ulong_t st_nlink;
+
+ unsigned int st_mode;
+ unsigned int st_uid;
+ unsigned int st_gid;
+ unsigned int __pad0;
+ __kernel_ulong_t st_rdev;
+ __kernel_long_t st_size;
+ __kernel_long_t st_blksize;
+ __kernel_long_t st_blocks; /* Number 512-byte blocks allocated. */
+
+ __kernel_ulong_t st_atime;
+ __kernel_ulong_t st_atime_nsec;
+ __kernel_ulong_t st_mtime;
+ __kernel_ulong_t st_mtime_nsec;
+ __kernel_ulong_t st_ctime;
+ __kernel_ulong_t st_ctime_nsec;
+ __kernel_long_t __unused[3];
+};
- unsigned int st_mode;
- unsigned int st_uid;
- unsigned int st_gid;
- unsigned int __pad0;
- unsigned long st_rdev;
- long st_size;
- long st_blksize;
- long st_blocks; /* Number 512-byte blocks allocated. */
+/* We don't need to memset the whole thing just to initialize the padding */
+#define INIT_STRUCT_STAT_PADDING(st) do { \
+ st.__pad0 = 0; \
+ st.__unused[0] = 0; \
+ st.__unused[1] = 0; \
+ st.__unused[2] = 0; \
+} while (0)
- unsigned long st_atime;
- unsigned long st_atime_nsec;
- unsigned long st_mtime;
- unsigned long st_mtime_nsec;
- unsigned long st_ctime;
- unsigned long st_ctime_nsec;
- long __unused[3];
-};
#endif
/* for 32bit emulation and 32 bit kernels */
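The INIT_STRUCT_STAT_PADDING()/INIT_STRUCT_STAT64_PADDING() helpers added above let the kernel clear only the padding and unused fields instead of memset()ing the whole structure before copying it to user space. A hedged kernel-side sketch of the intended pattern, loosely modelled on fs/stat.c's cp_new_stat() with most field copies elided:

    #include <linux/errno.h>
    #include <linux/stat.h>
    #include <linux/uaccess.h>

    static int cp_stat_sketch(struct kstat *stat, struct stat __user *statbuf)
    {
            struct stat tmp;

            INIT_STRUCT_STAT_PADDING(tmp);  /* zero only the unused/pad fields */
            tmp.st_ino  = stat->ino;
            tmp.st_mode = stat->mode;
            tmp.st_size = stat->size;
            /* ... remaining fields ... */

            return copy_to_user(statbuf, &tmp, sizeof(tmp)) ? -EFAULT : 0;
    }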
diff --git a/arch/x86/include/asm/statfs.h b/arch/x86/include/uapi/asm/statfs.h
index 2d0adbf99a8e..13c2464cd2c4 100644
--- a/arch/x86/include/asm/statfs.h
+++ b/arch/x86/include/uapi/asm/statfs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
#ifndef _ASM_X86_STATFS_H
#define _ASM_X86_STATFS_H
diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h
new file mode 100644
index 000000000000..650e3256ea7d
--- /dev/null
+++ b/arch/x86/include/uapi/asm/svm.h
@@ -0,0 +1,252 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI__SVM_H
+#define _UAPI__SVM_H
+
+#define SVM_EXIT_READ_CR0 0x000
+#define SVM_EXIT_READ_CR2 0x002
+#define SVM_EXIT_READ_CR3 0x003
+#define SVM_EXIT_READ_CR4 0x004
+#define SVM_EXIT_READ_CR8 0x008
+#define SVM_EXIT_WRITE_CR0 0x010
+#define SVM_EXIT_WRITE_CR2 0x012
+#define SVM_EXIT_WRITE_CR3 0x013
+#define SVM_EXIT_WRITE_CR4 0x014
+#define SVM_EXIT_WRITE_CR8 0x018
+#define SVM_EXIT_READ_DR0 0x020
+#define SVM_EXIT_READ_DR1 0x021
+#define SVM_EXIT_READ_DR2 0x022
+#define SVM_EXIT_READ_DR3 0x023
+#define SVM_EXIT_READ_DR4 0x024
+#define SVM_EXIT_READ_DR5 0x025
+#define SVM_EXIT_READ_DR6 0x026
+#define SVM_EXIT_READ_DR7 0x027
+#define SVM_EXIT_WRITE_DR0 0x030
+#define SVM_EXIT_WRITE_DR1 0x031
+#define SVM_EXIT_WRITE_DR2 0x032
+#define SVM_EXIT_WRITE_DR3 0x033
+#define SVM_EXIT_WRITE_DR4 0x034
+#define SVM_EXIT_WRITE_DR5 0x035
+#define SVM_EXIT_WRITE_DR6 0x036
+#define SVM_EXIT_WRITE_DR7 0x037
+#define SVM_EXIT_EXCP_BASE 0x040
+#define SVM_EXIT_LAST_EXCP 0x05f
+#define SVM_EXIT_INTR 0x060
+#define SVM_EXIT_NMI 0x061
+#define SVM_EXIT_SMI 0x062
+#define SVM_EXIT_INIT 0x063
+#define SVM_EXIT_VINTR 0x064
+#define SVM_EXIT_CR0_SEL_WRITE 0x065
+#define SVM_EXIT_IDTR_READ 0x066
+#define SVM_EXIT_GDTR_READ 0x067
+#define SVM_EXIT_LDTR_READ 0x068
+#define SVM_EXIT_TR_READ 0x069
+#define SVM_EXIT_IDTR_WRITE 0x06a
+#define SVM_EXIT_GDTR_WRITE 0x06b
+#define SVM_EXIT_LDTR_WRITE 0x06c
+#define SVM_EXIT_TR_WRITE 0x06d
+#define SVM_EXIT_RDTSC 0x06e
+#define SVM_EXIT_RDPMC 0x06f
+#define SVM_EXIT_PUSHF 0x070
+#define SVM_EXIT_POPF 0x071
+#define SVM_EXIT_CPUID 0x072
+#define SVM_EXIT_RSM 0x073
+#define SVM_EXIT_IRET 0x074
+#define SVM_EXIT_SWINT 0x075
+#define SVM_EXIT_INVD 0x076
+#define SVM_EXIT_PAUSE 0x077
+#define SVM_EXIT_HLT 0x078
+#define SVM_EXIT_INVLPG 0x079
+#define SVM_EXIT_INVLPGA 0x07a
+#define SVM_EXIT_IOIO 0x07b
+#define SVM_EXIT_MSR 0x07c
+#define SVM_EXIT_TASK_SWITCH 0x07d
+#define SVM_EXIT_FERR_FREEZE 0x07e
+#define SVM_EXIT_SHUTDOWN 0x07f
+#define SVM_EXIT_VMRUN 0x080
+#define SVM_EXIT_VMMCALL 0x081
+#define SVM_EXIT_VMLOAD 0x082
+#define SVM_EXIT_VMSAVE 0x083
+#define SVM_EXIT_STGI 0x084
+#define SVM_EXIT_CLGI 0x085
+#define SVM_EXIT_SKINIT 0x086
+#define SVM_EXIT_RDTSCP 0x087
+#define SVM_EXIT_ICEBP 0x088
+#define SVM_EXIT_WBINVD 0x089
+#define SVM_EXIT_MONITOR 0x08a
+#define SVM_EXIT_MWAIT 0x08b
+#define SVM_EXIT_MWAIT_COND 0x08c
+#define SVM_EXIT_XSETBV 0x08d
+#define SVM_EXIT_RDPRU 0x08e
+#define SVM_EXIT_EFER_WRITE_TRAP 0x08f
+#define SVM_EXIT_CR0_WRITE_TRAP 0x090
+#define SVM_EXIT_CR1_WRITE_TRAP 0x091
+#define SVM_EXIT_CR2_WRITE_TRAP 0x092
+#define SVM_EXIT_CR3_WRITE_TRAP 0x093
+#define SVM_EXIT_CR4_WRITE_TRAP 0x094
+#define SVM_EXIT_CR5_WRITE_TRAP 0x095
+#define SVM_EXIT_CR6_WRITE_TRAP 0x096
+#define SVM_EXIT_CR7_WRITE_TRAP 0x097
+#define SVM_EXIT_CR8_WRITE_TRAP 0x098
+#define SVM_EXIT_CR9_WRITE_TRAP 0x099
+#define SVM_EXIT_CR10_WRITE_TRAP 0x09a
+#define SVM_EXIT_CR11_WRITE_TRAP 0x09b
+#define SVM_EXIT_CR12_WRITE_TRAP 0x09c
+#define SVM_EXIT_CR13_WRITE_TRAP 0x09d
+#define SVM_EXIT_CR14_WRITE_TRAP 0x09e
+#define SVM_EXIT_CR15_WRITE_TRAP 0x09f
+#define SVM_EXIT_INVPCID 0x0a2
+#define SVM_EXIT_BUS_LOCK 0x0a5
+#define SVM_EXIT_IDLE_HLT 0x0a6
+#define SVM_EXIT_NPF 0x400
+#define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401
+#define SVM_EXIT_AVIC_UNACCELERATED_ACCESS 0x402
+#define SVM_EXIT_VMGEXIT 0x403
+
+/* SEV-ES software-defined VMGEXIT events */
+#define SVM_VMGEXIT_MMIO_READ 0x80000001
+#define SVM_VMGEXIT_MMIO_WRITE 0x80000002
+#define SVM_VMGEXIT_NMI_COMPLETE 0x80000003
+#define SVM_VMGEXIT_AP_HLT_LOOP 0x80000004
+#define SVM_VMGEXIT_AP_JUMP_TABLE 0x80000005
+#define SVM_VMGEXIT_SET_AP_JUMP_TABLE 0
+#define SVM_VMGEXIT_GET_AP_JUMP_TABLE 1
+#define SVM_VMGEXIT_PSC 0x80000010
+#define SVM_VMGEXIT_GUEST_REQUEST 0x80000011
+#define SVM_VMGEXIT_EXT_GUEST_REQUEST 0x80000012
+#define SVM_VMGEXIT_AP_CREATION 0x80000013
+#define SVM_VMGEXIT_AP_CREATE_ON_INIT 0
+#define SVM_VMGEXIT_AP_CREATE 1
+#define SVM_VMGEXIT_AP_DESTROY 2
+#define SVM_VMGEXIT_SNP_RUN_VMPL 0x80000018
+#define SVM_VMGEXIT_SAVIC 0x8000001a
+#define SVM_VMGEXIT_SAVIC_REGISTER_GPA 0
+#define SVM_VMGEXIT_SAVIC_UNREGISTER_GPA 1
+#define SVM_VMGEXIT_SAVIC_SELF_GPA ~0ULL
+#define SVM_VMGEXIT_HV_FEATURES 0x8000fffd
+#define SVM_VMGEXIT_TERM_REQUEST 0x8000fffe
+#define SVM_VMGEXIT_TERM_REASON(reason_set, reason_code) \
+ /* SW_EXITINFO1[3:0] */ \
+ (((((u64)reason_set) & 0xf)) | \
+ /* SW_EXITINFO1[11:4] */ \
+ ((((u64)reason_code) & 0xff) << 4))
+#define SVM_VMGEXIT_UNSUPPORTED_EVENT 0x8000ffff
+
+/* Exit code reserved for hypervisor/software use */
+#define SVM_EXIT_SW 0xf0000000
+
+#define SVM_EXIT_ERR -1
+
+#define SVM_EXIT_REASONS \
+ { SVM_EXIT_READ_CR0, "read_cr0" }, \
+ { SVM_EXIT_READ_CR2, "read_cr2" }, \
+ { SVM_EXIT_READ_CR3, "read_cr3" }, \
+ { SVM_EXIT_READ_CR4, "read_cr4" }, \
+ { SVM_EXIT_READ_CR8, "read_cr8" }, \
+ { SVM_EXIT_WRITE_CR0, "write_cr0" }, \
+ { SVM_EXIT_WRITE_CR2, "write_cr2" }, \
+ { SVM_EXIT_WRITE_CR3, "write_cr3" }, \
+ { SVM_EXIT_WRITE_CR4, "write_cr4" }, \
+ { SVM_EXIT_WRITE_CR8, "write_cr8" }, \
+ { SVM_EXIT_READ_DR0, "read_dr0" }, \
+ { SVM_EXIT_READ_DR1, "read_dr1" }, \
+ { SVM_EXIT_READ_DR2, "read_dr2" }, \
+ { SVM_EXIT_READ_DR3, "read_dr3" }, \
+ { SVM_EXIT_READ_DR4, "read_dr4" }, \
+ { SVM_EXIT_READ_DR5, "read_dr5" }, \
+ { SVM_EXIT_READ_DR6, "read_dr6" }, \
+ { SVM_EXIT_READ_DR7, "read_dr7" }, \
+ { SVM_EXIT_WRITE_DR0, "write_dr0" }, \
+ { SVM_EXIT_WRITE_DR1, "write_dr1" }, \
+ { SVM_EXIT_WRITE_DR2, "write_dr2" }, \
+ { SVM_EXIT_WRITE_DR3, "write_dr3" }, \
+ { SVM_EXIT_WRITE_DR4, "write_dr4" }, \
+ { SVM_EXIT_WRITE_DR5, "write_dr5" }, \
+ { SVM_EXIT_WRITE_DR6, "write_dr6" }, \
+ { SVM_EXIT_WRITE_DR7, "write_dr7" }, \
+ { SVM_EXIT_EXCP_BASE + DE_VECTOR, "DE excp" }, \
+ { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" }, \
+ { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" }, \
+ { SVM_EXIT_EXCP_BASE + OF_VECTOR, "OF excp" }, \
+ { SVM_EXIT_EXCP_BASE + BR_VECTOR, "BR excp" }, \
+ { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, \
+ { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, \
+ { SVM_EXIT_EXCP_BASE + DF_VECTOR, "DF excp" }, \
+ { SVM_EXIT_EXCP_BASE + TS_VECTOR, "TS excp" }, \
+ { SVM_EXIT_EXCP_BASE + NP_VECTOR, "NP excp" }, \
+ { SVM_EXIT_EXCP_BASE + SS_VECTOR, "SS excp" }, \
+ { SVM_EXIT_EXCP_BASE + GP_VECTOR, "GP excp" }, \
+ { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, \
+ { SVM_EXIT_EXCP_BASE + MF_VECTOR, "MF excp" }, \
+ { SVM_EXIT_EXCP_BASE + AC_VECTOR, "AC excp" }, \
+ { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, \
+ { SVM_EXIT_EXCP_BASE + XM_VECTOR, "XF excp" }, \
+ { SVM_EXIT_INTR, "interrupt" }, \
+ { SVM_EXIT_NMI, "nmi" }, \
+ { SVM_EXIT_SMI, "smi" }, \
+ { SVM_EXIT_INIT, "init" }, \
+ { SVM_EXIT_VINTR, "vintr" }, \
+ { SVM_EXIT_CR0_SEL_WRITE, "cr0_sel_write" }, \
+ { SVM_EXIT_IDTR_READ, "read_idtr" }, \
+ { SVM_EXIT_GDTR_READ, "read_gdtr" }, \
+ { SVM_EXIT_LDTR_READ, "read_ldtr" }, \
+ { SVM_EXIT_TR_READ, "read_rt" }, \
+ { SVM_EXIT_IDTR_WRITE, "write_idtr" }, \
+ { SVM_EXIT_GDTR_WRITE, "write_gdtr" }, \
+ { SVM_EXIT_LDTR_WRITE, "write_ldtr" }, \
+ { SVM_EXIT_TR_WRITE, "write_rt" }, \
+ { SVM_EXIT_RDTSC, "rdtsc" }, \
+ { SVM_EXIT_RDPMC, "rdpmc" }, \
+ { SVM_EXIT_PUSHF, "pushf" }, \
+ { SVM_EXIT_POPF, "popf" }, \
+ { SVM_EXIT_CPUID, "cpuid" }, \
+ { SVM_EXIT_RSM, "rsm" }, \
+ { SVM_EXIT_IRET, "iret" }, \
+ { SVM_EXIT_SWINT, "swint" }, \
+ { SVM_EXIT_INVD, "invd" }, \
+ { SVM_EXIT_PAUSE, "pause" }, \
+ { SVM_EXIT_HLT, "hlt" }, \
+ { SVM_EXIT_INVLPG, "invlpg" }, \
+ { SVM_EXIT_INVLPGA, "invlpga" }, \
+ { SVM_EXIT_IOIO, "io" }, \
+ { SVM_EXIT_MSR, "msr" }, \
+ { SVM_EXIT_TASK_SWITCH, "task_switch" }, \
+ { SVM_EXIT_FERR_FREEZE, "ferr_freeze" }, \
+ { SVM_EXIT_SHUTDOWN, "shutdown" }, \
+ { SVM_EXIT_VMRUN, "vmrun" }, \
+ { SVM_EXIT_VMMCALL, "hypercall" }, \
+ { SVM_EXIT_VMLOAD, "vmload" }, \
+ { SVM_EXIT_VMSAVE, "vmsave" }, \
+ { SVM_EXIT_STGI, "stgi" }, \
+ { SVM_EXIT_CLGI, "clgi" }, \
+ { SVM_EXIT_SKINIT, "skinit" }, \
+ { SVM_EXIT_RDTSCP, "rdtscp" }, \
+ { SVM_EXIT_ICEBP, "icebp" }, \
+ { SVM_EXIT_WBINVD, "wbinvd" }, \
+ { SVM_EXIT_MONITOR, "monitor" }, \
+ { SVM_EXIT_MWAIT, "mwait" }, \
+ { SVM_EXIT_XSETBV, "xsetbv" }, \
+ { SVM_EXIT_EFER_WRITE_TRAP, "write_efer_trap" }, \
+ { SVM_EXIT_CR0_WRITE_TRAP, "write_cr0_trap" }, \
+ { SVM_EXIT_CR4_WRITE_TRAP, "write_cr4_trap" }, \
+ { SVM_EXIT_CR8_WRITE_TRAP, "write_cr8_trap" }, \
+ { SVM_EXIT_INVPCID, "invpcid" }, \
+ { SVM_EXIT_BUS_LOCK, "buslock" }, \
+ { SVM_EXIT_IDLE_HLT, "idle-halt" }, \
+ { SVM_EXIT_NPF, "npf" }, \
+ { SVM_EXIT_AVIC_INCOMPLETE_IPI, "avic_incomplete_ipi" }, \
+ { SVM_EXIT_AVIC_UNACCELERATED_ACCESS, "avic_unaccelerated_access" }, \
+ { SVM_EXIT_VMGEXIT, "vmgexit" }, \
+ { SVM_VMGEXIT_MMIO_READ, "vmgexit_mmio_read" }, \
+ { SVM_VMGEXIT_MMIO_WRITE, "vmgexit_mmio_write" }, \
+ { SVM_VMGEXIT_NMI_COMPLETE, "vmgexit_nmi_complete" }, \
+ { SVM_VMGEXIT_AP_HLT_LOOP, "vmgexit_ap_hlt_loop" }, \
+ { SVM_VMGEXIT_AP_JUMP_TABLE, "vmgexit_ap_jump_table" }, \
+ { SVM_VMGEXIT_PSC, "vmgexit_page_state_change" }, \
+ { SVM_VMGEXIT_GUEST_REQUEST, "vmgexit_guest_request" }, \
+ { SVM_VMGEXIT_EXT_GUEST_REQUEST, "vmgexit_ext_guest_request" }, \
+ { SVM_VMGEXIT_AP_CREATION, "vmgexit_ap_creation" }, \
+ { SVM_VMGEXIT_HV_FEATURES, "vmgexit_hypervisor_feature" }, \
+ { SVM_EXIT_ERR, "invalid_guest_state" }
+
+
+#endif /* _UAPI__SVM_H */
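The SVM_VMGEXIT_TERM_REASON() macro above packs a termination request into the low 12 bits of SW_EXITINFO1: the reason set lands in bits [3:0] and the reason code in bits [11:4]. A tiny kernel-side sketch of the encoding (the macro itself uses the kernel's u64 type):

    #include <asm/svm.h>

    /* Reason set 0, reason code 2 encodes as (0 & 0xf) | (2 << 4) == 0x20. */
    static u64 example_term_reason(void)
    {
            return SVM_VMGEXIT_TERM_REASON(0, 2);
    }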
diff --git a/arch/x86/include/asm/swab.h b/arch/x86/include/uapi/asm/swab.h
index 557cd9f00661..cd3fd8ddbe9a 100644
--- a/arch/x86/include/asm/swab.h
+++ b/arch/x86/include/uapi/asm/swab.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
#ifndef _ASM_X86_SWAB_H
#define _ASM_X86_SWAB_H
@@ -6,22 +7,7 @@
static inline __attribute_const__ __u32 __arch_swab32(__u32 val)
{
-#ifdef __i386__
-# ifdef CONFIG_X86_BSWAP
- asm("bswap %0" : "=r" (val) : "0" (val));
-# else
- asm("xchgb %b0,%h0\n\t" /* swap lower bytes */
- "rorl $16,%0\n\t" /* swap words */
- "xchgb %b0,%h0" /* swap higher bytes */
- : "=q" (val)
- : "0" (val));
-# endif
-
-#else /* __i386__ */
- asm("bswapl %0"
- : "=r" (val)
- : "0" (val));
-#endif
+ asm("bswapl %0" : "=r" (val) : "0" (val));
return val;
}
#define __arch_swab32 __arch_swab32
@@ -37,22 +23,12 @@ static inline __attribute_const__ __u64 __arch_swab64(__u64 val)
__u64 u;
} v;
v.u = val;
-# ifdef CONFIG_X86_BSWAP
asm("bswapl %0 ; bswapl %1 ; xchgl %0,%1"
: "=r" (v.s.a), "=r" (v.s.b)
: "0" (v.s.a), "1" (v.s.b));
-# else
- v.s.a = __arch_swab32(v.s.a);
- v.s.b = __arch_swab32(v.s.b);
- asm("xchgl %0,%1"
- : "=r" (v.s.a), "=r" (v.s.b)
- : "0" (v.s.a), "1" (v.s.b));
-# endif
return v.u;
#else /* __i386__ */
- asm("bswapq %0"
- : "=r" (val)
- : "0" (val));
+ asm("bswapq %0" : "=r" (val) : "0" (val));
return val;
#endif
}
diff --git a/arch/x86/include/uapi/asm/ucontext.h b/arch/x86/include/uapi/asm/ucontext.h
new file mode 100644
index 000000000000..5657b7a49f03
--- /dev/null
+++ b/arch/x86/include/uapi/asm/ucontext.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_UCONTEXT_H
+#define _ASM_X86_UCONTEXT_H
+
+/*
+ * Indicates the presence of extended state information in the memory
+ * layout pointed by the fpstate pointer in the ucontext's sigcontext
+ * struct (uc_mcontext).
+ */
+#define UC_FP_XSTATE 0x1
+
+#ifdef __x86_64__
+/*
+ * UC_SIGCONTEXT_SS will be set when delivering 64-bit or x32 signals on
+ * kernels that save SS in the sigcontext. All kernels that set
+ * UC_SIGCONTEXT_SS will correctly restore at least the low 32 bits of esp
+ * regardless of SS (i.e. they implement espfix).
+ *
+ * Kernels that set UC_SIGCONTEXT_SS will also set UC_STRICT_RESTORE_SS
+ * when delivering a signal that came from 64-bit code.
+ *
+ * Sigreturn restores SS as follows:
+ *
+ * if (saved SS is valid || UC_STRICT_RESTORE_SS is set ||
+ * saved CS is not 64-bit)
+ * new SS = saved SS (will fail IRET and signal if invalid)
+ * else
+ * new SS = a flat 32-bit data segment
+ *
+ * This behavior serves three purposes:
+ *
+ * - Legacy programs that construct a 64-bit sigcontext from scratch
+ * with zero or garbage in the SS slot (e.g. old CRIU) and call
+ * sigreturn will still work.
+ *
+ * - Old DOSEMU versions sometimes catch a signal from a segmented
+ * context, delete the old SS segment (with modify_ldt), and change
+ * the saved CS to a 64-bit segment. These DOSEMU versions expect
+ * sigreturn to send them back to 64-bit mode without killing them,
+ * despite the fact that the SS selector when the signal was raised is
+ * no longer valid. UC_STRICT_RESTORE_SS will be clear, so the kernel
+ * will fix up SS for these DOSEMU versions.
+ *
+ * - Old and new programs that catch a signal and return without
+ * modifying the saved context will end up in exactly the state they
+ * started in, even if they were running in a segmented context when
+ * the signal was raised. Old kernels would lose track of the
+ * previous SS value.
+ */
+#define UC_SIGCONTEXT_SS 0x2
+#define UC_STRICT_RESTORE_SS 0x4
+#endif
+
+#include <asm-generic/ucontext.h>
+
+#endif /* _ASM_X86_UCONTEXT_H */
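A hedged user-space sketch of inspecting these flags from a 64-bit SA_SIGINFO handler (registered with sigaction()). uc_flags is the first field of the generic ucontext that glibc exposes as ucontext_t; mixing libc and kernel ucontext headers may need care depending on the libc, and the fprintf() calls are illustrative only (not async-signal-safe).

    #define _GNU_SOURCE
    #include <signal.h>
    #include <stdio.h>
    #include <ucontext.h>
    #include <asm/ucontext.h>       /* UC_FP_XSTATE, UC_SIGCONTEXT_SS, ... */

    static void handler(int sig, siginfo_t *info, void *uc_ptr)
    {
            ucontext_t *uc = uc_ptr;

            if (uc->uc_flags & UC_FP_XSTATE)
                    fprintf(stderr, "fpstate points at struct _xstate\n");
            if (uc->uc_flags & UC_SIGCONTEXT_SS)
                    fprintf(stderr, "SS was saved; sigreturn will restore it\n");

            (void)sig;
            (void)info;
    }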
diff --git a/arch/x86/include/uapi/asm/unistd.h b/arch/x86/include/uapi/asm/unistd.h
new file mode 100644
index 000000000000..be5e2e747f50
--- /dev/null
+++ b/arch/x86/include/uapi/asm/unistd.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_ASM_X86_UNISTD_H
+#define _UAPI_ASM_X86_UNISTD_H
+
+/*
+ * x32 syscall flag bit. Some user programs expect syscall NR macros
+ * and __X32_SYSCALL_BIT to have type int, even though syscall numbers
+ * are, for practical purposes, unsigned long.
+ *
+ * Fortunately, expressions like (nr & ~__X32_SYSCALL_BIT) do the right
+ * thing regardless.
+ */
+#define __X32_SYSCALL_BIT 0x40000000
+
+#ifndef __KERNEL__
+# ifdef __i386__
+# include <asm/unistd_32.h>
+# elif defined(__ILP32__)
+# include <asm/unistd_x32.h>
+# else
+# include <asm/unistd_64.h>
+# endif
+#endif
+
+#endif /* _UAPI_ASM_X86_UNISTD_H */
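A small sketch of the masking described above, e.g. for a ptrace-based tracer classifying the syscall number it reads from orig_rax; the helper names are illustrative:

    #include <stdbool.h>
    #include <asm/unistd.h>         /* __X32_SYSCALL_BIT */

    static bool is_x32_syscall(unsigned long nr)
    {
            return nr & __X32_SYSCALL_BIT;
    }

    static unsigned long base_syscall_nr(unsigned long nr)
    {
            /* Does the right thing whether nr arrived as an int or an unsigned long. */
            return nr & ~__X32_SYSCALL_BIT;
    }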
diff --git a/arch/x86/include/uapi/asm/vm86.h b/arch/x86/include/uapi/asm/vm86.h
new file mode 100644
index 000000000000..18909b8050bc
--- /dev/null
+++ b/arch/x86/include/uapi/asm/vm86.h
@@ -0,0 +1,130 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_ASM_X86_VM86_H
+#define _UAPI_ASM_X86_VM86_H
+
+/*
+ * I'm guessing at the VIF/VIP flag usage, but hope that this is how
+ * the Pentium uses them. Linux will return from vm86 mode when both
+ * VIF and VIP are set.
+ *
+ * On a Pentium, we could probably optimize the virtual flags directly
+ * in the eflags register instead of doing it "by hand" in vflags...
+ *
+ * Linus
+ */
+
+#include <asm/processor-flags.h>
+
+#define BIOSSEG 0x0f000
+
+#define CPU_086 0
+#define CPU_186 1
+#define CPU_286 2
+#define CPU_386 3
+#define CPU_486 4
+#define CPU_586 5
+
+/*
+ * Return values for the 'vm86()' system call
+ */
+#define VM86_TYPE(retval) ((retval) & 0xff)
+#define VM86_ARG(retval) ((retval) >> 8)
+
+#define VM86_SIGNAL 0 /* return due to signal */
+#define VM86_UNKNOWN 1 /* unhandled GP fault
+ - IO-instruction or similar */
+#define VM86_INTx 2 /* int3/int x instruction (ARG = x) */
+#define VM86_STI 3 /* sti/popf/iret instruction enabled
+ virtual interrupts */
+
+/*
+ * Additional return values when invoking new vm86()
+ */
+#define VM86_PICRETURN 4 /* return due to pending PIC request */
+#define VM86_TRAP 6 /* return due to DOS-debugger request */
+
+/*
+ * function codes when invoking new vm86()
+ */
+#define VM86_PLUS_INSTALL_CHECK 0
+#define VM86_ENTER 1
+#define VM86_ENTER_NO_BYPASS 2
+#define VM86_REQUEST_IRQ 3
+#define VM86_FREE_IRQ 4
+#define VM86_GET_IRQ_BITS 5
+#define VM86_GET_AND_RESET_IRQ 6
+
+/*
+ * This is the stack-layout seen by the user space program when we have
+ * done a translation of "SAVE_ALL" from vm86 mode. The real kernel layout
+ * is 'kernel_vm86_regs' (see below).
+ */
+
+struct vm86_regs {
+/*
+ * normal regs, with special meaning for the segment descriptors..
+ */
+ long ebx;
+ long ecx;
+ long edx;
+ long esi;
+ long edi;
+ long ebp;
+ long eax;
+ long __null_ds;
+ long __null_es;
+ long __null_fs;
+ long __null_gs;
+ long orig_eax;
+ long eip;
+ unsigned short cs, __csh;
+ long eflags;
+ long esp;
+ unsigned short ss, __ssh;
+/*
+ * these are specific to v86 mode:
+ */
+ unsigned short es, __esh;
+ unsigned short ds, __dsh;
+ unsigned short fs, __fsh;
+ unsigned short gs, __gsh;
+};
+
+struct revectored_struct {
+ unsigned long __map[8]; /* 256 bits */
+};
+
+struct vm86_struct {
+ struct vm86_regs regs;
+ unsigned long flags;
+ unsigned long screen_bitmap; /* unused, preserved by vm86() */
+ unsigned long cpu_type;
+ struct revectored_struct int_revectored;
+ struct revectored_struct int21_revectored;
+};
+
+/*
+ * flags masks
+ */
+#define VM86_SCREEN_BITMAP 0x0001 /* no longer supported */
+
+struct vm86plus_info_struct {
+ unsigned long force_return_for_pic:1;
+ unsigned long vm86dbg_active:1; /* for debugger */
+ unsigned long vm86dbg_TFpendig:1; /* for debugger */
+ unsigned long unused:28;
+ unsigned long is_vm86pus:1; /* for vm86 internal use */
+ unsigned char vm86dbg_intxxtab[32]; /* for debugger */
+};
+struct vm86plus_struct {
+ struct vm86_regs regs;
+ unsigned long flags;
+ unsigned long screen_bitmap;
+ unsigned long cpu_type;
+ struct revectored_struct int_revectored;
+ struct revectored_struct int21_revectored;
+ struct vm86plus_info_struct vm86plus;
+};
+
+
+#endif /* _UAPI_ASM_X86_VM86_H */
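A hedged sketch of decoding a vm86()/vm86old() return value with the VM86_TYPE()/VM86_ARG() macros defined above; the messages are illustrative only:

    #include <stdio.h>
    #include <asm/vm86.h>

    static void report_vm86_exit(int retval)
    {
            switch (VM86_TYPE(retval)) {
            case VM86_SIGNAL:
                    printf("returned because of a pending signal\n");
                    break;
            case VM86_INTx:
                    printf("INT 0x%02x executed\n", VM86_ARG(retval));
                    break;
            case VM86_STI:
                    printf("virtual interrupts became enabled\n");
                    break;
            default:
                    printf("unhandled fault, type %d arg %d\n",
                           VM86_TYPE(retval), VM86_ARG(retval));
            }
    }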
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
new file mode 100644
index 000000000000..1baa86dfe029
--- /dev/null
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -0,0 +1,175 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * vmx.h: VMX Architecture related definitions
+ * Copyright (c) 2004, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * A few random additions are:
+ * Copyright (C) 2006 Qumranet
+ * Avi Kivity <avi@qumranet.com>
+ * Yaniv Kamay <yaniv@qumranet.com>
+ *
+ */
+#ifndef _UAPIVMX_H
+#define _UAPIVMX_H
+
+
+#define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000
+#define VMX_EXIT_REASONS_SGX_ENCLAVE_MODE 0x08000000
+
+#define EXIT_REASON_EXCEPTION_NMI 0
+#define EXIT_REASON_EXTERNAL_INTERRUPT 1
+#define EXIT_REASON_TRIPLE_FAULT 2
+#define EXIT_REASON_INIT_SIGNAL 3
+#define EXIT_REASON_SIPI_SIGNAL 4
+#define EXIT_REASON_OTHER_SMI 6
+
+#define EXIT_REASON_INTERRUPT_WINDOW 7
+#define EXIT_REASON_NMI_WINDOW 8
+#define EXIT_REASON_TASK_SWITCH 9
+#define EXIT_REASON_CPUID 10
+#define EXIT_REASON_HLT 12
+#define EXIT_REASON_INVD 13
+#define EXIT_REASON_INVLPG 14
+#define EXIT_REASON_RDPMC 15
+#define EXIT_REASON_RDTSC 16
+#define EXIT_REASON_VMCALL 18
+#define EXIT_REASON_VMCLEAR 19
+#define EXIT_REASON_VMLAUNCH 20
+#define EXIT_REASON_VMPTRLD 21
+#define EXIT_REASON_VMPTRST 22
+#define EXIT_REASON_VMREAD 23
+#define EXIT_REASON_VMRESUME 24
+#define EXIT_REASON_VMWRITE 25
+#define EXIT_REASON_VMOFF 26
+#define EXIT_REASON_VMON 27
+#define EXIT_REASON_CR_ACCESS 28
+#define EXIT_REASON_DR_ACCESS 29
+#define EXIT_REASON_IO_INSTRUCTION 30
+#define EXIT_REASON_MSR_READ 31
+#define EXIT_REASON_MSR_WRITE 32
+#define EXIT_REASON_INVALID_STATE 33
+#define EXIT_REASON_MSR_LOAD_FAIL 34
+#define EXIT_REASON_MWAIT_INSTRUCTION 36
+#define EXIT_REASON_MONITOR_TRAP_FLAG 37
+#define EXIT_REASON_MONITOR_INSTRUCTION 39
+#define EXIT_REASON_PAUSE_INSTRUCTION 40
+#define EXIT_REASON_MCE_DURING_VMENTRY 41
+#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
+#define EXIT_REASON_APIC_ACCESS 44
+#define EXIT_REASON_EOI_INDUCED 45
+#define EXIT_REASON_GDTR_IDTR 46
+#define EXIT_REASON_LDTR_TR 47
+#define EXIT_REASON_EPT_VIOLATION 48
+#define EXIT_REASON_EPT_MISCONFIG 49
+#define EXIT_REASON_INVEPT 50
+#define EXIT_REASON_RDTSCP 51
+#define EXIT_REASON_PREEMPTION_TIMER 52
+#define EXIT_REASON_INVVPID 53
+#define EXIT_REASON_WBINVD 54
+#define EXIT_REASON_XSETBV 55
+#define EXIT_REASON_APIC_WRITE 56
+#define EXIT_REASON_RDRAND 57
+#define EXIT_REASON_INVPCID 58
+#define EXIT_REASON_VMFUNC 59
+#define EXIT_REASON_ENCLS 60
+#define EXIT_REASON_RDSEED 61
+#define EXIT_REASON_PML_FULL 62
+#define EXIT_REASON_XSAVES 63
+#define EXIT_REASON_XRSTORS 64
+#define EXIT_REASON_UMWAIT 67
+#define EXIT_REASON_TPAUSE 68
+#define EXIT_REASON_BUS_LOCK 74
+#define EXIT_REASON_NOTIFY 75
+#define EXIT_REASON_SEAMCALL 76
+#define EXIT_REASON_TDCALL 77
+#define EXIT_REASON_MSR_READ_IMM 84
+#define EXIT_REASON_MSR_WRITE_IMM 85
+
+#define VMX_EXIT_REASONS \
+ { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \
+ { EXIT_REASON_EXTERNAL_INTERRUPT, "EXTERNAL_INTERRUPT" }, \
+ { EXIT_REASON_TRIPLE_FAULT, "TRIPLE_FAULT" }, \
+ { EXIT_REASON_INIT_SIGNAL, "INIT_SIGNAL" }, \
+ { EXIT_REASON_SIPI_SIGNAL, "SIPI_SIGNAL" }, \
+ { EXIT_REASON_INTERRUPT_WINDOW, "INTERRUPT_WINDOW" }, \
+ { EXIT_REASON_NMI_WINDOW, "NMI_WINDOW" }, \
+ { EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \
+ { EXIT_REASON_CPUID, "CPUID" }, \
+ { EXIT_REASON_HLT, "HLT" }, \
+ { EXIT_REASON_INVD, "INVD" }, \
+ { EXIT_REASON_INVLPG, "INVLPG" }, \
+ { EXIT_REASON_RDPMC, "RDPMC" }, \
+ { EXIT_REASON_RDTSC, "RDTSC" }, \
+ { EXIT_REASON_VMCALL, "VMCALL" }, \
+ { EXIT_REASON_VMCLEAR, "VMCLEAR" }, \
+ { EXIT_REASON_VMLAUNCH, "VMLAUNCH" }, \
+ { EXIT_REASON_VMPTRLD, "VMPTRLD" }, \
+ { EXIT_REASON_VMPTRST, "VMPTRST" }, \
+ { EXIT_REASON_VMREAD, "VMREAD" }, \
+ { EXIT_REASON_VMRESUME, "VMRESUME" }, \
+ { EXIT_REASON_VMWRITE, "VMWRITE" }, \
+ { EXIT_REASON_VMOFF, "VMOFF" }, \
+ { EXIT_REASON_VMON, "VMON" }, \
+ { EXIT_REASON_CR_ACCESS, "CR_ACCESS" }, \
+ { EXIT_REASON_DR_ACCESS, "DR_ACCESS" }, \
+ { EXIT_REASON_IO_INSTRUCTION, "IO_INSTRUCTION" }, \
+ { EXIT_REASON_MSR_READ, "MSR_READ" }, \
+ { EXIT_REASON_MSR_WRITE, "MSR_WRITE" }, \
+ { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \
+ { EXIT_REASON_MSR_LOAD_FAIL, "MSR_LOAD_FAIL" }, \
+ { EXIT_REASON_MWAIT_INSTRUCTION, "MWAIT_INSTRUCTION" }, \
+ { EXIT_REASON_MONITOR_TRAP_FLAG, "MONITOR_TRAP_FLAG" }, \
+ { EXIT_REASON_MONITOR_INSTRUCTION, "MONITOR_INSTRUCTION" }, \
+ { EXIT_REASON_PAUSE_INSTRUCTION, "PAUSE_INSTRUCTION" }, \
+ { EXIT_REASON_MCE_DURING_VMENTRY, "MCE_DURING_VMENTRY" }, \
+ { EXIT_REASON_TPR_BELOW_THRESHOLD, "TPR_BELOW_THRESHOLD" }, \
+ { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \
+ { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \
+ { EXIT_REASON_GDTR_IDTR, "GDTR_IDTR" }, \
+ { EXIT_REASON_LDTR_TR, "LDTR_TR" }, \
+ { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \
+ { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \
+ { EXIT_REASON_INVEPT, "INVEPT" }, \
+ { EXIT_REASON_RDTSCP, "RDTSCP" }, \
+ { EXIT_REASON_PREEMPTION_TIMER, "PREEMPTION_TIMER" }, \
+ { EXIT_REASON_INVVPID, "INVVPID" }, \
+ { EXIT_REASON_WBINVD, "WBINVD" }, \
+ { EXIT_REASON_XSETBV, "XSETBV" }, \
+ { EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \
+ { EXIT_REASON_RDRAND, "RDRAND" }, \
+ { EXIT_REASON_INVPCID, "INVPCID" }, \
+ { EXIT_REASON_VMFUNC, "VMFUNC" }, \
+ { EXIT_REASON_ENCLS, "ENCLS" }, \
+ { EXIT_REASON_RDSEED, "RDSEED" }, \
+ { EXIT_REASON_PML_FULL, "PML_FULL" }, \
+ { EXIT_REASON_XSAVES, "XSAVES" }, \
+ { EXIT_REASON_XRSTORS, "XRSTORS" }, \
+ { EXIT_REASON_UMWAIT, "UMWAIT" }, \
+ { EXIT_REASON_TPAUSE, "TPAUSE" }, \
+ { EXIT_REASON_BUS_LOCK, "BUS_LOCK" }, \
+ { EXIT_REASON_NOTIFY, "NOTIFY" }, \
+ { EXIT_REASON_TDCALL, "TDCALL" }, \
+ { EXIT_REASON_MSR_READ_IMM, "MSR_READ_IMM" }, \
+ { EXIT_REASON_MSR_WRITE_IMM, "MSR_WRITE_IMM" }
+
+#define VMX_EXIT_REASON_FLAGS \
+ { VMX_EXIT_REASONS_FAILED_VMENTRY, "FAILED_VMENTRY" }
+
+#define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1
+#define VMX_ABORT_LOAD_HOST_PDPTE_FAIL 2
+#define VMX_ABORT_LOAD_HOST_MSR_FAIL 4
+
+#endif /* _UAPIVMX_H */
diff --git a/arch/x86/include/uapi/asm/vsyscall.h b/arch/x86/include/uapi/asm/vsyscall.h
new file mode 100644
index 000000000000..75275f547444
--- /dev/null
+++ b/arch/x86/include/uapi/asm/vsyscall.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_ASM_X86_VSYSCALL_H
+#define _UAPI_ASM_X86_VSYSCALL_H
+
+enum vsyscall_num {
+ __NR_vgettimeofday,
+ __NR_vtime,
+ __NR_vgetcpu,
+};
+
+#define VSYSCALL_ADDR (-10UL << 20)
+
+#endif /* _UAPI_ASM_X86_VSYSCALL_H */
diff --git a/arch/x86/kernel/.gitignore b/arch/x86/kernel/.gitignore
index 08f4fd731469..ef66569e7e22 100644
--- a/arch/x86/kernel/.gitignore
+++ b/arch/x86/kernel/.gitignore
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
vsyscall.lds
vsyscall_32.lds
vmlinux.lds
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 430d5b24af7b..bc184dd38d99 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -1,130 +1,168 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the linux kernel.
#
-extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinux.lds
+always-$(KBUILD_BUILTIN) += vmlinux.lds
CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
ifdef CONFIG_FUNCTION_TRACER
# Do not profile debug and lowlevel utilities
CFLAGS_REMOVE_tsc.o = -pg
-CFLAGS_REMOVE_rtc.o = -pg
CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
+CFLAGS_REMOVE_pvclock.o = -pg
+CFLAGS_REMOVE_kvmclock.o = -pg
CFLAGS_REMOVE_ftrace.o = -pg
CFLAGS_REMOVE_early_printk.o = -pg
+CFLAGS_REMOVE_head64.o = -pg
+CFLAGS_REMOVE_head32.o = -pg
+CFLAGS_REMOVE_rethook.o = -pg
endif
-#
-# vsyscalls (which work on the user stack) should have
-# no stack-protector checks:
-#
-nostackp := $(call cc-option, -fno-stack-protector)
-CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
-CFLAGS_hpet.o := $(nostackp)
-CFLAGS_tsc.o := $(nostackp)
-CFLAGS_paravirt.o := $(nostackp)
-GCOV_PROFILE_vsyscall_64.o := n
-GCOV_PROFILE_hpet.o := n
-GCOV_PROFILE_tsc.o := n
-GCOV_PROFILE_paravirt.o := n
-
-obj-y := process_$(BITS).o signal.o entry_$(BITS).o
-obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
-obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o
-obj-y += setup.o i8259.o irqinit.o
-obj-$(CONFIG_X86_VISWS) += visws_quirks.o
-obj-$(CONFIG_X86_32) += probe_roms_32.o
-obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
-obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
-obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o
+KASAN_SANITIZE_head$(BITS).o := n
+KASAN_SANITIZE_dumpstack.o := n
+KASAN_SANITIZE_dumpstack_$(BITS).o := n
+KASAN_SANITIZE_stacktrace.o := n
+KASAN_SANITIZE_paravirt.o := n
+
+# With some compiler versions the generated code results in boot hangs, caused
+# by several compilation units. To be safe, disable all instrumentation.
+KCSAN_SANITIZE := n
+KMSAN_SANITIZE_head$(BITS).o := n
+KMSAN_SANITIZE_nmi.o := n
+
+# If instrumentation of the following files is enabled, boot hangs during
+# the first second.
+KCOV_INSTRUMENT_head$(BITS).o := n
+# These are called from save_stack_trace() on debug paths,
+# and produce large amounts of uninteresting coverage.
+KCOV_INSTRUMENT_stacktrace.o := n
+KCOV_INSTRUMENT_dumpstack.o := n
+KCOV_INSTRUMENT_dumpstack_$(BITS).o := n
+KCOV_INSTRUMENT_unwind_orc.o := n
+KCOV_INSTRUMENT_unwind_frame.o := n
+KCOV_INSTRUMENT_unwind_guess.o := n
+
+CFLAGS_head32.o := -fno-stack-protector
+CFLAGS_head64.o := -fno-stack-protector
+CFLAGS_irq.o := -I $(src)/../include/asm/trace
+
+obj-y += head_$(BITS).o
+obj-y += head$(BITS).o
+obj-y += ebda.o
+obj-y += platform-quirks.o
+obj-y += process_$(BITS).o signal.o signal_$(BITS).o
+obj-y += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
+obj-y += time.o ioport.o dumpstack.o nmi.o
+obj-$(CONFIG_X86_FRED) += fred.o
+obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o
+obj-$(CONFIG_X86_KERNEL_IBT) += ibt_selftest.o
+obj-y += setup.o x86_init.o i8259.o irqinit.o
+obj-$(CONFIG_JUMP_LABEL) += jump_label.o
+obj-$(CONFIG_IRQ_WORK) += irq_work.o
+obj-y += probe_roms.o
+obj-$(CONFIG_X86_32) += sys_ia32.o
+obj-$(CONFIG_IA32_EMULATION) += sys_ia32.o signal_32.o
+obj-$(CONFIG_X86_64) += sys_x86_64.o
+obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o
+obj-$(CONFIG_SYSFS) += ksysfs.o
obj-y += bootflag.o e820.o
-obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
-obj-y += alternative.o i8253.o pci-nommu.o
-obj-y += tsc.o io_delay.o rtc.o
+obj-y += pci-dma.o quirks.o kdebugfs.o
+obj-y += alternative.o i8253.o hw_breakpoint.o
+obj-y += tsc.o tsc_msr.o io_delay.o rtc.o
+obj-y += resource.o
+obj-y += irqflags.o
+obj-y += static_call.o
-obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
obj-y += process.o
-obj-y += i387.o xsave.o
+obj-y += fpu/
obj-y += ptrace.o
-obj-$(CONFIG_X86_DS) += ds.o
-obj-$(CONFIG_X86_DS_SELFTEST) += ds_selftest.o
obj-$(CONFIG_X86_32) += tls.o
obj-$(CONFIG_IA32_EMULATION) += tls.o
obj-y += step.o
-obj-$(CONFIG_STACKTRACE) += stacktrace.o
+obj-$(CONFIG_INTEL_TXT) += tboot.o
+obj-$(CONFIG_ISA_DMA_API) += i8237.o
+obj-y += stacktrace.o
obj-y += cpu/
obj-y += acpi/
obj-y += reboot.o
-obj-$(CONFIG_MCA) += mca_32.o
obj-$(CONFIG_X86_MSR) += msr.o
obj-$(CONFIG_X86_CPUID) += cpuid.o
obj-$(CONFIG_PCI) += early-quirks.o
apm-y := apm_32.o
obj-$(CONFIG_APM) += apm.o
obj-$(CONFIG_SMP) += smp.o
-obj-$(CONFIG_SMP) += smpboot.o tsc_sync.o
+obj-$(CONFIG_SMP) += smpboot.o
+obj-$(CONFIG_X86_TSC) += tsc_sync.o
obj-$(CONFIG_SMP) += setup_percpu.o
-obj-$(CONFIG_X86_64_SMP) += tsc_sync.o
-obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o
obj-$(CONFIG_X86_MPPARSE) += mpparse.o
obj-y += apic/
obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
+obj-$(CONFIG_FUNCTION_TRACER) += ftrace_$(BITS).o
obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
-obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
-obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
-obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
-obj-$(CONFIG_KPROBES) += kprobes.o
+obj-$(CONFIG_X86_TSC) += trace_clock.o
+obj-$(CONFIG_TRACING) += trace.o
+obj-$(CONFIG_RETHOOK) += rethook.o
+obj-$(CONFIG_VMCORE_INFO) += vmcore_info_$(BITS).o
+obj-$(CONFIG_KEXEC_CORE) += machine_kexec_$(BITS).o
+obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o
+obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o
+obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o crash.o
+obj-y += kprobes/
obj-$(CONFIG_MODULES) += module.o
-obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o
-obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
+obj-$(CONFIG_X86_32) += doublefault_32.o
obj-$(CONFIG_KGDB) += kgdb.o
obj-$(CONFIG_VM86) += vm86_32.o
obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
obj-$(CONFIG_HPET_TIMER) += hpet.o
-obj-$(CONFIG_K8_NB) += k8.o
-obj-$(CONFIG_MGEODE_LX) += geode_32.o mfgpt_32.o
-obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
-obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
+obj-$(CONFIG_AMD_NB) += amd_nb.o
+obj-$(CONFIG_AMD_NODE) += amd_node.o
+obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
-obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
-obj-$(CONFIG_KVM_GUEST) += kvm.o
-obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
-obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
+obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o
+obj-$(CONFIG_PARAVIRT) += paravirt.o
obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
+obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o
+obj-$(CONFIG_JAILHOUSE_GUEST) += jailhouse.o
+
+obj-$(CONFIG_EISA) += eisa.o
obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
-obj-$(CONFIG_SCx200) += scx200.o
-scx200-y += scx200_32.o
+obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
+
+obj-$(CONFIG_OF) += devicetree.o
+obj-$(CONFIG_UPROBES) += uprobes.o
-obj-$(CONFIG_OLPC) += olpc.o
+obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
+obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o
+obj-$(CONFIG_X86_UMIP) += umip.o
-microcode-y := microcode_core.o
-microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o
-microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o
-obj-$(CONFIG_MICROCODE) += microcode.o
+obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o
+obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o
+obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o
-obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
+obj-$(CONFIG_CFI) += cfi.o
+
+obj-$(CONFIG_CALL_THUNKS) += callthunks.o
+
+obj-$(CONFIG_X86_CET) += cet.o
-obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
+obj-$(CONFIG_X86_USER_SHADOW_STACK) += shstk.o
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
- obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o
- obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
obj-$(CONFIG_AUDIT) += audit_64.o
- obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o
- obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
- obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o
+ obj-$(CONFIG_GART_IOMMU) += amd_gart_64.o aperture_64.o
- obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
+ obj-$(CONFIG_MMCONF_FAM10H) += mmconf-fam10h_64.o
obj-y += vsmp_64.o
endif
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index fd5ca97a2ad5..842a5f449404 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -1,14 +1,12 @@
-subdir- := realmode
+# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_ACPI) += boot.o
-obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_rm.o wakeup_$(BITS).o
+obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o
+obj-$(CONFIG_ACPI_APEI) += apei.o
+obj-$(CONFIG_ACPI_CPPC_LIB) += cppc.o
+obj-$(CONFIG_ACPI_MADT_WAKEUP) += madt_wakeup.o madt_playdead.o
ifneq ($(CONFIG_ACPI_PROCESSOR),)
-obj-y += cstate.o processor.o
+obj-y += cstate.o
endif
-$(obj)/wakeup_rm.o: $(obj)/realmode/wakeup.bin
-
-$(obj)/realmode/wakeup.bin: FORCE
- $(Q)$(MAKE) $(build)=$(obj)/realmode
-
diff --git a/arch/x86/kernel/acpi/apei.c b/arch/x86/kernel/acpi/apei.c
new file mode 100644
index 000000000000..e21419e686eb
--- /dev/null
+++ b/arch/x86/kernel/acpi/apei.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Arch-specific APEI-related functions.
+ */
+
+#include <acpi/apei.h>
+
+#include <asm/mce.h>
+#include <asm/tlbflush.h>
+
+int arch_apei_enable_cmcff(struct acpi_hest_header *hest_hdr, void *data)
+{
+#ifdef CONFIG_X86_MCE
+ int i;
+ struct acpi_hest_ia_corrected *cmc;
+ struct acpi_hest_ia_error_bank *mc_bank;
+
+ cmc = (struct acpi_hest_ia_corrected *)hest_hdr;
+ if (!cmc->enabled)
+ return 0;
+
+ mce_save_apei_thr_limit(cmc->notify.error_threshold_value);
+
+ /*
+ * We expect HEST to provide a list of MC banks that report errors
+ * in firmware first mode. Otherwise, return a non-zero value to
+ * indicate that we are done parsing HEST.
+ */
+ if (!(cmc->flags & ACPI_HEST_FIRMWARE_FIRST) ||
+ !cmc->num_hardware_banks)
+ return 1;
+
+ pr_info("HEST: Enabling Firmware First mode for corrected errors.\n");
+
+ mc_bank = (struct acpi_hest_ia_error_bank *)(cmc + 1);
+ for (i = 0; i < cmc->num_hardware_banks; i++, mc_bank++)
+ mce_disable_bank(mc_bank->bank_number);
+#endif
+ return 1;
+}
+
+void arch_apei_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
+{
+#ifdef CONFIG_X86_MCE
+ apei_mce_report_mem_error(sev, mem_err);
+#endif
+}
+
+int arch_apei_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id)
+{
+ return apei_smca_report_x86_error(ctx_info, lapic_id);
+}
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 6b8ca3a0285d..9fa321a95eb3 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1,49 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* boot.c - Architecture-Specific Low-Level ACPI Boot Support
*
* Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
* Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
+#define pr_fmt(fmt) "ACPI: " fmt
#include <linux/init.h>
#include <linux/acpi.h>
#include <linux/acpi_pmtmr.h>
#include <linux/efi.h>
#include <linux/cpumask.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/dmi.h>
#include <linux/irq.h>
-#include <linux/bootmem.h>
+#include <linux/slab.h>
+#include <linux/memblock.h>
#include <linux/ioport.h>
#include <linux/pci.h>
+#include <linux/efi-bgrt.h>
+#include <linux/serial_core.h>
+#include <linux/pgtable.h>
-#include <asm/pgtable.h>
+#include <xen/xen.h>
+
+#include <asm/e820/api.h>
+#include <asm/irqdomain.h>
+#include <asm/pci_x86.h>
#include <asm/io_apic.h>
#include <asm/apic.h>
#include <asm/io.h>
#include <asm/mpspec.h>
#include <asm/smp.h>
+#include <asm/i8259.h>
+#include <asm/setup.h>
+#include "sleep.h" /* To include x86_acpi_suspend_lowlevel */
static int __initdata acpi_force = 0;
-u32 acpi_rsdt_forced;
int acpi_disabled;
EXPORT_SYMBOL(acpi_disabled);
@@ -51,32 +45,43 @@ EXPORT_SYMBOL(acpi_disabled);
# include <asm/proto.h>
#endif /* X86 */
-#define BAD_MADT_ENTRY(entry, end) ( \
- (!entry) || (unsigned long)entry + sizeof(*entry) > end || \
- ((struct acpi_subtable_header *)entry)->length < sizeof(*entry))
-
-#define PREFIX "ACPI: "
-
int acpi_noirq; /* skip ACPI IRQ initialization */
+static int acpi_nobgrt; /* skip ACPI BGRT */
int acpi_pci_disabled; /* skip ACPI PCI scan and IRQ initialization */
EXPORT_SYMBOL(acpi_pci_disabled);
-int acpi_ht __initdata = 1; /* enable HT */
int acpi_lapic;
int acpi_ioapic;
int acpi_strict;
+int acpi_disable_cmcff;
+bool acpi_int_src_ovr[NR_IRQS_LEGACY];
+/* ACPI SCI override configuration */
u8 acpi_sci_flags __initdata;
-int acpi_sci_override_gsi __initdata;
+u32 acpi_sci_override_gsi __initdata = INVALID_ACPI_IRQ;
int acpi_skip_timer_override __initdata;
int acpi_use_timer_override __initdata;
+int acpi_fix_pin2_polarity __initdata;
#ifdef CONFIG_X86_LOCAL_APIC
static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
+static bool has_lapic_cpus __initdata;
+static bool acpi_support_online_capable;
#endif
-#ifndef __HAVE_ARCH_CMPXCHG
-#warning ACPI uses CMPXCHG, i486 and later hardware
+#ifdef CONFIG_X86_IO_APIC
+/*
+ * Locks related to IOAPIC hotplug
+ * Hotplug side:
+ * ->device_hotplug_lock
+ * ->acpi_ioapic_lock
+ * ->ioapic_lock
+ * Interrupt mapping side:
+ * ->acpi_ioapic_lock
+ * ->ioapic_mutex
+ * ->ioapic_lock
+ */
+static DEFINE_MUTEX(acpi_ioapic_lock);
#endif
/* --------------------------------------------------------------------------
@@ -91,31 +96,32 @@ enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC;
/*
- * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END,
- * to map the target physical address. The problem is that set_fixmap()
- * provides a single page, and it is possible that the page is not
- * sufficient.
- * By using this area, we can map up to MAX_IO_APICS pages temporarily,
- * i.e. until the next __va_range() call.
- *
- * Important Safety Note: The fixed I/O APIC page numbers are *subtracted*
- * from the fixed base. That's why we start at FIX_IO_APIC_BASE_END and
- * count idx down while incrementing the phys address.
+ * ISA irqs by default are the first 16 gsis but can be
+ * any gsi as specified by an interrupt source override.
*/
-char *__init __acpi_map_table(unsigned long phys, unsigned long size)
+static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+};
+
+/*
+ * This is just a simple wrapper around early_memremap(),
+ * with sanity checks for phys == 0 and size == 0.
+ */
+void __init __iomem *__acpi_map_table(unsigned long phys, unsigned long size)
{
if (!phys || !size)
return NULL;
- return early_ioremap(phys, size);
+ return early_memremap(phys, size);
}
-void __init __acpi_unmap_table(char *map, unsigned long size)
+
+void __init __acpi_unmap_table(void __iomem *map, unsigned long size)
{
if (!map || !size)
return;
- early_iounmap(map, size);
+ early_memunmap(map, size);
}
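/*
 * Usage sketch (illustration only, not part of the patch): callers map a
 * table with the pair above, read it through the returned cookie and unmap
 * it again. 'example_peek_table' is a made-up name.
 */
static void __init example_peek_table(unsigned long phys, unsigned long size)
{
	void __iomem *map = __acpi_map_table(phys, size);

	if (!map)
		return;
	/* ... read the table contents through 'map' here ... */
	__acpi_unmap_table(map, size);
}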
#ifdef CONFIG_X86_LOCAL_APIC
@@ -123,56 +129,84 @@ static int __init acpi_parse_madt(struct acpi_table_header *table)
{
struct acpi_table_madt *madt = NULL;
- if (!cpu_has_apic)
+ if (!boot_cpu_has(X86_FEATURE_APIC))
return -EINVAL;
madt = (struct acpi_table_madt *)table;
if (!madt) {
- printk(KERN_WARNING PREFIX "Unable to map MADT\n");
+ pr_warn("Unable to map MADT\n");
return -ENODEV;
}
if (madt->address) {
acpi_lapic_addr = (u64) madt->address;
- printk(KERN_DEBUG PREFIX "Local APIC address 0x%08x\n",
- madt->address);
+ pr_debug("Local APIC address 0x%08x\n", madt->address);
}
+ if (madt->flags & ACPI_MADT_PCAT_COMPAT)
+ legacy_pic_pcat_compat();
+
+ /* ACPI 6.3 and newer support the online capable bit. */
+ if (acpi_gbl_FADT.header.revision > 6 ||
+ (acpi_gbl_FADT.header.revision == 6 &&
+ acpi_gbl_FADT.minor_revision >= 3))
+ acpi_support_online_capable = true;
+
default_acpi_madt_oem_check(madt->header.oem_id,
madt->header.oem_table_id);
return 0;
}
-static void __cpuinit acpi_register_lapic(int id, u8 enabled)
+static bool __init acpi_is_processor_usable(u32 lapic_flags)
{
- unsigned int ver = 0;
+ if (lapic_flags & ACPI_MADT_ENABLED)
+ return true;
- if (!enabled) {
- ++disabled_cpus;
- return;
- }
-
- if (boot_cpu_physical_apicid != -1U)
- ver = apic_version[boot_cpu_physical_apicid];
+ if (!acpi_support_online_capable ||
+ (lapic_flags & ACPI_MADT_ONLINE_CAPABLE))
+ return true;
- generic_processor_info(id, ver);
+ return false;
}
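/*
 * Informal summary of acpi_is_processor_usable(), derived from the code
 * above:
 *
 *   ACPI_MADT_ENABLED  FADT >= 6.3  ACPI_MADT_ONLINE_CAPABLE   usable?
 *   -----------------  -----------  ------------------------   -------
 *         set               -                  -                 yes
 *        clear             no                  -                 yes
 *        clear             yes                set                yes
 *        clear             yes               clear               no
 */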
static int __init
-acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
+acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end)
{
struct acpi_madt_local_x2apic *processor = NULL;
+#ifdef CONFIG_X86_X2APIC
+ u32 apic_id;
+ u8 enabled;
+#endif
processor = (struct acpi_madt_local_x2apic *)header;
if (BAD_MADT_ENTRY(processor, end))
return -EINVAL;
- acpi_table_print_madt_entry(header);
+ acpi_table_print_madt_entry(&header->common);
#ifdef CONFIG_X86_X2APIC
+ apic_id = processor->local_apic_id;
+ enabled = processor->lapic_flags & ACPI_MADT_ENABLED;
+
+ /* Ignore invalid ID */
+ if (apic_id == 0xffffffff)
+ return 0;
+
+ /* don't register processors that cannot be onlined */
+ if (!acpi_is_processor_usable(processor->lapic_flags))
+ return 0;
+
+ /*
+ * According to https://uefi.org/specs/ACPI/6.5/05_ACPI_Software_Programming_Model.html#processor-local-x2apic-structure
+ * when MADT provides both valid LAPIC and x2APIC entries, the APIC ID
+ * in x2APIC must be equal or greater than 0xff.
+ */
+ if (has_lapic_cpus && apic_id < 0xff)
+ return 0;
+
/*
* We need to register disabled CPU as well to permit
* counting disabled CPUs. This allows us to size
@@ -180,17 +214,22 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
* to not preallocating memory for all NR_CPUS
* when we use CPU hotplug.
*/
- acpi_register_lapic(processor->local_apic_id, /* APIC ID */
- processor->lapic_flags & ACPI_MADT_ENABLED);
+ if (!apic_id_valid(apic_id)) {
+ if (enabled)
+ pr_warn("x2apic entry ignored\n");
+ return 0;
+ }
+
+ topology_register_apic(apic_id, processor->uid, enabled);
#else
- printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
+ pr_warn("x2apic entry ignored\n");
#endif
return 0;
}
static int __init
-acpi_parse_lapic(struct acpi_subtable_header * header, const unsigned long end)
+acpi_check_lapic(union acpi_subtable_headers *header, const unsigned long end)
{
struct acpi_madt_local_apic *processor = NULL;
@@ -199,7 +238,37 @@ acpi_parse_lapic(struct acpi_subtable_header * header, const unsigned long end)
if (BAD_MADT_ENTRY(processor, end))
return -EINVAL;
- acpi_table_print_madt_entry(header);
+ /* Ignore invalid ID */
+ if (processor->id == 0xff)
+ return 0;
+
+ /* Ignore processors that can not be onlined */
+ if (!acpi_is_processor_usable(processor->lapic_flags))
+ return 0;
+
+ has_lapic_cpus = true;
+ return 0;
+}
+
+static int __init
+acpi_parse_lapic(union acpi_subtable_headers * header, const unsigned long end)
+{
+ struct acpi_madt_local_apic *processor = NULL;
+
+ processor = (struct acpi_madt_local_apic *)header;
+
+ if (BAD_MADT_ENTRY(processor, end))
+ return -EINVAL;
+
+ acpi_table_print_madt_entry(&header->common);
+
+ /* Ignore invalid ID */
+ if (processor->id == 0xff)
+ return 0;
+
+ /* don't register processors that can not be onlined */
+ if (!acpi_is_processor_usable(processor->lapic_flags))
+ return 0;
/*
* We need to register disabled CPU as well to permit
@@ -208,14 +277,15 @@ acpi_parse_lapic(struct acpi_subtable_header * header, const unsigned long end)
* to not preallocating memory for all NR_CPUS
* when we use CPU hotplug.
*/
- acpi_register_lapic(processor->id, /* APIC ID */
- processor->lapic_flags & ACPI_MADT_ENABLED);
+ topology_register_apic(processor->id, /* APIC ID */
+ processor->processor_id, /* ACPI ID */
+ processor->lapic_flags & ACPI_MADT_ENABLED);
return 0;
}
static int __init
-acpi_parse_sapic(struct acpi_subtable_header *header, const unsigned long end)
+acpi_parse_sapic(union acpi_subtable_headers *header, const unsigned long end)
{
struct acpi_madt_local_sapic *processor = NULL;
@@ -224,16 +294,17 @@ acpi_parse_sapic(struct acpi_subtable_header *header, const unsigned long end)
if (BAD_MADT_ENTRY(processor, end))
return -EINVAL;
- acpi_table_print_madt_entry(header);
+ acpi_table_print_madt_entry(&header->common);
- acpi_register_lapic((processor->id << 8) | processor->eid,/* APIC ID */
- processor->lapic_flags & ACPI_MADT_ENABLED);
+ topology_register_apic((processor->id << 8) | processor->eid,/* APIC ID */
+ processor->processor_id, /* ACPI ID */
+ processor->lapic_flags & ACPI_MADT_ENABLED);
return 0;
}
static int __init
-acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header,
+acpi_parse_lapic_addr_ovr(union acpi_subtable_headers * header,
const unsigned long end)
{
struct acpi_madt_local_apic_override *lapic_addr_ovr = NULL;
@@ -243,13 +314,15 @@ acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header,
if (BAD_MADT_ENTRY(lapic_addr_ovr, end))
return -EINVAL;
+ acpi_table_print_madt_entry(&header->common);
+
acpi_lapic_addr = lapic_addr_ovr->address;
return 0;
}
static int __init
-acpi_parse_x2apic_nmi(struct acpi_subtable_header *header,
+acpi_parse_x2apic_nmi(union acpi_subtable_headers *header,
const unsigned long end)
{
struct acpi_madt_local_x2apic_nmi *x2apic_nmi = NULL;
@@ -259,16 +332,16 @@ acpi_parse_x2apic_nmi(struct acpi_subtable_header *header,
if (BAD_MADT_ENTRY(x2apic_nmi, end))
return -EINVAL;
- acpi_table_print_madt_entry(header);
+ acpi_table_print_madt_entry(&header->common);
if (x2apic_nmi->lint != 1)
- printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n");
+ pr_warn("NMI not connected to LINT 1!\n");
return 0;
}
static int __init
-acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long end)
+acpi_parse_lapic_nmi(union acpi_subtable_headers * header, const unsigned long end)
{
struct acpi_madt_local_apic_nmi *lapic_nmi = NULL;
@@ -277,32 +350,137 @@ acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long e
if (BAD_MADT_ENTRY(lapic_nmi, end))
return -EINVAL;
- acpi_table_print_madt_entry(header);
+ acpi_table_print_madt_entry(&header->common);
if (lapic_nmi->lint != 1)
- printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n");
+ pr_warn("NMI not connected to LINT 1!\n");
return 0;
}
-
-#endif /*CONFIG_X86_LOCAL_APIC */
+#endif /* CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_X86_IO_APIC
+#define MP_ISA_BUS 0
+
+static int __init mp_register_ioapic_irq(u8 bus_irq, u8 polarity,
+ u8 trigger, u32 gsi);
+
+static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
+ u32 gsi)
+{
+ /*
+ * Check bus_irq boundary.
+ */
+ if (bus_irq >= NR_IRQS_LEGACY) {
+ pr_warn("Invalid bus_irq %u for legacy override\n", bus_irq);
+ return;
+ }
+
+ /*
+ * TBD: This check is for faulty timer entries, where the override
+ * erroneously sets the trigger to level, resulting in a HUGE
+ * increase of timer interrupts!
+ */
+ if ((bus_irq == 0) && (trigger == 3))
+ trigger = 1;
+
+ if (mp_register_ioapic_irq(bus_irq, polarity, trigger, gsi) < 0)
+ return;
+ /*
+ * Reset default identity mapping if gsi is also a legacy IRQ,
+ * otherwise there will be more than one entry with the same GSI
+ * and acpi_isa_irq_to_gsi() may give a wrong result.
+ */
+ if (gsi < nr_legacy_irqs() && isa_irq_to_gsi[gsi] == gsi)
+ isa_irq_to_gsi[gsi] = INVALID_ACPI_IRQ;
+ isa_irq_to_gsi[bus_irq] = gsi;
+}
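/*
 * Worked example, not from the patch: the classic timer override maps ISA
 * IRQ 0 to GSI 2. After mp_override_legacy_irq(0, polarity, trigger, 2):
 *
 *   isa_irq_to_gsi[2] = INVALID_ACPI_IRQ;  (the stale identity entry is
 *                                           dropped so acpi_isa_irq_to_gsi()
 *                                           cannot report GSI 2 twice)
 *   isa_irq_to_gsi[0] = 2;                 (IRQ 0 now resolves to GSI 2)
 */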
+
+static void mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
+ int polarity)
+{
+#ifdef CONFIG_X86_MPPARSE
+ struct mpc_intsrc mp_irq;
+ struct pci_dev *pdev;
+ unsigned char number;
+ unsigned int devfn;
+ int ioapic;
+ u8 pin;
+
+ if (!acpi_ioapic)
+ return;
+ if (!dev || !dev_is_pci(dev))
+ return;
+
+ pdev = to_pci_dev(dev);
+ number = pdev->bus->number;
+ devfn = pdev->devfn;
+ pin = pdev->pin;
+ /* Build the entry exactly as it would appear in an MP table */
+ mp_irq.type = MP_INTSRC;
+ mp_irq.irqtype = mp_INT;
+ mp_irq.irqflag = (trigger == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
+ (polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
+ mp_irq.srcbus = number;
+ mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
+ ioapic = mp_find_ioapic(gsi);
+ mp_irq.dstapic = mpc_ioapic_id(ioapic);
+ mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
+
+ mp_save_irq(&mp_irq);
+#endif
+}
+
+static int __init mp_register_ioapic_irq(u8 bus_irq, u8 polarity,
+ u8 trigger, u32 gsi)
+{
+ struct mpc_intsrc mp_irq;
+ int ioapic, pin;
+
+ /* Convert 'gsi' to 'ioapic.pin'(INTIN#) */
+ ioapic = mp_find_ioapic(gsi);
+ if (ioapic < 0) {
+ pr_warn("Failed to find ioapic for gsi : %u\n", gsi);
+ return ioapic;
+ }
+
+ pin = mp_find_ioapic_pin(ioapic, gsi);
+
+ mp_irq.type = MP_INTSRC;
+ mp_irq.irqtype = mp_INT;
+ mp_irq.irqflag = (trigger << 2) | polarity;
+ mp_irq.srcbus = MP_ISA_BUS;
+ mp_irq.srcbusirq = bus_irq;
+ mp_irq.dstapic = mpc_ioapic_id(ioapic);
+ mp_irq.dstirq = pin;
+
+ mp_save_irq(&mp_irq);
+
+ return 0;
+}
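/*
 * Aside, derived from the two helpers above: the MP-table irqflag packs
 * polarity in bits 0-1 and trigger mode in bits 2-3, where 1 means
 * edge/active-high and 3 means level/active-low. A level-triggered,
 * active-low source is therefore encoded as (3 << 2) | 3 == 0x0f, which is
 * exactly what both the (trigger << 2) | polarity form and the 0x0c | 3
 * form produce.
 */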
static int __init
-acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
+acpi_parse_ioapic(union acpi_subtable_headers * header, const unsigned long end)
{
struct acpi_madt_io_apic *ioapic = NULL;
+ struct ioapic_domain_cfg cfg = {
+ .type = IOAPIC_DOMAIN_DYNAMIC,
+ .ops = &mp_ioapic_irqdomain_ops,
+ };
ioapic = (struct acpi_madt_io_apic *)header;
if (BAD_MADT_ENTRY(ioapic, end))
return -EINVAL;
- acpi_table_print_madt_entry(header);
+ acpi_table_print_madt_entry(&header->common);
- mp_register_ioapic(ioapic->id,
- ioapic->address, ioapic->global_irq_base);
+ /* Statically assign IRQ numbers for IOAPICs hosting legacy IRQs */
+ if (ioapic->global_irq_base < nr_legacy_irqs())
+ cfg.type = IOAPIC_DOMAIN_LEGACY;
+
+ mp_register_ioapic(ioapic->id, ioapic->address, ioapic->global_irq_base,
+ &cfg);
return 0;
}
@@ -310,7 +488,7 @@ acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
/*
* Parse Interrupt Source Override for the ACPI SCI
*/
-static void __init acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger)
+static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger, u32 gsi)
{
if (trigger == 0) /* compatible SCI trigger is level */
trigger = 3;
@@ -325,12 +503,12 @@ static void __init acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger)
if (acpi_sci_flags & ACPI_MADT_POLARITY_MASK)
polarity = acpi_sci_flags & ACPI_MADT_POLARITY_MASK;
- /*
- * mp_config_acpi_legacy_irqs() already setup IRQs < 16
- * If GSI is < 16, this will update its flags,
- * else it will create a new mp_irqs[] entry.
- */
- mp_override_legacy_irq(gsi, polarity, trigger, gsi);
+ if (bus_irq < NR_IRQS_LEGACY)
+ mp_override_legacy_irq(bus_irq, polarity, trigger, gsi);
+ else
+ mp_register_ioapic_irq(bus_irq, polarity, trigger, gsi);
+
+ acpi_penalize_sci_irq(bus_irq, trigger, polarity);
/*
* stash over-ride to indicate we've been here
@@ -341,7 +519,7 @@ static void __init acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger)
}
static int __init
-acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
+acpi_parse_int_src_ovr(union acpi_subtable_headers * header,
const unsigned long end)
{
struct acpi_madt_interrupt_override *intsrc = NULL;
@@ -351,19 +529,30 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
if (BAD_MADT_ENTRY(intsrc, end))
return -EINVAL;
- acpi_table_print_madt_entry(header);
+ acpi_table_print_madt_entry(&header->common);
+
+ if (intsrc->source_irq < NR_IRQS_LEGACY)
+ acpi_int_src_ovr[intsrc->source_irq] = true;
if (intsrc->source_irq == acpi_gbl_FADT.sci_interrupt) {
- acpi_sci_ioapic_setup(intsrc->global_irq,
+ acpi_sci_ioapic_setup(intsrc->source_irq,
intsrc->inti_flags & ACPI_MADT_POLARITY_MASK,
- (intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2);
+ (intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2,
+ intsrc->global_irq);
return 0;
}
- if (acpi_skip_timer_override &&
- intsrc->source_irq == 0 && intsrc->global_irq == 2) {
- printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n");
- return 0;
+ if (intsrc->source_irq == 0) {
+ if (acpi_skip_timer_override) {
+ pr_warn("BIOS IRQ0 override ignored.\n");
+ return 0;
+ }
+
+ if ((intsrc->global_irq == 2) && acpi_fix_pin2_polarity
+ && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) {
+ intsrc->inti_flags &= ~ACPI_MADT_POLARITY_MASK;
+ pr_warn("BIOS IRQ0 pin2 override: forcing polarity to high active.\n");
+ }
}
mp_override_legacy_irq(intsrc->source_irq,
@@ -375,7 +564,7 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
}
static int __init
-acpi_parse_nmi_src(struct acpi_subtable_header * header, const unsigned long end)
+acpi_parse_nmi_src(union acpi_subtable_headers * header, const unsigned long end)
{
struct acpi_madt_nmi_source *nmi_src = NULL;
@@ -384,7 +573,7 @@ acpi_parse_nmi_src(struct acpi_subtable_header * header, const unsigned long end
if (BAD_MADT_ENTRY(nmi_src, end))
return -EINVAL;
- acpi_table_print_madt_entry(header);
+ acpi_table_print_madt_entry(&header->common);
/* TBD: Support nmi_src entries? */
@@ -401,10 +590,10 @@ acpi_parse_nmi_src(struct acpi_subtable_header * header, const unsigned long end
* If a PIC-mode SCI is not recognized or gives spurious IRQ7's
* it may require Edge Trigger -- use "acpi_sci=edge"
*
- * Port 0x4d0-4d1 are ECLR1 and ECLR2, the Edge/Level Control Registers
+ * Port 0x4d0-4d1 are ELCR1 and ELCR2, the Edge/Level Control Registers
* for the 8259 PIC. bit[n] = 1 means irq[n] is Level, otherwise Edge.
- * ECLR1 is IRQs 0-7 (IRQ 0, 1, 2 must be 0)
- * ECLR2 is IRQs 8-15 (IRQ 8, 13 must be 0)
+ * ELCR1 is IRQs 0-7 (IRQ 0, 1, 2 must be 0)
+ * ELCR2 is IRQs 8-15 (IRQ 8, 13 must be 0)
*/
void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
@@ -413,7 +602,7 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
unsigned int old, new;
/* Real old ELCR mask */
- old = inb(0x4d0) | (inb(0x4d1) << 8);
+ old = inb(PIC_ELCR1) | (inb(PIC_ELCR2) << 8);
/*
* If we use ACPI to set PCI IRQs, then we should clear ELCR
@@ -438,164 +627,257 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
if (old == new)
return;
- printk(PREFIX "setting ELCR to %04x (from %04x)\n", new, old);
- outb(new, 0x4d0);
- outb(new >> 8, 0x4d1);
+ pr_warn("setting ELCR to %04x (from %04x)\n", new, old);
+ outb(new, PIC_ELCR1);
+ outb(new >> 8, PIC_ELCR2);
}
-int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
+int acpi_gsi_to_irq(u32 gsi, unsigned int *irqp)
{
- *irq = gsi;
+ int rc, irq, trigger, polarity;
+
+ if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
+ *irqp = gsi;
+ return 0;
+ }
+
+ rc = acpi_get_override_irq(gsi, &trigger, &polarity);
+ if (rc)
+ return rc;
+
+ trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE;
+ polarity = polarity ? ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH;
+ irq = acpi_register_gsi(NULL, gsi, trigger, polarity);
+ if (irq < 0)
+ return irq;
+
+ *irqp = irq;
return 0;
}
+EXPORT_SYMBOL_GPL(acpi_gsi_to_irq);
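/*
 * Usage sketch (illustration only; the function name and "example" label
 * below are made up): a driver that learns a GSI from firmware can
 * translate it into a Linux IRQ number before requesting it.
 */
static int example_claim_gsi(u32 gsi, irq_handler_t handler, void *data)
{
	unsigned int irq;
	int ret;

	ret = acpi_gsi_to_irq(gsi, &irq);
	if (ret)
		return ret;

	return request_irq(irq, handler, IRQF_SHARED, "example", data);
}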
-/*
- * success: return IRQ number (>=0)
- * failure: return < 0
- */
-int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
+int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
{
- unsigned int irq;
- unsigned int plat_gsi = gsi;
+ if (isa_irq < nr_legacy_irqs() &&
+ isa_irq_to_gsi[isa_irq] != INVALID_ACPI_IRQ) {
+ *gsi = isa_irq_to_gsi[isa_irq];
+ return 0;
+ }
+
+ return -1;
+}
+static int acpi_register_gsi_pic(struct device *dev, u32 gsi,
+ int trigger, int polarity)
+{
#ifdef CONFIG_PCI
/*
* Make sure all (legacy) PCI IRQs are set as level-triggered.
*/
- if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
- if (trigger == ACPI_LEVEL_SENSITIVE)
- eisa_set_level_irq(gsi);
- }
+ if (trigger == ACPI_LEVEL_SENSITIVE)
+ elcr_set_level_irq(gsi);
#endif
+ return gsi;
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi,
+ int trigger, int polarity)
+{
+ int irq = gsi;
#ifdef CONFIG_X86_IO_APIC
- if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) {
- plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity);
- }
+ int node;
+ struct irq_alloc_info info;
+
+ node = dev ? dev_to_node(dev) : NUMA_NO_NODE;
+ trigger = trigger == ACPI_EDGE_SENSITIVE ? 0 : 1;
+ polarity = polarity == ACPI_ACTIVE_HIGH ? 0 : 1;
+ ioapic_set_alloc_attr(&info, node, trigger, polarity);
+
+ mutex_lock(&acpi_ioapic_lock);
+ irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info);
+ /* Don't set up the ACPI SCI because it's already set up */
+ if (irq >= 0 && enable_update_mptable && gsi != acpi_gbl_FADT.sci_interrupt)
+ mp_config_acpi_gsi(dev, gsi, trigger, polarity);
+ mutex_unlock(&acpi_ioapic_lock);
#endif
- acpi_gsi_to_irq(plat_gsi, &irq);
+
return irq;
}
-/*
- * ACPI based hotplug support for CPU
- */
-#ifdef CONFIG_ACPI_HOTPLUG_CPU
-
-static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
+static void acpi_unregister_gsi_ioapic(u32 gsi)
{
- struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
- union acpi_object *obj;
- struct acpi_madt_local_apic *lapic;
- cpumask_var_t tmp_map, new_map;
- u8 physid;
- int cpu;
- int retval = -ENOMEM;
+#ifdef CONFIG_X86_IO_APIC
+ int irq;
- if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer)))
- return -EINVAL;
+ mutex_lock(&acpi_ioapic_lock);
+ irq = mp_map_gsi_to_irq(gsi, 0, NULL);
+ if (irq > 0)
+ mp_unmap_irq(irq);
+ mutex_unlock(&acpi_ioapic_lock);
+#endif
+}
+#endif
- if (!buffer.length || !buffer.pointer)
- return -EINVAL;
+int (*__acpi_register_gsi)(struct device *dev, u32 gsi,
+ int trigger, int polarity) = acpi_register_gsi_pic;
+void (*__acpi_unregister_gsi)(u32 gsi) = NULL;
- obj = buffer.pointer;
- if (obj->type != ACPI_TYPE_BUFFER ||
- obj->buffer.length < sizeof(*lapic)) {
- kfree(buffer.pointer);
- return -EINVAL;
- }
+#ifdef CONFIG_ACPI_SLEEP
+int (*acpi_suspend_lowlevel)(void) = x86_acpi_suspend_lowlevel;
+#else
+int (*acpi_suspend_lowlevel)(void);
+#endif
- lapic = (struct acpi_madt_local_apic *)obj->buffer.pointer;
+/*
+ * success: return IRQ number (>=0)
+ * failure: return < 0
+ */
+int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
+{
+ return __acpi_register_gsi(dev, gsi, trigger, polarity);
+}
+EXPORT_SYMBOL_GPL(acpi_register_gsi);
- if (lapic->header.type != ACPI_MADT_TYPE_LOCAL_APIC ||
- !(lapic->lapic_flags & ACPI_MADT_ENABLED)) {
- kfree(buffer.pointer);
- return -EINVAL;
- }
+void acpi_unregister_gsi(u32 gsi)
+{
+ if (__acpi_unregister_gsi)
+ __acpi_unregister_gsi(gsi);
+}
+EXPORT_SYMBOL_GPL(acpi_unregister_gsi);
- physid = lapic->id;
+#ifdef CONFIG_X86_LOCAL_APIC
+static void __init acpi_set_irq_model_ioapic(void)
+{
+ acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC;
+ __acpi_register_gsi = acpi_register_gsi_ioapic;
+ __acpi_unregister_gsi = acpi_unregister_gsi_ioapic;
+ acpi_ioapic = 1;
+}
+#endif
- kfree(buffer.pointer);
- buffer.length = ACPI_ALLOCATE_BUFFER;
- buffer.pointer = NULL;
+/*
+ * ACPI based hotplug support for CPU
+ */
+#ifdef CONFIG_ACPI_HOTPLUG_CPU
+#include <acpi/processor.h>
- if (!alloc_cpumask_var(&tmp_map, GFP_KERNEL))
- goto out;
+static int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
+{
+#ifdef CONFIG_ACPI_NUMA
+ int nid;
- if (!alloc_cpumask_var(&new_map, GFP_KERNEL))
- goto free_tmp_map;
+ nid = acpi_get_node(handle);
+ if (nid != NUMA_NO_NODE) {
+ set_apicid_to_node(physid, nid);
+ numa_set_node(cpu, nid);
+ }
+#endif
+ return 0;
+}
- cpumask_copy(tmp_map, cpu_present_mask);
- acpi_register_lapic(physid, lapic->lapic_flags & ACPI_MADT_ENABLED);
+int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id, int *pcpu)
+{
+ int cpu = topology_hotplug_apic(physid, acpi_id);
- /*
- * If mp_register_lapic successfully generates a new logical cpu
- * number, then the following will get us exactly what was mapped
- */
- cpumask_andnot(new_map, cpu_present_mask, tmp_map);
- if (cpumask_empty(new_map)) {
- printk ("Unable to map lapic to logical cpu number\n");
- retval = -EINVAL;
- goto free_new_map;
+ if (cpu < 0) {
+ pr_info("Unable to map lapic to logical cpu number\n");
+ return cpu;
}
- cpu = cpumask_first(new_map);
+ acpi_processor_set_pdc(handle);
+ acpi_map_cpu2node(handle, cpu, physid);
*pcpu = cpu;
- retval = 0;
-
-free_new_map:
- free_cpumask_var(new_map);
-free_tmp_map:
- free_cpumask_var(tmp_map);
-out:
- return retval;
+ return 0;
}
+EXPORT_SYMBOL(acpi_map_cpu);
-/* wrapper to silence section mismatch warning */
-int __ref acpi_map_lsapic(acpi_handle handle, int *pcpu)
+int acpi_unmap_cpu(int cpu)
{
- return _acpi_map_lsapic(handle, pcpu);
+#ifdef CONFIG_ACPI_NUMA
+ set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE);
+#endif
+ topology_hotunplug_apic(cpu);
+ return 0;
}
-EXPORT_SYMBOL(acpi_map_lsapic);
+EXPORT_SYMBOL(acpi_unmap_cpu);
+#endif /* CONFIG_ACPI_HOTPLUG_CPU */
-int acpi_unmap_lsapic(int cpu)
+int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base)
{
- per_cpu(x86_cpu_to_apicid, cpu) = -1;
- set_cpu_present(cpu, false);
- num_processors--;
-
- return (0);
-}
+ int ret = -ENOSYS;
+#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC
+ int ioapic_id;
+ u64 addr;
+ struct ioapic_domain_cfg cfg = {
+ .type = IOAPIC_DOMAIN_DYNAMIC,
+ .ops = &mp_ioapic_irqdomain_ops,
+ };
+
+ ioapic_id = acpi_get_ioapic_id(handle, gsi_base, &addr);
+ if (ioapic_id < 0) {
+ unsigned long long uid;
+ acpi_status status;
+
+ status = acpi_evaluate_integer(handle, METHOD_NAME__UID,
+ NULL, &uid);
+ if (ACPI_FAILURE(status)) {
+ acpi_handle_warn(handle, "failed to get IOAPIC ID.\n");
+ return -EINVAL;
+ }
+ ioapic_id = (int)uid;
+ }
-EXPORT_SYMBOL(acpi_unmap_lsapic);
-#endif /* CONFIG_ACPI_HOTPLUG_CPU */
+ mutex_lock(&acpi_ioapic_lock);
+ ret = mp_register_ioapic(ioapic_id, phys_addr, gsi_base, &cfg);
+ mutex_unlock(&acpi_ioapic_lock);
+#endif
-int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base)
-{
- /* TBD */
- return -EINVAL;
+ return ret;
}
-
EXPORT_SYMBOL(acpi_register_ioapic);
int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base)
{
- /* TBD */
- return -EINVAL;
-}
+ int ret = -ENOSYS;
+#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC
+ mutex_lock(&acpi_ioapic_lock);
+ ret = mp_unregister_ioapic(gsi_base);
+ mutex_unlock(&acpi_ioapic_lock);
+#endif
+
+ return ret;
+}
EXPORT_SYMBOL(acpi_unregister_ioapic);
-static int __init acpi_parse_sbf(struct acpi_table_header *table)
+/**
+ * acpi_ioapic_registered - Check whether IOAPIC associated with @gsi_base
+ * has been registered
+ * @handle: ACPI handle of the IOAPIC device
+ * @gsi_base: GSI base associated with the IOAPIC
+ *
+ * Assume caller holds some type of lock to serialize acpi_ioapic_registered()
+ * with acpi_register_ioapic()/acpi_unregister_ioapic().
+ */
+int acpi_ioapic_registered(acpi_handle handle, u32 gsi_base)
{
- struct acpi_table_boot *sb;
+ int ret = 0;
- sb = (struct acpi_table_boot *)table;
- if (!sb) {
- printk(KERN_WARNING PREFIX "Unable to map SBF\n");
- return -ENODEV;
- }
+#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC
+ mutex_lock(&acpi_ioapic_lock);
+ ret = mp_ioapic_registered(gsi_base);
+ mutex_unlock(&acpi_ioapic_lock);
+#endif
+
+ return ret;
+}
+
+static int __init acpi_parse_sbf(struct acpi_table_header *table)
+{
+ struct acpi_table_boot *sb = (struct acpi_table_boot *)table;
sbf_port = sb->cmos_index; /* Save CMOS port */
@@ -605,34 +887,26 @@ static int __init acpi_parse_sbf(struct acpi_table_header *table)
#ifdef CONFIG_HPET_TIMER
#include <asm/hpet.h>
-static struct __initdata resource *hpet_res;
+static struct resource *hpet_res __initdata;
static int __init acpi_parse_hpet(struct acpi_table_header *table)
{
- struct acpi_table_hpet *hpet_tbl;
-
- hpet_tbl = (struct acpi_table_hpet *)table;
- if (!hpet_tbl) {
- printk(KERN_WARNING PREFIX "Unable to map HPET\n");
- return -ENODEV;
- }
+ struct acpi_table_hpet *hpet_tbl = (struct acpi_table_hpet *)table;
if (hpet_tbl->address.space_id != ACPI_SPACE_MEM) {
- printk(KERN_WARNING PREFIX "HPET timers must be located in "
- "memory.\n");
+ pr_warn("HPET timers must be located in memory.\n");
return -1;
}
hpet_address = hpet_tbl->address.address;
+ hpet_blockid = hpet_tbl->sequence;
/*
* Some broken BIOSes advertise HPET at 0x0. We really do not
* want to allocate a resource there.
*/
if (!hpet_address) {
- printk(KERN_WARNING PREFIX
- "HPET id: %#x base: %#lx is invalid\n",
- hpet_tbl->id, hpet_address);
+ pr_warn("HPET id: %#x base: %#lx is invalid\n", hpet_tbl->id, hpet_address);
return 0;
}
#ifdef CONFIG_X86_64
@@ -643,28 +917,25 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table)
*/
if (hpet_address == 0xfed0000000000000UL) {
if (!hpet_force_user) {
- printk(KERN_WARNING PREFIX "HPET id: %#x "
- "base: 0xfed0000000000000 is bogus\n "
- "try hpet=force on the kernel command line to "
- "fix it up to 0xfed00000.\n", hpet_tbl->id);
+ pr_warn("HPET id: %#x base: 0xfed0000000000000 is bogus, try hpet=force on the kernel command line to fix it up to 0xfed00000.\n",
+ hpet_tbl->id);
hpet_address = 0;
return 0;
}
- printk(KERN_WARNING PREFIX
- "HPET id: %#x base: 0xfed0000000000000 fixed up "
- "to 0xfed00000.\n", hpet_tbl->id);
+ pr_warn("HPET id: %#x base: 0xfed0000000000000 fixed up to 0xfed00000.\n",
+ hpet_tbl->id);
hpet_address >>= 32;
}
#endif
- printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
- hpet_tbl->id, hpet_address);
+ pr_info("HPET id: %#x base: %#lx\n", hpet_tbl->id, hpet_address);
/*
* Allocate and initialize the HPET firmware resource for adding into
* the resource tree during the lateinit timeframe.
*/
#define HPET_RESOURCE_NAME_SIZE 9
- hpet_res = alloc_bootmem(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE);
+ hpet_res = memblock_alloc_or_panic(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE,
+ SMP_CACHE_BYTES);
hpet_res->name = (void *)&hpet_res[1];
hpet_res->flags = IORESOURCE_MEM;
@@ -697,6 +968,27 @@ late_initcall(hpet_insert_resource);
static int __init acpi_parse_fadt(struct acpi_table_header *table)
{
+ if (!(acpi_gbl_FADT.boot_flags & ACPI_FADT_LEGACY_DEVICES)) {
+ pr_debug("no legacy devices present\n");
+ x86_platform.legacy.devices.pnpbios = 0;
+ }
+
+ if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
+ !(acpi_gbl_FADT.boot_flags & ACPI_FADT_8042) &&
+ x86_platform.legacy.i8042 != X86_LEGACY_I8042_PLATFORM_ABSENT) {
+ pr_debug("i8042 controller is absent\n");
+ x86_platform.legacy.i8042 = X86_LEGACY_I8042_FIRMWARE_ABSENT;
+ }
+
+ if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) {
+ pr_debug("not registering RTC platform device\n");
+ x86_platform.legacy.rtc = 0;
+ }
+
+ if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_VGA) {
+ pr_debug("probing for VGA not safe\n");
+ x86_platform.legacy.no_vga = 1;
+ }
#ifdef CONFIG_X86_PM_TIMER
/* detect the location of the ACPI PM Timer */
@@ -719,8 +1011,7 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table)
pmtmr_ioport = acpi_gbl_FADT.pm_timer_block;
}
if (pmtmr_ioport)
- printk(KERN_INFO PREFIX "PM-Timer IO Port: %#x\n",
- pmtmr_ioport);
+ pr_info("PM-Timer IO Port: %#x\n", pmtmr_ioport);
#endif
return 0;
}
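/*
 * Informal summary of the FADT boot_flags handling above:
 *
 *   IA-PC boot flag                           effect on x86_platform.legacy
 *   ----------------------------------------  -----------------------------
 *   ACPI_FADT_LEGACY_DEVICES not set          devices.pnpbios = 0
 *   ACPI_FADT_8042 not set (FADT rev >= 2)    i8042 = FIRMWARE_ABSENT, unless
 *                                             already marked PLATFORM_ABSENT
 *   ACPI_FADT_NO_CMOS_RTC set                 rtc = 0 (no RTC platform device)
 *   ACPI_FADT_NO_VGA set                      no_vga = 1
 */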
@@ -731,94 +1022,81 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table)
* returns 0 on success, < 0 on error
*/
-static void __init acpi_register_lapic_address(unsigned long address)
-{
- mp_lapic_addr = address;
-
- set_fixmap_nocache(FIX_APIC_BASE, address);
- if (boot_cpu_physical_apicid == -1U) {
- boot_cpu_physical_apicid = read_apic_id();
- apic_version[boot_cpu_physical_apicid] =
- GET_APIC_VERSION(apic_read(APIC_LVR));
- }
-}
-
static int __init early_acpi_parse_madt_lapic_addr_ovr(void)
{
int count;
- if (!cpu_has_apic)
+ if (!boot_cpu_has(X86_FEATURE_APIC))
return -ENODEV;
/*
* Note that the LAPIC address is obtained from the MADT (32-bit value)
- * and (optionally) overriden by a LAPIC_ADDR_OVR entry (64-bit value).
+ * and (optionally) overridden by a LAPIC_ADDR_OVR entry (64-bit value).
*/
- count =
- acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE,
- acpi_parse_lapic_addr_ovr, 0);
+ count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE,
+ acpi_parse_lapic_addr_ovr, 0);
if (count < 0) {
- printk(KERN_ERR PREFIX
- "Error parsing LAPIC address override entry\n");
+ pr_err("Error parsing LAPIC address override entry\n");
return count;
}
- acpi_register_lapic_address(acpi_lapic_addr);
+ register_lapic_address(acpi_lapic_addr);
return count;
}
static int __init acpi_parse_madt_lapic_entries(void)
{
- int count;
- int x2count = 0;
+ int count, x2count = 0;
+ struct acpi_subtable_proc madt_proc[2];
+ int ret;
- if (!cpu_has_apic)
+ if (!boot_cpu_has(X86_FEATURE_APIC))
return -ENODEV;
- /*
- * Note that the LAPIC address is obtained from the MADT (32-bit value)
- * and (optionally) overriden by a LAPIC_ADDR_OVR entry (64-bit value).
- */
-
- count =
- acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE,
- acpi_parse_lapic_addr_ovr, 0);
- if (count < 0) {
- printk(KERN_ERR PREFIX
- "Error parsing LAPIC address override entry\n");
- return count;
- }
-
- acpi_register_lapic_address(acpi_lapic_addr);
-
count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC,
- acpi_parse_sapic, MAX_APICS);
+ acpi_parse_sapic, MAX_LOCAL_APIC);
if (!count) {
- x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC,
- acpi_parse_x2apic, MAX_APICS);
- count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC,
- acpi_parse_lapic, MAX_APICS);
+ /* Check if there are valid LAPIC entries */
+ acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC, acpi_check_lapic, MAX_LOCAL_APIC);
+
+ /*
+ * Enumerate the APIC IDs in the order that they appear in the
+ * MADT, no matter whether a LAPIC or an x2APIC entry is used.
+ */
+ memset(madt_proc, 0, sizeof(madt_proc));
+ madt_proc[0].id = ACPI_MADT_TYPE_LOCAL_APIC;
+ madt_proc[0].handler = acpi_parse_lapic;
+ madt_proc[1].id = ACPI_MADT_TYPE_LOCAL_X2APIC;
+ madt_proc[1].handler = acpi_parse_x2apic;
+ ret = acpi_table_parse_entries_array(ACPI_SIG_MADT,
+ sizeof(struct acpi_table_madt),
+ madt_proc, ARRAY_SIZE(madt_proc), MAX_LOCAL_APIC);
+ if (ret < 0) {
+ pr_err("Error parsing LAPIC/X2APIC entries\n");
+ return ret;
+ }
+ count = madt_proc[0].count;
+ x2count = madt_proc[1].count;
}
if (!count && !x2count) {
- printk(KERN_ERR PREFIX "No LAPIC entries present\n");
+ pr_err("No LAPIC entries present\n");
/* TBD: Cleanup to allow fallback to MPS */
return -ENODEV;
} else if (count < 0 || x2count < 0) {
- printk(KERN_ERR PREFIX "Error parsing LAPIC entry\n");
+ pr_err("Error parsing LAPIC entry\n");
/* TBD: Cleanup to allow fallback to MPS */
return count;
}
- x2count =
- acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC_NMI,
- acpi_parse_x2apic_nmi, 0);
- count =
- acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI, acpi_parse_lapic_nmi, 0);
+ x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC_NMI,
+ acpi_parse_x2apic_nmi, 0);
+ count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI,
+ acpi_parse_lapic_nmi, 0);
if (count < 0 || x2count < 0) {
- printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n");
+ pr_err("Error parsing LAPIC NMI entry\n");
/* TBD: Cleanup to allow fallback to MPS */
return count;
}
@@ -827,232 +1105,42 @@ static int __init acpi_parse_madt_lapic_entries(void)
#endif /* CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_X86_IO_APIC
-#define MP_ISA_BUS 0
-
-#ifdef CONFIG_X86_ES7000
-extern int es7000_plat;
-#endif
-
-static struct {
- int gsi_base;
- int gsi_end;
-} mp_ioapic_routing[MAX_IO_APICS];
-
-int mp_find_ioapic(int gsi)
-{
- int i = 0;
-
- /* Find the IOAPIC that manages this GSI. */
- for (i = 0; i < nr_ioapics; i++) {
- if ((gsi >= mp_ioapic_routing[i].gsi_base)
- && (gsi <= mp_ioapic_routing[i].gsi_end))
- return i;
- }
-
- printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
- return -1;
-}
-
-int mp_find_ioapic_pin(int ioapic, int gsi)
-{
- if (WARN_ON(ioapic == -1))
- return -1;
- if (WARN_ON(gsi > mp_ioapic_routing[ioapic].gsi_end))
- return -1;
-
- return gsi - mp_ioapic_routing[ioapic].gsi_base;
-}
-
-static u8 __init uniq_ioapic_id(u8 id)
-{
-#ifdef CONFIG_X86_32
- if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
- !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
- return io_apic_get_unique_id(nr_ioapics, id);
- else
- return id;
-#else
- int i;
- DECLARE_BITMAP(used, 256);
- bitmap_zero(used, 256);
- for (i = 0; i < nr_ioapics; i++) {
- struct mpc_ioapic *ia = &mp_ioapics[i];
- __set_bit(ia->apicid, used);
- }
- if (!test_bit(id, used))
- return id;
- return find_first_zero_bit(used, 256);
-#endif
-}
-
-static int bad_ioapic(unsigned long address)
-{
- if (nr_ioapics >= MAX_IO_APICS) {
- printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
- "(found %d)\n", MAX_IO_APICS, nr_ioapics);
- panic("Recompile kernel with bigger MAX_IO_APICS!\n");
- }
- if (!address) {
- printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
- " found in table, skipping!\n");
- return 1;
- }
- return 0;
-}
-
-void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
-{
- int idx = 0;
-
- if (bad_ioapic(address))
- return;
-
- idx = nr_ioapics;
-
- mp_ioapics[idx].type = MP_IOAPIC;
- mp_ioapics[idx].flags = MPC_APIC_USABLE;
- mp_ioapics[idx].apicaddr = address;
-
- set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
- mp_ioapics[idx].apicid = uniq_ioapic_id(id);
- mp_ioapics[idx].apicver = io_apic_get_version(idx);
-
- /*
- * Build basic GSI lookup table to facilitate gsi->io_apic lookups
- * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
- */
- mp_ioapic_routing[idx].gsi_base = gsi_base;
- mp_ioapic_routing[idx].gsi_end = gsi_base +
- io_apic_get_redir_entries(idx);
-
- printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
- "GSI %d-%d\n", idx, mp_ioapics[idx].apicid,
- mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr,
- mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
-
- nr_ioapics++;
-}
-
-int __init acpi_probe_gsi(void)
-{
- int idx;
- int gsi;
- int max_gsi = 0;
-
- if (acpi_disabled)
- return 0;
-
- if (!acpi_ioapic)
- return 0;
-
- max_gsi = 0;
- for (idx = 0; idx < nr_ioapics; idx++) {
- gsi = mp_ioapic_routing[idx].gsi_end;
-
- if (gsi > max_gsi)
- max_gsi = gsi;
- }
-
- return max_gsi + 1;
-}
-
-static void assign_to_mp_irq(struct mpc_intsrc *m,
- struct mpc_intsrc *mp_irq)
-{
- memcpy(mp_irq, m, sizeof(struct mpc_intsrc));
-}
-
-static int mp_irq_cmp(struct mpc_intsrc *mp_irq,
- struct mpc_intsrc *m)
-{
- return memcmp(mp_irq, m, sizeof(struct mpc_intsrc));
-}
-
-static void save_mp_irq(struct mpc_intsrc *m)
-{
- int i;
-
- for (i = 0; i < mp_irq_entries; i++) {
- if (!mp_irq_cmp(&mp_irqs[i], m))
- return;
- }
-
- assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
- if (++mp_irq_entries == MAX_IRQ_SOURCES)
- panic("Max # of irq sources exceeded!!\n");
-}
-
-void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
-{
- int ioapic;
- int pin;
- struct mpc_intsrc mp_irq;
-
- /*
- * Convert 'gsi' to 'ioapic.pin'.
- */
- ioapic = mp_find_ioapic(gsi);
- if (ioapic < 0)
- return;
- pin = mp_find_ioapic_pin(ioapic, gsi);
-
- /*
- * TBD: This check is for faulty timer entries, where the override
- * erroneously sets the trigger to level, resulting in a HUGE
- * increase of timer interrupts!
- */
- if ((bus_irq == 0) && (trigger == 3))
- trigger = 1;
-
- mp_irq.type = MP_INTSRC;
- mp_irq.irqtype = mp_INT;
- mp_irq.irqflag = (trigger << 2) | polarity;
- mp_irq.srcbus = MP_ISA_BUS;
- mp_irq.srcbusirq = bus_irq; /* IRQ */
- mp_irq.dstapic = mp_ioapics[ioapic].apicid; /* APIC ID */
- mp_irq.dstirq = pin; /* INTIN# */
-
- save_mp_irq(&mp_irq);
-}
-
-void __init mp_config_acpi_legacy_irqs(void)
+static void __init mp_config_acpi_legacy_irqs(void)
{
int i;
- int ioapic;
- unsigned int dstapic;
struct mpc_intsrc mp_irq;
-#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
+#ifdef CONFIG_EISA
/*
* Fabricate the legacy ISA bus (bus #31).
*/
mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
#endif
set_bit(MP_ISA_BUS, mp_bus_not_pci);
- pr_debug("Bus #%d is ISA\n", MP_ISA_BUS);
-
-#ifdef CONFIG_X86_ES7000
- /*
- * Older generations of ES7000 have no legacy identity mappings
- */
- if (es7000_plat == 1)
- return;
-#endif
-
- /*
- * Locate the IOAPIC that manages the ISA IRQs (0-15).
- */
- ioapic = mp_find_ioapic(0);
- if (ioapic < 0)
- return;
- dstapic = mp_ioapics[ioapic].apicid;
+ pr_debug("Bus #%d is ISA (nIRQs: %d)\n", MP_ISA_BUS, nr_legacy_irqs());
/*
* Use the default configuration for the IRQs 0-15, unless
* overridden by (MADT) interrupt source override entries.
*/
- for (i = 0; i < 16; i++) {
+ for (i = 0; i < nr_legacy_irqs(); i++) {
+ int ioapic, pin;
+ unsigned int dstapic;
int idx;
+ u32 gsi;
+
+ /* Locate the gsi that irq i maps to. */
+ if (acpi_isa_irq_to_gsi(i, &gsi))
+ continue;
+
+ /*
+ * Locate the IOAPIC that manages the ISA IRQ.
+ */
+ ioapic = mp_find_ioapic(gsi);
+ if (ioapic < 0)
+ continue;
+ pin = mp_find_ioapic_pin(ioapic, gsi);
+ dstapic = mpc_ioapic_id(ioapic);
for (idx = 0; idx < mp_irq_entries; idx++) {
struct mpc_intsrc *irq = mp_irqs + idx;
@@ -1062,12 +1150,12 @@ void __init mp_config_acpi_legacy_irqs(void)
break;
/* Do we already have a mapping for this IOAPIC pin */
- if (irq->dstapic == dstapic && irq->dstirq == i)
+ if (irq->dstapic == dstapic && irq->dstirq == pin)
break;
}
if (idx != mp_irq_entries) {
- printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
+ pr_debug("ACPI: IRQ%d used by override.\n", i);
continue; /* IRQ already used */
}
@@ -1077,94 +1165,12 @@ void __init mp_config_acpi_legacy_irqs(void)
mp_irq.dstapic = dstapic;
mp_irq.irqtype = mp_INT;
mp_irq.srcbusirq = i; /* Identity mapped */
- mp_irq.dstirq = i;
+ mp_irq.dstirq = pin;
- save_mp_irq(&mp_irq);
+ mp_save_irq(&mp_irq);
}
}
-static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
- int polarity)
-{
-#ifdef CONFIG_X86_MPPARSE
- struct mpc_intsrc mp_irq;
- struct pci_dev *pdev;
- unsigned char number;
- unsigned int devfn;
- int ioapic;
- u8 pin;
-
- if (!acpi_ioapic)
- return 0;
- if (!dev)
- return 0;
- if (dev->bus != &pci_bus_type)
- return 0;
-
- pdev = to_pci_dev(dev);
- number = pdev->bus->number;
- devfn = pdev->devfn;
- pin = pdev->pin;
- /* print the entry should happen on mptable identically */
- mp_irq.type = MP_INTSRC;
- mp_irq.irqtype = mp_INT;
- mp_irq.irqflag = (trigger == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
- (polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
- mp_irq.srcbus = number;
- mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
- ioapic = mp_find_ioapic(gsi);
- mp_irq.dstapic = mp_ioapics[ioapic].apicid;
- mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
-
- save_mp_irq(&mp_irq);
-#endif
- return 0;
-}
-
-int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
-{
- int ioapic;
- int ioapic_pin;
- struct io_apic_irq_attr irq_attr;
-
- if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
- return gsi;
-
- /* Don't set up the ACPI SCI because it's already set up */
- if (acpi_gbl_FADT.sci_interrupt == gsi)
- return gsi;
-
- ioapic = mp_find_ioapic(gsi);
- if (ioapic < 0) {
- printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
- return gsi;
- }
-
- ioapic_pin = mp_find_ioapic_pin(ioapic, gsi);
-
-#ifdef CONFIG_X86_32
- if (ioapic_renumber_irq)
- gsi = ioapic_renumber_irq(ioapic, gsi);
-#endif
-
- if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
- printk(KERN_ERR "Invalid reference to IOAPIC pin "
- "%d-%d\n", mp_ioapics[ioapic].apicid,
- ioapic_pin);
- return gsi;
- }
-
- if (enable_update_mptable)
- mp_config_acpi_gsi(dev, gsi, trigger, polarity);
-
- set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin,
- trigger == ACPI_EDGE_SENSITIVE ? 0 : 1,
- polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
- io_apic_set_pci_routing(dev, gsi, &irq_attr);
-
- return gsi;
-}
-
/*
* Parse IOAPIC related entries in MADT
* returns 0 on success, < 0 on error
@@ -1179,39 +1185,35 @@ static int __init acpi_parse_madt_ioapic_entries(void)
* If MPS is present, it will handle them,
* otherwise the system will stay in PIC mode
*/
- if (acpi_disabled || acpi_noirq) {
+ if (acpi_disabled || acpi_noirq)
return -ENODEV;
- }
- if (!cpu_has_apic)
+ if (!boot_cpu_has(X86_FEATURE_APIC))
return -ENODEV;
/*
* if "noapic" boot option, don't look for IO-APICs
*/
- if (skip_ioapic_setup) {
- printk(KERN_INFO PREFIX "Skipping IOAPIC probe "
- "due to 'noapic' option.\n");
+ if (ioapic_is_disabled) {
+ pr_info("Skipping IOAPIC probe due to 'noapic' option.\n");
return -ENODEV;
}
- count =
- acpi_table_parse_madt(ACPI_MADT_TYPE_IO_APIC, acpi_parse_ioapic,
- MAX_IO_APICS);
+ count = acpi_table_parse_madt(ACPI_MADT_TYPE_IO_APIC, acpi_parse_ioapic,
+ MAX_IO_APICS);
if (!count) {
- printk(KERN_ERR PREFIX "No IOAPIC entries present\n");
+ pr_err("No IOAPIC entries present\n");
return -ENODEV;
} else if (count < 0) {
- printk(KERN_ERR PREFIX "Error parsing IOAPIC entry\n");
+ pr_err("Error parsing IOAPIC entry\n");
return count;
}
- count =
- acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr,
- nr_irqs);
+ count = acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE,
+ acpi_parse_int_src_ovr,
+ irq_get_nr_irqs());
if (count < 0) {
- printk(KERN_ERR PREFIX
- "Error parsing interrupt source overrides entry\n");
+ pr_err("Error parsing interrupt source overrides entry\n");
/* TBD: Cleanup to allow fallback to MPS */
return count;
}
@@ -1219,18 +1221,20 @@ static int __init acpi_parse_madt_ioapic_entries(void)
/*
* If BIOS did not supply an INT_SRC_OVR for the SCI
* pretend we got one so we can set the SCI flags.
+ * But ignore setting up SCI on hardware reduced platforms.
*/
- if (!acpi_sci_override_gsi)
- acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0);
+ if (acpi_sci_override_gsi == INVALID_ACPI_IRQ && !acpi_gbl_reduced_hardware)
+ acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0,
+ acpi_gbl_FADT.sci_interrupt);
- /* Fill in identity legacy mapings where no override */
+ /* Fill in identity legacy mappings where no override */
mp_config_acpi_legacy_irqs();
- count =
- acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_parse_nmi_src,
- nr_irqs);
+ count = acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE,
+ acpi_parse_nmi_src,
+ irq_get_nr_irqs());
if (count < 0) {
- printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n");
+ pr_err("Error parsing NMI SRC entry\n");
/* TBD: Cleanup to allow fallback to MPS */
return count;
}
@@ -1263,8 +1267,7 @@ static void __init early_acpi_process_madt(void)
/*
* Dell Precision Workstation 410, 610 come here.
*/
- printk(KERN_ERR PREFIX
- "Invalid BIOS MADT, disabling ACPI\n");
+ pr_err("Invalid BIOS MADT, disabling ACPI\n");
disable_acpi();
}
}
@@ -1285,28 +1288,31 @@ static void __init acpi_process_madt(void)
if (!error) {
acpi_lapic = 1;
-#ifdef CONFIG_X86_BIGSMP
- generic_bigsmp_probe();
-#endif
/*
* Parse MADT IO-APIC entries
*/
+ mutex_lock(&acpi_ioapic_lock);
error = acpi_parse_madt_ioapic_entries();
+ mutex_unlock(&acpi_ioapic_lock);
if (!error) {
- acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC;
- acpi_ioapic = 1;
+ acpi_set_irq_model_ioapic();
smp_found_config = 1;
- if (apic->setup_apic_routing)
- apic->setup_apic_routing();
}
+
+#ifdef CONFIG_ACPI_MADT_WAKEUP
+ /*
+ * Parse MADT MP Wake entry.
+ */
+ acpi_table_parse_madt(ACPI_MADT_TYPE_MULTIPROC_WAKEUP,
+ acpi_parse_mp_wake, 1);
+#endif
}
if (error == -EINVAL) {
/*
* Dell Precision Workstation 410, 610 come here.
*/
- printk(KERN_ERR PREFIX
- "Invalid BIOS MADT, disabling ACPI\n");
+ pr_err("Invalid BIOS MADT, disabling ACPI\n");
disable_acpi();
}
} else {
@@ -1316,8 +1322,7 @@ static void __init acpi_process_madt(void)
* Boot with "acpi=off" to use MPS on such a system.
*/
if (smp_found_config) {
- printk(KERN_WARNING PREFIX
- "No APIC-table, disabling MPS\n");
+ pr_warn("No APIC-table, disabling MPS\n");
smp_found_config = 0;
}
}
@@ -1327,11 +1332,9 @@ static void __init acpi_process_madt(void)
* processors, where MPS only supports physical.
*/
if (acpi_lapic && acpi_ioapic)
- printk(KERN_INFO "Using ACPI (MADT) for SMP configuration "
- "information\n");
+ pr_info("Using ACPI (MADT) for SMP configuration information\n");
else if (acpi_lapic)
- printk(KERN_INFO "Using ACPI for processor (LAPIC) "
- "configuration information\n");
+ pr_info("Using ACPI for processor (LAPIC) configuration information\n");
#endif
return;
}
@@ -1339,8 +1342,7 @@ static void __init acpi_process_madt(void)
static int __init disable_acpi_irq(const struct dmi_system_id *d)
{
if (!acpi_force) {
- printk(KERN_NOTICE "%s detected: force use of acpi=noirq\n",
- d->ident);
+ pr_notice("%s detected: force use of acpi=noirq\n", d->ident);
acpi_noirq_set();
}
return 0;
@@ -1349,54 +1351,41 @@ static int __init disable_acpi_irq(const struct dmi_system_id *d)
static int __init disable_acpi_pci(const struct dmi_system_id *d)
{
if (!acpi_force) {
- printk(KERN_NOTICE "%s detected: force use of pci=noacpi\n",
- d->ident);
+ pr_notice("%s detected: force use of pci=noacpi\n", d->ident);
acpi_disable_pci();
}
return 0;
}
-static int __init dmi_disable_acpi(const struct dmi_system_id *d)
+static int __init disable_acpi_xsdt(const struct dmi_system_id *d)
{
if (!acpi_force) {
- printk(KERN_NOTICE "%s detected: acpi off\n", d->ident);
- disable_acpi();
+ pr_notice("%s detected: force use of acpi=rsdt\n", d->ident);
+ acpi_gbl_do_not_use_xsdt = TRUE;
} else {
- printk(KERN_NOTICE
- "Warning: DMI blacklist says broken, but acpi forced\n");
+ pr_notice("Warning: DMI blacklist says broken, but acpi XSDT forced\n");
}
return 0;
}
-/*
- * Limit ACPI to CPU enumeration for HT
- */
-static int __init force_acpi_ht(const struct dmi_system_id *d)
+static int __init dmi_disable_acpi(const struct dmi_system_id *d)
{
if (!acpi_force) {
- printk(KERN_NOTICE "%s detected: force use of acpi=ht\n",
- d->ident);
+ pr_notice("%s detected: acpi off\n", d->ident);
disable_acpi();
- acpi_ht = 1;
} else {
- printk(KERN_NOTICE
- "Warning: acpi=force overrules DMI blacklist: acpi=ht\n");
+ pr_notice("Warning: DMI blacklist says broken, but acpi forced\n");
}
return 0;
}
/*
- * Force ignoring BIOS IRQ0 pin2 override
+ * Force ignoring BIOS IRQ0 override
*/
static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
{
- /*
- * The ati_ixp4x0_rev() early PCI quirk should have set
- * the acpi_skip_timer_override flag already:
- */
if (!acpi_skip_timer_override) {
- WARN(1, KERN_ERR "ati_ixp4x0 quirk not complete.\n");
- pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n",
+ pr_notice("%s detected: Ignoring BIOS IRQ0 override\n",
d->ident);
acpi_skip_timer_override = 1;
}
@@ -1404,10 +1393,34 @@ static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
}
/*
+ * ACPI offers an alternative platform interface model that removes
+ * ACPI hardware requirements for platforms that do not implement
+ * the PC Architecture.
+ *
+ * We initialize the Hardware-reduced ACPI model here:
+ */
+void __init acpi_generic_reduced_hw_init(void)
+{
+ /*
+ * Override x86_init functions and bypass legacy PIC in
+ * hardware reduced ACPI mode.
+ */
+ x86_init.timers.timer_init = x86_init_noop;
+ x86_init.irqs.pre_vector_init = x86_init_noop;
+ legacy_pic = &null_legacy_pic;
+}
+
+static void __init acpi_reduced_hw_init(void)
+{
+ if (acpi_gbl_reduced_hardware)
+ x86_init.acpi.reduced_hw_early_init();
+}
+
+/*
* If your system is blacklisted here, but you find that acpi=force
* works for you, please contact linux-acpi@vger.kernel.org
*/
-static struct dmi_system_id __initdata acpi_dmi_table[] = {
+static const struct dmi_system_id acpi_dmi_table[] __initconst = {
/*
* Boxes that need ACPI disabled
*/
@@ -1421,90 +1434,6 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
},
/*
- * Boxes that need acpi=ht
- */
- {
- .callback = force_acpi_ht,
- .ident = "FSC Primergy T850",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"),
- DMI_MATCH(DMI_PRODUCT_NAME, "PRIMERGY T850"),
- },
- },
- {
- .callback = force_acpi_ht,
- .ident = "HP VISUALIZE NT Workstation",
- .matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"),
- DMI_MATCH(DMI_PRODUCT_NAME, "HP VISUALIZE NT Workstation"),
- },
- },
- {
- .callback = force_acpi_ht,
- .ident = "Compaq Workstation W8000",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Compaq"),
- DMI_MATCH(DMI_PRODUCT_NAME, "Workstation W8000"),
- },
- },
- {
- .callback = force_acpi_ht,
- .ident = "ASUS P2B-DS",
- .matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
- DMI_MATCH(DMI_BOARD_NAME, "P2B-DS"),
- },
- },
- {
- .callback = force_acpi_ht,
- .ident = "ASUS CUR-DLS",
- .matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
- DMI_MATCH(DMI_BOARD_NAME, "CUR-DLS"),
- },
- },
- {
- .callback = force_acpi_ht,
- .ident = "ABIT i440BX-W83977",
- .matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "ABIT <http://www.abit.com>"),
- DMI_MATCH(DMI_BOARD_NAME, "i440BX-W83977 (BP6)"),
- },
- },
- {
- .callback = force_acpi_ht,
- .ident = "IBM Bladecenter",
- .matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
- DMI_MATCH(DMI_BOARD_NAME, "IBM eServer BladeCenter HS20"),
- },
- },
- {
- .callback = force_acpi_ht,
- .ident = "IBM eServer xSeries 360",
- .matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
- DMI_MATCH(DMI_BOARD_NAME, "eServer xSeries 360"),
- },
- },
- {
- .callback = force_acpi_ht,
- .ident = "IBM eserver xSeries 330",
- .matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
- DMI_MATCH(DMI_BOARD_NAME, "eserver xSeries 330"),
- },
- },
- {
- .callback = force_acpi_ht,
- .ident = "IBM eserver xSeries 440",
- .matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
- DMI_MATCH(DMI_PRODUCT_NAME, "eserver xSeries 440"),
- },
- },
-
- /*
* Boxes that need ACPI PCI IRQ routing disabled
*/
{
@@ -1562,11 +1491,24 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
},
},
+ /*
+ * Boxes that need ACPI XSDT use disabled due to corrupted tables
+ */
+ {
+ .callback = disable_acpi_xsdt,
+ .ident = "Advantech DAC-BJ01",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "NEC"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Bearlake CRB Board"),
+ DMI_MATCH(DMI_BIOS_VERSION, "V1.12"),
+ DMI_MATCH(DMI_BIOS_DATE, "02/01/2011"),
+ },
+ },
{}
};
/* second table for DMI checks that should run after early-quirks */
-static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
+static const struct dmi_system_id acpi_dmi_table_late[] __initconst = {
/*
* HP laptops which use a DSDT reporting as HP/SB400/10000,
* which includes some code which overrides all temperature
@@ -1574,7 +1516,7 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
* is enabled. This input is incorrectly designated the
* ISA IRQ 0 via an interrupt source override even though
* it is wired to the output of the master 8259A and INTIN0
- * is not connected at all. Force ignoring BIOS IRQ0 pin2
+ * is not connected at all. Force ignoring BIOS IRQ0
* override in that cases.
*/
{
@@ -1609,6 +1551,14 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"),
},
},
+ {
+ .callback = dmi_ignore_irq0_timer_override,
+ .ident = "FUJITSU SIEMENS",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "AMILO PRO V2030"),
+ },
+ },
{}
};
@@ -1629,66 +1579,58 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
* if acpi_blacklisted() acpi_disabled = 1;
* acpi_irq_model=...
* ...
- *
- * return value: (currently ignored)
- * 0: success
- * !0: failure
*/
-int __init acpi_boot_table_init(void)
+void __init acpi_boot_table_init(void)
{
- int error;
-
dmi_check_system(acpi_dmi_table);
/*
* If acpi_disabled, bail out
- * One exception: acpi=ht continues far enough to enumerate LAPICs
*/
- if (acpi_disabled && !acpi_ht)
- return 1;
+ if (acpi_disabled)
+ return;
/*
* Initialize the ACPI boot-time table parser.
*/
- error = acpi_table_init();
- if (error) {
+ if (acpi_locate_initial_tables())
disable_acpi();
- return error;
- }
+ else
+ acpi_reserve_initial_tables();
+}
+
+int __init early_acpi_boot_init(void)
+{
+ if (acpi_disabled)
+ return 1;
+
+ acpi_table_init_complete();
acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf);
/*
* blacklist may disable ACPI entirely
*/
- error = acpi_blacklisted();
- if (error) {
+ if (acpi_blacklisted()) {
if (acpi_force) {
- printk(KERN_WARNING PREFIX "acpi=force override\n");
+ pr_warn("acpi=force override\n");
} else {
- printk(KERN_WARNING PREFIX "Disabling ACPI support\n");
+ pr_warn("Disabling ACPI support\n");
disable_acpi();
- return error;
+ return 1;
}
}
- return 0;
-}
-
-int __init early_acpi_boot_init(void)
-{
/*
- * If acpi_disabled, bail out
- * One exception: acpi=ht continues far enough to enumerate LAPICs
+ * Process the Multiple APIC Description Table (MADT), if present
*/
- if (acpi_disabled && !acpi_ht)
- return 1;
+ early_acpi_process_madt();
/*
- * Process the Multiple APIC Description Table (MADT), if present
+ * Hardware-reduced ACPI mode initialization:
*/
- early_acpi_process_madt();
+ acpi_reduced_hw_init();
return 0;
}
@@ -1700,9 +1642,8 @@ int __init acpi_boot_init(void)
/*
* If acpi_disabled, bail out
- * One exception: acpi=ht continues far enough to enumerate LAPICs
*/
- if (acpi_disabled && !acpi_ht)
+ if (acpi_disabled)
return 1;
acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf);
@@ -1718,7 +1659,14 @@ int __init acpi_boot_init(void)
acpi_process_madt();
acpi_table_parse(ACPI_SIG_HPET, acpi_parse_hpet);
+ if (IS_ENABLED(CONFIG_ACPI_BGRT) && !acpi_nobgrt)
+ acpi_table_parse(ACPI_SIG_BGRT, acpi_parse_bgrt);
+ if (!acpi_noirq)
+ x86_init.pci.init = pci_acpi_init;
+
+ /* Do not enable ACPI SPCR console by default */
+ acpi_parse_spcr(earlycon_acpi_spcr_enable, false);
return 0;
}
@@ -1734,26 +1682,27 @@ static int __init parse_acpi(char *arg)
/* acpi=force to over-ride black-list */
else if (strcmp(arg, "force") == 0) {
acpi_force = 1;
- acpi_ht = 1;
acpi_disabled = 0;
}
/* acpi=strict disables out-of-spec workarounds */
else if (strcmp(arg, "strict") == 0) {
acpi_strict = 1;
}
- /* Limit ACPI just to boot-time to enable HT */
- else if (strcmp(arg, "ht") == 0) {
- if (!acpi_force)
- disable_acpi();
- acpi_ht = 1;
- }
/* acpi=rsdt use RSDT instead of XSDT */
else if (strcmp(arg, "rsdt") == 0) {
- acpi_rsdt_forced = 1;
+ acpi_gbl_do_not_use_xsdt = TRUE;
}
/* "acpi=noirq" disables ACPI interrupt routing */
else if (strcmp(arg, "noirq") == 0) {
acpi_noirq_set();
+ }
+ /* "acpi=copy_dsdt" copies DSDT */
+ else if (strcmp(arg, "copy_dsdt") == 0) {
+ acpi_gbl_copy_dsdt_locally = 1;
+ }
+ /* "acpi=nocmcff" disables FF mode for corrected errors */
+ else if (strcmp(arg, "nocmcff") == 0) {
+ acpi_disable_cmcff = 1;
} else {
/* Core will printk when we return error. */
return -EINVAL;
@@ -1762,6 +1711,13 @@ static int __init parse_acpi(char *arg)
}
early_param("acpi", parse_acpi);
+static int __init parse_acpi_bgrt(char *arg)
+{
+ acpi_nobgrt = true;
+ return 0;
+}
+early_param("bgrt_disable", parse_acpi_bgrt);
+
/* FIXME: Using pci= for an ACPI parameter is a travesty. */
static int __init parse_pci(char *arg)
{
@@ -1775,10 +1731,17 @@ int __init acpi_mps_check(void)
{
#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_X86_MPPARSE)
/* mptable code is not built-in*/
+
+ /*
+ * Xen disables ACPI in PV DomU guests but it still emulates APIC and
+ * supports SMP. Returning early here ensures that APIC is not disabled
+ * unnecessarily and the guest is not limited to a single vCPU.
+ */
+ if (xen_pv_domain() && !xen_initial_domain())
+ return 0;
+
if (acpi_disabled || acpi_noirq) {
- printk(KERN_WARNING "MPS support code is not built-in.\n"
- "Using acpi=off or acpi=noirq or pci=noacpi "
- "may have problem\n");
+ pr_warn("MPS support code is not built-in, using acpi=off or acpi=noirq or pci=noacpi may have problem\n");
return 1;
}
#endif
@@ -1826,21 +1789,53 @@ early_param("acpi_sci", setup_acpi_sci);
int __acpi_acquire_global_lock(unsigned int *lock)
{
unsigned int old, new, val;
+
+ old = READ_ONCE(*lock);
do {
- old = *lock;
- new = (((old & ~0x3) + 2) + ((old >> 1) & 0x1));
- val = cmpxchg(lock, old, new);
- } while (unlikely (val != old));
- return (new < 3) ? -1 : 0;
+ val = (old >> 1) & 0x1;
+ new = (old & ~0x3) + 2 + val;
+ } while (!try_cmpxchg(lock, &old, new));
+
+ if (val)
+ return 0;
+
+ return -1;
}
int __acpi_release_global_lock(unsigned int *lock)
{
- unsigned int old, new, val;
+ unsigned int old, new;
+
+ old = READ_ONCE(*lock);
do {
- old = *lock;
new = old & ~0x3;
- val = cmpxchg(lock, old, new);
- } while (unlikely (val != old));
+ } while (!try_cmpxchg(lock, &old, new));
return old & 0x1;
}
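
The rewritten lock helpers above keep the FACS global-lock bit protocol (bit 0 is the "pending" flag, bit 1 the "owned" flag) while switching from a cmpxchg() loop to try_cmpxchg(). A minimal user-space sketch of the same protocol, written with C11 atomics purely for illustration (it is not the kernel implementation, and it returns nonzero on acquisition instead of the kernel's -1/0 convention):

#include <stdatomic.h>

static int glk_acquire(_Atomic unsigned int *lock)
{
        unsigned int old = atomic_load(lock), new, was_owned;

        do {
                was_owned = (old >> 1) & 1;
                /* set "owned"; also set "pending" if it was already owned */
                new = (old & ~3u) + 2 + was_owned;
        } while (!atomic_compare_exchange_weak(lock, &old, new));

        return !was_owned;              /* nonzero: acquired outright */
}

static int glk_release(_Atomic unsigned int *lock)
{
        unsigned int old = atomic_load(lock), new;

        do {
                new = old & ~3u;        /* clear "owned" and "pending" */
        } while (!atomic_compare_exchange_weak(lock, &old, new));

        return old & 1;                 /* tell the caller if someone was pending */
}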
+
+void __init arch_reserve_mem_area(acpi_physical_address addr, size_t size)
+{
+ e820__range_add(addr, size, E820_TYPE_NVS);
+ e820__update_table_print();
+}
+
+void x86_default_set_root_pointer(u64 addr)
+{
+ boot_params.acpi_rsdp_addr = addr;
+}
+
+u64 x86_default_get_root_pointer(void)
+{
+ return boot_params.acpi_rsdp_addr;
+}
+
+#ifdef CONFIG_XEN_PV
+void __iomem *x86_acpi_os_ioremap(acpi_physical_address phys, acpi_size size)
+{
+ return ioremap_cache(phys, size);
+}
+
+void __iomem * (*acpi_os_ioremap)(acpi_physical_address phys, acpi_size size) =
+ x86_acpi_os_ioremap;
+EXPORT_SYMBOL_GPL(acpi_os_ioremap);
+#endif
diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c
new file mode 100644
index 000000000000..d7c8ef1e354d
--- /dev/null
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -0,0 +1,298 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * cppc.c: CPPC Interface for x86
+ * Copyright (c) 2016, Intel Corporation.
+ */
+
+#include <linux/bitfield.h>
+
+#include <acpi/cppc_acpi.h>
+#include <asm/msr.h>
+#include <asm/processor.h>
+#include <asm/topology.h>
+
+#define CPPC_HIGHEST_PERF_PERFORMANCE 196
+#define CPPC_HIGHEST_PERF_PREFCORE 166
+
+enum amd_pref_core {
+ AMD_PREF_CORE_UNKNOWN = 0,
+ AMD_PREF_CORE_SUPPORTED,
+ AMD_PREF_CORE_UNSUPPORTED,
+};
+static enum amd_pref_core amd_pref_core_detected;
+static u64 boost_numerator;
+
+/* Refer to drivers/acpi/cppc_acpi.c for the description of functions */
+
+bool cpc_supported_by_cpu(void)
+{
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_AMD:
+ case X86_VENDOR_HYGON:
+ if (boot_cpu_data.x86 == 0x19 && ((boot_cpu_data.x86_model <= 0x0f) ||
+ (boot_cpu_data.x86_model >= 0x20 && boot_cpu_data.x86_model <= 0x2f)))
+ return true;
+ else if (boot_cpu_data.x86 == 0x17 &&
+ boot_cpu_data.x86_model >= 0x30 && boot_cpu_data.x86_model <= 0x7f)
+ return true;
+ return boot_cpu_has(X86_FEATURE_CPPC);
+ }
+ return false;
+}
+
+bool cpc_ffh_supported(void)
+{
+ return true;
+}
+
+int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val)
+{
+ int err;
+
+ err = rdmsrq_safe_on_cpu(cpunum, reg->address, val);
+ if (!err) {
+ u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1,
+ reg->bit_offset);
+
+ *val &= mask;
+ *val >>= reg->bit_offset;
+ }
+ return err;
+}
+
+int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val)
+{
+ u64 rd_val;
+ int err;
+
+ err = rdmsrq_safe_on_cpu(cpunum, reg->address, &rd_val);
+ if (!err) {
+ u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1,
+ reg->bit_offset);
+
+ val <<= reg->bit_offset;
+ val &= mask;
+ rd_val &= ~mask;
+ rd_val |= val;
+ err = wrmsrq_safe_on_cpu(cpunum, reg->address, rd_val);
+ }
+ return err;
+}
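
cpc_read_ffh() and cpc_write_ffh() above extract and insert an MSR bit-field described by (bit_offset, bit_width) using a GENMASK_ULL() mask. A self-contained sketch of that mask/shift read-modify-write, with the MSR access abstracted away (illustrative helpers, not kernel API):

#include <stdint.h>

/* Build a mask with bits h..l set, as GENMASK_ULL() does. */
#define GENMASK_ULL(h, l) \
        ((~0ULL << (l)) & (~0ULL >> (63 - (h))))

/* Extract the (off, width) field from a raw MSR value. */
static uint64_t field_read(uint64_t msr, unsigned int off, unsigned int width)
{
        uint64_t mask = GENMASK_ULL(off + width - 1, off);

        return (msr & mask) >> off;
}

/* Insert val into the (off, width) field, preserving all other bits. */
static uint64_t field_write(uint64_t msr, unsigned int off, unsigned int width,
                            uint64_t val)
{
        uint64_t mask = GENMASK_ULL(off + width - 1, off);

        return (msr & ~mask) | ((val << off) & mask);
}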
+
+static void amd_set_max_freq_ratio(void)
+{
+ struct cppc_perf_caps perf_caps;
+ u64 numerator, nominal_perf;
+ u64 perf_ratio;
+ int rc;
+
+ rc = cppc_get_perf_caps(0, &perf_caps);
+ if (rc) {
+ pr_warn("Could not retrieve perf counters (%d)\n", rc);
+ return;
+ }
+
+ rc = amd_get_boost_ratio_numerator(0, &numerator);
+ if (rc) {
+ pr_warn("Could not retrieve highest performance (%d)\n", rc);
+ return;
+ }
+ nominal_perf = perf_caps.nominal_perf;
+
+ if (!nominal_perf) {
+ pr_warn("Could not retrieve nominal performance\n");
+ return;
+ }
+
+ /* midpoint between max_boost and max_P */
+ perf_ratio = (div_u64(numerator * SCHED_CAPACITY_SCALE, nominal_perf) + SCHED_CAPACITY_SCALE) >> 1;
+
+ freq_invariance_set_perf_ratio(perf_ratio, false);
+}
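
amd_set_max_freq_ratio() takes the midpoint between the scaled boost ratio (numerator / nominal_perf, in SCHED_CAPACITY_SCALE units) and SCHED_CAPACITY_SCALE itself. A worked example of that arithmetic with made-up numbers (numerator 400, nominal_perf 250):

#include <stdint.h>
#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024ULL

/* Midpoint between the scaled boost ratio and the nominal ratio (1024). */
static uint64_t midpoint_perf_ratio(uint64_t numerator, uint64_t nominal_perf)
{
        return (numerator * SCHED_CAPACITY_SCALE / nominal_perf +
                SCHED_CAPACITY_SCALE) >> 1;
}

int main(void)
{
        /* hypothetical values: (400 * 1024 / 250 + 1024) / 2 = 1331 */
        printf("%llu\n", (unsigned long long)midpoint_perf_ratio(400, 250));
        return 0;
}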
+
+static DEFINE_MUTEX(freq_invariance_lock);
+
+static inline void init_freq_invariance_cppc(void)
+{
+ static bool init_done;
+
+ if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
+ return;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+ return;
+
+ mutex_lock(&freq_invariance_lock);
+ if (!init_done)
+ amd_set_max_freq_ratio();
+ init_done = true;
+ mutex_unlock(&freq_invariance_lock);
+}
+
+void acpi_processor_init_invariance_cppc(void)
+{
+ init_freq_invariance_cppc();
+}
+
+/*
+ * Get the highest performance register value.
+ * @cpu: CPU from which to get highest performance.
+ * @highest_perf: Return address for highest performance value.
+ *
+ * Return: 0 for success, negative error code otherwise.
+ */
+int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf)
+{
+ u64 val;
+ int ret;
+
+ if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
+ ret = rdmsrq_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &val);
+ if (ret)
+ goto out;
+
+ val = FIELD_GET(AMD_CPPC_HIGHEST_PERF_MASK, val);
+ } else {
+ ret = cppc_get_highest_perf(cpu, &val);
+ if (ret)
+ goto out;
+ }
+
+ WRITE_ONCE(*highest_perf, (u32)val);
+out:
+ return ret;
+}
+EXPORT_SYMBOL_GPL(amd_get_highest_perf);
+
+/**
+ * amd_detect_prefcore: Detect if CPUs in the system support preferred cores
+ * @detected: Output variable for the result of the detection.
+ *
+ * Determine whether CPUs in the system support preferred cores. On systems
+ * that support preferred cores, different highest perf values will be found
+ * on different cores. On other systems, the highest perf value will be the
+ * same on all cores.
+ *
+ * The result of the detection will be stored in the 'detected' parameter.
+ *
+ * Return: 0 for success, negative error code otherwise
+ */
+int amd_detect_prefcore(bool *detected)
+{
+ int cpu, count = 0;
+ u64 highest_perf[2] = {0};
+
+ if (WARN_ON(!detected))
+ return -EINVAL;
+
+ switch (amd_pref_core_detected) {
+ case AMD_PREF_CORE_SUPPORTED:
+ *detected = true;
+ return 0;
+ case AMD_PREF_CORE_UNSUPPORTED:
+ *detected = false;
+ return 0;
+ default:
+ break;
+ }
+
+ for_each_online_cpu(cpu) {
+ u32 tmp;
+ int ret;
+
+ ret = amd_get_highest_perf(cpu, &tmp);
+ if (ret)
+ return ret;
+
+ if (!count || (count == 1 && tmp != highest_perf[0]))
+ highest_perf[count++] = tmp;
+
+ if (count == 2)
+ break;
+ }
+
+ *detected = (count == 2);
+ boost_numerator = highest_perf[0];
+
+ amd_pref_core_detected = *detected ? AMD_PREF_CORE_SUPPORTED :
+ AMD_PREF_CORE_UNSUPPORTED;
+
+ pr_debug("AMD CPPC preferred core is %ssupported (highest perf: 0x%llx)\n",
+ *detected ? "" : "un", highest_perf[0]);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(amd_detect_prefcore);
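
amd_detect_prefcore() relies on a simple observation: on systems with preferred cores, at least two distinct highest-perf values show up across CPUs; otherwise every CPU reports the same value. A stand-alone sketch of that check over a plain array (the per-CPU iteration and MSR reads are abstracted away):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

static bool detect_prefcore(const uint32_t *perf, size_t n)
{
        size_t i;

        for (i = 1; i < n; i++)
                if (perf[i] != perf[0])
                        return true;    /* heterogeneous -> preferred cores */

        return false;                   /* uniform -> no preferred cores */
}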
+
+/**
+ * amd_get_boost_ratio_numerator: Get the numerator to use for boost ratio calculation
+ * @cpu: CPU to get numerator for.
+ * @numerator: Output variable for numerator.
+ *
+ * Determine the numerator to use for calculating the boost ratio on
+ * a CPU. On systems that support preferred cores, this will be a hardcoded
+ * value. On other systems this will be the highest performance register value.
+ *
+ * If the system boots with amd-pstate enabled but preferred cores disabled,
+ * the correct boost numerator is still returned to match hardware capabilities,
+ * even though the preferred-core scheduling hints are not enabled.
+ *
+ * Return: 0 for success, negative error code otherwise.
+ */
+int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator)
+{
+ enum x86_topology_cpu_type core_type = get_topology_cpu_type(&cpu_data(cpu));
+ bool prefcore;
+ int ret;
+ u32 tmp;
+
+ ret = amd_detect_prefcore(&prefcore);
+ if (ret)
+ return ret;
+
+ /* without preferred cores, return the highest perf register value */
+ if (!prefcore) {
+ *numerator = boost_numerator;
+ return 0;
+ }
+
+ /*
+ * For AMD CPUs with Family ID 19H and Model ID range 0x70 to 0x7f,
+ * the highest performance level is set to 196.
+ * https://bugzilla.kernel.org/show_bug.cgi?id=218759
+ */
+ if (cpu_feature_enabled(X86_FEATURE_ZEN4)) {
+ switch (boot_cpu_data.x86_model) {
+ case 0x70 ... 0x7f:
+ *numerator = CPPC_HIGHEST_PERF_PERFORMANCE;
+ return 0;
+ default:
+ break;
+ }
+ }
+
+ /* detect if running on heterogeneous design */
+ if (cpu_feature_enabled(X86_FEATURE_AMD_HTR_CORES)) {
+ switch (core_type) {
+ case TOPO_CPU_TYPE_UNKNOWN:
+ pr_warn("Undefined core type found for cpu %d\n", cpu);
+ break;
+ case TOPO_CPU_TYPE_PERFORMANCE:
+ /* use the max scale for performance cores */
+ *numerator = CPPC_HIGHEST_PERF_PERFORMANCE;
+ return 0;
+ case TOPO_CPU_TYPE_EFFICIENCY:
+ /* use the highest perf value for efficiency cores */
+ ret = amd_get_highest_perf(cpu, &tmp);
+ if (ret)
+ return ret;
+ *numerator = tmp;
+ return 0;
+ }
+ }
+
+ *numerator = CPPC_HIGHEST_PERF_PREFCORE;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(amd_get_boost_ratio_numerator);
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 8c44c232efcb..0281703da5e2 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2005 Intel Corporation
* Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
@@ -5,14 +6,18 @@
*/
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/init.h>
#include <linux/acpi.h>
#include <linux/cpu.h>
#include <linux/sched.h>
#include <acpi/processor.h>
-#include <asm/acpi.h>
+#include <asm/cpu_device_id.h>
+#include <asm/cpuid/api.h>
+#include <asm/mwait.h>
+#include <asm/special_insns.h>
+#include <asm/smp.h>
/*
* Initialize bm_flags based on the CPU cache properties
@@ -44,12 +49,61 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags,
/*
* On all recent Intel platforms, ARB_DISABLE is a nop.
* So, set bm_control to zero to indicate that ARB_DISABLE
- * is not required while entering C3 type state on
- * P4, Core and beyond CPUs
+ * is not required while entering C3 type state.
*/
if (c->x86_vendor == X86_VENDOR_INTEL &&
- (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 14)))
+ (c->x86 > 15 || (c->x86_vfm >= INTEL_CORE2_MEROM && c->x86_vfm <= INTEL_FAM6_LAST)))
+ flags->bm_control = 0;
+
+ if (c->x86_vendor == X86_VENDOR_CENTAUR) {
+ if (c->x86 > 6 || (c->x86 == 6 && c->x86_model == 0x0f &&
+ c->x86_stepping >= 0x0e)) {
+ /*
+ * For all recent Centaur CPUs, the ucode will make sure that each
+ * core can keep cache coherence with each other while entering C3
+ * type state. So, set bm_check to 1 to indicate that the kernel
+ * doesn't need to execute a cache flush operation (WBINVD) when
+ * entering C3 type state.
+ */
+ flags->bm_check = 1;
+ /*
+ * For all recent Centaur platforms, ARB_DISABLE is a nop.
+ * Set bm_control to zero to indicate that ARB_DISABLE is
+ * not required while entering C3 type state.
+ */
flags->bm_control = 0;
+ }
+ }
+
+ if (c->x86_vendor == X86_VENDOR_ZHAOXIN) {
+ /*
+ * All Zhaoxin CPUs that support C3 share the cache, and
+ * caches should not be flushed by software while
+ * entering C3 type state.
+ */
+ flags->bm_check = 1;
+ /*
+ * On all recent Zhaoxin platforms, ARB_DISABLE is a nop.
+ * So, set bm_control to zero to indicate that ARB_DISABLE
+ * is not required while entering C3 type state.
+ */
+ flags->bm_control = 0;
+ }
+ if (cpu_feature_enabled(X86_FEATURE_ZEN)) {
+ /*
+ * For all AMD Zen or newer CPUs that support C3, caches
+ * should not be flushed by software while entering C3
+ * type state. Set bm_check to 1 so that the kernel doesn't
+ * need to execute a cache flush operation.
+ */
+ flags->bm_check = 1;
+ /*
+ * In the current AMD C-state implementation ARB_DIS is no longer
+ * used. So set bm_control to zero to indicate that ARB_DIS is not
+ * required while entering C3 type state.
+ */
+ flags->bm_control = 0;
+ }
}
EXPORT_SYMBOL(acpi_processor_power_init_bm_check);
@@ -61,20 +115,10 @@ struct cstate_entry {
unsigned int ecx;
} states[ACPI_PROCESSOR_MAX_POWER];
};
-static struct cstate_entry *cpu_cstate_entry; /* per CPU ptr */
+static struct cstate_entry __percpu *cpu_cstate_entry; /* per CPU ptr */
static short mwait_supported[ACPI_PROCESSOR_MAX_POWER];
-#define MWAIT_SUBSTATE_MASK (0xf)
-#define MWAIT_CSTATE_MASK (0xf)
-#define MWAIT_SUBSTATE_SIZE (4)
-
-#define CPUID_MWAIT_LEAF (5)
-#define CPUID5_ECX_EXTENSIONS_SUPPORTED (0x1)
-#define CPUID5_ECX_INTERRUPT_BREAK (0x2)
-
-#define MWAIT_ECX_INTERRUPT_BREAK (0x1)
-
#define NATIVE_CSTATE_BEYOND_HALT (2)
static long acpi_processor_ffh_cstate_probe_cpu(void *_cx)
@@ -86,16 +130,19 @@ static long acpi_processor_ffh_cstate_probe_cpu(void *_cx)
unsigned int cstate_type; /* C-state type and not ACPI C-state type */
unsigned int num_cstate_subtype;
- cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
+ cpuid(CPUID_LEAF_MWAIT, &eax, &ebx, &ecx, &edx);
/* Check whether this particular cx_type (in CST) is supported or not */
- cstate_type = ((cx->address >> MWAIT_SUBSTATE_SIZE) &
- MWAIT_CSTATE_MASK) + 1;
+ cstate_type = (((cx->address >> MWAIT_SUBSTATE_SIZE) &
+ MWAIT_CSTATE_MASK) + 1) & MWAIT_CSTATE_MASK;
edx_part = edx >> (cstate_type * MWAIT_SUBSTATE_SIZE);
num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK;
retval = 0;
- if (num_cstate_subtype < (cx->address & MWAIT_SUBSTATE_MASK)) {
+ /* If the HW does not support any sub-states in this C-state */
+ if (num_cstate_subtype == 0) {
+ pr_warn(FW_BUG "ACPI MWAIT C-state 0x%x not supported by HW (0x%x)\n",
+ cx->address, edx_part);
retval = -1;
goto out;
}
@@ -110,11 +157,11 @@ static long acpi_processor_ffh_cstate_probe_cpu(void *_cx)
if (!mwait_supported[cstate_type]) {
mwait_supported[cstate_type] = 1;
printk(KERN_DEBUG
- "Monitor-Mwait will be used to enter C-%d "
- "state\n", cx->type);
+ "Monitor-Mwait will be used to enter C-%d state\n",
+ cx->type);
}
snprintf(cx->desc,
- ACPI_CX_DESC_LEN, "ACPI FFH INTEL MWAIT 0x%x",
+ ACPI_CX_DESC_LEN, "ACPI FFH MWAIT 0x%x",
cx->address);
out:
return retval;
@@ -127,7 +174,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
struct cpuinfo_x86 *c = &cpu_data(cpu);
long retval;
- if (!cpu_cstate_entry || c->cpuid_level < CPUID_MWAIT_LEAF)
+ if (!cpu_cstate_entry || c->cpuid_level < CPUID_LEAF_MWAIT)
return -1;
if (reg->bit_offset != NATIVE_CSTATE_BEYOND_HALT)
@@ -139,17 +186,37 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
/* Make sure we are running on right CPU */
- retval = work_on_cpu(cpu, acpi_processor_ffh_cstate_probe_cpu, cx);
+ retval = call_on_cpu(cpu, acpi_processor_ffh_cstate_probe_cpu, cx,
+ false);
if (retval == 0) {
/* Use the hint in CST */
percpu_entry->states[cx->index].eax = cx->address;
percpu_entry->states[cx->index].ecx = MWAIT_ECX_INTERRUPT_BREAK;
}
+
+ /*
+ * For _CST FFH on Intel, if GAS.access_size bit 1 is cleared,
+ * then we should skip checking BM_STS for this C-state.
+ * ref: "Intel Processor Vendor-Specific ACPI Interface Specification"
+ */
+ if ((c->x86_vendor == X86_VENDOR_INTEL) && !(reg->access_size & 0x2))
+ cx->bm_sts_skip = 1;
+
return retval;
}
EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
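
acpi_processor_ffh_cstate_probe_cpu() above decodes the _CST FFH 'address' field as an MWAIT hint: the upper nibble selects the C-state type (plus one) and the lower nibble the sub-state, which is then checked against the matching CPUID leaf 5 EDX nibble. The following sketch shows only the hint decoding, reusing the same mask/shift constants (example values only):

#include <stdio.h>

#define MWAIT_SUBSTATE_SIZE 4
#define MWAIT_SUBSTATE_MASK 0xf
#define MWAIT_CSTATE_MASK   0xf

int main(void)
{
        unsigned int hint = 0x20;       /* example MWAIT hint from _CST */
        unsigned int cstate_type =
                (((hint >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) + 1) &
                MWAIT_CSTATE_MASK;
        unsigned int substate = hint & MWAIT_SUBSTATE_MASK;

        printf("C-state type %u, sub-state %u\n", cstate_type, substate);
        return 0;
}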
-void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
+void __noreturn acpi_processor_ffh_play_dead(struct acpi_processor_cx *cx)
+{
+ unsigned int cpu = smp_processor_id();
+ struct cstate_entry *percpu_entry;
+
+ percpu_entry = per_cpu_ptr(cpu_cstate_entry, cpu);
+ mwait_play_dead(percpu_entry->states[cx->index].eax);
+}
+EXPORT_SYMBOL_GPL(acpi_processor_ffh_play_dead);
+
+void __cpuidle acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
{
unsigned int cpu = smp_processor_id();
struct cstate_entry *percpu_entry;
@@ -163,7 +230,10 @@ EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_enter);
static int __init ffh_cstate_init(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
- if (c->x86_vendor != X86_VENDOR_INTEL)
+
+ if (c->x86_vendor != X86_VENDOR_INTEL &&
+ c->x86_vendor != X86_VENDOR_AMD &&
+ c->x86_vendor != X86_VENDOR_HYGON)
return -1;
cpu_cstate_entry = alloc_percpu(struct cstate_entry);
diff --git a/arch/x86/kernel/acpi/madt_playdead.S b/arch/x86/kernel/acpi/madt_playdead.S
new file mode 100644
index 000000000000..aefb9cb583ad
--- /dev/null
+++ b/arch/x86/kernel/acpi/madt_playdead.S
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <asm/nospec-branch.h>
+#include <asm/page_types.h>
+#include <asm/processor-flags.h>
+
+ .text
+ .align PAGE_SIZE
+
+/*
+ * asm_acpi_mp_play_dead() - Hand over control of the CPU to the BIOS
+ *
+ * rdi: Address of the ACPI MADT MPWK ResetVector
+ * rsi: PGD of the identity mapping
+ */
+SYM_FUNC_START(asm_acpi_mp_play_dead)
+ ANNOTATE_NOENDBR
+ /* Turn off global entries. Following CR3 write will flush them. */
+ movq %cr4, %rdx
+ andq $~(X86_CR4_PGE), %rdx
+ movq %rdx, %cr4
+
+ /* Switch to identity mapping */
+ movq %rsi, %cr3
+
+ /* Jump to reset vector */
+ ANNOTATE_RETPOLINE_SAFE
+ jmp *%rdi
+SYM_FUNC_END(asm_acpi_mp_play_dead)
diff --git a/arch/x86/kernel/acpi/madt_wakeup.c b/arch/x86/kernel/acpi/madt_wakeup.c
new file mode 100644
index 000000000000..6d7603511f52
--- /dev/null
+++ b/arch/x86/kernel/acpi/madt_wakeup.c
@@ -0,0 +1,249 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/acpi.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/io.h>
+#include <linux/kexec.h>
+#include <linux/memblock.h>
+#include <linux/pgtable.h>
+#include <linux/sched/hotplug.h>
+#include <asm/apic.h>
+#include <asm/barrier.h>
+#include <asm/init.h>
+#include <asm/intel_pt.h>
+#include <asm/nmi.h>
+#include <asm/processor.h>
+#include <asm/reboot.h>
+
+/* Physical address of the Multiprocessor Wakeup Structure mailbox */
+static u64 acpi_mp_wake_mailbox_paddr __ro_after_init;
+
+/* Virtual address of the Multiprocessor Wakeup Structure mailbox */
+static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox;
+
+static u64 acpi_mp_pgd __ro_after_init;
+static u64 acpi_mp_reset_vector_paddr __ro_after_init;
+
+static void acpi_mp_stop_this_cpu(void)
+{
+ asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd);
+}
+
+static void acpi_mp_play_dead(void)
+{
+ play_dead_common();
+ asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd);
+}
+
+static void acpi_mp_cpu_die(unsigned int cpu)
+{
+ u32 apicid = per_cpu(x86_cpu_to_apicid, cpu);
+ unsigned long timeout;
+
+ /*
+ * Use the TEST mailbox command to prove that the BIOS got control
+ * over the CPU before declaring it dead.
+ *
+ * The BIOS has to clear the 'command' field of the mailbox.
+ */
+ acpi_mp_wake_mailbox->apic_id = apicid;
+ smp_store_release(&acpi_mp_wake_mailbox->command,
+ ACPI_MP_WAKE_COMMAND_TEST);
+
+ /* Don't wait longer than a second. */
+ timeout = USEC_PER_SEC;
+ while (READ_ONCE(acpi_mp_wake_mailbox->command) && --timeout)
+ udelay(1);
+
+ if (!timeout)
+ pr_err("Failed to hand over CPU %d to BIOS\n", cpu);
+}
+
+/* The argument is required to match the type of x86_mapping_info::alloc_pgt_page */
+static void __init *alloc_pgt_page(void *dummy)
+{
+ return memblock_alloc(PAGE_SIZE, PAGE_SIZE);
+}
+
+static void __init free_pgt_page(void *pgt, void *dummy)
+{
+ return memblock_free(pgt, PAGE_SIZE);
+}
+
+static int __init acpi_mp_setup_reset(u64 reset_vector)
+{
+ struct x86_mapping_info info = {
+ .alloc_pgt_page = alloc_pgt_page,
+ .free_pgt_page = free_pgt_page,
+ .page_flag = __PAGE_KERNEL_LARGE_EXEC,
+ .kernpg_flag = _KERNPG_TABLE_NOENC,
+ };
+ unsigned long mstart, mend;
+ pgd_t *pgd;
+
+ pgd = alloc_pgt_page(NULL);
+ if (!pgd)
+ return -ENOMEM;
+
+ for (int i = 0; i < nr_pfn_mapped; i++) {
+ mstart = pfn_mapped[i].start << PAGE_SHIFT;
+ mend = pfn_mapped[i].end << PAGE_SHIFT;
+ if (kernel_ident_mapping_init(&info, pgd, mstart, mend)) {
+ kernel_ident_mapping_free(&info, pgd);
+ return -ENOMEM;
+ }
+ }
+
+ mstart = PAGE_ALIGN_DOWN(reset_vector);
+ mend = mstart + PAGE_SIZE;
+ if (kernel_ident_mapping_init(&info, pgd, mstart, mend)) {
+ kernel_ident_mapping_free(&info, pgd);
+ return -ENOMEM;
+ }
+
+ /*
+ * Make sure asm_acpi_mp_play_dead() is present in the identity mapping
+ * at the same place as in the kernel page tables.
+ * asm_acpi_mp_play_dead() switches to the identity mapping and the
+ * function must be present at the same spot in the virtual address space
+ * before and after switching page tables.
+ */
+ info.offset = __START_KERNEL_map - phys_base;
+ mstart = PAGE_ALIGN_DOWN(__pa(asm_acpi_mp_play_dead));
+ mend = mstart + PAGE_SIZE;
+ if (kernel_ident_mapping_init(&info, pgd, mstart, mend)) {
+ kernel_ident_mapping_free(&info, pgd);
+ return -ENOMEM;
+ }
+
+ smp_ops.play_dead = acpi_mp_play_dead;
+ smp_ops.stop_this_cpu = acpi_mp_stop_this_cpu;
+ smp_ops.cpu_die = acpi_mp_cpu_die;
+
+ acpi_mp_reset_vector_paddr = reset_vector;
+ acpi_mp_pgd = __pa(pgd);
+
+ return 0;
+}
+
+static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip, unsigned int cpu)
+{
+ if (!acpi_mp_wake_mailbox_paddr) {
+ pr_warn_once("No MADT mailbox: cannot bringup secondary CPUs. Booting with kexec?\n");
+ return -EOPNOTSUPP;
+ }
+
+ /*
+ * Remap mailbox memory only for the first call to acpi_wakeup_cpu().
+ *
+ * Wakeup of secondary CPUs is fully serialized in the core code.
+ * No need to protect acpi_mp_wake_mailbox from concurrent accesses.
+ */
+ if (!acpi_mp_wake_mailbox) {
+ acpi_mp_wake_mailbox = memremap(acpi_mp_wake_mailbox_paddr,
+ sizeof(*acpi_mp_wake_mailbox),
+ MEMREMAP_WB);
+ }
+
+ /*
+ * Mailbox memory is shared between the firmware and the OS. The firmware
+ * listens on the mailbox command address, and once it receives the wakeup
+ * command, the CPU associated with the given apicid will be booted.
+ *
+ * The value of 'apic_id' and 'wakeup_vector' must be visible to the
+ * firmware before the wakeup command is visible. smp_store_release()
+ * ensures ordering and visibility.
+ */
+ acpi_mp_wake_mailbox->apic_id = apicid;
+ acpi_mp_wake_mailbox->wakeup_vector = start_ip;
+ smp_store_release(&acpi_mp_wake_mailbox->command,
+ ACPI_MP_WAKE_COMMAND_WAKEUP);
+
+ /*
+ * Wait for the CPU to wake up.
+ *
+ * The CPU being woken up is essentially in a spin loop waiting to be
+ * woken up. It should not take long for it to wake up and acknowledge by
+ * zeroing out ->command.
+ *
+ * The ACPI specification doesn't provide any guidance on how long the
+ * kernel has to wait for a wake-up acknowledgment. It also doesn't
+ * provide a way to cancel a wake-up request if it takes too long.
+ *
+ * In a TDX environment, the VMM has control over how long it takes to
+ * wake up a secondary CPU. It can postpone scheduling the secondary vCPU
+ * indefinitely. Giving up on the wake-up request and reporting an error
+ * opens a possible attack vector for the VMM: it can wake up a secondary
+ * CPU when the kernel doesn't expect it. Wait until the wake-up request
+ * yields a positive result.
+ */
+ while (READ_ONCE(acpi_mp_wake_mailbox->command))
+ cpu_relax();
+
+ return 0;
+}
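
acpi_wakeup_cpu() above depends on publication ordering: apic_id and wakeup_vector must be visible to the firmware before the command becomes visible, which is what smp_store_release() guarantees, and completion is signalled by the firmware clearing the command field. A minimal C11-atomics sketch of that handshake follows; the structure layout here is illustrative, not the ACPI-defined mailbox layout:

#include <stdatomic.h>
#include <stdint.h>

struct mp_wake_mailbox {
        uint32_t apic_id;
        uint64_t wakeup_vector;
        _Atomic uint16_t command;
};

#define CMD_WAKEUP 1

static void wake_cpu(struct mp_wake_mailbox *mb, uint32_t apicid,
                     uint64_t start_ip)
{
        mb->apic_id = apicid;
        mb->wakeup_vector = start_ip;
        /* publish the command only after the fields above are written */
        atomic_store_explicit(&mb->command, CMD_WAKEUP, memory_order_release);

        /* wait for the firmware to acknowledge by clearing the command */
        while (atomic_load_explicit(&mb->command, memory_order_acquire))
                ;
}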
+
+static void acpi_mp_disable_offlining(struct acpi_madt_multiproc_wakeup *mp_wake)
+{
+ cpu_hotplug_disable_offlining();
+
+ /*
+ * The ACPI MADT doesn't allow a CPU to be offlined after it has been
+ * onlined. This limits kexec: the second kernel won't be able to use
+ * more than one CPU.
+ *
+ * To prevent a kexec kernel from onlining secondary CPUs, invalidate the
+ * mailbox address in the ACPI MADT wakeup structure, which prevents the
+ * kexec kernel from using it.
+ *
+ * This is safe as the booting kernel has the mailbox address cached
+ * already and acpi_wakeup_cpu() uses the cached value to bring up the
+ * secondary CPUs.
+ *
+ * Note: This is a Linux specific convention and not covered by the
+ * ACPI specification.
+ */
+ mp_wake->mailbox_address = 0;
+}
+
+int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,
+ const unsigned long end)
+{
+ struct acpi_madt_multiproc_wakeup *mp_wake;
+
+ mp_wake = (struct acpi_madt_multiproc_wakeup *)header;
+
+ /*
+ * Cannot use the standard BAD_MADT_ENTRY() to sanity check the @mp_wake
+ * entry. 'sizeof (struct acpi_madt_multiproc_wakeup)' can be larger
+ * than the actual size of the MP wakeup entry in the ACPI table because the
+ * 'reset_vector' is only available in the V1 MP wakeup structure.
+ */
+ if (!mp_wake)
+ return -EINVAL;
+ if (end - (unsigned long)mp_wake < ACPI_MADT_MP_WAKEUP_SIZE_V0)
+ return -EINVAL;
+ if (mp_wake->header.length < ACPI_MADT_MP_WAKEUP_SIZE_V0)
+ return -EINVAL;
+
+ acpi_table_print_madt_entry(&header->common);
+
+ acpi_mp_wake_mailbox_paddr = mp_wake->mailbox_address;
+
+ if (mp_wake->version >= ACPI_MADT_MP_WAKEUP_VERSION_V1 &&
+ mp_wake->header.length >= ACPI_MADT_MP_WAKEUP_SIZE_V1) {
+ if (acpi_mp_setup_reset(mp_wake->reset_vector)) {
+ pr_warn("Failed to setup MADT reset vector\n");
+ acpi_mp_disable_offlining(mp_wake);
+ }
+ } else {
+ /*
+ * CPU offlining requires version 1 of the ACPI MADT wakeup
+ * structure.
+ */
+ acpi_mp_disable_offlining(mp_wake);
+ }
+
+ apic_update_callback(wakeup_secondary_cpu_64, acpi_wakeup_cpu);
+
+ return 0;
+}
diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c
deleted file mode 100644
index d296f4a195c9..000000000000
--- a/arch/x86/kernel/acpi/processor.c
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright (C) 2005 Intel Corporation
- * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
- * - Added _PDC for platforms with Intel CPUs
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/acpi.h>
-
-#include <acpi/processor.h>
-#include <asm/acpi.h>
-
-static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c)
-{
- struct acpi_object_list *obj_list;
- union acpi_object *obj;
- u32 *buf;
-
- /* allocate and initialize pdc. It will be used later. */
- obj_list = kmalloc(sizeof(struct acpi_object_list), GFP_KERNEL);
- if (!obj_list) {
- printk(KERN_ERR "Memory allocation error\n");
- return;
- }
-
- obj = kmalloc(sizeof(union acpi_object), GFP_KERNEL);
- if (!obj) {
- printk(KERN_ERR "Memory allocation error\n");
- kfree(obj_list);
- return;
- }
-
- buf = kmalloc(12, GFP_KERNEL);
- if (!buf) {
- printk(KERN_ERR "Memory allocation error\n");
- kfree(obj);
- kfree(obj_list);
- return;
- }
-
- buf[0] = ACPI_PDC_REVISION_ID;
- buf[1] = 1;
- buf[2] = ACPI_PDC_C_CAPABILITY_SMP;
-
- /*
- * The default of PDC_SMP_T_SWCOORD bit is set for intel x86 cpu so
- * that OSPM is capable of native ACPI throttling software
- * coordination using BIOS supplied _TSD info.
- */
- buf[2] |= ACPI_PDC_SMP_T_SWCOORD;
- if (cpu_has(c, X86_FEATURE_EST))
- buf[2] |= ACPI_PDC_EST_CAPABILITY_SWSMP;
-
- if (cpu_has(c, X86_FEATURE_ACPI))
- buf[2] |= ACPI_PDC_T_FFH;
-
- /*
- * If mwait/monitor is unsupported, C2/C3_FFH will be disabled
- */
- if (!cpu_has(c, X86_FEATURE_MWAIT))
- buf[2] &= ~(ACPI_PDC_C_C2C3_FFH);
-
- obj->type = ACPI_TYPE_BUFFER;
- obj->buffer.length = 12;
- obj->buffer.pointer = (u8 *) buf;
- obj_list->count = 1;
- obj_list->pointer = obj;
- pr->pdc = obj_list;
-
- return;
-}
-
-
-/* Initialize _PDC data based on the CPU vendor */
-void arch_acpi_processor_init_pdc(struct acpi_processor *pr)
-{
- struct cpuinfo_x86 *c = &cpu_data(pr->id);
-
- pr->pdc = NULL;
- if (c->x86_vendor == X86_VENDOR_INTEL)
- init_intel_pdc(pr, c);
-
- return;
-}
-
-EXPORT_SYMBOL(arch_acpi_processor_init_pdc);
-
-void arch_acpi_processor_cleanup_pdc(struct acpi_processor *pr)
-{
- if (pr->pdc) {
- kfree(pr->pdc->pointer->buffer.pointer);
- kfree(pr->pdc->pointer);
- kfree(pr->pdc);
- pr->pdc = NULL;
- }
-}
-
-EXPORT_SYMBOL(arch_acpi_processor_cleanup_pdc);
diff --git a/arch/x86/kernel/acpi/realmode/.gitignore b/arch/x86/kernel/acpi/realmode/.gitignore
deleted file mode 100644
index 58f1f48a58f8..000000000000
--- a/arch/x86/kernel/acpi/realmode/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-wakeup.bin
-wakeup.elf
-wakeup.lds
diff --git a/arch/x86/kernel/acpi/realmode/Makefile b/arch/x86/kernel/acpi/realmode/Makefile
deleted file mode 100644
index 6a564ac67ef5..000000000000
--- a/arch/x86/kernel/acpi/realmode/Makefile
+++ /dev/null
@@ -1,59 +0,0 @@
-#
-# arch/x86/kernel/acpi/realmode/Makefile
-#
-# This file is subject to the terms and conditions of the GNU General Public
-# License. See the file "COPYING" in the main directory of this archive
-# for more details.
-#
-
-always := wakeup.bin
-targets := wakeup.elf wakeup.lds
-
-wakeup-y += wakeup.o wakemain.o video-mode.o copy.o bioscall.o regs.o
-
-# The link order of the video-*.o modules can matter. In particular,
-# video-vga.o *must* be listed first, followed by video-vesa.o.
-# Hardware-specific drivers should follow in the order they should be
-# probed, and video-bios.o should typically be last.
-wakeup-y += video-vga.o
-wakeup-y += video-vesa.o
-wakeup-y += video-bios.o
-
-targets += $(wakeup-y)
-
-bootsrc := $(src)/../../../boot
-
-# ---------------------------------------------------------------------------
-
-# How to compile the 16-bit code. Note we always compile for -march=i386,
-# that way we can complain to the user if the CPU is insufficient.
-# Compile with _SETUP since this is similar to the boot-time setup code.
-KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D_WAKEUP -D__KERNEL__ \
- -I$(srctree)/$(bootsrc) \
- $(cflags-y) \
- -Wall -Wstrict-prototypes \
- -march=i386 -mregparm=3 \
- -include $(srctree)/$(bootsrc)/code16gcc.h \
- -fno-strict-aliasing -fomit-frame-pointer \
- $(call cc-option, -ffreestanding) \
- $(call cc-option, -fno-toplevel-reorder,\
- $(call cc-option, -fno-unit-at-a-time)) \
- $(call cc-option, -fno-stack-protector) \
- $(call cc-option, -mpreferred-stack-boundary=2)
-KBUILD_CFLAGS += $(call cc-option, -m32)
-KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
-GCOV_PROFILE := n
-
-WAKEUP_OBJS = $(addprefix $(obj)/,$(wakeup-y))
-
-LDFLAGS_wakeup.elf := -T
-
-CPPFLAGS_wakeup.lds += -P -C
-
-$(obj)/wakeup.elf: $(obj)/wakeup.lds $(WAKEUP_OBJS) FORCE
- $(call if_changed,ld)
-
-OBJCOPYFLAGS_wakeup.bin := -O binary
-
-$(obj)/wakeup.bin: $(obj)/wakeup.elf FORCE
- $(call if_changed,objcopy)
diff --git a/arch/x86/kernel/acpi/realmode/bioscall.S b/arch/x86/kernel/acpi/realmode/bioscall.S
deleted file mode 100644
index f51eb0bb56ce..000000000000
--- a/arch/x86/kernel/acpi/realmode/bioscall.S
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/bioscall.S"
diff --git a/arch/x86/kernel/acpi/realmode/copy.S b/arch/x86/kernel/acpi/realmode/copy.S
deleted file mode 100644
index dc59ebee69d8..000000000000
--- a/arch/x86/kernel/acpi/realmode/copy.S
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/copy.S"
diff --git a/arch/x86/kernel/acpi/realmode/regs.c b/arch/x86/kernel/acpi/realmode/regs.c
deleted file mode 100644
index 6206033ba202..000000000000
--- a/arch/x86/kernel/acpi/realmode/regs.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/regs.c"
diff --git a/arch/x86/kernel/acpi/realmode/video-bios.c b/arch/x86/kernel/acpi/realmode/video-bios.c
deleted file mode 100644
index 7deabc144a27..000000000000
--- a/arch/x86/kernel/acpi/realmode/video-bios.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/video-bios.c"
diff --git a/arch/x86/kernel/acpi/realmode/video-mode.c b/arch/x86/kernel/acpi/realmode/video-mode.c
deleted file mode 100644
index 328ad209f113..000000000000
--- a/arch/x86/kernel/acpi/realmode/video-mode.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/video-mode.c"
diff --git a/arch/x86/kernel/acpi/realmode/video-vesa.c b/arch/x86/kernel/acpi/realmode/video-vesa.c
deleted file mode 100644
index 9dbb9672226a..000000000000
--- a/arch/x86/kernel/acpi/realmode/video-vesa.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/video-vesa.c"
diff --git a/arch/x86/kernel/acpi/realmode/video-vga.c b/arch/x86/kernel/acpi/realmode/video-vga.c
deleted file mode 100644
index bcc81255f374..000000000000
--- a/arch/x86/kernel/acpi/realmode/video-vga.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/video-vga.c"
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S
deleted file mode 100644
index 580b4e296010..000000000000
--- a/arch/x86/kernel/acpi/realmode/wakeup.S
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- * ACPI wakeup real mode startup stub
- */
-#include <asm/segment.h>
-#include <asm/msr-index.h>
-#include <asm/page_types.h>
-#include <asm/pgtable_types.h>
-#include <asm/processor-flags.h>
-
- .code16
- .section ".header", "a"
-
-/* This should match the structure in wakeup.h */
- .globl wakeup_header
-wakeup_header:
-video_mode: .short 0 /* Video mode number */
-pmode_return: .byte 0x66, 0xea /* ljmpl */
- .long 0 /* offset goes here */
- .short __KERNEL_CS
-pmode_cr0: .long 0 /* Saved %cr0 */
-pmode_cr3: .long 0 /* Saved %cr3 */
-pmode_cr4: .long 0 /* Saved %cr4 */
-pmode_efer: .quad 0 /* Saved EFER */
-pmode_gdt: .quad 0
-realmode_flags: .long 0
-real_magic: .long 0
-trampoline_segment: .word 0
-_pad1: .byte 0
-wakeup_jmp: .byte 0xea /* ljmpw */
-wakeup_jmp_off: .word 3f
-wakeup_jmp_seg: .word 0
-wakeup_gdt: .quad 0, 0, 0
-signature: .long 0x51ee1111
-
- .text
- .globl _start
- .code16
-wakeup_code:
-_start:
- cli
- cld
-
- /* Apparently some dimwit BIOS programmers don't know how to
- program a PM to RM transition, and we might end up here with
- junk in the data segment descriptor registers. The only way
- to repair that is to go into PM and fix it ourselves... */
- movw $16, %cx
- lgdtl %cs:wakeup_gdt
- movl %cr0, %eax
- orb $X86_CR0_PE, %al
- movl %eax, %cr0
- jmp 1f
-1: ljmpw $8, $2f
-2:
- movw %cx, %ds
- movw %cx, %es
- movw %cx, %ss
- movw %cx, %fs
- movw %cx, %gs
-
- andb $~X86_CR0_PE, %al
- movl %eax, %cr0
- jmp wakeup_jmp
-3:
- /* Set up segments */
- movw %cs, %ax
- movw %ax, %ds
- movw %ax, %es
- movw %ax, %ss
- lidtl wakeup_idt
-
- movl $wakeup_stack_end, %esp
-
- /* Clear the EFLAGS */
- pushl $0
- popfl
-
- /* Check header signature... */
- movl signature, %eax
- cmpl $0x51ee1111, %eax
- jne bogus_real_magic
-
- /* Check we really have everything... */
- movl end_signature, %eax
- cmpl $0x65a22c82, %eax
- jne bogus_real_magic
-
- /* Call the C code */
- calll main
-
- /* Do any other stuff... */
-
-#ifndef CONFIG_64BIT
- /* This could also be done in C code... */
- movl pmode_cr3, %eax
- movl %eax, %cr3
-
- movl pmode_cr4, %ecx
- jecxz 1f
- movl %ecx, %cr4
-1:
- movl pmode_efer, %eax
- movl pmode_efer + 4, %edx
- movl %eax, %ecx
- orl %edx, %ecx
- jz 1f
- movl $0xc0000080, %ecx
- wrmsr
-1:
-
- lgdtl pmode_gdt
-
- /* This really couldn't... */
- movl pmode_cr0, %eax
- movl %eax, %cr0
- jmp pmode_return
-#else
- pushw $0
- pushw trampoline_segment
- pushw $0
- lret
-#endif
-
-bogus_real_magic:
-1:
- hlt
- jmp 1b
-
- .data
- .balign 8
-
- /* This is the standard real-mode IDT */
-wakeup_idt:
- .word 0xffff /* limit */
- .long 0 /* address */
- .word 0
-
- .globl HEAP, heap_end
-HEAP:
- .long wakeup_heap
-heap_end:
- .long wakeup_stack
-
- .bss
-wakeup_heap:
- .space 2048
-wakeup_stack:
- .space 2048
-wakeup_stack_end:
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.lds.S b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
deleted file mode 100644
index 7da00b799cda..000000000000
--- a/arch/x86/kernel/acpi/realmode/wakeup.lds.S
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * wakeup.ld
- *
- * Linker script for the real-mode wakeup code
- */
-#undef i386
-#include "wakeup.h"
-
-OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
-OUTPUT_ARCH(i386)
-ENTRY(_start)
-
-SECTIONS
-{
- . = 0;
- .text : {
- *(.text*)
- }
-
- . = ALIGN(16);
- .rodata : {
- *(.rodata*)
- }
-
- .videocards : {
- video_cards = .;
- *(.videocards)
- video_cards_end = .;
- }
-
- . = ALIGN(16);
- .data : {
- *(.data*)
- }
-
- .signature : {
- end_signature = .;
- LONG(0x65a22c82)
- }
-
- . = ALIGN(16);
- .bss : {
- __bss_start = .;
- *(.bss)
- __bss_end = .;
- }
-
- . = HEADER_OFFSET;
- .header : {
- *(.header)
- }
-
- . = ALIGN(16);
- _end = .;
-
- /DISCARD/ : {
- *(.note*)
- }
-
- . = ASSERT(_end <= WAKEUP_SIZE, "Wakeup too big!");
-}
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index ca93638ba430..91fa262f0e30 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -1,150 +1,156 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* sleep.c - x86-specific ACPI sleep support.
*
* Copyright (C) 2001-2003 Patrick Mochel
- * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 2001-2003 Pavel Machek <pavel@ucw.cz>
*/
#include <linux/acpi.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/dmi.h>
#include <linux/cpumask.h>
+#include <linux/pgtable.h>
#include <asm/segment.h>
#include <asm/desc.h>
-
-#include "realmode/wakeup.h"
+#include <asm/cacheflush.h>
+#include <asm/realmode.h>
+#include <asm/hypervisor.h>
+#include <asm/msr.h>
+#include <asm/smp.h>
+
+#include <linux/ftrace.h>
+#include "../../realmode/rm/wakeup.h"
#include "sleep.h"
-unsigned long acpi_wakeup_address;
unsigned long acpi_realmode_flags;
-/* address in low memory of the wakeup routine. */
-static unsigned long acpi_realmode;
-
#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
static char temp_stack[4096];
#endif
/**
- * acpi_save_state_mem - save kernel state
+ * acpi_get_wakeup_address - provide physical address for S3 wakeup
*
- * Create an identity mapped page table and copy the wakeup routine to
- * low memory.
+ * Returns the physical address where the kernel should be resumed after the
+ * system awakes from S3, e.g. for programming into the firmware waking vector.
+ */
+unsigned long acpi_get_wakeup_address(void)
+{
+ return ((unsigned long)(real_mode_header->wakeup_start));
+}
+
+/**
+ * x86_acpi_enter_sleep_state - enter sleep state
+ * @state: Sleep state to enter.
*
- * Note that this is too late to change acpi_wakeup_address.
+ * Wrapper around acpi_enter_sleep_state() to be called by assembly.
*/
-int acpi_save_state_mem(void)
+asmlinkage acpi_status __visible x86_acpi_enter_sleep_state(u8 state)
{
- struct wakeup_header *header;
+ return acpi_enter_sleep_state(state);
+}
- if (!acpi_realmode) {
- printk(KERN_ERR "Could not allocate memory during boot, "
- "S3 disabled\n");
- return -ENOMEM;
- }
- memcpy((void *)acpi_realmode, &wakeup_code_start, WAKEUP_SIZE);
+/**
+ * x86_acpi_suspend_lowlevel - save kernel state
+ *
+ * Create an identity mapped page table and copy the wakeup routine to
+ * low memory.
+ */
+int x86_acpi_suspend_lowlevel(void)
+{
+ struct wakeup_header *header =
+ (struct wakeup_header *) __va(real_mode_header->wakeup_header);
- header = (struct wakeup_header *)(acpi_realmode + HEADER_OFFSET);
- if (header->signature != 0x51ee1111) {
+ if (header->signature != WAKEUP_HEADER_SIGNATURE) {
printk(KERN_ERR "wakeup header does not match\n");
return -EINVAL;
}
header->video_mode = saved_video_mode;
- header->wakeup_jmp_seg = acpi_wakeup_address >> 4;
-
- /*
- * Set up the wakeup GDT. We set these up as Big Real Mode,
- * that is, with limits set to 4 GB. At least the Lenovo
- * Thinkpad X61 is known to need this for the video BIOS
- * initialization quirk to work; this is likely to also
- * be the case for other laptops or integrated video devices.
- */
-
- /* GDT[0]: GDT self-pointer */
- header->wakeup_gdt[0] =
- (u64)(sizeof(header->wakeup_gdt) - 1) +
- ((u64)(acpi_wakeup_address +
- ((char *)&header->wakeup_gdt - (char *)acpi_realmode))
- << 16);
- /* GDT[1]: big real mode-like code segment */
- header->wakeup_gdt[1] =
- GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff);
- /* GDT[2]: big real mode-like data segment */
- header->wakeup_gdt[2] =
- GDT_ENTRY(0x8093, acpi_wakeup_address, 0xfffff);
+ header->pmode_behavior = 0;
#ifndef CONFIG_64BIT
- store_gdt((struct desc_ptr *)&header->pmode_gdt);
+ native_store_gdt((struct desc_ptr *)&header->pmode_gdt);
- header->pmode_efer_low = nx_enabled;
- if (header->pmode_efer_low & 1) {
- /* This is strange, why not save efer, always? */
- rdmsr(MSR_EFER, header->pmode_efer_low,
- header->pmode_efer_high);
- }
+ /*
+ * We have to check that we can write back the value, and not
+ * just read it. At least on 90 nm Pentium M (Family 6, Model
+ * 13), reading an invalid MSR is not guaranteed to trap, see
+ * Erratum X4 in "Intel Pentium M Processor on 90 nm Process
+ * with 2-MB L2 Cache and Intel® Processor A100 and A110 on 90
+ * nm process with 512-KB L2 Cache Specification Update".
+ */
+ if (!rdmsr_safe(MSR_EFER,
+ &header->pmode_efer_low,
+ &header->pmode_efer_high) &&
+ !wrmsr_safe(MSR_EFER,
+ header->pmode_efer_low,
+ header->pmode_efer_high))
+ header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_EFER);
#endif /* !CONFIG_64BIT */
header->pmode_cr0 = read_cr0();
- header->pmode_cr4 = read_cr4_safe();
+ if (__this_cpu_read(cpu_info.cpuid_level) >= 0) {
+ header->pmode_cr4 = __read_cr4();
+ header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_CR4);
+ }
+ if (!rdmsr_safe(MSR_IA32_MISC_ENABLE,
+ &header->pmode_misc_en_low,
+ &header->pmode_misc_en_high) &&
+ !wrmsr_safe(MSR_IA32_MISC_ENABLE,
+ header->pmode_misc_en_low,
+ header->pmode_misc_en_high))
+ header->pmode_behavior |=
+ (1 << WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE);
header->realmode_flags = acpi_realmode_flags;
header->real_magic = 0x12345678;
#ifndef CONFIG_64BIT
header->pmode_entry = (u32)&wakeup_pmode_return;
- header->pmode_cr3 = (u32)(swsusp_pg_dir - __PAGE_OFFSET);
+ header->pmode_cr3 = (u32)__pa_symbol(initial_page_table);
saved_magic = 0x12345678;
#else /* CONFIG_64BIT */
- header->trampoline_segment = setup_trampoline() >> 4;
#ifdef CONFIG_SMP
- stack_start.sp = temp_stack + sizeof(temp_stack);
- early_gdt_descr.address =
- (unsigned long)get_cpu_gdt_table(smp_processor_id());
- initial_gs = per_cpu_offset(smp_processor_id());
+ /*
+ * As each CPU starts up, it will find its own stack pointer
+ * from its current_task->thread.sp. Typically that will be
+ * the idle thread for a newly-started AP, or even the boot
+ * CPU which will find it set to &init_task in the static
+ * per-cpu data.
+ *
+ * Make the resuming CPU use the temporary stack at startup
+ * by setting current->thread.sp to point to that. The true
+ * %rsp will be restored with the rest of the CPU context,
+ * by do_suspend_lowlevel(). And unwinders don't care about
+ * the abuse of ->thread.sp because it's a dead variable
+ * while the thread is running on the CPU anyway; the true
+ * value is in the actual %rsp register.
+ */
+ current->thread.sp = (unsigned long)temp_stack + sizeof(temp_stack);
+ /*
+ * Ensure the CPU knows which one it is when it comes back, if
+ * it isn't in parallel mode and expected to work that out for
+ * itself.
+ */
+ if (!(smpboot_control & STARTUP_PARALLEL_MASK))
+ smpboot_control = smp_processor_id();
#endif
initial_code = (unsigned long)wakeup_long64;
- saved_magic = 0x123456789abcdef0L;
+ saved_magic = 0x123456789abcdef0L;
#endif /* CONFIG_64BIT */
+ /*
+ * Pause/unpause graph tracing around do_suspend_lowlevel as it has
+ * inconsistent call/return info after it jumps to the wakeup vector.
+ */
+ pause_graph_tracing();
+ do_suspend_lowlevel();
+ unpause_graph_tracing();
return 0;
}
-/*
- * acpi_restore_state - undo effects of acpi_save_state_mem
- */
-void acpi_restore_state_mem(void)
-{
-}
-
-
-/**
- * acpi_reserve_bootmem - do _very_ early ACPI initialisation
- *
- * We allocate a page from the first 1MB of memory for the wakeup
- * routine for when we come back from a sleep state. The
- * runtime allocator allows specification of <16MB pages, but not
- * <1MB pages.
- */
-void __init acpi_reserve_bootmem(void)
-{
- if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) {
- printk(KERN_ERR
- "ACPI: Wakeup code way too big, S3 disabled.\n");
- return;
- }
-
- acpi_realmode = (unsigned long)alloc_bootmem_low(WAKEUP_SIZE);
-
- if (!acpi_realmode) {
- printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
- return;
- }
-
- acpi_wakeup_address = virt_to_phys((void *)acpi_realmode);
-}
-
-
static int __init acpi_sleep_setup(char *str)
{
while ((str != NULL) && (*str != '\0')) {
@@ -155,13 +161,19 @@ static int __init acpi_sleep_setup(char *str)
if (strncmp(str, "s3_beep", 7) == 0)
acpi_realmode_flags |= 4;
#ifdef CONFIG_HIBERNATION
+ if (strncmp(str, "s4_hwsig", 8) == 0)
+ acpi_check_s4_hw_signature = 1;
if (strncmp(str, "s4_nohwsig", 10) == 0)
- acpi_no_s4_hw_signature();
- if (strncmp(str, "s4_nonvs", 8) == 0)
- acpi_s4_no_nvs();
+ acpi_check_s4_hw_signature = 0;
#endif
+ if (strncmp(str, "nonvs", 5) == 0)
+ acpi_nvs_nosave();
+ if (strncmp(str, "nonvs_s3", 8) == 0)
+ acpi_nvs_nosave_s3();
if (strncmp(str, "old_ordering", 12) == 0)
acpi_old_suspend_ordering();
+ if (strncmp(str, "nobl", 4) == 0)
+ acpi_sleep_no_blacklist();
str = strchr(str, ',');
if (str != NULL)
str += strspn(str, ", \t");
@@ -170,3 +182,21 @@ static int __init acpi_sleep_setup(char *str)
}
__setup("acpi_sleep=", acpi_sleep_setup);
+
+#if defined(CONFIG_HIBERNATION) && defined(CONFIG_HYPERVISOR_GUEST)
+static int __init init_s4_sigcheck(void)
+{
+ /*
+ * If running on a hypervisor, honour the ACPI specification
+ * by default and trigger a clean reboot when the hardware
+ * signature in FACS is changed after hibernation.
+ */
+ if (acpi_check_s4_hw_signature == -1 &&
+ !hypervisor_is_type(X86_HYPER_NATIVE))
+ acpi_check_s4_hw_signature = 1;
+
+ return 0;
+}
+/* This must happen before acpi_init() which is a subsys initcall */
+arch_initcall(init_s4_sigcheck);
+#endif
diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h
index adbcbaa6f1df..054c15a2f860 100644
--- a/arch/x86/kernel/acpi/sleep.h
+++ b/arch/x86/kernel/acpi/sleep.h
@@ -1,16 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Variables and functions used by the code in sleep.c
*/
-#include <asm/trampoline.h>
-
-extern char wakeup_code_start, wakeup_code_end;
+#include <linux/linkage.h>
extern unsigned long saved_video_mode;
extern long saved_magic;
extern int wakeup_pmode_return;
-extern char swsusp_pg_dir[PAGE_SIZE];
-extern unsigned long acpi_copy_wakeup_routine(unsigned long);
+extern u8 wake_sleep_flags;
+
extern void wakeup_long64(void);
+
+extern void do_suspend_lowlevel(void);
+
+extern int x86_acpi_suspend_lowlevel(void);
+
+asmlinkage acpi_status x86_acpi_enter_sleep_state(u8 state);
diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S
index 8ded418b0593..cf69081073b5 100644
--- a/arch/x86/kernel/acpi/wakeup_32.S
+++ b/arch/x86/kernel/acpi/wakeup_32.S
@@ -1,24 +1,25 @@
- .section .text.page_aligned
+/* SPDX-License-Identifier: GPL-2.0-only */
+ .text
#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/page_types.h>
-# Copyright 2003, 2008 Pavel Machek <pavel@suse.cz>, distribute under GPLv2
# Copyright 2003, 2008 Pavel Machek <pavel@suse.cz>
.code32
ALIGN
-ENTRY(wakeup_pmode_return)
-wakeup_pmode_return:
+SYM_CODE_START(wakeup_pmode_return)
movw $__KERNEL_DS, %ax
movw %ax, %ss
- movw %ax, %ds
- movw %ax, %es
movw %ax, %fs
movw %ax, %gs
+ movw $__USER_DS, %ax
+ movw %ax, %ds
+ movw %ax, %es
+
# reload the gdt, as we need the full 32 bit address
- lgdt saved_gdt
lidt saved_idt
lldt saved_ldt
ljmp $(__KERNEL_CS), $1f
@@ -37,6 +38,7 @@ wakeup_pmode_return:
# jump to place where we left off
movl saved_eip, %eax
jmp *%eax
+SYM_CODE_END(wakeup_pmode_return)
bogus_magic:
jmp bogus_magic
@@ -44,7 +46,6 @@ bogus_magic:
save_registers:
- sgdt saved_gdt
sidt saved_idt
sldt saved_ldt
str saved_tss
@@ -59,7 +60,7 @@ save_registers:
popl saved_context_eflags
movl $ret_point, saved_eip
- ret
+ RET
restore_registers:
@@ -69,13 +70,13 @@ restore_registers:
movl saved_context_edi, %edi
pushl saved_context_eflags
popfl
- ret
+ RET
-ENTRY(do_suspend_lowlevel)
+SYM_CODE_START(do_suspend_lowlevel)
call save_processor_state
call save_registers
pushl $3
- call acpi_enter_sleep_state
+ call x86_acpi_enter_sleep_state
addl $4, %esp
# In case of S3 failure, we'll emerge here. Jump
@@ -85,15 +86,15 @@ ENTRY(do_suspend_lowlevel)
ret_point:
call restore_registers
call restore_processor_state
- ret
+ RET
+SYM_CODE_END(do_suspend_lowlevel)
.data
ALIGN
-ENTRY(saved_magic) .long 0
-ENTRY(saved_eip) .long 0
+SYM_DATA(saved_magic, .long 0)
+saved_eip: .long 0
# saved registers
-saved_gdt: .long 0,0
saved_idt: .long 0,0
saved_ldt: .long 0
saved_tss: .long 0
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index 8ea5164cbd04..04f561f75e99 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -1,44 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
.text
#include <linux/linkage.h>
+#include <linux/objtool.h>
#include <asm/segment.h>
#include <asm/pgtable_types.h>
#include <asm/page_types.h>
#include <asm/msr.h>
#include <asm/asm-offsets.h>
+#include <asm/frame.h>
+#include <asm/nospec-branch.h>
-# Copyright 2003 Pavel Machek <pavel@suse.cz>, distribute under GPLv2
# Copyright 2003 Pavel Machek <pavel@suse.cz>
.code64
/*
* Hooray, we are in Long 64-bit mode (but still running in low memory)
*/
-ENTRY(wakeup_long64)
- movq saved_magic, %rax
+SYM_FUNC_START(wakeup_long64)
+ ANNOTATE_NOENDBR
+ movq saved_magic(%rip), %rax
movq $0x123456789abcdef0, %rdx
cmpq %rdx, %rax
- jne bogus_64_magic
+ je 2f
+ /* stop here on a saved_magic mismatch */
+ movq $0xbad6d61676963, %rcx
+1:
+ jmp 1b
+2:
movw $__KERNEL_DS, %ax
movw %ax, %ss
movw %ax, %ds
movw %ax, %es
movw %ax, %fs
movw %ax, %gs
- movq saved_rsp, %rsp
+ movq saved_rsp(%rip), %rsp
- movq saved_rbx, %rbx
- movq saved_rdi, %rdi
- movq saved_rsi, %rsi
- movq saved_rbp, %rbp
+ movq saved_rbx(%rip), %rbx
+ movq saved_rdi(%rip), %rdi
+ movq saved_rsi(%rip), %rsi
+ movq saved_rbp(%rip), %rbp
- movq saved_rip, %rax
+ movq saved_rip(%rip), %rax
+ ANNOTATE_RETPOLINE_SAFE
jmp *%rax
-ENDPROC(wakeup_long64)
+SYM_FUNC_END(wakeup_long64)
-bogus_64_magic:
- jmp bogus_64_magic
-
-ENTRY(do_suspend_lowlevel)
+SYM_FUNC_START(do_suspend_lowlevel)
+ FRAME_BEGIN
subq $8, %rsp
xorl %eax, %eax
call save_processor_state
@@ -62,23 +71,24 @@ ENTRY(do_suspend_lowlevel)
pushfq
popq pt_regs_flags(%rax)
- movq $resume_point, saved_rip(%rip)
+ movq $.Lresume_point, saved_rip(%rip)
- movq %rsp, saved_rsp
- movq %rbp, saved_rbp
- movq %rbx, saved_rbx
- movq %rdi, saved_rdi
- movq %rsi, saved_rsi
+ movq %rsp, saved_rsp(%rip)
+ movq %rbp, saved_rbp(%rip)
+ movq %rbx, saved_rbx(%rip)
+ movq %rdi, saved_rdi(%rip)
+ movq %rsi, saved_rsi(%rip)
addq $8, %rsp
movl $3, %edi
xorl %eax, %eax
- call acpi_enter_sleep_state
+ call x86_acpi_enter_sleep_state
/* in case something went wrong, restore the machine status and go on */
- jmp resume_point
+ jmp .Lresume_point
.align 4
-resume_point:
+.Lresume_point:
+ ANNOTATE_NOENDBR
/* We don't restore %rax, it must be 0 anyway */
movq $saved_context, %rax
movq saved_context_cr4(%rax), %rbx
@@ -107,18 +117,29 @@ resume_point:
movq pt_regs_r14(%rax), %r14
movq pt_regs_r15(%rax), %r15
+#if defined(CONFIG_KASAN) && defined(CONFIG_KASAN_STACK)
+ /*
+ * The suspend path may have poisoned some areas deeper in the stack,
+ * which we now need to unpoison.
+ */
+ movq %rsp, %rdi
+ call kasan_unpoison_task_stack_below
+#endif
+
xorl %eax, %eax
addq $8, %rsp
+ FRAME_END
jmp restore_processor_state
-ENDPROC(do_suspend_lowlevel)
+SYM_FUNC_END(do_suspend_lowlevel)
+STACK_FRAME_NON_STANDARD do_suspend_lowlevel
.data
-ENTRY(saved_rbp) .quad 0
-ENTRY(saved_rsi) .quad 0
-ENTRY(saved_rdi) .quad 0
-ENTRY(saved_rbx) .quad 0
+saved_rbp: .quad 0
+saved_rsi: .quad 0
+saved_rdi: .quad 0
+saved_rbx: .quad 0
-ENTRY(saved_rip) .quad 0
-ENTRY(saved_rsp) .quad 0
+saved_rip: .quad 0
+saved_rsp: .quad 0
-ENTRY(saved_magic) .quad 0
+SYM_DATA(saved_magic, .quad 0)
diff --git a/arch/x86/kernel/acpi/wakeup_rm.S b/arch/x86/kernel/acpi/wakeup_rm.S
deleted file mode 100644
index 6ff3b5730575..000000000000
--- a/arch/x86/kernel/acpi/wakeup_rm.S
+++ /dev/null
@@ -1,10 +0,0 @@
-/*
- * Wrapper script for the realmode binary as a transport object
- * before copying to low memory.
- */
- .section ".rodata","a"
- .globl wakeup_code_start, wakeup_code_end
-wakeup_code_start:
- .incbin "arch/x86/kernel/acpi/realmode/wakeup.bin"
-wakeup_code_end:
- .size wakeup_code_start, .-wakeup_code_start
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index f57658702571..28518371d8bf 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1,42 +1,42 @@
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/mutex.h>
-#include <linux/list.h>
-#include <linux/kprobes.h>
-#include <linux/mm.h>
+// SPDX-License-Identifier: GPL-2.0-only
+#define pr_fmt(fmt) "SMP alternatives: " fmt
+
+#include <linux/mmu_context.h>
+#include <linux/perf_event.h>
#include <linux/vmalloc.h>
#include <linux/memory.h>
-#include <asm/alternative.h>
-#include <asm/sections.h>
-#include <asm/pgtable.h>
-#include <asm/mce.h>
+#include <linux/execmem.h>
+
+#include <asm/text-patching.h>
+#include <asm/insn.h>
+#include <asm/insn-eval.h>
+#include <asm/ibt.h>
+#include <asm/set_memory.h>
#include <asm/nmi.h>
-#include <asm/vsyscall.h>
-#include <asm/cacheflush.h>
-#include <asm/tlbflush.h>
-#include <asm/io.h>
-#include <asm/fixmap.h>
-#define MAX_PATCH_LEN (255-1)
+int __read_mostly alternatives_patched;
-#ifdef CONFIG_HOTPLUG_CPU
-static int smp_alt_once;
+EXPORT_SYMBOL_GPL(alternatives_patched);
-static int __init bootonly(char *str)
-{
- smp_alt_once = 1;
- return 1;
-}
-__setup("smp-alt-boot", bootonly);
-#else
-#define smp_alt_once 1
-#endif
+#define MAX_PATCH_LEN (255-1)
+
+#define DA_ALL (~0)
+#define DA_ALT 0x01
+#define DA_RET 0x02
+#define DA_RETPOLINE 0x04
+#define DA_ENDBR 0x08
+#define DA_SMP 0x10
-static int debug_alternative;
+static unsigned int debug_alternative;
static int __init debug_alt(char *str)
{
- debug_alternative = 1;
+ if (str && *str == '=')
+ str++;
+
+ if (!str || kstrtouint(str, 0, &debug_alternative))
+ debug_alternative = DA_ALL;
+
return 1;
}
__setup("debug-alternative", debug_alt);
@@ -50,214 +50,2046 @@ static int __init setup_noreplace_smp(char *str)
}
__setup("noreplace-smp", setup_noreplace_smp);
-#ifdef CONFIG_PARAVIRT
-static int noreplace_paravirt = 0;
-
-static int __init setup_noreplace_paravirt(char *str)
+#define DPRINTK(type, fmt, args...) \
+do { \
+ if (debug_alternative & DA_##type) \
+ printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args); \
+} while (0)
+
+#define DUMP_BYTES(type, buf, len, fmt, args...) \
+do { \
+ if (unlikely(debug_alternative & DA_##type)) { \
+ int j; \
+ \
+ if (!(len)) \
+ break; \
+ \
+ printk(KERN_DEBUG pr_fmt(fmt), ##args); \
+ for (j = 0; j < (len) - 1; j++) \
+ printk(KERN_CONT "%02hhx ", buf[j]); \
+ printk(KERN_CONT "%02hhx\n", buf[j]); \
+ } \
+} while (0)
+
+static const unsigned char x86nops[] =
{
- noreplace_paravirt = 1;
- return 1;
-}
-__setup("noreplace-paravirt", setup_noreplace_paravirt);
+ BYTES_NOP1,
+ BYTES_NOP2,
+ BYTES_NOP3,
+ BYTES_NOP4,
+ BYTES_NOP5,
+ BYTES_NOP6,
+ BYTES_NOP7,
+ BYTES_NOP8,
+#ifdef CONFIG_64BIT
+ BYTES_NOP9,
+ BYTES_NOP10,
+ BYTES_NOP11,
#endif
+};
-#define DPRINTK(fmt, args...) if (debug_alternative) \
- printk(KERN_DEBUG fmt, args)
-
-#ifdef GENERIC_NOP1
-/* Use inline assembly to define this because the nops are defined
- as inline assembly strings in the include files and we cannot
- get them easily into strings. */
-asm("\t.section .rodata, \"a\"\nintelnops: "
- GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6
- GENERIC_NOP7 GENERIC_NOP8
- "\t.previous");
-extern const unsigned char intelnops[];
-static const unsigned char *const intel_nops[ASM_NOP_MAX+1] = {
+const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
+{
NULL,
- intelnops,
- intelnops + 1,
- intelnops + 1 + 2,
- intelnops + 1 + 2 + 3,
- intelnops + 1 + 2 + 3 + 4,
- intelnops + 1 + 2 + 3 + 4 + 5,
- intelnops + 1 + 2 + 3 + 4 + 5 + 6,
- intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+ x86nops,
+ x86nops + 1,
+ x86nops + 1 + 2,
+ x86nops + 1 + 2 + 3,
+ x86nops + 1 + 2 + 3 + 4,
+ x86nops + 1 + 2 + 3 + 4 + 5,
+ x86nops + 1 + 2 + 3 + 4 + 5 + 6,
+ x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+#ifdef CONFIG_64BIT
+ x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
+ x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9,
+ x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10,
+#endif
};
+
+#ifdef CONFIG_FINEIBT
+static bool cfi_paranoid __ro_after_init;
#endif
-#ifdef K8_NOP1
-asm("\t.section .rodata, \"a\"\nk8nops: "
- K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
- K8_NOP7 K8_NOP8
- "\t.previous");
-extern const unsigned char k8nops[];
-static const unsigned char *const k8_nops[ASM_NOP_MAX+1] = {
- NULL,
- k8nops,
- k8nops + 1,
- k8nops + 1 + 2,
- k8nops + 1 + 2 + 3,
- k8nops + 1 + 2 + 3 + 4,
- k8nops + 1 + 2 + 3 + 4 + 5,
- k8nops + 1 + 2 + 3 + 4 + 5 + 6,
- k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
-};
+#ifdef CONFIG_MITIGATION_ITS
+
+#ifdef CONFIG_MODULES
+static struct module *its_mod;
#endif
+static void *its_page;
+static unsigned int its_offset;
+struct its_array its_pages;
-#ifdef K7_NOP1
-asm("\t.section .rodata, \"a\"\nk7nops: "
- K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6
- K7_NOP7 K7_NOP8
- "\t.previous");
-extern const unsigned char k7nops[];
-static const unsigned char *const k7_nops[ASM_NOP_MAX+1] = {
- NULL,
- k7nops,
- k7nops + 1,
- k7nops + 1 + 2,
- k7nops + 1 + 2 + 3,
- k7nops + 1 + 2 + 3 + 4,
- k7nops + 1 + 2 + 3 + 4 + 5,
- k7nops + 1 + 2 + 3 + 4 + 5 + 6,
- k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
-};
+static void *__its_alloc(struct its_array *pages)
+{
+ void *page __free(execmem) = execmem_alloc_rw(EXECMEM_MODULE_TEXT, PAGE_SIZE);
+ if (!page)
+ return NULL;
+
+ void *tmp = krealloc(pages->pages, (pages->num+1) * sizeof(void *),
+ GFP_KERNEL);
+ if (!tmp)
+ return NULL;
+
+ pages->pages = tmp;
+ pages->pages[pages->num++] = page;
+
+ return no_free_ptr(page);
+}
+
+/* Initialize a thunk with the "jmp *reg; int3" instructions. */
+static void *its_init_thunk(void *thunk, int reg)
+{
+ u8 *bytes = thunk;
+ int offset = 0;
+ int i = 0;
+
+#ifdef CONFIG_FINEIBT
+ if (cfi_paranoid) {
+ /*
+ * When ITS uses an indirect branch thunk, the fineibt_paranoid
+ * caller sequence doesn't fit in the caller site. So put the
+ * remaining part of the sequence (UDB + JNE) into the ITS
+ * thunk.
+ */
+ bytes[i++] = 0xd6; /* UDB */
+ bytes[i++] = 0x75; /* JNE */
+ bytes[i++] = 0xfd;
+
+ offset = 1;
+ }
#endif
-#ifdef P6_NOP1
-asm("\t.section .rodata, \"a\"\np6nops: "
- P6_NOP1 P6_NOP2 P6_NOP3 P6_NOP4 P6_NOP5 P6_NOP6
- P6_NOP7 P6_NOP8
- "\t.previous");
-extern const unsigned char p6nops[];
-static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = {
- NULL,
- p6nops,
- p6nops + 1,
- p6nops + 1 + 2,
- p6nops + 1 + 2 + 3,
- p6nops + 1 + 2 + 3 + 4,
- p6nops + 1 + 2 + 3 + 4 + 5,
- p6nops + 1 + 2 + 3 + 4 + 5 + 6,
- p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
-};
+ if (reg >= 8) {
+ bytes[i++] = 0x41; /* REX.B prefix */
+ reg -= 8;
+ }
+ bytes[i++] = 0xff;
+ bytes[i++] = 0xe0 + reg; /* JMP *reg */
+ bytes[i++] = 0xcc;
+
+ return thunk + offset;
+}
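+
+/*
+ * Illustration of the layout built above: for reg == 11 with cfi_paranoid
+ * enabled the thunk bytes are d6 75 fd 41 ff e3 cc (UDB; JNE; JMP *%r11; INT3)
+ * and the returned entry point skips the leading UDB byte.
+ */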
+
+static void its_pages_protect(struct its_array *pages)
+{
+ for (int i = 0; i < pages->num; i++) {
+ void *page = pages->pages[i];
+ execmem_restore_rox(page, PAGE_SIZE);
+ }
+}
+
+static void its_fini_core(void)
+{
+ if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX))
+ its_pages_protect(&its_pages);
+ kfree(its_pages.pages);
+}
+
+#ifdef CONFIG_MODULES
+void its_init_mod(struct module *mod)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
+ return;
+
+ mutex_lock(&text_mutex);
+ its_mod = mod;
+ its_page = NULL;
+}
+
+void its_fini_mod(struct module *mod)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
+ return;
+
+ WARN_ON_ONCE(its_mod != mod);
+
+ its_mod = NULL;
+ its_page = NULL;
+ mutex_unlock(&text_mutex);
+
+ if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
+ its_pages_protect(&mod->arch.its_pages);
+}
+
+void its_free_mod(struct module *mod)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
+ return;
+
+ for (int i = 0; i < mod->arch.its_pages.num; i++) {
+ void *page = mod->arch.its_pages.pages[i];
+ execmem_free(page);
+ }
+ kfree(mod->arch.its_pages.pages);
+}
+#endif /* CONFIG_MODULES */
+
+static void *its_alloc(void)
+{
+ struct its_array *pages = &its_pages;
+ void *page;
+
+#ifdef CONFIG_MODULES
+ if (its_mod)
+ pages = &its_mod->arch.its_pages;
#endif
-#ifdef CONFIG_X86_64
+ page = __its_alloc(pages);
+ if (!page)
+ return NULL;
+
+ if (pages == &its_pages)
+ set_memory_x((unsigned long)page, 1);
+
+ return page;
+}
+
+static void *its_allocate_thunk(int reg)
+{
+ int size = 3 + (reg / 8);
+ void *thunk;
+
+#ifdef CONFIG_FINEIBT
+ /*
+ * The ITS thunk contains an indirect jump and an int3 instruction so
+ * its size is 3 or 4 bytes depending on the register used. If CFI
+ * paranoid is used then 3 extra bytes are added in the ITS thunk to
+ * complete the fineibt_paranoid caller sequence.
+ */
+ if (cfi_paranoid)
+ size += 3;
+#endif
+
+ if (!its_page || (its_offset + size - 1) >= PAGE_SIZE) {
+ its_page = its_alloc();
+ if (!its_page) {
+ pr_err("ITS page allocation failed\n");
+ return NULL;
+ }
+ memset(its_page, INT3_INSN_OPCODE, PAGE_SIZE);
+ its_offset = 32;
+ }
+
+ /*
+ * If the indirect branch instruction will be in the lower half
+ * of a cacheline, then update the offset to reach the upper half.
+ */
+ if ((its_offset + size - 1) % 64 < 32)
+ its_offset = ((its_offset - 1) | 0x3F) + 33;
+
+ thunk = its_page + its_offset;
+ its_offset += size;
+
+ return its_init_thunk(thunk, reg);
+}
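+
+/*
+ * Worked example of the cacheline adjustment above (illustrative): with
+ * its_offset == 68 and size == 4 the branch would end at offset 71, i.e. in
+ * the lower half of a cacheline (71 % 64 == 7), so the offset is bumped to
+ * ((68 - 1) | 0x3F) + 33 == 160, the start of the upper half of the next
+ * cacheline.
+ */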
+
+u8 *its_static_thunk(int reg)
+{
+ u8 *thunk = __x86_indirect_its_thunk_array[reg];
+
+#ifdef CONFIG_FINEIBT
+ /* Paranoid thunk starts 2 bytes before */
+ if (cfi_paranoid)
+ return thunk - 2;
+#endif
+ return thunk;
+}
+
+#else
+static inline void its_fini_core(void) {}
+#endif /* CONFIG_MITIGATION_ITS */
-extern char __vsyscall_0;
-const unsigned char *const *find_nop_table(void)
+/*
+ * Nomenclature for variable names to simplify and clarify this code and ease
+ * any potential staring at it:
+ *
+ * @instr: source address of the original instructions in the kernel text as
+ * generated by the compiler.
+ *
+ * @buf: temporary buffer on which the patching operates. This buffer is
+ * eventually text-poked into the kernel image.
+ *
+ * @replacement/@repl: pointer to the opcodes which are replacing @instr, located
+ * in the .altinstr_replacement section.
+ */
+
+/*
+ * Fill the buffer with a single effective instruction of size @len.
+ *
+ * In order not to issue an ORC stack depth tracking CFI entry (Call Frame Info)
+ * for every single-byte NOP, try to generate the maximally available NOP of
+ * size <= ASM_NOP_MAX such that only a single CFI entry is generated (vs one for
+ * each single-byte NOPs). If @len to fill out is > ASM_NOP_MAX, pad with INT3 and
+ * *jump* over instead of executing long and daft NOPs.
+ */
+static void add_nop(u8 *buf, unsigned int len)
{
- if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
- boot_cpu_has(X86_FEATURE_NOPL))
- return p6_nops;
- else
- return k8_nops;
+ u8 *target = buf + len;
+
+ if (!len)
+ return;
+
+ if (len <= ASM_NOP_MAX) {
+ memcpy(buf, x86_nops[len], len);
+ return;
+ }
+
+ if (len < 128) {
+ __text_gen_insn(buf, JMP8_INSN_OPCODE, buf, target, JMP8_INSN_SIZE);
+ buf += JMP8_INSN_SIZE;
+ } else {
+ __text_gen_insn(buf, JMP32_INSN_OPCODE, buf, target, JMP32_INSN_SIZE);
+ buf += JMP32_INSN_SIZE;
+ }
+
+ for (;buf < target; buf++)
+ *buf = INT3_INSN_OPCODE;
+}
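+
+/*
+ * For example (illustrative): add_nop(buf, 5) emits a single NOP5, whereas
+ * add_nop(buf, 40) emits a 2-byte JMP8 over the remaining 38 bytes, which are
+ * filled with INT3.
+ */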
+
+/*
+ * Find the offset of the first non-NOP instruction starting at @offset
+ * but no further than @len.
+ */
+static int skip_nops(u8 *buf, int offset, int len)
+{
+ struct insn insn;
+
+ for (; offset < len; offset += insn.length) {
+ if (insn_decode_kernel(&insn, &buf[offset]))
+ break;
+
+ if (!insn_is_nop(&insn))
+ break;
+ }
+
+ return offset;
+}
+
+/*
+ * "noinline" to cause control flow change and thus invalidate I$ and
+ * cause refetch after modification.
+ */
+static void noinline optimize_nops(const u8 * const instr, u8 *buf, size_t len)
+{
+ for (int next, i = 0; i < len; i = next) {
+ struct insn insn;
+
+ if (insn_decode_kernel(&insn, &buf[i]))
+ return;
+
+ next = i + insn.length;
+
+ if (insn_is_nop(&insn)) {
+ int nop = i;
+
+ /* Has the NOP already been optimized? */
+ if (i + insn.length == len)
+ return;
+
+ next = skip_nops(buf, next, len);
+
+ add_nop(buf + nop, next - nop);
+ DUMP_BYTES(ALT, buf, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, next);
+ }
+ }
}
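+
+/*
+ * For example (illustrative): a 2-byte NOP immediately followed by a 3-byte
+ * NOP in @buf is collapsed by the loop above into one 5-byte NOP via add_nop().
+ */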
-#else /* CONFIG_X86_64 */
+/*
+ * In this context, "source" is where the instructions are placed in the
+ * section .altinstr_replacement, for example during kernel build by the
+ * toolchain.
+ * "Destination" is where the instructions are being patched in by this
+ * machinery.
+ *
+ * The source offset is:
+ *
+ * src_imm = target - src_next_ip (1)
+ *
+ * and the target offset is:
+ *
+ * dst_imm = target - dst_next_ip (2)
+ *
+ * so rework (1) as an expression for target like:
+ *
+ * target = src_imm + src_next_ip (1a)
+ *
+ * and substitute in (2) to get:
+ *
+ * dst_imm = (src_imm + src_next_ip) - dst_next_ip (3)
+ *
+ * Now, since the instruction stream is 'identical' at src and dst (it
+ * is being copied after all) it can be stated that:
+ *
+ * src_next_ip = src + ip_offset
+ * dst_next_ip = dst + ip_offset (4)
+ *
+ * Substitute (4) in (3) and observe ip_offset being cancelled out to
+ * obtain:
+ *
+ * dst_imm = src_imm + (src + ip_offset) - (dst + ip_offset)
+ * = src_imm + src - dst + ip_offset - ip_offset
+ * = src_imm + src - dst (5)
+ *
+ * IOW, only the relative displacement of the code block matters.
+ */
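+
+/*
+ * Numeric example of (5) (illustrative): a 5-byte CALL at src == 0x1000 with
+ * src_imm == 0x200 targets 0x1205. Copied to dst == 0x4000, the immediate
+ * becomes 0x200 + 0x1000 - 0x4000 == -0x2e00, and 0x4005 + -0x2e00 == 0x1205
+ * still hits the same target.
+ */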
+
+#define apply_reloc_n(n_, p_, d_) \
+ do { \
+ s32 v = *(s##n_ *)(p_); \
+ v += (d_); \
+ BUG_ON((v >> 31) != (v >> (n_-1))); \
+ *(s##n_ *)(p_) = (s##n_)v; \
+ } while (0)
-const unsigned char *const *find_nop_table(void)
+
+static __always_inline
+void apply_reloc(int n, void *ptr, uintptr_t diff)
{
- if (boot_cpu_has(X86_FEATURE_K8))
- return k8_nops;
- else if (boot_cpu_has(X86_FEATURE_K7))
- return k7_nops;
- else if (boot_cpu_has(X86_FEATURE_NOPL))
- return p6_nops;
- else
- return intel_nops;
+ switch (n) {
+ case 1: apply_reloc_n(8, ptr, diff); break;
+ case 2: apply_reloc_n(16, ptr, diff); break;
+ case 4: apply_reloc_n(32, ptr, diff); break;
+ default: BUG();
+ }
}
-#endif /* CONFIG_X86_64 */
+static __always_inline
+bool need_reloc(unsigned long offset, u8 *src, size_t src_len)
+{
+ u8 *target = src + offset;
+ /*
+ * If the target is inside the patched block, it's relative to the
+ * block itself and does not need relocation.
+ */
+ return (target < src || target > src + src_len);
+}
-/* Use this to add nops to a buffer, then text_poke the whole buffer. */
-void add_nops(void *insns, unsigned int len)
+static void __apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len)
{
- const unsigned char *const *noptable = find_nop_table();
+ for (int next, i = 0; i < instrlen; i = next) {
+ struct insn insn;
+
+ if (WARN_ON_ONCE(insn_decode_kernel(&insn, &buf[i])))
+ return;
+
+ next = i + insn.length;
+
+ switch (insn.opcode.bytes[0]) {
+ case 0x0f:
+ if (insn.opcode.bytes[1] < 0x80 ||
+ insn.opcode.bytes[1] > 0x8f)
+ break;
+
+ fallthrough; /* Jcc.d32 */
+ case 0x70 ... 0x7f: /* Jcc.d8 */
+ case JMP8_INSN_OPCODE:
+ case JMP32_INSN_OPCODE:
+ case CALL_INSN_OPCODE:
+ if (need_reloc(next + insn.immediate.value, repl, repl_len)) {
+ apply_reloc(insn.immediate.nbytes,
+ buf + i + insn_offset_immediate(&insn),
+ repl - instr);
+ }
+
+ /*
+ * Where possible, convert JMP.d32 into JMP.d8.
+ */
+ if (insn.opcode.bytes[0] == JMP32_INSN_OPCODE) {
+ s32 imm = insn.immediate.value;
+ imm += repl - instr;
+ imm += JMP32_INSN_SIZE - JMP8_INSN_SIZE;
+ if ((imm >> 31) == (imm >> 7)) {
+ buf[i+0] = JMP8_INSN_OPCODE;
+ buf[i+1] = (s8)imm;
+
+ memset(&buf[i+2], INT3_INSN_OPCODE, insn.length - 2);
+ }
+ }
+ break;
+ }
- while (len > 0) {
- unsigned int noplen = len;
- if (noplen > ASM_NOP_MAX)
- noplen = ASM_NOP_MAX;
- memcpy(insns, noptable[noplen], noplen);
- insns += noplen;
- len -= noplen;
+ if (insn_rip_relative(&insn)) {
+ if (need_reloc(next + insn.displacement.value, repl, repl_len)) {
+ apply_reloc(insn.displacement.nbytes,
+ buf + i + insn_offset_displacement(&insn),
+ repl - instr);
+ }
+ }
}
}
-EXPORT_SYMBOL_GPL(add_nops);
-extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
-extern u8 *__smp_locks[], *__smp_locks_end[];
+void text_poke_apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len)
+{
+ __apply_relocation(buf, instr, instrlen, repl, repl_len);
+ optimize_nops(instr, buf, instrlen);
+}
+
+/* Low-level backend functions usable from alternative code replacements. */
+DEFINE_ASM_FUNC(nop_func, "", .entry.text);
+EXPORT_SYMBOL_GPL(nop_func);
-/* Replace instructions with better alternatives for this CPU type.
- This runs before SMP is initialized to avoid SMP problems with
- self modifying code. This implies that assymetric systems where
- APs have less capabilities than the boot processor are not handled.
- Tough. Make sure you disable such features by hand. */
+noinstr void BUG_func(void)
+{
+ BUG();
+}
+EXPORT_SYMBOL(BUG_func);
-void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
+#define CALL_RIP_REL_OPCODE 0xff
+#define CALL_RIP_REL_MODRM 0x15
+
+/*
+ * Rewrite the "call BUG_func" replacement to point to the target of the
+ * indirect pv_ops call "call *disp(%ip)".
+ */
+static unsigned int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a)
{
- struct alt_instr *a;
- char insnbuf[MAX_PATCH_LEN];
+ void *target, *bug = &BUG_func;
+ s32 disp;
+
+ if (a->replacementlen != 5 || insn_buff[0] != CALL_INSN_OPCODE) {
+ pr_err("ALT_FLAG_DIRECT_CALL set for a non-call replacement instruction\n");
+ BUG();
+ }
+
+ if (a->instrlen != 6 ||
+ instr[0] != CALL_RIP_REL_OPCODE ||
+ instr[1] != CALL_RIP_REL_MODRM) {
+ pr_err("ALT_FLAG_DIRECT_CALL set for unrecognized indirect call\n");
+ BUG();
+ }
+
+ /* Skip CALL_RIP_REL_OPCODE and CALL_RIP_REL_MODRM */
+ disp = *(s32 *)(instr + 2);
+#ifdef CONFIG_X86_64
+ /* ff 15 00 00 00 00 call *0x0(%rip) */
+ /* target address is stored at "next instruction + disp". */
+ target = *(void **)(instr + a->instrlen + disp);
+#else
+ /* ff 15 00 00 00 00 call *0x0 */
+ /* target address is stored at disp. */
+ target = *(void **)disp;
+#endif
+ if (!target)
+ target = bug;
+
+ /* (BUG_func - .) + (target - BUG_func) := target - . */
+ *(s32 *)(insn_buff + 1) += target - bug;
+
+ if (target == &nop_func)
+ return 0;
+
+ return 5;
+}
- DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
+static inline u8 * instr_va(struct alt_instr *i)
+{
+ return (u8 *)&i->instr_offset + i->instr_offset;
+}
+
+/*
+ * Replace instructions with better alternatives for this CPU type. This runs
+ * before SMP is initialized to avoid SMP problems with self modifying code.
+ * This implies that asymmetric systems where APs have fewer capabilities than
+ * the boot processor are not handled. Tough. Make sure you disable such
+ * features by hand.
+ *
+ * Marked "noinline" to cause control flow change and thus insn cache
+ * to refetch changed I$ lines.
+ */
+void __init_or_module noinline apply_alternatives(struct alt_instr *start,
+ struct alt_instr *end)
+{
+ u8 insn_buff[MAX_PATCH_LEN];
+ u8 *instr, *replacement;
+ struct alt_instr *a, *b;
+
+ DPRINTK(ALT, "alt table %px, -> %px", start, end);
+
+ /*
+ * KASAN_SHADOW_START is defined using
+ * cpu_feature_enabled(X86_FEATURE_LA57) and is therefore patched here.
+ * During the process, KASAN becomes confused seeing partial LA57
+ * conversion and triggers a false-positive out-of-bound report.
+ *
+ * Disable KASAN until the patching is complete.
+ */
+ kasan_disable_current();
+
+ /*
+ * The scan order should be from start to end. A later scanned
+ * alternative code can overwrite previously scanned alternative code.
+ * Some kernel functions (e.g. memcpy, memset, etc) use this order to
+ * patch code.
+ *
+ * So be careful if you want to change the scan order to any other
+ * order.
+ */
for (a = start; a < end; a++) {
- u8 *instr = a->instr;
- BUG_ON(a->replacementlen > a->instrlen);
- BUG_ON(a->instrlen > sizeof(insnbuf));
- if (!boot_cpu_has(a->cpuid))
+ unsigned int insn_buff_sz = 0;
+
+ /*
+ * In case of nested ALTERNATIVE()s the outer alternative might
+ * add more padding. To ensure consistent patching find the max
+ * padding for all alt_instr entries for this site (nested
+ * alternatives result in consecutive entries).
+ */
+ for (b = a+1; b < end && instr_va(b) == instr_va(a); b++) {
+ u8 len = max(a->instrlen, b->instrlen);
+ a->instrlen = b->instrlen = len;
+ }
+
+ instr = instr_va(a);
+ replacement = (u8 *)&a->repl_offset + a->repl_offset;
+ BUG_ON(a->instrlen > sizeof(insn_buff));
+ BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
+
+ /*
+ * Patch if either:
+ * - feature is present
+ * - feature not present but ALT_FLAG_NOT is set to mean,
+ * patch if feature is *NOT* present.
+ */
+ if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) {
+ memcpy(insn_buff, instr, a->instrlen);
+ optimize_nops(instr, insn_buff, a->instrlen);
+ text_poke_early(instr, insn_buff, a->instrlen);
continue;
-#ifdef CONFIG_X86_64
- /* vsyscall code is not mapped yet. resolve it manually. */
- if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END) {
- instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0));
- DPRINTK("%s: vsyscall fixup: %p => %p\n",
- __func__, a->instr, instr);
}
+
+ DPRINTK(ALT, "feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d) flags: 0x%x",
+ a->cpuid >> 5,
+ a->cpuid & 0x1f,
+ instr, instr, a->instrlen,
+ replacement, a->replacementlen, a->flags);
+
+ memcpy(insn_buff, replacement, a->replacementlen);
+ insn_buff_sz = a->replacementlen;
+
+ if (a->flags & ALT_FLAG_DIRECT_CALL)
+ insn_buff_sz = alt_replace_call(instr, insn_buff, a);
+
+ for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
+ insn_buff[insn_buff_sz] = 0x90;
+
+ text_poke_apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen);
+
+ DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr);
+ DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
+ DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
+
+ text_poke_early(instr, insn_buff, insn_buff_sz);
+ }
+
+ kasan_enable_current();
+}
+
+static inline bool is_jcc32(struct insn *insn)
+{
+ /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */
+ return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80;
+}
+
+#if defined(CONFIG_MITIGATION_RETPOLINE) && defined(CONFIG_OBJTOOL)
+
+/*
+ * [CS]{,3} CALL/JMP *%\reg [INT3]*
+ */
+static int emit_indirect(int op, int reg, u8 *bytes, int len)
+{
+ int cs = 0, bp = 0;
+ int i = 0;
+ u8 modrm;
+
+ /*
+ * Set @len to the excess bytes after writing the instruction.
+ */
+ len -= 2 + (reg >= 8);
+ WARN_ON_ONCE(len < 0);
+
+ switch (op) {
+ case CALL_INSN_OPCODE:
+ modrm = 0x10; /* Reg = 2; CALL r/m */
+ /*
+ * Additional NOP is better than prefix decode penalty.
+ */
+ if (len <= 3)
+ cs = len;
+ break;
+
+ case JMP32_INSN_OPCODE:
+ modrm = 0x20; /* Reg = 4; JMP r/m */
+ bp = len;
+ break;
+
+ default:
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+
+ while (cs--)
+ bytes[i++] = 0x2e; /* CS-prefix */
+
+ if (reg >= 8) {
+ bytes[i++] = 0x41; /* REX.B prefix */
+ reg -= 8;
+ }
+
+ modrm |= 0xc0; /* Mod = 3 */
+ modrm += reg;
+
+ bytes[i++] = 0xff; /* opcode */
+ bytes[i++] = modrm;
+
+ while (bp--)
+ bytes[i++] = 0xcc; /* INT3 */
+
+ return i;
+}
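+
+/*
+ * For example (illustrative): a 5-byte CALL slot with reg == 11 becomes
+ * "2e 2e 41 ff d3" (cs cs call *%r11), while a 5-byte JMP slot with reg == 3
+ * becomes "ff e3 cc cc cc" (jmp *%rbx; int3 x 3).
+ */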
+
+static int __emit_trampoline(void *addr, struct insn *insn, u8 *bytes,
+ void *call_dest, void *jmp_dest)
+{
+ u8 op = insn->opcode.bytes[0];
+ int i = 0;
+
+ /*
+ * Clang does 'weird' Jcc __x86_indirect_thunk_r11 conditional
+ * tail-calls. Deal with them.
+ */
+ if (is_jcc32(insn)) {
+ bytes[i++] = op;
+ op = insn->opcode.bytes[1];
+ goto clang_jcc;
+ }
+
+ if (insn->length == 6)
+ bytes[i++] = 0x2e; /* CS-prefix */
+
+ switch (op) {
+ case CALL_INSN_OPCODE:
+ __text_gen_insn(bytes+i, op, addr+i,
+ call_dest,
+ CALL_INSN_SIZE);
+ i += CALL_INSN_SIZE;
+ break;
+
+ case JMP32_INSN_OPCODE:
+clang_jcc:
+ __text_gen_insn(bytes+i, op, addr+i,
+ jmp_dest,
+ JMP32_INSN_SIZE);
+ i += JMP32_INSN_SIZE;
+ break;
+
+ default:
+ WARN(1, "%pS %px %*ph\n", addr, addr, 6, addr);
+ return -1;
+ }
+
+ WARN_ON_ONCE(i != insn->length);
+
+ return i;
+}
+
+static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes)
+{
+ return __emit_trampoline(addr, insn, bytes,
+ __x86_indirect_call_thunk_array[reg],
+ __x86_indirect_jump_thunk_array[reg]);
+}
+
+#ifdef CONFIG_MITIGATION_ITS
+static int emit_its_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes)
+{
+ u8 *thunk = __x86_indirect_its_thunk_array[reg];
+ u8 *tmp = its_allocate_thunk(reg);
+
+ if (tmp)
+ thunk = tmp;
+
+ return __emit_trampoline(addr, insn, bytes, thunk, thunk);
+}
+
+/* Check if an indirect branch is at ITS-unsafe address */
+static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
+ return false;
+
+ /* Indirect branch opcode is 2 or 3 bytes depending on reg */
+ addr += 1 + reg / 8;
+
+ /* Lower-half of the cacheline? */
+ return !(addr & 0x20);
+}
+#else /* CONFIG_MITIGATION_ITS */
+
+#ifdef CONFIG_FINEIBT
+static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg)
+{
+ return false;
+}
#endif
- memcpy(insnbuf, a->replacement, a->replacementlen);
- add_nops(insnbuf + a->replacementlen,
- a->instrlen - a->replacementlen);
- text_poke_early(instr, insnbuf, a->instrlen);
+
+#endif /* CONFIG_MITIGATION_ITS */
+
+/*
+ * Rewrite the compiler generated retpoline thunk calls.
+ *
+ * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate
+ * indirect instructions, avoiding the extra indirection.
+ *
+ * For example, convert:
+ *
+ * CALL __x86_indirect_thunk_\reg
+ *
+ * into:
+ *
+ * CALL *%\reg
+ *
+ * It also tries to inline spectre_v2=retpoline,lfence when size permits.
+ */
+static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
+{
+ retpoline_thunk_t *target;
+ int reg, ret, i = 0;
+ u8 op, cc;
+
+ target = addr + insn->length + insn->immediate.value;
+ reg = target - __x86_indirect_thunk_array;
+
+ if (WARN_ON_ONCE(reg & ~0xf))
+ return -1;
+
+ /* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */
+ BUG_ON(reg == 4);
+
+ if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) &&
+ !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
+ if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH))
+ return emit_call_track_retpoline(addr, insn, reg, bytes);
+
+ return -1;
+ }
+
+ op = insn->opcode.bytes[0];
+
+ /*
+ * Convert:
+ *
+ * Jcc.d32 __x86_indirect_thunk_\reg
+ *
+ * into:
+ *
+ * Jncc.d8 1f
+ * [ LFENCE ]
+ * JMP *%\reg
+ * [ NOP ]
+ * 1:
+ */
+ if (is_jcc32(insn)) {
+ cc = insn->opcode.bytes[1] & 0xf;
+ cc ^= 1; /* invert condition */
+
+ bytes[i++] = 0x70 + cc; /* Jcc.d8 */
+ bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */
+
+ /* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */
+ op = JMP32_INSN_OPCODE;
+ }
+
+ /*
+ * For RETPOLINE_LFENCE: prepend the indirect CALL/JMP with an LFENCE.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
+ bytes[i++] = 0x0f;
+ bytes[i++] = 0xae;
+ bytes[i++] = 0xe8; /* LFENCE */
+ }
+
+#ifdef CONFIG_MITIGATION_ITS
+ /*
+ * Check if the address of last byte of emitted-indirect is in
+ * lower-half of the cacheline. Such branches need ITS mitigation.
+ */
+ if (cpu_wants_indirect_its_thunk_at((unsigned long)addr + i, reg))
+ return emit_its_trampoline(addr, insn, reg, bytes);
+#endif
+
+ ret = emit_indirect(op, reg, bytes + i, insn->length - i);
+ if (ret < 0)
+ return ret;
+ i += ret;
+
+ for (; i < insn->length;)
+ bytes[i++] = BYTES_NOP1;
+
+ return i;
+}
+
+/*
+ * Generated by 'objtool --retpoline'.
+ */
+void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
+{
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+ struct insn insn;
+ int len, ret;
+ u8 bytes[16];
+ u8 op1, op2;
+ u8 *dest;
+
+ ret = insn_decode_kernel(&insn, addr);
+ if (WARN_ON_ONCE(ret < 0))
+ continue;
+
+ op1 = insn.opcode.bytes[0];
+ op2 = insn.opcode.bytes[1];
+
+ switch (op1) {
+ case 0x70 ... 0x7f: /* Jcc.d8 */
+ /* See cfi_paranoid. */
+ WARN_ON_ONCE(cfi_mode != CFI_FINEIBT);
+ continue;
+
+ case CALL_INSN_OPCODE:
+ case JMP32_INSN_OPCODE:
+ /* Check for cfi_paranoid + ITS */
+ dest = addr + insn.length + insn.immediate.value;
+ if (dest[-1] == 0xd6 && (dest[0] & 0xf0) == 0x70) {
+ WARN_ON_ONCE(cfi_mode != CFI_FINEIBT);
+ continue;
+ }
+ break;
+
+ case 0x0f: /* escape */
+ if (op2 >= 0x80 && op2 <= 0x8f)
+ break;
+ fallthrough;
+ default:
+ WARN_ON_ONCE(1);
+ continue;
+ }
+
+ DPRINTK(RETPOLINE, "retpoline at: %pS (%px) len: %d to: %pS",
+ addr, addr, insn.length,
+ addr + insn.length + insn.immediate.value);
+
+ len = patch_retpoline(addr, &insn, bytes);
+ if (len == insn.length) {
+ optimize_nops(addr, bytes, len);
+ DUMP_BYTES(RETPOLINE, ((u8*)addr), len, "%px: orig: ", addr);
+ DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr);
+ text_poke_early(addr, bytes, len);
+ }
}
}
-#ifdef CONFIG_SMP
+#ifdef CONFIG_MITIGATION_RETHUNK
-static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
+bool cpu_wants_rethunk(void)
{
- u8 **ptr;
+ return cpu_feature_enabled(X86_FEATURE_RETHUNK);
+}
- mutex_lock(&text_mutex);
- for (ptr = start; ptr < end; ptr++) {
- if (*ptr < text)
+bool cpu_wants_rethunk_at(void *addr)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_RETHUNK))
+ return false;
+ if (x86_return_thunk != its_return_thunk)
+ return true;
+
+ return !((unsigned long)addr & 0x20);
+}
+
+/*
+ * Rewrite the compiler generated return thunk tail-calls.
+ *
+ * For example, convert:
+ *
+ * JMP __x86_return_thunk
+ *
+ * into:
+ *
+ * RET
+ */
+static int patch_return(void *addr, struct insn *insn, u8 *bytes)
+{
+ int i = 0;
+
+ /* Patch the custom return thunks... */
+ if (cpu_wants_rethunk_at(addr)) {
+ i = JMP32_INSN_SIZE;
+ __text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i);
+ } else {
+ /* ... or patch them out if not needed. */
+ bytes[i++] = RET_INSN_OPCODE;
+ }
+
+ for (; i < insn->length;)
+ bytes[i++] = INT3_INSN_OPCODE;
+ return i;
+}
+
+void __init_or_module noinline apply_returns(s32 *start, s32 *end)
+{
+ s32 *s;
+
+ if (cpu_wants_rethunk())
+ static_call_force_reinit();
+
+ for (s = start; s < end; s++) {
+ void *dest = NULL, *addr = (void *)s + *s;
+ struct insn insn;
+ int len, ret;
+ u8 bytes[16];
+ u8 op;
+
+ ret = insn_decode_kernel(&insn, addr);
+ if (WARN_ON_ONCE(ret < 0))
continue;
- if (*ptr > text_end)
+
+ op = insn.opcode.bytes[0];
+ if (op == JMP32_INSN_OPCODE)
+ dest = addr + insn.length + insn.immediate.value;
+
+ if (__static_call_fixup(addr, op, dest) ||
+ WARN_ONCE(dest != &__x86_return_thunk,
+ "missing return thunk: %pS-%pS: %*ph",
+ addr, dest, 5, addr))
continue;
- /* turn DS segment override prefix into lock prefix */
- text_poke(*ptr, ((unsigned char []){0xf0}), 1);
- };
- mutex_unlock(&text_mutex);
+
+ DPRINTK(RET, "return thunk at: %pS (%px) len: %d to: %pS",
+ addr, addr, insn.length,
+ addr + insn.length + insn.immediate.value);
+
+ len = patch_return(addr, &insn, bytes);
+ if (len == insn.length) {
+ DUMP_BYTES(RET, ((u8*)addr), len, "%px: orig: ", addr);
+ DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr);
+ text_poke_early(addr, bytes, len);
+ }
+ }
+}
+#else /* !CONFIG_MITIGATION_RETHUNK: */
+void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
+#endif /* !CONFIG_MITIGATION_RETHUNK */
+
+#else /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */
+
+void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { }
+void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
+
+#endif /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */
+
+#ifdef CONFIG_X86_KERNEL_IBT
+
+__noendbr bool is_endbr(u32 *val)
+{
+ u32 endbr;
+
+ __get_kernel_nofault(&endbr, val, u32, Efault);
+ return __is_endbr(endbr);
+
+Efault:
+ return false;
+}
+
+#ifdef CONFIG_FINEIBT
+
+static __noendbr bool exact_endbr(u32 *val)
+{
+ u32 endbr;
+
+ __get_kernel_nofault(&endbr, val, u32, Efault);
+ return endbr == gen_endbr();
+
+Efault:
+ return false;
}
-static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
+#endif
+
+static void poison_cfi(void *addr);
+
+static void __init_or_module poison_endbr(void *addr)
{
- u8 **ptr;
+ u32 poison = gen_endbr_poison();
- if (noreplace_smp)
+ if (WARN_ON_ONCE(!is_endbr(addr)))
return;
- mutex_lock(&text_mutex);
- for (ptr = start; ptr < end; ptr++) {
- if (*ptr < text)
+ DPRINTK(ENDBR, "ENDBR at: %pS (%px)", addr, addr);
+
+ /*
+ * When we have IBT, the lack of ENDBR will trigger #CP
+ */
+ DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr);
+ DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr);
+ text_poke_early(addr, &poison, 4);
+}
+
+/*
+ * Generated by: objtool --ibt
+ *
+ * Seal the functions for indirect calls by clobbering the ENDBR instructions
+ * and the kCFI hash value.
+ */
+void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end)
+{
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+
+ poison_endbr(addr);
+ if (IS_ENABLED(CONFIG_FINEIBT))
+ poison_cfi(addr - 16);
+ }
+}
+
+#else /* !CONFIG_X86_KERNEL_IBT: */
+
+void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { }
+
+#endif /* !CONFIG_X86_KERNEL_IBT */
+
+#ifdef CONFIG_CFI_AUTO_DEFAULT
+# define __CFI_DEFAULT CFI_AUTO
+#elif defined(CONFIG_CFI)
+# define __CFI_DEFAULT CFI_KCFI
+#else
+# define __CFI_DEFAULT CFI_OFF
+#endif
+
+enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT;
+static bool cfi_debug __ro_after_init;
+
+#ifdef CONFIG_FINEIBT_BHI
+bool cfi_bhi __ro_after_init = false;
+#endif
+
+#ifdef CONFIG_CFI
+u32 cfi_get_func_hash(void *func)
+{
+ u32 hash;
+
+ func -= cfi_get_offset();
+ switch (cfi_mode) {
+ case CFI_FINEIBT:
+ func += 7;
+ break;
+ case CFI_KCFI:
+ func += 1;
+ break;
+ default:
+ return 0;
+ }
+
+ if (get_kernel_nofault(hash, func))
+ return 0;
+
+ return hash;
+}
+
+int cfi_get_func_arity(void *func)
+{
+ bhi_thunk *target;
+ s32 disp;
+
+ if (cfi_mode != CFI_FINEIBT && !cfi_bhi)
+ return 0;
+
+ if (get_kernel_nofault(disp, func - 4))
+ return 0;
+
+ target = func + disp;
+ return target - __bhi_args;
+}
+#endif
+
+#ifdef CONFIG_FINEIBT
+
+static bool cfi_rand __ro_after_init = true;
+static u32 cfi_seed __ro_after_init;
+
+/*
+ * Re-hash the CFI hash with a boot-time seed while making sure the result is
+ * not a valid ENDBR instruction.
+ */
+static u32 cfi_rehash(u32 hash)
+{
+ hash ^= cfi_seed;
+ while (unlikely(__is_endbr(hash) || __is_endbr(-hash))) {
+ bool lsb = hash & 1;
+ hash >>= 1;
+ if (lsb)
+ hash ^= 0x80200003;
+ }
+ return hash;
+}
+
+static __init int cfi_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ while (str) {
+ char *next = strchr(str, ',');
+ if (next) {
+ *next = 0;
+ next++;
+ }
+
+ if (!strcmp(str, "auto")) {
+ cfi_mode = CFI_AUTO;
+ } else if (!strcmp(str, "off")) {
+ cfi_mode = CFI_OFF;
+ cfi_rand = false;
+ } else if (!strcmp(str, "debug")) {
+ cfi_debug = true;
+ } else if (!strcmp(str, "kcfi")) {
+ cfi_mode = CFI_KCFI;
+ } else if (!strcmp(str, "fineibt")) {
+ cfi_mode = CFI_FINEIBT;
+ } else if (!strcmp(str, "norand")) {
+ cfi_rand = false;
+ } else if (!strcmp(str, "warn")) {
+ pr_alert("CFI: mismatch non-fatal!\n");
+ cfi_warn = true;
+ } else if (!strcmp(str, "paranoid")) {
+ if (cfi_mode == CFI_FINEIBT) {
+ cfi_paranoid = true;
+ } else {
+ pr_err("CFI: ignoring paranoid; depends on fineibt.\n");
+ }
+ } else if (!strcmp(str, "bhi")) {
+#ifdef CONFIG_FINEIBT_BHI
+ if (cfi_mode == CFI_FINEIBT) {
+ cfi_bhi = true;
+ } else {
+ pr_err("CFI: ignoring bhi; depends on fineibt.\n");
+ }
+#else
+ pr_err("CFI: ignoring bhi; depends on FINEIBT_BHI=y.\n");
+#endif
+ } else {
+ pr_err("CFI: Ignoring unknown option (%s).", str);
+ }
+
+ str = next;
+ }
+
+ return 0;
+}
+early_param("cfi", cfi_parse_cmdline);
+
+/*
+ * kCFI                                            FineIBT
+ *
+ * __cfi_\func:                                    __cfi_\func:
+ *    movl $0x12345678,%eax            // 5           endbr64                    // 4
+ *    nop                                             subl $0x12345678,%eax      // 5
+ *    nop                                             jne.d32,pn \func+3         // 7
+ *    nop
+ *    nop
+ *    nop
+ *    nop
+ *    nop
+ *    nop
+ *    nop
+ *    nop
+ *    nop
+ * \func:                                          \func:
+ *    endbr64                                         nopl -42(%rax)
+ *
+ *
+ * caller:                                         caller:
+ *    movl $(-0x12345678),%r10d        // 6           movl $0x12345678,%eax      // 5
+ *    addl $-15(%r11),%r10d            // 4           lea  -0x10(%r11),%r11      // 4
+ *    je   1f                          // 2           nop5                       // 5
+ *    ud2                              // 2
+ * 1: cs call __x86_indirect_thunk_r11 // 6           call *%r11; nop3;          // 6
+ *
+ *
+ * Notably, the FineIBT sequences are crafted such that branches are presumed
+ * non-taken. This is based on Agner Fog's optimization manual, which states:
+ *
+ * "Make conditional jumps most often not taken: The efficiency and throughput
+ * for not-taken branches is better than for taken branches on most
+ * processors. Therefore, it is good to place the most frequent branch first"
+ */
+
+/*
+ * <fineibt_preamble_start>:
+ * 0: f3 0f 1e fa endbr64
+ * 4: 2d 78 56 34 12 sub $0x12345678, %eax
+ * 9: 2e 0f 85 03 00 00 00 jne,pn 13 <fineibt_preamble_start+0x13>
+ * 10: 0f 1f 40 d6 nopl -0x2a(%rax)
+ *
+ * Note that the JNE target is the 0xD6 byte inside the NOPL; this decodes as
+ * UDB on x86_64 and raises #UD.
+ */
+asm( ".pushsection .rodata \n"
+ "fineibt_preamble_start: \n"
+ " endbr64 \n"
+ " subl $0x12345678, %eax \n"
+ "fineibt_preamble_bhi: \n"
+ " cs jne.d32 fineibt_preamble_start+0x13 \n"
+ "#fineibt_func: \n"
+ " nopl -42(%rax) \n"
+ "fineibt_preamble_end: \n"
+ ".popsection\n"
+);
+
+extern u8 fineibt_preamble_start[];
+extern u8 fineibt_preamble_bhi[];
+extern u8 fineibt_preamble_end[];
+
+#define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start)
+#define fineibt_preamble_bhi (fineibt_preamble_bhi - fineibt_preamble_start)
+#define fineibt_preamble_ud 0x13
+#define fineibt_preamble_hash 5
+
+/*
+ * <fineibt_caller_start>:
+ * 0: b8 78 56 34 12 mov $0x12345678, %eax
+ * 5: 4d 8d 5b f0 lea -0x10(%r11), %r11
+ * 9: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
+ */
+asm( ".pushsection .rodata \n"
+ "fineibt_caller_start: \n"
+ " movl $0x12345678, %eax \n"
+ " lea -0x10(%r11), %r11 \n"
+ ASM_NOP5
+ "fineibt_caller_end: \n"
+ ".popsection \n"
+);
+
+extern u8 fineibt_caller_start[];
+extern u8 fineibt_caller_end[];
+
+#define fineibt_caller_size (fineibt_caller_end - fineibt_caller_start)
+#define fineibt_caller_hash 1
+
+#define fineibt_caller_jmp (fineibt_caller_size - 2)
+
+/*
+ * Since FineIBT does hash validation on the callee side it is prone to
+ * circumvention attacks where a 'naked' ENDBR instruction exists that
+ * is not part of the fineibt_preamble sequence.
+ *
+ * Notably the x86 entry points must be ENDBR and equally cannot be
+ * fineibt_preamble.
+ *
+ * The fineibt_paranoid caller sequence adds additional caller side
+ * hash validation. This stops such circumvention attacks dead, but at the cost
+ * of adding a load.
+ *
+ * <fineibt_paranoid_start>:
+ * 0: b8 78 56 34 12 mov $0x12345678, %eax
+ * 5: 41 3b 43 f5 cmp -0x11(%r11), %eax
+ * 9: 2e 4d 8d 5b <f0> cs lea -0x10(%r11), %r11
+ * e: 75 fd jne d <fineibt_paranoid_start+0xd>
+ * 10: 41 ff d3 call *%r11
+ * 13: 90 nop
+ *
+ * Notably LEA does not modify flags and can be reordered with the CMP,
+ * avoiding a dependency. Again, using a non-taken (backwards) branch
+ * for the failure case, abusing the LEA's 0xf0 displacement byte as a LOCK prefix for the
+ * Jcc.d8, causing #UD.
+ */
+asm( ".pushsection .rodata \n"
+ "fineibt_paranoid_start: \n"
+ " mov $0x12345678, %eax \n"
+ " cmpl -11(%r11), %eax \n"
+ " cs lea -0x10(%r11), %r11 \n"
+ "#fineibt_caller_size: \n"
+ " jne fineibt_paranoid_start+0xd \n"
+ "fineibt_paranoid_ind: \n"
+ " cs call *%r11 \n"
+ "fineibt_paranoid_end: \n"
+ ".popsection \n"
+);
+
+extern u8 fineibt_paranoid_start[];
+extern u8 fineibt_paranoid_ind[];
+extern u8 fineibt_paranoid_end[];
+
+#define fineibt_paranoid_size (fineibt_paranoid_end - fineibt_paranoid_start)
+#define fineibt_paranoid_ind (fineibt_paranoid_ind - fineibt_paranoid_start)
+#define fineibt_paranoid_ud 0xd
+
+static u32 decode_preamble_hash(void *addr, int *reg)
+{
+ u8 *p = addr;
+
+ /* b8+reg 78 56 34 12 movl $0x12345678,\reg */
+ if (p[0] >= 0xb8 && p[0] < 0xc0) {
+ if (reg)
+ *reg = p[0] - 0xb8;
+ return *(u32 *)(addr + 1);
+ }
+
+ return 0; /* invalid hash value */
+}
+
+static u32 decode_caller_hash(void *addr)
+{
+ u8 *p = addr;
+
+ /* 41 ba 88 a9 cb ed mov $(-0x12345678),%r10d */
+ if (p[0] == 0x41 && p[1] == 0xba)
+ return -*(u32 *)(addr + 2);
+
+ /* eb 0c 88 a9 cb ed jmp.d8 +12 */
+ if (p[0] == JMP8_INSN_OPCODE && p[1] == fineibt_caller_jmp)
+ return -*(u32 *)(addr + 2);
+
+ return 0; /* invalid hash value */
+}
+
+/* .retpoline_sites */
+static int cfi_disable_callers(s32 *start, s32 *end)
+{
+ /*
+ * Disable kCFI by patching in a JMP.d8; this leaves the hash immediate
+ * intact for later use. Also see decode_caller_hash() and
+ * cfi_rewrite_callers().
+ */
+ const u8 jmp[] = { JMP8_INSN_OPCODE, fineibt_caller_jmp };
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+ u32 hash;
+
+ addr -= fineibt_caller_size;
+ hash = decode_caller_hash(addr);
+ if (!hash) /* nocfi callers */
+ continue;
+
+ text_poke_early(addr, jmp, 2);
+ }
+
+ return 0;
+}
+
+static int cfi_enable_callers(s32 *start, s32 *end)
+{
+ /*
+ * Re-enable kCFI, undo what cfi_disable_callers() did.
+ */
+ const u8 mov[] = { 0x41, 0xba };
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+ u32 hash;
+
+ addr -= fineibt_caller_size;
+ hash = decode_caller_hash(addr);
+ if (!hash) /* nocfi callers */
+ continue;
+
+ text_poke_early(addr, mov, 2);
+ }
+
+ return 0;
+}
+
+/* .cfi_sites */
+static int cfi_rand_preamble(s32 *start, s32 *end)
+{
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+ u32 hash;
+
+ hash = decode_preamble_hash(addr, NULL);
+ if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
+ addr, addr, 5, addr))
+ return -EINVAL;
+
+ hash = cfi_rehash(hash);
+ text_poke_early(addr + 1, &hash, 4);
+ }
+
+ return 0;
+}
+
+/*
+ * Inline the bhi-arity 1 case:
+ *
+ * __cfi_foo:
+ * 0: f3 0f 1e fa endbr64
+ * 4: 2d 78 56 34 12 sub $0x12345678, %eax
+ * 9: 49 0f 45 fa cmovne %rax, %rdi
+ * d: 2e 75 03 jne,pn foo+0x3
+ *
+ * foo:
+ * 10: 0f 1f 40 <d6> nopl -42(%rax)
+ *
+ * Notably, this scheme is incompatible with permissive CFI
+ * because the CMOVcc is unconditional and RDI will have been
+ * clobbered.
+ */
+asm( ".pushsection .rodata \n"
+ "fineibt_bhi1_start: \n"
+ " cmovne %rax, %rdi \n"
+ " cs jne fineibt_bhi1_func + 0x3 \n"
+ "fineibt_bhi1_func: \n"
+ " nopl -42(%rax) \n"
+ "fineibt_bhi1_end: \n"
+ ".popsection \n"
+);
+
+extern u8 fineibt_bhi1_start[];
+extern u8 fineibt_bhi1_end[];
+
+#define fineibt_bhi1_size (fineibt_bhi1_end - fineibt_bhi1_start)
+
+static void cfi_fineibt_bhi_preamble(void *addr, int arity)
+{
+ u8 bytes[MAX_INSN_SIZE];
+
+ if (!arity)
+ return;
+
+ if (!cfi_warn && arity == 1) {
+ text_poke_early(addr + fineibt_preamble_bhi,
+ fineibt_bhi1_start, fineibt_bhi1_size);
+ return;
+ }
+
+ /*
+ * Replace the bytes at fineibt_preamble_bhi with a CALL instruction
+ * that lines up exactly with the end of the preamble, such that the
+ * return address will be foo+0.
+ *
+ * __cfi_foo:
+ * 0: f3 0f 1e fa endbr64
+ * 4: 2d 78 56 34 12 sub $0x12345678, %eax
+ * 9: 2e 2e e8 DD DD DD DD cs cs call __bhi_args[arity]
+ */
+ bytes[0] = 0x2e;
+ bytes[1] = 0x2e;
+ __text_gen_insn(bytes + 2, CALL_INSN_OPCODE,
+ addr + fineibt_preamble_bhi + 2,
+ __bhi_args[arity], CALL_INSN_SIZE);
+
+ text_poke_early(addr + fineibt_preamble_bhi, bytes, 7);
+}
+
+static int cfi_rewrite_preamble(s32 *start, s32 *end)
+{
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+ int arity;
+ u32 hash;
+
+ /*
+ * When the function doesn't start with ENDBR, the compiler will
+ * have determined there are no indirect calls to it and we
+ * don't need CFI either.
+ */
+ if (!is_endbr(addr + 16))
+ continue;
+
+ hash = decode_preamble_hash(addr, &arity);
+ if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
+ addr, addr, 5, addr))
+ return -EINVAL;
+
+ text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size);
+ WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678);
+ text_poke_early(addr + fineibt_preamble_hash, &hash, 4);
+
+ WARN_ONCE(!IS_ENABLED(CONFIG_FINEIBT_BHI) && arity,
+ "kCFI preamble has wrong register at: %pS %*ph\n",
+ addr, 5, addr);
+
+ if (cfi_bhi)
+ cfi_fineibt_bhi_preamble(addr, arity);
+ }
+
+ return 0;
+}
+
+static void cfi_rewrite_endbr(s32 *start, s32 *end)
+{
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+
+ if (!exact_endbr(addr + 16))
+ continue;
+
+ poison_endbr(addr + 16);
+ }
+}
+
+/* .retpoline_sites */
+static int cfi_rand_callers(s32 *start, s32 *end)
+{
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+ u32 hash;
+
+ addr -= fineibt_caller_size;
+ hash = decode_caller_hash(addr);
+ if (hash) {
+ hash = -cfi_rehash(hash);
+ text_poke_early(addr + 2, &hash, 4);
+ }
+ }
+
+ return 0;
+}
+
+static int emit_paranoid_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes)
+{
+ u8 *thunk = (void *)__x86_indirect_its_thunk_array[reg] - 2;
+
+#ifdef CONFIG_MITIGATION_ITS
+ u8 *tmp = its_allocate_thunk(reg);
+ if (tmp)
+ thunk = tmp;
+#endif
+
+ return __emit_trampoline(addr, insn, bytes, thunk, thunk);
+}
+
+static int cfi_rewrite_callers(s32 *start, s32 *end)
+{
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+ struct insn insn;
+ u8 bytes[20];
+ u32 hash;
+ int ret;
+ u8 op;
+
+ addr -= fineibt_caller_size;
+ hash = decode_caller_hash(addr);
+ if (!hash)
+ continue;
+
+ if (!cfi_paranoid) {
+ text_poke_early(addr, fineibt_caller_start, fineibt_caller_size);
+ WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678);
+ text_poke_early(addr + fineibt_caller_hash, &hash, 4);
+ /* rely on apply_retpolines() */
+ continue;
+ }
+
+ /* cfi_paranoid */
+ ret = insn_decode_kernel(&insn, addr + fineibt_caller_size);
+ if (WARN_ON_ONCE(ret < 0))
+ continue;
+
+ op = insn.opcode.bytes[0];
+ if (op != CALL_INSN_OPCODE && op != JMP32_INSN_OPCODE) {
+ WARN_ON_ONCE(1);
+ continue;
+ }
+
+ memcpy(bytes, fineibt_paranoid_start, fineibt_paranoid_size);
+ memcpy(bytes + fineibt_caller_hash, &hash, 4);
+
+ if (cpu_wants_indirect_its_thunk_at((unsigned long)addr + fineibt_paranoid_ind, 11)) {
+ emit_paranoid_trampoline(addr + fineibt_caller_size,
+ &insn, 11, bytes + fineibt_caller_size);
+ } else {
+ int len = fineibt_paranoid_size - fineibt_paranoid_ind;
+ ret = emit_indirect(op, 11, bytes + fineibt_paranoid_ind, len);
+ if (WARN_ON_ONCE(ret != len))
+ continue;
+ }
+
+ text_poke_early(addr, bytes, fineibt_paranoid_size);
+ }
+
+ return 0;
+}
+
+#define pr_cfi_debug(X...) if (cfi_debug) pr_info(X)
+
+#define FINEIBT_WARN(_f, _v) \
+ WARN_ONCE((_f) != (_v), "FineIBT: " #_f " %ld != %d\n", _f, _v)
+
+static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
+ s32 *start_cfi, s32 *end_cfi, bool builtin)
+{
+ int ret;
+
+ if (FINEIBT_WARN(fineibt_preamble_size, 20) ||
+ FINEIBT_WARN(fineibt_preamble_bhi + fineibt_bhi1_size, 20) ||
+ FINEIBT_WARN(fineibt_caller_size, 14) ||
+ FINEIBT_WARN(fineibt_paranoid_size, 20))
+ return;
+
+ if (cfi_mode == CFI_AUTO) {
+ cfi_mode = CFI_KCFI;
+ if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT)) {
+ /*
+ * FRED has much saner context on exception entry and
+ * is less easy to take advantage of.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_FRED))
+ cfi_paranoid = true;
+ cfi_mode = CFI_FINEIBT;
+ }
+ }
+
+ /*
+ * Rewrite the callers to not use the __cfi_ stubs, such that we might
+ * rewrite them. This disables all CFI. If this succeeds but any of the
+ * later stages fails, we're without CFI.
+ */
+ pr_cfi_debug("CFI: disabling all indirect call checking\n");
+ ret = cfi_disable_callers(start_retpoline, end_retpoline);
+ if (ret)
+ goto err;
+
+ if (cfi_rand) {
+ if (builtin) {
+ cfi_seed = get_random_u32();
+ cfi_bpf_hash = cfi_rehash(cfi_bpf_hash);
+ cfi_bpf_subprog_hash = cfi_rehash(cfi_bpf_subprog_hash);
+ }
+ pr_cfi_debug("CFI: cfi_seed: 0x%08x\n", cfi_seed);
+
+ pr_cfi_debug("CFI: rehashing all preambles\n");
+ ret = cfi_rand_preamble(start_cfi, end_cfi);
+ if (ret)
+ goto err;
+
+ pr_cfi_debug("CFI: rehashing all indirect calls\n");
+ ret = cfi_rand_callers(start_retpoline, end_retpoline);
+ if (ret)
+ goto err;
+ } else {
+ pr_cfi_debug("CFI: rehashing disabled\n");
+ }
+
+ switch (cfi_mode) {
+ case CFI_OFF:
+ if (builtin)
+ pr_info("CFI: disabled\n");
+ return;
+
+ case CFI_KCFI:
+ pr_cfi_debug("CFI: re-enabling all indirect call checking\n");
+ ret = cfi_enable_callers(start_retpoline, end_retpoline);
+ if (ret)
+ goto err;
+
+ if (builtin)
+ pr_info("CFI: Using %sretpoline kCFI\n",
+ cfi_rand ? "rehashed " : "");
+ return;
+
+ case CFI_FINEIBT:
+ pr_cfi_debug("CFI: adding FineIBT to all preambles\n");
+ /* place the FineIBT preamble at func()-16 */
+ ret = cfi_rewrite_preamble(start_cfi, end_cfi);
+ if (ret)
+ goto err;
+
+ /* rewrite the callers to target func()-16 */
+ pr_cfi_debug("CFI: rewriting indirect call sites to use FineIBT\n");
+ ret = cfi_rewrite_callers(start_retpoline, end_retpoline);
+ if (ret)
+ goto err;
+
+ /* now that nobody targets func()+0, remove ENDBR there */
+ pr_cfi_debug("CFI: removing old endbr insns\n");
+ cfi_rewrite_endbr(start_cfi, end_cfi);
+
+ if (builtin) {
+ pr_info("Using %sFineIBT%s CFI\n",
+ cfi_paranoid ? "paranoid " : "",
+ cfi_bhi ? "+BHI" : "");
+ }
+ return;
+
+ default:
+ break;
+ }
+
+err:
+ pr_err("Something went horribly wrong trying to rewrite the CFI implementation.\n");
+}
+
+static inline void poison_hash(void *addr)
+{
+ *(u32 *)addr = 0;
+}
+
+static void poison_cfi(void *addr)
+{
+ /*
+ * Compilers manage to be inconsistent with ENDBR vs __cfi prefixes,
+ * some (static) functions for which they can determine the address
+ * is never taken do not get a __cfi prefix, but *DO* get an ENDBR.
+ *
+ * As such, these functions will get sealed, but we need to be careful
+ * to not unconditionally scribble the previous function.
+ */
+ switch (cfi_mode) {
+ case CFI_FINEIBT:
+ /*
+ * FineIBT prefix should start with an ENDBR.
+ */
+ if (!is_endbr(addr))
+ break;
+
+ /*
+ * __cfi_\func:
+ * nopl -42(%rax)
+ * sub $0, %eax
+ * jne \func+3
+ * \func:
+ * nopl -42(%rax)
+ */
+ poison_endbr(addr);
+ poison_hash(addr + fineibt_preamble_hash);
+ break;
+
+ case CFI_KCFI:
+ /*
+ * kCFI prefix should start with a valid hash.
+ */
+ if (!decode_preamble_hash(addr, NULL))
+ break;
+
+ /*
+ * __cfi_\func:
+ * movl $0, %eax
+ * .skip 11, 0x90
+ */
+ poison_hash(addr + 1);
+ break;
+
+ default:
+ break;
+ }
+}
+
+#define fineibt_prefix_size (fineibt_preamble_size - ENDBR_INSN_SIZE)
+
+/*
+ * When regs->ip points to a 0xD6 byte in the FineIBT preamble,
+ * return true and fill out target and type.
+ *
+ * We check the preamble by checking for the ENDBR instruction relative to the
+ * UDB instruction.
+ */
+static bool decode_fineibt_preamble(struct pt_regs *regs, unsigned long *target, u32 *type)
+{
+ unsigned long addr = regs->ip - fineibt_preamble_ud;
+ u32 hash;
+
+ if (!exact_endbr((void *)addr))
+ return false;
+
+ *target = addr + fineibt_prefix_size;
+
+ __get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault);
+ *type = (u32)regs->ax + hash;
+
+ /*
+ * Since regs->ip points to the middle of an instruction; it cannot
+ * continue with the normal fixup.
+ */
+ regs->ip = *target;
+
+ return true;
+
+Efault:
+ return false;
+}
+
+/*
+ * regs->ip points to one of the UD2 in __bhi_args[].
+ */
+static bool decode_fineibt_bhi(struct pt_regs *regs, unsigned long *target, u32 *type)
+{
+ unsigned long addr;
+ u32 hash;
+
+ if (!cfi_bhi)
+ return false;
+
+ if (regs->ip < (unsigned long)__bhi_args ||
+ regs->ip >= (unsigned long)__bhi_args_end)
+ return false;
+
+ /*
+ * Fetch the return address from the stack, this points to the
+ * FineIBT preamble. Since the CALL instruction is in the last 5
+ * bytes of the preamble, the return address is in fact the target
+ * address.
+ */
+ __get_kernel_nofault(&addr, regs->sp, unsigned long, Efault);
+ *target = addr;
+
+ addr -= fineibt_prefix_size;
+ if (!exact_endbr((void *)addr))
+ return false;
+
+ __get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault);
+ *type = (u32)regs->ax + hash;
+
+ /*
+ * The UD2 sites are constructed with a RET immediately following,
+ * as such the non-fatal case can use the regular fixup.
+ */
+ return true;
+
+Efault:
+ return false;
+}
+
+static bool is_paranoid_thunk(unsigned long addr)
+{
+ u32 thunk;
+
+ __get_kernel_nofault(&thunk, (u32 *)addr, u32, Efault);
+ return (thunk & 0x00FFFFFF) == 0xfd75d6;
+
+Efault:
+ return false;
+}
+
+/*
+ * regs->ip points to a LOCK Jcc.d8 instruction from the fineibt_paranoid_start[]
+ * sequence, or to UDB + Jcc.d8 for cfi_paranoid + ITS thunk.
+ */
+static bool decode_fineibt_paranoid(struct pt_regs *regs, unsigned long *target, u32 *type)
+{
+ unsigned long addr = regs->ip - fineibt_paranoid_ud;
+
+ if (!cfi_paranoid)
+ return false;
+
+ if (is_cfi_trap(addr + fineibt_caller_size - LEN_UD2)) {
+ *target = regs->r11 + fineibt_prefix_size;
+ *type = regs->ax;
+
+ /*
+ * Since the trapping instruction is the exact, but LOCK prefixed,
+ * Jcc.d8 that got us here, the normal fixup will work.
+ */
+ return true;
+ }
+
+ /*
+ * The cfi_paranoid + ITS thunk combination results in:
+ *
+ * 0: b8 78 56 34 12 mov $0x12345678, %eax
+ * 5: 41 3b 43 f7 cmp -11(%r11), %eax
+ * a: 2e 3d 8d 5b f0 cs lea -0x10(%r11), %r11
+ * e: 2e e8 XX XX XX XX cs call __x86_indirect_paranoid_thunk_r11
+ *
+ * Where the paranoid_thunk looks like:
+ *
+ * 1d: <d6> udb
+ * __x86_indirect_paranoid_thunk_r11:
+ * 1e: 75 fd jne 1d
+ * __x86_indirect_its_thunk_r11:
+ * 20: 41 ff eb jmp *%r11
+ * 23: cc int3
+ *
+ */
+ if (is_paranoid_thunk(regs->ip)) {
+ *target = regs->r11 + fineibt_prefix_size;
+ *type = regs->ax;
+
+ regs->ip = *target;
+ return true;
+ }
+
+ return false;
+}
+
+bool decode_fineibt_insn(struct pt_regs *regs, unsigned long *target, u32 *type)
+{
+ if (decode_fineibt_paranoid(regs, target, type))
+ return true;
+
+ if (decode_fineibt_bhi(regs, target, type))
+ return true;
+
+ return decode_fineibt_preamble(regs, target, type);
+}
+
+#else /* !CONFIG_FINEIBT: */
+
+static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
+ s32 *start_cfi, s32 *end_cfi, bool builtin)
+{
+ if (IS_ENABLED(CONFIG_CFI) && builtin)
+ pr_info("CFI: Using standard kCFI\n");
+}
+
+#ifdef CONFIG_X86_KERNEL_IBT
+static void poison_cfi(void *addr) { }
+#endif
+
+#endif /* !CONFIG_FINEIBT */
+
+void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
+ s32 *start_cfi, s32 *end_cfi)
+{
+ return __apply_fineibt(start_retpoline, end_retpoline,
+ start_cfi, end_cfi,
+ /* .builtin = */ false);
+}
+
+#ifdef CONFIG_SMP
+static void alternatives_smp_lock(const s32 *start, const s32 *end,
+ u8 *text, u8 *text_end)
+{
+ const s32 *poff;
+
+ for (poff = start; poff < end; poff++) {
+ u8 *ptr = (u8 *)poff + *poff;
+
+ if (!*poff || ptr < text || ptr >= text_end)
continue;
- if (*ptr > text_end)
+ /* turn DS segment override prefix into lock prefix */
+ if (*ptr == 0x3e)
+ text_poke(ptr, ((unsigned char []){0xf0}), 1);
+ }
+}
+
+static void alternatives_smp_unlock(const s32 *start, const s32 *end,
+ u8 *text, u8 *text_end)
+{
+ const s32 *poff;
+
+ for (poff = start; poff < end; poff++) {
+ u8 *ptr = (u8 *)poff + *poff;
+
+ if (!*poff || ptr < text || ptr >= text_end)
continue;
/* turn lock prefix into DS segment override prefix */
- text_poke(*ptr, ((unsigned char []){0x3E}), 1);
- };
- mutex_unlock(&text_mutex);
+ if (*ptr == 0xf0)
+ text_poke(ptr, ((unsigned char []){0x3E}), 1);
+ }
}
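+
+/*
+ * For example (illustrative): on a uniprocessor boot a "lock incl (%rax)"
+ * (f0 ff 00) has its LOCK prefix rewritten to a DS override (3e ff 00) by
+ * alternatives_smp_unlock(), and alternatives_smp_lock() restores the LOCK
+ * prefix when alternatives_enable_smp() switches back to SMP code.
+ */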
struct smp_alt_module {
@@ -266,8 +2098,8 @@ struct smp_alt_module {
char *name;
/* ptrs to lock prefixes */
- u8 **locks;
- u8 **locks_end;
+ const s32 *locks;
+ const s32 *locks_end;
/* .text segment, needed to avoid patching init code ;) */
u8 *text;
@@ -276,28 +2108,27 @@ struct smp_alt_module {
struct list_head next;
};
static LIST_HEAD(smp_alt_modules);
-static DEFINE_MUTEX(smp_alt);
-static int smp_mode = 1; /* protected by smp_alt */
+static bool uniproc_patched = false; /* protected by text_mutex */
-void alternatives_smp_module_add(struct module *mod, char *name,
- void *locks, void *locks_end,
- void *text, void *text_end)
+void __init_or_module alternatives_smp_module_add(struct module *mod,
+ char *name,
+ void *locks, void *locks_end,
+ void *text, void *text_end)
{
struct smp_alt_module *smp;
- if (noreplace_smp)
- return;
+ mutex_lock(&text_mutex);
+ if (!uniproc_patched)
+ goto unlock;
- if (smp_alt_once) {
- if (boot_cpu_has(X86_FEATURE_UP))
- alternatives_smp_unlock(locks, locks_end,
- text, text_end);
- return;
- }
+ if (num_possible_cpus() == 1)
+ /* Don't bother remembering, we'll never have to undo it. */
+ goto smp_unlock;
smp = kzalloc(sizeof(*smp), GFP_KERNEL);
if (NULL == smp)
- return; /* we'll run the (safe but slow) SMP code then ... */
+ /* we'll run the (safe but slow) SMP code then ... */
+ goto unlock;
smp->mod = mod;
smp->name = name;
@@ -305,172 +2136,280 @@ void alternatives_smp_module_add(struct module *mod, char *name,
smp->locks_end = locks_end;
smp->text = text;
smp->text_end = text_end;
- DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n",
- __func__, smp->locks, smp->locks_end,
+ DPRINTK(SMP, "locks %p -> %p, text %p -> %p, name %s\n",
+ smp->locks, smp->locks_end,
smp->text, smp->text_end, smp->name);
- mutex_lock(&smp_alt);
list_add_tail(&smp->next, &smp_alt_modules);
- if (boot_cpu_has(X86_FEATURE_UP))
- alternatives_smp_unlock(smp->locks, smp->locks_end,
- smp->text, smp->text_end);
- mutex_unlock(&smp_alt);
+smp_unlock:
+ alternatives_smp_unlock(locks, locks_end, text, text_end);
+unlock:
+ mutex_unlock(&text_mutex);
}
-void alternatives_smp_module_del(struct module *mod)
+void __init_or_module alternatives_smp_module_del(struct module *mod)
{
struct smp_alt_module *item;
- if (smp_alt_once || noreplace_smp)
- return;
-
- mutex_lock(&smp_alt);
+ mutex_lock(&text_mutex);
list_for_each_entry(item, &smp_alt_modules, next) {
if (mod != item->mod)
continue;
list_del(&item->next);
- mutex_unlock(&smp_alt);
- DPRINTK("%s: %s\n", __func__, item->name);
kfree(item);
- return;
+ break;
}
- mutex_unlock(&smp_alt);
+ mutex_unlock(&text_mutex);
}
-void alternatives_smp_switch(int smp)
+void alternatives_enable_smp(void)
{
struct smp_alt_module *mod;
-#ifdef CONFIG_LOCKDEP
- /*
- * Older binutils section handling bug prevented
- * alternatives-replacement from working reliably.
- *
- * If this still occurs then you should see a hang
- * or crash shortly after this line:
- */
- printk("lockdep: fixing up alternatives.\n");
-#endif
-
- if (noreplace_smp || smp_alt_once)
- return;
- BUG_ON(!smp && (num_online_cpus() > 1));
+ /* Why bother if there are no other CPUs? */
+ BUG_ON(num_possible_cpus() == 1);
- mutex_lock(&smp_alt);
+ mutex_lock(&text_mutex);
- /*
- * Avoid unnecessary switches because it forces JIT based VMs to
- * throw away all cached translations, which can be quite costly.
- */
- if (smp == smp_mode) {
- /* nothing */
- } else if (smp) {
- printk(KERN_INFO "SMP alternatives: switching to SMP code\n");
+ if (uniproc_patched) {
+ pr_info("switching to SMP code\n");
+ BUG_ON(num_online_cpus() != 1);
clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
list_for_each_entry(mod, &smp_alt_modules, next)
alternatives_smp_lock(mod->locks, mod->locks_end,
mod->text, mod->text_end);
- } else {
- printk(KERN_INFO "SMP alternatives: switching to UP code\n");
- set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
- set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
- list_for_each_entry(mod, &smp_alt_modules, next)
- alternatives_smp_unlock(mod->locks, mod->locks_end,
- mod->text, mod->text_end);
+ uniproc_patched = false;
}
- smp_mode = smp;
- mutex_unlock(&smp_alt);
+ mutex_unlock(&text_mutex);
}
-#endif
+/*
+ * Return 1 if the address range is reserved for SMP-alternatives.
+ * Must hold text_mutex.
+ */
+int alternatives_text_reserved(void *start, void *end)
+{
+ struct smp_alt_module *mod;
+ const s32 *poff;
+ u8 *text_start = start;
+ u8 *text_end = end;
+
+ lockdep_assert_held(&text_mutex);
+
+ list_for_each_entry(mod, &smp_alt_modules, next) {
+ if (mod->text > text_end || mod->text_end < text_start)
+ continue;
+ for (poff = mod->locks; poff < mod->locks_end; poff++) {
+ const u8 *ptr = (const u8 *)poff + *poff;
+
+ if (text_start <= ptr && text_end > ptr)
+ return 1;
+ }
+ }
+
+ return 0;
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * Self-test for the INT3 based CALL emulation code.
+ *
+ * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up
+ * properly and that there is a stack gap between the INT3 frame and the
+ * previous context. Without this gap doing a virtual PUSH on the interrupted
+ * stack would corrupt the INT3 IRET frame.
+ *
+ * See entry_{32,64}.S for more details.
+ */
+
+extern void int3_selftest_asm(unsigned int *ptr);
-#ifdef CONFIG_PARAVIRT
-void apply_paravirt(struct paravirt_patch_site *start,
- struct paravirt_patch_site *end)
+asm (
+" .pushsection .init.text, \"ax\", @progbits\n"
+" .type int3_selftest_asm, @function\n"
+"int3_selftest_asm:\n"
+ ANNOTATE_NOENDBR "\n"
+ /*
+ * INT3 padded with NOP to CALL_INSN_SIZE. The INT3 triggers an
+ * exception, then the int3_exception_nb notifier emulates a call to
+ * int3_selftest_callee().
+ */
+" int3; nop; nop; nop; nop\n"
+ ASM_RET
+" .size int3_selftest_asm, . - int3_selftest_asm\n"
+" .popsection\n"
+);
+
+extern void int3_selftest_callee(unsigned int *ptr);
+
+asm (
+" .pushsection .init.text, \"ax\", @progbits\n"
+" .type int3_selftest_callee, @function\n"
+"int3_selftest_callee:\n"
+ ANNOTATE_NOENDBR "\n"
+" movl $0x1234, (%" _ASM_ARG1 ")\n"
+ ASM_RET
+" .size int3_selftest_callee, . - int3_selftest_callee\n"
+" .popsection\n"
+);
+
+extern void int3_selftest_ip(void); /* defined in asm below */
+
+static int __init
+int3_exception_notify(struct notifier_block *self, unsigned long val, void *data)
{
- struct paravirt_patch_site *p;
- char insnbuf[MAX_PATCH_LEN];
+ unsigned long selftest = (unsigned long)&int3_selftest_asm;
+ struct die_args *args = data;
+ struct pt_regs *regs = args->regs;
- if (noreplace_paravirt)
- return;
+ OPTIMIZER_HIDE_VAR(selftest);
- for (p = start; p < end; p++) {
- unsigned int used;
+ if (!regs || user_mode(regs))
+ return NOTIFY_DONE;
- BUG_ON(p->len > MAX_PATCH_LEN);
- /* prep the buffer with the original instructions */
- memcpy(insnbuf, p->instr, p->len);
- used = pv_init_ops.patch(p->instrtype, p->clobbers, insnbuf,
- (unsigned long)p->instr, p->len);
+ if (val != DIE_INT3)
+ return NOTIFY_DONE;
- BUG_ON(used > p->len);
+ if (regs->ip - INT3_INSN_SIZE != selftest)
+ return NOTIFY_DONE;
- /* Pad the rest with nops */
- add_nops(insnbuf + used, p->len - used);
- text_poke_early(p->instr, insnbuf, p->len);
- }
+ int3_emulate_call(regs, (unsigned long)&int3_selftest_callee);
+ return NOTIFY_STOP;
+}
+
+/* Must be noinline to ensure uniqueness of int3_selftest_ip. */
+static noinline void __init int3_selftest(void)
+{
+ static __initdata struct notifier_block int3_exception_nb = {
+ .notifier_call = int3_exception_notify,
+ .priority = INT_MAX-1, /* last */
+ };
+ unsigned int val = 0;
+
+ BUG_ON(register_die_notifier(&int3_exception_nb));
+
+ /*
+ * Basically: int3_selftest_callee(&val); but really complicated :-)
+ */
+ int3_selftest_asm(&val);
+
+ BUG_ON(val != 0x1234);
+
+ unregister_die_notifier(&int3_exception_nb);
+}
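/*
 * Editor's illustrative sketch, not part of this patch: roughly what the
 * int3_emulate_call() helper exercised by the selftest above does with the
 * INT3 pt_regs. The real definition lives in <asm/text-patching.h>; this is
 * only a simplified restatement of the virtual PUSH + JMP it performs, which
 * is why the stack gap described in the comment above is required.
 */
static inline void example_int3_emulate_call(struct pt_regs *regs, unsigned long func)
{
	/* Virtual PUSH of the return address onto the interrupted stack. */
	regs->sp -= sizeof(unsigned long);
	*(unsigned long *)regs->sp = regs->ip - INT3_INSN_SIZE + CALL_INSN_SIZE;

	/* Resume execution at the emulated call target. */
	regs->ip = func;
}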
+
+static __initdata int __alt_reloc_selftest_addr;
+
+extern void __init __alt_reloc_selftest(void *arg);
+__visible noinline void __init __alt_reloc_selftest(void *arg)
+{
+ WARN_ON(arg != &__alt_reloc_selftest_addr);
+}
+
+static noinline void __init alt_reloc_selftest(void)
+{
+ /*
+ * Tests text_poke_apply_relocation().
+ *
+ * This has a relative immediate (CALL) in a place other than the first
+ * instruction and additionally on x86_64 we get a RIP-relative LEA:
+ *
+ * lea 0x0(%rip),%rdi # 5d0: R_X86_64_PC32 .init.data+0x5566c
+ * call +0 # 5d5: R_X86_64_PLT32 __alt_reloc_selftest-0x4
+ *
+ * Getting this wrong will either crash and burn or tickle the WARN
+ * above.
+ */
+ asm_inline volatile (
+ ALTERNATIVE("", "lea %[mem], %%" _ASM_ARG1 "; call __alt_reloc_selftest;", X86_FEATURE_ALWAYS)
+ : ASM_CALL_CONSTRAINT
+ : [mem] "m" (__alt_reloc_selftest_addr)
+ : _ASM_ARG1
+ );
}
-extern struct paravirt_patch_site __start_parainstructions[],
- __stop_parainstructions[];
-#endif /* CONFIG_PARAVIRT */
void __init alternative_instructions(void)
{
- /* The patching is not fully atomic, so try to avoid local interruptions
- that might execute the to be patched code.
- Other CPUs are not running. */
+ u64 ibt;
+
+ int3_selftest();
+
+ /*
+ * The patching is not fully atomic, so try to avoid local
+ * interruptions that might execute the to be patched code.
+ * Other CPUs are not running.
+ */
stop_nmi();
/*
* Don't stop machine check exceptions while patching.
* MCEs only happen when something got corrupted and in this
* case we must do something about the corruption.
- * Ignoring it is worse than a unlikely patching race.
+ * Ignoring it is worse than an unlikely patching race.
* Also machine checks tend to be broadcast and if one CPU
* goes into machine check the others follow quickly, so we don't
	 * expect a machine check to cause undue problems during code
* patching.
*/
+ /*
+ * Make sure to set (artificial) features depending on used paravirt
+ * functions which can later influence alternative patching.
+ */
+ paravirt_set_cap();
+
+ /* Keep CET-IBT disabled until caller/callee are patched */
+ ibt = ibt_save(/*disable*/ true);
+
+ __apply_fineibt(__retpoline_sites, __retpoline_sites_end,
+ __cfi_sites, __cfi_sites_end, true);
+ cfi_debug = false;
+
+ /*
+ * Rewrite the retpolines, must be done before alternatives since
+ * those can rewrite the retpoline thunks.
+ */
+ apply_retpolines(__retpoline_sites, __retpoline_sites_end);
+ apply_returns(__return_sites, __return_sites_end);
+
+ its_fini_core();
+
+ /*
+ * Adjust all CALL instructions to point to func()-10, including
+ * those in .altinstr_replacement.
+ */
+ callthunks_patch_builtin_calls();
+
apply_alternatives(__alt_instructions, __alt_instructions_end);
- /* switch to patch-once-at-boottime-only mode and free the
- * tables in case we know the number of CPUs will never ever
- * change */
-#ifdef CONFIG_HOTPLUG_CPU
- if (num_possible_cpus() < 2)
- smp_alt_once = 1;
-#endif
+ /*
+ * Seal all functions that do not have their address taken.
+ */
+ apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end);
+
+ ibt_restore(ibt);
#ifdef CONFIG_SMP
- if (smp_alt_once) {
- if (1 == num_possible_cpus()) {
- printk(KERN_INFO "SMP alternatives: switching to UP code\n");
- set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
- set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
-
- alternatives_smp_unlock(__smp_locks, __smp_locks_end,
- _text, _etext);
- }
- } else {
+ /* Patch to UP if other cpus not imminent. */
+ if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
+ uniproc_patched = true;
alternatives_smp_module_add(NULL, "core kernel",
__smp_locks, __smp_locks_end,
_text, _etext);
-
- /* Only switch to UP mode if we don't immediately boot others */
- if (num_present_cpus() == 1 || setup_max_cpus <= 1)
- alternatives_smp_switch(0);
}
-#endif
- apply_paravirt(__parainstructions, __parainstructions_end);
- if (smp_alt_once)
+ if (!uniproc_patched || num_possible_cpus() == 1) {
free_init_pages("SMP alternatives",
(unsigned long)__smp_locks,
(unsigned long)__smp_locks_end);
+ }
+#endif
restart_nmi();
+ alternatives_patched = 1;
+
+ alt_reloc_selftest();
}
/**
@@ -482,18 +2421,169 @@ void __init alternative_instructions(void)
* When you use this code to patch more than one byte of an instruction
* you need to make sure that other CPUs cannot execute this code in parallel.
* Also no thread must be currently preempted in the middle of these
- * instructions. And on the local CPU you need to be protected again NMI or MCE
- * handlers seeing an inconsistent instruction while you patch.
+ * instructions. And on the local CPU you need to be protected against NMI or
+ * MCE handlers seeing an inconsistent instruction while you patch.
*/
-void *text_poke_early(void *addr, const void *opcode, size_t len)
+void __init_or_module text_poke_early(void *addr, const void *opcode,
+ size_t len)
{
unsigned long flags;
+
+ if (boot_cpu_has(X86_FEATURE_NX) &&
+ is_module_text_address((unsigned long)addr)) {
+ /*
+ * Modules text is marked initially as non-executable, so the
+ * code cannot be running and speculative code-fetches are
+ * prevented. Just change the code.
+ */
+ memcpy(addr, opcode, len);
+ } else {
+ local_irq_save(flags);
+ memcpy(addr, opcode, len);
+ sync_core();
+ local_irq_restore(flags);
+
+ /*
+ * Could also do a CLFLUSH here to speed up CPU recovery; but
+ * that causes hangs on some VIA CPUs.
+ */
+ }
+}
+
+__ro_after_init struct mm_struct *text_poke_mm;
+__ro_after_init unsigned long text_poke_mm_addr;
+
+/*
+ * Text poking creates and uses a mapping in the lower half of the
+ * address space. Relax LASS enforcement when accessing the poking
+ * address.
+ *
+ * objtool enforces a strict policy of "no function calls within AC=1
+ * regions". Adhere to the policy by using inline versions of
+ * memcpy()/memset() that will never result in a function call.
+ */
+
+static void text_poke_memcpy(void *dst, const void *src, size_t len)
+{
+ lass_stac();
+ __inline_memcpy(dst, src, len);
+ lass_clac();
+}
+
+static void text_poke_memset(void *dst, const void *src, size_t len)
+{
+ int c = *(const int *)src;
+
+ lass_stac();
+ __inline_memset(dst, c, len);
+ lass_clac();
+}
+
+typedef void text_poke_f(void *dst, const void *src, size_t len);
+
+static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len)
+{
+ bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
+ struct page *pages[2] = {NULL};
+ struct mm_struct *prev_mm;
+ unsigned long flags;
+ pte_t pte, *ptep;
+ spinlock_t *ptl;
+ pgprot_t pgprot;
+
+ /*
+	 * While the boot memory allocator is running we cannot use struct pages as
+ * they are not yet initialized. There is no way to recover.
+ */
+ BUG_ON(!after_bootmem);
+
+ if (!core_kernel_text((unsigned long)addr)) {
+ pages[0] = vmalloc_to_page(addr);
+ if (cross_page_boundary)
+ pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
+ } else {
+ pages[0] = virt_to_page(addr);
+ WARN_ON(!PageReserved(pages[0]));
+ if (cross_page_boundary)
+ pages[1] = virt_to_page(addr + PAGE_SIZE);
+ }
+ /*
+ * If something went wrong, crash and burn since recovery paths are not
+ * implemented.
+ */
+ BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));
+
+ /*
+ * Map the page without the global bit, as TLB flushing is done with
+ * flush_tlb_mm_range(), which is intended for non-global PTEs.
+ */
+ pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);
+
+ /*
+	 * The lock is not really needed, but this allows us to avoid open-coding.
+ */
+ ptep = get_locked_pte(text_poke_mm, text_poke_mm_addr, &ptl);
+
+ /*
+ * This must not fail; preallocated in poking_init().
+ */
+ VM_BUG_ON(!ptep);
+
local_irq_save(flags);
- memcpy(addr, opcode, len);
+
+ pte = mk_pte(pages[0], pgprot);
+ set_pte_at(text_poke_mm, text_poke_mm_addr, ptep, pte);
+
+ if (cross_page_boundary) {
+ pte = mk_pte(pages[1], pgprot);
+ set_pte_at(text_poke_mm, text_poke_mm_addr + PAGE_SIZE, ptep + 1, pte);
+ }
+
+ /*
+ * Loading the temporary mm behaves as a compiler barrier, which
+ * guarantees that the PTE will be set at the time memcpy() is done.
+ */
+ prev_mm = use_temporary_mm(text_poke_mm);
+
+ kasan_disable_current();
+ func((u8 *)text_poke_mm_addr + offset_in_page(addr), src, len);
+ kasan_enable_current();
+
+ /*
+	 * Use a compiler barrier to ensure that the PTE is only cleared after
+	 * the memcpy instructions have been issued.
+ */
+ barrier();
+
+ pte_clear(text_poke_mm, text_poke_mm_addr, ptep);
+ if (cross_page_boundary)
+ pte_clear(text_poke_mm, text_poke_mm_addr + PAGE_SIZE, ptep + 1);
+
+ /*
+ * Loading the previous page-table hierarchy requires a serializing
+ * instruction that already allows the core to see the updated version.
+ * Xen-PV is assumed to serialize execution in a similar manner.
+ */
+ unuse_temporary_mm(prev_mm);
+
+ /*
+ * Flushing the TLB might involve IPIs, which would require enabled
+	 * IRQs, but not if the mm is not used, as is the case at this point.
+ */
+ flush_tlb_mm_range(text_poke_mm, text_poke_mm_addr, text_poke_mm_addr +
+ (cross_page_boundary ? 2 : 1) * PAGE_SIZE,
+ PAGE_SHIFT, false);
+
+ if (func == text_poke_memcpy) {
+ /*
+ * If the text does not match what we just wrote then something is
+ * fundamentally screwy; there's nothing we can really do about that.
+ */
+ BUG_ON(memcmp(addr, src, len));
+ }
+
local_irq_restore(flags);
- sync_core();
- /* Could also do a CLFLUSH here to speed up CPU recovery; but
- that causes hangs on some VIA CPUs. */
+ pte_unmap_unlock(ptep, ptl);
return addr;
}
@@ -508,39 +2598,566 @@ void *text_poke_early(void *addr, const void *opcode, size_t len)
* in a way that permits an atomic write. It also makes sure we fit on a single
* page.
*
- * Note: Must be called under text_mutex.
+ * Note that the caller must ensure that if the modified code is part of a
+ * module, the module would not be removed during poking. This can be achieved
+ * by registering a module notifier, and ordering module removal and patching
+ * through a mutex.
*/
-void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
+void *text_poke(void *addr, const void *opcode, size_t len)
{
- unsigned long flags;
- char *vaddr;
- struct page *pages[2];
- int i;
+ lockdep_assert_held(&text_mutex);
- if (!core_kernel_text((unsigned long)addr)) {
- pages[0] = vmalloc_to_page(addr);
- pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
- } else {
- pages[0] = virt_to_page(addr);
- WARN_ON(!PageReserved(pages[0]));
- pages[1] = virt_to_page(addr + PAGE_SIZE);
+ return __text_poke(text_poke_memcpy, addr, opcode, len);
+}
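/*
 * Editor's illustrative sketch, not part of this patch: a minimal text_poke()
 * caller showing the locking that the lockdep assertion above expects. The
 * address is a caller-supplied placeholder.
 */
static void example_poke_int3(void *addr)
{
	u8 int3 = INT3_INSN_OPCODE;

	mutex_lock(&text_mutex);
	text_poke(addr, &int3, INT3_INSN_SIZE);	/* single-byte poke is atomic */
	mutex_unlock(&text_mutex);
}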
+
+/**
+ * text_poke_kgdb - Update instructions on a live kernel by kgdb
+ * @addr: address to modify
+ * @opcode: source of the copy
+ * @len: length to copy
+ *
+ * Only atomic text poke/set should be allowed when not doing early patching.
+ * It means the size must be writable atomically and the address must be aligned
+ * in a way that permits an atomic write. It also makes sure we fit on a single
+ * page.
+ *
+ * Context: should only be used by kgdb, which ensures no other core is running,
+ * despite the fact it does not hold the text_mutex.
+ */
+void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
+{
+ return __text_poke(text_poke_memcpy, addr, opcode, len);
+}
+
+void *text_poke_copy_locked(void *addr, const void *opcode, size_t len,
+ bool core_ok)
+{
+ unsigned long start = (unsigned long)addr;
+ size_t patched = 0;
+
+ if (WARN_ON_ONCE(!core_ok && core_kernel_text(start)))
+ return NULL;
+
+ while (patched < len) {
+ unsigned long ptr = start + patched;
+ size_t s;
+
+ s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
+
+ __text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s);
+ patched += s;
}
- BUG_ON(!pages[0]);
- local_irq_save(flags);
- set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0]));
- if (pages[1])
- set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1]));
- vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0);
- memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len);
- clear_fixmap(FIX_TEXT_POKE0);
- if (pages[1])
- clear_fixmap(FIX_TEXT_POKE1);
- local_flush_tlb();
- sync_core();
- /* Could also do a CLFLUSH here to speed up CPU recovery; but
- that causes hangs on some VIA CPUs. */
- for (i = 0; i < len; i++)
- BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
- local_irq_restore(flags);
return addr;
}
+
+/**
+ * text_poke_copy - Copy instructions into (an unused part of) RX memory
+ * @addr: address to modify
+ * @opcode: source of the copy
+ * @len: length to copy, could be more than 2x PAGE_SIZE
+ *
+ * Not safe against concurrent execution; useful for JITs to dump
+ * new code blocks into unused regions of RX memory. Can be used in
+ * conjunction with synchronize_rcu_tasks() to wait for existing
+ * execution to quiesce after having made sure no existing function
+ * pointers are live.
+ */
+void *text_poke_copy(void *addr, const void *opcode, size_t len)
+{
+ mutex_lock(&text_mutex);
+ addr = text_poke_copy_locked(addr, opcode, len, false);
+ mutex_unlock(&text_mutex);
+ return addr;
+}
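/*
 * Editor's illustrative sketch, not part of this patch: how a JIT might use
 * text_poke_copy() as described above -- dump instructions into an RX region
 * that is not yet reachable and only then publish a pointer to it.
 * jit_alloc_rx() is a hypothetical placeholder for whatever RX allocator the
 * JIT uses.
 */
static void *example_jit_install(const void *insns, size_t len)
{
	void *rx = jit_alloc_rx(len);		/* hypothetical RX allocator */

	if (!rx)
		return NULL;

	text_poke_copy(rx, insns, len);		/* takes text_mutex internally */

	/* Publish 'rx' (e.g. store the function pointer) only after this point. */
	return rx;
}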
+
+/**
+ * text_poke_set - memset into (an unused part of) RX memory
+ * @addr: address to modify
+ * @c: the byte to fill the area with
+ * @len: length to copy, could be more than 2x PAGE_SIZE
+ *
+ * This is useful to overwrite unused regions of RX memory with illegal
+ * instructions.
+ */
+void *text_poke_set(void *addr, int c, size_t len)
+{
+ unsigned long start = (unsigned long)addr;
+ size_t patched = 0;
+
+ if (WARN_ON_ONCE(core_kernel_text(start)))
+ return NULL;
+
+ mutex_lock(&text_mutex);
+ while (patched < len) {
+ unsigned long ptr = start + patched;
+ size_t s;
+
+ s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
+
+ __text_poke(text_poke_memset, (void *)ptr, (void *)&c, s);
+ patched += s;
+ }
+ mutex_unlock(&text_mutex);
+ return addr;
+}
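/*
 * Editor's illustrative sketch, not part of this patch: poisoning a retired
 * RX region with INT3 via text_poke_set(), as the comment above suggests, so
 * that any stale reference traps instead of executing garbage.
 */
static void example_poison_rx(void *addr, size_t len)
{
	text_poke_set(addr, INT3_INSN_OPCODE, len);
}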
+
+static void do_sync_core(void *info)
+{
+ sync_core();
+}
+
+void smp_text_poke_sync_each_cpu(void)
+{
+ on_each_cpu(do_sync_core, NULL, 1);
+}
+
+/*
+ * NOTE: crazy scheme to allow patching Jcc.d32 but not increase the size of
+ * this thing. When len == 6 everything is prefixed with 0x0f and we map
+ * opcode to Jcc.d8, using len to distinguish.
+ */
+struct smp_text_poke_loc {
+ /* addr := _stext + rel_addr */
+ s32 rel_addr;
+ s32 disp;
+ u8 len;
+ u8 opcode;
+ const u8 text[TEXT_POKE_MAX_OPCODE_SIZE];
+ /* see smp_text_poke_batch_finish() */
+ u8 old;
+};
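/*
 * Editor's illustrative sketch, not part of this patch: a worked example of
 * the len == 6 scheme described above, for a hypothetical "JE rel32" site.
 *
 *   JE rel32 encodes as: 0f 84 <imm32>                       (6 bytes)
 *
 * __smp_text_poke_batch_add() drops the leading 0x0f from ->text and maps the
 * opcode to its Jcc.d8 counterpart:
 *
 *   ->len    = 6
 *   ->opcode = 0x84 - 0x10 = 0x74                            (JE rel8)
 *   ->disp   = <imm32>
 *
 * smp_text_poke_batch_finish() re-emits the 0x0f prefix because ->len == 6,
 * and smp_text_poke_int3_handler() emulates the branch via
 * int3_emulate_jcc(regs, 0x74 & 0xf, ip, disp).
 */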
+
+#define TEXT_POKE_ARRAY_MAX (PAGE_SIZE / sizeof(struct smp_text_poke_loc))
+
+static struct smp_text_poke_array {
+ struct smp_text_poke_loc vec[TEXT_POKE_ARRAY_MAX];
+ int nr_entries;
+} text_poke_array;
+
+static DEFINE_PER_CPU(atomic_t, text_poke_array_refs);
+
+/*
+ * These four __always_inline annotations imply noinstr, necessary
+ * due to smp_text_poke_int3_handler() being noinstr:
+ */
+
+static __always_inline bool try_get_text_poke_array(void)
+{
+ atomic_t *refs = this_cpu_ptr(&text_poke_array_refs);
+
+ if (!raw_atomic_inc_not_zero(refs))
+ return false;
+
+ return true;
+}
+
+static __always_inline void put_text_poke_array(void)
+{
+ atomic_t *refs = this_cpu_ptr(&text_poke_array_refs);
+
+ smp_mb__before_atomic();
+ raw_atomic_dec(refs);
+}
+
+static __always_inline void *text_poke_addr(const struct smp_text_poke_loc *tpl)
+{
+ return _stext + tpl->rel_addr;
+}
+
+static __always_inline int patch_cmp(const void *tpl_a, const void *tpl_b)
+{
+ if (tpl_a < text_poke_addr(tpl_b))
+ return -1;
+ if (tpl_a > text_poke_addr(tpl_b))
+ return 1;
+ return 0;
+}
+
+noinstr int smp_text_poke_int3_handler(struct pt_regs *regs)
+{
+ struct smp_text_poke_loc *tpl;
+ int ret = 0;
+ void *ip;
+
+ if (user_mode(regs))
+ return 0;
+
+ /*
+ * Having observed our INT3 instruction, we now must observe
+ * text_poke_array with non-zero refcount:
+ *
+ * text_poke_array_refs = 1 INT3
+ * WMB RMB
+ * write INT3 if (text_poke_array_refs != 0)
+ */
+ smp_rmb();
+
+ if (!try_get_text_poke_array())
+ return 0;
+
+ /*
+ * Discount the INT3. See smp_text_poke_batch_finish().
+ */
+ ip = (void *) regs->ip - INT3_INSN_SIZE;
+
+ /*
+ * Skip the binary search if there is a single member in the vector.
+ */
+ if (unlikely(text_poke_array.nr_entries > 1)) {
+ tpl = __inline_bsearch(ip, text_poke_array.vec, text_poke_array.nr_entries,
+ sizeof(struct smp_text_poke_loc),
+ patch_cmp);
+ if (!tpl)
+ goto out_put;
+ } else {
+ tpl = text_poke_array.vec;
+ if (text_poke_addr(tpl) != ip)
+ goto out_put;
+ }
+
+ ip += tpl->len;
+
+ switch (tpl->opcode) {
+ case INT3_INSN_OPCODE:
+ /*
+ * Someone poked an explicit INT3, they'll want to handle it,
+ * do not consume.
+ */
+ goto out_put;
+
+ case RET_INSN_OPCODE:
+ int3_emulate_ret(regs);
+ break;
+
+ case CALL_INSN_OPCODE:
+ int3_emulate_call(regs, (long)ip + tpl->disp);
+ break;
+
+ case JMP32_INSN_OPCODE:
+ case JMP8_INSN_OPCODE:
+ int3_emulate_jmp(regs, (long)ip + tpl->disp);
+ break;
+
+ case 0x70 ... 0x7f: /* Jcc */
+ int3_emulate_jcc(regs, tpl->opcode & 0xf, (long)ip, tpl->disp);
+ break;
+
+ default:
+ BUG();
+ }
+
+ ret = 1;
+
+out_put:
+ put_text_poke_array();
+ return ret;
+}
+
+/**
+ * smp_text_poke_batch_finish() -- update instructions on live kernel on SMP
+ *
+ * Input state:
+ * text_poke_array.vec: vector of instructions to patch
+ * text_poke_array.nr_entries: number of entries in the vector
+ *
+ * Modify multi-byte instructions by using INT3 breakpoints on SMP.
+ * We completely avoid using stop_machine() here, and achieve the
+ * synchronization using INT3 breakpoints and SMP cross-calls.
+ *
+ * The way it is done:
+ * - For each entry in the vector:
+ * - add an INT3 trap to the address that will be patched
+ * - SMP sync all CPUs
+ * - For each entry in the vector:
+ * - update all but the first byte of the patched range
+ * - SMP sync all CPUs
+ * - For each entry in the vector:
+ * - replace the first byte (INT3) by the first byte of the
+ * replacing opcode
+ * - SMP sync all CPUs
+ */
+void smp_text_poke_batch_finish(void)
+{
+ unsigned char int3 = INT3_INSN_OPCODE;
+ unsigned int i;
+ int do_sync;
+
+ if (!text_poke_array.nr_entries)
+ return;
+
+ lockdep_assert_held(&text_mutex);
+
+ /*
+ * Corresponds to the implicit memory barrier in try_get_text_poke_array() to
+ * ensure reading a non-zero refcount provides up to date text_poke_array data.
+ */
+ for_each_possible_cpu(i)
+ atomic_set_release(per_cpu_ptr(&text_poke_array_refs, i), 1);
+
+ /*
+ * Function tracing can enable thousands of places that need to be
+ * updated. This can take quite some time, and with full kernel debugging
+ * enabled, this could cause the softlockup watchdog to trigger.
+ * This function gets called every 256 entries added to be patched.
+ * Call cond_resched() here to make sure that other tasks can get scheduled
+ * while processing all the functions being patched.
+ */
+ cond_resched();
+
+ /*
+ * Corresponding read barrier in INT3 notifier for making sure the
+ * text_poke_array.nr_entries and handler are correctly ordered wrt. patching.
+ */
+ smp_wmb();
+
+ /*
+ * First step: add a INT3 trap to the address that will be patched.
+ */
+ for (i = 0; i < text_poke_array.nr_entries; i++) {
+ text_poke_array.vec[i].old = *(u8 *)text_poke_addr(&text_poke_array.vec[i]);
+ text_poke(text_poke_addr(&text_poke_array.vec[i]), &int3, INT3_INSN_SIZE);
+ }
+
+ smp_text_poke_sync_each_cpu();
+
+ /*
+ * Second step: update all but the first byte of the patched range.
+ */
+ for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) {
+ u8 old[TEXT_POKE_MAX_OPCODE_SIZE+1] = { text_poke_array.vec[i].old, };
+ u8 _new[TEXT_POKE_MAX_OPCODE_SIZE+1];
+ const u8 *new = text_poke_array.vec[i].text;
+ int len = text_poke_array.vec[i].len;
+
+ if (len - INT3_INSN_SIZE > 0) {
+ memcpy(old + INT3_INSN_SIZE,
+ text_poke_addr(&text_poke_array.vec[i]) + INT3_INSN_SIZE,
+ len - INT3_INSN_SIZE);
+
+ if (len == 6) {
+ _new[0] = 0x0f;
+ memcpy(_new + 1, new, 5);
+ new = _new;
+ }
+
+ text_poke(text_poke_addr(&text_poke_array.vec[i]) + INT3_INSN_SIZE,
+ new + INT3_INSN_SIZE,
+ len - INT3_INSN_SIZE);
+
+ do_sync++;
+ }
+
+ /*
+ * Emit a perf event to record the text poke, primarily to
+ * support Intel PT decoding which must walk the executable code
+ * to reconstruct the trace. The flow up to here is:
+ * - write INT3 byte
+ * - IPI-SYNC
+ * - write instruction tail
+ * At this point the actual control flow will be through the
+ * INT3 and handler and not hit the old or new instruction.
+ * Intel PT outputs FUP/TIP packets for the INT3, so the flow
+ * can still be decoded. Subsequently:
+ * - emit RECORD_TEXT_POKE with the new instruction
+ * - IPI-SYNC
+ * - write first byte
+ * - IPI-SYNC
+ * So before the text poke event timestamp, the decoder will see
+ * either the old instruction flow or FUP/TIP of INT3. After the
+ * text poke event timestamp, the decoder will see either the
+ * new instruction flow or FUP/TIP of INT3. Thus decoders can
+ * use the timestamp as the point at which to modify the
+ * executable code.
+ * The old instruction is recorded so that the event can be
+ * processed forwards or backwards.
+ */
+ perf_event_text_poke(text_poke_addr(&text_poke_array.vec[i]), old, len, new, len);
+ }
+
+ if (do_sync) {
+ /*
+ * According to Intel, this core syncing is very likely
+ * not necessary and we'd be safe even without it. But
+ * better safe than sorry (plus there's not only Intel).
+ */
+ smp_text_poke_sync_each_cpu();
+ }
+
+ /*
+ * Third step: replace the first byte (INT3) by the first byte of the
+ * replacing opcode.
+ */
+ for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) {
+ u8 byte = text_poke_array.vec[i].text[0];
+
+ if (text_poke_array.vec[i].len == 6)
+ byte = 0x0f;
+
+ if (byte == INT3_INSN_OPCODE)
+ continue;
+
+ text_poke(text_poke_addr(&text_poke_array.vec[i]), &byte, INT3_INSN_SIZE);
+ do_sync++;
+ }
+
+ if (do_sync)
+ smp_text_poke_sync_each_cpu();
+
+ /*
+ * Remove and wait for refs to be zero.
+ *
+ * Notably, if after step-3 above the INT3 got removed, then the
+ * smp_text_poke_sync_each_cpu() will have serialized against any running INT3
+ * handlers and the below spin-wait will not happen.
+ *
+ * IOW. unless the replacement instruction is INT3, this case goes
+ * unused.
+ */
+ for_each_possible_cpu(i) {
+ atomic_t *refs = per_cpu_ptr(&text_poke_array_refs, i);
+
+ if (unlikely(!atomic_dec_and_test(refs)))
+ atomic_cond_read_acquire(refs, !VAL);
+ }
+
+ /* They are all completed: */
+ text_poke_array.nr_entries = 0;
+}
+
+static void __smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate)
+{
+ struct smp_text_poke_loc *tpl;
+ struct insn insn;
+ int ret, i = 0;
+
+ tpl = &text_poke_array.vec[text_poke_array.nr_entries++];
+
+ if (len == 6)
+ i = 1;
+ memcpy((void *)tpl->text, opcode+i, len-i);
+ if (!emulate)
+ emulate = opcode;
+
+ ret = insn_decode_kernel(&insn, emulate);
+ BUG_ON(ret < 0);
+
+ tpl->rel_addr = addr - (void *)_stext;
+ tpl->len = len;
+ tpl->opcode = insn.opcode.bytes[0];
+
+ if (is_jcc32(&insn)) {
+ /*
+ * Map Jcc.d32 onto Jcc.d8 and use len to distinguish.
+ */
+ tpl->opcode = insn.opcode.bytes[1] - 0x10;
+ }
+
+ switch (tpl->opcode) {
+ case RET_INSN_OPCODE:
+ case JMP32_INSN_OPCODE:
+ case JMP8_INSN_OPCODE:
+ /*
+ * Control flow instructions without implied execution of the
+ * next instruction can be padded with INT3.
+ */
+ for (i = insn.length; i < len; i++)
+ BUG_ON(tpl->text[i] != INT3_INSN_OPCODE);
+ break;
+
+ default:
+ BUG_ON(len != insn.length);
+ }
+
+ switch (tpl->opcode) {
+ case INT3_INSN_OPCODE:
+ case RET_INSN_OPCODE:
+ break;
+
+ case CALL_INSN_OPCODE:
+ case JMP32_INSN_OPCODE:
+ case JMP8_INSN_OPCODE:
+ case 0x70 ... 0x7f: /* Jcc */
+ tpl->disp = insn.immediate.value;
+ break;
+
+ default: /* assume NOP */
+ switch (len) {
+ case 2: /* NOP2 -- emulate as JMP8+0 */
+ BUG_ON(memcmp(emulate, x86_nops[len], len));
+ tpl->opcode = JMP8_INSN_OPCODE;
+ tpl->disp = 0;
+ break;
+
+ case 5: /* NOP5 -- emulate as JMP32+0 */
+ BUG_ON(memcmp(emulate, x86_nops[len], len));
+ tpl->opcode = JMP32_INSN_OPCODE;
+ tpl->disp = 0;
+ break;
+
+ default: /* unknown instruction */
+ BUG();
+ }
+ break;
+ }
+}
+
+/*
+ * We strictly rely on text_poke_array.vec being ordered; ensure this is so by flushing
+ * early if needed.
+ */
+static bool text_poke_addr_ordered(void *addr)
+{
+ WARN_ON_ONCE(!addr);
+
+ if (!text_poke_array.nr_entries)
+ return true;
+
+ /*
+ * If the last current entry's address is higher than the
+ * new entry's address we'd like to add, then ordering
+ * is violated and we must first flush all pending patching
+ * requests:
+ */
+ if (text_poke_addr(text_poke_array.vec + text_poke_array.nr_entries-1) > addr)
+ return false;
+
+ return true;
+}
+
+/**
+ * smp_text_poke_batch_add() -- update instruction on live kernel on SMP, batched
+ * @addr: address to patch
+ * @opcode: opcode of new instruction
+ * @len: length to copy
+ * @emulate: instruction to be emulated
+ *
+ * Add a new instruction to the current queue of to-be-patched instructions
+ * the kernel maintains. The patching request will not be executed immediately,
+ * but becomes part of an array of patching requests, optimized for batched
+ * execution. All pending patching requests will be executed on the next
+ * smp_text_poke_batch_finish() call.
+ */
+void __ref smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate)
+{
+ if (text_poke_array.nr_entries == TEXT_POKE_ARRAY_MAX || !text_poke_addr_ordered(addr))
+ smp_text_poke_batch_finish();
+ __smp_text_poke_batch_add(addr, opcode, len, emulate);
+}
+
+/**
+ * smp_text_poke_single() -- update instruction on live kernel on SMP immediately
+ * @addr: address to patch
+ * @opcode: opcode of new instruction
+ * @len: length to copy
+ * @emulate: instruction to be emulated
+ *
+ * Update a single instruction with the vector in the stack, avoiding
+ * dynamically allocated memory. This function should be used when it is
+ * not possible to allocate memory for a vector. The single instruction
+ * is patched in immediately.
+ */
+void __ref smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate)
+{
+ smp_text_poke_batch_add(addr, opcode, len, emulate);
+ smp_text_poke_batch_finish();
+}
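/*
 * Editor's illustrative sketch, not part of this patch: the expected calling
 * pattern for the batching API above. Addresses and instruction bytes are
 * placeholders supplied by the caller.
 */
static void example_batch_patch(void *addr1, void *addr2,
				const void *insn1, const void *insn2, size_t len)
{
	mutex_lock(&text_mutex);

	/* Queue sites; the queue is flushed automatically when it fills up. */
	smp_text_poke_batch_add(addr1, insn1, len, NULL);
	smp_text_poke_batch_add(addr2, insn2, len, NULL);

	/* Apply the whole queue using the three-step INT3 protocol. */
	smp_text_poke_batch_finish();

	mutex_unlock(&text_mutex);
}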
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/amd_gart_64.c
index d2e56b8f48e7..e8000a56732e 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Dynamic DMA mapping support for AMD Hammer.
*
@@ -5,10 +6,9 @@
* This allows to use PCI devices that only support 32bit addresses on systems
* with more than 4GB.
*
- * See Documentation/PCI/PCI-DMA-mapping.txt for the interface specification.
+ * See Documentation/core-api/dma-api-howto.rst for the interface specification.
*
* Copyright 2002 Andi Kleen, SuSE Labs.
- * Subject to the GNU General Public License v2 only.
*/
#include <linux/types.h>
@@ -16,28 +16,31 @@
#include <linux/agp_backend.h>
#include <linux/init.h>
#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/sched/debug.h>
#include <linux/string.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
-#include <linux/module.h>
#include <linux/topology.h>
#include <linux/interrupt.h>
-#include <linux/bitops.h>
+#include <linux/bitmap.h>
#include <linux/kdebug.h>
#include <linux/scatterlist.h>
#include <linux/iommu-helper.h>
-#include <linux/sysdev.h>
+#include <linux/syscore_ops.h>
#include <linux/io.h>
-#include <asm/atomic.h>
+#include <linux/gfp.h>
+#include <linux/atomic.h>
+#include <linux/dma-direct.h>
+#include <linux/dma-map-ops.h>
#include <asm/mtrr.h>
-#include <asm/pgtable.h>
#include <asm/proto.h>
#include <asm/iommu.h>
#include <asm/gart.h>
-#include <asm/cacheflush.h>
-#include <asm/swiotlb.h>
+#include <asm/set_memory.h>
#include <asm/dma.h>
-#include <asm/k8.h>
+#include <asm/amd/nb.h>
+#include <asm/x86_init.h>
static unsigned long iommu_bus_base; /* GART remapping area (physical) */
static unsigned long iommu_size; /* size of remapping area bytes */
@@ -50,7 +53,7 @@ static u32 *iommu_gatt_base; /* Remapping table */
	 * of only flushing when a mapping is reused. With it true the GART is
* flushed for every mapping. Problem is that doing the lazy flush seems
* to trigger bugs with some popular PCI cards, in particular 3ware (but
- * has been also also seen with Qlogic at least).
+ * has been also seen with Qlogic at least).
*/
static int iommu_fullflush = 1;
@@ -67,14 +70,15 @@ static u32 gart_unmapped_entry;
(((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT)
#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28))
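/*
 * Editor's illustrative sketch, not part of this patch: a worked example of
 * the GPTE_ENCODE()/GPTE_DECODE() round trip above for a >4GB physical
 * address, keeping the GPTE_VALID/GPTE_COHERENT flag bits symbolic.
 *
 *   phys = 0x1_2345_6000
 *   GPTE_ENCODE(phys) = (phys & 0xfffff000)            ->   0x23456000
 *                     | ((phys >> 32) << 4)            -> | 0x00000010
 *                     | GPTE_VALID | GPTE_COHERENT
 *   GPTE_DECODE(gpte) = (gpte & 0xfffff000)            ->   0x23456000
 *                     | ((u64)(gpte & 0xff0) << 28)    -> | 0x100000000
 *                                                       =   0x123456000
 */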
-#define EMERGENCY_PAGES 32 /* = 128KB */
-
#ifdef CONFIG_AGP
#define AGPEXTERN extern
#else
#define AGPEXTERN
#endif
+/* GART can only remap to physical addresses < 1TB */
+#define GART_MAX_PHYS_ADDR (1ULL << 40)
+
/* backdoor interface to AGP driver */
AGPEXTERN int agp_memory_reserved;
AGPEXTERN __u32 *agp_gatt_table;
@@ -91,8 +95,7 @@ static unsigned long alloc_iommu(struct device *dev, int size,
base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev),
PAGE_SIZE) >> PAGE_SHIFT;
- boundary_size = ALIGN((unsigned long long)dma_get_seg_boundary(dev) + 1,
- PAGE_SIZE) >> PAGE_SHIFT;
+ boundary_size = dma_get_seg_boundary_nr_pages(dev, PAGE_SHIFT);
spin_lock_irqsave(&iommu_bitmap_lock, flags);
offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit,
@@ -122,7 +125,7 @@ static void free_iommu(unsigned long offset, int size)
unsigned long flags;
spin_lock_irqsave(&iommu_bitmap_lock, flags);
- iommu_area_free(iommu_gart_bitmap, offset, size);
+ bitmap_clear(iommu_gart_bitmap, offset, size);
if (offset >= next_bit)
next_bit = offset + size;
spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
@@ -137,7 +140,7 @@ static void flush_gart(void)
spin_lock_irqsave(&iommu_bitmap_lock, flags);
if (need_flush) {
- k8_flush_garts();
+ amd_flush_garts();
need_flush = false;
}
spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
@@ -145,9 +148,6 @@ static void flush_gart(void)
#ifdef CONFIG_IOMMU_LEAK
/* Debugging aid for drivers that don't free their IOMMU tables */
-static int leak_trace;
-static int iommu_leak_pages = 20;
-
static void dump_leak(void)
{
static int dump;
@@ -156,7 +156,7 @@ static void dump_leak(void)
return;
dump = 1;
- show_stack(NULL, NULL);
+ show_stack(NULL, NULL, KERN_ERR);
debug_dma_dump_mappings(NULL);
}
#endif
@@ -174,14 +174,6 @@ static void iommu_full(struct device *dev, size_t size, int dir)
*/
dev_err(dev, "PCI-DMA: Out of IOMMU space for %lu bytes\n", size);
-
- if (size > PAGE_SIZE*EMERGENCY_PAGES) {
- if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
- panic("PCI-DMA: Memory would be corrupted\n");
- if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL)
- panic(KERN_ERR
- "PCI-DMA: Random memory would be DMAed\n");
- }
#ifdef CONFIG_IOMMU_LEAK
dump_leak();
#endif
@@ -190,14 +182,13 @@ static void iommu_full(struct device *dev, size_t size, int dir)
static inline int
need_iommu(struct device *dev, unsigned long addr, size_t size)
{
- return force_iommu ||
- !is_buffer_dma_capable(*dev->dma_mask, addr, size);
+ return force_iommu || !dma_capable(dev, addr, size, true);
}
static inline int
nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
{
- return !is_buffer_dma_capable(*dev->dma_mask, addr, size);
+ return !dma_capable(dev, addr, size, true);
}
/* Map a single contiguous physical area into the IOMMU.
@@ -207,16 +198,20 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
size_t size, int dir, unsigned long align_mask)
{
unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE);
- unsigned long iommu_page = alloc_iommu(dev, npages, align_mask);
+ unsigned long iommu_page;
int i;
+ if (unlikely(phys_mem + size > GART_MAX_PHYS_ADDR))
+ return DMA_MAPPING_ERROR;
+
+ iommu_page = alloc_iommu(dev, npages, align_mask);
if (iommu_page == -1) {
if (!nonforced_iommu(dev, phys_mem, size))
return phys_mem;
if (panic_on_overflow)
panic("dma_map_area overflow %lu bytes\n", size);
iommu_full(dev, size, dir);
- return bad_dma_address;
+ return DMA_MAPPING_ERROR;
}
for (i = 0; i < npages; i++) {
@@ -227,16 +222,14 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
}
/* Map a single area into the IOMMU */
-static dma_addr_t gart_map_page(struct device *dev, struct page *page,
- unsigned long offset, size_t size,
- enum dma_data_direction dir,
- struct dma_attrs *attrs)
+static dma_addr_t gart_map_phys(struct device *dev, phys_addr_t paddr,
+ size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
{
unsigned long bus;
- phys_addr_t paddr = page_to_phys(page) + offset;
- if (!dev)
- dev = &x86_dma_fallback_dev;
+ if (unlikely(attrs & DMA_ATTR_MMIO))
+ return DMA_MAPPING_ERROR;
if (!need_iommu(dev, paddr, size))
return paddr;
@@ -250,15 +243,23 @@ static dma_addr_t gart_map_page(struct device *dev, struct page *page,
/*
* Free a DMA mapping.
*/
-static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr,
+static void gart_unmap_phys(struct device *dev, dma_addr_t dma_addr,
size_t size, enum dma_data_direction dir,
- struct dma_attrs *attrs)
+ unsigned long attrs)
{
unsigned long iommu_page;
int npages;
int i;
- if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE ||
+ if (WARN_ON_ONCE(dma_addr == DMA_MAPPING_ERROR))
+ return;
+
+ /*
+ * This driver will not always use a GART mapping, but might have
+ * created a direct mapping instead. If that is the case there is
+ * nothing to unmap here.
+ */
+ if (dma_addr < iommu_bus_base ||
dma_addr >= iommu_bus_base + iommu_size)
return;
@@ -274,7 +275,7 @@ static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr,
* Wrapper for pci_unmap_single working with scatterlists.
*/
static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
- enum dma_data_direction dir, struct dma_attrs *attrs)
+ enum dma_data_direction dir, unsigned long attrs)
{
struct scatterlist *s;
int i;
@@ -282,7 +283,7 @@ static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
for_each_sg(sg, s, nents, i) {
if (!s->dma_length || !s->length)
break;
- gart_unmap_page(dev, s->dma_address, s->dma_length, dir, NULL);
+ gart_unmap_phys(dev, s->dma_address, s->dma_length, dir, 0);
}
}
@@ -294,7 +295,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
int i;
#ifdef CONFIG_IOMMU_DEBUG
- printk(KERN_DEBUG "dma_map_sg overflow\n");
+ pr_debug("dma_map_sg overflow\n");
#endif
for_each_sg(sg, s, nents, i) {
@@ -302,9 +303,9 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
if (nonforced_iommu(dev, addr, s->length)) {
addr = dma_map_area(dev, addr, s->length, dir, 0);
- if (addr == bad_dma_address) {
+ if (addr == DMA_MAPPING_ERROR) {
if (i > 0)
- gart_unmap_sg(dev, sg, i, dir, NULL);
+ gart_unmap_sg(dev, sg, i, dir, 0);
nents = 0;
sg[0].dma_length = 0;
break;
@@ -329,7 +330,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
int i;
if (iommu_start == -1)
- return -1;
+ return -ENOMEM;
for_each_sg(start, s, nelems, i) {
unsigned long pages, addr;
@@ -375,26 +376,25 @@ dma_map_cont(struct device *dev, struct scatterlist *start, int nelems,
* Merge chunks that have page aligned sizes into a continuous mapping.
*/
static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
- enum dma_data_direction dir, struct dma_attrs *attrs)
+ enum dma_data_direction dir, unsigned long attrs)
{
struct scatterlist *s, *ps, *start_sg, *sgmap;
- int need = 0, nextneed, i, out, start;
+ int need = 0, nextneed, i, out, start, ret;
unsigned long pages = 0;
unsigned int seg_size;
unsigned int max_seg_size;
if (nents == 0)
- return 0;
+ return -EINVAL;
- if (!dev)
- dev = &x86_dma_fallback_dev;
+ out = 0;
+ start = 0;
+ start_sg = sg;
+ sgmap = sg;
+ seg_size = 0;
+ max_seg_size = dma_get_max_seg_size(dev);
+ ps = NULL; /* shut up gcc */
- out = 0;
- start = 0;
- start_sg = sgmap = sg;
- seg_size = 0;
- max_seg_size = dma_get_max_seg_size(dev);
- ps = NULL; /* shut up gcc */
for_each_sg(sg, s, nents, i) {
dma_addr_t addr = sg_phys(s);
@@ -413,15 +413,17 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
if (!iommu_merge || !nextneed || !need || s->offset ||
(s->length + seg_size > max_seg_size) ||
(ps->offset + ps->length) % PAGE_SIZE) {
- if (dma_map_cont(dev, start_sg, i - start,
- sgmap, pages, need) < 0)
+ ret = dma_map_cont(dev, start_sg, i - start,
+ sgmap, pages, need);
+ if (ret < 0)
goto error;
out++;
- seg_size = 0;
- sgmap = sg_next(sgmap);
- pages = 0;
- start = i;
- start_sg = s;
+
+ seg_size = 0;
+ sgmap = sg_next(sgmap);
+ pages = 0;
+ start = i;
+ start_sg = s;
}
}
@@ -430,7 +432,8 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
pages += iommu_num_pages(s->offset, s->length, PAGE_SIZE);
ps = s;
}
- if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0)
+ ret = dma_map_cont(dev, start_sg, i - start, sgmap, pages, need);
+ if (ret < 0)
goto error;
out++;
flush_gart();
@@ -442,7 +445,7 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
error:
flush_gart();
- gart_unmap_sg(dev, sg, out, dir, NULL);
+ gart_unmap_sg(dev, sg, out, dir, 0);
/* When it was forced or merged try again in a dumb way */
if (force_iommu || iommu_merge) {
@@ -454,49 +457,39 @@ error:
panic("dma_map_sg: overflow on %lu pages\n", pages);
iommu_full(dev, pages << PAGE_SHIFT, dir);
- for_each_sg(sg, s, nents, i)
- s->dma_address = bad_dma_address;
- return 0;
+ return ret;
}
/* allocate and map a coherent mapping */
static void *
gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
- gfp_t flag)
+ gfp_t flag, unsigned long attrs)
{
- dma_addr_t paddr;
- unsigned long align_mask;
- struct page *page;
-
- if (force_iommu && !(flag & GFP_DMA)) {
- flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
- page = alloc_pages(flag | __GFP_ZERO, get_order(size));
- if (!page)
- return NULL;
-
- align_mask = (1UL << get_order(size)) - 1;
- paddr = dma_map_area(dev, page_to_phys(page), size,
- DMA_BIDIRECTIONAL, align_mask);
-
- flush_gart();
- if (paddr != bad_dma_address) {
- *dma_addr = paddr;
- return page_address(page);
- }
- __free_pages(page, get_order(size));
- } else
- return dma_generic_alloc_coherent(dev, size, dma_addr, flag);
+ void *vaddr;
+ vaddr = dma_direct_alloc(dev, size, dma_addr, flag, attrs);
+ if (!vaddr ||
+ !force_iommu || dev->coherent_dma_mask <= DMA_BIT_MASK(24))
+ return vaddr;
+
+ *dma_addr = dma_map_area(dev, virt_to_phys(vaddr), size,
+ DMA_BIDIRECTIONAL, (1UL << get_order(size)) - 1);
+ flush_gart();
+ if (unlikely(*dma_addr == DMA_MAPPING_ERROR))
+ goto out_free;
+ return vaddr;
+out_free:
+ dma_direct_free(dev, size, vaddr, *dma_addr, attrs);
return NULL;
}
/* free a coherent mapping */
static void
gart_free_coherent(struct device *dev, size_t size, void *vaddr,
- dma_addr_t dma_addr)
+ dma_addr_t dma_addr, unsigned long attrs)
{
- gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, NULL);
- free_pages((unsigned long)vaddr, get_order(size));
+ gart_unmap_phys(dev, dma_addr, size, DMA_BIDIRECTIONAL, 0);
+ dma_direct_free(dev, size, vaddr, dma_addr, attrs);
}
static int no_agp;
@@ -512,13 +505,12 @@ static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
}
a = aper + iommu_size;
- iommu_size -= round_up(a, PMD_PAGE_SIZE) - a;
+ iommu_size -= round_up(a, PMD_SIZE) - a;
if (iommu_size < 64*1024*1024) {
- printk(KERN_WARNING
- "PCI-DMA: Warning: Small IOMMU %luMB."
+ pr_warn("PCI-DMA: Warning: Small IOMMU %luMB."
" Consider increasing the AGP aperture in BIOS\n",
- iommu_size >> 20);
+ iommu_size >> 20);
}
return iommu_size;
@@ -548,11 +540,17 @@ static void enable_gart_translations(void)
{
int i;
- for (i = 0; i < num_k8_northbridges; i++) {
- struct pci_dev *dev = k8_northbridges[i];
+ if (!amd_nb_has_feature(AMD_NB_GART))
+ return;
+
+ for (i = 0; i < amd_nb_num(); i++) {
+ struct pci_dev *dev = node_to_amd_nb(i)->misc;
enable_gart_translation(dev, __pa(agp_gatt_table));
}
+
+ /* Flush the GART-TLB to remove stale entries */
+ amd_flush_garts();
}
/*
@@ -570,68 +568,66 @@ void set_up_gart_resume(u32 aper_order, u32 aper_alloc)
aperture_alloc = aper_alloc;
}
-static int gart_resume(struct sys_device *dev)
+static void gart_fixup_northbridges(void)
{
- printk(KERN_INFO "PCI-DMA: Resuming GART IOMMU\n");
-
- if (fix_up_north_bridges) {
- int i;
+ int i;
- printk(KERN_INFO "PCI-DMA: Restoring GART aperture settings\n");
+ if (!fix_up_north_bridges)
+ return;
- for (i = 0; i < num_k8_northbridges; i++) {
- struct pci_dev *dev = k8_northbridges[i];
+ if (!amd_nb_has_feature(AMD_NB_GART))
+ return;
- /*
- * Don't enable translations just yet. That is the next
- * step. Restore the pre-suspend aperture settings.
- */
- pci_write_config_dword(dev, AMD64_GARTAPERTURECTL,
- aperture_order << 1);
- pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE,
- aperture_alloc >> 25);
- }
- }
+ pr_info("PCI-DMA: Restoring GART aperture settings\n");
- enable_gart_translations();
+ for (i = 0; i < amd_nb_num(); i++) {
+ struct pci_dev *dev = node_to_amd_nb(i)->misc;
- return 0;
+ /*
+ * Don't enable translations just yet. That is the next
+ * step. Restore the pre-suspend aperture settings.
+ */
+ gart_set_size_and_enable(dev, aperture_order);
+ pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25);
+ }
}
-static int gart_suspend(struct sys_device *dev, pm_message_t state)
+static void gart_resume(void *data)
{
- return 0;
+ pr_info("PCI-DMA: Resuming GART IOMMU\n");
+
+ gart_fixup_northbridges();
+
+ enable_gart_translations();
}
-static struct sysdev_class gart_sysdev_class = {
- .name = "gart",
- .suspend = gart_suspend,
- .resume = gart_resume,
+static const struct syscore_ops gart_syscore_ops = {
+ .resume = gart_resume,
};
-static struct sys_device device_gart = {
- .id = 0,
- .cls = &gart_sysdev_class,
+static struct syscore gart_syscore = {
+ .ops = &gart_syscore_ops,
};
/*
* Private Northbridge GATT initialization in case we cannot use the
* AGP driver for some reason.
*/
-static __init int init_k8_gatt(struct agp_kern_info *info)
+static __init int init_amd_gatt(struct agp_kern_info *info)
{
unsigned aper_size, gatt_size, new_aper_size;
unsigned aper_base, new_aper_base;
struct pci_dev *dev;
void *gatt;
- int i, error;
+ int i;
+
+ pr_info("PCI-DMA: Disabling AGP.\n");
- printk(KERN_INFO "PCI-DMA: Disabling AGP.\n");
aper_size = aper_base = info->aper_size = 0;
dev = NULL;
- for (i = 0; i < num_k8_northbridges; i++) {
- dev = k8_northbridges[i];
+ for (i = 0; i < amd_nb_num(); i++) {
+ dev = node_to_amd_nb(i)->misc;
new_aper_base = read_aperture(dev, &new_aper_size);
if (!new_aper_base)
goto nommu;
@@ -645,6 +641,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
}
if (!aper_base)
goto nommu;
+
info->aper_base = aper_base;
info->aper_size = aper_size >> 20;
@@ -658,48 +655,52 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
agp_gatt_table = gatt;
- error = sysdev_class_register(&gart_sysdev_class);
- if (!error)
- error = sysdev_register(&device_gart);
- if (error)
- panic("Could not register gart_sysdev -- "
- "would corrupt data on next suspend");
+ register_syscore(&gart_syscore);
flush_gart();
- printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n",
+ pr_info("PCI-DMA: aperture base @ %x size %u KB\n",
aper_base, aper_size>>10);
return 0;
nommu:
/* Should not happen anymore */
- printk(KERN_WARNING "PCI-DMA: More than 4GB of RAM and no IOMMU\n"
- "falling back to iommu=soft.\n");
+ pr_warn("PCI-DMA: More than 4GB of RAM and no IOMMU - falling back to iommu=soft.\n");
return -1;
}
-static struct dma_map_ops gart_dma_ops = {
+static const struct dma_map_ops gart_dma_ops = {
.map_sg = gart_map_sg,
.unmap_sg = gart_unmap_sg,
- .map_page = gart_map_page,
- .unmap_page = gart_unmap_page,
- .alloc_coherent = gart_alloc_coherent,
- .free_coherent = gart_free_coherent,
+ .map_phys = gart_map_phys,
+ .unmap_phys = gart_unmap_phys,
+ .alloc = gart_alloc_coherent,
+ .free = gart_free_coherent,
+ .mmap = dma_common_mmap,
+ .get_sgtable = dma_common_get_sgtable,
+ .dma_supported = dma_direct_supported,
+ .get_required_mask = dma_direct_get_required_mask,
+ .alloc_pages_op = dma_direct_alloc_pages,
+ .free_pages = dma_direct_free_pages,
};
-void gart_iommu_shutdown(void)
+static void gart_iommu_shutdown(void)
{
struct pci_dev *dev;
int i;
- if (no_agp && (dma_ops != &gart_dma_ops))
+	/* don't shut it down if there is AGP installed */
+ if (!no_agp)
+ return;
+
+ if (!amd_nb_has_feature(AMD_NB_GART))
return;
- for (i = 0; i < num_k8_northbridges; i++) {
+ for (i = 0; i < amd_nb_num(); i++) {
u32 ctl;
- dev = k8_northbridges[i];
+ dev = node_to_amd_nb(i)->misc;
pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
ctl &= ~GARTEN;
@@ -708,57 +709,49 @@ void gart_iommu_shutdown(void)
}
}
-void __init gart_iommu_init(void)
+int __init gart_iommu_init(void)
{
struct agp_kern_info info;
unsigned long iommu_start;
unsigned long aper_base, aper_size;
unsigned long start_pfn, end_pfn;
unsigned long scratch;
- long i;
- if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0)
- return;
+ if (!amd_nb_has_feature(AMD_NB_GART))
+ return 0;
#ifndef CONFIG_AGP_AMD64
no_agp = 1;
#else
/* Makefile puts PCI initialization via subsys_initcall first. */
- /* Add other K8 AGP bridge drivers here */
+ /* Add other AMD AGP bridge drivers here */
no_agp = no_agp ||
(agp_amd64_init() < 0) ||
(agp_copy_info(agp_bridge, &info) < 0);
#endif
- if (swiotlb)
- return;
-
- /* Did we detect a different HW IOMMU? */
- if (iommu_detected && !gart_iommu_aperture)
- return;
-
if (no_iommu ||
(!force_iommu && max_pfn <= MAX_DMA32_PFN) ||
!gart_iommu_aperture ||
- (no_agp && init_k8_gatt(&info) < 0)) {
+ (no_agp && init_amd_gatt(&info) < 0)) {
if (max_pfn > MAX_DMA32_PFN) {
- printk(KERN_WARNING "More than 4GB of memory "
- "but GART IOMMU not available.\n");
- printk(KERN_WARNING "falling back to iommu=soft.\n");
+ pr_warn("More than 4GB of memory but GART IOMMU not available.\n");
+ pr_warn("falling back to iommu=soft.\n");
}
- return;
+ return 0;
}
/* need to map that range */
- aper_size = info.aper_size << 20;
- aper_base = info.aper_base;
- end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
- if (end_pfn > max_low_pfn_mapped) {
- start_pfn = (aper_base>>PAGE_SHIFT);
- init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
- }
+ aper_size = info.aper_size << 20;
+ aper_base = info.aper_base;
+ end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
- printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n");
+ start_pfn = PFN_DOWN(aper_base);
+ if (!pfn_range_is_mapped(start_pfn, end_pfn))
+ init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT,
+ PAGE_KERNEL);
+
+ pr_info("PCI-DMA: using GART IOMMU.\n");
iommu_size = check_iommu_size(info.aper_base, aper_size);
iommu_pages = iommu_size >> PAGE_SHIFT;
@@ -767,32 +760,13 @@ void __init gart_iommu_init(void)
if (!iommu_gart_bitmap)
panic("Cannot allocate iommu bitmap\n");
-#ifdef CONFIG_IOMMU_LEAK
- if (leak_trace) {
- int ret;
-
- ret = dma_debug_resize_entries(iommu_pages);
- if (ret)
- printk(KERN_DEBUG
- "PCI-DMA: Cannot trace all the entries\n");
- }
-#endif
-
- /*
- * Out of IOMMU space handling.
- * Reserve some invalid pages at the beginning of the GART.
- */
- iommu_area_reserve(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
-
- agp_memory_reserved = iommu_size;
- printk(KERN_INFO
- "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
+ pr_info("PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
iommu_size >> 20);
- iommu_start = aper_size - iommu_size;
- iommu_bus_base = info.aper_base + iommu_start;
- bad_dma_address = iommu_bus_base;
- iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT);
+ agp_memory_reserved = iommu_size;
+ iommu_start = aper_size - iommu_size;
+ iommu_bus_base = info.aper_base + iommu_start;
+ iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT);
/*
* Unmap the IOMMU part of the GART. The alias of the page is
@@ -807,14 +781,14 @@ void __init gart_iommu_init(void)
iommu_size >> PAGE_SHIFT);
/*
* Tricky. The GART table remaps the physical memory range,
- * so the CPU wont notice potential aliases and if the memory
+ * so the CPU won't notice potential aliases and if the memory
* is remapped to UC later on, we might surprise the PCI devices
	 * with a stray writeout of a cacheline. So play it safe and
* do an explicit, full-scale wbinvd() _after_ having marked all
* the pages as Not-Present:
*/
wbinvd();
-
+
/*
* Now all caches are flushed and we can safely enable
* GART hardware. Doing it early leaves the possibility
@@ -833,30 +807,22 @@ void __init gart_iommu_init(void)
if (!scratch)
panic("Cannot allocate iommu scratch page");
gart_unmapped_entry = GPTE_ENCODE(__pa(scratch));
- for (i = EMERGENCY_PAGES; i < iommu_pages; i++)
- iommu_gatt_base[i] = gart_unmapped_entry;
flush_gart();
dma_ops = &gart_dma_ops;
+ x86_platform.iommu_shutdown = gart_iommu_shutdown;
+ x86_swiotlb_enable = false;
+
+ return 0;
}
void __init gart_parse_options(char *p)
{
int arg;
-#ifdef CONFIG_IOMMU_LEAK
- if (!strncmp(p, "leak", 4)) {
- leak_trace = 1;
- p += 4;
- if (*p == '=')
- ++p;
- if (isdigit(*p) && get_option(&p, &arg))
- iommu_leak_pages = arg;
- }
-#endif
if (isdigit(*p) && get_option(&p, &arg))
iommu_size = arg;
- if (!strncmp(p, "fullflush", 8))
+ if (!strncmp(p, "fullflush", 9))
iommu_fullflush = 1;
if (!strncmp(p, "nofullflush", 11))
iommu_fullflush = 0;
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
deleted file mode 100644
index 6c99f5037801..000000000000
--- a/arch/x86/kernel/amd_iommu.c
+++ /dev/null
@@ -1,2193 +0,0 @@
-/*
- * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
- * Author: Joerg Roedel <joerg.roedel@amd.com>
- * Leo Duran <leo.duran@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#include <linux/pci.h>
-#include <linux/gfp.h>
-#include <linux/bitops.h>
-#include <linux/debugfs.h>
-#include <linux/scatterlist.h>
-#include <linux/dma-mapping.h>
-#include <linux/iommu-helper.h>
-#include <linux/iommu.h>
-#include <asm/proto.h>
-#include <asm/iommu.h>
-#include <asm/gart.h>
-#include <asm/amd_iommu_types.h>
-#include <asm/amd_iommu.h>
-
-#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
-
-#define EXIT_LOOP_COUNT 10000000
-
-static DEFINE_RWLOCK(amd_iommu_devtable_lock);
-
-/* A list of preallocated protection domains */
-static LIST_HEAD(iommu_pd_list);
-static DEFINE_SPINLOCK(iommu_pd_list_lock);
-
-#ifdef CONFIG_IOMMU_API
-static struct iommu_ops amd_iommu_ops;
-#endif
-
-/*
- * general struct to manage commands send to an IOMMU
- */
-struct iommu_cmd {
- u32 data[4];
-};
-
-static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
- struct unity_map_entry *e);
-static struct dma_ops_domain *find_protection_domain(u16 devid);
-static u64* alloc_pte(struct protection_domain *dom,
- unsigned long address, u64
- **pte_page, gfp_t gfp);
-static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
- unsigned long start_page,
- unsigned int pages);
-
-#ifndef BUS_NOTIFY_UNBOUND_DRIVER
-#define BUS_NOTIFY_UNBOUND_DRIVER 0x0005
-#endif
-
-#ifdef CONFIG_AMD_IOMMU_STATS
-
-/*
- * Initialization code for statistics collection
- */
-
-DECLARE_STATS_COUNTER(compl_wait);
-DECLARE_STATS_COUNTER(cnt_map_single);
-DECLARE_STATS_COUNTER(cnt_unmap_single);
-DECLARE_STATS_COUNTER(cnt_map_sg);
-DECLARE_STATS_COUNTER(cnt_unmap_sg);
-DECLARE_STATS_COUNTER(cnt_alloc_coherent);
-DECLARE_STATS_COUNTER(cnt_free_coherent);
-DECLARE_STATS_COUNTER(cross_page);
-DECLARE_STATS_COUNTER(domain_flush_single);
-DECLARE_STATS_COUNTER(domain_flush_all);
-DECLARE_STATS_COUNTER(alloced_io_mem);
-DECLARE_STATS_COUNTER(total_map_requests);
-
-static struct dentry *stats_dir;
-static struct dentry *de_isolate;
-static struct dentry *de_fflush;
-
-static void amd_iommu_stats_add(struct __iommu_counter *cnt)
-{
- if (stats_dir == NULL)
- return;
-
- cnt->dent = debugfs_create_u64(cnt->name, 0444, stats_dir,
- &cnt->value);
-}
-
-static void amd_iommu_stats_init(void)
-{
- stats_dir = debugfs_create_dir("amd-iommu", NULL);
- if (stats_dir == NULL)
- return;
-
- de_isolate = debugfs_create_bool("isolation", 0444, stats_dir,
- (u32 *)&amd_iommu_isolate);
-
- de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir,
- (u32 *)&amd_iommu_unmap_flush);
-
- amd_iommu_stats_add(&compl_wait);
- amd_iommu_stats_add(&cnt_map_single);
- amd_iommu_stats_add(&cnt_unmap_single);
- amd_iommu_stats_add(&cnt_map_sg);
- amd_iommu_stats_add(&cnt_unmap_sg);
- amd_iommu_stats_add(&cnt_alloc_coherent);
- amd_iommu_stats_add(&cnt_free_coherent);
- amd_iommu_stats_add(&cross_page);
- amd_iommu_stats_add(&domain_flush_single);
- amd_iommu_stats_add(&domain_flush_all);
- amd_iommu_stats_add(&alloced_io_mem);
- amd_iommu_stats_add(&total_map_requests);
-}
-
-#endif
-
-/* returns !0 if the IOMMU is caching non-present entries in its TLB */
-static int iommu_has_npcache(struct amd_iommu *iommu)
-{
- return iommu->cap & (1UL << IOMMU_CAP_NPCACHE);
-}
-
-/****************************************************************************
- *
- * Interrupt handling functions
- *
- ****************************************************************************/
-
-static void iommu_print_event(void *__evt)
-{
- u32 *event = __evt;
- int type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK;
- int devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
- int domid = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK;
- int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
- u64 address = (u64)(((u64)event[3]) << 32) | event[2];
-
- printk(KERN_ERR "AMD IOMMU: Event logged [");
-
- switch (type) {
- case EVENT_TYPE_ILL_DEV:
- printk("ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x "
- "address=0x%016llx flags=0x%04x]\n",
- PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
- address, flags);
- break;
- case EVENT_TYPE_IO_FAULT:
- printk("IO_PAGE_FAULT device=%02x:%02x.%x "
- "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
- PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
- domid, address, flags);
- break;
- case EVENT_TYPE_DEV_TAB_ERR:
- printk("DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
- "address=0x%016llx flags=0x%04x]\n",
- PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
- address, flags);
- break;
- case EVENT_TYPE_PAGE_TAB_ERR:
- printk("PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
- "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
- PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
- domid, address, flags);
- break;
- case EVENT_TYPE_ILL_CMD:
- printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
- break;
- case EVENT_TYPE_CMD_HARD_ERR:
- printk("COMMAND_HARDWARE_ERROR address=0x%016llx "
- "flags=0x%04x]\n", address, flags);
- break;
- case EVENT_TYPE_IOTLB_INV_TO:
- printk("IOTLB_INV_TIMEOUT device=%02x:%02x.%x "
- "address=0x%016llx]\n",
- PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
- address);
- break;
- case EVENT_TYPE_INV_DEV_REQ:
- printk("INVALID_DEVICE_REQUEST device=%02x:%02x.%x "
- "address=0x%016llx flags=0x%04x]\n",
- PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
- address, flags);
- break;
- default:
- printk(KERN_ERR "UNKNOWN type=0x%02x]\n", type);
- }
-}
-
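-/* Drain all pending entries from the event log ring buffer and print them */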
-static void iommu_poll_events(struct amd_iommu *iommu)
-{
- u32 head, tail;
- unsigned long flags;
-
- spin_lock_irqsave(&iommu->lock, flags);
-
- head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
- tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
-
- while (head != tail) {
- iommu_print_event(iommu->evt_buf + head);
- head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size;
- }
-
- writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
-
- spin_unlock_irqrestore(&iommu->lock, flags);
-}
-
-irqreturn_t amd_iommu_int_handler(int irq, void *data)
-{
- struct amd_iommu *iommu;
-
- for_each_iommu(iommu)
- iommu_poll_events(iommu);
-
- return IRQ_HANDLED;
-}
-
-/****************************************************************************
- *
- * IOMMU command queuing functions
- *
- ****************************************************************************/
-
-/*
- * Writes the command to the IOMMU's command buffer and informs the
- * hardware about the new command. Must be called with iommu->lock held.
- */
-static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
-{
- u32 tail, head;
- u8 *target;
-
- tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
- target = iommu->cmd_buf + tail;
- memcpy_toio(target, cmd, sizeof(*cmd));
- tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
- head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
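- /* The command buffer is full if the new tail would catch up with the head */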
- if (tail == head)
- return -ENOMEM;
- writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
-
- return 0;
-}
-
-/*
- * General queuing function for commands. Takes iommu->lock and calls
- * __iommu_queue_command().
- */
-static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
-{
- unsigned long flags;
- int ret;
-
- spin_lock_irqsave(&iommu->lock, flags);
- ret = __iommu_queue_command(iommu, cmd);
- if (!ret)
- iommu->need_sync = true;
- spin_unlock_irqrestore(&iommu->lock, flags);
-
- return ret;
-}
-
-/*
- * This function waits until an IOMMU has completed a completion
- * wait command
- */
-static void __iommu_wait_for_completion(struct amd_iommu *iommu)
-{
- int ready = 0;
- unsigned status = 0;
- unsigned long i = 0;
-
- INC_STATS_COUNTER(compl_wait);
-
- while (!ready && (i < EXIT_LOOP_COUNT)) {
- ++i;
- /* wait for the bit to become one */
- status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
- ready = status & MMIO_STATUS_COM_WAIT_INT_MASK;
- }
-
- /* set bit back to zero */
- status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
- writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
-
- if (unlikely(i == EXIT_LOOP_COUNT))
- panic("AMD IOMMU: Completion wait loop failed\n");
-}
-
-/*
- * This function queues a completion wait command into the command
- * buffer of an IOMMU
- */
-static int __iommu_completion_wait(struct amd_iommu *iommu)
-{
- struct iommu_cmd cmd;
-
- memset(&cmd, 0, sizeof(cmd));
- cmd.data[0] = CMD_COMPL_WAIT_INT_MASK;
- CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
-
- return __iommu_queue_command(iommu, &cmd);
-}
-
-/*
- * This function is called whenever we need to ensure that the IOMMU has
- * completed execution of all commands we sent. It sends a
- * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs
- * us about that by writing a value to a physical address we pass with
- * the command.
- */
-static int iommu_completion_wait(struct amd_iommu *iommu)
-{
- int ret = 0;
- unsigned long flags;
-
- spin_lock_irqsave(&iommu->lock, flags);
-
- if (!iommu->need_sync)
- goto out;
-
- ret = __iommu_completion_wait(iommu);
-
- iommu->need_sync = false;
-
- if (ret)
- goto out;
-
- __iommu_wait_for_completion(iommu);
-
-out:
- spin_unlock_irqrestore(&iommu->lock, flags);
-
- return 0;
-}
-
-/*
- * Command send function for invalidating a device table entry
- */
-static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
-{
- struct iommu_cmd cmd;
- int ret;
-
- BUG_ON(iommu == NULL);
-
- memset(&cmd, 0, sizeof(cmd));
- CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY);
- cmd.data[0] = devid;
-
- ret = iommu_queue_command(iommu, &cmd);
-
- return ret;
-}
-
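-/* Build an INVALIDATE_IOMMU_PAGES command for the given domain and address */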
-static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
- u16 domid, int pde, int s)
-{
- memset(cmd, 0, sizeof(*cmd));
- address &= PAGE_MASK;
- CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
- cmd->data[1] |= domid;
- cmd->data[2] = lower_32_bits(address);
- cmd->data[3] = upper_32_bits(address);
- if (s) /* size bit - we flush more than one 4kb page */
- cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
- if (pde) /* PDE bit - we want to flush everything, not only the PTEs */
- cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
-}
-
-/*
- * Generic command send function for invalidating TLB entries
- */
-static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
- u64 address, u16 domid, int pde, int s)
-{
- struct iommu_cmd cmd;
- int ret;
-
- __iommu_build_inv_iommu_pages(&cmd, address, domid, pde, s);
-
- ret = iommu_queue_command(iommu, &cmd);
-
- return ret;
-}
-
-/*
- * TLB invalidation function which is called from the mapping functions.
- * It invalidates a single PTE if the range to flush is within a single
- * page. Otherwise it flushes the whole TLB of the IOMMU.
- */
-static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
- u64 address, size_t size)
-{
- int s = 0;
- unsigned pages = iommu_num_pages(address, size, PAGE_SIZE);
-
- address &= PAGE_MASK;
-
- if (pages > 1) {
- /*
- * If we have to flush more than one page, flush all
- * TLB entries for this domain
- */
- address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
- s = 1;
- }
-
- iommu_queue_inv_iommu_pages(iommu, address, domid, 0, s);
-
- return 0;
-}
-
-/* Flush the whole IO/TLB for a given protection domain */
-static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid)
-{
- u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
-
- INC_STATS_COUNTER(domain_flush_single);
-
- iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1);
-}
-
-/* Flush the whole IO/TLB for a given protection domain - including PDE */
-static void iommu_flush_tlb_pde(struct amd_iommu *iommu, u16 domid)
-{
- u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
-
- INC_STATS_COUNTER(domain_flush_single);
-
- iommu_queue_inv_iommu_pages(iommu, address, domid, 1, 1);
-}
-
-/*
- * This function is used to flush the IO/TLB for a given protection domain
- * on every IOMMU in the system
- */
-static void iommu_flush_domain(u16 domid)
-{
- unsigned long flags;
- struct amd_iommu *iommu;
- struct iommu_cmd cmd;
-
- INC_STATS_COUNTER(domain_flush_all);
-
- __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
- domid, 1, 1);
-
- for_each_iommu(iommu) {
- spin_lock_irqsave(&iommu->lock, flags);
- __iommu_queue_command(iommu, &cmd);
- __iommu_completion_wait(iommu);
- __iommu_wait_for_completion(iommu);
- spin_unlock_irqrestore(&iommu->lock, flags);
- }
-}
-
-void amd_iommu_flush_all_domains(void)
-{
- int i;
-
- for (i = 1; i < MAX_DOMAIN_ID; ++i) {
- if (!test_bit(i, amd_iommu_pd_alloc_bitmap))
- continue;
- iommu_flush_domain(i);
- }
-}
-
-void amd_iommu_flush_all_devices(void)
-{
- struct amd_iommu *iommu;
- int i;
-
- for (i = 0; i <= amd_iommu_last_bdf; ++i) {
- if (amd_iommu_pd_table[i] == NULL)
- continue;
-
- iommu = amd_iommu_rlookup_table[i];
- if (!iommu)
- continue;
-
- iommu_queue_inv_dev_entry(iommu, i);
- iommu_completion_wait(iommu);
- }
-}
-
-/****************************************************************************
- *
- * The functions below are used to create the page table mappings for
- * unity mapped regions.
- *
- ****************************************************************************/
-
-/*
- * Generic mapping function. It maps a physical address into a DMA
- * address space. It allocates the page table pages if necessary.
- * In the future it can be extended to a generic mapping function
- * supporting all features of AMD IOMMU page tables like level skipping
- * and full 64 bit address spaces.
- */
-static int iommu_map_page(struct protection_domain *dom,
- unsigned long bus_addr,
- unsigned long phys_addr,
- int prot)
-{
- u64 __pte, *pte;
-
- bus_addr = PAGE_ALIGN(bus_addr);
- phys_addr = PAGE_ALIGN(phys_addr);
-
- /* only support 512GB address spaces for now */
- if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK))
- return -EINVAL;
-
- pte = alloc_pte(dom, bus_addr, NULL, GFP_KERNEL);
-
- if (IOMMU_PTE_PRESENT(*pte))
- return -EBUSY;
-
- __pte = phys_addr | IOMMU_PTE_P;
- if (prot & IOMMU_PROT_IR)
- __pte |= IOMMU_PTE_IR;
- if (prot & IOMMU_PROT_IW)
- __pte |= IOMMU_PTE_IW;
-
- *pte = __pte;
-
- return 0;
-}
-
-static void iommu_unmap_page(struct protection_domain *dom,
- unsigned long bus_addr)
-{
- u64 *pte;
-
- pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)];
-
- if (!IOMMU_PTE_PRESENT(*pte))
- return;
-
- pte = IOMMU_PTE_PAGE(*pte);
- pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
-
- if (!IOMMU_PTE_PRESENT(*pte))
- return;
-
- pte = IOMMU_PTE_PAGE(*pte);
- pte = &pte[IOMMU_PTE_L0_INDEX(bus_addr)];
-
- *pte = 0;
-}
-
-/*
- * This function checks if a specific unity mapping entry is needed for
- * this specific IOMMU.
- */
-static int iommu_for_unity_map(struct amd_iommu *iommu,
- struct unity_map_entry *entry)
-{
- u16 bdf, i;
-
- for (i = entry->devid_start; i <= entry->devid_end; ++i) {
- bdf = amd_iommu_alias_table[i];
- if (amd_iommu_rlookup_table[bdf] == iommu)
- return 1;
- }
-
- return 0;
-}
-
-/*
- * Init the unity mappings for a specific IOMMU in the system
- *
- * Basically iterates over all unity mapping entries and applies them to
- * the default DMA domain of that IOMMU if necessary.
- */
-static int iommu_init_unity_mappings(struct amd_iommu *iommu)
-{
- struct unity_map_entry *entry;
- int ret;
-
- list_for_each_entry(entry, &amd_iommu_unity_map, list) {
- if (!iommu_for_unity_map(iommu, entry))
- continue;
- ret = dma_ops_unity_map(iommu->default_dom, entry);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-/*
- * This function actually applies the mapping to the page table of the
- * dma_ops domain.
- */
-static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
- struct unity_map_entry *e)
-{
- u64 addr;
- int ret;
-
- for (addr = e->address_start; addr < e->address_end;
- addr += PAGE_SIZE) {
- ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot);
- if (ret)
- return ret;
- /*
- * if unity mapping is in aperture range mark the page
- * as allocated in the aperture
- */
- if (addr < dma_dom->aperture_size)
- __set_bit(addr >> PAGE_SHIFT,
- dma_dom->aperture[0]->bitmap);
- }
-
- return 0;
-}
-
-/*
- * Inits the unity mappings required for a specific device
- */
-static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
- u16 devid)
-{
- struct unity_map_entry *e;
- int ret;
-
- list_for_each_entry(e, &amd_iommu_unity_map, list) {
- if (!(devid >= e->devid_start && devid <= e->devid_end))
- continue;
- ret = dma_ops_unity_map(dma_dom, e);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-/****************************************************************************
- *
- * The next functions belong to the address allocator for the dma_ops
- * interface functions. They work like the allocators in the other IOMMU
- * drivers. It's basically a bitmap which marks the allocated pages in
- * the aperture. Maybe it could be enhanced in the future to a more
- * efficient allocator.
- *
- ****************************************************************************/
-
-/*
- * The address allocator core functions.
- *
- * called with domain->lock held
- */
-
-/*
- * This function checks if there is a PTE for a given dma address. If
- * there is one, it returns the pointer to it.
- */
-static u64* fetch_pte(struct protection_domain *domain,
- unsigned long address)
-{
- u64 *pte;
-
- pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(address)];
-
- if (!IOMMU_PTE_PRESENT(*pte))
- return NULL;
-
- pte = IOMMU_PTE_PAGE(*pte);
- pte = &pte[IOMMU_PTE_L1_INDEX(address)];
-
- if (!IOMMU_PTE_PRESENT(*pte))
- return NULL;
-
- pte = IOMMU_PTE_PAGE(*pte);
- pte = &pte[IOMMU_PTE_L0_INDEX(address)];
-
- return pte;
-}
-
-/*
- * This function is used to add a new aperture range to an existing
- * aperture in case of dma_ops domain allocation or address allocation
- * failure.
- */
-static int alloc_new_range(struct amd_iommu *iommu,
- struct dma_ops_domain *dma_dom,
- bool populate, gfp_t gfp)
-{
- int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
- int i;
-
-#ifdef CONFIG_IOMMU_STRESS
- populate = false;
-#endif
-
- if (index >= APERTURE_MAX_RANGES)
- return -ENOMEM;
-
- dma_dom->aperture[index] = kzalloc(sizeof(struct aperture_range), gfp);
- if (!dma_dom->aperture[index])
- return -ENOMEM;
-
- dma_dom->aperture[index]->bitmap = (void *)get_zeroed_page(gfp);
- if (!dma_dom->aperture[index]->bitmap)
- goto out_free;
-
- dma_dom->aperture[index]->offset = dma_dom->aperture_size;
-
- if (populate) {
- unsigned long address = dma_dom->aperture_size;
- int i, num_ptes = APERTURE_RANGE_PAGES / 512;
- u64 *pte, *pte_page;
-
- for (i = 0; i < num_ptes; ++i) {
- pte = alloc_pte(&dma_dom->domain, address,
- &pte_page, gfp);
- if (!pte)
- goto out_free;
-
- dma_dom->aperture[index]->pte_pages[i] = pte_page;
-
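- /* Each last-level page table page covers 512 aperture pages */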
- address += APERTURE_RANGE_SIZE / 64;
- }
- }
-
- dma_dom->aperture_size += APERTURE_RANGE_SIZE;
-
- /* Initialize the exclusion range if necessary */
- if (iommu->exclusion_start &&
- iommu->exclusion_start >= dma_dom->aperture[index]->offset &&
- iommu->exclusion_start < dma_dom->aperture_size) {
- unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
- int pages = iommu_num_pages(iommu->exclusion_start,
- iommu->exclusion_length,
- PAGE_SIZE);
- dma_ops_reserve_addresses(dma_dom, startpage, pages);
- }
-
- /*
- * Check for areas already mapped as present in the new aperture
- * range and mark those pages as reserved in the allocator. Such
- * mappings may already exist as a result of requested unity
- * mappings for devices.
- */
- for (i = dma_dom->aperture[index]->offset;
- i < dma_dom->aperture_size;
- i += PAGE_SIZE) {
- u64 *pte = fetch_pte(&dma_dom->domain, i);
- if (!pte || !IOMMU_PTE_PRESENT(*pte))
- continue;
-
- dma_ops_reserve_addresses(dma_dom, i >> PAGE_SHIFT, 1);
- }
-
- return 0;
-
-out_free:
- free_page((unsigned long)dma_dom->aperture[index]->bitmap);
-
- kfree(dma_dom->aperture[index]);
- dma_dom->aperture[index] = NULL;
-
- return -ENOMEM;
-}
-
-static unsigned long dma_ops_area_alloc(struct device *dev,
- struct dma_ops_domain *dom,
- unsigned int pages,
- unsigned long align_mask,
- u64 dma_mask,
- unsigned long start)
-{
- unsigned long next_bit = dom->next_address % APERTURE_RANGE_SIZE;
- int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT;
- int i = start >> APERTURE_RANGE_SHIFT;
- unsigned long boundary_size;
- unsigned long address = -1;
- unsigned long limit;
-
- next_bit >>= PAGE_SHIFT;
-
- boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
- PAGE_SIZE) >> PAGE_SHIFT;
-
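- /* Search each aperture range below the dma mask for a free area */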
- for (;i < max_index; ++i) {
- unsigned long offset = dom->aperture[i]->offset >> PAGE_SHIFT;
-
- if (dom->aperture[i]->offset >= dma_mask)
- break;
-
- limit = iommu_device_max_index(APERTURE_RANGE_PAGES, offset,
- dma_mask >> PAGE_SHIFT);
-
- address = iommu_area_alloc(dom->aperture[i]->bitmap,
- limit, next_bit, pages, 0,
- boundary_size, align_mask);
- if (address != -1) {
- address = dom->aperture[i]->offset +
- (address << PAGE_SHIFT);
- dom->next_address = address + (pages << PAGE_SHIFT);
- break;
- }
-
- next_bit = 0;
- }
-
- return address;
-}
-
-static unsigned long dma_ops_alloc_addresses(struct device *dev,
- struct dma_ops_domain *dom,
- unsigned int pages,
- unsigned long align_mask,
- u64 dma_mask)
-{
- unsigned long address;
-
-#ifdef CONFIG_IOMMU_STRESS
- dom->next_address = 0;
- dom->need_flush = true;
-#endif
-
- address = dma_ops_area_alloc(dev, dom, pages, align_mask,
- dma_mask, dom->next_address);
-
- if (address == -1) {
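- /* Start over at the beginning of the aperture and search again */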
- dom->next_address = 0;
- address = dma_ops_area_alloc(dev, dom, pages, align_mask,
- dma_mask, 0);
- dom->need_flush = true;
- }
-
- if (unlikely(address == -1))
- address = bad_dma_address;
-
- WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
-
- return address;
-}
-
-/*
- * The address free function.
- *
- * called with domain->lock held
- */
-static void dma_ops_free_addresses(struct dma_ops_domain *dom,
- unsigned long address,
- unsigned int pages)
-{
- unsigned i = address >> APERTURE_RANGE_SHIFT;
- struct aperture_range *range = dom->aperture[i];
-
- BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL);
-
-#ifdef CONFIG_IOMMU_STRESS
- if (i < 4)
- return;
-#endif
-
- if (address >= dom->next_address)
- dom->need_flush = true;
-
- address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT;
-
- iommu_area_free(range->bitmap, address, pages);
-
-}
-
-/****************************************************************************
- *
- * The next functions belong to the domain allocation. A domain is
- * allocated for every IOMMU as the default domain. If device isolation
- * is enabled, every device gets its own domain. The most important thing
- * about domains is the page table mapping the DMA address space they
- * contain.
- *
- ****************************************************************************/
-
-static u16 domain_id_alloc(void)
-{
- unsigned long flags;
- int id;
-
- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
- id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID);
- BUG_ON(id == 0);
- if (id > 0 && id < MAX_DOMAIN_ID)
- __set_bit(id, amd_iommu_pd_alloc_bitmap);
- else
- id = 0;
- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-
- return id;
-}
-
-static void domain_id_free(int id)
-{
- unsigned long flags;
-
- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
- if (id > 0 && id < MAX_DOMAIN_ID)
- __clear_bit(id, amd_iommu_pd_alloc_bitmap);
- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-}
-
-/*
- * Used to reserve address ranges in the aperture (e.g. for exclusion
- * ranges).
- */
-static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
- unsigned long start_page,
- unsigned int pages)
-{
- unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
-
- if (start_page + pages > last_page)
- pages = last_page - start_page;
-
- for (i = start_page; i < start_page + pages; ++i) {
- int index = i / APERTURE_RANGE_PAGES;
- int page = i % APERTURE_RANGE_PAGES;
- __set_bit(page, dom->aperture[index]->bitmap);
- }
-}
-
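-/* Free all pages of a 3-level IO page table */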
-static void free_pagetable(struct protection_domain *domain)
-{
- int i, j;
- u64 *p1, *p2, *p3;
-
- p1 = domain->pt_root;
-
- if (!p1)
- return;
-
- for (i = 0; i < 512; ++i) {
- if (!IOMMU_PTE_PRESENT(p1[i]))
- continue;
-
- p2 = IOMMU_PTE_PAGE(p1[i]);
- for (j = 0; j < 512; ++j) {
- if (!IOMMU_PTE_PRESENT(p2[j]))
- continue;
- p3 = IOMMU_PTE_PAGE(p2[j]);
- free_page((unsigned long)p3);
- }
-
- free_page((unsigned long)p2);
- }
-
- free_page((unsigned long)p1);
-
- domain->pt_root = NULL;
-}
-
-/*
- * Free a domain, only used if something went wrong in the
- * allocation path and we need to free an already allocated page table
- */
-static void dma_ops_domain_free(struct dma_ops_domain *dom)
-{
- int i;
-
- if (!dom)
- return;
-
- free_pagetable(&dom->domain);
-
- for (i = 0; i < APERTURE_MAX_RANGES; ++i) {
- if (!dom->aperture[i])
- continue;
- free_page((unsigned long)dom->aperture[i]->bitmap);
- kfree(dom->aperture[i]);
- }
-
- kfree(dom);
-}
-
-/*
- * Allocates a new protection domain usable for the dma_ops functions.
- * It also initializes the page table and the address allocator data
- * structures required for the dma_ops interface
- */
-static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
-{
- struct dma_ops_domain *dma_dom;
-
- dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL);
- if (!dma_dom)
- return NULL;
-
- spin_lock_init(&dma_dom->domain.lock);
-
- dma_dom->domain.id = domain_id_alloc();
- if (dma_dom->domain.id == 0)
- goto free_dma_dom;
- dma_dom->domain.mode = PAGE_MODE_3_LEVEL;
- dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
- dma_dom->domain.flags = PD_DMA_OPS_MASK;
- dma_dom->domain.priv = dma_dom;
- if (!dma_dom->domain.pt_root)
- goto free_dma_dom;
-
- dma_dom->need_flush = false;
- dma_dom->target_dev = 0xffff;
-
- if (alloc_new_range(iommu, dma_dom, true, GFP_KERNEL))
- goto free_dma_dom;
-
- /*
- * mark the first page as allocated so we never return 0 as
- * a valid dma-address and can use 0 as the error value
- */
- dma_dom->aperture[0]->bitmap[0] = 1;
- dma_dom->next_address = 0;
-
-
- return dma_dom;
-
-free_dma_dom:
- dma_ops_domain_free(dma_dom);
-
- return NULL;
-}
-
-/*
- * little helper function to check whether a given protection domain is a
- * dma_ops domain
- */
-static bool dma_ops_domain(struct protection_domain *domain)
-{
- return domain->flags & PD_DMA_OPS_MASK;
-}
-
-/*
- * Find out the protection domain structure for a given PCI device. This
- * will give us the pointer to the page table root for example.
- */
-static struct protection_domain *domain_for_device(u16 devid)
-{
- struct protection_domain *dom;
- unsigned long flags;
-
- read_lock_irqsave(&amd_iommu_devtable_lock, flags);
- dom = amd_iommu_pd_table[devid];
- read_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-
- return dom;
-}
-
-/*
- * If a device is not yet associated with a domain, this function assigns
- * it to the given domain and makes the assignment visible to the hardware
- */
-static void attach_device(struct amd_iommu *iommu,
- struct protection_domain *domain,
- u16 devid)
-{
- unsigned long flags;
- u64 pte_root = virt_to_phys(domain->pt_root);
-
- domain->dev_cnt += 1;
-
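- /* Encode the page table root, paging mode and permissions into the device table entry */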
- pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
- << DEV_ENTRY_MODE_SHIFT;
- pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
-
- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
- amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
- amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
- amd_iommu_dev_table[devid].data[2] = domain->id;
-
- amd_iommu_pd_table[devid] = domain;
- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-
- /*
- * We might boot into a crash-kernel here. The crashed kernel
- * left the caches in the IOMMU dirty. So we have to flush
- * here to evict all dirty stuff.
- */
- iommu_queue_inv_dev_entry(iommu, devid);
- iommu_flush_tlb_pde(iommu, domain->id);
-}
-
-/*
- * Removes a device from a protection domain (unlocked)
- */
-static void __detach_device(struct protection_domain *domain, u16 devid)
-{
-
- /* lock domain */
- spin_lock(&domain->lock);
-
- /* remove domain from the lookup table */
- amd_iommu_pd_table[devid] = NULL;
-
- /* remove entry from the device table seen by the hardware */
- amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV;
- amd_iommu_dev_table[devid].data[1] = 0;
- amd_iommu_dev_table[devid].data[2] = 0;
-
- /* decrease reference counter */
- domain->dev_cnt -= 1;
-
- /* ready */
- spin_unlock(&domain->lock);
-}
-
-/*
- * Removes a device from a protection domain (with devtable_lock held)
- */
-static void detach_device(struct protection_domain *domain, u16 devid)
-{
- unsigned long flags;
-
- /* lock device table */
- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
- __detach_device(domain, devid);
- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-}
-
-static int device_change_notifier(struct notifier_block *nb,
- unsigned long action, void *data)
-{
- struct device *dev = data;
- struct pci_dev *pdev = to_pci_dev(dev);
- u16 devid = calc_devid(pdev->bus->number, pdev->devfn);
- struct protection_domain *domain;
- struct dma_ops_domain *dma_domain;
- struct amd_iommu *iommu;
- unsigned long flags;
-
- if (devid > amd_iommu_last_bdf)
- goto out;
-
- devid = amd_iommu_alias_table[devid];
-
- iommu = amd_iommu_rlookup_table[devid];
- if (iommu == NULL)
- goto out;
-
- domain = domain_for_device(devid);
-
- if (domain && !dma_ops_domain(domain))
- WARN_ONCE(1, "AMD IOMMU WARNING: device %s already bound "
- "to a non-dma-ops domain\n", dev_name(dev));
-
- switch (action) {
- case BUS_NOTIFY_UNBOUND_DRIVER:
- if (!domain)
- goto out;
- detach_device(domain, devid);
- break;
- case BUS_NOTIFY_ADD_DEVICE:
- /* allocate a protection domain if a device is added */
- dma_domain = find_protection_domain(devid);
- if (dma_domain)
- goto out;
- dma_domain = dma_ops_domain_alloc(iommu);
- if (!dma_domain)
- goto out;
- dma_domain->target_dev = devid;
-
- spin_lock_irqsave(&iommu_pd_list_lock, flags);
- list_add_tail(&dma_domain->list, &iommu_pd_list);
- spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
-
- break;
- default:
- goto out;
- }
-
- iommu_queue_inv_dev_entry(iommu, devid);
- iommu_completion_wait(iommu);
-
-out:
- return 0;
-}
-
-static struct notifier_block device_nb = {
- .notifier_call = device_change_notifier,
-};
-
-/*****************************************************************************
- *
- * The next functions belong to the dma_ops mapping/unmapping code.
- *
- *****************************************************************************/
-
-/*
- * This function checks if the driver got a valid device from the caller to
- * avoid dereferencing invalid pointers.
- */
-static bool check_device(struct device *dev)
-{
- if (!dev || !dev->dma_mask)
- return false;
-
- return true;
-}
-
-/*
- * In this function the list of preallocated protection domains is traversed to
- * find the domain for a specific device
- */
-static struct dma_ops_domain *find_protection_domain(u16 devid)
-{
- struct dma_ops_domain *entry, *ret = NULL;
- unsigned long flags;
-
- if (list_empty(&iommu_pd_list))
- return NULL;
-
- spin_lock_irqsave(&iommu_pd_list_lock, flags);
-
- list_for_each_entry(entry, &iommu_pd_list, list) {
- if (entry->target_dev == devid) {
- ret = entry;
- break;
- }
- }
-
- spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
-
- return ret;
-}
-
-/*
- * In the dma_ops path we only have the struct device. This function
- * finds the corresponding IOMMU, the protection domain and the
- * requestor id for a given device.
- * If the device is not yet associated with a domain this is also done
- * in this function.
- */
-static int get_device_resources(struct device *dev,
- struct amd_iommu **iommu,
- struct protection_domain **domain,
- u16 *bdf)
-{
- struct dma_ops_domain *dma_dom;
- struct pci_dev *pcidev;
- u16 _bdf;
-
- *iommu = NULL;
- *domain = NULL;
- *bdf = 0xffff;
-
- if (dev->bus != &pci_bus_type)
- return 0;
-
- pcidev = to_pci_dev(dev);
- _bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
-
- /* device not translated by any IOMMU in the system? */
- if (_bdf > amd_iommu_last_bdf)
- return 0;
-
- *bdf = amd_iommu_alias_table[_bdf];
-
- *iommu = amd_iommu_rlookup_table[*bdf];
- if (*iommu == NULL)
- return 0;
- *domain = domain_for_device(*bdf);
- if (*domain == NULL) {
- dma_dom = find_protection_domain(*bdf);
- if (!dma_dom)
- dma_dom = (*iommu)->default_dom;
- *domain = &dma_dom->domain;
- attach_device(*iommu, *domain, *bdf);
- DUMP_printk("Using protection domain %d for device %s\n",
- (*domain)->id, dev_name(dev));
- }
-
- if (domain_for_device(_bdf) == NULL)
- attach_device(*iommu, *domain, _bdf);
-
- return 1;
-}
-
-/*
- * Allocates the missing page table levels for a given address and
- * returns a pointer to the last-level PTE
- */
-static u64* alloc_pte(struct protection_domain *dom,
- unsigned long address, u64 **pte_page, gfp_t gfp)
-{
- u64 *pte, *page;
-
- pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(address)];
-
- if (!IOMMU_PTE_PRESENT(*pte)) {
- page = (u64 *)get_zeroed_page(gfp);
- if (!page)
- return NULL;
- *pte = IOMMU_L2_PDE(virt_to_phys(page));
- }
-
- pte = IOMMU_PTE_PAGE(*pte);
- pte = &pte[IOMMU_PTE_L1_INDEX(address)];
-
- if (!IOMMU_PTE_PRESENT(*pte)) {
- page = (u64 *)get_zeroed_page(gfp);
- if (!page)
- return NULL;
- *pte = IOMMU_L1_PDE(virt_to_phys(page));
- }
-
- pte = IOMMU_PTE_PAGE(*pte);
-
- if (pte_page)
- *pte_page = pte;
-
- pte = &pte[IOMMU_PTE_L0_INDEX(address)];
-
- return pte;
-}
-
-/*
- * This function fetches the PTE for a given address in the aperture
- */
-static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
- unsigned long address)
-{
- struct aperture_range *aperture;
- u64 *pte, *pte_page;
-
- aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
- if (!aperture)
- return NULL;
-
- pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
- if (!pte) {
- pte = alloc_pte(&dom->domain, address, &pte_page, GFP_ATOMIC);
- aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
- } else
- pte += IOMMU_PTE_L0_INDEX(address);
-
- return pte;
-}
-
-/*
- * This is the generic map function. It maps one 4kb page at paddr to
- * the given address in the DMA address space for the domain.
- */
-static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
- struct dma_ops_domain *dom,
- unsigned long address,
- phys_addr_t paddr,
- int direction)
-{
- u64 *pte, __pte;
-
- WARN_ON(address > dom->aperture_size);
-
- paddr &= PAGE_MASK;
-
- pte = dma_ops_get_pte(dom, address);
- if (!pte)
- return bad_dma_address;
-
- __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
-
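- /* Translate the DMA direction into IOMMU read/write permissions */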
- if (direction == DMA_TO_DEVICE)
- __pte |= IOMMU_PTE_IR;
- else if (direction == DMA_FROM_DEVICE)
- __pte |= IOMMU_PTE_IW;
- else if (direction == DMA_BIDIRECTIONAL)
- __pte |= IOMMU_PTE_IR | IOMMU_PTE_IW;
-
- WARN_ON(*pte);
-
- *pte = __pte;
-
- return (dma_addr_t)address;
-}
-
-/*
- * The generic unmapping function for one page in the DMA address space.
- */
-static void dma_ops_domain_unmap(struct amd_iommu *iommu,
- struct dma_ops_domain *dom,
- unsigned long address)
-{
- struct aperture_range *aperture;
- u64 *pte;
-
- if (address >= dom->aperture_size)
- return;
-
- aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
- if (!aperture)
- return;
-
- pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
- if (!pte)
- return;
-
- pte += IOMMU_PTE_L0_INDEX(address);
-
- WARN_ON(!*pte);
-
- *pte = 0ULL;
-}
-
-/*
- * This function contains common code for mapping of a physically
- * contiguous memory region into DMA address space. It is used by all
- * mapping functions provided with this IOMMU driver.
- * Must be called with the domain lock held.
- */
-static dma_addr_t __map_single(struct device *dev,
- struct amd_iommu *iommu,
- struct dma_ops_domain *dma_dom,
- phys_addr_t paddr,
- size_t size,
- int dir,
- bool align,
- u64 dma_mask)
-{
- dma_addr_t offset = paddr & ~PAGE_MASK;
- dma_addr_t address, start, ret;
- unsigned int pages;
- unsigned long align_mask = 0;
- int i;
-
- pages = iommu_num_pages(paddr, size, PAGE_SIZE);
- paddr &= PAGE_MASK;
-
- INC_STATS_COUNTER(total_map_requests);
-
- if (pages > 1)
- INC_STATS_COUNTER(cross_page);
-
- if (align)
- align_mask = (1UL << get_order(size)) - 1;
-
-retry:
- address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
- dma_mask);
- if (unlikely(address == bad_dma_address)) {
- /*
- * setting next_address here will let the address
- * allocator only scan the newly allocated range in the
- * first run. This is a small optimization.
- */
- dma_dom->next_address = dma_dom->aperture_size;
-
- if (alloc_new_range(iommu, dma_dom, false, GFP_ATOMIC))
- goto out;
-
- /*
- * aperture was successfully enlarged by 128 MB, try
- * allocation again
- */
- goto retry;
- }
-
- start = address;
- for (i = 0; i < pages; ++i) {
- ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir);
- if (ret == bad_dma_address)
- goto out_unmap;
-
- paddr += PAGE_SIZE;
- start += PAGE_SIZE;
- }
- address += offset;
-
- ADD_STATS_COUNTER(alloced_io_mem, size);
-
- if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
- iommu_flush_tlb(iommu, dma_dom->domain.id);
- dma_dom->need_flush = false;
- } else if (unlikely(iommu_has_npcache(iommu)))
- iommu_flush_pages(iommu, dma_dom->domain.id, address, size);
-
-out:
- return address;
-
-out_unmap:
-
- for (--i; i >= 0; --i) {
- start -= PAGE_SIZE;
- dma_ops_domain_unmap(iommu, dma_dom, start);
- }
-
- dma_ops_free_addresses(dma_dom, address, pages);
-
- return bad_dma_address;
-}
-
-/*
- * Does the reverse of the __map_single function. Must be called with
- * the domain lock held too
- */
-static void __unmap_single(struct amd_iommu *iommu,
- struct dma_ops_domain *dma_dom,
- dma_addr_t dma_addr,
- size_t size,
- int dir)
-{
- dma_addr_t i, start;
- unsigned int pages;
-
- if ((dma_addr == bad_dma_address) ||
- (dma_addr + size > dma_dom->aperture_size))
- return;
-
- pages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
- dma_addr &= PAGE_MASK;
- start = dma_addr;
-
- for (i = 0; i < pages; ++i) {
- dma_ops_domain_unmap(iommu, dma_dom, start);
- start += PAGE_SIZE;
- }
-
- SUB_STATS_COUNTER(alloced_io_mem, size);
-
- dma_ops_free_addresses(dma_dom, dma_addr, pages);
-
- if (amd_iommu_unmap_flush || dma_dom->need_flush) {
- iommu_flush_pages(iommu, dma_dom->domain.id, dma_addr, size);
- dma_dom->need_flush = false;
- }
-}
-
-/*
- * The exported map_page function for dma_ops.
- */
-static dma_addr_t map_page(struct device *dev, struct page *page,
- unsigned long offset, size_t size,
- enum dma_data_direction dir,
- struct dma_attrs *attrs)
-{
- unsigned long flags;
- struct amd_iommu *iommu;
- struct protection_domain *domain;
- u16 devid;
- dma_addr_t addr;
- u64 dma_mask;
- phys_addr_t paddr = page_to_phys(page) + offset;
-
- INC_STATS_COUNTER(cnt_map_single);
-
- if (!check_device(dev))
- return bad_dma_address;
-
- dma_mask = *dev->dma_mask;
-
- get_device_resources(dev, &iommu, &domain, &devid);
-
- if (iommu == NULL || domain == NULL)
- /* device not handled by any AMD IOMMU */
- return (dma_addr_t)paddr;
-
- if (!dma_ops_domain(domain))
- return bad_dma_address;
-
- spin_lock_irqsave(&domain->lock, flags);
- addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false,
- dma_mask);
- if (addr == bad_dma_address)
- goto out;
-
- iommu_completion_wait(iommu);
-
-out:
- spin_unlock_irqrestore(&domain->lock, flags);
-
- return addr;
-}
-
-/*
- * The exported unmap_page function for dma_ops.
- */
-static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
- enum dma_data_direction dir, struct dma_attrs *attrs)
-{
- unsigned long flags;
- struct amd_iommu *iommu;
- struct protection_domain *domain;
- u16 devid;
-
- INC_STATS_COUNTER(cnt_unmap_single);
-
- if (!check_device(dev) ||
- !get_device_resources(dev, &iommu, &domain, &devid))
- /* device not handled by any AMD IOMMU */
- return;
-
- if (!dma_ops_domain(domain))
- return;
-
- spin_lock_irqsave(&domain->lock, flags);
-
- __unmap_single(iommu, domain->priv, dma_addr, size, dir);
-
- iommu_completion_wait(iommu);
-
- spin_unlock_irqrestore(&domain->lock, flags);
-}
-
-/*
- * This is a special map_sg function which is used for devices that are
- * not handled by an AMD IOMMU in the system.
- */
-static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
- int nelems, int dir)
-{
- struct scatterlist *s;
- int i;
-
- for_each_sg(sglist, s, nelems, i) {
- s->dma_address = (dma_addr_t)sg_phys(s);
- s->dma_length = s->length;
- }
-
- return nelems;
-}
-
-/*
- * The exported map_sg function for dma_ops (handles scatter-gather
- * lists).
- */
-static int map_sg(struct device *dev, struct scatterlist *sglist,
- int nelems, enum dma_data_direction dir,
- struct dma_attrs *attrs)
-{
- unsigned long flags;
- struct amd_iommu *iommu;
- struct protection_domain *domain;
- u16 devid;
- int i;
- struct scatterlist *s;
- phys_addr_t paddr;
- int mapped_elems = 0;
- u64 dma_mask;
-
- INC_STATS_COUNTER(cnt_map_sg);
-
- if (!check_device(dev))
- return 0;
-
- dma_mask = *dev->dma_mask;
-
- get_device_resources(dev, &iommu, &domain, &devid);
-
- if (!iommu || !domain)
- return map_sg_no_iommu(dev, sglist, nelems, dir);
-
- if (!dma_ops_domain(domain))
- return 0;
-
- spin_lock_irqsave(&domain->lock, flags);
-
- for_each_sg(sglist, s, nelems, i) {
- paddr = sg_phys(s);
-
- s->dma_address = __map_single(dev, iommu, domain->priv,
- paddr, s->length, dir, false,
- dma_mask);
-
- if (s->dma_address) {
- s->dma_length = s->length;
- mapped_elems++;
- } else
- goto unmap;
- }
-
- iommu_completion_wait(iommu);
-
-out:
- spin_unlock_irqrestore(&domain->lock, flags);
-
- return mapped_elems;
-unmap:
- for_each_sg(sglist, s, mapped_elems, i) {
- if (s->dma_address)
- __unmap_single(iommu, domain->priv, s->dma_address,
- s->dma_length, dir);
- s->dma_address = s->dma_length = 0;
- }
-
- mapped_elems = 0;
-
- goto out;
-}
-
-/*
- * The exported unmap_sg function for dma_ops (handles scatter-gather
- * lists).
- */
-static void unmap_sg(struct device *dev, struct scatterlist *sglist,
- int nelems, enum dma_data_direction dir,
- struct dma_attrs *attrs)
-{
- unsigned long flags;
- struct amd_iommu *iommu;
- struct protection_domain *domain;
- struct scatterlist *s;
- u16 devid;
- int i;
-
- INC_STATS_COUNTER(cnt_unmap_sg);
-
- if (!check_device(dev) ||
- !get_device_resources(dev, &iommu, &domain, &devid))
- return;
-
- if (!dma_ops_domain(domain))
- return;
-
- spin_lock_irqsave(&domain->lock, flags);
-
- for_each_sg(sglist, s, nelems, i) {
- __unmap_single(iommu, domain->priv, s->dma_address,
- s->dma_length, dir);
- s->dma_address = s->dma_length = 0;
- }
-
- iommu_completion_wait(iommu);
-
- spin_unlock_irqrestore(&domain->lock, flags);
-}
-
-/*
- * The exported alloc_coherent function for dma_ops.
- */
-static void *alloc_coherent(struct device *dev, size_t size,
- dma_addr_t *dma_addr, gfp_t flag)
-{
- unsigned long flags;
- void *virt_addr;
- struct amd_iommu *iommu;
- struct protection_domain *domain;
- u16 devid;
- phys_addr_t paddr;
- u64 dma_mask = dev->coherent_dma_mask;
-
- INC_STATS_COUNTER(cnt_alloc_coherent);
-
- if (!check_device(dev))
- return NULL;
-
- if (!get_device_resources(dev, &iommu, &domain, &devid))
- flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
-
- flag |= __GFP_ZERO;
- virt_addr = (void *)__get_free_pages(flag, get_order(size));
- if (!virt_addr)
- return NULL;
-
- paddr = virt_to_phys(virt_addr);
-
- if (!iommu || !domain) {
- *dma_addr = (dma_addr_t)paddr;
- return virt_addr;
- }
-
- if (!dma_ops_domain(domain))
- goto out_free;
-
- if (!dma_mask)
- dma_mask = *dev->dma_mask;
-
- spin_lock_irqsave(&domain->lock, flags);
-
- *dma_addr = __map_single(dev, iommu, domain->priv, paddr,
- size, DMA_BIDIRECTIONAL, true, dma_mask);
-
- if (*dma_addr == bad_dma_address) {
- spin_unlock_irqrestore(&domain->lock, flags);
- goto out_free;
- }
-
- iommu_completion_wait(iommu);
-
- spin_unlock_irqrestore(&domain->lock, flags);
-
- return virt_addr;
-
-out_free:
-
- free_pages((unsigned long)virt_addr, get_order(size));
-
- return NULL;
-}
-
-/*
- * The exported free_coherent function for dma_ops.
- */
-static void free_coherent(struct device *dev, size_t size,
- void *virt_addr, dma_addr_t dma_addr)
-{
- unsigned long flags;
- struct amd_iommu *iommu;
- struct protection_domain *domain;
- u16 devid;
-
- INC_STATS_COUNTER(cnt_free_coherent);
-
- if (!check_device(dev))
- return;
-
- get_device_resources(dev, &iommu, &domain, &devid);
-
- if (!iommu || !domain)
- goto free_mem;
-
- if (!dma_ops_domain(domain))
- goto free_mem;
-
- spin_lock_irqsave(&domain->lock, flags);
-
- __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
-
- iommu_completion_wait(iommu);
-
- spin_unlock_irqrestore(&domain->lock, flags);
-
-free_mem:
- free_pages((unsigned long)virt_addr, get_order(size));
-}
-
-/*
- * This function is called by the DMA layer to find out if we can handle a
- * particular device. It is part of the dma_ops.
- */
-static int amd_iommu_dma_supported(struct device *dev, u64 mask)
-{
- u16 bdf;
- struct pci_dev *pcidev;
-
- /* No device or no PCI device */
- if (!dev || dev->bus != &pci_bus_type)
- return 0;
-
- pcidev = to_pci_dev(dev);
-
- bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
-
- /* Out of our scope? */
- if (bdf > amd_iommu_last_bdf)
- return 0;
-
- return 1;
-}
-
-/*
- * The function for pre-allocating protection domains.
- *
- * Once the driver core informs the DMA layer when a driver grabs a
- * device we won't need to preallocate the protection domains anymore.
- * For now we have to.
- */
-static void prealloc_protection_domains(void)
-{
- struct pci_dev *dev = NULL;
- struct dma_ops_domain *dma_dom;
- struct amd_iommu *iommu;
- u16 devid;
-
- while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
- devid = calc_devid(dev->bus->number, dev->devfn);
- if (devid > amd_iommu_last_bdf)
- continue;
- devid = amd_iommu_alias_table[devid];
- if (domain_for_device(devid))
- continue;
- iommu = amd_iommu_rlookup_table[devid];
- if (!iommu)
- continue;
- dma_dom = dma_ops_domain_alloc(iommu);
- if (!dma_dom)
- continue;
- init_unity_mappings_for_device(dma_dom, devid);
- dma_dom->target_dev = devid;
-
- list_add_tail(&dma_dom->list, &iommu_pd_list);
- }
-}
-
-static struct dma_map_ops amd_iommu_dma_ops = {
- .alloc_coherent = alloc_coherent,
- .free_coherent = free_coherent,
- .map_page = map_page,
- .unmap_page = unmap_page,
- .map_sg = map_sg,
- .unmap_sg = unmap_sg,
- .dma_supported = amd_iommu_dma_supported,
-};
-
-/*
- * The function which hooks the AMD IOMMU driver into dma_ops.
- */
-int __init amd_iommu_init_dma_ops(void)
-{
- struct amd_iommu *iommu;
- int ret;
-
- /*
- * first allocate a default protection domain for every IOMMU we
- * found in the system. Devices not assigned to any other
- * protection domain will be assigned to the default one.
- */
- for_each_iommu(iommu) {
- iommu->default_dom = dma_ops_domain_alloc(iommu);
- if (iommu->default_dom == NULL)
- return -ENOMEM;
- iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
- ret = iommu_init_unity_mappings(iommu);
- if (ret)
- goto free_domains;
- }
-
- /*
- * If device isolation is enabled, pre-allocate the protection
- * domains for each device.
- */
- if (amd_iommu_isolate)
- prealloc_protection_domains();
-
- iommu_detected = 1;
- force_iommu = 1;
- bad_dma_address = 0;
-#ifdef CONFIG_GART_IOMMU
- gart_iommu_aperture_disabled = 1;
- gart_iommu_aperture = 0;
-#endif
-
- /* Finally make the AMD IOMMU dma_ops visible to the drivers */
- dma_ops = &amd_iommu_dma_ops;
-
- register_iommu(&amd_iommu_ops);
-
- bus_register_notifier(&pci_bus_type, &device_nb);
-
- amd_iommu_stats_init();
-
- return 0;
-
-free_domains:
-
- for_each_iommu(iommu) {
- if (iommu->default_dom)
- dma_ops_domain_free(iommu->default_dom);
- }
-
- return ret;
-}
-
-/*****************************************************************************
- *
- * The following functions belong to the exported interface of AMD IOMMU
- *
- * This interface allows access to lower level functions of the IOMMU
- * like protection domain handling and assignment of devices to domains
- * which is not possible with the dma_ops interface.
- *
- *****************************************************************************/
-
-static void cleanup_domain(struct protection_domain *domain)
-{
- unsigned long flags;
- u16 devid;
-
- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
-
- for (devid = 0; devid <= amd_iommu_last_bdf; ++devid)
- if (amd_iommu_pd_table[devid] == domain)
- __detach_device(domain, devid);
-
- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-}
-
-static int amd_iommu_domain_init(struct iommu_domain *dom)
-{
- struct protection_domain *domain;
-
- domain = kzalloc(sizeof(*domain), GFP_KERNEL);
- if (!domain)
- return -ENOMEM;
-
- spin_lock_init(&domain->lock);
- domain->mode = PAGE_MODE_3_LEVEL;
- domain->id = domain_id_alloc();
- if (!domain->id)
- goto out_free;
- domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL);
- if (!domain->pt_root)
- goto out_free;
-
- dom->priv = domain;
-
- return 0;
-
-out_free:
- kfree(domain);
-
- return -ENOMEM;
-}
-
-static void amd_iommu_domain_destroy(struct iommu_domain *dom)
-{
- struct protection_domain *domain = dom->priv;
-
- if (!domain)
- return;
-
- if (domain->dev_cnt > 0)
- cleanup_domain(domain);
-
- BUG_ON(domain->dev_cnt != 0);
-
- free_pagetable(domain);
-
- domain_id_free(domain->id);
-
- kfree(domain);
-
- dom->priv = NULL;
-}
-
-static void amd_iommu_detach_device(struct iommu_domain *dom,
- struct device *dev)
-{
- struct protection_domain *domain = dom->priv;
- struct amd_iommu *iommu;
- struct pci_dev *pdev;
- u16 devid;
-
- if (dev->bus != &pci_bus_type)
- return;
-
- pdev = to_pci_dev(dev);
-
- devid = calc_devid(pdev->bus->number, pdev->devfn);
-
- if (devid > 0)
- detach_device(domain, devid);
-
- iommu = amd_iommu_rlookup_table[devid];
- if (!iommu)
- return;
-
- iommu_queue_inv_dev_entry(iommu, devid);
- iommu_completion_wait(iommu);
-}
-
-static int amd_iommu_attach_device(struct iommu_domain *dom,
- struct device *dev)
-{
- struct protection_domain *domain = dom->priv;
- struct protection_domain *old_domain;
- struct amd_iommu *iommu;
- struct pci_dev *pdev;
- u16 devid;
-
- if (dev->bus != &pci_bus_type)
- return -EINVAL;
-
- pdev = to_pci_dev(dev);
-
- devid = calc_devid(pdev->bus->number, pdev->devfn);
-
- if (devid >= amd_iommu_last_bdf ||
- devid != amd_iommu_alias_table[devid])
- return -EINVAL;
-
- iommu = amd_iommu_rlookup_table[devid];
- if (!iommu)
- return -EINVAL;
-
- old_domain = domain_for_device(devid);
- if (old_domain)
- detach_device(old_domain, devid);
-
- attach_device(iommu, domain, devid);
-
- iommu_completion_wait(iommu);
-
- return 0;
-}
-
-static int amd_iommu_map_range(struct iommu_domain *dom,
- unsigned long iova, phys_addr_t paddr,
- size_t size, int iommu_prot)
-{
- struct protection_domain *domain = dom->priv;
- unsigned long i, npages = iommu_num_pages(paddr, size, PAGE_SIZE);
- int prot = 0;
- int ret;
-
- if (iommu_prot & IOMMU_READ)
- prot |= IOMMU_PROT_IR;
- if (iommu_prot & IOMMU_WRITE)
- prot |= IOMMU_PROT_IW;
-
- iova &= PAGE_MASK;
- paddr &= PAGE_MASK;
-
- for (i = 0; i < npages; ++i) {
- ret = iommu_map_page(domain, iova, paddr, prot);
- if (ret)
- return ret;
-
- iova += PAGE_SIZE;
- paddr += PAGE_SIZE;
- }
-
- return 0;
-}
-
-static void amd_iommu_unmap_range(struct iommu_domain *dom,
- unsigned long iova, size_t size)
-{
-
- struct protection_domain *domain = dom->priv;
- unsigned long i, npages = iommu_num_pages(iova, size, PAGE_SIZE);
-
- iova &= PAGE_MASK;
-
- for (i = 0; i < npages; ++i) {
- iommu_unmap_page(domain, iova);
- iova += PAGE_SIZE;
- }
-
- iommu_flush_domain(domain->id);
-}
-
-static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
- unsigned long iova)
-{
- struct protection_domain *domain = dom->priv;
- unsigned long offset = iova & ~PAGE_MASK;
- phys_addr_t paddr;
- u64 *pte;
-
- pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(iova)];
-
- if (!IOMMU_PTE_PRESENT(*pte))
- return 0;
-
- pte = IOMMU_PTE_PAGE(*pte);
- pte = &pte[IOMMU_PTE_L1_INDEX(iova)];
-
- if (!IOMMU_PTE_PRESENT(*pte))
- return 0;
-
- pte = IOMMU_PTE_PAGE(*pte);
- pte = &pte[IOMMU_PTE_L0_INDEX(iova)];
-
- if (!IOMMU_PTE_PRESENT(*pte))
- return 0;
-
- paddr = *pte & IOMMU_PAGE_MASK;
- paddr |= offset;
-
- return paddr;
-}
-
-static int amd_iommu_domain_has_cap(struct iommu_domain *domain,
- unsigned long cap)
-{
- return 0;
-}
-
-static struct iommu_ops amd_iommu_ops = {
- .domain_init = amd_iommu_domain_init,
- .domain_destroy = amd_iommu_domain_destroy,
- .attach_dev = amd_iommu_attach_device,
- .detach_dev = amd_iommu_detach_device,
- .map = amd_iommu_map_range,
- .unmap = amd_iommu_unmap_range,
- .iova_to_phys = amd_iommu_iova_to_phys,
- .domain_has_cap = amd_iommu_domain_has_cap,
-};
-
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
deleted file mode 100644
index c1b17e97252e..000000000000
--- a/arch/x86/kernel/amd_iommu_init.c
+++ /dev/null
@@ -1,1349 +0,0 @@
-/*
- * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
- * Author: Joerg Roedel <joerg.roedel@amd.com>
- * Leo Duran <leo.duran@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#include <linux/pci.h>
-#include <linux/acpi.h>
-#include <linux/gfp.h>
-#include <linux/list.h>
-#include <linux/sysdev.h>
-#include <linux/interrupt.h>
-#include <linux/msi.h>
-#include <asm/pci-direct.h>
-#include <asm/amd_iommu_types.h>
-#include <asm/amd_iommu.h>
-#include <asm/iommu.h>
-#include <asm/gart.h>
-
-/*
- * definitions for the ACPI scanning code
- */
-#define IVRS_HEADER_LENGTH 48
-
-#define ACPI_IVHD_TYPE 0x10
-#define ACPI_IVMD_TYPE_ALL 0x20
-#define ACPI_IVMD_TYPE 0x21
-#define ACPI_IVMD_TYPE_RANGE 0x22
-
-#define IVHD_DEV_ALL 0x01
-#define IVHD_DEV_SELECT 0x02
-#define IVHD_DEV_SELECT_RANGE_START 0x03
-#define IVHD_DEV_RANGE_END 0x04
-#define IVHD_DEV_ALIAS 0x42
-#define IVHD_DEV_ALIAS_RANGE 0x43
-#define IVHD_DEV_EXT_SELECT 0x46
-#define IVHD_DEV_EXT_SELECT_RANGE 0x47
-
-#define IVHD_FLAG_HT_TUN_EN_MASK 0x01
-#define IVHD_FLAG_PASSPW_EN_MASK 0x02
-#define IVHD_FLAG_RESPASSPW_EN_MASK 0x04
-#define IVHD_FLAG_ISOC_EN_MASK 0x08
-
-#define IVMD_FLAG_EXCL_RANGE 0x08
-#define IVMD_FLAG_UNITY_MAP 0x01
-
-#define ACPI_DEVFLAG_INITPASS 0x01
-#define ACPI_DEVFLAG_EXTINT 0x02
-#define ACPI_DEVFLAG_NMI 0x04
-#define ACPI_DEVFLAG_SYSMGT1 0x10
-#define ACPI_DEVFLAG_SYSMGT2 0x20
-#define ACPI_DEVFLAG_LINT0 0x40
-#define ACPI_DEVFLAG_LINT1 0x80
-#define ACPI_DEVFLAG_ATSDIS 0x10000000
-
-/*
- * ACPI table definitions
- *
- * These data structures are laid over the table to parse the important values
- * out of it.
- */
-
-/*
- * structure describing one IOMMU in the ACPI table. Typically followed by one
- * or more ivhd_entrys.
- */
-struct ivhd_header {
- u8 type;
- u8 flags;
- u16 length;
- u16 devid;
- u16 cap_ptr;
- u64 mmio_phys;
- u16 pci_seg;
- u16 info;
- u32 reserved;
-} __attribute__((packed));
-
-/*
- * A device entry describing which devices a specific IOMMU translates and
- * which requestor ids they use.
- */
-struct ivhd_entry {
- u8 type;
- u16 devid;
- u8 flags;
- u32 ext;
-} __attribute__((packed));
-
-/*
- * An AMD IOMMU memory definition structure. It defines things like exclusion
- * ranges for devices and regions that should be unity mapped.
- */
-struct ivmd_header {
- u8 type;
- u8 flags;
- u16 length;
- u16 devid;
- u16 aux;
- u64 resv;
- u64 range_start;
- u64 range_length;
-} __attribute__((packed));
-
-bool amd_iommu_dump;
-
-static int __initdata amd_iommu_detected;
-
-u16 amd_iommu_last_bdf; /* largest PCI device id we have
- to handle */
-LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
- we find in ACPI */
-#ifdef CONFIG_IOMMU_STRESS
-bool amd_iommu_isolate = false;
-#else
-bool amd_iommu_isolate = true; /* if true, device isolation is
- enabled */
-#endif
-
-bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
-
-LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
- system */
-
-/*
- * Pointer to the device table which is shared by all AMD IOMMUs.
- * It is indexed by the PCI device id or the HT unit id and contains
- * information about the domain the device belongs to as well as the
- * page table root pointer.
- */
-struct dev_table_entry *amd_iommu_dev_table;
-
-/*
- * The alias table is a driver specific data structure which contains the
- * mappings of the PCI device ids to the actual requestor ids on the IOMMU.
- * More than one device can share the same requestor id.
- */
-u16 *amd_iommu_alias_table;
-
-/*
- * The rlookup table is used to find the IOMMU which is responsible
- * for a specific device. It is also indexed by the PCI device id.
- */
-struct amd_iommu **amd_iommu_rlookup_table;
-
-/*
- * The pd table (protection domain table) is used to find the protection domain
- * data structure a device belongs to. Indexed with the PCI device id too.
- */
-struct protection_domain **amd_iommu_pd_table;
-
-/*
- * AMD IOMMU allows up to 2^16 different protection domains. This is a bitmap
- * to know which ones are already in use.
- */
-unsigned long *amd_iommu_pd_alloc_bitmap;
-
-static u32 dev_table_size; /* size of the device table */
-static u32 alias_table_size; /* size of the alias table */
-static u32 rlookup_table_size; /* size of the rlookup table */
-
-static inline void update_last_devid(u16 devid)
-{
- if (devid > amd_iommu_last_bdf)
- amd_iommu_last_bdf = devid;
-}
-
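-/* Size of a table indexed by device id, rounded up to a power-of-two number of pages */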
-static inline unsigned long tbl_size(int entry_size)
-{
- unsigned shift = PAGE_SHIFT +
- get_order(((int)amd_iommu_last_bdf + 1) * entry_size);
-
- return 1UL << shift;
-}
-
-/****************************************************************************
- *
- * AMD IOMMU MMIO register space handling functions
- *
- * These functions are used to program the IOMMU device registers in
- * MMIO space required by this driver.
- *
- ****************************************************************************/
-
-/*
- * This function sets the exclusion range in the IOMMU. DMA accesses to the
- * exclusion range are passed through untranslated
- */
-static void iommu_set_exclusion_range(struct amd_iommu *iommu)
-{
- u64 start = iommu->exclusion_start & PAGE_MASK;
- u64 limit = (start + iommu->exclusion_length) & PAGE_MASK;
- u64 entry;
-
- if (!iommu->exclusion_start)
- return;
-
- entry = start | MMIO_EXCL_ENABLE_MASK;
- memcpy_toio(iommu->mmio_base + MMIO_EXCL_BASE_OFFSET,
- &entry, sizeof(entry));
-
- entry = limit;
- memcpy_toio(iommu->mmio_base + MMIO_EXCL_LIMIT_OFFSET,
- &entry, sizeof(entry));
-}
-
-/* Programs the physical address of the device table into the IOMMU hardware */
-static void __init iommu_set_device_table(struct amd_iommu *iommu)
-{
- u64 entry;
-
- BUG_ON(iommu->mmio_base == NULL);
-
- entry = virt_to_phys(amd_iommu_dev_table);
- entry |= (dev_table_size >> 12) - 1;
- memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET,
- &entry, sizeof(entry));
-}
-
-/* Generic functions to enable/disable certain features of the IOMMU. */
-static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
-{
- u32 ctrl;
-
- ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
- ctrl |= (1 << bit);
- writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
-}
-
-static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
-{
- u32 ctrl;
-
- ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
- ctrl &= ~(1 << bit);
- writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
-}
-
-/* Function to enable the hardware */
-static void iommu_enable(struct amd_iommu *iommu)
-{
- printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n",
- dev_name(&iommu->dev->dev), iommu->cap_ptr);
-
- iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
-}
-
-static void iommu_disable(struct amd_iommu *iommu)
-{
- /* Disable command buffer */
- iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
-
- /* Disable event logging and event interrupts */
- iommu_feature_disable(iommu, CONTROL_EVT_INT_EN);
- iommu_feature_disable(iommu, CONTROL_EVT_LOG_EN);
-
- /* Disable IOMMU hardware itself */
- iommu_feature_disable(iommu, CONTROL_IOMMU_EN);
-}
-
-/*
- * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
- * the system has one.
- */
-static u8 * __init iommu_map_mmio_space(u64 address)
-{
- u8 *ret;
-
- if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu"))
- return NULL;
-
- ret = ioremap_nocache(address, MMIO_REGION_LENGTH);
- if (ret != NULL)
- return ret;
-
- release_mem_region(address, MMIO_REGION_LENGTH);
-
- return NULL;
-}
-
-static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
-{
- if (iommu->mmio_base)
- iounmap(iommu->mmio_base);
- release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH);
-}
-
-/****************************************************************************
- *
- * The functions below belong to the first pass of AMD IOMMU ACPI table
- * parsing. In this pass we try to find out the highest device id this
- * code has to handle. Based on this information the size of the shared data
- * structures is determined later.
- *
- ****************************************************************************/
-
-/*
- * This function calculates the length of a given IVHD entry
- */
-static inline int ivhd_entry_length(u8 *ivhd)
-{
- return 0x04 << (*ivhd >> 6);
-}
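An illustration of the length encoding above (the specific IVHD type values are assumptions from the IVRS specification, not taken from this hunk):

	/*
	 * The top two bits of the entry type select the size: 0x04 << 0,
	 * << 1, << 2 or << 3 gives 4, 8, 16 or 32 bytes.  For example,
	 * IVHD_DEV_SELECT (type 0x02) gives 0x02 >> 6 == 0, a 4-byte
	 * entry, while IVHD_DEV_ALIAS (type 0x42) gives 0x42 >> 6 == 1,
	 * an 8-byte entry.
	 */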
-
-/*
- * This function reads the last device id the IOMMU has to handle from the PCI
- * capability header for this IOMMU
- */
-static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr)
-{
- u32 cap;
-
- cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
- update_last_devid(calc_devid(MMIO_GET_BUS(cap), MMIO_GET_LD(cap)));
-
- return 0;
-}
-
-/*
- * After reading the highest device id from the IOMMU PCI capability header
- * this function checks whether a higher device id is defined in the ACPI table
- */
-static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
-{
- u8 *p = (void *)h, *end = (void *)h;
- struct ivhd_entry *dev;
-
- p += sizeof(*h);
- end += h->length;
-
- find_last_devid_on_pci(PCI_BUS(h->devid),
- PCI_SLOT(h->devid),
- PCI_FUNC(h->devid),
- h->cap_ptr);
-
- while (p < end) {
- dev = (struct ivhd_entry *)p;
- switch (dev->type) {
- case IVHD_DEV_SELECT:
- case IVHD_DEV_RANGE_END:
- case IVHD_DEV_ALIAS:
- case IVHD_DEV_EXT_SELECT:
- /* all the above subfield types refer to device ids */
- update_last_devid(dev->devid);
- break;
- default:
- break;
- }
- p += ivhd_entry_length(p);
- }
-
- WARN_ON(p != end);
-
- return 0;
-}
-
-/*
- * Iterate over all IVHD entries in the ACPI table and find the highest device
- * id which we need to handle. This is the first of three functions which parse
- * the ACPI table. So we check the checksum here.
- */
-static int __init find_last_devid_acpi(struct acpi_table_header *table)
-{
- int i;
- u8 checksum = 0, *p = (u8 *)table, *end = (u8 *)table;
- struct ivhd_header *h;
-
- /*
- * Validate checksum here so we don't need to do it when
- * we actually parse the table
- */
- for (i = 0; i < table->length; ++i)
- checksum += p[i];
- if (checksum != 0)
- /* ACPI table corrupt */
- return -ENODEV;
-
- p += IVRS_HEADER_LENGTH;
-
- end += table->length;
- while (p < end) {
- h = (struct ivhd_header *)p;
- switch (h->type) {
- case ACPI_IVHD_TYPE:
- find_last_devid_from_ivhd(h);
- break;
- default:
- break;
- }
- p += h->length;
- }
- WARN_ON(p != end);
-
- return 0;
-}
-
-/****************************************************************************
- *
- * The following functions belong to the code path which parses the ACPI table
- * the second time. In this ACPI parsing iteration we allocate IOMMU specific
- * data structures, initialize the device/alias/rlookup table and also
- * basically initialize the hardware.
- *
- ****************************************************************************/
-
-/*
- * Allocates the command buffer. This buffer is per AMD IOMMU. We can
- * write commands to that buffer later and the IOMMU will execute them
- * asynchronously
- */
-static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
-{
- u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
- get_order(CMD_BUFFER_SIZE));
-
- if (cmd_buf == NULL)
- return NULL;
-
- iommu->cmd_buf_size = CMD_BUFFER_SIZE;
-
- return cmd_buf;
-}
-
-/*
- * This function writes the command buffer address to the hardware and
- * enables it.
- */
-static void iommu_enable_command_buffer(struct amd_iommu *iommu)
-{
- u64 entry;
-
- BUG_ON(iommu->cmd_buf == NULL);
-
- entry = (u64)virt_to_phys(iommu->cmd_buf);
- entry |= MMIO_CMD_SIZE_512;
-
- memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
- &entry, sizeof(entry));
-
- /* set head and tail to zero manually */
- writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
- writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
-
- iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
-}
-
-static void __init free_command_buffer(struct amd_iommu *iommu)
-{
- free_pages((unsigned long)iommu->cmd_buf,
- get_order(iommu->cmd_buf_size));
-}
-
-/* allocates the memory where the IOMMU will log its events to */
-static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
-{
- iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
- get_order(EVT_BUFFER_SIZE));
-
- if (iommu->evt_buf == NULL)
- return NULL;
-
- iommu->evt_buf_size = EVT_BUFFER_SIZE;
-
- return iommu->evt_buf;
-}
-
-static void iommu_enable_event_buffer(struct amd_iommu *iommu)
-{
- u64 entry;
-
- BUG_ON(iommu->evt_buf == NULL);
-
- entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
-
- memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
- &entry, sizeof(entry));
-
- /* set head and tail to zero manually */
- writel(0x00, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
- writel(0x00, iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
-
- iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
-}
-
-static void __init free_event_buffer(struct amd_iommu *iommu)
-{
- free_pages((unsigned long)iommu->evt_buf, get_order(EVT_BUFFER_SIZE));
-}
-
-/* sets a specific bit in the device table entry. */
-static void set_dev_entry_bit(u16 devid, u8 bit)
-{
- int i = (bit >> 5) & 0x07;
- int _bit = bit & 0x1f;
-
- amd_iommu_dev_table[devid].data[i] |= (1 << _bit);
-}
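A short worked example of the bit math above (illustrative only; the device table entry is treated as eight 32-bit words here):

	/*
	 * For bit 98: i = (98 >> 5) & 0x07 == 3 and _bit = 98 & 0x1f == 2,
	 * so bit 2 of data[3], the fourth 32-bit word of the 256-bit
	 * device table entry, gets set.
	 */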
-
-/* Writes the specific IOMMU for a device into the rlookup table */
-static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
-{
- amd_iommu_rlookup_table[devid] = iommu;
-}
-
-/*
- * This function takes the device specific flags read from the ACPI
- * table and sets up the device table entry with that information
- */
-static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu,
- u16 devid, u32 flags, u32 ext_flags)
-{
- if (flags & ACPI_DEVFLAG_INITPASS)
- set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS);
- if (flags & ACPI_DEVFLAG_EXTINT)
- set_dev_entry_bit(devid, DEV_ENTRY_EINT_PASS);
- if (flags & ACPI_DEVFLAG_NMI)
- set_dev_entry_bit(devid, DEV_ENTRY_NMI_PASS);
- if (flags & ACPI_DEVFLAG_SYSMGT1)
- set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1);
- if (flags & ACPI_DEVFLAG_SYSMGT2)
- set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2);
- if (flags & ACPI_DEVFLAG_LINT0)
- set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS);
- if (flags & ACPI_DEVFLAG_LINT1)
- set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
-
- set_iommu_for_device(iommu, devid);
-}
-
-/*
- * Reads the device exclusion range from ACPI and initializes the IOMMU
- * with it
- */
-static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
-{
- struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
-
- if (!(m->flags & IVMD_FLAG_EXCL_RANGE))
- return;
-
- if (iommu) {
- /*
-		 * We can only configure exclusion ranges per IOMMU, not
-		 * per device. But we can enable the exclusion range per
-		 * device, which is done here.
- */
- set_dev_entry_bit(m->devid, DEV_ENTRY_EX);
- iommu->exclusion_start = m->range_start;
- iommu->exclusion_length = m->range_length;
- }
-}
-
-/*
- * This function reads some important data from the IOMMU PCI space and
- * initializes the driver data structure with it. It reads the hardware
- * capabilities and the first/last device entries
- */
-static void __init init_iommu_from_pci(struct amd_iommu *iommu)
-{
- int cap_ptr = iommu->cap_ptr;
- u32 range, misc;
-
- pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
- &iommu->cap);
- pci_read_config_dword(iommu->dev, cap_ptr + MMIO_RANGE_OFFSET,
- &range);
- pci_read_config_dword(iommu->dev, cap_ptr + MMIO_MISC_OFFSET,
- &misc);
-
- iommu->first_device = calc_devid(MMIO_GET_BUS(range),
- MMIO_GET_FD(range));
- iommu->last_device = calc_devid(MMIO_GET_BUS(range),
- MMIO_GET_LD(range));
- iommu->evt_msi_num = MMIO_MSI_NUM(misc);
-}
-
-/*
- * Takes a pointer to an AMD IOMMU entry in the ACPI table and
- * initializes the hardware and our data structures with it.
- */
-static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
- struct ivhd_header *h)
-{
- u8 *p = (u8 *)h;
- u8 *end = p, flags = 0;
- u16 dev_i, devid = 0, devid_start = 0, devid_to = 0;
- u32 ext_flags = 0;
- bool alias = false;
- struct ivhd_entry *e;
-
- /*
- * First set the recommended feature enable bits from ACPI
- * into the IOMMU control registers
- */
- h->flags & IVHD_FLAG_HT_TUN_EN_MASK ?
- iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) :
- iommu_feature_disable(iommu, CONTROL_HT_TUN_EN);
-
- h->flags & IVHD_FLAG_PASSPW_EN_MASK ?
- iommu_feature_enable(iommu, CONTROL_PASSPW_EN) :
- iommu_feature_disable(iommu, CONTROL_PASSPW_EN);
-
- h->flags & IVHD_FLAG_RESPASSPW_EN_MASK ?
- iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) :
- iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN);
-
- h->flags & IVHD_FLAG_ISOC_EN_MASK ?
- iommu_feature_enable(iommu, CONTROL_ISOC_EN) :
- iommu_feature_disable(iommu, CONTROL_ISOC_EN);
-
- /*
- * make IOMMU memory accesses cache coherent
- */
- iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
-
- /*
- * Done. Now parse the device entries
- */
- p += sizeof(struct ivhd_header);
- end += h->length;
-
-
- while (p < end) {
- e = (struct ivhd_entry *)p;
- switch (e->type) {
- case IVHD_DEV_ALL:
-
- DUMP_printk(" DEV_ALL\t\t\t first devid: %02x:%02x.%x"
- " last device %02x:%02x.%x flags: %02x\n",
- PCI_BUS(iommu->first_device),
- PCI_SLOT(iommu->first_device),
- PCI_FUNC(iommu->first_device),
- PCI_BUS(iommu->last_device),
- PCI_SLOT(iommu->last_device),
- PCI_FUNC(iommu->last_device),
- e->flags);
-
- for (dev_i = iommu->first_device;
- dev_i <= iommu->last_device; ++dev_i)
- set_dev_entry_from_acpi(iommu, dev_i,
- e->flags, 0);
- break;
- case IVHD_DEV_SELECT:
-
- DUMP_printk(" DEV_SELECT\t\t\t devid: %02x:%02x.%x "
- "flags: %02x\n",
- PCI_BUS(e->devid),
- PCI_SLOT(e->devid),
- PCI_FUNC(e->devid),
- e->flags);
-
- devid = e->devid;
- set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
- break;
- case IVHD_DEV_SELECT_RANGE_START:
-
- DUMP_printk(" DEV_SELECT_RANGE_START\t "
- "devid: %02x:%02x.%x flags: %02x\n",
- PCI_BUS(e->devid),
- PCI_SLOT(e->devid),
- PCI_FUNC(e->devid),
- e->flags);
-
- devid_start = e->devid;
- flags = e->flags;
- ext_flags = 0;
- alias = false;
- break;
- case IVHD_DEV_ALIAS:
-
- DUMP_printk(" DEV_ALIAS\t\t\t devid: %02x:%02x.%x "
- "flags: %02x devid_to: %02x:%02x.%x\n",
- PCI_BUS(e->devid),
- PCI_SLOT(e->devid),
- PCI_FUNC(e->devid),
- e->flags,
- PCI_BUS(e->ext >> 8),
- PCI_SLOT(e->ext >> 8),
- PCI_FUNC(e->ext >> 8));
-
- devid = e->devid;
- devid_to = e->ext >> 8;
- set_dev_entry_from_acpi(iommu, devid , e->flags, 0);
- set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0);
- amd_iommu_alias_table[devid] = devid_to;
- break;
- case IVHD_DEV_ALIAS_RANGE:
-
- DUMP_printk(" DEV_ALIAS_RANGE\t\t "
- "devid: %02x:%02x.%x flags: %02x "
- "devid_to: %02x:%02x.%x\n",
- PCI_BUS(e->devid),
- PCI_SLOT(e->devid),
- PCI_FUNC(e->devid),
- e->flags,
- PCI_BUS(e->ext >> 8),
- PCI_SLOT(e->ext >> 8),
- PCI_FUNC(e->ext >> 8));
-
- devid_start = e->devid;
- flags = e->flags;
- devid_to = e->ext >> 8;
- ext_flags = 0;
- alias = true;
- break;
- case IVHD_DEV_EXT_SELECT:
-
- DUMP_printk(" DEV_EXT_SELECT\t\t devid: %02x:%02x.%x "
- "flags: %02x ext: %08x\n",
- PCI_BUS(e->devid),
- PCI_SLOT(e->devid),
- PCI_FUNC(e->devid),
- e->flags, e->ext);
-
- devid = e->devid;
- set_dev_entry_from_acpi(iommu, devid, e->flags,
- e->ext);
- break;
- case IVHD_DEV_EXT_SELECT_RANGE:
-
- DUMP_printk(" DEV_EXT_SELECT_RANGE\t devid: "
- "%02x:%02x.%x flags: %02x ext: %08x\n",
- PCI_BUS(e->devid),
- PCI_SLOT(e->devid),
- PCI_FUNC(e->devid),
- e->flags, e->ext);
-
- devid_start = e->devid;
- flags = e->flags;
- ext_flags = e->ext;
- alias = false;
- break;
- case IVHD_DEV_RANGE_END:
-
- DUMP_printk(" DEV_RANGE_END\t\t devid: %02x:%02x.%x\n",
- PCI_BUS(e->devid),
- PCI_SLOT(e->devid),
- PCI_FUNC(e->devid));
-
- devid = e->devid;
- for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
- if (alias) {
- amd_iommu_alias_table[dev_i] = devid_to;
- set_dev_entry_from_acpi(iommu,
- devid_to, flags, ext_flags);
- }
- set_dev_entry_from_acpi(iommu, dev_i,
- flags, ext_flags);
- }
- break;
- default:
- break;
- }
-
- p += ivhd_entry_length(p);
- }
-}
-
-/* Initializes the device->iommu mapping for the driver */
-static int __init init_iommu_devices(struct amd_iommu *iommu)
-{
- u16 i;
-
- for (i = iommu->first_device; i <= iommu->last_device; ++i)
- set_iommu_for_device(iommu, i);
-
- return 0;
-}
-
-static void __init free_iommu_one(struct amd_iommu *iommu)
-{
- free_command_buffer(iommu);
- free_event_buffer(iommu);
- iommu_unmap_mmio_space(iommu);
-}
-
-static void __init free_iommu_all(void)
-{
- struct amd_iommu *iommu, *next;
-
- for_each_iommu_safe(iommu, next) {
- list_del(&iommu->list);
- free_iommu_one(iommu);
- kfree(iommu);
- }
-}
-
-/*
- * This function glues together the initialization of one IOMMU. It also
- * allocates the command buffer and programs the hardware. It does NOT
- * enable the IOMMU. That is done afterwards.
- */
-static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
-{
- spin_lock_init(&iommu->lock);
- list_add_tail(&iommu->list, &amd_iommu_list);
-
- /*
- * Copy data from ACPI table entry to the iommu struct
- */
- iommu->dev = pci_get_bus_and_slot(PCI_BUS(h->devid), h->devid & 0xff);
- if (!iommu->dev)
- return 1;
-
- iommu->cap_ptr = h->cap_ptr;
- iommu->pci_seg = h->pci_seg;
- iommu->mmio_phys = h->mmio_phys;
- iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys);
- if (!iommu->mmio_base)
- return -ENOMEM;
-
- iommu->cmd_buf = alloc_command_buffer(iommu);
- if (!iommu->cmd_buf)
- return -ENOMEM;
-
- iommu->evt_buf = alloc_event_buffer(iommu);
- if (!iommu->evt_buf)
- return -ENOMEM;
-
- iommu->int_enabled = false;
-
- init_iommu_from_pci(iommu);
- init_iommu_from_acpi(iommu, h);
- init_iommu_devices(iommu);
-
- return pci_enable_device(iommu->dev);
-}
-
-/*
- * Iterates over all IOMMU entries in the ACPI table, allocates the
- * IOMMU structure and initializes it with init_iommu_one()
- */
-static int __init init_iommu_all(struct acpi_table_header *table)
-{
- u8 *p = (u8 *)table, *end = (u8 *)table;
- struct ivhd_header *h;
- struct amd_iommu *iommu;
- int ret;
-
- end += table->length;
- p += IVRS_HEADER_LENGTH;
-
- while (p < end) {
- h = (struct ivhd_header *)p;
- switch (*p) {
- case ACPI_IVHD_TYPE:
-
- DUMP_printk("IOMMU: device: %02x:%02x.%01x cap: %04x "
- "seg: %d flags: %01x info %04x\n",
- PCI_BUS(h->devid), PCI_SLOT(h->devid),
- PCI_FUNC(h->devid), h->cap_ptr,
- h->pci_seg, h->flags, h->info);
- DUMP_printk(" mmio-addr: %016llx\n",
- h->mmio_phys);
-
- iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
- if (iommu == NULL)
- return -ENOMEM;
- ret = init_iommu_one(iommu, h);
- if (ret)
- return ret;
- break;
- default:
- break;
- }
- p += h->length;
-
- }
- WARN_ON(p != end);
-
- return 0;
-}
-
-/****************************************************************************
- *
- * The following functions initialize the MSI interrupts for all IOMMUs
- * in the system. It's a bit challenging because there could be multiple
- * IOMMUs per PCI BDF but we can call pci_enable_msi(x) only once per
- * pci_dev.
- *
- ****************************************************************************/
-
-static int __init iommu_setup_msi(struct amd_iommu *iommu)
-{
- int r;
-
- if (pci_enable_msi(iommu->dev))
- return 1;
-
- r = request_irq(iommu->dev->irq, amd_iommu_int_handler,
- IRQF_SAMPLE_RANDOM,
- "AMD IOMMU",
- NULL);
-
- if (r) {
- pci_disable_msi(iommu->dev);
- return 1;
- }
-
- iommu->int_enabled = true;
- iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
-
- return 0;
-}
-
-static int iommu_init_msi(struct amd_iommu *iommu)
-{
- if (iommu->int_enabled)
- return 0;
-
- if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
- return iommu_setup_msi(iommu);
-
- return 1;
-}
-
-/****************************************************************************
- *
- * The next functions belong to the third pass of parsing the ACPI
- * table. In this last pass the memory mapping requirements are
- * gathered (like exclusion and unity mapping ranges).
- *
- ****************************************************************************/
-
-static void __init free_unity_maps(void)
-{
- struct unity_map_entry *entry, *next;
-
- list_for_each_entry_safe(entry, next, &amd_iommu_unity_map, list) {
- list_del(&entry->list);
- kfree(entry);
- }
-}
-
-/* called when we find an exclusion range definition in ACPI */
-static int __init init_exclusion_range(struct ivmd_header *m)
-{
- int i;
-
- switch (m->type) {
- case ACPI_IVMD_TYPE:
- set_device_exclusion_range(m->devid, m);
- break;
- case ACPI_IVMD_TYPE_ALL:
- for (i = 0; i <= amd_iommu_last_bdf; ++i)
- set_device_exclusion_range(i, m);
- break;
- case ACPI_IVMD_TYPE_RANGE:
- for (i = m->devid; i <= m->aux; ++i)
- set_device_exclusion_range(i, m);
- break;
- default:
- break;
- }
-
- return 0;
-}
-
-/* called for unity map ACPI definition */
-static int __init init_unity_map_range(struct ivmd_header *m)
-{
- struct unity_map_entry *e = 0;
- char *s;
-
- e = kzalloc(sizeof(*e), GFP_KERNEL);
- if (e == NULL)
- return -ENOMEM;
-
- switch (m->type) {
- default:
- kfree(e);
- return 0;
- case ACPI_IVMD_TYPE:
-		s = "IVMD_TYPE\t\t\t";
- e->devid_start = e->devid_end = m->devid;
- break;
- case ACPI_IVMD_TYPE_ALL:
- s = "IVMD_TYPE_ALL\t\t";
- e->devid_start = 0;
- e->devid_end = amd_iommu_last_bdf;
- break;
- case ACPI_IVMD_TYPE_RANGE:
- s = "IVMD_TYPE_RANGE\t\t";
- e->devid_start = m->devid;
- e->devid_end = m->aux;
- break;
- }
- e->address_start = PAGE_ALIGN(m->range_start);
- e->address_end = e->address_start + PAGE_ALIGN(m->range_length);
- e->prot = m->flags >> 1;
-
- DUMP_printk("%s devid_start: %02x:%02x.%x devid_end: %02x:%02x.%x"
- " range_start: %016llx range_end: %016llx flags: %x\n", s,
- PCI_BUS(e->devid_start), PCI_SLOT(e->devid_start),
- PCI_FUNC(e->devid_start), PCI_BUS(e->devid_end),
- PCI_SLOT(e->devid_end), PCI_FUNC(e->devid_end),
- e->address_start, e->address_end, m->flags);
-
- list_add_tail(&e->list, &amd_iommu_unity_map);
-
- return 0;
-}
-
-/* iterates over all memory definitions we find in the ACPI table */
-static int __init init_memory_definitions(struct acpi_table_header *table)
-{
- u8 *p = (u8 *)table, *end = (u8 *)table;
- struct ivmd_header *m;
-
- end += table->length;
- p += IVRS_HEADER_LENGTH;
-
- while (p < end) {
- m = (struct ivmd_header *)p;
- if (m->flags & IVMD_FLAG_EXCL_RANGE)
- init_exclusion_range(m);
- else if (m->flags & IVMD_FLAG_UNITY_MAP)
- init_unity_map_range(m);
-
- p += m->length;
- }
-
- return 0;
-}
-
-/*
- * Init the device table to not allow DMA access for devices and
- * suppress all page faults
- */
-static void init_device_table(void)
-{
- u16 devid;
-
- for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) {
- set_dev_entry_bit(devid, DEV_ENTRY_VALID);
- set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION);
- }
-}
-
-/*
- * This function finally enables all IOMMUs found in the system after
- * they have been initialized
- */
-static void enable_iommus(void)
-{
- struct amd_iommu *iommu;
-
- for_each_iommu(iommu) {
- iommu_disable(iommu);
- iommu_set_device_table(iommu);
- iommu_enable_command_buffer(iommu);
- iommu_enable_event_buffer(iommu);
- iommu_set_exclusion_range(iommu);
- iommu_init_msi(iommu);
- iommu_enable(iommu);
- }
-}
-
-static void disable_iommus(void)
-{
- struct amd_iommu *iommu;
-
- for_each_iommu(iommu)
- iommu_disable(iommu);
-}
-
-/*
- * Suspend/Resume support
- * disable suspend until a real resume is implemented
- */
-
-static int amd_iommu_resume(struct sys_device *dev)
-{
- /* re-load the hardware */
- enable_iommus();
-
- /*
- * we have to flush after the IOMMUs are enabled because a
- * disabled IOMMU will never execute the commands we send
- */
- amd_iommu_flush_all_devices();
- amd_iommu_flush_all_domains();
-
- return 0;
-}
-
-static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state)
-{
- /* disable IOMMUs to go out of the way for BIOS */
- disable_iommus();
-
- return 0;
-}
-
-static struct sysdev_class amd_iommu_sysdev_class = {
- .name = "amd_iommu",
- .suspend = amd_iommu_suspend,
- .resume = amd_iommu_resume,
-};
-
-static struct sys_device device_amd_iommu = {
- .id = 0,
- .cls = &amd_iommu_sysdev_class,
-};
-
-/*
- * This is the core init function for AMD IOMMU hardware in the system.
- * This function is called from the generic x86 DMA layer initialization
- * code.
- *
- * This function basically parses the ACPI table for AMD IOMMU (IVRS)
- * three times:
- *
- * 1 pass) Find the highest PCI device id the driver has to handle.
- *         Based on this information, the size of the data structures
- *         that need to be allocated is determined.
- *
- * 2 pass) Initialize the data structures just allocated with the
- * information in the ACPI table about available AMD IOMMUs
- * in the system. It also maps the PCI devices in the
- * system to specific IOMMUs
- *
- * 3 pass) After the basic data structures are allocated and
- * initialized we update them with information about memory
- * remapping requirements parsed out of the ACPI table in
- * this last pass.
- *
- * After that the hardware is initialized and ready to go. In the last
- * step we do some Linux specific things like registering the driver in
- * the dma_ops interface and initializing the suspend/resume support
- * functions. Finally it prints some information about AMD IOMMUs and
- * the driver state and enables the hardware.
- */
-int __init amd_iommu_init(void)
-{
- int i, ret = 0;
-
-
- if (no_iommu) {
- printk(KERN_INFO "AMD IOMMU disabled by kernel command line\n");
- return 0;
- }
-
- if (!amd_iommu_detected)
- return -ENODEV;
-
- /*
- * First parse ACPI tables to find the largest Bus/Dev/Func
-	 * we need to handle. Based on this information the shared data
-	 * structures for the IOMMUs in the system will be allocated.
- */
- if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0)
- return -ENODEV;
-
- dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE);
- alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE);
- rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE);
-
- ret = -ENOMEM;
-
- /* Device table - directly used by all IOMMUs */
- amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
- get_order(dev_table_size));
- if (amd_iommu_dev_table == NULL)
- goto out;
-
- /*
- * Alias table - map PCI Bus/Dev/Func to Bus/Dev/Func the
-	 * IOMMU sees for that device
- */
- amd_iommu_alias_table = (void *)__get_free_pages(GFP_KERNEL,
- get_order(alias_table_size));
- if (amd_iommu_alias_table == NULL)
- goto free;
-
- /* IOMMU rlookup table - find the IOMMU for a specific device */
- amd_iommu_rlookup_table = (void *)__get_free_pages(
- GFP_KERNEL | __GFP_ZERO,
- get_order(rlookup_table_size));
- if (amd_iommu_rlookup_table == NULL)
- goto free;
-
- /*
- * Protection Domain table - maps devices to protection domains
- * This table has the same size as the rlookup_table
- */
- amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
- get_order(rlookup_table_size));
- if (amd_iommu_pd_table == NULL)
- goto free;
-
- amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(
- GFP_KERNEL | __GFP_ZERO,
- get_order(MAX_DOMAIN_ID/8));
- if (amd_iommu_pd_alloc_bitmap == NULL)
- goto free;
-
- /* init the device table */
- init_device_table();
-
- /*
-	 * let all alias entries point to themselves
- */
- for (i = 0; i <= amd_iommu_last_bdf; ++i)
- amd_iommu_alias_table[i] = i;
-
- /*
-	 * never allocate domain 0 because it's used as the non-allocated and
-	 * error value placeholder
- */
- amd_iommu_pd_alloc_bitmap[0] = 1;
-
- /*
-	 * now that the data structures are allocated and basically
-	 * initialized, start the real ACPI table scan
- */
- ret = -ENODEV;
- if (acpi_table_parse("IVRS", init_iommu_all) != 0)
- goto free;
-
- if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
- goto free;
-
- ret = sysdev_class_register(&amd_iommu_sysdev_class);
- if (ret)
- goto free;
-
- ret = sysdev_register(&device_amd_iommu);
- if (ret)
- goto free;
-
- ret = amd_iommu_init_dma_ops();
- if (ret)
- goto free;
-
- enable_iommus();
-
- printk(KERN_INFO "AMD IOMMU: device isolation ");
- if (amd_iommu_isolate)
- printk("enabled\n");
- else
- printk("disabled\n");
-
- if (amd_iommu_unmap_flush)
- printk(KERN_INFO "AMD IOMMU: IO/TLB flush on unmap enabled\n");
- else
- printk(KERN_INFO "AMD IOMMU: Lazy IO/TLB flushing enabled\n");
-
-out:
- return ret;
-
-free:
- free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
- get_order(MAX_DOMAIN_ID/8));
-
- free_pages((unsigned long)amd_iommu_pd_table,
- get_order(rlookup_table_size));
-
- free_pages((unsigned long)amd_iommu_rlookup_table,
- get_order(rlookup_table_size));
-
- free_pages((unsigned long)amd_iommu_alias_table,
- get_order(alias_table_size));
-
- free_pages((unsigned long)amd_iommu_dev_table,
- get_order(dev_table_size));
-
- free_iommu_all();
-
- free_unity_maps();
-
- goto out;
-}
-
-void amd_iommu_shutdown(void)
-{
- disable_iommus();
-}
-
-/****************************************************************************
- *
- * Early detect code. This code runs at IOMMU detection time in the DMA
- * layer. It just checks whether an IVRS ACPI table is present to detect
- * AMD IOMMUs.
- *
- ****************************************************************************/
-static int __init early_amd_iommu_detect(struct acpi_table_header *table)
-{
- return 0;
-}
-
-void __init amd_iommu_detect(void)
-{
- if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture))
- return;
-
- if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
- iommu_detected = 1;
- amd_iommu_detected = 1;
-#ifdef CONFIG_GART_IOMMU
- gart_iommu_aperture_disabled = 1;
- gart_iommu_aperture = 0;
-#endif
- }
-}
-
-/****************************************************************************
- *
- * Parsing functions for the AMD IOMMU specific kernel command line
- * options.
- *
- ****************************************************************************/
-
-static int __init parse_amd_iommu_dump(char *str)
-{
- amd_iommu_dump = true;
-
- return 1;
-}
-
-static int __init parse_amd_iommu_options(char *str)
-{
- for (; *str; ++str) {
- if (strncmp(str, "isolate", 7) == 0)
- amd_iommu_isolate = true;
- if (strncmp(str, "share", 5) == 0)
- amd_iommu_isolate = false;
- if (strncmp(str, "fullflush", 9) == 0)
- amd_iommu_unmap_flush = true;
- }
-
- return 1;
-}
-
-__setup("amd_iommu_dump", parse_amd_iommu_dump);
-__setup("amd_iommu=", parse_amd_iommu_options);
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
new file mode 100644
index 000000000000..c1acead6227a
--- /dev/null
+++ b/arch/x86/kernel/amd_nb.c
@@ -0,0 +1,331 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Shared support code for AMD K8 northbridges and derivatives.
+ * Copyright 2006 Andi Kleen, SUSE Labs.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/export.h>
+#include <linux/spinlock.h>
+#include <linux/pci_ids.h>
+
+#include <asm/amd/nb.h>
+#include <asm/cpuid/api.h>
+
+static u32 *flush_words;
+
+static const struct pci_device_id amd_nb_misc_ids[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) },
+ {}
+};
+
+const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = {
+ { 0x00, 0x18, 0x20 },
+ { 0xff, 0x00, 0x20 },
+ { 0xfe, 0x00, 0x20 },
+ { }
+};
+
+static struct amd_northbridge_info amd_northbridges;
+
+u16 amd_nb_num(void)
+{
+ return amd_northbridges.num;
+}
+EXPORT_SYMBOL_GPL(amd_nb_num);
+
+bool amd_nb_has_feature(unsigned int feature)
+{
+ return ((amd_northbridges.flags & feature) == feature);
+}
+EXPORT_SYMBOL_GPL(amd_nb_has_feature);
+
+struct amd_northbridge *node_to_amd_nb(int node)
+{
+ return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL;
+}
+EXPORT_SYMBOL_GPL(node_to_amd_nb);
+
+static int amd_cache_northbridges(void)
+{
+ struct amd_northbridge *nb;
+ u16 i;
+
+ if (amd_northbridges.num)
+ return 0;
+
+ amd_northbridges.num = amd_num_nodes();
+
+ nb = kcalloc(amd_northbridges.num, sizeof(struct amd_northbridge), GFP_KERNEL);
+ if (!nb)
+ return -ENOMEM;
+
+ amd_northbridges.nb = nb;
+
+ for (i = 0; i < amd_northbridges.num; i++) {
+ node_to_amd_nb(i)->misc = amd_node_get_func(i, 3);
+
+ /*
+ * Each Northbridge must have a 'misc' device.
+ * If not, then uninitialize everything.
+ */
+ if (!node_to_amd_nb(i)->misc) {
+ amd_northbridges.num = 0;
+ kfree(nb);
+ return -ENODEV;
+ }
+
+ node_to_amd_nb(i)->link = amd_node_get_func(i, 4);
+ }
+
+ if (amd_gart_present())
+ amd_northbridges.flags |= AMD_NB_GART;
+
+ if (!cpuid_amd_hygon_has_l3_cache())
+ return 0;
+
+ /*
+ * Some CPU families support L3 Cache Index Disable. There are some
+ * limitations because of E382 and E388 on family 0x10.
+ */
+ if (boot_cpu_data.x86 == 0x10 &&
+ boot_cpu_data.x86_model >= 0x8 &&
+ (boot_cpu_data.x86_model > 0x9 ||
+ boot_cpu_data.x86_stepping >= 0x1))
+ amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE;
+
+ if (boot_cpu_data.x86 == 0x15)
+ amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE;
+
+ /* L3 cache partitioning is supported on family 0x15 */
+ if (boot_cpu_data.x86 == 0x15)
+ amd_northbridges.flags |= AMD_NB_L3_PARTITIONING;
+
+ return 0;
+}
+
+/*
+ * Ignores subdevice/subvendor but as far as I can figure out
+ * they're useless anyways
+ */
+bool __init early_is_amd_nb(u32 device)
+{
+ const struct pci_device_id *id;
+ u32 vendor = device & 0xffff;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
+ return false;
+
+ if (cpu_feature_enabled(X86_FEATURE_ZEN))
+ return false;
+
+ device >>= 16;
+ for (id = amd_nb_misc_ids; id->vendor; id++)
+ if (vendor == id->vendor && device == id->device)
+ return true;
+ return false;
+}
+
+struct resource *amd_get_mmconfig_range(struct resource *res)
+{
+ u64 base, msr;
+ unsigned int segn_busn_bits;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
+ return NULL;
+
+ /* Assume CPUs from Fam10h have mmconfig, although not all VMs do */
+ if (boot_cpu_data.x86 < 0x10 ||
+ rdmsrq_safe(MSR_FAM10H_MMIO_CONF_BASE, &msr))
+ return NULL;
+
+ /* mmconfig is not enabled */
+ if (!(msr & FAM10H_MMIO_CONF_ENABLE))
+ return NULL;
+
+ base = msr & (FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT);
+
+ segn_busn_bits = (msr >> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) &
+ FAM10H_MMIO_CONF_BUSRANGE_MASK;
+
+ res->flags = IORESOURCE_MEM;
+ res->start = base;
+ res->end = base + (1ULL<<(segn_busn_bits + 20)) - 1;
+ return res;
+}
+
+int amd_get_subcaches(int cpu)
+{
+ struct pci_dev *link = node_to_amd_nb(topology_amd_node_id(cpu))->link;
+ unsigned int mask;
+
+ if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+ return 0;
+
+ pci_read_config_dword(link, 0x1d4, &mask);
+
+ return (mask >> (4 * cpu_data(cpu).topo.core_id)) & 0xf;
+}
+
+int amd_set_subcaches(int cpu, unsigned long mask)
+{
+ static unsigned int reset, ban;
+ struct amd_northbridge *nb = node_to_amd_nb(topology_amd_node_id(cpu));
+ unsigned int reg;
+ int cuid;
+
+ if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING) || mask > 0xf)
+ return -EINVAL;
+
+ /* if necessary, collect reset state of L3 partitioning and BAN mode */
+ if (reset == 0) {
+ pci_read_config_dword(nb->link, 0x1d4, &reset);
+ pci_read_config_dword(nb->misc, 0x1b8, &ban);
+ ban &= 0x180000;
+ }
+
+ /* deactivate BAN mode if any subcaches are to be disabled */
+ if (mask != 0xf) {
+ pci_read_config_dword(nb->misc, 0x1b8, &reg);
+ pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000);
+ }
+
+ cuid = cpu_data(cpu).topo.core_id;
+ mask <<= 4 * cuid;
+ mask |= (0xf ^ (1 << cuid)) << 26;
+
+ pci_write_config_dword(nb->link, 0x1d4, mask);
+
+ /* reset BAN mode if L3 partitioning returned to reset state */
+ pci_read_config_dword(nb->link, 0x1d4, &reg);
+ if (reg == reset) {
+ pci_read_config_dword(nb->misc, 0x1b8, &reg);
+ reg &= ~0x180000;
+ pci_write_config_dword(nb->misc, 0x1b8, reg | ban);
+ }
+
+ return 0;
+}
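A worked example of the mask layout used by the two functions above (arithmetic only; no register semantics beyond what the code itself shows are claimed):

	/*
	 * For a CPU whose compute-unit id (topo.core_id) is 1, its four
	 * subcache-enable bits sit at bits 7:4 of register 0x1d4, so
	 * amd_get_subcaches() returns (val >> 4) & 0xf.  Calling
	 * amd_set_subcaches() with mask 0x3 for the same CU writes
	 * 0x3 << 4 == 0x30, OR-ed with (0xf ^ (1 << 1)) << 26 == 0xd << 26.
	 */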
+
+static void amd_cache_gart(void)
+{
+ u16 i;
+
+ if (!amd_nb_has_feature(AMD_NB_GART))
+ return;
+
+ flush_words = kmalloc_array(amd_northbridges.num, sizeof(u32), GFP_KERNEL);
+ if (!flush_words) {
+ amd_northbridges.flags &= ~AMD_NB_GART;
+ pr_notice("Cannot initialize GART flush words, GART support disabled\n");
+ return;
+ }
+
+ for (i = 0; i != amd_northbridges.num; i++)
+ pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c, &flush_words[i]);
+}
+
+void amd_flush_garts(void)
+{
+ int flushed, i;
+ unsigned long flags;
+ static DEFINE_SPINLOCK(gart_lock);
+
+ if (!amd_nb_has_feature(AMD_NB_GART))
+ return;
+
+ /*
+ * Avoid races between AGP and IOMMU. In theory it's not needed
+ * but I'm not sure if the hardware won't lose flush requests
+ * when another is pending. This whole thing is so expensive anyways
+ * that it doesn't matter to serialize more. -AK
+ */
+ spin_lock_irqsave(&gart_lock, flags);
+ flushed = 0;
+ for (i = 0; i < amd_northbridges.num; i++) {
+ pci_write_config_dword(node_to_amd_nb(i)->misc, 0x9c,
+ flush_words[i] | 1);
+ flushed++;
+ }
+ for (i = 0; i < amd_northbridges.num; i++) {
+ u32 w;
+		/* Make sure the hardware actually executed the flush */
+ for (;;) {
+ pci_read_config_dword(node_to_amd_nb(i)->misc,
+ 0x9c, &w);
+ if (!(w & 1))
+ break;
+ cpu_relax();
+ }
+ }
+ spin_unlock_irqrestore(&gart_lock, flags);
+ if (!flushed)
+ pr_notice("nothing to flush?\n");
+}
+EXPORT_SYMBOL_GPL(amd_flush_garts);
+
+static void __fix_erratum_688(void *info)
+{
+#define MSR_AMD64_IC_CFG 0xC0011021
+
+ msr_set_bit(MSR_AMD64_IC_CFG, 3);
+ msr_set_bit(MSR_AMD64_IC_CFG, 14);
+}
+
+/* Apply erratum 688 fix so machines without a BIOS fix work. */
+static __init void fix_erratum_688(void)
+{
+ struct pci_dev *F4;
+ u32 val;
+
+ if (boot_cpu_data.x86 != 0x14)
+ return;
+
+ if (!amd_northbridges.num)
+ return;
+
+ F4 = node_to_amd_nb(0)->link;
+ if (!F4)
+ return;
+
+ if (pci_read_config_dword(F4, 0x164, &val))
+ return;
+
+ if (val & BIT(2))
+ return;
+
+ on_each_cpu(__fix_erratum_688, NULL, 0);
+
+ pr_info("x86/cpu/AMD: CPU erratum 688 worked around\n");
+}
+
+static __init int init_amd_nbs(void)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
+ return 0;
+
+ amd_cache_northbridges();
+ amd_cache_gart();
+
+ fix_erratum_688();
+
+ return 0;
+}
+
+/* This has to go after the PCI subsystem */
+fs_initcall(init_amd_nbs);
diff --git a/arch/x86/kernel/amd_node.c b/arch/x86/kernel/amd_node.c
new file mode 100644
index 000000000000..3d0a4768d603
--- /dev/null
+++ b/arch/x86/kernel/amd_node.c
@@ -0,0 +1,316 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * AMD Node helper functions and common defines
+ *
+ * Copyright (c) 2024, Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Author: Yazen Ghannam <Yazen.Ghannam@amd.com>
+ */
+
+#include <linux/debugfs.h>
+#include <asm/amd/node.h>
+
+/*
+ * AMD Nodes are a physical collection of I/O devices within an SoC. There can be one
+ * or more nodes per package.
+ *
+ * The nodes are software-visible through PCI config space. All nodes are enumerated
+ * on segment 0 bus 0. The device (slot) numbers range from 0x18 to 0x1F (maximum 8
+ * nodes) with 0x18 corresponding to node 0, 0x19 to node 1, etc. Each node can be a
+ * multi-function device.
+ *
+ * On legacy systems, these node devices represent integrated Northbridge functionality.
+ * On Zen-based systems, these node devices represent Data Fabric functionality.
+ *
+ * See "Configuration Space Accesses" section in BKDGs or
+ * "Processor x86 Core" -> "Configuration Space" section in PPRs.
+ */
+struct pci_dev *amd_node_get_func(u16 node, u8 func)
+{
+ if (node >= MAX_AMD_NUM_NODES)
+ return NULL;
+
+ return pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(AMD_NODE0_PCI_SLOT + node, func));
+}
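A brief usage sketch (the wrapper function below is hypothetical and only illustrates the slot numbering described in the comment above): node 2, function 3 resolves to segment 0, bus 0, devfn PCI_DEVFN(0x18 + 2, 3).

static int example_get_node2_f3(struct pci_dev **out)
{
	struct pci_dev *df_f3 = amd_node_get_func(2, 3);

	if (!df_f3)
		return -ENODEV;	/* node out of range or device absent */

	/* Caller owns a reference; drop it with pci_dev_put() when done. */
	*out = df_f3;
	return 0;
}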
+
+static struct pci_dev **amd_roots;
+
+/* Protect the PCI config register pairs used for SMN. */
+static DEFINE_MUTEX(smn_mutex);
+static bool smn_exclusive;
+
+#define SMN_INDEX_OFFSET 0x60
+#define SMN_DATA_OFFSET 0x64
+
+#define HSMP_INDEX_OFFSET 0xc4
+#define HSMP_DATA_OFFSET 0xc8
+
+/*
+ * SMN accesses may fail in ways that are difficult to detect here in the called
+ * functions amd_smn_read() and amd_smn_write(). Therefore, callers must do
+ * their own checking based on what behavior they expect.
+ *
+ * For SMN reads, the returned value may be zero if the register is Read-as-Zero.
+ * Or it may be a "PCI Error Response", e.g. all 0xFFs. The "PCI Error Response"
+ * can be checked here, and a proper error code can be returned.
+ *
+ * But the Read-as-Zero response cannot be verified here. A value of 0 may be
+ * correct in some cases, so callers must check that this is correct for the
+ * register/fields they need.
+ *
+ * For SMN writes, success can be determined through a "write and read back".
+ * However, this is not robust when done here.
+ *
+ * Possible issues:
+ *
+ * 1) Bits that are "Write-1-to-Clear". In this case, the read value should
+ * *not* match the write value.
+ *
+ * 2) Bits that are "Read-as-Zero"/"Writes-Ignored". This information cannot be
+ * known here.
+ *
+ * 3) Bits that are "Reserved / Set to 1". Ditto above.
+ *
+ * Callers of amd_smn_write() should do the "write and read back" check
+ * themselves, if needed.
+ *
+ * For #1, they can see if their target bits got cleared.
+ *
+ * For #2 and #3, they can check if their target bits got set as intended.
+ *
+ * This matches what is done for RDMSR/WRMSR. As long as there's no #GP, then
+ * the operation is considered a success, and the caller does their own
+ * checking.
+ */
+static int __amd_smn_rw(u8 i_off, u8 d_off, u16 node, u32 address, u32 *value, bool write)
+{
+ struct pci_dev *root;
+ int err = -ENODEV;
+
+ if (node >= amd_num_nodes())
+ return err;
+
+ root = amd_roots[node];
+ if (!root)
+ return err;
+
+ if (!smn_exclusive)
+ return err;
+
+ guard(mutex)(&smn_mutex);
+
+ err = pci_write_config_dword(root, i_off, address);
+ if (err) {
+ pr_warn("Error programming SMN address 0x%x.\n", address);
+ return pcibios_err_to_errno(err);
+ }
+
+ err = (write ? pci_write_config_dword(root, d_off, *value)
+ : pci_read_config_dword(root, d_off, value));
+
+ return pcibios_err_to_errno(err);
+}
+
+int __must_check amd_smn_read(u16 node, u32 address, u32 *value)
+{
+ int err = __amd_smn_rw(SMN_INDEX_OFFSET, SMN_DATA_OFFSET, node, address, value, false);
+
+ if (PCI_POSSIBLE_ERROR(*value)) {
+ err = -ENODEV;
+ *value = 0;
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(amd_smn_read);
+
+int __must_check amd_smn_write(u16 node, u32 address, u32 value)
+{
+ return __amd_smn_rw(SMN_INDEX_OFFSET, SMN_DATA_OFFSET, node, address, &value, true);
+}
+EXPORT_SYMBOL_GPL(amd_smn_write);
+
+int __must_check amd_smn_hsmp_rdwr(u16 node, u32 address, u32 *value, bool write)
+{
+ return __amd_smn_rw(HSMP_INDEX_OFFSET, HSMP_DATA_OFFSET, node, address, value, write);
+}
+EXPORT_SYMBOL_GPL(amd_smn_hsmp_rdwr);
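A hedged caller-side sketch of the "write and read back" check recommended above (the function name, register address, and error policy are hypothetical, not part of this file):

static int example_smn_set_bits(u16 node, u32 reg, u32 bits)
{
	u32 val;
	int err;

	err = amd_smn_read(node, reg, &val);
	if (err)
		return err;

	err = amd_smn_write(node, reg, val | bits);
	if (err)
		return err;

	/* Write-and-read-back: confirm the target bits actually stuck. */
	err = amd_smn_read(node, reg, &val);
	if (err)
		return err;

	return (val & bits) == bits ? 0 : -EIO;
}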
+
+static struct dentry *debugfs_dir;
+static u16 debug_node;
+static u32 debug_address;
+
+static ssize_t smn_node_write(struct file *file, const char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ u16 node;
+ int ret;
+
+ ret = kstrtou16_from_user(userbuf, count, 0, &node);
+ if (ret)
+ return ret;
+
+ if (node >= amd_num_nodes())
+ return -ENODEV;
+
+ debug_node = node;
+ return count;
+}
+
+static int smn_node_show(struct seq_file *m, void *v)
+{
+ seq_printf(m, "0x%08x\n", debug_node);
+ return 0;
+}
+
+static ssize_t smn_address_write(struct file *file, const char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ int ret;
+
+ ret = kstrtouint_from_user(userbuf, count, 0, &debug_address);
+ if (ret)
+ return ret;
+
+ return count;
+}
+
+static int smn_address_show(struct seq_file *m, void *v)
+{
+ seq_printf(m, "0x%08x\n", debug_address);
+ return 0;
+}
+
+static int smn_value_show(struct seq_file *m, void *v)
+{
+ u32 val;
+ int ret;
+
+ ret = amd_smn_read(debug_node, debug_address, &val);
+ if (ret)
+ return ret;
+
+ seq_printf(m, "0x%08x\n", val);
+ return 0;
+}
+
+static ssize_t smn_value_write(struct file *file, const char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ u32 val;
+ int ret;
+
+ ret = kstrtouint_from_user(userbuf, count, 0, &val);
+ if (ret)
+ return ret;
+
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+
+ ret = amd_smn_write(debug_node, debug_address, val);
+ if (ret)
+ return ret;
+
+ return count;
+}
+
+DEFINE_SHOW_STORE_ATTRIBUTE(smn_node);
+DEFINE_SHOW_STORE_ATTRIBUTE(smn_address);
+DEFINE_SHOW_STORE_ATTRIBUTE(smn_value);
+
+static struct pci_dev *get_next_root(struct pci_dev *root)
+{
+ while ((root = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, root))) {
+ /* Root device is Device 0 Function 0. */
+ if (root->devfn)
+ continue;
+
+ if (root->vendor != PCI_VENDOR_ID_AMD &&
+ root->vendor != PCI_VENDOR_ID_HYGON)
+ continue;
+
+ break;
+ }
+
+ return root;
+}
+
+static bool enable_dfs;
+
+static int __init amd_smn_enable_dfs(char *str)
+{
+ enable_dfs = true;
+ return 1;
+}
+__setup("amd_smn_debugfs_enable", amd_smn_enable_dfs);
+
+static int __init amd_smn_init(void)
+{
+ u16 count, num_roots, roots_per_node, node, num_nodes;
+ struct pci_dev *root;
+
+ if (!cpu_feature_enabled(X86_FEATURE_ZEN))
+ return 0;
+
+ guard(mutex)(&smn_mutex);
+
+ if (amd_roots)
+ return 0;
+
+ num_roots = 0;
+ root = NULL;
+ while ((root = get_next_root(root))) {
+ pci_dbg(root, "Reserving PCI config space\n");
+
+ /*
+ * There are a few SMN index/data pairs and other registers
+ * that shouldn't be accessed by user space. So reserve the
+ * entire PCI config space for simplicity rather than covering
+ * specific registers piecemeal.
+ */
+ if (!pci_request_config_region_exclusive(root, 0, PCI_CFG_SPACE_SIZE, NULL)) {
+ pci_err(root, "Failed to reserve config space\n");
+ return -EEXIST;
+ }
+
+ num_roots++;
+ }
+
+ pr_debug("Found %d AMD root devices\n", num_roots);
+
+ if (!num_roots)
+ return -ENODEV;
+
+ num_nodes = amd_num_nodes();
+ amd_roots = kcalloc(num_nodes, sizeof(*amd_roots), GFP_KERNEL);
+ if (!amd_roots)
+ return -ENOMEM;
+
+ roots_per_node = num_roots / num_nodes;
+
+ count = 0;
+ node = 0;
+ root = NULL;
+ while (node < num_nodes && (root = get_next_root(root))) {
+ /* Use one root for each node and skip the rest. */
+ if (count++ % roots_per_node)
+ continue;
+
+ pci_dbg(root, "is root for AMD node %u\n", node);
+ amd_roots[node++] = root;
+ }
+
+ if (enable_dfs) {
+ debugfs_dir = debugfs_create_dir("amd_smn", arch_debugfs_dir);
+
+ debugfs_create_file("node", 0600, debugfs_dir, NULL, &smn_node_fops);
+ debugfs_create_file("address", 0600, debugfs_dir, NULL, &smn_address_fops);
+ debugfs_create_file("value", 0600, debugfs_dir, NULL, &smn_value_fops);
+ }
+
+ smn_exclusive = true;
+
+ return 0;
+}
+
+fs_initcall(amd_smn_init);
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 676debfc1702..769321185a08 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Firmware replacement code.
*
@@ -10,23 +11,43 @@
*
* Copyright 2002 Andi Kleen, SuSE Labs.
*/
+#define pr_fmt(fmt) "AGP: " fmt
+
#include <linux/kernel.h>
+#include <linux/kcore.h>
#include <linux/types.h>
#include <linux/init.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/mmzone.h>
#include <linux/pci_ids.h>
#include <linux/pci.h>
#include <linux/bitops.h>
-#include <linux/ioport.h>
#include <linux/suspend.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
#include <asm/io.h>
#include <asm/iommu.h>
#include <asm/gart.h>
#include <asm/pci-direct.h>
#include <asm/dma.h>
-#include <asm/k8.h>
+#include <asm/amd/nb.h>
+#include <asm/x86_init.h>
+#include <linux/crash_dump.h>
+
+/*
+ * Using 512M as the goal, in case kexec will load kernel_big
+ * that will do the in-place decompression, and could overlap with
+ * the gart aperture that is used.
+ * Sequence:
+ * kernel_small
+ * ==> kexec (with kdump trigger path or gart still enabled)
+ * ==> kernel_small (gart area becomes e820_reserved)
+ * ==> kexec (with kdump trigger path or gart still enabled)
+ * ==> kernel_big (uncompressed size will be bigger than 64M or 128M)
+ * So don't use 512M below as gart iommu, leave the space for kernel
+ * code to be safe.
+ */
+#define GART_MIN_ADDR (512ULL << 20)
+#define GART_MAX_ADDR (1ULL << 32)
int gart_iommu_aperture;
int gart_iommu_aperture_disabled __initdata;
@@ -37,29 +58,48 @@ int fallback_aper_force __initdata;
int fix_aperture __initdata = 1;
-struct bus_dev_range {
- int bus;
- int dev_base;
- int dev_limit;
-};
+#if defined(CONFIG_PROC_VMCORE) || defined(CONFIG_PROC_KCORE)
+/*
+ * If the first kernel maps the aperture over e820 RAM, the kdump kernel will
+ * use the same range because it will remain configured in the northbridge.
+ * Trying to dump this area via /proc/vmcore may crash the machine, so exclude
+ * it from vmcore.
+ */
+static unsigned long aperture_pfn_start, aperture_page_count;
-static struct bus_dev_range bus_dev_ranges[] __initdata = {
- { 0x00, 0x18, 0x20},
- { 0xff, 0x00, 0x20},
- { 0xfe, 0x00, 0x20}
-};
+static int gart_mem_pfn_is_ram(unsigned long pfn)
+{
+ return likely((pfn < aperture_pfn_start) ||
+ (pfn >= aperture_pfn_start + aperture_page_count));
+}
+
+#ifdef CONFIG_PROC_VMCORE
+static bool gart_oldmem_pfn_is_ram(struct vmcore_cb *cb, unsigned long pfn)
+{
+ return !!gart_mem_pfn_is_ram(pfn);
+}
-static struct resource gart_resource = {
- .name = "GART",
- .flags = IORESOURCE_MEM,
+static struct vmcore_cb gart_vmcore_cb = {
+ .pfn_is_ram = gart_oldmem_pfn_is_ram,
};
+#endif
-static void __init insert_aperture_resource(u32 aper_base, u32 aper_size)
+static void __init exclude_from_core(u64 aper_base, u32 aper_order)
{
- gart_resource.start = aper_base;
- gart_resource.end = aper_base + aper_size - 1;
- insert_resource(&iomem_resource, &gart_resource);
+ aperture_pfn_start = aper_base >> PAGE_SHIFT;
+ aperture_page_count = (32 * 1024 * 1024) << aper_order >> PAGE_SHIFT;
+#ifdef CONFIG_PROC_VMCORE
+ register_vmcore_cb(&gart_vmcore_cb);
+#endif
+#ifdef CONFIG_PROC_KCORE
+ WARN_ON(register_mem_pfn_is_ram(&gart_mem_pfn_is_ram));
+#endif
}
+#else
+static void exclude_from_core(u64 aper_base, u32 aper_order)
+{
+}
+#endif
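A worked example of the arithmetic in exclude_from_core() above: with aper_order == 1 the aperture covers 32 MiB << 1 = 64 MiB, so with 4 KiB pages aperture_page_count is 16384, and gart_mem_pfn_is_ram() reports those 16384 PFNs starting at aperture_pfn_start (aper_base >> PAGE_SHIFT) as not RAM.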
/* This code runs before the PCI subsystem is initialized, so just
access the northbridge directly. */
@@ -67,7 +107,7 @@ static void __init insert_aperture_resource(u32 aper_base, u32 aper_size)
static u32 __init allocate_aperture(void)
{
u32 aper_size;
- void *p;
+ unsigned long addr;
 	/* aper_size should be <= 1G */
if (fallback_aper_order > 5)
@@ -80,35 +120,19 @@ static u32 __init allocate_aperture(void)
* memory. Unfortunately we cannot move it up because that would
* make the IOMMU useless.
*/
- /*
- * using 512M as goal, in case kexec will load kernel_big
- * that will do the on position decompress, and could overlap with
- * that positon with gart that is used.
- * sequende:
- * kernel_small
- * ==> kexec (with kdump trigger path or previous doesn't shutdown gart)
- * ==> kernel_small(gart area become e820_reserved)
- * ==> kexec (with kdump trigger path or previous doesn't shutdown gart)
- * ==> kerne_big (uncompressed size will be big than 64M or 128M)
- * so don't use 512M below as gart iommu, leave the space for kernel
- * code for safe
- */
- p = __alloc_bootmem_nopanic(aper_size, aper_size, 512ULL<<20);
- if (!p || __pa(p)+aper_size > 0xffffffff) {
- printk(KERN_ERR
- "Cannot allocate aperture memory hole (%p,%uK)\n",
- p, aper_size>>10);
- if (p)
- free_bootmem(__pa(p), aper_size);
+ addr = memblock_phys_alloc_range(aper_size, aper_size,
+ GART_MIN_ADDR, GART_MAX_ADDR);
+ if (!addr) {
+ pr_err("Cannot allocate aperture memory hole [mem %#010lx-%#010lx] (%uKB)\n",
+ addr, addr + aper_size - 1, aper_size >> 10);
return 0;
}
- printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n",
- aper_size >> 10, __pa(p));
- insert_aperture_resource((u32)__pa(p), aper_size);
- register_nosave_region((u32)__pa(p) >> PAGE_SHIFT,
- (u32)__pa(p+aper_size) >> PAGE_SHIFT);
+ pr_info("Mapping aperture over RAM [mem %#010lx-%#010lx] (%uKB)\n",
+ addr, addr + aper_size - 1, aper_size >> 10);
+ register_nosave_region(addr >> PAGE_SHIFT,
+ (addr+aper_size) >> PAGE_SHIFT);
- return (u32)__pa(p);
+ return (u32)addr;
}
@@ -148,10 +172,11 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order)
u64 aper;
u32 old_order;
- printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", bus, slot, func);
+ pr_info("pci 0000:%02x:%02x:%02x: AGP bridge\n", bus, slot, func);
apsizereg = read_pci_config_16(bus, slot, func, cap + 0x14);
if (apsizereg == 0xffffffff) {
- printk(KERN_ERR "APSIZE in AGP bridge unreadable\n");
+ pr_err("pci 0000:%02x:%02x.%d: APSIZE unreadable\n",
+ bus, slot, func);
return 0;
}
@@ -175,16 +200,18 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order)
* On some sick chips, APSIZE is 0. It means it wants 4G
  * so let's double-check that order, and let's trust the AMD NB settings:
*/
- printk(KERN_INFO "Aperture from AGP @ %Lx old size %u MB\n",
- aper, 32 << old_order);
+ pr_info("pci 0000:%02x:%02x.%d: AGP aperture [bus addr %#010Lx-%#010Lx] (old size %uMB)\n",
+ bus, slot, func, aper, aper + (32ULL << (old_order + 20)) - 1,
+ 32 << old_order);
if (aper + (32ULL<<(20 + *order)) > 0x100000000ULL) {
- printk(KERN_INFO "Aperture size %u MB (APSIZE %x) is not right, using settings from NB\n",
- 32 << *order, apsizereg);
+ pr_info("pci 0000:%02x:%02x.%d: AGP aperture size %uMB (APSIZE %#x) is not right, using settings from NB\n",
+ bus, slot, func, 32 << *order, apsizereg);
*order = old_order;
}
- printk(KERN_INFO "Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n",
- aper, 32 << *order, apsizereg);
+ pr_info("pci 0000:%02x:%02x.%d: AGP aperture [bus addr %#010Lx-%#010Lx] (%uMB, APSIZE %#x)\n",
+ bus, slot, func, aper, aper + (32ULL << (*order + 20)) - 1,
+ 32 << *order, apsizereg);
if (!aperture_valid(aper, (32*1024*1024) << *order, 32<<20))
return 0;
@@ -199,7 +226,7 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order)
  * Do a PCI bus scan by hand because we're running before the PCI
* subsystem.
*
- * All K8 AGP bridges are AGPv3 compliant, so we can do this scan
+ * All AMD AGP bridges are AGPv3 compliant, so we can do this scan
* generically. It's probably overkill to always scan all slots because
  * the AGP bridges should always be on their own bus in the HT hierarchy,
* but do it here for future safety.
@@ -232,72 +259,74 @@ static u32 __init search_agp_bridge(u32 *order, int *valid_agp)
order);
}
- /* No multi-function device? */
type = read_pci_config_byte(bus, slot, func,
PCI_HEADER_TYPE);
- if (!(type & 0x80))
+ if (!(type & PCI_HEADER_TYPE_MFD))
break;
}
}
}
- printk(KERN_INFO "No AGP bridge found\n");
+ pr_info("No AGP bridge found\n");
return 0;
}
-static int gart_fix_e820 __initdata = 1;
+static bool gart_fix_e820 __initdata = true;
static int __init parse_gart_mem(char *p)
{
- if (!p)
- return -EINVAL;
-
- if (!strncmp(p, "off", 3))
- gart_fix_e820 = 0;
- else if (!strncmp(p, "on", 2))
- gart_fix_e820 = 1;
-
- return 0;
+ return kstrtobool(p, &gart_fix_e820);
}
early_param("gart_fix_e820", parse_gart_mem);
+/*
+ * With kexec/kdump, if the first kernel doesn't shut down the GART and the
+ * second kernel allocates a different GART region, there might be two
+ * overlapping GART regions present:
+ *
+ * - the first still used by the GART initialized in the first kernel.
+ * - (sub-)set of it used as normal RAM by the second kernel.
+ *
+ * which leads to memory corruptions and a kernel panic eventually.
+ *
+ * This can also happen if the BIOS has forgotten to mark the GART region
+ * as reserved.
+ *
+ * Try to update the e820 map to mark that new region as reserved.
+ */
void __init early_gart_iommu_check(void)
{
- /*
- * in case it is enabled before, esp for kexec/kdump,
- * previous kernel already enable that. memset called
- * by allocate_aperture/__alloc_bootmem_nopanic cause restart.
- * or second kernel have different position for GART hole. and new
- * kernel could use hole as RAM that is still used by GART set by
- * first kernel
- * or BIOS forget to put that in reserved.
- * try to update e820 to make that region as reserved.
- */
- int i, fix, slot;
+ u32 agp_aper_order = 0;
+ int i, fix, slot, valid_agp = 0;
u32 ctl;
u32 aper_size = 0, aper_order = 0, last_aper_order = 0;
u64 aper_base = 0, last_aper_base = 0;
int aper_enabled = 0, last_aper_enabled = 0, last_valid = 0;
+ if (!amd_gart_present())
+ return;
+
if (!early_pci_allowed())
return;
/* This is mostly duplicate of iommu_hole_init */
+ search_agp_bridge(&agp_aper_order, &valid_agp);
+
fix = 0;
- for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
+ for (i = 0; amd_nb_bus_dev_ranges[i].dev_limit; i++) {
int bus;
int dev_base, dev_limit;
- bus = bus_dev_ranges[i].bus;
- dev_base = bus_dev_ranges[i].dev_base;
- dev_limit = bus_dev_ranges[i].dev_limit;
+ bus = amd_nb_bus_dev_ranges[i].bus;
+ dev_base = amd_nb_bus_dev_ranges[i].dev_base;
+ dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
for (slot = dev_base; slot < dev_limit; slot++) {
- if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+ if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
continue;
ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
- aper_enabled = ctl & AMD64_GARTEN;
+ aper_enabled = ctl & GARTEN;
aper_order = (ctl >> 1) & 7;
aper_size = (32 * 1024 * 1024) << aper_order;
aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
@@ -326,33 +355,34 @@ void __init early_gart_iommu_check(void)
fix = 1;
if (gart_fix_e820 && !fix && aper_enabled) {
- if (e820_any_mapped(aper_base, aper_base + aper_size,
- E820_RAM)) {
+ if (e820__mapped_any(aper_base, aper_base + aper_size,
+ E820_TYPE_RAM)) {
/* reserve it, so we can reuse it in second kernel */
- printk(KERN_INFO "update e820 for GART\n");
- e820_add_region(aper_base, aper_size, E820_RESERVED);
- update_e820();
+ pr_info("e820: reserve [mem %#010Lx-%#010Lx] for GART\n",
+ aper_base, aper_base + aper_size - 1);
+ e820__range_add(aper_base, aper_size, E820_TYPE_RESERVED);
+ e820__update_table_print();
}
}
- if (!fix)
+ if (valid_agp)
return;
- /* different nodes have different setting, disable them all at first*/
- for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
+ /* disable them all at first */
+ for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
int bus;
int dev_base, dev_limit;
- bus = bus_dev_ranges[i].bus;
- dev_base = bus_dev_ranges[i].dev_base;
- dev_limit = bus_dev_ranges[i].dev_limit;
+ bus = amd_nb_bus_dev_ranges[i].bus;
+ dev_base = amd_nb_bus_dev_ranges[i].dev_base;
+ dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
for (slot = dev_base; slot < dev_limit; slot++) {
- if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+ if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
continue;
ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
- ctl &= ~AMD64_GARTEN;
+ ctl &= ~GARTEN;
write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
}
}
@@ -369,39 +399,57 @@ void __init gart_iommu_hole_init(void)
int fix, slot, valid_agp = 0;
int i, node;
+ if (!amd_gart_present())
+ return;
+
if (gart_iommu_aperture_disabled || !fix_aperture ||
!early_pci_allowed())
return;
- printk(KERN_INFO "Checking aperture...\n");
+ pr_info("Checking aperture...\n");
if (!fallback_aper_force)
agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp);
fix = 0;
node = 0;
- for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
+ for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
int bus;
int dev_base, dev_limit;
+ u32 ctl;
- bus = bus_dev_ranges[i].bus;
- dev_base = bus_dev_ranges[i].dev_base;
- dev_limit = bus_dev_ranges[i].dev_limit;
+ bus = amd_nb_bus_dev_ranges[i].bus;
+ dev_base = amd_nb_bus_dev_ranges[i].dev_base;
+ dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
for (slot = dev_base; slot < dev_limit; slot++) {
- if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+ if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
continue;
iommu_detected = 1;
gart_iommu_aperture = 1;
+ x86_init.iommu.iommu_init = gart_iommu_init;
+
+ ctl = read_pci_config(bus, slot, 3,
+ AMD64_GARTAPERTURECTL);
+
+ /*
+ * Before we do anything else disable the GART. It may
+ * still be enabled if we boot into a crash-kernel here.
+ * Reconfiguring the GART while it is enabled could have
+ * unknown side-effects.
+ */
+ ctl &= ~GARTEN;
+ write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
- aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7;
+ aper_order = (ctl >> 1) & 7;
aper_size = (32 * 1024 * 1024) << aper_order;
aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
aper_base <<= 25;
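/*
 * [Editor's note] A minimal sketch of the aperture decode performed by
 * the register reads above, mirroring the field layout used in this
 * file: the order lives in GARTAPERTURECTL bits 3:1 and the base
 * register holds the physical address in 32 MB units.  Illustrative
 * helpers only, not part of the patch.
 */
static inline u64 decode_aper_size(u32 ctl)
{
	u32 order = (ctl >> 1) & 7;			/* 0..7 */
	return (32ULL * 1024 * 1024) << order;		/* 32 MB .. 4 GB */
}

static inline u64 decode_aper_base(u32 base_reg)
{
	return ((u64)(base_reg & 0x7fff)) << 25;	/* 32 MB granularity */
}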
- printk(KERN_INFO "Node %d: aperture @ %Lx size %u MB\n",
- node, aper_base, aper_size >> 20);
+ pr_info("Node %d: aperture [bus addr %#010Lx-%#010Lx] (%uMB)\n",
+ node, aper_base, aper_base + aper_size - 1,
+ aper_size >> 20);
node++;
if (!aperture_valid(aper_base, aper_size, 64<<20)) {
@@ -412,9 +460,9 @@ void __init gart_iommu_hole_init(void)
if (!no_iommu &&
max_pfn > MAX_DMA32_PFN &&
!printed_gart_size_msg) {
- printk(KERN_ERR "you are using iommu with agp, but GART size is less than 64M\n");
- printk(KERN_ERR "please increase GART size in your BIOS setup\n");
- printk(KERN_ERR "if BIOS doesn't have that option, contact your HW vendor!\n");
+ pr_err("you are using iommu with agp, but GART size is less than 64MB\n");
+ pr_err("please increase GART size in your BIOS setup\n");
+ pr_err("if BIOS doesn't have that option, contact your HW vendor!\n");
printed_gart_size_msg = 1;
}
} else {
@@ -436,9 +484,12 @@ void __init gart_iommu_hole_init(void)
out:
if (!fix && !fallback_aper_force) {
if (last_aper_base) {
- unsigned long n = (32 * 1024 * 1024) << last_aper_order;
-
- insert_aperture_resource((u32)last_aper_base, n);
+ /*
+ * If this is the kdump kernel, the first kernel
+ * may have allocated the range over its e820 RAM
+ * and fixed up the northbridge
+ */
+ exclude_from_core(last_aper_base, last_aper_order);
}
return;
}
@@ -450,19 +501,14 @@ out:
if (aper_alloc) {
/* Got the aperture from the AGP bridge */
- } else if (swiotlb && !valid_agp) {
- /* Do nothing */
} else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) ||
force_iommu ||
valid_agp ||
fallback_aper_force) {
- printk(KERN_INFO
- "Your BIOS doesn't leave a aperture memory hole\n");
- printk(KERN_INFO
- "Please enable the IOMMU option in the BIOS setup\n");
- printk(KERN_INFO
- "This costs you %d MB of RAM\n",
- 32 << fallback_aper_order);
+ pr_info("Your BIOS doesn't leave an aperture memory hole\n");
+ pr_info("Please enable the IOMMU option in the BIOS setup\n");
+ pr_info("This costs you %dMB of RAM\n",
+ 32 << fallback_aper_order);
aper_order = fallback_aper_order;
aper_alloc = allocate_aperture();
@@ -481,22 +527,32 @@ out:
return;
}
- /* Fix up the north bridges */
- for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
- int bus;
- int dev_base, dev_limit;
+ /*
+ * If this is the kdump kernel _and_ the first kernel did not
+ * configure the aperture in the northbridge, this range may
+ * overlap with the first kernel's memory. We can't access the
+ * range through vmcore even though it should be part of the dump.
+ */
+ exclude_from_core(aper_alloc, aper_order);
- bus = bus_dev_ranges[i].bus;
- dev_base = bus_dev_ranges[i].dev_base;
- dev_limit = bus_dev_ranges[i].dev_limit;
+ /* Fix up the north bridges */
+ for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
+ int bus, dev_base, dev_limit;
+
+ /*
+ * Don't enable translation yet but enable GART IO and CPU
+ * accesses and set DISTLBWALKPRB since GART table memory is UC.
+ */
+ u32 ctl = aper_order << 1;
+
+ bus = amd_nb_bus_dev_ranges[i].bus;
+ dev_base = amd_nb_bus_dev_ranges[i].dev_base;
+ dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
for (slot = dev_base; slot < dev_limit; slot++) {
- if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+ if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
continue;
- /* Don't enable translation yet. That is done later.
- Assume this BIOS didn't initialise the GART so
- just overwrite all previous bits */
- write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, aper_order << 1);
+ write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25);
}
}
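/*
 * [Editor's note] The inverse of the decode shown earlier: a sketch of
 * how the fixup loop above re-encodes the allocated aperture into the
 * two northbridge registers, mirroring the "aper_order << 1" and
 * "aper_alloc >> 25" expressions in this hunk.  GART translation
 * (GARTEN) is intentionally left disabled at this point; it is enabled
 * later.  Illustrative only, not part of the patch.
 */
static inline u32 encode_aper_ctl(u32 aper_order)
{
	return aper_order << 1;			/* order field only, GARTEN clear */
}

static inline u32 encode_aper_base(u64 aper_alloc)
{
	return (u32)(aper_alloc >> 25);		/* physical address / 32 MB */
}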
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index da7b7b9f8bd8..581db89477f9 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -1,19 +1,28 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for local APIC drivers and for the IO-APIC code
#
-obj-$(CONFIG_X86_LOCAL_APIC) += apic.o probe_$(BITS).o ipi.o nmi.o
+# Leads to non-deterministic coverage that is not a function of syscall inputs.
+# In particular, smp_apic_timer_interrupt() is called in random places.
+KCOV_INSTRUMENT := n
+
+obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_common.o apic_noop.o ipi.o vector.o init.o
+obj-y += hw_nmi.o
+
obj-$(CONFIG_X86_IO_APIC) += io_apic.o
+obj-$(CONFIG_PCI_MSI) += msi.o
obj-$(CONFIG_SMP) += ipi.o
ifeq ($(CONFIG_X86_64),y)
-obj-y += apic_flat_64.o
-obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o
-obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o
+# APIC probe will depend on the listing order here
+obj-$(CONFIG_X86_NUMACHIP) += apic_numachip.o
obj-$(CONFIG_X86_UV) += x2apic_uv_x.o
+obj-$(CONFIG_AMD_SECURE_AVIC) += x2apic_savic.o
+obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o
+obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o
+obj-y += apic_flat_64.o
endif
-obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o
-obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
-obj-$(CONFIG_X86_ES7000) += es7000_32.o
-obj-$(CONFIG_X86_SUMMIT) += summit_32.o
+# For 32-bit, probe_32 needs to be listed last
+obj-$(CONFIG_X86_LOCAL_APIC) += probe_$(BITS).o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 0a1c2830ec66..d93f87f29d03 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Local APIC handling, local APIC timers
*
@@ -14,91 +15,89 @@
* Mikael Pettersson : PM converted to driver model.
*/
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
#include <linux/kernel_stat.h>
#include <linux/mc146818rtc.h>
#include <linux/acpi_pmtmr.h>
+#include <linux/bitmap.h>
#include <linux/clockchips.h>
#include <linux/interrupt.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/ftrace.h>
#include <linux/ioport.h>
-#include <linux/module.h>
-#include <linux/sysdev.h>
+#include <linux/export.h>
+#include <linux/syscore_ops.h>
#include <linux/delay.h>
#include <linux/timex.h>
+#include <linux/i8253.h>
#include <linux/dmar.h>
#include <linux/init.h>
#include <linux/cpu.h>
#include <linux/dmi.h>
-#include <linux/nmi.h>
#include <linux/smp.h>
#include <linux/mm.h>
+#include <linux/kvm_types.h>
-#include <asm/perf_counter.h>
-#include <asm/pgalloc.h>
-#include <asm/atomic.h>
+#include <xen/xen.h>
+
+#include <asm/trace/irq_vectors.h>
+#include <asm/irq_remapping.h>
+#include <asm/pc-conf-reg.h>
+#include <asm/perf_event.h>
+#include <asm/x86_init.h>
+#include <linux/atomic.h>
+#include <asm/barrier.h>
#include <asm/mpspec.h>
-#include <asm/i8253.h>
#include <asm/i8259.h>
#include <asm/proto.h>
+#include <asm/traps.h>
#include <asm/apic.h>
+#include <asm/acpi.h>
+#include <asm/io_apic.h>
#include <asm/desc.h>
#include <asm/hpet.h>
-#include <asm/idle.h>
#include <asm/mtrr.h>
+#include <asm/time.h>
#include <asm/smp.h>
#include <asm/mce.h>
+#include <asm/msr.h>
+#include <asm/tsc.h>
+#include <asm/hypervisor.h>
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+#include <asm/irq_regs.h>
+#include <asm/cpu.h>
-unsigned int num_processors;
-
-unsigned disabled_cpus __cpuinitdata;
+#include "local.h"
/* Processor that is doing the boot up */
-unsigned int boot_cpu_physical_apicid = -1U;
+u32 boot_cpu_physical_apicid __ro_after_init = BAD_APICID;
+EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid);
-/*
- * The highest APIC ID seen during enumeration.
- *
- * This determines the messaging protocol we can use: if all APIC IDs
- * are in the 0 ... 7 range, then we can use logical addressing which
- * has some performance advantages (better broadcasting).
- *
- * If there's an APIC ID above 8, we use physical addressing.
- */
-unsigned int max_physical_apicid;
+u8 boot_cpu_apic_version __ro_after_init;
/*
- * Bitmask of physically existing CPUs:
+ * This variable controls which CPUs receive external NMIs. By default,
+ * external NMIs are delivered only to the BSP.
*/
-physid_mask_t phys_cpu_present_map;
+static int apic_extnmi __ro_after_init = APIC_EXTNMI_BSP;
/*
- * Map cpu index to physical APIC ID
+ * Hypervisor supports 15 bits of APIC ID in MSI Extended Destination ID
*/
-DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
-DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
-EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
-EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
+static bool virt_ext_dest_id __ro_after_init;
-#ifdef CONFIG_X86_32
-/*
- * Knob to control our willingness to enable the local APIC.
- *
- * +1=force-enable
- */
-static int force_enable_local_apic;
-/*
- * APIC command line parameters
- */
-static int __init parse_lapic(char *arg)
+/* For parallel bootup. */
+unsigned long apic_mmio_base __ro_after_init;
+
+static inline bool apic_accessible(void)
{
- force_enable_local_apic = 1;
- return 0;
+ return x2apic_mode || apic_mmio_base;
}
-early_param("lapic", parse_lapic);
+
+#ifdef CONFIG_X86_32
/* Local APIC was disabled by the BIOS and enabled by the kernel */
-static int enabled_via_apicbase;
+static int enabled_via_apicbase __ro_after_init;
/*
* Handle interrupt mode configuration register (IMCR).
@@ -110,103 +109,77 @@ static int enabled_via_apicbase;
*/
static inline void imcr_pic_to_apic(void)
{
- /* select IMCR register */
- outb(0x70, 0x22);
/* NMI and 8259 INTR go through APIC */
- outb(0x01, 0x23);
+ pc_conf_set(PC_CONF_MPS_IMCR, 0x01);
}
static inline void imcr_apic_to_pic(void)
{
- /* select IMCR register */
- outb(0x70, 0x22);
/* NMI and 8259 INTR go directly to BSP */
- outb(0x00, 0x23);
+ pc_conf_set(PC_CONF_MPS_IMCR, 0x00);
}
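/*
 * [Editor's note] The removed lines show what the IMCR programming used
 * to do by hand: write the register index to port 0x22 and the value to
 * port 0x23.  A sketch of the helper this hunk switches to, assuming
 * pc_conf_set() (from asm/pc-conf-reg.h) is essentially a wrapper around
 * that port pair; the real implementation is not shown here.
 */
static inline void pc_conf_set_sketch(u8 reg, u8 val)
{
	outb(reg, 0x22);	/* select register, e.g. PC_CONF_MPS_IMCR (0x70) */
	outb(val, 0x23);	/* 0x01: NMI/INTR via APIC, 0x00: direct to BSP  */
}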
#endif
+/*
+ * Knob to control our willingness to enable the local APIC.
+ *
+ * +1=force-enable
+ */
+static int force_enable_local_apic __initdata;
+
+/*
+ * APIC command line parameters
+ */
+static int __init parse_lapic(char *arg)
+{
+ if (IS_ENABLED(CONFIG_X86_32) && !arg)
+ force_enable_local_apic = 1;
+ else if (arg && !strncmp(arg, "notscdeadline", 13))
+ setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
+ return 0;
+}
+early_param("lapic", parse_lapic);
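/*
 * [Editor's note] Usage examples for the unified "lapic" parameter as
 * handled by parse_lapic() above (derived directly from the handler,
 * illustrative only):
 *
 *   lapic                 on 32-bit, force-enable the local APIC
 *   lapic=notscdeadline   clear X86_FEATURE_TSC_DEADLINE_TIMER so the
 *                         TSC-deadline timer mode is never used
 */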
+
#ifdef CONFIG_X86_64
static int apic_calibrate_pmtmr __initdata;
static __init int setup_apicpmtimer(char *s)
{
apic_calibrate_pmtmr = 1;
notsc_setup(NULL);
- return 0;
+ return 1;
}
__setup("apicpmtimer", setup_apicpmtimer);
#endif
-int x2apic_mode;
-#ifdef CONFIG_X86_X2APIC
-/* x2apic enabled before OS handover */
-static int x2apic_preenabled;
-static __init int setup_nox2apic(char *str)
-{
- if (x2apic_enabled()) {
- pr_warning("Bios already enabled x2apic, "
- "can't enforce nox2apic");
- return 0;
- }
-
- setup_clear_cpu_cap(X86_FEATURE_X2APIC);
- return 0;
-}
-early_param("nox2apic", setup_nox2apic);
-#endif
-
-unsigned long mp_lapic_addr;
-int disable_apic;
+static unsigned long mp_lapic_addr __ro_after_init;
+bool apic_is_disabled __ro_after_init;
/* Disable local APIC timer from the kernel commandline or via dmi quirk */
-static int disable_apic_timer __cpuinitdata;
+static int disable_apic_timer __initdata;
/* Local APIC timer works in C2 */
-int local_apic_timer_c2_ok;
+int local_apic_timer_c2_ok __ro_after_init;
EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
-int first_system_vector = 0xfe;
-
/*
* Debug level, exported for io_apic.c
*/
-unsigned int apic_verbosity;
+int apic_verbosity __ro_after_init;
-int pic_mode;
+int pic_mode __ro_after_init;
/* Have we found an MP table */
-int smp_found_config;
+int smp_found_config __ro_after_init;
static struct resource lapic_resource = {
.name = "Local APIC",
.flags = IORESOURCE_MEM | IORESOURCE_BUSY,
};
-static unsigned int calibration_result;
+/* Measured in ticks per HZ. */
+unsigned int lapic_timer_period = 0;
-static int lapic_next_event(unsigned long delta,
- struct clock_event_device *evt);
-static void lapic_timer_setup(enum clock_event_mode mode,
- struct clock_event_device *evt);
-static void lapic_timer_broadcast(const struct cpumask *mask);
static void apic_pm_activate(void);
/*
- * The local apic timer can be used for any function which is CPU local.
- */
-static struct clock_event_device lapic_clockevent = {
- .name = "lapic",
- .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
- | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
- .shift = 32,
- .set_mode = lapic_timer_setup,
- .set_next_event = lapic_next_event,
- .broadcast = lapic_timer_broadcast,
- .rating = 100,
- .irq = -1,
-};
-static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
-
-static unsigned long apic_phys;
-
-/*
* Get the LAPIC version
*/
static inline int lapic_get_version(void)
@@ -219,11 +192,7 @@ static inline int lapic_get_version(void)
*/
static inline int lapic_is_integrated(void)
{
-#ifdef CONFIG_X86_64
- return 1;
-#else
return APIC_INTEGRATED(lapic_get_version());
-#endif
}
/*
@@ -235,60 +204,31 @@ static int modern_apic(void)
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
boot_cpu_data.x86 >= 0xf)
return 1;
- return lapic_get_version() >= 0x14;
-}
-/*
- * bare function to substitute write operation
- * and it's _that_ fast :)
- */
-static void native_apic_write_dummy(u32 reg, u32 v)
-{
- WARN_ON_ONCE((cpu_has_apic || !disable_apic));
-}
+ /* Hygon systems use modern APIC */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
+ return 1;
-static u32 native_apic_read_dummy(u32 reg)
-{
- WARN_ON_ONCE((cpu_has_apic && !disable_apic));
- return 0;
+ return lapic_get_version() >= 0x14;
}
/*
- * right after this call apic->write/read doesn't do anything
- * note that there is no restore operation it works one way
+ * Right after this call the APIC becomes NOOP-driven, so
+ * apic->write/read don't do anything.
*/
-void apic_disable(void)
+static void __init apic_disable(void)
{
- apic->read = native_apic_read_dummy;
- apic->write = native_apic_write_dummy;
-}
-
-void native_apic_wait_icr_idle(void)
-{
- while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
- cpu_relax();
-}
-
-u32 native_safe_apic_wait_icr_idle(void)
-{
- u32 send_status;
- int timeout;
-
- timeout = 0;
- do {
- send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
- if (!send_status)
- break;
- udelay(100);
- } while (timeout++ < 1000);
-
- return send_status;
+ apic_install_driver(&apic_noop);
}
void native_apic_icr_write(u32 low, u32 id)
{
- apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id));
+ unsigned long flags;
+
+ local_irq_save(flags);
+ apic_write(APIC_ICR2, SET_XAPIC_DEST_FIELD(id));
apic_write(APIC_ICR, low);
+ local_irq_restore(flags);
}
u64 native_apic_icr_read(void)
@@ -302,45 +242,15 @@ u64 native_apic_icr_read(void)
}
/**
- * enable_NMI_through_LVT0 - enable NMI through local vector table 0
- */
-void __cpuinit enable_NMI_through_LVT0(void)
-{
- unsigned int v;
-
- /* unmask and set to NMI */
- v = APIC_DM_NMI;
-
- /* Level triggered for 82489DX (32bit mode) */
- if (!lapic_is_integrated())
- v |= APIC_LVT_LEVEL_TRIGGER;
-
- apic_write(APIC_LVT0, v);
-}
-
-#ifdef CONFIG_X86_32
-/**
- * get_physical_broadcast - Get number of physical broadcast IDs
- */
-int get_physical_broadcast(void)
-{
- return modern_apic() ? 0xff : 0xf;
-}
-#endif
-
-/**
* lapic_get_maxlvt - get the maximum number of local vector table entries
*/
int lapic_get_maxlvt(void)
{
- unsigned int v;
-
- v = apic_read(APIC_LVR);
/*
* - we always have APIC integrated on 64bit mode
* - 82489DXs do not report # of LVT entries
*/
- return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2;
+ return lapic_is_integrated() ? GET_APIC_MAXLVT(apic_read(APIC_LVR)) : 2;
}
/*
@@ -349,6 +259,10 @@ int lapic_get_maxlvt(void)
/* Clock divisor */
#define APIC_DIVISOR 16
+#define TSC_DIVISOR 8
+
+/* i82489DX specific */
+#define I82489DX_BASE_DIVIDER (((0x2) << 18))
/*
* This function sets up the local APIC timer, with a timeout of
@@ -367,14 +281,33 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
lvtt_value = LOCAL_TIMER_VECTOR;
if (!oneshot)
lvtt_value |= APIC_LVT_TIMER_PERIODIC;
+ else if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
+ lvtt_value |= APIC_LVT_TIMER_TSCDEADLINE;
+
+ /*
+	 * The i82489DX APIC uses bits 18 and 19 for the base divider. This
+ * overlaps with bit 18 on integrated APICs, but is not documented
+ * in the SDM. No problem though. i82489DX equipped systems do not
+ * have TSC deadline timer.
+ */
if (!lapic_is_integrated())
- lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
+ lvtt_value |= I82489DX_BASE_DIVIDER;
if (!irqen)
lvtt_value |= APIC_LVT_MASKED;
apic_write(APIC_LVTT, lvtt_value);
+ if (lvtt_value & APIC_LVT_TIMER_TSCDEADLINE) {
+ /*
+ * See Intel SDM: TSC-Deadline Mode chapter. In xAPIC mode,
+ * writing to the APIC LVTT and TSC_DEADLINE MSR isn't serialized.
+ * According to Intel, MFENCE can do the serialization here.
+ */
+ asm volatile("mfence" : : : "memory");
+ return;
+ }
+
/*
* Divide PICLK by 16
*/
@@ -388,38 +321,93 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
}
/*
- * Setup extended LVT, AMD specific (K8, family 10h)
+ * Setup extended LVT, AMD specific
*
- * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
- * MCE interrupts are supported. Thus MCE offset must be set to 0.
+ * Software should use the LVT offsets the BIOS provides. The offsets
+ * are determined by the subsystems using it like those for MCE
+ * threshold or IBS. On K8 only offset 0 (APIC500) and MCE interrupts
+ * are supported. Beginning with family 10h at least 4 offsets are
+ * available.
*
- * If mask=1, the LVT entry does not generate interrupts while mask=0
- * enables the vector. See also the BKDGs.
+ * Since the offsets must be consistent for all cores, we keep track
+ * of the LVT offsets in software and reserve the offset for the same
+ * vector also to be used on other cores. An offset is freed by
+ * setting the entry to APIC_EILVT_MASKED.
+ *
+ * If the BIOS is right, there should be no conflicts. Otherwise a
+ * "[Firmware Bug]: ..." error message is generated. However, if
+ * software does not properly determine the offsets, it is not
+ * necessarily a BIOS bug.
*/
-#define APIC_EILVT_LVTOFF_MCE 0
-#define APIC_EILVT_LVTOFF_IBS 1
+static atomic_t eilvt_offsets[APIC_EILVT_NR_MAX];
-static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
+static inline int eilvt_entry_is_changeable(unsigned int old, unsigned int new)
{
- unsigned long reg = (lvt_off << 4) + APIC_EILVTn(0);
- unsigned int v = (mask << 16) | (msg_type << 8) | vector;
-
- apic_write(reg, v);
+ return (old & APIC_EILVT_MASKED)
+ || (new == APIC_EILVT_MASKED)
+ || ((new & ~APIC_EILVT_MASKED) == old);
}
-u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
+static unsigned int reserve_eilvt_offset(int offset, unsigned int new)
{
- setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
- return APIC_EILVT_LVTOFF_MCE;
+ unsigned int rsvd, vector;
+
+ if (offset >= APIC_EILVT_NR_MAX)
+ return ~0;
+
+ rsvd = atomic_read(&eilvt_offsets[offset]);
+ do {
+ vector = rsvd & ~APIC_EILVT_MASKED; /* 0: unassigned */
+ if (vector && !eilvt_entry_is_changeable(vector, new))
+ /* may not change if vectors are different */
+ return rsvd;
+ } while (!atomic_try_cmpxchg(&eilvt_offsets[offset], &rsvd, new));
+
+ rsvd = new & ~APIC_EILVT_MASKED;
+ if (rsvd && rsvd != vector)
+ pr_info("LVT offset %d assigned for vector 0x%02x\n",
+ offset, rsvd);
+
+ return new;
}
-u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
+/*
+ * If mask=1, the LVT entry does not generate interrupts while mask=0
+ * enables the vector. See also the BKDGs. Must be called with
+ * preemption disabled.
+ */
+
+int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask)
{
- setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
- return APIC_EILVT_LVTOFF_IBS;
+ unsigned long reg = APIC_EILVTn(offset);
+ unsigned int new, old, reserved;
+
+ new = (mask << 16) | (msg_type << 8) | vector;
+ old = apic_read(reg);
+ reserved = reserve_eilvt_offset(offset, new);
+
+ if (reserved != new) {
+ pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for "
+ "vector 0x%x, but the register is already in use for "
+ "vector 0x%x on another cpu\n",
+ smp_processor_id(), reg, offset, new, reserved);
+ return -EINVAL;
+ }
+
+ if (!eilvt_entry_is_changeable(old, new)) {
+ pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for "
+ "vector 0x%x, but the register is already in use for "
+ "vector 0x%x on this cpu\n",
+ smp_processor_id(), reg, offset, new, old);
+ return -EBUSY;
+ }
+
+ apic_write(reg, new);
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs);
+EXPORT_SYMBOL_GPL(setup_APIC_eilvt);
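/*
 * [Editor's note] A hypothetical caller sketch for the reworked
 * setup_APIC_eilvt() above.  It assumes the usual extended-LVT message
 * type constants (e.g. APIC_EILVT_MSG_FIX) from asm/apicdef.h; the real
 * users are subsystems such as the MCE threshold and IBS code.
 */
static int example_reserve_eilvt(u8 offset, u8 vector)
{
	/* Fixed delivery mode, mask = 0 so the entry generates interrupts */
	return setup_APIC_eilvt(offset, vector, APIC_EILVT_MSG_FIX, 0);
}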
/*
* Program the next event, relative to now
@@ -431,40 +419,65 @@ static int lapic_next_event(unsigned long delta,
return 0;
}
-/*
- * Setup the lapic timer in periodic or oneshot mode
- */
-static void lapic_timer_setup(enum clock_event_mode mode,
- struct clock_event_device *evt)
+static int lapic_next_deadline(unsigned long delta,
+ struct clock_event_device *evt)
+{
+ u64 tsc;
+
+ /* This MSR is special and need a special fence: */
+ weak_wrmsr_fence();
+
+ tsc = rdtsc();
+ wrmsrq(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR));
+ return 0;
+}
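/*
 * [Editor's note] Worked example for lapic_next_deadline() above,
 * assuming a 2 GHz TSC (tsc_khz = 2,000,000).  The clockevent is
 * registered with a frequency of tsc_khz * 1000 / TSC_DIVISOR, so one
 * 'delta' unit corresponds to TSC_DIVISOR (8) TSC cycles:
 *
 *   delta = 250,000  ->  250,000 * 8 = 2,000,000 TSC cycles
 *                    ->  the deadline fires ~1 ms after 'tsc'.
 */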
+
+static int lapic_timer_shutdown(struct clock_event_device *evt)
{
- unsigned long flags;
unsigned int v;
/* Lapic used as dummy for broadcast ? */
if (evt->features & CLOCK_EVT_FEAT_DUMMY)
- return;
+ return 0;
- local_irq_save(flags);
+ v = apic_read(APIC_LVTT);
+ v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
+ apic_write(APIC_LVTT, v);
- switch (mode) {
- case CLOCK_EVT_MODE_PERIODIC:
- case CLOCK_EVT_MODE_ONESHOT:
- __setup_APIC_LVTT(calibration_result,
- mode != CLOCK_EVT_MODE_PERIODIC, 1);
- break;
- case CLOCK_EVT_MODE_UNUSED:
- case CLOCK_EVT_MODE_SHUTDOWN:
- v = apic_read(APIC_LVTT);
- v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
- apic_write(APIC_LVTT, v);
- apic_write(APIC_TMICT, 0xffffffff);
- break;
- case CLOCK_EVT_MODE_RESUME:
- /* Nothing to do here */
- break;
- }
+ /*
+ * Setting APIC_LVT_MASKED (above) should be enough to tell
+ * the hardware that this timer will never fire. But AMD
+ * erratum 411 and some Intel CPU behavior circa 2024 say
+ * otherwise. Time for belt and suspenders programming: mask
+ * the timer _and_ zero the counter registers:
+ */
+ if (v & APIC_LVT_TIMER_TSCDEADLINE)
+ wrmsrq(MSR_IA32_TSC_DEADLINE, 0);
+ else
+ apic_write(APIC_TMICT, 0);
- local_irq_restore(flags);
+ return 0;
+}
+
+static inline int
+lapic_timer_set_periodic_oneshot(struct clock_event_device *evt, bool oneshot)
+{
+ /* Lapic used as dummy for broadcast ? */
+ if (evt->features & CLOCK_EVT_FEAT_DUMMY)
+ return 0;
+
+ __setup_APIC_LVTT(lapic_timer_period, oneshot, 1);
+ return 0;
+}
+
+static int lapic_timer_set_periodic(struct clock_event_device *evt)
+{
+ return lapic_timer_set_periodic_oneshot(evt, false);
+}
+
+static int lapic_timer_set_oneshot(struct clock_event_device *evt)
+{
+ return lapic_timer_set_periodic_oneshot(evt, true);
}
/*
@@ -473,35 +486,147 @@ static void lapic_timer_setup(enum clock_event_mode mode,
static void lapic_timer_broadcast(const struct cpumask *mask)
{
#ifdef CONFIG_SMP
- apic->send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
+ __apic_send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
#endif
}
+
/*
- * Setup the local APIC timer for this CPU. Copy the initilized values
+ * The local apic timer can be used for any function which is CPU local.
+ */
+static struct clock_event_device lapic_clockevent = {
+ .name = "lapic",
+ .features = CLOCK_EVT_FEAT_PERIODIC |
+ CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_C3STOP
+ | CLOCK_EVT_FEAT_DUMMY,
+ .shift = 32,
+ .set_state_shutdown = lapic_timer_shutdown,
+ .set_state_periodic = lapic_timer_set_periodic,
+ .set_state_oneshot = lapic_timer_set_oneshot,
+ .set_state_oneshot_stopped = lapic_timer_shutdown,
+ .set_next_event = lapic_next_event,
+ .broadcast = lapic_timer_broadcast,
+ .rating = 100,
+ .irq = -1,
+};
+static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
+
+static const struct x86_cpu_id deadline_match[] __initconst = {
+ X86_MATCH_VFM_STEPS(INTEL_HASWELL_X, 0x2, 0x2, 0x3a), /* EP */
+ X86_MATCH_VFM_STEPS(INTEL_HASWELL_X, 0x4, 0x4, 0x0f), /* EX */
+
+ X86_MATCH_VFM(INTEL_BROADWELL_X, 0x0b000020),
+
+ X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x2, 0x2, 0x00000011),
+ X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x3, 0x3, 0x0700000e),
+ X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x4, 0x4, 0x0f00000c),
+ X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x5, 0x5, 0x0e000003),
+
+ X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 0x3, 0x3, 0x01000136),
+ X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 0x4, 0x4, 0x02000014),
+ X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 0x5, 0xf, 0),
+
+ X86_MATCH_VFM(INTEL_HASWELL, 0x22),
+ X86_MATCH_VFM(INTEL_HASWELL_L, 0x20),
+ X86_MATCH_VFM(INTEL_HASWELL_G, 0x17),
+
+ X86_MATCH_VFM(INTEL_BROADWELL, 0x25),
+ X86_MATCH_VFM(INTEL_BROADWELL_G, 0x17),
+
+ X86_MATCH_VFM(INTEL_SKYLAKE_L, 0xb2),
+ X86_MATCH_VFM(INTEL_SKYLAKE, 0xb2),
+
+ X86_MATCH_VFM(INTEL_KABYLAKE_L, 0x52),
+ X86_MATCH_VFM(INTEL_KABYLAKE, 0x52),
+
+ {},
+};
+
+static __init bool apic_validate_deadline_timer(void)
+{
+ const struct x86_cpu_id *m;
+ u32 rev;
+
+ if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
+ return false;
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return true;
+
+ m = x86_match_cpu(deadline_match);
+ if (!m)
+ return true;
+
+ rev = (u32)m->driver_data;
+
+ if (boot_cpu_data.microcode >= rev)
+ return true;
+
+ setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
+ pr_err(FW_BUG "TSC_DEADLINE disabled due to Errata; "
+ "please update microcode to version: 0x%x (or later)\n", rev);
+ return false;
+}
+
+/*
+ * Setup the local APIC timer for this CPU. Copy the initialized values
* of the boot CPU and register the clock event in the framework.
*/
-static void __cpuinit setup_APIC_timer(void)
+static void setup_APIC_timer(void)
{
- struct clock_event_device *levt = &__get_cpu_var(lapic_events);
+ struct clock_event_device *levt = this_cpu_ptr(&lapic_events);
- if (cpu_has(&current_cpu_data, X86_FEATURE_ARAT)) {
+ if (this_cpu_has(X86_FEATURE_ARAT)) {
lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP;
- /* Make LAPIC timer preferrable over percpu HPET */
+ /* Make LAPIC timer preferable over percpu HPET */
lapic_clockevent.rating = 150;
}
memcpy(levt, &lapic_clockevent, sizeof(*levt));
levt->cpumask = cpumask_of(smp_processor_id());
- clockevents_register_device(levt);
+ if (this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) {
+ levt->name = "lapic-deadline";
+ levt->features &= ~(CLOCK_EVT_FEAT_PERIODIC |
+ CLOCK_EVT_FEAT_DUMMY);
+ levt->set_next_event = lapic_next_deadline;
+ clockevents_config_and_register(levt,
+ tsc_khz * (1000 / TSC_DIVISOR),
+ 0xF, ~0UL);
+ } else
+ clockevents_register_device(levt);
+
+ apic_update_vector(smp_processor_id(), LOCAL_TIMER_VECTOR, true);
+}
+
+/*
+ * Install the updated TSC frequency from recalibration at the TSC
+ * deadline clockevent devices.
+ */
+static void __lapic_update_tsc_freq(void *info)
+{
+ struct clock_event_device *levt = this_cpu_ptr(&lapic_events);
+
+ if (!this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
+ return;
+
+ clockevents_update_freq(levt, tsc_khz * (1000 / TSC_DIVISOR));
+}
+
+void lapic_update_tsc_freq(void)
+{
+ /*
+ * The clockevent device's ->mult and ->shift can both be
+ * changed. In order to avoid races, schedule the frequency
+ * update code on each CPU.
+ */
+ on_each_cpu(__lapic_update_tsc_freq, NULL, 0);
}
/*
* In this functions we calibrate APIC bus clocks to the external timer.
*
* We want to do the calibration only once since we want to have local timer
- * irqs syncron. CPUs connected by the same APIC bus have the very same bus
+ * irqs synchronous. CPUs connected by the same APIC bus have the very same bus
* frequency.
*
* This was previously done by reading the PIT/HPET and waiting for a wrap
@@ -523,20 +648,20 @@ static void __cpuinit setup_APIC_timer(void)
static __initdata int lapic_cal_loops = -1;
static __initdata long lapic_cal_t1, lapic_cal_t2;
static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2;
-static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2;
+static __initdata u32 lapic_cal_pm1, lapic_cal_pm2;
static __initdata unsigned long lapic_cal_j1, lapic_cal_j2;
/*
- * Temporary interrupt handler.
+ * Temporary interrupt handler and polled calibration function.
*/
static void __init lapic_cal_handler(struct clock_event_device *dev)
{
unsigned long long tsc = 0;
long tapic = apic_read(APIC_TMCCT);
- unsigned long pm = acpi_pm_read_early();
+ u32 pm = acpi_pm_read_early();
- if (cpu_has_tsc)
- rdtscll(tsc);
+ if (boot_cpu_has(X86_FEATURE_TSC))
+ tsc = rdtsc();
switch (lapic_cal_loops++) {
case 0:
@@ -558,7 +683,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
}
static int __init
-calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
+calibrate_by_pmtimer(u32 deltapm, long *delta, long *deltatsc)
{
const long pm_100ms = PMTMR_TICKS_PER_SEC / 10;
const long pm_thresh = pm_100ms / 100;
@@ -569,7 +694,7 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
return -1;
#endif
- apic_printk(APIC_VERBOSE, "... PM-Timer delta = %ld\n", deltapm);
+ apic_pr_verbose("... PM-Timer delta = %u\n", deltapm);
/* Check, if the PM timer is available */
if (!deltapm)
@@ -579,14 +704,14 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
if (deltapm > (pm_100ms - pm_thresh) &&
deltapm < (pm_100ms + pm_thresh)) {
- apic_printk(APIC_VERBOSE, "... PM-Timer result ok\n");
+ apic_pr_verbose("... PM-Timer result ok\n");
return 0;
}
res = (((u64)deltapm) * mult) >> 22;
do_div(res, 1000000);
- pr_warning("APIC calibration not consistent "
- "with PM-Timer: %ldms instead of 100ms\n",(long)res);
+ pr_warn("APIC calibration not consistent with PM-Timer: %ldms instead of 100ms\n",
+ (long)res);
/* Correct the lapic counter value */
res = (((u64)(*delta)) * pm_100ms);
@@ -596,31 +721,111 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
*delta = (long)res;
/* Correct the tsc counter value */
- if (cpu_has_tsc) {
+ if (boot_cpu_has(X86_FEATURE_TSC)) {
res = (((u64)(*deltatsc)) * pm_100ms);
do_div(res, deltapm);
- apic_printk(APIC_VERBOSE, "TSC delta adjusted to "
- "PM-Timer: %lu (%ld) \n",
- (unsigned long)res, *deltatsc);
+ apic_pr_verbose("TSC delta adjusted to PM-Timer: %lu (%ld)\n",
+ (unsigned long)res, *deltatsc);
*deltatsc = (long)res;
}
return 0;
}
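/*
 * [Editor's note] Worked example for the PM-timer correction above.
 * With PMTMR_TICKS_PER_SEC = 3,579,545, pm_100ms is ~357,954 ticks.  If
 * the measured deltapm were ~393,750 ticks (i.e. the nominal 100 ms
 * calibration window actually lasted ~110 ms), both *delta and *deltatsc
 * are scaled by 357,954 / 393,750 ~ 0.909, so they again correspond to
 * a true 100 ms interval.  Numbers are illustrative only.
 */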
+static int __init lapic_init_clockevent(void)
+{
+ if (!lapic_timer_period)
+ return -1;
+
+ /* Calculate the scaled math multiplication factor */
+ lapic_clockevent.mult = div_sc(lapic_timer_period/APIC_DIVISOR,
+ TICK_NSEC, lapic_clockevent.shift);
+ lapic_clockevent.max_delta_ns =
+ clockevent_delta2ns(0x7FFFFFFF, &lapic_clockevent);
+ lapic_clockevent.max_delta_ticks = 0x7FFFFFFF;
+ lapic_clockevent.min_delta_ns =
+ clockevent_delta2ns(0xF, &lapic_clockevent);
+ lapic_clockevent.min_delta_ticks = 0xF;
+
+ return 0;
+}
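/*
 * [Editor's note] Worked example for the mult/shift setup above,
 * assuming HZ = 250 (TICK_NSEC = 4,000,000 ns) and a 100 MHz APIC bus
 * clock, i.e. lapic_timer_period = 400,000 bus clocks per tick:
 *
 *   counts per tick = 400,000 / APIC_DIVISOR = 25,000   (6.25 MHz)
 *   mult            = (25,000 << 32) / 4,000,000 ~ 26,843,545
 *
 * The clockevent core then converts a requested delay to timer counts
 * as counts = (ns * mult) >> shift, e.g. 1,000,000 ns -> ~6,250 counts.
 */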
+
+bool __init apic_needs_pit(void)
+{
+ /*
+ * If the frequencies are not known, PIT is required for both TSC
+ * and apic timer calibration.
+ */
+ if (!tsc_khz || !cpu_khz)
+ return true;
+
+ /* Is there an APIC at all or is it disabled? */
+ if (!boot_cpu_has(X86_FEATURE_APIC) || apic_is_disabled)
+ return true;
+
+ /*
+ * If interrupt delivery mode is legacy PIC or virtual wire without
+ * configuration, the local APIC timer won't be set up. Make sure
+ * that the PIT is initialized.
+ */
+ if (apic_intr_mode == APIC_PIC ||
+ apic_intr_mode == APIC_VIRTUAL_WIRE_NO_CONFIG)
+ return true;
+
+ /* Virt guests may lack ARAT, but still have DEADLINE */
+ if (!boot_cpu_has(X86_FEATURE_ARAT))
+ return true;
+
+ /* Deadline timer is based on TSC so no further PIT action required */
+ if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
+ return false;
+
+ /* APIC timer disabled? */
+ if (disable_apic_timer)
+ return true;
+ /*
+ * The APIC timer frequency is known already, no PIT calibration
+ * required. If unknown, let the PIT be initialized.
+ */
+ return lapic_timer_period == 0;
+}
+
static int __init calibrate_APIC_clock(void)
{
- struct clock_event_device *levt = &__get_cpu_var(lapic_events);
- void (*real_handler)(struct clock_event_device *dev);
+ struct clock_event_device *levt = this_cpu_ptr(&lapic_events);
+ u64 tsc_perj = 0, tsc_start = 0;
+ long delta_tsc_khz, bus_khz;
+ unsigned long jif_start;
unsigned long deltaj;
long delta, deltatsc;
int pm_referenced = 0;
- local_irq_disable();
+ if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
+ return 0;
- /* Replace the global interrupt handler */
- real_handler = global_clock_event->event_handler;
- global_clock_event->event_handler = lapic_cal_handler;
+ /*
+ * Check if lapic timer has already been calibrated by platform
+ * specific routine, such as tsc calibration code. If so just fill
+ * in the clockevent structure and return.
+ */
+ if (!lapic_init_clockevent()) {
+ apic_pr_verbose("lapic timer already calibrated %d\n", lapic_timer_period);
+ /*
+ * Direct calibration methods must have an always running
+ * local APIC timer, no need for broadcast timer.
+ */
+ lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
+ return 0;
+ }
+
+ apic_pr_verbose("Using local APIC timer interrupts. Calibrating APIC timer ...\n");
+
+ /*
+ * There are platforms w/o global clockevent devices. Instead of
+ * making the calibration conditional on that, use a polling based
+ * approach everywhere.
+ */
+ local_irq_disable();
/*
* Setup the APIC counter to maximum. There is no way the lapic
@@ -628,20 +833,55 @@ static int __init calibrate_APIC_clock(void)
*/
__setup_APIC_LVTT(0xffffffff, 0, 0);
- /* Let the interrupts run */
+ /*
+ * Methods to terminate the calibration loop:
+ * 1) Global clockevent if available (jiffies)
+ * 2) TSC if available and frequency is known
+ */
+ jif_start = READ_ONCE(jiffies);
+
+ if (tsc_khz) {
+ tsc_start = rdtsc();
+ tsc_perj = div_u64((u64)tsc_khz * 1000, HZ);
+ }
+
+ /*
+ * Enable interrupts so the tick can fire, if a global
+ * clockevent device is available
+ */
local_irq_enable();
- while (lapic_cal_loops <= LAPIC_CAL_LOOPS)
- cpu_relax();
+ while (lapic_cal_loops <= LAPIC_CAL_LOOPS) {
+ /* Wait for a tick to elapse */
+ while (1) {
+ if (tsc_khz) {
+ u64 tsc_now = rdtsc();
+ if ((tsc_now - tsc_start) >= tsc_perj) {
+ tsc_start += tsc_perj;
+ break;
+ }
+ } else {
+ unsigned long jif_now = READ_ONCE(jiffies);
+
+ if (time_after(jif_now, jif_start)) {
+ jif_start = jif_now;
+ break;
+ }
+ }
+ cpu_relax();
+ }
- local_irq_disable();
+ /* Invoke the calibration routine */
+ local_irq_disable();
+ lapic_cal_handler(NULL);
+ local_irq_enable();
+ }
- /* Restore the real event handler */
- global_clock_event->event_handler = real_handler;
+ local_irq_disable();
/* Build delta t1-t2 as apic timer counts down */
delta = lapic_cal_t1 - lapic_cal_t2;
- apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta);
+ apic_pr_verbose("... lapic delta = %ld\n", delta);
deltatsc = (long)(lapic_cal_tsc2 - lapic_cal_tsc1);
@@ -649,56 +889,48 @@ static int __init calibrate_APIC_clock(void)
pm_referenced = !calibrate_by_pmtimer(lapic_cal_pm2 - lapic_cal_pm1,
&delta, &deltatsc);
- /* Calculate the scaled math multiplication factor */
- lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS,
- lapic_clockevent.shift);
- lapic_clockevent.max_delta_ns =
- clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
- lapic_clockevent.min_delta_ns =
- clockevent_delta2ns(0xF, &lapic_clockevent);
+ lapic_timer_period = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;
+ lapic_init_clockevent();
- calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;
+ apic_pr_verbose("..... delta %ld\n", delta);
+ apic_pr_verbose("..... mult: %u\n", lapic_clockevent.mult);
+ apic_pr_verbose("..... calibration result: %u\n", lapic_timer_period);
- apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta);
- apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult);
- apic_printk(APIC_VERBOSE, "..... calibration result: %u\n",
- calibration_result);
+ if (boot_cpu_has(X86_FEATURE_TSC)) {
+ delta_tsc_khz = (deltatsc * HZ) / (1000 * LAPIC_CAL_LOOPS);
- if (cpu_has_tsc) {
- apic_printk(APIC_VERBOSE, "..... CPU clock speed is "
- "%ld.%04ld MHz.\n",
- (deltatsc / LAPIC_CAL_LOOPS) / (1000000 / HZ),
- (deltatsc / LAPIC_CAL_LOOPS) % (1000000 / HZ));
+ apic_pr_verbose("..... CPU clock speed is %ld.%03ld MHz.\n",
+ delta_tsc_khz / 1000, delta_tsc_khz % 1000);
}
- apic_printk(APIC_VERBOSE, "..... host bus clock speed is "
- "%u.%04u MHz.\n",
- calibration_result / (1000000 / HZ),
- calibration_result % (1000000 / HZ));
+ bus_khz = (long)lapic_timer_period * HZ / 1000;
+ apic_pr_verbose("..... host bus clock speed is %ld.%03ld MHz.\n",
+ bus_khz / 1000, bus_khz % 1000);
/*
* Do a sanity check on the APIC calibration result
*/
- if (calibration_result < (1000000 / HZ)) {
+ if (lapic_timer_period < (1000000 / HZ)) {
local_irq_enable();
- pr_warning("APIC frequency too slow, disabling apic timer\n");
+ pr_warn("APIC frequency too slow, disabling apic timer\n");
return -1;
}
levt->features &= ~CLOCK_EVT_FEAT_DUMMY;
/*
- * PM timer calibration failed or not turned on
- * so lets try APIC timer based calibration
+ * PM timer calibration failed or not turned on so lets try APIC
+ * timer based calibration, if a global clockevent device is
+ * available.
*/
- if (!pm_referenced) {
- apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
+ if (!pm_referenced && global_clock_event) {
+ apic_pr_verbose("... verify APIC timer\n");
/*
* Setup the apic timer manually
*/
levt->event_handler = lapic_cal_handler;
- lapic_timer_setup(CLOCK_EVT_MODE_PERIODIC, levt);
+ lapic_timer_set_periodic(levt);
lapic_cal_loops = -1;
/* Let the interrupts run */
@@ -708,23 +940,24 @@ static int __init calibrate_APIC_clock(void)
cpu_relax();
/* Stop the lapic timer */
- lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt);
+ local_irq_disable();
+ lapic_timer_shutdown(levt);
/* Jiffies delta */
deltaj = lapic_cal_j2 - lapic_cal_j1;
- apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj);
+ apic_pr_verbose("... jiffies delta = %lu\n", deltaj);
/* Check, if the jiffies result is consistent */
if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2)
- apic_printk(APIC_VERBOSE, "... jiffies result ok\n");
+ apic_pr_verbose("... jiffies result ok\n");
else
levt->features |= CLOCK_EVT_FEAT_DUMMY;
- } else
- local_irq_enable();
+ }
+ local_irq_enable();
if (levt->features & CLOCK_EVT_FEAT_DUMMY) {
- pr_warning("APIC timer disabled due to verification failure\n");
- return -1;
+ pr_warn("APIC timer disabled due to verification failure\n");
+ return -1;
}
return 0;
@@ -753,9 +986,6 @@ void __init setup_boot_APIC_clock(void)
return;
}
- apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
- "calibrating APIC timer ...\n");
-
if (calibrate_APIC_clock()) {
/* No broadcast on UP ! */
if (num_possible_cpus() > 1)
@@ -768,19 +998,17 @@ void __init setup_boot_APIC_clock(void)
* PIT/HPET going. Otherwise register lapic as a dummy
* device.
*/
- if (nmi_watchdog != NMI_IO_APIC)
- lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
- else
- pr_warning("APIC timer registered as dummy,"
- " due to nmi_watchdog=%d!\n", nmi_watchdog);
+ lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
/* Setup the lapic or request the broadcast */
setup_APIC_timer();
+ amd_e400_c1e_apic_setup();
}
-void __cpuinit setup_secondary_APIC_clock(void)
+void setup_secondary_APIC_clock(void)
{
setup_APIC_timer();
+ amd_e400_c1e_apic_setup();
}
/*
@@ -788,8 +1016,7 @@ void __cpuinit setup_secondary_APIC_clock(void)
*/
static void local_apic_timer_interrupt(void)
{
- int cpu = smp_processor_id();
- struct clock_event_device *evt = &per_cpu(lapic_events, cpu);
+ struct clock_event_device *evt = this_cpu_ptr(&lapic_events);
/*
* Normally we should not be here till LAPIC has been initialized but
@@ -803,9 +1030,10 @@ static void local_apic_timer_interrupt(void)
* spurious.
*/
if (!evt->event_handler) {
- pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", cpu);
+ pr_warn("Spurious LAPIC timer interrupt on cpu %d\n",
+ smp_processor_id());
/* Switch it off */
- lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt);
+ lapic_timer_shutdown(evt);
return;
}
@@ -825,33 +1053,18 @@ static void local_apic_timer_interrupt(void)
* [ if a single-CPU system runs an SMP kernel then we call the local
* interrupt as well. Thus we cannot inline the local irq ... ]
*/
-void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_apic_timer_interrupt)
{
struct pt_regs *old_regs = set_irq_regs(regs);
- /*
- * NOTE! We'd better ACK the irq immediately,
- * because timer handling can be slow.
- */
- ack_APIC_irq();
- /*
- * update_process_times() expects us to have done irq_enter().
- * Besides, if we don't timer interrupts ignore the global
- * interrupt lock, which is the WrongThing (tm) to do.
- */
- exit_idle();
- irq_enter();
+ apic_eoi();
+ trace_local_timer_entry(LOCAL_TIMER_VECTOR);
local_apic_timer_interrupt();
- irq_exit();
+ trace_local_timer_exit(LOCAL_TIMER_VECTOR);
set_irq_regs(old_regs);
}
-int setup_profiling_timer(unsigned int multiplier)
-{
- return -EINVAL;
-}
-
/*
* Local APIC start and shutdown
*/
@@ -868,8 +1081,7 @@ void clear_local_APIC(void)
int maxlvt;
u32 v;
- /* APIC hasn't been mapped yet */
- if (!x2apic_mode && !apic_phys)
+ if (!apic_accessible())
return;
maxlvt = lapic_get_maxlvt();
@@ -932,25 +1144,40 @@ void clear_local_APIC(void)
}
/**
- * disable_local_APIC - clear and disable the local APIC
+ * apic_soft_disable - Clears and software disables the local APIC on hotplug
+ *
+ * Contrary to disable_local_APIC() this does not touch the enable bit in
+ * MSR_IA32_APICBASE. Clearing that bit on systems based on the 3 wire APIC
+ * bus would require a hardware reset as the APIC would lose track of bus
+ * arbitration. On systems with FSB delivery APICBASE could be disabled,
+ * but it has to be guaranteed that no interrupt is sent to the APIC while
+ * in that state and it's not clear from the SDM whether it still responds
+ * to INIT/SIPI messages. Stay on the safe side and use software disable.
*/
-void disable_local_APIC(void)
+void apic_soft_disable(void)
{
- unsigned int value;
-
- /* APIC hasn't been mapped yet */
- if (!apic_phys)
- return;
+ u32 value;
clear_local_APIC();
- /*
- * Disable APIC (implies clearing of registers
- * for 82489DX!).
- */
+ /* Soft disable APIC (implies clearing of registers for 82489DX!). */
value = apic_read(APIC_SPIV);
value &= ~APIC_SPIV_APIC_ENABLED;
apic_write(APIC_SPIV, value);
+}
+
+/**
+ * disable_local_APIC - clear and disable the local APIC
+ */
+void disable_local_APIC(void)
+{
+ if (!apic_accessible())
+ return;
+
+ if (apic->teardown)
+ apic->teardown();
+
+ apic_soft_disable();
#ifdef CONFIG_X86_32
/*
@@ -977,7 +1204,7 @@ void lapic_shutdown(void)
{
unsigned long flags;
- if (!cpu_has_apic)
+ if (!boot_cpu_has(X86_FEATURE_APIC) && !apic_from_smp_config())
return;
local_irq_save(flags);
@@ -993,67 +1220,6 @@ void lapic_shutdown(void)
local_irq_restore(flags);
}
-/*
- * This is to verify that we're looking at a real local APIC.
- * Check these against your board if the CPUs aren't getting
- * started for no apparent reason.
- */
-int __init verify_local_APIC(void)
-{
- unsigned int reg0, reg1;
-
- /*
- * The version register is read-only in a real APIC.
- */
- reg0 = apic_read(APIC_LVR);
- apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0);
- apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK);
- reg1 = apic_read(APIC_LVR);
- apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1);
-
- /*
- * The two version reads above should print the same
- * numbers. If the second one is different, then we
- * poke at a non-APIC.
- */
- if (reg1 != reg0)
- return 0;
-
- /*
- * Check if the version looks reasonably.
- */
- reg1 = GET_APIC_VERSION(reg0);
- if (reg1 == 0x00 || reg1 == 0xff)
- return 0;
- reg1 = lapic_get_maxlvt();
- if (reg1 < 0x02 || reg1 == 0xff)
- return 0;
-
- /*
- * The ID register is read/write in a real APIC.
- */
- reg0 = apic_read(APIC_ID);
- apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
- apic_write(APIC_ID, reg0 ^ apic->apic_id_mask);
- reg1 = apic_read(APIC_ID);
- apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
- apic_write(APIC_ID, reg0);
- if (reg1 != (reg0 ^ apic->apic_id_mask))
- return 0;
-
- /*
- * The next two are just to see if we have sane values.
- * They're only really relevant if we're in Virtual Wire
- * compatibility mode, but most boxes are anymore.
- */
- reg0 = apic_read(APIC_LVT0);
- apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0);
- reg1 = apic_read(APIC_LVT1);
- apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1);
-
- return 1;
-}
-
/**
* sync_Arb_IDs - synchronize APIC bus arbitration IDs
*/
@@ -1071,9 +1237,71 @@ void __init sync_Arb_IDs(void)
*/
apic_wait_icr_idle();
- apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
- apic_write(APIC_ICR, APIC_DEST_ALLINC |
- APIC_INT_LEVELTRIG | APIC_DM_INIT);
+ apic_pr_debug("Synchronizing Arb IDs.\n");
+ apic_write(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT);
+}
+
+enum apic_intr_mode_id apic_intr_mode __ro_after_init;
+
+static int __init __apic_intr_mode_select(void)
+{
+ /* Check kernel option */
+ if (apic_is_disabled) {
+ pr_info("APIC disabled via kernel command line\n");
+ return APIC_PIC;
+ }
+
+ /* Check BIOS */
+#ifdef CONFIG_X86_64
+	/* On 64-bit, the APIC must be integrated. Check the local APIC only. */
+ if (!boot_cpu_has(X86_FEATURE_APIC)) {
+ apic_is_disabled = true;
+ pr_info("APIC disabled by BIOS\n");
+ return APIC_PIC;
+ }
+#else
+ /* On 32-bit, the APIC may be integrated APIC or 82489DX */
+
+ /* Neither 82489DX nor integrated APIC ? */
+ if (!boot_cpu_has(X86_FEATURE_APIC) && !smp_found_config) {
+ apic_is_disabled = true;
+ return APIC_PIC;
+ }
+
+	/* Does the BIOS pretend there is an integrated APIC? */
+ if (!boot_cpu_has(X86_FEATURE_APIC) &&
+ APIC_INTEGRATED(boot_cpu_apic_version)) {
+ apic_is_disabled = true;
+ pr_err(FW_BUG "Local APIC not detected, force emulation\n");
+ return APIC_PIC;
+ }
+#endif
+
+ /* Check MP table or ACPI MADT configuration */
+ if (!smp_found_config) {
+ disable_ioapic_support();
+ if (!acpi_lapic) {
+ pr_info("APIC: ACPI MADT or MP tables are not detected\n");
+ return APIC_VIRTUAL_WIRE_NO_CONFIG;
+ }
+ return APIC_VIRTUAL_WIRE;
+ }
+
+#ifdef CONFIG_SMP
+ /* If SMP should be disabled, then really disable it! */
+ if (!setup_max_cpus) {
+ pr_info("APIC: SMP mode deactivated\n");
+ return APIC_SYMMETRIC_IO_NO_ROUTING;
+ }
+#endif
+
+ return APIC_SYMMETRIC_IO;
+}
+
+/* Select the interrupt delivery mode for the BSP */
+void __init apic_intr_mode_select(void)
+{
+ apic_intr_mode = __apic_intr_mode_select();
}
/*
@@ -1087,7 +1315,7 @@ void __init init_bsp_APIC(void)
* Don't do the setup now if we have a SMP BIOS as the
* through-I/O-APIC virtual wire mode might be active.
*/
- if (smp_found_config || !cpu_has_apic)
+ if (smp_found_config || !boot_cpu_has(X86_FEATURE_APIC))
return;
/*
@@ -1120,10 +1348,46 @@ void __init init_bsp_APIC(void)
value = APIC_DM_NMI;
if (!lapic_is_integrated()) /* 82489DX */
value |= APIC_LVT_LEVEL_TRIGGER;
+ if (apic_extnmi == APIC_EXTNMI_NONE)
+ value |= APIC_LVT_MASKED;
apic_write(APIC_LVT1, value);
}
-static void __cpuinit lapic_setup_esr(void)
+static void __init apic_bsp_setup(bool upmode);
+
+/* Init the interrupt delivery mode for the BSP */
+void __init apic_intr_mode_init(void)
+{
+ bool upmode = IS_ENABLED(CONFIG_UP_LATE_INIT);
+
+ switch (apic_intr_mode) {
+ case APIC_PIC:
+ pr_info("APIC: Keep in PIC mode(8259)\n");
+ return;
+ case APIC_VIRTUAL_WIRE:
+ pr_info("APIC: Switch to virtual wire mode setup\n");
+ break;
+ case APIC_VIRTUAL_WIRE_NO_CONFIG:
+ pr_info("APIC: Switch to virtual wire mode setup with no configuration\n");
+ upmode = true;
+ break;
+ case APIC_SYMMETRIC_IO:
+ pr_info("APIC: Switch to symmetric I/O mode setup\n");
+ break;
+ case APIC_SYMMETRIC_IO_NO_ROUTING:
+ pr_info("APIC: Switch to symmetric I/O mode setup in no SMP routine\n");
+ break;
+ }
+
+ x86_64_probe_apic();
+
+ if (x86_platform.apic_post_init)
+ x86_platform.apic_post_init();
+
+ apic_bsp_setup(upmode);
+}
+
+static void lapic_setup_esr(void)
{
unsigned int oldvalue, value, maxlvt;
@@ -1158,26 +1422,105 @@ static void __cpuinit lapic_setup_esr(void)
if (maxlvt > 3)
apic_write(APIC_ESR, 0);
value = apic_read(APIC_ESR);
- if (value != oldvalue)
- apic_printk(APIC_VERBOSE, "ESR value before enabling "
- "vector: 0x%08x after: 0x%08x\n",
- oldvalue, value);
+ if (value != oldvalue) {
+ apic_pr_verbose("ESR value before enabling vector: 0x%08x after: 0x%08x\n",
+ oldvalue, value);
+ }
+}
+
+#define APIC_IR_REGS APIC_ISR_NR
+#define APIC_IR_BITS (APIC_IR_REGS * 32)
+#define APIC_IR_MAPSIZE (APIC_IR_BITS / BITS_PER_LONG)
+
+union apic_ir {
+ unsigned long map[APIC_IR_MAPSIZE];
+ u32 regs[APIC_IR_REGS];
+};
+
+static bool apic_check_and_eoi_isr(union apic_ir *isr)
+{
+ int i, bit;
+
+ /* Read the ISRs */
+ for (i = 0; i < APIC_IR_REGS; i++)
+ isr->regs[i] = apic_read(APIC_ISR + i * 0x10);
+
+	/* If the ISR map is empty, nothing to do here. */
+ if (bitmap_empty(isr->map, APIC_IR_BITS))
+ return true;
+
+ /*
+ * There can be multiple ISR bits set when a high priority
+ * interrupt preempted a lower priority one. Issue an EOI for each
+ * set bit. The priority traversal order does not matter as there
+ * can't be new ISR bits raised at this point. What matters is that
+ * an EOI is issued for each ISR bit.
+ */
+ for_each_set_bit(bit, isr->map, APIC_IR_BITS)
+ apic_eoi();
+
+ /* Reread the ISRs, they should be empty now */
+ for (i = 0; i < APIC_IR_REGS; i++)
+ isr->regs[i] = apic_read(APIC_ISR + i * 0x10);
+
+ return bitmap_empty(isr->map, APIC_IR_BITS);
}
+/*
+ * If a CPU services an interrupt and crashes before issuing EOI to the
+ * local APIC, the corresponding ISR bit is still set when the crashing CPU
+ * jumps into a crash kernel. Read the ISR and issue an EOI for each set
+ * bit to acknowledge it as otherwise these slots would be locked forever
+ * waiting for an EOI.
+ *
+ * If there are pending bits in the IRR, then they won't be converted into
+ * ISR bits as the CPU has interrupts disabled. They will be delivered once
+ * the CPU enables interrupts and there is nothing which can prevent that.
+ *
+ * In the worst case this results in spurious interrupt warnings.
+ */
+static void apic_clear_isr(void)
+{
+ union apic_ir ir;
+ unsigned int i;
+
+ if (!apic_check_and_eoi_isr(&ir))
+ pr_warn("APIC: Stale ISR: %256pb\n", ir.map);
+
+ for (i = 0; i < APIC_IR_REGS; i++)
+ ir.regs[i] = apic_read(APIC_IRR + i * 0x10);
+
+ if (!bitmap_empty(ir.map, APIC_IR_BITS))
+ pr_warn("APIC: Stale IRR: %256pb\n", ir.map);
+}
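/*
 * [Editor's note] Layout sketch for the union used above: the local
 * APIC exposes the 256 ISR/IRR bits as APIC_IR_REGS (8) 32-bit
 * registers spaced 0x10 apart, so vector V is register V / 32,
 * bit V % 32.  A hypothetical helper, illustrative only:
 */
static inline bool example_vector_in_service(const union apic_ir *isr,
					     unsigned int vector)
{
	return test_bit(vector, isr->map);	/* vector 0..255 */
}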
/**
* setup_local_APIC - setup the local APIC
+ *
+ * Used to set up the local APIC while initializing the BSP or bringing up APs.
+ * Always called with preemption disabled.
*/
-void __cpuinit setup_local_APIC(void)
+static void setup_local_APIC(void)
{
+ int cpu = smp_processor_id();
unsigned int value;
- int i, j;
- if (disable_apic) {
- arch_disable_smp_support();
+ if (apic_is_disabled) {
+ disable_ioapic_support();
return;
}
+ if (apic->setup)
+ apic->setup();
+
+ /*
+ * If this comes from kexec/kcrash the APIC might be enabled in
+ * SPIV. Soft disable it before doing further initialization.
+ */
+ value = apic_read(APIC_SPIV);
+ value &= ~APIC_SPIV_APIC_ENABLED;
+ apic_write(APIC_SPIV, value);
+
#ifdef CONFIG_X86_32
/* Pound the ESR really hard over the head with a big hammer - mbligh */
if (lapic_is_integrated() && apic->disable_esr) {
@@ -1187,50 +1530,28 @@ void __cpuinit setup_local_APIC(void)
apic_write(APIC_ESR, 0);
}
#endif
- perf_counters_lapic_init();
-
- preempt_disable();
-
- /*
- * Double-check whether this APIC is really registered.
- * This is meaningless in clustered apic mode, so we skip it.
- */
- if (!apic->apic_id_registered())
- BUG();
-
/*
* Intel recommends to set DFR, LDR and TPR before enabling
* an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
- * document number 292116). So here it goes...
+ * document number 292116).
+ *
+ * Except for APICs which operate in physical destination mode.
*/
- apic->init_apic_ldr();
+ if (apic->init_apic_ldr)
+ apic->init_apic_ldr();
/*
- * Set Task Priority to 'accept all'. We never change this
- * later on.
+ * Set Task Priority to 'accept all except vectors 0-31'. An APIC
+ * vector in the 16-31 range could be delivered if TPR == 0, but we
+ * would think it's an exception and terrible things will happen. We
+ * never change this later on.
*/
value = apic_read(APIC_TASKPRI);
value &= ~APIC_TPRI_MASK;
+ value |= 0x10;
apic_write(APIC_TASKPRI, value);
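	/*
	 * [Editor's note] With TPR = 0x10 the task-priority class is 1, so
	 * delivery of vectors 16-31 (priority class 1) is inhibited, which
	 * is what the "accept all except vectors 0-31" comment refers to;
	 * vectors 0-15 are illegal for interrupt delivery anyway.
	 */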
- /*
- * After a crash, we no longer service the interrupts and a pending
- * interrupt from previous kernel might still have ISR bit set.
- *
- * Most probably by now CPU has serviced that pending interrupt and
- * it might not have done the ack_APIC_irq() because it thought,
- * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it
- * does not clear the ISR bit and cpu thinks it has already serivced
- * the interrupt. Hence a vector might get locked. It was noticed
- * for timer irq (vector 0x31). Issue an extra EOI to clear ISR.
- */
- for (i = APIC_ISR_NR - 1; i >= 0; i--) {
- value = apic_read(APIC_ISR + i*0x10);
- for (j = 31; j >= 0; j--) {
- if (value & (1<<j))
- ack_APIC_irq();
- }
- }
+ apic_clear_isr();
/*
* Now that we are all set up, enable the APIC
@@ -1258,9 +1579,8 @@ void __cpuinit setup_local_APIC(void)
*/
/*
* Actually disabling the focus CPU check just makes the hang less
- * frequent as it makes the interrupt distributon model be more
+ * frequent as it makes the interrupt distribution model be more
* like LRU than MRU (the short-term load is more even across CPUs).
- * See also the comment in end_level_ioapic_irq(). --macro
*/
/*
@@ -1277,10 +1597,12 @@ void __cpuinit setup_local_APIC(void)
value |= SPURIOUS_APIC_VECTOR;
apic_write(APIC_SPIV, value);
+ perf_events_lapic_init();
+
/*
* Set up LVT0, LVT1:
*
- * set up through-local-APIC on the BP's LINT0. This is not
+ * set up through-local-APIC on the boot CPU's LINT0. This is not
* strictly necessary in pure symmetric-IO mode, but sometimes
* we delegate interrupts to the 8259A.
*/
@@ -1288,38 +1610,38 @@ void __cpuinit setup_local_APIC(void)
* TODO: set up through-local-APIC from through-I/O-APIC? --macro
*/
value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
- if (!smp_processor_id() && (pic_mode || !value)) {
+ if (!cpu && (pic_mode || !value || ioapic_is_disabled)) {
value = APIC_DM_EXTINT;
- apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n",
- smp_processor_id());
+ apic_pr_verbose("Enabled ExtINT on CPU#%d\n", cpu);
} else {
value = APIC_DM_EXTINT | APIC_LVT_MASKED;
- apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n",
- smp_processor_id());
+ apic_pr_verbose("Masked ExtINT on CPU#%d\n", cpu);
}
apic_write(APIC_LVT0, value);
/*
- * only the BP should see the LINT1 NMI signal, obviously.
+ * Only the BSP sees the LINT1 NMI signal by default. This can be
+ * modified by apic_extnmi= boot option.
*/
- if (!smp_processor_id())
+ if ((!cpu && apic_extnmi != APIC_EXTNMI_NONE) ||
+ apic_extnmi == APIC_EXTNMI_ALL)
value = APIC_DM_NMI;
else
value = APIC_DM_NMI | APIC_LVT_MASKED;
- if (!lapic_is_integrated()) /* 82489DX */
+
+ /* Is 82489DX ? */
+ if (!lapic_is_integrated())
value |= APIC_LVT_LEVEL_TRIGGER;
apic_write(APIC_LVT1, value);
- preempt_enable();
-
#ifdef CONFIG_X86_MCE_INTEL
/* Recheck CMCI information after local APIC is up on CPU #0 */
- if (smp_processor_id() == 0)
+ if (!cpu)
cmci_recheck();
#endif
}
-void __cpuinit end_local_APIC_setup(void)
+static void end_local_APIC_setup(void)
{
lapic_setup_esr();
@@ -1333,122 +1655,283 @@ void __cpuinit end_local_APIC_setup(void)
}
#endif
- setup_apic_nmi_watchdog(NULL);
apic_pm_activate();
}
+/*
+ * APIC setup function for application processors. Called from smpboot.c
+ */
+void apic_ap_setup(void)
+{
+ setup_local_APIC();
+ end_local_APIC_setup();
+}
+
+static __init void apic_read_boot_cpu_id(bool x2apic)
+{
+ /*
+ * This can be invoked from check_x2apic() before the APIC has been
+ * selected. But that code knows for sure that the BIOS enabled
+ * X2APIC.
+ */
+ if (x2apic) {
+ boot_cpu_physical_apicid = native_apic_msr_read(APIC_ID);
+ boot_cpu_apic_version = GET_APIC_VERSION(native_apic_msr_read(APIC_LVR));
+ } else {
+ boot_cpu_physical_apicid = read_apic_id();
+ boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR));
+ }
+ topology_register_boot_apic(boot_cpu_physical_apicid);
+}
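Aside, not part of the patch: the two branches above differ only in where the APIC ID is encoded. In xAPIC mode the 8-bit ID sits in bits 31:24 of the memory-mapped APIC_ID register, while in x2APIC mode the corresponding MSR carries the full 32-bit ID directly. A small illustration with made-up register values:

/* Illustration only: where the boot CPU's APIC ID lives in the two modes.
 * The raw register values below are hypothetical sample data. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t xapic_id_reg  = 0x05000000;	/* hypothetical MMIO APIC_ID register */
	uint32_t x2apic_id_msr = 0x00000105;	/* hypothetical x2APIC APIC ID MSR */

	/* xAPIC: the 8-bit ID occupies bits 31:24 of the MMIO register. */
	printf("xAPIC  ID: %u\n", xapic_id_reg >> 24);

	/* x2APIC: the MSR simply contains the full 32-bit APIC ID. */
	printf("x2APIC ID: %u\n", x2apic_id_msr);
	return 0;
}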
+
#ifdef CONFIG_X86_X2APIC
-void check_x2apic(void)
+int x2apic_mode;
+EXPORT_SYMBOL_GPL(x2apic_mode);
+
+enum {
+ X2APIC_OFF,
+ X2APIC_DISABLED,
+ /* All states below here have X2APIC enabled */
+ X2APIC_ON,
+ X2APIC_ON_LOCKED
+};
+static int x2apic_state;
+
+static bool x2apic_hw_locked(void)
{
- if (x2apic_enabled()) {
- pr_info("x2apic enabled by BIOS, switching to x2apic ops\n");
- x2apic_preenabled = x2apic_mode = 1;
+ u64 x86_arch_cap_msr;
+ u64 msr;
+
+ x86_arch_cap_msr = x86_read_arch_cap_msr();
+ if (x86_arch_cap_msr & ARCH_CAP_XAPIC_DISABLE) {
+ rdmsrq(MSR_IA32_XAPIC_DISABLE_STATUS, msr);
+ return (msr & LEGACY_XAPIC_DISABLED);
}
+ return false;
}
-void enable_x2apic(void)
+static void __x2apic_disable(void)
{
- int msr, msr2;
+ u64 msr;
- if (!x2apic_mode)
+ if (!boot_cpu_has(X86_FEATURE_APIC))
return;
- rdmsr(MSR_IA32_APICBASE, msr, msr2);
- if (!(msr & X2APIC_ENABLE)) {
- pr_info("Enabling x2apic\n");
- wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
- }
+ rdmsrq(MSR_IA32_APICBASE, msr);
+ if (!(msr & X2APIC_ENABLE))
+ return;
+ /* Disable xapic and x2apic first and then reenable xapic mode */
+ wrmsrq(MSR_IA32_APICBASE, msr & ~(X2APIC_ENABLE | XAPIC_ENABLE));
+ wrmsrq(MSR_IA32_APICBASE, msr & ~X2APIC_ENABLE);
+ printk_once(KERN_INFO "x2apic disabled\n");
}
-#endif /* CONFIG_X86_X2APIC */
-void __init enable_IR_x2apic(void)
+static void __x2apic_enable(void)
{
-#ifdef CONFIG_INTR_REMAP
- int ret;
- unsigned long flags;
- struct IO_APIC_route_entry **ioapic_entries = NULL;
+ u64 msr;
- ret = dmar_table_init();
- if (ret) {
- pr_debug("dmar_table_init() failed with %d:\n", ret);
- goto ir_failed;
+ rdmsrq(MSR_IA32_APICBASE, msr);
+ if (msr & X2APIC_ENABLE)
+ return;
+ wrmsrq(MSR_IA32_APICBASE, msr | X2APIC_ENABLE);
+ printk_once(KERN_INFO "x2apic enabled\n");
+}
+
+static int __init setup_nox2apic(char *str)
+{
+ if (x2apic_enabled()) {
+ u32 apicid = native_apic_msr_read(APIC_ID);
+
+ if (apicid >= 255) {
+ pr_warn("Apicid: %08x, cannot enforce nox2apic\n",
+ apicid);
+ return 0;
+ }
+ if (x2apic_hw_locked()) {
+ pr_warn("APIC locked in x2apic mode, can't disable\n");
+ return 0;
+ }
+ pr_warn("x2apic already enabled.\n");
+ __x2apic_disable();
}
+ setup_clear_cpu_cap(X86_FEATURE_X2APIC);
+ x2apic_state = X2APIC_DISABLED;
+ x2apic_mode = 0;
+ return 0;
+}
+early_param("nox2apic", setup_nox2apic);
- if (!intr_remapping_supported()) {
- pr_debug("intr-remapping not supported\n");
- goto ir_failed;
+/* Called from cpu_init() to enable x2apic on (secondary) cpus */
+void x2apic_setup(void)
+{
+ /*
+ * Try to make the AP's APIC state match that of the BSP, but if the
+ * BSP is unlocked and the AP is locked then there is a state mismatch.
+ * Warn about the mismatch in case a GP fault occurs due to a locked AP
+ * trying to be turned off.
+ */
+ if (x2apic_state != X2APIC_ON_LOCKED && x2apic_hw_locked())
+ pr_warn("x2apic lock mismatch between BSP and AP.\n");
+ /*
+ * If x2apic is not in ON or LOCKED state, disable it if already enabled
+ * from BIOS.
+ */
+ if (x2apic_state < X2APIC_ON) {
+ __x2apic_disable();
+ return;
}
+ __x2apic_enable();
+}
+static __init void apic_set_fixmap(bool read_apic);
- if (!x2apic_preenabled && skip_ioapic_setup) {
- pr_info("Skipped enabling intr-remap because of skipping "
- "io-apic setup\n");
+static __init void x2apic_disable(void)
+{
+ u32 x2apic_id;
+
+ if (x2apic_state < X2APIC_ON)
return;
- }
- ioapic_entries = alloc_ioapic_entries();
- if (!ioapic_entries) {
- pr_info("Allocate ioapic_entries failed: %d\n", ret);
- goto end;
- }
+ x2apic_id = read_apic_id();
+ if (x2apic_id >= 255)
+ panic("Cannot disable x2apic, id: %08x\n", x2apic_id);
- ret = save_IO_APIC_setup(ioapic_entries);
- if (ret) {
- pr_info("Saving IO-APIC state failed: %d\n", ret);
- goto end;
+ if (x2apic_hw_locked()) {
+ pr_warn("Cannot disable locked x2apic, id: %08x\n", x2apic_id);
+ return;
}
- local_irq_save(flags);
- mask_IO_APIC_setup(ioapic_entries);
- mask_8259A();
+ __x2apic_disable();
- ret = enable_intr_remapping(x2apic_supported());
- if (ret)
- goto end_restore;
+ x2apic_mode = 0;
+ x2apic_state = X2APIC_DISABLED;
- pr_info("Enabled Interrupt-remapping\n");
+ /*
+ * Don't reread the APIC ID as it was already done from
+ * check_x2apic() and the APIC driver still is a x2APIC variant,
+ * which fails to do the read after x2APIC was disabled.
+ */
+ apic_set_fixmap(false);
+}
- if (x2apic_supported() && !x2apic_mode) {
- x2apic_mode = 1;
- enable_x2apic();
- pr_info("Enabled x2apic\n");
- }
+static __init void x2apic_enable(void)
+{
+ if (x2apic_state != X2APIC_OFF)
+ return;
+
+ x2apic_mode = 1;
+ x2apic_state = X2APIC_ON;
+ __x2apic_enable();
+}
+
+static __init void try_to_enable_x2apic(int remap_mode)
+{
+ if (x2apic_state == X2APIC_DISABLED)
+ return;
+
+ if (remap_mode != IRQ_REMAP_X2APIC_MODE) {
+ u32 apic_limit = 255;
-end_restore:
- if (ret)
/*
- * IR enabling failed
+ * Using X2APIC without IR is not architecturally supported
+ * on bare metal but may be supported in guests.
*/
- restore_IO_APIC_setup(ioapic_entries);
+ if (!x86_init.hyper.x2apic_available()) {
+ pr_info("x2apic: IRQ remapping doesn't support X2APIC mode\n");
+ x2apic_disable();
+ return;
+ }
- unmask_8259A();
- local_irq_restore(flags);
+ /*
+ * If the hypervisor supports extended destination ID in
+ * MSI, that increases the maximum APIC ID that can be
+ * used for non-remapped IRQ domains.
+ */
+ if (x86_init.hyper.msi_ext_dest_id()) {
+ virt_ext_dest_id = 1;
+ apic_limit = 32767;
+ }
+
+ /*
+ * Without IR, all CPUs can be addressed by IOAPIC/MSI only
+ * in physical mode, and CPUs with an APIC ID that cannot
+ * be addressed must not be brought online.
+ */
+ x2apic_set_max_apicid(apic_limit);
+ x2apic_phys = 1;
+ }
+ x2apic_enable();
+}
-end:
- if (ioapic_entries)
- free_ioapic_entries(ioapic_entries);
+void __init check_x2apic(void)
+{
+ if (x2apic_enabled()) {
+ pr_info("x2apic: enabled by BIOS, switching to x2apic ops\n");
+ x2apic_mode = 1;
+ if (x2apic_hw_locked())
+ x2apic_state = X2APIC_ON_LOCKED;
+ else
+ x2apic_state = X2APIC_ON;
+ apic_read_boot_cpu_id(true);
+ } else if (!boot_cpu_has(X86_FEATURE_X2APIC)) {
+ x2apic_state = X2APIC_DISABLED;
+ }
+}
+#else /* CONFIG_X86_X2APIC */
+void __init check_x2apic(void)
+{
+ if (!apic_is_x2apic_enabled())
+ return;
+ /*
+ * Checkme: Can we simply turn off x2APIC here instead of disabling the APIC?
+ */
+ pr_err("Kernel does not support x2APIC, please recompile with CONFIG_X86_X2APIC.\n");
+ pr_err("Disabling APIC, expect reduced performance and functionality.\n");
- if (!ret)
+ apic_is_disabled = true;
+ setup_clear_cpu_cap(X86_FEATURE_APIC);
+}
+
+static inline void try_to_enable_x2apic(int remap_mode) { }
+static inline void __x2apic_enable(void) { }
+#endif /* !CONFIG_X86_X2APIC */
+
+void __init enable_IR_x2apic(void)
+{
+ unsigned long flags;
+ int ret, ir_stat;
+
+ if (ioapic_is_disabled) {
+ pr_info("Not enabling interrupt remapping due to skipped IO-APIC setup\n");
return;
+ }
-ir_failed:
- if (x2apic_preenabled)
- panic("x2apic enabled by bios. But IR enabling failed");
- else if (cpu_has_x2apic)
- pr_info("Not enabling x2apic,Intr-remapping\n");
-#else
- if (!cpu_has_x2apic)
+ ir_stat = irq_remapping_prepare();
+ if (ir_stat < 0 && !x2apic_supported())
return;
- if (x2apic_preenabled)
- panic("x2apic enabled prior OS handover,"
- " enable CONFIG_X86_X2APIC, CONFIG_INTR_REMAP");
-#endif
+ ret = save_ioapic_entries();
+ if (ret) {
+ pr_info("Saving IO-APIC state failed: %d\n", ret);
+ return;
+ }
- return;
+ local_irq_save(flags);
+ legacy_pic->mask_all();
+ mask_ioapic_entries();
+
+ /* If irq_remapping_prepare() succeeded, try to enable it */
+ if (ir_stat >= 0)
+ ir_stat = irq_remapping_enable();
+ /* ir_stat contains the remap mode or an error code */
+ try_to_enable_x2apic(ir_stat);
+
+ if (ir_stat < 0)
+ restore_ioapic_entries();
+ legacy_pic->restore_mask();
+ local_irq_restore(flags);
}
-
#ifdef CONFIG_X86_64
/*
* Detect and enable local APICs on non-SMP boards.
@@ -1456,27 +1939,78 @@ ir_failed:
* On AMD64 we trust the BIOS - if it says no APIC it is likely
* not correctly set up (usually the APIC timer won't work etc.)
*/
-static int __init detect_init_APIC(void)
+static bool __init detect_init_APIC(void)
{
- if (!cpu_has_apic) {
+ if (!boot_cpu_has(X86_FEATURE_APIC)) {
pr_info("No local APIC present\n");
- return -1;
+ return false;
}
- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
- return 0;
+ register_lapic_address(APIC_DEFAULT_PHYS_BASE);
+ return true;
}
#else
+
+static bool __init apic_verify(unsigned long addr)
+{
+ u32 features, h, l;
+
+ /*
+ * The APIC feature bit should now be enabled
+ * in `cpuid'
+ */
+ features = cpuid_edx(1);
+ if (!(features & (1 << X86_FEATURE_APIC))) {
+ pr_warn("Could not enable APIC!\n");
+ return false;
+ }
+ set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
+
+ /* The BIOS may have set up the APIC at some other address */
+ if (boot_cpu_data.x86 >= 6) {
+ rdmsr(MSR_IA32_APICBASE, l, h);
+ if (l & MSR_IA32_APICBASE_ENABLE)
+ addr = l & MSR_IA32_APICBASE_BASE;
+ }
+
+ register_lapic_address(addr);
+ pr_info("Found and enabled local APIC!\n");
+ return true;
+}
+
+bool __init apic_force_enable(unsigned long addr)
+{
+ u32 h, l;
+
+ if (apic_is_disabled)
+ return false;
+
+ /*
+ * Some BIOSes disable the local APIC in the APIC_BASE
+ * MSR. This can only be done in software for Intel P6 or later
+ * and AMD K7 (Model > 1) or later.
+ */
+ if (boot_cpu_data.x86 >= 6) {
+ rdmsr(MSR_IA32_APICBASE, l, h);
+ if (!(l & MSR_IA32_APICBASE_ENABLE)) {
+ pr_info("Local APIC disabled by BIOS -- reenabling.\n");
+ l &= ~MSR_IA32_APICBASE_BASE;
+ l |= MSR_IA32_APICBASE_ENABLE | addr;
+ wrmsr(MSR_IA32_APICBASE, l, h);
+ enabled_via_apicbase = 1;
+ }
+ }
+ return apic_verify(addr);
+}
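Aside, not part of the patch: the MSR handling in apic_force_enable()/apic_verify() is easier to follow with the IA32_APIC_BASE layout spelled out — bit 11 is the global enable, bit 10 the x2APIC enable, bit 8 the BSP flag, and bits 12 and up hold the physical base address. A standalone decoding sketch with a made-up value:

/* Illustration only: decode a sample IA32_APIC_BASE low word the way
 * apic_force_enable()/apic_verify() interpret it. */
#include <stdio.h>
#include <stdint.h>

#define APICBASE_BSP		(1u << 8)	/* this CPU is the bootstrap processor */
#define APICBASE_X2APIC_ENABLE	(1u << 10)
#define APICBASE_GLOBAL_ENABLE	(1u << 11)
#define APICBASE_BASE_MASK	0xfffff000u	/* low 32 bits of the base address */

int main(void)
{
	uint32_t l = 0xfee00900;	/* hypothetical low half of the MSR */

	printf("BSP:     %s\n", (l & APICBASE_BSP) ? "yes" : "no");
	printf("enabled: %s\n", (l & APICBASE_GLOBAL_ENABLE) ? "yes" : "no");
	printf("x2apic:  %s\n", (l & APICBASE_X2APIC_ENABLE) ? "yes" : "no");
	printf("base:    0x%08x\n", l & APICBASE_BASE_MASK);
	/* Re-enabling clears the base field, ORs in the desired address
	 * plus the enable bit, and writes the MSR back. */
	return 0;
}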
+
/*
* Detect and initialize APIC
*/
-static int __init detect_init_APIC(void)
+static bool __init detect_init_APIC(void)
{
- u32 h, l, features;
-
/* Disabled by kernel option? */
- if (disable_apic)
- return -1;
+ if (apic_is_disabled)
+ return false;
switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_AMD:
@@ -1484,16 +2018,18 @@ static int __init detect_init_APIC(void)
(boot_cpu_data.x86 >= 15))
break;
goto no_apic;
+ case X86_VENDOR_HYGON:
+ break;
case X86_VENDOR_INTEL:
- if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 ||
- (boot_cpu_data.x86 == 5 && cpu_has_apic))
+ if ((boot_cpu_data.x86 == 5 && boot_cpu_has(X86_FEATURE_APIC)) ||
+ boot_cpu_data.x86_vfm >= INTEL_PENTIUM_PRO)
break;
goto no_apic;
default:
goto no_apic;
}
- if (!cpu_has_apic) {
+ if (!boot_cpu_has(X86_FEATURE_APIC)) {
/*
* Over-ride BIOS and try to enable the local APIC only if
* "lapic" specified.
@@ -1501,74 +2037,22 @@ static int __init detect_init_APIC(void)
if (!force_enable_local_apic) {
pr_info("Local APIC disabled by BIOS -- "
"you can enable it with \"lapic\"\n");
- return -1;
+ return false;
}
- /*
- * Some BIOSes disable the local APIC in the APIC_BASE
- * MSR. This can only be done in software for Intel P6 or later
- * and AMD K7 (Model > 1) or later.
- */
- rdmsr(MSR_IA32_APICBASE, l, h);
- if (!(l & MSR_IA32_APICBASE_ENABLE)) {
- pr_info("Local APIC disabled by BIOS -- reenabling.\n");
- l &= ~MSR_IA32_APICBASE_BASE;
- l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
- wrmsr(MSR_IA32_APICBASE, l, h);
- enabled_via_apicbase = 1;
- }
- }
- /*
- * The APIC feature bit should now be enabled
- * in `cpuid'
- */
- features = cpuid_edx(1);
- if (!(features & (1 << X86_FEATURE_APIC))) {
- pr_warning("Could not enable APIC!\n");
- return -1;
+ if (!apic_force_enable(APIC_DEFAULT_PHYS_BASE))
+ return false;
+ } else {
+ if (!apic_verify(APIC_DEFAULT_PHYS_BASE))
+ return false;
}
- set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-
- /* The BIOS may have set up the APIC at some other address */
- rdmsr(MSR_IA32_APICBASE, l, h);
- if (l & MSR_IA32_APICBASE_ENABLE)
- mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
-
- pr_info("Found and enabled local APIC!\n");
apic_pm_activate();
- return 0;
+ return true;
no_apic:
pr_info("No local APIC present or hardware disabled\n");
- return -1;
-}
-#endif
-
-#ifdef CONFIG_X86_64
-void __init early_init_lapic_mapping(void)
-{
- unsigned long phys_addr;
-
- /*
- * If no local APIC can be found then go out
- * : it means there is no mpatable and MADT
- */
- if (!smp_found_config)
- return;
-
- phys_addr = mp_lapic_addr;
-
- set_fixmap_nocache(FIX_APIC_BASE, phys_addr);
- apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
- APIC_BASE, phys_addr);
-
- /*
- * Fetch the APIC ID of the BSP in case we have a
- * default configuration (or the MP table is broken).
- */
- boot_cpu_physical_apicid = read_apic_id();
+ return false;
}
#endif
@@ -1577,204 +2061,146 @@ void __init early_init_lapic_mapping(void)
*/
void __init init_apic_mappings(void)
{
- unsigned int new_apicid;
+ if (apic_validate_deadline_timer())
+ pr_info("TSC deadline timer available\n");
- if (x2apic_mode) {
- boot_cpu_physical_apicid = read_apic_id();
+ if (x2apic_mode)
return;
- }
- /* If no local APIC can be found return early */
- if (!smp_found_config && detect_init_APIC()) {
- /* lets NOP'ify apic operations */
- pr_info("APIC: disable apic facility\n");
- apic_disable();
- } else {
- apic_phys = mp_lapic_addr;
+ if (!smp_found_config) {
+ if (!detect_init_APIC()) {
+ pr_info("APIC: disable apic facility\n");
+ apic_disable();
+ }
+ }
+}
- /*
- * acpi lapic path already maps that address in
- * acpi_register_lapic_address()
- */
- if (!acpi_lapic)
- set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
+static __init void apic_set_fixmap(bool read_apic)
+{
+ set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
+ apic_mmio_base = APIC_BASE;
+ apic_pr_verbose("Mapped APIC to %16lx (%16lx)\n", apic_mmio_base, mp_lapic_addr);
+ if (read_apic)
+ apic_read_boot_cpu_id(false);
+}
- apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
- APIC_BASE, apic_phys);
- }
+void __init register_lapic_address(unsigned long address)
+{
+ /* This should only happen once */
+ WARN_ON_ONCE(mp_lapic_addr);
+ mp_lapic_addr = address;
- /*
- * Fetch the APIC ID of the BSP in case we have a
- * default configuration (or the MP table is broken).
- */
- new_apicid = read_apic_id();
- if (boot_cpu_physical_apicid != new_apicid) {
- boot_cpu_physical_apicid = new_apicid;
- /*
- * yeah -- we lie about apic_version
- * in case if apic was disabled via boot option
- * but it's not a problem for SMP compiled kernel
- * since smp_sanity_check is prepared for such a case
- * and disable smp mode
- */
- apic_version[new_apicid] =
- GET_APIC_VERSION(apic_read(APIC_LVR));
- }
+ if (!x2apic_mode)
+ apic_set_fixmap(true);
}
/*
- * This initializes the IO-APIC and APIC hardware if this is
- * a UP kernel.
+ * Local APIC interrupts
*/
-int apic_version[MAX_APICS];
-int __init APIC_init_uniprocessor(void)
+/*
+ * Common handling code for spurious_interrupt and spurious_vector entry
+ * points below. No point in allowing the compiler to inline it twice.
+ */
+static noinline void handle_spurious_interrupt(u8 vector)
{
- if (disable_apic) {
- pr_info("Apic disabled\n");
- return -1;
- }
-#ifdef CONFIG_X86_64
- if (!cpu_has_apic) {
- disable_apic = 1;
- pr_info("Apic disabled by BIOS\n");
- return -1;
- }
-#else
- if (!smp_found_config && !cpu_has_apic)
- return -1;
-
- /*
- * Complain if the BIOS pretends there is one.
- */
- if (!cpu_has_apic &&
- APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
- pr_err("BIOS bug, local APIC 0x%x not detected!...\n",
- boot_cpu_physical_apicid);
- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
- return -1;
- }
-#endif
+ u32 v;
- enable_IR_x2apic();
-#ifdef CONFIG_X86_64
- default_setup_apic_routing();
-#endif
+ trace_spurious_apic_entry(vector);
- verify_local_APIC();
- connect_bsp_APIC();
+ inc_irq_stat(irq_spurious_count);
-#ifdef CONFIG_X86_64
- apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid));
-#else
/*
- * Hack: In case of kdump, after a crash, kernel might be booting
- * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid
- * might be zero if read from MP tables. Get it from LAPIC.
+ * If this is a spurious interrupt then do not acknowledge
*/
-# ifdef CONFIG_CRASH_DUMP
- boot_cpu_physical_apicid = read_apic_id();
-# endif
-#endif
- physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
- setup_local_APIC();
+ if (vector == SPURIOUS_APIC_VECTOR) {
+ /* See SDM vol 3 */
+ pr_info("Spurious APIC interrupt (vector 0xFF) on CPU#%d, should never happen.\n",
+ smp_processor_id());
+ goto out;
+ }
-#ifdef CONFIG_X86_IO_APIC
/*
- * Now enable IO-APICs, actually call clear_IO_APIC
- * We need clear_IO_APIC before enabling error vector
+ * If it is a vectored one, verify it's set in the ISR. If set,
+ * acknowledge it.
*/
- if (!skip_ioapic_setup && nr_ioapics)
- enable_IO_APIC();
-#endif
-
- end_local_APIC_setup();
-
-#ifdef CONFIG_X86_IO_APIC
- if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
- setup_IO_APIC();
- else {
- nr_ioapics = 0;
- localise_nmi_watchdog();
+ v = apic_read(APIC_ISR + ((vector & ~0x1f) >> 1));
+ if (v & (1 << (vector & 0x1f))) {
+ pr_info("Spurious interrupt (vector 0x%02x) on CPU#%d. Acked\n",
+ vector, smp_processor_id());
+ apic_eoi();
+ } else {
+ pr_info("Spurious interrupt (vector 0x%02x) on CPU#%d. Not pending!\n",
+ vector, smp_processor_id());
}
-#else
- localise_nmi_watchdog();
-#endif
-
- setup_boot_clock();
-#ifdef CONFIG_X86_64
- check_nmi_watchdog();
-#endif
-
- return 0;
+out:
+ trace_spurious_apic_exit(vector);
}
-/*
- * Local APIC interrupts
- */
-
-/*
- * This interrupt should _never_ happen with our APIC/SMP architecture
+/**
+ * spurious_interrupt - Catch all for interrupts raised on unused vectors
+ * @regs: Pointer to pt_regs on stack
+ * @vector: The vector number
+ *
+ * This is invoked from ASM entry code to catch all interrupts which
+ * trigger on an entry which is routed to the common_spurious idtentry
+ * point.
*/
-void smp_spurious_interrupt(struct pt_regs *regs)
+DEFINE_IDTENTRY_IRQ(spurious_interrupt)
{
- u32 v;
-
- exit_idle();
- irq_enter();
- /*
- * Check if this really is a spurious interrupt and ACK it
- * if it is a vectored one. Just in case...
- * Spurious interrupts should not be ACKed.
- */
- v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
- if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
- ack_APIC_irq();
-
- inc_irq_stat(irq_spurious_count);
+ handle_spurious_interrupt(vector);
+}
- /* see sw-dev-man vol 3, chapter 7.4.13.5 */
- pr_info("spurious APIC interrupt on CPU#%d, "
- "should never happen.\n", smp_processor_id());
- irq_exit();
+DEFINE_IDTENTRY_SYSVEC(sysvec_spurious_apic_interrupt)
+{
+ handle_spurious_interrupt(SPURIOUS_APIC_VECTOR);
}
/*
* This interrupt should never happen with our APIC/SMP architecture
*/
-void smp_error_interrupt(struct pt_regs *regs)
-{
- u32 v, v1;
+DEFINE_IDTENTRY_SYSVEC(sysvec_error_interrupt)
+{
+ static const char * const error_interrupt_reason[] = {
+ "Send CS error", /* APIC Error Bit 0 */
+ "Receive CS error", /* APIC Error Bit 1 */
+ "Send accept error", /* APIC Error Bit 2 */
+ "Receive accept error", /* APIC Error Bit 3 */
+ "Redirectable IPI", /* APIC Error Bit 4 */
+ "Send illegal vector", /* APIC Error Bit 5 */
+ "Received illegal vector", /* APIC Error Bit 6 */
+ "Illegal register address", /* APIC Error Bit 7 */
+ };
+ u32 v, i = 0;
+
+ trace_error_apic_entry(ERROR_APIC_VECTOR);
- exit_idle();
- irq_enter();
/* First tickle the hardware, only then report what went on. -- REW */
+ if (lapic_get_maxlvt() > 3) /* Due to the Pentium erratum 3AP. */
+ apic_write(APIC_ESR, 0);
v = apic_read(APIC_ESR);
- apic_write(APIC_ESR, 0);
- v1 = apic_read(APIC_ESR);
- ack_APIC_irq();
+ apic_eoi();
atomic_inc(&irq_err_count);
- /*
- * Here is what the APIC error bits mean:
- * 0: Send CS error
- * 1: Receive CS error
- * 2: Send accept error
- * 3: Receive accept error
- * 4: Reserved
- * 5: Send illegal vector
- * 6: Received illegal vector
- * 7: Illegal register address
- */
- pr_debug("APIC error on CPU%d: %02x(%02x)\n",
- smp_processor_id(), v , v1);
- irq_exit();
+ apic_pr_debug("APIC error on CPU%d: %02x", smp_processor_id(), v);
+
+ v &= 0xff;
+ while (v) {
+ if (v & 0x1)
+ apic_pr_debug_cont(" : %s", error_interrupt_reason[i]);
+ i++;
+ v >>= 1;
+ }
+
+ apic_pr_debug_cont("\n");
+
+ trace_error_apic_exit(ERROR_APIC_VECTOR);
}
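Aside, not part of the patch: the decode loop above walks the ESR one bit at a time and appends a reason string for every bit that is set. A standalone sketch of the same decoding for a made-up ESR value:

/* Illustration only: decode a sample APIC ESR value the same way the
 * error_interrupt_reason[] loop does. */
#include <stdio.h>

static const char * const reason[] = {
	"Send CS error", "Receive CS error", "Send accept error",
	"Receive accept error", "Redirectable IPI", "Send illegal vector",
	"Received illegal vector", "Illegal register address",
};

int main(void)
{
	unsigned int v = 0x44;	/* hypothetical ESR: bits 2 and 6 set */
	unsigned int i;

	printf("APIC error: %02x", v);
	for (i = 0; v; i++, v >>= 1)
		if (v & 0x1)
			printf(" : %s", reason[i]);
	printf("\n");	/* -> "APIC error: 44 : Send accept error : Received illegal vector" */
	return 0;
}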
/**
* connect_bsp_APIC - attach the APIC to the interrupt system
*/
-void __init connect_bsp_APIC(void)
+static void __init connect_bsp_APIC(void)
{
#ifdef CONFIG_X86_32
if (pic_mode) {
@@ -1786,13 +2212,10 @@ void __init connect_bsp_APIC(void)
* PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's
* local APIC to INT and NMI lines.
*/
- apic_printk(APIC_VERBOSE, "leaving PIC mode, "
- "enabling APIC mode.\n");
+ apic_pr_verbose("Leaving PIC mode, enabling APIC mode.\n");
imcr_pic_to_apic();
}
#endif
- if (apic->enable_apic_mode)
- apic->enable_apic_mode();
}
/**
@@ -1814,8 +2237,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
* IPIs, won't work beyond this point! The only exception are
* INIT IPIs.
*/
- apic_printk(APIC_VERBOSE, "disabling APIC mode, "
- "entering PIC mode.\n");
+ apic_pr_verbose("Disabling APIC mode, entering PIC mode.\n");
imcr_apic_to_pic();
return;
}
@@ -1860,107 +2282,77 @@ void disconnect_bsp_APIC(int virt_wire_setup)
apic_write(APIC_LVT1, value);
}
-void __cpuinit generic_processor_info(int apicid, int version)
+void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg,
+ bool dmar)
{
- int cpu;
-
- /*
- * Validate version
- */
- if (version == 0x0) {
- pr_warning("BIOS bug, APIC version is 0 for CPU#%d! "
- "fixing up to 0x10. (tell your hw vendor)\n",
- version);
- version = 0x10;
- }
- apic_version[apicid] = version;
-
- if (num_processors >= nr_cpu_ids) {
- int max = nr_cpu_ids;
- int thiscpu = max + disabled_cpus;
+ memset(msg, 0, sizeof(*msg));
- pr_warning(
- "ACPI: NR_CPUS/possible_cpus limit of %i reached."
- " Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
-
- disabled_cpus++;
- return;
- }
+ msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW;
+ msg->arch_addr_lo.dest_mode_logical = apic->dest_mode_logical;
+ msg->arch_addr_lo.destid_0_7 = cfg->dest_apicid & 0xFF;
- num_processors++;
- cpu = cpumask_next_zero(-1, cpu_present_mask);
+ msg->arch_data.delivery_mode = APIC_DELIVERY_MODE_FIXED;
+ msg->arch_data.vector = cfg->vector;
- if (version != apic_version[boot_cpu_physical_apicid])
- WARN_ONCE(1,
- "ACPI: apic version mismatch, bootcpu: %x cpu %d: %x\n",
- apic_version[boot_cpu_physical_apicid], cpu, version);
-
- physid_set(apicid, phys_cpu_present_map);
- if (apicid == boot_cpu_physical_apicid) {
- /*
- * x86_bios_cpu_apicid is required to have processors listed
- * in same order as logical cpu numbers. Hence the first
- * entry is BSP, and so on.
- */
- cpu = 0;
- }
- if (apicid > max_physical_apicid)
- max_physical_apicid = apicid;
-
-#ifdef CONFIG_X86_32
+ msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH;
/*
- * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
- * but we need to work other dependencies like SMP_SUSPEND etc
- * before this can be done without some confusion.
- * if (CPU_HOTPLUG_ENABLED || num_processors > 8)
- * - Ashok Raj <ashok.raj@intel.com>
+ * Only the IOMMU itself can use the trick of putting destination
+ * APIC ID into the high bits of the address. Anything else would
+ * just be writing to memory if it tried that, and needs IR to
+ * address APICs which can't be addressed in the normal 32-bit
+ * address range at 0xFFExxxxx. That is typically just 8 bits, but
+ * some hypervisors allow the extended destination ID field in bits
+ * 5-11 to be used, giving support for 15 bits of APIC IDs in total.
*/
- if (max_physical_apicid >= 8) {
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_INTEL:
- if (!APIC_XAPIC(version)) {
- def_to_bigsmp = 0;
- break;
- }
- /* If P4 and above fall through */
- case X86_VENDOR_AMD:
- def_to_bigsmp = 1;
- }
- }
-#endif
+ if (dmar)
+ msg->arch_addr_hi.destid_8_31 = cfg->dest_apicid >> 8;
+ else if (virt_ext_dest_id && cfg->dest_apicid < 0x8000)
+ msg->arch_addr_lo.virt_destid_8_14 = cfg->dest_apicid >> 8;
+ else
+ WARN_ON_ONCE(cfg->dest_apicid > 0xFF);
+}
-#if defined(CONFIG_SMP) || defined(CONFIG_X86_64)
- early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
- early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
-#endif
+u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid)
+{
+ u32 dest = msg->arch_addr_lo.destid_0_7;
- set_cpu_possible(cpu, true);
- set_cpu_present(cpu, true);
+ if (extid)
+ dest |= msg->arch_addr_hi.destid_8_31 << 8;
+ return dest;
}
+EXPORT_SYMBOL_FOR_KVM(x86_msi_msg_get_destid);
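Aside, not part of the patch: the layout behind virt_ext_dest_id is worth spelling out. In a legacy (non-remapped) MSI address the destination APIC ID occupies bits 19:12, which caps it at 255; hypervisors that advertise the extended destination ID let bits 11:5 carry ID bits 8-14, which is where the 32767 limit chosen in try_to_enable_x2apic() comes from. A standalone sketch of composing such an address (constants assumed from that layout, illustration only):

/* Illustration only: compose a legacy MSI address for a destination APIC ID,
 * with and without the extended destination ID bits. */
#include <stdio.h>
#include <stdint.h>

#define MSI_BASE_ADDRESS	0xfee00000u

static uint32_t msi_addr_lo(uint32_t destid, int virt_ext_dest_id)
{
	uint32_t addr = MSI_BASE_ADDRESS;

	addr |= (destid & 0xff) << 12;			/* destid_0_7 */
	if (virt_ext_dest_id)
		addr |= ((destid >> 8) & 0x7f) << 5;	/* virt_destid_8_14 */
	return addr;
}

int main(void)
{
	/* 8-bit IDs fit in the classic field ... */
	printf("dest 0x12   -> addr_lo 0x%08x\n", msi_addr_lo(0x12, 0));
	/* ... larger IDs need the extended field, up to 15 bits (32767). */
	printf("dest 0x1234 -> addr_lo 0x%08x\n", msi_addr_lo(0x1234, 1));
	return 0;
}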
-int hard_smp_processor_id(void)
+static void __init apic_bsp_up_setup(void)
{
- return read_apic_id();
+ reset_phys_cpu_present_map(boot_cpu_physical_apicid);
}
-void default_init_apic_ldr(void)
+/**
+ * apic_bsp_setup - Setup function for local apic and io-apic
+ * @upmode: Force UP mode (for APIC_init_uniprocessor)
+ */
+static void __init apic_bsp_setup(bool upmode)
{
- unsigned long val;
+ connect_bsp_APIC();
+ if (upmode)
+ apic_bsp_up_setup();
+ setup_local_APIC();
- apic_write(APIC_DFR, APIC_DFR_VALUE);
- val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
- val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id());
- apic_write(APIC_LDR, val);
+ enable_IO_APIC();
+ end_local_APIC_setup();
+ irq_remap_enable_fault_handling();
+ setup_IO_APIC();
+ lapic_update_legacy_vectors();
}
-#ifdef CONFIG_X86_32
-int default_apicid_to_node(int logical_apicid)
+#ifdef CONFIG_UP_LATE_INIT
+void __init up_late_init(void)
{
-#ifdef CONFIG_SMP
- return apicid_2_node[hard_smp_processor_id()];
-#else
- return 0;
-#endif
+ if (apic_intr_mode == APIC_PIC)
+ return;
+
+ /* Setup local timer */
+ x86_init.timers.setup_percpu_clockev();
}
#endif
@@ -1977,7 +2369,7 @@ static struct {
*/
int active;
/* r/w apic fields */
- unsigned int apic_id;
+ u32 apic_id;
unsigned int apic_taskpri;
unsigned int apic_ldr;
unsigned int apic_dfr;
@@ -1990,9 +2382,10 @@ static struct {
unsigned int apic_tmict;
unsigned int apic_tdcr;
unsigned int apic_thmr;
+ unsigned int apic_cmci;
} apic_pm_state;
-static int lapic_suspend(struct sys_device *dev, pm_message_t state)
+static int lapic_suspend(void *data)
{
unsigned long flags;
int maxlvt;
@@ -2019,61 +2412,62 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
if (maxlvt >= 5)
apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
#endif
+#ifdef CONFIG_X86_MCE_INTEL
+ if (maxlvt >= 6)
+ apic_pm_state.apic_cmci = apic_read(APIC_LVTCMCI);
+#endif
local_irq_save(flags);
+
+ /*
+ * Mask IOAPIC before disabling the local APIC to prevent stale IRR
+ * entries on some implementations.
+ */
+ mask_ioapic_entries();
+
disable_local_APIC();
- if (intr_remapping_enabled)
- disable_intr_remapping();
+ irq_remapping_disable();
local_irq_restore(flags);
return 0;
}
-static int lapic_resume(struct sys_device *dev)
+static void lapic_resume(void *data)
{
unsigned int l, h;
unsigned long flags;
int maxlvt;
- int ret = 0;
- struct IO_APIC_route_entry **ioapic_entries = NULL;
if (!apic_pm_state.active)
- return 0;
+ return;
local_irq_save(flags);
- if (intr_remapping_enabled) {
- ioapic_entries = alloc_ioapic_entries();
- if (!ioapic_entries) {
- WARN(1, "Alloc ioapic_entries in lapic resume failed.");
- ret = -ENOMEM;
- goto restore;
- }
-
- ret = save_IO_APIC_setup(ioapic_entries);
- if (ret) {
- WARN(1, "Saving IO-APIC state failed: %d\n", ret);
- free_ioapic_entries(ioapic_entries);
- goto restore;
- }
- mask_IO_APIC_setup(ioapic_entries);
- mask_8259A();
- }
+ /*
+ * IO-APIC and PIC have their own resume routines.
+ * We just mask them here to make sure the interrupt
+ * subsystem is completely quiet while we enable x2apic
+ * and interrupt-remapping.
+ */
+ mask_ioapic_entries();
+ legacy_pic->mask_all();
- if (x2apic_mode)
- enable_x2apic();
- else {
+ if (x2apic_mode) {
+ __x2apic_enable();
+ } else {
/*
* Make sure the APICBASE points to the right address
*
* FIXME! This will be wrong if we ever support suspend on
* SMP! We'll need to do this as part of the CPU restore!
*/
- rdmsr(MSR_IA32_APICBASE, l, h);
- l &= ~MSR_IA32_APICBASE_BASE;
- l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
- wrmsr(MSR_IA32_APICBASE, l, h);
+ if (boot_cpu_data.x86 >= 6) {
+ rdmsr(MSR_IA32_APICBASE, l, h);
+ l &= ~MSR_IA32_APICBASE_BASE;
+ l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
+ wrmsr(MSR_IA32_APICBASE, l, h);
+ }
}
maxlvt = lapic_get_maxlvt();
@@ -2085,10 +2479,14 @@ static int lapic_resume(struct sys_device *dev)
apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
-#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
+#ifdef CONFIG_X86_THERMAL_VECTOR
if (maxlvt >= 5)
apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
#endif
+#ifdef CONFIG_X86_MCE_INTEL
+ if (maxlvt >= 6)
+ apic_write(APIC_LVTCMCI, apic_pm_state.apic_cmci);
+#endif
if (maxlvt >= 4)
apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
@@ -2100,16 +2498,9 @@ static int lapic_resume(struct sys_device *dev)
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
- if (intr_remapping_enabled) {
- reenable_intr_remapping(x2apic_mode);
- unmask_8259A();
- restore_IO_APIC_setup(ioapic_entries);
- free_ioapic_entries(ioapic_entries);
- }
-restore:
- local_irq_restore(flags);
+ irq_remapping_reenable(x2apic_mode);
- return ret;
+ local_irq_restore(flags);
}
/*
@@ -2117,34 +2508,27 @@ restore:
* are needed on every CPU up until machine_halt/restart/poweroff.
*/
-static struct sysdev_class lapic_sysclass = {
- .name = "lapic",
+static const struct syscore_ops lapic_syscore_ops = {
.resume = lapic_resume,
.suspend = lapic_suspend,
};
-static struct sys_device device_lapic = {
- .id = 0,
- .cls = &lapic_sysclass,
+static struct syscore lapic_syscore = {
+ .ops = &lapic_syscore_ops,
};
-static void __cpuinit apic_pm_activate(void)
+static void apic_pm_activate(void)
{
apic_pm_state.active = 1;
}
static int __init init_lapic_sysfs(void)
{
- int error;
-
- if (!cpu_has_apic)
- return 0;
/* XXX: remove suspend/resume procs if !apic_pm_state.active? */
+ if (boot_cpu_has(X86_FEATURE_APIC))
+ register_syscore(&lapic_syscore);
- error = sysdev_class_register(&lapic_sysclass);
- if (!error)
- error = sysdev_register(&device_lapic);
- return error;
+ return 0;
}
/* local apic needs to resume before other devices access its registers. */
@@ -2158,55 +2542,10 @@ static void apic_pm_activate(void) { }
#ifdef CONFIG_X86_64
-static int __cpuinit apic_cluster_num(void)
-{
- int i, clusters, zeros;
- unsigned id;
- u16 *bios_cpu_apicid;
- DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
-
- bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
- bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
-
- for (i = 0; i < nr_cpu_ids; i++) {
- /* are we being called early in kernel startup? */
- if (bios_cpu_apicid) {
- id = bios_cpu_apicid[i];
- } else if (i < nr_cpu_ids) {
- if (cpu_present(i))
- id = per_cpu(x86_bios_cpu_apicid, i);
- else
- continue;
- } else
- break;
-
- if (id != BAD_APICID)
- __set_bit(APIC_CLUSTERID(id), clustermap);
- }
+static int multi_checked;
+static int multi;
- /* Problem: Partially populated chassis may not have CPUs in some of
- * the APIC clusters they have been allocated. Only present CPUs have
- * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap.
- * Since clusters are allocated sequentially, count zeros only if
- * they are bounded by ones.
- */
- clusters = 0;
- zeros = 0;
- for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
- if (test_bit(i, clustermap)) {
- clusters += 1 + zeros;
- zeros = 0;
- } else
- ++zeros;
- }
-
- return clusters;
-}
-
-static int __cpuinitdata multi_checked;
-static int __cpuinitdata multi;
-
-static int __cpuinit set_multi(const struct dmi_system_id *d)
+static int set_multi(const struct dmi_system_id *d)
{
if (multi)
return 0;
@@ -2215,7 +2554,7 @@ static int __cpuinit set_multi(const struct dmi_system_id *d)
return 0;
}
-static const __cpuinitconst struct dmi_system_id multi_dmi_table[] = {
+static const struct dmi_system_id multi_dmi_table[] = {
{
.callback = set_multi,
.ident = "IBM System Summit2",
@@ -2227,7 +2566,7 @@ static const __cpuinitconst struct dmi_system_id multi_dmi_table[] = {
{}
};
-static void __cpuinit dmi_check_multi(void)
+static void dmi_check_multi(void)
{
if (multi_checked)
return;
@@ -2244,42 +2583,22 @@ static void __cpuinit dmi_check_multi(void)
* multi-chassis.
* Use DMI to check them
*/
-__cpuinit int apic_is_clustered_box(void)
+int apic_is_clustered_box(void)
{
dmi_check_multi();
- if (multi)
- return 1;
-
- if (!is_vsmp_box())
- return 0;
-
- /*
- * ScaleMP vSMPowered boxes have one cluster per board and TSCs are
- * not guaranteed to be synced between boards
- */
- if (apic_cluster_num() > 1)
- return 1;
-
- return 0;
+ return multi;
}
#endif
/*
* APIC command line parameters
*/
-static int __init setup_disableapic(char *arg)
+static int __init setup_nolapic(char *arg)
{
- disable_apic = 1;
+ apic_is_disabled = true;
setup_clear_cpu_cap(X86_FEATURE_APIC);
return 0;
}
-early_param("disableapic", setup_disableapic);
-
-/* same as disableapic, for compatibility */
-static int __init setup_nolapic(char *arg)
-{
- return setup_disableapic(arg);
-}
early_param("nolapic", setup_nolapic);
static int __init parse_lapic_timer_c2_ok(char *arg)
@@ -2306,22 +2625,24 @@ early_param("nolapic_timer", parse_nolapic_timer);
static int __init apic_set_verbosity(char *arg)
{
if (!arg) {
-#ifdef CONFIG_X86_64
- skip_ioapic_setup = 0;
+ if (IS_ENABLED(CONFIG_X86_32))
+ return -EINVAL;
+
+ ioapic_is_disabled = false;
return 0;
-#endif
- return -EINVAL;
}
if (strcmp("debug", arg) == 0)
apic_verbosity = APIC_DEBUG;
else if (strcmp("verbose", arg) == 0)
apic_verbosity = APIC_VERBOSE;
+#ifdef CONFIG_X86_64
else {
- pr_warning("APIC Verbosity level %s not recognised"
+ pr_warn("APIC Verbosity level %s not recognised"
" use apic=verbose or apic=debug\n", arg);
return -EINVAL;
}
+#endif
return 0;
}
@@ -2329,11 +2650,11 @@ early_param("apic", apic_set_verbosity);
static int __init lapic_insert_resource(void)
{
- if (!apic_phys)
+ if (!apic_mmio_base)
return -1;
/* Put local APIC into the resource map. */
- lapic_resource.start = apic_phys;
+ lapic_resource.start = apic_mmio_base;
lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
insert_resource(&iomem_resource, &lapic_resource);
@@ -2341,7 +2662,27 @@ static int __init lapic_insert_resource(void)
}
/*
- * need call insert after e820_reserve_resources()
+ * need call insert after e820__reserve_resources()
* that is using request_resource
*/
late_initcall(lapic_insert_resource);
+
+static int __init apic_set_extnmi(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ if (!strncmp("all", arg, 3))
+ apic_extnmi = APIC_EXTNMI_ALL;
+ else if (!strncmp("none", arg, 4))
+ apic_extnmi = APIC_EXTNMI_NONE;
+ else if (!strncmp("bsp", arg, 3))
+ apic_extnmi = APIC_EXTNMI_BSP;
+ else {
+ pr_warn("Unknown external NMI delivery mode `%s' ignored\n", arg);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+early_param("apic_extnmi", apic_set_extnmi);
diff --git a/arch/x86/kernel/apic/apic_common.c b/arch/x86/kernel/apic/apic_common.c
new file mode 100644
index 000000000000..2ed3b5c88c7f
--- /dev/null
+++ b/arch/x86/kernel/apic/apic_common.c
@@ -0,0 +1,43 @@
+/*
+ * Common functions shared between the various APIC flavours
+ *
+ * SPDX-License-Identifier: GPL-2.0
+ */
+#include <linux/irq.h>
+#include <linux/kvm_types.h>
+#include <asm/apic.h>
+
+#include "local.h"
+
+u32 apic_default_calc_apicid(unsigned int cpu)
+{
+ return per_cpu(x86_cpu_to_apicid, cpu);
+}
+
+u32 apic_flat_calc_apicid(unsigned int cpu)
+{
+ return 1U << cpu;
+}
+
+u32 default_cpu_present_to_apicid(int mps_cpu)
+{
+ if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu))
+ return (int)per_cpu(x86_cpu_to_apicid, mps_cpu);
+ else
+ return BAD_APICID;
+}
+EXPORT_SYMBOL_FOR_KVM(default_cpu_present_to_apicid);
+
+/*
+ * Set up the logical destination ID when the APIC operates in logical
+ * destination mode.
+ */
+void default_init_apic_ldr(void)
+{
+ unsigned long val;
+
+ apic_write(APIC_DFR, APIC_DFR_FLAT);
+ val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
+ val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id());
+ apic_write(APIC_LDR, val);
+}
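Aside, not part of the patch: in flat logical mode every CPU owns one bit of the 8-bit logical destination field in LDR bits 31:24, which is why apic_flat_calc_apicid() above is simply 1 << cpu and why flat logical delivery tops out at 8 CPUs. A small illustration:

/* Illustration only: the flat-logical LDR value for a given CPU number,
 * mirroring SET_APIC_LOGICAL_ID(1UL << smp_processor_id()). */
#include <stdio.h>

int main(void)
{
	unsigned int cpu;

	for (cpu = 0; cpu < 8; cpu++)
		printf("cpu %u -> logical id 0x%02x -> LDR 0x%08x\n",
		       cpu, 1u << cpu, (1u << cpu) << 24);
	/* Only 8 bits are available, so flat logical mode covers at most
	 * 8 CPUs; larger systems fall back to physical destinations. */
	return 0;
}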
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index d0c99abc26c3..e0308d8c4e6c 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -1,6 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright 2004 James Cleverdon, IBM.
- * Subject to the GNU Public License, v.2
*
* Flat APIC subarch code.
*
@@ -8,366 +8,61 @@
* Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
* James Cleverdon.
*/
-#include <linux/errno.h>
-#include <linux/threads.h>
-#include <linux/cpumask.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/ctype.h>
-#include <linux/init.h>
-#include <linux/hardirq.h>
-#include <asm/smp.h>
-#include <asm/apic.h>
-#include <asm/ipi.h>
-
-#ifdef CONFIG_ACPI
-#include <acpi/acpi_bus.h>
-#endif
-
-static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
-{
- return 1;
-}
-
-static const struct cpumask *flat_target_cpus(void)
-{
- return cpu_online_mask;
-}
-
-static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
- /* Careful. Some cpus do not strictly honor the set of cpus
- * specified in the interrupt destination when using lowest
- * priority interrupt delivery mode.
- *
- * In particular there was a hyperthreading cpu observed to
- * deliver interrupts to the wrong hyperthread when only one
- * hyperthread was specified in the interrupt desitination.
- */
- cpumask_clear(retmask);
- cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
-}
-
-/*
- * Set up the logical destination ID.
- *
- * Intel recommends to set DFR, LDR and TPR before enabling
- * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
- * document number 292116). So here it goes...
- */
-static void flat_init_apic_ldr(void)
-{
- unsigned long val;
- unsigned long num, id;
-
- num = smp_processor_id();
- id = 1UL << num;
- apic_write(APIC_DFR, APIC_DFR_FLAT);
- val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
- val |= SET_APIC_LOGICAL_ID(id);
- apic_write(APIC_LDR, val);
-}
-
-static inline void _flat_send_IPI_mask(unsigned long mask, int vector)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- __default_send_IPI_dest_field(mask, vector, apic->dest_logical);
- local_irq_restore(flags);
-}
-
-static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector)
-{
- unsigned long mask = cpumask_bits(cpumask)[0];
-
- _flat_send_IPI_mask(mask, vector);
-}
-
-static void
- flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector)
-{
- unsigned long mask = cpumask_bits(cpumask)[0];
- int cpu = smp_processor_id();
-
- if (cpu < BITS_PER_LONG)
- clear_bit(cpu, &mask);
-
- _flat_send_IPI_mask(mask, vector);
-}
-
-static void flat_send_IPI_allbutself(int vector)
-{
- int cpu = smp_processor_id();
-#ifdef CONFIG_HOTPLUG_CPU
- int hotplug = 1;
-#else
- int hotplug = 0;
-#endif
- if (hotplug || vector == NMI_VECTOR) {
- if (!cpumask_equal(cpu_online_mask, cpumask_of(cpu))) {
- unsigned long mask = cpumask_bits(cpu_online_mask)[0];
-
- if (cpu < BITS_PER_LONG)
- clear_bit(cpu, &mask);
-
- _flat_send_IPI_mask(mask, vector);
- }
- } else if (num_online_cpus() > 1) {
- __default_send_IPI_shortcut(APIC_DEST_ALLBUT,
- vector, apic->dest_logical);
- }
-}
-
-static void flat_send_IPI_all(int vector)
-{
- if (vector == NMI_VECTOR) {
- flat_send_IPI_mask(cpu_online_mask, vector);
- } else {
- __default_send_IPI_shortcut(APIC_DEST_ALLINC,
- vector, apic->dest_logical);
- }
-}
-
-static unsigned int flat_get_apic_id(unsigned long x)
-{
- unsigned int id;
-
- id = (((x)>>24) & 0xFFu);
+#include <linux/export.h>
- return id;
-}
-
-static unsigned long set_apic_id(unsigned int id)
-{
- unsigned long x;
-
- x = ((id & 0xFFu)<<24);
- return x;
-}
-
-static unsigned int read_xapic_id(void)
-{
- unsigned int id;
+#include <asm/apic.h>
- id = flat_get_apic_id(apic_read(APIC_ID));
- return id;
-}
+#include "local.h"
-static int flat_apic_id_registered(void)
+static u32 physflat_get_apic_id(u32 x)
{
- return physid_isset(read_xapic_id(), phys_cpu_present_map);
+ return (x >> 24) & 0xFF;
}
-static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
+static int physflat_probe(void)
{
- return initial_apic_id >> index_msb;
+ return 1;
}
-struct apic apic_flat = {
- .name = "flat",
- .probe = NULL,
- .acpi_madt_oem_check = flat_acpi_madt_oem_check,
- .apic_id_registered = flat_apic_id_registered,
-
- .irq_delivery_mode = dest_LowestPrio,
- .irq_dest_mode = 1, /* logical */
-
- .target_cpus = flat_target_cpus,
- .disable_esr = 0,
- .dest_logical = APIC_DEST_LOGICAL,
- .check_apicid_used = NULL,
- .check_apicid_present = NULL,
-
- .vector_allocation_domain = flat_vector_allocation_domain,
- .init_apic_ldr = flat_init_apic_ldr,
-
- .ioapic_phys_id_map = NULL,
- .setup_apic_routing = NULL,
- .multi_timer_check = NULL,
- .apicid_to_node = NULL,
- .cpu_to_logical_apicid = NULL,
- .cpu_present_to_apicid = default_cpu_present_to_apicid,
- .apicid_to_cpu_present = NULL,
- .setup_portio_remap = NULL,
- .check_phys_apicid_present = default_check_phys_apicid_present,
- .enable_apic_mode = NULL,
- .phys_pkg_id = flat_phys_pkg_id,
- .mps_oem_check = NULL,
-
- .get_apic_id = flat_get_apic_id,
- .set_apic_id = set_apic_id,
- .apic_id_mask = 0xFFu << 24,
-
- .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
- .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
-
- .send_IPI_mask = flat_send_IPI_mask,
- .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself,
- .send_IPI_allbutself = flat_send_IPI_allbutself,
- .send_IPI_all = flat_send_IPI_all,
- .send_IPI_self = apic_send_IPI_self,
-
- .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
- .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
- .wait_for_init_deassert = NULL,
- .smp_callin_clear_local_apic = NULL,
- .inquire_remote_apic = default_inquire_remote_apic,
-
- .read = native_apic_mem_read,
- .write = native_apic_mem_write,
- .icr_read = native_apic_icr_read,
- .icr_write = native_apic_icr_write,
- .wait_icr_idle = native_apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
-};
-
-/*
- * Physflat mode is used when there are more than 8 CPUs on a AMD system.
- * We cannot use logical delivery in this case because the mask
- * overflows, so use physical mode.
- */
static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
-#ifdef CONFIG_ACPI
- /*
- * Quirk: some x86_64 machines can only use physical APIC mode
- * regardless of how many processors are present (x86_64 ES7000
- * is an example).
- */
- if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
- (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {
- printk(KERN_DEBUG "system APIC only can use physical flat");
- return 1;
- }
-#endif
-
- return 0;
-}
-
-static const struct cpumask *physflat_target_cpus(void)
-{
- return cpu_online_mask;
-}
-
-static void physflat_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
- cpumask_clear(retmask);
- cpumask_set_cpu(cpu, retmask);
-}
-
-static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector)
-{
- default_send_IPI_mask_sequence_phys(cpumask, vector);
-}
-
-static void physflat_send_IPI_mask_allbutself(const struct cpumask *cpumask,
- int vector)
-{
- default_send_IPI_mask_allbutself_phys(cpumask, vector);
-}
-
-static void physflat_send_IPI_allbutself(int vector)
-{
- default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
-}
-
-static void physflat_send_IPI_all(int vector)
-{
- physflat_send_IPI_mask(cpu_online_mask, vector);
-}
-
-static unsigned int physflat_cpu_mask_to_apicid(const struct cpumask *cpumask)
-{
- int cpu;
-
- /*
- * We're using fixed IRQ delivery, can only return one phys APIC ID.
- * May as well be the first.
- */
- cpu = cpumask_first(cpumask);
- if ((unsigned)cpu < nr_cpu_ids)
- return per_cpu(x86_cpu_to_apicid, cpu);
- else
- return BAD_APICID;
-}
-
-static unsigned int
-physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
- const struct cpumask *andmask)
-{
- int cpu;
-
- /*
- * We're using fixed IRQ delivery, can only return one phys APIC ID.
- * May as well be the first.
- */
- for_each_cpu_and(cpu, cpumask, andmask) {
- if (cpumask_test_cpu(cpu, cpu_online_mask))
- break;
- }
- if (cpu < nr_cpu_ids)
- return per_cpu(x86_cpu_to_apicid, cpu);
-
- return BAD_APICID;
+ return 1;
}
-struct apic apic_physflat = {
+static struct apic apic_physflat __ro_after_init = {
.name = "physical flat",
- .probe = NULL,
+ .probe = physflat_probe,
.acpi_madt_oem_check = physflat_acpi_madt_oem_check,
- .apic_id_registered = flat_apic_id_registered,
- .irq_delivery_mode = dest_Fixed,
- .irq_dest_mode = 0, /* physical */
+ .dest_mode_logical = false,
- .target_cpus = physflat_target_cpus,
.disable_esr = 0,
- .dest_logical = 0,
- .check_apicid_used = NULL,
- .check_apicid_present = NULL,
- .vector_allocation_domain = physflat_vector_allocation_domain,
- /* not needed, but shouldn't hurt: */
- .init_apic_ldr = flat_init_apic_ldr,
-
- .ioapic_phys_id_map = NULL,
- .setup_apic_routing = NULL,
- .multi_timer_check = NULL,
- .apicid_to_node = NULL,
- .cpu_to_logical_apicid = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
- .apicid_to_cpu_present = NULL,
- .setup_portio_remap = NULL,
- .check_phys_apicid_present = default_check_phys_apicid_present,
- .enable_apic_mode = NULL,
- .phys_pkg_id = flat_phys_pkg_id,
- .mps_oem_check = NULL,
-
- .get_apic_id = flat_get_apic_id,
- .set_apic_id = set_apic_id,
- .apic_id_mask = 0xFFu << 24,
- .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid,
- .cpu_mask_to_apicid_and = physflat_cpu_mask_to_apicid_and,
+ .max_apic_id = 0xFE,
+ .get_apic_id = physflat_get_apic_id,
- .send_IPI_mask = physflat_send_IPI_mask,
- .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself,
- .send_IPI_allbutself = physflat_send_IPI_allbutself,
- .send_IPI_all = physflat_send_IPI_all,
- .send_IPI_self = apic_send_IPI_self,
+ .calc_dest_apicid = apic_default_calc_apicid,
- .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
- .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
- .wait_for_init_deassert = NULL,
- .smp_callin_clear_local_apic = NULL,
- .inquire_remote_apic = default_inquire_remote_apic,
+ .send_IPI = default_send_IPI_single_phys,
+ .send_IPI_mask = default_send_IPI_mask_sequence_phys,
+ .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_phys,
+ .send_IPI_allbutself = default_send_IPI_allbutself,
+ .send_IPI_all = default_send_IPI_all,
+ .send_IPI_self = default_send_IPI_self,
+ .nmi_to_offline_cpu = true,
.read = native_apic_mem_read,
.write = native_apic_mem_write,
+ .eoi = native_apic_mem_eoi,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
- .wait_icr_idle = native_apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
+ .wait_icr_idle = apic_mem_wait_icr_idle,
+ .safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout,
};
+apic_driver(apic_physflat);
+
+struct apic *apic __ro_after_init = &apic_physflat;
+EXPORT_SYMBOL_GPL(apic);
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
new file mode 100644
index 000000000000..58abb941c45b
--- /dev/null
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NOOP APIC driver.
+ *
+ * Does almost nothing and should be substituted by a real APIC driver via
+ * the probe routine.
+ *
+ * Though if the APIC is disabled (for some reason), we try not to uglify
+ * the caller's code and still allow calling (some) APIC routines such as
+ * self-IPI, etc...
+ *
+ * FIXME: Remove this gunk. The above argument which was intentionally left
+ * in place is silly to begin with because none of the callbacks except for
+ * APIC::read/write() have a WARN_ON_ONCE() in them. Sigh...
+ */
+#include <linux/cpumask.h>
+#include <linux/thread_info.h>
+
+#include <asm/apic.h>
+
+#include "local.h"
+
+static void noop_send_IPI(int cpu, int vector) { }
+static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector) { }
+static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { }
+static void noop_send_IPI_allbutself(int vector) { }
+static void noop_send_IPI_all(int vector) { }
+static void noop_send_IPI_self(int vector) { }
+static void noop_apic_icr_write(u32 low, u32 id) { }
+
+static int noop_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip,
+ unsigned int cpu)
+{
+ return -1;
+}
+
+static u64 noop_apic_icr_read(void) { return 0; }
+static u32 noop_get_apic_id(u32 apicid) { return 0; }
+static void noop_apic_eoi(void) { }
+
+static u32 noop_apic_read(u32 reg)
+{
+ WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !apic_is_disabled);
+ return 0;
+}
+
+static void noop_apic_write(u32 reg, u32 val)
+{
+ WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !apic_is_disabled);
+}
+
+struct apic apic_noop __ro_after_init = {
+ .name = "noop",
+
+ .dest_mode_logical = true,
+
+ .disable_esr = 0,
+
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+
+ .max_apic_id = 0xFE,
+ .get_apic_id = noop_get_apic_id,
+
+ .calc_dest_apicid = apic_flat_calc_apicid,
+
+ .send_IPI = noop_send_IPI,
+ .send_IPI_mask = noop_send_IPI_mask,
+ .send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = noop_send_IPI_allbutself,
+ .send_IPI_all = noop_send_IPI_all,
+ .send_IPI_self = noop_send_IPI_self,
+
+ .wakeup_secondary_cpu = noop_wakeup_secondary_cpu,
+
+ .read = noop_apic_read,
+ .write = noop_apic_write,
+ .eoi = noop_apic_eoi,
+ .icr_read = noop_apic_icr_read,
+ .icr_write = noop_apic_icr_write,
+};
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
new file mode 100644
index 000000000000..5c5be2d58242
--- /dev/null
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -0,0 +1,272 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Numascale NumaConnect-Specific APIC Code
+ *
+ * Copyright (C) 2011 Numascale AS. All rights reserved.
+ *
+ * Send feedback to <support@numascale.com>
+ *
+ */
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/pgtable.h>
+
+#include <asm/msr.h>
+#include <asm/numachip/numachip.h>
+#include <asm/numachip/numachip_csr.h>
+
+
+#include "local.h"
+
+u8 numachip_system __read_mostly;
+static const struct apic apic_numachip1;
+static const struct apic apic_numachip2;
+static void (*numachip_apic_icr_write)(int apicid, unsigned int val) __read_mostly;
+
+static u32 numachip1_get_apic_id(u32 x)
+{
+ unsigned long value;
+ unsigned int id = (x >> 24) & 0xff;
+
+ if (static_cpu_has(X86_FEATURE_NODEID_MSR)) {
+ rdmsrq(MSR_FAM10H_NODE_ID, value);
+ id |= (value << 2) & 0xff00;
+ }
+
+ return id;
+}
+
+static u32 numachip2_get_apic_id(u32 x)
+{
+ u64 mcfg;
+
+ rdmsrq(MSR_FAM10H_MMIO_CONF_BASE, mcfg);
+ return ((mcfg >> (28 - 8)) & 0xfff00) | (x >> 24);
+}
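Aside, not part of the patch: numachip1_get_apic_id() widens the 8-bit local xAPIC ID with node bits pulled from the NodeId MSR so that every core in the fabric gets a unique ID. A standalone sketch of that composition, with made-up sample values:

/* Illustration only: mirror the bit composition in numachip1_get_apic_id().
 * Both register values below are hypothetical sample data. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t apic_id_reg = 0x03000000;	/* hypothetical APIC_ID register */
	uint64_t node_id_msr = 0x00000040;	/* hypothetical NodeId MSR value */
	uint32_t id;

	id  = (apic_id_reg >> 24) & 0xff;	/* local xAPIC ID: low 8 bits */
	id |= (node_id_msr << 2) & 0xff00;	/* node bits land in bits 15:8 */

	printf("extended APIC id: 0x%04x\n", id);
	return 0;
}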
+
+static void numachip1_apic_icr_write(int apicid, unsigned int val)
+{
+ write_lcsr(CSR_G3_EXT_IRQ_GEN, (apicid << 16) | val);
+}
+
+static void numachip2_apic_icr_write(int apicid, unsigned int val)
+{
+ numachip2_write32_lcsr(NUMACHIP2_APIC_ICR, (apicid << 12) | val);
+}
+
+static int numachip_wakeup_secondary(u32 phys_apicid, unsigned long start_rip, unsigned int cpu)
+{
+ numachip_apic_icr_write(phys_apicid, APIC_DM_INIT);
+ numachip_apic_icr_write(phys_apicid, APIC_DM_STARTUP |
+ (start_rip >> 12));
+
+ return 0;
+}
+
+static void numachip_send_IPI_one(int cpu, int vector)
+{
+ int local_apicid, apicid = per_cpu(x86_cpu_to_apicid, cpu);
+ unsigned int dmode;
+
+ preempt_disable();
+ local_apicid = __this_cpu_read(x86_cpu_to_apicid);
+
+ /* Send via local APIC where non-local part matches */
+ if (!((apicid ^ local_apicid) >> NUMACHIP_LAPIC_BITS)) {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __default_send_IPI_dest_field(apicid, vector,
+ APIC_DEST_PHYSICAL);
+ local_irq_restore(flags);
+ preempt_enable();
+ return;
+ }
+ preempt_enable();
+
+ dmode = (vector == NMI_VECTOR) ? APIC_DM_NMI : APIC_DM_FIXED;
+ numachip_apic_icr_write(apicid, dmode | vector);
+}
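Aside, not part of the patch: the XOR-and-shift test above decides whether the target CPU sits behind the local APIC or on a remote node — IDs that differ only in their low NUMACHIP_LAPIC_BITS bits share the node, everything else goes through the fabric ICR. A minimal sketch of the same predicate, with the field width assumed to be 4 purely for illustration:

/* Illustration only: the same-node test used by numachip_send_IPI_one(). */
#include <stdio.h>

#define LAPIC_BITS 4	/* assumed width of the per-node APIC ID field */

static int same_node(unsigned int a, unsigned int b)
{
	/* IDs that differ only below LAPIC_BITS share the node prefix. */
	return !((a ^ b) >> LAPIC_BITS);
}

int main(void)
{
	printf("0x23 vs 0x21: %s\n", same_node(0x23, 0x21) ? "local" : "remote");
	printf("0x23 vs 0x41: %s\n", same_node(0x23, 0x41) ? "local" : "remote");
	return 0;
}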
+
+static void numachip_send_IPI_mask(const struct cpumask *mask, int vector)
+{
+ unsigned int cpu;
+
+ for_each_cpu(cpu, mask)
+ numachip_send_IPI_one(cpu, vector);
+}
+
+static void numachip_send_IPI_mask_allbutself(const struct cpumask *mask,
+ int vector)
+{
+ unsigned int this_cpu = smp_processor_id();
+ unsigned int cpu;
+
+ for_each_cpu(cpu, mask) {
+ if (cpu != this_cpu)
+ numachip_send_IPI_one(cpu, vector);
+ }
+}
+
+static void numachip_send_IPI_allbutself(int vector)
+{
+ unsigned int this_cpu = smp_processor_id();
+ unsigned int cpu;
+
+ for_each_online_cpu(cpu) {
+ if (cpu != this_cpu)
+ numachip_send_IPI_one(cpu, vector);
+ }
+}
+
+static void numachip_send_IPI_all(int vector)
+{
+ numachip_send_IPI_mask(cpu_online_mask, vector);
+}
+
+static void numachip_send_IPI_self(int vector)
+{
+ apic_write(APIC_SELF_IPI, vector);
+}
+
+static int __init numachip1_probe(void)
+{
+ return apic == &apic_numachip1;
+}
+
+static int __init numachip2_probe(void)
+{
+ return apic == &apic_numachip2;
+}
+
+static void fixup_cpu_id(struct cpuinfo_x86 *c, int node)
+{
+ u64 val;
+ u32 nodes = 1;
+
+ c->topo.llc_id = node;
+
+ /* Account for nodes per socket in multi-core-module processors */
+ if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) {
+ rdmsrq(MSR_FAM10H_NODE_ID, val);
+ nodes = ((val >> 3) & 7) + 1;
+ }
+
+ c->topo.pkg_id = node / nodes;
+}
+
+static int __init numachip_system_init(void)
+{
+ /* Map the LCSR area and set up the apic_icr_write function */
+ switch (numachip_system) {
+ case 1:
+ init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE);
+ numachip_apic_icr_write = numachip1_apic_icr_write;
+ break;
+ case 2:
+ init_extra_mapping_uc(NUMACHIP2_LCSR_BASE, NUMACHIP2_LCSR_SIZE);
+ numachip_apic_icr_write = numachip2_apic_icr_write;
+ break;
+ default:
+ return 0;
+ }
+
+ x86_cpuinit.fixup_cpu_id = fixup_cpu_id;
+ x86_init.pci.arch_init = pci_numachip_init;
+
+ return 0;
+}
+early_initcall(numachip_system_init);
+
+static int numachip1_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ if ((strncmp(oem_id, "NUMASC", 6) != 0) ||
+ (strncmp(oem_table_id, "NCONNECT", 8) != 0))
+ return 0;
+
+ numachip_system = 1;
+
+ return 1;
+}
+
+static int numachip2_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ if ((strncmp(oem_id, "NUMASC", 6) != 0) ||
+ (strncmp(oem_table_id, "NCONECT2", 8) != 0))
+ return 0;
+
+ numachip_system = 2;
+
+ return 1;
+}
+
+static const struct apic apic_numachip1 __refconst = {
+ .name = "NumaConnect system",
+ .probe = numachip1_probe,
+ .acpi_madt_oem_check = numachip1_acpi_madt_oem_check,
+
+ .dest_mode_logical = false,
+
+ .disable_esr = 0,
+
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+
+ .max_apic_id = UINT_MAX,
+ .get_apic_id = numachip1_get_apic_id,
+
+ .calc_dest_apicid = apic_default_calc_apicid,
+
+ .send_IPI = numachip_send_IPI_one,
+ .send_IPI_mask = numachip_send_IPI_mask,
+ .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = numachip_send_IPI_allbutself,
+ .send_IPI_all = numachip_send_IPI_all,
+ .send_IPI_self = numachip_send_IPI_self,
+
+ .wakeup_secondary_cpu = numachip_wakeup_secondary,
+
+ .read = native_apic_mem_read,
+ .write = native_apic_mem_write,
+ .eoi = native_apic_mem_eoi,
+ .icr_read = native_apic_icr_read,
+ .icr_write = native_apic_icr_write,
+};
+
+apic_driver(apic_numachip1);
+
+static const struct apic apic_numachip2 __refconst = {
+ .name = "NumaConnect2 system",
+ .probe = numachip2_probe,
+ .acpi_madt_oem_check = numachip2_acpi_madt_oem_check,
+
+ .dest_mode_logical = false,
+
+ .disable_esr = 0,
+
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+
+ .max_apic_id = UINT_MAX,
+ .get_apic_id = numachip2_get_apic_id,
+
+ .calc_dest_apicid = apic_default_calc_apicid,
+
+ .send_IPI = numachip_send_IPI_one,
+ .send_IPI_mask = numachip_send_IPI_mask,
+ .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = numachip_send_IPI_allbutself,
+ .send_IPI_all = numachip_send_IPI_all,
+ .send_IPI_self = numachip_send_IPI_self,
+
+ .wakeup_secondary_cpu = numachip_wakeup_secondary,
+
+ .read = native_apic_mem_read,
+ .write = native_apic_mem_write,
+ .eoi = native_apic_mem_eoi,
+ .icr_read = native_apic_icr_read,
+ .icr_write = native_apic_icr_write,
+};
+
+apic_driver(apic_numachip2);
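
Every numachip IPI helper above reduces to the same shape: walk a CPU mask and send one IPI per set bit, optionally skipping the sending CPU. A minimal user-space sketch of that iteration, with a plain 64-bit bitmask standing in for struct cpumask and a logging stub in place of numachip_send_IPI_one() (both are assumptions made only for illustration):

#include <stdint.h>
#include <stdio.h>

/* Stub for numachip_send_IPI_one(); the real helper goes through the
 * NumaChip ICR write path set up by numachip_system_init(). */
static void send_ipi_one(unsigned int cpu, int vector)
{
	printf("IPI vector 0x%x -> cpu %u\n", vector, cpu);
}

/* Walk every set bit in 'mask'; self >= 0 selects the allbutself variant. */
static void send_ipi_mask(uint64_t mask, int vector, int self)
{
	for (unsigned int cpu = 0; cpu < 64; cpu++) {
		if (!(mask & (1ULL << cpu)))
			continue;
		if (self >= 0 && cpu == (unsigned int)self)
			continue;
		send_ipi_one(cpu, vector);
	}
}

int main(void)
{
	send_ipi_mask(0x0f, 0xfd, -1);	/* like send_IPI_mask on CPUs 0-3 */
	send_ipi_mask(0x0f, 0xfd, 2);	/* like send_IPI_mask_allbutself */
	return 0;
}
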
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
deleted file mode 100644
index 676cdac385c0..000000000000
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ /dev/null
@@ -1,267 +0,0 @@
-/*
- * APIC driver for "bigsmp" xAPIC machines with more than 8 virtual CPUs.
- *
- * Drives the local APIC in "clustered mode".
- */
-#include <linux/threads.h>
-#include <linux/cpumask.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/dmi.h>
-#include <linux/smp.h>
-
-#include <asm/apicdef.h>
-#include <asm/fixmap.h>
-#include <asm/mpspec.h>
-#include <asm/apic.h>
-#include <asm/ipi.h>
-
-static unsigned bigsmp_get_apic_id(unsigned long x)
-{
- return (x >> 24) & 0xFF;
-}
-
-static int bigsmp_apic_id_registered(void)
-{
- return 1;
-}
-
-static const struct cpumask *bigsmp_target_cpus(void)
-{
-#ifdef CONFIG_SMP
- return cpu_online_mask;
-#else
- return cpumask_of(0);
-#endif
-}
-
-static unsigned long bigsmp_check_apicid_used(physid_mask_t bitmap, int apicid)
-{
- return 0;
-}
-
-static unsigned long bigsmp_check_apicid_present(int bit)
-{
- return 1;
-}
-
-static inline unsigned long calculate_ldr(int cpu)
-{
- unsigned long val, id;
-
- val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
- id = per_cpu(x86_bios_cpu_apicid, cpu);
- val |= SET_APIC_LOGICAL_ID(id);
-
- return val;
-}
-
-/*
- * Set up the logical destination ID.
- *
- * Intel recommends to set DFR, LDR and TPR before enabling
- * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
- * document number 292116). So here it goes...
- */
-static void bigsmp_init_apic_ldr(void)
-{
- unsigned long val;
- int cpu = smp_processor_id();
-
- apic_write(APIC_DFR, APIC_DFR_FLAT);
- val = calculate_ldr(cpu);
- apic_write(APIC_LDR, val);
-}
-
-static void bigsmp_setup_apic_routing(void)
-{
- printk(KERN_INFO
- "Enabling APIC mode: Physflat. Using %d I/O APICs\n",
- nr_ioapics);
-}
-
-static int bigsmp_apicid_to_node(int logical_apicid)
-{
- return apicid_2_node[hard_smp_processor_id()];
-}
-
-static int bigsmp_cpu_present_to_apicid(int mps_cpu)
-{
- if (mps_cpu < nr_cpu_ids)
- return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu);
-
- return BAD_APICID;
-}
-
-static physid_mask_t bigsmp_apicid_to_cpu_present(int phys_apicid)
-{
- return physid_mask_of_physid(phys_apicid);
-}
-
-/* Mapping from cpu number to logical apicid */
-static inline int bigsmp_cpu_to_logical_apicid(int cpu)
-{
- if (cpu >= nr_cpu_ids)
- return BAD_APICID;
- return cpu_physical_id(cpu);
-}
-
-static physid_mask_t bigsmp_ioapic_phys_id_map(physid_mask_t phys_map)
-{
- /* For clustered we don't have a good way to do this yet - hack */
- return physids_promote(0xFFL);
-}
-
-static int bigsmp_check_phys_apicid_present(int boot_cpu_physical_apicid)
-{
- return 1;
-}
-
-/* As we are using single CPU as destination, pick only one CPU here */
-static unsigned int bigsmp_cpu_mask_to_apicid(const struct cpumask *cpumask)
-{
- return bigsmp_cpu_to_logical_apicid(cpumask_first(cpumask));
-}
-
-static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
- const struct cpumask *andmask)
-{
- int cpu;
-
- /*
- * We're using fixed IRQ delivery, can only return one phys APIC ID.
- * May as well be the first.
- */
- for_each_cpu_and(cpu, cpumask, andmask) {
- if (cpumask_test_cpu(cpu, cpu_online_mask))
- break;
- }
- if (cpu < nr_cpu_ids)
- return bigsmp_cpu_to_logical_apicid(cpu);
-
- return BAD_APICID;
-}
-
-static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
-{
- return cpuid_apic >> index_msb;
-}
-
-static inline void bigsmp_send_IPI_mask(const struct cpumask *mask, int vector)
-{
- default_send_IPI_mask_sequence_phys(mask, vector);
-}
-
-static void bigsmp_send_IPI_allbutself(int vector)
-{
- default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
-}
-
-static void bigsmp_send_IPI_all(int vector)
-{
- bigsmp_send_IPI_mask(cpu_online_mask, vector);
-}
-
-static int dmi_bigsmp; /* can be set by dmi scanners */
-
-static int hp_ht_bigsmp(const struct dmi_system_id *d)
-{
- printk(KERN_NOTICE "%s detected: force use of apic=bigsmp\n", d->ident);
- dmi_bigsmp = 1;
-
- return 0;
-}
-
-
-static const struct dmi_system_id bigsmp_dmi_table[] = {
- { hp_ht_bigsmp, "HP ProLiant DL760 G2",
- { DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
- DMI_MATCH(DMI_BIOS_VERSION, "P44-"),
- }
- },
-
- { hp_ht_bigsmp, "HP ProLiant DL740",
- { DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
- DMI_MATCH(DMI_BIOS_VERSION, "P47-"),
- }
- },
- { } /* NULL entry stops DMI scanning */
-};
-
-static void bigsmp_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
- cpumask_clear(retmask);
- cpumask_set_cpu(cpu, retmask);
-}
-
-static int probe_bigsmp(void)
-{
- if (def_to_bigsmp)
- dmi_bigsmp = 1;
- else
- dmi_check_system(bigsmp_dmi_table);
-
- return dmi_bigsmp;
-}
-
-struct apic apic_bigsmp = {
-
- .name = "bigsmp",
- .probe = probe_bigsmp,
- .acpi_madt_oem_check = NULL,
- .apic_id_registered = bigsmp_apic_id_registered,
-
- .irq_delivery_mode = dest_Fixed,
- /* phys delivery to target CPU: */
- .irq_dest_mode = 0,
-
- .target_cpus = bigsmp_target_cpus,
- .disable_esr = 1,
- .dest_logical = 0,
- .check_apicid_used = bigsmp_check_apicid_used,
- .check_apicid_present = bigsmp_check_apicid_present,
-
- .vector_allocation_domain = bigsmp_vector_allocation_domain,
- .init_apic_ldr = bigsmp_init_apic_ldr,
-
- .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map,
- .setup_apic_routing = bigsmp_setup_apic_routing,
- .multi_timer_check = NULL,
- .apicid_to_node = bigsmp_apicid_to_node,
- .cpu_to_logical_apicid = bigsmp_cpu_to_logical_apicid,
- .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid,
- .apicid_to_cpu_present = bigsmp_apicid_to_cpu_present,
- .setup_portio_remap = NULL,
- .check_phys_apicid_present = bigsmp_check_phys_apicid_present,
- .enable_apic_mode = NULL,
- .phys_pkg_id = bigsmp_phys_pkg_id,
- .mps_oem_check = NULL,
-
- .get_apic_id = bigsmp_get_apic_id,
- .set_apic_id = NULL,
- .apic_id_mask = 0xFF << 24,
-
- .cpu_mask_to_apicid = bigsmp_cpu_mask_to_apicid,
- .cpu_mask_to_apicid_and = bigsmp_cpu_mask_to_apicid_and,
-
- .send_IPI_mask = bigsmp_send_IPI_mask,
- .send_IPI_mask_allbutself = NULL,
- .send_IPI_allbutself = bigsmp_send_IPI_allbutself,
- .send_IPI_all = bigsmp_send_IPI_all,
- .send_IPI_self = default_send_IPI_self,
-
- .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
- .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
-
- .wait_for_init_deassert = default_wait_for_init_deassert,
-
- .smp_callin_clear_local_apic = NULL,
- .inquire_remote_apic = default_inquire_remote_apic,
-
- .read = native_apic_mem_read,
- .write = native_apic_mem_write,
- .icr_read = native_apic_icr_read,
- .icr_write = native_apic_icr_write,
- .wait_icr_idle = native_apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
-};
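
For reference, the removed probe_bigsmp() either honours the def_to_bigsmp override or scans bigsmp_dmi_table, where each entry pairs a handler with BIOS match strings and an empty entry terminates the scan. A compact user-space sketch of that table-scan shape (the vendor/version fields below are hypothetical stand-ins for the real DMI match keys):

#include <stdio.h>
#include <string.h>

struct quirk {
	int (*handler)(const struct quirk *q);
	const char *ident;
	const char *bios_vendor;		/* hypothetical match key */
	const char *bios_version_prefix;	/* hypothetical match key */
};

static int force_bigsmp;

static int hp_ht_quirk(const struct quirk *q)
{
	printf("%s detected: force use of apic=bigsmp\n", q->ident);
	force_bigsmp = 1;
	return 0;
}

static const struct quirk quirks[] = {
	{ hp_ht_quirk, "HP ProLiant DL760 G2", "HP", "P44-" },
	{ hp_ht_quirk, "HP ProLiant DL740",    "HP", "P47-" },
	{ NULL }	/* empty entry stops the scan, like the DMI table */
};

static void check_system(const char *vendor, const char *version)
{
	for (const struct quirk *q = quirks; q->handler; q++) {
		if (!strcmp(vendor, q->bios_vendor) &&
		    !strncmp(version, q->bios_version_prefix,
			     strlen(q->bios_version_prefix)))
			q->handler(q);
	}
}

int main(void)
{
	check_system("HP", "P44-0001");
	printf("force_bigsmp = %d\n", force_bigsmp);
	return 0;
}
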
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
deleted file mode 100644
index 8952a5890281..000000000000
--- a/arch/x86/kernel/apic/es7000_32.c
+++ /dev/null
@@ -1,782 +0,0 @@
-/*
- * Written by: Garry Forsgren, Unisys Corporation
- * Natalie Protasevich, Unisys Corporation
- *
- * This file contains the code to configure and interface
- * with Unisys ES7000 series hardware system manager.
- *
- * Copyright (c) 2003 Unisys Corporation.
- * Copyright (C) 2009, Red Hat, Inc., Ingo Molnar
- *
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * Contact information: Unisys Corporation, Township Line & Union Meeting
- * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or:
- *
- * http://www.unisys.com
- */
-#include <linux/notifier.h>
-#include <linux/spinlock.h>
-#include <linux/cpumask.h>
-#include <linux/threads.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/reboot.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/acpi.h>
-#include <linux/init.h>
-#include <linux/nmi.h>
-#include <linux/smp.h>
-#include <linux/io.h>
-
-#include <asm/apicdef.h>
-#include <asm/atomic.h>
-#include <asm/fixmap.h>
-#include <asm/mpspec.h>
-#include <asm/setup.h>
-#include <asm/apic.h>
-#include <asm/ipi.h>
-
-/*
- * ES7000 chipsets
- */
-
-#define NON_UNISYS 0
-#define ES7000_CLASSIC 1
-#define ES7000_ZORRO 2
-
-#define MIP_REG 1
-#define MIP_PSAI_REG 4
-
-#define MIP_BUSY 1
-#define MIP_SPIN 0xf0000
-#define MIP_VALID 0x0100000000000000ULL
-#define MIP_SW_APIC 0x1020b
-
-#define MIP_PORT(val) ((val >> 32) & 0xffff)
-
-#define MIP_RD_LO(val) (val & 0xffffffff)
-
-struct mip_reg {
- unsigned long long off_0x00;
- unsigned long long off_0x08;
- unsigned long long off_0x10;
- unsigned long long off_0x18;
- unsigned long long off_0x20;
- unsigned long long off_0x28;
- unsigned long long off_0x30;
- unsigned long long off_0x38;
-};
-
-struct mip_reg_info {
- unsigned long long mip_info;
- unsigned long long delivery_info;
- unsigned long long host_reg;
- unsigned long long mip_reg;
-};
-
-struct psai {
- unsigned long long entry_type;
- unsigned long long addr;
- unsigned long long bep_addr;
-};
-
-#ifdef CONFIG_ACPI
-
-struct es7000_oem_table {
- struct acpi_table_header Header;
- u32 OEMTableAddr;
- u32 OEMTableSize;
-};
-
-static unsigned long oem_addrX;
-static unsigned long oem_size;
-
-#endif
-
-/*
- * ES7000 Globals
- */
-
-static volatile unsigned long *psai;
-static struct mip_reg *mip_reg;
-static struct mip_reg *host_reg;
-static int mip_port;
-static unsigned long mip_addr;
-static unsigned long host_addr;
-
-int es7000_plat;
-
-/*
- * GSI override for ES7000 platforms.
- */
-
-static unsigned int base;
-
-static int
-es7000_rename_gsi(int ioapic, int gsi)
-{
- if (es7000_plat == ES7000_ZORRO)
- return gsi;
-
- if (!base) {
- int i;
- for (i = 0; i < nr_ioapics; i++)
- base += nr_ioapic_registers[i];
- }
-
- if (!ioapic && (gsi < 16))
- gsi += base;
-
- return gsi;
-}
-
-static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
-{
- unsigned long vect = 0, psaival = 0;
-
- if (psai == NULL)
- return -1;
-
- vect = ((unsigned long)__pa(eip)/0x1000) << 16;
- psaival = (0x1000000 | vect | cpu);
-
- while (*psai & 0x1000000)
- ;
-
- *psai = psaival;
-
- return 0;
-}
-
-static int es7000_apic_is_cluster(void)
-{
- /* MPENTIUMIII */
- if (boot_cpu_data.x86 == 6 &&
- (boot_cpu_data.x86_model >= 7 || boot_cpu_data.x86_model <= 11))
- return 1;
-
- return 0;
-}
-
-static void setup_unisys(void)
-{
- /*
- * Determine the generation of the ES7000 currently running.
- *
- * es7000_plat = 1 if the machine is a 5xx ES7000 box
- * es7000_plat = 2 if the machine is a x86_64 ES7000 box
- *
- */
- if (!(boot_cpu_data.x86 <= 15 && boot_cpu_data.x86_model <= 2))
- es7000_plat = ES7000_ZORRO;
- else
- es7000_plat = ES7000_CLASSIC;
- ioapic_renumber_irq = es7000_rename_gsi;
-}
-
-/*
- * Parse the OEM Table:
- */
-static int parse_unisys_oem(char *oemptr)
-{
- int i;
- int success = 0;
- unsigned char type, size;
- unsigned long val;
- char *tp = NULL;
- struct psai *psaip = NULL;
- struct mip_reg_info *mi;
- struct mip_reg *host, *mip;
-
- tp = oemptr;
-
- tp += 8;
-
- for (i = 0; i <= 6; i++) {
- type = *tp++;
- size = *tp++;
- tp -= 2;
- switch (type) {
- case MIP_REG:
- mi = (struct mip_reg_info *)tp;
- val = MIP_RD_LO(mi->host_reg);
- host_addr = val;
- host = (struct mip_reg *)val;
- host_reg = __va(host);
- val = MIP_RD_LO(mi->mip_reg);
- mip_port = MIP_PORT(mi->mip_info);
- mip_addr = val;
- mip = (struct mip_reg *)val;
- mip_reg = __va(mip);
- pr_debug("es7000_mipcfg: host_reg = 0x%lx \n",
- (unsigned long)host_reg);
- pr_debug("es7000_mipcfg: mip_reg = 0x%lx \n",
- (unsigned long)mip_reg);
- success++;
- break;
- case MIP_PSAI_REG:
- psaip = (struct psai *)tp;
- if (tp != NULL) {
- if (psaip->addr)
- psai = __va(psaip->addr);
- else
- psai = NULL;
- success++;
- }
- break;
- default:
- break;
- }
- tp += size;
- }
-
- if (success < 2)
- es7000_plat = NON_UNISYS;
- else
- setup_unisys();
-
- return es7000_plat;
-}
-
-#ifdef CONFIG_ACPI
-static int __init find_unisys_acpi_oem_table(unsigned long *oem_addr)
-{
- struct acpi_table_header *header = NULL;
- struct es7000_oem_table *table;
- acpi_size tbl_size;
- acpi_status ret;
- int i = 0;
-
- for (;;) {
- ret = acpi_get_table_with_size("OEM1", i++, &header, &tbl_size);
- if (!ACPI_SUCCESS(ret))
- return -1;
-
- if (!memcmp((char *) &header->oem_id, "UNISYS", 6))
- break;
-
- early_acpi_os_unmap_memory(header, tbl_size);
- }
-
- table = (void *)header;
-
- oem_addrX = table->OEMTableAddr;
- oem_size = table->OEMTableSize;
-
- early_acpi_os_unmap_memory(header, tbl_size);
-
- *oem_addr = (unsigned long)__acpi_map_table(oem_addrX, oem_size);
-
- return 0;
-}
-
-static void __init unmap_unisys_acpi_oem_table(unsigned long oem_addr)
-{
- if (!oem_addr)
- return;
-
- __acpi_unmap_table((char *)oem_addr, oem_size);
-}
-
-static int es7000_check_dsdt(void)
-{
- struct acpi_table_header header;
-
- if (ACPI_SUCCESS(acpi_get_table_header(ACPI_SIG_DSDT, 0, &header)) &&
- !strncmp(header.oem_id, "UNISYS", 6))
- return 1;
- return 0;
-}
-
-static int es7000_acpi_ret;
-
-/* Hook from generic ACPI tables.c */
-static int __init es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
-{
- unsigned long oem_addr = 0;
- int check_dsdt;
- int ret = 0;
-
- /* check dsdt at first to avoid clear fix_map for oem_addr */
- check_dsdt = es7000_check_dsdt();
-
- if (!find_unisys_acpi_oem_table(&oem_addr)) {
- if (check_dsdt) {
- ret = parse_unisys_oem((char *)oem_addr);
- } else {
- setup_unisys();
- ret = 1;
- }
- /*
- * we need to unmap it
- */
- unmap_unisys_acpi_oem_table(oem_addr);
- }
-
- es7000_acpi_ret = ret;
-
- return ret && !es7000_apic_is_cluster();
-}
-
-static int es7000_acpi_madt_oem_check_cluster(char *oem_id, char *oem_table_id)
-{
- int ret = es7000_acpi_ret;
-
- return ret && es7000_apic_is_cluster();
-}
-
-#else /* !CONFIG_ACPI: */
-static int es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
-{
- return 0;
-}
-
-static int es7000_acpi_madt_oem_check_cluster(char *oem_id, char *oem_table_id)
-{
- return 0;
-}
-#endif /* !CONFIG_ACPI */
-
-static void es7000_spin(int n)
-{
- int i = 0;
-
- while (i++ < n)
- rep_nop();
-}
-
-static int es7000_mip_write(struct mip_reg *mip_reg)
-{
- int status = 0;
- int spin;
-
- spin = MIP_SPIN;
- while ((host_reg->off_0x38 & MIP_VALID) != 0) {
- if (--spin <= 0) {
- WARN(1, "Timeout waiting for Host Valid Flag\n");
- return -1;
- }
- es7000_spin(MIP_SPIN);
- }
-
- memcpy(host_reg, mip_reg, sizeof(struct mip_reg));
- outb(1, mip_port);
-
- spin = MIP_SPIN;
-
- while ((mip_reg->off_0x38 & MIP_VALID) == 0) {
- if (--spin <= 0) {
- WARN(1, "Timeout waiting for MIP Valid Flag\n");
- return -1;
- }
- es7000_spin(MIP_SPIN);
- }
-
- status = (mip_reg->off_0x00 & 0xffff0000000000ULL) >> 48;
- mip_reg->off_0x38 &= ~MIP_VALID;
-
- return status;
-}
-
-static void es7000_enable_apic_mode(void)
-{
- struct mip_reg es7000_mip_reg;
- int mip_status;
-
- if (!es7000_plat)
- return;
-
- printk(KERN_INFO "ES7000: Enabling APIC mode.\n");
- memset(&es7000_mip_reg, 0, sizeof(struct mip_reg));
- es7000_mip_reg.off_0x00 = MIP_SW_APIC;
- es7000_mip_reg.off_0x38 = MIP_VALID;
-
- while ((mip_status = es7000_mip_write(&es7000_mip_reg)) != 0)
- WARN(1, "Command failed, status = %x\n", mip_status);
-}
-
-static void es7000_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
- /* Careful. Some cpus do not strictly honor the set of cpus
- * specified in the interrupt destination when using lowest
- * priority interrupt delivery mode.
- *
- * In particular there was a hyperthreading cpu observed to
- * deliver interrupts to the wrong hyperthread when only one
- * hyperthread was specified in the interrupt desitination.
- */
- cpumask_clear(retmask);
- cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
-}
-
-
-static void es7000_wait_for_init_deassert(atomic_t *deassert)
-{
- while (!atomic_read(deassert))
- cpu_relax();
-}
-
-static unsigned int es7000_get_apic_id(unsigned long x)
-{
- return (x >> 24) & 0xFF;
-}
-
-static void es7000_send_IPI_mask(const struct cpumask *mask, int vector)
-{
- default_send_IPI_mask_sequence_phys(mask, vector);
-}
-
-static void es7000_send_IPI_allbutself(int vector)
-{
- default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
-}
-
-static void es7000_send_IPI_all(int vector)
-{
- es7000_send_IPI_mask(cpu_online_mask, vector);
-}
-
-static int es7000_apic_id_registered(void)
-{
- return 1;
-}
-
-static const struct cpumask *target_cpus_cluster(void)
-{
- return cpu_all_mask;
-}
-
-static const struct cpumask *es7000_target_cpus(void)
-{
- return cpumask_of(smp_processor_id());
-}
-
-static unsigned long
-es7000_check_apicid_used(physid_mask_t bitmap, int apicid)
-{
- return 0;
-}
-static unsigned long es7000_check_apicid_present(int bit)
-{
- return physid_isset(bit, phys_cpu_present_map);
-}
-
-static unsigned long calculate_ldr(int cpu)
-{
- unsigned long id = per_cpu(x86_bios_cpu_apicid, cpu);
-
- return SET_APIC_LOGICAL_ID(id);
-}
-
-/*
- * Set up the logical destination ID.
- *
- * Intel recommends to set DFR, LdR and TPR before enabling
- * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
- * document number 292116). So here it goes...
- */
-static void es7000_init_apic_ldr_cluster(void)
-{
- unsigned long val;
- int cpu = smp_processor_id();
-
- apic_write(APIC_DFR, APIC_DFR_CLUSTER);
- val = calculate_ldr(cpu);
- apic_write(APIC_LDR, val);
-}
-
-static void es7000_init_apic_ldr(void)
-{
- unsigned long val;
- int cpu = smp_processor_id();
-
- apic_write(APIC_DFR, APIC_DFR_FLAT);
- val = calculate_ldr(cpu);
- apic_write(APIC_LDR, val);
-}
-
-static void es7000_setup_apic_routing(void)
-{
- int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id());
-
- printk(KERN_INFO
- "Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n",
- (apic_version[apic] == 0x14) ?
- "Physical Cluster" : "Logical Cluster",
- nr_ioapics, cpumask_bits(es7000_target_cpus())[0]);
-}
-
-static int es7000_apicid_to_node(int logical_apicid)
-{
- return 0;
-}
-
-
-static int es7000_cpu_present_to_apicid(int mps_cpu)
-{
- if (!mps_cpu)
- return boot_cpu_physical_apicid;
- else if (mps_cpu < nr_cpu_ids)
- return per_cpu(x86_bios_cpu_apicid, mps_cpu);
- else
- return BAD_APICID;
-}
-
-static int cpu_id;
-
-static physid_mask_t es7000_apicid_to_cpu_present(int phys_apicid)
-{
- physid_mask_t mask;
-
- mask = physid_mask_of_physid(cpu_id);
- ++cpu_id;
-
- return mask;
-}
-
-/* Mapping from cpu number to logical apicid */
-static int es7000_cpu_to_logical_apicid(int cpu)
-{
-#ifdef CONFIG_SMP
- if (cpu >= nr_cpu_ids)
- return BAD_APICID;
- return cpu_2_logical_apicid[cpu];
-#else
- return logical_smp_processor_id();
-#endif
-}
-
-static physid_mask_t es7000_ioapic_phys_id_map(physid_mask_t phys_map)
-{
- /* For clustered we don't have a good way to do this yet - hack */
- return physids_promote(0xff);
-}
-
-static int es7000_check_phys_apicid_present(int cpu_physical_apicid)
-{
- boot_cpu_physical_apicid = read_apic_id();
- return 1;
-}
-
-static unsigned int es7000_cpu_mask_to_apicid(const struct cpumask *cpumask)
-{
- unsigned int round = 0;
- int cpu, uninitialized_var(apicid);
-
- /*
- * The cpus in the mask must all be on the apic cluster.
- */
- for_each_cpu(cpu, cpumask) {
- int new_apicid = es7000_cpu_to_logical_apicid(cpu);
-
- if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
- WARN(1, "Not a valid mask!");
-
- return BAD_APICID;
- }
- apicid = new_apicid;
- round++;
- }
- return apicid;
-}
-
-static unsigned int
-es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask,
- const struct cpumask *andmask)
-{
- int apicid = es7000_cpu_to_logical_apicid(0);
- cpumask_var_t cpumask;
-
- if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
- return apicid;
-
- cpumask_and(cpumask, inmask, andmask);
- cpumask_and(cpumask, cpumask, cpu_online_mask);
- apicid = es7000_cpu_mask_to_apicid(cpumask);
-
- free_cpumask_var(cpumask);
-
- return apicid;
-}
-
-static int es7000_phys_pkg_id(int cpuid_apic, int index_msb)
-{
- return cpuid_apic >> index_msb;
-}
-
-static int probe_es7000(void)
-{
- /* probed later in mptable/ACPI hooks */
- return 0;
-}
-
-static int es7000_mps_ret;
-static int es7000_mps_oem_check(struct mpc_table *mpc, char *oem,
- char *productid)
-{
- int ret = 0;
-
- if (mpc->oemptr) {
- struct mpc_oemtable *oem_table =
- (struct mpc_oemtable *)mpc->oemptr;
-
- if (!strncmp(oem, "UNISYS", 6))
- ret = parse_unisys_oem((char *)oem_table);
- }
-
- es7000_mps_ret = ret;
-
- return ret && !es7000_apic_is_cluster();
-}
-
-static int es7000_mps_oem_check_cluster(struct mpc_table *mpc, char *oem,
- char *productid)
-{
- int ret = es7000_mps_ret;
-
- return ret && es7000_apic_is_cluster();
-}
-
-/* We've been warned by a false positive warning.Use __refdata to keep calm. */
-struct apic __refdata apic_es7000_cluster = {
-
- .name = "es7000",
- .probe = probe_es7000,
- .acpi_madt_oem_check = es7000_acpi_madt_oem_check_cluster,
- .apic_id_registered = es7000_apic_id_registered,
-
- .irq_delivery_mode = dest_LowestPrio,
- /* logical delivery broadcast to all procs: */
- .irq_dest_mode = 1,
-
- .target_cpus = target_cpus_cluster,
- .disable_esr = 1,
- .dest_logical = 0,
- .check_apicid_used = es7000_check_apicid_used,
- .check_apicid_present = es7000_check_apicid_present,
-
- .vector_allocation_domain = es7000_vector_allocation_domain,
- .init_apic_ldr = es7000_init_apic_ldr_cluster,
-
- .ioapic_phys_id_map = es7000_ioapic_phys_id_map,
- .setup_apic_routing = es7000_setup_apic_routing,
- .multi_timer_check = NULL,
- .apicid_to_node = es7000_apicid_to_node,
- .cpu_to_logical_apicid = es7000_cpu_to_logical_apicid,
- .cpu_present_to_apicid = es7000_cpu_present_to_apicid,
- .apicid_to_cpu_present = es7000_apicid_to_cpu_present,
- .setup_portio_remap = NULL,
- .check_phys_apicid_present = es7000_check_phys_apicid_present,
- .enable_apic_mode = es7000_enable_apic_mode,
- .phys_pkg_id = es7000_phys_pkg_id,
- .mps_oem_check = es7000_mps_oem_check_cluster,
-
- .get_apic_id = es7000_get_apic_id,
- .set_apic_id = NULL,
- .apic_id_mask = 0xFF << 24,
-
- .cpu_mask_to_apicid = es7000_cpu_mask_to_apicid,
- .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and,
-
- .send_IPI_mask = es7000_send_IPI_mask,
- .send_IPI_mask_allbutself = NULL,
- .send_IPI_allbutself = es7000_send_IPI_allbutself,
- .send_IPI_all = es7000_send_IPI_all,
- .send_IPI_self = default_send_IPI_self,
-
- .wakeup_secondary_cpu = wakeup_secondary_cpu_via_mip,
-
- .trampoline_phys_low = 0x467,
- .trampoline_phys_high = 0x469,
-
- .wait_for_init_deassert = NULL,
-
- /* Nothing to do for most platforms, since cleared by the INIT cycle: */
- .smp_callin_clear_local_apic = NULL,
- .inquire_remote_apic = default_inquire_remote_apic,
-
- .read = native_apic_mem_read,
- .write = native_apic_mem_write,
- .icr_read = native_apic_icr_read,
- .icr_write = native_apic_icr_write,
- .wait_icr_idle = native_apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
-};
-
-struct apic __refdata apic_es7000 = {
-
- .name = "es7000",
- .probe = probe_es7000,
- .acpi_madt_oem_check = es7000_acpi_madt_oem_check,
- .apic_id_registered = es7000_apic_id_registered,
-
- .irq_delivery_mode = dest_Fixed,
- /* phys delivery to target CPUs: */
- .irq_dest_mode = 0,
-
- .target_cpus = es7000_target_cpus,
- .disable_esr = 1,
- .dest_logical = 0,
- .check_apicid_used = es7000_check_apicid_used,
- .check_apicid_present = es7000_check_apicid_present,
-
- .vector_allocation_domain = es7000_vector_allocation_domain,
- .init_apic_ldr = es7000_init_apic_ldr,
-
- .ioapic_phys_id_map = es7000_ioapic_phys_id_map,
- .setup_apic_routing = es7000_setup_apic_routing,
- .multi_timer_check = NULL,
- .apicid_to_node = es7000_apicid_to_node,
- .cpu_to_logical_apicid = es7000_cpu_to_logical_apicid,
- .cpu_present_to_apicid = es7000_cpu_present_to_apicid,
- .apicid_to_cpu_present = es7000_apicid_to_cpu_present,
- .setup_portio_remap = NULL,
- .check_phys_apicid_present = es7000_check_phys_apicid_present,
- .enable_apic_mode = es7000_enable_apic_mode,
- .phys_pkg_id = es7000_phys_pkg_id,
- .mps_oem_check = es7000_mps_oem_check,
-
- .get_apic_id = es7000_get_apic_id,
- .set_apic_id = NULL,
- .apic_id_mask = 0xFF << 24,
-
- .cpu_mask_to_apicid = es7000_cpu_mask_to_apicid,
- .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and,
-
- .send_IPI_mask = es7000_send_IPI_mask,
- .send_IPI_mask_allbutself = NULL,
- .send_IPI_allbutself = es7000_send_IPI_allbutself,
- .send_IPI_all = es7000_send_IPI_all,
- .send_IPI_self = default_send_IPI_self,
-
- .trampoline_phys_low = 0x467,
- .trampoline_phys_high = 0x469,
-
- .wait_for_init_deassert = es7000_wait_for_init_deassert,
-
- /* Nothing to do for most platforms, since cleared by the INIT cycle: */
- .smp_callin_clear_local_apic = NULL,
- .inquire_remote_apic = default_inquire_remote_apic,
-
- .read = native_apic_mem_read,
- .write = native_apic_mem_write,
- .icr_read = native_apic_icr_read,
- .icr_write = native_apic_icr_write,
- .wait_icr_idle = native_apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
-};
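
The removed es7000_mip_write() is a classic mailbox handshake: spin with a bounded budget until the host "valid" flag clears, copy the command, kick the port, then spin again until the MIP side raises its own valid flag. A stripped-down sketch of that bounded-poll pattern, with a plain variable replacing the memory-mapped flag (an assumption made purely for illustration):

#include <stdbool.h>
#include <stdio.h>

#define SPIN_BUDGET 0xf0000	/* mirrors MIP_SPIN */

/* Stand-in for the memory-mapped valid flag in the host/MIP registers. */
static volatile bool valid_flag;

/* Poll until the flag reaches 'want' or the spin budget runs out. */
static int wait_for_flag(bool want)
{
	long spin = SPIN_BUDGET;

	while (valid_flag != want) {
		if (--spin <= 0) {
			fprintf(stderr, "timeout waiting for valid flag\n");
			return -1;
		}
		/* es7000_spin() burns cycles with rep_nop() at this point. */
	}
	return 0;
}

int main(void)
{
	if (wait_for_flag(false) == 0)
		printf("host mailbox free, command could be written\n");
	return 0;
}
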
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
new file mode 100644
index 000000000000..45af535c44a0
--- /dev/null
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * HW NMI watchdog support
+ *
+ * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
+ *
+ * Arch specific calls to support NMI watchdog
+ *
+ * Bits copied from original nmi.c file
+ *
+ */
+#include <linux/thread_info.h>
+#include <asm/apic.h>
+#include <asm/nmi.h>
+
+#include <linux/cpumask.h>
+#include <linux/kdebug.h>
+#include <linux/notifier.h>
+#include <linux/kprobes.h>
+#include <linux/nmi.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+
+#include "local.h"
+
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF
+u64 hw_nmi_get_sample_period(int watchdog_thresh)
+{
+ return (u64)(cpu_khz) * 1000 * watchdog_thresh;
+}
+#endif
+
+#ifdef arch_trigger_cpumask_backtrace
+static void nmi_raise_cpu_backtrace(cpumask_t *mask)
+{
+ __apic_send_IPI_mask(mask, NMI_VECTOR);
+}
+
+void arch_trigger_cpumask_backtrace(const cpumask_t *mask, int exclude_cpu)
+{
+ nmi_trigger_cpumask_backtrace(mask, exclude_cpu,
+ nmi_raise_cpu_backtrace);
+}
+
+static int nmi_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
+{
+ if (nmi_cpu_backtrace(regs))
+ return NMI_HANDLED;
+
+ return NMI_DONE;
+}
+NOKPROBE_SYMBOL(nmi_cpu_backtrace_handler);
+
+static int __init register_nmi_cpu_backtrace_handler(void)
+{
+ register_nmi_handler(NMI_LOCAL, nmi_cpu_backtrace_handler,
+ 0, "arch_bt");
+ return 0;
+}
+early_initcall(register_nmi_cpu_backtrace_handler);
+#endif
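
hw_nmi_get_sample_period() above converts the watchdog threshold in seconds into a cycle count: cpu_khz * 1000 is cycles per second, multiplied by the threshold. A small worked example with assumed figures, a 3 GHz part (cpu_khz = 3000000) and a 10 second threshold, gives 3 * 10^10 cycles:

#include <stdint.h>
#include <stdio.h>

/* Same arithmetic as hw_nmi_get_sample_period(). */
static uint64_t sample_period(uint64_t cpu_khz, int watchdog_thresh)
{
	return cpu_khz * 1000ULL * (uint64_t)watchdog_thresh;
}

int main(void)
{
	/* 3 GHz CPU, 10 s threshold: prints 30000000000. */
	printf("%llu cycles\n",
	       (unsigned long long)sample_period(3000000ULL, 10));
	return 0;
}
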
diff --git a/arch/x86/kernel/apic/init.c b/arch/x86/kernel/apic/init.c
new file mode 100644
index 000000000000..821e2e536f19
--- /dev/null
+++ b/arch/x86/kernel/apic/init.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#define pr_fmt(fmt) "APIC: " fmt
+
+#include <asm/apic.h>
+
+#include "local.h"
+
+/*
+ * Use DEFINE_STATIC_CALL_NULL() to avoid having to provide stub functions
+ * for each callback. The callbacks are set up during boot, and all except
+ * wait_icr_idle() must be initialized before use. The IPI wrappers
+ * use static_call() rather than static_call_cond() so that an unset
+ * callback is caught instead of being silently skipped.
+ */
+#define DEFINE_APIC_CALL(__cb) \
+ DEFINE_STATIC_CALL_NULL(apic_call_##__cb, *apic->__cb)
+
+DEFINE_APIC_CALL(eoi);
+DEFINE_APIC_CALL(native_eoi);
+DEFINE_APIC_CALL(icr_read);
+DEFINE_APIC_CALL(icr_write);
+DEFINE_APIC_CALL(read);
+DEFINE_APIC_CALL(send_IPI);
+DEFINE_APIC_CALL(send_IPI_mask);
+DEFINE_APIC_CALL(send_IPI_mask_allbutself);
+DEFINE_APIC_CALL(send_IPI_allbutself);
+DEFINE_APIC_CALL(send_IPI_all);
+DEFINE_APIC_CALL(send_IPI_self);
+DEFINE_APIC_CALL(wait_icr_idle);
+DEFINE_APIC_CALL(wakeup_secondary_cpu);
+DEFINE_APIC_CALL(wakeup_secondary_cpu_64);
+DEFINE_APIC_CALL(write);
+
+EXPORT_STATIC_CALL_TRAMP_GPL(apic_call_send_IPI_mask);
+EXPORT_STATIC_CALL_TRAMP_GPL(apic_call_send_IPI_self);
+
+/* The container for function call overrides */
+struct apic_override __x86_apic_override __initdata;
+
+#define apply_override(__cb) \
+ if (__x86_apic_override.__cb) \
+ apic->__cb = __x86_apic_override.__cb
+
+static __init void restore_override_callbacks(void)
+{
+ apply_override(eoi);
+ apply_override(native_eoi);
+ apply_override(write);
+ apply_override(read);
+ apply_override(send_IPI);
+ apply_override(send_IPI_mask);
+ apply_override(send_IPI_mask_allbutself);
+ apply_override(send_IPI_allbutself);
+ apply_override(send_IPI_all);
+ apply_override(send_IPI_self);
+ apply_override(icr_read);
+ apply_override(icr_write);
+ apply_override(wakeup_secondary_cpu);
+ apply_override(wakeup_secondary_cpu_64);
+}
+
+#define update_call(__cb) \
+ static_call_update(apic_call_##__cb, *apic->__cb)
+
+static __init void update_static_calls(void)
+{
+ update_call(eoi);
+ update_call(native_eoi);
+ update_call(write);
+ update_call(read);
+ update_call(send_IPI);
+ update_call(send_IPI_mask);
+ update_call(send_IPI_mask_allbutself);
+ update_call(send_IPI_allbutself);
+ update_call(send_IPI_all);
+ update_call(send_IPI_self);
+ update_call(icr_read);
+ update_call(icr_write);
+ update_call(wait_icr_idle);
+ update_call(wakeup_secondary_cpu);
+ update_call(wakeup_secondary_cpu_64);
+}
+
+void __init apic_setup_apic_calls(void)
+{
+ /* Ensure that the default APIC has native_eoi populated */
+ apic->native_eoi = apic->eoi;
+ update_static_calls();
+ pr_info("Static calls initialized\n");
+}
+
+void __init apic_install_driver(struct apic *driver)
+{
+ if (apic == driver)
+ return;
+
+ apic = driver;
+
+ if (IS_ENABLED(CONFIG_X86_X2APIC) && apic->x2apic_set_max_apicid)
+ apic->max_apic_id = x2apic_max_apicid;
+
+ /* Copy the original eoi() callback as KVM/HyperV might overwrite it */
+ if (!apic->native_eoi)
+ apic->native_eoi = apic->eoi;
+
+ /* Apply any already installed callback overrides */
+ restore_override_callbacks();
+ update_static_calls();
+
+ pr_info("Switched APIC routing to: %s\n", driver->name);
+}
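
apic_install_driver() above swaps the global apic pointer, applies any recorded overrides, and then re-points every apic_call_* static call at the new driver's methods via update_static_calls(). As a rough user-space analogy only, with an ordinary function pointer standing in for the patched static-call trampoline (an assumption, not how static_call() is really implemented), the install-then-update flow looks like this:

#include <stdio.h>

struct apic_ops {
	const char *name;
	void (*send_ipi_self)(int vector);
};

static void drv_a_ipi(int v) { printf("driver A: self IPI 0x%x\n", v); }
static void drv_b_ipi(int v) { printf("driver B: self IPI 0x%x\n", v); }

static const struct apic_ops drv_a = { "A", drv_a_ipi };
static const struct apic_ops drv_b = { "B", drv_b_ipi };

/* The "call site": one global slot that every caller invokes directly. */
static void (*call_send_ipi_self)(int vector);

static void install_driver(const struct apic_ops *drv)
{
	/* In the kernel this step is static_call_update(); here a store. */
	call_send_ipi_self = drv->send_ipi_self;
	printf("Switched APIC routing to: %s\n", drv->name);
}

int main(void)
{
	install_driver(&drv_a);
	call_send_ipi_self(0xfd);
	install_driver(&drv_b);
	call_send_ipi_self(0xfd);
	return 0;
}
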
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index d2ed6c5ddc80..28f934f05a85 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Intel IO-APIC support for multi-Pentium hosts.
*
@@ -18,10 +19,21 @@
* and Rolf G. Tews
* for testing these extensively
* Paul Diefenbaugh : Added full ACPI support
+ *
+ * Historical information worth preserving:
+ *
+ * - SiS APIC rmw bug:
+ *
+ * We used to have a workaround for a bug in SiS chips which
+ * required rewriting the index register for a read-modify-write
+ * operation, as the chip lost the index information that had
+ * already been set up for the read. We cache the data now, so that
+ * workaround has been removed.
*/
#include <linux/mm.h>
#include <linux/interrupt.h>
+#include <linux/irq.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/sched.h>
@@ -29,21 +41,16 @@
#include <linux/mc146818rtc.h>
#include <linux/compiler.h>
#include <linux/acpi.h>
-#include <linux/module.h>
-#include <linux/sysdev.h>
-#include <linux/msi.h>
-#include <linux/htirq.h>
+#include <linux/export.h>
+#include <linux/syscore_ops.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/jiffies.h> /* time_after() */
-#ifdef CONFIG_ACPI
-#include <acpi/acpi_bus.h>
-#endif
-#include <linux/bootmem.h>
-#include <linux/dmar.h>
-#include <linux/hpet.h>
+#include <linux/slab.h>
+#include <linux/memblock.h>
+#include <linux/msi.h>
-#include <asm/idle.h>
+#include <asm/irqdomain.h>
#include <asm/io.h>
#include <asm/smp.h>
#include <asm/cpu.h>
@@ -52,313 +59,194 @@
#include <asm/acpi.h>
#include <asm/dma.h>
#include <asm/timer.h>
+#include <asm/time.h>
#include <asm/i8259.h>
-#include <asm/nmi.h>
-#include <asm/msidef.h>
-#include <asm/hypertransport.h>
#include <asm/setup.h>
#include <asm/irq_remapping.h>
-#include <asm/hpet.h>
#include <asm/hw_irq.h>
-#include <asm/uv/uv_hub.h>
-#include <asm/uv/uv_irq.h>
-
#include <asm/apic.h>
+#include <asm/pgtable.h>
+#include <asm/x86_init.h>
+
+#define for_each_ioapic(idx) \
+ for ((idx) = 0; (idx) < nr_ioapics; (idx)++)
+#define for_each_ioapic_reverse(idx) \
+ for ((idx) = nr_ioapics - 1; (idx) >= 0; (idx)--)
+#define for_each_pin(idx, pin) \
+ for ((pin) = 0; (pin) < ioapics[(idx)].nr_registers; (pin)++)
+#define for_each_ioapic_pin(idx, pin) \
+ for_each_ioapic((idx)) \
+ for_each_pin((idx), (pin))
+#define for_each_irq_pin(entry, head) \
+ list_for_each_entry(entry, &head, list)
+
+static DEFINE_RAW_SPINLOCK(ioapic_lock);
+static DEFINE_MUTEX(ioapic_mutex);
+static unsigned int ioapic_dynirq_base;
+static int ioapic_initialized;
-#define __apicdebuginit(type) static type __init
-
-/*
- * Is the SiS APIC rmw bug present ?
- * -1 = don't know, 0 = no, 1 = yes
- */
-int sis_apic_bug = -1;
-
-static DEFINE_SPINLOCK(ioapic_lock);
-static DEFINE_SPINLOCK(vector_lock);
-
-/*
- * # of IRQ routing registers
- */
-int nr_ioapic_registers[MAX_IO_APICS];
-
-/* I/O APIC entries */
-struct mpc_ioapic mp_ioapics[MAX_IO_APICS];
-int nr_ioapics;
-
-/* MP IRQ source entries */
-struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
+struct irq_pin_list {
+ struct list_head list;
+ int apic, pin;
+};
-/* # of MP IRQ source entries */
-int mp_irq_entries;
+struct mp_chip_data {
+ struct list_head irq_2_pin;
+ struct IO_APIC_route_entry entry;
+ bool is_level;
+ bool active_low;
+ bool isa_irq;
+ u32 count;
+};
-#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
-int mp_bus_id_to_type[MAX_MP_BUSSES];
-#endif
+struct mp_ioapic_gsi {
+ u32 gsi_base;
+ u32 gsi_end;
+};
-DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
+static struct ioapic {
+ /* # of IRQ routing registers */
+ int nr_registers;
+ /* Saved state during suspend/resume, or while enabling intr-remap. */
+ struct IO_APIC_route_entry *saved_registers;
+ /* I/O APIC config */
+ struct mpc_ioapic mp_config;
+ /* IO APIC gsi routing info */
+ struct mp_ioapic_gsi gsi_config;
+ struct ioapic_domain_cfg irqdomain_cfg;
+ struct irq_domain *irqdomain;
+ struct resource *iomem_res;
+} ioapics[MAX_IO_APICS];
-int skip_ioapic_setup;
+#define mpc_ioapic_ver(ioapic_idx) ioapics[ioapic_idx].mp_config.apicver
-void arch_disable_smp_support(void)
+int mpc_ioapic_id(int ioapic_idx)
{
-#ifdef CONFIG_PCI
- noioapicquirk = 1;
- noioapicreroute = -1;
-#endif
- skip_ioapic_setup = 1;
+ return ioapics[ioapic_idx].mp_config.apicid;
}
-static int __init parse_noapic(char *str)
+unsigned int mpc_ioapic_addr(int ioapic_idx)
{
- /* disable IO-APIC */
- arch_disable_smp_support();
- return 0;
+ return ioapics[ioapic_idx].mp_config.apicaddr;
}
-early_param("noapic", parse_noapic);
-
-struct irq_pin_list;
-
-/*
- * This is performance-critical, we want to do it O(1)
- *
- * the indexing order of this array favors 1:1 mappings
- * between pins and IRQs.
- */
-struct irq_pin_list {
- int apic, pin;
- struct irq_pin_list *next;
-};
-
-static struct irq_pin_list *get_one_free_irq_2_pin(int node)
+static inline struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx)
{
- struct irq_pin_list *pin;
-
- pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node);
-
- return pin;
+ return &ioapics[ioapic_idx].gsi_config;
}
-struct irq_cfg {
- struct irq_pin_list *irq_2_pin;
- cpumask_var_t domain;
- cpumask_var_t old_domain;
- unsigned move_cleanup_count;
- u8 vector;
- u8 move_in_progress : 1;
-};
-
-/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
-#ifdef CONFIG_SPARSE_IRQ
-static struct irq_cfg irq_cfgx[] = {
-#else
-static struct irq_cfg irq_cfgx[NR_IRQS] = {
-#endif
- [0] = { .vector = IRQ0_VECTOR, },
- [1] = { .vector = IRQ1_VECTOR, },
- [2] = { .vector = IRQ2_VECTOR, },
- [3] = { .vector = IRQ3_VECTOR, },
- [4] = { .vector = IRQ4_VECTOR, },
- [5] = { .vector = IRQ5_VECTOR, },
- [6] = { .vector = IRQ6_VECTOR, },
- [7] = { .vector = IRQ7_VECTOR, },
- [8] = { .vector = IRQ8_VECTOR, },
- [9] = { .vector = IRQ9_VECTOR, },
- [10] = { .vector = IRQ10_VECTOR, },
- [11] = { .vector = IRQ11_VECTOR, },
- [12] = { .vector = IRQ12_VECTOR, },
- [13] = { .vector = IRQ13_VECTOR, },
- [14] = { .vector = IRQ14_VECTOR, },
- [15] = { .vector = IRQ15_VECTOR, },
-};
-
-int __init arch_early_irq_init(void)
+static inline int mp_ioapic_pin_count(int ioapic)
{
- struct irq_cfg *cfg;
- struct irq_desc *desc;
- int count;
- int node;
- int i;
+ struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic);
- cfg = irq_cfgx;
- count = ARRAY_SIZE(irq_cfgx);
- node= cpu_to_node(boot_cpu_id);
-
- for (i = 0; i < count; i++) {
- desc = irq_to_desc(i);
- desc->chip_data = &cfg[i];
- zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node);
- zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
- if (i < NR_IRQS_LEGACY)
- cpumask_setall(cfg[i].domain);
- }
-
- return 0;
+ return gsi_cfg->gsi_end - gsi_cfg->gsi_base + 1;
}
-#ifdef CONFIG_SPARSE_IRQ
-static struct irq_cfg *irq_cfg(unsigned int irq)
+static inline u32 mp_pin_to_gsi(int ioapic, int pin)
{
- struct irq_cfg *cfg = NULL;
- struct irq_desc *desc;
-
- desc = irq_to_desc(irq);
- if (desc)
- cfg = desc->chip_data;
-
- return cfg;
+ return mp_ioapic_gsi_routing(ioapic)->gsi_base + pin;
}
-static struct irq_cfg *get_one_free_irq_cfg(int node)
+static inline bool mp_is_legacy_irq(int irq)
{
- struct irq_cfg *cfg;
-
- cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
- if (cfg) {
- if (!alloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) {
- kfree(cfg);
- cfg = NULL;
- } else if (!alloc_cpumask_var_node(&cfg->old_domain,
- GFP_ATOMIC, node)) {
- free_cpumask_var(cfg->domain);
- kfree(cfg);
- cfg = NULL;
- } else {
- cpumask_clear(cfg->domain);
- cpumask_clear(cfg->old_domain);
- }
- }
-
- return cfg;
+ return irq >= 0 && irq < nr_legacy_irqs();
}
-int arch_init_chip_data(struct irq_desc *desc, int node)
+static inline struct irq_domain *mp_ioapic_irqdomain(int ioapic)
{
- struct irq_cfg *cfg;
+ return ioapics[ioapic].irqdomain;
+}
- cfg = desc->chip_data;
- if (!cfg) {
- desc->chip_data = get_one_free_irq_cfg(node);
- if (!desc->chip_data) {
- printk(KERN_ERR "can not alloc irq_cfg\n");
- BUG_ON(1);
- }
- }
+int nr_ioapics;
- return 0;
-}
+/* The one past the highest gsi number used */
+u32 gsi_top;
-/* for move_irq_desc */
-static void
-init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int node)
-{
- struct irq_pin_list *old_entry, *head, *tail, *entry;
+/* MP IRQ source entries */
+struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
- cfg->irq_2_pin = NULL;
- old_entry = old_cfg->irq_2_pin;
- if (!old_entry)
- return;
+/* # of MP IRQ source entries */
+int mp_irq_entries;
- entry = get_one_free_irq_2_pin(node);
- if (!entry)
- return;
+#ifdef CONFIG_EISA
+int mp_bus_id_to_type[MAX_MP_BUSSES];
+#endif
- entry->apic = old_entry->apic;
- entry->pin = old_entry->pin;
- head = entry;
- tail = entry;
- old_entry = old_entry->next;
- while (old_entry) {
- entry = get_one_free_irq_2_pin(node);
- if (!entry) {
- entry = head;
- while (entry) {
- head = entry->next;
- kfree(entry);
- entry = head;
- }
- /* still use the old one */
- return;
- }
- entry->apic = old_entry->apic;
- entry->pin = old_entry->pin;
- tail->next = entry;
- tail = entry;
- old_entry = old_entry->next;
- }
+DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
- tail->next = NULL;
- cfg->irq_2_pin = head;
+bool ioapic_is_disabled __ro_after_init;
+
+/**
+ * disable_ioapic_support() - disables ioapic support at runtime
+ */
+void disable_ioapic_support(void)
+{
+#ifdef CONFIG_PCI
+ noioapicquirk = 1;
+ noioapicreroute = -1;
+#endif
+ ioapic_is_disabled = true;
}
-static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg)
+static int __init parse_noapic(char *str)
{
- struct irq_pin_list *entry, *next;
+ /* disable IO-APIC */
+ disable_ioapic_support();
+ return 0;
+}
+early_param("noapic", parse_noapic);
- if (old_cfg->irq_2_pin == cfg->irq_2_pin)
- return;
+/* Called from the mpparse/ACPI code to save IRQ info */
+void mp_save_irq(struct mpc_intsrc *m)
+{
+ int i;
- entry = old_cfg->irq_2_pin;
+ apic_pr_verbose("Int: type %d, pol %d, trig %d, bus %02x, IRQ %02x, APIC ID %x, APIC INT %02x\n",
+ m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus,
+ m->srcbusirq, m->dstapic, m->dstirq);
- while (entry) {
- next = entry->next;
- kfree(entry);
- entry = next;
+ for (i = 0; i < mp_irq_entries; i++) {
+ if (!memcmp(&mp_irqs[i], m, sizeof(*m)))
+ return;
}
- old_cfg->irq_2_pin = NULL;
+
+ memcpy(&mp_irqs[mp_irq_entries], m, sizeof(*m));
+ if (++mp_irq_entries == MAX_IRQ_SOURCES)
+ panic("Max # of irq sources exceeded!!\n");
}
-void arch_init_copy_chip_data(struct irq_desc *old_desc,
- struct irq_desc *desc, int node)
+static void alloc_ioapic_saved_registers(int idx)
{
- struct irq_cfg *cfg;
- struct irq_cfg *old_cfg;
-
- cfg = get_one_free_irq_cfg(node);
+ size_t size;
- if (!cfg)
+ if (ioapics[idx].saved_registers)
return;
- desc->chip_data = cfg;
-
- old_cfg = old_desc->chip_data;
-
- memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
-
- init_copy_irq_2_pin(old_cfg, cfg, node);
+ size = sizeof(struct IO_APIC_route_entry) * ioapics[idx].nr_registers;
+ ioapics[idx].saved_registers = kzalloc(size, GFP_KERNEL);
+ if (!ioapics[idx].saved_registers)
+ pr_err("IOAPIC %d: suspend/resume impossible!\n", idx);
}
-static void free_irq_cfg(struct irq_cfg *old_cfg)
+static void free_ioapic_saved_registers(int idx)
{
- kfree(old_cfg);
+ kfree(ioapics[idx].saved_registers);
+ ioapics[idx].saved_registers = NULL;
}
-void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
+int __init arch_early_ioapic_init(void)
{
- struct irq_cfg *old_cfg, *cfg;
-
- old_cfg = old_desc->chip_data;
- cfg = desc->chip_data;
+ int i;
- if (old_cfg == cfg)
- return;
+ if (!nr_legacy_irqs())
+ io_apic_irqs = ~0UL;
- if (old_cfg) {
- free_irq_2_pin(old_cfg, cfg);
- free_irq_cfg(old_cfg);
- old_desc->chip_data = NULL;
- }
-}
-/* end for move_irq_desc */
+ for_each_ioapic(i)
+ alloc_ioapic_saved_registers(i);
-#else
-static struct irq_cfg *irq_cfg(unsigned int irq)
-{
- return irq < nr_irqs ? irq_cfgx + irq : NULL;
+ return 0;
}
-#endif
-
struct io_apic {
unsigned int index;
unsigned int unused[3];
@@ -370,87 +258,47 @@ struct io_apic {
static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
{
return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
- + (mp_ioapics[idx].apicaddr & ~PAGE_MASK);
+ + (mpc_ioapic_addr(idx) & ~PAGE_MASK);
}
static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
{
struct io_apic __iomem *io_apic = io_apic_base(apic);
+
writel(vector, &io_apic->eoi);
}
-static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
+unsigned int native_io_apic_read(unsigned int apic, unsigned int reg)
{
struct io_apic __iomem *io_apic = io_apic_base(apic);
- writel(reg, &io_apic->index);
- return readl(&io_apic->data);
-}
-static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
-{
- struct io_apic __iomem *io_apic = io_apic_base(apic);
writel(reg, &io_apic->index);
- writel(value, &io_apic->data);
+ return readl(&io_apic->data);
}
-/*
- * Re-write a value: to be used for read-modify-write
- * cycles where the read already set up the index register.
- *
- * Older SiS APIC requires we rewrite the index register
- */
-static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
+static void io_apic_write(unsigned int apic, unsigned int reg,
+ unsigned int value)
{
struct io_apic __iomem *io_apic = io_apic_base(apic);
- if (sis_apic_bug)
- writel(reg, &io_apic->index);
+ writel(reg, &io_apic->index);
writel(value, &io_apic->data);
}
-static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
+static struct IO_APIC_route_entry __ioapic_read_entry(int apic, int pin)
{
- struct irq_pin_list *entry;
- unsigned long flags;
-
- spin_lock_irqsave(&ioapic_lock, flags);
- entry = cfg->irq_2_pin;
- for (;;) {
- unsigned int reg;
- int pin;
+ struct IO_APIC_route_entry entry;
- if (!entry)
- break;
- pin = entry->pin;
- reg = io_apic_read(entry->apic, 0x10 + pin*2);
- /* Is the remote IRR bit set? */
- if (reg & IO_APIC_REDIR_REMOTE_IRR) {
- spin_unlock_irqrestore(&ioapic_lock, flags);
- return true;
- }
- if (!entry->next)
- break;
- entry = entry->next;
- }
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ entry.w1 = io_apic_read(apic, 0x10 + 2 * pin);
+ entry.w2 = io_apic_read(apic, 0x11 + 2 * pin);
- return false;
+ return entry;
}
-union entry_union {
- struct { u32 w1, w2; };
- struct IO_APIC_route_entry entry;
-};
-
static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
{
- union entry_union eu;
- unsigned long flags;
- spin_lock_irqsave(&ioapic_lock, flags);
- eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
- eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
- spin_unlock_irqrestore(&ioapic_lock, flags);
- return eu.entry;
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
+ return __ioapic_read_entry(apic, pin);
}
/*
@@ -459,22 +307,16 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
* the interrupt, and we need to make sure the entry is fully populated
* before that happens.
*/
-static void
-__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+static void __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
{
- union entry_union eu = {{0, 0}};
-
- eu.entry = e;
- io_apic_write(apic, 0x11 + 2*pin, eu.w2);
- io_apic_write(apic, 0x10 + 2*pin, eu.w1);
+ io_apic_write(apic, 0x11 + 2*pin, e.w2);
+ io_apic_write(apic, 0x10 + 2*pin, e.w1);
}
-void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
{
- unsigned long flags;
- spin_lock_irqsave(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
__ioapic_write_entry(apic, pin, e);
- spin_unlock_irqrestore(&ioapic_lock, flags);
}
/*
@@ -484,13 +326,11 @@ void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
*/
static void ioapic_mask_entry(int apic, int pin)
{
- unsigned long flags;
- union entry_union eu = { .entry.mask = 1 };
+ struct IO_APIC_route_entry e = { .masked = true };
- spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(apic, 0x10 + 2*pin, eu.w1);
- io_apic_write(apic, 0x11 + 2*pin, eu.w2);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
+ io_apic_write(apic, 0x10 + 2*pin, e.w1);
+ io_apic_write(apic, 0x11 + 2*pin, e.w2);
}
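
Two deliberate write orders are visible in the rewritten entry helpers: __ioapic_write_entry() stores the high word (w2, the destination) before the low word (w1, which carries vector and mask bit), so a write that unmasks the pin only takes effect once the whole entry is populated, while ioapic_mask_entry() stores w1 first because setting the mask bit immediately is exactly what it wants. A compressed sketch of that rule, with plain array stores standing in for io_apic_write() (an assumption for illustration only):

#include <stdint.h>
#include <stdio.h>

struct rte { uint32_t w1, w2; };	/* w1: vector + mask bit 16, w2: destination */

static uint32_t regs[2];		/* stand-in for the two RTE registers */

static void write_entry(struct rte e)
{
	regs[1] = e.w2;		/* destination first ... */
	regs[0] = e.w1;		/* ... then vector/mask, possibly unmasking */
}

static void mask_entry(void)
{
	struct rte e = { .w1 = 1u << 16 };	/* mask bit set, rest cleared */

	regs[0] = e.w1;		/* the mask takes effect immediately */
	regs[1] = e.w2;
}

int main(void)
{
	write_entry((struct rte){ .w1 = 0x30, .w2 = 0x01000000 });
	printf("entry:  %08x %08x\n", (unsigned)regs[0], (unsigned)regs[1]);
	mask_entry();
	printf("masked: %08x %08x\n", (unsigned)regs[0], (unsigned)regs[1]);
	return 0;
}
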
/*
@@ -498,156 +338,131 @@ static void ioapic_mask_entry(int apic, int pin)
* shared ISA-space IRQs, so we have to support them. We are super
* fast in the common case, and fast for shared ISA-space IRQs.
*/
-static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
+static bool add_pin_to_irq_node(struct mp_chip_data *data, int node, int apic, int pin)
{
struct irq_pin_list *entry;
- entry = cfg->irq_2_pin;
- if (!entry) {
- entry = get_one_free_irq_2_pin(node);
- if (!entry) {
- printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n",
- apic, pin);
- return;
- }
- cfg->irq_2_pin = entry;
- entry->apic = apic;
- entry->pin = pin;
- return;
- }
-
- while (entry->next) {
- /* not again, please */
+ /* Don't allow duplicates */
+ for_each_irq_pin(entry, data->irq_2_pin) {
if (entry->apic == apic && entry->pin == pin)
- return;
+ return true;
+ }
- entry = entry->next;
+ entry = kzalloc_node(sizeof(struct irq_pin_list), GFP_ATOMIC, node);
+ if (!entry) {
+ pr_err("Cannot allocate irq_pin_list (%d,%d,%d)\n", node, apic, pin);
+ return false;
}
- entry->next = get_one_free_irq_2_pin(node);
- entry = entry->next;
entry->apic = apic;
entry->pin = pin;
+ list_add_tail(&entry->list, &data->irq_2_pin);
+ return true;
}
-/*
- * Reroute an IRQ to a different pin.
- */
-static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,
- int oldapic, int oldpin,
- int newapic, int newpin)
+static void __remove_pin_from_irq(struct mp_chip_data *data, int apic, int pin)
{
- struct irq_pin_list *entry = cfg->irq_2_pin;
- int replaced = 0;
+ struct irq_pin_list *tmp, *entry;
- while (entry) {
- if (entry->apic == oldapic && entry->pin == oldpin) {
- entry->apic = newapic;
- entry->pin = newpin;
- replaced = 1;
- /* every one is different, right? */
- break;
+ list_for_each_entry_safe(entry, tmp, &data->irq_2_pin, list) {
+ if (entry->apic == apic && entry->pin == pin) {
+ list_del(&entry->list);
+ kfree(entry);
+ return;
}
- entry = entry->next;
}
-
- /* why? call replace before add? */
- if (!replaced)
- add_pin_to_irq_node(cfg, node, newapic, newpin);
}
-static inline void io_apic_modify_irq(struct irq_cfg *cfg,
- int mask_and, int mask_or,
- void (*final)(struct irq_pin_list *entry))
+static void io_apic_modify_irq(struct mp_chip_data *data, bool masked,
+ void (*final)(struct irq_pin_list *entry))
{
- int pin;
struct irq_pin_list *entry;
- for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) {
- unsigned int reg;
- pin = entry->pin;
- reg = io_apic_read(entry->apic, 0x10 + pin * 2);
- reg &= mask_and;
- reg |= mask_or;
- io_apic_modify(entry->apic, 0x10 + pin * 2, reg);
+ data->entry.masked = masked;
+
+ for_each_irq_pin(entry, data->irq_2_pin) {
+ io_apic_write(entry->apic, 0x10 + 2 * entry->pin, data->entry.w1);
if (final)
final(entry);
}
}
-static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
-{
- io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
-}
-
-#ifdef CONFIG_X86_64
+/*
+ * Synchronize the IO-APIC and the CPU by doing a dummy read from the
+ * IO-APIC
+ */
static void io_apic_sync(struct irq_pin_list *entry)
{
- /*
- * Synchronize the IO-APIC and the CPU by doing
- * a dummy read from the IO-APIC
- */
struct io_apic __iomem *io_apic;
+
io_apic = io_apic_base(entry->apic);
readl(&io_apic->data);
}
-static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
-{
- io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
-}
-#else /* CONFIG_X86_32 */
-static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
+static void mask_ioapic_irq(struct irq_data *irq_data)
{
- io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL);
-}
+ struct mp_chip_data *data = irq_data->chip_data;
-static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg)
-{
- io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER,
- IO_APIC_REDIR_MASKED, NULL);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
+ io_apic_modify_irq(data, true, &io_apic_sync);
}
-static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg)
+static void __unmask_ioapic(struct mp_chip_data *data)
{
- io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED,
- IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
+ io_apic_modify_irq(data, false, NULL);
}
-#endif /* CONFIG_X86_32 */
-static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
+static void unmask_ioapic_irq(struct irq_data *irq_data)
{
- struct irq_cfg *cfg = desc->chip_data;
- unsigned long flags;
-
- BUG_ON(!cfg);
+ struct mp_chip_data *data = irq_data->chip_data;
- spin_lock_irqsave(&ioapic_lock, flags);
- __mask_IO_APIC_irq(cfg);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
+ __unmask_ioapic(data);
}
-static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)
+/*
+ * IO-APIC versions below 0x20 don't support the EOI register.
+ * For the record, here is the information about various versions:
+ * 0Xh 82489DX
+ * 1Xh I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant
+ * 2Xh I/O(x)APIC which is PCI 2.2 Compliant
+ * 30h-FFh Reserved
+ *
+ * Some of the Intel ICH specs (ICH2 to ICH5) document the IO-APIC
+ * version as 0x2. This is a documentation error; these ICH chips
+ * actually use IO-APICs of version 0x20.
+ *
+ * For IO-APIC's with EOI register, we use that to do an explicit EOI.
+ * Otherwise, we simulate the EOI message manually by changing the trigger
+ * mode to edge and then back to level, with RTE being masked during this.
+ */
+static void __eoi_ioapic_pin(int apic, int pin, int vector)
{
- struct irq_cfg *cfg = desc->chip_data;
- unsigned long flags;
+ if (mpc_ioapic_ver(apic) >= 0x20) {
+ io_apic_eoi(apic, vector);
+ } else {
+ struct IO_APIC_route_entry entry, entry1;
- spin_lock_irqsave(&ioapic_lock, flags);
- __unmask_IO_APIC_irq(cfg);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-}
+ entry = entry1 = __ioapic_read_entry(apic, pin);
-static void mask_IO_APIC_irq(unsigned int irq)
-{
- struct irq_desc *desc = irq_to_desc(irq);
+ /* Mask the entry and change the trigger mode to edge. */
+ entry1.masked = true;
+ entry1.is_level = false;
+
+ __ioapic_write_entry(apic, pin, entry1);
- mask_IO_APIC_irq_desc(desc);
+ /* Restore the previous level triggered entry. */
+ __ioapic_write_entry(apic, pin, entry);
+ }
}
-static void unmask_IO_APIC_irq(unsigned int irq)
+
+static void eoi_ioapic_pin(int vector, struct mp_chip_data *data)
{
- struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_pin_list *entry;
- unmask_IO_APIC_irq_desc(desc);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
+ for_each_irq_pin(entry, data->irq_2_pin)
+ __eoi_ioapic_pin(entry->apic, entry->pin, vector);
}
static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
@@ -656,21 +471,50 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
/* Check delivery_mode to be sure we're not clearing an SMI pin */
entry = ioapic_read_entry(apic, pin);
- if (entry.delivery_mode == dest_SMI)
+ if (entry.delivery_mode == APIC_DELIVERY_MODE_SMI)
return;
+
/*
- * Disable it in the IO-APIC irq-routing table:
+ * Make sure the entry is masked and re-read the contents to check
+ * if it is a level triggered pin and if the remote-IRR is set.
+ */
+ if (!entry.masked) {
+ entry.masked = true;
+ ioapic_write_entry(apic, pin, entry);
+ entry = ioapic_read_entry(apic, pin);
+ }
+
+ if (entry.irr) {
+ /*
+ * Make sure the trigger mode is set to level. Explicit EOI
+ * doesn't clear the remote-IRR if the trigger mode is not
+ * set to level.
+ */
+ if (!entry.is_level) {
+ entry.is_level = true;
+ ioapic_write_entry(apic, pin, entry);
+ }
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
+ __eoi_ioapic_pin(apic, pin, entry.vector);
+ }
+
+ /*
+ * Clear the rest of the bits in the IO-APIC RTE except for the mask
+ * bit.
*/
ioapic_mask_entry(apic, pin);
+ entry = ioapic_read_entry(apic, pin);
+ if (entry.irr)
+ pr_err("Unable to reset IRR for apic: %d, pin :%d\n",
+ mpc_ioapic_id(apic), pin);
}
-static void clear_IO_APIC (void)
+void clear_IO_APIC(void)
{
int apic, pin;
- for (apic = 0; apic < nr_ioapics; apic++)
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
- clear_IO_APIC_pin(apic, pin);
+ for_each_ioapic_pin(apic, pin)
+ clear_IO_APIC_pin(apic, pin);
}
#ifdef CONFIG_X86_32
@@ -686,101 +530,64 @@ static int pirq_entries[MAX_PIRQS] = {
static int __init ioapic_pirq_setup(char *str)
{
- int i, max;
- int ints[MAX_PIRQS+1];
+ int i, max, ints[MAX_PIRQS+1];
get_options(str, ARRAY_SIZE(ints), ints);
- apic_printk(APIC_VERBOSE, KERN_INFO
- "PIRQ redirection, working around broken MP-BIOS.\n");
+ apic_pr_verbose("PIRQ redirection, working around broken MP-BIOS.\n");
+
max = MAX_PIRQS;
if (ints[0] < MAX_PIRQS)
max = ints[0];
for (i = 0; i < max; i++) {
- apic_printk(APIC_VERBOSE, KERN_DEBUG
- "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
- /*
- * PIRQs are mapped upside down, usually.
- */
+ apic_pr_verbose("... PIRQ%d -> IRQ %d\n", i, ints[i + 1]);
+ /* PIRQs are mapped upside down, usually */
pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
}
return 1;
}
-
__setup("pirq=", ioapic_pirq_setup);
#endif /* CONFIG_X86_32 */
-struct IO_APIC_route_entry **alloc_ioapic_entries(void)
-{
- int apic;
- struct IO_APIC_route_entry **ioapic_entries;
-
- ioapic_entries = kzalloc(sizeof(*ioapic_entries) * nr_ioapics,
- GFP_ATOMIC);
- if (!ioapic_entries)
- return 0;
-
- for (apic = 0; apic < nr_ioapics; apic++) {
- ioapic_entries[apic] =
- kzalloc(sizeof(struct IO_APIC_route_entry) *
- nr_ioapic_registers[apic], GFP_ATOMIC);
- if (!ioapic_entries[apic])
- goto nomem;
- }
-
- return ioapic_entries;
-
-nomem:
- while (--apic >= 0)
- kfree(ioapic_entries[apic]);
- kfree(ioapic_entries);
-
- return 0;
-}
-
/*
* Saves all the IO-APIC RTE's
*/
-int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
+int save_ioapic_entries(void)
{
int apic, pin;
+ int err = 0;
- if (!ioapic_entries)
- return -ENOMEM;
-
- for (apic = 0; apic < nr_ioapics; apic++) {
- if (!ioapic_entries[apic])
- return -ENOMEM;
+ for_each_ioapic(apic) {
+ if (!ioapics[apic].saved_registers) {
+ err = -ENOMEM;
+ continue;
+ }
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
- ioapic_entries[apic][pin] =
- ioapic_read_entry(apic, pin);
+ for_each_pin(apic, pin)
+ ioapics[apic].saved_registers[pin] = ioapic_read_entry(apic, pin);
}
- return 0;
+ return err;
}
/*
* Mask all IO APIC entries.
*/
-void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
+void mask_ioapic_entries(void)
{
int apic, pin;
- if (!ioapic_entries)
- return;
-
- for (apic = 0; apic < nr_ioapics; apic++) {
- if (!ioapic_entries[apic])
- break;
+ for_each_ioapic(apic) {
+ if (!ioapics[apic].saved_registers)
+ continue;
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+ for_each_pin(apic, pin) {
struct IO_APIC_route_entry entry;
- entry = ioapic_entries[apic][pin];
- if (!entry.mask) {
- entry.mask = 1;
+ entry = ioapics[apic].saved_registers[pin];
+ if (!entry.masked) {
+ entry.masked = true;
ioapic_write_entry(apic, pin, entry);
}
}
@@ -788,49 +595,36 @@ void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
}
/*
- * Restore IO APIC entries which was saved in ioapic_entries.
+ * Restore IO APIC entries which were saved in the ioapic structure.
*/
-int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
+int restore_ioapic_entries(void)
{
int apic, pin;
- if (!ioapic_entries)
- return -ENOMEM;
-
- for (apic = 0; apic < nr_ioapics; apic++) {
- if (!ioapic_entries[apic])
- return -ENOMEM;
+ for_each_ioapic(apic) {
+ if (!ioapics[apic].saved_registers)
+ continue;
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
- ioapic_write_entry(apic, pin,
- ioapic_entries[apic][pin]);
+ for_each_pin(apic, pin)
+ ioapic_write_entry(apic, pin, ioapics[apic].saved_registers[pin]);
}
return 0;
}
-void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries)
-{
- int apic;
-
- for (apic = 0; apic < nr_ioapics; apic++)
- kfree(ioapic_entries[apic]);
-
- kfree(ioapic_entries);
-}
-
/*
* Find the IRQ entry number of a certain pin.
*/
-static int find_irq_entry(int apic, int pin, int type)
+static int find_irq_entry(int ioapic_idx, int pin, int type)
{
int i;
- for (i = 0; i < mp_irq_entries; i++)
+ for (i = 0; i < mp_irq_entries; i++) {
if (mp_irqs[i].irqtype == type &&
- (mp_irqs[i].dstapic == mp_ioapics[apic].apicid ||
+ (mp_irqs[i].dstapic == mpc_ioapic_id(ioapic_idx) ||
mp_irqs[i].dstapic == MP_APIC_ALL) &&
mp_irqs[i].dstirq == pin)
return i;
+ }
return -1;
}
@@ -845,10 +639,8 @@ static int __init find_isa_irq_pin(int irq, int type)
for (i = 0; i < mp_irq_entries; i++) {
int lbus = mp_irqs[i].srcbus;
- if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].irqtype == type) &&
+ if (test_bit(lbus, mp_bus_not_pci) && (mp_irqs[i].irqtype == type) &&
(mp_irqs[i].srcbusirq == irq))
-
return mp_irqs[i].dstirq;
}
return -1;
@@ -861,789 +653,549 @@ static int __init find_isa_irq_apic(int irq, int type)
for (i = 0; i < mp_irq_entries; i++) {
int lbus = mp_irqs[i].srcbus;
- if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].irqtype == type) &&
+ if (test_bit(lbus, mp_bus_not_pci) && (mp_irqs[i].irqtype == type) &&
(mp_irqs[i].srcbusirq == irq))
break;
}
+
if (i < mp_irq_entries) {
- int apic;
- for(apic = 0; apic < nr_ioapics; apic++) {
- if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic)
- return apic;
+ int ioapic_idx;
+
+ for_each_ioapic(ioapic_idx) {
+ if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic)
+ return ioapic_idx;
}
}
return -1;
}
-#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
-/*
- * EISA Edge/Level control register, ELCR
- */
-static int EISA_ELCR(unsigned int irq)
-{
- if (irq < NR_IRQS_LEGACY) {
- unsigned int port = 0x4d0 + (irq >> 3);
- return (inb(port) >> (irq & 7)) & 1;
- }
- apic_printk(APIC_VERBOSE, KERN_INFO
- "Broken MPtable reports ISA irq %d\n", irq);
- return 0;
-}
-
-#endif
-
-/* ISA interrupts are always polarity zero edge triggered,
- * when listed as conforming in the MP table. */
-
-#define default_ISA_trigger(idx) (0)
-#define default_ISA_polarity(idx) (0)
-
-/* EISA interrupts are always polarity zero and can be edge or level
- * trigger depending on the ELCR value. If an interrupt is listed as
- * EISA conforming in the MP table, that means its trigger type must
- * be read in from the ELCR */
-
-#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].srcbusirq))
-#define default_EISA_polarity(idx) default_ISA_polarity(idx)
-
-/* PCI interrupts are always polarity one level triggered,
- * when listed as conforming in the MP table. */
-
-#define default_PCI_trigger(idx) (1)
-#define default_PCI_polarity(idx) (1)
-
-/* MCA interrupts are always polarity zero level triggered,
- * when listed as conforming in the MP table. */
-
-#define default_MCA_trigger(idx) (1)
-#define default_MCA_polarity(idx) default_ISA_polarity(idx)
-
-static int MPBIOS_polarity(int idx)
+static bool irq_active_low(int idx)
{
int bus = mp_irqs[idx].srcbus;
- int polarity;
/*
* Determine IRQ line polarity (high active or low active):
*/
- switch (mp_irqs[idx].irqflag & 3)
- {
- case 0: /* conforms, ie. bus-type dependent polarity */
- if (test_bit(bus, mp_bus_not_pci))
- polarity = default_ISA_polarity(idx);
- else
- polarity = default_PCI_polarity(idx);
- break;
- case 1: /* high active */
- {
- polarity = 0;
- break;
- }
- case 2: /* reserved */
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- polarity = 1;
- break;
- }
- case 3: /* low active */
- {
- polarity = 1;
- break;
- }
- default: /* invalid */
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- polarity = 1;
- break;
- }
+ switch (mp_irqs[idx].irqflag & MP_IRQPOL_MASK) {
+ case MP_IRQPOL_DEFAULT:
+ /*
+ * Conforms to spec, ie. bus-type dependent polarity. PCI
+ * defaults to low active. [E]ISA defaults to high active.
+ */
+ return !test_bit(bus, mp_bus_not_pci);
+ case MP_IRQPOL_ACTIVE_HIGH:
+ return false;
+ case MP_IRQPOL_RESERVED:
+ pr_warn("IOAPIC: Invalid polarity: 2, defaulting to low\n");
+ fallthrough;
+ case MP_IRQPOL_ACTIVE_LOW:
+ default: /* Pointless default required due to gcc stupidity */
+ return true;
}
- return polarity;
}
-static int MPBIOS_trigger(int idx)
+#ifdef CONFIG_EISA
+/*
+ * EISA Edge/Level control register, ELCR
+ */
+static bool EISA_ELCR(unsigned int irq)
{
- int bus = mp_irqs[idx].srcbus;
- int trigger;
-
- /*
- * Determine IRQ trigger mode (edge or level sensitive):
- */
- switch ((mp_irqs[idx].irqflag>>2) & 3)
- {
- case 0: /* conforms, ie. bus-type dependent */
- if (test_bit(bus, mp_bus_not_pci))
- trigger = default_ISA_trigger(idx);
- else
- trigger = default_PCI_trigger(idx);
-#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
- switch (mp_bus_id_to_type[bus]) {
- case MP_BUS_ISA: /* ISA pin */
- {
- /* set before the switch */
- break;
- }
- case MP_BUS_EISA: /* EISA pin */
- {
- trigger = default_EISA_trigger(idx);
- break;
- }
- case MP_BUS_PCI: /* PCI pin */
- {
- /* set before the switch */
- break;
- }
- case MP_BUS_MCA: /* MCA pin */
- {
- trigger = default_MCA_trigger(idx);
- break;
- }
- default:
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- trigger = 1;
- break;
- }
- }
-#endif
- break;
- case 1: /* edge */
- {
- trigger = 0;
- break;
- }
- case 2: /* reserved */
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- trigger = 1;
- break;
- }
- case 3: /* level */
- {
- trigger = 1;
- break;
- }
- default: /* invalid */
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- trigger = 0;
- break;
- }
+ if (irq < nr_legacy_irqs()) {
+ unsigned int port = PIC_ELCR1 + (irq >> 3);
+ return (inb(port) >> (irq & 7)) & 1;
}
- return trigger;
+ apic_pr_verbose("Broken MPtable reports ISA irq %d\n", irq);
+ return false;
}
-static inline int irq_polarity(int idx)
+/*
+ * EISA interrupts are always active high and can be edge or level
+ * triggered depending on the ELCR value. If an interrupt is listed as
+ * EISA conforming in the MP table, that means its trigger type must be
+ * read in from the ELCR.
+ */
+static bool eisa_irq_is_level(int idx, int bus, bool level)
{
- return MPBIOS_polarity(idx);
+ switch (mp_bus_id_to_type[bus]) {
+ case MP_BUS_PCI:
+ case MP_BUS_ISA:
+ return level;
+ case MP_BUS_EISA:
+ return EISA_ELCR(mp_irqs[idx].srcbusirq);
+ }
+ pr_warn("IOAPIC: Invalid srcbus: %d defaulting to level\n", bus);
+ return true;
}
-
-static inline int irq_trigger(int idx)
+#else
+static inline bool eisa_irq_is_level(int idx, int bus, bool level)
{
- return MPBIOS_trigger(idx);
+ return level;
}
+#endif
-int (*ioapic_renumber_irq)(int ioapic, int irq);
-static int pin_2_irq(int idx, int apic, int pin)
+static bool irq_is_level(int idx)
{
- int irq, i;
int bus = mp_irqs[idx].srcbus;
+ bool level;
/*
- * Debugging check, we are in big trouble if this message pops up!
+ * Determine IRQ trigger mode (edge or level sensitive):
*/
- if (mp_irqs[idx].dstirq != pin)
- printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
-
- if (test_bit(bus, mp_bus_not_pci)) {
- irq = mp_irqs[idx].srcbusirq;
- } else {
+ switch (mp_irqs[idx].irqflag & MP_IRQTRIG_MASK) {
+ case MP_IRQTRIG_DEFAULT:
/*
- * PCI IRQs are mapped in order
+ * Conforms to spec, ie. bus-type dependent trigger
+ * mode. PCI defaults to level, ISA to edge.
*/
- i = irq = 0;
- while (i < apic)
- irq += nr_ioapic_registers[i++];
- irq += pin;
- /*
- * For MPS mode, so far only needed by ES7000 platform
- */
- if (ioapic_renumber_irq)
- irq = ioapic_renumber_irq(apic, irq);
- }
-
-#ifdef CONFIG_X86_32
- /*
- * PCI IRQ command line redirection. Yes, limits are hardcoded.
- */
- if ((pin >= 16) && (pin <= 23)) {
- if (pirq_entries[pin-16] != -1) {
- if (!pirq_entries[pin-16]) {
- apic_printk(APIC_VERBOSE, KERN_DEBUG
- "disabling PIRQ%d\n", pin-16);
- } else {
- irq = pirq_entries[pin-16];
- apic_printk(APIC_VERBOSE, KERN_DEBUG
- "using PIRQ%d -> IRQ %d\n",
- pin-16, irq);
- }
- }
+ level = !test_bit(bus, mp_bus_not_pci);
+ /* Take EISA into account */
+ return eisa_irq_is_level(idx, bus, level);
+ case MP_IRQTRIG_EDGE:
+ return false;
+ case MP_IRQTRIG_RESERVED:
+ pr_warn("IOAPIC: Invalid trigger mode 2 defaulting to level\n");
+ fallthrough;
+ case MP_IRQTRIG_LEVEL:
+ default: /* Pointless default required due to gcc stupidity */
+ return true;
}
-#endif
-
- return irq;
}
-/*
- * Find a specific PCI IRQ entry.
- * Not an __init, possibly needed by modules
- */
-int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
- struct io_apic_irq_attr *irq_attr)
+static int __acpi_get_override_irq(u32 gsi, bool *trigger, bool *polarity)
{
- int apic, i, best_guess = -1;
+ int ioapic, pin, idx;
- apic_printk(APIC_DEBUG,
- "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
- bus, slot, pin);
- if (test_bit(bus, mp_bus_not_pci)) {
- apic_printk(APIC_VERBOSE,
- "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
+ if (ioapic_is_disabled)
return -1;
- }
- for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].srcbus;
- for (apic = 0; apic < nr_ioapics; apic++)
- if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic ||
- mp_irqs[i].dstapic == MP_APIC_ALL)
- break;
+ ioapic = mp_find_ioapic(gsi);
+ if (ioapic < 0)
+ return -1;
- if (!test_bit(lbus, mp_bus_not_pci) &&
- !mp_irqs[i].irqtype &&
- (bus == lbus) &&
- (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
- int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq);
+ pin = mp_find_ioapic_pin(ioapic, gsi);
+ if (pin < 0)
+ return -1;
- if (!(apic || IO_APIC_IRQ(irq)))
- continue;
+ idx = find_irq_entry(ioapic, pin, mp_INT);
+ if (idx < 0)
+ return -1;
- if (pin == (mp_irqs[i].srcbusirq & 3)) {
- set_io_apic_irq_attr(irq_attr, apic,
- mp_irqs[i].dstirq,
- irq_trigger(i),
- irq_polarity(i));
- return irq;
- }
- /*
- * Use the first all-but-pin matching entry as a
- * best-guess fuzzy result for broken mptables.
- */
- if (best_guess < 0) {
- set_io_apic_irq_attr(irq_attr, apic,
- mp_irqs[i].dstirq,
- irq_trigger(i),
- irq_polarity(i));
- best_guess = irq;
- }
- }
- }
- return best_guess;
+ *trigger = irq_is_level(idx);
+ *polarity = irq_active_low(idx);
+ return 0;
}
-EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
-void lock_vector_lock(void)
+#ifdef CONFIG_ACPI
+int acpi_get_override_irq(u32 gsi, int *is_level, int *active_low)
{
- /* Used to the online set of cpus does not change
- * during assign_irq_vector.
- */
- spin_lock(&vector_lock);
+ *is_level = *active_low = 0;
+ return __acpi_get_override_irq(gsi, (bool *)is_level,
+ (bool *)active_low);
}
+#endif
-void unlock_vector_lock(void)
+void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node,
+ int trigger, int polarity)
{
- spin_unlock(&vector_lock);
+ init_irq_alloc_info(info, NULL);
+ info->type = X86_IRQ_ALLOC_TYPE_IOAPIC;
+ info->ioapic.node = node;
+ info->ioapic.is_level = trigger;
+ info->ioapic.active_low = polarity;
+ info->ioapic.valid = 1;
}
-static int
-__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
+static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst,
+ struct irq_alloc_info *src,
+ u32 gsi, int ioapic_idx, int pin)
{
- /*
- * NOTE! The local APIC isn't very good at handling
- * multiple interrupts at the same interrupt level.
- * As the interrupt level is determined by taking the
- * vector number and shifting that right by 4, we
- * want to spread these out a bit so that they don't
- * all fall in the same interrupt level.
- *
- * Also, we've got to be careful not to trash gate
- * 0x80, because int 0x80 is hm, kind of importantish. ;)
- */
- static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
- unsigned int old_vector;
- int cpu, err;
- cpumask_var_t tmp_mask;
-
- if ((cfg->move_in_progress) || cfg->move_cleanup_count)
- return -EBUSY;
-
- if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
- return -ENOMEM;
-
- old_vector = cfg->vector;
- if (old_vector) {
- cpumask_and(tmp_mask, mask, cpu_online_mask);
- cpumask_and(tmp_mask, cfg->domain, tmp_mask);
- if (!cpumask_empty(tmp_mask)) {
- free_cpumask_var(tmp_mask);
- return 0;
- }
- }
-
- /* Only try and allocate irqs on cpus that are present */
- err = -ENOSPC;
- for_each_cpu_and(cpu, mask, cpu_online_mask) {
- int new_cpu;
- int vector, offset;
-
- apic->vector_allocation_domain(cpu, tmp_mask);
-
- vector = current_vector;
- offset = current_offset;
-next:
- vector += 8;
- if (vector >= first_system_vector) {
- /* If out of vectors on large boxen, must share them. */
- offset = (offset + 1) % 8;
- vector = FIRST_DEVICE_VECTOR + offset;
- }
- if (unlikely(current_vector == vector))
- continue;
+ bool level, pol_low;
- if (test_bit(vector, used_vectors))
- goto next;
-
- for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
- if (per_cpu(vector_irq, new_cpu)[vector] != -1)
- goto next;
- /* Found one! */
- current_vector = vector;
- current_offset = offset;
- if (old_vector) {
- cfg->move_in_progress = 1;
- cpumask_copy(cfg->old_domain, cfg->domain);
+ copy_irq_alloc_info(dst, src);
+ dst->type = X86_IRQ_ALLOC_TYPE_IOAPIC;
+ dst->devid = mpc_ioapic_id(ioapic_idx);
+ dst->ioapic.pin = pin;
+ dst->ioapic.valid = 1;
+ if (src && src->ioapic.valid) {
+ dst->ioapic.node = src->ioapic.node;
+ dst->ioapic.is_level = src->ioapic.is_level;
+ dst->ioapic.active_low = src->ioapic.active_low;
+ } else {
+ dst->ioapic.node = NUMA_NO_NODE;
+ if (__acpi_get_override_irq(gsi, &level, &pol_low) >= 0) {
+ dst->ioapic.is_level = level;
+ dst->ioapic.active_low = pol_low;
+ } else {
+ /*
+ * PCI interrupts are always active low level
+ * triggered.
+ */
+ dst->ioapic.is_level = true;
+ dst->ioapic.active_low = true;
}
- for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
- per_cpu(vector_irq, new_cpu)[vector] = irq;
- cfg->vector = vector;
- cpumask_copy(cfg->domain, tmp_mask);
- err = 0;
- break;
}
- free_cpumask_var(tmp_mask);
- return err;
}
-static int
-assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
+static int ioapic_alloc_attr_node(struct irq_alloc_info *info)
{
- int err;
- unsigned long flags;
-
- spin_lock_irqsave(&vector_lock, flags);
- err = __assign_irq_vector(irq, cfg, mask);
- spin_unlock_irqrestore(&vector_lock, flags);
- return err;
+ return (info && info->ioapic.valid) ? info->ioapic.node : NUMA_NO_NODE;
}
-static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
+static void mp_register_handler(unsigned int irq, bool level)
{
- int cpu, vector;
-
- BUG_ON(!cfg->vector);
-
- vector = cfg->vector;
- for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
- per_cpu(vector_irq, cpu)[vector] = -1;
-
- cfg->vector = 0;
- cpumask_clear(cfg->domain);
+ irq_flow_handler_t hdl;
+ bool fasteoi;
- if (likely(!cfg->move_in_progress))
- return;
- for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
- for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
- vector++) {
- if (per_cpu(vector_irq, cpu)[vector] != irq)
- continue;
- per_cpu(vector_irq, cpu)[vector] = -1;
- break;
- }
+ if (level) {
+ irq_set_status_flags(irq, IRQ_LEVEL);
+ fasteoi = true;
+ } else {
+ irq_clear_status_flags(irq, IRQ_LEVEL);
+ fasteoi = false;
}
- cfg->move_in_progress = 0;
+
+ hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq;
+ __irq_set_handler(irq, hdl, 0, fasteoi ? "fasteoi" : "edge");
}
-void __setup_vector_irq(int cpu)
+static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info)
{
- /* Initialize vector_irq on a new cpu */
- /* This function must be called with vector_lock held */
- int irq, vector;
- struct irq_cfg *cfg;
- struct irq_desc *desc;
+ struct mp_chip_data *data = irq_get_chip_data(irq);
- /* Mark the inuse vectors */
- for_each_irq_desc(irq, desc) {
- cfg = desc->chip_data;
- if (!cpumask_test_cpu(cpu, cfg->domain))
- continue;
- vector = cfg->vector;
- per_cpu(vector_irq, cpu)[vector] = irq;
+ /*
+ * setup_IO_APIC_irqs() programs all legacy IRQs with default trigger
+ * and polarity attributes. So allow the first user to reprogram the
+ * pin with real trigger and polarity attributes.
+ */
+ if (irq < nr_legacy_irqs() && data->count == 1) {
+ if (info->ioapic.is_level != data->is_level)
+ mp_register_handler(irq, info->ioapic.is_level);
+ data->entry.is_level = data->is_level = info->ioapic.is_level;
+ data->entry.active_low = data->active_low = info->ioapic.active_low;
}
- /* Mark the free vectors */
- for (vector = 0; vector < NR_VECTORS; ++vector) {
- irq = per_cpu(vector_irq, cpu)[vector];
- if (irq < 0)
- continue;
- cfg = irq_cfg(irq);
- if (!cpumask_test_cpu(cpu, cfg->domain))
- per_cpu(vector_irq, cpu)[vector] = -1;
- }
+ return data->is_level == info->ioapic.is_level &&
+ data->active_low == info->ioapic.active_low;
}
-static struct irq_chip ioapic_chip;
-static struct irq_chip ir_ioapic_chip;
-
-#define IOAPIC_AUTO -1
-#define IOAPIC_EDGE 0
-#define IOAPIC_LEVEL 1
-
-#ifdef CONFIG_X86_32
-static inline int IO_APIC_irq_trigger(int irq)
+static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi,
+ struct irq_alloc_info *info)
{
- int apic, idx, pin;
+ int type = ioapics[ioapic].irqdomain_cfg.type;
+ bool legacy = false;
+ int irq = -1;
- for (apic = 0; apic < nr_ioapics; apic++) {
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
- idx = find_irq_entry(apic, pin, mp_INT);
- if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
- return irq_trigger(idx);
- }
+ switch (type) {
+ case IOAPIC_DOMAIN_LEGACY:
+ /*
+ * Dynamically allocate IRQ number for non-ISA IRQs in the first
+ * 16 GSIs on some weird platforms.
+ */
+ if (!ioapic_initialized || gsi >= nr_legacy_irqs())
+ irq = gsi;
+ legacy = mp_is_legacy_irq(irq);
+ break;
+ case IOAPIC_DOMAIN_STRICT:
+ irq = gsi;
+ break;
+ case IOAPIC_DOMAIN_DYNAMIC:
+ break;
+ default:
+ WARN(1, "ioapic: unknown irqdomain type %d\n", type);
+ return -1;
}
- /*
- * nonexistent IRQs are edge default
- */
- return 0;
-}
-#else
-static inline int IO_APIC_irq_trigger(int irq)
-{
- return 1;
+
+ return __irq_domain_alloc_irqs(domain, irq, 1, ioapic_alloc_attr_node(info),
+ info, legacy, NULL);
}
-#endif
-static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long trigger)
+/*
+ * Need special handling for ISA IRQs because there may be multiple IOAPIC pins
+ * sharing the same ISA IRQ number and irqdomain only supports 1:1 mapping
+ * between IOAPIC pin and IRQ number. A typical IOAPIC has 24 pins, pin 0-15 are
+ * used for legacy IRQs and pin 16-23 are used for PCI IRQs (PIRQ A-H).
+ * When ACPI is disabled, only legacy IRQ numbers (IRQ0-15) are available, and
+ * some BIOSes may use MP Interrupt Source records to override IRQ numbers for
+ * PIRQs instead of reprogramming the interrupt routing logic. Thus there may be
+ * multiple pins sharing the same legacy IRQ number when ACPI is disabled.
+ */
+static int alloc_isa_irq_from_domain(struct irq_domain *domain, int irq, int ioapic, int pin,
+ struct irq_alloc_info *info)
{
+ struct irq_data *irq_data = irq_get_irq_data(irq);
+ int node = ioapic_alloc_attr_node(info);
+ struct mp_chip_data *data;
- if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
- trigger == IOAPIC_LEVEL)
- desc->status |= IRQ_LEVEL;
- else
- desc->status &= ~IRQ_LEVEL;
-
- if (irq_remapped(irq)) {
- desc->status |= IRQ_MOVE_PCNTXT;
- if (trigger)
- set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
- handle_fasteoi_irq,
- "fasteoi");
- else
- set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
- handle_edge_irq, "edge");
- return;
+ /*
+ * Legacy ISA IRQ has already been allocated, just add pin to
+ * the pin list associated with this IRQ and program the IOAPIC
+ * entry.
+ */
+ if (irq_data && irq_data->parent_data) {
+ if (!mp_check_pin_attr(irq, info))
+ return -EBUSY;
+ if (!add_pin_to_irq_node(irq_data->chip_data, node, ioapic, info->ioapic.pin))
+ return -ENOMEM;
+ } else {
+ info->flags |= X86_IRQ_ALLOC_LEGACY;
+ irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true, NULL);
+ if (irq >= 0) {
+ irq_data = irq_domain_get_irq_data(domain, irq);
+ data = irq_data->chip_data;
+ data->isa_irq = true;
+ }
}
- if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
- trigger == IOAPIC_LEVEL)
- set_irq_chip_and_handler_name(irq, &ioapic_chip,
- handle_fasteoi_irq,
- "fasteoi");
- else
- set_irq_chip_and_handler_name(irq, &ioapic_chip,
- handle_edge_irq, "edge");
+ return irq;
}
-int setup_ioapic_entry(int apic_id, int irq,
- struct IO_APIC_route_entry *entry,
- unsigned int destination, int trigger,
- int polarity, int vector, int pin)
+static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin,
+ unsigned int flags, struct irq_alloc_info *info)
{
- /*
- * add it to the IO-APIC irq-routing table:
- */
- memset(entry,0,sizeof(*entry));
-
- if (intr_remapping_enabled) {
- struct intel_iommu *iommu = map_ioapic_to_ir(apic_id);
- struct irte irte;
- struct IR_IO_APIC_route_entry *ir_entry =
- (struct IR_IO_APIC_route_entry *) entry;
- int index;
-
- if (!iommu)
- panic("No mapping iommu for ioapic %d\n", apic_id);
-
- index = alloc_irte(iommu, irq, 1);
- if (index < 0)
- panic("Failed to allocate IRTE for ioapic %d\n", apic_id);
+ struct irq_domain *domain = mp_ioapic_irqdomain(ioapic);
+ struct irq_alloc_info tmp;
+ struct mp_chip_data *data;
+ bool legacy = false;
+ int irq;
- memset(&irte, 0, sizeof(irte));
+ if (!domain)
+ return -ENOSYS;
- irte.present = 1;
- irte.dst_mode = apic->irq_dest_mode;
+ if (idx >= 0 && test_bit(mp_irqs[idx].srcbus, mp_bus_not_pci)) {
+ irq = mp_irqs[idx].srcbusirq;
+ legacy = mp_is_legacy_irq(irq);
/*
- * Trigger mode in the IRTE will always be edge, and the
- * actual level or edge trigger will be setup in the IO-APIC
- * RTE. This will help simplify level triggered irq migration.
- * For more details, see the comments above explainig IO-APIC
- * irq migration in the presence of interrupt-remapping.
+ * IRQ2 is unusable for historical reasons on systems which
+ * have a legacy PIC. See the comment vs. IRQ2 further down.
+ *
+ * If this gets removed at some point then the related code
+ * in lapic_assign_system_vectors() needs to be adjusted as
+ * well.
*/
- irte.trigger_mode = 0;
- irte.dlvry_mode = apic->irq_delivery_mode;
- irte.vector = vector;
- irte.dest_id = IRTE_DEST(destination);
-
- /* Set source-id of interrupt request */
- set_ioapic_sid(&irte, apic_id);
-
- modify_irte(irq, &irte);
+ if (legacy && irq == PIC_CASCADE_IR)
+ return -EINVAL;
+ }
- ir_entry->index2 = (index >> 15) & 0x1;
- ir_entry->zero = 0;
- ir_entry->format = 1;
- ir_entry->index = (index & 0x7fff);
- /*
- * IO-APIC RTE will be configured with virtual vector.
- * irq handler will do the explicit EOI to the io-apic.
- */
- ir_entry->vector = pin;
+ guard(mutex)(&ioapic_mutex);
+ if (!(flags & IOAPIC_MAP_ALLOC)) {
+ if (!legacy) {
+ irq = irq_find_mapping(domain, pin);
+ if (irq == 0)
+ irq = -ENOENT;
+ }
} else {
- entry->delivery_mode = apic->irq_delivery_mode;
- entry->dest_mode = apic->irq_dest_mode;
- entry->dest = destination;
- entry->vector = vector;
+ ioapic_copy_alloc_attr(&tmp, info, gsi, ioapic, pin);
+ if (legacy)
+ irq = alloc_isa_irq_from_domain(domain, irq,
+ ioapic, pin, &tmp);
+ else if ((irq = irq_find_mapping(domain, pin)) == 0)
+ irq = alloc_irq_from_domain(domain, ioapic, gsi, &tmp);
+ else if (!mp_check_pin_attr(irq, &tmp))
+ irq = -EBUSY;
+ if (irq >= 0) {
+ data = irq_get_chip_data(irq);
+ data->count++;
+ }
}
-
- entry->mask = 0; /* enable IRQ */
- entry->trigger = trigger;
- entry->polarity = polarity;
-
- /* Mask level triggered irqs.
- * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
- */
- if (trigger)
- entry->mask = 1;
- return 0;
+ return irq;
}
-static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq_desc *desc,
- int trigger, int polarity)
+static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags)
{
- struct irq_cfg *cfg;
- struct IO_APIC_route_entry entry;
- unsigned int dest;
+ u32 gsi = mp_pin_to_gsi(ioapic, pin);
- if (!IO_APIC_IRQ(irq))
- return;
+ /* Debugging check, we are in big trouble if this message pops up! */
+ if (mp_irqs[idx].dstirq != pin)
+ pr_err("broken BIOS or MPTABLE parser, ayiee!!\n");
- cfg = desc->chip_data;
+#ifdef CONFIG_X86_32
+ /* PCI IRQ command line redirection. Yes, limits are hardcoded. */
+ if ((pin >= 16) && (pin <= 23)) {
+ if (pirq_entries[pin - 16] != -1) {
+ if (!pirq_entries[pin - 16]) {
+ apic_pr_verbose("Disabling PIRQ%d\n", pin - 16);
+ } else {
+ int irq = pirq_entries[pin-16];
- if (assign_irq_vector(irq, cfg, apic->target_cpus()))
- return;
+ apic_pr_verbose("Using PIRQ%d -> IRQ %d\n", pin - 16, irq);
+ return irq;
+ }
+ }
+ }
+#endif
- dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
+ return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, NULL);
+}
- apic_printk(APIC_VERBOSE,KERN_DEBUG
- "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
- "IRQ %d Mode:%i Active:%i)\n",
- apic_id, mp_ioapics[apic_id].apicid, pin, cfg->vector,
- irq, trigger, polarity);
+int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, struct irq_alloc_info *info)
+{
+ int ioapic, pin, idx;
+
+ ioapic = mp_find_ioapic(gsi);
+ if (ioapic < 0)
+ return -ENODEV;
+ pin = mp_find_ioapic_pin(ioapic, gsi);
+ idx = find_irq_entry(ioapic, pin, mp_INT);
+ if ((flags & IOAPIC_MAP_CHECK) && idx < 0)
+ return -ENODEV;
- if (setup_ioapic_entry(mp_ioapics[apic_id].apicid, irq, &entry,
- dest, trigger, polarity, cfg->vector, pin)) {
- printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
- mp_ioapics[apic_id].apicid, pin);
- __clear_irq_vector(irq, cfg);
+ return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, info);
+}
+
+void mp_unmap_irq(int irq)
+{
+ struct irq_data *irq_data = irq_get_irq_data(irq);
+ struct mp_chip_data *data;
+
+ if (!irq_data || !irq_data->domain)
return;
- }
- ioapic_register_intr(irq, desc, trigger);
- if (irq < NR_IRQS_LEGACY)
- disable_8259A_irq(irq);
+ data = irq_data->chip_data;
+ if (!data || data->isa_irq)
+ return;
- ioapic_write_entry(apic_id, pin, entry);
+ guard(mutex)(&ioapic_mutex);
+ if (--data->count == 0)
+ irq_domain_free_irqs(irq, 1);
}
-static struct {
- DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
-} mp_ioapic_routing[MAX_IO_APICS];
-
-static void __init setup_IO_APIC_irqs(void)
+/*
+ * Find a specific PCI IRQ entry.
+ * Not an __init, possibly needed by modules
+ */
+int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
{
- int apic_id = 0, pin, idx, irq;
- int notcon = 0;
- struct irq_desc *desc;
- struct irq_cfg *cfg;
- int node = cpu_to_node(boot_cpu_id);
-
- apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
+ int irq, i, best_ioapic = -1, best_idx = -1;
-#ifdef CONFIG_ACPI
- if (!acpi_disabled && acpi_ioapic) {
- apic_id = mp_find_ioapic(0);
- if (apic_id < 0)
- apic_id = 0;
+ apic_pr_debug("Querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
+ bus, slot, pin);
+ if (test_bit(bus, mp_bus_not_pci)) {
+ apic_pr_verbose("PCI BIOS passed nonexistent PCI bus %d!\n", bus);
+ return -1;
}
-#endif
- for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
- idx = find_irq_entry(apic_id, pin, mp_INT);
- if (idx == -1) {
- if (!notcon) {
- notcon = 1;
- apic_printk(APIC_VERBOSE,
- KERN_DEBUG " %d-%d",
- mp_ioapics[apic_id].apicid, pin);
- } else
- apic_printk(APIC_VERBOSE, " %d-%d",
- mp_ioapics[apic_id].apicid, pin);
- continue;
- }
- if (notcon) {
- apic_printk(APIC_VERBOSE,
- " (apicid-pin) not connected\n");
- notcon = 0;
- }
+ for (i = 0; i < mp_irq_entries; i++) {
+ int lbus = mp_irqs[i].srcbus;
+ int ioapic_idx, found = 0;
- irq = pin_2_irq(idx, apic_id, pin);
+ if (bus != lbus || mp_irqs[i].irqtype != mp_INT ||
+ slot != ((mp_irqs[i].srcbusirq >> 2) & 0x1f))
+ continue;
- /*
- * Skip the timer IRQ if there's a quirk handler
- * installed and if it returns 1:
- */
- if (apic->multi_timer_check &&
- apic->multi_timer_check(apic_id, irq))
+ for_each_ioapic(ioapic_idx)
+ if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic ||
+ mp_irqs[i].dstapic == MP_APIC_ALL) {
+ found = 1;
+ break;
+ }
+ if (!found)
continue;
- desc = irq_to_desc_alloc_node(irq, node);
- if (!desc) {
- printk(KERN_INFO "can not get irq_desc for %d\n", irq);
+ /* Skip ISA IRQs */
+ irq = pin_2_irq(i, ioapic_idx, mp_irqs[i].dstirq, 0);
+ if (irq > 0 && !IO_APIC_IRQ(irq))
continue;
+
+ if (pin == (mp_irqs[i].srcbusirq & 3)) {
+ best_idx = i;
+ best_ioapic = ioapic_idx;
+ goto out;
}
- cfg = desc->chip_data;
- add_pin_to_irq_node(cfg, node, apic_id, pin);
+
/*
- * don't mark it in pin_programmed, so later acpi could
- * set it correctly when irq < 16
+ * Use the first all-but-pin matching entry as a
+ * best-guess fuzzy result for broken mptables.
*/
- setup_IO_APIC_irq(apic_id, pin, irq, desc,
- irq_trigger(idx), irq_polarity(idx));
+ if (best_idx < 0) {
+ best_idx = i;
+ best_ioapic = ioapic_idx;
+ }
}
+ if (best_idx < 0)
+ return -1;
- if (notcon)
- apic_printk(APIC_VERBOSE,
- " (apicid-pin) not connected\n");
+out:
+ return pin_2_irq(best_idx, best_ioapic, mp_irqs[best_idx].dstirq, IOAPIC_MAP_ALLOC);
}
+EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
-/*
- * Set up the timer pin, possibly with the 8259A-master behind.
- */
-static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin,
- int vector)
+static struct irq_chip ioapic_chip, ioapic_ir_chip;
+
+static void __init setup_IO_APIC_irqs(void)
{
- struct IO_APIC_route_entry entry;
+ unsigned int ioapic, pin;
+ int idx;
- if (intr_remapping_enabled)
- return;
+ apic_pr_verbose("Init IO_APIC IRQs\n");
- memset(&entry, 0, sizeof(entry));
+ for_each_ioapic_pin(ioapic, pin) {
+ idx = find_irq_entry(ioapic, pin, mp_INT);
+ if (idx < 0) {
+ apic_pr_verbose("apic %d pin %d not connected\n",
+ mpc_ioapic_id(ioapic), pin);
+ } else {
+ pin_2_irq(idx, ioapic, pin, ioapic ? 0 : IOAPIC_MAP_ALLOC);
+ }
+ }
+}
- /*
- * We use logical delivery to get the timer IRQ
- * to the first CPU.
- */
- entry.dest_mode = apic->irq_dest_mode;
- entry.mask = 0; /* don't mask IRQ for edge */
- entry.dest = apic->cpu_mask_to_apicid(apic->target_cpus());
- entry.delivery_mode = apic->irq_delivery_mode;
- entry.polarity = 0;
- entry.trigger = 0;
- entry.vector = vector;
+void ioapic_zap_locks(void)
+{
+ raw_spin_lock_init(&ioapic_lock);
+}
- /*
- * The timer IRQ doesn't have to know that behind the
- * scene we may have a 8259A-master in AEOI mode ...
- */
- set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
+static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries)
+{
+ struct IO_APIC_route_entry entry;
+ char buf[256];
+ int i;
- /*
- * Add it to the IO-APIC irq-routing table:
- */
- ioapic_write_entry(apic_id, pin, entry);
+ apic_dbg("IOAPIC %d:\n", apic);
+ for (i = 0; i <= nr_entries; i++) {
+ entry = ioapic_read_entry(apic, i);
+ snprintf(buf, sizeof(buf), " pin%02x, %s, %s, %s, V(%02X), IRR(%1d), S(%1d)",
+ i, entry.masked ? "disabled" : "enabled ",
+ entry.is_level ? "level" : "edge ",
+ entry.active_low ? "low " : "high",
+ entry.vector, entry.irr, entry.delivery_status);
+ if (entry.ir_format) {
+ apic_dbg("%s, remapped, I(%04X), Z(%X)\n", buf,
+ (entry.ir_index_15 << 15) | entry.ir_index_0_14, entry.ir_zero);
+ } else {
+ apic_dbg("%s, %s, D(%02X%02X), M(%1d)\n", buf,
+ entry.dest_mode_logical ? "logical " : "physical",
+ entry.virt_destid_8_14, entry.destid_0_7, entry.delivery_mode);
+ }
+ }
}
-
-__apicdebuginit(void) print_IO_APIC(void)
+static void __init print_IO_APIC(int ioapic_idx)
{
- int apic, i;
union IO_APIC_reg_00 reg_00;
union IO_APIC_reg_01 reg_01;
union IO_APIC_reg_02 reg_02;
union IO_APIC_reg_03 reg_03;
- unsigned long flags;
- struct irq_cfg *cfg;
- struct irq_desc *desc;
- unsigned int irq;
-
- if (apic_verbosity == APIC_QUIET)
- return;
-
- printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
- for (i = 0; i < nr_ioapics; i++)
- printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
- mp_ioapics[i].apicid, nr_ioapic_registers[i]);
-
- /*
- * We are a bit conservative about what we expect. We have to
- * know about every hardware change ASAP.
- */
- printk(KERN_INFO "testing the IO APIC.......................\n");
-
- for (apic = 0; apic < nr_ioapics; apic++) {
-
- spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(apic, 0);
- reg_01.raw = io_apic_read(apic, 1);
- if (reg_01.bits.version >= 0x10)
- reg_02.raw = io_apic_read(apic, 2);
- if (reg_01.bits.version >= 0x20)
- reg_03.raw = io_apic_read(apic, 3);
- spin_unlock_irqrestore(&ioapic_lock, flags);
- printk("\n");
- printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid);
- printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
- printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
- printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
- printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
-
- printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
- printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
-
- printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
- printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
+ scoped_guard (raw_spinlock_irqsave, &ioapic_lock) {
+ reg_00.raw = io_apic_read(ioapic_idx, 0);
+ reg_01.raw = io_apic_read(ioapic_idx, 1);
+ if (reg_01.bits.version >= 0x10)
+ reg_02.raw = io_apic_read(ioapic_idx, 2);
+ if (reg_01.bits.version >= 0x20)
+ reg_03.raw = io_apic_read(ioapic_idx, 3);
+ }
+
+ apic_dbg("IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx));
+ apic_dbg(".... register #00: %08X\n", reg_00.raw);
+ apic_dbg("....... : physical APIC id: %02X\n", reg_00.bits.ID);
+ apic_dbg("....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
+ apic_dbg("....... : LTS : %X\n", reg_00.bits.LTS);
+ apic_dbg(".... register #01: %08X\n", *(int *)&reg_01);
+ apic_dbg("....... : max redirection entries: %02X\n", reg_01.bits.entries);
+ apic_dbg("....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
+ apic_dbg("....... : IO APIC version: %02X\n", reg_01.bits.version);
/*
* Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
@@ -1651,8 +1203,8 @@ __apicdebuginit(void) print_IO_APIC(void)
* value, so ignore it if reg_02 == reg_01.
*/
if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
- printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
- printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
+ apic_dbg(".... register #02: %08X\n", reg_02.raw);
+ apic_dbg("....... : arbitration: %02X\n", reg_02.bits.arbitration);
}
/*
@@ -1662,278 +1214,90 @@ __apicdebuginit(void) print_IO_APIC(void)
*/
if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
reg_03.raw != reg_01.raw) {
- printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
- printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
+ apic_dbg(".... register #03: %08X\n", reg_03.raw);
+ apic_dbg("....... : Boot DT : %X\n", reg_03.bits.boot_DT);
}
- printk(KERN_DEBUG ".... IRQ redirection table:\n");
-
- printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
- " Stat Dmod Deli Vect: \n");
-
- for (i = 0; i <= reg_01.bits.entries; i++) {
- struct IO_APIC_route_entry entry;
-
- entry = ioapic_read_entry(apic, i);
-
- printk(KERN_DEBUG " %02x %03X ",
- i,
- entry.dest
- );
-
- printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
- entry.mask,
- entry.trigger,
- entry.irr,
- entry.polarity,
- entry.delivery_status,
- entry.dest_mode,
- entry.delivery_mode,
- entry.vector
- );
- }
- }
- printk(KERN_DEBUG "IRQ to pin mappings:\n");
- for_each_irq_desc(irq, desc) {
- struct irq_pin_list *entry;
-
- cfg = desc->chip_data;
- entry = cfg->irq_2_pin;
- if (!entry)
- continue;
- printk(KERN_DEBUG "IRQ%d ", irq);
- for (;;) {
- printk("-> %d:%d", entry->apic, entry->pin);
- if (!entry->next)
- break;
- entry = entry->next;
- }
- printk("\n");
- }
-
- printk(KERN_INFO ".................................... done.\n");
-
- return;
-}
-
-__apicdebuginit(void) print_APIC_field(int base)
-{
- int i;
-
- if (apic_verbosity == APIC_QUIET)
- return;
-
- printk(KERN_DEBUG);
-
- for (i = 0; i < 8; i++)
- printk(KERN_CONT "%08x", apic_read(base + i*0x10));
-
- printk(KERN_CONT "\n");
+ apic_dbg(".... IRQ redirection table:\n");
+ io_apic_print_entries(ioapic_idx, reg_01.bits.entries);
}
-__apicdebuginit(void) print_local_APIC(void *dummy)
+void __init print_IO_APICs(void)
{
- unsigned int i, v, ver, maxlvt;
- u64 icr;
-
- if (apic_verbosity == APIC_QUIET)
- return;
+ int ioapic_idx;
+ unsigned int irq;
- printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
- smp_processor_id(), hard_smp_processor_id());
- v = apic_read(APIC_ID);
- printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, read_apic_id());
- v = apic_read(APIC_LVR);
- printk(KERN_INFO "... APIC VERSION: %08x\n", v);
- ver = GET_APIC_VERSION(v);
- maxlvt = lapic_get_maxlvt();
-
- v = apic_read(APIC_TASKPRI);
- printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
-
- if (APIC_INTEGRATED(ver)) { /* !82489DX */
- if (!APIC_XAPIC(ver)) {
- v = apic_read(APIC_ARBPRI);
- printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
- v & APIC_ARBPRI_MASK);
- }
- v = apic_read(APIC_PROCPRI);
- printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
+ apic_dbg("number of MP IRQ sources: %d.\n", mp_irq_entries);
+ for_each_ioapic(ioapic_idx) {
+ apic_dbg("number of IO-APIC #%d registers: %d.\n",
+ mpc_ioapic_id(ioapic_idx), ioapics[ioapic_idx].nr_registers);
}
/*
- * Remote read supported only in the 82489DX and local APIC for
- * Pentium processors.
+ * We are a bit conservative about what we expect. We have to
+ * know about every hardware change ASAP.
*/
- if (!APIC_INTEGRATED(ver) || maxlvt == 3) {
- v = apic_read(APIC_RRR);
- printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
- }
-
- v = apic_read(APIC_LDR);
- printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
- if (!x2apic_enabled()) {
- v = apic_read(APIC_DFR);
- printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
- }
- v = apic_read(APIC_SPIV);
- printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
-
- printk(KERN_DEBUG "... APIC ISR field:\n");
- print_APIC_field(APIC_ISR);
- printk(KERN_DEBUG "... APIC TMR field:\n");
- print_APIC_field(APIC_TMR);
- printk(KERN_DEBUG "... APIC IRR field:\n");
- print_APIC_field(APIC_IRR);
-
- if (APIC_INTEGRATED(ver)) { /* !82489DX */
- if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
- apic_write(APIC_ESR, 0);
-
- v = apic_read(APIC_ESR);
- printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
- }
+ printk(KERN_INFO "testing the IO APIC.......................\n");
- icr = apic_icr_read();
- printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr);
- printk(KERN_DEBUG "... APIC ICR2: %08x\n", (u32)(icr >> 32));
+ for_each_ioapic(ioapic_idx)
+ print_IO_APIC(ioapic_idx);
- v = apic_read(APIC_LVTT);
- printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
+ apic_dbg("IRQ to pin mappings:\n");
+ for_each_active_irq(irq) {
+ struct irq_pin_list *entry;
+ struct irq_chip *chip;
+ struct mp_chip_data *data;
- if (maxlvt > 3) { /* PC is LVT#4. */
- v = apic_read(APIC_LVTPC);
- printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
- }
- v = apic_read(APIC_LVT0);
- printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
- v = apic_read(APIC_LVT1);
- printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
-
- if (maxlvt > 2) { /* ERR is LVT#3. */
- v = apic_read(APIC_LVTERR);
- printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
- }
+ chip = irq_get_chip(irq);
+ if (chip != &ioapic_chip && chip != &ioapic_ir_chip)
+ continue;
+ data = irq_get_chip_data(irq);
+ if (!data)
+ continue;
+ if (list_empty(&data->irq_2_pin))
+ continue;
- v = apic_read(APIC_TMICT);
- printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
- v = apic_read(APIC_TMCCT);
- printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
- v = apic_read(APIC_TDCR);
- printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
-
- if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
- v = apic_read(APIC_EFEAT);
- maxlvt = (v >> 16) & 0xff;
- printk(KERN_DEBUG "... APIC EFEAT: %08x\n", v);
- v = apic_read(APIC_ECTRL);
- printk(KERN_DEBUG "... APIC ECTRL: %08x\n", v);
- for (i = 0; i < maxlvt; i++) {
- v = apic_read(APIC_EILVTn(i));
- printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v);
- }
+ apic_dbg("IRQ%d ", irq);
+ for_each_irq_pin(entry, data->irq_2_pin)
+ pr_cont("-> %d:%d", entry->apic, entry->pin);
+ pr_cont("\n");
}
- printk("\n");
-}
-
-__apicdebuginit(void) print_all_local_APICs(void)
-{
- int cpu;
-
- preempt_disable();
- for_each_online_cpu(cpu)
- smp_call_function_single(cpu, print_local_APIC, NULL, 1);
- preempt_enable();
-}
-
-__apicdebuginit(void) print_PIC(void)
-{
- unsigned int v;
- unsigned long flags;
-
- if (apic_verbosity == APIC_QUIET)
- return;
-
- printk(KERN_DEBUG "\nprinting PIC contents\n");
-
- spin_lock_irqsave(&i8259A_lock, flags);
-
- v = inb(0xa1) << 8 | inb(0x21);
- printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
-
- v = inb(0xa0) << 8 | inb(0x20);
- printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
-
- outb(0x0b,0xa0);
- outb(0x0b,0x20);
- v = inb(0xa0) << 8 | inb(0x20);
- outb(0x0a,0xa0);
- outb(0x0a,0x20);
-
- spin_unlock_irqrestore(&i8259A_lock, flags);
-
- printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
-
- v = inb(0x4d1) << 8 | inb(0x4d0);
- printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
-}
-
-__apicdebuginit(int) print_all_ICs(void)
-{
- print_PIC();
- /* don't print out if apic is not there */
- if (!cpu_has_apic || disable_apic)
- return 0;
-
- print_all_local_APICs();
- print_IO_APIC();
-
- return 0;
+ printk(KERN_INFO ".................................... done.\n");
}
-fs_initcall(print_all_ICs);
-
-
/* Where, if anywhere, is the i8259 connected in external int mode */
static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
void __init enable_IO_APIC(void)
{
- union IO_APIC_reg_01 reg_01;
- int i8259_apic, i8259_pin;
- int apic;
- unsigned long flags;
+ int i8259_apic, i8259_pin, apic, pin;
- /*
- * The number of IO-APIC IRQ registers (== #pins):
- */
- for (apic = 0; apic < nr_ioapics; apic++) {
- spin_lock_irqsave(&ioapic_lock, flags);
- reg_01.raw = io_apic_read(apic, 1);
- spin_unlock_irqrestore(&ioapic_lock, flags);
- nr_ioapic_registers[apic] = reg_01.bits.entries+1;
- }
- for(apic = 0; apic < nr_ioapics; apic++) {
- int pin;
+ if (ioapic_is_disabled)
+ nr_ioapics = 0;
+
+ if (!nr_legacy_irqs() || !nr_ioapics)
+ return;
+
+ for_each_ioapic_pin(apic, pin) {
/* See if any of the pins is in ExtINT mode */
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
- struct IO_APIC_route_entry entry;
- entry = ioapic_read_entry(apic, pin);
+ struct IO_APIC_route_entry entry = ioapic_read_entry(apic, pin);
- /* If the interrupt line is enabled and in ExtInt mode
- * I have found the pin where the i8259 is connected.
- */
- if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
- ioapic_i8259.apic = apic;
- ioapic_i8259.pin = pin;
- goto found_i8259;
- }
+ /*
+ * If the interrupt line is enabled and in ExtInt mode I
+ * have found the pin where the i8259 is connected.
+ */
+ if (!entry.masked && entry.delivery_mode == APIC_DELIVERY_MODE_EXTINT) {
+ ioapic_i8259.apic = apic;
+ ioapic_i8259.pin = pin;
+ break;
}
}
- found_i8259:
- /* Look to see what if the MP table has reported the ExtINT */
- /* If we could not find the appropriate pin by looking at the ioapic
+
+ /*
+ * Look to see if the MP table has reported the ExtINT
+ *
+ * If we could not find the appropriate pin by looking at the ioapic
* the i8259 probably is not connected to the ioapic but give the
* mptable a chance anyway.
*/
@@ -1941,69 +1305,52 @@ void __init enable_IO_APIC(void)
i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
/* Trust the MP table if nothing is setup in the hardware */
if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
- printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
+ pr_warn("ExtINT not setup in hardware but reported by MP table\n");
ioapic_i8259.pin = i8259_pin;
ioapic_i8259.apic = i8259_apic;
}
/* Complain if the MP table and the hardware disagree */
if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
- (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
- {
- printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
- }
+ (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
+ pr_warn("ExtINT in hardware and MP table differ\n");
- /*
- * Do not trust the IO-APIC being empty at bootup
- */
+ /* Do not trust the IO-APIC being empty at bootup */
clear_IO_APIC();
}
-/*
- * Not an __init, needed by the reboot code
- */
-void disable_IO_APIC(void)
+void native_restore_boot_irq_mode(void)
{
/*
- * Clear the IO-APIC before rebooting:
- */
- clear_IO_APIC();
-
- /*
- * If the i8259 is routed through an IOAPIC
- * Put that IOAPIC in virtual wire mode
- * so legacy interrupts can be delivered.
- *
- * With interrupt-remapping, for now we will use virtual wire A mode,
- * as virtual wire B is little complex (need to configure both
- * IOAPIC RTE aswell as interrupt-remapping table entry).
- * As this gets called during crash dump, keep this simple for now.
+ * If the i8259 is routed through an IOAPIC, put that IOAPIC in
+ * virtual wire mode so legacy interrupts can be delivered.
*/
- if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) {
+ if (ioapic_i8259.pin != -1) {
struct IO_APIC_route_entry entry;
+ u32 apic_id = read_apic_id();
memset(&entry, 0, sizeof(entry));
- entry.mask = 0; /* Enabled */
- entry.trigger = 0; /* Edge */
- entry.irr = 0;
- entry.polarity = 0; /* High */
- entry.delivery_status = 0;
- entry.dest_mode = 0; /* Physical */
- entry.delivery_mode = dest_ExtINT; /* ExtInt */
- entry.vector = 0;
- entry.dest = read_apic_id();
-
- /*
- * Add it to the IO-APIC irq-routing table:
- */
+ entry.masked = false;
+ entry.is_level = false;
+ entry.active_low = false;
+ entry.dest_mode_logical = false;
+ entry.delivery_mode = APIC_DELIVERY_MODE_EXTINT;
+ entry.destid_0_7 = apic_id & 0xFF;
+ entry.virt_destid_8_14 = apic_id >> 8;
+
+ /* Add it to the IO-APIC irq-routing table */
ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
}
- /*
- * Use virtual wire A mode when interrupt remapping is enabled.
- */
- if (cpu_has_apic)
- disconnect_bsp_APIC(!intr_remapping_enabled &&
- ioapic_i8259.pin != -1);
+ if (boot_cpu_has(X86_FEATURE_APIC) || apic_from_smp_config())
+ disconnect_bsp_APIC(ioapic_i8259.pin != -1);
+}
+
+void restore_boot_irq_mode(void)
+{
+ if (!nr_legacy_irqs())
+ return;
+
+ x86_apic_ops.restore();
}
#ifdef CONFIG_X86_32
@@ -2013,50 +1360,35 @@ void disable_IO_APIC(void)
*
* by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
*/
-
-static void __init setup_ioapic_ids_from_mpc(void)
+static void __init setup_ioapic_ids_from_mpc_nocheck(void)
{
+ DECLARE_BITMAP(phys_id_present_map, MAX_LOCAL_APIC);
+ const u32 broadcast_id = 0xF;
union IO_APIC_reg_00 reg_00;
- physid_mask_t phys_id_present_map;
- int apic_id;
- int i;
unsigned char old_id;
- unsigned long flags;
-
- if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids())
- return;
+ int ioapic_idx, i;
/*
- * Don't check I/O APIC IDs for xAPIC systems. They have
- * no meaning without the serial APIC bus.
- */
- if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
- || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
- return;
- /*
* This is broken; anything with a real cpu count has to
* circumvent this idiocy regardless.
*/
- phys_id_present_map = apic->ioapic_phys_id_map(phys_cpu_present_map);
+ copy_phys_cpu_present_map(phys_id_present_map);
/*
* Set the IOAPIC ID to the value stored in the MPC table.
*/
- for (apic_id = 0; apic_id < nr_ioapics; apic_id++) {
-
+ for_each_ioapic(ioapic_idx) {
/* Read the register 0 value */
- spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(apic_id, 0);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-
- old_id = mp_ioapics[apic_id].apicid;
-
- if (mp_ioapics[apic_id].apicid >= get_physical_broadcast()) {
- printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
- apic_id, mp_ioapics[apic_id].apicid);
- printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
- reg_00.bits.ID);
- mp_ioapics[apic_id].apicid = reg_00.bits.ID;
+ scoped_guard (raw_spinlock_irqsave, &ioapic_lock)
+ reg_00.raw = io_apic_read(ioapic_idx, 0);
+
+ old_id = mpc_ioapic_id(ioapic_idx);
+
+ if (mpc_ioapic_id(ioapic_idx) >= broadcast_id) {
+ pr_err(FW_BUG "IO-APIC#%d ID is %d in the MPC table!...\n",
+ ioapic_idx, mpc_ioapic_id(ioapic_idx));
+ pr_err("... fixing up to %d. (tell your hw vendor)\n", reg_00.bits.ID);
+ ioapics[ioapic_idx].mp_config.apicid = reg_00.bits.ID;
}
/*
@@ -2064,64 +1396,71 @@ static void __init setup_ioapic_ids_from_mpc(void)
* system must have a unique ID or we get lots of nice
* 'stuck on smp_invalidate_needed IPI wait' messages.
*/
- if (apic->check_apicid_used(phys_id_present_map,
- mp_ioapics[apic_id].apicid)) {
- printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
- apic_id, mp_ioapics[apic_id].apicid);
- for (i = 0; i < get_physical_broadcast(); i++)
- if (!physid_isset(i, phys_id_present_map))
+ if (test_bit(mpc_ioapic_id(ioapic_idx), phys_id_present_map)) {
+ pr_err(FW_BUG "IO-APIC#%d ID %d is already used!...\n",
+ ioapic_idx, mpc_ioapic_id(ioapic_idx));
+ for (i = 0; i < broadcast_id; i++)
+ if (!test_bit(i, phys_id_present_map))
break;
- if (i >= get_physical_broadcast())
+ if (i >= broadcast_id)
panic("Max APIC ID exceeded!\n");
- printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
- i);
- physid_set(i, phys_id_present_map);
- mp_ioapics[apic_id].apicid = i;
+ pr_err("... fixing up to %d. (tell your hw vendor)\n", i);
+ set_bit(i, phys_id_present_map);
+ ioapics[ioapic_idx].mp_config.apicid = i;
} else {
- physid_mask_t tmp;
- tmp = apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid);
- apic_printk(APIC_VERBOSE, "Setting %d in the "
- "phys_id_present_map\n",
- mp_ioapics[apic_id].apicid);
- physids_or(phys_id_present_map, phys_id_present_map, tmp);
+ apic_pr_verbose("Setting %d in the phys_id_present_map\n",
+ mpc_ioapic_id(ioapic_idx));
+ set_bit(mpc_ioapic_id(ioapic_idx), phys_id_present_map);
}
-
/*
- * We need to adjust the IRQ routing table
- * if the ID changed.
+ * We need to adjust the IRQ routing table if the ID
+ * changed.
*/
- if (old_id != mp_ioapics[apic_id].apicid)
- for (i = 0; i < mp_irq_entries; i++)
+ if (old_id != mpc_ioapic_id(ioapic_idx)) {
+ for (i = 0; i < mp_irq_entries; i++) {
if (mp_irqs[i].dstapic == old_id)
- mp_irqs[i].dstapic
- = mp_ioapics[apic_id].apicid;
+ mp_irqs[i].dstapic = mpc_ioapic_id(ioapic_idx);
+ }
+ }
/*
- * Read the right value from the MPC table and
- * write it into the ID register.
+ * Update the ID register according to the right value from
+ * the MPC table if they are different.
*/
- apic_printk(APIC_VERBOSE, KERN_INFO
- "...changing IO-APIC physical APIC ID to %d ...",
- mp_ioapics[apic_id].apicid);
+ if (mpc_ioapic_id(ioapic_idx) == reg_00.bits.ID)
+ continue;
- reg_00.bits.ID = mp_ioapics[apic_id].apicid;
- spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(apic_id, 0, reg_00.raw);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ apic_pr_verbose("...changing IO-APIC physical APIC ID to %d ...",
+ mpc_ioapic_id(ioapic_idx));
- /*
- * Sanity check
- */
- spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(apic_id, 0);
- spin_unlock_irqrestore(&ioapic_lock, flags);
- if (reg_00.bits.ID != mp_ioapics[apic_id].apicid)
- printk("could not set ID!\n");
+ reg_00.bits.ID = mpc_ioapic_id(ioapic_idx);
+ scoped_guard (raw_spinlock_irqsave, &ioapic_lock) {
+ io_apic_write(ioapic_idx, 0, reg_00.raw);
+ reg_00.raw = io_apic_read(ioapic_idx, 0);
+ }
+ /* Sanity check */
+ if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx))
+ pr_cont("could not set ID!\n");
else
- apic_printk(APIC_VERBOSE, " ok.\n");
+ apic_pr_verbose(" ok.\n");
}
}
+
+void __init setup_ioapic_ids_from_mpc(void)
+{
+ if (acpi_ioapic)
+ return;
+ /*
+ * Don't check I/O APIC IDs for xAPIC systems. They have
+ * no meaning without the serial APIC bus.
+ */
+ if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ || APIC_XAPIC(boot_cpu_apic_version))
+ return;
+ setup_ioapic_ids_from_mpc_nocheck();
+}
#endif
int no_timer_check __initdata;
@@ -2133,6 +1472,42 @@ static int __init notimercheck(char *s)
}
__setup("no_timer_check", notimercheck);
+static void __init delay_with_tsc(void)
+{
+ unsigned long long start, now;
+ unsigned long end = jiffies + 4;
+
+ start = rdtsc();
+
+ /*
+ * We don't know the TSC frequency yet, but waiting for
+ * 40000000000/HZ TSC cycles is safe:
+ * 4 GHz == 10 jiffies
+ * 1 GHz == 40 jiffies
+ */
+ do {
+ native_pause();
+ now = rdtsc();
+ } while ((now - start) < 40000000000ULL / HZ && time_before_eq(jiffies, end));
+}
+
+static void __init delay_without_tsc(void)
+{
+ unsigned long end = jiffies + 4;
+ int band = 1;
+
+ /*
+ * We don't know any frequency yet, but waiting for
+ * 40940000000/HZ cycles is safe:
+ * 4 GHz == 10 jiffies
+ * 1 GHz == 40 jiffies
+ * 1 << 1 + 1 << 2 +...+ 1 << 11 = 4094
+ */
+ do {
+ __delay(((1U << band++) * 10000000UL) / HZ);
+ } while (band < 12 && time_before_eq(jiffies, end));
+}
+
/*
* There is a nasty bug in some older SMP boards, their mptable lies
* about the timer IRQ. We do the following to work around the situation:
@@ -2144,16 +1519,15 @@ __setup("no_timer_check", notimercheck);
static int __init timer_irq_works(void)
{
unsigned long t1 = jiffies;
- unsigned long flags;
if (no_timer_check)
return 1;
- local_save_flags(flags);
local_irq_enable();
- /* Let ten ticks pass... */
- mdelay((10 * 1000) / HZ);
- local_irq_restore(flags);
+ if (boot_cpu_has(X86_FEATURE_TSC))
+ delay_with_tsc();
+ else
+ delay_without_tsc();
/*
* Expect a few ticks at least, to be sure some possible
@@ -2163,10 +1537,10 @@ static int __init timer_irq_works(void)
* least one tick may be lost due to delays.
*/
- /* jiffies wrap? */
- if (time_after(jiffies, t1 + 4))
- return 1;
- return 0;
+ local_irq_disable();
+
+ /* Did jiffies advance? */
+ return time_after(jiffies, t1 + 4);
}
/*
@@ -2175,398 +1549,69 @@ static int __init timer_irq_works(void)
* so we 'resend' these IRQs via IPIs, to the same CPU. It's much
* better to do it this way as thus we do not have to be aware of
* 'pending' interrupts in the IRQ path, except at this point.
- */
-/*
- * Edge triggered needs to resend any interrupt
- * that was delayed but this is now handled in the device
- * independent code.
- */
-
-/*
- * Starting up a edge-triggered IO-APIC interrupt is
- * nasty - we need to make sure that we get the edge.
- * If it is already asserted for some reason, we need
- * return 1 to indicate that is was pending.
*
- * This is not complete - we should be able to fake
- * an edge even if it isn't on the 8259A...
+ *
+ * Edge triggered interrupts need to resend any interrupt that was delayed,
+ * but this is now handled in the device independent code.
+ *
+ * Starting up an edge-triggered IO-APIC interrupt is nasty - we need to
+ * make sure that we get the edge. If it is already asserted for some
+ * reason, we need to return 1 to indicate that it was pending.
+ *
+ * This is not complete - we should be able to fake an edge even if it
+ * isn't on the 8259A...
*/
-
-static unsigned int startup_ioapic_irq(unsigned int irq)
+static unsigned int startup_ioapic_irq(struct irq_data *data)
{
- int was_pending = 0;
- unsigned long flags;
- struct irq_cfg *cfg;
+ int was_pending = 0, irq = data->irq;
- spin_lock_irqsave(&ioapic_lock, flags);
- if (irq < NR_IRQS_LEGACY) {
- disable_8259A_irq(irq);
- if (i8259A_irq_pending(irq))
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
+ if (irq < nr_legacy_irqs()) {
+ legacy_pic->mask(irq);
+ if (legacy_pic->irq_pending(irq))
was_pending = 1;
}
- cfg = irq_cfg(irq);
- __unmask_IO_APIC_irq(cfg);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-
+ __unmask_ioapic(data->chip_data);
return was_pending;
}
-#ifdef CONFIG_X86_64
-static int ioapic_retrigger_irq(unsigned int irq)
-{
-
- struct irq_cfg *cfg = irq_cfg(irq);
- unsigned long flags;
-
- spin_lock_irqsave(&vector_lock, flags);
- apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector);
- spin_unlock_irqrestore(&vector_lock, flags);
-
- return 1;
-}
-#else
-static int ioapic_retrigger_irq(unsigned int irq)
-{
- apic->send_IPI_self(irq_cfg(irq)->vector);
-
- return 1;
-}
-#endif
-
-/*
- * Level and edge triggered IO-APIC interrupts need different handling,
- * so we use two separate IRQ descriptors. Edge triggered IRQs can be
- * handled with the level-triggered descriptor, but that one has slightly
- * more overhead. Level-triggered interrupts cannot be handled with the
- * edge-triggered handler, without risking IRQ storms and other ugly
- * races.
- */
-
-#ifdef CONFIG_SMP
-static void send_cleanup_vector(struct irq_cfg *cfg)
-{
- cpumask_var_t cleanup_mask;
-
- if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
- unsigned int i;
- cfg->move_cleanup_count = 0;
- for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
- cfg->move_cleanup_count++;
- for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
- apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
- } else {
- cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
- cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
- apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
- free_cpumask_var(cleanup_mask);
- }
- cfg->move_in_progress = 0;
-}
+atomic_t irq_mis_count;
-static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+static bool io_apic_level_ack_pending(struct mp_chip_data *data)
{
- int apic, pin;
struct irq_pin_list *entry;
- u8 vector = cfg->vector;
-
- entry = cfg->irq_2_pin;
- for (;;) {
- unsigned int reg;
- if (!entry)
- break;
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
+ for_each_irq_pin(entry, data->irq_2_pin) {
+ struct IO_APIC_route_entry e;
+ int pin;
- apic = entry->apic;
pin = entry->pin;
- /*
- * With interrupt-remapping, destination information comes
- * from interrupt-remapping table entry.
- */
- if (!irq_remapped(irq))
- io_apic_write(apic, 0x11 + pin*2, dest);
- reg = io_apic_read(apic, 0x10 + pin*2);
- reg &= ~IO_APIC_REDIR_VECTOR_MASK;
- reg |= vector;
- io_apic_modify(apic, 0x10 + pin*2, reg);
- if (!entry->next)
- break;
- entry = entry->next;
+ e.w1 = io_apic_read(entry->apic, 0x10 + pin*2);
+ /* Is the remote IRR bit set? */
+ if (e.irr)
+ return true;
}
+ return false;
}
-static int
-assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
-
-/*
- * Either sets desc->affinity to a valid value, and returns
- * ->cpu_mask_to_apicid of that, or returns BAD_APICID and
- * leaves desc->affinity untouched.
- */
-static unsigned int
-set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
-{
- struct irq_cfg *cfg;
- unsigned int irq;
-
- if (!cpumask_intersects(mask, cpu_online_mask))
- return BAD_APICID;
-
- irq = desc->irq;
- cfg = desc->chip_data;
- if (assign_irq_vector(irq, cfg, mask))
- return BAD_APICID;
-
- cpumask_copy(desc->affinity, mask);
-
- return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
-}
-
-static int
-set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
+static inline bool ioapic_prepare_move(struct irq_data *data)
{
- struct irq_cfg *cfg;
- unsigned long flags;
- unsigned int dest;
- unsigned int irq;
- int ret = -1;
-
- irq = desc->irq;
- cfg = desc->chip_data;
-
- spin_lock_irqsave(&ioapic_lock, flags);
- dest = set_desc_affinity(desc, mask);
- if (dest != BAD_APICID) {
- /* Only the high 8 bits are valid. */
- dest = SET_APIC_LOGICAL_ID(dest);
- __target_IO_APIC_irq(irq, dest, cfg);
- ret = 0;
+ /* If we are moving the IRQ we need to mask it */
+ if (unlikely(irqd_is_setaffinity_pending(data))) {
+ if (!irqd_irq_masked(data))
+ mask_ioapic_irq(data);
+ return true;
}
- spin_unlock_irqrestore(&ioapic_lock, flags);
-
- return ret;
-}
-
-static int
-set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
-{
- struct irq_desc *desc;
-
- desc = irq_to_desc(irq);
-
- return set_ioapic_affinity_irq_desc(desc, mask);
-}
-
-#ifdef CONFIG_INTR_REMAP
-
-/*
- * Migrate the IO-APIC irq in the presence of intr-remapping.
- *
- * For both level and edge triggered, irq migration is a simple atomic
- * update(of vector and cpu destination) of IRTE and flush the hardware cache.
- *
- * For level triggered, we eliminate the io-apic RTE modification (with the
- * updated vector information), by using a virtual vector (io-apic pin number).
- * Real vector that is used for interrupting cpu will be coming from
- * the interrupt-remapping table entry.
- */
-static int
-migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
-{
- struct irq_cfg *cfg;
- struct irte irte;
- unsigned int dest;
- unsigned int irq;
- int ret = -1;
-
- if (!cpumask_intersects(mask, cpu_online_mask))
- return ret;
-
- irq = desc->irq;
- if (get_irte(irq, &irte))
- return ret;
-
- cfg = desc->chip_data;
- if (assign_irq_vector(irq, cfg, mask))
- return ret;
-
- dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
-
- irte.vector = cfg->vector;
- irte.dest_id = IRTE_DEST(dest);
-
- /*
- * Modified the IRTE and flushes the Interrupt entry cache.
- */
- modify_irte(irq, &irte);
-
- if (cfg->move_in_progress)
- send_cleanup_vector(cfg);
-
- cpumask_copy(desc->affinity, mask);
-
- return 0;
-}
-
-/*
- * Migrates the IRQ destination in the process context.
- */
-static int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
- const struct cpumask *mask)
-{
- return migrate_ioapic_irq_desc(desc, mask);
-}
-static int set_ir_ioapic_affinity_irq(unsigned int irq,
- const struct cpumask *mask)
-{
- struct irq_desc *desc = irq_to_desc(irq);
-
- return set_ir_ioapic_affinity_irq_desc(desc, mask);
-}
-#else
-static inline int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
- const struct cpumask *mask)
-{
- return 0;
+ return false;
}
-#endif
-asmlinkage void smp_irq_move_cleanup_interrupt(void)
+static inline void ioapic_finish_move(struct irq_data *data, bool moveit)
{
- unsigned vector, me;
-
- ack_APIC_irq();
- exit_idle();
- irq_enter();
-
- me = smp_processor_id();
- for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
- unsigned int irq;
- unsigned int irr;
- struct irq_desc *desc;
- struct irq_cfg *cfg;
- irq = __get_cpu_var(vector_irq)[vector];
-
- if (irq == -1)
- continue;
-
- desc = irq_to_desc(irq);
- if (!desc)
- continue;
-
- cfg = irq_cfg(irq);
- spin_lock(&desc->lock);
- if (!cfg->move_cleanup_count)
- goto unlock;
-
- if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
- goto unlock;
-
- irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
+ if (unlikely(moveit)) {
/*
- * Check if the vector that needs to be cleanedup is
- * registered at the cpu's IRR. If so, then this is not
- * the best time to clean it up. Lets clean it up in the
- * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR
- * to myself.
- */
- if (irr & (1 << (vector % 32))) {
- apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
- goto unlock;
- }
- __get_cpu_var(vector_irq)[vector] = -1;
- cfg->move_cleanup_count--;
-unlock:
- spin_unlock(&desc->lock);
- }
-
- irq_exit();
-}
-
-static void irq_complete_move(struct irq_desc **descp)
-{
- struct irq_desc *desc = *descp;
- struct irq_cfg *cfg = desc->chip_data;
- unsigned vector, me;
-
- if (likely(!cfg->move_in_progress))
- return;
-
- vector = ~get_irq_regs()->orig_ax;
- me = smp_processor_id();
-
- if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
- send_cleanup_vector(cfg);
-}
-#else
-static inline void irq_complete_move(struct irq_desc **descp) {}
-#endif
-
-static void ack_apic_edge(unsigned int irq)
-{
- struct irq_desc *desc = irq_to_desc(irq);
-
- irq_complete_move(&desc);
- move_native_irq(irq);
- ack_APIC_irq();
-}
-
-atomic_t irq_mis_count;
-
-static void ack_apic_level(unsigned int irq)
-{
- struct irq_desc *desc = irq_to_desc(irq);
-
-#ifdef CONFIG_X86_32
- unsigned long v;
- int i;
-#endif
- struct irq_cfg *cfg;
- int do_unmask_irq = 0;
-
- irq_complete_move(&desc);
-#ifdef CONFIG_GENERIC_PENDING_IRQ
- /* If we are moving the irq we need to mask it */
- if (unlikely(desc->status & IRQ_MOVE_PENDING)) {
- do_unmask_irq = 1;
- mask_IO_APIC_irq_desc(desc);
- }
-#endif
-
-#ifdef CONFIG_X86_32
- /*
- * It appears there is an erratum which affects at least version 0x11
- * of I/O APIC (that's the 82093AA and cores integrated into various
- * chipsets). Under certain conditions a level-triggered interrupt is
- * erroneously delivered as edge-triggered one but the respective IRR
- * bit gets set nevertheless. As a result the I/O unit expects an EOI
- * message but it will never arrive and further interrupts are blocked
- * from the source. The exact reason is so far unknown, but the
- * phenomenon was observed when two consecutive interrupt requests
- * from a given source get delivered to the same CPU and the source is
- * temporarily disabled in between.
- *
- * A workaround is to simulate an EOI message manually. We achieve it
- * by setting the trigger mode to edge and then to level when the edge
- * trigger mode gets detected in the TMR of a local APIC for a
- * level-triggered interrupt. We mask the source for the time of the
- * operation to prevent an edge-triggered interrupt escaping meanwhile.
- * The idea is from Manfred Spraul. --macro
- */
- cfg = desc->chip_data;
- i = cfg->vector;
-
- v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
-#endif
-
- /*
- * We must acknowledge the irq before we move it or the acknowledge will
- * not propagate properly.
- */
- ack_APIC_irq();
-
- /* Now we can move and renable the irq */
- if (unlikely(do_unmask_irq)) {
- /* Only migrate the irq if the ack has been received.
+ * Only migrate the irq if the ack has been received.
*
* On rare occasions the broadcast level triggered ack gets
* delayed going to ioapics, and if we reprogram the
@@ -2585,136 +1630,272 @@ static void ack_apic_level(unsigned int irq)
* with masking the ioapic entry and then polling until
* Remote IRR was clear before reprogramming the
* ioapic I don't trust the Remote IRR bit to be
- * completey accurate.
+ * completely accurate.
*
* However there appears to be no other way to plug
* this race, so if the Remote IRR bit is not
* accurate and is causing problems then it is a hardware bug
* and you can go talk to the chipset vendor about it.
*/
- cfg = desc->chip_data;
- if (!io_apic_level_ack_pending(cfg))
- move_masked_irq(irq);
- unmask_IO_APIC_irq_desc(desc);
+ if (!io_apic_level_ack_pending(data->chip_data))
+ irq_move_masked_irq(data);
+ /* If the IRQ is masked in the core, leave it: */
+ if (!irqd_irq_masked(data))
+ unmask_ioapic_irq(data);
}
+}
+#else
+static inline bool ioapic_prepare_move(struct irq_data *data)
+{
+ return false;
+}
+static inline void ioapic_finish_move(struct irq_data *data, bool moveit)
+{
+}
+#endif
-#ifdef CONFIG_X86_32
+static void ioapic_ack_level(struct irq_data *irq_data)
+{
+ struct irq_cfg *cfg = irqd_cfg(irq_data);
+ unsigned long v;
+ bool moveit;
+ int i;
+
+ irq_complete_move(cfg);
+ moveit = ioapic_prepare_move(irq_data);
+
+ /*
+ * It appears there is an erratum which affects at least version 0x11
+ * of I/O APIC (that's the 82093AA and cores integrated into various
+ * chipsets). Under certain conditions a level-triggered interrupt is
+ * erroneously delivered as edge-triggered one but the respective IRR
+ * bit gets set nevertheless. As a result the I/O unit expects an EOI
+ * message but it will never arrive and further interrupts are blocked
+ * from the source. The exact reason is so far unknown, but the
+ * phenomenon was observed when two consecutive interrupt requests
+ * from a given source get delivered to the same CPU and the source is
+ * temporarily disabled in between.
+ *
+ * A workaround is to simulate an EOI message manually. We achieve it
+ * by setting the trigger mode to edge and then to level when the edge
+ * trigger mode gets detected in the TMR of a local APIC for a
+ * level-triggered interrupt. We mask the source for the time of the
+ * operation to prevent an edge-triggered interrupt escaping meanwhile.
+ * The idea is from Manfred Spraul. --macro
+ *
+ * Also in the case when cpu goes offline, fixup_irqs() will forward
+ * any unhandled interrupt on the offlined cpu to the new cpu
+ * destination that is handling the corresponding interrupt. This
+ * interrupt forwarding is done via IPIs. Hence, in this case a
+ * level-triggered io-apic interrupt will be seen as an edge
+ * interrupt in the IRR, and we can't rely on the cpu's EOI
+ * being broadcast to the IO-APICs, which would clear the remote IRR
+ * corresponding to the level-triggered interrupt. Hence on IO-APICs
+ * supporting an EOI register we do an explicit EOI to clear the
+ * remote IRR, and on IO-APICs which don't have an EOI register
+ * we use the above logic (mask+edge followed by unmask+level) from
+ * Manfred Spraul to clear the remote IRR.
+ */
+ i = cfg->vector;
+ v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
+
+ /*
+ * We must acknowledge the irq before we move it or the acknowledge will
+ * not propagate properly.
+ */
+ apic_eoi();
+
+ /*
+ * Tail end of clearing remote IRR bit (either by delivering the EOI
+ * message via io-apic EOI register write or simulating it using
+ * mask+edge followed by unmask+level logic) manually when the
+ * level triggered interrupt is seen as the edge triggered interrupt
+ * at the cpu.
+ */
if (!(v & (1 << (i & 0x1f)))) {
atomic_inc(&irq_mis_count);
- spin_lock(&ioapic_lock);
- __mask_and_edge_IO_APIC_irq(cfg);
- __unmask_and_level_IO_APIC_irq(cfg);
- spin_unlock(&ioapic_lock);
+ eoi_ioapic_pin(cfg->vector, irq_data->chip_data);
}
-#endif
+
+ ioapic_finish_move(irq_data, moveit);
}
-#ifdef CONFIG_INTR_REMAP
-static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
+static void ioapic_ir_ack_level(struct irq_data *irq_data)
{
- int apic, pin;
- struct irq_pin_list *entry;
+ struct mp_chip_data *data = irq_data->chip_data;
- entry = cfg->irq_2_pin;
- for (;;) {
+ /*
+ * Intr-remapping uses pin number as the virtual vector
+ * in the RTE. Actual vector is programmed in
+ * intr-remapping table entry. Hence for the io-apic
+ * EOI we use the pin number.
+ */
+ apic_ack_irq(irq_data);
+ eoi_ioapic_pin(data->entry.vector, data);
+}
- if (!entry)
- break;
+/*
+ * The I/OAPIC is just a device for generating MSI messages from legacy
+ * interrupt pins. Various fields of the RTE translate into bits of the
+ * resulting MSI which had a historical meaning.
+ *
+ * With interrupt remapping, many of those bits have different meanings
+ * in the underlying MSI, but the way that the I/OAPIC transforms them
+ * from its RTE to the MSI message is the same. This function allows
+ * the parent IRQ domain to compose the MSI message, then takes the
+ * relevant bits to put them in the appropriate places in the RTE in
+ * order to generate that message when the IRQ happens.
+ *
+ * The setup here relies on a preconfigured route entry (is_level,
+ * active_low, masked) because the parent domain is merely composing the
+ * generic message routing information which is used for the MSI.
+ */
+static void ioapic_setup_msg_from_msi(struct irq_data *irq_data,
+ struct IO_APIC_route_entry *entry)
+{
+ struct msi_msg msg;
- apic = entry->apic;
- pin = entry->pin;
- io_apic_eoi(apic, pin);
- entry = entry->next;
- }
+ /* Let the parent domain compose the MSI message */
+ irq_chip_compose_msi_msg(irq_data, &msg);
+
+ /*
+ * - Real vector
+ * - DMAR/IR: 8bit subhandle (ioapic.pin)
+ * - AMD/IR: 8bit IRTE index
+ */
+ entry->vector = msg.arch_data.vector;
+ /* Delivery mode (for DMAR/IR all 0) */
+ entry->delivery_mode = msg.arch_data.delivery_mode;
+ /* Destination mode or DMAR/IR index bit 15 */
+ entry->dest_mode_logical = msg.arch_addr_lo.dest_mode_logical;
+ /* DMAR/IR: 1, 0 for all other modes */
+ entry->ir_format = msg.arch_addr_lo.dmar_format;
+ /*
+ * - DMAR/IR: index bit 0-14.
+ *
+ * - Virt: If the host supports x2apic without a virtualized IR
+ * unit then bit 0-6 of dmar_index_0_14 are providing bit
+ * 8-14 of the destination id.
+ *
+ * All other modes have bit 0-6 of dmar_index_0_14 cleared and the
+ * topmost 8 bits are destination id bit 0-7 (entry::destid_0_7).
+ */
+ entry->ir_index_0_14 = msg.arch_addr_lo.dmar_index_0_14;
}
-static void
-eoi_ioapic_irq(struct irq_desc *desc)
+static void ioapic_configure_entry(struct irq_data *irqd)
{
- struct irq_cfg *cfg;
- unsigned long flags;
- unsigned int irq;
+ struct mp_chip_data *mpd = irqd->chip_data;
+ struct irq_pin_list *entry;
- irq = desc->irq;
- cfg = desc->chip_data;
+ ioapic_setup_msg_from_msi(irqd, &mpd->entry);
- spin_lock_irqsave(&ioapic_lock, flags);
- __eoi_ioapic_irq(irq, cfg);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ for_each_irq_pin(entry, mpd->irq_2_pin)
+ __ioapic_write_entry(entry->apic, entry->pin, mpd->entry);
}
-static void ir_ack_apic_edge(unsigned int irq)
+static int ioapic_set_affinity(struct irq_data *irq_data, const struct cpumask *mask, bool force)
{
- ack_APIC_irq();
+ struct irq_data *parent = irq_data->parent_data;
+ int ret;
+
+ ret = parent->chip->irq_set_affinity(parent, mask, force);
+
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
+ if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE)
+ ioapic_configure_entry(irq_data);
+
+ return ret;
}
-static void ir_ack_apic_level(unsigned int irq)
+/*
+ * Interrupt shutdown masks the ioapic pin, but the interrupt might already
+ * be in flight and not yet serviced by the target CPU. That means
+ * __synchronize_hardirq() would return and claim that everything is calmed
+ * down. So free_irq() would proceed and deactivate the interrupt and free
+ * resources.
+ *
+ * Once the target CPU comes around to service it, it will find a cleared
+ * vector and complain. While the spurious interrupt is harmless, the full
+ * release of resources might prevent the interrupt from being acknowledged
+ * which keeps the hardware in a weird state.
+ *
+ * Verify that the corresponding Remote-IRR bits are clear.
+ */
+static int ioapic_irq_get_chip_state(struct irq_data *irqd, enum irqchip_irq_state which,
+ bool *state)
{
- struct irq_desc *desc = irq_to_desc(irq);
+ struct mp_chip_data *mcd = irqd->chip_data;
+ struct IO_APIC_route_entry rentry;
+ struct irq_pin_list *p;
+
+ if (which != IRQCHIP_STATE_ACTIVE)
+ return -EINVAL;
+
+ *state = false;
- ack_APIC_irq();
- eoi_ioapic_irq(desc);
+ guard(raw_spinlock)(&ioapic_lock);
+ for_each_irq_pin(p, mcd->irq_2_pin) {
+ rentry = __ioapic_read_entry(p->apic, p->pin);
+ /*
+ * The remote IRR is only valid in level trigger mode. Its
+ * meaning is undefined for edge triggered interrupts and
+ * irrelevant because the IO-APIC treats them as fire and
+ * forget.
+ */
+ if (rentry.irr && rentry.is_level) {
+ *state = true;
+ break;
+ }
+ }
+ return 0;
}
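For context, ioapic_irq_get_chip_state() is reached through the generic irq_get_irqchip_state() helper. A minimal caller-side sketch (assuming some interrupt number irq; this is not code from the patch) would be:

	bool in_flight = false;

	/* Query whether the level interrupt is still active (Remote-IRR set) at the IO-APIC. */
	if (!irq_get_irqchip_state(irq, IRQCHIP_STATE_ACTIVE, &in_flight) && in_flight)
		pr_debug("IRQ %u: Remote-IRR still set, interrupt not serviced yet\n", irq);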
-#endif /* CONFIG_INTR_REMAP */
static struct irq_chip ioapic_chip __read_mostly = {
- .name = "IO-APIC",
- .startup = startup_ioapic_irq,
- .mask = mask_IO_APIC_irq,
- .unmask = unmask_IO_APIC_irq,
- .ack = ack_apic_edge,
- .eoi = ack_apic_level,
-#ifdef CONFIG_SMP
- .set_affinity = set_ioapic_affinity_irq,
-#endif
- .retrigger = ioapic_retrigger_irq,
+ .name = "IO-APIC",
+ .irq_startup = startup_ioapic_irq,
+ .irq_mask = mask_ioapic_irq,
+ .irq_unmask = unmask_ioapic_irq,
+ .irq_ack = irq_chip_ack_parent,
+ .irq_eoi = ioapic_ack_level,
+ .irq_set_affinity = ioapic_set_affinity,
+ .irq_retrigger = irq_chip_retrigger_hierarchy,
+ .irq_get_irqchip_state = ioapic_irq_get_chip_state,
+ .flags = IRQCHIP_SKIP_SET_WAKE | IRQCHIP_MOVE_DEFERRED |
+ IRQCHIP_AFFINITY_PRE_STARTUP,
};
-static struct irq_chip ir_ioapic_chip __read_mostly = {
- .name = "IR-IO-APIC",
- .startup = startup_ioapic_irq,
- .mask = mask_IO_APIC_irq,
- .unmask = unmask_IO_APIC_irq,
-#ifdef CONFIG_INTR_REMAP
- .ack = ir_ack_apic_edge,
- .eoi = ir_ack_apic_level,
-#ifdef CONFIG_SMP
- .set_affinity = set_ir_ioapic_affinity_irq,
-#endif
-#endif
- .retrigger = ioapic_retrigger_irq,
+static struct irq_chip ioapic_ir_chip __read_mostly = {
+ .name = "IR-IO-APIC",
+ .irq_startup = startup_ioapic_irq,
+ .irq_mask = mask_ioapic_irq,
+ .irq_unmask = unmask_ioapic_irq,
+ .irq_ack = irq_chip_ack_parent,
+ .irq_eoi = ioapic_ir_ack_level,
+ .irq_set_affinity = ioapic_set_affinity,
+ .irq_retrigger = irq_chip_retrigger_hierarchy,
+ .irq_get_irqchip_state = ioapic_irq_get_chip_state,
+ .flags = IRQCHIP_SKIP_SET_WAKE |
+ IRQCHIP_AFFINITY_PRE_STARTUP,
};
static inline void init_IO_APIC_traps(void)
{
- int irq;
- struct irq_desc *desc;
struct irq_cfg *cfg;
+ unsigned int irq;
- /*
- * NOTE! The local APIC isn't very good at handling
- * multiple interrupts at the same interrupt level.
- * As the interrupt level is determined by taking the
- * vector number and shifting that right by 4, we
- * want to spread these out a bit so that they don't
- * all fall in the same interrupt level.
- *
- * Also, we've got to be careful not to trash gate
- * 0x80, because int 0x80 is hm, kind of importantish. ;)
- */
- for_each_irq_desc(irq, desc) {
- cfg = desc->chip_data;
+ for_each_active_irq(irq) {
+ cfg = irq_cfg(irq);
if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
/*
- * Hmm.. We don't have an entry for this,
- * so default to an old-fashioned 8259
- * interrupt if we can..
+ * Hmm.. We don't have an entry for this, so
+ * default to an old-fashioned 8259 interrupt if we
+ * can. Otherwise set the dummy interrupt chip.
*/
- if (irq < NR_IRQS_LEGACY)
- make_8259A_irq(irq);
+ if (irq < nr_legacy_irqs())
+ legacy_pic->make_irq(irq);
else
- /* Strange. Oh, well.. */
- desc->chip = &no_irq_chip;
+ irq_set_chip(irq, &no_irq_chip);
}
}
}
@@ -2722,58 +1903,36 @@ static inline void init_IO_APIC_traps(void)
/*
* The local APIC irq-chip implementation:
*/
-
-static void mask_lapic_irq(unsigned int irq)
+static void mask_lapic_irq(struct irq_data *data)
{
- unsigned long v;
+ unsigned long v = apic_read(APIC_LVT0);
- v = apic_read(APIC_LVT0);
apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
}
-static void unmask_lapic_irq(unsigned int irq)
+static void unmask_lapic_irq(struct irq_data *data)
{
- unsigned long v;
+ unsigned long v = apic_read(APIC_LVT0);
- v = apic_read(APIC_LVT0);
apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
}
-static void ack_lapic_irq(unsigned int irq)
+static void ack_lapic_irq(struct irq_data *data)
{
- ack_APIC_irq();
+ apic_eoi();
}
static struct irq_chip lapic_chip __read_mostly = {
.name = "local-APIC",
- .mask = mask_lapic_irq,
- .unmask = unmask_lapic_irq,
- .ack = ack_lapic_irq,
+ .irq_mask = mask_lapic_irq,
+ .irq_unmask = unmask_lapic_irq,
+ .irq_ack = ack_lapic_irq,
};
-static void lapic_register_intr(int irq, struct irq_desc *desc)
+static void lapic_register_intr(int irq)
{
- desc->status &= ~IRQ_LEVEL;
- set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
- "edge");
-}
-
-static void __init setup_nmi(void)
-{
- /*
- * Dirty trick to enable the NMI watchdog ...
- * We put the 8259A master into AEOI mode and
- * unmask on all local APICs LVT0 as NMI.
- *
- * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
- * is from Maciej W. Rozycki - so we do not have to EOI from
- * the NMI handler or the timer interrupt.
- */
- apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
-
- enable_NMI_through_LVT0();
-
- apic_printk(APIC_VERBOSE, " done.\n");
+ irq_clear_status_flags(irq, IRQ_LEVEL);
+ irq_set_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, "edge");
}
/*
@@ -2785,9 +1944,10 @@ static void __init setup_nmi(void)
*/
static inline void __init unlock_ExtINT_logic(void)
{
- int apic, pin, i;
- struct IO_APIC_route_entry entry0, entry1;
unsigned char save_control, save_freq_select;
+ struct IO_APIC_route_entry entry0, entry1;
+ int apic, pin, i;
+ u32 apic_id;
pin = find_isa_irq_pin(8, mp_INT);
if (pin == -1) {
@@ -2803,14 +1963,16 @@ static inline void __init unlock_ExtINT_logic(void)
entry0 = ioapic_read_entry(apic, pin);
clear_IO_APIC_pin(apic, pin);
+ apic_id = read_apic_id();
memset(&entry1, 0, sizeof(entry1));
- entry1.dest_mode = 0; /* physical delivery */
- entry1.mask = 0; /* unmask IRQ now */
- entry1.dest = hard_smp_processor_id();
- entry1.delivery_mode = dest_ExtINT;
- entry1.polarity = entry0.polarity;
- entry1.trigger = 0;
+ entry1.dest_mode_logical = true;
+ entry1.masked = false;
+ entry1.destid_0_7 = apic_id & 0xFF;
+ entry1.virt_destid_8_14 = apic_id >> 8;
+ entry1.delivery_mode = APIC_DELIVERY_MODE_EXTINT;
+ entry1.active_low = entry0.active_low;
+ entry1.is_level = false;
entry1.vector = 0;
ioapic_write_entry(apic, pin, entry1);
@@ -2844,32 +2006,66 @@ static int __init disable_timer_pin_setup(char *arg)
}
early_param("disable_timer_pin_1", disable_timer_pin_setup);
-int timer_through_8259 __initdata;
+static int __init mp_alloc_timer_irq(int ioapic, int pin)
+{
+ struct irq_domain *domain = mp_ioapic_irqdomain(ioapic);
+ int irq = -1;
+
+ if (domain) {
+ struct irq_alloc_info info;
+
+ ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 0, 0);
+ info.devid = mpc_ioapic_id(ioapic);
+ info.ioapic.pin = pin;
+ guard(mutex)(&ioapic_mutex);
+ irq = alloc_isa_irq_from_domain(domain, 0, ioapic, pin, &info);
+ }
+
+ return irq;
+}
+
+static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node,
+ int oldapic, int oldpin,
+ int newapic, int newpin)
+{
+ struct irq_pin_list *entry;
+
+ for_each_irq_pin(entry, data->irq_2_pin) {
+ if (entry->apic == oldapic && entry->pin == oldpin) {
+ entry->apic = newapic;
+ entry->pin = newpin;
+ return;
+ }
+ }
+
+ /* Old apic/pin didn't exist, so just add a new one */
+ add_pin_to_irq_node(data, node, newapic, newpin);
+}
/*
* This code may look a bit paranoid, but it's supposed to cooperate with
* a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
* is so screwy. Thanks to Brian Perkins for testing/hacking this beast
* fanatically on his truly buggy board.
- *
- * FIXME: really need to revamp this for all platforms.
*/
static inline void __init check_timer(void)
{
- struct irq_desc *desc = irq_to_desc(0);
- struct irq_cfg *cfg = desc->chip_data;
- int node = cpu_to_node(boot_cpu_id);
+ struct irq_data *irq_data = irq_get_irq_data(0);
+ struct mp_chip_data *data = irq_data->chip_data;
+ struct irq_cfg *cfg = irqd_cfg(irq_data);
+ int node = cpu_to_node(0);
int apic1, pin1, apic2, pin2;
- unsigned long flags;
int no_pin1 = 0;
- local_irq_save(flags);
+ if (!global_clock_event)
+ return;
+
+ local_irq_disable();
/*
* get/set the timer IRQ vector:
*/
- disable_8259A_irq(0);
- assign_irq_vector(0, cfg, apic->target_cpus());
+ legacy_pic->mask(0);
/*
* As IRQ0 is to be enabled in the 8259A, the virtual
@@ -2881,25 +2077,15 @@ static inline void __init check_timer(void)
* automatically.
*/
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
- init_8259A(1);
-#ifdef CONFIG_X86_32
- {
- unsigned int ver;
-
- ver = apic_read(APIC_LVR);
- ver = GET_APIC_VERSION(ver);
- timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
- }
-#endif
+ legacy_pic->init(1);
pin1 = find_isa_irq_pin(0, mp_INT);
apic1 = find_isa_irq_apic(0, mp_INT);
pin2 = ioapic_i8259.pin;
apic2 = ioapic_i8259.apic;
- apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
- "apic1=%d pin1=%d apic2=%d pin2=%d\n",
- cfg->vector, apic1, pin1, apic2, pin2);
+ pr_info("..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
+ cfg->vector, apic1, pin1, apic2, pin2);
/*
* Some BIOS writers are clueless and report the ExtINTA
@@ -2909,8 +2095,7 @@ static inline void __init check_timer(void)
* 8259A.
*/
if (pin1 == -1) {
- if (intr_remapping_enabled)
- panic("BIOS bug: timer not connected to IO-APIC");
+ panic_if_irq_remap(FW_BUG "Timer not connected to IO-APIC");
pin1 = pin2;
apic1 = apic2;
no_pin1 = 1;
@@ -2920,113 +2105,90 @@ static inline void __init check_timer(void)
}
if (pin1 != -1) {
- /*
- * Ok, does IRQ0 through the IOAPIC work?
- */
+ /* Ok, does IRQ0 through the IOAPIC work? */
if (no_pin1) {
- add_pin_to_irq_node(cfg, node, apic1, pin1);
- setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
+ mp_alloc_timer_irq(apic1, pin1);
} else {
- /* for edge trigger, setup_IO_APIC_irq already
- * leave it unmasked.
+ /*
+ * for edge trigger, it's already unmasked,
* so only need to unmask if it is level-trigger
* do we really have level trigger timer?
*/
- int idx;
- idx = find_irq_entry(apic1, pin1, mp_INT);
- if (idx != -1 && irq_trigger(idx))
- unmask_IO_APIC_irq_desc(desc);
+ int idx = find_irq_entry(apic1, pin1, mp_INT);
+
+ if (idx != -1 && irq_is_level(idx))
+ unmask_ioapic_irq(irq_get_irq_data(0));
}
+ irq_domain_deactivate_irq(irq_data);
+ irq_domain_activate_irq(irq_data, false);
if (timer_irq_works()) {
- if (nmi_watchdog == NMI_IO_APIC) {
- setup_nmi();
- enable_8259A_irq(0);
- }
if (disable_timer_pin_1 > 0)
clear_IO_APIC_pin(0, pin1);
goto out;
}
- if (intr_remapping_enabled)
- panic("timer doesn't work through Interrupt-remapped IO-APIC");
- local_irq_disable();
+ panic_if_irq_remap("timer doesn't work through Interrupt-remapped IO-APIC");
clear_IO_APIC_pin(apic1, pin1);
if (!no_pin1)
- apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
- "8254 timer not connected to IO-APIC\n");
+ pr_err("..MP-BIOS bug: 8254 timer not connected to IO-APIC\n");
- apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
- "(IRQ0) through the 8259A ...\n");
- apic_printk(APIC_QUIET, KERN_INFO
- "..... (found apic %d pin %d) ...\n", apic2, pin2);
+ pr_info("...trying to set up timer (IRQ0) through the 8259A ...\n");
+ pr_info("..... (found apic %d pin %d) ...\n", apic2, pin2);
/*
* legacy devices should be connected to IO APIC #0
*/
- replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2);
- setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
- enable_8259A_irq(0);
+ replace_pin_at_irq_node(data, node, apic1, pin1, apic2, pin2);
+ irq_domain_deactivate_irq(irq_data);
+ irq_domain_activate_irq(irq_data, false);
+ legacy_pic->unmask(0);
if (timer_irq_works()) {
- apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
- timer_through_8259 = 1;
- if (nmi_watchdog == NMI_IO_APIC) {
- disable_8259A_irq(0);
- setup_nmi();
- enable_8259A_irq(0);
- }
+ pr_info("....... works.\n");
goto out;
}
/*
* Cleanup, just in case ...
*/
- local_irq_disable();
- disable_8259A_irq(0);
+ legacy_pic->mask(0);
clear_IO_APIC_pin(apic2, pin2);
- apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
+ pr_info("....... failed.\n");
}
- if (nmi_watchdog == NMI_IO_APIC) {
- apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
- "through the IO-APIC - disabling NMI Watchdog!\n");
- nmi_watchdog = NMI_NONE;
- }
-#ifdef CONFIG_X86_32
- timer_ack = 0;
-#endif
+ pr_info("...trying to set up timer as Virtual Wire IRQ...\n");
- apic_printk(APIC_QUIET, KERN_INFO
- "...trying to set up timer as Virtual Wire IRQ...\n");
-
- lapic_register_intr(0, desc);
+ lapic_register_intr(0);
apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
- enable_8259A_irq(0);
+ legacy_pic->unmask(0);
if (timer_irq_works()) {
- apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
+ pr_info("..... works.\n");
goto out;
}
- local_irq_disable();
- disable_8259A_irq(0);
+ legacy_pic->mask(0);
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
- apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
+ pr_info("..... failed.\n");
- apic_printk(APIC_QUIET, KERN_INFO
- "...trying to set up timer as ExtINT IRQ...\n");
+ pr_info("...trying to set up timer as ExtINT IRQ...\n");
- init_8259A(0);
- make_8259A_irq(0);
+ legacy_pic->init(0);
+ legacy_pic->make_irq(0);
apic_write(APIC_LVT0, APIC_DM_EXTINT);
+ legacy_pic->unmask(0);
unlock_ExtINT_logic();
if (timer_irq_works()) {
- apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
+ pr_info("..... works.\n");
goto out;
}
- local_irq_disable();
- apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
+
+ pr_info("..... failed :\n");
+ if (apic_is_x2apic_enabled()) {
+ pr_info("Perhaps problem with the pre-enabled x2apic mode\n"
+ "Try booting with x2apic and interrupt-remapping disabled in the bios.\n");
+ }
panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
"report. Then try booting with the 'noapic' option.\n");
out:
- local_irq_restore(flags);
+ local_irq_enable();
}
/*
@@ -3046,1158 +2208,752 @@ out:
* the I/O APIC in all cases now. No actual device should request
* it anyway. --macro
*/
-#define PIC_IRQS (1 << PIC_CASCADE_IR)
+#define PIC_IRQS (1UL << PIC_CASCADE_IR)
-void __init setup_IO_APIC(void)
+static int mp_irqdomain_create(int ioapic)
{
+ struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic);
+ int hwirqs = mp_ioapic_pin_count(ioapic);
+ struct ioapic *ip = &ioapics[ioapic];
+ struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg;
+ struct irq_domain *parent;
+ struct fwnode_handle *fn;
+ struct irq_fwspec fwspec;
- /*
- * calling enable_IO_APIC() is moved to setup_local_APIC for BP
- */
-
- io_apic_irqs = ~PIC_IRQS;
-
- apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
- /*
- * Set up IO-APIC IRQ routing.
- */
-#ifdef CONFIG_X86_32
- if (!acpi_ioapic)
- setup_ioapic_ids_from_mpc();
-#endif
- sync_Arb_IDs();
- setup_IO_APIC_irqs();
- init_IO_APIC_traps();
- check_timer();
-}
-
-/*
- * Called after all the initialization is done. If we didnt find any
- * APIC bugs then we can allow the modify fast path
- */
+ if (cfg->type == IOAPIC_DOMAIN_INVALID)
+ return 0;
-static int __init io_apic_bug_finalize(void)
-{
- if (sis_apic_bug == -1)
- sis_apic_bug = 0;
- return 0;
-}
+ /* Handle device tree enumerated APICs proper */
+ if (cfg->dev) {
+ fn = of_fwnode_handle(cfg->dev);
+ } else {
+ fn = irq_domain_alloc_named_id_fwnode("IO-APIC", mpc_ioapic_id(ioapic));
+ if (!fn)
+ return -ENOMEM;
+ }
-late_initcall(io_apic_bug_finalize);
+ fwspec.fwnode = fn;
+ fwspec.param_count = 1;
+ fwspec.param[0] = mpc_ioapic_id(ioapic);
-struct sysfs_ioapic_data {
- struct sys_device dev;
- struct IO_APIC_route_entry entry[0];
-};
-static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
+ parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_GENERIC_MSI);
+ if (!parent) {
+ if (!cfg->dev)
+ irq_domain_free_fwnode(fn);
+ return -ENODEV;
+ }
-static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
-{
- struct IO_APIC_route_entry *entry;
- struct sysfs_ioapic_data *data;
- int i;
+ ip->irqdomain = irq_domain_create_hierarchy(parent, 0, hwirqs, fn, cfg->ops,
+ (void *)(long)ioapic);
+ if (!ip->irqdomain) {
+ /* Release fw handle if it was allocated above */
+ if (!cfg->dev)
+ irq_domain_free_fwnode(fn);
+ return -ENOMEM;
+ }
- data = container_of(dev, struct sysfs_ioapic_data, dev);
- entry = data->entry;
- for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ )
- *entry = ioapic_read_entry(dev->id, i);
+ if (cfg->type == IOAPIC_DOMAIN_LEGACY || cfg->type == IOAPIC_DOMAIN_STRICT)
+ ioapic_dynirq_base = max(ioapic_dynirq_base, gsi_cfg->gsi_end + 1);
return 0;
}
-static int ioapic_resume(struct sys_device *dev)
+static void ioapic_destroy_irqdomain(int idx)
{
- struct IO_APIC_route_entry *entry;
- struct sysfs_ioapic_data *data;
- unsigned long flags;
- union IO_APIC_reg_00 reg_00;
- int i;
-
- data = container_of(dev, struct sysfs_ioapic_data, dev);
- entry = data->entry;
+ struct ioapic_domain_cfg *cfg = &ioapics[idx].irqdomain_cfg;
+ struct fwnode_handle *fn = ioapics[idx].irqdomain->fwnode;
- spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(dev->id, 0);
- if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) {
- reg_00.bits.ID = mp_ioapics[dev->id].apicid;
- io_apic_write(dev->id, 0, reg_00.raw);
+ if (ioapics[idx].irqdomain) {
+ irq_domain_remove(ioapics[idx].irqdomain);
+ if (!cfg->dev)
+ irq_domain_free_fwnode(fn);
+ ioapics[idx].irqdomain = NULL;
}
- spin_unlock_irqrestore(&ioapic_lock, flags);
- for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
- ioapic_write_entry(dev->id, i, entry[i]);
-
- return 0;
}
-static struct sysdev_class ioapic_sysdev_class = {
- .name = "ioapic",
- .suspend = ioapic_suspend,
- .resume = ioapic_resume,
-};
-
-static int __init ioapic_init_sysfs(void)
+void __init setup_IO_APIC(void)
{
- struct sys_device * dev;
- int i, size, error;
-
- error = sysdev_class_register(&ioapic_sysdev_class);
- if (error)
- return error;
-
- for (i = 0; i < nr_ioapics; i++ ) {
- size = sizeof(struct sys_device) + nr_ioapic_registers[i]
- * sizeof(struct IO_APIC_route_entry);
- mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL);
- if (!mp_ioapic_data[i]) {
- printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
- continue;
- }
- dev = &mp_ioapic_data[i]->dev;
- dev->id = i;
- dev->cls = &ioapic_sysdev_class;
- error = sysdev_register(dev);
- if (error) {
- kfree(mp_ioapic_data[i]);
- mp_ioapic_data[i] = NULL;
- printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
- continue;
- }
- }
+ int ioapic;
- return 0;
-}
+ if (ioapic_is_disabled || !nr_ioapics)
+ return;
-device_initcall(ioapic_init_sysfs);
+ io_apic_irqs = nr_legacy_irqs() ? ~PIC_IRQS : ~0UL;
-static int nr_irqs_gsi = NR_IRQS_LEGACY;
-/*
- * Dynamic irq allocate and deallocation
- */
-unsigned int create_irq_nr(unsigned int irq_want, int node)
-{
- /* Allocate an unused irq */
- unsigned int irq;
- unsigned int new;
- unsigned long flags;
- struct irq_cfg *cfg_new = NULL;
- struct irq_desc *desc_new = NULL;
-
- irq = 0;
- if (irq_want < nr_irqs_gsi)
- irq_want = nr_irqs_gsi;
-
- spin_lock_irqsave(&vector_lock, flags);
- for (new = irq_want; new < nr_irqs; new++) {
- desc_new = irq_to_desc_alloc_node(new, node);
- if (!desc_new) {
- printk(KERN_INFO "can not get irq_desc for %d\n", new);
- continue;
- }
- cfg_new = desc_new->chip_data;
+ apic_pr_verbose("ENABLING IO-APIC IRQs\n");
+ for_each_ioapic(ioapic)
+ BUG_ON(mp_irqdomain_create(ioapic));
- if (cfg_new->vector != 0)
- continue;
+ /* Set up IO-APIC IRQ routing. */
+ x86_init.mpparse.setup_ioapic_ids();
- desc_new = move_irq_desc(desc_new, node);
-
- if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0)
- irq = new;
- break;
- }
- spin_unlock_irqrestore(&vector_lock, flags);
+ sync_Arb_IDs();
+ setup_IO_APIC_irqs();
+ init_IO_APIC_traps();
+ if (nr_legacy_irqs())
+ check_timer();
- if (irq > 0) {
- dynamic_irq_init(irq);
- /* restore it, in case dynamic_irq_init clear it */
- if (desc_new)
- desc_new->chip_data = cfg_new;
- }
- return irq;
+ ioapic_initialized = 1;
}
-int create_irq(void)
+static void resume_ioapic_id(int ioapic_idx)
{
- int node = cpu_to_node(boot_cpu_id);
- unsigned int irq_want;
- int irq;
-
- irq_want = nr_irqs_gsi;
- irq = create_irq_nr(irq_want, node);
-
- if (irq == 0)
- irq = -1;
+ union IO_APIC_reg_00 reg_00;
- return irq;
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
+ reg_00.raw = io_apic_read(ioapic_idx, 0);
+ if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx)) {
+ reg_00.bits.ID = mpc_ioapic_id(ioapic_idx);
+ io_apic_write(ioapic_idx, 0, reg_00.raw);
+ }
}
-void destroy_irq(unsigned int irq)
+static int ioapic_suspend(void *data)
{
- unsigned long flags;
- struct irq_cfg *cfg;
- struct irq_desc *desc;
-
- /* store it, in case dynamic_irq_cleanup clear it */
- desc = irq_to_desc(irq);
- cfg = desc->chip_data;
- dynamic_irq_cleanup(irq);
- /* connect back irq_cfg */
- if (desc)
- desc->chip_data = cfg;
-
- free_irte(irq);
- spin_lock_irqsave(&vector_lock, flags);
- __clear_irq_vector(irq, cfg);
- spin_unlock_irqrestore(&vector_lock, flags);
+ return save_ioapic_entries();
}
-/*
- * MSI message composition
- */
-#ifdef CONFIG_PCI_MSI
-static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
+static void ioapic_resume(void *data)
{
- struct irq_cfg *cfg;
- int err;
- unsigned dest;
-
- if (disable_apic)
- return -ENXIO;
-
- cfg = irq_cfg(irq);
- err = assign_irq_vector(irq, cfg, apic->target_cpus());
- if (err)
- return err;
-
- dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
-
- if (irq_remapped(irq)) {
- struct irte irte;
- int ir_index;
- u16 sub_handle;
+ int ioapic_idx;
- ir_index = map_irq_to_irte_handle(irq, &sub_handle);
- BUG_ON(ir_index == -1);
+ for_each_ioapic_reverse(ioapic_idx)
+ resume_ioapic_id(ioapic_idx);
- memset (&irte, 0, sizeof(irte));
-
- irte.present = 1;
- irte.dst_mode = apic->irq_dest_mode;
- irte.trigger_mode = 0; /* edge */
- irte.dlvry_mode = apic->irq_delivery_mode;
- irte.vector = cfg->vector;
- irte.dest_id = IRTE_DEST(dest);
-
- /* Set source-id of interrupt request */
- set_msi_sid(&irte, pdev);
-
- modify_irte(irq, &irte);
-
- msg->address_hi = MSI_ADDR_BASE_HI;
- msg->data = sub_handle;
- msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
- MSI_ADDR_IR_SHV |
- MSI_ADDR_IR_INDEX1(ir_index) |
- MSI_ADDR_IR_INDEX2(ir_index);
- } else {
- if (x2apic_enabled())
- msg->address_hi = MSI_ADDR_BASE_HI |
- MSI_ADDR_EXT_DEST_ID(dest);
- else
- msg->address_hi = MSI_ADDR_BASE_HI;
-
- msg->address_lo =
- MSI_ADDR_BASE_LO |
- ((apic->irq_dest_mode == 0) ?
- MSI_ADDR_DEST_MODE_PHYSICAL:
- MSI_ADDR_DEST_MODE_LOGICAL) |
- ((apic->irq_delivery_mode != dest_LowestPrio) ?
- MSI_ADDR_REDIRECTION_CPU:
- MSI_ADDR_REDIRECTION_LOWPRI) |
- MSI_ADDR_DEST_ID(dest);
-
- msg->data =
- MSI_DATA_TRIGGER_EDGE |
- MSI_DATA_LEVEL_ASSERT |
- ((apic->irq_delivery_mode != dest_LowestPrio) ?
- MSI_DATA_DELIVERY_FIXED:
- MSI_DATA_DELIVERY_LOWPRI) |
- MSI_DATA_VECTOR(cfg->vector);
- }
- return err;
+ restore_ioapic_entries();
}
-#ifdef CONFIG_SMP
-static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
-{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irq_cfg *cfg;
- struct msi_msg msg;
- unsigned int dest;
-
- dest = set_desc_affinity(desc, mask);
- if (dest == BAD_APICID)
- return -1;
-
- cfg = desc->chip_data;
-
- read_msi_msg_desc(desc, &msg);
+static const struct syscore_ops ioapic_syscore_ops = {
+ .suspend = ioapic_suspend,
+ .resume = ioapic_resume,
+};
- msg.data &= ~MSI_DATA_VECTOR_MASK;
- msg.data |= MSI_DATA_VECTOR(cfg->vector);
- msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
- msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+static struct syscore ioapic_syscore = {
+ .ops = &ioapic_syscore_ops,
+};
- write_msi_msg_desc(desc, &msg);
+static int __init ioapic_init_ops(void)
+{
+ register_syscore(&ioapic_syscore);
return 0;
}
-#ifdef CONFIG_INTR_REMAP
-/*
- * Migrate the MSI irq to another cpumask. This migration is
- * done in the process context using interrupt-remapping hardware.
- */
-static int
-ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
-{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irq_cfg *cfg = desc->chip_data;
- unsigned int dest;
- struct irte irte;
- if (get_irte(irq, &irte))
- return -1;
+device_initcall(ioapic_init_ops);
- dest = set_desc_affinity(desc, mask);
- if (dest == BAD_APICID)
- return -1;
+static int io_apic_get_redir_entries(int ioapic)
+{
+ union IO_APIC_reg_01 reg_01;
- irte.vector = cfg->vector;
- irte.dest_id = IRTE_DEST(dest);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
+ reg_01.raw = io_apic_read(ioapic, 1);
/*
- * atomically update the IRTE with the new destination and vector.
+ * The register returns the maximum redir index supported,
+ * which is one less than the total number of redir entries.
*/
- modify_irte(irq, &irte);
+ return reg_01.bits.entries + 1;
+}
+
+unsigned int arch_dynirq_lower_bound(unsigned int from)
+{
+ unsigned int ret;
/*
- * After this point, all the interrupts will start arriving
- * at the new destination. So, time to cleanup the previous
- * vector allocation.
+ * dmar_alloc_hwirq() may be called before setup_IO_APIC(), so use
+ * gsi_top if ioapic_dynirq_base hasn't been initialized yet.
*/
- if (cfg->move_in_progress)
- send_cleanup_vector(cfg);
+ ret = ioapic_dynirq_base ? : gsi_top;
- return 0;
+ /*
+ * For DT enabled machines ioapic_dynirq_base is irrelevant and
+ * always 0. gsi_top can be 0 if there is no IO/APIC registered.
+ * 0 is an invalid interrupt number for dynamic allocations. Return
+ * @from instead.
+ */
+ return ret ? : from;
}
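In case the form is unfamiliar: "ret ? : from" above is the GNU C conditional with the middle operand omitted, equivalent to "ret ? ret : from", so @from is only returned when both ioapic_dynirq_base and gsi_top evaluate to zero.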
-#endif
-#endif /* CONFIG_SMP */
-
-/*
- * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
- * which implement the MSI or MSI-X Capability Structure.
- */
-static struct irq_chip msi_chip = {
- .name = "PCI-MSI",
- .unmask = unmask_msi_irq,
- .mask = mask_msi_irq,
- .ack = ack_apic_edge,
-#ifdef CONFIG_SMP
- .set_affinity = set_msi_irq_affinity,
-#endif
- .retrigger = ioapic_retrigger_irq,
-};
+#ifdef CONFIG_X86_32
+static int io_apic_get_unique_id(int ioapic, int apic_id)
+{
+ static DECLARE_BITMAP(apic_id_map, MAX_LOCAL_APIC);
+ const u32 broadcast_id = 0xF;
+ union IO_APIC_reg_00 reg_00;
+ int i = 0;
-static struct irq_chip msi_ir_chip = {
- .name = "IR-PCI-MSI",
- .unmask = unmask_msi_irq,
- .mask = mask_msi_irq,
-#ifdef CONFIG_INTR_REMAP
- .ack = ir_ack_apic_edge,
-#ifdef CONFIG_SMP
- .set_affinity = ir_set_msi_irq_affinity,
-#endif
-#endif
- .retrigger = ioapic_retrigger_irq,
-};
+ /* Initialize the ID map */
+ if (bitmap_empty(apic_id_map, MAX_LOCAL_APIC))
+ copy_phys_cpu_present_map(apic_id_map);
-/*
- * Map the PCI dev to the corresponding remapping hardware unit
- * and allocate 'nvec' consecutive interrupt-remapping table entries
- * in it.
- */
-static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
-{
- struct intel_iommu *iommu;
- int index;
-
- iommu = map_dev_to_ir(dev);
- if (!iommu) {
- printk(KERN_ERR
- "Unable to map PCI %s to iommu\n", pci_name(dev));
- return -ENOENT;
- }
+ scoped_guard (raw_spinlock_irqsave, &ioapic_lock)
+ reg_00.raw = io_apic_read(ioapic, 0);
- index = alloc_irte(iommu, irq, nvec);
- if (index < 0) {
- printk(KERN_ERR
- "Unable to allocate %d IRTE for PCI %s\n", nvec,
- pci_name(dev));
- return -ENOSPC;
+ if (apic_id >= broadcast_id) {
+ pr_warn("IOAPIC[%d]: Invalid apic_id %d, trying %d\n",
+ ioapic, apic_id, reg_00.bits.ID);
+ apic_id = reg_00.bits.ID;
}
- return index;
-}
-
-static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
-{
- int ret;
- struct msi_msg msg;
- ret = msi_compose_msg(dev, irq, &msg);
- if (ret < 0)
- return ret;
+ /* Every APIC in a system must have a unique ID */
+ if (test_bit(apic_id, apic_id_map)) {
+ for (i = 0; i < broadcast_id; i++) {
+ if (!test_bit(i, apic_id_map))
+ break;
+ }
- set_irq_msi(irq, msidesc);
- write_msi_msg(irq, &msg);
+ if (i == broadcast_id)
+ panic("Max apic_id exceeded!\n");
- if (irq_remapped(irq)) {
- struct irq_desc *desc = irq_to_desc(irq);
- /*
- * irq migration in process context
- */
- desc->status |= IRQ_MOVE_PCNTXT;
- set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge");
- } else
- set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
+ pr_warn("IOAPIC[%d]: apic_id %d already used, trying %d\n", ioapic, apic_id, i);
+ apic_id = i;
+ }
- dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
+ set_bit(apic_id, apic_id_map);
- return 0;
-}
+ if (reg_00.bits.ID != apic_id) {
+ reg_00.bits.ID = apic_id;
-int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
-{
- unsigned int irq;
- int ret, sub_handle;
- struct msi_desc *msidesc;
- unsigned int irq_want;
- struct intel_iommu *iommu = NULL;
- int index = 0;
- int node;
-
- /* x86 doesn't support multiple MSI yet */
- if (type == PCI_CAP_ID_MSI && nvec > 1)
- return 1;
+ scoped_guard (raw_spinlock_irqsave, &ioapic_lock) {
+ io_apic_write(ioapic, 0, reg_00.raw);
+ reg_00.raw = io_apic_read(ioapic, 0);
+ }
- node = dev_to_node(&dev->dev);
- irq_want = nr_irqs_gsi;
- sub_handle = 0;
- list_for_each_entry(msidesc, &dev->msi_list, list) {
- irq = create_irq_nr(irq_want, node);
- if (irq == 0)
+ /* Sanity check */
+ if (reg_00.bits.ID != apic_id) {
+ pr_err("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
return -1;
- irq_want = irq + 1;
- if (!intr_remapping_enabled)
- goto no_ir;
-
- if (!sub_handle) {
- /*
- * allocate the consecutive block of IRTE's
- * for 'nvec'
- */
- index = msi_alloc_irte(dev, irq, nvec);
- if (index < 0) {
- ret = index;
- goto error;
- }
- } else {
- iommu = map_dev_to_ir(dev);
- if (!iommu) {
- ret = -ENOENT;
- goto error;
- }
- /*
- * setup the mapping between the irq and the IRTE
- * base index, the sub_handle pointing to the
- * appropriate interrupt remap table entry.
- */
- set_irte_irq(irq, iommu, index, sub_handle);
}
-no_ir:
- ret = setup_msi_irq(dev, msidesc, irq);
- if (ret < 0)
- goto error;
- sub_handle++;
}
- return 0;
-
-error:
- destroy_irq(irq);
- return ret;
-}
-void arch_teardown_msi_irq(unsigned int irq)
-{
- destroy_irq(irq);
-}
-
-#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
-#ifdef CONFIG_SMP
-static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
-{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irq_cfg *cfg;
- struct msi_msg msg;
- unsigned int dest;
+ apic_pr_verbose("IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
- dest = set_desc_affinity(desc, mask);
- if (dest == BAD_APICID)
- return -1;
-
- cfg = desc->chip_data;
-
- dmar_msi_read(irq, &msg);
-
- msg.data &= ~MSI_DATA_VECTOR_MASK;
- msg.data |= MSI_DATA_VECTOR(cfg->vector);
- msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
- msg.address_lo |= MSI_ADDR_DEST_ID(dest);
-
- dmar_msi_write(irq, &msg);
-
- return 0;
+ return apic_id;
}
-#endif /* CONFIG_SMP */
-
-static struct irq_chip dmar_msi_type = {
- .name = "DMAR_MSI",
- .unmask = dmar_msi_unmask,
- .mask = dmar_msi_mask,
- .ack = ack_apic_edge,
-#ifdef CONFIG_SMP
- .set_affinity = dmar_msi_set_affinity,
-#endif
- .retrigger = ioapic_retrigger_irq,
-};
-
-int arch_setup_dmar_msi(unsigned int irq)
+static u8 io_apic_unique_id(int idx, u8 id)
{
- int ret;
- struct msi_msg msg;
-
- ret = msi_compose_msg(NULL, irq, &msg);
- if (ret < 0)
- return ret;
- dmar_msi_write(irq, &msg);
- set_irq_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq,
- "edge");
- return 0;
+ if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && !APIC_XAPIC(boot_cpu_apic_version))
+ return io_apic_get_unique_id(idx, id);
+ return id;
}
-#endif
-
-#ifdef CONFIG_HPET_TIMER
-
-#ifdef CONFIG_SMP
-static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
+#else
+static u8 io_apic_unique_id(int idx, u8 id)
{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irq_cfg *cfg;
- struct msi_msg msg;
- unsigned int dest;
+ union IO_APIC_reg_00 reg_00;
+ DECLARE_BITMAP(used, 256);
+ u8 new_id;
+ int i;
- dest = set_desc_affinity(desc, mask);
- if (dest == BAD_APICID)
- return -1;
+ bitmap_zero(used, 256);
+ for_each_ioapic(i)
+ __set_bit(mpc_ioapic_id(i), used);
- cfg = desc->chip_data;
+ /* Hand out the requested id if available */
+ if (!test_bit(id, used))
+ return id;
- hpet_msi_read(irq, &msg);
+ /*
+ * Read the current id from the ioapic and keep it if
+ * available.
+ */
+ scoped_guard (raw_spinlock_irqsave, &ioapic_lock)
+ reg_00.raw = io_apic_read(idx, 0);
- msg.data &= ~MSI_DATA_VECTOR_MASK;
- msg.data |= MSI_DATA_VECTOR(cfg->vector);
- msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
- msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+ new_id = reg_00.bits.ID;
+ if (!test_bit(new_id, used)) {
+ apic_pr_verbose("IOAPIC[%d]: Using reg apic_id %d instead of %d\n",
+ idx, new_id, id);
+ return new_id;
+ }
- hpet_msi_write(irq, &msg);
+ /* Get the next free id and write it to the ioapic. */
+ new_id = find_first_zero_bit(used, 256);
+ reg_00.bits.ID = new_id;
+ scoped_guard (raw_spinlock_irqsave, &ioapic_lock) {
+ io_apic_write(idx, 0, reg_00.raw);
+ reg_00.raw = io_apic_read(idx, 0);
+ }
+ /* Sanity check */
+ BUG_ON(reg_00.bits.ID != new_id);
- return 0;
+ return new_id;
}
-
-#endif /* CONFIG_SMP */
-
-static struct irq_chip hpet_msi_type = {
- .name = "HPET_MSI",
- .unmask = hpet_msi_unmask,
- .mask = hpet_msi_mask,
- .ack = ack_apic_edge,
-#ifdef CONFIG_SMP
- .set_affinity = hpet_msi_set_affinity,
#endif
- .retrigger = ioapic_retrigger_irq,
-};
-int arch_setup_hpet_msi(unsigned int irq)
+static int io_apic_get_version(int ioapic)
{
- int ret;
- struct msi_msg msg;
- struct irq_desc *desc = irq_to_desc(irq);
-
- ret = msi_compose_msg(NULL, irq, &msg);
- if (ret < 0)
- return ret;
+ union IO_APIC_reg_01 reg_01;
- hpet_msi_write(irq, &msg);
- desc->status |= IRQ_MOVE_PCNTXT;
- set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq,
- "edge");
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
+ reg_01.raw = io_apic_read(ioapic, 1);
- return 0;
+ return reg_01.bits.version;
}
-#endif
-#endif /* CONFIG_PCI_MSI */
/*
- * Hypertransport interrupt support
+ * This function updates target affinity of IOAPIC interrupts to include
+ * the CPUs which came online during SMP bringup.
*/
-#ifdef CONFIG_HT_IRQ
+#define IOAPIC_RESOURCE_NAME_SIZE 11
-#ifdef CONFIG_SMP
+static struct resource *ioapic_resources;
-static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
+static struct resource * __init ioapic_setup_resources(void)
{
- struct ht_irq_msg msg;
- fetch_ht_irq_msg(irq, &msg);
-
- msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK);
- msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
+ struct resource *res;
+ unsigned long n;
+ char *mem;
+ int i;
- msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest);
- msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
+ if (nr_ioapics == 0)
+ return NULL;
- write_ht_irq_msg(irq, &msg);
-}
+ n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
+ n *= nr_ioapics;
-static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
-{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irq_cfg *cfg;
- unsigned int dest;
+ mem = memblock_alloc_or_panic(n, SMP_CACHE_BYTES);
+ res = (void *)mem;
- dest = set_desc_affinity(desc, mask);
- if (dest == BAD_APICID)
- return -1;
+ mem += sizeof(struct resource) * nr_ioapics;
- cfg = desc->chip_data;
+ for_each_ioapic(i) {
+ res[i].name = mem;
+ res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i);
+ mem += IOAPIC_RESOURCE_NAME_SIZE;
+ ioapics[i].iomem_res = &res[i];
+ }
- target_ht_irq(irq, dest, cfg->vector);
+ ioapic_resources = res;
- return 0;
+ return res;
}
-#endif
-
-static struct irq_chip ht_irq_chip = {
- .name = "PCI-HT",
- .mask = mask_ht_irq,
- .unmask = unmask_ht_irq,
- .ack = ack_apic_edge,
-#ifdef CONFIG_SMP
- .set_affinity = set_ht_irq_affinity,
-#endif
- .retrigger = ioapic_retrigger_irq,
-};
-
-int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
+static void io_apic_set_fixmap(enum fixed_addresses idx, phys_addr_t phys)
{
- struct irq_cfg *cfg;
- int err;
+ pgprot_t flags = FIXMAP_PAGE_NOCACHE;
- if (disable_apic)
- return -ENXIO;
-
- cfg = irq_cfg(irq);
- err = assign_irq_vector(irq, cfg, apic->target_cpus());
- if (!err) {
- struct ht_irq_msg msg;
- unsigned dest;
-
- dest = apic->cpu_mask_to_apicid_and(cfg->domain,
- apic->target_cpus());
-
- msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
+ /*
+ * Ensure fixmaps for IO-APIC MMIO respect memory encryption pgprot
+ * bits, just like normal ioremap():
+ */
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
+ if (x86_platform.hyper.is_private_mmio(phys))
+ flags = pgprot_encrypted(flags);
+ else
+ flags = pgprot_decrypted(flags);
+ }
- msg.address_lo =
- HT_IRQ_LOW_BASE |
- HT_IRQ_LOW_DEST_ID(dest) |
- HT_IRQ_LOW_VECTOR(cfg->vector) |
- ((apic->irq_dest_mode == 0) ?
- HT_IRQ_LOW_DM_PHYSICAL :
- HT_IRQ_LOW_DM_LOGICAL) |
- HT_IRQ_LOW_RQEOI_EDGE |
- ((apic->irq_delivery_mode != dest_LowestPrio) ?
- HT_IRQ_LOW_MT_FIXED :
- HT_IRQ_LOW_MT_ARBITRATED) |
- HT_IRQ_LOW_IRQ_MASKED;
+ __set_fixmap(idx, phys, flags);
+}
- write_ht_irq_msg(irq, &msg);
+void __init io_apic_init_mappings(void)
+{
+ unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
+ struct resource *ioapic_res;
+ int i;
- set_irq_chip_and_handler_name(irq, &ht_irq_chip,
- handle_edge_irq, "edge");
+ ioapic_res = ioapic_setup_resources();
+ for_each_ioapic(i) {
+ if (smp_found_config) {
+ ioapic_phys = mpc_ioapic_addr(i);
+#ifdef CONFIG_X86_32
+ if (!ioapic_phys) {
+ pr_err("WARNING: bogus zero IO-APIC address found in MPTABLE, "
+ "disabling IO/APIC support!\n");
+ smp_found_config = 0;
+ ioapic_is_disabled = true;
+ goto fake_ioapic_page;
+ }
+#endif
+ } else {
+#ifdef CONFIG_X86_32
+fake_ioapic_page:
+#endif
+ ioapic_phys = (unsigned long)memblock_alloc_or_panic(PAGE_SIZE,
+ PAGE_SIZE);
+ ioapic_phys = __pa(ioapic_phys);
+ }
+ io_apic_set_fixmap(idx, ioapic_phys);
+ apic_pr_verbose("mapped IOAPIC to %08lx (%08lx)\n",
+ __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK), ioapic_phys);
+ idx++;
- dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq);
+ ioapic_res->start = ioapic_phys;
+ ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1;
+ ioapic_res++;
}
- return err;
}
-#endif /* CONFIG_HT_IRQ */
-#ifdef CONFIG_X86_UV
-/*
- * Re-target the irq to the specified CPU and enable the specified MMR located
- * on the specified blade to allow the sending of MSIs to the specified CPU.
- */
-int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
- unsigned long mmr_offset)
+void __init ioapic_insert_resources(void)
{
- const struct cpumask *eligible_cpu = cpumask_of(cpu);
- struct irq_cfg *cfg;
- int mmr_pnode;
- unsigned long mmr_value;
- struct uv_IO_APIC_route_entry *entry;
- unsigned long flags;
- int err;
-
- BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
-
- cfg = irq_cfg(irq);
-
- err = assign_irq_vector(irq, cfg, eligible_cpu);
- if (err != 0)
- return err;
-
- spin_lock_irqsave(&vector_lock, flags);
- set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
- irq_name);
- spin_unlock_irqrestore(&vector_lock, flags);
-
- mmr_value = 0;
- entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
- entry->vector = cfg->vector;
- entry->delivery_mode = apic->irq_delivery_mode;
- entry->dest_mode = apic->irq_dest_mode;
- entry->polarity = 0;
- entry->trigger = 0;
- entry->mask = 0;
- entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
-
- mmr_pnode = uv_blade_to_pnode(mmr_blade);
- uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
+ struct resource *r = ioapic_resources;
+ int i;
- if (cfg->move_in_progress)
- send_cleanup_vector(cfg);
+ if (!r) {
+ if (nr_ioapics > 0)
+ pr_err("IO APIC resources couldn't be allocated.\n");
+ return;
+ }
- return irq;
+ for_each_ioapic(i) {
+ insert_resource(&iomem_resource, r);
+ r++;
+ }
}
-/*
- * Disable the specified MMR located on the specified blade so that MSIs are
- * longer allowed to be sent.
- */
-void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset)
+int mp_find_ioapic(u32 gsi)
{
- unsigned long mmr_value;
- struct uv_IO_APIC_route_entry *entry;
- int mmr_pnode;
+ int i;
+
+ if (nr_ioapics == 0)
+ return -1;
- BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
+ /* Find the IOAPIC that manages this GSI. */
+ for_each_ioapic(i) {
+ struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(i);
- mmr_value = 0;
- entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
- entry->mask = 1;
+ if (gsi >= gsi_cfg->gsi_base && gsi <= gsi_cfg->gsi_end)
+ return i;
+ }
- mmr_pnode = uv_blade_to_pnode(mmr_blade);
- uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
+ pr_err("ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
+ return -1;
}
-#endif /* CONFIG_X86_64 */
-int __init io_apic_get_redir_entries (int ioapic)
+int mp_find_ioapic_pin(int ioapic, u32 gsi)
{
- union IO_APIC_reg_01 reg_01;
- unsigned long flags;
+ struct mp_ioapic_gsi *gsi_cfg;
- spin_lock_irqsave(&ioapic_lock, flags);
- reg_01.raw = io_apic_read(ioapic, 1);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ if (WARN_ON(ioapic < 0))
+ return -1;
+
+ gsi_cfg = mp_ioapic_gsi_routing(ioapic);
+ if (WARN_ON(gsi > gsi_cfg->gsi_end))
+ return -1;
- return reg_01.bits.entries;
+ return gsi - gsi_cfg->gsi_base;
}
-void __init probe_nr_irqs_gsi(void)
+static int bad_ioapic_register(int idx)
{
- int nr = 0;
-
- nr = acpi_probe_gsi();
- if (nr > nr_irqs_gsi) {
- nr_irqs_gsi = nr;
- } else {
- /* for acpi=off or acpi is not compiled in */
- int idx;
+ union IO_APIC_reg_00 reg_00;
+ union IO_APIC_reg_01 reg_01;
+ union IO_APIC_reg_02 reg_02;
- nr = 0;
- for (idx = 0; idx < nr_ioapics; idx++)
- nr += io_apic_get_redir_entries(idx) + 1;
+ reg_00.raw = io_apic_read(idx, 0);
+ reg_01.raw = io_apic_read(idx, 1);
+ reg_02.raw = io_apic_read(idx, 2);
- if (nr > nr_irqs_gsi)
- nr_irqs_gsi = nr;
+ if (reg_00.raw == -1 && reg_01.raw == -1 && reg_02.raw == -1) {
+ pr_warn("I/O APIC 0x%x registers return all ones, skipping!\n",
+ mpc_ioapic_addr(idx));
+ return 1;
}
- printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
+ return 0;
}
-#ifdef CONFIG_SPARSE_IRQ
-int __init arch_probe_nr_irqs(void)
+static int find_free_ioapic_entry(void)
{
- int nr;
-
- if (nr_irqs > (NR_VECTORS * nr_cpu_ids))
- nr_irqs = NR_VECTORS * nr_cpu_ids;
-
- nr = nr_irqs_gsi + 8 * nr_cpu_ids;
-#if defined(CONFIG_PCI_MSI) || defined(CONFIG_HT_IRQ)
- /*
- * for MSI and HT dyn irq
- */
- nr += nr_irqs_gsi * 16;
-#endif
- if (nr < nr_irqs)
- nr_irqs = nr;
-
- return 0;
+ for (int idx = 0; idx < MAX_IO_APICS; idx++) {
+ if (ioapics[idx].nr_registers == 0)
+ return idx;
+ }
+ return MAX_IO_APICS;
}
-#endif
-static int __io_apic_set_pci_routing(struct device *dev, int irq,
- struct io_apic_irq_attr *irq_attr)
+/**
+ * mp_register_ioapic - Register an IOAPIC device
+ * @id: hardware IOAPIC ID
+ * @address: physical address of IOAPIC register area
+ * @gsi_base: base of GSI associated with the IOAPIC
+ * @cfg: configuration information for the IOAPIC
+ */
+int mp_register_ioapic(int id, u32 address, u32 gsi_base, struct ioapic_domain_cfg *cfg)
{
- struct irq_desc *desc;
- struct irq_cfg *cfg;
- int node;
- int ioapic, pin;
- int trigger, polarity;
+ bool hotplug = !!ioapic_initialized;
+ struct mp_ioapic_gsi *gsi_cfg;
+ int idx, ioapic, entries;
+ u32 gsi_end;
- ioapic = irq_attr->ioapic;
- if (!IO_APIC_IRQ(irq)) {
- apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
- ioapic);
+ if (!address) {
+ pr_warn("Bogus (zero) I/O APIC address found, skipping!\n");
return -EINVAL;
}
- if (dev)
- node = dev_to_node(dev);
- else
- node = cpu_to_node(boot_cpu_id);
+ for_each_ioapic(ioapic) {
+ if (ioapics[ioapic].mp_config.apicaddr == address) {
+ pr_warn("address 0x%x conflicts with IOAPIC%d\n", address, ioapic);
+ return -EEXIST;
+ }
+ }
- desc = irq_to_desc_alloc_node(irq, node);
- if (!desc) {
- printk(KERN_INFO "can not get irq_desc %d\n", irq);
- return 0;
+ idx = find_free_ioapic_entry();
+ if (idx >= MAX_IO_APICS) {
+ pr_warn("Max # of I/O APICs (%d) exceeded (found %d), skipping\n",
+ MAX_IO_APICS, idx);
+ return -ENOSPC;
}
- pin = irq_attr->ioapic_pin;
- trigger = irq_attr->trigger;
- polarity = irq_attr->polarity;
+ ioapics[idx].mp_config.type = MP_IOAPIC;
+ ioapics[idx].mp_config.flags = MPC_APIC_USABLE;
+ ioapics[idx].mp_config.apicaddr = address;
- /*
- * IRQs < 16 are already in the irq_2_pin[] map
- */
- if (irq >= NR_IRQS_LEGACY) {
- cfg = desc->chip_data;
- add_pin_to_irq_node(cfg, node, ioapic, pin);
+ io_apic_set_fixmap(FIX_IO_APIC_BASE_0 + idx, address);
+ if (bad_ioapic_register(idx)) {
+ clear_fixmap(FIX_IO_APIC_BASE_0 + idx);
+ return -ENODEV;
}
- setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity);
+ ioapics[idx].mp_config.apicid = io_apic_unique_id(idx, id);
+ ioapics[idx].mp_config.apicver = io_apic_get_version(idx);
- return 0;
-}
-
-int io_apic_set_pci_routing(struct device *dev, int irq,
- struct io_apic_irq_attr *irq_attr)
-{
- int ioapic, pin;
/*
- * Avoid pin reprogramming. PRTs typically include entries
- * with redundant pin->gsi mappings (but unique PCI devices);
- * we only program the IOAPIC on the first.
+ * Build basic GSI lookup table to facilitate gsi->io_apic lookups
+ * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
*/
- ioapic = irq_attr->ioapic;
- pin = irq_attr->ioapic_pin;
- if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) {
- pr_debug("Pin %d-%d already programmed\n",
- mp_ioapics[ioapic].apicid, pin);
- return 0;
+ entries = io_apic_get_redir_entries(idx);
+ gsi_end = gsi_base + entries - 1;
+ for_each_ioapic(ioapic) {
+ gsi_cfg = mp_ioapic_gsi_routing(ioapic);
+ if ((gsi_base >= gsi_cfg->gsi_base &&
+ gsi_base <= gsi_cfg->gsi_end) ||
+ (gsi_end >= gsi_cfg->gsi_base &&
+ gsi_end <= gsi_cfg->gsi_end)) {
+ pr_warn("GSI range [%u-%u] for new IOAPIC conflicts with GSI[%u-%u]\n",
+ gsi_base, gsi_end, gsi_cfg->gsi_base, gsi_cfg->gsi_end);
+ clear_fixmap(FIX_IO_APIC_BASE_0 + idx);
+ return -ENOSPC;
+ }
}
- set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed);
+ gsi_cfg = mp_ioapic_gsi_routing(idx);
+ gsi_cfg->gsi_base = gsi_base;
+ gsi_cfg->gsi_end = gsi_end;
- return __io_apic_set_pci_routing(dev, irq, irq_attr);
-}
-
-/* --------------------------------------------------------------------------
- ACPI-based IOAPIC Configuration
- -------------------------------------------------------------------------- */
-
-#ifdef CONFIG_ACPI
-
-#ifdef CONFIG_X86_32
-int __init io_apic_get_unique_id(int ioapic, int apic_id)
-{
- union IO_APIC_reg_00 reg_00;
- static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
- physid_mask_t tmp;
- unsigned long flags;
- int i = 0;
+ ioapics[idx].irqdomain = NULL;
+ ioapics[idx].irqdomain_cfg = *cfg;
/*
- * The P4 platform supports up to 256 APIC IDs on two separate APIC
- * buses (one for LAPICs, one for IOAPICs), where predecessors only
- * supports up to 16 on one shared APIC bus.
- *
- * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
- * advantage of new APIC bus architecture.
+ * If mp_register_ioapic() is called during early boot stage when
+ * walking ACPI/DT tables, it's too early to create irqdomain,
+ * we are still using bootmem allocator. So delay it to setup_IO_APIC().
*/
-
- if (physids_empty(apic_id_map))
- apic_id_map = apic->ioapic_phys_id_map(phys_cpu_present_map);
-
- spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(ioapic, 0);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-
- if (apic_id >= get_physical_broadcast()) {
- printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
- "%d\n", ioapic, apic_id, reg_00.bits.ID);
- apic_id = reg_00.bits.ID;
+ if (hotplug) {
+ if (mp_irqdomain_create(idx)) {
+ clear_fixmap(FIX_IO_APIC_BASE_0 + idx);
+ return -ENOMEM;
+ }
+ alloc_ioapic_saved_registers(idx);
}
- /*
- * Every APIC in a system must have a unique ID or we get lots of nice
- * 'stuck on smp_invalidate_needed IPI wait' messages.
- */
- if (apic->check_apicid_used(apic_id_map, apic_id)) {
+ if (gsi_cfg->gsi_end >= gsi_top)
+ gsi_top = gsi_cfg->gsi_end + 1;
+ if (nr_ioapics <= idx)
+ nr_ioapics = idx + 1;
- for (i = 0; i < get_physical_broadcast(); i++) {
- if (!apic->check_apicid_used(apic_id_map, i))
- break;
- }
+ /* Set nr_registers to mark entry present */
+ ioapics[idx].nr_registers = entries;
- if (i == get_physical_broadcast())
- panic("Max apic_id exceeded!\n");
+ pr_info("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, GSI %d-%d\n",
+ idx, mpc_ioapic_id(idx), mpc_ioapic_ver(idx), mpc_ioapic_addr(idx),
+ gsi_cfg->gsi_base, gsi_cfg->gsi_end);
- printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
- "trying %d\n", ioapic, apic_id, i);
+ return 0;
+}
- apic_id = i;
- }
+int mp_unregister_ioapic(u32 gsi_base)
+{
+ int ioapic, pin;
+ int found = 0;
- tmp = apic->apicid_to_cpu_present(apic_id);
- physids_or(apic_id_map, apic_id_map, tmp);
+ for_each_ioapic(ioapic) {
+ if (ioapics[ioapic].gsi_config.gsi_base == gsi_base) {
+ found = 1;
+ break;
+ }
+ }
- if (reg_00.bits.ID != apic_id) {
- reg_00.bits.ID = apic_id;
+ if (!found) {
+ pr_warn("can't find IOAPIC for GSI %d\n", gsi_base);
+ return -ENODEV;
+ }
- spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(ioapic, 0, reg_00.raw);
- reg_00.raw = io_apic_read(ioapic, 0);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ for_each_pin(ioapic, pin) {
+ u32 gsi = mp_pin_to_gsi(ioapic, pin);
+ int irq = mp_map_gsi_to_irq(gsi, 0, NULL);
+ struct mp_chip_data *data;
- /* Sanity check */
- if (reg_00.bits.ID != apic_id) {
- printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
- return -1;
+ if (irq >= 0) {
+ data = irq_get_chip_data(irq);
+ if (data && data->count) {
+ pr_warn("pin%d on IOAPIC%d is still in use.\n", pin, ioapic);
+ return -EBUSY;
+ }
}
}
- apic_printk(APIC_VERBOSE, KERN_INFO
- "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
+ /* Mark entry not present */
+ ioapics[ioapic].nr_registers = 0;
+ ioapic_destroy_irqdomain(ioapic);
+ free_ioapic_saved_registers(ioapic);
+ if (ioapics[ioapic].iomem_res)
+ release_resource(ioapics[ioapic].iomem_res);
+ clear_fixmap(FIX_IO_APIC_BASE_0 + ioapic);
+ memset(&ioapics[ioapic], 0, sizeof(ioapics[ioapic]));
- return apic_id;
+ return 0;
}
-#endif
-int __init io_apic_get_version(int ioapic)
+int mp_ioapic_registered(u32 gsi_base)
{
- union IO_APIC_reg_01 reg_01;
- unsigned long flags;
+ int ioapic;
- spin_lock_irqsave(&ioapic_lock, flags);
- reg_01.raw = io_apic_read(ioapic, 1);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ for_each_ioapic(ioapic)
+ if (ioapics[ioapic].gsi_config.gsi_base == gsi_base)
+ return 1;
- return reg_01.bits.version;
+ return 0;
}
-int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
+static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data,
+ struct irq_alloc_info *info)
{
- int i;
-
- if (skip_ioapic_setup)
- return -1;
-
- for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].irqtype == mp_INT &&
- mp_irqs[i].srcbusirq == bus_irq)
- break;
- if (i >= mp_irq_entries)
- return -1;
-
- *trigger = irq_trigger(i);
- *polarity = irq_polarity(i);
- return 0;
+ if (info && info->ioapic.valid) {
+ data->is_level = info->ioapic.is_level;
+ data->active_low = info->ioapic.active_low;
+ } else if (__acpi_get_override_irq(gsi, &data->is_level, &data->active_low) < 0) {
+ /* PCI interrupts are always active low level triggered. */
+ data->is_level = true;
+ data->active_low = true;
+ }
}
-#endif /* CONFIG_ACPI */
-
/*
- * This function currently is only a helper for the i386 smp boot process where
- * we need to reprogram the ioredtbls to cater for the cpus which have come online
- * so mask in all cases should simply be apic->target_cpus()
+ * Configure the I/O-APIC specific fields in the routing entry.
+ *
+ * This is important to setup the I/O-APIC specific bits (is_level,
+ * active_low, masked) because the underlying parent domain will only
+ * provide the routing information and is oblivious of the I/O-APIC
+ * specific bits.
+ *
+ * The entry is just preconfigured at this point and not written into the
+ * RTE. This happens later during activation which will fill in the actual
+ * routing information.
*/
-#ifdef CONFIG_SMP
-void __init setup_ioapic_dest(void)
+static void mp_preconfigure_entry(struct mp_chip_data *data)
{
- int pin, ioapic = 0, irq, irq_entry;
- struct irq_desc *desc;
- const struct cpumask *mask;
-
- if (skip_ioapic_setup == 1)
- return;
-
-#ifdef CONFIG_ACPI
- if (!acpi_disabled && acpi_ioapic) {
- ioapic = mp_find_ioapic(0);
- if (ioapic < 0)
- ioapic = 0;
- }
-#endif
-
- for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
- irq_entry = find_irq_entry(ioapic, pin, mp_INT);
- if (irq_entry == -1)
- continue;
- irq = pin_2_irq(irq_entry, ioapic, pin);
-
- desc = irq_to_desc(irq);
-
- /*
- * Honour affinities which have been set in early boot
- */
- if (desc->status &
- (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
- mask = desc->affinity;
- else
- mask = apic->target_cpus();
-
- if (intr_remapping_enabled)
- set_ir_ioapic_affinity_irq_desc(desc, mask);
- else
- set_ioapic_affinity_irq_desc(desc, mask);
- }
+ struct IO_APIC_route_entry *entry = &data->entry;
+ memset(entry, 0, sizeof(*entry));
+ entry->is_level = data->is_level;
+ entry->active_low = data->active_low;
+ /*
+ * Mask level triggered irqs. Edge triggered irqs are masked
+ * by the irq core code in case they fire.
+ */
+ entry->masked = data->is_level;
}
-#endif
-
-#define IOAPIC_RESOURCE_NAME_SIZE 11
-
-static struct resource *ioapic_resources;
-static struct resource * __init ioapic_setup_resources(void)
+int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
{
- unsigned long n;
- struct resource *res;
- char *mem;
- int i;
+ struct irq_alloc_info *info = arg;
+ struct mp_chip_data *data;
+ struct irq_data *irq_data;
+ int ret, ioapic, pin;
+ unsigned long flags;
- if (nr_ioapics <= 0)
- return NULL;
+ if (!info || nr_irqs > 1)
+ return -EINVAL;
+ irq_data = irq_domain_get_irq_data(domain, virq);
+ if (!irq_data)
+ return -EINVAL;
- n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
- n *= nr_ioapics;
+ ioapic = mp_irqdomain_ioapic_idx(domain);
+ pin = info->ioapic.pin;
+ if (irq_resolve_mapping(domain, (irq_hw_number_t)pin))
+ return -EEXIST;
- mem = alloc_bootmem(n);
- res = (void *)mem;
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
- if (mem != NULL) {
- mem += sizeof(struct resource) * nr_ioapics;
+ ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info);
+ if (ret < 0)
+ goto free_data;
- for (i = 0; i < nr_ioapics; i++) {
- res[i].name = mem;
- res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
- sprintf(mem, "IOAPIC %u", i);
- mem += IOAPIC_RESOURCE_NAME_SIZE;
- }
+ INIT_LIST_HEAD(&data->irq_2_pin);
+ irq_data->hwirq = info->ioapic.pin;
+ irq_data->chip = (domain->parent == x86_vector_domain) ?
+ &ioapic_chip : &ioapic_ir_chip;
+ irq_data->chip_data = data;
+ mp_irqdomain_get_attr(mp_pin_to_gsi(ioapic, pin), data, info);
+
+ if (!add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin)) {
+ ret = -ENOMEM;
+ goto free_irqs;
}
- ioapic_resources = res;
+ mp_preconfigure_entry(data);
+ mp_register_handler(virq, data->is_level);
- return res;
+ local_irq_save(flags);
+ if (virq < nr_legacy_irqs())
+ legacy_pic->mask(virq);
+ local_irq_restore(flags);
+
+ apic_pr_verbose("IOAPIC[%d]: Preconfigured routing entry (%d-%d -> IRQ %d Level:%i ActiveLow:%i)\n",
+ ioapic, mpc_ioapic_id(ioapic), pin, virq, data->is_level, data->active_low);
+ return 0;
+
+free_irqs:
+ irq_domain_free_irqs_parent(domain, virq, nr_irqs);
+free_data:
+ kfree(data);
+ return ret;
}
-void __init ioapic_init_mappings(void)
+void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs)
{
- unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
- struct resource *ioapic_res;
- int i;
-
- ioapic_res = ioapic_setup_resources();
- for (i = 0; i < nr_ioapics; i++) {
- if (smp_found_config) {
- ioapic_phys = mp_ioapics[i].apicaddr;
-#ifdef CONFIG_X86_32
- if (!ioapic_phys) {
- printk(KERN_ERR
- "WARNING: bogus zero IO-APIC "
- "address found in MPTABLE, "
- "disabling IO/APIC support!\n");
- smp_found_config = 0;
- skip_ioapic_setup = 1;
- goto fake_ioapic_page;
- }
-#endif
- } else {
-#ifdef CONFIG_X86_32
-fake_ioapic_page:
-#endif
- ioapic_phys = (unsigned long)
- alloc_bootmem_pages(PAGE_SIZE);
- ioapic_phys = __pa(ioapic_phys);
- }
- set_fixmap_nocache(idx, ioapic_phys);
- apic_printk(APIC_VERBOSE,
- "mapped IOAPIC to %08lx (%08lx)\n",
- __fix_to_virt(idx), ioapic_phys);
- idx++;
+ struct irq_data *irq_data;
+ struct mp_chip_data *data;
- if (ioapic_res != NULL) {
- ioapic_res->start = ioapic_phys;
- ioapic_res->end = ioapic_phys + (4 * 1024) - 1;
- ioapic_res++;
- }
+ BUG_ON(nr_irqs != 1);
+ irq_data = irq_domain_get_irq_data(domain, virq);
+ if (irq_data && irq_data->chip_data) {
+ data = irq_data->chip_data;
+ __remove_pin_from_irq(data, mp_irqdomain_ioapic_idx(domain), (int)irq_data->hwirq);
+ WARN_ON(!list_empty(&data->irq_2_pin));
+ kfree(irq_data->chip_data);
}
+ irq_domain_free_irqs_top(domain, virq, nr_irqs);
}
-void __init ioapic_insert_resources(void)
+int mp_irqdomain_activate(struct irq_domain *domain, struct irq_data *irq_data, bool reserve)
{
- int i;
- struct resource *r = ioapic_resources;
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
+ ioapic_configure_entry(irq_data);
+ return 0;
+}
- if (!r) {
- if (nr_ioapics > 0)
- printk(KERN_ERR
- "IO APIC resources couldn't be allocated.\n");
- return;
- }
+void mp_irqdomain_deactivate(struct irq_domain *domain,
+ struct irq_data *irq_data)
+{
+ /* It won't be called for IRQ with multiple IOAPIC pins associated */
+ ioapic_mask_entry(mp_irqdomain_ioapic_idx(domain), (int)irq_data->hwirq);
+}
- for (i = 0; i < nr_ioapics; i++) {
- insert_resource(&iomem_resource, r);
- r++;
- }
+int mp_irqdomain_ioapic_idx(struct irq_domain *domain)
+{
+ return (int)(long)domain->host_data;
}
+
+const struct irq_domain_ops mp_ioapic_irqdomain_ops = {
+ .alloc = mp_irqdomain_alloc,
+ .free = mp_irqdomain_free,
+ .activate = mp_irqdomain_activate,
+ .deactivate = mp_irqdomain_deactivate,
+};
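
The io_apic.c hunks above repeatedly trade explicit spin_lock_irqsave()/spin_unlock_irqrestore() pairs for the scope-based guard() cleanup helper, and move allocations from the old bootmem API to memblock_alloc_or_panic(). A minimal before/after sketch of the locking pattern, modeled on io_apic_get_version() above; the _old/_new names are illustrative only and are not additional patch content:

/* Old shape: a flags variable and an explicit unlock on the exit path. */
static int io_apic_get_version_old(int ioapic)
{
	union IO_APIC_reg_01 reg_01;
	unsigned long flags;

	spin_lock_irqsave(&ioapic_lock, flags);
	reg_01.raw = io_apic_read(ioapic, 1);
	spin_unlock_irqrestore(&ioapic_lock, flags);

	return reg_01.bits.version;
}

/* New shape: the raw spinlock is released automatically when the scope ends. */
static int io_apic_get_version_new(int ioapic)
{
	union IO_APIC_reg_01 reg_01;

	guard(raw_spinlock_irqsave)(&ioapic_lock);
	reg_01.raw = io_apic_read(ioapic, 1);
	return reg_01.bits.version;
}
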
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index dbf5445727a9..98a57cb4aa86 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -1,164 +1,291 @@
-#include <linux/cpumask.h>
-#include <linux/interrupt.h>
-#include <linux/init.h>
+// SPDX-License-Identifier: GPL-2.0
-#include <linux/mm.h>
+#include <linux/cpumask.h>
#include <linux/delay.h>
-#include <linux/spinlock.h>
-#include <linux/kernel_stat.h>
-#include <linux/mc146818rtc.h>
-#include <linux/cache.h>
-#include <linux/cpu.h>
-#include <linux/module.h>
-
-#include <asm/smp.h>
-#include <asm/mtrr.h>
-#include <asm/tlbflush.h>
-#include <asm/mmu_context.h>
-#include <asm/apic.h>
-#include <asm/proto.h>
-#include <asm/ipi.h>
+#include <linux/smp.h>
+#include <linux/string_choices.h>
-void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector)
+#include <asm/io_apic.h>
+
+#include "local.h"
+
+DEFINE_STATIC_KEY_FALSE(apic_use_ipi_shorthand);
+
+#ifdef CONFIG_SMP
+static int apic_ipi_shorthand_off __ro_after_init;
+
+static __init int apic_ipi_shorthand(char *str)
{
- unsigned long query_cpu;
- unsigned long flags;
+ get_option(&str, &apic_ipi_shorthand_off);
+ return 1;
+}
+__setup("no_ipi_broadcast=", apic_ipi_shorthand);
+
+static int __init print_ipi_mode(void)
+{
+ pr_info("IPI shorthand broadcast: %s\n",
+ str_disabled_enabled(apic_ipi_shorthand_off));
+ return 0;
+}
+late_initcall(print_ipi_mode);
+void apic_smt_update(void)
+{
/*
- * Hack. The clustered APIC addressing mode doesn't allow us to send
- * to an arbitrary mask, so I do a unicast to each CPU instead.
- * - mbligh
+ * Do not switch to broadcast mode if:
+ * - Disabled on the command line
+ * - Only a single CPU is online
+ * - Not all present CPUs have been at least booted once
+ *
+ * The latter is important as the local APIC might be in some
+ * random state and a broadcast might cause havoc. That's
+ * especially true for NMI broadcasting.
*/
- local_irq_save(flags);
- for_each_cpu(query_cpu, mask) {
- __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid,
- query_cpu), vector, APIC_DEST_PHYSICAL);
+ if (apic_ipi_shorthand_off || num_online_cpus() == 1 ||
+ !cpumask_equal(cpu_present_mask, &cpus_booted_once_mask)) {
+ static_branch_disable(&apic_use_ipi_shorthand);
+ } else {
+ static_branch_enable(&apic_use_ipi_shorthand);
}
- local_irq_restore(flags);
}
-void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
- int vector)
+void apic_send_IPI_allbutself(unsigned int vector)
{
- unsigned int this_cpu = smp_processor_id();
- unsigned int query_cpu;
- unsigned long flags;
+ if (num_online_cpus() < 2)
+ return;
- /* See Hack comment above */
+ if (static_branch_likely(&apic_use_ipi_shorthand))
+ __apic_send_IPI_allbutself(vector);
+ else
+ __apic_send_IPI_mask_allbutself(cpu_online_mask, vector);
+}
- local_irq_save(flags);
- for_each_cpu(query_cpu, mask) {
- if (query_cpu == this_cpu)
- continue;
- __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid,
- query_cpu), vector, APIC_DEST_PHYSICAL);
+/*
+ * Send a 'reschedule' IPI to another CPU. It goes straight through and
+ * wastes no time serializing anything. Worst case is that we lose a
+ * reschedule ...
+ */
+void native_smp_send_reschedule(int cpu)
+{
+ if (unlikely(cpu_is_offline(cpu))) {
+ WARN(1, "sched: Unexpected reschedule of offline CPU#%d!\n", cpu);
+ return;
}
- local_irq_restore(flags);
+ __apic_send_IPI(cpu, RESCHEDULE_VECTOR);
}
-void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
- int vector)
+void native_send_call_func_single_ipi(int cpu)
{
- unsigned long flags;
- unsigned int query_cpu;
+ __apic_send_IPI(cpu, CALL_FUNCTION_SINGLE_VECTOR);
+}
+
+void native_send_call_func_ipi(const struct cpumask *mask)
+{
+ if (static_branch_likely(&apic_use_ipi_shorthand)) {
+ unsigned int cpu = smp_processor_id();
+
+ if (!cpumask_or_equal(mask, cpumask_of(cpu), cpu_online_mask))
+ goto sendmask;
+
+ if (cpumask_test_cpu(cpu, mask))
+ __apic_send_IPI_all(CALL_FUNCTION_VECTOR);
+ else if (num_online_cpus() > 1)
+ __apic_send_IPI_allbutself(CALL_FUNCTION_VECTOR);
+ return;
+ }
+
+sendmask:
+ __apic_send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
+}
+
+void apic_send_nmi_to_offline_cpu(unsigned int cpu)
+{
+ if (WARN_ON_ONCE(!apic->nmi_to_offline_cpu))
+ return;
+ if (WARN_ON_ONCE(!cpumask_test_cpu(cpu, &cpus_booted_once_mask)))
+ return;
+ apic->send_IPI(cpu, NMI_VECTOR);
+}
+#endif /* CONFIG_SMP */
+static inline int __prepare_ICR2(unsigned int mask)
+{
+ return SET_XAPIC_DEST_FIELD(mask);
+}
+
+u32 apic_mem_wait_icr_idle_timeout(void)
+{
+ int cnt;
+
+ for (cnt = 0; cnt < 1000; cnt++) {
+ if (!(apic_read(APIC_ICR) & APIC_ICR_BUSY))
+ return 0;
+ inc_irq_stat(icr_read_retry_count);
+ udelay(100);
+ }
+ return APIC_ICR_BUSY;
+}
+
+void apic_mem_wait_icr_idle(void)
+{
+ while (native_apic_mem_read(APIC_ICR) & APIC_ICR_BUSY)
+ cpu_relax();
+}
+
+/*
+ * This is safe against interruption because it only writes the lower 32
+ * bits of the APIC_ICR register. The destination field is ignored for
+ * short hand IPIs.
+ *
+ * wait_icr_idle()
+ * write(ICR2, dest)
+ * NMI
+ * wait_icr_idle()
+ * write(ICR)
+ * wait_icr_idle()
+ * write(ICR)
+ *
+ * This function does not need to disable interrupts as there is no ICR2
+ * interaction. The memory write is direct except when the machine is
+ * affected by the 11AP Pentium erratum, which turns the plain write into
+ * an XCHG operation.
+ */
+static void __default_send_IPI_shortcut(unsigned int shortcut, int vector)
+{
/*
- * Hack. The clustered APIC addressing mode doesn't allow us to send
- * to an arbitrary mask, so I do a unicasts to each CPU instead. This
- * should be modified to do 1 message per cluster ID - mbligh
+ * Wait for the previous ICR command to complete. Use
+ * safe_apic_wait_icr_idle() for the NMI vector as there have been
+ * issues where otherwise the system hangs when the panic CPU tries
+ * to stop the others before launching the kdump kernel.
*/
+ if (unlikely(vector == NMI_VECTOR))
+ apic_mem_wait_icr_idle_timeout();
+ else
+ apic_mem_wait_icr_idle();
+
+ /* Destination field (ICR2) and the destination mode are ignored */
+ native_apic_mem_write(APIC_ICR, __prepare_ICR(shortcut, vector, 0));
+}
+
+/*
+ * This is used to send an IPI with no shorthand notation (the destination is
+ * specified in bits 56 to 63 of the ICR).
+ */
+void __default_send_IPI_dest_field(unsigned int dest_mask, int vector,
+ unsigned int dest_mode)
+{
+ /* See comment in __default_send_IPI_shortcut() */
+ if (unlikely(vector == NMI_VECTOR))
+ apic_mem_wait_icr_idle_timeout();
+ else
+ apic_mem_wait_icr_idle();
+
+ /* Set the IPI destination field in the ICR */
+ native_apic_mem_write(APIC_ICR2, __prepare_ICR2(dest_mask));
+ /* Send it with the proper destination mode */
+ native_apic_mem_write(APIC_ICR, __prepare_ICR(0, vector, dest_mode));
+}
+
+void default_send_IPI_single_phys(int cpu, int vector)
+{
+ unsigned long flags;
local_irq_save(flags);
- for_each_cpu(query_cpu, mask)
- __default_send_IPI_dest_field(
- apic->cpu_to_logical_apicid(query_cpu), vector,
- apic->dest_logical);
+ __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, cpu),
+ vector, APIC_DEST_PHYSICAL);
local_irq_restore(flags);
}
-void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
- int vector)
+void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector)
{
unsigned long flags;
- unsigned int query_cpu;
- unsigned int this_cpu = smp_processor_id();
-
- /* See Hack comment above */
+ unsigned long cpu;
local_irq_save(flags);
- for_each_cpu(query_cpu, mask) {
- if (query_cpu == this_cpu)
- continue;
- __default_send_IPI_dest_field(
- apic->cpu_to_logical_apicid(query_cpu), vector,
- apic->dest_logical);
- }
+ for_each_cpu(cpu, mask) {
+ __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid,
+ cpu), vector, APIC_DEST_PHYSICAL);
+ }
local_irq_restore(flags);
}
-#ifdef CONFIG_X86_32
-
-/*
- * This is only used on smaller machines.
- */
-void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector)
+void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
+ int vector)
{
- unsigned long mask = cpumask_bits(cpumask)[0];
+ unsigned int cpu, this_cpu = smp_processor_id();
unsigned long flags;
local_irq_save(flags);
- WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]);
- __default_send_IPI_dest_field(mask, vector, apic->dest_logical);
+ for_each_cpu(cpu, mask) {
+ if (cpu == this_cpu)
+ continue;
+ __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid,
+ cpu), vector, APIC_DEST_PHYSICAL);
+ }
local_irq_restore(flags);
}
-void default_send_IPI_allbutself(int vector)
+/*
+ * Helper function for APICs which insist on cpumasks
+ */
+void default_send_IPI_single(int cpu, int vector)
{
- /*
- * if there are no other CPUs in the system then we get an APIC send
- * error if we try to broadcast, thus avoid sending IPIs in this case.
- */
- if (!(num_online_cpus() > 1))
- return;
+ __apic_send_IPI_mask(cpumask_of(cpu), vector);
+}
- __default_local_send_IPI_allbutself(vector);
+void default_send_IPI_allbutself(int vector)
+{
+ __default_send_IPI_shortcut(APIC_DEST_ALLBUT, vector);
}
void default_send_IPI_all(int vector)
{
- __default_local_send_IPI_all(vector);
+ __default_send_IPI_shortcut(APIC_DEST_ALLINC, vector);
}
void default_send_IPI_self(int vector)
{
- __default_send_IPI_shortcut(APIC_DEST_SELF, vector, apic->dest_logical);
+ __default_send_IPI_shortcut(APIC_DEST_SELF, vector);
}
-/* must come after the send_IPI functions above for inlining */
-static int convert_apicid_to_cpu(int apic_id)
+#ifdef CONFIG_X86_32
+void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector)
{
- int i;
+ unsigned long flags;
+ unsigned int cpu;
- for_each_possible_cpu(i) {
- if (per_cpu(x86_cpu_to_apicid, i) == apic_id)
- return i;
- }
- return -1;
+ local_irq_save(flags);
+ for_each_cpu(cpu, mask)
+ __default_send_IPI_dest_field(1U << cpu, vector, APIC_DEST_LOGICAL);
+ local_irq_restore(flags);
}
-int safe_smp_processor_id(void)
+void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
+ int vector)
{
- int apicid, cpuid;
+ unsigned int cpu, this_cpu = smp_processor_id();
+ unsigned long flags;
- if (!boot_cpu_has(X86_FEATURE_APIC))
- return 0;
+ local_irq_save(flags);
+ for_each_cpu(cpu, mask) {
+ if (cpu == this_cpu)
+ continue;
+ __default_send_IPI_dest_field(1U << cpu, vector, APIC_DEST_LOGICAL);
+ }
+ local_irq_restore(flags);
+}
- apicid = hard_smp_processor_id();
- if (apicid == BAD_APICID)
- return 0;
+void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector)
+{
+ unsigned long mask = cpumask_bits(cpumask)[0];
+ unsigned long flags;
- cpuid = convert_apicid_to_cpu(apicid);
+ if (!mask)
+ return;
- return cpuid >= 0 ? cpuid : 0;
+ local_irq_save(flags);
+ WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]);
+ __default_send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL);
+ local_irq_restore(flags);
}
#endif
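
The rewritten ipi.c gates all broadcast IPIs behind the apic_use_ipi_shorthand static key, which apic_smt_update() only enables once every present CPU has been booted at least once. The subtlest consumer is native_send_call_func_ipi(); a worked example of its decision, assuming four online CPUs (0-3) with CPU 0 as the sender:

/*
 *   mask = {0,1,2,3}: mask | {0} == cpu_online_mask and CPU 0 is in mask
 *                     -> __apic_send_IPI_all(CALL_FUNCTION_VECTOR)
 *   mask = {1,2,3}:   mask | {0} == cpu_online_mask, CPU 0 not in mask
 *                     -> __apic_send_IPI_allbutself(CALL_FUNCTION_VECTOR)
 *   mask = {1,3}:     mask | {0} != cpu_online_mask
 *                     -> __apic_send_IPI_mask(mask, CALL_FUNCTION_VECTOR)
 */

If the static key is disabled (no_ipi_broadcast=1 on the command line, a single online CPU, or a present CPU that has never been booted), every case falls through to the mask-based send.
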
diff --git a/arch/x86/kernel/apic/local.h b/arch/x86/kernel/apic/local.h
new file mode 100644
index 000000000000..bdcf609eb283
--- /dev/null
+++ b/arch/x86/kernel/apic/local.h
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Historical copyright notices:
+ *
+ * Copyright 2004 James Cleverdon, IBM.
+ * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
+ * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
+ * (c) 2002,2003 Andi Kleen, SuSE Labs.
+ */
+
+#include <linux/jump_label.h>
+
+#include <asm/irq_vectors.h>
+#include <asm/apic.h>
+
+/* X2APIC */
+void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest);
+u32 x2apic_get_apic_id(u32 id);
+
+void x2apic_send_IPI_all(int vector);
+void x2apic_send_IPI_allbutself(int vector);
+void x2apic_send_IPI_self(int vector);
+extern u32 x2apic_max_apicid;
+
+/* IPI */
+
+DECLARE_STATIC_KEY_FALSE(apic_use_ipi_shorthand);
+
+static inline unsigned int __prepare_ICR(unsigned int shortcut, int vector,
+ unsigned int dest)
+{
+ unsigned int icr = shortcut | dest;
+
+ switch (vector) {
+ default:
+ icr |= APIC_DM_FIXED | vector;
+ break;
+ case NMI_VECTOR:
+ icr |= APIC_DM_NMI;
+ break;
+ }
+ return icr;
+}
+
+void default_init_apic_ldr(void);
+
+void apic_mem_wait_icr_idle(void);
+u32 apic_mem_wait_icr_idle_timeout(void);
+
+/*
+ * This is used to send an IPI with no shorthand notation (the destination is
+ * specified in bits 56 to 63 of the ICR).
+ */
+void __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest);
+
+void default_send_IPI_single(int cpu, int vector);
+void default_send_IPI_single_phys(int cpu, int vector);
+void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector);
+void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, int vector);
+void default_send_IPI_allbutself(int vector);
+void default_send_IPI_all(int vector);
+void default_send_IPI_self(int vector);
+
+#ifdef CONFIG_X86_32
+void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector);
+void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, int vector);
+void default_send_IPI_mask_logical(const struct cpumask *mask, int vector);
+#endif
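
__prepare_ICR() only builds the low 32 bits of the ICR command word; when a directed IPI needs a destination APIC ID, that is written separately to ICR2 by __default_send_IPI_dest_field() in ipi.c. Two worked compositions, following the switch above and its callers:

/*
 *   __prepare_ICR(APIC_DEST_ALLBUT, CALL_FUNCTION_VECTOR, 0)
 *       == APIC_DEST_ALLBUT | APIC_DM_FIXED | CALL_FUNCTION_VECTOR
 *       (shorthand broadcast, destination field unused)
 *
 *   __prepare_ICR(0, NMI_VECTOR, APIC_DEST_PHYSICAL)
 *       == APIC_DEST_PHYSICAL | APIC_DM_NMI
 *       (NMI delivery mode; the vector number is not encoded)
 */
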
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
new file mode 100644
index 000000000000..66bc5d3e79db
--- /dev/null
+++ b/arch/x86/kernel/apic/msi.c
@@ -0,0 +1,391 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Support of MSI, HPET and DMAR interrupts.
+ *
+ * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
+ * Moved from arch/x86/kernel/apic/io_apic.c.
+ * Jiang Liu <jiang.liu@linux.intel.com>
+ * Convert to hierarchical irqdomain
+ */
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/pci.h>
+#include <linux/dmar.h>
+#include <linux/hpet.h>
+#include <linux/msi.h>
+#include <asm/irqdomain.h>
+#include <asm/hpet.h>
+#include <asm/hw_irq.h>
+#include <asm/apic.h>
+#include <asm/irq_remapping.h>
+#include <asm/xen/hypervisor.h>
+
+struct irq_domain *x86_pci_msi_default_domain __ro_after_init;
+
+static void irq_msi_update_msg(struct irq_data *irqd, struct irq_cfg *cfg)
+{
+ struct msi_msg msg[2] = { [1] = { }, };
+
+ __irq_msi_compose_msg(cfg, msg, false);
+ irq_data_get_irq_chip(irqd)->irq_write_msi_msg(irqd, msg);
+}
+
+static int
+msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force)
+{
+ struct irq_cfg old_cfg, *cfg = irqd_cfg(irqd);
+ struct irq_data *parent = irqd->parent_data;
+ unsigned int cpu;
+ int ret;
+
+ /* Save the current configuration */
+ cpu = cpumask_first(irq_data_get_effective_affinity_mask(irqd));
+ old_cfg = *cfg;
+
+ /* Allocate a new target vector */
+ ret = parent->chip->irq_set_affinity(parent, mask, force);
+ if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
+ return ret;
+
+ /*
+ * For non-maskable and non-remapped MSI interrupts the migration
+ * to a different destination CPU and a different vector has to be
+ * done careful to handle the possible stray interrupt which can be
+ * caused by the non-atomic update of the address/data pair.
+ *
+ * Direct update is possible when:
+ * - The MSI is maskable (remapped MSI does not use this code path).
+ * The reservation mode bit is set in this case.
+ * - The new vector is the same as the old vector
+ * - The old vector is MANAGED_IRQ_SHUTDOWN_VECTOR (interrupt starts up)
+ * - The interrupt is not yet started up
+ * - The new destination CPU is the same as the old destination CPU
+ */
+ if (!irqd_can_reserve(irqd) ||
+ cfg->vector == old_cfg.vector ||
+ old_cfg.vector == MANAGED_IRQ_SHUTDOWN_VECTOR ||
+ !irqd_is_started(irqd) ||
+ cfg->dest_apicid == old_cfg.dest_apicid) {
+ irq_msi_update_msg(irqd, cfg);
+ return ret;
+ }
+
+ /*
+ * Paranoia: Validate that the interrupt target is the local
+ * CPU.
+ */
+ if (WARN_ON_ONCE(cpu != smp_processor_id())) {
+ irq_msi_update_msg(irqd, cfg);
+ return ret;
+ }
+
+ /*
+ * Redirect the interrupt to the new vector on the current CPU
+ * first. This might cause a spurious interrupt on this vector if
+ * the device raises an interrupt right between this update and the
+ * update to the final destination CPU.
+ *
+ * If the vector is in use then the installed device handler will
+ * denote it as spurious which is no harm as this is a rare event
+ * and interrupt handlers have to cope with spurious interrupts
+ * anyway. If the vector is unused, then it is marked so it won't
+ * trigger the 'No irq handler for vector' warning in
+ * common_interrupt().
+ *
+ * This requires to hold vector lock to prevent concurrent updates to
+ * the affected vector.
+ */
+ lock_vector_lock();
+
+ /*
+ * Mark the new target vector on the local CPU if it is currently
+ * unused. Reuse the VECTOR_RETRIGGERED state which is also used in
+ * the CPU hotplug path for a similar purpose. This cannot be
+ * undone here as the current CPU has interrupts disabled and
+ * cannot handle the interrupt before the whole set_affinity()
+ * section is done. In the CPU unplug case, the current CPU is
+ * about to vanish and will not handle any interrupts anymore. The
+ * vector is cleaned up when the CPU comes online again.
+ */
+ if (IS_ERR_OR_NULL(this_cpu_read(vector_irq[cfg->vector])))
+ this_cpu_write(vector_irq[cfg->vector], VECTOR_RETRIGGERED);
+
+ /* Redirect it to the new vector on the local CPU temporarily */
+ old_cfg.vector = cfg->vector;
+ irq_msi_update_msg(irqd, &old_cfg);
+
+ /* Now transition it to the target CPU */
+ irq_msi_update_msg(irqd, cfg);
+
+ /*
+ * All interrupts after this point are now targeted at the new
+ * vector/CPU.
+ *
+ * Drop vector lock before testing whether the temporary assignment
+ * to the local CPU was hit by an interrupt raised in the device,
+ * because the retrigger function acquires vector lock again.
+ */
+ unlock_vector_lock();
+
+ /*
+ * Check whether the transition raced with a device interrupt and
+ * is pending in the local APICs IRR. It is safe to do this outside
+ * of vector lock as the irq_desc::lock of this interrupt is still
+ * held and interrupts are disabled: The check is not accessing the
+ * underlying vector store. It's just checking the local APIC's
+ * IRR.
+ */
+ if (lapic_vector_set_in_irr(cfg->vector))
+ irq_data_get_irq_chip(irqd)->irq_retrigger(irqd);
+
+ return ret;
+}
+
+/**
+ * pci_dev_has_default_msi_parent_domain - Check whether the device has the default
+ * MSI parent domain associated
+ * @dev: Pointer to the PCI device
+ */
+bool pci_dev_has_default_msi_parent_domain(struct pci_dev *dev)
+{
+ struct irq_domain *domain = dev_get_msi_domain(&dev->dev);
+
+ if (!domain)
+ domain = dev_get_msi_domain(&dev->bus->dev);
+ if (!domain)
+ return false;
+
+ return domain == x86_vector_domain;
+}
+
+/**
+ * x86_msi_prepare - Setup of msi_alloc_info_t for allocations
+ * @domain: The domain for which this setup happens
+ * @dev: The device for which interrupts are allocated
+ * @nvec: The number of vectors to allocate
+ * @alloc: The allocation info structure to initialize
+ *
+ * This function is to be used for all types of MSI domains above the x86
+ * vector domain and any intermediates. It is always invoked from the
+ * top level interrupt domain. The domain specific allocation
+ * functionality is determined via the @domain's bus token which allows to
+ * map the X86 specific allocation type.
+ */
+static int x86_msi_prepare(struct irq_domain *domain, struct device *dev,
+ int nvec, msi_alloc_info_t *alloc)
+{
+ struct msi_domain_info *info = domain->host_data;
+
+ init_irq_alloc_info(alloc, NULL);
+
+ switch (info->bus_token) {
+ case DOMAIN_BUS_PCI_DEVICE_MSI:
+ alloc->type = X86_IRQ_ALLOC_TYPE_PCI_MSI;
+ return 0;
+ case DOMAIN_BUS_PCI_DEVICE_MSIX:
+ alloc->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX;
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
+
+/**
+ * x86_init_dev_msi_info - Domain info setup for MSI domains
+ * @dev: The device for which the domain should be created
+ * @domain: The (root) domain providing this callback
+ * @real_parent: The real parent domain of the to initialize domain
+ * @info: The domain info for the to initialize domain
+ *
+ * This function is to be used for all types of MSI domains above the x86
+ * vector domain and any intermediates. The domain specific functionality
+ * is determined via the @real_parent.
+ */
+static bool x86_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
+ struct irq_domain *real_parent, struct msi_domain_info *info)
+{
+ const struct msi_parent_ops *pops = real_parent->msi_parent_ops;
+
+ /* MSI parent domain specific settings */
+ switch (real_parent->bus_token) {
+ case DOMAIN_BUS_ANY:
+ /* Only the vector domain can have the ANY token */
+ if (WARN_ON_ONCE(domain != real_parent))
+ return false;
+ info->chip->irq_set_affinity = msi_set_affinity;
+ info->chip->flags |= IRQCHIP_MOVE_DEFERRED;
+ break;
+ case DOMAIN_BUS_DMAR:
+ case DOMAIN_BUS_AMDVI:
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return false;
+ }
+
+ /* Is the target supported? */
+ switch(info->bus_token) {
+ case DOMAIN_BUS_PCI_DEVICE_MSI:
+ case DOMAIN_BUS_PCI_DEVICE_MSIX:
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return false;
+ }
+
+ /*
+ * Mask out the domain specific MSI feature flags which are not
+ * supported by the real parent.
+ */
+ info->flags &= pops->supported_flags;
+ /* Enforce the required flags */
+ info->flags |= X86_VECTOR_MSI_FLAGS_REQUIRED;
+
+ /* This is always invoked from the top level MSI domain! */
+ info->ops->msi_prepare = x86_msi_prepare;
+
+ info->chip->irq_ack = irq_chip_ack_parent;
+ info->chip->irq_retrigger = irq_chip_retrigger_hierarchy;
+ info->chip->flags |= IRQCHIP_SKIP_SET_WAKE |
+ IRQCHIP_AFFINITY_PRE_STARTUP;
+
+ info->handler = handle_edge_irq;
+ info->handler_name = "edge";
+
+ return true;
+}
+
+static const struct msi_parent_ops x86_vector_msi_parent_ops = {
+ .supported_flags = X86_VECTOR_MSI_FLAGS_SUPPORTED,
+ .init_dev_msi_info = x86_init_dev_msi_info,
+};
+
+struct irq_domain * __init native_create_pci_msi_domain(void)
+{
+ if (apic_is_disabled)
+ return NULL;
+
+ x86_vector_domain->flags |= IRQ_DOMAIN_FLAG_MSI_PARENT;
+ x86_vector_domain->msi_parent_ops = &x86_vector_msi_parent_ops;
+ return x86_vector_domain;
+}
+
+void __init x86_create_pci_msi_domain(void)
+{
+ x86_pci_msi_default_domain = x86_init.irqs.create_pci_msi_domain();
+}
+
+/* Keep around for hyperV */
+int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec,
+ msi_alloc_info_t *arg)
+{
+ init_irq_alloc_info(arg, NULL);
+
+ if (to_pci_dev(dev)->msix_enabled)
+ arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX;
+ else
+ arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSI;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(pci_msi_prepare);
+
+#ifdef CONFIG_DMAR_TABLE
+/*
+ * The Intel IOMMU (ab)uses the high bits of the MSI address to contain the
+ * high bits of the destination APIC ID. This can't be done in the general
+ * case for MSIs as it would be targeting real memory above 4GiB not the
+ * APIC.
+ */
+static void dmar_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
+{
+ __irq_msi_compose_msg(irqd_cfg(data), msg, true);
+}
+
+static void dmar_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
+{
+ dmar_msi_write(data->irq, msg);
+}
+
+static struct irq_chip dmar_msi_controller = {
+ .name = "DMAR-MSI",
+ .irq_unmask = dmar_msi_unmask,
+ .irq_mask = dmar_msi_mask,
+ .irq_ack = irq_chip_ack_parent,
+ .irq_set_affinity = msi_domain_set_affinity,
+ .irq_retrigger = irq_chip_retrigger_hierarchy,
+ .irq_compose_msi_msg = dmar_msi_compose_msg,
+ .irq_write_msi_msg = dmar_msi_write_msg,
+ .flags = IRQCHIP_SKIP_SET_WAKE | IRQCHIP_MOVE_DEFERRED |
+ IRQCHIP_AFFINITY_PRE_STARTUP,
+};
+
+static int dmar_msi_init(struct irq_domain *domain,
+ struct msi_domain_info *info, unsigned int virq,
+ irq_hw_number_t hwirq, msi_alloc_info_t *arg)
+{
+ irq_domain_set_info(domain, virq, arg->devid, info->chip, NULL,
+ handle_edge_irq, arg->data, "edge");
+
+ return 0;
+}
+
+static struct msi_domain_ops dmar_msi_domain_ops = {
+ .msi_init = dmar_msi_init,
+};
+
+static struct msi_domain_info dmar_msi_domain_info = {
+ .ops = &dmar_msi_domain_ops,
+ .chip = &dmar_msi_controller,
+ .flags = MSI_FLAG_USE_DEF_DOM_OPS,
+};
+
+static struct irq_domain *dmar_get_irq_domain(void)
+{
+ static struct irq_domain *dmar_domain;
+ static DEFINE_MUTEX(dmar_lock);
+ struct fwnode_handle *fn;
+
+ mutex_lock(&dmar_lock);
+ if (dmar_domain)
+ goto out;
+
+ fn = irq_domain_alloc_named_fwnode("DMAR-MSI");
+ if (fn) {
+ dmar_domain = msi_create_irq_domain(fn, &dmar_msi_domain_info,
+ x86_vector_domain);
+ if (!dmar_domain)
+ irq_domain_free_fwnode(fn);
+ }
+out:
+ mutex_unlock(&dmar_lock);
+ return dmar_domain;
+}
+
+int dmar_alloc_hwirq(int id, int node, void *arg)
+{
+ struct irq_domain *domain = dmar_get_irq_domain();
+ struct irq_alloc_info info;
+
+ if (!domain)
+ return -1;
+
+ init_irq_alloc_info(&info, NULL);
+ info.type = X86_IRQ_ALLOC_TYPE_DMAR;
+ info.devid = id;
+ info.hwirq = id;
+ info.data = arg;
+
+ return irq_domain_alloc_irqs(domain, 1, node, &info);
+}
+
+void dmar_free_hwirq(int irq)
+{
+ irq_domain_free_irqs(irq, 1);
+}
+#endif
+
+bool arch_restore_msi_irqs(struct pci_dev *dev)
+{
+ return xen_initdom_restore_msi(dev);
+}
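
For non-remapped, non-maskable MSIs, msi_set_affinity() above cannot update the address/data pair atomically, so it stages the migration through the current CPU. A condensed restatement of that sequence, assuming both the vector and the destination CPU change while the interrupt is started; the step numbering is editorial:

/*
 *  1. The parent irq_set_affinity() allocates the new vector on the new CPU.
 *  2. Under vector_lock, if the new vector is unused on the current CPU,
 *     mark it VECTOR_RETRIGGERED so a stray hit is not flagged as unhandled.
 *  3. Write the MSI message targeting <current CPU, new vector>.
 *  4. Write the MSI message targeting <new CPU, new vector>.
 *  5. Drop vector_lock; if the new vector is already pending in the local
 *     APIC IRR, retrigger the interrupt so nothing is lost.
 */
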
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
deleted file mode 100644
index b3025b43b63a..000000000000
--- a/arch/x86/kernel/apic/nmi.c
+++ /dev/null
@@ -1,566 +0,0 @@
-/*
- * NMI watchdog support on APIC systems
- *
- * Started by Ingo Molnar <mingo@redhat.com>
- *
- * Fixes:
- * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog.
- * Mikael Pettersson : Power Management for local APIC NMI watchdog.
- * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog.
- * Pavel Machek and
- * Mikael Pettersson : PM converted to driver model. Disable/enable API.
- */
-
-#include <asm/apic.h>
-
-#include <linux/nmi.h>
-#include <linux/mm.h>
-#include <linux/delay.h>
-#include <linux/interrupt.h>
-#include <linux/module.h>
-#include <linux/sysdev.h>
-#include <linux/sysctl.h>
-#include <linux/percpu.h>
-#include <linux/kprobes.h>
-#include <linux/cpumask.h>
-#include <linux/kernel_stat.h>
-#include <linux/kdebug.h>
-#include <linux/smp.h>
-
-#include <asm/i8259.h>
-#include <asm/io_apic.h>
-#include <asm/proto.h>
-#include <asm/timer.h>
-
-#include <asm/mce.h>
-
-#include <asm/mach_traps.h>
-
-int unknown_nmi_panic;
-int nmi_watchdog_enabled;
-
-static cpumask_var_t backtrace_mask;
-
-/* nmi_active:
- * >0: the lapic NMI watchdog is active, but can be disabled
- * <0: the lapic NMI watchdog has not been set up, and cannot
- * be enabled
- * 0: the lapic NMI watchdog is disabled, but can be enabled
- */
-atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
-EXPORT_SYMBOL(nmi_active);
-
-unsigned int nmi_watchdog = NMI_NONE;
-EXPORT_SYMBOL(nmi_watchdog);
-
-static int panic_on_timeout;
-
-static unsigned int nmi_hz = HZ;
-static DEFINE_PER_CPU(short, wd_enabled);
-static int endflag __initdata;
-
-static inline unsigned int get_nmi_count(int cpu)
-{
- return per_cpu(irq_stat, cpu).__nmi_count;
-}
-
-static inline int mce_in_progress(void)
-{
-#if defined(CONFIG_X86_NEW_MCE)
- return atomic_read(&mce_entry) > 0;
-#endif
- return 0;
-}
-
-/*
- * Take the local apic timer and PIT/HPET into account. We don't
- * know which one is active, when we have highres/dyntick on
- */
-static inline unsigned int get_timer_irqs(int cpu)
-{
- return per_cpu(irq_stat, cpu).apic_timer_irqs +
- per_cpu(irq_stat, cpu).irq0_irqs;
-}
-
-#ifdef CONFIG_SMP
-/*
- * The performance counters used by NMI_LOCAL_APIC don't trigger when
- * the CPU is idle. To make sure the NMI watchdog really ticks on all
- * CPUs during the test make them busy.
- */
-static __init void nmi_cpu_busy(void *data)
-{
- local_irq_enable_in_hardirq();
- /*
- * Intentionally don't use cpu_relax here. This is
- * to make sure that the performance counter really ticks,
- * even if there is a simulator or similar that catches the
- * pause instruction. On a real HT machine this is fine because
- * all other CPUs are busy with "useless" delay loops and don't
- * care if they get somewhat less cycles.
- */
- while (endflag == 0)
- mb();
-}
-#endif
-
-static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count)
-{
- printk(KERN_CONT "\n");
-
- printk(KERN_WARNING
- "WARNING: CPU#%d: NMI appears to be stuck (%d->%d)!\n",
- cpu, prev_nmi_count[cpu], get_nmi_count(cpu));
-
- printk(KERN_WARNING
- "Please report this to bugzilla.kernel.org,\n");
- printk(KERN_WARNING
- "and attach the output of the 'dmesg' command.\n");
-
- per_cpu(wd_enabled, cpu) = 0;
- atomic_dec(&nmi_active);
-}
-
-static void __acpi_nmi_disable(void *__unused)
-{
- apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
-}
-
-int __init check_nmi_watchdog(void)
-{
- unsigned int *prev_nmi_count;
- int cpu;
-
- if (!nmi_watchdog_active() || !atomic_read(&nmi_active))
- return 0;
-
- prev_nmi_count = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL);
- if (!prev_nmi_count)
- goto error;
-
- alloc_cpumask_var(&backtrace_mask, GFP_KERNEL|__GFP_ZERO);
- printk(KERN_INFO "Testing NMI watchdog ... ");
-
-#ifdef CONFIG_SMP
- if (nmi_watchdog == NMI_LOCAL_APIC)
- smp_call_function(nmi_cpu_busy, (void *)&endflag, 0);
-#endif
-
- for_each_possible_cpu(cpu)
- prev_nmi_count[cpu] = get_nmi_count(cpu);
- local_irq_enable();
- mdelay((20 * 1000) / nmi_hz); /* wait 20 ticks */
-
- for_each_online_cpu(cpu) {
- if (!per_cpu(wd_enabled, cpu))
- continue;
- if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5)
- report_broken_nmi(cpu, prev_nmi_count);
- }
- endflag = 1;
- if (!atomic_read(&nmi_active)) {
- kfree(prev_nmi_count);
- atomic_set(&nmi_active, -1);
- goto error;
- }
- printk("OK.\n");
-
- /*
- * now that we know it works we can reduce NMI frequency to
- * something more reasonable; makes a difference in some configs
- */
- if (nmi_watchdog == NMI_LOCAL_APIC)
- nmi_hz = lapic_adjust_nmi_hz(1);
-
- kfree(prev_nmi_count);
- return 0;
-error:
- if (nmi_watchdog == NMI_IO_APIC) {
- if (!timer_through_8259)
- disable_8259A_irq(0);
- on_each_cpu(__acpi_nmi_disable, NULL, 1);
- }
-
-#ifdef CONFIG_X86_32
- timer_ack = 0;
-#endif
- return -1;
-}
-
-static int __init setup_nmi_watchdog(char *str)
-{
- unsigned int nmi;
-
- if (!strncmp(str, "panic", 5)) {
- panic_on_timeout = 1;
- str = strchr(str, ',');
- if (!str)
- return 1;
- ++str;
- }
-
- if (!strncmp(str, "lapic", 5))
- nmi_watchdog = NMI_LOCAL_APIC;
- else if (!strncmp(str, "ioapic", 6))
- nmi_watchdog = NMI_IO_APIC;
- else {
- get_option(&str, &nmi);
- if (nmi >= NMI_INVALID)
- return 0;
- nmi_watchdog = nmi;
- }
-
- return 1;
-}
-__setup("nmi_watchdog=", setup_nmi_watchdog);
-
-/*
- * Suspend/resume support
- */
-#ifdef CONFIG_PM
-
-static int nmi_pm_active; /* nmi_active before suspend */
-
-static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
-{
- /* only CPU0 goes here, other CPUs should be offline */
- nmi_pm_active = atomic_read(&nmi_active);
- stop_apic_nmi_watchdog(NULL);
- BUG_ON(atomic_read(&nmi_active) != 0);
- return 0;
-}
-
-static int lapic_nmi_resume(struct sys_device *dev)
-{
- /* only CPU0 goes here, other CPUs should be offline */
- if (nmi_pm_active > 0) {
- setup_apic_nmi_watchdog(NULL);
- touch_nmi_watchdog();
- }
- return 0;
-}
-
-static struct sysdev_class nmi_sysclass = {
- .name = "lapic_nmi",
- .resume = lapic_nmi_resume,
- .suspend = lapic_nmi_suspend,
-};
-
-static struct sys_device device_lapic_nmi = {
- .id = 0,
- .cls = &nmi_sysclass,
-};
-
-static int __init init_lapic_nmi_sysfs(void)
-{
- int error;
-
- /*
- * should really be a BUG_ON but b/c this is an
- * init call, it just doesn't work. -dcz
- */
- if (nmi_watchdog != NMI_LOCAL_APIC)
- return 0;
-
- if (atomic_read(&nmi_active) < 0)
- return 0;
-
- error = sysdev_class_register(&nmi_sysclass);
- if (!error)
- error = sysdev_register(&device_lapic_nmi);
- return error;
-}
-
-/* must come after the local APIC's device_initcall() */
-late_initcall(init_lapic_nmi_sysfs);
-
-#endif /* CONFIG_PM */
-
-static void __acpi_nmi_enable(void *__unused)
-{
- apic_write(APIC_LVT0, APIC_DM_NMI);
-}
-
-/*
- * Enable timer based NMIs on all CPUs:
- */
-void acpi_nmi_enable(void)
-{
- if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
- on_each_cpu(__acpi_nmi_enable, NULL, 1);
-}
-
-/*
- * Disable timer based NMIs on all CPUs:
- */
-void acpi_nmi_disable(void)
-{
- if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
- on_each_cpu(__acpi_nmi_disable, NULL, 1);
-}
-
-/*
- * This function is called as soon the LAPIC NMI watchdog driver has everything
- * in place and it's ready to check if the NMIs belong to the NMI watchdog
- */
-void cpu_nmi_set_wd_enabled(void)
-{
- __get_cpu_var(wd_enabled) = 1;
-}
-
-void setup_apic_nmi_watchdog(void *unused)
-{
- if (__get_cpu_var(wd_enabled))
- return;
-
- /* cheap hack to support suspend/resume */
- /* if cpu0 is not active neither should the other cpus */
- if (smp_processor_id() != 0 && atomic_read(&nmi_active) <= 0)
- return;
-
- switch (nmi_watchdog) {
- case NMI_LOCAL_APIC:
- if (lapic_watchdog_init(nmi_hz) < 0) {
- __get_cpu_var(wd_enabled) = 0;
- return;
- }
- /* FALL THROUGH */
- case NMI_IO_APIC:
- __get_cpu_var(wd_enabled) = 1;
- atomic_inc(&nmi_active);
- }
-}
-
-void stop_apic_nmi_watchdog(void *unused)
-{
- /* only support LOCAL and IO APICs for now */
- if (!nmi_watchdog_active())
- return;
- if (__get_cpu_var(wd_enabled) == 0)
- return;
- if (nmi_watchdog == NMI_LOCAL_APIC)
- lapic_watchdog_stop();
- else
- __acpi_nmi_disable(NULL);
- __get_cpu_var(wd_enabled) = 0;
- atomic_dec(&nmi_active);
-}
-
-/*
- * the best way to detect whether a CPU has a 'hard lockup' problem
- * is to check it's local APIC timer IRQ counts. If they are not
- * changing then that CPU has some problem.
- *
- * as these watchdog NMI IRQs are generated on every CPU, we only
- * have to check the current processor.
- *
- * since NMIs don't listen to _any_ locks, we have to be extremely
- * careful not to rely on unsafe variables. The printk might lock
- * up though, so we have to break up any console locks first ...
- * [when there will be more tty-related locks, break them up here too!]
- */
-
-static DEFINE_PER_CPU(unsigned, last_irq_sum);
-static DEFINE_PER_CPU(local_t, alert_counter);
-static DEFINE_PER_CPU(int, nmi_touch);
-
-void touch_nmi_watchdog(void)
-{
- if (nmi_watchdog_active()) {
- unsigned cpu;
-
- /*
- * Tell other CPUs to reset their alert counters. We cannot
- * do it ourselves because the alert count increase is not
- * atomic.
- */
- for_each_present_cpu(cpu) {
- if (per_cpu(nmi_touch, cpu) != 1)
- per_cpu(nmi_touch, cpu) = 1;
- }
- }
-
- /*
- * Tickle the softlockup detector too:
- */
- touch_softlockup_watchdog();
-}
-EXPORT_SYMBOL(touch_nmi_watchdog);
-
-notrace __kprobes int
-nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
-{
- /*
- * Since current_thread_info()-> is always on the stack, and we
- * always switch the stack NMI-atomically, it's safe to use
- * smp_processor_id().
- */
- unsigned int sum;
- int touched = 0;
- int cpu = smp_processor_id();
- int rc = 0;
-
- /* check for other users first */
- if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
- == NOTIFY_STOP) {
- rc = 1;
- touched = 1;
- }
-
- sum = get_timer_irqs(cpu);
-
- if (__get_cpu_var(nmi_touch)) {
- __get_cpu_var(nmi_touch) = 0;
- touched = 1;
- }
-
- /* We can be called before check_nmi_watchdog, hence NULL check. */
- if (backtrace_mask != NULL && cpumask_test_cpu(cpu, backtrace_mask)) {
- static DEFINE_SPINLOCK(lock); /* Serialise the printks */
-
- spin_lock(&lock);
- printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
- dump_stack();
- spin_unlock(&lock);
- cpumask_clear_cpu(cpu, backtrace_mask);
- }
-
- /* Could check oops_in_progress here too, but it's safer not to */
- if (mce_in_progress())
- touched = 1;
-
- /* if none of the timers is firing, this cpu isn't doing much */
- if (!touched && __get_cpu_var(last_irq_sum) == sum) {
- /*
- * Ayiee, looks like this CPU is stuck ...
- * wait a few IRQs (5 seconds) before doing the oops ...
- */
- local_inc(&__get_cpu_var(alert_counter));
- if (local_read(&__get_cpu_var(alert_counter)) == 5 * nmi_hz)
- /*
- * die_nmi will return ONLY if NOTIFY_STOP happens..
- */
- die_nmi("BUG: NMI Watchdog detected LOCKUP",
- regs, panic_on_timeout);
- } else {
- __get_cpu_var(last_irq_sum) = sum;
- local_set(&__get_cpu_var(alert_counter), 0);
- }
-
- /* see if the nmi watchdog went off */
- if (!__get_cpu_var(wd_enabled))
- return rc;
- switch (nmi_watchdog) {
- case NMI_LOCAL_APIC:
- rc |= lapic_wd_event(nmi_hz);
- break;
- case NMI_IO_APIC:
- /*
- * don't know how to accurately check for this.
- * just assume it was a watchdog timer interrupt
- * This matches the old behaviour.
- */
- rc = 1;
- break;
- }
- return rc;
-}
-
-#ifdef CONFIG_SYSCTL
-
-static void enable_ioapic_nmi_watchdog_single(void *unused)
-{
- __get_cpu_var(wd_enabled) = 1;
- atomic_inc(&nmi_active);
- __acpi_nmi_enable(NULL);
-}
-
-static void enable_ioapic_nmi_watchdog(void)
-{
- on_each_cpu(enable_ioapic_nmi_watchdog_single, NULL, 1);
- touch_nmi_watchdog();
-}
-
-static void disable_ioapic_nmi_watchdog(void)
-{
- on_each_cpu(stop_apic_nmi_watchdog, NULL, 1);
-}
-
-static int __init setup_unknown_nmi_panic(char *str)
-{
- unknown_nmi_panic = 1;
- return 1;
-}
-__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
-
-static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
-{
- unsigned char reason = get_nmi_reason();
- char buf[64];
-
- sprintf(buf, "NMI received for unknown reason %02x\n", reason);
- die_nmi(buf, regs, 1); /* Always panic here */
- return 0;
-}
-
-/*
- * proc handler for /proc/sys/kernel/nmi
- */
-int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
- void __user *buffer, size_t *length, loff_t *ppos)
-{
- int old_state;
-
- nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
- old_state = nmi_watchdog_enabled;
- proc_dointvec(table, write, file, buffer, length, ppos);
- if (!!old_state == !!nmi_watchdog_enabled)
- return 0;
-
- if (atomic_read(&nmi_active) < 0 || !nmi_watchdog_active()) {
- printk(KERN_WARNING
- "NMI watchdog is permanently disabled\n");
- return -EIO;
- }
-
- if (nmi_watchdog == NMI_LOCAL_APIC) {
- if (nmi_watchdog_enabled)
- enable_lapic_nmi_watchdog();
- else
- disable_lapic_nmi_watchdog();
- } else if (nmi_watchdog == NMI_IO_APIC) {
- if (nmi_watchdog_enabled)
- enable_ioapic_nmi_watchdog();
- else
- disable_ioapic_nmi_watchdog();
- } else {
- printk(KERN_WARNING
- "NMI watchdog doesn't know what hardware to touch\n");
- return -EIO;
- }
- return 0;
-}
-
-#endif /* CONFIG_SYSCTL */
-
-int do_nmi_callback(struct pt_regs *regs, int cpu)
-{
-#ifdef CONFIG_SYSCTL
- if (unknown_nmi_panic)
- return unknown_nmi_panic_callback(regs, cpu);
-#endif
- return 0;
-}
-
-void __trigger_all_cpu_backtrace(void)
-{
- int i;
-
- cpumask_copy(backtrace_mask, cpu_online_mask);
- /* Wait for up to 10 seconds for all CPUs to do the backtrace */
- for (i = 0; i < 10 * 1000; i++) {
- if (cpumask_empty(backtrace_mask))
- break;
- mdelay(1);
- }
-}
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
deleted file mode 100644
index ca96e68f0d23..000000000000
--- a/arch/x86/kernel/apic/numaq_32.c
+++ /dev/null
@@ -1,559 +0,0 @@
-/*
- * Written by: Patricia Gaughen, IBM Corporation
- *
- * Copyright (C) 2002, IBM Corp.
- * Copyright (C) 2009, Red Hat, Inc., Ingo Molnar
- *
- * All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to <gone@us.ibm.com>
- */
-#include <linux/nodemask.h>
-#include <linux/topology.h>
-#include <linux/bootmem.h>
-#include <linux/threads.h>
-#include <linux/cpumask.h>
-#include <linux/kernel.h>
-#include <linux/mmzone.h>
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/init.h>
-#include <linux/numa.h>
-#include <linux/smp.h>
-#include <linux/io.h>
-#include <linux/mm.h>
-
-#include <asm/processor.h>
-#include <asm/fixmap.h>
-#include <asm/mpspec.h>
-#include <asm/numaq.h>
-#include <asm/setup.h>
-#include <asm/apic.h>
-#include <asm/e820.h>
-#include <asm/ipi.h>
-
-#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
-
-int found_numaq;
-
-/*
- * Have to match translation table entries to main table entries by counter
- * hence the mpc_record variable .... can't see a less disgusting way of
- * doing this ....
- */
-struct mpc_trans {
- unsigned char mpc_type;
- unsigned char trans_len;
- unsigned char trans_type;
- unsigned char trans_quad;
- unsigned char trans_global;
- unsigned char trans_local;
- unsigned short trans_reserved;
-};
-
-/* x86_quirks member */
-static int mpc_record;
-
-static struct mpc_trans *translation_table[MAX_MPC_ENTRY];
-
-int mp_bus_id_to_node[MAX_MP_BUSSES];
-int mp_bus_id_to_local[MAX_MP_BUSSES];
-int quad_local_to_mp_bus_id[NR_CPUS/4][4];
-
-
-static inline void numaq_register_node(int node, struct sys_cfg_data *scd)
-{
- struct eachquadmem *eq = scd->eq + node;
-
- node_set_online(node);
-
- /* Convert to pages */
- node_start_pfn[node] =
- MB_TO_PAGES(eq->hi_shrd_mem_start - eq->priv_mem_size);
-
- node_end_pfn[node] =
- MB_TO_PAGES(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
-
- e820_register_active_regions(node, node_start_pfn[node],
- node_end_pfn[node]);
-
- memory_present(node, node_start_pfn[node], node_end_pfn[node]);
-
- node_remap_size[node] = node_memmap_size_bytes(node,
- node_start_pfn[node],
- node_end_pfn[node]);
-}
-
-/*
- * Function: smp_dump_qct()
- *
- * Description: gets memory layout from the quad config table. This
- * function also updates node_online_map with the nodes (quads) present.
- */
-static void __init smp_dump_qct(void)
-{
- struct sys_cfg_data *scd;
- int node;
-
- scd = (void *)__va(SYS_CFG_DATA_PRIV_ADDR);
-
- nodes_clear(node_online_map);
- for_each_node(node) {
- if (scd->quads_present31_0 & (1 << node))
- numaq_register_node(node, scd);
- }
-}
-
-void __cpuinit numaq_tsc_disable(void)
-{
- if (!found_numaq)
- return;
-
- if (num_online_nodes() > 1) {
- printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
- setup_clear_cpu_cap(X86_FEATURE_TSC);
- }
-}
-
-static int __init numaq_pre_time_init(void)
-{
- numaq_tsc_disable();
- return 0;
-}
-
-static inline int generate_logical_apicid(int quad, int phys_apicid)
-{
- return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
-}
-
-/* x86_quirks member */
-static int mpc_apic_id(struct mpc_cpu *m)
-{
- int quad = translation_table[mpc_record]->trans_quad;
- int logical_apicid = generate_logical_apicid(quad, m->apicid);
-
- printk(KERN_DEBUG
- "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
- m->apicid, (m->cpufeature & CPU_FAMILY_MASK) >> 8,
- (m->cpufeature & CPU_MODEL_MASK) >> 4,
- m->apicver, quad, logical_apicid);
-
- return logical_apicid;
-}
-
-/* x86_quirks member */
-static void mpc_oem_bus_info(struct mpc_bus *m, char *name)
-{
- int quad = translation_table[mpc_record]->trans_quad;
- int local = translation_table[mpc_record]->trans_local;
-
- mp_bus_id_to_node[m->busid] = quad;
- mp_bus_id_to_local[m->busid] = local;
-
- printk(KERN_INFO "Bus #%d is %s (node %d)\n", m->busid, name, quad);
-}
-
-/* x86_quirks member */
-static void mpc_oem_pci_bus(struct mpc_bus *m)
-{
- int quad = translation_table[mpc_record]->trans_quad;
- int local = translation_table[mpc_record]->trans_local;
-
- quad_local_to_mp_bus_id[quad][local] = m->busid;
-}
-
-static void __init MP_translation_info(struct mpc_trans *m)
-{
- printk(KERN_INFO
- "Translation: record %d, type %d, quad %d, global %d, local %d\n",
- mpc_record, m->trans_type, m->trans_quad, m->trans_global,
- m->trans_local);
-
- if (mpc_record >= MAX_MPC_ENTRY)
- printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
- else
- translation_table[mpc_record] = m; /* stash this for later */
-
- if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
- node_set_online(m->trans_quad);
-}
-
-static int __init mpf_checksum(unsigned char *mp, int len)
-{
- int sum = 0;
-
- while (len--)
- sum += *mp++;
-
- return sum & 0xFF;
-}
-
-/*
- * Read/parse the MPC oem tables
- */
-static void __init
- smp_read_mpc_oem(struct mpc_oemtable *oemtable, unsigned short oemsize)
-{
- int count = sizeof(*oemtable); /* the header size */
- unsigned char *oemptr = ((unsigned char *)oemtable) + count;
-
- mpc_record = 0;
- printk(KERN_INFO
- "Found an OEM MPC table at %8p - parsing it ... \n", oemtable);
-
- if (memcmp(oemtable->signature, MPC_OEM_SIGNATURE, 4)) {
- printk(KERN_WARNING
- "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
- oemtable->signature[0], oemtable->signature[1],
- oemtable->signature[2], oemtable->signature[3]);
- return;
- }
-
- if (mpf_checksum((unsigned char *)oemtable, oemtable->length)) {
- printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
- return;
- }
-
- while (count < oemtable->length) {
- switch (*oemptr) {
- case MP_TRANSLATION:
- {
- struct mpc_trans *m = (void *)oemptr;
-
- MP_translation_info(m);
- oemptr += sizeof(*m);
- count += sizeof(*m);
- ++mpc_record;
- break;
- }
- default:
- printk(KERN_WARNING
- "Unrecognised OEM table entry type! - %d\n",
- (int)*oemptr);
- return;
- }
- }
-}
-
-static int __init numaq_setup_ioapic_ids(void)
-{
- /* so can skip it */
- return 1;
-}
-
-static struct x86_quirks numaq_x86_quirks __initdata = {
- .arch_pre_time_init = numaq_pre_time_init,
- .arch_time_init = NULL,
- .arch_pre_intr_init = NULL,
- .arch_memory_setup = NULL,
- .arch_intr_init = NULL,
- .arch_trap_init = NULL,
- .mach_get_smp_config = NULL,
- .mach_find_smp_config = NULL,
- .mpc_record = &mpc_record,
- .mpc_apic_id = mpc_apic_id,
- .mpc_oem_bus_info = mpc_oem_bus_info,
- .mpc_oem_pci_bus = mpc_oem_pci_bus,
- .smp_read_mpc_oem = smp_read_mpc_oem,
- .setup_ioapic_ids = numaq_setup_ioapic_ids,
-};
-
-static __init void early_check_numaq(void)
-{
- /*
- * Find possible boot-time SMP configuration:
- */
- early_find_smp_config();
-
- /*
- * get boot-time SMP configuration:
- */
- if (smp_found_config)
- early_get_smp_config();
-
- if (found_numaq)
- x86_quirks = &numaq_x86_quirks;
-}
-
-int __init get_memcfg_numaq(void)
-{
- early_check_numaq();
- if (!found_numaq)
- return 0;
- smp_dump_qct();
-
- return 1;
-}
-
-#define NUMAQ_APIC_DFR_VALUE (APIC_DFR_CLUSTER)
-
-static inline unsigned int numaq_get_apic_id(unsigned long x)
-{
- return (x >> 24) & 0x0F;
-}
-
-static inline void numaq_send_IPI_mask(const struct cpumask *mask, int vector)
-{
- default_send_IPI_mask_sequence_logical(mask, vector);
-}
-
-static inline void numaq_send_IPI_allbutself(int vector)
-{
- default_send_IPI_mask_allbutself_logical(cpu_online_mask, vector);
-}
-
-static inline void numaq_send_IPI_all(int vector)
-{
- numaq_send_IPI_mask(cpu_online_mask, vector);
-}
-
-#define NUMAQ_TRAMPOLINE_PHYS_LOW (0x8)
-#define NUMAQ_TRAMPOLINE_PHYS_HIGH (0xa)
-
-/*
- * Because we use NMIs rather than the INIT-STARTUP sequence to
- * bootstrap the CPUs, the APIC may be in a weird state. Kick it:
- */
-static inline void numaq_smp_callin_clear_local_apic(void)
-{
- clear_local_APIC();
-}
-
-static inline const struct cpumask *numaq_target_cpus(void)
-{
- return cpu_all_mask;
-}
-
-static inline unsigned long
-numaq_check_apicid_used(physid_mask_t bitmap, int apicid)
-{
- return physid_isset(apicid, bitmap);
-}
-
-static inline unsigned long numaq_check_apicid_present(int bit)
-{
- return physid_isset(bit, phys_cpu_present_map);
-}
-
-static inline int numaq_apic_id_registered(void)
-{
- return 1;
-}
-
-static inline void numaq_init_apic_ldr(void)
-{
- /* Already done in NUMA-Q firmware */
-}
-
-static inline void numaq_setup_apic_routing(void)
-{
- printk(KERN_INFO
- "Enabling APIC mode: NUMA-Q. Using %d I/O APICs\n",
- nr_ioapics);
-}
-
-/*
- * Skip adding the timer int on secondary nodes, which causes
- * a small but painful rift in the time-space continuum.
- */
-static inline int numaq_multi_timer_check(int apic, int irq)
-{
- return apic != 0 && irq == 0;
-}
-
-static inline physid_mask_t numaq_ioapic_phys_id_map(physid_mask_t phys_map)
-{
- /* We don't have a good way to do this yet - hack */
- return physids_promote(0xFUL);
-}
-
-static inline int numaq_cpu_to_logical_apicid(int cpu)
-{
- if (cpu >= nr_cpu_ids)
- return BAD_APICID;
- return cpu_2_logical_apicid[cpu];
-}
-
-/*
- * Supporting over 60 cpus on NUMA-Q requires a locality-dependent
- * cpu to APIC ID relation to properly interact with the intelligent
- * mode of the cluster controller.
- */
-static inline int numaq_cpu_present_to_apicid(int mps_cpu)
-{
- if (mps_cpu < 60)
- return ((mps_cpu >> 2) << 4) | (1 << (mps_cpu & 0x3));
- else
- return BAD_APICID;
-}
-
-static inline int numaq_apicid_to_node(int logical_apicid)
-{
- return logical_apicid >> 4;
-}
-
-static inline physid_mask_t numaq_apicid_to_cpu_present(int logical_apicid)
-{
- int node = numaq_apicid_to_node(logical_apicid);
- int cpu = __ffs(logical_apicid & 0xf);
-
- return physid_mask_of_physid(cpu + 4*node);
-}
-
-/* Where the IO area was mapped on multiquad, always 0 otherwise */
-void *xquad_portio;
-
-static inline int numaq_check_phys_apicid_present(int boot_cpu_physical_apicid)
-{
- return 1;
-}
-
-/*
- * We use physical apicids here, not logical, so just return the default
- * physical broadcast to stop people from breaking us
- */
-static unsigned int numaq_cpu_mask_to_apicid(const struct cpumask *cpumask)
-{
- return 0x0F;
-}
-
-static inline unsigned int
-numaq_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
- const struct cpumask *andmask)
-{
- return 0x0F;
-}
-
-/* No NUMA-Q box has a HT CPU, but it can't hurt to use the default code. */
-static inline int numaq_phys_pkg_id(int cpuid_apic, int index_msb)
-{
- return cpuid_apic >> index_msb;
-}
-
-static int
-numaq_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
-{
- if (strncmp(oem, "IBM NUMA", 8))
- printk(KERN_ERR "Warning! Not a NUMA-Q system!\n");
- else
- found_numaq = 1;
-
- return found_numaq;
-}
-
-static int probe_numaq(void)
-{
- /* already know from get_memcfg_numaq() */
- return found_numaq;
-}
-
-static void numaq_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
- /* Careful. Some cpus do not strictly honor the set of cpus
- * specified in the interrupt destination when using lowest
- * priority interrupt delivery mode.
- *
- * In particular there was a hyperthreading cpu observed to
- * deliver interrupts to the wrong hyperthread when only one
- * hyperthread was specified in the interrupt destination.
- */
- cpumask_clear(retmask);
- cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
-}
-
-static void numaq_setup_portio_remap(void)
-{
- int num_quads = num_online_nodes();
-
- if (num_quads <= 1)
- return;
-
- printk(KERN_INFO
- "Remapping cross-quad port I/O for %d quads\n", num_quads);
-
- xquad_portio = ioremap(XQUAD_PORTIO_BASE, num_quads*XQUAD_PORTIO_QUAD);
-
- printk(KERN_INFO
- "xquad_portio vaddr 0x%08lx, len %08lx\n",
- (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD);
-}
-
-/* Use __refdata to silence a false-positive section mismatch warning. */
-struct apic __refdata apic_numaq = {
-
- .name = "NUMAQ",
- .probe = probe_numaq,
- .acpi_madt_oem_check = NULL,
- .apic_id_registered = numaq_apic_id_registered,
-
- .irq_delivery_mode = dest_LowestPrio,
- /* physical delivery on LOCAL quad: */
- .irq_dest_mode = 0,
-
- .target_cpus = numaq_target_cpus,
- .disable_esr = 1,
- .dest_logical = APIC_DEST_LOGICAL,
- .check_apicid_used = numaq_check_apicid_used,
- .check_apicid_present = numaq_check_apicid_present,
-
- .vector_allocation_domain = numaq_vector_allocation_domain,
- .init_apic_ldr = numaq_init_apic_ldr,
-
- .ioapic_phys_id_map = numaq_ioapic_phys_id_map,
- .setup_apic_routing = numaq_setup_apic_routing,
- .multi_timer_check = numaq_multi_timer_check,
- .apicid_to_node = numaq_apicid_to_node,
- .cpu_to_logical_apicid = numaq_cpu_to_logical_apicid,
- .cpu_present_to_apicid = numaq_cpu_present_to_apicid,
- .apicid_to_cpu_present = numaq_apicid_to_cpu_present,
- .setup_portio_remap = numaq_setup_portio_remap,
- .check_phys_apicid_present = numaq_check_phys_apicid_present,
- .enable_apic_mode = NULL,
- .phys_pkg_id = numaq_phys_pkg_id,
- .mps_oem_check = numaq_mps_oem_check,
-
- .get_apic_id = numaq_get_apic_id,
- .set_apic_id = NULL,
- .apic_id_mask = 0x0F << 24,
-
- .cpu_mask_to_apicid = numaq_cpu_mask_to_apicid,
- .cpu_mask_to_apicid_and = numaq_cpu_mask_to_apicid_and,
-
- .send_IPI_mask = numaq_send_IPI_mask,
- .send_IPI_mask_allbutself = NULL,
- .send_IPI_allbutself = numaq_send_IPI_allbutself,
- .send_IPI_all = numaq_send_IPI_all,
- .send_IPI_self = default_send_IPI_self,
-
- .wakeup_secondary_cpu = wakeup_secondary_cpu_via_nmi,
- .trampoline_phys_low = NUMAQ_TRAMPOLINE_PHYS_LOW,
- .trampoline_phys_high = NUMAQ_TRAMPOLINE_PHYS_HIGH,
-
- /* We don't do anything here because we use NMIs to boot instead */
- .wait_for_init_deassert = NULL,
-
- .smp_callin_clear_local_apic = numaq_smp_callin_clear_local_apic,
- .inquire_remote_apic = NULL,
-
- .read = native_apic_mem_read,
- .write = native_apic_mem_write,
- .icr_read = native_apic_icr_read,
- .icr_write = native_apic_icr_write,
- .wait_icr_idle = native_apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
-};
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 0c0182cc947d..87bc9e7ca5d6 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -1,79 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Default generic APIC driver. This handles up to 8 CPUs.
*
* Copyright 2003 Andi Kleen, SuSE Labs.
- * Subject to the GNU Public License, v.2
*
* Generic x86 APIC driver probe layer.
*/
-#include <linux/threads.h>
-#include <linux/cpumask.h>
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/ctype.h>
-#include <linux/init.h>
+#include <linux/export.h>
#include <linux/errno.h>
-#include <asm/fixmap.h>
-#include <asm/mpspec.h>
-#include <asm/apicdef.h>
-#include <asm/apic.h>
-#include <asm/setup.h>
-
#include <linux/smp.h>
-#include <asm/ipi.h>
-
-#include <linux/interrupt.h>
-#include <asm/acpi.h>
-#include <asm/e820.h>
-#ifdef CONFIG_HOTPLUG_CPU
-#define DEFAULT_SEND_IPI (1)
-#else
-#define DEFAULT_SEND_IPI (0)
-#endif
+#include <xen/xen.h>
-int no_broadcast = DEFAULT_SEND_IPI;
+#include <asm/io_apic.h>
+#include <asm/apic.h>
+#include <asm/acpi.h>
-static __init int no_ipi_broadcast(char *str)
-{
- get_option(&str, &no_broadcast);
- pr_info("Using %s mode\n",
- no_broadcast ? "No IPI Broadcast" : "IPI Broadcast");
- return 1;
-}
-__setup("no_ipi_broadcast=", no_ipi_broadcast);
+#include "local.h"
-static int __init print_ipi_mode(void)
+static u32 default_get_apic_id(u32 x)
{
- pr_info("Using IPI %s mode\n",
- no_broadcast ? "No-Shortcut" : "Shortcut");
- return 0;
-}
-late_initcall(print_ipi_mode);
+ unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
-void default_setup_apic_routing(void)
-{
-#ifdef CONFIG_X86_IO_APIC
- printk(KERN_INFO
- "Enabling APIC mode: Flat. Using %d I/O APICs\n",
- nr_ioapics);
-#endif
-}
-
-static void default_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
- /*
- * Careful. Some cpus do not strictly honor the set of cpus
- * specified in the interrupt destination when using lowest
- * priority interrupt delivery mode.
- *
- * In particular there was a hyperthreading cpu observed to
- * deliver interrupts to the wrong hyperthread when only one
- * hyperthread was specified in the interrupt destination.
- */
- cpumask_clear(retmask);
- cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
+ if (APIC_XAPIC(ver) || boot_cpu_has(X86_FEATURE_EXTD_APICID))
+ return (x >> 24) & 0xFF;
+ else
+ return (x >> 24) & 0x0F;
}
/* should be called last. */
@@ -82,106 +34,55 @@ static int probe_default(void)
return 1;
}
-struct apic apic_default = {
+static struct apic apic_default __ro_after_init = {
.name = "default",
.probe = probe_default,
- .acpi_madt_oem_check = NULL,
- .apic_id_registered = default_apic_id_registered,
- .irq_delivery_mode = dest_LowestPrio,
- /* logical delivery broadcast to all CPUs: */
- .irq_dest_mode = 1,
+ .dest_mode_logical = true,
- .target_cpus = default_target_cpus,
.disable_esr = 0,
- .dest_logical = APIC_DEST_LOGICAL,
- .check_apicid_used = default_check_apicid_used,
- .check_apicid_present = default_check_apicid_present,
- .vector_allocation_domain = default_vector_allocation_domain,
.init_apic_ldr = default_init_apic_ldr,
-
- .ioapic_phys_id_map = default_ioapic_phys_id_map,
- .setup_apic_routing = default_setup_apic_routing,
- .multi_timer_check = NULL,
- .apicid_to_node = default_apicid_to_node,
- .cpu_to_logical_apicid = default_cpu_to_logical_apicid,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
- .apicid_to_cpu_present = default_apicid_to_cpu_present,
- .setup_portio_remap = NULL,
- .check_phys_apicid_present = default_check_phys_apicid_present,
- .enable_apic_mode = NULL,
- .phys_pkg_id = default_phys_pkg_id,
- .mps_oem_check = NULL,
+ .max_apic_id = 0xFE,
.get_apic_id = default_get_apic_id,
- .set_apic_id = NULL,
- .apic_id_mask = 0x0F << 24,
- .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
- .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
+ .calc_dest_apicid = apic_flat_calc_apicid,
+ .send_IPI = default_send_IPI_single,
.send_IPI_mask = default_send_IPI_mask_logical,
.send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_logical,
.send_IPI_allbutself = default_send_IPI_allbutself,
.send_IPI_all = default_send_IPI_all,
.send_IPI_self = default_send_IPI_self,
- .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
- .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
-
- .wait_for_init_deassert = default_wait_for_init_deassert,
-
- .smp_callin_clear_local_apic = NULL,
- .inquire_remote_apic = default_inquire_remote_apic,
-
.read = native_apic_mem_read,
.write = native_apic_mem_write,
+ .eoi = native_apic_mem_eoi,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
- .wait_icr_idle = native_apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
+ .wait_icr_idle = apic_mem_wait_icr_idle,
+ .safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout,
};
-extern struct apic apic_numaq;
-extern struct apic apic_summit;
-extern struct apic apic_bigsmp;
-extern struct apic apic_es7000;
-extern struct apic apic_es7000_cluster;
+apic_driver(apic_default);
-struct apic *apic = &apic_default;
+struct apic *apic __ro_after_init = &apic_default;
EXPORT_SYMBOL_GPL(apic);
-static struct apic *apic_probe[] __initdata = {
-#ifdef CONFIG_X86_NUMAQ
- &apic_numaq,
-#endif
-#ifdef CONFIG_X86_SUMMIT
- &apic_summit,
-#endif
-#ifdef CONFIG_X86_BIGSMP
- &apic_bigsmp,
-#endif
-#ifdef CONFIG_X86_ES7000
- &apic_es7000,
- &apic_es7000_cluster,
-#endif
- &apic_default, /* must be last */
- NULL,
-};
-
static int cmdline_apic __initdata;
static int __init parse_apic(char *arg)
{
- int i;
+ struct apic **drv;
if (!arg)
return -EINVAL;
- for (i = 0; apic_probe[i]; i++) {
- if (!strcmp(apic_probe[i]->name, arg)) {
- apic = apic_probe[i];
+ for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+ if (!strcmp((*drv)->name, arg)) {
+ apic_install_driver(*drv);
cmdline_apic = 1;
return 0;
}
@@ -192,82 +93,19 @@ static int __init parse_apic(char *arg)
}
early_param("apic", parse_apic);
-void __init generic_bigsmp_probe(void)
-{
-#ifdef CONFIG_X86_BIGSMP
- /*
- * This routine is used to switch to bigsmp mode when
- * - There is no apic= option specified by the user
- * - generic_apic_probe() has chosen apic_default as the sub_arch
- * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support
- */
-
- if (!cmdline_apic && apic == &apic_default) {
- if (apic_bigsmp.probe()) {
- apic = &apic_bigsmp;
- printk(KERN_INFO "Overriding APIC driver with %s\n",
- apic->name);
- }
- }
-#endif
-}
-
-void __init generic_apic_probe(void)
+void __init x86_32_probe_apic(void)
{
if (!cmdline_apic) {
- int i;
- for (i = 0; apic_probe[i]; i++) {
- if (apic_probe[i]->probe()) {
- apic = apic_probe[i];
+ struct apic **drv;
+
+ for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+ if ((*drv)->probe()) {
+ apic_install_driver(*drv);
break;
}
}
/* Not visible without early console */
- if (!apic_probe[i])
+ if (drv == __apicdrivers_end)
panic("Didn't find an APIC driver");
}
- printk(KERN_INFO "Using APIC driver %s\n", apic->name);
-}
-
-/* These functions can switch the APIC even after the initial ->probe() */
-
-int __init
-generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
-{
- int i;
-
- for (i = 0; apic_probe[i]; ++i) {
- if (!apic_probe[i]->mps_oem_check)
- continue;
- if (!apic_probe[i]->mps_oem_check(mpc, oem, productid))
- continue;
-
- if (!cmdline_apic) {
- apic = apic_probe[i];
- printk(KERN_INFO "Switched to APIC driver `%s'.\n",
- apic->name);
- }
- return 1;
- }
- return 0;
-}
-
-int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
-{
- int i;
-
- for (i = 0; apic_probe[i]; ++i) {
- if (!apic_probe[i]->acpi_madt_oem_check)
- continue;
- if (!apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id))
- continue;
-
- if (!cmdline_apic) {
- apic = apic_probe[i];
- printk(KERN_INFO "Switched to APIC driver `%s'.\n",
- apic->name);
- }
- return 1;
- }
- return 0;
}
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index bc3e880f9b82..ecdf0c4121e1 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -1,6 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright 2004 James Cleverdon, IBM.
- * Subject to the GNU Public License, v.2
*
* Generic APIC sub-arch probe layer.
*
@@ -8,91 +8,33 @@
* Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
* James Cleverdon.
*/
-#include <linux/threads.h>
-#include <linux/cpumask.h>
-#include <linux/string.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/ctype.h>
-#include <linux/init.h>
-#include <linux/hardirq.h>
-#include <linux/dmar.h>
-
-#include <asm/smp.h>
+#include <linux/thread_info.h>
#include <asm/apic.h>
-#include <asm/ipi.h>
-#include <asm/setup.h>
-
-extern struct apic apic_flat;
-extern struct apic apic_physflat;
-extern struct apic apic_x2xpic_uv_x;
-extern struct apic apic_x2apic_phys;
-extern struct apic apic_x2apic_cluster;
-struct apic __read_mostly *apic = &apic_flat;
-EXPORT_SYMBOL_GPL(apic);
+#include "local.h"
-static struct apic *apic_probe[] __initdata = {
-#ifdef CONFIG_X86_UV
- &apic_x2apic_uv_x,
-#endif
-#ifdef CONFIG_X86_X2APIC
- &apic_x2apic_phys,
- &apic_x2apic_cluster,
-#endif
- &apic_physflat,
- NULL,
-};
-
-/*
- * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
- */
-void __init default_setup_apic_routing(void)
+/* Select the appropriate APIC driver */
+void __init x86_64_probe_apic(void)
{
-#ifdef CONFIG_X86_X2APIC
- if (x2apic_mode && (apic != &apic_x2apic_phys &&
-#ifdef CONFIG_X86_UV
- apic != &apic_x2apic_uv_x &&
-#endif
- apic != &apic_x2apic_cluster)) {
- if (x2apic_phys)
- apic = &apic_x2apic_phys;
- else
- apic = &apic_x2apic_cluster;
- printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
- }
-#endif
+ struct apic **drv;
- if (apic == &apic_flat) {
- if (max_physical_apicid >= 8)
- apic = &apic_physflat;
- printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
- }
+ enable_IR_x2apic();
- /*
- * Now that apic routing model is selected, configure the
- * fault handling for intr remapping.
- */
- if (intr_remapping_enabled)
- enable_drhd_fault_handling();
-}
-
-/* Same for both flat and physical. */
-
-void apic_send_IPI_self(int vector)
-{
- __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
+ for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+ if ((*drv)->probe && (*drv)->probe()) {
+ apic_install_driver(*drv);
+ break;
+ }
+ }
}
int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
- int i;
+ struct apic **drv;
- for (i = 0; apic_probe[i]; ++i) {
- if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) {
- apic = apic_probe[i];
- printk(KERN_INFO "Setting APIC routing to %s.\n",
- apic->name);
+ for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+ if ((*drv)->acpi_madt_oem_check(oem_id, oem_table_id)) {
+ apic_install_driver(*drv);
return 1;
}
}
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
deleted file mode 100644
index eafdfbd1ea95..000000000000
--- a/arch/x86/kernel/apic/summit_32.c
+++ /dev/null
@@ -1,568 +0,0 @@
-/*
- * IBM Summit-Specific Code
- *
- * Written By: Matthew Dobson, IBM Corporation
- *
- * Copyright (c) 2003 IBM Corp.
- *
- * All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to <colpatch@us.ibm.com>
- *
- */
-
-#include <linux/mm.h>
-#include <linux/init.h>
-#include <asm/io.h>
-#include <asm/bios_ebda.h>
-
-/*
- * APIC driver for the IBM "Summit" chipset.
- */
-#include <linux/threads.h>
-#include <linux/cpumask.h>
-#include <asm/mpspec.h>
-#include <asm/apic.h>
-#include <asm/smp.h>
-#include <asm/fixmap.h>
-#include <asm/apicdef.h>
-#include <asm/ipi.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/gfp.h>
-#include <linux/smp.h>
-
-static unsigned summit_get_apic_id(unsigned long x)
-{
- return (x >> 24) & 0xFF;
-}
-
-static inline void summit_send_IPI_mask(const struct cpumask *mask, int vector)
-{
- default_send_IPI_mask_sequence_logical(mask, vector);
-}
-
-static void summit_send_IPI_allbutself(int vector)
-{
- default_send_IPI_mask_allbutself_logical(cpu_online_mask, vector);
-}
-
-static void summit_send_IPI_all(int vector)
-{
- summit_send_IPI_mask(cpu_online_mask, vector);
-}
-
-#include <asm/tsc.h>
-
-extern int use_cyclone;
-
-#ifdef CONFIG_X86_SUMMIT_NUMA
-static void setup_summit(void);
-#else
-static inline void setup_summit(void) {}
-#endif
-
-static int summit_mps_oem_check(struct mpc_table *mpc, char *oem,
- char *productid)
-{
- if (!strncmp(oem, "IBM ENSW", 8) &&
- (!strncmp(productid, "VIGIL SMP", 9)
- || !strncmp(productid, "EXA", 3)
- || !strncmp(productid, "RUTHLESS SMP", 12))){
- mark_tsc_unstable("Summit based system");
- use_cyclone = 1; /*enable cyclone-timer*/
- setup_summit();
- return 1;
- }
- return 0;
-}
-
-/* Hook from generic ACPI tables.c */
-static int summit_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
-{
- if (!strncmp(oem_id, "IBM", 3) &&
- (!strncmp(oem_table_id, "SERVIGIL", 8)
- || !strncmp(oem_table_id, "EXA", 3))){
- mark_tsc_unstable("Summit based system");
- use_cyclone = 1; /*enable cyclone-timer*/
- setup_summit();
- return 1;
- }
- return 0;
-}
-
-struct rio_table_hdr {
- unsigned char version; /* Version number of this data structure */
- /* Version 3 adds chassis_num & WP_index */
- unsigned char num_scal_dev; /* # of Scalability devices (Twisters for Vigil) */
- unsigned char num_rio_dev; /* # of RIO I/O devices (Cyclones and Winnipegs) */
-} __attribute__((packed));
-
-struct scal_detail {
- unsigned char node_id; /* Scalability Node ID */
- unsigned long CBAR; /* Address of 1MB register space */
- unsigned char port0node; /* Node ID port connected to: 0xFF=None */
- unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */
- unsigned char port1node; /* Node ID port connected to: 0xFF = None */
- unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */
- unsigned char port2node; /* Node ID port connected to: 0xFF = None */
- unsigned char port2port; /* Port num port connected to: 0,1,2, or 0xFF=None */
- unsigned char chassis_num; /* 1 based Chassis number (1 = boot node) */
-} __attribute__((packed));
-
-struct rio_detail {
- unsigned char node_id; /* RIO Node ID */
- unsigned long BBAR; /* Address of 1MB register space */
- unsigned char type; /* Type of device */
- unsigned char owner_id; /* For WPEG: Node ID of Cyclone that owns this WPEG*/
- /* For CYC: Node ID of Twister that owns this CYC */
- unsigned char port0node; /* Node ID port connected to: 0xFF=None */
- unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */
- unsigned char port1node; /* Node ID port connected to: 0xFF=None */
- unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */
- unsigned char first_slot; /* For WPEG: Lowest slot number below this WPEG */
- /* For CYC: 0 */
- unsigned char status; /* For WPEG: Bit 0 = 1 : the XAPIC is used */
- /* = 0 : the XAPIC is not used, ie:*/
- /* ints fwded to another XAPIC */
- /* Bits1:7 Reserved */
- /* For CYC: Bits0:7 Reserved */
- unsigned char WP_index; /* For WPEG: WPEG instance index - lower ones have */
- /* lower slot numbers/PCI bus numbers */
- /* For CYC: No meaning */
- unsigned char chassis_num; /* 1 based Chassis number */
- /* For LookOut WPEGs this field indicates the */
- /* Expansion Chassis #, enumerated from Boot */
- /* Node WPEG external port, then Boot Node CYC */
- /* external port, then Next Vigil chassis WPEG */
- /* external port, etc. */
- /* Shared Lookouts have only 1 chassis number (the */
- /* first one assigned) */
-} __attribute__((packed));
-
-
-typedef enum {
- CompatTwister = 0, /* Compatibility Twister */
- AltTwister = 1, /* Alternate Twister of internal 8-way */
- CompatCyclone = 2, /* Compatibility Cyclone */
- AltCyclone = 3, /* Alternate Cyclone of internal 8-way */
- CompatWPEG = 4, /* Compatibility WPEG */
- AltWPEG = 5, /* Second Planar WPEG */
- LookOutAWPEG = 6, /* LookOut WPEG */
- LookOutBWPEG = 7, /* LookOut WPEG */
-} node_type;
-
-static inline int is_WPEG(struct rio_detail *rio){
- return (rio->type == CompatWPEG || rio->type == AltWPEG ||
- rio->type == LookOutAWPEG || rio->type == LookOutBWPEG);
-}
-
-#define SUMMIT_APIC_DFR_VALUE (APIC_DFR_CLUSTER)
-
-static const struct cpumask *summit_target_cpus(void)
-{
- /* CPU_MASK_ALL (0xff) has undefined behaviour with
- * dest_LowestPrio mode logical clustered apic interrupt routing
- * Just start on cpu 0. IRQ balancing will spread load
- */
- return cpumask_of(0);
-}
-
-static unsigned long summit_check_apicid_used(physid_mask_t bitmap, int apicid)
-{
- return 0;
-}
-
-/* we don't use the phys_cpu_present_map to indicate apicid presence */
-static unsigned long summit_check_apicid_present(int bit)
-{
- return 1;
-}
-
-static void summit_init_apic_ldr(void)
-{
- unsigned long val, id;
- int count = 0;
- u8 my_id = (u8)hard_smp_processor_id();
- u8 my_cluster = APIC_CLUSTER(my_id);
-#ifdef CONFIG_SMP
- u8 lid;
- int i;
-
- /* Create logical APIC IDs by counting CPUs already in cluster. */
- for (count = 0, i = nr_cpu_ids; --i >= 0; ) {
- lid = cpu_2_logical_apicid[i];
- if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster)
- ++count;
- }
-#endif
- /* We only have a 4 wide bitmap in cluster mode. If a deranged
- * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */
- BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT);
- id = my_cluster | (1UL << count);
- apic_write(APIC_DFR, SUMMIT_APIC_DFR_VALUE);
- val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
- val |= SET_APIC_LOGICAL_ID(id);
- apic_write(APIC_LDR, val);
-}
-
-static int summit_apic_id_registered(void)
-{
- return 1;
-}
-
-static void summit_setup_apic_routing(void)
-{
- printk("Enabling APIC mode: Summit. Using %d I/O APICs\n",
- nr_ioapics);
-}
-
-static int summit_apicid_to_node(int logical_apicid)
-{
-#ifdef CONFIG_SMP
- return apicid_2_node[hard_smp_processor_id()];
-#else
- return 0;
-#endif
-}
-
-/* Mapping from cpu number to logical apicid */
-static inline int summit_cpu_to_logical_apicid(int cpu)
-{
-#ifdef CONFIG_SMP
- if (cpu >= nr_cpu_ids)
- return BAD_APICID;
- return cpu_2_logical_apicid[cpu];
-#else
- return logical_smp_processor_id();
-#endif
-}
-
-static int summit_cpu_present_to_apicid(int mps_cpu)
-{
- if (mps_cpu < nr_cpu_ids)
- return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
- else
- return BAD_APICID;
-}
-
-static physid_mask_t summit_ioapic_phys_id_map(physid_mask_t phys_id_map)
-{
- /* For clustered we don't have a good way to do this yet - hack */
- return physids_promote(0x0F);
-}
-
-static physid_mask_t summit_apicid_to_cpu_present(int apicid)
-{
- return physid_mask_of_physid(0);
-}
-
-static int summit_check_phys_apicid_present(int boot_cpu_physical_apicid)
-{
- return 1;
-}
-
-static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask)
-{
- unsigned int round = 0;
- int cpu, apicid = 0;
-
- /*
- * The cpus in the mask must all be on the apic cluster.
- */
- for_each_cpu(cpu, cpumask) {
- int new_apicid = summit_cpu_to_logical_apicid(cpu);
-
- if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
- printk("%s: Not a valid mask!\n", __func__);
- return BAD_APICID;
- }
- apicid |= new_apicid;
- round++;
- }
- return apicid;
-}
-
-static unsigned int summit_cpu_mask_to_apicid_and(const struct cpumask *inmask,
- const struct cpumask *andmask)
-{
- int apicid = summit_cpu_to_logical_apicid(0);
- cpumask_var_t cpumask;
-
- if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
- return apicid;
-
- cpumask_and(cpumask, inmask, andmask);
- cpumask_and(cpumask, cpumask, cpu_online_mask);
- apicid = summit_cpu_mask_to_apicid(cpumask);
-
- free_cpumask_var(cpumask);
-
- return apicid;
-}
-
-/*
- * cpuid returns the value latched in the HW at reset, not the APIC ID
- * register's value. For any box whose BIOS changes APIC IDs, like
- * clustered APIC systems, we must use hard_smp_processor_id.
- *
- * See Intel's IA-32 SW Dev's Manual Vol2 under CPUID.
- */
-static int summit_phys_pkg_id(int cpuid_apic, int index_msb)
-{
- return hard_smp_processor_id() >> index_msb;
-}
-
-static int probe_summit(void)
-{
- /* probed later in mptable/ACPI hooks */
- return 0;
-}
-
-static void summit_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
- /* Careful. Some cpus do not strictly honor the set of cpus
- * specified in the interrupt destination when using lowest
- * priority interrupt delivery mode.
- *
- * In particular there was a hyperthreading cpu observed to
- * deliver interrupts to the wrong hyperthread when only one
- * hyperthread was specified in the interrupt destination.
- */
- cpumask_clear(retmask);
- cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
-}
-
-#ifdef CONFIG_X86_SUMMIT_NUMA
-static struct rio_table_hdr *rio_table_hdr;
-static struct scal_detail *scal_devs[MAX_NUMNODES];
-static struct rio_detail *rio_devs[MAX_NUMNODES*4];
-
-#ifndef CONFIG_X86_NUMAQ
-static int mp_bus_id_to_node[MAX_MP_BUSSES];
-#endif
-
-static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
-{
- int twister = 0, node = 0;
- int i, bus, num_buses;
-
- for (i = 0; i < rio_table_hdr->num_rio_dev; i++) {
- if (rio_devs[i]->node_id == rio_devs[wpeg_num]->owner_id) {
- twister = rio_devs[i]->owner_id;
- break;
- }
- }
- if (i == rio_table_hdr->num_rio_dev) {
- printk(KERN_ERR "%s: Couldn't find owner Cyclone for Winnipeg!\n", __func__);
- return last_bus;
- }
-
- for (i = 0; i < rio_table_hdr->num_scal_dev; i++) {
- if (scal_devs[i]->node_id == twister) {
- node = scal_devs[i]->node_id;
- break;
- }
- }
- if (i == rio_table_hdr->num_scal_dev) {
- printk(KERN_ERR "%s: Couldn't find owner Twister for Cyclone!\n", __func__);
- return last_bus;
- }
-
- switch (rio_devs[wpeg_num]->type) {
- case CompatWPEG:
- /*
- * The Compatibility Winnipeg controls the 2 legacy buses,
- * the 66MHz PCI bus [2 slots] and the 2 "extra" buses in case
- * a PCI-PCI bridge card is used in either slot: total 5 buses.
- */
- num_buses = 5;
- break;
- case AltWPEG:
- /*
- * The Alternate Winnipeg controls the 2 133MHz buses [1 slot
- * each], their 2 "extra" buses, the 100MHz bus [2 slots] and
- * the "extra" buses for each of those slots: total 7 buses.
- */
- num_buses = 7;
- break;
- case LookOutAWPEG:
- case LookOutBWPEG:
- /*
- * A Lookout Winnipeg controls 3 100MHz buses [2 slots each]
- * & the "extra" buses for each of those slots: total 9 buses.
- */
- num_buses = 9;
- break;
- default:
- printk(KERN_INFO "%s: Unsupported Winnipeg type!\n", __func__);
- return last_bus;
- }
-
- for (bus = last_bus; bus < last_bus + num_buses; bus++)
- mp_bus_id_to_node[bus] = node;
- return bus;
-}
-
-static int build_detail_arrays(void)
-{
- unsigned long ptr;
- int i, scal_detail_size, rio_detail_size;
-
- if (rio_table_hdr->num_scal_dev > MAX_NUMNODES) {
- printk(KERN_WARNING "%s: MAX_NUMNODES too low! Defined as %d, but system has %d nodes.\n", __func__, MAX_NUMNODES, rio_table_hdr->num_scal_dev);
- return 0;
- }
-
- switch (rio_table_hdr->version) {
- default:
- printk(KERN_WARNING "%s: Invalid Rio Grande Table Version: %d\n", __func__, rio_table_hdr->version);
- return 0;
- case 2:
- scal_detail_size = 11;
- rio_detail_size = 13;
- break;
- case 3:
- scal_detail_size = 12;
- rio_detail_size = 15;
- break;
- }
-
- ptr = (unsigned long)rio_table_hdr + 3;
- for (i = 0; i < rio_table_hdr->num_scal_dev; i++, ptr += scal_detail_size)
- scal_devs[i] = (struct scal_detail *)ptr;
-
- for (i = 0; i < rio_table_hdr->num_rio_dev; i++, ptr += rio_detail_size)
- rio_devs[i] = (struct rio_detail *)ptr;
-
- return 1;
-}
-
-void setup_summit(void)
-{
- unsigned long ptr;
- unsigned short offset;
- int i, next_wpeg, next_bus = 0;
-
- /* The pointer to the EBDA is stored in the word @ phys 0x40E(40:0E) */
- ptr = get_bios_ebda();
- ptr = (unsigned long)phys_to_virt(ptr);
-
- rio_table_hdr = NULL;
- offset = 0x180;
- while (offset) {
- /* The block id is stored in the 2nd word */
- if (*((unsigned short *)(ptr + offset + 2)) == 0x4752) {
- /* set the pointer past the offset & block id */
- rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4);
- break;
- }
- /* The next offset is stored in the 1st word. 0 means no more */
- offset = *((unsigned short *)(ptr + offset));
- }
- if (!rio_table_hdr) {
- printk(KERN_ERR "%s: Unable to locate Rio Grande Table in EBDA - bailing!\n", __func__);
- return;
- }
-
- if (!build_detail_arrays())
- return;
-
- /* The first Winnipeg we're looking for has an index of 0 */
- next_wpeg = 0;
- do {
- for (i = 0; i < rio_table_hdr->num_rio_dev; i++) {
- if (is_WPEG(rio_devs[i]) && rio_devs[i]->WP_index == next_wpeg) {
- /* It's the Winnipeg we're looking for! */
- next_bus = setup_pci_node_map_for_wpeg(i, next_bus);
- next_wpeg++;
- break;
- }
- }
- /*
- * If we go through all Rio devices and don't find one with
- * the next index, it means we've found all the Winnipegs,
- * and thus all the PCI buses.
- */
- if (i == rio_table_hdr->num_rio_dev)
- next_wpeg = 0;
- } while (next_wpeg != 0);
-}
-#endif
-
-struct apic apic_summit = {
-
- .name = "summit",
- .probe = probe_summit,
- .acpi_madt_oem_check = summit_acpi_madt_oem_check,
- .apic_id_registered = summit_apic_id_registered,
-
- .irq_delivery_mode = dest_LowestPrio,
- /* logical delivery broadcast to all CPUs: */
- .irq_dest_mode = 1,
-
- .target_cpus = summit_target_cpus,
- .disable_esr = 1,
- .dest_logical = APIC_DEST_LOGICAL,
- .check_apicid_used = summit_check_apicid_used,
- .check_apicid_present = summit_check_apicid_present,
-
- .vector_allocation_domain = summit_vector_allocation_domain,
- .init_apic_ldr = summit_init_apic_ldr,
-
- .ioapic_phys_id_map = summit_ioapic_phys_id_map,
- .setup_apic_routing = summit_setup_apic_routing,
- .multi_timer_check = NULL,
- .apicid_to_node = summit_apicid_to_node,
- .cpu_to_logical_apicid = summit_cpu_to_logical_apicid,
- .cpu_present_to_apicid = summit_cpu_present_to_apicid,
- .apicid_to_cpu_present = summit_apicid_to_cpu_present,
- .setup_portio_remap = NULL,
- .check_phys_apicid_present = summit_check_phys_apicid_present,
- .enable_apic_mode = NULL,
- .phys_pkg_id = summit_phys_pkg_id,
- .mps_oem_check = summit_mps_oem_check,
-
- .get_apic_id = summit_get_apic_id,
- .set_apic_id = NULL,
- .apic_id_mask = 0xFF << 24,
-
- .cpu_mask_to_apicid = summit_cpu_mask_to_apicid,
- .cpu_mask_to_apicid_and = summit_cpu_mask_to_apicid_and,
-
- .send_IPI_mask = summit_send_IPI_mask,
- .send_IPI_mask_allbutself = NULL,
- .send_IPI_allbutself = summit_send_IPI_allbutself,
- .send_IPI_all = summit_send_IPI_all,
- .send_IPI_self = default_send_IPI_self,
-
- .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
- .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
-
- .wait_for_init_deassert = default_wait_for_init_deassert,
-
- .smp_callin_clear_local_apic = NULL,
- .inquire_remote_apic = default_inquire_remote_apic,
-
- .read = native_apic_mem_read,
- .write = native_apic_mem_write,
- .icr_read = native_apic_icr_read,
- .icr_write = native_apic_icr_write,
- .wait_icr_idle = native_apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
-};
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
new file mode 100644
index 000000000000..bddc54465399
--- /dev/null
+++ b/arch/x86/kernel/apic/vector.c
@@ -0,0 +1,1387 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Local APIC related interfaces to support IOAPIC, MSI, etc.
+ *
+ * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
+ * Moved from arch/x86/kernel/apic/io_apic.c.
+ * Jiang Liu <jiang.liu@linux.intel.com>
+ * Enable support of hierarchical irqdomains
+ */
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/compiler.h>
+#include <linux/slab.h>
+#include <asm/irqdomain.h>
+#include <asm/hw_irq.h>
+#include <asm/traps.h>
+#include <asm/apic.h>
+#include <asm/i8259.h>
+#include <asm/desc.h>
+#include <asm/irq_remapping.h>
+
+#include <asm/trace/irq_vectors.h>
+
+struct apic_chip_data {
+ struct irq_cfg hw_irq_cfg;
+ unsigned int vector;
+ unsigned int prev_vector;
+ unsigned int cpu;
+ unsigned int prev_cpu;
+ unsigned int irq;
+ struct hlist_node clist;
+ unsigned int move_in_progress : 1,
+ is_managed : 1,
+ can_reserve : 1,
+ has_reserved : 1;
+};
+
+struct irq_domain *x86_vector_domain;
+EXPORT_SYMBOL_GPL(x86_vector_domain);
+static DEFINE_RAW_SPINLOCK(vector_lock);
+static cpumask_var_t vector_searchmask;
+static struct irq_chip lapic_controller;
+static struct irq_matrix *vector_matrix;
+#ifdef CONFIG_SMP
+
+static void vector_cleanup_callback(struct timer_list *tmr);
+
+struct vector_cleanup {
+ struct hlist_head head;
+ struct timer_list timer;
+};
+
+static DEFINE_PER_CPU(struct vector_cleanup, vector_cleanup) = {
+ .head = HLIST_HEAD_INIT,
+ .timer = __TIMER_INITIALIZER(vector_cleanup_callback, TIMER_PINNED),
+};
+#endif
+
+void lock_vector_lock(void)
+{
+ /* Used to ensure that the online set of cpus does not change
+ * during assign_irq_vector.
+ */
+ raw_spin_lock(&vector_lock);
+}
+
+void unlock_vector_lock(void)
+{
+ raw_spin_unlock(&vector_lock);
+}
+
+void init_irq_alloc_info(struct irq_alloc_info *info,
+ const struct cpumask *mask)
+{
+ memset(info, 0, sizeof(*info));
+ info->mask = mask;
+}
+
+void copy_irq_alloc_info(struct irq_alloc_info *dst, struct irq_alloc_info *src)
+{
+ if (src)
+ *dst = *src;
+ else
+ memset(dst, 0, sizeof(*dst));
+}
+
+static struct apic_chip_data *apic_chip_data(struct irq_data *irqd)
+{
+ if (!irqd)
+ return NULL;
+
+ while (irqd->parent_data)
+ irqd = irqd->parent_data;
+
+ return irqd->chip_data;
+}
+
+struct irq_cfg *irqd_cfg(struct irq_data *irqd)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+
+ return apicd ? &apicd->hw_irq_cfg : NULL;
+}
+EXPORT_SYMBOL_GPL(irqd_cfg);
+
+struct irq_cfg *irq_cfg(unsigned int irq)
+{
+ return irqd_cfg(irq_get_irq_data(irq));
+}
+
+static struct apic_chip_data *alloc_apic_chip_data(int node)
+{
+ struct apic_chip_data *apicd;
+
+ apicd = kzalloc_node(sizeof(*apicd), GFP_KERNEL, node);
+ if (apicd)
+ INIT_HLIST_NODE(&apicd->clist);
+ return apicd;
+}
+
+static void free_apic_chip_data(struct apic_chip_data *apicd)
+{
+ kfree(apicd);
+}
+
+static void apic_update_irq_cfg(struct irq_data *irqd, unsigned int vector,
+ unsigned int cpu)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+
+ lockdep_assert_held(&vector_lock);
+
+ apicd->hw_irq_cfg.vector = vector;
+ apicd->hw_irq_cfg.dest_apicid = apic->calc_dest_apicid(cpu);
+
+ apic_update_vector(cpu, vector, true);
+
+ irq_data_update_effective_affinity(irqd, cpumask_of(cpu));
+ trace_vector_config(irqd->irq, vector, cpu, apicd->hw_irq_cfg.dest_apicid);
+}
+
+static void apic_free_vector(unsigned int cpu, unsigned int vector, bool managed)
+{
+ apic_update_vector(cpu, vector, false);
+ irq_matrix_free(vector_matrix, cpu, vector, managed);
+}
+
+static void chip_data_update(struct irq_data *irqd, unsigned int newvec, unsigned int newcpu)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+ struct irq_desc *desc = irq_data_to_desc(irqd);
+ bool managed = irqd_affinity_is_managed(irqd);
+
+ lockdep_assert_held(&vector_lock);
+
+ trace_vector_update(irqd->irq, newvec, newcpu, apicd->vector,
+ apicd->cpu);
+
+ /*
+ * If there is no vector associated or if the associated vector is
+ * the shutdown vector, which is associated to make PCI/MSI
+ * shutdown mode work, then there is nothing to release. Clear out
+ * prev_vector for this and the offlined target case.
+ */
+ apicd->prev_vector = 0;
+ if (!apicd->vector || apicd->vector == MANAGED_IRQ_SHUTDOWN_VECTOR)
+ goto setnew;
+ /*
+ * If the target CPU of the previous vector is online, then mark
+ * the vector as move in progress and store it for cleanup when the
+ * first interrupt on the new vector arrives. If the target CPU is
+ * offline then the regular release mechanism via the cleanup
+ * vector is not possible and the vector can be immediately freed
+ * in the underlying matrix allocator.
+ */
+ if (cpu_online(apicd->cpu)) {
+ apicd->move_in_progress = true;
+ apicd->prev_vector = apicd->vector;
+ apicd->prev_cpu = apicd->cpu;
+ WARN_ON_ONCE(apicd->cpu == newcpu);
+ } else {
+ apic_free_vector(apicd->cpu, apicd->vector, managed);
+ }
+
+setnew:
+ apicd->vector = newvec;
+ apicd->cpu = newcpu;
+ BUG_ON(!IS_ERR_OR_NULL(per_cpu(vector_irq, newcpu)[newvec]));
+ per_cpu(vector_irq, newcpu)[newvec] = desc;
+ apic_update_irq_cfg(irqd, newvec, newcpu);
+}
+
+static void vector_assign_managed_shutdown(struct irq_data *irqd)
+{
+ unsigned int cpu = cpumask_first(cpu_online_mask);
+
+ apic_update_irq_cfg(irqd, MANAGED_IRQ_SHUTDOWN_VECTOR, cpu);
+}
+
+static int reserve_managed_vector(struct irq_data *irqd)
+{
+ const struct cpumask *affmsk = irq_data_get_affinity_mask(irqd);
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+ unsigned long flags;
+ int ret;
+
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ apicd->is_managed = true;
+ ret = irq_matrix_reserve_managed(vector_matrix, affmsk);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+ trace_vector_reserve_managed(irqd->irq, ret);
+ return ret;
+}
+
+static void reserve_irq_vector_locked(struct irq_data *irqd)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+
+ irq_matrix_reserve(vector_matrix);
+ apicd->can_reserve = true;
+ apicd->has_reserved = true;
+ irqd_set_can_reserve(irqd);
+ trace_vector_reserve(irqd->irq, 0);
+ vector_assign_managed_shutdown(irqd);
+}
+
+static int reserve_irq_vector(struct irq_data *irqd)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ reserve_irq_vector_locked(irqd);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+ return 0;
+}
+
+static int
+assign_vector_locked(struct irq_data *irqd, const struct cpumask *dest)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+ bool resvd = apicd->has_reserved;
+ unsigned int cpu = apicd->cpu;
+ int vector = apicd->vector;
+
+ lockdep_assert_held(&vector_lock);
+
+ /*
+ * If the current target CPU is online and in the new requested
+ * affinity mask, there is no point in moving the interrupt from
+ * one CPU to another.
+ */
+ if (vector && cpu_online(cpu) && cpumask_test_cpu(cpu, dest))
+ return 0;
+
+ /*
+ * Careful here. @apicd might either have move_in_progress set or
+ * be enqueued for cleanup. Assigning a new vector would either
+ * leave a stale vector on some CPU around or in case of a pending
+ * cleanup corrupt the hlist.
+ */
+ if (apicd->move_in_progress || !hlist_unhashed(&apicd->clist))
+ return -EBUSY;
+
+ vector = irq_matrix_alloc(vector_matrix, dest, resvd, &cpu);
+ trace_vector_alloc(irqd->irq, vector, resvd, vector);
+ if (vector < 0)
+ return vector;
+ chip_data_update(irqd, vector, cpu);
+
+ return 0;
+}
+
+static int assign_irq_vector(struct irq_data *irqd, const struct cpumask *dest)
+{
+ unsigned long flags;
+ int ret;
+
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ cpumask_and(vector_searchmask, dest, cpu_online_mask);
+ ret = assign_vector_locked(irqd, vector_searchmask);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+ return ret;
+}
+
+static int assign_irq_vector_any_locked(struct irq_data *irqd)
+{
+ /* Get the affinity mask - either irq_default_affinity or (user) set */
+ const struct cpumask *affmsk = irq_data_get_affinity_mask(irqd);
+ int node = irq_data_get_node(irqd);
+
+ if (node != NUMA_NO_NODE) {
+ /* Try the intersection of @affmsk and node mask */
+ cpumask_and(vector_searchmask, cpumask_of_node(node), affmsk);
+ if (!assign_vector_locked(irqd, vector_searchmask))
+ return 0;
+ }
+
+ /* Try the full affinity mask */
+ cpumask_and(vector_searchmask, affmsk, cpu_online_mask);
+ if (!assign_vector_locked(irqd, vector_searchmask))
+ return 0;
+
+ if (node != NUMA_NO_NODE) {
+ /* Try the node mask */
+ if (!assign_vector_locked(irqd, cpumask_of_node(node)))
+ return 0;
+ }
+
+ /* Try the full online mask */
+ return assign_vector_locked(irqd, cpu_online_mask);
+}
+
+static int
+assign_irq_vector_policy(struct irq_data *irqd, struct irq_alloc_info *info)
+{
+ if (irqd_affinity_is_managed(irqd))
+ return reserve_managed_vector(irqd);
+ if (info->mask)
+ return assign_irq_vector(irqd, info->mask);
+ /*
+ * Make only a global reservation with no guarantee. A real vector
+ * is associated at activation time.
+ */
+ return reserve_irq_vector(irqd);
+}
+
+static int
+assign_managed_vector(struct irq_data *irqd, const struct cpumask *dest)
+{
+ const struct cpumask *affmsk = irq_data_get_affinity_mask(irqd);
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+ int vector, cpu;
+
+ cpumask_and(vector_searchmask, dest, affmsk);
+
+ /* set_affinity may be called here even when nothing needs to change */
+ if (apicd->vector && cpumask_test_cpu(apicd->cpu, vector_searchmask))
+ return 0;
+ vector = irq_matrix_alloc_managed(vector_matrix, vector_searchmask,
+ &cpu);
+ trace_vector_alloc_managed(irqd->irq, vector, vector);
+ if (vector < 0)
+ return vector;
+ chip_data_update(irqd, vector, cpu);
+
+ return 0;
+}
+
+static void clear_irq_vector(struct irq_data *irqd)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+ bool managed = irqd_affinity_is_managed(irqd);
+ unsigned int vector = apicd->vector;
+
+ lockdep_assert_held(&vector_lock);
+
+ if (!vector)
+ return;
+
+ trace_vector_clear(irqd->irq, vector, apicd->cpu, apicd->prev_vector,
+ apicd->prev_cpu);
+
+ per_cpu(vector_irq, apicd->cpu)[vector] = VECTOR_SHUTDOWN;
+ apic_free_vector(apicd->cpu, vector, managed);
+ apicd->vector = 0;
+
+ /* Clean up move in progress */
+ vector = apicd->prev_vector;
+ if (!vector)
+ return;
+
+ per_cpu(vector_irq, apicd->prev_cpu)[vector] = VECTOR_SHUTDOWN;
+ apic_free_vector(apicd->prev_cpu, vector, managed);
+ apicd->prev_vector = 0;
+ apicd->move_in_progress = 0;
+ hlist_del_init(&apicd->clist);
+}
+
+static void x86_vector_deactivate(struct irq_domain *dom, struct irq_data *irqd)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+ unsigned long flags;
+
+ trace_vector_deactivate(irqd->irq, apicd->is_managed,
+ apicd->can_reserve, false);
+
+ /* Regular fixed assigned interrupt */
+ if (!apicd->is_managed && !apicd->can_reserve)
+ return;
+ /* If the interrupt has a global reservation, nothing to do */
+ if (apicd->has_reserved)
+ return;
+
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ clear_irq_vector(irqd);
+ if (apicd->can_reserve)
+ reserve_irq_vector_locked(irqd);
+ else
+ vector_assign_managed_shutdown(irqd);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+}
+
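+/* Replace a global reservation with a real vector at activation time */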
+static int activate_reserved(struct irq_data *irqd)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+ int ret;
+
+ ret = assign_irq_vector_any_locked(irqd);
+ if (!ret) {
+ apicd->has_reserved = false;
+ /*
+ * Core might have disabled reservation mode after
+ * allocating the irq descriptor. Ideally this should
+ * happen before allocation time, but that would require
+ * completely convoluted ways of transporting that
+ * information.
+ */
+ if (!irqd_can_reserve(irqd))
+ apicd->can_reserve = false;
+ }
+
+ /*
+ * Check to ensure that the effective affinity mask is a subset of
+ * the user supplied affinity mask, and warn the user if it is not.
+ */
+ if (!cpumask_subset(irq_data_get_effective_affinity_mask(irqd),
+ irq_data_get_affinity_mask(irqd))) {
+ pr_warn("irq %u: Affinity broken due to vector space exhaustion.\n",
+ irqd->irq);
+ }
+
+ return ret;
+}
+
+static int activate_managed(struct irq_data *irqd)
+{
+ const struct cpumask *dest = irq_data_get_affinity_mask(irqd);
+ int ret;
+
+ cpumask_and(vector_searchmask, dest, cpu_online_mask);
+ if (WARN_ON_ONCE(cpumask_empty(vector_searchmask))) {
+ /* Something in the core code broke! Survive gracefully */
+ pr_err("Managed startup for irq %u, but no CPU\n", irqd->irq);
+ return -EINVAL;
+ }
+
+ ret = assign_managed_vector(irqd, vector_searchmask);
+ /*
+ * This should not happen. The vector reservation got buggered. Handle
+ * it gracefully.
+ */
+ if (WARN_ON_ONCE(ret < 0)) {
+ pr_err("Managed startup irq %u, no vector available\n",
+ irqd->irq);
+ }
+ return ret;
+}
+
+static int x86_vector_activate(struct irq_domain *dom, struct irq_data *irqd,
+ bool reserve)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+ unsigned long flags;
+ int ret = 0;
+
+ trace_vector_activate(irqd->irq, apicd->is_managed,
+ apicd->can_reserve, reserve);
+
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ if (!apicd->can_reserve && !apicd->is_managed)
+ assign_irq_vector_any_locked(irqd);
+ else if (reserve || irqd_is_managed_and_shutdown(irqd))
+ vector_assign_managed_shutdown(irqd);
+ else if (apicd->is_managed)
+ ret = activate_managed(irqd);
+ else if (apicd->has_reserved)
+ ret = activate_reserved(irqd);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+ return ret;
+}
+
+static void vector_free_reserved_and_managed(struct irq_data *irqd)
+{
+ const struct cpumask *dest = irq_data_get_affinity_mask(irqd);
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+
+ trace_vector_teardown(irqd->irq, apicd->is_managed,
+ apicd->has_reserved);
+
+ if (apicd->has_reserved)
+ irq_matrix_remove_reserved(vector_matrix);
+ if (apicd->is_managed)
+ irq_matrix_remove_managed(vector_matrix, dest);
+}
+
+static void x86_vector_free_irqs(struct irq_domain *domain,
+ unsigned int virq, unsigned int nr_irqs)
+{
+ struct apic_chip_data *apicd;
+ struct irq_data *irqd;
+ unsigned long flags;
+ int i;
+
+ for (i = 0; i < nr_irqs; i++) {
+ irqd = irq_domain_get_irq_data(x86_vector_domain, virq + i);
+ if (irqd && irqd->chip_data) {
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ clear_irq_vector(irqd);
+ vector_free_reserved_and_managed(irqd);
+ apicd = irqd->chip_data;
+ irq_domain_reset_irq_data(irqd);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+ free_apic_chip_data(apicd);
+ }
+ }
+}
+
+static bool vector_configure_legacy(unsigned int virq, struct irq_data *irqd,
+ struct apic_chip_data *apicd)
+{
+ unsigned long flags;
+ bool realloc = false;
+
+ apicd->vector = ISA_IRQ_VECTOR(virq);
+ apicd->cpu = 0;
+
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ /*
+ * If the interrupt is activated, then it must stay at this vector
+ * position. That's usually the timer interrupt (0).
+ */
+ if (irqd_is_activated(irqd)) {
+ trace_vector_setup(virq, true, 0);
+ apic_update_irq_cfg(irqd, apicd->vector, apicd->cpu);
+ } else {
+ /* Release the vector */
+ apicd->can_reserve = true;
+ irqd_set_can_reserve(irqd);
+ clear_irq_vector(irqd);
+ realloc = true;
+ }
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+ return realloc;
+}
+
+static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+ struct irq_alloc_info *info = arg;
+ struct apic_chip_data *apicd;
+ struct irq_data *irqd;
+ int i, err, node;
+
+ if (apic_is_disabled)
+ return -ENXIO;
+
+ /*
+ * Catch any attempt to touch the cascade interrupt on a PIC
+ * equipped system.
+ */
+ if (WARN_ON_ONCE(info->flags & X86_IRQ_ALLOC_LEGACY &&
+ virq == PIC_CASCADE_IR))
+ return -EINVAL;
+
+ for (i = 0; i < nr_irqs; i++) {
+ irqd = irq_domain_get_irq_data(domain, virq + i);
+ BUG_ON(!irqd);
+ node = irq_data_get_node(irqd);
+ WARN_ON_ONCE(irqd->chip_data);
+ apicd = alloc_apic_chip_data(node);
+ if (!apicd) {
+ err = -ENOMEM;
+ goto error;
+ }
+
+ apicd->irq = virq + i;
+ irqd->chip = &lapic_controller;
+ irqd->chip_data = apicd;
+ irqd->hwirq = virq + i;
+ irqd_set_single_target(irqd);
+ /*
+ * Prevent any of these interrupts from being invoked in
+ * non-interrupt context via e.g. generic_handle_irq()
+ * as that can corrupt the affinity move state.
+ */
+ irqd_set_handle_enforce_irqctx(irqd);
+
+ /* Don't invoke affinity setter on deactivated interrupts */
+ irqd_set_affinity_on_activate(irqd);
+
+ /*
+ * Legacy vectors are already assigned when the IOAPIC
+ * takes them over. They stay on the same vector. This is
+ * required for check_timer() to work correctly as it might
+ * switch back to legacy mode. Only update the hardware
+ * config.
+ */
+ if (info->flags & X86_IRQ_ALLOC_LEGACY) {
+ if (!vector_configure_legacy(virq + i, irqd, apicd))
+ continue;
+ }
+
+ err = assign_irq_vector_policy(irqd, info);
+ trace_vector_setup(virq + i, false, err);
+ if (err) {
+ irqd->chip_data = NULL;
+ free_apic_chip_data(apicd);
+ goto error;
+ }
+ }
+
+ return 0;
+
+error:
+ x86_vector_free_irqs(domain, virq, i);
+ return err;
+}
+
+#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+static void x86_vector_debug_show(struct seq_file *m, struct irq_domain *d,
+ struct irq_data *irqd, int ind)
+{
+ struct apic_chip_data apicd;
+ unsigned long flags;
+ int irq;
+
+ if (!irqd) {
+ irq_matrix_debug_show(m, vector_matrix, ind);
+ return;
+ }
+
+ irq = irqd->irq;
+ if (irq < nr_legacy_irqs() && !test_bit(irq, &io_apic_irqs)) {
+ seq_printf(m, "%*sVector: %5d\n", ind, "", ISA_IRQ_VECTOR(irq));
+ seq_printf(m, "%*sTarget: Legacy PIC all CPUs\n", ind, "");
+ return;
+ }
+
+ if (!irqd->chip_data) {
+ seq_printf(m, "%*sVector: Not assigned\n", ind, "");
+ return;
+ }
+
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ memcpy(&apicd, irqd->chip_data, sizeof(apicd));
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+
+ seq_printf(m, "%*sVector: %5u\n", ind, "", apicd.vector);
+ seq_printf(m, "%*sTarget: %5u\n", ind, "", apicd.cpu);
+ if (apicd.prev_vector) {
+ seq_printf(m, "%*sPrevious vector: %5u\n", ind, "", apicd.prev_vector);
+ seq_printf(m, "%*sPrevious target: %5u\n", ind, "", apicd.prev_cpu);
+ }
+ seq_printf(m, "%*smove_in_progress: %u\n", ind, "", apicd.move_in_progress ? 1 : 0);
+ seq_printf(m, "%*sis_managed: %u\n", ind, "", apicd.is_managed ? 1 : 0);
+ seq_printf(m, "%*scan_reserve: %u\n", ind, "", apicd.can_reserve ? 1 : 0);
+ seq_printf(m, "%*shas_reserved: %u\n", ind, "", apicd.has_reserved ? 1 : 0);
+ seq_printf(m, "%*scleanup_pending: %u\n", ind, "", !hlist_unhashed(&apicd.clist));
+}
+#endif
+
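+/*
+ * Match a firmware spec against an IO-APIC: either an "IO-APIC-<id>"
+ * fwnode with a matching index or the CE4100 device tree binding.
+ */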
+int x86_fwspec_is_ioapic(struct irq_fwspec *fwspec)
+{
+ if (fwspec->param_count != 1)
+ return 0;
+
+ if (is_fwnode_irqchip(fwspec->fwnode)) {
+ const char *fwname = fwnode_get_name(fwspec->fwnode);
+ return fwname && !strncmp(fwname, "IO-APIC-", 8) &&
+ simple_strtol(fwname+8, NULL, 10) == fwspec->param[0];
+ }
+ return to_of_node(fwspec->fwnode) &&
+ of_device_is_compatible(to_of_node(fwspec->fwnode),
+ "intel,ce4100-ioapic");
+}
+
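+/* Match a firmware spec against a "HPET-MSI-<id>" fwnode with a matching index */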
+int x86_fwspec_is_hpet(struct irq_fwspec *fwspec)
+{
+ if (fwspec->param_count != 1)
+ return 0;
+
+ if (is_fwnode_irqchip(fwspec->fwnode)) {
+ const char *fwname = fwnode_get_name(fwspec->fwnode);
+ return fwname && !strncmp(fwname, "HPET-MSI-", 9) &&
+ simple_strtol(fwname+9, NULL, 10) == fwspec->param[0];
+ }
+ return 0;
+}
+
+static int x86_vector_select(struct irq_domain *d, struct irq_fwspec *fwspec,
+ enum irq_domain_bus_token bus_token)
+{
+ /*
+ * HPET and I/OAPIC cannot be parented in the vector domain
+ * if IRQ remapping is enabled. APIC IDs above 15 bits are
+ * only permitted if IRQ remapping is enabled, so check that.
+ */
+ if (apic_id_valid(32768))
+ return 0;
+
+ return x86_fwspec_is_ioapic(fwspec) || x86_fwspec_is_hpet(fwspec);
+}
+
+static const struct irq_domain_ops x86_vector_domain_ops = {
+ .select = x86_vector_select,
+ .alloc = x86_vector_alloc_irqs,
+ .free = x86_vector_free_irqs,
+ .activate = x86_vector_activate,
+ .deactivate = x86_vector_deactivate,
+#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+ .debug_show = x86_vector_debug_show,
+#endif
+};
+
+int __init arch_probe_nr_irqs(void)
+{
+ int nr;
+
+ if (irq_get_nr_irqs() > NR_VECTORS * nr_cpu_ids)
+ irq_set_nr_irqs(NR_VECTORS * nr_cpu_ids);
+
+ nr = (gsi_top + nr_legacy_irqs()) + 8 * nr_cpu_ids;
+#if defined(CONFIG_PCI_MSI)
+ /*
+ * for MSI and HT dyn irq
+ */
+ if (gsi_top <= NR_IRQS_LEGACY)
+ nr += 8 * nr_cpu_ids;
+ else
+ nr += gsi_top * 16;
+#endif
+ if (nr < irq_get_nr_irqs())
+ irq_set_nr_irqs(nr);
+
+ /*
+ * We don't know whether the PIC is present at this point, so we need to
+ * probe() to get the right number of legacy IRQs.
+ */
+ return legacy_pic->probe();
+}
+
+void lapic_assign_legacy_vector(unsigned int irq, bool replace)
+{
+ /*
+ * Assign this as a system vector so it won't get accounted as allocated
+ * and movable in the cpu hotplug check, and so that the managed irq
+ * reservation cannot touch it.
+ */
+ irq_matrix_assign_system(vector_matrix, ISA_IRQ_VECTOR(irq), replace);
+}
+
+void __init lapic_update_legacy_vectors(void)
+{
+ unsigned int i;
+
+ if (IS_ENABLED(CONFIG_X86_IO_APIC) && nr_ioapics > 0)
+ return;
+
+ /*
+ * If the IO/APIC is disabled via config, kernel command line or
+ * lack of enumeration then all legacy interrupts are routed
+ * through the PIC. Make sure that they are marked as legacy
+ * vectors. PIC_CASCADE_IR has already been marked in
+ * lapic_assign_system_vectors().
+ */
+ for (i = 0; i < nr_legacy_irqs(); i++) {
+ if (i != PIC_CASCADE_IR)
+ lapic_assign_legacy_vector(i, true);
+ }
+}
+
+void __init lapic_assign_system_vectors(void)
+{
+ unsigned int i, vector;
+
+ for_each_set_bit(vector, system_vectors, NR_VECTORS)
+ irq_matrix_assign_system(vector_matrix, vector, false);
+
+ if (nr_legacy_irqs() > 1)
+ lapic_assign_legacy_vector(PIC_CASCADE_IR, false);
+
+ /* System vectors are reserved, so bring the vector matrix online */
+ irq_matrix_online(vector_matrix);
+
+ /* Mark the preallocated legacy interrupts */
+ for (i = 0; i < nr_legacy_irqs(); i++) {
+ /*
+ * Don't touch the cascade interrupt. It's unusable
+ * on PIC equipped machines. See the large comment
+ * in the IO/APIC code.
+ */
+ if (i != PIC_CASCADE_IR)
+ irq_matrix_assign(vector_matrix, ISA_IRQ_VECTOR(i));
+ }
+}
+
+int __init arch_early_irq_init(void)
+{
+ struct fwnode_handle *fn;
+
+ fn = irq_domain_alloc_named_fwnode("VECTOR");
+ BUG_ON(!fn);
+ x86_vector_domain = irq_domain_create_tree(fn, &x86_vector_domain_ops,
+ NULL);
+ BUG_ON(x86_vector_domain == NULL);
+ irq_set_default_domain(x86_vector_domain);
+
+ BUG_ON(!alloc_cpumask_var(&vector_searchmask, GFP_KERNEL));
+
+ /*
+ * Allocate the vector matrix allocator data structure and limit the
+ * search area.
+ */
+ vector_matrix = irq_alloc_matrix(NR_VECTORS, FIRST_EXTERNAL_VECTOR,
+ FIRST_SYSTEM_VECTOR);
+ BUG_ON(!vector_matrix);
+
+ return arch_early_ioapic_init();
+}
+
+#ifdef CONFIG_SMP
+
+static struct irq_desc *__setup_vector_irq(int vector)
+{
+ int isairq = vector - ISA_IRQ_VECTOR(0);
+
+ /* Check whether the irq is in the legacy space */
+ if (isairq < 0 || isairq >= nr_legacy_irqs())
+ return VECTOR_UNUSED;
+ /* Check whether the irq is handled by the IOAPIC */
+ if (test_bit(isairq, &io_apic_irqs))
+ return VECTOR_UNUSED;
+ return irq_to_desc(isairq);
+}
+
+/* Online the local APIC infrastructure and initialize the vectors */
+void lapic_online(void)
+{
+ unsigned int vector;
+
+ lockdep_assert_held(&vector_lock);
+
+ /* Online the vector matrix array for this CPU */
+ irq_matrix_online(vector_matrix);
+
+ /*
+ * The interrupt affinity logic never targets interrupts to offline
+ * CPUs. The exceptions are the legacy PIC interrupts. In general
+ * they are only targeted to CPU0, but depending on the platform
+ * they can be distributed to any online CPU in hardware. The
+ * kernel has no influence on that. So all active legacy vectors
+ * must be installed on all CPUs. All non legacy interrupts can be
+ * cleared.
+ */
+ for (vector = 0; vector < NR_VECTORS; vector++)
+ this_cpu_write(vector_irq[vector], __setup_vector_irq(vector));
+}
+
+static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr);
+
+void lapic_offline(void)
+{
+ struct vector_cleanup *cl = this_cpu_ptr(&vector_cleanup);
+
+ lock_vector_lock();
+
+ /* In case the vector cleanup timer has not expired */
+ __vector_cleanup(cl, false);
+
+ irq_matrix_offline(vector_matrix);
+ WARN_ON_ONCE(timer_delete_sync_try(&cl->timer) < 0);
+ WARN_ON_ONCE(!hlist_empty(&cl->head));
+
+ unlock_vector_lock();
+}
+
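+/*
+ * irq_chip callback to move an already activated interrupt. Managed
+ * interrupts are assigned from their per-CPU reservation, everything else
+ * gets a regular vector from the online subset of @dest.
+ */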
+static int apic_set_affinity(struct irq_data *irqd,
+ const struct cpumask *dest, bool force)
+{
+ int err;
+
+ if (WARN_ON_ONCE(!irqd_is_activated(irqd)))
+ return -EIO;
+
+ raw_spin_lock(&vector_lock);
+ cpumask_and(vector_searchmask, dest, cpu_online_mask);
+ if (irqd_affinity_is_managed(irqd))
+ err = assign_managed_vector(irqd, vector_searchmask);
+ else
+ err = assign_vector_locked(irqd, vector_searchmask);
+ raw_spin_unlock(&vector_lock);
+ return err ? err : IRQ_SET_MASK_OK;
+}
+
+static void free_moved_vector(struct apic_chip_data *apicd)
+{
+ unsigned int vector = apicd->prev_vector;
+ unsigned int cpu = apicd->prev_cpu;
+ bool managed = apicd->is_managed;
+
+ /*
+ * Managed interrupts are usually not migrated away
+ * from an online CPU, but CPU isolation 'managed_irq'
+ * can make that happen.
+ * 1) Activation does not take the isolation into account
+ * to keep the code simple
+ * 2) Migration away from an isolated CPU can happen when
+ * a non-isolated CPU which is in the calculated
+ * affinity mask comes online.
+ */
+ trace_vector_free_moved(apicd->irq, cpu, vector, managed);
+ apic_free_vector(cpu, vector, managed);
+ per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
+ hlist_del_init(&apicd->clist);
+ apicd->prev_vector = 0;
+ apicd->move_in_progress = 0;
+}
+
+/*
+ * Called from fixup_irqs() with @desc->lock held and interrupts disabled.
+ */
+static void apic_force_complete_move(struct irq_data *irqd)
+{
+ unsigned int cpu = smp_processor_id();
+ struct apic_chip_data *apicd;
+ unsigned int vector;
+
+ guard(raw_spinlock)(&vector_lock);
+ apicd = apic_chip_data(irqd);
+ if (!apicd)
+ return;
+
+ /*
+ * If prev_vector is empty or the descriptor is neither currently
+ * nor previously on the outgoing CPU, no action is required.
+ */
+ vector = apicd->prev_vector;
+ if (!vector || (apicd->cpu != cpu && apicd->prev_cpu != cpu))
+ return;
+
+ /*
+ * This is tricky. If the cleanup of the old vector has not been
+ * done yet, then the following setaffinity call will fail with
+ * -EBUSY. This can leave the interrupt in a stale state.
+ *
+ * All CPUs are stuck in stop machine with interrupts disabled so
+ * calling __irq_complete_move() would be completely pointless.
+ *
+ * 1) The interrupt is in move_in_progress state. That means that we
+ * have not seen an interrupt since the io_apic was reprogrammed to
+ * the new vector.
+ *
+ * 2) The interrupt has fired on the new vector, but the cleanup IPIs
+ * have not been processed yet.
+ */
+ if (apicd->move_in_progress) {
+ /*
+ * In theory there is a race:
+ *
+ * set_ioapic(new_vector) <-- Interrupt is raised before update
+ * is effective, i.e. it's raised on
+ * the old vector.
+ *
+ * So if the target cpu cannot handle that interrupt before
+ * the old vector is cleaned up, we get a spurious interrupt
+ * and in the worst case the ioapic irq line becomes stale.
+ *
+ * But in case of cpu hotplug this should be a non-issue
+ * because if the affinity update happens right before all
+ * cpus rendezvous in stop machine, there is no way that the
+ * interrupt can be blocked on the target cpu because all cpus
+ * loop first with interrupts enabled in stop machine, so the
+ * old vector is not yet cleaned up when the interrupt fires.
+ *
+ * So the only way to run into this issue is if the delivery
+ * of the interrupt on the apic/system bus would be delayed
+ * beyond the point where the target cpu disables interrupts
+ * in stop machine. I doubt that it can happen, but at least
+ * there is a theoretical chance. Virtualization might be
+ * able to expose this, but AFAICT the IOAPIC emulation is not
+ * as stupid as the real hardware.
+ *
+ * Anyway, there is nothing we can do about that at this point
+ * w/o refactoring the whole fixup_irq() business completely.
+ * We print at least the irq number and the old vector number,
+ * so we have the necessary information when a problem in that
+ * area arises.
+ */
+ pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n",
+ irqd->irq, vector);
+ }
+ free_moved_vector(apicd);
+}
+
+#else
+# define apic_set_affinity NULL
+# define apic_force_complete_move NULL
+#endif
+
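+/* Retrigger the interrupt by sending an IPI to its current target CPU and vector */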
+static int apic_retrigger_irq(struct irq_data *irqd)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ __apic_send_IPI(apicd->cpu, apicd->vector);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+
+ return 1;
+}
+
+void apic_ack_irq(struct irq_data *irqd)
+{
+ irq_move_irq(irqd);
+ apic_eoi();
+}
+
+void apic_ack_edge(struct irq_data *irqd)
+{
+ irq_complete_move(irqd_cfg(irqd));
+ apic_ack_irq(irqd);
+}
+
+static void x86_vector_msi_compose_msg(struct irq_data *data,
+ struct msi_msg *msg)
+{
+ __irq_msi_compose_msg(irqd_cfg(data), msg, false);
+}
+
+static struct irq_chip lapic_controller = {
+ .name = "APIC",
+ .irq_ack = apic_ack_edge,
+ .irq_set_affinity = apic_set_affinity,
+ .irq_compose_msi_msg = x86_vector_msi_compose_msg,
+ .irq_force_complete_move = apic_force_complete_move,
+ .irq_retrigger = apic_retrigger_irq,
+};
+
+#ifdef CONFIG_SMP
+
+static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr)
+{
+ struct apic_chip_data *apicd;
+ struct hlist_node *tmp;
+ bool rearm = false;
+
+ lockdep_assert_held(&vector_lock);
+
+ hlist_for_each_entry_safe(apicd, tmp, &cl->head, clist) {
+ unsigned int vector = apicd->prev_vector;
+
+ /*
+ * Paranoia: Check if the vector that needs to be cleaned
+ * up is registered in the APIC's IRR. That's clearly a
+ * hardware issue if the vector arrived on the old target
+ * _after_ interrupts were disabled above. Keep @apicd
+ * on the list and schedule the timer again to give the CPU
+ * a chance to handle the pending interrupt.
+ *
+ * Do not check IRR when called from lapic_offline(), because
+ * fixup_irqs() was just called to scan IRR for set bits and
+ * forward them to new destination CPUs via IPIs.
+ */
+ if (check_irr && is_vector_pending(vector)) {
+ pr_warn_once("Moved interrupt pending in old target APIC %u\n", apicd->irq);
+ rearm = true;
+ continue;
+ }
+ free_moved_vector(apicd);
+ }
+
+ /*
+ * Must happen under vector_lock to make the timer_pending() check
+ * in __vector_schedule_cleanup() race free against the rearm here.
+ */
+ if (rearm)
+ mod_timer(&cl->timer, jiffies + 1);
+}
+
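+/* Deferred per-CPU cleanup of vectors left behind by an affinity change */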
+static void vector_cleanup_callback(struct timer_list *tmr)
+{
+ struct vector_cleanup *cl = container_of(tmr, typeof(*cl), timer);
+
+ /* Prevent vectors vanishing under us */
+ raw_spin_lock_irq(&vector_lock);
+ __vector_cleanup(cl, true);
+ raw_spin_unlock_irq(&vector_lock);
+}
+
+static void __vector_schedule_cleanup(struct apic_chip_data *apicd)
+{
+ unsigned int cpu = apicd->prev_cpu;
+
+ raw_spin_lock(&vector_lock);
+ apicd->move_in_progress = 0;
+ if (cpu_online(cpu)) {
+ struct vector_cleanup *cl = per_cpu_ptr(&vector_cleanup, cpu);
+
+ hlist_add_head(&apicd->clist, &cl->head);
+
+ /*
+ * The lockless timer_pending() check is safe here. If it
+ * returns true, then the callback will observe this new
+ * apic data in the hlist as everything is serialized by
+ * vector lock.
+ *
+ * If it returns false then the timer is either not armed
+ * or the other CPU executes the callback, which again
+ * would be blocked on vector lock. Rearming it in the
+ * latter case makes it fire for nothing.
+ *
+ * This is also safe against the callback rearming the timer
+ * because that's serialized via vector lock too.
+ */
+ if (!timer_pending(&cl->timer)) {
+ cl->timer.expires = jiffies + 1;
+ add_timer_on(&cl->timer, cpu);
+ }
+ } else {
+ pr_warn("IRQ %u schedule cleanup for offline CPU %u\n", apicd->irq, cpu);
+ free_moved_vector(apicd);
+ }
+ raw_spin_unlock(&vector_lock);
+}
+
+void vector_schedule_cleanup(struct irq_cfg *cfg)
+{
+ struct apic_chip_data *apicd;
+
+ apicd = container_of(cfg, struct apic_chip_data, hw_irq_cfg);
+ if (apicd->move_in_progress)
+ __vector_schedule_cleanup(apicd);
+}
+
+void irq_complete_move(struct irq_cfg *cfg)
+{
+ struct apic_chip_data *apicd;
+
+ apicd = container_of(cfg, struct apic_chip_data, hw_irq_cfg);
+ if (likely(!apicd->move_in_progress))
+ return;
+
+ /*
+ * If the interrupt arrived on the new target CPU, cleanup the
+ * vector on the old target CPU. A vector check is not required
+ * because an interrupt can never move from one vector to another
+ * on the same CPU.
+ */
+ if (apicd->cpu == smp_processor_id())
+ __vector_schedule_cleanup(apicd);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * Note, this is not accurate accounting, but at least good enough to
+ * prevent the actual interrupt move from running out of vectors.
+ */
+int lapic_can_unplug_cpu(void)
+{
+ unsigned int rsvd, avl, tomove, cpu = smp_processor_id();
+ int ret = 0;
+
+ raw_spin_lock(&vector_lock);
+ tomove = irq_matrix_allocated(vector_matrix);
+ avl = irq_matrix_available(vector_matrix, true);
+ if (avl < tomove) {
+ pr_warn("CPU %u has %u vectors, %u available. Cannot disable CPU\n",
+ cpu, tomove, avl);
+ ret = -ENOSPC;
+ goto out;
+ }
+ rsvd = irq_matrix_reserved(vector_matrix);
+ if (avl < rsvd) {
+ pr_warn("Reserved vectors %u > available %u. IRQ request may fail\n",
+ rsvd, avl);
+ }
+out:
+ raw_spin_unlock(&vector_lock);
+ return ret;
+}
+#endif /* HOTPLUG_CPU */
+#endif /* SMP */
+
+static void __init print_APIC_field(int base)
+{
+ int i;
+
+ printk(KERN_DEBUG);
+
+ for (i = 0; i < 8; i++)
+ pr_cont("%08x", apic_read(base + i*0x10));
+
+ pr_cont("\n");
+}
+
+static void __init print_local_APIC(void *dummy)
+{
+ unsigned int i, v, ver, maxlvt;
+ u64 icr;
+
+ pr_debug("printing local APIC contents on CPU#%d/%d:\n",
+ smp_processor_id(), read_apic_id());
+ v = apic_read(APIC_ID);
+ pr_info("... APIC ID: %08x (%01x)\n", v, read_apic_id());
+ v = apic_read(APIC_LVR);
+ pr_info("... APIC VERSION: %08x\n", v);
+ ver = GET_APIC_VERSION(v);
+ maxlvt = lapic_get_maxlvt();
+
+ v = apic_read(APIC_TASKPRI);
+ pr_debug("... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
+
+ /* !82489DX */
+ if (APIC_INTEGRATED(ver)) {
+ if (!APIC_XAPIC(ver)) {
+ v = apic_read(APIC_ARBPRI);
+ pr_debug("... APIC ARBPRI: %08x (%02x)\n",
+ v, v & APIC_ARBPRI_MASK);
+ }
+ v = apic_read(APIC_PROCPRI);
+ pr_debug("... APIC PROCPRI: %08x\n", v);
+ }
+
+ /*
+ * Remote read is supported only on the 82489DX and on the local APIC
+ * of Pentium processors.
+ */
+ if (!APIC_INTEGRATED(ver) || maxlvt == 3) {
+ v = apic_read(APIC_RRR);
+ pr_debug("... APIC RRR: %08x\n", v);
+ }
+
+ v = apic_read(APIC_LDR);
+ pr_debug("... APIC LDR: %08x\n", v);
+ if (!x2apic_enabled()) {
+ v = apic_read(APIC_DFR);
+ pr_debug("... APIC DFR: %08x\n", v);
+ }
+ v = apic_read(APIC_SPIV);
+ pr_debug("... APIC SPIV: %08x\n", v);
+
+ pr_debug("... APIC ISR field:\n");
+ print_APIC_field(APIC_ISR);
+ pr_debug("... APIC TMR field:\n");
+ print_APIC_field(APIC_TMR);
+ pr_debug("... APIC IRR field:\n");
+ print_APIC_field(APIC_IRR);
+
+ /* !82489DX */
+ if (APIC_INTEGRATED(ver)) {
+ /* Due to the Pentium erratum 3AP. */
+ if (maxlvt > 3)
+ apic_write(APIC_ESR, 0);
+
+ v = apic_read(APIC_ESR);
+ pr_debug("... APIC ESR: %08x\n", v);
+ }
+
+ icr = apic_icr_read();
+ pr_debug("... APIC ICR: %08x\n", (u32)icr);
+ pr_debug("... APIC ICR2: %08x\n", (u32)(icr >> 32));
+
+ v = apic_read(APIC_LVTT);
+ pr_debug("... APIC LVTT: %08x\n", v);
+
+ if (maxlvt > 3) {
+ /* PC is LVT#4. */
+ v = apic_read(APIC_LVTPC);
+ pr_debug("... APIC LVTPC: %08x\n", v);
+ }
+ v = apic_read(APIC_LVT0);
+ pr_debug("... APIC LVT0: %08x\n", v);
+ v = apic_read(APIC_LVT1);
+ pr_debug("... APIC LVT1: %08x\n", v);
+
+ if (maxlvt > 2) {
+ /* ERR is LVT#3. */
+ v = apic_read(APIC_LVTERR);
+ pr_debug("... APIC LVTERR: %08x\n", v);
+ }
+
+ v = apic_read(APIC_TMICT);
+ pr_debug("... APIC TMICT: %08x\n", v);
+ v = apic_read(APIC_TMCCT);
+ pr_debug("... APIC TMCCT: %08x\n", v);
+ v = apic_read(APIC_TDCR);
+ pr_debug("... APIC TDCR: %08x\n", v);
+
+ if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
+ v = apic_read(APIC_EFEAT);
+ maxlvt = (v >> 16) & 0xff;
+ pr_debug("... APIC EFEAT: %08x\n", v);
+ v = apic_read(APIC_ECTRL);
+ pr_debug("... APIC ECTRL: %08x\n", v);
+ for (i = 0; i < maxlvt; i++) {
+ v = apic_read(APIC_EILVTn(i));
+ pr_debug("... APIC EILVT%d: %08x\n", i, v);
+ }
+ }
+ pr_cont("\n");
+}
+
+static void __init print_local_APICs(int maxcpu)
+{
+ int cpu;
+
+ if (!maxcpu)
+ return;
+
+ preempt_disable();
+ for_each_online_cpu(cpu) {
+ if (cpu >= maxcpu)
+ break;
+ smp_call_function_single(cpu, print_local_APIC, NULL, 1);
+ }
+ preempt_enable();
+}
+
+static void __init print_PIC(void)
+{
+ unsigned int v;
+ unsigned long flags;
+
+ if (!nr_legacy_irqs())
+ return;
+
+ pr_debug("\nprinting PIC contents\n");
+
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
+
+ v = inb(0xa1) << 8 | inb(0x21);
+ pr_debug("... PIC IMR: %04x\n", v);
+
+ v = inb(0xa0) << 8 | inb(0x20);
+ pr_debug("... PIC IRR: %04x\n", v);
+
+ outb(0x0b, 0xa0);
+ outb(0x0b, 0x20);
+ v = inb(0xa0) << 8 | inb(0x20);
+ outb(0x0a, 0xa0);
+ outb(0x0a, 0x20);
+
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
+
+ pr_debug("... PIC ISR: %04x\n", v);
+
+ v = inb(PIC_ELCR2) << 8 | inb(PIC_ELCR1);
+ pr_debug("... PIC ELCR: %04x\n", v);
+}
+
+static int show_lapic __initdata = 1;
+static __init int setup_show_lapic(char *arg)
+{
+ int num = -1;
+
+ if (strcmp(arg, "all") == 0) {
+ show_lapic = CONFIG_NR_CPUS;
+ } else {
+ get_option(&arg, &num);
+ if (num >= 0)
+ show_lapic = num;
+ }
+
+ return 1;
+}
+__setup("show_lapic=", setup_show_lapic);
+
+static int __init print_ICs(void)
+{
+ if (apic_verbosity == APIC_QUIET)
+ return 0;
+
+ print_PIC();
+
+ /* don't print out if apic is not there */
+ if (!boot_cpu_has(X86_FEATURE_APIC) && !apic_from_smp_config())
+ return 0;
+
+ print_local_APICs(show_lapic);
+ print_IO_APICs();
+
+ return 0;
+}
+
+late_initcall(print_ICs);
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index a5371ec36776..7db83212effb 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -1,247 +1,261 @@
-#include <linux/threads.h>
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/cpuhotplug.h>
#include <linux/cpumask.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/ctype.h>
-#include <linux/init.h>
-#include <linux/dmar.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
-#include <asm/smp.h>
#include <asm/apic.h>
-#include <asm/ipi.h>
-static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
+#include "local.h"
-static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
-{
- return x2apic_enabled();
-}
+#define apic_cluster(apicid) ((apicid) >> 4)
/*
- * need to use more than cpu 0, because we need more vectors when
- * MSI-X are used.
+ * __x2apic_send_IPI_mask() possibly needs to read
+ * x86_cpu_to_logical_apicid for all online cpus in a sequential way.
+ * Using per cpu variable would cost one cache line per cpu.
*/
-static const struct cpumask *x2apic_target_cpus(void)
-{
- return cpu_online_mask;
-}
+static u32 *x86_cpu_to_logical_apicid __read_mostly;
-/*
- * for now each logical cpu is in its own vector allocation domain.
- */
-static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
+static DEFINE_PER_CPU(cpumask_var_t, ipi_mask);
+static DEFINE_PER_CPU_READ_MOSTLY(struct cpumask *, cluster_masks);
+
+static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
- cpumask_clear(retmask);
- cpumask_set_cpu(cpu, retmask);
+ return x2apic_enabled();
}
-static void
- __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
+static void x2apic_send_IPI(int cpu, int vector)
{
- unsigned long cfg;
+ u32 dest = x86_cpu_to_logical_apicid[cpu];
- cfg = __prepare_ICR(0, vector, dest);
-
- /*
- * send the IPI.
- */
- native_x2apic_icr_write(cfg, apicid);
+ /* x2apic MSRs are special and need a special fence: */
+ weak_wrmsr_fence();
+ __x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL);
}
-/*
- * for now, we send the IPI's one by one in the cpumask.
- * TBD: Based on the cpu mask, we can send the IPI's to the cluster group
- * at once. We have 16 cpu's in a cluster. This will minimize IPI register
- * writes.
- */
-static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
+static void
+__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
{
- unsigned long query_cpu;
+ unsigned int cpu, clustercpu;
+ struct cpumask *tmpmsk;
unsigned long flags;
+ u32 dest;
- x2apic_wrmsr_fence();
-
+ /* x2apic MSRs are special and need a special fence: */
+ weak_wrmsr_fence();
local_irq_save(flags);
- for_each_cpu(query_cpu, mask) {
- __x2apic_send_IPI_dest(
- per_cpu(x86_cpu_to_logical_apicid, query_cpu),
- vector, apic->dest_logical);
- }
- local_irq_restore(flags);
-}
-static void
- x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
-{
- unsigned long this_cpu = smp_processor_id();
- unsigned long query_cpu;
- unsigned long flags;
+ tmpmsk = this_cpu_cpumask_var_ptr(ipi_mask);
+ cpumask_copy(tmpmsk, mask);
+ /* If IPI should not be sent to self, clear current CPU */
+ if (apic_dest != APIC_DEST_ALLINC)
+ __cpumask_clear_cpu(smp_processor_id(), tmpmsk);
- x2apic_wrmsr_fence();
+ /* Collapse cpus in a cluster so a single IPI per cluster is sent */
+ for_each_cpu(cpu, tmpmsk) {
+ struct cpumask *cmsk = per_cpu(cluster_masks, cpu);
- local_irq_save(flags);
- for_each_cpu(query_cpu, mask) {
- if (query_cpu == this_cpu)
+ dest = 0;
+ for_each_cpu_and(clustercpu, tmpmsk, cmsk)
+ dest |= x86_cpu_to_logical_apicid[clustercpu];
+
+ if (!dest)
continue;
- __x2apic_send_IPI_dest(
- per_cpu(x86_cpu_to_logical_apicid, query_cpu),
- vector, apic->dest_logical);
+
+ __x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL);
+ /* Remove cluster CPUs from tmpmask */
+ cpumask_andnot(tmpmsk, tmpmsk, cmsk);
}
+
local_irq_restore(flags);
}
-static void x2apic_send_IPI_allbutself(int vector)
+static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
{
- unsigned long this_cpu = smp_processor_id();
- unsigned long query_cpu;
- unsigned long flags;
-
- x2apic_wrmsr_fence();
+ __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC);
+}
- local_irq_save(flags);
- for_each_online_cpu(query_cpu) {
- if (query_cpu == this_cpu)
- continue;
- __x2apic_send_IPI_dest(
- per_cpu(x86_cpu_to_logical_apicid, query_cpu),
- vector, apic->dest_logical);
- }
- local_irq_restore(flags);
+static void
+x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
+{
+ __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
}
-static void x2apic_send_IPI_all(int vector)
+static u32 x2apic_calc_apicid(unsigned int cpu)
{
- x2apic_send_IPI_mask(cpu_online_mask, vector);
+ return x86_cpu_to_logical_apicid[cpu];
}
-static int x2apic_apic_id_registered(void)
+static void init_x2apic_ldr(void)
{
- return 1;
+ struct cpumask *cmsk = this_cpu_read(cluster_masks);
+
+ BUG_ON(!cmsk);
+
+ cpumask_set_cpu(smp_processor_id(), cmsk);
}
-static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
+/*
+ * As an optimisation during boot, set the cluster_mask for all present
+ * CPUs at once, to prevent each of them having to iterate over the others
+ * to find the existing cluster_mask.
+ */
+static void prefill_clustermask(struct cpumask *cmsk, unsigned int cpu, u32 cluster)
{
- /*
- * We're using fixed IRQ delivery, can only return one logical APIC ID.
- * May as well be the first.
- */
- int cpu = cpumask_first(cpumask);
+ int cpu_i;
- if ((unsigned)cpu < nr_cpu_ids)
- return per_cpu(x86_cpu_to_logical_apicid, cpu);
- else
- return BAD_APICID;
+ for_each_present_cpu(cpu_i) {
+ struct cpumask **cpu_cmsk = &per_cpu(cluster_masks, cpu_i);
+ u32 apicid = apic->cpu_present_to_apicid(cpu_i);
+
+ if (apicid == BAD_APICID || cpu_i == cpu || apic_cluster(apicid) != cluster)
+ continue;
+
+ if (WARN_ON_ONCE(*cpu_cmsk == cmsk))
+ continue;
+
+ BUG_ON(*cpu_cmsk);
+ *cpu_cmsk = cmsk;
+ }
}
-static unsigned int
-x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
- const struct cpumask *andmask)
+static int alloc_clustermask(unsigned int cpu, u32 cluster, int node)
{
- int cpu;
+ struct cpumask *cmsk = NULL;
+ unsigned int cpu_i;
/*
- * We're using fixed IRQ delivery, can only return one logical APIC ID.
- * May as well be the first.
+ * At boot time, the CPU present mask is stable. The cluster mask is
+ * allocated for the first CPU in the cluster and propagated to all
+ * present siblings in the cluster. If the cluster mask is already set
+ * on entry to this function for a given CPU, there is nothing to do.
*/
- for_each_cpu_and(cpu, cpumask, andmask) {
- if (cpumask_test_cpu(cpu, cpu_online_mask))
- break;
- }
+ if (per_cpu(cluster_masks, cpu))
+ return 0;
- if (cpu < nr_cpu_ids)
- return per_cpu(x86_cpu_to_logical_apicid, cpu);
+ if (system_state < SYSTEM_RUNNING)
+ goto alloc;
- return BAD_APICID;
+ /*
+ * On post boot hotplug for a CPU which was not present at boot time,
+ * iterate over all possible CPUs (even those which are not present
+ * any more) to find any existing cluster mask.
+ */
+ for_each_possible_cpu(cpu_i) {
+ u32 apicid = apic->cpu_present_to_apicid(cpu_i);
+
+ if (apicid != BAD_APICID && apic_cluster(apicid) == cluster) {
+ cmsk = per_cpu(cluster_masks, cpu_i);
+ /*
+ * If the cluster is already initialized, just store
+ * the mask and return. There's no need to propagate.
+ */
+ if (cmsk) {
+ per_cpu(cluster_masks, cpu) = cmsk;
+ return 0;
+ }
+ }
+ }
+ /*
+ * No CPU in the cluster has ever been initialized, so fall through to
+ * the boot time code which will also populate the cluster mask for any
+ * other CPU in the cluster which is (now) present.
+ */
+alloc:
+ cmsk = kzalloc_node(sizeof(*cmsk), GFP_KERNEL, node);
+ if (!cmsk)
+ return -ENOMEM;
+ per_cpu(cluster_masks, cpu) = cmsk;
+ prefill_clustermask(cmsk, cpu, cluster);
+
+ return 0;
}
-static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x)
+static int x2apic_prepare_cpu(unsigned int cpu)
{
- unsigned int id;
+ u32 phys_apicid = apic->cpu_present_to_apicid(cpu);
+ u32 cluster = apic_cluster(phys_apicid);
+ u32 logical_apicid = (cluster << 16) | (1 << (phys_apicid & 0xf));
+ int node = cpu_to_node(cpu);
- id = x;
- return id;
-}
+ x86_cpu_to_logical_apicid[cpu] = logical_apicid;
-static unsigned long set_apic_id(unsigned int id)
-{
- unsigned long x;
+ if (alloc_clustermask(cpu, cluster, node) < 0)
+ return -ENOMEM;
- x = id;
- return x;
-}
+ if (!zalloc_cpumask_var_node(&per_cpu(ipi_mask, cpu), GFP_KERNEL, node))
+ return -ENOMEM;
-static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb)
-{
- return initial_apicid >> index_msb;
+ return 0;
}
-static void x2apic_send_IPI_self(int vector)
+static int x2apic_dead_cpu(unsigned int dead_cpu)
{
- apic_write(APIC_SELF_IPI, vector);
+ struct cpumask *cmsk = per_cpu(cluster_masks, dead_cpu);
+
+ if (cmsk)
+ cpumask_clear_cpu(dead_cpu, cmsk);
+ free_cpumask_var(per_cpu(ipi_mask, dead_cpu));
+ return 0;
}
-static void init_x2apic_ldr(void)
+static int x2apic_cluster_probe(void)
{
- int cpu = smp_processor_id();
-
- per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR);
+ u32 slots;
+
+ if (!x2apic_mode)
+ return 0;
+
+ slots = max_t(u32, L1_CACHE_BYTES/sizeof(u32), nr_cpu_ids);
+ x86_cpu_to_logical_apicid = kcalloc(slots, sizeof(u32), GFP_KERNEL);
+ if (!x86_cpu_to_logical_apicid)
+ return 0;
+
+ if (cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "x86/x2apic:prepare",
+ x2apic_prepare_cpu, x2apic_dead_cpu) < 0) {
+ pr_err("Failed to register X2APIC_PREPARE\n");
+ kfree(x86_cpu_to_logical_apicid);
+ x86_cpu_to_logical_apicid = NULL;
+ return 0;
+ }
+ init_x2apic_ldr();
+ return 1;
}
-struct apic apic_x2apic_cluster = {
+static struct apic apic_x2apic_cluster __ro_after_init = {
.name = "cluster x2apic",
- .probe = NULL,
+ .probe = x2apic_cluster_probe,
.acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
- .apic_id_registered = x2apic_apic_id_registered,
- .irq_delivery_mode = dest_LowestPrio,
- .irq_dest_mode = 1, /* logical */
+ .dest_mode_logical = true,
- .target_cpus = x2apic_target_cpus,
.disable_esr = 0,
- .dest_logical = APIC_DEST_LOGICAL,
- .check_apicid_used = NULL,
- .check_apicid_present = NULL,
- .vector_allocation_domain = x2apic_vector_allocation_domain,
.init_apic_ldr = init_x2apic_ldr,
-
- .ioapic_phys_id_map = NULL,
- .setup_apic_routing = NULL,
- .multi_timer_check = NULL,
- .apicid_to_node = NULL,
- .cpu_to_logical_apicid = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
- .apicid_to_cpu_present = NULL,
- .setup_portio_remap = NULL,
- .check_phys_apicid_present = default_check_phys_apicid_present,
- .enable_apic_mode = NULL,
- .phys_pkg_id = x2apic_cluster_phys_pkg_id,
- .mps_oem_check = NULL,
- .get_apic_id = x2apic_cluster_phys_get_apic_id,
- .set_apic_id = set_apic_id,
- .apic_id_mask = 0xFFFFFFFFu,
+ .max_apic_id = UINT_MAX,
+ .x2apic_set_max_apicid = true,
+ .get_apic_id = x2apic_get_apic_id,
- .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
- .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
+ .calc_dest_apicid = x2apic_calc_apicid,
+ .send_IPI = x2apic_send_IPI,
.send_IPI_mask = x2apic_send_IPI_mask,
.send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
.send_IPI_allbutself = x2apic_send_IPI_allbutself,
.send_IPI_all = x2apic_send_IPI_all,
.send_IPI_self = x2apic_send_IPI_self,
-
- .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
- .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
- .wait_for_init_deassert = NULL,
- .smp_callin_clear_local_apic = NULL,
- .inquire_remote_apic = NULL,
+ .nmi_to_offline_cpu = true,
.read = native_apic_msr_read,
.write = native_apic_msr_write,
+ .eoi = native_apic_msr_eoi,
.icr_read = native_x2apic_icr_read,
.icr_write = native_x2apic_icr_write,
- .wait_icr_idle = native_x2apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
};
+
+apic_driver(apic_x2apic_cluster);
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index a8989aadc99a..12d4c35547a6 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -1,105 +1,70 @@
-#include <linux/threads.h>
+// SPDX-License-Identifier: GPL-2.0
+
#include <linux/cpumask.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/ctype.h>
-#include <linux/init.h>
-#include <linux/dmar.h>
+#include <linux/acpi.h>
-#include <asm/smp.h>
-#include <asm/apic.h>
-#include <asm/ipi.h>
+#include "local.h"
int x2apic_phys;
-static int set_x2apic_phys_mode(char *arg)
-{
- x2apic_phys = 1;
- return 0;
-}
-early_param("x2apic_phys", set_x2apic_phys_mode);
+static struct apic apic_x2apic_phys;
+u32 x2apic_max_apicid __ro_after_init = UINT_MAX;
-static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+void __init x2apic_set_max_apicid(u32 apicid)
{
- if (x2apic_phys)
- return x2apic_enabled();
- else
- return 0;
+ x2apic_max_apicid = apicid;
+ if (apic->x2apic_set_max_apicid)
+ apic->max_apic_id = apicid;
}
-/*
- * need to use more than cpu 0, because we need more vectors when
- * MSI-X are used.
- */
-static const struct cpumask *x2apic_target_cpus(void)
+static int __init set_x2apic_phys_mode(char *arg)
{
- return cpu_online_mask;
+ x2apic_phys = 1;
+ return 0;
}
+early_param("x2apic_phys", set_x2apic_phys_mode);
-static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
+static bool x2apic_fadt_phys(void)
{
- cpumask_clear(retmask);
- cpumask_set_cpu(cpu, retmask);
+#ifdef CONFIG_ACPI
+ if ((acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) &&
+ (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {
+ printk(KERN_DEBUG "System requires x2apic physical mode\n");
+ return true;
+ }
+#endif
+ return false;
}
-static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
- unsigned int dest)
+static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
- unsigned long cfg;
-
- cfg = __prepare_ICR(0, vector, dest);
-
- /*
- * send the IPI.
- */
- native_x2apic_icr_write(cfg, apicid);
+ return x2apic_enabled() && (x2apic_phys || x2apic_fadt_phys());
}
-static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
+static void x2apic_send_IPI(int cpu, int vector)
{
- unsigned long query_cpu;
- unsigned long flags;
-
- x2apic_wrmsr_fence();
+ u32 dest = per_cpu(x86_cpu_to_apicid, cpu);
- local_irq_save(flags);
- for_each_cpu(query_cpu, mask) {
- __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
- vector, APIC_DEST_PHYSICAL);
- }
- local_irq_restore(flags);
+ /* x2apic MSRs are special and need a special fence: */
+ weak_wrmsr_fence();
+ __x2apic_send_IPI_dest(dest, vector, APIC_DEST_PHYSICAL);
}
static void
- x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
+__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
{
- unsigned long this_cpu = smp_processor_id();
unsigned long query_cpu;
+ unsigned long this_cpu;
unsigned long flags;
- x2apic_wrmsr_fence();
+ /* x2apic MSRs are special and need a special fence: */
+ weak_wrmsr_fence();
local_irq_save(flags);
- for_each_cpu(query_cpu, mask) {
- if (query_cpu != this_cpu)
- __x2apic_send_IPI_dest(
- per_cpu(x86_cpu_to_apicid, query_cpu),
- vector, APIC_DEST_PHYSICAL);
- }
- local_irq_restore(flags);
-}
-
-static void x2apic_send_IPI_allbutself(int vector)
-{
- unsigned long this_cpu = smp_processor_id();
- unsigned long query_cpu;
- unsigned long flags;
- x2apic_wrmsr_fence();
-
- local_irq_save(flags);
- for_each_online_cpu(query_cpu) {
- if (query_cpu == this_cpu)
+ this_cpu = smp_processor_id();
+ for_each_cpu(query_cpu, mask) {
+ if (apic_dest == APIC_DEST_ALLBUT && this_cpu == query_cpu)
continue;
__x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
vector, APIC_DEST_PHYSICAL);
@@ -107,130 +72,94 @@ static void x2apic_send_IPI_allbutself(int vector)
local_irq_restore(flags);
}
-static void x2apic_send_IPI_all(int vector)
+static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
{
- x2apic_send_IPI_mask(cpu_online_mask, vector);
+ __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC);
}
-static int x2apic_apic_id_registered(void)
+static void
+ x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
{
- return 1;
+ __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
}
-static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
+static void __x2apic_send_IPI_shorthand(int vector, u32 which)
{
- /*
- * We're using fixed IRQ delivery, can only return one phys APIC ID.
- * May as well be the first.
- */
- int cpu = cpumask_first(cpumask);
-
- if ((unsigned)cpu < nr_cpu_ids)
- return per_cpu(x86_cpu_to_apicid, cpu);
- else
- return BAD_APICID;
+ unsigned long cfg = __prepare_ICR(which, vector, 0);
+
+ /* x2apic MSRs are special and need a special fence: */
+ weak_wrmsr_fence();
+ native_x2apic_icr_write(cfg, 0);
}
-static unsigned int
-x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
- const struct cpumask *andmask)
+void x2apic_send_IPI_allbutself(int vector)
{
- int cpu;
-
- /*
- * We're using fixed IRQ delivery, can only return one phys APIC ID.
- * May as well be the first.
- */
- for_each_cpu_and(cpu, cpumask, andmask) {
- if (cpumask_test_cpu(cpu, cpu_online_mask))
- break;
- }
-
- if (cpu < nr_cpu_ids)
- return per_cpu(x86_cpu_to_apicid, cpu);
-
- return BAD_APICID;
+ __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLBUT);
}
-static unsigned int x2apic_phys_get_apic_id(unsigned long x)
+void x2apic_send_IPI_all(int vector)
{
- return x;
+ __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLINC);
}
-static unsigned long set_apic_id(unsigned int id)
+void x2apic_send_IPI_self(int vector)
{
- return id;
+ apic_write(APIC_SELF_IPI, vector);
}
-static int x2apic_phys_pkg_id(int initial_apicid, int index_msb)
+void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
{
- return initial_apicid >> index_msb;
+ unsigned long cfg = __prepare_ICR(0, vector, dest);
+ native_x2apic_icr_write(cfg, apicid);
}
-static void x2apic_send_IPI_self(int vector)
+static int x2apic_phys_probe(void)
{
- apic_write(APIC_SELF_IPI, vector);
+ if (!x2apic_mode)
+ return 0;
+
+ if (x2apic_phys || x2apic_fadt_phys())
+ return 1;
+
+ return apic == &apic_x2apic_phys;
}
-static void init_x2apic_ldr(void)
+u32 x2apic_get_apic_id(u32 id)
{
+ return id;
}
-struct apic apic_x2apic_phys = {
+static struct apic apic_x2apic_phys __ro_after_init = {
.name = "physical x2apic",
- .probe = NULL,
+ .probe = x2apic_phys_probe,
.acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
- .apic_id_registered = x2apic_apic_id_registered,
- .irq_delivery_mode = dest_Fixed,
- .irq_dest_mode = 0, /* physical */
+ .dest_mode_logical = false,
- .target_cpus = x2apic_target_cpus,
.disable_esr = 0,
- .dest_logical = 0,
- .check_apicid_used = NULL,
- .check_apicid_present = NULL,
-
- .vector_allocation_domain = x2apic_vector_allocation_domain,
- .init_apic_ldr = init_x2apic_ldr,
-
- .ioapic_phys_id_map = NULL,
- .setup_apic_routing = NULL,
- .multi_timer_check = NULL,
- .apicid_to_node = NULL,
- .cpu_to_logical_apicid = NULL,
+
.cpu_present_to_apicid = default_cpu_present_to_apicid,
- .apicid_to_cpu_present = NULL,
- .setup_portio_remap = NULL,
- .check_phys_apicid_present = default_check_phys_apicid_present,
- .enable_apic_mode = NULL,
- .phys_pkg_id = x2apic_phys_pkg_id,
- .mps_oem_check = NULL,
- .get_apic_id = x2apic_phys_get_apic_id,
- .set_apic_id = set_apic_id,
- .apic_id_mask = 0xFFFFFFFFu,
+ .max_apic_id = UINT_MAX,
+ .x2apic_set_max_apicid = true,
+ .get_apic_id = x2apic_get_apic_id,
- .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
- .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
+ .calc_dest_apicid = apic_default_calc_apicid,
+ .send_IPI = x2apic_send_IPI,
.send_IPI_mask = x2apic_send_IPI_mask,
.send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
.send_IPI_allbutself = x2apic_send_IPI_allbutself,
.send_IPI_all = x2apic_send_IPI_all,
.send_IPI_self = x2apic_send_IPI_self,
-
- .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
- .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
- .wait_for_init_deassert = NULL,
- .smp_callin_clear_local_apic = NULL,
- .inquire_remote_apic = NULL,
+ .nmi_to_offline_cpu = true,
.read = native_apic_msr_read,
.write = native_apic_msr_write,
+ .eoi = native_apic_msr_eoi,
.icr_read = native_x2apic_icr_read,
.icr_write = native_x2apic_icr_write,
- .wait_icr_idle = native_x2apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
};
+
+apic_driver(apic_x2apic_phys);
diff --git a/arch/x86/kernel/apic/x2apic_savic.c b/arch/x86/kernel/apic/x2apic_savic.c
new file mode 100644
index 000000000000..dbc5678bc3b6
--- /dev/null
+++ b/arch/x86/kernel/apic/x2apic_savic.c
@@ -0,0 +1,428 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AMD Secure AVIC Support (SEV-SNP Guests)
+ *
+ * Copyright (C) 2024 Advanced Micro Devices, Inc.
+ *
+ * Author: Neeraj Upadhyay <Neeraj.Upadhyay@amd.com>
+ */
+
+#include <linux/cc_platform.h>
+#include <linux/cpumask.h>
+#include <linux/percpu-defs.h>
+#include <linux/align.h>
+
+#include <asm/apic.h>
+#include <asm/sev.h>
+
+#include "local.h"
+
+struct secure_avic_page {
+ u8 regs[PAGE_SIZE];
+} __aligned(PAGE_SIZE);
+
+static struct secure_avic_page __percpu *savic_page __ro_after_init;
+
+static int savic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ return x2apic_enabled() && cc_platform_has(CC_ATTR_SNP_SECURE_AVIC);
+}
+
+static inline void *get_reg_bitmap(unsigned int cpu, unsigned int offset)
+{
+ return &per_cpu_ptr(savic_page, cpu)->regs[offset];
+}
+
+static inline void update_vector(unsigned int cpu, unsigned int offset,
+ unsigned int vector, bool set)
+{
+ void *bitmap = get_reg_bitmap(cpu, offset);
+
+ if (set)
+ apic_set_vector(vector, bitmap);
+ else
+ apic_clear_vector(vector, bitmap);
+}
+
+#define SAVIC_ALLOWED_IRR 0x204
+
+/*
+ * When Secure AVIC is enabled, RDMSR/WRMSR of the APIC registers
+ * result in #VC exception (for non-accelerated register accesses)
+ * with VMEXIT_AVIC_NOACCEL error code. The #VC exception handler
+ * can read/write the x2APIC register in the guest APIC backing page.
+ *
+ * Since doing this would increase the latency of accessing x2APIC
+ * registers, instead of doing RDMSR/WRMSR based accesses and
+ * handling the APIC register reads/writes in the #VC exception handler,
+ * the read() and write() callbacks directly read/write the APIC register
+ * from/to the vCPU's APIC backing page.
+ */
+static u32 savic_read(u32 reg)
+{
+ void *ap = this_cpu_ptr(savic_page);
+
+ switch (reg) {
+ case APIC_LVTT:
+ case APIC_TMICT:
+ case APIC_TMCCT:
+ case APIC_TDCR:
+ case APIC_LVTTHMR:
+ case APIC_LVTPC:
+ case APIC_LVT0:
+ case APIC_LVT1:
+ case APIC_LVTERR:
+ return savic_ghcb_msr_read(reg);
+ case APIC_ID:
+ case APIC_LVR:
+ case APIC_TASKPRI:
+ case APIC_ARBPRI:
+ case APIC_PROCPRI:
+ case APIC_LDR:
+ case APIC_SPIV:
+ case APIC_ESR:
+ case APIC_EFEAT:
+ case APIC_ECTRL:
+ case APIC_SEOI:
+ case APIC_IER:
+ case APIC_EILVTn(0) ... APIC_EILVTn(3):
+ return apic_get_reg(ap, reg);
+ case APIC_ICR:
+ return (u32)apic_get_reg64(ap, reg);
+ case APIC_ISR ... APIC_ISR + 0x70:
+ case APIC_TMR ... APIC_TMR + 0x70:
+ if (WARN_ONCE(!IS_ALIGNED(reg, 16),
+ "APIC register read offset 0x%x not aligned at 16 bytes", reg))
+ return 0;
+ return apic_get_reg(ap, reg);
+ /* IRR and ALLOWED_IRR offset range */
+ case APIC_IRR ... APIC_IRR + 0x74:
+ /*
+ * Valid APIC_IRR/SAVIC_ALLOWED_IRR registers are at 16 bytes strides from
+ * their respective base offset. APIC_IRRs are in the range
+ *
+ * (0x200, 0x210, ..., 0x270)
+ *
+ * while the SAVIC_ALLOWED_IRR range starts 4 bytes later, in the range
+ *
+ * (0x204, 0x214, ..., 0x274).
+ *
+ * Filter out everything else.
+ */
+ if (WARN_ONCE(!(IS_ALIGNED(reg, 16) ||
+ IS_ALIGNED(reg - 4, 16)),
+ "Misaligned APIC_IRR/ALLOWED_IRR APIC register read offset 0x%x", reg))
+ return 0;
+ return apic_get_reg(ap, reg);
+ default:
+ pr_err("Error reading unknown Secure AVIC reg offset 0x%x\n", reg);
+ return 0;
+ }
+}
+
+#define SAVIC_NMI_REQ 0x278
+
+/*
+ * On WRMSR to APIC_SELF_IPI register by the guest, Secure AVIC hardware
+ * updates the APIC_IRR in the APIC backing page of the vCPU. In addition,
+ * hardware evaluates the new APIC_IRR update for interrupt injection to
+ * the vCPU. So, self IPIs are hardware-accelerated.
+ */
+static inline void self_ipi_reg_write(unsigned int vector)
+{
+ native_apic_msr_write(APIC_SELF_IPI, vector);
+}
+
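+/* Set either the NMI request flag or the vector's IRR bit in the target's backing page */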
+static void send_ipi_dest(unsigned int cpu, unsigned int vector, bool nmi)
+{
+ if (nmi)
+ apic_set_reg(per_cpu_ptr(savic_page, cpu), SAVIC_NMI_REQ, 1);
+ else
+ update_vector(cpu, APIC_IRR, vector, true);
+}
+
+static void send_ipi_allbut(unsigned int vector, bool nmi)
+{
+ unsigned int cpu, src_cpu;
+
+ guard(irqsave)();
+
+ src_cpu = raw_smp_processor_id();
+
+ for_each_cpu(cpu, cpu_online_mask) {
+ if (cpu == src_cpu)
+ continue;
+ send_ipi_dest(cpu, vector, nmi);
+ }
+}
+
+static inline void self_ipi(unsigned int vector, bool nmi)
+{
+ u32 icr_low = APIC_SELF_IPI | vector;
+
+ if (nmi)
+ icr_low |= APIC_DM_NMI;
+
+ native_x2apic_icr_write(icr_low, 0);
+}
+
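+/*
+ * Emulate an ICR write: decode the destination shorthand, post the request
+ * into the target vCPU backing page(s), and mirror the ICR value to the
+ * hypervisor via the GHCB for anything other than a self IPI.
+ */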
+static void savic_icr_write(u32 icr_low, u32 icr_high)
+{
+ unsigned int dsh, vector;
+ u64 icr_data;
+ bool nmi;
+
+ dsh = icr_low & APIC_DEST_ALLBUT;
+ vector = icr_low & APIC_VECTOR_MASK;
+ nmi = ((icr_low & APIC_DM_FIXED_MASK) == APIC_DM_NMI);
+
+ switch (dsh) {
+ case APIC_DEST_SELF:
+ self_ipi(vector, nmi);
+ break;
+ case APIC_DEST_ALLINC:
+ self_ipi(vector, nmi);
+ fallthrough;
+ case APIC_DEST_ALLBUT:
+ send_ipi_allbut(vector, nmi);
+ break;
+ default:
+ send_ipi_dest(icr_high, vector, nmi);
+ break;
+ }
+
+ icr_data = ((u64)icr_high) << 32 | icr_low;
+ if (dsh != APIC_DEST_SELF)
+ savic_ghcb_msr_write(APIC_ICR, icr_data);
+ apic_set_reg64(this_cpu_ptr(savic_page), APIC_ICR, icr_data);
+}
+
+static void savic_write(u32 reg, u32 data)
+{
+ void *ap = this_cpu_ptr(savic_page);
+
+ switch (reg) {
+ case APIC_LVTT:
+ case APIC_TMICT:
+ case APIC_TDCR:
+ case APIC_LVT0:
+ case APIC_LVT1:
+ case APIC_LVTTHMR:
+ case APIC_LVTPC:
+ case APIC_LVTERR:
+ savic_ghcb_msr_write(reg, data);
+ break;
+ case APIC_TASKPRI:
+ case APIC_EOI:
+ case APIC_SPIV:
+ case SAVIC_NMI_REQ:
+ case APIC_ESR:
+ case APIC_ECTRL:
+ case APIC_SEOI:
+ case APIC_IER:
+ case APIC_EILVTn(0) ... APIC_EILVTn(3):
+ apic_set_reg(ap, reg, data);
+ break;
+ case APIC_ICR:
+ savic_icr_write(data, 0);
+ break;
+ case APIC_SELF_IPI:
+ self_ipi_reg_write(data);
+ break;
+ /* ALLOWED_IRR offsets are writable */
+ case SAVIC_ALLOWED_IRR ... SAVIC_ALLOWED_IRR + 0x70:
+ if (IS_ALIGNED(reg - 4, 16)) {
+ apic_set_reg(ap, reg, data);
+ break;
+ }
+ fallthrough;
+ default:
+ pr_err("Error writing unknown Secure AVIC reg offset 0x%x\n", reg);
+ }
+}
+
+static void send_ipi(u32 dest, unsigned int vector, unsigned int dsh)
+{
+ unsigned int icr_low;
+
+ icr_low = __prepare_ICR(dsh, vector, APIC_DEST_PHYSICAL);
+ savic_icr_write(icr_low, dest);
+}
+
+static void savic_send_ipi(int cpu, int vector)
+{
+ u32 dest = per_cpu(x86_cpu_to_apicid, cpu);
+
+ send_ipi(dest, vector, 0);
+}
+
+static void send_ipi_mask(const struct cpumask *mask, unsigned int vector, bool excl_self)
+{
+ unsigned int cpu, this_cpu;
+
+ guard(irqsave)();
+
+ this_cpu = raw_smp_processor_id();
+
+ for_each_cpu(cpu, mask) {
+ if (excl_self && cpu == this_cpu)
+ continue;
+ send_ipi(per_cpu(x86_cpu_to_apicid, cpu), vector, 0);
+ }
+}
+
+static void savic_send_ipi_mask(const struct cpumask *mask, int vector)
+{
+ send_ipi_mask(mask, vector, false);
+}
+
+static void savic_send_ipi_mask_allbutself(const struct cpumask *mask, int vector)
+{
+ send_ipi_mask(mask, vector, true);
+}
+
+static void savic_send_ipi_allbutself(int vector)
+{
+ send_ipi(0, vector, APIC_DEST_ALLBUT);
+}
+
+static void savic_send_ipi_all(int vector)
+{
+ send_ipi(0, vector, APIC_DEST_ALLINC);
+}
+
+static void savic_send_ipi_self(int vector)
+{
+ self_ipi_reg_write(vector);
+}
+
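+/* Update the ALLOWED_IRR bitmap which controls the vectors the hypervisor may inject */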
+static void savic_update_vector(unsigned int cpu, unsigned int vector, bool set)
+{
+ update_vector(cpu, SAVIC_ALLOWED_IRR, vector, set);
+}
+
+static void savic_eoi(void)
+{
+ unsigned int cpu;
+ int vec;
+
+ cpu = raw_smp_processor_id();
+ vec = apic_find_highest_vector(get_reg_bitmap(cpu, APIC_ISR));
+ if (WARN_ONCE(vec == -1, "EOI write while no active interrupt in APIC_ISR"))
+ return;
+
+ /* Is level-triggered interrupt? */
+ if (apic_test_vector(vec, get_reg_bitmap(cpu, APIC_TMR))) {
+ update_vector(cpu, APIC_ISR, vec, false);
+ /*
+ * Propagate the EOI write to the hypervisor for level-triggered
+ * interrupts. The return to the guest from the GHCB protocol event
+ * takes care of re-evaluating the interrupt state.
+ */
+ savic_ghcb_msr_write(APIC_EOI, 0);
+ } else {
+ /*
+ * Hardware clears APIC_ISR and re-evaluates the interrupt state
+ * to determine if there is any pending interrupt which can be
+ * delivered to CPU.
+ */
+ native_apic_msr_eoi();
+ }
+}
+
+static void savic_teardown(void)
+{
+ /* Disable Secure AVIC */
+ native_wrmsrq(MSR_AMD64_SAVIC_CONTROL, 0);
+ savic_unregister_gpa(NULL);
+}
+
+static void savic_setup(void)
+{
+ void *ap = this_cpu_ptr(savic_page);
+ enum es_result res;
+ unsigned long gpa;
+
+ /*
+ * Before Secure AVIC is enabled, APIC MSR reads are intercepted.
+ * APIC_ID MSR read returns the value from the hypervisor.
+ */
+ apic_set_reg(ap, APIC_ID, native_apic_msr_read(APIC_ID));
+
+ gpa = __pa(ap);
+
+ /*
+ * The NPT entry for a vCPU's APIC backing page must always be
+ * present when the vCPU is running in order for Secure AVIC to
+ * function. A VMEXIT_BUSY is returned on VMRUN and the vCPU cannot
+ * be resumed if the NPT entry for the APIC backing page is not
+ * present. Notify GPA of the vCPU's APIC backing page to the
+ * hypervisor by calling savic_register_gpa(). Before executing
+ * VMRUN, the hypervisor makes use of this information to make sure
+ * the APIC backing page is mapped in NPT.
+ */
+ res = savic_register_gpa(gpa);
+ if (res != ES_OK)
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL);
+
+ native_wrmsrq(MSR_AMD64_SAVIC_CONTROL,
+ gpa | MSR_AMD64_SAVIC_EN | MSR_AMD64_SAVIC_ALLOWEDNMI);
+}
+
+static int savic_probe(void)
+{
+ if (!cc_platform_has(CC_ATTR_SNP_SECURE_AVIC))
+ return 0;
+
+ if (!x2apic_mode) {
+ pr_err("Secure AVIC enabled in non x2APIC mode\n");
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL);
+ /* unreachable */
+ }
+
+ savic_page = alloc_percpu(struct secure_avic_page);
+ if (!savic_page)
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL);
+
+ return 1;
+}
+
+static struct apic apic_x2apic_savic __ro_after_init = {
+
+ .name = "secure avic x2apic",
+ .probe = savic_probe,
+ .acpi_madt_oem_check = savic_acpi_madt_oem_check,
+ .setup = savic_setup,
+ .teardown = savic_teardown,
+
+ .dest_mode_logical = false,
+
+ .disable_esr = 0,
+
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+
+ .max_apic_id = UINT_MAX,
+ .x2apic_set_max_apicid = true,
+ .get_apic_id = x2apic_get_apic_id,
+
+ .calc_dest_apicid = apic_default_calc_apicid,
+
+ .send_IPI = savic_send_ipi,
+ .send_IPI_mask = savic_send_ipi_mask,
+ .send_IPI_mask_allbutself = savic_send_ipi_mask_allbutself,
+ .send_IPI_allbutself = savic_send_ipi_allbutself,
+ .send_IPI_all = savic_send_ipi_all,
+ .send_IPI_self = savic_send_ipi_self,
+
+ .nmi_to_offline_cpu = true,
+
+ .read = savic_read,
+ .write = savic_write,
+ .eoi = savic_eoi,
+ .icr_read = native_x2apic_icr_read,
+ .icr_write = savic_icr_write,
+
+ .update_vector = savic_update_vector,
+};
+
+apic_driver(apic_x2apic_savic);
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 832e908adcb5..15209f220e1f 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -5,86 +5,506 @@
*
* SGI UV APIC functions (note: not an Intel compatible APIC)
*
- * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved.
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
+ * Copyright (C) 2007-2014 Silicon Graphics, Inc. All rights reserved.
*/
+#include <linux/crash_dump.h>
+#include <linux/cpuhotplug.h>
#include <linux/cpumask.h>
-#include <linux/hardirq.h>
#include <linux/proc_fs.h>
-#include <linux/threads.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/ctype.h>
-#include <linux/sched.h>
-#include <linux/timer.h>
-#include <linux/cpu.h>
-#include <linux/init.h>
-#include <linux/io.h>
+#include <linux/memory.h>
+#include <linux/export.h>
+#include <linux/pci.h>
+#include <linux/acpi.h>
+#include <linux/efi.h>
+#include <asm/e820/api.h>
#include <asm/uv/uv_mmrs.h>
#include <asm/uv/uv_hub.h>
-#include <asm/current.h>
-#include <asm/pgtable.h>
#include <asm/uv/bios.h>
#include <asm/uv/uv.h>
#include <asm/apic.h>
-#include <asm/ipi.h>
-#include <asm/smp.h>
-DEFINE_PER_CPU(int, x2apic_extra_bits);
+#include "local.h"
+
+static enum uv_system_type uv_system_type;
+static int uv_hubbed_system;
+static int uv_hubless_system;
+static u64 gru_start_paddr, gru_end_paddr;
+static union uvh_apicid uvh_apicid;
+static int uv_node_id;
+
+/* Unpack AT/OEM/TABLE ID's to be NULL terminated strings */
+static u8 uv_archtype[UV_AT_SIZE + 1];
+static u8 oem_id[ACPI_OEM_ID_SIZE + 1];
+static u8 oem_table_id[ACPI_OEM_TABLE_ID_SIZE + 1];
+
+/* Information derived from CPUID and some UV MMRs */
+static struct {
+ unsigned int apicid_shift;
+ unsigned int apicid_mask;
+ unsigned int socketid_shift; /* aka pnode_shift for UV2/3 */
+ unsigned int pnode_mask;
+ unsigned int nasid_shift;
+ unsigned int gpa_shift;
+ unsigned int gnode_shift;
+ unsigned int m_skt;
+ unsigned int n_skt;
+} uv_cpuid;
+
+static int uv_min_hub_revision_id;
+
+static struct apic apic_x2apic_uv_x;
+static struct uv_hub_info_s uv_hub_info_node0;
+
+/* Set this to use hardware error handler instead of kernel panic: */
+static int disable_uv_undefined_panic = 1;
+
+unsigned long uv_undefined(char *str)
+{
+ if (likely(!disable_uv_undefined_panic))
+ panic("UV: error: undefined MMR: %s\n", str);
+ else
+ pr_crit("UV: error: undefined MMR: %s\n", str);
-static enum uv_system_type uv_system_type;
+ /* Cause a machine fault: */
+ return ~0ul;
+}
+EXPORT_SYMBOL(uv_undefined);
-static int early_get_nodeid(void)
+static unsigned long __init uv_early_read_mmr(unsigned long addr)
{
- union uvh_node_id_u node_id;
- unsigned long *mmr;
+ unsigned long val, *mmr;
- mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr));
- node_id.v = *mmr;
+ mmr = early_ioremap(UV_LOCAL_MMR_BASE | addr, sizeof(*mmr));
+ val = *mmr;
early_iounmap(mmr, sizeof(*mmr));
- return node_id.s.node_id;
+
+ return val;
}
-static int uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+static inline bool is_GRU_range(u64 start, u64 end)
{
- if (!strcmp(oem_id, "SGI")) {
- if (!strcmp(oem_table_id, "UVL"))
- uv_system_type = UV_LEGACY_APIC;
- else if (!strcmp(oem_table_id, "UVX"))
- uv_system_type = UV_X2APIC;
- else if (!strcmp(oem_table_id, "UVH")) {
- __get_cpu_var(x2apic_extra_bits) =
- early_get_nodeid() << (UV_APIC_PNODE_SHIFT - 1);
- uv_system_type = UV_NON_UNIQUE_APIC;
- return 1;
- }
+ if (!gru_start_paddr)
+ return false;
+
+ return start >= gru_start_paddr && end <= gru_end_paddr;
+}
+
+static bool uv_is_untracked_pat_range(u64 start, u64 end)
+{
+ return is_ISA_range(start, end) || is_GRU_range(start, end);
+}
+
+static void __init early_get_pnodeid(void)
+{
+ int pnode;
+
+ uv_cpuid.m_skt = 0;
+ if (UVH_RH10_GAM_ADDR_MAP_CONFIG) {
+ union uvh_rh10_gam_addr_map_config_u m_n_config;
+
+ m_n_config.v = uv_early_read_mmr(UVH_RH10_GAM_ADDR_MAP_CONFIG);
+ uv_cpuid.n_skt = m_n_config.s.n_skt;
+ uv_cpuid.nasid_shift = 0;
+ } else if (UVH_RH_GAM_ADDR_MAP_CONFIG) {
+ union uvh_rh_gam_addr_map_config_u m_n_config;
+
+ m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_ADDR_MAP_CONFIG);
+ uv_cpuid.n_skt = m_n_config.s.n_skt;
+ if (is_uv(UV3))
+ uv_cpuid.m_skt = m_n_config.s3.m_skt;
+ if (is_uv(UV2))
+ uv_cpuid.m_skt = m_n_config.s2.m_skt;
+ uv_cpuid.nasid_shift = 1;
+ } else {
+ unsigned long GAM_ADDR_MAP_CONFIG = 0;
+
+ WARN(GAM_ADDR_MAP_CONFIG == 0,
+ "UV: WARN: GAM_ADDR_MAP_CONFIG is not available\n");
+ uv_cpuid.n_skt = 0;
+ uv_cpuid.nasid_shift = 0;
+ }
+
+ if (is_uv(UV4|UVY))
+ uv_cpuid.gnode_shift = 2; /* min partition is 4 sockets */
+
+ uv_cpuid.pnode_mask = (1 << uv_cpuid.n_skt) - 1;
+ pnode = (uv_node_id >> uv_cpuid.nasid_shift) & uv_cpuid.pnode_mask;
+ uv_cpuid.gpa_shift = 46; /* Default unless changed */
+
+ pr_info("UV: n_skt:%d pnmsk:%x pn:%x\n",
+ uv_cpuid.n_skt, uv_cpuid.pnode_mask, pnode);
+}
+
+/* Running on a UV Hubbed system, determine which UV Hub Type it is */
+static int __init early_set_hub_type(void)
+{
+ union uvh_node_id_u node_id;
+
+ /*
+ * The NODE_ID MMR is always at offset 0.
+ * Contains the chip part # + revision.
+ * Node_id field started with 15 bits,
+ * ... now 7 but upper 8 are masked to 0.
+ * All blades/nodes have the same part # and hub revision.
+ */
+ node_id.v = uv_early_read_mmr(UVH_NODE_ID);
+ uv_node_id = node_id.sx.node_id;
+
+ switch (node_id.s.part_number) {
+
+ case UV5_HUB_PART_NUMBER:
+ uv_min_hub_revision_id = node_id.s.revision
+ + UV5_HUB_REVISION_BASE;
+ uv_hub_type_set(UV5);
+ break;
+
+ /* UV4/4A only have a revision difference */
+ case UV4_HUB_PART_NUMBER:
+ uv_min_hub_revision_id = node_id.s.revision
+ + UV4_HUB_REVISION_BASE - 1;
+ uv_hub_type_set(UV4);
+ if (uv_min_hub_revision_id == UV4A_HUB_REVISION_BASE)
+ uv_hub_type_set(UV4|UV4A);
+ break;
+
+ case UV3_HUB_PART_NUMBER:
+ case UV3_HUB_PART_NUMBER_X:
+ uv_min_hub_revision_id = node_id.s.revision
+ + UV3_HUB_REVISION_BASE;
+ uv_hub_type_set(UV3);
+ break;
+
+ case UV2_HUB_PART_NUMBER:
+ case UV2_HUB_PART_NUMBER_X:
+ uv_min_hub_revision_id = node_id.s.revision
+ + UV2_HUB_REVISION_BASE - 1;
+ uv_hub_type_set(UV2);
+ break;
+
+ default:
+ return 0;
+ }
+
+ pr_info("UV: part#:%x rev:%d rev_id:%d UVtype:0x%x\n",
+ node_id.s.part_number, node_id.s.revision,
+ uv_min_hub_revision_id, is_uv(~0));
+
+ return 1;
+}
+
+static void __init uv_tsc_check_sync(void)
+{
+ u64 mmr;
+ int sync_state;
+ int mmr_shift;
+ char *state;
+
+ /* UV5 guarantees synced TSCs; do not zero TSC_ADJUST */
+ if (!is_uv(UV2|UV3|UV4)) {
+ mark_tsc_async_resets("UV5+");
+ return;
+ }
+
+ /* UV2,3,4, UV BIOS TSC sync state available */
+ mmr = uv_early_read_mmr(UVH_TSC_SYNC_MMR);
+ mmr_shift =
+ is_uv2_hub() ? UVH_TSC_SYNC_SHIFT_UV2K : UVH_TSC_SYNC_SHIFT;
+ sync_state = (mmr >> mmr_shift) & UVH_TSC_SYNC_MASK;
+
+ /* Check if TSC is valid for all sockets */
+ switch (sync_state) {
+ case UVH_TSC_SYNC_VALID:
+ state = "in sync";
+ mark_tsc_async_resets("UV BIOS");
+ break;
+
+ /* If BIOS state unknown, don't do anything */
+ case UVH_TSC_SYNC_UNKNOWN:
+ state = "unknown";
+ break;
+
+ /* Otherwise, BIOS indicates problem with TSC */
+ default:
+ state = "unstable";
+ mark_tsc_unstable("UV BIOS");
+ break;
+ }
+ pr_info("UV: TSC sync state from BIOS:0%d(%s)\n", sync_state, state);
+}
+
+/* Selector for (4|4A|5) structs */
+#define uvxy_field(sname, field, undef) ( \
+ is_uv(UV4A) ? sname.s4a.field : \
+ is_uv(UV4) ? sname.s4.field : \
+ is_uv(UV3) ? sname.s3.field : \
+ undef)
+
+static void __init early_get_apic_socketid_shift(void)
+{
+ unsigned int sid_shift = topology_get_domain_shift(TOPO_PKG_DOMAIN);
+
+ if (is_uv2_hub() || is_uv3_hub())
+ uvh_apicid.v = uv_early_read_mmr(UVH_APICID);
+
+ if (sid_shift) {
+ uv_cpuid.apicid_shift = 0;
+ uv_cpuid.apicid_mask = (~(-1 << sid_shift));
+ uv_cpuid.socketid_shift = sid_shift;
+ } else {
+ pr_info("UV: CPU does not have valid CPUID.11\n");
+ }
+
+ pr_info("UV: apicid_shift:%d apicid_mask:0x%x\n", uv_cpuid.apicid_shift, uv_cpuid.apicid_mask);
+ pr_info("UV: socketid_shift:%d pnode_mask:0x%x\n", uv_cpuid.socketid_shift, uv_cpuid.pnode_mask);
+}
+
+static void __init uv_stringify(int len, char *to, char *from)
+{
+ strscpy(to, from, len);
+
+ /* Trim trailing spaces */
+ (void)strim(to);
+}
+
+/* Find UV arch type entry in UVsystab */
+static unsigned long __init early_find_archtype(struct uv_systab *st)
+{
+ int i;
+
+ for (i = 0; st->entry[i].type != UV_SYSTAB_TYPE_UNUSED; i++) {
+ unsigned long ptr = st->entry[i].offset;
+
+ if (!ptr)
+ continue;
+ ptr += (unsigned long)st;
+ if (st->entry[i].type == UV_SYSTAB_TYPE_ARCH_TYPE)
+ return ptr;
+ }
+ return 0;
+}
+
+/* Validate UV arch type field in UVsystab */
+static int __init decode_arch_type(unsigned long ptr)
+{
+ struct uv_arch_type_entry *uv_ate = (struct uv_arch_type_entry *)ptr;
+ int n = strlen(uv_ate->archtype);
+
+ if (n > 0 && n < sizeof(uv_ate->archtype)) {
+ pr_info("UV: UVarchtype received from BIOS\n");
+ uv_stringify(sizeof(uv_archtype), uv_archtype, uv_ate->archtype);
+ return 1;
}
return 0;
}
+/* Determine if UV arch type entry might exist in UVsystab */
+static int __init early_get_arch_type(void)
+{
+ unsigned long uvst_physaddr, uvst_size, ptr;
+ struct uv_systab *st;
+ u32 rev;
+ int ret;
+
+ uvst_physaddr = get_uv_systab_phys(0);
+ if (!uvst_physaddr)
+ return 0;
+
+ st = early_memremap_ro(uvst_physaddr, sizeof(struct uv_systab));
+ if (!st) {
+ pr_err("UV: Cannot access UVsystab, remap failed\n");
+ return 0;
+ }
+
+ rev = st->revision;
+ if (rev < UV_SYSTAB_VERSION_UV5) {
+ early_memunmap(st, sizeof(struct uv_systab));
+ return 0;
+ }
+
+ uvst_size = st->size;
+ early_memunmap(st, sizeof(struct uv_systab));
+ st = early_memremap_ro(uvst_physaddr, uvst_size);
+ if (!st) {
+ pr_err("UV: Cannot access UVarchtype, remap failed\n");
+ return 0;
+ }
+
+ ptr = early_find_archtype(st);
+ if (!ptr) {
+ early_memunmap(st, uvst_size);
+ return 0;
+ }
+
+ ret = decode_arch_type(ptr);
+ early_memunmap(st, uvst_size);
+ return ret;
+}
+
+/* UV system found, check which APIC MODE BIOS already selected */
+static void __init early_set_apic_mode(void)
+{
+ if (x2apic_enabled())
+ uv_system_type = UV_X2APIC;
+ else
+ uv_system_type = UV_LEGACY_APIC;
+}
+
+static int __init uv_set_system_type(char *_oem_id, char *_oem_table_id)
+{
+ /* Save OEM_ID passed from ACPI MADT */
+ uv_stringify(sizeof(oem_id), oem_id, _oem_id);
+
+ /* Check if BIOS sent us a UVarchtype */
+ if (!early_get_arch_type())
+
+ /* If not use OEM ID for UVarchtype */
+ uv_stringify(sizeof(uv_archtype), uv_archtype, oem_id);
+
+ /* Check if not hubbed */
+ if (strncmp(uv_archtype, "SGI", 3) != 0) {
+
+ /* (Not hubbed), check if not hubless */
+ if (strncmp(uv_archtype, "NSGI", 4) != 0)
+
+ /* (Not hubless), not a UV */
+ return 0;
+
+ /* Is UV hubless system */
+ uv_hubless_system = 0x01;
+
+ /* UV5 Hubless */
+ if (strncmp(uv_archtype, "NSGI5", 5) == 0)
+ uv_hubless_system |= 0x20;
+
+ /* UV4 Hubless: CH */
+ else if (strncmp(uv_archtype, "NSGI4", 5) == 0)
+ uv_hubless_system |= 0x10;
+
+ /* UV3 Hubless: UV300/MC990X w/o hub */
+ else
+ uv_hubless_system |= 0x8;
+
+ /* Copy OEM Table ID */
+ uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id);
+
+ pr_info("UV: OEM IDs %s/%s, SystemType %d, HUBLESS ID %x\n",
+ oem_id, oem_table_id, uv_system_type, uv_hubless_system);
+
+ return 0;
+ }
+
+ if (numa_off) {
+ pr_err("UV: NUMA is off, disabling UV support\n");
+ return 0;
+ }
+
+ /* Set hubbed type if true */
+ uv_hub_info->hub_revision =
+ !strncmp(uv_archtype, "SGI5", 4) ? UV5_HUB_REVISION_BASE :
+ !strncmp(uv_archtype, "SGI4", 4) ? UV4_HUB_REVISION_BASE :
+ !strncmp(uv_archtype, "SGI3", 4) ? UV3_HUB_REVISION_BASE :
+ !strcmp(uv_archtype, "SGI2") ? UV2_HUB_REVISION_BASE : 0;
+
+ switch (uv_hub_info->hub_revision) {
+ case UV5_HUB_REVISION_BASE:
+ uv_hubbed_system = 0x21;
+ uv_hub_type_set(UV5);
+ break;
+
+ case UV4_HUB_REVISION_BASE:
+ uv_hubbed_system = 0x11;
+ uv_hub_type_set(UV4);
+ break;
+
+ case UV3_HUB_REVISION_BASE:
+ uv_hubbed_system = 0x9;
+ uv_hub_type_set(UV3);
+ break;
+
+ case UV2_HUB_REVISION_BASE:
+ uv_hubbed_system = 0x5;
+ uv_hub_type_set(UV2);
+ break;
+
+ default:
+ return 0;
+ }
+
+ /* Get UV hub chip part number & revision */
+ early_set_hub_type();
+
+ /* Other UV setup functions */
+ early_set_apic_mode();
+ early_get_pnodeid();
+ early_get_apic_socketid_shift();
+ x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
+ x86_platform.nmi_init = uv_nmi_init;
+ uv_tsc_check_sync();
+
+ return 1;
+}
+
+/* Called early to probe for the correct APIC driver */
+static int __init uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id)
+{
+ /* Set up early hub info fields for Node 0 */
+ uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0;
+
+ /* If not UV, return. */
+ if (uv_set_system_type(_oem_id, _oem_table_id) == 0)
+ return 0;
+
+ /* Save for display of the OEM Table ID */
+ uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id);
+
+ pr_info("UV: OEM IDs %s/%s, System/UVType %d/0x%x, HUB RevID %d\n",
+ oem_id, oem_table_id, uv_system_type, is_uv(UV_ANY),
+ uv_min_hub_revision_id);
+
+ return 0;
+}
+
enum uv_system_type get_uv_system_type(void)
{
return uv_system_type;
}
+int uv_get_hubless_system(void)
+{
+ return uv_hubless_system;
+}
+EXPORT_SYMBOL_GPL(uv_get_hubless_system);
+
+ssize_t uv_get_archtype(char *buf, int len)
+{
+ return scnprintf(buf, len, "%s/%s", uv_archtype, oem_table_id);
+}
+EXPORT_SYMBOL_GPL(uv_get_archtype);
+
int is_uv_system(void)
{
return uv_system_type != UV_NONE;
}
EXPORT_SYMBOL_GPL(is_uv_system);
-DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
-EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info);
+int is_uv_hubbed(int uvtype)
+{
+ return (uv_hubbed_system & uvtype);
+}
+EXPORT_SYMBOL_GPL(is_uv_hubbed);
-struct uv_blade_info *uv_blade_info;
-EXPORT_SYMBOL_GPL(uv_blade_info);
+static int is_uv_hubless(int uvtype)
+{
+ return (uv_hubless_system & uvtype);
+}
-short *uv_node_to_blade;
-EXPORT_SYMBOL_GPL(uv_node_to_blade);
+void **__uv_hub_info_list;
+EXPORT_SYMBOL_GPL(__uv_hub_info_list);
-short *uv_cpu_to_blade;
-EXPORT_SYMBOL_GPL(uv_cpu_to_blade);
+DEFINE_PER_CPU(struct uv_cpu_info_s, __uv_cpu_info);
+EXPORT_PER_CPU_SYMBOL_GPL(__uv_cpu_info);
short uv_possible_blades;
EXPORT_SYMBOL_GPL(uv_possible_blades);
@@ -92,52 +512,202 @@ EXPORT_SYMBOL_GPL(uv_possible_blades);
unsigned long sn_rtc_cycles_per_second;
EXPORT_SYMBOL(sn_rtc_cycles_per_second);
-/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
+/* The following values are used for the per node hub info struct */
+static __initdata unsigned short _min_socket, _max_socket;
+static __initdata unsigned short _min_pnode, _max_pnode, _gr_table_len;
+static __initdata struct uv_gam_range_entry *uv_gre_table;
+static __initdata struct uv_gam_parameters *uv_gp_table;
+static __initdata unsigned short *_socket_to_node;
+static __initdata unsigned short *_socket_to_pnode;
+static __initdata unsigned short *_pnode_to_socket;
+static __initdata unsigned short *_node_to_socket;
+
+static __initdata struct uv_gam_range_s *_gr_table;
+
+#define SOCK_EMPTY ((unsigned short)~0)
+
+/* Default UV memory block size is 2GB */
+static unsigned long mem_block_size __initdata = (2UL << 30);
+
+/* Kernel parameter to specify UV mem block size */
+static int __init parse_mem_block_size(char *ptr)
+{
+ unsigned long size = memparse(ptr, NULL);
+
+ /* Size will be rounded down by set_block_size() below */
+ mem_block_size = size;
+ return 0;
+}
+early_param("uv_memblksize", parse_mem_block_size);
-static const struct cpumask *uv_target_cpus(void)
+static __init int adj_blksize(u32 lgre)
{
- return cpumask_of(0);
+ unsigned long base = (unsigned long)lgre << UV_GAM_RANGE_SHFT;
+ unsigned long size;
+
+ for (size = mem_block_size; size > MIN_MEMORY_BLOCK_SIZE; size >>= 1)
+ if (IS_ALIGNED(base, size))
+ break;
+
+ if (size >= mem_block_size)
+ return 0;
+
+ mem_block_size = size;
+ return 1;
}
-static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
+static __init void set_block_size(void)
{
- cpumask_clear(retmask);
- cpumask_set_cpu(cpu, retmask);
+ unsigned int order = ffs(mem_block_size);
+
+ if (order) {
+ /* adjust for ffs return of 1..64 */
+ set_memory_block_size_order(order - 1);
+ pr_info("UV: mem_block_size set to 0x%lx\n", mem_block_size);
+ } else {
+ /* bad or zero value, default to 1UL << 31 (2GB) */
+ pr_err("UV: mem_block_size error with 0x%lx\n", mem_block_size);
+ set_memory_block_size_order(31);
+ }
+}
+
+/* Build GAM range lookup table: */
+static __init void build_uv_gr_table(void)
+{
+ struct uv_gam_range_entry *gre = uv_gre_table;
+ struct uv_gam_range_s *grt;
+ unsigned long last_limit = 0, ram_limit = 0;
+ int bytes, i, sid, lsid = -1, indx = 0, lindx = -1;
+
+ if (!gre)
+ return;
+
+ bytes = _gr_table_len * sizeof(struct uv_gam_range_s);
+ grt = kzalloc(bytes, GFP_KERNEL);
+ if (WARN_ON_ONCE(!grt))
+ return;
+ _gr_table = grt;
+
+ for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) {
+ if (gre->type == UV_GAM_RANGE_TYPE_HOLE) {
+ if (!ram_limit) {
+ /* Mark hole between RAM/non-RAM: */
+ ram_limit = last_limit;
+ last_limit = gre->limit;
+ lsid++;
+ continue;
+ }
+ last_limit = gre->limit;
+ pr_info("UV: extra hole in GAM RE table @%d\n", (int)(gre - uv_gre_table));
+ continue;
+ }
+ if (_max_socket < gre->sockid) {
+ pr_err("UV: GAM table sockid(%d) too large(>%d) @%d\n", gre->sockid, _max_socket, (int)(gre - uv_gre_table));
+ continue;
+ }
+ sid = gre->sockid - _min_socket;
+ if (lsid < sid) {
+ /* New range: */
+ grt = &_gr_table[indx];
+ grt->base = lindx;
+ grt->nasid = gre->nasid;
+ grt->limit = last_limit = gre->limit;
+ lsid = sid;
+ lindx = indx++;
+ continue;
+ }
+ /* Update range: */
+ if (lsid == sid && !ram_limit) {
+ /* .. if contiguous: */
+ if (grt->limit == last_limit) {
+ grt->limit = last_limit = gre->limit;
+ continue;
+ }
+ }
+ /* Non-contiguous RAM range: */
+ if (!ram_limit) {
+ grt++;
+ grt->base = lindx;
+ grt->nasid = gre->nasid;
+ grt->limit = last_limit = gre->limit;
+ continue;
+ }
+ /* Non-contiguous/non-RAM: */
+ grt++;
+ /* base is this entry */
+ grt->base = grt - _gr_table;
+ grt->nasid = gre->nasid;
+ grt->limit = last_limit = gre->limit;
+ lsid++;
+ }
+
+ /* Shorten table if possible */
+ grt++;
+ i = grt - _gr_table;
+ if (i < _gr_table_len) {
+ void *ret;
+
+ bytes = i * sizeof(struct uv_gam_range_s);
+ ret = krealloc(_gr_table, bytes, GFP_KERNEL);
+ if (ret) {
+ _gr_table = ret;
+ _gr_table_len = i;
+ }
+ }
+
+ /* Display resultant GAM range table: */
+ for (i = 0, grt = _gr_table; i < _gr_table_len; i++, grt++) {
+ unsigned long start, end;
+ int gb = grt->base;
+
+ start = gb < 0 ? 0 : (unsigned long)_gr_table[gb].limit << UV_GAM_RANGE_SHFT;
+ end = (unsigned long)grt->limit << UV_GAM_RANGE_SHFT;
+
+ pr_info("UV: GAM Range %2d %04x 0x%013lx-0x%013lx (%d)\n", i, grt->nasid, start, end, gb);
+ }
}
-static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
+static int uv_wakeup_secondary(u32 phys_apicid, unsigned long start_rip, unsigned int cpu)
{
-#ifdef CONFIG_SMP
unsigned long val;
int pnode;
pnode = uv_apicid_to_pnode(phys_apicid);
+
val = (1UL << UVH_IPI_INT_SEND_SHFT) |
(phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
APIC_DM_INIT;
+
uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
- mdelay(10);
val = (1UL << UVH_IPI_INT_SEND_SHFT) |
(phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
APIC_DM_STARTUP;
+
uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
- atomic_set(&init_deasserted, 1);
-#endif
return 0;
}
static void uv_send_IPI_one(int cpu, int vector)
{
- unsigned long apicid;
- int pnode;
+ unsigned long apicid = per_cpu(x86_cpu_to_apicid, cpu);
+ int pnode = uv_apicid_to_pnode(apicid);
+ unsigned long dmode, val;
- apicid = per_cpu(x86_cpu_to_apicid, cpu);
- pnode = uv_apicid_to_pnode(apicid);
- uv_hub_send_ipi(pnode, apicid, vector);
+ if (vector == NMI_VECTOR)
+ dmode = APIC_DELIVERY_MODE_NMI;
+ else
+ dmode = APIC_DELIVERY_MODE_FIXED;
+
+ val = (1UL << UVH_IPI_INT_SEND_SHFT) |
+ (apicid << UVH_IPI_INT_APIC_ID_SHFT) |
+ (dmode << UVH_IPI_INT_DELIVERY_MODE_SHFT) |
+ (vector << UVH_IPI_INT_VECTOR_SHFT);
+
+ uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
}
static void uv_send_IPI_mask(const struct cpumask *mask, int vector)
@@ -175,228 +745,367 @@ static void uv_send_IPI_all(int vector)
uv_send_IPI_mask(cpu_online_mask, vector);
}
-static int uv_apic_id_registered(void)
-{
- return 1;
-}
-
-static void uv_init_apic_ldr(void)
-{
-}
-
-static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask)
-{
- /*
- * We're using fixed IRQ delivery, can only return one phys APIC ID.
- * May as well be the first.
- */
- int cpu = cpumask_first(cpumask);
-
- if ((unsigned)cpu < nr_cpu_ids)
- return per_cpu(x86_cpu_to_apicid, cpu);
- else
- return BAD_APICID;
-}
-
-static unsigned int
-uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
- const struct cpumask *andmask)
-{
- int cpu;
-
- /*
- * We're using fixed IRQ delivery, can only return one phys APIC ID.
- * May as well be the first.
- */
- for_each_cpu_and(cpu, cpumask, andmask) {
- if (cpumask_test_cpu(cpu, cpu_online_mask))
- break;
- }
- if (cpu < nr_cpu_ids)
- return per_cpu(x86_cpu_to_apicid, cpu);
-
- return BAD_APICID;
-}
-
-static unsigned int x2apic_get_apic_id(unsigned long x)
-{
- unsigned int id;
-
- WARN_ON(preemptible() && num_online_cpus() > 1);
- id = x | __get_cpu_var(x2apic_extra_bits);
-
- return id;
-}
-
-static unsigned long set_apic_id(unsigned int id)
-{
- unsigned long x;
-
- /* maskout x2apic_extra_bits ? */
- x = id;
- return x;
-}
-
-static unsigned int uv_read_apic_id(void)
+static int uv_probe(void)
{
-
- return x2apic_get_apic_id(apic_read(APIC_ID));
-}
-
-static int uv_phys_pkg_id(int initial_apicid, int index_msb)
-{
- return uv_read_apic_id() >> index_msb;
+ return apic == &apic_x2apic_uv_x;
}
-static void uv_send_IPI_self(int vector)
-{
- apic_write(APIC_SELF_IPI, vector);
-}
-
-struct apic apic_x2apic_uv_x = {
+static struct apic apic_x2apic_uv_x __ro_after_init = {
.name = "UV large system",
- .probe = NULL,
+ .probe = uv_probe,
.acpi_madt_oem_check = uv_acpi_madt_oem_check,
- .apic_id_registered = uv_apic_id_registered,
- .irq_delivery_mode = dest_Fixed,
- .irq_dest_mode = 0, /* physical */
+ .dest_mode_logical = false,
- .target_cpus = uv_target_cpus,
.disable_esr = 0,
- .dest_logical = APIC_DEST_LOGICAL,
- .check_apicid_used = NULL,
- .check_apicid_present = NULL,
-
- .vector_allocation_domain = uv_vector_allocation_domain,
- .init_apic_ldr = uv_init_apic_ldr,
-
- .ioapic_phys_id_map = NULL,
- .setup_apic_routing = NULL,
- .multi_timer_check = NULL,
- .apicid_to_node = NULL,
- .cpu_to_logical_apicid = NULL,
+
.cpu_present_to_apicid = default_cpu_present_to_apicid,
- .apicid_to_cpu_present = NULL,
- .setup_portio_remap = NULL,
- .check_phys_apicid_present = default_check_phys_apicid_present,
- .enable_apic_mode = NULL,
- .phys_pkg_id = uv_phys_pkg_id,
- .mps_oem_check = NULL,
+ .max_apic_id = UINT_MAX,
.get_apic_id = x2apic_get_apic_id,
- .set_apic_id = set_apic_id,
- .apic_id_mask = 0xFFFFFFFFu,
- .cpu_mask_to_apicid = uv_cpu_mask_to_apicid,
- .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and,
+ .calc_dest_apicid = apic_default_calc_apicid,
+ .send_IPI = uv_send_IPI_one,
.send_IPI_mask = uv_send_IPI_mask,
.send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself,
.send_IPI_allbutself = uv_send_IPI_allbutself,
.send_IPI_all = uv_send_IPI_all,
- .send_IPI_self = uv_send_IPI_self,
+ .send_IPI_self = x2apic_send_IPI_self,
.wakeup_secondary_cpu = uv_wakeup_secondary,
- .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
- .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
- .wait_for_init_deassert = NULL,
- .smp_callin_clear_local_apic = NULL,
- .inquire_remote_apic = NULL,
.read = native_apic_msr_read,
.write = native_apic_msr_write,
+ .eoi = native_apic_msr_eoi,
.icr_read = native_x2apic_icr_read,
.icr_write = native_x2apic_icr_write,
- .wait_icr_idle = native_x2apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
};
-static __cpuinit void set_x2apic_extra_bits(int pnode)
-{
- __get_cpu_var(x2apic_extra_bits) = (pnode << 6);
-}
-
-/*
- * Called on boot cpu.
- */
-static __init int boot_pnode_to_blade(int pnode)
-{
- int blade;
-
- for (blade = 0; blade < uv_num_possible_blades(); blade++)
- if (pnode == uv_blade_info[blade].pnode)
- return blade;
- BUG();
-}
-
-struct redir_addr {
- unsigned long redirect;
- unsigned long alias;
-};
-
-#define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT
-
-static __initdata struct redir_addr redir_addrs[] = {
- {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_SI_ALIAS0_OVERLAY_CONFIG},
- {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_SI_ALIAS1_OVERLAY_CONFIG},
- {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_SI_ALIAS2_OVERLAY_CONFIG},
-};
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_LENGTH 3
+#define DEST_SHIFT UVXH_RH_GAM_ALIAS_0_REDIRECT_CONFIG_DEST_BASE_SHFT
static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
{
- union uvh_si_alias0_overlay_config_u alias;
- union uvh_rh_gam_alias210_redirect_config_2_mmr_u redirect;
+ union uvh_rh_gam_alias_2_overlay_config_u alias;
+ union uvh_rh_gam_alias_2_redirect_config_u redirect;
+ unsigned long m_redirect;
+ unsigned long m_overlay;
int i;
- for (i = 0; i < ARRAY_SIZE(redir_addrs); i++) {
- alias.v = uv_read_local_mmr(redir_addrs[i].alias);
- if (alias.s.base == 0) {
+ for (i = 0; i < UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_LENGTH; i++) {
+ switch (i) {
+ case 0:
+ m_redirect = UVH_RH_GAM_ALIAS_0_REDIRECT_CONFIG;
+ m_overlay = UVH_RH_GAM_ALIAS_0_OVERLAY_CONFIG;
+ break;
+ case 1:
+ m_redirect = UVH_RH_GAM_ALIAS_1_REDIRECT_CONFIG;
+ m_overlay = UVH_RH_GAM_ALIAS_1_OVERLAY_CONFIG;
+ break;
+ case 2:
+ m_redirect = UVH_RH_GAM_ALIAS_2_REDIRECT_CONFIG;
+ m_overlay = UVH_RH_GAM_ALIAS_2_OVERLAY_CONFIG;
+ break;
+ }
+ alias.v = uv_read_local_mmr(m_overlay);
+ if (alias.s.enable && alias.s.base == 0) {
*size = (1UL << alias.s.m_alias);
- redirect.v = uv_read_local_mmr(redir_addrs[i].redirect);
+ redirect.v = uv_read_local_mmr(m_redirect);
*base = (unsigned long)redirect.s.dest_base << DEST_SHIFT;
return;
}
}
- BUG();
+ *base = *size = 0;
}
enum map_type {map_wb, map_uc};
+static const char * const mt[] = { "WB", "UC" };
-static __init void map_high(char *id, unsigned long base, int shift,
- int max_pnode, enum map_type map_type)
+static __init void map_high(char *id, unsigned long base, int pshift, int bshift, int max_pnode, enum map_type map_type)
{
unsigned long bytes, paddr;
- paddr = base << shift;
- bytes = (1UL << shift) * (max_pnode + 1);
- printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr,
- paddr + bytes);
+ paddr = base << pshift;
+ bytes = (1UL << bshift) * (max_pnode + 1);
+ if (!paddr) {
+ pr_info("UV: Map %s_HI base address NULL\n", id);
+ return;
+ }
if (map_type == map_uc)
init_extra_mapping_uc(paddr, bytes);
else
init_extra_mapping_wb(paddr, bytes);
+ pr_info("UV: Map %s_HI 0x%lx - 0x%lx %s (%d segments)\n",
+ id, paddr, paddr + bytes, mt[map_type], max_pnode + 1);
}
+
static __init void map_gru_high(int max_pnode)
{
- union uvh_rh_gam_gru_overlay_config_mmr_u gru;
- int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT;
+ union uvh_rh_gam_gru_overlay_config_u gru;
+ unsigned long mask, base;
+ int shift;
+
+ if (UVH_RH_GAM_GRU_OVERLAY_CONFIG) {
+ gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG);
+ shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_BASE_SHFT;
+ mask = UVH_RH_GAM_GRU_OVERLAY_CONFIG_BASE_MASK;
+ } else if (UVH_RH10_GAM_GRU_OVERLAY_CONFIG) {
+ gru.v = uv_read_local_mmr(UVH_RH10_GAM_GRU_OVERLAY_CONFIG);
+ shift = UVH_RH10_GAM_GRU_OVERLAY_CONFIG_BASE_SHFT;
+ mask = UVH_RH10_GAM_GRU_OVERLAY_CONFIG_BASE_MASK;
+ } else {
+ pr_err("UV: GRU unavailable (no MMR)\n");
+ return;
+ }
+
+ if (!gru.s.enable) {
+ pr_info("UV: GRU disabled (by BIOS)\n");
+ return;
+ }
+
+ base = (gru.v & mask) >> shift;
+ map_high("GRU", base, shift, shift, max_pnode, map_wb);
+ gru_start_paddr = ((u64)base << shift);
+ gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1);
+}
+
+static __init void map_mmr_high(int max_pnode)
+{
+ unsigned long base;
+ int shift;
+ bool enable;
+
+ if (UVH_RH10_GAM_MMR_OVERLAY_CONFIG) {
+ union uvh_rh10_gam_mmr_overlay_config_u mmr;
+
+ mmr.v = uv_read_local_mmr(UVH_RH10_GAM_MMR_OVERLAY_CONFIG);
+ enable = mmr.s.enable;
+ base = mmr.s.base;
+ shift = UVH_RH10_GAM_MMR_OVERLAY_CONFIG_BASE_SHFT;
+ } else if (UVH_RH_GAM_MMR_OVERLAY_CONFIG) {
+ union uvh_rh_gam_mmr_overlay_config_u mmr;
+
+ mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG);
+ enable = mmr.s.enable;
+ base = mmr.s.base;
+ shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_BASE_SHFT;
+ } else {
+ pr_err("UV:%s:RH_GAM_MMR_OVERLAY_CONFIG MMR undefined?\n",
+ __func__);
+ return;
+ }
+
+ if (enable)
+ map_high("MMR", base, shift, shift, max_pnode, map_uc);
+ else
+ pr_info("UV: MMR disabled\n");
+}
+
+/* Arch specific ENUM cases */
+enum mmioh_arch {
+ UV2_MMIOH = -1,
+ UVY_MMIOH0, UVY_MMIOH1,
+ UVX_MMIOH0, UVX_MMIOH1,
+};
+
+/* Calculate and Map MMIOH Regions */
+static void __init calc_mmioh_map(enum mmioh_arch index,
+ int min_pnode, int max_pnode,
+ int shift, unsigned long base, int m_io, int n_io)
+{
+ unsigned long mmr, nasid_mask;
+ int nasid, min_nasid, max_nasid, lnasid, mapped;
+ int i, fi, li, n, max_io;
+ char id[8];
+
+ /* One (UV2) mapping */
+ if (index == UV2_MMIOH) {
+ strscpy(id, "MMIOH", sizeof(id));
+ max_io = max_pnode;
+ mapped = 0;
+ goto map_exit;
+ }
+
+ /* small and large MMIOH mappings */
+ switch (index) {
+ case UVY_MMIOH0:
+ mmr = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG0;
+ nasid_mask = UVYH_RH10_GAM_MMIOH_REDIRECT_CONFIG0_NASID_MASK;
+ n = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG0_DEPTH;
+ min_nasid = min_pnode;
+ max_nasid = max_pnode;
+ mapped = 1;
+ break;
+ case UVY_MMIOH1:
+ mmr = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG1;
+ nasid_mask = UVYH_RH10_GAM_MMIOH_REDIRECT_CONFIG1_NASID_MASK;
+ n = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG1_DEPTH;
+ min_nasid = min_pnode;
+ max_nasid = max_pnode;
+ mapped = 1;
+ break;
+ case UVX_MMIOH0:
+ mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0;
+ nasid_mask = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_NASID_MASK;
+ n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_DEPTH;
+ min_nasid = min_pnode * 2;
+ max_nasid = max_pnode * 2;
+ mapped = 1;
+ break;
+ case UVX_MMIOH1:
+ mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1;
+ nasid_mask = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_NASID_MASK;
+ n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_DEPTH;
+ min_nasid = min_pnode * 2;
+ max_nasid = max_pnode * 2;
+ mapped = 1;
+ break;
+ default:
+ pr_err("UV:%s:Invalid mapping type:%d\n", __func__, index);
+ return;
+ }
+
+ /* enum values chosen so (index mod 2) is MMIOH 0/1 (low/high) */
+ snprintf(id, sizeof(id), "MMIOH%d", index%2);
+
+ max_io = lnasid = fi = li = -1;
+ for (i = 0; i < n; i++) {
+ unsigned long m_redirect = mmr + i * 8;
+ unsigned long redirect = uv_read_local_mmr(m_redirect);
+
+ nasid = redirect & nasid_mask;
+ if (i == 0)
+ pr_info("UV: %s redirect base 0x%lx(@0x%lx) 0x%04x\n",
+ id, redirect, m_redirect, nasid);
+
+ /* Invalid NASID check */
+ if (nasid < min_nasid || max_nasid < nasid) {
+ /* Not an error: unused table entries get "poison" values */
+ pr_debug("UV:%s:Invalid NASID(%x):%x (range:%x..%x)\n",
+ __func__, index, nasid, min_nasid, max_nasid);
+ nasid = -1;
+ }
+
+ if (nasid == lnasid) {
+ li = i;
+ /* Last entry check: */
+ if (i != n-1)
+ continue;
+ }
+
+ /* Check if we have a cached (or last) redirect to print: */
+ if (lnasid != -1 || (i == n-1 && nasid != -1)) {
+ unsigned long addr1, addr2;
+ int f, l;
+
+ if (lnasid == -1) {
+ f = l = i;
+ lnasid = nasid;
+ } else {
+ f = fi;
+ l = li;
+ }
+ addr1 = (base << shift) + f * (1ULL << m_io);
+ addr2 = (base << shift) + (l + 1) * (1ULL << m_io);
+ pr_info("UV: %s[%03d..%03d] NASID 0x%04x ADDR 0x%016lx - 0x%016lx\n",
+ id, fi, li, lnasid, addr1, addr2);
+ if (max_io < l)
+ max_io = l;
+ }
+ fi = li = i;
+ lnasid = nasid;
+ }
+
+map_exit:
+ pr_info("UV: %s base:0x%lx shift:%d m_io:%d max_io:%d max_pnode:0x%x\n",
+ id, base, shift, m_io, max_io, max_pnode);
+
+ if (max_io >= 0 && !mapped)
+ map_high(id, base, shift, m_io, max_io, map_uc);
+}
+
+static __init void map_mmioh_high(int min_pnode, int max_pnode)
+{
+ /* UVY flavor */
+ if (UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG0) {
+ union uvh_rh10_gam_mmioh_overlay_config0_u mmioh0;
+ union uvh_rh10_gam_mmioh_overlay_config1_u mmioh1;
+
+ mmioh0.v = uv_read_local_mmr(UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG0);
+ if (unlikely(mmioh0.s.enable == 0))
+ pr_info("UV: MMIOH0 disabled\n");
+ else
+ calc_mmioh_map(UVY_MMIOH0, min_pnode, max_pnode,
+ UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_BASE_SHFT,
+ mmioh0.s.base, mmioh0.s.m_io, mmioh0.s.n_io);
+
+ mmioh1.v = uv_read_local_mmr(UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG1);
+ if (unlikely(mmioh1.s.enable == 0))
+ pr_info("UV: MMIOH1 disabled\n");
+ else
+ calc_mmioh_map(UVY_MMIOH1, min_pnode, max_pnode,
+ UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG1_BASE_SHFT,
+ mmioh1.s.base, mmioh1.s.m_io, mmioh1.s.n_io);
+ return;
+ }
+ /* UVX flavor */
+ if (UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0) {
+ union uvh_rh_gam_mmioh_overlay_config0_u mmioh0;
+ union uvh_rh_gam_mmioh_overlay_config1_u mmioh1;
+
+ mmioh0.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0);
+ if (unlikely(mmioh0.s.enable == 0))
+ pr_info("UV: MMIOH0 disabled\n");
+ else {
+ unsigned long base = uvxy_field(mmioh0, base, 0);
+ int m_io = uvxy_field(mmioh0, m_io, 0);
+ int n_io = uvxy_field(mmioh0, n_io, 0);
+
+ calc_mmioh_map(UVX_MMIOH0, min_pnode, max_pnode,
+ UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_BASE_SHFT,
+ base, m_io, n_io);
+ }
+
+ mmioh1.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1);
+ if (unlikely(mmioh1.s.enable == 0))
+ pr_info("UV: MMIOH1 disabled\n");
+ else {
+ unsigned long base = uvxy_field(mmioh1, base, 0);
+ int m_io = uvxy_field(mmioh1, m_io, 0);
+ int n_io = uvxy_field(mmioh1, n_io, 0);
+
+ calc_mmioh_map(UVX_MMIOH1, min_pnode, max_pnode,
+ UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_BASE_SHFT,
+ base, m_io, n_io);
+ }
+ return;
+ }
- gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR);
- if (gru.s.enable)
- map_high("GRU", gru.s.base, shift, max_pnode, map_wb);
+ /* UV2 flavor */
+ if (UVH_RH_GAM_MMIOH_OVERLAY_CONFIG) {
+ union uvh_rh_gam_mmioh_overlay_config_u mmioh;
+
+ mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG);
+ if (unlikely(mmioh.s2.enable == 0))
+ pr_info("UV: MMIOH disabled\n");
+ else
+ calc_mmioh_map(UV2_MMIOH, min_pnode, max_pnode,
+ UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_BASE_SHFT,
+ mmioh.s2.base, mmioh.s2.m_io, mmioh.s2.n_io);
+ return;
+ }
}
-static __init void map_mmioh_high(int max_pnode)
+static __init void map_low_mmrs(void)
{
- union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
- int shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
+ if (UV_GLOBAL_MMR32_BASE)
+ init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE);
- mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
- if (mmioh.s.enable)
- map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc);
+ if (UV_LOCAL_MMR_BASE)
+ init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE);
}
static __init void uv_rtc_init(void)
@@ -404,248 +1113,708 @@ static __init void uv_rtc_init(void)
long status;
u64 ticks_per_sec;
- status = uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK,
- &ticks_per_sec);
+ status = uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, &ticks_per_sec);
+
if (status != BIOS_STATUS_SUCCESS || ticks_per_sec < 100000) {
- printk(KERN_WARNING
- "unable to determine platform RTC clock frequency, "
- "guessing.\n");
- /* BIOS gives wrong value for clock freq. so guess */
+ pr_warn("UV: unable to determine platform RTC clock frequency, guessing.\n");
+
+ /* BIOS gives wrong value for clock frequency, so guess: */
sn_rtc_cycles_per_second = 1000000000000UL / 30000UL;
- } else
+ } else {
sn_rtc_cycles_per_second = ticks_per_sec;
+ }
+}
+
+/* Direct Legacy VGA I/O traffic to designated IOH */
+static int uv_set_vga_state(struct pci_dev *pdev, bool decode, unsigned int command_bits, u32 flags)
+{
+ int domain, bus, rc;
+
+ if (!(flags & PCI_VGA_STATE_CHANGE_BRIDGE))
+ return 0;
+
+ if ((command_bits & PCI_COMMAND_IO) == 0)
+ return 0;
+
+ domain = pci_domain_nr(pdev->bus);
+ bus = pdev->bus->number;
+
+ rc = uv_bios_set_legacy_vga_target(decode, domain, bus);
+
+ return rc;
}
/*
- * percpu heartbeat timer
+ * Called on each CPU to initialize the per_cpu UV data area.
+ * FIXME: hotplug not supported yet
*/
-static void uv_heartbeat(unsigned long ignored)
+void uv_cpu_init(void)
+{
+ /* CPU 0 initialization will be done via uv_system_init. */
+ if (smp_processor_id() == 0)
+ return;
+
+ uv_hub_info->nr_online_cpus++;
+}
+
+struct mn {
+ unsigned char m_val;
+ unsigned char n_val;
+ unsigned char m_shift;
+ unsigned char n_lshift;
+};
+
+/* Initialize caller's MN struct and fill in values */
+static void get_mn(struct mn *mnp)
{
- struct timer_list *timer = &uv_hub_info->scir.timer;
- unsigned char bits = uv_hub_info->scir.state;
+ memset(mnp, 0, sizeof(*mnp));
+ mnp->n_val = uv_cpuid.n_skt;
+ if (is_uv(UV4|UVY)) {
+ mnp->m_val = 0;
+ mnp->n_lshift = 0;
+ } else if (is_uv3_hub()) {
+ union uvyh_gr0_gam_gr_config_u m_gr_config;
+
+ mnp->m_val = uv_cpuid.m_skt;
+ m_gr_config.v = uv_read_local_mmr(UVH_GR0_GAM_GR_CONFIG);
+ mnp->n_lshift = m_gr_config.s3.m_skt;
+ } else if (is_uv2_hub()) {
+ mnp->m_val = uv_cpuid.m_skt;
+ mnp->n_lshift = mnp->m_val == 40 ? 40 : 39;
+ }
+ mnp->m_shift = mnp->m_val ? 64 - mnp->m_val : 0;
+}
- /* flip heartbeat bit */
- bits ^= SCIR_CPU_HEARTBEAT;
+static void __init uv_init_hub_info(struct uv_hub_info_s *hi)
+{
+ struct mn mn;
+
+ get_mn(&mn);
+ hi->gpa_mask = mn.m_val ?
+ (1UL << (mn.m_val + mn.n_val)) - 1 :
+ (1UL << uv_cpuid.gpa_shift) - 1;
+
+ hi->m_val = mn.m_val;
+ hi->n_val = mn.n_val;
+ hi->m_shift = mn.m_shift;
+ hi->n_lshift = mn.n_lshift ? mn.n_lshift : 0;
+ hi->hub_revision = uv_hub_info->hub_revision;
+ hi->hub_type = uv_hub_info->hub_type;
+ hi->pnode_mask = uv_cpuid.pnode_mask;
+ hi->nasid_shift = uv_cpuid.nasid_shift;
+ hi->min_pnode = _min_pnode;
+ hi->min_socket = _min_socket;
+ hi->node_to_socket = _node_to_socket;
+ hi->pnode_to_socket = _pnode_to_socket;
+ hi->socket_to_node = _socket_to_node;
+ hi->socket_to_pnode = _socket_to_pnode;
+ hi->gr_table_len = _gr_table_len;
+ hi->gr_table = _gr_table;
+
+ uv_cpuid.gnode_shift = max_t(unsigned int, uv_cpuid.gnode_shift, mn.n_val);
+ hi->gnode_extra = (uv_node_id & ~((1 << uv_cpuid.gnode_shift) - 1)) >> 1;
+ if (mn.m_val)
+ hi->gnode_upper = (u64)hi->gnode_extra << mn.m_val;
+
+ if (uv_gp_table) {
+ hi->global_mmr_base = uv_gp_table->mmr_base;
+ hi->global_mmr_shift = uv_gp_table->mmr_shift;
+ hi->global_gru_base = uv_gp_table->gru_base;
+ hi->global_gru_shift = uv_gp_table->gru_shift;
+ hi->gpa_shift = uv_gp_table->gpa_shift;
+ hi->gpa_mask = (1UL << hi->gpa_shift) - 1;
+ } else {
+ hi->global_mmr_base =
+ uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG) &
+ ~UV_MMR_ENABLE;
+ hi->global_mmr_shift = _UV_GLOBAL_MMR64_PNODE_SHIFT;
+ }
- /* is this cpu idle? */
- if (idle_cpu(raw_smp_processor_id()))
- bits &= ~SCIR_CPU_ACTIVITY;
- else
- bits |= SCIR_CPU_ACTIVITY;
+ get_lowmem_redirect(&hi->lowmem_remap_base, &hi->lowmem_remap_top);
+
+ hi->apic_pnode_shift = uv_cpuid.socketid_shift;
- /* update system controller interface reg */
- uv_set_scir_bits(bits);
+ /* Show system specific info: */
+ pr_info("UV: N:%d M:%d m_shift:%d n_lshift:%d\n", hi->n_val, hi->m_val, hi->m_shift, hi->n_lshift);
+ pr_info("UV: gpa_mask/shift:0x%lx/%d pnode_mask:0x%x apic_pns:%d\n", hi->gpa_mask, hi->gpa_shift, hi->pnode_mask, hi->apic_pnode_shift);
+ pr_info("UV: mmr_base/shift:0x%lx/%ld\n", hi->global_mmr_base, hi->global_mmr_shift);
+ if (hi->global_gru_base)
+ pr_info("UV: gru_base/shift:0x%lx/%ld\n",
+ hi->global_gru_base, hi->global_gru_shift);
- /* enable next timer period */
- mod_timer_pinned(timer, jiffies + SCIR_CPU_HB_INTERVAL);
+ pr_info("UV: gnode_upper:0x%lx gnode_extra:0x%x\n", hi->gnode_upper, hi->gnode_extra);
}
-static void __cpuinit uv_heartbeat_enable(int cpu)
+static void __init decode_gam_params(unsigned long ptr)
{
- if (!uv_cpu_hub_info(cpu)->scir.enabled) {
- struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer;
+ uv_gp_table = (struct uv_gam_parameters *)ptr;
- uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY);
- setup_timer(timer, uv_heartbeat, cpu);
- timer->expires = jiffies + SCIR_CPU_HB_INTERVAL;
- add_timer_on(timer, cpu);
- uv_cpu_hub_info(cpu)->scir.enabled = 1;
- }
+ pr_info("UV: GAM Params...\n");
+ pr_info("UV: mmr_base/shift:0x%llx/%d gru_base/shift:0x%llx/%d gpa_shift:%d\n",
+ uv_gp_table->mmr_base, uv_gp_table->mmr_shift,
+ uv_gp_table->gru_base, uv_gp_table->gru_shift,
+ uv_gp_table->gpa_shift);
+}
+
+static void __init decode_gam_rng_tbl(unsigned long ptr)
+{
+ struct uv_gam_range_entry *gre = (struct uv_gam_range_entry *)ptr;
+ unsigned long lgre = 0, gend = 0;
+ int index = 0;
+ int sock_min = INT_MAX, pnode_min = INT_MAX;
+ int sock_max = -1, pnode_max = -1;
+
+ uv_gre_table = gre;
+ for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) {
+ unsigned long size = ((unsigned long)(gre->limit - lgre)
+ << UV_GAM_RANGE_SHFT);
+ int order = 0;
+ char suffix[] = " KMGTPE";
+ int flag = ' ';
+
+ while (size > 9999 && order < sizeof(suffix)) {
+ size /= 1024;
+ order++;
+ }
- /* check boot cpu */
- if (!uv_cpu_hub_info(0)->scir.enabled)
- uv_heartbeat_enable(0);
+ /* adjust max block size to current range start */
+ if (gre->type == 1 || gre->type == 2)
+ if (adj_blksize(lgre))
+ flag = '*';
+
+ if (!index) {
+ pr_info("UV: GAM Range Table...\n");
+ pr_info("UV: # %20s %14s %6s %4s %5s %3s %2s\n", "Range", "", "Size", "Type", "NASID", "SID", "PN");
+ }
+ pr_info("UV: %2d: 0x%014lx-0x%014lx%c %5lu%c %3d %04x %02x %02x\n",
+ index++,
+ (unsigned long)lgre << UV_GAM_RANGE_SHFT,
+ (unsigned long)gre->limit << UV_GAM_RANGE_SHFT,
+ flag, size, suffix[order],
+ gre->type, gre->nasid, gre->sockid, gre->pnode);
+
+ if (gre->type == UV_GAM_RANGE_TYPE_HOLE)
+ gend = (unsigned long)gre->limit << UV_GAM_RANGE_SHFT;
+
+ /* update to next range start */
+ lgre = gre->limit;
+ if (sock_min > gre->sockid)
+ sock_min = gre->sockid;
+ if (sock_max < gre->sockid)
+ sock_max = gre->sockid;
+ if (pnode_min > gre->pnode)
+ pnode_min = gre->pnode;
+ if (pnode_max < gre->pnode)
+ pnode_max = gre->pnode;
+ }
+ _min_socket = sock_min;
+ _max_socket = sock_max;
+ _min_pnode = pnode_min;
+ _max_pnode = pnode_max;
+ _gr_table_len = index;
+
+ pr_info("UV: GRT: %d entries, sockets(min:%x,max:%x), pnodes(min:%x,max:%x), gap_end(%d)\n",
+ index, _min_socket, _max_socket, _min_pnode, _max_pnode, fls64(gend));
}
-#ifdef CONFIG_HOTPLUG_CPU
-static void __cpuinit uv_heartbeat_disable(int cpu)
+/* Walk through UVsystab decoding the fields */
+static int __init decode_uv_systab(void)
{
- if (uv_cpu_hub_info(cpu)->scir.enabled) {
- uv_cpu_hub_info(cpu)->scir.enabled = 0;
- del_timer(&uv_cpu_hub_info(cpu)->scir.timer);
+ struct uv_systab *st;
+ int i;
+
+ /* Get mapped UVsystab pointer */
+ st = uv_systab;
+
+ /* If UVsystab is version 1, there is no extended UVsystab */
+ if (st && st->revision == UV_SYSTAB_VERSION_1)
+ return 0;
+
+ if ((!st) || (st->revision < UV_SYSTAB_VERSION_UV4_LATEST)) {
+ int rev = st ? st->revision : 0;
+
+ pr_err("UV: BIOS UVsystab mismatch, (%x < %x)\n",
+ rev, UV_SYSTAB_VERSION_UV4_LATEST);
+ pr_err("UV: Does not support UV, switch to non-UV x86_64\n");
+ uv_system_type = UV_NONE;
+
+ return -EINVAL;
+ }
+
+ for (i = 0; st->entry[i].type != UV_SYSTAB_TYPE_UNUSED; i++) {
+ unsigned long ptr = st->entry[i].offset;
+
+ if (!ptr)
+ continue;
+
+ /* point to payload */
+ ptr += (unsigned long)st;
+
+ switch (st->entry[i].type) {
+ case UV_SYSTAB_TYPE_GAM_PARAMS:
+ decode_gam_params(ptr);
+ break;
+
+ case UV_SYSTAB_TYPE_GAM_RNG_TBL:
+ decode_gam_rng_tbl(ptr);
+ break;
+
+ case UV_SYSTAB_TYPE_ARCH_TYPE:
+ /* already processed in early startup */
+ break;
+
+ default:
+ pr_err("UV:%s:Unrecognized UV_SYSTAB_TYPE:%d, skipped\n",
+ __func__, st->entry[i].type);
+ break;
+ }
}
- uv_set_cpu_scir_bits(cpu, 0xff);
+ return 0;
}
/*
- * cpu hotplug notifier
+ * Given a bitmask 'bits' representing present blades, numbered
+ * starting at 'base', masking off unused high bits of the blade number
+ * with 'mask', update the minimum and maximum blade numbers that we
+ * have found. (Masking with 'mask' is necessary because of the BIOS
+ * treatment of system partitioning when creating the table we are
+ * interpreting.)
*/
-static __cpuinit int uv_scir_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
+static inline void blade_update_min_max(unsigned long bits, int base, int mask, int *min, int *max)
{
- long cpu = (long)hcpu;
+ int first, last;
- switch (action) {
- case CPU_ONLINE:
- uv_heartbeat_enable(cpu);
- break;
- case CPU_DOWN_PREPARE:
- uv_heartbeat_disable(cpu);
- break;
- default:
- break;
+ if (!bits)
+ return;
+ first = (base + __ffs(bits)) & mask;
+ last = (base + __fls(bits)) & mask;
+
+ if (*min > first)
+ *min = first;
+ if (*max < last)
+ *max = last;
+}
+
+/* Set up physical blade translations from UVH_NODE_PRESENT_TABLE */
+static __init void boot_init_possible_blades(struct uv_hub_info_s *hub_info)
+{
+ unsigned long np;
+ int i, uv_pb = 0;
+ int sock_min = INT_MAX, sock_max = -1, s_mask;
+
+ s_mask = (1 << uv_cpuid.n_skt) - 1;
+
+ if (UVH_NODE_PRESENT_TABLE) {
+ pr_info("UV: NODE_PRESENT_DEPTH = %d\n",
+ UVH_NODE_PRESENT_TABLE_DEPTH);
+ for (i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) {
+ np = uv_read_local_mmr(UVH_NODE_PRESENT_TABLE + i * 8);
+ pr_info("UV: NODE_PRESENT(%d) = 0x%016lx\n", i, np);
+ blade_update_min_max(np, i * 64, s_mask, &sock_min, &sock_max);
+ }
+ }
+ if (UVH_NODE_PRESENT_0) {
+ np = uv_read_local_mmr(UVH_NODE_PRESENT_0);
+ pr_info("UV: NODE_PRESENT_0 = 0x%016lx\n", np);
+ blade_update_min_max(np, 0, s_mask, &sock_min, &sock_max);
+ }
+ if (UVH_NODE_PRESENT_1) {
+ np = uv_read_local_mmr(UVH_NODE_PRESENT_1);
+ pr_info("UV: NODE_PRESENT_1 = 0x%016lx\n", np);
+ blade_update_min_max(np, 64, s_mask, &sock_min, &sock_max);
}
- return NOTIFY_OK;
+
+ /* Only update if we actually found some bits indicating blades present */
+ if (sock_max >= sock_min) {
+ _min_socket = sock_min;
+ _max_socket = sock_max;
+ uv_pb = sock_max - sock_min + 1;
+ }
+ if (uv_possible_blades != uv_pb)
+ uv_possible_blades = uv_pb;
+
+ pr_info("UV: number nodes/possible blades %d (%d - %d)\n",
+ uv_pb, sock_min, sock_max);
}
-static __init void uv_scir_register_cpu_notifier(void)
+static int __init alloc_conv_table(int num_elem, unsigned short **table)
{
- hotcpu_notifier(uv_scir_cpu_notify, 0);
+ int i;
+ size_t bytes;
+
+ bytes = num_elem * sizeof(*table[0]);
+ *table = kmalloc(bytes, GFP_KERNEL);
+ if (WARN_ON_ONCE(!*table))
+ return -ENOMEM;
+ for (i = 0; i < num_elem; i++)
+ ((unsigned short *)*table)[i] = SOCK_EMPTY;
+ return 0;
}
-#else /* !CONFIG_HOTPLUG_CPU */
+/* Remove conversion table if it's 1:1 */
+#define FREE_1_TO_1_TABLE(tbl, min, max, max2) free_1_to_1_table(&tbl, #tbl, min, max, max2)
-static __init void uv_scir_register_cpu_notifier(void)
+static void __init free_1_to_1_table(unsigned short **tp, char *tname, int min, int max, int max2)
{
+ int i;
+ unsigned short *table = *tp;
+
+ if (table == NULL)
+ return;
+ if (max != max2)
+ return;
+ for (i = 0; i < max; i++) {
+ if (i != table[i])
+ return;
+ }
+ kfree(table);
+ *tp = NULL;
+ pr_info("UV: %s is 1:1, conversion table removed\n", tname);
}
-static __init int uv_init_heartbeat(void)
+/*
+ * Build Socket Tables
+ * If the number of nodes is >1 per socket, socket to node table will
+ * contain lowest node number on that socket.
+ */
+static void __init build_socket_tables(void)
{
- int cpu;
+ struct uv_gam_range_entry *gre = uv_gre_table;
+ int nums, numn, nump;
+ int i, lnid, apicid;
+ int minsock = _min_socket;
+ int maxsock = _max_socket;
+ int minpnode = _min_pnode;
+ int maxpnode = _max_pnode;
+
+ if (!gre) {
+ if (is_uv2_hub() || is_uv3_hub()) {
+ pr_info("UV: No UVsystab socket table, ignoring\n");
+ return;
+ }
+ pr_err("UV: Error: UVsystab address translations not available!\n");
+ WARN_ON_ONCE(!gre);
+ return;
+ }
- if (is_uv_system())
- for_each_online_cpu(cpu)
- uv_heartbeat_enable(cpu);
- return 0;
-}
+ numn = num_possible_nodes();
+ nump = maxpnode - minpnode + 1;
+ nums = maxsock - minsock + 1;
+
+ /* Allocate and clear tables */
+ if ((alloc_conv_table(nump, &_pnode_to_socket) < 0)
+ || (alloc_conv_table(nums, &_socket_to_pnode) < 0)
+ || (alloc_conv_table(numn, &_node_to_socket) < 0)
+ || (alloc_conv_table(nums, &_socket_to_node) < 0)) {
+ kfree(_pnode_to_socket);
+ kfree(_socket_to_pnode);
+ kfree(_node_to_socket);
+ return;
+ }
-late_initcall(uv_init_heartbeat);
+ /* Fill in pnode/node/addr conversion list values: */
+ for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) {
+ if (gre->type == UV_GAM_RANGE_TYPE_HOLE)
+ continue;
+ i = gre->sockid - minsock;
+ if (_socket_to_pnode[i] == SOCK_EMPTY)
+ _socket_to_pnode[i] = gre->pnode;
+
+ i = gre->pnode - minpnode;
+ if (_pnode_to_socket[i] == SOCK_EMPTY)
+ _pnode_to_socket[i] = gre->sockid;
+
+ pr_info("UV: sid:%02x type:%d nasid:%04x pn:%02x pn2s:%2x\n",
+ gre->sockid, gre->type, gre->nasid,
+ _socket_to_pnode[gre->sockid - minsock],
+ _pnode_to_socket[gre->pnode - minpnode]);
+ }
+
+ /* Set socket -> node values: */
+ lnid = NUMA_NO_NODE;
+ for (apicid = 0; apicid < ARRAY_SIZE(__apicid_to_node); apicid++) {
+ int nid = __apicid_to_node[apicid];
+ int sockid;
+
+ if ((nid == NUMA_NO_NODE) || (lnid == nid))
+ continue;
+ lnid = nid;
+
+ sockid = apicid >> uv_cpuid.socketid_shift;
+
+ if (_socket_to_node[sockid - minsock] == SOCK_EMPTY)
+ _socket_to_node[sockid - minsock] = nid;
+
+ if (_node_to_socket[nid] == SOCK_EMPTY)
+ _node_to_socket[nid] = sockid;
-#endif /* !CONFIG_HOTPLUG_CPU */
+ pr_info("UV: sid:%02x: apicid:%04x socket:%02d node:%03x s2n:%03x\n",
+ sockid,
+ apicid,
+ _node_to_socket[nid],
+ nid,
+ _socket_to_node[sockid - minsock]);
+ }
+
+ /*
+ * If e.g. socket id == pnode for all pnodes,
+ * system runs faster by removing corresponding conversion table.
+ */
+ FREE_1_TO_1_TABLE(_socket_to_node, _min_socket, nums, numn);
+ FREE_1_TO_1_TABLE(_node_to_socket, _min_socket, nums, numn);
+ FREE_1_TO_1_TABLE(_socket_to_pnode, _min_pnode, nums, nump);
+ FREE_1_TO_1_TABLE(_pnode_to_socket, _min_pnode, nums, nump);
+}
+
+/* Check which reboot to use */
+static void check_efi_reboot(void)
+{
+ /* If EFI reboot not available, use ACPI reboot */
+ if (!efi_enabled(EFI_BOOT))
+ reboot_type = BOOT_ACPI;
+}
/*
- * Called on each cpu to initialize the per_cpu UV data area.
- * FIXME: hotplug not supported yet
+ * User proc fs file handling now deprecated.
+ * Recommend using /sys/firmware/sgi_uv/... instead.
*/
-void __cpuinit uv_cpu_init(void)
+static int __maybe_unused proc_hubbed_show(struct seq_file *file, void *data)
{
- /* CPU 0 initilization will be done via uv_system_init. */
- if (!uv_blade_info)
- return;
+ pr_notice_once("%s: using deprecated /proc/sgi_uv/hubbed, use /sys/firmware/sgi_uv/hub_type\n",
+ current->comm);
+ seq_printf(file, "0x%x\n", uv_hubbed_system);
+ return 0;
+}
- uv_blade_info[uv_numa_blade_id()].nr_online_cpus++;
+static int __maybe_unused proc_hubless_show(struct seq_file *file, void *data)
+{
+ pr_notice_once("%s: using deprecated /proc/sgi_uv/hubless, use /sys/firmware/sgi_uv/hubless\n",
+ current->comm);
+ seq_printf(file, "0x%x\n", uv_hubless_system);
+ return 0;
+}
- if (get_uv_system_type() == UV_NON_UNIQUE_APIC)
- set_x2apic_extra_bits(uv_hub_info->pnode);
+static int __maybe_unused proc_archtype_show(struct seq_file *file, void *data)
+{
+ pr_notice_once("%s: using deprecated /proc/sgi_uv/archtype, use /sys/firmware/sgi_uv/archtype\n",
+ current->comm);
+ seq_printf(file, "%s/%s\n", uv_archtype, oem_table_id);
+ return 0;
}
+static __init void uv_setup_proc_files(int hubless)
+{
+ struct proc_dir_entry *pde;
-void __init uv_system_init(void)
+ pde = proc_mkdir(UV_PROC_NODE, NULL);
+ proc_create_single("archtype", 0, pde, proc_archtype_show);
+ if (hubless)
+ proc_create_single("hubless", 0, pde, proc_hubless_show);
+ else
+ proc_create_single("hubbed", 0, pde, proc_hubbed_show);
+}
+
+/* Initialize UV hubless systems */
+static __init int uv_system_init_hubless(void)
{
- union uvh_si_addr_map_config_u m_n_config;
- union uvh_node_id_u node_id;
- unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
- int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val;
- int gnode_extra, max_pnode = 0;
- unsigned long mmr_base, present, paddr;
- unsigned short pnode_mask;
-
- m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG);
- m_val = m_n_config.s.m_skt;
- n_val = m_n_config.s.n_skt;
- mmr_base =
- uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
- ~UV_MMR_ENABLE;
- pnode_mask = (1 << n_val) - 1;
- node_id.v = uv_read_local_mmr(UVH_NODE_ID);
- gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1;
- gnode_upper = ((unsigned long)gnode_extra << m_val);
- printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n",
- n_val, m_val, gnode_upper, gnode_extra);
-
- printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base);
-
- for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++)
- uv_possible_blades +=
- hweight64(uv_read_local_mmr( UVH_NODE_PRESENT_TABLE + i * 8));
- printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades());
-
- bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
- uv_blade_info = kmalloc(bytes, GFP_KERNEL);
- BUG_ON(!uv_blade_info);
- for (blade = 0; blade < uv_num_possible_blades(); blade++)
- uv_blade_info[blade].memory_nid = -1;
-
- get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size);
-
- bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes();
- uv_node_to_blade = kmalloc(bytes, GFP_KERNEL);
- BUG_ON(!uv_node_to_blade);
- memset(uv_node_to_blade, 255, bytes);
-
- bytes = sizeof(uv_cpu_to_blade[0]) * num_possible_cpus();
- uv_cpu_to_blade = kmalloc(bytes, GFP_KERNEL);
- BUG_ON(!uv_cpu_to_blade);
- memset(uv_cpu_to_blade, 255, bytes);
-
- blade = 0;
- for (i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) {
- present = uv_read_local_mmr(UVH_NODE_PRESENT_TABLE + i * 8);
- for (j = 0; j < 64; j++) {
- if (!test_bit(j, &present))
- continue;
- uv_blade_info[blade].pnode = (i * 64 + j);
- uv_blade_info[blade].nr_possible_cpus = 0;
- uv_blade_info[blade].nr_online_cpus = 0;
- blade++;
- }
+ int rc;
+
+ /* Setup PCH NMI handler */
+ uv_nmi_setup_hubless();
+
+ /* Init kernel/BIOS interface */
+ rc = uv_bios_init();
+ if (rc < 0)
+ return rc;
+
+ /* Process UVsystab */
+ rc = decode_uv_systab();
+ if (rc < 0)
+ return rc;
+
+ /* Set section block size for current node memory */
+ set_block_size();
+
+ /* Create user access node */
+ if (rc >= 0)
+ uv_setup_proc_files(1);
+
+ check_efi_reboot();
+
+ return rc;
+}
+
+static void __init uv_system_init_hub(void)
+{
+ struct uv_hub_info_s hub_info = {0};
+ int bytes, cpu, nodeid, bid;
+ unsigned short min_pnode = USHRT_MAX, max_pnode = 0;
+ char *hub = is_uv5_hub() ? "UV500" :
+ is_uv4_hub() ? "UV400" :
+ is_uv3_hub() ? "UV300" :
+ is_uv2_hub() ? "UV2000/3000" : NULL;
+ struct uv_hub_info_s **uv_hub_info_list_blade;
+
+ if (!hub) {
+ pr_err("UV: Unknown/unsupported UV hub\n");
+ return;
}
+ pr_info("UV: Found %s hub\n", hub);
+
+ map_low_mmrs();
+ /* Get uv_systab for decoding, setup UV BIOS calls */
uv_bios_init();
- uv_bios_get_sn_info(0, &uv_type, &sn_partition_id,
- &sn_coherency_id, &sn_region_size);
+
+ /* If there's a UVsystab problem then abort UV init: */
+ if (decode_uv_systab() < 0) {
+ pr_err("UV: Mangled UVsystab format\n");
+ return;
+ }
+
+ build_socket_tables();
+ build_uv_gr_table();
+ set_block_size();
+ uv_init_hub_info(&hub_info);
+ /* On UV2 or UV3, the blade count may need to come from HW */
+ if (is_uv(UV2|UV3) && !uv_gre_table)
+ boot_init_possible_blades(&hub_info);
+ else
+ /* min/max sockets set in decode_gam_rng_tbl */
+ uv_possible_blades = (_max_socket - _min_socket) + 1;
+
+ /* uv_num_possible_blades() is really the hub count: */
+ pr_info("UV: Found %d hubs, %d nodes, %d CPUs\n", uv_num_possible_blades(), num_possible_nodes(), num_possible_cpus());
+
+ uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id, &sn_region_size, &system_serial_number);
+ hub_info.coherency_domain_number = sn_coherency_id;
uv_rtc_init();
- for_each_present_cpu(cpu) {
- nid = cpu_to_node(cpu);
- pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu));
- blade = boot_pnode_to_blade(pnode);
- lcpu = uv_blade_info[blade].nr_possible_cpus;
- uv_blade_info[blade].nr_possible_cpus++;
-
- /* Any node on the blade, else will contain -1. */
- uv_blade_info[blade].memory_nid = nid;
-
- uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base;
- uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size;
- uv_cpu_hub_info(cpu)->m_val = m_val;
- uv_cpu_hub_info(cpu)->n_val = m_val;
- uv_cpu_hub_info(cpu)->numa_blade_id = blade;
- uv_cpu_hub_info(cpu)->blade_processor_id = lcpu;
- uv_cpu_hub_info(cpu)->pnode = pnode;
- uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
- uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
- uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
- uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra;
- uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
- uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id;
- uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu;
- uv_node_to_blade[nid] = blade;
- uv_cpu_to_blade[cpu] = blade;
- max_pnode = max(pnode, max_pnode);
+ /*
+ * __uv_hub_info_list[] is indexed by node, but there is only
+ * one hub_info structure per blade. First, allocate one
+ * structure per blade. Further down we create a per-node
+ * table (__uv_hub_info_list[]) pointing to hub_info
+ * structures for the correct blade.
+ */
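+ /*
+ * For example, on a (hypothetical) system with two nodes per blade,
+ * __uv_hub_info_list[0] and [1] would both point at the blade 0
+ * structure, [2] and [3] at the blade 1 structure, and so on.
+ */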
+
+ bytes = sizeof(void *) * uv_num_possible_blades();
+ uv_hub_info_list_blade = kzalloc(bytes, GFP_KERNEL);
+ if (WARN_ON_ONCE(!uv_hub_info_list_blade))
+ return;
+
+ bytes = sizeof(struct uv_hub_info_s);
+ for_each_possible_blade(bid) {
+ struct uv_hub_info_s *new_hub;
+
+ /* Allocate & fill new per hub info list */
+ new_hub = (bid == 0) ? &uv_hub_info_node0
+ : kzalloc_node(bytes, GFP_KERNEL, uv_blade_to_node(bid));
+ if (WARN_ON_ONCE(!new_hub)) {
+ /* do not kfree() bid 0, which is statically allocated */
+ while (--bid > 0)
+ kfree(uv_hub_info_list_blade[bid]);
+ kfree(uv_hub_info_list_blade);
+ return;
+ }
- printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, "
- "lcpu %d, blade %d\n",
- cpu, per_cpu(x86_cpu_to_apicid, cpu), pnode, nid,
- lcpu, blade);
+ uv_hub_info_list_blade[bid] = new_hub;
+ *new_hub = hub_info;
+
+ /* Use information from GAM table if available: */
+ if (uv_gre_table)
+ new_hub->pnode = uv_blade_to_pnode(bid);
+ else /* Or fill in during CPU loop: */
+ new_hub->pnode = 0xffff;
+
+ new_hub->numa_blade_id = bid;
+ new_hub->memory_nid = NUMA_NO_NODE;
+ new_hub->nr_possible_cpus = 0;
+ new_hub->nr_online_cpus = 0;
+ }
+
+ /*
+ * Now populate __uv_hub_info_list[] for each node with the
+ * pointer to the struct for the blade it resides on.
+ */
+
+ bytes = sizeof(void *) * num_possible_nodes();
+ __uv_hub_info_list = kzalloc(bytes, GFP_KERNEL);
+ if (WARN_ON_ONCE(!__uv_hub_info_list)) {
+ for_each_possible_blade(bid)
+ /* bid 0 is statically allocated */
+ if (bid != 0)
+ kfree(uv_hub_info_list_blade[bid]);
+ kfree(uv_hub_info_list_blade);
+ return;
}
- /* Add blade/pnode info for nodes without cpus */
- for_each_online_node(nid) {
- if (uv_node_to_blade[nid] >= 0)
+ for_each_node(nodeid)
+ __uv_hub_info_list[nodeid] = uv_hub_info_list_blade[uv_node_to_blade_id(nodeid)];
+
+ /* Initialize per CPU info: */
+ for_each_possible_cpu(cpu) {
+ int apicid = per_cpu(x86_cpu_to_apicid, cpu);
+ unsigned short bid;
+ unsigned short pnode;
+
+ pnode = uv_apicid_to_pnode(apicid);
+ bid = uv_pnode_to_socket(pnode) - _min_socket;
+
+ uv_cpu_info_per(cpu)->p_uv_hub_info = uv_hub_info_list_blade[bid];
+ uv_cpu_info_per(cpu)->blade_cpu_id = uv_cpu_hub_info(cpu)->nr_possible_cpus++;
+ if (uv_cpu_hub_info(cpu)->memory_nid == NUMA_NO_NODE)
+ uv_cpu_hub_info(cpu)->memory_nid = cpu_to_node(cpu);
+
+ if (uv_cpu_hub_info(cpu)->pnode == 0xffff)
+ uv_cpu_hub_info(cpu)->pnode = pnode;
+ }
+
+ for_each_possible_blade(bid) {
+ unsigned short pnode = uv_hub_info_list_blade[bid]->pnode;
+
+ if (pnode == 0xffff)
continue;
- paddr = node_start_pfn(nid) << PAGE_SHIFT;
- paddr = uv_soc_phys_ram_to_gpa(paddr);
- pnode = (paddr >> m_val) & pnode_mask;
- blade = boot_pnode_to_blade(pnode);
- uv_node_to_blade[nid] = blade;
+
+ min_pnode = min(pnode, min_pnode);
max_pnode = max(pnode, max_pnode);
+ pr_info("UV: HUB:%2d pn:%02x nrcpus:%d\n",
+ bid,
+ uv_hub_info_list_blade[bid]->pnode,
+ uv_hub_info_list_blade[bid]->nr_possible_cpus);
}
+ pr_info("UV: min_pnode:%02x max_pnode:%02x\n", min_pnode, max_pnode);
map_gru_high(max_pnode);
- map_mmioh_high(max_pnode);
+ map_mmr_high(max_pnode);
+ map_mmioh_high(min_pnode, max_pnode);
+
+ kfree(uv_hub_info_list_blade);
+ uv_hub_info_list_blade = NULL;
+ uv_nmi_setup();
uv_cpu_init();
- uv_scir_register_cpu_notifier();
- proc_mkdir("sgi_uv", NULL);
+ uv_setup_proc_files(0);
+
+ /* Register Legacy VGA I/O redirection handler: */
+ pci_register_set_vga_state(uv_set_vga_state);
+
+ check_efi_reboot();
}
+
+/*
+ * A different code path is needed to initialize a UV system that does not
+ * have a "UV HUB" (referred to as "hubless").
+ */
+void __init uv_system_init(void)
+{
+ if (likely(!is_uv_system() && !is_uv_hubless(1)))
+ return;
+
+ if (is_uv_system())
+ uv_system_init_hub();
+ else
+ uv_system_init_hubless();
+}
+
+apic_driver(apic_x2apic_uv_x);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 442b5508893f..b37ab1095707 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* -*- linux-c -*-
* APM BIOS driver for Linux
* Copyright 1994-2001 Stephen Rothwell (sfr@canb.auug.org.au)
@@ -5,16 +6,6 @@
* Initial development of this driver was funded by NEC Australia P/L
* and NEC Corporation
*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2, or (at your option) any
- * later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
* October 1995, Rik Faith (faith@cs.unc.edu):
* Minor enhancements and updates (to the patch set) for 1.3.x
* Documentation
@@ -66,7 +57,7 @@
* 1.5: Fix segment register reloading (in case of bad segments saved
* across BIOS call).
* Stephen Rothwell
- * 1.6: Cope with complier/assembler differences.
+ * 1.6: Cope with compiler/assembler differences.
* Only try to turn off the first display device.
* Fix OOPS at power off with no APM BIOS by Jan Echternach
* <echter@informatik.uni-rostock.de>
@@ -103,7 +94,7 @@
* Remove APM dependencies in arch/i386/kernel/process.c
* Remove APM dependencies in drivers/char/sysrq.c
* Reset time across standby.
- * Allow more inititialisation on SMP.
+ * Allow more initialisation on SMP.
* Remove CONFIG_APM_POWER_OFF and make it boot time
* configurable (default on).
* Make debug only a boot time parameter (remove APM_DEBUG).
@@ -140,7 +131,7 @@
* is now the way life works).
* Fix thinko in suspend() (wrong return).
* Notify drivers on critical suspend.
- * Make kapmd absorb more idle time (Pavel Machek <pavel@suse.cz>
+ * Make kapmd absorb more idle time (Pavel Machek <pavel@ucw.cz>
* modified by sfr).
* Disable interrupts while we are suspended (Andy Henroid
* <andy_henroid@yahoo.com> fixed by sfr).
@@ -189,8 +180,8 @@
* Intel Order Number 241704-001. Microsoft Part Number 781-110-X01.
*
* [This document is available free from Intel by calling 800.628.8686 (fax
- * 916.356.6100) or 800.548.4725; or via anonymous ftp from
- * ftp://ftp.intel.com/pub/IAL/software_specs/apmv11.doc. It is also
+ * 916.356.6100) or 800.548.4725; or from
+ * http://www.microsoft.com/whdc/archive/amp_12.mspx It is also
* available from Microsoft by calling 206.882.8080.]
*
* APM 1.2 Reference:
@@ -201,10 +192,11 @@
* http://www.microsoft.com/whdc/archive/amp_12.mspx]
*/
+#define pr_fmt(fmt) "apm: " fmt
+
#include <linux/module.h>
#include <linux/poll.h>
-#include <linux/smp_lock.h>
#include <linux/types.h>
#include <linux/stddef.h>
#include <linux/timer.h>
@@ -217,7 +209,8 @@
#include <linux/apm_bios.h>
#include <linux/init.h>
#include <linux/time.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/cputime.h>
#include <linux/pm.h>
#include <linux/capability.h>
#include <linux/device.h>
@@ -228,28 +221,24 @@
#include <linux/suspend.h>
#include <linux/kthread.h>
#include <linux/jiffies.h>
+#include <linux/acpi.h>
+#include <linux/syscore_ops.h>
+#include <linux/i8253.h>
+#include <linux/cpuidle.h>
-#include <asm/system.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/desc.h>
-#include <asm/i8253.h>
#include <asm/olpc.h>
#include <asm/paravirt.h>
#include <asm/reboot.h>
+#include <asm/nospec-branch.h>
+#include <asm/ibt.h>
#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
extern int (*console_blank_hook)(int);
#endif
/*
- * The apm_bios device is one of the misc char devices.
- * This is its minor number.
- */
-#define APM_MINOR_DEV 134
-
-/*
- * See Documentation/Config.help for the configuration options.
- *
* Various options can be changed at boot time as follows:
* (We allow underscores for compatibility with the modules code)
* apm=on/off enable/disable APM
@@ -366,44 +355,73 @@ struct apm_user {
#endif
#define DEFAULT_IDLE_PERIOD (100 / 3)
+static int apm_cpu_idle(struct cpuidle_device *dev,
+ struct cpuidle_driver *drv, int index);
+
+static struct cpuidle_driver apm_idle_driver = {
+ .name = "apm_idle",
+ .owner = THIS_MODULE,
+ .states = {
+ { /* entry 0 is for polling */ },
+ { /* entry 1 is for APM idle */
+ .name = "APM",
+ .desc = "APM idle",
+ .exit_latency = 250, /* WAG */
+ .target_residency = 500, /* WAG */
+ .enter = &apm_cpu_idle
+ },
+ },
+ .state_count = 2,
+};
+
+static struct cpuidle_device apm_cpuidle_device;
+
/*
* Local variables
*/
-static struct {
+__visible struct {
unsigned long offset;
unsigned short segment;
} apm_bios_entry;
static int clock_slowed;
static int idle_threshold __read_mostly = DEFAULT_IDLE_THRESHOLD;
static int idle_period __read_mostly = DEFAULT_IDLE_PERIOD;
-static int set_pm_idle;
static int suspends_pending;
static int standbys_pending;
static int ignore_sys_suspend;
static int ignore_normal_resume;
static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL;
-static int debug __read_mostly;
-static int smp __read_mostly;
+static bool debug __read_mostly;
+static bool smp __read_mostly;
static int apm_disabled = -1;
#ifdef CONFIG_SMP
-static int power_off;
+static bool power_off;
#else
-static int power_off = 1;
+static bool power_off = 1;
#endif
-static int realmode_power_off;
+static bool realmode_power_off;
#ifdef CONFIG_APM_ALLOW_INTS
-static int allow_ints = 1;
+static bool allow_ints = 1;
#else
-static int allow_ints;
+static bool allow_ints;
#endif
-static int broken_psr;
+static bool broken_psr;
static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue);
static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue);
static struct apm_user *user_list;
static DEFINE_SPINLOCK(user_list_lock);
-static const struct desc_struct bad_bios_desc = { { { 0, 0x00409200 } } };
+static DEFINE_MUTEX(apm_mutex);
+
+/*
+ * Set up a segment that references the real mode segment 0x40
+ * that extends up to the end of page zero (that we have reserved).
+ * This is for buggy BIOSes that refer to (real mode) segment 0x40
+ * even though they are called in protected mode.
+ */
+static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(DESC_DATA32_BIOS,
+ (unsigned long)__va(0x400UL), PAGE_SIZE - 0x400 - 1);
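+/*
+ * Real-mode segment 0x40 corresponds to linear address 0x40 << 4 = 0x400,
+ * so a base of __va(0x400) with a limit of PAGE_SIZE - 0x400 - 1 spans
+ * 0x400..0xfff, i.e. up to the end of page zero as described above.
+ */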
static const char driver_version[] = "1.16ac"; /* no spaces */
@@ -477,11 +495,11 @@ static void apm_error(char *str, int err)
if (error_table[i].key == err)
break;
if (i < ERROR_COUNT)
- printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg);
+ pr_notice("%s: %s\n", str, error_table[i].msg);
else if (err < 0)
- printk(KERN_NOTICE "apm: %s: linux error code %i\n", str, err);
+ pr_notice("%s: linux error code %i\n", str, err);
else
- printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n",
+ pr_notice("%s: unknown error code %#2.2x\n",
str, err);
}
@@ -575,19 +593,24 @@ static long __apm_bios_call(void *_call)
struct desc_struct save_desc_40;
struct desc_struct *gdt;
struct apm_bios_call *call = _call;
+ u64 ibt;
cpu = get_cpu();
BUG_ON(cpu != 0);
- gdt = get_cpu_gdt_table(cpu);
+ gdt = get_cpu_gdt_rw(cpu);
save_desc_40 = gdt[0x40 / 8];
gdt[0x40 / 8] = bad_bios_desc;
apm_irq_save(flags);
+ firmware_restrict_branch_speculation_start();
+ ibt = ibt_save(true);
APM_DO_SAVE_SEGS;
apm_bios_call_asm(call->func, call->ebx, call->ecx,
&call->eax, &call->ebx, &call->ecx, &call->edx,
&call->esi);
APM_DO_RESTORE_SEGS;
+ ibt_restore(ibt);
+ firmware_restrict_branch_speculation_end();
apm_irq_restore(flags);
gdt[0x40 / 8] = save_desc_40;
put_cpu();
@@ -651,18 +674,23 @@ static long __apm_bios_call_simple(void *_call)
struct desc_struct save_desc_40;
struct desc_struct *gdt;
struct apm_bios_call *call = _call;
+ u64 ibt;
cpu = get_cpu();
BUG_ON(cpu != 0);
- gdt = get_cpu_gdt_table(cpu);
+ gdt = get_cpu_gdt_rw(cpu);
save_desc_40 = gdt[0x40 / 8];
gdt[0x40 / 8] = bad_bios_desc;
apm_irq_save(flags);
+ firmware_restrict_branch_speculation_start();
+ ibt = ibt_save(true);
APM_DO_SAVE_SEGS;
error = apm_bios_call_simple_asm(call->func, call->ebx, call->ecx,
&call->eax);
APM_DO_RESTORE_SEGS;
+ ibt_restore(ibt);
+ firmware_restrict_branch_speculation_end();
apm_irq_restore(flags);
gdt[0x40 / 8] = save_desc_40;
put_cpu();
@@ -739,7 +767,7 @@ static int apm_driver_version(u_short *val)
* not cleared until it is acknowledged.
*
* Additional information is returned in the info pointer, providing
- * that APM 1.2 is in use. If no messges are pending the value 0x80
+ * that APM 1.2 is in use. If no messages are pending the value 0x80
* is returned (No power management events pending).
*/
static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info)
@@ -810,24 +838,12 @@ static int apm_do_idle(void)
u32 eax;
u8 ret = 0;
int idled = 0;
- int polling;
int err = 0;
- polling = !!(current_thread_info()->status & TS_POLLING);
- if (polling) {
- current_thread_info()->status &= ~TS_POLLING;
- /*
- * TS_POLLING-cleared state must be visible before we
- * test NEED_RESCHED:
- */
- smp_mb();
- }
if (!need_resched()) {
idled = 1;
ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax, &err);
}
- if (polling)
- current_thread_info()->status |= TS_POLLING;
if (!idled)
return 0;
@@ -874,8 +890,6 @@ static void apm_do_busy(void)
#define IDLE_CALC_LIMIT (HZ * 100)
#define IDLE_LEAKY_MAX 16
-static void (*original_pm_idle)(void) __read_mostly;
-
/**
* apm_cpu_idle - cpu idling for APM capable Linux
*
@@ -884,34 +898,36 @@ static void (*original_pm_idle)(void) __read_mostly;
* Furthermore it calls the system default idle routine.
*/
-static void apm_cpu_idle(void)
+static int apm_cpu_idle(struct cpuidle_device *dev,
+ struct cpuidle_driver *drv, int index)
{
static int use_apm_idle; /* = 0 */
static unsigned int last_jiffies; /* = 0 */
- static unsigned int last_stime; /* = 0 */
+ static u64 last_stime; /* = 0 */
+ u64 stime, utime;
int apm_idle_done = 0;
unsigned int jiffies_since_last_check = jiffies - last_jiffies;
unsigned int bucket;
recalc:
+ task_cputime(current, &utime, &stime);
if (jiffies_since_last_check > IDLE_CALC_LIMIT) {
use_apm_idle = 0;
- last_jiffies = jiffies;
- last_stime = current->stime;
} else if (jiffies_since_last_check > idle_period) {
unsigned int idle_percentage;
- idle_percentage = current->stime - last_stime;
+ idle_percentage = nsecs_to_jiffies(stime - last_stime);
idle_percentage *= 100;
idle_percentage /= jiffies_since_last_check;
use_apm_idle = (idle_percentage > idle_threshold);
if (apm_info.forbid_idle)
use_apm_idle = 0;
- last_jiffies = jiffies;
- last_stime = current->stime;
}
+ last_jiffies = jiffies;
+ last_stime = stime;
+
bucket = IDLE_LEAKY_MAX;
while (!need_resched()) {
@@ -939,10 +955,7 @@ recalc:
break;
}
}
- if (original_pm_idle)
- original_pm_idle();
- else
- default_idle();
+ default_idle();
local_irq_disable();
jiffies_since_last_check = jiffies - last_jiffies;
if (jiffies_since_last_check > idle_period)
@@ -952,7 +965,7 @@ recalc:
if (apm_idle_done)
apm_do_busy();
- local_irq_enable();
+ return index;
}
/**
@@ -967,20 +980,10 @@ recalc:
static void apm_power_off(void)
{
- unsigned char po_bios_call[] = {
- 0xb8, 0x00, 0x10, /* movw $0x1000,ax */
- 0x8e, 0xd0, /* movw ax,ss */
- 0xbc, 0x00, 0xf0, /* movw $0xf000,sp */
- 0xb8, 0x07, 0x53, /* movw $0x5307,ax */
- 0xbb, 0x01, 0x00, /* movw $0x0001,bx */
- 0xb9, 0x03, 0x00, /* movw $0x0003,cx */
- 0xcd, 0x15 /* int $0x15 */
- };
-
/* Some bioses don't like being called from CPU != 0 */
if (apm_info.realmode_power_off) {
set_cpus_allowed_ptr(current, cpumask_of(0));
- machine_real_restart(po_bios_call, sizeof(po_bios_call));
+ machine_real_restart(MRR_APM);
} else {
(void)set_system_power_state(APM_STATE_OFF);
}
@@ -1023,7 +1026,7 @@ static int apm_enable_power_management(int enable)
* status which gives the rough battery status, and current power
* source. The bat value returned give an estimate as a percentage
* of life and a status value for the battery. The estimated life
- * if reported is a lifetime in secodnds/minutes at current powwer
+ * if reported is a lifetime in seconds/minutes at current power
* consumption.
*/
@@ -1037,8 +1040,11 @@ static int apm_get_power_status(u_short *status, u_short *bat, u_short *life)
if (apm_info.get_power_status_broken)
return APM_32_UNSUPPORTED;
- if (apm_bios_call(&call))
+ if (apm_bios_call(&call)) {
+ if (!call.err)
+ return APM_NO_ERROR;
return call.err;
+ }
*status = call.ebx;
*bat = call.ecx;
if (apm_info.get_power_status_swabinminutes) {
@@ -1049,41 +1055,12 @@ static int apm_get_power_status(u_short *status, u_short *bat, u_short *life)
return APM_SUCCESS;
}
-#if 0
-static int apm_get_battery_status(u_short which, u_short *status,
- u_short *bat, u_short *life, u_short *nbat)
-{
- u32 eax;
- u32 ebx;
- u32 ecx;
- u32 edx;
- u32 esi;
-
- if (apm_info.connection_version < 0x0102) {
- /* pretend we only have one battery. */
- if (which != 1)
- return APM_BAD_DEVICE;
- *nbat = 1;
- return apm_get_power_status(status, bat, life);
- }
-
- if (apm_bios_call(APM_FUNC_GET_STATUS, (0x8000 | (which)), 0, &eax,
- &ebx, &ecx, &edx, &esi))
- return (eax >> 8) & 0xff;
- *status = ebx;
- *bat = ecx;
- *life = edx;
- *nbat = esi;
- return APM_SUCCESS;
-}
-#endif
-
/**
* apm_engage_power_management - enable PM on a device
* @device: identity of device
* @enable: on/off
*
- * Activate or deactive power management on either a specific device
+ * Activate or deactivate power management on either a specific device
* or the entire system (%APM_DEVICE_ALL).
*/
@@ -1185,7 +1162,7 @@ static void queue_event(apm_event_t event, struct apm_user *sender)
static int notified;
if (notified++ == 0)
- printk(KERN_ERR "apm: an event queue overflowed\n");
+ pr_err("an event queue overflowed\n");
if (++as->event_tail >= APM_MAX_EVENTS)
as->event_tail = 0;
}
@@ -1216,15 +1193,15 @@ static void reinit_timer(void)
#ifdef INIT_TIMER_AFTER_SUSPEND
unsigned long flags;
- spin_lock_irqsave(&i8253_lock, flags);
+ raw_spin_lock_irqsave(&i8253_lock, flags);
/* set the clock to HZ */
- outb_pit(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
+ outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
udelay(10);
- outb_pit(LATCH & 0xff, PIT_CH0); /* LSB */
+ outb_p(LATCH & 0xff, PIT_CH0); /* LSB */
udelay(10);
- outb_pit(LATCH >> 8, PIT_CH0); /* MSB */
+ outb_p(LATCH >> 8, PIT_CH0); /* MSB */
udelay(10);
- spin_unlock_irqrestore(&i8253_lock, flags);
+ raw_spin_unlock_irqrestore(&i8253_lock, flags);
#endif
}
@@ -1234,11 +1211,10 @@ static int suspend(int vetoable)
struct apm_user *as;
dpm_suspend_start(PMSG_SUSPEND);
-
- dpm_suspend_noirq(PMSG_SUSPEND);
+ dpm_suspend_end(PMSG_SUSPEND);
local_irq_disable();
- sysdev_suspend(PMSG_SUSPEND);
+ syscore_suspend();
local_irq_enable();
@@ -1256,12 +1232,12 @@ static int suspend(int vetoable)
apm_error("suspend", err);
err = (err == APM_SUCCESS) ? 0 : -EIO;
- sysdev_resume();
+ syscore_resume();
local_irq_enable();
- dpm_resume_noirq(PMSG_RESUME);
-
+ dpm_resume_start(PMSG_RESUME);
dpm_resume_end(PMSG_RESUME);
+
queue_event(APM_NORMAL_RESUME, NULL);
spin_lock(&user_list_lock);
for (as = user_list; as != NULL; as = as->next) {
@@ -1277,10 +1253,10 @@ static void standby(void)
{
int err;
- dpm_suspend_noirq(PMSG_SUSPEND);
+ dpm_suspend_end(PMSG_SUSPEND);
local_irq_disable();
- sysdev_suspend(PMSG_SUSPEND);
+ syscore_suspend();
local_irq_enable();
err = set_system_power_state(APM_STATE_STANDBY);
@@ -1288,10 +1264,10 @@ static void standby(void)
apm_error("standby", err);
local_irq_disable();
- sysdev_resume();
+ syscore_resume();
local_irq_enable();
- dpm_resume_noirq(PMSG_RESUME);
+ dpm_resume_start(PMSG_RESUME);
}
static apm_event_t get_event(void)
@@ -1449,7 +1425,7 @@ static void apm_mainloop(void)
static int check_apm_user(struct apm_user *as, const char *func)
{
if (as == NULL || as->magic != APM_BIOS_MAGIC) {
- printk(KERN_ERR "apm: %s passed bad filp\n", func);
+ pr_err("%s passed bad filp\n", func);
return 1;
}
return 0;
@@ -1498,7 +1474,7 @@ static ssize_t do_read(struct file *fp, char __user *buf, size_t count, loff_t *
return 0;
}
-static unsigned int do_poll(struct file *fp, poll_table *wait)
+static __poll_t do_poll(struct file *fp, poll_table *wait)
{
struct apm_user *as;
@@ -1507,7 +1483,7 @@ static unsigned int do_poll(struct file *fp, poll_table *wait)
return 0;
poll_wait(fp, &apm_waitqueue, wait);
if (!queue_empty(as))
- return POLLIN | POLLRDNORM;
+ return EPOLLIN | EPOLLRDNORM;
return 0;
}
@@ -1523,7 +1499,7 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
return -EPERM;
switch (cmd) {
case APM_IOC_STANDBY:
- lock_kernel();
+ mutex_lock(&apm_mutex);
if (as->standbys_read > 0) {
as->standbys_read--;
as->standbys_pending--;
@@ -1532,10 +1508,10 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
queue_event(APM_USER_STANDBY, as);
if (standbys_pending <= 0)
standby();
- unlock_kernel();
+ mutex_unlock(&apm_mutex);
break;
case APM_IOC_SUSPEND:
- lock_kernel();
+ mutex_lock(&apm_mutex);
if (as->suspends_read > 0) {
as->suspends_read--;
as->suspends_pending--;
@@ -1544,13 +1520,14 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
queue_event(APM_USER_SUSPEND, as);
if (suspends_pending <= 0) {
ret = suspend(1);
+ mutex_unlock(&apm_mutex);
} else {
as->suspend_wait = 1;
+ mutex_unlock(&apm_mutex);
wait_event_interruptible(apm_suspend_waitqueue,
as->suspend_wait == 0);
ret = as->suspend_result;
}
- unlock_kernel();
return ret;
default:
return -ENOTTY;
@@ -1587,7 +1564,7 @@ static int do_release(struct inode *inode, struct file *filp)
as1 = as1->next)
;
if (as1 == NULL)
- printk(KERN_ERR "apm: filp not in user list\n");
+ pr_err("filp not in user list\n");
else
as1->next = as->next;
}
@@ -1600,14 +1577,10 @@ static int do_open(struct inode *inode, struct file *filp)
{
struct apm_user *as;
- lock_kernel();
as = kmalloc(sizeof(*as), GFP_KERNEL);
- if (as == NULL) {
- printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n",
- sizeof(*as));
- unlock_kernel();
+ if (as == NULL)
return -ENOMEM;
- }
+
as->magic = APM_BIOS_MAGIC;
as->event_tail = as->event_head = 0;
as->suspends_pending = as->standbys_pending = 0;
@@ -1627,10 +1600,10 @@ static int do_open(struct inode *inode, struct file *filp)
user_list = as;
spin_unlock(&user_list_lock);
filp->private_data = as;
- unlock_kernel();
return 0;
}
+#ifdef CONFIG_PROC_FS
static int proc_apm_show(struct seq_file *m, void *v)
{
unsigned short bx;
@@ -1710,19 +1683,7 @@ static int proc_apm_show(struct seq_file *m, void *v)
units);
return 0;
}
-
-static int proc_apm_open(struct inode *inode, struct file *file)
-{
- return single_open(file, proc_apm_show, NULL);
-}
-
-static const struct file_operations apm_file_ops = {
- .owner = THIS_MODULE,
- .open = proc_apm_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+#endif
static int apm(void *unused)
{
@@ -1920,6 +1881,7 @@ static const struct file_operations apm_bios_fops = {
.unlocked_ioctl = do_ioctl,
.open = do_open,
.release = do_release,
+ .llseek = noop_llseek,
};
static struct miscdevice apm_device = {
@@ -1986,8 +1948,8 @@ static int __init apm_is_horked_d850md(const struct dmi_system_id *d)
apm_info.disabled = 1;
printk(KERN_INFO "%s machine detected. "
"Disabling APM.\n", d->ident);
- printk(KERN_INFO "This bug is fixed in bios P15 which is available for \n");
- printk(KERN_INFO "download from support.intel.com \n");
+ printk(KERN_INFO "This bug is fixed in bios P15 which is available for\n");
+ printk(KERN_INFO "download from support.intel.com\n");
}
return 0;
}
@@ -2038,7 +2000,7 @@ static int __init swab_apm_power_in_minutes(const struct dmi_system_id *d)
return 0;
}
-static struct dmi_system_id __initdata apm_dmi_table[] = {
+static const struct dmi_system_id apm_dmi_table[] __initconst = {
{
print_if_true,
KERN_WARNING "IBM T23 - BIOS 1.03b+ and controller firmware 1.02+ may be needed for Linux APM.",
@@ -2266,7 +2228,7 @@ static int __init apm_init(void)
dmi_check_system(apm_dmi_table);
- if (apm_info.bios.version == 0 || paravirt_enabled() || machine_is_olpc()) {
+ if (apm_info.bios.version == 0 || machine_is_olpc()) {
printk(KERN_INFO "apm: BIOS not found.\n");
return -ENODEV;
}
@@ -2316,29 +2278,19 @@ static int __init apm_init(void)
}
if (apm_info.disabled) {
- printk(KERN_NOTICE "apm: disabled on user request.\n");
+ pr_notice("disabled on user request.\n");
return -ENODEV;
}
if ((num_online_cpus() > 1) && !power_off && !smp) {
- printk(KERN_NOTICE "apm: disabled - APM is not SMP safe.\n");
+ pr_notice("disabled - APM is not SMP safe.\n");
apm_info.disabled = 1;
return -ENODEV;
}
- if (pm_flags & PM_ACPI) {
- printk(KERN_NOTICE "apm: overridden by ACPI.\n");
+ if (!acpi_disabled) {
+ pr_notice("overridden by ACPI.\n");
apm_info.disabled = 1;
return -ENODEV;
}
- pm_flags |= PM_APM;
-
- /*
- * Set up a segment that references the real mode segment 0x40
- * that extends up to the end of page zero (that we have reserved).
- * This is for buggy BIOS's that refer to (real mode) segment 0x40
- * even though they are called in protected mode.
- */
- set_base(bad_bios_desc, __va((unsigned long)0x40 << 4));
- _set_limit((char *)&bad_bios_desc, 4095 - (0x40 << 4));
/*
* Set up the long jump entry point to the APM BIOS, which is called
@@ -2357,20 +2309,19 @@ static int __init apm_init(void)
* Note we only set APM segments on CPU zero, since we pin the APM
* code to that CPU.
*/
- gdt = get_cpu_gdt_table(0);
- set_base(gdt[APM_CS >> 3],
- __va((unsigned long)apm_info.bios.cseg << 4));
- set_base(gdt[APM_CS_16 >> 3],
- __va((unsigned long)apm_info.bios.cseg_16 << 4));
- set_base(gdt[APM_DS >> 3],
- __va((unsigned long)apm_info.bios.dseg << 4));
+ gdt = get_cpu_gdt_rw(0);
+ set_desc_base(&gdt[APM_CS >> 3],
+ (unsigned long)__va((unsigned long)apm_info.bios.cseg << 4));
+ set_desc_base(&gdt[APM_CS_16 >> 3],
+ (unsigned long)__va((unsigned long)apm_info.bios.cseg_16 << 4));
+ set_desc_base(&gdt[APM_DS >> 3],
+ (unsigned long)__va((unsigned long)apm_info.bios.dseg << 4));
- proc_create("apm", 0, NULL, &apm_file_ops);
+ proc_create_single("apm", 0, NULL, proc_apm_show);
kapmd_task = kthread_create(apm, NULL, "kapmd");
if (IS_ERR(kapmd_task)) {
- printk(KERN_ERR "apm: disabled - Unable to start kernel "
- "thread.\n");
+ pr_err("disabled - Unable to start kernel thread\n");
err = PTR_ERR(kapmd_task);
kapmd_task = NULL;
remove_proc_entry("apm", NULL);
@@ -2395,9 +2346,10 @@ static int __init apm_init(void)
if (HZ != 100)
idle_period = (idle_period * HZ) / 100;
if (idle_threshold < 100) {
- original_pm_idle = pm_idle;
- pm_idle = apm_cpu_idle;
- set_pm_idle = 1;
+ cpuidle_poll_state_init(&apm_idle_driver);
+ if (!cpuidle_register_driver(&apm_idle_driver))
+ if (cpuidle_register_device(&apm_cpuidle_device))
+ cpuidle_unregister_driver(&apm_idle_driver);
}
return 0;
@@ -2407,15 +2359,9 @@ static void __exit apm_exit(void)
{
int error;
- if (set_pm_idle) {
- pm_idle = original_pm_idle;
- /*
- * We are about to unload the current idle thread pm callback
- * (pm_idle), Wait for all processors to update cached/local
- * copies of pm_idle before proceeding.
- */
- cpu_idle_wait();
- }
+ cpuidle_unregister_device(&apm_cpuidle_device);
+ cpuidle_unregister_driver(&apm_idle_driver);
+
if (((apm_info.bios.flags & APM_BIOS_DISENGAGED) == 0)
&& (apm_info.connection_version > 0x0100)) {
error = apm_engage_power_management(APM_DEVICE_ALL, 0);
@@ -2430,7 +2376,6 @@ static void __exit apm_exit(void)
kthread_stop(kapmd_task);
kapmd_task = NULL;
}
- pm_flags &= ~PM_APM;
}
module_init(apm_init);
@@ -2458,7 +2403,7 @@ MODULE_PARM_DESC(idle_threshold,
"System idle percentage above which to make APM BIOS idle calls");
module_param(idle_period, int, 0444);
MODULE_PARM_DESC(idle_period,
- "Period (in sec/100) over which to caculate the idle percentage");
+ "Period (in sec/100) over which to calculate the idle percentage");
module_param(smp, bool, 0444);
MODULE_PARM_DESC(smp,
"Set this to enable APM use on an SMP platform. Use with caution on older systems");
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index cfa82c899f47..25fcde525c68 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -1,5 +1,130 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Generate definitions needed by assembly language modules.
+ * This code generates raw asm output which is post-processed to extract
+ * and format the required data.
+ */
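+/*
+ * For example, OFFSET(TASK_threadsp, task_struct, thread.sp) below emits an
+ * assembler ".ascii" marker containing "->TASK_threadsp" and the computed
+ * offsetof() value; the Kbuild post-processing step turns those markers into
+ * #define lines in the generated asm-offsets.h used by assembly code.
+ */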
+#define COMPILE_OFFSETS
+
+#include <linux/crypto.h>
+#include <crypto/aria.h>
+#include <linux/sched.h>
+#include <linux/stddef.h>
+#include <linux/hardirq.h>
+#include <linux/suspend.h>
+#include <linux/kbuild.h>
+#include <asm/processor.h>
+#include <asm/thread_info.h>
+#include <asm/sigframe.h>
+#include <asm/bootparam.h>
+#include <asm/suspend.h>
+#include <asm/tlbflush.h>
+#include <asm/tdx.h>
+
+#ifdef CONFIG_XEN
+#include <xen/interface/xen.h>
+#endif
+
#ifdef CONFIG_X86_32
# include "asm-offsets_32.c"
#else
# include "asm-offsets_64.c"
#endif
+
+static void __used common(void)
+{
+ OFFSET(CPUINFO_x86, cpuinfo_x86, x86);
+ OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor);
+ OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model);
+ OFFSET(CPUINFO_x86_stepping, cpuinfo_x86, x86_stepping);
+ OFFSET(CPUINFO_cpuid_level, cpuinfo_x86, cpuid_level);
+ OFFSET(CPUINFO_x86_capability, cpuinfo_x86, x86_capability);
+ OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id);
+
+ BLANK();
+ OFFSET(TASK_threadsp, task_struct, thread.sp);
+#ifdef CONFIG_STACKPROTECTOR
+ OFFSET(TASK_stack_canary, task_struct, stack_canary);
+#endif
+
+ BLANK();
+ OFFSET(pbe_address, pbe, address);
+ OFFSET(pbe_orig_address, pbe, orig_address);
+ OFFSET(pbe_next, pbe, next);
+
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+ BLANK();
+ OFFSET(IA32_SIGCONTEXT_ax, sigcontext_32, ax);
+ OFFSET(IA32_SIGCONTEXT_bx, sigcontext_32, bx);
+ OFFSET(IA32_SIGCONTEXT_cx, sigcontext_32, cx);
+ OFFSET(IA32_SIGCONTEXT_dx, sigcontext_32, dx);
+ OFFSET(IA32_SIGCONTEXT_si, sigcontext_32, si);
+ OFFSET(IA32_SIGCONTEXT_di, sigcontext_32, di);
+ OFFSET(IA32_SIGCONTEXT_bp, sigcontext_32, bp);
+ OFFSET(IA32_SIGCONTEXT_sp, sigcontext_32, sp);
+ OFFSET(IA32_SIGCONTEXT_ip, sigcontext_32, ip);
+
+ BLANK();
+ OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext);
+#endif
+
+#ifdef CONFIG_XEN
+ BLANK();
+ OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
+ OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
+ OFFSET(XEN_vcpu_info_arch_cr2, vcpu_info, arch.cr2);
+#endif
+
+ BLANK();
+ OFFSET(TDX_MODULE_rcx, tdx_module_args, rcx);
+ OFFSET(TDX_MODULE_rdx, tdx_module_args, rdx);
+ OFFSET(TDX_MODULE_r8, tdx_module_args, r8);
+ OFFSET(TDX_MODULE_r9, tdx_module_args, r9);
+ OFFSET(TDX_MODULE_r10, tdx_module_args, r10);
+ OFFSET(TDX_MODULE_r11, tdx_module_args, r11);
+ OFFSET(TDX_MODULE_r12, tdx_module_args, r12);
+ OFFSET(TDX_MODULE_r13, tdx_module_args, r13);
+ OFFSET(TDX_MODULE_r14, tdx_module_args, r14);
+ OFFSET(TDX_MODULE_r15, tdx_module_args, r15);
+ OFFSET(TDX_MODULE_rbx, tdx_module_args, rbx);
+ OFFSET(TDX_MODULE_rdi, tdx_module_args, rdi);
+ OFFSET(TDX_MODULE_rsi, tdx_module_args, rsi);
+
+ BLANK();
+ OFFSET(BP_scratch, boot_params, scratch);
+ OFFSET(BP_secure_boot, boot_params, secure_boot);
+ OFFSET(BP_loadflags, boot_params, hdr.loadflags);
+ OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
+ OFFSET(BP_version, boot_params, hdr.version);
+ OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
+ OFFSET(BP_init_size, boot_params, hdr.init_size);
+ OFFSET(BP_pref_address, boot_params, hdr.pref_address);
+
+ BLANK();
+ DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
+ OFFSET(C_PTREGS_SIZE, pt_regs, orig_ax);
+
+ /* TLB state for the entry code */
+ OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask);
+
+ /* Layout info for cpu_entry_area */
+ OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page);
+ DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack));
+ DEFINE(MASK_entry_stack, (~(sizeof(struct entry_stack) - 1)));
+
+ /* Offset for fields in tss_struct */
+ OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
+ OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
+ OFFSET(TSS_sp2, tss_struct, x86_tss.sp2);
+#if IS_ENABLED(CONFIG_CRYPTO_ARIA_AESNI_AVX_X86_64)
+ /* Offset for fields in aria_ctx */
+ BLANK();
+ OFFSET(ARIA_CTX_enc_key, aria_ctx, enc_key);
+ OFFSET(ARIA_CTX_dec_key, aria_ctx, dec_key);
+ OFFSET(ARIA_CTX_rounds, aria_ctx, rounds);
+#endif
+
+ BLANK();
+ DEFINE(ALT_INSTR_SIZE, sizeof(struct alt_instr));
+ DEFINE(EXTABLE_SIZE, sizeof(struct exception_table_entry));
+}
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index dfdbf6403895..e0a292db97b2 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -1,71 +1,17 @@
-/*
- * Generate definitions needed by assembly language modules.
- * This code generates raw asm output which is post-processed
- * to extract and format the required data.
- */
-
-#include <linux/crypto.h>
-#include <linux/sched.h>
-#include <linux/signal.h>
-#include <linux/personality.h>
-#include <linux/suspend.h>
-#include <linux/kbuild.h>
-#include <asm/ucontext.h>
-#include <asm/sigframe.h>
-#include <asm/pgtable.h>
-#include <asm/fixmap.h>
-#include <asm/processor.h>
-#include <asm/thread_info.h>
-#include <asm/bootparam.h>
-#include <asm/elf.h>
-#include <asm/suspend.h>
+// SPDX-License-Identifier: GPL-2.0
+#ifndef __LINUX_KBUILD_H
+# error "Please do not build this file directly, build asm-offsets.c instead"
+#endif
-#include <xen/interface/xen.h>
+#include <linux/efi.h>
-#include <linux/lguest.h>
-#include "../../../drivers/lguest/lg.h"
+#include <asm/ucontext.h>
/* workaround for a warning with -Wmissing-prototypes */
void foo(void);
void foo(void)
{
- OFFSET(IA32_SIGCONTEXT_ax, sigcontext, ax);
- OFFSET(IA32_SIGCONTEXT_bx, sigcontext, bx);
- OFFSET(IA32_SIGCONTEXT_cx, sigcontext, cx);
- OFFSET(IA32_SIGCONTEXT_dx, sigcontext, dx);
- OFFSET(IA32_SIGCONTEXT_si, sigcontext, si);
- OFFSET(IA32_SIGCONTEXT_di, sigcontext, di);
- OFFSET(IA32_SIGCONTEXT_bp, sigcontext, bp);
- OFFSET(IA32_SIGCONTEXT_sp, sigcontext, sp);
- OFFSET(IA32_SIGCONTEXT_ip, sigcontext, ip);
- BLANK();
-
- OFFSET(CPUINFO_x86, cpuinfo_x86, x86);
- OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor);
- OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model);
- OFFSET(CPUINFO_x86_mask, cpuinfo_x86, x86_mask);
- OFFSET(CPUINFO_hard_math, cpuinfo_x86, hard_math);
- OFFSET(CPUINFO_cpuid_level, cpuinfo_x86, cpuid_level);
- OFFSET(CPUINFO_x86_capability, cpuinfo_x86, x86_capability);
- OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id);
- BLANK();
-
- OFFSET(TI_task, thread_info, task);
- OFFSET(TI_exec_domain, thread_info, exec_domain);
- OFFSET(TI_flags, thread_info, flags);
- OFFSET(TI_status, thread_info, status);
- OFFSET(TI_preempt_count, thread_info, preempt_count);
- OFFSET(TI_addr_limit, thread_info, addr_limit);
- OFFSET(TI_restart_block, thread_info, restart_block);
- OFFSET(TI_sysenter_return, thread_info, sysenter_return);
- OFFSET(TI_cpu, thread_info, cpu);
- BLANK();
-
- OFFSET(GDS_size, desc_ptr, size);
- OFFSET(GDS_address, desc_ptr, address);
- BLANK();
-
OFFSET(PT_EBX, pt_regs, bx);
OFFSET(PT_ECX, pt_regs, cx);
OFFSET(PT_EDX, pt_regs, dx);
@@ -85,67 +31,19 @@ void foo(void)
OFFSET(PT_OLDSS, pt_regs, ss);
BLANK();
- OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
- OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
- BLANK();
-
- OFFSET(pbe_address, pbe, address);
- OFFSET(pbe_orig_address, pbe, orig_address);
- OFFSET(pbe_next, pbe, next);
-
- /* Offset from the sysenter stack to tss.sp0 */
- DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
- sizeof(struct tss_struct));
-
- DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
- DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT);
- DEFINE(PTRS_PER_PTE, PTRS_PER_PTE);
- DEFINE(PTRS_PER_PMD, PTRS_PER_PMD);
- DEFINE(PTRS_PER_PGD, PTRS_PER_PGD);
-
- OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
-
-#ifdef CONFIG_PARAVIRT
+ OFFSET(saved_context_gdt_desc, saved_context, gdt_desc);
BLANK();
- OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
- OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
- OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
- OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
- OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
- OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
- OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
- OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
-#endif
-#ifdef CONFIG_XEN
- BLANK();
- OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
- OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
-#endif
-
-#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
- BLANK();
- OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
- OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
- OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
-
- BLANK();
- OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
- OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc);
- OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3);
- OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp);
- OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages,state.guest_gdt_desc);
- OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages,state.guest_idt_desc);
- OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt);
- OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum);
- OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
- OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
-#endif
+ /*
+ * Offset from the entry stack to task stack stored in TSS. Kernel entry
+ * happens on the per-cpu entry-stack, and the asm code switches to the
+ * task-stack pointer stored in x86_tss.sp1, which is a copy of
+ * task->thread.sp0 where entry code can find it.
+ */
+ DEFINE(TSS_entry2task_stack,
+ offsetof(struct cpu_entry_area, tss.x86_tss.sp1) -
+ offsetofend(struct cpu_entry_area, entry_stack_page.stack));
BLANK();
- OFFSET(BP_scratch, boot_params, scratch);
- OFFSET(BP_loadflags, boot_params, hdr.loadflags);
- OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
- OFFSET(BP_version, boot_params, hdr.version);
- OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
+ DEFINE(EFI_svam, offsetof(efi_runtime_services_t, set_virtual_address_map));
}
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 898ecc47e129..590b6cd0eac0 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -1,93 +1,31 @@
-/*
- * Generate definitions needed by assembly language modules.
- * This code generates raw asm output which is post-processed to extract
- * and format the required data.
- */
+// SPDX-License-Identifier: GPL-2.0
+#ifndef __LINUX_KBUILD_H
+# error "Please do not build this file directly, build asm-offsets.c instead"
+#endif
-#include <linux/crypto.h>
-#include <linux/sched.h>
-#include <linux/stddef.h>
-#include <linux/errno.h>
-#include <linux/hardirq.h>
-#include <linux/suspend.h>
-#include <linux/kbuild.h>
-#include <asm/processor.h>
-#include <asm/segment.h>
-#include <asm/thread_info.h>
#include <asm/ia32.h>
-#include <asm/bootparam.h>
-#include <asm/suspend.h>
-
-#include <xen/interface/xen.h>
-
-#include <asm/sigframe.h>
-#define __NO_STUBS 1
-#undef __SYSCALL
-#undef _ASM_X86_UNISTD_64_H
-#define __SYSCALL(nr, sym) [nr] = 1,
-static char syscalls[] = {
-#include <asm/unistd.h>
-};
+#if defined(CONFIG_KVM_GUEST)
+#include <asm/kvm_para.h>
+#endif
int main(void)
{
-#define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry))
- ENTRY(state);
- ENTRY(flags);
- ENTRY(pid);
- BLANK();
-#undef ENTRY
-#define ENTRY(entry) DEFINE(TI_ ## entry, offsetof(struct thread_info, entry))
- ENTRY(flags);
- ENTRY(addr_limit);
- ENTRY(preempt_count);
- ENTRY(status);
-#ifdef CONFIG_IA32_EMULATION
- ENTRY(sysenter_return);
-#endif
- BLANK();
-#undef ENTRY
#ifdef CONFIG_PARAVIRT
+#ifdef CONFIG_PARAVIRT_XXL
+#ifdef CONFIG_DEBUG_ENTRY
+ OFFSET(PV_IRQ_save_fl, paravirt_patch_template, irq.save_fl);
+#endif
+#endif
BLANK();
- OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
- OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
- OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
- OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
- OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
- OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame);
- OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
- OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32);
- OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
- OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
- OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
- OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
#endif
-
-#ifdef CONFIG_IA32_EMULATION
-#define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry))
- ENTRY(ax);
- ENTRY(bx);
- ENTRY(cx);
- ENTRY(dx);
- ENTRY(si);
- ENTRY(di);
- ENTRY(bp);
- ENTRY(sp);
- ENTRY(ip);
- BLANK();
-#undef ENTRY
- DEFINE(IA32_RT_SIGFRAME_sigcontext,
- offsetof (struct rt_sigframe_ia32, uc.uc_mcontext));
+#if defined(CONFIG_KVM_GUEST)
+ OFFSET(KVM_STEAL_TIME_preempted, kvm_steal_time, preempted);
BLANK();
#endif
- DEFINE(pbe_address, offsetof(struct pbe, address));
- DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address));
- DEFINE(pbe_next, offsetof(struct pbe, next));
- BLANK();
-#define ENTRY(entry) DEFINE(pt_regs_ ## entry, offsetof(struct pt_regs, entry))
- ENTRY(bx);
+
+#define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry)
ENTRY(bx);
ENTRY(cx);
ENTRY(dx);
@@ -106,34 +44,15 @@ int main(void)
ENTRY(flags);
BLANK();
#undef ENTRY
-#define ENTRY(entry) DEFINE(saved_context_ ## entry, offsetof(struct saved_context, entry))
+
+#define ENTRY(entry) OFFSET(saved_context_ ## entry, saved_context, entry)
ENTRY(cr0);
ENTRY(cr2);
ENTRY(cr3);
ENTRY(cr4);
- ENTRY(cr8);
+ ENTRY(gdt_desc);
BLANK();
#undef ENTRY
- DEFINE(TSS_ist, offsetof(struct tss_struct, x86_tss.ist));
- BLANK();
- DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx));
- BLANK();
- DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
-
- BLANK();
- OFFSET(BP_scratch, boot_params, scratch);
- OFFSET(BP_loadflags, boot_params, hdr.loadflags);
- OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
- OFFSET(BP_version, boot_params, hdr.version);
- OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
- BLANK();
- DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
-#ifdef CONFIG_XEN
- BLANK();
- OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
- OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
-#undef ENTRY
-#endif
return 0;
}
diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c
index 06d3e5a14d9d..190c120f4285 100644
--- a/arch/x86/kernel/audit_64.c
+++ b/arch/x86/kernel/audit_64.c
@@ -1,7 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/init.h>
#include <linux/types.h>
#include <linux/audit.h>
#include <asm/unistd.h>
+#include <asm/audit.h>
static unsigned dir_class[] = {
#include <asm-generic/audit_dir_write.h>
@@ -40,30 +42,27 @@ int audit_classify_arch(int arch)
int audit_classify_syscall(int abi, unsigned syscall)
{
#ifdef CONFIG_IA32_EMULATION
- extern int ia32_classify_syscall(unsigned);
if (abi == AUDIT_ARCH_I386)
return ia32_classify_syscall(syscall);
#endif
switch(syscall) {
case __NR_open:
- return 2;
+ return AUDITSC_OPEN;
case __NR_openat:
- return 3;
+ return AUDITSC_OPENAT;
case __NR_execve:
- return 5;
+ case __NR_execveat:
+ return AUDITSC_EXECVE;
+ case __NR_openat2:
+ return AUDITSC_OPENAT2;
default:
- return 0;
+ return AUDITSC_NATIVE;
}
}
static int __init audit_classes_init(void)
{
#ifdef CONFIG_IA32_EMULATION
- extern __u32 ia32_dir_class[];
- extern __u32 ia32_write_class[];
- extern __u32 ia32_read_class[];
- extern __u32 ia32_chattr_class[];
- extern __u32 ia32_signal_class[];
audit_register_class(AUDIT_CLASS_WRITE_32, ia32_write_class);
audit_register_class(AUDIT_CLASS_READ_32, ia32_read_class);
audit_register_class(AUDIT_CLASS_DIR_WRITE_32, ia32_dir_class);
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
deleted file mode 100644
index 63a88e1f987d..000000000000
--- a/arch/x86/kernel/bios_uv.c
+++ /dev/null
@@ -1,192 +0,0 @@
-/*
- * BIOS run time interface routines.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
- * Copyright (c) Russ Anderson
- */
-
-#include <linux/efi.h>
-#include <asm/efi.h>
-#include <linux/io.h>
-#include <asm/uv/bios.h>
-#include <asm/uv/uv_hub.h>
-
-static struct uv_systab uv_systab;
-
-s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
-{
- struct uv_systab *tab = &uv_systab;
-
- if (!tab->function)
- /*
- * BIOS does not support UV systab
- */
- return BIOS_STATUS_UNIMPLEMENTED;
-
- return efi_call6((void *)__va(tab->function),
- (u64)which, a1, a2, a3, a4, a5);
-}
-
-s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
- u64 a4, u64 a5)
-{
- unsigned long bios_flags;
- s64 ret;
-
- local_irq_save(bios_flags);
- ret = uv_bios_call(which, a1, a2, a3, a4, a5);
- local_irq_restore(bios_flags);
-
- return ret;
-}
-
-s64 uv_bios_call_reentrant(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
- u64 a4, u64 a5)
-{
- s64 ret;
-
- preempt_disable();
- ret = uv_bios_call(which, a1, a2, a3, a4, a5);
- preempt_enable();
-
- return ret;
-}
-
-
-long sn_partition_id;
-EXPORT_SYMBOL_GPL(sn_partition_id);
-long sn_coherency_id;
-EXPORT_SYMBOL_GPL(sn_coherency_id);
-long sn_region_size;
-EXPORT_SYMBOL_GPL(sn_region_size);
-int uv_type;
-
-
-s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher,
- long *region)
-{
- s64 ret;
- u64 v0, v1;
- union partition_info_u part;
-
- ret = uv_bios_call_irqsave(UV_BIOS_GET_SN_INFO, fc,
- (u64)(&v0), (u64)(&v1), 0, 0);
- if (ret != BIOS_STATUS_SUCCESS)
- return ret;
-
- part.val = v0;
- if (uvtype)
- *uvtype = part.hub_version;
- if (partid)
- *partid = part.partition_id;
- if (coher)
- *coher = part.coherence_id;
- if (region)
- *region = part.region_size;
- return ret;
-}
-
-int
-uv_bios_mq_watchlist_alloc(int blade, unsigned long addr, unsigned int mq_size,
- unsigned long *intr_mmr_offset)
-{
- union uv_watchlist_u size_blade;
- u64 watchlist;
- s64 ret;
-
- size_blade.size = mq_size;
- size_blade.blade = blade;
-
- /*
- * bios returns watchlist number or negative error number.
- */
- ret = (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_ALLOC, addr,
- size_blade.val, (u64)intr_mmr_offset,
- (u64)&watchlist, 0);
- if (ret < BIOS_STATUS_SUCCESS)
- return ret;
-
- return watchlist;
-}
-EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_alloc);
-
-int
-uv_bios_mq_watchlist_free(int blade, int watchlist_num)
-{
- return (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_FREE,
- blade, watchlist_num, 0, 0, 0);
-}
-EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_free);
-
-s64
-uv_bios_change_memprotect(u64 paddr, u64 len, enum uv_memprotect perms)
-{
- return uv_bios_call_irqsave(UV_BIOS_MEMPROTECT, paddr, len,
- perms, 0, 0);
-}
-EXPORT_SYMBOL_GPL(uv_bios_change_memprotect);
-
-s64
-uv_bios_reserved_page_pa(u64 buf, u64 *cookie, u64 *addr, u64 *len)
-{
- s64 ret;
-
- ret = uv_bios_call_irqsave(UV_BIOS_GET_PARTITION_ADDR, (u64)cookie,
- (u64)addr, buf, (u64)len, 0);
- return ret;
-}
-EXPORT_SYMBOL_GPL(uv_bios_reserved_page_pa);
-
-s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second)
-{
- return uv_bios_call(UV_BIOS_FREQ_BASE, clock_type,
- (u64)ticks_per_second, 0, 0, 0);
-}
-EXPORT_SYMBOL_GPL(uv_bios_freq_base);
-
-
-#ifdef CONFIG_EFI
-void uv_bios_init(void)
-{
- struct uv_systab *tab;
-
- if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) ||
- (efi.uv_systab == (unsigned long)NULL)) {
- printk(KERN_CRIT "No EFI UV System Table.\n");
- uv_systab.function = (unsigned long)NULL;
- return;
- }
-
- tab = (struct uv_systab *)ioremap(efi.uv_systab,
- sizeof(struct uv_systab));
- if (strncmp(tab->signature, "UVST", 4) != 0)
- printk(KERN_ERR "bad signature in UV system table!");
-
- /*
- * Copy table to permanent spot for later use.
- */
- memcpy(&uv_systab, tab, sizeof(struct uv_systab));
- iounmap(tab);
-
- printk(KERN_INFO "EFI UV System Table Revision %d\n",
- uv_systab.revision);
-}
-#else /* !CONFIG_EFI */
-
-void uv_bios_init(void) { }
-#endif
-
diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c
index 30f25a75fe28..73274d76ce16 100644
--- a/arch/x86/kernel/bootflag.c
+++ b/arch/x86/kernel/bootflag.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Implement 'Simple Boot Flag Specification 2.0'
*/
@@ -5,9 +6,9 @@
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/string.h>
-#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/acpi.h>
+#include <linux/bitops.h>
#include <asm/io.h>
#include <linux/mc146818rtc.h>
@@ -20,27 +21,13 @@
int sbf_port __initdata = -1; /* set via acpi_boot_init() */
-static int __init parity(u8 v)
-{
- int x = 0;
- int i;
-
- for (i = 0; i < 8; i++) {
- x ^= (v & 1);
- v >>= 1;
- }
-
- return x;
-}
-
static void __init sbf_write(u8 v)
{
unsigned long flags;
if (sbf_port != -1) {
- v &= ~SBF_PARITY;
- if (!parity(v))
- v |= SBF_PARITY;
+ if (!parity8(v))
+ v ^= SBF_PARITY;
printk(KERN_INFO "Simple Boot Flag at 0x%x set to 0x%x\n",
sbf_port, v);
@@ -66,14 +53,14 @@ static u8 __init sbf_read(void)
return v;
}
-static int __init sbf_value_valid(u8 v)
+static bool __init sbf_value_valid(u8 v)
{
if (v & SBF_RESERVED) /* Reserved bits */
- return 0;
- if (!parity(v))
- return 0;
+ return false;
+ if (!parity8(v))
+ return false;
- return 1;
+ return true;
}
static int __init sbf_init(void)
@@ -99,4 +86,4 @@ static int __init sbf_init(void)
return 0;
}
-module_init(sbf_init);
+arch_initcall(sbf_init);
diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c
new file mode 100644
index 000000000000..a951333c5995
--- /dev/null
+++ b/arch/x86/kernel/callthunks.c
@@ -0,0 +1,383 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#define pr_fmt(fmt) "callthunks: " fmt
+
+#include <linux/debugfs.h>
+#include <linux/kallsyms.h>
+#include <linux/memory.h>
+#include <linux/moduleloader.h>
+#include <linux/static_call.h>
+
+#include <asm/alternative.h>
+#include <asm/asm-offsets.h>
+#include <asm/cpu.h>
+#include <asm/ftrace.h>
+#include <asm/insn.h>
+#include <asm/kexec.h>
+#include <asm/nospec-branch.h>
+#include <asm/paravirt.h>
+#include <asm/sections.h>
+#include <asm/switch_to.h>
+#include <asm/sync_core.h>
+#include <asm/text-patching.h>
+#include <asm/xen/hypercall.h>
+
+static int __initdata_or_module debug_callthunks;
+
+#define MAX_PATCH_LEN (255-1)
+
+#define prdbg(fmt, args...) \
+do { \
+ if (debug_callthunks) \
+ printk(KERN_DEBUG pr_fmt(fmt), ##args); \
+} while(0)
+
+static int __init debug_thunks(char *str)
+{
+ debug_callthunks = 1;
+ return 1;
+}
+__setup("debug-callthunks", debug_thunks);
+
+#ifdef CONFIG_CALL_THUNKS_DEBUG
+DEFINE_PER_CPU(u64, __x86_call_count);
+DEFINE_PER_CPU(u64, __x86_ret_count);
+DEFINE_PER_CPU(u64, __x86_stuffs_count);
+DEFINE_PER_CPU(u64, __x86_ctxsw_count);
+EXPORT_PER_CPU_SYMBOL_GPL(__x86_ctxsw_count);
+EXPORT_PER_CPU_SYMBOL_GPL(__x86_call_count);
+#endif
+
+extern s32 __call_sites[], __call_sites_end[];
+
+struct core_text {
+ unsigned long base;
+ unsigned long end;
+ const char *name;
+};
+
+static bool thunks_initialized __ro_after_init;
+
+static const struct core_text builtin_coretext = {
+ .base = (unsigned long)_text,
+ .end = (unsigned long)_etext,
+ .name = "builtin",
+};
+
+asm (
+ ".pushsection .rodata \n"
+ ".global skl_call_thunk_template \n"
+ "skl_call_thunk_template: \n"
+ __stringify(INCREMENT_CALL_DEPTH)" \n"
+ ".global skl_call_thunk_tail \n"
+ "skl_call_thunk_tail: \n"
+ ".popsection \n"
+);
+
+extern u8 skl_call_thunk_template[];
+extern u8 skl_call_thunk_tail[];
+
+#define SKL_TMPL_SIZE \
+ ((unsigned int)(skl_call_thunk_tail - skl_call_thunk_template))
+
+extern void error_entry(void);
+extern void xen_error_entry(void);
+extern void paranoid_entry(void);
+
+static inline bool within_coretext(const struct core_text *ct, void *addr)
+{
+ unsigned long p = (unsigned long)addr;
+
+ return ct->base <= p && p < ct->end;
+}
+
+static inline bool within_module_coretext(void *addr)
+{
+ bool ret = false;
+
+#ifdef CONFIG_MODULES
+ struct module *mod;
+
+ guard(rcu)();
+ mod = __module_address((unsigned long)addr);
+ if (mod && within_module_core((unsigned long)addr, mod))
+ ret = true;
+#endif
+ return ret;
+}
+
+static bool is_coretext(const struct core_text *ct, void *addr)
+{
+ if (ct && within_coretext(ct, addr))
+ return true;
+ if (within_coretext(&builtin_coretext, addr))
+ return true;
+ return within_module_coretext(addr);
+}
+
+static bool skip_addr(void *dest)
+{
+ if (dest == error_entry)
+ return true;
+ if (dest == paranoid_entry)
+ return true;
+ if (dest == xen_error_entry)
+ return true;
+ /* Does FILL_RSB... */
+ if (dest == __switch_to_asm)
+ return true;
+ /* Accounts directly */
+ if (dest == ret_from_fork)
+ return true;
+#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_AMD_MEM_ENCRYPT)
+ if (dest == soft_restart_cpu)
+ return true;
+#endif
+#ifdef CONFIG_FUNCTION_TRACER
+ if (dest == __fentry__)
+ return true;
+#endif
+#ifdef CONFIG_KEXEC_CORE
+# ifdef CONFIG_X86_64
+ if (dest >= (void *)__relocate_kernel_start &&
+ dest < (void *)__relocate_kernel_end)
+ return true;
+# else
+ if (dest >= (void *)relocate_kernel &&
+ dest < (void*)relocate_kernel + KEXEC_CONTROL_CODE_MAX_SIZE)
+ return true;
+# endif
+#endif
+ return false;
+}
+
+static __init_or_module void *call_get_dest(void *addr)
+{
+ struct insn insn;
+ void *dest;
+ int ret;
+
+ ret = insn_decode_kernel(&insn, addr);
+ if (ret)
+ return ERR_PTR(ret);
+
+ /* Patched out call? */
+ if (insn.opcode.bytes[0] != CALL_INSN_OPCODE)
+ return NULL;
+
+ dest = addr + insn.length + insn.immediate.value;
+ if (skip_addr(dest))
+ return NULL;
+ return dest;
+}
+
+static const u8 nops[] = {
+ 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
+ 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
+ 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
+ 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
+};
+
+static void *patch_dest(void *dest, bool direct)
+{
+ unsigned int tsize = SKL_TMPL_SIZE;
+ u8 insn_buff[MAX_PATCH_LEN];
+ u8 *pad = dest - tsize;
+
+ memcpy(insn_buff, skl_call_thunk_template, tsize);
+ text_poke_apply_relocation(insn_buff, pad, tsize, skl_call_thunk_template, tsize);
+
+ /* Already patched? */
+ if (!bcmp(pad, insn_buff, tsize))
+ return pad;
+
+ /* Ensure there are nops */
+ if (bcmp(pad, nops, tsize)) {
+ pr_warn_once("Invalid padding area for %pS\n", dest);
+ return NULL;
+ }
+
+ if (direct)
+ memcpy(pad, insn_buff, tsize);
+ else
+ text_poke_copy_locked(pad, insn_buff, tsize, true);
+ return pad;
+}
+
+static __init_or_module void patch_call(void *addr, const struct core_text *ct)
+{
+ void *pad, *dest;
+ u8 bytes[8];
+
+ if (!within_coretext(ct, addr))
+ return;
+
+ dest = call_get_dest(addr);
+ if (!dest || WARN_ON_ONCE(IS_ERR(dest)))
+ return;
+
+ if (!is_coretext(ct, dest))
+ return;
+
+ pad = patch_dest(dest, within_coretext(ct, dest));
+ if (!pad)
+ return;
+
+ prdbg("Patch call at: %pS %px to %pS %px -> %px \n", addr, addr,
+ dest, dest, pad);
+ __text_gen_insn(bytes, CALL_INSN_OPCODE, addr, pad, CALL_INSN_SIZE);
+ text_poke_early(addr, bytes, CALL_INSN_SIZE);
+}
+
+static __init_or_module void
+patch_call_sites(s32 *start, s32 *end, const struct core_text *ct)
+{
+ s32 *s;
+
+ for (s = start; s < end; s++)
+ patch_call((void *)s + *s, ct);
+}
+
+static __init_or_module void
+callthunks_setup(struct callthunk_sites *cs, const struct core_text *ct)
+{
+ prdbg("Patching call sites %s\n", ct->name);
+ patch_call_sites(cs->call_start, cs->call_end, ct);
+ prdbg("Patching call sites done%s\n", ct->name);
+}
+
+void __init callthunks_patch_builtin_calls(void)
+{
+ struct callthunk_sites cs = {
+ .call_start = __call_sites,
+ .call_end = __call_sites_end,
+ };
+
+ if (!cpu_feature_enabled(X86_FEATURE_CALL_DEPTH))
+ return;
+
+ pr_info("Setting up call depth tracking\n");
+ mutex_lock(&text_mutex);
+ callthunks_setup(&cs, &builtin_coretext);
+ thunks_initialized = true;
+ mutex_unlock(&text_mutex);
+}
+
+void *callthunks_translate_call_dest(void *dest)
+{
+ void *target;
+
+ lockdep_assert_held(&text_mutex);
+
+ if (!thunks_initialized || skip_addr(dest))
+ return dest;
+
+ if (!is_coretext(NULL, dest))
+ return dest;
+
+ target = patch_dest(dest, false);
+ return target ? : dest;
+}
+
+#ifdef CONFIG_BPF_JIT
+static bool is_callthunk(void *addr)
+{
+ unsigned int tmpl_size = SKL_TMPL_SIZE;
+ u8 insn_buff[MAX_PATCH_LEN];
+ unsigned long dest;
+ u8 *pad;
+
+ dest = roundup((unsigned long)addr, CONFIG_FUNCTION_ALIGNMENT);
+ if (!thunks_initialized || skip_addr((void *)dest))
+ return false;
+
+ pad = (void *)(dest - tmpl_size);
+
+ memcpy(insn_buff, skl_call_thunk_template, tmpl_size);
+ text_poke_apply_relocation(insn_buff, pad, tmpl_size, skl_call_thunk_template, tmpl_size);
+
+ return !bcmp(pad, insn_buff, tmpl_size);
+}
+
+int x86_call_depth_emit_accounting(u8 **pprog, void *func, void *ip)
+{
+ unsigned int tmpl_size = SKL_TMPL_SIZE;
+ u8 insn_buff[MAX_PATCH_LEN];
+
+ if (!thunks_initialized)
+ return 0;
+
+ /* Is function call target a thunk? */
+ if (func && is_callthunk(func))
+ return 0;
+
+ memcpy(insn_buff, skl_call_thunk_template, tmpl_size);
+ text_poke_apply_relocation(insn_buff, ip, tmpl_size, skl_call_thunk_template, tmpl_size);
+
+ memcpy(*pprog, insn_buff, tmpl_size);
+ *pprog += tmpl_size;
+ return tmpl_size;
+}
+#endif
+
+#ifdef CONFIG_MODULES
+void noinline callthunks_patch_module_calls(struct callthunk_sites *cs,
+ struct module *mod)
+{
+ struct core_text ct = {
+ .base = (unsigned long)mod->mem[MOD_TEXT].base,
+ .end = (unsigned long)mod->mem[MOD_TEXT].base + mod->mem[MOD_TEXT].size,
+ .name = mod->name,
+ };
+
+ if (!thunks_initialized)
+ return;
+
+ mutex_lock(&text_mutex);
+ callthunks_setup(cs, &ct);
+ mutex_unlock(&text_mutex);
+}
+#endif /* CONFIG_MODULES */
+
+#if defined(CONFIG_CALL_THUNKS_DEBUG) && defined(CONFIG_DEBUG_FS)
+static int callthunks_debug_show(struct seq_file *m, void *p)
+{
+ unsigned long cpu = (unsigned long)m->private;
+
+ seq_printf(m, "C: %16llu R: %16llu S: %16llu X: %16llu\n,",
+ per_cpu(__x86_call_count, cpu),
+ per_cpu(__x86_ret_count, cpu),
+ per_cpu(__x86_stuffs_count, cpu),
+ per_cpu(__x86_ctxsw_count, cpu));
+ return 0;
+}
+
+static int callthunks_debug_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, callthunks_debug_show, inode->i_private);
+}
+
+static const struct file_operations dfs_ops = {
+ .open = callthunks_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init callthunks_debugfs_init(void)
+{
+ struct dentry *dir;
+ unsigned long cpu;
+
+ dir = debugfs_create_dir("callthunks", NULL);
+ for_each_possible_cpu(cpu) {
+ void *arg = (void *)cpu;
+ char name[10];
+
+ sprintf(name, "cpu%lu", cpu);
+ debugfs_create_file(name, 0644, dir, arg, &dfs_ops);
+ }
+ return 0;
+}
+__initcall(callthunks_debugfs_init);
+#endif
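
patch_call_sites() above resolves each __call_sites entry with "(void *)s + *s", i.e. every s32 entry holds an offset relative to its own address. A self-contained sketch of that addressing scheme; the table contents are made up at runtime for illustration, and the example relies on GCC-style void-pointer arithmetic just as the kernel does.

	#include <stdint.h>
	#include <stdio.h>

	/* pretend "text": three bytes standing in for patchable call sites */
	static unsigned char fake_text[3];

	/* table of self-relative offsets, filled in below for the sketch */
	static int32_t call_sites[3];

	int main(void)
	{
		/* build entries the way the linker does: target minus entry address */
		for (int i = 0; i < 3; i++)
			call_sites[i] = (int32_t)((intptr_t)&fake_text[i] -
						  (intptr_t)&call_sites[i]);

		/* resolve them the way patch_call_sites() does: entry address plus value */
		for (int32_t *s = call_sites; s < call_sites + 3; s++) {
			void *addr = (void *)s + *s;     /* GNU void* arithmetic */

			printf("entry %td -> %s\n", s - call_sites,
			       addr == &fake_text[s - call_sites] ? "resolved" : "mismatch");
		}
		return 0;
	}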
diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c
new file mode 100644
index 000000000000..99444409c026
--- /dev/null
+++ b/arch/x86/kernel/cet.c
@@ -0,0 +1,162 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/ptrace.h>
+#include <asm/bugs.h>
+#include <asm/msr.h>
+#include <asm/traps.h>
+
+enum cp_error_code {
+ CP_EC = (1 << 15) - 1,
+
+ CP_RET = 1,
+ CP_IRET = 2,
+ CP_ENDBR = 3,
+ CP_RSTRORSSP = 4,
+ CP_SETSSBSY = 5,
+
+ CP_ENCL = 1 << 15,
+};
+
+static const char cp_err[][10] = {
+ [0] = "unknown",
+ [1] = "near ret",
+ [2] = "far/iret",
+ [3] = "endbranch",
+ [4] = "rstorssp",
+ [5] = "setssbsy",
+};
+
+static const char *cp_err_string(unsigned long error_code)
+{
+ unsigned int cpec = error_code & CP_EC;
+
+ if (cpec >= ARRAY_SIZE(cp_err))
+ cpec = 0;
+ return cp_err[cpec];
+}
+
+static void do_unexpected_cp(struct pt_regs *regs, unsigned long error_code)
+{
+ WARN_ONCE(1, "Unexpected %s #CP, error_code: %s\n",
+ user_mode(regs) ? "user mode" : "kernel mode",
+ cp_err_string(error_code));
+}
+
+static DEFINE_RATELIMIT_STATE(cpf_rate, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+static void do_user_cp_fault(struct pt_regs *regs, unsigned long error_code)
+{
+ struct task_struct *tsk;
+ unsigned long ssp;
+
+ /*
+ * An exception was just taken from userspace. Since interrupts are disabled
+ * here, no scheduling should have messed with the registers yet and they
+ * will be whatever is live in userspace. So read the SSP before enabling
+ * interrupts so locking the fpregs to do it later is not required.
+ */
+ rdmsrq(MSR_IA32_PL3_SSP, ssp);
+
+ cond_local_irq_enable(regs);
+
+ tsk = current;
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_nr = X86_TRAP_CP;
+
+ /* Ratelimit to prevent log spamming. */
+ if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+ __ratelimit(&cpf_rate)) {
+ pr_emerg("%s[%d] control protection ip:%lx sp:%lx ssp:%lx error:%lx(%s)%s",
+ tsk->comm, task_pid_nr(tsk),
+ regs->ip, regs->sp, ssp, error_code,
+ cp_err_string(error_code),
+ error_code & CP_ENCL ? " in enclave" : "");
+ print_vma_addr(KERN_CONT " in ", regs->ip);
+ pr_cont("\n");
+ }
+
+ force_sig_fault(SIGSEGV, SEGV_CPERR, (void __user *)0);
+ cond_local_irq_disable(regs);
+}
+
+static __ro_after_init bool ibt_fatal = true;
+
+/*
+ * By definition, all missing-ENDBRANCH #CPs are a result of WFE && !ENDBR.
+ *
+ * For the kernel IBT no ENDBR selftest where #CPs are deliberately triggered,
+ * the WFE state of the interrupted context needs to be cleared to let execution
+ * continue. Otherwise when the CPU resumes from the instruction that just
+ * caused the previous #CP, another missing-ENDBRANCH #CP is raised and the CPU
+ * enters a dead loop.
+ *
+ * This is not a problem with IDT because it doesn't preserve WFE and IRET doesn't
+ * set WFE. But FRED provides space on the entry stack (in an expanded CS area)
+ * to save and restore the WFE state, thus the WFE state is no longer clobbered,
+ * so software must clear it.
+ */
+static void ibt_clear_fred_wfe(struct pt_regs *regs)
+{
+ /*
+ * No need to do any FRED checks.
+ *
+ * For IDT event delivery, the high-order 48 bits of CS are pushed
+ * as 0s into the stack, and later IRET ignores these bits.
+ *
+ * For FRED, a test to check if fred_cs.wfe is set would be dropped
+ * by compilers.
+ */
+ regs->fred_cs.wfe = 0;
+}
+
+static void do_kernel_cp_fault(struct pt_regs *regs, unsigned long error_code)
+{
+ if ((error_code & CP_EC) != CP_ENDBR) {
+ do_unexpected_cp(regs, error_code);
+ return;
+ }
+
+ if (unlikely(regs->ip == (unsigned long)&ibt_selftest_noendbr)) {
+ regs->ax = 0;
+ ibt_clear_fred_wfe(regs);
+ return;
+ }
+
+ pr_err("Missing ENDBR: %pS\n", (void *)instruction_pointer(regs));
+ if (!ibt_fatal) {
+ printk(KERN_DEFAULT CUT_HERE);
+ __warn(__FILE__, __LINE__, (void *)regs->ip, TAINT_WARN, regs, NULL);
+ ibt_clear_fred_wfe(regs);
+ return;
+ }
+ BUG();
+}
+
+static int __init ibt_setup(char *str)
+{
+ if (!strcmp(str, "off"))
+ setup_clear_cpu_cap(X86_FEATURE_IBT);
+
+ if (!strcmp(str, "warn"))
+ ibt_fatal = false;
+
+ return 1;
+}
+
+__setup("ibt=", ibt_setup);
+
+DEFINE_IDTENTRY_ERRORCODE(exc_control_protection)
+{
+ if (user_mode(regs)) {
+ if (cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
+ do_user_cp_fault(regs, error_code);
+ else
+ do_unexpected_cp(regs, error_code);
+ } else {
+ if (cpu_feature_enabled(X86_FEATURE_IBT))
+ do_kernel_cp_fault(regs, error_code);
+ else
+ do_unexpected_cp(regs, error_code);
+ }
+}
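
exc_control_protection() above splits the #CP error code into a reason in the low 15 bits (CP_EC) and an "in enclave" flag (CP_ENCL). A hosted sketch of that decoding, reusing the same constants and string table from the new file; the sample error codes are invented.

	#include <stdio.h>

	#define CP_EC    ((1 << 15) - 1)   /* low 15 bits: reason code */
	#define CP_ENCL  (1 << 15)         /* fault raised from inside an SGX enclave */

	static const char cp_err[][10] = {
		[0] = "unknown",
		[1] = "near ret",
		[2] = "far/iret",
		[3] = "endbranch",
		[4] = "rstorssp",
		[5] = "setssbsy",
	};

	static void describe_cp(unsigned long error_code)
	{
		unsigned int cpec = error_code & CP_EC;

		if (cpec >= sizeof(cp_err) / sizeof(cp_err[0]))
			cpec = 0;                       /* out-of-range reasons map to "unknown" */
		printf("#CP: %s%s\n", cp_err[cpec],
		       (error_code & CP_ENCL) ? " in enclave" : "");
	}

	int main(void)
	{
		describe_cp(3);                 /* endbranch */
		describe_cp(1 | CP_ENCL);       /* near ret, in enclave */
		describe_cp(0x7abc);            /* unknown */
		return 0;
	}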
diff --git a/arch/x86/kernel/cfi.c b/arch/x86/kernel/cfi.c
new file mode 100644
index 000000000000..638eb5c933e0
--- /dev/null
+++ b/arch/x86/kernel/cfi.c
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Clang Control Flow Integrity (CFI) support.
+ *
+ * Copyright (C) 2022 Google LLC
+ */
+#include <linux/string.h>
+#include <linux/cfi.h>
+#include <asm/insn.h>
+#include <asm/insn-eval.h>
+
+/*
+ * Returns the target address and the expected type when regs->ip points
+ * to a compiler-generated CFI trap.
+ */
+static bool decode_cfi_insn(struct pt_regs *regs, unsigned long *target,
+ u32 *type)
+{
+ char buffer[MAX_INSN_SIZE];
+ struct insn insn;
+ int offset = 0;
+
+ *target = *type = 0;
+
+ /*
+ * The compiler generates the following instruction sequence
+ * for indirect call checks:
+ *
+ *   movl    -<id>, %r10d        ; 6 bytes
+ *   addl    -<pos>(%reg), %r10d ; 4 bytes
+ *   je      .Ltmp1              ; 2 bytes
+ *   ud2                         ; <- regs->ip
+ *   .Ltmp1:
+ *
+ * We can decode the expected type and the target address from the
+ * movl/addl instructions.
+ */
+ if (copy_from_kernel_nofault(buffer, (void *)regs->ip - 12, MAX_INSN_SIZE))
+ return false;
+ if (insn_decode_kernel(&insn, &buffer[offset]))
+ return false;
+ if (insn.opcode.value != 0xBA)
+ return false;
+
+ *type = -(u32)insn.immediate.value;
+
+ if (copy_from_kernel_nofault(buffer, (void *)regs->ip - 6, MAX_INSN_SIZE))
+ return false;
+ if (insn_decode_kernel(&insn, &buffer[offset]))
+ return false;
+ if (insn.opcode.value != 0x3)
+ return false;
+
+ /* Read the target address from the register. */
+ offset = insn_get_modrm_rm_off(&insn, regs);
+ if (offset < 0)
+ return false;
+
+ *target = *(unsigned long *)((void *)regs + offset);
+
+ return true;
+}
+
+/*
+ * Checks if a ud2 trap is because of a CFI failure, and handles the trap
+ * if needed. Returns a bug_trap_type value similarly to report_bug.
+ */
+enum bug_trap_type handle_cfi_failure(struct pt_regs *regs)
+{
+ unsigned long target, addr = regs->ip;
+ u32 type;
+
+ switch (cfi_mode) {
+ case CFI_KCFI:
+ if (!is_cfi_trap(addr))
+ return BUG_TRAP_TYPE_NONE;
+
+ if (!decode_cfi_insn(regs, &target, &type))
+ return report_cfi_failure_noaddr(regs, addr);
+
+ break;
+
+ case CFI_FINEIBT:
+ if (!decode_fineibt_insn(regs, &target, &type))
+ return BUG_TRAP_TYPE_NONE;
+
+ break;
+
+ default:
+ return BUG_TRAP_TYPE_NONE;
+ }
+
+ return report_cfi_failure(regs, addr, &target, type);
+}
+
+/*
+ * Ensure that __kcfi_typeid_ symbols are emitted for functions that may
+ * not be indirectly called with all configurations.
+ */
+__ADDRESSABLE(__memcpy)
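
decode_cfi_insn() above recovers the expected type from the movl immediate (stored negated) and the call target from the register the addl reads. The underlying check is that the caller adds the negated type id to the hash the compiler placed ahead of the callee; any non-zero result falls through to the ud2. A minimal simulation of that arithmetic, with invented hash values.

	#include <stdint.h>
	#include <stdio.h>

	/* invented kcfi type hash "stored ahead of" a callee, illustration only */
	static const uint32_t callee_hash = 0x8f2a11c3u;

	/*
	 * Caller-side check as sketched in the comment above:
	 *   movl $-expected, %r10d
	 *   addl <hash at target>, %r10d
	 *   je ok; ud2
	 * Returns 1 when the call would proceed, 0 when it would trap.
	 */
	static int kcfi_call_allowed(uint32_t expected, uint32_t hash_at_target)
	{
		return (uint32_t)(-expected + hash_at_target) == 0;
	}

	int main(void)
	{
		printf("matching type:   %d\n", kcfi_call_allowed(0x8f2a11c3u, callee_hash)); /* 1 */
		printf("mismatched type: %d\n", kcfi_call_allowed(0x12345678u, callee_hash)); /* 0 */
		return 0;
	}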
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index fc999e6fc46a..5136e6818da8 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -1,9 +1,15 @@
-#include <linux/module.h>
+// SPDX-License-Identifier: GPL-2.0
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/init.h>
#include <linux/sched.h>
#include <linux/kthread.h>
#include <linux/workqueue.h>
-#include <asm/e820.h>
+#include <linux/memblock.h>
+
#include <asm/proto.h>
+#include <asm/setup.h>
/*
* Some BIOSes seem to corrupt the low 64k of memory during events
@@ -18,27 +24,48 @@ static int __read_mostly memory_corruption_check = -1;
static unsigned __read_mostly corruption_check_size = 64*1024;
static unsigned __read_mostly corruption_check_period = 60; /* seconds */
-static struct e820entry scan_areas[MAX_SCAN_AREAS];
+static struct scan_area {
+ u64 addr;
+ u64 size;
+} scan_areas[MAX_SCAN_AREAS];
static int num_scan_areas;
-
static __init int set_corruption_check(char *arg)
{
- char *end;
+ ssize_t ret;
+ unsigned long val;
+
+ if (!arg) {
+ pr_err("memory_corruption_check config string not provided\n");
+ return -EINVAL;
+ }
+
+ ret = kstrtoul(arg, 10, &val);
+ if (ret)
+ return ret;
- memory_corruption_check = simple_strtol(arg, &end, 10);
+ memory_corruption_check = val;
- return (*end == 0) ? 0 : -EINVAL;
+ return 0;
}
early_param("memory_corruption_check", set_corruption_check);
static __init int set_corruption_check_period(char *arg)
{
- char *end;
+ ssize_t ret;
+ unsigned long val;
+
+ if (!arg) {
+ pr_err("memory_corruption_check_period config string not provided\n");
+ return -EINVAL;
+ }
- corruption_check_period = simple_strtoul(arg, &end, 10);
+ ret = kstrtoul(arg, 10, &val);
+ if (ret)
+ return ret;
- return (*end == 0) ? 0 : -EINVAL;
+ corruption_check_period = val;
+ return 0;
}
early_param("memory_corruption_check_period", set_corruption_check_period);
@@ -47,6 +74,11 @@ static __init int set_corruption_check_size(char *arg)
char *end;
unsigned size;
+ if (!arg) {
+ pr_err("memory_corruption_check_size config string not provided\n");
+ return -EINVAL;
+ }
+
size = memparse(arg, &end);
if (*end == '\0')
@@ -59,7 +91,8 @@ early_param("memory_corruption_check_size", set_corruption_check_size);
void __init setup_bios_corruption_check(void)
{
- u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */
+ phys_addr_t start, end;
+ u64 i;
if (memory_corruption_check == -1) {
memory_corruption_check =
@@ -79,37 +112,32 @@ void __init setup_bios_corruption_check(void)
corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
- while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) {
- u64 size;
- addr = find_e820_area_size(addr, &size, PAGE_SIZE);
+ for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
+ NULL) {
+ start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE),
+ PAGE_SIZE, corruption_check_size);
+ end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE),
+ PAGE_SIZE, corruption_check_size);
+ if (start >= end)
+ continue;
- if (!(addr + 1))
- break;
-
- if (addr >= corruption_check_size)
- break;
-
- if ((addr + size) > corruption_check_size)
- size = corruption_check_size - addr;
-
- e820_update_range(addr, size, E820_RAM, E820_RESERVED);
- scan_areas[num_scan_areas].addr = addr;
- scan_areas[num_scan_areas].size = size;
- num_scan_areas++;
+ memblock_reserve(start, end - start);
+ scan_areas[num_scan_areas].addr = start;
+ scan_areas[num_scan_areas].size = end - start;
/* Assume we've already mapped this early memory */
- memset(__va(addr), 0, size);
+ memset(__va(start), 0, end - start);
- addr += size;
+ if (++num_scan_areas >= MAX_SCAN_AREAS)
+ break;
}
- printk(KERN_INFO "Scanning %d areas for low memory corruption\n",
- num_scan_areas);
- update_e820();
+ if (num_scan_areas)
+ pr_info("Scanning %d areas for low memory corruption\n", num_scan_areas);
}
-void check_for_bios_corruption(void)
+static void check_for_bios_corruption(void)
{
int i;
int corruption = 0;
@@ -124,8 +152,7 @@ void check_for_bios_corruption(void)
for (; size; addr++, size -= sizeof(unsigned long)) {
if (!*addr)
continue;
- printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n",
- addr, __pa(addr), *addr);
+ pr_err("Corrupted low memory at %p (%lx phys) = %08lx\n", addr, __pa(addr), *addr);
corruption = 1;
*addr = 0;
}
@@ -141,21 +168,20 @@ static void check_corruption(struct work_struct *dummy)
{
check_for_bios_corruption();
schedule_delayed_work(&bios_check_work,
- round_jiffies_relative(corruption_check_period*HZ));
+ round_jiffies_relative(corruption_check_period*HZ));
}
static int start_periodic_check_for_corruption(void)
{
- if (!memory_corruption_check || corruption_check_period == 0)
+ if (!num_scan_areas || !memory_corruption_check || corruption_check_period == 0)
return 0;
- printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n",
- corruption_check_period);
+ pr_info("Scanning for low memory corruption every %d seconds\n", corruption_check_period);
/* First time we run the checks right away */
schedule_delayed_work(&bios_check_work, 0);
+
return 0;
}
-
-module_init(start_periodic_check_for_corruption);
+device_initcall(start_periodic_check_for_corruption);
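
The rewritten setup_bios_corruption_check() clamps every free memblock range to the window [PAGE_SIZE, corruption_check_size) and skips ranges that collapse to nothing. A small sketch of that clamp-and-skip step with made-up ranges; the page-boundary rounding the kernel also performs is omitted here for brevity.

	#include <stdint.h>
	#include <stdio.h>

	#define PAGE_SIZE 4096ULL

	static uint64_t clamp_u64(uint64_t v, uint64_t lo, uint64_t hi)
	{
		return v < lo ? lo : (v > hi ? hi : v);
	}

	int main(void)
	{
		const uint64_t limit = 64 * 1024;          /* corruption_check_size */
		uint64_t ranges[][2] = {                   /* made-up free ranges */
			{ 0x0,      0x9fc00  },            /* spans the scan window */
			{ 0x100000, 0x200000 },            /* entirely above it */
		};

		for (unsigned i = 0; i < 2; i++) {
			uint64_t start = clamp_u64(ranges[i][0], PAGE_SIZE, limit);
			uint64_t end   = clamp_u64(ranges[i][1], PAGE_SIZE, limit);

			if (start >= end) {                /* nothing left to scan */
				printf("range %u: skipped\n", i);
				continue;
			}
			printf("range %u: scan %#llx-%#llx\n", i,
			       (unsigned long long)start, (unsigned long long)end);
		}
		return 0;
	}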
diff --git a/arch/x86/kernel/cpu/.gitignore b/arch/x86/kernel/cpu/.gitignore
index 667df55a4399..0bca7ef7426a 100644
--- a/arch/x86/kernel/cpu/.gitignore
+++ b/arch/x86/kernel/cpu/.gitignore
@@ -1 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
capflags.c
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 3efcb2b96a15..2f8a58ef690e 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for x86-compatible CPU details, features and quirks
#
@@ -5,37 +6,71 @@
# Don't trace early stages of a secondary CPU boot
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_common.o = -pg
+CFLAGS_REMOVE_perf_event.o = -pg
endif
-obj-y := intel_cacheinfo.o addon_cpuid_features.o
-obj-y += proc.o capflags.o powerflags.o common.o
-obj-y += vmware.o hypervisor.o
+# If these files are instrumented, boot hangs during the first second.
+KCOV_INSTRUMENT_common.o := n
+KCOV_INSTRUMENT_perf_event.o := n
+KMSAN_SANITIZE_common.o := n
-obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
-obj-$(CONFIG_X86_64) += bugs_64.o
+# As above, instrumenting secondary CPU boot code causes boot hangs.
+KCSAN_SANITIZE_common.o := n
-obj-$(CONFIG_X86_CPU_DEBUG) += cpu_debug.o
+obj-y := cacheinfo.o scattered.o
+obj-y += topology_common.o topology_ext.o topology_amd.o
+obj-y += common.o
+obj-y += rdrand.o
+obj-y += match.o
+obj-y += bugs.o
+obj-y += aperfmperf.o
+obj-y += cpuid-deps.o cpuid_0x2_table.o
+obj-y += umwait.o
+obj-y += capflags.o powerflags.o
-obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
+obj-$(CONFIG_X86_LOCAL_APIC) += topology.o
+
+obj-$(CONFIG_PROC_FS) += proc.o
+
+obj-$(CONFIG_IA32_FEAT_CTL) += feat_ctl.o
+ifdef CONFIG_CPU_SUP_INTEL
+obj-y += intel.o tsx.o
+obj-$(CONFIG_PM) += intel_epb.o
+endif
obj-$(CONFIG_CPU_SUP_AMD) += amd.o
+ifeq ($(CONFIG_AMD_NB)$(CONFIG_SYSFS),yy)
+obj-y += amd_cache_disable.o
+endif
+obj-$(CONFIG_CPU_SUP_HYGON) += hygon.o
obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
+obj-$(CONFIG_CPU_SUP_ZHAOXIN) += zhaoxin.o
+obj-$(CONFIG_CPU_SUP_VORTEX_32) += vortex.o
-obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
-
-obj-$(CONFIG_X86_MCE) += mcheck/
+obj-$(CONFIG_X86_MCE) += mce/
obj-$(CONFIG_MTRR) += mtrr/
-obj-$(CONFIG_CPU_FREQ) += cpufreq/
+obj-$(CONFIG_MICROCODE) += microcode/
+obj-$(CONFIG_X86_CPU_RESCTRL) += resctrl/
+obj-$(CONFIG_X86_SGX) += sgx/
obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
+obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o
+obj-$(CONFIG_BHYVE_GUEST) += bhyve.o
+obj-$(CONFIG_ACRN_GUEST) += acrn.o
+
+obj-$(CONFIG_DEBUG_FS) += debugfs.o
+
+obj-$(CONFIG_X86_BUS_LOCK_DETECT) += bus_lock.o
+
quiet_cmd_mkcapflags = MKCAP $@
- cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
+ cmd_mkcapflags = $(CONFIG_SHELL) $(src)/mkcapflags.sh $@ $^
-cpufeature = $(src)/../../include/asm/cpufeature.h
+cpufeature = $(src)/../../include/asm/cpufeatures.h
+vmxfeature = $(src)/../../include/asm/vmxfeatures.h
-targets += capflags.c
-$(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.pl FORCE
+$(obj)/capflags.c: $(cpufeature) $(vmxfeature) $(src)/mkcapflags.sh FORCE
$(call if_changed,mkcapflags)
+targets += capflags.c
diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c
new file mode 100644
index 000000000000..2c5b51aad91a
--- /dev/null
+++ b/arch/x86/kernel/cpu/acrn.c
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ACRN detection support
+ *
+ * Copyright (C) 2019 Intel Corporation. All rights reserved.
+ *
+ * Jason Chen CJ <jason.cj.chen@intel.com>
+ * Zhao Yakui <yakui.zhao@intel.com>
+ *
+ */
+
+#include <linux/interrupt.h>
+
+#include <asm/acrn.h>
+#include <asm/apic.h>
+#include <asm/cpufeatures.h>
+#include <asm/desc.h>
+#include <asm/hypervisor.h>
+#include <asm/idtentry.h>
+#include <asm/irq_regs.h>
+
+static u32 __init acrn_detect(void)
+{
+ return acrn_cpuid_base();
+}
+
+static void __init acrn_init_platform(void)
+{
+ /* Install system interrupt handler for ACRN hypervisor callback */
+ sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_acrn_hv_callback);
+
+ x86_platform.calibrate_tsc = acrn_get_tsc_khz;
+ x86_platform.calibrate_cpu = acrn_get_tsc_khz;
+}
+
+static bool acrn_x2apic_available(void)
+{
+ return boot_cpu_has(X86_FEATURE_X2APIC);
+}
+
+static void (*acrn_intr_handler)(void);
+
+DEFINE_IDTENTRY_SYSVEC(sysvec_acrn_hv_callback)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ /*
+ * The hypervisor requires that the APIC EOI should be acked.
+ * If the APIC EOI is not acked, the APIC ISR bit for the
+ * HYPERVISOR_CALLBACK_VECTOR will not be cleared and then it
+ * will block the interrupt whose vector is lower than
+ * HYPERVISOR_CALLBACK_VECTOR.
+ */
+ apic_eoi();
+ inc_irq_stat(irq_hv_callback_count);
+
+ if (acrn_intr_handler)
+ acrn_intr_handler();
+
+ set_irq_regs(old_regs);
+}
+
+void acrn_setup_intr_handler(void (*handler)(void))
+{
+ acrn_intr_handler = handler;
+}
+EXPORT_SYMBOL_GPL(acrn_setup_intr_handler);
+
+void acrn_remove_intr_handler(void)
+{
+ acrn_intr_handler = NULL;
+}
+EXPORT_SYMBOL_GPL(acrn_remove_intr_handler);
+
+const __initconst struct hypervisor_x86 x86_hyper_acrn = {
+ .name = "ACRN",
+ .detect = acrn_detect,
+ .type = X86_HYPER_ACRN,
+ .init.init_platform = acrn_init_platform,
+ .init.x2apic_available = acrn_x2apic_available,
+};
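
acrn_setup_intr_handler()/acrn_remove_intr_handler() above simply publish a single callback pointer that the sysvec consults on every hypervisor upcall. The same pattern reduced to hosted C; the function names and the "vector firing" driver loop are illustrative only.

	#include <stddef.h>
	#include <stdio.h>

	static void (*intr_handler)(void);      /* mirrors acrn_intr_handler */

	static void setup_intr_handler(void (*handler)(void)) { intr_handler = handler; }
	static void remove_intr_handler(void)                  { intr_handler = NULL; }

	/* stands in for the HYPERVISOR_CALLBACK_VECTOR entry point */
	static void callback_vector_fires(void)
	{
		if (intr_handler)               /* dispatch only while a consumer is registered */
			intr_handler();
		else
			printf("callback ignored\n");
	}

	static void my_handler(void) { printf("handled hypervisor upcall\n"); }

	int main(void)
	{
		callback_vector_fires();        /* ignored: nothing registered yet */
		setup_intr_handler(my_handler);
		callback_vector_fires();        /* dispatched */
		remove_intr_handler();
		callback_vector_fires();        /* ignored again */
		return 0;
	}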
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
deleted file mode 100644
index c965e5212714..000000000000
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * Routines to indentify additional cpu features that are scattered in
- * cpuid space.
- */
-#include <linux/cpu.h>
-
-#include <asm/pat.h>
-#include <asm/processor.h>
-
-#include <asm/apic.h>
-
-struct cpuid_bit {
- u16 feature;
- u8 reg;
- u8 bit;
- u32 level;
-};
-
-enum cpuid_regs {
- CR_EAX = 0,
- CR_ECX,
- CR_EDX,
- CR_EBX
-};
-
-void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
-{
- u32 max_level;
- u32 regs[4];
- const struct cpuid_bit *cb;
-
- static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
- { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
- { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 },
- { 0, 0, 0, 0 }
- };
-
- for (cb = cpuid_bits; cb->feature; cb++) {
-
- /* Verify that the level is valid */
- max_level = cpuid_eax(cb->level & 0xffff0000);
- if (max_level < cb->level ||
- max_level > (cb->level | 0xffff))
- continue;
-
- cpuid(cb->level, &regs[CR_EAX], &regs[CR_EBX],
- &regs[CR_ECX], &regs[CR_EDX]);
-
- if (regs[cb->reg] & (1 << cb->bit))
- set_cpu_cap(c, cb->feature);
- }
-}
-
-/* leaf 0xb SMT level */
-#define SMT_LEVEL 0
-
-/* leaf 0xb sub-leaf types */
-#define INVALID_TYPE 0
-#define SMT_TYPE 1
-#define CORE_TYPE 2
-
-#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff)
-#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f)
-#define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff)
-
-/*
- * Check for extended topology enumeration cpuid leaf 0xb and if it
- * exists, use it for populating initial_apicid and cpu topology
- * detection.
- */
-void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_SMP
- unsigned int eax, ebx, ecx, edx, sub_index;
- unsigned int ht_mask_width, core_plus_mask_width;
- unsigned int core_select_mask, core_level_siblings;
-
- if (c->cpuid_level < 0xb)
- return;
-
- cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
-
- /*
- * check if the cpuid leaf 0xb is actually implemented.
- */
- if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE))
- return;
-
- set_cpu_cap(c, X86_FEATURE_XTOPOLOGY);
-
- /*
- * initial apic id, which also represents 32-bit extended x2apic id.
- */
- c->initial_apicid = edx;
-
- /*
- * Populate HT related information from sub-leaf level 0.
- */
- core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx);
- core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
-
- sub_index = 1;
- do {
- cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx);
-
- /*
- * Check for the Core type in the implemented sub leaves.
- */
- if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) {
- core_level_siblings = LEVEL_MAX_SIBLINGS(ebx);
- core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
- break;
- }
-
- sub_index++;
- } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE);
-
- core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width;
-
- c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, ht_mask_width)
- & core_select_mask;
- c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, core_plus_mask_width);
- /*
- * Reinit the apicid, now that we have extended initial_apicid.
- */
- c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
-
- c->x86_max_cores = (core_level_siblings / smp_num_siblings);
-
-
- printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
- c->phys_proc_id);
- if (c->x86_max_cores > 1)
- printk(KERN_INFO "CPU: Processor Core ID: %d\n",
- c->cpu_core_id);
- return;
-#endif
-}
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 63fddcd082cd..bc94ff1e250a 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1,43 +1,100 @@
-#include <linux/init.h>
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/export.h>
#include <linux/bitops.h>
+#include <linux/elf.h>
#include <linux/mm.h>
-
-#include <asm/io.h>
+#include <linux/kvm_types.h>
+#include <linux/io.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+#include <linux/random.h>
+#include <linux/topology.h>
+#include <linux/platform_data/x86/amd-fch.h>
#include <asm/processor.h>
#include <asm/apic.h>
+#include <asm/cacheinfo.h>
#include <asm/cpu.h>
+#include <asm/cpu_device_id.h>
+#include <asm/spec-ctrl.h>
+#include <asm/smp.h>
+#include <asm/numa.h>
#include <asm/pci-direct.h>
+#include <asm/delay.h>
+#include <asm/debugreg.h>
+#include <asm/resctrl.h>
+#include <asm/msr.h>
+#include <asm/sev.h>
#ifdef CONFIG_X86_64
-# include <asm/numa_64.h>
# include <asm/mmconfig.h>
-# include <asm/cacheflush.h>
#endif
#include "cpu.h"
-#ifdef CONFIG_X86_32
+u16 invlpgb_count_max __ro_after_init = 1;
+
+static inline int rdmsrq_amd_safe(unsigned msr, u64 *p)
+{
+ u32 gprs[8] = { 0 };
+ int err;
+
+ WARN_ONCE((boot_cpu_data.x86 != 0xf),
+ "%s should only be used on K8!\n", __func__);
+
+ gprs[1] = msr;
+ gprs[7] = 0x9c5a203a;
+
+ err = rdmsr_safe_regs(gprs);
+
+ *p = gprs[0] | ((u64)gprs[2] << 32);
+
+ return err;
+}
+
+static inline int wrmsrq_amd_safe(unsigned msr, u64 val)
+{
+ u32 gprs[8] = { 0 };
+
+ WARN_ONCE((boot_cpu_data.x86 != 0xf),
+ "%s should only be used on K8!\n", __func__);
+
+ gprs[0] = (u32)val;
+ gprs[1] = msr;
+ gprs[2] = val >> 32;
+ gprs[7] = 0x9c5a203a;
+
+ return wrmsr_safe_regs(gprs);
+}
+
/*
* B step AMD K6 before B 9730xxxx have hardware bugs that can cause
* misexecution of code under Linux. Owners of such processors should
* contact AMD for precise details and a CPU swap.
*
* See http://www.multimania.com/poulot/k6bug.html
- * http://www.amd.com/K6/k6docs/revgd.html
+ * and section 2.6.2 of "AMD-K6 Processor Revision Guide - Model 6"
+ * (Publication # 21266 Issue Date: August 1998)
*
* The following test is erm.. interesting. AMD neglected to up
* the chip setting when fixing the bug but they also tweaked some
* performance at the same time..
*/
-extern void vide(void);
-__asm__(".align 4\nvide: ret");
+#ifdef CONFIG_X86_32
+extern __visible void vide(void);
+__asm__(".text\n"
+ ".globl vide\n"
+ ".type vide, @function\n"
+ ".align 4\n"
+ "vide: ret\n");
+#endif
-static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c)
+static void init_amd_k5(struct cpuinfo_x86 *c)
{
+#ifdef CONFIG_X86_32
/*
* General Systems BIOSen alias the cpu frequency registers
- * of the Elan at 0x000df000. Unfortuantly, one of the Linux
+ * of the Elan at 0x000df000. Unfortunately, one of the Linux
* drivers subsequently pokes it, and changes the CPU speed.
* Workaround : Remove the unneeded alias.
*/
@@ -45,16 +102,17 @@ static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c)
#define CBAR_ENB (0x80000000)
#define CBAR_KEY (0X000000CB)
if (c->x86_model == 9 || c->x86_model == 10) {
- if (inl (CBAR) & CBAR_ENB)
- outl (0 | CBAR_KEY, CBAR);
+ if (inl(CBAR) & CBAR_ENB)
+ outl(0 | CBAR_KEY, CBAR);
}
+#endif
}
-
-static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
+static void init_amd_k6(struct cpuinfo_x86 *c)
{
+#ifdef CONFIG_X86_32
u32 l, h;
- int mbytes = num_physpages >> (20-PAGE_SHIFT);
+ int mbytes = get_num_physpages() >> (20-PAGE_SHIFT);
if (c->x86_model < 6) {
/* Based on AMD doc 20734R - June 2000 */
@@ -65,13 +123,13 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
return;
}
- if (c->x86_model == 6 && c->x86_mask == 1) {
+ if (c->x86_model == 6 && c->x86_stepping == 1) {
const int K6_BUG_LOOP = 1000000;
int n;
void (*f_vide)(void);
- unsigned long d, d2;
+ u64 d, d2;
- printk(KERN_INFO "AMD K6 stepping B detected - ");
+ pr_info("AMD K6 stepping B detected - ");
/*
* It looks like AMD fixed the 2.6.2 bug and improved indirect
@@ -80,22 +138,22 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
n = K6_BUG_LOOP;
f_vide = vide;
- rdtscl(d);
+ OPTIMIZER_HIDE_VAR(f_vide);
+ d = rdtsc();
while (n--)
f_vide();
- rdtscl(d2);
+ d2 = rdtsc();
d = d2-d;
if (d > 20*K6_BUG_LOOP)
- printk("system stability may be impaired when more than 32 MB are used.\n");
+ pr_cont("system stability may be impaired when more than 32 MB are used.\n");
else
- printk("probably OK (after B9730xxxx).\n");
- printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n");
+ pr_cont("probably OK (after B9730xxxx).\n");
}
/* K6 with old style WHCR */
if (c->x86_model < 8 ||
- (c->x86_model == 8 && c->x86_mask < 8)) {
+ (c->x86_model == 8 && c->x86_stepping < 8)) {
/* We can only write allocate on the low 508Mb */
if (mbytes > 508)
mbytes = 508;
@@ -108,13 +166,13 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
wbinvd();
wrmsr(MSR_K6_WHCR, l, h);
local_irq_restore(flags);
- printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n",
+ pr_info("Enabling old style K6 write allocation for %d Mb\n",
mbytes);
}
return;
}
- if ((c->x86_model == 8 && c->x86_mask > 7) ||
+ if ((c->x86_model == 8 && c->x86_stepping > 7) ||
c->x86_model == 9 || c->x86_model == 13) {
/* The more serious chips .. */
@@ -129,7 +187,7 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
wbinvd();
wrmsr(MSR_K6_WHCR, l, h);
local_irq_restore(flags);
- printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n",
+ pr_info("Enabling new style K6 write allocation for %d Mb\n",
mbytes);
}
@@ -141,13 +199,43 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
/* placeholder for any needed mods */
return;
}
+#endif
}
-static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
+static void init_amd_k7(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_SMP
+#ifdef CONFIG_X86_32
+ u32 l, h;
+
+ /*
+ * Bit 15 of Athlon specific MSR 15, needs to be 0
+ * to enable SSE on Palomino/Morgan/Barton CPU's.
+ * If the BIOS didn't enable it already, enable it here.
+ */
+ if (c->x86_model >= 6 && c->x86_model <= 10) {
+ if (!cpu_has(c, X86_FEATURE_XMM)) {
+ pr_info("Enabling disabled K7/SSE Support.\n");
+ msr_clear_bit(MSR_K7_HWCR, 15);
+ set_cpu_cap(c, X86_FEATURE_XMM);
+ }
+ }
+
+ /*
+ * It's been determined by AMD that Athlons since model 8 stepping 1
+ * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
+ * As per AMD technical note 27212 0.2
+ */
+ if ((c->x86_model == 8 && c->x86_stepping >= 1) || (c->x86_model > 8)) {
+ rdmsr(MSR_K7_CLK_CTL, l, h);
+ if ((l & 0xfff00000) != 0x20000000) {
+ pr_info("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n",
+ l, ((l & 0x000fffff)|0x20000000));
+ wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
+ }
+ }
+
/* calling is from identify_secondary_cpu() ? */
- if (c->cpu_index == boot_cpu_id)
+ if (!c->cpu_index)
return;
/*
@@ -155,13 +243,13 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
* but they are not certified as MP capable.
*/
/* Athlon 660/661 is valid. */
- if ((c->x86_model == 6) && ((c->x86_mask == 0) ||
- (c->x86_mask == 1)))
- goto valid_k7;
+ if ((c->x86_model == 6) && ((c->x86_stepping == 0) ||
+ (c->x86_stepping == 1)))
+ return;
/* Duron 670 is valid */
- if ((c->x86_model == 7) && (c->x86_mask == 0))
- goto valid_k7;
+ if ((c->x86_model == 7) && (c->x86_stepping == 0))
+ return;
/*
* Athlon 662, Duron 671, and Athlon >model 7 have capability
@@ -170,11 +258,11 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
* See http://www.heise.de/newsticker/data/jow-18.10.01-000 for
* more.
*/
- if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
- ((c->x86_model == 7) && (c->x86_mask >= 1)) ||
+ if (((c->x86_model == 6) && (c->x86_stepping >= 2)) ||
+ ((c->x86_model == 7) && (c->x86_stepping >= 1)) ||
(c->x86_model > 7))
- if (cpu_has_mp)
- goto valid_k7;
+ if (cpu_has(c, X86_FEATURE_MP))
+ return;
/* If we get here, not a certified SMP capable AMD system. */
@@ -183,66 +271,27 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
* approved Athlon
*/
WARN_ONCE(1, "WARNING: This combination of AMD"
- "processors is not suitable for SMP.\n");
- if (!test_taint(TAINT_UNSAFE_SMP))
- add_taint(TAINT_UNSAFE_SMP);
-
-valid_k7:
- ;
+ " processors is not suitable for SMP.\n");
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE);
#endif
}
-static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
-{
- u32 l, h;
-
- /*
- * Bit 15 of Athlon specific MSR 15, needs to be 0
- * to enable SSE on Palomino/Morgan/Barton CPU's.
- * If the BIOS didn't enable it already, enable it here.
- */
- if (c->x86_model >= 6 && c->x86_model <= 10) {
- if (!cpu_has(c, X86_FEATURE_XMM)) {
- printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
- rdmsr(MSR_K7_HWCR, l, h);
- l &= ~0x00008000;
- wrmsr(MSR_K7_HWCR, l, h);
- set_cpu_cap(c, X86_FEATURE_XMM);
- }
- }
-
- /*
- * It's been determined by AMD that Athlons since model 8 stepping 1
- * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
- * As per AMD technical note 27212 0.2
- */
- if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
- rdmsr(MSR_K7_CLK_CTL, l, h);
- if ((l & 0xfff00000) != 0x20000000) {
- printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l,
- ((l & 0x000fffff)|0x20000000));
- wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
- }
- }
-
- set_cpu_cap(c, X86_FEATURE_K7);
-
- amd_k7_smp_check(c);
-}
-#endif
-
-#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
-static int __cpuinit nearby_node(int apicid)
+#ifdef CONFIG_NUMA
+/*
+ * To workaround broken NUMA config. Read the comment in
+ * srat_detect_node().
+ */
+static int nearby_node(int apicid)
{
int i, node;
for (i = apicid - 1; i >= 0; i--) {
- node = apicid_to_node[i];
+ node = __apicid_to_node[i];
if (node != NUMA_NO_NODE && node_online(node))
return node;
}
for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
- node = apicid_to_node[i];
+ node = __apicid_to_node[i];
if (node != NUMA_NO_NODE && node_online(node))
return node;
}
@@ -250,91 +299,325 @@ static int __cpuinit nearby_node(int apicid)
}
#endif
-/*
- * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
- * Assumes number of cores is a power of two.
- */
-static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
+static void srat_detect_node(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_X86_HT
- unsigned bits;
+#ifdef CONFIG_NUMA
int cpu = smp_processor_id();
+ int node;
+ unsigned apicid = c->topo.apicid;
- bits = c->x86_coreid_bits;
- /* Low order bits define the core id (index of core in socket) */
- c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
- /* Convert the initial APIC ID into the socket ID */
- c->phys_proc_id = c->initial_apicid >> bits;
- /* use socket ID also for last level cache */
- per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
-#endif
-}
+ node = numa_cpu_node(cpu);
+ if (node == NUMA_NO_NODE)
+ node = per_cpu_llc_id(cpu);
-static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
-{
-#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
- int cpu = smp_processor_id();
- int node;
- unsigned apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid;
+ /*
+ * On multi-fabric platform (e.g. Numascale NumaChip) a
+ * platform-specific handler needs to be called to fixup some
+ * IDs of the CPU.
+ */
+ if (x86_cpuinit.fixup_cpu_id)
+ x86_cpuinit.fixup_cpu_id(c, node);
- node = c->phys_proc_id;
- if (apicid_to_node[apicid] != NUMA_NO_NODE)
- node = apicid_to_node[apicid];
if (!node_online(node)) {
- /* Two possibilities here:
- - The CPU is missing memory and no node was created.
- In that case try picking one from a nearby CPU
- - The APIC IDs differ from the HyperTransport node IDs
- which the K8 northbridge parsing fills in.
- Assume they are all increased by a constant offset,
- but in the same order as the HT nodeids.
- If that doesn't result in a usable node fall back to the
- path for the previous case. */
-
- int ht_nodeid = c->initial_apicid;
-
- if (ht_nodeid >= 0 &&
- apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
- node = apicid_to_node[ht_nodeid];
+ /*
+ * Two possibilities here:
+ *
+ * - The CPU is missing memory and no node was created. In
+ * that case try picking one from a nearby CPU.
+ *
+ * - The APIC IDs differ from the HyperTransport node IDs
+ * which the K8 northbridge parsing fills in. Assume
+ * they are all increased by a constant offset, but in
+ * the same order as the HT nodeids. If that doesn't
+ * result in a usable node fall back to the path for the
+ * previous case.
+ *
+ * This workaround operates directly on the mapping between
+ * APIC ID and NUMA node, assuming certain relationship
+ * between APIC ID, HT node ID and NUMA topology. As going
+ * through CPU mapping may alter the outcome, directly
+ * access __apicid_to_node[].
+ */
+ int ht_nodeid = c->topo.initial_apicid;
+
+ if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
+ node = __apicid_to_node[ht_nodeid];
/* Pick a nearby node */
if (!node_online(node))
node = nearby_node(apicid);
}
numa_set_node(cpu, node);
-
- printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node);
#endif
}
-static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
+static void bsp_determine_snp(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_X86_HT
- unsigned bits, ecx;
+#ifdef CONFIG_ARCH_HAS_CC_PLATFORM
+ cc_vendor = CC_VENDOR_AMD;
+
+ if (cpu_has(c, X86_FEATURE_SEV_SNP)) {
+ /*
+ * RMP table entry format is not architectural and is defined by the
+ * per-processor PPR. Restrict SNP support on the known CPU models
+ * for which the RMP table entry format is currently defined or for
+ * processors which support the architecturally defined RMPREAD
+ * instruction.
+ */
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR) &&
+ (cpu_feature_enabled(X86_FEATURE_ZEN3) ||
+ cpu_feature_enabled(X86_FEATURE_ZEN4) ||
+ cpu_feature_enabled(X86_FEATURE_RMPREAD)) &&
+ snp_probe_rmptable_info()) {
+ cc_platform_set(CC_ATTR_HOST_SEV_SNP);
+ } else {
+ setup_clear_cpu_cap(X86_FEATURE_SEV_SNP);
+ cc_platform_clear(CC_ATTR_HOST_SEV_SNP);
+ }
+ }
+#endif
+}
+
+#define ZEN_MODEL_STEP_UCODE(fam, model, step, ucode) \
+ X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, fam, model), \
+ step, step, ucode)
+
+static const struct x86_cpu_id amd_tsa_microcode[] = {
+ ZEN_MODEL_STEP_UCODE(0x19, 0x01, 0x1, 0x0a0011d7),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x01, 0x2, 0x0a00123b),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x08, 0x2, 0x0a00820d),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x11, 0x1, 0x0a10114c),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x11, 0x2, 0x0a10124c),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x18, 0x1, 0x0a108109),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x21, 0x0, 0x0a20102e),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x21, 0x2, 0x0a201211),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x44, 0x1, 0x0a404108),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x50, 0x0, 0x0a500012),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x61, 0x2, 0x0a60120a),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x74, 0x1, 0x0a704108),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x75, 0x2, 0x0a705208),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x78, 0x0, 0x0a708008),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x7c, 0x0, 0x0a70c008),
+ ZEN_MODEL_STEP_UCODE(0x19, 0xa0, 0x2, 0x0aa00216),
+ {},
+};
- /* Multi core CPU? */
- if (c->extended_cpuid_level < 0x80000008)
+static void tsa_init(struct cpuinfo_x86 *c)
+{
+ if (cpu_has(c, X86_FEATURE_HYPERVISOR))
return;
- ecx = cpuid_ecx(0x80000008);
+ if (cpu_has(c, X86_FEATURE_ZEN3) ||
+ cpu_has(c, X86_FEATURE_ZEN4)) {
+ if (x86_match_min_microcode_rev(amd_tsa_microcode))
+ setup_force_cpu_cap(X86_FEATURE_VERW_CLEAR);
+ else
+ pr_debug("%s: current revision: 0x%x\n", __func__, c->microcode);
+ } else {
+ setup_force_cpu_cap(X86_FEATURE_TSA_SQ_NO);
+ setup_force_cpu_cap(X86_FEATURE_TSA_L1_NO);
+ }
+}
- c->x86_max_cores = (ecx & 0xff) + 1;
+static void bsp_init_amd(struct cpuinfo_x86 *c)
+{
+ if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
- /* CPU telling us the core id bits shift? */
- bits = (ecx >> 12) & 0xF;
+ if (c->x86 > 0x10 ||
+ (c->x86 == 0x10 && c->x86_model >= 0x2)) {
+ u64 val;
- /* Otherwise recompute */
- if (bits == 0) {
- while ((1 << bits) < c->x86_max_cores)
- bits++;
+ rdmsrq(MSR_K7_HWCR, val);
+ if (!(val & BIT(24)))
+ pr_warn(FW_BUG "TSC doesn't count with P0 frequency!\n");
+ }
}
- c->x86_coreid_bits = bits;
-#endif
+ if (c->x86 == 0x15) {
+ unsigned long upperbit;
+ u32 cpuid, assoc;
+
+ cpuid = cpuid_edx(0x80000005);
+ assoc = cpuid >> 16 & 0xff;
+ upperbit = ((cpuid >> 24) << 10) / assoc;
+
+ va_align.mask = (upperbit - 1) & PAGE_MASK;
+ va_align.flags = ALIGN_VA_32 | ALIGN_VA_64;
+
+ /* A random value per boot for bit slice [12:upper_bit) */
+ va_align.bits = get_random_u32() & va_align.mask;
+ }
+
+ if (cpu_has(c, X86_FEATURE_MWAITX))
+ use_mwaitx_delay();
+
+ if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) &&
+ !boot_cpu_has(X86_FEATURE_VIRT_SSBD) &&
+ c->x86 >= 0x15 && c->x86 <= 0x17) {
+ unsigned int bit;
+
+ switch (c->x86) {
+ case 0x15: bit = 54; break;
+ case 0x16: bit = 33; break;
+ case 0x17: bit = 10; break;
+ default: return;
+ }
+ /*
+ * Try to cache the base value so further operations can
+ * avoid RMW. If that faults, do not enable SSBD.
+ */
+ if (!rdmsrq_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) {
+ setup_force_cpu_cap(X86_FEATURE_LS_CFG_SSBD);
+ setup_force_cpu_cap(X86_FEATURE_SSBD);
+ x86_amd_ls_cfg_ssbd_mask = 1ULL << bit;
+ }
+ }
+
+ resctrl_cpu_detect(c);
+
+ /* Figure out Zen generations: */
+ switch (c->x86) {
+ case 0x17:
+ switch (c->x86_model) {
+ case 0x00 ... 0x2f:
+ case 0x50 ... 0x5f:
+ setup_force_cpu_cap(X86_FEATURE_ZEN1);
+ break;
+ case 0x30 ... 0x4f:
+ case 0x60 ... 0x7f:
+ case 0x90 ... 0x91:
+ case 0xa0 ... 0xaf:
+ setup_force_cpu_cap(X86_FEATURE_ZEN2);
+ break;
+ default:
+ goto warn;
+ }
+ break;
+
+ case 0x19:
+ switch (c->x86_model) {
+ case 0x00 ... 0x0f:
+ case 0x20 ... 0x5f:
+ setup_force_cpu_cap(X86_FEATURE_ZEN3);
+ break;
+ case 0x10 ... 0x1f:
+ case 0x60 ... 0xaf:
+ setup_force_cpu_cap(X86_FEATURE_ZEN4);
+ break;
+ default:
+ goto warn;
+ }
+ break;
+
+ case 0x1a:
+ switch (c->x86_model) {
+ case 0x00 ... 0x2f:
+ case 0x40 ... 0x4f:
+ case 0x60 ... 0x7f:
+ setup_force_cpu_cap(X86_FEATURE_ZEN5);
+ break;
+ case 0x50 ... 0x5f:
+ case 0x80 ... 0xaf:
+ case 0xc0 ... 0xcf:
+ setup_force_cpu_cap(X86_FEATURE_ZEN6);
+ break;
+ default:
+ goto warn;
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ bsp_determine_snp(c);
+ tsa_init(c);
+
+ if (cpu_has(c, X86_FEATURE_GP_ON_USER_CPUID))
+ setup_force_cpu_cap(X86_FEATURE_CPUID_FAULT);
+
+ return;
+
+warn:
+ WARN_ONCE(1, "Family 0x%x, model: 0x%x??\n", c->x86, c->x86_model);
+}
+
+static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)
+{
+ u64 msr;
+
+ /*
+ * Mark using WBINVD is needed during kexec on processors that
+ * support SME. This provides support for performing a successful
+ * kexec when going from SME inactive to SME active (or vice-versa).
+ *
+ * The cache must be cleared so that if there are entries with the
+ * same physical address, both with and without the encryption bit,
+ * they don't race each other when flushed and potentially end up
+ * with the wrong entry being committed to memory.
+ *
+ * Test the CPUID bit directly because with mem_encrypt=off the
+ * BSP will clear the X86_FEATURE_SME bit and the APs will not
+ * see it set after that.
+ */
+ if (c->extended_cpuid_level >= 0x8000001f && (cpuid_eax(0x8000001f) & BIT(0)))
+ __this_cpu_write(cache_state_incoherent, true);
+
+ /*
+ * BIOS support is required for SME and SEV.
+ * For SME: If BIOS has enabled SME then adjust x86_phys_bits by
+ * the SME physical address space reduction value.
+ * If BIOS has not enabled SME then don't advertise the
+ * SME feature (set in scattered.c).
+ * If the kernel has not enabled SME via any means then
+ * don't advertise the SME feature.
+ * For SEV: If BIOS has not enabled SEV then don't advertise SEV and
+ * any additional functionality based on it.
+ *
+ * In all cases, since support for SME and SEV requires long mode,
+ * don't advertise the feature under CONFIG_X86_32.
+ */
+ if (cpu_has(c, X86_FEATURE_SME) || cpu_has(c, X86_FEATURE_SEV)) {
+ /* Check if memory encryption is enabled */
+ rdmsrq(MSR_AMD64_SYSCFG, msr);
+ if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
+ goto clear_all;
+
+ /*
+ * Always adjust physical address bits. Even though this
+ * will be a value above 32-bits this is still done for
+ * CONFIG_X86_32 so that accurate values are reported.
+ */
+ c->x86_phys_bits -= (cpuid_ebx(0x8000001f) >> 6) & 0x3f;
+
+ if (IS_ENABLED(CONFIG_X86_32))
+ goto clear_all;
+
+ if (!sme_me_mask)
+ setup_clear_cpu_cap(X86_FEATURE_SME);
+
+ rdmsrq(MSR_K7_HWCR, msr);
+ if (!(msr & MSR_K7_HWCR_SMMLOCK))
+ goto clear_sev;
+
+ return;
+
+clear_all:
+ setup_clear_cpu_cap(X86_FEATURE_SME);
+clear_sev:
+ setup_clear_cpu_cap(X86_FEATURE_SEV);
+ setup_clear_cpu_cap(X86_FEATURE_SEV_ES);
+ setup_clear_cpu_cap(X86_FEATURE_SEV_SNP);
+ }
}
-static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
+static void early_init_amd(struct cpuinfo_x86 *c)
{
- early_init_amd_mc(c);
+ u32 dummy;
+
+ if (c->x86 >= 0xf)
+ set_cpu_cap(c, X86_FEATURE_K8);
+
+ rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
/*
* c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
@@ -345,31 +628,94 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
}
+ /* Bit 12 of 8000_0007 edx is accumulated power mechanism. */
+ if (c->x86_power & BIT(12))
+ set_cpu_cap(c, X86_FEATURE_ACC_POWER);
+
+ /* Bit 14 indicates the Runtime Average Power Limit interface. */
+ if (c->x86_power & BIT(14))
+ set_cpu_cap(c, X86_FEATURE_RAPL);
+
#ifdef CONFIG_X86_64
set_cpu_cap(c, X86_FEATURE_SYSCALL32);
#else
/* Set MTRR capability flag if appropriate */
if (c->x86 == 5)
if (c->x86_model == 13 || c->x86_model == 9 ||
- (c->x86_model == 8 && c->x86_mask >= 8))
+ (c->x86_model == 8 && c->x86_stepping >= 8))
set_cpu_cap(c, X86_FEATURE_K6_MTRR);
#endif
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI)
- /* check CPU config space for extended APIC ID */
- if (cpu_has_apic && c->x86 >= 0xf) {
- unsigned int val;
- val = read_pci_config(0, 24, 0, 0x68);
- if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18)))
+ /*
+ * ApicID can always be treated as an 8-bit value for AMD APIC versions
+ * >= 0x10, but even old K8s came out of reset with version 0x10. So, we
+ * can safely set X86_FEATURE_EXTD_APICID unconditionally for families
+ * after 16h.
+ */
+ if (boot_cpu_has(X86_FEATURE_APIC)) {
+ if (c->x86 > 0x16)
set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
+ else if (c->x86 >= 0xf) {
+ /* check CPU config space for extended APIC ID */
+ unsigned int val;
+
+ val = read_pci_config(0, 24, 0, 0x68);
+ if ((val >> 17 & 0x3) == 0x3)
+ set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
+ }
}
#endif
+
+ /*
+ * This is only needed to tell the kernel whether to use VMCALL
+ * and VMMCALL. VMMCALL is never executed except under virt, so
+ * we can set it unconditionally.
+ */
+ set_cpu_cap(c, X86_FEATURE_VMMCALL);
+
+ /* F16h erratum 793, CVE-2013-6885 */
+ if (c->x86 == 0x16 && c->x86_model <= 0xf)
+ msr_set_bit(MSR_AMD64_LS_CFG, 15);
+
+ early_detect_mem_encrypt(c);
+
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && !cpu_has(c, X86_FEATURE_IBPB_BRTYPE)) {
+ if (c->x86 == 0x17 && boot_cpu_has(X86_FEATURE_AMD_IBPB))
+ setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE);
+ else if (c->x86 >= 0x19 && !wrmsrq_safe(MSR_IA32_PRED_CMD, PRED_CMD_SBPB)) {
+ setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE);
+ setup_force_cpu_cap(X86_FEATURE_SBPB);
+ }
+ }
}
-static void __cpuinit init_amd(struct cpuinfo_x86 *c)
+static void init_amd_k8(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_SMP
- unsigned long long value;
+ u32 level;
+ u64 value;
+
+ /* On C+ stepping K8 rep microcode works well for copy/memset */
+ level = cpuid_eax(1);
+ if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+
+ /*
+ * Some BIOSes incorrectly force this feature, but only K8 revision D
+ * (model = 0x14) and later actually support it.
+ * (AMD Erratum #110, docId: 25759).
+ */
+ if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM) && !cpu_has(c, X86_FEATURE_HYPERVISOR)) {
+ clear_cpu_cap(c, X86_FEATURE_LAHF_LM);
+ if (!rdmsrq_amd_safe(0xc001100d, &value)) {
+ value &= ~BIT_64(32);
+ wrmsrq_amd_safe(0xc001100d, value);
+ }
+ }
+ if (!c->x86_model_id[0])
+ strscpy(c->x86_model_id, "Hammer");
+
+#ifdef CONFIG_SMP
/*
* Disable TLB flush filter by setting HWCR.FFDIS on K8
* bit 6 of msr C001_0015
@@ -377,156 +723,544 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
* Errata 63 for SH-B3 steppings
* Errata 122 for all steppings (F+ have it disabled by default)
*/
- if (c->x86 == 0xf) {
- rdmsrl(MSR_K7_HWCR, value);
- value |= 1 << 6;
- wrmsrl(MSR_K7_HWCR, value);
- }
+ msr_set_bit(MSR_K7_HWCR, 6);
#endif
+ set_cpu_bug(c, X86_BUG_SWAPGS_FENCE);
- early_init_amd(c);
+ /*
+ * Check models and steppings affected by erratum 400. This is
+ * used to select the proper idle routine and to enable the
+ * check whether the machine is affected in arch_post_acpi_subsys_init()
+ * which sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check.
+ */
+ if (c->x86_model > 0x41 ||
+ (c->x86_model == 0x41 && c->x86_stepping >= 0x2))
+ setup_force_cpu_bug(X86_BUG_AMD_E400);
+}
+
+static void init_amd_gh(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_MMCONF_FAM10H
+ /* do this for boot cpu */
+ if (c == &boot_cpu_data)
+ check_enable_amd_mmconf_dmi();
+
+ fam10h_check_enable_mmcfg();
+#endif
/*
- * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
- * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
+ * Disable GART TLB Walk Errors on Fam10h. We do this here because this
+ * is always needed when GART is enabled, even in a kernel which has no
+ * MCE support built in. BIOS should disable GartTlbWlk Errors already.
+ * If it doesn't, we do it here as suggested by the BKDG.
+ *
+ * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012
*/
- clear_cpu_cap(c, 0*32+31);
+ msr_set_bit(MSR_AMD64_MCx_MASK(4), 10);
-#ifdef CONFIG_X86_64
- /* On C+ stepping K8 rep microcode works well for copy/memset */
- if (c->x86 == 0xf) {
- u32 level;
+ /*
+ * On family 10h BIOS may not have properly enabled WC+ support, causing
+ * it to be converted to CD memtype. This may result in performance
+ * degradation for certain nested-paging guests. Prevent this conversion
+ * by clearing bit 24 in MSR_AMD64_BU_CFG2.
+ *
+ * NOTE: we want to use the _safe accessors so as not to #GP kvm
+ * guests on older kvm hosts.
+ */
+ msr_clear_bit(MSR_AMD64_BU_CFG2, 24);
- level = cpuid_eax(1);
- if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+ set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH);
- /*
- * Some BIOSes incorrectly force this feature, but only K8
- * revision D (model = 0x14) and later actually support it.
- */
- if (c->x86_model < 0x14)
- clear_cpu_cap(c, X86_FEATURE_LAHF_LM);
- }
- if (c->x86 == 0x10 || c->x86 == 0x11)
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
-#else
+ /*
+ * Check models and steppings affected by erratum 400. This is
+ * used to select the proper idle routine and to enable the
+ * check whether the machine is affected in arch_post_acpi_subsys_init()
+ * which sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check.
+ */
+ if (c->x86_model > 0x2 ||
+ (c->x86_model == 0x2 && c->x86_stepping >= 0x1))
+ setup_force_cpu_bug(X86_BUG_AMD_E400);
+}
+static void init_amd_ln(struct cpuinfo_x86 *c)
+{
/*
- * FIXME: We should handle the K5 here. Set up the write
- * range and also turn on MSR 83 bits 4 and 31 (write alloc,
- * no bus pipeline)
+ * Apply erratum 665 fix unconditionally so machines without a BIOS
+ * fix work.
*/
+ msr_set_bit(MSR_AMD64_DE_CFG, 31);
+}
- switch (c->x86) {
- case 4:
- init_amd_k5(c);
- break;
- case 5:
- init_amd_k6(c);
- break;
- case 6: /* An Athlon/Duron */
- init_amd_k7(c);
- break;
+static bool rdrand_force;
+
+static int __init rdrand_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "force"))
+ rdrand_force = true;
+ else
+ return -EINVAL;
+
+ return 0;
+}
+early_param("rdrand", rdrand_cmdline);
+
+static void clear_rdrand_cpuid_bit(struct cpuinfo_x86 *c)
+{
+ /*
+ * Saving of the MSR used to hide the RDRAND support during
+ * suspend/resume is done by arch/x86/power/cpu.c, which is
+ * dependent on CONFIG_PM_SLEEP.
+ */
+ if (!IS_ENABLED(CONFIG_PM_SLEEP))
+ return;
+
+ /*
+ * The self-test can clear X86_FEATURE_RDRAND, so check for
+ * RDRAND support using the CPUID function directly.
+ */
+ if (!(cpuid_ecx(1) & BIT(30)) || rdrand_force)
+ return;
+
+ msr_clear_bit(MSR_AMD64_CPUID_FN_1, 62);
+
+ /*
+ * Verify that the CPUID change has occurred in case the kernel is
+ * running virtualized and the hypervisor doesn't support the MSR.
+ */
+ if (cpuid_ecx(1) & BIT(30)) {
+ pr_info_once("BIOS may not properly restore RDRAND after suspend, but hypervisor does not support hiding RDRAND via CPUID.\n");
+ return;
}
- /* K6s reports MCEs but don't actually have all the MSRs */
- if (c->x86 < 6)
- clear_cpu_cap(c, X86_FEATURE_MCE);
-#endif
+ clear_cpu_cap(c, X86_FEATURE_RDRAND);
+ pr_info_once("BIOS may not properly restore RDRAND after suspend, hiding RDRAND via CPUID. Use rdrand=force to reenable.\n");
+}
+
+static void init_amd_jg(struct cpuinfo_x86 *c)
+{
+ /*
+ * Some BIOS implementations do not restore proper RDRAND support
+ * across suspend and resume. Check on whether to hide the RDRAND
+ * instruction support via CPUID.
+ */
+ clear_rdrand_cpuid_bit(c);
+}
- /* Enable workaround for FXSAVE leak */
- if (c->x86 >= 6)
- set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
+static void init_amd_bd(struct cpuinfo_x86 *c)
+{
+ u64 value;
- if (!c->x86_model_id[0]) {
- switch (c->x86) {
- case 0xf:
- /* Should distinguish Models here, but this is only
- a fallback anyways. */
- strcpy(c->x86_model_id, "Hammer");
- break;
+ /*
+ * The Way Access Filter has a performance penalty on some workloads.
+ * Disable it on the affected CPUs.
+ */
+ if ((c->x86_model >= 0x02) && (c->x86_model < 0x20)) {
+ if (!rdmsrq_safe(MSR_F15H_IC_CFG, &value) && !(value & 0x1E)) {
+ value |= 0x1E;
+ wrmsrq_safe(MSR_F15H_IC_CFG, value);
}
}
- display_cacheinfo(c);
+ /*
+ * Some BIOS implementations do not restore proper RDRAND support
+ * across suspend and resume. Check on whether to hide the RDRAND
+ * instruction support via CPUID.
+ */
+ clear_rdrand_cpuid_bit(c);
+}
+
+static const struct x86_cpu_id erratum_1386_microcode[] = {
+ X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, 0x17, 0x01), 0x2, 0x2, 0x0800126e),
+ X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, 0x17, 0x31), 0x0, 0x0, 0x08301052),
+ {}
+};
+
+static void fix_erratum_1386(struct cpuinfo_x86 *c)
+{
+ /*
+ * Work around Erratum 1386. The XSAVES instruction malfunctions in
+ * certain circumstances on Zen1/2 uarch, and not all parts have had
+ * updated microcode at the time of writing (March 2023).
+ *
+ * Affected parts all have no supervisor XSAVE states, meaning that
+ * the XSAVEC instruction (which works fine) is equivalent.
+ *
+ * Clear the feature flag only on microcode revisions which
+ * don't have the fix.
+ */
+ if (x86_match_min_microcode_rev(erratum_1386_microcode))
+ return;
- /* Multi core CPU? */
- if (c->extended_cpuid_level >= 0x80000008) {
- amd_detect_cmp(c);
- srat_detect_node(c);
+ clear_cpu_cap(c, X86_FEATURE_XSAVES);
+}
+
+void init_spectral_chicken(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_MITIGATION_UNRET_ENTRY
+ u64 value;
+
+ /*
+ * On Zen2 we offer this chicken (bit) on the altar of Speculation.
+ *
+ * This suppresses speculation from the middle of a basic block, i.e. it
+ * suppresses non-branch predictions.
+ */
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) {
+ if (!rdmsrq_safe(MSR_ZEN2_SPECTRAL_CHICKEN, &value)) {
+ value |= MSR_ZEN2_SPECTRAL_CHICKEN_BIT;
+ wrmsrq_safe(MSR_ZEN2_SPECTRAL_CHICKEN, value);
+ }
}
+#endif
+}
-#ifdef CONFIG_X86_32
- detect_ht(c);
+static void init_amd_zen_common(void)
+{
+ setup_force_cpu_cap(X86_FEATURE_ZEN);
+#ifdef CONFIG_NUMA
+ node_reclaim_distance = 32;
#endif
+}
- if (c->extended_cpuid_level >= 0x80000006) {
- if ((c->x86 >= 0x0f) && (cpuid_edx(0x80000006) & 0xf000))
- num_cache_leaves = 4;
- else
- num_cache_leaves = 3;
+static void init_amd_zen1(struct cpuinfo_x86 *c)
+{
+ fix_erratum_1386(c);
+
+ /* Fix up CPUID bits, but only if not virtualised. */
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) {
+
+ /* Erratum 1076: CPB feature bit not being set in CPUID. */
+ if (!cpu_has(c, X86_FEATURE_CPB))
+ set_cpu_cap(c, X86_FEATURE_CPB);
}
- if (c->x86 >= 0xf && c->x86 <= 0x11)
- set_cpu_cap(c, X86_FEATURE_K8);
+ pr_notice_once("AMD Zen1 DIV0 bug detected. Disable SMT for full protection.\n");
+ setup_force_cpu_bug(X86_BUG_DIV0);
- if (cpu_has_xmm2) {
- /* MFENCE stops RDTSC speculation */
- set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
+ /*
+ * Turn off the Instructions Retired free counter on machines that are
+ * susceptible to erratum #1054 "Instructions Retired Performance
+ * Counter May Be Inaccurate".
+ */
+ if (c->x86_model < 0x30) {
+ msr_clear_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT);
+ clear_cpu_cap(c, X86_FEATURE_IRPERF);
}
+}
-#ifdef CONFIG_X86_64
- if (c->x86 == 0x10) {
- /* do this for boot cpu */
- if (c == &boot_cpu_data)
- check_enable_amd_mmconf_dmi();
+static bool cpu_has_zenbleed_microcode(void)
+{
+ u32 good_rev = 0;
+
+ switch (boot_cpu_data.x86_model) {
+ case 0x30 ... 0x3f: good_rev = 0x0830107b; break;
+ case 0x60 ... 0x67: good_rev = 0x0860010c; break;
+ case 0x68 ... 0x6f: good_rev = 0x08608107; break;
+ case 0x70 ... 0x7f: good_rev = 0x08701033; break;
+ case 0xa0 ... 0xaf: good_rev = 0x08a00009; break;
+
+ default:
+ return false;
+ }
+
+ if (boot_cpu_data.microcode < good_rev)
+ return false;
+
+ return true;
+}
- fam10h_check_enable_mmcfg();
+static void zen2_zenbleed_check(struct cpuinfo_x86 *c)
+{
+ if (cpu_has(c, X86_FEATURE_HYPERVISOR))
+ return;
+
+ if (!cpu_has(c, X86_FEATURE_AVX))
+ return;
+
+ if (!cpu_has_zenbleed_microcode()) {
+ pr_notice_once("Zenbleed: please update your microcode for the most optimal fix\n");
+ msr_set_bit(MSR_AMD64_DE_CFG, MSR_AMD64_DE_CFG_ZEN2_FP_BACKUP_FIX_BIT);
+ } else {
+ msr_clear_bit(MSR_AMD64_DE_CFG, MSR_AMD64_DE_CFG_ZEN2_FP_BACKUP_FIX_BIT);
+ }
+}
+
+static void init_amd_zen2(struct cpuinfo_x86 *c)
+{
+ init_spectral_chicken(c);
+ fix_erratum_1386(c);
+ zen2_zenbleed_check(c);
+
+ /* RDSEED is not reliable on AMD Cyan Skillfish; disable it. */
+ if (c->x86_model == 0x47 && c->x86_stepping == 0x0) {
+ clear_cpu_cap(c, X86_FEATURE_RDSEED);
+ msr_clear_bit(MSR_AMD64_CPUID_FN_7, 18);
+ pr_emerg("RDSEED is not reliable on this platform; disabling.\n");
}
- if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
- unsigned long long tseg;
+ /* Correct misconfigured CPUID on some clients. */
+ clear_cpu_cap(c, X86_FEATURE_INVLPGB);
+}
+static void init_amd_zen3(struct cpuinfo_x86 *c)
+{
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) {
/*
- * Split up direct mapping around the TSEG SMM area.
- * Don't do it for gbpages because there seems very little
- * benefit in doing so.
+ * Zen3 (Fam19 model < 0x10) parts are not susceptible to
+ * Branch Type Confusion, but predate the allocation of the
+ * BTC_NO bit.
*/
- if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
- printk(KERN_DEBUG "tseg: %010llx\n", tseg);
- if ((tseg>>PMD_SHIFT) <
- (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
- ((tseg>>PMD_SHIFT) <
- (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
- (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
- set_memory_4k((unsigned long)__va(tseg), 1);
+ if (!cpu_has(c, X86_FEATURE_BTC_NO))
+ set_cpu_cap(c, X86_FEATURE_BTC_NO);
+ }
+}
+
+static void init_amd_zen4(struct cpuinfo_x86 *c)
+{
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR))
+ msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT);
+
+ /*
+ * These Zen4 SoCs advertise support for virtualized VMLOAD/VMSAVE
+ * in some BIOS versions but they can lead to random host reboots.
+ */
+ switch (c->x86_model) {
+ case 0x18 ... 0x1f:
+ case 0x60 ... 0x7f:
+ clear_cpu_cap(c, X86_FEATURE_V_VMSAVE_VMLOAD);
+ break;
+ }
+}
+
+static const struct x86_cpu_id zen5_rdseed_microcode[] = {
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x02, 0x1, 0x0b00215a),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x08, 0x1, 0x0b008121),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x11, 0x0, 0x0b101054),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x24, 0x0, 0x0b204037),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x44, 0x0, 0x0b404035),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x44, 0x1, 0x0b404108),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x60, 0x0, 0x0b600037),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x68, 0x0, 0x0b608038),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x70, 0x0, 0x0b700037),
+ {},
+};
+
+static void init_amd_zen5(struct cpuinfo_x86 *c)
+{
+ if (!x86_match_min_microcode_rev(zen5_rdseed_microcode)) {
+ clear_cpu_cap(c, X86_FEATURE_RDSEED);
+ msr_clear_bit(MSR_AMD64_CPUID_FN_7, 18);
+ pr_emerg_once("RDSEED32 is broken. Disabling the corresponding CPUID bit.\n");
+ }
+}
+
+static void init_amd(struct cpuinfo_x86 *c)
+{
+ u64 vm_cr;
+
+ early_init_amd(c);
+
+ /*
+ * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
+ * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
+ */
+ clear_cpu_cap(c, 0*32+31);
+
+ if (c->x86 >= 0x10)
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+
+ /* AMD FSRM also implies FSRS */
+ if (cpu_has(c, X86_FEATURE_FSRM))
+ set_cpu_cap(c, X86_FEATURE_FSRS);
+
+ /* K6s report MCEs but don't actually have all the MSRs */
+ if (c->x86 < 6)
+ clear_cpu_cap(c, X86_FEATURE_MCE);
+
+ switch (c->x86) {
+ case 4: init_amd_k5(c); break;
+ case 5: init_amd_k6(c); break;
+ case 6: init_amd_k7(c); break;
+ case 0xf: init_amd_k8(c); break;
+ case 0x10: init_amd_gh(c); break;
+ case 0x12: init_amd_ln(c); break;
+ case 0x15: init_amd_bd(c); break;
+ case 0x16: init_amd_jg(c); break;
+ }
+
+ /*
+ * Save up on some future enablement work and do common Zen
+ * settings.
+ */
+ if (c->x86 >= 0x17)
+ init_amd_zen_common();
+
+ if (boot_cpu_has(X86_FEATURE_ZEN1))
+ init_amd_zen1(c);
+ else if (boot_cpu_has(X86_FEATURE_ZEN2))
+ init_amd_zen2(c);
+ else if (boot_cpu_has(X86_FEATURE_ZEN3))
+ init_amd_zen3(c);
+ else if (boot_cpu_has(X86_FEATURE_ZEN4))
+ init_amd_zen4(c);
+ else if (boot_cpu_has(X86_FEATURE_ZEN5))
+ init_amd_zen5(c);
+
+ /*
+ * Enable workaround for FXSAVE leak on CPUs
+ * without an XSaveErPtr feature
+ */
+ if ((c->x86 >= 6) && (!cpu_has(c, X86_FEATURE_XSAVEERPTR)))
+ set_cpu_bug(c, X86_BUG_FXSAVE_LEAK);
+
+ cpu_detect_cache_sizes(c);
+
+ srat_detect_node(c);
+
+ init_amd_cacheinfo(c);
+
+ if (cpu_has(c, X86_FEATURE_SVM)) {
+ rdmsrq(MSR_VM_CR, vm_cr);
+ if (vm_cr & SVM_VM_CR_SVM_DIS_MASK) {
+ pr_notice_once("SVM disabled (by BIOS) in MSR_VM_CR\n");
+ clear_cpu_cap(c, X86_FEATURE_SVM);
}
}
-#endif
+
+ if (!cpu_has(c, X86_FEATURE_LFENCE_RDTSC) && cpu_has(c, X86_FEATURE_XMM2)) {
+ /*
+ * Use LFENCE for execution serialization. On families which
+ * don't have that MSR, LFENCE is already serializing.
+ * msr_set_bit() uses the safe accessors, too, even if the MSR
+ * is not present.
+ */
+ msr_set_bit(MSR_AMD64_DE_CFG,
+ MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT);
+
+ /* A serializing LFENCE stops RDTSC speculation */
+ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+ }
+
+ /*
+ * Family 0x12 and above processors have APIC timer
+ * running in deep C states.
+ */
+ if (c->x86 > 0x11)
+ set_cpu_cap(c, X86_FEATURE_ARAT);
+
+ /* 3DNow or LM implies PREFETCHW */
+ if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH))
+ if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM))
+ set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH);
+
+ /* AMD CPUs don't reset SS attributes on SYSRET, Xen does. */
+ if (!cpu_feature_enabled(X86_FEATURE_XENPV))
+ set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
+
+ /* Enable the Instructions Retired free counter */
+ if (cpu_has(c, X86_FEATURE_IRPERF))
+ msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT);
+
+ check_null_seg_clears_base(c);
+
+ /*
+ * Make sure EFER[AIBRSE - Automatic IBRS Enable] is set. The APs are brought up
+ * using the trampoline code and as part of it, MSR_EFER gets prepared there in
+ * order to be replicated onto them. Regardless, set it here again, if not set,
+ * to protect against any future refactoring/code reorganization which might
+ * miss setting this important bit.
+ */
+ if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) &&
+ cpu_has(c, X86_FEATURE_AUTOIBRS))
+ WARN_ON_ONCE(msr_set_bit(MSR_EFER, _EFER_AUTOIBRS) < 0);
+
+ /* AMD CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */
+ clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
+
+ /* Enable Translation Cache Extension */
+ if (cpu_has(c, X86_FEATURE_TCE))
+ msr_set_bit(MSR_EFER, _EFER_TCE);
}
#ifdef CONFIG_X86_32
-static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size)
+static unsigned int amd_size_cache(struct cpuinfo_x86 *c, unsigned int size)
{
/* AMD errata T13 (order #21922) */
- if ((c->x86 == 6)) {
- if (c->x86_model == 3 && c->x86_mask == 0) /* Duron Rev A0 */
+ if (c->x86 == 6) {
+ /* Duron Rev A0 */
+ if (c->x86_model == 3 && c->x86_stepping == 0)
size = 64;
+ /* Tbird rev A1/A2 */
if (c->x86_model == 4 &&
- (c->x86_mask == 0 || c->x86_mask == 1)) /* Tbird rev A1/A2 */
+ (c->x86_stepping == 0 || c->x86_stepping == 1))
size = 256;
}
return size;
}
#endif
-static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
+static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
+{
+ u32 ebx, eax, ecx, edx;
+ u16 mask = 0xfff;
+
+ if (c->x86 < 0xf)
+ return;
+
+ if (c->extended_cpuid_level < 0x80000006)
+ return;
+
+ cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
+
+ tlb_lld_4k = (ebx >> 16) & mask;
+ tlb_lli_4k = ebx & mask;
+
+ /*
+ * K8 doesn't have 2M/4M entries in the L2 TLB so read out the L1 TLB
+ * characteristics from the CPUID function 0x80000005 instead.
+ */
+ if (c->x86 == 0xf) {
+ cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
+ mask = 0xff;
+ }
+
+ /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
+ if (!((eax >> 16) & mask))
+ tlb_lld_2m = (cpuid_eax(0x80000005) >> 16) & 0xff;
+ else
+ tlb_lld_2m = (eax >> 16) & mask;
+
+ /* a 4M entry uses two 2M entries */
+ tlb_lld_4m = tlb_lld_2m >> 1;
+
+ /* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
+ if (!(eax & mask)) {
+ /* Erratum 658 */
+ if (c->x86 == 0x15 && c->x86_model <= 0x1f) {
+ tlb_lli_2m = 1024;
+ } else {
+ cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
+ tlb_lli_2m = eax & 0xff;
+ }
+ } else
+ tlb_lli_2m = eax & mask;
+
+ tlb_lli_4m = tlb_lli_2m >> 1;
+
+ /* Max number of pages INVLPGB can invalidate in one shot */
+ if (cpu_has(c, X86_FEATURE_INVLPGB))
+ invlpgb_count_max = (cpuid_edx(0x80000008) & 0xffff) + 1;
+}
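A minimal sketch (not part of the patch) of how the CPUID 0x80000006 EBX fields consumed above decode; the helper name is hypothetical:

/*
 * EBX of CPUID 0x80000006, as read by cpu_detect_tlb_amd():
 *   bits  0..11  L2 instruction TLB entries for 4K pages
 *   bits 16..27  L2 data TLB entries for 4K pages
 */
static void example_decode_l2_tlb_4k(u32 ebx, u16 *itlb_4k, u16 *dtlb_4k)
{
	*itlb_4k = ebx & 0xfff;
	*dtlb_4k = (ebx >> 16) & 0xfff;
}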
+
+static const struct cpu_dev amd_cpu_dev = {
.c_vendor = "AMD",
.c_ident = { "AuthenticAMD" },
#ifdef CONFIG_X86_32
- .c_models = {
- { .vendor = X86_VENDOR_AMD, .family = 4, .model_names =
+ .legacy_models = {
+ { .family = 4, .model_names =
{
[3] = "486 DX/2",
[7] = "486 DX/2-WB",
@@ -537,11 +1271,136 @@ static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
}
},
},
- .c_size_cache = amd_size_cache,
+ .legacy_cache_size = amd_size_cache,
#endif
.c_early_init = early_init_amd,
+ .c_detect_tlb = cpu_detect_tlb_amd,
+ .c_bsp_init = bsp_init_amd,
.c_init = init_amd,
.c_x86_vendor = X86_VENDOR_AMD,
};
cpu_dev_register(amd_cpu_dev);
+
+static DEFINE_PER_CPU_READ_MOSTLY(unsigned long[4], amd_dr_addr_mask);
+
+static unsigned int amd_msr_dr_addr_masks[] = {
+ MSR_F16H_DR0_ADDR_MASK,
+ MSR_F16H_DR1_ADDR_MASK,
+ MSR_F16H_DR1_ADDR_MASK + 1,
+ MSR_F16H_DR1_ADDR_MASK + 2
+};
+
+void amd_set_dr_addr_mask(unsigned long mask, unsigned int dr)
+{
+ int cpu = smp_processor_id();
+
+ if (!cpu_feature_enabled(X86_FEATURE_BPEXT))
+ return;
+
+ if (WARN_ON_ONCE(dr >= ARRAY_SIZE(amd_msr_dr_addr_masks)))
+ return;
+
+ if (per_cpu(amd_dr_addr_mask, cpu)[dr] == mask)
+ return;
+
+ wrmsrq(amd_msr_dr_addr_masks[dr], mask);
+ per_cpu(amd_dr_addr_mask, cpu)[dr] = mask;
+}
+
+unsigned long amd_get_dr_addr_mask(unsigned int dr)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_BPEXT))
+ return 0;
+
+ if (WARN_ON_ONCE(dr >= ARRAY_SIZE(amd_msr_dr_addr_masks)))
+ return 0;
+
+ return per_cpu(amd_dr_addr_mask[dr], smp_processor_id());
+}
+EXPORT_SYMBOL_FOR_KVM(amd_get_dr_addr_mask);
+
+static void zenbleed_check_cpu(void *unused)
+{
+ struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
+
+ zen2_zenbleed_check(c);
+}
+
+void amd_check_microcode(void)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+ return;
+
+ if (cpu_feature_enabled(X86_FEATURE_ZEN2))
+ on_each_cpu(zenbleed_check_cpu, NULL, 1);
+}
+
+static const char * const s5_reset_reason_txt[] = {
+ [0] = "thermal pin BP_THERMTRIP_L was tripped",
+ [1] = "power button was pressed for 4 seconds",
+ [2] = "shutdown pin was tripped",
+ [4] = "remote ASF power off command was received",
+ [9] = "internal CPU thermal limit was tripped",
+ [16] = "system reset pin BP_SYS_RST_L was tripped",
+ [17] = "software issued PCI reset",
+ [18] = "software wrote 0x4 to reset control register 0xCF9",
+ [19] = "software wrote 0x6 to reset control register 0xCF9",
+ [20] = "software wrote 0xE to reset control register 0xCF9",
+ [21] = "ACPI power state transition occurred",
+ [22] = "keyboard reset pin KB_RST_L was tripped",
+ [23] = "internal CPU shutdown event occurred",
+ [24] = "system failed to boot before failed boot timer expired",
+ [25] = "hardware watchdog timer expired",
+ [26] = "remote ASF reset command was received",
+ [27] = "an uncorrected error caused a data fabric sync flood event",
+ [29] = "FCH and MP1 failed warm reset handshake",
+ [30] = "a parity error occurred",
+ [31] = "a software sync flood event occurred",
+};
+
+static __init int print_s5_reset_status_mmio(void)
+{
+ void __iomem *addr;
+ u32 value;
+ int i;
+
+ if (!cpu_feature_enabled(X86_FEATURE_ZEN))
+ return 0;
+
+ addr = ioremap(FCH_PM_BASE + FCH_PM_S5_RESET_STATUS, sizeof(value));
+ if (!addr)
+ return 0;
+
+ value = ioread32(addr);
+
+ /* Value with "all bits set" is an error response and should be ignored. */
+ if (value == U32_MAX) {
+ iounmap(addr);
+ return 0;
+ }
+
+ /*
+ * Clear all reason bits so they won't be retained if the next reset
+ * does not update the register. Besides, some bits are never cleared by
+ * hardware so it's software's responsibility to clear them.
+ *
+ * Writing the value back effectively clears all reason bits as they are
+ * write-1-to-clear.
+ */
+ iowrite32(value, addr);
+ iounmap(addr);
+
+ for (i = 0; i < ARRAY_SIZE(s5_reset_reason_txt); i++) {
+ if (!(value & BIT(i)))
+ continue;
+
+ if (s5_reset_reason_txt[i]) {
+ pr_info("x86/amd: Previous system reset reason [0x%08x]: %s\n",
+ value, s5_reset_reason_txt[i]);
+ }
+ }
+
+ return 0;
+}
+late_initcall(print_s5_reset_status_mmio);
diff --git a/arch/x86/kernel/cpu/amd_cache_disable.c b/arch/x86/kernel/cpu/amd_cache_disable.c
new file mode 100644
index 000000000000..8843b9557aea
--- /dev/null
+++ b/arch/x86/kernel/cpu/amd_cache_disable.c
@@ -0,0 +1,301 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * AMD L3 cache_disable_{0,1} sysfs handling
+ * Documentation/ABI/testing/sysfs-devices-system-cpu
+ */
+
+#include <linux/cacheinfo.h>
+#include <linux/capability.h>
+#include <linux/pci.h>
+#include <linux/sysfs.h>
+
+#include <asm/amd/nb.h>
+
+#include "cpu.h"
+
+/*
+ * L3 cache descriptors
+ */
+static void amd_calc_l3_indices(struct amd_northbridge *nb)
+{
+ struct amd_l3_cache *l3 = &nb->l3_cache;
+ unsigned int sc0, sc1, sc2, sc3;
+ u32 val = 0;
+
+ pci_read_config_dword(nb->misc, 0x1C4, &val);
+
+ /* calculate subcache sizes */
+ l3->subcaches[0] = sc0 = !(val & BIT(0));
+ l3->subcaches[1] = sc1 = !(val & BIT(4));
+
+ if (boot_cpu_data.x86 == 0x15) {
+ l3->subcaches[0] = sc0 += !(val & BIT(1));
+ l3->subcaches[1] = sc1 += !(val & BIT(5));
+ }
+
+ l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9));
+ l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13));
+
+ l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
+}
+
+/*
+ * check whether a slot used for disabling an L3 index is occupied.
+ * @l3: L3 cache descriptor
+ * @slot: slot number (0..1)
+ *
+ * @returns: the disabled index if used or negative value if slot free.
+ */
+static int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned int slot)
+{
+ unsigned int reg = 0;
+
+ pci_read_config_dword(nb->misc, 0x1BC + slot * 4, &reg);
+
+ /* check whether this slot is activated already */
+ if (reg & (3UL << 30))
+ return reg & 0xfff;
+
+ return -1;
+}
+
+static ssize_t show_cache_disable(struct cacheinfo *ci, char *buf, unsigned int slot)
+{
+ int index;
+ struct amd_northbridge *nb = ci->priv;
+
+ index = amd_get_l3_disable_slot(nb, slot);
+ if (index >= 0)
+ return sysfs_emit(buf, "%d\n", index);
+
+ return sysfs_emit(buf, "FREE\n");
+}
+
+#define SHOW_CACHE_DISABLE(slot) \
+static ssize_t \
+cache_disable_##slot##_show(struct device *dev, \
+ struct device_attribute *attr, char *buf) \
+{ \
+ struct cacheinfo *ci = dev_get_drvdata(dev); \
+ return show_cache_disable(ci, buf, slot); \
+}
+
+SHOW_CACHE_DISABLE(0)
+SHOW_CACHE_DISABLE(1)
+
+static void amd_l3_disable_index(struct amd_northbridge *nb, int cpu,
+ unsigned int slot, unsigned long idx)
+{
+ int i;
+
+ idx |= BIT(30);
+
+ /*
+ * disable index in all 4 subcaches
+ */
+ for (i = 0; i < 4; i++) {
+ u32 reg = idx | (i << 20);
+
+ if (!nb->l3_cache.subcaches[i])
+ continue;
+
+ pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg);
+
+ /*
+ * We need to WBINVD on a core on the node containing the L3
+ * cache whose indices we disable; therefore a simple wbinvd()
+ * is not sufficient.
+ */
+ wbinvd_on_cpu(cpu);
+
+ reg |= BIT(31);
+ pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg);
+ }
+}
+
+/*
+ * disable a L3 cache index by using a disable-slot
+ *
+ * @l3: L3 cache descriptor
+ * @cpu: A CPU on the node containing the L3 cache
+ * @slot: slot number (0..1)
+ * @index: index to disable
+ *
+ * @return: 0 on success, error status on failure
+ */
+static int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu,
+ unsigned int slot, unsigned long index)
+{
+ int ret = 0;
+
+ /* check if @slot is already used or the index is already disabled */
+ ret = amd_get_l3_disable_slot(nb, slot);
+ if (ret >= 0)
+ return -EEXIST;
+
+ if (index > nb->l3_cache.indices)
+ return -EINVAL;
+
+ /* check whether the other slot has disabled the same index already */
+ if (index == amd_get_l3_disable_slot(nb, !slot))
+ return -EEXIST;
+
+ amd_l3_disable_index(nb, cpu, slot, index);
+
+ return 0;
+}
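A minimal usage sketch (not part of the patch) for the two slot helpers above; the wrapper name, CPU number and index are hypothetical:

static int example_disable_l3_index(struct amd_northbridge *nb)
{
	/* Ask slot 0 to disable L3 index 42 on behalf of CPU 3. */
	int err = amd_set_l3_disable_slot(nb, 3, 0, 42);

	if (err == -EEXIST)
		pr_info("slot 0 already holds index %d\n",
			amd_get_l3_disable_slot(nb, 0));

	return err;
}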
+
+static ssize_t store_cache_disable(struct cacheinfo *ci, const char *buf,
+ size_t count, unsigned int slot)
+{
+ struct amd_northbridge *nb = ci->priv;
+ unsigned long val = 0;
+ int cpu, err = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ cpu = cpumask_first(&ci->shared_cpu_map);
+
+ if (kstrtoul(buf, 10, &val) < 0)
+ return -EINVAL;
+
+ err = amd_set_l3_disable_slot(nb, cpu, slot, val);
+ if (err) {
+ if (err == -EEXIST)
+ pr_warn("L3 slot %d in use/index already disabled!\n",
+ slot);
+ return err;
+ }
+ return count;
+}
+
+#define STORE_CACHE_DISABLE(slot) \
+static ssize_t \
+cache_disable_##slot##_store(struct device *dev, \
+ struct device_attribute *attr, \
+ const char *buf, size_t count) \
+{ \
+ struct cacheinfo *ci = dev_get_drvdata(dev); \
+ return store_cache_disable(ci, buf, count, slot); \
+}
+
+STORE_CACHE_DISABLE(0)
+STORE_CACHE_DISABLE(1)
+
+static ssize_t subcaches_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct cacheinfo *ci = dev_get_drvdata(dev);
+ int cpu = cpumask_first(&ci->shared_cpu_map);
+
+ return sysfs_emit(buf, "%x\n", amd_get_subcaches(cpu));
+}
+
+static ssize_t subcaches_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct cacheinfo *ci = dev_get_drvdata(dev);
+ int cpu = cpumask_first(&ci->shared_cpu_map);
+ unsigned long val;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (kstrtoul(buf, 16, &val) < 0)
+ return -EINVAL;
+
+ if (amd_set_subcaches(cpu, val))
+ return -EINVAL;
+
+ return count;
+}
+
+static DEVICE_ATTR_RW(cache_disable_0);
+static DEVICE_ATTR_RW(cache_disable_1);
+static DEVICE_ATTR_RW(subcaches);
+
+static umode_t cache_private_attrs_is_visible(struct kobject *kobj,
+ struct attribute *attr, int unused)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct cacheinfo *ci = dev_get_drvdata(dev);
+ umode_t mode = attr->mode;
+
+ if (!ci->priv)
+ return 0;
+
+ if ((attr == &dev_attr_subcaches.attr) &&
+ amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+ return mode;
+
+ if ((attr == &dev_attr_cache_disable_0.attr ||
+ attr == &dev_attr_cache_disable_1.attr) &&
+ amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
+ return mode;
+
+ return 0;
+}
+
+static struct attribute_group cache_private_group = {
+ .is_visible = cache_private_attrs_is_visible,
+};
+
+static void init_amd_l3_attrs(void)
+{
+ static struct attribute **amd_l3_attrs;
+ int n = 1;
+
+ if (amd_l3_attrs) /* already initialized */
+ return;
+
+ if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
+ n += 2;
+ if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+ n += 1;
+
+ amd_l3_attrs = kcalloc(n, sizeof(*amd_l3_attrs), GFP_KERNEL);
+ if (!amd_l3_attrs)
+ return;
+
+ n = 0;
+ if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) {
+ amd_l3_attrs[n++] = &dev_attr_cache_disable_0.attr;
+ amd_l3_attrs[n++] = &dev_attr_cache_disable_1.attr;
+ }
+ if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+ amd_l3_attrs[n++] = &dev_attr_subcaches.attr;
+
+ cache_private_group.attrs = amd_l3_attrs;
+}
+
+const struct attribute_group *cache_get_priv_group(struct cacheinfo *ci)
+{
+ struct amd_northbridge *nb = ci->priv;
+
+ if (ci->level < 3 || !nb)
+ return NULL;
+
+ if (nb && nb->l3_cache.indices)
+ init_amd_l3_attrs();
+
+ return &cache_private_group;
+}
+
+struct amd_northbridge *amd_init_l3_cache(int index)
+{
+ struct amd_northbridge *nb;
+ int node;
+
+ /* only for L3, and not in virtualized environments */
+ if (index < 3)
+ return NULL;
+
+ node = topology_amd_node_id(smp_processor_id());
+ nb = node_to_amd_nb(node);
+ if (nb && !nb->l3_cache.indices)
+ amd_calc_l3_indices(nb);
+
+ return nb;
+}
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
new file mode 100644
index 000000000000..7ffc78d5ebf2
--- /dev/null
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -0,0 +1,552 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * x86 APERF/MPERF KHz calculation for
+ * /sys/.../cpufreq/scaling_cur_freq
+ *
+ * Copyright (C) 2017 Intel Corp.
+ * Author: Len Brown <len.brown@intel.com>
+ */
+#include <linux/cpufreq.h>
+#include <linux/delay.h>
+#include <linux/ktime.h>
+#include <linux/math64.h>
+#include <linux/percpu.h>
+#include <linux/rcupdate.h>
+#include <linux/sched/isolation.h>
+#include <linux/sched/topology.h>
+#include <linux/smp.h>
+#include <linux/syscore_ops.h>
+
+#include <asm/cpu.h>
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+#include <asm/msr.h>
+
+#include "cpu.h"
+
+struct aperfmperf {
+ seqcount_t seq;
+ unsigned long last_update;
+ u64 acnt;
+ u64 mcnt;
+ u64 aperf;
+ u64 mperf;
+};
+
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = {
+ .seq = SEQCNT_ZERO(cpu_samples.seq)
+};
+
+static void init_counter_refs(void *data)
+{
+ u64 aperf, mperf;
+
+ rdmsrq(MSR_IA32_APERF, aperf);
+ rdmsrq(MSR_IA32_MPERF, mperf);
+
+ this_cpu_write(cpu_samples.aperf, aperf);
+ this_cpu_write(cpu_samples.mperf, mperf);
+}
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
+/*
+ * APERF/MPERF frequency ratio computation.
+ *
+ * The scheduler wants to do frequency invariant accounting and needs a <1
+ * ratio to account for the 'current' frequency, corresponding to
+ * freq_curr / freq_max.
+ *
+ * Since the frequency freq_curr on x86 is controlled by a micro-controller and
+ * our P-state setting is little more than a request/hint, we need to observe
+ * the effective frequency 'BusyMHz', i.e. the average frequency over a time
+ * interval after discarding idle time. This is given by:
+ *
+ * BusyMHz = delta_APERF / delta_MPERF * freq_base
+ *
+ * where freq_base is the max non-turbo P-state.
+ *
+ * The freq_max term has to be set to a somewhat arbitrary value, because we
+ * can't know which turbo states will be available at a given point in time:
+ * it all depends on the thermal headroom of the entire package. We set it to
+ * the turbo level with 4 cores active.
+ *
+ * Benchmarks show that's a good compromise between the 1C turbo ratio
+ * (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
+ * which would ignore the entire turbo range (a conspicuous part, making
+ * freq_curr/freq_max always maxed out).
+ *
+ * An exception to the heuristic above is the Atom uarch, where we choose the
+ * highest turbo level for freq_max since Atom's are generally oriented towards
+ * power efficiency.
+ *
+ * Setting freq_max to anything less than the 1C turbo ratio makes the ratio
+ * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1.
+ */
+
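A minimal sketch (not part of the patch) of the BusyMHz ratio described above, formed from two APERF/MPERF snapshots; the helper name is hypothetical and div64_u64() comes from <linux/math64.h>:

static u64 busy_khz_example(u64 aperf0, u64 aperf1,
			    u64 mperf0, u64 mperf1, u64 base_khz)
{
	u64 acnt = aperf1 - aperf0;	/* cycles at the actual frequency */
	u64 mcnt = mperf1 - mperf0;	/* cycles at the base frequency */

	if (!mcnt)
		return 0;

	/* BusyMHz = delta_APERF / delta_MPERF * freq_base, here in kHz */
	return div64_u64(acnt * base_khz, mcnt);
}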
+DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);
+
+static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE;
+static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;
+
+void arch_set_max_freq_ratio(bool turbo_disabled)
+{
+ arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE :
+ arch_turbo_freq_ratio;
+}
+EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio);
+
+static bool __init turbo_disabled(void)
+{
+ u64 misc_en;
+ int err;
+
+ err = rdmsrq_safe(MSR_IA32_MISC_ENABLE, &misc_en);
+ if (err)
+ return false;
+
+ return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
+}
+
+static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
+{
+ int err;
+
+ err = rdmsrq_safe(MSR_ATOM_CORE_RATIOS, base_freq);
+ if (err)
+ return false;
+
+ err = rdmsrq_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq);
+ if (err)
+ return false;
+
+ *base_freq = (*base_freq >> 16) & 0x3F; /* max P state */
+ *turbo_freq = *turbo_freq & 0x3F; /* 1C turbo */
+
+ return true;
+}
+
+#define X86_MATCH(vfm) \
+ X86_MATCH_VFM_FEATURE(vfm, X86_FEATURE_APERFMPERF, NULL)
+
+static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = {
+ X86_MATCH(INTEL_XEON_PHI_KNL),
+ X86_MATCH(INTEL_XEON_PHI_KNM),
+ {}
+};
+
+static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = {
+ X86_MATCH(INTEL_SKYLAKE_X),
+ {}
+};
+
+static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = {
+ X86_MATCH(INTEL_ATOM_GOLDMONT),
+ X86_MATCH(INTEL_ATOM_GOLDMONT_D),
+ X86_MATCH(INTEL_ATOM_GOLDMONT_PLUS),
+ {}
+};
+
+static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
+ int num_delta_fratio)
+{
+ int fratio, delta_fratio, found;
+ int err, i;
+ u64 msr;
+
+ err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq);
+ if (err)
+ return false;
+
+ *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
+
+ err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &msr);
+ if (err)
+ return false;
+
+ fratio = (msr >> 8) & 0xFF;
+ i = 16;
+ found = 0;
+ do {
+ if (found >= num_delta_fratio) {
+ *turbo_freq = fratio;
+ return true;
+ }
+
+ delta_fratio = (msr >> (i + 5)) & 0x7;
+
+ if (delta_fratio) {
+ found += 1;
+ fratio -= delta_fratio;
+ }
+
+ i += 8;
+ } while (i < 64);
+
+ return true;
+}
+
+static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
+{
+ u64 ratios, counts;
+ u32 group_size;
+ int err, i;
+
+ err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq);
+ if (err)
+ return false;
+
+ *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
+
+ err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &ratios);
+ if (err)
+ return false;
+
+ err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT1, &counts);
+ if (err)
+ return false;
+
+ for (i = 0; i < 64; i += 8) {
+ group_size = (counts >> i) & 0xFF;
+ if (group_size >= size) {
+ *turbo_freq = (ratios >> i) & 0xFF;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
+{
+ u64 msr;
+ int err;
+
+ err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq);
+ if (err)
+ return false;
+
+ err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &msr);
+ if (err)
+ return false;
+
+ *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
+ *turbo_freq = (msr >> 24) & 0xFF; /* 4C turbo */
+
+ /* The CPU may have less than 4 cores */
+ if (!*turbo_freq)
+ *turbo_freq = msr & 0xFF; /* 1C turbo */
+
+ return true;
+}
+
+static bool __init intel_set_max_freq_ratio(void)
+{
+ u64 base_freq, turbo_freq;
+ u64 turbo_ratio;
+
+ if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
+ goto out;
+
+ if (x86_match_cpu(has_glm_turbo_ratio_limits) &&
+ skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
+ goto out;
+
+ if (x86_match_cpu(has_knl_turbo_ratio_limits) &&
+ knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
+ goto out;
+
+ if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
+ skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4))
+ goto out;
+
+ if (core_set_max_freq_ratio(&base_freq, &turbo_freq))
+ goto out;
+
+ return false;
+
+out:
+ /*
+ * Some hypervisors advertise X86_FEATURE_APERFMPERF
+ * but then fill all MSRs with zeroes.
+ * Some CPUs have turbo boost but don't declare any turbo ratio
+ * in MSR_TURBO_RATIO_LIMIT.
+ */
+ if (!base_freq || !turbo_freq) {
+ pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n");
+ return false;
+ }
+
+ turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq);
+ if (!turbo_ratio) {
+ pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n");
+ return false;
+ }
+
+ arch_turbo_freq_ratio = turbo_ratio;
+ arch_set_max_freq_ratio(turbo_disabled());
+
+ return true;
+}
+
+#ifdef CONFIG_PM_SLEEP
+static const struct syscore_ops freq_invariance_syscore_ops = {
+ .resume = init_counter_refs,
+};
+
+static struct syscore freq_invariance_syscore = {
+ .ops = &freq_invariance_syscore_ops,
+};
+
+static void register_freq_invariance_syscore(void)
+{
+ register_syscore(&freq_invariance_syscore);
+}
+#else
+static inline void register_freq_invariance_syscore(void) {}
+#endif
+
+static void freq_invariance_enable(void)
+{
+ if (static_branch_unlikely(&arch_scale_freq_key)) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+ static_branch_enable_cpuslocked(&arch_scale_freq_key);
+ register_freq_invariance_syscore();
+ pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
+}
+
+void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled)
+{
+ arch_turbo_freq_ratio = ratio;
+ arch_set_max_freq_ratio(turbo_disabled);
+ freq_invariance_enable();
+}
+
+static void __init bp_init_freq_invariance(void)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return;
+
+ if (intel_set_max_freq_ratio()) {
+ guard(cpus_read_lock)();
+ freq_invariance_enable();
+ }
+}
+
+static void disable_freq_invariance_workfn(struct work_struct *work)
+{
+ int cpu;
+
+ static_branch_disable(&arch_scale_freq_key);
+
+ /*
+ * Set arch_freq_scale to a default value on all cpus
+ * This negates the effect of scaling
+ */
+ for_each_possible_cpu(cpu)
+ per_cpu(arch_freq_scale, cpu) = SCHED_CAPACITY_SCALE;
+}
+
+static DECLARE_WORK(disable_freq_invariance_work,
+ disable_freq_invariance_workfn);
+
+DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
+EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale);
+
+static DEFINE_STATIC_KEY_FALSE(arch_hybrid_cap_scale_key);
+
+struct arch_hybrid_cpu_scale {
+ unsigned long capacity;
+ unsigned long freq_ratio;
+};
+
+static struct arch_hybrid_cpu_scale __percpu *arch_cpu_scale;
+
+/**
+ * arch_enable_hybrid_capacity_scale() - Enable hybrid CPU capacity scaling
+ *
+ * Allocate memory for per-CPU data used by hybrid CPU capacity scaling,
+ * initialize it and set the static key controlling its code paths.
+ *
+ * Must be called before arch_set_cpu_capacity().
+ */
+bool arch_enable_hybrid_capacity_scale(void)
+{
+ int cpu;
+
+ if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) {
+ WARN_ONCE(1, "Hybrid CPU capacity scaling already enabled");
+ return true;
+ }
+
+ arch_cpu_scale = alloc_percpu(struct arch_hybrid_cpu_scale);
+ if (!arch_cpu_scale)
+ return false;
+
+ for_each_possible_cpu(cpu) {
+ per_cpu_ptr(arch_cpu_scale, cpu)->capacity = SCHED_CAPACITY_SCALE;
+ per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio = arch_max_freq_ratio;
+ }
+
+ static_branch_enable(&arch_hybrid_cap_scale_key);
+
+ pr_info("Hybrid CPU capacity scaling enabled\n");
+
+ return true;
+}
+
+/**
+ * arch_set_cpu_capacity() - Set scale-invariance parameters for a CPU
+ * @cpu: Target CPU.
+ * @cap: Capacity of @cpu at its maximum frequency, relative to @max_cap.
+ * @max_cap: System-wide maximum CPU capacity.
+ * @cap_freq: Frequency of @cpu corresponding to @cap.
+ * @base_freq: Frequency of @cpu at which MPERF counts.
+ *
+ * The units in which @cap and @max_cap are expressed do not matter, so long
+ * as they are consistent, because the former is effectively divided by the
+ * latter. Analogously for @cap_freq and @base_freq.
+ *
+ * After calling this function for all CPUs, call arch_rebuild_sched_domains()
+ * to let the scheduler know that capacity-aware scheduling can be used going
+ * forward.
+ */
+void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap,
+ unsigned long cap_freq, unsigned long base_freq)
+{
+ if (static_branch_likely(&arch_hybrid_cap_scale_key)) {
+ WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity,
+ div_u64(cap << SCHED_CAPACITY_SHIFT, max_cap));
+ WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio,
+ div_u64(cap_freq << SCHED_CAPACITY_SHIFT, base_freq));
+ } else {
+ WARN_ONCE(1, "Hybrid CPU capacity scaling not enabled");
+ }
+}
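A hypothetical call of the helper documented above, e.g. an efficiency core rated 768 of a system-wide maximum 1024, with a 3.0 GHz full-speed frequency against a 1.8 GHz base (MPERF) clock; all numbers are illustrative:

	/* cap = 768/1024, cap_freq = 3000000 kHz, base_freq = 1800000 kHz */
	arch_set_cpu_capacity(cpu, 768, 1024, 3000000, 1800000);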
+
+unsigned long arch_scale_cpu_capacity(int cpu)
+{
+ if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
+ return READ_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity);
+
+ return SCHED_CAPACITY_SCALE;
+}
+EXPORT_SYMBOL_GPL(arch_scale_cpu_capacity);
+
+static void scale_freq_tick(u64 acnt, u64 mcnt)
+{
+ u64 freq_scale, freq_ratio;
+
+ if (!arch_scale_freq_invariant())
+ return;
+
+ if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
+ goto error;
+
+ if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
+ freq_ratio = READ_ONCE(this_cpu_ptr(arch_cpu_scale)->freq_ratio);
+ else
+ freq_ratio = arch_max_freq_ratio;
+
+ if (check_mul_overflow(mcnt, freq_ratio, &mcnt) || !mcnt)
+ goto error;
+
+ freq_scale = div64_u64(acnt, mcnt);
+ if (!freq_scale)
+ goto error;
+
+ if (freq_scale > SCHED_CAPACITY_SCALE)
+ freq_scale = SCHED_CAPACITY_SCALE;
+
+ this_cpu_write(arch_freq_scale, freq_scale);
+ return;
+
+error:
+ pr_warn("Scheduler frequency invariance went wobbly, disabling!\n");
+ schedule_work(&disable_freq_invariance_work);
+}
+#else
+static inline void bp_init_freq_invariance(void) { }
+static inline void scale_freq_tick(u64 acnt, u64 mcnt) { }
+#endif /* CONFIG_X86_64 && CONFIG_SMP */
+
+void arch_scale_freq_tick(void)
+{
+ struct aperfmperf *s = this_cpu_ptr(&cpu_samples);
+ u64 acnt, mcnt, aperf, mperf;
+
+ if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
+ return;
+
+ rdmsrq(MSR_IA32_APERF, aperf);
+ rdmsrq(MSR_IA32_MPERF, mperf);
+ acnt = aperf - s->aperf;
+ mcnt = mperf - s->mperf;
+
+ s->aperf = aperf;
+ s->mperf = mperf;
+
+ raw_write_seqcount_begin(&s->seq);
+ s->last_update = jiffies;
+ s->acnt = acnt;
+ s->mcnt = mcnt;
+ raw_write_seqcount_end(&s->seq);
+
+ scale_freq_tick(acnt, mcnt);
+}
+
+/*
+ * Discard samples older than the defined maximum sample age of 20ms. There
+ * is no point in sending IPIs in such a case. If the scheduler tick was
+ * not running then the CPU is either idle or isolated.
+ */
+#define MAX_SAMPLE_AGE ((unsigned long)HZ / 50)
+
+int arch_freq_get_on_cpu(int cpu)
+{
+ struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu);
+ unsigned int seq, freq;
+ unsigned long last;
+ u64 acnt, mcnt;
+
+ if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
+ goto fallback;
+
+ do {
+ seq = raw_read_seqcount_begin(&s->seq);
+ last = s->last_update;
+ acnt = s->acnt;
+ mcnt = s->mcnt;
+ } while (read_seqcount_retry(&s->seq, seq));
+
+ /*
+ * Bail on invalid count and when the last update was too long ago,
+ * which covers idle and NOHZ full CPUs.
+ */
+ if (!mcnt || (jiffies - last) > MAX_SAMPLE_AGE)
+ goto fallback;
+
+ return div64_u64((cpu_khz * acnt), mcnt);
+
+fallback:
+ freq = cpufreq_quick_get(cpu);
+ return freq ? freq : cpu_khz;
+}
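A worked example of the calculation above, with hypothetical sample values:

/*
 * With cpu_khz = 2400000 (a 2.4 GHz base clock), acnt = 3000000 and
 * mcnt = 2000000, the CPU ran 1.5x faster than base over the sample
 * window, so div64_u64(2400000 * 3000000, 2000000) reports 3600000 kHz.
 */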
+
+static int __init bp_init_aperfmperf(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
+ return 0;
+
+ init_counter_refs(NULL);
+ bp_init_freq_invariance();
+ return 0;
+}
+early_initcall(bp_init_aperfmperf);
+
+void ap_init_aperfmperf(void)
+{
+ if (cpu_feature_enabled(X86_FEATURE_APERFMPERF))
+ init_counter_refs(NULL);
+}
diff --git a/arch/x86/kernel/cpu/bhyve.c b/arch/x86/kernel/cpu/bhyve.c
new file mode 100644
index 000000000000..f1a8ca3dd1ed
--- /dev/null
+++ b/arch/x86/kernel/cpu/bhyve.c
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * FreeBSD Bhyve guest enlightenments
+ *
+ * Copyright © 2025 Amazon.com, Inc. or its affiliates.
+ *
+ * Author: David Woodhouse <dwmw2@infradead.org>
+ */
+
+#include <linux/init.h>
+#include <linux/export.h>
+#include <asm/processor.h>
+#include <asm/hypervisor.h>
+
+static uint32_t bhyve_cpuid_base;
+static uint32_t bhyve_cpuid_max;
+
+#define BHYVE_SIGNATURE "bhyve bhyve "
+
+#define CPUID_BHYVE_FEATURES 0x40000001
+
+/* Features advertised in CPUID_BHYVE_FEATURES %eax */
+
+/* MSI Extended Dest ID */
+#define CPUID_BHYVE_FEAT_EXT_DEST_ID (1UL << 0)
+
+static uint32_t __init bhyve_detect(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR))
+ return 0;
+
+ bhyve_cpuid_base = cpuid_base_hypervisor(BHYVE_SIGNATURE, 0);
+ if (!bhyve_cpuid_base)
+ return 0;
+
+ bhyve_cpuid_max = cpuid_eax(bhyve_cpuid_base);
+ return bhyve_cpuid_max;
+}
+
+static uint32_t bhyve_features(void)
+{
+ unsigned int cpuid_leaf = bhyve_cpuid_base | CPUID_BHYVE_FEATURES;
+
+ if (bhyve_cpuid_max < cpuid_leaf)
+ return 0;
+
+ return cpuid_eax(cpuid_leaf);
+}
+
+static bool __init bhyve_ext_dest_id(void)
+{
+ return !!(bhyve_features() & CPUID_BHYVE_FEAT_EXT_DEST_ID);
+}
+
+static bool __init bhyve_x2apic_available(void)
+{
+ return true;
+}
+
+const struct hypervisor_x86 x86_hyper_bhyve __refconst = {
+ .name = "Bhyve",
+ .detect = bhyve_detect,
+ .init.init_platform = x86_init_noop,
+ .init.x2apic_available = bhyve_x2apic_available,
+ .init.msi_ext_dest_id = bhyve_ext_dest_id,
+};
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index c8e315f1aa83..d0a2847a4bb0 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 1994 Linus Torvalds
*
@@ -8,162 +9,3730 @@
* - Andrew D. Balsa (code cleanup).
*/
#include <linux/init.h>
-#include <linux/utsname.h>
+#include <linux/cpu.h>
+#include <linux/module.h>
+#include <linux/nospec.h>
+#include <linux/prctl.h>
+#include <linux/sched/smt.h>
+#include <linux/pgtable.h>
+#include <linux/bpf.h>
+#include <linux/kvm_types.h>
+
+#include <asm/spec-ctrl.h>
+#include <asm/cmdline.h>
#include <asm/bugs.h>
#include <asm/processor.h>
#include <asm/processor-flags.h>
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
#include <asm/msr.h>
+#include <asm/vmx.h>
#include <asm/paravirt.h>
-#include <asm/alternative.h>
+#include <asm/cpu_device_id.h>
+#include <asm/e820/api.h>
+#include <asm/hypervisor.h>
+#include <asm/tlbflush.h>
+#include <asm/cpu.h>
+
+#include "cpu.h"
+
+/*
+ * Speculation Vulnerability Handling
+ *
+ * Each vulnerability is handled with the following functions:
+ * <vuln>_select_mitigation() -- Selects a mitigation to use. This should
+ * take into account all relevant command line
+ * options.
+ * <vuln>_update_mitigation() -- This is called after all vulnerabilities have
+ * selected a mitigation, in case the selection
+ * may want to change based on other choices
+ * made. This function is optional.
+ * <vuln>_apply_mitigation() -- Enable the selected mitigation.
+ *
+ * The compile-time mitigation in all cases should be AUTO. An explicit
+ * command-line option can override AUTO. If no such option is
+ * provided, <vuln>_select_mitigation() will override AUTO to the best
+ * mitigation option.
+ */
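A minimal sketch (not part of the patch) of the three-phase pattern described above for a hypothetical vulnerability "foo"; the FOO_* and X86_BUG_FOO names are invented for illustration:

enum foo_mitigations { FOO_MITIGATION_OFF, FOO_MITIGATION_AUTO, FOO_MITIGATION_FULL };

static enum foo_mitigations foo_mitigation __ro_after_init = FOO_MITIGATION_AUTO;

static void __init foo_select_mitigation(void)
{
	/* Resolve AUTO from command line options and attack-vector controls. */
	if (foo_mitigation == FOO_MITIGATION_AUTO)
		foo_mitigation = should_mitigate_vuln(X86_BUG_FOO) ?
				 FOO_MITIGATION_FULL : FOO_MITIGATION_OFF;
}

static void __init foo_update_mitigation(void)
{
	/* Optional: revisit the selection after other bugs have chosen theirs. */
}

static void __init foo_apply_mitigation(void)
{
	if (foo_mitigation == FOO_MITIGATION_FULL)
		setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);	/* example action */
}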
+
+/* The base value of the SPEC_CTRL MSR without task-specific bits set */
+u64 x86_spec_ctrl_base;
+
+/* The current value of the SPEC_CTRL MSR with task-specific bits set */
+DEFINE_PER_CPU(u64, x86_spec_ctrl_current);
+EXPORT_PER_CPU_SYMBOL_GPL(x86_spec_ctrl_current);
-static int __init no_halt(char *s)
+/*
+ * Set when the CPU has run a potentially malicious guest. An IBPB will
+ * be needed before running userspace. That IBPB will flush the branch
+ * predictor content.
+ */
+DEFINE_PER_CPU(bool, x86_ibpb_exit_to_user);
+EXPORT_PER_CPU_SYMBOL_GPL(x86_ibpb_exit_to_user);
+
+u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB;
+
+static u64 __ro_after_init x86_arch_cap_msr;
+
+static DEFINE_MUTEX(spec_ctrl_mutex);
+
+void (*x86_return_thunk)(void) __ro_after_init = __x86_return_thunk;
+
+static void __init set_return_thunk(void *thunk)
{
- boot_cpu_data.hlt_works_ok = 0;
- return 1;
+ x86_return_thunk = thunk;
+
+ pr_info("active return thunk: %ps\n", thunk);
}
-__setup("no-hlt", no_halt);
+/* Update SPEC_CTRL MSR and its cached copy unconditionally */
+static void update_spec_ctrl(u64 val)
+{
+ this_cpu_write(x86_spec_ctrl_current, val);
+ wrmsrq(MSR_IA32_SPEC_CTRL, val);
+}
+
+/*
+ * Keep track of the SPEC_CTRL MSR value for the current task, which may differ
+ * from x86_spec_ctrl_base due to STIBP/SSB in __speculation_ctrl_update().
+ */
+void update_spec_ctrl_cond(u64 val)
+{
+ if (this_cpu_read(x86_spec_ctrl_current) == val)
+ return;
+
+ this_cpu_write(x86_spec_ctrl_current, val);
+
+ /*
+ * With KERNEL_IBRS this MSR is written on return-to-user; unless
+ * forced, the update can be delayed until that time.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS))
+ wrmsrq(MSR_IA32_SPEC_CTRL, val);
+}
+
+noinstr u64 spec_ctrl_current(void)
+{
+ return this_cpu_read(x86_spec_ctrl_current);
+}
+EXPORT_SYMBOL_GPL(spec_ctrl_current);
+
+/*
+ * AMD specific MSR info for Speculative Store Bypass control.
+ * x86_amd_ls_cfg_ssbd_mask is initialized in identify_boot_cpu().
+ */
+u64 __ro_after_init x86_amd_ls_cfg_base;
+u64 __ro_after_init x86_amd_ls_cfg_ssbd_mask;
+
+/* Control conditional STIBP in switch_to() */
+DEFINE_STATIC_KEY_FALSE(switch_to_cond_stibp);
+/* Control conditional IBPB in switch_mm() */
+DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
+/* Control unconditional IBPB in switch_mm() */
+DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
+
+/* Control IBPB on vCPU load */
+DEFINE_STATIC_KEY_FALSE(switch_vcpu_ibpb);
+EXPORT_SYMBOL_FOR_KVM(switch_vcpu_ibpb);
+
+/* Control CPU buffer clear before idling (halt, mwait) */
+DEFINE_STATIC_KEY_FALSE(cpu_buf_idle_clear);
+EXPORT_SYMBOL_GPL(cpu_buf_idle_clear);
-static int __init no_387(char *s)
+/*
+ * Controls whether L1D flush based mitigations are enabled, based on HW
+ * features and the admin setting via boot parameter; defaults to false.
+ */
+DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "mitigations: " fmt
+
+static void __init cpu_print_attack_vectors(void)
{
- boot_cpu_data.hard_math = 0;
- write_cr0(X86_CR0_TS | X86_CR0_EM | X86_CR0_MP | read_cr0());
- return 1;
+ pr_info("Enabled attack vectors: ");
+
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL))
+ pr_cont("user_kernel, ");
+
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER))
+ pr_cont("user_user, ");
+
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST))
+ pr_cont("guest_host, ");
+
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST))
+ pr_cont("guest_guest, ");
+
+ pr_cont("SMT mitigations: ");
+
+ switch (smt_mitigations) {
+ case SMT_MITIGATIONS_OFF:
+ pr_cont("off\n");
+ break;
+ case SMT_MITIGATIONS_AUTO:
+ pr_cont("auto\n");
+ break;
+ case SMT_MITIGATIONS_ON:
+ pr_cont("on\n");
+ }
}
-__setup("no387", no_387);
+/*
+ * NOTE: This function is *only* called for SVM, since Intel uses
+ * MSR_IA32_SPEC_CTRL for SSBD.
+ */
+void
+x86_virt_spec_ctrl(u64 guest_virt_spec_ctrl, bool setguest)
+{
+ u64 guestval, hostval;
+ struct thread_info *ti = current_thread_info();
+
+ /*
+ * If SSBD is not handled in MSR_SPEC_CTRL on AMD, update
+ * MSR_AMD64_LS_CFG or MSR_VIRT_SPEC_CTRL if supported.
+ */
+ if (!static_cpu_has(X86_FEATURE_LS_CFG_SSBD) &&
+ !static_cpu_has(X86_FEATURE_VIRT_SSBD))
+ return;
+
+ /*
+ * If the host has SSBD mitigation enabled, force it in the host's
+ * virtual MSR value. If it's not permanently enabled, evaluate
+ * current's TIF_SSBD thread flag.
+ */
+ if (static_cpu_has(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE))
+ hostval = SPEC_CTRL_SSBD;
+ else
+ hostval = ssbd_tif_to_spec_ctrl(ti->flags);
+
+ /* Sanitize the guest value */
+ guestval = guest_virt_spec_ctrl & SPEC_CTRL_SSBD;
+
+ if (hostval != guestval) {
+ unsigned long tif;
+
+ tif = setguest ? ssbd_spec_ctrl_to_tif(guestval) :
+ ssbd_spec_ctrl_to_tif(hostval);
+
+ speculation_ctrl_update(tif);
+ }
+}
+EXPORT_SYMBOL_FOR_KVM(x86_virt_spec_ctrl);
+
+static void x86_amd_ssb_disable(void)
+{
+ u64 msrval = x86_amd_ls_cfg_base | x86_amd_ls_cfg_ssbd_mask;
+
+ if (boot_cpu_has(X86_FEATURE_VIRT_SSBD))
+ wrmsrq(MSR_AMD64_VIRT_SPEC_CTRL, SPEC_CTRL_SSBD);
+ else if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD))
+ wrmsrq(MSR_AMD64_LS_CFG, msrval);
+}
-static double __initdata x = 4195835.0;
-static double __initdata y = 3145727.0;
+#undef pr_fmt
+#define pr_fmt(fmt) "MDS: " fmt
/*
- * This used to check for exceptions..
- * However, it turns out that to support that,
- * the XMM trap handlers basically had to
- * be buggy. So let's have a correct XMM trap
- * handler, and forget about printing out
- * some status at boot.
+ * Returns true if vulnerability should be mitigated based on the
+ * selected attack vector controls.
*
- * We should really only care about bugs here
- * anyway. Not features.
+ * See Documentation/admin-guide/hw-vuln/attack_vector_controls.rst
*/
-static void __init check_fpu(void)
+static bool __init should_mitigate_vuln(unsigned int bug)
{
- s32 fdiv_bug;
+ switch (bug) {
+ /*
+ * The only runtime-selected spectre_v1 mitigations in the kernel are
+ * related to SWAPGS protection on kernel entry. Therefore, protection
+ * is only required for the user->kernel attack vector.
+ */
+ case X86_BUG_SPECTRE_V1:
+ return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL);
+
+ case X86_BUG_SPECTRE_V2:
+ case X86_BUG_RETBLEED:
+ case X86_BUG_L1TF:
+ case X86_BUG_ITS:
+ return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST);
+
+ case X86_BUG_SPECTRE_V2_USER:
+ return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST);
+
+ /*
+ * All the vulnerabilities below allow potentially leaking data
+ * across address spaces. Therefore, mitigation is required for
+ * any of these 4 attack vectors.
+ */
+ case X86_BUG_MDS:
+ case X86_BUG_TAA:
+ case X86_BUG_MMIO_STALE_DATA:
+ case X86_BUG_RFDS:
+ case X86_BUG_SRBDS:
+ return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST);
+
+ case X86_BUG_GDS:
+ return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST) ||
+ (smt_mitigations != SMT_MITIGATIONS_OFF);
+
+ case X86_BUG_SPEC_STORE_BYPASS:
+ return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER);
+
+ case X86_BUG_VMSCAPE:
+ return cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST);
+
+ default:
+ WARN(1, "Unknown bug %x\n", bug);
+ return false;
+ }
+}
+
+/* Default mitigation for MDS-affected CPUs */
+static enum mds_mitigations mds_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_MDS) ? MDS_MITIGATION_AUTO : MDS_MITIGATION_OFF;
+static bool mds_nosmt __ro_after_init = false;
+
+static const char * const mds_strings[] = {
+ [MDS_MITIGATION_OFF] = "Vulnerable",
+ [MDS_MITIGATION_FULL] = "Mitigation: Clear CPU buffers",
+ [MDS_MITIGATION_VMWERV] = "Vulnerable: Clear CPU buffers attempted, no microcode",
+};
+
+enum taa_mitigations {
+ TAA_MITIGATION_OFF,
+ TAA_MITIGATION_AUTO,
+ TAA_MITIGATION_UCODE_NEEDED,
+ TAA_MITIGATION_VERW,
+ TAA_MITIGATION_TSX_DISABLED,
+};
+
+/* Default mitigation for TAA-affected CPUs */
+static enum taa_mitigations taa_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_TAA) ? TAA_MITIGATION_AUTO : TAA_MITIGATION_OFF;
+
+enum mmio_mitigations {
+ MMIO_MITIGATION_OFF,
+ MMIO_MITIGATION_AUTO,
+ MMIO_MITIGATION_UCODE_NEEDED,
+ MMIO_MITIGATION_VERW,
+};
+
+/* Default mitigation for Processor MMIO Stale Data vulnerabilities */
+static enum mmio_mitigations mmio_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_MMIO_STALE_DATA) ? MMIO_MITIGATION_AUTO : MMIO_MITIGATION_OFF;
+
+enum rfds_mitigations {
+ RFDS_MITIGATION_OFF,
+ RFDS_MITIGATION_AUTO,
+ RFDS_MITIGATION_VERW,
+ RFDS_MITIGATION_UCODE_NEEDED,
+};
+
+/* Default mitigation for Register File Data Sampling */
+static enum rfds_mitigations rfds_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_RFDS) ? RFDS_MITIGATION_AUTO : RFDS_MITIGATION_OFF;
+
+/*
+ * Set if any of MDS/TAA/MMIO/RFDS are going to enable VERW clearing on exit to
+ * userspace *and* on entry to KVM guests.
+ */
+static bool verw_clear_cpu_buf_mitigation_selected __ro_after_init;
+
+static void __init mds_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_MDS)) {
+ mds_mitigation = MDS_MITIGATION_OFF;
+ return;
+ }
+
+ if (mds_mitigation == MDS_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_MDS))
+ mds_mitigation = MDS_MITIGATION_FULL;
+ else
+ mds_mitigation = MDS_MITIGATION_OFF;
+ }
+
+ if (mds_mitigation == MDS_MITIGATION_OFF)
+ return;
+
+ verw_clear_cpu_buf_mitigation_selected = true;
+}
+
+static void __init mds_update_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_MDS))
+ return;
+
+ /* If TAA, MMIO, or RFDS are being mitigated, MDS gets mitigated too. */
+ if (verw_clear_cpu_buf_mitigation_selected)
+ mds_mitigation = MDS_MITIGATION_FULL;
+
+ if (mds_mitigation == MDS_MITIGATION_FULL) {
+ if (!boot_cpu_has(X86_FEATURE_MD_CLEAR))
+ mds_mitigation = MDS_MITIGATION_VMWERV;
+ }
+
+ pr_info("%s\n", mds_strings[mds_mitigation]);
+}
+
+static void __init mds_apply_mitigation(void)
+{
+ if (mds_mitigation == MDS_MITIGATION_FULL ||
+ mds_mitigation == MDS_MITIGATION_VMWERV) {
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM);
+ if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) &&
+ (mds_nosmt || smt_mitigations == SMT_MITIGATIONS_ON))
+ cpu_smt_disable(false);
+ }
+}
+
+static int __init mds_cmdline(char *str)
+{
+ if (!boot_cpu_has_bug(X86_BUG_MDS))
+ return 0;
+
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off"))
+ mds_mitigation = MDS_MITIGATION_OFF;
+ else if (!strcmp(str, "full"))
+ mds_mitigation = MDS_MITIGATION_FULL;
+ else if (!strcmp(str, "full,nosmt")) {
+ mds_mitigation = MDS_MITIGATION_FULL;
+ mds_nosmt = true;
+ }
+
+ return 0;
+}
+early_param("mds", mds_cmdline);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "TAA: " fmt
+
+static bool taa_nosmt __ro_after_init;
+
+static const char * const taa_strings[] = {
+ [TAA_MITIGATION_OFF] = "Vulnerable",
+ [TAA_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode",
+ [TAA_MITIGATION_VERW] = "Mitigation: Clear CPU buffers",
+ [TAA_MITIGATION_TSX_DISABLED] = "Mitigation: TSX disabled",
+};
+
+static bool __init taa_vulnerable(void)
+{
+ return boot_cpu_has_bug(X86_BUG_TAA) && boot_cpu_has(X86_FEATURE_RTM);
+}
+
+static void __init taa_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_TAA)) {
+ taa_mitigation = TAA_MITIGATION_OFF;
+ return;
+ }
+
+ /* TSX previously disabled by tsx=off */
+ if (!boot_cpu_has(X86_FEATURE_RTM)) {
+ taa_mitigation = TAA_MITIGATION_TSX_DISABLED;
+ return;
+ }
+
+ /* Microcode will be checked in taa_update_mitigation(). */
+ if (taa_mitigation == TAA_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_TAA))
+ taa_mitigation = TAA_MITIGATION_VERW;
+ else
+ taa_mitigation = TAA_MITIGATION_OFF;
+ }
+
+ if (taa_mitigation != TAA_MITIGATION_OFF)
+ verw_clear_cpu_buf_mitigation_selected = true;
+}
+
+static void __init taa_update_mitigation(void)
+{
+ if (!taa_vulnerable())
+ return;
+
+ if (verw_clear_cpu_buf_mitigation_selected)
+ taa_mitigation = TAA_MITIGATION_VERW;
+
+ if (taa_mitigation == TAA_MITIGATION_VERW) {
+ /* Check if the requisite ucode is available. */
+ if (!boot_cpu_has(X86_FEATURE_MD_CLEAR))
+ taa_mitigation = TAA_MITIGATION_UCODE_NEEDED;
+
+ /*
+ * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1.
+ * A microcode update fixes this behavior to clear CPU buffers. It also
+ * adds support for MSR_IA32_TSX_CTRL which is enumerated by the
+ * ARCH_CAP_TSX_CTRL_MSR bit.
+ *
+		 * On MDS_NO=1 CPUs, if ARCH_CAP_TSX_CTRL_MSR is not set, a
+		 * microcode update is required.
+ */
+ if ((x86_arch_cap_msr & ARCH_CAP_MDS_NO) &&
+ !(x86_arch_cap_msr & ARCH_CAP_TSX_CTRL_MSR))
+ taa_mitigation = TAA_MITIGATION_UCODE_NEEDED;
+ }
+
+ pr_info("%s\n", taa_strings[taa_mitigation]);
+}
+
+static void __init taa_apply_mitigation(void)
+{
+ if (taa_mitigation == TAA_MITIGATION_VERW ||
+ taa_mitigation == TAA_MITIGATION_UCODE_NEEDED) {
+ /*
+ * TSX is enabled, select alternate mitigation for TAA which is
+ * the same as MDS. Enable MDS static branch to clear CPU buffers.
+ *
+ * For guests that can't determine whether the correct microcode is
+	 * present on the host, enable the mitigation for UCODE_NEEDED as well.
+ */
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM);
+
+ if (taa_nosmt || smt_mitigations == SMT_MITIGATIONS_ON)
+ cpu_smt_disable(false);
+ }
+}
+
+static int __init tsx_async_abort_parse_cmdline(char *str)
+{
+ if (!boot_cpu_has_bug(X86_BUG_TAA))
+ return 0;
+
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off")) {
+ taa_mitigation = TAA_MITIGATION_OFF;
+ } else if (!strcmp(str, "full")) {
+ taa_mitigation = TAA_MITIGATION_VERW;
+ } else if (!strcmp(str, "full,nosmt")) {
+ taa_mitigation = TAA_MITIGATION_VERW;
+ taa_nosmt = true;
+ }
+
+ return 0;
+}
+early_param("tsx_async_abort", tsx_async_abort_parse_cmdline);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "MMIO Stale Data: " fmt
+
+static bool mmio_nosmt __ro_after_init = false;
+
+static const char * const mmio_strings[] = {
+ [MMIO_MITIGATION_OFF] = "Vulnerable",
+ [MMIO_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode",
+ [MMIO_MITIGATION_VERW] = "Mitigation: Clear CPU buffers",
+};
+
+static void __init mmio_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) {
+ mmio_mitigation = MMIO_MITIGATION_OFF;
+ return;
+ }
+
+ /* Microcode will be checked in mmio_update_mitigation(). */
+ if (mmio_mitigation == MMIO_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_MMIO_STALE_DATA))
+ mmio_mitigation = MMIO_MITIGATION_VERW;
+ else
+ mmio_mitigation = MMIO_MITIGATION_OFF;
+ }
+
+ if (mmio_mitigation == MMIO_MITIGATION_OFF)
+ return;
+
+ /*
+ * Enable CPU buffer clear mitigation for host and VMM, if also affected
+ * by MDS or TAA.
+ */
+ if (boot_cpu_has_bug(X86_BUG_MDS) || taa_vulnerable())
+ verw_clear_cpu_buf_mitigation_selected = true;
+}
+
+static void __init mmio_update_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA))
+ return;
+
+ if (verw_clear_cpu_buf_mitigation_selected)
+ mmio_mitigation = MMIO_MITIGATION_VERW;
+
+ if (mmio_mitigation == MMIO_MITIGATION_VERW) {
+ /*
+ * Check if the system has the right microcode.
+ *
+		 * The CPU fill-buffer clear mitigation is enumerated either by an
+		 * explicit FB_CLEAR or by the presence of both MD_CLEAR and
+		 * L1D_FLUSH on MDS-affected systems.
+ */
+ if (!((x86_arch_cap_msr & ARCH_CAP_FB_CLEAR) ||
+ (boot_cpu_has(X86_FEATURE_MD_CLEAR) &&
+ boot_cpu_has(X86_FEATURE_FLUSH_L1D) &&
+ !(x86_arch_cap_msr & ARCH_CAP_MDS_NO))))
+ mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED;
+ }
+
+ pr_info("%s\n", mmio_strings[mmio_mitigation]);
+}
+
+static void __init mmio_apply_mitigation(void)
+{
+ if (mmio_mitigation == MMIO_MITIGATION_OFF)
+ return;
+
+	/*
+	 * If the generic VERW clearing is selected, it already covers both the
+	 * user/kernel boundary and VM entry; otherwise enable the MMIO-specific
+	 * clearing on VM entry only.
+	 */
+ if (verw_clear_cpu_buf_mitigation_selected) {
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM);
+ } else {
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO);
+ }
+
+ /*
+ * If Processor-MMIO-Stale-Data bug is present and Fill Buffer data can
+ * be propagated to uncore buffers, clearing the Fill buffers on idle
+ * is required irrespective of SMT state.
+ */
+ if (!(x86_arch_cap_msr & ARCH_CAP_FBSDP_NO))
+ static_branch_enable(&cpu_buf_idle_clear);
+
+ if (mmio_nosmt || smt_mitigations == SMT_MITIGATIONS_ON)
+ cpu_smt_disable(false);
+}
+
+static int __init mmio_stale_data_parse_cmdline(char *str)
+{
+ if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA))
+ return 0;
+
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off")) {
+ mmio_mitigation = MMIO_MITIGATION_OFF;
+ } else if (!strcmp(str, "full")) {
+ mmio_mitigation = MMIO_MITIGATION_VERW;
+ } else if (!strcmp(str, "full,nosmt")) {
+ mmio_mitigation = MMIO_MITIGATION_VERW;
+ mmio_nosmt = true;
+ }
+
+ return 0;
+}
+early_param("mmio_stale_data", mmio_stale_data_parse_cmdline);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "Register File Data Sampling: " fmt
+
+static const char * const rfds_strings[] = {
+ [RFDS_MITIGATION_OFF] = "Vulnerable",
+ [RFDS_MITIGATION_VERW] = "Mitigation: Clear Register File",
+ [RFDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode",
+};
+
+static inline bool __init verw_clears_cpu_reg_file(void)
+{
+ return (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR);
+}
+
+static void __init rfds_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_RFDS)) {
+ rfds_mitigation = RFDS_MITIGATION_OFF;
+ return;
+ }
+
+ if (rfds_mitigation == RFDS_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_RFDS))
+ rfds_mitigation = RFDS_MITIGATION_VERW;
+ else
+ rfds_mitigation = RFDS_MITIGATION_OFF;
+ }
+
+ if (rfds_mitigation == RFDS_MITIGATION_OFF)
+ return;
+
+ if (verw_clears_cpu_reg_file())
+ verw_clear_cpu_buf_mitigation_selected = true;
+}
+
+static void __init rfds_update_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_RFDS))
+ return;
+
+ if (verw_clear_cpu_buf_mitigation_selected)
+ rfds_mitigation = RFDS_MITIGATION_VERW;
+
+ if (rfds_mitigation == RFDS_MITIGATION_VERW) {
+ if (!verw_clears_cpu_reg_file())
+ rfds_mitigation = RFDS_MITIGATION_UCODE_NEEDED;
+ }
+
+ pr_info("%s\n", rfds_strings[rfds_mitigation]);
+}
+
+static void __init rfds_apply_mitigation(void)
+{
+ if (rfds_mitigation == RFDS_MITIGATION_VERW) {
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM);
+ }
+}
+
+static __init int rfds_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!boot_cpu_has_bug(X86_BUG_RFDS))
+ return 0;
+
+ if (!strcmp(str, "off"))
+ rfds_mitigation = RFDS_MITIGATION_OFF;
+ else if (!strcmp(str, "on"))
+ rfds_mitigation = RFDS_MITIGATION_VERW;
+
+ return 0;
+}
+early_param("reg_file_data_sampling", rfds_parse_cmdline);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "SRBDS: " fmt
+
+enum srbds_mitigations {
+ SRBDS_MITIGATION_OFF,
+ SRBDS_MITIGATION_AUTO,
+ SRBDS_MITIGATION_UCODE_NEEDED,
+ SRBDS_MITIGATION_FULL,
+ SRBDS_MITIGATION_TSX_OFF,
+ SRBDS_MITIGATION_HYPERVISOR,
+};
+
+static enum srbds_mitigations srbds_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_SRBDS) ? SRBDS_MITIGATION_AUTO : SRBDS_MITIGATION_OFF;
+
+static const char * const srbds_strings[] = {
+ [SRBDS_MITIGATION_OFF] = "Vulnerable",
+ [SRBDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode",
+ [SRBDS_MITIGATION_FULL] = "Mitigation: Microcode",
+ [SRBDS_MITIGATION_TSX_OFF] = "Mitigation: TSX disabled",
+ [SRBDS_MITIGATION_HYPERVISOR] = "Unknown: Dependent on hypervisor status",
+};
+
+static bool srbds_off;
+
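+/*
+ * Program the RNGDS_MITG_DIS bit in MSR_IA32_MCU_OPT_CTRL to match the
+ * selected SRBDS mitigation: the bit is set (microcode mitigation disabled)
+ * for the OFF and TSX_OFF modes and cleared for FULL. Skipped on unaffected
+ * CPUs, under hypervisors, and when the SRBDS microcode is not present.
+ */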
+void update_srbds_msr(void)
+{
+ u64 mcu_ctrl;
+
+ if (!boot_cpu_has_bug(X86_BUG_SRBDS))
+ return;
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return;
+
+ if (srbds_mitigation == SRBDS_MITIGATION_UCODE_NEEDED)
+ return;
+
+	/*
+	 * Skip if this is an MDS_NO CPU for which the SRBDS mitigation is not
+	 * needed because TSX is disabled, and which hasn't received the SRBDS
+	 * MSR microcode.
+	 */
+ if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL))
+ return;
+
+ rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+
+ switch (srbds_mitigation) {
+ case SRBDS_MITIGATION_OFF:
+ case SRBDS_MITIGATION_TSX_OFF:
+ mcu_ctrl |= RNGDS_MITG_DIS;
+ break;
+ case SRBDS_MITIGATION_FULL:
+ mcu_ctrl &= ~RNGDS_MITG_DIS;
+ break;
+ default:
+ break;
+ }
+
+ wrmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+}
+
+static void __init srbds_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_SRBDS)) {
+ srbds_mitigation = SRBDS_MITIGATION_OFF;
+ return;
+ }
+
+ if (srbds_mitigation == SRBDS_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_SRBDS))
+ srbds_mitigation = SRBDS_MITIGATION_FULL;
+ else {
+ srbds_mitigation = SRBDS_MITIGATION_OFF;
+ return;
+ }
+ }
+
+ /*
+ * Check to see if this is one of the MDS_NO systems supporting TSX that
+	 * are only exposed to SRBDS when TSX is enabled or when the CPU is
+	 * affected by the Processor MMIO Stale Data vulnerability.
+ */
+ if ((x86_arch_cap_msr & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM) &&
+ !boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA))
+ srbds_mitigation = SRBDS_MITIGATION_TSX_OFF;
+ else if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ srbds_mitigation = SRBDS_MITIGATION_HYPERVISOR;
+ else if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL))
+ srbds_mitigation = SRBDS_MITIGATION_UCODE_NEEDED;
+ else if (srbds_off)
+ srbds_mitigation = SRBDS_MITIGATION_OFF;
+
+ pr_info("%s\n", srbds_strings[srbds_mitigation]);
+}
+
+static void __init srbds_apply_mitigation(void)
+{
+ update_srbds_msr();
+}
+
+static int __init srbds_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!boot_cpu_has_bug(X86_BUG_SRBDS))
+ return 0;
+
+ srbds_off = !strcmp(str, "off");
+ return 0;
+}
+early_param("srbds", srbds_parse_cmdline);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "L1D Flush : " fmt
+
+enum l1d_flush_mitigations {
+ L1D_FLUSH_OFF = 0,
+ L1D_FLUSH_ON,
+};
- if (!boot_cpu_data.hard_math) {
-#ifndef CONFIG_MATH_EMULATION
- printk(KERN_EMERG "No coprocessor found and no math emulation present.\n");
- printk(KERN_EMERG "Giving up.\n");
- for (;;) ;
+static enum l1d_flush_mitigations l1d_flush_mitigation __initdata = L1D_FLUSH_OFF;
+
+static void __init l1d_flush_select_mitigation(void)
+{
+ if (!l1d_flush_mitigation || !boot_cpu_has(X86_FEATURE_FLUSH_L1D))
+ return;
+
+ static_branch_enable(&switch_mm_cond_l1d_flush);
+ pr_info("Conditional flush on switch_mm() enabled\n");
+}
+
+static int __init l1d_flush_parse_cmdline(char *str)
+{
+ if (!strcmp(str, "on"))
+ l1d_flush_mitigation = L1D_FLUSH_ON;
+
+ return 0;
+}
+early_param("l1d_flush", l1d_flush_parse_cmdline);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "GDS: " fmt
+
+enum gds_mitigations {
+ GDS_MITIGATION_OFF,
+ GDS_MITIGATION_AUTO,
+ GDS_MITIGATION_UCODE_NEEDED,
+ GDS_MITIGATION_FORCE,
+ GDS_MITIGATION_FULL,
+ GDS_MITIGATION_FULL_LOCKED,
+ GDS_MITIGATION_HYPERVISOR,
+};
+
+static enum gds_mitigations gds_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_GDS) ? GDS_MITIGATION_AUTO : GDS_MITIGATION_OFF;
+
+static const char * const gds_strings[] = {
+ [GDS_MITIGATION_OFF] = "Vulnerable",
+ [GDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode",
+ [GDS_MITIGATION_FORCE] = "Mitigation: AVX disabled, no microcode",
+ [GDS_MITIGATION_FULL] = "Mitigation: Microcode",
+ [GDS_MITIGATION_FULL_LOCKED] = "Mitigation: Microcode (locked)",
+ [GDS_MITIGATION_HYPERVISOR] = "Unknown: Dependent on hypervisor status",
+};
+
+bool gds_ucode_mitigated(void)
+{
+ return (gds_mitigation == GDS_MITIGATION_FULL ||
+ gds_mitigation == GDS_MITIGATION_FULL_LOCKED);
+}
+EXPORT_SYMBOL_FOR_KVM(gds_ucode_mitigated);
+
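+/*
+ * Toggle GDS_MITG_DIS in MSR_IA32_MCU_OPT_CTRL according to gds_mitigation
+ * and read the MSR back to catch the case where the write is ignored
+ * because this CPU is locked but the boot CPU was not.
+ */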
+void update_gds_msr(void)
+{
+ u64 mcu_ctrl_after;
+ u64 mcu_ctrl;
+
+ switch (gds_mitigation) {
+ case GDS_MITIGATION_OFF:
+ rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+ mcu_ctrl |= GDS_MITG_DIS;
+ break;
+ case GDS_MITIGATION_FULL_LOCKED:
+ /*
+ * The LOCKED state comes from the boot CPU. APs might not have
+ * the same state. Make sure the mitigation is enabled on all
+ * CPUs.
+ */
+ case GDS_MITIGATION_FULL:
+ rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+ mcu_ctrl &= ~GDS_MITG_DIS;
+ break;
+ case GDS_MITIGATION_FORCE:
+ case GDS_MITIGATION_UCODE_NEEDED:
+ case GDS_MITIGATION_HYPERVISOR:
+ case GDS_MITIGATION_AUTO:
+ return;
+ }
+
+ wrmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+
+ /*
+ * Check to make sure that the WRMSR value was not ignored. Writes to
+ * GDS_MITG_DIS will be ignored if this processor is locked but the boot
+ * processor was not.
+ */
+ rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl_after);
+ WARN_ON_ONCE(mcu_ctrl != mcu_ctrl_after);
+}
+
+static void __init gds_select_mitigation(void)
+{
+ u64 mcu_ctrl;
+
+ if (!boot_cpu_has_bug(X86_BUG_GDS))
+ return;
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+ gds_mitigation = GDS_MITIGATION_HYPERVISOR;
+ return;
+ }
+
+ /* Will verify below that mitigation _can_ be disabled */
+ if (gds_mitigation == GDS_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_GDS))
+ gds_mitigation = GDS_MITIGATION_FULL;
+ else
+ gds_mitigation = GDS_MITIGATION_OFF;
+ }
+
+ /* No microcode */
+ if (!(x86_arch_cap_msr & ARCH_CAP_GDS_CTRL)) {
+ if (gds_mitigation != GDS_MITIGATION_FORCE)
+ gds_mitigation = GDS_MITIGATION_UCODE_NEEDED;
+ return;
+ }
+
+ /* Microcode has mitigation, use it */
+ if (gds_mitigation == GDS_MITIGATION_FORCE)
+ gds_mitigation = GDS_MITIGATION_FULL;
+
+ rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+ if (mcu_ctrl & GDS_MITG_LOCKED) {
+ if (gds_mitigation == GDS_MITIGATION_OFF)
+ pr_warn("Mitigation locked. Disable failed.\n");
+
+ /*
+		 * The mitigation is selected on the boot CPU. All other CPUs
+		 * _should_ have the same state. If the boot CPU isn't locked
+		 * but others are, then update_gds_msr() will WARN() about the
+		 * state mismatch. If the boot CPU is locked, update_gds_msr()
+		 * will ensure the other CPUs have the mitigation enabled.
+ */
+ gds_mitigation = GDS_MITIGATION_FULL_LOCKED;
+ }
+}
+
+static void __init gds_apply_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_GDS))
+ return;
+
+ /* Microcode is present */
+ if (x86_arch_cap_msr & ARCH_CAP_GDS_CTRL)
+ update_gds_msr();
+ else if (gds_mitigation == GDS_MITIGATION_FORCE) {
+ /*
+		 * This only needs to be done on the boot CPU, so do it
+		 * here rather than in update_gds_msr().
+ */
+ setup_clear_cpu_cap(X86_FEATURE_AVX);
+ pr_warn("Microcode update needed! Disabling AVX as mitigation.\n");
+ }
+
+ pr_info("%s\n", gds_strings[gds_mitigation]);
+}
+
+static int __init gds_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!boot_cpu_has_bug(X86_BUG_GDS))
+ return 0;
+
+ if (!strcmp(str, "off"))
+ gds_mitigation = GDS_MITIGATION_OFF;
+ else if (!strcmp(str, "force"))
+ gds_mitigation = GDS_MITIGATION_FORCE;
+
+ return 0;
+}
+early_param("gather_data_sampling", gds_parse_cmdline);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "Spectre V1 : " fmt
+
+enum spectre_v1_mitigation {
+ SPECTRE_V1_MITIGATION_NONE,
+ SPECTRE_V1_MITIGATION_AUTO,
+};
+
+static enum spectre_v1_mitigation spectre_v1_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V1) ?
+ SPECTRE_V1_MITIGATION_AUTO : SPECTRE_V1_MITIGATION_NONE;
+
+static const char * const spectre_v1_strings[] = {
+ [SPECTRE_V1_MITIGATION_NONE] = "Vulnerable: __user pointer sanitization and usercopy barriers only; no swapgs barriers",
+ [SPECTRE_V1_MITIGATION_AUTO] = "Mitigation: usercopy/swapgs barriers and __user pointer sanitization",
+};
+
+/*
+ * Does SMAP provide full mitigation against speculative kernel access to
+ * userspace?
+ */
+static bool smap_works_speculatively(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_SMAP))
+ return false;
+
+ /*
+ * On CPUs which are vulnerable to Meltdown, SMAP does not
+ * prevent speculative access to user data in the L1 cache.
+ * Consider SMAP to be non-functional as a mitigation on these
+ * CPUs.
+ */
+ if (boot_cpu_has(X86_BUG_CPU_MELTDOWN))
+ return false;
+
+ return true;
+}
+
+static void __init spectre_v1_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1))
+ spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE;
+
+ if (!should_mitigate_vuln(X86_BUG_SPECTRE_V1))
+ spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE;
+}
+
+static void __init spectre_v1_apply_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1))
+ return;
+
+ if (spectre_v1_mitigation == SPECTRE_V1_MITIGATION_AUTO) {
+ /*
+ * With Spectre v1, a user can speculatively control either
+ * path of a conditional swapgs with a user-controlled GS
+ * value. The mitigation is to add lfences to both code paths.
+ *
+ * If FSGSBASE is enabled, the user can put a kernel address in
+ * GS, in which case SMAP provides no protection.
+ *
+ * If FSGSBASE is disabled, the user can only put a user space
+ * address in GS. That makes an attack harder, but still
+ * possible if there's no SMAP protection.
+ */
+ if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
+ !smap_works_speculatively()) {
+ /*
+ * Mitigation can be provided from SWAPGS itself or
+ * PTI as the CR3 write in the Meltdown mitigation
+ * is serializing.
+ *
+ * If neither is there, mitigate with an LFENCE to
+ * stop speculation through swapgs.
+ */
+ if (boot_cpu_has_bug(X86_BUG_SWAPGS) &&
+ !boot_cpu_has(X86_FEATURE_PTI))
+ setup_force_cpu_cap(X86_FEATURE_FENCE_SWAPGS_USER);
+
+ /*
+ * Enable lfences in the kernel entry (non-swapgs)
+ * paths, to prevent user entry from speculatively
+ * skipping swapgs.
+ */
+ setup_force_cpu_cap(X86_FEATURE_FENCE_SWAPGS_KERNEL);
+ }
+ }
+
+ pr_info("%s\n", spectre_v1_strings[spectre_v1_mitigation]);
+}
+
+static int __init nospectre_v1_cmdline(char *str)
+{
+ spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE;
+ return 0;
+}
+early_param("nospectre_v1", nospectre_v1_cmdline);
+
+enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = SPECTRE_V2_NONE;
+
+/* Depends on spectre_v2 mitigation selected already */
+static inline bool cdt_possible(enum spectre_v2_mitigation mode)
+{
+ if (!IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) ||
+ !IS_ENABLED(CONFIG_MITIGATION_RETPOLINE))
+ return false;
+
+ if (mode == SPECTRE_V2_RETPOLINE ||
+ mode == SPECTRE_V2_EIBRS_RETPOLINE)
+ return true;
+
+ return false;
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) "RETBleed: " fmt
+
+enum its_mitigation {
+ ITS_MITIGATION_OFF,
+ ITS_MITIGATION_AUTO,
+ ITS_MITIGATION_VMEXIT_ONLY,
+ ITS_MITIGATION_ALIGNED_THUNKS,
+ ITS_MITIGATION_RETPOLINE_STUFF,
+};
+
+static enum its_mitigation its_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_ITS) ? ITS_MITIGATION_AUTO : ITS_MITIGATION_OFF;
+
+enum retbleed_mitigation {
+ RETBLEED_MITIGATION_NONE,
+ RETBLEED_MITIGATION_AUTO,
+ RETBLEED_MITIGATION_UNRET,
+ RETBLEED_MITIGATION_IBPB,
+ RETBLEED_MITIGATION_IBRS,
+ RETBLEED_MITIGATION_EIBRS,
+ RETBLEED_MITIGATION_STUFF,
+};
+
+static const char * const retbleed_strings[] = {
+ [RETBLEED_MITIGATION_NONE] = "Vulnerable",
+ [RETBLEED_MITIGATION_UNRET] = "Mitigation: untrained return thunk",
+ [RETBLEED_MITIGATION_IBPB] = "Mitigation: IBPB",
+ [RETBLEED_MITIGATION_IBRS] = "Mitigation: IBRS",
+ [RETBLEED_MITIGATION_EIBRS] = "Mitigation: Enhanced IBRS",
+ [RETBLEED_MITIGATION_STUFF] = "Mitigation: Stuffing",
+};
+
+static enum retbleed_mitigation retbleed_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_RETBLEED) ? RETBLEED_MITIGATION_AUTO : RETBLEED_MITIGATION_NONE;
+
+static int __ro_after_init retbleed_nosmt = false;
+
+enum srso_mitigation {
+ SRSO_MITIGATION_NONE,
+ SRSO_MITIGATION_AUTO,
+ SRSO_MITIGATION_UCODE_NEEDED,
+ SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED,
+ SRSO_MITIGATION_MICROCODE,
+ SRSO_MITIGATION_NOSMT,
+ SRSO_MITIGATION_SAFE_RET,
+ SRSO_MITIGATION_IBPB,
+ SRSO_MITIGATION_IBPB_ON_VMEXIT,
+ SRSO_MITIGATION_BP_SPEC_REDUCE,
+};
+
+static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_AUTO;
+
+static int __init retbleed_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ while (str) {
+ char *next = strchr(str, ',');
+ if (next) {
+ *next = 0;
+ next++;
+ }
+
+ if (!strcmp(str, "off")) {
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
+ } else if (!strcmp(str, "auto")) {
+ retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
+ } else if (!strcmp(str, "unret")) {
+ retbleed_mitigation = RETBLEED_MITIGATION_UNRET;
+ } else if (!strcmp(str, "ibpb")) {
+ retbleed_mitigation = RETBLEED_MITIGATION_IBPB;
+ } else if (!strcmp(str, "stuff")) {
+ retbleed_mitigation = RETBLEED_MITIGATION_STUFF;
+ } else if (!strcmp(str, "nosmt")) {
+ retbleed_nosmt = true;
+ } else if (!strcmp(str, "force")) {
+ setup_force_cpu_bug(X86_BUG_RETBLEED);
+ } else {
+ pr_err("Ignoring unknown retbleed option (%s).", str);
+ }
+
+ str = next;
+ }
+
+ return 0;
+}
+early_param("retbleed", retbleed_parse_cmdline);
+
+#define RETBLEED_UNTRAIN_MSG "WARNING: BTB untrained return thunk mitigation is only effective on AMD/Hygon!\n"
+#define RETBLEED_INTEL_MSG "WARNING: Spectre v2 mitigation leaves CPU vulnerable to RETBleed attacks, data leaks possible!\n"
+
+static void __init retbleed_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_RETBLEED)) {
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
+ return;
+ }
+
+ switch (retbleed_mitigation) {
+ case RETBLEED_MITIGATION_UNRET:
+ if (!IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) {
+ retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
+ pr_err("WARNING: kernel not compiled with MITIGATION_UNRET_ENTRY.\n");
+ }
+ break;
+ case RETBLEED_MITIGATION_IBPB:
+ if (!boot_cpu_has(X86_FEATURE_IBPB)) {
+ pr_err("WARNING: CPU does not support IBPB.\n");
+ retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
+ } else if (!IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) {
+ pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n");
+ retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
+ }
+ break;
+ case RETBLEED_MITIGATION_STUFF:
+ if (!IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING)) {
+ pr_err("WARNING: kernel not compiled with MITIGATION_CALL_DEPTH_TRACKING.\n");
+ retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
+ } else if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
+ pr_err("WARNING: retbleed=stuff only supported for Intel CPUs.\n");
+ retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (retbleed_mitigation != RETBLEED_MITIGATION_AUTO)
+ return;
+
+ if (!should_mitigate_vuln(X86_BUG_RETBLEED)) {
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
+ return;
+ }
+
+ /* Intel mitigation selected in retbleed_update_mitigation() */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
+ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) {
+ if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY))
+ retbleed_mitigation = RETBLEED_MITIGATION_UNRET;
+ else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY) &&
+ boot_cpu_has(X86_FEATURE_IBPB))
+ retbleed_mitigation = RETBLEED_MITIGATION_IBPB;
+ else
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
+ } else if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
+ /* Final mitigation depends on spectre-v2 selection */
+ if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED))
+ retbleed_mitigation = RETBLEED_MITIGATION_EIBRS;
+ else if (boot_cpu_has(X86_FEATURE_IBRS))
+ retbleed_mitigation = RETBLEED_MITIGATION_IBRS;
+ else
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
+ }
+}
+
+static void __init retbleed_update_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_RETBLEED))
+ return;
+
+ /* ITS can also enable stuffing */
+ if (its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF)
+ retbleed_mitigation = RETBLEED_MITIGATION_STUFF;
+
+ /* If SRSO is using IBPB, that works for retbleed too */
+ if (srso_mitigation == SRSO_MITIGATION_IBPB)
+ retbleed_mitigation = RETBLEED_MITIGATION_IBPB;
+
+ if (retbleed_mitigation == RETBLEED_MITIGATION_STUFF &&
+ !cdt_possible(spectre_v2_enabled)) {
+ pr_err("WARNING: retbleed=stuff depends on retpoline\n");
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
+ }
+
+ /*
+	 * On Intel, let IBRS/eIBRS trump whatever the retbleed= cmdline option
+	 * selected, with call-depth-based stuffing as the only exception.
+ */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
+ switch (spectre_v2_enabled) {
+ case SPECTRE_V2_IBRS:
+ retbleed_mitigation = RETBLEED_MITIGATION_IBRS;
+ break;
+ case SPECTRE_V2_EIBRS:
+ case SPECTRE_V2_EIBRS_RETPOLINE:
+ case SPECTRE_V2_EIBRS_LFENCE:
+ retbleed_mitigation = RETBLEED_MITIGATION_EIBRS;
+ break;
+ default:
+ if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) {
+ if (retbleed_mitigation != RETBLEED_MITIGATION_NONE)
+ pr_err(RETBLEED_INTEL_MSG);
+
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
+ }
+ }
+ }
+
+ pr_info("%s\n", retbleed_strings[retbleed_mitigation]);
+}
+
+static void __init retbleed_apply_mitigation(void)
+{
+ bool mitigate_smt = false;
+
+ switch (retbleed_mitigation) {
+ case RETBLEED_MITIGATION_NONE:
+ return;
+
+ case RETBLEED_MITIGATION_UNRET:
+ setup_force_cpu_cap(X86_FEATURE_RETHUNK);
+ setup_force_cpu_cap(X86_FEATURE_UNRET);
+
+ set_return_thunk(retbleed_return_thunk);
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
+ pr_err(RETBLEED_UNTRAIN_MSG);
+
+ mitigate_smt = true;
+ break;
+
+ case RETBLEED_MITIGATION_IBPB:
+ setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB);
+ setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT);
+ mitigate_smt = true;
+
+ /*
+ * IBPB on entry already obviates the need for
+		 * software-based untraining, so clear those in case some
+ * other mitigation like SRSO has selected them.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_UNRET);
+ setup_clear_cpu_cap(X86_FEATURE_RETHUNK);
+
+ /*
+ * There is no need for RSB filling: write_ibpb() ensures
+ * all predictions, including the RSB, are invalidated,
+ * regardless of IBPB implementation.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT);
+
+ break;
+
+ case RETBLEED_MITIGATION_STUFF:
+ setup_force_cpu_cap(X86_FEATURE_RETHUNK);
+ setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH);
+
+ set_return_thunk(call_depth_return_thunk);
+ break;
+
+ default:
+ break;
+ }
+
+ if (mitigate_smt && !boot_cpu_has(X86_FEATURE_STIBP) &&
+ (retbleed_nosmt || smt_mitigations == SMT_MITIGATIONS_ON))
+ cpu_smt_disable(false);
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) "ITS: " fmt
+
+static const char * const its_strings[] = {
+ [ITS_MITIGATION_OFF] = "Vulnerable",
+ [ITS_MITIGATION_VMEXIT_ONLY] = "Mitigation: Vulnerable, KVM: Not affected",
+ [ITS_MITIGATION_ALIGNED_THUNKS] = "Mitigation: Aligned branch/return thunks",
+ [ITS_MITIGATION_RETPOLINE_STUFF] = "Mitigation: Retpolines, Stuffing RSB",
+};
+
+static int __init its_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!IS_ENABLED(CONFIG_MITIGATION_ITS)) {
+ pr_err("Mitigation disabled at compile time, ignoring option (%s)", str);
+ return 0;
+ }
+
+ if (!strcmp(str, "off")) {
+ its_mitigation = ITS_MITIGATION_OFF;
+ } else if (!strcmp(str, "on")) {
+ its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
+ } else if (!strcmp(str, "force")) {
+ its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
+ setup_force_cpu_bug(X86_BUG_ITS);
+ } else if (!strcmp(str, "vmexit")) {
+ its_mitigation = ITS_MITIGATION_VMEXIT_ONLY;
+ } else if (!strcmp(str, "stuff")) {
+ its_mitigation = ITS_MITIGATION_RETPOLINE_STUFF;
+ } else {
+ pr_err("Ignoring unknown indirect_target_selection option (%s).", str);
+ }
+
+ return 0;
+}
+early_param("indirect_target_selection", its_parse_cmdline);
+
+static void __init its_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_ITS)) {
+ its_mitigation = ITS_MITIGATION_OFF;
+ return;
+ }
+
+ if (its_mitigation == ITS_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_ITS))
+ its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
+ else
+ its_mitigation = ITS_MITIGATION_OFF;
+ }
+
+ if (its_mitigation == ITS_MITIGATION_OFF)
+ return;
+
+ if (!IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) ||
+ !IS_ENABLED(CONFIG_MITIGATION_RETHUNK)) {
+ pr_err("WARNING: ITS mitigation depends on retpoline and rethunk support\n");
+ its_mitigation = ITS_MITIGATION_OFF;
+ return;
+ }
+
+ if (IS_ENABLED(CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B)) {
+ pr_err("WARNING: ITS mitigation is not compatible with CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B\n");
+ its_mitigation = ITS_MITIGATION_OFF;
+ return;
+ }
+
+ if (its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF &&
+ !IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING)) {
+ pr_err("RSB stuff mitigation not supported, using default\n");
+ its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
+ }
+
+ if (its_mitigation == ITS_MITIGATION_VMEXIT_ONLY &&
+ !boot_cpu_has_bug(X86_BUG_ITS_NATIVE_ONLY))
+ its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
+}
+
+static void __init its_update_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_ITS))
+ return;
+
+ switch (spectre_v2_enabled) {
+ case SPECTRE_V2_NONE:
+ if (its_mitigation != ITS_MITIGATION_OFF)
+ pr_err("WARNING: Spectre-v2 mitigation is off, disabling ITS\n");
+ its_mitigation = ITS_MITIGATION_OFF;
+ break;
+ case SPECTRE_V2_RETPOLINE:
+ case SPECTRE_V2_EIBRS_RETPOLINE:
+ /* Retpoline+CDT mitigates ITS */
+ if (retbleed_mitigation == RETBLEED_MITIGATION_STUFF)
+ its_mitigation = ITS_MITIGATION_RETPOLINE_STUFF;
+ break;
+ case SPECTRE_V2_LFENCE:
+ case SPECTRE_V2_EIBRS_LFENCE:
+ pr_err("WARNING: ITS mitigation is not compatible with lfence mitigation\n");
+ its_mitigation = ITS_MITIGATION_OFF;
+ break;
+ default:
+ break;
+ }
+
+ if (its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF &&
+ !cdt_possible(spectre_v2_enabled))
+ its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
+
+ pr_info("%s\n", its_strings[its_mitigation]);
+}
+
+static void __init its_apply_mitigation(void)
+{
+ switch (its_mitigation) {
+ case ITS_MITIGATION_OFF:
+ case ITS_MITIGATION_AUTO:
+ case ITS_MITIGATION_VMEXIT_ONLY:
+ break;
+ case ITS_MITIGATION_ALIGNED_THUNKS:
+ if (!boot_cpu_has(X86_FEATURE_RETPOLINE))
+ setup_force_cpu_cap(X86_FEATURE_INDIRECT_THUNK_ITS);
+
+ setup_force_cpu_cap(X86_FEATURE_RETHUNK);
+ set_return_thunk(its_return_thunk);
+ break;
+ case ITS_MITIGATION_RETPOLINE_STUFF:
+ setup_force_cpu_cap(X86_FEATURE_RETHUNK);
+ setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH);
+ set_return_thunk(call_depth_return_thunk);
+ break;
+ }
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) "Transient Scheduler Attacks: " fmt
+
+enum tsa_mitigations {
+ TSA_MITIGATION_NONE,
+ TSA_MITIGATION_AUTO,
+ TSA_MITIGATION_UCODE_NEEDED,
+ TSA_MITIGATION_USER_KERNEL,
+ TSA_MITIGATION_VM,
+ TSA_MITIGATION_FULL,
+};
+
+static const char * const tsa_strings[] = {
+ [TSA_MITIGATION_NONE] = "Vulnerable",
+ [TSA_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode",
+ [TSA_MITIGATION_USER_KERNEL] = "Mitigation: Clear CPU buffers: user/kernel boundary",
+ [TSA_MITIGATION_VM] = "Mitigation: Clear CPU buffers: VM",
+ [TSA_MITIGATION_FULL] = "Mitigation: Clear CPU buffers",
+};
+
+static enum tsa_mitigations tsa_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_TSA) ? TSA_MITIGATION_AUTO : TSA_MITIGATION_NONE;
+
+static int __init tsa_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off"))
+ tsa_mitigation = TSA_MITIGATION_NONE;
+ else if (!strcmp(str, "on"))
+ tsa_mitigation = TSA_MITIGATION_FULL;
+ else if (!strcmp(str, "user"))
+ tsa_mitigation = TSA_MITIGATION_USER_KERNEL;
+ else if (!strcmp(str, "vm"))
+ tsa_mitigation = TSA_MITIGATION_VM;
+ else
+ pr_err("Ignoring unknown tsa=%s option.\n", str);
+
+ return 0;
+}
+early_param("tsa", tsa_parse_cmdline);
+
+static void __init tsa_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_TSA)) {
+ tsa_mitigation = TSA_MITIGATION_NONE;
+ return;
+ }
+
+ if (tsa_mitigation == TSA_MITIGATION_AUTO) {
+ bool vm = false, uk = false;
+
+ tsa_mitigation = TSA_MITIGATION_NONE;
+
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER)) {
+ tsa_mitigation = TSA_MITIGATION_USER_KERNEL;
+ uk = true;
+ }
+
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST)) {
+ tsa_mitigation = TSA_MITIGATION_VM;
+ vm = true;
+ }
+
+ if (uk && vm)
+ tsa_mitigation = TSA_MITIGATION_FULL;
+ }
+
+ if (tsa_mitigation == TSA_MITIGATION_NONE)
+ return;
+
+ if (!boot_cpu_has(X86_FEATURE_VERW_CLEAR))
+ tsa_mitigation = TSA_MITIGATION_UCODE_NEEDED;
+
+ /*
+	 * There is no need to set verw_clear_cpu_buf_mitigation_selected: it
+	 * doesn't fit all of the cases here, and it isn't needed because this
+	 * is the only VERW-based mitigation on AMD.
+ */
+ pr_info("%s\n", tsa_strings[tsa_mitigation]);
+}
+
+static void __init tsa_apply_mitigation(void)
+{
+ switch (tsa_mitigation) {
+ case TSA_MITIGATION_USER_KERNEL:
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+ break;
+ case TSA_MITIGATION_VM:
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM);
+ break;
+ case TSA_MITIGATION_FULL:
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM);
+ break;
+ default:
+ break;
+ }
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) "Spectre V2 : " fmt
+
+static enum spectre_v2_user_mitigation spectre_v2_user_stibp __ro_after_init =
+ SPECTRE_V2_USER_NONE;
+static enum spectre_v2_user_mitigation spectre_v2_user_ibpb __ro_after_init =
+ SPECTRE_V2_USER_NONE;
+
+#ifdef CONFIG_MITIGATION_RETPOLINE
+static bool spectre_v2_bad_module;
+
+bool retpoline_module_ok(bool has_retpoline)
+{
+ if (spectre_v2_enabled == SPECTRE_V2_NONE || has_retpoline)
+ return true;
+
+ pr_err("System may be vulnerable to spectre v2\n");
+ spectre_v2_bad_module = true;
+ return false;
+}
+
+static inline const char *spectre_v2_module_string(void)
+{
+ return spectre_v2_bad_module ? " - vulnerable module loaded" : "";
+}
+#else
+static inline const char *spectre_v2_module_string(void) { return ""; }
#endif
+
+#define SPECTRE_V2_LFENCE_MSG "WARNING: LFENCE mitigation is not recommended for this CPU, data leaks possible!\n"
+#define SPECTRE_V2_EIBRS_EBPF_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS on, data leaks possible via Spectre v2 BHB attacks!\n"
+#define SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS+LFENCE mitigation and SMT, data leaks possible via Spectre v2 BHB attacks!\n"
+#define SPECTRE_V2_IBRS_PERF_MSG "WARNING: IBRS mitigation selected on Enhanced IBRS CPU, this may cause unnecessary performance loss\n"
+
+#ifdef CONFIG_BPF_SYSCALL
+void unpriv_ebpf_notify(int new_state)
+{
+ if (new_state)
return;
+
+ /* Unprivileged eBPF is enabled */
+
+ switch (spectre_v2_enabled) {
+ case SPECTRE_V2_EIBRS:
+ pr_err(SPECTRE_V2_EIBRS_EBPF_MSG);
+ break;
+ case SPECTRE_V2_EIBRS_LFENCE:
+ if (sched_smt_active())
+ pr_err(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG);
+ break;
+ default:
+ break;
+ }
+}
+#endif
+
+/* The kernel command line selection for spectre v2 */
+enum spectre_v2_mitigation_cmd {
+ SPECTRE_V2_CMD_NONE,
+ SPECTRE_V2_CMD_AUTO,
+ SPECTRE_V2_CMD_FORCE,
+ SPECTRE_V2_CMD_RETPOLINE,
+ SPECTRE_V2_CMD_RETPOLINE_GENERIC,
+ SPECTRE_V2_CMD_RETPOLINE_LFENCE,
+ SPECTRE_V2_CMD_EIBRS,
+ SPECTRE_V2_CMD_EIBRS_RETPOLINE,
+ SPECTRE_V2_CMD_EIBRS_LFENCE,
+ SPECTRE_V2_CMD_IBRS,
+};
+
+static enum spectre_v2_mitigation_cmd spectre_v2_cmd __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? SPECTRE_V2_CMD_AUTO : SPECTRE_V2_CMD_NONE;
+
+enum spectre_v2_user_mitigation_cmd {
+ SPECTRE_V2_USER_CMD_NONE,
+ SPECTRE_V2_USER_CMD_AUTO,
+ SPECTRE_V2_USER_CMD_FORCE,
+ SPECTRE_V2_USER_CMD_PRCTL,
+ SPECTRE_V2_USER_CMD_PRCTL_IBPB,
+ SPECTRE_V2_USER_CMD_SECCOMP,
+ SPECTRE_V2_USER_CMD_SECCOMP_IBPB,
+};
+
+static enum spectre_v2_user_mitigation_cmd spectre_v2_user_cmd __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? SPECTRE_V2_USER_CMD_AUTO : SPECTRE_V2_USER_CMD_NONE;
+
+static const char * const spectre_v2_user_strings[] = {
+ [SPECTRE_V2_USER_NONE] = "User space: Vulnerable",
+ [SPECTRE_V2_USER_STRICT] = "User space: Mitigation: STIBP protection",
+ [SPECTRE_V2_USER_STRICT_PREFERRED] = "User space: Mitigation: STIBP always-on protection",
+ [SPECTRE_V2_USER_PRCTL] = "User space: Mitigation: STIBP via prctl",
+ [SPECTRE_V2_USER_SECCOMP] = "User space: Mitigation: STIBP via seccomp and prctl",
+};
+
+static int __init spectre_v2_user_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "auto"))
+ spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_AUTO;
+ else if (!strcmp(str, "off"))
+ spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_NONE;
+ else if (!strcmp(str, "on"))
+ spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_FORCE;
+ else if (!strcmp(str, "prctl"))
+ spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_PRCTL;
+ else if (!strcmp(str, "prctl,ibpb"))
+ spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_PRCTL_IBPB;
+ else if (!strcmp(str, "seccomp"))
+ spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_SECCOMP;
+ else if (!strcmp(str, "seccomp,ibpb"))
+ spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_SECCOMP_IBPB;
+ else
+ pr_err("Ignoring unknown spectre_v2_user option (%s).", str);
+
+ return 0;
+}
+early_param("spectre_v2_user", spectre_v2_user_parse_cmdline);
+
+static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode)
+{
+ return spectre_v2_in_eibrs_mode(mode) || mode == SPECTRE_V2_IBRS;
+}
+
+static void __init spectre_v2_user_select_mitigation(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP))
+ return;
+
+ switch (spectre_v2_user_cmd) {
+ case SPECTRE_V2_USER_CMD_NONE:
+ return;
+ case SPECTRE_V2_USER_CMD_FORCE:
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT;
+ break;
+ case SPECTRE_V2_USER_CMD_AUTO:
+ if (!should_mitigate_vuln(X86_BUG_SPECTRE_V2_USER))
+ break;
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_PRCTL;
+ if (smt_mitigations == SMT_MITIGATIONS_OFF)
+ break;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL;
+ break;
+ case SPECTRE_V2_USER_CMD_PRCTL:
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_PRCTL;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL;
+ break;
+ case SPECTRE_V2_USER_CMD_PRCTL_IBPB:
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL;
+ break;
+ case SPECTRE_V2_USER_CMD_SECCOMP:
+ if (IS_ENABLED(CONFIG_SECCOMP))
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_SECCOMP;
+ else
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_PRCTL;
+ spectre_v2_user_stibp = spectre_v2_user_ibpb;
+ break;
+ case SPECTRE_V2_USER_CMD_SECCOMP_IBPB:
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT;
+ if (IS_ENABLED(CONFIG_SECCOMP))
+ spectre_v2_user_stibp = SPECTRE_V2_USER_SECCOMP;
+ else
+ spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL;
+ break;
}
/*
- * trap_init() enabled FXSR and company _before_ testing for FP
- * problems here.
+ * At this point, an STIBP mode other than "off" has been set.
+ * If STIBP support is not being forced, check if STIBP always-on
+ * is preferred.
+ */
+ if ((spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP) &&
+ boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON))
+ spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT_PREFERRED;
+
+ if (!boot_cpu_has(X86_FEATURE_IBPB))
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_NONE;
+
+ if (!boot_cpu_has(X86_FEATURE_STIBP))
+ spectre_v2_user_stibp = SPECTRE_V2_USER_NONE;
+}
+
+static void __init spectre_v2_user_update_mitigation(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP))
+ return;
+
+ /* The spectre_v2 cmd line can override spectre_v2_user options */
+ if (spectre_v2_cmd == SPECTRE_V2_CMD_NONE) {
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_NONE;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_NONE;
+ } else if (spectre_v2_cmd == SPECTRE_V2_CMD_FORCE) {
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT;
+ }
+
+ /*
+	 * STIBP is not required if there is no STIBP support, if Intel enhanced
+	 * IBRS is enabled, or if SMT is not possible.
*
- * Test for the divl bug..
+ * Intel's Enhanced IBRS also protects against cross-thread branch target
+ * injection in user-mode as the IBRS bit remains always set which
+ * implicitly enables cross-thread protections. However, in legacy IBRS
+ * mode, the IBRS bit is set only on kernel entry and cleared on return
+ * to userspace. AMD Automatic IBRS also does not protect userspace.
+ * These modes therefore disable the implicit cross-thread protection,
+ * so allow for STIBP to be selected in those cases.
*/
- __asm__("fninit\n\t"
- "fldl %1\n\t"
- "fdivl %2\n\t"
- "fmull %2\n\t"
- "fldl %1\n\t"
- "fsubp %%st,%%st(1)\n\t"
- "fistpl %0\n\t"
- "fwait\n\t"
- "fninit"
- : "=m" (*&fdiv_bug)
- : "m" (*&x), "m" (*&y));
+ if (!boot_cpu_has(X86_FEATURE_STIBP) ||
+ !cpu_smt_possible() ||
+ (spectre_v2_in_eibrs_mode(spectre_v2_enabled) &&
+ !boot_cpu_has(X86_FEATURE_AUTOIBRS))) {
+ spectre_v2_user_stibp = SPECTRE_V2_USER_NONE;
+ return;
+ }
+
+ if (spectre_v2_user_stibp != SPECTRE_V2_USER_NONE &&
+ (retbleed_mitigation == RETBLEED_MITIGATION_UNRET ||
+ retbleed_mitigation == RETBLEED_MITIGATION_IBPB)) {
+ if (spectre_v2_user_stibp != SPECTRE_V2_USER_STRICT &&
+ spectre_v2_user_stibp != SPECTRE_V2_USER_STRICT_PREFERRED)
+ pr_info("Selecting STIBP always-on mode to complement retbleed mitigation\n");
+ spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT_PREFERRED;
+ }
+ pr_info("%s\n", spectre_v2_user_strings[spectre_v2_user_stibp]);
+}
+
+static void __init spectre_v2_user_apply_mitigation(void)
+{
+ /* Initialize Indirect Branch Prediction Barrier */
+ if (spectre_v2_user_ibpb != SPECTRE_V2_USER_NONE) {
+ static_branch_enable(&switch_vcpu_ibpb);
+
+ switch (spectre_v2_user_ibpb) {
+ case SPECTRE_V2_USER_STRICT:
+ static_branch_enable(&switch_mm_always_ibpb);
+ break;
+ case SPECTRE_V2_USER_PRCTL:
+ case SPECTRE_V2_USER_SECCOMP:
+ static_branch_enable(&switch_mm_cond_ibpb);
+ break;
+ default:
+ break;
+ }
+
+ pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n",
+ static_key_enabled(&switch_mm_always_ibpb) ?
+ "always-on" : "conditional");
+ }
+}
+
+static const char * const spectre_v2_strings[] = {
+ [SPECTRE_V2_NONE] = "Vulnerable",
+ [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines",
+ [SPECTRE_V2_LFENCE] = "Vulnerable: LFENCE",
+ [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced / Automatic IBRS",
+ [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced / Automatic IBRS + LFENCE",
+ [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced / Automatic IBRS + Retpolines",
+ [SPECTRE_V2_IBRS] = "Mitigation: IBRS",
+};
+
+static bool nospectre_v2 __ro_after_init;
- boot_cpu_data.fdiv_bug = fdiv_bug;
- if (boot_cpu_data.fdiv_bug)
- printk("Hmm, FPU with FDIV bug.\n");
+static int __init nospectre_v2_parse_cmdline(char *str)
+{
+ nospectre_v2 = true;
+ spectre_v2_cmd = SPECTRE_V2_CMD_NONE;
+ return 0;
}
+early_param("nospectre_v2", nospectre_v2_parse_cmdline);
-static void __init check_hlt(void)
+static int __init spectre_v2_parse_cmdline(char *str)
{
- if (paravirt_enabled())
+ if (!str)
+ return -EINVAL;
+
+ if (nospectre_v2)
+ return 0;
+
+ if (!strcmp(str, "off")) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_NONE;
+ } else if (!strcmp(str, "on")) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_FORCE;
+ setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
+ setup_force_cpu_bug(X86_BUG_SPECTRE_V2_USER);
+ } else if (!strcmp(str, "retpoline")) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_RETPOLINE;
+ } else if (!strcmp(str, "retpoline,amd") ||
+ !strcmp(str, "retpoline,lfence")) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_RETPOLINE_LFENCE;
+ } else if (!strcmp(str, "retpoline,generic")) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_RETPOLINE_GENERIC;
+ } else if (!strcmp(str, "eibrs")) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_EIBRS;
+ } else if (!strcmp(str, "eibrs,lfence")) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_EIBRS_LFENCE;
+ } else if (!strcmp(str, "eibrs,retpoline")) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_EIBRS_RETPOLINE;
+ } else if (!strcmp(str, "auto")) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_AUTO;
+ } else if (!strcmp(str, "ibrs")) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_IBRS;
+ } else {
+ pr_err("Ignoring unknown spectre_v2 option (%s).", str);
+ }
+
+ return 0;
+}
+early_param("spectre_v2", spectre_v2_parse_cmdline);
+
+static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void)
+{
+ if (!IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)) {
+ pr_err("Kernel not compiled with retpoline; no mitigation available!");
+ return SPECTRE_V2_NONE;
+ }
+
+ return SPECTRE_V2_RETPOLINE;
+}
+
+static bool __ro_after_init rrsba_disabled;
+
+/* Disable in-kernel use of non-RSB RET predictors */
+static void __init spec_ctrl_disable_kernel_rrsba(void)
+{
+ if (rrsba_disabled)
return;
- printk(KERN_INFO "Checking 'hlt' instruction... ");
- if (!boot_cpu_data.hlt_works_ok) {
- printk("disabled\n");
+ if (!(x86_arch_cap_msr & ARCH_CAP_RRSBA)) {
+ rrsba_disabled = true;
return;
}
- halt();
- halt();
- halt();
- halt();
- printk("OK.\n");
+
+ if (!boot_cpu_has(X86_FEATURE_RRSBA_CTRL))
+ return;
+
+ x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S;
+ update_spec_ctrl(x86_spec_ctrl_base);
+ rrsba_disabled = true;
+}
+
+static void __init spectre_v2_select_rsb_mitigation(enum spectre_v2_mitigation mode)
+{
+ /*
+ * WARNING! There are many subtleties to consider when changing *any*
+ * code related to RSB-related mitigations. Before doing so, carefully
+ * read the following document, and update if necessary:
+ *
+ * Documentation/admin-guide/hw-vuln/rsb.rst
+ *
+ * In an overly simplified nutshell:
+ *
+ * - User->user RSB attacks are conditionally mitigated during
+ * context switches by cond_mitigation -> write_ibpb().
+ *
+ * - User->kernel and guest->host attacks are mitigated by eIBRS or
+ * RSB filling.
+ *
+	 * Note that, depending on the config, other alternative mitigations
+	 * may end up being used instead, e.g., IBPB on entry/vmexit, call
+	 * depth tracking, or return thunks.
+ */
+
+ switch (mode) {
+ case SPECTRE_V2_NONE:
+ break;
+
+ case SPECTRE_V2_EIBRS:
+ case SPECTRE_V2_EIBRS_LFENCE:
+ case SPECTRE_V2_EIBRS_RETPOLINE:
+ if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) {
+ pr_info("Spectre v2 / PBRSB-eIBRS: Retire a single CALL on VMEXIT\n");
+ setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE);
+ }
+ break;
+
+ case SPECTRE_V2_RETPOLINE:
+ case SPECTRE_V2_LFENCE:
+ case SPECTRE_V2_IBRS:
+ pr_info("Spectre v2 / SpectreRSB: Filling RSB on context switch and VMEXIT\n");
+ setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
+ setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT);
+ break;
+
+ default:
+ pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation\n");
+ dump_stack();
+ break;
+ }
}
/*
- * Most 386 processors have a bug where a POPAD can lock the
- * machine even from user space.
+ * Set BHI_DIS_S to prevent indirect branches in the kernel from being
+ * influenced by branch history in userspace. Not needed if BHI_NO is set.
*/
+static bool __init spec_ctrl_bhi_dis(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_BHI_CTRL))
+ return false;
+
+ x86_spec_ctrl_base |= SPEC_CTRL_BHI_DIS_S;
+ update_spec_ctrl(x86_spec_ctrl_base);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_HW);
+
+ return true;
+}
+
+enum bhi_mitigations {
+ BHI_MITIGATION_OFF,
+ BHI_MITIGATION_AUTO,
+ BHI_MITIGATION_ON,
+ BHI_MITIGATION_VMEXIT_ONLY,
+};
+
+static enum bhi_mitigations bhi_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_SPECTRE_BHI) ? BHI_MITIGATION_AUTO : BHI_MITIGATION_OFF;
+
+static int __init spectre_bhi_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off"))
+ bhi_mitigation = BHI_MITIGATION_OFF;
+ else if (!strcmp(str, "on"))
+ bhi_mitigation = BHI_MITIGATION_ON;
+ else if (!strcmp(str, "vmexit"))
+ bhi_mitigation = BHI_MITIGATION_VMEXIT_ONLY;
+ else
+ pr_err("Ignoring unknown spectre_bhi option (%s)", str);
+
+ return 0;
+}
+early_param("spectre_bhi", spectre_bhi_parse_cmdline);
+
+static void __init bhi_select_mitigation(void)
+{
+ if (!boot_cpu_has(X86_BUG_BHI))
+ bhi_mitigation = BHI_MITIGATION_OFF;
+
+ if (bhi_mitigation != BHI_MITIGATION_AUTO)
+ return;
+
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST)) {
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL))
+ bhi_mitigation = BHI_MITIGATION_ON;
+ else
+ bhi_mitigation = BHI_MITIGATION_VMEXIT_ONLY;
+ } else {
+ bhi_mitigation = BHI_MITIGATION_OFF;
+ }
+}
+
+static void __init bhi_update_mitigation(void)
+{
+ if (spectre_v2_cmd == SPECTRE_V2_CMD_NONE)
+ bhi_mitigation = BHI_MITIGATION_OFF;
+}
+
+static void __init bhi_apply_mitigation(void)
+{
+ if (bhi_mitigation == BHI_MITIGATION_OFF)
+ return;
+
+ /* Retpoline mitigates against BHI unless the CPU has RRSBA behavior */
+ if (boot_cpu_has(X86_FEATURE_RETPOLINE) &&
+ !boot_cpu_has(X86_FEATURE_RETPOLINE_LFENCE)) {
+ spec_ctrl_disable_kernel_rrsba();
+ if (rrsba_disabled)
+ return;
+ }
+
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return;
-static void __init check_popad(void)
+ /* Mitigate in hardware if supported */
+ if (spec_ctrl_bhi_dis())
+ return;
+
+ if (bhi_mitigation == BHI_MITIGATION_VMEXIT_ONLY) {
+ pr_info("Spectre BHI mitigation: SW BHB clearing on VM exit only\n");
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_VMEXIT);
+ return;
+ }
+
+ pr_info("Spectre BHI mitigation: SW BHB clearing on syscall and VM exit\n");
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_VMEXIT);
+}
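+
+/*
+ * The resulting BHI preference order is: a non-LFENCE retpoline with RRSBA
+ * successfully disabled, then the hardware BHI_DIS_S control, then the
+ * software BHB clearing sequence (on VM exit only, or on both syscall and
+ * VM exit).
+ */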
+
+static void __init spectre_v2_select_mitigation(void)
+{
+ if ((spectre_v2_cmd == SPECTRE_V2_CMD_RETPOLINE ||
+ spectre_v2_cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE ||
+ spectre_v2_cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC ||
+ spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_LFENCE ||
+ spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) &&
+ !IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)) {
+ pr_err("RETPOLINE selected but not compiled in. Switching to AUTO select\n");
+ spectre_v2_cmd = SPECTRE_V2_CMD_AUTO;
+ }
+
+ if ((spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS ||
+ spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_LFENCE ||
+ spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) &&
+ !boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) {
+ pr_err("EIBRS selected but CPU doesn't have Enhanced or Automatic IBRS. Switching to AUTO select\n");
+ spectre_v2_cmd = SPECTRE_V2_CMD_AUTO;
+ }
+
+ if ((spectre_v2_cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE ||
+ spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_LFENCE) &&
+ !boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) {
+ pr_err("LFENCE selected, but CPU doesn't have a serializing LFENCE. Switching to AUTO select\n");
+ spectre_v2_cmd = SPECTRE_V2_CMD_AUTO;
+ }
+
+ if (spectre_v2_cmd == SPECTRE_V2_CMD_IBRS && !IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY)) {
+ pr_err("IBRS selected but not compiled in. Switching to AUTO select\n");
+ spectre_v2_cmd = SPECTRE_V2_CMD_AUTO;
+ }
+
+ if (spectre_v2_cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
+ pr_err("IBRS selected but not Intel CPU. Switching to AUTO select\n");
+ spectre_v2_cmd = SPECTRE_V2_CMD_AUTO;
+ }
+
+ if (spectre_v2_cmd == SPECTRE_V2_CMD_IBRS && !boot_cpu_has(X86_FEATURE_IBRS)) {
+ pr_err("IBRS selected but CPU doesn't have IBRS. Switching to AUTO select\n");
+ spectre_v2_cmd = SPECTRE_V2_CMD_AUTO;
+ }
+
+ if (spectre_v2_cmd == SPECTRE_V2_CMD_IBRS && cpu_feature_enabled(X86_FEATURE_XENPV)) {
+ pr_err("IBRS selected but running as XenPV guest. Switching to AUTO select\n");
+ spectre_v2_cmd = SPECTRE_V2_CMD_AUTO;
+ }
+
+ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_NONE;
+ return;
+ }
+
+ switch (spectre_v2_cmd) {
+ case SPECTRE_V2_CMD_NONE:
+ return;
+
+ case SPECTRE_V2_CMD_AUTO:
+ if (!should_mitigate_vuln(X86_BUG_SPECTRE_V2))
+ break;
+ fallthrough;
+ case SPECTRE_V2_CMD_FORCE:
+ if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) {
+ spectre_v2_enabled = SPECTRE_V2_EIBRS;
+ break;
+ }
+
+ spectre_v2_enabled = spectre_v2_select_retpoline();
+ break;
+
+ case SPECTRE_V2_CMD_RETPOLINE_LFENCE:
+ pr_err(SPECTRE_V2_LFENCE_MSG);
+ spectre_v2_enabled = SPECTRE_V2_LFENCE;
+ break;
+
+ case SPECTRE_V2_CMD_RETPOLINE_GENERIC:
+ spectre_v2_enabled = SPECTRE_V2_RETPOLINE;
+ break;
+
+ case SPECTRE_V2_CMD_RETPOLINE:
+ spectre_v2_enabled = spectre_v2_select_retpoline();
+ break;
+
+ case SPECTRE_V2_CMD_IBRS:
+ spectre_v2_enabled = SPECTRE_V2_IBRS;
+ break;
+
+ case SPECTRE_V2_CMD_EIBRS:
+ spectre_v2_enabled = SPECTRE_V2_EIBRS;
+ break;
+
+ case SPECTRE_V2_CMD_EIBRS_LFENCE:
+ spectre_v2_enabled = SPECTRE_V2_EIBRS_LFENCE;
+ break;
+
+ case SPECTRE_V2_CMD_EIBRS_RETPOLINE:
+ spectre_v2_enabled = SPECTRE_V2_EIBRS_RETPOLINE;
+ break;
+ }
+}
+
+static void __init spectre_v2_update_mitigation(void)
{
-#ifndef CONFIG_X86_POPAD_OK
- int res, inp = (int) &res;
+ if (spectre_v2_cmd == SPECTRE_V2_CMD_AUTO &&
+ !spectre_v2_in_eibrs_mode(spectre_v2_enabled)) {
+ if (IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY) &&
+ boot_cpu_has_bug(X86_BUG_RETBLEED) &&
+ retbleed_mitigation != RETBLEED_MITIGATION_NONE &&
+ retbleed_mitigation != RETBLEED_MITIGATION_STUFF &&
+ boot_cpu_has(X86_FEATURE_IBRS) &&
+ boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
+ spectre_v2_enabled = SPECTRE_V2_IBRS;
+ }
+ }
+
+ if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
+ pr_info("%s\n", spectre_v2_strings[spectre_v2_enabled]);
+}
+
+static void __init spectre_v2_apply_mitigation(void)
+{
+ if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled())
+ pr_err(SPECTRE_V2_EIBRS_EBPF_MSG);
+
+ if (spectre_v2_in_ibrs_mode(spectre_v2_enabled)) {
+ if (boot_cpu_has(X86_FEATURE_AUTOIBRS)) {
+ msr_set_bit(MSR_EFER, _EFER_AUTOIBRS);
+ } else {
+ x86_spec_ctrl_base |= SPEC_CTRL_IBRS;
+ update_spec_ctrl(x86_spec_ctrl_base);
+ }
+ }
+
+ switch (spectre_v2_enabled) {
+ case SPECTRE_V2_NONE:
+ return;
+
+ case SPECTRE_V2_EIBRS:
+ break;
+
+ case SPECTRE_V2_IBRS:
+ setup_force_cpu_cap(X86_FEATURE_KERNEL_IBRS);
+ if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED))
+ pr_warn(SPECTRE_V2_IBRS_PERF_MSG);
+ break;
+
+ case SPECTRE_V2_LFENCE:
+ case SPECTRE_V2_EIBRS_LFENCE:
+ setup_force_cpu_cap(X86_FEATURE_RETPOLINE_LFENCE);
+ fallthrough;
+
+ case SPECTRE_V2_RETPOLINE:
+ case SPECTRE_V2_EIBRS_RETPOLINE:
+ setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
+ break;
+ }
- printk(KERN_INFO "Checking for popad bug... ");
- __asm__ __volatile__(
- "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx "
- : "=&a" (res)
- : "d" (inp)
- : "ecx", "edi");
/*
- * If this fails, it means that any user program may lock the
- * CPU hard. Too bad.
+	 * Disable alternate RSB predictions in the kernel when indirect CALLs
+	 * and JMPs get protection against BHI and Intramode-BTI, but RET
+	 * prediction from a non-RSB predictor is still a risk.
*/
- if (res != 12345678)
- printk("Buggy.\n");
+ if (spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE ||
+ spectre_v2_enabled == SPECTRE_V2_EIBRS_RETPOLINE ||
+ spectre_v2_enabled == SPECTRE_V2_RETPOLINE)
+ spec_ctrl_disable_kernel_rrsba();
+
+ spectre_v2_select_rsb_mitigation(spectre_v2_enabled);
+
+ /*
+ * Retpoline protects the kernel, but doesn't protect firmware. IBRS
+ * and Enhanced IBRS protect firmware too, so enable IBRS around
+ * firmware calls only when IBRS / Enhanced / Automatic IBRS aren't
+ * otherwise enabled.
+ *
+ * Use "spectre_v2_enabled" to check Enhanced IBRS instead of
+ * boot_cpu_has(), because the user might select retpoline on the kernel
+	 * command line, and if the CPU supports Enhanced IBRS, the kernel might
+	 * unintentionally not enable IBRS around firmware calls.
+ */
+ if (boot_cpu_has_bug(X86_BUG_RETBLEED) &&
+ boot_cpu_has(X86_FEATURE_IBPB) &&
+ (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
+ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)) {
+
+ if (retbleed_mitigation != RETBLEED_MITIGATION_IBPB) {
+ setup_force_cpu_cap(X86_FEATURE_USE_IBPB_FW);
+ pr_info("Enabling Speculation Barrier for firmware calls\n");
+ }
+
+ } else if (boot_cpu_has(X86_FEATURE_IBRS) &&
+ !spectre_v2_in_ibrs_mode(spectre_v2_enabled)) {
+ setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW);
+ pr_info("Enabling Restricted Speculation for firmware calls\n");
+ }
+}
+
+static void update_stibp_msr(void * __unused)
+{
+ u64 val = spec_ctrl_current() | (x86_spec_ctrl_base & SPEC_CTRL_STIBP);
+ update_spec_ctrl(val);
+}
+
+/* Update x86_spec_ctrl_base in case SMT state changed. */
+static void update_stibp_strict(void)
+{
+ u64 mask = x86_spec_ctrl_base & ~SPEC_CTRL_STIBP;
+
+ if (sched_smt_active())
+ mask |= SPEC_CTRL_STIBP;
+
+ if (mask == x86_spec_ctrl_base)
+ return;
+
+ pr_info("Update user space SMT mitigation: STIBP %s\n",
+ mask & SPEC_CTRL_STIBP ? "always-on" : "off");
+ x86_spec_ctrl_base = mask;
+ on_each_cpu(update_stibp_msr, NULL, 1);
+}
+
+/* Update the static key controlling the evaluation of TIF_SPEC_IB */
+static void update_indir_branch_cond(void)
+{
+ if (sched_smt_active())
+ static_branch_enable(&switch_to_cond_stibp);
else
- printk("OK.\n");
+ static_branch_disable(&switch_to_cond_stibp);
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) fmt
+
+/* Update the static key controlling the MDS CPU buffer clear in idle */
+static void update_mds_branch_idle(void)
+{
+ /*
+ * Enable the idle clearing if SMT is active on CPUs which are
+ * affected only by MSBDS and not any other MDS variant.
+ *
+ * The other variants cannot be mitigated when SMT is enabled, so
+ * clearing the buffers on idle just to prevent the Store Buffer
+ * repartitioning leak would be a window dressing exercise.
+ */
+ if (!boot_cpu_has_bug(X86_BUG_MSBDS_ONLY))
+ return;
+
+ if (sched_smt_active()) {
+ static_branch_enable(&cpu_buf_idle_clear);
+ } else if (mmio_mitigation == MMIO_MITIGATION_OFF ||
+ (x86_arch_cap_msr & ARCH_CAP_FBSDP_NO)) {
+ static_branch_disable(&cpu_buf_idle_clear);
+ }
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) "Speculative Store Bypass: " fmt
+
+static enum ssb_mitigation ssb_mode __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_SSB) ? SPEC_STORE_BYPASS_AUTO : SPEC_STORE_BYPASS_NONE;
+
+static const char * const ssb_strings[] = {
+ [SPEC_STORE_BYPASS_NONE] = "Vulnerable",
+ [SPEC_STORE_BYPASS_DISABLE] = "Mitigation: Speculative Store Bypass disabled",
+ [SPEC_STORE_BYPASS_PRCTL] = "Mitigation: Speculative Store Bypass disabled via prctl",
+ [SPEC_STORE_BYPASS_SECCOMP] = "Mitigation: Speculative Store Bypass disabled via prctl and seccomp",
+};
+
+static bool nossb __ro_after_init;
+
+static int __init nossb_parse_cmdline(char *str)
+{
+ nossb = true;
+ ssb_mode = SPEC_STORE_BYPASS_NONE;
+ return 0;
+}
+early_param("nospec_store_bypass_disable", nossb_parse_cmdline);
+
+static int __init ssb_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (nossb)
+ return 0;
+
+ if (!strcmp(str, "auto"))
+ ssb_mode = SPEC_STORE_BYPASS_AUTO;
+ else if (!strcmp(str, "on"))
+ ssb_mode = SPEC_STORE_BYPASS_DISABLE;
+ else if (!strcmp(str, "off"))
+ ssb_mode = SPEC_STORE_BYPASS_NONE;
+ else if (!strcmp(str, "prctl"))
+ ssb_mode = SPEC_STORE_BYPASS_PRCTL;
+ else if (!strcmp(str, "seccomp"))
+ ssb_mode = IS_ENABLED(CONFIG_SECCOMP) ?
+ SPEC_STORE_BYPASS_SECCOMP : SPEC_STORE_BYPASS_PRCTL;
+ else
+ pr_err("Ignoring unknown spec_store_bypass_disable option (%s).\n",
+ str);
+
+ return 0;
+}
+early_param("spec_store_bypass_disable", ssb_parse_cmdline);
+
+static void __init ssb_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) {
+ ssb_mode = SPEC_STORE_BYPASS_NONE;
+ return;
+ }
+
+ if (ssb_mode == SPEC_STORE_BYPASS_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_SPEC_STORE_BYPASS))
+ ssb_mode = SPEC_STORE_BYPASS_PRCTL;
+ else
+ ssb_mode = SPEC_STORE_BYPASS_NONE;
+ }
+
+ if (!boot_cpu_has(X86_FEATURE_SSBD))
+ ssb_mode = SPEC_STORE_BYPASS_NONE;
+
+ pr_info("%s\n", ssb_strings[ssb_mode]);
+}
+
+static void __init ssb_apply_mitigation(void)
+{
+ /*
+ * We have three CPU feature flags that are in play here:
+ * - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible.
+ * - X86_FEATURE_SSBD - CPU is able to turn off speculative store bypass
+ * - X86_FEATURE_SPEC_STORE_BYPASS_DISABLE - engage the mitigation
+ */
+ if (ssb_mode == SPEC_STORE_BYPASS_DISABLE) {
+ setup_force_cpu_cap(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE);
+ /*
+ * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD may
+ * use a completely different MSR and bit dependent on family.
+ */
+ if (!static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) &&
+ !static_cpu_has(X86_FEATURE_AMD_SSBD)) {
+ x86_amd_ssb_disable();
+ } else {
+ x86_spec_ctrl_base |= SPEC_CTRL_SSBD;
+ update_spec_ctrl(x86_spec_ctrl_base);
+ }
+ }
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) "Speculation prctl: " fmt
+
+static void task_update_spec_tif(struct task_struct *tsk)
+{
+ /* Force the update of the real TIF bits */
+ set_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE);
+
+ /*
+ * Immediately update the speculation control MSRs for the current
+ * task, but for a non-current task delay setting the CPU
+ * mitigation until it is scheduled next.
+ *
+ * This can only happen for SECCOMP mitigation. For PRCTL it's
+ * always the current task.
+ */
+ if (tsk == current)
+ speculation_ctrl_update_current();
+}
+
+static int l1d_flush_prctl_set(struct task_struct *task, unsigned long ctrl)
+{
+
+ if (!static_branch_unlikely(&switch_mm_cond_l1d_flush))
+ return -EPERM;
+
+ switch (ctrl) {
+ case PR_SPEC_ENABLE:
+ set_ti_thread_flag(&task->thread_info, TIF_SPEC_L1D_FLUSH);
+ return 0;
+ case PR_SPEC_DISABLE:
+ clear_ti_thread_flag(&task->thread_info, TIF_SPEC_L1D_FLUSH);
+ return 0;
+ default:
+ return -ERANGE;
+ }
+}
+
+static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl)
+{
+ if (ssb_mode != SPEC_STORE_BYPASS_PRCTL &&
+ ssb_mode != SPEC_STORE_BYPASS_SECCOMP)
+ return -ENXIO;
+
+ switch (ctrl) {
+ case PR_SPEC_ENABLE:
+ /* If speculation is force disabled, enable is not allowed */
+ if (task_spec_ssb_force_disable(task))
+ return -EPERM;
+ task_clear_spec_ssb_disable(task);
+ task_clear_spec_ssb_noexec(task);
+ task_update_spec_tif(task);
+ break;
+ case PR_SPEC_DISABLE:
+ task_set_spec_ssb_disable(task);
+ task_clear_spec_ssb_noexec(task);
+ task_update_spec_tif(task);
+ break;
+ case PR_SPEC_FORCE_DISABLE:
+ task_set_spec_ssb_disable(task);
+ task_set_spec_ssb_force_disable(task);
+ task_clear_spec_ssb_noexec(task);
+ task_update_spec_tif(task);
+ break;
+ case PR_SPEC_DISABLE_NOEXEC:
+ if (task_spec_ssb_force_disable(task))
+ return -EPERM;
+ task_set_spec_ssb_disable(task);
+ task_set_spec_ssb_noexec(task);
+ task_update_spec_tif(task);
+ break;
+ default:
+ return -ERANGE;
+ }
+ return 0;
+}
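+
+/*
+ * Minimal userspace sketch (illustrative, assuming the standard prctl(2)
+ * speculation-control interface): a task opting itself out of speculative
+ * store bypass would do
+ *
+ *	#include <sys/prctl.h>
+ *
+ *	prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS,
+ *	      PR_SPEC_DISABLE, 0, 0);
+ *
+ * which reaches ssb_prctl_set() above via arch_prctl_spec_ctrl_set().
+ */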
+
+static bool is_spec_ib_user_controlled(void)
+{
+ return spectre_v2_user_ibpb == SPECTRE_V2_USER_PRCTL ||
+ spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP;
+}
+
+static int ib_prctl_set(struct task_struct *task, unsigned long ctrl)
+{
+ switch (ctrl) {
+ case PR_SPEC_ENABLE:
+ if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE &&
+ spectre_v2_user_stibp == SPECTRE_V2_USER_NONE)
+ return 0;
+
+ /*
+ * With strict mode for both IBPB and STIBP, the instruction
+		 * code paths avoid checking this task flag and instead
+		 * unconditionally run the instruction. However, STIBP and IBPB
+ * are independent and either can be set to conditionally
+ * enabled regardless of the mode of the other.
+ *
+ * If either is set to conditional, allow the task flag to be
+ * updated, unless it was force-disabled by a previous prctl
+ * call. Currently, this is possible on an AMD CPU which has the
+ * feature X86_FEATURE_AMD_STIBP_ALWAYS_ON. In this case, if the
+ * kernel is booted with 'spectre_v2_user=seccomp', then
+ * spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP and
+ * spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED.
+ */
+ if (!is_spec_ib_user_controlled() ||
+ task_spec_ib_force_disable(task))
+ return -EPERM;
+
+ task_clear_spec_ib_disable(task);
+ task_update_spec_tif(task);
+ break;
+ case PR_SPEC_DISABLE:
+ case PR_SPEC_FORCE_DISABLE:
+ /*
+ * Indirect branch speculation is always allowed when
+ * mitigation is force disabled.
+ */
+ if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE &&
+ spectre_v2_user_stibp == SPECTRE_V2_USER_NONE)
+ return -EPERM;
+
+ if (!is_spec_ib_user_controlled())
+ return 0;
+
+ task_set_spec_ib_disable(task);
+ if (ctrl == PR_SPEC_FORCE_DISABLE)
+ task_set_spec_ib_force_disable(task);
+ task_update_spec_tif(task);
+ if (task == current)
+ indirect_branch_prediction_barrier();
+ break;
+ default:
+ return -ERANGE;
+ }
+ return 0;
+}
+
+int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which,
+ unsigned long ctrl)
+{
+ switch (which) {
+ case PR_SPEC_STORE_BYPASS:
+ return ssb_prctl_set(task, ctrl);
+ case PR_SPEC_INDIRECT_BRANCH:
+ return ib_prctl_set(task, ctrl);
+ case PR_SPEC_L1D_FLUSH:
+ return l1d_flush_prctl_set(task, ctrl);
+ default:
+ return -ENODEV;
+ }
+}
+
+#ifdef CONFIG_SECCOMP
+void arch_seccomp_spec_mitigate(struct task_struct *task)
+{
+ if (ssb_mode == SPEC_STORE_BYPASS_SECCOMP)
+ ssb_prctl_set(task, PR_SPEC_FORCE_DISABLE);
+ if (spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP)
+ ib_prctl_set(task, PR_SPEC_FORCE_DISABLE);
+}
#endif
+
+static int l1d_flush_prctl_get(struct task_struct *task)
+{
+ if (!static_branch_unlikely(&switch_mm_cond_l1d_flush))
+ return PR_SPEC_FORCE_DISABLE;
+
+ if (test_ti_thread_flag(&task->thread_info, TIF_SPEC_L1D_FLUSH))
+ return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
+ else
+ return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
+}
+
+static int ssb_prctl_get(struct task_struct *task)
+{
+ switch (ssb_mode) {
+ case SPEC_STORE_BYPASS_NONE:
+ if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
+ return PR_SPEC_ENABLE;
+ return PR_SPEC_NOT_AFFECTED;
+ case SPEC_STORE_BYPASS_DISABLE:
+ return PR_SPEC_DISABLE;
+ case SPEC_STORE_BYPASS_SECCOMP:
+ case SPEC_STORE_BYPASS_PRCTL:
+ case SPEC_STORE_BYPASS_AUTO:
+ if (task_spec_ssb_force_disable(task))
+ return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;
+ if (task_spec_ssb_noexec(task))
+ return PR_SPEC_PRCTL | PR_SPEC_DISABLE_NOEXEC;
+ if (task_spec_ssb_disable(task))
+ return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
+ return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
+ }
+ BUG();
}
+static int ib_prctl_get(struct task_struct *task)
+{
+ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
+ return PR_SPEC_NOT_AFFECTED;
+
+ if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE &&
+ spectre_v2_user_stibp == SPECTRE_V2_USER_NONE)
+ return PR_SPEC_ENABLE;
+ else if (is_spec_ib_user_controlled()) {
+ if (task_spec_ib_force_disable(task))
+ return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;
+ if (task_spec_ib_disable(task))
+ return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
+ return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
+ } else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED)
+ return PR_SPEC_DISABLE;
+ else
+ return PR_SPEC_NOT_AFFECTED;
+}
+
+int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
+{
+ switch (which) {
+ case PR_SPEC_STORE_BYPASS:
+ return ssb_prctl_get(task);
+ case PR_SPEC_INDIRECT_BRANCH:
+ return ib_prctl_get(task);
+ case PR_SPEC_L1D_FLUSH:
+ return l1d_flush_prctl_get(task);
+ default:
+ return -ENODEV;
+ }
+}
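+
+/*
+ * Illustrative query from userspace (again assuming the standard prctl(2)
+ * interface):
+ *
+ *	int state = prctl(PR_GET_SPECULATION_CTRL,
+ *			  PR_SPEC_INDIRECT_BRANCH, 0, 0, 0);
+ *
+ * The return value is the bitmask assembled by ib_prctl_get() above,
+ * e.g. PR_SPEC_PRCTL | PR_SPEC_ENABLE.
+ */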
+
+void x86_spec_ctrl_setup_ap(void)
+{
+ if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL))
+ update_spec_ctrl(x86_spec_ctrl_base);
+
+ if (ssb_mode == SPEC_STORE_BYPASS_DISABLE)
+ x86_amd_ssb_disable();
+}
+
+bool itlb_multihit_kvm_mitigation;
+EXPORT_SYMBOL_FOR_KVM(itlb_multihit_kvm_mitigation);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "L1TF: " fmt
+
+/* Default mitigation for L1TF-affected CPUs */
+enum l1tf_mitigations l1tf_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_L1TF) ? L1TF_MITIGATION_AUTO : L1TF_MITIGATION_OFF;
+EXPORT_SYMBOL_FOR_KVM(l1tf_mitigation);
+enum vmx_l1d_flush_state l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
+EXPORT_SYMBOL_FOR_KVM(l1tf_vmx_mitigation);
+
/*
- * Check whether we are able to run this kernel safely on SMP.
+ * These CPUs all support 44 bits of physical address space internally in
+ * the cache, but CPUID can report a smaller number of physical address bits.
*
- * - In order to run on a i386, we need to be compiled for i386
- * (for due to lack of "invlpg" and working WP on a i386)
- * - In order to run on anything without a TSC, we need to be
- * compiled for a i486.
+ * The L1TF mitigation uses the topmost address bit for the inversion of
+ * non-present PTEs. When the installed memory reaches into the topmost
+ * address bit due to memory holes, which has been observed on machines
+ * that report 36 physical address bits and have 32G of RAM installed,
+ * the mitigation range check in l1tf_select_mitigation() triggers. This
+ * is a false positive because the mitigation is still possible, as the
+ * cache uses 44 bits internally. Use the cache bits instead of the
+ * reported physical bits and adjust them on the affected machines to 44
+ * bits if the reported bits are less than 44.
*/
+static void override_cache_bits(struct cpuinfo_x86 *c)
+{
+ if (c->x86 != 6)
+ return;
-static void __init check_config(void)
+ switch (c->x86_vfm) {
+ case INTEL_NEHALEM:
+ case INTEL_WESTMERE:
+ case INTEL_SANDYBRIDGE:
+ case INTEL_IVYBRIDGE:
+ case INTEL_HASWELL:
+ case INTEL_HASWELL_L:
+ case INTEL_HASWELL_G:
+ case INTEL_BROADWELL:
+ case INTEL_BROADWELL_G:
+ case INTEL_SKYLAKE_L:
+ case INTEL_SKYLAKE:
+ case INTEL_KABYLAKE_L:
+ case INTEL_KABYLAKE:
+ if (c->x86_cache_bits < 44)
+ c->x86_cache_bits = 44;
+ break;
+ }
+}
+
+static void __init l1tf_select_mitigation(void)
{
-/*
- * We'd better not be a i386 if we're configured to use some
- * i486+ only features! (WP works in supervisor mode and the
- * new "invlpg" and "bswap" instructions)
- */
-#if defined(CONFIG_X86_WP_WORKS_OK) || defined(CONFIG_X86_INVLPG) || \
- defined(CONFIG_X86_BSWAP)
- if (boot_cpu_data.x86 == 3)
- panic("Kernel requires i486+ for 'invlpg' and other features");
+ if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
+ l1tf_mitigation = L1TF_MITIGATION_OFF;
+ return;
+ }
+
+ if (l1tf_mitigation != L1TF_MITIGATION_AUTO)
+ return;
+
+ if (!should_mitigate_vuln(X86_BUG_L1TF)) {
+ l1tf_mitigation = L1TF_MITIGATION_OFF;
+ return;
+ }
+
+ if (smt_mitigations == SMT_MITIGATIONS_ON)
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT;
+ else
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH;
+}
+
+static void __init l1tf_apply_mitigation(void)
+{
+ u64 half_pa;
+
+ if (!boot_cpu_has_bug(X86_BUG_L1TF))
+ return;
+
+ override_cache_bits(&boot_cpu_data);
+
+ switch (l1tf_mitigation) {
+ case L1TF_MITIGATION_OFF:
+ case L1TF_MITIGATION_FLUSH_NOWARN:
+ case L1TF_MITIGATION_FLUSH:
+ case L1TF_MITIGATION_AUTO:
+ break;
+ case L1TF_MITIGATION_FLUSH_NOSMT:
+ case L1TF_MITIGATION_FULL:
+ cpu_smt_disable(false);
+ break;
+ case L1TF_MITIGATION_FULL_FORCE:
+ cpu_smt_disable(true);
+ break;
+ }
+
+#if CONFIG_PGTABLE_LEVELS == 2
+ pr_warn("Kernel not compiled for PAE. No mitigation for L1TF\n");
+ return;
#endif
+
+ half_pa = (u64)l1tf_pfn_limit() << PAGE_SHIFT;
+ if (l1tf_mitigation != L1TF_MITIGATION_OFF &&
+ e820__mapped_any(half_pa, ULLONG_MAX - half_pa, E820_TYPE_RAM)) {
+ pr_warn("System has more than MAX_PA/2 memory. L1TF mitigation not effective.\n");
+ pr_info("You may make it effective by booting the kernel with mem=%llu parameter.\n",
+ half_pa);
+ pr_info("However, doing so will make a part of your RAM unusable.\n");
+ pr_info("Reading https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html might help you decide.\n");
+ return;
+ }
+
+ setup_force_cpu_cap(X86_FEATURE_L1TF_PTEINV);
+}
+
+static int __init l1tf_cmdline(char *str)
+{
+ if (!boot_cpu_has_bug(X86_BUG_L1TF))
+ return 0;
+
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off"))
+ l1tf_mitigation = L1TF_MITIGATION_OFF;
+ else if (!strcmp(str, "flush,nowarn"))
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOWARN;
+ else if (!strcmp(str, "flush"))
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH;
+ else if (!strcmp(str, "flush,nosmt"))
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT;
+ else if (!strcmp(str, "full"))
+ l1tf_mitigation = L1TF_MITIGATION_FULL;
+ else if (!strcmp(str, "full,force"))
+ l1tf_mitigation = L1TF_MITIGATION_FULL_FORCE;
+
+ return 0;
}
+early_param("l1tf", l1tf_cmdline);
+#undef pr_fmt
+#define pr_fmt(fmt) "Speculative Return Stack Overflow: " fmt
-void __init check_bugs(void)
+static const char * const srso_strings[] = {
+ [SRSO_MITIGATION_NONE] = "Vulnerable",
+ [SRSO_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode",
+ [SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED] = "Vulnerable: Safe RET, no microcode",
+ [SRSO_MITIGATION_MICROCODE] = "Vulnerable: Microcode, no safe RET",
+ [SRSO_MITIGATION_NOSMT] = "Mitigation: SMT disabled",
+ [SRSO_MITIGATION_SAFE_RET] = "Mitigation: Safe RET",
+ [SRSO_MITIGATION_IBPB] = "Mitigation: IBPB",
+ [SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only",
+ [SRSO_MITIGATION_BP_SPEC_REDUCE] = "Mitigation: Reduced Speculation"
+};
+
+static int __init srso_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off"))
+ srso_mitigation = SRSO_MITIGATION_NONE;
+ else if (!strcmp(str, "microcode"))
+ srso_mitigation = SRSO_MITIGATION_MICROCODE;
+ else if (!strcmp(str, "safe-ret"))
+ srso_mitigation = SRSO_MITIGATION_SAFE_RET;
+ else if (!strcmp(str, "ibpb"))
+ srso_mitigation = SRSO_MITIGATION_IBPB;
+ else if (!strcmp(str, "ibpb-vmexit"))
+ srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT;
+ else
+ pr_err("Ignoring unknown SRSO option (%s).", str);
+
+ return 0;
+}
+early_param("spec_rstack_overflow", srso_parse_cmdline);
+
+#define SRSO_NOTICE "WARNING: See https://kernel.org/doc/html/latest/admin-guide/hw-vuln/srso.html for mitigation options."
+
+static void __init srso_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_SRSO)) {
+ srso_mitigation = SRSO_MITIGATION_NONE;
+ return;
+ }
+
+ if (srso_mitigation == SRSO_MITIGATION_AUTO) {
+ /*
+ * Use safe-RET if user->kernel or guest->host protection is
+ * required. Otherwise the 'microcode' mitigation is sufficient
+ * to protect the user->user and guest->guest vectors.
+ */
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST) ||
+ (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) &&
+ !boot_cpu_has(X86_FEATURE_SRSO_USER_KERNEL_NO))) {
+ srso_mitigation = SRSO_MITIGATION_SAFE_RET;
+ } else if (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST)) {
+ srso_mitigation = SRSO_MITIGATION_MICROCODE;
+ } else {
+ srso_mitigation = SRSO_MITIGATION_NONE;
+ return;
+ }
+ }
+
+ /* Zen1/2 with SMT off aren't vulnerable to SRSO. */
+ if (boot_cpu_data.x86 < 0x19 && !cpu_smt_possible()) {
+ srso_mitigation = SRSO_MITIGATION_NOSMT;
+ return;
+ }
+
+ if (!boot_cpu_has(X86_FEATURE_IBPB_BRTYPE)) {
+ pr_warn("IBPB-extending microcode not applied!\n");
+ pr_warn(SRSO_NOTICE);
+
+ /*
+		 * Safe-RET provides partial mitigation without microcode, but
+		 * the other mitigations require microcode to provide any
+		 * protection at all.
+ */
+ if (srso_mitigation == SRSO_MITIGATION_SAFE_RET)
+ srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED;
+ else
+ srso_mitigation = SRSO_MITIGATION_UCODE_NEEDED;
+ }
+
+ switch (srso_mitigation) {
+ case SRSO_MITIGATION_SAFE_RET:
+ case SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED:
+ if (boot_cpu_has(X86_FEATURE_SRSO_USER_KERNEL_NO)) {
+ srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT;
+ goto ibpb_on_vmexit;
+ }
+
+ if (!IS_ENABLED(CONFIG_MITIGATION_SRSO)) {
+ pr_err("WARNING: kernel not compiled with MITIGATION_SRSO.\n");
+ srso_mitigation = SRSO_MITIGATION_NONE;
+ }
+ break;
+ibpb_on_vmexit:
+ case SRSO_MITIGATION_IBPB_ON_VMEXIT:
+ if (boot_cpu_has(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) {
+ pr_notice("Reducing speculation to address VM/HV SRSO attack vector.\n");
+ srso_mitigation = SRSO_MITIGATION_BP_SPEC_REDUCE;
+ break;
+ }
+ fallthrough;
+ case SRSO_MITIGATION_IBPB:
+ if (!IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) {
+ pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n");
+ srso_mitigation = SRSO_MITIGATION_NONE;
+ }
+ break;
+ default:
+ break;
+ }
+}
+
+static void __init srso_update_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_SRSO))
+ return;
+
+ /* If retbleed is using IBPB, that works for SRSO as well */
+ if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB &&
+ boot_cpu_has(X86_FEATURE_IBPB_BRTYPE))
+ srso_mitigation = SRSO_MITIGATION_IBPB;
+
+ pr_info("%s\n", srso_strings[srso_mitigation]);
+}
+
+static void __init srso_apply_mitigation(void)
+{
+ /*
+ * Clear the feature flag if this mitigation is not selected as that
+ * feature flag controls the BpSpecReduce MSR bit toggling in KVM.
+ */
+ if (srso_mitigation != SRSO_MITIGATION_BP_SPEC_REDUCE)
+ setup_clear_cpu_cap(X86_FEATURE_SRSO_BP_SPEC_REDUCE);
+
+ if (srso_mitigation == SRSO_MITIGATION_NONE) {
+ if (boot_cpu_has(X86_FEATURE_SBPB))
+ x86_pred_cmd = PRED_CMD_SBPB;
+ return;
+ }
+
+ switch (srso_mitigation) {
+ case SRSO_MITIGATION_SAFE_RET:
+ case SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED:
+ /*
+ * Enable the return thunk for generated code
+ * like ftrace, static_call, etc.
+ */
+ setup_force_cpu_cap(X86_FEATURE_RETHUNK);
+ setup_force_cpu_cap(X86_FEATURE_UNRET);
+
+ if (boot_cpu_data.x86 == 0x19) {
+ setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS);
+ set_return_thunk(srso_alias_return_thunk);
+ } else {
+ setup_force_cpu_cap(X86_FEATURE_SRSO);
+ set_return_thunk(srso_return_thunk);
+ }
+ break;
+ case SRSO_MITIGATION_IBPB:
+ setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB);
+ /*
+ * IBPB on entry already obviates the need for
+ * software-based untraining so clear those in case some
+ * other mitigation like Retbleed has selected them.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_UNRET);
+ setup_clear_cpu_cap(X86_FEATURE_RETHUNK);
+ fallthrough;
+ case SRSO_MITIGATION_IBPB_ON_VMEXIT:
+ setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT);
+ /*
+ * There is no need for RSB filling: entry_ibpb() ensures
+ * all predictions, including the RSB, are invalidated,
+ * regardless of IBPB implementation.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT);
+ break;
+ default:
+ break;
+ }
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) "VMSCAPE: " fmt
+
+enum vmscape_mitigations {
+ VMSCAPE_MITIGATION_NONE,
+ VMSCAPE_MITIGATION_AUTO,
+ VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER,
+ VMSCAPE_MITIGATION_IBPB_ON_VMEXIT,
+};
+
+static const char * const vmscape_strings[] = {
+ [VMSCAPE_MITIGATION_NONE] = "Vulnerable",
+ /* [VMSCAPE_MITIGATION_AUTO] */
+ [VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER] = "Mitigation: IBPB before exit to userspace",
+ [VMSCAPE_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT",
+};
+
+static enum vmscape_mitigations vmscape_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_VMSCAPE) ? VMSCAPE_MITIGATION_AUTO : VMSCAPE_MITIGATION_NONE;
+
+static int __init vmscape_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off")) {
+ vmscape_mitigation = VMSCAPE_MITIGATION_NONE;
+ } else if (!strcmp(str, "ibpb")) {
+ vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER;
+ } else if (!strcmp(str, "force")) {
+ setup_force_cpu_bug(X86_BUG_VMSCAPE);
+ vmscape_mitigation = VMSCAPE_MITIGATION_AUTO;
+ } else {
+ pr_err("Ignoring unknown vmscape=%s option.\n", str);
+ }
+
+ return 0;
+}
+early_param("vmscape", vmscape_parse_cmdline);
+
+static void __init vmscape_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_VMSCAPE) ||
+ !boot_cpu_has(X86_FEATURE_IBPB)) {
+ vmscape_mitigation = VMSCAPE_MITIGATION_NONE;
+ return;
+ }
+
+ if (vmscape_mitigation == VMSCAPE_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_VMSCAPE))
+ vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER;
+ else
+ vmscape_mitigation = VMSCAPE_MITIGATION_NONE;
+ }
+}
+
+static void __init vmscape_update_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_VMSCAPE))
+ return;
+
+ if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB ||
+ srso_mitigation == SRSO_MITIGATION_IBPB_ON_VMEXIT)
+ vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_ON_VMEXIT;
+
+ pr_info("%s\n", vmscape_strings[vmscape_mitigation]);
+}
+
+static void __init vmscape_apply_mitigation(void)
+{
+ if (vmscape_mitigation == VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER)
+ setup_force_cpu_cap(X86_FEATURE_IBPB_EXIT_TO_USER);
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) fmt
+
+#define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n"
+#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n"
+#define MMIO_MSG_SMT "MMIO Stale Data CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/processor_mmio_stale_data.html for more details.\n"
+#define VMSCAPE_MSG_SMT "VMSCAPE: SMT on, STIBP is required for full protection. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/vmscape.html for more details.\n"
+
+void cpu_bugs_smt_update(void)
+{
+ mutex_lock(&spec_ctrl_mutex);
+
+ if (sched_smt_active() && unprivileged_ebpf_enabled() &&
+ spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE)
+ pr_warn_once(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG);
+
+ switch (spectre_v2_user_stibp) {
+ case SPECTRE_V2_USER_NONE:
+ break;
+ case SPECTRE_V2_USER_STRICT:
+ case SPECTRE_V2_USER_STRICT_PREFERRED:
+ update_stibp_strict();
+ break;
+ case SPECTRE_V2_USER_PRCTL:
+ case SPECTRE_V2_USER_SECCOMP:
+ update_indir_branch_cond();
+ break;
+ }
+
+ switch (mds_mitigation) {
+ case MDS_MITIGATION_FULL:
+ case MDS_MITIGATION_AUTO:
+ case MDS_MITIGATION_VMWERV:
+ if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY))
+ pr_warn_once(MDS_MSG_SMT);
+ update_mds_branch_idle();
+ break;
+ case MDS_MITIGATION_OFF:
+ break;
+ }
+
+ switch (taa_mitigation) {
+ case TAA_MITIGATION_VERW:
+ case TAA_MITIGATION_AUTO:
+ case TAA_MITIGATION_UCODE_NEEDED:
+ if (sched_smt_active())
+ pr_warn_once(TAA_MSG_SMT);
+ break;
+ case TAA_MITIGATION_TSX_DISABLED:
+ case TAA_MITIGATION_OFF:
+ break;
+ }
+
+ switch (mmio_mitigation) {
+ case MMIO_MITIGATION_VERW:
+ case MMIO_MITIGATION_AUTO:
+ case MMIO_MITIGATION_UCODE_NEEDED:
+ if (sched_smt_active())
+ pr_warn_once(MMIO_MSG_SMT);
+ break;
+ case MMIO_MITIGATION_OFF:
+ break;
+ }
+
+ switch (tsa_mitigation) {
+ case TSA_MITIGATION_USER_KERNEL:
+ case TSA_MITIGATION_VM:
+ case TSA_MITIGATION_AUTO:
+ case TSA_MITIGATION_FULL:
+ /*
+ * TSA-SQ can potentially lead to info leakage between
+ * SMT threads.
+ */
+ if (sched_smt_active())
+ static_branch_enable(&cpu_buf_idle_clear);
+ else
+ static_branch_disable(&cpu_buf_idle_clear);
+ break;
+ case TSA_MITIGATION_NONE:
+ case TSA_MITIGATION_UCODE_NEEDED:
+ break;
+ }
+
+ switch (vmscape_mitigation) {
+ case VMSCAPE_MITIGATION_NONE:
+ case VMSCAPE_MITIGATION_AUTO:
+ break;
+ case VMSCAPE_MITIGATION_IBPB_ON_VMEXIT:
+ case VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER:
+ /*
+		 * Hypervisors can be attacked across threads; warn for SMT when
+ * STIBP is not already enabled system-wide.
+ *
+ * Intel eIBRS (!AUTOIBRS) implies STIBP on.
+ */
+ if (!sched_smt_active() ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED ||
+ (spectre_v2_in_eibrs_mode(spectre_v2_enabled) &&
+ !boot_cpu_has(X86_FEATURE_AUTOIBRS)))
+ break;
+ pr_warn_once(VMSCAPE_MSG_SMT);
+ break;
+ }
+
+ mutex_unlock(&spec_ctrl_mutex);
+}
+
+void __init cpu_select_mitigations(void)
+{
+ /*
+ * Read the SPEC_CTRL MSR to account for reserved bits which may
+ * have unknown values. AMD64_LS_CFG MSR is cached in the early AMD
+ * init code as it is not enumerated and depends on the family.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) {
+ rdmsrq(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+
+ /*
+ * Previously running kernel (kexec), may have some controls
+ * turned ON. Clear them and let the mitigations setup below
+ * rediscover them based on configuration.
+ */
+ x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK;
+ }
+
+ x86_arch_cap_msr = x86_read_arch_cap_msr();
+
+ cpu_print_attack_vectors();
+
+ /* Select the proper CPU mitigations before patching alternatives: */
+ spectre_v1_select_mitigation();
+ spectre_v2_select_mitigation();
+ retbleed_select_mitigation();
+ spectre_v2_user_select_mitigation();
+ ssb_select_mitigation();
+ l1tf_select_mitigation();
+ mds_select_mitigation();
+ taa_select_mitigation();
+ mmio_select_mitigation();
+ rfds_select_mitigation();
+ srbds_select_mitigation();
+ l1d_flush_select_mitigation();
+ srso_select_mitigation();
+ gds_select_mitigation();
+ its_select_mitigation();
+ bhi_select_mitigation();
+ tsa_select_mitigation();
+ vmscape_select_mitigation();
+
+ /*
+ * After mitigations are selected, some may need to update their
+ * choices.
+ */
+ spectre_v2_update_mitigation();
+ /*
+ * retbleed_update_mitigation() relies on the state set by
+ * spectre_v2_update_mitigation(); specifically it wants to know about
+ * spectre_v2=ibrs.
+ */
+ retbleed_update_mitigation();
+ /*
+ * its_update_mitigation() depends on spectre_v2_update_mitigation()
+ * and retbleed_update_mitigation().
+ */
+ its_update_mitigation();
+
+ /*
+ * spectre_v2_user_update_mitigation() depends on
+ * retbleed_update_mitigation(), specifically the STIBP
+ * selection is forced for UNRET or IBPB.
+ */
+ spectre_v2_user_update_mitigation();
+ mds_update_mitigation();
+ taa_update_mitigation();
+ mmio_update_mitigation();
+ rfds_update_mitigation();
+ bhi_update_mitigation();
+ /* srso_update_mitigation() depends on retbleed_update_mitigation(). */
+ srso_update_mitigation();
+ vmscape_update_mitigation();
+
+ spectre_v1_apply_mitigation();
+ spectre_v2_apply_mitigation();
+ retbleed_apply_mitigation();
+ spectre_v2_user_apply_mitigation();
+ ssb_apply_mitigation();
+ l1tf_apply_mitigation();
+ mds_apply_mitigation();
+ taa_apply_mitigation();
+ mmio_apply_mitigation();
+ rfds_apply_mitigation();
+ srbds_apply_mitigation();
+ srso_apply_mitigation();
+ gds_apply_mitigation();
+ its_apply_mitigation();
+ bhi_apply_mitigation();
+ tsa_apply_mitigation();
+ vmscape_apply_mitigation();
+}
+
+#ifdef CONFIG_SYSFS
+
+#define L1TF_DEFAULT_MSG "Mitigation: PTE Inversion"
+
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+static const char * const l1tf_vmx_states[] = {
+ [VMENTER_L1D_FLUSH_AUTO] = "auto",
+ [VMENTER_L1D_FLUSH_NEVER] = "vulnerable",
+ [VMENTER_L1D_FLUSH_COND] = "conditional cache flushes",
+ [VMENTER_L1D_FLUSH_ALWAYS] = "cache flushes",
+ [VMENTER_L1D_FLUSH_EPT_DISABLED] = "EPT disabled",
+ [VMENTER_L1D_FLUSH_NOT_REQUIRED] = "flush not necessary"
+};
+
+static ssize_t l1tf_show_state(char *buf)
+{
+ if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO)
+ return sysfs_emit(buf, "%s\n", L1TF_DEFAULT_MSG);
+
+ if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_EPT_DISABLED ||
+ (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER &&
+ sched_smt_active())) {
+ return sysfs_emit(buf, "%s; VMX: %s\n", L1TF_DEFAULT_MSG,
+ l1tf_vmx_states[l1tf_vmx_mitigation]);
+ }
+
+ return sysfs_emit(buf, "%s; VMX: %s, SMT %s\n", L1TF_DEFAULT_MSG,
+ l1tf_vmx_states[l1tf_vmx_mitigation],
+ sched_smt_active() ? "vulnerable" : "disabled");
+}
+
+static ssize_t itlb_multihit_show_state(char *buf)
+{
+ if (!boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
+ !boot_cpu_has(X86_FEATURE_VMX))
+ return sysfs_emit(buf, "KVM: Mitigation: VMX unsupported\n");
+ else if (!(cr4_read_shadow() & X86_CR4_VMXE))
+ return sysfs_emit(buf, "KVM: Mitigation: VMX disabled\n");
+ else if (itlb_multihit_kvm_mitigation)
+ return sysfs_emit(buf, "KVM: Mitigation: Split huge pages\n");
+ else
+ return sysfs_emit(buf, "KVM: Vulnerable\n");
+}
+#else
+static ssize_t l1tf_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", L1TF_DEFAULT_MSG);
+}
+
+static ssize_t itlb_multihit_show_state(char *buf)
+{
+ return sysfs_emit(buf, "Processor vulnerable\n");
+}
+#endif
+
+static ssize_t mds_show_state(char *buf)
+{
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+ return sysfs_emit(buf, "%s; SMT Host state unknown\n",
+ mds_strings[mds_mitigation]);
+ }
+
+ if (boot_cpu_has(X86_BUG_MSBDS_ONLY)) {
+ return sysfs_emit(buf, "%s; SMT %s\n", mds_strings[mds_mitigation],
+ (mds_mitigation == MDS_MITIGATION_OFF ? "vulnerable" :
+ sched_smt_active() ? "mitigated" : "disabled"));
+ }
+
+ return sysfs_emit(buf, "%s; SMT %s\n", mds_strings[mds_mitigation],
+ sched_smt_active() ? "vulnerable" : "disabled");
+}
+
+static ssize_t tsx_async_abort_show_state(char *buf)
+{
+ if ((taa_mitigation == TAA_MITIGATION_TSX_DISABLED) ||
+ (taa_mitigation == TAA_MITIGATION_OFF))
+ return sysfs_emit(buf, "%s\n", taa_strings[taa_mitigation]);
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+ return sysfs_emit(buf, "%s; SMT Host state unknown\n",
+ taa_strings[taa_mitigation]);
+ }
+
+ return sysfs_emit(buf, "%s; SMT %s\n", taa_strings[taa_mitigation],
+ sched_smt_active() ? "vulnerable" : "disabled");
+}
+
+static ssize_t mmio_stale_data_show_state(char *buf)
{
- identify_boot_cpu();
-#ifndef CONFIG_SMP
- printk("CPU: ");
- print_cpu_info(&boot_cpu_data);
+ if (mmio_mitigation == MMIO_MITIGATION_OFF)
+ return sysfs_emit(buf, "%s\n", mmio_strings[mmio_mitigation]);
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+ return sysfs_emit(buf, "%s; SMT Host state unknown\n",
+ mmio_strings[mmio_mitigation]);
+ }
+
+ return sysfs_emit(buf, "%s; SMT %s\n", mmio_strings[mmio_mitigation],
+ sched_smt_active() ? "vulnerable" : "disabled");
+}
+
+static ssize_t rfds_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", rfds_strings[rfds_mitigation]);
+}
+
+static ssize_t old_microcode_show_state(char *buf)
+{
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+		return sysfs_emit(buf, "Unknown: running under hypervisor\n");
+
+ return sysfs_emit(buf, "Vulnerable\n");
+}
+
+static ssize_t its_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", its_strings[its_mitigation]);
+}
+
+static char *stibp_state(void)
+{
+ if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) &&
+ !boot_cpu_has(X86_FEATURE_AUTOIBRS))
+ return "";
+
+ switch (spectre_v2_user_stibp) {
+ case SPECTRE_V2_USER_NONE:
+ return "; STIBP: disabled";
+ case SPECTRE_V2_USER_STRICT:
+ return "; STIBP: forced";
+ case SPECTRE_V2_USER_STRICT_PREFERRED:
+ return "; STIBP: always-on";
+ case SPECTRE_V2_USER_PRCTL:
+ case SPECTRE_V2_USER_SECCOMP:
+ if (static_key_enabled(&switch_to_cond_stibp))
+ return "; STIBP: conditional";
+ }
+ return "";
+}
+
+static char *ibpb_state(void)
+{
+ if (boot_cpu_has(X86_FEATURE_IBPB)) {
+ if (static_key_enabled(&switch_mm_always_ibpb))
+ return "; IBPB: always-on";
+ if (static_key_enabled(&switch_mm_cond_ibpb))
+ return "; IBPB: conditional";
+ return "; IBPB: disabled";
+ }
+ return "";
+}
+
+static char *pbrsb_eibrs_state(void)
+{
+ if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) {
+ if (boot_cpu_has(X86_FEATURE_RSB_VMEXIT_LITE) ||
+ boot_cpu_has(X86_FEATURE_RSB_VMEXIT))
+ return "; PBRSB-eIBRS: SW sequence";
+ else
+ return "; PBRSB-eIBRS: Vulnerable";
+ } else {
+ return "; PBRSB-eIBRS: Not affected";
+ }
+}
+
+static const char *spectre_bhi_state(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_BHI))
+ return "; BHI: Not affected";
+ else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_HW))
+ return "; BHI: BHI_DIS_S";
+ else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP))
+ return "; BHI: SW loop, KVM: SW loop";
+ else if (boot_cpu_has(X86_FEATURE_RETPOLINE) &&
+ !boot_cpu_has(X86_FEATURE_RETPOLINE_LFENCE) &&
+ rrsba_disabled)
+ return "; BHI: Retpoline";
+ else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_VMEXIT))
+ return "; BHI: Vulnerable, KVM: SW loop";
+
+ return "; BHI: Vulnerable";
+}
+
+static ssize_t spectre_v2_show_state(char *buf)
+{
+ if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled())
+ return sysfs_emit(buf, "Vulnerable: eIBRS with unprivileged eBPF\n");
+
+ if (sched_smt_active() && unprivileged_ebpf_enabled() &&
+ spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE)
+ return sysfs_emit(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n");
+
+ return sysfs_emit(buf, "%s%s%s%s%s%s%s%s\n",
+ spectre_v2_strings[spectre_v2_enabled],
+ ibpb_state(),
+ boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? "; IBRS_FW" : "",
+ stibp_state(),
+ boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? "; RSB filling" : "",
+ pbrsb_eibrs_state(),
+ spectre_bhi_state(),
+ /* this should always be at the end */
+ spectre_v2_module_string());
+}
+
+static ssize_t srbds_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", srbds_strings[srbds_mitigation]);
+}
+
+static ssize_t retbleed_show_state(char *buf)
+{
+ if (retbleed_mitigation == RETBLEED_MITIGATION_UNRET ||
+ retbleed_mitigation == RETBLEED_MITIGATION_IBPB) {
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
+ return sysfs_emit(buf, "Vulnerable: untrained return thunk / IBPB on non-AMD based uarch\n");
+
+ return sysfs_emit(buf, "%s; SMT %s\n", retbleed_strings[retbleed_mitigation],
+ !sched_smt_active() ? "disabled" :
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED ?
+ "enabled with STIBP protection" : "vulnerable");
+ }
+
+ return sysfs_emit(buf, "%s\n", retbleed_strings[retbleed_mitigation]);
+}
+
+static ssize_t srso_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", srso_strings[srso_mitigation]);
+}
+
+static ssize_t gds_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", gds_strings[gds_mitigation]);
+}
+
+static ssize_t tsa_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", tsa_strings[tsa_mitigation]);
+}
+
+static ssize_t vmscape_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", vmscape_strings[vmscape_mitigation]);
+}
+
+static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
+ char *buf, unsigned int bug)
+{
+ if (!boot_cpu_has_bug(bug))
+ return sysfs_emit(buf, "Not affected\n");
+
+ switch (bug) {
+ case X86_BUG_CPU_MELTDOWN:
+ if (boot_cpu_has(X86_FEATURE_PTI))
+ return sysfs_emit(buf, "Mitigation: PTI\n");
+
+ if (hypervisor_is_type(X86_HYPER_XEN_PV))
+ return sysfs_emit(buf, "Unknown (XEN PV detected, hypervisor mitigation required)\n");
+
+ break;
+
+ case X86_BUG_SPECTRE_V1:
+ return sysfs_emit(buf, "%s\n", spectre_v1_strings[spectre_v1_mitigation]);
+
+ case X86_BUG_SPECTRE_V2:
+ return spectre_v2_show_state(buf);
+
+ case X86_BUG_SPEC_STORE_BYPASS:
+ return sysfs_emit(buf, "%s\n", ssb_strings[ssb_mode]);
+
+ case X86_BUG_L1TF:
+ if (boot_cpu_has(X86_FEATURE_L1TF_PTEINV))
+ return l1tf_show_state(buf);
+ break;
+
+ case X86_BUG_MDS:
+ return mds_show_state(buf);
+
+ case X86_BUG_TAA:
+ return tsx_async_abort_show_state(buf);
+
+ case X86_BUG_ITLB_MULTIHIT:
+ return itlb_multihit_show_state(buf);
+
+ case X86_BUG_SRBDS:
+ return srbds_show_state(buf);
+
+ case X86_BUG_MMIO_STALE_DATA:
+ return mmio_stale_data_show_state(buf);
+
+ case X86_BUG_RETBLEED:
+ return retbleed_show_state(buf);
+
+ case X86_BUG_SRSO:
+ return srso_show_state(buf);
+
+ case X86_BUG_GDS:
+ return gds_show_state(buf);
+
+ case X86_BUG_RFDS:
+ return rfds_show_state(buf);
+
+ case X86_BUG_OLD_MICROCODE:
+ return old_microcode_show_state(buf);
+
+ case X86_BUG_ITS:
+ return its_show_state(buf);
+
+ case X86_BUG_TSA:
+ return tsa_show_state(buf);
+
+ case X86_BUG_VMSCAPE:
+ return vmscape_show_state(buf);
+
+ default:
+ break;
+ }
+
+ return sysfs_emit(buf, "Vulnerable\n");
+}
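+
+/*
+ * The cpu_show_*() wrappers below back the generic sysfs vulnerability
+ * attributes, so the state assembled by cpu_show_common() is what shows up
+ * in, e.g. (illustrative shell usage):
+ *
+ *	cat /sys/devices/system/cpu/vulnerabilities/spectre_v2
+ */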
+
+ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_CPU_MELTDOWN);
+}
+
+ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_SPECTRE_V1);
+}
+
+ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_SPECTRE_V2);
+}
+
+ssize_t cpu_show_spec_store_bypass(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_SPEC_STORE_BYPASS);
+}
+
+ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_L1TF);
+}
+
+ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_MDS);
+}
+
+ssize_t cpu_show_tsx_async_abort(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_TAA);
+}
+
+ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT);
+}
+
+ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_SRBDS);
+}
+
+ssize_t cpu_show_mmio_stale_data(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA);
+}
+
+ssize_t cpu_show_retbleed(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_RETBLEED);
+}
+
+ssize_t cpu_show_spec_rstack_overflow(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_SRSO);
+}
+
+ssize_t cpu_show_gds(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_GDS);
+}
+
+ssize_t cpu_show_reg_file_data_sampling(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_RFDS);
+}
+
+ssize_t cpu_show_old_microcode(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_OLD_MICROCODE);
+}
+
+ssize_t cpu_show_indirect_target_selection(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_ITS);
+}
+
+ssize_t cpu_show_tsa(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_TSA);
+}
+
+ssize_t cpu_show_vmscape(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_VMSCAPE);
+}
#endif
- check_config();
- check_fpu();
- check_hlt();
- check_popad();
- init_utsname()->machine[1] =
- '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
- alternative_instructions();
+
+void __warn_thunk(void)
+{
+ WARN_ONCE(1, "Unpatched return thunk in use. This should not happen!\n");
}
diff --git a/arch/x86/kernel/cpu/bugs_64.c b/arch/x86/kernel/cpu/bugs_64.c
deleted file mode 100644
index 9a3ed0649d4e..000000000000
--- a/arch/x86/kernel/cpu/bugs_64.c
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (C) 1994 Linus Torvalds
- * Copyright (C) 2000 SuSE
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <asm/alternative.h>
-#include <asm/bugs.h>
-#include <asm/processor.h>
-#include <asm/mtrr.h>
-#include <asm/cacheflush.h>
-
-void __init check_bugs(void)
-{
- identify_boot_cpu();
-#if !defined(CONFIG_SMP)
- printk("CPU: ");
- print_cpu_info(&boot_cpu_data);
-#endif
- alternative_instructions();
-
- /*
- * Make sure the first 2MB area is not mapped by huge pages
- * There are typically fixed size MTRRs in there and overlapping
- * MTRRs into large pages causes slow downs.
- *
- * Right now we don't do that with gbpages because there seems
- * very little benefit for that case.
- */
- if (!direct_gbpages)
- set_memory_4k((unsigned long)__va(0), 1);
-}
diff --git a/arch/x86/kernel/cpu/bus_lock.c b/arch/x86/kernel/cpu/bus_lock.c
new file mode 100644
index 000000000000..dbc99a47be45
--- /dev/null
+++ b/arch/x86/kernel/cpu/bus_lock.c
@@ -0,0 +1,433 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define pr_fmt(fmt) "x86/split lock detection: " fmt
+
+#include <linux/semaphore.h>
+#include <linux/workqueue.h>
+#include <linux/delay.h>
+#include <linux/cpuhotplug.h>
+#include <linux/kvm_types.h>
+#include <asm/cpu_device_id.h>
+#include <asm/cmdline.h>
+#include <asm/traps.h>
+#include <asm/cpu.h>
+#include <asm/msr.h>
+
+enum split_lock_detect_state {
+ sld_off = 0,
+ sld_warn,
+ sld_fatal,
+ sld_ratelimit,
+};
+
+/*
+ * Default to sld_off because most systems do not support split lock detection.
+ * sld_state_setup() will switch this to sld_warn on systems that support
+ * split lock/bus lock detect, unless there is a command line override.
+ */
+static enum split_lock_detect_state sld_state __ro_after_init = sld_off;
+static u64 msr_test_ctrl_cache __ro_after_init;
+
+/*
+ * With a name like MSR_TEST_CTRL it should go without saying, but don't touch
+ * MSR_TEST_CTRL unless the CPU is one of the whitelisted models. Writing it
+ * on CPUs that do not support SLD can cause fireworks, even when writing '0'.
+ */
+static bool cpu_model_supports_sld __ro_after_init;
+
+static const struct {
+ const char *option;
+ enum split_lock_detect_state state;
+} sld_options[] __initconst = {
+ { "off", sld_off },
+ { "warn", sld_warn },
+ { "fatal", sld_fatal },
+ { "ratelimit:", sld_ratelimit },
+};
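+
+/*
+ * Illustrative "split_lock_detect=" command-line values matched against the
+ * table above (see sld_state_setup() and match_option() below), e.g.:
+ *
+ *	split_lock_detect=warn
+ *	split_lock_detect=ratelimit:10
+ */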
+
+static struct ratelimit_state bld_ratelimit;
+
+static unsigned int sysctl_sld_mitigate = 1;
+static DEFINE_SEMAPHORE(buslock_sem, 1);
+
+#ifdef CONFIG_PROC_SYSCTL
+static const struct ctl_table sld_sysctls[] = {
+ {
+ .procname = "split_lock_mitigate",
+ .data = &sysctl_sld_mitigate,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+};
+
+static int __init sld_mitigate_sysctl_init(void)
+{
+ register_sysctl_init("kernel", sld_sysctls);
+ return 0;
+}
+
+late_initcall(sld_mitigate_sysctl_init);
+#endif
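+
+/*
+ * The knob above is exposed as /proc/sys/kernel/split_lock_mitigate.
+ * Illustrative usage: writing '0' makes split_lock_warn() skip the "misery"
+ * delay and the single-core serialization below:
+ *
+ *	echo 0 > /proc/sys/kernel/split_lock_mitigate
+ */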
+
+static inline bool match_option(const char *arg, int arglen, const char *opt)
+{
+ int len = strlen(opt), ratelimit;
+
+ if (strncmp(arg, opt, len))
+ return false;
+
+ /*
+ * Min ratelimit is 1 bus lock/sec.
+ * Max ratelimit is 1000 bus locks/sec.
+ */
+ if (sscanf(arg, "ratelimit:%d", &ratelimit) == 1 &&
+ ratelimit > 0 && ratelimit <= 1000) {
+ ratelimit_state_init(&bld_ratelimit, HZ, ratelimit);
+ ratelimit_set_flags(&bld_ratelimit, RATELIMIT_MSG_ON_RELEASE);
+ return true;
+ }
+
+ return len == arglen;
+}
+
+static bool split_lock_verify_msr(bool on)
+{
+ u64 ctrl, tmp;
+
+ if (rdmsrq_safe(MSR_TEST_CTRL, &ctrl))
+ return false;
+ if (on)
+ ctrl |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
+ else
+ ctrl &= ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
+ if (wrmsrq_safe(MSR_TEST_CTRL, ctrl))
+ return false;
+ rdmsrq(MSR_TEST_CTRL, tmp);
+ return ctrl == tmp;
+}
+
+static void __init sld_state_setup(void)
+{
+ enum split_lock_detect_state state = sld_warn;
+ char arg[20];
+ int i, ret;
+
+ if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) &&
+ !boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
+ return;
+
+ ret = cmdline_find_option(boot_command_line, "split_lock_detect",
+ arg, sizeof(arg));
+ if (ret >= 0) {
+ for (i = 0; i < ARRAY_SIZE(sld_options); i++) {
+ if (match_option(arg, ret, sld_options[i].option)) {
+ state = sld_options[i].state;
+ break;
+ }
+ }
+ }
+ sld_state = state;
+}
+
+static void __init __split_lock_setup(void)
+{
+ if (!split_lock_verify_msr(false)) {
+ pr_info("MSR access failed: Disabled\n");
+ return;
+ }
+
+ rdmsrq(MSR_TEST_CTRL, msr_test_ctrl_cache);
+
+ if (!split_lock_verify_msr(true)) {
+ pr_info("MSR access failed: Disabled\n");
+ return;
+ }
+
+ /* Restore the MSR to its cached value. */
+ wrmsrq(MSR_TEST_CTRL, msr_test_ctrl_cache);
+
+ setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT);
+}
+
+/*
+ * MSR_TEST_CTRL is per core, but we treat it like a per CPU MSR. Locking
+ * is not implemented as one thread could undo the setting of the other
+ * thread immediately after dropping the lock anyway.
+ */
+static void sld_update_msr(bool on)
+{
+ u64 test_ctrl_val = msr_test_ctrl_cache;
+
+ if (on)
+ test_ctrl_val |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
+
+ wrmsrq(MSR_TEST_CTRL, test_ctrl_val);
+}
+
+void split_lock_init(void)
+{
+ /*
+	 * In the ratelimit case bus locks are handled by #DB, so #AC for
+	 * split lock is disabled.
+ */
+ if (sld_state == sld_ratelimit) {
+ split_lock_verify_msr(false);
+ return;
+ }
+
+ if (cpu_model_supports_sld)
+ split_lock_verify_msr(sld_state != sld_off);
+}
+
+static void __split_lock_reenable_unlock(struct work_struct *work)
+{
+ sld_update_msr(true);
+ up(&buslock_sem);
+}
+
+static DECLARE_DELAYED_WORK(sl_reenable_unlock, __split_lock_reenable_unlock);
+
+static void __split_lock_reenable(struct work_struct *work)
+{
+ sld_update_msr(true);
+}
+/*
+ * In order for each CPU to schedule its delayed work independently of the
+ * others, the delayed work struct must be per-CPU. This is not required when
+ * sysctl_sld_mitigate is enabled because of the semaphore that limits
+ * the number of simultaneously scheduled delayed works to 1.
+ */
+static DEFINE_PER_CPU(struct delayed_work, sl_reenable);
+
+/*
+ * Per-CPU delayed_work can't be statically initialized properly because
+ * the struct address is unknown. Thus per-CPU delayed_work structures
+ * have to be initialized during kernel initialization and after calling
+ * setup_per_cpu_areas().
+ */
+static int __init setup_split_lock_delayed_work(void)
+{
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct delayed_work *work = per_cpu_ptr(&sl_reenable, cpu);
+
+ INIT_DELAYED_WORK(work, __split_lock_reenable);
+ }
+
+ return 0;
+}
+pure_initcall(setup_split_lock_delayed_work);
+
+/*
+ * If a CPU goes offline with pending delayed work to re-enable split lock
+ * detection then the delayed work will be executed on some other CPU. That
+ * handles releasing the buslock_sem, but because it executes on a
+ * different CPU it probably won't re-enable split lock detection. This is a
+ * problem on HT systems since the sibling CPU on the same core may then be
+ * left running with split lock detection disabled.
+ *
+ * Unconditionally re-enable detection here.
+ */
+static int splitlock_cpu_offline(unsigned int cpu)
+{
+ sld_update_msr(true);
+
+ return 0;
+}
+
+static void split_lock_warn(unsigned long ip)
+{
+ struct delayed_work *work;
+ int cpu;
+ unsigned int saved_sld_mitigate = READ_ONCE(sysctl_sld_mitigate);
+
+ if (!current->reported_split_lock)
+ pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n",
+ current->comm, current->pid, ip);
+ current->reported_split_lock = 1;
+
+ if (saved_sld_mitigate) {
+ /*
+		 * Misery factor #1:
+		 * Sleep 10ms before retrying the split-locked instruction.
+ */
+ if (msleep_interruptible(10) > 0)
+ return;
+ /*
+ * Misery factor #2:
+		 * Only allow one core at a time to have split lock detection disabled.
+ */
+ if (down_interruptible(&buslock_sem) == -EINTR)
+ return;
+ }
+
+ cpu = get_cpu();
+ work = saved_sld_mitigate ? &sl_reenable_unlock : per_cpu_ptr(&sl_reenable, cpu);
+ schedule_delayed_work_on(cpu, work, 2);
+
+ /* Disable split lock detection on this CPU to make progress */
+ sld_update_msr(false);
+ put_cpu();
+}
+
+bool handle_guest_split_lock(unsigned long ip)
+{
+ if (sld_state == sld_warn) {
+ split_lock_warn(ip);
+ return true;
+ }
+
+ pr_warn_once("#AC: %s/%d %s split_lock trap at address: 0x%lx\n",
+ current->comm, current->pid,
+ sld_state == sld_fatal ? "fatal" : "bogus", ip);
+
+ current->thread.error_code = 0;
+ current->thread.trap_nr = X86_TRAP_AC;
+ force_sig_fault(SIGBUS, BUS_ADRALN, NULL);
+ return false;
+}
+EXPORT_SYMBOL_FOR_KVM(handle_guest_split_lock);
+
+void bus_lock_init(void)
+{
+ u64 val;
+
+ if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
+ return;
+
+ rdmsrq(MSR_IA32_DEBUGCTLMSR, val);
+
+ if ((boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) &&
+ (sld_state == sld_warn || sld_state == sld_fatal)) ||
+ sld_state == sld_off) {
+ /*
+ * Warn and fatal are handled by #AC for split lock if #AC for
+ * split lock is supported.
+ */
+ val &= ~DEBUGCTLMSR_BUS_LOCK_DETECT;
+ } else {
+ val |= DEBUGCTLMSR_BUS_LOCK_DETECT;
+ }
+
+ wrmsrq(MSR_IA32_DEBUGCTLMSR, val);
+}
+
+bool handle_user_split_lock(struct pt_regs *regs, long error_code)
+{
+ if ((regs->flags & X86_EFLAGS_AC) || sld_state == sld_fatal)
+ return false;
+ split_lock_warn(regs->ip);
+ return true;
+}
+
+void handle_bus_lock(struct pt_regs *regs)
+{
+ switch (sld_state) {
+ case sld_off:
+ break;
+ case sld_ratelimit:
+ /* Enforce no more than bld_ratelimit bus locks/sec. */
+ while (!__ratelimit(&bld_ratelimit))
+ msleep(20);
+ /* Warn on the bus lock. */
+ fallthrough;
+ case sld_warn:
+ pr_warn_ratelimited("#DB: %s/%d took a bus_lock trap at address: 0x%lx\n",
+ current->comm, current->pid, regs->ip);
+ break;
+ case sld_fatal:
+ force_sig_fault(SIGBUS, BUS_ADRALN, NULL);
+ break;
+ }
+}
+
+/*
+ * CPU models that are known to have the per-core split-lock detection
+ * feature even though they do not enumerate IA32_CORE_CAPABILITIES.
+ */
+static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = {
+ X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_L, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_D, 0),
+ {}
+};
+
+static void __init split_lock_setup(struct cpuinfo_x86 *c)
+{
+ const struct x86_cpu_id *m;
+ u64 ia32_core_caps;
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return;
+
+ /* Check for CPUs that have support but do not enumerate it: */
+ m = x86_match_cpu(split_lock_cpu_ids);
+ if (m)
+ goto supported;
+
+ if (!cpu_has(c, X86_FEATURE_CORE_CAPABILITIES))
+ return;
+
+ /*
+ * Not all bits in MSR_IA32_CORE_CAPS are architectural, but
+ * MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT is. All CPUs that set
+ * it have split lock detection.
+ */
+ rdmsrq(MSR_IA32_CORE_CAPS, ia32_core_caps);
+ if (ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT)
+ goto supported;
+
+ /* CPU is not in the model list and does not have the MSR bit: */
+ return;
+
+supported:
+ cpu_model_supports_sld = true;
+ __split_lock_setup();
+}
+
+static void sld_state_show(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) &&
+ !boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
+ return;
+
+ switch (sld_state) {
+ case sld_off:
+ pr_info("disabled\n");
+ break;
+ case sld_warn:
+ if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) {
+ pr_info("#AC: crashing the kernel on kernel split_locks and warning on user-space split_locks\n");
+ if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
+ "x86/splitlock", NULL, splitlock_cpu_offline) < 0)
+ pr_warn("No splitlock CPU offline handler\n");
+ } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) {
+ pr_info("#DB: warning on user-space bus_locks\n");
+ }
+ break;
+ case sld_fatal:
+ if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) {
+ pr_info("#AC: crashing the kernel on kernel split_locks and sending SIGBUS on user-space split_locks\n");
+ } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) {
+ pr_info("#DB: sending SIGBUS on user-space bus_locks%s\n",
+ boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) ?
+ " from non-WB" : "");
+ }
+ break;
+ case sld_ratelimit:
+ if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
+ pr_info("#DB: setting system wide bus lock rate limit to %u/sec\n", bld_ratelimit.burst);
+ break;
+ }
+}
+
+void __init sld_setup(struct cpuinfo_x86 *c)
+{
+ split_lock_setup(c);
+ sld_state_setup();
+ sld_state_show();
+}
diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
new file mode 100644
index 000000000000..51a95b07831f
--- /dev/null
+++ b/arch/x86/kernel/cpu/cacheinfo.c
@@ -0,0 +1,820 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * x86 CPU caches detection and configuration
+ *
+ * Previous changes
+ * - Venkatesh Pallipadi: Cache identification through CPUID(0x4)
+ * - Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure
+ * - Andi Kleen / Andreas Herrmann: CPUID(0x4) emulation on AMD
+ */
+
+#include <linux/cacheinfo.h>
+#include <linux/cpu.h>
+#include <linux/cpuhotplug.h>
+#include <linux/stop_machine.h>
+
+#include <asm/amd/nb.h>
+#include <asm/cacheinfo.h>
+#include <asm/cpufeature.h>
+#include <asm/cpuid/api.h>
+#include <asm/mtrr.h>
+#include <asm/smp.h>
+#include <asm/tlbflush.h>
+
+#include "cpu.h"
+
+/* Shared last level cache maps */
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
+
+/* Shared L2 cache maps */
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map);
+
+static cpumask_var_t cpu_cacheinfo_mask;
+
+/* Kernel controls MTRR and/or PAT MSRs. */
+unsigned int memory_caching_control __ro_after_init;
+
+enum _cache_type {
+ CTYPE_NULL = 0,
+ CTYPE_DATA = 1,
+ CTYPE_INST = 2,
+ CTYPE_UNIFIED = 3
+};
+
+union _cpuid4_leaf_eax {
+ struct {
+ enum _cache_type type :5;
+ unsigned int level :3;
+ unsigned int is_self_initializing :1;
+ unsigned int is_fully_associative :1;
+ unsigned int reserved :4;
+ unsigned int num_threads_sharing :12;
+ unsigned int num_cores_on_die :6;
+ } split;
+ u32 full;
+};
+
+union _cpuid4_leaf_ebx {
+ struct {
+ unsigned int coherency_line_size :12;
+ unsigned int physical_line_partition :10;
+ unsigned int ways_of_associativity :10;
+ } split;
+ u32 full;
+};
+
+union _cpuid4_leaf_ecx {
+ struct {
+ unsigned int number_of_sets :32;
+ } split;
+ u32 full;
+};
+
+struct _cpuid4_info {
+ union _cpuid4_leaf_eax eax;
+ union _cpuid4_leaf_ebx ebx;
+ union _cpuid4_leaf_ecx ecx;
+ unsigned int id;
+ unsigned long size;
+};
+
+/* Map CPUID(0x4) EAX.cache_type to <linux/cacheinfo.h> types */
+static const enum cache_type cache_type_map[] = {
+ [CTYPE_NULL] = CACHE_TYPE_NOCACHE,
+ [CTYPE_DATA] = CACHE_TYPE_DATA,
+ [CTYPE_INST] = CACHE_TYPE_INST,
+ [CTYPE_UNIFIED] = CACHE_TYPE_UNIFIED,
+};
+
+/*
+ * Fallback AMD CPUID(0x4) emulation
+ * AMD CPUs with TOPOEXT can just use CPUID(0x8000001d)
+ *
+ * @AMD_L2_L3_INVALID_ASSOC: cache info for the respective L2/L3 cache should
+ * be determined from CPUID(0x8000001d) instead of CPUID(0x80000006).
+ */
+
+#define AMD_CPUID4_FULLY_ASSOCIATIVE 0xffff
+#define AMD_L2_L3_INVALID_ASSOC 0x9
+
+union l1_cache {
+ struct {
+ unsigned line_size :8;
+ unsigned lines_per_tag :8;
+ unsigned assoc :8;
+ unsigned size_in_kb :8;
+ };
+ unsigned int val;
+};
+
+union l2_cache {
+ struct {
+ unsigned line_size :8;
+ unsigned lines_per_tag :4;
+ unsigned assoc :4;
+ unsigned size_in_kb :16;
+ };
+ unsigned int val;
+};
+
+union l3_cache {
+ struct {
+ unsigned line_size :8;
+ unsigned lines_per_tag :4;
+ unsigned assoc :4;
+ unsigned res :2;
+ unsigned size_encoded :14;
+ };
+ unsigned int val;
+};
+
+/* L2/L3 associativity mapping */
+static const unsigned short assocs[] = {
+ [1] = 1,
+ [2] = 2,
+ [3] = 3,
+ [4] = 4,
+ [5] = 6,
+ [6] = 8,
+ [8] = 16,
+ [0xa] = 32,
+ [0xb] = 48,
+ [0xc] = 64,
+ [0xd] = 96,
+ [0xe] = 128,
+ [0xf] = AMD_CPUID4_FULLY_ASSOCIATIVE
+};
+
+static const unsigned char levels[] = { 1, 1, 2, 3 };
+static const unsigned char types[] = { 1, 2, 3, 3 };
+
+static void legacy_amd_cpuid4(int index, union _cpuid4_leaf_eax *eax,
+ union _cpuid4_leaf_ebx *ebx, union _cpuid4_leaf_ecx *ecx)
+{
+ unsigned int dummy, line_size, lines_per_tag, assoc, size_in_kb;
+ union l1_cache l1i, l1d, *l1;
+ union l2_cache l2;
+ union l3_cache l3;
+
+ eax->full = 0;
+ ebx->full = 0;
+ ecx->full = 0;
+
+ cpuid(0x80000005, &dummy, &dummy, &l1d.val, &l1i.val);
+ cpuid(0x80000006, &dummy, &dummy, &l2.val, &l3.val);
+
+ l1 = &l1d;
+ switch (index) {
+ case 1:
+ l1 = &l1i;
+ fallthrough;
+ case 0:
+ if (!l1->val)
+ return;
+
+ assoc = (l1->assoc == 0xff) ? AMD_CPUID4_FULLY_ASSOCIATIVE : l1->assoc;
+ line_size = l1->line_size;
+ lines_per_tag = l1->lines_per_tag;
+ size_in_kb = l1->size_in_kb;
+ break;
+ case 2:
+ if (!l2.assoc || l2.assoc == AMD_L2_L3_INVALID_ASSOC)
+ return;
+
+ /* Use x86_cache_size as it might have K7 errata fixes */
+ assoc = assocs[l2.assoc];
+ line_size = l2.line_size;
+ lines_per_tag = l2.lines_per_tag;
+ size_in_kb = __this_cpu_read(cpu_info.x86_cache_size);
+ break;
+ case 3:
+ if (!l3.assoc || l3.assoc == AMD_L2_L3_INVALID_ASSOC)
+ return;
+
+ assoc = assocs[l3.assoc];
+ line_size = l3.line_size;
+ lines_per_tag = l3.lines_per_tag;
+ size_in_kb = l3.size_encoded * 512;
+ if (boot_cpu_has(X86_FEATURE_AMD_DCM)) {
+ size_in_kb = size_in_kb >> 1;
+ assoc = assoc >> 1;
+ }
+ break;
+ default:
+ return;
+ }
+
+ eax->split.is_self_initializing = 1;
+ eax->split.type = types[index];
+ eax->split.level = levels[index];
+ eax->split.num_threads_sharing = 0;
+ eax->split.num_cores_on_die = topology_num_cores_per_package();
+
+ if (assoc == AMD_CPUID4_FULLY_ASSOCIATIVE)
+ eax->split.is_fully_associative = 1;
+
+ ebx->split.coherency_line_size = line_size - 1;
+ ebx->split.ways_of_associativity = assoc - 1;
+ ebx->split.physical_line_partition = lines_per_tag - 1;
+ ecx->split.number_of_sets = (size_in_kb * 1024) / line_size /
+ (ebx->split.ways_of_associativity + 1) - 1;
+}
+
+static int cpuid4_info_fill_done(struct _cpuid4_info *id4, union _cpuid4_leaf_eax eax,
+ union _cpuid4_leaf_ebx ebx, union _cpuid4_leaf_ecx ecx)
+{
+ if (eax.split.type == CTYPE_NULL)
+ return -EIO;
+
+ id4->eax = eax;
+ id4->ebx = ebx;
+ id4->ecx = ecx;
+ id4->size = (ecx.split.number_of_sets + 1) *
+ (ebx.split.coherency_line_size + 1) *
+ (ebx.split.physical_line_partition + 1) *
+ (ebx.split.ways_of_associativity + 1);
+
+ return 0;
+}
+
+static int amd_fill_cpuid4_info(int index, struct _cpuid4_info *id4)
+{
+ union _cpuid4_leaf_eax eax;
+ union _cpuid4_leaf_ebx ebx;
+ union _cpuid4_leaf_ecx ecx;
+ u32 ignored;
+
+ if (boot_cpu_has(X86_FEATURE_TOPOEXT) || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
+ cpuid_count(0x8000001d, index, &eax.full, &ebx.full, &ecx.full, &ignored);
+ else
+ legacy_amd_cpuid4(index, &eax, &ebx, &ecx);
+
+ return cpuid4_info_fill_done(id4, eax, ebx, ecx);
+}
+
+static int intel_fill_cpuid4_info(int index, struct _cpuid4_info *id4)
+{
+ union _cpuid4_leaf_eax eax;
+ union _cpuid4_leaf_ebx ebx;
+ union _cpuid4_leaf_ecx ecx;
+ u32 ignored;
+
+ cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &ignored);
+
+ return cpuid4_info_fill_done(id4, eax, ebx, ecx);
+}
+
+static int fill_cpuid4_info(int index, struct _cpuid4_info *id4)
+{
+ u8 cpu_vendor = boot_cpu_data.x86_vendor;
+
+ return (cpu_vendor == X86_VENDOR_AMD || cpu_vendor == X86_VENDOR_HYGON) ?
+ amd_fill_cpuid4_info(index, id4) :
+ intel_fill_cpuid4_info(index, id4);
+}
+
+static int find_num_cache_leaves(struct cpuinfo_x86 *c)
+{
+ unsigned int eax, ebx, ecx, edx, op;
+ union _cpuid4_leaf_eax cache_eax;
+ int i = -1;
+
+ /* Do a CPUID(op) loop to calculate num_cache_leaves */
+ op = (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) ? 0x8000001d : 4;
+ do {
+ ++i;
+ cpuid_count(op, i, &eax, &ebx, &ecx, &edx);
+ cache_eax.full = eax;
+ } while (cache_eax.split.type != CTYPE_NULL);
+ return i;
+}
+
+/*
+ * The maximum number of threads sharing this cache comes from CPUID(0x4)
+ * EAX[25:14], with ECX as the cache index. Right-shifting the APIC ID by
+ * that number's order then yields the cache ID for this cache node.
+ */
+static unsigned int get_cache_id(u32 apicid, const struct _cpuid4_info *id4)
+{
+ unsigned long num_threads_sharing;
+ int index_msb;
+
+ num_threads_sharing = 1 + id4->eax.split.num_threads_sharing;
+ index_msb = get_count_order(num_threads_sharing);
+
+ return apicid >> index_msb;
+}
+
+/*
+ * AMD/Hygon CPUs may have multiple LLCs if L3 caches exist.
+ */
+
+void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id)
+{
+ if (!cpuid_amd_hygon_has_l3_cache())
+ return;
+
+ if (c->x86 < 0x17) {
+ /* Pre-Zen: LLC is at the node level */
+ c->topo.llc_id = die_id;
+ } else if (c->x86 == 0x17 && c->x86_model <= 0x1F) {
+ /*
+ * Family 17h up to 1F models: LLC is at the core
+ * complex level. Core complex ID is ApicId[3].
+ */
+ c->topo.llc_id = c->topo.apicid >> 3;
+ } else {
+ /*
+ * Newer families: LLC ID is calculated from the number
+ * of threads sharing the L3 cache.
+ */
+ u32 llc_index = find_num_cache_leaves(c) - 1;
+ struct _cpuid4_info id4 = {};
+
+ if (!amd_fill_cpuid4_info(llc_index, &id4))
+ c->topo.llc_id = get_cache_id(c->topo.apicid, &id4);
+ }
+}
+
+void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c)
+{
+ if (!cpuid_amd_hygon_has_l3_cache())
+ return;
+
+ /*
+ * Hygons are similar to AMD Family 17h up to 1F models: LLC is
+ * at the core complex level. Core complex ID is ApicId[3].
+ */
+ c->topo.llc_id = c->topo.apicid >> 3;
+}
+
+void init_amd_cacheinfo(struct cpuinfo_x86 *c)
+{
+ struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index);
+
+ if (boot_cpu_has(X86_FEATURE_TOPOEXT))
+ ci->num_leaves = find_num_cache_leaves(c);
+ else if (c->extended_cpuid_level >= 0x80000006)
+ ci->num_leaves = (cpuid_edx(0x80000006) & 0xf000) ? 4 : 3;
+}
+
+void init_hygon_cacheinfo(struct cpuinfo_x86 *c)
+{
+ struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index);
+
+ ci->num_leaves = find_num_cache_leaves(c);
+}
+
+static void intel_cacheinfo_done(struct cpuinfo_x86 *c, unsigned int l3,
+ unsigned int l2, unsigned int l1i, unsigned int l1d)
+{
+ /*
+ * If llc_id is still unset, then cpuid_level < 4, which implies
+ * that the only possibility left is SMT. Since CPUID(0x2) doesn't
+ * specify any shared caches and SMT shares all caches, we can
+ * unconditionally set LLC ID to the package ID so that all
+ * threads share it.
+ */
+ if (c->topo.llc_id == BAD_APICID)
+ c->topo.llc_id = c->topo.pkg_id;
+
+ c->x86_cache_size = l3 ? l3 : (l2 ? l2 : l1i + l1d);
+
+ if (!l2)
+ cpu_detect_cache_sizes(c);
+}
+
+/*
+ * Legacy Intel CPUID(0x2) path if CPUID(0x4) is not available.
+ */
+static void intel_cacheinfo_0x2(struct cpuinfo_x86 *c)
+{
+ unsigned int l1i = 0, l1d = 0, l2 = 0, l3 = 0;
+ const struct leaf_0x2_table *desc;
+ union leaf_0x2_regs regs;
+ u8 *ptr;
+
+ if (c->cpuid_level < 2)
+ return;
+
+ cpuid_leaf_0x2(&regs);
+ for_each_cpuid_0x2_desc(regs, ptr, desc) {
+ switch (desc->c_type) {
+ case CACHE_L1_INST: l1i += desc->c_size; break;
+ case CACHE_L1_DATA: l1d += desc->c_size; break;
+ case CACHE_L2: l2 += desc->c_size; break;
+ case CACHE_L3: l3 += desc->c_size; break;
+ }
+ }
+
+ intel_cacheinfo_done(c, l3, l2, l1i, l1d);
+}
+
+static unsigned int calc_cache_topo_id(struct cpuinfo_x86 *c, const struct _cpuid4_info *id4)
+{
+ unsigned int num_threads_sharing;
+ int index_msb;
+
+ num_threads_sharing = 1 + id4->eax.split.num_threads_sharing;
+ index_msb = get_count_order(num_threads_sharing);
+ return c->topo.apicid & ~((1 << index_msb) - 1);
+}
+
+static bool intel_cacheinfo_0x4(struct cpuinfo_x86 *c)
+{
+ struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index);
+ unsigned int l2_id = BAD_APICID, l3_id = BAD_APICID;
+ unsigned int l1d = 0, l1i = 0, l2 = 0, l3 = 0;
+
+ if (c->cpuid_level < 4)
+ return false;
+
+ /*
+ * There should be at least one leaf. A non-zero value means
+ * that the number of leaves has been previously initialized.
+ */
+ if (!ci->num_leaves)
+ ci->num_leaves = find_num_cache_leaves(c);
+
+ if (!ci->num_leaves)
+ return false;
+
+ for (int i = 0; i < ci->num_leaves; i++) {
+ struct _cpuid4_info id4 = {};
+ int ret;
+
+ ret = intel_fill_cpuid4_info(i, &id4);
+ if (ret < 0)
+ continue;
+
+ switch (id4.eax.split.level) {
+ case 1:
+ if (id4.eax.split.type == CTYPE_DATA)
+ l1d = id4.size / 1024;
+ else if (id4.eax.split.type == CTYPE_INST)
+ l1i = id4.size / 1024;
+ break;
+ case 2:
+ l2 = id4.size / 1024;
+ l2_id = calc_cache_topo_id(c, &id4);
+ break;
+ case 3:
+ l3 = id4.size / 1024;
+ l3_id = calc_cache_topo_id(c, &id4);
+ break;
+ default:
+ break;
+ }
+ }
+
+ c->topo.l2c_id = l2_id;
+ c->topo.llc_id = (l3_id == BAD_APICID) ? l2_id : l3_id;
+ intel_cacheinfo_done(c, l3, l2, l1i, l1d);
+ return true;
+}
+
+void init_intel_cacheinfo(struct cpuinfo_x86 *c)
+{
+ /* Don't use CPUID(0x2) if CPUID(0x4) is supported. */
+ if (intel_cacheinfo_0x4(c))
+ return;
+
+ intel_cacheinfo_0x2(c);
+}
+
+/*
+ * <linux/cacheinfo.h> shared_cpu_map setup, AMD/Hygon
+ */
+static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
+ const struct _cpuid4_info *id4)
+{
+ struct cpu_cacheinfo *this_cpu_ci;
+ struct cacheinfo *ci;
+ int i, sibling;
+
+ /*
+ * For L3, always use the pre-calculated cpu_llc_shared_mask
+ * to derive shared_cpu_map.
+ */
+ if (index == 3) {
+ for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
+ this_cpu_ci = get_cpu_cacheinfo(i);
+ if (!this_cpu_ci->info_list)
+ continue;
+
+ ci = this_cpu_ci->info_list + index;
+ for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) {
+ if (!cpu_online(sibling))
+ continue;
+ cpumask_set_cpu(sibling, &ci->shared_cpu_map);
+ }
+ }
+ } else if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
+ unsigned int apicid, nshared, first, last;
+
+ nshared = id4->eax.split.num_threads_sharing + 1;
+ apicid = cpu_data(cpu).topo.apicid;
+ first = apicid - (apicid % nshared);
+ last = first + nshared - 1;
+
+ for_each_online_cpu(i) {
+ this_cpu_ci = get_cpu_cacheinfo(i);
+ if (!this_cpu_ci->info_list)
+ continue;
+
+ apicid = cpu_data(i).topo.apicid;
+ if ((apicid < first) || (apicid > last))
+ continue;
+
+ ci = this_cpu_ci->info_list + index;
+
+ for_each_online_cpu(sibling) {
+ apicid = cpu_data(sibling).topo.apicid;
+ if ((apicid < first) || (apicid > last))
+ continue;
+ cpumask_set_cpu(sibling, &ci->shared_cpu_map);
+ }
+ }
+ } else
+ return 0;
+
+ return 1;
+}
+
+/*
+ * <linux/cacheinfo.h> shared_cpu_map setup, Intel + fallback AMD/Hygon
+ */
+static void __cache_cpumap_setup(unsigned int cpu, int index,
+ const struct _cpuid4_info *id4)
+{
+ struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ struct cacheinfo *ci, *sibling_ci;
+ unsigned long num_threads_sharing;
+ int index_msb, i;
+
+ if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) {
+ if (__cache_amd_cpumap_setup(cpu, index, id4))
+ return;
+ }
+
+ ci = this_cpu_ci->info_list + index;
+ num_threads_sharing = 1 + id4->eax.split.num_threads_sharing;
+
+ cpumask_set_cpu(cpu, &ci->shared_cpu_map);
+ if (num_threads_sharing == 1)
+ return;
+
+ index_msb = get_count_order(num_threads_sharing);
+
+ for_each_online_cpu(i)
+ if (cpu_data(i).topo.apicid >> index_msb == c->topo.apicid >> index_msb) {
+ struct cpu_cacheinfo *sib_cpu_ci = get_cpu_cacheinfo(i);
+
+ /* Skip if itself or no cacheinfo */
+ if (i == cpu || !sib_cpu_ci->info_list)
+ continue;
+
+ sibling_ci = sib_cpu_ci->info_list + index;
+ cpumask_set_cpu(i, &ci->shared_cpu_map);
+ cpumask_set_cpu(cpu, &sibling_ci->shared_cpu_map);
+ }
+}
+
+static void ci_info_init(struct cacheinfo *ci, const struct _cpuid4_info *id4,
+ struct amd_northbridge *nb)
+{
+ ci->id = id4->id;
+ ci->attributes = CACHE_ID;
+ ci->level = id4->eax.split.level;
+ ci->type = cache_type_map[id4->eax.split.type];
+ ci->coherency_line_size = id4->ebx.split.coherency_line_size + 1;
+ ci->ways_of_associativity = id4->ebx.split.ways_of_associativity + 1;
+ ci->size = id4->size;
+ ci->number_of_sets = id4->ecx.split.number_of_sets + 1;
+ ci->physical_line_partition = id4->ebx.split.physical_line_partition + 1;
+ ci->priv = nb;
+}
+
+int init_cache_level(unsigned int cpu)
+{
+ struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu);
+
+ /* There should be at least one leaf. */
+ if (!ci->num_leaves)
+ return -ENOENT;
+
+ return 0;
+}
+
+int populate_cache_leaves(unsigned int cpu)
+{
+ struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+ struct cacheinfo *ci = this_cpu_ci->info_list;
+ u8 cpu_vendor = boot_cpu_data.x86_vendor;
+ u32 apicid = cpu_data(cpu).topo.apicid;
+ struct amd_northbridge *nb = NULL;
+ struct _cpuid4_info id4 = {};
+ int idx, ret;
+
+ for (idx = 0; idx < this_cpu_ci->num_leaves; idx++) {
+ ret = fill_cpuid4_info(idx, &id4);
+ if (ret)
+ return ret;
+
+ id4.id = get_cache_id(apicid, &id4);
+
+ if (cpu_vendor == X86_VENDOR_AMD || cpu_vendor == X86_VENDOR_HYGON)
+ nb = amd_init_l3_cache(idx);
+
+ ci_info_init(ci++, &id4, nb);
+ __cache_cpumap_setup(cpu, idx, &id4);
+ }
+
+ this_cpu_ci->cpu_map_populated = true;
+ return 0;
+}
+
+/*
+ * Disable and enable caches. Needed for changing MTRRs and the PAT MSR.
+ *
+ * Since the cache is being disabled, do not allow any interrupts:
+ * they would run extremely slowly and would only increase the pain.
+ *
+ * The caller must ensure that local interrupts are disabled and
+ * are reenabled after cache_enable() has been called.
+ */
+static unsigned long saved_cr4;
+static DEFINE_RAW_SPINLOCK(cache_disable_lock);
+
+/*
+ * Cache flushing is the most time-consuming step when programming the
+ * MTRRs. On many Intel CPUs without known errata, it can be skipped
+ * if the CPU declares cache self-snooping support.
+ */
+static void maybe_flush_caches(void)
+{
+ if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
+ wbinvd();
+}
+
+void cache_disable(void) __acquires(cache_disable_lock)
+{
+ unsigned long cr0;
+
+ /*
+ * This is not ideal since the cache is only flushed/disabled
+ * for this CPU while the MTRRs are changed, but changing this
+ * requires more invasive changes to the way the kernel boots.
+ */
+ raw_spin_lock(&cache_disable_lock);
+
+ /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
+ cr0 = read_cr0() | X86_CR0_CD;
+ write_cr0(cr0);
+
+ maybe_flush_caches();
+
+ /* Save value of CR4 and clear Page Global Enable (bit 7) */
+ if (cpu_feature_enabled(X86_FEATURE_PGE)) {
+ saved_cr4 = __read_cr4();
+ __write_cr4(saved_cr4 & ~X86_CR4_PGE);
+ }
+
+ /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ flush_tlb_local();
+
+ if (cpu_feature_enabled(X86_FEATURE_MTRR))
+ mtrr_disable();
+
+ maybe_flush_caches();
+}
+
+void cache_enable(void) __releases(cache_disable_lock)
+{
+ /* Flush TLBs (no need to flush caches - they are disabled) */
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ flush_tlb_local();
+
+ if (cpu_feature_enabled(X86_FEATURE_MTRR))
+ mtrr_enable();
+
+ /* Enable caches */
+ write_cr0(read_cr0() & ~X86_CR0_CD);
+
+ /* Restore value of CR4 */
+ if (cpu_feature_enabled(X86_FEATURE_PGE))
+ __write_cr4(saved_cr4);
+
+ raw_spin_unlock(&cache_disable_lock);
+}
+
+static void cache_cpu_init(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ if (memory_caching_control & CACHE_MTRR) {
+ cache_disable();
+ mtrr_generic_set_state();
+ cache_enable();
+ }
+
+ if (memory_caching_control & CACHE_PAT)
+ pat_cpu_init();
+
+ local_irq_restore(flags);
+}
+
+static bool cache_aps_delayed_init = true;
+
+void set_cache_aps_delayed_init(bool val)
+{
+ cache_aps_delayed_init = val;
+}
+
+bool get_cache_aps_delayed_init(void)
+{
+ return cache_aps_delayed_init;
+}
+
+static int cache_rendezvous_handler(void *unused)
+{
+ if (get_cache_aps_delayed_init() || !cpu_online(smp_processor_id()))
+ cache_cpu_init();
+
+ return 0;
+}
+
+void __init cache_bp_init(void)
+{
+ mtrr_bp_init();
+ pat_bp_init();
+
+ if (memory_caching_control)
+ cache_cpu_init();
+}
+
+void cache_bp_restore(void)
+{
+ if (memory_caching_control)
+ cache_cpu_init();
+}
+
+static int cache_ap_online(unsigned int cpu)
+{
+ cpumask_set_cpu(cpu, cpu_cacheinfo_mask);
+
+ if (!memory_caching_control || get_cache_aps_delayed_init())
+ return 0;
+
+ /*
+ * Ideally, mtrr_mutex should be held here so that MTRR entries cannot
+ * change underneath us, but this routine is called during CPU bringup
+ * and taking the lock there breaks it.
+ *
+ * This routine is called in two cases:
+ *
+ * 1. very early during software resume, when there are absolutely
+ * no MTRR entry changes;
+ *
+ * 2. CPU hot-add time, where mtrr_add/del_page take the CPU hotplug
+ * lock to prevent MTRR entry changes.
+ */
+ stop_machine_from_inactive_cpu(cache_rendezvous_handler, NULL,
+ cpu_cacheinfo_mask);
+
+ return 0;
+}
+
+static int cache_ap_offline(unsigned int cpu)
+{
+ cpumask_clear_cpu(cpu, cpu_cacheinfo_mask);
+ return 0;
+}
+
+/*
+ * Delayed cache initialization for all APs
+ */
+void cache_aps_init(void)
+{
+ if (!memory_caching_control || !get_cache_aps_delayed_init())
+ return;
+
+ stop_machine(cache_rendezvous_handler, NULL, cpu_online_mask);
+ set_cache_aps_delayed_init(false);
+}
+
+static int __init cache_ap_register(void)
+{
+ zalloc_cpumask_var(&cpu_cacheinfo_mask, GFP_KERNEL);
+ cpumask_set_cpu(smp_processor_id(), cpu_cacheinfo_mask);
+
+ cpuhp_setup_state_nocalls(CPUHP_AP_CACHECTRL_STARTING,
+ "x86/cachectrl:starting",
+ cache_ap_online, cache_ap_offline);
+ return 0;
+}
+early_initcall(cache_ap_register);
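
The new cacheinfo.c computes each cache's size in cpuid4_info_fill_done() as
(sets) x (coherency line size) x (physical line partitions) x (ways), all taken
from a CPUID(0x4)-style leaf, and get_cache_id() then derives the cache ID by
shifting the APIC ID. The following is a minimal user-space sketch of that
decoding, not part of the patch; it assumes GCC/Clang's <cpuid.h>
(__get_cpuid_count) and an Intel-style CPU exposing leaf 0x4, with index 3 used
here because that is typically the L3 leaf.

/* Decode one CPUID(0x4) leaf the way cpuid4_info_fill_done() does. */
#include <cpuid.h>
#include <stdio.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx, index = 3;

        if (!__get_cpuid_count(4, index, &eax, &ebx, &ecx, &edx))
                return 1;

        if ((eax & 0x1f) == 0)          /* CTYPE_NULL: no such cache leaf */
                return 1;

        unsigned int level = (eax >> 5) & 0x7;
        unsigned int line  = (ebx & 0xfff) + 1;          /* coherency_line_size + 1 */
        unsigned int parts = ((ebx >> 12) & 0x3ff) + 1;  /* physical_line_partition + 1 */
        unsigned int ways  = ((ebx >> 22) & 0x3ff) + 1;  /* ways_of_associativity + 1 */
        unsigned int sets  = ecx + 1;                    /* number_of_sets + 1 */

        printf("L%u cache: %u bytes (%u ways, %u sets, %u-byte lines)\n",
               level, ways * parts * line * sets, ways, sets, line);
        return 0;
}

AMD parts without TOPOEXT would instead go through the legacy_amd_cpuid4()
emulation above, which synthesizes the same fields from CPUID(0x80000005/6).
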
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index c95e831bb095..a3b55db35c96 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -1,244 +1,16 @@
-#include <linux/bitops.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
+// SPDX-License-Identifier: GPL-2.0
-#include <asm/processor.h>
-#include <asm/e820.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+
+#include <asm/cpu.h>
+#include <asm/cpufeature.h>
+#include <asm/e820/api.h>
#include <asm/mtrr.h>
#include <asm/msr.h>
#include "cpu.h"
-#ifdef CONFIG_X86_OOSTORE
-
-static u32 __cpuinit power2(u32 x)
-{
- u32 s = 1;
-
- while (s <= x)
- s <<= 1;
-
- return s >>= 1;
-}
-
-
-/*
- * Set up an actual MCR
- */
-static void __cpuinit centaur_mcr_insert(int reg, u32 base, u32 size, int key)
-{
- u32 lo, hi;
-
- hi = base & ~0xFFF;
- lo = ~(size-1); /* Size is a power of 2 so this makes a mask */
- lo &= ~0xFFF; /* Remove the ctrl value bits */
- lo |= key; /* Attribute we wish to set */
- wrmsr(reg+MSR_IDT_MCR0, lo, hi);
- mtrr_centaur_report_mcr(reg, lo, hi); /* Tell the mtrr driver */
-}
-
-/*
- * Figure what we can cover with MCR's
- *
- * Shortcut: We know you can't put 4Gig of RAM on a winchip
- */
-static u32 __cpuinit ramtop(void)
-{
- u32 clip = 0xFFFFFFFFUL;
- u32 top = 0;
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- unsigned long start, end;
-
- if (e820.map[i].addr > 0xFFFFFFFFUL)
- continue;
- /*
- * Don't MCR over reserved space. Ignore the ISA hole
- * we frob around that catastrophe already
- */
- if (e820.map[i].type == E820_RESERVED) {
- if (e820.map[i].addr >= 0x100000UL &&
- e820.map[i].addr < clip)
- clip = e820.map[i].addr;
- continue;
- }
- start = e820.map[i].addr;
- end = e820.map[i].addr + e820.map[i].size;
- if (start >= end)
- continue;
- if (end > top)
- top = end;
- }
- /*
- * Everything below 'top' should be RAM except for the ISA hole.
- * Because of the limited MCR's we want to map NV/ACPI into our
- * MCR range for gunk in RAM
- *
- * Clip might cause us to MCR insufficient RAM but that is an
- * acceptable failure mode and should only bite obscure boxes with
- * a VESA hole at 15Mb
- *
- * The second case Clip sometimes kicks in is when the EBDA is marked
- * as reserved. Again we fail safe with reasonable results
- */
- if (top > clip)
- top = clip;
-
- return top;
-}
-
-/*
- * Compute a set of MCR's to give maximum coverage
- */
-static int __cpuinit centaur_mcr_compute(int nr, int key)
-{
- u32 mem = ramtop();
- u32 root = power2(mem);
- u32 base = root;
- u32 top = root;
- u32 floor = 0;
- int ct = 0;
-
- while (ct < nr) {
- u32 fspace = 0;
- u32 high;
- u32 low;
-
- /*
- * Find the largest block we will fill going upwards
- */
- high = power2(mem-top);
-
- /*
- * Find the largest block we will fill going downwards
- */
- low = base/2;
-
- /*
- * Don't fill below 1Mb going downwards as there
- * is an ISA hole in the way.
- */
- if (base <= 1024*1024)
- low = 0;
-
- /*
- * See how much space we could cover by filling below
- * the ISA hole
- */
-
- if (floor == 0)
- fspace = 512*1024;
- else if (floor == 512*1024)
- fspace = 128*1024;
-
- /* And forget ROM space */
-
- /*
- * Now install the largest coverage we get
- */
- if (fspace > high && fspace > low) {
- centaur_mcr_insert(ct, floor, fspace, key);
- floor += fspace;
- } else if (high > low) {
- centaur_mcr_insert(ct, top, high, key);
- top += high;
- } else if (low > 0) {
- base -= low;
- centaur_mcr_insert(ct, base, low, key);
- } else
- break;
- ct++;
- }
- /*
- * We loaded ct values. We now need to set the mask. The caller
- * must do this bit.
- */
- return ct;
-}
-
-static void __cpuinit centaur_create_optimal_mcr(void)
-{
- int used;
- int i;
-
- /*
- * Allocate up to 6 mcrs to mark as much of ram as possible
- * as write combining and weak write ordered.
- *
- * To experiment with: Linux never uses stack operations for
- * mmio spaces so we could globally enable stack operation wc
- *
- * Load the registers with type 31 - full write combining, all
- * writes weakly ordered.
- */
- used = centaur_mcr_compute(6, 31);
-
- /*
- * Wipe unused MCRs
- */
- for (i = used; i < 8; i++)
- wrmsr(MSR_IDT_MCR0+i, 0, 0);
-}
-
-static void __cpuinit winchip2_create_optimal_mcr(void)
-{
- u32 lo, hi;
- int used;
- int i;
-
- /*
- * Allocate up to 6 mcrs to mark as much of ram as possible
- * as write combining, weak store ordered.
- *
- * Load the registers with type 25
- * 8 - weak write ordering
- * 16 - weak read ordering
- * 1 - write combining
- */
- used = centaur_mcr_compute(6, 25);
-
- /*
- * Mark the registers we are using.
- */
- rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
- for (i = 0; i < used; i++)
- lo |= 1<<(9+i);
- wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
-
- /*
- * Wipe unused MCRs
- */
-
- for (i = used; i < 8; i++)
- wrmsr(MSR_IDT_MCR0+i, 0, 0);
-}
-
-/*
- * Handle the MCR key on the Winchip 2.
- */
-static void __cpuinit winchip2_unprotect_mcr(void)
-{
- u32 lo, hi;
- u32 key;
-
- rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
- lo &= ~0x1C0; /* blank bits 8-6 */
- key = (lo>>17) & 7;
- lo |= key<<6; /* replace with unlock key */
- wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
-}
-
-static void __cpuinit winchip2_protect_mcr(void)
-{
- u32 lo, hi;
-
- rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
- lo &= ~0x1C0; /* blank bits 8-6 */
- wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
-}
-#endif /* CONFIG_X86_OOSTORE */
-
#define ACE_PRESENT (1 << 6)
#define ACE_ENABLED (1 << 7)
#define ACE_FCR (1 << 28) /* MSR_VIA_FCR */
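
The hunk above removes the CONFIG_X86_OOSTORE WinChip MCR setup, whose small
power2() helper rounded a value down to a power of two by repeated shifting.
As a side note, not kernel code and assuming GCC/Clang builtins, the same
result can be computed in constant time:

#include <stdint.h>

/* Round down to a power of two, like the removed power2(); 0 stays 0. */
static uint32_t round_down_pow2(uint32_t x)
{
        return x ? 1u << (31 - __builtin_clz(x)) : 0;
}
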
@@ -247,7 +19,7 @@ static void __cpuinit winchip2_protect_mcr(void)
#define RNG_ENABLED (1 << 3)
#define RNG_ENABLE (1 << 6) /* MSR_VIA_RNG */
-static void __cpuinit init_c3(struct cpuinfo_x86 *c)
+static void init_c3(struct cpuinfo_x86 *c)
{
u32 lo, hi;
@@ -260,7 +32,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
rdmsr(MSR_VIA_FCR, lo, hi);
lo |= ACE_FCR; /* enable ACE unit */
wrmsr(MSR_VIA_FCR, lo, hi);
- printk(KERN_INFO "CPU: Enabled ACE h/w crypto\n");
+ pr_info("CPU: Enabled ACE h/w crypto\n");
}
/* enable RNG unit, if present and disabled */
@@ -268,17 +40,17 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
rdmsr(MSR_VIA_RNG, lo, hi);
lo |= RNG_ENABLE; /* enable RNG unit */
wrmsr(MSR_VIA_RNG, lo, hi);
- printk(KERN_INFO "CPU: Enabled h/w RNG\n");
+ pr_info("CPU: Enabled h/w RNG\n");
}
/* store Centaur Extended Feature Flags as
* word 5 of the CPU capability bit array
*/
- c->x86_capability[5] = cpuid_edx(0xC0000001);
+ c->x86_capability[CPUID_C000_0001_EDX] = cpuid_edx(0xC0000001);
}
#ifdef CONFIG_X86_32
/* Cyrix III family needs CX8 & PGE explicitly enabled. */
- if (c->x86_model >= 6 && c->x86_model <= 9) {
+ if (c->x86_model >= 6 && c->x86_model <= 13) {
rdmsr(MSR_VIA_FCR, lo, hi);
lo |= (1<<1 | 1<<7);
wrmsr(MSR_VIA_FCR, lo, hi);
@@ -294,7 +66,8 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_REP_GOOD);
}
- display_cacheinfo(c);
+ if (c->x86 >= 7)
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
}
enum {
@@ -318,26 +91,27 @@ enum {
EAMD3D = 1<<20,
};
-static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
+static void early_init_centaur(struct cpuinfo_x86 *c)
{
- switch (c->x86) {
#ifdef CONFIG_X86_32
- case 5:
- /* Emulate MTRRs using Centaur's MCR. */
+ /* Emulate MTRRs using Centaur's MCR. */
+ if (c->x86 == 5)
set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR);
- break;
#endif
- case 6:
- if (c->x86_model >= 0xf)
- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
- break;
- }
+ if ((c->x86 == 6 && c->x86_model >= 0xf) ||
+ (c->x86 >= 7))
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+
#ifdef CONFIG_X86_64
set_cpu_cap(c, X86_FEATURE_SYSENTER32);
#endif
+ if (c->x86_power & (1 << 8)) {
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+ set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
+ }
}
-static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
+static void init_centaur(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_32
char *name;
@@ -353,33 +127,32 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
clear_cpu_cap(c, 0*32+31);
#endif
early_init_centaur(c);
- switch (c->x86) {
+ init_intel_cacheinfo(c);
+
+ if (c->cpuid_level > 9) {
+ unsigned int eax = cpuid_eax(10);
+
+ /*
+ * Check for version and the number of counters
+ * Version(eax[7:0]) can't be 0;
+ * Counters(eax[15:8]) should be greater than 1;
+ */
+ if ((eax & 0xff) && (((eax >> 8) & 0xff) > 1))
+ set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
+ }
+
#ifdef CONFIG_X86_32
- case 5:
+ if (c->x86 == 5) {
switch (c->x86_model) {
case 4:
name = "C6";
fcr_set = ECX8|DSMC|EDCTLB|EMMX|ERETSTK;
fcr_clr = DPDC;
- printk(KERN_NOTICE "Disabling bugged TSC.\n");
+ pr_notice("Disabling bugged TSC.\n");
clear_cpu_cap(c, X86_FEATURE_TSC);
-#ifdef CONFIG_X86_OOSTORE
- centaur_create_optimal_mcr();
- /*
- * Enable:
- * write combining on non-stack, non-string
- * write combining on string, all types
- * weak write ordering
- *
- * The C6 original lacks weak read order
- *
- * Note 0x120 is write only on Winchip 1
- */
- wrmsr(MSR_IDT_MCR_CTRL, 0x01F0001F, 0);
-#endif
break;
case 8:
- switch (c->x86_mask) {
+ switch (c->x86_stepping) {
default:
name = "2";
break;
@@ -393,40 +166,12 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
fcr_set = ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK|
E2MMX|EAMD3D;
fcr_clr = DPDC;
-#ifdef CONFIG_X86_OOSTORE
- winchip2_unprotect_mcr();
- winchip2_create_optimal_mcr();
- rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
- /*
- * Enable:
- * write combining on non-stack, non-string
- * write combining on string, all types
- * weak write ordering
- */
- lo |= 31;
- wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
- winchip2_protect_mcr();
-#endif
break;
case 9:
name = "3";
fcr_set = ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK|
E2MMX|EAMD3D;
fcr_clr = DPDC;
-#ifdef CONFIG_X86_OOSTORE
- winchip2_unprotect_mcr();
- winchip2_create_optimal_mcr();
- rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
- /*
- * Enable:
- * write combining on non-stack, non-string
- * write combining on string, all types
- * weak write ordering
- */
- lo |= 31;
- wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
- winchip2_protect_mcr();
-#endif
break;
default:
name = "??";
@@ -436,11 +181,11 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
newlo = (lo|fcr_set) & (~fcr_clr);
if (newlo != lo) {
- printk(KERN_INFO "Centaur FCR was 0x%X now 0x%X\n",
+ pr_info("Centaur FCR was 0x%X now 0x%X\n",
lo, newlo);
wrmsr(MSR_IDT_FCR1, newlo, hi);
} else {
- printk(KERN_INFO "Centaur FCR is 0x%X\n", lo);
+ pr_info("Centaur FCR is 0x%X\n", lo);
}
/* Emulate MTRRs using Centaur's MCR. */
set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR);
@@ -457,21 +202,21 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
c->x86_cache_size = (cc>>24)+(dd>>24);
}
sprintf(c->x86_model_id, "WinChip %s", name);
- break;
+ }
#endif
- case 6:
+ if (c->x86 == 6 || c->x86 >= 7)
init_c3(c);
- break;
- }
#ifdef CONFIG_X86_64
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
#endif
+
+ init_ia32_feat_ctl(c);
}
-static unsigned int __cpuinit
+#ifdef CONFIG_X86_32
+static unsigned int
centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
{
-#ifdef CONFIG_X86_32
/* VIA C3 CPUs (670-68F) need further shifting. */
if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8)))
size >>= 8;
@@ -482,18 +227,20 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
* - Note, it seems this may only be in engineering samples.
*/
if ((c->x86 == 6) && (c->x86_model == 9) &&
- (c->x86_mask == 1) && (size == 65))
+ (c->x86_stepping == 1) && (size == 65))
size -= 1;
-#endif
return size;
}
+#endif
-static const struct cpu_dev __cpuinitconst centaur_cpu_dev = {
+static const struct cpu_dev centaur_cpu_dev = {
.c_vendor = "Centaur",
.c_ident = { "CentaurHauls" },
.c_early_init = early_init_centaur,
.c_init = init_centaur,
- .c_size_cache = centaur_size_cache,
+#ifdef CONFIG_X86_32
+ .legacy_cache_size = centaur_size_cache,
+#endif
.c_x86_vendor = X86_VENDOR_CENTAUR,
};
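
The reworked init_centaur() above probes architectural perfmon via CPUID(0xA):
the version in EAX[7:0] must be non-zero and the counter count in EAX[15:8]
greater than one. A user-space sketch of the same check, not part of the patch
and assuming GCC/Clang's <cpuid.h>:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid(0xa, &eax, &ebx, &ecx, &edx))
                return 1;

        unsigned int version  = eax & 0xff;             /* EAX[7:0]  */
        unsigned int counters = (eax >> 8) & 0xff;      /* EAX[15:8] */

        /* Same condition as the patch: version != 0 and more than one counter */
        if (version && counters > 1)
                printf("architectural perfmon v%u, %u counters\n", version, counters);
        return 0;
}
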
diff --git a/arch/x86/kernel/cpu/cmpxchg.c b/arch/x86/kernel/cpu/cmpxchg.c
deleted file mode 100644
index 2056ccf572cc..000000000000
--- a/arch/x86/kernel/cpu/cmpxchg.c
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * cmpxchg*() fallbacks for CPU not supporting these instructions
- */
-
-#include <linux/kernel.h>
-#include <linux/smp.h>
-#include <linux/module.h>
-
-#ifndef CONFIG_X86_CMPXCHG
-unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new)
-{
- u8 prev;
- unsigned long flags;
-
- /* Poor man's cmpxchg for 386. Unsuitable for SMP */
- local_irq_save(flags);
- prev = *(u8 *)ptr;
- if (prev == old)
- *(u8 *)ptr = new;
- local_irq_restore(flags);
- return prev;
-}
-EXPORT_SYMBOL(cmpxchg_386_u8);
-
-unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new)
-{
- u16 prev;
- unsigned long flags;
-
- /* Poor man's cmpxchg for 386. Unsuitable for SMP */
- local_irq_save(flags);
- prev = *(u16 *)ptr;
- if (prev == old)
- *(u16 *)ptr = new;
- local_irq_restore(flags);
- return prev;
-}
-EXPORT_SYMBOL(cmpxchg_386_u16);
-
-unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
-{
- u32 prev;
- unsigned long flags;
-
- /* Poor man's cmpxchg for 386. Unsuitable for SMP */
- local_irq_save(flags);
- prev = *(u32 *)ptr;
- if (prev == old)
- *(u32 *)ptr = new;
- local_irq_restore(flags);
- return prev;
-}
-EXPORT_SYMBOL(cmpxchg_386_u32);
-#endif
-
-#ifndef CONFIG_X86_CMPXCHG64
-unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new)
-{
- u64 prev;
- unsigned long flags;
-
- /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */
- local_irq_save(flags);
- prev = *(u64 *)ptr;
- if (prev == old)
- *(u64 *)ptr = new;
- local_irq_restore(flags);
- return prev;
-}
-EXPORT_SYMBOL(cmpxchg_486_u64);
-#endif
-
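
The deleted helpers emulated cmpxchg on 386/486-class CPUs by briefly
disabling interrupts around a load/compare/store, which, as their comments
note, is unsuitable for SMP. A minimal sketch, not kernel code, of what the
real instruction provides, expressed with the GCC/Clang __atomic builtin:

#include <stdint.h>

/* Like cmpxchg: atomically store new_val if *ptr == old, return the old value. */
static uint32_t cmpxchg_u32(volatile uint32_t *ptr, uint32_t old, uint32_t new_val)
{
        __atomic_compare_exchange_n(ptr, &old, new_val, 0,
                                    __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
        return old;     /* updated to the previous value on failure */
}
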
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 5ce60a88027b..e7ab22fce3b5 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1,68 +1,188 @@
-#include <linux/bootmem.h>
+// SPDX-License-Identifier: GPL-2.0-only
+/* cpu_feature_enabled() cannot be used this early */
+#define USE_EARLY_PGTABLE_L5
+
+#include <linux/memblock.h>
#include <linux/linkage.h>
#include <linux/bitops.h>
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
+#include <linux/kvm_types.h>
#include <linux/percpu.h>
#include <linux/string.h>
+#include <linux/ctype.h>
#include <linux/delay.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/task.h>
+#include <linux/sched/smt.h>
#include <linux/init.h>
+#include <linux/kprobes.h>
#include <linux/kgdb.h>
+#include <linux/mem_encrypt.h>
#include <linux/smp.h>
+#include <linux/cpu.h>
#include <linux/io.h>
-
-#include <asm/stackprotector.h>
-#include <asm/perf_counter.h>
+#include <linux/syscore_ops.h>
+#include <linux/pgtable.h>
+#include <linux/stackprotector.h>
+#include <linux/utsname.h>
+#include <linux/efi.h>
+
+#include <asm/alternative.h>
+#include <asm/cmdline.h>
+#include <asm/cpuid/api.h>
+#include <asm/perf_event.h>
#include <asm/mmu_context.h>
+#include <asm/doublefault.h>
+#include <asm/archrandom.h>
#include <asm/hypervisor.h>
#include <asm/processor.h>
+#include <asm/tlbflush.h>
+#include <asm/debugreg.h>
#include <asm/sections.h>
-#include <asm/topology.h>
-#include <asm/cpumask.h>
-#include <asm/pgtable.h>
-#include <asm/atomic.h>
+#include <asm/vsyscall.h>
+#include <linux/topology.h>
+#include <linux/cpumask.h>
+#include <linux/atomic.h>
#include <asm/proto.h>
#include <asm/setup.h>
#include <asm/apic.h>
#include <asm/desc.h>
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
#include <asm/mtrr.h>
+#include <asm/hwcap2.h>
+#include <linux/numa.h>
#include <asm/numa.h>
#include <asm/asm.h>
+#include <asm/bugs.h>
#include <asm/cpu.h>
#include <asm/mce.h>
#include <asm/msr.h>
-#include <asm/pat.h>
-#include <asm/smp.h>
-
-#ifdef CONFIG_X86_LOCAL_APIC
+#include <asm/cacheinfo.h>
+#include <asm/memtype.h>
+#include <asm/microcode.h>
+#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
+#include <asm/fred.h>
#include <asm/uv/uv.h>
-#endif
+#include <asm/ia32.h>
+#include <asm/set_memory.h>
+#include <asm/traps.h>
+#include <asm/sev.h>
+#include <asm/tdx.h>
+#include <asm/posted_intr.h>
+#include <asm/runtime-const.h>
#include "cpu.h"
-/* all of these masks are initialized in setup_cpu_local_masks() */
-cpumask_var_t cpu_initialized_mask;
-cpumask_var_t cpu_callout_mask;
-cpumask_var_t cpu_callin_mask;
+DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
+EXPORT_PER_CPU_SYMBOL(cpu_info);
+
+/* Used for modules: built-in code uses runtime constants */
+unsigned long USER_PTR_MAX;
+EXPORT_SYMBOL(USER_PTR_MAX);
+
+u32 elf_hwcap2 __read_mostly;
+
+/* Number of siblings per CPU package */
+unsigned int __max_threads_per_core __ro_after_init = 1;
+EXPORT_SYMBOL(__max_threads_per_core);
+
+unsigned int __max_dies_per_package __ro_after_init = 1;
+EXPORT_SYMBOL(__max_dies_per_package);
+
+unsigned int __max_logical_packages __ro_after_init = 1;
+EXPORT_SYMBOL(__max_logical_packages);
+
+unsigned int __num_cores_per_package __ro_after_init = 1;
+EXPORT_SYMBOL(__num_cores_per_package);
+
+unsigned int __num_threads_per_package __ro_after_init = 1;
+EXPORT_SYMBOL(__num_threads_per_package);
+
+static struct ppin_info {
+ int feature;
+ int msr_ppin_ctl;
+ int msr_ppin;
+} ppin_info[] = {
+ [X86_VENDOR_INTEL] = {
+ .feature = X86_FEATURE_INTEL_PPIN,
+ .msr_ppin_ctl = MSR_PPIN_CTL,
+ .msr_ppin = MSR_PPIN
+ },
+ [X86_VENDOR_AMD] = {
+ .feature = X86_FEATURE_AMD_PPIN,
+ .msr_ppin_ctl = MSR_AMD_PPIN_CTL,
+ .msr_ppin = MSR_AMD_PPIN
+ },
+};
-/* representing cpus for which sibling maps can be computed */
-cpumask_var_t cpu_sibling_setup_mask;
+static const struct x86_cpu_id ppin_cpuids[] = {
+ X86_MATCH_FEATURE(X86_FEATURE_AMD_PPIN, &ppin_info[X86_VENDOR_AMD]),
+ X86_MATCH_FEATURE(X86_FEATURE_INTEL_PPIN, &ppin_info[X86_VENDOR_INTEL]),
+
+ /* Legacy models without CPUID enumeration */
+ X86_MATCH_VFM(INTEL_IVYBRIDGE_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_HASWELL_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_BROADWELL_D, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_BROADWELL_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_SKYLAKE_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_ICELAKE_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_ICELAKE_D, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNL, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNM, &ppin_info[X86_VENDOR_INTEL]),
+
+ {}
+};
-/* correctly size the local cpu masks */
-void __init setup_cpu_local_masks(void)
+static void ppin_init(struct cpuinfo_x86 *c)
{
- alloc_bootmem_cpumask_var(&cpu_initialized_mask);
- alloc_bootmem_cpumask_var(&cpu_callin_mask);
- alloc_bootmem_cpumask_var(&cpu_callout_mask);
- alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
+ const struct x86_cpu_id *id;
+ unsigned long long val;
+ struct ppin_info *info;
+
+ id = x86_match_cpu(ppin_cpuids);
+ if (!id)
+ return;
+
+ /*
+ * Testing the presence of the MSR is not enough. It must also be
+ * checked that PPIN_CTL allows the PPIN to be read.
+ */
+ info = (struct ppin_info *)id->driver_data;
+
+ if (rdmsrq_safe(info->msr_ppin_ctl, &val))
+ goto clear_ppin;
+
+ if ((val & 3UL) == 1UL) {
+ /* PPIN locked in disabled mode */
+ goto clear_ppin;
+ }
+
+ /* If PPIN is disabled, try to enable */
+ if (!(val & 2UL)) {
+ wrmsrq_safe(info->msr_ppin_ctl, val | 2UL);
+ rdmsrq_safe(info->msr_ppin_ctl, &val);
+ }
+
+ /* Is the enable bit set? */
+ if (val & 2UL) {
+ c->ppin = native_rdmsrq(info->msr_ppin);
+ set_cpu_cap(c, info->feature);
+ return;
+ }
+
+clear_ppin:
+ setup_clear_cpu_cap(info->feature);
}
-static void __cpuinit default_init(struct cpuinfo_x86 *c)
+static void default_init(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_64
- display_cacheinfo(c);
+ cpu_detect_cache_sizes(c);
#else
/* Not much we can do here... */
/* Check if at least it has cpuid */
@@ -76,13 +196,13 @@ static void __cpuinit default_init(struct cpuinfo_x86 *c)
#endif
}
-static const struct cpu_dev __cpuinitconst default_cpu = {
+static const struct cpu_dev default_cpu = {
.c_init = default_init,
.c_vendor = "Unknown",
.c_x86_vendor = X86_VENDOR_UNKNOWN,
};
-static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
+static const struct cpu_dev *this_cpu = &default_cpu;
DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
#ifdef CONFIG_X86_64
@@ -94,87 +214,83 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
* TLS descriptors are currently at a different place compared to i386.
* Hopefully nobody expects them at a fixed place (Wine?)
*/
- [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
- [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
- [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
- [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
- [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
- [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
+ [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(DESC_CODE32, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(DESC_CODE64, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(DESC_DATA64, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(DESC_CODE32 | DESC_USER, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(DESC_DATA64 | DESC_USER, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(DESC_CODE64 | DESC_USER, 0, 0xfffff),
#else
- [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
- [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
- [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
- [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } },
+ [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(DESC_CODE32, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(DESC_DATA32, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(DESC_CODE32 | DESC_USER, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(DESC_DATA32 | DESC_USER, 0, 0xfffff),
/*
* Segments used for calling PnP BIOS have byte granularity.
 * The code segments and data segments have fixed 64k limits,
* the transfer segment sizes are set at run time.
*/
- /* 32-bit code */
- [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } },
- /* 16-bit code */
- [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } },
- /* 16-bit data */
- [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } },
- /* 16-bit data */
- [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } },
- /* 16-bit data */
- [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } },
+ [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(DESC_CODE32_BIOS, 0, 0xffff),
+ [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(DESC_CODE16, 0, 0xffff),
+ [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(DESC_DATA16, 0, 0xffff),
+ [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(DESC_DATA16, 0, 0),
+ [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(DESC_DATA16, 0, 0),
/*
* The APM segments have byte granularity and their bases
* are set at run time. All have 64k limits.
*/
- /* 32-bit code */
- [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } },
- /* 16-bit code */
- [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } },
- /* data */
- [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
-
- [GDT_ENTRY_ESPFIX_SS] = { { { 0x0000ffff, 0x00cf9200 } } },
- [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } },
- GDT_STACK_CANARY_INIT
+ [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(DESC_CODE32_BIOS, 0, 0xffff),
+ [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(DESC_CODE16, 0, 0xffff),
+ [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(DESC_DATA32_BIOS, 0, 0xffff),
+
+ [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(DESC_DATA32, 0, 0xfffff),
+ [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(DESC_DATA32, 0, 0xfffff),
#endif
} };
EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
+SYM_PIC_ALIAS(gdt_page);
-static int __init x86_xsave_setup(char *s)
+#ifdef CONFIG_X86_64
+static int __init x86_nopcid_setup(char *s)
{
- setup_clear_cpu_cap(X86_FEATURE_XSAVE);
- return 1;
-}
-__setup("noxsave", x86_xsave_setup);
+ /* nopcid doesn't accept parameters */
+ if (s)
+ return -EINVAL;
-#ifdef CONFIG_X86_32
-static int cachesize_override __cpuinitdata = -1;
-static int disable_x86_serial_nr __cpuinitdata = 1;
+ /* do not emit a message if the feature is not present */
+ if (!boot_cpu_has(X86_FEATURE_PCID))
+ return 0;
-static int __init cachesize_setup(char *str)
-{
- get_option(&str, &cachesize_override);
- return 1;
+ setup_clear_cpu_cap(X86_FEATURE_PCID);
+ pr_info("nopcid: PCID feature disabled\n");
+ return 0;
}
-__setup("cachesize=", cachesize_setup);
+early_param("nopcid", x86_nopcid_setup);
+#endif
-static int __init x86_fxsr_setup(char *s)
+static int __init x86_noinvpcid_setup(char *s)
{
- setup_clear_cpu_cap(X86_FEATURE_FXSR);
- setup_clear_cpu_cap(X86_FEATURE_XMM);
- return 1;
-}
-__setup("nofxsr", x86_fxsr_setup);
+ /* noinvpcid doesn't accept parameters */
+ if (s)
+ return -EINVAL;
-static int __init x86_sep_setup(char *s)
-{
- setup_clear_cpu_cap(X86_FEATURE_SEP);
- return 1;
+ /* do not emit a message if the feature is not present */
+ if (!boot_cpu_has(X86_FEATURE_INVPCID))
+ return 0;
+
+ setup_clear_cpu_cap(X86_FEATURE_INVPCID);
+ pr_info("noinvpcid: INVPCID feature disabled\n");
+ return 0;
}
-__setup("nosep", x86_sep_setup);
+early_param("noinvpcid", x86_noinvpcid_setup);
/* Standard macro to see if a specific flag is changeable */
-static inline int flag_is_changeable_p(u32 flag)
+static inline bool flag_is_changeable_p(unsigned long flag)
{
- u32 f1, f2;
+ unsigned long f1, f2;
+
+ if (!IS_ENABLED(CONFIG_X86_32))
+ return true;
/*
* Cyrix and IDT cpus allow disabling of CPUID
@@ -197,16 +313,27 @@ static inline int flag_is_changeable_p(u32 flag)
: "=&r" (f1), "=&r" (f2)
: "ir" (flag));
- return ((f1^f2) & flag) != 0;
+ return (f1 ^ f2) & flag;
}
+#ifdef CONFIG_X86_32
+static int cachesize_override = -1;
+static int disable_x86_serial_nr = 1;
+
+static int __init cachesize_setup(char *str)
+{
+ get_option(&str, &cachesize_override);
+ return 1;
+}
+__setup("cachesize=", cachesize_setup);
+
/* Probe for the CPUID instruction */
-static int __cpuinit have_cpuid_p(void)
+bool cpuid_feature(void)
{
return flag_is_changeable_p(X86_EFLAGS_ID);
}
-static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
+static void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
{
unsigned long lo, hi;
@@ -219,7 +346,7 @@ static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
lo |= 0x200000;
wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
- printk(KERN_NOTICE "CPU serial number disabled.\n");
+ pr_notice("CPU serial number disabled.\n");
clear_cpu_cap(c, X86_FEATURE_PN);
/* Disabling the serial number may affect the cpuid level */
@@ -233,20 +360,300 @@ static int __init x86_serial_nr_setup(char *s)
}
__setup("serialnumber", x86_serial_nr_setup);
#else
-static inline int flag_is_changeable_p(u32 flag)
+static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
+{
+}
+#endif
+
+static __always_inline void setup_smep(struct cpuinfo_x86 *c)
+{
+ if (cpu_has(c, X86_FEATURE_SMEP))
+ cr4_set_bits(X86_CR4_SMEP);
+}
+
+static __always_inline void setup_smap(struct cpuinfo_x86 *c)
+{
+ unsigned long eflags = native_save_fl();
+
+ /* This should have been cleared long ago */
+ BUG_ON(eflags & X86_EFLAGS_AC);
+
+ if (cpu_has(c, X86_FEATURE_SMAP))
+ cr4_set_bits(X86_CR4_SMAP);
+}
+
+static __always_inline void setup_umip(struct cpuinfo_x86 *c)
+{
+ /* Check the boot processor, plus build option for UMIP. */
+ if (!cpu_feature_enabled(X86_FEATURE_UMIP))
+ goto out;
+
+ /* Check the current processor's cpuid bits. */
+ if (!cpu_has(c, X86_FEATURE_UMIP))
+ goto out;
+
+ cr4_set_bits(X86_CR4_UMIP);
+
+ pr_info_once("x86/cpu: User Mode Instruction Prevention (UMIP) activated\n");
+
+ return;
+
+out:
+ /*
+ * Make sure UMIP is disabled in case it was enabled in a
+ * previous boot (e.g., via kexec).
+ */
+ cr4_clear_bits(X86_CR4_UMIP);
+}
+
+static __always_inline void setup_lass(struct cpuinfo_x86 *c)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_LASS))
+ return;
+
+ /*
+ * Legacy vsyscall page access causes a #GP when LASS is active.
+ * Disable LASS because the #GP handler doesn't support vsyscall
+ * emulation.
+ *
+ * Also disable LASS when running under EFI, as some runtime and
+ * boot services rely on 1:1 mappings in the lower half.
+ */
+ if (IS_ENABLED(CONFIG_X86_VSYSCALL_EMULATION) ||
+ IS_ENABLED(CONFIG_EFI)) {
+ setup_clear_cpu_cap(X86_FEATURE_LASS);
+ return;
+ }
+
+ cr4_set_bits(X86_CR4_LASS);
+}
+
+/* These bits should not change their value after CPU init is finished. */
+static const unsigned long cr4_pinned_mask = X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP |
+ X86_CR4_FSGSBASE | X86_CR4_CET | X86_CR4_FRED;
+static DEFINE_STATIC_KEY_FALSE_RO(cr_pinning);
+static unsigned long cr4_pinned_bits __ro_after_init;
+
+void native_write_cr0(unsigned long val)
+{
+ unsigned long bits_missing = 0;
+
+set_register:
+ asm volatile("mov %0,%%cr0": "+r" (val) : : "memory");
+
+ if (static_branch_likely(&cr_pinning)) {
+ if (unlikely((val & X86_CR0_WP) != X86_CR0_WP)) {
+ bits_missing = X86_CR0_WP;
+ val |= bits_missing;
+ goto set_register;
+ }
+ /* Warn after we've set the missing bits. */
+ WARN_ONCE(bits_missing, "CR0 WP bit went missing!?\n");
+ }
+}
+EXPORT_SYMBOL(native_write_cr0);
+
+void __no_profile native_write_cr4(unsigned long val)
{
+ unsigned long bits_changed = 0;
+
+set_register:
+ asm volatile("mov %0,%%cr4": "+r" (val) : : "memory");
+
+ if (static_branch_likely(&cr_pinning)) {
+ if (unlikely((val & cr4_pinned_mask) != cr4_pinned_bits)) {
+ bits_changed = (val & cr4_pinned_mask) ^ cr4_pinned_bits;
+ val = (val & ~cr4_pinned_mask) | cr4_pinned_bits;
+ goto set_register;
+ }
+ /* Warn after we've corrected the changed bits. */
+ WARN_ONCE(bits_changed, "pinned CR4 bits changed: 0x%lx!?\n",
+ bits_changed);
+ }
+}
+#if IS_MODULE(CONFIG_LKDTM)
+EXPORT_SYMBOL_GPL(native_write_cr4);
+#endif
+
+void cr4_update_irqsoff(unsigned long set, unsigned long clear)
+{
+ unsigned long newval, cr4 = this_cpu_read(cpu_tlbstate.cr4);
+
+ lockdep_assert_irqs_disabled();
+
+ newval = (cr4 & ~clear) | set;
+ if (newval != cr4) {
+ this_cpu_write(cpu_tlbstate.cr4, newval);
+ __write_cr4(newval);
+ }
+}
+EXPORT_SYMBOL_FOR_KVM(cr4_update_irqsoff);
+
+/* Read the CR4 shadow. */
+unsigned long cr4_read_shadow(void)
+{
+ return this_cpu_read(cpu_tlbstate.cr4);
+}
+EXPORT_SYMBOL_FOR_KVM(cr4_read_shadow);
+
+void cr4_init(void)
+{
+ unsigned long cr4 = __read_cr4();
+
+ if (boot_cpu_has(X86_FEATURE_PCID))
+ cr4 |= X86_CR4_PCIDE;
+ if (static_branch_likely(&cr_pinning))
+ cr4 = (cr4 & ~cr4_pinned_mask) | cr4_pinned_bits;
+
+ __write_cr4(cr4);
+
+ /* Initialize cr4 shadow for this CPU. */
+ this_cpu_write(cpu_tlbstate.cr4, cr4);
+}
+
+/*
+ * Once CPU feature detection is finished (and boot params have been
+ * parsed), record any of the sensitive CR bits that are set, and
+ * enable CR pinning.
+ */
+static void __init setup_cr_pinning(void)
+{
+ cr4_pinned_bits = this_cpu_read(cpu_tlbstate.cr4) & cr4_pinned_mask;
+ static_key_enable(&cr_pinning.key);
+}
+
+static __init int x86_nofsgsbase_setup(char *arg)
+{
+ /* Require an exact match without trailing characters. */
+ if (strlen(arg))
+ return 0;
+
+ /* Do not emit a message if the feature is not present. */
+ if (!boot_cpu_has(X86_FEATURE_FSGSBASE))
+ return 1;
+
+ setup_clear_cpu_cap(X86_FEATURE_FSGSBASE);
+ pr_info("FSGSBASE disabled via kernel command line\n");
return 1;
}
-/* Probe for the CPUID instruction */
-static inline int have_cpuid_p(void)
+__setup("nofsgsbase", x86_nofsgsbase_setup);
+
+/*
+ * Protection Keys are not available in 32-bit mode.
+ */
+static bool pku_disabled;
+
+static __always_inline void setup_pku(struct cpuinfo_x86 *c)
+{
+ if (c == &boot_cpu_data) {
+ if (pku_disabled || !cpu_feature_enabled(X86_FEATURE_PKU))
+ return;
+ /*
+ * Setting CR4.PKE will cause the X86_FEATURE_OSPKE cpuid
+ * bit to be set. Enforce it.
+ */
+ setup_force_cpu_cap(X86_FEATURE_OSPKE);
+
+ } else if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) {
+ return;
+ }
+
+ cr4_set_bits(X86_CR4_PKE);
+ /* Load the default PKRU value */
+ pkru_write_default();
+}
+
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+static __init int setup_disable_pku(char *arg)
{
+ /*
+ * Do not clear the X86_FEATURE_PKU bit. All of the
+ * runtime checks are against OSPKE so clearing the
+ * bit does nothing.
+ *
+ * This way, we will see "pku" in cpuinfo, but not
+ * "ospke", which is exactly what we want. It shows
+ * that the CPU has PKU, but the OS has not enabled it.
+ * This happens to be exactly how a system would look
+ * if we disabled the config option.
+ */
+ pr_info("x86: 'nopku' specified, disabling Memory Protection Keys\n");
+ pku_disabled = true;
return 1;
}
-static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
+__setup("nopku", setup_disable_pku);
+#endif
+
+#ifdef CONFIG_X86_KERNEL_IBT
+
+__noendbr u64 ibt_save(bool disable)
{
+ u64 msr = 0;
+
+ if (cpu_feature_enabled(X86_FEATURE_IBT)) {
+ rdmsrq(MSR_IA32_S_CET, msr);
+ if (disable)
+ wrmsrq(MSR_IA32_S_CET, msr & ~CET_ENDBR_EN);
+ }
+
+ return msr;
}
+
+__noendbr void ibt_restore(u64 save)
+{
+ u64 msr;
+
+ if (cpu_feature_enabled(X86_FEATURE_IBT)) {
+ rdmsrq(MSR_IA32_S_CET, msr);
+ msr &= ~CET_ENDBR_EN;
+ msr |= (save & CET_ENDBR_EN);
+ wrmsrq(MSR_IA32_S_CET, msr);
+ }
+}
+
#endif
+static __always_inline void setup_cet(struct cpuinfo_x86 *c)
+{
+ bool user_shstk, kernel_ibt;
+
+ if (!IS_ENABLED(CONFIG_X86_CET))
+ return;
+
+ kernel_ibt = HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT);
+ user_shstk = cpu_feature_enabled(X86_FEATURE_SHSTK) &&
+ IS_ENABLED(CONFIG_X86_USER_SHADOW_STACK);
+
+ if (!kernel_ibt && !user_shstk)
+ return;
+
+ if (user_shstk)
+ set_cpu_cap(c, X86_FEATURE_USER_SHSTK);
+
+ if (kernel_ibt)
+ wrmsrq(MSR_IA32_S_CET, CET_ENDBR_EN);
+ else
+ wrmsrq(MSR_IA32_S_CET, 0);
+
+ cr4_set_bits(X86_CR4_CET);
+
+ if (kernel_ibt && ibt_selftest()) {
+ pr_err("IBT selftest: Failed!\n");
+ wrmsrq(MSR_IA32_S_CET, 0);
+ setup_clear_cpu_cap(X86_FEATURE_IBT);
+ }
+}
+
+__noendbr void cet_disable(void)
+{
+ if (!(cpu_feature_enabled(X86_FEATURE_IBT) ||
+ cpu_feature_enabled(X86_FEATURE_SHSTK)))
+ return;
+
+ wrmsrq(MSR_IA32_S_CET, 0);
+ wrmsrq(MSR_IA32_U_CET, 0);
+}
+
/*
* Some CPU features depend on higher CPUID levels, which may not always
* be available due to CPUID level capping or broken virtualization
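
The native_write_cr0()/native_write_cr4() additions above implement CR
pinning: after the register write, any pinned bit that went missing is put
back, the write is retried, and the warning is emitted only afterwards. A
minimal sketch of that pattern in isolation, not kernel code; pinned_mask,
pinned_bits and write_reg() are stand-ins:

#include <stdio.h>

static unsigned long hw_reg;                            /* stand-in for %cr4 */
static const unsigned long pinned_mask = 1UL << 20;     /* pretend SMEP bit */
static const unsigned long pinned_bits = 1UL << 20;

static void write_reg(unsigned long val)
{
        hw_reg = val;                   /* stand-in for the mov to %cr4 */
}

static void pinned_write(unsigned long val)
{
        unsigned long bits_changed = 0;
set_register:
        write_reg(val);
        if ((val & pinned_mask) != pinned_bits) {
                bits_changed = (val & pinned_mask) ^ pinned_bits;
                val = (val & ~pinned_mask) | pinned_bits;
                goto set_register;
        }
        /* Warn only after the pinned bits have been restored. */
        if (bits_changed)
                fprintf(stderr, "pinned bits changed: 0x%lx\n", bits_changed);
}
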
@@ -257,15 +664,15 @@ struct cpuid_dependent_feature {
u32 level;
};
-static const struct cpuid_dependent_feature __cpuinitconst
+static const struct cpuid_dependent_feature
cpuid_dependent_features[] = {
- { X86_FEATURE_MWAIT, 0x00000005 },
- { X86_FEATURE_DCA, 0x00000009 },
- { X86_FEATURE_XSAVE, 0x0000000d },
+ { X86_FEATURE_MWAIT, CPUID_LEAF_MWAIT },
+ { X86_FEATURE_DCA, CPUID_LEAF_DCA },
+ { X86_FEATURE_XSAVE, CPUID_LEAF_XSTATE },
{ 0, 0 }
};
-static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
+static void filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
{
const struct cpuid_dependent_feature *df;
@@ -289,9 +696,8 @@ static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
if (!warn)
continue;
- printk(KERN_WARNING
- "CPU: CPU feature %s disabled, no CPUID level 0x%x\n",
- x86_cap_flags[df->feature], df->level);
+ pr_warn("CPU: CPU feature %s disabled, no CPUID level 0x%x\n",
+ x86_cap_flags[df->feature], df->level);
}
}
@@ -303,9 +709,10 @@ static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
*/
/* Look up CPU names by table lookup. */
-static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c)
+static const char *table_lookup_model(struct cpuinfo_x86 *c)
{
- const struct cpu_model_info *info;
+#ifdef CONFIG_X86_32
+ const struct legacy_cpu_model_info *info;
if (c->x86_model >= 16)
return NULL; /* Range check */
@@ -313,52 +720,95 @@ static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c)
if (!this_cpu)
return NULL;
- info = this_cpu->c_models;
+ info = this_cpu->legacy_models;
- while (info && info->family) {
+ while (info->family) {
if (info->family == c->x86)
return info->model_names[c->x86_model];
info++;
}
+#endif
return NULL; /* Not found */
}
-__u32 cpu_caps_cleared[NCAPINTS] __cpuinitdata;
-__u32 cpu_caps_set[NCAPINTS] __cpuinitdata;
+/* Aligned to unsigned long to avoid split lock in atomic bitmap ops */
+__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long));
+__u32 cpu_caps_set[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long));
-void load_percpu_segment(int cpu)
-{
#ifdef CONFIG_X86_32
- loadsegment(fs, __KERNEL_PERCPU);
-#else
- loadsegment(gs, 0);
- wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu));
+/* The 32-bit entry code needs to find cpu_entry_area. */
+DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
#endif
- load_stack_canary_segment();
+
+/* Load the original GDT from the per-cpu structure */
+void load_direct_gdt(int cpu)
+{
+ struct desc_ptr gdt_descr;
+
+ gdt_descr.address = (long)get_cpu_gdt_rw(cpu);
+ gdt_descr.size = GDT_SIZE - 1;
+ load_gdt(&gdt_descr);
}
+EXPORT_SYMBOL_FOR_KVM(load_direct_gdt);
-/*
- * Current gdt points %fs at the "master" per-cpu area: after this,
- * it's on the real one.
- */
-void switch_to_new_gdt(int cpu)
+/* Load a fixmap remapping of the per-cpu GDT */
+void load_fixmap_gdt(int cpu)
{
struct desc_ptr gdt_descr;
- gdt_descr.address = (long)get_cpu_gdt_table(cpu);
+ gdt_descr.address = (long)get_cpu_gdt_ro(cpu);
gdt_descr.size = GDT_SIZE - 1;
load_gdt(&gdt_descr);
- /* Reload the per-cpu base */
+}
+EXPORT_SYMBOL_GPL(load_fixmap_gdt);
+
+/**
+ * switch_gdt_and_percpu_base - Switch to direct GDT and runtime per CPU base
+ * @cpu: The CPU number for which this is invoked
+ *
+ * Invoked during early boot to switch from early GDT and early per CPU to
+ * the direct GDT and the runtime per CPU area. On 32-bit the per-CPU base
+ * switch happens implicitly when the direct GDT is loaded. On 64-bit it
+ * requires updating GSBASE.
+ */
+void __init switch_gdt_and_percpu_base(int cpu)
+{
+ load_direct_gdt(cpu);
- load_percpu_segment(cpu);
+#ifdef CONFIG_X86_64
+ /*
+ * No need to load %gs. It is already correct.
+ *
+ * Writing %gs on 64-bit would zero GSBASE, which would make any per-CPU
+ * operation fault up to the point of the wrmsrq().
+ *
+ * Set GSBASE to the new offset. Until the wrmsrq() happens the
+ * early mapping is still valid. That means the GSBASE update will
+ * lose any prior per CPU data which was not copied over in
+ * setup_per_cpu_areas().
+ *
+ * This works even with stackprotector enabled because the
+ * per CPU stack canary is 0 in both per CPU areas.
+ */
+ wrmsrq(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu));
+#else
+ /*
+ * %fs is already set to __KERNEL_PERCPU, but after switching GDT
+ * it is required to load FS again so that the 'hidden' part is
+ * updated from the new GDT. Up to this point the early per CPU
+ * translation is active. Any content of the early per CPU data
+ * which was not copied over in setup_per_cpu_areas() is lost.
+ */
+ loadsegment(fs, __KERNEL_PERCPU);
+#endif
}
-static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {};
+static const struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
-static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
+static void get_model_name(struct cpuinfo_x86 *c)
{
unsigned int *v;
- char *p, *q;
+ char *p, *q, *s;
if (c->extended_cpuid_level < 0x80000004)
return;
@@ -369,22 +819,24 @@ static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
c->x86_model_id[48] = 0;
- /*
- * Intel chips right-justify this string for some dumb reason;
- * undo that brain damage:
- */
- p = q = &c->x86_model_id[0];
+ /* Trim whitespace */
+ p = q = s = &c->x86_model_id[0];
+
while (*p == ' ')
p++;
- if (p != q) {
- while (*p)
- *q++ = *p++;
- while (q <= &c->x86_model_id[48])
- *q++ = '\0'; /* Zero-pad the rest */
+
+ while (*p) {
+ /* Note the last non-whitespace index */
+ if (!isspace(*p))
+ s = q;
+
+ *q++ = *p++;
}
+
+ *(s + 1) = '\0';
}
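
The replacement loop above trims leading and trailing whitespace in a single pass. A minimal standalone sketch of the same technique (plain userspace C, not part of the patch):

#include <ctype.h>
#include <stdio.h>

static void trim_model_id(char *id)
{
	char *p = id, *q = id, *s = id;

	while (*p == ' ')
		p++;

	while (*p) {
		if (!isspace((unsigned char)*p))
			s = q;		/* last non-whitespace written so far */
		*q++ = *p++;
	}
	*(s + 1) = '\0';		/* cut everything after it */
}

int main(void)
{
	char id[] = "      Intel(R) Xeon(R) CPU           ";

	trim_model_id(id);
	printf("'%s'\n", id);		/* -> 'Intel(R) Xeon(R) CPU' */
	return 0;
}
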
-void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
+void cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
{
unsigned int n, dummy, ebx, ecx, edx, l2size;
@@ -392,8 +844,6 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
if (n >= 0x80000005) {
cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
- printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
- edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
c->x86_cache_size = (ecx>>24) + (edx>>24);
#ifdef CONFIG_X86_64
/* On K8 L1 TLB is inclusive, so don't count it */
@@ -411,8 +861,8 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
#else
/* do processor-specific cache resizing */
- if (this_cpu->c_size_cache)
- l2size = this_cpu->c_size_cache(c, l2size);
+ if (this_cpu->legacy_cache_size)
+ l2size = this_cpu->legacy_cache_size(c, l2size);
/* Allow user to override all this if necessary. */
if (cachesize_override != -1)
@@ -423,68 +873,29 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
#endif
c->x86_cache_size = l2size;
-
- printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
- l2size, ecx & 0xFF);
}
-void __cpuinit detect_ht(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_X86_HT
- u32 eax, ebx, ecx, edx;
- int index_msb, core_bits;
-
- if (!cpu_has(c, X86_FEATURE_HT))
- return;
-
- if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
- goto out;
-
- if (cpu_has(c, X86_FEATURE_XTOPOLOGY))
- return;
+u16 __read_mostly tlb_lli_4k;
+u16 __read_mostly tlb_lli_2m;
+u16 __read_mostly tlb_lli_4m;
+u16 __read_mostly tlb_lld_4k;
+u16 __read_mostly tlb_lld_2m;
+u16 __read_mostly tlb_lld_4m;
+u16 __read_mostly tlb_lld_1g;
- cpuid(1, &eax, &ebx, &ecx, &edx);
-
- smp_num_siblings = (ebx & 0xff0000) >> 16;
-
- if (smp_num_siblings == 1) {
- printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
- goto out;
- }
-
- if (smp_num_siblings <= 1)
- goto out;
-
- if (smp_num_siblings > nr_cpu_ids) {
- pr_warning("CPU: Unsupported number of siblings %d",
- smp_num_siblings);
- smp_num_siblings = 1;
- return;
- }
-
- index_msb = get_count_order(smp_num_siblings);
- c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
-
- smp_num_siblings = smp_num_siblings / c->x86_max_cores;
-
- index_msb = get_count_order(smp_num_siblings);
-
- core_bits = get_count_order(c->x86_max_cores);
+static void cpu_detect_tlb(struct cpuinfo_x86 *c)
+{
+ if (this_cpu->c_detect_tlb)
+ this_cpu->c_detect_tlb(c);
- c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) &
- ((1 << core_bits) - 1);
+ pr_info("Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n",
+ tlb_lli_4k, tlb_lli_2m, tlb_lli_4m);
-out:
- if ((c->x86_max_cores * smp_num_siblings) > 1) {
- printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
- c->phys_proc_id);
- printk(KERN_INFO "CPU: Processor Core ID: %d\n",
- c->cpu_core_id);
- }
-#endif
+ pr_info("Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n",
+ tlb_lld_4k, tlb_lld_2m, tlb_lld_4m, tlb_lld_1g);
}
-static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
+void get_cpu_vendor(struct cpuinfo_x86 *c)
{
char *v = c->x86_vendor_id;
int i;
@@ -503,15 +914,14 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
}
}
- printk_once(KERN_ERR
- "CPU: vendor_id '%s' unknown, using generic init.\n" \
- "CPU: Your system may be unstable.\n", v);
+ pr_err_once("CPU: vendor_id '%s' unknown, using generic init.\n" \
+ "CPU: Your system may be unstable.\n", v);
c->x86_vendor = X86_VENDOR_UNKNOWN;
this_cpu = &default_cpu;
}
-void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
+void cpu_detect(struct cpuinfo_x86 *c)
{
/* Get vendor name */
cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
@@ -525,14 +935,9 @@ void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
u32 junk, tfms, cap0, misc;
cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
- c->x86 = (tfms >> 8) & 0xf;
- c->x86_model = (tfms >> 4) & 0xf;
- c->x86_mask = tfms & 0xf;
-
- if (c->x86 == 0xf)
- c->x86 += (tfms >> 20) & 0xff;
- if (c->x86 >= 0x6)
- c->x86_model += ((tfms >> 16) & 0xf) << 4;
+ c->x86 = x86_family(tfms);
+ c->x86_model = x86_model(tfms);
+ c->x86_stepping = x86_stepping(tfms);
if (cap0 & (1<<19)) {
c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
@@ -541,50 +946,172 @@ void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
}
}
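
The open-coded family/model/stepping math removed above is what the x86_family()/x86_model()/x86_stepping() helpers encapsulate. A standalone sketch of that decode, following the extended-field rules shown in the removed lines (the sample signature is just an example value):

#include <stdint.h>
#include <stdio.h>

static unsigned int sig_family(uint32_t sig)
{
	unsigned int x86 = (sig >> 8) & 0xf;

	if (x86 == 0xf)
		x86 += (sig >> 20) & 0xff;	/* extended family */
	return x86;
}

static unsigned int sig_model(uint32_t sig)
{
	unsigned int model = (sig >> 4) & 0xf;

	if (sig_family(sig) >= 0x6)
		model += ((sig >> 16) & 0xf) << 4;	/* extended model */
	return model;
}

static unsigned int sig_stepping(uint32_t sig)
{
	return sig & 0xf;
}

int main(void)
{
	uint32_t sig = 0x000906ea;	/* e.g. a Coffee Lake CPUID(1).EAX */

	printf("family 0x%x model 0x%x stepping 0x%x\n",
	       sig_family(sig), sig_model(sig), sig_stepping(sig));
	return 0;
}
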
-static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
+static void apply_forced_caps(struct cpuinfo_x86 *c)
+{
+ int i;
+
+ for (i = 0; i < NCAPINTS + NBUGINTS; i++) {
+ c->x86_capability[i] &= ~cpu_caps_cleared[i];
+ c->x86_capability[i] |= cpu_caps_set[i];
+ }
+}
+
+static void init_speculation_control(struct cpuinfo_x86 *c)
+{
+ /*
+ * The Intel SPEC_CTRL CPUID bit implies IBRS and IBPB support,
+ * and they also have a different bit for STIBP support. Also,
+ * a hypervisor might have set the individual AMD bits even on
+ * Intel CPUs, for finer-grained selection of what's available.
+ */
+ if (cpu_has(c, X86_FEATURE_SPEC_CTRL)) {
+ set_cpu_cap(c, X86_FEATURE_IBRS);
+ set_cpu_cap(c, X86_FEATURE_IBPB);
+ set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL);
+ }
+
+ if (cpu_has(c, X86_FEATURE_INTEL_STIBP))
+ set_cpu_cap(c, X86_FEATURE_STIBP);
+
+ if (cpu_has(c, X86_FEATURE_SPEC_CTRL_SSBD) ||
+ cpu_has(c, X86_FEATURE_VIRT_SSBD))
+ set_cpu_cap(c, X86_FEATURE_SSBD);
+
+ if (cpu_has(c, X86_FEATURE_AMD_IBRS)) {
+ set_cpu_cap(c, X86_FEATURE_IBRS);
+ set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL);
+ }
+
+ if (cpu_has(c, X86_FEATURE_AMD_IBPB))
+ set_cpu_cap(c, X86_FEATURE_IBPB);
+
+ if (cpu_has(c, X86_FEATURE_AMD_STIBP)) {
+ set_cpu_cap(c, X86_FEATURE_STIBP);
+ set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL);
+ }
+
+ if (cpu_has(c, X86_FEATURE_AMD_SSBD)) {
+ set_cpu_cap(c, X86_FEATURE_SSBD);
+ set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL);
+ clear_cpu_cap(c, X86_FEATURE_VIRT_SSBD);
+ }
+}
+
+void get_cpu_cap(struct cpuinfo_x86 *c)
{
- u32 tfms, xlvl;
- u32 ebx;
+ u32 eax, ebx, ecx, edx;
/* Intel-defined flags: level 0x00000001 */
if (c->cpuid_level >= 0x00000001) {
- u32 capability, excap;
+ cpuid(0x00000001, &eax, &ebx, &ecx, &edx);
- cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
- c->x86_capability[0] = capability;
- c->x86_capability[4] = excap;
+ c->x86_capability[CPUID_1_ECX] = ecx;
+ c->x86_capability[CPUID_1_EDX] = edx;
}
- /* AMD-defined flags: level 0x80000001 */
- xlvl = cpuid_eax(0x80000000);
- c->extended_cpuid_level = xlvl;
-
- if ((xlvl & 0xffff0000) == 0x80000000) {
- if (xlvl >= 0x80000001) {
- c->x86_capability[1] = cpuid_edx(0x80000001);
- c->x86_capability[6] = cpuid_ecx(0x80000001);
+ /* Thermal and Power Management Leaf: level 0x00000006 (eax) */
+ if (c->cpuid_level >= 0x00000006)
+ c->x86_capability[CPUID_6_EAX] = cpuid_eax(0x00000006);
+
+ /* Additional Intel-defined flags: level 0x00000007 */
+ if (c->cpuid_level >= 0x00000007) {
+ cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
+ c->x86_capability[CPUID_7_0_EBX] = ebx;
+ c->x86_capability[CPUID_7_ECX] = ecx;
+ c->x86_capability[CPUID_7_EDX] = edx;
+
+ /* Check valid sub-leaf index before accessing it */
+ if (eax >= 1) {
+ cpuid_count(0x00000007, 1, &eax, &ebx, &ecx, &edx);
+ c->x86_capability[CPUID_7_1_EAX] = eax;
}
}
- if (c->extended_cpuid_level >= 0x80000008) {
- u32 eax = cpuid_eax(0x80000008);
+ /* Extended state features: level 0x0000000d */
+ if (c->cpuid_level >= 0x0000000d) {
+ cpuid_count(0x0000000d, 1, &eax, &ebx, &ecx, &edx);
- c->x86_virt_bits = (eax >> 8) & 0xff;
- c->x86_phys_bits = eax & 0xff;
+ c->x86_capability[CPUID_D_1_EAX] = eax;
+ }
+
+ /*
+ * Check if extended CPUID leaves are implemented: Max extended
+ * CPUID leaf must be in the 0x80000001-0x8000ffff range.
+ */
+ eax = cpuid_eax(0x80000000);
+ c->extended_cpuid_level = ((eax & 0xffff0000) == 0x80000000) ? eax : 0;
+
+ if (c->extended_cpuid_level >= 0x80000001) {
+ cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
+
+ c->x86_capability[CPUID_8000_0001_ECX] = ecx;
+ c->x86_capability[CPUID_8000_0001_EDX] = edx;
}
-#ifdef CONFIG_X86_32
- else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36))
- c->x86_phys_bits = 36;
-#endif
if (c->extended_cpuid_level >= 0x80000007)
c->x86_power = cpuid_edx(0x80000007);
+ if (c->extended_cpuid_level >= 0x80000008) {
+ cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
+ c->x86_capability[CPUID_8000_0008_EBX] = ebx;
+ }
+
+ if (c->extended_cpuid_level >= 0x8000000a)
+ c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a);
+
+ if (c->extended_cpuid_level >= 0x8000001f)
+ c->x86_capability[CPUID_8000_001F_EAX] = cpuid_eax(0x8000001f);
+
+ if (c->extended_cpuid_level >= 0x80000021)
+ c->x86_capability[CPUID_8000_0021_EAX] = cpuid_eax(0x80000021);
+
+ init_scattered_cpuid_features(c);
+ init_speculation_control(c);
+
+ /*
+ * Clear/Set all flags overridden by options, after probe.
+ * This needs to happen each time we re-probe, which may happen
+ * several times during CPU initialization.
+ */
+ apply_forced_caps(c);
}
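
get_cpu_cap() reads leaf 7 with an explicit sub-leaf index and only touches sub-leaf 1 after checking EAX of sub-leaf 0. A userspace-only sketch of the same pattern, assuming GCC/Clang's <cpuid.h> on x86:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
		return 1;	/* leaf 7 not implemented */

	printf("leaf 7.0: max sub-leaf %u, EBX=%#x ECX=%#x EDX=%#x\n",
	       eax, ebx, ecx, edx);

	/* Check the valid sub-leaf count before reading sub-leaf 1 */
	if (eax >= 1 && __get_cpuid_count(7, 1, &eax, &ebx, &ecx, &edx))
		printf("leaf 7.1: EAX=%#x\n", eax);

	return 0;
}
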
-static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
+void get_cpu_address_sizes(struct cpuinfo_x86 *c)
+{
+ u32 eax, ebx, ecx, edx;
+
+ if (!cpu_has(c, X86_FEATURE_CPUID) ||
+ (c->extended_cpuid_level < 0x80000008)) {
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ c->x86_clflush_size = 64;
+ c->x86_phys_bits = 36;
+ c->x86_virt_bits = 48;
+ } else {
+ c->x86_clflush_size = 32;
+ c->x86_virt_bits = 32;
+ c->x86_phys_bits = 32;
+
+ if (cpu_has(c, X86_FEATURE_PAE) ||
+ cpu_has(c, X86_FEATURE_PSE36))
+ c->x86_phys_bits = 36;
+ }
+ } else {
+ cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
+
+ c->x86_virt_bits = (eax >> 8) & 0xff;
+ c->x86_phys_bits = eax & 0xff;
+
+ /* Provide a sane default if not enumerated: */
+ if (!c->x86_clflush_size)
+ c->x86_clflush_size = 32;
+ }
+
+ c->x86_cache_bits = c->x86_phys_bits;
+ c->x86_cache_alignment = c->x86_clflush_size;
+}
+
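
A userspace-only sketch of the leaf 0x80000008 decode used by get_cpu_address_sizes() above (again assuming <cpuid.h>); the kernel falls back to the fixed defaults when the leaf is missing:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx)) {
		puts("leaf 0x80000008 not implemented; use the defaults");
		return 1;
	}

	/* physical bits in EAX[7:0], virtual bits in EAX[15:8] */
	printf("phys bits: %u, virt bits: %u\n",
	       eax & 0xff, (eax >> 8) & 0xff);
	return 0;
}
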
+static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_X86_32
int i;
/*
@@ -605,148 +1132,858 @@ static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
break;
}
}
+}
+
+#define NO_SPECULATION BIT(0)
+#define NO_MELTDOWN BIT(1)
+#define NO_SSB BIT(2)
+#define NO_L1TF BIT(3)
+#define NO_MDS BIT(4)
+#define MSBDS_ONLY BIT(5)
+#define NO_SWAPGS BIT(6)
+#define NO_ITLB_MULTIHIT BIT(7)
+#define NO_SPECTRE_V2 BIT(8)
+#define NO_MMIO BIT(9)
+#define NO_EIBRS_PBRSB BIT(10)
+#define NO_BHI BIT(11)
+
+#define VULNWL(vendor, family, model, whitelist) \
+ X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, whitelist)
+
+#define VULNWL_INTEL(vfm, whitelist) \
+ X86_MATCH_VFM(vfm, whitelist)
+
+#define VULNWL_AMD(family, whitelist) \
+ VULNWL(AMD, family, X86_MODEL_ANY, whitelist)
+
+#define VULNWL_HYGON(family, whitelist) \
+ VULNWL(HYGON, family, X86_MODEL_ANY, whitelist)
+
+static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
+ VULNWL(ANY, 4, X86_MODEL_ANY, NO_SPECULATION),
+ VULNWL(CENTAUR, 5, X86_MODEL_ANY, NO_SPECULATION),
+ VULNWL(INTEL, 5, X86_MODEL_ANY, NO_SPECULATION),
+ VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION),
+ VULNWL(VORTEX, 5, X86_MODEL_ANY, NO_SPECULATION),
+ VULNWL(VORTEX, 6, X86_MODEL_ANY, NO_SPECULATION),
+
+ /* Intel Family 6 */
+ VULNWL_INTEL(INTEL_TIGERLAKE, NO_MMIO),
+ VULNWL_INTEL(INTEL_TIGERLAKE_L, NO_MMIO),
+ VULNWL_INTEL(INTEL_ALDERLAKE, NO_MMIO),
+ VULNWL_INTEL(INTEL_ALDERLAKE_L, NO_MMIO),
+
+ VULNWL_INTEL(INTEL_ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
+
+ VULNWL_INTEL(INTEL_ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+
+ VULNWL_INTEL(INTEL_CORE_YONAH, NO_SSB),
+
+ VULNWL_INTEL(INTEL_ATOM_SILVERMONT_MID2,NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | MSBDS_ONLY),
+ VULNWL_INTEL(INTEL_ATOM_AIRMONT_NP, NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
+
+ VULNWL_INTEL(INTEL_ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
+ VULNWL_INTEL(INTEL_ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
+ VULNWL_INTEL(INTEL_ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB),
+
+ /*
+ * Technically, swapgs isn't serializing on AMD (despite it previously
+ * being documented as such in the APM). But according to AMD, %gs is
+ * updated non-speculatively, and the issuing of %gs-relative memory
+ * operands will be blocked until the %gs update completes, which is
+ * good enough for our purposes.
+ */
+
+ VULNWL_INTEL(INTEL_ATOM_TREMONT, NO_EIBRS_PBRSB),
+ VULNWL_INTEL(INTEL_ATOM_TREMONT_L, NO_EIBRS_PBRSB),
+ VULNWL_INTEL(INTEL_ATOM_TREMONT_D, NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB),
+
+ /* AMD Family 0xf - 0x12 */
+ VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI),
+ VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI),
+ VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI),
+ VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI),
+
+ /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */
+ VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB | NO_BHI),
+ VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB | NO_BHI),
+
+ /* Zhaoxin Family 7 */
+ VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO | NO_BHI),
+ VULNWL(ZHAOXIN, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO | NO_BHI),
+ {}
+};
+
+#define VULNBL(vendor, family, model, blacklist) \
+ X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, blacklist)
+
+#define VULNBL_INTEL_STEPS(vfm, max_stepping, issues) \
+ X86_MATCH_VFM_STEPS(vfm, X86_STEP_MIN, max_stepping, issues)
+
+#define VULNBL_INTEL_TYPE(vfm, cpu_type, issues) \
+ X86_MATCH_VFM_CPU_TYPE(vfm, INTEL_CPU_TYPE_##cpu_type, issues)
+
+#define VULNBL_AMD(family, blacklist) \
+ VULNBL(AMD, family, X86_MODEL_ANY, blacklist)
+
+#define VULNBL_HYGON(family, blacklist) \
+ VULNBL(HYGON, family, X86_MODEL_ANY, blacklist)
+
+#define SRBDS BIT(0)
+/* CPU is affected by X86_BUG_MMIO_STALE_DATA */
+#define MMIO BIT(1)
+/* CPU is affected by Shared Buffers Data Sampling (SBDS), a variant of X86_BUG_MMIO_STALE_DATA */
+#define MMIO_SBDS BIT(2)
+/* CPU is affected by RETbleed, speculating where you would not expect it */
+#define RETBLEED BIT(3)
+/* CPU is affected by SMT (cross-thread) return predictions */
+#define SMT_RSB BIT(4)
+/* CPU is affected by SRSO */
+#define SRSO BIT(5)
+/* CPU is affected by GDS */
+#define GDS BIT(6)
+/* CPU is affected by Register File Data Sampling */
+#define RFDS BIT(7)
+/* CPU is affected by Indirect Target Selection */
+#define ITS BIT(8)
+/* CPU is affected by Indirect Target Selection, but guest-host isolation is not affected */
+#define ITS_NATIVE_ONLY BIT(9)
+/* CPU is affected by Transient Scheduler Attacks */
+#define TSA BIT(10)
+/* CPU is affected by VMSCAPE */
+#define VMSCAPE BIT(11)
+
+static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
+ VULNBL_INTEL_STEPS(INTEL_SANDYBRIDGE_X, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_SANDYBRIDGE, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE_X, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE, X86_STEP_MAX, SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_HASWELL, X86_STEP_MAX, SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_HASWELL_L, X86_STEP_MAX, SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_HASWELL_G, X86_STEP_MAX, SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_HASWELL_X, X86_STEP_MAX, MMIO | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_BROADWELL_D, X86_STEP_MAX, MMIO | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_BROADWELL_X, X86_STEP_MAX, MMIO | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_BROADWELL_G, X86_STEP_MAX, SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_BROADWELL, X86_STEP_MAX, SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, 0x5, MMIO | RETBLEED | GDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, X86_STEP_MAX, MMIO | RETBLEED | GDS | ITS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_SKYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_SKYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, 0xb, MMIO | RETBLEED | GDS | SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_KABYLAKE, 0xc, MMIO | RETBLEED | GDS | SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_KABYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_CANNONLAKE_L, X86_STEP_MAX, RETBLEED | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_ICELAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS | ITS_NATIVE_ONLY),
+ VULNBL_INTEL_STEPS(INTEL_ICELAKE_D, X86_STEP_MAX, MMIO | GDS | ITS | ITS_NATIVE_ONLY),
+ VULNBL_INTEL_STEPS(INTEL_ICELAKE_X, X86_STEP_MAX, MMIO | GDS | ITS | ITS_NATIVE_ONLY),
+ VULNBL_INTEL_STEPS(INTEL_COMETLAKE, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, 0x0, MMIO | RETBLEED | ITS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_TIGERLAKE_L, X86_STEP_MAX, GDS | ITS | ITS_NATIVE_ONLY),
+ VULNBL_INTEL_STEPS(INTEL_TIGERLAKE, X86_STEP_MAX, GDS | ITS | ITS_NATIVE_ONLY),
+ VULNBL_INTEL_STEPS(INTEL_LAKEFIELD, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED),
+ VULNBL_INTEL_STEPS(INTEL_ROCKETLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | ITS | ITS_NATIVE_ONLY),
+ VULNBL_INTEL_TYPE(INTEL_ALDERLAKE, ATOM, RFDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_ALDERLAKE, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_ALDERLAKE_L, X86_STEP_MAX, RFDS | VMSCAPE),
+ VULNBL_INTEL_TYPE(INTEL_RAPTORLAKE, ATOM, RFDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_P, X86_STEP_MAX, RFDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_S, X86_STEP_MAX, RFDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_METEORLAKE_L, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_ARROWLAKE_H, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_ARROWLAKE, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_ARROWLAKE_U, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_LUNARLAKE_M, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_SAPPHIRERAPIDS_X, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_GRANITERAPIDS_X, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_EMERALDRAPIDS_X, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_GRACEMONT, X86_STEP_MAX, RFDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT, X86_STEP_MAX, MMIO | MMIO_SBDS | RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT_D, X86_STEP_MAX, MMIO | RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT, X86_STEP_MAX, RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT_D, X86_STEP_MAX, RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT_PLUS, X86_STEP_MAX, RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_CRESTMONT_X, X86_STEP_MAX, VMSCAPE),
+
+ VULNBL_AMD(0x15, RETBLEED),
+ VULNBL_AMD(0x16, RETBLEED),
+ VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO | VMSCAPE),
+ VULNBL_HYGON(0x18, RETBLEED | SMT_RSB | SRSO | VMSCAPE),
+ VULNBL_AMD(0x19, SRSO | TSA | VMSCAPE),
+ VULNBL_AMD(0x1a, SRSO | VMSCAPE),
+ {}
+};
+
+static bool __init cpu_matches(const struct x86_cpu_id *table, unsigned long which)
+{
+ const struct x86_cpu_id *m = x86_match_cpu(table);
+
+ return m && !!(m->driver_data & which);
+}
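
cpu_matches() walks a match table and tests a flag bit in the matching entry's driver_data; ordering matters, which is why the FAMILY_ANY entries sit last. A standalone sketch of that pattern (the vendor/family numbers and flag names here are made up for illustration, not the kernel's constants):

#include <stdbool.h>
#include <stdio.h>

struct id_entry {
	int vendor, family, model;	/* -1 acts as "any" */
	unsigned long driver_data;
};

#define NO_MELTDOWN	(1UL << 0)
#define NO_SSB		(1UL << 1)

static const struct id_entry table[] = {
	{ 2, 0x0f, -1, NO_MELTDOWN | NO_SSB },
	{ 2,   -1, -1, NO_MELTDOWN },		/* catch-all must be last */
	{ 0 }
};

static bool matches(int vendor, int family, int model, unsigned long which)
{
	const struct id_entry *e;

	for (e = table; e->driver_data; e++) {
		if (e->vendor == vendor &&
		    (e->family == -1 || e->family == family) &&
		    (e->model  == -1 || e->model  == model))
			return e->driver_data & which;	/* first match wins */
	}
	return false;
}

int main(void)
{
	printf("%d\n", matches(2, 0x0f, 3, NO_SSB));	/* 1: family 0xf entry */
	printf("%d\n", matches(2, 0x19, 1, NO_SSB));	/* 0: catch-all lacks NO_SSB */
	return 0;
}
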
+
+u64 x86_read_arch_cap_msr(void)
+{
+ u64 x86_arch_cap_msr = 0;
+
+ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
+ rdmsrq(MSR_IA32_ARCH_CAPABILITIES, x86_arch_cap_msr);
+
+ return x86_arch_cap_msr;
+}
+
+static bool arch_cap_mmio_immune(u64 x86_arch_cap_msr)
+{
+ return (x86_arch_cap_msr & ARCH_CAP_FBSDP_NO &&
+ x86_arch_cap_msr & ARCH_CAP_PSDP_NO &&
+ x86_arch_cap_msr & ARCH_CAP_SBDR_SSDP_NO);
+}
+
+static bool __init vulnerable_to_rfds(u64 x86_arch_cap_msr)
+{
+ /* The "immunity" bit trumps everything else: */
+ if (x86_arch_cap_msr & ARCH_CAP_RFDS_NO)
+ return false;
+
+ /*
+ * VMMs set ARCH_CAP_RFDS_CLEAR for processors not in the blacklist to
+ * indicate that the mitigation is needed because the guest is running
+ * on vulnerable hardware or may migrate to such hardware:
+ */
+ if (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR)
+ return true;
+
+ /* Only consult the blacklist when there is no enumeration: */
+ return cpu_matches(cpu_vuln_blacklist, RFDS);
+}
+
+static bool __init vulnerable_to_its(u64 x86_arch_cap_msr)
+{
+ /* The "immunity" bit trumps everything else: */
+ if (x86_arch_cap_msr & ARCH_CAP_ITS_NO)
+ return false;
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return false;
+
+ /* None of the affected CPUs have BHI_CTRL */
+ if (boot_cpu_has(X86_FEATURE_BHI_CTRL))
+ return false;
+
+ /*
+ * If a VMM did not expose ITS_NO, assume that a guest could
+ * be running on vulnerable hardware or may migrate to such
+ * hardware.
+ */
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return true;
+
+ if (cpu_matches(cpu_vuln_blacklist, ITS))
+ return true;
+
+ return false;
+}
+
+static struct x86_cpu_id cpu_latest_microcode[] = {
+#include "microcode/intel-ucode-defs.h"
+ {}
+};
+
+static bool __init cpu_has_old_microcode(void)
+{
+ const struct x86_cpu_id *m = x86_match_cpu(cpu_latest_microcode);
+
+ /* Give unknown CPUs a pass: */
+ if (!m) {
+ /* Intel CPUs should be in the list. Warn if not: */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ pr_info("x86/CPU: Model not found in latest microcode list\n");
+ return false;
+ }
+
+ /*
+ * Hosts usually lie to guests with a super high microcode
+ * version. Just ignore what hosts tell guests:
+ */
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return false;
+
+ /* Consider all debug microcode to be old: */
+ if (boot_cpu_data.microcode & BIT(31))
+ return true;
+
+ /* Give new microcode a pass: */
+ if (boot_cpu_data.microcode >= m->driver_data)
+ return false;
+
+ /* Uh oh, too old: */
+ return true;
+}
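
The revision policy above boils down to two checks. A standalone sketch (the revision numbers are arbitrary examples):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool microcode_is_old(uint32_t running, uint32_t latest)
{
	if (running & (1u << 31))	/* debug microcode is always "old" */
		return true;
	return running < latest;	/* otherwise compare revisions */
}

int main(void)
{
	printf("%d\n", microcode_is_old(0x000000f4, 0x000000f6));	/* 1 */
	printf("%d\n", microcode_is_old(0x000000f6, 0x000000f6));	/* 0 */
	printf("%d\n", microcode_is_old(0x800000f8, 0x000000f6));	/* 1: debug */
	return 0;
}
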
+
+static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
+{
+ u64 x86_arch_cap_msr = x86_read_arch_cap_msr();
+
+ if (cpu_has_old_microcode()) {
+ pr_warn("x86/CPU: Running old microcode\n");
+ setup_force_cpu_bug(X86_BUG_OLD_MICROCODE);
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+ }
+
+ /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */
+ if (!cpu_matches(cpu_vuln_whitelist, NO_ITLB_MULTIHIT) &&
+ !(x86_arch_cap_msr & ARCH_CAP_PSCHANGE_MC_NO))
+ setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT);
+
+ if (cpu_matches(cpu_vuln_whitelist, NO_SPECULATION))
+ return;
+
+ setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
+
+ if (!cpu_matches(cpu_vuln_whitelist, NO_SPECTRE_V2)) {
+ setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
+ setup_force_cpu_bug(X86_BUG_SPECTRE_V2_USER);
+ }
+
+ if (!cpu_matches(cpu_vuln_whitelist, NO_SSB) &&
+ !(x86_arch_cap_msr & ARCH_CAP_SSB_NO) &&
+ !cpu_has(c, X86_FEATURE_AMD_SSB_NO))
+ setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
+
+ /*
+ * AMD's AutoIBRS is equivalent to Intel's eIBRS - use the Intel feature
+ * flag and protect from vendor-specific bugs via the whitelist.
+ *
+ * Don't use AutoIBRS when SNP is enabled because it degrades host
+ * userspace indirect branch performance.
+ */
+ if ((x86_arch_cap_msr & ARCH_CAP_IBRS_ALL) ||
+ (cpu_has(c, X86_FEATURE_AUTOIBRS) &&
+ !cpu_feature_enabled(X86_FEATURE_SEV_SNP))) {
+ setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED);
+ if (!cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) &&
+ !(x86_arch_cap_msr & ARCH_CAP_PBRSB_NO))
+ setup_force_cpu_bug(X86_BUG_EIBRS_PBRSB);
+ }
+
+ if (!cpu_matches(cpu_vuln_whitelist, NO_MDS) &&
+ !(x86_arch_cap_msr & ARCH_CAP_MDS_NO)) {
+ setup_force_cpu_bug(X86_BUG_MDS);
+ if (cpu_matches(cpu_vuln_whitelist, MSBDS_ONLY))
+ setup_force_cpu_bug(X86_BUG_MSBDS_ONLY);
+ }
+
+ if (!cpu_matches(cpu_vuln_whitelist, NO_SWAPGS))
+ setup_force_cpu_bug(X86_BUG_SWAPGS);
+
+ /*
+ * When the CPU is not mitigated for TAA (TAA_NO=0) set TAA bug when:
+ * - TSX is supported or
+ * - TSX_CTRL is present
+ *
+ * The TSX_CTRL check is needed for cases where TSX could be disabled
+ * before the kernel boots, e.g. across a kexec.
+ * The TSX_CTRL check alone is not sufficient for cases where the microcode
+ * update is not present, or the kernel runs as a guest that doesn't get TSX_CTRL.
+ */
+ if (!(x86_arch_cap_msr & ARCH_CAP_TAA_NO) &&
+ (cpu_has(c, X86_FEATURE_RTM) ||
+ (x86_arch_cap_msr & ARCH_CAP_TSX_CTRL_MSR)))
+ setup_force_cpu_bug(X86_BUG_TAA);
+
+ /*
+ * SRBDS affects CPUs which support RDRAND or RDSEED and are listed
+ * in the vulnerability blacklist.
+ *
+ * Some of the implications and mitigation of Shared Buffers Data
+ * Sampling (SBDS) are similar to SRBDS. Give SBDS same treatment as
+ * SRBDS.
+ */
+ if ((cpu_has(c, X86_FEATURE_RDRAND) ||
+ cpu_has(c, X86_FEATURE_RDSEED)) &&
+ cpu_matches(cpu_vuln_blacklist, SRBDS | MMIO_SBDS))
+ setup_force_cpu_bug(X86_BUG_SRBDS);
+
+ /*
+ * Processor MMIO Stale Data bug enumeration
+ *
+ * The affected CPU list is generally enough to enumerate the vulnerability,
+ * but in the virtualization case also check the ARCH_CAP MSR bits: the VMM
+ * may not want the guest to enumerate the bug.
+ */
+ if (!arch_cap_mmio_immune(x86_arch_cap_msr)) {
+ if (cpu_matches(cpu_vuln_blacklist, MMIO))
+ setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA);
+ }
+
+ if (!cpu_has(c, X86_FEATURE_BTC_NO)) {
+ if (cpu_matches(cpu_vuln_blacklist, RETBLEED) || (x86_arch_cap_msr & ARCH_CAP_RSBA))
+ setup_force_cpu_bug(X86_BUG_RETBLEED);
+ }
+
+ if (cpu_matches(cpu_vuln_blacklist, SMT_RSB))
+ setup_force_cpu_bug(X86_BUG_SMT_RSB);
+
+ if (!cpu_has(c, X86_FEATURE_SRSO_NO)) {
+ if (cpu_matches(cpu_vuln_blacklist, SRSO))
+ setup_force_cpu_bug(X86_BUG_SRSO);
+ }
+
+ /*
+ * Check if CPU is vulnerable to GDS. If running in a virtual machine on
+ * an affected processor, the VMM may have disabled the use of GATHER by
+ * disabling AVX2. The only way to do this in HW is to clear XCR0[2],
+ * which means that AVX will be disabled.
+ */
+ if (cpu_matches(cpu_vuln_blacklist, GDS) && !(x86_arch_cap_msr & ARCH_CAP_GDS_NO) &&
+ boot_cpu_has(X86_FEATURE_AVX))
+ setup_force_cpu_bug(X86_BUG_GDS);
+
+ if (vulnerable_to_rfds(x86_arch_cap_msr))
+ setup_force_cpu_bug(X86_BUG_RFDS);
+
+ /*
+ * Intel parts with eIBRS are vulnerable to BHI attacks. Parts with
+ * BHI_NO still need to use the BHI mitigation to prevent Intra-mode
+ * attacks. When virtualized, eIBRS could be hidden, assume vulnerable.
+ */
+ if (!cpu_matches(cpu_vuln_whitelist, NO_BHI) &&
+ (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED) ||
+ boot_cpu_has(X86_FEATURE_HYPERVISOR)))
+ setup_force_cpu_bug(X86_BUG_BHI);
+
+ if (cpu_has(c, X86_FEATURE_AMD_IBPB) && !cpu_has(c, X86_FEATURE_AMD_IBPB_RET))
+ setup_force_cpu_bug(X86_BUG_IBPB_NO_RET);
+
+ if (vulnerable_to_its(x86_arch_cap_msr)) {
+ setup_force_cpu_bug(X86_BUG_ITS);
+ if (cpu_matches(cpu_vuln_blacklist, ITS_NATIVE_ONLY))
+ setup_force_cpu_bug(X86_BUG_ITS_NATIVE_ONLY);
+ }
+
+ if (c->x86_vendor == X86_VENDOR_AMD) {
+ if (!cpu_has(c, X86_FEATURE_TSA_SQ_NO) ||
+ !cpu_has(c, X86_FEATURE_TSA_L1_NO)) {
+ if (cpu_matches(cpu_vuln_blacklist, TSA) ||
+ /* Enable bug on Zen guests to allow for live migration. */
+ (cpu_has(c, X86_FEATURE_HYPERVISOR) && cpu_has(c, X86_FEATURE_ZEN)))
+ setup_force_cpu_bug(X86_BUG_TSA);
+ }
+ }
+
+ /*
+ * Set the bug only on bare-metal. A nested hypervisor should already be
+ * deploying IBPB to isolate itself from nested guests.
+ */
+ if (cpu_matches(cpu_vuln_blacklist, VMSCAPE) &&
+ !boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ setup_force_cpu_bug(X86_BUG_VMSCAPE);
+
+ if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN))
+ return;
+
+ /* Rogue Data Cache Load? No! */
+ if (x86_arch_cap_msr & ARCH_CAP_RDCL_NO)
+ return;
+
+ setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
+
+ if (cpu_matches(cpu_vuln_whitelist, NO_L1TF))
+ return;
+
+ setup_force_cpu_bug(X86_BUG_L1TF);
+}
+
+/*
+ * The NOPL instruction is supposed to exist on all CPUs of family >= 6;
+ * unfortunately, that's not true in practice because of early VIA
+ * chips and (more importantly) broken virtualizers that are not easy
+ * to detect. In the latter case it doesn't even *fail* reliably, so
+ * probing for it doesn't even work. Disable it completely on 32-bit
+ * unless we can find a reliable way to detect all the broken cases.
+ * Enable it explicitly on 64-bit for non-constant inputs of cpu_has().
+ */
+static void detect_nopl(void)
+{
+#ifdef CONFIG_X86_32
+ setup_clear_cpu_cap(X86_FEATURE_NOPL);
+#else
+ setup_force_cpu_cap(X86_FEATURE_NOPL);
#endif
}
+static inline bool parse_set_clear_cpuid(char *arg, bool set)
+{
+ char *opt;
+ int taint = 0;
+
+ while (arg) {
+ bool found __maybe_unused = false;
+ unsigned int bit;
+
+ opt = strsep(&arg, ",");
+
+ /*
+ * Handle naked numbers first for feature flags which don't
+ * have names. It doesn't make sense for a bug not to have a
+ * name so don't handle bug flags here.
+ */
+ if (!kstrtouint(opt, 10, &bit)) {
+ if (bit < NCAPINTS * 32) {
+
+ if (set) {
+ pr_warn("setcpuid: force-enabling CPU feature flag:");
+ setup_force_cpu_cap(bit);
+ } else {
+ pr_warn("clearcpuid: force-disabling CPU feature flag:");
+ setup_clear_cpu_cap(bit);
+ }
+ /* empty-string, i.e., ""-defined feature flags */
+ if (!x86_cap_flags[bit])
+ pr_cont(" %d:%d\n", bit >> 5, bit & 31);
+ else
+ pr_cont(" %s\n", x86_cap_flags[bit]);
+
+ taint++;
+ }
+ /*
+ * The assumption is that there are no feature names with only
+ * numbers in the name, so go to the next argument.
+ */
+ continue;
+ }
+
+ for (bit = 0; bit < 32 * (NCAPINTS + NBUGINTS); bit++) {
+ const char *flag;
+ const char *kind;
+
+ if (bit < 32 * NCAPINTS) {
+ flag = x86_cap_flags[bit];
+ kind = "feature";
+ } else {
+ kind = "bug";
+ flag = x86_bug_flags[bit - (32 * NCAPINTS)];
+ }
+
+ if (!flag)
+ continue;
+
+ if (strcmp(flag, opt))
+ continue;
+
+ if (set) {
+ pr_warn("setcpuid: force-enabling CPU %s flag: %s\n",
+ kind, flag);
+ setup_force_cpu_cap(bit);
+ } else {
+ pr_warn("clearcpuid: force-disabling CPU %s flag: %s\n",
+ kind, flag);
+ setup_clear_cpu_cap(bit);
+ }
+ taint++;
+ found = true;
+ break;
+ }
+
+ if (!found)
+ pr_warn("%s: unknown CPU flag: %s", set ? "setcpuid" : "clearcpuid", opt);
+ }
+
+ return taint;
+}
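
parse_set_clear_cpuid() splits the option string on commas, treats pure numbers as raw bit indices, and otherwise falls back to a name lookup. A userspace sketch of the same parsing shape (strsep()/strtol() stand in for the kernel helpers; the flag names are purely illustrative):

#define _DEFAULT_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static const char *flag_names[] = { "fpu", "vme", "de", "pse", NULL };

static int name_to_bit(const char *opt)
{
	int bit;

	for (bit = 0; flag_names[bit]; bit++)
		if (!strcmp(flag_names[bit], opt))
			return bit;
	return -1;
}

int main(void)
{
	char arg[] = "vme,42,nosuchflag";
	char *rest = arg, *opt;

	while ((opt = strsep(&rest, ","))) {
		char *end;
		long bit = strtol(opt, &end, 10);

		if (end != opt && *end == '\0') {	/* naked number */
			printf("clear bit %ld\n", bit);
			continue;
		}
		bit = name_to_bit(opt);			/* named flag */
		if (bit >= 0)
			printf("clear '%s' (bit %ld)\n", opt, bit);
		else
			printf("unknown flag '%s'\n", opt);
	}
	return 0;
}
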
+
+
+/*
+ * We parse cpu parameters early because fpu__init_system() is executed
+ * before parse_early_param().
+ */
+static void __init cpu_parse_early_param(void)
+{
+ bool cpuid_taint = false;
+ char arg[128];
+ int arglen;
+
+#ifdef CONFIG_X86_32
+ if (cmdline_find_option_bool(boot_command_line, "no387"))
+#ifdef CONFIG_MATH_EMULATION
+ setup_clear_cpu_cap(X86_FEATURE_FPU);
+#else
+ pr_err("Option 'no387' required CONFIG_MATH_EMULATION enabled.\n");
+#endif
+
+ if (cmdline_find_option_bool(boot_command_line, "nofxsr"))
+ setup_clear_cpu_cap(X86_FEATURE_FXSR);
+#endif
+
+ if (cmdline_find_option_bool(boot_command_line, "noxsave"))
+ setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+
+ if (cmdline_find_option_bool(boot_command_line, "noxsaveopt"))
+ setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+
+ if (cmdline_find_option_bool(boot_command_line, "noxsaves"))
+ setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+
+ if (cmdline_find_option_bool(boot_command_line, "nousershstk"))
+ setup_clear_cpu_cap(X86_FEATURE_USER_SHSTK);
+
+ /* Minimize the gap between 'FRED is available' and 'available but disabled'. */
+ arglen = cmdline_find_option(boot_command_line, "fred", arg, sizeof(arg));
+ if (arglen != 2 || strncmp(arg, "on", 2))
+ setup_clear_cpu_cap(X86_FEATURE_FRED);
+
+ arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg));
+ if (arglen > 0)
+ cpuid_taint |= parse_set_clear_cpuid(arg, false);
+
+ arglen = cmdline_find_option(boot_command_line, "setcpuid", arg, sizeof(arg));
+ if (arglen > 0)
+ cpuid_taint |= parse_set_clear_cpuid(arg, true);
+
+ if (cpuid_taint) {
+ pr_warn("!!! setcpuid=/clearcpuid= in use, this is for TESTING ONLY, may break things horribly. Tainting kernel.\n");
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+ }
+}
+
/*
* Do minimum CPU detection early.
* Fields really needed: vendor, cpuid_level, family, model, mask,
* cache alignment.
* The others are not touched to avoid unwanted side effects.
*
- * WARNING: this function is only called on the BP. Don't add code here
- * that is supposed to run on all CPUs.
+ * WARNING: this function is only called on the boot CPU. Don't add code
+ * here that is supposed to run on all CPUs.
*/
static void __init early_identify_cpu(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_X86_64
- c->x86_clflush_size = 64;
- c->x86_phys_bits = 36;
- c->x86_virt_bits = 48;
-#else
- c->x86_clflush_size = 32;
- c->x86_phys_bits = 32;
- c->x86_virt_bits = 32;
-#endif
- c->x86_cache_alignment = c->x86_clflush_size;
-
- memset(&c->x86_capability, 0, sizeof c->x86_capability);
+ memset(&c->x86_capability, 0, sizeof(c->x86_capability));
c->extended_cpuid_level = 0;
- if (!have_cpuid_p())
+ if (!cpuid_feature())
identify_cpu_without_cpuid(c);
/* cyrix could have cpuid enabled via c_identify()*/
- if (!have_cpuid_p())
- return;
-
- cpu_detect(c);
+ if (cpuid_feature()) {
+ cpu_detect(c);
+ get_cpu_vendor(c);
+ intel_unlock_cpuid_leafs(c);
+ get_cpu_cap(c);
+ setup_force_cpu_cap(X86_FEATURE_CPUID);
+ get_cpu_address_sizes(c);
+ cpu_parse_early_param();
+
+ cpu_init_topology(c);
+
+ if (this_cpu->c_early_init)
+ this_cpu->c_early_init(c);
+
+ c->cpu_index = 0;
+ filter_cpuid_features(c, false);
+ check_cpufeature_deps(c);
+
+ if (this_cpu->c_bsp_init)
+ this_cpu->c_bsp_init(c);
+ } else {
+ setup_clear_cpu_cap(X86_FEATURE_CPUID);
+ get_cpu_address_sizes(c);
+ cpu_init_topology(c);
+ }
- get_cpu_vendor(c);
+ setup_force_cpu_cap(X86_FEATURE_ALWAYS);
- get_cpu_cap(c);
+ cpu_set_bug_bits(c);
- if (this_cpu->c_early_init)
- this_cpu->c_early_init(c);
+ sld_setup(c);
-#ifdef CONFIG_SMP
- c->cpu_index = boot_cpu_id;
+#ifdef CONFIG_X86_32
+ /*
+ * Regardless of whether PCID is enumerated, the SDM says
+ * that it can't be enabled in 32-bit mode.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_PCID);
#endif
- filter_cpuid_features(c, false);
+
+ /*
+ * Later in the boot process pgtable_l5_enabled() relies on
+ * cpu_feature_enabled(X86_FEATURE_LA57). If 5-level paging is not
+ * enabled by this point we need to clear the feature bit to avoid
+ * false-positives at the later stage.
+ *
+ * pgtable_l5_enabled() can be false here for several reasons:
+ * - 5-level paging is disabled compile-time;
+ * - it's 32-bit kernel;
+ * - machine doesn't support 5-level paging;
+ * - user specified 'no5lvl' in kernel command line.
+ */
+ if (!pgtable_l5_enabled())
+ setup_clear_cpu_cap(X86_FEATURE_LA57);
+
+ detect_nopl();
+ mca_bsp_init(c);
}
-void __init early_cpu_init(void)
+void __init init_cpu_devs(void)
{
const struct cpu_dev *const *cdev;
int count = 0;
- printk(KERN_INFO "KERNEL supported cpus:\n");
for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
const struct cpu_dev *cpudev = *cdev;
- unsigned int j;
if (count >= X86_VENDOR_NUM)
break;
cpu_devs[count] = cpudev;
count++;
+ }
+}
+
+void __init early_cpu_init(void)
+{
+#ifdef CONFIG_PROCESSOR_SELECT
+ unsigned int i, j;
+
+ pr_info("KERNEL supported cpus:\n");
+#endif
+
+ init_cpu_devs();
+#ifdef CONFIG_PROCESSOR_SELECT
+ for (i = 0; i < X86_VENDOR_NUM && cpu_devs[i]; i++) {
for (j = 0; j < 2; j++) {
- if (!cpudev->c_ident[j])
+ if (!cpu_devs[i]->c_ident[j])
continue;
- printk(KERN_INFO " %s %s\n", cpudev->c_vendor,
- cpudev->c_ident[j]);
+ pr_info(" %s %s\n", cpu_devs[i]->c_vendor,
+ cpu_devs[i]->c_ident[j]);
}
}
+#endif
early_identify_cpu(&boot_cpu_data);
}
-/*
- * The NOPL instruction is supposed to exist on all CPUs with
- * family >= 6; unfortunately, that's not true in practice because
- * of early VIA chips and (more importantly) broken virtualizers that
- * are not easy to detect. In the latter case it doesn't even *fail*
- * reliably, so probing for it doesn't even work. Disable it completely
- * unless we can find a reliable way to detect all the broken cases.
- */
-static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
+static bool detect_null_seg_behavior(void)
+{
+ /*
+ * Empirically, writing zero to a segment selector on AMD does
+ * not clear the base, whereas writing zero to a segment
+ * selector on Intel does clear the base. Intel's behavior
+ * allows slightly faster context switches in the common case
+ * where GS is unused by the prev and next threads.
+ *
+ * Since neither vendor documents this anywhere that I can see,
+ * detect it directly instead of hard-coding the choice by
+ * vendor.
+ *
+ * I've designated AMD's behavior as the "bug" because it's
+ * counterintuitive and less friendly.
+ */
+
+ unsigned long old_base, tmp;
+ rdmsrq(MSR_FS_BASE, old_base);
+ wrmsrq(MSR_FS_BASE, 1);
+ loadsegment(fs, 0);
+ rdmsrq(MSR_FS_BASE, tmp);
+ wrmsrq(MSR_FS_BASE, old_base);
+ return tmp == 0;
+}
+
+void check_null_seg_clears_base(struct cpuinfo_x86 *c)
{
- clear_cpu_cap(c, X86_FEATURE_NOPL);
+ /* BUG_NULL_SEG is only relevant with 64bit userspace */
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return;
+
+ if (cpu_has(c, X86_FEATURE_NULL_SEL_CLR_BASE))
+ return;
+
+ /*
+ * CPUID bit above wasn't set. If this kernel is still running
+ * as an HV guest, then the HV has decided not to advertise
+ * that CPUID bit for whatever reason. For example, one
+ * member of the migration pool might be vulnerable, which
+ * means the bug is present: set the BUG flag and return.
+ */
+ if (cpu_has(c, X86_FEATURE_HYPERVISOR)) {
+ set_cpu_bug(c, X86_BUG_NULL_SEG);
+ return;
+ }
+
+ /*
+ * Zen2 CPUs also have this behaviour, but no CPUID bit.
+ * 0x18 is the respective family for Hygon.
+ */
+ if ((c->x86 == 0x17 || c->x86 == 0x18) &&
+ detect_null_seg_behavior())
+ return;
+
+ /* All the remaining ones are affected */
+ set_cpu_bug(c, X86_BUG_NULL_SEG);
}
-static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
+static void generic_identify(struct cpuinfo_x86 *c)
{
c->extended_cpuid_level = 0;
- if (!have_cpuid_p())
+ if (!cpuid_feature())
identify_cpu_without_cpuid(c);
/* cyrix could have cpuid enabled via c_identify()*/
- if (!have_cpuid_p())
+ if (!cpuid_feature())
return;
cpu_detect(c);
get_cpu_vendor(c);
-
+ intel_unlock_cpuid_leafs(c);
get_cpu_cap(c);
- if (c->cpuid_level >= 0x00000001) {
- c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
-#ifdef CONFIG_X86_32
-# ifdef CONFIG_X86_HT
- c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
-# else
- c->apicid = c->initial_apicid;
-# endif
-#endif
-
-#ifdef CONFIG_X86_HT
- c->phys_proc_id = c->initial_apicid;
-#endif
- }
+ get_cpu_address_sizes(c);
get_model_name(c); /* Default name */
- init_scattered_cpuid_features(c);
- detect_nopl(c);
+ /*
+ * ESPFIX is a strange bug. All real CPUs have it. Paravirt
+ * systems that run Linux at CPL > 0 may or may not have the
+ * issue, but, even if they have the issue, there's absolutely
+ * nothing we can do about it because we can't use the real IRET
+ * instruction.
+ *
+ * NB: For the time being, only 32-bit kernels support
+ * X86_BUG_ESPFIX as such. 64-bit kernels directly choose
+ * whether to apply espfix using paravirt hooks. If any
+ * non-paravirt system ever shows up that does *not* have the
+ * ESPFIX issue, we can change this.
+ */
+#ifdef CONFIG_X86_32
+ set_cpu_bug(c, X86_BUG_ESPFIX);
+#endif
}
/*
* This does the hard work of actually picking apart the CPU stuff...
*/
-static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
+static void identify_cpu(struct cpuinfo_x86 *c)
{
int i;
c->loops_per_jiffy = loops_per_jiffy;
- c->x86_cache_size = -1;
+ c->x86_cache_size = 0;
c->x86_vendor = X86_VENDOR_UNKNOWN;
- c->x86_model = c->x86_mask = 0; /* So far unknown... */
+ c->x86_model = c->x86_stepping = 0; /* So far unknown... */
c->x86_vendor_id[0] = '\0'; /* Unset */
c->x86_model_id[0] = '\0'; /* Unset */
- c->x86_max_cores = 1;
- c->x86_coreid_bits = 0;
#ifdef CONFIG_X86_64
c->x86_clflush_size = 64;
c->x86_phys_bits = 36;
@@ -758,22 +1995,26 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
c->x86_virt_bits = 32;
#endif
c->x86_cache_alignment = c->x86_clflush_size;
- memset(&c->x86_capability, 0, sizeof c->x86_capability);
+ memset(&c->x86_capability, 0, sizeof(c->x86_capability));
+#ifdef CONFIG_X86_VMX_FEATURE_NAMES
+ memset(&c->vmx_capability, 0, sizeof(c->vmx_capability));
+#endif
generic_identify(c);
+ cpu_parse_topology(c);
+
if (this_cpu->c_identify)
this_cpu->c_identify(c);
- /* Clear/Set all flags overriden by options, after probe */
- for (i = 0; i < NCAPINTS; i++) {
- c->x86_capability[i] &= ~cpu_caps_cleared[i];
- c->x86_capability[i] |= cpu_caps_set[i];
- }
+ /* Clear/Set all flags overridden by options, after probe */
+ apply_forced_caps(c);
-#ifdef CONFIG_X86_64
- c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
-#endif
+ /*
+ * Set default APIC and TSC_DEADLINE MSR fencing flag. AMD and
+ * Hygon will clear it in ->c_init() below.
+ */
+ set_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
/*
* Vendor-specific initialization. In this section we
@@ -788,9 +2029,22 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
if (this_cpu->c_init)
this_cpu->c_init(c);
+ bus_lock_init();
+
/* Disable the PN if appropriate */
squash_the_stupid_serial_number(c);
+ setup_smep(c);
+ setup_smap(c);
+ setup_umip(c);
+ setup_lass(c);
+
+ /* Enable FSGSBASE instructions if available. */
+ if (cpu_has(c, X86_FEATURE_FSGSBASE)) {
+ cr4_set_bits(X86_CR4_FSGSBASE);
+ elf_hwcap2 |= HWCAP2_FSGSBASE;
+ }
+
/*
* The vendor-specific functions might have changed features.
* Now we do "generic changes."
@@ -799,6 +2053,9 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
/* Filter out anything that depends on CPUID levels we don't have */
filter_cpuid_features(c, true);
+ /* Check for unmet dependencies based on the CPUID dependency table */
+ check_cpufeature_deps(c);
+
/* If the model name is still unset, do table lookup. */
if (!c->x86_model_id[0]) {
const char *p;
@@ -811,20 +2068,15 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
c->x86, c->x86_model);
}
-#ifdef CONFIG_X86_64
- detect_ht(c);
-#endif
-
- init_hypervisor(c);
+ x86_init_rdrand(c);
+ setup_pku(c);
+ setup_cet(c);
/*
- * Clear/Set all flags overriden by options, need do it
+ * Clear/Set all flags overridden by options, need do it
* before following smp all cpus cap AND.
*/
- for (i = 0; i < NCAPINTS; i++) {
- c->x86_capability[i] &= ~cpu_caps_cleared[i];
- c->x86_capability[i] |= cpu_caps_set[i];
- }
+ apply_forced_caps(c);
/*
* On SMP, boot_cpu_data holds the common feature set between
@@ -836,106 +2088,89 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
/* AND the already accumulated flags with these */
for (i = 0; i < NCAPINTS; i++)
boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
+
+ /* OR, i.e. replicate the bug flags */
+ for (i = NCAPINTS; i < NCAPINTS + NBUGINTS; i++)
+ c->x86_capability[i] |= boot_cpu_data.x86_capability[i];
}
-#ifdef CONFIG_X86_MCE
- /* Init Machine Check Exception if available. */
- mcheck_init(c);
-#endif
+ ppin_init(c);
- select_idle_routine(c);
+ /* Init Machine Check Exception if available. */
+ mcheck_cpu_init(c);
-#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
numa_add_cpu(smp_processor_id());
-#endif
}
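
The merge at the end of identify_cpu() is asymmetric: feature words are AND-ed so only capabilities common to every CPU survive, while bug words are OR-ed so a bug found on any CPU is reported on all. A standalone sketch with two small word arrays:

#include <stdio.h>

#define NCAP	2	/* feature words (stand-in for NCAPINTS) */
#define NBUG	1	/* bug words     (stand-in for NBUGINTS)  */

int main(void)
{
	unsigned int boot[NCAP + NBUG] = { 0xff, 0x0f, 0x1 };	/* boot CPU */
	unsigned int ap[NCAP + NBUG]   = { 0xf0, 0xff, 0x2 };	/* secondary */
	int i;

	for (i = 0; i < NCAP; i++)
		boot[i] &= ap[i];		/* keep common features only */
	for (i = NCAP; i < NCAP + NBUG; i++)
		ap[i] |= boot[i];		/* replicate known bugs */

	printf("features: %#x %#x, AP bugs: %#x\n", boot[0], boot[1], ap[2]);
	return 0;
}
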
-#ifdef CONFIG_X86_64
-static void vgetcpu_set_mode(void)
+/*
+ * Set up the CPU state needed to execute SYSENTER/SYSEXIT instructions
+ * on 32-bit kernels:
+ */
+#ifdef CONFIG_X86_32
+void enable_sep_cpu(void)
{
- if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
- vgetcpu_mode = VGETCPU_RDTSCP;
- else
- vgetcpu_mode = VGETCPU_LSL;
+ struct tss_struct *tss;
+ int cpu;
+
+ if (!boot_cpu_has(X86_FEATURE_SEP))
+ return;
+
+ cpu = get_cpu();
+ tss = &per_cpu(cpu_tss_rw, cpu);
+
+ /*
+ * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field --
+ * see the big comment in struct x86_hw_tss's definition.
+ */
+
+ tss->x86_tss.ss1 = __KERNEL_CS;
+ wrmsrq(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1);
+ wrmsrq(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1));
+ wrmsrq(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32);
+
+ put_cpu();
}
#endif
-void __init identify_boot_cpu(void)
+static __init void identify_boot_cpu(void)
{
identify_cpu(&boot_cpu_data);
- init_c1e_mask();
+ if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT))
+ pr_info("CET detected: Indirect Branch Tracking enabled\n");
#ifdef CONFIG_X86_32
- sysenter_setup();
enable_sep_cpu();
-#else
- vgetcpu_set_mode();
#endif
- init_hw_perf_counters();
+ cpu_detect_tlb(&boot_cpu_data);
+ setup_cr_pinning();
+
+ tsx_init();
+ tdx_init();
+ lkgs_init();
}
-void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
+void identify_secondary_cpu(unsigned int cpu)
{
- BUG_ON(c == &boot_cpu_data);
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ /* Copy boot_cpu_data only on the first bringup */
+ if (!c->initialized)
+ *c = boot_cpu_data;
+ c->cpu_index = cpu;
+
identify_cpu(c);
#ifdef CONFIG_X86_32
enable_sep_cpu();
#endif
- mtrr_ap_init();
-}
-
-struct msr_range {
- unsigned min;
- unsigned max;
-};
-
-static const struct msr_range msr_range_array[] __cpuinitconst = {
- { 0x00000000, 0x00000418},
- { 0xc0000000, 0xc000040b},
- { 0xc0010000, 0xc0010142},
- { 0xc0011000, 0xc001103b},
-};
+ x86_spec_ctrl_setup_ap();
+ update_srbds_msr();
+ if (boot_cpu_has_bug(X86_BUG_GDS))
+ update_gds_msr();
-static void __cpuinit print_cpu_msr(void)
-{
- unsigned index_min, index_max;
- unsigned index;
- u64 val;
- int i;
-
- for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) {
- index_min = msr_range_array[i].min;
- index_max = msr_range_array[i].max;
-
- for (index = index_min; index < index_max; index++) {
- if (rdmsrl_amd_safe(index, &val))
- continue;
- printk(KERN_INFO " MSR%08x: %016llx\n", index, val);
- }
- }
-}
-
-static int show_msr __cpuinitdata;
-
-static __init int setup_show_msr(char *arg)
-{
- int num;
-
- get_option(&arg, &num);
-
- if (num > 0)
- show_msr = num;
- return 1;
-}
-__setup("show_msr=", setup_show_msr);
-
-static __init int setup_noclflush(char *arg)
-{
- setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
- return 1;
+ tsx_ap_init();
+ c->initialized = true;
}
-__setup("noclflush", setup_noclflush);
-void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
+void print_cpu_info(struct cpuinfo_x86 *c)
{
const char *vendor = NULL;
@@ -947,304 +2182,456 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
}
if (vendor && !strstr(c->x86_model_id, vendor))
- printk(KERN_CONT "%s ", vendor);
+ pr_cont("%s ", vendor);
if (c->x86_model_id[0])
- printk(KERN_CONT "%s", c->x86_model_id);
+ pr_cont("%s", c->x86_model_id);
else
- printk(KERN_CONT "%d86", c->x86);
+ pr_cont("%d86", c->x86);
- if (c->x86_mask || c->cpuid_level >= 0)
- printk(KERN_CONT " stepping %02x\n", c->x86_mask);
- else
- printk(KERN_CONT "\n");
+ pr_cont(" (family: 0x%x, model: 0x%x", c->x86, c->x86_model);
-#ifdef CONFIG_SMP
- if (c->cpu_index < show_msr)
- print_cpu_msr();
-#else
- if (show_msr)
- print_cpu_msr();
-#endif
+ if (c->x86_stepping || c->cpuid_level >= 0)
+ pr_cont(", stepping: 0x%x)\n", c->x86_stepping);
+ else
+ pr_cont(")\n");
}
-static __init int setup_disablecpuid(char *arg)
-{
- int bit;
-
- if (get_option(&arg, &bit) && bit < NCAPINTS*32)
- setup_clear_cpu_cap(bit);
- else
- return 0;
+/*
+ * clearcpuid= and setcpuid= were already parsed in cpu_parse_early_param().
+ * These dummy functions prevent them from becoming environment variables for
+ * init.
+ */
+static __init int setup_clearcpuid(char *arg)
+{
return 1;
}
-__setup("clearcpuid=", setup_disablecpuid);
+__setup("clearcpuid=", setup_clearcpuid);
-#ifdef CONFIG_X86_64
-struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
-
-DEFINE_PER_CPU_FIRST(union irq_stack_union,
- irq_stack_union) __aligned(PAGE_SIZE);
+static __init int setup_setcpuid(char *arg)
+{
+ return 1;
+}
+__setup("setcpuid=", setup_setcpuid);
-DEFINE_PER_CPU(char *, irq_stack_ptr) =
- init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
+DEFINE_PER_CPU_CACHE_HOT(struct task_struct *, current_task) = &init_task;
+EXPORT_PER_CPU_SYMBOL(current_task);
+EXPORT_PER_CPU_SYMBOL(const_current_task);
-DEFINE_PER_CPU(unsigned long, kernel_stack) =
- (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE;
-EXPORT_PER_CPU_SYMBOL(kernel_stack);
+DEFINE_PER_CPU_CACHE_HOT(int, __preempt_count) = INIT_PREEMPT_COUNT;
+EXPORT_PER_CPU_SYMBOL(__preempt_count);
-DEFINE_PER_CPU(unsigned int, irq_count) = -1;
+DEFINE_PER_CPU_CACHE_HOT(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK;
+#ifdef CONFIG_X86_64
/*
- * Special IST stacks which the CPU switches to when it calls
- * an IST-marked descriptor entry. Up to 7 stacks (hardware
- * limit), all of them are 4K, except the debug stack which
- * is 8K.
+ * Note: Do not make this dependent on CONFIG_MITIGATION_CALL_DEPTH_TRACKING
+ * so that this space is reserved in the hot cache section even when the
+ * mitigation is disabled.
*/
-static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
- [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
- [DEBUG_STACK - 1] = DEBUG_STKSZ
-};
+DEFINE_PER_CPU_CACHE_HOT(u64, __x86_call_depth);
+EXPORT_PER_CPU_SYMBOL(__x86_call_depth);
+
+static void wrmsrq_cstar(unsigned long val)
+{
+ /*
+ * Intel CPUs do not support 32-bit SYSCALL. Writing to MSR_CSTAR
+ * is so far ignored by the CPU, but raises a #VE trap in a TDX
+ * guest. Avoid the pointless write on all Intel CPUs.
+ */
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ wrmsrq(MSR_CSTAR, val);
+}
+
+static inline void idt_syscall_init(void)
+{
+ wrmsrq(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
+
+ if (ia32_enabled()) {
+ wrmsrq_cstar((unsigned long)entry_SYSCALL_compat);
+ /*
+ * This only works on Intel CPUs.
+ * On AMD CPUs these MSRs are 32-bit, so the CPU truncates MSR_IA32_SYSENTER_EIP.
+ * This does not cause SYSENTER to jump to the wrong location, because
+ * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
+ */
+ wrmsrq_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
+ wrmsrq_safe(MSR_IA32_SYSENTER_ESP,
+ (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1));
+ wrmsrq_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
+ } else {
+ wrmsrq_cstar((unsigned long)entry_SYSCALL32_ignore);
+ wrmsrq_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
+ wrmsrq_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+ wrmsrq_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
+ }
-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
- [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ])
- __aligned(PAGE_SIZE);
+ /*
+ * Flags to clear on syscall; clear as much as possible
+ * to minimize user space-kernel interference.
+ */
+ wrmsrq(MSR_SYSCALL_MASK,
+ X86_EFLAGS_CF|X86_EFLAGS_PF|X86_EFLAGS_AF|
+ X86_EFLAGS_ZF|X86_EFLAGS_SF|X86_EFLAGS_TF|
+ X86_EFLAGS_IF|X86_EFLAGS_DF|X86_EFLAGS_OF|
+ X86_EFLAGS_IOPL|X86_EFLAGS_NT|X86_EFLAGS_RF|
+ X86_EFLAGS_AC|X86_EFLAGS_ID);
+}
/* May not be marked __init: used by software suspend */
void syscall_init(void)
{
+ /* The default user and kernel segments */
+ wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
+
/*
- * LSTAR and STAR live in a bit strange symbiosis.
- * They both write to the same internal register. STAR allows to
- * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
+ * Except the IA32_STAR MSR, there is NO need to setup SYSCALL and
+ * SYSENTER MSRs for FRED, because FRED uses the ring 3 FRED
+ * entrypoint for SYSCALL and SYSENTER, and ERETU is the only legit
+ * instruction to return to ring 3 (both sysexit and sysret cause
+ * #UD when FRED is enabled).
*/
- wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
- wrmsrl(MSR_LSTAR, system_call);
- wrmsrl(MSR_CSTAR, ignore_sysret);
+ if (!cpu_feature_enabled(X86_FEATURE_FRED))
+ idt_syscall_init();
+}
+#endif /* CONFIG_X86_64 */
-#ifdef CONFIG_IA32_EMULATION
- syscall32_cpu_init();
+#ifdef CONFIG_STACKPROTECTOR
+DEFINE_PER_CPU_CACHE_HOT(unsigned long, __stack_chk_guard);
+#ifndef CONFIG_SMP
+EXPORT_PER_CPU_SYMBOL(__stack_chk_guard);
+#endif
#endif
- /* Flags to clear on syscall */
- wrmsrl(MSR_SYSCALL_MASK,
- X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
+static void initialize_debug_regs(void)
+{
+ /* Control register first -- to make sure everything is disabled. */
+ set_debugreg(DR7_FIXED_1, 7);
+ set_debugreg(DR6_RESERVED, 6);
+ /* dr5 and dr4 don't exist */
+ set_debugreg(0, 3);
+ set_debugreg(0, 2);
+ set_debugreg(0, 1);
+ set_debugreg(0, 0);
}
-unsigned long kernel_eflags;
-
+#ifdef CONFIG_KGDB
/*
- * Copies of the original ist values from the tss are only accessed during
- * debugging, no special alignment required.
+ * Restore debug regs if using kgdbwait and you have a kernel debugger
+ * connection established.
*/
-DEFINE_PER_CPU(struct orig_ist, orig_ist);
+static void dbg_restore_debug_regs(void)
+{
+ if (unlikely(kgdb_connected && arch_kgdb_ops.correct_hw_break))
+ arch_kgdb_ops.correct_hw_break();
+}
+#else /* ! CONFIG_KGDB */
+#define dbg_restore_debug_regs()
+#endif /* ! CONFIG_KGDB */
-#else /* CONFIG_X86_64 */
+static inline void setup_getcpu(int cpu)
+{
+ unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu));
+ struct desc_struct d = { };
-#ifdef CONFIG_CC_STACKPROTECTOR
-DEFINE_PER_CPU(unsigned long, stack_canary);
-#endif
+ if (boot_cpu_has(X86_FEATURE_RDTSCP) || boot_cpu_has(X86_FEATURE_RDPID))
+ wrmsrq(MSR_TSC_AUX, cpudata);
+
+ /* Store CPU and node number in limit. */
+ d.limit0 = cpudata;
+ d.limit1 = cpudata >> 16;
-/* Make sure %fs and %gs are initialized properly in idle threads */
-struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
+ d.type = 5; /* RO data, expand down, accessed */
+ d.dpl = 3; /* Visible to user code */
+ d.s = 1; /* Not a system segment */
+ d.p = 1; /* Present */
+ d.d = 1; /* 32-bit */
+
+ write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_CPUNODE, &d, DESCTYPE_S);
+}
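
setup_getcpu() packs the CPU and node numbers into one value that is written both to MSR_TSC_AUX and into the segment limit read back by the vDSO. A standalone pack/unpack sketch, assuming a 12-bit CPU field for the split (that constant is an assumption here, not taken from this hunk):

#include <stdio.h>

#define CPUNODE_BITS	12				/* assumed width of the cpu field */
#define CPUNODE_MASK	((1u << CPUNODE_BITS) - 1)

static unsigned int encode_cpunode(unsigned int cpu, unsigned int node)
{
	return (node << CPUNODE_BITS) | cpu;
}

int main(void)
{
	unsigned int cpu = 17, node = 1;
	unsigned int v = encode_cpunode(cpu, node);

	printf("cpu %u node %u -> %#x -> cpu %u node %u\n",
	       cpu, node, v, v & CPUNODE_MASK, v >> CPUNODE_BITS);
	return 0;
}
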
+
+#ifdef CONFIG_X86_64
+static inline void tss_setup_ist(struct tss_struct *tss)
+{
+ /* Set up the per-CPU TSS IST stacks */
+ tss->x86_tss.ist[IST_INDEX_DF] = __this_cpu_ist_top_va(DF);
+ tss->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI);
+ tss->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB);
+ tss->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE);
+ /* Only mapped when SEV-ES is active */
+ tss->x86_tss.ist[IST_INDEX_VC] = __this_cpu_ist_top_va(VC);
+}
+#else /* CONFIG_X86_64 */
+static inline void tss_setup_ist(struct tss_struct *tss) { }
+#endif /* !CONFIG_X86_64 */
+
+static inline void tss_setup_io_bitmap(struct tss_struct *tss)
{
- memset(regs, 0, sizeof(struct pt_regs));
- regs->fs = __KERNEL_PERCPU;
- regs->gs = __KERNEL_STACK_CANARY;
+ tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_INVALID;
- return regs;
+#ifdef CONFIG_X86_IOPL_IOPERM
+ tss->io_bitmap.prev_max = 0;
+ tss->io_bitmap.prev_sequence = 0;
+ memset(tss->io_bitmap.bitmap, 0xff, sizeof(tss->io_bitmap.bitmap));
+ /*
+ * Invalidate the extra array entry past the end of the all
+ * permission bitmap as required by the hardware.
+ */
+ tss->io_bitmap.mapall[IO_BITMAP_LONGS] = ~0UL;
+#endif
}
-#endif /* CONFIG_X86_64 */
/*
- * Clear all 6 debug registers:
+ * Setup everything needed to handle exceptions from the IDT, including the IST
+ * exceptions which use paranoid_entry().
*/
-static void clear_all_debug_regs(void)
+void cpu_init_exception_handling(bool boot_cpu)
{
- int i;
+ struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
+ int cpu = raw_smp_processor_id();
- for (i = 0; i < 8; i++) {
- /* Ignore db4, db5 */
- if ((i == 4) || (i == 5))
- continue;
+ /* paranoid_entry() gets the CPU number from the GDT */
+ setup_getcpu(cpu);
- set_debugreg(0, i);
+ /* For IDT mode, IST vectors need to be set in TSS. */
+ if (!cpu_feature_enabled(X86_FEATURE_FRED))
+ tss_setup_ist(tss);
+ tss_setup_io_bitmap(tss);
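+ /* Publish this CPU's TSS in the GDT and load the task register. */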
+ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
+
+ load_TR_desc();
+
+ /* The GHCB needs to be set up to handle #VC. */
+ setup_ghcb();
+
+ if (cpu_feature_enabled(X86_FEATURE_FRED)) {
+ /* The boot CPU has enabled FRED during early boot */
+ if (!boot_cpu)
+ cpu_init_fred_exceptions();
+
+ cpu_init_fred_rsps();
+ } else {
+ load_current_idt();
}
}
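+/*
+ * Replace the early boot trap setup: switch to FRED when it is supported,
+ * otherwise install the early #PF handler in the IDT.
+ */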
+void __init cpu_init_replace_early_idt(void)
+{
+ if (cpu_feature_enabled(X86_FEATURE_FRED))
+ cpu_init_fred_exceptions();
+ else
+ idt_setup_early_pf();
+}
+
/*
* cpu_init() initializes state that is per-CPU. Some data is already
- * initialized (naturally) in the bootstrap process, such as the GDT
- * and IDT. We reload them nevertheless, this function acts as a
- * 'CPU state barrier', nothing should get across.
- * A lot of state is already set up in PDA init for 64 bit
+ * initialized (naturally) in the bootstrap process, such as the GDT. We
+ * reload it nevertheless; this function acts as a 'CPU state barrier',
+ * and nothing should get across.
*/
-#ifdef CONFIG_X86_64
-
-void __cpuinit cpu_init(void)
+void cpu_init(void)
{
- struct orig_ist *orig_ist;
- struct task_struct *me;
- struct tss_struct *t;
- unsigned long v;
- int cpu;
- int i;
-
- cpu = stack_smp_processor_id();
- t = &per_cpu(init_tss, cpu);
- orig_ist = &per_cpu(orig_ist, cpu);
+ struct task_struct *cur = current;
+ int cpu = raw_smp_processor_id();
#ifdef CONFIG_NUMA
- if (cpu != 0 && percpu_read(node_number) == 0 &&
- cpu_to_node(cpu) != NUMA_NO_NODE)
- percpu_write(node_number, cpu_to_node(cpu));
+ if (this_cpu_read(numa_node) == 0 &&
+ early_cpu_to_node(cpu) != NUMA_NO_NODE)
+ set_numa_node(early_cpu_to_node(cpu));
#endif
+ pr_debug("Initializing CPU#%d\n", cpu);
+
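+ /* Start from a known CR4 state: clear the VME, PVI, TSD and DE bits. */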
+ if (IS_ENABLED(CONFIG_X86_64) || cpu_feature_enabled(X86_FEATURE_VME) ||
+ boot_cpu_has(X86_FEATURE_TSC) || boot_cpu_has(X86_FEATURE_DE))
+ cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ loadsegment(fs, 0);
+ memset(cur->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
+ syscall_init();
- me = current;
+ wrmsrq(MSR_FS_BASE, 0);
+ wrmsrq(MSR_KERNEL_GS_BASE, 0);
+ barrier();
- if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask))
- panic("CPU#%d already initialized!\n", cpu);
+ x2apic_setup();
- printk(KERN_INFO "Initializing CPU#%d\n", cpu);
+ intel_posted_msi_init();
+ }
- clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+ mmgrab(&init_mm);
+ cur->active_mm = &init_mm;
+ BUG_ON(cur->mm);
+ initialize_tlbstate_and_flush();
+ enter_lazy_tlb(&init_mm, cur);
/*
- * Initialize the per-CPU GDT with the boot GDT,
- * and set up the GDT descriptor:
+ * sp0 points to the entry trampoline stack regardless of what task
+ * is running.
*/
+ load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1));
- switch_to_new_gdt(cpu);
- loadsegment(fs, 0);
-
- load_idt((const struct desc_ptr *)&idt_descr);
+ load_mm_ldt(&init_mm);
- memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
- syscall_init();
+ initialize_debug_regs();
+ dbg_restore_debug_regs();
- wrmsrl(MSR_FS_BASE, 0);
- wrmsrl(MSR_KERNEL_GS_BASE, 0);
- barrier();
+ doublefault_init_cpu_tss();
- check_efer();
- if (cpu != 0)
- enable_x2apic();
+ if (is_uv_system())
+ uv_cpu_init();
- /*
- * set up and load the per-CPU TSS
- */
- if (!orig_ist->ist[0]) {
- char *estacks = per_cpu(exception_stacks, cpu);
+ load_fixmap_gdt(cpu);
+}
- for (v = 0; v < N_EXCEPTION_STACKS; v++) {
- estacks += exception_stack_sizes[v];
- orig_ist->ist[v] = t->x86_tss.ist[v] =
- (unsigned long)estacks;
- }
- }
+#ifdef CONFIG_MICROCODE_LATE_LOADING
+/**
+ * store_cpu_caps() - Store a snapshot of CPU capabilities
+ * @curr_info: Pointer where to store it
+ *
+ * Return: None
+ */
+void store_cpu_caps(struct cpuinfo_x86 *curr_info)
+{
+ /* Reload CPUID max function as it might've changed. */
+ curr_info->cpuid_level = cpuid_eax(0);
- t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+ /* Copy all capability leaves and pick up the synthetic ones. */
+ memcpy(&curr_info->x86_capability, &boot_cpu_data.x86_capability,
+ sizeof(curr_info->x86_capability));
- /*
- * <= is required because the CPU will access up to
- * 8 bits beyond the end of the IO permission bitmap.
- */
- for (i = 0; i <= IO_BITMAP_LONGS; i++)
- t->io_bitmap[i] = ~0UL;
+ /* Get the hardware CPUID leaves */
+ get_cpu_cap(curr_info);
+}
- atomic_inc(&init_mm.mm_count);
- me->active_mm = &init_mm;
- BUG_ON(me->mm);
- enter_lazy_tlb(&init_mm, me);
+/**
+ * microcode_check() - Check if any CPU capabilities changed after an update.
+ * @prev_info: CPU capabilities stored before an update.
+ *
+ * The microcode loader calls this upon late microcode load to recheck features,
+ * only when microcode has been updated. The caller holds the CPU hotplug lock.
+ *
+ * Return: None
+ */
+void microcode_check(struct cpuinfo_x86 *prev_info)
+{
+ struct cpuinfo_x86 curr_info;
- load_sp0(t, &current->thread);
- set_tss_desc(cpu, t);
- load_TR_desc();
- load_LDT(&init_mm.context);
+ perf_check_microcode();
-#ifdef CONFIG_KGDB
- /*
- * If the kgdb is connected no debug regs should be altered. This
- * is only applicable when KGDB and a KGDB I/O module are built
- * into the kernel and you are using early debugging with
- * kgdbwait. KGDB will control the kernel HW breakpoint registers.
- */
- if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
- arch_kgdb_ops.correct_hw_break();
- else
-#endif
- clear_all_debug_regs();
+ amd_check_microcode();
- fpu_init();
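+ /* Snapshot the CPU capabilities as seen with the new microcode. */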
+ store_cpu_caps(&curr_info);
- raw_local_save_flags(kernel_eflags);
+ if (!memcmp(&prev_info->x86_capability, &curr_info.x86_capability,
+ sizeof(prev_info->x86_capability)))
+ return;
- if (is_uv_system())
- uv_cpu_init();
+ pr_warn("x86/CPU: CPU features have changed after loading microcode, but might not take effect.\n");
+ pr_warn("x86/CPU: Please consider either early loading through initrd/built-in or a potential BIOS update.\n");
}
+#endif
-#else
-
-void __cpuinit cpu_init(void)
+/*
+ * Invoked from core CPU hotplug code after hotplug operations
+ */
+void arch_smt_update(void)
{
- int cpu = smp_processor_id();
- struct task_struct *curr = current;
- struct tss_struct *t = &per_cpu(init_tss, cpu);
- struct thread_struct *thread = &curr->thread;
-
- if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) {
- printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
- for (;;)
- local_irq_enable();
- }
+ /* Handle the speculative execution misfeatures */
+ cpu_bugs_smt_update();
+ /* Check whether IPI broadcasting can be enabled */
+ apic_smt_update();
+}
- printk(KERN_INFO "Initializing CPU#%d\n", cpu);
+void __init arch_cpu_finalize_init(void)
+{
+ struct cpuinfo_x86 *c = this_cpu_ptr(&cpu_info);
- if (cpu_has_vme || cpu_has_tsc || cpu_has_de)
- clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+ identify_boot_cpu();
- load_idt(&idt_descr);
- switch_to_new_gdt(cpu);
+ select_idle_routine();
/*
- * Set up and load the per-CPU TSS and LDT
+ * identify_boot_cpu() initialized the SMT support information; let the
+ * core code know.
*/
- atomic_inc(&init_mm.mm_count);
- curr->active_mm = &init_mm;
- BUG_ON(curr->mm);
- enter_lazy_tlb(&init_mm, curr);
+ cpu_smt_set_num_threads(__max_threads_per_core, __max_threads_per_core);
- load_sp0(t, thread);
- set_tss_desc(cpu, t);
- load_TR_desc();
- load_LDT(&init_mm.context);
+ if (!IS_ENABLED(CONFIG_SMP)) {
+ pr_info("CPU: ");
+ print_cpu_info(&boot_cpu_data);
+ }
- t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+ cpu_select_mitigations();
-#ifdef CONFIG_DOUBLEFAULT
- /* Set up doublefault TSS pointer in the GDT */
- __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
-#endif
+ arch_smt_update();
+
+ if (IS_ENABLED(CONFIG_X86_32)) {
+ /*
+ * Check whether this is a real i386, which is no longer
+ * supported, and fix up the utsname.
+ */
+ if (boot_cpu_data.x86 < 4)
+ panic("Kernel requires i486+ for 'invlpg' and other features");
+
+ init_utsname()->machine[1] =
+ '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
+ }
- clear_all_debug_regs();
+ /*
+ * Must be before alternatives because it might set or clear
+ * feature bits.
+ */
+ fpu__init_system();
+ fpu__init_cpu();
/*
- * Force FPU initialization:
+ * This needs to follow the FPU initialization, since EFI depends on it.
*/
- if (cpu_has_xsave)
- current_thread_info()->status = TS_XSAVE;
- else
- current_thread_info()->status = 0;
- clear_used_math();
- mxcsr_feature_mask_init();
+ if (efi_enabled(EFI_RUNTIME_SERVICES))
+ efi_enter_virtual_mode();
/*
- * Boot processor to setup the FP and extended state context info.
+ * Ensure that access to the per CPU representation has the initial
+ * boot CPU configuration.
*/
- if (smp_processor_id() == boot_cpu_id)
- init_thread_xstate();
+ *c = boot_cpu_data;
+ c->initialized = true;
+
+ alternative_instructions();
+
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ USER_PTR_MAX = TASK_SIZE_MAX;
- xsave_init();
+ /*
+ * Enable this when LAM is gated on LASS support
+ if (cpu_feature_enabled(X86_FEATURE_LAM))
+ USER_PTR_MAX = (1ul << 63) - PAGE_SIZE;
+ */
+ runtime_const_init(ptr, USER_PTR_MAX);
+
+ /*
+ * Make sure the first 2MB area is not mapped by huge pages.
+ * There are typically fixed-size MTRRs in there, and MTRRs
+ * overlapping large pages cause slowdowns.
+ *
+ * Right now we don't do that with gbpages because there seems to be
+ * very little benefit for that case.
+ */
+ if (!direct_gbpages)
+ set_memory_4k((unsigned long)__va(0), 1);
+ } else {
+ fpu__init_check_bugs();
+ }
+
+ /*
+ * This needs to be called before any devices perform DMA
+ * operations that might use the SWIOTLB bounce buffers. It will
+ * mark the bounce buffers as decrypted so that their usage will
+ * not cause "plain-text" data to be decrypted when accessed. It
+ * must be called after late_time_init() so that Hyper-V x86/x64
+ * hypercalls work when the SWIOTLB bounce buffers are decrypted.
+ */
+ mem_encrypt_init();
}
-#endif
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 6de9a908e400..5c7a3a71191a 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -1,12 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef ARCH_X86_CPU_H
-
#define ARCH_X86_CPU_H
-struct cpu_model_info {
- int vendor;
- int family;
- const char *model_names[16];
-};
+#include <asm/cpu.h>
+#include <asm/topology.h>
+
+#include "topology.h"
/* attempt to consolidate cpu attributes */
struct cpu_dev {
@@ -15,23 +14,81 @@ struct cpu_dev {
/* some have two possibilities for cpuid string */
const char *c_ident[2];
- struct cpu_model_info c_models[4];
-
void (*c_early_init)(struct cpuinfo_x86 *);
+ void (*c_bsp_init)(struct cpuinfo_x86 *);
void (*c_init)(struct cpuinfo_x86 *);
void (*c_identify)(struct cpuinfo_x86 *);
- unsigned int (*c_size_cache)(struct cpuinfo_x86 *, unsigned int);
+ void (*c_detect_tlb)(struct cpuinfo_x86 *);
int c_x86_vendor;
+#ifdef CONFIG_X86_32
+ /* Optional vendor specific routine to obtain the cache size. */
+ unsigned int (*legacy_cache_size)(struct cpuinfo_x86 *,
+ unsigned int);
+
+ /* Family/stepping-based lookup table for model names. */
+ struct legacy_cpu_model_info {
+ int family;
+ const char *model_names[16];
+ } legacy_models[5];
+#endif
};
#define cpu_dev_register(cpu_devX) \
static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \
- __attribute__((__section__(".x86_cpu_dev.init"))) = \
+ __section(".x86_cpu_dev.init") = \
&cpu_devX;
extern const struct cpu_dev *const __x86_cpu_dev_start[],
*const __x86_cpu_dev_end[];
-extern void display_cacheinfo(struct cpuinfo_x86 *c);
+#ifdef CONFIG_CPU_SUP_INTEL
+extern void __init tsx_init(void);
+void tsx_ap_init(void);
+void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c);
+#else
+static inline void tsx_init(void) { }
+static inline void tsx_ap_init(void) { }
+static inline void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c) { }
+#endif /* CONFIG_CPU_SUP_INTEL */
+
+extern void init_spectral_chicken(struct cpuinfo_x86 *c);
+
+extern void get_cpu_cap(struct cpuinfo_x86 *c);
+extern void get_cpu_address_sizes(struct cpuinfo_x86 *c);
+extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
+extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
+extern void init_intel_cacheinfo(struct cpuinfo_x86 *c);
+extern void init_amd_cacheinfo(struct cpuinfo_x86 *c);
+extern void init_hygon_cacheinfo(struct cpuinfo_x86 *c);
+extern void check_null_seg_clears_base(struct cpuinfo_x86 *c);
+
+void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id);
+void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c);
+
+#if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS)
+struct amd_northbridge *amd_init_l3_cache(int index);
+#else
+static inline struct amd_northbridge *amd_init_l3_cache(int index)
+{
+ return NULL;
+}
#endif
+
+unsigned int aperfmperf_get_khz(int cpu);
+void cpu_select_mitigations(void);
+
+extern void x86_spec_ctrl_setup_ap(void);
+extern void update_srbds_msr(void);
+extern void update_gds_msr(void);
+
+extern enum spectre_v2_mitigation spectre_v2_enabled;
+
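+/* True if the selected spectre_v2 mitigation is based on enhanced IBRS. */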
+static inline bool spectre_v2_in_eibrs_mode(enum spectre_v2_mitigation mode)
+{
+ return mode == SPECTRE_V2_EIBRS ||
+ mode == SPECTRE_V2_EIBRS_RETPOLINE ||
+ mode == SPECTRE_V2_EIBRS_LFENCE;
+}
+
+#endif /* ARCH_X86_CPU_H */
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
deleted file mode 100644
index 6b2a52dd0403..000000000000
--- a/arch/x86/kernel/cpu/cpu_debug.c
+++ /dev/null
@@ -1,688 +0,0 @@
-/*
- * CPU x86 architecture debug code
- *
- * Copyright(C) 2009 Jaswinder Singh Rajput
- *
- * For licencing details see kernel-base/COPYING
- */
-
-#include <linux/interrupt.h>
-#include <linux/compiler.h>
-#include <linux/seq_file.h>
-#include <linux/debugfs.h>
-#include <linux/kprobes.h>
-#include <linux/uaccess.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/percpu.h>
-#include <linux/signal.h>
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/smp.h>
-
-#include <asm/cpu_debug.h>
-#include <asm/paravirt.h>
-#include <asm/system.h>
-#include <asm/traps.h>
-#include <asm/apic.h>
-#include <asm/desc.h>
-
-static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]);
-static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]);
-static DEFINE_PER_CPU(int, cpu_priv_count);
-
-static DEFINE_MUTEX(cpu_debug_lock);
-
-static struct dentry *cpu_debugfs_dir;
-
-static struct cpu_debug_base cpu_base[] = {
- { "mc", CPU_MC, 0 },
- { "monitor", CPU_MONITOR, 0 },
- { "time", CPU_TIME, 0 },
- { "pmc", CPU_PMC, 1 },
- { "platform", CPU_PLATFORM, 0 },
- { "apic", CPU_APIC, 0 },
- { "poweron", CPU_POWERON, 0 },
- { "control", CPU_CONTROL, 0 },
- { "features", CPU_FEATURES, 0 },
- { "lastbranch", CPU_LBRANCH, 0 },
- { "bios", CPU_BIOS, 0 },
- { "freq", CPU_FREQ, 0 },
- { "mtrr", CPU_MTRR, 0 },
- { "perf", CPU_PERF, 0 },
- { "cache", CPU_CACHE, 0 },
- { "sysenter", CPU_SYSENTER, 0 },
- { "therm", CPU_THERM, 0 },
- { "misc", CPU_MISC, 0 },
- { "debug", CPU_DEBUG, 0 },
- { "pat", CPU_PAT, 0 },
- { "vmx", CPU_VMX, 0 },
- { "call", CPU_CALL, 0 },
- { "base", CPU_BASE, 0 },
- { "ver", CPU_VER, 0 },
- { "conf", CPU_CONF, 0 },
- { "smm", CPU_SMM, 0 },
- { "svm", CPU_SVM, 0 },
- { "osvm", CPU_OSVM, 0 },
- { "tss", CPU_TSS, 0 },
- { "cr", CPU_CR, 0 },
- { "dt", CPU_DT, 0 },
- { "registers", CPU_REG_ALL, 0 },
-};
-
-static struct cpu_file_base cpu_file[] = {
- { "index", CPU_REG_ALL, 0 },
- { "value", CPU_REG_ALL, 1 },
-};
-
-/* CPU Registers Range */
-static struct cpu_debug_range cpu_reg_range[] = {
- { 0x00000000, 0x00000001, CPU_MC, },
- { 0x00000006, 0x00000007, CPU_MONITOR, },
- { 0x00000010, 0x00000010, CPU_TIME, },
- { 0x00000011, 0x00000013, CPU_PMC, },
- { 0x00000017, 0x00000017, CPU_PLATFORM, },
- { 0x0000001B, 0x0000001B, CPU_APIC, },
- { 0x0000002A, 0x0000002B, CPU_POWERON, },
- { 0x0000002C, 0x0000002C, CPU_FREQ, },
- { 0x0000003A, 0x0000003A, CPU_CONTROL, },
- { 0x00000040, 0x00000047, CPU_LBRANCH, },
- { 0x00000060, 0x00000067, CPU_LBRANCH, },
- { 0x00000079, 0x00000079, CPU_BIOS, },
- { 0x00000088, 0x0000008A, CPU_CACHE, },
- { 0x0000008B, 0x0000008B, CPU_BIOS, },
- { 0x0000009B, 0x0000009B, CPU_MONITOR, },
- { 0x000000C1, 0x000000C4, CPU_PMC, },
- { 0x000000CD, 0x000000CD, CPU_FREQ, },
- { 0x000000E7, 0x000000E8, CPU_PERF, },
- { 0x000000FE, 0x000000FE, CPU_MTRR, },
-
- { 0x00000116, 0x0000011E, CPU_CACHE, },
- { 0x00000174, 0x00000176, CPU_SYSENTER, },
- { 0x00000179, 0x0000017B, CPU_MC, },
- { 0x00000186, 0x00000189, CPU_PMC, },
- { 0x00000198, 0x00000199, CPU_PERF, },
- { 0x0000019A, 0x0000019A, CPU_TIME, },
- { 0x0000019B, 0x0000019D, CPU_THERM, },
- { 0x000001A0, 0x000001A0, CPU_MISC, },
- { 0x000001C9, 0x000001C9, CPU_LBRANCH, },
- { 0x000001D7, 0x000001D8, CPU_LBRANCH, },
- { 0x000001D9, 0x000001D9, CPU_DEBUG, },
- { 0x000001DA, 0x000001E0, CPU_LBRANCH, },
-
- { 0x00000200, 0x0000020F, CPU_MTRR, },
- { 0x00000250, 0x00000250, CPU_MTRR, },
- { 0x00000258, 0x00000259, CPU_MTRR, },
- { 0x00000268, 0x0000026F, CPU_MTRR, },
- { 0x00000277, 0x00000277, CPU_PAT, },
- { 0x000002FF, 0x000002FF, CPU_MTRR, },
-
- { 0x00000300, 0x00000311, CPU_PMC, },
- { 0x00000345, 0x00000345, CPU_PMC, },
- { 0x00000360, 0x00000371, CPU_PMC, },
- { 0x0000038D, 0x00000390, CPU_PMC, },
- { 0x000003A0, 0x000003BE, CPU_PMC, },
- { 0x000003C0, 0x000003CD, CPU_PMC, },
- { 0x000003E0, 0x000003E1, CPU_PMC, },
- { 0x000003F0, 0x000003F2, CPU_PMC, },
-
- { 0x00000400, 0x00000417, CPU_MC, },
- { 0x00000480, 0x0000048B, CPU_VMX, },
-
- { 0x00000600, 0x00000600, CPU_DEBUG, },
- { 0x00000680, 0x0000068F, CPU_LBRANCH, },
- { 0x000006C0, 0x000006CF, CPU_LBRANCH, },
-
- { 0x000107CC, 0x000107D3, CPU_PMC, },
-
- { 0xC0000080, 0xC0000080, CPU_FEATURES, },
- { 0xC0000081, 0xC0000084, CPU_CALL, },
- { 0xC0000100, 0xC0000102, CPU_BASE, },
- { 0xC0000103, 0xC0000103, CPU_TIME, },
-
- { 0xC0010000, 0xC0010007, CPU_PMC, },
- { 0xC0010010, 0xC0010010, CPU_CONF, },
- { 0xC0010015, 0xC0010015, CPU_CONF, },
- { 0xC0010016, 0xC001001A, CPU_MTRR, },
- { 0xC001001D, 0xC001001D, CPU_MTRR, },
- { 0xC001001F, 0xC001001F, CPU_CONF, },
- { 0xC0010030, 0xC0010035, CPU_BIOS, },
- { 0xC0010044, 0xC0010048, CPU_MC, },
- { 0xC0010050, 0xC0010056, CPU_SMM, },
- { 0xC0010058, 0xC0010058, CPU_CONF, },
- { 0xC0010060, 0xC0010060, CPU_CACHE, },
- { 0xC0010061, 0xC0010068, CPU_SMM, },
- { 0xC0010069, 0xC001006B, CPU_SMM, },
- { 0xC0010070, 0xC0010071, CPU_SMM, },
- { 0xC0010111, 0xC0010113, CPU_SMM, },
- { 0xC0010114, 0xC0010118, CPU_SVM, },
- { 0xC0010140, 0xC0010141, CPU_OSVM, },
- { 0xC0011022, 0xC0011023, CPU_CONF, },
-};
-
-static int is_typeflag_valid(unsigned cpu, unsigned flag)
-{
- int i;
-
- /* Standard Registers should be always valid */
- if (flag >= CPU_TSS)
- return 1;
-
- for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
- if (cpu_reg_range[i].flag == flag)
- return 1;
- }
-
- /* Invalid */
- return 0;
-}
-
-static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max,
- int index, unsigned flag)
-{
- if (cpu_reg_range[index].flag == flag) {
- *min = cpu_reg_range[index].min;
- *max = cpu_reg_range[index].max;
- } else
- *max = 0;
-
- return *max;
-}
-
-/* This function can also be called with seq = NULL for printk */
-static void print_cpu_data(struct seq_file *seq, unsigned type,
- u32 low, u32 high)
-{
- struct cpu_private *priv;
- u64 val = high;
-
- if (seq) {
- priv = seq->private;
- if (priv->file) {
- val = (val << 32) | low;
- seq_printf(seq, "0x%llx\n", val);
- } else
- seq_printf(seq, " %08x: %08x_%08x\n",
- type, high, low);
- } else
- printk(KERN_INFO " %08x: %08x_%08x\n", type, high, low);
-}
-
-/* This function can also be called with seq = NULL for printk */
-static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
-{
- unsigned msr, msr_min, msr_max;
- struct cpu_private *priv;
- u32 low, high;
- int i;
-
- if (seq) {
- priv = seq->private;
- if (priv->file) {
- if (!rdmsr_safe_on_cpu(priv->cpu, priv->reg,
- &low, &high))
- print_cpu_data(seq, priv->reg, low, high);
- return;
- }
- }
-
- for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
- if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag))
- continue;
-
- for (msr = msr_min; msr <= msr_max; msr++) {
- if (rdmsr_safe_on_cpu(cpu, msr, &low, &high))
- continue;
- print_cpu_data(seq, msr, low, high);
- }
- }
-}
-
-static void print_tss(void *arg)
-{
- struct pt_regs *regs = task_pt_regs(current);
- struct seq_file *seq = arg;
- unsigned int seg;
-
- seq_printf(seq, " RAX\t: %016lx\n", regs->ax);
- seq_printf(seq, " RBX\t: %016lx\n", regs->bx);
- seq_printf(seq, " RCX\t: %016lx\n", regs->cx);
- seq_printf(seq, " RDX\t: %016lx\n", regs->dx);
-
- seq_printf(seq, " RSI\t: %016lx\n", regs->si);
- seq_printf(seq, " RDI\t: %016lx\n", regs->di);
- seq_printf(seq, " RBP\t: %016lx\n", regs->bp);
- seq_printf(seq, " ESP\t: %016lx\n", regs->sp);
-
-#ifdef CONFIG_X86_64
- seq_printf(seq, " R08\t: %016lx\n", regs->r8);
- seq_printf(seq, " R09\t: %016lx\n", regs->r9);
- seq_printf(seq, " R10\t: %016lx\n", regs->r10);
- seq_printf(seq, " R11\t: %016lx\n", regs->r11);
- seq_printf(seq, " R12\t: %016lx\n", regs->r12);
- seq_printf(seq, " R13\t: %016lx\n", regs->r13);
- seq_printf(seq, " R14\t: %016lx\n", regs->r14);
- seq_printf(seq, " R15\t: %016lx\n", regs->r15);
-#endif
-
- asm("movl %%cs,%0" : "=r" (seg));
- seq_printf(seq, " CS\t: %04x\n", seg);
- asm("movl %%ds,%0" : "=r" (seg));
- seq_printf(seq, " DS\t: %04x\n", seg);
- seq_printf(seq, " SS\t: %04lx\n", regs->ss & 0xffff);
- asm("movl %%es,%0" : "=r" (seg));
- seq_printf(seq, " ES\t: %04x\n", seg);
- asm("movl %%fs,%0" : "=r" (seg));
- seq_printf(seq, " FS\t: %04x\n", seg);
- asm("movl %%gs,%0" : "=r" (seg));
- seq_printf(seq, " GS\t: %04x\n", seg);
-
- seq_printf(seq, " EFLAGS\t: %016lx\n", regs->flags);
-
- seq_printf(seq, " EIP\t: %016lx\n", regs->ip);
-}
-
-static void print_cr(void *arg)
-{
- struct seq_file *seq = arg;
-
- seq_printf(seq, " cr0\t: %016lx\n", read_cr0());
- seq_printf(seq, " cr2\t: %016lx\n", read_cr2());
- seq_printf(seq, " cr3\t: %016lx\n", read_cr3());
- seq_printf(seq, " cr4\t: %016lx\n", read_cr4_safe());
-#ifdef CONFIG_X86_64
- seq_printf(seq, " cr8\t: %016lx\n", read_cr8());
-#endif
-}
-
-static void print_desc_ptr(char *str, struct seq_file *seq, struct desc_ptr dt)
-{
- seq_printf(seq, " %s\t: %016llx\n", str, (u64)(dt.address | dt.size));
-}
-
-static void print_dt(void *seq)
-{
- struct desc_ptr dt;
- unsigned long ldt;
-
- /* IDT */
- store_idt((struct desc_ptr *)&dt);
- print_desc_ptr("IDT", seq, dt);
-
- /* GDT */
- store_gdt((struct desc_ptr *)&dt);
- print_desc_ptr("GDT", seq, dt);
-
- /* LDT */
- store_ldt(ldt);
- seq_printf(seq, " LDT\t: %016lx\n", ldt);
-
- /* TR */
- store_tr(ldt);
- seq_printf(seq, " TR\t: %016lx\n", ldt);
-}
-
-static void print_dr(void *arg)
-{
- struct seq_file *seq = arg;
- unsigned long dr;
- int i;
-
- for (i = 0; i < 8; i++) {
- /* Ignore db4, db5 */
- if ((i == 4) || (i == 5))
- continue;
- get_debugreg(dr, i);
- seq_printf(seq, " dr%d\t: %016lx\n", i, dr);
- }
-
- seq_printf(seq, "\n MSR\t:\n");
-}
-
-static void print_apic(void *arg)
-{
- struct seq_file *seq = arg;
-
-#ifdef CONFIG_X86_LOCAL_APIC
- seq_printf(seq, " LAPIC\t:\n");
- seq_printf(seq, " ID\t\t: %08x\n", apic_read(APIC_ID) >> 24);
- seq_printf(seq, " LVR\t\t: %08x\n", apic_read(APIC_LVR));
- seq_printf(seq, " TASKPRI\t: %08x\n", apic_read(APIC_TASKPRI));
- seq_printf(seq, " ARBPRI\t\t: %08x\n", apic_read(APIC_ARBPRI));
- seq_printf(seq, " PROCPRI\t: %08x\n", apic_read(APIC_PROCPRI));
- seq_printf(seq, " LDR\t\t: %08x\n", apic_read(APIC_LDR));
- seq_printf(seq, " DFR\t\t: %08x\n", apic_read(APIC_DFR));
- seq_printf(seq, " SPIV\t\t: %08x\n", apic_read(APIC_SPIV));
- seq_printf(seq, " ISR\t\t: %08x\n", apic_read(APIC_ISR));
- seq_printf(seq, " ESR\t\t: %08x\n", apic_read(APIC_ESR));
- seq_printf(seq, " ICR\t\t: %08x\n", apic_read(APIC_ICR));
- seq_printf(seq, " ICR2\t\t: %08x\n", apic_read(APIC_ICR2));
- seq_printf(seq, " LVTT\t\t: %08x\n", apic_read(APIC_LVTT));
- seq_printf(seq, " LVTTHMR\t: %08x\n", apic_read(APIC_LVTTHMR));
- seq_printf(seq, " LVTPC\t\t: %08x\n", apic_read(APIC_LVTPC));
- seq_printf(seq, " LVT0\t\t: %08x\n", apic_read(APIC_LVT0));
- seq_printf(seq, " LVT1\t\t: %08x\n", apic_read(APIC_LVT1));
- seq_printf(seq, " LVTERR\t\t: %08x\n", apic_read(APIC_LVTERR));
- seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT));
- seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT));
- seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR));
- if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
- unsigned int i, v, maxeilvt;
-
- v = apic_read(APIC_EFEAT);
- maxeilvt = (v >> 16) & 0xff;
- seq_printf(seq, " EFEAT\t\t: %08x\n", v);
- seq_printf(seq, " ECTRL\t\t: %08x\n", apic_read(APIC_ECTRL));
-
- for (i = 0; i < maxeilvt; i++) {
- v = apic_read(APIC_EILVTn(i));
- seq_printf(seq, " EILVT%d\t\t: %08x\n", i, v);
- }
- }
-#endif /* CONFIG_X86_LOCAL_APIC */
- seq_printf(seq, "\n MSR\t:\n");
-}
-
-static int cpu_seq_show(struct seq_file *seq, void *v)
-{
- struct cpu_private *priv = seq->private;
-
- if (priv == NULL)
- return -EINVAL;
-
- switch (cpu_base[priv->type].flag) {
- case CPU_TSS:
- smp_call_function_single(priv->cpu, print_tss, seq, 1);
- break;
- case CPU_CR:
- smp_call_function_single(priv->cpu, print_cr, seq, 1);
- break;
- case CPU_DT:
- smp_call_function_single(priv->cpu, print_dt, seq, 1);
- break;
- case CPU_DEBUG:
- if (priv->file == CPU_INDEX_BIT)
- smp_call_function_single(priv->cpu, print_dr, seq, 1);
- print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
- break;
- case CPU_APIC:
- if (priv->file == CPU_INDEX_BIT)
- smp_call_function_single(priv->cpu, print_apic, seq, 1);
- print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
- break;
-
- default:
- print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
- break;
- }
- seq_printf(seq, "\n");
-
- return 0;
-}
-
-static void *cpu_seq_start(struct seq_file *seq, loff_t *pos)
-{
- if (*pos == 0) /* One time is enough ;-) */
- return seq;
-
- return NULL;
-}
-
-static void *cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
- (*pos)++;
-
- return cpu_seq_start(seq, pos);
-}
-
-static void cpu_seq_stop(struct seq_file *seq, void *v)
-{
-}
-
-static const struct seq_operations cpu_seq_ops = {
- .start = cpu_seq_start,
- .next = cpu_seq_next,
- .stop = cpu_seq_stop,
- .show = cpu_seq_show,
-};
-
-static int cpu_seq_open(struct inode *inode, struct file *file)
-{
- struct cpu_private *priv = inode->i_private;
- struct seq_file *seq;
- int err;
-
- err = seq_open(file, &cpu_seq_ops);
- if (!err) {
- seq = file->private_data;
- seq->private = priv;
- }
-
- return err;
-}
-
-static int write_msr(struct cpu_private *priv, u64 val)
-{
- u32 low, high;
-
- high = (val >> 32) & 0xffffffff;
- low = val & 0xffffffff;
-
- if (!wrmsr_safe_on_cpu(priv->cpu, priv->reg, low, high))
- return 0;
-
- return -EPERM;
-}
-
-static int write_cpu_register(struct cpu_private *priv, const char *buf)
-{
- int ret = -EPERM;
- u64 val;
-
- ret = strict_strtoull(buf, 0, &val);
- if (ret < 0)
- return ret;
-
- /* Supporting only MSRs */
- if (priv->type < CPU_TSS_BIT)
- return write_msr(priv, val);
-
- return ret;
-}
-
-static ssize_t cpu_write(struct file *file, const char __user *ubuf,
- size_t count, loff_t *off)
-{
- struct seq_file *seq = file->private_data;
- struct cpu_private *priv = seq->private;
- char buf[19];
-
- if ((priv == NULL) || (count >= sizeof(buf)))
- return -EINVAL;
-
- if (copy_from_user(&buf, ubuf, count))
- return -EFAULT;
-
- buf[count] = 0;
-
- if ((cpu_base[priv->type].write) && (cpu_file[priv->file].write))
- if (!write_cpu_register(priv, buf))
- return count;
-
- return -EACCES;
-}
-
-static const struct file_operations cpu_fops = {
- .owner = THIS_MODULE,
- .open = cpu_seq_open,
- .read = seq_read,
- .write = cpu_write,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
-static int cpu_create_file(unsigned cpu, unsigned type, unsigned reg,
- unsigned file, struct dentry *dentry)
-{
- struct cpu_private *priv = NULL;
-
- /* Already intialized */
- if (file == CPU_INDEX_BIT)
- if (per_cpu(cpu_arr[type].init, cpu))
- return 0;
-
- priv = kzalloc(sizeof(*priv), GFP_KERNEL);
- if (priv == NULL)
- return -ENOMEM;
-
- priv->cpu = cpu;
- priv->type = type;
- priv->reg = reg;
- priv->file = file;
- mutex_lock(&cpu_debug_lock);
- per_cpu(priv_arr[type], cpu) = priv;
- per_cpu(cpu_priv_count, cpu)++;
- mutex_unlock(&cpu_debug_lock);
-
- if (file)
- debugfs_create_file(cpu_file[file].name, S_IRUGO,
- dentry, (void *)priv, &cpu_fops);
- else {
- debugfs_create_file(cpu_base[type].name, S_IRUGO,
- per_cpu(cpu_arr[type].dentry, cpu),
- (void *)priv, &cpu_fops);
- mutex_lock(&cpu_debug_lock);
- per_cpu(cpu_arr[type].init, cpu) = 1;
- mutex_unlock(&cpu_debug_lock);
- }
-
- return 0;
-}
-
-static int cpu_init_regfiles(unsigned cpu, unsigned int type, unsigned reg,
- struct dentry *dentry)
-{
- unsigned file;
- int err = 0;
-
- for (file = 0; file < ARRAY_SIZE(cpu_file); file++) {
- err = cpu_create_file(cpu, type, reg, file, dentry);
- if (err)
- return err;
- }
-
- return err;
-}
-
-static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry)
-{
- struct dentry *cpu_dentry = NULL;
- unsigned reg, reg_min, reg_max;
- int i, err = 0;
- char reg_dir[12];
- u32 low, high;
-
- for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
- if (!get_cpu_range(cpu, &reg_min, &reg_max, i,
- cpu_base[type].flag))
- continue;
-
- for (reg = reg_min; reg <= reg_max; reg++) {
- if (rdmsr_safe_on_cpu(cpu, reg, &low, &high))
- continue;
-
- sprintf(reg_dir, "0x%x", reg);
- cpu_dentry = debugfs_create_dir(reg_dir, dentry);
- err = cpu_init_regfiles(cpu, type, reg, cpu_dentry);
- if (err)
- return err;
- }
- }
-
- return err;
-}
-
-static int cpu_init_allreg(unsigned cpu, struct dentry *dentry)
-{
- struct dentry *cpu_dentry = NULL;
- unsigned type;
- int err = 0;
-
- for (type = 0; type < ARRAY_SIZE(cpu_base) - 1; type++) {
- if (!is_typeflag_valid(cpu, cpu_base[type].flag))
- continue;
- cpu_dentry = debugfs_create_dir(cpu_base[type].name, dentry);
- per_cpu(cpu_arr[type].dentry, cpu) = cpu_dentry;
-
- if (type < CPU_TSS_BIT)
- err = cpu_init_msr(cpu, type, cpu_dentry);
- else
- err = cpu_create_file(cpu, type, 0, CPU_INDEX_BIT,
- cpu_dentry);
- if (err)
- return err;
- }
-
- return err;
-}
-
-static int cpu_init_cpu(void)
-{
- struct dentry *cpu_dentry = NULL;
- struct cpuinfo_x86 *cpui;
- char cpu_dir[12];
- unsigned cpu;
- int err = 0;
-
- for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
- cpui = &cpu_data(cpu);
- if (!cpu_has(cpui, X86_FEATURE_MSR))
- continue;
-
- sprintf(cpu_dir, "cpu%d", cpu);
- cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir);
- err = cpu_init_allreg(cpu, cpu_dentry);
-
- pr_info("cpu%d(%d) debug files %d\n",
- cpu, nr_cpu_ids, per_cpu(cpu_priv_count, cpu));
- if (per_cpu(cpu_priv_count, cpu) > MAX_CPU_FILES) {
- pr_err("Register files count %d exceeds limit %d\n",
- per_cpu(cpu_priv_count, cpu), MAX_CPU_FILES);
- per_cpu(cpu_priv_count, cpu) = MAX_CPU_FILES;
- err = -ENFILE;
- }
- if (err)
- return err;
- }
-
- return err;
-}
-
-static int __init cpu_debug_init(void)
-{
- cpu_debugfs_dir = debugfs_create_dir("cpu", arch_debugfs_dir);
-
- return cpu_init_cpu();
-}
-
-static void __exit cpu_debug_exit(void)
-{
- int i, cpu;
-
- if (cpu_debugfs_dir)
- debugfs_remove_recursive(cpu_debugfs_dir);
-
- for (cpu = 0; cpu < nr_cpu_ids; cpu++)
- for (i = 0; i < per_cpu(cpu_priv_count, cpu); i++)
- kfree(per_cpu(priv_arr[i], cpu));
-}
-
-module_init(cpu_debug_init);
-module_exit(cpu_debug_exit);
-
-MODULE_AUTHOR("Jaswinder Singh Rajput");
-MODULE_DESCRIPTION("CPU Debug module");
-MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
deleted file mode 100644
index f138c6c389b9..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ /dev/null
@@ -1,252 +0,0 @@
-#
-# CPU Frequency scaling
-#
-
-menu "CPU Frequency scaling"
-
-source "drivers/cpufreq/Kconfig"
-
-if CPU_FREQ
-
-comment "CPUFreq processor drivers"
-
-config X86_ACPI_CPUFREQ
- tristate "ACPI Processor P-States driver"
- select CPU_FREQ_TABLE
- depends on ACPI_PROCESSOR
- help
- This driver adds a CPUFreq driver which utilizes the ACPI
- Processor Performance States.
- This driver also supports Intel Enhanced Speedstep.
-
- To compile this driver as a module, choose M here: the
- module will be called acpi-cpufreq.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config ELAN_CPUFREQ
- tristate "AMD Elan SC400 and SC410"
- select CPU_FREQ_TABLE
- depends on X86_ELAN
- ---help---
- This adds the CPUFreq driver for AMD Elan SC400 and SC410
- processors.
-
- You need to specify the processor maximum speed as boot
- parameter: elanfreq=maxspeed (in kHz) or as module
- parameter "max_freq".
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config SC520_CPUFREQ
- tristate "AMD Elan SC520"
- select CPU_FREQ_TABLE
- depends on X86_ELAN
- ---help---
- This adds the CPUFreq driver for AMD Elan SC520 processor.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-
-config X86_POWERNOW_K6
- tristate "AMD Mobile K6-2/K6-3 PowerNow!"
- select CPU_FREQ_TABLE
- depends on X86_32
- help
- This adds the CPUFreq driver for mobile AMD K6-2+ and mobile
- AMD K6-3+ processors.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config X86_POWERNOW_K7
- tristate "AMD Mobile Athlon/Duron PowerNow!"
- select CPU_FREQ_TABLE
- depends on X86_32
- help
- This adds the CPUFreq driver for mobile AMD K7 mobile processors.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config X86_POWERNOW_K7_ACPI
- bool
- depends on X86_POWERNOW_K7 && ACPI_PROCESSOR
- depends on !(X86_POWERNOW_K7 = y && ACPI_PROCESSOR = m)
- depends on X86_32
- default y
-
-config X86_POWERNOW_K8
- tristate "AMD Opteron/Athlon64 PowerNow!"
- select CPU_FREQ_TABLE
- depends on ACPI && ACPI_PROCESSOR
- help
- This adds the CPUFreq driver for K8/K10 Opteron/Athlon64 processors.
-
- To compile this driver as a module, choose M here: the
- module will be called powernow-k8.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
-config X86_GX_SUSPMOD
- tristate "Cyrix MediaGX/NatSemi Geode Suspend Modulation"
- depends on X86_32 && PCI
- help
- This add the CPUFreq driver for NatSemi Geode processors which
- support suspend modulation.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config X86_SPEEDSTEP_CENTRINO
- tristate "Intel Enhanced SpeedStep (deprecated)"
- select CPU_FREQ_TABLE
- select X86_SPEEDSTEP_CENTRINO_TABLE if X86_32
- depends on X86_32 || (X86_64 && ACPI_PROCESSOR)
- help
- This is deprecated and this functionality is now merged into
- acpi_cpufreq (X86_ACPI_CPUFREQ). Use that driver instead of
- speedstep_centrino.
- This adds the CPUFreq driver for Enhanced SpeedStep enabled
- mobile CPUs. This means Intel Pentium M (Centrino) CPUs
- or 64bit enabled Intel Xeons.
-
- To compile this driver as a module, choose M here: the
- module will be called speedstep-centrino.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config X86_SPEEDSTEP_CENTRINO_TABLE
- bool "Built-in tables for Banias CPUs"
- depends on X86_32 && X86_SPEEDSTEP_CENTRINO
- default y
- help
- Use built-in tables for Banias CPUs if ACPI encoding
- is not available.
-
- If in doubt, say N.
-
-config X86_SPEEDSTEP_ICH
- tristate "Intel Speedstep on ICH-M chipsets (ioport interface)"
- select CPU_FREQ_TABLE
- depends on X86_32
- help
- This adds the CPUFreq driver for certain mobile Intel Pentium III
- (Coppermine), all mobile Intel Pentium III-M (Tualatin) and all
- mobile Intel Pentium 4 P4-M on systems which have an Intel ICH2,
- ICH3 or ICH4 southbridge.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config X86_SPEEDSTEP_SMI
- tristate "Intel SpeedStep on 440BX/ZX/MX chipsets (SMI interface)"
- select CPU_FREQ_TABLE
- depends on X86_32 && EXPERIMENTAL
- help
- This adds the CPUFreq driver for certain mobile Intel Pentium III
- (Coppermine), all mobile Intel Pentium III-M (Tualatin)
- on systems which have an Intel 440BX/ZX/MX southbridge.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config X86_P4_CLOCKMOD
- tristate "Intel Pentium 4 clock modulation"
- select CPU_FREQ_TABLE
- help
- This adds the CPUFreq driver for Intel Pentium 4 / XEON
- processors. When enabled it will lower CPU temperature by skipping
- clocks.
-
- This driver should be only used in exceptional
- circumstances when very low power is needed because it causes severe
- slowdowns and noticeable latencies. Normally Speedstep should be used
- instead.
-
- To compile this driver as a module, choose M here: the
- module will be called p4-clockmod.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- Unless you are absolutely sure say N.
-
-config X86_CPUFREQ_NFORCE2
- tristate "nVidia nForce2 FSB changing"
- depends on X86_32 && EXPERIMENTAL
- help
- This adds the CPUFreq driver for FSB changing on nVidia nForce2
- platforms.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config X86_LONGRUN
- tristate "Transmeta LongRun"
- depends on X86_32
- help
- This adds the CPUFreq driver for Transmeta Crusoe and Efficeon processors
- which support LongRun.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config X86_LONGHAUL
- tristate "VIA Cyrix III Longhaul"
- select CPU_FREQ_TABLE
- depends on X86_32 && ACPI_PROCESSOR
- help
- This adds the CPUFreq driver for VIA Samuel/CyrixIII,
- VIA Cyrix Samuel/C3, VIA Cyrix Ezra and VIA Cyrix Ezra-T
- processors.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config X86_E_POWERSAVER
- tristate "VIA C7 Enhanced PowerSaver (DANGEROUS)"
- select CPU_FREQ_TABLE
- depends on X86_32 && EXPERIMENTAL
- help
- This adds the CPUFreq driver for VIA C7 processors. However, this driver
- does not have any safeguards to prevent operating the CPU out of spec
- and is thus considered dangerous. Please use the regular ACPI cpufreq
- driver, enabled by CONFIG_X86_ACPI_CPUFREQ.
-
- If in doubt, say N.
-
-comment "shared options"
-
-config X86_SPEEDSTEP_LIB
- tristate
- default (X86_SPEEDSTEP_ICH || X86_SPEEDSTEP_SMI || X86_P4_CLOCKMOD)
-
-config X86_SPEEDSTEP_RELAXED_CAP_CHECK
- bool "Relaxed speedstep capability checks"
- depends on X86_32 && (X86_SPEEDSTEP_SMI || X86_SPEEDSTEP_ICH)
- help
- Don't perform all checks for a speedstep capable system which would
- normally be done. Some ancient or strange systems, though speedstep
- capable, don't always indicate that they are speedstep capable. This
- option lets the probing code bypass some of those checks if the
- parameter "relaxed_check=1" is passed to the module.
-
-endif # CPU_FREQ
-
-endmenu
diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile
deleted file mode 100644
index 509296df294d..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/Makefile
+++ /dev/null
@@ -1,20 +0,0 @@
-# Link order matters. K8 is preferred to ACPI because of firmware bugs in early
-# K8 systems. ACPI is preferred to all other hardware-specific drivers.
-# speedstep-* is preferred over p4-clockmod.
-
-obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o
-obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o
-obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o
-obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o
-obj-$(CONFIG_X86_LONGHAUL) += longhaul.o
-obj-$(CONFIG_X86_E_POWERSAVER) += e_powersaver.o
-obj-$(CONFIG_ELAN_CPUFREQ) += elanfreq.o
-obj-$(CONFIG_SC520_CPUFREQ) += sc520_freq.o
-obj-$(CONFIG_X86_LONGRUN) += longrun.o
-obj-$(CONFIG_X86_GX_SUSPMOD) += gx-suspmod.o
-obj-$(CONFIG_X86_SPEEDSTEP_ICH) += speedstep-ich.o
-obj-$(CONFIG_X86_SPEEDSTEP_LIB) += speedstep-lib.o
-obj-$(CONFIG_X86_SPEEDSTEP_SMI) += speedstep-smi.o
-obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO) += speedstep-centrino.o
-obj-$(CONFIG_X86_P4_CLOCKMOD) += p4-clockmod.o
-obj-$(CONFIG_X86_CPUFREQ_NFORCE2) += cpufreq-nforce2.o
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
deleted file mode 100644
index ae9b503220ca..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ /dev/null
@@ -1,851 +0,0 @@
-/*
- * acpi-cpufreq.c - ACPI Processor P-States Driver
- *
- * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com>
- * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
- * Copyright (C) 2002 - 2004 Dominik Brodowski <linux@brodo.de>
- * Copyright (C) 2006 Denis Sadykov <denis.m.sadykov@intel.com>
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/sched.h>
-#include <linux/cpufreq.h>
-#include <linux/compiler.h>
-#include <linux/dmi.h>
-#include <trace/power.h>
-
-#include <linux/acpi.h>
-#include <linux/io.h>
-#include <linux/delay.h>
-#include <linux/uaccess.h>
-
-#include <acpi/processor.h>
-
-#include <asm/msr.h>
-#include <asm/processor.h>
-#include <asm/cpufeature.h>
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "acpi-cpufreq", msg)
-
-MODULE_AUTHOR("Paul Diefenbaugh, Dominik Brodowski");
-MODULE_DESCRIPTION("ACPI Processor P-States Driver");
-MODULE_LICENSE("GPL");
-
-enum {
- UNDEFINED_CAPABLE = 0,
- SYSTEM_INTEL_MSR_CAPABLE,
- SYSTEM_IO_CAPABLE,
-};
-
-#define INTEL_MSR_RANGE (0xffff)
-#define CPUID_6_ECX_APERFMPERF_CAPABILITY (0x1)
-
-struct acpi_cpufreq_data {
- struct acpi_processor_performance *acpi_data;
- struct cpufreq_frequency_table *freq_table;
- unsigned int resume;
- unsigned int cpu_feature;
-};
-
-static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data);
-
-struct acpi_msr_data {
- u64 saved_aperf, saved_mperf;
-};
-
-static DEFINE_PER_CPU(struct acpi_msr_data, msr_data);
-
-DEFINE_TRACE(power_mark);
-
-/* acpi_perf_data is a pointer to percpu data. */
-static struct acpi_processor_performance *acpi_perf_data;
-
-static struct cpufreq_driver acpi_cpufreq_driver;
-
-static unsigned int acpi_pstate_strict;
-
-static int check_est_cpu(unsigned int cpuid)
-{
- struct cpuinfo_x86 *cpu = &cpu_data(cpuid);
-
- return cpu_has(cpu, X86_FEATURE_EST);
-}
-
-static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data)
-{
- struct acpi_processor_performance *perf;
- int i;
-
- perf = data->acpi_data;
-
- for (i = 0; i < perf->state_count; i++) {
- if (value == perf->states[i].status)
- return data->freq_table[i].frequency;
- }
- return 0;
-}
-
-static unsigned extract_msr(u32 msr, struct acpi_cpufreq_data *data)
-{
- int i;
- struct acpi_processor_performance *perf;
-
- msr &= INTEL_MSR_RANGE;
- perf = data->acpi_data;
-
- for (i = 0; data->freq_table[i].frequency != CPUFREQ_TABLE_END; i++) {
- if (msr == perf->states[data->freq_table[i].index].status)
- return data->freq_table[i].frequency;
- }
- return data->freq_table[0].frequency;
-}
-
-static unsigned extract_freq(u32 val, struct acpi_cpufreq_data *data)
-{
- switch (data->cpu_feature) {
- case SYSTEM_INTEL_MSR_CAPABLE:
- return extract_msr(val, data);
- case SYSTEM_IO_CAPABLE:
- return extract_io(val, data);
- default:
- return 0;
- }
-}
-
-struct msr_addr {
- u32 reg;
-};
-
-struct io_addr {
- u16 port;
- u8 bit_width;
-};
-
-struct drv_cmd {
- unsigned int type;
- const struct cpumask *mask;
- union {
- struct msr_addr msr;
- struct io_addr io;
- } addr;
- u32 val;
-};
-
-/* Called via smp_call_function_single(), on the target CPU */
-static void do_drv_read(void *_cmd)
-{
- struct drv_cmd *cmd = _cmd;
- u32 h;
-
- switch (cmd->type) {
- case SYSTEM_INTEL_MSR_CAPABLE:
- rdmsr(cmd->addr.msr.reg, cmd->val, h);
- break;
- case SYSTEM_IO_CAPABLE:
- acpi_os_read_port((acpi_io_address)cmd->addr.io.port,
- &cmd->val,
- (u32)cmd->addr.io.bit_width);
- break;
- default:
- break;
- }
-}
-
-/* Called via smp_call_function_many(), on the target CPUs */
-static void do_drv_write(void *_cmd)
-{
- struct drv_cmd *cmd = _cmd;
- u32 lo, hi;
-
- switch (cmd->type) {
- case SYSTEM_INTEL_MSR_CAPABLE:
- rdmsr(cmd->addr.msr.reg, lo, hi);
- lo = (lo & ~INTEL_MSR_RANGE) | (cmd->val & INTEL_MSR_RANGE);
- wrmsr(cmd->addr.msr.reg, lo, hi);
- break;
- case SYSTEM_IO_CAPABLE:
- acpi_os_write_port((acpi_io_address)cmd->addr.io.port,
- cmd->val,
- (u32)cmd->addr.io.bit_width);
- break;
- default:
- break;
- }
-}
-
-static void drv_read(struct drv_cmd *cmd)
-{
- cmd->val = 0;
-
- smp_call_function_single(cpumask_any(cmd->mask), do_drv_read, cmd, 1);
-}
-
-static void drv_write(struct drv_cmd *cmd)
-{
- int this_cpu;
-
- this_cpu = get_cpu();
- if (cpumask_test_cpu(this_cpu, cmd->mask))
- do_drv_write(cmd);
- smp_call_function_many(cmd->mask, do_drv_write, cmd, 1);
- put_cpu();
-}
-
-static u32 get_cur_val(const struct cpumask *mask)
-{
- struct acpi_processor_performance *perf;
- struct drv_cmd cmd;
-
- if (unlikely(cpumask_empty(mask)))
- return 0;
-
- switch (per_cpu(drv_data, cpumask_first(mask))->cpu_feature) {
- case SYSTEM_INTEL_MSR_CAPABLE:
- cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
- cmd.addr.msr.reg = MSR_IA32_PERF_STATUS;
- break;
- case SYSTEM_IO_CAPABLE:
- cmd.type = SYSTEM_IO_CAPABLE;
- perf = per_cpu(drv_data, cpumask_first(mask))->acpi_data;
- cmd.addr.io.port = perf->control_register.address;
- cmd.addr.io.bit_width = perf->control_register.bit_width;
- break;
- default:
- return 0;
- }
-
- cmd.mask = mask;
- drv_read(&cmd);
-
- dprintk("get_cur_val = %u\n", cmd.val);
-
- return cmd.val;
-}
-
-struct perf_pair {
- union {
- struct {
- u32 lo;
- u32 hi;
- } split;
- u64 whole;
- } aperf, mperf;
-};
-
-/* Called via smp_call_function_single(), on the target CPU */
-static void read_measured_perf_ctrs(void *_cur)
-{
- struct perf_pair *cur = _cur;
-
- rdmsr(MSR_IA32_APERF, cur->aperf.split.lo, cur->aperf.split.hi);
- rdmsr(MSR_IA32_MPERF, cur->mperf.split.lo, cur->mperf.split.hi);
-}
-
-/*
- * Return the measured active (C0) frequency on this CPU since last call
- * to this function.
- * Input: cpu number
- * Return: Average CPU frequency in terms of max frequency (zero on error)
- *
- * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
- * over a period of time, while CPU is in C0 state.
- * IA32_MPERF counts at the rate of max advertised frequency
- * IA32_APERF counts at the rate of actual CPU frequency
- * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
- * no meaning should be associated with absolute values of these MSRs.
- */
-static unsigned int get_measured_perf(struct cpufreq_policy *policy,
- unsigned int cpu)
-{
- struct perf_pair readin, cur;
- unsigned int perf_percent;
- unsigned int retval;
-
- if (smp_call_function_single(cpu, read_measured_perf_ctrs, &readin, 1))
- return 0;
-
- cur.aperf.whole = readin.aperf.whole -
- per_cpu(msr_data, cpu).saved_aperf;
- cur.mperf.whole = readin.mperf.whole -
- per_cpu(msr_data, cpu).saved_mperf;
- per_cpu(msr_data, cpu).saved_aperf = readin.aperf.whole;
- per_cpu(msr_data, cpu).saved_mperf = readin.mperf.whole;
-
-#ifdef __i386__
- /*
- * We dont want to do 64 bit divide with 32 bit kernel
- * Get an approximate value. Return failure in case we cannot get
- * an approximate value.
- */
- if (unlikely(cur.aperf.split.hi || cur.mperf.split.hi)) {
- int shift_count;
- u32 h;
-
- h = max_t(u32, cur.aperf.split.hi, cur.mperf.split.hi);
- shift_count = fls(h);
-
- cur.aperf.whole >>= shift_count;
- cur.mperf.whole >>= shift_count;
- }
-
- if (((unsigned long)(-1) / 100) < cur.aperf.split.lo) {
- int shift_count = 7;
- cur.aperf.split.lo >>= shift_count;
- cur.mperf.split.lo >>= shift_count;
- }
-
- if (cur.aperf.split.lo && cur.mperf.split.lo)
- perf_percent = (cur.aperf.split.lo * 100) / cur.mperf.split.lo;
- else
- perf_percent = 0;
-
-#else
- if (unlikely(((unsigned long)(-1) / 100) < cur.aperf.whole)) {
- int shift_count = 7;
- cur.aperf.whole >>= shift_count;
- cur.mperf.whole >>= shift_count;
- }
-
- if (cur.aperf.whole && cur.mperf.whole)
- perf_percent = (cur.aperf.whole * 100) / cur.mperf.whole;
- else
- perf_percent = 0;
-
-#endif
-
- retval = (policy->cpuinfo.max_freq * perf_percent) / 100;
-
- return retval;
-}
-
-static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
-{
- struct acpi_cpufreq_data *data = per_cpu(drv_data, cpu);
- unsigned int freq;
- unsigned int cached_freq;
-
- dprintk("get_cur_freq_on_cpu (%d)\n", cpu);
-
- if (unlikely(data == NULL ||
- data->acpi_data == NULL || data->freq_table == NULL)) {
- return 0;
- }
-
- cached_freq = data->freq_table[data->acpi_data->state].frequency;
- freq = extract_freq(get_cur_val(cpumask_of(cpu)), data);
- if (freq != cached_freq) {
- /*
- * The dreaded BIOS frequency change behind our back.
- * Force set the frequency on next target call.
- */
- data->resume = 1;
- }
-
- dprintk("cur freq = %u\n", freq);
-
- return freq;
-}
-
-static unsigned int check_freqs(const struct cpumask *mask, unsigned int freq,
- struct acpi_cpufreq_data *data)
-{
- unsigned int cur_freq;
- unsigned int i;
-
- for (i = 0; i < 100; i++) {
- cur_freq = extract_freq(get_cur_val(mask), data);
- if (cur_freq == freq)
- return 1;
- udelay(10);
- }
- return 0;
-}
-
-static int acpi_cpufreq_target(struct cpufreq_policy *policy,
- unsigned int target_freq, unsigned int relation)
-{
- struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu);
- struct acpi_processor_performance *perf;
- struct cpufreq_freqs freqs;
- struct drv_cmd cmd;
- unsigned int next_state = 0; /* Index into freq_table */
- unsigned int next_perf_state = 0; /* Index into perf table */
- unsigned int i;
- int result = 0;
- struct power_trace it;
-
- dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu);
-
- if (unlikely(data == NULL ||
- data->acpi_data == NULL || data->freq_table == NULL)) {
- return -ENODEV;
- }
-
- perf = data->acpi_data;
- result = cpufreq_frequency_table_target(policy,
- data->freq_table,
- target_freq,
- relation, &next_state);
- if (unlikely(result)) {
- result = -ENODEV;
- goto out;
- }
-
- next_perf_state = data->freq_table[next_state].index;
- if (perf->state == next_perf_state) {
- if (unlikely(data->resume)) {
- dprintk("Called after resume, resetting to P%d\n",
- next_perf_state);
- data->resume = 0;
- } else {
- dprintk("Already at target state (P%d)\n",
- next_perf_state);
- goto out;
- }
- }
-
- trace_power_mark(&it, POWER_PSTATE, next_perf_state);
-
- switch (data->cpu_feature) {
- case SYSTEM_INTEL_MSR_CAPABLE:
- cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
- cmd.addr.msr.reg = MSR_IA32_PERF_CTL;
- cmd.val = (u32) perf->states[next_perf_state].control;
- break;
- case SYSTEM_IO_CAPABLE:
- cmd.type = SYSTEM_IO_CAPABLE;
- cmd.addr.io.port = perf->control_register.address;
- cmd.addr.io.bit_width = perf->control_register.bit_width;
- cmd.val = (u32) perf->states[next_perf_state].control;
- break;
- default:
- result = -ENODEV;
- goto out;
- }
-
- /* cpufreq holds the hotplug lock, so we are safe from here on */
- if (policy->shared_type != CPUFREQ_SHARED_TYPE_ANY)
- cmd.mask = policy->cpus;
- else
- cmd.mask = cpumask_of(policy->cpu);
-
- freqs.old = perf->states[perf->state].core_frequency * 1000;
- freqs.new = data->freq_table[next_state].frequency;
- for_each_cpu(i, cmd.mask) {
- freqs.cpu = i;
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
- }
-
- drv_write(&cmd);
-
- if (acpi_pstate_strict) {
- if (!check_freqs(cmd.mask, freqs.new, data)) {
- dprintk("acpi_cpufreq_target failed (%d)\n",
- policy->cpu);
- result = -EAGAIN;
- goto out;
- }
- }
-
- for_each_cpu(i, cmd.mask) {
- freqs.cpu = i;
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
- }
- perf->state = next_perf_state;
-
-out:
- return result;
-}
-
-static int acpi_cpufreq_verify(struct cpufreq_policy *policy)
-{
- struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu);
-
- dprintk("acpi_cpufreq_verify\n");
-
- return cpufreq_frequency_table_verify(policy, data->freq_table);
-}
-
-static unsigned long
-acpi_cpufreq_guess_freq(struct acpi_cpufreq_data *data, unsigned int cpu)
-{
- struct acpi_processor_performance *perf = data->acpi_data;
-
- if (cpu_khz) {
- /* search the closest match to cpu_khz */
- unsigned int i;
- unsigned long freq;
- unsigned long freqn = perf->states[0].core_frequency * 1000;
-
- for (i = 0; i < (perf->state_count-1); i++) {
- freq = freqn;
- freqn = perf->states[i+1].core_frequency * 1000;
- if ((2 * cpu_khz) > (freqn + freq)) {
- perf->state = i;
- return freq;
- }
- }
- perf->state = perf->state_count-1;
- return freqn;
- } else {
- /* assume CPU is at P0... */
- perf->state = 0;
- return perf->states[0].core_frequency * 1000;
- }
-}
-
-static void free_acpi_perf_data(void)
-{
- unsigned int i;
-
- /* Freeing a NULL pointer is OK, and alloc_percpu zeroes. */
- for_each_possible_cpu(i)
- free_cpumask_var(per_cpu_ptr(acpi_perf_data, i)
- ->shared_cpu_map);
- free_percpu(acpi_perf_data);
-}
-
-/*
- * acpi_cpufreq_early_init - initialize ACPI P-States library
- *
- * Initialize the ACPI P-States library (drivers/acpi/processor_perflib.c)
- * in order to determine correct frequency and voltage pairings. We can
- * do _PDC and _PSD and find out the processor dependency for the
- * actual init that will happen later...
- */
-static int __init acpi_cpufreq_early_init(void)
-{
- unsigned int i;
- dprintk("acpi_cpufreq_early_init\n");
-
- acpi_perf_data = alloc_percpu(struct acpi_processor_performance);
- if (!acpi_perf_data) {
- dprintk("Memory allocation error for acpi_perf_data.\n");
- return -ENOMEM;
- }
- for_each_possible_cpu(i) {
- if (!zalloc_cpumask_var_node(
- &per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map,
- GFP_KERNEL, cpu_to_node(i))) {
-
- /* Freeing a NULL pointer is OK: alloc_percpu zeroes. */
- free_acpi_perf_data();
- return -ENOMEM;
- }
- }
-
- /* Do initialization in ACPI core */
- acpi_processor_preregister_performance(acpi_perf_data);
- return 0;
-}
-
-#ifdef CONFIG_SMP
-/*
- * Some BIOSes do SW_ANY coordination internally, either set it up in hw
- * or do it in BIOS firmware and won't inform about it to OS. If not
- * detected, this has a side effect of making CPU run at a different speed
- * than OS intended it to run at. Detect it and handle it cleanly.
- */
-static int bios_with_sw_any_bug;
-
-static int sw_any_bug_found(const struct dmi_system_id *d)
-{
- bios_with_sw_any_bug = 1;
- return 0;
-}
-
-static const struct dmi_system_id sw_any_bug_dmi_table[] = {
- {
- .callback = sw_any_bug_found,
- .ident = "Supermicro Server X6DLP",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Supermicro"),
- DMI_MATCH(DMI_BIOS_VERSION, "080010"),
- DMI_MATCH(DMI_PRODUCT_NAME, "X6DLP"),
- },
- },
- { }
-};
-#endif
-
-static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
-{
- unsigned int i;
- unsigned int valid_states = 0;
- unsigned int cpu = policy->cpu;
- struct acpi_cpufreq_data *data;
- unsigned int result = 0;
- struct cpuinfo_x86 *c = &cpu_data(policy->cpu);
- struct acpi_processor_performance *perf;
-
- dprintk("acpi_cpufreq_cpu_init\n");
-
- data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL);
- if (!data)
- return -ENOMEM;
-
- data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu);
- per_cpu(drv_data, cpu) = data;
-
- if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
- acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS;
-
- result = acpi_processor_register_performance(data->acpi_data, cpu);
- if (result)
- goto err_free;
-
- perf = data->acpi_data;
- policy->shared_type = perf->shared_type;
-
- /*
- * Will let policy->cpus know about dependency only when software
- * coordination is required.
- */
- if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL ||
- policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) {
- cpumask_copy(policy->cpus, perf->shared_cpu_map);
- }
- cpumask_copy(policy->related_cpus, perf->shared_cpu_map);
-
-#ifdef CONFIG_SMP
- dmi_check_system(sw_any_bug_dmi_table);
- if (bios_with_sw_any_bug && cpumask_weight(policy->cpus) == 1) {
- policy->shared_type = CPUFREQ_SHARED_TYPE_ALL;
- cpumask_copy(policy->cpus, cpu_core_mask(cpu));
- }
-#endif
-
- /* capability check */
- if (perf->state_count <= 1) {
- dprintk("No P-States\n");
- result = -ENODEV;
- goto err_unreg;
- }
-
- if (perf->control_register.space_id != perf->status_register.space_id) {
- result = -ENODEV;
- goto err_unreg;
- }
-
- switch (perf->control_register.space_id) {
- case ACPI_ADR_SPACE_SYSTEM_IO:
- dprintk("SYSTEM IO addr space\n");
- data->cpu_feature = SYSTEM_IO_CAPABLE;
- break;
- case ACPI_ADR_SPACE_FIXED_HARDWARE:
- dprintk("HARDWARE addr space\n");
- if (!check_est_cpu(cpu)) {
- result = -ENODEV;
- goto err_unreg;
- }
- data->cpu_feature = SYSTEM_INTEL_MSR_CAPABLE;
- break;
- default:
- dprintk("Unknown addr space %d\n",
- (u32) (perf->control_register.space_id));
- result = -ENODEV;
- goto err_unreg;
- }
-
- data->freq_table = kmalloc(sizeof(struct cpufreq_frequency_table) *
- (perf->state_count+1), GFP_KERNEL);
- if (!data->freq_table) {
- result = -ENOMEM;
- goto err_unreg;
- }
-
- /* detect transition latency */
- policy->cpuinfo.transition_latency = 0;
- for (i = 0; i < perf->state_count; i++) {
- if ((perf->states[i].transition_latency * 1000) >
- policy->cpuinfo.transition_latency)
- policy->cpuinfo.transition_latency =
- perf->states[i].transition_latency * 1000;
- }
-
- /* Check for high latency (>20uS) from buggy BIOSes, like on T42 */
- if (perf->control_register.space_id == ACPI_ADR_SPACE_FIXED_HARDWARE &&
- policy->cpuinfo.transition_latency > 20 * 1000) {
- policy->cpuinfo.transition_latency = 20 * 1000;
- printk_once(KERN_INFO
- "P-state transition latency capped at 20 uS\n");
- }
-
- /* table init */
- for (i = 0; i < perf->state_count; i++) {
- if (i > 0 && perf->states[i].core_frequency >=
- data->freq_table[valid_states-1].frequency / 1000)
- continue;
-
- data->freq_table[valid_states].index = i;
- data->freq_table[valid_states].frequency =
- perf->states[i].core_frequency * 1000;
- valid_states++;
- }
- data->freq_table[valid_states].frequency = CPUFREQ_TABLE_END;
- perf->state = 0;
-
- result = cpufreq_frequency_table_cpuinfo(policy, data->freq_table);
- if (result)
- goto err_freqfree;
-
- if (perf->states[0].core_frequency * 1000 != policy->cpuinfo.max_freq)
- printk(KERN_WARNING FW_WARN "P-state 0 is not max freq\n");
-
- switch (perf->control_register.space_id) {
- case ACPI_ADR_SPACE_SYSTEM_IO:
- /* Current speed is unknown and not detectable by IO port */
- policy->cur = acpi_cpufreq_guess_freq(data, policy->cpu);
- break;
- case ACPI_ADR_SPACE_FIXED_HARDWARE:
- acpi_cpufreq_driver.get = get_cur_freq_on_cpu;
- policy->cur = get_cur_freq_on_cpu(cpu);
- break;
- default:
- break;
- }
-
- /* notify BIOS that we exist */
- acpi_processor_notify_smm(THIS_MODULE);
-
- /* Check for APERF/MPERF support in hardware */
- if (c->x86_vendor == X86_VENDOR_INTEL && c->cpuid_level >= 6) {
- unsigned int ecx;
- ecx = cpuid_ecx(6);
- if (ecx & CPUID_6_ECX_APERFMPERF_CAPABILITY)
- acpi_cpufreq_driver.getavg = get_measured_perf;
- }
-
- dprintk("CPU%u - ACPI performance management activated.\n", cpu);
- for (i = 0; i < perf->state_count; i++)
- dprintk(" %cP%d: %d MHz, %d mW, %d uS\n",
- (i == perf->state ? '*' : ' '), i,
- (u32) perf->states[i].core_frequency,
- (u32) perf->states[i].power,
- (u32) perf->states[i].transition_latency);
-
- cpufreq_frequency_table_get_attr(data->freq_table, policy->cpu);
-
- /*
- * the first call to ->target() should result in us actually
- * writing something to the appropriate registers.
- */
- data->resume = 1;
-
- return result;
-
-err_freqfree:
- kfree(data->freq_table);
-err_unreg:
- acpi_processor_unregister_performance(perf, cpu);
-err_free:
- kfree(data);
- per_cpu(drv_data, cpu) = NULL;
-
- return result;
-}
-
-static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
-{
- struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu);
-
- dprintk("acpi_cpufreq_cpu_exit\n");
-
- if (data) {
- cpufreq_frequency_table_put_attr(policy->cpu);
- per_cpu(drv_data, policy->cpu) = NULL;
- acpi_processor_unregister_performance(data->acpi_data,
- policy->cpu);
- kfree(data);
- }
-
- return 0;
-}
-
-static int acpi_cpufreq_resume(struct cpufreq_policy *policy)
-{
- struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu);
-
- dprintk("acpi_cpufreq_resume\n");
-
- data->resume = 1;
-
- return 0;
-}
-
-static struct freq_attr *acpi_cpufreq_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-static struct cpufreq_driver acpi_cpufreq_driver = {
- .verify = acpi_cpufreq_verify,
- .target = acpi_cpufreq_target,
- .init = acpi_cpufreq_cpu_init,
- .exit = acpi_cpufreq_cpu_exit,
- .resume = acpi_cpufreq_resume,
- .name = "acpi-cpufreq",
- .owner = THIS_MODULE,
- .attr = acpi_cpufreq_attr,
-};
-
-static int __init acpi_cpufreq_init(void)
-{
- int ret;
-
- if (acpi_disabled)
- return 0;
-
- dprintk("acpi_cpufreq_init\n");
-
- ret = acpi_cpufreq_early_init();
- if (ret)
- return ret;
-
- ret = cpufreq_register_driver(&acpi_cpufreq_driver);
- if (ret)
- free_acpi_perf_data();
-
- return ret;
-}
-
-static void __exit acpi_cpufreq_exit(void)
-{
- dprintk("acpi_cpufreq_exit\n");
-
- cpufreq_unregister_driver(&acpi_cpufreq_driver);
-
- free_percpu(acpi_perf_data);
-}
-
-module_param(acpi_pstate_strict, uint, 0644);
-MODULE_PARM_DESC(acpi_pstate_strict,
- "value 0 or non-zero. non-zero -> strict ACPI checks are "
- "performed during frequency changes.");
-
-late_initcall(acpi_cpufreq_init);
-module_exit(acpi_cpufreq_exit);
-
-MODULE_ALIAS("acpi");
diff --git a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
deleted file mode 100644
index 733093d60436..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
+++ /dev/null
@@ -1,446 +0,0 @@
-/*
- * (C) 2004-2006 Sebastian Witt <se.witt@gmx.net>
- *
- * Licensed under the terms of the GNU GPL License version 2.
- * Based upon reverse engineered information
- *
- * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/pci.h>
-#include <linux/delay.h>
-
-#define NFORCE2_XTAL 25
-#define NFORCE2_BOOTFSB 0x48
-#define NFORCE2_PLLENABLE 0xa8
-#define NFORCE2_PLLREG 0xa4
-#define NFORCE2_PLLADR 0xa0
-#define NFORCE2_PLL(mul, div) (0x100000 | (mul << 8) | div)
-
-#define NFORCE2_MIN_FSB 50
-#define NFORCE2_SAFE_DISTANCE 50
-
-/* Delay in ms between FSB changes */
-/* #define NFORCE2_DELAY 10 */
-
-/*
- * nforce2_chipset:
- * FSB is changed using the chipset
- */
-static struct pci_dev *nforce2_dev;
-
-/* fid:
- * multiplier * 10
- */
-static int fid;
-
-/* min_fsb, max_fsb:
- * minimum and maximum FSB (= FSB at boot time)
- */
-static int min_fsb;
-static int max_fsb;
-
-MODULE_AUTHOR("Sebastian Witt <se.witt@gmx.net>");
-MODULE_DESCRIPTION("nForce2 FSB changing cpufreq driver");
-MODULE_LICENSE("GPL");
-
-module_param(fid, int, 0444);
-module_param(min_fsb, int, 0444);
-
-MODULE_PARM_DESC(fid, "CPU multiplier to use (11.5 = 115)");
-MODULE_PARM_DESC(min_fsb,
- "Minimum FSB to use, if not defined: current FSB - 50");
-
-#define PFX "cpufreq-nforce2: "
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "cpufreq-nforce2", msg)
-
-/**
- * nforce2_calc_fsb - calculate FSB
- * @pll: PLL value
- *
- * Calculates FSB from PLL value
- */
-static int nforce2_calc_fsb(int pll)
-{
- unsigned char mul, div;
-
- mul = (pll >> 8) & 0xff;
- div = pll & 0xff;
-
- if (div > 0)
- return NFORCE2_XTAL * mul / div;
-
- return 0;
-}
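
Worked example of the relation above: the PLL register packs a multiplier and a divider, and the FSB in MHz is recovered as 25 MHz * mul / div. A small stand-alone sketch with an assumed multiplier/divider pair:

    #include <stdio.h>

    #define XTAL 25                                    /* MHz reference clock */
    #define PLL(mul, div) (0x100000 | ((mul) << 8) | (div))

    /* recover the FSB (MHz) from a packed PLL value: xtal * mul / div */
    static int calc_fsb(int pll)
    {
        unsigned char mul = (pll >> 8) & 0xff;
        unsigned char div = pll & 0xff;

        return div ? XTAL * mul / div : 0;
    }

    int main(void)
    {
        int pll = PLL(32, 4);        /* assumed values, purely illustrative */

        printf("PLL 0x%06x -> FSB %d MHz\n", pll, calc_fsb(pll)); /* 200 MHz */
        return 0;
    }
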
-
-/**
- * nforce2_calc_pll - calculate PLL value
- * @fsb: FSB
- *
- * Calculate PLL value for given FSB
- */
-static int nforce2_calc_pll(unsigned int fsb)
-{
- unsigned char xmul, xdiv;
- unsigned char mul = 0, div = 0;
- int tried = 0;
-
- /* Try to calculate multiplier and divider up to 4 times */
- while (((mul == 0) || (div == 0)) && (tried <= 3)) {
- for (xdiv = 2; xdiv <= 0x80; xdiv++)
- for (xmul = 1; xmul <= 0xfe; xmul++)
- if (nforce2_calc_fsb(NFORCE2_PLL(xmul, xdiv)) ==
- fsb + tried) {
- mul = xmul;
- div = xdiv;
- }
- tried++;
- }
-
- if ((mul == 0) || (div == 0))
- return -1;
-
- return NFORCE2_PLL(mul, div);
-}
-
-/**
- * nforce2_write_pll - write PLL value to chipset
- * @pll: PLL value
- *
- * Writes new FSB PLL value to chipset
- */
-static void nforce2_write_pll(int pll)
-{
- int temp;
-
- /* Set the pll addr. to 0x00 */
- pci_write_config_dword(nforce2_dev, NFORCE2_PLLADR, 0);
-
- /* Now write the value in all 64 registers */
- for (temp = 0; temp <= 0x3f; temp++)
- pci_write_config_dword(nforce2_dev, NFORCE2_PLLREG, pll);
-
- return;
-}
-
-/**
- * nforce2_fsb_read - Read FSB
- *
- * Read FSB from chipset
- * If bootfsb != 0, return FSB at boot-time
- */
-static unsigned int nforce2_fsb_read(int bootfsb)
-{
- struct pci_dev *nforce2_sub5;
- u32 fsb, temp = 0;
-
- /* Get chipset boot FSB from subdevice 5 (FSB at boot-time) */
- nforce2_sub5 = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, 0x01EF,
- PCI_ANY_ID, PCI_ANY_ID, NULL);
- if (!nforce2_sub5)
- return 0;
-
- pci_read_config_dword(nforce2_sub5, NFORCE2_BOOTFSB, &fsb);
- fsb /= 1000000;
-
- /* Check if PLL register is already set */
- pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
-
- if (bootfsb || !temp)
- return fsb;
-
- /* Use PLL register FSB value */
- pci_read_config_dword(nforce2_dev, NFORCE2_PLLREG, &temp);
- fsb = nforce2_calc_fsb(temp);
-
- return fsb;
-}
-
-/**
- * nforce2_set_fsb - set new FSB
- * @fsb: New FSB
- *
- * Sets new FSB
- */
-static int nforce2_set_fsb(unsigned int fsb)
-{
- u32 temp = 0;
- unsigned int tfsb;
- int diff;
- int pll = 0;
-
- if ((fsb > max_fsb) || (fsb < NFORCE2_MIN_FSB)) {
- printk(KERN_ERR PFX "FSB %d is out of range!\n", fsb);
- return -EINVAL;
- }
-
- tfsb = nforce2_fsb_read(0);
- if (!tfsb) {
- printk(KERN_ERR PFX "Error while reading the FSB\n");
- return -EINVAL;
- }
-
- /* First write? Then set actual value */
- pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
- if (!temp) {
- pll = nforce2_calc_pll(tfsb);
-
- if (pll < 0)
- return -EINVAL;
-
- nforce2_write_pll(pll);
- }
-
- /* Enable write access */
- temp = 0x01;
- pci_write_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8)temp);
-
- diff = tfsb - fsb;
-
- if (!diff)
- return 0;
-
- while ((tfsb != fsb) && (tfsb <= max_fsb) && (tfsb >= min_fsb)) {
- if (diff < 0)
- tfsb++;
- else
- tfsb--;
-
- /* Calculate the PLL reg. value */
- pll = nforce2_calc_pll(tfsb);
- if (pll == -1)
- return -EINVAL;
-
- nforce2_write_pll(pll);
-#ifdef NFORCE2_DELAY
- mdelay(NFORCE2_DELAY);
-#endif
- }
-
- temp = 0x40;
- pci_write_config_byte(nforce2_dev, NFORCE2_PLLADR, (u8)temp);
-
- return 0;
-}
-
-/**
- * nforce2_get - get the CPU frequency
- * @cpu: CPU number
- *
- * Returns the CPU frequency
- */
-static unsigned int nforce2_get(unsigned int cpu)
-{
- if (cpu)
- return 0;
- return nforce2_fsb_read(0) * fid * 100;
-}
-
-/**
- * nforce2_target - set a new CPUFreq policy
- * @policy: new policy
- * @target_freq: the target frequency
- * @relation: how that frequency relates to achieved frequency
- * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
- *
- * Sets a new CPUFreq policy.
- */
-static int nforce2_target(struct cpufreq_policy *policy,
- unsigned int target_freq, unsigned int relation)
-{
-/* unsigned long flags; */
- struct cpufreq_freqs freqs;
- unsigned int target_fsb;
-
- if ((target_freq > policy->max) || (target_freq < policy->min))
- return -EINVAL;
-
- target_fsb = target_freq / (fid * 100);
-
- freqs.old = nforce2_get(policy->cpu);
- freqs.new = target_fsb * fid * 100;
- freqs.cpu = 0; /* Only one CPU on nForce2 platforms */
-
- if (freqs.old == freqs.new)
- return 0;
-
- dprintk("Old CPU frequency %d kHz, new %d kHz\n",
- freqs.old, freqs.new);
-
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-
- /* Disable IRQs */
- /* local_irq_save(flags); */
-
- if (nforce2_set_fsb(target_fsb) < 0)
- printk(KERN_ERR PFX "Changing FSB to %d failed\n",
- target_fsb);
- else
- dprintk("Changed FSB successfully to %d\n",
- target_fsb);
-
- /* Enable IRQs */
- /* local_irq_restore(flags); */
-
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-
- return 0;
-}
-
-/**
- * nforce2_verify - verifies a new CPUFreq policy
- * @policy: new policy
- */
-static int nforce2_verify(struct cpufreq_policy *policy)
-{
- unsigned int fsb_pol_max;
-
- fsb_pol_max = policy->max / (fid * 100);
-
- if (policy->min < (fsb_pol_max * fid * 100))
- policy->max = (fsb_pol_max + 1) * fid * 100;
-
- cpufreq_verify_within_limits(policy,
- policy->cpuinfo.min_freq,
- policy->cpuinfo.max_freq);
- return 0;
-}
-
-static int nforce2_cpu_init(struct cpufreq_policy *policy)
-{
- unsigned int fsb;
- unsigned int rfid;
-
- /* capability check */
- if (policy->cpu != 0)
- return -ENODEV;
-
- /* Get current FSB */
- fsb = nforce2_fsb_read(0);
-
- if (!fsb)
- return -EIO;
-
- /* FIX: Get FID from CPU */
- if (!fid) {
- if (!cpu_khz) {
- printk(KERN_WARNING PFX
- "cpu_khz not set, can't calculate multiplier!\n");
- return -ENODEV;
- }
-
- fid = cpu_khz / (fsb * 100);
- rfid = fid % 5;
-
- if (rfid) {
- if (rfid > 2)
- fid += 5 - rfid;
- else
- fid -= rfid;
- }
- }
-
- printk(KERN_INFO PFX "FSB currently at %i MHz, FID %d.%d\n", fsb,
- fid / 10, fid % 10);
-
- /* Set maximum FSB to FSB at boot time */
- max_fsb = nforce2_fsb_read(1);
-
- if (!max_fsb)
- return -EIO;
-
- if (!min_fsb)
- min_fsb = max_fsb - NFORCE2_SAFE_DISTANCE;
-
- if (min_fsb < NFORCE2_MIN_FSB)
- min_fsb = NFORCE2_MIN_FSB;
-
- /* cpuinfo and default policy values */
- policy->cpuinfo.min_freq = min_fsb * fid * 100;
- policy->cpuinfo.max_freq = max_fsb * fid * 100;
- policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
- policy->cur = nforce2_get(policy->cpu);
- policy->min = policy->cpuinfo.min_freq;
- policy->max = policy->cpuinfo.max_freq;
-
- return 0;
-}
-
-static int nforce2_cpu_exit(struct cpufreq_policy *policy)
-{
- return 0;
-}
-
-static struct cpufreq_driver nforce2_driver = {
- .name = "nforce2",
- .verify = nforce2_verify,
- .target = nforce2_target,
- .get = nforce2_get,
- .init = nforce2_cpu_init,
- .exit = nforce2_cpu_exit,
- .owner = THIS_MODULE,
-};
-
-/**
- * nforce2_detect_chipset - detect the Southbridge which contains FSB PLL logic
- *
- * Detects nForce2 A2 and C1 stepping
- *
- */
-static unsigned int nforce2_detect_chipset(void)
-{
- nforce2_dev = pci_get_subsys(PCI_VENDOR_ID_NVIDIA,
- PCI_DEVICE_ID_NVIDIA_NFORCE2,
- PCI_ANY_ID, PCI_ANY_ID, NULL);
-
- if (nforce2_dev == NULL)
- return -ENODEV;
-
- printk(KERN_INFO PFX "Detected nForce2 chipset revision %X\n",
- nforce2_dev->revision);
- printk(KERN_INFO PFX
- "FSB changing may be unstable and can lead to "
- "crashes and data loss.\n");
-
- return 0;
-}
-
-/**
- * nforce2_init - initializes the nForce2 CPUFreq driver
- *
- * Initializes the nForce2 FSB support. Returns -ENODEV on unsupported
- * devices, -EINVAL on problems during initialization, and zero on
- * success.
- */
-static int __init nforce2_init(void)
-{
- /* TODO: do we need to detect the processor? */
-
- /* detect chipset */
- if (nforce2_detect_chipset()) {
- printk(KERN_INFO PFX "No nForce2 chipset.\n");
- return -ENODEV;
- }
-
- return cpufreq_register_driver(&nforce2_driver);
-}
-
-/**
- * nforce2_exit - unregisters cpufreq module
- *
- * Unregisters nForce2 FSB change support.
- */
-static void __exit nforce2_exit(void)
-{
- cpufreq_unregister_driver(&nforce2_driver);
-}
-
-module_init(nforce2_init);
-module_exit(nforce2_exit);
-
diff --git a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
deleted file mode 100644
index 35a257dd4bb7..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
+++ /dev/null
@@ -1,367 +0,0 @@
-/*
- * Based on documentation provided by Dave Jones. Thanks!
- *
- * Licensed under the terms of the GNU GPL License version 2.
- *
- * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/ioport.h>
-#include <linux/slab.h>
-#include <linux/timex.h>
-#include <linux/io.h>
-#include <linux/delay.h>
-
-#include <asm/msr.h>
-#include <asm/tsc.h>
-
-#define EPS_BRAND_C7M 0
-#define EPS_BRAND_C7 1
-#define EPS_BRAND_EDEN 2
-#define EPS_BRAND_C3 3
-#define EPS_BRAND_C7D 4
-
-struct eps_cpu_data {
- u32 fsb;
- struct cpufreq_frequency_table freq_table[];
-};
-
-static struct eps_cpu_data *eps_cpu[NR_CPUS];
-
-
-static unsigned int eps_get(unsigned int cpu)
-{
- struct eps_cpu_data *centaur;
- u32 lo, hi;
-
- if (cpu)
- return 0;
- centaur = eps_cpu[cpu];
- if (centaur == NULL)
- return 0;
-
- /* Return current frequency */
- rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
- return centaur->fsb * ((lo >> 8) & 0xff);
-}
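
eps_get() derives the running frequency from IA32_PERF_STATUS: bits 15:8 of the low word hold the current multiplier, and bits 7:0 hold the VID (the init code further down converts it as mV = vid * 16 + 700). A small sketch decoding a made-up register value, assuming a 100 MHz FSB:

    #include <stdio.h>

    int main(void)
    {
        unsigned int lo = 0x0912;             /* made-up IA32_PERF_STATUS low word */
        unsigned int vid = lo & 0xff;         /* voltage ID */
        unsigned int mult = (lo >> 8) & 0xff; /* current multiplier */
        unsigned int fsb = 100000;            /* assumed FSB in kHz */

        printf("voltage %u mV, multiplier %u, frequency %u kHz\n",
               vid * 16 + 700, mult, fsb * mult); /* 988 mV, 9, 900000 kHz */
        return 0;
    }
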
-
-static int eps_set_state(struct eps_cpu_data *centaur,
- unsigned int cpu,
- u32 dest_state)
-{
- struct cpufreq_freqs freqs;
- u32 lo, hi;
- int err = 0;
- int i;
-
- freqs.old = eps_get(cpu);
- freqs.new = centaur->fsb * ((dest_state >> 8) & 0xff);
- freqs.cpu = cpu;
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-
- /* Wait while CPU is busy */
- rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
- i = 0;
- while (lo & ((1 << 16) | (1 << 17))) {
- udelay(16);
- rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
- i++;
- if (unlikely(i > 64)) {
- err = -ENODEV;
- goto postchange;
- }
- }
- /* Set new multiplier and voltage */
- wrmsr(MSR_IA32_PERF_CTL, dest_state & 0xffff, 0);
- /* Wait until transition end */
- i = 0;
- do {
- udelay(16);
- rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
- i++;
- if (unlikely(i > 64)) {
- err = -ENODEV;
- goto postchange;
- }
- } while (lo & ((1 << 16) | (1 << 17)));
-
- /* Return current frequency */
-postchange:
- rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
- freqs.new = centaur->fsb * ((lo >> 8) & 0xff);
-
-#ifdef DEBUG
- {
- u8 current_multiplier, current_voltage;
-
- /* Print voltage and multiplier */
- rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
- current_voltage = lo & 0xff;
- printk(KERN_INFO "eps: Current voltage = %dmV\n",
- current_voltage * 16 + 700);
- current_multiplier = (lo >> 8) & 0xff;
- printk(KERN_INFO "eps: Current multiplier = %d\n",
- current_multiplier);
- }
-#endif
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
- return err;
-}
-
-static int eps_target(struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
-{
- struct eps_cpu_data *centaur;
- unsigned int newstate = 0;
- unsigned int cpu = policy->cpu;
- unsigned int dest_state;
- int ret;
-
- if (unlikely(eps_cpu[cpu] == NULL))
- return -ENODEV;
- centaur = eps_cpu[cpu];
-
- if (unlikely(cpufreq_frequency_table_target(policy,
- &eps_cpu[cpu]->freq_table[0],
- target_freq,
- relation,
- &newstate))) {
- return -EINVAL;
- }
-
- /* Make frequency transition */
- dest_state = centaur->freq_table[newstate].index & 0xffff;
- ret = eps_set_state(centaur, cpu, dest_state);
- if (ret)
- printk(KERN_ERR "eps: Timeout!\n");
- return ret;
-}
-
-static int eps_verify(struct cpufreq_policy *policy)
-{
- return cpufreq_frequency_table_verify(policy,
- &eps_cpu[policy->cpu]->freq_table[0]);
-}
-
-static int eps_cpu_init(struct cpufreq_policy *policy)
-{
- unsigned int i;
- u32 lo, hi;
- u64 val;
- u8 current_multiplier, current_voltage;
- u8 max_multiplier, max_voltage;
- u8 min_multiplier, min_voltage;
- u8 brand = 0;
- u32 fsb;
- struct eps_cpu_data *centaur;
- struct cpuinfo_x86 *c = &cpu_data(0);
- struct cpufreq_frequency_table *f_table;
- int k, step, voltage;
- int ret;
- int states;
-
- if (policy->cpu != 0)
- return -ENODEV;
-
- /* Check brand */
- printk(KERN_INFO "eps: Detected VIA ");
-
- switch (c->x86_model) {
- case 10:
- rdmsr(0x1153, lo, hi);
- brand = (((lo >> 2) ^ lo) >> 18) & 3;
- printk(KERN_CONT "Model A ");
- break;
- case 13:
- rdmsr(0x1154, lo, hi);
- brand = (((lo >> 4) ^ (lo >> 2))) & 0x000000ff;
- printk(KERN_CONT "Model D ");
- break;
- }
-
- switch (brand) {
- case EPS_BRAND_C7M:
- printk(KERN_CONT "C7-M\n");
- break;
- case EPS_BRAND_C7:
- printk(KERN_CONT "C7\n");
- break;
- case EPS_BRAND_EDEN:
- printk(KERN_CONT "Eden\n");
- break;
- case EPS_BRAND_C7D:
- printk(KERN_CONT "C7-D\n");
- break;
- case EPS_BRAND_C3:
- printk(KERN_CONT "C3\n");
- return -ENODEV;
- break;
- }
- /* Enable Enhanced PowerSaver */
- rdmsrl(MSR_IA32_MISC_ENABLE, val);
- if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
- val |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP;
- wrmsrl(MSR_IA32_MISC_ENABLE, val);
- /* Can be locked at 0 */
- rdmsrl(MSR_IA32_MISC_ENABLE, val);
- if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
- printk(KERN_INFO "eps: Can't enable Enhanced PowerSaver\n");
- return -ENODEV;
- }
- }
-
- /* Print voltage and multiplier */
- rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
- current_voltage = lo & 0xff;
- printk(KERN_INFO "eps: Current voltage = %dmV\n",
- current_voltage * 16 + 700);
- current_multiplier = (lo >> 8) & 0xff;
- printk(KERN_INFO "eps: Current multiplier = %d\n", current_multiplier);
-
- /* Print limits */
- max_voltage = hi & 0xff;
- printk(KERN_INFO "eps: Highest voltage = %dmV\n",
- max_voltage * 16 + 700);
- max_multiplier = (hi >> 8) & 0xff;
- printk(KERN_INFO "eps: Highest multiplier = %d\n", max_multiplier);
- min_voltage = (hi >> 16) & 0xff;
- printk(KERN_INFO "eps: Lowest voltage = %dmV\n",
- min_voltage * 16 + 700);
- min_multiplier = (hi >> 24) & 0xff;
- printk(KERN_INFO "eps: Lowest multiplier = %d\n", min_multiplier);
-
- /* Sanity checks */
- if (current_multiplier == 0 || max_multiplier == 0
- || min_multiplier == 0)
- return -EINVAL;
- if (current_multiplier > max_multiplier
- || max_multiplier <= min_multiplier)
- return -EINVAL;
- if (current_voltage > 0x1f || max_voltage > 0x1f)
- return -EINVAL;
- if (max_voltage < min_voltage)
- return -EINVAL;
-
- /* Calc FSB speed */
- fsb = cpu_khz / current_multiplier;
- /* Calc number of p-states supported */
- if (brand == EPS_BRAND_C7M)
- states = max_multiplier - min_multiplier + 1;
- else
- states = 2;
-
- /* Allocate private data and frequency table for current cpu */
- centaur = kzalloc(sizeof(struct eps_cpu_data)
- + (states + 1) * sizeof(struct cpufreq_frequency_table),
- GFP_KERNEL);
- if (!centaur)
- return -ENOMEM;
- eps_cpu[0] = centaur;
-
- /* Copy basic values */
- centaur->fsb = fsb;
-
- /* Fill frequency and MSR value table */
- f_table = &centaur->freq_table[0];
- if (brand != EPS_BRAND_C7M) {
- f_table[0].frequency = fsb * min_multiplier;
- f_table[0].index = (min_multiplier << 8) | min_voltage;
- f_table[1].frequency = fsb * max_multiplier;
- f_table[1].index = (max_multiplier << 8) | max_voltage;
- f_table[2].frequency = CPUFREQ_TABLE_END;
- } else {
- k = 0;
- step = ((max_voltage - min_voltage) * 256)
- / (max_multiplier - min_multiplier);
- for (i = min_multiplier; i <= max_multiplier; i++) {
- voltage = (k * step) / 256 + min_voltage;
- f_table[k].frequency = fsb * i;
- f_table[k].index = (i << 8) | voltage;
- k++;
- }
- f_table[k].frequency = CPUFREQ_TABLE_END;
- }
-
- policy->cpuinfo.transition_latency = 140000; /* 844mV -> 700mV in ns */
- policy->cur = fsb * current_multiplier;
-
- ret = cpufreq_frequency_table_cpuinfo(policy, &centaur->freq_table[0]);
- if (ret) {
- kfree(centaur);
- return ret;
- }
-
- cpufreq_frequency_table_get_attr(&centaur->freq_table[0], policy->cpu);
- return 0;
-}
-
-static int eps_cpu_exit(struct cpufreq_policy *policy)
-{
- unsigned int cpu = policy->cpu;
- struct eps_cpu_data *centaur;
- u32 lo, hi;
-
- if (eps_cpu[cpu] == NULL)
- return -ENODEV;
- centaur = eps_cpu[cpu];
-
- /* Get max frequency */
- rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
- /* Set max frequency */
- eps_set_state(centaur, cpu, hi & 0xffff);
- /* Bye */
- cpufreq_frequency_table_put_attr(policy->cpu);
- kfree(eps_cpu[cpu]);
- eps_cpu[cpu] = NULL;
- return 0;
-}
-
-static struct freq_attr *eps_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-static struct cpufreq_driver eps_driver = {
- .verify = eps_verify,
- .target = eps_target,
- .init = eps_cpu_init,
- .exit = eps_cpu_exit,
- .get = eps_get,
- .name = "e_powersaver",
- .owner = THIS_MODULE,
- .attr = eps_attr,
-};
-
-static int __init eps_init(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
-
- /* This driver will work only on Centaur C7 processors with
- * Enhanced SpeedStep/PowerSaver registers */
- if (c->x86_vendor != X86_VENDOR_CENTAUR
- || c->x86 != 6 || c->x86_model < 10)
- return -ENODEV;
- if (!cpu_has(c, X86_FEATURE_EST))
- return -ENODEV;
-
- if (cpufreq_register_driver(&eps_driver))
- return -EINVAL;
- return 0;
-}
-
-static void __exit eps_exit(void)
-{
- cpufreq_unregister_driver(&eps_driver);
-}
-
-MODULE_AUTHOR("Rafal Bilski <rafalbilski@interia.pl>");
-MODULE_DESCRIPTION("Enhanced PowerSaver driver for VIA C7 CPU's.");
-MODULE_LICENSE("GPL");
-
-module_init(eps_init);
-module_exit(eps_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
deleted file mode 100644
index 006b278b0d5d..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c
+++ /dev/null
@@ -1,310 +0,0 @@
-/*
- * elanfreq: cpufreq driver for the AMD ELAN family
- *
- * (c) Copyright 2002 Robert Schwebel <r.schwebel@pengutronix.de>
- *
- * Parts of this code are (c) Sven Geggus <sven@geggus.net>
- *
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * 2002-02-13: - initial revision for 2.4.18-pre9 by Robert Schwebel
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-
-#include <linux/slab.h>
-#include <linux/delay.h>
-#include <linux/cpufreq.h>
-
-#include <asm/msr.h>
-#include <linux/timex.h>
-#include <linux/io.h>
-
-#define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */
-#define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */
-
-/* Module parameter */
-static int max_freq;
-
-struct s_elan_multiplier {
- int clock; /* frequency in kHz */
- int val40h; /* PMU Force Mode register */
- int val80h; /* CPU Clock Speed Register */
-};
-
-/*
- * It is important that the frequencies
- * are listed in ascending order here!
- */
-static struct s_elan_multiplier elan_multiplier[] = {
- {1000, 0x02, 0x18},
- {2000, 0x02, 0x10},
- {4000, 0x02, 0x08},
- {8000, 0x00, 0x00},
- {16000, 0x00, 0x02},
- {33000, 0x00, 0x04},
- {66000, 0x01, 0x04},
- {99000, 0x01, 0x05}
-};
-
-static struct cpufreq_frequency_table elanfreq_table[] = {
- {0, 1000},
- {1, 2000},
- {2, 4000},
- {3, 8000},
- {4, 16000},
- {5, 33000},
- {6, 66000},
- {7, 99000},
- {0, CPUFREQ_TABLE_END},
-};
-
-
-/**
- * elanfreq_get_cpu_frequency: determine current cpu speed
- *
- * Determines the frequency at which the CPU of the Elan SoC is
- * currently running. Frequencies from 1 to 33 MHz are generated
- * the normal way; 66 and 99 MHz use "Hyperspeed Mode" and leave
- * the rest of the chip running at 33 MHz.
- */
-
-static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
-{
- u8 clockspeed_reg; /* Clock Speed Register */
-
- local_irq_disable();
- outb_p(0x80, REG_CSCIR);
- clockspeed_reg = inb_p(REG_CSCDR);
- local_irq_enable();
-
- if ((clockspeed_reg & 0xE0) == 0xE0)
- return 0;
-
- /* Are we in CPU clock multiplied mode (66/99 MHz)? */
- if ((clockspeed_reg & 0xE0) == 0xC0) {
- if ((clockspeed_reg & 0x01) == 0)
- return 66000;
- else
- return 99000;
- }
-
- /* 33 MHz is not 32 MHz... */
- if ((clockspeed_reg & 0xE0) == 0xA0)
- return 33000;
-
- return (1<<((clockspeed_reg & 0xE0) >> 5)) * 1000;
-}
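
Only the top three bits of the clock speed register matter here: 111 is invalid, 110 selects Hyperspeed mode (66 or 99 MHz depending on bit 0), 101 means 33 MHz, and anything else yields (1 << bits[7:5]) MHz. A stand-alone sketch of the same decoding, fed a few illustrative register values:

    #include <stdio.h>

    static unsigned int decode(unsigned char reg)       /* returns kHz */
    {
        if ((reg & 0xE0) == 0xE0)
            return 0;                                   /* invalid */
        if ((reg & 0xE0) == 0xC0)                       /* hyperspeed mode */
            return (reg & 0x01) ? 99000 : 66000;
        if ((reg & 0xE0) == 0xA0)
            return 33000;                               /* 33 MHz, not 32 */
        return (1 << ((reg & 0xE0) >> 5)) * 1000;
    }

    int main(void)
    {
        printf("0x40 -> %u kHz\n", decode(0x40));       /* 010 -> 4 MHz */
        printf("0xA0 -> %u kHz\n", decode(0xA0));       /* 33 MHz */
        printf("0xC1 -> %u kHz\n", decode(0xC1));       /* hyperspeed, 99 MHz */
        return 0;
    }
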
-
-
-/**
- * elanfreq_set_cpu_state: Change the CPU core frequency
- * @state: index into elan_multiplier[]
- *
- * This function takes a frequency table index and changes the CPU
- * frequency accordingly. Note that the target frequency has to be
- * checked by elanfreq_verify() for correctness first!
- *
- * There is no return value.
- */
-
-static void elanfreq_set_cpu_state(unsigned int state)
-{
- struct cpufreq_freqs freqs;
-
- freqs.old = elanfreq_get_cpu_frequency(0);
- freqs.new = elan_multiplier[state].clock;
- freqs.cpu = 0; /* elanfreq.c is UP only driver */
-
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-
- printk(KERN_INFO "elanfreq: attempting to set frequency to %i kHz\n",
- elan_multiplier[state].clock);
-
-
- /*
- * Access to the Elan's internal registers is indexed via
- * 0x22: Chip Setup & Control Register Index Register (CSCI)
- * 0x23: Chip Setup & Control Register Data Register (CSCD)
- *
- */
-
- /*
- * 0x40 is the Power Management Unit's Force Mode Register.
- * Bit 6 enables Hyperspeed Mode (66/100 MHz core frequency)
- */
-
- local_irq_disable();
- outb_p(0x40, REG_CSCIR); /* Disable hyperspeed mode */
- outb_p(0x00, REG_CSCDR);
- local_irq_enable(); /* wait till internal pipelines and */
- udelay(1000); /* buffers have cleaned up */
-
- local_irq_disable();
-
- /* now, set the CPU clock speed register (0x80) */
- outb_p(0x80, REG_CSCIR);
- outb_p(elan_multiplier[state].val80h, REG_CSCDR);
-
- /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */
- outb_p(0x40, REG_CSCIR);
- outb_p(elan_multiplier[state].val40h, REG_CSCDR);
- udelay(10000);
- local_irq_enable();
-
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-};
-
-
-/**
- * elanfreq_verify: test if frequency range is valid
- * @policy: the policy to validate
- *
- * This function checks if a given frequency range in kHz is valid
- * for the hardware supported by the driver.
- */
-
-static int elanfreq_verify(struct cpufreq_policy *policy)
-{
- return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]);
-}
-
-static int elanfreq_target(struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
-{
- unsigned int newstate = 0;
-
- if (cpufreq_frequency_table_target(policy, &elanfreq_table[0],
- target_freq, relation, &newstate))
- return -EINVAL;
-
- elanfreq_set_cpu_state(newstate);
-
- return 0;
-}
-
-
-/*
- * Module init and exit code
- */
-
-static int elanfreq_cpu_init(struct cpufreq_policy *policy)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
- unsigned int i;
- int result;
-
- /* capability check */
- if ((c->x86_vendor != X86_VENDOR_AMD) ||
- (c->x86 != 4) || (c->x86_model != 10))
- return -ENODEV;
-
- /* max freq */
- if (!max_freq)
- max_freq = elanfreq_get_cpu_frequency(0);
-
- /* table init */
- for (i = 0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) {
- if (elanfreq_table[i].frequency > max_freq)
- elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID;
- }
-
- /* cpuinfo and default policy values */
- policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
- policy->cur = elanfreq_get_cpu_frequency(0);
-
- result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table);
- if (result)
- return result;
-
- cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu);
- return 0;
-}
-
-
-static int elanfreq_cpu_exit(struct cpufreq_policy *policy)
-{
- cpufreq_frequency_table_put_attr(policy->cpu);
- return 0;
-}
-
-
-#ifndef MODULE
-/**
- * elanfreq_setup - elanfreq command line parameter parsing
- *
- * elanfreq command line parameter. Use:
- * elanfreq=66000
- * to set the maximum CPU frequency to 66 MHz. Note that in
- * case you do not give this boot parameter, the maximum
- * frequency will fall back to _current_ CPU frequency which
- * might be lower. If you build this as a module, use the
- * max_freq module parameter instead.
- */
-static int __init elanfreq_setup(char *str)
-{
- max_freq = simple_strtoul(str, &str, 0);
- printk(KERN_WARNING "You're using the deprecated elanfreq command line option. Use elanfreq.max_freq instead, please!\n");
- return 1;
-}
-__setup("elanfreq=", elanfreq_setup);
-#endif
-
-
-static struct freq_attr *elanfreq_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-
-static struct cpufreq_driver elanfreq_driver = {
- .get = elanfreq_get_cpu_frequency,
- .verify = elanfreq_verify,
- .target = elanfreq_target,
- .init = elanfreq_cpu_init,
- .exit = elanfreq_cpu_exit,
- .name = "elanfreq",
- .owner = THIS_MODULE,
- .attr = elanfreq_attr,
-};
-
-
-static int __init elanfreq_init(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
-
- /* Test if we have the right hardware */
- if ((c->x86_vendor != X86_VENDOR_AMD) ||
- (c->x86 != 4) || (c->x86_model != 10)) {
- printk(KERN_INFO "elanfreq: error: no Elan processor found!\n");
- return -ENODEV;
- }
- return cpufreq_register_driver(&elanfreq_driver);
-}
-
-
-static void __exit elanfreq_exit(void)
-{
- cpufreq_unregister_driver(&elanfreq_driver);
-}
-
-
-module_param(max_freq, int, 0444);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, "
- "Sven Geggus <sven@geggus.net>");
-MODULE_DESCRIPTION("cpufreq driver for AMD's Elan CPUs");
-
-module_init(elanfreq_init);
-module_exit(elanfreq_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
deleted file mode 100644
index ac27ec2264d5..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
+++ /dev/null
@@ -1,519 +0,0 @@
-/*
- * Cyrix MediaGX and NatSemi Geode Suspend Modulation
- * (C) 2002 Zwane Mwaikambo <zwane@commfireservices.com>
- * (C) 2002 Hiroshi Miura <miura@da-cha.org>
- * All Rights Reserved
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation
- *
- * The author(s) of this software shall not be held liable for damages
- * of any nature resulting due to the use of this software. This
- * software is provided AS-IS with no warranties.
- *
- * Theoretical note:
- *
- * (see Geode(tm) CS5530 manual (rev.4.1) page.56)
- *
- * CPU frequency control on the NatSemi Geode GX1/GXLV processor and CS55x0
- * is based on Suspend Modulation.
- *
- * Suspend Modulation works by asserting and de-asserting the SUSP# pin
- * to the CPU (GX1/GXLV) for configurable durations. While SUSP# is
- * asserted, the CPU enters an idle state: the GX1 stops its core clock,
- * so power consumption is reduced.
- *
- * Suspend Modulation's OFF/ON durations are configurable
- * with the 'Suspend Modulation OFF Count Register'
- * and the 'Suspend Modulation ON Count Register'.
- * These registers are 8-bit counters that represent the number of
- * 32us intervals for which the SUSP# pin is asserted (ON) or
- * de-asserted (OFF) to the processor.
- *
- * These counters define a ratio which is the effective frequency
- * of operation of the system.
- *
- * OFF Count
- * F_eff = Fgx * ----------------------
- * OFF Count + ON Count
- *
- * 0 <= On Count, Off Count <= 255
- *
- * From these limits, we can get register values
- *
- * off_duration + on_duration <= MAX_DURATION
- * on_duration = off_duration * (stock_freq - freq) / freq
- *
- * off_duration = (freq * DURATION) / stock_freq
- * on_duration = DURATION - off_duration
- *
- *
- *---------------------------------------------------------------------------
- *
- * ChangeLog:
- * Dec. 12, 2003 Hiroshi Miura <miura@da-cha.org>
- * - fix on/off register mistake
- * - fix cpu_khz calc when it stops cpu modulation.
- *
- * Dec. 11, 2002 Hiroshi Miura <miura@da-cha.org>
- * - rewrite for Cyrix MediaGX Cx5510/5520 and
- * NatSemi Geode Cs5530(A).
- *
- * Jul. ??, 2002 Zwane Mwaikambo <zwane@commfireservices.com>
- * - cs5530_mod patch for 2.4.19-rc1.
- *
- *---------------------------------------------------------------------------
- *
- * Todo
- * Test on machines with 5510, 5530, 5530A
- */
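
A quick numeric check of the duty-cycle formulas above, with assumed figures: for a 200 MHz part (stock_freq = 200000 kHz), a 150000 kHz request and the default DURATION of 255, off_duration works out to 191, on_duration to 64, and the effective frequency to about 149800 kHz:

    #include <stdio.h>

    int main(void)
    {
        unsigned int stock_freq = 200000;   /* kHz, assumed 200 MHz part */
        unsigned int target     = 150000;   /* kHz, requested speed */
        unsigned int duration   = 255;      /* default max_duration */

        unsigned int off = (target * duration) / stock_freq;   /* 191 */
        unsigned int on  = duration - off;                      /* 64 */
        unsigned int eff = (stock_freq * off) / (on + off);     /* ~149803 */

        printf("off=%u on=%u -> effective %u kHz\n", off, on, eff);
        return 0;
    }
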
-
-/************************************************************************
- * Suspend Modulation - Definitions *
- ************************************************************************/
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/cpufreq.h>
-#include <linux/pci.h>
-#include <linux/errno.h>
-
-#include <asm/processor-cyrix.h>
-
-/* PCI config registers, all at F0 */
-#define PCI_PMER1 0x80 /* power management enable register 1 */
-#define PCI_PMER2 0x81 /* power management enable register 2 */
-#define PCI_PMER3 0x82 /* power management enable register 3 */
-#define PCI_IRQTC 0x8c /* irq speedup timer counter register:typical 2 to 4ms */
-#define PCI_VIDTC 0x8d /* video speedup timer counter register: typical 50 to 100ms */
-#define PCI_MODOFF 0x94 /* suspend modulation OFF counter register, 1 = 32us */
-#define PCI_MODON 0x95 /* suspend modulation ON counter register */
-#define PCI_SUSCFG 0x96 /* suspend configuration register */
-
-/* PMER1 bits */
-#define GPM (1<<0) /* global power management */
-#define GIT (1<<1) /* globally enable PM device idle timers */
-#define GTR (1<<2) /* globally enable IO traps */
-#define IRQ_SPDUP (1<<3) /* disable clock throttle during interrupt handling */
-#define VID_SPDUP (1<<4) /* disable clock throttle during vga video handling */
-
-/* SUSCFG bits */
-#define SUSMOD (1<<0) /* enable/disable suspend modulation */
-/* the below is supported only with cs5530 (after rev.1.2)/cs5530A */
-#define SMISPDUP (1<<1) /* select how SMI re-enable suspend modulation: */
- /* IRQTC timer or read SMI speedup disable reg.(F1BAR[08-09h]) */
-#define SUSCFG (1<<2) /* enable powering down a GXLV processor. "Special 3Volt Suspend" mode */
-/* the below is supported only with cs5530A */
-#define PWRSVE_ISA (1<<3) /* stop ISA clock */
-#define PWRSVE (1<<4) /* active idle */
-
-struct gxfreq_params {
- u8 on_duration;
- u8 off_duration;
- u8 pci_suscfg;
- u8 pci_pmer1;
- u8 pci_pmer2;
- struct pci_dev *cs55x0;
-};
-
-static struct gxfreq_params *gx_params;
-static int stock_freq;
-
-/* PCI bus clock - defaults to 30.000 if cpu_khz is not available */
-static int pci_busclk;
-module_param(pci_busclk, int, 0444);
-
-/* maximum duration for which the cpu may be suspended
- * (32us * MAX_DURATION). If no parameter is given, this defaults
- * to 255.
- * Note that this leads to a maximum of 8 ms(!) where the CPU clock
- * is suspended -- processing power is just 0.39% of what it used to be,
- * though. 781.25 kHz(!) for a 200 MHz processor -- wow. */
-static int max_duration = 255;
-module_param(max_duration, int, 0444);
-
-/* For the default policy, we want at least some processing power
- * - let's say 5%. (min = maxfreq / POLICY_MIN_DIV)
- */
-#define POLICY_MIN_DIV 20
-
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "gx-suspmod", msg)
-
-/**
- * we can detect a core multiplier from dir0_lsb
- * from GX1 datasheet p.56,
- * MULT[3:0]:
- * 0000 = SYSCLK multiplied by 4 (test only)
- * 0001 = SYSCLK multiplied by 10
- * 0010 = SYSCLK multiplied by 4
- * 0011 = SYSCLK multiplied by 6
- * 0100 = SYSCLK multiplied by 9
- * 0101 = SYSCLK multiplied by 5
- * 0110 = SYSCLK multiplied by 7
- * 0111 = SYSCLK multiplied by 8
- * of 33.3MHz
- **/
-static int gx_freq_mult[16] = {
- 4, 10, 4, 6, 9, 5, 7, 8,
- 0, 0, 0, 0, 0, 0, 0, 0
-};
-
-
-/****************************************************************
- * Low Level chipset interface *
- ****************************************************************/
-static struct pci_device_id gx_chipset_tbl[] __initdata = {
- { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY,
- PCI_ANY_ID, PCI_ANY_ID },
- { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520,
- PCI_ANY_ID, PCI_ANY_ID },
- { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510,
- PCI_ANY_ID, PCI_ANY_ID },
- { 0, },
-};
-
-static void gx_write_byte(int reg, int value)
-{
- pci_write_config_byte(gx_params->cs55x0, reg, value);
-}
-
-/**
- * gx_detect_chipset:
- *
- **/
-static __init struct pci_dev *gx_detect_chipset(void)
-{
- struct pci_dev *gx_pci = NULL;
-
- /* check if CPU is a MediaGX or a Geode. */
- if ((boot_cpu_data.x86_vendor != X86_VENDOR_NSC) &&
- (boot_cpu_data.x86_vendor != X86_VENDOR_CYRIX)) {
- dprintk("error: no MediaGX/Geode processor found!\n");
- return NULL;
- }
-
- /* detect which companion chip is used */
- while ((gx_pci = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, gx_pci)) != NULL) {
- if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL)
- return gx_pci;
- }
-
- dprintk("error: no supported chipset found!\n");
- return NULL;
-}
-
-/**
- * gx_get_cpuspeed:
- *
- * Finds out the effective frequency at which the Cyrix MediaGX/NatSemi
- * Geode CPU runs.
- */
-static unsigned int gx_get_cpuspeed(unsigned int cpu)
-{
- if ((gx_params->pci_suscfg & SUSMOD) == 0)
- return stock_freq;
-
- return (stock_freq * gx_params->off_duration)
- / (gx_params->on_duration + gx_params->off_duration);
-}
-
-/**
- * gx_validate_speed:
- * find the closest achievable speed and the matching ON/OFF durations
- *
- **/
-
-static unsigned int gx_validate_speed(unsigned int khz, u8 *on_duration,
- u8 *off_duration)
-{
- unsigned int i;
- u8 tmp_on, tmp_off;
- int old_tmp_freq = stock_freq;
- int tmp_freq;
-
- *off_duration = 1;
- *on_duration = 0;
-
- for (i = max_duration; i > 0; i--) {
- tmp_off = ((khz * i) / stock_freq) & 0xff;
- tmp_on = i - tmp_off;
- tmp_freq = (stock_freq * tmp_off) / i;
- /* if this relation is closer to khz, use this. If it's equal,
- * prefer it, too - lower latency */
- if (abs(tmp_freq - khz) <= abs(old_tmp_freq - khz)) {
- *on_duration = tmp_on;
- *off_duration = tmp_off;
- old_tmp_freq = tmp_freq;
- }
- }
-
- return old_tmp_freq;
-}
-
-
-/**
- * gx_set_cpuspeed:
- * set cpu speed in khz.
- **/
-
-static void gx_set_cpuspeed(unsigned int khz)
-{
- u8 suscfg, pmer1;
- unsigned int new_khz;
- unsigned long flags;
- struct cpufreq_freqs freqs;
-
- freqs.cpu = 0;
- freqs.old = gx_get_cpuspeed(0);
-
- new_khz = gx_validate_speed(khz, &gx_params->on_duration,
- &gx_params->off_duration);
-
- freqs.new = new_khz;
-
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
- local_irq_save(flags);
-
-
-
- if (new_khz != stock_freq) {
- /* if new khz == 100% of CPU speed, it is special case */
- switch (gx_params->cs55x0->device) {
- case PCI_DEVICE_ID_CYRIX_5530_LEGACY:
- pmer1 = gx_params->pci_pmer1 | IRQ_SPDUP | VID_SPDUP;
- /* FIXME: need to test other values -- Zwane,Miura */
- /* typical 2 to 4ms */
- gx_write_byte(PCI_IRQTC, 4);
- /* typical 50 to 100ms */
- gx_write_byte(PCI_VIDTC, 100);
- gx_write_byte(PCI_PMER1, pmer1);
-
- if (gx_params->cs55x0->revision < 0x10) {
- /* CS5530(rev 1.2, 1.3) */
- suscfg = gx_params->pci_suscfg|SUSMOD;
- } else {
- /* CS5530A,B.. */
- suscfg = gx_params->pci_suscfg|SUSMOD|PWRSVE;
- }
- break;
- case PCI_DEVICE_ID_CYRIX_5520:
- case PCI_DEVICE_ID_CYRIX_5510:
- suscfg = gx_params->pci_suscfg | SUSMOD;
- break;
- default:
- local_irq_restore(flags);
- dprintk("fatal: tried to set an unknown chipset.\n");
- return;
- }
- } else {
- suscfg = gx_params->pci_suscfg & ~(SUSMOD);
- gx_params->off_duration = 0;
- gx_params->on_duration = 0;
- dprintk("suspend modulation disabled: cpu runs 100%% speed.\n");
- }
-
- gx_write_byte(PCI_MODOFF, gx_params->off_duration);
- gx_write_byte(PCI_MODON, gx_params->on_duration);
-
- gx_write_byte(PCI_SUSCFG, suscfg);
- pci_read_config_byte(gx_params->cs55x0, PCI_SUSCFG, &suscfg);
-
- local_irq_restore(flags);
-
- gx_params->pci_suscfg = suscfg;
-
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-
- dprintk("suspend modulation w/ duration of ON:%d us, OFF:%d us\n",
- gx_params->on_duration * 32, gx_params->off_duration * 32);
- dprintk("suspend modulation w/ clock speed: %d kHz.\n", freqs.new);
-}
-
-/****************************************************************
- * High level functions *
- ****************************************************************/
-
-/*
- * cpufreq_gx_verify: test if frequency range is valid
- *
- * This function checks if a given frequency range in kHz is valid
- * for the hardware supported by the driver.
- */
-
-static int cpufreq_gx_verify(struct cpufreq_policy *policy)
-{
- unsigned int tmp_freq = 0;
- u8 tmp1, tmp2;
-
- if (!stock_freq || !policy)
- return -EINVAL;
-
- policy->cpu = 0;
- cpufreq_verify_within_limits(policy, (stock_freq / max_duration),
- stock_freq);
-
- /* We have to make sure that at least one supported frequency is
- * within policy->min and policy->max. If there is none, policy->max
- * needs to be increased until one frequency is supported.
- * policy->min may not be decreased, though. This way we guarantee a
- * specific processing capacity.
- */
- tmp_freq = gx_validate_speed(policy->min, &tmp1, &tmp2);
- if (tmp_freq < policy->min)
- tmp_freq += stock_freq / max_duration;
- policy->min = tmp_freq;
- if (policy->min > policy->max)
- policy->max = tmp_freq;
- tmp_freq = gx_validate_speed(policy->max, &tmp1, &tmp2);
- if (tmp_freq > policy->max)
- tmp_freq -= stock_freq / max_duration;
- policy->max = tmp_freq;
- if (policy->max < policy->min)
- policy->max = policy->min;
- cpufreq_verify_within_limits(policy, (stock_freq / max_duration),
- stock_freq);
-
- return 0;
-}
-
-/*
- * cpufreq_gx_target:
- *
- */
-static int cpufreq_gx_target(struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
-{
- u8 tmp1, tmp2;
- unsigned int tmp_freq;
-
- if (!stock_freq || !policy)
- return -EINVAL;
-
- policy->cpu = 0;
-
- tmp_freq = gx_validate_speed(target_freq, &tmp1, &tmp2);
- while (tmp_freq < policy->min) {
- tmp_freq += stock_freq / max_duration;
- tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2);
- }
- while (tmp_freq > policy->max) {
- tmp_freq -= stock_freq / max_duration;
- tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2);
- }
-
- gx_set_cpuspeed(tmp_freq);
-
- return 0;
-}
-
-static int cpufreq_gx_cpu_init(struct cpufreq_policy *policy)
-{
- unsigned int maxfreq, curfreq;
-
- if (!policy || policy->cpu != 0)
- return -ENODEV;
-
- /* determine maximum frequency */
- if (pci_busclk)
- maxfreq = pci_busclk * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f];
- else if (cpu_khz)
- maxfreq = cpu_khz;
- else
- maxfreq = 30000 * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f];
-
- stock_freq = maxfreq;
- curfreq = gx_get_cpuspeed(0);
-
- dprintk("cpu max frequency is %d.\n", maxfreq);
- dprintk("cpu current frequency is %dkHz.\n", curfreq);
-
- /* setup basic struct for cpufreq API */
- policy->cpu = 0;
-
- if (max_duration < POLICY_MIN_DIV)
- policy->min = maxfreq / max_duration;
- else
- policy->min = maxfreq / POLICY_MIN_DIV;
- policy->max = maxfreq;
- policy->cur = curfreq;
- policy->cpuinfo.min_freq = maxfreq / max_duration;
- policy->cpuinfo.max_freq = maxfreq;
- policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
-
- return 0;
-}
-
-/*
- * cpufreq_gx_init:
- * MediaGX/Geode GX initialize cpufreq driver
- */
-static struct cpufreq_driver gx_suspmod_driver = {
- .get = gx_get_cpuspeed,
- .verify = cpufreq_gx_verify,
- .target = cpufreq_gx_target,
- .init = cpufreq_gx_cpu_init,
- .name = "gx-suspmod",
- .owner = THIS_MODULE,
-};
-
-static int __init cpufreq_gx_init(void)
-{
- int ret;
- struct gxfreq_params *params;
- struct pci_dev *gx_pci;
-
- /* Test if we have the right hardware */
- gx_pci = gx_detect_chipset();
- if (gx_pci == NULL)
- return -ENODEV;
-
- /* check whether module parameters are sane */
- if (max_duration > 0xff)
- max_duration = 0xff;
-
- dprintk("geode suspend modulation available.\n");
-
- params = kzalloc(sizeof(struct gxfreq_params), GFP_KERNEL);
- if (params == NULL)
- return -ENOMEM;
-
- params->cs55x0 = gx_pci;
- gx_params = params;
-
- /* keep cs55x0 configurations */
- pci_read_config_byte(params->cs55x0, PCI_SUSCFG, &(params->pci_suscfg));
- pci_read_config_byte(params->cs55x0, PCI_PMER1, &(params->pci_pmer1));
- pci_read_config_byte(params->cs55x0, PCI_PMER2, &(params->pci_pmer2));
- pci_read_config_byte(params->cs55x0, PCI_MODON, &(params->on_duration));
- pci_read_config_byte(params->cs55x0, PCI_MODOFF,
- &(params->off_duration));
-
- ret = cpufreq_register_driver(&gx_suspmod_driver);
- if (ret) {
- kfree(params);
- return ret; /* register error! */
- }
-
- return 0;
-}
-
-static void __exit cpufreq_gx_exit(void)
-{
- cpufreq_unregister_driver(&gx_suspmod_driver);
- pci_dev_put(gx_params->cs55x0);
- kfree(gx_params);
-}
-
-MODULE_AUTHOR("Hiroshi Miura <miura@da-cha.org>");
-MODULE_DESCRIPTION("Cpufreq driver for Cyrix MediaGX and NatSemi Geode");
-MODULE_LICENSE("GPL");
-
-module_init(cpufreq_gx_init);
-module_exit(cpufreq_gx_exit);
-
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
deleted file mode 100644
index ce2ed3e4aad9..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ /dev/null
@@ -1,1029 +0,0 @@
-/*
- * (C) 2001-2004 Dave Jones. <davej@redhat.com>
- * (C) 2002 Padraig Brady. <padraig@antefacto.com>
- *
- * Licensed under the terms of the GNU GPL License version 2.
- * Based upon datasheets & sample CPUs kindly provided by VIA.
- *
- * VIA currently has 3 different versions of Longhaul.
- * Version 1 (Longhaul) uses the BCR2 MSR at 0x1147.
- * It is present only in Samuel 1 (C5A) and Samuel 2 (C5B) stepping 0.
- * Version 2 of Longhaul is backward compatible with v1, but adds the
- * LONGHAUL MSR for both frequency and voltage scaling.
- * It is present in Samuel 2 (steppings 1-7 only) (C5B) and Ezra (C5C).
- * Version 3 of Longhaul was renamed to Powersaver and redesigned
- * to use only the POWERSAVER MSR at 0x110a.
- * It is present in Ezra-T (C5M), Nehemiah (C5X) and above.
- * Feature-wise it is much the same as Longhaul v2, though there is
- * provision for scaling the FSB too; that does not work well in
- * practice, so we don't even try to use it.
- *
- * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/pci.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/delay.h>
-#include <linux/timex.h>
-#include <linux/io.h>
-#include <linux/acpi.h>
-
-#include <asm/msr.h>
-#include <acpi/processor.h>
-
-#include "longhaul.h"
-
-#define PFX "longhaul: "
-
-#define TYPE_LONGHAUL_V1 1
-#define TYPE_LONGHAUL_V2 2
-#define TYPE_POWERSAVER 3
-
-#define CPU_SAMUEL 1
-#define CPU_SAMUEL2 2
-#define CPU_EZRA 3
-#define CPU_EZRA_T 4
-#define CPU_NEHEMIAH 5
-#define CPU_NEHEMIAH_C 6
-
-/* Flags */
-#define USE_ACPI_C3 (1 << 1)
-#define USE_NORTHBRIDGE (1 << 2)
-
-static int cpu_model;
-static unsigned int numscales = 16;
-static unsigned int fsb;
-
-static const struct mV_pos *vrm_mV_table;
-static const unsigned char *mV_vrm_table;
-
-static unsigned int highest_speed, lowest_speed; /* kHz */
-static unsigned int minmult, maxmult;
-static int can_scale_voltage;
-static struct acpi_processor *pr;
-static struct acpi_processor_cx *cx;
-static u32 acpi_regs_addr;
-static u8 longhaul_flags;
-static unsigned int longhaul_index;
-
-/* Module parameters */
-static int scale_voltage;
-static int disable_acpi_c3;
-static int revid_errata;
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "longhaul", msg)
-
-
-/* Clock ratios multiplied by 10 */
-static int mults[32];
-static int eblcr[32];
-static int longhaul_version;
-static struct cpufreq_frequency_table *longhaul_table;
-
-#ifdef CONFIG_CPU_FREQ_DEBUG
-static char speedbuffer[8];
-
-static char *print_speed(int speed)
-{
- if (speed < 1000) {
- snprintf(speedbuffer, sizeof(speedbuffer), "%dMHz", speed);
- return speedbuffer;
- }
-
- if (speed%1000 == 0)
- snprintf(speedbuffer, sizeof(speedbuffer),
- "%dGHz", speed/1000);
- else
- snprintf(speedbuffer, sizeof(speedbuffer),
- "%d.%dGHz", speed/1000, (speed%1000)/100);
-
- return speedbuffer;
-}
-#endif
-
-
-static unsigned int calc_speed(int mult)
-{
- int khz;
- khz = (mult/10)*fsb;
- if (mult%10)
- khz += fsb/2;
- khz *= 1000;
- return khz;
-}
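
Multipliers in this driver are stored scaled by ten, so calc_speed() rebuilds kHz as (mult/10) * fsb, plus half the FSB when the ratio ends in .5. For an assumed 133 MHz FSB and a stored value of 45 (a 4.5x ratio) that gives 4*133 + 66 = 598 MHz, i.e. 598000 kHz:

    #include <stdio.h>

    static unsigned int calc_speed(int mult, int fsb)   /* returns kHz */
    {
        int khz = (mult / 10) * fsb;

        if (mult % 10)
            khz += fsb / 2;                 /* account for a trailing .5 */
        return khz * 1000;
    }

    int main(void)
    {
        /* assumed: 133 MHz FSB, ratio 4.5x stored as 45 */
        printf("4.5 x 133 MHz -> %u kHz\n", calc_speed(45, 133)); /* 598000 */
        return 0;
    }
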
-
-
-static int longhaul_get_cpu_mult(void)
-{
- unsigned long invalue = 0, lo, hi;
-
- rdmsr(MSR_IA32_EBL_CR_POWERON, lo, hi);
- invalue = (lo & (1<<22|1<<23|1<<24|1<<25))>>22;
- if (longhaul_version == TYPE_LONGHAUL_V2 ||
- longhaul_version == TYPE_POWERSAVER) {
- if (lo & (1<<27))
- invalue += 16;
- }
- return eblcr[invalue];
-}
-
-/* For processor with BCR2 MSR */
-
-static void do_longhaul1(unsigned int mults_index)
-{
- union msr_bcr2 bcr2;
-
- rdmsrl(MSR_VIA_BCR2, bcr2.val);
- /* Enable software clock multiplier */
- bcr2.bits.ESOFTBF = 1;
- bcr2.bits.CLOCKMUL = mults_index & 0xff;
-
- /* Sync to timer tick */
- safe_halt();
- /* Change frequency on next halt or sleep */
- wrmsrl(MSR_VIA_BCR2, bcr2.val);
- /* Invoke transition */
- ACPI_FLUSH_CPU_CACHE();
- halt();
-
- /* Disable software clock multiplier */
- local_irq_disable();
- rdmsrl(MSR_VIA_BCR2, bcr2.val);
- bcr2.bits.ESOFTBF = 0;
- wrmsrl(MSR_VIA_BCR2, bcr2.val);
-}
-
-/* For processor with Longhaul MSR */
-
-static void do_powersaver(int cx_address, unsigned int mults_index,
- unsigned int dir)
-{
- union msr_longhaul longhaul;
- u32 t;
-
- rdmsrl(MSR_VIA_LONGHAUL, longhaul.val);
- /* Setup new frequency */
- if (!revid_errata)
- longhaul.bits.RevisionKey = longhaul.bits.RevisionID;
- else
- longhaul.bits.RevisionKey = 0;
- longhaul.bits.SoftBusRatio = mults_index & 0xf;
- longhaul.bits.SoftBusRatio4 = (mults_index & 0x10) >> 4;
- /* Setup new voltage */
- if (can_scale_voltage)
- longhaul.bits.SoftVID = (mults_index >> 8) & 0x1f;
- /* Sync to timer tick */
- safe_halt();
- /* Raise voltage if necessary */
- if (can_scale_voltage && dir) {
- longhaul.bits.EnableSoftVID = 1;
- wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
- /* Change voltage */
- if (!cx_address) {
- ACPI_FLUSH_CPU_CACHE();
- halt();
- } else {
- ACPI_FLUSH_CPU_CACHE();
- /* Invoke C3 */
- inb(cx_address);
- /* Dummy op - must do something useless after P_LVL3
- * read */
- t = inl(acpi_gbl_FADT.xpm_timer_block.address);
- }
- longhaul.bits.EnableSoftVID = 0;
- wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
- }
-
- /* Change frequency on next halt or sleep */
- longhaul.bits.EnableSoftBusRatio = 1;
- wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
- if (!cx_address) {
- ACPI_FLUSH_CPU_CACHE();
- halt();
- } else {
- ACPI_FLUSH_CPU_CACHE();
- /* Invoke C3 */
- inb(cx_address);
- /* Dummy op - must do something useless after P_LVL3 read */
- t = inl(acpi_gbl_FADT.xpm_timer_block.address);
- }
- /* Disable bus ratio bit */
- longhaul.bits.EnableSoftBusRatio = 0;
- wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
-
- /* Reduce voltage if necessary */
- if (can_scale_voltage && !dir) {
- longhaul.bits.EnableSoftVID = 1;
- wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
- /* Change voltage */
- if (!cx_address) {
- ACPI_FLUSH_CPU_CACHE();
- halt();
- } else {
- ACPI_FLUSH_CPU_CACHE();
- /* Invoke C3 */
- inb(cx_address);
- /* Dummy op - must do something useless after P_LVL3
- * read */
- t = inl(acpi_gbl_FADT.xpm_timer_block.address);
- }
- longhaul.bits.EnableSoftVID = 0;
- wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
- }
-}
-
-/**
- * longhaul_setstate()
- * @table_index : index into longhaul_table for the new clock ratio.
- *
- * Sets a new clock ratio.
- */
-
-static void longhaul_setstate(unsigned int table_index)
-{
- unsigned int mults_index;
- int speed, mult;
- struct cpufreq_freqs freqs;
- unsigned long flags;
- unsigned int pic1_mask, pic2_mask;
- u16 bm_status = 0;
- u32 bm_timeout = 1000;
- unsigned int dir = 0;
-
- mults_index = longhaul_table[table_index].index;
- /* Safety precautions */
- mult = mults[mults_index & 0x1f];
- if (mult == -1)
- return;
- speed = calc_speed(mult);
- if ((speed > highest_speed) || (speed < lowest_speed))
- return;
- /* Voltage transition before frequency transition? */
- if (can_scale_voltage && longhaul_index < table_index)
- dir = 1;
-
- freqs.old = calc_speed(longhaul_get_cpu_mult());
- freqs.new = speed;
- freqs.cpu = 0; /* longhaul.c is UP only driver */
-
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-
- dprintk("Setting to FSB:%dMHz Mult:%d.%dx (%s)\n",
- fsb, mult/10, mult%10, print_speed(speed/1000));
-retry_loop:
- preempt_disable();
- local_irq_save(flags);
-
- pic2_mask = inb(0xA1);
- pic1_mask = inb(0x21); /* works on C3. save mask. */
- outb(0xFF, 0xA1); /* Overkill */
- outb(0xFE, 0x21); /* TMR0 only */
-
- /* Wait while PCI bus is busy. */
- if (acpi_regs_addr && (longhaul_flags & USE_NORTHBRIDGE
- || ((pr != NULL) && pr->flags.bm_control))) {
- bm_status = inw(acpi_regs_addr);
- bm_status &= 1 << 4;
- while (bm_status && bm_timeout) {
- outw(1 << 4, acpi_regs_addr);
- bm_timeout--;
- bm_status = inw(acpi_regs_addr);
- bm_status &= 1 << 4;
- }
- }
-
- if (longhaul_flags & USE_NORTHBRIDGE) {
- /* Disable AGP and PCI arbiters */
- outb(3, 0x22);
- } else if ((pr != NULL) && pr->flags.bm_control) {
- /* Disable bus master arbitration */
- acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 1);
- }
- switch (longhaul_version) {
-
- /*
- * Longhaul v1. (Samuel[C5A] and Samuel2 stepping 0[C5B])
- * Software controlled multipliers only.
- */
- case TYPE_LONGHAUL_V1:
- do_longhaul1(mults_index);
- break;
-
- /*
- * Longhaul v2 appears in Samuel2 Steppings 1->7 [C5B] and Ezra [C5C]
- *
- * Longhaul v3 (aka Powersaver). (Ezra-T [C5M] & Nehemiah [C5N])
- * Nehemiah can do FSB scaling too, but this has never been proven
- * to work in practice.
- */
- case TYPE_LONGHAUL_V2:
- case TYPE_POWERSAVER:
- if (longhaul_flags & USE_ACPI_C3) {
- /* Don't allow wakeup */
- acpi_write_bit_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
- do_powersaver(cx->address, mults_index, dir);
- } else {
- do_powersaver(0, mults_index, dir);
- }
- break;
- }
-
- if (longhaul_flags & USE_NORTHBRIDGE) {
- /* Enable arbiters */
- outb(0, 0x22);
- } else if ((pr != NULL) && pr->flags.bm_control) {
- /* Enable bus master arbitration */
- acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 0);
- }
- outb(pic2_mask, 0xA1); /* restore mask */
- outb(pic1_mask, 0x21);
-
- local_irq_restore(flags);
- preempt_enable();
-
- freqs.new = calc_speed(longhaul_get_cpu_mult());
- /* Check if requested frequency is set. */
- if (unlikely(freqs.new != speed)) {
- printk(KERN_INFO PFX "Failed to set requested frequency!\n");
- /* Revision ID = 1 but processor is expecting revision key
- * equal to 0. Jumpers at the bottom of processor will change
- * multiplier and FSB, but will not change bits in Longhaul
- * MSR nor enable voltage scaling. */
- if (!revid_errata) {
- printk(KERN_INFO PFX "Enabling \"Ignore Revision ID\" "
- "option.\n");
- revid_errata = 1;
- msleep(200);
- goto retry_loop;
- }
-		/* Why ACPI C3 sometimes doesn't work is a mystery to me.
-		 * But it does happen. The processor enters the ACPI C3 state,
-		 * but doesn't change frequency. I tried poking various
-		 * bits in northbridge registers, but without success. */
- if (longhaul_flags & USE_ACPI_C3) {
- printk(KERN_INFO PFX "Disabling ACPI C3 support.\n");
- longhaul_flags &= ~USE_ACPI_C3;
- if (revid_errata) {
- printk(KERN_INFO PFX "Disabling \"Ignore "
- "Revision ID\" option.\n");
- revid_errata = 0;
- }
- msleep(200);
- goto retry_loop;
- }
- /* This shouldn't happen. Longhaul ver. 2 was reported not
- * working on processors without voltage scaling, but with
- * RevID = 1. RevID errata will make things right. Just
- * to be 100% sure. */
- if (longhaul_version == TYPE_LONGHAUL_V2) {
- printk(KERN_INFO PFX "Switching to Longhaul ver. 1\n");
- longhaul_version = TYPE_LONGHAUL_V1;
- msleep(200);
- goto retry_loop;
- }
- }
- /* Report true CPU frequency */
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-
- if (!bm_timeout)
- printk(KERN_INFO PFX "Warning: Timeout while waiting for "
- "idle PCI bus.\n");
-}
-
-/*
- * Centaur decided to make life a little more tricky.
- * Only longhaul v1 is allowed to read EBLCR BSEL[0:1].
- * Samuel2 and above have to try and guess what the FSB is.
- * We do this by assuming we booted at maximum multiplier, and interpolate
- * between that value multiplied by possible FSBs and cpu_mhz which
- * was calculated at boot time. Really ugly, but no other way to do this.
- */
-
-#define ROUNDING 0xf
-
-static int guess_fsb(int mult)
-{
- int speed = cpu_khz / 1000;
- int i;
- int speeds[] = { 666, 1000, 1333, 2000 };
- int f_max, f_min;
-
- for (i = 0; i < 4; i++) {
- f_max = ((speeds[i] * mult) + 50) / 100;
- f_max += (ROUNDING / 2);
- f_min = f_max - ROUNDING;
- if ((speed <= f_max) && (speed >= f_min))
- return speeds[i] / 10;
- }
- return 0;
-}
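
As a quick illustration of the interpolation above, the following stand-alone sketch runs the same calculation with assumed numbers (a 6.0x multiplier on a part that measured roughly 800 MHz at boot); the speeds[] table and ROUNDING constant mirror the driver, everything else is made up for the example.

#include <stdio.h>

#define ROUNDING 0xf

/* Same guess as above, fed with a plain MHz reading instead of cpu_khz. */
static int guess_fsb_demo(int cpu_mhz, int mult)
{
	/* candidate FSBs, in 100 kHz units: 66.6, 100, 133.3, 200 MHz */
	int speeds[] = { 666, 1000, 1333, 2000 };
	int i, f_max, f_min;

	for (i = 0; i < 4; i++) {
		f_max = ((speeds[i] * mult) + 50) / 100;
		f_max += (ROUNDING / 2);
		f_min = f_max - ROUNDING;
		if (cpu_mhz <= f_max && cpu_mhz >= f_min)
			return speeds[i] / 10;	/* FSB in MHz */
	}
	return 0;
}

int main(void)
{
	/* 6.0x on a part measured at ~800 MHz should resolve to a 133 MHz FSB */
	printf("guessed FSB: %d MHz\n", guess_fsb_demo(800, 60));
	return 0;
}
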
-
-
-static int __init longhaul_get_ranges(void)
-{
- unsigned int i, j, k = 0;
- unsigned int ratio;
- int mult;
-
- /* Get current frequency */
- mult = longhaul_get_cpu_mult();
- if (mult == -1) {
- printk(KERN_INFO PFX "Invalid (reserved) multiplier!\n");
- return -EINVAL;
- }
- fsb = guess_fsb(mult);
- if (fsb == 0) {
- printk(KERN_INFO PFX "Invalid (reserved) FSB!\n");
- return -EINVAL;
- }
-	/* Get max multiplier - as we always did.
-	 * The Longhaul MSR is useful only when voltage scaling is enabled;
-	 * the C3 boots at max anyway. */
- maxmult = mult;
- /* Get min multiplier */
- switch (cpu_model) {
- case CPU_NEHEMIAH:
- minmult = 50;
- break;
- case CPU_NEHEMIAH_C:
- minmult = 40;
- break;
- default:
- minmult = 30;
- break;
- }
-
- dprintk("MinMult:%d.%dx MaxMult:%d.%dx\n",
- minmult/10, minmult%10, maxmult/10, maxmult%10);
-
- highest_speed = calc_speed(maxmult);
- lowest_speed = calc_speed(minmult);
- dprintk("FSB:%dMHz Lowest speed: %s Highest speed:%s\n", fsb,
- print_speed(lowest_speed/1000),
- print_speed(highest_speed/1000));
-
- if (lowest_speed == highest_speed) {
- printk(KERN_INFO PFX "highestspeed == lowest, aborting.\n");
- return -EINVAL;
- }
- if (lowest_speed > highest_speed) {
- printk(KERN_INFO PFX "nonsense! lowest (%d > %d) !\n",
- lowest_speed, highest_speed);
- return -EINVAL;
- }
-
- longhaul_table = kmalloc((numscales + 1) * sizeof(*longhaul_table),
- GFP_KERNEL);
- if (!longhaul_table)
- return -ENOMEM;
-
- for (j = 0; j < numscales; j++) {
- ratio = mults[j];
- if (ratio == -1)
- continue;
- if (ratio > maxmult || ratio < minmult)
- continue;
- longhaul_table[k].frequency = calc_speed(ratio);
- longhaul_table[k].index = j;
- k++;
- }
- if (k <= 1) {
- kfree(longhaul_table);
- return -ENODEV;
- }
- /* Sort */
- for (j = 0; j < k - 1; j++) {
- unsigned int min_f, min_i;
- min_f = longhaul_table[j].frequency;
- min_i = j;
- for (i = j + 1; i < k; i++) {
- if (longhaul_table[i].frequency < min_f) {
- min_f = longhaul_table[i].frequency;
- min_i = i;
- }
- }
- if (min_i != j) {
- swap(longhaul_table[j].frequency,
- longhaul_table[min_i].frequency);
- swap(longhaul_table[j].index,
- longhaul_table[min_i].index);
- }
- }
-
- longhaul_table[k].frequency = CPUFREQ_TABLE_END;
-
- /* Find index we are running on */
- for (j = 0; j < k; j++) {
- if (mults[longhaul_table[j].index & 0x1f] == mult) {
- longhaul_index = j;
- break;
- }
- }
- return 0;
-}
-
-
-static void __init longhaul_setup_voltagescaling(void)
-{
- union msr_longhaul longhaul;
- struct mV_pos minvid, maxvid, vid;
- unsigned int j, speed, pos, kHz_step, numvscales;
- int min_vid_speed;
-
- rdmsrl(MSR_VIA_LONGHAUL, longhaul.val);
- if (!(longhaul.bits.RevisionID & 1)) {
- printk(KERN_INFO PFX "Voltage scaling not supported by CPU.\n");
- return;
- }
-
- if (!longhaul.bits.VRMRev) {
- printk(KERN_INFO PFX "VRM 8.5\n");
- vrm_mV_table = &vrm85_mV[0];
- mV_vrm_table = &mV_vrm85[0];
- } else {
- printk(KERN_INFO PFX "Mobile VRM\n");
- if (cpu_model < CPU_NEHEMIAH)
- return;
- vrm_mV_table = &mobilevrm_mV[0];
- mV_vrm_table = &mV_mobilevrm[0];
- }
-
- minvid = vrm_mV_table[longhaul.bits.MinimumVID];
- maxvid = vrm_mV_table[longhaul.bits.MaximumVID];
-
- if (minvid.mV == 0 || maxvid.mV == 0 || minvid.mV > maxvid.mV) {
- printk(KERN_INFO PFX "Bogus values Min:%d.%03d Max:%d.%03d. "
- "Voltage scaling disabled.\n",
- minvid.mV/1000, minvid.mV%1000,
- maxvid.mV/1000, maxvid.mV%1000);
- return;
- }
-
- if (minvid.mV == maxvid.mV) {
- printk(KERN_INFO PFX "Claims to support voltage scaling but "
- "min & max are both %d.%03d. "
- "Voltage scaling disabled\n",
- maxvid.mV/1000, maxvid.mV%1000);
- return;
- }
-
-	/* How many voltage steps */
- numvscales = maxvid.pos - minvid.pos + 1;
- printk(KERN_INFO PFX
- "Max VID=%d.%03d "
- "Min VID=%d.%03d, "
- "%d possible voltage scales\n",
- maxvid.mV/1000, maxvid.mV%1000,
- minvid.mV/1000, minvid.mV%1000,
- numvscales);
-
- /* Calculate max frequency at min voltage */
- j = longhaul.bits.MinMHzBR;
- if (longhaul.bits.MinMHzBR4)
- j += 16;
- min_vid_speed = eblcr[j];
- if (min_vid_speed == -1)
- return;
- switch (longhaul.bits.MinMHzFSB) {
- case 0:
- min_vid_speed *= 13333;
- break;
- case 1:
- min_vid_speed *= 10000;
- break;
- case 3:
- min_vid_speed *= 6666;
- break;
- default:
- return;
- break;
- }
- if (min_vid_speed >= highest_speed)
- return;
- /* Calculate kHz for one voltage step */
- kHz_step = (highest_speed - min_vid_speed) / numvscales;
-
- j = 0;
- while (longhaul_table[j].frequency != CPUFREQ_TABLE_END) {
- speed = longhaul_table[j].frequency;
- if (speed > min_vid_speed)
- pos = (speed - min_vid_speed) / kHz_step + minvid.pos;
- else
- pos = minvid.pos;
- longhaul_table[j].index |= mV_vrm_table[pos] << 8;
- vid = vrm_mV_table[mV_vrm_table[pos]];
- printk(KERN_INFO PFX "f: %d kHz, index: %d, vid: %d mV\n",
- speed, j, vid.mV);
- j++;
- }
-
- can_scale_voltage = 1;
- printk(KERN_INFO PFX "Voltage scaling enabled.\n");
-}
-
-
-static int longhaul_verify(struct cpufreq_policy *policy)
-{
- return cpufreq_frequency_table_verify(policy, longhaul_table);
-}
-
-
-static int longhaul_target(struct cpufreq_policy *policy,
- unsigned int target_freq, unsigned int relation)
-{
- unsigned int table_index = 0;
- unsigned int i;
- unsigned int dir = 0;
- u8 vid, current_vid;
-
- if (cpufreq_frequency_table_target(policy, longhaul_table, target_freq,
- relation, &table_index))
- return -EINVAL;
-
- /* Don't set same frequency again */
- if (longhaul_index == table_index)
- return 0;
-
- if (!can_scale_voltage)
- longhaul_setstate(table_index);
- else {
-		/* On the test system, voltage transitions exceeding a single
-		 * step up or down were turning the motherboard off. Both
-		 * "ondemand" and "userspace" are unsafe. The C7 does this
-		 * in hardware; the C3 is old and we need to do it
-		 * in software. */
- i = longhaul_index;
- current_vid = (longhaul_table[longhaul_index].index >> 8);
- current_vid &= 0x1f;
- if (table_index > longhaul_index)
- dir = 1;
- while (i != table_index) {
- vid = (longhaul_table[i].index >> 8) & 0x1f;
- if (vid != current_vid) {
- longhaul_setstate(i);
- current_vid = vid;
- msleep(200);
- }
- if (dir)
- i++;
- else
- i--;
- }
- longhaul_setstate(table_index);
- }
- longhaul_index = table_index;
- return 0;
-}
-
-
-static unsigned int longhaul_get(unsigned int cpu)
-{
- if (cpu)
- return 0;
- return calc_speed(longhaul_get_cpu_mult());
-}
-
-static acpi_status longhaul_walk_callback(acpi_handle obj_handle,
- u32 nesting_level,
- void *context, void **return_value)
-{
- struct acpi_device *d;
-
- if (acpi_bus_get_device(obj_handle, &d))
- return 0;
-
- *return_value = acpi_driver_data(d);
- return 1;
-}
-
-/* VIA doesn't support the PM2 reg, but has something similar */
-static int enable_arbiter_disable(void)
-{
- struct pci_dev *dev;
- int status = 1;
- int reg;
- u8 pci_cmd;
-
- /* Find PLE133 host bridge */
- reg = 0x78;
- dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8601_0,
- NULL);
- /* Find PM133/VT8605 host bridge */
- if (dev == NULL)
- dev = pci_get_device(PCI_VENDOR_ID_VIA,
- PCI_DEVICE_ID_VIA_8605_0, NULL);
- /* Find CLE266 host bridge */
- if (dev == NULL) {
- reg = 0x76;
- dev = pci_get_device(PCI_VENDOR_ID_VIA,
- PCI_DEVICE_ID_VIA_862X_0, NULL);
- /* Find CN400 V-Link host bridge */
- if (dev == NULL)
- dev = pci_get_device(PCI_VENDOR_ID_VIA, 0x7259, NULL);
- }
- if (dev != NULL) {
- /* Enable access to port 0x22 */
- pci_read_config_byte(dev, reg, &pci_cmd);
- if (!(pci_cmd & 1<<7)) {
- pci_cmd |= 1<<7;
- pci_write_config_byte(dev, reg, pci_cmd);
- pci_read_config_byte(dev, reg, &pci_cmd);
- if (!(pci_cmd & 1<<7)) {
- printk(KERN_ERR PFX
- "Can't enable access to port 0x22.\n");
- status = 0;
- }
- }
- pci_dev_put(dev);
- return status;
- }
- return 0;
-}
-
-static int longhaul_setup_southbridge(void)
-{
- struct pci_dev *dev;
- u8 pci_cmd;
-
- /* Find VT8235 southbridge */
- dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, NULL);
- if (dev == NULL)
- /* Find VT8237 southbridge */
- dev = pci_get_device(PCI_VENDOR_ID_VIA,
- PCI_DEVICE_ID_VIA_8237, NULL);
- if (dev != NULL) {
- /* Set transition time to max */
- pci_read_config_byte(dev, 0xec, &pci_cmd);
- pci_cmd &= ~(1 << 2);
- pci_write_config_byte(dev, 0xec, pci_cmd);
- pci_read_config_byte(dev, 0xe4, &pci_cmd);
- pci_cmd &= ~(1 << 7);
- pci_write_config_byte(dev, 0xe4, pci_cmd);
- pci_read_config_byte(dev, 0xe5, &pci_cmd);
- pci_cmd |= 1 << 7;
- pci_write_config_byte(dev, 0xe5, pci_cmd);
- /* Get address of ACPI registers block*/
- pci_read_config_byte(dev, 0x81, &pci_cmd);
- if (pci_cmd & 1 << 7) {
- pci_read_config_dword(dev, 0x88, &acpi_regs_addr);
- acpi_regs_addr &= 0xff00;
- printk(KERN_INFO PFX "ACPI I/O at 0x%x\n",
- acpi_regs_addr);
- }
-
- pci_dev_put(dev);
- return 1;
- }
- return 0;
-}
-
-static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
- char *cpuname = NULL;
- int ret;
- u32 lo, hi;
-
- /* Check what we have on this motherboard */
- switch (c->x86_model) {
- case 6:
- cpu_model = CPU_SAMUEL;
- cpuname = "C3 'Samuel' [C5A]";
- longhaul_version = TYPE_LONGHAUL_V1;
- memcpy(mults, samuel1_mults, sizeof(samuel1_mults));
- memcpy(eblcr, samuel1_eblcr, sizeof(samuel1_eblcr));
- break;
-
- case 7:
- switch (c->x86_mask) {
- case 0:
- longhaul_version = TYPE_LONGHAUL_V1;
- cpu_model = CPU_SAMUEL2;
- cpuname = "C3 'Samuel 2' [C5B]";
- /* Note, this is not a typo, early Samuel2's had
- * Samuel1 ratios. */
- memcpy(mults, samuel1_mults, sizeof(samuel1_mults));
- memcpy(eblcr, samuel2_eblcr, sizeof(samuel2_eblcr));
- break;
- case 1 ... 15:
- longhaul_version = TYPE_LONGHAUL_V1;
- if (c->x86_mask < 8) {
- cpu_model = CPU_SAMUEL2;
- cpuname = "C3 'Samuel 2' [C5B]";
- } else {
- cpu_model = CPU_EZRA;
- cpuname = "C3 'Ezra' [C5C]";
- }
- memcpy(mults, ezra_mults, sizeof(ezra_mults));
- memcpy(eblcr, ezra_eblcr, sizeof(ezra_eblcr));
- break;
- }
- break;
-
- case 8:
- cpu_model = CPU_EZRA_T;
- cpuname = "C3 'Ezra-T' [C5M]";
- longhaul_version = TYPE_POWERSAVER;
- numscales = 32;
- memcpy(mults, ezrat_mults, sizeof(ezrat_mults));
- memcpy(eblcr, ezrat_eblcr, sizeof(ezrat_eblcr));
- break;
-
- case 9:
- longhaul_version = TYPE_POWERSAVER;
- numscales = 32;
- memcpy(mults, nehemiah_mults, sizeof(nehemiah_mults));
- memcpy(eblcr, nehemiah_eblcr, sizeof(nehemiah_eblcr));
- switch (c->x86_mask) {
- case 0 ... 1:
- cpu_model = CPU_NEHEMIAH;
- cpuname = "C3 'Nehemiah A' [C5XLOE]";
- break;
- case 2 ... 4:
- cpu_model = CPU_NEHEMIAH;
- cpuname = "C3 'Nehemiah B' [C5XLOH]";
- break;
- case 5 ... 15:
- cpu_model = CPU_NEHEMIAH_C;
- cpuname = "C3 'Nehemiah C' [C5P]";
- break;
- }
- break;
-
- default:
- cpuname = "Unknown";
- break;
- }
- /* Check Longhaul ver. 2 */
- if (longhaul_version == TYPE_LONGHAUL_V2) {
- rdmsr(MSR_VIA_LONGHAUL, lo, hi);
- if (lo == 0 && hi == 0)
- /* Looks like MSR isn't present */
- longhaul_version = TYPE_LONGHAUL_V1;
- }
-
- printk(KERN_INFO PFX "VIA %s CPU detected. ", cpuname);
- switch (longhaul_version) {
- case TYPE_LONGHAUL_V1:
- case TYPE_LONGHAUL_V2:
- printk(KERN_CONT "Longhaul v%d supported.\n", longhaul_version);
- break;
- case TYPE_POWERSAVER:
- printk(KERN_CONT "Powersaver supported.\n");
- break;
- };
-
- /* Doesn't hurt */
- longhaul_setup_southbridge();
-
- /* Find ACPI data for processor */
- acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
- ACPI_UINT32_MAX, &longhaul_walk_callback,
- NULL, (void *)&pr);
-
- /* Check ACPI support for C3 state */
- if (pr != NULL && longhaul_version == TYPE_POWERSAVER) {
- cx = &pr->power.states[ACPI_STATE_C3];
- if (cx->address > 0 && cx->latency <= 1000)
- longhaul_flags |= USE_ACPI_C3;
- }
- /* Disable if it isn't working */
- if (disable_acpi_c3)
- longhaul_flags &= ~USE_ACPI_C3;
- /* Check if northbridge is friendly */
- if (enable_arbiter_disable())
- longhaul_flags |= USE_NORTHBRIDGE;
-
- /* Check ACPI support for bus master arbiter disable */
- if (!(longhaul_flags & USE_ACPI_C3
- || longhaul_flags & USE_NORTHBRIDGE)
- && ((pr == NULL) || !(pr->flags.bm_control))) {
- printk(KERN_ERR PFX
- "No ACPI support. Unsupported northbridge.\n");
- return -ENODEV;
- }
-
- if (longhaul_flags & USE_NORTHBRIDGE)
- printk(KERN_INFO PFX "Using northbridge support.\n");
- if (longhaul_flags & USE_ACPI_C3)
- printk(KERN_INFO PFX "Using ACPI support.\n");
-
- ret = longhaul_get_ranges();
- if (ret != 0)
- return ret;
-
- if ((longhaul_version != TYPE_LONGHAUL_V1) && (scale_voltage != 0))
- longhaul_setup_voltagescaling();
-
- policy->cpuinfo.transition_latency = 200000; /* nsec */
- policy->cur = calc_speed(longhaul_get_cpu_mult());
-
- ret = cpufreq_frequency_table_cpuinfo(policy, longhaul_table);
- if (ret)
- return ret;
-
- cpufreq_frequency_table_get_attr(longhaul_table, policy->cpu);
-
- return 0;
-}
-
-static int __devexit longhaul_cpu_exit(struct cpufreq_policy *policy)
-{
- cpufreq_frequency_table_put_attr(policy->cpu);
- return 0;
-}
-
-static struct freq_attr *longhaul_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-static struct cpufreq_driver longhaul_driver = {
- .verify = longhaul_verify,
- .target = longhaul_target,
- .get = longhaul_get,
- .init = longhaul_cpu_init,
- .exit = __devexit_p(longhaul_cpu_exit),
- .name = "longhaul",
- .owner = THIS_MODULE,
- .attr = longhaul_attr,
-};
-
-
-static int __init longhaul_init(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
-
- if (c->x86_vendor != X86_VENDOR_CENTAUR || c->x86 != 6)
- return -ENODEV;
-
-#ifdef CONFIG_SMP
- if (num_online_cpus() > 1) {
- printk(KERN_ERR PFX "More than 1 CPU detected, "
- "longhaul disabled.\n");
- return -ENODEV;
- }
-#endif
-#ifdef CONFIG_X86_IO_APIC
- if (cpu_has_apic) {
- printk(KERN_ERR PFX "APIC detected. Longhaul is currently "
- "broken in this configuration.\n");
- return -ENODEV;
- }
-#endif
- switch (c->x86_model) {
- case 6 ... 9:
- return cpufreq_register_driver(&longhaul_driver);
- case 10:
- printk(KERN_ERR PFX "Use acpi-cpufreq driver for VIA C7\n");
- default:
- ;
- }
-
- return -ENODEV;
-}
-
-
-static void __exit longhaul_exit(void)
-{
- int i;
-
- for (i = 0; i < numscales; i++) {
- if (mults[i] == maxmult) {
- longhaul_setstate(i);
- break;
- }
- }
-
- cpufreq_unregister_driver(&longhaul_driver);
- kfree(longhaul_table);
-}
-
-/* Even if the BIOS exports an ACPI C3 state, and it is used
- * successfully when the CPU is idle, this state doesn't
- * trigger a frequency transition in some cases. */
-module_param(disable_acpi_c3, int, 0644);
-MODULE_PARM_DESC(disable_acpi_c3, "Don't use ACPI C3 support");
-/* Change CPU voltage along with frequency. Very useful for saving
- * power, but most VIA C3 processors don't support it. */
-module_param(scale_voltage, int, 0644);
-MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor");
-/* Force the revision key to 0 for processors which don't
- * support voltage scaling but present themselves as if
- * they did. */
-module_param(revid_errata, int, 0644);
-MODULE_PARM_DESC(revid_errata, "Ignore CPU Revision ID");
-
-MODULE_AUTHOR("Dave Jones <davej@redhat.com>");
-MODULE_DESCRIPTION("Longhaul driver for VIA Cyrix processors.");
-MODULE_LICENSE("GPL");
-
-late_initcall(longhaul_init);
-module_exit(longhaul_exit);
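
For reference, the index values built above pack two things into one word: bits 0-4 carry the bus-ratio code that do_powersaver() splits into SoftBusRatio/SoftBusRatio4, and bits 8-12 carry the VRM code that longhaul_setup_voltagescaling() ORs in and do_powersaver() writes to SoftVID. Below is a small user-space sketch of that packing; the sample codes are arbitrary and nothing here touches a real MSR.

#include <stdio.h>

/* Pack a 5-bit multiplier code and 5-bit VID code the way the driver does. */
static unsigned int pack_index(unsigned int mult_code, unsigned int vid_code)
{
	return (mult_code & 0x1f) | ((vid_code & 0x1f) << 8);
}

int main(void)
{
	unsigned int idx = pack_index(0x12, 0x0b);	/* arbitrary sample codes */

	/* Same masks do_powersaver() applies before writing MSR_VIA_LONGHAUL */
	unsigned int soft_bus_ratio  = idx & 0xf;
	unsigned int soft_bus_ratio4 = (idx & 0x10) >> 4;
	unsigned int soft_vid        = (idx >> 8) & 0x1f;

	printf("SoftBusRatio=%u SoftBusRatio4=%u SoftVID=%u\n",
	       soft_bus_ratio, soft_bus_ratio4, soft_vid);
	return 0;
}
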
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.h b/arch/x86/kernel/cpu/cpufreq/longhaul.h
deleted file mode 100644
index e2360a469f79..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.h
+++ /dev/null
@@ -1,353 +0,0 @@
-/*
- * longhaul.h
- * (C) 2003 Dave Jones.
- *
- * Licensed under the terms of the GNU GPL License version 2.
- *
- * VIA-specific information
- */
-
-union msr_bcr2 {
- struct {
-		unsigned Reserved:19,	// 18:0
- ESOFTBF:1, // 19
- Reserved2:3, // 22:20
- CLOCKMUL:4, // 26:23
- Reserved3:5; // 31:27
- } bits;
- unsigned long val;
-};
-
-union msr_longhaul {
- struct {
- unsigned RevisionID:4, // 3:0
- RevisionKey:4, // 7:4
- EnableSoftBusRatio:1, // 8
- EnableSoftVID:1, // 9
- EnableSoftBSEL:1, // 10
-			Reserved:3,	// 13:11
- SoftBusRatio4:1, // 14
- VRMRev:1, // 15
- SoftBusRatio:4, // 19:16
- SoftVID:5, // 24:20
- Reserved2:3, // 27:25
- SoftBSEL:2, // 29:28
- Reserved3:2, // 31:30
- MaxMHzBR:4, // 35:32
- MaximumVID:5, // 40:36
- MaxMHzFSB:2, // 42:41
- MaxMHzBR4:1, // 43
- Reserved4:4, // 47:44
- MinMHzBR:4, // 51:48
- MinimumVID:5, // 56:52
- MinMHzFSB:2, // 58:57
- MinMHzBR4:1, // 59
- Reserved5:4; // 63:60
- } bits;
- unsigned long long val;
-};
-
-/*
- * Clock ratio tables. Div/Mod by 10 to get ratio.
- * The eblcr values specify the ratio read from the CPU.
- * The mults values specify what to write to the CPU.
- */
-
-/*
- * VIA C3 Samuel 1 & Samuel 2 (stepping 0)
- */
-static const int __initdata samuel1_mults[16] = {
- -1, /* 0000 -> RESERVED */
- 30, /* 0001 -> 3.0x */
- 40, /* 0010 -> 4.0x */
- -1, /* 0011 -> RESERVED */
- -1, /* 0100 -> RESERVED */
- 35, /* 0101 -> 3.5x */
- 45, /* 0110 -> 4.5x */
- 55, /* 0111 -> 5.5x */
- 60, /* 1000 -> 6.0x */
- 70, /* 1001 -> 7.0x */
- 80, /* 1010 -> 8.0x */
- 50, /* 1011 -> 5.0x */
- 65, /* 1100 -> 6.5x */
- 75, /* 1101 -> 7.5x */
- -1, /* 1110 -> RESERVED */
- -1, /* 1111 -> RESERVED */
-};
-
-static const int __initdata samuel1_eblcr[16] = {
- 50, /* 0000 -> RESERVED */
- 30, /* 0001 -> 3.0x */
- 40, /* 0010 -> 4.0x */
- -1, /* 0011 -> RESERVED */
- 55, /* 0100 -> 5.5x */
- 35, /* 0101 -> 3.5x */
- 45, /* 0110 -> 4.5x */
- -1, /* 0111 -> RESERVED */
- -1, /* 1000 -> RESERVED */
- 70, /* 1001 -> 7.0x */
- 80, /* 1010 -> 8.0x */
- 60, /* 1011 -> 6.0x */
- -1, /* 1100 -> RESERVED */
- 75, /* 1101 -> 7.5x */
- -1, /* 1110 -> RESERVED */
- 65, /* 1111 -> 6.5x */
-};
-
-/*
- * VIA C3 Samuel2 Stepping 1->15
- */
-static const int __initdata samuel2_eblcr[16] = {
- 50, /* 0000 -> 5.0x */
- 30, /* 0001 -> 3.0x */
- 40, /* 0010 -> 4.0x */
- 100, /* 0011 -> 10.0x */
- 55, /* 0100 -> 5.5x */
- 35, /* 0101 -> 3.5x */
- 45, /* 0110 -> 4.5x */
- 110, /* 0111 -> 11.0x */
- 90, /* 1000 -> 9.0x */
- 70, /* 1001 -> 7.0x */
- 80, /* 1010 -> 8.0x */
- 60, /* 1011 -> 6.0x */
- 120, /* 1100 -> 12.0x */
- 75, /* 1101 -> 7.5x */
- 130, /* 1110 -> 13.0x */
- 65, /* 1111 -> 6.5x */
-};
-
-/*
- * VIA C3 Ezra
- */
-static const int __initdata ezra_mults[16] = {
- 100, /* 0000 -> 10.0x */
- 30, /* 0001 -> 3.0x */
- 40, /* 0010 -> 4.0x */
- 90, /* 0011 -> 9.0x */
- 95, /* 0100 -> 9.5x */
- 35, /* 0101 -> 3.5x */
- 45, /* 0110 -> 4.5x */
- 55, /* 0111 -> 5.5x */
- 60, /* 1000 -> 6.0x */
- 70, /* 1001 -> 7.0x */
- 80, /* 1010 -> 8.0x */
- 50, /* 1011 -> 5.0x */
- 65, /* 1100 -> 6.5x */
- 75, /* 1101 -> 7.5x */
- 85, /* 1110 -> 8.5x */
- 120, /* 1111 -> 12.0x */
-};
-
-static const int __initdata ezra_eblcr[16] = {
- 50, /* 0000 -> 5.0x */
- 30, /* 0001 -> 3.0x */
- 40, /* 0010 -> 4.0x */
- 100, /* 0011 -> 10.0x */
- 55, /* 0100 -> 5.5x */
- 35, /* 0101 -> 3.5x */
- 45, /* 0110 -> 4.5x */
- 95, /* 0111 -> 9.5x */
- 90, /* 1000 -> 9.0x */
- 70, /* 1001 -> 7.0x */
- 80, /* 1010 -> 8.0x */
- 60, /* 1011 -> 6.0x */
- 120, /* 1100 -> 12.0x */
- 75, /* 1101 -> 7.5x */
- 85, /* 1110 -> 8.5x */
- 65, /* 1111 -> 6.5x */
-};
-
-/*
- * VIA C3 (Ezra-T) [C5M].
- */
-static const int __initdata ezrat_mults[32] = {
- 100, /* 0000 -> 10.0x */
- 30, /* 0001 -> 3.0x */
- 40, /* 0010 -> 4.0x */
- 90, /* 0011 -> 9.0x */
- 95, /* 0100 -> 9.5x */
- 35, /* 0101 -> 3.5x */
- 45, /* 0110 -> 4.5x */
- 55, /* 0111 -> 5.5x */
- 60, /* 1000 -> 6.0x */
- 70, /* 1001 -> 7.0x */
- 80, /* 1010 -> 8.0x */
- 50, /* 1011 -> 5.0x */
- 65, /* 1100 -> 6.5x */
- 75, /* 1101 -> 7.5x */
- 85, /* 1110 -> 8.5x */
- 120, /* 1111 -> 12.0x */
-
- -1, /* 0000 -> RESERVED (10.0x) */
- 110, /* 0001 -> 11.0x */
- -1, /* 0010 -> 12.0x */
- -1, /* 0011 -> RESERVED (9.0x)*/
- 105, /* 0100 -> 10.5x */
- 115, /* 0101 -> 11.5x */
- 125, /* 0110 -> 12.5x */
- 135, /* 0111 -> 13.5x */
- 140, /* 1000 -> 14.0x */
- 150, /* 1001 -> 15.0x */
- 160, /* 1010 -> 16.0x */
- 130, /* 1011 -> 13.0x */
- 145, /* 1100 -> 14.5x */
- 155, /* 1101 -> 15.5x */
- -1, /* 1110 -> RESERVED (13.0x) */
- -1, /* 1111 -> RESERVED (12.0x) */
-};
-
-static const int __initdata ezrat_eblcr[32] = {
- 50, /* 0000 -> 5.0x */
- 30, /* 0001 -> 3.0x */
- 40, /* 0010 -> 4.0x */
- 100, /* 0011 -> 10.0x */
- 55, /* 0100 -> 5.5x */
- 35, /* 0101 -> 3.5x */
- 45, /* 0110 -> 4.5x */
- 95, /* 0111 -> 9.5x */
- 90, /* 1000 -> 9.0x */
- 70, /* 1001 -> 7.0x */
- 80, /* 1010 -> 8.0x */
- 60, /* 1011 -> 6.0x */
- 120, /* 1100 -> 12.0x */
- 75, /* 1101 -> 7.5x */
- 85, /* 1110 -> 8.5x */
- 65, /* 1111 -> 6.5x */
-
- -1, /* 0000 -> RESERVED (9.0x) */
- 110, /* 0001 -> 11.0x */
- 120, /* 0010 -> 12.0x */
- -1, /* 0011 -> RESERVED (10.0x)*/
- 135, /* 0100 -> 13.5x */
- 115, /* 0101 -> 11.5x */
- 125, /* 0110 -> 12.5x */
- 105, /* 0111 -> 10.5x */
- 130, /* 1000 -> 13.0x */
- 150, /* 1001 -> 15.0x */
- 160, /* 1010 -> 16.0x */
- 140, /* 1011 -> 14.0x */
- -1, /* 1100 -> RESERVED (12.0x) */
- 155, /* 1101 -> 15.5x */
- -1, /* 1110 -> RESERVED (13.0x) */
- 145, /* 1111 -> 14.5x */
-};
-
-/*
- * VIA C3 Nehemiah
- */
-
-static const int __initdata nehemiah_mults[32] = {
- 100, /* 0000 -> 10.0x */
- -1, /* 0001 -> 16.0x */
- 40, /* 0010 -> 4.0x */
- 90, /* 0011 -> 9.0x */
- 95, /* 0100 -> 9.5x */
- -1, /* 0101 -> RESERVED */
- 45, /* 0110 -> 4.5x */
- 55, /* 0111 -> 5.5x */
- 60, /* 1000 -> 6.0x */
- 70, /* 1001 -> 7.0x */
- 80, /* 1010 -> 8.0x */
- 50, /* 1011 -> 5.0x */
- 65, /* 1100 -> 6.5x */
- 75, /* 1101 -> 7.5x */
- 85, /* 1110 -> 8.5x */
- 120, /* 1111 -> 12.0x */
- -1, /* 0000 -> 10.0x */
- 110, /* 0001 -> 11.0x */
- -1, /* 0010 -> 12.0x */
- -1, /* 0011 -> 9.0x */
- 105, /* 0100 -> 10.5x */
- 115, /* 0101 -> 11.5x */
- 125, /* 0110 -> 12.5x */
- 135, /* 0111 -> 13.5x */
- 140, /* 1000 -> 14.0x */
- 150, /* 1001 -> 15.0x */
- 160, /* 1010 -> 16.0x */
- 130, /* 1011 -> 13.0x */
- 145, /* 1100 -> 14.5x */
- 155, /* 1101 -> 15.5x */
- -1, /* 1110 -> RESERVED (13.0x) */
- -1, /* 1111 -> 12.0x */
-};
-
-static const int __initdata nehemiah_eblcr[32] = {
- 50, /* 0000 -> 5.0x */
- 160, /* 0001 -> 16.0x */
- 40, /* 0010 -> 4.0x */
- 100, /* 0011 -> 10.0x */
- 55, /* 0100 -> 5.5x */
- -1, /* 0101 -> RESERVED */
- 45, /* 0110 -> 4.5x */
- 95, /* 0111 -> 9.5x */
- 90, /* 1000 -> 9.0x */
- 70, /* 1001 -> 7.0x */
- 80, /* 1010 -> 8.0x */
- 60, /* 1011 -> 6.0x */
- 120, /* 1100 -> 12.0x */
- 75, /* 1101 -> 7.5x */
- 85, /* 1110 -> 8.5x */
- 65, /* 1111 -> 6.5x */
- 90, /* 0000 -> 9.0x */
- 110, /* 0001 -> 11.0x */
- 120, /* 0010 -> 12.0x */
- 100, /* 0011 -> 10.0x */
- 135, /* 0100 -> 13.5x */
- 115, /* 0101 -> 11.5x */
- 125, /* 0110 -> 12.5x */
- 105, /* 0111 -> 10.5x */
- 130, /* 1000 -> 13.0x */
- 150, /* 1001 -> 15.0x */
- 160, /* 1010 -> 16.0x */
- 140, /* 1011 -> 14.0x */
- 120, /* 1100 -> 12.0x */
- 155, /* 1101 -> 15.5x */
- -1, /* 1110 -> RESERVED (13.0x) */
- 145 /* 1111 -> 14.5x */
-};
-
-/*
- * Voltage scales. Div/Mod by 1000 to get actual voltage.
- * Which scale to use depends on the VRM type in use.
- */
-
-struct mV_pos {
- unsigned short mV;
- unsigned short pos;
-};
-
-static const struct mV_pos __initdata vrm85_mV[32] = {
- {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2},
- {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26},
- {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18},
- {1450, 16}, {1400, 14}, {1350, 12}, {1300, 10},
- {1275, 9}, {1225, 7}, {1175, 5}, {1125, 3},
- {1075, 1}, {1825, 31}, {1775, 29}, {1725, 27},
- {1675, 25}, {1625, 23}, {1575, 21}, {1525, 19},
- {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11}
-};
-
-static const unsigned char __initdata mV_vrm85[32] = {
- 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11,
- 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d,
- 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19,
- 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15
-};
-
-static const struct mV_pos __initdata mobilevrm_mV[32] = {
- {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28},
- {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24},
- {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20},
- {1150, 19}, {1100, 18}, {1050, 17}, {1000, 16},
- {975, 15}, {950, 14}, {925, 13}, {900, 12},
- {875, 11}, {850, 10}, {825, 9}, {800, 8},
- {775, 7}, {750, 6}, {725, 5}, {700, 4},
- {675, 3}, {650, 2}, {625, 1}, {600, 0}
-};
-
-static const unsigned char __initdata mV_mobilevrm[32] = {
- 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18,
- 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10,
- 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08,
- 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
-};
-
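
All of the tables above share the two conventions stated in the comments: ratio entries are the multiplier times 10, voltage entries are millivolts. A tiny sketch of the decode, using one assumed entry from each kind of table:

#include <stdio.h>

int main(void)
{
	int ratio = 85;		/* e.g. ezra_mults[14]: 8.5x       */
	int mV    = 1275;	/* e.g. vrm85_mV[16].mV: 1.275 V   */

	printf("multiplier: %d.%dx\n", ratio / 10, ratio % 10);
	printf("voltage:    %d.%03d V\n", mV / 1000, mV % 1000);
	return 0;
}
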
diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c
deleted file mode 100644
index da5f70fcb766..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/longrun.c
+++ /dev/null
@@ -1,328 +0,0 @@
-/*
- * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
- *
- * Licensed under the terms of the GNU GPL License version 2.
- *
- * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/cpufreq.h>
-#include <linux/timex.h>
-
-#include <asm/msr.h>
-#include <asm/processor.h>
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "longrun", msg)
-
-static struct cpufreq_driver longrun_driver;
-
-/**
- * longrun_{low,high}_freq is needed for the conversion of cpufreq kHz
- * values into per cent values. In TMTA microcode, the following is valid:
- * performance_pctg = (current_freq - low_freq)/(high_freq - low_freq)
- */
-static unsigned int longrun_low_freq, longrun_high_freq;
-
-
-/**
- * longrun_get_policy - get the current LongRun policy
- * @policy: struct cpufreq_policy where current policy is written into
- *
- * Reads the current LongRun policy by access to MSR_TMTA_LONGRUN_FLAGS
- * and MSR_TMTA_LONGRUN_CTRL
- */
-static void __init longrun_get_policy(struct cpufreq_policy *policy)
-{
- u32 msr_lo, msr_hi;
-
- rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi);
- dprintk("longrun flags are %x - %x\n", msr_lo, msr_hi);
- if (msr_lo & 0x01)
- policy->policy = CPUFREQ_POLICY_PERFORMANCE;
- else
- policy->policy = CPUFREQ_POLICY_POWERSAVE;
-
- rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
- dprintk("longrun ctrl is %x - %x\n", msr_lo, msr_hi);
- msr_lo &= 0x0000007F;
- msr_hi &= 0x0000007F;
-
- if (longrun_high_freq <= longrun_low_freq) {
- /* Assume degenerate Longrun table */
- policy->min = policy->max = longrun_high_freq;
- } else {
- policy->min = longrun_low_freq + msr_lo *
- ((longrun_high_freq - longrun_low_freq) / 100);
- policy->max = longrun_low_freq + msr_hi *
- ((longrun_high_freq - longrun_low_freq) / 100);
- }
- policy->cpu = 0;
-}
-
-
-/**
- * longrun_set_policy - sets a new CPUFreq policy
- * @policy: new policy
- *
- * Sets a new CPUFreq policy on LongRun-capable processors. This function
- * has to be called with cpufreq_driver locked.
- */
-static int longrun_set_policy(struct cpufreq_policy *policy)
-{
- u32 msr_lo, msr_hi;
- u32 pctg_lo, pctg_hi;
-
- if (!policy)
- return -EINVAL;
-
- if (longrun_high_freq <= longrun_low_freq) {
- /* Assume degenerate Longrun table */
- pctg_lo = pctg_hi = 100;
- } else {
- pctg_lo = (policy->min - longrun_low_freq) /
- ((longrun_high_freq - longrun_low_freq) / 100);
- pctg_hi = (policy->max - longrun_low_freq) /
- ((longrun_high_freq - longrun_low_freq) / 100);
- }
-
- if (pctg_hi > 100)
- pctg_hi = 100;
- if (pctg_lo > pctg_hi)
- pctg_lo = pctg_hi;
-
- /* performance or economy mode */
- rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi);
- msr_lo &= 0xFFFFFFFE;
- switch (policy->policy) {
- case CPUFREQ_POLICY_PERFORMANCE:
- msr_lo |= 0x00000001;
- break;
- case CPUFREQ_POLICY_POWERSAVE:
- break;
- }
- wrmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi);
-
- /* lower and upper boundary */
- rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
- msr_lo &= 0xFFFFFF80;
- msr_hi &= 0xFFFFFF80;
- msr_lo |= pctg_lo;
- msr_hi |= pctg_hi;
- wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
-
- return 0;
-}
-
-
-/**
- * longrun_verify_policy - verifies a new CPUFreq policy
- * @policy: the policy to verify
- *
- * Validates a new CPUFreq policy. This function has to be called with
- * cpufreq_driver locked.
- */
-static int longrun_verify_policy(struct cpufreq_policy *policy)
-{
- if (!policy)
- return -EINVAL;
-
- policy->cpu = 0;
- cpufreq_verify_within_limits(policy,
- policy->cpuinfo.min_freq,
- policy->cpuinfo.max_freq);
-
- if ((policy->policy != CPUFREQ_POLICY_POWERSAVE) &&
- (policy->policy != CPUFREQ_POLICY_PERFORMANCE))
- return -EINVAL;
-
- return 0;
-}
-
-static unsigned int longrun_get(unsigned int cpu)
-{
- u32 eax, ebx, ecx, edx;
-
- if (cpu)
- return 0;
-
- cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
- dprintk("cpuid eax is %u\n", eax);
-
- return eax * 1000;
-}
-
-/**
- * longrun_determine_freqs - determines the lowest and highest possible core frequency
- * @low_freq: an int to put the lowest frequency into
- * @high_freq: an int to put the highest frequency into
- *
- * Determines the lowest and highest possible core frequencies on this CPU.
- * This is necessary to calculate the performance percentage according to
- * TMTA rules:
- * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq)
- */
-static unsigned int __init longrun_determine_freqs(unsigned int *low_freq,
- unsigned int *high_freq)
-{
- u32 msr_lo, msr_hi;
- u32 save_lo, save_hi;
- u32 eax, ebx, ecx, edx;
- u32 try_hi;
- struct cpuinfo_x86 *c = &cpu_data(0);
-
- if (!low_freq || !high_freq)
- return -EINVAL;
-
- if (cpu_has(c, X86_FEATURE_LRTI)) {
- /* if the LongRun Table Interface is present, the
- * detection is a bit easier:
- * For minimum frequency, read out the maximum
- * level (msr_hi), write that into "currently
- * selected level", and read out the frequency.
- * For maximum frequency, read out level zero.
- */
- /* minimum */
- rdmsr(MSR_TMTA_LRTI_READOUT, msr_lo, msr_hi);
- wrmsr(MSR_TMTA_LRTI_READOUT, msr_hi, msr_hi);
- rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi);
- *low_freq = msr_lo * 1000; /* to kHz */
-
- /* maximum */
- wrmsr(MSR_TMTA_LRTI_READOUT, 0, msr_hi);
- rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi);
- *high_freq = msr_lo * 1000; /* to kHz */
-
- dprintk("longrun table interface told %u - %u kHz\n",
- *low_freq, *high_freq);
-
- if (*low_freq > *high_freq)
- *low_freq = *high_freq;
- return 0;
- }
-
- /* set the upper border to the value determined during TSC init */
- *high_freq = (cpu_khz / 1000);
- *high_freq = *high_freq * 1000;
- dprintk("high frequency is %u kHz\n", *high_freq);
-
- /* get current borders */
- rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
- save_lo = msr_lo & 0x0000007F;
- save_hi = msr_hi & 0x0000007F;
-
- /* if current perf_pctg is larger than 90%, we need to decrease the
- * upper limit to make the calculation more accurate.
- */
- cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
- /* try decreasing in 10% steps, some processors react only
- * on some barrier values */
- for (try_hi = 80; try_hi > 0 && ecx > 90; try_hi -= 10) {
- /* set to 0 to try_hi perf_pctg */
- msr_lo &= 0xFFFFFF80;
- msr_hi &= 0xFFFFFF80;
- msr_hi |= try_hi;
- wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
-
- /* read out current core MHz and current perf_pctg */
- cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
-
- /* restore values */
- wrmsr(MSR_TMTA_LONGRUN_CTRL, save_lo, save_hi);
- }
- dprintk("percentage is %u %%, freq is %u MHz\n", ecx, eax);
-
-	/* performance_pctg = (current_freq - low_freq)/(high_freq - low_freq)
-	 * equals
-	 * low_freq * (1 - perf_pctg) = (cur_freq - high_freq * perf_pctg),
-	 * i.e. low_freq = (cur_freq - high_freq * perf_pctg) / (1 - perf_pctg).
-	 *
-	 * high_freq * perf_pctg is stored temporarily into "ebx".
-	 */
- ebx = (((cpu_khz / 1000) * ecx) / 100); /* to MHz */
-
- if ((ecx > 95) || (ecx == 0) || (eax < ebx))
- return -EIO;
-
- edx = ((eax - ebx) * 100) / (100 - ecx);
- *low_freq = edx * 1000; /* back to kHz */
-
- dprintk("low frequency is %u kHz\n", *low_freq);
-
- if (*low_freq > *high_freq)
- *low_freq = *high_freq;
-
- return 0;
-}
-
-
-static int __init longrun_cpu_init(struct cpufreq_policy *policy)
-{
- int result = 0;
-
- /* capability check */
- if (policy->cpu != 0)
- return -ENODEV;
-
- /* detect low and high frequency */
- result = longrun_determine_freqs(&longrun_low_freq, &longrun_high_freq);
- if (result)
- return result;
-
- /* cpuinfo and default policy values */
- policy->cpuinfo.min_freq = longrun_low_freq;
- policy->cpuinfo.max_freq = longrun_high_freq;
- policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
- longrun_get_policy(policy);
-
- return 0;
-}
-
-
-static struct cpufreq_driver longrun_driver = {
- .flags = CPUFREQ_CONST_LOOPS,
- .verify = longrun_verify_policy,
- .setpolicy = longrun_set_policy,
- .get = longrun_get,
- .init = longrun_cpu_init,
- .name = "longrun",
- .owner = THIS_MODULE,
-};
-
-
-/**
- * longrun_init - initializes the Transmeta Crusoe LongRun CPUFreq driver
- *
- * Initializes the LongRun support.
- */
-static int __init longrun_init(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
-
- if (c->x86_vendor != X86_VENDOR_TRANSMETA ||
- !cpu_has(c, X86_FEATURE_LONGRUN))
- return -ENODEV;
-
- return cpufreq_register_driver(&longrun_driver);
-}
-
-
-/**
- * longrun_exit - unregisters LongRun support
- */
-static void __exit longrun_exit(void)
-{
- cpufreq_unregister_driver(&longrun_driver);
-}
-
-
-MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>");
-MODULE_DESCRIPTION("LongRun driver for Transmeta Crusoe and "
- "Efficeon processors.");
-MODULE_LICENSE("GPL");
-
-module_init(longrun_init);
-module_exit(longrun_exit);
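
The arithmetic that longrun_get_policy() and longrun_determine_freqs() build on is the TMTA relation performance_pctg = (current_freq - low_freq) / (high_freq - low_freq); solved for low_freq this gives low_freq = (current_freq - high_freq * perf_pctg) / (1 - perf_pctg), which is what the integer edx computation above implements. A stand-alone sketch with assumed readings follows; the real driver obtains cur_mhz and pctg from CPUID leaf 0x80860007.

#include <stdio.h>

int main(void)
{
	unsigned int high_mhz = 1000;	/* assumed TSC-derived maximum        */
	unsigned int cur_mhz  = 700;	/* assumed current core clock (eax)   */
	unsigned int pctg     = 50;	/* assumed current perf_pctg (ecx)    */

	/* high_freq * perf_pctg, in MHz (the driver's "ebx") */
	unsigned int ebx = (high_mhz * pctg) / 100;

	/* low_freq = (cur - high * pctg) / (1 - pctg), done in percent units */
	unsigned int low_mhz = ((cur_mhz - ebx) * 100) / (100 - pctg);

	printf("estimated low frequency: %u MHz\n", low_mhz);
	return 0;
}
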
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
deleted file mode 100644
index 869615193720..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ /dev/null
@@ -1,337 +0,0 @@
-/*
- * Pentium 4/Xeon CPU on demand clock modulation/speed scaling
- * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
- * (C) 2002 Zwane Mwaikambo <zwane@commfireservices.com>
- * (C) 2002 Arjan van de Ven <arjanv@redhat.com>
- * (C) 2002 Tora T. Engstad
- * All Rights Reserved
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * The author(s) of this software shall not be held liable for damages
- * of any nature resulting due to the use of this software. This
- * software is provided AS-IS with no warranties.
- *
- * Date Errata Description
- * 20020525 N44, O17 12.5% or 25% DC causes lockup
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/cpufreq.h>
-#include <linux/slab.h>
-#include <linux/cpumask.h>
-#include <linux/timex.h>
-
-#include <asm/processor.h>
-#include <asm/msr.h>
-#include <asm/timer.h>
-
-#include "speedstep-lib.h"
-
-#define PFX "p4-clockmod: "
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "p4-clockmod", msg)
-
-/*
- * Duty Cycle (3 bits). Note: DC_DISABLE is not specified in the
- * Intel docs; it is just used here to mean "disable".
- */
-enum {
- DC_RESV, DC_DFLT, DC_25PT, DC_38PT, DC_50PT,
- DC_64PT, DC_75PT, DC_88PT, DC_DISABLE
-};
-
-#define DC_ENTRIES 8
-
-
-static int has_N44_O17_errata[NR_CPUS];
-static unsigned int stock_freq;
-static struct cpufreq_driver p4clockmod_driver;
-static unsigned int cpufreq_p4_get(unsigned int cpu);
-
-static int cpufreq_p4_setdc(unsigned int cpu, unsigned int newstate)
-{
- u32 l, h;
-
- if (!cpu_online(cpu) ||
- (newstate > DC_DISABLE) || (newstate == DC_RESV))
- return -EINVAL;
-
- rdmsr_on_cpu(cpu, MSR_IA32_THERM_STATUS, &l, &h);
-
- if (l & 0x01)
- dprintk("CPU#%d currently thermal throttled\n", cpu);
-
- if (has_N44_O17_errata[cpu] &&
- (newstate == DC_25PT || newstate == DC_DFLT))
- newstate = DC_38PT;
-
- rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h);
- if (newstate == DC_DISABLE) {
- dprintk("CPU#%d disabling modulation\n", cpu);
- wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l & ~(1<<4), h);
- } else {
- dprintk("CPU#%d setting duty cycle to %d%%\n",
- cpu, ((125 * newstate) / 10));
- /* bits 63 - 5 : reserved
- * bit 4 : enable/disable
- * bits 3-1 : duty cycle
- * bit 0 : reserved
- */
- l = (l & ~14);
- l = l | (1<<4) | ((newstate & 0x7)<<1);
- wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l, h);
- }
-
- return 0;
-}
-
-
-static struct cpufreq_frequency_table p4clockmod_table[] = {
- {DC_RESV, CPUFREQ_ENTRY_INVALID},
- {DC_DFLT, 0},
- {DC_25PT, 0},
- {DC_38PT, 0},
- {DC_50PT, 0},
- {DC_64PT, 0},
- {DC_75PT, 0},
- {DC_88PT, 0},
- {DC_DISABLE, 0},
- {DC_RESV, CPUFREQ_TABLE_END},
-};
-
-
-static int cpufreq_p4_target(struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
-{
- unsigned int newstate = DC_RESV;
- struct cpufreq_freqs freqs;
- int i;
-
- if (cpufreq_frequency_table_target(policy, &p4clockmod_table[0],
- target_freq, relation, &newstate))
- return -EINVAL;
-
- freqs.old = cpufreq_p4_get(policy->cpu);
- freqs.new = stock_freq * p4clockmod_table[newstate].index / 8;
-
- if (freqs.new == freqs.old)
- return 0;
-
- /* notifiers */
- for_each_cpu(i, policy->cpus) {
- freqs.cpu = i;
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
- }
-
- /* run on each logical CPU,
- * see section 13.15.3 of IA32 Intel Architecture Software
- * Developer's Manual, Volume 3
- */
- for_each_cpu(i, policy->cpus)
- cpufreq_p4_setdc(i, p4clockmod_table[newstate].index);
-
- /* notifiers */
- for_each_cpu(i, policy->cpus) {
- freqs.cpu = i;
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
- }
-
- return 0;
-}
-
-
-static int cpufreq_p4_verify(struct cpufreq_policy *policy)
-{
- return cpufreq_frequency_table_verify(policy, &p4clockmod_table[0]);
-}
-
-
-static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
-{
- if (c->x86 == 0x06) {
- if (cpu_has(c, X86_FEATURE_EST))
- printk(KERN_WARNING PFX "Warning: EST-capable CPU "
- "detected. The acpi-cpufreq module offers "
-				"voltage scaling in addition to frequency "
- "scaling. You should use that instead of "
- "p4-clockmod, if possible.\n");
- switch (c->x86_model) {
- case 0x0E: /* Core */
- case 0x0F: /* Core Duo */
- case 0x16: /* Celeron Core */
- case 0x1C: /* Atom */
- p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
- return speedstep_get_frequency(SPEEDSTEP_CPU_PCORE);
- case 0x0D: /* Pentium M (Dothan) */
- p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
- /* fall through */
- case 0x09: /* Pentium M (Banias) */
- return speedstep_get_frequency(SPEEDSTEP_CPU_PM);
- }
- }
-
- if (c->x86 != 0xF) {
- if (!cpu_has(c, X86_FEATURE_EST))
- printk(KERN_WARNING PFX "Unknown CPU. "
- "Please send an e-mail to "
- "<cpufreq@vger.kernel.org>\n");
- return 0;
- }
-
-	/* on P-4s, the TSC runs with constant frequency independent of whether
-	 * throttling is active or not. */
- p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
-
- if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4M) {
- printk(KERN_WARNING PFX "Warning: Pentium 4-M detected. "
- "The speedstep-ich or acpi cpufreq modules offer "
-			"voltage scaling in addition to frequency scaling. "
- "You should use either one instead of p4-clockmod, "
- "if possible.\n");
- return speedstep_get_frequency(SPEEDSTEP_CPU_P4M);
- }
-
- return speedstep_get_frequency(SPEEDSTEP_CPU_P4D);
-}
-
-
-
-static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy)
-{
- struct cpuinfo_x86 *c = &cpu_data(policy->cpu);
- int cpuid = 0;
- unsigned int i;
-
-#ifdef CONFIG_SMP
- cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu));
-#endif
-
- /* Errata workaround */
- cpuid = (c->x86 << 8) | (c->x86_model << 4) | c->x86_mask;
- switch (cpuid) {
- case 0x0f07:
- case 0x0f0a:
- case 0x0f11:
- case 0x0f12:
- has_N44_O17_errata[policy->cpu] = 1;
- dprintk("has errata -- disabling low frequencies\n");
- }
-
- if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4D &&
- c->x86_model < 2) {
- /* switch to maximum frequency and measure result */
- cpufreq_p4_setdc(policy->cpu, DC_DISABLE);
- recalibrate_cpu_khz();
- }
- /* get max frequency */
- stock_freq = cpufreq_p4_get_frequency(c);
- if (!stock_freq)
- return -EINVAL;
-
- /* table init */
- for (i = 1; (p4clockmod_table[i].frequency != CPUFREQ_TABLE_END); i++) {
- if ((i < 2) && (has_N44_O17_errata[policy->cpu]))
- p4clockmod_table[i].frequency = CPUFREQ_ENTRY_INVALID;
- else
- p4clockmod_table[i].frequency = (stock_freq * i)/8;
- }
- cpufreq_frequency_table_get_attr(p4clockmod_table, policy->cpu);
-
- /* cpuinfo and default policy values */
-
- /* the transition latency is set to be 1 higher than the maximum
- * transition latency of the ondemand governor */
- policy->cpuinfo.transition_latency = 10000001;
- policy->cur = stock_freq;
-
- return cpufreq_frequency_table_cpuinfo(policy, &p4clockmod_table[0]);
-}
-
-
-static int cpufreq_p4_cpu_exit(struct cpufreq_policy *policy)
-{
- cpufreq_frequency_table_put_attr(policy->cpu);
- return 0;
-}
-
-static unsigned int cpufreq_p4_get(unsigned int cpu)
-{
- u32 l, h;
-
- rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h);
-
- if (l & 0x10) {
- l = l >> 1;
- l &= 0x7;
- } else
- l = DC_DISABLE;
-
- if (l != DC_DISABLE)
- return stock_freq * l / 8;
-
- return stock_freq;
-}
-
-static struct freq_attr *p4clockmod_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-static struct cpufreq_driver p4clockmod_driver = {
- .verify = cpufreq_p4_verify,
- .target = cpufreq_p4_target,
- .init = cpufreq_p4_cpu_init,
- .exit = cpufreq_p4_cpu_exit,
- .get = cpufreq_p4_get,
- .name = "p4-clockmod",
- .owner = THIS_MODULE,
- .attr = p4clockmod_attr,
-};
-
-
-static int __init cpufreq_p4_init(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
- int ret;
-
- /*
- * THERM_CONTROL is architectural for IA32 now, so
- * we can rely on the capability checks
- */
- if (c->x86_vendor != X86_VENDOR_INTEL)
- return -ENODEV;
-
- if (!test_cpu_cap(c, X86_FEATURE_ACPI) ||
- !test_cpu_cap(c, X86_FEATURE_ACC))
- return -ENODEV;
-
- ret = cpufreq_register_driver(&p4clockmod_driver);
- if (!ret)
- printk(KERN_INFO PFX "P4/Xeon(TM) CPU On-Demand Clock "
- "Modulation available\n");
-
- return ret;
-}
-
-
-static void __exit cpufreq_p4_exit(void)
-{
- cpufreq_unregister_driver(&p4clockmod_driver);
-}
-
-
-MODULE_AUTHOR("Zwane Mwaikambo <zwane@commfireservices.com>");
-MODULE_DESCRIPTION("cpufreq driver for Pentium(TM) 4/Xeon(TM)");
-MODULE_LICENSE("GPL");
-
-late_initcall(cpufreq_p4_init);
-module_exit(cpufreq_p4_exit);
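
The duty-cycle handling above comes down to two fields of IA32_THERM_CONTROL: bit 4 enables software clock modulation and bits 3:1 hold the duty-cycle code, so the effective frequency is stock_freq * code / 8. The sketch below repeats the decode done by cpufreq_p4_get(), but on an assumed register value instead of an MSR read.

#include <stdio.h>

#define DC_DISABLE 8

static unsigned int decode_p4_freq(unsigned int therm_control,
				   unsigned int stock_freq)
{
	unsigned int l = therm_control;

	if (l & 0x10)			/* bit 4: modulation enabled  */
		l = (l >> 1) & 0x7;	/* bits 3:1: duty-cycle code  */
	else
		l = DC_DISABLE;

	if (l != DC_DISABLE)
		return stock_freq * l / 8;
	return stock_freq;
}

int main(void)
{
	/* enable bit set, duty-cycle code 4 (50%), 2.4 GHz stock -> 1.2 GHz */
	printf("%u kHz\n", decode_p4_freq((1 << 4) | (4 << 1), 2400000));
	return 0;
}
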
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
deleted file mode 100644
index f10dea409f40..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
+++ /dev/null
@@ -1,262 +0,0 @@
-/*
- * This file was based upon code in Powertweak Linux (http://powertweak.sf.net)
- * (C) 2000-2003 Dave Jones, Arjan van de Ven, Janne Pänkälä,
- * Dominik Brodowski.
- *
- * Licensed under the terms of the GNU GPL License version 2.
- *
- * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/ioport.h>
-#include <linux/slab.h>
-#include <linux/timex.h>
-#include <linux/io.h>
-
-#include <asm/msr.h>
-
-#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long
- as it is unused */
-
-#define PFX "powernow-k6: "
-static unsigned int busfreq; /* FSB, in 10 kHz */
-static unsigned int max_multiplier;
-
-
-/* Clock ratio multiplied by 10 - see table 27 in AMD#23446 */
-static struct cpufreq_frequency_table clock_ratio[] = {
- {45, /* 000 -> 4.5x */ 0},
- {50, /* 001 -> 5.0x */ 0},
- {40, /* 010 -> 4.0x */ 0},
- {55, /* 011 -> 5.5x */ 0},
- {20, /* 100 -> 2.0x */ 0},
- {30, /* 101 -> 3.0x */ 0},
- {60, /* 110 -> 6.0x */ 0},
- {35, /* 111 -> 3.5x */ 0},
- {0, CPUFREQ_TABLE_END}
-};
-
-
-/**
- * powernow_k6_get_cpu_multiplier - returns the current FSB multiplier
- *
- * Returns the current setting of the frequency multiplier. Core clock
- * speed is frequency of the Front-Side Bus multiplied with this value.
- */
-static int powernow_k6_get_cpu_multiplier(void)
-{
- u64 invalue = 0;
- u32 msrval;
-
- msrval = POWERNOW_IOPORT + 0x1;
- wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
- invalue = inl(POWERNOW_IOPORT + 0x8);
- msrval = POWERNOW_IOPORT + 0x0;
- wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
-
- return clock_ratio[(invalue >> 5)&7].index;
-}
-
-
-/**
- * powernow_k6_set_state - set the PowerNow! multiplier
- * @best_i: clock_ratio[best_i] is the target multiplier
- *
- * Tries to change the PowerNow! multiplier
- */
-static void powernow_k6_set_state(unsigned int best_i)
-{
- unsigned long outvalue = 0, invalue = 0;
- unsigned long msrval;
- struct cpufreq_freqs freqs;
-
- if (clock_ratio[best_i].index > max_multiplier) {
- printk(KERN_ERR PFX "invalid target frequency\n");
- return;
- }
-
- freqs.old = busfreq * powernow_k6_get_cpu_multiplier();
- freqs.new = busfreq * clock_ratio[best_i].index;
- freqs.cpu = 0; /* powernow-k6.c is UP only driver */
-
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-
- /* we now need to transform best_i to the BVC format, see AMD#23446 */
-
- outvalue = (1<<12) | (1<<10) | (1<<9) | (best_i<<5);
-
- msrval = POWERNOW_IOPORT + 0x1;
- wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
- invalue = inl(POWERNOW_IOPORT + 0x8);
- invalue = invalue & 0xf;
- outvalue = outvalue | invalue;
- outl(outvalue , (POWERNOW_IOPORT + 0x8));
- msrval = POWERNOW_IOPORT + 0x0;
- wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
-
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-
- return;
-}
-
-
-/**
- * powernow_k6_verify - verifies a new CPUfreq policy
- * @policy: new policy
- *
- * Policy must be within lowest and highest possible CPU Frequency,
- * and at least one possible state must be within min and max.
- */
-static int powernow_k6_verify(struct cpufreq_policy *policy)
-{
- return cpufreq_frequency_table_verify(policy, &clock_ratio[0]);
-}
-
-
-/**
- * powernow_k6_target - set a new target frequency
- * @policy: the cpufreq policy to act on
- * @target_freq: the target frequency
- * @relation: how that frequency relates to achieved frequency
- * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
- *
- * Sets the multiplier that matches the target frequency.
- */
-static int powernow_k6_target(struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
-{
- unsigned int newstate = 0;
-
- if (cpufreq_frequency_table_target(policy, &clock_ratio[0],
- target_freq, relation, &newstate))
- return -EINVAL;
-
- powernow_k6_set_state(newstate);
-
- return 0;
-}
-
-
-static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
-{
- unsigned int i, f;
- int result;
-
- if (policy->cpu != 0)
- return -ENODEV;
-
- /* get frequencies */
- max_multiplier = powernow_k6_get_cpu_multiplier();
- busfreq = cpu_khz / max_multiplier;
-
- /* table init */
- for (i = 0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) {
- f = clock_ratio[i].index;
- if (f > max_multiplier)
- clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID;
- else
- clock_ratio[i].frequency = busfreq * f;
- }
-
- /* cpuinfo and default policy values */
- policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
- policy->cur = busfreq * max_multiplier;
-
- result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio);
- if (result)
- return result;
-
- cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu);
-
- return 0;
-}
-
-
-static int powernow_k6_cpu_exit(struct cpufreq_policy *policy)
-{
- unsigned int i;
- for (i = 0; i < 8; i++) {
-		if (clock_ratio[i].index == max_multiplier)
- powernow_k6_set_state(i);
- }
- cpufreq_frequency_table_put_attr(policy->cpu);
- return 0;
-}
-
-static unsigned int powernow_k6_get(unsigned int cpu)
-{
- unsigned int ret;
- ret = (busfreq * powernow_k6_get_cpu_multiplier());
- return ret;
-}
-
-static struct freq_attr *powernow_k6_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-static struct cpufreq_driver powernow_k6_driver = {
- .verify = powernow_k6_verify,
- .target = powernow_k6_target,
- .init = powernow_k6_cpu_init,
- .exit = powernow_k6_cpu_exit,
- .get = powernow_k6_get,
- .name = "powernow-k6",
- .owner = THIS_MODULE,
- .attr = powernow_k6_attr,
-};
-
-
-/**
- * powernow_k6_init - initializes the k6 PowerNow! CPUFreq driver
- *
- * Initializes the K6 PowerNow! support. Returns -ENODEV on unsupported
- * devices, -EINVAL or -ENOMEM on problems during initialization, and zero
- * on success.
- */
-static int __init powernow_k6_init(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
-
- if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 5) ||
- ((c->x86_model != 12) && (c->x86_model != 13)))
- return -ENODEV;
-
- if (!request_region(POWERNOW_IOPORT, 16, "PowerNow!")) {
- printk(KERN_INFO PFX "PowerNow IOPORT region already used.\n");
- return -EIO;
- }
-
- if (cpufreq_register_driver(&powernow_k6_driver)) {
- release_region(POWERNOW_IOPORT, 16);
- return -EINVAL;
- }
-
- return 0;
-}
-
-
-/**
- * powernow_k6_exit - unregisters AMD K6-2+/3+ PowerNow! support
- *
- * Unregisters AMD K6-2+ / K6-3+ PowerNow! support.
- */
-static void __exit powernow_k6_exit(void)
-{
- cpufreq_unregister_driver(&powernow_k6_driver);
- release_region(POWERNOW_IOPORT, 16);
-}
-
-
-MODULE_AUTHOR("Arjan van de Ven, Dave Jones <davej@redhat.com>, "
- "Dominik Brodowski <linux@brodo.de>");
-MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors.");
-MODULE_LICENSE("GPL");
-
-module_init(powernow_k6_init);
-module_exit(powernow_k6_exit);
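
In the K6 driver above, the current multiplier is encoded in bits 7:5 of the value read back from the PowerNow! I/O port, and that 3-bit code indexes clock_ratio[] (ratio times 10). Here is the decode on its own, with the port read replaced by an assumed raw value and an assumed 100 MHz FSB.

#include <stdio.h>

/* Clock ratio * 10, indexed by the 3-bit BVC field (table 27, AMD#23446). */
static const int clock_ratio[8] = { 45, 50, 40, 55, 20, 30, 60, 35 };

int main(void)
{
	unsigned long invalue = 0xC0;		/* assumed inl() result              */
	unsigned int busfreq_10khz = 10000;	/* assumed 100 MHz FSB, 10 kHz units */

	int mult = clock_ratio[(invalue >> 5) & 7];

	printf("multiplier %d.%dx -> %u kHz\n", mult / 10, mult % 10,
	       busfreq_10khz * mult);
	return 0;
}
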
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
deleted file mode 100644
index d47c775eb0ab..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ /dev/null
@@ -1,749 +0,0 @@
-/*
- * AMD K7 Powernow driver.
- * (C) 2003 Dave Jones on behalf of SuSE Labs.
- * (C) 2003-2004 Dave Jones <davej@redhat.com>
- *
- * Licensed under the terms of the GNU GPL License version 2.
- * Based upon datasheets & sample CPUs kindly provided by AMD.
- *
- * Errata 5:
- * CPU may fail to execute a FID/VID change in presence of interrupt.
- * - We cli/sti on stepping A0 CPUs around the FID/VID transition.
- * Errata 15:
- * CPU with half frequency multipliers may hang upon wakeup from disconnect.
- * - We disable half multipliers if ACPI is used on A0 stepping CPUs.
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/dmi.h>
-#include <linux/timex.h>
-#include <linux/io.h>
-
-#include <asm/timer.h> /* Needed for recalibrate_cpu_khz() */
-#include <asm/msr.h>
-#include <asm/system.h>
-
-#ifdef CONFIG_X86_POWERNOW_K7_ACPI
-#include <linux/acpi.h>
-#include <acpi/processor.h>
-#endif
-
-#include "powernow-k7.h"
-
-#define PFX "powernow: "
-
-
-struct psb_s {
- u8 signature[10];
- u8 tableversion;
- u8 flags;
- u16 settlingtime;
- u8 reserved1;
- u8 numpst;
-};
-
-struct pst_s {
- u32 cpuid;
- u8 fsbspeed;
- u8 maxfid;
- u8 startvid;
- u8 numpstates;
-};
-
-#ifdef CONFIG_X86_POWERNOW_K7_ACPI
-union powernow_acpi_control_t {
- struct {
- unsigned long fid:5,
- vid:5,
- sgtc:20,
- res1:2;
- } bits;
- unsigned long val;
-};
-#endif
-
-#ifdef CONFIG_CPU_FREQ_DEBUG
-/* divide by 1000 to get VCore voltage in V. */
-static const int mobile_vid_table[32] = {
- 2000, 1950, 1900, 1850, 1800, 1750, 1700, 1650,
- 1600, 1550, 1500, 1450, 1400, 1350, 1300, 0,
- 1275, 1250, 1225, 1200, 1175, 1150, 1125, 1100,
- 1075, 1050, 1025, 1000, 975, 950, 925, 0,
-};
-#endif
-
-/* divide by 10 to get FID. */
-static const int fid_codes[32] = {
- 110, 115, 120, 125, 50, 55, 60, 65,
- 70, 75, 80, 85, 90, 95, 100, 105,
- 30, 190, 40, 200, 130, 135, 140, 210,
- 150, 225, 160, 165, 170, 180, -1, -1,
-};
-
-/* This parameter is used to force ACPI instead of the legacy method for
- * configuration purposes.
- */
-
-static int acpi_force;
-
-static struct cpufreq_frequency_table *powernow_table;
-
-static unsigned int can_scale_bus;
-static unsigned int can_scale_vid;
-static unsigned int minimum_speed = -1;
-static unsigned int maximum_speed;
-static unsigned int number_scales;
-static unsigned int fsb;
-static unsigned int latency;
-static char have_a0;
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "powernow-k7", msg)
-
-static int check_fsb(unsigned int fsbspeed)
-{
- int delta;
- unsigned int f = fsb / 1000;
-
- delta = (fsbspeed > f) ? fsbspeed - f : f - fsbspeed;
- return delta < 5;
-}
-
-static int check_powernow(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
- unsigned int maxei, eax, ebx, ecx, edx;
-
- if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 6)) {
-#ifdef MODULE
- printk(KERN_INFO PFX "This module only works with "
- "AMD K7 CPUs\n");
-#endif
- return 0;
- }
-
- /* Get maximum capabilities */
- maxei = cpuid_eax(0x80000000);
- if (maxei < 0x80000007) { /* Any powernow info ? */
-#ifdef MODULE
- printk(KERN_INFO PFX "No powernow capabilities detected\n");
-#endif
- return 0;
- }
-
- if ((c->x86_model == 6) && (c->x86_mask == 0)) {
- printk(KERN_INFO PFX "K7 660[A0] core detected, "
- "enabling errata workarounds\n");
- have_a0 = 1;
- }
-
- cpuid(0x80000007, &eax, &ebx, &ecx, &edx);
-
- /* Check we can actually do something before we say anything.*/
- if (!(edx & (1 << 1 | 1 << 2)))
- return 0;
-
- printk(KERN_INFO PFX "PowerNOW! Technology present. Can scale: ");
-
- if (edx & 1 << 1) {
- printk("frequency");
- can_scale_bus = 1;
- }
-
- if ((edx & (1 << 1 | 1 << 2)) == 0x6)
- printk(" and ");
-
- if (edx & 1 << 2) {
- printk("voltage");
- can_scale_vid = 1;
- }
-
- printk(".\n");
- return 1;
-}
-
-#ifdef CONFIG_X86_POWERNOW_K7_ACPI
-static void invalidate_entry(unsigned int entry)
-{
- powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID;
-}
-#endif
-
-static int get_ranges(unsigned char *pst)
-{
- unsigned int j;
- unsigned int speed;
- u8 fid, vid;
-
- powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) *
- (number_scales + 1)), GFP_KERNEL);
- if (!powernow_table)
- return -ENOMEM;
-
- for (j = 0 ; j < number_scales; j++) {
- fid = *pst++;
-
- powernow_table[j].frequency = (fsb * fid_codes[fid]) / 10;
- powernow_table[j].index = fid; /* lower 8 bits */
-
- speed = powernow_table[j].frequency;
-
- if ((fid_codes[fid] % 10) == 5) {
-#ifdef CONFIG_X86_POWERNOW_K7_ACPI
- if (have_a0 == 1)
- invalidate_entry(j);
-#endif
- }
-
- if (speed < minimum_speed)
- minimum_speed = speed;
- if (speed > maximum_speed)
- maximum_speed = speed;
-
- vid = *pst++;
- powernow_table[j].index |= (vid << 8); /* upper 8 bits */
-
- dprintk(" FID: 0x%x (%d.%dx [%dMHz]) "
- "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10,
- fid_codes[fid] % 10, speed/1000, vid,
- mobile_vid_table[vid]/1000,
- mobile_vid_table[vid]%1000);
- }
- powernow_table[number_scales].frequency = CPUFREQ_TABLE_END;
- powernow_table[number_scales].index = 0;
-
- return 0;
-}
-
-
-static void change_FID(int fid)
-{
- union msr_fidvidctl fidvidctl;
-
- rdmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
- if (fidvidctl.bits.FID != fid) {
- fidvidctl.bits.SGTC = latency;
- fidvidctl.bits.FID = fid;
- fidvidctl.bits.VIDC = 0;
- fidvidctl.bits.FIDC = 1;
- wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
- }
-}
-
-
-static void change_VID(int vid)
-{
- union msr_fidvidctl fidvidctl;
-
- rdmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
- if (fidvidctl.bits.VID != vid) {
- fidvidctl.bits.SGTC = latency;
- fidvidctl.bits.VID = vid;
- fidvidctl.bits.FIDC = 0;
- fidvidctl.bits.VIDC = 1;
- wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
- }
-}
-
-
-static void change_speed(unsigned int index)
-{
- u8 fid, vid;
- struct cpufreq_freqs freqs;
- union msr_fidvidstatus fidvidstatus;
- int cfid;
-
- /* fid are the lower 8 bits of the index we stored into
- * the cpufreq frequency table in powernow_decode_bios,
- * vid are the upper 8 bits.
- */
-
- fid = powernow_table[index].index & 0xFF;
- vid = (powernow_table[index].index & 0xFF00) >> 8;
-
- freqs.cpu = 0;
-
- rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
- cfid = fidvidstatus.bits.CFID;
- freqs.old = fsb * fid_codes[cfid] / 10;
-
- freqs.new = powernow_table[index].frequency;
-
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-
- /* Now do the magic poking into the MSRs. */
-
- if (have_a0 == 1) /* A0 errata 5 */
- local_irq_disable();
-
- if (freqs.old > freqs.new) {
- /* Going down, so change FID first */
- change_FID(fid);
- change_VID(vid);
- } else {
- /* Going up, so change VID first */
- change_VID(vid);
- change_FID(fid);
- }
-
-
- if (have_a0 == 1)
- local_irq_enable();
-
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-}
-
-
-#ifdef CONFIG_X86_POWERNOW_K7_ACPI
-
-static struct acpi_processor_performance *acpi_processor_perf;
-
-static int powernow_acpi_init(void)
-{
- int i;
- int retval = 0;
- union powernow_acpi_control_t pc;
-
- if (acpi_processor_perf != NULL && powernow_table != NULL) {
- retval = -EINVAL;
- goto err0;
- }
-
- acpi_processor_perf = kzalloc(sizeof(struct acpi_processor_performance),
- GFP_KERNEL);
- if (!acpi_processor_perf) {
- retval = -ENOMEM;
- goto err0;
- }
-
- if (!zalloc_cpumask_var(&acpi_processor_perf->shared_cpu_map,
- GFP_KERNEL)) {
- retval = -ENOMEM;
- goto err05;
- }
-
- if (acpi_processor_register_performance(acpi_processor_perf, 0)) {
- retval = -EIO;
- goto err1;
- }
-
- if (acpi_processor_perf->control_register.space_id !=
- ACPI_ADR_SPACE_FIXED_HARDWARE) {
- retval = -ENODEV;
- goto err2;
- }
-
- if (acpi_processor_perf->status_register.space_id !=
- ACPI_ADR_SPACE_FIXED_HARDWARE) {
- retval = -ENODEV;
- goto err2;
- }
-
- number_scales = acpi_processor_perf->state_count;
-
- if (number_scales < 2) {
- retval = -ENODEV;
- goto err2;
- }
-
- powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) *
- (number_scales + 1)), GFP_KERNEL);
- if (!powernow_table) {
- retval = -ENOMEM;
- goto err2;
- }
-
- pc.val = (unsigned long) acpi_processor_perf->states[0].control;
- for (i = 0; i < number_scales; i++) {
- u8 fid, vid;
- struct acpi_processor_px *state =
- &acpi_processor_perf->states[i];
- unsigned int speed, speed_mhz;
-
- pc.val = (unsigned long) state->control;
- dprintk("acpi: P%d: %d MHz %d mW %d uS control %08x SGTC %d\n",
- i,
- (u32) state->core_frequency,
- (u32) state->power,
- (u32) state->transition_latency,
- (u32) state->control,
- pc.bits.sgtc);
-
- vid = pc.bits.vid;
- fid = pc.bits.fid;
-
- powernow_table[i].frequency = fsb * fid_codes[fid] / 10;
- powernow_table[i].index = fid; /* lower 8 bits */
- powernow_table[i].index |= (vid << 8); /* upper 8 bits */
-
- speed = powernow_table[i].frequency;
- speed_mhz = speed / 1000;
-
- /* processor_perflib will multiply the MHz value by 1000 to
- * get a KHz value (e.g. 1266000). However, powernow-k7 works
- * with true KHz values (e.g. 1266768). To ensure that all
- * powernow frequencies are available, we must ensure that
- * ACPI doesn't restrict them, so we round up the MHz value
- * to ensure that perflib's computed KHz value is greater than
- * or equal to powernow's KHz value.
- */
- if (speed % 1000 > 0)
- speed_mhz++;
-
- if ((fid_codes[fid] % 10) == 5) {
- if (have_a0 == 1)
- invalidate_entry(i);
- }
-
- dprintk(" FID: 0x%x (%d.%dx [%dMHz]) "
- "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10,
- fid_codes[fid] % 10, speed_mhz, vid,
- mobile_vid_table[vid]/1000,
- mobile_vid_table[vid]%1000);
-
- if (state->core_frequency != speed_mhz) {
- state->core_frequency = speed_mhz;
- dprintk(" Corrected ACPI frequency to %d\n",
- speed_mhz);
- }
-
- if (latency < pc.bits.sgtc)
- latency = pc.bits.sgtc;
-
- if (speed < minimum_speed)
- minimum_speed = speed;
- if (speed > maximum_speed)
- maximum_speed = speed;
- }
-
- powernow_table[i].frequency = CPUFREQ_TABLE_END;
- powernow_table[i].index = 0;
-
- /* notify BIOS that we exist */
- acpi_processor_notify_smm(THIS_MODULE);
-
- return 0;
-
-err2:
- acpi_processor_unregister_performance(acpi_processor_perf, 0);
-err1:
- free_cpumask_var(acpi_processor_perf->shared_cpu_map);
-err05:
- kfree(acpi_processor_perf);
-err0:
- printk(KERN_WARNING PFX "ACPI perflib can not be used on "
- "this platform\n");
- acpi_processor_perf = NULL;
- return retval;
-}
-#else
-static int powernow_acpi_init(void)
-{
- printk(KERN_INFO PFX "no support for ACPI processor found."
- " Please recompile your kernel with ACPI processor\n");
- return -EINVAL;
-}
-#endif
-
-static void print_pst_entry(struct pst_s *pst, unsigned int j)
-{
- dprintk("PST:%d (@%p)\n", j, pst);
- dprintk(" cpuid: 0x%x fsb: %d maxFID: 0x%x startvid: 0x%x\n",
- pst->cpuid, pst->fsbspeed, pst->maxfid, pst->startvid);
-}
-
-static int powernow_decode_bios(int maxfid, int startvid)
-{
- struct psb_s *psb;
- struct pst_s *pst;
- unsigned int i, j;
- unsigned char *p;
- unsigned int etuple;
- unsigned int ret;
-
- etuple = cpuid_eax(0x80000001);
-
- for (i = 0xC0000; i < 0xffff0 ; i += 16) {
-
- p = phys_to_virt(i);
-
- if (memcmp(p, "AMDK7PNOW!", 10) == 0) {
- dprintk("Found PSB header at %p\n", p);
- psb = (struct psb_s *) p;
- dprintk("Table version: 0x%x\n", psb->tableversion);
- if (psb->tableversion != 0x12) {
- printk(KERN_INFO PFX "Sorry, only v1.2 tables"
- " supported right now\n");
- return -ENODEV;
- }
-
- dprintk("Flags: 0x%x\n", psb->flags);
- if ((psb->flags & 1) == 0)
- dprintk("Mobile voltage regulator\n");
- else
- dprintk("Desktop voltage regulator\n");
-
- latency = psb->settlingtime;
- if (latency < 100) {
- printk(KERN_INFO PFX "BIOS set settling time "
- "to %d microseconds. "
- "Should be at least 100. "
- "Correcting.\n", latency);
- latency = 100;
- }
- dprintk("Settling Time: %d microseconds.\n",
- psb->settlingtime);
- dprintk("Has %d PST tables. (Only dumping ones "
- "relevant to this CPU).\n",
- psb->numpst);
-
- p += sizeof(struct psb_s);
-
- pst = (struct pst_s *) p;
-
- for (j = 0; j < psb->numpst; j++) {
- pst = (struct pst_s *) p;
- number_scales = pst->numpstates;
-
- if ((etuple == pst->cpuid) &&
- check_fsb(pst->fsbspeed) &&
- (maxfid == pst->maxfid) &&
- (startvid == pst->startvid)) {
- print_pst_entry(pst, j);
- p = (char *)pst + sizeof(struct pst_s);
- ret = get_ranges(p);
- return ret;
- } else {
- unsigned int k;
- p = (char *)pst + sizeof(struct pst_s);
- for (k = 0; k < number_scales; k++)
- p += 2;
- }
- }
- printk(KERN_INFO PFX "No PST tables match this cpuid "
- "(0x%x)\n", etuple);
- printk(KERN_INFO PFX "This is indicative of a broken "
- "BIOS.\n");
-
- return -EINVAL;
- }
- p++;
- }
-
- return -ENODEV;
-}
-
-
-static int powernow_target(struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
-{
- unsigned int newstate;
-
- if (cpufreq_frequency_table_target(policy, powernow_table, target_freq,
- relation, &newstate))
- return -EINVAL;
-
- change_speed(newstate);
-
- return 0;
-}
-
-
-static int powernow_verify(struct cpufreq_policy *policy)
-{
- return cpufreq_frequency_table_verify(policy, powernow_table);
-}
-
-/*
- * We use the fact that the bus frequency is always a multiple of
- * 100000/3 kHz, and compute SGTC according to that multiple.
- * That way we more closely match how AMD intends this to work,
- * and get the same kind of behaviour already tested under
- * the "well-known" other OS.
- */
-static int __init fixup_sgtc(void)
-{
- unsigned int sgtc;
- unsigned int m;
-
- m = fsb / 3333;
- if ((m % 10) >= 5)
- m += 5;
-
- m /= 10;
-
- sgtc = 100 * m * latency;
- sgtc = sgtc / 3;
- if (sgtc > 0xfffff) {
- printk(KERN_WARNING PFX "SGTC too large %d\n", sgtc);
- sgtc = 0xfffff;
- }
- return sgtc;
-}
-
-static unsigned int powernow_get(unsigned int cpu)
-{
- union msr_fidvidstatus fidvidstatus;
- unsigned int cfid;
-
- if (cpu)
- return 0;
- rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
- cfid = fidvidstatus.bits.CFID;
-
- return fsb * fid_codes[cfid] / 10;
-}
-
-
-static int __init acer_cpufreq_pst(const struct dmi_system_id *d)
-{
- printk(KERN_WARNING PFX
- "%s laptop with broken PST tables in BIOS detected.\n",
- d->ident);
- printk(KERN_WARNING PFX
- "You need to downgrade to 3A21 (09/09/2002), or try a newer "
- "BIOS than 3A71 (01/20/2003)\n");
- printk(KERN_WARNING PFX
- "cpufreq scaling has been disabled as a result of this.\n");
- return 0;
-}
-
-/*
- * Some Athlon laptops have really broken PST tables.
- * A BIOS update is all that can save them.
- * Mention this, and disable cpufreq.
- */
-static struct dmi_system_id __initdata powernow_dmi_table[] = {
- {
- .callback = acer_cpufreq_pst,
- .ident = "Acer Aspire",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Insyde Software"),
- DMI_MATCH(DMI_BIOS_VERSION, "3A71"),
- },
- },
- { }
-};
-
-static int __init powernow_cpu_init(struct cpufreq_policy *policy)
-{
- union msr_fidvidstatus fidvidstatus;
- int result;
-
- if (policy->cpu != 0)
- return -ENODEV;
-
- rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
-
- recalibrate_cpu_khz();
-
- fsb = (10 * cpu_khz) / fid_codes[fidvidstatus.bits.CFID];
- if (!fsb) {
- printk(KERN_WARNING PFX "can not determine bus frequency\n");
- return -EINVAL;
- }
- dprintk("FSB: %3dMHz\n", fsb/1000);
-
- if (dmi_check_system(powernow_dmi_table) || acpi_force) {
- printk(KERN_INFO PFX "PSB/PST known to be broken. "
- "Trying ACPI instead\n");
- result = powernow_acpi_init();
- } else {
- result = powernow_decode_bios(fidvidstatus.bits.MFID,
- fidvidstatus.bits.SVID);
- if (result) {
- printk(KERN_INFO PFX "Trying ACPI perflib\n");
- maximum_speed = 0;
- minimum_speed = -1;
- latency = 0;
- result = powernow_acpi_init();
- if (result) {
- printk(KERN_INFO PFX
- "ACPI and legacy methods failed\n");
- }
- } else {
- /* SGTC uses the bus clock as timer */
- latency = fixup_sgtc();
- printk(KERN_INFO PFX "SGTC: %d\n", latency);
- }
- }
-
- if (result)
- return result;
-
- printk(KERN_INFO PFX "Minimum speed %d MHz. Maximum speed %d MHz.\n",
- minimum_speed/1000, maximum_speed/1000);
-
- policy->cpuinfo.transition_latency =
- cpufreq_scale(2000000UL, fsb, latency);
-
- policy->cur = powernow_get(0);
-
- cpufreq_frequency_table_get_attr(powernow_table, policy->cpu);
-
- return cpufreq_frequency_table_cpuinfo(policy, powernow_table);
-}
-
-static int powernow_cpu_exit(struct cpufreq_policy *policy)
-{
- cpufreq_frequency_table_put_attr(policy->cpu);
-
-#ifdef CONFIG_X86_POWERNOW_K7_ACPI
- if (acpi_processor_perf) {
- acpi_processor_unregister_performance(acpi_processor_perf, 0);
- free_cpumask_var(acpi_processor_perf->shared_cpu_map);
- kfree(acpi_processor_perf);
- }
-#endif
-
- kfree(powernow_table);
- return 0;
-}
-
-static struct freq_attr *powernow_table_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-static struct cpufreq_driver powernow_driver = {
- .verify = powernow_verify,
- .target = powernow_target,
- .get = powernow_get,
- .init = powernow_cpu_init,
- .exit = powernow_cpu_exit,
- .name = "powernow-k7",
- .owner = THIS_MODULE,
- .attr = powernow_table_attr,
-};
-
-static int __init powernow_init(void)
-{
- if (check_powernow() == 0)
- return -ENODEV;
- return cpufreq_register_driver(&powernow_driver);
-}
-
-
-static void __exit powernow_exit(void)
-{
- cpufreq_unregister_driver(&powernow_driver);
-}
-
-module_param(acpi_force, int, 0444);
-MODULE_PARM_DESC(acpi_force, "Force ACPI to be used.");
-
-MODULE_AUTHOR("Dave Jones <davej@redhat.com>");
-MODULE_DESCRIPTION("Powernow driver for AMD K7 processors.");
-MODULE_LICENSE("GPL");
-
-late_initcall(powernow_init);
-module_exit(powernow_exit);
-
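Editorial aside, not part of the removed file: powernow-k7 packs each operating point's FID into the low 8 bits of the cpufreq table index and the VID into the high 8 bits (see get_ranges() and change_speed() above). A minimal, self-contained sketch of that encoding, using illustrative helper names:

#include <stdint.h>

/* Sketch only: mirrors how powernow-k7 stores fid/vid in a table index. */
static inline uint32_t pack_fid_vid(uint8_t fid, uint8_t vid)
{
	return (uint32_t)fid | ((uint32_t)vid << 8);	/* fid: bits 7:0, vid: bits 15:8 */
}

static inline void unpack_fid_vid(uint32_t index, uint8_t *fid, uint8_t *vid)
{
	*fid = index & 0xFF;
	*vid = (index & 0xFF00) >> 8;
}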
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
deleted file mode 100644
index 35fb4eaf6e1c..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * (C) 2003 Dave Jones.
- *
- * Licensed under the terms of the GNU GPL License version 2.
- *
- * AMD-specific information
- *
- */
-
-union msr_fidvidctl {
- struct {
- unsigned FID:5, // 4:0
- reserved1:3, // 7:5
- VID:5, // 12:8
- reserved2:3, // 15:13
- FIDC:1, // 16
- VIDC:1, // 17
- reserved3:2, // 19:18
- FIDCHGRATIO:1, // 20
- reserved4:11, // 31:21
- SGTC:20, // 51:32
- reserved5:12; // 63:52
- } bits;
- unsigned long long val;
-};
-
-union msr_fidvidstatus {
- struct {
- unsigned CFID:5, // 4:0
- reserved1:3, // 7:5
- SFID:5, // 12:8
- reserved2:3, // 15:13
- MFID:5, // 20:16
- reserved3:11, // 31:21
- CVID:5, // 36:32
- reserved4:3, // 39:37
- SVID:5, // 44:40
- reserved5:3, // 47:45
- MVID:5, // 52:48
- reserved6:11; // 63:53
- } bits;
- unsigned long long val;
-};
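Editorial aside, not part of the removed header: the driver derives the current core clock by reading MSR_K7_FID_VID_STATUS through this union and scaling the bus speed by the multiplier encoded in CFID, i.e. fsb * fid_codes[cfid] / 10 as in powernow_get(). A user-space sketch of just that arithmetic, with the input values assumed for illustration:

#include <stdio.h>

/* fid_codes[] as in powernow-k7.c: frequency multiplier * 10 per FID. */
static const int fid_codes[32] = {
	110, 115, 120, 125,  50,  55,  60,  65,
	 70,  75,  80,  85,  90,  95, 100, 105,
	 30, 190,  40, 200, 130, 135, 140, 210,
	150, 225, 160, 165, 170, 180,  -1,  -1,
};

int main(void)
{
	unsigned int fsb = 100000;	/* bus frequency in kHz (assumed sample value) */
	unsigned int cfid = 12;		/* CFID field as it would be read from the MSR */

	printf("current freq: %u kHz\n", fsb * fid_codes[cfid] / 10);
	return 0;
}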
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
deleted file mode 100644
index 2a50ef891000..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ /dev/null
@@ -1,1463 +0,0 @@
-
-/*
- * (c) 2003-2006 Advanced Micro Devices, Inc.
- * Your use of this code is subject to the terms and conditions of the
- * GNU general public license version 2. See "COPYING" or
- * http://www.gnu.org/licenses/gpl.html
- *
- * Support : mark.langsdorf@amd.com
- *
- * Based on the powernow-k7.c module written by Dave Jones.
- * (C) 2003 Dave Jones on behalf of SuSE Labs
- * (C) 2004 Dominik Brodowski <linux@brodo.de>
- * (C) 2004 Pavel Machek <pavel@suse.cz>
- * Licensed under the terms of the GNU GPL License version 2.
- * Based upon datasheets & sample CPUs kindly provided by AMD.
- *
- * Valuable input gratefully received from Dave Jones, Pavel Machek,
- * Dominik Brodowski, Jacob Shin, and others.
- * Originally developed by Paul Devriendt.
- * Processor information obtained from Chapter 9 (Power and Thermal Management)
- * of the "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD
- * Opteron Processors" available for download from www.amd.com
- *
- * Tables for specific CPUs can be inferred from
- * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/30430.pdf
- */
-
-#include <linux/kernel.h>
-#include <linux/smp.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/cpumask.h>
-#include <linux/sched.h> /* for current / set_cpus_allowed() */
-#include <linux/io.h>
-#include <linux/delay.h>
-
-#include <asm/msr.h>
-
-#include <linux/acpi.h>
-#include <linux/mutex.h>
-#include <acpi/processor.h>
-
-#define PFX "powernow-k8: "
-#define VERSION "version 2.20.00"
-#include "powernow-k8.h"
-
-/* serialize freq changes */
-static DEFINE_MUTEX(fidvid_mutex);
-
-static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data);
-
-static int cpu_family = CPU_OPTERON;
-
-#ifndef CONFIG_SMP
-static inline const struct cpumask *cpu_core_mask(int cpu)
-{
- return cpumask_of(0);
-}
-#endif
-
-/* Return a frequency in MHz, given an input fid */
-static u32 find_freq_from_fid(u32 fid)
-{
- return 800 + (fid * 100);
-}
-
-/* Return a frequency in KHz, given an input fid */
-static u32 find_khz_freq_from_fid(u32 fid)
-{
- return 1000 * find_freq_from_fid(fid);
-}
-
-static u32 find_khz_freq_from_pstate(struct cpufreq_frequency_table *data,
- u32 pstate)
-{
- return data[pstate].frequency;
-}
-
-/* Return the vco fid for an input fid
- *
- * Each "low" fid has corresponding "high" fid, and you can get to "low" fids
- * only from corresponding high fids. This returns "high" fid corresponding to
- * "low" one.
- */
-static u32 convert_fid_to_vco_fid(u32 fid)
-{
- if (fid < HI_FID_TABLE_BOTTOM)
- return 8 + (2 * fid);
- else
- return fid;
-}
-
-/*
- * Return 1 if the pending bit is set. Unless we just instructed the processor
- * to transition to a new state, seeing this bit set is really bad news.
- */
-static int pending_bit_stuck(void)
-{
- u32 lo, hi;
-
- if (cpu_family == CPU_HW_PSTATE)
- return 0;
-
- rdmsr(MSR_FIDVID_STATUS, lo, hi);
- return lo & MSR_S_LO_CHANGE_PENDING ? 1 : 0;
-}
-
-/*
- * Update the global current fid / vid values from the status msr.
- * Returns 1 on error.
- */
-static int query_current_values_with_pending_wait(struct powernow_k8_data *data)
-{
- u32 lo, hi;
- u32 i = 0;
-
- if (cpu_family == CPU_HW_PSTATE) {
- rdmsr(MSR_PSTATE_STATUS, lo, hi);
- i = lo & HW_PSTATE_MASK;
- data->currpstate = i;
-
- /*
- * a workaround for family 11h erratum 311 might cause
- * an "out-of-range Pstate if the core is in Pstate-0
- */
- if ((boot_cpu_data.x86 == 0x11) && (i >= data->numps))
- data->currpstate = HW_PSTATE_0;
-
- return 0;
- }
- do {
- if (i++ > 10000) {
- dprintk("detected change pending stuck\n");
- return 1;
- }
- rdmsr(MSR_FIDVID_STATUS, lo, hi);
- } while (lo & MSR_S_LO_CHANGE_PENDING);
-
- data->currvid = hi & MSR_S_HI_CURRENT_VID;
- data->currfid = lo & MSR_S_LO_CURRENT_FID;
-
- return 0;
-}
-
-/* the isochronous relief time */
-static void count_off_irt(struct powernow_k8_data *data)
-{
- udelay((1 << data->irt) * 10);
- return;
-}
-
-/* the voltage stabilization time */
-static void count_off_vst(struct powernow_k8_data *data)
-{
- udelay(data->vstable * VST_UNITS_20US);
- return;
-}
-
-/* need to init the control msr to a safe value (for each cpu) */
-static void fidvid_msr_init(void)
-{
- u32 lo, hi;
- u8 fid, vid;
-
- rdmsr(MSR_FIDVID_STATUS, lo, hi);
- vid = hi & MSR_S_HI_CURRENT_VID;
- fid = lo & MSR_S_LO_CURRENT_FID;
- lo = fid | (vid << MSR_C_LO_VID_SHIFT);
- hi = MSR_C_HI_STP_GNT_BENIGN;
- dprintk("cpu%d, init lo 0x%x, hi 0x%x\n", smp_processor_id(), lo, hi);
- wrmsr(MSR_FIDVID_CTL, lo, hi);
-}
-
-/* write the new fid value along with the other control fields to the msr */
-static int write_new_fid(struct powernow_k8_data *data, u32 fid)
-{
- u32 lo;
- u32 savevid = data->currvid;
- u32 i = 0;
-
- if ((fid & INVALID_FID_MASK) || (data->currvid & INVALID_VID_MASK)) {
- printk(KERN_ERR PFX "internal error - overflow on fid write\n");
- return 1;
- }
-
- lo = fid;
- lo |= (data->currvid << MSR_C_LO_VID_SHIFT);
- lo |= MSR_C_LO_INIT_FID_VID;
-
- dprintk("writing fid 0x%x, lo 0x%x, hi 0x%x\n",
- fid, lo, data->plllock * PLL_LOCK_CONVERSION);
-
- do {
- wrmsr(MSR_FIDVID_CTL, lo, data->plllock * PLL_LOCK_CONVERSION);
- if (i++ > 100) {
- printk(KERN_ERR PFX
- "Hardware error - pending bit very stuck - "
- "no further pstate changes possible\n");
- return 1;
- }
- } while (query_current_values_with_pending_wait(data));
-
- count_off_irt(data);
-
- if (savevid != data->currvid) {
- printk(KERN_ERR PFX
- "vid change on fid trans, old 0x%x, new 0x%x\n",
- savevid, data->currvid);
- return 1;
- }
-
- if (fid != data->currfid) {
- printk(KERN_ERR PFX
- "fid trans failed, fid 0x%x, curr 0x%x\n", fid,
- data->currfid);
- return 1;
- }
-
- return 0;
-}
-
-/* Write a new vid to the hardware */
-static int write_new_vid(struct powernow_k8_data *data, u32 vid)
-{
- u32 lo;
- u32 savefid = data->currfid;
- int i = 0;
-
- if ((data->currfid & INVALID_FID_MASK) || (vid & INVALID_VID_MASK)) {
- printk(KERN_ERR PFX "internal error - overflow on vid write\n");
- return 1;
- }
-
- lo = data->currfid;
- lo |= (vid << MSR_C_LO_VID_SHIFT);
- lo |= MSR_C_LO_INIT_FID_VID;
-
- dprintk("writing vid 0x%x, lo 0x%x, hi 0x%x\n",
- vid, lo, STOP_GRANT_5NS);
-
- do {
- wrmsr(MSR_FIDVID_CTL, lo, STOP_GRANT_5NS);
- if (i++ > 100) {
- printk(KERN_ERR PFX "internal error - pending bit "
- "very stuck - no further pstate "
- "changes possible\n");
- return 1;
- }
- } while (query_current_values_with_pending_wait(data));
-
- if (savefid != data->currfid) {
- printk(KERN_ERR PFX "fid changed on vid trans, old "
- "0x%x new 0x%x\n",
- savefid, data->currfid);
- return 1;
- }
-
- if (vid != data->currvid) {
- printk(KERN_ERR PFX "vid trans failed, vid 0x%x, "
- "curr 0x%x\n",
- vid, data->currvid);
- return 1;
- }
-
- return 0;
-}
-
-/*
- * Reduce the vid by the max of step or reqvid.
- * Decreasing vid codes represent increasing voltages:
- * vid of 0 is 1.550V, vid of 0x1e is 0.800V, vid of VID_OFF is off.
- */
-static int decrease_vid_code_by_step(struct powernow_k8_data *data,
- u32 reqvid, u32 step)
-{
- if ((data->currvid - reqvid) > step)
- reqvid = data->currvid - step;
-
- if (write_new_vid(data, reqvid))
- return 1;
-
- count_off_vst(data);
-
- return 0;
-}
-
-/* Change hardware pstate by single MSR write */
-static int transition_pstate(struct powernow_k8_data *data, u32 pstate)
-{
- wrmsr(MSR_PSTATE_CTRL, pstate, 0);
- data->currpstate = pstate;
- return 0;
-}
-
-/* Change Opteron/Athlon64 fid and vid, by the 3 phases. */
-static int transition_fid_vid(struct powernow_k8_data *data,
- u32 reqfid, u32 reqvid)
-{
- if (core_voltage_pre_transition(data, reqvid, reqfid))
- return 1;
-
- if (core_frequency_transition(data, reqfid))
- return 1;
-
- if (core_voltage_post_transition(data, reqvid))
- return 1;
-
- if (query_current_values_with_pending_wait(data))
- return 1;
-
- if ((reqfid != data->currfid) || (reqvid != data->currvid)) {
- printk(KERN_ERR PFX "failed (cpu%d): req 0x%x 0x%x, "
- "curr 0x%x 0x%x\n",
- smp_processor_id(),
- reqfid, reqvid, data->currfid, data->currvid);
- return 1;
- }
-
- dprintk("transitioned (cpu%d): new fid 0x%x, vid 0x%x\n",
- smp_processor_id(), data->currfid, data->currvid);
-
- return 0;
-}
-
-/* Phase 1 - core voltage transition ... setup voltage */
-static int core_voltage_pre_transition(struct powernow_k8_data *data,
- u32 reqvid, u32 reqfid)
-{
- u32 rvosteps = data->rvo;
- u32 savefid = data->currfid;
- u32 maxvid, lo, rvomult = 1;
-
- dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, "
- "reqvid 0x%x, rvo 0x%x\n",
- smp_processor_id(),
- data->currfid, data->currvid, reqvid, data->rvo);
-
- if ((savefid < LO_FID_TABLE_TOP) && (reqfid < LO_FID_TABLE_TOP))
- rvomult = 2;
- rvosteps *= rvomult;
- rdmsr(MSR_FIDVID_STATUS, lo, maxvid);
- maxvid = 0x1f & (maxvid >> 16);
- dprintk("ph1 maxvid=0x%x\n", maxvid);
- if (reqvid < maxvid) /* lower numbers are higher voltages */
- reqvid = maxvid;
-
- while (data->currvid > reqvid) {
- dprintk("ph1: curr 0x%x, req vid 0x%x\n",
- data->currvid, reqvid);
- if (decrease_vid_code_by_step(data, reqvid, data->vidmvs))
- return 1;
- }
-
- while ((rvosteps > 0) &&
- ((rvomult * data->rvo + data->currvid) > reqvid)) {
- if (data->currvid == maxvid) {
- rvosteps = 0;
- } else {
- dprintk("ph1: changing vid for rvo, req 0x%x\n",
- data->currvid - 1);
- if (decrease_vid_code_by_step(data, data->currvid-1, 1))
- return 1;
- rvosteps--;
- }
- }
-
- if (query_current_values_with_pending_wait(data))
- return 1;
-
- if (savefid != data->currfid) {
- printk(KERN_ERR PFX "ph1 err, currfid changed 0x%x\n",
- data->currfid);
- return 1;
- }
-
- dprintk("ph1 complete, currfid 0x%x, currvid 0x%x\n",
- data->currfid, data->currvid);
-
- return 0;
-}
-
-/* Phase 2 - core frequency transition */
-static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid)
-{
- u32 vcoreqfid, vcocurrfid, vcofiddiff;
- u32 fid_interval, savevid = data->currvid;
-
- if (data->currfid == reqfid) {
- printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n",
- data->currfid);
- return 0;
- }
-
- dprintk("ph2 (cpu%d): starting, currfid 0x%x, currvid 0x%x, "
- "reqfid 0x%x\n",
- smp_processor_id(),
- data->currfid, data->currvid, reqfid);
-
- vcoreqfid = convert_fid_to_vco_fid(reqfid);
- vcocurrfid = convert_fid_to_vco_fid(data->currfid);
- vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid
- : vcoreqfid - vcocurrfid;
-
- if ((reqfid <= LO_FID_TABLE_TOP) && (data->currfid <= LO_FID_TABLE_TOP))
- vcofiddiff = 0;
-
- while (vcofiddiff > 2) {
- (data->currfid & 1) ? (fid_interval = 1) : (fid_interval = 2);
-
- if (reqfid > data->currfid) {
- if (data->currfid > LO_FID_TABLE_TOP) {
- if (write_new_fid(data,
- data->currfid + fid_interval))
- return 1;
- } else {
- if (write_new_fid
- (data,
- 2 + convert_fid_to_vco_fid(data->currfid)))
- return 1;
- }
- } else {
- if (write_new_fid(data, data->currfid - fid_interval))
- return 1;
- }
-
- vcocurrfid = convert_fid_to_vco_fid(data->currfid);
- vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid
- : vcoreqfid - vcocurrfid;
- }
-
- if (write_new_fid(data, reqfid))
- return 1;
-
- if (query_current_values_with_pending_wait(data))
- return 1;
-
- if (data->currfid != reqfid) {
- printk(KERN_ERR PFX
- "ph2: mismatch, failed fid transition, "
- "curr 0x%x, req 0x%x\n",
- data->currfid, reqfid);
- return 1;
- }
-
- if (savevid != data->currvid) {
- printk(KERN_ERR PFX "ph2: vid changed, save 0x%x, curr 0x%x\n",
- savevid, data->currvid);
- return 1;
- }
-
- dprintk("ph2 complete, currfid 0x%x, currvid 0x%x\n",
- data->currfid, data->currvid);
-
- return 0;
-}
-
-/* Phase 3 - core voltage transition flow ... jump to the final vid. */
-static int core_voltage_post_transition(struct powernow_k8_data *data,
- u32 reqvid)
-{
- u32 savefid = data->currfid;
- u32 savereqvid = reqvid;
-
- dprintk("ph3 (cpu%d): starting, currfid 0x%x, currvid 0x%x\n",
- smp_processor_id(),
- data->currfid, data->currvid);
-
- if (reqvid != data->currvid) {
- if (write_new_vid(data, reqvid))
- return 1;
-
- if (savefid != data->currfid) {
- printk(KERN_ERR PFX
- "ph3: bad fid change, save 0x%x, curr 0x%x\n",
- savefid, data->currfid);
- return 1;
- }
-
- if (data->currvid != reqvid) {
- printk(KERN_ERR PFX
- "ph3: failed vid transition\n, "
- "req 0x%x, curr 0x%x",
- reqvid, data->currvid);
- return 1;
- }
- }
-
- if (query_current_values_with_pending_wait(data))
- return 1;
-
- if (savereqvid != data->currvid) {
- dprintk("ph3 failed, currvid 0x%x\n", data->currvid);
- return 1;
- }
-
- if (savefid != data->currfid) {
- dprintk("ph3 failed, currfid changed 0x%x\n",
- data->currfid);
- return 1;
- }
-
- dprintk("ph3 complete, currfid 0x%x, currvid 0x%x\n",
- data->currfid, data->currvid);
-
- return 0;
-}
-
-static void check_supported_cpu(void *_rc)
-{
- u32 eax, ebx, ecx, edx;
- int *rc = _rc;
-
- *rc = -ENODEV;
-
- if (current_cpu_data.x86_vendor != X86_VENDOR_AMD)
- return;
-
- eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
- if (((eax & CPUID_XFAM) != CPUID_XFAM_K8) &&
- ((eax & CPUID_XFAM) < CPUID_XFAM_10H))
- return;
-
- if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) {
- if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) ||
- ((eax & CPUID_XMOD) > CPUID_XMOD_REV_MASK)) {
- printk(KERN_INFO PFX
- "Processor cpuid %x not supported\n", eax);
- return;
- }
-
- eax = cpuid_eax(CPUID_GET_MAX_CAPABILITIES);
- if (eax < CPUID_FREQ_VOLT_CAPABILITIES) {
- printk(KERN_INFO PFX
- "No frequency change capabilities detected\n");
- return;
- }
-
- cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
- if ((edx & P_STATE_TRANSITION_CAPABLE)
- != P_STATE_TRANSITION_CAPABLE) {
- printk(KERN_INFO PFX
- "Power state transitions not supported\n");
- return;
- }
- } else { /* must be a HW Pstate capable processor */
- cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
- if ((edx & USE_HW_PSTATE) == USE_HW_PSTATE)
- cpu_family = CPU_HW_PSTATE;
- else
- return;
- }
-
- *rc = 0;
-}
-
-static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst,
- u8 maxvid)
-{
- unsigned int j;
- u8 lastfid = 0xff;
-
- for (j = 0; j < data->numps; j++) {
- if (pst[j].vid > LEAST_VID) {
- printk(KERN_ERR FW_BUG PFX "vid %d invalid : 0x%x\n",
- j, pst[j].vid);
- return -EINVAL;
- }
- if (pst[j].vid < data->rvo) {
- /* vid + rvo >= 0 */
- printk(KERN_ERR FW_BUG PFX "0 vid exceeded with pstate"
- " %d\n", j);
- return -ENODEV;
- }
- if (pst[j].vid < maxvid + data->rvo) {
- /* vid + rvo >= maxvid */
- printk(KERN_ERR FW_BUG PFX "maxvid exceeded with pstate"
- " %d\n", j);
- return -ENODEV;
- }
- if (pst[j].fid > MAX_FID) {
- printk(KERN_ERR FW_BUG PFX "maxfid exceeded with pstate"
- " %d\n", j);
- return -ENODEV;
- }
- if (j && (pst[j].fid < HI_FID_TABLE_BOTTOM)) {
- /* Only first fid is allowed to be in "low" range */
- printk(KERN_ERR FW_BUG PFX "two low fids - %d : "
- "0x%x\n", j, pst[j].fid);
- return -EINVAL;
- }
- if (pst[j].fid < lastfid)
- lastfid = pst[j].fid;
- }
- if (lastfid & 1) {
- printk(KERN_ERR FW_BUG PFX "lastfid invalid\n");
- return -EINVAL;
- }
- if (lastfid > LO_FID_TABLE_TOP)
- printk(KERN_INFO FW_BUG PFX
- "first fid not from lo freq table\n");
-
- return 0;
-}
-
-static void invalidate_entry(struct powernow_k8_data *data, unsigned int entry)
-{
- data->powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID;
-}
-
-static void print_basics(struct powernow_k8_data *data)
-{
- int j;
- for (j = 0; j < data->numps; j++) {
- if (data->powernow_table[j].frequency !=
- CPUFREQ_ENTRY_INVALID) {
- if (cpu_family == CPU_HW_PSTATE) {
- printk(KERN_INFO PFX
- " %d : pstate %d (%d MHz)\n", j,
- data->powernow_table[j].index,
- data->powernow_table[j].frequency/1000);
- } else {
- printk(KERN_INFO PFX
- " %d : fid 0x%x (%d MHz), vid 0x%x\n",
- j,
- data->powernow_table[j].index & 0xff,
- data->powernow_table[j].frequency/1000,
- data->powernow_table[j].index >> 8);
- }
- }
- }
- if (data->batps)
- printk(KERN_INFO PFX "Only %d pstates on battery\n",
- data->batps);
-}
-
-static u32 freq_from_fid_did(u32 fid, u32 did)
-{
- u32 mhz = 0;
-
- if (boot_cpu_data.x86 == 0x10)
- mhz = (100 * (fid + 0x10)) >> did;
- else if (boot_cpu_data.x86 == 0x11)
- mhz = (100 * (fid + 8)) >> did;
- else
- BUG();
-
- return mhz * 1000;
-}
-
-static int fill_powernow_table(struct powernow_k8_data *data,
- struct pst_s *pst, u8 maxvid)
-{
- struct cpufreq_frequency_table *powernow_table;
- unsigned int j;
-
- if (data->batps) {
- /* use ACPI support to get full speed on mains power */
- printk(KERN_WARNING PFX
- "Only %d pstates usable (use ACPI driver for full "
- "range\n", data->batps);
- data->numps = data->batps;
- }
-
- for (j = 1; j < data->numps; j++) {
- if (pst[j-1].fid >= pst[j].fid) {
- printk(KERN_ERR PFX "PST out of sequence\n");
- return -EINVAL;
- }
- }
-
- if (data->numps < 2) {
- printk(KERN_ERR PFX "no p states to transition\n");
- return -ENODEV;
- }
-
- if (check_pst_table(data, pst, maxvid))
- return -EINVAL;
-
- powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table)
- * (data->numps + 1)), GFP_KERNEL);
- if (!powernow_table) {
- printk(KERN_ERR PFX "powernow_table memory alloc failure\n");
- return -ENOMEM;
- }
-
- for (j = 0; j < data->numps; j++) {
- int freq;
- powernow_table[j].index = pst[j].fid; /* lower 8 bits */
- powernow_table[j].index |= (pst[j].vid << 8); /* upper 8 bits */
- freq = find_khz_freq_from_fid(pst[j].fid);
- powernow_table[j].frequency = freq;
- }
- powernow_table[data->numps].frequency = CPUFREQ_TABLE_END;
- powernow_table[data->numps].index = 0;
-
- if (query_current_values_with_pending_wait(data)) {
- kfree(powernow_table);
- return -EIO;
- }
-
- dprintk("cfid 0x%x, cvid 0x%x\n", data->currfid, data->currvid);
- data->powernow_table = powernow_table;
- if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu)
- print_basics(data);
-
- for (j = 0; j < data->numps; j++)
- if ((pst[j].fid == data->currfid) &&
- (pst[j].vid == data->currvid))
- return 0;
-
- dprintk("currfid/vid do not match PST, ignoring\n");
- return 0;
-}
-
-/* Find and validate the PSB/PST table in BIOS. */
-static int find_psb_table(struct powernow_k8_data *data)
-{
- struct psb_s *psb;
- unsigned int i;
- u32 mvs;
- u8 maxvid;
- u32 cpst = 0;
- u32 thiscpuid;
-
- for (i = 0xc0000; i < 0xffff0; i += 0x10) {
- /* Scan BIOS looking for the signature. */
- /* It can not be at ffff0 - it is too big. */
-
- psb = phys_to_virt(i);
- if (memcmp(psb, PSB_ID_STRING, PSB_ID_STRING_LEN) != 0)
- continue;
-
- dprintk("found PSB header at 0x%p\n", psb);
-
- dprintk("table vers: 0x%x\n", psb->tableversion);
- if (psb->tableversion != PSB_VERSION_1_4) {
- printk(KERN_ERR FW_BUG PFX "PSB table is not v1.4\n");
- return -ENODEV;
- }
-
- dprintk("flags: 0x%x\n", psb->flags1);
- if (psb->flags1) {
- printk(KERN_ERR FW_BUG PFX "unknown flags\n");
- return -ENODEV;
- }
-
- data->vstable = psb->vstable;
- dprintk("voltage stabilization time: %d(*20us)\n",
- data->vstable);
-
- dprintk("flags2: 0x%x\n", psb->flags2);
- data->rvo = psb->flags2 & 3;
- data->irt = ((psb->flags2) >> 2) & 3;
- mvs = ((psb->flags2) >> 4) & 3;
- data->vidmvs = 1 << mvs;
- data->batps = ((psb->flags2) >> 6) & 3;
-
- dprintk("ramp voltage offset: %d\n", data->rvo);
- dprintk("isochronous relief time: %d\n", data->irt);
- dprintk("maximum voltage step: %d - 0x%x\n", mvs, data->vidmvs);
-
- dprintk("numpst: 0x%x\n", psb->num_tables);
- cpst = psb->num_tables;
- if ((psb->cpuid == 0x00000fc0) ||
- (psb->cpuid == 0x00000fe0)) {
- thiscpuid = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
- if ((thiscpuid == 0x00000fc0) ||
- (thiscpuid == 0x00000fe0))
- cpst = 1;
- }
- if (cpst != 1) {
- printk(KERN_ERR FW_BUG PFX "numpst must be 1\n");
- return -ENODEV;
- }
-
- data->plllock = psb->plllocktime;
- dprintk("plllocktime: 0x%x (units 1us)\n", psb->plllocktime);
- dprintk("maxfid: 0x%x\n", psb->maxfid);
- dprintk("maxvid: 0x%x\n", psb->maxvid);
- maxvid = psb->maxvid;
-
- data->numps = psb->numps;
- dprintk("numpstates: 0x%x\n", data->numps);
- return fill_powernow_table(data,
- (struct pst_s *)(psb+1), maxvid);
- }
- /*
- * If you see this message, complain to BIOS manufacturer. If
- * he tells you "we do not support Linux" or some similar
- * nonsense, remember that Windows 2000 uses the same legacy
- * mechanism that the old Linux PSB driver uses. Tell them it
- * is broken with Windows 2000.
- *
- * The reference to the AMD documentation is chapter 9 in the
- * BIOS and Kernel Developer's Guide, which is available on
- * www.amd.com
- */
- printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n");
- return -ENODEV;
-}
-
-static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data,
- unsigned int index)
-{
- acpi_integer control;
-
- if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE))
- return;
-
- control = data->acpi_data.states[index].control;
- data->irt = (control >> IRT_SHIFT) & IRT_MASK;
- data->rvo = (control >> RVO_SHIFT) & RVO_MASK;
- data->exttype = (control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK;
- data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK;
- data->vidmvs = 1 << ((control >> MVS_SHIFT) & MVS_MASK);
- data->vstable = (control >> VST_SHIFT) & VST_MASK;
-}
-
-static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
-{
- struct cpufreq_frequency_table *powernow_table;
- int ret_val = -ENODEV;
- acpi_integer control, status;
-
- if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {
- dprintk("register performance failed: bad ACPI data\n");
- return -EIO;
- }
-
- /* verify the data contained in the ACPI structures */
- if (data->acpi_data.state_count <= 1) {
- dprintk("No ACPI P-States\n");
- goto err_out;
- }
-
- control = data->acpi_data.control_register.space_id;
- status = data->acpi_data.status_register.space_id;
-
- if ((control != ACPI_ADR_SPACE_FIXED_HARDWARE) ||
- (status != ACPI_ADR_SPACE_FIXED_HARDWARE)) {
- dprintk("Invalid control/status registers (%x - %x)\n",
- control, status);
- goto err_out;
- }
-
- /* fill in data->powernow_table */
- powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table)
- * (data->acpi_data.state_count + 1)), GFP_KERNEL);
- if (!powernow_table) {
- dprintk("powernow_table memory alloc failure\n");
- goto err_out;
- }
-
- if (cpu_family == CPU_HW_PSTATE)
- ret_val = fill_powernow_table_pstate(data, powernow_table);
- else
- ret_val = fill_powernow_table_fidvid(data, powernow_table);
- if (ret_val)
- goto err_out_mem;
-
- powernow_table[data->acpi_data.state_count].frequency =
- CPUFREQ_TABLE_END;
- powernow_table[data->acpi_data.state_count].index = 0;
- data->powernow_table = powernow_table;
-
- /* fill in data */
- data->numps = data->acpi_data.state_count;
- if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu)
- print_basics(data);
- powernow_k8_acpi_pst_values(data, 0);
-
- /* notify BIOS that we exist */
- acpi_processor_notify_smm(THIS_MODULE);
-
- if (!zalloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) {
- printk(KERN_ERR PFX
- "unable to alloc powernow_k8_data cpumask\n");
- ret_val = -ENOMEM;
- goto err_out_mem;
- }
-
- return 0;
-
-err_out_mem:
- kfree(powernow_table);
-
-err_out:
- acpi_processor_unregister_performance(&data->acpi_data, data->cpu);
-
- /* data->acpi_data.state_count informs us at ->exit()
- * whether ACPI was used */
- data->acpi_data.state_count = 0;
-
- return ret_val;
-}
-
-static int fill_powernow_table_pstate(struct powernow_k8_data *data,
- struct cpufreq_frequency_table *powernow_table)
-{
- int i;
- u32 hi = 0, lo = 0;
- rdmsr(MSR_PSTATE_CUR_LIMIT, hi, lo);
- data->max_hw_pstate = (hi & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT;
-
- for (i = 0; i < data->acpi_data.state_count; i++) {
- u32 index;
-
- index = data->acpi_data.states[i].control & HW_PSTATE_MASK;
- if (index > data->max_hw_pstate) {
- printk(KERN_ERR PFX "invalid pstate %d - "
- "bad value %d.\n", i, index);
- printk(KERN_ERR PFX "Please report to BIOS "
- "manufacturer\n");
- invalidate_entry(data, i);
- continue;
- }
- rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi);
- if (!(hi & HW_PSTATE_VALID_MASK)) {
- dprintk("invalid pstate %d, ignoring\n", index);
- invalidate_entry(data, i);
- continue;
- }
-
- powernow_table[i].index = index;
-
- /* Frequency may be rounded for these */
- if (boot_cpu_data.x86 == 0x10 || boot_cpu_data.x86 == 0x11) {
- powernow_table[i].frequency =
- freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7);
- } else
- powernow_table[i].frequency =
- data->acpi_data.states[i].core_frequency * 1000;
- }
- return 0;
-}
-
-static int fill_powernow_table_fidvid(struct powernow_k8_data *data,
- struct cpufreq_frequency_table *powernow_table)
-{
- int i;
- int cntlofreq = 0;
-
- for (i = 0; i < data->acpi_data.state_count; i++) {
- u32 fid;
- u32 vid;
- u32 freq, index;
- acpi_integer status, control;
-
- if (data->exttype) {
- status = data->acpi_data.states[i].status;
- fid = status & EXT_FID_MASK;
- vid = (status >> VID_SHIFT) & EXT_VID_MASK;
- } else {
- control = data->acpi_data.states[i].control;
- fid = control & FID_MASK;
- vid = (control >> VID_SHIFT) & VID_MASK;
- }
-
- dprintk(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid);
-
- index = fid | (vid<<8);
- powernow_table[i].index = index;
-
- freq = find_khz_freq_from_fid(fid);
- powernow_table[i].frequency = freq;
-
- /* verify frequency is OK */
- if ((freq > (MAX_FREQ * 1000)) || (freq < (MIN_FREQ * 1000))) {
- dprintk("invalid freq %u kHz, ignoring\n", freq);
- invalidate_entry(data, i);
- continue;
- }
-
- /* verify voltage is OK -
- * BIOSs are using "off" to indicate invalid */
- if (vid == VID_OFF) {
- dprintk("invalid vid %u, ignoring\n", vid);
- invalidate_entry(data, i);
- continue;
- }
-
- /* verify only 1 entry from the lo frequency table */
- if (fid < HI_FID_TABLE_BOTTOM) {
- if (cntlofreq) {
- /* if both entries are the same,
- * ignore this one ... */
- if ((freq != powernow_table[cntlofreq].frequency) ||
- (index != powernow_table[cntlofreq].index)) {
- printk(KERN_ERR PFX
- "Too many lo freq table "
- "entries\n");
- return 1;
- }
-
- dprintk("double low frequency table entry, "
- "ignoring it.\n");
- invalidate_entry(data, i);
- continue;
- } else
- cntlofreq = i;
- }
-
- if (freq != (data->acpi_data.states[i].core_frequency * 1000)) {
- printk(KERN_INFO PFX "invalid freq entries "
- "%u kHz vs. %u kHz\n", freq,
- (unsigned int)
- (data->acpi_data.states[i].core_frequency
- * 1000));
- invalidate_entry(data, i);
- continue;
- }
- }
- return 0;
-}
-
-static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data)
-{
- if (data->acpi_data.state_count)
- acpi_processor_unregister_performance(&data->acpi_data,
- data->cpu);
- free_cpumask_var(data->acpi_data.shared_cpu_map);
-}
-
-static int get_transition_latency(struct powernow_k8_data *data)
-{
- int max_latency = 0;
- int i;
- for (i = 0; i < data->acpi_data.state_count; i++) {
- int cur_latency = data->acpi_data.states[i].transition_latency
- + data->acpi_data.states[i].bus_master_latency;
- if (cur_latency > max_latency)
- max_latency = cur_latency;
- }
- if (max_latency == 0) {
- /*
- * Fam 11h always returns 0 as transition latency.
- * This is intended and means "very fast". While cpufreq core
- * and governors currently can handle that gracefully, better
- * set it to 1 to avoid problems in the future.
- * For all others it's a BIOS bug.
- */
- if (boot_cpu_data.x86 != 0x11)
- printk(KERN_ERR FW_WARN PFX "Invalid zero transition "
- "latency\n");
- max_latency = 1;
- }
- /* value in usecs, needs to be in nanoseconds */
- return 1000 * max_latency;
-}
-
-/* Take a frequency, and issue the fid/vid transition command */
-static int transition_frequency_fidvid(struct powernow_k8_data *data,
- unsigned int index)
-{
- u32 fid = 0;
- u32 vid = 0;
- int res, i;
- struct cpufreq_freqs freqs;
-
- dprintk("cpu %d transition to index %u\n", smp_processor_id(), index);
-
- /* fid/vid correctness check for k8 */
- /* fid are the lower 8 bits of the index we stored into
- * the cpufreq frequency table in find_psb_table, vid
- * are the upper 8 bits.
- */
- fid = data->powernow_table[index].index & 0xFF;
- vid = (data->powernow_table[index].index & 0xFF00) >> 8;
-
- dprintk("table matched fid 0x%x, giving vid 0x%x\n", fid, vid);
-
- if (query_current_values_with_pending_wait(data))
- return 1;
-
- if ((data->currvid == vid) && (data->currfid == fid)) {
- dprintk("target matches current values (fid 0x%x, vid 0x%x)\n",
- fid, vid);
- return 0;
- }
-
- dprintk("cpu %d, changing to fid 0x%x, vid 0x%x\n",
- smp_processor_id(), fid, vid);
- freqs.old = find_khz_freq_from_fid(data->currfid);
- freqs.new = find_khz_freq_from_fid(fid);
-
- for_each_cpu(i, data->available_cores) {
- freqs.cpu = i;
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
- }
-
- res = transition_fid_vid(data, fid, vid);
- freqs.new = find_khz_freq_from_fid(data->currfid);
-
- for_each_cpu(i, data->available_cores) {
- freqs.cpu = i;
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
- }
- return res;
-}
-
-/* Take a frequency, and issue the hardware pstate transition command */
-static int transition_frequency_pstate(struct powernow_k8_data *data,
- unsigned int index)
-{
- u32 pstate = 0;
- int res, i;
- struct cpufreq_freqs freqs;
-
- dprintk("cpu %d transition to index %u\n", smp_processor_id(), index);
-
- /* get MSR index for hardware pstate transition */
- pstate = index & HW_PSTATE_MASK;
- if (pstate > data->max_hw_pstate)
- return 0;
- freqs.old = find_khz_freq_from_pstate(data->powernow_table,
- data->currpstate);
- freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
-
- for_each_cpu(i, data->available_cores) {
- freqs.cpu = i;
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
- }
-
- res = transition_pstate(data, pstate);
- freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
-
- for_each_cpu(i, data->available_cores) {
- freqs.cpu = i;
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
- }
- return res;
-}
-
-/* Driver entry point to switch to the target frequency */
-static int powernowk8_target(struct cpufreq_policy *pol,
- unsigned targfreq, unsigned relation)
-{
- cpumask_t oldmask;
- struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
- u32 checkfid;
- u32 checkvid;
- unsigned int newstate;
- int ret = -EIO;
-
- if (!data)
- return -EINVAL;
-
- checkfid = data->currfid;
- checkvid = data->currvid;
-
- /* only run on specific CPU from here on */
- oldmask = current->cpus_allowed;
- set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu));
-
- if (smp_processor_id() != pol->cpu) {
- printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
- goto err_out;
- }
-
- if (pending_bit_stuck()) {
- printk(KERN_ERR PFX "failing targ, change pending bit set\n");
- goto err_out;
- }
-
- dprintk("targ: cpu %d, %d kHz, min %d, max %d, relation %d\n",
- pol->cpu, targfreq, pol->min, pol->max, relation);
-
- if (query_current_values_with_pending_wait(data))
- goto err_out;
-
- if (cpu_family != CPU_HW_PSTATE) {
- dprintk("targ: curr fid 0x%x, vid 0x%x\n",
- data->currfid, data->currvid);
-
- if ((checkvid != data->currvid) ||
- (checkfid != data->currfid)) {
- printk(KERN_INFO PFX
- "error - out of sync, fix 0x%x 0x%x, "
- "vid 0x%x 0x%x\n",
- checkfid, data->currfid,
- checkvid, data->currvid);
- }
- }
-
- if (cpufreq_frequency_table_target(pol, data->powernow_table,
- targfreq, relation, &newstate))
- goto err_out;
-
- mutex_lock(&fidvid_mutex);
-
- powernow_k8_acpi_pst_values(data, newstate);
-
- if (cpu_family == CPU_HW_PSTATE)
- ret = transition_frequency_pstate(data, newstate);
- else
- ret = transition_frequency_fidvid(data, newstate);
- if (ret) {
- printk(KERN_ERR PFX "transition frequency failed\n");
- ret = 1;
- mutex_unlock(&fidvid_mutex);
- goto err_out;
- }
- mutex_unlock(&fidvid_mutex);
-
- if (cpu_family == CPU_HW_PSTATE)
- pol->cur = find_khz_freq_from_pstate(data->powernow_table,
- newstate);
- else
- pol->cur = find_khz_freq_from_fid(data->currfid);
- ret = 0;
-
-err_out:
- set_cpus_allowed_ptr(current, &oldmask);
- return ret;
-}
-
-/* Driver entry point to verify the policy and range of frequencies */
-static int powernowk8_verify(struct cpufreq_policy *pol)
-{
- struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
-
- if (!data)
- return -EINVAL;
-
- return cpufreq_frequency_table_verify(pol, data->powernow_table);
-}
-
-struct init_on_cpu {
- struct powernow_k8_data *data;
- int rc;
-};
-
-static void __cpuinit powernowk8_cpu_init_on_cpu(void *_init_on_cpu)
-{
- struct init_on_cpu *init_on_cpu = _init_on_cpu;
-
- if (pending_bit_stuck()) {
- printk(KERN_ERR PFX "failing init, change pending bit set\n");
- init_on_cpu->rc = -ENODEV;
- return;
- }
-
- if (query_current_values_with_pending_wait(init_on_cpu->data)) {
- init_on_cpu->rc = -ENODEV;
- return;
- }
-
- if (cpu_family == CPU_OPTERON)
- fidvid_msr_init();
-
- init_on_cpu->rc = 0;
-}
-
-/* per CPU init entry point to the driver */
-static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
-{
- static const char ACPI_PSS_BIOS_BUG_MSG[] =
- KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n"
- FW_BUG PFX "Try again with latest BIOS.\n";
- struct powernow_k8_data *data;
- struct init_on_cpu init_on_cpu;
- int rc;
-
- if (!cpu_online(pol->cpu))
- return -ENODEV;
-
- smp_call_function_single(pol->cpu, check_supported_cpu, &rc, 1);
- if (rc)
- return -ENODEV;
-
- data = kzalloc(sizeof(struct powernow_k8_data), GFP_KERNEL);
- if (!data) {
- printk(KERN_ERR PFX "unable to alloc powernow_k8_data");
- return -ENOMEM;
- }
-
- data->cpu = pol->cpu;
- data->currpstate = HW_PSTATE_INVALID;
-
- if (powernow_k8_cpu_init_acpi(data)) {
- /*
- * Use the PSB BIOS structure. This is only available on
- * an UP version, and is deprecated by AMD.
- */
- if (num_online_cpus() != 1) {
- printk_once(ACPI_PSS_BIOS_BUG_MSG);
- goto err_out;
- }
- if (pol->cpu != 0) {
- printk(KERN_ERR FW_BUG PFX "No ACPI _PSS objects for "
- "CPU other than CPU0. Complain to your BIOS "
- "vendor.\n");
- goto err_out;
- }
- rc = find_psb_table(data);
- if (rc)
- goto err_out;
-
- /* Take a crude guess here.
- * That guess was in microseconds, so multiply by 1000 */
- pol->cpuinfo.transition_latency = (
- ((data->rvo + 8) * data->vstable * VST_UNITS_20US) +
- ((1 << data->irt) * 30)) * 1000;
- } else /* ACPI _PSS objects available */
- pol->cpuinfo.transition_latency = get_transition_latency(data);
-
- /* only run on specific CPU from here on */
- init_on_cpu.data = data;
- smp_call_function_single(data->cpu, powernowk8_cpu_init_on_cpu,
- &init_on_cpu, 1);
- rc = init_on_cpu.rc;
- if (rc != 0)
- goto err_out_exit_acpi;
-
- if (cpu_family == CPU_HW_PSTATE)
- cpumask_copy(pol->cpus, cpumask_of(pol->cpu));
- else
- cpumask_copy(pol->cpus, cpu_core_mask(pol->cpu));
- data->available_cores = pol->cpus;
-
- if (cpu_family == CPU_HW_PSTATE)
- pol->cur = find_khz_freq_from_pstate(data->powernow_table,
- data->currpstate);
- else
- pol->cur = find_khz_freq_from_fid(data->currfid);
- dprintk("policy current frequency %d kHz\n", pol->cur);
-
- /* min/max the cpu is capable of */
- if (cpufreq_frequency_table_cpuinfo(pol, data->powernow_table)) {
- printk(KERN_ERR FW_BUG PFX "invalid powernow_table\n");
- powernow_k8_cpu_exit_acpi(data);
- kfree(data->powernow_table);
- kfree(data);
- return -EINVAL;
- }
-
- cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu);
-
- if (cpu_family == CPU_HW_PSTATE)
- dprintk("cpu_init done, current pstate 0x%x\n",
- data->currpstate);
- else
- dprintk("cpu_init done, current fid 0x%x, vid 0x%x\n",
- data->currfid, data->currvid);
-
- per_cpu(powernow_data, pol->cpu) = data;
-
- return 0;
-
-err_out_exit_acpi:
- powernow_k8_cpu_exit_acpi(data);
-
-err_out:
- kfree(data);
- return -ENODEV;
-}
-
-static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol)
-{
- struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
-
- if (!data)
- return -EINVAL;
-
- powernow_k8_cpu_exit_acpi(data);
-
- cpufreq_frequency_table_put_attr(pol->cpu);
-
- kfree(data->powernow_table);
- kfree(data);
-
- return 0;
-}
-
-static void query_values_on_cpu(void *_err)
-{
- int *err = _err;
- struct powernow_k8_data *data = __get_cpu_var(powernow_data);
-
- *err = query_current_values_with_pending_wait(data);
-}
-
-static unsigned int powernowk8_get(unsigned int cpu)
-{
- struct powernow_k8_data *data = per_cpu(powernow_data, cpu);
- unsigned int khz = 0;
- int err;
-
- if (!data)
- return -EINVAL;
-
- smp_call_function_single(cpu, query_values_on_cpu, &err, true);
- if (err)
- goto out;
-
- if (cpu_family == CPU_HW_PSTATE)
- khz = find_khz_freq_from_pstate(data->powernow_table,
- data->currpstate);
- else
- khz = find_khz_freq_from_fid(data->currfid);
-
-
-out:
- return khz;
-}
-
-static struct freq_attr *powernow_k8_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-static struct cpufreq_driver cpufreq_amd64_driver = {
- .verify = powernowk8_verify,
- .target = powernowk8_target,
- .init = powernowk8_cpu_init,
- .exit = __devexit_p(powernowk8_cpu_exit),
- .get = powernowk8_get,
- .name = "powernow-k8",
- .owner = THIS_MODULE,
- .attr = powernow_k8_attr,
-};
-
-/* driver entry point for init */
-static int __cpuinit powernowk8_init(void)
-{
- unsigned int i, supported_cpus = 0;
-
- for_each_online_cpu(i) {
- int rc;
- smp_call_function_single(i, check_supported_cpu, &rc, 1);
- if (rc == 0)
- supported_cpus++;
- }
-
- if (supported_cpus == num_online_cpus()) {
- printk(KERN_INFO PFX "Found %d %s "
- "processors (%d cpu cores) (" VERSION ")\n",
- num_online_nodes(),
- boot_cpu_data.x86_model_id, supported_cpus);
- return cpufreq_register_driver(&cpufreq_amd64_driver);
- }
-
- return -ENODEV;
-}
-
-/* driver entry point for term */
-static void __exit powernowk8_exit(void)
-{
- dprintk("exit\n");
-
- cpufreq_unregister_driver(&cpufreq_amd64_driver);
-}
-
-MODULE_AUTHOR("Paul Devriendt <paul.devriendt@amd.com> and "
- "Mark Langsdorf <mark.langsdorf@amd.com>");
-MODULE_DESCRIPTION("AMD Athlon 64 and Opteron processor frequency driver.");
-MODULE_LICENSE("GPL");
-
-late_initcall(powernowk8_init);
-module_exit(powernowk8_exit);
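Editorial aside, not part of the removed driver: on K8 the core frequency follows directly from the FID (800 MHz plus 100 MHz per FID step, see find_freq_from_fid()), and FIDs below HI_FID_TABLE_BOTTOM map onto VCO FIDs as in convert_fid_to_vco_fid(). A standalone sketch of both mappings:

#include <stdio.h>

#define HI_FID_TABLE_BOTTOM	8	/* boundary between low and high fid tables */

/* MHz for a given fid, as in find_freq_from_fid(). */
static unsigned int freq_from_fid(unsigned int fid)
{
	return 800 + fid * 100;
}

/* VCO fid for a given fid, as in convert_fid_to_vco_fid(). */
static unsigned int fid_to_vco_fid(unsigned int fid)
{
	return (fid < HI_FID_TABLE_BOTTOM) ? 8 + 2 * fid : fid;
}

int main(void)
{
	unsigned int fid;

	for (fid = 0; fid <= 12; fid++)
		printf("fid 0x%02x -> %u MHz (vco fid 0x%02x)\n",
		       fid, freq_from_fid(fid), fid_to_vco_fid(fid));
	return 0;
}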
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
deleted file mode 100644
index 02ce824073cb..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
+++ /dev/null
@@ -1,226 +0,0 @@
-/*
- * (c) 2003-2006 Advanced Micro Devices, Inc.
- * Your use of this code is subject to the terms and conditions of the
- * GNU general public license version 2. See "COPYING" or
- * http://www.gnu.org/licenses/gpl.html
- */
-
-
-enum pstate {
- HW_PSTATE_INVALID = 0xff,
- HW_PSTATE_0 = 0,
- HW_PSTATE_1 = 1,
- HW_PSTATE_2 = 2,
- HW_PSTATE_3 = 3,
- HW_PSTATE_4 = 4,
- HW_PSTATE_5 = 5,
- HW_PSTATE_6 = 6,
- HW_PSTATE_7 = 7,
-};
-
-struct powernow_k8_data {
- unsigned int cpu;
-
- u32 numps; /* number of p-states */
- u32 batps; /* number of p-states supported on battery */
- u32 max_hw_pstate; /* maximum legal hardware pstate */
-
- /* these values are constant when the PSB is used to determine
- * vid/fid pairings, but are modified during the ->target() call
- * when ACPI is used */
- u32 rvo; /* ramp voltage offset */
- u32 irt; /* isochronous relief time */
- u32 vidmvs; /* usable value calculated from mvs */
- u32 vstable; /* voltage stabilization time, units 20 us */
- u32 plllock; /* pll lock time, units 1 us */
- u32 exttype; /* extended interface = 1 */
-
- /* keep track of the current fid / vid or pstate */
- u32 currvid;
- u32 currfid;
- enum pstate currpstate;
-
- /* the powernow_table includes all frequency and vid/fid pairings:
- * fid are the lower 8 bits of the index, vid are the upper 8 bits.
- * frequency is in kHz */
- struct cpufreq_frequency_table *powernow_table;
-
- /* the acpi table needs to be kept. it's only available if ACPI was
- * used to determine valid frequency/vid/fid states */
- struct acpi_processor_performance acpi_data;
-
- /* we need to keep track of associated cores, but let cpufreq
- * handle hotplug events - so just point at cpufreq pol->cpus
- * structure */
- struct cpumask *available_cores;
-};
-
-
-/* processor's cpuid instruction support */
-#define CPUID_PROCESSOR_SIGNATURE 1 /* function 1 */
-#define CPUID_XFAM 0x0ff00000 /* extended family */
-#define CPUID_XFAM_K8 0
-#define CPUID_XMOD 0x000f0000 /* extended model */
-#define CPUID_XMOD_REV_MASK 0x000c0000
-#define CPUID_XFAM_10H 0x00100000 /* family 0x10 */
-#define CPUID_USE_XFAM_XMOD 0x00000f00
-#define CPUID_GET_MAX_CAPABILITIES 0x80000000
-#define CPUID_FREQ_VOLT_CAPABILITIES 0x80000007
-#define P_STATE_TRANSITION_CAPABLE 6
-
-/* Model Specific Registers for p-state transitions. MSRs are 64-bit. For */
-/* writes (wrmsr - opcode 0f 30), the register number is placed in ecx, and */
-/* the value to write is placed in edx:eax. For reads (rdmsr - opcode 0f 32), */
-/* the register number is placed in ecx, and the data is returned in edx:eax. */
-
-#define MSR_FIDVID_CTL 0xc0010041
-#define MSR_FIDVID_STATUS 0xc0010042
-
-/* Field definitions within the FID VID Low Control MSR : */
-#define MSR_C_LO_INIT_FID_VID 0x00010000
-#define MSR_C_LO_NEW_VID 0x00003f00
-#define MSR_C_LO_NEW_FID 0x0000003f
-#define MSR_C_LO_VID_SHIFT 8
-
-/* Field definitions within the FID VID High Control MSR : */
-#define MSR_C_HI_STP_GNT_TO 0x000fffff
-
-/* Field definitions within the FID VID Low Status MSR : */
-#define MSR_S_LO_CHANGE_PENDING 0x80000000 /* cleared when completed */
-#define MSR_S_LO_MAX_RAMP_VID 0x3f000000
-#define MSR_S_LO_MAX_FID 0x003f0000
-#define MSR_S_LO_START_FID 0x00003f00
-#define MSR_S_LO_CURRENT_FID 0x0000003f
-
-/* Field definitions within the FID VID High Status MSR : */
-#define MSR_S_HI_MIN_WORKING_VID 0x3f000000
-#define MSR_S_HI_MAX_WORKING_VID 0x003f0000
-#define MSR_S_HI_START_VID 0x00003f00
-#define MSR_S_HI_CURRENT_VID 0x0000003f
-#define MSR_C_HI_STP_GNT_BENIGN 0x00000001
-
-
-/* Hardware Pstate _PSS and MSR definitions */
-#define USE_HW_PSTATE 0x00000080
-#define HW_PSTATE_MASK 0x00000007
-#define HW_PSTATE_VALID_MASK 0x80000000
-#define HW_PSTATE_MAX_MASK 0x000000f0
-#define HW_PSTATE_MAX_SHIFT 4
-#define MSR_PSTATE_DEF_BASE 0xc0010064 /* base of Pstate MSRs */
-#define MSR_PSTATE_STATUS 0xc0010063 /* Pstate Status MSR */
-#define MSR_PSTATE_CTRL 0xc0010062 /* Pstate control MSR */
-#define MSR_PSTATE_CUR_LIMIT 0xc0010061 /* pstate current limit MSR */
-
-/* define the two driver architectures */
-#define CPU_OPTERON 0
-#define CPU_HW_PSTATE 1
-
-
-/*
- * There are restrictions the frequencies have to follow:
- * - only 1 entry in the low fid table ( <=1.4GHz )
- * - lowest entry in the high fid table must be >= 2 * the entry in the
- * low fid table
- * - lowest entry in the high fid table must be <= 200 MHz + 2 * the entry
- * in the low fid table
- * - the parts can only step at <= 200 MHz intervals, odd fid values are
- * supported in revision G and later revisions.
- * - lowest frequency must be >= interprocessor hypertransport link speed
- * (only applies to MP systems obviously)
- */
-
-/* fids (frequency identifiers) are arranged in 2 tables - lo and hi */
-#define LO_FID_TABLE_TOP 7 /* fid values marking the boundary */
-#define HI_FID_TABLE_BOTTOM 8 /* between the low and high tables */
-
-#define LO_VCOFREQ_TABLE_TOP 1400 /* corresponding vco frequency values */
-#define HI_VCOFREQ_TABLE_BOTTOM 1600
-
-#define MIN_FREQ_RESOLUTION 200 /* fids jump by 2 matching freq jumps by 200 */
-
-#define MAX_FID 0x2a /* Spec only gives FID values as far as 5 GHz */
-#define LEAST_VID 0x3e /* Lowest (numerically highest) useful vid value */
-
-#define MIN_FREQ 800 /* Min and max freqs, per spec */
-#define MAX_FREQ 5000
-
-#define INVALID_FID_MASK 0xffffffc0 /* not a valid fid if these bits are set */
-#define INVALID_VID_MASK 0xffffffc0 /* not a valid vid if these bits are set */
-
-#define VID_OFF 0x3f
-
-#define STOP_GRANT_5NS 1 /* min poss memory access latency for voltage change */
-
-#define PLL_LOCK_CONVERSION (1000/5) /* ms to ns, then divide by clock period */
-
-#define MAXIMUM_VID_STEPS 1 /* Current cpus only allow a single step of 25mV */
-#define VST_UNITS_20US 20 /* Voltage Stabilization Time is in units of 20us */
-
-/*
- * Most values of interest are encoded in a single field of the _PSS
- * entries: the "control" value.
- */
-
-#define IRT_SHIFT 30
-#define RVO_SHIFT 28
-#define EXT_TYPE_SHIFT 27
-#define PLL_L_SHIFT 20
-#define MVS_SHIFT 18
-#define VST_SHIFT 11
-#define VID_SHIFT 6
-#define IRT_MASK 3
-#define RVO_MASK 3
-#define EXT_TYPE_MASK 1
-#define PLL_L_MASK 0x7f
-#define MVS_MASK 3
-#define VST_MASK 0x7f
-#define VID_MASK 0x1f
-#define FID_MASK 0x1f
-#define EXT_VID_MASK 0x3f
-#define EXT_FID_MASK 0x3f
-
-
-/*
- * Version 1.4 of the PSB table. This table is constructed by the BIOS and
- * is used to tell the OS's power management driver which VIDs and FIDs
- * are supported by this particular processor.
- * If the data in the PSB / PST is wrong, then this driver will program the
- * wrong values into hardware, which is very likely to lead to a crash.
- */
-
-#define PSB_ID_STRING "AMDK7PNOW!"
-#define PSB_ID_STRING_LEN 10
-
-#define PSB_VERSION_1_4 0x14
-
-struct psb_s {
- u8 signature[10];
- u8 tableversion;
- u8 flags1;
- u16 vstable;
- u8 flags2;
- u8 num_tables;
- u32 cpuid;
- u8 plllocktime;
- u8 maxfid;
- u8 maxvid;
- u8 numps;
-};
-
-/* Pairs of fid/vid values are appended to the version 1.4 PSB table. */
-struct pst_s {
- u8 fid;
- u8 vid;
-};
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k8", msg)
-
-static int core_voltage_pre_transition(struct powernow_k8_data *data,
- u32 reqvid, u32 regfid);
-static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid);
-static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid);
-
-static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index);
-
-static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
-static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
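The IRT/RVO/EXT_TYPE/PLL_L/MVS/VST/VID/FID shifts and masks defined above describe how the driver unpacks the single "control" value of each ACPI _PSS entry. A minimal sketch of that decomposition, purely for illustration (this helper is not part of the removed header; the name decode_pss_control and the pr_info() reporting are invented for the example):

    /* Illustrative only: split a _PSS "control" value into its fields
     * using the shift/mask constants defined in this header. */
    static void decode_pss_control(u32 control)
    {
            u32 irt = (control >> IRT_SHIFT) & IRT_MASK;   /* isochronous relief time */
            u32 rvo = (control >> RVO_SHIFT) & RVO_MASK;   /* ramp voltage offset */
            u32 ext = (control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK;
            u32 pll = (control >> PLL_L_SHIFT) & PLL_L_MASK; /* pll lock time */
            u32 mvs = (control >> MVS_SHIFT) & MVS_MASK;
            u32 vst = (control >> VST_SHIFT) & VST_MASK;   /* voltage stabilization time */
            u32 vid = (control >> VID_SHIFT) & VID_MASK;
            u32 fid = control & FID_MASK;

            pr_info("irt=%u rvo=%u ext=%u pll=%u mvs=%u vst=%u vid=%u fid=%u\n",
                    irt, rvo, ext, pll, mvs, vst, vid, fid);
    }

This mirrors how the driver consumes the same constants when ACPI _PSS data is used to build the frequency table.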
diff --git a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c b/arch/x86/kernel/cpu/cpufreq/sc520_freq.c
deleted file mode 100644
index 435a996a613a..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c
+++ /dev/null
@@ -1,194 +0,0 @@
-/*
- * sc520_freq.c: cpufreq driver for the AMD Elan sc520
- *
- * Copyright (C) 2005 Sean Young <sean@mess.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Based on elanfreq.c
- *
- * 2005-03-30: - initial revision
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-
-#include <linux/delay.h>
-#include <linux/cpufreq.h>
-#include <linux/timex.h>
-#include <linux/io.h>
-
-#include <asm/msr.h>
-
-#define MMCR_BASE 0xfffef000 /* The default base address */
-#define OFFS_CPUCTL 0x2 /* CPU Control Register */
-
-static __u8 __iomem *cpuctl;
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "sc520_freq", msg)
-#define PFX "sc520_freq: "
-
-static struct cpufreq_frequency_table sc520_freq_table[] = {
- {0x01, 100000},
- {0x02, 133000},
- {0, CPUFREQ_TABLE_END},
-};
-
-static unsigned int sc520_freq_get_cpu_frequency(unsigned int cpu)
-{
- u8 clockspeed_reg = *cpuctl;
-
- switch (clockspeed_reg & 0x03) {
- default:
- printk(KERN_ERR PFX "error: cpuctl register has unexpected "
- "value %02x\n", clockspeed_reg);
- case 0x01:
- return 100000;
- case 0x02:
- return 133000;
- }
-}
-
-static void sc520_freq_set_cpu_state(unsigned int state)
-{
-
- struct cpufreq_freqs freqs;
- u8 clockspeed_reg;
-
- freqs.old = sc520_freq_get_cpu_frequency(0);
- freqs.new = sc520_freq_table[state].frequency;
- freqs.cpu = 0; /* AMD Elan is UP */
-
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-
- dprintk("attempting to set frequency to %i kHz\n",
- sc520_freq_table[state].frequency);
-
- local_irq_disable();
-
- clockspeed_reg = *cpuctl & ~0x03;
- *cpuctl = clockspeed_reg | sc520_freq_table[state].index;
-
- local_irq_enable();
-
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-};
-
-static int sc520_freq_verify(struct cpufreq_policy *policy)
-{
- return cpufreq_frequency_table_verify(policy, &sc520_freq_table[0]);
-}
-
-static int sc520_freq_target(struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
-{
- unsigned int newstate = 0;
-
- if (cpufreq_frequency_table_target(policy, sc520_freq_table,
- target_freq, relation, &newstate))
- return -EINVAL;
-
- sc520_freq_set_cpu_state(newstate);
-
- return 0;
-}
-
-
-/*
- * Module init and exit code
- */
-
-static int sc520_freq_cpu_init(struct cpufreq_policy *policy)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
- int result;
-
- /* capability check */
- if (c->x86_vendor != X86_VENDOR_AMD ||
- c->x86 != 4 || c->x86_model != 9)
- return -ENODEV;
-
- /* cpuinfo and default policy values */
- policy->cpuinfo.transition_latency = 1000000; /* 1ms */
- policy->cur = sc520_freq_get_cpu_frequency(0);
-
- result = cpufreq_frequency_table_cpuinfo(policy, sc520_freq_table);
- if (result)
- return result;
-
- cpufreq_frequency_table_get_attr(sc520_freq_table, policy->cpu);
-
- return 0;
-}
-
-
-static int sc520_freq_cpu_exit(struct cpufreq_policy *policy)
-{
- cpufreq_frequency_table_put_attr(policy->cpu);
- return 0;
-}
-
-
-static struct freq_attr *sc520_freq_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-
-static struct cpufreq_driver sc520_freq_driver = {
- .get = sc520_freq_get_cpu_frequency,
- .verify = sc520_freq_verify,
- .target = sc520_freq_target,
- .init = sc520_freq_cpu_init,
- .exit = sc520_freq_cpu_exit,
- .name = "sc520_freq",
- .owner = THIS_MODULE,
- .attr = sc520_freq_attr,
-};
-
-
-static int __init sc520_freq_init(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
- int err;
-
- /* Test if we have the right hardware */
- if (c->x86_vendor != X86_VENDOR_AMD ||
- c->x86 != 4 || c->x86_model != 9) {
- dprintk("no Elan SC520 processor found!\n");
- return -ENODEV;
- }
- cpuctl = ioremap((unsigned long)(MMCR_BASE + OFFS_CPUCTL), 1);
- if (!cpuctl) {
- printk(KERN_ERR "sc520_freq: error: failed to remap memory\n");
- return -ENOMEM;
- }
-
- err = cpufreq_register_driver(&sc520_freq_driver);
- if (err)
- iounmap(cpuctl);
-
- return err;
-}
-
-
-static void __exit sc520_freq_exit(void)
-{
- cpufreq_unregister_driver(&sc520_freq_driver);
- iounmap(cpuctl);
-}
-
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Sean Young <sean@mess.org>");
-MODULE_DESCRIPTION("cpufreq driver for AMD's Elan sc520 CPU");
-
-module_init(sc520_freq_init);
-module_exit(sc520_freq_exit);
-
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
deleted file mode 100644
index 8d672ef162ce..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ /dev/null
@@ -1,635 +0,0 @@
-/*
- * cpufreq driver for Enhanced SpeedStep, as found in Intel's Pentium
- * M (part of the Centrino chipset).
- *
- * Since the original Pentium M, most new Intel CPUs support Enhanced
- * SpeedStep.
- *
- * Despite the "SpeedStep" in the name, this is almost entirely unlike
- * traditional SpeedStep.
- *
- * Modelled on speedstep.c
- *
- * Copyright (C) 2003 Jeremy Fitzhardinge <jeremy@goop.org>
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/sched.h> /* current */
-#include <linux/delay.h>
-#include <linux/compiler.h>
-
-#include <asm/msr.h>
-#include <asm/processor.h>
-#include <asm/cpufeature.h>
-
-#define PFX "speedstep-centrino: "
-#define MAINTAINER "cpufreq@vger.kernel.org"
-
-#define dprintk(msg...) \
- cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg)
-
-#define INTEL_MSR_RANGE (0xffff)
-
-struct cpu_id
-{
- __u8 x86; /* CPU family */
- __u8 x86_model; /* model */
- __u8 x86_mask; /* stepping */
-};
-
-enum {
- CPU_BANIAS,
- CPU_DOTHAN_A1,
- CPU_DOTHAN_A2,
- CPU_DOTHAN_B0,
- CPU_MP4HT_D0,
- CPU_MP4HT_E0,
-};
-
-static const struct cpu_id cpu_ids[] = {
- [CPU_BANIAS] = { 6, 9, 5 },
- [CPU_DOTHAN_A1] = { 6, 13, 1 },
- [CPU_DOTHAN_A2] = { 6, 13, 2 },
- [CPU_DOTHAN_B0] = { 6, 13, 6 },
- [CPU_MP4HT_D0] = {15, 3, 4 },
- [CPU_MP4HT_E0] = {15, 4, 1 },
-};
-#define N_IDS ARRAY_SIZE(cpu_ids)
-
-struct cpu_model
-{
- const struct cpu_id *cpu_id;
- const char *model_name;
- unsigned max_freq; /* max clock in kHz */
-
- struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */
-};
-static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
- const struct cpu_id *x);
-
-/* Operating points for current CPU */
-static DEFINE_PER_CPU(struct cpu_model *, centrino_model);
-static DEFINE_PER_CPU(const struct cpu_id *, centrino_cpu);
-
-static struct cpufreq_driver centrino_driver;
-
-#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE
-
-/* Computes the correct form for IA32_PERF_CTL MSR for a particular
- frequency/voltage operating point; frequency in MHz, volts in mV.
- This is stored as "index" in the structure. */
-#define OP(mhz, mv) \
- { \
- .frequency = (mhz) * 1000, \
- .index = (((mhz)/100) << 8) | ((mv - 700) / 16) \
- }
-
-/*
- * These voltage tables were derived from the Intel Pentium M
- * datasheet, document 25261202.pdf, Table 5. I have verified they
- * are consistent with my IBM ThinkPad X31, which has a 1.3GHz Pentium
- * M.
- */
-
-/* Ultra Low Voltage Intel Pentium M processor 900MHz (Banias) */
-static struct cpufreq_frequency_table banias_900[] =
-{
- OP(600, 844),
- OP(800, 988),
- OP(900, 1004),
- { .frequency = CPUFREQ_TABLE_END }
-};
-
-/* Ultra Low Voltage Intel Pentium M processor 1000MHz (Banias) */
-static struct cpufreq_frequency_table banias_1000[] =
-{
- OP(600, 844),
- OP(800, 972),
- OP(900, 988),
- OP(1000, 1004),
- { .frequency = CPUFREQ_TABLE_END }
-};
-
-/* Low Voltage Intel Pentium M processor 1.10GHz (Banias) */
-static struct cpufreq_frequency_table banias_1100[] =
-{
- OP( 600, 956),
- OP( 800, 1020),
- OP( 900, 1100),
- OP(1000, 1164),
- OP(1100, 1180),
- { .frequency = CPUFREQ_TABLE_END }
-};
-
-
-/* Low Voltage Intel Pentium M processor 1.20GHz (Banias) */
-static struct cpufreq_frequency_table banias_1200[] =
-{
- OP( 600, 956),
- OP( 800, 1004),
- OP( 900, 1020),
- OP(1000, 1100),
- OP(1100, 1164),
- OP(1200, 1180),
- { .frequency = CPUFREQ_TABLE_END }
-};
-
-/* Intel Pentium M processor 1.30GHz (Banias) */
-static struct cpufreq_frequency_table banias_1300[] =
-{
- OP( 600, 956),
- OP( 800, 1260),
- OP(1000, 1292),
- OP(1200, 1356),
- OP(1300, 1388),
- { .frequency = CPUFREQ_TABLE_END }
-};
-
-/* Intel Pentium M processor 1.40GHz (Banias) */
-static struct cpufreq_frequency_table banias_1400[] =
-{
- OP( 600, 956),
- OP( 800, 1180),
- OP(1000, 1308),
- OP(1200, 1436),
- OP(1400, 1484),
- { .frequency = CPUFREQ_TABLE_END }
-};
-
-/* Intel Pentium M processor 1.50GHz (Banias) */
-static struct cpufreq_frequency_table banias_1500[] =
-{
- OP( 600, 956),
- OP( 800, 1116),
- OP(1000, 1228),
- OP(1200, 1356),
- OP(1400, 1452),
- OP(1500, 1484),
- { .frequency = CPUFREQ_TABLE_END }
-};
-
-/* Intel Pentium M processor 1.60GHz (Banias) */
-static struct cpufreq_frequency_table banias_1600[] =
-{
- OP( 600, 956),
- OP( 800, 1036),
- OP(1000, 1164),
- OP(1200, 1276),
- OP(1400, 1420),
- OP(1600, 1484),
- { .frequency = CPUFREQ_TABLE_END }
-};
-
-/* Intel Pentium M processor 1.70GHz (Banias) */
-static struct cpufreq_frequency_table banias_1700[] =
-{
- OP( 600, 956),
- OP( 800, 1004),
- OP(1000, 1116),
- OP(1200, 1228),
- OP(1400, 1308),
- OP(1700, 1484),
- { .frequency = CPUFREQ_TABLE_END }
-};
-#undef OP
-
-#define _BANIAS(cpuid, max, name) \
-{ .cpu_id = cpuid, \
- .model_name = "Intel(R) Pentium(R) M processor " name "MHz", \
- .max_freq = (max)*1000, \
- .op_points = banias_##max, \
-}
-#define BANIAS(max) _BANIAS(&cpu_ids[CPU_BANIAS], max, #max)
-
-/* CPU models, their operating frequency range, and freq/voltage
- operating points */
-static struct cpu_model models[] =
-{
- _BANIAS(&cpu_ids[CPU_BANIAS], 900, " 900"),
- BANIAS(1000),
- BANIAS(1100),
- BANIAS(1200),
- BANIAS(1300),
- BANIAS(1400),
- BANIAS(1500),
- BANIAS(1600),
- BANIAS(1700),
-
- /* NULL model_name is a wildcard */
- { &cpu_ids[CPU_DOTHAN_A1], NULL, 0, NULL },
- { &cpu_ids[CPU_DOTHAN_A2], NULL, 0, NULL },
- { &cpu_ids[CPU_DOTHAN_B0], NULL, 0, NULL },
- { &cpu_ids[CPU_MP4HT_D0], NULL, 0, NULL },
- { &cpu_ids[CPU_MP4HT_E0], NULL, 0, NULL },
-
- { NULL, }
-};
-#undef _BANIAS
-#undef BANIAS
-
-static int centrino_cpu_init_table(struct cpufreq_policy *policy)
-{
- struct cpuinfo_x86 *cpu = &cpu_data(policy->cpu);
- struct cpu_model *model;
-
- for(model = models; model->cpu_id != NULL; model++)
- if (centrino_verify_cpu_id(cpu, model->cpu_id) &&
- (model->model_name == NULL ||
- strcmp(cpu->x86_model_id, model->model_name) == 0))
- break;
-
- if (model->cpu_id == NULL) {
- /* No match at all */
- dprintk("no support for CPU model \"%s\": "
- "send /proc/cpuinfo to " MAINTAINER "\n",
- cpu->x86_model_id);
- return -ENOENT;
- }
-
- if (model->op_points == NULL) {
- /* Matched a non-match */
- dprintk("no table support for CPU model \"%s\"\n",
- cpu->x86_model_id);
- dprintk("try using the acpi-cpufreq driver\n");
- return -ENOENT;
- }
-
- per_cpu(centrino_model, policy->cpu) = model;
-
- dprintk("found \"%s\": max frequency: %dkHz\n",
- model->model_name, model->max_freq);
-
- return 0;
-}
-
-#else
-static inline int centrino_cpu_init_table(struct cpufreq_policy *policy)
-{
- return -ENODEV;
-}
-#endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */
-
-static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
- const struct cpu_id *x)
-{
- if ((c->x86 == x->x86) &&
- (c->x86_model == x->x86_model) &&
- (c->x86_mask == x->x86_mask))
- return 1;
- return 0;
-}
-
-/* To be called only after centrino_model is initialized */
-static unsigned extract_clock(unsigned msr, unsigned int cpu, int failsafe)
-{
- int i;
-
- /*
- * Extract clock in kHz from PERF_CTL value
- * for centrino, as some DSDTs are buggy.
- * Ideally, this can be done using the acpi_data structure.
- */
- if ((per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_BANIAS]) ||
- (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_A1]) ||
- (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_B0])) {
- msr = (msr >> 8) & 0xff;
- return msr * 100000;
- }
-
- if ((!per_cpu(centrino_model, cpu)) ||
- (!per_cpu(centrino_model, cpu)->op_points))
- return 0;
-
- msr &= 0xffff;
- for (i = 0;
- per_cpu(centrino_model, cpu)->op_points[i].frequency
- != CPUFREQ_TABLE_END;
- i++) {
- if (msr == per_cpu(centrino_model, cpu)->op_points[i].index)
- return per_cpu(centrino_model, cpu)->
- op_points[i].frequency;
- }
- if (failsafe)
- return per_cpu(centrino_model, cpu)->op_points[i-1].frequency;
- else
- return 0;
-}
-
-/* Return the current CPU frequency in kHz */
-static unsigned int get_cur_freq(unsigned int cpu)
-{
- unsigned l, h;
- unsigned clock_freq;
-
- rdmsr_on_cpu(cpu, MSR_IA32_PERF_STATUS, &l, &h);
- clock_freq = extract_clock(l, cpu, 0);
-
- if (unlikely(clock_freq == 0)) {
- /*
- * On some CPUs, we can see transient MSR values (which are
- * not present in _PSS), while CPU is doing some automatic
- * P-state transition (like TM2). Get the last freq set
- * in PERF_CTL.
- */
- rdmsr_on_cpu(cpu, MSR_IA32_PERF_CTL, &l, &h);
- clock_freq = extract_clock(l, cpu, 1);
- }
- return clock_freq;
-}
-
-
-static int centrino_cpu_init(struct cpufreq_policy *policy)
-{
- struct cpuinfo_x86 *cpu = &cpu_data(policy->cpu);
- unsigned freq;
- unsigned l, h;
- int ret;
- int i;
-
- /* Only Intel makes Enhanced Speedstep-capable CPUs */
- if (cpu->x86_vendor != X86_VENDOR_INTEL ||
- !cpu_has(cpu, X86_FEATURE_EST))
- return -ENODEV;
-
- if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC))
- centrino_driver.flags |= CPUFREQ_CONST_LOOPS;
-
- if (policy->cpu != 0)
- return -ENODEV;
-
- for (i = 0; i < N_IDS; i++)
- if (centrino_verify_cpu_id(cpu, &cpu_ids[i]))
- break;
-
- if (i != N_IDS)
- per_cpu(centrino_cpu, policy->cpu) = &cpu_ids[i];
-
- if (!per_cpu(centrino_cpu, policy->cpu)) {
- dprintk("found unsupported CPU with "
- "Enhanced SpeedStep: send /proc/cpuinfo to "
- MAINTAINER "\n");
- return -ENODEV;
- }
-
- if (centrino_cpu_init_table(policy)) {
- return -ENODEV;
- }
-
- /* Check to see if Enhanced SpeedStep is enabled, and try to
- enable it if not. */
- rdmsr(MSR_IA32_MISC_ENABLE, l, h);
-
- if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
- l |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP;
- dprintk("trying to enable Enhanced SpeedStep (%x)\n", l);
- wrmsr(MSR_IA32_MISC_ENABLE, l, h);
-
- /* check to see if it stuck */
- rdmsr(MSR_IA32_MISC_ENABLE, l, h);
- if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
- printk(KERN_INFO PFX
- "couldn't enable Enhanced SpeedStep\n");
- return -ENODEV;
- }
- }
-
- freq = get_cur_freq(policy->cpu);
- policy->cpuinfo.transition_latency = 10000;
- /* 10uS transition latency */
- policy->cur = freq;
-
- dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur);
-
- ret = cpufreq_frequency_table_cpuinfo(policy,
- per_cpu(centrino_model, policy->cpu)->op_points);
- if (ret)
- return (ret);
-
- cpufreq_frequency_table_get_attr(
- per_cpu(centrino_model, policy->cpu)->op_points, policy->cpu);
-
- return 0;
-}
-
-static int centrino_cpu_exit(struct cpufreq_policy *policy)
-{
- unsigned int cpu = policy->cpu;
-
- if (!per_cpu(centrino_model, cpu))
- return -ENODEV;
-
- cpufreq_frequency_table_put_attr(cpu);
-
- per_cpu(centrino_model, cpu) = NULL;
-
- return 0;
-}
-
-/**
- * centrino_verify - verifies a new CPUFreq policy
- * @policy: new policy
- *
- * Limit must be within this model's frequency range at least one
- * border included.
- */
-static int centrino_verify (struct cpufreq_policy *policy)
-{
- return cpufreq_frequency_table_verify(policy,
- per_cpu(centrino_model, policy->cpu)->op_points);
-}
-
-/**
- * centrino_setpolicy - set a new CPUFreq policy
- * @policy: new policy
- * @target_freq: the target frequency
- * @relation: how that frequency relates to achieved frequency
- * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
- *
- * Sets a new CPUFreq policy.
- */
-static int centrino_target (struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
-{
- unsigned int newstate = 0;
- unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu;
- struct cpufreq_freqs freqs;
- int retval = 0;
- unsigned int j, k, first_cpu, tmp;
- cpumask_var_t covered_cpus;
-
- if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL)))
- return -ENOMEM;
-
- if (unlikely(per_cpu(centrino_model, cpu) == NULL)) {
- retval = -ENODEV;
- goto out;
- }
-
- if (unlikely(cpufreq_frequency_table_target(policy,
- per_cpu(centrino_model, cpu)->op_points,
- target_freq,
- relation,
- &newstate))) {
- retval = -EINVAL;
- goto out;
- }
-
- first_cpu = 1;
- for_each_cpu(j, policy->cpus) {
- int good_cpu;
-
- /* cpufreq holds the hotplug lock, so we are safe here */
- if (!cpu_online(j))
- continue;
-
- /*
- * Support for SMP systems.
- * Make sure we are running on CPU that wants to change freq
- */
- if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
- good_cpu = cpumask_any_and(policy->cpus,
- cpu_online_mask);
- else
- good_cpu = j;
-
- if (good_cpu >= nr_cpu_ids) {
- dprintk("couldn't limit to CPUs in this domain\n");
- retval = -EAGAIN;
- if (first_cpu) {
- /* We haven't started the transition yet. */
- goto out;
- }
- break;
- }
-
- msr = per_cpu(centrino_model, cpu)->op_points[newstate].index;
-
- if (first_cpu) {
- rdmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, &oldmsr, &h);
- if (msr == (oldmsr & 0xffff)) {
- dprintk("no change needed - msr was and needs "
- "to be %x\n", oldmsr);
- retval = 0;
- goto out;
- }
-
- freqs.old = extract_clock(oldmsr, cpu, 0);
- freqs.new = extract_clock(msr, cpu, 0);
-
- dprintk("target=%dkHz old=%d new=%d msr=%04x\n",
- target_freq, freqs.old, freqs.new, msr);
-
- for_each_cpu(k, policy->cpus) {
- if (!cpu_online(k))
- continue;
- freqs.cpu = k;
- cpufreq_notify_transition(&freqs,
- CPUFREQ_PRECHANGE);
- }
-
- first_cpu = 0;
- /* all but 16 LSB are reserved, treat them with care */
- oldmsr &= ~0xffff;
- msr &= 0xffff;
- oldmsr |= msr;
- }
-
- wrmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, oldmsr, h);
- if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
- break;
-
- cpumask_set_cpu(j, covered_cpus);
- }
-
- for_each_cpu(k, policy->cpus) {
- if (!cpu_online(k))
- continue;
- freqs.cpu = k;
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
- }
-
- if (unlikely(retval)) {
- /*
- * We have failed halfway through the frequency change.
- * We have sent callbacks to policy->cpus and
- * MSRs have already been written on covered_cpus.
- * Best effort undo.
- */
-
- for_each_cpu(j, covered_cpus)
- wrmsr_on_cpu(j, MSR_IA32_PERF_CTL, oldmsr, h);
-
- tmp = freqs.new;
- freqs.new = freqs.old;
- freqs.old = tmp;
- for_each_cpu(j, policy->cpus) {
- if (!cpu_online(j))
- continue;
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
- }
- }
- retval = 0;
-
-out:
- free_cpumask_var(covered_cpus);
- return retval;
-}
-
-static struct freq_attr* centrino_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-static struct cpufreq_driver centrino_driver = {
- .name = "centrino", /* should be speedstep-centrino,
- but there's a 16 char limit */
- .init = centrino_cpu_init,
- .exit = centrino_cpu_exit,
- .verify = centrino_verify,
- .target = centrino_target,
- .get = get_cur_freq,
- .attr = centrino_attr,
- .owner = THIS_MODULE,
-};
-
-
-/**
- * centrino_init - initializes the Enhanced SpeedStep CPUFreq driver
- *
- * Initializes the Enhanced SpeedStep support. Returns -ENODEV on
- * unsupported devices, -ENOENT if there's no voltage table for this
- * particular CPU model, -EINVAL on problems during initialization,
- * and zero on success.
- *
- * This is quite picky. Not only does the CPU have to advertise the
- * "est" flag in the cpuid capability flags, we look for a specific
- * CPU model and stepping, and we need to have the exact model name in
- * our voltage tables. That is, be paranoid about not releasing
- * someone's valuable magic smoke.
- */
-static int __init centrino_init(void)
-{
- struct cpuinfo_x86 *cpu = &cpu_data(0);
-
- if (!cpu_has(cpu, X86_FEATURE_EST))
- return -ENODEV;
-
- return cpufreq_register_driver(&centrino_driver);
-}
-
-static void __exit centrino_exit(void)
-{
- cpufreq_unregister_driver(&centrino_driver);
-}
-
-MODULE_AUTHOR ("Jeremy Fitzhardinge <jeremy@goop.org>");
-MODULE_DESCRIPTION ("Enhanced SpeedStep driver for Intel Pentium M processors.");
-MODULE_LICENSE ("GPL");
-
-late_initcall(centrino_init);
-module_exit(centrino_exit);
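The OP() macro near the top of this file packs each frequency/voltage operating point into the IA32_PERF_CTL encoding that extract_clock() later reverses: bits 15:8 hold the bus ratio (MHz/100) and bits 7:0 hold (mV - 700)/16. A small sketch of that encode/decode pair, for illustration only (the helper names are invented and not part of the removed driver):

    /* Encode as OP() does; e.g. OP(1300, 1388) yields 0x0d2b. */
    static unsigned int op_encode(unsigned int mhz, unsigned int mv)
    {
            return ((mhz / 100) << 8) | ((mv - 700) / 16);
    }

    /* Decode the frequency the way extract_clock() does for Banias and
     * early Dothan parts: bus ratio from bits 15:8 times 100 MHz, in kHz. */
    static unsigned int op_decode_khz(unsigned int msr)
    {
            return ((msr >> 8) & 0xff) * 100000;    /* 0x0d2b -> 1300000 kHz */
    }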
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
deleted file mode 100644
index 6911e91fb4f6..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ /dev/null
@@ -1,458 +0,0 @@
-/*
- * (C) 2001 Dave Jones, Arjan van de ven.
- * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
- *
- * Licensed under the terms of the GNU GPL License version 2.
- * Based upon reverse engineered information, and on Intel documentation
- * for chipsets ICH2-M and ICH3-M.
- *
- * Many thanks to Ducrot Bruno for finding and fixing the last
- * "missing link" for ICH2-M/ICH3-M support, and to Thomas Winkler
- * for extensive testing.
- *
- * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
- */
-
-
-/*********************************************************************
- * SPEEDSTEP - DEFINITIONS *
- *********************************************************************/
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/pci.h>
-#include <linux/slab.h>
-#include <linux/sched.h>
-
-#include "speedstep-lib.h"
-
-
-/* speedstep_chipset:
- * It is necessary to know which chipset is used. As accesses to
- * this device occur at various places in this module, we need a
- * static struct pci_dev * pointing to that device.
- */
-static struct pci_dev *speedstep_chipset_dev;
-
-
-/* speedstep_processor
- */
-static unsigned int speedstep_processor;
-
-static u32 pmbase;
-
-/*
- * There are only two frequency states for each processor. Values
- * are in kHz for the time being.
- */
-static struct cpufreq_frequency_table speedstep_freqs[] = {
- {SPEEDSTEP_HIGH, 0},
- {SPEEDSTEP_LOW, 0},
- {0, CPUFREQ_TABLE_END},
-};
-
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "speedstep-ich", msg)
-
-
-/**
- * speedstep_find_register - read the PMBASE address
- *
- * Returns: -ENODEV if no register could be found
- */
-static int speedstep_find_register(void)
-{
- if (!speedstep_chipset_dev)
- return -ENODEV;
-
- /* get PMBASE */
- pci_read_config_dword(speedstep_chipset_dev, 0x40, &pmbase);
- if (!(pmbase & 0x01)) {
- printk(KERN_ERR "speedstep-ich: could not find speedstep register\n");
- return -ENODEV;
- }
-
- pmbase &= 0xFFFFFFFE;
- if (!pmbase) {
- printk(KERN_ERR "speedstep-ich: could not find speedstep register\n");
- return -ENODEV;
- }
-
- dprintk("pmbase is 0x%x\n", pmbase);
- return 0;
-}
-
-/**
- * speedstep_set_state - set the SpeedStep state
- * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
- *
- * Tries to change the SpeedStep state. Can be called from
- * smp_call_function_single.
- */
-static void speedstep_set_state(unsigned int state)
-{
- u8 pm2_blk;
- u8 value;
- unsigned long flags;
-
- if (state > 0x1)
- return;
-
- /* Disable IRQs */
- local_irq_save(flags);
-
- /* read state */
- value = inb(pmbase + 0x50);
-
- dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value);
-
- /* write new state */
- value &= 0xFE;
- value |= state;
-
- dprintk("writing 0x%x to pmbase 0x%x + 0x50\n", value, pmbase);
-
- /* Disable bus master arbitration */
- pm2_blk = inb(pmbase + 0x20);
- pm2_blk |= 0x01;
- outb(pm2_blk, (pmbase + 0x20));
-
- /* Actual transition */
- outb(value, (pmbase + 0x50));
-
- /* Restore bus master arbitration */
- pm2_blk &= 0xfe;
- outb(pm2_blk, (pmbase + 0x20));
-
- /* check if transition was successful */
- value = inb(pmbase + 0x50);
-
- /* Enable IRQs */
- local_irq_restore(flags);
-
- dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value);
-
- if (state == (value & 0x1))
- dprintk("change to %u MHz succeeded\n",
- speedstep_get_frequency(speedstep_processor) / 1000);
- else
- printk(KERN_ERR "cpufreq: change failed - I/O error\n");
-
- return;
-}
-
-/* Wrapper for smp_call_function_single. */
-static void _speedstep_set_state(void *_state)
-{
- speedstep_set_state(*(unsigned int *)_state);
-}
-
-/**
- * speedstep_activate - activate SpeedStep control in the chipset
- *
- * Tries to activate the SpeedStep status and control registers.
- * Returns -EINVAL on an unsupported chipset, and zero on success.
- */
-static int speedstep_activate(void)
-{
- u16 value = 0;
-
- if (!speedstep_chipset_dev)
- return -EINVAL;
-
- pci_read_config_word(speedstep_chipset_dev, 0x00A0, &value);
- if (!(value & 0x08)) {
- value |= 0x08;
- dprintk("activating SpeedStep (TM) registers\n");
- pci_write_config_word(speedstep_chipset_dev, 0x00A0, value);
- }
-
- return 0;
-}
-
-
-/**
- * speedstep_detect_chipset - detect the Southbridge which contains SpeedStep logic
- *
- * Detects ICH2-M, ICH3-M and ICH4-M so far. The pci_dev points to
- * the LPC bridge / PM module which contains all power-management
- * functions. Returns the SPEEDSTEP_CHIPSET_-number for the detected
- * chipset, or zero on failure.
- */
-static unsigned int speedstep_detect_chipset(void)
-{
- speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
- PCI_DEVICE_ID_INTEL_82801DB_12,
- PCI_ANY_ID, PCI_ANY_ID,
- NULL);
- if (speedstep_chipset_dev)
- return 4; /* 4-M */
-
- speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
- PCI_DEVICE_ID_INTEL_82801CA_12,
- PCI_ANY_ID, PCI_ANY_ID,
- NULL);
- if (speedstep_chipset_dev)
- return 3; /* 3-M */
-
-
- speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
- PCI_DEVICE_ID_INTEL_82801BA_10,
- PCI_ANY_ID, PCI_ANY_ID,
- NULL);
- if (speedstep_chipset_dev) {
- /* speedstep.c causes lockups on Dell Inspirons 8000 and
- * 8100 which use a pretty old revision of the 82815
- * host bridge. Abort on these systems.
- */
- static struct pci_dev *hostbridge;
-
- hostbridge = pci_get_subsys(PCI_VENDOR_ID_INTEL,
- PCI_DEVICE_ID_INTEL_82815_MC,
- PCI_ANY_ID, PCI_ANY_ID,
- NULL);
-
- if (!hostbridge)
- return 2; /* 2-M */
-
- if (hostbridge->revision < 5) {
- dprintk("hostbridge does not support speedstep\n");
- speedstep_chipset_dev = NULL;
- pci_dev_put(hostbridge);
- return 0;
- }
-
- pci_dev_put(hostbridge);
- return 2; /* 2-M */
- }
-
- return 0;
-}
-
-struct get_freq_data {
- unsigned int speed;
- unsigned int processor;
-};
-
-static void get_freq_data(void *_data)
-{
- struct get_freq_data *data = _data;
-
- data->speed = speedstep_get_frequency(data->processor);
-}
-
-static unsigned int speedstep_get(unsigned int cpu)
-{
- struct get_freq_data data = { .processor = cpu };
-
- /* You're supposed to ensure CPU is online. */
- if (smp_call_function_single(cpu, get_freq_data, &data, 1) != 0)
- BUG();
-
- dprintk("detected %u kHz as current frequency\n", data.speed);
- return data.speed;
-}
-
-/**
- * speedstep_target - set a new CPUFreq policy
- * @policy: new policy
- * @target_freq: the target frequency
- * @relation: how that frequency relates to achieved frequency
- * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
- *
- * Sets a new CPUFreq policy.
- */
-static int speedstep_target(struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
-{
- unsigned int newstate = 0, policy_cpu;
- struct cpufreq_freqs freqs;
- int i;
-
- if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0],
- target_freq, relation, &newstate))
- return -EINVAL;
-
- policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask);
- freqs.old = speedstep_get(policy_cpu);
- freqs.new = speedstep_freqs[newstate].frequency;
- freqs.cpu = policy->cpu;
-
- dprintk("transiting from %u to %u kHz\n", freqs.old, freqs.new);
-
- /* no transition necessary */
- if (freqs.old == freqs.new)
- return 0;
-
- for_each_cpu(i, policy->cpus) {
- freqs.cpu = i;
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
- }
-
- smp_call_function_single(policy_cpu, _speedstep_set_state, &newstate,
- true);
-
- for_each_cpu(i, policy->cpus) {
- freqs.cpu = i;
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
- }
-
- return 0;
-}
-
-
-/**
- * speedstep_verify - verifies a new CPUFreq policy
- * @policy: new policy
- *
- * Limit must be within speedstep_low_freq and speedstep_high_freq, with
- * at least one border included.
- */
-static int speedstep_verify(struct cpufreq_policy *policy)
-{
- return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]);
-}
-
-struct get_freqs {
- struct cpufreq_policy *policy;
- int ret;
-};
-
-static void get_freqs_on_cpu(void *_get_freqs)
-{
- struct get_freqs *get_freqs = _get_freqs;
-
- get_freqs->ret =
- speedstep_get_freqs(speedstep_processor,
- &speedstep_freqs[SPEEDSTEP_LOW].frequency,
- &speedstep_freqs[SPEEDSTEP_HIGH].frequency,
- &get_freqs->policy->cpuinfo.transition_latency,
- &speedstep_set_state);
-}
-
-static int speedstep_cpu_init(struct cpufreq_policy *policy)
-{
- int result;
- unsigned int policy_cpu, speed;
- struct get_freqs gf;
-
- /* only run on CPU to be set, or on its sibling */
-#ifdef CONFIG_SMP
- cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu));
-#endif
- policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask);
-
- /* detect low and high frequency and transition latency */
- gf.policy = policy;
- smp_call_function_single(policy_cpu, get_freqs_on_cpu, &gf, 1);
- if (gf.ret)
- return gf.ret;
-
- /* get current speed setting */
- speed = speedstep_get(policy_cpu);
- if (!speed)
- return -EIO;
-
- dprintk("currently at %s speed setting - %i MHz\n",
- (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency)
- ? "low" : "high",
- (speed / 1000));
-
- /* cpuinfo and default policy values */
- policy->cur = speed;
-
- result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs);
- if (result)
- return result;
-
- cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu);
-
- return 0;
-}
-
-
-static int speedstep_cpu_exit(struct cpufreq_policy *policy)
-{
- cpufreq_frequency_table_put_attr(policy->cpu);
- return 0;
-}
-
-static struct freq_attr *speedstep_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-
-static struct cpufreq_driver speedstep_driver = {
- .name = "speedstep-ich",
- .verify = speedstep_verify,
- .target = speedstep_target,
- .init = speedstep_cpu_init,
- .exit = speedstep_cpu_exit,
- .get = speedstep_get,
- .owner = THIS_MODULE,
- .attr = speedstep_attr,
-};
-
-
-/**
- * speedstep_init - initializes the SpeedStep CPUFreq driver
- *
- * Initializes the SpeedStep support. Returns -ENODEV on unsupported
- * devices, -EINVAL on problems during initialization, and zero on
- * success.
- */
-static int __init speedstep_init(void)
-{
- /* detect processor */
- speedstep_processor = speedstep_detect_processor();
- if (!speedstep_processor) {
- dprintk("Intel(R) SpeedStep(TM) capable processor "
- "not found\n");
- return -ENODEV;
- }
-
- /* detect chipset */
- if (!speedstep_detect_chipset()) {
- dprintk("Intel(R) SpeedStep(TM) for this chipset not "
- "(yet) available.\n");
- return -ENODEV;
- }
-
- /* activate speedstep support */
- if (speedstep_activate()) {
- pci_dev_put(speedstep_chipset_dev);
- return -EINVAL;
- }
-
- if (speedstep_find_register())
- return -ENODEV;
-
- return cpufreq_register_driver(&speedstep_driver);
-}
-
-
-/**
- * speedstep_exit - unregisters SpeedStep support
- *
- * Unregisters SpeedStep support.
- */
-static void __exit speedstep_exit(void)
-{
- pci_dev_put(speedstep_chipset_dev);
- cpufreq_unregister_driver(&speedstep_driver);
-}
-
-
-MODULE_AUTHOR("Dave Jones <davej@redhat.com>, "
- "Dominik Brodowski <linux@brodo.de>");
-MODULE_DESCRIPTION("Speedstep driver for Intel mobile processors on chipsets "
- "with ICH-M southbridges.");
-MODULE_LICENSE("GPL");
-
-module_init(speedstep_init);
-module_exit(speedstep_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
deleted file mode 100644
index f4c290b8482f..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
+++ /dev/null
@@ -1,482 +0,0 @@
-/*
- * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
- *
- * Licensed under the terms of the GNU GPL License version 2.
- *
- * Library for common functions for Intel SpeedStep v.1 and v.2 support
- *
- * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/slab.h>
-
-#include <asm/msr.h>
-#include <asm/tsc.h>
-#include "speedstep-lib.h"
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "speedstep-lib", msg)
-
-#define PFX "speedstep-lib: "
-
-#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK
-static int relaxed_check;
-#else
-#define relaxed_check 0
-#endif
-
-/*********************************************************************
- * GET PROCESSOR CORE SPEED IN KHZ *
- *********************************************************************/
-
-static unsigned int pentium3_get_frequency(unsigned int processor)
-{
- /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */
- struct {
- unsigned int ratio; /* Frequency Multiplier (x10) */
- u8 bitmap; /* power on configuration bits
- [27, 25:22] (in MSR 0x2a) */
- } msr_decode_mult[] = {
- { 30, 0x01 },
- { 35, 0x05 },
- { 40, 0x02 },
- { 45, 0x06 },
- { 50, 0x00 },
- { 55, 0x04 },
- { 60, 0x0b },
- { 65, 0x0f },
- { 70, 0x09 },
- { 75, 0x0d },
- { 80, 0x0a },
- { 85, 0x26 },
- { 90, 0x20 },
- { 100, 0x2b },
- { 0, 0xff } /* error or unknown value */
- };
-
- /* PIII(-M) FSB settings: see table b1-b of 24547206.pdf */
- struct {
- unsigned int value; /* Front Side Bus speed in MHz */
- u8 bitmap; /* power on configuration bits [18: 19]
- (in MSR 0x2a) */
- } msr_decode_fsb[] = {
- { 66, 0x0 },
- { 100, 0x2 },
- { 133, 0x1 },
- { 0, 0xff}
- };
-
- u32 msr_lo, msr_tmp;
- int i = 0, j = 0;
-
- /* read MSR 0x2a - we only need the low 32 bits */
- rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
- dprintk("P3 - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp);
- msr_tmp = msr_lo;
-
- /* decode the FSB */
- msr_tmp &= 0x00c0000;
- msr_tmp >>= 18;
- while (msr_tmp != msr_decode_fsb[i].bitmap) {
- if (msr_decode_fsb[i].bitmap == 0xff)
- return 0;
- i++;
- }
-
- /* decode the multiplier */
- if (processor == SPEEDSTEP_CPU_PIII_C_EARLY) {
- dprintk("workaround for early PIIIs\n");
- msr_lo &= 0x03c00000;
- } else
- msr_lo &= 0x0bc00000;
- msr_lo >>= 22;
- while (msr_lo != msr_decode_mult[j].bitmap) {
- if (msr_decode_mult[j].bitmap == 0xff)
- return 0;
- j++;
- }
-
- dprintk("speed is %u\n",
- (msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100));
-
- return msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100;
-}
-
-
-static unsigned int pentiumM_get_frequency(void)
-{
- u32 msr_lo, msr_tmp;
-
- rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
- dprintk("PM - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp);
-
- /* see table B-2 of 24547212.pdf */
- if (msr_lo & 0x00040000) {
- printk(KERN_DEBUG PFX "PM - invalid FSB: 0x%x 0x%x\n",
- msr_lo, msr_tmp);
- return 0;
- }
-
- msr_tmp = (msr_lo >> 22) & 0x1f;
- dprintk("bits 22-26 are 0x%x, speed is %u\n",
- msr_tmp, (msr_tmp * 100 * 1000));
-
- return msr_tmp * 100 * 1000;
-}
-
-static unsigned int pentium_core_get_frequency(void)
-{
- u32 fsb = 0;
- u32 msr_lo, msr_tmp;
- int ret;
-
- rdmsr(MSR_FSB_FREQ, msr_lo, msr_tmp);
- /* see table B-2 of 25366920.pdf */
- switch (msr_lo & 0x07) {
- case 5:
- fsb = 100000;
- break;
- case 1:
- fsb = 133333;
- break;
- case 3:
- fsb = 166667;
- break;
- case 2:
- fsb = 200000;
- break;
- case 0:
- fsb = 266667;
- break;
- case 4:
- fsb = 333333;
- break;
- default:
- printk(KERN_ERR "PCORE - MSR_FSB_FREQ undefined value");
- }
-
- rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
- dprintk("PCORE - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n",
- msr_lo, msr_tmp);
-
- msr_tmp = (msr_lo >> 22) & 0x1f;
- dprintk("bits 22-26 are 0x%x, speed is %u\n",
- msr_tmp, (msr_tmp * fsb));
-
- ret = (msr_tmp * fsb);
- return ret;
-}
-
-
-static unsigned int pentium4_get_frequency(void)
-{
- struct cpuinfo_x86 *c = &boot_cpu_data;
- u32 msr_lo, msr_hi, mult;
- unsigned int fsb = 0;
- unsigned int ret;
- u8 fsb_code;
-
- /* Pentium 4 Model 0 and 1 do not have the Core Clock Frequency
- * to System Bus Frequency Ratio Field in the Processor Frequency
- * Configuration Register of the MSR. Therefore the current
- * frequency cannot be calculated and has to be measured.
- */
- if (c->x86_model < 2)
- return cpu_khz;
-
- rdmsr(0x2c, msr_lo, msr_hi);
-
- dprintk("P4 - MSR_EBC_FREQUENCY_ID: 0x%x 0x%x\n", msr_lo, msr_hi);
-
- /* decode the FSB: see IA-32 Intel (C) Architecture Software
- * Developer's Manual, Volume 3: System Programming Guide,
- * revision #12 in Table B-1: MSRs in the Pentium 4 and
- * Intel Xeon Processors, on page B-4 and B-5.
- */
- fsb_code = (msr_lo >> 16) & 0x7;
- switch (fsb_code) {
- case 0:
- fsb = 100 * 1000;
- break;
- case 1:
- fsb = 13333 * 10;
- break;
- case 2:
- fsb = 200 * 1000;
- break;
- }
-
- if (!fsb)
- printk(KERN_DEBUG PFX "couldn't detect FSB speed. "
- "Please send an e-mail to <linux@brodo.de>\n");
-
- /* Multiplier. */
- mult = msr_lo >> 24;
-
- dprintk("P4 - FSB %u kHz; Multiplier %u; Speed %u kHz\n",
- fsb, mult, (fsb * mult));
-
- ret = (fsb * mult);
- return ret;
-}
-
-
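To make the arithmetic in pentium4_get_frequency() concrete, a hypothetical worked example (the values are invented for illustration and not taken from the patch): an MSR_EBC_FREQUENCY_ID readout with fsb_code = 2 and a multiplier of 15 in bits 31:24 gives fsb * mult = 200000 kHz * 15 = 3000000 kHz, i.e. a 3.0 GHz part on a 200 MHz (quad-pumped 800 MT/s) front-side bus.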
-/* Warning: may get called from smp_call_function_single. */
-unsigned int speedstep_get_frequency(unsigned int processor)
-{
- switch (processor) {
- case SPEEDSTEP_CPU_PCORE:
- return pentium_core_get_frequency();
- case SPEEDSTEP_CPU_PM:
- return pentiumM_get_frequency();
- case SPEEDSTEP_CPU_P4D:
- case SPEEDSTEP_CPU_P4M:
- return pentium4_get_frequency();
- case SPEEDSTEP_CPU_PIII_T:
- case SPEEDSTEP_CPU_PIII_C:
- case SPEEDSTEP_CPU_PIII_C_EARLY:
- return pentium3_get_frequency(processor);
- default:
- return 0;
- };
- return 0;
-}
-EXPORT_SYMBOL_GPL(speedstep_get_frequency);
-
-
-/*********************************************************************
- * DETECT SPEEDSTEP-CAPABLE PROCESSOR *
- *********************************************************************/
-
-unsigned int speedstep_detect_processor(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
- u32 ebx, msr_lo, msr_hi;
-
- dprintk("x86: %x, model: %x\n", c->x86, c->x86_model);
-
- if ((c->x86_vendor != X86_VENDOR_INTEL) ||
- ((c->x86 != 6) && (c->x86 != 0xF)))
- return 0;
-
- if (c->x86 == 0xF) {
- /* Intel Mobile Pentium 4-M
- * or Intel Mobile Pentium 4 with 533 MHz FSB */
- if (c->x86_model != 2)
- return 0;
-
- ebx = cpuid_ebx(0x00000001);
- ebx &= 0x000000FF;
-
- dprintk("ebx value is %x, x86_mask is %x\n", ebx, c->x86_mask);
-
- switch (c->x86_mask) {
- case 4:
- /*
- * B-stepping [M-P4-M]
- * sample has ebx = 0x0f, production has 0x0e.
- */
- if ((ebx == 0x0e) || (ebx == 0x0f))
- return SPEEDSTEP_CPU_P4M;
- break;
- case 7:
- /*
- * C-stepping [M-P4-M]
- * needs to have ebx=0x0e, else it's a celeron:
- * cf. 25130917.pdf / page 7, footnote 5 even
- * though 25072120.pdf / page 7 doesn't say
- * samples are only of B-stepping...
- */
- if (ebx == 0x0e)
- return SPEEDSTEP_CPU_P4M;
- break;
- case 9:
- /*
- * D-stepping [M-P4-M or M-P4/533]
- *
- * this is totally strange: CPUID 0x0F29 is
- * used by M-P4-M, M-P4/533 and(!) Celeron CPUs.
- * The latter need to be sorted out as they don't
- * support speedstep.
- * Celerons with CPUID 0x0F29 may have either
- * ebx=0x8 or 0xf -- 25130917.pdf doesn't say anything
- * specific.
- * M-P4-Ms may have either ebx=0xe or 0xf [see above]
- * M-P4/533 have either ebx=0xe or 0xf. [25317607.pdf]
- * also, M-P4M HTs have ebx=0x8, too
- * For now, they are distinguished by the model_id
- * string
- */
- if ((ebx == 0x0e) ||
- (strstr(c->x86_model_id,
- "Mobile Intel(R) Pentium(R) 4") != NULL))
- return SPEEDSTEP_CPU_P4M;
- break;
- default:
- break;
- }
- return 0;
- }
-
- switch (c->x86_model) {
- case 0x0B: /* Intel PIII [Tualatin] */
- /* cpuid_ebx(1) is 0x04 for desktop PIII,
- * 0x06 for mobile PIII-M */
- ebx = cpuid_ebx(0x00000001);
- dprintk("ebx is %x\n", ebx);
-
- ebx &= 0x000000FF;
-
- if (ebx != 0x06)
- return 0;
-
- /* So far all PIII-M processors support SpeedStep. See
- * Intel's 24540640.pdf of June 2003
- */
- return SPEEDSTEP_CPU_PIII_T;
-
- case 0x08: /* Intel PIII [Coppermine] */
-
- /* all mobile PIII Coppermines have FSB 100 MHz
- * ==> sort out a few desktop PIIIs. */
- rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_hi);
- dprintk("Coppermine: MSR_IA32_EBL_CR_POWERON is 0x%x, 0x%x\n",
- msr_lo, msr_hi);
- msr_lo &= 0x00c0000;
- if (msr_lo != 0x0080000)
- return 0;
-
- /*
- * If the processor is a mobile version,
- * platform ID has bit 50 set
- * it has SpeedStep technology if either
- * bit 56 or 57 is set
- */
- rdmsr(MSR_IA32_PLATFORM_ID, msr_lo, msr_hi);
- dprintk("Coppermine: MSR_IA32_PLATFORM ID is 0x%x, 0x%x\n",
- msr_lo, msr_hi);
- if ((msr_hi & (1<<18)) &&
- (relaxed_check ? 1 : (msr_hi & (3<<24)))) {
- if (c->x86_mask == 0x01) {
- dprintk("early PIII version\n");
- return SPEEDSTEP_CPU_PIII_C_EARLY;
- } else
- return SPEEDSTEP_CPU_PIII_C;
- }
-
- default:
- return 0;
- }
-}
-EXPORT_SYMBOL_GPL(speedstep_detect_processor);
-
-
-/*********************************************************************
- * DETECT SPEEDSTEP SPEEDS *
- *********************************************************************/
-
-unsigned int speedstep_get_freqs(unsigned int processor,
- unsigned int *low_speed,
- unsigned int *high_speed,
- unsigned int *transition_latency,
- void (*set_state) (unsigned int state))
-{
- unsigned int prev_speed;
- unsigned int ret = 0;
- unsigned long flags;
- struct timeval tv1, tv2;
-
- if ((!processor) || (!low_speed) || (!high_speed) || (!set_state))
- return -EINVAL;
-
- dprintk("trying to determine both speeds\n");
-
- /* get current speed */
- prev_speed = speedstep_get_frequency(processor);
- if (!prev_speed)
- return -EIO;
-
- dprintk("previous speed is %u\n", prev_speed);
-
- local_irq_save(flags);
-
- /* switch to low state */
- set_state(SPEEDSTEP_LOW);
- *low_speed = speedstep_get_frequency(processor);
- if (!*low_speed) {
- ret = -EIO;
- goto out;
- }
-
- dprintk("low speed is %u\n", *low_speed);
-
- /* start latency measurement */
- if (transition_latency)
- do_gettimeofday(&tv1);
-
- /* switch to high state */
- set_state(SPEEDSTEP_HIGH);
-
- /* end latency measurement */
- if (transition_latency)
- do_gettimeofday(&tv2);
-
- *high_speed = speedstep_get_frequency(processor);
- if (!*high_speed) {
- ret = -EIO;
- goto out;
- }
-
- dprintk("high speed is %u\n", *high_speed);
-
- if (*low_speed == *high_speed) {
- ret = -ENODEV;
- goto out;
- }
-
- /* switch to previous state, if necessary */
- if (*high_speed != prev_speed)
- set_state(SPEEDSTEP_LOW);
-
- if (transition_latency) {
- *transition_latency = (tv2.tv_sec - tv1.tv_sec) * USEC_PER_SEC +
- tv2.tv_usec - tv1.tv_usec;
- dprintk("transition latency is %u uSec\n", *transition_latency);
-
- /* convert uSec to nSec and add 20% for safety reasons */
- *transition_latency *= 1200;
-
- /* check if the latency measurement is too high or too low
- * and set it to a safe value (500uSec) in that case
- */
- if (*transition_latency > 10000000 ||
- *transition_latency < 50000) {
- printk(KERN_WARNING PFX "frequency transition "
- "measured seems out of range (%u "
- "nSec), falling back to a safe one of"
- "%u nSec.\n",
- *transition_latency, 500000);
- *transition_latency = 500000;
- }
- }
-
-out:
- local_irq_restore(flags);
- return ret;
-}
-EXPORT_SYMBOL_GPL(speedstep_get_freqs);
-
-#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK
-module_param(relaxed_check, int, 0444);
-MODULE_PARM_DESC(relaxed_check,
- "Don't do all checks for speedstep capability.");
-#endif
-
-MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>");
-MODULE_DESCRIPTION("Library for Intel SpeedStep 1 or 2 cpufreq drivers.");
-MODULE_LICENSE("GPL");
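speedstep_get_freqs() above times the LOW-to-HIGH switch with do_gettimeofday() and then post-processes the result. A sketch of that post-processing, for illustration only (the helper is not part of the removed file):

    /* Measured microseconds become nanoseconds with a 20% safety margin;
     * implausible results (>10 ms or <50 us) fall back to 500 us. */
    static unsigned int cook_transition_latency(unsigned int measured_us)
    {
            unsigned int ns = measured_us * 1200;   /* us -> ns, plus 20% */

            if (ns > 10000000 || ns < 50000)
                    ns = 500000;
            return ns;
    }

For example, a measured 250 us becomes 300000 ns and is kept, while a measured 20 us (24000 ns) is treated as out of range and replaced by the 500000 ns fallback.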
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
deleted file mode 100644
index 2b6c04e5a304..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
- *
- * Licensed under the terms of the GNU GPL License version 2.
- *
- * Library for common functions for Intel SpeedStep v.1 and v.2 support
- *
- * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
- */
-
-
-
-/* processors */
-
-#define SPEEDSTEP_CPU_PIII_C_EARLY 0x00000001 /* Coppermine core */
-#define SPEEDSTEP_CPU_PIII_C 0x00000002 /* Coppermine core */
-#define SPEEDSTEP_CPU_PIII_T 0x00000003 /* Tualatin core */
-#define SPEEDSTEP_CPU_P4M 0x00000004 /* P4-M */
-
-/* the following processors are not speedstep-capable and are not auto-detected
- * in speedstep_detect_processor(). However, their speed can be detected using
- * the speedstep_get_frequency() call. */
-#define SPEEDSTEP_CPU_PM 0xFFFFFF03 /* Pentium M */
-#define SPEEDSTEP_CPU_P4D 0xFFFFFF04 /* desktop P4 */
-#define SPEEDSTEP_CPU_PCORE 0xFFFFFF05 /* Core */
-
-/* speedstep states -- only two of them */
-
-#define SPEEDSTEP_HIGH 0x00000000
-#define SPEEDSTEP_LOW 0x00000001
-
-
-/* detect a speedstep-capable processor */
-extern unsigned int speedstep_detect_processor (void);
-
-/* detect the current speed (in khz) of the processor */
-extern unsigned int speedstep_get_frequency(unsigned int processor);
-
-
-/* detect the low and high speeds of the processor. The callback
- * set_state's first argument is either SPEEDSTEP_HIGH or
- * SPEEDSTEP_LOW; the second argument is zero so that no
- * cpufreq_notify_transition calls are initiated.
- */
-extern unsigned int speedstep_get_freqs(unsigned int processor,
- unsigned int *low_speed,
- unsigned int *high_speed,
- unsigned int *transition_latency,
- void (*set_state) (unsigned int state));
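A minimal usage sketch of the library interface declared above, roughly the pattern the speedstep-ich and speedstep-smi drivers follow (the probe_speedstep() wrapper and its globals are invented for illustration; set_state is the caller's own transition routine):

    static unsigned int low_khz, high_khz, latency_ns;

    static int probe_speedstep(void (*set_state)(unsigned int state))
    {
            unsigned int processor = speedstep_detect_processor();

            if (!processor)
                    return -ENODEV;         /* no SpeedStep-capable CPU */

            /* fills low_khz/high_khz by switching states once */
            return speedstep_get_freqs(processor, &low_khz, &high_khz,
                                       &latency_ns, set_state);
    }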
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
deleted file mode 100644
index befea088e4f5..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
+++ /dev/null
@@ -1,468 +0,0 @@
-/*
- * Intel SpeedStep SMI driver.
- *
- * (C) 2003 Hiroshi Miura <miura@da-cha.org>
- *
- * Licensed under the terms of the GNU GPL License version 2.
- *
- */
-
-
-/*********************************************************************
- * SPEEDSTEP - DEFINITIONS *
- *********************************************************************/
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/slab.h>
-#include <linux/delay.h>
-#include <linux/io.h>
-#include <asm/ist.h>
-
-#include "speedstep-lib.h"
-
-/* speedstep system management interface port/command.
- *
- * These parameters are obtained from the IST-SMI BIOS call.
- * If the user supplies them, the user-provided values are used instead.
- *
- */
-static int smi_port;
-static int smi_cmd;
-static unsigned int smi_sig;
-
-/* info about the processor */
-static unsigned int speedstep_processor;
-
-/*
- * There are only two frequency states for each processor. Values
- * are in kHz for the time being.
- */
-static struct cpufreq_frequency_table speedstep_freqs[] = {
- {SPEEDSTEP_HIGH, 0},
- {SPEEDSTEP_LOW, 0},
- {0, CPUFREQ_TABLE_END},
-};
-
-#define GET_SPEEDSTEP_OWNER 0
-#define GET_SPEEDSTEP_STATE 1
-#define SET_SPEEDSTEP_STATE 2
-#define GET_SPEEDSTEP_FREQS 4
-
-/* how often shall the SMI call be tried if it failed, e.g. because
- * of DMA activity going on? */
-#define SMI_TRIES 5
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "speedstep-smi", msg)
-
-/**
- * speedstep_smi_ownership
- */
-static int speedstep_smi_ownership(void)
-{
- u32 command, result, magic, dummy;
- u32 function = GET_SPEEDSTEP_OWNER;
- unsigned char magic_data[] = "Copyright (c) 1999 Intel Corporation";
-
- command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
- magic = virt_to_phys(magic_data);
-
- dprintk("trying to obtain ownership with command %x at port %x\n",
- command, smi_port);
-
- __asm__ __volatile__(
- "push %%ebp\n"
- "out %%al, (%%dx)\n"
- "pop %%ebp\n"
- : "=D" (result),
- "=a" (dummy), "=b" (dummy), "=c" (dummy), "=d" (dummy),
- "=S" (dummy)
- : "a" (command), "b" (function), "c" (0), "d" (smi_port),
- "D" (0), "S" (magic)
- : "memory"
- );
-
- dprintk("result is %x\n", result);
-
- return result;
-}
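All of the SMI calls in this file share the register convention visible in the inline assembly above: EAX carries the command word, EBX the function code (GET_SPEEDSTEP_OWNER and friends), ECX an argument such as the requested state, and EDX the SMI I/O port that the OUT instruction targets; results come back in registers when the BIOS SMI handler returns. A tiny sketch of how the command word is assembled (the helper name is invented; the expression is the one used verbatim in the calls above):

    /* BIOS-supplied signature in the upper 24 bits, command byte below. */
    static u32 speedstep_smi_command(u32 sig, u32 cmd)
    {
            return (sig & 0xffffff00) | (cmd & 0xff);
    }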
-
-/**
- * speedstep_smi_get_freqs - get SpeedStep preferred & current freq.
- * @low: the low frequency value is placed here
- * @high: the high frequency value is placed here
- *
- * Only available on later SpeedStep-enabled systems, returns false results or
- * even hangs [cf. bugme.osdl.org # 1422] on earlier systems. Empirical testing
- * shows that the latter occurs if !(ist_info.event & 0xFFFF).
- */
-static int speedstep_smi_get_freqs(unsigned int *low, unsigned int *high)
-{
- u32 command, result = 0, edi, high_mhz, low_mhz, dummy;
- u32 state = 0;
- u32 function = GET_SPEEDSTEP_FREQS;
-
- if (!(ist_info.event & 0xFFFF)) {
- dprintk("bug #1422 -- can't read freqs from BIOS\n");
- return -ENODEV;
- }
-
- command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
-
- dprintk("trying to determine frequencies with command %x at port %x\n",
- command, smi_port);
-
- __asm__ __volatile__(
- "push %%ebp\n"
- "out %%al, (%%dx)\n"
- "pop %%ebp"
- : "=a" (result),
- "=b" (high_mhz),
- "=c" (low_mhz),
- "=d" (state), "=D" (edi), "=S" (dummy)
- : "a" (command),
- "b" (function),
- "c" (state),
- "d" (smi_port), "S" (0), "D" (0)
- );
-
- dprintk("result %x, low_freq %u, high_freq %u\n",
- result, low_mhz, high_mhz);
-
- /* abort if results are obviously incorrect... */
- if ((high_mhz + low_mhz) < 600)
- return -EINVAL;
-
- *high = high_mhz * 1000;
- *low = low_mhz * 1000;
-
- return result;
-}
-
-/**
- * speedstep_get_state - read the current SpeedStep state
- *
- * Returns the current frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH).
- */
-static int speedstep_get_state(void)
-{
- u32 function = GET_SPEEDSTEP_STATE;
- u32 result, state, edi, command, dummy;
-
- command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
-
- dprintk("trying to determine current setting with command %x "
- "at port %x\n", command, smi_port);
-
- __asm__ __volatile__(
- "push %%ebp\n"
- "out %%al, (%%dx)\n"
- "pop %%ebp\n"
- : "=a" (result),
- "=b" (state), "=D" (edi),
- "=c" (dummy), "=d" (dummy), "=S" (dummy)
- : "a" (command), "b" (function), "c" (0),
- "d" (smi_port), "S" (0), "D" (0)
- );
-
- dprintk("state is %x, result is %x\n", state, result);
-
- return state & 1;
-}
-
-
-/**
- * speedstep_set_state - set the SpeedStep state
- * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
- *
- */
-static void speedstep_set_state(unsigned int state)
-{
- unsigned int result = 0, command, new_state, dummy;
- unsigned long flags;
- unsigned int function = SET_SPEEDSTEP_STATE;
- unsigned int retry = 0;
-
- if (state > 0x1)
- return;
-
- /* Disable IRQs */
- local_irq_save(flags);
-
- command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
-
- dprintk("trying to set frequency to state %u "
- "with command %x at port %x\n",
- state, command, smi_port);
-
- do {
- if (retry) {
- dprintk("retry %u, previous result %u, waiting...\n",
- retry, result);
- mdelay(retry * 50);
- }
- retry++;
- __asm__ __volatile__(
- "push %%ebp\n"
- "out %%al, (%%dx)\n"
- "pop %%ebp"
- : "=b" (new_state), "=D" (result),
- "=c" (dummy), "=a" (dummy),
- "=d" (dummy), "=S" (dummy)
- : "a" (command), "b" (function), "c" (state),
- "d" (smi_port), "S" (0), "D" (0)
- );
- } while ((new_state != state) && (retry <= SMI_TRIES));
-
- /* enable IRQs */
- local_irq_restore(flags);
-
- if (new_state == state)
- dprintk("change to %u MHz succeeded after %u tries "
- "with result %u\n",
- (speedstep_freqs[new_state].frequency / 1000),
- retry, result);
- else
- printk(KERN_ERR "cpufreq: change to state %u "
- "failed with new_state %u and result %u\n",
- state, new_state, result);
-
- return;
-}
-
-
-/**
- * speedstep_target - set a new CPUFreq policy
- * @policy: new policy
- * @target_freq: new freq
- * @relation:
- *
- * Sets a new CPUFreq policy/freq.
- */
-static int speedstep_target(struct cpufreq_policy *policy,
- unsigned int target_freq, unsigned int relation)
-{
- unsigned int newstate = 0;
- struct cpufreq_freqs freqs;
-
- if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0],
- target_freq, relation, &newstate))
- return -EINVAL;
-
- freqs.old = speedstep_freqs[speedstep_get_state()].frequency;
- freqs.new = speedstep_freqs[newstate].frequency;
- freqs.cpu = 0; /* speedstep.c is a UP-only driver */
-
- if (freqs.old == freqs.new)
- return 0;
-
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
- speedstep_set_state(newstate);
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-
- return 0;
-}
-
-
-/**
- * speedstep_verify - verifies a new CPUFreq policy
- * @policy: new policy
- *
- * Limit must be within speedstep_low_freq and speedstep_high_freq, with
- * at least one border included.
- */
-static int speedstep_verify(struct cpufreq_policy *policy)
-{
- return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]);
-}
-
-
-static int speedstep_cpu_init(struct cpufreq_policy *policy)
-{
- int result;
- unsigned int speed, state;
- unsigned int *low, *high;
-
- /* capability check */
- if (policy->cpu != 0)
- return -ENODEV;
-
- result = speedstep_smi_ownership();
- if (result) {
- dprintk("fails in aquiring ownership of a SMI interface.\n");
- return -EINVAL;
- }
-
- /* detect low and high frequency */
- low = &speedstep_freqs[SPEEDSTEP_LOW].frequency;
- high = &speedstep_freqs[SPEEDSTEP_HIGH].frequency;
-
- result = speedstep_smi_get_freqs(low, high);
- if (result) {
- /* fall back to the speedstep_lib.c detection mechanism:
- * try both states out */
- dprintk("could not detect low and high frequencies "
- "by SMI call.\n");
- result = speedstep_get_freqs(speedstep_processor,
- low, high,
- NULL,
- &speedstep_set_state);
-
- if (result) {
- dprintk("could not detect two different speeds"
- " -- aborting.\n");
- return result;
- } else
- dprintk("workaround worked.\n");
- }
-
- /* get current speed setting */
- state = speedstep_get_state();
- speed = speedstep_freqs[state].frequency;
-
- dprintk("currently at %s speed setting - %i MHz\n",
- (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency)
- ? "low" : "high",
- (speed / 1000));
-
- /* cpuinfo and default policy values */
- policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
- policy->cur = speed;
-
- result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs);
- if (result)
- return result;
-
- cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu);
-
- return 0;
-}
-
-static int speedstep_cpu_exit(struct cpufreq_policy *policy)
-{
- cpufreq_frequency_table_put_attr(policy->cpu);
- return 0;
-}
-
-static unsigned int speedstep_get(unsigned int cpu)
-{
- if (cpu)
- return -ENODEV;
- return speedstep_get_frequency(speedstep_processor);
-}
-
-
-static int speedstep_resume(struct cpufreq_policy *policy)
-{
- int result = speedstep_smi_ownership();
-
- if (result)
- dprintk("fails in re-aquiring ownership of a SMI interface.\n");
-
- return result;
-}
-
-static struct freq_attr *speedstep_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-static struct cpufreq_driver speedstep_driver = {
- .name = "speedstep-smi",
- .verify = speedstep_verify,
- .target = speedstep_target,
- .init = speedstep_cpu_init,
- .exit = speedstep_cpu_exit,
- .get = speedstep_get,
- .resume = speedstep_resume,
- .owner = THIS_MODULE,
- .attr = speedstep_attr,
-};
-
-/**
- * speedstep_init - initializes the SpeedStep CPUFreq driver
- *
- * Initializes the SpeedStep support. Returns -ENODEV on unsupported
- * BIOS, -EINVAL on problems during initialization, and zero on
- * success.
- */
-static int __init speedstep_init(void)
-{
- speedstep_processor = speedstep_detect_processor();
-
- switch (speedstep_processor) {
- case SPEEDSTEP_CPU_PIII_T:
- case SPEEDSTEP_CPU_PIII_C:
- case SPEEDSTEP_CPU_PIII_C_EARLY:
- break;
- default:
- speedstep_processor = 0;
- }
-
- if (!speedstep_processor) {
- dprintk("No supported Intel CPU detected.\n");
- return -ENODEV;
- }
-
- dprintk("signature:0x%.8lx, command:0x%.8lx, "
- "event:0x%.8lx, perf_level:0x%.8lx.\n",
- ist_info.signature, ist_info.command,
- ist_info.event, ist_info.perf_level);
-
- /* Error if no IST-SMI BIOS or no PARM
- sig= 'ISGE' aka 'Intel Speedstep Gate E' */
- if ((ist_info.signature != 0x47534943) && (
- (smi_port == 0) || (smi_cmd == 0)))
- return -ENODEV;
-
- if (smi_sig == 1)
- smi_sig = 0x47534943;
- else
- smi_sig = ist_info.signature;
-
- /* set up smi_port from MODULE_PARM or BIOS */
- if ((smi_port > 0xff) || (smi_port < 0))
- return -EINVAL;
- else if (smi_port == 0)
- smi_port = ist_info.command & 0xff;
-
- if ((smi_cmd > 0xff) || (smi_cmd < 0))
- return -EINVAL;
- else if (smi_cmd == 0)
- smi_cmd = (ist_info.command >> 16) & 0xff;
-
- return cpufreq_register_driver(&speedstep_driver);
-}
-
-
-/**
- * speedstep_exit - unregisters SpeedStep support
- *
- * Unregisters SpeedStep support.
- */
-static void __exit speedstep_exit(void)
-{
- cpufreq_unregister_driver(&speedstep_driver);
-}
-
-module_param(smi_port, int, 0444);
-module_param(smi_cmd, int, 0444);
-module_param(smi_sig, uint, 0444);
-
-MODULE_PARM_DESC(smi_port, "Override the BIOS-given IST port with this value "
- "-- Intel's default setting is 0xb2");
-MODULE_PARM_DESC(smi_cmd, "Override the BIOS-given IST command with this value "
- "-- Intel's default setting is 0x82");
-MODULE_PARM_DESC(smi_sig, "Set to 1 to fake the IST signature when using the "
- "SMI interface.");
-
-MODULE_AUTHOR("Hiroshi Miura");
-MODULE_DESCRIPTION("Speedstep driver for IST applet SMI interface.");
-MODULE_LICENSE("GPL");
-
-module_init(speedstep_init);
-module_exit(speedstep_exit);
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
new file mode 100644
index 000000000000..146f6f8b0650
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -0,0 +1,192 @@
+/* Declare dependencies between CPUIDs */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <asm/cpufeature.h>
+
+struct cpuid_dep {
+ unsigned int feature;
+ unsigned int depends;
+};
+
+/*
+ * Table of CPUID features that depend on others.
+ *
+ * This only includes dependencies that can be usefully disabled, not
+ * features part of the base set (like FPU).
+ *
+ * Note this all is not __init / __initdata because it can be
+ * called from cpu hotplug. It shouldn't do anything in this case,
+ * but it's difficult to tell that to the init reference checker.
+ */
+static const struct cpuid_dep cpuid_deps[] = {
+ { X86_FEATURE_FXSR, X86_FEATURE_FPU },
+ { X86_FEATURE_XSAVEOPT, X86_FEATURE_XSAVE },
+ { X86_FEATURE_XSAVEC, X86_FEATURE_XSAVE },
+ { X86_FEATURE_XSAVES, X86_FEATURE_XSAVE },
+ { X86_FEATURE_AVX, X86_FEATURE_XSAVE },
+ { X86_FEATURE_PKU, X86_FEATURE_XSAVE },
+ { X86_FEATURE_MPX, X86_FEATURE_XSAVE },
+ { X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE },
+ { X86_FEATURE_APX, X86_FEATURE_XSAVE },
+ { X86_FEATURE_CMOV, X86_FEATURE_FXSR },
+ { X86_FEATURE_MMX, X86_FEATURE_FXSR },
+ { X86_FEATURE_MMXEXT, X86_FEATURE_MMX },
+ { X86_FEATURE_FXSR_OPT, X86_FEATURE_FXSR },
+ { X86_FEATURE_XSAVE, X86_FEATURE_FXSR },
+ { X86_FEATURE_XMM, X86_FEATURE_FXSR },
+ { X86_FEATURE_XMM2, X86_FEATURE_XMM },
+ { X86_FEATURE_XMM3, X86_FEATURE_XMM2 },
+ { X86_FEATURE_XMM4_1, X86_FEATURE_XMM2 },
+ { X86_FEATURE_XMM4_2, X86_FEATURE_XMM2 },
+ { X86_FEATURE_XMM3, X86_FEATURE_XMM2 },
+ { X86_FEATURE_PCLMULQDQ, X86_FEATURE_XMM2 },
+ { X86_FEATURE_SSSE3, X86_FEATURE_XMM2, },
+ { X86_FEATURE_F16C, X86_FEATURE_XMM2, },
+ { X86_FEATURE_AES, X86_FEATURE_XMM2 },
+ { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 },
+ { X86_FEATURE_GFNI, X86_FEATURE_XMM2 },
+ { X86_FEATURE_AVX_VNNI, X86_FEATURE_AVX },
+ { X86_FEATURE_FMA, X86_FEATURE_AVX },
+ { X86_FEATURE_VAES, X86_FEATURE_AVX },
+ { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX },
+ { X86_FEATURE_AVX2, X86_FEATURE_AVX, },
+ { X86_FEATURE_AVX512F, X86_FEATURE_AVX, },
+ { X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512PF, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512ER, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512CD, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512DQ, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512BW, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_VP2INTERSECT, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_CQM_OCCUP_LLC, X86_FEATURE_CQM_LLC },
+ { X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC },
+ { X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC },
+ { X86_FEATURE_BMEC, X86_FEATURE_CQM_MBM_TOTAL },
+ { X86_FEATURE_BMEC, X86_FEATURE_CQM_MBM_LOCAL },
+ { X86_FEATURE_SDCIAE, X86_FEATURE_CAT_L3 },
+ { X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_FP16, X86_FEATURE_AVX512BW },
+ { X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES },
+ { X86_FEATURE_PER_THREAD_MBA, X86_FEATURE_MBA },
+ { X86_FEATURE_SGX_LC, X86_FEATURE_SGX },
+ { X86_FEATURE_SGX1, X86_FEATURE_SGX },
+ { X86_FEATURE_SGX2, X86_FEATURE_SGX1 },
+ { X86_FEATURE_SGX_EUPDATESVN, X86_FEATURE_SGX1 },
+ { X86_FEATURE_SGX_EDECCSSA, X86_FEATURE_SGX1 },
+ { X86_FEATURE_XFD, X86_FEATURE_XSAVES },
+ { X86_FEATURE_XFD, X86_FEATURE_XGETBV1 },
+ { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD },
+ { X86_FEATURE_AMX_FP16, X86_FEATURE_AMX_TILE },
+ { X86_FEATURE_AMX_BF16, X86_FEATURE_AMX_TILE },
+ { X86_FEATURE_AMX_INT8, X86_FEATURE_AMX_TILE },
+ { X86_FEATURE_SHSTK, X86_FEATURE_XSAVES },
+ { X86_FEATURE_FRED, X86_FEATURE_LKGS },
+ { X86_FEATURE_SPEC_CTRL_SSBD, X86_FEATURE_SPEC_CTRL },
+ { X86_FEATURE_LASS, X86_FEATURE_SMAP },
+ {}
+};
+
+static inline void clear_feature(struct cpuinfo_x86 *c, unsigned int feature)
+{
+ /*
+ * Note: This could use the non-atomic __*_bit() variants, but the
+ * rest of the cpufeature code uses atomics as well, so keep it for
+ * consistency. Cleanup all of it separately.
+ */
+ if (!c) {
+ clear_cpu_cap(&boot_cpu_data, feature);
+ set_bit(feature, (unsigned long *)cpu_caps_cleared);
+ } else {
+ clear_bit(feature, (unsigned long *)c->x86_capability);
+ }
+}
+
+/* Take the capabilities and the BUG bits into account */
+#define MAX_FEATURE_BITS ((NCAPINTS + NBUGINTS) * sizeof(u32) * 8)
+
+static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
+{
+ DECLARE_BITMAP(disable, MAX_FEATURE_BITS);
+ const struct cpuid_dep *d;
+ bool changed;
+
+ if (WARN_ON(feature >= MAX_FEATURE_BITS))
+ return;
+
+ if (boot_cpu_has(feature))
+ WARN_ON(alternatives_patched);
+
+ clear_feature(c, feature);
+
+ /* Collect all features to disable, handling dependencies */
+ memset(disable, 0, sizeof(disable));
+ __set_bit(feature, disable);
+
+ /* Loop until we get a stable state. */
+ do {
+ changed = false;
+ for (d = cpuid_deps; d->feature; d++) {
+ if (!test_bit(d->depends, disable))
+ continue;
+ if (__test_and_set_bit(d->feature, disable))
+ continue;
+
+ changed = true;
+ clear_feature(c, d->feature);
+ }
+ } while (changed);
+}
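
The loop above keeps re-scanning the dependency table until a full pass disables nothing further, i.e. it computes the transitive closure of the disabled set. Below is a minimal user-space sketch of the same fixed-point idea; the feature IDs and the dependency table are invented for the example and are not kernel values.

#include <stdbool.h>
#include <stdio.h>

enum { FEAT_XSAVE, FEAT_AVX, FEAT_AVX2, FEAT_AVX512F, NFEAT };

struct dep { int feature; int depends; };

static const struct dep deps[] = {
	{ FEAT_AVX,     FEAT_XSAVE },
	{ FEAT_AVX2,    FEAT_AVX },
	{ FEAT_AVX512F, FEAT_AVX },
	{ -1, -1 },
};

static void disable_with_deps(bool *enabled, int feature)
{
	bool changed;

	enabled[feature] = false;

	/* Loop until a full pass disables nothing further. */
	do {
		changed = false;
		for (const struct dep *d = deps; d->feature >= 0; d++) {
			if (!enabled[d->depends] && enabled[d->feature]) {
				enabled[d->feature] = false;
				changed = true;
			}
		}
	} while (changed);
}

int main(void)
{
	bool enabled[NFEAT] = { true, true, true, true };
	int i;

	disable_with_deps(enabled, FEAT_XSAVE);
	for (i = 0; i < NFEAT; i++)
		printf("feature %d: %s\n", i, enabled[i] ? "on" : "off");
	return 0;
}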
+
+void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
+{
+ do_clear_cpu_cap(c, feature);
+}
+
+void setup_clear_cpu_cap(unsigned int feature)
+{
+ do_clear_cpu_cap(NULL, feature);
+}
+
+/*
+ * Return the feature "name" if available, otherwise return
+ * the X86_FEATURE_* numerals to make it easier to identify
+ * the feature.
+ */
+static const char *x86_feature_name(unsigned int feature, char *buf)
+{
+ if (x86_cap_flags[feature])
+ return x86_cap_flags[feature];
+
+ snprintf(buf, 16, "%d*32+%2d", feature / 32, feature % 32);
+
+ return buf;
+}
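
When a feature has no name in x86_cap_flags[], the fallback string encodes its bit position as "<word>*32+<bit>", matching how scattered feature bits are usually referred to. A tiny stand-alone check of that formatting, using an arbitrary feature number:

#include <stdio.h>

int main(void)
{
	unsigned int feature = 285;	/* arbitrary example bit */
	char buf[16];

	snprintf(buf, sizeof(buf), "%d*32+%2d", feature / 32, feature % 32);
	printf("%s\n", buf);	/* prints "8*32+29" */
	return 0;
}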
+
+void check_cpufeature_deps(struct cpuinfo_x86 *c)
+{
+ char feature_buf[16], depends_buf[16];
+ const struct cpuid_dep *d;
+
+ for (d = cpuid_deps; d->feature; d++) {
+ if (cpu_has(c, d->feature) && !cpu_has(c, d->depends)) {
+ /*
+ * Only warn about the first unmet dependency on the
+ * first CPU where it is encountered to avoid spamming
+ * the kernel log.
+ */
+ pr_warn_once("x86 CPU feature dependency check failure: CPU%d has '%s' enabled but '%s' disabled. Kernel might be fine, but no guarantees.\n",
+ smp_processor_id(),
+ x86_feature_name(d->feature, feature_buf),
+ x86_feature_name(d->depends, depends_buf));
+ }
+ }
+}
diff --git a/arch/x86/kernel/cpu/cpuid_0x2_table.c b/arch/x86/kernel/cpu/cpuid_0x2_table.c
new file mode 100644
index 000000000000..89bc8db5e9c6
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpuid_0x2_table.c
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/sizes.h>
+
+#include <asm/cpuid/types.h>
+
+#include "cpu.h"
+
+#define CACHE_ENTRY(_desc, _type, _size) \
+ [_desc] = { \
+ .c_type = (_type), \
+ .c_size = (_size) / SZ_1K, \
+ }
+
+#define TLB_ENTRY(_desc, _type, _entries) \
+ [_desc] = { \
+ .t_type = (_type), \
+ .entries = (_entries), \
+ }
+
+const struct leaf_0x2_table cpuid_0x2_table[256] = {
+ CACHE_ENTRY(0x06, CACHE_L1_INST, SZ_8K ), /* 4-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x08, CACHE_L1_INST, SZ_16K ), /* 4-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x09, CACHE_L1_INST, SZ_32K ), /* 4-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x0a, CACHE_L1_DATA, SZ_8K ), /* 2 way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x0c, CACHE_L1_DATA, SZ_16K ), /* 4-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x0d, CACHE_L1_DATA, SZ_16K ), /* 4-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x0e, CACHE_L1_DATA, SZ_24K ), /* 6-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x21, CACHE_L2, SZ_256K ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x22, CACHE_L3, SZ_512K ), /* 4-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x23, CACHE_L3, SZ_1M ), /* 8-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x25, CACHE_L3, SZ_2M ), /* 8-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x29, CACHE_L3, SZ_4M ), /* 8-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x2c, CACHE_L1_DATA, SZ_32K ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x30, CACHE_L1_INST, SZ_32K ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x39, CACHE_L2, SZ_128K ), /* 4-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x3a, CACHE_L2, SZ_192K ), /* 6-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x3b, CACHE_L2, SZ_128K ), /* 2-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x3c, CACHE_L2, SZ_256K ), /* 4-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x3d, CACHE_L2, SZ_384K ), /* 6-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x3e, CACHE_L2, SZ_512K ), /* 4-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x3f, CACHE_L2, SZ_256K ), /* 2-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x41, CACHE_L2, SZ_128K ), /* 4-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x42, CACHE_L2, SZ_256K ), /* 4-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x43, CACHE_L2, SZ_512K ), /* 4-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x44, CACHE_L2, SZ_1M ), /* 4-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x45, CACHE_L2, SZ_2M ), /* 4-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x46, CACHE_L3, SZ_4M ), /* 4-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x47, CACHE_L3, SZ_8M ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x48, CACHE_L2, SZ_3M ), /* 12-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x49, CACHE_L3, SZ_4M ), /* 16-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x4a, CACHE_L3, SZ_6M ), /* 12-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x4b, CACHE_L3, SZ_8M ), /* 16-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x4c, CACHE_L3, SZ_12M ), /* 12-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x4d, CACHE_L3, SZ_16M ), /* 16-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x4e, CACHE_L2, SZ_6M ), /* 24-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x60, CACHE_L1_DATA, SZ_16K ), /* 8-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x66, CACHE_L1_DATA, SZ_8K ), /* 4-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x67, CACHE_L1_DATA, SZ_16K ), /* 4-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x68, CACHE_L1_DATA, SZ_32K ), /* 4-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x78, CACHE_L2, SZ_1M ), /* 4-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x79, CACHE_L2, SZ_128K ), /* 8-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x7a, CACHE_L2, SZ_256K ), /* 8-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x7b, CACHE_L2, SZ_512K ), /* 8-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x7c, CACHE_L2, SZ_1M ), /* 8-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x7d, CACHE_L2, SZ_2M ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x7f, CACHE_L2, SZ_512K ), /* 2-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x80, CACHE_L2, SZ_512K ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x82, CACHE_L2, SZ_256K ), /* 8-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x83, CACHE_L2, SZ_512K ), /* 8-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x84, CACHE_L2, SZ_1M ), /* 8-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x85, CACHE_L2, SZ_2M ), /* 8-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x86, CACHE_L2, SZ_512K ), /* 4-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x87, CACHE_L2, SZ_1M ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xd0, CACHE_L3, SZ_512K ), /* 4-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xd1, CACHE_L3, SZ_1M ), /* 4-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xd2, CACHE_L3, SZ_2M ), /* 4-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xd6, CACHE_L3, SZ_1M ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xd7, CACHE_L3, SZ_2M ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xd8, CACHE_L3, SZ_4M ), /* 12-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xdc, CACHE_L3, SZ_2M ), /* 12-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xdd, CACHE_L3, SZ_4M ), /* 12-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xde, CACHE_L3, SZ_8M ), /* 12-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xe2, CACHE_L3, SZ_2M ), /* 16-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xe3, CACHE_L3, SZ_4M ), /* 16-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xe4, CACHE_L3, SZ_8M ), /* 16-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xea, CACHE_L3, SZ_12M ), /* 24-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xeb, CACHE_L3, SZ_18M ), /* 24-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xec, CACHE_L3, SZ_24M ), /* 24-way set assoc, 64 byte line size */
+
+ TLB_ENTRY( 0x01, TLB_INST_4K, 32 ), /* TLB_INST 4 KByte pages, 4-way set associative */
+ TLB_ENTRY( 0x02, TLB_INST_4M, 2 ), /* TLB_INST 4 MByte pages, full associative */
+ TLB_ENTRY( 0x03, TLB_DATA_4K, 64 ), /* TLB_DATA 4 KByte pages, 4-way set associative */
+ TLB_ENTRY( 0x04, TLB_DATA_4M, 8 ), /* TLB_DATA 4 MByte pages, 4-way set associative */
+ TLB_ENTRY( 0x05, TLB_DATA_4M, 32 ), /* TLB_DATA 4 MByte pages, 4-way set associative */
+ TLB_ENTRY( 0x0b, TLB_INST_4M, 4 ), /* TLB_INST 4 MByte pages, 4-way set associative */
+ TLB_ENTRY( 0x4f, TLB_INST_4K, 32 ), /* TLB_INST 4 KByte pages */
+ TLB_ENTRY( 0x50, TLB_INST_ALL, 64 ), /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */
+ TLB_ENTRY( 0x51, TLB_INST_ALL, 128 ), /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */
+ TLB_ENTRY( 0x52, TLB_INST_ALL, 256 ), /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */
+ TLB_ENTRY( 0x55, TLB_INST_2M_4M, 7 ), /* TLB_INST 2-MByte or 4-MByte pages, fully associative */
+ TLB_ENTRY( 0x56, TLB_DATA0_4M, 16 ), /* TLB_DATA0 4 MByte pages, 4-way set associative */
+ TLB_ENTRY( 0x57, TLB_DATA0_4K, 16 ), /* TLB_DATA0 4 KByte pages, 4-way associative */
+ TLB_ENTRY( 0x59, TLB_DATA0_4K, 16 ), /* TLB_DATA0 4 KByte pages, fully associative */
+ TLB_ENTRY( 0x5a, TLB_DATA0_2M_4M, 32 ), /* TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative */
+ TLB_ENTRY( 0x5b, TLB_DATA_4K_4M, 64 ), /* TLB_DATA 4 KByte and 4 MByte pages */
+ TLB_ENTRY( 0x5c, TLB_DATA_4K_4M, 128 ), /* TLB_DATA 4 KByte and 4 MByte pages */
+ TLB_ENTRY( 0x5d, TLB_DATA_4K_4M, 256 ), /* TLB_DATA 4 KByte and 4 MByte pages */
+ TLB_ENTRY( 0x61, TLB_INST_4K, 48 ), /* TLB_INST 4 KByte pages, full associative */
+ TLB_ENTRY( 0x63, TLB_DATA_1G_2M_4M, 4 ), /* TLB_DATA 1 GByte pages, 4-way set associative
+ * (plus 32 entries TLB_DATA 2 MByte or 4 MByte pages, not encoded here) */
+ TLB_ENTRY( 0x6b, TLB_DATA_4K, 256 ), /* TLB_DATA 4 KByte pages, 8-way associative */
+ TLB_ENTRY( 0x6c, TLB_DATA_2M_4M, 128 ), /* TLB_DATA 2 MByte or 4 MByte pages, 8-way associative */
+ TLB_ENTRY( 0x6d, TLB_DATA_1G, 16 ), /* TLB_DATA 1 GByte pages, fully associative */
+ TLB_ENTRY( 0x76, TLB_INST_2M_4M, 8 ), /* TLB_INST 2-MByte or 4-MByte pages, fully associative */
+ TLB_ENTRY( 0xb0, TLB_INST_4K, 128 ), /* TLB_INST 4 KByte pages, 4-way set associative */
+ TLB_ENTRY( 0xb1, TLB_INST_2M_4M, 4 ), /* TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries */
+ TLB_ENTRY( 0xb2, TLB_INST_4K, 64 ), /* TLB_INST 4KByte pages, 4-way set associative */
+ TLB_ENTRY( 0xb3, TLB_DATA_4K, 128 ), /* TLB_DATA 4 KByte pages, 4-way set associative */
+ TLB_ENTRY( 0xb4, TLB_DATA_4K, 256 ), /* TLB_DATA 4 KByte pages, 4-way associative */
+ TLB_ENTRY( 0xb5, TLB_INST_4K, 64 ), /* TLB_INST 4 KByte pages, 8-way set associative */
+ TLB_ENTRY( 0xb6, TLB_INST_4K, 128 ), /* TLB_INST 4 KByte pages, 8-way set associative */
+ TLB_ENTRY( 0xba, TLB_DATA_4K, 64 ), /* TLB_DATA 4 KByte pages, 4-way associative */
+ TLB_ENTRY( 0xc0, TLB_DATA_4K_4M, 8 ), /* TLB_DATA 4 KByte and 4 MByte pages, 4-way associative */
+ TLB_ENTRY( 0xc1, STLB_4K_2M, 1024 ), /* STLB 4 KByte and 2 MByte pages, 8-way associative */
+ TLB_ENTRY( 0xc2, TLB_DATA_2M_4M, 16 ), /* TLB_DATA 2 MByte/4MByte pages, 4-way associative */
+ TLB_ENTRY( 0xca, STLB_4K, 512 ), /* STLB 4 KByte pages, 4-way associative */
+};
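
Each entry in the table above is keyed directly by the one-byte descriptor that CPUID leaf 0x2 returns, so unknown descriptors simply land on a zeroed slot. The sketch below shows the same sparse designated-initializer lookup with a simplified local struct and two sample entries; it is an illustration only, not the kernel's leaf_0x2_table type.

#include <stdio.h>

struct desc_info {
	const char	*what;
	unsigned int	val;	/* KB for caches, entries for TLBs */
};

static const struct desc_info table[256] = {
	[0x2c] = { "L1 data cache, 32 KB",		32 },
	[0xca] = { "shared 4K TLB, 512 entries",	512 },
};

static void decode(unsigned char desc)
{
	const struct desc_info *e = &table[desc];

	if (!e->what)
		printf("0x%02x: unknown descriptor\n", desc);
	else
		printf("0x%02x: %s\n", desc, e->what);
}

int main(void)
{
	decode(0x2c);
	decode(0xca);
	decode(0x55);	/* not in the sample table */
	return 0;
}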
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 593171e967ef..dfec2c61e354 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -1,21 +1,25 @@
-#include <linux/init.h>
+// SPDX-License-Identifier: GPL-2.0
#include <linux/bitops.h>
#include <linux/delay.h>
+#include <linux/isa-dma.h>
#include <linux/pci.h>
#include <asm/dma.h>
-#include <asm/io.h>
+#include <linux/io.h>
#include <asm/processor-cyrix.h>
#include <asm/processor-flags.h>
-#include <asm/timer.h>
+#include <linux/timer.h>
#include <asm/pci-direct.h>
#include <asm/tsc.h>
+#include <asm/cpufeature.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
#include "cpu.h"
/*
* Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. about the CPU
*/
-static void __cpuinit __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
+static void __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
{
unsigned char ccr2, ccr3;
@@ -44,7 +48,7 @@ static void __cpuinit __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
}
}
-static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
+static void do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
{
unsigned long flags;
@@ -59,25 +63,25 @@ static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
* Actually since bugs.h doesn't even reference this perhaps someone should
* fix the documentation ???
*/
-static unsigned char Cx86_dir0_msb __cpuinitdata = 0;
+static unsigned char Cx86_dir0_msb = 0;
-static const char __cpuinitconst Cx86_model[][9] = {
+static const char Cx86_model[][9] = {
"Cx486", "Cx486", "5x86 ", "6x86", "MediaGX ", "6x86MX ",
"M II ", "Unknown"
};
-static const char __cpuinitconst Cx486_name[][5] = {
+static const char Cx486_name[][5] = {
"SLC", "DLC", "SLC2", "DLC2", "SRx", "DRx",
"SRx2", "DRx2"
};
-static const char __cpuinitconst Cx486S_name[][4] = {
+static const char Cx486S_name[][4] = {
"S", "S2", "Se", "S2e"
};
-static const char __cpuinitconst Cx486D_name[][4] = {
+static const char Cx486D_name[][4] = {
"DX", "DX2", "?", "?", "?", "DX4"
};
-static char Cx86_cb[] __cpuinitdata = "?.5x Core/Bus Clock";
-static const char __cpuinitconst cyrix_model_mult1[] = "12??43";
-static const char __cpuinitconst cyrix_model_mult2[] = "12233445";
+static char Cx86_cb[] = "?.5x Core/Bus Clock";
+static const char cyrix_model_mult1[] = "12??43";
+static const char cyrix_model_mult2[] = "12233445";
/*
* Reset the slow-loop (SLOP) bit on the 686(L) which is set by some old
@@ -87,7 +91,7 @@ static const char __cpuinitconst cyrix_model_mult2[] = "12233445";
* FIXME: our newer udelay uses the tsc. We don't need to frob with SLOP
*/
-static void __cpuinit check_cx686_slop(struct cpuinfo_x86 *c)
+static void check_cx686_slop(struct cpuinfo_x86 *c)
{
unsigned long flags;
@@ -104,7 +108,7 @@ static void __cpuinit check_cx686_slop(struct cpuinfo_x86 *c)
local_irq_restore(flags);
if (ccr5 & 2) { /* possible wrong calibration done */
- printk(KERN_INFO "Recalibrating delay loop with SLOP bit reset\n");
+ pr_info("Recalibrating delay loop with SLOP bit reset\n");
calibrate_delay();
c->loops_per_jiffy = loops_per_jiffy;
}
@@ -112,52 +116,52 @@ static void __cpuinit check_cx686_slop(struct cpuinfo_x86 *c)
}
-static void __cpuinit set_cx86_reorder(void)
+static void set_cx86_reorder(void)
{
u8 ccr3;
- printk(KERN_INFO "Enable Memory access reorder on Cyrix/NSC processor.\n");
+ pr_info("Enable Memory access reorder on Cyrix/NSC processor.\n");
ccr3 = getCx86(CX86_CCR3);
setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
/* Load/Store Serialize to mem access disable (=reorder it) */
- setCx86_old(CX86_PCR0, getCx86_old(CX86_PCR0) & ~0x80);
+ setCx86(CX86_PCR0, getCx86(CX86_PCR0) & ~0x80);
/* set load/store serialize from 1GB to 4GB */
ccr3 |= 0xe0;
setCx86(CX86_CCR3, ccr3);
}
-static void __cpuinit set_cx86_memwb(void)
+static void set_cx86_memwb(void)
{
- printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n");
+ pr_info("Enable Memory-Write-back mode on Cyrix/NSC processor.\n");
/* CCR2 bit 2: unlock NW bit */
- setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) & ~0x04);
+ setCx86(CX86_CCR2, getCx86(CX86_CCR2) & ~0x04);
/* set 'Not Write-through' */
write_cr0(read_cr0() | X86_CR0_NW);
/* CCR2 bit 2: lock NW bit and set WT1 */
- setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x14);
+ setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14);
}
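
The getCx86()/setCx86() helpers used here wrap the classic Cyrix indexed configuration-register access: write the register number to the index port, then read or write the data port. The helpers themselves live in <asm/processor-cyrix.h> and are not part of this hunk; the user-space sketch below only illustrates the access pattern, assuming the usual index/data ports 0x22/0x23 and CCR2 at register 0xc2, and it needs I/O privilege on x86 Linux to run.

#include <sys/io.h>

#define CYRIX_INDEX_PORT	0x22
#define CYRIX_DATA_PORT		0x23

static unsigned char get_ccr(unsigned char reg)
{
	outb(reg, CYRIX_INDEX_PORT);
	return inb(CYRIX_DATA_PORT);
}

static void set_ccr(unsigned char reg, unsigned char data)
{
	outb(reg, CYRIX_INDEX_PORT);
	outb(data, CYRIX_DATA_PORT);
}

int main(void)
{
	if (iopl(3))	/* illustrative only; requires root */
		return 1;

	/* Read-modify-write, e.g. set bit 2 of CCR2. */
	set_ccr(0xc2, get_ccr(0xc2) | 0x04);
	return 0;
}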
/*
* Configure later MediaGX and/or Geode processor.
*/
-static void __cpuinit geode_configure(void)
+static void geode_configure(void)
{
unsigned long flags;
u8 ccr3;
local_irq_save(flags);
- /* Suspend on halt power saving and enable #SUSP pin */
- setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x88);
+ /* Suspend on halt power saving */
+ setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x08);
ccr3 = getCx86(CX86_CCR3);
setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
/* FPU fast, DTE cache, Mem bypass */
- setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x38);
+ setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x38);
setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
set_cx86_memwb();
@@ -166,7 +170,7 @@ static void __cpuinit geode_configure(void)
local_irq_restore(flags);
}
-static void __cpuinit early_init_cyrix(struct cpuinfo_x86 *c)
+static void early_init_cyrix(struct cpuinfo_x86 *c)
{
unsigned char dir0, dir0_msn, dir1 = 0;
@@ -185,7 +189,7 @@ static void __cpuinit early_init_cyrix(struct cpuinfo_x86 *c)
}
}
-static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
+static void init_cyrix(struct cpuinfo_x86 *c)
{
unsigned char dir0, dir0_msn, dir0_lsn, dir1 = 0;
char *buf = c->x86_model_id;
@@ -212,7 +216,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
/* common case step number/rev -- exceptions handled below */
c->x86_model = (dir1 >> 4) + 1;
- c->x86_mask = dir1 & 0xf;
+ c->x86_stepping = dir1 & 0xf;
/* Now cook; the original recipe is by Channing Corn, from Cyrix.
* We do the same thing for each generation: we work out
@@ -249,10 +253,11 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
/* Emulate MTRRs using Cyrix's ARRs. */
set_cpu_cap(c, X86_FEATURE_CYRIX_ARR);
/* 6x86's contain this bug */
- c->coma_bug = 1;
+ set_cpu_bug(c, X86_BUG_COMA);
break;
case 4: /* MediaGX/GXm or Geode GXM/GXLV/GX1 */
+ case 11: /* GX1 with inverted Device ID */
#ifdef CONFIG_PCI
{
u32 vendor, device;
@@ -269,7 +274,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
* VSA1 we work around however.
*/
- printk(KERN_INFO "Working around Cyrix MediaGX virtual DMA bugs.\n");
+ pr_info("Working around Cyrix MediaGX virtual DMA bugs.\n");
isa_dma_bridge_buggy = 2;
/* We do this before the PCI layer is running. However we
@@ -282,16 +287,17 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
* The 5510/5520 companion chips have a funky PIT.
*/
if (vendor == PCI_VENDOR_ID_CYRIX &&
- (device == PCI_DEVICE_ID_CYRIX_5510 || device == PCI_DEVICE_ID_CYRIX_5520))
+ (device == PCI_DEVICE_ID_CYRIX_5510 ||
+ device == PCI_DEVICE_ID_CYRIX_5520))
mark_tsc_unstable("cyrix 5510/5520 detected");
}
#endif
- c->x86_cache_size = 16; /* Yep 16K integrated cache thats it */
+ c->x86_cache_size = 16; /* Yep 16K integrated cache that's it */
/* GXm supports extended cpuid levels 'ala' AMD */
if (c->cpuid_level == 2) {
/* Enable cxMMX extensions (GX1 Datasheet 54) */
- setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7) | 1);
+ setCx86(CX86_CCR7, getCx86(CX86_CCR7) | 1);
/*
* GXm : 0x30 ... 0x5f GXm datasheet 51
@@ -299,7 +305,8 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
* ? : 0x7x
* GX1 : 0x8x GX1 datasheet 56
*/
- if ((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <= dir1 && dir1 <= 0x8f))
+ if ((0x30 <= dir1 && dir1 <= 0x6f) ||
+ (0x80 <= dir1 && dir1 <= 0x8f))
geode_configure();
return;
} else { /* MediaGX */
@@ -313,9 +320,10 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
if (dir1 > 7) {
dir0_msn++; /* M II */
/* Enable MMX extensions (App note 108) */
- setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7)|1);
+ setCx86(CX86_CCR7, getCx86(CX86_CCR7)|1);
} else {
- c->coma_bug = 1; /* 6x86MX, it has the bug. */
+ /* A 6x86MX - it has the bug. */
+ set_cpu_bug(c, X86_BUG_COMA);
}
tmp = (!(dir0_lsn & 7) || dir0_lsn & 1) ? 2 : 0;
Cx86_cb[tmp] = cyrix_model_mult2[dir0_lsn & 7];
@@ -330,7 +338,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
switch (dir0_lsn) {
case 0xd: /* either a 486SLC or DLC w/o DEVID */
dir0_msn = 0;
- p = Cx486_name[(c->hard_math) ? 1 : 0];
+ p = Cx486_name[!!boot_cpu_has(X86_FEATURE_FPU)];
break;
case 0xe: /* a 486S A step */
@@ -353,7 +361,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
/*
* Handle National Semiconductor branded processors
*/
-static void __cpuinit init_nsc(struct cpuinfo_x86 *c)
+static void init_nsc(struct cpuinfo_x86 *c)
{
/*
* There may be GX1 processors in the wild that are branded
@@ -371,7 +379,7 @@ static void __cpuinit init_nsc(struct cpuinfo_x86 *c)
/* Handle the GX (Formally known as the GX2) */
if (c->x86 == 5 && c->x86_model == 5)
- display_cacheinfo(c);
+ cpu_detect_cache_sizes(c);
else
init_cyrix(c);
}
@@ -402,7 +410,7 @@ static inline int test_cyrix_52div(void)
return (unsigned char) (test >> 8) == 0x02;
}
-static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c)
+static void cyrix_identify(struct cpuinfo_x86 *c)
{
/* Detect Cyrix with disabled CPUID */
if (c->x86 == 4 && test_cyrix_52div()) {
@@ -424,18 +432,21 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c)
if (dir0 == 5 || dir0 == 3) {
unsigned char ccr3;
unsigned long flags;
- printk(KERN_INFO "Enabling CPUID on Cyrix processor.\n");
+ pr_info("Enabling CPUID on Cyrix processor.\n");
local_irq_save(flags);
ccr3 = getCx86(CX86_CCR3);
- setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
- setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x80); /* enable cpuid */
- setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
+ /* enable MAPEN */
+ setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10);
+ /* enable cpuid */
+ setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x80);
+ /* disable MAPEN */
+ setCx86(CX86_CCR3, ccr3);
local_irq_restore(flags);
}
}
}
-static const struct cpu_dev __cpuinitconst cyrix_cpu_dev = {
+static const struct cpu_dev cyrix_cpu_dev = {
.c_vendor = "Cyrix",
.c_ident = { "CyrixInstead" },
.c_early_init = early_init_cyrix,
@@ -446,7 +457,7 @@ static const struct cpu_dev __cpuinitconst cyrix_cpu_dev = {
cpu_dev_register(cyrix_cpu_dev);
-static const struct cpu_dev __cpuinitconst nsc_cpu_dev = {
+static const struct cpu_dev nsc_cpu_dev = {
.c_vendor = "NSC",
.c_ident = { "Geode by NSC" },
.c_init = init_nsc,
diff --git a/arch/x86/kernel/cpu/debugfs.c b/arch/x86/kernel/cpu/debugfs.c
new file mode 100644
index 000000000000..1976fef2dfe5
--- /dev/null
+++ b/arch/x86/kernel/cpu/debugfs.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/debugfs.h>
+
+#include <asm/apic.h>
+#include <asm/processor.h>
+
+#include "cpu.h"
+
+static int cpu_debug_show(struct seq_file *m, void *p)
+{
+ unsigned long cpu = (unsigned long)m->private;
+ struct cpuinfo_x86 *c = per_cpu_ptr(&cpu_info, cpu);
+
+ seq_printf(m, "online: %d\n", cpu_online(cpu));
+ if (!c->initialized)
+ return 0;
+
+ seq_printf(m, "initial_apicid: 0x%x\n", c->topo.initial_apicid);
+ seq_printf(m, "apicid: 0x%x\n", c->topo.apicid);
+ seq_printf(m, "pkg_id: %u\n", c->topo.pkg_id);
+ seq_printf(m, "die_id: %u\n", c->topo.die_id);
+ seq_printf(m, "cu_id: %u\n", c->topo.cu_id);
+ seq_printf(m, "core_id: %u\n", c->topo.core_id);
+ seq_printf(m, "cpu_type: %s\n", get_topology_cpu_type_name(c));
+ seq_printf(m, "logical_pkg_id: %u\n", c->topo.logical_pkg_id);
+ seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id);
+ seq_printf(m, "logical_core_id: %u\n", c->topo.logical_core_id);
+ seq_printf(m, "llc_id: %u\n", c->topo.llc_id);
+ seq_printf(m, "l2c_id: %u\n", c->topo.l2c_id);
+ seq_printf(m, "amd_node_id: %u\n", c->topo.amd_node_id);
+ seq_printf(m, "amd_nodes_per_pkg: %u\n", topology_amd_nodes_per_pkg());
+ seq_printf(m, "num_threads: %u\n", __num_threads_per_package);
+ seq_printf(m, "num_cores: %u\n", __num_cores_per_package);
+ seq_printf(m, "max_dies_per_pkg: %u\n", __max_dies_per_package);
+ seq_printf(m, "max_threads_per_core:%u\n", __max_threads_per_core);
+ return 0;
+}
+
+static int cpu_debug_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, cpu_debug_show, inode->i_private);
+}
+
+static const struct file_operations dfs_cpu_ops = {
+ .open = cpu_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int dom_debug_show(struct seq_file *m, void *p)
+{
+ static const char *domain_names[TOPO_MAX_DOMAIN] = {
+ [TOPO_SMT_DOMAIN] = "Thread",
+ [TOPO_CORE_DOMAIN] = "Core",
+ [TOPO_MODULE_DOMAIN] = "Module",
+ [TOPO_TILE_DOMAIN] = "Tile",
+ [TOPO_DIE_DOMAIN] = "Die",
+ [TOPO_DIEGRP_DOMAIN] = "DieGrp",
+ [TOPO_PKG_DOMAIN] = "Package",
+ };
+ unsigned int dom, nthreads = 1;
+
+ for (dom = 0; dom < TOPO_MAX_DOMAIN; dom++) {
+ nthreads *= x86_topo_system.dom_size[dom];
+ seq_printf(m, "domain: %-10s shift: %u dom_size: %5u max_threads: %5u\n",
+ domain_names[dom], x86_topo_system.dom_shifts[dom],
+ x86_topo_system.dom_size[dom], nthreads);
+ }
+ return 0;
+}
+
+static int dom_debug_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, dom_debug_show, inode->i_private);
+}
+
+static const struct file_operations dfs_dom_ops = {
+ .open = dom_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static __init int cpu_init_debugfs(void)
+{
+ struct dentry *dir, *base = debugfs_create_dir("topo", arch_debugfs_dir);
+ unsigned long id;
+ char name[24];
+
+ debugfs_create_file("domains", 0444, base, NULL, &dfs_dom_ops);
+
+ dir = debugfs_create_dir("cpus", base);
+ for_each_possible_cpu(id) {
+ sprintf(name, "%lu", id);
+ debugfs_create_file(name, 0444, dir, (void *)id, &dfs_cpu_ops);
+ }
+ return 0;
+}
+late_initcall(cpu_init_debugfs);
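
The initcall above creates one file per possible CPU plus a "domains" file under the arch debugfs directory. A small user-space sketch for dumping one of the per-CPU files follows; the path assumes debugfs is mounted at /sys/kernel/debug and that arch_debugfs_dir is the usual "x86" directory.

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/debug/x86/topo/cpus/0", "r");
	char line[128];

	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}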
diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c
new file mode 100644
index 000000000000..d69757246bde
--- /dev/null
+++ b/arch/x86/kernel/cpu/feat_ctl.c
@@ -0,0 +1,215 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/tboot.h>
+
+#include <asm/cpu.h>
+#include <asm/cpufeature.h>
+#include <asm/msr-index.h>
+#include <asm/msr.h>
+#include <asm/processor.h>
+#include <asm/vmx.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt) "x86/cpu: " fmt
+
+#ifdef CONFIG_X86_VMX_FEATURE_NAMES
+enum vmx_feature_leafs {
+ MISC_FEATURES = 0,
+ PRIMARY_CTLS,
+ SECONDARY_CTLS,
+ TERTIARY_CTLS_LOW,
+ TERTIARY_CTLS_HIGH,
+ NR_VMX_FEATURE_WORDS,
+};
+
+#define VMX_F(x) BIT(VMX_FEATURE_##x & 0x1f)
+
+static void init_vmx_capabilities(struct cpuinfo_x86 *c)
+{
+ u32 supported, funcs, ept, vpid, ign, low, high;
+
+ BUILD_BUG_ON(NVMXINTS != NR_VMX_FEATURE_WORDS);
+
+ /*
+ * The high bits contain the allowed-1 settings, i.e. features that can
+ * be turned on. The low bits contain the allowed-0 settings, i.e.
+ * features that can be turned off. Ignore the allowed-0 settings,
+ * if a feature can be turned on then it's supported.
+ *
+ * Use raw rdmsr() for primary processor controls and pin controls MSRs
+ * as they exist on any CPU that supports VMX, i.e. we want the WARN if
+ * the RDMSR faults.
+ */
+ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, ign, supported);
+ c->vmx_capability[PRIMARY_CTLS] = supported;
+
+ rdmsr_safe(MSR_IA32_VMX_PROCBASED_CTLS2, &ign, &supported);
+ c->vmx_capability[SECONDARY_CTLS] = supported;
+
+ /* All 64 bits of tertiary controls MSR are allowed-1 settings. */
+ rdmsr_safe(MSR_IA32_VMX_PROCBASED_CTLS3, &low, &high);
+ c->vmx_capability[TERTIARY_CTLS_LOW] = low;
+ c->vmx_capability[TERTIARY_CTLS_HIGH] = high;
+
+ rdmsr(MSR_IA32_VMX_PINBASED_CTLS, ign, supported);
+ rdmsr_safe(MSR_IA32_VMX_VMFUNC, &ign, &funcs);
+
+ /*
+ * Except for EPT+VPID, which enumerates support for both in a single
+ * MSR, low for EPT, high for VPID.
+ */
+ rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, &ept, &vpid);
+
+ /* Pin, EPT, VPID and VM-Func are merged into a single word. */
+ WARN_ON_ONCE(supported >> 16);
+ WARN_ON_ONCE(funcs >> 4);
+ c->vmx_capability[MISC_FEATURES] = (supported & 0xffff) |
+ ((vpid & 0x1) << 16) |
+ ((funcs & 0xf) << 28);
+
+ /* EPT bits are full on scattered and must be manually handled. */
+ if (ept & VMX_EPT_EXECUTE_ONLY_BIT)
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_EXECUTE_ONLY);
+ if (ept & VMX_EPT_AD_BIT)
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_AD);
+ if (ept & VMX_EPT_1GB_PAGE_BIT)
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_1GB);
+ if (ept & VMX_EPT_PAGE_WALK_5_BIT)
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_5LEVEL);
+
+ /* Synthetic APIC features that are aggregates of multiple features. */
+ if ((c->vmx_capability[PRIMARY_CTLS] & VMX_F(VIRTUAL_TPR)) &&
+ (c->vmx_capability[SECONDARY_CTLS] & VMX_F(VIRT_APIC_ACCESSES)))
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(FLEXPRIORITY);
+
+ if ((c->vmx_capability[PRIMARY_CTLS] & VMX_F(VIRTUAL_TPR)) &&
+ (c->vmx_capability[SECONDARY_CTLS] & VMX_F(APIC_REGISTER_VIRT)) &&
+ (c->vmx_capability[SECONDARY_CTLS] & VMX_F(VIRT_INTR_DELIVERY)) &&
+ (c->vmx_capability[MISC_FEATURES] & VMX_F(POSTED_INTR)))
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(APICV);
+
+ /* Set the synthetic cpufeatures to preserve /proc/cpuinfo's ABI. */
+ if (c->vmx_capability[PRIMARY_CTLS] & VMX_F(VIRTUAL_TPR))
+ set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
+ if (c->vmx_capability[MISC_FEATURES] & VMX_F(FLEXPRIORITY))
+ set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
+ if (c->vmx_capability[MISC_FEATURES] & VMX_F(VIRTUAL_NMIS))
+ set_cpu_cap(c, X86_FEATURE_VNMI);
+ if (c->vmx_capability[SECONDARY_CTLS] & VMX_F(EPT))
+ set_cpu_cap(c, X86_FEATURE_EPT);
+ if (c->vmx_capability[MISC_FEATURES] & VMX_F(EPT_AD))
+ set_cpu_cap(c, X86_FEATURE_EPT_AD);
+ if (c->vmx_capability[MISC_FEATURES] & VMX_F(VPID))
+ set_cpu_cap(c, X86_FEATURE_VPID);
+}
+#endif /* CONFIG_X86_VMX_FEATURE_NAMES */
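
The MISC_FEATURES word assembled above packs the pin-based controls into bits 0-15, the VPID capability into bit 16 and the low four VM-Function bits into bits 28-31. A stand-alone illustration of that packing and the corresponding unpacking, using arbitrary input values:

#include <stdio.h>
#include <stdint.h>

static uint32_t pack_misc(uint32_t pin, uint32_t vpid, uint32_t funcs)
{
	return (pin & 0xffff) | ((vpid & 0x1) << 16) | ((funcs & 0xf) << 28);
}

int main(void)
{
	uint32_t word = pack_misc(0x00ff, 1, 0x1);

	printf("packed:   0x%08x\n", word);
	printf("pin ctls: 0x%04x\n", word & 0xffff);
	printf("vpid:     %u\n", (word >> 16) & 1);
	printf("vm-func:  0x%x\n", (word >> 28) & 0xf);
	return 0;
}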
+
+static int __init nosgx(char *str)
+{
+ setup_clear_cpu_cap(X86_FEATURE_SGX);
+
+ return 0;
+}
+
+early_param("nosgx", nosgx);
+
+void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
+{
+ bool enable_sgx_kvm = false, enable_sgx_driver = false;
+ bool tboot = tboot_enabled();
+ bool enable_vmx;
+ u64 msr;
+
+ if (rdmsrq_safe(MSR_IA32_FEAT_CTL, &msr)) {
+ clear_cpu_cap(c, X86_FEATURE_VMX);
+ clear_cpu_cap(c, X86_FEATURE_SGX);
+ return;
+ }
+
+ enable_vmx = cpu_has(c, X86_FEATURE_VMX) &&
+ IS_ENABLED(CONFIG_KVM_INTEL);
+
+ if (cpu_has(c, X86_FEATURE_SGX) && IS_ENABLED(CONFIG_X86_SGX)) {
+ /*
+ * Separate out SGX driver enabling from KVM. This allows KVM
+ * guests to use SGX even if the kernel SGX driver refuses to
+ * use it. This happens if flexible Launch Control is not
+ * available.
+ */
+ enable_sgx_driver = cpu_has(c, X86_FEATURE_SGX_LC);
+ enable_sgx_kvm = enable_vmx && IS_ENABLED(CONFIG_X86_SGX_KVM);
+ }
+
+ if (msr & FEAT_CTL_LOCKED)
+ goto update_caps;
+
+ /*
+ * Ignore whatever value BIOS left in the MSR to avoid enabling random
+ * features or faulting on the WRMSR.
+ */
+ msr = FEAT_CTL_LOCKED;
+
+ /*
+ * Enable VMX if and only if the kernel may do VMXON at some point,
+ * i.e. KVM is enabled, to avoid unnecessarily adding an attack vector
+ * for the kernel, e.g. using VMX to hide malicious code.
+ */
+ if (enable_vmx) {
+ msr |= FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
+
+ if (tboot)
+ msr |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX;
+ }
+
+ if (enable_sgx_kvm || enable_sgx_driver) {
+ msr |= FEAT_CTL_SGX_ENABLED;
+ if (enable_sgx_driver)
+ msr |= FEAT_CTL_SGX_LC_ENABLED;
+ }
+
+ wrmsrq(MSR_IA32_FEAT_CTL, msr);
+
+update_caps:
+ set_cpu_cap(c, X86_FEATURE_MSR_IA32_FEAT_CTL);
+
+ if (!cpu_has(c, X86_FEATURE_VMX))
+ goto update_sgx;
+
+ if ( (tboot && !(msr & FEAT_CTL_VMX_ENABLED_INSIDE_SMX)) ||
+ (!tboot && !(msr & FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX))) {
+ if (IS_ENABLED(CONFIG_KVM_INTEL))
+ pr_err_once("VMX (%s TXT) disabled by BIOS\n",
+ tboot ? "inside" : "outside");
+ clear_cpu_cap(c, X86_FEATURE_VMX);
+ } else {
+#ifdef CONFIG_X86_VMX_FEATURE_NAMES
+ init_vmx_capabilities(c);
+#endif
+ }
+
+update_sgx:
+ if (!(msr & FEAT_CTL_SGX_ENABLED)) {
+ if (enable_sgx_kvm || enable_sgx_driver)
+ pr_err_once("SGX disabled or unsupported by BIOS.\n");
+ clear_cpu_cap(c, X86_FEATURE_SGX);
+ return;
+ }
+
+ /*
+ * VMX feature bit may be cleared due to being disabled in BIOS,
+ * in which case SGX virtualization cannot be supported either.
+ */
+ if (!cpu_has(c, X86_FEATURE_VMX) && enable_sgx_kvm) {
+ pr_err_once("SGX virtualization disabled due to lack of VMX.\n");
+ enable_sgx_kvm = 0;
+ }
+
+ if (!(msr & FEAT_CTL_SGX_LC_ENABLED) && enable_sgx_driver) {
+ if (!enable_sgx_kvm) {
+ pr_err_once("SGX Launch Control is locked. Disable SGX.\n");
+ clear_cpu_cap(c, X86_FEATURE_SGX);
+ } else {
+ pr_err_once("SGX Launch Control is locked. Support SGX virtualization only.\n");
+ clear_cpu_cap(c, X86_FEATURE_SGX_LC);
+ }
+ }
+}
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
new file mode 100644
index 000000000000..1fda6c3a2b65
--- /dev/null
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -0,0 +1,279 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Hygon Processor Support for Linux
+ *
+ * Copyright (C) 2018 Chengdu Haiguang IC Design Co., Ltd.
+ *
+ * Author: Pu Wen <puwen@hygon.cn>
+ */
+#include <linux/io.h>
+
+#include <asm/apic.h>
+#include <asm/cpu.h>
+#include <asm/smp.h>
+#include <asm/numa.h>
+#include <asm/cacheinfo.h>
+#include <asm/spec-ctrl.h>
+#include <asm/delay.h>
+#include <asm/msr.h>
+#include <asm/resctrl.h>
+
+#include "cpu.h"
+
+#ifdef CONFIG_NUMA
+/*
+ * To workaround broken NUMA config. Read the comment in
+ * srat_detect_node().
+ */
+static int nearby_node(int apicid)
+{
+ int i, node;
+
+ for (i = apicid - 1; i >= 0; i--) {
+ node = __apicid_to_node[i];
+ if (node != NUMA_NO_NODE && node_online(node))
+ return node;
+ }
+ for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
+ node = __apicid_to_node[i];
+ if (node != NUMA_NO_NODE && node_online(node))
+ return node;
+ }
+ return first_node(node_online_map); /* Shouldn't happen */
+}
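
nearby_node() above walks downwards and then upwards from the given APIC ID until it finds an online node, falling back to the first online node. The simplified sketch below shows the same outward search over a made-up ID-to-node array; for brevity it drops the online check that the kernel version performs.

#include <stdio.h>

#define NO_NODE		(-1)
#define MAX_IDS		8

static const int id_to_node[MAX_IDS] = {
	0, NO_NODE, NO_NODE, 1, NO_NODE, NO_NODE, NO_NODE, 2
};

static int find_nearby_node(int id)
{
	int i;

	for (i = id - 1; i >= 0; i--)
		if (id_to_node[i] != NO_NODE)
			return id_to_node[i];
	for (i = id + 1; i < MAX_IDS; i++)
		if (id_to_node[i] != NO_NODE)
			return id_to_node[i];
	return 0;	/* fall back to the first node */
}

int main(void)
{
	printf("node for id 2: %d\n", find_nearby_node(2));	/* 0, from id 0 */
	printf("node for id 4: %d\n", find_nearby_node(4));	/* 1, from id 3 */
	return 0;
}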
+#endif
+
+static void srat_detect_node(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_NUMA
+ int cpu = smp_processor_id();
+ int node;
+ unsigned int apicid = c->topo.apicid;
+
+ node = numa_cpu_node(cpu);
+ if (node == NUMA_NO_NODE)
+ node = c->topo.llc_id;
+
+ /*
+ * On multi-fabric platform (e.g. Numascale NumaChip) a
+ * platform-specific handler needs to be called to fixup some
+ * IDs of the CPU.
+ */
+ if (x86_cpuinit.fixup_cpu_id)
+ x86_cpuinit.fixup_cpu_id(c, node);
+
+ if (!node_online(node)) {
+ /*
+ * Two possibilities here:
+ *
+ * - The CPU is missing memory and no node was created. In
+ * that case try picking one from a nearby CPU.
+ *
+ * - The APIC IDs differ from the HyperTransport node IDs.
+ * Assume they are all increased by a constant offset, but
+ * in the same order as the HT nodeids. If that doesn't
+ * result in a usable node fall back to the path for the
+ * previous case.
+ *
+ * This workaround operates directly on the mapping between
+ * APIC ID and NUMA node, assuming certain relationship
+ * between APIC ID, HT node ID and NUMA topology. As going
+ * through CPU mapping may alter the outcome, directly
+ * access __apicid_to_node[].
+ */
+ int ht_nodeid = c->topo.initial_apicid;
+
+ if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
+ node = __apicid_to_node[ht_nodeid];
+ /* Pick a nearby node */
+ if (!node_online(node))
+ node = nearby_node(apicid);
+ }
+ numa_set_node(cpu, node);
+#endif
+}
+
+static void bsp_init_hygon(struct cpuinfo_x86 *c)
+{
+ if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
+ u64 val;
+
+ rdmsrq(MSR_K7_HWCR, val);
+ if (!(val & BIT(24)))
+ pr_warn(FW_BUG "TSC doesn't count with P0 frequency!\n");
+ }
+
+ if (cpu_has(c, X86_FEATURE_MWAITX))
+ use_mwaitx_delay();
+
+ if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) &&
+ !boot_cpu_has(X86_FEATURE_VIRT_SSBD)) {
+ /*
+ * Try to cache the base value so further operations can
+ * avoid RMW. If that faults, do not enable SSBD.
+ */
+ if (!rdmsrq_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) {
+ setup_force_cpu_cap(X86_FEATURE_LS_CFG_SSBD);
+ setup_force_cpu_cap(X86_FEATURE_SSBD);
+ x86_amd_ls_cfg_ssbd_mask = 1ULL << 10;
+ }
+ }
+
+ resctrl_cpu_detect(c);
+}
+
+static void early_init_hygon(struct cpuinfo_x86 *c)
+{
+ u32 dummy;
+
+ set_cpu_cap(c, X86_FEATURE_K8);
+
+ rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
+
+ /*
+ * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
+ * with P/T states and does not stop in deep C-states
+ */
+ if (c->x86_power & (1 << 8)) {
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+ set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
+ }
+
+ /* Bit 12 of 8000_0007 edx is accumulated power mechanism. */
+ if (c->x86_power & BIT(12))
+ set_cpu_cap(c, X86_FEATURE_ACC_POWER);
+
+ /* Bit 14 indicates the Runtime Average Power Limit interface. */
+ if (c->x86_power & BIT(14))
+ set_cpu_cap(c, X86_FEATURE_RAPL);
+
+#ifdef CONFIG_X86_64
+ set_cpu_cap(c, X86_FEATURE_SYSCALL32);
+#endif
+
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI)
+ /*
+ * The APIC ID can always be treated as an 8-bit value on Hygon APICs, so we
+ * can safely set X86_FEATURE_EXTD_APICID unconditionally.
+ */
+ if (boot_cpu_has(X86_FEATURE_APIC))
+ set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
+#endif
+
+ /*
+ * This is only needed to tell the kernel whether to use VMCALL
+ * and VMMCALL. VMMCALL is never executed except under virt, so
+ * we can set it unconditionally.
+ */
+ set_cpu_cap(c, X86_FEATURE_VMMCALL);
+}
+
+static void init_hygon(struct cpuinfo_x86 *c)
+{
+ u64 vm_cr;
+
+ early_init_hygon(c);
+
+ /*
+ * Bit 31 in normal CPUID is used for the nonstandard 3DNow ID;
+ * 3DNow is identified by bit 31 in extended CPUID (1*32+31) anyway.
+ */
+ clear_cpu_cap(c, 0*32+31);
+
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+
+ /*
+ * XXX someone from Hygon needs to confirm this DTRT
+ *
+ init_spectral_chicken(c);
+ */
+
+ set_cpu_cap(c, X86_FEATURE_ZEN);
+ set_cpu_cap(c, X86_FEATURE_CPB);
+
+ cpu_detect_cache_sizes(c);
+
+ srat_detect_node(c);
+
+ init_hygon_cacheinfo(c);
+
+ if (cpu_has(c, X86_FEATURE_SVM)) {
+ rdmsrq(MSR_VM_CR, vm_cr);
+ if (vm_cr & SVM_VM_CR_SVM_DIS_MASK) {
+ pr_notice_once("SVM disabled (by BIOS) in MSR_VM_CR\n");
+ clear_cpu_cap(c, X86_FEATURE_SVM);
+ }
+ }
+
+ if (cpu_has(c, X86_FEATURE_XMM2)) {
+ /*
+ * Use LFENCE for execution serialization. On families which
+ * don't have that MSR, LFENCE is already serializing.
+ * msr_set_bit() uses the safe accessors, too, even if the MSR
+ * is not present.
+ */
+ msr_set_bit(MSR_AMD64_DE_CFG,
+ MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT);
+
+ /* A serializing LFENCE stops RDTSC speculation */
+ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+ }
+
+ /*
+ * Hygon processors have APIC timer running in deep C states.
+ */
+ set_cpu_cap(c, X86_FEATURE_ARAT);
+
+ /* Hygon CPUs don't reset SS attributes on SYSRET, Xen does. */
+ if (!cpu_feature_enabled(X86_FEATURE_XENPV))
+ set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
+
+ check_null_seg_clears_base(c);
+
+ /* Hygon CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */
+ clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
+}
+
+static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c)
+{
+ u32 ebx, eax, ecx, edx;
+ u16 mask = 0xfff;
+
+ if (c->extended_cpuid_level < 0x80000006)
+ return;
+
+ cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
+
+ tlb_lld_4k = (ebx >> 16) & mask;
+ tlb_lli_4k = ebx & mask;
+
+ /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
+ if (!((eax >> 16) & mask))
+ tlb_lld_2m = (cpuid_eax(0x80000005) >> 16) & 0xff;
+ else
+ tlb_lld_2m = (eax >> 16) & mask;
+
+ /* a 4M entry uses two 2M entries */
+ tlb_lld_4m = tlb_lld_2m >> 1;
+
+ /* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
+ if (!(eax & mask)) {
+ cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
+ tlb_lli_2m = eax & 0xff;
+ } else
+ tlb_lli_2m = eax & mask;
+
+ tlb_lli_4m = tlb_lli_2m >> 1;
+}
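
The extraction above follows the AMD-style CPUID leaf 0x80000006 layout: the L2 DTLB entry count for 4K pages sits in EBX[27:16] and the ITLB count in EBX[11:0], each 12 bits wide, with the 2M/4M variants in EAX. The stand-alone sketch below decodes a made-up EBX value the same way; it is not data from real hardware.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t ebx = (1536u << 16) | 512u;	/* pretend CPUID 0x80000006 EBX */
	uint32_t mask = 0xfff;

	unsigned int tlb_lld_4k = (ebx >> 16) & mask;	/* data TLB, 4K pages */
	unsigned int tlb_lli_4k = ebx & mask;		/* instruction TLB, 4K pages */

	printf("L2 DTLB 4K entries: %u\n", tlb_lld_4k);
	printf("L2 ITLB 4K entries: %u\n", tlb_lli_4k);
	return 0;
}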
+
+static const struct cpu_dev hygon_cpu_dev = {
+ .c_vendor = "Hygon",
+ .c_ident = { "HygonGenuine" },
+ .c_early_init = early_init_hygon,
+ .c_detect_tlb = cpu_detect_tlb_hygon,
+ .c_bsp_init = bsp_init_hygon,
+ .c_init = init_hygon,
+ .c_x86_vendor = X86_VENDOR_HYGON,
+};
+
+cpu_dev_register(hygon_cpu_dev);
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index fb5b86af0b01..f3e9219845e8 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -21,38 +21,92 @@
*
*/
+#include <linux/init.h>
+#include <linux/export.h>
#include <asm/processor.h>
-#include <asm/vmware.h>
#include <asm/hypervisor.h>
-static inline void __cpuinit
-detect_hypervisor_vendor(struct cpuinfo_x86 *c)
+static const __initconst struct hypervisor_x86 * const hypervisors[] =
{
- if (vmware_platform()) {
- c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE;
- } else {
- c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE;
- }
-}
+#ifdef CONFIG_XEN_PV
+ &x86_hyper_xen_pv,
+#endif
+#ifdef CONFIG_XEN_PVHVM
+ &x86_hyper_xen_hvm,
+#endif
+ &x86_hyper_vmware,
+ &x86_hyper_ms_hyperv,
+#ifdef CONFIG_KVM_GUEST
+ &x86_hyper_kvm,
+#endif
+#ifdef CONFIG_JAILHOUSE_GUEST
+ &x86_hyper_jailhouse,
+#endif
+#ifdef CONFIG_ACRN_GUEST
+ &x86_hyper_acrn,
+#endif
+#ifdef CONFIG_BHYVE_GUEST
+ &x86_hyper_bhyve,
+#endif
+};
+
+enum x86_hypervisor_type x86_hyper_type;
+EXPORT_SYMBOL(x86_hyper_type);
-unsigned long get_hypervisor_tsc_freq(void)
+bool __initdata nopv;
+static __init int parse_nopv(char *arg)
{
- if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE)
- return vmware_get_tsc_khz();
+ nopv = true;
return 0;
}
+early_param("nopv", parse_nopv);
-static inline void __cpuinit
-hypervisor_set_feature_bits(struct cpuinfo_x86 *c)
+static inline const struct hypervisor_x86 * __init
+detect_hypervisor_vendor(void)
{
- if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) {
- vmware_set_feature_bits(c);
- return;
+ const struct hypervisor_x86 *h = NULL, * const *p;
+ uint32_t pri, max_pri = 0;
+
+ for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) {
+ if (unlikely(nopv) && !(*p)->ignore_nopv)
+ continue;
+
+ pri = (*p)->detect();
+ if (pri > max_pri) {
+ max_pri = pri;
+ h = *p;
+ }
}
+
+ if (h)
+ pr_info("Hypervisor detected: %s\n", h->name);
+
+ return h;
}
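
Each entry's detect() callback returns a priority (zero meaning "not this hypervisor"), and the highest non-zero priority wins. A stand-alone sketch of that selection loop follows; the guest names and fixed priorities are invented for the example, whereas real detect() callbacks derive a priority from CPUID leaves, MSRs or DMI data.

#include <stddef.h>
#include <stdio.h>
#include <stdint.h>

struct guest {
	const char *name;
	uint32_t (*detect)(void);
};

static uint32_t detect_none(void)  { return 0; }
static uint32_t detect_hyp_a(void) { return 1; }
static uint32_t detect_hyp_b(void) { return 9; }

static const struct guest guests[] = {
	{ "none",  detect_none },
	{ "hyp-a", detect_hyp_a },
	{ "hyp-b", detect_hyp_b },
};

int main(void)
{
	const struct guest *best = NULL;
	uint32_t pri, max_pri = 0;
	size_t i;

	for (i = 0; i < sizeof(guests) / sizeof(guests[0]); i++) {
		pri = guests[i].detect();
		if (pri > max_pri) {
			max_pri = pri;
			best = &guests[i];
		}
	}
	if (best)
		printf("Hypervisor detected: %s (priority %u)\n",
		       best->name, max_pri);
	return 0;
}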
-void __cpuinit init_hypervisor(struct cpuinfo_x86 *c)
+static void __init copy_array(const void *src, void *target, unsigned int size)
{
- detect_hypervisor_vendor(c);
- hypervisor_set_feature_bits(c);
+ unsigned int i, n = size / sizeof(void *);
+ const void * const *from = (const void * const *)src;
+ const void **to = (const void **)target;
+
+ for (i = 0; i < n; i++)
+ if (from[i])
+ to[i] = from[i];
+}
+
+void __init init_hypervisor_platform(void)
+{
+ const struct hypervisor_x86 *h;
+
+ h = detect_hypervisor_vendor();
+
+ if (!h)
+ return;
+
+ copy_array(&h->init, &x86_init.hyper, sizeof(h->init));
+ copy_array(&h->runtime, &x86_platform.hyper, sizeof(h->runtime));
+
+ x86_hyper_type = h->type;
+ x86_init.hyper.init_platform();
}
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 3260ab044996..98ae4c37c93e 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -1,51 +1,240 @@
-#include <linux/init.h>
-#include <linux/kernel.h>
+// SPDX-License-Identifier: GPL-2.0
-#include <linux/string.h>
#include <linux/bitops.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/minmax.h>
#include <linux/smp.h>
-#include <linux/sched.h>
-#include <linux/thread_info.h>
-#include <linux/module.h>
-
-#include <asm/processor.h>
-#include <asm/pgtable.h>
-#include <asm/msr.h>
-#include <asm/uaccess.h>
-#include <asm/ds.h>
-#include <asm/bugs.h>
-#include <asm/cpu.h>
+#include <linux/string.h>
+#include <linux/types.h>
#ifdef CONFIG_X86_64
-#include <asm/topology.h>
-#include <asm/numa_64.h>
+#include <linux/topology.h>
#endif
+#include <asm/bugs.h>
+#include <asm/cpu_device_id.h>
+#include <asm/cpufeature.h>
+#include <asm/cpu.h>
+#include <asm/cpuid/api.h>
+#include <asm/hwcap2.h>
+#include <asm/intel-family.h>
+#include <asm/microcode.h>
+#include <asm/msr.h>
+#include <asm/numa.h>
+#include <asm/resctrl.h>
+#include <asm/thermal.h>
+#include <asm/uaccess.h>
+
#include "cpu.h"
-#ifdef CONFIG_X86_LOCAL_APIC
-#include <asm/mpspec.h>
-#include <asm/apic.h>
-#endif
+/*
+ * Processors which have self-snooping capability can handle conflicting
+ * memory types across CPUs by snooping their own caches. However, there exist
+ * CPU models in which having conflicting memory types still leads to
+ * unpredictable behavior, machine check errors, or hangs. Clear this
+ * feature to prevent its use on machines with known errata.
+ */
+static void check_memory_type_self_snoop_errata(struct cpuinfo_x86 *c)
+{
+ switch (c->x86_vfm) {
+ case INTEL_CORE_YONAH:
+ case INTEL_CORE2_MEROM:
+ case INTEL_CORE2_MEROM_L:
+ case INTEL_CORE2_PENRYN:
+ case INTEL_CORE2_DUNNINGTON:
+ case INTEL_NEHALEM:
+ case INTEL_NEHALEM_G:
+ case INTEL_NEHALEM_EP:
+ case INTEL_NEHALEM_EX:
+ case INTEL_WESTMERE:
+ case INTEL_WESTMERE_EP:
+ case INTEL_SANDYBRIDGE:
+ setup_clear_cpu_cap(X86_FEATURE_SELFSNOOP);
+ }
+}
+
+static bool ring3mwait_disabled __read_mostly;
-static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
+static int __init ring3mwait_disable(char *__unused)
{
- /* Unmask CPUID levels if masked: */
- if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
- u64 misc_enable;
+ ring3mwait_disabled = true;
+ return 1;
+}
+__setup("ring3mwait=disable", ring3mwait_disable);
- rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
+static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c)
+{
+ /*
+ * Ring 3 MONITOR/MWAIT feature cannot be detected without
+ * cpu model and family comparison.
+ */
+ if (c->x86 != 6)
+ return;
+ switch (c->x86_vfm) {
+ case INTEL_XEON_PHI_KNL:
+ case INTEL_XEON_PHI_KNM:
+ break;
+ default:
+ return;
+ }
- if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) {
- misc_enable &= ~MSR_IA32_MISC_ENABLE_LIMIT_CPUID;
- wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
- c->cpuid_level = cpuid_eax(0);
- }
+ if (ring3mwait_disabled)
+ return;
+
+ set_cpu_cap(c, X86_FEATURE_RING3MWAIT);
+ this_cpu_or(msr_misc_features_shadow,
+ 1UL << MSR_MISC_FEATURES_ENABLES_RING3MWAIT_BIT);
+
+ if (c == &boot_cpu_data)
+ ELF_HWCAP2 |= HWCAP2_RING3MWAIT;
+}
+
+/*
+ * Early microcode releases for the Spectre v2 mitigation were broken.
+ * Information taken from:
+ * - https://newsroom.intel.com/wp-content/uploads/sites/11/2018/03/microcode-update-guidance.pdf
+ * - https://kb.vmware.com/s/article/52345
+ * - Microcode revisions observed in the wild
+ * - Release note from 20180108 microcode release
+ */
+struct sku_microcode {
+ u32 vfm;
+ u8 stepping;
+ u32 microcode;
+};
+static const struct sku_microcode spectre_bad_microcodes[] = {
+ { INTEL_KABYLAKE, 0x0B, 0x80 },
+ { INTEL_KABYLAKE, 0x0A, 0x80 },
+ { INTEL_KABYLAKE, 0x09, 0x80 },
+ { INTEL_KABYLAKE_L, 0x0A, 0x80 },
+ { INTEL_KABYLAKE_L, 0x09, 0x80 },
+ { INTEL_SKYLAKE_X, 0x03, 0x0100013e },
+ { INTEL_SKYLAKE_X, 0x04, 0x0200003c },
+ { INTEL_BROADWELL, 0x04, 0x28 },
+ { INTEL_BROADWELL_G, 0x01, 0x1b },
+ { INTEL_BROADWELL_D, 0x02, 0x14 },
+ { INTEL_BROADWELL_D, 0x03, 0x07000011 },
+ { INTEL_BROADWELL_X, 0x01, 0x0b000025 },
+ { INTEL_HASWELL_L, 0x01, 0x21 },
+ { INTEL_HASWELL_G, 0x01, 0x18 },
+ { INTEL_HASWELL, 0x03, 0x23 },
+ { INTEL_HASWELL_X, 0x02, 0x3b },
+ { INTEL_HASWELL_X, 0x04, 0x10 },
+ { INTEL_IVYBRIDGE_X, 0x04, 0x42a },
+ /* Observed in the wild */
+ { INTEL_SANDYBRIDGE_X, 0x06, 0x61b },
+ { INTEL_SANDYBRIDGE_X, 0x07, 0x712 },
+};
+
+static bool bad_spectre_microcode(struct cpuinfo_x86 *c)
+{
+ int i;
+
+ /*
+	 * We know that hypervisors lie to us about the microcode version, so
+	 * we may as well hope that they are running the correct version.
+ */
+ if (cpu_has(c, X86_FEATURE_HYPERVISOR))
+ return false;
+
+ for (i = 0; i < ARRAY_SIZE(spectre_bad_microcodes); i++) {
+ if (c->x86_vfm == spectre_bad_microcodes[i].vfm &&
+ c->x86_stepping == spectre_bad_microcodes[i].stepping)
+ return (c->microcode <= spectre_bad_microcodes[i].microcode);
}
+ return false;
+}
- if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
- (c->x86 == 0x6 && c->x86_model >= 0x0e))
- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+#define MSR_IA32_TME_ACTIVATE 0x982
+
+/* Helpers to access TME_ACTIVATE MSR */
+#define TME_ACTIVATE_LOCKED(x) (x & 0x1)
+#define TME_ACTIVATE_ENABLED(x) (x & 0x2)
+
+#define TME_ACTIVATE_KEYID_BITS(x) ((x >> 32) & 0xf) /* Bits 35:32 */
+
+static void detect_tme_early(struct cpuinfo_x86 *c)
+{
+ u64 tme_activate;
+ int keyid_bits;
+
+ rdmsrq(MSR_IA32_TME_ACTIVATE, tme_activate);
+
+ if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) {
+ pr_info_once("x86/tme: not enabled by BIOS\n");
+ clear_cpu_cap(c, X86_FEATURE_TME);
+ return;
+ }
+ pr_info_once("x86/tme: enabled by BIOS\n");
+ keyid_bits = TME_ACTIVATE_KEYID_BITS(tme_activate);
+ if (!keyid_bits)
+ return;
+
+ /*
+ * KeyID bits are set by BIOS and can be present regardless
+ * of whether the kernel is using them. They effectively lower
+ * the number of physical address bits.
+ *
+ * Update cpuinfo_x86::x86_phys_bits accordingly.
+ */
+ c->x86_phys_bits -= keyid_bits;
+ pr_info_once("x86/mktme: BIOS enabled: x86_phys_bits reduced by %d\n",
+ keyid_bits);
+}
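
As a worked example of the adjustment above (numbers invented for illustration): a part that enumerates 46 physical address bits and reports 4 KeyID bits in TME_ACTIVATE ends up advertising 42 usable physical address bits.

/* Illustrative values only -- not part of the patch. */
u64 tme_activate = (4ULL << 32) | 0x3;			/* locked + enabled, 4 KeyID bits */
int keyid_bits   = TME_ACTIVATE_KEYID_BITS(tme_activate);	/* == 4 */
int phys_bits    = 46 - keyid_bits;			/* x86_phys_bits: 46 -> 42 */
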
+
+void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return;
+
+ if (c->x86_vfm < INTEL_PENTIUM_M_DOTHAN)
+ return;
+
+ /*
+ * The BIOS can have limited CPUID to leaf 2, which breaks feature
+ * enumeration. Unlock it and update the maximum leaf info.
+ */
+ if (msr_clear_bit(MSR_IA32_MISC_ENABLE, MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0)
+ c->cpuid_level = cpuid_eax(0);
+}
+
+static void early_init_intel(struct cpuinfo_x86 *c)
+{
+ u64 misc_enable;
+
+ if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64))
+ c->microcode = intel_get_microcode_revision();
+
+ /* Now if any of them are set, check the blacklist and clear the lot */
+ if ((cpu_has(c, X86_FEATURE_SPEC_CTRL) ||
+ cpu_has(c, X86_FEATURE_INTEL_STIBP) ||
+ cpu_has(c, X86_FEATURE_IBRS) || cpu_has(c, X86_FEATURE_IBPB) ||
+ cpu_has(c, X86_FEATURE_STIBP)) && bad_spectre_microcode(c)) {
+ pr_warn("Intel Spectre v2 broken microcode detected; disabling Speculation Control\n");
+ setup_clear_cpu_cap(X86_FEATURE_IBRS);
+ setup_clear_cpu_cap(X86_FEATURE_IBPB);
+ setup_clear_cpu_cap(X86_FEATURE_STIBP);
+ setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL);
+ setup_clear_cpu_cap(X86_FEATURE_MSR_SPEC_CTRL);
+ setup_clear_cpu_cap(X86_FEATURE_INTEL_STIBP);
+ setup_clear_cpu_cap(X86_FEATURE_SSBD);
+ setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL_SSBD);
+ }
+
+ /*
+ * Atom erratum AAE44/AAF40/AAG38/AAH41:
+ *
+ * A race condition between speculative fetches and invalidating
+ * a large page. This is worked around in microcode, but we
+ * need the microcode to have already been loaded... so if it is
+ * not, recommend a BIOS update and disable large pages.
+ */
+ if (c->x86_vfm == INTEL_ATOM_BONNELL && c->x86_stepping <= 2 &&
+ c->microcode < 0x20e) {
+ pr_warn("Atom PSE erratum detected, BIOS microcode update recommended\n");
+ clear_cpu_cap(c, X86_FEATURE_PSE);
+ }
#ifdef CONFIG_X86_64
set_cpu_cap(c, X86_FEATURE_SYSENTER32);
@@ -56,8 +245,8 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
#endif
/* CPUID workaround for 0F33/0F34 CPU */
- if (c->x86 == 0xF && c->x86_model == 0x3
- && (c->x86_mask == 0x3 || c->x86_mask == 0x4))
+ if (c->x86_vfm == INTEL_P4_PRESCOTT &&
+ (c->x86_stepping == 0x3 || c->x86_stepping == 0x4))
c->x86_phys_bits = 36;
/*
@@ -66,49 +255,91 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
*
* It is also reliable across cores and sockets. (but not across
* cabinets - we turn it off in that case explicitly.)
+ *
+ * Use a model-specific check for some older CPUs that have invariant
+ * TSC but may not report it architecturally via 8000_0007.
*/
if (c->x86_power & (1 << 8)) {
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
- set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE);
- sched_clock_stable = 1;
+ } else if ((c->x86_vfm >= INTEL_P4_PRESCOTT && c->x86_vfm <= INTEL_P4_CEDARMILL) ||
+ (c->x86_vfm >= INTEL_CORE_YONAH && c->x86_vfm <= INTEL_IVYBRIDGE)) {
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+ }
+
+ /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */
+ switch (c->x86_vfm) {
+ case INTEL_ATOM_SALTWELL_MID:
+ case INTEL_ATOM_SALTWELL_TABLET:
+ case INTEL_ATOM_SILVERMONT_MID:
+ case INTEL_ATOM_AIRMONT_NP:
+ set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3);
+ break;
}
/*
- * There is a known erratum on Pentium III and Core Solo
- * and Core Duo CPUs.
- * " Page with PAT set to WC while associated MTRR is UC
- * may consolidate to UC "
- * Because of this erratum, it is better to stick with
- * setting WC in MTRR rather than using PAT on these CPUs.
+ * PAT is broken on early family 6 CPUs, the last of which
+ * is "Yonah" where the erratum is named "AN7":
+ *
+ * Page with PAT (Page Attribute Table) Set to USWC
+ * (Uncacheable Speculative Write Combine) While
+ * Associated MTRR (Memory Type Range Register) Is UC
+ * (Uncacheable) May Consolidate to UC
*
- * Enable PAT WC only on P4, Core 2 or later CPUs.
+ * Disable PAT and fall back to MTRR on these CPUs.
*/
- if (c->x86 == 6 && c->x86_model < 15)
+ if (c->x86_vfm >= INTEL_PENTIUM_PRO &&
+ c->x86_vfm <= INTEL_CORE_YONAH)
clear_cpu_cap(c, X86_FEATURE_PAT);
-#ifdef CONFIG_KMEMCHECK
/*
- * P4s have a "fast strings" feature which causes single-
- * stepping REP instructions to only generate a #DB on
- * cache-line boundaries.
+ * Modern CPUs are generally expected to have a sane fast string
+ * implementation. However, BIOSes typically have a knob to tweak
+ * the architectural MISC_ENABLE.FAST_STRING enable bit.
*
- * Ingo Molnar reported a Pentium D (model 6) and a Xeon
- * (model 2) with the same problem.
+ * Adhere to the preference and program the Linux-defined fast
+ * string flag and enhanced fast string capabilities accordingly.
*/
- if (c->x86 == 15) {
- u64 misc_enable;
-
- rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
-
+ if (c->x86_vfm >= INTEL_PENTIUM_M_DOTHAN) {
+ rdmsrq(MSR_IA32_MISC_ENABLE, misc_enable);
if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) {
- printk(KERN_INFO "kmemcheck: Disabling fast string operations\n");
-
- misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
- wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
+ /* X86_FEATURE_ERMS is set based on CPUID */
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+ } else {
+ pr_info("Disabled fast string operations\n");
+ setup_clear_cpu_cap(X86_FEATURE_REP_GOOD);
+ setup_clear_cpu_cap(X86_FEATURE_ERMS);
}
}
-#endif
+
+ /*
+ * Intel Quark Core DevMan_001.pdf section 6.4.11
+ * "The operating system also is required to invalidate (i.e., flush)
+ * the TLB when any changes are made to any of the page table entries.
+ * The operating system must reload CR3 to cause the TLB to be flushed"
+ *
+ * As a result, boot_cpu_has(X86_FEATURE_PGE) in arch/x86/include/asm/tlbflush.h
+ * should be false so that __flush_tlb_all() causes CR3 instead of CR4.PGE
+ * to be modified.
+ */
+ if (c->x86_vfm == INTEL_QUARK_X1000) {
+ pr_info("Disabling PGE capability bit\n");
+ setup_clear_cpu_cap(X86_FEATURE_PGE);
+ }
+
+ check_memory_type_self_snoop_errata(c);
+
+ /*
+ * Adjust the number of physical bits early because it affects the
+ * valid bits of the MTRR mask registers.
+ */
+ if (cpu_has(c, X86_FEATURE_TME))
+ detect_tme_early(c);
+}
+
+static void bsp_init_intel(struct cpuinfo_x86 *c)
+{
+ resctrl_cpu_detect(c);
}
#ifdef CONFIG_X86_32
@@ -118,73 +349,60 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
* This is called before we do cpu ident work
*/
-int __cpuinit ppro_with_ram_bug(void)
+int ppro_with_ram_bug(void)
{
/* Uses data from early_cpu_detect now */
- if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
- boot_cpu_data.x86 == 6 &&
- boot_cpu_data.x86_model == 1 &&
- boot_cpu_data.x86_mask < 8) {
- printk(KERN_INFO "Pentium Pro with Errata#50 detected. Taking evasive action.\n");
+ if (boot_cpu_data.x86_vfm == INTEL_PENTIUM_PRO &&
+ boot_cpu_data.x86_stepping < 8) {
+ pr_info("Pentium Pro with Errata#50 detected. Taking evasive action.\n");
return 1;
}
return 0;
}
-#ifdef CONFIG_X86_F00F_BUG
-static void __cpuinit trap_init_f00f_bug(void)
-{
- __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);
-
- /*
- * Update the IDT descriptor and reload the IDT so that
- * it uses the read-only mapped virtual address.
- */
- idt_descr.address = fix_to_virt(FIX_F00F_IDT);
- load_idt(&idt_descr);
-}
-#endif
-
-static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
+static void intel_smp_check(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_SMP
/* calling is from identify_secondary_cpu() ? */
- if (c->cpu_index == boot_cpu_id)
+ if (!c->cpu_index)
return;
/*
* Mask B, Pentium, but not Pentium MMX
*/
- if (c->x86 == 5 &&
- c->x86_mask >= 1 && c->x86_mask <= 4 &&
- c->x86_model <= 3) {
+ if (c->x86_vfm >= INTEL_FAM5_START && c->x86_vfm < INTEL_PENTIUM_MMX &&
+ c->x86_stepping >= 1 && c->x86_stepping <= 4) {
/*
* Remember we have B step Pentia with bugs
*/
		WARN_ONCE(1, "WARNING: SMP operation may be unreliable "
				"with B stepping processors.\n");
}
-#endif
}
-static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
+static int forcepae;
+static int __init forcepae_setup(char *__unused)
{
- unsigned long lo, hi;
+ forcepae = 1;
+ return 1;
+}
+__setup("forcepae", forcepae_setup);
+static void intel_workarounds(struct cpuinfo_x86 *c)
+{
#ifdef CONFIG_X86_F00F_BUG
/*
- * All current models of Pentium and Pentium with MMX technology CPUs
- * have the F0 0F bug, which lets nonprivileged users lock up the system.
- * Note that the workaround only should be initialized once...
+ * All models of Pentium and Pentium with MMX technology CPUs
+ * have the F0 0F bug, which lets nonprivileged users lock up the
+ * system. Announce that the fault handler will be checking for it.
+ * The Quark is also family 5, but does not have the same bug.
*/
- c->f00f_bug = 0;
- if (!paravirt_enabled() && c->x86 == 5) {
+ clear_cpu_bug(c, X86_BUG_F00F);
+ if (c->x86_vfm >= INTEL_FAM5_START && c->x86_vfm < INTEL_QUARK_X1000) {
static int f00f_workaround_enabled;
- c->f00f_bug = 1;
+ set_cpu_bug(c, X86_BUG_F00F);
if (!f00f_workaround_enabled) {
- trap_init_f00f_bug();
- printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n");
+ pr_notice("Intel Pentium with F0 0F bug - workaround enabled.\n");
f00f_workaround_enabled = 1;
}
}
@@ -194,20 +412,30 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until
* model 3 mask 3
*/
- if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633)
+ if ((c->x86_vfm == INTEL_PENTIUM_II_KLAMATH && c->x86_stepping < 3) ||
+ c->x86_vfm < INTEL_PENTIUM_II_KLAMATH)
clear_cpu_cap(c, X86_FEATURE_SEP);
/*
- * P4 Xeon errata 037 workaround.
+ * PAE CPUID issue: many Pentium M report no PAE but may have a
+ * functionally usable PAE implementation.
+ * Forcefully enable PAE if kernel parameter "forcepae" is present.
+ */
+ if (forcepae) {
+ pr_warn("PAE forced!\n");
+ set_cpu_cap(c, X86_FEATURE_PAE);
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE);
+ }
+
+ /*
+ * P4 Xeon erratum 037 workaround.
* Hardware prefetcher may cause stale data to be loaded into the cache.
*/
- if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) {
- rdmsr(MSR_IA32_MISC_ENABLE, lo, hi);
- if ((lo & MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE) == 0) {
- printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n");
- printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n");
- lo |= MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE;
- wrmsr (MSR_IA32_MISC_ENABLE, lo, hi);
+ if (c->x86_vfm == INTEL_P4_WILLAMETTE && c->x86_stepping == 1) {
+ if (msr_set_bit(MSR_IA32_MISC_ENABLE,
+ MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT) > 0) {
+ pr_info("CPU: C0 stepping P4 Xeon detected.\n");
+ pr_info("CPU: Disabling hardware prefetching (Erratum 037)\n");
}
}
@@ -217,131 +445,102 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
* integrated APIC (see 11AP erratum in "Pentium Processor
* Specification Update").
*/
- if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
- (c->x86_mask < 0x6 || c->x86_mask == 0xb))
- set_cpu_cap(c, X86_FEATURE_11AP);
-
+ if (boot_cpu_has(X86_FEATURE_APIC) && c->x86_vfm == INTEL_PENTIUM_75 &&
+ (c->x86_stepping < 0x6 || c->x86_stepping == 0xb))
+ set_cpu_bug(c, X86_BUG_11AP);
#ifdef CONFIG_X86_INTEL_USERCOPY
/*
- * Set up the preferred alignment for movsl bulk memory moves
+ * MOVSL bulk memory moves can be slow when source and dest are not
+ * both 8-byte aligned. PII/PIII only like MOVSL with 8-byte alignment.
+ *
+ * Set the preferred alignment for Pentium Pro and newer processors, as
+ * it has only been tested on these.
*/
- switch (c->x86) {
- case 4: /* 486: untested */
- break;
- case 5: /* Old Pentia: untested */
- break;
- case 6: /* PII/PIII only like movsl with 8-byte alignment */
+ if (c->x86_vfm >= INTEL_PENTIUM_PRO)
movsl_mask.mask = 7;
- break;
- case 15: /* P4 is OK down to 8-byte alignment */
- movsl_mask.mask = 7;
- break;
- }
-#endif
-
-#ifdef CONFIG_X86_NUMAQ
- numaq_tsc_disable();
#endif
intel_smp_check(c);
}
#else
-static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
+static void intel_workarounds(struct cpuinfo_x86 *c)
{
}
#endif
-static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
+static void srat_detect_node(struct cpuinfo_x86 *c)
{
-#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
+#ifdef CONFIG_NUMA
unsigned node;
int cpu = smp_processor_id();
- int apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid;
/* Don't do the funky fallback heuristics the AMD version employs
for now. */
- node = apicid_to_node[apicid];
- if (node == NUMA_NO_NODE || !node_online(node))
- node = first_node(node_online_map);
+ node = numa_cpu_node(cpu);
+ if (node == NUMA_NO_NODE || !node_online(node)) {
+ /* reuse the value from init_cpu_to_node() */
+ node = cpu_to_node(cpu);
+ }
numa_set_node(cpu, node);
-
- printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node);
#endif
}
-/*
- * find out the number of processor cores on the die
- */
-static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
+static void init_cpuid_fault(struct cpuinfo_x86 *c)
{
- unsigned int eax, ebx, ecx, edx;
-
- if (c->cpuid_level < 4)
- return 1;
+ u64 msr;
- /* Intel has a non-standard dependency on %ecx for this CPUID level. */
- cpuid_count(4, 0, &eax, &ebx, &ecx, &edx);
- if (eax & 0x1f)
- return ((eax >> 26) + 1);
- else
- return 1;
+ if (!rdmsrq_safe(MSR_PLATFORM_INFO, &msr)) {
+ if (msr & MSR_PLATFORM_INFO_CPUID_FAULT)
+ set_cpu_cap(c, X86_FEATURE_CPUID_FAULT);
+ }
}
-static void __cpuinit detect_vmx_virtcap(struct cpuinfo_x86 *c)
+static void init_intel_misc_features(struct cpuinfo_x86 *c)
{
- /* Intel VMX MSR indicated features */
-#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000
-#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000
-#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000
-#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001
-#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002
-#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020
-
- u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2;
-
- clear_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
- clear_cpu_cap(c, X86_FEATURE_VNMI);
- clear_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
- clear_cpu_cap(c, X86_FEATURE_EPT);
- clear_cpu_cap(c, X86_FEATURE_VPID);
-
- rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
- msr_ctl = vmx_msr_high | vmx_msr_low;
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)
- set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI)
- set_cpu_cap(c, X86_FEATURE_VNMI);
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) {
- rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
- vmx_msr_low, vmx_msr_high);
- msr_ctl2 = vmx_msr_high | vmx_msr_low;
- if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) &&
- (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW))
- set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
- if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT)
- set_cpu_cap(c, X86_FEATURE_EPT);
- if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
- set_cpu_cap(c, X86_FEATURE_VPID);
- }
+ u64 msr;
+
+ if (rdmsrq_safe(MSR_MISC_FEATURES_ENABLES, &msr))
+ return;
+
+ /* Clear all MISC features */
+ this_cpu_write(msr_misc_features_shadow, 0);
+
+ /* Check features and update capabilities and shadow control bits */
+ init_cpuid_fault(c);
+ probe_xeon_phi_r3mwait(c);
+
+ msr = this_cpu_read(msr_misc_features_shadow);
+ wrmsrq(MSR_MISC_FEATURES_ENABLES, msr);
}
-static void __cpuinit init_intel(struct cpuinfo_x86 *c)
-{
- unsigned int l2 = 0;
+/*
+ * This is a list of Intel CPUs that are known to suffer from downclocking when
+ * ZMM registers (512-bit vectors) are used. On these CPUs, when the kernel
+ * executes SIMD-optimized code such as cryptography functions or CRCs, it
+ * should prefer 256-bit (YMM) code to 512-bit (ZMM) code.
+ */
+static const struct x86_cpu_id zmm_exclusion_list[] = {
+ X86_MATCH_VFM(INTEL_SKYLAKE_X, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_D, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_L, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_NNPI, 0),
+ X86_MATCH_VFM(INTEL_TIGERLAKE_L, 0),
+ X86_MATCH_VFM(INTEL_TIGERLAKE, 0),
+ /* Allow Rocket Lake and later, and Sapphire Rapids and later. */
+ {},
+};
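
A hedged sketch of how downstream SIMD glue code might honour the flag set at the end of init_intel() below; use_avx512_path() is a hypothetical helper, only the feature-bit accessors are real kernel APIs.

/* Illustrative consumer of X86_FEATURE_PREFER_YMM -- not part of this patch. */
static bool use_avx512_path(void)
{
	return boot_cpu_has(X86_FEATURE_AVX512F) &&
	       !cpu_feature_enabled(X86_FEATURE_PREFER_YMM);
}
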
+static void init_intel(struct cpuinfo_x86 *c)
+{
early_init_intel(c);
intel_workarounds(c);
- /*
- * Detect the extended topology information if available. This
- * will reinitialise the initial_apicid which will be used
- * in init_intel_cacheinfo()
- */
- detect_extended_topology(c);
+ init_intel_cacheinfo(c);
- l2 = init_intel_cacheinfo(c);
if (c->cpuid_level > 9) {
unsigned eax = cpuid_eax(10);
/* Check for version and the number of counters */
@@ -349,26 +548,33 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
}
- if (cpu_has_xmm2)
+ if (cpu_has(c, X86_FEATURE_XMM2))
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
- if (cpu_has_ds) {
- unsigned int l1;
+
+ if (boot_cpu_has(X86_FEATURE_DS)) {
+ unsigned int l1, l2;
+
rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
- if (!(l1 & (1<<11)))
+ if (!(l1 & MSR_IA32_MISC_ENABLE_BTS_UNAVAIL))
set_cpu_cap(c, X86_FEATURE_BTS);
- if (!(l1 & (1<<12)))
+ if (!(l1 & MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL))
set_cpu_cap(c, X86_FEATURE_PEBS);
- ds_init_intel(c);
}
- if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush)
- set_cpu_cap(c, X86_FEATURE_CLFLUSH_MONITOR);
+ if (boot_cpu_has(X86_FEATURE_CLFLUSH) &&
+ (c->x86_vfm == INTEL_CORE2_DUNNINGTON ||
+ c->x86_vfm == INTEL_NEHALEM_EX ||
+ c->x86_vfm == INTEL_WESTMERE_EX))
+ set_cpu_bug(c, X86_BUG_CLFLUSH_MONITOR);
+
+ if (boot_cpu_has(X86_FEATURE_MWAIT) &&
+ (c->x86_vfm == INTEL_ATOM_GOLDMONT ||
+ c->x86_vfm == INTEL_LUNARLAKE_M))
+ set_cpu_bug(c, X86_BUG_MONITOR);
#ifdef CONFIG_X86_64
if (c->x86 == 15)
c->x86_cache_alignment = c->x86_clflush_size * 2;
- if (c->x86 == 6)
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
#else
/*
* Names for the Pentium II/Celeron processors
@@ -376,22 +582,21 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
* Dixon is NOT a Celeron.
*/
if (c->x86 == 6) {
+ unsigned int l2 = c->x86_cache_size;
char *p = NULL;
switch (c->x86_model) {
case 5:
- if (c->x86_mask == 0) {
- if (l2 == 0)
- p = "Celeron (Covington)";
- else if (l2 == 256)
- p = "Mobile Pentium II (Dixon)";
- }
+ if (l2 == 0)
+ p = "Celeron (Covington)";
+ else if (l2 == 256)
+ p = "Mobile Pentium II (Dixon)";
break;
case 6:
if (l2 == 128)
p = "Celeron (Mendocino)";
- else if (c->x86_mask == 0 || c->x86_mask == 5)
+ else if (c->x86_stepping == 0 || c->x86_stepping == 5)
p = "Celeron-A";
break;
@@ -404,33 +609,25 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
if (p)
strcpy(c->x86_model_id, p);
}
-
- if (c->x86 == 15)
- set_cpu_cap(c, X86_FEATURE_P4);
- if (c->x86 == 6)
- set_cpu_cap(c, X86_FEATURE_P3);
#endif
- if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
- /*
- * let's use the legacy cpuid vector 0x1 and 0x4 for topology
- * detection.
- */
- c->x86_max_cores = intel_num_cpu_cores(c);
-#ifdef CONFIG_X86_32
- detect_ht(c);
-#endif
- }
+ if (x86_match_cpu(zmm_exclusion_list))
+ set_cpu_cap(c, X86_FEATURE_PREFER_YMM);
/* Work around errata */
srat_detect_node(c);
- if (cpu_has(c, X86_FEATURE_VMX))
- detect_vmx_virtcap(c);
+ init_ia32_feat_ctl(c);
+
+ init_intel_misc_features(c);
+
+ split_lock_init();
+
+ intel_init_thermal(c);
}
#ifdef CONFIG_X86_32
-static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
+static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
{
/*
* Intel PIII Tualatin. This comes in two flavours.
@@ -438,18 +635,98 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i
* to determine which, so we use a boottime override
* for the 512kb model, and assume 256 otherwise.
*/
- if ((c->x86 == 6) && (c->x86_model == 11) && (size == 0))
+ if (c->x86_vfm == INTEL_PENTIUM_III_TUALATIN && size == 0)
size = 256;
+
+ /*
+ * Intel Quark SoC X1000 contains a 4-way set associative
+ * 16K cache with a 16 byte cache line and 256 lines per tag
+ */
+ if (c->x86_vfm == INTEL_QUARK_X1000)
+ size = 16;
return size;
}
#endif
-static const struct cpu_dev __cpuinitconst intel_cpu_dev = {
+static void intel_tlb_lookup(const struct leaf_0x2_table *desc)
+{
+ short entries = desc->entries;
+
+ switch (desc->t_type) {
+ case STLB_4K:
+ tlb_lli_4k = max(tlb_lli_4k, entries);
+ tlb_lld_4k = max(tlb_lld_4k, entries);
+ break;
+ case STLB_4K_2M:
+ tlb_lli_4k = max(tlb_lli_4k, entries);
+ tlb_lld_4k = max(tlb_lld_4k, entries);
+ tlb_lli_2m = max(tlb_lli_2m, entries);
+ tlb_lld_2m = max(tlb_lld_2m, entries);
+ tlb_lli_4m = max(tlb_lli_4m, entries);
+ tlb_lld_4m = max(tlb_lld_4m, entries);
+ break;
+ case TLB_INST_ALL:
+ tlb_lli_4k = max(tlb_lli_4k, entries);
+ tlb_lli_2m = max(tlb_lli_2m, entries);
+ tlb_lli_4m = max(tlb_lli_4m, entries);
+ break;
+ case TLB_INST_4K:
+ tlb_lli_4k = max(tlb_lli_4k, entries);
+ break;
+ case TLB_INST_4M:
+ tlb_lli_4m = max(tlb_lli_4m, entries);
+ break;
+ case TLB_INST_2M_4M:
+ tlb_lli_2m = max(tlb_lli_2m, entries);
+ tlb_lli_4m = max(tlb_lli_4m, entries);
+ break;
+ case TLB_DATA_4K:
+ case TLB_DATA0_4K:
+ tlb_lld_4k = max(tlb_lld_4k, entries);
+ break;
+ case TLB_DATA_4M:
+ case TLB_DATA0_4M:
+ tlb_lld_4m = max(tlb_lld_4m, entries);
+ break;
+ case TLB_DATA_2M_4M:
+ case TLB_DATA0_2M_4M:
+ tlb_lld_2m = max(tlb_lld_2m, entries);
+ tlb_lld_4m = max(tlb_lld_4m, entries);
+ break;
+ case TLB_DATA_4K_4M:
+ tlb_lld_4k = max(tlb_lld_4k, entries);
+ tlb_lld_4m = max(tlb_lld_4m, entries);
+ break;
+ case TLB_DATA_1G_2M_4M:
+ tlb_lld_2m = max(tlb_lld_2m, TLB_0x63_2M_4M_ENTRIES);
+ tlb_lld_4m = max(tlb_lld_4m, TLB_0x63_2M_4M_ENTRIES);
+ fallthrough;
+ case TLB_DATA_1G:
+ tlb_lld_1g = max(tlb_lld_1g, entries);
+ break;
+ }
+}
+
+static void intel_detect_tlb(struct cpuinfo_x86 *c)
+{
+ const struct leaf_0x2_table *desc;
+ union leaf_0x2_regs regs;
+ u8 *ptr;
+
+ if (c->cpuid_level < 2)
+ return;
+
+ cpuid_leaf_0x2(&regs);
+ for_each_cpuid_0x2_desc(regs, ptr, desc)
+ intel_tlb_lookup(desc);
+}
+
+static const struct cpu_dev intel_cpu_dev = {
.c_vendor = "Intel",
.c_ident = { "GenuineIntel" },
#ifdef CONFIG_X86_32
- .c_models = {
- { .vendor = X86_VENDOR_INTEL, .family = 4, .model_names =
+ .legacy_models = {
+ { .family = 4, .model_names =
{
[0] = "486 DX-25/33",
[1] = "486 DX-50",
@@ -462,7 +739,7 @@ static const struct cpu_dev __cpuinitconst intel_cpu_dev = {
[9] = "486 DX/4-WB"
}
},
- { .vendor = X86_VENDOR_INTEL, .family = 5, .model_names =
+ { .family = 5, .model_names =
{
[0] = "Pentium 60/66 A-step",
[1] = "Pentium 60/66",
@@ -470,10 +747,11 @@ static const struct cpu_dev __cpuinitconst intel_cpu_dev = {
[3] = "OverDrive PODP5V83",
[4] = "Pentium MMX",
[7] = "Mobile Pentium 75 - 200",
- [8] = "Mobile Pentium MMX"
+ [8] = "Mobile Pentium MMX",
+ [9] = "Quark SoC X1000",
}
},
- { .vendor = X86_VENDOR_INTEL, .family = 6, .model_names =
+ { .family = 6, .model_names =
{
[0] = "Pentium Pro A-step",
[1] = "Pentium Pro",
@@ -487,7 +765,7 @@ static const struct cpu_dev __cpuinitconst intel_cpu_dev = {
[11] = "Pentium III (Tualatin)",
}
},
- { .vendor = X86_VENDOR_INTEL, .family = 15, .model_names =
+ { .family = 15, .model_names =
{
[0] = "Pentium 4 (Unknown)",
[1] = "Pentium 4 (Willamette)",
@@ -497,12 +775,13 @@ static const struct cpu_dev __cpuinitconst intel_cpu_dev = {
}
},
},
- .c_size_cache = intel_size_cache,
+ .legacy_cache_size = intel_size_cache,
#endif
+ .c_detect_tlb = intel_detect_tlb,
.c_early_init = early_init_intel,
+ .c_bsp_init = bsp_init_intel,
.c_init = init_intel,
.c_x86_vendor = X86_VENDOR_INTEL,
};
cpu_dev_register(intel_cpu_dev);
-
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
deleted file mode 100644
index 789efe217e1a..000000000000
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ /dev/null
@@ -1,1006 +0,0 @@
-/*
- * Routines to indentify caches on Intel CPU.
- *
- * Changes:
- * Venkatesh Pallipadi : Adding cache identification through cpuid(4)
- * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure.
- * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD.
- */
-
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/device.h>
-#include <linux/compiler.h>
-#include <linux/cpu.h>
-#include <linux/sched.h>
-#include <linux/pci.h>
-
-#include <asm/processor.h>
-#include <asm/smp.h>
-#include <asm/k8.h>
-
-#define LVL_1_INST 1
-#define LVL_1_DATA 2
-#define LVL_2 3
-#define LVL_3 4
-#define LVL_TRACE 5
-
-struct _cache_table
-{
- unsigned char descriptor;
- char cache_type;
- short size;
-};
-
-/* all the cache descriptor types we care about (no TLB or trace cache entries) */
-static const struct _cache_table __cpuinitconst cache_table[] =
-{
- { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */
- { 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */
- { 0x09, LVL_1_INST, 32 }, /* 4-way set assoc, 64 byte line size */
- { 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */
- { 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */
- { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */
- { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */
- { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */
- { 0x23, LVL_3, 1024 }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x25, LVL_3, 2048 }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x29, LVL_3, 4096 }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x2c, LVL_1_DATA, 32 }, /* 8-way set assoc, 64 byte line size */
- { 0x30, LVL_1_INST, 32 }, /* 8-way set assoc, 64 byte line size */
- { 0x39, LVL_2, 128 }, /* 4-way set assoc, sectored cache, 64 byte line size */
- { 0x3a, LVL_2, 192 }, /* 6-way set assoc, sectored cache, 64 byte line size */
- { 0x3b, LVL_2, 128 }, /* 2-way set assoc, sectored cache, 64 byte line size */
- { 0x3c, LVL_2, 256 }, /* 4-way set assoc, sectored cache, 64 byte line size */
- { 0x3d, LVL_2, 384 }, /* 6-way set assoc, sectored cache, 64 byte line size */
- { 0x3e, LVL_2, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */
- { 0x3f, LVL_2, 256 }, /* 2-way set assoc, 64 byte line size */
- { 0x41, LVL_2, 128 }, /* 4-way set assoc, 32 byte line size */
- { 0x42, LVL_2, 256 }, /* 4-way set assoc, 32 byte line size */
- { 0x43, LVL_2, 512 }, /* 4-way set assoc, 32 byte line size */
- { 0x44, LVL_2, 1024 }, /* 4-way set assoc, 32 byte line size */
- { 0x45, LVL_2, 2048 }, /* 4-way set assoc, 32 byte line size */
- { 0x46, LVL_3, 4096 }, /* 4-way set assoc, 64 byte line size */
- { 0x47, LVL_3, 8192 }, /* 8-way set assoc, 64 byte line size */
- { 0x49, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */
- { 0x4a, LVL_3, 6144 }, /* 12-way set assoc, 64 byte line size */
- { 0x4b, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */
- { 0x4c, LVL_3, 12288 }, /* 12-way set assoc, 64 byte line size */
- { 0x4d, LVL_3, 16384 }, /* 16-way set assoc, 64 byte line size */
- { 0x4e, LVL_2, 6144 }, /* 24-way set assoc, 64 byte line size */
- { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */
- { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */
- { 0x68, LVL_1_DATA, 32 }, /* 4-way set assoc, sectored cache, 64 byte line size */
- { 0x70, LVL_TRACE, 12 }, /* 8-way set assoc */
- { 0x71, LVL_TRACE, 16 }, /* 8-way set assoc */
- { 0x72, LVL_TRACE, 32 }, /* 8-way set assoc */
- { 0x73, LVL_TRACE, 64 }, /* 8-way set assoc */
- { 0x78, LVL_2, 1024 }, /* 4-way set assoc, 64 byte line size */
- { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x7b, LVL_2, 512 }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x7c, LVL_2, 1024 }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x7d, LVL_2, 2048 }, /* 8-way set assoc, 64 byte line size */
- { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */
- { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */
- { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */
- { 0x84, LVL_2, 1024 }, /* 8-way set assoc, 32 byte line size */
- { 0x85, LVL_2, 2048 }, /* 8-way set assoc, 32 byte line size */
- { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */
- { 0x87, LVL_2, 1024 }, /* 8-way set assoc, 64 byte line size */
- { 0xd0, LVL_3, 512 }, /* 4-way set assoc, 64 byte line size */
- { 0xd1, LVL_3, 1024 }, /* 4-way set assoc, 64 byte line size */
- { 0xd2, LVL_3, 2048 }, /* 4-way set assoc, 64 byte line size */
- { 0xd6, LVL_3, 1024 }, /* 8-way set assoc, 64 byte line size */
- { 0xd7, LVL_3, 2038 }, /* 8-way set assoc, 64 byte line size */
- { 0xd8, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */
- { 0xdc, LVL_3, 2048 }, /* 12-way set assoc, 64 byte line size */
- { 0xdd, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */
- { 0xde, LVL_3, 8192 }, /* 12-way set assoc, 64 byte line size */
- { 0xe2, LVL_3, 2048 }, /* 16-way set assoc, 64 byte line size */
- { 0xe3, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */
- { 0xe4, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */
- { 0x00, 0, 0}
-};
-
-
-enum _cache_type
-{
- CACHE_TYPE_NULL = 0,
- CACHE_TYPE_DATA = 1,
- CACHE_TYPE_INST = 2,
- CACHE_TYPE_UNIFIED = 3
-};
-
-union _cpuid4_leaf_eax {
- struct {
- enum _cache_type type:5;
- unsigned int level:3;
- unsigned int is_self_initializing:1;
- unsigned int is_fully_associative:1;
- unsigned int reserved:4;
- unsigned int num_threads_sharing:12;
- unsigned int num_cores_on_die:6;
- } split;
- u32 full;
-};
-
-union _cpuid4_leaf_ebx {
- struct {
- unsigned int coherency_line_size:12;
- unsigned int physical_line_partition:10;
- unsigned int ways_of_associativity:10;
- } split;
- u32 full;
-};
-
-union _cpuid4_leaf_ecx {
- struct {
- unsigned int number_of_sets:32;
- } split;
- u32 full;
-};
-
-struct _cpuid4_info {
- union _cpuid4_leaf_eax eax;
- union _cpuid4_leaf_ebx ebx;
- union _cpuid4_leaf_ecx ecx;
- unsigned long size;
- unsigned long can_disable;
- DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
-};
-
-/* subset of above _cpuid4_info w/o shared_cpu_map */
-struct _cpuid4_info_regs {
- union _cpuid4_leaf_eax eax;
- union _cpuid4_leaf_ebx ebx;
- union _cpuid4_leaf_ecx ecx;
- unsigned long size;
- unsigned long can_disable;
-};
-
-unsigned short num_cache_leaves;
-
-/* AMD doesn't have CPUID4. Emulate it here to report the same
- information to the user. This makes some assumptions about the machine:
- L2 not shared, no SMT etc. that is currently true on AMD CPUs.
-
- In theory the TLBs could be reported as fake type (they are in "dummy").
- Maybe later */
-union l1_cache {
- struct {
- unsigned line_size : 8;
- unsigned lines_per_tag : 8;
- unsigned assoc : 8;
- unsigned size_in_kb : 8;
- };
- unsigned val;
-};
-
-union l2_cache {
- struct {
- unsigned line_size : 8;
- unsigned lines_per_tag : 4;
- unsigned assoc : 4;
- unsigned size_in_kb : 16;
- };
- unsigned val;
-};
-
-union l3_cache {
- struct {
- unsigned line_size : 8;
- unsigned lines_per_tag : 4;
- unsigned assoc : 4;
- unsigned res : 2;
- unsigned size_encoded : 14;
- };
- unsigned val;
-};
-
-static const unsigned short __cpuinitconst assocs[] = {
- [1] = 1,
- [2] = 2,
- [4] = 4,
- [6] = 8,
- [8] = 16,
- [0xa] = 32,
- [0xb] = 48,
- [0xc] = 64,
- [0xd] = 96,
- [0xe] = 128,
- [0xf] = 0xffff /* fully associative - no way to show this currently */
-};
-
-static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 };
-static const unsigned char __cpuinitconst types[] = { 1, 2, 3, 3 };
-
-static void __cpuinit
-amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
- union _cpuid4_leaf_ebx *ebx,
- union _cpuid4_leaf_ecx *ecx)
-{
- unsigned dummy;
- unsigned line_size, lines_per_tag, assoc, size_in_kb;
- union l1_cache l1i, l1d;
- union l2_cache l2;
- union l3_cache l3;
- union l1_cache *l1 = &l1d;
-
- eax->full = 0;
- ebx->full = 0;
- ecx->full = 0;
-
- cpuid(0x80000005, &dummy, &dummy, &l1d.val, &l1i.val);
- cpuid(0x80000006, &dummy, &dummy, &l2.val, &l3.val);
-
- switch (leaf) {
- case 1:
- l1 = &l1i;
- case 0:
- if (!l1->val)
- return;
- assoc = l1->assoc;
- line_size = l1->line_size;
- lines_per_tag = l1->lines_per_tag;
- size_in_kb = l1->size_in_kb;
- break;
- case 2:
- if (!l2.val)
- return;
- assoc = l2.assoc;
- line_size = l2.line_size;
- lines_per_tag = l2.lines_per_tag;
- /* cpu_data has errata corrections for K7 applied */
- size_in_kb = current_cpu_data.x86_cache_size;
- break;
- case 3:
- if (!l3.val)
- return;
- assoc = l3.assoc;
- line_size = l3.line_size;
- lines_per_tag = l3.lines_per_tag;
- size_in_kb = l3.size_encoded * 512;
- break;
- default:
- return;
- }
-
- eax->split.is_self_initializing = 1;
- eax->split.type = types[leaf];
- eax->split.level = levels[leaf];
- if (leaf == 3)
- eax->split.num_threads_sharing =
- current_cpu_data.x86_max_cores - 1;
- else
- eax->split.num_threads_sharing = 0;
- eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1;
-
-
- if (assoc == 0xf)
- eax->split.is_fully_associative = 1;
- ebx->split.coherency_line_size = line_size - 1;
- ebx->split.ways_of_associativity = assocs[assoc] - 1;
- ebx->split.physical_line_partition = lines_per_tag - 1;
- ecx->split.number_of_sets = (size_in_kb * 1024) / line_size /
- (ebx->split.ways_of_associativity + 1) - 1;
-}
-
-static void __cpuinit
-amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
-{
- if (index < 3)
- return;
-
- if (boot_cpu_data.x86 == 0x11)
- return;
-
- /* see erratum #382 */
- if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model < 0x8))
- return;
-
- this_leaf->can_disable = 1;
-}
-
-static int
-__cpuinit cpuid4_cache_lookup_regs(int index,
- struct _cpuid4_info_regs *this_leaf)
-{
- union _cpuid4_leaf_eax eax;
- union _cpuid4_leaf_ebx ebx;
- union _cpuid4_leaf_ecx ecx;
- unsigned edx;
-
- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
- amd_cpuid4(index, &eax, &ebx, &ecx);
- if (boot_cpu_data.x86 >= 0x10)
- amd_check_l3_disable(index, this_leaf);
- } else {
- cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
- }
-
- if (eax.split.type == CACHE_TYPE_NULL)
- return -EIO; /* better error ? */
-
- this_leaf->eax = eax;
- this_leaf->ebx = ebx;
- this_leaf->ecx = ecx;
- this_leaf->size = (ecx.split.number_of_sets + 1) *
- (ebx.split.coherency_line_size + 1) *
- (ebx.split.physical_line_partition + 1) *
- (ebx.split.ways_of_associativity + 1);
- return 0;
-}
-
-static int __cpuinit find_num_cache_leaves(void)
-{
- unsigned int eax, ebx, ecx, edx;
- union _cpuid4_leaf_eax cache_eax;
- int i = -1;
-
- do {
- ++i;
- /* Do cpuid(4) loop to find out num_cache_leaves */
- cpuid_count(4, i, &eax, &ebx, &ecx, &edx);
- cache_eax.full = eax;
- } while (cache_eax.split.type != CACHE_TYPE_NULL);
- return i;
-}
-
-unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
-{
- unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; /* Cache sizes */
- unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */
- unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */
- unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
-#ifdef CONFIG_X86_HT
- unsigned int cpu = c->cpu_index;
-#endif
-
- if (c->cpuid_level > 3) {
- static int is_initialized;
-
- if (is_initialized == 0) {
- /* Init num_cache_leaves from boot CPU */
- num_cache_leaves = find_num_cache_leaves();
- is_initialized++;
- }
-
- /*
- * Whenever possible use cpuid(4), deterministic cache
- * parameters cpuid leaf to find the cache details
- */
- for (i = 0; i < num_cache_leaves; i++) {
- struct _cpuid4_info_regs this_leaf;
- int retval;
-
- retval = cpuid4_cache_lookup_regs(i, &this_leaf);
- if (retval >= 0) {
- switch(this_leaf.eax.split.level) {
- case 1:
- if (this_leaf.eax.split.type ==
- CACHE_TYPE_DATA)
- new_l1d = this_leaf.size/1024;
- else if (this_leaf.eax.split.type ==
- CACHE_TYPE_INST)
- new_l1i = this_leaf.size/1024;
- break;
- case 2:
- new_l2 = this_leaf.size/1024;
- num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
- index_msb = get_count_order(num_threads_sharing);
- l2_id = c->apicid >> index_msb;
- break;
- case 3:
- new_l3 = this_leaf.size/1024;
- num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
- index_msb = get_count_order(num_threads_sharing);
- l3_id = c->apicid >> index_msb;
- break;
- default:
- break;
- }
- }
- }
- }
- /*
- * Don't use cpuid2 if cpuid4 is supported. For P4, we use cpuid2 for
- * trace cache
- */
- if ((num_cache_leaves == 0 || c->x86 == 15) && c->cpuid_level > 1) {
- /* supports eax=2 call */
- int j, n;
- unsigned int regs[4];
- unsigned char *dp = (unsigned char *)regs;
- int only_trace = 0;
-
- if (num_cache_leaves != 0 && c->x86 == 15)
- only_trace = 1;
-
- /* Number of times to iterate */
- n = cpuid_eax(2) & 0xFF;
-
- for ( i = 0 ; i < n ; i++ ) {
- cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]);
-
- /* If bit 31 is set, this is an unknown format */
- for ( j = 0 ; j < 3 ; j++ ) {
- if (regs[j] & (1 << 31)) regs[j] = 0;
- }
-
- /* Byte 0 is level count, not a descriptor */
- for ( j = 1 ; j < 16 ; j++ ) {
- unsigned char des = dp[j];
- unsigned char k = 0;
-
- /* look up this descriptor in the table */
- while (cache_table[k].descriptor != 0)
- {
- if (cache_table[k].descriptor == des) {
- if (only_trace && cache_table[k].cache_type != LVL_TRACE)
- break;
- switch (cache_table[k].cache_type) {
- case LVL_1_INST:
- l1i += cache_table[k].size;
- break;
- case LVL_1_DATA:
- l1d += cache_table[k].size;
- break;
- case LVL_2:
- l2 += cache_table[k].size;
- break;
- case LVL_3:
- l3 += cache_table[k].size;
- break;
- case LVL_TRACE:
- trace += cache_table[k].size;
- break;
- }
-
- break;
- }
-
- k++;
- }
- }
- }
- }
-
- if (new_l1d)
- l1d = new_l1d;
-
- if (new_l1i)
- l1i = new_l1i;
-
- if (new_l2) {
- l2 = new_l2;
-#ifdef CONFIG_X86_HT
- per_cpu(cpu_llc_id, cpu) = l2_id;
-#endif
- }
-
- if (new_l3) {
- l3 = new_l3;
-#ifdef CONFIG_X86_HT
- per_cpu(cpu_llc_id, cpu) = l3_id;
-#endif
- }
-
- if (trace)
- printk (KERN_INFO "CPU: Trace cache: %dK uops", trace);
- else if ( l1i )
- printk (KERN_INFO "CPU: L1 I cache: %dK", l1i);
-
- if (l1d)
- printk(", L1 D cache: %dK\n", l1d);
- else
- printk("\n");
-
- if (l2)
- printk(KERN_INFO "CPU: L2 cache: %dK\n", l2);
-
- if (l3)
- printk(KERN_INFO "CPU: L3 cache: %dK\n", l3);
-
- c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d));
-
- return l2;
-}
-
-#ifdef CONFIG_SYSFS
-
-/* pointer to _cpuid4_info array (for each cache leaf) */
-static DEFINE_PER_CPU(struct _cpuid4_info *, cpuid4_info);
-#define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y]))
-
-#ifdef CONFIG_SMP
-static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
-{
- struct _cpuid4_info *this_leaf, *sibling_leaf;
- unsigned long num_threads_sharing;
- int index_msb, i;
- struct cpuinfo_x86 *c = &cpu_data(cpu);
-
- this_leaf = CPUID4_INFO_IDX(cpu, index);
- num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing;
-
- if (num_threads_sharing == 1)
- cpumask_set_cpu(cpu, to_cpumask(this_leaf->shared_cpu_map));
- else {
- index_msb = get_count_order(num_threads_sharing);
-
- for_each_online_cpu(i) {
- if (cpu_data(i).apicid >> index_msb ==
- c->apicid >> index_msb) {
- cpumask_set_cpu(i,
- to_cpumask(this_leaf->shared_cpu_map));
- if (i != cpu && per_cpu(cpuid4_info, i)) {
- sibling_leaf =
- CPUID4_INFO_IDX(i, index);
- cpumask_set_cpu(cpu, to_cpumask(
- sibling_leaf->shared_cpu_map));
- }
- }
- }
- }
-}
-static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index)
-{
- struct _cpuid4_info *this_leaf, *sibling_leaf;
- int sibling;
-
- this_leaf = CPUID4_INFO_IDX(cpu, index);
- for_each_cpu(sibling, to_cpumask(this_leaf->shared_cpu_map)) {
- sibling_leaf = CPUID4_INFO_IDX(sibling, index);
- cpumask_clear_cpu(cpu,
- to_cpumask(sibling_leaf->shared_cpu_map));
- }
-}
-#else
-static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) {}
-static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) {}
-#endif
-
-static void __cpuinit free_cache_attributes(unsigned int cpu)
-{
- int i;
-
- for (i = 0; i < num_cache_leaves; i++)
- cache_remove_shared_cpu_map(cpu, i);
-
- kfree(per_cpu(cpuid4_info, cpu));
- per_cpu(cpuid4_info, cpu) = NULL;
-}
-
-static int
-__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
-{
- struct _cpuid4_info_regs *leaf_regs =
- (struct _cpuid4_info_regs *)this_leaf;
-
- return cpuid4_cache_lookup_regs(index, leaf_regs);
-}
-
-static void __cpuinit get_cpu_leaves(void *_retval)
-{
- int j, *retval = _retval, cpu = smp_processor_id();
-
- /* Do cpuid and store the results */
- for (j = 0; j < num_cache_leaves; j++) {
- struct _cpuid4_info *this_leaf;
- this_leaf = CPUID4_INFO_IDX(cpu, j);
- *retval = cpuid4_cache_lookup(j, this_leaf);
- if (unlikely(*retval < 0)) {
- int i;
-
- for (i = 0; i < j; i++)
- cache_remove_shared_cpu_map(cpu, i);
- break;
- }
- cache_shared_cpu_map_setup(cpu, j);
- }
-}
-
-static int __cpuinit detect_cache_attributes(unsigned int cpu)
-{
- int retval;
-
- if (num_cache_leaves == 0)
- return -ENOENT;
-
- per_cpu(cpuid4_info, cpu) = kzalloc(
- sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
- if (per_cpu(cpuid4_info, cpu) == NULL)
- return -ENOMEM;
-
- smp_call_function_single(cpu, get_cpu_leaves, &retval, true);
- if (retval) {
- kfree(per_cpu(cpuid4_info, cpu));
- per_cpu(cpuid4_info, cpu) = NULL;
- }
-
- return retval;
-}
-
-#include <linux/kobject.h>
-#include <linux/sysfs.h>
-
-extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */
-
-/* pointer to kobject for cpuX/cache */
-static DEFINE_PER_CPU(struct kobject *, cache_kobject);
-
-struct _index_kobject {
- struct kobject kobj;
- unsigned int cpu;
- unsigned short index;
-};
-
-/* pointer to array of kobjects for cpuX/cache/indexY */
-static DEFINE_PER_CPU(struct _index_kobject *, index_kobject);
-#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(index_kobject, x))[y]))
-
-#define show_one_plus(file_name, object, val) \
-static ssize_t show_##file_name \
- (struct _cpuid4_info *this_leaf, char *buf) \
-{ \
- return sprintf (buf, "%lu\n", (unsigned long)this_leaf->object + val); \
-}
-
-show_one_plus(level, eax.split.level, 0);
-show_one_plus(coherency_line_size, ebx.split.coherency_line_size, 1);
-show_one_plus(physical_line_partition, ebx.split.physical_line_partition, 1);
-show_one_plus(ways_of_associativity, ebx.split.ways_of_associativity, 1);
-show_one_plus(number_of_sets, ecx.split.number_of_sets, 1);
-
-static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf)
-{
- return sprintf (buf, "%luK\n", this_leaf->size / 1024);
-}
-
-static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
- int type, char *buf)
-{
- ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf;
- int n = 0;
-
- if (len > 1) {
- const struct cpumask *mask;
-
- mask = to_cpumask(this_leaf->shared_cpu_map);
- n = type?
- cpulist_scnprintf(buf, len-2, mask) :
- cpumask_scnprintf(buf, len-2, mask);
- buf[n++] = '\n';
- buf[n] = '\0';
- }
- return n;
-}
-
-static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf)
-{
- return show_shared_cpu_map_func(leaf, 0, buf);
-}
-
-static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf)
-{
- return show_shared_cpu_map_func(leaf, 1, buf);
-}
-
-static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf)
-{
- switch (this_leaf->eax.split.type) {
- case CACHE_TYPE_DATA:
- return sprintf(buf, "Data\n");
- case CACHE_TYPE_INST:
- return sprintf(buf, "Instruction\n");
- case CACHE_TYPE_UNIFIED:
- return sprintf(buf, "Unified\n");
- default:
- return sprintf(buf, "Unknown\n");
- }
-}
-
-#define to_object(k) container_of(k, struct _index_kobject, kobj)
-#define to_attr(a) container_of(a, struct _cache_attr, attr)
-
-static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
- unsigned int index)
-{
- int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
- int node = cpu_to_node(cpu);
- struct pci_dev *dev = node_to_k8_nb_misc(node);
- unsigned int reg = 0;
-
- if (!this_leaf->can_disable)
- return -EINVAL;
-
- if (!dev)
- return -EINVAL;
-
- pci_read_config_dword(dev, 0x1BC + index * 4, &reg);
- return sprintf(buf, "%x\n", reg);
-}
-
-#define SHOW_CACHE_DISABLE(index) \
-static ssize_t \
-show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \
-{ \
- return show_cache_disable(this_leaf, buf, index); \
-}
-SHOW_CACHE_DISABLE(0)
-SHOW_CACHE_DISABLE(1)
-
-static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
- const char *buf, size_t count, unsigned int index)
-{
- int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
- int node = cpu_to_node(cpu);
- struct pci_dev *dev = node_to_k8_nb_misc(node);
- unsigned long val = 0;
- unsigned int scrubber = 0;
-
- if (!this_leaf->can_disable)
- return -EINVAL;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (!dev)
- return -EINVAL;
-
- if (strict_strtoul(buf, 10, &val) < 0)
- return -EINVAL;
-
- val |= 0xc0000000;
-
- pci_read_config_dword(dev, 0x58, &scrubber);
- scrubber &= ~0x1f000000;
- pci_write_config_dword(dev, 0x58, scrubber);
-
- pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000);
- wbinvd();
- pci_write_config_dword(dev, 0x1BC + index * 4, val);
- return count;
-}
-
-#define STORE_CACHE_DISABLE(index) \
-static ssize_t \
-store_cache_disable_##index(struct _cpuid4_info *this_leaf, \
- const char *buf, size_t count) \
-{ \
- return store_cache_disable(this_leaf, buf, count, index); \
-}
-STORE_CACHE_DISABLE(0)
-STORE_CACHE_DISABLE(1)
-
-struct _cache_attr {
- struct attribute attr;
- ssize_t (*show)(struct _cpuid4_info *, char *);
- ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count);
-};
-
-#define define_one_ro(_name) \
-static struct _cache_attr _name = \
- __ATTR(_name, 0444, show_##_name, NULL)
-
-define_one_ro(level);
-define_one_ro(type);
-define_one_ro(coherency_line_size);
-define_one_ro(physical_line_partition);
-define_one_ro(ways_of_associativity);
-define_one_ro(number_of_sets);
-define_one_ro(size);
-define_one_ro(shared_cpu_map);
-define_one_ro(shared_cpu_list);
-
-static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
- show_cache_disable_0, store_cache_disable_0);
-static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
- show_cache_disable_1, store_cache_disable_1);
-
-static struct attribute * default_attrs[] = {
- &type.attr,
- &level.attr,
- &coherency_line_size.attr,
- &physical_line_partition.attr,
- &ways_of_associativity.attr,
- &number_of_sets.attr,
- &size.attr,
- &shared_cpu_map.attr,
- &shared_cpu_list.attr,
- &cache_disable_0.attr,
- &cache_disable_1.attr,
- NULL
-};
-
-static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf)
-{
- struct _cache_attr *fattr = to_attr(attr);
- struct _index_kobject *this_leaf = to_object(kobj);
- ssize_t ret;
-
- ret = fattr->show ?
- fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
- buf) :
- 0;
- return ret;
-}
-
-static ssize_t store(struct kobject * kobj, struct attribute * attr,
- const char * buf, size_t count)
-{
- struct _cache_attr *fattr = to_attr(attr);
- struct _index_kobject *this_leaf = to_object(kobj);
- ssize_t ret;
-
- ret = fattr->store ?
- fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
- buf, count) :
- 0;
- return ret;
-}
-
-static struct sysfs_ops sysfs_ops = {
- .show = show,
- .store = store,
-};
-
-static struct kobj_type ktype_cache = {
- .sysfs_ops = &sysfs_ops,
- .default_attrs = default_attrs,
-};
-
-static struct kobj_type ktype_percpu_entry = {
- .sysfs_ops = &sysfs_ops,
-};
-
-static void __cpuinit cpuid4_cache_sysfs_exit(unsigned int cpu)
-{
- kfree(per_cpu(cache_kobject, cpu));
- kfree(per_cpu(index_kobject, cpu));
- per_cpu(cache_kobject, cpu) = NULL;
- per_cpu(index_kobject, cpu) = NULL;
- free_cache_attributes(cpu);
-}
-
-static int __cpuinit cpuid4_cache_sysfs_init(unsigned int cpu)
-{
- int err;
-
- if (num_cache_leaves == 0)
- return -ENOENT;
-
- err = detect_cache_attributes(cpu);
- if (err)
- return err;
-
- /* Allocate all required memory */
- per_cpu(cache_kobject, cpu) =
- kzalloc(sizeof(struct kobject), GFP_KERNEL);
- if (unlikely(per_cpu(cache_kobject, cpu) == NULL))
- goto err_out;
-
- per_cpu(index_kobject, cpu) = kzalloc(
- sizeof(struct _index_kobject ) * num_cache_leaves, GFP_KERNEL);
- if (unlikely(per_cpu(index_kobject, cpu) == NULL))
- goto err_out;
-
- return 0;
-
-err_out:
- cpuid4_cache_sysfs_exit(cpu);
- return -ENOMEM;
-}
-
-static DECLARE_BITMAP(cache_dev_map, NR_CPUS);
-
-/* Add/Remove cache interface for CPU device */
-static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
-{
- unsigned int cpu = sys_dev->id;
- unsigned long i, j;
- struct _index_kobject *this_object;
- int retval;
-
- retval = cpuid4_cache_sysfs_init(cpu);
- if (unlikely(retval < 0))
- return retval;
-
- retval = kobject_init_and_add(per_cpu(cache_kobject, cpu),
- &ktype_percpu_entry,
- &sys_dev->kobj, "%s", "cache");
- if (retval < 0) {
- cpuid4_cache_sysfs_exit(cpu);
- return retval;
- }
-
- for (i = 0; i < num_cache_leaves; i++) {
- this_object = INDEX_KOBJECT_PTR(cpu,i);
- this_object->cpu = cpu;
- this_object->index = i;
- retval = kobject_init_and_add(&(this_object->kobj),
- &ktype_cache,
- per_cpu(cache_kobject, cpu),
- "index%1lu", i);
- if (unlikely(retval)) {
- for (j = 0; j < i; j++) {
- kobject_put(&(INDEX_KOBJECT_PTR(cpu,j)->kobj));
- }
- kobject_put(per_cpu(cache_kobject, cpu));
- cpuid4_cache_sysfs_exit(cpu);
- return retval;
- }
- kobject_uevent(&(this_object->kobj), KOBJ_ADD);
- }
- cpumask_set_cpu(cpu, to_cpumask(cache_dev_map));
-
- kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD);
- return 0;
-}
-
-static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
-{
- unsigned int cpu = sys_dev->id;
- unsigned long i;
-
- if (per_cpu(cpuid4_info, cpu) == NULL)
- return;
- if (!cpumask_test_cpu(cpu, to_cpumask(cache_dev_map)))
- return;
- cpumask_clear_cpu(cpu, to_cpumask(cache_dev_map));
-
- for (i = 0; i < num_cache_leaves; i++)
- kobject_put(&(INDEX_KOBJECT_PTR(cpu,i)->kobj));
- kobject_put(per_cpu(cache_kobject, cpu));
- cpuid4_cache_sysfs_exit(cpu);
-}
-
-static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
-{
- unsigned int cpu = (unsigned long)hcpu;
- struct sys_device *sys_dev;
-
- sys_dev = get_cpu_sysdev(cpu);
- switch (action) {
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- cache_add_dev(sys_dev);
- break;
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- cache_remove_dev(sys_dev);
- break;
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block __cpuinitdata cacheinfo_cpu_notifier =
-{
- .notifier_call = cacheinfo_cpu_callback,
-};
-
-static int __cpuinit cache_sysfs_init(void)
-{
- int i;
-
- if (num_cache_leaves == 0)
- return 0;
-
- for_each_online_cpu(i) {
- int err;
- struct sys_device *sys_dev = get_cpu_sysdev(i);
-
- err = cache_add_dev(sys_dev);
- if (err)
- return err;
- }
- register_hotcpu_notifier(&cacheinfo_cpu_notifier);
- return 0;
-}
-
-device_initcall(cache_sysfs_init);
-
-#endif
diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c
new file mode 100644
index 000000000000..2c56f8730f59
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_epb.c
@@ -0,0 +1,244 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel Performance and Energy Bias Hint support.
+ *
+ * Copyright (C) 2019 Intel Corporation
+ *
+ * Author:
+ * Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+ */
+
+#include <linux/cpuhotplug.h>
+#include <linux/cpu.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/syscore_ops.h>
+#include <linux/pm.h>
+
+#include <asm/cpu_device_id.h>
+#include <asm/cpufeature.h>
+#include <asm/msr.h>
+
+/**
+ * DOC: overview
+ *
+ * The Performance and Energy Bias Hint (EPB) allows software to specify its
+ * preference with respect to the power-performance tradeoffs present in the
+ * processor. Generally, the EPB is expected to be set by user space (directly
+ * via sysfs or with the help of the x86_energy_perf_policy tool), but there are
+ * two reasons for the kernel to update it.
+ *
+ * First, there are systems where the platform firmware resets the EPB during
+ * system-wide transitions from sleep states back into the working state
+ * effectively causing the previous EPB updates by user space to be lost.
+ * Thus the kernel needs to save the current EPB values for all CPUs during
+ * system-wide transitions to sleep states and restore them on the way back to
+ * the working state. That can be achieved by saving EPB for secondary CPUs
+ * when they are taken offline during transitions into system sleep states and
+ * for the boot CPU in a syscore suspend operation, so that it can be restored
+ * for the boot CPU in a syscore resume operation and for the other CPUs when
+ * they are brought back online. However, CPUs that are already offline when
+ * a system-wide PM transition is started are not taken offline again, but their
+ * EPB values may still be reset by the platform firmware during the transition,
+ * so in fact it is necessary to save the EPB of any CPU taken offline and to
+ * restore it when the given CPU goes back online at all times.
+ *
+ * Second, on many systems the initial EPB value coming from the platform
+ * firmware is 0 ('performance') and at least on some of them that is because
+ * the platform firmware does not initialize EPB at all with the assumption that
+ * the OS will do that anyway. That sometimes is problematic, as it may cause
+ * the system battery to drain too fast, for example, so it is better to adjust
+ * it on CPU bring-up and if the initial EPB value for a given CPU is 0, the
+ * kernel changes it to 6 ('normal').
+ */
+
+static DEFINE_PER_CPU(u8, saved_epb);
+
+#define EPB_MASK 0x0fULL
+#define EPB_SAVED 0x10ULL
+#define MAX_EPB EPB_MASK
+
+enum energy_perf_value_index {
+ EPB_INDEX_PERFORMANCE,
+ EPB_INDEX_BALANCE_PERFORMANCE,
+ EPB_INDEX_NORMAL,
+ EPB_INDEX_BALANCE_POWERSAVE,
+ EPB_INDEX_POWERSAVE,
+};
+
+static u8 energ_perf_values[] = {
+ [EPB_INDEX_PERFORMANCE] = ENERGY_PERF_BIAS_PERFORMANCE,
+ [EPB_INDEX_BALANCE_PERFORMANCE] = ENERGY_PERF_BIAS_BALANCE_PERFORMANCE,
+ [EPB_INDEX_NORMAL] = ENERGY_PERF_BIAS_NORMAL,
+ [EPB_INDEX_BALANCE_POWERSAVE] = ENERGY_PERF_BIAS_BALANCE_POWERSAVE,
+ [EPB_INDEX_POWERSAVE] = ENERGY_PERF_BIAS_POWERSAVE,
+};
+
+static int intel_epb_save(void *data)
+{
+ u64 epb;
+
+ rdmsrq(MSR_IA32_ENERGY_PERF_BIAS, epb);
+ /*
+ * Ensure that saved_epb will always be nonzero after this write even if
+ * the EPB value read from the MSR is 0.
+ */
+ this_cpu_write(saved_epb, (epb & EPB_MASK) | EPB_SAVED);
+
+ return 0;
+}
+
+static void intel_epb_restore(void *data)
+{
+ u64 val = this_cpu_read(saved_epb);
+ u64 epb;
+
+ rdmsrq(MSR_IA32_ENERGY_PERF_BIAS, epb);
+ if (val) {
+ val &= EPB_MASK;
+ } else {
+ /*
+ * Because intel_epb_save() has not run for the current CPU yet,
+ * it is going online for the first time, so if its EPB value is
+ * 0 ('performance') at this point, assume that it has not been
+ * initialized by the platform firmware and set it to 6
+ * ('normal').
+ */
+ val = epb & EPB_MASK;
+ if (val == ENERGY_PERF_BIAS_PERFORMANCE) {
+ val = energ_perf_values[EPB_INDEX_NORMAL];
+ pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n");
+ }
+ }
+ wrmsrq(MSR_IA32_ENERGY_PERF_BIAS, (epb & ~EPB_MASK) | val);
+}
+
+static const struct syscore_ops intel_epb_syscore_ops = {
+ .suspend = intel_epb_save,
+ .resume = intel_epb_restore,
+};
+
+static struct syscore intel_epb_syscore = {
+ .ops = &intel_epb_syscore_ops,
+};
+
+static const char * const energy_perf_strings[] = {
+ [EPB_INDEX_PERFORMANCE] = "performance",
+ [EPB_INDEX_BALANCE_PERFORMANCE] = "balance-performance",
+ [EPB_INDEX_NORMAL] = "normal",
+ [EPB_INDEX_BALANCE_POWERSAVE] = "balance-power",
+ [EPB_INDEX_POWERSAVE] = "power",
+};
+
+static ssize_t energy_perf_bias_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ unsigned int cpu = dev->id;
+ u64 epb;
+ int ret;
+
+ ret = rdmsrq_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb);
+ if (ret < 0)
+ return ret;
+
+ return sprintf(buf, "%llu\n", epb);
+}
+
+static ssize_t energy_perf_bias_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned int cpu = dev->id;
+ u64 epb, val;
+ int ret;
+
+ ret = __sysfs_match_string(energy_perf_strings,
+ ARRAY_SIZE(energy_perf_strings), buf);
+ if (ret >= 0)
+ val = energ_perf_values[ret];
+ else if (kstrtou64(buf, 0, &val) || val > MAX_EPB)
+ return -EINVAL;
+
+ ret = rdmsrq_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb);
+ if (ret < 0)
+ return ret;
+
+ ret = wrmsrq_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS,
+ (epb & ~EPB_MASK) | val);
+ if (ret < 0)
+ return ret;
+
+ return count;
+}
+
+static DEVICE_ATTR_RW(energy_perf_bias);
+
+static struct attribute *intel_epb_attrs[] = {
+ &dev_attr_energy_perf_bias.attr,
+ NULL
+};
+
+static const struct attribute_group intel_epb_attr_group = {
+ .name = power_group_name,
+ .attrs = intel_epb_attrs
+};
+
+static int intel_epb_online(unsigned int cpu)
+{
+ struct device *cpu_dev = get_cpu_device(cpu);
+
+ intel_epb_restore(NULL);
+ if (!cpuhp_tasks_frozen)
+ sysfs_merge_group(&cpu_dev->kobj, &intel_epb_attr_group);
+
+ return 0;
+}
+
+static int intel_epb_offline(unsigned int cpu)
+{
+ struct device *cpu_dev = get_cpu_device(cpu);
+
+ if (!cpuhp_tasks_frozen)
+ sysfs_unmerge_group(&cpu_dev->kobj, &intel_epb_attr_group);
+
+ intel_epb_save(NULL);
+ return 0;
+}
+
+static const struct x86_cpu_id intel_epb_normal[] = {
+ X86_MATCH_VFM(INTEL_ALDERLAKE_L,
+ ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
+ X86_MATCH_VFM(INTEL_ATOM_GRACEMONT,
+ ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_P,
+ ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
+ {}
+};
+
+static __init int intel_epb_init(void)
+{
+ const struct x86_cpu_id *id = x86_match_cpu(intel_epb_normal);
+ int ret;
+
+ if (!boot_cpu_has(X86_FEATURE_EPB))
+ return -ENODEV;
+
+ if (id)
+ energ_perf_values[EPB_INDEX_NORMAL] = id->driver_data;
+
+ ret = cpuhp_setup_state(CPUHP_AP_X86_INTEL_EPB_ONLINE,
+ "x86/intel/epb:online", intel_epb_online,
+ intel_epb_offline);
+ if (ret < 0)
+ goto err_out_online;
+
+ register_syscore(&intel_epb_syscore);
+ return 0;
+
+err_out_online:
+ cpuhp_remove_state(CPUHP_AP_X86_INTEL_EPB_ONLINE);
+ return ret;
+}
+late_initcall(intel_epb_init);
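As orientation for the new attribute, here is a minimal user-space sketch (not part of the patch) that reads and updates it; the path is assumed from the per-CPU device layout and the "power" attribute group defined above, and error handling is kept to the bare minimum:

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		const char *path = "/sys/devices/system/cpu/cpu0/power/energy_perf_bias";
		char buf[16] = { 0 };
		int fd = open(path, O_RDWR);

		if (fd < 0)
			return 1;

		/* The show side prints the raw EPB value (0..15). */
		if (read(fd, buf, sizeof(buf) - 1) > 0)
			printf("current EPB: %s", buf);

		/* The store side accepts a number or one of the named profiles. */
		lseek(fd, 0, SEEK_SET);
		write(fd, "balance-performance", strlen("balance-performance"));

		close(fd);
		return 0;
	}

Writing requires root, and the new value takes effect immediately on the target CPU via the wrmsrq_on_cpu() call in the store handler above.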
diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c
new file mode 100644
index 000000000000..6af1e8baeb0f
--- /dev/null
+++ b/arch/x86/kernel/cpu/match.c
@@ -0,0 +1,98 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <asm/cpu_device_id.h>
+#include <asm/cpufeature.h>
+#include <linux/cpu.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+
+/**
+ * x86_match_vendor_cpu_type - helper function to match the hardware defined
+ * cpu-type for a single entry in the x86_cpu_id
+ * table. Note, this function does not match the
+ * generic cpu-types TOPO_CPU_TYPE_EFFICIENCY and
+ * TOPO_CPU_TYPE_PERFORMANCE.
+ * @c: Pointer to the cpuinfo_x86 structure of the CPU to match.
+ * @m: Pointer to the x86_cpu_id entry to match against.
+ *
+ * Return: true if the cpu-type matches, false otherwise.
+ */
+static bool x86_match_vendor_cpu_type(struct cpuinfo_x86 *c, const struct x86_cpu_id *m)
+{
+ if (m->type == X86_CPU_TYPE_ANY)
+ return true;
+
+	/* Hybrid CPUs are special; they are assumed to match all cpu-types */
+ if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
+ return true;
+
+ if (c->x86_vendor == X86_VENDOR_INTEL)
+ return m->type == c->topo.intel_type;
+ if (c->x86_vendor == X86_VENDOR_AMD)
+ return m->type == c->topo.amd_type;
+
+ return false;
+}
+
+/**
+ * x86_match_cpu - match current CPU against an array of x86_cpu_ids
+ * @match: Pointer to array of x86_cpu_ids. Last entry terminated with
+ * {}.
+ *
+ * Return the entry if the current CPU matches the entries in the
+ * passed x86_cpu_id match table. Otherwise NULL. The match table
+ * contains vendor (X86_VENDOR_*), family, model and feature bits or
+ * respective wildcard entries.
+ *
+ * A typical table entry would be to match a specific CPU
+ *
+ * X86_MATCH_VFM_FEATURE(INTEL_BROADWELL, X86_FEATURE_ANY, NULL);
+ *
+ * Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY,
+ * %X86_MODEL_ANY, %X86_FEATURE_ANY (except for vendor)
+ *
+ * asm/cpu_device_id.h contains a set of useful macros which are shortcuts
+ * for various common selections. The above can be shortened to:
+ *
+ * X86_MATCH_VFM(INTEL_BROADWELL, NULL);
+ *
+ * Arrays used to match for this should also be declared using
+ * MODULE_DEVICE_TABLE(x86cpu, ...)
+ *
+ * This always matches against the boot cpu, assuming models and features are
+ * consistent over all CPUs.
+ */
+const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match)
+{
+ const struct x86_cpu_id *m;
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ for (m = match; m->flags & X86_CPU_ID_FLAG_ENTRY_VALID; m++) {
+ if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor)
+ continue;
+ if (m->family != X86_FAMILY_ANY && c->x86 != m->family)
+ continue;
+ if (m->model != X86_MODEL_ANY && c->x86_model != m->model)
+ continue;
+ if (m->steppings != X86_STEPPING_ANY &&
+ !(BIT(c->x86_stepping) & m->steppings))
+ continue;
+ if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature))
+ continue;
+ if (!x86_match_vendor_cpu_type(c, m))
+ continue;
+ return m;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(x86_match_cpu);
+
+bool x86_match_min_microcode_rev(const struct x86_cpu_id *table)
+{
+ const struct x86_cpu_id *res = x86_match_cpu(table);
+
+ if (!res || res->driver_data > boot_cpu_data.microcode)
+ return false;
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(x86_match_min_microcode_rev);
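A hedged usage sketch of the matching helpers above, following the pattern the kernel-doc describes; my_cpu_ids and my_driver_init are hypothetical driver-side names, while the X86_MATCH_*() macros and the MODULE_DEVICE_TABLE() usage come from <asm/cpu_device_id.h>:

	#include <linux/module.h>
	#include <asm/cpu_device_id.h>
	#include <asm/intel-family.h>

	static const struct x86_cpu_id my_cpu_ids[] = {
		X86_MATCH_VFM(INTEL_BROADWELL, NULL),
		X86_MATCH_VENDOR_FAM(AMD, 0x19, NULL),
		{}
	};
	MODULE_DEVICE_TABLE(x86cpu, my_cpu_ids);

	static int __init my_driver_init(void)
	{
		const struct x86_cpu_id *id = x86_match_cpu(my_cpu_ids);

		if (!id)
			return -ENODEV;	/* boot CPU not listed in the table */

		/* id->driver_data would carry any per-entry private pointer. */
		return 0;
	}

Because the table is only ever matched against the boot CPU, a driver that cares about per-CPU differences still has to handle them itself.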
diff --git a/arch/x86/kernel/cpu/mce/Makefile b/arch/x86/kernel/cpu/mce/Makefile
new file mode 100644
index 000000000000..015856abdbb1
--- /dev/null
+++ b/arch/x86/kernel/cpu/mce/Makefile
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-y = core.o severity.o genpool.o
+
+obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o
+obj-$(CONFIG_X86_MCE_INTEL) += intel.o
+obj-$(CONFIG_X86_MCE_AMD) += amd.o
+obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
+
+mce-inject-y := inject.o
+obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
+
+obj-$(CONFIG_ACPI_APEI) += apei.o
+
+obj-$(CONFIG_X86_MCELOG_LEGACY) += dev-mcelog.o
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
new file mode 100644
index 000000000000..3f1dda355307
--- /dev/null
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -0,0 +1,1270 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * (c) 2005-2016 Advanced Micro Devices, Inc.
+ *
+ * Written by Jacob Shin - AMD, Inc.
+ * Maintained by: Borislav Petkov <bp@alien8.de>
+ */
+#include <linux/interrupt.h>
+#include <linux/notifier.h>
+#include <linux/kobject.h>
+#include <linux/percpu.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/sysfs.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+#include <linux/string.h>
+
+#include <asm/traps.h>
+#include <asm/apic.h>
+#include <asm/mce.h>
+#include <asm/msr.h>
+#include <asm/trace/irq_vectors.h>
+
+#include "internal.h"
+
+#define NR_BLOCKS 5
+#define THRESHOLD_MAX 0xFFF
+#define INT_TYPE_APIC 0x00020000
+#define MASK_VALID_HI 0x80000000
+#define MASK_CNTP_HI 0x40000000
+#define MASK_LOCKED_HI 0x20000000
+#define MASK_LVTOFF_HI 0x00F00000
+#define MASK_COUNT_EN_HI 0x00080000
+#define MASK_INT_TYPE_HI 0x00060000
+#define MASK_OVERFLOW_HI 0x00010000
+#define MASK_ERR_COUNT_HI 0x00000FFF
+#define MASK_BLKPTR_LO 0xFF000000
+#define MCG_XBLK_ADDR 0xC0000400
+
+/* Deferred error settings */
+#define MSR_CU_DEF_ERR 0xC0000410
+#define MASK_DEF_LVTOFF 0x000000F0
+
+/* Scalable MCA: */
+
+/* Threshold LVT offset is at MSR0xC0000410[15:12] */
+#define SMCA_THR_LVT_OFF 0xF000
+
+static bool thresholding_irq_en;
+
+struct mce_amd_cpu_data {
+ mce_banks_t thr_intr_banks;
+ mce_banks_t dfr_intr_banks;
+
+ u32 thr_intr_en: 1,
+ dfr_intr_en: 1,
+ __resv: 30;
+};
+
+static DEFINE_PER_CPU_READ_MOSTLY(struct mce_amd_cpu_data, mce_amd_data);
+
+static const char * const th_names[] = {
+ "load_store",
+ "insn_fetch",
+ "combined_unit",
+ "decode_unit",
+ "northbridge",
+ "execution_unit",
+};
+
+static const char * const smca_umc_block_names[] = {
+ "dram_ecc",
+ "misc_umc"
+};
+
+#define HWID_MCATYPE(hwid, mcatype) (((hwid) << 16) | (mcatype))
+
+struct smca_hwid {
+ unsigned int bank_type; /* Use with smca_bank_types for easy indexing. */
+ u32 hwid_mcatype; /* (hwid,mcatype) tuple */
+};
+
+struct smca_bank {
+ const struct smca_hwid *hwid;
+ u32 id; /* Value of MCA_IPID[InstanceId]. */
+ u8 sysfs_id; /* Value used for sysfs name. */
+ u64 paddrv :1, /* Physical Address Valid bit in MCA_CONFIG */
+ __reserved :63;
+};
+
+static DEFINE_PER_CPU_READ_MOSTLY(struct smca_bank[MAX_NR_BANKS], smca_banks);
+static DEFINE_PER_CPU_READ_MOSTLY(u8[N_SMCA_BANK_TYPES], smca_bank_counts);
+
+static const char * const smca_names[] = {
+ [SMCA_LS ... SMCA_LS_V2] = "load_store",
+ [SMCA_IF] = "insn_fetch",
+ [SMCA_L2_CACHE] = "l2_cache",
+ [SMCA_DE] = "decode_unit",
+ [SMCA_RESERVED] = "reserved",
+ [SMCA_EX] = "execution_unit",
+ [SMCA_FP] = "floating_point",
+ [SMCA_L3_CACHE] = "l3_cache",
+ [SMCA_CS ... SMCA_CS_V2] = "coherent_slave",
+ [SMCA_PIE] = "pie",
+
+ /* UMC v2 is separate because both of them can exist in a single system. */
+ [SMCA_UMC] = "umc",
+ [SMCA_UMC_V2] = "umc_v2",
+ [SMCA_MA_LLC] = "ma_llc",
+ [SMCA_PB] = "param_block",
+ [SMCA_PSP ... SMCA_PSP_V2] = "psp",
+ [SMCA_SMU ... SMCA_SMU_V2] = "smu",
+ [SMCA_MP5] = "mp5",
+ [SMCA_MPDMA] = "mpdma",
+ [SMCA_NBIO] = "nbio",
+ [SMCA_PCIE ... SMCA_PCIE_V2] = "pcie",
+ [SMCA_XGMI_PCS] = "xgmi_pcs",
+ [SMCA_NBIF] = "nbif",
+ [SMCA_SHUB] = "shub",
+ [SMCA_SATA] = "sata",
+ [SMCA_USB] = "usb",
+ [SMCA_USR_DP] = "usr_dp",
+ [SMCA_USR_CP] = "usr_cp",
+ [SMCA_GMI_PCS] = "gmi_pcs",
+ [SMCA_XGMI_PHY] = "xgmi_phy",
+ [SMCA_WAFL_PHY] = "wafl_phy",
+ [SMCA_GMI_PHY] = "gmi_phy",
+};
+
+static const char *smca_get_name(enum smca_bank_types t)
+{
+ if (t >= N_SMCA_BANK_TYPES)
+ return NULL;
+
+ return smca_names[t];
+}
+
+enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank)
+{
+ struct smca_bank *b;
+
+ if (bank >= MAX_NR_BANKS)
+ return N_SMCA_BANK_TYPES;
+
+ b = &per_cpu(smca_banks, cpu)[bank];
+ if (!b->hwid)
+ return N_SMCA_BANK_TYPES;
+
+ return b->hwid->bank_type;
+}
+EXPORT_SYMBOL_GPL(smca_get_bank_type);
+
+static const struct smca_hwid smca_hwid_mcatypes[] = {
+ /* { bank_type, hwid_mcatype } */
+
+ /* Reserved type */
+ { SMCA_RESERVED, HWID_MCATYPE(0x00, 0x0) },
+
+ /* ZN Core (HWID=0xB0) MCA types */
+ { SMCA_LS, HWID_MCATYPE(0xB0, 0x0) },
+ { SMCA_LS_V2, HWID_MCATYPE(0xB0, 0x10) },
+ { SMCA_IF, HWID_MCATYPE(0xB0, 0x1) },
+ { SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2) },
+ { SMCA_DE, HWID_MCATYPE(0xB0, 0x3) },
+ /* HWID 0xB0 MCATYPE 0x4 is Reserved */
+ { SMCA_EX, HWID_MCATYPE(0xB0, 0x5) },
+ { SMCA_FP, HWID_MCATYPE(0xB0, 0x6) },
+ { SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7) },
+
+ /* Data Fabric MCA types */
+ { SMCA_CS, HWID_MCATYPE(0x2E, 0x0) },
+ { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1) },
+ { SMCA_CS_V2, HWID_MCATYPE(0x2E, 0x2) },
+ { SMCA_MA_LLC, HWID_MCATYPE(0x2E, 0x4) },
+
+ /* Unified Memory Controller MCA type */
+ { SMCA_UMC, HWID_MCATYPE(0x96, 0x0) },
+ { SMCA_UMC_V2, HWID_MCATYPE(0x96, 0x1) },
+
+ /* Parameter Block MCA type */
+ { SMCA_PB, HWID_MCATYPE(0x05, 0x0) },
+
+ /* Platform Security Processor MCA type */
+ { SMCA_PSP, HWID_MCATYPE(0xFF, 0x0) },
+ { SMCA_PSP_V2, HWID_MCATYPE(0xFF, 0x1) },
+
+ /* System Management Unit MCA type */
+ { SMCA_SMU, HWID_MCATYPE(0x01, 0x0) },
+ { SMCA_SMU_V2, HWID_MCATYPE(0x01, 0x1) },
+
+ /* Microprocessor 5 Unit MCA type */
+ { SMCA_MP5, HWID_MCATYPE(0x01, 0x2) },
+
+ /* MPDMA MCA type */
+ { SMCA_MPDMA, HWID_MCATYPE(0x01, 0x3) },
+
+ /* Northbridge IO Unit MCA type */
+ { SMCA_NBIO, HWID_MCATYPE(0x18, 0x0) },
+
+ /* PCI Express Unit MCA type */
+ { SMCA_PCIE, HWID_MCATYPE(0x46, 0x0) },
+ { SMCA_PCIE_V2, HWID_MCATYPE(0x46, 0x1) },
+
+ { SMCA_XGMI_PCS, HWID_MCATYPE(0x50, 0x0) },
+ { SMCA_NBIF, HWID_MCATYPE(0x6C, 0x0) },
+ { SMCA_SHUB, HWID_MCATYPE(0x80, 0x0) },
+ { SMCA_SATA, HWID_MCATYPE(0xA8, 0x0) },
+ { SMCA_USB, HWID_MCATYPE(0xAA, 0x0) },
+ { SMCA_USR_DP, HWID_MCATYPE(0x170, 0x0) },
+ { SMCA_USR_CP, HWID_MCATYPE(0x180, 0x0) },
+ { SMCA_GMI_PCS, HWID_MCATYPE(0x241, 0x0) },
+ { SMCA_XGMI_PHY, HWID_MCATYPE(0x259, 0x0) },
+ { SMCA_WAFL_PHY, HWID_MCATYPE(0x267, 0x0) },
+ { SMCA_GMI_PHY, HWID_MCATYPE(0x269, 0x0) },
+};
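For orientation, a worked example of the encoding used in this table (illustrative only): HWID_MCATYPE(0xB0, 0x1) evaluates to (0xB0 << 16) | 0x1 == 0x00B00001, the (HardwareID, McaType) pair for an instruction fetch bank; smca_configure() below rebuilds the same value from MCA_IPID so the two can be compared directly.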
+
+/*
+ * In SMCA enabled processors, we can have multiple banks for a given IP type.
+ * So to define a unique name for each bank, we use a temp c-string to append
+ * the MCA_IPID[InstanceId] to the type's name in get_name().
+ *
+ * InstanceId is 32 bits, i.e. at most 8 hex characters. Make sure that
+ * MAX_MCATYPE_NAME_LEN is greater than 8 plus 1 (for the underscore) plus the
+ * length of the longest type name.
+ */
+#define MAX_MCATYPE_NAME_LEN 30
+static char buf_mcatype[MAX_MCATYPE_NAME_LEN];
+
+struct threshold_block {
+ /* This block's number within its bank. */
+ unsigned int block;
+ /* MCA bank number that contains this block. */
+ unsigned int bank;
+ /* CPU which controls this block's MCA bank. */
+ unsigned int cpu;
+ /* MCA_MISC MSR address for this block. */
+ u32 address;
+ /* Enable/Disable APIC interrupt. */
+ bool interrupt_enable;
+ /* Bank can generate an interrupt. */
+ bool interrupt_capable;
+ /* Value upon which threshold interrupt is generated. */
+ u16 threshold_limit;
+ /* sysfs object */
+ struct kobject kobj;
+ /* List of threshold blocks within this block's MCA bank. */
+ struct list_head miscj;
+};
+
+struct threshold_bank {
+ struct kobject *kobj;
+ /* List of threshold blocks within this MCA bank. */
+ struct list_head miscj;
+};
+
+static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
+
+/*
+ * A list of the banks enabled on each logical CPU. Controls which respective
+ * descriptors to initialize later in mce_threshold_create_device().
+ */
+static DEFINE_PER_CPU(u64, bank_map);
+
+static void amd_threshold_interrupt(void);
+static void amd_deferred_error_interrupt(void);
+
+static void default_deferred_error_interrupt(void)
+{
+ pr_err("Unexpected deferred interrupt at vector %x\n", DEFERRED_ERROR_VECTOR);
+}
+void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt;
+
+static void smca_configure(unsigned int bank, unsigned int cpu)
+{
+ struct mce_amd_cpu_data *data = this_cpu_ptr(&mce_amd_data);
+ u8 *bank_counts = this_cpu_ptr(smca_bank_counts);
+ const struct smca_hwid *s_hwid;
+ unsigned int i, hwid_mcatype;
+ u32 high, low;
+ u32 smca_config = MSR_AMD64_SMCA_MCx_CONFIG(bank);
+
+ /* Set appropriate bits in MCA_CONFIG */
+ if (!rdmsr_safe(smca_config, &low, &high)) {
+ /*
+ * OS is required to set the MCAX bit to acknowledge that it is
+ * now using the new MSR ranges and new registers under each
+ * bank. It also means that the OS will configure deferred
+ * errors in the new MCx_CONFIG register. If the bit is not set,
+ * uncorrectable errors will cause a system panic.
+ *
+	 * MCA_CONFIG[MCAX] is bit 32 (0 in the high portion of the MSR).
+ */
+ high |= BIT(0);
+
+ /*
+ * SMCA sets the Deferred Error Interrupt type per bank.
+ *
+ * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us
+ * if the DeferredIntType bit field is available.
+ *
+ * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the
+ * high portion of the MSR). OS should set this to 0x1 to enable
+ * APIC based interrupt. First, check that no interrupt has been
+ * set.
+ */
+ if ((low & BIT(5)) && !((high >> 5) & 0x3) && data->dfr_intr_en) {
+ __set_bit(bank, data->dfr_intr_banks);
+ high |= BIT(5);
+ }
+
+ /*
+ * SMCA Corrected Error Interrupt
+ *
+ * MCA_CONFIG[IntPresent] is bit 10, and tells us if the bank can
+ * send an MCA Thresholding interrupt without the OS initializing
+ * this feature. This can be used if the threshold limit is managed
+ * by the platform.
+ *
+ * MCA_CONFIG[IntEn] is bit 40 (8 in the high portion of the MSR).
+ * The OS should set this to inform the platform that the OS is ready
+ * to handle the MCA Thresholding interrupt.
+ */
+ if ((low & BIT(10)) && data->thr_intr_en) {
+ __set_bit(bank, data->thr_intr_banks);
+ high |= BIT(8);
+ }
+
+ this_cpu_ptr(mce_banks_array)[bank].lsb_in_status = !!(low & BIT(8));
+
+ if (low & MCI_CONFIG_PADDRV)
+ this_cpu_ptr(smca_banks)[bank].paddrv = 1;
+
+ wrmsr(smca_config, low, high);
+ }
+
+ if (rdmsr_safe(MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) {
+ pr_warn("Failed to read MCA_IPID for bank %d\n", bank);
+ return;
+ }
+
+ hwid_mcatype = HWID_MCATYPE(high & MCI_IPID_HWID,
+ (high & MCI_IPID_MCATYPE) >> 16);
+
+ for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
+ s_hwid = &smca_hwid_mcatypes[i];
+
+ if (hwid_mcatype == s_hwid->hwid_mcatype) {
+ this_cpu_ptr(smca_banks)[bank].hwid = s_hwid;
+ this_cpu_ptr(smca_banks)[bank].id = low;
+ this_cpu_ptr(smca_banks)[bank].sysfs_id = bank_counts[s_hwid->bank_type]++;
+ break;
+ }
+ }
+}
+
+struct thresh_restart {
+ struct threshold_block *b;
+ int set_lvt_off;
+ int lvt_off;
+ u16 old_limit;
+};
+
+static const char *bank4_names(const struct threshold_block *b)
+{
+ switch (b->address) {
+ /* MSR4_MISC0 */
+ case 0x00000413:
+ return "dram";
+
+ case 0xc0000408:
+ return "ht_links";
+
+ case 0xc0000409:
+ return "l3_cache";
+
+ default:
+ WARN(1, "Funny MSR: 0x%08x\n", b->address);
+ return "";
+ }
+}
+
+
+static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits)
+{
+ /*
+ * bank 4 supports APIC LVT interrupts implicitly since forever.
+ */
+ if (bank == 4)
+ return true;
+
+ /*
+ * IntP: interrupt present; if this bit is set, the thresholding
+ * bank can generate APIC LVT interrupts
+ */
+ return msr_high_bits & BIT(28);
+}
+
+static bool lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
+{
+ int msr = (hi & MASK_LVTOFF_HI) >> 20;
+
+ /*
+ * On SMCA CPUs, LVT offset is programmed at a different MSR, and
+ * the BIOS provides the value. The original field where LVT offset
+ * was set is reserved. Return early here:
+ */
+ if (mce_flags.smca)
+ return false;
+
+ if (apic < 0) {
+ pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt "
+ "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu,
+ b->bank, b->block, b->address, hi, lo);
+ return false;
+ }
+
+ if (apic != msr) {
+ pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d "
+ "for bank %d, block %d (MSR%08X=0x%x%08x)\n",
+ b->cpu, apic, b->bank, b->block, b->address, hi, lo);
+ return false;
+ }
+
+ return true;
+}
+
+/* Reprogram MCx_MISC MSR behind this threshold block. */
+static void threshold_restart_block(void *_tr)
+{
+ struct thresh_restart *tr = _tr;
+ u32 hi, lo;
+
+ /* sysfs write might race against an offline operation */
+ if (!this_cpu_read(threshold_banks) && !tr->set_lvt_off)
+ return;
+
+ rdmsr(tr->b->address, lo, hi);
+
+ /*
+ * Reset error count and overflow bit.
+ * This is done during init or after handling an interrupt.
+ */
+ if (hi & MASK_OVERFLOW_HI || tr->set_lvt_off) {
+ hi &= ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI);
+ hi |= THRESHOLD_MAX - tr->b->threshold_limit;
+ } else if (tr->old_limit) { /* change limit w/o reset */
+ int new_count = (hi & THRESHOLD_MAX) +
+ (tr->old_limit - tr->b->threshold_limit);
+
+ hi = (hi & ~MASK_ERR_COUNT_HI) |
+ (new_count & THRESHOLD_MAX);
+ }
+
+ /* clear IntType */
+ hi &= ~MASK_INT_TYPE_HI;
+
+ if (!tr->b->interrupt_capable)
+ goto done;
+
+ if (tr->set_lvt_off) {
+ if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) {
+ /* set new lvt offset */
+ hi &= ~MASK_LVTOFF_HI;
+ hi |= tr->lvt_off << 20;
+ }
+ }
+
+ if (tr->b->interrupt_enable)
+ hi |= INT_TYPE_APIC;
+
+ done:
+
+ hi |= MASK_COUNT_EN_HI;
+ wrmsr(tr->b->address, lo, hi);
+}
+
+static void threshold_restart_bank(unsigned int bank, bool intr_en)
+{
+ struct threshold_bank **thr_banks = this_cpu_read(threshold_banks);
+ struct threshold_block *block, *tmp;
+ struct thresh_restart tr;
+
+ if (!thr_banks || !thr_banks[bank])
+ return;
+
+ memset(&tr, 0, sizeof(tr));
+
+ list_for_each_entry_safe(block, tmp, &thr_banks[bank]->miscj, miscj) {
+ tr.b = block;
+ tr.b->interrupt_enable = intr_en;
+ threshold_restart_block(&tr);
+ }
+}
+
+/* Try to use the threshold limit reported through APEI. */
+static u16 get_thr_limit(void)
+{
+ u32 thr_limit = mce_get_apei_thr_limit();
+
+ /* Fallback to old default if APEI limit is not available. */
+ if (!thr_limit)
+ return THRESHOLD_MAX;
+
+ return min(thr_limit, THRESHOLD_MAX);
+}
+
+static void mce_threshold_block_init(struct threshold_block *b, int offset)
+{
+ struct thresh_restart tr = {
+ .b = b,
+ .set_lvt_off = 1,
+ .lvt_off = offset,
+ };
+
+ b->threshold_limit = get_thr_limit();
+ threshold_restart_block(&tr);
+}
+
+static int setup_APIC_mce_threshold(int reserved, int new)
+{
+ if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR,
+ APIC_EILVT_MSG_FIX, 0))
+ return new;
+
+ return reserved;
+}
+
+static u32 get_block_address(u32 current_addr, u32 low, u32 high,
+ unsigned int bank, unsigned int block,
+ unsigned int cpu)
+{
+ u32 addr = 0, offset = 0;
+
+ if ((bank >= per_cpu(mce_num_banks, cpu)) || (block >= NR_BLOCKS))
+ return addr;
+
+ if (mce_flags.smca) {
+ if (!block)
+ return MSR_AMD64_SMCA_MCx_MISC(bank);
+
+ if (!(low & MASK_BLKPTR_LO))
+ return 0;
+
+ return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1);
+ }
+
+ /* Fall back to method we used for older processors: */
+ switch (block) {
+ case 0:
+ addr = mca_msr_reg(bank, MCA_MISC);
+ break;
+ case 1:
+ offset = ((low & MASK_BLKPTR_LO) >> 21);
+ if (offset)
+ addr = MCG_XBLK_ADDR + offset;
+ break;
+ default:
+ addr = ++current_addr;
+ }
+ return addr;
+}
+
+static int prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
+ int offset, u32 misc_high)
+{
+ unsigned int cpu = smp_processor_id();
+ struct threshold_block b;
+ int new;
+
+ if (!block)
+ per_cpu(bank_map, cpu) |= BIT_ULL(bank);
+
+ memset(&b, 0, sizeof(b));
+ b.cpu = cpu;
+ b.bank = bank;
+ b.block = block;
+ b.address = addr;
+ b.interrupt_capable = lvt_interrupt_supported(bank, misc_high);
+
+ if (!b.interrupt_capable)
+ goto done;
+
+ __set_bit(bank, this_cpu_ptr(&mce_amd_data)->thr_intr_banks);
+ b.interrupt_enable = 1;
+
+ if (mce_flags.smca)
+ goto done;
+
+ new = (misc_high & MASK_LVTOFF_HI) >> 20;
+ offset = setup_APIC_mce_threshold(offset, new);
+ if (offset == new)
+ thresholding_irq_en = true;
+
+done:
+ mce_threshold_block_init(&b, offset);
+
+ return offset;
+}
+
+bool amd_filter_mce(struct mce *m)
+{
+ enum smca_bank_types bank_type = smca_get_bank_type(m->extcpu, m->bank);
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ /* See Family 17h Models 10h-2Fh Erratum #1114. */
+ if (c->x86 == 0x17 &&
+ c->x86_model >= 0x10 && c->x86_model <= 0x2F &&
+ bank_type == SMCA_IF && XEC(m->status, 0x3f) == 10)
+ return true;
+
+ /* NB GART TLB error reporting is disabled by default. */
+ if (c->x86 < 0x17) {
+ if (m->bank == 4 && XEC(m->status, 0x1f) == 0x5)
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Turn off thresholding banks for the following conditions:
+ * - MC4_MISC thresholding is not supported on Family 0x15.
+ * - Prevent possible spurious interrupts from the IF bank on Family 0x17
+ * Models 0x10-0x2F due to Erratum #1114.
+ */
+static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank)
+{
+ int i, num_msrs;
+ u64 hwcr;
+ bool need_toggle;
+ u32 msrs[NR_BLOCKS];
+
+ if (c->x86 == 0x15 && bank == 4) {
+ msrs[0] = 0x00000413; /* MC4_MISC0 */
+ msrs[1] = 0xc0000408; /* MC4_MISC1 */
+ num_msrs = 2;
+ } else if (c->x86 == 0x17 &&
+ (c->x86_model >= 0x10 && c->x86_model <= 0x2F)) {
+
+ if (smca_get_bank_type(smp_processor_id(), bank) != SMCA_IF)
+ return;
+
+ msrs[0] = MSR_AMD64_SMCA_MCx_MISC(bank);
+ num_msrs = 1;
+ } else {
+ return;
+ }
+
+ rdmsrq(MSR_K7_HWCR, hwcr);
+
+ /* McStatusWrEn has to be set */
+ need_toggle = !(hwcr & BIT(18));
+ if (need_toggle)
+ wrmsrq(MSR_K7_HWCR, hwcr | BIT(18));
+
+ /* Clear CntP bit safely */
+ for (i = 0; i < num_msrs; i++)
+ msr_clear_bit(msrs[i], 62);
+
+ /* restore old settings */
+ if (need_toggle)
+ wrmsrq(MSR_K7_HWCR, hwcr);
+}
+
+static void amd_apply_cpu_quirks(struct cpuinfo_x86 *c)
+{
+ struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
+
+ /* This should be disabled by the BIOS, but isn't always */
+ if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) {
+ /*
+ * disable GART TBL walk error reporting, which
+ * trips off incorrectly with the IOMMU & 3ware
+ * & Cerberus:
+ */
+ clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
+ }
+
+ /*
+ * Various K7s with broken bank 0 around. Always disable
+ * by default.
+ */
+ if (c->x86 == 6 && this_cpu_read(mce_num_banks))
+ mce_banks[0].ctl = 0;
+}
+
+/*
+ * Enable the APIC LVT interrupt vectors once per-CPU. This should be done before hardware is
+ * ready to send interrupts.
+ *
+ * Individual error sources are enabled later during per-bank init.
+ */
+static void smca_enable_interrupt_vectors(void)
+{
+ struct mce_amd_cpu_data *data = this_cpu_ptr(&mce_amd_data);
+ u64 mca_intr_cfg, offset;
+
+ if (!mce_flags.smca || !mce_flags.succor)
+ return;
+
+ if (rdmsrq_safe(MSR_CU_DEF_ERR, &mca_intr_cfg))
+ return;
+
+ offset = (mca_intr_cfg & SMCA_THR_LVT_OFF) >> 12;
+ if (!setup_APIC_eilvt(offset, THRESHOLD_APIC_VECTOR, APIC_EILVT_MSG_FIX, 0))
+ data->thr_intr_en = 1;
+
+ offset = (mca_intr_cfg & MASK_DEF_LVTOFF) >> 4;
+ if (!setup_APIC_eilvt(offset, DEFERRED_ERROR_VECTOR, APIC_EILVT_MSG_FIX, 0))
+ data->dfr_intr_en = 1;
+}
+
+/* cpu init entry point, called from mce.c with preempt off */
+void mce_amd_feature_init(struct cpuinfo_x86 *c)
+{
+ unsigned int bank, block, cpu = smp_processor_id();
+ u32 low = 0, high = 0, address = 0;
+ int offset = -1;
+
+ amd_apply_cpu_quirks(c);
+
+ mce_flags.amd_threshold = 1;
+
+ smca_enable_interrupt_vectors();
+
+ for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) {
+ if (mce_flags.smca) {
+ smca_configure(bank, cpu);
+
+ if (!this_cpu_ptr(&mce_amd_data)->thr_intr_en)
+ continue;
+ }
+
+ disable_err_thresholding(c, bank);
+
+ for (block = 0; block < NR_BLOCKS; ++block) {
+ address = get_block_address(address, low, high, bank, block, cpu);
+ if (!address)
+ break;
+
+ if (rdmsr_safe(address, &low, &high))
+ break;
+
+ if (!(high & MASK_VALID_HI))
+ continue;
+
+ if (!(high & MASK_CNTP_HI) ||
+ (high & MASK_LOCKED_HI))
+ continue;
+
+ offset = prepare_threshold_block(bank, block, address, offset, high);
+ }
+ }
+}
+
+void smca_bsp_init(void)
+{
+ mce_threshold_vector = amd_threshold_interrupt;
+ deferred_error_int_vector = amd_deferred_error_interrupt;
+}
+
+/*
+ * DRAM ECC errors are reported in the Northbridge (bank 4) with
+ * Extended Error Code 8.
+ */
+static bool legacy_mce_is_memory_error(struct mce *m)
+{
+ return m->bank == 4 && XEC(m->status, 0x1f) == 8;
+}
+
+/*
+ * DRAM ECC errors are reported in Unified Memory Controllers with
+ * Extended Error Code 0.
+ */
+static bool smca_mce_is_memory_error(struct mce *m)
+{
+ enum smca_bank_types bank_type;
+
+ if (XEC(m->status, 0x3f))
+ return false;
+
+ bank_type = smca_get_bank_type(m->extcpu, m->bank);
+
+ return bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2;
+}
+
+bool amd_mce_is_memory_error(struct mce *m)
+{
+ if (mce_flags.smca)
+ return smca_mce_is_memory_error(m);
+ else
+ return legacy_mce_is_memory_error(m);
+}
+
+/*
+ * Some AMD systems have an explicit indicator that the value in MCA_ADDR is a
+ * system physical address. Individual cases, though, need to be detected for
+ * other systems. Future cases will be added as needed.
+ *
+ * 1) General case
+ * a) Assume address is not usable.
+ * 2) Poison errors
+ * a) Indicated by MCA_STATUS[43]: poison. Defined for all banks except legacy
+ * northbridge (bank 4).
+ * b) Refers to poison consumption in the core. Does not include "no action",
+ * "action optional", or "deferred" error severities.
+ * c) Will include a usable address so that immediate action can be taken.
+ * 3) Northbridge DRAM ECC errors
+ * a) Reported in legacy bank 4 with extended error code (XEC) 8.
+ * b) MCA_STATUS[43] is *not* defined as poison in legacy bank 4. Therefore,
+ * this bit should not be checked.
+ * 4) MCI_STATUS_PADDRVAL is set
+ * a) Will provide a valid system physical address.
+ *
+ * NOTE: SMCA UMC memory errors fall into case #1.
+ */
+bool amd_mce_usable_address(struct mce *m)
+{
+ /* Check special northbridge case 3) first. */
+ if (!mce_flags.smca) {
+ if (legacy_mce_is_memory_error(m))
+ return true;
+ else if (m->bank == 4)
+ return false;
+ }
+
+ if (this_cpu_ptr(smca_banks)[m->bank].paddrv)
+ return m->status & MCI_STATUS_PADDRV;
+
+ /* Check poison bit for all other bank types. */
+ if (m->status & MCI_STATUS_POISON)
+ return true;
+
+ /* Assume address is not usable for all others. */
+ return false;
+}
+
+DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error)
+{
+ trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR);
+ inc_irq_stat(irq_deferred_error_count);
+ deferred_error_int_vector();
+ trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR);
+ apic_eoi();
+}
+
+/* APIC interrupt handler for deferred errors */
+static void amd_deferred_error_interrupt(void)
+{
+ machine_check_poll(MCP_TIMESTAMP, &this_cpu_ptr(&mce_amd_data)->dfr_intr_banks);
+}
+
+void mce_amd_handle_storm(unsigned int bank, bool on)
+{
+ threshold_restart_bank(bank, on);
+}
+
+static void amd_reset_thr_limit(unsigned int bank)
+{
+ threshold_restart_bank(bank, true);
+}
+
+/*
+ * Threshold interrupt handler will service THRESHOLD_APIC_VECTOR. The interrupt
+ * goes off when error_count reaches threshold_limit.
+ */
+static void amd_threshold_interrupt(void)
+{
+ machine_check_poll(MCP_TIMESTAMP, &this_cpu_ptr(&mce_amd_data)->thr_intr_banks);
+}
+
+void amd_clear_bank(struct mce *m)
+{
+ amd_reset_thr_limit(m->bank);
+
+ /* Clear MCA_DESTAT for all deferred errors even those logged in MCA_STATUS. */
+ if (m->status & MCI_STATUS_DEFERRED)
+ mce_wrmsrq(MSR_AMD64_SMCA_MCx_DESTAT(m->bank), 0);
+
+ /* Don't clear MCA_STATUS if MCA_DESTAT was used exclusively. */
+ if (m->kflags & MCE_CHECK_DFR_REGS)
+ return;
+
+ mce_wrmsrq(mca_msr_reg(m->bank, MCA_STATUS), 0);
+}
+
+/*
+ * Sysfs Interface
+ */
+
+struct threshold_attr {
+ struct attribute attr;
+ ssize_t (*show) (struct threshold_block *, char *);
+ ssize_t (*store) (struct threshold_block *, const char *, size_t count);
+};
+
+#define SHOW_FIELDS(name) \
+static ssize_t show_ ## name(struct threshold_block *b, char *buf) \
+{ \
+ return sprintf(buf, "%lu\n", (unsigned long) b->name); \
+}
+SHOW_FIELDS(interrupt_enable)
+SHOW_FIELDS(threshold_limit)
+
+static ssize_t
+store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
+{
+ struct thresh_restart tr;
+ unsigned long new;
+
+ if (!b->interrupt_capable)
+ return -EINVAL;
+
+ if (kstrtoul(buf, 0, &new) < 0)
+ return -EINVAL;
+
+ b->interrupt_enable = !!new;
+
+ memset(&tr, 0, sizeof(tr));
+ tr.b = b;
+
+ if (smp_call_function_single(b->cpu, threshold_restart_block, &tr, 1))
+ return -ENODEV;
+
+ return size;
+}
+
+static ssize_t
+store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
+{
+ struct thresh_restart tr;
+ unsigned long new;
+
+ if (kstrtoul(buf, 0, &new) < 0)
+ return -EINVAL;
+
+ if (new > THRESHOLD_MAX)
+ new = THRESHOLD_MAX;
+ if (new < 1)
+ new = 1;
+
+ memset(&tr, 0, sizeof(tr));
+ tr.old_limit = b->threshold_limit;
+ b->threshold_limit = new;
+ tr.b = b;
+
+ if (smp_call_function_single(b->cpu, threshold_restart_block, &tr, 1))
+ return -ENODEV;
+
+ return size;
+}
+
+static ssize_t show_error_count(struct threshold_block *b, char *buf)
+{
+ u32 lo, hi;
+
+ /* CPU might be offline by now */
+ if (rdmsr_on_cpu(b->cpu, b->address, &lo, &hi))
+ return -ENODEV;
+
+ return sprintf(buf, "%u\n", ((hi & THRESHOLD_MAX) -
+ (THRESHOLD_MAX - b->threshold_limit)));
+}
+
+static struct threshold_attr error_count = {
+ .attr = {.name = __stringify(error_count), .mode = 0444 },
+ .show = show_error_count,
+};
+
+#define RW_ATTR(val) \
+static struct threshold_attr val = { \
+ .attr = {.name = __stringify(val), .mode = 0644 }, \
+ .show = show_## val, \
+ .store = store_## val, \
+};
+
+RW_ATTR(interrupt_enable);
+RW_ATTR(threshold_limit);
+
+static struct attribute *default_attrs[] = {
+ &threshold_limit.attr,
+ &error_count.attr,
+ NULL, /* possibly interrupt_enable if supported, see below */
+ NULL,
+};
+ATTRIBUTE_GROUPS(default);
+
+#define to_block(k) container_of(k, struct threshold_block, kobj)
+#define to_attr(a) container_of(a, struct threshold_attr, attr)
+
+static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+ struct threshold_block *b = to_block(kobj);
+ struct threshold_attr *a = to_attr(attr);
+ ssize_t ret;
+
+ ret = a->show ? a->show(b, buf) : -EIO;
+
+ return ret;
+}
+
+static ssize_t store(struct kobject *kobj, struct attribute *attr,
+ const char *buf, size_t count)
+{
+ struct threshold_block *b = to_block(kobj);
+ struct threshold_attr *a = to_attr(attr);
+ ssize_t ret;
+
+ ret = a->store ? a->store(b, buf, count) : -EIO;
+
+ return ret;
+}
+
+static const struct sysfs_ops threshold_ops = {
+ .show = show,
+ .store = store,
+};
+
+static void threshold_block_release(struct kobject *kobj);
+
+static const struct kobj_type threshold_ktype = {
+ .sysfs_ops = &threshold_ops,
+ .default_groups = default_groups,
+ .release = threshold_block_release,
+};
+
+static const char *get_name(unsigned int cpu, unsigned int bank, struct threshold_block *b)
+{
+ enum smca_bank_types bank_type;
+
+ if (!mce_flags.smca) {
+ if (b && bank == 4)
+ return bank4_names(b);
+
+ return th_names[bank];
+ }
+
+ bank_type = smca_get_bank_type(cpu, bank);
+
+ if (b && (bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2)) {
+ if (b->block < ARRAY_SIZE(smca_umc_block_names))
+ return smca_umc_block_names[b->block];
+ }
+
+ if (b && b->block) {
+ snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN, "th_block_%u", b->block);
+ return buf_mcatype;
+ }
+
+ if (bank_type >= N_SMCA_BANK_TYPES) {
+ snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN, "th_bank_%u", bank);
+ return buf_mcatype;
+ }
+
+ if (per_cpu(smca_bank_counts, cpu)[bank_type] == 1)
+ return smca_get_name(bank_type);
+
+ snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN,
+ "%s_%u", smca_get_name(bank_type),
+ per_cpu(smca_banks, cpu)[bank].sysfs_id);
+ return buf_mcatype;
+}
+
+static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb,
+ unsigned int bank, unsigned int block,
+ u32 address)
+{
+ struct threshold_block *b = NULL;
+ u32 low, high;
+ int err;
+
+ if ((bank >= this_cpu_read(mce_num_banks)) || (block >= NR_BLOCKS))
+ return 0;
+
+ if (rdmsr_safe(address, &low, &high))
+ return 0;
+
+ if (!(high & MASK_VALID_HI)) {
+ if (block)
+ goto recurse;
+ else
+ return 0;
+ }
+
+ if (!(high & MASK_CNTP_HI) ||
+ (high & MASK_LOCKED_HI))
+ goto recurse;
+
+ b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL);
+ if (!b)
+ return -ENOMEM;
+
+ b->block = block;
+ b->bank = bank;
+ b->cpu = cpu;
+ b->address = address;
+ b->interrupt_enable = 0;
+ b->interrupt_capable = lvt_interrupt_supported(bank, high);
+ b->threshold_limit = get_thr_limit();
+
+ if (b->interrupt_capable) {
+ default_attrs[2] = &interrupt_enable.attr;
+ b->interrupt_enable = 1;
+ } else {
+ default_attrs[2] = NULL;
+ }
+
+ list_add(&b->miscj, &tb->miscj);
+
+ mce_threshold_block_init(b, (high & MASK_LVTOFF_HI) >> 20);
+
+ err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(cpu, bank, b));
+ if (err)
+ goto out_free;
+recurse:
+ address = get_block_address(address, low, high, bank, ++block, cpu);
+ if (!address)
+ return 0;
+
+ err = allocate_threshold_blocks(cpu, tb, bank, block, address);
+ if (err)
+ goto out_free;
+
+ if (b)
+ kobject_uevent(&b->kobj, KOBJ_ADD);
+
+ return 0;
+
+out_free:
+ if (b) {
+ list_del(&b->miscj);
+ kobject_put(&b->kobj);
+ }
+ return err;
+}
+
+static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu,
+ unsigned int bank)
+{
+ struct device *dev = this_cpu_read(mce_device);
+ struct threshold_bank *b = NULL;
+ const char *name = get_name(cpu, bank, NULL);
+ int err = 0;
+
+ if (!dev)
+ return -ENODEV;
+
+ b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
+ if (!b) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /* Associate the bank with the per-CPU MCE device */
+ b->kobj = kobject_create_and_add(name, &dev->kobj);
+ if (!b->kobj) {
+ err = -EINVAL;
+ goto out_free;
+ }
+
+ INIT_LIST_HEAD(&b->miscj);
+
+ err = allocate_threshold_blocks(cpu, b, bank, 0, mca_msr_reg(bank, MCA_MISC));
+ if (err)
+ goto out_kobj;
+
+ bp[bank] = b;
+ return 0;
+
+out_kobj:
+ kobject_put(b->kobj);
+out_free:
+ kfree(b);
+out:
+ return err;
+}
+
+static void threshold_block_release(struct kobject *kobj)
+{
+ kfree(to_block(kobj));
+}
+
+static void threshold_remove_bank(struct threshold_bank *bank)
+{
+ struct threshold_block *pos, *tmp;
+
+ list_for_each_entry_safe(pos, tmp, &bank->miscj, miscj) {
+ list_del(&pos->miscj);
+ kobject_put(&pos->kobj);
+ }
+
+ kobject_put(bank->kobj);
+ kfree(bank);
+}
+
+static void __threshold_remove_device(struct threshold_bank **bp)
+{
+ unsigned int bank, numbanks = this_cpu_read(mce_num_banks);
+
+ for (bank = 0; bank < numbanks; bank++) {
+ if (!bp[bank])
+ continue;
+
+ threshold_remove_bank(bp[bank]);
+ bp[bank] = NULL;
+ }
+ kfree(bp);
+}
+
+void mce_threshold_remove_device(unsigned int cpu)
+{
+ struct threshold_bank **bp = this_cpu_read(threshold_banks);
+
+ if (!bp)
+ return;
+
+ /*
+	 * Clear the pointer before cleaning up, so that the interrupt handler
+	 * won't touch any of the data being freed.
+ */
+ this_cpu_write(threshold_banks, NULL);
+
+ __threshold_remove_device(bp);
+ return;
+}
+
+/**
+ * mce_threshold_create_device - Create the per-CPU MCE threshold device
+ * @cpu: The plugged in CPU
+ *
+ * Create directories and files for all valid threshold banks.
+ *
+ * This is invoked from the CPU hotplug callback which was installed in
+ * mcheck_init_device(). The invocation happens in context of the hotplug
+ * thread running on @cpu. The callback is invoked on all CPUs which are
+ * online when the callback is installed or during a real hotplug event.
+ */
+void mce_threshold_create_device(unsigned int cpu)
+{
+ unsigned int numbanks, bank;
+ struct threshold_bank **bp;
+
+ if (!mce_flags.amd_threshold)
+ return;
+
+ bp = this_cpu_read(threshold_banks);
+ if (bp)
+ return;
+
+ numbanks = this_cpu_read(mce_num_banks);
+ bp = kcalloc(numbanks, sizeof(*bp), GFP_KERNEL);
+ if (!bp)
+ return;
+
+ for (bank = 0; bank < numbanks; ++bank) {
+ if (!(this_cpu_read(bank_map) & BIT_ULL(bank)))
+ continue;
+ if (threshold_create_bank(bp, cpu, bank)) {
+ __threshold_remove_device(bp);
+ return;
+ }
+ }
+ this_cpu_write(threshold_banks, bp);
+
+ if (thresholding_irq_en)
+ mce_threshold_vector = amd_threshold_interrupt;
+ return;
+}
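A small sketch (not part of the patch) of the error_count arithmetic used by show_error_count() above: threshold_restart_block() arms the hardware counter at (THRESHOLD_MAX - threshold_limit) and the interrupt fires once error_count reaches threshold_limit, so the value reported through sysfs is the raw counter minus that starting offset:

	/*
	 * E.g. threshold_limit == 10: the counter is armed at 0xFF5; after
	 * three corrected errors the raw MSR field reads 0xFF8 and this
	 * returns 3.
	 */
	static unsigned int visible_error_count(u32 raw_hi, u16 threshold_limit)
	{
		return (raw_hi & THRESHOLD_MAX) - (THRESHOLD_MAX - threshold_limit);
	}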
diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c
new file mode 100644
index 000000000000..0a89947e47bc
--- /dev/null
+++ b/arch/x86/kernel/cpu/mce/apei.c
@@ -0,0 +1,265 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Bridge between MCE and APEI
+ *
+ * On some machines, corrected memory errors are reported via the APEI
+ * generic hardware error source (GHES) instead of a corrected Machine
+ * Check. These corrected memory errors can be reported to user space
+ * through /dev/mcelog by faking a corrected Machine Check, so that
+ * the affected memory page can be offlined by /sbin/mcelog if the error
+ * count for one page is beyond the threshold.
+ *
+ * For fatal MCE, save MCE record into persistent storage via ERST, so
+ * that the MCE record can be logged after reboot via ERST.
+ *
+ * Copyright 2010 Intel Corp.
+ * Author: Huang Ying <ying.huang@intel.com>
+ */
+
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/acpi.h>
+#include <linux/cper.h>
+#include <acpi/apei.h>
+#include <acpi/ghes.h>
+#include <asm/mce.h>
+
+#include "internal.h"
+
+void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
+{
+ struct mce_hw_err err;
+ struct mce *m;
+ int lsb;
+
+ if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
+ return;
+
+ /*
+ * Even if the ->validation_bits are set for address mask,
+ * to be extra safe, check and reject an error radius '0',
+ * and fall back to the default page size.
+ */
+ if (mem_err->validation_bits & CPER_MEM_VALID_PA_MASK)
+ lsb = find_first_bit((void *)&mem_err->physical_addr_mask, PAGE_SHIFT);
+ else
+ lsb = PAGE_SHIFT;
+
+ mce_prep_record(&err);
+ m = &err.m;
+ m->bank = -1;
+ /* Fake a memory read error with unknown channel */
+ m->status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f;
+ m->misc = (MCI_MISC_ADDR_PHYS << 6) | lsb;
+
+ if (severity >= GHES_SEV_RECOVERABLE)
+ m->status |= MCI_STATUS_UC;
+
+ if (severity >= GHES_SEV_PANIC) {
+ m->status |= MCI_STATUS_PCC;
+ m->tsc = rdtsc();
+ }
+
+ m->addr = mem_err->physical_addr;
+ mce_log(&err);
+}
+EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);
+
+int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id)
+{
+ const u64 *i_mce = ((const u64 *) (ctx_info + 1));
+ unsigned int cpu, num_regs;
+ bool apicid_found = false;
+ struct mce_hw_err err;
+ struct mce *m;
+
+ if (!boot_cpu_has(X86_FEATURE_SMCA))
+ return -EINVAL;
+
+ /*
+ * The starting address of the register array extracted from BERT must
+	 * match the first expected register in the register layout of the
+	 * SMCA address space. This address corresponds to the bank's MCA_STATUS
+ * register.
+ *
+ * Match any MCi_STATUS register by turning off bank numbers.
+ */
+ if ((ctx_info->msr_addr & MSR_AMD64_SMCA_MC0_STATUS) !=
+ MSR_AMD64_SMCA_MC0_STATUS)
+ return -EINVAL;
+
+ /*
+ * The number of registers in the register array is determined by
+ * Register Array Size/8 as defined in UEFI spec v2.8, sec N.2.4.2.2.
+	 * Sanity-check the register array size.
+ */
+ num_regs = ctx_info->reg_arr_size >> 3;
+ if (!num_regs)
+ return -EINVAL;
+
+ for_each_possible_cpu(cpu) {
+ if (cpu_data(cpu).topo.initial_apicid == lapic_id) {
+ apicid_found = true;
+ break;
+ }
+ }
+
+ if (!apicid_found)
+ return -EINVAL;
+
+ m = &err.m;
+ memset(&err, 0, sizeof(struct mce_hw_err));
+ mce_prep_record_common(m);
+ mce_prep_record_per_cpu(cpu, m);
+
+ m->bank = (ctx_info->msr_addr >> 4) & 0xFF;
+
+ /*
+ * The SMCA register layout is fixed and includes 16 registers.
+ * The end of the array may be variable, but the beginning is known.
+ * Cap the number of registers to expected max (15).
+ */
+ if (num_regs > 15)
+ num_regs = 15;
+
+ switch (num_regs) {
+ /* MCA_SYND2 */
+ case 15:
+ err.vendor.amd.synd2 = *(i_mce + 14);
+ fallthrough;
+ /* MCA_SYND1 */
+ case 14:
+ err.vendor.amd.synd1 = *(i_mce + 13);
+ fallthrough;
+ /* MCA_MISC4 */
+ case 13:
+ /* MCA_MISC3 */
+ case 12:
+ /* MCA_MISC2 */
+ case 11:
+ /* MCA_MISC1 */
+ case 10:
+ /* MCA_DEADDR */
+ case 9:
+ /* MCA_DESTAT */
+ case 8:
+ /* reserved */
+ case 7:
+ /* MCA_SYND */
+ case 6:
+ m->synd = *(i_mce + 5);
+ fallthrough;
+ /* MCA_IPID */
+ case 5:
+ m->ipid = *(i_mce + 4);
+ fallthrough;
+ /* MCA_CONFIG */
+ case 4:
+ /* MCA_MISC0 */
+ case 3:
+ m->misc = *(i_mce + 2);
+ fallthrough;
+ /* MCA_ADDR */
+ case 2:
+ m->addr = *(i_mce + 1);
+ fallthrough;
+ /* MCA_STATUS */
+ case 1:
+ m->status = *i_mce;
+ }
+
+ mce_log(&err);
+
+ return 0;
+}
+
+#define CPER_CREATOR_MCE \
+ GUID_INIT(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \
+ 0x64, 0x90, 0xb8, 0x9d)
+#define CPER_SECTION_TYPE_MCE \
+ GUID_INIT(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96, \
+ 0x04, 0x4a, 0x38, 0xfc)
+
+/*
+ * CPER specification (in UEFI specification 2.3 appendix N) requires
+ * byte-packed.
+ */
+struct cper_mce_record {
+ struct cper_record_header hdr;
+ struct cper_section_descriptor sec_hdr;
+ struct mce mce;
+} __packed;
+
+int apei_write_mce(struct mce *m)
+{
+ struct cper_mce_record rcd;
+
+ memset(&rcd, 0, sizeof(rcd));
+ memcpy(rcd.hdr.signature, CPER_SIG_RECORD, CPER_SIG_SIZE);
+ rcd.hdr.revision = CPER_RECORD_REV;
+ rcd.hdr.signature_end = CPER_SIG_END;
+ rcd.hdr.section_count = 1;
+ rcd.hdr.error_severity = CPER_SEV_FATAL;
+ /* timestamp, platform_id, partition_id are all invalid */
+ rcd.hdr.validation_bits = 0;
+ rcd.hdr.record_length = sizeof(rcd);
+ rcd.hdr.creator_id = CPER_CREATOR_MCE;
+ rcd.hdr.notification_type = CPER_NOTIFY_MCE;
+ rcd.hdr.record_id = cper_next_record_id();
+ rcd.hdr.flags = CPER_HW_ERROR_FLAGS_PREVERR;
+
+ rcd.sec_hdr.section_offset = (void *)&rcd.mce - (void *)&rcd;
+ rcd.sec_hdr.section_length = sizeof(rcd.mce);
+ rcd.sec_hdr.revision = CPER_SEC_REV;
+	/* fru_id and fru_text are invalid */
+ rcd.sec_hdr.validation_bits = 0;
+ rcd.sec_hdr.flags = CPER_SEC_PRIMARY;
+ rcd.sec_hdr.section_type = CPER_SECTION_TYPE_MCE;
+ rcd.sec_hdr.section_severity = CPER_SEV_FATAL;
+
+ memcpy(&rcd.mce, m, sizeof(*m));
+
+ return erst_write(&rcd.hdr);
+}
+
+ssize_t apei_read_mce(struct mce *m, u64 *record_id)
+{
+ struct cper_mce_record rcd;
+ int rc, pos;
+
+ rc = erst_get_record_id_begin(&pos);
+ if (rc)
+ return rc;
+retry:
+ rc = erst_get_record_id_next(&pos, record_id);
+ if (rc)
+ goto out;
+	/* no more records */
+ if (*record_id == APEI_ERST_INVALID_RECORD_ID)
+ goto out;
+ rc = erst_read_record(*record_id, &rcd.hdr, sizeof(rcd), sizeof(rcd),
+ &CPER_CREATOR_MCE);
+ /* someone else has cleared the record, try next one */
+ if (rc == -ENOENT)
+ goto retry;
+ else if (rc < 0)
+ goto out;
+
+ memcpy(m, &rcd.mce, sizeof(*m));
+ rc = sizeof(*m);
+out:
+ erst_get_record_id_end();
+
+ return rc;
+}
+
+/* Check whether there are any records in ERST */
+int apei_check_mce(void)
+{
+ return erst_get_record_count();
+}
+
+int apei_clear_mce(u64 record_id)
+{
+ return erst_clear(record_id);
+}
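A hedged sketch of how the ERST helpers above are meant to be consumed after a reboot (in-tree, the /dev/mcelog code plays this role); drain_saved_mce_records() is a hypothetical name and the loop only illustrates the read/clear protocol:

	static void drain_saved_mce_records(void)
	{
		u64 record_id;
		struct mce m;

		while (apei_check_mce() > 0) {
			if (apei_read_mce(&m, &record_id) <= 0)
				break;

			pr_info("MCE from previous boot: bank %d, status 0x%llx\n",
				m.bank, m.status);

			/* Free the ERST slot once the record has been handled. */
			apei_clear_mce(record_id);
		}
	}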
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
new file mode 100644
index 000000000000..34440021e8cf
--- /dev/null
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -0,0 +1,2970 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Machine check handler.
+ *
+ * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ * Rest from unknown author(s).
+ * 2004 Andi Kleen. Rewrote most of it.
+ * Copyright 2008 Intel Corporation
+ * Author: Andi Kleen
+ */
+
+#include <linux/thread_info.h>
+#include <linux/capability.h>
+#include <linux/miscdevice.h>
+#include <linux/ratelimit.h>
+#include <linux/rcupdate.h>
+#include <linux/kobject.h>
+#include <linux/uaccess.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/string.h>
+#include <linux/device.h>
+#include <linux/syscore_ops.h>
+#include <linux/delay.h>
+#include <linux/ctype.h>
+#include <linux/sched.h>
+#include <linux/sysfs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/kmod.h>
+#include <linux/poll.h>
+#include <linux/nmi.h>
+#include <linux/cpu.h>
+#include <linux/ras.h>
+#include <linux/smp.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/debugfs.h>
+#include <linux/irq_work.h>
+#include <linux/export.h>
+#include <linux/set_memory.h>
+#include <linux/sync_core.h>
+#include <linux/task_work.h>
+#include <linux/hardirq.h>
+#include <linux/kexec.h>
+#include <linux/vmcore_info.h>
+
+#include <asm/fred.h>
+#include <asm/cpu_device_id.h>
+#include <asm/processor.h>
+#include <asm/traps.h>
+#include <asm/tlbflush.h>
+#include <asm/mce.h>
+#include <asm/msr.h>
+#include <asm/reboot.h>
+#include <asm/tdx.h>
+
+#include "internal.h"
+
+/* sysfs synchronization */
+static DEFINE_MUTEX(mce_sysfs_mutex);
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/mce.h>
+
+#define SPINUNIT 100 /* 100ns */
+
+DEFINE_PER_CPU(unsigned, mce_exception_count);
+
+DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks);
+
+DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);
+
+#define ATTR_LEN 16
+/* One object for each MCE bank, shared by all CPUs */
+struct mce_bank_dev {
+ struct device_attribute attr; /* device attribute */
+ char attrname[ATTR_LEN]; /* attribute name */
+ u8 bank; /* bank number */
+};
+static struct mce_bank_dev mce_bank_devs[MAX_NR_BANKS];
+
+struct mce_vendor_flags mce_flags __read_mostly;
+
+struct mca_config mca_cfg __read_mostly = {
+ .bootlog = -1,
+ .monarch_timeout = -1
+};
+
+static DEFINE_PER_CPU(struct mce_hw_err, hw_errs_seen);
+static unsigned long mce_need_notify;
+
+/*
+ * MCA banks polled by the periodic polling timer for corrected events.
+ * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
+ */
+DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
+ [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
+};
+
+/*
+ * MCA banks controlled through firmware first for corrected errors.
+ * This is a global list of banks for which we won't enable CMCI and we
+ * won't poll. Firmware controls these banks and is responsible for
+ * reporting corrected errors through GHES. Uncorrected/recoverable
+ * errors are still notified through a machine check.
+ */
+mce_banks_t mce_banks_ce_disabled;
+
+static struct work_struct mce_work;
+static struct irq_work mce_irq_work;
+
+/*
+ * CPU/chipset specific EDAC code can register a notifier call here to print
+ * MCE errors in a human-readable form.
+ */
+BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
+
+void mce_prep_record_common(struct mce *m)
+{
+ m->cpuid = cpuid_eax(1);
+ m->cpuvendor = boot_cpu_data.x86_vendor;
+ m->mcgcap = native_rdmsrq(MSR_IA32_MCG_CAP);
+ /* need the internal __ version to avoid deadlocks */
+ m->time = __ktime_get_real_seconds();
+}
+
+void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m)
+{
+ m->cpu = cpu;
+ m->extcpu = cpu;
+ m->apicid = cpu_data(cpu).topo.initial_apicid;
+ m->microcode = cpu_data(cpu).microcode;
+ m->ppin = topology_ppin(cpu);
+ m->socketid = topology_physical_package_id(cpu);
+}
+
+/* Do initial initialization of struct mce_hw_err */
+void mce_prep_record(struct mce_hw_err *err)
+{
+ struct mce *m = &err->m;
+
+ memset(err, 0, sizeof(struct mce_hw_err));
+ mce_prep_record_common(m);
+ mce_prep_record_per_cpu(smp_processor_id(), m);
+}
+
+DEFINE_PER_CPU(struct mce, injectm);
+EXPORT_PER_CPU_SYMBOL_GPL(injectm);
+
+void mce_log(struct mce_hw_err *err)
+{
+ if (mce_gen_pool_add(err))
+ irq_work_queue(&mce_irq_work);
+}
+EXPORT_SYMBOL_GPL(mce_log);
+
+void mce_register_decode_chain(struct notifier_block *nb)
+{
+ if (WARN_ON(nb->priority < MCE_PRIO_LOWEST ||
+ nb->priority > MCE_PRIO_HIGHEST))
+ return;
+
+ blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
+}
+EXPORT_SYMBOL_GPL(mce_register_decode_chain);
+
+void mce_unregister_decode_chain(struct notifier_block *nb)
+{
+ blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
+}
+EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
+
+static void __print_mce(struct mce_hw_err *err)
+{
+ struct mce *m = &err->m;
+
+ pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
+ m->extcpu,
+ (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
+ m->mcgstatus, m->bank, m->status);
+
+ if (m->ip) {
+ pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
+ !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
+ m->cs, m->ip);
+
+ if (m->cs == __KERNEL_CS)
+ pr_cont("{%pS}", (void *)(unsigned long)m->ip);
+ pr_cont("\n");
+ }
+
+ pr_emerg(HW_ERR "TSC %llx ", m->tsc);
+ if (m->addr)
+ pr_cont("ADDR %llx ", m->addr);
+ if (m->misc)
+ pr_cont("MISC %llx ", m->misc);
+ if (m->ppin)
+ pr_cont("PPIN %llx ", m->ppin);
+
+ if (mce_flags.smca) {
+ if (m->synd)
+ pr_cont("SYND %llx ", m->synd);
+ if (err->vendor.amd.synd1)
+ pr_cont("SYND1 %llx ", err->vendor.amd.synd1);
+ if (err->vendor.amd.synd2)
+ pr_cont("SYND2 %llx ", err->vendor.amd.synd2);
+ if (m->ipid)
+ pr_cont("IPID %llx ", m->ipid);
+ }
+
+ pr_cont("\n");
+
+ /*
+ * Note this output is parsed by external tools and old fields
+ * should not be changed.
+ */
+ pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
+ m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
+ m->microcode);
+}
+
+static void print_mce(struct mce_hw_err *err)
+{
+ struct mce *m = &err->m;
+
+ __print_mce(err);
+
+ if (m->cpuvendor != X86_VENDOR_AMD && m->cpuvendor != X86_VENDOR_HYGON)
+ pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
+}
+
+#define PANIC_TIMEOUT 5 /* 5 seconds */
+
+static atomic_t mce_panicked;
+
+static int fake_panic;
+static atomic_t mce_fake_panicked;
+
+/* Panic in progress. Enable interrupts and wait for final IPI */
+static void wait_for_panic(void)
+{
+ long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
+
+ preempt_disable();
+ local_irq_enable();
+ while (timeout-- > 0)
+ udelay(1);
+ if (panic_timeout == 0)
+ panic_timeout = mca_cfg.panic_timeout;
+ panic("Panicing machine check CPU died");
+}
+
+static const char *mce_dump_aux_info(struct mce *m)
+{
+ if (boot_cpu_has_bug(X86_BUG_TDX_PW_MCE))
+ return tdx_dump_mce_info(m);
+
+ return NULL;
+}
+
+static noinstr void mce_panic(const char *msg, struct mce_hw_err *final, char *exp)
+{
+ struct llist_node *pending;
+ struct mce_evt_llist *l;
+ int apei_err = 0;
+ const char *memmsg;
+
+ /*
+ * Allow instrumentation around external facilities usage. Not that it
+ * matters a whole lot since the machine is going to panic anyway.
+ */
+ instrumentation_begin();
+
+ if (!fake_panic) {
+ /*
+ * Make sure only one CPU runs in machine check panic
+ */
+ if (atomic_inc_return(&mce_panicked) > 1)
+ wait_for_panic();
+ barrier();
+
+ bust_spinlocks(1);
+ console_verbose();
+ } else {
+ /* Don't log too much for fake panic */
+ if (atomic_inc_return(&mce_fake_panicked) > 1)
+ goto out;
+ }
+ pending = mce_gen_pool_prepare_records();
+ /* First print corrected ones that are still unlogged */
+ llist_for_each_entry(l, pending, llnode) {
+ struct mce_hw_err *err = &l->err;
+ struct mce *m = &err->m;
+ if (!(m->status & MCI_STATUS_UC)) {
+ print_mce(err);
+ if (!apei_err)
+ apei_err = apei_write_mce(m);
+ }
+ }
+ /* Now print uncorrected but with the final one last */
+ llist_for_each_entry(l, pending, llnode) {
+ struct mce_hw_err *err = &l->err;
+ struct mce *m = &err->m;
+ if (!(m->status & MCI_STATUS_UC))
+ continue;
+ if (!final || mce_cmp(m, &final->m)) {
+ print_mce(err);
+ if (!apei_err)
+ apei_err = apei_write_mce(m);
+ }
+ }
+ if (final) {
+ print_mce(final);
+ if (!apei_err)
+ apei_err = apei_write_mce(&final->m);
+ }
+ if (exp)
+ pr_emerg(HW_ERR "Machine check: %s\n", exp);
+
+ memmsg = mce_dump_aux_info(&final->m);
+ if (memmsg)
+ pr_emerg(HW_ERR "Machine check: %s\n", memmsg);
+
+ if (!fake_panic) {
+ if (panic_timeout == 0)
+ panic_timeout = mca_cfg.panic_timeout;
+
+ /*
+ * Kdump skips the poisoned page in order to avoid
+ * touching the error bits again. Poison the page even
+ * if the error is fatal and the machine is about to
+ * panic.
+ */
+ if (kexec_crash_loaded()) {
+ if (final && (final->m.status & MCI_STATUS_ADDRV)) {
+ struct page *p;
+ p = pfn_to_online_page(final->m.addr >> PAGE_SHIFT);
+ if (p)
+ SetPageHWPoison(p);
+ }
+ }
+ panic(msg);
+ } else
+ pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
+
+out:
+ instrumentation_end();
+}
+
+/* Support code for software error injection */
+
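+/* Map an MCA MSR to the offset of the corresponding field in struct mce. */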
+static int msr_to_offset(u32 msr)
+{
+ unsigned bank = __this_cpu_read(injectm.bank);
+
+ if (msr == mca_cfg.rip_msr)
+ return offsetof(struct mce, ip);
+ if (msr == mca_msr_reg(bank, MCA_STATUS))
+ return offsetof(struct mce, status);
+ if (msr == mca_msr_reg(bank, MCA_ADDR))
+ return offsetof(struct mce, addr);
+ if (msr == mca_msr_reg(bank, MCA_MISC))
+ return offsetof(struct mce, misc);
+ if (msr == MSR_IA32_MCG_STATUS)
+ return offsetof(struct mce, mcgstatus);
+ return -1;
+}
+
+void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr)
+{
+ if (wrmsr) {
+ pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n",
+ (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax,
+ regs->ip, (void *)regs->ip);
+ } else {
+ pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
+ (unsigned int)regs->cx, regs->ip, (void *)regs->ip);
+ }
+
+ show_stack_regs(regs);
+
+ panic("MCA architectural violation!\n");
+
+ while (true)
+ cpu_relax();
+}
+
+/* MSR access wrappers used for error injection */
+noinstr u64 mce_rdmsrq(u32 msr)
+{
+ EAX_EDX_DECLARE_ARGS(val, low, high);
+
+ if (__this_cpu_read(injectm.finished)) {
+ int offset;
+ u64 ret;
+
+ instrumentation_begin();
+
+ offset = msr_to_offset(msr);
+ if (offset < 0)
+ ret = 0;
+ else
+ ret = *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
+
+ instrumentation_end();
+
+ return ret;
+ }
+
+ /*
+ * RDMSR on MCA MSRs should not fault. If they do, this is very much an
+ * architectural violation and needs to be reported to the hardware vendor.
+ * Panic the box to not allow any further progress.
+ */
+ asm volatile("1: rdmsr\n"
+ "2:\n"
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR_IN_MCE)
+ : EAX_EDX_RET(val, low, high) : "c" (msr));
+
+ return EAX_EDX_VAL(val, low, high);
+}
+
+noinstr void mce_wrmsrq(u32 msr, u64 v)
+{
+ u32 low, high;
+
+ if (__this_cpu_read(injectm.finished)) {
+ int offset;
+
+ instrumentation_begin();
+
+ offset = msr_to_offset(msr);
+ if (offset >= 0)
+ *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;
+
+ instrumentation_end();
+
+ return;
+ }
+
+ low = (u32)v;
+ high = (u32)(v >> 32);
+
+ /* See comment in mce_rdmsrq() */
+ asm volatile("1: wrmsr\n"
+ "2:\n"
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR_IN_MCE)
+ : : "c" (msr), "a"(low), "d" (high) : "memory");
+}
+
+/*
+ * Collect all global (w.r.t. this processor) status about this machine
+ * check into our "mce" struct so that we can use it later to assess
+ * the severity of the problem as we read per-bank specific details.
+ */
+static noinstr void mce_gather_info(struct mce_hw_err *err, struct pt_regs *regs)
+{
+ struct mce *m;
+ /*
+ * Enable instrumentation around mce_prep_record() which calls external
+ * facilities.
+ */
+ instrumentation_begin();
+ mce_prep_record(err);
+ instrumentation_end();
+
+ m = &err->m;
+ m->mcgstatus = mce_rdmsrq(MSR_IA32_MCG_STATUS);
+ if (regs) {
+ /*
+ * Get the address of the instruction at the time of
+ * the machine check error.
+ */
+ if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
+ m->ip = regs->ip;
+ m->cs = regs->cs;
+
+ /*
+ * When in VM86 mode make the cs look like ring 3
+ * always. This is a lie, but it's better than passing
+ * the additional vm86 bit around everywhere.
+ */
+ if (v8086_mode(regs))
+ m->cs |= 3;
+ }
+ /* Use accurate RIP reporting if available. */
+ if (mca_cfg.rip_msr)
+ m->ip = mce_rdmsrq(mca_cfg.rip_msr);
+ }
+}
+
+bool mce_available(struct cpuinfo_x86 *c)
+{
+ if (mca_cfg.disabled)
+ return false;
+ return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
+}
+
+static void mce_schedule_work(void)
+{
+ if (!mce_gen_pool_empty())
+ schedule_work(&mce_work);
+}
+
+static void mce_irq_work_cb(struct irq_work *entry)
+{
+ mce_schedule_work();
+}
+
+bool mce_usable_address(struct mce *m)
+{
+ if (!(m->status & MCI_STATUS_ADDRV))
+ return false;
+
+ switch (m->cpuvendor) {
+ case X86_VENDOR_AMD:
+ return amd_mce_usable_address(m);
+
+ case X86_VENDOR_INTEL:
+ case X86_VENDOR_ZHAOXIN:
+ return intel_mce_usable_address(m);
+
+ default:
+ return true;
+ }
+}
+EXPORT_SYMBOL_GPL(mce_usable_address);
+
+bool mce_is_memory_error(struct mce *m)
+{
+ switch (m->cpuvendor) {
+ case X86_VENDOR_AMD:
+ case X86_VENDOR_HYGON:
+ return amd_mce_is_memory_error(m);
+
+ case X86_VENDOR_INTEL:
+ case X86_VENDOR_ZHAOXIN:
+ /*
+ * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
+ *
+ * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
+ * indicating a memory error. Bit 8 is used for indicating a
+ * cache hierarchy error. The combination of bit 2 and bit 3
+ * is used for indicating a `generic' cache hierarchy error.
+ * But we can't just blindly check the above bits, because if
+ * bit 11 is set, then it is a bus/interconnect error - and
+ * either way the above bits just give more detail on what
+ * bus/interconnect error happened. Note that bit 12 can be
+ * ignored, as it's the "filter" bit.
+ */
+ return (m->status & 0xef80) == BIT(7) ||
+ (m->status & 0xef00) == BIT(8) ||
+ (m->status & 0xeffc) == 0xc;
+
+ default:
+ return false;
+ }
+}
+EXPORT_SYMBOL_GPL(mce_is_memory_error);
+
+static bool whole_page(struct mce *m)
+{
+ if (!mca_cfg.ser || !(m->status & MCI_STATUS_MISCV))
+ return true;
+
+ return MCI_MISC_ADDR_LSB(m->misc) >= PAGE_SHIFT;
+}
+
+bool mce_is_correctable(struct mce *m)
+{
+ if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
+ return false;
+
+ if (m->cpuvendor == X86_VENDOR_HYGON && m->status & MCI_STATUS_DEFERRED)
+ return false;
+
+ if (m->status & MCI_STATUS_UC)
+ return false;
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(mce_is_correctable);
+
+/*
+ * Notify the user(s) about new machine check events.
+ * Can be called from interrupt context, but not from machine check/NMI
+ * context.
+ */
+static bool mce_notify_irq(void)
+{
+ /* Not more than two messages every minute */
+ static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
+
+ if (test_and_clear_bit(0, &mce_need_notify)) {
+ mce_work_trigger();
+
+ if (__ratelimit(&ratelimit))
+ pr_info(HW_ERR "Machine check events logged\n");
+
+ return true;
+ }
+
+ return false;
+}
+
+static int mce_early_notifier(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct mce_hw_err *err = to_mce_hw_err(data);
+
+ if (!err)
+ return NOTIFY_DONE;
+
+ /* Emit the trace record: */
+ trace_mce_record(err);
+
+ set_bit(0, &mce_need_notify);
+
+ mce_notify_irq();
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block early_nb = {
+ .notifier_call = mce_early_notifier,
+ .priority = MCE_PRIO_EARLY,
+};
+
+static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct mce *mce = (struct mce *)data;
+ unsigned long pfn;
+
+ if (!mce || !mce_usable_address(mce))
+ return NOTIFY_DONE;
+
+ if (mce->severity != MCE_AO_SEVERITY &&
+ mce->severity != MCE_DEFERRED_SEVERITY)
+ return NOTIFY_DONE;
+
+ pfn = (mce->addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT;
+ if (!memory_failure(pfn, 0)) {
+ set_mce_nospec(pfn);
+ mce->kflags |= MCE_HANDLED_UC;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block mce_uc_nb = {
+ .notifier_call = uc_decode_notifier,
+ .priority = MCE_PRIO_UC,
+};
+
+static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct mce_hw_err *err = to_mce_hw_err(data);
+
+ if (!err)
+ return NOTIFY_DONE;
+
+ if (mca_cfg.print_all || !(err->m.kflags))
+ __print_mce(err);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block mce_default_nb = {
+ .notifier_call = mce_default_notifier,
+ /* lowest prio, we want it to run last. */
+ .priority = MCE_PRIO_LOWEST,
+};
+
+/*
+ * Read ADDR and MISC registers.
+ */
+static noinstr void mce_read_aux(struct mce_hw_err *err, int i)
+{
+ struct mce *m = &err->m;
+
+ if (m->status & MCI_STATUS_MISCV)
+ m->misc = mce_rdmsrq(mca_msr_reg(i, MCA_MISC));
+
+ if (m->status & MCI_STATUS_ADDRV) {
+ if (m->kflags & MCE_CHECK_DFR_REGS)
+ m->addr = mce_rdmsrq(MSR_AMD64_SMCA_MCx_DEADDR(i));
+ else
+ m->addr = mce_rdmsrq(mca_msr_reg(i, MCA_ADDR));
+
+ /*
+ * Mask the reported address by the reported granularity.
+ */
+ if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
+ u8 shift = MCI_MISC_ADDR_LSB(m->misc);
+ m->addr >>= shift;
+ m->addr <<= shift;
+ }
+
+ smca_extract_err_addr(m);
+ }
+
+ if (mce_flags.smca) {
+ m->ipid = mce_rdmsrq(MSR_AMD64_SMCA_MCx_IPID(i));
+
+ if (m->status & MCI_STATUS_SYNDV) {
+ m->synd = mce_rdmsrq(MSR_AMD64_SMCA_MCx_SYND(i));
+ err->vendor.amd.synd1 = mce_rdmsrq(MSR_AMD64_SMCA_MCx_SYND1(i));
+ err->vendor.amd.synd2 = mce_rdmsrq(MSR_AMD64_SMCA_MCx_SYND2(i));
+ }
+ }
+}
+
+DEFINE_PER_CPU(unsigned, mce_poll_count);
+
+/*
+ * We have three scenarios for checking for Deferred errors:
+ *
+ * 1) Non-SMCA systems check MCA_STATUS and log error if found.
+ * 2) SMCA systems check MCA_STATUS. If error is found then log it and also
+ * clear MCA_DESTAT.
+ * 3) SMCA systems check MCA_DESTAT, if error was not found in MCA_STATUS, and
+ * log it.
+ */
+static bool smca_should_log_poll_error(struct mce *m)
+{
+ if (m->status & MCI_STATUS_VAL)
+ return true;
+
+ m->status = mce_rdmsrq(MSR_AMD64_SMCA_MCx_DESTAT(m->bank));
+ if ((m->status & MCI_STATUS_VAL) && (m->status & MCI_STATUS_DEFERRED)) {
+ m->kflags |= MCE_CHECK_DFR_REGS;
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Newer Intel systems that support software error
+ * recovery need to make additional checks. Other
+ * CPUs should skip over uncorrected errors, but log
+ * everything else.
+ */
+static bool ser_should_log_poll_error(struct mce *m)
+{
+ /* Log "not enabled" (speculative) errors */
+ if (!(m->status & MCI_STATUS_EN))
+ return true;
+
+ /*
+ * Log UCNA (SDM: 15.6.3 "UCR Error Classification")
+ * UC == 1 && PCC == 0 && S == 0
+ */
+ if (!(m->status & MCI_STATUS_PCC) && !(m->status & MCI_STATUS_S))
+ return true;
+
+ return false;
+}
+
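+/* Decide whether an error found while polling should be logged at all. */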
+static bool should_log_poll_error(enum mcp_flags flags, struct mce_hw_err *err)
+{
+ struct mce *m = &err->m;
+
+ if (mce_flags.smca)
+ return smca_should_log_poll_error(m);
+
+ /* If this entry is not valid, ignore it. */
+ if (!(m->status & MCI_STATUS_VAL))
+ return false;
+
+ /*
+ * If we are logging everything (at CPU online) or this
+ * is a corrected error, then we must log it.
+ */
+ if ((flags & MCP_UC) || !(m->status & MCI_STATUS_UC))
+ return true;
+
+ if (mca_cfg.ser)
+ return ser_should_log_poll_error(m);
+
+ if (m->status & MCI_STATUS_UC)
+ return false;
+
+ return true;
+}
+
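+/* Clear the bank's status register; AMD systems need vendor-specific handling. */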
+static void clear_bank(struct mce *m)
+{
+ if (m->cpuvendor == X86_VENDOR_AMD)
+ return amd_clear_bank(m);
+
+ mce_wrmsrq(mca_msr_reg(m->bank, MCA_STATUS), 0);
+}
+
+/*
+ * Poll for corrected events or events that happened before reset.
+ * Those are just logged through /dev/mcelog.
+ *
+ * This is executed in standard interrupt context.
+ *
+ * Note: the spec recommends panicking for fatal unsignalled
+ * errors here. However this would be quite problematic --
+ * we would need to reimplement the Monarch handling and
+ * it would mess up the exclusion between the exception handler
+ * and the poll handler -- so we skip this for now.
+ * These cases should not happen anyway, or only when the CPU
+ * is already totally confused. In this case it's likely it will
+ * not fully execute the machine check handler either.
+ */
+void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
+{
+ struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
+ struct mce_hw_err err;
+ struct mce *m;
+ int i;
+
+ this_cpu_inc(mce_poll_count);
+
+ mce_gather_info(&err, NULL);
+ m = &err.m;
+
+ if (flags & MCP_TIMESTAMP)
+ m->tsc = rdtsc();
+
+ for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
+ if (!mce_banks[i].ctl || !test_bit(i, *b))
+ continue;
+
+ m->misc = 0;
+ m->addr = 0;
+ m->bank = i;
+
+ barrier();
+ m->status = mce_rdmsrq(mca_msr_reg(i, MCA_STATUS));
+
+ /*
+ * Update storm tracking here, before checking for the
+ * MCI_STATUS_VAL bit. Valid corrected errors count
+ * towards declaring, or maintaining, storm status. No
+ * error in a bank counts towards avoiding, or ending,
+ * storm status.
+ */
+ if (!mca_cfg.cmci_disabled)
+ mce_track_storm(m);
+
+ /* Verify that the error should be logged based on hardware conditions. */
+ if (!should_log_poll_error(flags, &err))
+ continue;
+
+ mce_read_aux(&err, i);
+ m->severity = mce_severity(m, NULL, NULL, false);
+ /*
+ * Don't get the IP here because it's unlikely to
+ * have anything to do with the actual error location.
+ */
+
+ if (mca_cfg.dont_log_ce && !mce_usable_address(m))
+ goto clear_it;
+
+ if (flags & MCP_QUEUE_LOG)
+ mce_gen_pool_add(&err);
+ else
+ mce_log(&err);
+
+clear_it:
+ clear_bank(m);
+ }
+
+ /*
+ * Don't clear MCG_STATUS here because it's only defined for
+ * exceptions.
+ */
+
+ sync_core();
+}
+EXPORT_SYMBOL_GPL(machine_check_poll);
+
+/*
+ * During IFU recovery, Sandy Bridge-EP 4S processors set the RIPV and
+ * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
+ * Vol 3B Table 15-20). But this confuses both the code that determines
+ * whether the machine check occurred in kernel or user mode, and also
+ * the severity assessment code. Pretend that EIPV was set, and take the
+ * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
+ */
+static __always_inline void
+quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
+{
+ if (bank != 0)
+ return;
+ if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
+ return;
+ if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
+ MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
+ MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
+ MCACOD)) !=
+ (MCI_STATUS_UC|MCI_STATUS_EN|
+ MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
+ MCI_STATUS_AR|MCACOD_INSTR))
+ return;
+
+ m->mcgstatus |= MCG_STATUS_EIPV;
+ m->ip = regs->ip;
+ m->cs = regs->cs;
+}
+
+/*
+ * Disable fast string copy and return from the MCE handler upon the first SRAR
+ * MCE on bank 1 due to a CPU erratum on Intel Skylake/Cascade Lake/Cooper Lake
+ * CPUs.
+ * The fast string copy instructions ("REP; MOVS*") could consume an
+ * uncorrectable memory error in the cache line _right after_ the desired region
+ * to copy and raise an MCE with RIP pointing to the instruction _after_ the
+ * "REP; MOVS*".
+ * This mitigation addresses the issue completely with the caveat of performance
+ * degradation on the CPU affected. This is still better than the OS crashing on
+ * MCEs raised on an irrelevant process due to "REP; MOVS*" accesses from a
+ * kernel context (e.g., copy_page).
+ *
+ * Returns true when fast string copy on CPU has been disabled.
+ */
+static noinstr bool quirk_skylake_repmov(void)
+{
+ u64 mcgstatus = mce_rdmsrq(MSR_IA32_MCG_STATUS);
+ u64 misc_enable = mce_rdmsrq(MSR_IA32_MISC_ENABLE);
+ u64 mc1_status;
+
+ /*
+ * Apply the quirk only to local machine checks, i.e., no broadcast
+ * sync is needed.
+ */
+ if (!(mcgstatus & MCG_STATUS_LMCES) ||
+ !(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING))
+ return false;
+
+ mc1_status = mce_rdmsrq(MSR_IA32_MCx_STATUS(1));
+
+ /* Check for a software-recoverable data fetch error. */
+ if ((mc1_status &
+ (MCI_STATUS_VAL | MCI_STATUS_OVER | MCI_STATUS_UC | MCI_STATUS_EN |
+ MCI_STATUS_ADDRV | MCI_STATUS_MISCV | MCI_STATUS_PCC |
+ MCI_STATUS_AR | MCI_STATUS_S)) ==
+ (MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
+ MCI_STATUS_ADDRV | MCI_STATUS_MISCV |
+ MCI_STATUS_AR | MCI_STATUS_S)) {
+ misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
+ mce_wrmsrq(MSR_IA32_MISC_ENABLE, misc_enable);
+ mce_wrmsrq(MSR_IA32_MCx_STATUS(1), 0);
+
+ instrumentation_begin();
+ pr_err_once("Erratum detected, disable fast string copy instructions.\n");
+ instrumentation_end();
+
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Some Zen-based Instruction Fetch Units set EIPV=RIPV=0 on poison consumption
+ * errors. This means mce_gather_info() will not save the "ip" and "cs" registers.
+ *
+ * However, the context is still valid, so save the "cs" register for later use.
+ *
+ * The "ip" register is truly unknown, so don't save it or fixup EIPV/RIPV.
+ *
+ * The Instruction Fetch Unit is at MCA bank 1 for all affected systems.
+ */
+static __always_inline void quirk_zen_ifu(int bank, struct mce *m, struct pt_regs *regs)
+{
+ if (bank != 1)
+ return;
+ if (!(m->status & MCI_STATUS_POISON))
+ return;
+
+ m->cs = regs->cs;
+}
+
+/*
+ * Do a quick check if any of the events requires a panic.
+ * This decides if we keep the events around or clear them.
+ */
+static __always_inline int mce_no_way_out(struct mce_hw_err *err, char **msg, unsigned long *validp,
+ struct pt_regs *regs)
+{
+ struct mce *m = &err->m;
+ char *tmp = *msg;
+ int i;
+
+ for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
+ m->status = mce_rdmsrq(mca_msr_reg(i, MCA_STATUS));
+ if (!(m->status & MCI_STATUS_VAL))
+ continue;
+
+ arch___set_bit(i, validp);
+ if (mce_flags.snb_ifu_quirk)
+ quirk_sandybridge_ifu(i, m, regs);
+
+ if (mce_flags.zen_ifu_quirk)
+ quirk_zen_ifu(i, m, regs);
+
+ m->bank = i;
+ if (mce_severity(m, regs, &tmp, true) >= MCE_PANIC_SEVERITY) {
+ mce_read_aux(err, i);
+ *msg = tmp;
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Variable to establish order between CPUs while scanning.
+ * Each CPU spins initially until mce_executing equals its number.
+ */
+static atomic_t mce_executing;
+
+/*
+ * Defines order of CPUs on entry. First CPU becomes Monarch.
+ */
+static atomic_t mce_callin;
+
+/*
+ * Track which CPUs entered the MCA broadcast synchronization and which did
+ * not, in order to print holdouts.
+ */
+static cpumask_t mce_missing_cpus = CPU_MASK_ALL;
+
+/*
+ * Check if a timeout waiting for other CPUs happened.
+ */
+static noinstr int mce_timed_out(u64 *t, const char *msg)
+{
+ int ret = 0;
+
+ /* Enable instrumentation around calls to external facilities */
+ instrumentation_begin();
+
+ /*
+ * The others already did panic for some reason.
+ * Bail out like in a timeout.
+ * rmb() to tell the compiler that system_state
+ * might have been modified by someone else.
+ */
+ rmb();
+ if (atomic_read(&mce_panicked))
+ wait_for_panic();
+ if (!mca_cfg.monarch_timeout)
+ goto out;
+ if ((s64)*t < SPINUNIT) {
+ if (cpumask_and(&mce_missing_cpus, cpu_online_mask, &mce_missing_cpus))
+ pr_emerg("CPUs not responding to MCE broadcast (may include false positives): %*pbl\n",
+ cpumask_pr_args(&mce_missing_cpus));
+ mce_panic(msg, NULL, NULL);
+
+ ret = 1;
+ goto out;
+ }
+ *t -= SPINUNIT;
+
+out:
+ touch_nmi_watchdog();
+
+ instrumentation_end();
+
+ return ret;
+}
+
+/*
+ * The Monarch's reign. The Monarch is the CPU who entered
+ * the machine check handler first. It waits for the others to
+ * raise the exception too and then grades them. When any
+ * error is fatal, panic. Only then let the others continue.
+ *
+ * The other CPUs entering the MCE handler will be controlled by the
+ * Monarch. They are called Subjects.
+ *
+ * This way we prevent any potential data corruption in an unrecoverable case
+ * and also make sure that all CPUs' errors are examined.
+ *
+ * Also this detects the case of a machine check event coming from outer
+ * space (not detected by any CPU). In this case some external agent wants
+ * us to shut down, so panic too.
+ *
+ * The other CPUs might still decide to panic if the handler happens
+ * in an unrecoverable place, but in this case the system is in a semi-stable
+ * state and won't corrupt anything by itself. It's ok to let the others
+ * continue for a bit first.
+ *
+ * All the spin loops have timeouts; when a timeout happens a CPU
+ * typically elects itself to be Monarch.
+ */
+static void mce_reign(void)
+{
+ struct mce_hw_err *err = NULL;
+ struct mce *m = NULL;
+ int global_worst = 0;
+ char *msg = NULL;
+ int cpu;
+
+ /*
+ * This CPU is the Monarch and the other CPUs have run
+ * through their handlers.
+ * Grade the severity of the errors of all the CPUs.
+ */
+ for_each_possible_cpu(cpu) {
+ struct mce_hw_err *etmp = &per_cpu(hw_errs_seen, cpu);
+ struct mce *mtmp = &etmp->m;
+
+ if (mtmp->severity > global_worst) {
+ global_worst = mtmp->severity;
+ err = &per_cpu(hw_errs_seen, cpu);
+ m = &err->m;
+ }
+ }
+
+ /*
+ * Cannot recover? Panic here then.
+ * This dumps all the mces in the log buffer and stops the
+ * other CPUs.
+ */
+ if (m && global_worst >= MCE_PANIC_SEVERITY) {
+ /* call mce_severity() to get "msg" for panic */
+ mce_severity(m, NULL, &msg, true);
+ mce_panic("Fatal machine check", err, msg);
+ }
+
+ /*
+ * For UC somewhere we let the CPU who detects it handle it.
+ * Also must let continue the others, otherwise the handling
+ * CPU could deadlock on a lock.
+ */
+
+ /*
+ * No machine check event found. Must be some external
+ * source or one CPU is hung. Panic.
+ */
+ if (global_worst <= MCE_KEEP_SEVERITY)
+ mce_panic("Fatal machine check from unknown source", NULL, NULL);
+
+ /*
+ * Now clear all the hw_errs_seen so that they don't reappear on
+ * the next mce.
+ */
+ for_each_possible_cpu(cpu)
+ memset(&per_cpu(hw_errs_seen, cpu), 0, sizeof(struct mce_hw_err));
+}
+
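+/* Sum of the per-CPU "no way out" states of all CPUs in the rendezvous. */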
+static atomic_t global_nwo;
+
+/*
+ * Start of Monarch synchronization. This waits until all CPUs have
+ * entered the exception handler and then determines if any of them
+ * saw a fatal event that requires panic. Then it executes them
+ * in the entry order.
+ * TBD double check parallel CPU hotunplug
+ */
+static noinstr int mce_start(int *no_way_out)
+{
+ u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
+ int order, ret = -1;
+
+ if (!timeout)
+ return ret;
+
+ raw_atomic_add(*no_way_out, &global_nwo);
+ /*
+ * Rely on the implied barrier below, such that global_nwo
+ * is updated before mce_callin.
+ */
+ order = raw_atomic_inc_return(&mce_callin);
+ arch_cpumask_clear_cpu(smp_processor_id(), &mce_missing_cpus);
+
+ /* Enable instrumentation around calls to external facilities */
+ instrumentation_begin();
+
+ /*
+ * Wait for everyone.
+ */
+ while (raw_atomic_read(&mce_callin) != num_online_cpus()) {
+ if (mce_timed_out(&timeout,
+ "Timeout: Not all CPUs entered broadcast exception handler")) {
+ raw_atomic_set(&global_nwo, 0);
+ goto out;
+ }
+ ndelay(SPINUNIT);
+ }
+
+ /*
+ * mce_callin should be read before global_nwo
+ */
+ smp_rmb();
+
+ if (order == 1) {
+ /*
+ * Monarch: Starts executing now, the others wait.
+ */
+ raw_atomic_set(&mce_executing, 1);
+ } else {
+ /*
+ * Subject: Now start the scanning loop one by one in
+ * the original callin order.
+ * This way when there are any shared banks it will be
+ * only seen by one CPU before cleared, avoiding duplicates.
+ */
+ while (raw_atomic_read(&mce_executing) < order) {
+ if (mce_timed_out(&timeout,
+ "Timeout: Subject CPUs unable to finish machine check processing")) {
+ raw_atomic_set(&global_nwo, 0);
+ goto out;
+ }
+ ndelay(SPINUNIT);
+ }
+ }
+
+ /*
+ * Cache the global no_way_out state.
+ */
+ *no_way_out = raw_atomic_read(&global_nwo);
+
+ ret = order;
+
+out:
+ instrumentation_end();
+
+ return ret;
+}
+
+/*
+ * Synchronize between CPUs after main scanning loop.
+ * This invokes the bulk of the Monarch processing.
+ */
+static noinstr int mce_end(int order)
+{
+ u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
+ int ret = -1;
+
+ /* Allow instrumentation around external facilities. */
+ instrumentation_begin();
+
+ if (!timeout)
+ goto reset;
+ if (order < 0)
+ goto reset;
+
+ /*
+ * Allow others to run.
+ */
+ atomic_inc(&mce_executing);
+
+ if (order == 1) {
+ /*
+ * Monarch: Wait for everyone to go through their scanning
+ * loops.
+ */
+ while (atomic_read(&mce_executing) <= num_online_cpus()) {
+ if (mce_timed_out(&timeout,
+ "Timeout: Monarch CPU unable to finish machine check processing"))
+ goto reset;
+ ndelay(SPINUNIT);
+ }
+
+ mce_reign();
+ barrier();
+ ret = 0;
+ } else {
+ /*
+ * Subject: Wait for Monarch to finish.
+ */
+ while (atomic_read(&mce_executing) != 0) {
+ if (mce_timed_out(&timeout,
+ "Timeout: Monarch CPU did not finish machine check processing"))
+ goto reset;
+ ndelay(SPINUNIT);
+ }
+
+ /*
+ * Don't reset anything. That's done by the Monarch.
+ */
+ ret = 0;
+ goto out;
+ }
+
+ /*
+ * Reset all global state.
+ */
+reset:
+ atomic_set(&global_nwo, 0);
+ atomic_set(&mce_callin, 0);
+ cpumask_setall(&mce_missing_cpus);
+ barrier();
+
+ /*
+ * Let others run again.
+ */
+ atomic_set(&mce_executing, 0);
+
+out:
+ instrumentation_end();
+
+ return ret;
+}
+
+static __always_inline void mce_clear_state(unsigned long *toclear)
+{
+ int i;
+
+ for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
+ if (arch_test_bit(i, toclear))
+ mce_wrmsrq(mca_msr_reg(i, MCA_STATUS), 0);
+ }
+}
+
+/*
+ * Cases where we avoid rendezvous handler timeout:
+ * 1) If this CPU is offline.
+ *
+ * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
+ * skip those CPUs which remain looping in the 1st kernel - see
+ * crash_nmi_callback().
+ *
+ * Note: there still is a small window between kexec-ing and the new,
+ * kdump kernel establishing a new #MC handler where a broadcasted MCE
+ * might not get handled properly.
+ */
+static noinstr bool mce_check_crashing_cpu(void)
+{
+ unsigned int cpu = smp_processor_id();
+
+ if (arch_cpu_is_offline(cpu) ||
+ (crashing_cpu != -1 && crashing_cpu != cpu)) {
+ u64 mcgstatus;
+
+ mcgstatus = native_rdmsrq(MSR_IA32_MCG_STATUS);
+
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) {
+ if (mcgstatus & MCG_STATUS_LMCES)
+ return false;
+ }
+
+ if (mcgstatus & MCG_STATUS_RIPV) {
+ native_wrmsrq(MSR_IA32_MCG_STATUS, 0);
+ return true;
+ }
+ }
+ return false;
+}
+
+static __always_inline int
+__mc_scan_banks(struct mce_hw_err *err, struct pt_regs *regs,
+ struct mce_hw_err *final, unsigned long *toclear,
+ unsigned long *valid_banks, int no_way_out, int *worst)
+{
+ struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
+ struct mca_config *cfg = &mca_cfg;
+ int severity, i, taint = 0;
+ struct mce *m = &err->m;
+
+ for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
+ arch___clear_bit(i, toclear);
+ if (!arch_test_bit(i, valid_banks))
+ continue;
+
+ if (!mce_banks[i].ctl)
+ continue;
+
+ m->misc = 0;
+ m->addr = 0;
+ m->bank = i;
+
+ m->status = mce_rdmsrq(mca_msr_reg(i, MCA_STATUS));
+ if (!(m->status & MCI_STATUS_VAL))
+ continue;
+
+ /*
+ * Corrected or non-signaled errors are handled by
+ * machine_check_poll(). Leave them alone, unless this panics.
+ */
+ if (!(m->status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
+ !no_way_out)
+ continue;
+
+ /* Set taint even when machine check was not enabled. */
+ taint++;
+
+ severity = mce_severity(m, regs, NULL, true);
+
+ /*
+ * When the machine check was for a corrected/deferred error,
+ * don't touch it unless we're panicking.
+ */
+ if ((severity == MCE_KEEP_SEVERITY ||
+ severity == MCE_UCNA_SEVERITY) && !no_way_out)
+ continue;
+
+ arch___set_bit(i, toclear);
+
+ /* Machine check event was not enabled. Clear, but ignore. */
+ if (severity == MCE_NO_SEVERITY)
+ continue;
+
+ mce_read_aux(err, i);
+
+ /* assuming valid severity level != 0 */
+ m->severity = severity;
+
+ /*
+ * Enable instrumentation around the mce_log() call which is
+ * done in #MC context, where instrumentation is disabled.
+ */
+ instrumentation_begin();
+ mce_log(err);
+ instrumentation_end();
+
+ if (severity > *worst) {
+ *final = *err;
+ *worst = severity;
+ }
+ }
+
+ /* mce_clear_state will clear *final, save locally for use later */
+ *err = *final;
+
+ return taint;
+}
+
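+/* task_work callback: unconditionally send SIGBUS to the current task. */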
+static void kill_me_now(struct callback_head *ch)
+{
+ struct task_struct *p = container_of(ch, struct task_struct, mce_kill_me);
+
+ p->mce_count = 0;
+ force_sig(SIGBUS);
+}
+
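+/*
+ * task_work callback: try to recover via memory_failure() first and only
+ * fall back to sending SIGBUS when recovery is not possible.
+ */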
+static void kill_me_maybe(struct callback_head *cb)
+{
+ struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
+ int flags = MF_ACTION_REQUIRED;
+ unsigned long pfn;
+ int ret;
+
+ p->mce_count = 0;
+ pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr);
+
+ if (!p->mce_ripv)
+ flags |= MF_MUST_KILL;
+
+ pfn = (p->mce_addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT;
+ ret = memory_failure(pfn, flags);
+ if (!ret) {
+ set_mce_nospec(pfn);
+ sync_core();
+ return;
+ }
+
+ /*
+ * -EHWPOISON from memory_failure() means that it already sent SIGBUS
+ * to the current process with the proper error info,
+ * -EOPNOTSUPP means hwpoison_filter() filtered the error event,
+ *
+ * In both cases, no further processing is required.
+ */
+ if (ret == -EHWPOISON || ret == -EOPNOTSUPP)
+ return;
+
+ pr_err("Memory error not recovered");
+ kill_me_now(cb);
+}
+
+static void kill_me_never(struct callback_head *cb)
+{
+ struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
+ unsigned long pfn;
+
+ p->mce_count = 0;
+ pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr);
+ pfn = (p->mce_addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT;
+ if (!memory_failure(pfn, 0))
+ set_mce_nospec(pfn);
+}
+
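+/*
+ * Queue work for the current task to run on return to user space. Only the
+ * first machine check saves the error details and adds the task_work;
+ * repeated hits on the same page are tolerated (up to a limit), anything
+ * else panics.
+ */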
+static void queue_task_work(struct mce_hw_err *err, char *msg, void (*func)(struct callback_head *))
+{
+ int count = ++current->mce_count;
+ struct mce *m = &err->m;
+
+ /* First call, save all the details */
+ if (count == 1) {
+ current->mce_addr = m->addr;
+ current->mce_kflags = m->kflags;
+ current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV);
+ current->mce_whole_page = whole_page(m);
+ current->mce_kill_me.func = func;
+ }
+
+ /* Ten is likely overkill. Don't expect more than two faults before task_work() */
+ if (count > 10)
+ mce_panic("Too many consecutive machine checks while accessing user data",
+ err, msg);
+
+ /* Second or later call, make sure page address matches the one from first call */
+ if (count > 1 && (current->mce_addr >> PAGE_SHIFT) != (m->addr >> PAGE_SHIFT))
+ mce_panic("Consecutive machine checks to different user pages", err, msg);
+
+ /* Do not call task_work_add() more than once */
+ if (count > 1)
+ return;
+
+ task_work_add(current, &current->mce_kill_me, TWA_RESUME);
+}
+
+/* Handle unconfigured int18 (should never happen) */
+static noinstr void unexpected_machine_check(struct pt_regs *regs)
+{
+ instrumentation_begin();
+ pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
+ smp_processor_id());
+ instrumentation_end();
+}
+
+/*
+ * The actual machine check handler. This only handles real exceptions when
+ * something got corrupted coming in through int 18.
+ *
+ * This is executed in #MC context not subject to normal locking rules.
+ * This implies that most kernel services cannot be safely used. Don't even
+ * think about putting a printk in there!
+ *
+ * On Intel systems this is entered on all CPUs in parallel through
+ * MCE broadcast. However some CPUs might be broken beyond repair,
+ * so be always careful when synchronizing with others.
+ *
+ * Tracing and kprobes are disabled: if we interrupted a kernel context
+ * with IF=1, we need to minimize stack usage. There are also recursion
+ * issues: if the machine check was due to a failure of the memory
+ * backing the user stack, tracing that reads the user stack will cause
+ * potentially infinite recursion.
+ *
+ * Currently, the #MC handler calls out to a number of external facilities
+ * and, therefore, allows instrumentation around them. The optimal thing to
+ * have would be to do the absolutely minimal work required in #MC context
+ * and have instrumentation disabled only around that. Further processing can
+ * then happen in process context where instrumentation is allowed. Achieving
+ * that requires careful auditing and modifications. Until then, the code
+ * allows instrumentation temporarily, where required.
+ */
+noinstr void do_machine_check(struct pt_regs *regs)
+{
+ int worst = 0, order, no_way_out, kill_current_task, lmce, taint = 0;
+ DECLARE_BITMAP(valid_banks, MAX_NR_BANKS) = { 0 };
+ DECLARE_BITMAP(toclear, MAX_NR_BANKS) = { 0 };
+ struct mce_hw_err *final;
+ struct mce_hw_err err;
+ char *msg = NULL;
+ struct mce *m;
+
+ if (unlikely(mce_flags.p5))
+ return pentium_machine_check(regs);
+ else if (unlikely(mce_flags.winchip))
+ return winchip_machine_check(regs);
+ else if (unlikely(!mca_cfg.initialized))
+ return unexpected_machine_check(regs);
+
+ if (mce_flags.skx_repmov_quirk && quirk_skylake_repmov())
+ goto clear;
+
+ /*
+ * Establish sequential order between the CPUs entering the machine
+ * check handler.
+ */
+ order = -1;
+
+ /*
+ * If no_way_out gets set, there is no safe way to recover from this
+ * MCE.
+ */
+ no_way_out = 0;
+
+ /*
+ * If kill_current_task is not set, there might be a way to recover from this
+ * error.
+ */
+ kill_current_task = 0;
+
+ /*
+ * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES
+ * on Intel.
+ */
+ lmce = 1;
+
+ this_cpu_inc(mce_exception_count);
+
+ mce_gather_info(&err, regs);
+ m = &err.m;
+ m->tsc = rdtsc();
+
+ final = this_cpu_ptr(&hw_errs_seen);
+ *final = err;
+
+ no_way_out = mce_no_way_out(&err, &msg, valid_banks, regs);
+
+ barrier();
+
+ /*
+ * When there is no restart IP we might need to kill or panic.
+ * Assume the worst for now, but if we find the
+ * severity is MCE_AR_SEVERITY we have other options.
+ */
+ if (!(m->mcgstatus & MCG_STATUS_RIPV))
+ kill_current_task = 1;
+ /*
+ * Check if this MCE is signaled to only this logical processor,
+ * on Intel, Zhaoxin only.
+ */
+ if (m->cpuvendor == X86_VENDOR_INTEL ||
+ m->cpuvendor == X86_VENDOR_ZHAOXIN)
+ lmce = m->mcgstatus & MCG_STATUS_LMCES;
+
+ /*
+ * Local machine check may already know that we have to panic.
+ * Broadcast machine check begins rendezvous in mce_start().
+ * Go through all banks in exclusion of the other CPUs. This way we
+ * don't report duplicated events on shared banks because the first one
+ * to see it will clear it.
+ */
+ if (lmce) {
+ if (no_way_out)
+ mce_panic("Fatal local machine check", &err, msg);
+ } else {
+ order = mce_start(&no_way_out);
+ }
+
+ taint = __mc_scan_banks(&err, regs, final, toclear, valid_banks, no_way_out, &worst);
+
+ if (!no_way_out)
+ mce_clear_state(toclear);
+
+ /*
+ * Do most of the synchronization with other CPUs.
+ * When there's any problem use only local no_way_out state.
+ */
+ if (!lmce) {
+ if (mce_end(order) < 0) {
+ if (!no_way_out)
+ no_way_out = worst >= MCE_PANIC_SEVERITY;
+
+ if (no_way_out)
+ mce_panic("Fatal machine check on current CPU", &err, msg);
+ }
+ } else {
+ /*
+ * If there was a fatal machine check we should have
+ * already called mce_panic earlier in this function.
+ * Since we re-read the banks, we might have found
+ * something new. Check again to see if we found a
+ * fatal error. We call "mce_severity()" again to
+ * make sure we have the right "msg".
+ */
+ if (worst >= MCE_PANIC_SEVERITY) {
+ mce_severity(m, regs, &msg, true);
+ mce_panic("Local fatal machine check!", &err, msg);
+ }
+ }
+
+ /*
+ * Enable instrumentation around the external facilities like task_work_add()
+ * (via queue_task_work()), fixup_exception() etc. For now, that is. Fixing this
+ * properly would need a lot more involved reorganization.
+ */
+ instrumentation_begin();
+
+ if (taint)
+ add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+
+ if (worst != MCE_AR_SEVERITY && !kill_current_task)
+ goto out;
+
+ /* Fault was in user mode and we need to take some action */
+ if ((m->cs & 3) == 3) {
+ /* If this triggers there is no way to recover. Die hard. */
+ BUG_ON(!on_thread_stack() || !user_mode(regs));
+
+ if (!mce_usable_address(m))
+ queue_task_work(&err, msg, kill_me_now);
+ else
+ queue_task_work(&err, msg, kill_me_maybe);
+
+ } else if (m->mcgstatus & MCG_STATUS_SEAM_NR) {
+ /*
+ * Saved RIP on stack makes it look like the machine check
+ * was taken in the kernel on the instruction following
+ * the entry to SEAM mode. But MCG_STATUS_SEAM_NR indicates
+ * that the machine check was taken inside SEAM non-root
+ * mode. CPU core has already marked that guest as dead.
+ * It is OK for the kernel to resume execution at the
+ * apparent point of the machine check as the fault did
+ * not occur there. Mark the page as poisoned so it won't
+ * be added to free list when the guest is terminated.
+ */
+ if (mce_usable_address(m)) {
+ struct page *p = pfn_to_online_page(m->addr >> PAGE_SHIFT);
+
+ if (p)
+ SetPageHWPoison(p);
+ }
+ } else {
+ /*
+ * Handle an MCE which has happened in kernel space but from
+ * which the kernel can recover: ex_has_fault_handler() has
+ * already verified that the rIP at which the error happened is
+ * a rIP from which the kernel can recover (by jumping to
+ * recovery code specified in _ASM_EXTABLE_FAULT()) and the
+ * corresponding exception handler which would do that is the
+ * proper one.
+ */
+ if (m->kflags & MCE_IN_KERNEL_RECOV) {
+ if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
+ mce_panic("Failed kernel mode recovery", &err, msg);
+ }
+
+ if (m->kflags & MCE_IN_KERNEL_COPYIN)
+ queue_task_work(&err, msg, kill_me_never);
+ }
+
+out:
+ /* Given it didn't panic, mark it as recoverable */
+ hwerr_log_error_type(HWERR_RECOV_OTHERS);
+
+ instrumentation_end();
+
+clear:
+ mce_wrmsrq(MSR_IA32_MCG_STATUS, 0);
+}
+EXPORT_SYMBOL_GPL(do_machine_check);
+
+#ifndef CONFIG_MEMORY_FAILURE
+int memory_failure(unsigned long pfn, int flags)
+{
+ /* mce_severity() should not hand us an ACTION_REQUIRED error */
+ BUG_ON(flags & MF_ACTION_REQUIRED);
+ pr_err("Uncorrected memory error in page 0x%lx ignored\n"
+ "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
+ pfn);
+
+ return 0;
+}
+#endif
+
+/*
+ * Periodic polling timer for "silent" machine check errors. If the
+ * poller finds an MCE, poll 2x faster. When the poller finds no more
+ * errors, poll 2x slower (up to check_interval seconds).
+ */
+static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
+
+static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
+static DEFINE_PER_CPU(struct timer_list, mce_timer);
+
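+/* (Re)arm the per-CPU polling timer unless it is already set to fire sooner. */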
+static void __start_timer(struct timer_list *t, unsigned long interval)
+{
+ unsigned long when = jiffies + interval;
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ if (!timer_pending(t) || time_before(when, t->expires))
+ mod_timer(t, round_jiffies(when));
+
+ local_irq_restore(flags);
+}
+
+static void mc_poll_banks_default(void)
+{
+ machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
+}
+
+void (*mc_poll_banks)(void) = mc_poll_banks_default;
+
+static bool should_enable_timer(unsigned long iv)
+{
+ return !mca_cfg.ignore_ce && iv;
+}
+
+static void mce_timer_fn(struct timer_list *t)
+{
+ struct timer_list *cpu_t = this_cpu_ptr(&mce_timer);
+ unsigned long iv;
+
+ WARN_ON(cpu_t != t);
+
+ iv = __this_cpu_read(mce_next_interval);
+
+ if (mce_available(this_cpu_ptr(&cpu_info)))
+ mc_poll_banks();
+
+ /*
+ * Alert userspace if needed. If we logged an MCE, reduce the polling
+ * interval, otherwise increase the polling interval.
+ */
+ if (mce_notify_irq())
+ iv = max(iv / 2, (unsigned long) HZ/100);
+ else
+ iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
+
+ if (mce_get_storm_mode()) {
+ __start_timer(t, HZ);
+ } else if (should_enable_timer(iv)) {
+ __this_cpu_write(mce_next_interval, iv);
+ __start_timer(t, iv);
+ }
+}
+
+/*
+ * When a storm starts on any bank on this CPU, switch to polling
+ * once per second. When the storm ends, revert to the default
+ * polling interval.
+ */
+void mce_timer_kick(bool storm)
+{
+ struct timer_list *t = this_cpu_ptr(&mce_timer);
+
+ mce_set_storm_mode(storm);
+
+ if (storm)
+ __start_timer(t, HZ);
+ else
+ __this_cpu_write(mce_next_interval, check_interval * HZ);
+}
+
+/* Must not be called in IRQ context where timer_delete_sync() can deadlock */
+static void mce_timer_delete_all(void)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ timer_delete_sync(&per_cpu(mce_timer, cpu));
+}
+
+static void __mcheck_cpu_mce_banks_init(void)
+{
+ struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
+ u8 n_banks = this_cpu_read(mce_num_banks);
+ int i;
+
+ for (i = 0; i < n_banks; i++) {
+ struct mce_bank *b = &mce_banks[i];
+
+ /*
+ * Init them all by default.
+ *
+ * The required vendor quirks will be applied before
+ * __mcheck_cpu_init_prepare_banks() does the final bank setup.
+ */
+ b->ctl = -1ULL;
+ b->init = true;
+ }
+}
+
+/*
+ * Initialize Machine Checks for a CPU.
+ */
+static void __mcheck_cpu_cap_init(void)
+{
+ u64 cap;
+ u8 b;
+
+ rdmsrq(MSR_IA32_MCG_CAP, cap);
+
+ b = cap & MCG_BANKCNT_MASK;
+
+ if (b > MAX_NR_BANKS) {
+ pr_warn("CPU%d: Using only %u machine check banks out of %u\n",
+ smp_processor_id(), MAX_NR_BANKS, b);
+ b = MAX_NR_BANKS;
+ }
+
+ this_cpu_write(mce_num_banks, b);
+
+ __mcheck_cpu_mce_banks_init();
+}
+
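+/* Enable all machine check reporting features via MCG_CTL, if implemented. */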
+static void __mcheck_cpu_init_generic(void)
+{
+ u64 cap;
+
+ rdmsrq(MSR_IA32_MCG_CAP, cap);
+ if (cap & MCG_CTL_P)
+ wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
+}
+
+static void __mcheck_cpu_init_prepare_banks(void)
+{
+ struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
+ u64 msrval;
+ int i;
+
+ /*
+ * Log the machine checks left over from the previous reset. Log them
+ * only, do not start processing them. That will happen in mcheck_late_init()
+ * when all consumers have been registered on the notifier chain.
+ */
+ if (mca_cfg.bootlog) {
+ mce_banks_t all_banks;
+
+ bitmap_fill(all_banks, MAX_NR_BANKS);
+ machine_check_poll(MCP_UC | MCP_QUEUE_LOG, &all_banks);
+ }
+
+ for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
+ struct mce_bank *b = &mce_banks[i];
+
+ if (!b->init)
+ continue;
+
+ wrmsrq(mca_msr_reg(i, MCA_CTL), b->ctl);
+ wrmsrq(mca_msr_reg(i, MCA_STATUS), 0);
+
+ rdmsrq(mca_msr_reg(i, MCA_CTL), msrval);
+ b->init = !!msrval;
+ }
+}
+
+static void amd_apply_global_quirks(struct cpuinfo_x86 *c)
+{
+ if (c->x86 < 0x11 && mca_cfg.bootlog < 0) {
+ /*
+ * Lots of broken BIOS around that don't clear them
+ * by default and leave crap in there. Don't log:
+ */
+ mca_cfg.bootlog = 0;
+ }
+
+ /*
+ * overflow_recov is supported for F15h Models 00h-0fh
+ * even though we don't have a CPUID bit for it.
+ */
+ if (c->x86 == 0x15 && c->x86_model <= 0xf)
+ mce_flags.overflow_recov = 1;
+
+ if (c->x86 >= 0x17 && c->x86 <= 0x1A)
+ mce_flags.zen_ifu_quirk = 1;
+}
+
+static void intel_apply_global_quirks(struct cpuinfo_x86 *c)
+{
+ /* Older CPUs (prior to family 6) don't need quirks. */
+ if (c->x86_vfm < INTEL_PENTIUM_PRO)
+ return;
+
+ /*
+ * All newer Intel systems support MCE broadcasting. Enable
+ * synchronization with a one second timeout.
+ */
+ if (c->x86_vfm >= INTEL_CORE_YONAH && mca_cfg.monarch_timeout < 0)
+ mca_cfg.monarch_timeout = USEC_PER_SEC;
+
+ /*
+ * There are also broken BIOSes on some Pentium M and
+ * earlier systems:
+ */
+ if (c->x86_vfm < INTEL_CORE_YONAH && mca_cfg.bootlog < 0)
+ mca_cfg.bootlog = 0;
+
+ if (c->x86_vfm == INTEL_SANDYBRIDGE_X)
+ mce_flags.snb_ifu_quirk = 1;
+
+ /*
+ * Skylake, Cascade Lake and Cooper Lake require a quirk on
+ * rep movs.
+ */
+ if (c->x86_vfm == INTEL_SKYLAKE_X)
+ mce_flags.skx_repmov_quirk = 1;
+}
+
+static void zhaoxin_apply_global_quirks(struct cpuinfo_x86 *c)
+{
+ /*
+ * All newer Zhaoxin CPUs support MCE broadcasting. Enable
+ * synchronization with a one second timeout.
+ */
+ if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
+ if (mca_cfg.monarch_timeout < 0)
+ mca_cfg.monarch_timeout = USEC_PER_SEC;
+ }
+}
+
+static bool __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
+{
+ if (c->x86 != 5)
+ return false;
+
+ switch (c->x86_vendor) {
+ case X86_VENDOR_INTEL:
+ intel_p5_mcheck_init(c);
+ mce_flags.p5 = 1;
+ return true;
+ case X86_VENDOR_CENTAUR:
+ winchip_mcheck_init(c);
+ mce_flags.winchip = 1;
+ return true;
+ default:
+ return false;
+ }
+
+ return false;
+}
+
+static void mce_centaur_feature_init(struct cpuinfo_x86 *c)
+{
+ struct mca_config *cfg = &mca_cfg;
+
+ /*
+ * All newer Centaur CPUs support MCE broadcasting. Enable
+ * synchronization with a one second timeout.
+ */
+ if ((c->x86 == 6 && c->x86_model == 0xf && c->x86_stepping >= 0xe) ||
+ c->x86 > 6) {
+ if (cfg->monarch_timeout < 0)
+ cfg->monarch_timeout = USEC_PER_SEC;
+ }
+}
+
+static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c)
+{
+ struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
+
+ /*
+ * These CPUs have MCA bank 8 which reports only one error type called
+ * SVAD (System View Address Decoder). The reporting of that error is
+ * controlled by IA32_MC8.CTL.0.
+ *
+ * If enabled, prefetching on these CPUs will cause SVAD MCE when
+ * virtual machines start and result in a system panic. Always disable
+ * bank 8 SVAD error by default.
+ */
+ if ((c->x86 == 7 && c->x86_model == 0x1b) ||
+ (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
+ if (this_cpu_read(mce_num_banks) > 8)
+ mce_banks[8].ctl = 0;
+ }
+
+ intel_init_cmci();
+ intel_init_lmce();
+}
+
+static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
+{
+ intel_clear_lmce();
+}
+
+static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
+{
+ switch (c->x86_vendor) {
+ case X86_VENDOR_INTEL:
+ mce_intel_feature_init(c);
+ break;
+
+ case X86_VENDOR_AMD:
+ case X86_VENDOR_HYGON:
+ mce_amd_feature_init(c);
+ break;
+
+ case X86_VENDOR_CENTAUR:
+ mce_centaur_feature_init(c);
+ break;
+
+ case X86_VENDOR_ZHAOXIN:
+ mce_zhaoxin_feature_init(c);
+ break;
+
+ default:
+ break;
+ }
+}
+
+static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
+{
+ switch (c->x86_vendor) {
+ case X86_VENDOR_INTEL:
+ mce_intel_feature_clear(c);
+ break;
+
+ case X86_VENDOR_ZHAOXIN:
+ mce_zhaoxin_feature_clear(c);
+ break;
+
+ default:
+ break;
+ }
+}
+
+static void mce_start_timer(struct timer_list *t)
+{
+ unsigned long iv = check_interval * HZ;
+
+ if (should_enable_timer(iv)) {
+ this_cpu_write(mce_next_interval, iv);
+ __start_timer(t, iv);
+ }
+}
+
+static void __mcheck_cpu_setup_timer(void)
+{
+ struct timer_list *t = this_cpu_ptr(&mce_timer);
+
+ timer_setup(t, mce_timer_fn, TIMER_PINNED);
+}
+
+static void __mcheck_cpu_init_timer(void)
+{
+ struct timer_list *t = this_cpu_ptr(&mce_timer);
+
+ timer_setup(t, mce_timer_fn, TIMER_PINNED);
+ mce_start_timer(t);
+}
+
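+/* Let the vendor code decide whether this error record should be dropped. */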
+bool filter_mce(struct mce *m)
+{
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ return amd_filter_mce(m);
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ return intel_filter_mce(m);
+
+ return false;
+}
+
+static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
+{
+ irqentry_state_t irq_state;
+
+ WARN_ON_ONCE(user_mode(regs));
+
+ /*
+ * Only required when from kernel mode. See
+ * mce_check_crashing_cpu() for details.
+ */
+ if (mca_cfg.initialized && mce_check_crashing_cpu())
+ return;
+
+ irq_state = irqentry_nmi_enter(regs);
+
+ do_machine_check(regs);
+
+ irqentry_nmi_exit(regs, irq_state);
+}
+
+static __always_inline void exc_machine_check_user(struct pt_regs *regs)
+{
+ irqentry_enter_from_user_mode(regs);
+
+ do_machine_check(regs);
+
+ irqentry_exit_to_user_mode(regs);
+}
+
+#ifdef CONFIG_X86_64
+/* MCE hit kernel mode */
+DEFINE_IDTENTRY_MCE(exc_machine_check)
+{
+ unsigned long dr7;
+
+ dr7 = local_db_save();
+ exc_machine_check_kernel(regs);
+ local_db_restore(dr7);
+}
+
+/* The user mode variant. */
+DEFINE_IDTENTRY_MCE_USER(exc_machine_check)
+{
+ unsigned long dr7;
+
+ dr7 = local_db_save();
+ exc_machine_check_user(regs);
+ local_db_restore(dr7);
+}
+
+#ifdef CONFIG_X86_FRED
+/*
+ * Depending on the ring level it occurred at, i.e., user or kernel
+ * context, #MCE needs to be handled on a different stack: a user #MCE
+ * on the current task stack, a kernel #MCE on a dedicated stack.
+ *
+ * This is exactly how FRED event delivery invokes an exception
+ * handler: ring 3 event on level 0 stack, i.e., current task stack;
+ * ring 0 event on the #MCE dedicated stack specified in the
+ * IA32_FRED_STKLVLS MSR. So unlike IDT, the FRED machine check entry
+ * stub doesn't do stack switch.
+ */
+DEFINE_FREDENTRY_MCE(exc_machine_check)
+{
+ unsigned long dr7;
+
+ dr7 = local_db_save();
+ if (user_mode(regs))
+ exc_machine_check_user(regs);
+ else
+ exc_machine_check_kernel(regs);
+ local_db_restore(dr7);
+}
+#endif
+#else
+/* 32bit unified entry point */
+DEFINE_IDTENTRY_RAW(exc_machine_check)
+{
+ unsigned long dr7;
+
+ dr7 = local_db_save();
+ if (user_mode(regs))
+ exc_machine_check_user(regs);
+ else
+ exc_machine_check_kernel(regs);
+ local_db_restore(dr7);
+}
+#endif
+
+void mca_bsp_init(struct cpuinfo_x86 *c)
+{
+ u64 cap;
+
+ if (!mce_available(c))
+ return;
+
+ if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
+ mca_cfg.disabled = 1;
+ pr_info("unknown CPU type - not enabling MCE support\n");
+ return;
+ }
+
+ mce_flags.overflow_recov = cpu_feature_enabled(X86_FEATURE_OVERFLOW_RECOV);
+ mce_flags.succor = cpu_feature_enabled(X86_FEATURE_SUCCOR);
+ mce_flags.smca = cpu_feature_enabled(X86_FEATURE_SMCA);
+
+ if (mce_flags.smca)
+ smca_bsp_init();
+
+ rdmsrq(MSR_IA32_MCG_CAP, cap);
+
+ /* Use accurate RIP reporting if available. */
+ if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
+ mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
+
+ if (cap & MCG_SER_P)
+ mca_cfg.ser = 1;
+
+ switch (c->x86_vendor) {
+ case X86_VENDOR_AMD:
+ amd_apply_global_quirks(c);
+ break;
+ case X86_VENDOR_INTEL:
+ intel_apply_global_quirks(c);
+ break;
+ case X86_VENDOR_ZHAOXIN:
+ zhaoxin_apply_global_quirks(c);
+ break;
+ }
+
+ if (mca_cfg.monarch_timeout < 0)
+ mca_cfg.monarch_timeout = 0;
+ if (mca_cfg.bootlog != 0)
+ mca_cfg.panic_timeout = 30;
+}
+
+/*
+ * Called for each booted CPU to set up machine checks.
+ * Must be called with preempt off:
+ */
+void mcheck_cpu_init(struct cpuinfo_x86 *c)
+{
+ if (mca_cfg.disabled)
+ return;
+
+ if (__mcheck_cpu_ancient_init(c))
+ return;
+
+ if (!mce_available(c))
+ return;
+
+ __mcheck_cpu_cap_init();
+
+ if (!mce_gen_pool_init()) {
+ mca_cfg.disabled = 1;
+ pr_emerg("Couldn't allocate MCE records pool!\n");
+ return;
+ }
+
+ mca_cfg.initialized = 1;
+
+ __mcheck_cpu_init_generic();
+ __mcheck_cpu_init_vendor(c);
+ __mcheck_cpu_init_prepare_banks();
+ __mcheck_cpu_setup_timer();
+ cr4_set_bits(X86_CR4_MCE);
+}
+
+/*
+ * Called for each booted CPU to clear some machine check opt-ins
+ */
+void mcheck_cpu_clear(struct cpuinfo_x86 *c)
+{
+ if (mca_cfg.disabled)
+ return;
+
+ if (!mce_available(c))
+ return;
+
+ /*
+ * Possibly to clear general settings generic to x86
+ * __mcheck_cpu_clear_generic(c);
+ */
+ __mcheck_cpu_clear_vendor(c);
+
+}
+
+static void __mce_disable_bank(void *arg)
+{
+ int bank = *((int *)arg);
+ __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
+ cmci_disable_bank(bank);
+}
+
+void mce_disable_bank(int bank)
+{
+ if (bank >= this_cpu_read(mce_num_banks)) {
+ pr_warn(FW_BUG
+ "Ignoring request to disable invalid MCA bank %d.\n",
+ bank);
+ return;
+ }
+ set_bit(bank, mce_banks_ce_disabled);
+ on_each_cpu(__mce_disable_bank, &bank, 1);
+}
+
+/*
+ * mce=off Disables machine check
+ * mce=no_cmci Disables CMCI
+ * mce=no_lmce Disables LMCE
+ * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
+ * mce=print_all Print all machine check logs to console
+ * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
+ * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
+ * monarchtimeout is how long to wait for other CPUs on machine
+ * check, or 0 to not wait
+ * mce=bootlog Log MCEs from before booting. Disabled by default on AMD
+ *             Fam10h and older.
+ * mce=nobootlog Don't log MCEs from before booting.
+ * mce=bios_cmci_threshold Don't program the CMCI threshold
+ * mce=recovery force enable copy_mc_fragile()
+ */
+static int __init mcheck_enable(char *str)
+{
+ struct mca_config *cfg = &mca_cfg;
+
+ if (*str == 0) {
+ enable_p5_mce();
+ return 1;
+ }
+ if (*str == '=')
+ str++;
+ if (!strcmp(str, "off"))
+ cfg->disabled = 1;
+ else if (!strcmp(str, "no_cmci"))
+ cfg->cmci_disabled = true;
+ else if (!strcmp(str, "no_lmce"))
+ cfg->lmce_disabled = 1;
+ else if (!strcmp(str, "dont_log_ce"))
+ cfg->dont_log_ce = true;
+ else if (!strcmp(str, "print_all"))
+ cfg->print_all = true;
+ else if (!strcmp(str, "ignore_ce"))
+ cfg->ignore_ce = true;
+ else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
+ cfg->bootlog = (str[0] == 'b');
+ else if (!strcmp(str, "bios_cmci_threshold"))
+ cfg->bios_cmci_threshold = 1;
+ else if (!strcmp(str, "recovery"))
+ cfg->recovery = 1;
+ else if (isdigit(str[0]))
+ get_option(&str, &(cfg->monarch_timeout));
+ else {
+ pr_info("mce argument %s ignored. Please use /sys\n", str);
+ return 0;
+ }
+ return 1;
+}
+__setup("mce", mcheck_enable);
+
+int __init mcheck_init(void)
+{
+ mce_register_decode_chain(&early_nb);
+ mce_register_decode_chain(&mce_uc_nb);
+ mce_register_decode_chain(&mce_default_nb);
+
+ INIT_WORK(&mce_work, mce_gen_pool_process);
+ init_irq_work(&mce_irq_work, mce_irq_work_cb);
+
+ return 0;
+}
+
+/*
+ * mce_syscore: PM support
+ */
+
+/*
+ * Disable machine checks on suspend and shutdown. We can't really handle
+ * them later.
+ */
+static void mce_disable_error_reporting(void)
+{
+ struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
+ int i;
+
+ for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
+ struct mce_bank *b = &mce_banks[i];
+
+ if (b->init)
+ wrmsrq(mca_msr_reg(i, MCA_CTL), 0);
+ }
+ return;
+}
+
+static void vendor_disable_error_reporting(void)
+{
+ /*
+ * Don't clear on Intel or AMD or Hygon or Zhaoxin CPUs. Some of these
+ * MSRs are socket-wide. Disabling them for just a single offlined CPU
+ * is bad, since it will inhibit reporting for all shared resources on
+ * the socket like the last level cache (LLC), the integrated memory
+ * controller (iMC), etc.
+ */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
+ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ||
+ boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
+ boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN)
+ return;
+
+ mce_disable_error_reporting();
+}
+
+static int mce_syscore_suspend(void *data)
+{
+ vendor_disable_error_reporting();
+ return 0;
+}
+
+static void mce_syscore_shutdown(void *data)
+{
+ vendor_disable_error_reporting();
+}
+
+/*
+ * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
+ * Only one CPU is active at this time, the others get re-added later using
+ * CPU hotplug:
+ */
+static void mce_syscore_resume(void *data)
+{
+ __mcheck_cpu_init_generic();
+ __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
+ __mcheck_cpu_init_prepare_banks();
+ cr4_set_bits(X86_CR4_MCE);
+}
+
+static const struct syscore_ops mce_syscore_ops = {
+ .suspend = mce_syscore_suspend,
+ .shutdown = mce_syscore_shutdown,
+ .resume = mce_syscore_resume,
+};
+
+static struct syscore mce_syscore = {
+ .ops = &mce_syscore_ops,
+};
+
+/*
+ * mce_device: Sysfs support
+ */
+
+static void mce_cpu_restart(void *data)
+{
+ if (!mce_available(raw_cpu_ptr(&cpu_info)))
+ return;
+ __mcheck_cpu_init_generic();
+ __mcheck_cpu_init_prepare_banks();
+ __mcheck_cpu_init_timer();
+ cr4_set_bits(X86_CR4_MCE);
+}
+
+/* Reinit MCEs after user configuration changes */
+static void mce_restart(void)
+{
+ mce_timer_delete_all();
+ on_each_cpu(mce_cpu_restart, NULL, 1);
+ mce_schedule_work();
+}
+
+/* Toggle features for corrected errors */
+static void mce_disable_cmci(void *data)
+{
+ if (!mce_available(raw_cpu_ptr(&cpu_info)))
+ return;
+ cmci_clear();
+}
+
+static void mce_enable_ce(void *all)
+{
+ if (!mce_available(raw_cpu_ptr(&cpu_info)))
+ return;
+ cmci_reenable();
+ cmci_recheck();
+ if (all)
+ __mcheck_cpu_init_timer();
+}
+
+static const struct bus_type mce_subsys = {
+ .name = "machinecheck",
+ .dev_name = "machinecheck",
+};
+
+DEFINE_PER_CPU(struct device *, mce_device);
+
+static inline struct mce_bank_dev *attr_to_bank(struct device_attribute *attr)
+{
+ return container_of(attr, struct mce_bank_dev, attr);
+}
+
+static ssize_t show_bank(struct device *s, struct device_attribute *attr,
+ char *buf)
+{
+ u8 bank = attr_to_bank(attr)->bank;
+ struct mce_bank *b;
+
+ if (bank >= per_cpu(mce_num_banks, s->id))
+ return -EINVAL;
+
+ b = &per_cpu(mce_banks_array, s->id)[bank];
+
+ if (!b->init)
+ return -ENODEV;
+
+ return sprintf(buf, "%llx\n", b->ctl);
+}
+
+static ssize_t set_bank(struct device *s, struct device_attribute *attr,
+ const char *buf, size_t size)
+{
+ u8 bank = attr_to_bank(attr)->bank;
+ struct mce_bank *b;
+ u64 new;
+
+ if (kstrtou64(buf, 0, &new) < 0)
+ return -EINVAL;
+
+ if (bank >= per_cpu(mce_num_banks, s->id))
+ return -EINVAL;
+
+ b = &per_cpu(mce_banks_array, s->id)[bank];
+ if (!b->init)
+ return -ENODEV;
+
+ b->ctl = new;
+
+ mutex_lock(&mce_sysfs_mutex);
+ mce_restart();
+ mutex_unlock(&mce_sysfs_mutex);
+
+ return size;
+}
+
+static ssize_t set_ignore_ce(struct device *s,
+ struct device_attribute *attr,
+ const char *buf, size_t size)
+{
+ u64 new;
+
+ if (kstrtou64(buf, 0, &new) < 0)
+ return -EINVAL;
+
+ mutex_lock(&mce_sysfs_mutex);
+ if (mca_cfg.ignore_ce ^ !!new) {
+ if (new) {
+ /* disable ce features */
+ mce_timer_delete_all();
+ on_each_cpu(mce_disable_cmci, NULL, 1);
+ mca_cfg.ignore_ce = true;
+ } else {
+ /* enable ce features */
+ mca_cfg.ignore_ce = false;
+ on_each_cpu(mce_enable_ce, (void *)1, 1);
+ }
+ }
+ mutex_unlock(&mce_sysfs_mutex);
+
+ return size;
+}
+
+static ssize_t set_cmci_disabled(struct device *s,
+ struct device_attribute *attr,
+ const char *buf, size_t size)
+{
+ u64 new;
+
+ if (kstrtou64(buf, 0, &new) < 0)
+ return -EINVAL;
+
+ mutex_lock(&mce_sysfs_mutex);
+ if (mca_cfg.cmci_disabled ^ !!new) {
+ if (new) {
+ /* disable cmci */
+ on_each_cpu(mce_disable_cmci, NULL, 1);
+ mca_cfg.cmci_disabled = true;
+ } else {
+ /* enable cmci */
+ mca_cfg.cmci_disabled = false;
+ on_each_cpu(mce_enable_ce, NULL, 1);
+ }
+ }
+ mutex_unlock(&mce_sysfs_mutex);
+
+ return size;
+}
+
+static ssize_t store_int_with_restart(struct device *s,
+ struct device_attribute *attr,
+ const char *buf, size_t size)
+{
+ unsigned long old_check_interval = check_interval;
+ ssize_t ret = device_store_ulong(s, attr, buf, size);
+
+ if (check_interval == old_check_interval)
+ return ret;
+
+ mutex_lock(&mce_sysfs_mutex);
+ mce_restart();
+ mutex_unlock(&mce_sysfs_mutex);
+
+ return ret;
+}
+
+static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
+static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
+static DEVICE_BOOL_ATTR(print_all, 0644, mca_cfg.print_all);
+
+static struct dev_ext_attribute dev_attr_check_interval = {
+ __ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
+ &check_interval
+};
+
+static struct dev_ext_attribute dev_attr_ignore_ce = {
+ __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
+ &mca_cfg.ignore_ce
+};
+
+static struct dev_ext_attribute dev_attr_cmci_disabled = {
+ __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
+ &mca_cfg.cmci_disabled
+};
+
+static struct device_attribute *mce_device_attrs[] = {
+ &dev_attr_check_interval.attr,
+#ifdef CONFIG_X86_MCELOG_LEGACY
+ &dev_attr_trigger,
+#endif
+ &dev_attr_monarch_timeout.attr,
+ &dev_attr_dont_log_ce.attr,
+ &dev_attr_print_all.attr,
+ &dev_attr_ignore_ce.attr,
+ &dev_attr_cmci_disabled.attr,
+ NULL
+};
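Together with the per-bank attributes created by mce_init_banks() further down, these files are registered under the "machinecheck" subsystem declared above. An illustrative sketch of the resulting layout, assuming sysfs is mounted at /sys and taking CPU 0 as the example:

    /sys/devices/system/machinecheck/machinecheck0/check_interval   writes go through store_int_with_restart()
    /sys/devices/system/machinecheck/machinecheck0/ignore_ce        writes go through set_ignore_ce()
    /sys/devices/system/machinecheck/machinecheck0/bank0 ... bankN  MCi_CTL values, served by show_bank()/set_bank()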
+
+static cpumask_var_t mce_device_initialized;
+
+static void mce_device_release(struct device *dev)
+{
+ kfree(dev);
+}
+
+/* Per CPU device init. All of the CPUs still share the same bank device: */
+static int mce_device_create(unsigned int cpu)
+{
+ struct device *dev;
+ int err;
+ int i, j;
+
+ dev = per_cpu(mce_device, cpu);
+ if (dev)
+ return 0;
+
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+ if (!dev)
+ return -ENOMEM;
+ dev->id = cpu;
+ dev->bus = &mce_subsys;
+ dev->release = &mce_device_release;
+
+ err = device_register(dev);
+ if (err) {
+ put_device(dev);
+ return err;
+ }
+
+ for (i = 0; mce_device_attrs[i]; i++) {
+ err = device_create_file(dev, mce_device_attrs[i]);
+ if (err)
+ goto error;
+ }
+ for (j = 0; j < per_cpu(mce_num_banks, cpu); j++) {
+ err = device_create_file(dev, &mce_bank_devs[j].attr);
+ if (err)
+ goto error2;
+ }
+ cpumask_set_cpu(cpu, mce_device_initialized);
+ per_cpu(mce_device, cpu) = dev;
+
+ return 0;
+error2:
+ while (--j >= 0)
+ device_remove_file(dev, &mce_bank_devs[j].attr);
+error:
+ while (--i >= 0)
+ device_remove_file(dev, mce_device_attrs[i]);
+
+ device_unregister(dev);
+
+ return err;
+}
+
+static void mce_device_remove(unsigned int cpu)
+{
+ struct device *dev = per_cpu(mce_device, cpu);
+ int i;
+
+ if (!cpumask_test_cpu(cpu, mce_device_initialized))
+ return;
+
+ for (i = 0; mce_device_attrs[i]; i++)
+ device_remove_file(dev, mce_device_attrs[i]);
+
+ for (i = 0; i < per_cpu(mce_num_banks, cpu); i++)
+ device_remove_file(dev, &mce_bank_devs[i].attr);
+
+ device_unregister(dev);
+ cpumask_clear_cpu(cpu, mce_device_initialized);
+ per_cpu(mce_device, cpu) = NULL;
+}
+
+/* Make sure there are no machine checks on offlined CPUs. */
+static void mce_disable_cpu(void)
+{
+ if (!mce_available(raw_cpu_ptr(&cpu_info)))
+ return;
+
+ if (!cpuhp_tasks_frozen)
+ cmci_clear();
+
+ vendor_disable_error_reporting();
+}
+
+static void mce_reenable_cpu(void)
+{
+ struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
+ int i;
+
+ if (!mce_available(raw_cpu_ptr(&cpu_info)))
+ return;
+
+ if (!cpuhp_tasks_frozen)
+ cmci_reenable();
+ for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
+ struct mce_bank *b = &mce_banks[i];
+
+ if (b->init)
+ wrmsrq(mca_msr_reg(i, MCA_CTL), b->ctl);
+ }
+}
+
+static int mce_cpu_dead(unsigned int cpu)
+{
+ /* Intentionally skip CMCI rediscovery while tasks are frozen (suspend/resume) */
+ if (!cpuhp_tasks_frozen)
+ cmci_rediscover();
+ return 0;
+}
+
+static int mce_cpu_online(unsigned int cpu)
+{
+ struct timer_list *t = this_cpu_ptr(&mce_timer);
+
+ mce_device_create(cpu);
+ mce_threshold_create_device(cpu);
+ mce_reenable_cpu();
+ mce_start_timer(t);
+ return 0;
+}
+
+static int mce_cpu_pre_down(unsigned int cpu)
+{
+ struct timer_list *t = this_cpu_ptr(&mce_timer);
+
+ mce_disable_cpu();
+ timer_delete_sync(t);
+ mce_threshold_remove_device(cpu);
+ mce_device_remove(cpu);
+ return 0;
+}
+
+static __init void mce_init_banks(void)
+{
+ int i;
+
+ for (i = 0; i < MAX_NR_BANKS; i++) {
+ struct mce_bank_dev *b = &mce_bank_devs[i];
+ struct device_attribute *a = &b->attr;
+
+ b->bank = i;
+
+ sysfs_attr_init(&a->attr);
+ a->attr.name = b->attrname;
+ snprintf(b->attrname, ATTR_LEN, "bank%d", i);
+
+ a->attr.mode = 0644;
+ a->show = show_bank;
+ a->store = set_bank;
+ }
+}
+
+/*
+ * When running on XEN, this initcall is ordered against the XEN mcelog
+ * initcall:
+ *
+ * device_initcall(xen_late_init_mcelog);
+ * device_initcall_sync(mcheck_init_device);
+ */
+static __init int mcheck_init_device(void)
+{
+ int err;
+
+ /*
+ * Check if we have a spare virtual bit. This will only become
+ * a problem if/when we move beyond 5-level page tables.
+ */
+ MAYBE_BUILD_BUG_ON(__VIRTUAL_MASK_SHIFT >= 63);
+
+ if (!mce_available(&boot_cpu_data)) {
+ err = -EIO;
+ goto err_out;
+ }
+
+ if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+
+ mce_init_banks();
+
+ err = subsys_system_register(&mce_subsys, NULL);
+ if (err)
+ goto err_out_mem;
+
+ err = cpuhp_setup_state(CPUHP_X86_MCE_DEAD, "x86/mce:dead", NULL,
+ mce_cpu_dead);
+ if (err)
+ goto err_out_mem;
+
+ /*
+ * Invokes mce_cpu_online() on all CPUs which are online when
+ * the state is installed.
+ */
+ err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online",
+ mce_cpu_online, mce_cpu_pre_down);
+ if (err < 0)
+ goto err_out_online;
+
+ register_syscore(&mce_syscore);
+
+ return 0;
+
+err_out_online:
+ cpuhp_remove_state(CPUHP_X86_MCE_DEAD);
+
+err_out_mem:
+ free_cpumask_var(mce_device_initialized);
+
+err_out:
+ pr_err("Unable to init MCE device (rc: %d)\n", err);
+
+ return err;
+}
+device_initcall_sync(mcheck_init_device);
+
+/*
+ * Old style boot options parsing. Only for compatibility.
+ */
+static int __init mcheck_disable(char *str)
+{
+ mca_cfg.disabled = 1;
+ return 1;
+}
+__setup("nomce", mcheck_disable);
+
+#ifdef CONFIG_DEBUG_FS
+struct dentry *mce_get_debugfs_dir(void)
+{
+ static struct dentry *dmce;
+
+ if (!dmce)
+ dmce = debugfs_create_dir("mce", NULL);
+
+ return dmce;
+}
+
+static void mce_reset(void)
+{
+ atomic_set(&mce_fake_panicked, 0);
+ atomic_set(&mce_executing, 0);
+ atomic_set(&mce_callin, 0);
+ atomic_set(&global_nwo, 0);
+ cpumask_setall(&mce_missing_cpus);
+}
+
+static int fake_panic_get(void *data, u64 *val)
+{
+ *val = fake_panic;
+ return 0;
+}
+
+static int fake_panic_set(void *data, u64 val)
+{
+ mce_reset();
+ fake_panic = val;
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(fake_panic_fops, fake_panic_get, fake_panic_set,
+ "%llu\n");
+
+static void __init mcheck_debugfs_init(void)
+{
+ struct dentry *dmce;
+
+ dmce = mce_get_debugfs_dir();
+ debugfs_create_file_unsafe("fake_panic", 0444, dmce, NULL,
+ &fake_panic_fops);
+}
+#else
+static void __init mcheck_debugfs_init(void) { }
+#endif
+
+static int __init mcheck_late_init(void)
+{
+ if (mca_cfg.recovery)
+ enable_copy_mc_fragile();
+
+ mcheck_debugfs_init();
+
+ /*
+ * Flush out everything that has been logged during early boot, now that
+ * everything has been initialized (workqueues, decoders, ...).
+ */
+ mce_schedule_work();
+
+ return 0;
+}
+late_initcall(mcheck_late_init);
diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c
new file mode 100644
index 000000000000..8d023239ce18
--- /dev/null
+++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c
@@ -0,0 +1,365 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * /dev/mcelog driver
+ *
+ * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ * Rest from unknown author(s).
+ * 2004 Andi Kleen. Rewrote most of it.
+ * Copyright 2008 Intel Corporation
+ * Author: Andi Kleen
+ */
+
+#include <linux/miscdevice.h>
+#include <linux/slab.h>
+#include <linux/kmod.h>
+#include <linux/poll.h>
+
+#include "internal.h"
+
+static BLOCKING_NOTIFIER_HEAD(mce_injector_chain);
+
+static DEFINE_MUTEX(mce_chrdev_read_mutex);
+
+static char mce_helper[128];
+static char *mce_helper_argv[2] = { mce_helper, NULL };
+
+/*
+ * Lockless MCE logging infrastructure.
+ * This avoids deadlocks on printk locks without having to break locks. It
+ * also keeps MCEs separate from kernel messages to avoid bogus bug reports.
+ */
+
+static struct mce_log_buffer *mcelog;
+
+static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
+
+static int dev_mce_log(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct mce *mce = (struct mce *)data;
+ unsigned int entry;
+
+ if (mce->kflags & MCE_HANDLED_CEC)
+ return NOTIFY_DONE;
+
+ mutex_lock(&mce_chrdev_read_mutex);
+
+ entry = mcelog->next;
+
+ /*
+ * When the buffer fills up, discard new entries. Assume that the
+ * earlier errors are the more interesting ones:
+ */
+ if (entry >= mcelog->len) {
+ set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog->flags);
+ goto unlock;
+ }
+
+ mcelog->next = entry + 1;
+
+ memcpy(mcelog->entry + entry, mce, sizeof(struct mce));
+ mcelog->entry[entry].finished = 1;
+ mcelog->entry[entry].kflags = 0;
+
+ /* wake processes polling /dev/mcelog */
+ wake_up_interruptible(&mce_chrdev_wait);
+
+unlock:
+ mutex_unlock(&mce_chrdev_read_mutex);
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+ mce->kflags |= MCE_HANDLED_MCELOG;
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block dev_mcelog_nb = {
+ .notifier_call = dev_mce_log,
+ .priority = MCE_PRIO_MCELOG,
+};
+
+static void mce_do_trigger(struct work_struct *work)
+{
+ call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
+}
+
+static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
+
+
+void mce_work_trigger(void)
+{
+ if (mce_helper[0])
+ schedule_work(&mce_trigger_work);
+}
+
+static ssize_t
+show_trigger(struct device *s, struct device_attribute *attr, char *buf)
+{
+ strcpy(buf, mce_helper);
+ strcat(buf, "\n");
+ return strlen(mce_helper) + 1;
+}
+
+static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
+ const char *buf, size_t siz)
+{
+ char *p;
+
+ strscpy(mce_helper, buf, sizeof(mce_helper));
+ p = strchr(mce_helper, '\n');
+
+ if (p)
+ *p = 0;
+
+ return strlen(mce_helper) + !!p;
+}
+
+DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
+
+/*
+ * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
+ */
+
+static DEFINE_SPINLOCK(mce_chrdev_state_lock);
+static int mce_chrdev_open_count; /* #times opened */
+static int mce_chrdev_open_exclu; /* already open exclusive? */
+
+static int mce_chrdev_open(struct inode *inode, struct file *file)
+{
+ spin_lock(&mce_chrdev_state_lock);
+
+ if (mce_chrdev_open_exclu ||
+ (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
+ spin_unlock(&mce_chrdev_state_lock);
+
+ return -EBUSY;
+ }
+
+ if (file->f_flags & O_EXCL)
+ mce_chrdev_open_exclu = 1;
+ mce_chrdev_open_count++;
+
+ spin_unlock(&mce_chrdev_state_lock);
+
+ return nonseekable_open(inode, file);
+}
+
+static int mce_chrdev_release(struct inode *inode, struct file *file)
+{
+ spin_lock(&mce_chrdev_state_lock);
+
+ mce_chrdev_open_count--;
+ mce_chrdev_open_exclu = 0;
+
+ spin_unlock(&mce_chrdev_state_lock);
+
+ return 0;
+}
+
+static int mce_apei_read_done;
+
+/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
+static int __mce_read_apei(char __user **ubuf, size_t usize)
+{
+ int rc;
+ u64 record_id;
+ struct mce m;
+
+ if (usize < sizeof(struct mce))
+ return -EINVAL;
+
+ rc = apei_read_mce(&m, &record_id);
+ /* Error or no more MCE record */
+ if (rc <= 0) {
+ mce_apei_read_done = 1;
+ /*
+ * When ERST is disabled, mce_chrdev_read() should return
+ * "no record" instead of "no device."
+ */
+ if (rc == -ENODEV)
+ return 0;
+ return rc;
+ }
+ rc = -EFAULT;
+ if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
+ return rc;
+ /*
+ * Ideally the record should only be cleared after it has been
+ * flushed to disk or sent out over the network by /sbin/mcelog,
+ * but there is no interface to support that yet, so clear it now
+ * to avoid duplication.
+ */
+ rc = apei_clear_mce(record_id);
+ if (rc) {
+ mce_apei_read_done = 1;
+ return rc;
+ }
+ *ubuf += sizeof(struct mce);
+
+ return 0;
+}
+
+static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
+ size_t usize, loff_t *off)
+{
+ char __user *buf = ubuf;
+ unsigned next;
+ int i, err;
+
+ mutex_lock(&mce_chrdev_read_mutex);
+
+ if (!mce_apei_read_done) {
+ err = __mce_read_apei(&buf, usize);
+ if (err || buf != ubuf)
+ goto out;
+ }
+
+ /* Only supports full reads right now */
+ err = -EINVAL;
+ if (*off != 0 || usize < mcelog->len * sizeof(struct mce))
+ goto out;
+
+ next = mcelog->next;
+ err = 0;
+
+ for (i = 0; i < next; i++) {
+ struct mce *m = &mcelog->entry[i];
+
+ err |= copy_to_user(buf, m, sizeof(*m));
+ buf += sizeof(*m);
+ }
+
+ memset(mcelog->entry, 0, next * sizeof(struct mce));
+ mcelog->next = 0;
+
+ if (err)
+ err = -EFAULT;
+
+out:
+ mutex_unlock(&mce_chrdev_read_mutex);
+
+ return err ? err : buf - ubuf;
+}
+
+static __poll_t mce_chrdev_poll(struct file *file, poll_table *wait)
+{
+ poll_wait(file, &mce_chrdev_wait, wait);
+ if (READ_ONCE(mcelog->next))
+ return EPOLLIN | EPOLLRDNORM;
+ if (!mce_apei_read_done && apei_check_mce())
+ return EPOLLIN | EPOLLRDNORM;
+ return 0;
+}
+
+static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
+ unsigned long arg)
+{
+ int __user *p = (int __user *)arg;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case MCE_GET_RECORD_LEN:
+ return put_user(sizeof(struct mce), p);
+ case MCE_GET_LOG_LEN:
+ return put_user(mcelog->len, p);
+ case MCE_GETCLEAR_FLAGS:
+ return put_user(xchg(&mcelog->flags, 0), p);
+ default:
+ return -ENOTTY;
+ }
+}
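For reference, an illustrative user-space sketch (not part of this patch) of how a consumer such as mcelog would typically drive the interfaces above: it queries the record and log lengths via the ioctls, then issues a single full-size read as required by mce_chrdev_read(). The MCE_GET_* constants and struct mce are assumed to come from the x86 uapi <asm/mce.h> header; error handling is omitted for brevity.

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <asm/mce.h>	/* struct mce, MCE_GET_RECORD_LEN, MCE_GET_LOG_LEN */

int main(void)
{
	int fd = open("/dev/mcelog", O_RDONLY);
	int reclen = 0, loglen = 0;
	char *buf;
	ssize_t n;

	ioctl(fd, MCE_GET_RECORD_LEN, &reclen);
	ioctl(fd, MCE_GET_LOG_LEN, &loglen);

	/* mce_chrdev_read() rejects reads smaller than the whole buffer */
	buf = malloc((size_t)reclen * loglen);
	n = read(fd, buf, (size_t)reclen * loglen);
	if (n > 0)
		printf("got %zd bytes (%zd records)\n", n, n / reclen);

	free(buf);
	close(fd);
	return 0;
}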
+
+void mce_register_injector_chain(struct notifier_block *nb)
+{
+ blocking_notifier_chain_register(&mce_injector_chain, nb);
+}
+EXPORT_SYMBOL_GPL(mce_register_injector_chain);
+
+void mce_unregister_injector_chain(struct notifier_block *nb)
+{
+ blocking_notifier_chain_unregister(&mce_injector_chain, nb);
+}
+EXPORT_SYMBOL_GPL(mce_unregister_injector_chain);
+
+static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
+ size_t usize, loff_t *off)
+{
+ struct mce m;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ /*
+ * There are some cases where real MSR reads could slip
+ * through.
+ */
+ if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA))
+ return -EIO;
+
+ if ((unsigned long)usize > sizeof(struct mce))
+ usize = sizeof(struct mce);
+ if (copy_from_user(&m, ubuf, usize))
+ return -EFAULT;
+
+ if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu))
+ return -EINVAL;
+
+ /*
+ * Need to give user space some time to set everything up,
+ * so do it a jiffy or two later everywhere.
+ */
+ schedule_timeout(2);
+
+ blocking_notifier_call_chain(&mce_injector_chain, 0, &m);
+
+ return usize;
+}
+
+static const struct file_operations mce_chrdev_ops = {
+ .open = mce_chrdev_open,
+ .release = mce_chrdev_release,
+ .read = mce_chrdev_read,
+ .write = mce_chrdev_write,
+ .poll = mce_chrdev_poll,
+ .unlocked_ioctl = mce_chrdev_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
+};
+
+static struct miscdevice mce_chrdev_device = {
+ MISC_MCELOG_MINOR,
+ "mcelog",
+ &mce_chrdev_ops,
+};
+
+static __init int dev_mcelog_init_device(void)
+{
+ int mce_log_len;
+ int err;
+
+ mce_log_len = max(MCE_LOG_MIN_LEN, num_online_cpus());
+ mcelog = kzalloc(struct_size(mcelog, entry, mce_log_len), GFP_KERNEL);
+ if (!mcelog)
+ return -ENOMEM;
+
+ memcpy(mcelog->signature, MCE_LOG_SIGNATURE, sizeof(mcelog->signature));
+ mcelog->len = mce_log_len;
+ mcelog->recordlen = sizeof(struct mce);
+
+ /* register character device /dev/mcelog */
+ err = misc_register(&mce_chrdev_device);
+ if (err) {
+ if (err == -EBUSY)
+ /* Xen dom0 might have registered the device already. */
+ pr_info("Unable to init device /dev/mcelog, already registered\n");
+ else
+ pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
+
+ kfree(mcelog);
+ return err;
+ }
+
+ mce_register_decode_chain(&dev_mcelog_nb);
+ return 0;
+}
+device_initcall_sync(dev_mcelog_init_device);
diff --git a/arch/x86/kernel/cpu/mce/genpool.c b/arch/x86/kernel/cpu/mce/genpool.c
new file mode 100644
index 000000000000..3ca9c007a666
--- /dev/null
+++ b/arch/x86/kernel/cpu/mce/genpool.c
@@ -0,0 +1,156 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * MCE event pool management in MCE context
+ *
+ * Copyright (C) 2015 Intel Corp.
+ * Author: Chen, Gong <gong.chen@linux.intel.com>
+ */
+#include <linux/smp.h>
+#include <linux/mm.h>
+#include <linux/genalloc.h>
+#include <linux/llist.h>
+#include "internal.h"
+
+/*
+ * printk() is not safe in MCE context. This is a lock-less memory allocator
+ * used to save error information organized in a lock-less list.
+ *
+ * This memory pool is only to be used to save MCE records in MCE context.
+ * MCE events are rare, so a fixed size memory pool should be enough.
+ * Allocate on a sliding scale based on number of CPUs.
+ */
+#define MCE_MIN_ENTRIES 80
+#define MCE_PER_CPU 2
+
+static struct gen_pool *mce_evt_pool;
+static LLIST_HEAD(mce_event_llist);
+
+/*
+ * Compare the record "t" with each of the records on list "l" to see if
+ * an equivalent one is present in the list.
+ */
+static bool is_duplicate_mce_record(struct mce_evt_llist *t, struct mce_evt_llist *l)
+{
+ struct mce_hw_err *err1, *err2;
+ struct mce_evt_llist *node;
+
+ err1 = &t->err;
+
+ llist_for_each_entry(node, &l->llnode, llnode) {
+ err2 = &node->err;
+
+ if (!mce_cmp(&err1->m, &err2->m))
+ return true;
+ }
+ return false;
+}
+
+/*
+ * The system has panicked - we'd like to peruse the list of MCE records
+ * that have been queued, but not seen by anyone yet. The list is in
+ * reverse time order, so we need to reverse it. While doing that we can
+ * also drop duplicate records (these were logged because some banks are
+ * shared between cores or by all threads on a socket).
+ */
+struct llist_node *mce_gen_pool_prepare_records(void)
+{
+ struct llist_node *head;
+ LLIST_HEAD(new_head);
+ struct mce_evt_llist *node, *t;
+
+ head = llist_del_all(&mce_event_llist);
+ if (!head)
+ return NULL;
+
+ /* squeeze out duplicates while reversing order */
+ llist_for_each_entry_safe(node, t, head, llnode) {
+ if (!is_duplicate_mce_record(node, t))
+ llist_add(&node->llnode, &new_head);
+ }
+
+ return new_head.first;
+}
+
+void mce_gen_pool_process(struct work_struct *__unused)
+{
+ struct mce_evt_llist *node, *tmp;
+ struct llist_node *head;
+ struct mce *mce;
+
+ head = llist_del_all(&mce_event_llist);
+ if (!head)
+ return;
+
+ head = llist_reverse_order(head);
+ llist_for_each_entry_safe(node, tmp, head, llnode) {
+ mce = &node->err.m;
+ blocking_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
+ gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node));
+ }
+}
+
+bool mce_gen_pool_empty(void)
+{
+ return llist_empty(&mce_event_llist);
+}
+
+bool mce_gen_pool_add(struct mce_hw_err *err)
+{
+ struct mce_evt_llist *node;
+
+ if (filter_mce(&err->m))
+ return false;
+
+ if (!mce_evt_pool)
+ return false;
+
+ node = (void *)gen_pool_alloc(mce_evt_pool, sizeof(*node));
+ if (!node) {
+ pr_warn_ratelimited("MCE records pool full!\n");
+ return false;
+ }
+
+ memcpy(&node->err, err, sizeof(*err));
+ llist_add(&node->llnode, &mce_event_llist);
+
+ return true;
+}
+
+static bool mce_gen_pool_create(void)
+{
+ int mce_numrecords, mce_poolsz, order;
+ struct gen_pool *gpool;
+ void *mce_pool;
+
+ order = order_base_2(sizeof(struct mce_evt_llist));
+ gpool = gen_pool_create(order, -1);
+ if (!gpool)
+ return false;
+
+ mce_numrecords = max(MCE_MIN_ENTRIES, num_possible_cpus() * MCE_PER_CPU);
+ mce_poolsz = mce_numrecords * (1 << order);
+ mce_pool = kmalloc(mce_poolsz, GFP_KERNEL);
+ if (!mce_pool) {
+ gen_pool_destroy(gpool);
+ return false;
+ }
+
+ if (gen_pool_add(gpool, (unsigned long)mce_pool, mce_poolsz, -1)) {
+ gen_pool_destroy(gpool);
+ kfree(mce_pool);
+ return false;
+ }
+
+ mce_evt_pool = gpool;
+
+ return true;
+}
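To make the sliding-scale sizing above concrete, a worked example under an assumed chunk size (the real sizeof(struct mce_evt_llist), and hence order_base_2() of it, depends on this tree's struct layout):

    order = 9  ->  each chunk is 1 << 9 = 512 bytes
    128 possible CPUs:  mce_numrecords = max(80, 128 * 2) = 256,  mce_poolsz = 256 * 512 = 128 KiB
     16 possible CPUs:  mce_numrecords = max(80,  16 * 2) =  80,  mce_poolsz =  80 * 512 =  40 KiB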
+
+bool mce_gen_pool_init(void)
+{
+ /* Just init mce_gen_pool once. */
+ if (mce_evt_pool)
+ return true;
+
+ return mce_gen_pool_create();
+}
diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c
new file mode 100644
index 000000000000..d02c4f556cd0
--- /dev/null
+++ b/arch/x86/kernel/cpu/mce/inject.c
@@ -0,0 +1,805 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Machine check injection support.
+ * Copyright 2008 Intel Corporation.
+ *
+ * Authors:
+ * Andi Kleen
+ * Ying Huang
+ *
+ * The AMD part (from mce_amd_inj.c): a simple MCE injection facility
+ * for testing different aspects of the RAS code. This driver should be
+ * built as module so that it can be loaded on production kernels for
+ * testing purposes.
+ *
+ * Copyright (c) 2010-17: Borislav Petkov <bp@alien8.de>
+ * Advanced Micro Devices Inc.
+ */
+
+#include <linux/cpu.h>
+#include <linux/debugfs.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <linux/pci.h>
+#include <linux/uaccess.h>
+
+#include <asm/amd/nb.h>
+#include <asm/apic.h>
+#include <asm/irq_vectors.h>
+#include <asm/mce.h>
+#include <asm/msr.h>
+#include <asm/nmi.h>
+#include <asm/smp.h>
+
+#include "internal.h"
+
+static bool hw_injection_possible = true;
+
+/*
+ * Collect all the MCi_XXX settings
+ */
+static struct mce i_mce;
+static struct dentry *dfs_inj;
+
+#define MAX_FLAG_OPT_SIZE 4
+#define NBCFG 0x44
+
+enum injection_type {
+ SW_INJ = 0, /* SW injection, simply decode the error */
+ HW_INJ, /* Trigger a #MC */
+ DFR_INT_INJ, /* Trigger Deferred error interrupt */
+ THR_INT_INJ, /* Trigger threshold interrupt */
+ N_INJ_TYPES,
+};
+
+static const char * const flags_options[] = {
+ [SW_INJ] = "sw",
+ [HW_INJ] = "hw",
+ [DFR_INT_INJ] = "df",
+ [THR_INT_INJ] = "th",
+ NULL
+};
+
+/* Set default injection to SW_INJ */
+static enum injection_type inj_type = SW_INJ;
+
+#define MCE_INJECT_SET(reg) \
+static int inj_##reg##_set(void *data, u64 val) \
+{ \
+ struct mce *m = (struct mce *)data; \
+ \
+ m->reg = val; \
+ return 0; \
+}
+
+MCE_INJECT_SET(status);
+MCE_INJECT_SET(misc);
+MCE_INJECT_SET(addr);
+MCE_INJECT_SET(synd);
+
+#define MCE_INJECT_GET(reg) \
+static int inj_##reg##_get(void *data, u64 *val) \
+{ \
+ struct mce *m = (struct mce *)data; \
+ \
+ *val = m->reg; \
+ return 0; \
+}
+
+MCE_INJECT_GET(status);
+MCE_INJECT_GET(misc);
+MCE_INJECT_GET(addr);
+MCE_INJECT_GET(synd);
+MCE_INJECT_GET(ipid);
+
+DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n");
+
+/* Use the user provided IPID value on a sw injection. */
+static int inj_ipid_set(void *data, u64 val)
+{
+ struct mce *m = (struct mce *)data;
+
+ if (cpu_feature_enabled(X86_FEATURE_SMCA)) {
+ if (inj_type == SW_INJ)
+ m->ipid = val;
+ }
+
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(ipid_fops, inj_ipid_get, inj_ipid_set, "%llx\n");
+
+static void setup_inj_struct(struct mce *m)
+{
+ memset(m, 0, sizeof(struct mce));
+
+ m->cpuvendor = boot_cpu_data.x86_vendor;
+ m->time = ktime_get_real_seconds();
+ m->cpuid = cpuid_eax(1);
+ m->microcode = boot_cpu_data.microcode;
+}
+
+/* Update fake mce registers on current CPU. */
+static void inject_mce(struct mce *m)
+{
+ struct mce *i = &per_cpu(injectm, m->extcpu);
+
+ /* Make sure no one reads partially written injectm */
+ i->finished = 0;
+ mb();
+ m->finished = 0;
+ /* First set the fields after finished */
+ i->extcpu = m->extcpu;
+ mb();
+ /* Now write record in order, finished last (except above) */
+ memcpy(i, m, sizeof(struct mce));
+ /* Finally activate it */
+ mb();
+ i->finished = 1;
+}
+
+static void raise_poll(struct mce *m)
+{
+ unsigned long flags;
+ mce_banks_t b;
+
+ memset(&b, 0xff, sizeof(mce_banks_t));
+ local_irq_save(flags);
+ machine_check_poll(0, &b);
+ local_irq_restore(flags);
+ m->finished = 0;
+}
+
+static void raise_exception(struct mce *m, struct pt_regs *pregs)
+{
+ struct pt_regs regs;
+ unsigned long flags;
+
+ if (!pregs) {
+ memset(&regs, 0, sizeof(struct pt_regs));
+ regs.ip = m->ip;
+ regs.cs = m->cs;
+ pregs = &regs;
+ }
+ /* do_machine_check() expects interrupts disabled -- at least */
+ local_irq_save(flags);
+ do_machine_check(pregs);
+ local_irq_restore(flags);
+ m->finished = 0;
+}
+
+static cpumask_var_t mce_inject_cpumask;
+static DEFINE_MUTEX(mce_inject_mutex);
+
+static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs)
+{
+ int cpu = smp_processor_id();
+ struct mce *m = this_cpu_ptr(&injectm);
+ if (!cpumask_test_cpu(cpu, mce_inject_cpumask))
+ return NMI_DONE;
+ cpumask_clear_cpu(cpu, mce_inject_cpumask);
+ if (m->inject_flags & MCJ_EXCEPTION)
+ raise_exception(m, regs);
+ else if (m->status)
+ raise_poll(m);
+ return NMI_HANDLED;
+}
+
+static void mce_irq_ipi(void *info)
+{
+ int cpu = smp_processor_id();
+ struct mce *m = this_cpu_ptr(&injectm);
+
+ if (cpumask_test_cpu(cpu, mce_inject_cpumask) &&
+ m->inject_flags & MCJ_EXCEPTION) {
+ cpumask_clear_cpu(cpu, mce_inject_cpumask);
+ raise_exception(m, NULL);
+ }
+}
+
+/* Inject mce on current CPU */
+static int raise_local(void)
+{
+ struct mce *m = this_cpu_ptr(&injectm);
+ int context = MCJ_CTX(m->inject_flags);
+ int ret = 0;
+ int cpu = m->extcpu;
+
+ if (m->inject_flags & MCJ_EXCEPTION) {
+ pr_info("Triggering MCE exception on CPU %d\n", cpu);
+ switch (context) {
+ case MCJ_CTX_IRQ:
+ /*
+ * Could do more to fake interrupts like
+ * calling irq_enter, but the necessary
+ * machinery isn't exported currently.
+ */
+ fallthrough;
+ case MCJ_CTX_PROCESS:
+ raise_exception(m, NULL);
+ break;
+ default:
+ pr_info("Invalid MCE context\n");
+ ret = -EINVAL;
+ }
+ pr_info("MCE exception done on CPU %d\n", cpu);
+ } else if (m->status) {
+ pr_info("Starting machine check poll CPU %d\n", cpu);
+ raise_poll(m);
+ pr_info("Machine check poll done on CPU %d\n", cpu);
+ } else
+ m->finished = 0;
+
+ return ret;
+}
+
+static void __maybe_unused raise_mce(struct mce *m)
+{
+ int context = MCJ_CTX(m->inject_flags);
+
+ inject_mce(m);
+
+ if (context == MCJ_CTX_RANDOM)
+ return;
+
+ if (m->inject_flags & (MCJ_IRQ_BROADCAST | MCJ_NMI_BROADCAST)) {
+ unsigned long start;
+ int cpu;
+
+ cpus_read_lock();
+ cpumask_copy(mce_inject_cpumask, cpu_online_mask);
+ cpumask_clear_cpu(get_cpu(), mce_inject_cpumask);
+ for_each_online_cpu(cpu) {
+ struct mce *mcpu = &per_cpu(injectm, cpu);
+ if (!mcpu->finished ||
+ MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM)
+ cpumask_clear_cpu(cpu, mce_inject_cpumask);
+ }
+ if (!cpumask_empty(mce_inject_cpumask)) {
+ if (m->inject_flags & MCJ_IRQ_BROADCAST) {
+ /*
+ * Don't wait: mce_irq_ipi() needs to run concurrently
+ * with the raise_local() call that follows.
+ */
+ preempt_disable();
+ smp_call_function_many(mce_inject_cpumask,
+ mce_irq_ipi, NULL, 0);
+ preempt_enable();
+ } else if (m->inject_flags & MCJ_NMI_BROADCAST)
+ __apic_send_IPI_mask(mce_inject_cpumask, NMI_VECTOR);
+ }
+ start = jiffies;
+ while (!cpumask_empty(mce_inject_cpumask)) {
+ if (!time_before(jiffies, start + 2*HZ)) {
+ pr_err("Timeout waiting for mce inject %lx\n",
+ *cpumask_bits(mce_inject_cpumask));
+ break;
+ }
+ cpu_relax();
+ }
+ raise_local();
+ put_cpu();
+ cpus_read_unlock();
+ } else {
+ preempt_disable();
+ raise_local();
+ preempt_enable();
+ }
+}
+
+static int mce_inject_raise(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct mce *m = (struct mce *)data;
+
+ if (!m)
+ return NOTIFY_DONE;
+
+ mutex_lock(&mce_inject_mutex);
+ raise_mce(m);
+ mutex_unlock(&mce_inject_mutex);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block inject_nb = {
+ .notifier_call = mce_inject_raise,
+};
+
+/*
+ * Caller needs to make sure this cpu doesn't disappear
+ * from under us, i.e.: get_cpu/put_cpu.
+ */
+static int toggle_hw_mce_inject(unsigned int cpu, bool enable)
+{
+ u32 l, h;
+ int err;
+
+ err = rdmsr_on_cpu(cpu, MSR_K7_HWCR, &l, &h);
+ if (err) {
+ pr_err("%s: error reading HWCR\n", __func__);
+ return err;
+ }
+
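+ /* HWCR bit 18 is McStatusWrEn: it gates (injection) writes to the MCA status MSRs */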
+ enable ? (l |= BIT(18)) : (l &= ~BIT(18));
+
+ err = wrmsr_on_cpu(cpu, MSR_K7_HWCR, l, h);
+ if (err)
+ pr_err("%s: error writing HWCR\n", __func__);
+
+ return err;
+}
+
+static int __set_inj(const char *buf)
+{
+ int i;
+
+ for (i = 0; i < N_INJ_TYPES; i++) {
+ if (!strncmp(flags_options[i], buf, strlen(flags_options[i]))) {
+ if (i > SW_INJ && !hw_injection_possible)
+ continue;
+ inj_type = i;
+ return 0;
+ }
+ }
+ return -EINVAL;
+}
+
+static ssize_t flags_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[MAX_FLAG_OPT_SIZE];
+ int n;
+
+ n = sprintf(buf, "%s\n", flags_options[inj_type]);
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, n);
+}
+
+static ssize_t flags_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[MAX_FLAG_OPT_SIZE], *__buf;
+ int err;
+
+ if (!cnt || cnt > MAX_FLAG_OPT_SIZE)
+ return -EINVAL;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt - 1] = 0;
+
+ /* strip whitespace */
+ __buf = strstrip(buf);
+
+ err = __set_inj(__buf);
+ if (err) {
+ pr_err("%s: Invalid flags value: %s\n", __func__, __buf);
+ return err;
+ }
+
+ *ppos += cnt;
+
+ return cnt;
+}
+
+static const struct file_operations flags_fops = {
+ .read = flags_read,
+ .write = flags_write,
+ .llseek = generic_file_llseek,
+};
+
+/*
+ * On which CPU to inject?
+ */
+MCE_INJECT_GET(extcpu);
+
+static int inj_extcpu_set(void *data, u64 val)
+{
+ struct mce *m = (struct mce *)data;
+
+ if (val >= nr_cpu_ids || !cpu_online(val)) {
+ pr_err("%s: Invalid CPU: %llu\n", __func__, val);
+ return -EINVAL;
+ }
+ m->extcpu = val;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(extcpu_fops, inj_extcpu_get, inj_extcpu_set, "%llu\n");
+
+static void trigger_mce(void *info)
+{
+ asm volatile("int $18");
+}
+
+static void trigger_dfr_int(void *info)
+{
+ asm volatile("int %0" :: "i" (DEFERRED_ERROR_VECTOR));
+}
+
+static void trigger_thr_int(void *info)
+{
+ asm volatile("int %0" :: "i" (THRESHOLD_APIC_VECTOR));
+}
+
+static u32 get_nbc_for_node(int node_id)
+{
+ u32 cores_per_node;
+
+ cores_per_node = topology_num_threads_per_package() / topology_amd_nodes_per_pkg();
+ return cores_per_node * node_id;
+}
+
+static void toggle_nb_mca_mst_cpu(u16 nid)
+{
+ struct amd_northbridge *nb;
+ struct pci_dev *F3;
+ u32 val;
+ int err;
+
+ nb = node_to_amd_nb(nid);
+ if (!nb)
+ return;
+
+ F3 = nb->misc;
+ if (!F3)
+ return;
+
+ err = pci_read_config_dword(F3, NBCFG, &val);
+ if (err) {
+ pr_err("%s: Error reading F%dx%03x.\n",
+ __func__, PCI_FUNC(F3->devfn), NBCFG);
+ return;
+ }
+
+ if (val & BIT(27))
+ return;
+
+ pr_err("%s: Set D18F3x44[NbMcaToMstCpuEn] which BIOS hasn't done.\n",
+ __func__);
+
+ val |= BIT(27);
+ err = pci_write_config_dword(F3, NBCFG, val);
+ if (err)
+ pr_err("%s: Error writing F%dx%03x.\n",
+ __func__, PCI_FUNC(F3->devfn), NBCFG);
+}
+
+static void prepare_msrs(void *info)
+{
+ struct mce m = *(struct mce *)info;
+ u8 b = m.bank;
+
+ wrmsrq(MSR_IA32_MCG_STATUS, m.mcgstatus);
+
+ if (boot_cpu_has(X86_FEATURE_SMCA)) {
+ if (m.inject_flags == DFR_INT_INJ) {
+ wrmsrq(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status);
+ wrmsrq(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr);
+ } else {
+ wrmsrq(MSR_AMD64_SMCA_MCx_STATUS(b), m.status);
+ wrmsrq(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr);
+ }
+
+ wrmsrq(MSR_AMD64_SMCA_MCx_SYND(b), m.synd);
+
+ if (m.misc)
+ wrmsrq(MSR_AMD64_SMCA_MCx_MISC(b), m.misc);
+ } else {
+ wrmsrq(MSR_IA32_MCx_STATUS(b), m.status);
+ wrmsrq(MSR_IA32_MCx_ADDR(b), m.addr);
+
+ if (m.misc)
+ wrmsrq(MSR_IA32_MCx_MISC(b), m.misc);
+ }
+}
+
+static void do_inject(void)
+{
+ unsigned int cpu = i_mce.extcpu;
+ struct mce_hw_err err;
+ u64 mcg_status = 0;
+ u8 b = i_mce.bank;
+
+ i_mce.tsc = rdtsc_ordered();
+
+ i_mce.status |= MCI_STATUS_VAL;
+
+ if (i_mce.misc)
+ i_mce.status |= MCI_STATUS_MISCV;
+
+ if (i_mce.synd)
+ i_mce.status |= MCI_STATUS_SYNDV;
+
+ if (inj_type == SW_INJ) {
+ err.m = i_mce;
+ mce_log(&err);
+ return;
+ }
+
+ /* prep MCE global settings for the injection */
+ mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV;
+
+ if (!(i_mce.status & MCI_STATUS_PCC))
+ mcg_status |= MCG_STATUS_RIPV;
+
+ /*
+ * Ensure necessary status bits for deferred errors:
+ * - MCx_STATUS[Deferred]: make sure it is a deferred error
+ * - MCx_STATUS[UC] cleared: deferred errors are _not_ UC
+ */
+ if (inj_type == DFR_INT_INJ) {
+ i_mce.status |= MCI_STATUS_DEFERRED;
+ i_mce.status &= ~MCI_STATUS_UC;
+ }
+
+ /*
+ * For multi node CPUs, logging and reporting of bank 4 errors happens
+ * only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for
+ * Fam10h and later BKDGs.
+ */
+ if (boot_cpu_has(X86_FEATURE_AMD_DCM) &&
+ b == 4 &&
+ boot_cpu_data.x86 < 0x17) {
+ toggle_nb_mca_mst_cpu(topology_amd_node_id(cpu));
+ cpu = get_nbc_for_node(topology_amd_node_id(cpu));
+ }
+
+ cpus_read_lock();
+ if (!cpu_online(cpu))
+ goto err;
+
+ toggle_hw_mce_inject(cpu, true);
+
+ i_mce.mcgstatus = mcg_status;
+ i_mce.inject_flags = inj_type;
+ smp_call_function_single(cpu, prepare_msrs, &i_mce, 0);
+
+ toggle_hw_mce_inject(cpu, false);
+
+ switch (inj_type) {
+ case DFR_INT_INJ:
+ smp_call_function_single(cpu, trigger_dfr_int, NULL, 0);
+ break;
+ case THR_INT_INJ:
+ smp_call_function_single(cpu, trigger_thr_int, NULL, 0);
+ break;
+ default:
+ smp_call_function_single(cpu, trigger_mce, NULL, 0);
+ }
+
+err:
+ cpus_read_unlock();
+
+}
+
+/*
+ * This denotes into which bank we're injecting and triggers
+ * the injection, at the same time.
+ */
+static int inj_bank_set(void *data, u64 val)
+{
+ struct mce *m = (struct mce *)data;
+ u8 n_banks;
+ u64 cap;
+
+ /* Get bank count on target CPU so we can handle non-uniform values. */
+ rdmsrq_on_cpu(m->extcpu, MSR_IA32_MCG_CAP, &cap);
+ n_banks = cap & MCG_BANKCNT_MASK;
+
+ if (val >= n_banks) {
+ pr_err("MCA bank %llu non-existent on CPU%d\n", val, m->extcpu);
+ return -EINVAL;
+ }
+
+ m->bank = val;
+
+ /*
+ * sw-only injection allows writing arbitrary values into the MCA
+ * registers because it tests only the decoding paths.
+ */
+ if (inj_type == SW_INJ)
+ goto inject;
+
+ /*
+ * Read IPID value to determine if a bank is populated on the target
+ * CPU.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_SMCA)) {
+ u64 ipid;
+
+ if (rdmsrq_on_cpu(m->extcpu, MSR_AMD64_SMCA_MCx_IPID(val), &ipid)) {
+ pr_err("Error reading IPID on CPU%d\n", m->extcpu);
+ return -EINVAL;
+ }
+
+ if (!ipid) {
+ pr_err("Cannot inject into unpopulated bank %llu\n", val);
+ return -ENODEV;
+ }
+ }
+
+inject:
+ do_inject();
+
+ /* Reset injection struct */
+ setup_inj_struct(&i_mce);
+
+ return 0;
+}
+
+MCE_INJECT_GET(bank);
+
+DEFINE_SIMPLE_ATTRIBUTE(bank_fops, inj_bank_get, inj_bank_set, "%llu\n");
+
+static const char readme_msg[] =
+"Description of the files and their usages:\n"
+"\n"
+"Note1: i refers to the bank number below.\n"
+"Note2: See respective BKDGs for the exact bit definitions of the files below\n"
+"as they mirror the hardware registers.\n"
+"\n"
+"status:\t Set MCi_STATUS: the bits in that MSR control the error type and\n"
+"\t attributes of the error which caused the MCE.\n"
+"\n"
+"misc:\t Set MCi_MISC: provide auxiliary info about the error. It is mostly\n"
+"\t used for error thresholding purposes and its validity is indicated by\n"
+"\t MCi_STATUS[MiscV].\n"
+"\n"
+"synd:\t Set MCi_SYND: provide syndrome info about the error. Only valid on\n"
+"\t Scalable MCA systems, and its validity is indicated by MCi_STATUS[SyndV].\n"
+"\n"
+"addr:\t Error address value to be written to MCi_ADDR. Log address information\n"
+"\t associated with the error.\n"
+"\n"
+"cpu:\t The CPU to inject the error on.\n"
+"\n"
+"bank:\t Specify the bank you want to inject the error into: the number of\n"
+"\t banks in a processor varies and is family/model-specific, therefore, the\n"
+"\t supplied value is sanity-checked. Setting the bank value also triggers the\n"
+"\t injection.\n"
+"\n"
+"flags:\t Injection type to be performed. Writing to this file will trigger a\n"
+"\t real machine check, an APIC interrupt or invoke the error decoder routines\n"
+"\t for AMD processors.\n"
+"\n"
+"\t Allowed error injection types:\n"
+"\t - \"sw\": Software error injection. Decode error to a human-readable \n"
+"\t format only. Safe to use.\n"
+"\t - \"hw\": Hardware error injection. Causes the #MC exception handler to \n"
+"\t handle the error. Be warned: might cause system panic if MCi_STATUS[PCC] \n"
+"\t is set. Therefore, consider setting (debugfs_mountpoint)/mce/fake_panic \n"
+"\t before injecting.\n"
+"\t - \"df\": Trigger APIC interrupt for Deferred error. Causes deferred \n"
+"\t error APIC interrupt handler to handle the error if the feature is \n"
+"\t is present in hardware. \n"
+"\t - \"th\": Trigger APIC interrupt for Threshold errors. Causes threshold \n"
+"\t APIC interrupt handler to handle the error. \n"
+"\n"
+"ipid:\t IPID (AMD-specific)\n"
+"\n";
+
+static ssize_t
+inj_readme_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return simple_read_from_buffer(ubuf, cnt, ppos,
+ readme_msg, strlen(readme_msg));
+}
+
+static const struct file_operations readme_fops = {
+ .read = inj_readme_read,
+};
+
+static struct dfs_node {
+ char *name;
+ const struct file_operations *fops;
+ umode_t perm;
+} dfs_fls[] = {
+ { .name = "status", .fops = &status_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "misc", .fops = &misc_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "addr", .fops = &addr_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "synd", .fops = &synd_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "ipid", .fops = &ipid_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "bank", .fops = &bank_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "flags", .fops = &flags_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "cpu", .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "README", .fops = &readme_fops, .perm = S_IRUSR | S_IRGRP | S_IROTH },
+};
+
+static void __init debugfs_init(void)
+{
+ unsigned int i;
+
+ dfs_inj = debugfs_create_dir("mce-inject", NULL);
+
+ for (i = 0; i < ARRAY_SIZE(dfs_fls); i++)
+ debugfs_create_file(dfs_fls[i].name, dfs_fls[i].perm, dfs_inj,
+ &i_mce, dfs_fls[i].fops);
+}
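An illustrative injection sequence using the debugfs files created above, assuming debugfs is mounted at /sys/kernel/debug; the register values are placeholders, not meaningful encodings. Per the README text, writing "bank" is what actually fires the injection:

    echo sw                 > /sys/kernel/debug/mce-inject/flags    software-only: decode, do not raise #MC
    echo 0                  > /sys/kernel/debug/mce-inject/cpu
    echo 0xdead0000deadbeef > /sys/kernel/debug/mce-inject/status
    echo 0x1000             > /sys/kernel/debug/mce-inject/addr
    echo 2                  > /sys/kernel/debug/mce-inject/bank     triggers do_inject() via inj_bank_set()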
+
+static void check_hw_inj_possible(void)
+{
+ int cpu;
+ u8 bank;
+
+ /*
+ * This behavior exists only on SMCA systems though it's not directly
+ * related to SMCA.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_SMCA))
+ return;
+
+ cpu = get_cpu();
+
+ for (bank = 0; bank < MAX_NR_BANKS; ++bank) {
+ u64 status = MCI_STATUS_VAL, ipid;
+
+ /* Check whether bank is populated */
+ rdmsrq(MSR_AMD64_SMCA_MCx_IPID(bank), ipid);
+ if (!ipid)
+ continue;
+
+ toggle_hw_mce_inject(cpu, true);
+
+ wrmsrq_safe(mca_msr_reg(bank, MCA_STATUS), status);
+ rdmsrq_safe(mca_msr_reg(bank, MCA_STATUS), &status);
+ wrmsrq_safe(mca_msr_reg(bank, MCA_STATUS), 0);
+
+ if (!status) {
+ hw_injection_possible = false;
+ pr_warn("Platform does not allow *hardware* error injection. "
+ "Try using APEI EINJ instead.\n");
+ }
+
+ toggle_hw_mce_inject(cpu, false);
+
+ break;
+ }
+
+ put_cpu();
+}
+
+static int __init inject_init(void)
+{
+ if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL))
+ return -ENOMEM;
+
+ check_hw_inj_possible();
+
+ debugfs_init();
+
+ register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, "mce_notify");
+ mce_register_injector_chain(&inject_nb);
+
+ setup_inj_struct(&i_mce);
+
+ pr_info("Machine check injector initialized\n");
+
+ return 0;
+}
+
+static void __exit inject_exit(void)
+{
+
+ mce_unregister_injector_chain(&inject_nb);
+ unregister_nmi_handler(NMI_LOCAL, "mce_notify");
+
+ debugfs_remove_recursive(dfs_inj);
+ dfs_inj = NULL;
+
+ memset(&dfs_fls, 0, sizeof(dfs_fls));
+
+ free_cpumask_var(mce_inject_cpumask);
+}
+
+module_init(inject_init);
+module_exit(inject_exit);
+MODULE_DESCRIPTION("Machine check injection support");
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
new file mode 100644
index 000000000000..4655223ba560
--- /dev/null
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -0,0 +1,537 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel specific MCE features.
+ * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
+ * Copyright (C) 2008, 2009 Intel Corporation
+ * Author: Andi Kleen
+ */
+
+#include <linux/gfp.h>
+#include <linux/interrupt.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <asm/apic.h>
+#include <asm/cpufeature.h>
+#include <asm/cpu_device_id.h>
+#include <asm/processor.h>
+#include <asm/msr.h>
+#include <asm/mce.h>
+
+#include "internal.h"
+
+/*
+ * Support for Intel Corrected Machine Check Interrupts (CMCI). This allows
+ * the CPU to raise an interrupt when a corrected machine check happened.
+ * Normally we pick those up using a regular polling timer.
+ * Also supports reliable discovery of shared banks.
+ */
+
+/*
+ * CMCI can be delivered to multiple cpus that share a machine check bank
+ * so we need to designate a single cpu to process errors logged in each bank
+ * in the interrupt handler (otherwise we would have many races and potential
+ * double reporting of the same error).
+ * Note that this can change when a cpu is offlined or brought online since
+ * some MCA banks are shared across cpus. When a cpu is offlined, cmci_clear()
+ * disables CMCI on all banks owned by the cpu and clears this bitfield. At
+ * this point, cmci_rediscover() kicks in and a different cpu may end up
+ * taking ownership of some of the shared MCA banks that were previously
+ * owned by the offlined cpu.
+ */
+static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
+
+/*
+ * cmci_discover_lock protects against parallel discovery attempts
+ * which could race against each other.
+ */
+static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
+
+/*
+ * On systems that do support CMCI but it's disabled, polling for MCEs can
+ * cause the same event to be reported multiple times because IA32_MCi_STATUS
+ * is shared by all the CPUs in the same package.
+ */
+static DEFINE_SPINLOCK(cmci_poll_lock);
+
+/* Linux non-storm CMCI threshold (may be overridden by BIOS) */
+#define CMCI_THRESHOLD 1
+
+/*
+ * MCi_CTL2 threshold for each bank when there is no storm.
+ * Default value for each bank may have been set by BIOS.
+ */
+static u16 cmci_threshold[MAX_NR_BANKS];
+
+/*
+ * High threshold to limit CMCI rate during storms. Max supported is
+ * 0x7FFF. Use this slightly smaller value so it has a distinctive
+ * signature when someone asks "Why am I not seeing all corrected errors?"
+ * A high threshold is used instead of just disabling CMCI for a
+ * bank because both corrected and uncorrected errors may be logged
+ * in the same bank and signalled with CMCI. The threshold only applies
+ * to corrected errors, so keeping CMCI enabled means that uncorrected
+ * errors will still be processed in a timely fashion.
+ */
+#define CMCI_STORM_THRESHOLD 32749
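For orientation: 32749 is 0x7FED, just below the architectural maximum of 0x7FFF that fits in the MCi_CTL2 threshold field (MCI_CTL2_CMCI_THRESHOLD_MASK) manipulated by cmci_set_threshold() below.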
+
+static bool cmci_supported(int *banks)
+{
+ u64 cap;
+
+ if (mca_cfg.cmci_disabled || mca_cfg.ignore_ce)
+ return false;
+
+ /*
+ * Vendor check is not strictly needed, but the earlier MCA
+ * initialization is vendor keyed and this makes sure none of
+ * the backdoors are entered otherwise.
+ */
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN)
+ return false;
+
+ if (!boot_cpu_has(X86_FEATURE_APIC) || lapic_get_maxlvt() < 6)
+ return false;
+
+ rdmsrq(MSR_IA32_MCG_CAP, cap);
+ *banks = min_t(unsigned, MAX_NR_BANKS, cap & MCG_BANKCNT_MASK);
+ return !!(cap & MCG_CMCI_P);
+}
+
+static bool lmce_supported(void)
+{
+ u64 tmp;
+
+ if (mca_cfg.lmce_disabled)
+ return false;
+
+ rdmsrq(MSR_IA32_MCG_CAP, tmp);
+
+ /*
+ * LMCE depends on recovery support in the processor. Hence both
+ * MCG_SER_P and MCG_LMCE_P should be present in MCG_CAP.
+ */
+ if ((tmp & (MCG_SER_P | MCG_LMCE_P)) !=
+ (MCG_SER_P | MCG_LMCE_P))
+ return false;
+
+ /*
+ * BIOS should indicate support for LMCE by setting bit 20 in
+ * IA32_FEAT_CTL without which touching MCG_EXT_CTL will generate a #GP
+ * fault. The MSR must also be locked for LMCE_ENABLED to take effect.
+ * WARN if the MSR isn't locked as init_ia32_feat_ctl() unconditionally
+ * locks the MSR in the event that it wasn't already locked by BIOS.
+ */
+ rdmsrq(MSR_IA32_FEAT_CTL, tmp);
+ if (WARN_ON_ONCE(!(tmp & FEAT_CTL_LOCKED)))
+ return false;
+
+ return tmp & FEAT_CTL_LMCE_ENABLED;
+}
+
+/*
+ * Set a new CMCI threshold value. Preserve the state of the
+ * MCI_CTL2_CMCI_EN bit in case this happens during a
+ * cmci_rediscover() operation.
+ */
+static void cmci_set_threshold(int bank, int thresh)
+{
+ unsigned long flags;
+ u64 val;
+
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+ rdmsrq(MSR_IA32_MCx_CTL2(bank), val);
+ val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
+ wrmsrq(MSR_IA32_MCx_CTL2(bank), val | thresh);
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+}
+
+void mce_intel_handle_storm(int bank, bool on)
+{
+ if (on)
+ cmci_set_threshold(bank, CMCI_STORM_THRESHOLD);
+ else
+ cmci_set_threshold(bank, cmci_threshold[bank]);
+}
+
+/*
+ * The interrupt handler. This is called on every event.
+ * Just call the poller directly to log any events.
+ * This could in theory increase the threshold under high load,
+ * but doesn't for now.
+ */
+static void intel_threshold_interrupt(void)
+{
+ machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
+}
+
+/*
+ * Check all the reasons why current CPU cannot claim
+ * ownership of a bank.
+ * 1: CPU already owns this bank
+ * 2: BIOS owns this bank
+ * 3: Some other CPU owns this bank
+ */
+static bool cmci_skip_bank(int bank, u64 *val)
+{
+ unsigned long *owned = (void *)this_cpu_ptr(&mce_banks_owned);
+
+ if (test_bit(bank, owned))
+ return true;
+
+ /* Skip banks in firmware first mode */
+ if (test_bit(bank, mce_banks_ce_disabled))
+ return true;
+
+ rdmsrq(MSR_IA32_MCx_CTL2(bank), *val);
+
+ /* Already owned by someone else? */
+ if (*val & MCI_CTL2_CMCI_EN) {
+ clear_bit(bank, owned);
+ __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Decide which CMCI interrupt threshold to use:
+ * 1: If this bank is in storm mode from whichever CPU was
+ * the previous owner, stay in storm mode.
+ * 2: If ignoring any threshold set by BIOS, set Linux default
+ * 3: Try to honor BIOS threshold (unless buggy BIOS set it at zero).
+ */
+static u64 cmci_pick_threshold(u64 val, int *bios_zero_thresh)
+{
+ if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD)
+ return val;
+
+ if (!mca_cfg.bios_cmci_threshold) {
+ val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
+ val |= CMCI_THRESHOLD;
+ } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) {
+ /*
+ * If bios_cmci_threshold boot option was specified
+ * but the threshold is zero, we'll try to initialize
+ * it to 1.
+ */
+ *bios_zero_thresh = 1;
+ val |= CMCI_THRESHOLD;
+ }
+
+ return val;
+}
+
+/*
+ * Try to claim ownership of a bank.
+ */
+static void cmci_claim_bank(int bank, u64 val, int bios_zero_thresh, int *bios_wrong_thresh)
+{
+ struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
+
+ val |= MCI_CTL2_CMCI_EN;
+ wrmsrq(MSR_IA32_MCx_CTL2(bank), val);
+ rdmsrq(MSR_IA32_MCx_CTL2(bank), val);
+
+ /* If the enable bit did not stick, this bank should be polled. */
+ if (!(val & MCI_CTL2_CMCI_EN)) {
+ WARN_ON(!test_bit(bank, this_cpu_ptr(mce_poll_banks)));
+ storm->banks[bank].poll_only = true;
+ return;
+ }
+
+ /* This CPU successfully set the enable bit. */
+ set_bit(bank, (void *)this_cpu_ptr(&mce_banks_owned));
+
+ if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD) {
+ pr_notice("CPU%d BANK%d CMCI inherited storm\n", smp_processor_id(), bank);
+ mce_inherit_storm(bank);
+ cmci_storm_begin(bank);
+ } else {
+ __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
+ }
+
+ /*
+ * We are able to set thresholds for some banks that
+ * had a threshold of 0. This means the BIOS has not
+ * set the thresholds properly or does not work with
+ * this boot option. Note down now and report later.
+ */
+ if (mca_cfg.bios_cmci_threshold && bios_zero_thresh &&
+ (val & MCI_CTL2_CMCI_THRESHOLD_MASK))
+ *bios_wrong_thresh = 1;
+
+ /* Save default threshold for each bank */
+ if (cmci_threshold[bank] == 0)
+ cmci_threshold[bank] = val & MCI_CTL2_CMCI_THRESHOLD_MASK;
+}
+
+/*
+ * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
+ * on this CPU. Use the algorithm recommended in the SDM to discover shared
+ * banks. Called during initial bootstrap, and also for hotplug CPU operations
+ * to rediscover/reassign machine check banks.
+ */
+static void cmci_discover(int banks)
+{
+ int bios_wrong_thresh = 0;
+ unsigned long flags;
+ int i;
+
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+ for (i = 0; i < banks; i++) {
+ u64 val;
+ int bios_zero_thresh = 0;
+
+ if (cmci_skip_bank(i, &val))
+ continue;
+
+ val = cmci_pick_threshold(val, &bios_zero_thresh);
+ cmci_claim_bank(i, val, bios_zero_thresh, &bios_wrong_thresh);
+ }
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+ if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) {
+ pr_info_once(
+ "bios_cmci_threshold: Some banks do not have valid thresholds set\n");
+ pr_info_once(
+ "bios_cmci_threshold: Make sure your BIOS supports this boot option\n");
+ }
+}
+
+/*
+ * Just in case we missed an event during initialization check
+ * all the CMCI owned banks.
+ */
+void cmci_recheck(void)
+{
+ unsigned long flags;
+ int banks;
+
+ if (!mce_available(raw_cpu_ptr(&cpu_info)) || !cmci_supported(&banks))
+ return;
+
+ local_irq_save(flags);
+ machine_check_poll(0, this_cpu_ptr(&mce_banks_owned));
+ local_irq_restore(flags);
+}
+
+/* Caller must hold the lock on cmci_discover_lock */
+static void __cmci_disable_bank(int bank)
+{
+ u64 val;
+
+ if (!test_bit(bank, this_cpu_ptr(mce_banks_owned)))
+ return;
+ rdmsrq(MSR_IA32_MCx_CTL2(bank), val);
+ val &= ~MCI_CTL2_CMCI_EN;
+ wrmsrq(MSR_IA32_MCx_CTL2(bank), val);
+ __clear_bit(bank, this_cpu_ptr(mce_banks_owned));
+
+ if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD)
+ cmci_storm_end(bank);
+}
+
+/*
+ * Disable CMCI on this CPU for all banks it owns when it goes down.
+ * This allows other CPUs to claim the banks on rediscovery.
+ */
+void cmci_clear(void)
+{
+ unsigned long flags;
+ int i;
+ int banks;
+
+ if (!cmci_supported(&banks))
+ return;
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+ for (i = 0; i < banks; i++)
+ __cmci_disable_bank(i);
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+}
+
+static void cmci_rediscover_work_func(void *arg)
+{
+ int banks;
+
+ /* Recheck banks in case CPUs don't all have the same number of banks */
+ if (cmci_supported(&banks))
+ cmci_discover(banks);
+}
+
+/* After a CPU went down cycle through all the others and rediscover */
+void cmci_rediscover(void)
+{
+ int banks;
+
+ if (!cmci_supported(&banks))
+ return;
+
+ on_each_cpu(cmci_rediscover_work_func, NULL, 1);
+}
+
+/*
+ * Reenable CMCI on this CPU in case a CPU down failed.
+ */
+void cmci_reenable(void)
+{
+ int banks;
+ if (cmci_supported(&banks))
+ cmci_discover(banks);
+}
+
+void cmci_disable_bank(int bank)
+{
+ int banks;
+ unsigned long flags;
+
+ if (!cmci_supported(&banks))
+ return;
+
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+ __cmci_disable_bank(bank);
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+}
+
+/* Bank polling function when CMCI is disabled. */
+static void cmci_mc_poll_banks(void)
+{
+ spin_lock(&cmci_poll_lock);
+ machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
+ spin_unlock(&cmci_poll_lock);
+}
+
+void intel_init_cmci(void)
+{
+ int banks;
+
+ if (!cmci_supported(&banks)) {
+ mc_poll_banks = cmci_mc_poll_banks;
+ return;
+ }
+
+ mce_threshold_vector = intel_threshold_interrupt;
+ cmci_discover(banks);
+ /*
+ * For CPU #0 this runs with still disabled APIC, but that's
+ * ok because only the vector is set up. We still do another
+ * check for the banks later for CPU #0 just to make sure
+ * to not miss any events.
+ */
+ apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED);
+ cmci_recheck();
+}
+
+void intel_init_lmce(void)
+{
+ u64 val;
+
+ if (!lmce_supported())
+ return;
+
+ rdmsrq(MSR_IA32_MCG_EXT_CTL, val);
+
+ if (!(val & MCG_EXT_CTL_LMCE_EN))
+ wrmsrq(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN);
+}
+
+void intel_clear_lmce(void)
+{
+ u64 val;
+
+ if (!lmce_supported())
+ return;
+
+ rdmsrq(MSR_IA32_MCG_EXT_CTL, val);
+ val &= ~MCG_EXT_CTL_LMCE_EN;
+ wrmsrq(MSR_IA32_MCG_EXT_CTL, val);
+}
+
+/*
+ * Enable additional error logs from the integrated
+ * memory controller on processors that support this.
+ */
+static void intel_imc_init(struct cpuinfo_x86 *c)
+{
+ u64 error_control;
+
+ switch (c->x86_vfm) {
+ case INTEL_SANDYBRIDGE_X:
+ case INTEL_IVYBRIDGE_X:
+ case INTEL_HASWELL_X:
+ if (rdmsrq_safe(MSR_ERROR_CONTROL, &error_control))
+ return;
+ error_control |= 2;
+ wrmsrq_safe(MSR_ERROR_CONTROL, error_control);
+ break;
+ }
+}
+
+static void intel_apply_cpu_quirks(struct cpuinfo_x86 *c)
+{
+ /*
+ * SDM documents that on family 6 bank 0 should not be written
+ * because it aliases to another special BIOS controlled
+ * register.
+ * But it's not aliased anymore on model 0x1a+
+ * Don't ignore bank 0 completely because there could be a
+ * valid event later, merely don't write CTL0.
+ *
+ * Older CPUs (prior to family 6) can't reach this point and already
+ * return early due to the check of __mcheck_cpu_ancient_init().
+ */
+ if (c->x86_vfm < INTEL_NEHALEM_EP && this_cpu_read(mce_num_banks))
+ this_cpu_ptr(mce_banks_array)[0].init = false;
+}
+
+void mce_intel_feature_init(struct cpuinfo_x86 *c)
+{
+ intel_apply_cpu_quirks(c);
+ intel_init_cmci();
+ intel_init_lmce();
+ intel_imc_init(c);
+}
+
+void mce_intel_feature_clear(struct cpuinfo_x86 *c)
+{
+ intel_clear_lmce();
+ cmci_clear();
+}
+
+bool intel_filter_mce(struct mce *m)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ /* MCE errata HSD131, HSM142, HSW131, BDM48, HSM142 and SKX37 */
+ if ((c->x86_vfm == INTEL_HASWELL ||
+ c->x86_vfm == INTEL_HASWELL_L ||
+ c->x86_vfm == INTEL_BROADWELL ||
+ c->x86_vfm == INTEL_HASWELL_G ||
+ c->x86_vfm == INTEL_SKYLAKE_X) &&
+ (m->bank == 0) &&
+ ((m->status & 0xa0000000ffffffff) == 0x80000000000f0005))
+ return true;
+
+ return false;
+}
+
+/*
+ * Check if the address reported by the CPU is in a format we can parse.
+ * It would be possible to add code for most other cases, but all would
+ * be somewhat complicated (e.g. segment offset would require an instruction
+ * parser). So only support physical addresses up to page granularity for now.
+ */
+bool intel_mce_usable_address(struct mce *m)
+{
+ if (!(m->status & MCI_STATUS_MISCV))
+ return false;
+
+ if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
+ return false;
+
+ if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
+ return false;
+
+ return true;
+}
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
new file mode 100644
index 000000000000..a31cf984619c
--- /dev/null
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -0,0 +1,352 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __X86_MCE_INTERNAL_H__
+#define __X86_MCE_INTERNAL_H__
+
+#undef pr_fmt
+#define pr_fmt(fmt) "mce: " fmt
+
+#include <linux/device.h>
+#include <asm/mce.h>
+
+enum severity_level {
+ MCE_NO_SEVERITY,
+ MCE_DEFERRED_SEVERITY,
+ MCE_UCNA_SEVERITY = MCE_DEFERRED_SEVERITY,
+ MCE_KEEP_SEVERITY,
+ MCE_SOME_SEVERITY,
+ MCE_AO_SEVERITY,
+ MCE_UC_SEVERITY,
+ MCE_AR_SEVERITY,
+ MCE_PANIC_SEVERITY,
+};
+
+extern struct blocking_notifier_head x86_mce_decoder_chain;
+
+#define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */
+
+struct mce_evt_llist {
+ struct llist_node llnode;
+ struct mce_hw_err err;
+};
+
+void mce_gen_pool_process(struct work_struct *__unused);
+bool mce_gen_pool_empty(void);
+bool mce_gen_pool_add(struct mce_hw_err *err);
+bool mce_gen_pool_init(void);
+struct llist_node *mce_gen_pool_prepare_records(void);
+
+int mce_severity(struct mce *a, struct pt_regs *regs, char **msg, bool is_excp);
+struct dentry *mce_get_debugfs_dir(void);
+
+extern mce_banks_t mce_banks_ce_disabled;
+
+#ifdef CONFIG_X86_MCE_INTEL
+void mce_intel_handle_storm(int bank, bool on);
+void cmci_disable_bank(int bank);
+void intel_init_cmci(void);
+void intel_init_lmce(void);
+void intel_clear_lmce(void);
+bool intel_filter_mce(struct mce *m);
+bool intel_mce_usable_address(struct mce *m);
+#else
+static inline void mce_intel_handle_storm(int bank, bool on) { }
+static inline void cmci_disable_bank(int bank) { }
+static inline void intel_init_cmci(void) { }
+static inline void intel_init_lmce(void) { }
+static inline void intel_clear_lmce(void) { }
+static inline bool intel_filter_mce(struct mce *m) { return false; }
+static inline bool intel_mce_usable_address(struct mce *m) { return false; }
+#endif
+
+void mce_timer_kick(bool storm);
+
+#ifdef CONFIG_X86_MCE_THRESHOLD
+void cmci_storm_begin(unsigned int bank);
+void cmci_storm_end(unsigned int bank);
+void mce_track_storm(struct mce *mce);
+void mce_inherit_storm(unsigned int bank);
+bool mce_get_storm_mode(void);
+void mce_set_storm_mode(bool storm);
+u32 mce_get_apei_thr_limit(void);
+#else
+static inline void cmci_storm_begin(unsigned int bank) {}
+static inline void cmci_storm_end(unsigned int bank) {}
+static inline void mce_track_storm(struct mce *mce) {}
+static inline void mce_inherit_storm(unsigned int bank) {}
+static inline bool mce_get_storm_mode(void) { return false; }
+static inline void mce_set_storm_mode(bool storm) {}
+static inline u32 mce_get_apei_thr_limit(void) { return 0; }
+#endif
+
+/*
+ * history: Bitmask tracking the occurrence of errors. Each set bit
+ * represents an error seen.
+ *
+ * timestamp: Last time (in jiffies) that the bank was polled.
+ * in_storm_mode: Is this bank in storm mode?
+ * poll_only: Bank does not support CMCI; skip storm tracking.
+ */
+struct storm_bank {
+ u64 history;
+ u64 timestamp;
+ bool in_storm_mode;
+ bool poll_only;
+};
+
+#define NUM_HISTORY_BITS (sizeof(u64) * BITS_PER_BYTE)
+
+/* How many errors within the history buffer mark the start of a storm. */
+#define STORM_BEGIN_THRESHOLD 5
+
+/*
+ * How many polls of a machine check bank without an error before declaring
+ * the storm over. Since it is tracked by the bitmask in the history
+ * field of struct storm_bank, the mask is 30 bits [0 ... 29].
+ */
+#define STORM_END_POLL_THRESHOLD 29
+
+/*
+ * banks: per-cpu, per-bank details
+ * stormy_bank_count: count of MC banks in storm state
+ * poll_mode: CPU is in poll mode
+ */
+struct mca_storm_desc {
+ struct storm_bank banks[MAX_NR_BANKS];
+ u8 stormy_bank_count;
+ bool poll_mode;
+};
+
+DECLARE_PER_CPU(struct mca_storm_desc, storm_desc);
+
+#ifdef CONFIG_ACPI_APEI
+int apei_write_mce(struct mce *m);
+ssize_t apei_read_mce(struct mce *m, u64 *record_id);
+int apei_check_mce(void);
+int apei_clear_mce(u64 record_id);
+#else
+static inline int apei_write_mce(struct mce *m)
+{
+ return -EINVAL;
+}
+static inline ssize_t apei_read_mce(struct mce *m, u64 *record_id)
+{
+ return 0;
+}
+static inline int apei_check_mce(void)
+{
+ return 0;
+}
+static inline int apei_clear_mce(u64 record_id)
+{
+ return -EINVAL;
+}
+#endif
+
+/*
+ * We consider records to be equivalent if bank+status+addr+misc all match.
+ * This is only used when the system is going down because of a fatal error
+ * to avoid cluttering the console log with essentially repeated information.
+ * In normal processing all errors seen are logged.
+ */
+static inline bool mce_cmp(struct mce *m1, struct mce *m2)
+{
+ return m1->bank != m2->bank ||
+ m1->status != m2->status ||
+ m1->addr != m2->addr ||
+ m1->misc != m2->misc;
+}
+
+extern struct device_attribute dev_attr_trigger;
+
+#ifdef CONFIG_X86_MCELOG_LEGACY
+void mce_work_trigger(void);
+void mce_register_injector_chain(struct notifier_block *nb);
+void mce_unregister_injector_chain(struct notifier_block *nb);
+#else
+static inline void mce_work_trigger(void) { }
+static inline void mce_register_injector_chain(struct notifier_block *nb) { }
+static inline void mce_unregister_injector_chain(struct notifier_block *nb) { }
+#endif
+
+struct mca_config {
+ __u64 lmce_disabled : 1,
+ disabled : 1,
+ ser : 1,
+ recovery : 1,
+ bios_cmci_threshold : 1,
+ /* Proper #MC exception handler is set */
+ initialized : 1,
+ __reserved : 58;
+
+ bool dont_log_ce;
+ bool cmci_disabled;
+ bool ignore_ce;
+ bool print_all;
+
+ int monarch_timeout;
+ int panic_timeout;
+ u32 rip_msr;
+ s8 bootlog;
+};
+
+extern struct mca_config mca_cfg;
+DECLARE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks);
+
+struct mce_vendor_flags {
+ /*
+ * Indicates that overflow conditions are not fatal, when set.
+ */
+ __u64 overflow_recov : 1,
+
+ /*
+ * (AMD) SUCCOR stands for S/W UnCorrectable error COntainment and
+ * Recovery. It indicates support for data poisoning in HW and deferred
+ * error interrupts.
+ */
+ succor : 1,
+
+ /*
+ * (AMD) SMCA: This bit indicates support for Scalable MCA which expands
+ * the register space for each MCA bank and also increases number of
+ * banks. Also, to accommodate the new banks and registers, the MCA
+ * register space is moved to a new MSR range.
+ */
+ smca : 1,
+
+ /* Zen IFU quirk */
+ zen_ifu_quirk : 1,
+
+ /* AMD-style error thresholding banks present. */
+ amd_threshold : 1,
+
+ /* Pentium, family 5-style MCA */
+ p5 : 1,
+
+ /* Centaur Winchip C6-style MCA */
+ winchip : 1,
+
+ /* SandyBridge IFU quirk */
+ snb_ifu_quirk : 1,
+
+ /* Skylake, Cascade Lake, Cooper Lake REP;MOVS* quirk */
+ skx_repmov_quirk : 1,
+
+ __reserved_0 : 55;
+};
+
+extern struct mce_vendor_flags mce_flags;
+
+struct mce_bank {
+ /* subevents to enable */
+ u64 ctl;
+
+ /* initialise bank? */
+ __u64 init : 1,
+
+ /*
+ * (AMD) MCA_CONFIG[McaLsbInStatusSupported]: When set, this bit indicates
+ * the LSB field is found in MCA_STATUS and not in MCA_ADDR.
+ */
+ lsb_in_status : 1,
+
+ __reserved_1 : 62;
+};
+
+DECLARE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);
+
+enum mca_msr {
+ MCA_CTL,
+ MCA_STATUS,
+ MCA_ADDR,
+ MCA_MISC,
+};
+
+/* Decide whether to add MCE record to MCE event pool or filter it out. */
+extern bool filter_mce(struct mce *m);
+void mce_prep_record_common(struct mce *m);
+void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m);
+
+#ifdef CONFIG_X86_MCE_AMD
+void mce_threshold_create_device(unsigned int cpu);
+void mce_threshold_remove_device(unsigned int cpu);
+void mce_amd_handle_storm(unsigned int bank, bool on);
+extern bool amd_filter_mce(struct mce *m);
+bool amd_mce_usable_address(struct mce *m);
+void amd_clear_bank(struct mce *m);
+
+/*
+ * If MCA_CONFIG[McaLsbInStatusSupported] is set, extract ErrAddr in bits
+ * [56:0] of MCA_STATUS, else in bits [55:0] of MCA_ADDR.
+ */
+static __always_inline void smca_extract_err_addr(struct mce *m)
+{
+ u8 lsb;
+
+ if (!mce_flags.smca)
+ return;
+
+ if (this_cpu_ptr(mce_banks_array)[m->bank].lsb_in_status) {
+ lsb = (m->status >> 24) & 0x3f;
+
+ m->addr &= GENMASK_ULL(56, lsb);
+
+ return;
+ }
+
+ lsb = (m->addr >> 56) & 0x3f;
+
+ m->addr &= GENMASK_ULL(55, lsb);
+}
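
A minimal standalone sketch (not part of the patch itself): the GENMASK_ULL() masking above keeps address bits [55:lsb] (or [56:lsb] in the MCA_STATUS case). This is a user-space approximation of that arithmetic, assuming the usual kernel GENMASK_ULL() semantics; the helper names are invented for the example.

	#include <stdint.h>

	/* Set bits l..h inclusive, mirroring the kernel's GENMASK_ULL(h, l). */
	static uint64_t genmask_ull(unsigned int h, unsigned int l)
	{
		return (~0ULL << l) & (~0ULL >> (63 - h));
	}

	/* Keep only address bits [55:lsb], as the MCA_ADDR branch above does. */
	static uint64_t mask_err_addr(uint64_t addr, unsigned int lsb)
	{
		return addr & genmask_ull(55, lsb);
	}
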
+
+void smca_bsp_init(void);
+#else
+static inline void mce_threshold_create_device(unsigned int cpu) { }
+static inline void mce_threshold_remove_device(unsigned int cpu) { }
+static inline void mce_amd_handle_storm(unsigned int bank, bool on) { }
+static inline bool amd_filter_mce(struct mce *m) { return false; }
+static inline bool amd_mce_usable_address(struct mce *m) { return false; }
+static inline void amd_clear_bank(struct mce *m) { }
+static inline void smca_extract_err_addr(struct mce *m) { }
+static inline void smca_bsp_init(void) { }
+#endif
+
+#ifdef CONFIG_X86_ANCIENT_MCE
+void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
+void winchip_mcheck_init(struct cpuinfo_x86 *c);
+noinstr void pentium_machine_check(struct pt_regs *regs);
+noinstr void winchip_machine_check(struct pt_regs *regs);
+static inline void enable_p5_mce(void) { mce_p5_enabled = 1; }
+#else
+static __always_inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {}
+static __always_inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {}
+static __always_inline void enable_p5_mce(void) {}
+static __always_inline void pentium_machine_check(struct pt_regs *regs) {}
+static __always_inline void winchip_machine_check(struct pt_regs *regs) {}
+#endif
+
+noinstr u64 mce_rdmsrq(u32 msr);
+noinstr void mce_wrmsrq(u32 msr, u64 v);
+
+static __always_inline u32 mca_msr_reg(int bank, enum mca_msr reg)
+{
+ if (cpu_feature_enabled(X86_FEATURE_SMCA)) {
+ switch (reg) {
+ case MCA_CTL: return MSR_AMD64_SMCA_MCx_CTL(bank);
+ case MCA_ADDR: return MSR_AMD64_SMCA_MCx_ADDR(bank);
+ case MCA_MISC: return MSR_AMD64_SMCA_MCx_MISC(bank);
+ case MCA_STATUS: return MSR_AMD64_SMCA_MCx_STATUS(bank);
+ }
+ }
+
+ switch (reg) {
+ case MCA_CTL: return MSR_IA32_MCx_CTL(bank);
+ case MCA_ADDR: return MSR_IA32_MCx_ADDR(bank);
+ case MCA_MISC: return MSR_IA32_MCx_MISC(bank);
+ case MCA_STATUS: return MSR_IA32_MCx_STATUS(bank);
+ }
+
+ return 0;
+}
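
A minimal sketch (not part of the patch itself) of how a caller might combine mca_msr_reg() with the mce_rdmsrq() wrapper declared above to scan the per-bank status registers. The bank-count parameter, the function name and the MCI_STATUS_VAL check are assumptions made for the example.

	static void example_scan_banks(unsigned int nbanks)
	{
		unsigned int i;
		u64 status;

		for (i = 0; i < nbanks; i++) {
			/* Picks the SMCA or legacy MSR for this bank automatically. */
			status = mce_rdmsrq(mca_msr_reg(i, MCA_STATUS));
			if (!(status & MCI_STATUS_VAL))
				continue;	/* nothing logged in this bank */
			/* ... hand the record to the MCE core here ... */
		}
	}
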
+
+extern void (*mc_poll_banks)(void);
+#endif /* __X86_MCE_INTERNAL_H__ */
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mce/p5.c
index 5c0e6533d9bc..2272ad53fc33 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mce/p5.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* P5 specific Machine Check Exception Reporting
* (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
@@ -5,36 +6,39 @@
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/types.h>
-#include <linux/init.h>
#include <linux/smp.h>
+#include <linux/hardirq.h>
#include <asm/processor.h>
-#include <asm/system.h>
+#include <asm/traps.h>
+#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/msr.h>
+#include "internal.h"
+
/* By default disabled */
int mce_p5_enabled __read_mostly;
/* Machine check handler for Pentium class Intel CPUs: */
-static void pentium_machine_check(struct pt_regs *regs, long error_code)
+noinstr void pentium_machine_check(struct pt_regs *regs)
{
u32 loaddr, hi, lotype;
+ instrumentation_begin();
rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
- printk(KERN_EMERG
- "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n",
- smp_processor_id(), loaddr, lotype);
+ pr_emerg("CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n",
+ smp_processor_id(), loaddr, lotype);
if (lotype & (1<<5)) {
- printk(KERN_EMERG
- "CPU#%d: Possible thermal failure (CPU on fire ?).\n",
- smp_processor_id());
+ pr_emerg("CPU#%d: Possible thermal failure (CPU on fire ?).\n",
+ smp_processor_id());
}
- add_taint(TAINT_MACHINE_CHECK);
+ add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+ instrumentation_end();
}
/* Set up machine check reporting for processors with Intel style MCE: */
@@ -50,19 +54,13 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
if (!cpu_has(c, X86_FEATURE_MCE))
return;
- machine_check_vector = pentium_machine_check;
- /* Make sure the vector pointer is visible before we enable MCEs: */
- wmb();
-
/* Read registers before enabling: */
rdmsr(MSR_IA32_P5_MC_ADDR, l, h);
rdmsr(MSR_IA32_P5_MC_TYPE, l, h);
- printk(KERN_INFO
- "Intel old style machine check architecture supported.\n");
+ pr_info("Intel old style machine check architecture supported.\n");
/* Enable MCE: */
- set_in_cr4(X86_CR4_MCE);
- printk(KERN_INFO
- "Intel old style machine check reporting enabled on CPU#%d.\n",
- smp_processor_id());
+ cr4_set_bits(X86_CR4_MCE);
+ pr_info("Intel old style machine check reporting enabled on CPU#%d.\n",
+ smp_processor_id());
}
diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c
new file mode 100644
index 000000000000..2235a7477436
--- /dev/null
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -0,0 +1,489 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * MCE grading rules.
+ * Copyright 2008, 2009 Intel Corporation.
+ *
+ * Author: Andi Kleen
+ */
+#include <linux/kernel.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+
+#include <asm/mce.h>
+#include <asm/cpu_device_id.h>
+#include <asm/traps.h>
+#include <asm/insn.h>
+#include <asm/insn-eval.h>
+
+#include "internal.h"
+
+/*
+ * Grade an MCE by severity. In general the most severe ones are processed
+ * first. Since there are quite a lot of combinations, test the bits in a
+ * table-driven way. The rules are simply processed in order; the first
+ * match wins.
+ *
+ * Note this is only used for machine check exceptions; corrected
+ * errors use much simpler rules. The exception handler still checks for
+ * corrected errors, but only to leave them alone for the CMCI handler
+ * (except in panic situations).
+ */
+
+enum context { IN_KERNEL = 1, IN_USER = 2, IN_KERNEL_RECOV = 3 };
+enum ser { SER_REQUIRED = 1, NO_SER = 2 };
+enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 };
+
+static struct severity {
+ u64 mask;
+ u64 result;
+ unsigned char sev;
+ unsigned short mcgmask;
+ unsigned short mcgres;
+ unsigned char ser;
+ unsigned char context;
+ unsigned char excp;
+ unsigned char covered;
+ unsigned int cpu_vfm;
+ unsigned char cpu_minstepping;
+ unsigned char bank_lo, bank_hi;
+ char *msg;
+} severities[] = {
+#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
+#define BANK_RANGE(l, h) .bank_lo = l, .bank_hi = h
+#define VFM_STEPPING(m, s) .cpu_vfm = m, .cpu_minstepping = s
+#define KERNEL .context = IN_KERNEL
+#define USER .context = IN_USER
+#define KERNEL_RECOV .context = IN_KERNEL_RECOV
+#define SER .ser = SER_REQUIRED
+#define NOSER .ser = NO_SER
+#define EXCP .excp = EXCP_CONTEXT
+#define NOEXCP .excp = NO_EXCP
+#define BITCLR(x) .mask = x, .result = 0
+#define BITSET(x) .mask = x, .result = x
+#define MCGMASK(x, y) .mcgmask = x, .mcgres = y
+#define MASK(x, y) .mask = x, .result = y
+#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
+#define MCI_UC_AR (MCI_STATUS_UC|MCI_STATUS_AR)
+#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
+#define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV)
+
+ MCESEV(
+ NO, "Invalid",
+ BITCLR(MCI_STATUS_VAL)
+ ),
+ MCESEV(
+ NO, "Not enabled",
+ EXCP, BITCLR(MCI_STATUS_EN)
+ ),
+ MCESEV(
+ PANIC, "Processor context corrupt",
+ BITSET(MCI_STATUS_PCC)
+ ),
+ /* When MCIP is not set something is very confused */
+ MCESEV(
+ PANIC, "MCIP not set in MCA handler",
+ EXCP, MCGMASK(MCG_STATUS_MCIP, 0)
+ ),
+ /* Neither restart nor error IP -- no chance to recover -> PANIC */
+ MCESEV(
+ PANIC, "Neither restart nor error IP",
+ EXCP, MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0)
+ ),
+ MCESEV(
+ PANIC, "In kernel and no restart IP",
+ EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
+ ),
+ MCESEV(
+ PANIC, "In kernel and no restart IP",
+ EXCP, KERNEL_RECOV, MCGMASK(MCG_STATUS_RIPV, 0)
+ ),
+ MCESEV(
+ KEEP, "Corrected error",
+ NOSER, BITCLR(MCI_STATUS_UC)
+ ),
+ /*
+ * known AO MCACODs reported via MCE or CMC:
+ *
+ * SRAO errors can be signaled either via a machine check exception or
+ * via CMCI, with the S bit set to 1 or 0 respectively. So we don't
+ * need to check the S bit for SRAO.
+ */
+ MCESEV(
+ AO, "Action optional: memory scrubbing error",
+ SER, MASK(MCI_UC_AR|MCACOD_SCRUBMSK, MCI_STATUS_UC|MCACOD_SCRUB)
+ ),
+ MCESEV(
+ AO, "Action optional: last level cache writeback error",
+ SER, MASK(MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB)
+ ),
+ /*
+ * Quirk for Skylake/Cascade Lake. Patrol scrubber may be configured
+ * to report uncorrected errors using CMCI with a special signature.
+ * UC=0, MSCOD=0x0010, MCACOD=binary(000X 0000 1100 XXXX) reported
+ * in one of the memory controller banks.
+ * Set severity to "AO" for the same action as a normal patrol scrub error.
+ */
+ MCESEV(
+ AO, "Uncorrected Patrol Scrub Error",
+ SER, MASK(MCI_STATUS_UC|MCI_ADDR|0xffffeff0, MCI_ADDR|0x001000c0),
+ VFM_STEPPING(INTEL_SKYLAKE_X, 4), BANK_RANGE(13, 18)
+ ),
+
+ /* ignore OVER for UCNA */
+ MCESEV(
+ UCNA, "Uncorrected no action required",
+ SER, MASK(MCI_UC_SAR, MCI_STATUS_UC)
+ ),
+ MCESEV(
+ PANIC, "Illegal combination (UCNA with AR=1)",
+ SER,
+ MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR)
+ ),
+ MCESEV(
+ KEEP, "Non signaled machine check",
+ SER, BITCLR(MCI_STATUS_S)
+ ),
+
+ MCESEV(
+ PANIC, "Action required with lost events",
+ SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR)
+ ),
+
+ /* known AR MCACODs: */
+#ifdef CONFIG_MEMORY_FAILURE
+ MCESEV(
+ KEEP, "Action required but unaffected thread is continuable",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR, MCI_UC_SAR|MCI_ADDR),
+ MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV)
+ ),
+ MCESEV(
+ AR, "Action required: data load in error recoverable area of kernel",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+ KERNEL_RECOV
+ ),
+ MCESEV(
+ AR, "Action required: data load error in a user process",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+ USER
+ ),
+ MCESEV(
+ AR, "Action required: instruction fetch error in a user process",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
+ USER
+ ),
+ MCESEV(
+ AR, "Data load error in SEAM non-root mode",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+ MCGMASK(MCG_STATUS_SEAM_NR, MCG_STATUS_SEAM_NR),
+ KERNEL
+ ),
+ MCESEV(
+ AR, "Instruction fetch error in SEAM non-root mode",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
+ MCGMASK(MCG_STATUS_SEAM_NR, MCG_STATUS_SEAM_NR),
+ KERNEL
+ ),
+ MCESEV(
+ PANIC, "Data load in unrecoverable area of kernel",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+ KERNEL
+ ),
+ MCESEV(
+ PANIC, "Instruction fetch error in kernel",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
+ KERNEL
+ ),
+#endif
+ MCESEV(
+ PANIC, "Action required: unknown MCACOD",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
+ ),
+
+ MCESEV(
+ SOME, "Action optional: unknown MCACOD",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S)
+ ),
+ MCESEV(
+ SOME, "Action optional with lost events",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_S)
+ ),
+
+ MCESEV(
+ PANIC, "Overflowed uncorrected",
+ BITSET(MCI_STATUS_OVER|MCI_STATUS_UC)
+ ),
+ MCESEV(
+ PANIC, "Uncorrected in kernel",
+ BITSET(MCI_STATUS_UC),
+ KERNEL
+ ),
+ MCESEV(
+ UC, "Uncorrected",
+ BITSET(MCI_STATUS_UC)
+ ),
+ MCESEV(
+ SOME, "No match",
+ BITSET(0)
+ ) /* always matches. keep at end */
+};
+
+#define mc_recoverable(mcg) (((mcg) & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) == \
+ (MCG_STATUS_RIPV|MCG_STATUS_EIPV))
+
+static bool is_copy_from_user(struct pt_regs *regs)
+{
+ u8 insn_buf[MAX_INSN_SIZE];
+ unsigned long addr;
+ struct insn insn;
+ int ret;
+
+ if (!regs)
+ return false;
+
+ if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip, MAX_INSN_SIZE))
+ return false;
+
+ ret = insn_decode_kernel(&insn, insn_buf);
+ if (ret < 0)
+ return false;
+
+ switch (insn.opcode.value) {
+ /* MOV mem,reg */
+ case 0x8A: case 0x8B:
+ /* MOVZ mem,reg */
+ case 0xB60F: case 0xB70F:
+ addr = (unsigned long)insn_get_addr_ref(&insn, regs);
+ break;
+ /* REP MOVS */
+ case 0xA4: case 0xA5:
+ addr = regs->si;
+ break;
+ default:
+ return false;
+ }
+
+ if (fault_in_kernel_space(addr))
+ return false;
+
+ current->mce_vaddr = (void __user *)addr;
+
+ return true;
+}
+
+/*
+ * If mcgstatus indicated that ip/cs on the stack were
+ * no good, then "m->cs" will be zero and we will have
+ * to assume the worst case (IN_KERNEL) as we actually
+ * have no idea what we were executing when the machine
+ * check hit.
+ * If we do have a good "m->cs" (or a faked one in the
+ * case we were executing in VM86 mode) we can use it to
+ * distinguish an exception taken in user mode from one
+ * taken in the kernel.
+ */
+static noinstr int error_context(struct mce *m, struct pt_regs *regs)
+{
+ int fixup_type;
+ bool copy_user;
+
+ if ((m->cs & 3) == 3)
+ return IN_USER;
+
+ if (!mc_recoverable(m->mcgstatus))
+ return IN_KERNEL;
+
+ /* Allow instrumentation around external facilities usage. */
+ instrumentation_begin();
+ fixup_type = ex_get_fixup_type(m->ip);
+ copy_user = is_copy_from_user(regs);
+ instrumentation_end();
+
+ if (copy_user) {
+ m->kflags |= MCE_IN_KERNEL_COPYIN | MCE_IN_KERNEL_RECOV;
+ return IN_KERNEL_RECOV;
+ }
+
+ switch (fixup_type) {
+ case EX_TYPE_FAULT_MCE_SAFE:
+ case EX_TYPE_DEFAULT_MCE_SAFE:
+ m->kflags |= MCE_IN_KERNEL_RECOV;
+ return IN_KERNEL_RECOV;
+
+ default:
+ return IN_KERNEL;
+ }
+}
+
+/* See AMD PPR(s) section Machine Check Error Handling. */
+static noinstr int mce_severity_amd(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp)
+{
+ char *panic_msg = NULL;
+ int ret;
+
+ /*
+ * Default return value: Action required, the error must be handled
+ * immediately.
+ */
+ ret = MCE_AR_SEVERITY;
+
+ /* Processor Context Corrupt, no need to fumble too much, die! */
+ if (m->status & MCI_STATUS_PCC) {
+ panic_msg = "Processor Context Corrupt";
+ ret = MCE_PANIC_SEVERITY;
+ goto out;
+ }
+
+ if (m->status & MCI_STATUS_DEFERRED) {
+ ret = MCE_DEFERRED_SEVERITY;
+ goto out;
+ }
+
+ /*
+ * If the UC bit is not set, the system either corrected or deferred
+ * the error. No action will be required after logging the error.
+ */
+ if (!(m->status & MCI_STATUS_UC)) {
+ ret = MCE_KEEP_SEVERITY;
+ goto out;
+ }
+
+ /*
+ * On MCA overflow without the MCA overflow recovery feature, the
+ * system will not be able to recover; panic.
+ */
+ if ((m->status & MCI_STATUS_OVER) && !mce_flags.overflow_recov) {
+ panic_msg = "Overflowed uncorrected error without MCA Overflow Recovery";
+ ret = MCE_PANIC_SEVERITY;
+ goto out;
+ }
+
+ if (!mce_flags.succor) {
+ panic_msg = "Uncorrected error without MCA Recovery";
+ ret = MCE_PANIC_SEVERITY;
+ goto out;
+ }
+
+ if (error_context(m, regs) == IN_KERNEL) {
+ panic_msg = "Uncorrected unrecoverable error in kernel context";
+ ret = MCE_PANIC_SEVERITY;
+ }
+
+out:
+ if (msg && panic_msg)
+ *msg = panic_msg;
+
+ return ret;
+}
+
+static noinstr int mce_severity_intel(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp)
+{
+ enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP);
+ enum context ctx = error_context(m, regs);
+ struct severity *s;
+
+ for (s = severities;; s++) {
+ if ((m->status & s->mask) != s->result)
+ continue;
+ if ((m->mcgstatus & s->mcgmask) != s->mcgres)
+ continue;
+ if (s->ser == SER_REQUIRED && !mca_cfg.ser)
+ continue;
+ if (s->ser == NO_SER && mca_cfg.ser)
+ continue;
+ if (s->context && ctx != s->context)
+ continue;
+ if (s->excp && excp != s->excp)
+ continue;
+ if (s->cpu_vfm && boot_cpu_data.x86_vfm != s->cpu_vfm)
+ continue;
+ if (s->cpu_minstepping && boot_cpu_data.x86_stepping < s->cpu_minstepping)
+ continue;
+ if (s->bank_lo && (m->bank < s->bank_lo || m->bank > s->bank_hi))
+ continue;
+ if (msg)
+ *msg = s->msg;
+ s->covered = 1;
+
+ return s->sev;
+ }
+}
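
A minimal standalone sketch (not part of the patch itself): the matching loop above reduced to plain C to show the mask/result, first-match-wins idea. The two toy rules and the test value are invented; only the terminal catch-all mirrors the real table's "No match" entry, which always matches and is kept last.

	#include <stdint.h>
	#include <stdio.h>

	struct toy_rule {
		uint64_t	mask;	/* bits the rule examines */
		uint64_t	result;	/* value those bits must have */
		const char	*msg;
	};

	static const struct toy_rule toy_rules[] = {
		{ 1ULL << 63, 0, "Invalid (VAL not set)" },
		{ 0,          0, "No match" },	/* always matches, keep last */
	};

	static const char *toy_grade(uint64_t status)
	{
		const struct toy_rule *r;

		for (r = toy_rules; ; r++)
			if ((status & r->mask) == r->result)
				return r->msg;	/* first match wins */
	}

	int main(void)
	{
		/* VAL (bit 63) is set, so the first rule is skipped. */
		printf("%s\n", toy_grade(1ULL << 63));
		return 0;
	}
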
+
+int noinstr mce_severity(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp)
+{
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
+ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
+ return mce_severity_amd(m, regs, msg, is_excp);
+ else
+ return mce_severity_intel(m, regs, msg, is_excp);
+}
+
+#ifdef CONFIG_DEBUG_FS
+static void *s_start(struct seq_file *f, loff_t *pos)
+{
+ if (*pos >= ARRAY_SIZE(severities))
+ return NULL;
+ return &severities[*pos];
+}
+
+static void *s_next(struct seq_file *f, void *data, loff_t *pos)
+{
+ if (++(*pos) >= ARRAY_SIZE(severities))
+ return NULL;
+ return &severities[*pos];
+}
+
+static void s_stop(struct seq_file *f, void *data)
+{
+}
+
+static int s_show(struct seq_file *f, void *data)
+{
+ struct severity *ser = data;
+ seq_printf(f, "%d\t%s\n", ser->covered, ser->msg);
+ return 0;
+}
+
+static const struct seq_operations severities_seq_ops = {
+ .start = s_start,
+ .next = s_next,
+ .stop = s_stop,
+ .show = s_show,
+};
+
+static int severities_coverage_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &severities_seq_ops);
+}
+
+static ssize_t severities_coverage_write(struct file *file,
+ const char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ int i;
+ for (i = 0; i < ARRAY_SIZE(severities); i++)
+ severities[i].covered = 0;
+ return count;
+}
+
+static const struct file_operations severities_coverage_fops = {
+ .open = severities_coverage_open,
+ .release = seq_release,
+ .read = seq_read,
+ .write = severities_coverage_write,
+ .llseek = seq_lseek,
+};
+
+static int __init severities_debugfs_init(void)
+{
+ struct dentry *dmce;
+
+ dmce = mce_get_debugfs_dir();
+
+ debugfs_create_file("severities-coverage", 0444, dmce, NULL,
+ &severities_coverage_fops);
+ return 0;
+}
+late_initcall(severities_debugfs_init);
+#endif /* CONFIG_DEBUG_FS */
diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c
new file mode 100644
index 000000000000..0d13c9ffcba0
--- /dev/null
+++ b/arch/x86/kernel/cpu/mce/threshold.c
@@ -0,0 +1,163 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Common corrected MCE threshold handler code:
+ */
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+
+#include <asm/irq_vectors.h>
+#include <asm/traps.h>
+#include <asm/apic.h>
+#include <asm/mce.h>
+#include <asm/trace/irq_vectors.h>
+
+#include "internal.h"
+
+static u32 mce_apei_thr_limit;
+
+void mce_save_apei_thr_limit(u32 thr_limit)
+{
+ mce_apei_thr_limit = thr_limit;
+ pr_info("HEST corrected error threshold limit: %u\n", thr_limit);
+}
+
+u32 mce_get_apei_thr_limit(void)
+{
+ return mce_apei_thr_limit;
+}
+
+static void default_threshold_interrupt(void)
+{
+ pr_err("Unexpected threshold interrupt at vector %x\n",
+ THRESHOLD_APIC_VECTOR);
+}
+
+void (*mce_threshold_vector)(void) = default_threshold_interrupt;
+
+DEFINE_IDTENTRY_SYSVEC(sysvec_threshold)
+{
+ trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR);
+ inc_irq_stat(irq_threshold_count);
+ mce_threshold_vector();
+ trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR);
+ apic_eoi();
+}
+
+DEFINE_PER_CPU(struct mca_storm_desc, storm_desc);
+
+void mce_inherit_storm(unsigned int bank)
+{
+ struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
+
+ /*
+ * The previous CPU owning this bank had put it into storm mode,
+ * but the precise history of that storm is unknown. Assume
+ * the worst (all recent polls of the bank found a valid error
+ * logged). This avoids the new owner prematurely declaring
+ * that the storm has ended.
+ */
+ storm->banks[bank].history = ~0ull;
+ storm->banks[bank].timestamp = jiffies;
+}
+
+bool mce_get_storm_mode(void)
+{
+ return __this_cpu_read(storm_desc.poll_mode);
+}
+
+void mce_set_storm_mode(bool storm)
+{
+ __this_cpu_write(storm_desc.poll_mode, storm);
+}
+
+static void mce_handle_storm(unsigned int bank, bool on)
+{
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_INTEL:
+ mce_intel_handle_storm(bank, on);
+ break;
+ case X86_VENDOR_AMD:
+ mce_amd_handle_storm(bank, on);
+ break;
+ }
+}
+
+void cmci_storm_begin(unsigned int bank)
+{
+ struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
+
+ __set_bit(bank, this_cpu_ptr(mce_poll_banks));
+ storm->banks[bank].in_storm_mode = true;
+
+ /*
+ * If this is the first bank on this CPU to enter storm mode,
+ * start polling.
+ */
+ if (++storm->stormy_bank_count == 1)
+ mce_timer_kick(true);
+}
+
+void cmci_storm_end(unsigned int bank)
+{
+ struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
+
+ if (!mce_flags.amd_threshold)
+ __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
+ storm->banks[bank].history = 0;
+ storm->banks[bank].in_storm_mode = false;
+
+ /* If no banks left in storm mode, stop polling. */
+ if (!--storm->stormy_bank_count)
+ mce_timer_kick(false);
+}
+
+void mce_track_storm(struct mce *mce)
+{
+ struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
+ unsigned long now = jiffies, delta;
+ unsigned int shift = 1;
+ u64 history = 0;
+
+ /* No tracking needed for banks that do not support CMCI */
+ if (storm->banks[mce->bank].poll_only)
+ return;
+
+ /*
+ * When a bank is in storm mode it is polled once per second and
+ * the history mask records roughly the last minute of poll results.
+ * If it is not in storm mode, the bank is only checked when
+ * there is a CMCI interrupt. Check how long it has been since
+ * this bank was last checked, and adjust the amount of "shift"
+ * to apply to the history.
+ */
+ if (!storm->banks[mce->bank].in_storm_mode) {
+ delta = now - storm->banks[mce->bank].timestamp;
+ shift = (delta + HZ) / HZ;
+ }
+
+ /* If it has been a long time since the last poll, clear history. */
+ if (shift < NUM_HISTORY_BITS)
+ history = storm->banks[mce->bank].history << shift;
+
+ storm->banks[mce->bank].timestamp = now;
+
+ /* History keeps track of corrected errors. VAL=1 && UC=0 */
+ if ((mce->status & MCI_STATUS_VAL) && mce_is_correctable(mce))
+ history |= 1;
+
+ storm->banks[mce->bank].history = history;
+
+ if (storm->banks[mce->bank].in_storm_mode) {
+ if (history & GENMASK_ULL(STORM_END_POLL_THRESHOLD, 0))
+ return;
+ printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm subsided\n", smp_processor_id(), mce->bank);
+ mce_handle_storm(mce->bank, false);
+ cmci_storm_end(mce->bank);
+ } else {
+ if (hweight64(history) < STORM_BEGIN_THRESHOLD)
+ return;
+ printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm detected\n", smp_processor_id(), mce->bank);
+ mce_handle_storm(mce->bank, true);
+ cmci_storm_begin(mce->bank);
+ }
+}
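
A minimal standalone sketch (not part of the patch itself): the history bookkeeping of mce_track_storm() reduced to its arithmetic. The thresholds mirror STORM_BEGIN_THRESHOLD (5) and STORM_END_POLL_THRESHOLD (29, i.e. 30 bits) from internal.h; the popcount builtin, the parameter list and the boolean return convention are choices made for the example only.

	#include <stdbool.h>
	#include <stdint.h>

	#define TOY_BEGIN_THRESHOLD	5
	#define TOY_END_POLL_BITS	30	/* history bits [0 ... 29] */

	/* Returns true if the bank should (still) be treated as storming. */
	static bool toy_track(uint64_t *history, unsigned int missed_polls,
			      bool corrected_error_seen, bool in_storm_mode)
	{
		/* Old poll results age out of the 64-bit window. */
		if (missed_polls < 64)
			*history <<= missed_polls;
		else
			*history = 0;

		if (corrected_error_seen)
			*history |= 1;

		if (in_storm_mode)
			/* The storm ends after 30 consecutive clean polls. */
			return (*history & ((1ULL << TOY_END_POLL_BITS) - 1)) != 0;

		/* The storm begins once enough errors pile up in the window. */
		return __builtin_popcountll(*history) >= TOY_BEGIN_THRESHOLD;
	}
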
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mce/winchip.c
index 54060f565974..6c99f2941909 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mce/winchip.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* IDT Winchip specific Machine Check Exception Reporting
* (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
@@ -5,18 +6,23 @@
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/types.h>
-#include <linux/init.h>
+#include <linux/hardirq.h>
#include <asm/processor.h>
-#include <asm/system.h>
+#include <asm/traps.h>
+#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/msr.h>
+#include "internal.h"
+
/* Machine check handler for WinChip C6: */
-static void winchip_machine_check(struct pt_regs *regs, long error_code)
+noinstr void winchip_machine_check(struct pt_regs *regs)
{
- printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
- add_taint(TAINT_MACHINE_CHECK);
+ instrumentation_begin();
+ pr_emerg("CPU0: Machine Check Exception.\n");
+ add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+ instrumentation_end();
}
/* Set up machine check reporting on the Winchip C6 series */
@@ -24,17 +30,12 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c)
{
u32 lo, hi;
- machine_check_vector = winchip_machine_check;
- /* Make sure the vector pointer is visible before we enable MCEs: */
- wmb();
-
rdmsr(MSR_IDT_FCR1, lo, hi);
lo |= (1<<2); /* Enable EIERRINT (int 18 MCE) */
lo &= ~(1<<4); /* Enable MCE */
wrmsr(MSR_IDT_FCR1, lo, hi);
- set_in_cr4(X86_CR4_MCE);
+ cr4_set_bits(X86_CR4_MCE);
- printk(KERN_INFO
- "Winchip machine check reporting enabled on CPU#0.\n");
+ pr_info("Winchip machine check reporting enabled on CPU#0.\n");
}
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
deleted file mode 100644
index 188a1ca5ad2b..000000000000
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ /dev/null
@@ -1,12 +0,0 @@
-obj-y = mce.o
-
-obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o
-obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o
-obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o
-obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o
-obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o
-obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o
-obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
-obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
-
-obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c
deleted file mode 100644
index b945d5dbc609..000000000000
--- a/arch/x86/kernel/cpu/mcheck/k7.c
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Athlon specific Machine Check Exception Reporting
- * (C) Copyright 2002 Dave Jones <davej@redhat.com>
- */
-#include <linux/interrupt.h>
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-
-#include <asm/processor.h>
-#include <asm/system.h>
-#include <asm/mce.h>
-#include <asm/msr.h>
-
-/* Machine Check Handler For AMD Athlon/Duron: */
-static void k7_machine_check(struct pt_regs *regs, long error_code)
-{
- u32 alow, ahigh, high, low;
- u32 mcgstl, mcgsth;
- int recover = 1;
- int i;
-
- rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
- if (mcgstl & (1<<0)) /* Recoverable ? */
- recover = 0;
-
- printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
- smp_processor_id(), mcgsth, mcgstl);
-
- for (i = 1; i < nr_mce_banks; i++) {
- rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
- if (high & (1<<31)) {
- char misc[20];
- char addr[24];
-
- misc[0] = '\0';
- addr[0] = '\0';
-
- if (high & (1<<29))
- recover |= 1;
- if (high & (1<<25))
- recover |= 2;
- high &= ~(1<<31);
-
- if (high & (1<<27)) {
- rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
- snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
- }
- if (high & (1<<26)) {
- rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
- snprintf(addr, 24, " at %08x%08x", ahigh, alow);
- }
-
- printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
- smp_processor_id(), i, high, low, misc, addr);
-
- /* Clear it: */
- wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
- /* Serialize: */
- wmb();
- add_taint(TAINT_MACHINE_CHECK);
- }
- }
-
- if (recover & 2)
- panic("CPU context corrupt");
- if (recover & 1)
- panic("Unable to continue");
-
- printk(KERN_EMERG "Attempting to continue.\n");
-
- mcgstl &= ~(1<<2);
- wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
-}
-
-
-/* AMD K7 machine check is Intel like: */
-void amd_mcheck_init(struct cpuinfo_x86 *c)
-{
- u32 l, h;
- int i;
-
- if (!cpu_has(c, X86_FEATURE_MCE))
- return;
-
- machine_check_vector = k7_machine_check;
- /* Make sure the vector pointer is visible before we enable MCEs: */
- wmb();
-
- printk(KERN_INFO "Intel machine check architecture supported.\n");
-
- rdmsr(MSR_IA32_MCG_CAP, l, h);
- if (l & (1<<8)) /* Control register present ? */
- wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
- nr_mce_banks = l & 0xff;
-
- /*
- * Clear status for MC index 0 separately, we don't touch CTL,
- * as some K7 Athlons cause spurious MCEs when its enabled:
- */
- if (boot_cpu_data.x86 == 6) {
- wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0);
- i = 1;
- } else
- i = 0;
-
- for (; i < nr_mce_banks; i++) {
- wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
- wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
- }
-
- set_in_cr4(X86_CR4_MCE);
- printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
- smp_processor_id());
-}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
deleted file mode 100644
index a3a235a53f09..000000000000
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Machine check injection support.
- * Copyright 2008 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- *
- * Authors:
- * Andi Kleen
- * Ying Huang
- */
-#include <linux/uaccess.h>
-#include <linux/module.h>
-#include <linux/timer.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/smp.h>
-#include <asm/mce.h>
-
-/* Update fake mce registers on current CPU. */
-static void inject_mce(struct mce *m)
-{
- struct mce *i = &per_cpu(injectm, m->extcpu);
-
- /* Make sure noone reads partially written injectm */
- i->finished = 0;
- mb();
- m->finished = 0;
- /* First set the fields after finished */
- i->extcpu = m->extcpu;
- mb();
- /* Now write record in order, finished last (except above) */
- memcpy(i, m, sizeof(struct mce));
- /* Finally activate it */
- mb();
- i->finished = 1;
-}
-
-struct delayed_mce {
- struct timer_list timer;
- struct mce m;
-};
-
-/* Inject mce on current CPU */
-static void raise_mce(unsigned long data)
-{
- struct delayed_mce *dm = (struct delayed_mce *)data;
- struct mce *m = &dm->m;
- int cpu = m->extcpu;
-
- inject_mce(m);
- if (m->status & MCI_STATUS_UC) {
- struct pt_regs regs;
- memset(&regs, 0, sizeof(struct pt_regs));
- regs.ip = m->ip;
- regs.cs = m->cs;
- printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu);
- do_machine_check(&regs, 0);
- printk(KERN_INFO "MCE exception done on CPU %d\n", cpu);
- } else {
- mce_banks_t b;
- memset(&b, 0xff, sizeof(mce_banks_t));
- printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu);
- machine_check_poll(0, &b);
- mce_notify_irq();
- printk(KERN_INFO "Finished machine check poll on CPU %d\n",
- cpu);
- }
- kfree(dm);
-}
-
-/* Error injection interface */
-static ssize_t mce_write(struct file *filp, const char __user *ubuf,
- size_t usize, loff_t *off)
-{
- struct delayed_mce *dm;
- struct mce m;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
- /*
- * There are some cases where real MSR reads could slip
- * through.
- */
- if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA))
- return -EIO;
-
- if ((unsigned long)usize > sizeof(struct mce))
- usize = sizeof(struct mce);
- if (copy_from_user(&m, ubuf, usize))
- return -EFAULT;
-
- if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu))
- return -EINVAL;
-
- dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL);
- if (!dm)
- return -ENOMEM;
-
- /*
- * Need to give user space some time to set everything up,
- * so do it a jiffie or two later everywhere.
- * Should we use a hrtimer here for better synchronization?
- */
- memcpy(&dm->m, &m, sizeof(struct mce));
- setup_timer(&dm->timer, raise_mce, (unsigned long)dm);
- dm->timer.expires = jiffies + 2;
- add_timer_on(&dm->timer, m.extcpu);
- return usize;
-}
-
-static int inject_init(void)
-{
- printk(KERN_INFO "Machine check injector initialized\n");
- mce_chrdev_ops.write = mce_write;
- return 0;
-}
-
-module_init(inject_init);
-/*
- * Cannot tolerate unloading currently because we cannot
- * guarantee all openers of mce_chrdev will get a reference to us.
- */
-MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
deleted file mode 100644
index 54dcb8ff12e5..000000000000
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#include <asm/mce.h>
-
-enum severity_level {
- MCE_NO_SEVERITY,
- MCE_KEEP_SEVERITY,
- MCE_SOME_SEVERITY,
- MCE_AO_SEVERITY,
- MCE_UC_SEVERITY,
- MCE_AR_SEVERITY,
- MCE_PANIC_SEVERITY,
-};
-
-int mce_severity(struct mce *a, int tolerant, char **msg);
-
-extern int mce_ser;
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
deleted file mode 100644
index ff0807f97056..000000000000
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ /dev/null
@@ -1,218 +0,0 @@
-/*
- * MCE grading rules.
- * Copyright 2008, 2009 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- *
- * Author: Andi Kleen
- */
-#include <linux/kernel.h>
-#include <linux/seq_file.h>
-#include <linux/init.h>
-#include <linux/debugfs.h>
-#include <asm/mce.h>
-
-#include "mce-internal.h"
-
-/*
- * Grade an mce by severity. In general the most severe ones are processed
- * first. Since there are quite a lot of combinations test the bits in a
- * table-driven way. The rules are simply processed in order, first
- * match wins.
- *
- * Note this is only used for machine check exceptions, the corrected
- * errors use much simpler rules. The exceptions still check for the corrected
- * errors, but only to leave them alone for the CMCI handler (except for
- * panic situations)
- */
-
-enum context { IN_KERNEL = 1, IN_USER = 2 };
-enum ser { SER_REQUIRED = 1, NO_SER = 2 };
-
-static struct severity {
- u64 mask;
- u64 result;
- unsigned char sev;
- unsigned char mcgmask;
- unsigned char mcgres;
- unsigned char ser;
- unsigned char context;
- unsigned char covered;
- char *msg;
-} severities[] = {
-#define KERNEL .context = IN_KERNEL
-#define USER .context = IN_USER
-#define SER .ser = SER_REQUIRED
-#define NOSER .ser = NO_SER
-#define SEV(s) .sev = MCE_ ## s ## _SEVERITY
-#define BITCLR(x, s, m, r...) { .mask = x, .result = 0, SEV(s), .msg = m, ## r }
-#define BITSET(x, s, m, r...) { .mask = x, .result = x, SEV(s), .msg = m, ## r }
-#define MCGMASK(x, res, s, m, r...) \
- { .mcgmask = x, .mcgres = res, SEV(s), .msg = m, ## r }
-#define MASK(x, y, s, m, r...) \
- { .mask = x, .result = y, SEV(s), .msg = m, ## r }
-#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
-#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
-#define MCACOD 0xffff
-
- BITCLR(MCI_STATUS_VAL, NO, "Invalid"),
- BITCLR(MCI_STATUS_EN, NO, "Not enabled"),
- BITSET(MCI_STATUS_PCC, PANIC, "Processor context corrupt"),
- /* When MCIP is not set something is very confused */
- MCGMASK(MCG_STATUS_MCIP, 0, PANIC, "MCIP not set in MCA handler"),
- /* Neither return not error IP -- no chance to recover -> PANIC */
- MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0, PANIC,
- "Neither restart nor error IP"),
- MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "In kernel and no restart IP",
- KERNEL),
- BITCLR(MCI_STATUS_UC, KEEP, "Corrected error", NOSER),
- MASK(MCI_STATUS_OVER|MCI_STATUS_UC|MCI_STATUS_EN, MCI_STATUS_UC, SOME,
- "Spurious not enabled", SER),
-
- /* ignore OVER for UCNA */
- MASK(MCI_UC_SAR, MCI_STATUS_UC, KEEP,
- "Uncorrected no action required", SER),
- MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR, PANIC,
- "Illegal combination (UCNA with AR=1)", SER),
- MASK(MCI_STATUS_S, 0, KEEP, "Non signalled machine check", SER),
-
- /* AR add known MCACODs here */
- MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_SAR, PANIC,
- "Action required with lost events", SER),
- MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_SAR, PANIC,
- "Action required; unknown MCACOD", SER),
-
- /* known AO MCACODs: */
- MASK(MCI_UC_SAR|MCI_STATUS_OVER|0xfff0, MCI_UC_S|0xc0, AO,
- "Action optional: memory scrubbing error", SER),
- MASK(MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_S|0x17a, AO,
- "Action optional: last level cache writeback error", SER),
-
- MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S, SOME,
- "Action optional unknown MCACOD", SER),
- MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S|MCI_STATUS_OVER, SOME,
- "Action optional with lost events", SER),
- BITSET(MCI_STATUS_UC|MCI_STATUS_OVER, PANIC, "Overflowed uncorrected"),
- BITSET(MCI_STATUS_UC, UC, "Uncorrected"),
- BITSET(0, SOME, "No match") /* always matches. keep at end */
-};
-
-/*
- * If the EIPV bit is set, it means the saved IP is the
- * instruction which caused the MCE.
- */
-static int error_context(struct mce *m)
-{
- if (m->mcgstatus & MCG_STATUS_EIPV)
- return (m->ip && (m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
- /* Unknown, assume kernel */
- return IN_KERNEL;
-}
-
-int mce_severity(struct mce *a, int tolerant, char **msg)
-{
- enum context ctx = error_context(a);
- struct severity *s;
-
- for (s = severities;; s++) {
- if ((a->status & s->mask) != s->result)
- continue;
- if ((a->mcgstatus & s->mcgmask) != s->mcgres)
- continue;
- if (s->ser == SER_REQUIRED && !mce_ser)
- continue;
- if (s->ser == NO_SER && mce_ser)
- continue;
- if (s->context && ctx != s->context)
- continue;
- if (msg)
- *msg = s->msg;
- s->covered = 1;
- if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) {
- if (panic_on_oops || tolerant < 1)
- return MCE_PANIC_SEVERITY;
- }
- return s->sev;
- }
-}
-
-static void *s_start(struct seq_file *f, loff_t *pos)
-{
- if (*pos >= ARRAY_SIZE(severities))
- return NULL;
- return &severities[*pos];
-}
-
-static void *s_next(struct seq_file *f, void *data, loff_t *pos)
-{
- if (++(*pos) >= ARRAY_SIZE(severities))
- return NULL;
- return &severities[*pos];
-}
-
-static void s_stop(struct seq_file *f, void *data)
-{
-}
-
-static int s_show(struct seq_file *f, void *data)
-{
- struct severity *ser = data;
- seq_printf(f, "%d\t%s\n", ser->covered, ser->msg);
- return 0;
-}
-
-static const struct seq_operations severities_seq_ops = {
- .start = s_start,
- .next = s_next,
- .stop = s_stop,
- .show = s_show,
-};
-
-static int severities_coverage_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &severities_seq_ops);
-}
-
-static ssize_t severities_coverage_write(struct file *file,
- const char __user *ubuf,
- size_t count, loff_t *ppos)
-{
- int i;
- for (i = 0; i < ARRAY_SIZE(severities); i++)
- severities[i].covered = 0;
- return count;
-}
-
-static const struct file_operations severities_coverage_fops = {
- .open = severities_coverage_open,
- .release = seq_release,
- .read = seq_read,
- .write = severities_coverage_write,
-};
-
-static int __init severities_debugfs_init(void)
-{
- struct dentry *dmce = NULL, *fseverities_coverage = NULL;
-
- dmce = debugfs_create_dir("mce", NULL);
- if (dmce == NULL)
- goto err_out;
- fseverities_coverage = debugfs_create_file("severities-coverage",
- 0444, dmce, NULL,
- &severities_coverage_fops);
- if (fseverities_coverage == NULL)
- goto err_out;
-
- return 0;
-
-err_out:
- if (fseverities_coverage)
- debugfs_remove(fseverities_coverage);
- if (dmce)
- debugfs_remove(dmce);
- return -ENOMEM;
-}
-late_initcall(severities_debugfs_init);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
deleted file mode 100644
index 1cfb623ce11c..000000000000
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ /dev/null
@@ -1,2047 +0,0 @@
-/*
- * Machine check handler.
- *
- * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
- * Rest from unknown author(s).
- * 2004 Andi Kleen. Rewrote most of it.
- * Copyright 2008 Intel Corporation
- * Author: Andi Kleen
- */
-#include <linux/thread_info.h>
-#include <linux/capability.h>
-#include <linux/miscdevice.h>
-#include <linux/interrupt.h>
-#include <linux/ratelimit.h>
-#include <linux/kallsyms.h>
-#include <linux/rcupdate.h>
-#include <linux/kobject.h>
-#include <linux/uaccess.h>
-#include <linux/kdebug.h>
-#include <linux/kernel.h>
-#include <linux/percpu.h>
-#include <linux/string.h>
-#include <linux/sysdev.h>
-#include <linux/delay.h>
-#include <linux/ctype.h>
-#include <linux/sched.h>
-#include <linux/sysfs.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/kmod.h>
-#include <linux/poll.h>
-#include <linux/nmi.h>
-#include <linux/cpu.h>
-#include <linux/smp.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-
-#include <asm/processor.h>
-#include <asm/hw_irq.h>
-#include <asm/apic.h>
-#include <asm/idle.h>
-#include <asm/ipi.h>
-#include <asm/mce.h>
-#include <asm/msr.h>
-
-#include "mce-internal.h"
-
-/* Handle unconfigured int18 (should never happen) */
-static void unexpected_machine_check(struct pt_regs *regs, long error_code)
-{
- printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
- smp_processor_id());
-}
-
-/* Call the installed machine check handler for this CPU setup. */
-void (*machine_check_vector)(struct pt_regs *, long error_code) =
- unexpected_machine_check;
-
-int mce_disabled __read_mostly;
-
-#ifdef CONFIG_X86_NEW_MCE
-
-#define MISC_MCELOG_MINOR 227
-
-#define SPINUNIT 100 /* 100ns */
-
-atomic_t mce_entry;
-
-DEFINE_PER_CPU(unsigned, mce_exception_count);
-
-/*
- * Tolerant levels:
- * 0: always panic on uncorrected errors, log corrected errors
- * 1: panic or SIGBUS on uncorrected errors, log corrected errors
- * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
- * 3: never panic or SIGBUS, log all errors (for testing only)
- */
-static int tolerant __read_mostly = 1;
-static int banks __read_mostly;
-static u64 *bank __read_mostly;
-static int rip_msr __read_mostly;
-static int mce_bootlog __read_mostly = -1;
-static int monarch_timeout __read_mostly = -1;
-static int mce_panic_timeout __read_mostly;
-static int mce_dont_log_ce __read_mostly;
-int mce_cmci_disabled __read_mostly;
-int mce_ignore_ce __read_mostly;
-int mce_ser __read_mostly;
-
-/* User mode helper program triggered by machine check event */
-static unsigned long mce_need_notify;
-static char mce_helper[128];
-static char *mce_helper_argv[2] = { mce_helper, NULL };
-
-static unsigned long dont_init_banks;
-
-static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
-static DEFINE_PER_CPU(struct mce, mces_seen);
-static int cpu_missing;
-
-
-/* MCA banks polled by the period polling timer for corrected events */
-DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
- [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
-};
-
-static inline int skip_bank_init(int i)
-{
- return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
-}
-
-static DEFINE_PER_CPU(struct work_struct, mce_work);
-
-/* Do initial initialization of a struct mce */
-void mce_setup(struct mce *m)
-{
- memset(m, 0, sizeof(struct mce));
- m->cpu = m->extcpu = smp_processor_id();
- rdtscll(m->tsc);
- /* We hope get_seconds stays lockless */
- m->time = get_seconds();
- m->cpuvendor = boot_cpu_data.x86_vendor;
- m->cpuid = cpuid_eax(1);
-#ifdef CONFIG_SMP
- m->socketid = cpu_data(m->extcpu).phys_proc_id;
-#endif
- m->apicid = cpu_data(m->extcpu).initial_apicid;
- rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
-}
-
-DEFINE_PER_CPU(struct mce, injectm);
-EXPORT_PER_CPU_SYMBOL_GPL(injectm);
-
-/*
- * Lockless MCE logging infrastructure.
- * This avoids deadlocks on printk locks without having to break locks. Also
- * separate MCEs from kernel messages to avoid bogus bug reports.
- */
-
-static struct mce_log mcelog = {
- .signature = MCE_LOG_SIGNATURE,
- .len = MCE_LOG_LEN,
- .recordlen = sizeof(struct mce),
-};
-
-void mce_log(struct mce *mce)
-{
- unsigned next, entry;
-
- mce->finished = 0;
- wmb();
- for (;;) {
- entry = rcu_dereference(mcelog.next);
- for (;;) {
- /*
- * When the buffer fills up discard new entries.
- * Assume that the earlier errors are the more
- * interesting ones:
- */
- if (entry >= MCE_LOG_LEN) {
- set_bit(MCE_OVERFLOW,
- (unsigned long *)&mcelog.flags);
- return;
- }
- /* Old left over entry. Skip: */
- if (mcelog.entry[entry].finished) {
- entry++;
- continue;
- }
- break;
- }
- smp_rmb();
- next = entry + 1;
- if (cmpxchg(&mcelog.next, entry, next) == entry)
- break;
- }
- memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
- wmb();
- mcelog.entry[entry].finished = 1;
- wmb();
-
- mce->finished = 1;
- set_bit(0, &mce_need_notify);
-}
-
-static void print_mce(struct mce *m)
-{
- printk(KERN_EMERG
- "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
- m->extcpu, m->mcgstatus, m->bank, m->status);
- if (m->ip) {
- printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
- !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
- m->cs, m->ip);
- if (m->cs == __KERNEL_CS)
- print_symbol("{%s}", m->ip);
- printk(KERN_CONT "\n");
- }
- printk(KERN_EMERG "TSC %llx ", m->tsc);
- if (m->addr)
- printk(KERN_CONT "ADDR %llx ", m->addr);
- if (m->misc)
- printk(KERN_CONT "MISC %llx ", m->misc);
- printk(KERN_CONT "\n");
- printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
- m->cpuvendor, m->cpuid, m->time, m->socketid,
- m->apicid);
-}
-
-static void print_mce_head(void)
-{
- printk(KERN_EMERG "\nHARDWARE ERROR\n");
-}
-
-static void print_mce_tail(void)
-{
- printk(KERN_EMERG "This is not a software problem!\n"
- "Run through mcelog --ascii to decode and contact your hardware vendor\n");
-}
-
-#define PANIC_TIMEOUT 5 /* 5 seconds */
-
-static atomic_t mce_paniced;
-
-/* Panic in progress. Enable interrupts and wait for final IPI */
-static void wait_for_panic(void)
-{
- long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
- preempt_disable();
- local_irq_enable();
- while (timeout-- > 0)
- udelay(1);
- if (panic_timeout == 0)
- panic_timeout = mce_panic_timeout;
- panic("Panicing machine check CPU died");
-}
-
-static void mce_panic(char *msg, struct mce *final, char *exp)
-{
- int i;
-
- /*
- * Make sure only one CPU runs in machine check panic
- */
- if (atomic_add_return(1, &mce_paniced) > 1)
- wait_for_panic();
- barrier();
-
- bust_spinlocks(1);
- console_verbose();
- print_mce_head();
- /* First print corrected ones that are still unlogged */
- for (i = 0; i < MCE_LOG_LEN; i++) {
- struct mce *m = &mcelog.entry[i];
- if (!(m->status & MCI_STATUS_VAL))
- continue;
- if (!(m->status & MCI_STATUS_UC))
- print_mce(m);
- }
- /* Now print uncorrected but with the final one last */
- for (i = 0; i < MCE_LOG_LEN; i++) {
- struct mce *m = &mcelog.entry[i];
- if (!(m->status & MCI_STATUS_VAL))
- continue;
- if (!(m->status & MCI_STATUS_UC))
- continue;
- if (!final || memcmp(m, final, sizeof(struct mce)))
- print_mce(m);
- }
- if (final)
- print_mce(final);
- if (cpu_missing)
- printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
- print_mce_tail();
- if (exp)
- printk(KERN_EMERG "Machine check: %s\n", exp);
- if (panic_timeout == 0)
- panic_timeout = mce_panic_timeout;
- panic(msg);
-}
-
-/* Support code for software error injection */
-
-static int msr_to_offset(u32 msr)
-{
- unsigned bank = __get_cpu_var(injectm.bank);
- if (msr == rip_msr)
- return offsetof(struct mce, ip);
- if (msr == MSR_IA32_MC0_STATUS + bank*4)
- return offsetof(struct mce, status);
- if (msr == MSR_IA32_MC0_ADDR + bank*4)
- return offsetof(struct mce, addr);
- if (msr == MSR_IA32_MC0_MISC + bank*4)
- return offsetof(struct mce, misc);
- if (msr == MSR_IA32_MCG_STATUS)
- return offsetof(struct mce, mcgstatus);
- return -1;
-}
-
-/* MSR access wrappers used for error injection */
-static u64 mce_rdmsrl(u32 msr)
-{
- u64 v;
- if (__get_cpu_var(injectm).finished) {
- int offset = msr_to_offset(msr);
- if (offset < 0)
- return 0;
- return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
- }
- rdmsrl(msr, v);
- return v;
-}
-
-static void mce_wrmsrl(u32 msr, u64 v)
-{
- if (__get_cpu_var(injectm).finished) {
- int offset = msr_to_offset(msr);
- if (offset >= 0)
- *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
- return;
- }
- wrmsrl(msr, v);
-}
-
-/*
- * Simple lockless ring to communicate PFNs from the exception handler with the
- * process context work function. This is vastly simplified because there's
- * only a single reader and a single writer.
- */
-#define MCE_RING_SIZE 16 /* we use one entry less */
-
-struct mce_ring {
- unsigned short start;
- unsigned short end;
- unsigned long ring[MCE_RING_SIZE];
-};
-static DEFINE_PER_CPU(struct mce_ring, mce_ring);
-
-/* Runs with CPU affinity in workqueue */
-static int mce_ring_empty(void)
-{
- struct mce_ring *r = &__get_cpu_var(mce_ring);
-
- return r->start == r->end;
-}
-
-static int mce_ring_get(unsigned long *pfn)
-{
- struct mce_ring *r;
- int ret = 0;
-
- *pfn = 0;
- get_cpu();
- r = &__get_cpu_var(mce_ring);
- if (r->start == r->end)
- goto out;
- *pfn = r->ring[r->start];
- r->start = (r->start + 1) % MCE_RING_SIZE;
- ret = 1;
-out:
- put_cpu();
- return ret;
-}
-
-/* Always runs in MCE context with preempt off */
-static int mce_ring_add(unsigned long pfn)
-{
- struct mce_ring *r = &__get_cpu_var(mce_ring);
- unsigned next;
-
- next = (r->end + 1) % MCE_RING_SIZE;
- if (next == r->start)
- return -1;
- r->ring[r->end] = pfn;
- wmb();
- r->end = next;
- return 0;
-}
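
A standalone sketch may make the "one entry less" rule above clearer: the ring counts as full when advancing the producer index would collide with the consumer index, so a 16-slot ring stores at most 15 PFNs. This is only an illustration of the scheme, not the kernel code:

#include <stdio.h>

#define RING_SIZE 16

struct ring {
	unsigned short start;		/* consumer index */
	unsigned short end;		/* producer index */
	unsigned long slot[RING_SIZE];
};

static int ring_add(struct ring *r, unsigned long v)
{
	unsigned next = (r->end + 1) % RING_SIZE;

	if (next == r->start)
		return -1;		/* full: one slot always stays unused */
	r->slot[r->end] = v;
	r->end = next;			/* publish only after the store */
	return 0;
}

static int ring_get(struct ring *r, unsigned long *v)
{
	if (r->start == r->end)
		return 0;		/* empty */
	*v = r->slot[r->start];
	r->start = (r->start + 1) % RING_SIZE;
	return 1;
}

int main(void)
{
	struct ring r = { 0 };
	unsigned long v;
	int i, queued = 0;

	for (i = 0; i < RING_SIZE + 3; i++)
		if (ring_add(&r, i) == 0)
			queued++;
	printf("queued %d of %d\n", queued, RING_SIZE + 3);	/* 15 of 19 */
	while (ring_get(&r, &v))
		;
	return 0;
}
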
-
-int mce_available(struct cpuinfo_x86 *c)
-{
- if (mce_disabled)
- return 0;
- return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
-}
-
-static void mce_schedule_work(void)
-{
- if (!mce_ring_empty()) {
- struct work_struct *work = &__get_cpu_var(mce_work);
- if (!work_pending(work))
- schedule_work(work);
- }
-}
-
-/*
- * Get the address of the instruction at the time of the machine check
- * error.
- */
-static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
-{
-
- if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
- m->ip = regs->ip;
- m->cs = regs->cs;
- } else {
- m->ip = 0;
- m->cs = 0;
- }
- if (rip_msr)
- m->ip = mce_rdmsrl(rip_msr);
-}
-
-#ifdef CONFIG_X86_LOCAL_APIC
-/*
- * Called after interrupts have been re-enabled again
- * when an MCE happened during an interrupts-off region
- * in the kernel.
- */
-asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
-{
- ack_APIC_irq();
- exit_idle();
- irq_enter();
- mce_notify_irq();
- mce_schedule_work();
- irq_exit();
-}
-#endif
-
-static void mce_report_event(struct pt_regs *regs)
-{
- if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
- mce_notify_irq();
- /*
- * Triggering the work queue here is just an insurance
- * policy in case the syscall exit notify handler
- * doesn't run soon enough or ends up running on the
- * wrong CPU (can happen when audit sleeps)
- */
- mce_schedule_work();
- return;
- }
-
-#ifdef CONFIG_X86_LOCAL_APIC
- /*
- * Without APIC do not notify. The event will be picked
- * up eventually.
- */
- if (!cpu_has_apic)
- return;
-
- /*
-	 * When interrupts are disabled we cannot use
-	 * kernel services safely. Trigger a self interrupt
-	 * through the APIC to instead do the notification
-	 * after interrupts are re-enabled again.
- */
- apic->send_IPI_self(MCE_SELF_VECTOR);
-
- /*
- * Wait for idle afterwards again so that we don't leave the
-	 * APIC in a non-idle state because the normal APIC writes
- * cannot exclude us.
- */
- apic_wait_icr_idle();
-#endif
-}
-
-DEFINE_PER_CPU(unsigned, mce_poll_count);
-
-/*
- * Poll for corrected events or events that happened before reset.
- * Those are just logged through /dev/mcelog.
- *
- * This is executed in standard interrupt context.
- *
- * Note: the spec recommends panicking for fatal unsignalled
- * errors here. However, this would be quite problematic --
- * we would need to reimplement the Monarch handling and
- * it would mess up the exclusion between the exception handler
- * and the poll handler -- so we skip this for now.
- * These cases should not happen anyway, or only when the CPU
- * is already totally confused. In that case it is likely it will
- * not fully execute the machine check handler either.
- */
-void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
-{
- struct mce m;
- int i;
-
- __get_cpu_var(mce_poll_count)++;
-
- mce_setup(&m);
-
- m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
- for (i = 0; i < banks; i++) {
- if (!bank[i] || !test_bit(i, *b))
- continue;
-
- m.misc = 0;
- m.addr = 0;
- m.bank = i;
- m.tsc = 0;
-
- barrier();
- m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
- if (!(m.status & MCI_STATUS_VAL))
- continue;
-
- /*
- * Uncorrected or signalled events are handled by the exception
- * handler when it is enabled, so don't process those here.
- *
- * TBD do the same check for MCI_STATUS_EN here?
- */
- if (!(flags & MCP_UC) &&
- (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
- continue;
-
- if (m.status & MCI_STATUS_MISCV)
- m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
- if (m.status & MCI_STATUS_ADDRV)
- m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
-
- if (!(flags & MCP_TIMESTAMP))
- m.tsc = 0;
- /*
- * Don't get the IP here because it's unlikely to
- * have anything to do with the actual error location.
- */
- if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
- mce_log(&m);
- add_taint(TAINT_MACHINE_CHECK);
- }
-
- /*
- * Clear state for this bank.
- */
- mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
- }
-
- /*
- * Don't clear MCG_STATUS here because it's only defined for
- * exceptions.
- */
-
- sync_core();
-}
-EXPORT_SYMBOL_GPL(machine_check_poll);
-
-/*
- * Do a quick check if any of the events requires a panic.
- * This decides if we keep the events around or clear them.
- */
-static int mce_no_way_out(struct mce *m, char **msg)
-{
- int i;
-
- for (i = 0; i < banks; i++) {
- m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
- if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
- return 1;
- }
- return 0;
-}
-
-/*
- * Variable to establish order between CPUs while scanning.
- * Each CPU spins initially until executing is equal its number.
- */
-static atomic_t mce_executing;
-
-/*
- * Defines order of CPUs on entry. First CPU becomes Monarch.
- */
-static atomic_t mce_callin;
-
-/*
- * Check if a timeout waiting for other CPUs happened.
- */
-static int mce_timed_out(u64 *t)
-{
- /*
- * The others already did panic for some reason.
- * Bail out like in a timeout.
- * rmb() to tell the compiler that system_state
- * might have been modified by someone else.
- */
- rmb();
- if (atomic_read(&mce_paniced))
- wait_for_panic();
- if (!monarch_timeout)
- goto out;
- if ((s64)*t < SPINUNIT) {
- /* CHECKME: Make panic default for 1 too? */
- if (tolerant < 1)
- mce_panic("Timeout synchronizing machine check over CPUs",
- NULL, NULL);
- cpu_missing = 1;
- return 1;
- }
- *t -= SPINUNIT;
-out:
- touch_nmi_watchdog();
- return 0;
-}
-
-/*
- * The Monarch's reign. The Monarch is the CPU that entered
- * the machine check handler first. It waits for the others to
- * raise the exception too and then grades them. If any
- * error is fatal it panics. Only then does it let the others continue.
- *
- * The other CPUs entering the MCE handler are controlled by the
- * Monarch. They are called Subjects.
- *
- * This way we prevent any potential data corruption in an unrecoverable case
- * and also make sure that all CPUs' errors are always examined.
- *
- * This also detects the case of a machine check event coming from outer
- * space (not detected by any CPU). In this case some external agent wants
- * us to shut down, so panic too.
- *
- * The other CPUs might still decide to panic if the handler happens
- * in an unrecoverable place, but in this case the system is in a semi-stable
- * state and won't corrupt anything by itself. It's OK to let the others
- * continue for a bit first.
- *
- * All the spin loops have timeouts; when a timeout happens a CPU
- * typically elects itself to be Monarch.
- */
-static void mce_reign(void)
-{
- int cpu;
- struct mce *m = NULL;
- int global_worst = 0;
- char *msg = NULL;
- char *nmsg = NULL;
-
- /*
- * This CPU is the Monarch and the other CPUs have run
- * through their handlers.
- * Grade the severity of the errors of all the CPUs.
- */
- for_each_possible_cpu(cpu) {
- int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
- &nmsg);
- if (severity > global_worst) {
- msg = nmsg;
- global_worst = severity;
- m = &per_cpu(mces_seen, cpu);
- }
- }
-
- /*
- * Cannot recover? Panic here then.
- * This dumps all the mces in the log buffer and stops the
- * other CPUs.
- */
- if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
- mce_panic("Fatal Machine check", m, msg);
-
- /*
-	 * For a UC somewhere we let the CPU that detects it handle it.
-	 * We also must let the others continue, otherwise the handling
- * CPU could deadlock on a lock.
- */
-
- /*
- * No machine check event found. Must be some external
- * source or one CPU is hung. Panic.
- */
- if (!m && tolerant < 3)
- mce_panic("Machine check from unknown source", NULL, NULL);
-
- /*
- * Now clear all the mces_seen so that they don't reappear on
- * the next mce.
- */
- for_each_possible_cpu(cpu)
- memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
-}
-
-static atomic_t global_nwo;
-
-/*
- * Start of Monarch synchronization. This waits until all CPUs have
- * entered the exception handler and then determines if any of them
- * saw a fatal event that requires a panic. Then it lets them run,
- * one at a time, in the order in which they entered (callin order).
- * TBD double check parallel CPU hotunplug
- */
-static int mce_start(int *no_way_out)
-{
- int order;
- int cpus = num_online_cpus();
- u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
-
- if (!timeout)
- return -1;
-
- atomic_add(*no_way_out, &global_nwo);
- /*
- * global_nwo should be updated before mce_callin
- */
- smp_wmb();
- order = atomic_add_return(1, &mce_callin);
-
- /*
- * Wait for everyone.
- */
- while (atomic_read(&mce_callin) != cpus) {
- if (mce_timed_out(&timeout)) {
- atomic_set(&global_nwo, 0);
- return -1;
- }
- ndelay(SPINUNIT);
- }
-
- /*
- * mce_callin should be read before global_nwo
- */
- smp_rmb();
-
- if (order == 1) {
- /*
- * Monarch: Starts executing now, the others wait.
- */
- atomic_set(&mce_executing, 1);
- } else {
- /*
- * Subject: Now start the scanning loop one by one in
- * the original callin order.
- * This way when there are any shared banks it will be
- * only seen by one CPU before cleared, avoiding duplicates.
- */
- while (atomic_read(&mce_executing) < order) {
- if (mce_timed_out(&timeout)) {
- atomic_set(&global_nwo, 0);
- return -1;
- }
- ndelay(SPINUNIT);
- }
- }
-
- /*
- * Cache the global no_way_out state.
- */
- *no_way_out = atomic_read(&global_nwo);
-
- return order;
-}
-
-/*
- * Synchronize between CPUs after main scanning loop.
- * This invokes the bulk of the Monarch processing.
- */
-static int mce_end(int order)
-{
- int ret = -1;
- u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
-
- if (!timeout)
- goto reset;
- if (order < 0)
- goto reset;
-
- /*
- * Allow others to run.
- */
- atomic_inc(&mce_executing);
-
- if (order == 1) {
- /* CHECKME: Can this race with a parallel hotplug? */
- int cpus = num_online_cpus();
-
- /*
- * Monarch: Wait for everyone to go through their scanning
- * loops.
- */
- while (atomic_read(&mce_executing) <= cpus) {
- if (mce_timed_out(&timeout))
- goto reset;
- ndelay(SPINUNIT);
- }
-
- mce_reign();
- barrier();
- ret = 0;
- } else {
- /*
- * Subject: Wait for Monarch to finish.
- */
- while (atomic_read(&mce_executing) != 0) {
- if (mce_timed_out(&timeout))
- goto reset;
- ndelay(SPINUNIT);
- }
-
- /*
- * Don't reset anything. That's done by the Monarch.
- */
- return 0;
- }
-
- /*
- * Reset all global state.
- */
-reset:
- atomic_set(&global_nwo, 0);
- atomic_set(&mce_callin, 0);
- barrier();
-
- /*
- * Let others run again.
- */
- atomic_set(&mce_executing, 0);
- return ret;
-}
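
For experimenting outside the kernel, here is a rough pthread model of the mce_start()/mce_end() rendezvous: each thread takes a callin number, the first becomes the Monarch, and the others spin until the executing counter reaches their own number, so the bank scanning runs strictly in callin order. Timeouts, no_way_out propagation and the reset path are left out of this sketch:

#include <stdio.h>
#include <stdint.h>
#include <pthread.h>
#include <stdatomic.h>

#define NCPUS 4

static atomic_int callin;	/* entry order, first CPU is the Monarch */
static atomic_int executing;	/* how many CPUs have finished scanning */

static void *cpu_thread(void *arg)
{
	int order = atomic_fetch_add(&callin, 1) + 1;

	/* Wait until everyone has entered the "handler". */
	while (atomic_load(&callin) != NCPUS)
		;

	/* Subjects wait for their turn; the Monarch (order == 1) goes first. */
	while (atomic_load(&executing) < order - 1)
		;

	printf("cpu %ld scans its banks as #%d\n", (long)(intptr_t)arg, order);

	/* Let the next CPU in callin order run. */
	atomic_fetch_add(&executing, 1);

	/* Monarch: wait for all Subjects, then "reign" (grade severities). */
	if (order == 1) {
		while (atomic_load(&executing) < NCPUS)
			;
		printf("monarch grades the results\n");
	}
	return NULL;
}

int main(void)
{
	pthread_t t[NCPUS];
	long i;

	for (i = 0; i < NCPUS; i++)
		pthread_create(&t[i], NULL, cpu_thread, (void *)(intptr_t)i);
	for (i = 0; i < NCPUS; i++)
		pthread_join(t[i], NULL);
	return 0;
}
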
-
-/*
- * Check if the address reported by the CPU is in a format we can parse.
- * It would be possible to add code for most other cases, but all would
- * be somewhat complicated (e.g. segment offset would require an instruction
- * parser). So only support physical addresses up to page granularity for now.
- */
-static int mce_usable_address(struct mce *m)
-{
- if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
- return 0;
- if ((m->misc & 0x3f) > PAGE_SHIFT)
- return 0;
- if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS)
- return 0;
- return 1;
-}
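
The masks above read MCi_MISC as follows: bits 5:0 hold the least significant valid bit of the recoverable address (a value above PAGE_SHIFT means the address is coarser than a page) and bits 8:6 hold the address mode, with MCM_ADDR_PHYS meaning a physical address (taken as 2 in this sketch; treat the constants as illustrative):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT	12
#define MCM_ADDR_PHYS	2	/* physical address mode (assumed value) */

static int usable_address(uint64_t misc)
{
	unsigned lsb  = misc & 0x3f;		/* bits 5:0: address LSB */
	unsigned mode = (misc >> 6) & 0x7;	/* bits 8:6: address mode */

	if (lsb > PAGE_SHIFT)			/* coarser than a page */
		return 0;
	if (mode != MCM_ADDR_PHYS)		/* not a physical address */
		return 0;
	return 1;
}

int main(void)
{
	/* mode = physical, LSB = 12 -> usable at page granularity */
	uint64_t misc = ((uint64_t)MCM_ADDR_PHYS << 6) | 12;

	printf("usable=%d\n", usable_address(misc));
	return 0;
}
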
-
-static void mce_clear_state(unsigned long *toclear)
-{
- int i;
-
- for (i = 0; i < banks; i++) {
- if (test_bit(i, toclear))
- mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
- }
-}
-
-/*
- * The actual machine check handler. This only handles real
- * exceptions when something got corrupted coming in through int 18.
- *
- * This is executed in NMI context not subject to normal locking rules. This
- * implies that most kernel services cannot be safely used. Don't even
- * think about putting a printk in there!
- *
- * On Intel systems this is entered on all CPUs in parallel through
- * MCE broadcast. However some CPUs might be broken beyond repair,
- * so always be careful when synchronizing with others.
- */
-void do_machine_check(struct pt_regs *regs, long error_code)
-{
- struct mce m, *final;
- int i;
- int worst = 0;
- int severity;
- /*
- * Establish sequential order between the CPUs entering the machine
- * check handler.
- */
- int order;
- /*
- * If no_way_out gets set, there is no safe way to recover from this
- * MCE. If tolerant is cranked up, we'll try anyway.
- */
- int no_way_out = 0;
- /*
- * If kill_it gets set, there might be a way to recover from this
- * error.
- */
- int kill_it = 0;
- DECLARE_BITMAP(toclear, MAX_NR_BANKS);
- char *msg = "Unknown";
-
- atomic_inc(&mce_entry);
-
- __get_cpu_var(mce_exception_count)++;
-
- if (notify_die(DIE_NMI, "machine check", regs, error_code,
- 18, SIGKILL) == NOTIFY_STOP)
- goto out;
- if (!banks)
- goto out;
-
- mce_setup(&m);
-
- m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
- no_way_out = mce_no_way_out(&m, &msg);
-
- final = &__get_cpu_var(mces_seen);
- *final = m;
-
- barrier();
-
- /*
-	 * When there is no restart IP we must always kill or panic.
- */
- if (!(m.mcgstatus & MCG_STATUS_RIPV))
- kill_it = 1;
-
- /*
- * Go through all the banks in exclusion of the other CPUs.
- * This way we don't report duplicated events on shared banks
- * because the first one to see it will clear it.
- */
- order = mce_start(&no_way_out);
- for (i = 0; i < banks; i++) {
- __clear_bit(i, toclear);
- if (!bank[i])
- continue;
-
- m.misc = 0;
- m.addr = 0;
- m.bank = i;
-
- m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
- if ((m.status & MCI_STATUS_VAL) == 0)
- continue;
-
- /*
-		 * Non-uncorrected or non-signaled errors are handled by
- * machine_check_poll. Leave them alone, unless this panics.
- */
- if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
- !no_way_out)
- continue;
-
- /*
- * Set taint even when machine check was not enabled.
- */
- add_taint(TAINT_MACHINE_CHECK);
-
- severity = mce_severity(&m, tolerant, NULL);
-
- /*
-		 * When the machine check was for the corrected handler don't
-		 * touch it, unless we're panicking.
- */
- if (severity == MCE_KEEP_SEVERITY && !no_way_out)
- continue;
- __set_bit(i, toclear);
- if (severity == MCE_NO_SEVERITY) {
- /*
- * Machine check event was not enabled. Clear, but
- * ignore.
- */
- continue;
- }
-
- /*
- * Kill on action required.
- */
- if (severity == MCE_AR_SEVERITY)
- kill_it = 1;
-
- if (m.status & MCI_STATUS_MISCV)
- m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
- if (m.status & MCI_STATUS_ADDRV)
- m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
-
- /*
- * Action optional error. Queue address for later processing.
- * When the ring overflows we just ignore the AO error.
- * RED-PEN add some logging mechanism when
-		 * usable_address or mce_ring_add fails.
- * RED-PEN don't ignore overflow for tolerant == 0
- */
- if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
- mce_ring_add(m.addr >> PAGE_SHIFT);
-
- mce_get_rip(&m, regs);
- mce_log(&m);
-
- if (severity > worst) {
- *final = m;
- worst = severity;
- }
- }
-
- if (!no_way_out)
- mce_clear_state(toclear);
-
- /*
- * Do most of the synchronization with other CPUs.
- * When there's any problem use only local no_way_out state.
- */
- if (mce_end(order) < 0)
- no_way_out = worst >= MCE_PANIC_SEVERITY;
-
- /*
- * If we have decided that we just CAN'T continue, and the user
- * has not set tolerant to an insane level, give up and die.
- *
- * This is mainly used in the case when the system doesn't
- * support MCE broadcasting or it has been disabled.
- */
- if (no_way_out && tolerant < 3)
- mce_panic("Fatal machine check on current CPU", final, msg);
-
- /*
- * If the error seems to be unrecoverable, something should be
- * done. Try to kill as little as possible. If we can kill just
- * one task, do that. If the user has set the tolerance very
- * high, don't try to do anything at all.
- */
-
- if (kill_it && tolerant < 3)
- force_sig(SIGBUS, current);
-
- /* notify userspace ASAP */
- set_thread_flag(TIF_MCE_NOTIFY);
-
- if (worst > 0)
- mce_report_event(regs);
- mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
-out:
- atomic_dec(&mce_entry);
- sync_core();
-}
-EXPORT_SYMBOL_GPL(do_machine_check);
-
-/* dummy to break dependency. actual code is in mm/memory-failure.c */
-void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
-{
- printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
-}
-
-/*
- * Called after mce notification in process context. This code
- * is allowed to sleep. Call the high level VM handler to process
- * any corrupted pages.
- * Assume that the work queue code only calls this one at a time
- * per CPU.
- * Note we don't disable preemption, so this code might run on the wrong
- * CPU. In this case the event is picked up by the scheduled work queue.
- * This is merely a fast path to expedite processing in some common
- * cases.
- */
-void mce_notify_process(void)
-{
- unsigned long pfn;
- mce_notify_irq();
- while (mce_ring_get(&pfn))
- memory_failure(pfn, MCE_VECTOR);
-}
-
-static void mce_process_work(struct work_struct *dummy)
-{
- mce_notify_process();
-}
-
-#ifdef CONFIG_X86_MCE_INTEL
-/***
- * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
- * @cpu: The CPU on which the event occurred.
- * @status: Event status information
- *
- * This function should be called by the thermal interrupt after the
- * event has been processed and the decision was made to log the event
- * further.
- *
- * The status parameter will be saved to the 'status' field of 'struct mce'
- * and historically has been the register value of the
- * MSR_IA32_THERMAL_STATUS (Intel) msr.
- */
-void mce_log_therm_throt_event(__u64 status)
-{
- struct mce m;
-
- mce_setup(&m);
- m.bank = MCE_THERMAL_BANK;
- m.status = status;
- mce_log(&m);
-}
-#endif /* CONFIG_X86_MCE_INTEL */
-
-/*
- * Periodic polling timer for "silent" machine check errors. If the
- * poller finds an MCE, poll 2x faster. When the poller finds no more
- * errors, poll 2x slower (up to check_interval seconds).
- */
-static int check_interval = 5 * 60; /* 5 minutes */
-
-static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
-static DEFINE_PER_CPU(struct timer_list, mce_timer);
-
-static void mcheck_timer(unsigned long data)
-{
- struct timer_list *t = &per_cpu(mce_timer, data);
- int *n;
-
- WARN_ON(smp_processor_id() != data);
-
- if (mce_available(&current_cpu_data)) {
- machine_check_poll(MCP_TIMESTAMP,
- &__get_cpu_var(mce_poll_banks));
- }
-
- /*
- * Alert userspace if needed. If we logged an MCE, reduce the
- * polling interval, otherwise increase the polling interval.
- */
- n = &__get_cpu_var(next_interval);
- if (mce_notify_irq())
- *n = max(*n/2, HZ/100);
- else
- *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
-
- t->expires = jiffies + *n;
- add_timer_on(t, smp_processor_id());
-}
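
The interval adaptation above is a simple two-way exponential scheme: halve the delay (down to HZ/100) after an interval that logged an event, double it (up to check_interval) after a quiet one. A userspace sketch of just that arithmetic:

#include <stdio.h>

#define HZ		1000
#define CHECK_INTERVAL	(5 * 60)		/* seconds */

static int next_interval(int cur, int logged_event)
{
	int max = CHECK_INTERVAL * HZ;
	int min = HZ / 100;

	if (logged_event)
		cur = cur / 2 > min ? cur / 2 : min;	/* poll 2x faster */
	else
		cur = cur * 2 < max ? cur * 2 : max;	/* poll 2x slower */
	return cur;
}

int main(void)
{
	int n = CHECK_INTERVAL * HZ;
	int i;

	for (i = 0; i < 5; i++) {		/* a burst of logged events */
		n = next_interval(n, 1);
		printf("after event: %d jiffies\n", n);
	}
	for (i = 0; i < 5; i++) {		/* quiet again */
		n = next_interval(n, 0);
		printf("quiet:       %d jiffies\n", n);
	}
	return 0;
}
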
-
-static void mce_do_trigger(struct work_struct *work)
-{
- call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
-}
-
-static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
-
-/*
- * Notify the user(s) about new machine check events.
- * Can be called from interrupt context, but not from machine check/NMI
- * context.
- */
-int mce_notify_irq(void)
-{
- /* Not more than two messages every minute */
- static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
-
- clear_thread_flag(TIF_MCE_NOTIFY);
-
- if (test_and_clear_bit(0, &mce_need_notify)) {
- wake_up_interruptible(&mce_wait);
-
- /*
- * There is no risk of missing notifications because
- * work_pending is always cleared before the function is
- * executed.
- */
- if (mce_helper[0] && !work_pending(&mce_trigger_work))
- schedule_work(&mce_trigger_work);
-
- if (__ratelimit(&ratelimit))
- printk(KERN_INFO "Machine check events logged\n");
-
- return 1;
- }
- return 0;
-}
-EXPORT_SYMBOL_GPL(mce_notify_irq);
-
-/*
- * Initialize Machine Checks for a CPU.
- */
-static int mce_cap_init(void)
-{
- unsigned b;
- u64 cap;
-
- rdmsrl(MSR_IA32_MCG_CAP, cap);
-
- b = cap & MCG_BANKCNT_MASK;
- printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
-
- if (b > MAX_NR_BANKS) {
- printk(KERN_WARNING
- "MCE: Using only %u machine check banks out of %u\n",
- MAX_NR_BANKS, b);
- b = MAX_NR_BANKS;
- }
-
- /* Don't support asymmetric configurations today */
- WARN_ON(banks != 0 && b != banks);
- banks = b;
- if (!bank) {
- bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
- if (!bank)
- return -ENOMEM;
- memset(bank, 0xff, banks * sizeof(u64));
- }
-
- /* Use accurate RIP reporting if available. */
- if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
- rip_msr = MSR_IA32_MCG_EIP;
-
- if (cap & MCG_SER_P)
- mce_ser = 1;
-
- return 0;
-}
-
-static void mce_init(void)
-{
- mce_banks_t all_banks;
- u64 cap;
- int i;
-
- /*
- * Log the machine checks left over from the previous reset.
- */
- bitmap_fill(all_banks, MAX_NR_BANKS);
- machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
-
- set_in_cr4(X86_CR4_MCE);
-
- rdmsrl(MSR_IA32_MCG_CAP, cap);
- if (cap & MCG_CTL_P)
- wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
-
- for (i = 0; i < banks; i++) {
- if (skip_bank_init(i))
- continue;
- wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
- wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
- }
-}
-
-/* Add per CPU specific workarounds here */
-static void mce_cpu_quirks(struct cpuinfo_x86 *c)
-{
- /* This should be disabled by the BIOS, but isn't always */
- if (c->x86_vendor == X86_VENDOR_AMD) {
- if (c->x86 == 15 && banks > 4) {
- /*
- * disable GART TBL walk error reporting, which
- * trips off incorrectly with the IOMMU & 3ware
- * & Cerberus:
- */
- clear_bit(10, (unsigned long *)&bank[4]);
- }
- if (c->x86 <= 17 && mce_bootlog < 0) {
- /*
-			 * Lots of broken BIOSes around that don't clear them
- * by default and leave crap in there. Don't log:
- */
- mce_bootlog = 0;
- }
- /*
- * Various K7s with broken bank 0 around. Always disable
- * by default.
- */
- if (c->x86 == 6 && banks > 0)
- bank[0] = 0;
- }
-
- if (c->x86_vendor == X86_VENDOR_INTEL) {
- /*
-		 * The SDM documents that on family 6 bank 0 should not be
-		 * written because it aliases to another special BIOS-controlled
-		 * register.
-		 * It is no longer aliased on model 0x1a and later.
-		 * Don't ignore bank 0 completely because there could be a
-		 * valid event later; merely don't write CTL0.
- */
-
- if (c->x86 == 6 && c->x86_model < 0x1A)
- __set_bit(0, &dont_init_banks);
-
- /*
- * All newer Intel systems support MCE broadcasting. Enable
- * synchronization with a one second timeout.
- */
- if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
- monarch_timeout < 0)
- monarch_timeout = USEC_PER_SEC;
- }
- if (monarch_timeout < 0)
- monarch_timeout = 0;
- if (mce_bootlog != 0)
- mce_panic_timeout = 30;
-}
-
-static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
-{
- if (c->x86 != 5)
- return;
- switch (c->x86_vendor) {
- case X86_VENDOR_INTEL:
- intel_p5_mcheck_init(c);
- break;
- case X86_VENDOR_CENTAUR:
- winchip_mcheck_init(c);
- break;
- }
-}
-
-static void mce_cpu_features(struct cpuinfo_x86 *c)
-{
- switch (c->x86_vendor) {
- case X86_VENDOR_INTEL:
- mce_intel_feature_init(c);
- break;
- case X86_VENDOR_AMD:
- mce_amd_feature_init(c);
- break;
- default:
- break;
- }
-}
-
-static void mce_init_timer(void)
-{
- struct timer_list *t = &__get_cpu_var(mce_timer);
- int *n = &__get_cpu_var(next_interval);
-
- if (mce_ignore_ce)
- return;
-
- *n = check_interval * HZ;
- if (!*n)
- return;
- setup_timer(t, mcheck_timer, smp_processor_id());
- t->expires = round_jiffies(jiffies + *n);
- add_timer_on(t, smp_processor_id());
-}
-
-/*
- * Called for each booted CPU to set up machine checks.
- * Must be called with preempt off:
- */
-void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
-{
- if (mce_disabled)
- return;
-
- mce_ancient_init(c);
-
- if (!mce_available(c))
- return;
-
- if (mce_cap_init() < 0) {
- mce_disabled = 1;
- return;
- }
- mce_cpu_quirks(c);
-
- machine_check_vector = do_machine_check;
-
- mce_init();
- mce_cpu_features(c);
- mce_init_timer();
- INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
-}
-
-/*
- * Character device to read and clear the MCE log.
- */
-
-static DEFINE_SPINLOCK(mce_state_lock);
-static int open_count; /* #times opened */
-static int open_exclu; /* already open exclusive? */
-
-static int mce_open(struct inode *inode, struct file *file)
-{
- spin_lock(&mce_state_lock);
-
- if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
- spin_unlock(&mce_state_lock);
-
- return -EBUSY;
- }
-
- if (file->f_flags & O_EXCL)
- open_exclu = 1;
- open_count++;
-
- spin_unlock(&mce_state_lock);
-
- return nonseekable_open(inode, file);
-}
-
-static int mce_release(struct inode *inode, struct file *file)
-{
- spin_lock(&mce_state_lock);
-
- open_count--;
- open_exclu = 0;
-
- spin_unlock(&mce_state_lock);
-
- return 0;
-}
-
-static void collect_tscs(void *data)
-{
- unsigned long *cpu_tsc = (unsigned long *)data;
-
- rdtscll(cpu_tsc[smp_processor_id()]);
-}
-
-static DEFINE_MUTEX(mce_read_mutex);
-
-static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
- loff_t *off)
-{
- char __user *buf = ubuf;
- unsigned long *cpu_tsc;
- unsigned prev, next;
- int i, err;
-
- cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
- if (!cpu_tsc)
- return -ENOMEM;
-
- mutex_lock(&mce_read_mutex);
- next = rcu_dereference(mcelog.next);
-
- /* Only supports full reads right now */
- if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
- mutex_unlock(&mce_read_mutex);
- kfree(cpu_tsc);
-
- return -EINVAL;
- }
-
- err = 0;
- prev = 0;
- do {
- for (i = prev; i < next; i++) {
- unsigned long start = jiffies;
-
- while (!mcelog.entry[i].finished) {
- if (time_after_eq(jiffies, start + 2)) {
- memset(mcelog.entry + i, 0,
- sizeof(struct mce));
- goto timeout;
- }
- cpu_relax();
- }
- smp_rmb();
- err |= copy_to_user(buf, mcelog.entry + i,
- sizeof(struct mce));
- buf += sizeof(struct mce);
-timeout:
- ;
- }
-
- memset(mcelog.entry + prev, 0,
- (next - prev) * sizeof(struct mce));
- prev = next;
- next = cmpxchg(&mcelog.next, prev, 0);
- } while (next != prev);
-
- synchronize_sched();
-
- /*
- * Collect entries that were still getting written before the
- * synchronize.
- */
- on_each_cpu(collect_tscs, cpu_tsc, 1);
-
- for (i = next; i < MCE_LOG_LEN; i++) {
- if (mcelog.entry[i].finished &&
- mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
- err |= copy_to_user(buf, mcelog.entry+i,
- sizeof(struct mce));
- smp_rmb();
- buf += sizeof(struct mce);
- memset(&mcelog.entry[i], 0, sizeof(struct mce));
- }
- }
- mutex_unlock(&mce_read_mutex);
- kfree(cpu_tsc);
-
- return err ? -EFAULT : buf - ubuf;
-}
-
-static unsigned int mce_poll(struct file *file, poll_table *wait)
-{
- poll_wait(file, &mce_wait, wait);
- if (rcu_dereference(mcelog.next))
- return POLLIN | POLLRDNORM;
- return 0;
-}
-
-static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
-{
- int __user *p = (int __user *)arg;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- switch (cmd) {
- case MCE_GET_RECORD_LEN:
- return put_user(sizeof(struct mce), p);
- case MCE_GET_LOG_LEN:
- return put_user(MCE_LOG_LEN, p);
- case MCE_GETCLEAR_FLAGS: {
- unsigned flags;
-
- do {
- flags = mcelog.flags;
- } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
-
- return put_user(flags, p);
- }
- default:
- return -ENOTTY;
- }
-}
-
-/* Modified in mce-inject.c, so not static or const */
-struct file_operations mce_chrdev_ops = {
- .open = mce_open,
- .release = mce_release,
- .read = mce_read,
- .poll = mce_poll,
- .unlocked_ioctl = mce_ioctl,
-};
-EXPORT_SYMBOL_GPL(mce_chrdev_ops);
-
-static struct miscdevice mce_log_device = {
- MISC_MCELOG_MINOR,
- "mcelog",
- &mce_chrdev_ops,
-};
-
-/*
- * mce=off Disables machine check
- * mce=no_cmci Disables CMCI
- * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
- * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
- * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
- * monarchtimeout is how long to wait for other CPUs on machine
- * check, or 0 to not wait
- * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
- * mce=nobootlog Don't log MCEs from before booting.
- */
-static int __init mcheck_enable(char *str)
-{
- if (*str == 0)
- enable_p5_mce();
- if (*str == '=')
- str++;
- if (!strcmp(str, "off"))
- mce_disabled = 1;
- else if (!strcmp(str, "no_cmci"))
- mce_cmci_disabled = 1;
- else if (!strcmp(str, "dont_log_ce"))
- mce_dont_log_ce = 1;
- else if (!strcmp(str, "ignore_ce"))
- mce_ignore_ce = 1;
- else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
- mce_bootlog = (str[0] == 'b');
- else if (isdigit(str[0])) {
- get_option(&str, &tolerant);
- if (*str == ',') {
- ++str;
- get_option(&str, &monarch_timeout);
- }
- } else {
- printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
- str);
- return 0;
- }
- return 1;
-}
-__setup("mce", mcheck_enable);
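
As a usage note, the numeric form packs both values into one parameter: for example "mce=2,500000" would select tolerance level 2 with a 500 ms Monarch timeout, the timeout being in microseconds since the handler scales it by NSEC_PER_USEC. A tiny userspace sketch of how that digit branch splits the string:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	const char *arg = "2,500000";	/* the text after "mce=" */
	char *end;
	long tolerant = strtol(arg, &end, 0);
	long monarch_timeout = -1;	/* -1: not given on the command line */

	if (*end == ',')
		monarch_timeout = strtol(end + 1, NULL, 0);

	printf("tolerant=%ld monarch_timeout=%ld us\n",
	       tolerant, monarch_timeout);
	return 0;
}
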
-
-/*
- * Sysfs support
- */
-
-/*
- * Disable machine checks on suspend and shutdown. We can't really handle
- * them later.
- */
-static int mce_disable(void)
-{
- int i;
-
- for (i = 0; i < banks; i++) {
- if (!skip_bank_init(i))
- wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
- }
- return 0;
-}
-
-static int mce_suspend(struct sys_device *dev, pm_message_t state)
-{
- return mce_disable();
-}
-
-static int mce_shutdown(struct sys_device *dev)
-{
- return mce_disable();
-}
-
-/*
- * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
- * Only one CPU is active at this time, the others get re-added later using
- * CPU hotplug:
- */
-static int mce_resume(struct sys_device *dev)
-{
- mce_init();
- mce_cpu_features(&current_cpu_data);
-
- return 0;
-}
-
-static void mce_cpu_restart(void *data)
-{
- del_timer_sync(&__get_cpu_var(mce_timer));
- if (!mce_available(&current_cpu_data))
- return;
- mce_init();
- mce_init_timer();
-}
-
-/* Reinit MCEs after user configuration changes */
-static void mce_restart(void)
-{
- on_each_cpu(mce_cpu_restart, NULL, 1);
-}
-
-/* Toggle features for corrected errors */
-static void mce_disable_ce(void *all)
-{
- if (!mce_available(&current_cpu_data))
- return;
- if (all)
- del_timer_sync(&__get_cpu_var(mce_timer));
- cmci_clear();
-}
-
-static void mce_enable_ce(void *all)
-{
- if (!mce_available(&current_cpu_data))
- return;
- cmci_reenable();
- cmci_recheck();
- if (all)
- mce_init_timer();
-}
-
-static struct sysdev_class mce_sysclass = {
- .suspend = mce_suspend,
- .shutdown = mce_shutdown,
- .resume = mce_resume,
- .name = "machinecheck",
-};
-
-DEFINE_PER_CPU(struct sys_device, mce_dev);
-
-__cpuinitdata
-void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
-
-static struct sysdev_attribute *bank_attrs;
-
-static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
- char *buf)
-{
- u64 b = bank[attr - bank_attrs];
-
- return sprintf(buf, "%llx\n", b);
-}
-
-static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
- const char *buf, size_t size)
-{
- u64 new;
-
- if (strict_strtoull(buf, 0, &new) < 0)
- return -EINVAL;
-
- bank[attr - bank_attrs] = new;
- mce_restart();
-
- return size;
-}
-
-static ssize_t
-show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
-{
- strcpy(buf, mce_helper);
- strcat(buf, "\n");
- return strlen(mce_helper) + 1;
-}
-
-static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
- const char *buf, size_t siz)
-{
- char *p;
-
- strncpy(mce_helper, buf, sizeof(mce_helper));
- mce_helper[sizeof(mce_helper)-1] = 0;
- p = strchr(mce_helper, '\n');
-
- if (p)
- *p = 0;
-
- return strlen(mce_helper) + !!p;
-}
-
-static ssize_t set_ignore_ce(struct sys_device *s,
- struct sysdev_attribute *attr,
- const char *buf, size_t size)
-{
- u64 new;
-
- if (strict_strtoull(buf, 0, &new) < 0)
- return -EINVAL;
-
- if (mce_ignore_ce ^ !!new) {
- if (new) {
- /* disable ce features */
- on_each_cpu(mce_disable_ce, (void *)1, 1);
- mce_ignore_ce = 1;
- } else {
- /* enable ce features */
- mce_ignore_ce = 0;
- on_each_cpu(mce_enable_ce, (void *)1, 1);
- }
- }
- return size;
-}
-
-static ssize_t set_cmci_disabled(struct sys_device *s,
- struct sysdev_attribute *attr,
- const char *buf, size_t size)
-{
- u64 new;
-
- if (strict_strtoull(buf, 0, &new) < 0)
- return -EINVAL;
-
- if (mce_cmci_disabled ^ !!new) {
- if (new) {
- /* disable cmci */
- on_each_cpu(mce_disable_ce, NULL, 1);
- mce_cmci_disabled = 1;
- } else {
- /* enable cmci */
- mce_cmci_disabled = 0;
- on_each_cpu(mce_enable_ce, NULL, 1);
- }
- }
- return size;
-}
-
-static ssize_t store_int_with_restart(struct sys_device *s,
- struct sysdev_attribute *attr,
- const char *buf, size_t size)
-{
- ssize_t ret = sysdev_store_int(s, attr, buf, size);
- mce_restart();
- return ret;
-}
-
-static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
-static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
-static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
-static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);
-
-static struct sysdev_ext_attribute attr_check_interval = {
- _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
- store_int_with_restart),
- &check_interval
-};
-
-static struct sysdev_ext_attribute attr_ignore_ce = {
- _SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce),
- &mce_ignore_ce
-};
-
-static struct sysdev_ext_attribute attr_cmci_disabled = {
- _SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled),
- &mce_cmci_disabled
-};
-
-static struct sysdev_attribute *mce_attrs[] = {
- &attr_tolerant.attr,
- &attr_check_interval.attr,
- &attr_trigger,
- &attr_monarch_timeout.attr,
- &attr_dont_log_ce.attr,
- &attr_ignore_ce.attr,
- &attr_cmci_disabled.attr,
- NULL
-};
-
-static cpumask_var_t mce_dev_initialized;
-
-/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
-static __cpuinit int mce_create_device(unsigned int cpu)
-{
- int err;
- int i, j;
-
- if (!mce_available(&boot_cpu_data))
- return -EIO;
-
- memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
- per_cpu(mce_dev, cpu).id = cpu;
- per_cpu(mce_dev, cpu).cls = &mce_sysclass;
-
- err = sysdev_register(&per_cpu(mce_dev, cpu));
- if (err)
- return err;
-
- for (i = 0; mce_attrs[i]; i++) {
- err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
- if (err)
- goto error;
- }
- for (j = 0; j < banks; j++) {
- err = sysdev_create_file(&per_cpu(mce_dev, cpu),
- &bank_attrs[j]);
- if (err)
- goto error2;
- }
- cpumask_set_cpu(cpu, mce_dev_initialized);
-
- return 0;
-error2:
- while (--j >= 0)
- sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[j]);
-error:
- while (--i >= 0)
- sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
-
- sysdev_unregister(&per_cpu(mce_dev, cpu));
-
- return err;
-}
-
-static __cpuinit void mce_remove_device(unsigned int cpu)
-{
- int i;
-
- if (!cpumask_test_cpu(cpu, mce_dev_initialized))
- return;
-
- for (i = 0; mce_attrs[i]; i++)
- sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
-
- for (i = 0; i < banks; i++)
- sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
-
- sysdev_unregister(&per_cpu(mce_dev, cpu));
- cpumask_clear_cpu(cpu, mce_dev_initialized);
-}
-
-/* Make sure there are no machine checks on offlined CPUs. */
-static void mce_disable_cpu(void *h)
-{
- unsigned long action = *(unsigned long *)h;
- int i;
-
- if (!mce_available(&current_cpu_data))
- return;
- if (!(action & CPU_TASKS_FROZEN))
- cmci_clear();
- for (i = 0; i < banks; i++) {
- if (!skip_bank_init(i))
- wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
- }
-}
-
-static void mce_reenable_cpu(void *h)
-{
- unsigned long action = *(unsigned long *)h;
- int i;
-
- if (!mce_available(&current_cpu_data))
- return;
-
- if (!(action & CPU_TASKS_FROZEN))
- cmci_reenable();
- for (i = 0; i < banks; i++) {
- if (!skip_bank_init(i))
- wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
- }
-}
-
-/* Get notified when a cpu comes on/off. Be hotplug friendly. */
-static int __cpuinit
-mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
-{
- unsigned int cpu = (unsigned long)hcpu;
- struct timer_list *t = &per_cpu(mce_timer, cpu);
-
- switch (action) {
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- mce_create_device(cpu);
- if (threshold_cpu_callback)
- threshold_cpu_callback(action, cpu);
- break;
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- if (threshold_cpu_callback)
- threshold_cpu_callback(action, cpu);
- mce_remove_device(cpu);
- break;
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- del_timer_sync(t);
- smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
- break;
- case CPU_DOWN_FAILED:
- case CPU_DOWN_FAILED_FROZEN:
- t->expires = round_jiffies(jiffies +
- __get_cpu_var(next_interval));
- add_timer_on(t, cpu);
- smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
- break;
- case CPU_POST_DEAD:
- /* intentionally ignoring frozen here */
- cmci_rediscover(cpu);
- break;
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block mce_cpu_notifier __cpuinitdata = {
- .notifier_call = mce_cpu_callback,
-};
-
-static __init int mce_init_banks(void)
-{
- int i;
-
- bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
- GFP_KERNEL);
- if (!bank_attrs)
- return -ENOMEM;
-
- for (i = 0; i < banks; i++) {
- struct sysdev_attribute *a = &bank_attrs[i];
-
- a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
- if (!a->attr.name)
- goto nomem;
-
- a->attr.mode = 0644;
- a->show = show_bank;
- a->store = set_bank;
- }
- return 0;
-
-nomem:
- while (--i >= 0)
- kfree(bank_attrs[i].attr.name);
- kfree(bank_attrs);
- bank_attrs = NULL;
-
- return -ENOMEM;
-}
-
-static __init int mce_init_device(void)
-{
- int err;
- int i = 0;
-
- if (!mce_available(&boot_cpu_data))
- return -EIO;
-
- zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
-
- err = mce_init_banks();
- if (err)
- return err;
-
- err = sysdev_class_register(&mce_sysclass);
- if (err)
- return err;
-
- for_each_online_cpu(i) {
- err = mce_create_device(i);
- if (err)
- return err;
- }
-
- register_hotcpu_notifier(&mce_cpu_notifier);
- misc_register(&mce_log_device);
-
- return err;
-}
-
-device_initcall(mce_init_device);
-
-#else /* CONFIG_X86_OLD_MCE: */
-
-int nr_mce_banks;
-EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */
-
-/* This has to be run for each processor */
-void mcheck_init(struct cpuinfo_x86 *c)
-{
- if (mce_disabled)
- return;
-
- switch (c->x86_vendor) {
- case X86_VENDOR_AMD:
- amd_mcheck_init(c);
- break;
-
- case X86_VENDOR_INTEL:
- if (c->x86 == 5)
- intel_p5_mcheck_init(c);
- if (c->x86 == 6)
- intel_p6_mcheck_init(c);
- if (c->x86 == 15)
- intel_p4_mcheck_init(c);
- break;
-
- case X86_VENDOR_CENTAUR:
- if (c->x86 == 5)
- winchip_mcheck_init(c);
- break;
-
- default:
- break;
- }
- printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks);
-}
-
-static int __init mcheck_enable(char *str)
-{
- mce_p5_enabled = 1;
- return 1;
-}
-__setup("mce", mcheck_enable);
-
-#endif /* CONFIG_X86_OLD_MCE */
-
-/*
- * Old style boot options parsing. Only for compatibility.
- */
-static int __init mcheck_disable(char *str)
-{
- mce_disabled = 1;
- return 1;
-}
-__setup("nomce", mcheck_disable);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
deleted file mode 100644
index ddae21620bda..000000000000
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ /dev/null
@@ -1,703 +0,0 @@
-/*
- * (c) 2005, 2006 Advanced Micro Devices, Inc.
- * Your use of this code is subject to the terms and conditions of the
- * GNU general public license version 2. See "COPYING" or
- * http://www.gnu.org/licenses/gpl.html
- *
- * Written by Jacob Shin - AMD, Inc.
- *
- * Support : jacob.shin@amd.com
- *
- * April 2006
- * - added support for AMD Family 0x10 processors
- *
- * All MC4_MISCi registers are shared between multi-cores
- */
-#include <linux/interrupt.h>
-#include <linux/notifier.h>
-#include <linux/kobject.h>
-#include <linux/percpu.h>
-#include <linux/sysdev.h>
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/sysfs.h>
-#include <linux/init.h>
-#include <linux/cpu.h>
-#include <linux/smp.h>
-
-#include <asm/apic.h>
-#include <asm/idle.h>
-#include <asm/mce.h>
-#include <asm/msr.h>
-
-#define PFX "mce_threshold: "
-#define VERSION "version 1.1.1"
-#define NR_BANKS 6
-#define NR_BLOCKS 9
-#define THRESHOLD_MAX 0xFFF
-#define INT_TYPE_APIC 0x00020000
-#define MASK_VALID_HI 0x80000000
-#define MASK_CNTP_HI 0x40000000
-#define MASK_LOCKED_HI 0x20000000
-#define MASK_LVTOFF_HI 0x00F00000
-#define MASK_COUNT_EN_HI 0x00080000
-#define MASK_INT_TYPE_HI 0x00060000
-#define MASK_OVERFLOW_HI 0x00010000
-#define MASK_ERR_COUNT_HI 0x00000FFF
-#define MASK_BLKPTR_LO 0xFF000000
-#define MCG_XBLK_ADDR 0xC0000400
-
-struct threshold_block {
- unsigned int block;
- unsigned int bank;
- unsigned int cpu;
- u32 address;
- u16 interrupt_enable;
- u16 threshold_limit;
- struct kobject kobj;
- struct list_head miscj;
-};
-
-/* defaults used early during boot */
-static struct threshold_block threshold_defaults = {
- .interrupt_enable = 0,
- .threshold_limit = THRESHOLD_MAX,
-};
-
-struct threshold_bank {
- struct kobject *kobj;
- struct threshold_block *blocks;
- cpumask_var_t cpus;
-};
-static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]);
-
-#ifdef CONFIG_SMP
-static unsigned char shared_bank[NR_BANKS] = {
- 0, 0, 0, 0, 1
-};
-#endif
-
-static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
-
-static void amd_threshold_interrupt(void);
-
-/*
- * CPU Initialization
- */
-
-struct thresh_restart {
- struct threshold_block *b;
- int reset;
- u16 old_limit;
-};
-
-/* must be called with correct cpu affinity */
-/* Called via smp_call_function_single() */
-static void threshold_restart_bank(void *_tr)
-{
- struct thresh_restart *tr = _tr;
- u32 mci_misc_hi, mci_misc_lo;
-
- rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
-
- if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX))
- tr->reset = 1; /* limit cannot be lower than err count */
-
- if (tr->reset) { /* reset err count and overflow bit */
- mci_misc_hi =
- (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
- (THRESHOLD_MAX - tr->b->threshold_limit);
- } else if (tr->old_limit) { /* change limit w/o reset */
- int new_count = (mci_misc_hi & THRESHOLD_MAX) +
- (tr->old_limit - tr->b->threshold_limit);
-
- mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) |
- (new_count & THRESHOLD_MAX);
- }
-
- tr->b->interrupt_enable ?
- (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
- (mci_misc_hi &= ~MASK_INT_TYPE_HI);
-
- mci_misc_hi |= MASK_COUNT_EN_HI;
- wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
-}
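
The register manipulation above is easier to follow with the encoding spelled out: the hardware counter counts up and the interrupt fires once error_count reaches threshold_limit, so the counter field is seeded with THRESHOLD_MAX - threshold_limit and the visible error count is the raw counter minus that seed. A purely arithmetic sketch, with no MSR access involved:

#include <stdio.h>

#define THRESHOLD_MAX 0xFFF

int main(void)
{
	unsigned limit = 10;				/* threshold_limit */
	unsigned counter = THRESHOLD_MAX - limit;	/* seeded counter */
	unsigned seen;

	for (seen = 0; seen < 3; seen++)	/* three corrected errors land */
		counter++;

	printf("errors seen: %u\n", counter - (THRESHOLD_MAX - limit));
	printf("errors left before the interrupt: %u\n",
	       limit - (counter - (THRESHOLD_MAX - limit)));
	return 0;
}
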
-
-/* cpu init entry point, called from mce.c with preempt off */
-void mce_amd_feature_init(struct cpuinfo_x86 *c)
-{
- unsigned int cpu = smp_processor_id();
- u32 low = 0, high = 0, address = 0;
- unsigned int bank, block;
- struct thresh_restart tr;
- u8 lvt_off;
-
- for (bank = 0; bank < NR_BANKS; ++bank) {
- for (block = 0; block < NR_BLOCKS; ++block) {
- if (block == 0)
- address = MSR_IA32_MC0_MISC + bank * 4;
- else if (block == 1) {
- address = (low & MASK_BLKPTR_LO) >> 21;
- if (!address)
- break;
- address += MCG_XBLK_ADDR;
- } else
- ++address;
-
- if (rdmsr_safe(address, &low, &high))
- break;
-
- if (!(high & MASK_VALID_HI)) {
- if (block)
- continue;
- else
- break;
- }
-
- if (!(high & MASK_CNTP_HI) ||
- (high & MASK_LOCKED_HI))
- continue;
-
- if (!block)
- per_cpu(bank_map, cpu) |= (1 << bank);
-#ifdef CONFIG_SMP
- if (shared_bank[bank] && c->cpu_core_id)
- break;
-#endif
- lvt_off = setup_APIC_eilvt_mce(THRESHOLD_APIC_VECTOR,
- APIC_EILVT_MSG_FIX, 0);
-
- high &= ~MASK_LVTOFF_HI;
- high |= lvt_off << 20;
- wrmsr(address, low, high);
-
- threshold_defaults.address = address;
- tr.b = &threshold_defaults;
- tr.reset = 0;
- tr.old_limit = 0;
- threshold_restart_bank(&tr);
-
- mce_threshold_vector = amd_threshold_interrupt;
- }
- }
-}
-
-/*
- * APIC Interrupt Handler
- */
-
-/*
- * The threshold interrupt handler services THRESHOLD_APIC_VECTOR.
- * The interrupt goes off when error_count reaches threshold_limit.
- * The handler simply logs an mcelog entry with a software-defined bank number.
- */
-static void amd_threshold_interrupt(void)
-{
- u32 low = 0, high = 0, address = 0;
- unsigned int bank, block;
- struct mce m;
-
- mce_setup(&m);
-
- /* assume first bank caused it */
- for (bank = 0; bank < NR_BANKS; ++bank) {
- if (!(per_cpu(bank_map, m.cpu) & (1 << bank)))
- continue;
- for (block = 0; block < NR_BLOCKS; ++block) {
- if (block == 0) {
- address = MSR_IA32_MC0_MISC + bank * 4;
- } else if (block == 1) {
- address = (low & MASK_BLKPTR_LO) >> 21;
- if (!address)
- break;
- address += MCG_XBLK_ADDR;
- } else {
- ++address;
- }
-
- if (rdmsr_safe(address, &low, &high))
- break;
-
- if (!(high & MASK_VALID_HI)) {
- if (block)
- continue;
- else
- break;
- }
-
- if (!(high & MASK_CNTP_HI) ||
- (high & MASK_LOCKED_HI))
- continue;
-
- /*
- * Log the machine check that caused the threshold
- * event.
- */
- machine_check_poll(MCP_TIMESTAMP,
- &__get_cpu_var(mce_poll_banks));
-
- if (high & MASK_OVERFLOW_HI) {
- rdmsrl(address, m.misc);
- rdmsrl(MSR_IA32_MC0_STATUS + bank * 4,
- m.status);
- m.bank = K8_MCE_THRESHOLD_BASE
- + bank * NR_BLOCKS
- + block;
- mce_log(&m);
- return;
- }
- }
- }
-}
-
-/*
- * Sysfs Interface
- */
-
-struct threshold_attr {
- struct attribute attr;
- ssize_t (*show) (struct threshold_block *, char *);
- ssize_t (*store) (struct threshold_block *, const char *, size_t count);
-};
-
-#define SHOW_FIELDS(name) \
-static ssize_t show_ ## name(struct threshold_block *b, char *buf) \
-{ \
- return sprintf(buf, "%lx\n", (unsigned long) b->name); \
-}
-SHOW_FIELDS(interrupt_enable)
-SHOW_FIELDS(threshold_limit)
-
-static ssize_t
-store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
-{
- struct thresh_restart tr;
- unsigned long new;
-
- if (strict_strtoul(buf, 0, &new) < 0)
- return -EINVAL;
-
- b->interrupt_enable = !!new;
-
- tr.b = b;
- tr.reset = 0;
- tr.old_limit = 0;
-
- smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
-
- return size;
-}
-
-static ssize_t
-store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
-{
- struct thresh_restart tr;
- unsigned long new;
-
- if (strict_strtoul(buf, 0, &new) < 0)
- return -EINVAL;
-
- if (new > THRESHOLD_MAX)
- new = THRESHOLD_MAX;
- if (new < 1)
- new = 1;
-
- tr.old_limit = b->threshold_limit;
- b->threshold_limit = new;
- tr.b = b;
- tr.reset = 0;
-
- smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
-
- return size;
-}
-
-struct threshold_block_cross_cpu {
- struct threshold_block *tb;
- long retval;
-};
-
-static void local_error_count_handler(void *_tbcc)
-{
- struct threshold_block_cross_cpu *tbcc = _tbcc;
- struct threshold_block *b = tbcc->tb;
- u32 low, high;
-
- rdmsr(b->address, low, high);
- tbcc->retval = (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit);
-}
-
-static ssize_t show_error_count(struct threshold_block *b, char *buf)
-{
- struct threshold_block_cross_cpu tbcc = { .tb = b, };
-
- smp_call_function_single(b->cpu, local_error_count_handler, &tbcc, 1);
- return sprintf(buf, "%lx\n", tbcc.retval);
-}
-
-static ssize_t store_error_count(struct threshold_block *b,
- const char *buf, size_t count)
-{
- struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 };
-
- smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
- return 1;
-}
-
-#define RW_ATTR(val) \
-static struct threshold_attr val = { \
- .attr = {.name = __stringify(val), .mode = 0644 }, \
- .show = show_## val, \
- .store = store_## val, \
-};
-
-RW_ATTR(interrupt_enable);
-RW_ATTR(threshold_limit);
-RW_ATTR(error_count);
-
-static struct attribute *default_attrs[] = {
- &interrupt_enable.attr,
- &threshold_limit.attr,
- &error_count.attr,
- NULL
-};
-
-#define to_block(k) container_of(k, struct threshold_block, kobj)
-#define to_attr(a) container_of(a, struct threshold_attr, attr)
-
-static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
-{
- struct threshold_block *b = to_block(kobj);
- struct threshold_attr *a = to_attr(attr);
- ssize_t ret;
-
- ret = a->show ? a->show(b, buf) : -EIO;
-
- return ret;
-}
-
-static ssize_t store(struct kobject *kobj, struct attribute *attr,
- const char *buf, size_t count)
-{
- struct threshold_block *b = to_block(kobj);
- struct threshold_attr *a = to_attr(attr);
- ssize_t ret;
-
- ret = a->store ? a->store(b, buf, count) : -EIO;
-
- return ret;
-}
-
-static struct sysfs_ops threshold_ops = {
- .show = show,
- .store = store,
-};
-
-static struct kobj_type threshold_ktype = {
- .sysfs_ops = &threshold_ops,
- .default_attrs = default_attrs,
-};
-
-static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
- unsigned int bank,
- unsigned int block,
- u32 address)
-{
- struct threshold_block *b = NULL;
- u32 low, high;
- int err;
-
- if ((bank >= NR_BANKS) || (block >= NR_BLOCKS))
- return 0;
-
- if (rdmsr_safe_on_cpu(cpu, address, &low, &high))
- return 0;
-
- if (!(high & MASK_VALID_HI)) {
- if (block)
- goto recurse;
- else
- return 0;
- }
-
- if (!(high & MASK_CNTP_HI) ||
- (high & MASK_LOCKED_HI))
- goto recurse;
-
- b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL);
- if (!b)
- return -ENOMEM;
-
- b->block = block;
- b->bank = bank;
- b->cpu = cpu;
- b->address = address;
- b->interrupt_enable = 0;
- b->threshold_limit = THRESHOLD_MAX;
-
- INIT_LIST_HEAD(&b->miscj);
-
- if (per_cpu(threshold_banks, cpu)[bank]->blocks) {
- list_add(&b->miscj,
- &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj);
- } else {
- per_cpu(threshold_banks, cpu)[bank]->blocks = b;
- }
-
- err = kobject_init_and_add(&b->kobj, &threshold_ktype,
- per_cpu(threshold_banks, cpu)[bank]->kobj,
- "misc%i", block);
- if (err)
- goto out_free;
-recurse:
- if (!block) {
- address = (low & MASK_BLKPTR_LO) >> 21;
- if (!address)
- return 0;
- address += MCG_XBLK_ADDR;
- } else {
- ++address;
- }
-
- err = allocate_threshold_blocks(cpu, bank, ++block, address);
- if (err)
- goto out_free;
-
- if (b)
- kobject_uevent(&b->kobj, KOBJ_ADD);
-
- return err;
-
-out_free:
- if (b) {
- kobject_put(&b->kobj);
- kfree(b);
- }
- return err;
-}
-
-static __cpuinit long
-local_allocate_threshold_blocks(int cpu, unsigned int bank)
-{
- return allocate_threshold_blocks(cpu, bank, 0,
- MSR_IA32_MC0_MISC + bank * 4);
-}
-
-/* symlinks sibling shared banks to first core. first core owns dir/files. */
-static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
-{
- int i, err = 0;
- struct threshold_bank *b = NULL;
- char name[32];
-
- sprintf(name, "threshold_bank%i", bank);
-
-#ifdef CONFIG_SMP
- if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */
- i = cpumask_first(cpu_core_mask(cpu));
-
- /* first core not up yet */
- if (cpu_data(i).cpu_core_id)
- goto out;
-
- /* already linked */
- if (per_cpu(threshold_banks, cpu)[bank])
- goto out;
-
- b = per_cpu(threshold_banks, i)[bank];
-
- if (!b)
- goto out;
-
- err = sysfs_create_link(&per_cpu(mce_dev, cpu).kobj,
- b->kobj, name);
- if (err)
- goto out;
-
- cpumask_copy(b->cpus, cpu_core_mask(cpu));
- per_cpu(threshold_banks, cpu)[bank] = b;
-
- goto out;
- }
-#endif
-
- b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
- if (!b) {
- err = -ENOMEM;
- goto out;
- }
- if (!alloc_cpumask_var(&b->cpus, GFP_KERNEL)) {
- kfree(b);
- err = -ENOMEM;
- goto out;
- }
-
- b->kobj = kobject_create_and_add(name, &per_cpu(mce_dev, cpu).kobj);
- if (!b->kobj)
- goto out_free;
-
-#ifndef CONFIG_SMP
- cpumask_setall(b->cpus);
-#else
- cpumask_copy(b->cpus, cpu_core_mask(cpu));
-#endif
-
- per_cpu(threshold_banks, cpu)[bank] = b;
-
- err = local_allocate_threshold_blocks(cpu, bank);
- if (err)
- goto out_free;
-
- for_each_cpu(i, b->cpus) {
- if (i == cpu)
- continue;
-
- err = sysfs_create_link(&per_cpu(mce_dev, i).kobj,
- b->kobj, name);
- if (err)
- goto out;
-
- per_cpu(threshold_banks, i)[bank] = b;
- }
-
- goto out;
-
-out_free:
- per_cpu(threshold_banks, cpu)[bank] = NULL;
- free_cpumask_var(b->cpus);
- kfree(b);
-out:
- return err;
-}
-
-/* create dir/files for all valid threshold banks */
-static __cpuinit int threshold_create_device(unsigned int cpu)
-{
- unsigned int bank;
- int err = 0;
-
- for (bank = 0; bank < NR_BANKS; ++bank) {
- if (!(per_cpu(bank_map, cpu) & (1 << bank)))
- continue;
- err = threshold_create_bank(cpu, bank);
- if (err)
- goto out;
- }
-out:
- return err;
-}
-
-/*
- * Let's be hotplug friendly.
- * In case of multi-core processors, the first core always takes ownership
- * of shared sysfs dir/files, and the rest of the cores will be symlinked to it.
- */
-
-static void deallocate_threshold_block(unsigned int cpu,
- unsigned int bank)
-{
- struct threshold_block *pos = NULL;
- struct threshold_block *tmp = NULL;
- struct threshold_bank *head = per_cpu(threshold_banks, cpu)[bank];
-
- if (!head)
- return;
-
- list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) {
- kobject_put(&pos->kobj);
- list_del(&pos->miscj);
- kfree(pos);
- }
-
- kfree(per_cpu(threshold_banks, cpu)[bank]->blocks);
- per_cpu(threshold_banks, cpu)[bank]->blocks = NULL;
-}
-
-static void threshold_remove_bank(unsigned int cpu, int bank)
-{
- struct threshold_bank *b;
- char name[32];
- int i = 0;
-
- b = per_cpu(threshold_banks, cpu)[bank];
- if (!b)
- return;
- if (!b->blocks)
- goto free_out;
-
- sprintf(name, "threshold_bank%i", bank);
-
-#ifdef CONFIG_SMP
- /* sibling symlink */
- if (shared_bank[bank] && b->blocks->cpu != cpu) {
- sysfs_remove_link(&per_cpu(mce_dev, cpu).kobj, name);
- per_cpu(threshold_banks, cpu)[bank] = NULL;
-
- return;
- }
-#endif
-
- /* remove all sibling symlinks before unregistering */
- for_each_cpu(i, b->cpus) {
- if (i == cpu)
- continue;
-
- sysfs_remove_link(&per_cpu(mce_dev, i).kobj, name);
- per_cpu(threshold_banks, i)[bank] = NULL;
- }
-
- deallocate_threshold_block(cpu, bank);
-
-free_out:
- kobject_del(b->kobj);
- kobject_put(b->kobj);
- free_cpumask_var(b->cpus);
- kfree(b);
- per_cpu(threshold_banks, cpu)[bank] = NULL;
-}
-
-static void threshold_remove_device(unsigned int cpu)
-{
- unsigned int bank;
-
- for (bank = 0; bank < NR_BANKS; ++bank) {
- if (!(per_cpu(bank_map, cpu) & (1 << bank)))
- continue;
- threshold_remove_bank(cpu, bank);
- }
-}
-
-/* get notified when a cpu comes on/off */
-static void __cpuinit
-amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu)
-{
- switch (action) {
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- threshold_create_device(cpu);
- break;
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- threshold_remove_device(cpu);
- break;
- default:
- break;
- }
-}
-
-static __init int threshold_init_device(void)
-{
- unsigned lcpu = 0;
-
-	/* handle CPUs that are already online before the notifier is up */
- for_each_online_cpu(lcpu) {
- int err = threshold_create_device(lcpu);
-
- if (err)
- return err;
- }
- threshold_cpu_callback = amd_64_threshold_cpu_callback;
-
- return 0;
-}
-device_initcall(threshold_init_device);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
deleted file mode 100644
index e1acec0f7a32..000000000000
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ /dev/null
@@ -1,226 +0,0 @@
-/*
- * Intel specific MCE features.
- * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
- * Copyright (C) 2008, 2009 Intel Corporation
- * Author: Andi Kleen
- */
-
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/percpu.h>
-#include <asm/apic.h>
-#include <asm/processor.h>
-#include <asm/msr.h>
-#include <asm/mce.h>
-
-/*
- * Support for Intel Corrected Machine Check Interrupts. This allows
- * the CPU to raise an interrupt when a corrected machine check happened.
- * Normally we pick those up using a regular polling timer.
- * Also supports reliable discovery of shared banks.
- */
-
-static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
-
-/*
- * cmci_discover_lock protects against parallel discovery attempts
- * which could race against each other.
- */
-static DEFINE_SPINLOCK(cmci_discover_lock);
-
-#define CMCI_THRESHOLD 1
-
-static int cmci_supported(int *banks)
-{
- u64 cap;
-
- if (mce_cmci_disabled || mce_ignore_ce)
- return 0;
-
- /*
-	 * The vendor check is not strictly needed, but the
-	 * initialization is vendor keyed and this
-	 * makes sure none of the backdoors are entered otherwise.
- */
- if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
- return 0;
- if (!cpu_has_apic || lapic_get_maxlvt() < 6)
- return 0;
- rdmsrl(MSR_IA32_MCG_CAP, cap);
- *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff);
- return !!(cap & MCG_CMCI_P);
-}
-
-/*
- * The interrupt handler. This is called on every event.
- * Just call the poller directly to log any events.
- * This could in theory increase the threshold under high load,
- * but doesn't for now.
- */
-static void intel_threshold_interrupt(void)
-{
- machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
- mce_notify_irq();
-}
-
-static void print_update(char *type, int *hdr, int num)
-{
- if (*hdr == 0)
- printk(KERN_INFO "CPU %d MCA banks", smp_processor_id());
- *hdr = 1;
- printk(KERN_CONT " %s:%d", type, num);
-}
-
-/*
- * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
- * on this CPU. Use the algorithm recommended in the SDM to discover shared
- * banks.
- */
-static void cmci_discover(int banks, int boot)
-{
- unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
- unsigned long flags;
- int hdr = 0;
- int i;
-
- spin_lock_irqsave(&cmci_discover_lock, flags);
- for (i = 0; i < banks; i++) {
- u64 val;
-
- if (test_bit(i, owned))
- continue;
-
- rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
-
- /* Already owned by someone else? */
- if (val & CMCI_EN) {
- if (test_and_clear_bit(i, owned) || boot)
- print_update("SHD", &hdr, i);
- __clear_bit(i, __get_cpu_var(mce_poll_banks));
- continue;
- }
-
- val |= CMCI_EN | CMCI_THRESHOLD;
- wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
- rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
-
- /* Did the enable bit stick? -- the bank supports CMCI */
- if (val & CMCI_EN) {
- if (!test_and_set_bit(i, owned) || boot)
- print_update("CMCI", &hdr, i);
- __clear_bit(i, __get_cpu_var(mce_poll_banks));
- } else {
- WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
- }
- }
- spin_unlock_irqrestore(&cmci_discover_lock, flags);
- if (hdr)
- printk(KERN_CONT "\n");
-}
-
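
The discovery loop above follows the SDM's claim-by-writing scheme: a CPU only owns a bank if it can set CMCI_EN itself and the bit sticks on read-back; a bit that was already set means a sibling claimed the bank first and it is shared. A minimal sketch of that per-bank decision, reusing the MSR helpers and constants from this file:

/* Illustrative only: classify one bank during CMCI discovery. */
enum bank_claim { BANK_SHARED, BANK_OWNED, BANK_NO_CMCI };

static enum bank_claim probe_bank(int bank)
{
	u64 val;

	rdmsrl(MSR_IA32_MC0_CTL2 + bank, val);
	if (val & CMCI_EN)			/* a sibling enabled it first */
		return BANK_SHARED;

	wrmsrl(MSR_IA32_MC0_CTL2 + bank, val | CMCI_EN | CMCI_THRESHOLD);
	rdmsrl(MSR_IA32_MC0_CTL2 + bank, val);

	/* The enable bit sticking means the bank actually supports CMCI. */
	return (val & CMCI_EN) ? BANK_OWNED : BANK_NO_CMCI;
}
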
-/*
- * Just in case we missed an event during initialization check
- * all the CMCI owned banks.
- */
-void cmci_recheck(void)
-{
- unsigned long flags;
- int banks;
-
- if (!mce_available(&current_cpu_data) || !cmci_supported(&banks))
- return;
- local_irq_save(flags);
- machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
- local_irq_restore(flags);
-}
-
-/*
- * Disable CMCI on this CPU for all banks it owns when it goes down.
- * This allows other CPUs to claim the banks on rediscovery.
- */
-void cmci_clear(void)
-{
- unsigned long flags;
- int i;
- int banks;
- u64 val;
-
- if (!cmci_supported(&banks))
- return;
- spin_lock_irqsave(&cmci_discover_lock, flags);
- for (i = 0; i < banks; i++) {
- if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
- continue;
- /* Disable CMCI */
- rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
- val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK);
- wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
- __clear_bit(i, __get_cpu_var(mce_banks_owned));
- }
- spin_unlock_irqrestore(&cmci_discover_lock, flags);
-}
-
-/*
- * After a CPU went down, cycle through all the others and rediscover.
- * Must run in process context.
- */
-void cmci_rediscover(int dying)
-{
- int banks;
- int cpu;
- cpumask_var_t old;
-
- if (!cmci_supported(&banks))
- return;
- if (!alloc_cpumask_var(&old, GFP_KERNEL))
- return;
- cpumask_copy(old, &current->cpus_allowed);
-
- for_each_online_cpu(cpu) {
- if (cpu == dying)
- continue;
- if (set_cpus_allowed_ptr(current, cpumask_of(cpu)))
- continue;
- /* Recheck banks in case CPUs don't all have the same */
- if (cmci_supported(&banks))
- cmci_discover(banks, 0);
- }
-
- set_cpus_allowed_ptr(current, old);
- free_cpumask_var(old);
-}
-
-/*
- * Reenable CMCI on this CPU in case a CPU down failed.
- */
-void cmci_reenable(void)
-{
- int banks;
- if (cmci_supported(&banks))
- cmci_discover(banks, 0);
-}
-
-static void intel_init_cmci(void)
-{
- int banks;
-
- if (!cmci_supported(&banks))
- return;
-
- mce_threshold_vector = intel_threshold_interrupt;
- cmci_discover(banks, 1);
- /*
- * For CPU #0 this runs with still disabled APIC, but that's
- * ok because only the vector is set up. We still do another
- * check for the banks later for CPU #0 just to make sure
- * to not miss any events.
- */
- apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED);
- cmci_recheck();
-}
-
-void mce_intel_feature_init(struct cpuinfo_x86 *c)
-{
- intel_init_thermal(c);
- intel_init_cmci();
-}
diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c
deleted file mode 100644
index f5f2d6f71fb6..000000000000
--- a/arch/x86/kernel/cpu/mcheck/non-fatal.c
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Non Fatal Machine Check Exception Reporting
- *
- * (C) Copyright 2002 Dave Jones. <davej@redhat.com>
- *
- * This file contains routines to check for non-fatal MCEs every 15s
- *
- */
-#include <linux/interrupt.h>
-#include <linux/workqueue.h>
-#include <linux/jiffies.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-
-#include <asm/processor.h>
-#include <asm/system.h>
-#include <asm/mce.h>
-#include <asm/msr.h>
-
-static int firstbank;
-
-#define MCE_RATE (15*HZ) /* timer rate is 15s */
-
-static void mce_checkregs(void *info)
-{
- u32 low, high;
- int i;
-
- for (i = firstbank; i < nr_mce_banks; i++) {
- rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
-
- if (!(high & (1<<31)))
- continue;
-
- printk(KERN_INFO "MCE: The hardware reports a non fatal, "
- "correctable incident occurred on CPU %d.\n",
- smp_processor_id());
-
- printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low);
-
- /*
- * Scrub the error so we don't pick it up in MCE_RATE
- * seconds time:
- */
- wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
-
- /* Serialize: */
- wmb();
- add_taint(TAINT_MACHINE_CHECK);
- }
-}
-
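
The MSR_IA32_MC0_STATUS + i*4 arithmetic above works because every machine-check bank occupies four consecutive MSRs (CTL, STATUS, ADDR, MISC), so bank i's registers sit at a fixed stride of four from bank 0. Spelled out as a tiny helper (a sketch, not something this file defines):

/* Illustrative only: per-bank MSR addressing, four MSRs per bank. */
static inline u32 mc_bank_msr(u32 bank0_msr, int bank)
{
	return bank0_msr + 4 * bank;	/* e.g. mc_bank_msr(MSR_IA32_MC0_STATUS, i) */
}
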
-static void mce_work_fn(struct work_struct *work);
-static DECLARE_DELAYED_WORK(mce_work, mce_work_fn);
-
-static void mce_work_fn(struct work_struct *work)
-{
- on_each_cpu(mce_checkregs, NULL, 1);
- schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE));
-}
-
-static int __init init_nonfatal_mce_checker(void)
-{
- struct cpuinfo_x86 *c = &boot_cpu_data;
-
- /* Check for MCE support */
- if (!cpu_has(c, X86_FEATURE_MCE))
- return -ENODEV;
-
- /* Check for PPro style MCA */
- if (!cpu_has(c, X86_FEATURE_MCA))
- return -ENODEV;
-
- /* Some Athlons misbehave when we frob bank 0 */
- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
- boot_cpu_data.x86 == 6)
- firstbank = 1;
- else
- firstbank = 0;
-
- /*
- * Check for non-fatal errors every MCE_RATE s
- */
- schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE));
- printk(KERN_INFO "Machine check exception polling timer started.\n");
-
- return 0;
-}
-module_init(init_nonfatal_mce_checker);
-
-MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
deleted file mode 100644
index 4482aea9aa2e..000000000000
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * P4 specific Machine Check Exception Reporting
- */
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-
-#include <asm/processor.h>
-#include <asm/mce.h>
-#include <asm/msr.h>
-
-/* as supported by the P4/Xeon family */
-struct intel_mce_extended_msrs {
- u32 eax;
- u32 ebx;
- u32 ecx;
- u32 edx;
- u32 esi;
- u32 edi;
- u32 ebp;
- u32 esp;
- u32 eflags;
- u32 eip;
- /* u32 *reserved[]; */
-};
-
-static int mce_num_extended_msrs;
-
-/* P4/Xeon Extended MCE MSR retrieval */
-static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
-{
- u32 h;
-
- rdmsr(MSR_IA32_MCG_EAX, r->eax, h);
- rdmsr(MSR_IA32_MCG_EBX, r->ebx, h);
- rdmsr(MSR_IA32_MCG_ECX, r->ecx, h);
- rdmsr(MSR_IA32_MCG_EDX, r->edx, h);
- rdmsr(MSR_IA32_MCG_ESI, r->esi, h);
- rdmsr(MSR_IA32_MCG_EDI, r->edi, h);
- rdmsr(MSR_IA32_MCG_EBP, r->ebp, h);
- rdmsr(MSR_IA32_MCG_ESP, r->esp, h);
- rdmsr(MSR_IA32_MCG_EFLAGS, r->eflags, h);
- rdmsr(MSR_IA32_MCG_EIP, r->eip, h);
-}
-
-static void intel_machine_check(struct pt_regs *regs, long error_code)
-{
- u32 alow, ahigh, high, low;
- u32 mcgstl, mcgsth;
- int recover = 1;
- int i;
-
- rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
- if (mcgstl & (1<<0)) /* Recoverable ? */
- recover = 0;
-
- printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
- smp_processor_id(), mcgsth, mcgstl);
-
- if (mce_num_extended_msrs > 0) {
- struct intel_mce_extended_msrs dbg;
-
- intel_get_extended_msrs(&dbg);
-
- printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n"
- "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n"
- "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n",
- smp_processor_id(), dbg.eip, dbg.eflags,
- dbg.eax, dbg.ebx, dbg.ecx, dbg.edx,
- dbg.esi, dbg.edi, dbg.ebp, dbg.esp);
- }
-
- for (i = 0; i < nr_mce_banks; i++) {
- rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
- if (high & (1<<31)) {
- char misc[20];
- char addr[24];
-
- misc[0] = addr[0] = '\0';
- if (high & (1<<29))
- recover |= 1;
- if (high & (1<<25))
- recover |= 2;
- high &= ~(1<<31);
- if (high & (1<<27)) {
- rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
- snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
- }
- if (high & (1<<26)) {
- rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
- snprintf(addr, 24, " at %08x%08x", ahigh, alow);
- }
- printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
- smp_processor_id(), i, high, low, misc, addr);
- }
- }
-
- if (recover & 2)
- panic("CPU context corrupt");
- if (recover & 1)
- panic("Unable to continue");
-
- printk(KERN_EMERG "Attempting to continue.\n");
-
- /*
- * Do not clear the MSR_IA32_MCi_STATUS if the error is not
- * recoverable/continuable. This will allow the BIOS to look at the MSRs
- * for errors if the OS could not log the error.
- */
- for (i = 0; i < nr_mce_banks; i++) {
- u32 msr;
- msr = MSR_IA32_MC0_STATUS+i*4;
- rdmsr(msr, low, high);
- if (high&(1<<31)) {
- /* Clear it */
- wrmsr(msr, 0UL, 0UL);
- /* Serialize */
- wmb();
- add_taint(TAINT_MACHINE_CHECK);
- }
- }
- mcgstl &= ~(1<<2);
- wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
-}
-
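
The bit tests on the high half of MCi_STATUS above map to the architectural status bits: VAL (bit 63), UC (61), MISCV (59), ADDRV (58) and PCC (57), which land at bits 31, 29, 27, 26 and 25 of the upper 32-bit word read into 'high'. Named masks make the decoding easier to follow (a sketch; these are not constants this file defines):

/* Illustrative only: high-word views of the MCi_STATUS bits tested above. */
#define MCI_STATUS_HI_VAL	(1U << 31)	/* bit 63: entry valid */
#define MCI_STATUS_HI_UC	(1U << 29)	/* bit 61: uncorrected error */
#define MCI_STATUS_HI_MISCV	(1U << 27)	/* bit 59: MCi_MISC register valid */
#define MCI_STATUS_HI_ADDRV	(1U << 26)	/* bit 58: MCi_ADDR register valid */
#define MCI_STATUS_HI_PCC	(1U << 25)	/* bit 57: processor context corrupt */
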
-void intel_p4_mcheck_init(struct cpuinfo_x86 *c)
-{
- u32 l, h;
- int i;
-
- machine_check_vector = intel_machine_check;
- wmb();
-
- printk(KERN_INFO "Intel machine check architecture supported.\n");
- rdmsr(MSR_IA32_MCG_CAP, l, h);
- if (l & (1<<8)) /* Control register present ? */
- wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
- nr_mce_banks = l & 0xff;
-
- for (i = 0; i < nr_mce_banks; i++) {
- wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
- wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
- }
-
- set_in_cr4(X86_CR4_MCE);
- printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
- smp_processor_id());
-
- /* Check for P4/Xeon extended MCE MSRs */
- rdmsr(MSR_IA32_MCG_CAP, l, h);
- if (l & (1<<9)) {/* MCG_EXT_P */
- mce_num_extended_msrs = (l >> 16) & 0xff;
- printk(KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)"
- " available\n",
- smp_processor_id(), mce_num_extended_msrs);
-
-#ifdef CONFIG_X86_MCE_P4THERMAL
- /* Check for P4/Xeon Thermal monitor */
- intel_init_thermal(c);
-#endif
- }
-}
diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c
deleted file mode 100644
index 01e4f8178183..000000000000
--- a/arch/x86/kernel/cpu/mcheck/p6.c
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * P6 specific Machine Check Exception Reporting
- * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
- */
-#include <linux/interrupt.h>
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-
-#include <asm/processor.h>
-#include <asm/system.h>
-#include <asm/mce.h>
-#include <asm/msr.h>
-
-/* Machine Check Handler For PII/PIII */
-static void intel_machine_check(struct pt_regs *regs, long error_code)
-{
- u32 alow, ahigh, high, low;
- u32 mcgstl, mcgsth;
- int recover = 1;
- int i;
-
- rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
- if (mcgstl & (1<<0)) /* Recoverable ? */
- recover = 0;
-
- printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
- smp_processor_id(), mcgsth, mcgstl);
-
- for (i = 0; i < nr_mce_banks; i++) {
- rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
- if (high & (1<<31)) {
- char misc[20];
- char addr[24];
-
- misc[0] = '\0';
- addr[0] = '\0';
-
- if (high & (1<<29))
- recover |= 1;
- if (high & (1<<25))
- recover |= 2;
- high &= ~(1<<31);
-
- if (high & (1<<27)) {
- rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
- snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
- }
- if (high & (1<<26)) {
- rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
- snprintf(addr, 24, " at %08x%08x", ahigh, alow);
- }
-
- printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
- smp_processor_id(), i, high, low, misc, addr);
- }
- }
-
- if (recover & 2)
- panic("CPU context corrupt");
- if (recover & 1)
- panic("Unable to continue");
-
- printk(KERN_EMERG "Attempting to continue.\n");
- /*
- * Do not clear the MSR_IA32_MCi_STATUS if the error is not
- * recoverable/continuable. This will allow the BIOS to look at the MSRs
- * for errors if the OS could not log the error:
- */
- for (i = 0; i < nr_mce_banks; i++) {
- unsigned int msr;
-
- msr = MSR_IA32_MC0_STATUS+i*4;
- rdmsr(msr, low, high);
- if (high & (1<<31)) {
- /* Clear it: */
- wrmsr(msr, 0UL, 0UL);
- /* Serialize: */
- wmb();
- add_taint(TAINT_MACHINE_CHECK);
- }
- }
- mcgstl &= ~(1<<2);
- wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
-}
-
-/* Set up machine check reporting for processors with Intel style MCE: */
-void intel_p6_mcheck_init(struct cpuinfo_x86 *c)
-{
- u32 l, h;
- int i;
-
- /* Check for MCE support */
- if (!cpu_has(c, X86_FEATURE_MCE))
- return;
-
- /* Check for PPro style MCA */
- if (!cpu_has(c, X86_FEATURE_MCA))
- return;
-
- /* Ok machine check is available */
- machine_check_vector = intel_machine_check;
- /* Make sure the vector pointer is visible before we enable MCEs: */
- wmb();
-
- printk(KERN_INFO "Intel machine check architecture supported.\n");
- rdmsr(MSR_IA32_MCG_CAP, l, h);
- if (l & (1<<8)) /* Control register present ? */
- wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
- nr_mce_banks = l & 0xff;
-
- /*
- * Following the example in IA-32 SDM Vol 3:
- * - MC0_CTL should not be written
- * - Status registers on all banks should be cleared on reset
- */
- for (i = 1; i < nr_mce_banks; i++)
- wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
-
- for (i = 0; i < nr_mce_banks; i++)
- wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
-
- set_in_cr4(X86_CR4_MCE);
- printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
- smp_processor_id());
-}
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
deleted file mode 100644
index 8bc64cfbe936..000000000000
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ /dev/null
@@ -1,293 +0,0 @@
-/*
- * Thermal throttle event support code (such as syslog messaging and rate
- * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c).
- *
- * This allows consistent reporting of CPU thermal throttle events.
- *
- * Maintains a counter in /sys that keeps track of the number of thermal
- * events, such that the user knows how bad the thermal problem might be
- * (since the logging to syslog and mcelog is rate limited).
- *
- * Author: Dmitriy Zavin (dmitriyz@google.com)
- *
- * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c.
- * Inspired by Ross Biro's and Al Borchers' counter code.
- */
-#include <linux/interrupt.h>
-#include <linux/notifier.h>
-#include <linux/jiffies.h>
-#include <linux/kernel.h>
-#include <linux/percpu.h>
-#include <linux/sysdev.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/cpu.h>
-
-#include <asm/processor.h>
-#include <asm/system.h>
-#include <asm/apic.h>
-#include <asm/idle.h>
-#include <asm/mce.h>
-#include <asm/msr.h>
-
-/* How long to wait between reporting thermal events */
-#define CHECK_INTERVAL (300 * HZ)
-
-static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES;
-static DEFINE_PER_CPU(unsigned long, thermal_throttle_count);
-static DEFINE_PER_CPU(bool, thermal_throttle_active);
-
-static atomic_t therm_throt_en = ATOMIC_INIT(0);
-
-#ifdef CONFIG_SYSFS
-#define define_therm_throt_sysdev_one_ro(_name) \
- static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
-
-#define define_therm_throt_sysdev_show_func(name) \
-static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \
- struct sysdev_attribute *attr, \
- char *buf) \
-{ \
- unsigned int cpu = dev->id; \
- ssize_t ret; \
- \
- preempt_disable(); /* CPU hotplug */ \
- if (cpu_online(cpu)) \
- ret = sprintf(buf, "%lu\n", \
- per_cpu(thermal_throttle_##name, cpu)); \
- else \
- ret = 0; \
- preempt_enable(); \
- \
- return ret; \
-}
-
-define_therm_throt_sysdev_show_func(count);
-define_therm_throt_sysdev_one_ro(count);
-
-static struct attribute *thermal_throttle_attrs[] = {
- &attr_count.attr,
- NULL
-};
-
-static struct attribute_group thermal_throttle_attr_group = {
- .attrs = thermal_throttle_attrs,
- .name = "thermal_throttle"
-};
-#endif /* CONFIG_SYSFS */
-
-/***
- * therm_throt_process - Process thermal throttling event from interrupt
- * @curr: Whether the condition is current or not (boolean), since the
- * thermal interrupt normally gets called both when the thermal
- * event begins and once the event has ended.
- *
- * This function is called by the thermal interrupt after the
- * IRQ has been acknowledged.
- *
- * It will take care of rate limiting and printing messages to the syslog.
- *
- * Returns: 0 : Event should NOT be further logged, i.e. still in
- * "timeout" from previous log message.
- * 1 : Event should be logged further, and a message has been
- * printed to the syslog.
- */
-static int therm_throt_process(int curr)
-{
- unsigned int cpu = smp_processor_id();
- __u64 tmp_jiffs = get_jiffies_64();
- bool was_throttled = __get_cpu_var(thermal_throttle_active);
- bool is_throttled = __get_cpu_var(thermal_throttle_active) = curr;
-
- if (is_throttled)
- __get_cpu_var(thermal_throttle_count)++;
-
- if (!(was_throttled ^ is_throttled) &&
- time_before64(tmp_jiffs, __get_cpu_var(next_check)))
- return 0;
-
- __get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL;
-
- /* if we just entered the thermal event */
- if (is_throttled) {
- printk(KERN_CRIT "CPU%d: Temperature above threshold, "
- "cpu clock throttled (total events = %lu)\n",
- cpu, __get_cpu_var(thermal_throttle_count));
-
- add_taint(TAINT_MACHINE_CHECK);
- } else if (was_throttled) {
- printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu);
- }
-
- return 1;
-}
-
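
The suppression condition in therm_throt_process() is easiest to read back to front: a message is emitted whenever the throttle state changed, or when it did not change but the per-CPU hold-off interval has expired. The same predicate written out directly (illustrative only):

/* Illustrative only: when may a thermal event be logged again? */
static bool may_log_again(bool was_throttled, bool is_throttled,
			  u64 now, u64 next_check)
{
	if (was_throttled != is_throttled)	/* state changed: always log */
		return true;
	return !time_before64(now, next_check);	/* hold-off expired */
}
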
-#ifdef CONFIG_SYSFS
-/* Add/Remove thermal_throttle interface for CPU device: */
-static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev)
-{
- return sysfs_create_group(&sys_dev->kobj,
- &thermal_throttle_attr_group);
-}
-
-static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
-{
- sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group);
-}
-
-/* Mutex protecting device creation against CPU hotplug: */
-static DEFINE_MUTEX(therm_cpu_lock);
-
-/* Get notified when a cpu comes on/off. Be hotplug friendly. */
-static __cpuinit int
-thermal_throttle_cpu_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
-{
- unsigned int cpu = (unsigned long)hcpu;
- struct sys_device *sys_dev;
- int err = 0;
-
- sys_dev = get_cpu_sysdev(cpu);
-
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- mutex_lock(&therm_cpu_lock);
- err = thermal_throttle_add_dev(sys_dev);
- mutex_unlock(&therm_cpu_lock);
- WARN_ON(err);
- break;
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- mutex_lock(&therm_cpu_lock);
- thermal_throttle_remove_dev(sys_dev);
- mutex_unlock(&therm_cpu_lock);
- break;
- }
- return err ? NOTIFY_BAD : NOTIFY_OK;
-}
-
-static struct notifier_block thermal_throttle_cpu_notifier __cpuinitdata =
-{
- .notifier_call = thermal_throttle_cpu_callback,
-};
-
-static __init int thermal_throttle_init_device(void)
-{
- unsigned int cpu = 0;
- int err;
-
- if (!atomic_read(&therm_throt_en))
- return 0;
-
- register_hotcpu_notifier(&thermal_throttle_cpu_notifier);
-
-#ifdef CONFIG_HOTPLUG_CPU
- mutex_lock(&therm_cpu_lock);
-#endif
- /* connect live CPUs to sysfs */
- for_each_online_cpu(cpu) {
- err = thermal_throttle_add_dev(get_cpu_sysdev(cpu));
- WARN_ON(err);
- }
-#ifdef CONFIG_HOTPLUG_CPU
- mutex_unlock(&therm_cpu_lock);
-#endif
-
- return 0;
-}
-device_initcall(thermal_throttle_init_device);
-
-#endif /* CONFIG_SYSFS */
-
-/* Thermal transition interrupt handler */
-static void intel_thermal_interrupt(void)
-{
- __u64 msr_val;
-
- rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
- if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT))
- mce_log_therm_throt_event(msr_val);
-}
-
-static void unexpected_thermal_interrupt(void)
-{
- printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n",
- smp_processor_id());
- add_taint(TAINT_MACHINE_CHECK);
-}
-
-static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
-
-asmlinkage void smp_thermal_interrupt(struct pt_regs *regs)
-{
- exit_idle();
- irq_enter();
- inc_irq_stat(irq_thermal_count);
- smp_thermal_vector();
- irq_exit();
- /* Ack only at the end to avoid potential reentry */
- ack_APIC_irq();
-}
-
-void intel_init_thermal(struct cpuinfo_x86 *c)
-{
- unsigned int cpu = smp_processor_id();
- int tm2 = 0;
- u32 l, h;
-
- /* Thermal monitoring depends on ACPI and clock modulation*/
- if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
- return;
-
- /*
- * First check if it's enabled already, in which case there might
- * be some SMM goo which handles it, so we can't even put a handler
- * since it might be delivered via SMI already:
- */
- rdmsr(MSR_IA32_MISC_ENABLE, l, h);
- h = apic_read(APIC_LVTTHMR);
- if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
- printk(KERN_DEBUG
- "CPU%d: Thermal monitoring handled by SMI\n", cpu);
- return;
- }
-
- if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2))
- tm2 = 1;
-
- /* Check whether a vector already exists */
- if (h & APIC_VECTOR_MASK) {
- printk(KERN_DEBUG
- "CPU%d: Thermal LVT vector (%#x) already installed\n",
- cpu, (h & APIC_VECTOR_MASK));
- return;
- }
-
- /* We'll mask the thermal vector in the lapic till we're ready: */
- h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
- apic_write(APIC_LVTTHMR, h);
-
- rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
- wrmsr(MSR_IA32_THERM_INTERRUPT,
- l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
-
- smp_thermal_vector = intel_thermal_interrupt;
-
- rdmsr(MSR_IA32_MISC_ENABLE, l, h);
- wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
-
- /* Unmask the thermal vector: */
- l = apic_read(APIC_LVTTHMR);
- apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
-
- printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
- cpu, tm2 ? "TM2" : "TM1");
-
- /* enable thermal throttle processing */
- atomic_set(&therm_throt_en, 1);
-}
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
deleted file mode 100644
index d746df2909c9..000000000000
--- a/arch/x86/kernel/cpu/mcheck/threshold.c
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Common corrected MCE threshold handler code:
- */
-#include <linux/interrupt.h>
-#include <linux/kernel.h>
-
-#include <asm/irq_vectors.h>
-#include <asm/apic.h>
-#include <asm/idle.h>
-#include <asm/mce.h>
-
-static void default_threshold_interrupt(void)
-{
- printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n",
- THRESHOLD_APIC_VECTOR);
-}
-
-void (*mce_threshold_vector)(void) = default_threshold_interrupt;
-
-asmlinkage void smp_threshold_interrupt(void)
-{
- exit_idle();
- irq_enter();
- inc_irq_stat(irq_threshold_count);
- mce_threshold_vector();
- irq_exit();
- /* Ack only at the end to avoid potential reentry */
- ack_APIC_irq();
-}
diff --git a/arch/x86/kernel/cpu/microcode/Makefile b/arch/x86/kernel/cpu/microcode/Makefile
new file mode 100644
index 000000000000..193d98b33a0a
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/Makefile
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0-only
+microcode-y := core.o
+obj-$(CONFIG_MICROCODE) += microcode.o
+microcode-$(CONFIG_CPU_SUP_INTEL) += intel.o
+microcode-$(CONFIG_CPU_SUP_AMD) += amd.o
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
new file mode 100644
index 000000000000..3821a985f4ff
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -0,0 +1,1306 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AMD CPU Microcode Update Driver for Linux
+ *
+ * This driver allows upgrading microcode on AMD family 10h (F10h)
+ * CPUs and later.
+ *
+ * Copyright (C) 2008-2011 Advanced Micro Devices Inc.
+ * 2013-2018 Borislav Petkov <bp@alien8.de>
+ *
+ * Author: Peter Oruba <peter.oruba@amd.com>
+ *
+ * Based on work by:
+ * Tigran Aivazian <aivazian.tigran@gmail.com>
+ *
+ * early loader:
+ * Copyright (C) 2013 Advanced Micro Devices, Inc.
+ *
+ * Author: Jacob Shin <jacob.shin@amd.com>
+ * Fixes: Borislav Petkov <bp@suse.de>
+ */
+#define pr_fmt(fmt) "microcode: " fmt
+
+#include <linux/earlycpio.h>
+#include <linux/firmware.h>
+#include <linux/bsearch.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <linux/initrd.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+
+#include <crypto/sha2.h>
+
+#include <asm/microcode.h>
+#include <asm/processor.h>
+#include <asm/cmdline.h>
+#include <asm/setup.h>
+#include <asm/cpu.h>
+#include <asm/msr.h>
+#include <asm/tlb.h>
+
+#include "internal.h"
+
+struct ucode_patch {
+ struct list_head plist;
+ void *data;
+ unsigned int size;
+ u32 patch_id;
+ u16 equiv_cpu;
+};
+
+static LIST_HEAD(microcode_cache);
+
+#define UCODE_MAGIC 0x00414d44
+#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000
+#define UCODE_UCODE_TYPE 0x00000001
+
+#define SECTION_HDR_SIZE 8
+#define CONTAINER_HDR_SZ 12
+
+struct equiv_cpu_entry {
+ u32 installed_cpu;
+ u32 fixed_errata_mask;
+ u32 fixed_errata_compare;
+ u16 equiv_cpu;
+ u16 res;
+} __packed;
+
+struct microcode_header_amd {
+ u32 data_code;
+ u32 patch_id;
+ u16 mc_patch_data_id;
+ u8 mc_patch_data_len;
+ u8 init_flag;
+ u32 mc_patch_data_checksum;
+ u32 nb_dev_id;
+ u32 sb_dev_id;
+ u16 processor_rev_id;
+ u8 nb_rev_id;
+ u8 sb_rev_id;
+ u8 bios_api_rev;
+ u8 reserved1[3];
+ u32 match_reg[8];
+} __packed;
+
+struct microcode_amd {
+ struct microcode_header_amd hdr;
+ unsigned int mpb[];
+};
+
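
Pieced together from the constants and structures above, a container as found in the firmware file or initrd looks roughly like this (a sketch of the layout as this parser consumes it, not a restatement of AMD's specification):

/*
 * Illustrative container layout:
 *
 *   offset  0: u32 UCODE_MAGIC                      \
 *   offset  4: u32 UCODE_EQUIV_CPU_TABLE_TYPE        >  CONTAINER_HDR_SZ (12)
 *   offset  8: u32 equivalence table size in bytes  /
 *   offset 12: struct equiv_cpu_entry[]             (the equivalence table)
 *   then, repeated until the container ends:
 *     u32 UCODE_UCODE_TYPE, u32 patch size          -> SECTION_HDR_SIZE (8)
 *     struct microcode_amd                          (header plus patch data)
 *
 * Several such containers may be glued back to back in one blob.
 */
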
+static struct equiv_cpu_table {
+ unsigned int num_entries;
+ struct equiv_cpu_entry *entry;
+} equiv_table;
+
+union zen_patch_rev {
+ struct {
+ __u32 rev : 8,
+ stepping : 4,
+ model : 4,
+ __reserved : 4,
+ ext_model : 4,
+ ext_fam : 8;
+ };
+ __u32 ucode_rev;
+};
+
+union cpuid_1_eax {
+ struct {
+ __u32 stepping : 4,
+ model : 4,
+ family : 4,
+ __reserved0 : 4,
+ ext_model : 4,
+ ext_fam : 8,
+ __reserved1 : 4;
+ };
+ __u32 full;
+};
+
+/*
+ * This points to the current valid container of microcode patches which we will
+ * save from the initrd/builtin before jettisoning its contents. @mc is the
+ * microcode patch we found to match.
+ */
+struct cont_desc {
+ struct microcode_amd *mc;
+ u32 psize;
+ u8 *data;
+ size_t size;
+};
+
+/*
+ * Microcode patch container file is prepended to the initrd in cpio
+ * format. See Documentation/arch/x86/microcode.rst
+ */
+static const char
+ucode_path[] __maybe_unused = "kernel/x86/microcode/AuthenticAMD.bin";
+
+/*
+ * This is CPUID(1).EAX on the BSP. It is used in two ways:
+ *
+ * 1. To ignore the equivalence table on Zen1 and newer.
+ *
+ * 2. To match which patches to load, because the patch revision ID
+ *    already contains the f/m/s for which the microcode is destined.
+ */
+static u32 bsp_cpuid_1_eax __ro_after_init;
+
+static bool sha_check = true;
+
+struct patch_digest {
+ u32 patch_id;
+ u8 sha256[SHA256_DIGEST_SIZE];
+};
+
+#include "amd_shas.c"
+
+static int cmp_id(const void *key, const void *elem)
+{
+ struct patch_digest *pd = (struct patch_digest *)elem;
+ u32 patch_id = *(u32 *)key;
+
+ if (patch_id == pd->patch_id)
+ return 0;
+ else if (patch_id < pd->patch_id)
+ return -1;
+ else
+ return 1;
+}
+
+static u32 cpuid_to_ucode_rev(unsigned int val)
+{
+ union zen_patch_rev p = {};
+ union cpuid_1_eax c;
+
+ c.full = val;
+
+ p.stepping = c.stepping;
+ p.model = c.model;
+ p.ext_model = c.ext_model;
+ p.ext_fam = c.ext_fam;
+
+ return p.ucode_rev;
+}
+
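
On Zen and newer parts the patch revision itself encodes the target CPU: bits 31:24 carry the extended family, 23:20 the extended model, 15:12 the model, 11:8 the stepping, and 7:0 the actual revision counter. The bitfield unions above express that on x86's little-endian layout; an equivalent open-coded version of the mapping, shown only for illustration:

/* Illustrative only: CPUID(1).EAX -> lowest possible Zen patch revision. */
static u32 cpuid_to_ucode_rev_open_coded(u32 eax)
{
	u32 stepping  = (eax >>  0) & 0xf;
	u32 model     = (eax >>  4) & 0xf;
	u32 ext_model = (eax >> 16) & 0xf;
	u32 ext_fam   = (eax >> 20) & 0xff;

	/* e.g. CPUID(1).EAX == 0x00a20f12 (family 19h) yields 0x0a201200 */
	return (ext_fam << 24) | (ext_model << 20) | (model << 12) | (stepping << 8);
}
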
+static u32 get_cutoff_revision(u32 rev)
+{
+ switch (rev >> 8) {
+ case 0x80012: return 0x8001277; break;
+ case 0x80082: return 0x800820f; break;
+ case 0x83010: return 0x830107c; break;
+ case 0x86001: return 0x860010e; break;
+ case 0x86081: return 0x8608108; break;
+ case 0x87010: return 0x8701034; break;
+ case 0x8a000: return 0x8a0000a; break;
+ case 0xa0010: return 0xa00107a; break;
+ case 0xa0011: return 0xa0011da; break;
+ case 0xa0012: return 0xa001243; break;
+ case 0xa0082: return 0xa00820e; break;
+ case 0xa1011: return 0xa101153; break;
+ case 0xa1012: return 0xa10124e; break;
+ case 0xa1081: return 0xa108109; break;
+ case 0xa2010: return 0xa20102f; break;
+ case 0xa2012: return 0xa201212; break;
+ case 0xa4041: return 0xa404109; break;
+ case 0xa5000: return 0xa500013; break;
+ case 0xa6012: return 0xa60120a; break;
+ case 0xa7041: return 0xa704109; break;
+ case 0xa7052: return 0xa705208; break;
+ case 0xa7080: return 0xa708009; break;
+	case 0xa70c0: return 0xa70c009; break;
+ case 0xaa001: return 0xaa00116; break;
+ case 0xaa002: return 0xaa00218; break;
+ case 0xb0021: return 0xb002146; break;
+ case 0xb0081: return 0xb008111; break;
+ case 0xb1010: return 0xb101046; break;
+ case 0xb2040: return 0xb204031; break;
+ case 0xb4040: return 0xb404031; break;
+ case 0xb4041: return 0xb404101; break;
+ case 0xb6000: return 0xb600031; break;
+ case 0xb6080: return 0xb608031; break;
+ case 0xb7000: return 0xb700031; break;
+ default: break;
+
+ }
+ return 0;
+}
+
+static bool need_sha_check(u32 cur_rev)
+{
+ u32 cutoff;
+
+ if (!cur_rev) {
+ cur_rev = cpuid_to_ucode_rev(bsp_cpuid_1_eax);
+ pr_info_once("No current revision, generating the lowest one: 0x%x\n", cur_rev);
+ }
+
+ cutoff = get_cutoff_revision(cur_rev);
+ if (cutoff)
+ return cur_rev <= cutoff;
+
+ pr_info("You should not be seeing this. Please send the following couple of lines to x86-<at>-kernel.org\n");
+ pr_info("CPUID(1).EAX: 0x%x, current revision: 0x%x\n", bsp_cpuid_1_eax, cur_rev);
+ return true;
+}
+
+static bool cpu_has_entrysign(void)
+{
+ unsigned int fam = x86_family(bsp_cpuid_1_eax);
+ unsigned int model = x86_model(bsp_cpuid_1_eax);
+
+ if (fam == 0x17 || fam == 0x19)
+ return true;
+
+ if (fam == 0x1a) {
+ if (model <= 0x2f ||
+ (0x40 <= model && model <= 0x4f) ||
+ (0x60 <= model && model <= 0x6f))
+ return true;
+ }
+
+ return false;
+}
+
+static bool verify_sha256_digest(u32 patch_id, u32 cur_rev, const u8 *data, unsigned int len)
+{
+ struct patch_digest *pd = NULL;
+ u8 digest[SHA256_DIGEST_SIZE];
+ int i;
+
+ if (!cpu_has_entrysign())
+ return true;
+
+ if (!need_sha_check(cur_rev))
+ return true;
+
+ if (!sha_check)
+ return true;
+
+ pd = bsearch(&patch_id, phashes, ARRAY_SIZE(phashes), sizeof(struct patch_digest), cmp_id);
+ if (!pd) {
+ pr_err("No sha256 digest for patch ID: 0x%x found\n", patch_id);
+ return false;
+ }
+
+ sha256(data, len, digest);
+
+ if (memcmp(digest, pd->sha256, sizeof(digest))) {
+ pr_err("Patch 0x%x SHA256 digest mismatch!\n", patch_id);
+
+ for (i = 0; i < SHA256_DIGEST_SIZE; i++)
+ pr_cont("0x%x ", digest[i]);
+ pr_info("\n");
+
+ return false;
+ }
+
+ return true;
+}
+
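
The digest lookup relies on phashes[] (from amd_shas.c) being sorted by patch_id, which is what makes the bsearch() with cmp_id valid; new entries have to preserve that ordering. The lookup in isolation (illustrative):

/* Illustrative only: find the expected SHA-256 digest for a patch ID. */
static const struct patch_digest *lookup_digest(u32 patch_id)
{
	return bsearch(&patch_id, phashes, ARRAY_SIZE(phashes),
		       sizeof(phashes[0]), cmp_id);
}
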
+static union cpuid_1_eax ucode_rev_to_cpuid(unsigned int val)
+{
+ union zen_patch_rev p;
+ union cpuid_1_eax c;
+
+ p.ucode_rev = val;
+ c.full = 0;
+
+ c.stepping = p.stepping;
+ c.model = p.model;
+ c.ext_model = p.ext_model;
+ c.family = 0xf;
+ c.ext_fam = p.ext_fam;
+
+ return c;
+}
+
+static u32 get_patch_level(void)
+{
+ u32 rev, dummy __always_unused;
+
+ if (IS_ENABLED(CONFIG_MICROCODE_DBG)) {
+ int cpu = smp_processor_id();
+
+ if (!microcode_rev[cpu]) {
+ if (!base_rev)
+ base_rev = cpuid_to_ucode_rev(bsp_cpuid_1_eax);
+
+ microcode_rev[cpu] = base_rev;
+
+ ucode_dbg("CPU%d, base_rev: 0x%x\n", cpu, base_rev);
+ }
+
+ return microcode_rev[cpu];
+ }
+
+ native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
+
+ return rev;
+}
+
+static u16 find_equiv_id(struct equiv_cpu_table *et, u32 sig)
+{
+ unsigned int i;
+
+ /* Zen and newer do not need an equivalence table. */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17)
+ return 0;
+
+ if (!et || !et->num_entries)
+ return 0;
+
+ for (i = 0; i < et->num_entries; i++) {
+ struct equiv_cpu_entry *e = &et->entry[i];
+
+ if (sig == e->installed_cpu)
+ return e->equiv_cpu;
+ }
+ return 0;
+}
+
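
Pre-Zen parts therefore go through a two-step match: CPUID(1).EAX is first translated to an equivalence ID via the table shipped in the container, and that ID is later compared against the patch header's processor_rev_id. A toy table illustrating the translation (all values made up for the example):

/* Illustrative only: a hypothetical two-entry equivalence table. */
static struct equiv_cpu_entry toy_entries[] = {
	{ .installed_cpu = 0x00100f22, .equiv_cpu = 0x1022 },	/* made-up IDs */
	{ .installed_cpu = 0x00100f43, .equiv_cpu = 0x1043 },
};
static struct equiv_cpu_table toy_table = { .num_entries = 2, .entry = toy_entries };

/* find_equiv_id(&toy_table, 0x00100f22) would return 0x1022 on a pre-Zen CPU. */
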
+/*
+ * Check whether there is a valid microcode container file at the beginning
+ * of @buf of size @buf_size.
+ */
+static bool verify_container(const u8 *buf, size_t buf_size)
+{
+ u32 cont_magic;
+
+ if (buf_size <= CONTAINER_HDR_SZ) {
+ ucode_dbg("Truncated microcode container header.\n");
+ return false;
+ }
+
+ cont_magic = *(const u32 *)buf;
+ if (cont_magic != UCODE_MAGIC) {
+ ucode_dbg("Invalid magic value (0x%08x).\n", cont_magic);
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Check whether there is a valid, non-truncated CPU equivalence table at the
+ * beginning of @buf of size @buf_size.
+ */
+static bool verify_equivalence_table(const u8 *buf, size_t buf_size)
+{
+ const u32 *hdr = (const u32 *)buf;
+ u32 cont_type, equiv_tbl_len;
+
+ if (!verify_container(buf, buf_size))
+ return false;
+
+ /* Zen and newer do not need an equivalence table. */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17)
+ return true;
+
+ cont_type = hdr[1];
+ if (cont_type != UCODE_EQUIV_CPU_TABLE_TYPE) {
+ ucode_dbg("Wrong microcode container equivalence table type: %u.\n",
+ cont_type);
+ return false;
+ }
+
+ buf_size -= CONTAINER_HDR_SZ;
+
+ equiv_tbl_len = hdr[2];
+ if (equiv_tbl_len < sizeof(struct equiv_cpu_entry) ||
+ buf_size < equiv_tbl_len) {
+ ucode_dbg("Truncated equivalence table.\n");
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Check whether there is a valid, non-truncated microcode patch section at the
+ * beginning of @buf of size @buf_size.
+ *
+ * On success, @sh_psize returns the patch size according to the section header,
+ * to the caller.
+ */
+static bool __verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize)
+{
+ u32 p_type, p_size;
+ const u32 *hdr;
+
+ if (buf_size < SECTION_HDR_SIZE) {
+ ucode_dbg("Truncated patch section.\n");
+ return false;
+ }
+
+ hdr = (const u32 *)buf;
+ p_type = hdr[0];
+ p_size = hdr[1];
+
+ if (p_type != UCODE_UCODE_TYPE) {
+ ucode_dbg("Invalid type field (0x%x) in container file section header.\n",
+ p_type);
+ return false;
+ }
+
+ if (p_size < sizeof(struct microcode_header_amd)) {
+ ucode_dbg("Patch of size %u too short.\n", p_size);
+ return false;
+ }
+
+ *sh_psize = p_size;
+
+ return true;
+}
+
+/*
+ * Check whether the passed remaining file @buf_size is large enough to contain
+ * a patch of the indicated @sh_psize (and also whether this size does not
+ * exceed the per-family maximum). @sh_psize is the size read from the section
+ * header.
+ */
+static bool __verify_patch_size(u32 sh_psize, size_t buf_size)
+{
+ u8 family = x86_family(bsp_cpuid_1_eax);
+ u32 max_size;
+
+ if (family >= 0x15)
+ goto ret;
+
+#define F1XH_MPB_MAX_SIZE 2048
+#define F14H_MPB_MAX_SIZE 1824
+
+ switch (family) {
+ case 0x10 ... 0x12:
+ max_size = F1XH_MPB_MAX_SIZE;
+ break;
+ case 0x14:
+ max_size = F14H_MPB_MAX_SIZE;
+ break;
+ default:
+ WARN(1, "%s: WTF family: 0x%x\n", __func__, family);
+ return false;
+ }
+
+ if (sh_psize > max_size)
+ return false;
+
+ret:
+ /* Working with the whole buffer so < is ok. */
+ return sh_psize <= buf_size;
+}
+
+/*
+ * Verify the patch in @buf.
+ *
+ * Returns:
+ * negative: on error
+ * positive: patch is not for this family, skip it
+ * 0: success
+ */
+static int verify_patch(const u8 *buf, size_t buf_size, u32 *patch_size)
+{
+ u8 family = x86_family(bsp_cpuid_1_eax);
+ struct microcode_header_amd *mc_hdr;
+ u32 cur_rev, cutoff, patch_rev;
+ u32 sh_psize;
+ u16 proc_id;
+ u8 patch_fam;
+
+ if (!__verify_patch_section(buf, buf_size, &sh_psize))
+ return -1;
+
+ /*
+ * The section header length is not included in this indicated size
+ * but is present in the leftover file length so we need to subtract
+ * it before passing this value to the function below.
+ */
+ buf_size -= SECTION_HDR_SIZE;
+
+ /*
+ * Check if the remaining buffer is big enough to contain a patch of
+ * size sh_psize, as the section claims.
+ */
+ if (buf_size < sh_psize) {
+ ucode_dbg("Patch of size %u truncated.\n", sh_psize);
+ return -1;
+ }
+
+ if (!__verify_patch_size(sh_psize, buf_size)) {
+ ucode_dbg("Per-family patch size mismatch.\n");
+ return -1;
+ }
+
+ *patch_size = sh_psize;
+
+ mc_hdr = (struct microcode_header_amd *)(buf + SECTION_HDR_SIZE);
+ if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) {
+ pr_err("Patch-ID 0x%08x: chipset-specific code unsupported.\n", mc_hdr->patch_id);
+ return -1;
+ }
+
+ proc_id = mc_hdr->processor_rev_id;
+ patch_fam = 0xf + (proc_id >> 12);
+
+ if (patch_fam != family)
+ return 1;
+
+ cur_rev = get_patch_level();
+
+ /* No cutoff revision means old/unaffected by signing algorithm weakness => matches */
+ cutoff = get_cutoff_revision(cur_rev);
+ if (!cutoff)
+ goto ok;
+
+ patch_rev = mc_hdr->patch_id;
+
+ ucode_dbg("cur_rev: 0x%x, cutoff: 0x%x, patch_rev: 0x%x\n",
+ cur_rev, cutoff, patch_rev);
+
+ if (cur_rev <= cutoff && patch_rev <= cutoff)
+ goto ok;
+
+ if (cur_rev > cutoff && patch_rev > cutoff)
+ goto ok;
+
+ return 1;
+
+ok:
+ ucode_dbg("Patch-ID 0x%08x: family: 0x%x\n", mc_hdr->patch_id, patch_fam);
+
+ return 0;
+}
+
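
The cutoff handling above boils down to a single rule: when a per-family cutoff revision exists, the currently running revision and the candidate patch revision must lie on the same side of it, otherwise the patch is skipped. As a standalone predicate (a sketch equivalent to the two goto-ok branches above):

/* Illustrative only: may a patch with patch_rev be applied on top of cur_rev? */
static bool revs_compatible(u32 cur_rev, u32 patch_rev)
{
	u32 cutoff = get_cutoff_revision(cur_rev);

	if (!cutoff)		/* no cutoff: family unaffected by the signing issue */
		return true;

	return (cur_rev <= cutoff) == (patch_rev <= cutoff);
}
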
+static bool mc_patch_matches(struct microcode_amd *mc, u16 eq_id)
+{
+ /* Zen and newer do not need an equivalence table. */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17)
+ return ucode_rev_to_cpuid(mc->hdr.patch_id).full == bsp_cpuid_1_eax;
+ else
+ return eq_id == mc->hdr.processor_rev_id;
+}
+
+/*
+ * This scans the ucode blob for the proper container as we can have multiple
+ * containers glued together.
+ *
+ * Returns the number of bytes consumed while scanning. @desc contains all the
+ * data we're going to use in later stages of the application.
+ */
+static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc)
+{
+ struct equiv_cpu_table table;
+ size_t orig_size = size;
+ u32 *hdr = (u32 *)ucode;
+ u16 eq_id;
+ u8 *buf;
+
+ if (!verify_equivalence_table(ucode, size))
+ return 0;
+
+ buf = ucode;
+
+ table.entry = (struct equiv_cpu_entry *)(buf + CONTAINER_HDR_SZ);
+ table.num_entries = hdr[2] / sizeof(struct equiv_cpu_entry);
+
+ /*
+ * Find the equivalence ID of our CPU in this table. Even if this table
+ * doesn't contain a patch for the CPU, scan through the whole container
+ * so that it can be skipped in case there are other containers appended.
+ */
+ eq_id = find_equiv_id(&table, bsp_cpuid_1_eax);
+
+ buf += hdr[2] + CONTAINER_HDR_SZ;
+ size -= hdr[2] + CONTAINER_HDR_SZ;
+
+ /*
+ * Scan through the rest of the container to find where it ends. We do
+ * some basic sanity-checking too.
+ */
+ while (size > 0) {
+ struct microcode_amd *mc;
+ u32 patch_size;
+ int ret;
+
+ ret = verify_patch(buf, size, &patch_size);
+ if (ret < 0) {
+ /*
+ * Patch verification failed, skip to the next container, if
+ * there is one. Before exit, check whether that container has
+ * found a patch already. If so, use it.
+ */
+ goto out;
+ } else if (ret > 0) {
+ goto skip;
+ }
+
+ mc = (struct microcode_amd *)(buf + SECTION_HDR_SIZE);
+
+ if (mc_patch_matches(mc, eq_id)) {
+ desc->psize = patch_size;
+ desc->mc = mc;
+
+ ucode_dbg(" match: size: %d\n", patch_size);
+ }
+
+skip:
+ /* Skip patch section header too: */
+ buf += patch_size + SECTION_HDR_SIZE;
+ size -= patch_size + SECTION_HDR_SIZE;
+ }
+
+out:
+ /*
+ * If we have found a patch (desc->mc), it means we're looking at the
+ * container which has a patch for this CPU so return 0 to mean, @ucode
+ * already points to the proper container. Otherwise, we return the size
+ * we scanned so that we can advance to the next container in the
+ * buffer.
+ */
+ if (desc->mc) {
+ desc->data = ucode;
+ desc->size = orig_size - size;
+
+ return 0;
+ }
+
+ return orig_size - size;
+}
+
+/*
+ * Scan the ucode blob for the proper container as we can have multiple
+ * containers glued together.
+ */
+static void scan_containers(u8 *ucode, size_t size, struct cont_desc *desc)
+{
+ while (size) {
+ size_t s = parse_container(ucode, size, desc);
+ if (!s)
+ return;
+
+ /* catch wraparound */
+ if (size >= s) {
+ ucode += s;
+ size -= s;
+ } else {
+ return;
+ }
+ }
+}
+
+static bool __apply_microcode_amd(struct microcode_amd *mc, u32 *cur_rev,
+ unsigned int psize)
+{
+ unsigned long p_addr = (unsigned long)&mc->hdr.data_code;
+
+ if (!verify_sha256_digest(mc->hdr.patch_id, *cur_rev, (const u8 *)p_addr, psize))
+ return false;
+
+ native_wrmsrq(MSR_AMD64_PATCH_LOADER, p_addr);
+
+ if (x86_family(bsp_cpuid_1_eax) == 0x17) {
+ unsigned long p_addr_end = p_addr + psize - 1;
+
+ invlpg(p_addr);
+
+ /*
+ * Flush next page too if patch image is crossing a page
+ * boundary.
+ */
+ if (p_addr >> PAGE_SHIFT != p_addr_end >> PAGE_SHIFT)
+ invlpg(p_addr_end);
+ }
+
+ if (IS_ENABLED(CONFIG_MICROCODE_DBG))
+ microcode_rev[smp_processor_id()] = mc->hdr.patch_id;
+
+ /* verify patch application was successful */
+ *cur_rev = get_patch_level();
+
+ ucode_dbg("updated rev: 0x%x\n", *cur_rev);
+
+ if (*cur_rev != mc->hdr.patch_id)
+ return false;
+
+ return true;
+}
+
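
The family 17h TLB flush above only touches a second page when the patch image actually straddles a page boundary; the check compares the page frame numbers of the first and last byte. The same test as a small helper (illustrative):

/* Illustrative only: does [addr, addr + len) cross a page boundary? */
static inline bool patch_crosses_page(unsigned long addr, unsigned int len)
{
	return (addr >> PAGE_SHIFT) != ((addr + len - 1) >> PAGE_SHIFT);
}
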
+static bool get_builtin_microcode(struct cpio_data *cp)
+{
+ char fw_name[36] = "amd-ucode/microcode_amd.bin";
+ u8 family = x86_family(bsp_cpuid_1_eax);
+ struct firmware fw;
+
+ if (IS_ENABLED(CONFIG_X86_32))
+ return false;
+
+ if (family >= 0x15)
+ snprintf(fw_name, sizeof(fw_name),
+ "amd-ucode/microcode_amd_fam%02hhxh.bin", family);
+
+ if (firmware_request_builtin(&fw, fw_name)) {
+ cp->size = fw.size;
+ cp->data = (void *)fw.data;
+ return true;
+ }
+
+ return false;
+}
+
+static bool __init find_blobs_in_containers(struct cpio_data *ret)
+{
+ struct cpio_data cp;
+ bool found;
+
+ if (!get_builtin_microcode(&cp))
+ cp = find_microcode_in_initrd(ucode_path);
+
+ found = cp.data && cp.size;
+ if (found)
+ *ret = cp;
+
+ return found;
+}
+
+/*
+ * Early load occurs before we can vmalloc(). So we look for the microcode
+ * patch container file in initrd, traverse equivalent cpu table, look for a
+ * matching microcode patch, and update, all in initrd memory in place.
+ * When vmalloc() is available for use later -- on 64-bit during first AP load,
+ * and on 32-bit during save_microcode_in_initrd() -- we can call
+ * load_microcode_amd() to save equivalent cpu table and microcode patches in
+ * kernel heap memory.
+ */
+void __init load_ucode_amd_bsp(struct early_load_data *ed, unsigned int cpuid_1_eax)
+{
+ struct cont_desc desc = { };
+ struct microcode_amd *mc;
+ struct cpio_data cp = { };
+ char buf[4];
+ u32 rev;
+
+ if (cmdline_find_option(boot_command_line, "microcode.amd_sha_check", buf, 4)) {
+ if (!strncmp(buf, "off", 3)) {
+ sha_check = false;
+ pr_warn_once("It is a very very bad idea to disable the blobs SHA check!\n");
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+ }
+ }
+
+ bsp_cpuid_1_eax = cpuid_1_eax;
+
+ rev = get_patch_level();
+ ed->old_rev = rev;
+
+ /* Needed in load_microcode_amd() */
+ ucode_cpu_info[0].cpu_sig.sig = cpuid_1_eax;
+
+ if (!find_blobs_in_containers(&cp))
+ return;
+
+ scan_containers(cp.data, cp.size, &desc);
+
+ mc = desc.mc;
+ if (!mc)
+ return;
+
+ /*
+ * Allow application of the same revision to pick up SMT-specific
+ * changes even if the revision of the other SMT thread is already
+ * up-to-date.
+ */
+ if (ed->old_rev > mc->hdr.patch_id)
+ return;
+
+ if (__apply_microcode_amd(mc, &rev, desc.psize))
+ ed->new_rev = rev;
+}
+
+static inline bool patch_cpus_equivalent(struct ucode_patch *p,
+ struct ucode_patch *n,
+ bool ignore_stepping)
+{
+ /* Zen and newer hardcode the f/m/s in the patch ID */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17) {
+ union cpuid_1_eax p_cid = ucode_rev_to_cpuid(p->patch_id);
+ union cpuid_1_eax n_cid = ucode_rev_to_cpuid(n->patch_id);
+
+ if (ignore_stepping) {
+ p_cid.stepping = 0;
+ n_cid.stepping = 0;
+ }
+
+ return p_cid.full == n_cid.full;
+ } else {
+ return p->equiv_cpu == n->equiv_cpu;
+ }
+}
+
+/*
+ * a small, trivial cache of per-family ucode patches
+ */
+static struct ucode_patch *cache_find_patch(struct ucode_cpu_info *uci, u16 equiv_cpu)
+{
+ struct ucode_patch *p;
+ struct ucode_patch n;
+
+ n.equiv_cpu = equiv_cpu;
+ n.patch_id = uci->cpu_sig.rev;
+
+ list_for_each_entry(p, &microcode_cache, plist)
+ if (patch_cpus_equivalent(p, &n, false))
+ return p;
+
+ return NULL;
+}
+
+static inline int patch_newer(struct ucode_patch *p, struct ucode_patch *n)
+{
+ /* Zen and newer hardcode the f/m/s in the patch ID */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17) {
+ union zen_patch_rev zp, zn;
+
+ zp.ucode_rev = p->patch_id;
+ zn.ucode_rev = n->patch_id;
+
+ if (zn.stepping != zp.stepping)
+ return -1;
+
+ return zn.rev > zp.rev;
+ } else {
+ return n->patch_id > p->patch_id;
+ }
+}
+
+static void update_cache(struct ucode_patch *new_patch)
+{
+ struct ucode_patch *p;
+ int ret;
+
+ list_for_each_entry(p, &microcode_cache, plist) {
+ if (patch_cpus_equivalent(p, new_patch, true)) {
+ ret = patch_newer(p, new_patch);
+ if (ret < 0)
+ continue;
+ else if (!ret) {
+ /* we already have the latest patch */
+ kfree(new_patch->data);
+ kfree(new_patch);
+ return;
+ }
+
+ list_replace(&p->plist, &new_patch->plist);
+ kfree(p->data);
+ kfree(p);
+ return;
+ }
+ }
+ /* no patch found, add it */
+ list_add_tail(&new_patch->plist, &microcode_cache);
+}
+
+static void free_cache(void)
+{
+ struct ucode_patch *p, *tmp;
+
+ list_for_each_entry_safe(p, tmp, &microcode_cache, plist) {
+ __list_del(p->plist.prev, p->plist.next);
+ kfree(p->data);
+ kfree(p);
+ }
+}
+
+static struct ucode_patch *find_patch(unsigned int cpu)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ u16 equiv_id = 0;
+
+ uci->cpu_sig.rev = get_patch_level();
+
+ if (x86_family(bsp_cpuid_1_eax) < 0x17) {
+ equiv_id = find_equiv_id(&equiv_table, uci->cpu_sig.sig);
+ if (!equiv_id)
+ return NULL;
+ }
+
+ return cache_find_patch(uci, equiv_id);
+}
+
+void reload_ucode_amd(unsigned int cpu)
+{
+ u32 rev, dummy __always_unused;
+ struct microcode_amd *mc;
+ struct ucode_patch *p;
+
+ p = find_patch(cpu);
+ if (!p)
+ return;
+
+ mc = p->data;
+
+ rev = get_patch_level();
+ if (rev < mc->hdr.patch_id) {
+ if (__apply_microcode_amd(mc, &rev, p->size))
+ pr_info_once("reload revision: 0x%08x\n", rev);
+ }
+}
+
+static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ struct ucode_patch *p;
+
+ csig->sig = cpuid_eax(0x00000001);
+ csig->rev = get_patch_level();
+
+ /*
+ * a patch could have been loaded early, set uci->mc so that
+ * mc_bp_resume() can call apply_microcode()
+ */
+ p = find_patch(cpu);
+ if (p && (p->patch_id == csig->rev))
+ uci->mc = p->data;
+
+ return 0;
+}
+
+static enum ucode_state apply_microcode_amd(int cpu)
+{
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ struct microcode_amd *mc_amd;
+ struct ucode_cpu_info *uci;
+ struct ucode_patch *p;
+ enum ucode_state ret;
+ u32 rev;
+
+ BUG_ON(raw_smp_processor_id() != cpu);
+
+ uci = ucode_cpu_info + cpu;
+
+ p = find_patch(cpu);
+ if (!p)
+ return UCODE_NFOUND;
+
+ rev = uci->cpu_sig.rev;
+
+ mc_amd = p->data;
+ uci->mc = p->data;
+
+ /* need to apply patch? */
+ if (rev > mc_amd->hdr.patch_id) {
+ ret = UCODE_OK;
+ goto out;
+ }
+
+ if (!__apply_microcode_amd(mc_amd, &rev, p->size)) {
+ pr_err("CPU%d: update failed for patch_level=0x%08x\n",
+ cpu, mc_amd->hdr.patch_id);
+ return UCODE_ERROR;
+ }
+
+ rev = mc_amd->hdr.patch_id;
+ ret = UCODE_UPDATED;
+
+out:
+ uci->cpu_sig.rev = rev;
+ c->microcode = rev;
+
+ /* Update boot_cpu_data's revision too, if we're on the BSP: */
+ if (c->cpu_index == boot_cpu_data.cpu_index)
+ boot_cpu_data.microcode = rev;
+
+ return ret;
+}
+
+void load_ucode_amd_ap(unsigned int cpuid_1_eax)
+{
+ unsigned int cpu = smp_processor_id();
+
+ ucode_cpu_info[cpu].cpu_sig.sig = cpuid_1_eax;
+ apply_microcode_amd(cpu);
+}
+
+static size_t install_equiv_cpu_table(const u8 *buf, size_t buf_size)
+{
+ u32 equiv_tbl_len;
+ const u32 *hdr;
+
+ if (!verify_equivalence_table(buf, buf_size))
+ return 0;
+
+ hdr = (const u32 *)buf;
+ equiv_tbl_len = hdr[2];
+
+ /* Zen and newer do not need an equivalence table. */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17)
+ goto out;
+
+ equiv_table.entry = vmalloc(equiv_tbl_len);
+ if (!equiv_table.entry) {
+ pr_err("failed to allocate equivalent CPU table\n");
+ return 0;
+ }
+
+ memcpy(equiv_table.entry, buf + CONTAINER_HDR_SZ, equiv_tbl_len);
+ equiv_table.num_entries = equiv_tbl_len / sizeof(struct equiv_cpu_entry);
+
+out:
+ /* add header length */
+ return equiv_tbl_len + CONTAINER_HDR_SZ;
+}
+
+static void free_equiv_cpu_table(void)
+{
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17)
+ return;
+
+ vfree(equiv_table.entry);
+ memset(&equiv_table, 0, sizeof(equiv_table));
+}
+
+static void cleanup(void)
+{
+ free_equiv_cpu_table();
+ free_cache();
+}
+
+/*
+ * Return a non-negative value even if some of the checks failed so that
+ * we can skip over the next patch. If we return a negative value, we
+ * signal a grave error, such as a failed memory allocation, and the
+ * driver cannot continue functioning normally. In such cases, we tear
+ * down everything we've used up so far and exit.
+ */
+static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover,
+ unsigned int *patch_size)
+{
+ struct microcode_header_amd *mc_hdr;
+ struct ucode_patch *patch;
+ u16 proc_id;
+ int ret;
+
+ ret = verify_patch(fw, leftover, patch_size);
+ if (ret)
+ return ret;
+
+ patch = kzalloc(sizeof(*patch), GFP_KERNEL);
+ if (!patch) {
+ pr_err("Patch allocation failure.\n");
+ return -EINVAL;
+ }
+
+ patch->data = kmemdup(fw + SECTION_HDR_SIZE, *patch_size, GFP_KERNEL);
+ if (!patch->data) {
+ pr_err("Patch data allocation failure.\n");
+ kfree(patch);
+ return -EINVAL;
+ }
+ patch->size = *patch_size;
+
+ mc_hdr = (struct microcode_header_amd *)(fw + SECTION_HDR_SIZE);
+ proc_id = mc_hdr->processor_rev_id;
+
+ INIT_LIST_HEAD(&patch->plist);
+ patch->patch_id = mc_hdr->patch_id;
+ patch->equiv_cpu = proc_id;
+
+ ucode_dbg("%s: Adding patch_id: 0x%08x, proc_id: 0x%04x\n",
+ __func__, patch->patch_id, proc_id);
+
+ /* ... and add to cache. */
+ update_cache(patch);
+
+ return 0;
+}
+
+/* Scan the blob in @data and add microcode patches to the cache. */
+static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, size_t size)
+{
+ u8 *fw = (u8 *)data;
+ size_t offset;
+
+ offset = install_equiv_cpu_table(data, size);
+ if (!offset)
+ return UCODE_ERROR;
+
+ fw += offset;
+ size -= offset;
+
+ if (*(u32 *)fw != UCODE_UCODE_TYPE) {
+ pr_err("invalid type field in container file section header\n");
+ free_equiv_cpu_table();
+ return UCODE_ERROR;
+ }
+
+ while (size > 0) {
+ unsigned int crnt_size = 0;
+ int ret;
+
+ ret = verify_and_add_patch(family, fw, size, &crnt_size);
+ if (ret < 0)
+ return UCODE_ERROR;
+
+ fw += crnt_size + SECTION_HDR_SIZE;
+ size -= (crnt_size + SECTION_HDR_SIZE);
+ }
+
+ return UCODE_OK;
+}
+
+static enum ucode_state _load_microcode_amd(u8 family, const u8 *data, size_t size)
+{
+ enum ucode_state ret;
+
+ /* free old equiv table */
+ free_equiv_cpu_table();
+
+ ret = __load_microcode_amd(family, data, size);
+ if (ret != UCODE_OK)
+ cleanup();
+
+ return ret;
+}
+
+static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size)
+{
+ struct cpuinfo_x86 *c;
+ unsigned int nid, cpu;
+ struct ucode_patch *p;
+ enum ucode_state ret;
+
+ ret = _load_microcode_amd(family, data, size);
+ if (ret != UCODE_OK)
+ return ret;
+
+ for_each_node_with_cpus(nid) {
+ cpu = cpumask_first(cpumask_of_node(nid));
+ c = &cpu_data(cpu);
+
+ p = find_patch(cpu);
+ if (!p)
+ continue;
+
+ if (c->microcode >= p->patch_id)
+ continue;
+
+ ret = UCODE_NEW;
+ }
+
+ return ret;
+}
+
+static int __init save_microcode_in_initrd(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+ struct cont_desc desc = { 0 };
+ unsigned int cpuid_1_eax;
+ enum ucode_state ret;
+ struct cpio_data cp;
+
+ if (microcode_loader_disabled() || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10)
+ return 0;
+
+ cpuid_1_eax = native_cpuid_eax(1);
+
+ if (!find_blobs_in_containers(&cp))
+ return -EINVAL;
+
+ scan_containers(cp.data, cp.size, &desc);
+ if (!desc.mc)
+ return -EINVAL;
+
+ ret = _load_microcode_amd(x86_family(cpuid_1_eax), desc.data, desc.size);
+ if (ret > UCODE_UPDATED)
+ return -EINVAL;
+
+ return 0;
+}
+early_initcall(save_microcode_in_initrd);
+
+/*
+ * AMD microcode firmware naming convention: up to family 15h the patches
+ * are in the legacy file:
+ *
+ * amd-ucode/microcode_amd.bin
+ *
+ * This legacy file is always smaller than 2K in size.
+ *
+ * Beginning with family 15h, they are in family-specific firmware files:
+ *
+ * amd-ucode/microcode_amd_fam15h.bin
+ * amd-ucode/microcode_amd_fam16h.bin
+ * ...
+ *
+ * These might be larger than 2K.
+ */
+static enum ucode_state request_microcode_amd(int cpu, struct device *device)
+{
+ char fw_name[36] = "amd-ucode/microcode_amd.bin";
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ enum ucode_state ret = UCODE_NFOUND;
+ const struct firmware *fw;
+
+ if (force_minrev)
+ return UCODE_NFOUND;
+
+ if (c->x86 >= 0x15)
+ snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86);
+
+ if (request_firmware_direct(&fw, (const char *)fw_name, device)) {
+ ucode_dbg("failed to load file %s\n", fw_name);
+ goto out;
+ }
+
+ ret = UCODE_ERROR;
+ if (!verify_container(fw->data, fw->size))
+ goto fw_release;
+
+ ret = load_microcode_amd(c->x86, fw->data, fw->size);
+
+ fw_release:
+ release_firmware(fw);
+
+ out:
+ return ret;
+}
+
+static void microcode_fini_cpu_amd(int cpu)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+ uci->mc = NULL;
+}
+
+static void finalize_late_load_amd(int result)
+{
+ if (result)
+ cleanup();
+}
+
+static struct microcode_ops microcode_amd_ops = {
+ .request_microcode_fw = request_microcode_amd,
+ .collect_cpu_info = collect_cpu_info_amd,
+ .apply_microcode = apply_microcode_amd,
+ .microcode_fini_cpu = microcode_fini_cpu_amd,
+ .finalize_late_load = finalize_late_load_amd,
+ .nmi_safe = true,
+};
+
+struct microcode_ops * __init init_amd_microcode(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
+ pr_warn("AMD CPU family 0x%x not supported\n", c->x86);
+ return NULL;
+ }
+ return &microcode_amd_ops;
+}
+
+void __exit exit_amd_microcode(void)
+{
+ cleanup();
+}
diff --git a/arch/x86/kernel/cpu/microcode/amd_shas.c b/arch/x86/kernel/cpu/microcode/amd_shas.c
new file mode 100644
index 000000000000..1fd349cfc802
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/amd_shas.c
@@ -0,0 +1,556 @@
+/* Keep 'em sorted. */
+static const struct patch_digest phashes[] = {
+ { 0x8001227, {
+ 0x99,0xc0,0x9b,0x2b,0xcc,0x9f,0x52,0x1b,
+ 0x1a,0x5f,0x1d,0x83,0xa1,0x6c,0xc4,0x46,
+ 0xe2,0x6c,0xda,0x73,0xfb,0x2d,0x23,0xa8,
+ 0x77,0xdc,0x15,0x31,0x33,0x4a,0x46,0x18,
+ }
+ },
+ { 0x8001250, {
+ 0xc0,0x0b,0x6b,0x19,0xfd,0x5c,0x39,0x60,
+ 0xd5,0xc3,0x57,0x46,0x54,0xe4,0xd1,0xaa,
+ 0xa8,0xf7,0x1f,0xa8,0x6a,0x60,0x3e,0xe3,
+ 0x27,0x39,0x8e,0x53,0x30,0xf8,0x49,0x19,
+ }
+ },
+ { 0x800126e, {
+ 0xf3,0x8b,0x2b,0xb6,0x34,0xe3,0xc8,0x2c,
+ 0xef,0xec,0x63,0x6d,0xc8,0x76,0x77,0xb3,
+ 0x25,0x5a,0xb7,0x52,0x8c,0x83,0x26,0xe6,
+ 0x4c,0xbe,0xbf,0xe9,0x7d,0x22,0x6a,0x43,
+ }
+ },
+ { 0x800126f, {
+ 0x2b,0x5a,0xf2,0x9c,0xdd,0xd2,0x7f,0xec,
+ 0xec,0x96,0x09,0x57,0xb0,0x96,0x29,0x8b,
+ 0x2e,0x26,0x91,0xf0,0x49,0x33,0x42,0x18,
+ 0xdd,0x4b,0x65,0x5a,0xd4,0x15,0x3d,0x33,
+ }
+ },
+ { 0x800820d, {
+ 0x68,0x98,0x83,0xcd,0x22,0x0d,0xdd,0x59,
+ 0x73,0x2c,0x5b,0x37,0x1f,0x84,0x0e,0x67,
+ 0x96,0x43,0x83,0x0c,0x46,0x44,0xab,0x7c,
+ 0x7b,0x65,0x9e,0x57,0xb5,0x90,0x4b,0x0e,
+ }
+ },
+ { 0x8301025, {
+ 0xe4,0x7d,0xdb,0x1e,0x14,0xb4,0x5e,0x36,
+ 0x8f,0x3e,0x48,0x88,0x3c,0x6d,0x76,0xa1,
+ 0x59,0xc6,0xc0,0x72,0x42,0xdf,0x6c,0x30,
+ 0x6f,0x0b,0x28,0x16,0x61,0xfc,0x79,0x77,
+ }
+ },
+ { 0x8301055, {
+ 0x81,0x7b,0x99,0x1b,0xae,0x2d,0x4f,0x9a,
+ 0xef,0x13,0xce,0xb5,0x10,0xaf,0x6a,0xea,
+ 0xe5,0xb0,0x64,0x98,0x10,0x68,0x34,0x3b,
+ 0x9d,0x7a,0xd6,0x22,0x77,0x5f,0xb3,0x5b,
+ }
+ },
+ { 0x8301072, {
+ 0xcf,0x76,0xa7,0x1a,0x49,0xdf,0x2a,0x5e,
+ 0x9e,0x40,0x70,0xe5,0xdd,0x8a,0xa8,0x28,
+ 0x20,0xdc,0x91,0xd8,0x2c,0xa6,0xa0,0xb1,
+ 0x2d,0x22,0x26,0x94,0x4b,0x40,0x85,0x30,
+ }
+ },
+ { 0x830107a, {
+ 0x2a,0x65,0x8c,0x1a,0x5e,0x07,0x21,0x72,
+ 0xdf,0x90,0xa6,0x51,0x37,0xd3,0x4b,0x34,
+ 0xc4,0xda,0x03,0xe1,0x8a,0x6c,0xfb,0x20,
+ 0x04,0xb2,0x81,0x05,0xd4,0x87,0xf4,0x0a,
+ }
+ },
+ { 0x830107b, {
+ 0xb3,0x43,0x13,0x63,0x56,0xc1,0x39,0xad,
+ 0x10,0xa6,0x2b,0xcc,0x02,0xe6,0x76,0x2a,
+ 0x1e,0x39,0x58,0x3e,0x23,0x6e,0xa4,0x04,
+ 0x95,0xea,0xf9,0x6d,0xc2,0x8a,0x13,0x19,
+ }
+ },
+ { 0x830107c, {
+ 0x21,0x64,0xde,0xfb,0x9f,0x68,0x96,0x47,
+ 0x70,0x5c,0xe2,0x8f,0x18,0x52,0x6a,0xac,
+ 0xa4,0xd2,0x2e,0xe0,0xde,0x68,0x66,0xc3,
+ 0xeb,0x1e,0xd3,0x3f,0xbc,0x51,0x1d,0x38,
+ }
+ },
+ { 0x860010d, {
+ 0x86,0xb6,0x15,0x83,0xbc,0x3b,0x9c,0xe0,
+ 0xb3,0xef,0x1d,0x99,0x84,0x35,0x15,0xf7,
+ 0x7c,0x2a,0xc6,0x42,0xdb,0x73,0x07,0x5c,
+ 0x7d,0xc3,0x02,0xb5,0x43,0x06,0x5e,0xf8,
+ }
+ },
+ { 0x8608108, {
+ 0x14,0xfe,0x57,0x86,0x49,0xc8,0x68,0xe2,
+ 0x11,0xa3,0xcb,0x6e,0xff,0x6e,0xd5,0x38,
+ 0xfe,0x89,0x1a,0xe0,0x67,0xbf,0xc4,0xcc,
+ 0x1b,0x9f,0x84,0x77,0x2b,0x9f,0xaa,0xbd,
+ }
+ },
+ { 0x8701034, {
+ 0xc3,0x14,0x09,0xa8,0x9c,0x3f,0x8d,0x83,
+ 0x9b,0x4c,0xa5,0xb7,0x64,0x8b,0x91,0x5d,
+ 0x85,0x6a,0x39,0x26,0x1e,0x14,0x41,0xa8,
+ 0x75,0xea,0xa6,0xf9,0xc9,0xd1,0xea,0x2b,
+ }
+ },
+ { 0x8a00008, {
+ 0xd7,0x2a,0x93,0xdc,0x05,0x2f,0xa5,0x6e,
+ 0x0c,0x61,0x2c,0x07,0x9f,0x38,0xe9,0x8e,
+ 0xef,0x7d,0x2a,0x05,0x4d,0x56,0xaf,0x72,
+ 0xe7,0x56,0x47,0x6e,0x60,0x27,0xd5,0x8c,
+ }
+ },
+ { 0x8a0000a, {
+ 0x73,0x31,0x26,0x22,0xd4,0xf9,0xee,0x3c,
+ 0x07,0x06,0xe7,0xb9,0xad,0xd8,0x72,0x44,
+ 0x33,0x31,0xaa,0x7d,0xc3,0x67,0x0e,0xdb,
+ 0x47,0xb5,0xaa,0xbc,0xf5,0xbb,0xd9,0x20,
+ }
+ },
+ { 0xa00104c, {
+ 0x3c,0x8a,0xfe,0x04,0x62,0xd8,0x6d,0xbe,
+ 0xa7,0x14,0x28,0x64,0x75,0xc0,0xa3,0x76,
+ 0xb7,0x92,0x0b,0x97,0x0a,0x8e,0x9c,0x5b,
+ 0x1b,0xc8,0x9d,0x3a,0x1e,0x81,0x3d,0x3b,
+ }
+ },
+ { 0xa00104e, {
+ 0xc4,0x35,0x82,0x67,0xd2,0x86,0xe5,0xb2,
+ 0xfd,0x69,0x12,0x38,0xc8,0x77,0xba,0xe0,
+ 0x70,0xf9,0x77,0x89,0x10,0xa6,0x74,0x4e,
+ 0x56,0x58,0x13,0xf5,0x84,0x70,0x28,0x0b,
+ }
+ },
+ { 0xa001053, {
+ 0x92,0x0e,0xf4,0x69,0x10,0x3b,0xf9,0x9d,
+ 0x31,0x1b,0xa6,0x99,0x08,0x7d,0xd7,0x25,
+ 0x7e,0x1e,0x89,0xba,0x35,0x8d,0xac,0xcb,
+ 0x3a,0xb4,0xdf,0x58,0x12,0xcf,0xc0,0xc3,
+ }
+ },
+ { 0xa001058, {
+ 0x33,0x7d,0xa9,0xb5,0x4e,0x62,0x13,0x36,
+ 0xef,0x66,0xc9,0xbd,0x0a,0xa6,0x3b,0x19,
+ 0xcb,0xf5,0xc2,0xc3,0x55,0x47,0x20,0xec,
+ 0x1f,0x7b,0xa1,0x44,0x0e,0x8e,0xa4,0xb2,
+ }
+ },
+ { 0xa001075, {
+ 0x39,0x02,0x82,0xd0,0x7c,0x26,0x43,0xe9,
+ 0x26,0xa3,0xd9,0x96,0xf7,0x30,0x13,0x0a,
+ 0x8a,0x0e,0xac,0xe7,0x1d,0xdc,0xe2,0x0f,
+ 0xcb,0x9e,0x8d,0xbc,0xd2,0xa2,0x44,0xe0,
+ }
+ },
+ { 0xa001078, {
+ 0x2d,0x67,0xc7,0x35,0xca,0xef,0x2f,0x25,
+ 0x4c,0x45,0x93,0x3f,0x36,0x01,0x8c,0xce,
+ 0xa8,0x5b,0x07,0xd3,0xc1,0x35,0x3c,0x04,
+ 0x20,0xa2,0xfc,0xdc,0xe6,0xce,0x26,0x3e,
+ }
+ },
+ { 0xa001079, {
+ 0x43,0xe2,0x05,0x9c,0xfd,0xb7,0x5b,0xeb,
+ 0x5b,0xe9,0xeb,0x3b,0x96,0xf4,0xe4,0x93,
+ 0x73,0x45,0x3e,0xac,0x8d,0x3b,0xe4,0xdb,
+ 0x10,0x31,0xc1,0xe4,0xa2,0xd0,0x5a,0x8a,
+ }
+ },
+ { 0xa00107a, {
+ 0x5f,0x92,0xca,0xff,0xc3,0x59,0x22,0x5f,
+ 0x02,0xa0,0x91,0x3b,0x4a,0x45,0x10,0xfd,
+ 0x19,0xe1,0x8a,0x6d,0x9a,0x92,0xc1,0x3f,
+ 0x75,0x78,0xac,0x78,0x03,0x1d,0xdb,0x18,
+ }
+ },
+ { 0xa001143, {
+ 0x56,0xca,0xf7,0x43,0x8a,0x4c,0x46,0x80,
+ 0xec,0xde,0xe5,0x9c,0x50,0x84,0x9a,0x42,
+ 0x27,0xe5,0x51,0x84,0x8f,0x19,0xc0,0x8d,
+ 0x0c,0x25,0xb4,0xb0,0x8f,0x10,0xf3,0xf8,
+ }
+ },
+ { 0xa001144, {
+ 0x42,0xd5,0x9b,0xa7,0xd6,0x15,0x29,0x41,
+ 0x61,0xc4,0x72,0x3f,0xf3,0x06,0x78,0x4b,
+ 0x65,0xf3,0x0e,0xfa,0x9c,0x87,0xde,0x25,
+ 0xbd,0xb3,0x9a,0xf4,0x75,0x13,0x53,0xdc,
+ }
+ },
+ { 0xa00115d, {
+ 0xd4,0xc4,0x49,0x36,0x89,0x0b,0x47,0xdd,
+ 0xfb,0x2f,0x88,0x3b,0x5f,0xf2,0x8e,0x75,
+ 0xc6,0x6c,0x37,0x5a,0x90,0x25,0x94,0x3e,
+ 0x36,0x9c,0xae,0x02,0x38,0x6c,0xf5,0x05,
+ }
+ },
+ { 0xa001173, {
+ 0x28,0xbb,0x9b,0xd1,0xa0,0xa0,0x7e,0x3a,
+ 0x59,0x20,0xc0,0xa9,0xb2,0x5c,0xc3,0x35,
+ 0x53,0x89,0xe1,0x4c,0x93,0x2f,0x1d,0xc3,
+ 0xe5,0xf7,0xf3,0xc8,0x9b,0x61,0xaa,0x9e,
+ }
+ },
+ { 0xa0011a8, {
+ 0x97,0xc6,0x16,0x65,0x99,0xa4,0x85,0x3b,
+ 0xf6,0xce,0xaa,0x49,0x4a,0x3a,0xc5,0xb6,
+ 0x78,0x25,0xbc,0x53,0xaf,0x5d,0xcf,0xf4,
+ 0x23,0x12,0xbb,0xb1,0xbc,0x8a,0x02,0x2e,
+ }
+ },
+ { 0xa0011ce, {
+ 0xcf,0x1c,0x90,0xa3,0x85,0x0a,0xbf,0x71,
+ 0x94,0x0e,0x80,0x86,0x85,0x4f,0xd7,0x86,
+ 0xae,0x38,0x23,0x28,0x2b,0x35,0x9b,0x4e,
+ 0xfe,0xb8,0xcd,0x3d,0x3d,0x39,0xc9,0x6a,
+ }
+ },
+ { 0xa0011d1, {
+ 0xdf,0x0e,0xca,0xde,0xf6,0xce,0x5c,0x1e,
+ 0x4c,0xec,0xd7,0x71,0x83,0xcc,0xa8,0x09,
+ 0xc7,0xc5,0xfe,0xb2,0xf7,0x05,0xd2,0xc5,
+ 0x12,0xdd,0xe4,0xf3,0x92,0x1c,0x3d,0xb8,
+ }
+ },
+ { 0xa0011d3, {
+ 0x91,0xe6,0x10,0xd7,0x57,0xb0,0x95,0x0b,
+ 0x9a,0x24,0xee,0xf7,0xcf,0x56,0xc1,0xa6,
+ 0x4a,0x52,0x7d,0x5f,0x9f,0xdf,0xf6,0x00,
+ 0x65,0xf7,0xea,0xe8,0x2a,0x88,0xe2,0x26,
+ }
+ },
+ { 0xa0011d5, {
+ 0xed,0x69,0x89,0xf4,0xeb,0x64,0xc2,0x13,
+ 0xe0,0x51,0x1f,0x03,0x26,0x52,0x7d,0xb7,
+ 0x93,0x5d,0x65,0xca,0xb8,0x12,0x1d,0x62,
+ 0x0d,0x5b,0x65,0x34,0x69,0xb2,0x62,0x21,
+ }
+ },
+ { 0xa0011d7, {
+ 0x35,0x07,0xcd,0x40,0x94,0xbc,0x81,0x6b,
+ 0xfc,0x61,0x56,0x1a,0xe2,0xdb,0x96,0x12,
+ 0x1c,0x1c,0x31,0xb1,0x02,0x6f,0xe5,0xd2,
+ 0xfe,0x1b,0x04,0x03,0x2c,0x8f,0x4c,0x36,
+ }
+ },
+ { 0xa001223, {
+ 0xfb,0x32,0x5f,0xc6,0x83,0x4f,0x8c,0xb8,
+ 0xa4,0x05,0xf9,0x71,0x53,0x01,0x16,0xc4,
+ 0x83,0x75,0x94,0xdd,0xeb,0x7e,0xb7,0x15,
+ 0x8e,0x3b,0x50,0x29,0x8a,0x9c,0xcc,0x45,
+ }
+ },
+ { 0xa001224, {
+ 0x0e,0x0c,0xdf,0xb4,0x89,0xee,0x35,0x25,
+ 0xdd,0x9e,0xdb,0xc0,0x69,0x83,0x0a,0xad,
+ 0x26,0xa9,0xaa,0x9d,0xfc,0x3c,0xea,0xf9,
+ 0x6c,0xdc,0xd5,0x6d,0x8b,0x6e,0x85,0x4a,
+ }
+ },
+ { 0xa001227, {
+ 0xab,0xc6,0x00,0x69,0x4b,0x50,0x87,0xad,
+ 0x5f,0x0e,0x8b,0xea,0x57,0x38,0xce,0x1d,
+ 0x0f,0x75,0x26,0x02,0xf6,0xd6,0x96,0xe9,
+ 0x87,0xb9,0xd6,0x20,0x27,0x7c,0xd2,0xe0,
+ }
+ },
+ { 0xa001229, {
+ 0x7f,0x49,0x49,0x48,0x46,0xa5,0x50,0xa6,
+ 0x28,0x89,0x98,0xe2,0x9e,0xb4,0x7f,0x75,
+ 0x33,0xa7,0x04,0x02,0xe4,0x82,0xbf,0xb4,
+ 0xa5,0x3a,0xba,0x24,0x8d,0x31,0x10,0x1d,
+ }
+ },
+ { 0xa00122e, {
+ 0x56,0x94,0xa9,0x5d,0x06,0x68,0xfe,0xaf,
+ 0xdf,0x7a,0xff,0x2d,0xdf,0x74,0x0f,0x15,
+ 0x66,0xfb,0x00,0xb5,0x51,0x97,0x9b,0xfa,
+ 0xcb,0x79,0x85,0x46,0x25,0xb4,0xd2,0x10,
+ }
+ },
+ { 0xa001231, {
+ 0x0b,0x46,0xa5,0xfc,0x18,0x15,0xa0,0x9e,
+ 0xa6,0xdc,0xb7,0xff,0x17,0xf7,0x30,0x64,
+ 0xd4,0xda,0x9e,0x1b,0xc3,0xfc,0x02,0x3b,
+ 0xe2,0xc6,0x0e,0x41,0x54,0xb5,0x18,0xdd,
+ }
+ },
+ { 0xa001234, {
+ 0x88,0x8d,0xed,0xab,0xb5,0xbd,0x4e,0xf7,
+ 0x7f,0xd4,0x0e,0x95,0x34,0x91,0xff,0xcc,
+ 0xfb,0x2a,0xcd,0xf7,0xd5,0xdb,0x4c,0x9b,
+ 0xd6,0x2e,0x73,0x50,0x8f,0x83,0x79,0x1a,
+ }
+ },
+ { 0xa001236, {
+ 0x3d,0x30,0x00,0xb9,0x71,0xba,0x87,0x78,
+ 0xa8,0x43,0x55,0xc4,0x26,0x59,0xcf,0x9d,
+ 0x93,0xce,0x64,0x0e,0x8b,0x72,0x11,0x8b,
+ 0xa3,0x8f,0x51,0xe9,0xca,0x98,0xaa,0x25,
+ }
+ },
+ { 0xa001238, {
+ 0x72,0xf7,0x4b,0x0c,0x7d,0x58,0x65,0xcc,
+ 0x00,0xcc,0x57,0x16,0x68,0x16,0xf8,0x2a,
+ 0x1b,0xb3,0x8b,0xe1,0xb6,0x83,0x8c,0x7e,
+ 0xc0,0xcd,0x33,0xf2,0x8d,0xf9,0xef,0x59,
+ }
+ },
+ { 0xa00123b, {
+ 0xef,0xa1,0x1e,0x71,0xf1,0xc3,0x2c,0xe2,
+ 0xc3,0xef,0x69,0x41,0x7a,0x54,0xca,0xc3,
+ 0x8f,0x62,0x84,0xee,0xc2,0x39,0xd9,0x28,
+ 0x95,0xa7,0x12,0x49,0x1e,0x30,0x71,0x72,
+ }
+ },
+ { 0xa00820c, {
+ 0xa8,0x0c,0x81,0xc0,0xa6,0x00,0xe7,0xf3,
+ 0x5f,0x65,0xd3,0xb9,0x6f,0xea,0x93,0x63,
+ 0xf1,0x8c,0x88,0x45,0xd7,0x82,0x80,0xd1,
+ 0xe1,0x3b,0x8d,0xb2,0xf8,0x22,0x03,0xe2,
+ }
+ },
+ { 0xa00820d, {
+ 0xf9,0x2a,0xc0,0xf4,0x9e,0xa4,0x87,0xa4,
+ 0x7d,0x87,0x00,0xfd,0xab,0xda,0x19,0xca,
+ 0x26,0x51,0x32,0xc1,0x57,0x91,0xdf,0xc1,
+ 0x05,0xeb,0x01,0x7c,0x5a,0x95,0x21,0xb7,
+ }
+ },
+ { 0xa10113e, {
+ 0x05,0x3c,0x66,0xd7,0xa9,0x5a,0x33,0x10,
+ 0x1b,0xf8,0x9c,0x8f,0xed,0xfc,0xa7,0xa0,
+ 0x15,0xe3,0x3f,0x4b,0x1d,0x0d,0x0a,0xd5,
+ 0xfa,0x90,0xc4,0xed,0x9d,0x90,0xaf,0x53,
+ }
+ },
+ { 0xa101144, {
+ 0xb3,0x0b,0x26,0x9a,0xf8,0x7c,0x02,0x26,
+ 0x35,0x84,0x53,0xa4,0xd3,0x2c,0x7c,0x09,
+ 0x68,0x7b,0x96,0xb6,0x93,0xef,0xde,0xbc,
+ 0xfd,0x4b,0x15,0xd2,0x81,0xd3,0x51,0x47,
+ }
+ },
+ { 0xa101148, {
+ 0x20,0xd5,0x6f,0x40,0x4a,0xf6,0x48,0x90,
+ 0xc2,0x93,0x9a,0xc2,0xfd,0xac,0xef,0x4f,
+ 0xfa,0xc0,0x3d,0x92,0x3c,0x6d,0x01,0x08,
+ 0xf1,0x5e,0xb0,0xde,0xb4,0x98,0xae,0xc4,
+ }
+ },
+ { 0xa10114c, {
+ 0x9e,0xb6,0xa2,0xd9,0x87,0x38,0xc5,0x64,
+ 0xd8,0x88,0xfa,0x78,0x98,0xf9,0x6f,0x74,
+ 0x39,0x90,0x1b,0xa5,0xcf,0x5e,0xb4,0x2a,
+ 0x02,0xff,0xd4,0x8c,0x71,0x8b,0xe2,0xc0,
+ }
+ },
+ { 0xa10123e, {
+ 0x03,0xb9,0x2c,0x76,0x48,0x93,0xc9,0x18,
+ 0xfb,0x56,0xfd,0xf7,0xe2,0x1d,0xca,0x4d,
+ 0x1d,0x13,0x53,0x63,0xfe,0x42,0x6f,0xfc,
+ 0x19,0x0f,0xf1,0xfc,0xa7,0xdd,0x89,0x1b,
+ }
+ },
+ { 0xa101244, {
+ 0x71,0x56,0xb5,0x9f,0x21,0xbf,0xb3,0x3c,
+ 0x8c,0xd7,0x36,0xd0,0x34,0x52,0x1b,0xb1,
+ 0x46,0x2f,0x04,0xf0,0x37,0xd8,0x1e,0x72,
+ 0x24,0xa2,0x80,0x84,0x83,0x65,0x84,0xc0,
+ }
+ },
+ { 0xa101248, {
+ 0xed,0x3b,0x95,0xa6,0x68,0xa7,0x77,0x3e,
+ 0xfc,0x17,0x26,0xe2,0x7b,0xd5,0x56,0x22,
+ 0x2c,0x1d,0xef,0xeb,0x56,0xdd,0xba,0x6e,
+ 0x1b,0x7d,0x64,0x9d,0x4b,0x53,0x13,0x75,
+ }
+ },
+ { 0xa10124c, {
+ 0x29,0xea,0xf1,0x2c,0xb2,0xe4,0xef,0x90,
+ 0xa4,0xcd,0x1d,0x86,0x97,0x17,0x61,0x46,
+ 0xfc,0x22,0xcb,0x57,0x75,0x19,0xc8,0xcc,
+ 0x0c,0xf5,0xbc,0xac,0x81,0x9d,0x9a,0xd2,
+ }
+ },
+ { 0xa108108, {
+ 0xed,0xc2,0xec,0xa1,0x15,0xc6,0x65,0xe9,
+ 0xd0,0xef,0x39,0xaa,0x7f,0x55,0x06,0xc6,
+ 0xf5,0xd4,0x3f,0x7b,0x14,0xd5,0x60,0x2c,
+ 0x28,0x1e,0x9c,0x59,0x69,0x99,0x4d,0x16,
+ }
+ },
+ { 0xa108109, {
+ 0x85,0xb4,0xbd,0x7c,0x49,0xa7,0xbd,0xfa,
+ 0x49,0x36,0x80,0x81,0xc5,0xb7,0x39,0x1b,
+ 0x9a,0xaa,0x50,0xde,0x9b,0xe9,0x32,0x35,
+ 0x42,0x7e,0x51,0x4f,0x52,0x2c,0x28,0x59,
+ }
+ },
+ { 0xa20102d, {
+ 0xf9,0x6e,0xf2,0x32,0xd3,0x0f,0x5f,0x11,
+ 0x59,0xa1,0xfe,0xcc,0xcd,0x9b,0x42,0x89,
+ 0x8b,0x89,0x2f,0xb5,0xbb,0x82,0xef,0x23,
+ 0x8c,0xe9,0x19,0x3e,0xcc,0x3f,0x7b,0xb4,
+ }
+ },
+ { 0xa20102e, {
+ 0xbe,0x1f,0x32,0x04,0x0d,0x3c,0x9c,0xdd,
+ 0xe1,0xa4,0xbf,0x76,0x3a,0xec,0xc2,0xf6,
+ 0x11,0x00,0xa7,0xaf,0x0f,0xe5,0x02,0xc5,
+ 0x54,0x3a,0x1f,0x8c,0x16,0xb5,0xff,0xbe,
+ }
+ },
+ { 0xa201210, {
+ 0xe8,0x6d,0x51,0x6a,0x8e,0x72,0xf3,0xfe,
+ 0x6e,0x16,0xbc,0x62,0x59,0x40,0x17,0xe9,
+ 0x6d,0x3d,0x0e,0x6b,0xa7,0xac,0xe3,0x68,
+ 0xf7,0x55,0xf0,0x13,0xbb,0x22,0xf6,0x41,
+ }
+ },
+ { 0xa201211, {
+ 0x69,0xa1,0x17,0xec,0xd0,0xf6,0x6c,0x95,
+ 0xe2,0x1e,0xc5,0x59,0x1a,0x52,0x0a,0x27,
+ 0xc4,0xed,0xd5,0x59,0x1f,0xbf,0x00,0xff,
+ 0x08,0x88,0xb5,0xe1,0x12,0xb6,0xcc,0x27,
+ }
+ },
+ { 0xa404107, {
+ 0xbb,0x04,0x4e,0x47,0xdd,0x5e,0x26,0x45,
+ 0x1a,0xc9,0x56,0x24,0xa4,0x4c,0x82,0xb0,
+ 0x8b,0x0d,0x9f,0xf9,0x3a,0xdf,0xc6,0x81,
+ 0x13,0xbc,0xc5,0x25,0xe4,0xc5,0xc3,0x99,
+ }
+ },
+ { 0xa404108, {
+ 0x69,0x67,0x43,0x06,0xf8,0x0c,0x62,0xdc,
+ 0xa4,0x21,0x30,0x4f,0x0f,0x21,0x2c,0xcb,
+ 0xcc,0x37,0xf1,0x1c,0xc3,0xf8,0x2f,0x19,
+ 0xdf,0x53,0x53,0x46,0xb1,0x15,0xea,0x00,
+ }
+ },
+ { 0xa500011, {
+ 0x23,0x3d,0x70,0x7d,0x03,0xc3,0xc4,0xf4,
+ 0x2b,0x82,0xc6,0x05,0xda,0x80,0x0a,0xf1,
+ 0xd7,0x5b,0x65,0x3a,0x7d,0xab,0xdf,0xa2,
+ 0x11,0x5e,0x96,0x7e,0x71,0xe9,0xfc,0x74,
+ }
+ },
+ { 0xa500012, {
+ 0xeb,0x74,0x0d,0x47,0xa1,0x8e,0x09,0xe4,
+ 0x93,0x4c,0xad,0x03,0x32,0x4c,0x38,0x16,
+ 0x10,0x39,0xdd,0x06,0xaa,0xce,0xd6,0x0f,
+ 0x62,0x83,0x9d,0x8e,0x64,0x55,0xbe,0x63,
+ }
+ },
+ { 0xa601209, {
+ 0x66,0x48,0xd4,0x09,0x05,0xcb,0x29,0x32,
+ 0x66,0xb7,0x9a,0x76,0xcd,0x11,0xf3,0x30,
+ 0x15,0x86,0xcc,0x5d,0x97,0x0f,0xc0,0x46,
+ 0xe8,0x73,0xe2,0xd6,0xdb,0xd2,0x77,0x1d,
+ }
+ },
+ { 0xa60120a, {
+ 0x0c,0x8b,0x3d,0xfd,0x52,0x52,0x85,0x7d,
+ 0x20,0x3a,0xe1,0x7e,0xa4,0x21,0x3b,0x7b,
+ 0x17,0x86,0xae,0xac,0x13,0xb8,0x63,0x9d,
+ 0x06,0x01,0xd0,0xa0,0x51,0x9a,0x91,0x2c,
+ }
+ },
+ { 0xa704107, {
+ 0xf3,0xc6,0x58,0x26,0xee,0xac,0x3f,0xd6,
+ 0xce,0xa1,0x72,0x47,0x3b,0xba,0x2b,0x93,
+ 0x2a,0xad,0x8e,0x6b,0xea,0x9b,0xb7,0xc2,
+ 0x64,0x39,0x71,0x8c,0xce,0xe7,0x41,0x39,
+ }
+ },
+ { 0xa704108, {
+ 0xd7,0x55,0x15,0x2b,0xfe,0xc4,0xbc,0x93,
+ 0xec,0x91,0xa0,0xae,0x45,0xb7,0xc3,0x98,
+ 0x4e,0xff,0x61,0x77,0x88,0xc2,0x70,0x49,
+ 0xe0,0x3a,0x1d,0x84,0x38,0x52,0xbf,0x5a,
+ }
+ },
+ { 0xa705206, {
+ 0x8d,0xc0,0x76,0xbd,0x58,0x9f,0x8f,0xa4,
+ 0x12,0x9d,0x21,0xfb,0x48,0x21,0xbc,0xe7,
+ 0x67,0x6f,0x04,0x18,0xae,0x20,0x87,0x4b,
+ 0x03,0x35,0xe9,0xbe,0xfb,0x06,0xdf,0xfc,
+ }
+ },
+ { 0xa705208, {
+ 0x30,0x1d,0x55,0x24,0xbc,0x6b,0x5a,0x19,
+ 0x0c,0x7d,0x1d,0x74,0xaa,0xd1,0xeb,0xd2,
+ 0x16,0x62,0xf7,0x5b,0xe1,0x1f,0x18,0x11,
+ 0x5c,0xf0,0x94,0x90,0x26,0xec,0x69,0xff,
+ }
+ },
+ { 0xa708007, {
+ 0x6b,0x76,0xcc,0x78,0xc5,0x8a,0xa3,0xe3,
+ 0x32,0x2d,0x79,0xe4,0xc3,0x80,0xdb,0xb2,
+ 0x07,0xaa,0x3a,0xe0,0x57,0x13,0x72,0x80,
+ 0xdf,0x92,0x73,0x84,0x87,0x3c,0x73,0x93,
+ }
+ },
+ { 0xa708008, {
+ 0x08,0x6e,0xf0,0x22,0x4b,0x8e,0xc4,0x46,
+ 0x58,0x34,0xe6,0x47,0xa2,0x28,0xfd,0xab,
+ 0x22,0x3d,0xdd,0xd8,0x52,0x9e,0x1d,0x16,
+ 0xfa,0x01,0x68,0x14,0x79,0x3e,0xe8,0x6b,
+ }
+ },
+ { 0xa70c005, {
+ 0x88,0x5d,0xfb,0x79,0x64,0xd8,0x46,0x3b,
+ 0x4a,0x83,0x8e,0x77,0x7e,0xcf,0xb3,0x0f,
+ 0x1f,0x1f,0xf1,0x97,0xeb,0xfe,0x56,0x55,
+ 0xee,0x49,0xac,0xe1,0x8b,0x13,0xc5,0x13,
+ }
+ },
+ { 0xa70c008, {
+ 0x0f,0xdb,0x37,0xa1,0x10,0xaf,0xd4,0x21,
+ 0x94,0x0d,0xa4,0xa2,0xe9,0x86,0x6c,0x0e,
+ 0x85,0x7c,0x36,0x30,0xa3,0x3a,0x78,0x66,
+ 0x18,0x10,0x60,0x0d,0x78,0x3d,0x44,0xd0,
+ }
+ },
+ { 0xaa00116, {
+ 0xe8,0x4c,0x2c,0x88,0xa1,0xac,0x24,0x63,
+ 0x65,0xe5,0xaa,0x2d,0x16,0xa9,0xc3,0xf5,
+ 0xfe,0x1d,0x5e,0x65,0xc7,0xaa,0x92,0x4d,
+ 0x91,0xee,0x76,0xbb,0x4c,0x66,0x78,0xc9,
+ }
+ },
+ { 0xaa00212, {
+ 0xbd,0x57,0x5d,0x0a,0x0a,0x30,0xc1,0x75,
+ 0x95,0x58,0x5e,0x93,0x02,0x28,0x43,0x71,
+ 0xed,0x42,0x29,0xc8,0xec,0x34,0x2b,0xb2,
+ 0x1a,0x65,0x4b,0xfe,0x07,0x0f,0x34,0xa1,
+ }
+ },
+ { 0xaa00213, {
+ 0xed,0x58,0xb7,0x76,0x81,0x7f,0xd9,0x3a,
+ 0x1a,0xff,0x8b,0x34,0xb8,0x4a,0x99,0x0f,
+ 0x28,0x49,0x6c,0x56,0x2b,0xdc,0xb7,0xed,
+ 0x96,0xd5,0x9d,0xc1,0x7a,0xd4,0x51,0x9b,
+ }
+ },
+ { 0xaa00215, {
+ 0x55,0xd3,0x28,0xcb,0x87,0xa9,0x32,0xe9,
+ 0x4e,0x85,0x4b,0x7c,0x6b,0xd5,0x7c,0xd4,
+ 0x1b,0x51,0x71,0x3a,0x0e,0x0b,0xdc,0x9b,
+ 0x68,0x2f,0x46,0xee,0xfe,0xc6,0x6d,0xef,
+ }
+ },
+ { 0xaa00216, {
+ 0x79,0xfb,0x5b,0x9f,0xb6,0xe6,0xa8,0xf5,
+ 0x4e,0x7c,0x4f,0x8e,0x1d,0xad,0xd0,0x08,
+ 0xc2,0x43,0x7c,0x8b,0xe6,0xdb,0xd0,0xd2,
+ 0xe8,0x39,0x26,0xc1,0xe5,0x5a,0x48,0xf1,
+ }
+ },
+};
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
new file mode 100644
index 000000000000..68049f171860
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -0,0 +1,926 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * CPU Microcode Update Driver for Linux
+ *
+ * Copyright (C) 2000-2006 Tigran Aivazian <aivazian.tigran@gmail.com>
+ * 2006 Shaohua Li <shaohua.li@intel.com>
+ * 2013-2016 Borislav Petkov <bp@alien8.de>
+ *
+ * X86 CPU microcode early update for Linux:
+ *
+ * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
+ * H Peter Anvin <hpa@zytor.com>
+ * (C) 2015 Borislav Petkov <bp@alien8.de>
+ *
+ * This driver allows to upgrade microcode on x86 processors.
+ */
+
+#define pr_fmt(fmt) "microcode: " fmt
+
+#include <linux/stop_machine.h>
+#include <linux/device/faux.h>
+#include <linux/syscore_ops.h>
+#include <linux/miscdevice.h>
+#include <linux/capability.h>
+#include <linux/firmware.h>
+#include <linux/cpumask.h>
+#include <linux/kernel.h>
+#include <linux/delay.h>
+#include <linux/mutex.h>
+#include <linux/cpu.h>
+#include <linux/nmi.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+
+#include <asm/apic.h>
+#include <asm/cpu_device_id.h>
+#include <asm/perf_event.h>
+#include <asm/processor.h>
+#include <asm/cmdline.h>
+#include <asm/msr.h>
+#include <asm/setup.h>
+
+#include "internal.h"
+
+static struct microcode_ops *microcode_ops;
+static bool dis_ucode_ldr;
+
+bool force_minrev = IS_ENABLED(CONFIG_MICROCODE_LATE_FORCE_MINREV);
+
+/*
+ * The variables below should be behind CONFIG_MICROCODE_DBG ifdeffery,
+ * but in order not to uglify the code with ifdefs, they are defined
+ * unconditionally and IS_ENABLED() is used instead. When microcode
+ * debugging is not enabled, they are meaningless anyway.
+ */
+/* base microcode revision for debugging */
+u32 base_rev;
+u32 microcode_rev[NR_CPUS] = {};
+
+/*
+ * Synchronization.
+ *
+ * All non cpu-hotplug-callback call sites use:
+ *
+ * - cpus_read_lock/unlock() to synchronize with
+ * the cpu-hotplug-callback call sites.
+ *
+ * We guarantee that only a single CPU is being
+ * updated at any particular moment in time.
+ */
+struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
+
+/*
+ * Those patch levels cannot be updated to newer ones and thus should be final.
+ */
+static u32 final_levels[] = {
+ 0x01000098,
+ 0x0100009f,
+ 0x010000af,
+ 0, /* T-101 terminator */
+};
+
+struct early_load_data early_data;
+
+/*
+ * Check the current patch level on this CPU.
+ *
+ * Returns:
+ * - true: if update should stop
+ * - false: otherwise
+ */
+static bool amd_check_current_patch_level(void)
+{
+ u32 lvl, dummy, i;
+ u32 *levels;
+
+ if (x86_cpuid_vendor() != X86_VENDOR_AMD)
+ return false;
+
+ native_rdmsr(MSR_AMD64_PATCH_LEVEL, lvl, dummy);
+
+ levels = final_levels;
+
+ for (i = 0; levels[i]; i++) {
+ if (lvl == levels[i])
+ return true;
+ }
+ return false;
+}
+
+bool __init microcode_loader_disabled(void)
+{
+ if (dis_ucode_ldr)
+ return true;
+
+ /*
+ * Disable when:
+ *
+ * 1) The CPU does not support CPUID.
+ *
+ * 2) Bit 31 in CPUID[1]:ECX is set
+ * The bit is reserved for hypervisor use. This is still not
+ * completely accurate as XEN PV guests don't see that CPUID bit
+ * set, but that's good enough as they don't land on the BSP
+ * path anyway.
+ *
+ * 3) Certain AMD patch levels are not allowed to be
+ * overwritten.
+ */
+ if (!cpuid_feature() ||
+ ((native_cpuid_ecx(1) & BIT(31)) &&
+ !IS_ENABLED(CONFIG_MICROCODE_DBG)) ||
+ amd_check_current_patch_level())
+ dis_ucode_ldr = true;
+
+ return dis_ucode_ldr;
+}
+
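+/*
+ * Parse the "microcode=" boot option: a comma-separated list such as
+ * (hypothetical values) "microcode=force_minrev,base_rev=0x0a201210".
+ * base_rev= takes a hex revision and is only acted upon when
+ * CONFIG_MICROCODE_DBG is enabled. The old standalone "dis_ucode_ldr"
+ * option is still recognized for compatibility.
+ */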
+static void __init early_parse_cmdline(void)
+{
+ char cmd_buf[64] = {};
+ char *s, *p = cmd_buf;
+
+ if (cmdline_find_option(boot_command_line, "microcode", cmd_buf, sizeof(cmd_buf)) > 0) {
+ while ((s = strsep(&p, ","))) {
+ if (IS_ENABLED(CONFIG_MICROCODE_DBG)) {
+ if (strstr(s, "base_rev=")) {
+ /* advance to the option arg */
+ strsep(&s, "=");
+ if (kstrtouint(s, 16, &base_rev)) { ; }
+ }
+ }
+
+ if (!strcmp("force_minrev", s))
+ force_minrev = true;
+
+ if (!strcmp(s, "dis_ucode_ldr"))
+ dis_ucode_ldr = true;
+ }
+ }
+
+ /* old, compat option */
+ if (cmdline_find_option_bool(boot_command_line, "dis_ucode_ldr") > 0)
+ dis_ucode_ldr = true;
+}
+
+void __init load_ucode_bsp(void)
+{
+ unsigned int cpuid_1_eax;
+ bool intel = true;
+
+ early_parse_cmdline();
+
+ if (microcode_loader_disabled())
+ return;
+
+ cpuid_1_eax = native_cpuid_eax(1);
+
+ switch (x86_cpuid_vendor()) {
+ case X86_VENDOR_INTEL:
+ if (x86_family(cpuid_1_eax) < 6)
+ return;
+ break;
+
+ case X86_VENDOR_AMD:
+ if (x86_family(cpuid_1_eax) < 0x10)
+ return;
+ intel = false;
+ break;
+
+ default:
+ return;
+ }
+
+ if (intel)
+ load_ucode_intel_bsp(&early_data);
+ else
+ load_ucode_amd_bsp(&early_data, cpuid_1_eax);
+}
+
+void load_ucode_ap(void)
+{
+ unsigned int cpuid_1_eax;
+
+ /*
+ * Can't use microcode_loader_disabled() here - .init section
+ * hell. It doesn't have to either - the BSP variant must've
+ * parsed cmdline already anyway.
+ */
+ if (dis_ucode_ldr)
+ return;
+
+ cpuid_1_eax = native_cpuid_eax(1);
+
+ switch (x86_cpuid_vendor()) {
+ case X86_VENDOR_INTEL:
+ if (x86_family(cpuid_1_eax) >= 6)
+ load_ucode_intel_ap();
+ break;
+ case X86_VENDOR_AMD:
+ if (x86_family(cpuid_1_eax) >= 0x10)
+ load_ucode_amd_ap(cpuid_1_eax);
+ break;
+ default:
+ break;
+ }
+}
+
+struct cpio_data __init find_microcode_in_initrd(const char *path)
+{
+#ifdef CONFIG_BLK_DEV_INITRD
+ unsigned long start = 0;
+ size_t size;
+
+#ifdef CONFIG_X86_32
+ size = boot_params.hdr.ramdisk_size;
+ /* Early load on BSP has a temporary mapping. */
+ if (size)
+ start = initrd_start_early;
+
+#else /* CONFIG_X86_64 */
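+	/* ext_ramdisk_{image,size} hold bits 63:32 of the ramdisk address and size */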
+ size = (unsigned long)boot_params.ext_ramdisk_size << 32;
+ size |= boot_params.hdr.ramdisk_size;
+
+ if (size) {
+ start = (unsigned long)boot_params.ext_ramdisk_image << 32;
+ start |= boot_params.hdr.ramdisk_image;
+ start += PAGE_OFFSET;
+ }
+#endif
+
+ /*
+ * Fixup the start address: after reserve_initrd() runs, initrd_start
+ * has the virtual address of the beginning of the initrd. It also
+ * possibly relocates the ramdisk. In either case, initrd_start contains
+ * the updated address so use that instead.
+ */
+ if (initrd_start)
+ start = initrd_start;
+
+ return find_cpio_data(path, (void *)start, size, NULL);
+#else /* !CONFIG_BLK_DEV_INITRD */
+ return (struct cpio_data){ NULL, 0, "" };
+#endif
+}
+
+static void reload_early_microcode(unsigned int cpu)
+{
+ int vendor, family;
+
+ vendor = x86_cpuid_vendor();
+ family = x86_cpuid_family();
+
+ switch (vendor) {
+ case X86_VENDOR_INTEL:
+ if (family >= 6)
+ reload_ucode_intel();
+ break;
+ case X86_VENDOR_AMD:
+ if (family >= 0x10)
+ reload_ucode_amd(cpu);
+ break;
+ default:
+ break;
+ }
+}
+
+/* fake device for request_firmware */
+static struct faux_device *microcode_fdev;
+
+#ifdef CONFIG_MICROCODE_LATE_LOADING
+/*
+ * Late loading dance. Why the heavy-handed stop_machine() effort?
+ *
+ * - HT siblings must be idle and not execute other code while the other sibling
+ * is loading microcode in order to avoid any negative interactions caused by
+ * the loading.
+ *
+ * - In addition, microcode update on the cores must be serialized until this
+ * requirement can be relaxed in the future. Right now, this is conservative
+ * and good.
+ */
+enum sibling_ctrl {
+ /* Spinwait with timeout */
+ SCTRL_WAIT,
+ /* Invoke the microcode_apply() callback */
+ SCTRL_APPLY,
+ /* Proceed without invoking the microcode_apply() callback */
+ SCTRL_DONE,
+};
+
+struct microcode_ctrl {
+ enum sibling_ctrl ctrl;
+ enum ucode_state result;
+ unsigned int ctrl_cpu;
+ bool nmi_enabled;
+};
+
+DEFINE_STATIC_KEY_FALSE(microcode_nmi_handler_enable);
+static DEFINE_PER_CPU(struct microcode_ctrl, ucode_ctrl);
+static atomic_t late_cpus_in, offline_in_nmi;
+static unsigned int loops_per_usec;
+static cpumask_t cpu_offline_mask;
+
+static noinstr bool wait_for_cpus(atomic_t *cnt)
+{
+ unsigned int timeout, loops;
+
+ WARN_ON_ONCE(raw_atomic_dec_return(cnt) < 0);
+
+ for (timeout = 0; timeout < USEC_PER_SEC; timeout++) {
+ if (!raw_atomic_read(cnt))
+ return true;
+
+ for (loops = 0; loops < loops_per_usec; loops++)
+ cpu_relax();
+
+ /* If invoked directly, tickle the NMI watchdog */
+ if (!microcode_ops->use_nmi && !(timeout % USEC_PER_MSEC)) {
+ instrumentation_begin();
+ touch_nmi_watchdog();
+ instrumentation_end();
+ }
+ }
+ /* Prevent the late comers from making progress and let them time out */
+ raw_atomic_inc(cnt);
+ return false;
+}
+
+static noinstr bool wait_for_ctrl(void)
+{
+ unsigned int timeout, loops;
+
+ for (timeout = 0; timeout < USEC_PER_SEC; timeout++) {
+ if (raw_cpu_read(ucode_ctrl.ctrl) != SCTRL_WAIT)
+ return true;
+
+ for (loops = 0; loops < loops_per_usec; loops++)
+ cpu_relax();
+
+ /* If invoked directly, tickle the NMI watchdog */
+ if (!microcode_ops->use_nmi && !(timeout % USEC_PER_MSEC)) {
+ instrumentation_begin();
+ touch_nmi_watchdog();
+ instrumentation_end();
+ }
+ }
+ return false;
+}
+
+/*
+ * Protected against instrumentation up to the point where the primary
+ * thread completed the update. See microcode_nmi_handler() for details.
+ */
+static noinstr bool load_secondary_wait(unsigned int ctrl_cpu)
+{
+ /* Initial rendezvous to ensure that all CPUs have arrived */
+ if (!wait_for_cpus(&late_cpus_in)) {
+ raw_cpu_write(ucode_ctrl.result, UCODE_TIMEOUT);
+ return false;
+ }
+
+ /*
+ * Wait for primary threads to complete. If one of them hangs due
+ * to the update, there is no way out. This is non-recoverable
+ * because the CPU might hold locks or resources and confuse the
+ * scheduler, watchdogs etc. There is no way to safely evacuate the
+ * machine.
+ */
+ if (wait_for_ctrl())
+ return true;
+
+ instrumentation_begin();
+ panic("Microcode load: Primary CPU %d timed out\n", ctrl_cpu);
+ instrumentation_end();
+}
+
+/*
+ * Protected against instrumentation up to the point where the primary
+ * thread completed the update. See microcode_nmi_handler() for details.
+ */
+static noinstr void load_secondary(unsigned int cpu)
+{
+ unsigned int ctrl_cpu = raw_cpu_read(ucode_ctrl.ctrl_cpu);
+ enum ucode_state ret;
+
+ if (!load_secondary_wait(ctrl_cpu)) {
+ instrumentation_begin();
+ pr_err_once("load: %d CPUs timed out\n",
+ atomic_read(&late_cpus_in) - 1);
+ instrumentation_end();
+ return;
+ }
+
+ /* Primary thread completed. Allow to invoke instrumentable code */
+ instrumentation_begin();
+ /*
+ * If the primary succeeded then invoke the apply() callback,
+ * otherwise copy the state from the primary thread.
+ */
+ if (this_cpu_read(ucode_ctrl.ctrl) == SCTRL_APPLY)
+ ret = microcode_ops->apply_microcode(cpu);
+ else
+ ret = per_cpu(ucode_ctrl.result, ctrl_cpu);
+
+ this_cpu_write(ucode_ctrl.result, ret);
+ this_cpu_write(ucode_ctrl.ctrl, SCTRL_DONE);
+ instrumentation_end();
+}
+
+static void __load_primary(unsigned int cpu)
+{
+ struct cpumask *secondaries = topology_sibling_cpumask(cpu);
+ enum sibling_ctrl ctrl;
+ enum ucode_state ret;
+ unsigned int sibling;
+
+ /* Initial rendezvous to ensure that all CPUs have arrived */
+ if (!wait_for_cpus(&late_cpus_in)) {
+ this_cpu_write(ucode_ctrl.result, UCODE_TIMEOUT);
+ pr_err_once("load: %d CPUs timed out\n", atomic_read(&late_cpus_in) - 1);
+ return;
+ }
+
+ ret = microcode_ops->apply_microcode(cpu);
+ this_cpu_write(ucode_ctrl.result, ret);
+ this_cpu_write(ucode_ctrl.ctrl, SCTRL_DONE);
+
+ /*
+ * If the update was successful, let the siblings run the apply()
+ * callback. If not, tell them it's done. This also covers the
+ * case where the CPU has uniform loading at package or system
+ * scope implemented but does not advertise it.
+ */
+ if (ret == UCODE_UPDATED || ret == UCODE_OK)
+ ctrl = SCTRL_APPLY;
+ else
+ ctrl = SCTRL_DONE;
+
+ for_each_cpu(sibling, secondaries) {
+ if (sibling != cpu)
+ per_cpu(ucode_ctrl.ctrl, sibling) = ctrl;
+ }
+}
+
+static bool kick_offline_cpus(unsigned int nr_offl)
+{
+ unsigned int cpu, timeout;
+
+ for_each_cpu(cpu, &cpu_offline_mask) {
+ /* Enable the rendezvous handler and send NMI */
+ per_cpu(ucode_ctrl.nmi_enabled, cpu) = true;
+ apic_send_nmi_to_offline_cpu(cpu);
+ }
+
+ /* Wait for them to arrive */
+ for (timeout = 0; timeout < (USEC_PER_SEC / 2); timeout++) {
+ if (atomic_read(&offline_in_nmi) == nr_offl)
+ return true;
+ udelay(1);
+ }
+ /* Let the others time out */
+ return false;
+}
+
+static void release_offline_cpus(void)
+{
+ unsigned int cpu;
+
+ for_each_cpu(cpu, &cpu_offline_mask)
+ per_cpu(ucode_ctrl.ctrl, cpu) = SCTRL_DONE;
+}
+
+static void load_primary(unsigned int cpu)
+{
+ unsigned int nr_offl = cpumask_weight(&cpu_offline_mask);
+ bool proceed = true;
+
+ /* Kick soft-offlined SMT siblings if required */
+ if (!cpu && nr_offl)
+ proceed = kick_offline_cpus(nr_offl);
+
+ /* If the soft-offlined CPUs did not respond, abort */
+ if (proceed)
+ __load_primary(cpu);
+
+ /* Unconditionally release soft-offlined SMT siblings if required */
+ if (!cpu && nr_offl)
+ release_offline_cpus();
+}
+
+/*
+ * Minimal stub rendezvous handler for soft-offlined CPUs which participate
+ * in the NMI rendezvous to protect against a concurrent NMI on affected
+ * CPUs.
+ */
+void noinstr microcode_offline_nmi_handler(void)
+{
+ if (!raw_cpu_read(ucode_ctrl.nmi_enabled))
+ return;
+ raw_cpu_write(ucode_ctrl.nmi_enabled, false);
+ raw_cpu_write(ucode_ctrl.result, UCODE_OFFLINE);
+ raw_atomic_inc(&offline_in_nmi);
+ wait_for_ctrl();
+}
+
+static noinstr bool microcode_update_handler(void)
+{
+ unsigned int cpu = raw_smp_processor_id();
+
+ if (raw_cpu_read(ucode_ctrl.ctrl_cpu) == cpu) {
+ instrumentation_begin();
+ load_primary(cpu);
+ instrumentation_end();
+ } else {
+ load_secondary(cpu);
+ }
+
+ instrumentation_begin();
+ touch_nmi_watchdog();
+ instrumentation_end();
+
+ return true;
+}
+
+/*
+ * Protection against instrumentation is required for CPUs which are not
+ * safe against an NMI which is delivered to the secondary SMT sibling
+ * while the primary thread updates the microcode. Instrumentation can end
+ * up in #INT3, #DB and #PF. The IRET from those exceptions reenables NMI
+ * which is the opposite of what the NMI rendezvous is trying to achieve.
+ *
+ * The primary thread is safe versus instrumentation as the actual
+ * microcode update handles this correctly. It's only the sibling code
+ * path which must be NMI safe until the primary thread completed the
+ * update.
+ */
+bool noinstr microcode_nmi_handler(void)
+{
+ if (!raw_cpu_read(ucode_ctrl.nmi_enabled))
+ return false;
+
+ raw_cpu_write(ucode_ctrl.nmi_enabled, false);
+ return microcode_update_handler();
+}
+
+static int load_cpus_stopped(void *unused)
+{
+ if (microcode_ops->use_nmi) {
+ /* Enable the NMI handler and raise NMI */
+ this_cpu_write(ucode_ctrl.nmi_enabled, true);
+ apic->send_IPI(smp_processor_id(), NMI_VECTOR);
+ } else {
+ /* Just invoke the handler directly */
+ microcode_update_handler();
+ }
+ return 0;
+}
+
+static int load_late_stop_cpus(bool is_safe)
+{
+ unsigned int cpu, updated = 0, failed = 0, timedout = 0, siblings = 0;
+ unsigned int nr_offl, offline = 0;
+ int old_rev = boot_cpu_data.microcode;
+ struct cpuinfo_x86 prev_info;
+
+ if (!is_safe) {
+ pr_err("Late microcode loading without minimal revision check.\n");
+ pr_err("You should switch to early loading, if possible.\n");
+ }
+
+ /*
+ * Pre-load the microcode image into a staging device. This
+ * process is preemptible and does not require stopping CPUs.
+ * Successful staging simplifies the subsequent late-loading
+ * process, reducing rendezvous time.
+ *
+ * Even if the transfer fails, the update will proceed as usual.
+ */
+ if (microcode_ops->use_staging)
+ microcode_ops->stage_microcode();
+
+ atomic_set(&late_cpus_in, num_online_cpus());
+ atomic_set(&offline_in_nmi, 0);
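+	/* loops_per_jiffy divided by microseconds-per-jiffy: busy-wait loops per microsecond */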
+ loops_per_usec = loops_per_jiffy / (TICK_NSEC / 1000);
+
+ /*
+ * Take a snapshot before the microcode update in order to compare and
+ * check whether any bits changed after an update.
+ */
+ store_cpu_caps(&prev_info);
+
+ if (microcode_ops->use_nmi)
+ static_branch_enable_cpuslocked(&microcode_nmi_handler_enable);
+
+ stop_machine_cpuslocked(load_cpus_stopped, NULL, cpu_online_mask);
+
+ if (microcode_ops->use_nmi)
+ static_branch_disable_cpuslocked(&microcode_nmi_handler_enable);
+
+ /* Analyze the results */
+ for_each_cpu_and(cpu, cpu_present_mask, &cpus_booted_once_mask) {
+ switch (per_cpu(ucode_ctrl.result, cpu)) {
+ case UCODE_UPDATED: updated++; break;
+ case UCODE_TIMEOUT: timedout++; break;
+ case UCODE_OK: siblings++; break;
+ case UCODE_OFFLINE: offline++; break;
+ default: failed++; break;
+ }
+ }
+
+ if (microcode_ops->finalize_late_load)
+ microcode_ops->finalize_late_load(!updated);
+
+ if (!updated) {
+ /* Nothing changed. */
+ if (!failed && !timedout)
+ return 0;
+
+ nr_offl = cpumask_weight(&cpu_offline_mask);
+ if (offline < nr_offl) {
+ pr_warn("%u offline siblings did not respond.\n",
+ nr_offl - atomic_read(&offline_in_nmi));
+ return -EIO;
+ }
+ pr_err("update failed: %u CPUs failed %u CPUs timed out\n",
+ failed, timedout);
+ return -EIO;
+ }
+
+ if (!is_safe || failed || timedout)
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+
+ pr_info("load: updated on %u primary CPUs with %u siblings\n", updated, siblings);
+ if (failed || timedout) {
+ pr_err("load incomplete. %u CPUs timed out or failed\n",
+ num_online_cpus() - (updated + siblings));
+ }
+ pr_info("revision: 0x%x -> 0x%x\n", old_rev, boot_cpu_data.microcode);
+ microcode_check(&prev_info);
+
+ return updated + siblings == num_online_cpus() ? 0 : -EIO;
+}
+
+/*
+ * This function does two things:
+ *
+ * 1) Ensure that all required CPUs which are present and have been booted
+ * once are online.
+ *
+ * To pass this check, all primary threads must be online.
+ *
+ * If the microcode load is not safe against NMI then all SMT threads
+ * must be online as well because they still react to NMIs when they are
+ * soft-offlined and parked in one of the play_dead() variants. So if a
+ * NMI hits while the primary thread updates the microcode the resulting
+ * behaviour is undefined. The default play_dead() implementation on
+ * modern CPUs uses MWAIT, which is also not guaranteed to be safe
+ * against a microcode update which affects MWAIT.
+ *
+ * As soft-offlined CPUs still react on NMIs, the SMT sibling
+ * restriction can be lifted when the vendor driver signals to use NMI
+ * for rendezvous and the APIC provides a mechanism to send an NMI to a
+ * soft-offlined CPU. The soft-offlined CPUs are then able to
+ * participate in the rendezvous in a trivial stub handler.
+ *
+ * 2) Initialize the per CPU control structure and create a cpumask
+ * which contains the "offline" secondary threads, so they can be handled
+ * correctly by a control CPU.
+ */
+static bool setup_cpus(void)
+{
+ struct microcode_ctrl ctrl = { .ctrl = SCTRL_WAIT, .result = -1, };
+ bool allow_smt_offline;
+ unsigned int cpu;
+
+ allow_smt_offline = microcode_ops->nmi_safe ||
+ (microcode_ops->use_nmi && apic->nmi_to_offline_cpu);
+
+ cpumask_clear(&cpu_offline_mask);
+
+ for_each_cpu_and(cpu, cpu_present_mask, &cpus_booted_once_mask) {
+ /*
+ * Offline CPUs sit in one of the play_dead() functions
+ * with interrupts disabled, but they still react on NMIs
+ * and execute arbitrary code. Also MWAIT being updated
+ * while the offline CPU sits there is not necessarily safe
+ * on all CPU variants.
+ *
+ * Mark them in the offline_cpus mask which will be handled
+ * by CPU0 later in the update process.
+ *
+ * Ensure that the primary thread is online so that it is
+ * guaranteed that all cores are updated.
+ */
+ if (!cpu_online(cpu)) {
+ if (topology_is_primary_thread(cpu) || !allow_smt_offline) {
+ pr_err("CPU %u not online, loading aborted\n", cpu);
+ return false;
+ }
+ cpumask_set_cpu(cpu, &cpu_offline_mask);
+ per_cpu(ucode_ctrl, cpu) = ctrl;
+ continue;
+ }
+
+ /*
+ * Initialize the per CPU state. This is core scope for now,
+ * but prepared to take package or system scope into account.
+ */
+ ctrl.ctrl_cpu = cpumask_first(topology_sibling_cpumask(cpu));
+ per_cpu(ucode_ctrl, cpu) = ctrl;
+ }
+ return true;
+}
+
+static int load_late_locked(void)
+{
+ if (!setup_cpus())
+ return -EBUSY;
+
+ switch (microcode_ops->request_microcode_fw(0, &microcode_fdev->dev)) {
+ case UCODE_NEW:
+ return load_late_stop_cpus(false);
+ case UCODE_NEW_SAFE:
+ return load_late_stop_cpus(true);
+ case UCODE_NFOUND:
+ return -ENOENT;
+ case UCODE_OK:
+ return 0;
+ default:
+ return -EBADFD;
+ }
+}
+
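+/*
+ * Writing "1" to the reload attribute triggers a late load, e.g.:
+ *
+ *   echo 1 > /sys/devices/system/cpu/microcode/reload
+ */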
+static ssize_t reload_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t size)
+{
+ unsigned long val;
+ ssize_t ret;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret || val != 1)
+ return -EINVAL;
+
+ cpus_read_lock();
+ ret = load_late_locked();
+ cpus_read_unlock();
+
+ return ret ? : size;
+}
+
+static DEVICE_ATTR_WO(reload);
+#endif
+
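+/*
+ * Per-CPU attributes, exposed under each CPU device as e.g.
+ * /sys/devices/system/cpu/cpu0/microcode/{version,processor_flags}.
+ */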
+static ssize_t version_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
+
+ return sprintf(buf, "0x%x\n", uci->cpu_sig.rev);
+}
+
+static ssize_t processor_flags_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
+
+ return sprintf(buf, "0x%x\n", uci->cpu_sig.pf);
+}
+
+static DEVICE_ATTR_RO(version);
+static DEVICE_ATTR_RO(processor_flags);
+
+static struct attribute *mc_default_attrs[] = {
+ &dev_attr_version.attr,
+ &dev_attr_processor_flags.attr,
+ NULL
+};
+
+static const struct attribute_group mc_attr_group = {
+ .attrs = mc_default_attrs,
+ .name = "microcode",
+};
+
+static void microcode_fini_cpu(int cpu)
+{
+ if (microcode_ops->microcode_fini_cpu)
+ microcode_ops->microcode_fini_cpu(cpu);
+}
+
+/**
+ * microcode_bsp_resume - Update boot CPU microcode during resume.
+ */
+void microcode_bsp_resume(void)
+{
+ int cpu = smp_processor_id();
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+ if (uci->mc)
+ microcode_ops->apply_microcode(cpu);
+ else
+ reload_early_microcode(cpu);
+}
+
+static void microcode_bsp_syscore_resume(void *data)
+{
+ microcode_bsp_resume();
+}
+
+static const struct syscore_ops mc_syscore_ops = {
+ .resume = microcode_bsp_syscore_resume,
+};
+
+static struct syscore mc_syscore = {
+ .ops = &mc_syscore_ops,
+};
+
+static int mc_cpu_online(unsigned int cpu)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ struct device *dev = get_cpu_device(cpu);
+
+ memset(uci, 0, sizeof(*uci));
+
+ microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig);
+ cpu_data(cpu).microcode = uci->cpu_sig.rev;
+ if (!cpu)
+ boot_cpu_data.microcode = uci->cpu_sig.rev;
+
+ if (sysfs_create_group(&dev->kobj, &mc_attr_group))
+ pr_err("Failed to create group for CPU%d\n", cpu);
+ return 0;
+}
+
+static int mc_cpu_down_prep(unsigned int cpu)
+{
+ struct device *dev = get_cpu_device(cpu);
+
+ microcode_fini_cpu(cpu);
+ sysfs_remove_group(&dev->kobj, &mc_attr_group);
+ return 0;
+}
+
+static struct attribute *cpu_root_microcode_attrs[] = {
+#ifdef CONFIG_MICROCODE_LATE_LOADING
+ &dev_attr_reload.attr,
+#endif
+ NULL
+};
+
+static const struct attribute_group cpu_root_microcode_group = {
+ .name = "microcode",
+ .attrs = cpu_root_microcode_attrs,
+};
+
+static int __init microcode_init(void)
+{
+ struct device *dev_root;
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+ int error;
+
+ if (microcode_loader_disabled())
+ return -EINVAL;
+
+ if (c->x86_vendor == X86_VENDOR_INTEL)
+ microcode_ops = init_intel_microcode();
+ else if (c->x86_vendor == X86_VENDOR_AMD)
+ microcode_ops = init_amd_microcode();
+ else
+ pr_err("no support for this CPU vendor\n");
+
+ if (!microcode_ops)
+ return -ENODEV;
+
+ pr_info_once("Current revision: 0x%08x\n", (early_data.new_rev ?: early_data.old_rev));
+
+ if (early_data.new_rev)
+ pr_info_once("Updated early from: 0x%08x\n", early_data.old_rev);
+
+ microcode_fdev = faux_device_create("microcode", NULL, NULL);
+ if (!microcode_fdev)
+ return -ENODEV;
+
+ dev_root = bus_get_dev_root(&cpu_subsys);
+ if (dev_root) {
+ error = sysfs_create_group(&dev_root->kobj, &cpu_root_microcode_group);
+ put_device(dev_root);
+ if (error) {
+ pr_err("Error creating microcode group!\n");
+ goto out_pdev;
+ }
+ }
+
+ register_syscore(&mc_syscore);
+ cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/microcode:online",
+ mc_cpu_online, mc_cpu_down_prep);
+
+ return 0;
+
+ out_pdev:
+ faux_device_destroy(microcode_fdev);
+ return error;
+}
+late_initcall(microcode_init);
diff --git a/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h b/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h
new file mode 100644
index 000000000000..2d48e6593540
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h
@@ -0,0 +1,160 @@
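+/*
+ * Initializer fragment of struct x86_cpu_id entries, one per Intel
+ * family/model/stepping combination: .steppings is a bitmask in which
+ * bit N selects stepping N, and .driver_data appears to carry a
+ * microcode revision for that combination.
+ */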
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x03, .steppings = 0x0004, .driver_data = 0x2 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x05, .steppings = 0x0001, .driver_data = 0x45 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x05, .steppings = 0x0002, .driver_data = 0x40 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x05, .steppings = 0x0004, .driver_data = 0x2c },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x05, .steppings = 0x0008, .driver_data = 0x10 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x06, .steppings = 0x0001, .driver_data = 0xa },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x06, .steppings = 0x0020, .driver_data = 0x3 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x06, .steppings = 0x0400, .driver_data = 0xd },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x06, .steppings = 0x2000, .driver_data = 0x7 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x07, .steppings = 0x0002, .driver_data = 0x14 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x07, .steppings = 0x0004, .driver_data = 0x38 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x07, .steppings = 0x0008, .driver_data = 0x2e },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x08, .steppings = 0x0002, .driver_data = 0x11 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x08, .steppings = 0x0008, .driver_data = 0x8 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x08, .steppings = 0x0040, .driver_data = 0xc },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x08, .steppings = 0x0400, .driver_data = 0x5 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x09, .steppings = 0x0020, .driver_data = 0x47 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0a, .steppings = 0x0001, .driver_data = 0x3 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0a, .steppings = 0x0002, .driver_data = 0x1 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0b, .steppings = 0x0002, .driver_data = 0x1d },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0b, .steppings = 0x0010, .driver_data = 0x2 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0d, .steppings = 0x0040, .driver_data = 0x18 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0e, .steppings = 0x0100, .driver_data = 0x39 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0e, .steppings = 0x1000, .driver_data = 0x59 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0004, .driver_data = 0x5d },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0040, .driver_data = 0xd2 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0080, .driver_data = 0x6b },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0400, .driver_data = 0x95 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0800, .driver_data = 0xbc },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x2000, .driver_data = 0xa4 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x16, .steppings = 0x0002, .driver_data = 0x44 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x17, .steppings = 0x0040, .driver_data = 0x60f },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x17, .steppings = 0x0080, .driver_data = 0x70a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x17, .steppings = 0x0400, .driver_data = 0xa0b },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1a, .steppings = 0x0010, .driver_data = 0x12 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1a, .steppings = 0x0020, .driver_data = 0x1d },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1c, .steppings = 0x0004, .driver_data = 0x219 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1c, .steppings = 0x0400, .driver_data = 0x107 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1d, .steppings = 0x0002, .driver_data = 0x29 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1e, .steppings = 0x0020, .driver_data = 0xa },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x25, .steppings = 0x0004, .driver_data = 0x11 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x25, .steppings = 0x0020, .driver_data = 0x7 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x26, .steppings = 0x0002, .driver_data = 0x105 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2a, .steppings = 0x0080, .driver_data = 0x2f },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2c, .steppings = 0x0004, .driver_data = 0x1f },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2d, .steppings = 0x0040, .driver_data = 0x621 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2d, .steppings = 0x0080, .driver_data = 0x71a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2e, .steppings = 0x0040, .driver_data = 0xd },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2f, .steppings = 0x0004, .driver_data = 0x3b },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x37, .steppings = 0x0100, .driver_data = 0x838 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x37, .steppings = 0x0200, .driver_data = 0x90d },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3a, .steppings = 0x0200, .driver_data = 0x21 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3c, .steppings = 0x0008, .driver_data = 0x28 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3d, .steppings = 0x0010, .driver_data = 0x2f },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3e, .steppings = 0x0010, .driver_data = 0x42e },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3e, .steppings = 0x0040, .driver_data = 0x600 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3e, .steppings = 0x0080, .driver_data = 0x715 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3f, .steppings = 0x0004, .driver_data = 0x49 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3f, .steppings = 0x0010, .driver_data = 0x1a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x45, .steppings = 0x0002, .driver_data = 0x26 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x46, .steppings = 0x0002, .driver_data = 0x1c },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x47, .steppings = 0x0002, .driver_data = 0x22 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x4c, .steppings = 0x0008, .driver_data = 0x368 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x4c, .steppings = 0x0010, .driver_data = 0x411 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x4d, .steppings = 0x0100, .driver_data = 0x12d },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x4e, .steppings = 0x0008, .driver_data = 0xf0 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0008, .driver_data = 0x1000191 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0010, .driver_data = 0x2007006 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0020, .driver_data = 0x3000010 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0080, .driver_data = 0x5003901 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0800, .driver_data = 0x7002b01 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0004, .driver_data = 0x1c },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0008, .driver_data = 0x700001c },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0010, .driver_data = 0xf00001a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0020, .driver_data = 0xe000015 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5c, .steppings = 0x0004, .driver_data = 0x14 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5c, .steppings = 0x0200, .driver_data = 0x48 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5c, .steppings = 0x0400, .driver_data = 0x28 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5e, .steppings = 0x0008, .driver_data = 0xf0 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5f, .steppings = 0x0002, .driver_data = 0x3e },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x66, .steppings = 0x0008, .driver_data = 0x2a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6a, .steppings = 0x0020, .driver_data = 0xc0002f0 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6a, .steppings = 0x0040, .driver_data = 0xd000404 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6c, .steppings = 0x0002, .driver_data = 0x10002d0 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7a, .steppings = 0x0002, .driver_data = 0x42 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7a, .steppings = 0x0100, .driver_data = 0x26 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7e, .steppings = 0x0020, .driver_data = 0xca },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8a, .steppings = 0x0002, .driver_data = 0x33 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0002, .driver_data = 0xbc },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0004, .driver_data = 0x3c },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8d, .steppings = 0x0002, .driver_data = 0x56 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0200, .driver_data = 0xf6 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0400, .driver_data = 0xf6 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0800, .driver_data = 0xf6 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x1000, .driver_data = 0x100 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0010, .driver_data = 0x2c0003f7 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0020, .driver_data = 0x2c0003f7 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0040, .driver_data = 0x2c0003f7 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0080, .driver_data = 0x2b000639 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0100, .driver_data = 0x2c0003f7 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x96, .steppings = 0x0002, .driver_data = 0x1a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0004, .driver_data = 0x3a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0020, .driver_data = 0x3a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9a, .steppings = 0x0008, .driver_data = 0x437 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9a, .steppings = 0x0010, .driver_data = 0x437 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9c, .steppings = 0x0001, .driver_data = 0x24000026 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0200, .driver_data = 0xf8 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0400, .driver_data = 0xfa },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0800, .driver_data = 0xf6 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x1000, .driver_data = 0xf8 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x2000, .driver_data = 0x104 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0004, .driver_data = 0x100 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0008, .driver_data = 0x100 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0020, .driver_data = 0x100 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0001, .driver_data = 0x102 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0002, .driver_data = 0x100 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa7, .steppings = 0x0002, .driver_data = 0x64 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xaa, .steppings = 0x0010, .driver_data = 0x24 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xad, .steppings = 0x0002, .driver_data = 0xa0000d1 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xaf, .steppings = 0x0008, .driver_data = 0x3000341 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb5, .steppings = 0x0001, .driver_data = 0xa },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb7, .steppings = 0x0002, .driver_data = 0x12f },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb7, .steppings = 0x0010, .driver_data = 0x12f },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0004, .driver_data = 0x4128 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0008, .driver_data = 0x4128 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0100, .driver_data = 0x4128 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbd, .steppings = 0x0002, .driver_data = 0x11f },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbe, .steppings = 0x0001, .driver_data = 0x1d },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0004, .driver_data = 0x3a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0020, .driver_data = 0x3a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0040, .driver_data = 0x3a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0080, .driver_data = 0x3a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xc5, .steppings = 0x0004, .driver_data = 0x118 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xc6, .steppings = 0x0004, .driver_data = 0x118 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xc6, .steppings = 0x0010, .driver_data = 0x118 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xca, .steppings = 0x0004, .driver_data = 0x118 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, .steppings = 0x0002, .driver_data = 0x210002a9 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, .steppings = 0x0004, .driver_data = 0x210002a9 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x00, .steppings = 0x0080, .driver_data = 0x12 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x00, .steppings = 0x0400, .driver_data = 0x15 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x01, .steppings = 0x0004, .driver_data = 0x2e },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0010, .driver_data = 0x21 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0020, .driver_data = 0x2c },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0040, .driver_data = 0x10 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0080, .driver_data = 0x39 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0200, .driver_data = 0x2f },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x03, .steppings = 0x0004, .driver_data = 0xa },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x03, .steppings = 0x0008, .driver_data = 0xc },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x03, .steppings = 0x0010, .driver_data = 0x17 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0002, .driver_data = 0x17 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0008, .driver_data = 0x5 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0010, .driver_data = 0x6 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0080, .driver_data = 0x3 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0100, .driver_data = 0xe },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0200, .driver_data = 0x3 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0400, .driver_data = 0x4 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x06, .steppings = 0x0004, .driver_data = 0xf },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x06, .steppings = 0x0010, .driver_data = 0x4 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x06, .steppings = 0x0020, .driver_data = 0x8 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x06, .steppings = 0x0100, .driver_data = 0x9 },
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
new file mode 100644
index 000000000000..8744f3adc2a0
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -0,0 +1,1016 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Intel CPU Microcode Update Driver for Linux
+ *
+ * Copyright (C) 2000-2006 Tigran Aivazian <aivazian.tigran@gmail.com>
+ * 2006 Shaohua Li <shaohua.li@intel.com>
+ *
+ * Intel CPU microcode early update for Linux
+ *
+ * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
+ * H Peter Anvin" <hpa@zytor.com>
+ */
+#define pr_fmt(fmt) "microcode: " fmt
+#include <linux/earlycpio.h>
+#include <linux/firmware.h>
+#include <linux/pci_ids.h>
+#include <linux/uaccess.h>
+#include <linux/initrd.h>
+#include <linux/kernel.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/cpu.h>
+#include <linux/uio.h>
+#include <linux/io.h>
+#include <linux/mm.h>
+
+#include <asm/cpu_device_id.h>
+#include <asm/processor.h>
+#include <asm/tlbflush.h>
+#include <asm/setup.h>
+#include <asm/msr.h>
+
+#include "internal.h"
+
+static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin";
+
+#define UCODE_BSP_LOADED ((struct microcode_intel *)0x1UL)
+
+/* Defines for the microcode staging mailbox interface */
+#define MBOX_REG_NUM 4
+#define MBOX_REG_SIZE sizeof(u32)
+
+#define MBOX_CONTROL_OFFSET 0x0
+#define MBOX_STATUS_OFFSET 0x4
+#define MBOX_WRDATA_OFFSET 0x8
+#define MBOX_RDDATA_OFFSET 0xc
+
+#define MASK_MBOX_CTRL_ABORT BIT(0)
+#define MASK_MBOX_CTRL_GO BIT(31)
+
+#define MASK_MBOX_STATUS_ERROR BIT(2)
+#define MASK_MBOX_STATUS_READY BIT(31)
+
+#define MASK_MBOX_RESP_SUCCESS BIT(0)
+#define MASK_MBOX_RESP_PROGRESS BIT(1)
+#define MASK_MBOX_RESP_ERROR BIT(2)
+
+#define MBOX_CMD_LOAD 0x3
+#define MBOX_OBJ_STAGING 0xb
+#define MBOX_HEADER(size) ((PCI_VENDOR_ID_INTEL) | \
+ (MBOX_OBJ_STAGING << 16) | \
+ ((u64)((size) / sizeof(u32)) << 32))
+
+/* The size of each mailbox header */
+#define MBOX_HEADER_SIZE sizeof(u64)
+/* The size of staging hardware response */
+#define MBOX_RESPONSE_SIZE sizeof(u64)
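+
+/*
+ * Worked example: a full 4 KiB chunk is sent as a request mailbox of
+ * 2 * MBOX_HEADER_SIZE + 4096 = 4112 bytes, so its header is
+ * MBOX_HEADER(4112) = 0x8086 | (0xb << 16) | ((4112 / 4) << 32)
+ *                   = 0x00000404000b8086.
+ */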
+
+#define MBOX_XACTION_TIMEOUT_MS (10 * MSEC_PER_SEC)
+
+/* Current microcode patch used in early patching on the APs. */
+static struct microcode_intel *ucode_patch_va __read_mostly;
+static struct microcode_intel *ucode_patch_late __read_mostly;
+
+/* last level cache size per core */
+static unsigned int llc_size_per_core __ro_after_init;
+
+/* The microcode format is extended from Prescott processors */
+struct extended_signature {
+ unsigned int sig;
+ unsigned int pf;
+ unsigned int cksum;
+};
+
+struct extended_sigtable {
+ unsigned int count;
+ unsigned int cksum;
+ unsigned int reserved[3];
+ struct extended_signature sigs[];
+};
+
+/**
+ * struct staging_state - Track the current staging process state
+ *
+ * @mmio_base: MMIO base address for staging
+ * @ucode_len: Total size of the microcode image
+ * @chunk_size: Size of each data piece
+ * @bytes_sent: Total bytes transmitted so far
+ * @offset: Current offset in the microcode image
+ */
+struct staging_state {
+ void __iomem *mmio_base;
+ unsigned int ucode_len;
+ unsigned int chunk_size;
+ unsigned int bytes_sent;
+ unsigned int offset;
+};
+
+#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
+#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable))
+#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature))
+
+static inline unsigned int get_totalsize(struct microcode_header_intel *hdr)
+{
+ return hdr->datasize ? hdr->totalsize : DEFAULT_UCODE_TOTALSIZE;
+}
+
+static inline unsigned int exttable_size(struct extended_sigtable *et)
+{
+ return et->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE;
+}
+
+void intel_collect_cpu_info(struct cpu_signature *sig)
+{
+ sig->sig = cpuid_eax(1);
+ sig->pf = 0;
+ sig->rev = intel_get_microcode_revision();
+
+ if (IFM(x86_family(sig->sig), x86_model(sig->sig)) >= INTEL_PENTIUM_III_DESCHUTES) {
+ unsigned int val[2];
+
+ /* get processor flags from MSR 0x17 */
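+ /*
+ * Bits 52:50 of IA32_PLATFORM_ID hold a 3-bit platform ID (0-7);
+ * val[1] is the upper MSR dword, so the shift by 18 selects those
+ * bits and pf becomes the matching bitmask (platform ID 4 -> 0x10).
+ */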
+ native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
+ sig->pf = 1 << ((val[1] >> 18) & 7);
+ }
+}
+EXPORT_SYMBOL_GPL(intel_collect_cpu_info);
+
+static inline bool cpu_signatures_match(struct cpu_signature *s1, unsigned int sig2,
+ unsigned int pf2)
+{
+ if (s1->sig != sig2)
+ return false;
+
+ /* Processor flags are either both 0 or they intersect. */
+ return ((!s1->pf && !pf2) || (s1->pf & pf2));
+}
+
+bool intel_find_matching_signature(void *mc, struct cpu_signature *sig)
+{
+ struct microcode_header_intel *mc_hdr = mc;
+ struct extended_signature *ext_sig;
+ struct extended_sigtable *ext_hdr;
+ int i;
+
+ if (cpu_signatures_match(sig, mc_hdr->sig, mc_hdr->pf))
+ return true;
+
+ /* Look for ext. headers: */
+ if (get_totalsize(mc_hdr) <= intel_microcode_get_datasize(mc_hdr) + MC_HEADER_SIZE)
+ return false;
+
+ ext_hdr = mc + intel_microcode_get_datasize(mc_hdr) + MC_HEADER_SIZE;
+ ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE;
+
+ for (i = 0; i < ext_hdr->count; i++) {
+ if (cpu_signatures_match(sig, ext_sig->sig, ext_sig->pf))
+ return true;
+ ext_sig++;
+ }
+ return false;
+}
+EXPORT_SYMBOL_GPL(intel_find_matching_signature);
+
+/**
+ * intel_microcode_sanity_check() - Sanity check microcode file.
+ * @mc: Pointer to the microcode file contents.
+ * @print_err: Display failure reason if true, silent if false.
+ * @hdr_type: Type of file, i.e. normal microcode file or In Field Scan file.
+ * Validate if the microcode header type matches with the type
+ * specified here.
+ *
+ * Validate certain header fields and verify if computed checksum matches
+ * with the one specified in the header.
+ *
+ * Return: 0 if the file passes all the checks, -EINVAL if any of the checks
+ * fail.
+ */
+int intel_microcode_sanity_check(void *mc, bool print_err, int hdr_type)
+{
+ unsigned long total_size, data_size, ext_table_size;
+ struct microcode_header_intel *mc_header = mc;
+ struct extended_sigtable *ext_header = NULL;
+ u32 sum, orig_sum, ext_sigcount = 0, i;
+ struct extended_signature *ext_sig;
+
+ total_size = get_totalsize(mc_header);
+ data_size = intel_microcode_get_datasize(mc_header);
+
+ if (data_size + MC_HEADER_SIZE > total_size) {
+ if (print_err)
+ pr_err("Error: bad microcode data file size.\n");
+ return -EINVAL;
+ }
+
+ if (mc_header->ldrver != 1 || mc_header->hdrver != hdr_type) {
+ if (print_err)
+ pr_err("Error: invalid/unknown microcode update format. Header type %d\n",
+ mc_header->hdrver);
+ return -EINVAL;
+ }
+
+ ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
+ if (ext_table_size) {
+ u32 ext_table_sum = 0;
+ u32 *ext_tablep;
+
+ if (ext_table_size < EXT_HEADER_SIZE ||
+ ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
+ if (print_err)
+ pr_err("Error: truncated extended signature table.\n");
+ return -EINVAL;
+ }
+
+ ext_header = mc + MC_HEADER_SIZE + data_size;
+ if (ext_table_size != exttable_size(ext_header)) {
+ if (print_err)
+ pr_err("Error: extended signature table size mismatch.\n");
+ return -EFAULT;
+ }
+
+ ext_sigcount = ext_header->count;
+
+ /*
+ * Check extended table checksum: the sum of all dwords that
+ * comprise a valid table must be 0.
+ */
+ ext_tablep = (u32 *)ext_header;
+
+ i = ext_table_size / sizeof(u32);
+ while (i--)
+ ext_table_sum += ext_tablep[i];
+
+ if (ext_table_sum) {
+ if (print_err)
+ pr_warn("Bad extended signature table checksum, aborting.\n");
+ return -EINVAL;
+ }
+ }
+
+ /*
+ * Calculate the checksum of update data and header. The checksum of
+ * valid update data and header including the extended signature table
+ * must be 0.
+ */
+ orig_sum = 0;
+ i = (MC_HEADER_SIZE + data_size) / sizeof(u32);
+ while (i--)
+ orig_sum += ((u32 *)mc)[i];
+
+ if (orig_sum) {
+ if (print_err)
+ pr_err("Bad microcode data checksum, aborting.\n");
+ return -EINVAL;
+ }
+
+ if (!ext_table_size)
+ return 0;
+
+ /*
+ * Check extended signature checksum: 0 => valid.
+ *
+ * Each extended entry's checksum is defined such that substituting
+ * its sig/pf/cksum for the primary header's keeps the overall image
+ * checksum zero, so the two triples must sum to the same value.
+ */
+ for (i = 0; i < ext_sigcount; i++) {
+ ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
+ EXT_SIGNATURE_SIZE * i;
+
+ sum = (mc_header->sig + mc_header->pf + mc_header->cksum) -
+ (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
+ if (sum) {
+ if (print_err)
+ pr_err("Bad extended signature checksum, aborting.\n");
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(intel_microcode_sanity_check);
+
+static void update_ucode_pointer(struct microcode_intel *mc)
+{
+ kvfree(ucode_patch_va);
+
+ /*
+ * Save the virtual address for early loading and for eventual
+ * freeing on late loading.
+ */
+ ucode_patch_va = mc;
+}
+
+static void save_microcode_patch(struct microcode_intel *patch)
+{
+ unsigned int size = get_totalsize(&patch->hdr);
+ struct microcode_intel *mc;
+
+ mc = kvmemdup(patch, size, GFP_KERNEL);
+ if (mc)
+ update_ucode_pointer(mc);
+ else
+ pr_err("Unable to allocate microcode memory size: %u\n", size);
+}
+
+/* Scan blob for microcode matching the boot CPU's family, model, stepping */
+static __init struct microcode_intel *scan_microcode(void *data, size_t size,
+ struct ucode_cpu_info *uci,
+ bool save)
+{
+ struct microcode_header_intel *mc_header;
+ struct microcode_intel *patch = NULL;
+ u32 cur_rev = uci->cpu_sig.rev;
+ unsigned int mc_size;
+
+ for (; size >= sizeof(struct microcode_header_intel); size -= mc_size, data += mc_size) {
+ mc_header = (struct microcode_header_intel *)data;
+
+ mc_size = get_totalsize(mc_header);
+ if (!mc_size || mc_size > size ||
+ intel_microcode_sanity_check(data, false, MC_HEADER_TYPE_MICROCODE) < 0)
+ break;
+
+ if (!intel_find_matching_signature(data, &uci->cpu_sig))
+ continue;
+
+ /*
+ * When saving the early microcode, look for the exact revision
+ * which was loaded on the BSP.
+ *
+ * During early boot on the BSP, look for a revision newer than
+ * the one actually loaded in the CPU.
+ */
+ if (save) {
+ if (cur_rev != mc_header->rev)
+ continue;
+ } else if (cur_rev >= mc_header->rev) {
+ continue;
+ }
+
+ patch = data;
+ cur_rev = mc_header->rev;
+ }
+
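+ /*
+ * Return the patch only if the entire blob was consumed cleanly; any
+ * truncation or sanity-check failure leaves size non-zero and yields
+ * NULL.
+ */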
+ return size ? NULL : patch;
+}
+
+static inline u32 read_mbox_dword(void __iomem *mmio_base)
+{
+ u32 dword = readl(mmio_base + MBOX_RDDATA_OFFSET);
+
+ /* Acknowledge read completion to the staging hardware */
+ writel(0, mmio_base + MBOX_RDDATA_OFFSET);
+ return dword;
+}
+
+static inline void write_mbox_dword(void __iomem *mmio_base, u32 dword)
+{
+ writel(dword, mmio_base + MBOX_WRDATA_OFFSET);
+}
+
+static inline u64 read_mbox_header(void __iomem *mmio_base)
+{
+ u32 high, low;
+
+ low = read_mbox_dword(mmio_base);
+ high = read_mbox_dword(mmio_base);
+
+ return ((u64)high << 32) | low;
+}
+
+static inline void write_mbox_header(void __iomem *mmio_base, u64 value)
+{
+ write_mbox_dword(mmio_base, value);
+ write_mbox_dword(mmio_base, value >> 32);
+}
+
+static void write_mbox_data(void __iomem *mmio_base, u32 *chunk, unsigned int chunk_bytes)
+{
+ int i;
+
+ /*
+ * The MMIO space is mapped as Uncached (UC). Each write arrives
+ * at the device as an individual transaction in program order.
+ * The device can then reassemble the sequence accordingly.
+ */
+ for (i = 0; i < chunk_bytes / sizeof(u32); i++)
+ write_mbox_dword(mmio_base, chunk[i]);
+}
+
+/*
+ * Prepare for a new microcode transfer: reset hardware and record the
+ * image size.
+ */
+static void init_stage(struct staging_state *ss)
+{
+ ss->ucode_len = get_totalsize(&ucode_patch_late->hdr);
+
+ /*
+ * Abort any ongoing process, effectively resetting the device.
+ * Unlike regular mailbox data processing requests, this
+ * operation does not require a status check.
+ */
+ writel(MASK_MBOX_CTRL_ABORT, ss->mmio_base + MBOX_CONTROL_OFFSET);
+}
+
+/*
+ * Update the chunk size and decide whether another chunk can be sent.
+ * This accounts for remaining data and retry limits.
+ */
+static bool can_send_next_chunk(struct staging_state *ss, int *err)
+{
+ /* A page size or remaining bytes if this is the final chunk */
+ ss->chunk_size = min(PAGE_SIZE, ss->ucode_len - ss->offset);
+
+ /*
+ * Each microcode image is divided into chunks, each at most
+ * one page size. A 10-chunk image would typically require 10
+ * transactions.
+ *
+ * However, the hardware managing the mailbox has limited
+ * resources and may not cache the entire image, potentially
+ * requesting the same chunk multiple times.
+ *
+ * To tolerate this behavior, allow up to twice the expected
+ * number of transactions (i.e., a 10-chunk image can take up to
+ * 20 attempts).
+ *
+ * If the number of attempts exceeds this limit, treat it as
+ * exceeding the maximum allowed transfer size.
+ */
+ if (ss->bytes_sent + ss->chunk_size > ss->ucode_len * 2) {
+ *err = -EMSGSIZE;
+ return false;
+ }
+
+ *err = 0;
+ return true;
+}
+
+/*
+ * The hardware indicates completion by returning a sentinel end offset.
+ */
+static inline bool is_end_offset(u32 offset)
+{
+ return offset == UINT_MAX;
+}
+
+/*
+ * Determine whether staging is complete: either the hardware signaled
+ * the end offset, or no more transactions are permitted (retry limit
+ * reached).
+ */
+static inline bool staging_is_complete(struct staging_state *ss, int *err)
+{
+ return is_end_offset(ss->offset) || !can_send_next_chunk(ss, err);
+}
+
+/*
+ * Wait for the hardware to complete a transaction.
+ * Return 0 on success, or an error code on failure.
+ */
+static int wait_for_transaction(struct staging_state *ss)
+{
+ u32 timeout, status;
+
+ /* Allow time for hardware to complete the operation: */
+ for (timeout = 0; timeout < MBOX_XACTION_TIMEOUT_MS; timeout++) {
+ msleep(1);
+
+ status = readl(ss->mmio_base + MBOX_STATUS_OFFSET);
+ /* Break out early if the hardware is ready: */
+ if (status & MASK_MBOX_STATUS_READY)
+ break;
+ }
+
+ /* Check for explicit error response */
+ if (status & MASK_MBOX_STATUS_ERROR)
+ return -EIO;
+
+ /*
+ * Hardware has neither responded to the action nor signaled any
+ * error. Treat this as a timeout.
+ */
+ if (!(status & MASK_MBOX_STATUS_READY))
+ return -ETIMEDOUT;
+
+ return 0;
+}
+
+/*
+ * Transmit a chunk of the microcode image to the hardware.
+ * Return 0 on success, or an error code on failure.
+ */
+static int send_data_chunk(struct staging_state *ss, void *ucode_ptr)
+{
+ u32 *src_chunk = ucode_ptr + ss->offset;
+ u16 mbox_size;
+
+ /*
+ * Write a 'request' mailbox object in this order:
+ * 1. Mailbox header includes total size
+ * 2. Command header specifies the load operation
+ * 3. Data section contains a microcode chunk
+ *
+ * Thus, the mailbox size is two headers plus the chunk size.
+ */
+ mbox_size = MBOX_HEADER_SIZE * 2 + ss->chunk_size;
+ write_mbox_header(ss->mmio_base, MBOX_HEADER(mbox_size));
+ write_mbox_header(ss->mmio_base, MBOX_CMD_LOAD);
+ write_mbox_data(ss->mmio_base, src_chunk, ss->chunk_size);
+ ss->bytes_sent += ss->chunk_size;
+
+ /* Notify the hardware that the mailbox is ready for processing. */
+ writel(MASK_MBOX_CTRL_GO, ss->mmio_base + MBOX_CONTROL_OFFSET);
+
+ return wait_for_transaction(ss);
+}
+
+/*
+ * Retrieve the next offset from the hardware response.
+ * Return 0 on success, or an error code on failure.
+ */
+static int fetch_next_offset(struct staging_state *ss)
+{
+ const u64 expected_header = MBOX_HEADER(MBOX_HEADER_SIZE + MBOX_RESPONSE_SIZE);
+ u32 offset, status;
+ u64 header;
+
+ /*
+ * The 'response' mailbox returns three fields, in order:
+ * 1. Header
+ * 2. Next offset in the microcode image
+ * 3. Status flags
+ */
+ header = read_mbox_header(ss->mmio_base);
+ offset = read_mbox_dword(ss->mmio_base);
+ status = read_mbox_dword(ss->mmio_base);
+
+ /* All valid responses must start with the expected header. */
+ if (header != expected_header) {
+ pr_err_once("staging: invalid response header (0x%llx)\n", header);
+ return -EBADR;
+ }
+
+ /*
+ * Verify the offset: If not at the end marker, it must not
+ * exceed the microcode image length.
+ */
+ if (!is_end_offset(offset) && offset > ss->ucode_len) {
+ pr_err_once("staging: invalid offset (%u) past the image end (%u)\n",
+ offset, ss->ucode_len);
+ return -EINVAL;
+ }
+
+ /* Hardware may report errors explicitly in the status field */
+ if (status & MASK_MBOX_RESP_ERROR)
+ return -EPROTO;
+
+ ss->offset = offset;
+ return 0;
+}
+
+/*
+ * Handle the staging process using the mailbox MMIO interface. The
+ * microcode image is transferred in chunks until completion.
+ * Return 0 on success or an error code on failure.
+ */
+static int do_stage(u64 mmio_pa)
+{
+ struct staging_state ss = {};
+ int err;
+
+ ss.mmio_base = ioremap(mmio_pa, MBOX_REG_NUM * MBOX_REG_SIZE);
+ if (WARN_ON_ONCE(!ss.mmio_base))
+ return -EADDRNOTAVAIL;
+
+ init_stage(&ss);
+
+ /* Perform the staging process while within the retry limit */
+ while (!staging_is_complete(&ss, &err)) {
+ /* Send a chunk of microcode each time: */
+ err = send_data_chunk(&ss, ucode_patch_late);
+ if (err)
+ break;
+ /*
+ * Then, ask the hardware which piece of the image it
+ * needs next. The same piece may be sent more than once.
+ */
+ err = fetch_next_offset(&ss);
+ if (err)
+ break;
+ }
+
+ iounmap(ss.mmio_base);
+
+ return err;
+}
+
+static void stage_microcode(void)
+{
+ unsigned int pkg_id = UINT_MAX;
+ int cpu, err;
+ u64 mmio_pa;
+
+ if (!IS_ALIGNED(get_totalsize(&ucode_patch_late->hdr), sizeof(u32))) {
+ pr_err("Microcode image 32-bit misaligned (0x%x), staging failed.\n",
+ get_totalsize(&ucode_patch_late->hdr));
+ return;
+ }
+
+ lockdep_assert_cpus_held();
+
+ /*
+ * The MMIO address is unique per package, and all the SMT
+ * primary threads are online here. Find each package's MMIO
+ * space by its package ID to avoid duplicate staging.
+ */
+ for_each_cpu(cpu, cpu_primary_thread_mask) {
+ if (topology_logical_package_id(cpu) == pkg_id)
+ continue;
+
+ pkg_id = topology_logical_package_id(cpu);
+
+ err = rdmsrq_on_cpu(cpu, MSR_IA32_MCU_STAGING_MBOX_ADDR, &mmio_pa);
+ if (WARN_ON_ONCE(err))
+ return;
+
+ err = do_stage(mmio_pa);
+ if (err) {
+ pr_err("Error: staging failed (%d) for CPU%d at package %u.\n",
+ err, cpu, pkg_id);
+ return;
+ }
+ }
+
+ pr_info("Staging of patch revision 0x%x succeeded.\n", ucode_patch_late->hdr.rev);
+}
+
+static enum ucode_state __apply_microcode(struct ucode_cpu_info *uci,
+ struct microcode_intel *mc,
+ u32 *cur_rev)
+{
+ u32 rev;
+
+ if (!mc)
+ return UCODE_NFOUND;
+
+ /*
+ * Avoid the MSR write below - which is a particularly expensive
+ * operation - when the other hyperthread has already updated the
+ * microcode.
+ */
+ *cur_rev = intel_get_microcode_revision();
+ if (*cur_rev >= mc->hdr.rev) {
+ uci->cpu_sig.rev = *cur_rev;
+ return UCODE_OK;
+ }
+
+ /* write microcode via MSR 0x79 */
+ native_wrmsrq(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits);
+
+ rev = intel_get_microcode_revision();
+ if (rev != mc->hdr.rev)
+ return UCODE_ERROR;
+
+ uci->cpu_sig.rev = rev;
+ return UCODE_UPDATED;
+}
+
+static enum ucode_state apply_microcode_early(struct ucode_cpu_info *uci)
+{
+ struct microcode_intel *mc = uci->mc;
+ u32 cur_rev;
+
+ return __apply_microcode(uci, mc, &cur_rev);
+}
+
+static __init bool load_builtin_intel_microcode(struct cpio_data *cp)
+{
+ unsigned int eax = 1, ebx, ecx = 0, edx;
+ struct firmware fw;
+ char name[30];
+
+ if (IS_ENABLED(CONFIG_X86_32))
+ return false;
+
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+
+ sprintf(name, "intel-ucode/%02x-%02x-%02x",
+ x86_family(eax), x86_model(eax), x86_stepping(eax));
+
+ if (firmware_request_builtin(&fw, name)) {
+ cp->size = fw.size;
+ cp->data = (void *)fw.data;
+ return true;
+ }
+ return false;
+}
+
+static __init struct microcode_intel *get_microcode_blob(struct ucode_cpu_info *uci, bool save)
+{
+ struct cpio_data cp;
+
+ intel_collect_cpu_info(&uci->cpu_sig);
+
+ if (!load_builtin_intel_microcode(&cp))
+ cp = find_microcode_in_initrd(ucode_path);
+
+ if (!(cp.data && cp.size))
+ return NULL;
+
+ return scan_microcode(cp.data, cp.size, uci, save);
+}
+
+/*
+ * Invoked from an early init call to save the microcode blob which was
+ * selected during early boot when mm was not usable. The microcode must be
+ * saved because the initrd is going away. It's an early init call so the
+ * APs can just use the pointer and do not have to scan the initrd/builtin
+ * firmware again.
+ */
+static int __init save_builtin_microcode(void)
+{
+ struct ucode_cpu_info uci;
+
+ if (xchg(&ucode_patch_va, NULL) != UCODE_BSP_LOADED)
+ return 0;
+
+ if (microcode_loader_disabled() || boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return 0;
+
+ uci.mc = get_microcode_blob(&uci, true);
+ if (uci.mc)
+ save_microcode_patch(uci.mc);
+ return 0;
+}
+early_initcall(save_builtin_microcode);
+
+/* Load microcode on BSP from initrd or builtin blobs */
+void __init load_ucode_intel_bsp(struct early_load_data *ed)
+{
+ struct ucode_cpu_info uci;
+
+ uci.mc = get_microcode_blob(&uci, false);
+ ed->old_rev = uci.cpu_sig.rev;
+
+ if (uci.mc && apply_microcode_early(&uci) == UCODE_UPDATED) {
+ ucode_patch_va = UCODE_BSP_LOADED;
+ ed->new_rev = uci.cpu_sig.rev;
+ }
+}
+
+void load_ucode_intel_ap(void)
+{
+ struct ucode_cpu_info uci;
+
+ uci.mc = ucode_patch_va;
+ if (uci.mc)
+ apply_microcode_early(&uci);
+}
+
+/* Reload microcode on resume */
+void reload_ucode_intel(void)
+{
+ struct ucode_cpu_info uci = { .mc = ucode_patch_va, };
+
+ if (uci.mc)
+ apply_microcode_early(&uci);
+}
+
+static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
+{
+ intel_collect_cpu_info(csig);
+ return 0;
+}
+
+static enum ucode_state apply_microcode_late(int cpu)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ struct microcode_intel *mc = ucode_patch_late;
+ enum ucode_state ret;
+ u32 cur_rev;
+
+ if (WARN_ON_ONCE(smp_processor_id() != cpu))
+ return UCODE_ERROR;
+
+ ret = __apply_microcode(uci, mc, &cur_rev);
+ if (ret != UCODE_UPDATED && ret != UCODE_OK)
+ return ret;
+
+ cpu_data(cpu).microcode = uci->cpu_sig.rev;
+ if (!cpu)
+ boot_cpu_data.microcode = uci->cpu_sig.rev;
+
+ return ret;
+}
+
+static bool ucode_validate_minrev(struct microcode_header_intel *mc_header)
+{
+ int cur_rev = boot_cpu_data.microcode;
+
+ /*
+ * When late-loading, ensure the header declares a minimum revision
+ * required to perform a late-load. The previously reserved field
+ * is 0 in older microcode blobs.
+ */
+ if (!mc_header->min_req_ver) {
+ pr_info("Unsafe microcode update: Microcode header does not specify a required min version\n");
+ return false;
+ }
+
+ /*
+ * Check whether the current revision is greater than or equal to
+ * the minimum revision specified in the header.
+ */
+ if (cur_rev < mc_header->min_req_ver) {
+ pr_info("Unsafe microcode update: Current revision 0x%x too old\n", cur_rev);
+ pr_info("Current should be at 0x%x or higher. Use early loading instead\n", mc_header->min_req_ver);
+ return false;
+ }
+ return true;
+}
+
+static enum ucode_state parse_microcode_blobs(int cpu, struct iov_iter *iter)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ bool is_safe, new_is_safe = false;
+ int cur_rev = uci->cpu_sig.rev;
+ unsigned int curr_mc_size = 0;
+ u8 *new_mc = NULL, *mc = NULL;
+
+ while (iov_iter_count(iter)) {
+ struct microcode_header_intel mc_header;
+ unsigned int mc_size, data_size;
+ u8 *data;
+
+ if (!copy_from_iter_full(&mc_header, sizeof(mc_header), iter)) {
+ pr_err("error! Truncated or inaccessible header in microcode data file\n");
+ goto fail;
+ }
+
+ mc_size = get_totalsize(&mc_header);
+ if (mc_size < sizeof(mc_header)) {
+ pr_err("error! Bad data in microcode data file (totalsize too small)\n");
+ goto fail;
+ }
+ data_size = mc_size - sizeof(mc_header);
+ if (data_size > iov_iter_count(iter)) {
+ pr_err("error! Bad data in microcode data file (truncated file?)\n");
+ goto fail;
+ }
+
+ /* For performance reasons, reuse mc area when possible */
+ if (!mc || mc_size > curr_mc_size) {
+ kvfree(mc);
+ mc = kvmalloc(mc_size, GFP_KERNEL);
+ if (!mc)
+ goto fail;
+ curr_mc_size = mc_size;
+ }
+
+ memcpy(mc, &mc_header, sizeof(mc_header));
+ data = mc + sizeof(mc_header);
+ if (!copy_from_iter_full(data, data_size, iter) ||
+ intel_microcode_sanity_check(mc, true, MC_HEADER_TYPE_MICROCODE) < 0)
+ goto fail;
+
+ if (cur_rev >= mc_header.rev)
+ continue;
+
+ if (!intel_find_matching_signature(mc, &uci->cpu_sig))
+ continue;
+
+ is_safe = ucode_validate_minrev(&mc_header);
+ if (force_minrev && !is_safe)
+ continue;
+
+ kvfree(new_mc);
+ cur_rev = mc_header.rev;
+ new_mc = mc;
+ new_is_safe = is_safe;
+ mc = NULL;
+ }
+
+ if (iov_iter_count(iter))
+ goto fail;
+
+ kvfree(mc);
+ if (!new_mc)
+ return UCODE_NFOUND;
+
+ ucode_patch_late = (struct microcode_intel *)new_mc;
+ return new_is_safe ? UCODE_NEW_SAFE : UCODE_NEW;
+
+fail:
+ kvfree(mc);
+ kvfree(new_mc);
+ return UCODE_ERROR;
+}
+
+static bool is_blacklisted(unsigned int cpu)
+{
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ /*
+ * Late loading on model 79 with microcode revision less than 0x0b000021
+ * and LLC size per core bigger than 2.5MB may result in a system hang.
+ * This behavior is documented in item BDX90, #334165 (Intel Xeon
+ * Processor E7-8800/4800 v4 Product Family).
+ */
+ if (c->x86_vfm == INTEL_BROADWELL_X &&
+ c->x86_stepping == 0x01 &&
+ llc_size_per_core > 2621440 &&
+ c->microcode < 0x0b000021) {
+ pr_err_once("Erratum BDX90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode);
+ pr_err_once("Please consider either early loading through initrd/built-in or a potential BIOS update.\n");
+ return true;
+ }
+
+ return false;
+}
+
+static enum ucode_state request_microcode_fw(int cpu, struct device *device)
+{
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ const struct firmware *firmware;
+ struct iov_iter iter;
+ enum ucode_state ret;
+ struct kvec kvec;
+ char name[30];
+
+ if (is_blacklisted(cpu))
+ return UCODE_NFOUND;
+
+ sprintf(name, "intel-ucode/%02x-%02x-%02x",
+ c->x86, c->x86_model, c->x86_stepping);
+
+ if (request_firmware_direct(&firmware, name, device)) {
+ pr_debug("data file %s load failed\n", name);
+ return UCODE_NFOUND;
+ }
+
+ kvec.iov_base = (void *)firmware->data;
+ kvec.iov_len = firmware->size;
+ iov_iter_kvec(&iter, ITER_SOURCE, &kvec, 1, firmware->size);
+ ret = parse_microcode_blobs(cpu, &iter);
+
+ release_firmware(firmware);
+
+ return ret;
+}
+
+static void finalize_late_load(int result)
+{
+ if (!result)
+ update_ucode_pointer(ucode_patch_late);
+ else
+ kvfree(ucode_patch_late);
+ ucode_patch_late = NULL;
+}
+
+static struct microcode_ops microcode_intel_ops = {
+ .request_microcode_fw = request_microcode_fw,
+ .collect_cpu_info = collect_cpu_info,
+ .apply_microcode = apply_microcode_late,
+ .finalize_late_load = finalize_late_load,
+ .stage_microcode = stage_microcode,
+ .use_nmi = IS_ENABLED(CONFIG_X86_64),
+};
+
+static __init void calc_llc_size_per_core(struct cpuinfo_x86 *c)
+{
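+ /* x86_cache_size is in KB; convert to bytes before dividing per core. */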
+ u64 llc_size = c->x86_cache_size * 1024ULL;
+
+ do_div(llc_size, topology_num_cores_per_package());
+ llc_size_per_core = (unsigned int)llc_size;
+}
+
+static __init bool staging_available(void)
+{
+ u64 val;
+
+ val = x86_read_arch_cap_msr();
+ if (!(val & ARCH_CAP_MCU_ENUM))
+ return false;
+
+ rdmsrq(MSR_IA32_MCU_ENUMERATION, val);
+ return !!(val & MCU_STAGING);
+}
+
+struct microcode_ops * __init init_intel_microcode(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
+ cpu_has(c, X86_FEATURE_IA64)) {
+ pr_err("Intel CPU family 0x%x not supported\n", c->x86);
+ return NULL;
+ }
+
+ if (staging_available()) {
+ microcode_intel_ops.use_staging = true;
+ pr_info("Enabled staging feature.\n");
+ }
+
+ calc_llc_size_per_core(c);
+
+ return &microcode_intel_ops;
+}
diff --git a/arch/x86/kernel/cpu/microcode/internal.h b/arch/x86/kernel/cpu/microcode/internal.h
new file mode 100644
index 000000000000..a10b547eda1e
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/internal.h
@@ -0,0 +1,136 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _X86_MICROCODE_INTERNAL_H
+#define _X86_MICROCODE_INTERNAL_H
+
+#include <linux/earlycpio.h>
+#include <linux/initrd.h>
+
+#include <asm/cpu.h>
+#include <asm/microcode.h>
+
+struct device;
+
+enum ucode_state {
+ UCODE_OK = 0,
+ UCODE_NEW,
+ UCODE_NEW_SAFE,
+ UCODE_UPDATED,
+ UCODE_NFOUND,
+ UCODE_ERROR,
+ UCODE_TIMEOUT,
+ UCODE_OFFLINE,
+};
+
+struct microcode_ops {
+ enum ucode_state (*request_microcode_fw)(int cpu, struct device *dev);
+ void (*microcode_fini_cpu)(int cpu);
+
+ /*
+ * The generic 'microcode_core' part guarantees that the callbacks
+ * below run on a target CPU when they are being called.
+ * See also the "Synchronization" section in microcode_core.c.
+ */
+ enum ucode_state (*apply_microcode)(int cpu);
+ void (*stage_microcode)(void);
+ int (*collect_cpu_info)(int cpu, struct cpu_signature *csig);
+ void (*finalize_late_load)(int result);
+ unsigned int nmi_safe : 1,
+ use_nmi : 1,
+ use_staging : 1;
+};
+
+struct early_load_data {
+ u32 old_rev;
+ u32 new_rev;
+};
+
+extern struct early_load_data early_data;
+extern struct ucode_cpu_info ucode_cpu_info[];
+extern u32 microcode_rev[NR_CPUS];
+extern u32 base_rev;
+
+struct cpio_data find_microcode_in_initrd(const char *path);
+
+#define MAX_UCODE_COUNT 128
+
+#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24))
+#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u')
+#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I')
+#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l')
+#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h')
+#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i')
+#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D')
+
+#define CPUID_IS(a, b, c, ebx, ecx, edx) \
+ (!(((ebx) ^ (a)) | ((edx) ^ (b)) | ((ecx) ^ (c))))
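+
+/*
+ * Example: CPUID_INTEL1/2/3 are "Genu", "ineI" and "ntel" packed
+ * little-endian (CPUID_INTEL1 == 0x756e6547), i.e. the vendor string
+ * that CPUID leaf 0 returns in EBX, EDX and ECX; CPUID_IS() compares
+ * against the registers in that order.
+ */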
+
+/*
+ * In the early microcode loading phase on the BSP, boot_cpu_data is not set
+ * up yet. x86_cpuid_vendor() gets the vendor id for the BSP.
+ *
+ * In the 32-bit AP case, accessing boot_cpu_data needs a linear address. To
+ * simplify the code, we still use x86_cpuid_vendor() to get the vendor id
+ * for the APs.
+ *
+ * x86_cpuid_vendor() gets vendor information directly from CPUID.
+ */
+static inline int x86_cpuid_vendor(void)
+{
+ u32 eax = 0x00000000;
+ u32 ebx, ecx = 0, edx;
+
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+
+ if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx))
+ return X86_VENDOR_INTEL;
+
+ if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx))
+ return X86_VENDOR_AMD;
+
+ return X86_VENDOR_UNKNOWN;
+}
+
+static inline unsigned int x86_cpuid_family(void)
+{
+ u32 eax = 0x00000001;
+ u32 ebx, ecx = 0, edx;
+
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+
+ return x86_family(eax);
+}
+
+extern bool force_minrev;
+
+#ifdef CONFIG_CPU_SUP_AMD
+void load_ucode_amd_bsp(struct early_load_data *ed, unsigned int family);
+void load_ucode_amd_ap(unsigned int family);
+void reload_ucode_amd(unsigned int cpu);
+struct microcode_ops *init_amd_microcode(void);
+void exit_amd_microcode(void);
+#else /* CONFIG_CPU_SUP_AMD */
+static inline void load_ucode_amd_bsp(struct early_load_data *ed, unsigned int family) { }
+static inline void load_ucode_amd_ap(unsigned int family) { }
+static inline void reload_ucode_amd(unsigned int cpu) { }
+static inline struct microcode_ops *init_amd_microcode(void) { return NULL; }
+static inline void exit_amd_microcode(void) { }
+#endif /* !CONFIG_CPU_SUP_AMD */
+
+#ifdef CONFIG_CPU_SUP_INTEL
+void load_ucode_intel_bsp(struct early_load_data *ed);
+void load_ucode_intel_ap(void);
+void reload_ucode_intel(void);
+struct microcode_ops *init_intel_microcode(void);
+#else /* CONFIG_CPU_SUP_INTEL */
+static inline void load_ucode_intel_bsp(struct early_load_data *ed) { }
+static inline void load_ucode_intel_ap(void) { }
+static inline void reload_ucode_intel(void) { }
+static inline struct microcode_ops *init_intel_microcode(void) { return NULL; }
+#endif /* !CONFIG_CPU_SUP_INTEL */
+
+#define ucode_dbg(fmt, ...) \
+({ \
+ if (IS_ENABLED(CONFIG_MICROCODE_DBG)) \
+ pr_info(fmt, ##__VA_ARGS__); \
+})
+
+#endif /* _X86_MICROCODE_INTERNAL_H */
diff --git a/arch/x86/kernel/cpu/mkcapflags.pl b/arch/x86/kernel/cpu/mkcapflags.pl
deleted file mode 100644
index dfea390e1608..000000000000
--- a/arch/x86/kernel/cpu/mkcapflags.pl
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/usr/bin/perl
-#
-# Generate the x86_cap_flags[] array from include/asm-x86/cpufeature.h
-#
-
-($in, $out) = @ARGV;
-
-open(IN, "< $in\0") or die "$0: cannot open: $in: $!\n";
-open(OUT, "> $out\0") or die "$0: cannot create: $out: $!\n";
-
-print OUT "#include <asm/cpufeature.h>\n\n";
-print OUT "const char * const x86_cap_flags[NCAPINTS*32] = {\n";
-
-while (defined($line = <IN>)) {
- if ($line =~ /^\s*\#\s*define\s+(X86_FEATURE_(\S+))\s+(.*)$/) {
- $macro = $1;
- $feature = $2;
- $tail = $3;
- if ($tail =~ /\/\*\s*\"([^"]*)\".*\*\//) {
- $feature = $1;
- }
-
- if ($feature ne '') {
- printf OUT "\t%-32s = \"%s\",\n",
- "[$macro]", "\L$feature";
- }
- }
-}
-print OUT "};\n";
-
-close(IN);
-close(OUT);
diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh
new file mode 100644
index 000000000000..68f537347466
--- /dev/null
+++ b/arch/x86/kernel/cpu/mkcapflags.sh
@@ -0,0 +1,73 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Generate the x86_cap/bug_flags[] arrays from include/asm/cpufeatures.h
+#
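+#
+# Each qualifying '#define X86_FEATURE_<NAME> ... /* "<string>" */' line in
+# the input is turned into an initializer roughly of the form (illustrative):
+#   [X86_FEATURE_<NAME>] = "<string>",
+# Defines whose comment carries no quoted string are skipped.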
+
+set -e
+
+OUT=$1
+
+dump_array()
+{
+ ARRAY=$1
+ SIZE=$2
+ PFX=$3
+ POSTFIX=$4
+ IN=$5
+
+ PFX_SZ=$(echo $PFX | wc -c)
+ TABS="$(printf '\t\t\t\t\t')"
+
+ echo "const char * const $ARRAY[$SIZE] = {"
+
+ # Iterate through any input lines starting with #define $PFX
+ sed -n -e 's/\t/ /g' -e "s/^ *# *define *$PFX//p" $IN |
+ while read i
+ do
+ # Name is everything up to the first whitespace
+ NAME="$(echo "$i" | sed 's/ .*//')"
+
+ # If the /* comment */ starts with a quote string, grab that.
+ VALUE="$(echo "$i" | sed -n 's@.*/\* *\("[^"]*"\).*\*/@\1@p')"
+ [ ! "$VALUE" ] && continue
+
+ # Name is uppercase, VALUE is all lowercase
+ VALUE="$(echo "$VALUE" | tr A-Z a-z)"
+
+ if [ -n "$POSTFIX" ]; then
+ T=$(( $PFX_SZ + $(echo $POSTFIX | wc -c) + 2 ))
+ TABS="$(printf '\t\t\t\t\t\t')"
+ TABCOUNT=$(( ( 6*8 - ($T + 1) - $(echo "$NAME" | wc -c) ) / 8 ))
+ printf "\t[%s - %s]%.*s = %s,\n" "$PFX$NAME" "$POSTFIX" "$TABCOUNT" "$TABS" "$VALUE"
+ else
+ TABCOUNT=$(( ( 5*8 - ($PFX_SZ + 1) - $(echo "$NAME" | wc -c) ) / 8 ))
+ printf "\t[%s]%.*s = %s,\n" "$PFX$NAME" "$TABCOUNT" "$TABS" "$VALUE"
+ fi
+ done
+ echo "};"
+}
+
+trap 'rm "$OUT"' EXIT
+
+(
+ echo "#ifndef _ASM_X86_CPUFEATURES_H"
+ echo "#include <asm/cpufeatures.h>"
+ echo "#endif"
+ echo ""
+
+ dump_array "x86_cap_flags" "NCAPINTS*32" "X86_FEATURE_" "" $2
+ echo ""
+
+ dump_array "x86_bug_flags" "NBUGINTS*32" "X86_BUG_" "NCAPINTS*32" $2
+ echo ""
+
+ echo "#ifdef CONFIG_X86_VMX_FEATURE_NAMES"
+ echo "#ifndef _ASM_X86_VMXFEATURES_H"
+ echo "#include <asm/vmxfeatures.h>"
+ echo "#endif"
+ dump_array "x86_vmx_flags" "NVMXINTS*32" "VMX_FEATURE_" "" $3
+ echo "#endif /* CONFIG_X86_VMX_FEATURE_NAMES */"
+) > $OUT
+
+trap - EXIT
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
new file mode 100644
index 000000000000..579fb2c64cfd
--- /dev/null
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -0,0 +1,771 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Hyper-V detection code.
+ *
+ * Copyright (C) 2010, Novell, Inc.
+ * Author : K. Y. Srinivasan <ksrinivasan@novell.com>
+ */
+
+#include <linux/types.h>
+#include <linux/time.h>
+#include <linux/clocksource.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/hardirq.h>
+#include <linux/efi.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/kexec.h>
+#include <linux/random.h>
+#include <asm/processor.h>
+#include <asm/hypervisor.h>
+#include <hyperv/hvhdk.h>
+#include <asm/mshyperv.h>
+#include <asm/desc.h>
+#include <asm/idtentry.h>
+#include <asm/irq_regs.h>
+#include <asm/i8259.h>
+#include <asm/apic.h>
+#include <asm/timer.h>
+#include <asm/reboot.h>
+#include <asm/msr.h>
+#include <asm/nmi.h>
+#include <clocksource/hyperv_timer.h>
+#include <asm/numa.h>
+#include <asm/svm.h>
+
+/* Is Linux running on a nested Microsoft Hypervisor */
+bool hv_nested;
+struct ms_hyperv_info ms_hyperv;
+
+#if IS_ENABLED(CONFIG_HYPERV)
+/*
+ * When running with the paravisor, controls proxying the synthetic interrupts
+ * from the host
+ */
+static bool hv_para_sint_proxy;
+
+static inline unsigned int hv_get_nested_msr(unsigned int reg)
+{
+ if (hv_is_sint_msr(reg))
+ return reg - HV_X64_MSR_SINT0 + HV_X64_MSR_NESTED_SINT0;
+
+ switch (reg) {
+ case HV_X64_MSR_SIMP:
+ return HV_X64_MSR_NESTED_SIMP;
+ case HV_X64_MSR_SIEFP:
+ return HV_X64_MSR_NESTED_SIEFP;
+ case HV_X64_MSR_SVERSION:
+ return HV_X64_MSR_NESTED_SVERSION;
+ case HV_X64_MSR_SCONTROL:
+ return HV_X64_MSR_NESTED_SCONTROL;
+ case HV_X64_MSR_EOM:
+ return HV_X64_MSR_NESTED_EOM;
+ default:
+ return reg;
+ }
+}
+
+u64 hv_get_non_nested_msr(unsigned int reg)
+{
+ u64 value;
+
+ if (hv_is_synic_msr(reg) && ms_hyperv.paravisor_present)
+ hv_ivm_msr_read(reg, &value);
+ else
+ rdmsrq(reg, value);
+ return value;
+}
+EXPORT_SYMBOL_GPL(hv_get_non_nested_msr);
+
+void hv_set_non_nested_msr(unsigned int reg, u64 value)
+{
+ if (hv_is_synic_msr(reg) && ms_hyperv.paravisor_present) {
+ /* The hypervisor will get the intercept. */
+ hv_ivm_msr_write(reg, value);
+
+ /* Using wrmsrq so the following goes to the paravisor. */
+ if (hv_is_sint_msr(reg)) {
+ union hv_synic_sint sint = { .as_uint64 = value };
+
+ sint.proxy = hv_para_sint_proxy;
+ native_wrmsrq(reg, sint.as_uint64);
+ }
+ } else {
+ native_wrmsrq(reg, value);
+ }
+}
+EXPORT_SYMBOL_GPL(hv_set_non_nested_msr);
+
+/*
+ * Enable or disable proxying synthetic interrupts
+ * to the paravisor.
+ */
+void hv_para_set_sint_proxy(bool enable)
+{
+ hv_para_sint_proxy = enable;
+}
+
+/*
+ * Get the SynIC register value from the paravisor.
+ */
+u64 hv_para_get_synic_register(unsigned int reg)
+{
+ if (WARN_ON(!ms_hyperv.paravisor_present || !hv_is_synic_msr(reg)))
+ return ~0ULL;
+ return native_read_msr(reg);
+}
+
+/*
+ * Set the SynIC register value with the paravisor.
+ */
+void hv_para_set_synic_register(unsigned int reg, u64 val)
+{
+ if (WARN_ON(!ms_hyperv.paravisor_present || !hv_is_synic_msr(reg)))
+ return;
+ native_write_msr(reg, val);
+}
+
+u64 hv_get_msr(unsigned int reg)
+{
+ if (hv_nested)
+ reg = hv_get_nested_msr(reg);
+
+ return hv_get_non_nested_msr(reg);
+}
+EXPORT_SYMBOL_GPL(hv_get_msr);
+
+void hv_set_msr(unsigned int reg, u64 value)
+{
+ if (hv_nested)
+ reg = hv_get_nested_msr(reg);
+
+ hv_set_non_nested_msr(reg, value);
+}
+EXPORT_SYMBOL_GPL(hv_set_msr);
+
+static void (*mshv_handler)(void);
+static void (*vmbus_handler)(void);
+static void (*hv_stimer0_handler)(void);
+static void (*hv_kexec_handler)(void);
+static void (*hv_crash_handler)(struct pt_regs *regs);
+
+DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ inc_irq_stat(irq_hv_callback_count);
+ if (mshv_handler)
+ mshv_handler();
+
+ if (vmbus_handler)
+ vmbus_handler();
+
+ if (ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED)
+ apic_eoi();
+
+ set_irq_regs(old_regs);
+}
+
+void hv_setup_mshv_handler(void (*handler)(void))
+{
+ mshv_handler = handler;
+}
+
+void hv_setup_vmbus_handler(void (*handler)(void))
+{
+ vmbus_handler = handler;
+}
+
+void hv_remove_vmbus_handler(void)
+{
+ /* We have no way to deallocate the interrupt gate */
+ vmbus_handler = NULL;
+}
+
+/*
+ * Routines to do per-architecture handling of stimer0
+ * interrupts when in Direct Mode
+ */
+DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_stimer0)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ inc_irq_stat(hyperv_stimer0_count);
+ if (hv_stimer0_handler)
+ hv_stimer0_handler();
+ add_interrupt_randomness(HYPERV_STIMER0_VECTOR);
+ apic_eoi();
+
+ set_irq_regs(old_regs);
+}
+
+/* For x86/x64, override weak placeholders in hyperv_timer.c */
+void hv_setup_stimer0_handler(void (*handler)(void))
+{
+ hv_stimer0_handler = handler;
+}
+
+void hv_remove_stimer0_handler(void)
+{
+ /* We have no way to deallocate the interrupt gate */
+ hv_stimer0_handler = NULL;
+}
+
+void hv_setup_kexec_handler(void (*handler)(void))
+{
+ hv_kexec_handler = handler;
+}
+
+void hv_remove_kexec_handler(void)
+{
+ hv_kexec_handler = NULL;
+}
+
+void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs))
+{
+ hv_crash_handler = handler;
+}
+
+void hv_remove_crash_handler(void)
+{
+ hv_crash_handler = NULL;
+}
+
+#ifdef CONFIG_KEXEC_CORE
+static void hv_machine_shutdown(void)
+{
+ if (kexec_in_progress && hv_kexec_handler)
+ hv_kexec_handler();
+
+ /*
+ * Call hv_cpu_die() on all the CPUs, otherwise the hypervisor may later
+ * corrupt the old VP Assist Pages and crash the kexec kernel.
+ */
+ if (kexec_in_progress)
+ cpuhp_remove_state(CPUHP_AP_HYPERV_ONLINE);
+
+ /* The function calls stop_other_cpus(). */
+ native_machine_shutdown();
+
+ /* Disable the hypercall page when there is only 1 active CPU. */
+ if (kexec_in_progress)
+ hyperv_cleanup();
+}
+#endif /* CONFIG_KEXEC_CORE */
+
+#ifdef CONFIG_CRASH_DUMP
+static void hv_guest_crash_shutdown(struct pt_regs *regs)
+{
+ if (hv_crash_handler)
+ hv_crash_handler(regs);
+
+ /* The function calls crash_smp_send_stop(). */
+ native_machine_crash_shutdown(regs);
+
+ /* Disable the hypercall page when there is only 1 active CPU. */
+ hyperv_cleanup();
+}
+#endif /* CONFIG_CRASH_DUMP */
+
+static u64 hv_ref_counter_at_suspend;
+static void (*old_save_sched_clock_state)(void);
+static void (*old_restore_sched_clock_state)(void);
+
+/*
+ * The Hyper-V clock counter resets during hibernation. Save and restore the
+ * clock offset during suspend/resume, while also accounting for the time
+ * that passed before suspend. This makes sure that sched_clock, when using
+ * the hv tsc page based clocksource, proceeds from where it left off at
+ * suspend and shows correct timestamps for kernel messages after resume.
+ */
+static void save_hv_clock_tsc_state(void)
+{
+ hv_ref_counter_at_suspend = hv_read_reference_counter();
+}
+
+static void restore_hv_clock_tsc_state(void)
+{
+ /*
+ * Adjust the offsets used by hv tsc clocksource to
+ * account for the time spent before hibernation.
+ * adjusted value = reference counter (time) at suspend
+ * - reference counter (time) now.
+ */
+ hv_adj_sched_clock_offset(hv_ref_counter_at_suspend - hv_read_reference_counter());
+}
+
+/*
+ * Functions to override save_sched_clock_state and restore_sched_clock_state
+ * functions of x86_platform. The Hyper-V clock counter is reset during
+ * suspend-resume and the offset used to measure time needs to be
+ * corrected, post resume.
+ */
+static void hv_save_sched_clock_state(void)
+{
+ old_save_sched_clock_state();
+ save_hv_clock_tsc_state();
+}
+
+static void hv_restore_sched_clock_state(void)
+{
+ restore_hv_clock_tsc_state();
+ old_restore_sched_clock_state();
+}
+
+static void __init x86_setup_ops_for_tsc_pg_clock(void)
+{
+ if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE))
+ return;
+
+ old_save_sched_clock_state = x86_platform.save_sched_clock_state;
+ x86_platform.save_sched_clock_state = hv_save_sched_clock_state;
+
+ old_restore_sched_clock_state = x86_platform.restore_sched_clock_state;
+ x86_platform.restore_sched_clock_state = hv_restore_sched_clock_state;
+}
+
+#ifdef CONFIG_X86_64
+DEFINE_STATIC_CALL(hv_hypercall, hv_std_hypercall);
+EXPORT_STATIC_CALL_TRAMP_GPL(hv_hypercall);
+#define hypercall_update(hc) static_call_update(hv_hypercall, hc)
+#endif
+#endif /* CONFIG_HYPERV */
+
+#ifndef hypercall_update
+#define hypercall_update(hc) (void)hc
+#endif
+
+static uint32_t __init ms_hyperv_platform(void)
+{
+ u32 eax;
+ u32 hyp_signature[3];
+
+ if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return 0;
+
+ cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS,
+ &eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]);
+
+ if (eax < HYPERV_CPUID_MIN || eax > HYPERV_CPUID_MAX ||
+ memcmp("Microsoft Hv", hyp_signature, 12))
+ return 0;
+
+ /* HYPERCALL and VP_INDEX MSRs are mandatory for all features. */
+ eax = cpuid_eax(HYPERV_CPUID_FEATURES);
+ if (!(eax & HV_MSR_HYPERCALL_AVAILABLE)) {
+ pr_warn("x86/hyperv: HYPERCALL MSR not available.\n");
+ return 0;
+ }
+ if (!(eax & HV_MSR_VP_INDEX_AVAILABLE)) {
+ pr_warn("x86/hyperv: VP_INDEX MSR not available.\n");
+ return 0;
+ }
+
+ return HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS;
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+/*
+ * Prior to WS2016, Debug-VM sends NMIs to all CPUs, which makes it
+ * difficult to process CHANNELMSG_UNLOAD in case of a crash. Handle the
+ * unknown NMI on the first CPU which gets it.
+ */
+static int hv_nmi_unknown(unsigned int val, struct pt_regs *regs)
+{
+ static atomic_t nmi_cpu = ATOMIC_INIT(-1);
+ unsigned int old_cpu, this_cpu;
+
+ if (!unknown_nmi_panic)
+ return NMI_DONE;
+
+ old_cpu = -1;
+ this_cpu = raw_smp_processor_id();
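+
+ /*
+ * Only the first CPU to claim nmi_cpu falls through and returns
+ * NMI_DONE, so the unknown-NMI handling (and panic) runs on exactly
+ * one CPU; every other CPU swallows its copy of the NMI.
+ */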
+ if (!atomic_try_cmpxchg(&nmi_cpu, &old_cpu, this_cpu))
+ return NMI_HANDLED;
+
+ return NMI_DONE;
+}
+#endif
+
+static unsigned long hv_get_tsc_khz(void)
+{
+ unsigned long freq;
+
+ rdmsrq(HV_X64_MSR_TSC_FREQUENCY, freq);
+
+ return freq / 1000;
+}
+
+#if defined(CONFIG_SMP) && IS_ENABLED(CONFIG_HYPERV)
+static void __init hv_smp_prepare_boot_cpu(void)
+{
+ native_smp_prepare_boot_cpu();
+#if defined(CONFIG_X86_64) && defined(CONFIG_PARAVIRT_SPINLOCKS)
+ hv_init_spinlocks();
+#endif
+}
+
+static void __init hv_smp_prepare_cpus(unsigned int max_cpus)
+{
+#ifdef CONFIG_X86_64
+ int i;
+ int ret;
+#endif
+
+ native_smp_prepare_cpus(max_cpus);
+
+ /*
+ * Override wakeup_secondary_cpu_64 callback for SEV-SNP
+ * enlightened guest.
+ */
+ if (!ms_hyperv.paravisor_present && hv_isolation_type_snp()) {
+ apic->wakeup_secondary_cpu_64 = hv_snp_boot_ap;
+ return;
+ }
+
+#ifdef CONFIG_X86_64
+ for_each_present_cpu(i) {
+ if (i == 0)
+ continue;
+ ret = hv_call_add_logical_proc(numa_cpu_node(i), i, cpu_physical_id(i));
+ BUG_ON(ret);
+ }
+
+ for_each_present_cpu(i) {
+ if (i == 0)
+ continue;
+ ret = hv_call_create_vp(numa_cpu_node(i), hv_current_partition_id, i, i);
+ BUG_ON(ret);
+ }
+#endif
+}
+#endif
+
+/*
+ * When a fully enlightened TDX VM runs on Hyper-V, the firmware sets the
+ * HW_REDUCED flag: refer to acpi_tb_create_local_fadt(). Consequently ttyS0
+ * interrupts can't work because request_irq() -> ... -> irq_to_desc() returns
+ * NULL for ttyS0. This happens because mp_config_acpi_legacy_irqs() sees a
+ * nr_legacy_irqs() of 0, so it doesn't initialize the array 'mp_irqs[]', and
+ * later setup_IO_APIC_irqs() -> find_irq_entry() fails to find the legacy irqs
+ * from the array and hence doesn't create the necessary irq description info.
+ *
+ * Clone arch/x86/kernel/acpi/boot.c: acpi_generic_reduced_hw_init() here,
+ * except don't change 'legacy_pic', which keeps its default value
+ * 'default_legacy_pic'. This way, mp_config_acpi_legacy_irqs() sees a non-zero
+ * nr_legacy_irqs() and serial console interrupts eventually work properly.
+ */
+static void __init reduced_hw_init(void)
+{
+ x86_init.timers.timer_init = x86_init_noop;
+ x86_init.irqs.pre_vector_init = x86_init_noop;
+}
+
+int hv_get_hypervisor_version(union hv_hypervisor_version_info *info)
+{
+ unsigned int hv_max_functions;
+
+ hv_max_functions = cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS);
+ if (hv_max_functions < HYPERV_CPUID_VERSION) {
+ pr_err("%s: Could not detect Hyper-V version\n", __func__);
+ return -ENODEV;
+ }
+
+ cpuid(HYPERV_CPUID_VERSION, &info->eax, &info->ebx, &info->ecx, &info->edx);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(hv_get_hypervisor_version);
+
+static void __init ms_hyperv_init_platform(void)
+{
+ int hv_max_functions_eax, eax;
+
+#ifdef CONFIG_PARAVIRT
+ pv_info.name = "Hyper-V";
+#endif
+
+ /*
+ * Extract the features and hints
+ */
+ ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
+ ms_hyperv.priv_high = cpuid_ebx(HYPERV_CPUID_FEATURES);
+ ms_hyperv.ext_features = cpuid_ecx(HYPERV_CPUID_FEATURES);
+ ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
+ ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
+
+ hv_max_functions_eax = cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS);
+
+ pr_info("Hyper-V: privilege flags low %#x, high %#x, ext %#x, hints %#x, misc %#x\n",
+ ms_hyperv.features, ms_hyperv.priv_high,
+ ms_hyperv.ext_features, ms_hyperv.hints,
+ ms_hyperv.misc_features);
+
+ ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS);
+ ms_hyperv.max_lp_index = cpuid_ebx(HYPERV_CPUID_IMPLEMENT_LIMITS);
+
+ pr_debug("Hyper-V: max %u virtual processors, %u logical processors\n",
+ ms_hyperv.max_vp_index, ms_hyperv.max_lp_index);
+
+ hv_identify_partition_type();
+
+ if (cc_platform_has(CC_ATTR_SNP_SECURE_AVIC))
+ ms_hyperv.hints |= HV_DEPRECATING_AEOI_RECOMMENDED;
+
+ if (ms_hyperv.hints & HV_X64_HYPERV_NESTED) {
+ hv_nested = true;
+ pr_info("Hyper-V: running on a nested hypervisor\n");
+ }
+
+ /*
+ * There is no check against the max function for HYPERV_CPUID_VIRT_STACK_* CPUID
+ * leaves as the hypervisor doesn't handle them. Even a nested root partition (L2
+ * root) will not get them because the nested (L1) hypervisor filters them out.
+ * These are handled through intercept processing by the Windows Hyper-V stack
+ * or the paravisor.
+ */
+ eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_PROPERTIES);
+ ms_hyperv.confidential_vmbus_available =
+ eax & HYPERV_VS_PROPERTIES_EAX_CONFIDENTIAL_VMBUS_AVAILABLE;
+ ms_hyperv.msi_ext_dest_id =
+ eax & HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE;
+
+ if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS &&
+ ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
+ x86_platform.calibrate_tsc = hv_get_tsc_khz;
+ x86_platform.calibrate_cpu = hv_get_tsc_khz;
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+ }
+
+ if (ms_hyperv.priv_high & HV_ISOLATION) {
+ ms_hyperv.isolation_config_a = cpuid_eax(HYPERV_CPUID_ISOLATION_CONFIG);
+ ms_hyperv.isolation_config_b = cpuid_ebx(HYPERV_CPUID_ISOLATION_CONFIG);
+
+ if (ms_hyperv.shared_gpa_boundary_active)
+ ms_hyperv.shared_gpa_boundary =
+ BIT_ULL(ms_hyperv.shared_gpa_boundary_bits);
+
+ pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n",
+ ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b);
+
+
+ if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP) {
+ static_branch_enable(&isolation_type_snp);
+ if (!ms_hyperv.paravisor_present)
+ hypercall_update(hv_snp_hypercall);
+ } else if (hv_get_isolation_type() == HV_ISOLATION_TYPE_TDX) {
+ static_branch_enable(&isolation_type_tdx);
+
+ /* A TDX VM must use x2APIC and doesn't use lazy EOI. */
+ ms_hyperv.hints &= ~HV_X64_APIC_ACCESS_RECOMMENDED;
+
+ if (!ms_hyperv.paravisor_present) {
+ hypercall_update(hv_tdx_hypercall);
+ /*
+ * Mark the Hyper-V TSC page feature as disabled
+ * in a TDX VM without paravisor so that the
+ * Invariant TSC, which is a better clocksource
+ * anyway, is used instead.
+ */
+ ms_hyperv.features &= ~HV_MSR_REFERENCE_TSC_AVAILABLE;
+
+ /*
+ * The Invariant TSC is expected to be available
+ * in a TDX VM without paravisor, but if not,
+ * print a warning message. The slower Hyper-V MSR-based
+ * Ref Counter should end up being the clocksource.
+ */
+ if (!(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT))
+ pr_warn("Hyper-V: Invariant TSC is unavailable\n");
+
+ /* HV_MSR_CRASH_CTL is unsupported. */
+ ms_hyperv.misc_features &= ~HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
+
+ /* Don't trust Hyper-V's TLB-flushing hypercalls. */
+ ms_hyperv.hints &= ~HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED;
+
+ x86_init.acpi.reduced_hw_early_init = reduced_hw_init;
+ }
+ }
+ }
+
+ if (hv_max_functions_eax >= HYPERV_CPUID_NESTED_FEATURES) {
+ ms_hyperv.nested_features =
+ cpuid_eax(HYPERV_CPUID_NESTED_FEATURES);
+ pr_info("Hyper-V: Nested features: 0x%x\n",
+ ms_hyperv.nested_features);
+ }
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS &&
+ ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
+ /*
+ * Get the APIC frequency.
+ */
+ u64 hv_lapic_frequency;
+
+ rdmsrq(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency);
+ hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ);
+ lapic_timer_period = hv_lapic_frequency;
+ pr_info("Hyper-V: LAPIC Timer Frequency: %#x\n",
+ lapic_timer_period);
+ }
+
+ register_nmi_handler(NMI_UNKNOWN, hv_nmi_unknown, NMI_FLAG_FIRST,
+ "hv_nmi_unknown");
+#endif
+
+#ifdef CONFIG_X86_IO_APIC
+ no_timer_check = 1;
+#endif
+
+#if IS_ENABLED(CONFIG_HYPERV)
+ if (hv_root_partition())
+ machine_ops.power_off = hv_machine_power_off;
+#if defined(CONFIG_KEXEC_CORE)
+ machine_ops.shutdown = hv_machine_shutdown;
+#endif
+#if defined(CONFIG_CRASH_DUMP)
+ if (!hv_root_partition())
+ machine_ops.crash_shutdown = hv_guest_crash_shutdown;
+#endif
+#endif
+ /*
+	 * HV_ACCESS_TSC_INVARIANT is always zero for the root partition. The
+	 * root partition doesn't need to write to the synthetic MSR to enable
+	 * the invariant TSC feature; it sees what the hardware provides.
+ */
+ if (ms_hyperv.features & HV_ACCESS_TSC_INVARIANT) {
+ /*
+ * Writing to synthetic MSR 0x40000118 updates/changes the
+ * guest visible CPUIDs. Setting bit 0 of this MSR enables
+		 * guests to report the invariant TSC feature through the CPUID
+		 * instruction, CPUID 0x80000007/EDX, bit 8. See code in
+ * early_init_intel() where this bit is examined. The
+ * setting of this MSR bit should happen before init_intel()
+ * is called.
+ */
+ wrmsrq(HV_X64_MSR_TSC_INVARIANT_CONTROL, HV_EXPOSE_INVARIANT_TSC);
+ setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+ }
+
+ /*
+	 * Generation 2 instances don't support reading the NMI status
+	 * from port 0x61.
+ */
+ if (efi_enabled(EFI_BOOT))
+ x86_platform.get_nmi_reason = hv_get_nmi_reason;
+
+#if IS_ENABLED(CONFIG_HYPERV)
+ if ((hv_get_isolation_type() == HV_ISOLATION_TYPE_VBS) ||
+ ms_hyperv.paravisor_present)
+ hv_vtom_init();
+ /*
+	 * Set up the hook to get control after APIC initialization.
+ */
+ x86_platform.apic_post_init = hyperv_init;
+ hyperv_setup_mmu_ops();
+
+ /* Install system interrupt handler for hypervisor callback */
+ sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_hyperv_callback);
+
+ /* Install system interrupt handler for reenlightenment notifications */
+ if (ms_hyperv.features & HV_ACCESS_REENLIGHTENMENT) {
+ sysvec_install(HYPERV_REENLIGHTENMENT_VECTOR, sysvec_hyperv_reenlightenment);
+ }
+
+ /* Install system interrupt handler for stimer0 */
+ if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE) {
+ sysvec_install(HYPERV_STIMER0_VECTOR, sysvec_hyperv_stimer0);
+ }
+
+# ifdef CONFIG_SMP
+ smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu;
+ if (hv_root_partition() ||
+ (!ms_hyperv.paravisor_present && hv_isolation_type_snp()))
+ smp_ops.smp_prepare_cpus = hv_smp_prepare_cpus;
+# endif
+
+ /*
+	 * Hyper-V doesn't provide irq remapping for the IO-APIC. To enable x2apic,
+	 * set the x2apic destination mode to physical mode when x2apic is available,
+	 * and the Hyper-V IOMMU driver makes sure that CPUs assigned IO-APIC irqs
+	 * have an 8-bit APIC ID.
+ */
+# ifdef CONFIG_X86_X2APIC
+ if (x2apic_supported())
+ x2apic_phys = 1;
+# endif
+
+ /* Register Hyper-V specific clocksource */
+ hv_init_clocksource();
+ x86_setup_ops_for_tsc_pg_clock();
+ hv_vtl_init_platform();
+#endif
+ /*
+ * TSC should be marked as unstable only after Hyper-V
+ * clocksource has been initialized. This ensures that the
+ * stability of the sched_clock is not altered.
+ *
+ * HV_ACCESS_TSC_INVARIANT is always zero for the root partition. No
+ * need to check for it.
+ */
+ if (!hv_root_partition() &&
+ !(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT))
+ mark_tsc_unstable("running on Hyper-V");
+
+ hardlockup_detector_disable();
+}
+
+static bool __init ms_hyperv_x2apic_available(void)
+{
+ return x2apic_supported();
+}
+
+/*
+ * If ms_hyperv_msi_ext_dest_id() returns true, hyperv_prepare_irq_remapping()
+ * returns -ENODEV and the Hyper-V IOMMU driver is not used; instead, the
+ * generic support of the 15-bit APIC ID is used: see __irq_msi_compose_msg().
+ *
+ * Note: for a VM on Hyper-V, the I/O-APIC is the only device which
+ * (logically) generates MSIs directly to the system APIC irq domain.
+ * There is no HPET, and PCI MSI/MSI-X interrupts are remapped by the
+ * pci-hyperv host bridge.
+ *
+ * Note: for a Hyper-V root partition, this will always return false.
+ */
+static bool __init ms_hyperv_msi_ext_dest_id(void)
+{
+ return ms_hyperv.msi_ext_dest_id;
+}
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+static void hv_sev_es_hcall_prepare(struct ghcb *ghcb, struct pt_regs *regs)
+{
+ /* RAX and CPL are already in the GHCB */
+ ghcb_set_rcx(ghcb, regs->cx);
+ ghcb_set_rdx(ghcb, regs->dx);
+ ghcb_set_r8(ghcb, regs->r8);
+}
+
+static bool hv_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs)
+{
+ /* No checking of the return state needed */
+ return true;
+}
+#endif
+
+const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
+ .name = "Microsoft Hyper-V",
+ .detect = ms_hyperv_platform,
+ .type = X86_HYPER_MS_HYPERV,
+ .init.x2apic_available = ms_hyperv_x2apic_available,
+ .init.msi_ext_dest_id = ms_hyperv_msi_ext_dest_id,
+ .init.init_platform = ms_hyperv_init_platform,
+ .init.guest_late_init = ms_hyperv_late_init,
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ .runtime.sev_es_hcall_prepare = hv_sev_es_hcall_prepare,
+ .runtime.sev_es_hcall_finish = hv_sev_es_hcall_finish,
+#endif
+};
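
A minimal sketch (not part of the patch) of what the TSC_INVARIANT_CONTROL comment above describes: once HV_EXPOSE_INVARIANT_TSC has been written, the guest sees the invariant-TSC bit in CPUID leaf 0x80000007, EDX bit 8 — the same bit early_init_intel() later examines. The kernel helpers cpuid_edx() and BIT() are assumed here.

/* Illustration only: check the CPUID bit that the MSR write exposes. */
static inline bool hv_invariant_tsc_visible_sketch(void)
{
	/* CPUID 0x80000007, EDX bit 8 == Invariant TSC */
	return !!(cpuid_edx(0x80000007) & BIT(8));
}
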
diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile
index f4361b56f8e9..aee4bc5ad496 100644
--- a/arch/x86/kernel/cpu/mtrr/Makefile
+++ b/arch/x86/kernel/cpu/mtrr/Makefile
@@ -1,3 +1,4 @@
-obj-y := main.o if.o generic.o state.o cleanup.o
-obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
+# SPDX-License-Identifier: GPL-2.0-only
+obj-y := mtrr.o if.o generic.o cleanup.o
+obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o legacy.o
diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c
index ee2331b0e58f..ef3e8e42b782 100644
--- a/arch/x86/kernel/cpu/mtrr/amd.c
+++ b/arch/x86/kernel/cpu/mtrr/amd.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/init.h>
#include <linux/mm.h>
#include <asm/mtrr.h>
@@ -7,15 +8,15 @@
static void
amd_get_mtrr(unsigned int reg, unsigned long *base,
- unsigned long *size, mtrr_type * type)
+ unsigned long *size, mtrr_type *type)
{
unsigned long low, high;
rdmsr(MSR_K6_UWCCR, low, high);
- /* Upper dword is region 1, lower is region 0 */
+ /* Upper dword is region 1, lower is region 0 */
if (reg == 1)
low = high;
- /* The base masks off on the right alignment */
+ /* The base masks off on the right alignment */
*base = (low & 0xFFFE0000) >> PAGE_SHIFT;
*type = 0;
if (low & 1)
@@ -27,93 +28,92 @@ amd_get_mtrr(unsigned int reg, unsigned long *base,
return;
}
/*
- * This needs a little explaining. The size is stored as an
- * inverted mask of bits of 128K granularity 15 bits long offset
- * 2 bits
+ * This needs a little explaining. The size is stored as an
+ * inverted mask of bits, of 128K granularity, 15 bits long, offset by
+ * 2 bits.
*
- * So to get a size we do invert the mask and add 1 to the lowest
- * mask bit (4 as its 2 bits in). This gives us a size we then shift
- * to turn into 128K blocks
+ * So to get a size we invert the mask and add 1 to the lowest
+ * mask bit (4, as it's 2 bits in). This gives us a size we then shift
+ * to turn into 128K blocks.
*
- * eg 111 1111 1111 1100 is 512K
+ * eg 111 1111 1111 1100 is 512K
*
- * invert 000 0000 0000 0011
- * +1 000 0000 0000 0100
- * *128K ...
+ * invert 000 0000 0000 0011
+ * +1 000 0000 0000 0100
+ * *128K ...
*/
low = (~low) & 0x1FFFC;
*size = (low + 4) << (15 - PAGE_SHIFT);
- return;
}
-static void amd_set_mtrr(unsigned int reg, unsigned long base,
- unsigned long size, mtrr_type type)
-/* [SUMMARY] Set variable MTRR register on the local CPU.
- <reg> The register to set.
- <base> The base address of the region.
- <size> The size of the region. If this is 0 the region is disabled.
- <type> The type of the region.
- [RETURNS] Nothing.
-*/
+/**
+ * amd_set_mtrr - Set variable MTRR register on the local CPU.
+ *
+ * @reg:  The register to set.
+ * @base: The base address of the region.
+ * @size: The size of the region. If this is 0 the region is disabled.
+ * @type: The type of the region.
+ *
+ * Returns nothing.
+ */
+static void
+amd_set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type)
{
u32 regs[2];
/*
- * Low is MTRR0 , High MTRR 1
+ * Low is MTRR0, High MTRR 1
*/
rdmsr(MSR_K6_UWCCR, regs[0], regs[1]);
/*
- * Blank to disable
+ * Blank to disable
*/
- if (size == 0)
+ if (size == 0) {
regs[reg] = 0;
- else
- /* Set the register to the base, the type (off by one) and an
- inverted bitmask of the size The size is the only odd
- bit. We are fed say 512K We invert this and we get 111 1111
- 1111 1011 but if you subtract one and invert you get the
- desired 111 1111 1111 1100 mask
-
- But ~(x - 1) == ~x + 1 == -x. Two's complement rocks! */
+ } else {
+ /*
+ * Set the register to the base, the type (off by one) and an
+		 * inverted bitmask of the size. The size is the only odd
+		 * bit. We are fed, say, 512K. We invert this and we get 111 1111
+		 * 1111 1011, but if you subtract one and invert you get the
+		 * desired 111 1111 1111 1100 mask.
+ *
+ * But ~(x - 1) == ~x + 1 == -x. Two's complement rocks!
+ */
regs[reg] = (-size >> (15 - PAGE_SHIFT) & 0x0001FFFC)
| (base << PAGE_SHIFT) | (type + 1);
+ }
/*
- * The writeback rule is quite specific. See the manual. Its
- * disable local interrupts, write back the cache, set the mtrr
+	 * The writeback rule is quite specific. See the manual. It's:
+	 * disable local interrupts, write back the cache, set the mtrr.
*/
wbinvd();
wrmsr(MSR_K6_UWCCR, regs[0], regs[1]);
}
-static int amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type)
+static int
+amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type)
{
- /* Apply the K6 block alignment and size rules
- In order
- o Uncached or gathering only
- o 128K or bigger block
- o Power of 2 block
- o base suitably aligned to the power
- */
+ /*
+ * Apply the K6 block alignment and size rules
+ * In order
+ * o Uncached or gathering only
+ * o 128K or bigger block
+ * o Power of 2 block
+ * o base suitably aligned to the power
+ */
if (type > MTRR_TYPE_WRCOMB || size < (1 << (17 - PAGE_SHIFT))
|| (size & ~(size - 1)) - size || (base & (size - 1)))
return -EINVAL;
return 0;
}
-static struct mtrr_ops amd_mtrr_ops = {
- .vendor = X86_VENDOR_AMD,
+const struct mtrr_ops amd_mtrr_ops = {
+ .var_regs = 2,
.set = amd_set_mtrr,
.get = amd_get_mtrr,
.get_free_region = generic_get_free_region,
.validate_add_page = amd_validate_add_page,
.have_wrcomb = positive_have_wrcomb,
};
-
-int __init amd_init_mtrr(void)
-{
- set_mtrr_ops(&amd_mtrr_ops);
- return 0;
-}
-
-//arch_initcall(amd_mtrr_init);
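
To make the inverted-mask arithmetic above concrete, here is a standalone, hypothetical user-space sketch (not part of the patch) that round-trips a 512K region through the same encode/decode expressions used by amd_set_mtrr() and amd_get_mtrr(); it assumes PAGE_SHIFT is 12.

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12

int main(void)
{
	uint32_t size_pages = (512 * 1024) >> PAGE_SHIFT;	/* 512K region */

	/* Encode, as amd_set_mtrr() does: -size is the two's-complement trick. */
	uint32_t field = (-size_pages >> (15 - PAGE_SHIFT)) & 0x0001FFFC;

	/* Decode, as amd_get_mtrr() does: invert, keep bits 2..16, add 4. */
	uint32_t pages = ((~field & 0x1FFFC) + 4) << (15 - PAGE_SHIFT);

	printf("mask field: 0x%05x\n", field);				/* 0x1fff0 */
	printf("decoded size: %uK\n",
	       (unsigned)((pages << PAGE_SHIFT) >> 10));		/* 512 */
	return 0;
}
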
diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c
index cb9aa3a7a7ab..6f6c3ae92943 100644
--- a/arch/x86/kernel/cpu/mtrr/centaur.c
+++ b/arch/x86/kernel/cpu/mtrr/centaur.c
@@ -1,7 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/init.h>
#include <linux/mm.h>
+
#include <asm/mtrr.h>
#include <asm/msr.h>
+
#include "mtrr.h"
static struct {
@@ -12,25 +15,25 @@ static struct {
static u8 centaur_mcr_reserved;
static u8 centaur_mcr_type; /* 0 for winchip, 1 for winchip2 */
-/*
- * Report boot time MCR setups
+/**
+ * centaur_get_free_region - Get a free MTRR.
+ *
+ * @base: The starting (base) address of the region.
+ * @size: The size (in bytes) of the region.
+ *
+ * Returns: the index of the region on success, else -1 on error.
*/
-
static int
centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg)
-/* [SUMMARY] Get a free MTRR.
- <base> The starting (base) address of the region.
- <size> The size (in bytes) of the region.
- [RETURNS] The index of the region on success, else -1 on error.
-*/
{
- int i, max;
- mtrr_type ltype;
unsigned long lbase, lsize;
+ mtrr_type ltype;
+ int i, max;
max = num_var_ranges;
if (replace_reg >= 0 && replace_reg < max)
return replace_reg;
+
for (i = 0; i < max; ++i) {
if (centaur_mcr_reserved & (1 << i))
continue;
@@ -38,14 +41,8 @@ centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg)
if (lsize == 0)
return i;
}
- return -ENOSPC;
-}
-void
-mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
-{
- centaur_mcr[mcr].low = lo;
- centaur_mcr[mcr].high = hi;
+ return -ENOSPC;
}
static void
@@ -54,33 +51,35 @@ centaur_get_mcr(unsigned int reg, unsigned long *base,
{
*base = centaur_mcr[reg].high >> PAGE_SHIFT;
*size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT;
- *type = MTRR_TYPE_WRCOMB; /* If it is there, it is write-combining */
+ *type = MTRR_TYPE_WRCOMB; /* write-combining */
+
if (centaur_mcr_type == 1 && ((centaur_mcr[reg].low & 31) & 2))
*type = MTRR_TYPE_UNCACHABLE;
if (centaur_mcr_type == 1 && (centaur_mcr[reg].low & 31) == 25)
*type = MTRR_TYPE_WRBACK;
if (centaur_mcr_type == 0 && (centaur_mcr[reg].low & 31) == 31)
*type = MTRR_TYPE_WRBACK;
-
}
-static void centaur_set_mcr(unsigned int reg, unsigned long base,
- unsigned long size, mtrr_type type)
+static void
+centaur_set_mcr(unsigned int reg, unsigned long base,
+ unsigned long size, mtrr_type type)
{
unsigned long low, high;
if (size == 0) {
- /* Disable */
+ /* Disable */
high = low = 0;
} else {
high = base << PAGE_SHIFT;
- if (centaur_mcr_type == 0)
- low = -size << PAGE_SHIFT | 0x1f; /* only support write-combining... */
- else {
+ if (centaur_mcr_type == 0) {
+ /* Only support write-combining... */
+ low = -size << PAGE_SHIFT | 0x1f;
+ } else {
if (type == MTRR_TYPE_UNCACHABLE)
- low = -size << PAGE_SHIFT | 0x02; /* NC */
+ low = -size << PAGE_SHIFT | 0x02; /* NC */
else
- low = -size << PAGE_SHIFT | 0x09; /* WWO,WC */
+ low = -size << PAGE_SHIFT | 0x09; /* WWO, WC */
}
}
centaur_mcr[reg].high = high;
@@ -88,137 +87,26 @@ static void centaur_set_mcr(unsigned int reg, unsigned long base,
wrmsr(MSR_IDT_MCR0 + reg, low, high);
}
-#if 0
-/*
- * Initialise the later (saner) Winchip MCR variant. In this version
- * the BIOS can pass us the registers it has used (but not their values)
- * and the control register is read/write
- */
-
-static void __init
-centaur_mcr1_init(void)
-{
- unsigned i;
- u32 lo, hi;
-
- /* Unfortunately, MCR's are read-only, so there is no way to
- * find out what the bios might have done.
- */
-
- rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
- if (((lo >> 17) & 7) == 1) { /* Type 1 Winchip2 MCR */
- lo &= ~0x1C0; /* clear key */
- lo |= 0x040; /* set key to 1 */
- wrmsr(MSR_IDT_MCR_CTRL, lo, hi); /* unlock MCR */
- }
-
- centaur_mcr_type = 1;
-
- /*
- * Clear any unconfigured MCR's.
- */
-
- for (i = 0; i < 8; ++i) {
- if (centaur_mcr[i].high == 0 && centaur_mcr[i].low == 0) {
- if (!(lo & (1 << (9 + i))))
- wrmsr(MSR_IDT_MCR0 + i, 0, 0);
- else
- /*
- * If the BIOS set up an MCR we cannot see it
- * but we don't wish to obliterate it
- */
- centaur_mcr_reserved |= (1 << i);
- }
- }
- /*
- * Throw the main write-combining switch...
- * However if OOSTORE is enabled then people have already done far
- * cleverer things and we should behave.
- */
-
- lo |= 15; /* Write combine enables */
- wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
-}
-
-/*
- * Initialise the original winchip with read only MCR registers
- * no used bitmask for the BIOS to pass on and write only control
- */
-
-static void __init
-centaur_mcr0_init(void)
-{
- unsigned i;
-
- /* Unfortunately, MCR's are read-only, so there is no way to
- * find out what the bios might have done.
- */
-
- /* Clear any unconfigured MCR's.
- * This way we are sure that the centaur_mcr array contains the actual
- * values. The disadvantage is that any BIOS tweaks are thus undone.
- *
- */
- for (i = 0; i < 8; ++i) {
- if (centaur_mcr[i].high == 0 && centaur_mcr[i].low == 0)
- wrmsr(MSR_IDT_MCR0 + i, 0, 0);
- }
-
- wrmsr(MSR_IDT_MCR_CTRL, 0x01F0001F, 0); /* Write only */
-}
-
-/*
- * Initialise Winchip series MCR registers
- */
-
-static void __init
-centaur_mcr_init(void)
-{
- struct set_mtrr_context ctxt;
-
- set_mtrr_prepare_save(&ctxt);
- set_mtrr_cache_disable(&ctxt);
-
- if (boot_cpu_data.x86_model == 4)
- centaur_mcr0_init();
- else if (boot_cpu_data.x86_model == 8 || boot_cpu_data.x86_model == 9)
- centaur_mcr1_init();
-
- set_mtrr_done(&ctxt);
-}
-#endif
-
-static int centaur_validate_add_page(unsigned long base,
- unsigned long size, unsigned int type)
+static int
+centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int type)
{
/*
- * FIXME: Winchip2 supports uncached
+ * FIXME: Winchip2 supports uncached
*/
- if (type != MTRR_TYPE_WRCOMB &&
+ if (type != MTRR_TYPE_WRCOMB &&
(centaur_mcr_type == 0 || type != MTRR_TYPE_UNCACHABLE)) {
- printk(KERN_WARNING
- "mtrr: only write-combining%s supported\n",
- centaur_mcr_type ? " and uncacheable are"
- : " is");
+ pr_warn("mtrr: only write-combining%s supported\n",
+ centaur_mcr_type ? " and uncacheable are" : " is");
return -EINVAL;
}
return 0;
}
-static struct mtrr_ops centaur_mtrr_ops = {
- .vendor = X86_VENDOR_CENTAUR,
-// .init = centaur_mcr_init,
+const struct mtrr_ops centaur_mtrr_ops = {
+ .var_regs = 8,
.set = centaur_set_mcr,
.get = centaur_get_mcr,
.get_free_region = centaur_get_free_region,
.validate_add_page = centaur_validate_add_page,
.have_wrcomb = positive_have_wrcomb,
};
-
-int __init centaur_init_mtrr(void)
-{
- set_mtrr_ops(&centaur_mtrr_ops);
- return 0;
-}
-
-//arch_initcall(centaur_init_mtrr);
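
The MCR packing used by centaur_set_mcr() and centaur_get_mcr() above can be seen in a tiny standalone sketch (illustration only, not part of the patch; PAGE_SHIFT assumed to be 12): the high word carries the base, and the low word carries the negated size, which doubles as an address mask, OR'ed with the type bits.

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12

int main(void)
{
	uint32_t base_pages = 0xF0000000u >> PAGE_SHIFT;	/* base 0xf0000000 */
	uint32_t size_pages = 0x00100000u >> PAGE_SHIFT;	/* 1M region */

	uint32_t high = base_pages << PAGE_SHIFT;
	uint32_t low  = (-size_pages << PAGE_SHIFT) | 0x1f;	/* WC, MCR type 0 */

	/* Decode exactly as centaur_get_mcr() does: */
	printf("base pfn 0x%x, size pfn 0x%x\n",
	       high >> PAGE_SHIFT, -(low & 0xfffff000u) >> PAGE_SHIFT);
	return 0;
}
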
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 1d584a18a50d..763534d77f59 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -1,162 +1,52 @@
-/* MTRR (Memory Type Range Register) cleanup
-
- Copyright (C) 2009 Yinghai Lu
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Library General Public
- License as published by the Free Software Foundation; either
- version 2 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Library General Public License for more details.
-
- You should have received a copy of the GNU Library General Public
- License along with this library; if not, write to the Free
- Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-*/
-
-#include <linux/module.h>
+// SPDX-License-Identifier: LGPL-2.0+
+/*
+ * MTRR (Memory Type Range Register) cleanup
+ *
+ * Copyright (C) 2009 Yinghai Lu
+ */
#include <linux/init.h>
#include <linux/pci.h>
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/mutex.h>
-#include <linux/sort.h>
+#include <linux/uaccess.h>
+#include <linux/kvm_para.h>
+#include <linux/range.h>
-#include <asm/e820.h>
-#include <asm/mtrr.h>
-#include <asm/uaccess.h>
#include <asm/processor.h>
+#include <asm/e820/api.h>
+#include <asm/mtrr.h>
#include <asm/msr.h>
-#include <asm/kvm_para.h>
-#include "mtrr.h"
-/* should be related to MTRR_VAR_RANGES nums */
-#define RANGE_NUM 256
+#include "mtrr.h"
-struct res_range {
- unsigned long start;
- unsigned long end;
+struct var_mtrr_range_state {
+ unsigned long base_pfn;
+ unsigned long size_pfn;
+ mtrr_type type;
};
-static int __init
-add_range(struct res_range *range, int nr_range, unsigned long start,
- unsigned long end)
-{
- /* out of slots */
- if (nr_range >= RANGE_NUM)
- return nr_range;
-
- range[nr_range].start = start;
- range[nr_range].end = end;
-
- nr_range++;
-
- return nr_range;
-}
-
-static int __init
-add_range_with_merge(struct res_range *range, int nr_range, unsigned long start,
- unsigned long end)
-{
- int i;
-
- /* try to merge it with old one */
- for (i = 0; i < nr_range; i++) {
- unsigned long final_start, final_end;
- unsigned long common_start, common_end;
-
- if (!range[i].end)
- continue;
-
- common_start = max(range[i].start, start);
- common_end = min(range[i].end, end);
- if (common_start > common_end + 1)
- continue;
-
- final_start = min(range[i].start, start);
- final_end = max(range[i].end, end);
-
- range[i].start = final_start;
- range[i].end = final_end;
- return nr_range;
- }
-
- /* need to add that */
- return add_range(range, nr_range, start, end);
-}
-
-static void __init
-subtract_range(struct res_range *range, unsigned long start, unsigned long end)
-{
- int i, j;
-
- for (j = 0; j < RANGE_NUM; j++) {
- if (!range[j].end)
- continue;
-
- if (start <= range[j].start && end >= range[j].end) {
- range[j].start = 0;
- range[j].end = 0;
- continue;
- }
-
- if (start <= range[j].start && end < range[j].end &&
- range[j].start < end + 1) {
- range[j].start = end + 1;
- continue;
- }
-
-
- if (start > range[j].start && end >= range[j].end &&
- range[j].end > start - 1) {
- range[j].end = start - 1;
- continue;
- }
-
- if (start > range[j].start && end < range[j].end) {
- /* find the new spare */
- for (i = 0; i < RANGE_NUM; i++) {
- if (range[i].end == 0)
- break;
- }
- if (i < RANGE_NUM) {
- range[i].end = range[j].end;
- range[i].start = end + 1;
- } else {
- printk(KERN_ERR "run of slot in ranges\n");
- }
- range[j].end = start - 1;
- continue;
- }
- }
-}
-
-static int __init cmp_range(const void *x1, const void *x2)
-{
- const struct res_range *r1 = x1;
- const struct res_range *r2 = x2;
- long start1, start2;
+struct var_mtrr_state {
+ unsigned long range_startk;
+ unsigned long range_sizek;
+ unsigned long chunk_sizek;
+ unsigned long gran_sizek;
+ unsigned int reg;
+};
- start1 = r1->start;
- start2 = r2->start;
+/* Should be related to MTRR_VAR_RANGES nums */
+#define RANGE_NUM 256
- return start1 - start2;
-}
+static struct range __initdata range[RANGE_NUM];
+static int __initdata nr_range;
-struct var_mtrr_range_state {
- unsigned long base_pfn;
- unsigned long size_pfn;
- mtrr_type type;
-};
+static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
-static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
-static int __initdata debug_print;
+#define BIOS_BUG_MSG \
+ "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n"
static int __init
-x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
+x86_get_mtrr_mem_range(struct range *range, int nr_range,
unsigned long extra_remove_base,
unsigned long extra_remove_size)
{
@@ -170,17 +60,16 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
continue;
base = range_state[i].base_pfn;
size = range_state[i].size_pfn;
- nr_range = add_range_with_merge(range, nr_range, base,
- base + size - 1);
- }
- if (debug_print) {
- printk(KERN_DEBUG "After WB checking\n");
- for (i = 0; i < nr_range; i++)
- printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
- range[i].start, range[i].end + 1);
+ nr_range = add_range_with_merge(range, RANGE_NUM, nr_range,
+ base, base + size);
}
- /* take out UC ranges */
+ Dprintk("After WB checking\n");
+ for (i = 0; i < nr_range; i++)
+ Dprintk("MTRR MAP PFN: %016llx - %016llx\n",
+ range[i].start, range[i].end);
+
+ /* Take out UC ranges: */
for (i = 0; i < num_var_ranges; i++) {
type = range_state[i].type;
if (type != MTRR_TYPE_UNCACHABLE &&
@@ -191,65 +80,50 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
continue;
base = range_state[i].base_pfn;
if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed &&
- (mtrr_state.enabled & 1)) {
+ (mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) &&
+ (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) {
/* Var MTRR contains UC entry below 1M? Skip it: */
- printk(KERN_WARNING "WARNING: BIOS bug: VAR MTRR %d "
- "contains strange UC entry under 1M, check "
- "with your system vendor!\n", i);
+ pr_warn(BIOS_BUG_MSG, i);
if (base + size <= (1<<(20-PAGE_SHIFT)))
continue;
size -= (1<<(20-PAGE_SHIFT)) - base;
base = 1<<(20-PAGE_SHIFT);
}
- subtract_range(range, base, base + size - 1);
+ subtract_range(range, RANGE_NUM, base, base + size);
}
if (extra_remove_size)
- subtract_range(range, extra_remove_base,
- extra_remove_base + extra_remove_size - 1);
+ subtract_range(range, RANGE_NUM, extra_remove_base,
+ extra_remove_base + extra_remove_size);
- /* get new range num */
- nr_range = 0;
+ Dprintk("After UC checking\n");
for (i = 0; i < RANGE_NUM; i++) {
if (!range[i].end)
continue;
- nr_range++;
- }
- if (debug_print) {
- printk(KERN_DEBUG "After UC checking\n");
- for (i = 0; i < nr_range; i++)
- printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
- range[i].start, range[i].end + 1);
+
+ Dprintk("MTRR MAP PFN: %016llx - %016llx\n",
+ range[i].start, range[i].end);
}
/* sort the ranges */
- sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
- if (debug_print) {
- printk(KERN_DEBUG "After sorting\n");
- for (i = 0; i < nr_range; i++)
- printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
- range[i].start, range[i].end + 1);
- }
+ nr_range = clean_sort_range(range, RANGE_NUM);
- /* clear those is not used */
- for (i = nr_range; i < RANGE_NUM; i++)
- memset(&range[i], 0, sizeof(range[i]));
+ Dprintk("After sorting\n");
+ for (i = 0; i < nr_range; i++)
+ Dprintk("MTRR MAP PFN: %016llx - %016llx\n",
+ range[i].start, range[i].end);
return nr_range;
}
-static struct res_range __initdata range[RANGE_NUM];
-static int __initdata nr_range;
-
#ifdef CONFIG_MTRR_SANITIZER
-static unsigned long __init sum_ranges(struct res_range *range, int nr_range)
+static unsigned long __init sum_ranges(struct range *range, int nr_range)
{
- unsigned long sum;
+ unsigned long sum = 0;
int i;
- sum = 0;
for (i = 0; i < nr_range; i++)
- sum += range[i].end + 1 - range[i].start;
+ sum += range[i].end - range[i].start;
return sum;
}
@@ -271,24 +145,9 @@ static int __init enable_mtrr_cleanup_setup(char *str)
}
early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
-static int __init mtrr_cleanup_debug_setup(char *str)
-{
- debug_print = 1;
- return 0;
-}
-early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
-
-struct var_mtrr_state {
- unsigned long range_startk;
- unsigned long range_sizek;
- unsigned long chunk_sizek;
- unsigned long gran_sizek;
- unsigned int reg;
-};
-
static void __init
set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
- unsigned char type, unsigned int address_bits)
+ unsigned char type)
{
u32 base_lo, base_hi, mask_lo, mask_hi;
u64 base, mask;
@@ -298,10 +157,10 @@ set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
return;
}
- mask = (1ULL << address_bits) - 1;
+ mask = (1ULL << boot_cpu_data.x86_phys_bits) - 1;
mask &= ~((((u64)sizek) << 10) - 1);
- base = ((u64)basek) << 10;
+ base = ((u64)basek) << 10;
base |= type;
mask |= 0x800;
@@ -317,15 +176,14 @@ set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
static void __init
save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
- unsigned char type)
+ unsigned char type)
{
range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10);
range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10);
range_state[reg].type = type;
}
-static void __init
-set_var_mtrr_all(unsigned int address_bits)
+static void __init set_var_mtrr_all(void)
{
unsigned long basek, sizek;
unsigned char type;
@@ -336,17 +194,17 @@ set_var_mtrr_all(unsigned int address_bits)
sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10);
type = range_state[reg].type;
- set_var_mtrr(reg, basek, sizek, type, address_bits);
+ set_var_mtrr(reg, basek, sizek, type);
}
}
static unsigned long to_size_factor(unsigned long sizek, char *factorp)
{
- char factor;
unsigned long base = sizek;
+ char factor;
if (base & ((1<<10) - 1)) {
- /* not MB alignment */
+ /* Not MB-aligned: */
factor = 'K';
} else if (base & ((1<<20) - 1)) {
factor = 'M';
@@ -372,25 +230,25 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk,
unsigned long max_align, align;
unsigned long sizek;
- /* Compute the maximum size I can make a range */
+ /* Compute the maximum size with which we can make a range: */
if (range_startk)
- max_align = ffs(range_startk) - 1;
+ max_align = __ffs(range_startk);
else
- max_align = 32;
- align = fls(range_sizek) - 1;
+ max_align = BITS_PER_LONG - 1;
+
+ align = __fls(range_sizek);
if (align > max_align)
align = max_align;
- sizek = 1 << align;
- if (debug_print) {
+ sizek = 1UL << align;
+ if (mtrr_debug) {
char start_factor = 'K', size_factor = 'K';
unsigned long start_base, size_base;
- start_base = to_size_factor(range_startk,
- &start_factor),
- size_base = to_size_factor(sizek, &size_factor),
+ start_base = to_size_factor(range_startk, &start_factor);
+ size_base = to_size_factor(sizek, &size_factor);
- printk(KERN_DEBUG "Setting variable MTRR %d, "
+ Dprintk("Setting variable MTRR %d, "
"base: %ld%cB, range: %ld%cB, type %s\n",
reg, start_base, start_factor,
size_base, size_factor,
@@ -412,7 +270,7 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
unsigned long sizek)
{
unsigned long hole_basek, hole_sizek;
- unsigned long second_basek, second_sizek;
+ unsigned long second_sizek;
unsigned long range0_basek, range0_sizek;
unsigned long range_basek, range_sizek;
unsigned long chunk_sizek;
@@ -420,15 +278,15 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
hole_basek = 0;
hole_sizek = 0;
- second_basek = 0;
second_sizek = 0;
chunk_sizek = state->chunk_sizek;
gran_sizek = state->gran_sizek;
- /* align with gran size, prevent small block used up MTRRs */
+ /* Align with gran size, prevent small block used up MTRRs: */
range_basek = ALIGN(state->range_startk, gran_sizek);
if ((range_basek > basek) && basek)
return second_sizek;
+
state->range_sizek -= (range_basek - state->range_startk);
range_sizek = ALIGN(state->range_sizek, gran_sizek);
@@ -439,22 +297,21 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
}
state->range_sizek = range_sizek;
- /* try to append some small hole */
+ /* Try to append some small hole: */
range0_basek = state->range_startk;
range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
- /* no increase */
+ /* No increase: */
if (range0_sizek == state->range_sizek) {
- if (debug_print)
- printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
- range0_basek<<10,
- (range0_basek + state->range_sizek)<<10);
+ Dprintk("rangeX: %016lx - %016lx\n",
+ range0_basek<<10,
+ (range0_basek + state->range_sizek)<<10);
state->reg = range_to_mtrr(state->reg, range0_basek,
state->range_sizek, MTRR_TYPE_WRBACK);
return 0;
}
- /* only cut back, when it is not the last */
+ /* Only cut back when it is not the last: */
if (sizek) {
while (range0_basek + range0_sizek > (basek + sizek)) {
if (range0_sizek >= chunk_sizek)
@@ -470,16 +327,16 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
second_try:
range_basek = range0_basek + range0_sizek;
- /* one hole in the middle */
+ /* One hole in the middle: */
if (range_basek > basek && range_basek <= (basek + sizek))
second_sizek = range_basek - basek;
if (range0_sizek > state->range_sizek) {
- /* one hole in middle or at end */
+ /* One hole in middle or at the end: */
hole_sizek = range0_sizek - state->range_sizek - second_sizek;
- /* hole size should be less than half of range0 size */
+ /* Hole size should be less than half of range0 size: */
if (hole_sizek >= (range0_sizek >> 1) &&
range0_sizek >= chunk_sizek) {
range0_sizek -= chunk_sizek;
@@ -491,32 +348,30 @@ second_try:
}
if (range0_sizek) {
- if (debug_print)
- printk(KERN_DEBUG "range0: %016lx - %016lx\n",
- range0_basek<<10,
- (range0_basek + range0_sizek)<<10);
+ Dprintk("range0: %016lx - %016lx\n",
+ range0_basek<<10,
+ (range0_basek + range0_sizek)<<10);
state->reg = range_to_mtrr(state->reg, range0_basek,
range0_sizek, MTRR_TYPE_WRBACK);
}
if (range0_sizek < state->range_sizek) {
- /* need to handle left over */
+ /* Need to handle left over range: */
range_sizek = state->range_sizek - range0_sizek;
- if (debug_print)
- printk(KERN_DEBUG "range: %016lx - %016lx\n",
- range_basek<<10,
- (range_basek + range_sizek)<<10);
+ Dprintk("range: %016lx - %016lx\n",
+ range_basek<<10,
+ (range_basek + range_sizek)<<10);
+
state->reg = range_to_mtrr(state->reg, range_basek,
range_sizek, MTRR_TYPE_WRBACK);
}
if (hole_sizek) {
hole_basek = range_basek - hole_sizek - second_sizek;
- if (debug_print)
- printk(KERN_DEBUG "hole: %016lx - %016lx\n",
- hole_basek<<10,
- (hole_basek + hole_sizek)<<10);
+ Dprintk("hole: %016lx - %016lx\n",
+ hole_basek<<10,
+ (hole_basek + hole_sizek)<<10);
state->reg = range_to_mtrr(state->reg, hole_basek,
hole_sizek, MTRR_TYPE_UNCACHABLE);
}
@@ -537,23 +392,23 @@ set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn,
basek = base_pfn << (PAGE_SHIFT - 10);
sizek = size_pfn << (PAGE_SHIFT - 10);
- /* See if I can merge with the last range */
+ /* See if I can merge with the last range: */
if ((basek <= 1024) ||
(state->range_startk + state->range_sizek == basek)) {
unsigned long endk = basek + sizek;
state->range_sizek = endk - state->range_startk;
return;
}
- /* Write the range mtrrs */
+ /* Write the range mtrrs: */
if (state->range_sizek != 0)
second_sizek = range_to_mtrr_with_hole(state, basek, sizek);
- /* Allocate an msr */
+ /* Allocate an msr: */
state->range_startk = basek + second_sizek;
state->range_sizek = sizek - second_sizek;
}
-/* mininum size of mtrr block that can take hole */
+/* Minimum size of an MTRR block that can take a hole: */
static u64 mtrr_chunk_size __initdata = (256ULL<<20);
static int __init parse_mtrr_chunk_size_opt(char *p)
@@ -565,7 +420,7 @@ static int __init parse_mtrr_chunk_size_opt(char *p)
}
early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt);
-/* granity of mtrr of block */
+/* Granularity of an MTRR block: */
static u64 mtrr_gran_size __initdata;
static int __init parse_mtrr_gran_size_opt(char *p)
@@ -577,7 +432,7 @@ static int __init parse_mtrr_gran_size_opt(char *p)
}
early_param("mtrr_gran_size", parse_mtrr_gran_size_opt);
-static int nr_mtrr_spare_reg __initdata =
+static unsigned long nr_mtrr_spare_reg __initdata =
CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT;
static int __init parse_mtrr_spare_reg(char *arg)
@@ -586,16 +441,15 @@ static int __init parse_mtrr_spare_reg(char *arg)
nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0);
return 0;
}
-
early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
static int __init
-x86_setup_var_mtrrs(struct res_range *range, int nr_range,
+x86_setup_var_mtrrs(struct range *range, int nr_range,
u64 chunk_size, u64 gran_size)
{
struct var_mtrr_state var_state;
- int i;
int num_reg;
+ int i;
var_state.range_startk = 0;
var_state.range_sizek = 0;
@@ -605,17 +459,18 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range,
memset(range_state, 0, sizeof(range_state));
- /* Write the range etc */
- for (i = 0; i < nr_range; i++)
+ /* Write the range: */
+ for (i = 0; i < nr_range; i++) {
set_var_mtrr_range(&var_state, range[i].start,
- range[i].end - range[i].start + 1);
+ range[i].end - range[i].start);
+ }
- /* Write the last range */
+ /* Write the last range: */
if (var_state.range_sizek != 0)
range_to_mtrr_with_hole(&var_state, 0, 0);
num_reg = var_state.reg;
- /* Clear out the extra MTRR's */
+ /* Clear out the extra MTRR's: */
while (var_state.reg < num_var_ranges) {
save_var_mtrr(var_state.reg, 0, 0, 0);
var_state.reg++;
@@ -625,11 +480,11 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range,
}
struct mtrr_cleanup_result {
- unsigned long gran_sizek;
- unsigned long chunk_sizek;
- unsigned long lose_cover_sizek;
- unsigned int num_reg;
- int bad;
+ unsigned long gran_sizek;
+ unsigned long chunk_sizek;
+ unsigned long lose_cover_sizek;
+ unsigned int num_reg;
+ int bad;
};
/*
@@ -645,10 +500,10 @@ static unsigned long __initdata min_loss_pfn[RANGE_NUM];
static void __init print_out_mtrr_range_state(void)
{
- int i;
char start_factor = 'K', size_factor = 'K';
unsigned long start_base, size_base;
mtrr_type type;
+ int i;
for (i = 0; i < num_var_ranges; i++) {
@@ -656,12 +511,12 @@ static void __init print_out_mtrr_range_state(void)
if (!size_base)
continue;
- size_base = to_size_factor(size_base, &size_factor),
+ size_base = to_size_factor(size_base, &size_factor);
start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
- start_base = to_size_factor(start_base, &start_factor),
+ start_base = to_size_factor(start_base, &start_factor);
type = range_state[i].type;
- printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
+ Dprintk("reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
i, start_base, start_factor,
size_base, size_factor,
(type == MTRR_TYPE_UNCACHABLE) ? "UC" :
@@ -676,10 +531,10 @@ static int __init mtrr_need_cleanup(void)
int i;
mtrr_type type;
unsigned long size;
- /* extra one for all 0 */
+ /* Extra one for all 0: */
int num[MTRR_NUM_TYPES + 1];
- /* check entries number */
+ /* Check entries number: */
memset(num, 0, sizeof(num));
for (i = 0; i < num_var_ranges; i++) {
type = range_state[i].type;
@@ -688,93 +543,96 @@ static int __init mtrr_need_cleanup(void)
continue;
if (!size)
type = MTRR_NUM_TYPES;
- if (type == MTRR_TYPE_WRPROT)
- type = MTRR_TYPE_UNCACHABLE;
num[type]++;
}
- /* check if we got UC entries */
+ /* Check if we got UC entries: */
if (!num[MTRR_TYPE_UNCACHABLE])
return 0;
- /* check if we only had WB and UC */
+ /* Check if we only had WB and UC */
if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
- num_var_ranges - num[MTRR_NUM_TYPES])
+ num_var_ranges - num[MTRR_NUM_TYPES])
return 0;
return 1;
}
static unsigned long __initdata range_sums;
-static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
- unsigned long extra_remove_base,
- unsigned long extra_remove_size,
- int i)
+
+static void __init
+mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
+ unsigned long x_remove_base,
+ unsigned long x_remove_size, int i)
{
- int num_reg;
- static struct res_range range_new[RANGE_NUM];
- static int nr_range_new;
+ /*
+ * range_new should really be an automatic variable, but
+ * putting 4096 bytes on the stack is frowned upon, to put it
+ * mildly. It is safe to make it a static __initdata variable,
+ * since mtrr_calc_range_state is only called during init and
+ * there's no way it will call itself recursively.
+ */
+ static struct range range_new[RANGE_NUM] __initdata;
unsigned long range_sums_new;
+ int nr_range_new;
+ int num_reg;
- /* convert ranges to var ranges state */
- num_reg = x86_setup_var_mtrrs(range, nr_range,
- chunk_size, gran_size);
+ /* Convert ranges to var ranges state: */
+ num_reg = x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
- /* we got new setting in range_state, check it */
+ /* We got new setting in range_state, check it: */
memset(range_new, 0, sizeof(range_new));
nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
- extra_remove_base, extra_remove_size);
+ x_remove_base, x_remove_size);
range_sums_new = sum_ranges(range_new, nr_range_new);
result[i].chunk_sizek = chunk_size >> 10;
result[i].gran_sizek = gran_size >> 10;
result[i].num_reg = num_reg;
+
if (range_sums < range_sums_new) {
- result[i].lose_cover_sizek =
- (range_sums_new - range_sums) << PSHIFT;
+ result[i].lose_cover_sizek = (range_sums_new - range_sums) << PSHIFT;
result[i].bad = 1;
- } else
- result[i].lose_cover_sizek =
- (range_sums - range_sums_new) << PSHIFT;
+ } else {
+ result[i].lose_cover_sizek = (range_sums - range_sums_new) << PSHIFT;
+ }
- /* double check it */
+ /* Double check it: */
if (!result[i].bad && !result[i].lose_cover_sizek) {
- if (nr_range_new != nr_range ||
- memcmp(range, range_new, sizeof(range)))
- result[i].bad = 1;
+ if (nr_range_new != nr_range || memcmp(range, range_new, sizeof(range)))
+ result[i].bad = 1;
}
- if (!result[i].bad && (range_sums - range_sums_new <
- min_loss_pfn[num_reg])) {
- min_loss_pfn[num_reg] =
- range_sums - range_sums_new;
- }
+ if (!result[i].bad && (range_sums - range_sums_new < min_loss_pfn[num_reg]))
+ min_loss_pfn[num_reg] = range_sums - range_sums_new;
}
static void __init mtrr_print_out_one_result(int i)
{
- char gran_factor, chunk_factor, lose_factor;
unsigned long gran_base, chunk_base, lose_base;
+ char gran_factor, chunk_factor, lose_factor;
- gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
- chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
- lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
- printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
- result[i].bad ? "*BAD*" : " ",
- gran_base, gran_factor, chunk_base, chunk_factor);
- printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
- result[i].num_reg, result[i].bad ? "-" : "",
- lose_base, lose_factor);
+ gran_base = to_size_factor(result[i].gran_sizek, &gran_factor);
+ chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor);
+ lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor);
+
+ pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t",
+ result[i].bad ? "*BAD*" : " ",
+ gran_base, gran_factor, chunk_base, chunk_factor);
+ pr_cont("num_reg: %d \tlose cover RAM: %s%ld%c\n",
+ result[i].num_reg, result[i].bad ? "-" : "",
+ lose_base, lose_factor);
}
static int __init mtrr_search_optimal_index(void)
{
- int i;
int num_reg_good;
int index_good;
+ int i;
if (nr_mtrr_spare_reg >= num_var_ranges)
nr_mtrr_spare_reg = num_var_ranges - 1;
+
num_reg_good = -1;
for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
if (!min_loss_pfn[i])
@@ -796,24 +654,27 @@ static int __init mtrr_search_optimal_index(void)
return index_good;
}
-
-int __init mtrr_cleanup(unsigned address_bits)
+int __init mtrr_cleanup(void)
{
- unsigned long extra_remove_base, extra_remove_size;
+ unsigned long x_remove_base, x_remove_size;
unsigned long base, size, def, dummy;
- mtrr_type type;
u64 chunk_size, gran_size;
+ mtrr_type type;
int index_good;
int i;
- if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
+ if (!mtrr_enabled())
+ return 0;
+
+ if (!cpu_feature_enabled(X86_FEATURE_MTRR) || enable_mtrr_cleanup < 1)
return 0;
+
rdmsr(MSR_MTRRdefType, def, dummy);
def &= 0xff;
if (def != MTRR_TYPE_UNCACHABLE)
return 0;
- /* get it and store it aside */
+ /* Get it and store it aside: */
memset(range_state, 0, sizeof(range_state));
for (i = 0; i < num_var_ranges; i++) {
mtrr_if->get(i, &base, &size, &type);
@@ -822,50 +683,48 @@ int __init mtrr_cleanup(unsigned address_bits)
range_state[i].type = type;
}
- /* check if we need handle it and can handle it */
+ /* Check if we need handle it and can handle it: */
if (!mtrr_need_cleanup())
return 0;
- /* print original var MTRRs at first, for debugging: */
- printk(KERN_DEBUG "original variable MTRRs\n");
+ /* Print original var MTRRs at first, for debugging: */
+ Dprintk("original variable MTRRs\n");
print_out_mtrr_range_state();
memset(range, 0, sizeof(range));
- extra_remove_size = 0;
- extra_remove_base = 1 << (32 - PAGE_SHIFT);
+ x_remove_size = 0;
+ x_remove_base = 1 << (32 - PAGE_SHIFT);
if (mtrr_tom2)
- extra_remove_size =
- (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
- nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
- extra_remove_size);
+ x_remove_size = (mtrr_tom2 >> PAGE_SHIFT) - x_remove_base;
+
/*
- * [0, 1M) should always be coverred by var mtrr with WB
- * and fixed mtrrs should take effective before var mtrr for it
+ * [0, 1M) should always be covered by var mtrr with WB
+ * and fixed mtrrs should take effect before var mtrr for it:
*/
- nr_range = add_range_with_merge(range, nr_range, 0,
- (1ULL<<(20 - PAGE_SHIFT)) - 1);
- /* sort the ranges */
- sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
+ nr_range = add_range_with_merge(range, RANGE_NUM, 0, 0,
+ 1ULL<<(20 - PAGE_SHIFT));
+	/* Add the ranges from the variable MTRRs last: */
+ nr_range = x86_get_mtrr_mem_range(range, nr_range,
+ x_remove_base, x_remove_size);
range_sums = sum_ranges(range, nr_range);
- printk(KERN_INFO "total RAM coverred: %ldM\n",
+ pr_info("total RAM covered: %ldM\n",
range_sums >> (20 - PAGE_SHIFT));
if (mtrr_chunk_size && mtrr_gran_size) {
i = 0;
mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size,
- extra_remove_base, extra_remove_size, i);
+ x_remove_base, x_remove_size, i);
mtrr_print_out_one_result(i);
if (!result[i].bad) {
- set_var_mtrr_all(address_bits);
- printk(KERN_DEBUG "New variable MTRRs\n");
+ set_var_mtrr_all();
+ Dprintk("New variable MTRRs\n");
print_out_mtrr_range_state();
return 1;
}
- printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
- "will find optimal one\n");
+ pr_info("invalid mtrr_gran_size or mtrr_chunk_size, will find optimal one\n");
}
i = 0;
@@ -880,32 +739,32 @@ int __init mtrr_cleanup(unsigned address_bits)
continue;
mtrr_calc_range_state(chunk_size, gran_size,
- extra_remove_base, extra_remove_size, i);
- if (debug_print) {
+ x_remove_base, x_remove_size, i);
+ if (mtrr_debug) {
mtrr_print_out_one_result(i);
- printk(KERN_INFO "\n");
+ pr_info("\n");
}
i++;
}
}
- /* try to find the optimal index */
+ /* Try to find the optimal index: */
index_good = mtrr_search_optimal_index();
if (index_good != -1) {
- printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
+ pr_info("Found optimal setting for mtrr clean up\n");
i = index_good;
mtrr_print_out_one_result(i);
- /* convert ranges to var ranges state */
+ /* Convert ranges to var ranges state: */
chunk_size = result[i].chunk_sizek;
chunk_size <<= 10;
gran_size = result[i].gran_sizek;
gran_size <<= 10;
x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
- set_var_mtrr_all(address_bits);
- printk(KERN_DEBUG "New variable MTRRs\n");
+ set_var_mtrr_all();
+ Dprintk("New variable MTRRs\n");
print_out_mtrr_range_state();
return 1;
} else {
@@ -914,13 +773,13 @@ int __init mtrr_cleanup(unsigned address_bits)
mtrr_print_out_one_result(i);
}
- printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n");
- printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n");
+ pr_info("mtrr_cleanup: can not find optimal value\n");
+ pr_info("please specify mtrr_gran_size/mtrr_chunk_size\n");
return 0;
}
#else
-int __init mtrr_cleanup(unsigned address_bits)
+int __init mtrr_cleanup(void)
{
return 0;
}
@@ -941,19 +800,20 @@ early_param("disable_mtrr_trim", disable_mtrr_trim_setup);
* Note this won't check if the MTRRs < 4GB where the magic bit doesn't
* apply to are wrong, but so far we don't know of any such case in the wild.
*/
-#define Tom2Enabled (1U << 21)
-#define Tom2ForceMemTypeWB (1U << 22)
+#define Tom2Enabled (1U << 21)
+#define Tom2ForceMemTypeWB (1U << 22)
int __init amd_special_default_mtrr(void)
{
u32 l, h;
- if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
return 0;
- if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
+ if (boot_cpu_data.x86 < 0xf)
return 0;
- /* In case some hypervisor doesn't pass SYSCFG through */
- if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
+ /* In case some hypervisor doesn't pass SYSCFG through: */
+ if (rdmsr_safe(MSR_AMD64_SYSCFG, &l, &h) < 0)
return 0;
/*
* Memory between 4GB and top of mem is forced WB by this magic bit.
@@ -965,19 +825,21 @@ int __init amd_special_default_mtrr(void)
return 0;
}
-static u64 __init real_trim_memory(unsigned long start_pfn,
- unsigned long limit_pfn)
+static u64 __init
+real_trim_memory(unsigned long start_pfn, unsigned long limit_pfn)
{
u64 trim_start, trim_size;
+
trim_start = start_pfn;
trim_start <<= PAGE_SHIFT;
+
trim_size = limit_pfn;
trim_size <<= PAGE_SHIFT;
trim_size -= trim_start;
- return e820_update_range(trim_start, trim_size, E820_RAM,
- E820_RESERVED);
+ return e820__range_update(trim_start, trim_size, E820_TYPE_RAM, E820_TYPE_RESERVED);
}
+
/**
* mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
* @end_pfn: ending page frame number
@@ -985,7 +847,7 @@ static u64 __init real_trim_memory(unsigned long start_pfn,
* Some buggy BIOSes don't setup the MTRRs properly for systems with certain
* memory configurations. This routine checks that the highest MTRR matches
* the end of memory, to make sure the MTRRs having a write back type cover
- * all of the memory the kernel is intending to use. If not, it'll trim any
+ * all of the memory the kernel is intending to use. If not, it'll trim any
* memory off the end by adjusting end_pfn, removing it from the kernel's
* allocation pools, warning the user with an obnoxious message.
*/
@@ -994,21 +856,25 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
unsigned long i, base, size, highest_pfn = 0, def, dummy;
mtrr_type type;
u64 total_trim_size;
-
/* extra one for all 0 */
int num[MTRR_NUM_TYPES + 1];
+
+ if (!mtrr_enabled())
+ return 0;
+
/*
* Make sure we only trim uncachable memory on machines that
* support the Intel MTRR architecture:
*/
- if (!is_cpu(INTEL) || disable_mtrr_trim)
+ if (!cpu_feature_enabled(X86_FEATURE_MTRR) || disable_mtrr_trim)
return 0;
+
rdmsr(MSR_MTRRdefType, def, dummy);
- def &= 0xff;
+ def &= MTRR_DEF_TYPE_TYPE;
if (def != MTRR_TYPE_UNCACHABLE)
return 0;
- /* get it and store it aside */
+ /* Get it and store it aside: */
memset(range_state, 0, sizeof(range_state));
for (i = 0; i < num_var_ranges; i++) {
mtrr_if->get(i, &base, &size, &type);
@@ -1017,7 +883,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
range_state[i].type = type;
}
- /* Find highest cached pfn */
+ /* Find highest cached pfn: */
for (i = 0; i < num_var_ranges; i++) {
type = range_state[i].type;
if (type != MTRR_TYPE_WRBACK)
@@ -1028,13 +894,13 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
highest_pfn = base + size;
}
- /* kvm/qemu doesn't have mtrr set right, don't trim them all */
+ /* kvm/qemu doesn't have mtrr set right, don't trim them all: */
if (!highest_pfn) {
- printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n");
+ pr_info("CPU MTRRs all blank - virtualized system.\n");
return 0;
}
- /* check entries number */
+ /* Check entries number: */
memset(num, 0, sizeof(num));
for (i = 0; i < num_var_ranges; i++) {
type = range_state[i].type;
@@ -1046,11 +912,11 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
num[type]++;
}
- /* no entry for WB? */
+ /* No entry for WB? */
if (!num[MTRR_TYPE_WRBACK])
return 0;
- /* check if we only had WB and UC */
+ /* Check if we only had WB and UC: */
if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
num_var_ranges - num[MTRR_NUM_TYPES])
return 0;
@@ -1059,43 +925,43 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
nr_range = 0;
if (mtrr_tom2) {
range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
- range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1;
- if (highest_pfn < range[nr_range].end + 1)
- highest_pfn = range[nr_range].end + 1;
+ range[nr_range].end = mtrr_tom2 >> PAGE_SHIFT;
+ if (highest_pfn < range[nr_range].end)
+ highest_pfn = range[nr_range].end;
nr_range++;
}
nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
+ /* Check the head: */
total_trim_size = 0;
- /* check the head */
if (range[0].start)
total_trim_size += real_trim_memory(0, range[0].start);
- /* check the holes */
+
+ /* Check the holes: */
for (i = 0; i < nr_range - 1; i++) {
- if (range[i].end + 1 < range[i+1].start)
- total_trim_size += real_trim_memory(range[i].end + 1,
+ if (range[i].end < range[i+1].start)
+ total_trim_size += real_trim_memory(range[i].end,
range[i+1].start);
}
- /* check the top */
+
+ /* Check the top: */
i = nr_range - 1;
- if (range[i].end + 1 < end_pfn)
- total_trim_size += real_trim_memory(range[i].end + 1,
+ if (range[i].end < end_pfn)
+ total_trim_size += real_trim_memory(range[i].end,
end_pfn);
if (total_trim_size) {
- printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover"
- " all of memory, losing %lluMB of RAM.\n",
+ pr_warn("WARNING: BIOS bug: CPU MTRRs don't cover all of memory, losing %lluMB of RAM.\n",
total_trim_size >> 20);
if (!changed_by_mtrr_cleanup)
WARN_ON(1);
- printk(KERN_INFO "update e820 for mtrr\n");
- update_e820();
+ pr_info("update e820 for mtrr\n");
+ e820__update_table_print();
return 1;
}
return 0;
}
-
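
As a rough illustration of what range_to_mtrr() above is doing (sketch only, not part of the patch): each iteration carves off the largest power-of-two block that is both aligned to the current start and no larger than what is left, which is how 3.5G of write-back RAM ends up as 2G + 1G + 512M variable MTRRs. The helper names below are made up for the example; sizes are in KiB, like range_startk/range_sizek, and a start of 0 is treated as imposing no alignment limit, as in the kernel code.

#include <stdio.h>

static unsigned long lowest_bit(unsigned long x)  { return x & -x; }
static unsigned long highest_bit(unsigned long x)
{
	unsigned long h = 1;

	while (x >>= 1)
		h <<= 1;
	return h;
}

int main(void)
{
	unsigned long startk = 0, sizek = 3584UL * 1024;	/* 3.5G of WB RAM */

	while (sizek) {
		/* Largest block allowed by the current start alignment... */
		unsigned long align = startk ? lowest_bit(startk)
					     : highest_bit(sizek);
		/* ...capped by the largest power of two that still fits. */
		unsigned long chunk = highest_bit(sizek);

		if (chunk > align)
			chunk = align;

		printf("MTRR: base %6luM size %6luM\n",
		       startk >> 10, chunk >> 10);
		startk += chunk;
		sizek  -= chunk;
	}
	return 0;
}
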
diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c
index ff14c320040c..238dad57d4d6 100644
--- a/arch/x86/kernel/cpu/mtrr/cyrix.c
+++ b/arch/x86/kernel/cpu/mtrr/cyrix.c
@@ -1,38 +1,41 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/init.h>
+#include <linux/io.h>
#include <linux/mm.h>
-#include <asm/mtrr.h>
-#include <asm/msr.h>
-#include <asm/io.h>
+
#include <asm/processor-cyrix.h>
#include <asm/processor-flags.h>
+#include <asm/mtrr.h>
+#include <asm/msr.h>
+
#include "mtrr.h"
static void
cyrix_get_arr(unsigned int reg, unsigned long *base,
unsigned long *size, mtrr_type * type)
{
- unsigned long flags;
unsigned char arr, ccr3, rcr, shift;
+ unsigned long flags;
arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */
- /* Save flags and disable interrupts */
local_irq_save(flags);
ccr3 = getCx86(CX86_CCR3);
setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
- ((unsigned char *) base)[3] = getCx86(arr);
- ((unsigned char *) base)[2] = getCx86(arr + 1);
- ((unsigned char *) base)[1] = getCx86(arr + 2);
+ ((unsigned char *)base)[3] = getCx86(arr);
+ ((unsigned char *)base)[2] = getCx86(arr + 1);
+ ((unsigned char *)base)[1] = getCx86(arr + 2);
rcr = getCx86(CX86_RCR_BASE + reg);
- setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
+ setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
- /* Enable interrupts if it was enabled previously */
local_irq_restore(flags);
+
shift = ((unsigned char *) base)[1] & 0x0f;
*base >>= PAGE_SHIFT;
- /* Power of two, at least 4K on ARR0-ARR6, 256K on ARR7
+ /*
+ * Power of two, at least 4K on ARR0-ARR6, 256K on ARR7
* Note: shift==0xf means 4G, this is unsupported.
*/
if (shift)
@@ -76,22 +79,26 @@ cyrix_get_arr(unsigned int reg, unsigned long *base,
}
}
+/*
+ * cyrix_get_free_region - get a free ARR.
+ *
+ * @base: the starting (base) address of the region.
+ * @size: the size (in bytes) of the region.
+ *
+ * Returns: the index of the region on success, else -1 on error.
+ */
static int
cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg)
-/* [SUMMARY] Get a free ARR.
- <base> The starting (base) address of the region.
- <size> The size (in bytes) of the region.
- [RETURNS] The index of the region on success, else -1 on error.
-*/
{
- int i;
- mtrr_type ltype;
unsigned long lbase, lsize;
+ mtrr_type ltype;
+ int i;
switch (replace_reg) {
case 7:
if (size < 0x40)
break;
+ fallthrough;
case 6:
case 5:
case 4:
@@ -107,14 +114,17 @@ cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg)
cyrix_get_arr(7, &lbase, &lsize, &ltype);
if (lsize == 0)
return 7;
- /* Else try ARR0-ARR6 first */
+ /* Else try ARR0-ARR6 first */
} else {
for (i = 0; i < 7; i++) {
cyrix_get_arr(i, &lbase, &lsize, &ltype);
if (lsize == 0)
return i;
}
- /* ARR0-ARR6 isn't free, try ARR7 but its size must be at least 256K */
+ /*
+		 * ARR0-ARR6 aren't free;
+		 * try ARR7, but its size must be at least 256K.
+ */
cyrix_get_arr(i, &lbase, &lsize, &ltype);
if ((lsize == 0) && (size >= 0x40))
return i;
@@ -122,21 +132,22 @@ cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg)
return -ENOSPC;
}
-static u32 cr4 = 0;
-static u32 ccr3;
+static u32 cr4, ccr3;
static void prepare_set(void)
{
u32 cr0;
/* Save value of CR4 and clear Page Global Enable (bit 7) */
- if ( cpu_has_pge ) {
- cr4 = read_cr4();
- write_cr4(cr4 & ~X86_CR4_PGE);
+ if (boot_cpu_has(X86_FEATURE_PGE)) {
+ cr4 = __read_cr4();
+ __write_cr4(cr4 & ~X86_CR4_PGE);
}
- /* Disable and flush caches. Note that wbinvd flushes the TLBs as
- a side-effect */
+ /*
+ * Disable and flush caches.
+ * Note that wbinvd flushes the TLBs as a side-effect
+ */
cr0 = read_cr0() | X86_CR0_CD;
wbinvd();
write_cr0(cr0);
@@ -147,23 +158,22 @@ static void prepare_set(void)
/* Cyrix ARRs - everything else was excluded at the top */
setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10);
-
}
static void post_set(void)
{
- /* Flush caches and TLBs */
+ /* Flush caches and TLBs */
wbinvd();
/* Cyrix ARRs - everything else was excluded at the top */
setCx86(CX86_CCR3, ccr3);
-
- /* Enable caches */
- write_cr0(read_cr0() & 0xbfffffff);
- /* Restore value of CR4 */
- if ( cpu_has_pge )
- write_cr4(cr4);
+ /* Enable caches */
+ write_cr0(read_cr0() & ~X86_CR0_CD);
+
+ /* Restore value of CR4 */
+ if (boot_cpu_has(X86_FEATURE_PGE))
+ __write_cr4(cr4);
}
static void cyrix_set_arr(unsigned int reg, unsigned long base,
@@ -178,7 +188,8 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base,
size >>= 6;
size &= 0x7fff; /* make sure arr_size <= 14 */
- for (arr_size = 0; size; arr_size++, size >>= 1) ;
+ for (arr_size = 0; size; arr_size++, size >>= 1)
+ ;
if (reg < 7) {
switch (type) {
@@ -215,60 +226,19 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base,
prepare_set();
base <<= PAGE_SHIFT;
- setCx86(arr, ((unsigned char *) &base)[3]);
- setCx86(arr + 1, ((unsigned char *) &base)[2]);
- setCx86(arr + 2, (((unsigned char *) &base)[1]) | arr_size);
+ setCx86(arr + 0, ((unsigned char *)&base)[3]);
+ setCx86(arr + 1, ((unsigned char *)&base)[2]);
+ setCx86(arr + 2, (((unsigned char *)&base)[1]) | arr_size);
setCx86(CX86_RCR_BASE + reg, arr_type);
post_set();
}
-typedef struct {
- unsigned long base;
- unsigned long size;
- mtrr_type type;
-} arr_state_t;
-
-static arr_state_t arr_state[8] = {
- {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL},
- {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}
-};
-
-static unsigned char ccr_state[7] = { 0, 0, 0, 0, 0, 0, 0 };
-
-static void cyrix_set_all(void)
-{
- int i;
-
- prepare_set();
-
- /* the CCRs are not contiguous */
- for (i = 0; i < 4; i++)
- setCx86(CX86_CCR0 + i, ccr_state[i]);
- for (; i < 7; i++)
- setCx86(CX86_CCR4 + i, ccr_state[i]);
- for (i = 0; i < 8; i++)
- cyrix_set_arr(i, arr_state[i].base,
- arr_state[i].size, arr_state[i].type);
-
- post_set();
-}
-
-static struct mtrr_ops cyrix_mtrr_ops = {
- .vendor = X86_VENDOR_CYRIX,
-// .init = cyrix_arr_init,
- .set_all = cyrix_set_all,
+const struct mtrr_ops cyrix_mtrr_ops = {
+ .var_regs = 8,
.set = cyrix_set_arr,
.get = cyrix_get_arr,
.get_free_region = cyrix_get_free_region,
.validate_add_page = generic_validate_add_page,
.have_wrcomb = positive_have_wrcomb,
};
-
-int __init cyrix_init_mtrr(void)
-{
- set_mtrr_ops(&cyrix_mtrr_ops);
- return 0;
-}
-
-//arch_initcall(cyrix_init_mtrr);
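The cyrix_set_arr() hunk above encodes an ARR in two steps: the empty-bodied counting loop turns the region size (in 256 KB units after the ">>= 6") into the ARR's power-of-two size code, and the base address is then written byte by byte into three consecutive Cyrix configuration registers. A standalone userspace sketch of that encoding is shown below; the 16 MB region at 0xd0000000 is a made-up example, printf() stands in for setCx86(), and shifts replace the kernel's little-endian byte casts.

#include <stdio.h>

int main(void)
{
	unsigned long base = 0xd0000000UL >> 12;	/* page number of the region */
	unsigned long size = 0x1000;			/* pages: a 16 MB region */
	unsigned char arr_size;

	size >>= 6;			/* convert to 256 KB units, as in the patch */
	size &= 0x7fff;			/* make sure arr_size <= 14 */
	for (arr_size = 0; size; arr_size++, size >>= 1)
		;			/* highest set bit position + 1 = ARR size code */

	base <<= 12;			/* back to a byte address (PAGE_SHIFT) */
	printf("arr+0 <- %02lx  (base bits 31:24)\n", (base >> 24) & 0xff);
	printf("arr+1 <- %02lx  (base bits 23:16)\n", (base >> 16) & 0xff);
	printf("arr+2 <- %02lx  (base bits 15:8 | size code)\n",
	       ((base >> 8) & 0xff) | arr_size);

	return 0;
}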
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 0543f69f0b27..0863733858dc 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -1,44 +1,104 @@
-/* This only handles 32bit MTRR on 32bit hosts. This is strictly wrong
- because MTRRs can span upto 40 bits (36bits on most modern x86) */
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * This only handles 32bit MTRR on 32bit hosts. This is strictly wrong
+ * because MTRRs can span up to 40 bits (36bits on most modern x86)
+ */
+
+#include <linux/export.h>
#include <linux/init.h>
-#include <linux/slab.h>
+#include <linux/io.h>
#include <linux/mm.h>
-#include <linux/module.h>
-#include <asm/io.h>
-#include <asm/mtrr.h>
-#include <asm/msr.h>
-#include <asm/system.h>
-#include <asm/cpufeature.h>
+#include <linux/cc_platform.h>
+#include <linux/string_choices.h>
#include <asm/processor-flags.h>
+#include <asm/cacheinfo.h>
+#include <asm/cpufeature.h>
+#include <asm/cpu_device_id.h>
+#include <asm/hypervisor.h>
+#include <asm/mshyperv.h>
#include <asm/tlbflush.h>
-#include <asm/pat.h>
+#include <asm/mtrr.h>
+#include <asm/msr.h>
+#include <asm/memtype.h>
+
#include "mtrr.h"
struct fixed_range_block {
- int base_msr; /* start address of an MTRR block */
- int ranges; /* number of MTRRs in this block */
+ int base_msr; /* start address of an MTRR block */
+ int ranges; /* number of MTRRs in this block */
};
static struct fixed_range_block fixed_range_blocks[] = {
- { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */
- { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */
- { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */
+ { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */
+ { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */
+ { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */
{}
};
+struct cache_map {
+ u64 start;
+ u64 end;
+ u64 flags;
+ u64 type:8;
+ u64 fixed:1;
+};
+
+bool mtrr_debug;
+
+static int __init mtrr_param_setup(char *str)
+{
+ int rc = 0;
+
+ if (!str)
+ return -EINVAL;
+ if (!strcmp(str, "debug"))
+ mtrr_debug = true;
+ else
+ rc = -EINVAL;
+
+ return rc;
+}
+early_param("mtrr", mtrr_param_setup);
+
+/*
+ * CACHE_MAP_MAX is the maximum number of memory ranges in cache_map, where
+ * no 2 adjacent ranges have the same cache mode (those would be merged).
+ * The number is based on the worst case:
+ * - no two adjacent fixed MTRRs share the same cache mode
+ * - one variable MTRR is spanning a huge area with mode WB
+ * - 255 variable MTRRs with mode UC all overlap with the WB MTRR, creating 2
+ * additional ranges each (result like "ababababa...aba" with a = WB, b = UC),
+ * accounting for MTRR_MAX_VAR_RANGES * 2 - 1 range entries
+ * - a TOP_MEM2 area (even with overlapping an UC MTRR can't add 2 range entries
+ * to the possible maximum, as it always starts at 4GB, thus it can't be in
+ * the middle of that MTRR, unless that MTRR starts at 0, which would remove
+ * the initial "a" from the "abababa" pattern above)
+ * The map won't contain ranges with no matching MTRR (those fall back to the
+ * default cache mode).
+ */
+#define CACHE_MAP_MAX (MTRR_NUM_FIXED_RANGES + MTRR_MAX_VAR_RANGES * 2)
+
+static struct cache_map init_cache_map[CACHE_MAP_MAX] __initdata;
+static struct cache_map *cache_map __refdata = init_cache_map;
+static unsigned int cache_map_size = CACHE_MAP_MAX;
+static unsigned int cache_map_n;
+static unsigned int cache_map_fixed;
+
static unsigned long smp_changes_mask;
static int mtrr_state_set;
u64 mtrr_tom2;
-struct mtrr_state_type mtrr_state = {};
-EXPORT_SYMBOL_GPL(mtrr_state);
+struct mtrr_state_type mtrr_state;
-/**
+/* Reserved bits in the high portion of the MTRRphysBaseN MSR. */
+u32 phys_hi_rsvd;
+
+/*
* BIOS is expected to clear MtrrFixDramModEn bit, see for example
* "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD
* Opteron Processors" (26094 Rev. 3.30 February 2006), section
* "13.2.1.2 SYSCFG Register": "The MtrrFixDramModEn bit should be set
- * to 1 during BIOS initalization of the fixed MTRRs, then cleared to
+ * to 1 during BIOS initialization of the fixed MTRRs, then cleared to
* 0 for operation."
*/
static inline void k8_check_syscfg_dram_mod_en(void)
@@ -49,122 +109,451 @@ static inline void k8_check_syscfg_dram_mod_en(void)
(boot_cpu_data.x86 >= 0x0f)))
return;
- rdmsr(MSR_K8_SYSCFG, lo, hi);
+ if (cc_platform_has(CC_ATTR_HOST_SEV_SNP))
+ return;
+
+ rdmsr(MSR_AMD64_SYSCFG, lo, hi);
if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) {
- printk(KERN_ERR FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]"
+ pr_err(FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]"
" not cleared by BIOS, clearing this bit\n",
smp_processor_id());
lo &= ~K8_MTRRFIXRANGE_DRAM_MODIFY;
- mtrr_wrmsr(MSR_K8_SYSCFG, lo, hi);
+ mtrr_wrmsr(MSR_AMD64_SYSCFG, lo, hi);
+ }
+}
+
+/* Get the size of contiguous MTRR range */
+static u64 get_mtrr_size(u64 mask)
+{
+ u64 size;
+
+ mask |= (u64)phys_hi_rsvd << 32;
+ size = -mask;
+
+ return size;
+}
+
+static u8 get_var_mtrr_state(unsigned int reg, u64 *start, u64 *size)
+{
+ struct mtrr_var_range *mtrr = mtrr_state.var_ranges + reg;
+
+ if (!(mtrr->mask_lo & MTRR_PHYSMASK_V))
+ return MTRR_TYPE_INVALID;
+
+ *start = (((u64)mtrr->base_hi) << 32) + (mtrr->base_lo & PAGE_MASK);
+ *size = get_mtrr_size((((u64)mtrr->mask_hi) << 32) +
+ (mtrr->mask_lo & PAGE_MASK));
+
+ return mtrr->base_lo & MTRR_PHYSBASE_TYPE;
+}
+
+static u8 get_effective_type(u8 type1, u8 type2)
+{
+ if (type1 == MTRR_TYPE_UNCACHABLE || type2 == MTRR_TYPE_UNCACHABLE)
+ return MTRR_TYPE_UNCACHABLE;
+
+ if ((type1 == MTRR_TYPE_WRBACK && type2 == MTRR_TYPE_WRTHROUGH) ||
+ (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK))
+ return MTRR_TYPE_WRTHROUGH;
+
+ if (type1 != type2)
+ return MTRR_TYPE_UNCACHABLE;
+
+ return type1;
+}
+
+static void rm_map_entry_at(int idx)
+{
+ cache_map_n--;
+ if (cache_map_n > idx) {
+ memmove(cache_map + idx, cache_map + idx + 1,
+ sizeof(*cache_map) * (cache_map_n - idx));
}
}
/*
- * Returns the effective MTRR type for the region
- * Error returns:
- * - 0xFE - when the range is "not entirely covered" by _any_ var range MTRR
- * - 0xFF - when MTRR is not enabled
+ * Add an entry into cache_map at a specific index. Merges adjacent entries if
+ * appropriate. Return the number of merges for correcting the scan index
+ * (this is needed as merging will reduce the number of entries, which will
+ * result in skipping entries in future iterations if the scan index isn't
+ * corrected).
+ * Note that the corrected index can never go below -1 (resulting in being 0 in
+ * the next scan iteration), as "2" is returned only if the current index is
+ * larger than zero.
*/
-u8 mtrr_type_lookup(u64 start, u64 end)
+static int add_map_entry_at(u64 start, u64 end, u8 type, int idx)
{
- int i;
- u64 base, mask;
- u8 prev_match, curr_match;
-
- if (!mtrr_state_set)
- return 0xFF;
-
- if (!mtrr_state.enabled)
- return 0xFF;
-
- /* Make end inclusive end, instead of exclusive */
- end--;
-
- /* Look in fixed ranges. Just return the type as per start */
- if (mtrr_state.have_fixed && (start < 0x100000)) {
- int idx;
-
- if (start < 0x80000) {
- idx = 0;
- idx += (start >> 16);
- return mtrr_state.fixed_ranges[idx];
- } else if (start < 0xC0000) {
- idx = 1 * 8;
- idx += ((start - 0x80000) >> 14);
- return mtrr_state.fixed_ranges[idx];
- } else if (start < 0x1000000) {
- idx = 3 * 8;
- idx += ((start - 0xC0000) >> 12);
- return mtrr_state.fixed_ranges[idx];
- }
+ bool merge_prev = false, merge_next = false;
+
+ if (start >= end)
+ return 0;
+
+ if (idx > 0) {
+ struct cache_map *prev = cache_map + idx - 1;
+
+ if (!prev->fixed && start == prev->end && type == prev->type)
+ merge_prev = true;
}
- /*
- * Look in variable ranges
- * Look of multiple ranges matching this address and pick type
- * as per MTRR precedence
- */
- if (!(mtrr_state.enabled & 2)) {
- return mtrr_state.def_type;
+ if (idx < cache_map_n) {
+ struct cache_map *next = cache_map + idx;
+
+ if (!next->fixed && end == next->start && type == next->type)
+ merge_next = true;
}
- prev_match = 0xFF;
- for (i = 0; i < num_var_ranges; ++i) {
- unsigned short start_state, end_state;
+ if (merge_prev && merge_next) {
+ cache_map[idx - 1].end = cache_map[idx].end;
+ rm_map_entry_at(idx);
+ return 2;
+ }
+ if (merge_prev) {
+ cache_map[idx - 1].end = end;
+ return 1;
+ }
+ if (merge_next) {
+ cache_map[idx].start = start;
+ return 1;
+ }
- if (!(mtrr_state.var_ranges[i].mask_lo & (1 << 11)))
- continue;
+ /* Sanity check: the array should NEVER be too small! */
+ if (cache_map_n == cache_map_size) {
+ WARN(1, "MTRR cache mode memory map exhausted!\n");
+ cache_map_n = cache_map_fixed;
+ return 0;
+ }
+
+ if (cache_map_n > idx) {
+ memmove(cache_map + idx + 1, cache_map + idx,
+ sizeof(*cache_map) * (cache_map_n - idx));
+ }
+
+ cache_map[idx].start = start;
+ cache_map[idx].end = end;
+ cache_map[idx].type = type;
+ cache_map[idx].fixed = 0;
+ cache_map_n++;
+
+ return 0;
+}
+
+/* Clear a part of an entry. Return 1 if start of entry is still valid. */
+static int clr_map_range_at(u64 start, u64 end, int idx)
+{
+ int ret = start != cache_map[idx].start;
+ u64 tmp;
+
+ if (start == cache_map[idx].start && end == cache_map[idx].end) {
+ rm_map_entry_at(idx);
+ } else if (start == cache_map[idx].start) {
+ cache_map[idx].start = end;
+ } else if (end == cache_map[idx].end) {
+ cache_map[idx].end = start;
+ } else {
+ tmp = cache_map[idx].end;
+ cache_map[idx].end = start;
+ add_map_entry_at(end, tmp, cache_map[idx].type, idx + 1);
+ }
+
+ return ret;
+}
- base = (((u64)mtrr_state.var_ranges[i].base_hi) << 32) +
- (mtrr_state.var_ranges[i].base_lo & PAGE_MASK);
- mask = (((u64)mtrr_state.var_ranges[i].mask_hi) << 32) +
- (mtrr_state.var_ranges[i].mask_lo & PAGE_MASK);
+/*
+ * Add MTRR to the map. The current map is scanned and each part of the MTRR
+ * either overlapping with an existing entry or with a hole in the map is
+ * handled separately.
+ */
+static void add_map_entry(u64 start, u64 end, u8 type)
+{
+ u8 new_type, old_type;
+ u64 tmp;
+ int i;
- start_state = ((start & mask) == (base & mask));
- end_state = ((end & mask) == (base & mask));
- if (start_state != end_state)
- return 0xFE;
+ for (i = 0; i < cache_map_n && start < end; i++) {
+ if (start >= cache_map[i].end)
+ continue;
- if ((start & mask) != (base & mask)) {
+ if (start < cache_map[i].start) {
+ /* Region start has no overlap. */
+ tmp = min(end, cache_map[i].start);
+ i -= add_map_entry_at(start, tmp, type, i);
+ start = tmp;
continue;
}
- curr_match = mtrr_state.var_ranges[i].base_lo & 0xff;
- if (prev_match == 0xFF) {
- prev_match = curr_match;
+ new_type = get_effective_type(type, cache_map[i].type);
+ old_type = cache_map[i].type;
+
+ if (cache_map[i].fixed || new_type == old_type) {
+ /* Cut off start of new entry. */
+ start = cache_map[i].end;
continue;
}
- if (prev_match == MTRR_TYPE_UNCACHABLE ||
- curr_match == MTRR_TYPE_UNCACHABLE) {
- return MTRR_TYPE_UNCACHABLE;
+ /* Handle only overlapping part of region. */
+ tmp = min(end, cache_map[i].end);
+ i += clr_map_range_at(start, tmp, i);
+ i -= add_map_entry_at(start, tmp, new_type, i);
+ start = tmp;
+ }
+
+ /* Add rest of region after last map entry (rest might be empty). */
+ add_map_entry_at(start, end, type, i);
+}
+
+/* Add variable MTRRs to cache map. */
+static void map_add_var(void)
+{
+ u64 start, size;
+ unsigned int i;
+ u8 type;
+
+ /*
+ * Add AMD TOP_MEM2 area. Can't be added in mtrr_build_map(), as it
+ * needs to be added again when rebuilding the map due to potentially
+ * having moved as a result of variable MTRRs for memory below 4GB.
+ */
+ if (mtrr_tom2) {
+ add_map_entry(BIT_ULL(32), mtrr_tom2, MTRR_TYPE_WRBACK);
+ cache_map[cache_map_n - 1].fixed = 1;
+ }
+
+ for (i = 0; i < num_var_ranges; i++) {
+ type = get_var_mtrr_state(i, &start, &size);
+ if (type != MTRR_TYPE_INVALID)
+ add_map_entry(start, start + size, type);
+ }
+}
+
+/*
+ * Rebuild map by replacing variable entries. Needs to be called when MTRR
+ * registers are being changed after boot, as such changes could include
+ * removals of registers, which are complicated to handle without rebuild of
+ * the map.
+ */
+void generic_rebuild_map(void)
+{
+ if (mtrr_if != &generic_mtrr_ops)
+ return;
+
+ cache_map_n = cache_map_fixed;
+
+ map_add_var();
+}
+
+static unsigned int __init get_cache_map_size(void)
+{
+ return cache_map_fixed + 2 * num_var_ranges + (mtrr_tom2 != 0);
+}
+
+/* Build the cache_map containing the cache modes per memory range. */
+void __init mtrr_build_map(void)
+{
+ u64 start, end, size;
+ unsigned int i;
+ u8 type;
+
+ /* Add fixed MTRRs, optimize for adjacent entries with same type. */
+ if (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED) {
+ /*
+ * Start with 64k size fixed entries, preset 1st one (hence the
+ * loop below is starting with index 1).
+ */
+ start = 0;
+ end = size = 0x10000;
+ type = mtrr_state.fixed_ranges[0];
+
+ for (i = 1; i < MTRR_NUM_FIXED_RANGES; i++) {
+ /* 8 64k entries, then 16 16k ones, rest 4k. */
+ if (i == 8 || i == 24)
+ size >>= 2;
+
+ if (mtrr_state.fixed_ranges[i] != type) {
+ add_map_entry(start, end, type);
+ start = end;
+ type = mtrr_state.fixed_ranges[i];
+ }
+ end += size;
}
+ add_map_entry(start, end, type);
+ }
+
+ /* Mark fixed, they take precedence. */
+ for (i = 0; i < cache_map_n; i++)
+ cache_map[i].fixed = 1;
+ cache_map_fixed = cache_map_n;
- if ((prev_match == MTRR_TYPE_WRBACK &&
- curr_match == MTRR_TYPE_WRTHROUGH) ||
- (prev_match == MTRR_TYPE_WRTHROUGH &&
- curr_match == MTRR_TYPE_WRBACK)) {
- prev_match = MTRR_TYPE_WRTHROUGH;
- curr_match = MTRR_TYPE_WRTHROUGH;
+ map_add_var();
+
+ pr_info("MTRR map: %u entries (%u fixed + %u variable; max %u), built from %u variable MTRRs\n",
+ cache_map_n, cache_map_fixed, cache_map_n - cache_map_fixed,
+ get_cache_map_size(), num_var_ranges + (mtrr_tom2 != 0));
+
+ if (mtrr_debug) {
+ for (i = 0; i < cache_map_n; i++) {
+ pr_info("%3u: %016llx-%016llx %s\n", i,
+ cache_map[i].start, cache_map[i].end - 1,
+ mtrr_attrib_to_str(cache_map[i].type));
}
+ }
+}
- if (prev_match != curr_match) {
- return MTRR_TYPE_UNCACHABLE;
+/* Copy the cache_map from __initdata memory to dynamically allocated one. */
+void __init mtrr_copy_map(void)
+{
+ unsigned int new_size = get_cache_map_size();
+
+ if (!mtrr_state.enabled || !new_size) {
+ cache_map = NULL;
+ return;
+ }
+
+ mutex_lock(&mtrr_mutex);
+
+ cache_map = kcalloc(new_size, sizeof(*cache_map), GFP_KERNEL);
+ if (cache_map) {
+ memmove(cache_map, init_cache_map,
+ cache_map_n * sizeof(*cache_map));
+ cache_map_size = new_size;
+ } else {
+ mtrr_state.enabled = 0;
+ pr_err("MTRRs disabled due to allocation failure for lookup map.\n");
+ }
+
+ mutex_unlock(&mtrr_mutex);
+}
+
+/**
+ * guest_force_mtrr_state - set static MTRR state for a guest
+ *
+ * Used to set MTRR state via different means (e.g. with data obtained from
+ * a hypervisor).
+ * Is allowed only for special cases when running virtualized. Must be called
+ * from the x86_init.hyper.init_platform() hook. It can be called only once.
+ * The MTRR state can't be changed afterwards. To ensure that, X86_FEATURE_MTRR
+ * is cleared.
+ *
+ * @var: MTRR variable range array to use
+ * @num_var: length of the @var array
+ * @def_type: default caching type
+ */
+void guest_force_mtrr_state(struct mtrr_var_range *var, unsigned int num_var,
+ mtrr_type def_type)
+{
+ unsigned int i;
+
+ /* Only allowed to be called once before mtrr_bp_init(). */
+ if (WARN_ON_ONCE(mtrr_state_set))
+ return;
+
+ /* Only allowed when running virtualized. */
+ if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR))
+ return;
+
+ /*
+ * Only allowed for special virtualization cases:
+ * - when running as Hyper-V, SEV-SNP guest using vTOM
+ * - when running as Xen PV guest
+ * - when running as SEV-SNP or TDX guest to avoid unnecessary
+ * VMM communication/Virtualization exceptions (#VC, #VE)
+ */
+ if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP) &&
+ !hv_is_isolation_supported() &&
+ !cpu_feature_enabled(X86_FEATURE_XENPV) &&
+ !cpu_feature_enabled(X86_FEATURE_TDX_GUEST))
+ return;
+
+ /* Disable MTRR in order to disable MTRR modifications. */
+ setup_clear_cpu_cap(X86_FEATURE_MTRR);
+
+ if (var) {
+ if (num_var > MTRR_MAX_VAR_RANGES) {
+ pr_warn("Trying to overwrite MTRR state with %u variable entries\n",
+ num_var);
+ num_var = MTRR_MAX_VAR_RANGES;
}
+ for (i = 0; i < num_var; i++)
+ mtrr_state.var_ranges[i] = var[i];
+ num_var_ranges = num_var;
}
- if (mtrr_tom2) {
- if (start >= (1ULL<<32) && (end < mtrr_tom2))
- return MTRR_TYPE_WRBACK;
+ mtrr_state.def_type = def_type;
+ mtrr_state.enabled |= MTRR_STATE_MTRR_ENABLED;
+
+ mtrr_state_set = 1;
+}
+
+static u8 type_merge(u8 type, u8 new_type, u8 *uniform)
+{
+ u8 effective_type;
+
+ if (type == MTRR_TYPE_INVALID)
+ return new_type;
+
+ effective_type = get_effective_type(type, new_type);
+ if (type != effective_type)
+ *uniform = 0;
+
+ return effective_type;
+}
+
+/**
+ * mtrr_type_lookup - look up memory type in MTRR
+ *
+ * @start: Begin of the physical address range
+ * @end: End of the physical address range
+ * @uniform: output argument:
+ * - 1: the returned MTRR type is valid for the whole region
+ * - 0: otherwise
+ *
+ * Return Values:
+ * MTRR_TYPE_(type) - The effective MTRR type for the region
+ * MTRR_TYPE_INVALID - MTRR is disabled
+ */
+u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform)
+{
+ u8 type = MTRR_TYPE_INVALID;
+ unsigned int i;
+
+ if (!mtrr_state_set) {
+ /* Uniformity is unknown. */
+ *uniform = 0;
+ return MTRR_TYPE_UNCACHABLE;
}
- if (prev_match != 0xFF)
- return prev_match;
+ *uniform = 1;
+
+ if (!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED))
+ return MTRR_TYPE_UNCACHABLE;
+
+ for (i = 0; i < cache_map_n && start < end; i++) {
+ /* Region after current map entry? -> continue with next one. */
+ if (start >= cache_map[i].end)
+ continue;
- return mtrr_state.def_type;
+ /* Start of region not covered by current map entry? */
+ if (start < cache_map[i].start) {
+ /* At least some part of region has default type. */
+ type = type_merge(type, mtrr_state.def_type, uniform);
+ /* End of region not covered, too? -> lookup done. */
+ if (end <= cache_map[i].start)
+ return type;
+ }
+
+ /* At least part of region covered by map entry. */
+ type = type_merge(type, cache_map[i].type, uniform);
+
+ start = cache_map[i].end;
+ }
+
+ /* End of region past last entry in map? -> use default type. */
+ if (start < end)
+ type = type_merge(type, mtrr_state.def_type, uniform);
+
+ return type;
}
-/* Get the MSR pair relating to a var range */
+/* Get the MSR pair relating to a var range */
static void
get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
{
@@ -172,7 +561,7 @@ get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi);
}
-/* fill the MSR pair relating to a var range */
+/* Fill the MSR pair relating to a var range */
void fill_mtrr_var_range(unsigned int index,
u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi)
{
@@ -186,10 +575,9 @@ void fill_mtrr_var_range(unsigned int index,
vr[index].mask_hi = mask_hi;
}
-static void
-get_fixed_ranges(mtrr_type * frs)
+static void get_fixed_ranges(mtrr_type *frs)
{
- unsigned int *p = (unsigned int *) frs;
+ unsigned int *p = (unsigned int *)frs;
int i;
k8_check_syscfg_dram_mod_en();
@@ -204,7 +592,7 @@ get_fixed_ranges(mtrr_type * frs)
void mtrr_save_fixed_ranges(void *info)
{
- if (cpu_has_mtrr)
+ if (mtrr_state.have_fixed)
get_fixed_ranges(mtrr_state.fixed_ranges);
}
@@ -217,22 +605,22 @@ static void __init print_fixed_last(void)
if (!last_fixed_end)
return;
- printk(KERN_DEBUG " %05X-%05X %s\n", last_fixed_start,
+ pr_info(" %05X-%05X %s\n", last_fixed_start,
last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type));
last_fixed_end = 0;
}
static void __init update_fixed_last(unsigned base, unsigned end,
- mtrr_type type)
+ mtrr_type type)
{
last_fixed_start = base;
last_fixed_end = end;
last_fixed_type = type;
}
-static void __init print_fixed(unsigned base, unsigned step,
- const mtrr_type *types)
+static void __init
+print_fixed(unsigned base, unsigned step, const mtrr_type *types)
{
unsigned i;
@@ -251,67 +639,63 @@ static void __init print_fixed(unsigned base, unsigned step,
}
}
-static void prepare_set(void);
-static void post_set(void);
-
static void __init print_mtrr_state(void)
{
unsigned int i;
int high_width;
- printk(KERN_DEBUG "MTRR default type: %s\n",
- mtrr_attrib_to_str(mtrr_state.def_type));
+ pr_info("MTRR default type: %s\n",
+ mtrr_attrib_to_str(mtrr_state.def_type));
if (mtrr_state.have_fixed) {
- printk(KERN_DEBUG "MTRR fixed ranges %sabled:\n",
- mtrr_state.enabled & 1 ? "en" : "dis");
+ pr_info("MTRR fixed ranges %s:\n",
+ str_enabled_disabled(
+ (mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) &&
+ (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)));
print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
for (i = 0; i < 2; ++i)
- print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8);
+ print_fixed(0x80000 + i * 0x20000, 0x04000,
+ mtrr_state.fixed_ranges + (i + 1) * 8);
for (i = 0; i < 8; ++i)
- print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8);
+ print_fixed(0xC0000 + i * 0x08000, 0x01000,
+ mtrr_state.fixed_ranges + (i + 3) * 8);
/* tail */
print_fixed_last();
}
- printk(KERN_DEBUG "MTRR variable ranges %sabled:\n",
- mtrr_state.enabled & 2 ? "en" : "dis");
- if (size_or_mask & 0xffffffffUL)
- high_width = ffs(size_or_mask & 0xffffffffUL) - 1;
- else
- high_width = ffs(size_or_mask>>32) + 32 - 1;
- high_width = (high_width - (32 - PAGE_SHIFT) + 3) / 4;
+ pr_info("MTRR variable ranges %s:\n",
+ str_enabled_disabled(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED));
+ high_width = (boot_cpu_data.x86_phys_bits - (32 - PAGE_SHIFT) + 3) / 4;
+
for (i = 0; i < num_var_ranges; ++i) {
- if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
- printk(KERN_DEBUG " %u base %0*X%05X000 mask %0*X%05X000 %s\n",
- i,
- high_width,
- mtrr_state.var_ranges[i].base_hi,
- mtrr_state.var_ranges[i].base_lo >> 12,
- high_width,
- mtrr_state.var_ranges[i].mask_hi,
- mtrr_state.var_ranges[i].mask_lo >> 12,
- mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff));
+ if (mtrr_state.var_ranges[i].mask_lo & MTRR_PHYSMASK_V)
+ pr_info(" %u base %0*X%05X000 mask %0*X%05X000 %s\n",
+ i,
+ high_width,
+ mtrr_state.var_ranges[i].base_hi,
+ mtrr_state.var_ranges[i].base_lo >> 12,
+ high_width,
+ mtrr_state.var_ranges[i].mask_hi,
+ mtrr_state.var_ranges[i].mask_lo >> 12,
+ mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo &
+ MTRR_PHYSBASE_TYPE));
else
- printk(KERN_DEBUG " %u disabled\n", i);
- }
- if (mtrr_tom2) {
- printk(KERN_DEBUG "TOM2: %016llx aka %lldM\n",
- mtrr_tom2, mtrr_tom2>>20);
+ pr_info(" %u disabled\n", i);
}
+ if (mtrr_tom2)
+ pr_info("TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20);
}
-/* Grab all of the MTRR state for this CPU into *state */
-void __init get_mtrr_state(void)
+/* Grab all of the MTRR state for this CPU into *state */
+bool __init get_mtrr_state(void)
{
- unsigned int i;
struct mtrr_var_range *vrs;
unsigned lo, dummy;
- unsigned long flags;
+ unsigned int i;
vrs = mtrr_state.var_ranges;
rdmsr(MSR_MTRRcap, lo, dummy);
- mtrr_state.have_fixed = (lo >> 8) & 1;
+ mtrr_state.have_fixed = lo & MTRR_CAP_FIX;
for (i = 0; i < num_var_ranges; i++)
get_mtrr_var_range(i, &vrs[i]);
@@ -319,11 +703,12 @@ void __init get_mtrr_state(void)
get_fixed_ranges(mtrr_state.fixed_ranges);
rdmsr(MSR_MTRRdefType, lo, dummy);
- mtrr_state.def_type = (lo & 0xff);
- mtrr_state.enabled = (lo & 0xc00) >> 10;
+ mtrr_state.def_type = lo & MTRR_DEF_TYPE_TYPE;
+ mtrr_state.enabled = (lo & MTRR_DEF_TYPE_ENABLE) >> MTRR_STATE_SHIFT;
if (amd_special_default_mtrr()) {
unsigned low, high;
+
/* TOP_MEM2 */
rdmsr(MSR_K8_TOP_MEM2, low, high);
mtrr_tom2 = high;
@@ -332,22 +717,15 @@ void __init get_mtrr_state(void)
mtrr_tom2 &= 0xffffff800000ULL;
}
- print_mtrr_state();
+ if (mtrr_debug)
+ print_mtrr_state();
mtrr_state_set = 1;
- /* PAT setup for BP. We need to go through sync steps here */
- local_irq_save(flags);
- prepare_set();
-
- pat_init();
-
- post_set();
- local_irq_restore(flags);
-
+ return !!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED);
}
-/* Some BIOS's are fucked and don't set all MTRRs the same! */
+/* Some BIOS's are messed up and don't set all MTRRs the same! */
void __init mtrr_state_warn(void)
{
unsigned long mask = smp_changes_mask;
@@ -355,28 +733,32 @@ void __init mtrr_state_warn(void)
if (!mask)
return;
if (mask & MTRR_CHANGE_MASK_FIXED)
- printk(KERN_WARNING "mtrr: your CPUs had inconsistent fixed MTRR settings\n");
+ pr_warn("mtrr: your CPUs had inconsistent fixed MTRR settings\n");
if (mask & MTRR_CHANGE_MASK_VARIABLE)
- printk(KERN_WARNING "mtrr: your CPUs had inconsistent variable MTRR settings\n");
+ pr_warn("mtrr: your CPUs had inconsistent variable MTRR settings\n");
if (mask & MTRR_CHANGE_MASK_DEFTYPE)
- printk(KERN_WARNING "mtrr: your CPUs had inconsistent MTRRdefType settings\n");
- printk(KERN_INFO "mtrr: probably your BIOS does not setup all CPUs.\n");
- printk(KERN_INFO "mtrr: corrected configuration.\n");
+ pr_warn("mtrr: your CPUs had inconsistent MTRRdefType settings\n");
+
+ pr_info("mtrr: probably your BIOS does not setup all CPUs.\n");
+ pr_info("mtrr: corrected configuration.\n");
}
-/* Doesn't attempt to pass an error out to MTRR users
- because it's quite complicated in some cases and probably not
- worth it because the best error handling is to ignore it. */
+/*
+ * Doesn't attempt to pass an error out to MTRR users
+ * because it's quite complicated in some cases and probably not
+ * worth it because the best error handling is to ignore it.
+ */
void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b)
{
- if (wrmsr_safe(msr, a, b) < 0)
- printk(KERN_ERR
- "MTRR: CPU %u: Writing MSR %x to %x:%x failed\n",
+ if (wrmsr_safe(msr, a, b) < 0) {
+ pr_err("MTRR: CPU %u: Writing MSR %x to %x:%x failed\n",
smp_processor_id(), msr, a, b);
+ }
}
/**
- * set_fixed_range - checks & updates a fixed-range MTRR if it differs from the value it should have
+ * set_fixed_range - checks & updates a fixed-range MTRR if it
+ * differs from the value it should have
 * @msr: MSR address of the MTRR which should be checked and updated
* @changed: pointer which indicates whether the MTRR needed to be changed
* @msrwords: pointer to the MSR values which the MSR should have
@@ -401,40 +783,43 @@ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
*
* Returns: The index of the region on success, else negative on error.
*/
-int generic_get_free_region(unsigned long base, unsigned long size, int replace_reg)
+int
+generic_get_free_region(unsigned long base, unsigned long size, int replace_reg)
{
- int i, max;
- mtrr_type ltype;
unsigned long lbase, lsize;
+ mtrr_type ltype;
+ int i, max;
max = num_var_ranges;
if (replace_reg >= 0 && replace_reg < max)
return replace_reg;
+
for (i = 0; i < max; ++i) {
mtrr_if->get(i, &lbase, &lsize, &ltype);
if (lsize == 0)
return i;
}
+
return -ENOSPC;
}
static void generic_get_mtrr(unsigned int reg, unsigned long *base,
unsigned long *size, mtrr_type *type)
{
- unsigned int mask_lo, mask_hi, base_lo, base_hi;
- unsigned int tmp, hi;
- int cpu;
+ u32 mask_lo, mask_hi, base_lo, base_hi;
+ unsigned int hi;
+ u64 tmp, mask;
/*
* get_mtrr doesn't need to update mtrr_state, also it could be called
* from any cpu, so try to print it out directly.
*/
- cpu = get_cpu();
+ get_cpu();
rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
- if ((mask_lo & 0x800) == 0) {
- /* Invalid (i.e. free) range */
+ if (!(mask_lo & MTRR_PHYSMASK_V)) {
+ /* Invalid (i.e. free) range */
*base = 0;
*size = 0;
*type = 0;
@@ -444,17 +829,18 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi);
/* Work out the shifted address mask: */
- tmp = mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT;
- mask_lo = size_or_mask | tmp;
+ tmp = (u64)mask_hi << 32 | (mask_lo & PAGE_MASK);
+ mask = (u64)phys_hi_rsvd << 32 | tmp;
/* Expand tmp with high bits to all 1s: */
- hi = fls(tmp);
+ hi = fls64(tmp);
if (hi > 0) {
- tmp |= ~((1<<(hi - 1)) - 1);
+ tmp |= ~((1ULL<<(hi - 1)) - 1);
- if (tmp != mask_lo) {
- WARN_ONCE(1, KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n");
- mask_lo = tmp;
+ if (tmp != mask) {
+ pr_warn("mtrr: your BIOS has configured an incorrect mask, fixing it.\n");
+ add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
+ mask = tmp;
}
}
@@ -462,54 +848,57 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
* This works correctly if size is a power of two, i.e. a
* contiguous range:
*/
- *size = -mask_lo;
- *base = base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT;
- *type = base_lo & 0xff;
+ *size = -mask >> PAGE_SHIFT;
+ *base = (u64)base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT;
+ *type = base_lo & MTRR_PHYSBASE_TYPE;
out_put_cpu:
put_cpu();
}
/**
- * set_fixed_ranges - checks & updates the fixed-range MTRRs if they differ from the saved set
+ * set_fixed_ranges - checks & updates the fixed-range MTRRs if they
+ * differ from the saved set
* @frs: pointer to fixed-range MTRR values, saved by get_fixed_ranges()
*/
-static int set_fixed_ranges(mtrr_type * frs)
+static int set_fixed_ranges(mtrr_type *frs)
{
- unsigned long long *saved = (unsigned long long *) frs;
+ unsigned long long *saved = (unsigned long long *)frs;
bool changed = false;
- int block=-1, range;
+ int block = -1, range;
k8_check_syscfg_dram_mod_en();
- while (fixed_range_blocks[++block].ranges)
- for (range=0; range < fixed_range_blocks[block].ranges; range++)
- set_fixed_range(fixed_range_blocks[block].base_msr + range,
- &changed, (unsigned int *) saved++);
+ while (fixed_range_blocks[++block].ranges) {
+ for (range = 0; range < fixed_range_blocks[block].ranges; range++)
+ set_fixed_range(fixed_range_blocks[block].base_msr + range,
+ &changed, (unsigned int *)saved++);
+ }
return changed;
}
-/* Set the MSR pair relating to a var range. Returns TRUE if
- changes are made */
+/*
+ * Set the MSR pair relating to a var range.
+ * Returns true if changes are made.
+ */
static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr)
{
unsigned int lo, hi;
bool changed = false;
rdmsr(MTRRphysBase_MSR(index), lo, hi);
- if ((vr->base_lo & 0xfffff0ffUL) != (lo & 0xfffff0ffUL)
- || (vr->base_hi & (size_and_mask >> (32 - PAGE_SHIFT))) !=
- (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) {
+ if ((vr->base_lo & ~MTRR_PHYSBASE_RSVD) != (lo & ~MTRR_PHYSBASE_RSVD)
+ || (vr->base_hi & ~phys_hi_rsvd) != (hi & ~phys_hi_rsvd)) {
+
mtrr_wrmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi);
changed = true;
}
rdmsr(MTRRphysMask_MSR(index), lo, hi);
- if ((vr->mask_lo & 0xfffff800UL) != (lo & 0xfffff800UL)
- || (vr->mask_hi & (size_and_mask >> (32 - PAGE_SHIFT))) !=
- (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) {
+ if ((vr->mask_lo & ~MTRR_PHYSMASK_RSVD) != (lo & ~MTRR_PHYSMASK_RSVD)
+ || (vr->mask_hi & ~phys_hi_rsvd) != (hi & ~phys_hi_rsvd)) {
mtrr_wrmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi);
changed = true;
}
@@ -521,126 +910,83 @@ static u32 deftype_lo, deftype_hi;
/**
* set_mtrr_state - Set the MTRR state for this CPU.
*
- * NOTE: The CPU must already be in a safe state for MTRR changes.
+ * NOTE: The CPU must already be in a safe state for MTRR changes, including
+ * measures that only a single CPU can be active in set_mtrr_state() in
+ * order to not be subject to races for usage of deftype_lo. This is
+ * accomplished by taking cache_disable_lock.
* RETURNS: 0 if no changes made, else a mask indicating what was changed.
*/
static unsigned long set_mtrr_state(void)
{
- unsigned int i;
unsigned long change_mask = 0;
+ unsigned int i;
- for (i = 0; i < num_var_ranges; i++)
+ for (i = 0; i < num_var_ranges; i++) {
if (set_mtrr_var_ranges(i, &mtrr_state.var_ranges[i]))
change_mask |= MTRR_CHANGE_MASK_VARIABLE;
+ }
if (mtrr_state.have_fixed && set_fixed_ranges(mtrr_state.fixed_ranges))
change_mask |= MTRR_CHANGE_MASK_FIXED;
- /* Set_mtrr_restore restores the old value of MTRRdefType,
- so to set it we fiddle with the saved value */
- if ((deftype_lo & 0xff) != mtrr_state.def_type
- || ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) {
- deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type | (mtrr_state.enabled << 10);
+ /*
+ * Set_mtrr_restore restores the old value of MTRRdefType,
+ * so to set it we fiddle with the saved value:
+ */
+ if ((deftype_lo & MTRR_DEF_TYPE_TYPE) != mtrr_state.def_type ||
+ ((deftype_lo & MTRR_DEF_TYPE_ENABLE) >> MTRR_STATE_SHIFT) != mtrr_state.enabled) {
+
+ deftype_lo = (deftype_lo & MTRR_DEF_TYPE_DISABLE) |
+ mtrr_state.def_type |
+ (mtrr_state.enabled << MTRR_STATE_SHIFT);
change_mask |= MTRR_CHANGE_MASK_DEFTYPE;
}
return change_mask;
}
-
-static unsigned long cr4 = 0;
-static DEFINE_SPINLOCK(set_atomicity_lock);
-
-/*
- * Since we are disabling the cache don't allow any interrupts - they
- * would run extremely slow and would only increase the pain. The caller must
- * ensure that local interrupts are disabled and are reenabled after post_set()
- * has been called.
- */
-
-static void prepare_set(void) __acquires(set_atomicity_lock)
+void mtrr_disable(void)
{
- unsigned long cr0;
-
- /* Note that this is not ideal, since the cache is only flushed/disabled
- for this CPU while the MTRRs are changed, but changing this requires
- more invasive changes to the way the kernel boots */
-
- spin_lock(&set_atomicity_lock);
-
- /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
- cr0 = read_cr0() | X86_CR0_CD;
- write_cr0(cr0);
- wbinvd();
-
- /* Save value of CR4 and clear Page Global Enable (bit 7) */
- if ( cpu_has_pge ) {
- cr4 = read_cr4();
- write_cr4(cr4 & ~X86_CR4_PGE);
- }
-
- /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
- __flush_tlb();
-
- /* Save MTRR state */
+ /* Save MTRR state */
rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
- /* Disable MTRRs, and set the default type to uncached */
- mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi);
+ /* Disable MTRRs, and set the default type to uncached */
+ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & MTRR_DEF_TYPE_DISABLE, deftype_hi);
}
-static void post_set(void) __releases(set_atomicity_lock)
+void mtrr_enable(void)
{
- /* Flush TLBs (no need to flush caches - they are disabled) */
- __flush_tlb();
-
/* Intel (P6) standard MTRRs */
mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
-
- /* Enable caches */
- write_cr0(read_cr0() & 0xbfffffff);
-
- /* Restore value of CR4 */
- if ( cpu_has_pge )
- write_cr4(cr4);
- spin_unlock(&set_atomicity_lock);
}
-static void generic_set_all(void)
+void mtrr_generic_set_state(void)
{
unsigned long mask, count;
- unsigned long flags;
-
- local_irq_save(flags);
- prepare_set();
/* Actually set the state */
mask = set_mtrr_state();
- /* also set PAT */
- pat_init();
-
- post_set();
- local_irq_restore(flags);
-
- /* Use the atomic bitops to update the global mask */
- for (count = 0; count < sizeof mask * 8; ++count) {
+ /* Use the atomic bitops to update the global mask */
+ for (count = 0; count < sizeof(mask) * 8; ++count) {
if (mask & 0x01)
set_bit(count, &smp_changes_mask);
mask >>= 1;
}
-
}
+/**
+ * generic_set_mtrr - set variable MTRR register on the local CPU.
+ *
+ * @reg: The register to set.
+ * @base: The base address of the region.
+ * @size: The size of the region. If this is 0 the region is disabled.
+ * @type: The type of the region.
+ *
+ * Returns nothing.
+ */
static void generic_set_mtrr(unsigned int reg, unsigned long base,
unsigned long size, mtrr_type type)
-/* [SUMMARY] Set variable MTRR register on the local CPU.
- <reg> The register to set.
- <base> The base address of the region.
- <size> The size of the region. If this is 0 the region is disabled.
- <type> The type of the region.
- [RETURNS] Nothing.
-*/
{
unsigned long flags;
struct mtrr_var_range *vr;
@@ -648,67 +994,72 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base,
vr = &mtrr_state.var_ranges[reg];
local_irq_save(flags);
- prepare_set();
+ cache_disable();
if (size == 0) {
- /* The invalid bit is kept in the mask, so we simply clear the
- relevant mask register to disable a range. */
+ /*
+ * The invalid bit is kept in the mask, so we simply
+ * clear the relevant mask register to disable a range.
+ */
mtrr_wrmsr(MTRRphysMask_MSR(reg), 0, 0);
memset(vr, 0, sizeof(struct mtrr_var_range));
} else {
vr->base_lo = base << PAGE_SHIFT | type;
- vr->base_hi = (base & size_and_mask) >> (32 - PAGE_SHIFT);
- vr->mask_lo = -size << PAGE_SHIFT | 0x800;
- vr->mask_hi = (-size & size_and_mask) >> (32 - PAGE_SHIFT);
+ vr->base_hi = (base >> (32 - PAGE_SHIFT)) & ~phys_hi_rsvd;
+ vr->mask_lo = -size << PAGE_SHIFT | MTRR_PHYSMASK_V;
+ vr->mask_hi = (-size >> (32 - PAGE_SHIFT)) & ~phys_hi_rsvd;
mtrr_wrmsr(MTRRphysBase_MSR(reg), vr->base_lo, vr->base_hi);
mtrr_wrmsr(MTRRphysMask_MSR(reg), vr->mask_lo, vr->mask_hi);
}
- post_set();
+ cache_enable();
local_irq_restore(flags);
}
-int generic_validate_add_page(unsigned long base, unsigned long size, unsigned int type)
+int generic_validate_add_page(unsigned long base, unsigned long size,
+ unsigned int type)
{
unsigned long lbase, last;
- /* For Intel PPro stepping <= 7, must be 4 MiB aligned
- and not touch 0x70000000->0x7003FFFF */
- if (is_cpu(INTEL) && boot_cpu_data.x86 == 6 &&
- boot_cpu_data.x86_model == 1 &&
- boot_cpu_data.x86_mask <= 7) {
+ /*
+ * For Intel PPro stepping <= 7
+ * must be 4 MiB aligned and not touch 0x70000000 -> 0x7003FFFF
+ */
+ if (mtrr_if == &generic_mtrr_ops && boot_cpu_data.x86_vfm == INTEL_PENTIUM_PRO &&
+ boot_cpu_data.x86_stepping <= 7) {
if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) {
- printk(KERN_WARNING "mtrr: base(0x%lx000) is not 4 MiB aligned\n", base);
+ pr_warn("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base);
return -EINVAL;
}
if (!(base + size < 0x70000 || base > 0x7003F) &&
(type == MTRR_TYPE_WRCOMB
|| type == MTRR_TYPE_WRBACK)) {
- printk(KERN_WARNING "mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n");
+ pr_warn("mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n");
return -EINVAL;
}
}
- /* Check upper bits of base and last are equal and lower bits are 0
- for base and 1 for last */
+ /*
+ * Check upper bits of base and last are equal and lower bits are 0
+ * for base and 1 for last
+ */
last = base + size - 1;
for (lbase = base; !(lbase & 1) && (last & 1);
- lbase = lbase >> 1, last = last >> 1) ;
+ lbase = lbase >> 1, last = last >> 1)
+ ;
if (lbase != last) {
- printk(KERN_WARNING "mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n",
- base, size);
+ pr_warn("mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", base, size);
return -EINVAL;
}
return 0;
}
-
static int generic_have_wrcomb(void)
{
unsigned long config, dummy;
rdmsr(MSR_MTRRcap, config, dummy);
- return (config & (1 << 10));
+ return config & MTRR_CAP_WC;
}
int positive_have_wrcomb(void)
@@ -716,14 +1067,13 @@ int positive_have_wrcomb(void)
return 1;
}
-/* generic structure...
+/*
+ * Generic structure...
*/
-struct mtrr_ops generic_mtrr_ops = {
- .use_intel_if = 1,
- .set_all = generic_set_all,
- .get = generic_get_mtrr,
- .get_free_region = generic_get_free_region,
- .set = generic_set_mtrr,
- .validate_add_page = generic_validate_add_page,
- .have_wrcomb = generic_have_wrcomb,
+const struct mtrr_ops generic_mtrr_ops = {
+ .get = generic_get_mtrr,
+ .get_free_region = generic_get_free_region,
+ .set = generic_set_mtrr,
+ .validate_add_page = generic_validate_add_page,
+ .have_wrcomb = generic_have_wrcomb,
};
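The cache_map rework in generic.c above resolves overlapping cache modes with get_effective_type(), which mtrr_type_lookup() then folds across the map through type_merge(). A minimal userspace sketch of just that precedence rule follows (UC dominates, WB combined with WT degrades to WT, and any other mismatch falls back to UC); the numeric values mirror the MTRR_TYPE_* constants and the main() driver is illustrative only.

#include <stdio.h>

#define MTRR_TYPE_UNCACHABLE	0
#define MTRR_TYPE_WRTHROUGH	4
#define MTRR_TYPE_WRBACK	6

/* Same decision tree as get_effective_type() in the hunk above. */
static unsigned char get_effective_type(unsigned char type1, unsigned char type2)
{
	if (type1 == MTRR_TYPE_UNCACHABLE || type2 == MTRR_TYPE_UNCACHABLE)
		return MTRR_TYPE_UNCACHABLE;

	if ((type1 == MTRR_TYPE_WRBACK && type2 == MTRR_TYPE_WRTHROUGH) ||
	    (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK))
		return MTRR_TYPE_WRTHROUGH;

	if (type1 != type2)
		return MTRR_TYPE_UNCACHABLE;

	return type1;
}

int main(void)
{
	printf("WB+WT -> %u (WT)\n",
	       get_effective_type(MTRR_TYPE_WRBACK, MTRR_TYPE_WRTHROUGH));
	printf("WB+UC -> %u (UC)\n",
	       get_effective_type(MTRR_TYPE_WRBACK, MTRR_TYPE_UNCACHABLE));
	printf("WB+WB -> %u (WB)\n",
	       get_effective_type(MTRR_TYPE_WRBACK, MTRR_TYPE_WRBACK));
	return 0;
}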
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index fb73a52913a4..4049235b1bfe 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -1,27 +1,30 @@
-#include <linux/init.h>
-#include <linux/proc_fs.h>
+// SPDX-License-Identifier: GPL-2.0
#include <linux/capability.h>
-#include <linux/ctype.h>
-#include <linux/module.h>
#include <linux/seq_file.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
+#include <linux/proc_fs.h>
+#include <linux/ctype.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/init.h>
#define LINE_SIZE 80
#include <asm/mtrr.h>
+
#include "mtrr.h"
#define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private)
static const char *const mtrr_strings[MTRR_NUM_TYPES] =
{
- "uncachable", /* 0 */
- "write-combining", /* 1 */
- "?", /* 2 */
- "?", /* 3 */
- "write-through", /* 4 */
- "write-protect", /* 5 */
- "write-back", /* 6 */
+ "uncachable", /* 0 */
+ "write-combining", /* 1 */
+ "?", /* 2 */
+ "?", /* 3 */
+ "write-through", /* 4 */
+ "write-protect", /* 5 */
+ "write-back", /* 6 */
};
const char *mtrr_attrib_to_str(int x)
@@ -35,12 +38,12 @@ static int
mtrr_file_add(unsigned long base, unsigned long size,
unsigned int type, bool increment, struct file *file, int page)
{
+ unsigned int *fcount = FILE_FCOUNT(file);
int reg, max;
- unsigned int *fcount = FILE_FCOUNT(file);
max = num_var_ranges;
if (fcount == NULL) {
- fcount = kzalloc(max * sizeof *fcount, GFP_KERNEL);
+ fcount = kcalloc(max, sizeof(*fcount), GFP_KERNEL);
if (!fcount)
return -ENOMEM;
FILE_FCOUNT(file) = fcount;
@@ -61,8 +64,8 @@ static int
mtrr_file_del(unsigned long base, unsigned long size,
struct file *file, int page)
{
- int reg;
unsigned int *fcount = FILE_FCOUNT(file);
+ int reg;
if (!page) {
if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1)))
@@ -81,34 +84,33 @@ mtrr_file_del(unsigned long base, unsigned long size,
return reg;
}
-/* RED-PEN: seq_file can seek now. this is ignored. */
+/*
+ * seq_file can seek but we ignore it.
+ *
+ * Format of control line:
+ * "base=%Lx size=%Lx type=%s" or "disable=%d"
+ */
static ssize_t
mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
-/* Format of control line:
- "base=%Lx size=%Lx type=%s" OR:
- "disable=%d"
-*/
{
int i, err;
unsigned long reg;
unsigned long long base, size;
char *ptr;
char line[LINE_SIZE];
- size_t linelen;
+ int length;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
- if (!len)
- return -EINVAL;
memset(line, 0, LINE_SIZE);
- if (len > LINE_SIZE)
- len = LINE_SIZE;
- if (copy_from_user(line, buf, len - 1))
- return -EFAULT;
- linelen = strlen(line);
- ptr = line + linelen - 1;
- if (linelen && *ptr == '\n')
+
+ len = min_t(size_t, len, LINE_SIZE - 1);
+ length = strncpy_from_user(line, buf, len);
+ if (length < 0)
+ return length;
+
+ ptr = line + length - 1;
+ if (length && *ptr == '\n')
*ptr = '\0';
+
if (!strncmp(line, "disable=", 8)) {
reg = simple_strtoul(line + 8, &ptr, 0);
err = mtrr_del_page(reg, 0, 0);
@@ -116,33 +118,35 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
return err;
return len;
}
+
if (strncmp(line, "base=", 5))
return -EINVAL;
+
base = simple_strtoull(line + 5, &ptr, 0);
- for (; isspace(*ptr); ++ptr) ;
+ ptr = skip_spaces(ptr);
+
if (strncmp(ptr, "size=", 5))
return -EINVAL;
+
size = simple_strtoull(ptr + 5, &ptr, 0);
if ((base & 0xfff) || (size & 0xfff))
return -EINVAL;
- for (; isspace(*ptr); ++ptr) ;
+ ptr = skip_spaces(ptr);
+
if (strncmp(ptr, "type=", 5))
return -EINVAL;
- ptr += 5;
- for (; isspace(*ptr); ++ptr) ;
- for (i = 0; i < MTRR_NUM_TYPES; ++i) {
- if (strcmp(ptr, mtrr_strings[i]))
- continue;
- base >>= PAGE_SHIFT;
- size >>= PAGE_SHIFT;
- err =
- mtrr_add_page((unsigned long) base, (unsigned long) size, i,
- true);
- if (err < 0)
- return err;
- return len;
- }
- return -EINVAL;
+ ptr = skip_spaces(ptr + 5);
+
+ i = match_string(mtrr_strings, MTRR_NUM_TYPES, ptr);
+ if (i < 0)
+ return i;
+
+ base >>= PAGE_SHIFT;
+ size >>= PAGE_SHIFT;
+ err = mtrr_add_page((unsigned long)base, (unsigned long)size, i, true);
+ if (err < 0)
+ return err;
+ return len;
}
static long
@@ -150,11 +154,14 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
{
int err = 0;
mtrr_type type;
+ unsigned long base;
unsigned long size;
struct mtrr_sentry sentry;
struct mtrr_gentry gentry;
void __user *arg = (void __user *) __arg;
+ memset(&gentry, 0, sizeof(gentry));
+
switch (cmd) {
case MTRRIOC_ADD_ENTRY:
case MTRRIOC_SET_ENTRY:
@@ -164,12 +171,12 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
case MTRRIOC_SET_PAGE_ENTRY:
case MTRRIOC_DEL_PAGE_ENTRY:
case MTRRIOC_KILL_PAGE_ENTRY:
- if (copy_from_user(&sentry, arg, sizeof sentry))
+ if (copy_from_user(&sentry, arg, sizeof(sentry)))
return -EFAULT;
break;
case MTRRIOC_GET_ENTRY:
case MTRRIOC_GET_PAGE_ENTRY:
- if (copy_from_user(&gentry, arg, sizeof gentry))
+ if (copy_from_user(&gentry, arg, sizeof(gentry)))
return -EFAULT;
break;
#ifdef CONFIG_COMPAT
@@ -181,7 +188,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
case MTRRIOC32_SET_PAGE_ENTRY:
case MTRRIOC32_DEL_PAGE_ENTRY:
case MTRRIOC32_KILL_PAGE_ENTRY: {
- struct mtrr_sentry32 __user *s32 = (struct mtrr_sentry32 __user *)__arg;
+ struct mtrr_sentry32 __user *s32;
+
+ s32 = (struct mtrr_sentry32 __user *)__arg;
err = get_user(sentry.base, &s32->base);
err |= get_user(sentry.size, &s32->size);
err |= get_user(sentry.type, &s32->type);
@@ -191,7 +200,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
}
case MTRRIOC32_GET_ENTRY:
case MTRRIOC32_GET_PAGE_ENTRY: {
- struct mtrr_gentry32 __user *g32 = (struct mtrr_gentry32 __user *)__arg;
+ struct mtrr_gentry32 __user *g32;
+
+ g32 = (struct mtrr_gentry32 __user *)__arg;
err = get_user(gentry.regnum, &g32->regnum);
err |= get_user(gentry.base, &g32->base);
err |= get_user(gentry.size, &g32->size);
@@ -210,8 +221,6 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
#ifdef CONFIG_COMPAT
case MTRRIOC32_ADD_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err =
mtrr_file_add(sentry.base, sentry.size, sentry.type, true,
file, 0);
@@ -220,24 +229,18 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
#ifdef CONFIG_COMPAT
case MTRRIOC32_SET_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err = mtrr_add(sentry.base, sentry.size, sentry.type, false);
break;
case MTRRIOC_DEL_ENTRY:
#ifdef CONFIG_COMPAT
case MTRRIOC32_DEL_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err = mtrr_file_del(sentry.base, sentry.size, file, 0);
break;
case MTRRIOC_KILL_ENTRY:
#ifdef CONFIG_COMPAT
case MTRRIOC32_KILL_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err = mtrr_del(-1, sentry.base, sentry.size);
break;
case MTRRIOC_GET_ENTRY:
@@ -246,14 +249,14 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
#endif
if (gentry.regnum >= num_var_ranges)
return -EINVAL;
- mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);
+ mtrr_if->get(gentry.regnum, &base, &size, &type);
/* Hide entries that go above 4GB */
- if (gentry.base + size - 1 >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT))
+ if (base + size - 1 >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT))
|| size >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT)))
gentry.base = gentry.size = gentry.type = 0;
else {
- gentry.base <<= PAGE_SHIFT;
+ gentry.base = base << PAGE_SHIFT;
gentry.size = size << PAGE_SHIFT;
gentry.type = type;
}
@@ -263,8 +266,6 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
#ifdef CONFIG_COMPAT
case MTRRIOC32_ADD_PAGE_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err =
mtrr_file_add(sentry.base, sentry.size, sentry.type, true,
file, 1);
@@ -273,8 +274,6 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
#ifdef CONFIG_COMPAT
case MTRRIOC32_SET_PAGE_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err =
mtrr_add_page(sentry.base, sentry.size, sentry.type, false);
break;
@@ -282,16 +281,12 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
#ifdef CONFIG_COMPAT
case MTRRIOC32_DEL_PAGE_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err = mtrr_file_del(sentry.base, sentry.size, file, 1);
break;
case MTRRIOC_KILL_PAGE_ENTRY:
#ifdef CONFIG_COMPAT
case MTRRIOC32_KILL_PAGE_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err = mtrr_del_page(-1, sentry.base, sentry.size);
break;
case MTRRIOC_GET_PAGE_ENTRY:
@@ -300,11 +295,12 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
#endif
if (gentry.regnum >= num_var_ranges)
return -EINVAL;
- mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);
+ mtrr_if->get(gentry.regnum, &base, &size, &type);
/* Hide entries that would overflow */
if (size != (__typeof__(gentry.size))size)
gentry.base = gentry.size = gentry.type = 0;
else {
+ gentry.base = base;
gentry.size = size;
gentry.type = type;
}
@@ -314,16 +310,18 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
if (err)
return err;
- switch(cmd) {
+ switch (cmd) {
case MTRRIOC_GET_ENTRY:
case MTRRIOC_GET_PAGE_ENTRY:
- if (copy_to_user(arg, &gentry, sizeof gentry))
+ if (copy_to_user(arg, &gentry, sizeof(gentry)))
err = -EFAULT;
break;
#ifdef CONFIG_COMPAT
case MTRRIOC32_GET_ENTRY:
case MTRRIOC32_GET_PAGE_ENTRY: {
- struct mtrr_gentry32 __user *g32 = (struct mtrr_gentry32 __user *)__arg;
+ struct mtrr_gentry32 __user *g32;
+
+ g32 = (struct mtrr_gentry32 __user *)__arg;
err = put_user(gentry.base, &g32->base);
err |= put_user(gentry.size, &g32->size);
err |= put_user(gentry.regnum, &g32->regnum);
@@ -335,11 +333,10 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
return err;
}
-static int
-mtrr_close(struct inode *ino, struct file *file)
+static int mtrr_close(struct inode *ino, struct file *file)
{
- int i, max;
unsigned int *fcount = FILE_FCOUNT(file);
+ int i, max;
if (fcount != NULL) {
max = num_var_ranges;
@@ -355,60 +352,60 @@ mtrr_close(struct inode *ino, struct file *file)
return single_release(ino, file);
}
-static int mtrr_seq_show(struct seq_file *seq, void *offset);
-
-static int mtrr_open(struct inode *inode, struct file *file)
-{
- if (!mtrr_if)
- return -EIO;
- if (!mtrr_if->get)
- return -ENXIO;
- return single_open(file, mtrr_seq_show, NULL);
-}
-
-static const struct file_operations mtrr_fops = {
- .owner = THIS_MODULE,
- .open = mtrr_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .write = mtrr_write,
- .unlocked_ioctl = mtrr_ioctl,
- .compat_ioctl = mtrr_ioctl,
- .release = mtrr_close,
-};
-
static int mtrr_seq_show(struct seq_file *seq, void *offset)
{
char factor;
- int i, max, len;
+ int i, max;
mtrr_type type;
unsigned long base, size;
- len = 0;
max = num_var_ranges;
for (i = 0; i < max; i++) {
mtrr_if->get(i, &base, &size, &type);
- if (size == 0)
+ if (size == 0) {
mtrr_usage_table[i] = 0;
- else {
- if (size < (0x100000 >> PAGE_SHIFT)) {
- /* less than 1MB */
- factor = 'K';
- size <<= PAGE_SHIFT - 10;
- } else {
- factor = 'M';
- size >>= 20 - PAGE_SHIFT;
- }
- /* RED-PEN: base can be > 32bit */
- len += seq_printf(seq,
- "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n",
- i, base, base >> (20 - PAGE_SHIFT), size, factor,
- mtrr_usage_table[i], mtrr_attrib_to_str(type));
+ continue;
}
+ if (size < (0x100000 >> PAGE_SHIFT)) {
+ /* less than 1MB */
+ factor = 'K';
+ size <<= PAGE_SHIFT - 10;
+ } else {
+ factor = 'M';
+ size >>= 20 - PAGE_SHIFT;
+ }
+ /* Base can be > 32bit */
+ seq_printf(seq, "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n",
+ i, base, base >> (20 - PAGE_SHIFT),
+ size, factor,
+ mtrr_usage_table[i], mtrr_attrib_to_str(type));
}
return 0;
}
+static int mtrr_open(struct inode *inode, struct file *file)
+{
+ if (!mtrr_if)
+ return -EIO;
+ if (!mtrr_if->get)
+ return -ENXIO;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ return single_open(file, mtrr_seq_show, NULL);
+}
+
+static const struct proc_ops mtrr_proc_ops = {
+ .proc_open = mtrr_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = mtrr_write,
+ .proc_ioctl = mtrr_ioctl,
+#ifdef CONFIG_COMPAT
+ .proc_compat_ioctl = mtrr_ioctl,
+#endif
+ .proc_release = mtrr_close,
+};
+
static int __init mtrr_if_init(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
@@ -419,9 +416,8 @@ static int __init mtrr_if_init(void)
(!cpu_has(c, X86_FEATURE_CENTAUR_MCR)))
return -ENODEV;
- proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_fops);
+ proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_proc_ops);
return 0;
}
-
arch_initcall(mtrr_if_init);
#endif /* CONFIG_PROC_FS */
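The mtrr_write() rewrite in if.c above keeps the long-standing control-line format ("base=%Lx size=%Lx type=%s" or "disable=%d"), now parsed with strncpy_from_user(), skip_spaces() and match_string(). A hedged userspace sketch of driving it is shown below; the base and size are made-up example values, both must be page aligned, the type string must match one of the mtrr_strings entries, and the caller needs CAP_SYS_ADMIN.

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	FILE *f = fopen("/proc/mtrr", "w");

	if (!f) {
		perror("fopen /proc/mtrr");
		return EXIT_FAILURE;
	}

	/* One control line per write; stdio will normally flush it as a single write(). */
	if (fprintf(f, "base=0xd0000000 size=0x1000000 type=write-combining\n") < 0)
		perror("write /proc/mtrr");

	if (fclose(f))
		perror("close /proc/mtrr");

	return 0;
}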
diff --git a/arch/x86/kernel/cpu/mtrr/legacy.c b/arch/x86/kernel/cpu/mtrr/legacy.c
new file mode 100644
index 000000000000..2415ffaaf02c
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/legacy.c
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/syscore_ops.h>
+#include <asm/cpufeature.h>
+#include <asm/mtrr.h>
+#include <asm/processor.h>
+#include "mtrr.h"
+
+void mtrr_set_if(void)
+{
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_AMD:
+ /* Pre-Athlon (K6) AMD CPU MTRRs */
+ if (cpu_feature_enabled(X86_FEATURE_K6_MTRR))
+ mtrr_if = &amd_mtrr_ops;
+ break;
+ case X86_VENDOR_CENTAUR:
+ if (cpu_feature_enabled(X86_FEATURE_CENTAUR_MCR))
+ mtrr_if = &centaur_mtrr_ops;
+ break;
+ case X86_VENDOR_CYRIX:
+ if (cpu_feature_enabled(X86_FEATURE_CYRIX_ARR))
+ mtrr_if = &cyrix_mtrr_ops;
+ break;
+ default:
+ break;
+ }
+}
+
+/*
+ * The suspend/resume methods are only for CPUs without MTRR. CPUs using generic
+ * MTRR driver don't require this.
+ */
+struct mtrr_value {
+ mtrr_type ltype;
+ unsigned long lbase;
+ unsigned long lsize;
+};
+
+static struct mtrr_value *mtrr_value;
+
+static int mtrr_save(void)
+{
+ int i;
+
+ if (!mtrr_value)
+ return -ENOMEM;
+
+ for (i = 0; i < num_var_ranges; i++) {
+ mtrr_if->get(i, &mtrr_value[i].lbase,
+ &mtrr_value[i].lsize,
+ &mtrr_value[i].ltype);
+ }
+ return 0;
+}
+
+static void mtrr_restore(void)
+{
+ int i;
+
+ for (i = 0; i < num_var_ranges; i++) {
+ if (mtrr_value[i].lsize) {
+ mtrr_if->set(i, mtrr_value[i].lbase,
+ mtrr_value[i].lsize,
+ mtrr_value[i].ltype);
+ }
+ }
+}
+
+static struct syscore_ops mtrr_syscore_ops = {
+ .suspend = mtrr_save,
+ .resume = mtrr_restore,
+};
+
+void mtrr_register_syscore(void)
+{
+ mtrr_value = kcalloc(num_var_ranges, sizeof(*mtrr_value), GFP_KERNEL);
+
+ /*
+ * The CPU has no MTRR and seems to not support SMP. They have
+ * specific drivers, we use a tricky method to support
+ * suspend/resume for them.
+ *
+ * TBD: is there any system with such CPU which supports
+ * suspend/resume? If no, we should remove the code.
+ */
+ register_syscore_ops(&mtrr_syscore_ops);
+}
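The new legacy.c above preserves the pre-generic MTRR setups across suspend by snapshotting every variable range into an mtrr_value slot and replaying the non-empty ones on resume via the registered syscore ops. A standalone sketch of that save/replay pattern follows; fake_get() and fake_set() merely stand in for mtrr_if->get and mtrr_if->set, and the range count of 8 is an assumed example.

#include <stdio.h>

struct mtrr_value {
	unsigned char type;
	unsigned long base;
	unsigned long size;
};

#define NUM_RANGES 8			/* stands in for num_var_ranges */

static struct mtrr_value table[NUM_RANGES];

static void fake_get(int i, struct mtrr_value *v)
{
	/* Pretend only range 0 is in use (16 MB at 0xd0000000, sizes in pages). */
	v->base = i ? 0 : 0xd0000;
	v->size = i ? 0 : 0x1000;
	v->type = i ? 0 : 1;
}

static void fake_set(int i, const struct mtrr_value *v)
{
	printf("restore reg %d: base=%#lx size=%#lx type=%u\n",
	       i, v->base, v->size, v->type);
}

int main(void)
{
	int i;

	/* Suspend path: snapshot all ranges, like mtrr_save(). */
	for (i = 0; i < NUM_RANGES; i++)
		fake_get(i, &table[i]);

	/* Resume path: replay only ranges that were in use, like mtrr_restore(). */
	for (i = 0; i < NUM_RANGES; i++)
		if (table[i].size)
			fake_set(i, &table[i]);

	return 0;
}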
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
deleted file mode 100644
index 8fc248b5aeaf..000000000000
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ /dev/null
@@ -1,750 +0,0 @@
-/* Generic MTRR (Memory Type Range Register) driver.
-
- Copyright (C) 1997-2000 Richard Gooch
- Copyright (c) 2002 Patrick Mochel
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Library General Public
- License as published by the Free Software Foundation; either
- version 2 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Library General Public License for more details.
-
- You should have received a copy of the GNU Library General Public
- License along with this library; if not, write to the Free
- Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
- Richard Gooch may be reached by email at rgooch@atnf.csiro.au
- The postal address is:
- Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia.
-
- Source: "Pentium Pro Family Developer's Manual, Volume 3:
- Operating System Writer's Guide" (Intel document number 242692),
- section 11.11.7
-
- This was cleaned and made readable by Patrick Mochel <mochel@osdl.org>
- on 6-7 March 2002.
- Source: Intel Architecture Software Developers Manual, Volume 3:
- System Programming Guide; Section 9.11. (1997 edition - PPro).
-*/
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/pci.h>
-#include <linux/smp.h>
-#include <linux/cpu.h>
-#include <linux/mutex.h>
-#include <linux/sort.h>
-
-#include <asm/e820.h>
-#include <asm/mtrr.h>
-#include <asm/uaccess.h>
-#include <asm/processor.h>
-#include <asm/msr.h>
-#include <asm/kvm_para.h>
-#include "mtrr.h"
-
-u32 num_var_ranges = 0;
-
-unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
-static DEFINE_MUTEX(mtrr_mutex);
-
-u64 size_or_mask, size_and_mask;
-
-static struct mtrr_ops * mtrr_ops[X86_VENDOR_NUM] = {};
-
-struct mtrr_ops * mtrr_if = NULL;
-
-static void set_mtrr(unsigned int reg, unsigned long base,
- unsigned long size, mtrr_type type);
-
-void set_mtrr_ops(struct mtrr_ops * ops)
-{
- if (ops->vendor && ops->vendor < X86_VENDOR_NUM)
- mtrr_ops[ops->vendor] = ops;
-}
-
-/* Returns non-zero if we have the write-combining memory type */
-static int have_wrcomb(void)
-{
- struct pci_dev *dev;
- u8 rev;
-
- if ((dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL)) != NULL) {
- /* ServerWorks LE chipsets < rev 6 have problems with write-combining
- Don't allow it and leave room for other chipsets to be tagged */
- if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS &&
- dev->device == PCI_DEVICE_ID_SERVERWORKS_LE) {
- pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
- if (rev <= 5) {
- printk(KERN_INFO "mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n");
- pci_dev_put(dev);
- return 0;
- }
- }
- /* Intel 450NX errata # 23. Non ascending cacheline evictions to
- write combining memory may resulting in data corruption */
- if (dev->vendor == PCI_VENDOR_ID_INTEL &&
- dev->device == PCI_DEVICE_ID_INTEL_82451NX) {
- printk(KERN_INFO "mtrr: Intel 450NX MMC detected. Write-combining disabled.\n");
- pci_dev_put(dev);
- return 0;
- }
- pci_dev_put(dev);
- }
- return (mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0);
-}
-
-/* This function returns the number of variable MTRRs */
-static void __init set_num_var_ranges(void)
-{
- unsigned long config = 0, dummy;
-
- if (use_intel()) {
- rdmsr(MSR_MTRRcap, config, dummy);
- } else if (is_cpu(AMD))
- config = 2;
- else if (is_cpu(CYRIX) || is_cpu(CENTAUR))
- config = 8;
- num_var_ranges = config & 0xff;
-}
-
-static void __init init_table(void)
-{
- int i, max;
-
- max = num_var_ranges;
- for (i = 0; i < max; i++)
- mtrr_usage_table[i] = 1;
-}
-
-struct set_mtrr_data {
- atomic_t count;
- atomic_t gate;
- unsigned long smp_base;
- unsigned long smp_size;
- unsigned int smp_reg;
- mtrr_type smp_type;
-};
-
-static void ipi_handler(void *info)
-/* [SUMMARY] Synchronisation handler. Executed by "other" CPUs.
- [RETURNS] Nothing.
-*/
-{
-#ifdef CONFIG_SMP
- struct set_mtrr_data *data = info;
- unsigned long flags;
-
- local_irq_save(flags);
-
- atomic_dec(&data->count);
- while(!atomic_read(&data->gate))
- cpu_relax();
-
- /* The master has cleared me to execute */
- if (data->smp_reg != ~0U)
- mtrr_if->set(data->smp_reg, data->smp_base,
- data->smp_size, data->smp_type);
- else
- mtrr_if->set_all();
-
- atomic_dec(&data->count);
- while(atomic_read(&data->gate))
- cpu_relax();
-
- atomic_dec(&data->count);
- local_irq_restore(flags);
-#endif
-}
-
-static inline int types_compatible(mtrr_type type1, mtrr_type type2) {
- return type1 == MTRR_TYPE_UNCACHABLE ||
- type2 == MTRR_TYPE_UNCACHABLE ||
- (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK) ||
- (type1 == MTRR_TYPE_WRBACK && type2 == MTRR_TYPE_WRTHROUGH);
-}
-
-/**
- * set_mtrr - update mtrrs on all processors
- * @reg: mtrr in question
- * @base: mtrr base
- * @size: mtrr size
- * @type: mtrr type
- *
- * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly:
- *
- * 1. Send IPI to do the following:
- * 2. Disable Interrupts
- * 3. Wait for all procs to do so
- * 4. Enter no-fill cache mode
- * 5. Flush caches
- * 6. Clear PGE bit
- * 7. Flush all TLBs
- * 8. Disable all range registers
- * 9. Update the MTRRs
- * 10. Enable all range registers
- * 11. Flush all TLBs and caches again
- * 12. Enter normal cache mode and reenable caching
- * 13. Set PGE
- * 14. Wait for buddies to catch up
- * 15. Enable interrupts.
- *
- * What does that mean for us? Well, first we set data.count to the number
- * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait
- * until it hits 0 and proceed. We set the data.gate flag and reset data.count.
- * Meanwhile, they are waiting for that flag to be set. Once it's set, each
- * CPU goes through the transition of updating MTRRs. The CPU vendors may each do it
- * differently, so we call mtrr_if->set() callback and let them take care of it.
- * When they're done, they again decrement data->count and wait for data.gate to
- * be reset.
- * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag.
- * Everyone then enables interrupts and we all continue on.
- *
- * Note that the mechanism is the same for UP systems, too; all the SMP stuff
- * becomes nops.
- */
-static void set_mtrr(unsigned int reg, unsigned long base,
- unsigned long size, mtrr_type type)
-{
- struct set_mtrr_data data;
- unsigned long flags;
-
- data.smp_reg = reg;
- data.smp_base = base;
- data.smp_size = size;
- data.smp_type = type;
- atomic_set(&data.count, num_booting_cpus() - 1);
- /* make sure data.count is visible before unleashing other CPUs */
- smp_wmb();
- atomic_set(&data.gate,0);
-
- /* Start the ball rolling on other CPUs */
- if (smp_call_function(ipi_handler, &data, 0) != 0)
- panic("mtrr: timed out waiting for other CPUs\n");
-
- local_irq_save(flags);
-
- while(atomic_read(&data.count))
- cpu_relax();
-
- /* ok, reset count and toggle gate */
- atomic_set(&data.count, num_booting_cpus() - 1);
- smp_wmb();
- atomic_set(&data.gate,1);
-
- /* do our MTRR business */
-
- /* HACK!
- * We use this same function to initialize the mtrrs on boot.
- * The state of the boot cpu's mtrrs has been saved, and we want
- * to replicate across all the APs.
- * If we're doing that @reg is set to something special...
- */
- if (reg != ~0U)
- mtrr_if->set(reg,base,size,type);
-
- /* wait for the others */
- while(atomic_read(&data.count))
- cpu_relax();
-
- atomic_set(&data.count, num_booting_cpus() - 1);
- smp_wmb();
- atomic_set(&data.gate,0);
-
- /*
- * Wait here for everyone to have seen the gate change
- * So we're the last ones to touch 'data'
- */
- while(atomic_read(&data.count))
- cpu_relax();
-
- local_irq_restore(flags);
-}
-
-/**
- * mtrr_add_page - Add a memory type region
- * @base: Physical base address of region in pages (in units of 4 kB!)
- * @size: Physical size of region in pages (4 kB)
- * @type: Type of MTRR desired
- * @increment: If this is true do usage counting on the region
- *
- * Memory type region registers control the caching on newer Intel and
- * non Intel processors. This function allows drivers to request an
- * MTRR is added. The details and hardware specifics of each processor's
- * implementation are hidden from the caller, but nevertheless the
- * caller should expect to need to provide a power of two size on an
- * equivalent power of two boundary.
- *
- * If the region cannot be added either because all regions are in use
- * or the CPU cannot support it a negative value is returned. On success
- * the register number for this entry is returned, but should be treated
- * as a cookie only.
- *
- * On a multiprocessor machine the changes are made to all processors.
- * This is required on x86 by the Intel processors.
- *
- * The available types are
- *
- * %MTRR_TYPE_UNCACHABLE - No caching
- *
- * %MTRR_TYPE_WRBACK - Write data back in bursts whenever
- *
- * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts
- *
- * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes
- *
- * BUGS: Needs a quiet flag for the cases where drivers do not mind
- * failures and do not wish system log messages to be sent.
- */
-
-int mtrr_add_page(unsigned long base, unsigned long size,
- unsigned int type, bool increment)
-{
- int i, replace, error;
- mtrr_type ltype;
- unsigned long lbase, lsize;
-
- if (!mtrr_if)
- return -ENXIO;
-
- if ((error = mtrr_if->validate_add_page(base,size,type)))
- return error;
-
- if (type >= MTRR_NUM_TYPES) {
- printk(KERN_WARNING "mtrr: type: %u invalid\n", type);
- return -EINVAL;
- }
-
- /* If the type is WC, check that this processor supports it */
- if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) {
- printk(KERN_WARNING
- "mtrr: your processor doesn't support write-combining\n");
- return -ENOSYS;
- }
-
- if (!size) {
- printk(KERN_WARNING "mtrr: zero sized request\n");
- return -EINVAL;
- }
-
- if (base & size_or_mask || size & size_or_mask) {
- printk(KERN_WARNING "mtrr: base or size exceeds the MTRR width\n");
- return -EINVAL;
- }
-
- error = -EINVAL;
- replace = -1;
-
- /* No CPU hotplug when we change MTRR entries */
- get_online_cpus();
- /* Search for existing MTRR */
- mutex_lock(&mtrr_mutex);
- for (i = 0; i < num_var_ranges; ++i) {
- mtrr_if->get(i, &lbase, &lsize, &ltype);
- if (!lsize || base > lbase + lsize - 1 || base + size - 1 < lbase)
- continue;
- /* At this point we know there is some kind of overlap/enclosure */
- if (base < lbase || base + size - 1 > lbase + lsize - 1) {
- if (base <= lbase && base + size - 1 >= lbase + lsize - 1) {
- /* New region encloses an existing region */
- if (type == ltype) {
- replace = replace == -1 ? i : -2;
- continue;
- }
- else if (types_compatible(type, ltype))
- continue;
- }
- printk(KERN_WARNING
- "mtrr: 0x%lx000,0x%lx000 overlaps existing"
- " 0x%lx000,0x%lx000\n", base, size, lbase,
- lsize);
- goto out;
- }
- /* New region is enclosed by an existing region */
- if (ltype != type) {
- if (types_compatible(type, ltype))
- continue;
- printk (KERN_WARNING "mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n",
- base, size, mtrr_attrib_to_str(ltype),
- mtrr_attrib_to_str(type));
- goto out;
- }
- if (increment)
- ++mtrr_usage_table[i];
- error = i;
- goto out;
- }
- /* Search for an empty MTRR */
- i = mtrr_if->get_free_region(base, size, replace);
- if (i >= 0) {
- set_mtrr(i, base, size, type);
- if (likely(replace < 0)) {
- mtrr_usage_table[i] = 1;
- } else {
- mtrr_usage_table[i] = mtrr_usage_table[replace];
- if (increment)
- mtrr_usage_table[i]++;
- if (unlikely(replace != i)) {
- set_mtrr(replace, 0, 0, 0);
- mtrr_usage_table[replace] = 0;
- }
- }
- } else
- printk(KERN_INFO "mtrr: no more MTRRs available\n");
- error = i;
- out:
- mutex_unlock(&mtrr_mutex);
- put_online_cpus();
- return error;
-}
-
-static int mtrr_check(unsigned long base, unsigned long size)
-{
- if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
- printk(KERN_WARNING
- "mtrr: size and base must be multiples of 4 kiB\n");
- printk(KERN_DEBUG
- "mtrr: size: 0x%lx base: 0x%lx\n", size, base);
- dump_stack();
- return -1;
- }
- return 0;
-}
-
-/**
- * mtrr_add - Add a memory type region
- * @base: Physical base address of region
- * @size: Physical size of region
- * @type: Type of MTRR desired
- * @increment: If this is true do usage counting on the region
- *
- * Memory type region registers control the caching on newer Intel and
- * non Intel processors. This function allows drivers to request an
- * MTRR is added. The details and hardware specifics of each processor's
- * implementation are hidden from the caller, but nevertheless the
- * caller should expect to need to provide a power of two size on an
- * equivalent power of two boundary.
- *
- * If the region cannot be added either because all regions are in use
- * or the CPU cannot support it a negative value is returned. On success
- * the register number for this entry is returned, but should be treated
- * as a cookie only.
- *
- * On a multiprocessor machine the changes are made to all processors.
- * This is required on x86 by the Intel processors.
- *
- * The available types are
- *
- * %MTRR_TYPE_UNCACHABLE - No caching
- *
- * %MTRR_TYPE_WRBACK - Write data back in bursts whenever
- *
- * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts
- *
- * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes
- *
- * BUGS: Needs a quiet flag for the cases where drivers do not mind
- * failures and do not wish system log messages to be sent.
- */
-
-int
-mtrr_add(unsigned long base, unsigned long size, unsigned int type,
- bool increment)
-{
- if (mtrr_check(base, size))
- return -EINVAL;
- return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,
- increment);
-}
-
-/**
- * mtrr_del_page - delete a memory type region
- * @reg: Register returned by mtrr_add
- * @base: Physical base address
- * @size: Size of region
- *
- * If register is supplied then base and size are ignored. This is
- * how drivers should call it.
- *
- * Releases an MTRR region. If the usage count drops to zero the
- * register is freed and the region returns to default state.
- * On success the register is returned, on failure a negative error
- * code.
- */
-
-int mtrr_del_page(int reg, unsigned long base, unsigned long size)
-{
- int i, max;
- mtrr_type ltype;
- unsigned long lbase, lsize;
- int error = -EINVAL;
-
- if (!mtrr_if)
- return -ENXIO;
-
- max = num_var_ranges;
- /* No CPU hotplug when we change MTRR entries */
- get_online_cpus();
- mutex_lock(&mtrr_mutex);
- if (reg < 0) {
- /* Search for existing MTRR */
- for (i = 0; i < max; ++i) {
- mtrr_if->get(i, &lbase, &lsize, &ltype);
- if (lbase == base && lsize == size) {
- reg = i;
- break;
- }
- }
- if (reg < 0) {
- printk(KERN_DEBUG "mtrr: no MTRR for %lx000,%lx000 found\n", base,
- size);
- goto out;
- }
- }
- if (reg >= max) {
- printk(KERN_WARNING "mtrr: register: %d too big\n", reg);
- goto out;
- }
- mtrr_if->get(reg, &lbase, &lsize, &ltype);
- if (lsize < 1) {
- printk(KERN_WARNING "mtrr: MTRR %d not used\n", reg);
- goto out;
- }
- if (mtrr_usage_table[reg] < 1) {
- printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg);
- goto out;
- }
- if (--mtrr_usage_table[reg] < 1)
- set_mtrr(reg, 0, 0, 0);
- error = reg;
- out:
- mutex_unlock(&mtrr_mutex);
- put_online_cpus();
- return error;
-}
-/**
- * mtrr_del - delete a memory type region
- * @reg: Register returned by mtrr_add
- * @base: Physical base address
- * @size: Size of region
- *
- * If register is supplied then base and size are ignored. This is
- * how drivers should call it.
- *
- * Releases an MTRR region. If the usage count drops to zero the
- * register is freed and the region returns to default state.
- * On success the register is returned, on failure a negative error
- * code.
- */
-
-int
-mtrr_del(int reg, unsigned long base, unsigned long size)
-{
- if (mtrr_check(base, size))
- return -EINVAL;
- return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
-}
-
-EXPORT_SYMBOL(mtrr_add);
-EXPORT_SYMBOL(mtrr_del);
-
-/* HACK ALERT!
- * These should be called implicitly, but we can't yet until all the initcall
- * stuff is done...
- */
-static void __init init_ifs(void)
-{
-#ifndef CONFIG_X86_64
- amd_init_mtrr();
- cyrix_init_mtrr();
- centaur_init_mtrr();
-#endif
-}
-
-/* The suspend/resume methods are only for CPU without MTRR. CPU using generic
- * MTRR driver doesn't require this
- */
-struct mtrr_value {
- mtrr_type ltype;
- unsigned long lbase;
- unsigned long lsize;
-};
-
-static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES];
-
-static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
-{
- int i;
-
- for (i = 0; i < num_var_ranges; i++) {
- mtrr_if->get(i,
- &mtrr_value[i].lbase,
- &mtrr_value[i].lsize,
- &mtrr_value[i].ltype);
- }
- return 0;
-}
-
-static int mtrr_restore(struct sys_device * sysdev)
-{
- int i;
-
- for (i = 0; i < num_var_ranges; i++) {
- if (mtrr_value[i].lsize)
- set_mtrr(i,
- mtrr_value[i].lbase,
- mtrr_value[i].lsize,
- mtrr_value[i].ltype);
- }
- return 0;
-}
-
-
-
-static struct sysdev_driver mtrr_sysdev_driver = {
- .suspend = mtrr_save,
- .resume = mtrr_restore,
-};
-
-int __initdata changed_by_mtrr_cleanup;
-
-/**
- * mtrr_bp_init - initialize mtrrs on the boot CPU
- *
- * This needs to be called early; before any of the other CPUs are
- * initialized (i.e. before smp_init()).
- *
- */
-void __init mtrr_bp_init(void)
-{
- u32 phys_addr;
- init_ifs();
-
- phys_addr = 32;
-
- if (cpu_has_mtrr) {
- mtrr_if = &generic_mtrr_ops;
- size_or_mask = 0xff000000; /* 36 bits */
- size_and_mask = 0x00f00000;
- phys_addr = 36;
-
- /* This is an AMD specific MSR, but we assume(hope?) that
- Intel will implement it to when they extend the address
- bus of the Xeon. */
- if (cpuid_eax(0x80000000) >= 0x80000008) {
- phys_addr = cpuid_eax(0x80000008) & 0xff;
- /* CPUID workaround for Intel 0F33/0F34 CPU */
- if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
- boot_cpu_data.x86 == 0xF &&
- boot_cpu_data.x86_model == 0x3 &&
- (boot_cpu_data.x86_mask == 0x3 ||
- boot_cpu_data.x86_mask == 0x4))
- phys_addr = 36;
-
- size_or_mask = ~((1ULL << (phys_addr - PAGE_SHIFT)) - 1);
- size_and_mask = ~size_or_mask & 0xfffff00000ULL;
- } else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR &&
- boot_cpu_data.x86 == 6) {
- /* VIA C* family have Intel style MTRRs, but
- don't support PAE */
- size_or_mask = 0xfff00000; /* 32 bits */
- size_and_mask = 0;
- phys_addr = 32;
- }
- } else {
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_AMD:
- if (cpu_has_k6_mtrr) {
- /* Pre-Athlon (K6) AMD CPU MTRRs */
- mtrr_if = mtrr_ops[X86_VENDOR_AMD];
- size_or_mask = 0xfff00000; /* 32 bits */
- size_and_mask = 0;
- }
- break;
- case X86_VENDOR_CENTAUR:
- if (cpu_has_centaur_mcr) {
- mtrr_if = mtrr_ops[X86_VENDOR_CENTAUR];
- size_or_mask = 0xfff00000; /* 32 bits */
- size_and_mask = 0;
- }
- break;
- case X86_VENDOR_CYRIX:
- if (cpu_has_cyrix_arr) {
- mtrr_if = mtrr_ops[X86_VENDOR_CYRIX];
- size_or_mask = 0xfff00000; /* 32 bits */
- size_and_mask = 0;
- }
- break;
- default:
- break;
- }
- }
-
- if (mtrr_if) {
- set_num_var_ranges();
- init_table();
- if (use_intel()) {
- get_mtrr_state();
-
- if (mtrr_cleanup(phys_addr)) {
- changed_by_mtrr_cleanup = 1;
- mtrr_if->set_all();
- }
-
- }
- }
-}
-
-void mtrr_ap_init(void)
-{
- unsigned long flags;
-
- if (!mtrr_if || !use_intel())
- return;
- /*
- * Ideally we should hold mtrr_mutex here to avoid mtrr entries changed,
- * but this routine will be called in cpu boot time, holding the lock
- * breaks it. This routine is called in two cases: 1.very earily time
- * of software resume, when there absolutely isn't mtrr entry changes;
- * 2.cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug lock to
- * prevent mtrr entry changes
- */
- local_irq_save(flags);
-
- mtrr_if->set_all();
-
- local_irq_restore(flags);
-}
-
-/**
- * Save current fixed-range MTRR state of the BSP
- */
-void mtrr_save_state(void)
-{
- smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1);
-}
-
-static int __init mtrr_init_finialize(void)
-{
- if (!mtrr_if)
- return 0;
- if (use_intel()) {
- if (!changed_by_mtrr_cleanup)
- mtrr_state_warn();
- } else {
- /* The CPUs haven't MTRR and seem to not support SMP. They have
- * specific drivers, we use a tricky method to support
- * suspend/resume for them.
- * TBD: is there any system with such CPU which supports
- * suspend/resume? if no, we should remove the code.
- */
- sysdev_driver_register(&cpu_sysdev_class,
- &mtrr_sysdev_driver);
- }
- return 0;
-}
-subsys_initcall(mtrr_init_finialize);
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c
new file mode 100644
index 000000000000..4b3d492afe17
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.c
@@ -0,0 +1,633 @@
+// SPDX-License-Identifier: LGPL-2.0+
+/* Generic MTRR (Memory Type Range Register) driver.
+
+ Copyright (C) 1997-2000 Richard Gooch
+ Copyright (c) 2002 Patrick Mochel
+
+ Richard Gooch may be reached by email at rgooch@atnf.csiro.au
+ The postal address is:
+ Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia.
+
+ Source: "Pentium Pro Family Developer's Manual, Volume 3:
+ Operating System Writer's Guide" (Intel document number 242692),
+ section 11.11.7
+
+ This was cleaned and made readable by Patrick Mochel <mochel@osdl.org>
+ on 6-7 March 2002.
+ Source: Intel Architecture Software Developers Manual, Volume 3:
+ System Programming Guide; Section 9.11. (1997 edition - PPro).
+*/
+
+#include <linux/types.h> /* FIXME: kvm_para.h needs this */
+
+#include <linux/stop_machine.h>
+#include <linux/kvm_para.h>
+#include <linux/uaccess.h>
+#include <linux/export.h>
+#include <linux/mutex.h>
+#include <linux/init.h>
+#include <linux/sort.h>
+#include <linux/cpu.h>
+#include <linux/pci.h>
+#include <linux/smp.h>
+#include <linux/syscore_ops.h>
+#include <linux/rcupdate.h>
+
+#include <asm/cacheinfo.h>
+#include <asm/cpufeature.h>
+#include <asm/e820/api.h>
+#include <asm/mtrr.h>
+#include <asm/msr.h>
+#include <asm/memtype.h>
+
+#include "mtrr.h"
+
+static_assert(X86_MEMTYPE_UC == MTRR_TYPE_UNCACHABLE);
+static_assert(X86_MEMTYPE_WC == MTRR_TYPE_WRCOMB);
+static_assert(X86_MEMTYPE_WT == MTRR_TYPE_WRTHROUGH);
+static_assert(X86_MEMTYPE_WP == MTRR_TYPE_WRPROT);
+static_assert(X86_MEMTYPE_WB == MTRR_TYPE_WRBACK);
+
+/* arch_phys_wc_add returns an MTRR register index plus this offset. */
+#define MTRR_TO_PHYS_WC_OFFSET 1000
+
+u32 num_var_ranges;
+
+unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
+DEFINE_MUTEX(mtrr_mutex);
+
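+/* Vendor vector chosen in mtrr_bp_init(); NULL means no usable MTRRs. */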
+const struct mtrr_ops *mtrr_if;
+
+/* Returns non-zero if we have the write-combining memory type */
+static int have_wrcomb(void)
+{
+ struct pci_dev *dev;
+
+ dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL);
+ if (dev != NULL) {
+ /*
+ * ServerWorks LE chipsets < rev 6 have problems with
+ * write-combining. Don't allow it and leave room for other
+ * chipsets to be tagged
+ */
+ if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS &&
+ dev->device == PCI_DEVICE_ID_SERVERWORKS_LE &&
+ dev->revision <= 5) {
+ pr_info("Serverworks LE rev < 6 detected. Write-combining disabled.\n");
+ pci_dev_put(dev);
+ return 0;
+ }
+ /*
+ * Intel 450NX errata #23. Non-ascending cacheline evictions to
+ * write-combining memory may result in data corruption
+ */
+ if (dev->vendor == PCI_VENDOR_ID_INTEL &&
+ dev->device == PCI_DEVICE_ID_INTEL_82451NX) {
+ pr_info("Intel 450NX MMC detected. Write-combining disabled.\n");
+ pci_dev_put(dev);
+ return 0;
+ }
+ pci_dev_put(dev);
+ }
+ return mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0;
+}
+
+static void __init init_table(void)
+{
+ int i, max;
+
+ max = num_var_ranges;
+ for (i = 0; i < max; i++)
+ mtrr_usage_table[i] = 1;
+}
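+
+/*
+ * Editor's note: every variable range starts with a use count of 1,
+ * presumably so that a range already programmed by the firmware keeps a
+ * reference of its own and is not cleared when a driver that matched it
+ * later calls mtrr_del().
+ */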
+
+struct set_mtrr_data {
+ unsigned long smp_base;
+ unsigned long smp_size;
+ unsigned int smp_reg;
+ mtrr_type smp_type;
+};
+
+/**
+ * mtrr_rendezvous_handler - Work done in the synchronization handler. Executed
+ * by all the CPUs.
+ * @info: pointer to mtrr configuration data
+ *
+ * Returns zero.
+ */
+static int mtrr_rendezvous_handler(void *info)
+{
+ struct set_mtrr_data *data = info;
+
+ mtrr_if->set(data->smp_reg, data->smp_base,
+ data->smp_size, data->smp_type);
+ return 0;
+}
+
+static inline int types_compatible(mtrr_type type1, mtrr_type type2)
+{
+ return type1 == MTRR_TYPE_UNCACHABLE ||
+ type2 == MTRR_TYPE_UNCACHABLE ||
+ (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK) ||
+ (type1 == MTRR_TYPE_WRBACK && type2 == MTRR_TYPE_WRTHROUGH);
+}
+
+/**
+ * set_mtrr - update mtrrs on all processors
+ * @reg: mtrr in question
+ * @base: mtrr base
+ * @size: mtrr size
+ * @type: mtrr type
+ *
+ * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly:
+ *
+ * 1. Queue work to do the following on all processors:
+ * 2. Disable Interrupts
+ * 3. Wait for all procs to do so
+ * 4. Enter no-fill cache mode
+ * 5. Flush caches
+ * 6. Clear PGE bit
+ * 7. Flush all TLBs
+ * 8. Disable all range registers
+ * 9. Update the MTRRs
+ * 10. Enable all range registers
+ * 11. Flush all TLBs and caches again
+ * 12. Enter normal cache mode and reenable caching
+ * 13. Set PGE
+ * 14. Wait for buddies to catch up
+ * 15. Enable interrupts.
+ *
+ * What does that mean for us? Well, stop_machine() ensures that the
+ * rendezvous handler is started on each CPU. In lockstep they then
+ * disable interrupts, update the MTRRs (the CPU vendors may each do this
+ * differently, so we call the mtrr_if->set() callback and let it take
+ * care of the details) and re-enable interrupts.
+ *
+ * Note that the mechanism is the same for UP systems, too; all the SMP stuff
+ * becomes nops.
+ */
+static void set_mtrr(unsigned int reg, unsigned long base, unsigned long size,
+ mtrr_type type)
+{
+ struct set_mtrr_data data = { .smp_reg = reg,
+ .smp_base = base,
+ .smp_size = size,
+ .smp_type = type
+ };
+
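+ /*
+ * Callers such as mtrr_add_page()/mtrr_del_page() already hold
+ * cpus_read_lock(), hence the _cpuslocked variant.  The rendezvous
+ * handler runs on every online CPU with interrupts disabled, so the
+ * MTRRs are updated in lockstep.
+ */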
+ stop_machine_cpuslocked(mtrr_rendezvous_handler, &data, cpu_online_mask);
+
+ generic_rebuild_map();
+}
+
+/**
+ * mtrr_add_page - Add a memory type region
+ * @base: Physical base address of region in pages (in units of 4 kB!)
+ * @size: Physical size of region in pages (4 kB)
+ * @type: Type of MTRR desired
+ * @increment: If this is true do usage counting on the region
+ *
+ * Memory type region registers control the caching on newer Intel and
+ * non-Intel processors. This function allows drivers to request that an
+ * MTRR be added. The details and hardware specifics of each processor's
+ * implementation are hidden from the caller, but nevertheless the
+ * caller should expect to need to provide a power of two size on an
+ * equivalent power of two boundary.
+ *
+ * If the region cannot be added either because all regions are in use
+ * or the CPU cannot support it a negative value is returned. On success
+ * the register number for this entry is returned, but should be treated
+ * as a cookie only.
+ *
+ * On a multiprocessor machine the changes are made to all processors.
+ * This is required on x86 by the Intel processors.
+ *
+ * The available types are
+ *
+ * %MTRR_TYPE_UNCACHABLE - No caching
+ *
+ * %MTRR_TYPE_WRBACK - Write data back in bursts whenever
+ *
+ * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts
+ *
+ * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes
+ *
+ * BUGS: Needs a quiet flag for the cases where drivers do not mind
+ * failures and do not wish system log messages to be sent.
+ */
+int mtrr_add_page(unsigned long base, unsigned long size,
+ unsigned int type, bool increment)
+{
+ unsigned long lbase, lsize;
+ int i, replace, error;
+ mtrr_type ltype;
+
+ if (!mtrr_enabled())
+ return -ENXIO;
+
+ error = mtrr_if->validate_add_page(base, size, type);
+ if (error)
+ return error;
+
+ if (type >= MTRR_NUM_TYPES) {
+ pr_warn("type: %u invalid\n", type);
+ return -EINVAL;
+ }
+
+ /* If the type is WC, check that this processor supports it */
+ if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) {
+ pr_warn("your processor doesn't support write-combining\n");
+ return -ENOSYS;
+ }
+
+ if (!size) {
+ pr_warn("zero sized request\n");
+ return -EINVAL;
+ }
+
+ if ((base | (base + size - 1)) >>
+ (boot_cpu_data.x86_phys_bits - PAGE_SHIFT)) {
+ pr_warn("base or size exceeds the MTRR width\n");
+ return -EINVAL;
+ }
+
+ error = -EINVAL;
+ replace = -1;
+
+ /* No CPU hotplug when we change MTRR entries */
+ cpus_read_lock();
+
+ /* Search for existing MTRR */
+ mutex_lock(&mtrr_mutex);
+ for (i = 0; i < num_var_ranges; ++i) {
+ mtrr_if->get(i, &lbase, &lsize, &ltype);
+ if (!lsize || base > lbase + lsize - 1 ||
+ base + size - 1 < lbase)
+ continue;
+ /*
+ * At this point we know there is some kind of
+ * overlap/enclosure
+ */
+ if (base < lbase || base + size - 1 > lbase + lsize - 1) {
+ if (base <= lbase &&
+ base + size - 1 >= lbase + lsize - 1) {
+ /* New region encloses an existing region */
+ if (type == ltype) {
+ replace = replace == -1 ? i : -2;
+ continue;
+ } else if (types_compatible(type, ltype))
+ continue;
+ }
+ pr_warn("0x%lx000,0x%lx000 overlaps existing 0x%lx000,0x%lx000\n", base, size, lbase,
+ lsize);
+ goto out;
+ }
+ /* New region is enclosed by an existing region */
+ if (ltype != type) {
+ if (types_compatible(type, ltype))
+ continue;
+ pr_warn("type mismatch for %lx000,%lx000 old: %s new: %s\n",
+ base, size, mtrr_attrib_to_str(ltype),
+ mtrr_attrib_to_str(type));
+ goto out;
+ }
+ if (increment)
+ ++mtrr_usage_table[i];
+ error = i;
+ goto out;
+ }
+ /* Search for an empty MTRR */
+ i = mtrr_if->get_free_region(base, size, replace);
+ if (i >= 0) {
+ set_mtrr(i, base, size, type);
+ if (likely(replace < 0)) {
+ mtrr_usage_table[i] = 1;
+ } else {
+ mtrr_usage_table[i] = mtrr_usage_table[replace];
+ if (increment)
+ mtrr_usage_table[i]++;
+ if (unlikely(replace != i)) {
+ set_mtrr(replace, 0, 0, 0);
+ mtrr_usage_table[replace] = 0;
+ }
+ }
+ } else {
+ pr_info("no more MTRRs available\n");
+ }
+ error = i;
+ out:
+ mutex_unlock(&mtrr_mutex);
+ cpus_read_unlock();
+ return error;
+}
+
+static int mtrr_check(unsigned long base, unsigned long size)
+{
+ if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
+ pr_warn("size and base must be multiples of 4 kiB\n");
+ Dprintk("size: 0x%lx base: 0x%lx\n", size, base);
+ dump_stack();
+ return -1;
+ }
+ return 0;
+}
+
+/**
+ * mtrr_add - Add a memory type region
+ * @base: Physical base address of region
+ * @size: Physical size of region
+ * @type: Type of MTRR desired
+ * @increment: If this is true do usage counting on the region
+ *
+ * Memory type region registers control the caching on newer Intel and
+ * non Intel processors. This function allows drivers to request an
+ * MTRR is added. The details and hardware specifics of each processor's
+ * implementation are hidden from the caller, but nevertheless the
+ * caller should expect to need to provide a power of two size on an
+ * equivalent power of two boundary.
+ *
+ * If the region cannot be added either because all regions are in use
+ * or the CPU cannot support it a negative value is returned. On success
+ * the register number for this entry is returned, but should be treated
+ * as a cookie only.
+ *
+ * On a multiprocessor machine the changes are made to all processors.
+ * This is required on x86 by the Intel processors.
+ *
+ * The available types are
+ *
+ * %MTRR_TYPE_UNCACHABLE - No caching
+ *
+ * %MTRR_TYPE_WRBACK - Write data back in bursts whenever
+ *
+ * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts
+ *
+ * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes
+ *
+ * BUGS: Needs a quiet flag for the cases where drivers do not mind
+ * failures and do not wish system log messages to be sent.
+ */
+int mtrr_add(unsigned long base, unsigned long size, unsigned int type,
+ bool increment)
+{
+ if (!mtrr_enabled())
+ return -ENODEV;
+ if (mtrr_check(base, size))
+ return -EINVAL;
+ return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,
+ increment);
+}
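+
+/*
+ * Editor's note, illustration only (not part of the original change):
+ * a driver mapping a hypothetical framebuffer at fb_base/fb_size as
+ * write-combining would do roughly:
+ *
+ *   int reg = mtrr_add(fb_base, fb_size, MTRR_TYPE_WRCOMB, true);
+ *   ...
+ *   if (reg >= 0)
+ *       mtrr_del(reg, fb_base, fb_size);
+ *
+ * New code should prefer arch_phys_wc_add()/arch_phys_wc_del() below,
+ * which do nothing when PAT is available.
+ */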
+
+/**
+ * mtrr_del_page - delete a memory type region
+ * @reg: Register returned by mtrr_add
+ * @base: Physical base address
+ * @size: Size of region
+ *
+ * If register is supplied then base and size are ignored. This is
+ * how drivers should call it.
+ *
+ * Releases an MTRR region. If the usage count drops to zero the
+ * register is freed and the region returns to default state.
+ * On success the register is returned, on failure a negative error
+ * code.
+ */
+int mtrr_del_page(int reg, unsigned long base, unsigned long size)
+{
+ int i, max;
+ mtrr_type ltype;
+ unsigned long lbase, lsize;
+ int error = -EINVAL;
+
+ if (!mtrr_enabled())
+ return -ENODEV;
+
+ max = num_var_ranges;
+ /* No CPU hotplug when we change MTRR entries */
+ cpus_read_lock();
+ mutex_lock(&mtrr_mutex);
+ if (reg < 0) {
+ /* Search for existing MTRR */
+ for (i = 0; i < max; ++i) {
+ mtrr_if->get(i, &lbase, &lsize, &ltype);
+ if (lbase == base && lsize == size) {
+ reg = i;
+ break;
+ }
+ }
+ if (reg < 0) {
+ Dprintk("no MTRR for %lx000,%lx000 found\n", base, size);
+ goto out;
+ }
+ }
+ if (reg >= max) {
+ pr_warn("register: %d too big\n", reg);
+ goto out;
+ }
+ mtrr_if->get(reg, &lbase, &lsize, &ltype);
+ if (lsize < 1) {
+ pr_warn("MTRR %d not used\n", reg);
+ goto out;
+ }
+ if (mtrr_usage_table[reg] < 1) {
+ pr_warn("reg: %d has count=0\n", reg);
+ goto out;
+ }
+ if (--mtrr_usage_table[reg] < 1)
+ set_mtrr(reg, 0, 0, 0);
+ error = reg;
+ out:
+ mutex_unlock(&mtrr_mutex);
+ cpus_read_unlock();
+ return error;
+}
+
+/**
+ * mtrr_del - delete a memory type region
+ * @reg: Register returned by mtrr_add
+ * @base: Physical base address
+ * @size: Size of region
+ *
+ * If register is supplied then base and size are ignored. This is
+ * how drivers should call it.
+ *
+ * Releases an MTRR region. If the usage count drops to zero the
+ * register is freed and the region returns to default state.
+ * On success the register is returned, on failure a negative error
+ * code.
+ */
+int mtrr_del(int reg, unsigned long base, unsigned long size)
+{
+ if (!mtrr_enabled())
+ return -ENODEV;
+ if (mtrr_check(base, size))
+ return -EINVAL;
+ return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
+}
+
+/**
+ * arch_phys_wc_add - add a WC MTRR and handle errors if PAT is unavailable
+ * @base: Physical base address
+ * @size: Size of region
+ *
+ * If PAT is available, this does nothing. If PAT is unavailable, it
+ * attempts to add a WC MTRR covering size bytes starting at base and
+ * logs an error if this fails.
+ *
+ * The caller should provide a power of two size on an equivalent
+ * power of two boundary.
+ *
+ * Drivers must store the return value to pass to arch_phys_wc_del(),
+ * but drivers should not try to interpret that return value.
+ */
+int arch_phys_wc_add(unsigned long base, unsigned long size)
+{
+ int ret;
+
+ if (pat_enabled() || !mtrr_enabled())
+ return 0; /* Success! (We don't need to do anything.) */
+
+ ret = mtrr_add(base, size, MTRR_TYPE_WRCOMB, true);
+ if (ret < 0) {
+ pr_warn("Failed to add WC MTRR for [%p-%p]; performance may suffer.",
+ (void *)base, (void *)(base + size - 1));
+ return ret;
+ }
+ return ret + MTRR_TO_PHYS_WC_OFFSET;
+}
+EXPORT_SYMBOL(arch_phys_wc_add);
+
+/*
+ * arch_phys_wc_del - undoes arch_phys_wc_add
+ * @handle: Return value from arch_phys_wc_add
+ *
+ * This cleans up after arch_phys_wc_add().
+ *
+ * The API guarantees that arch_phys_wc_del(error code) and
+ * arch_phys_wc_del(0) do nothing.
+ */
+void arch_phys_wc_del(int handle)
+{
+ if (handle >= 1) {
+ WARN_ON(handle < MTRR_TO_PHYS_WC_OFFSET);
+ mtrr_del(handle - MTRR_TO_PHYS_WC_OFFSET, 0, 0);
+ }
+}
+EXPORT_SYMBOL(arch_phys_wc_del);
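+
+/*
+ * Editor's note, illustration only: the intended pairing is
+ *
+ *   wc_cookie = arch_phys_wc_add(bar_start, bar_len);
+ *   ...
+ *   arch_phys_wc_del(wc_cookie);
+ *
+ * where bar_start/bar_len are hypothetical names.  The cookie is opaque;
+ * arch_phys_wc_del() ignores zero and negative values, so error returns
+ * may be passed back to it unconditionally.
+ */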
+
+/*
+ * arch_phys_wc_index - translates arch_phys_wc_add's return value
+ * @handle: Return value from arch_phys_wc_add
+ *
+ * This will turn the return value from arch_phys_wc_add into an mtrr
+ * index suitable for debugging.
+ *
+ * Note: There is no legitimate use for this function, except possibly
+ * in a printk() line. Alas, there is an illegitimate use in some ancient
+ * drm ioctls.
+ */
+int arch_phys_wc_index(int handle)
+{
+ if (handle < MTRR_TO_PHYS_WC_OFFSET)
+ return -1;
+ else
+ return handle - MTRR_TO_PHYS_WC_OFFSET;
+}
+EXPORT_SYMBOL_GPL(arch_phys_wc_index);
+
+int __initdata changed_by_mtrr_cleanup;
+
+/**
+ * mtrr_bp_init - initialize MTRRs on the boot CPU
+ *
+ * This needs to be called early; before any of the other CPUs are
+ * initialized (i.e. before smp_init()).
+ */
+void __init mtrr_bp_init(void)
+{
+ bool generic_mtrrs = cpu_feature_enabled(X86_FEATURE_MTRR);
+ const char *why = "(not available)";
+ unsigned long config, dummy;
+
+ phys_hi_rsvd = GENMASK(31, boot_cpu_data.x86_phys_bits - 32);
+
+ if (!generic_mtrrs && mtrr_state.enabled) {
+ /*
+ * Software overwrite of MTRR state, only for generic case.
+ * Note that X86_FEATURE_MTRR has been reset in this case.
+ */
+ init_table();
+ mtrr_build_map();
+ pr_info("MTRRs set to read-only\n");
+
+ return;
+ }
+
+ if (generic_mtrrs)
+ mtrr_if = &generic_mtrr_ops;
+ else
+ mtrr_set_if();
+
+ if (mtrr_enabled()) {
+ /* Get the number of variable MTRR ranges. */
+ if (mtrr_if == &generic_mtrr_ops)
+ rdmsr(MSR_MTRRcap, config, dummy);
+ else
+ config = mtrr_if->var_regs;
+ num_var_ranges = config & MTRR_CAP_VCNT;
+
+ init_table();
+ if (mtrr_if == &generic_mtrr_ops) {
+ /* BIOS may override */
+ if (get_mtrr_state()) {
+ memory_caching_control |= CACHE_MTRR;
+ changed_by_mtrr_cleanup = mtrr_cleanup();
+ mtrr_build_map();
+ } else {
+ mtrr_if = NULL;
+ why = "by BIOS";
+ }
+ }
+ }
+
+ if (!mtrr_enabled())
+ pr_info("MTRRs disabled %s\n", why);
+}
+
+/**
+ * mtrr_save_state - Save current fixed-range MTRR state of the first
+ * cpu in cpu_online_mask.
+ */
+void mtrr_save_state(void)
+{
+ int first_cpu;
+
+ if (!mtrr_enabled() || !mtrr_state.have_fixed)
+ return;
+
+ first_cpu = cpumask_first(cpu_online_mask);
+ smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1);
+}
+
+static int __init mtrr_init_finalize(void)
+{
+ /*
+ * Map might exist if guest_force_mtrr_state() has been called or if
+ * mtrr_enabled() returns true.
+ */
+ mtrr_copy_map();
+
+ if (!mtrr_enabled())
+ return 0;
+
+ if (memory_caching_control & CACHE_MTRR) {
+ if (!changed_by_mtrr_cleanup)
+ mtrr_state_warn();
+ return 0;
+ }
+
+ mtrr_register_syscore();
+
+ return 0;
+}
+subsys_initcall(mtrr_init_finalize);
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 7538b767f206..2de3bd2f95d1 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
- * local mtrr defines.
+ * local MTRR defines.
*/
#include <linux/types.h>
@@ -9,18 +10,17 @@
#define MTRR_CHANGE_MASK_VARIABLE 0x02
#define MTRR_CHANGE_MASK_DEFTYPE 0x04
+extern bool mtrr_debug;
+#define Dprintk(x...) do { if (mtrr_debug) pr_info(x); } while (0)
+
extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
struct mtrr_ops {
- u32 vendor;
- u32 use_intel_if;
-// void (*init)(void);
+ u32 var_regs;
void (*set)(unsigned int reg, unsigned long base,
unsigned long size, mtrr_type type);
- void (*set_all)(void);
-
void (*get)(unsigned int reg, unsigned long *base,
- unsigned long *size, mtrr_type * type);
+ unsigned long *size, mtrr_type *type);
int (*get_free_region)(unsigned long base, unsigned long size,
int replace_reg);
int (*validate_add_page)(unsigned long base, unsigned long size,
@@ -33,47 +33,58 @@ extern int generic_get_free_region(unsigned long base, unsigned long size,
extern int generic_validate_add_page(unsigned long base, unsigned long size,
unsigned int type);
-extern struct mtrr_ops generic_mtrr_ops;
+extern const struct mtrr_ops generic_mtrr_ops;
extern int positive_have_wrcomb(void);
/* library functions for processor-specific routines */
struct set_mtrr_context {
- unsigned long flags;
- unsigned long cr4val;
- u32 deftype_lo;
- u32 deftype_hi;
- u32 ccr3;
+ unsigned long flags;
+ unsigned long cr4val;
+ u32 deftype_lo;
+ u32 deftype_hi;
+ u32 ccr3;
};
-void set_mtrr_done(struct set_mtrr_context *ctxt);
-void set_mtrr_cache_disable(struct set_mtrr_context *ctxt);
-void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);
-
void fill_mtrr_var_range(unsigned int index,
u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi);
-void get_mtrr_state(void);
-
-extern void set_mtrr_ops(struct mtrr_ops * ops);
-
-extern u64 size_or_mask, size_and_mask;
-extern struct mtrr_ops * mtrr_if;
+bool get_mtrr_state(void);
-#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd)
-#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1)
+extern const struct mtrr_ops *mtrr_if;
+extern struct mutex mtrr_mutex;
extern unsigned int num_var_ranges;
extern u64 mtrr_tom2;
extern struct mtrr_state_type mtrr_state;
+extern u32 phys_hi_rsvd;
void mtrr_state_warn(void);
const char *mtrr_attrib_to_str(int x);
void mtrr_wrmsr(unsigned, unsigned, unsigned);
-
-/* CPU specific mtrr init functions */
-int amd_init_mtrr(void);
-int cyrix_init_mtrr(void);
-int centaur_init_mtrr(void);
+#ifdef CONFIG_X86_32
+void mtrr_set_if(void);
+void mtrr_register_syscore(void);
+#else
+static inline void mtrr_set_if(void) { }
+static inline void mtrr_register_syscore(void) { }
+#endif
+void mtrr_build_map(void);
+void mtrr_copy_map(void);
+
+/* CPU specific mtrr_ops vectors. */
+extern const struct mtrr_ops amd_mtrr_ops;
+extern const struct mtrr_ops cyrix_mtrr_ops;
+extern const struct mtrr_ops centaur_mtrr_ops;
extern int changed_by_mtrr_cleanup;
-extern int mtrr_cleanup(unsigned address_bits);
+extern int mtrr_cleanup(void);
+
+/*
+ * Must be used by code which uses mtrr_if to call platform-specific
+ * MTRR manipulation functions.
+ */
+static inline bool mtrr_enabled(void)
+{
+ return !!mtrr_if;
+}
+void generic_rebuild_map(void);
diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c
deleted file mode 100644
index 1f5fb1588d1f..000000000000
--- a/arch/x86/kernel/cpu/mtrr/state.c
+++ /dev/null
@@ -1,82 +0,0 @@
-#include <linux/mm.h>
-#include <linux/init.h>
-#include <asm/io.h>
-#include <asm/mtrr.h>
-#include <asm/msr.h>
-#include <asm/processor-cyrix.h>
-#include <asm/processor-flags.h>
-#include "mtrr.h"
-
-
-/* Put the processor into a state where MTRRs can be safely set */
-void set_mtrr_prepare_save(struct set_mtrr_context *ctxt)
-{
- unsigned int cr0;
-
- /* Disable interrupts locally */
- local_irq_save(ctxt->flags);
-
- if (use_intel() || is_cpu(CYRIX)) {
-
- /* Save value of CR4 and clear Page Global Enable (bit 7) */
- if (cpu_has_pge) {
- ctxt->cr4val = read_cr4();
- write_cr4(ctxt->cr4val & ~X86_CR4_PGE);
- }
-
- /*
- * Disable and flush caches. Note that wbinvd flushes the TLBs
- * as a side-effect
- */
- cr0 = read_cr0() | X86_CR0_CD;
- wbinvd();
- write_cr0(cr0);
- wbinvd();
-
- if (use_intel())
- /* Save MTRR state */
- rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);
- else
- /* Cyrix ARRs - everything else were excluded at the top */
- ctxt->ccr3 = getCx86(CX86_CCR3);
- }
-}
-
-void set_mtrr_cache_disable(struct set_mtrr_context *ctxt)
-{
- if (use_intel())
- /* Disable MTRRs, and set the default type to uncached */
- mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL,
- ctxt->deftype_hi);
- else if (is_cpu(CYRIX))
- /* Cyrix ARRs - everything else were excluded at the top */
- setCx86(CX86_CCR3, (ctxt->ccr3 & 0x0f) | 0x10);
-}
-
-/* Restore the processor after a set_mtrr_prepare */
-void set_mtrr_done(struct set_mtrr_context *ctxt)
-{
- if (use_intel() || is_cpu(CYRIX)) {
-
- /* Flush caches and TLBs */
- wbinvd();
-
- /* Restore MTRRdefType */
- if (use_intel())
- /* Intel (P6) standard MTRRs */
- mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);
- else
- /* Cyrix ARRs - everything else was excluded at the top */
- setCx86(CX86_CCR3, ctxt->ccr3);
-
- /* Enable caches */
- write_cr0(read_cr0() & 0xbfffffff);
-
- /* Restore value of CR4 */
- if (cpu_has_pge)
- write_cr4(ctxt->cr4val);
- }
- /* Re-enable interrupts locally (if enabled previously) */
- local_irq_restore(ctxt->flags);
-}
-
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
deleted file mode 100644
index 900332b800f8..000000000000
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ /dev/null
@@ -1,1964 +0,0 @@
-/*
- * Performance counter x86 architecture code
- *
- * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
- * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
- * Copyright (C) 2009 Jaswinder Singh Rajput
- * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
- * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
- *
- * For licencing details see kernel-base/COPYING
- */
-
-#include <linux/perf_counter.h>
-#include <linux/capability.h>
-#include <linux/notifier.h>
-#include <linux/hardirq.h>
-#include <linux/kprobes.h>
-#include <linux/module.h>
-#include <linux/kdebug.h>
-#include <linux/sched.h>
-#include <linux/uaccess.h>
-#include <linux/highmem.h>
-
-#include <asm/apic.h>
-#include <asm/stacktrace.h>
-#include <asm/nmi.h>
-
-static u64 perf_counter_mask __read_mostly;
-
-struct cpu_hw_counters {
- struct perf_counter *counters[X86_PMC_IDX_MAX];
- unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
- unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
- unsigned long interrupts;
- int enabled;
-};
-
-/*
- * struct x86_pmu - generic x86 pmu
- */
-struct x86_pmu {
- const char *name;
- int version;
- int (*handle_irq)(struct pt_regs *);
- void (*disable_all)(void);
- void (*enable_all)(void);
- void (*enable)(struct hw_perf_counter *, int);
- void (*disable)(struct hw_perf_counter *, int);
- unsigned eventsel;
- unsigned perfctr;
- u64 (*event_map)(int);
- u64 (*raw_event)(u64);
- int max_events;
- int num_counters;
- int num_counters_fixed;
- int counter_bits;
- u64 counter_mask;
- int apic;
- u64 max_period;
- u64 intel_ctrl;
-};
-
-static struct x86_pmu x86_pmu __read_mostly;
-
-static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
- .enabled = 1,
-};
-
-/*
- * Not sure about some of these
- */
-static const u64 p6_perfmon_event_map[] =
-{
- [PERF_COUNT_HW_CPU_CYCLES] = 0x0079,
- [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
- [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e,
- [PERF_COUNT_HW_CACHE_MISSES] = 0x012e,
- [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
- [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
- [PERF_COUNT_HW_BUS_CYCLES] = 0x0062,
-};
-
-static u64 p6_pmu_event_map(int event)
-{
- return p6_perfmon_event_map[event];
-}
-
-/*
- * Counter setting that is specified not to count anything.
- * We use this to effectively disable a counter.
- *
- * L2_RQSTS with 0 MESI unit mask.
- */
-#define P6_NOP_COUNTER 0x0000002EULL
-
-static u64 p6_pmu_raw_event(u64 event)
-{
-#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL
-#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL
-#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL
-#define P6_EVNTSEL_INV_MASK 0x00800000ULL
-#define P6_EVNTSEL_COUNTER_MASK 0xFF000000ULL
-
-#define P6_EVNTSEL_MASK \
- (P6_EVNTSEL_EVENT_MASK | \
- P6_EVNTSEL_UNIT_MASK | \
- P6_EVNTSEL_EDGE_MASK | \
- P6_EVNTSEL_INV_MASK | \
- P6_EVNTSEL_COUNTER_MASK)
-
- return event & P6_EVNTSEL_MASK;
-}
-
-
-/*
- * Intel PerfMon v3. Used on Core2 and later.
- */
-static const u64 intel_perfmon_event_map[] =
-{
- [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
- [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
- [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
- [PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
- [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
- [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
- [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
-};
-
-static u64 intel_pmu_event_map(int event)
-{
- return intel_perfmon_event_map[event];
-}
-
-/*
- * Generalized hw caching related event table, filled
- * in on a per model basis. A value of 0 means
- * 'not supported', -1 means 'event makes no sense on
- * this CPU', any other value means the raw event
- * ID.
- */
-
-#define C(x) PERF_COUNT_HW_CACHE_##x
-
-static u64 __read_mostly hw_cache_event_ids
- [PERF_COUNT_HW_CACHE_MAX]
- [PERF_COUNT_HW_CACHE_OP_MAX]
- [PERF_COUNT_HW_CACHE_RESULT_MAX];
-
-static const u64 nehalem_hw_cache_event_ids
- [PERF_COUNT_HW_CACHE_MAX]
- [PERF_COUNT_HW_CACHE_OP_MAX]
- [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
- [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
- [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
- [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
- },
- },
- [ C(L1I ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
- [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x0,
- [ C(RESULT_MISS) ] = 0x0,
- },
- },
- [ C(LL ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
- [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */
- [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
- [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
- },
- },
- [ C(DTLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
- [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
- [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x0,
- [ C(RESULT_MISS) ] = 0x0,
- },
- },
- [ C(ITLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
- [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
- [ C(BPU ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
- [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
-};
-
-static const u64 core2_hw_cache_event_ids
- [PERF_COUNT_HW_CACHE_MAX]
- [PERF_COUNT_HW_CACHE_OP_MAX]
- [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
- [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
- [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(L1I ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */
- [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(LL ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
- [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
- [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(DTLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
- [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
- [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(ITLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
- [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
- [ C(BPU ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
- [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
-};
-
-static const u64 atom_hw_cache_event_ids
- [PERF_COUNT_HW_CACHE_MAX]
- [PERF_COUNT_HW_CACHE_OP_MAX]
- [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */
- [ C(RESULT_MISS) ] = 0,
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */
- [ C(RESULT_MISS) ] = 0,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(L1I ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
- [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(LL ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
- [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
- [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(DTLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */
- [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */
- [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(ITLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
- [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
- [ C(BPU ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
- [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
-};
-
-static u64 intel_pmu_raw_event(u64 event)
-{
-#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
-#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
-#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
-#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
-#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL
-
-#define CORE_EVNTSEL_MASK \
- (CORE_EVNTSEL_EVENT_MASK | \
- CORE_EVNTSEL_UNIT_MASK | \
- CORE_EVNTSEL_EDGE_MASK | \
- CORE_EVNTSEL_INV_MASK | \
- CORE_EVNTSEL_COUNTER_MASK)
-
- return event & CORE_EVNTSEL_MASK;
-}
-
-static const u64 amd_hw_cache_event_ids
- [PERF_COUNT_HW_CACHE_MAX]
- [PERF_COUNT_HW_CACHE_OP_MAX]
- [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
- [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
- [ C(RESULT_MISS) ] = 0,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */
- [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */
- },
- },
- [ C(L1I ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */
- [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(LL ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
- [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */
- [ C(RESULT_MISS) ] = 0,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(DTLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
- [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(ITLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */
- [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
- [ C(BPU ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */
- [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
-};
-
-/*
- * AMD Performance Monitor K7 and later.
- */
-static const u64 amd_perfmon_event_map[] =
-{
- [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
- [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
- [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
- [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
- [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
- [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
-};
-
-static u64 amd_pmu_event_map(int event)
-{
- return amd_perfmon_event_map[event];
-}
-
-static u64 amd_pmu_raw_event(u64 event)
-{
-#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL
-#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL
-#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL
-#define K7_EVNTSEL_INV_MASK 0x000800000ULL
-#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL
-
-#define K7_EVNTSEL_MASK \
- (K7_EVNTSEL_EVENT_MASK | \
- K7_EVNTSEL_UNIT_MASK | \
- K7_EVNTSEL_EDGE_MASK | \
- K7_EVNTSEL_INV_MASK | \
- K7_EVNTSEL_COUNTER_MASK)
-
- return event & K7_EVNTSEL_MASK;
-}
-
-/*
- * Propagate counter elapsed time into the generic counter.
- * Can only be executed on the CPU where the counter is active.
- * Returns the delta events processed.
- */
-static u64
-x86_perf_counter_update(struct perf_counter *counter,
- struct hw_perf_counter *hwc, int idx)
-{
- int shift = 64 - x86_pmu.counter_bits;
- u64 prev_raw_count, new_raw_count;
- s64 delta;
-
- /*
- * Careful: an NMI might modify the previous counter value.
- *
- * Our tactic to handle this is to first atomically read and
- * exchange a new raw count - then add that new-prev delta
- * count to the generic counter atomically:
- */
-again:
- prev_raw_count = atomic64_read(&hwc->prev_count);
- rdmsrl(hwc->counter_base + idx, new_raw_count);
-
- if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
- new_raw_count) != prev_raw_count)
- goto again;
-
- /*
- * Now we have the new raw value and have updated the prev
- * timestamp already. We can now calculate the elapsed delta
- * (counter-)time and add that to the generic counter.
- *
- * Careful, not all hw sign-extends above the physical width
- * of the count.
- */
- delta = (new_raw_count << shift) - (prev_raw_count << shift);
- delta >>= shift;
-
- atomic64_add(delta, &counter->count);
- atomic64_sub(delta, &hwc->period_left);
-
- return new_raw_count;
-}
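
For reference, a stand-alone user-space sketch of the shift trick used above to sign-extend a narrow counter value when computing the delta; the 40-bit width and the sample values are illustrative only, and it assumes the usual arithmetic right shift on signed types, as the kernel code does:

#include <stdint.h>
#include <stdio.h>

/* Recover a signed delta from two raw reads of a `counter_bits`-wide PMC
 * that may have wrapped and is not sign-extended by the hardware. */
static int64_t counter_delta(uint64_t prev_raw, uint64_t cur_raw, int counter_bits)
{
        int shift = 64 - counter_bits;
        /* Move bit (counter_bits - 1) up to bit 63, subtract, then shift
         * back down arithmetically: the delta comes out sign-extended. */
        int64_t delta = (cur_raw << shift) - (prev_raw << shift);
        return delta >> shift;
}

int main(void)
{
        uint64_t prev = 0xFFFFFFFF00ULL;        /* just below 2^40 */
        uint64_t cur  = 0x0000000100ULL;        /* after the counter wrapped */

        /* 256 events up to the wrap plus 256 after it: prints 512. */
        printf("delta = %lld\n", (long long)counter_delta(prev, cur, 40));
        return 0;
}
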
-
-static atomic_t active_counters;
-static DEFINE_MUTEX(pmc_reserve_mutex);
-
-static bool reserve_pmc_hardware(void)
-{
-#ifdef CONFIG_X86_LOCAL_APIC
- int i;
-
- if (nmi_watchdog == NMI_LOCAL_APIC)
- disable_lapic_nmi_watchdog();
-
- for (i = 0; i < x86_pmu.num_counters; i++) {
- if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
- goto perfctr_fail;
- }
-
- for (i = 0; i < x86_pmu.num_counters; i++) {
- if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
- goto eventsel_fail;
- }
-#endif
-
- return true;
-
-#ifdef CONFIG_X86_LOCAL_APIC
-eventsel_fail:
- for (i--; i >= 0; i--)
- release_evntsel_nmi(x86_pmu.eventsel + i);
-
- i = x86_pmu.num_counters;
-
-perfctr_fail:
- for (i--; i >= 0; i--)
- release_perfctr_nmi(x86_pmu.perfctr + i);
-
- if (nmi_watchdog == NMI_LOCAL_APIC)
- enable_lapic_nmi_watchdog();
-
- return false;
-#endif
-}
-
-static void release_pmc_hardware(void)
-{
-#ifdef CONFIG_X86_LOCAL_APIC
- int i;
-
- for (i = 0; i < x86_pmu.num_counters; i++) {
- release_perfctr_nmi(x86_pmu.perfctr + i);
- release_evntsel_nmi(x86_pmu.eventsel + i);
- }
-
- if (nmi_watchdog == NMI_LOCAL_APIC)
- enable_lapic_nmi_watchdog();
-#endif
-}
-
-static void hw_perf_counter_destroy(struct perf_counter *counter)
-{
- if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) {
- release_pmc_hardware();
- mutex_unlock(&pmc_reserve_mutex);
- }
-}
-
-static inline int x86_pmu_initialized(void)
-{
- return x86_pmu.handle_irq != NULL;
-}
-
-static inline int
-set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
-{
- unsigned int cache_type, cache_op, cache_result;
- u64 config, val;
-
- config = attr->config;
-
- cache_type = (config >> 0) & 0xff;
- if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
- return -EINVAL;
-
- cache_op = (config >> 8) & 0xff;
- if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
- return -EINVAL;
-
- cache_result = (config >> 16) & 0xff;
- if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
- return -EINVAL;
-
- val = hw_cache_event_ids[cache_type][cache_op][cache_result];
-
- if (val == 0)
- return -ENOENT;
-
- if (val == -1)
- return -EINVAL;
-
- hwc->config |= val;
-
- return 0;
-}
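
As an aside, a small stand-alone sketch of the config layout that set_ext_hw_attr() above decodes: byte 0 selects the cache, byte 1 the operation, byte 2 the result. The local enum names are illustrative stand-ins for the generic perf cache enums; the kernel uses the three decoded indices to look up the model-specific event code in hw_cache_event_ids[][][]:

#include <stdint.h>
#include <stdio.h>

enum { CACHE_L1D, CACHE_L1I, CACHE_LL, CACHE_DTLB, CACHE_ITLB, CACHE_BPU, CACHE_MAX };
enum { OP_READ, OP_WRITE, OP_PREFETCH, OP_MAX };
enum { RESULT_ACCESS, RESULT_MISS, RESULT_MAX };

static uint64_t encode_cache_config(unsigned cache, unsigned op, unsigned result)
{
        return (uint64_t)cache | ((uint64_t)op << 8) | ((uint64_t)result << 16);
}

int main(void)
{
        uint64_t config = encode_cache_config(CACHE_L1D, OP_READ, RESULT_MISS);

        unsigned cache  = (config >>  0) & 0xff;
        unsigned op     = (config >>  8) & 0xff;
        unsigned result = (config >> 16) & 0xff;

        /* These are the indices into hw_cache_event_ids[cache][op][result]. */
        printf("config=0x%llx -> [%u][%u][%u]\n",
               (unsigned long long)config, cache, op, result);
        return 0;
}
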
-
-/*
- * Setup the hardware configuration for a given attr_type
- */
-static int __hw_perf_counter_init(struct perf_counter *counter)
-{
- struct perf_counter_attr *attr = &counter->attr;
- struct hw_perf_counter *hwc = &counter->hw;
- u64 config;
- int err;
-
- if (!x86_pmu_initialized())
- return -ENODEV;
-
- err = 0;
- if (!atomic_inc_not_zero(&active_counters)) {
- mutex_lock(&pmc_reserve_mutex);
- if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware())
- err = -EBUSY;
- else
- atomic_inc(&active_counters);
- mutex_unlock(&pmc_reserve_mutex);
- }
- if (err)
- return err;
-
- /*
- * Generate PMC IRQs:
- * (keep 'enabled' bit clear for now)
- */
- hwc->config = ARCH_PERFMON_EVENTSEL_INT;
-
- /*
- * Count user and OS events unless requested not to.
- */
- if (!attr->exclude_user)
- hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
- if (!attr->exclude_kernel)
- hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
-
- if (!hwc->sample_period) {
- hwc->sample_period = x86_pmu.max_period;
- hwc->last_period = hwc->sample_period;
- atomic64_set(&hwc->period_left, hwc->sample_period);
- } else {
- /*
- * If we have a PMU initialized but no APIC
- * interrupts, we cannot sample hardware
- * counters (user-space has to fall back and
- * sample via a hrtimer based software counter):
- */
- if (!x86_pmu.apic)
- return -EOPNOTSUPP;
- }
-
- counter->destroy = hw_perf_counter_destroy;
-
- /*
-	 * Raw event types provide the config in the event structure
- */
- if (attr->type == PERF_TYPE_RAW) {
- hwc->config |= x86_pmu.raw_event(attr->config);
- return 0;
- }
-
- if (attr->type == PERF_TYPE_HW_CACHE)
- return set_ext_hw_attr(hwc, attr);
-
- if (attr->config >= x86_pmu.max_events)
- return -EINVAL;
-
- /*
- * The generic map:
- */
- config = x86_pmu.event_map(attr->config);
-
- if (config == 0)
- return -ENOENT;
-
- if (config == -1LL)
- return -EINVAL;
-
- hwc->config |= config;
-
- return 0;
-}
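
The active_counters handling above is a refcount-with-slow-path reservation pattern. Below is a rough user-space analogue, a sketch only: the reserve/release stubs stand in for reserve_pmc_hardware()/release_pmc_hardware(), and unlike the kernel's atomic_dec_and_mutex_lock() this simplified version takes the mutex on every release:

#include <stdatomic.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int active;                       /* number of live counters */
static pthread_mutex_t reserve_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stand-ins for the real hardware reservation helpers. */
static bool reserve_hardware(void) { puts("reserving PMCs"); return true; }
static void release_hardware(void) { puts("releasing PMCs"); }

/* Fast path: somebody already holds the hardware, just bump the count.
 * Slow path: take the lock, re-check, reserve on the 0 -> 1 transition. */
static int get_counter(void)
{
        int cur = atomic_load(&active);

        while (cur != 0) {
                if (atomic_compare_exchange_weak(&active, &cur, cur + 1))
                        return 0;
        }

        pthread_mutex_lock(&reserve_lock);
        if (atomic_load(&active) == 0 && !reserve_hardware()) {
                pthread_mutex_unlock(&reserve_lock);
                return -1;                      /* -EBUSY in the code above */
        }
        atomic_fetch_add(&active, 1);
        pthread_mutex_unlock(&reserve_lock);
        return 0;
}

/* Last user out releases the hardware. */
static void put_counter(void)
{
        pthread_mutex_lock(&reserve_lock);
        if (atomic_fetch_sub(&active, 1) == 1)
                release_hardware();
        pthread_mutex_unlock(&reserve_lock);
}

int main(void)
{
        if (get_counter() == 0) {
                if (get_counter() == 0)         /* second user: fast path */
                        put_counter();
                put_counter();                  /* last user: releases */
        }
        return 0;
}
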
-
-static void p6_pmu_disable_all(void)
-{
- struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
- u64 val;
-
- if (!cpuc->enabled)
- return;
-
- cpuc->enabled = 0;
- barrier();
-
- /* p6 only has one enable register */
- rdmsrl(MSR_P6_EVNTSEL0, val);
- val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
- wrmsrl(MSR_P6_EVNTSEL0, val);
-}
-
-static void intel_pmu_disable_all(void)
-{
- wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
-}
-
-static void amd_pmu_disable_all(void)
-{
- struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
- int idx;
-
- if (!cpuc->enabled)
- return;
-
- cpuc->enabled = 0;
- /*
- * ensure we write the disable before we start disabling the
- * counters proper, so that amd_pmu_enable_counter() does the
- * right thing.
- */
- barrier();
-
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
- u64 val;
-
- if (!test_bit(idx, cpuc->active_mask))
- continue;
- rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
- if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE))
- continue;
- val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
- wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
- }
-}
-
-void hw_perf_disable(void)
-{
- if (!x86_pmu_initialized())
- return;
- return x86_pmu.disable_all();
-}
-
-static void p6_pmu_enable_all(void)
-{
- struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
- unsigned long val;
-
- if (cpuc->enabled)
- return;
-
- cpuc->enabled = 1;
- barrier();
-
- /* p6 only has one enable register */
- rdmsrl(MSR_P6_EVNTSEL0, val);
- val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
- wrmsrl(MSR_P6_EVNTSEL0, val);
-}
-
-static void intel_pmu_enable_all(void)
-{
- wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
-}
-
-static void amd_pmu_enable_all(void)
-{
- struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
- int idx;
-
- if (cpuc->enabled)
- return;
-
- cpuc->enabled = 1;
- barrier();
-
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
- struct perf_counter *counter = cpuc->counters[idx];
- u64 val;
-
- if (!test_bit(idx, cpuc->active_mask))
- continue;
-
- val = counter->hw.config;
- val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
- wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
- }
-}
-
-void hw_perf_enable(void)
-{
- if (!x86_pmu_initialized())
- return;
- x86_pmu.enable_all();
-}
-
-static inline u64 intel_pmu_get_status(void)
-{
- u64 status;
-
- rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
-
- return status;
-}
-
-static inline void intel_pmu_ack_status(u64 ack)
-{
- wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
-}
-
-static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
-{
- (void)checking_wrmsrl(hwc->config_base + idx,
- hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
-}
-
-static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
-{
- (void)checking_wrmsrl(hwc->config_base + idx, hwc->config);
-}
-
-static inline void
-intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx)
-{
- int idx = __idx - X86_PMC_IDX_FIXED;
- u64 ctrl_val, mask;
-
- mask = 0xfULL << (idx * 4);
-
- rdmsrl(hwc->config_base, ctrl_val);
- ctrl_val &= ~mask;
- (void)checking_wrmsrl(hwc->config_base, ctrl_val);
-}
-
-static inline void
-p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
-{
- struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
- u64 val = P6_NOP_COUNTER;
-
- if (cpuc->enabled)
- val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
-
- (void)checking_wrmsrl(hwc->config_base + idx, val);
-}
-
-static inline void
-intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
-{
- if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
- intel_pmu_disable_fixed(hwc, idx);
- return;
- }
-
- x86_pmu_disable_counter(hwc, idx);
-}
-
-static inline void
-amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
-{
- x86_pmu_disable_counter(hwc, idx);
-}
-
-static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
-
-/*
- * Set the next IRQ period, based on the hwc->period_left value.
- * To be called with the counter disabled in hw:
- */
-static int
-x86_perf_counter_set_period(struct perf_counter *counter,
- struct hw_perf_counter *hwc, int idx)
-{
- s64 left = atomic64_read(&hwc->period_left);
- s64 period = hwc->sample_period;
- int err, ret = 0;
-
- /*
-	 * If we are way outside a reasonable range then just skip forward:
- */
- if (unlikely(left <= -period)) {
- left = period;
- atomic64_set(&hwc->period_left, left);
- hwc->last_period = period;
- ret = 1;
- }
-
- if (unlikely(left <= 0)) {
- left += period;
- atomic64_set(&hwc->period_left, left);
- hwc->last_period = period;
- ret = 1;
- }
- /*
-	 * Quirk: certain CPUs don't like it if just 1 event is left:
- */
- if (unlikely(left < 2))
- left = 2;
-
- if (left > x86_pmu.max_period)
- left = x86_pmu.max_period;
-
- per_cpu(prev_left[idx], smp_processor_id()) = left;
-
- /*
- * The hw counter starts counting from this counter offset,
-	 * mark it to be able to extract future deltas:
- */
- atomic64_set(&hwc->prev_count, (u64)-left);
-
- err = checking_wrmsrl(hwc->counter_base + idx,
- (u64)(-left) & x86_pmu.counter_mask);
-
- perf_counter_update_userpage(counter);
-
- return ret;
-}
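
As a worked example of what x86_perf_counter_set_period() programs into the PMC: writing (-left) masked to the counter width makes the counter overflow after exactly `left` increments. The 32-bit width below is an assumption for illustration (it matches the P6 case):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t counter_mask = (1ULL << 32) - 1;       /* 32-bit counter */
        int64_t left = 100000;                          /* events until next interrupt */

        /* What gets written into the PMC... */
        uint64_t programmed = (uint64_t)(-left) & counter_mask;

        /* ...and where the counter stands after `left` increments: it wraps
         * through 0, which is what raises the overflow interrupt. */
        uint64_t after = (programmed + (uint64_t)left) & counter_mask;

        printf("programmed=0x%llx, after %lld events -> 0x%llx\n",
               (unsigned long long)programmed, (long long)left,
               (unsigned long long)after);
        return 0;
}
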
-
-static inline void
-intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx)
-{
- int idx = __idx - X86_PMC_IDX_FIXED;
- u64 ctrl_val, bits, mask;
- int err;
-
- /*
- * Enable IRQ generation (0x8),
- * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
- * if requested:
- */
- bits = 0x8ULL;
- if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
- bits |= 0x2;
- if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
- bits |= 0x1;
- bits <<= (idx * 4);
- mask = 0xfULL << (idx * 4);
-
- rdmsrl(hwc->config_base, ctrl_val);
- ctrl_val &= ~mask;
- ctrl_val |= bits;
- err = checking_wrmsrl(hwc->config_base, ctrl_val);
-}
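
A small sketch of the per-counter 4-bit field that intel_pmu_enable_fixed() above edits in MSR_ARCH_PERFMON_FIXED_CTR_CTRL; the bit values are the ones named in that function's comment (0x1 ring-0, 0x2 ring-3, 0x8 PMI), composed stand-alone for clarity:

#include <stdint.h>
#include <stdio.h>

/* Each fixed counter owns 4 bits in the control MSR:
 * bit 0 counts in ring 0 (OS), bit 1 counts in ring 3 (USR),
 * bit 3 raises a PMI on overflow; bit 2 is left clear here. */
static uint64_t fixed_ctrl_set(uint64_t ctrl_val, int idx, int usr, int os)
{
        uint64_t bits = 0x8ULL;                 /* enable PMI */
        uint64_t mask = 0xfULL << (idx * 4);

        if (usr)
                bits |= 0x2;
        if (os)
                bits |= 0x1;

        ctrl_val &= ~mask;                      /* clear this counter's field */
        ctrl_val |= bits << (idx * 4);          /* install the new bits */
        return ctrl_val;
}

int main(void)
{
        /* Fixed counter 1, counting user + kernel, PMI enabled: prints 0xb0. */
        printf("ctrl = 0x%llx\n",
               (unsigned long long)fixed_ctrl_set(0, 1, 1, 1));
        return 0;
}
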
-
-static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
-{
- struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
- u64 val;
-
- val = hwc->config;
- if (cpuc->enabled)
- val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
-
- (void)checking_wrmsrl(hwc->config_base + idx, val);
-}
-
-
-static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
-{
- if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
- intel_pmu_enable_fixed(hwc, idx);
- return;
- }
-
- x86_pmu_enable_counter(hwc, idx);
-}
-
-static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
-{
- struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-
- if (cpuc->enabled)
- x86_pmu_enable_counter(hwc, idx);
-}
-
-static int
-fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
-{
- unsigned int event;
-
- if (!x86_pmu.num_counters_fixed)
- return -1;
-
- event = hwc->config & ARCH_PERFMON_EVENT_MASK;
-
- if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
- return X86_PMC_IDX_FIXED_INSTRUCTIONS;
- if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
- return X86_PMC_IDX_FIXED_CPU_CYCLES;
- if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES)))
- return X86_PMC_IDX_FIXED_BUS_CYCLES;
-
- return -1;
-}
-
-/*
- * Find a PMC slot for the freshly enabled / scheduled in counter:
- */
-static int x86_pmu_enable(struct perf_counter *counter)
-{
- struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
- struct hw_perf_counter *hwc = &counter->hw;
- int idx;
-
- idx = fixed_mode_idx(counter, hwc);
- if (idx >= 0) {
- /*
- * Try to get the fixed counter, if that is already taken
- * then try to get a generic counter:
- */
- if (test_and_set_bit(idx, cpuc->used_mask))
- goto try_generic;
-
- hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
- /*
- * We set it so that counter_base + idx in wrmsr/rdmsr maps to
- * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
- */
- hwc->counter_base =
- MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
- hwc->idx = idx;
- } else {
- idx = hwc->idx;
- /* Try to get the previous generic counter again */
- if (test_and_set_bit(idx, cpuc->used_mask)) {
-try_generic:
- idx = find_first_zero_bit(cpuc->used_mask,
- x86_pmu.num_counters);
- if (idx == x86_pmu.num_counters)
- return -EAGAIN;
-
- set_bit(idx, cpuc->used_mask);
- hwc->idx = idx;
- }
- hwc->config_base = x86_pmu.eventsel;
- hwc->counter_base = x86_pmu.perfctr;
- }
-
- perf_counters_lapic_init();
-
- x86_pmu.disable(hwc, idx);
-
- cpuc->counters[idx] = counter;
- set_bit(idx, cpuc->active_mask);
-
- x86_perf_counter_set_period(counter, hwc, idx);
- x86_pmu.enable(hwc, idx);
-
- perf_counter_update_userpage(counter);
-
- return 0;
-}
-
-static void x86_pmu_unthrottle(struct perf_counter *counter)
-{
- struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
- struct hw_perf_counter *hwc = &counter->hw;
-
- if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX ||
- cpuc->counters[hwc->idx] != counter))
- return;
-
- x86_pmu.enable(hwc, hwc->idx);
-}
-
-void perf_counter_print_debug(void)
-{
- u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
- struct cpu_hw_counters *cpuc;
- unsigned long flags;
- int cpu, idx;
-
- if (!x86_pmu.num_counters)
- return;
-
- local_irq_save(flags);
-
- cpu = smp_processor_id();
- cpuc = &per_cpu(cpu_hw_counters, cpu);
-
- if (x86_pmu.version >= 2) {
- rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
- rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
- rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
- rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
-
- pr_info("\n");
- pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
- pr_info("CPU#%d: status: %016llx\n", cpu, status);
- pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
- pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
- }
- pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask);
-
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
- rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
- rdmsrl(x86_pmu.perfctr + idx, pmc_count);
-
- prev_left = per_cpu(prev_left[idx], cpu);
-
- pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
- cpu, idx, pmc_ctrl);
- pr_info("CPU#%d: gen-PMC%d count: %016llx\n",
- cpu, idx, pmc_count);
- pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
- cpu, idx, prev_left);
- }
- for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
- rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
-
- pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
- cpu, idx, pmc_count);
- }
- local_irq_restore(flags);
-}
-
-static void x86_pmu_disable(struct perf_counter *counter)
-{
- struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
- struct hw_perf_counter *hwc = &counter->hw;
- int idx = hwc->idx;
-
- /*
- * Must be done before we disable, otherwise the nmi handler
- * could reenable again:
- */
- clear_bit(idx, cpuc->active_mask);
- x86_pmu.disable(hwc, idx);
-
- /*
- * Make sure the cleared pointer becomes visible before we
- * (potentially) free the counter:
- */
- barrier();
-
- /*
- * Drain the remaining delta count out of a counter
- * that we are disabling:
- */
- x86_perf_counter_update(counter, hwc, idx);
- cpuc->counters[idx] = NULL;
- clear_bit(idx, cpuc->used_mask);
-
- perf_counter_update_userpage(counter);
-}
-
-/*
- * Save and restart an expired counter. Called by NMI contexts,
- * so it has to be careful about preempting normal counter ops:
- */
-static int intel_pmu_save_and_restart(struct perf_counter *counter)
-{
- struct hw_perf_counter *hwc = &counter->hw;
- int idx = hwc->idx;
- int ret;
-
- x86_perf_counter_update(counter, hwc, idx);
- ret = x86_perf_counter_set_period(counter, hwc, idx);
-
- if (counter->state == PERF_COUNTER_STATE_ACTIVE)
- intel_pmu_enable_counter(hwc, idx);
-
- return ret;
-}
-
-static void intel_pmu_reset(void)
-{
- unsigned long flags;
- int idx;
-
- if (!x86_pmu.num_counters)
- return;
-
- local_irq_save(flags);
-
- printk("clearing PMU state on CPU#%d\n", smp_processor_id());
-
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
- checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
- checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
- }
- for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
- checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
- }
-
- local_irq_restore(flags);
-}
-
-static int p6_pmu_handle_irq(struct pt_regs *regs)
-{
- struct perf_sample_data data;
- struct cpu_hw_counters *cpuc;
- struct perf_counter *counter;
- struct hw_perf_counter *hwc;
- int idx, handled = 0;
- u64 val;
-
- data.regs = regs;
- data.addr = 0;
-
- cpuc = &__get_cpu_var(cpu_hw_counters);
-
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
- if (!test_bit(idx, cpuc->active_mask))
- continue;
-
- counter = cpuc->counters[idx];
- hwc = &counter->hw;
-
- val = x86_perf_counter_update(counter, hwc, idx);
- if (val & (1ULL << (x86_pmu.counter_bits - 1)))
- continue;
-
- /*
- * counter overflow
- */
- handled = 1;
- data.period = counter->hw.last_period;
-
- if (!x86_perf_counter_set_period(counter, hwc, idx))
- continue;
-
- if (perf_counter_overflow(counter, 1, &data))
- p6_pmu_disable_counter(hwc, idx);
- }
-
- if (handled)
- inc_irq_stat(apic_perf_irqs);
-
- return handled;
-}
-
-/*
- * This handler is triggered by the local APIC, so the APIC IRQ handling
- * rules apply:
- */
-static int intel_pmu_handle_irq(struct pt_regs *regs)
-{
- struct perf_sample_data data;
- struct cpu_hw_counters *cpuc;
- int bit, loops;
- u64 ack, status;
-
- data.regs = regs;
- data.addr = 0;
-
- cpuc = &__get_cpu_var(cpu_hw_counters);
-
- perf_disable();
- status = intel_pmu_get_status();
- if (!status) {
- perf_enable();
- return 0;
- }
-
- loops = 0;
-again:
- if (++loops > 100) {
- WARN_ONCE(1, "perfcounters: irq loop stuck!\n");
- perf_counter_print_debug();
- intel_pmu_reset();
- perf_enable();
- return 1;
- }
-
- inc_irq_stat(apic_perf_irqs);
- ack = status;
- for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
- struct perf_counter *counter = cpuc->counters[bit];
-
- clear_bit(bit, (unsigned long *) &status);
- if (!test_bit(bit, cpuc->active_mask))
- continue;
-
- if (!intel_pmu_save_and_restart(counter))
- continue;
-
- data.period = counter->hw.last_period;
-
- if (perf_counter_overflow(counter, 1, &data))
- intel_pmu_disable_counter(&counter->hw, bit);
- }
-
- intel_pmu_ack_status(ack);
-
- /*
- * Repeat if there is more work to be done:
- */
- status = intel_pmu_get_status();
- if (status)
- goto again;
-
- perf_enable();
-
- return 1;
-}
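
A stand-alone sketch of walking a GLOBAL_STATUS-style overflow bitmask, as the loop above does with for_each_bit(); the status value is made up for illustration:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* Pretend generic counters 0 and 2 plus bit 33 overflowed. */
        uint64_t status = (1ULL << 0) | (1ULL << 2) | (1ULL << 33);
        uint64_t ack = status;

        /* Visit each set bit, the way the handler walks cpuc->counters[]. */
        while (status) {
                int bit = __builtin_ctzll(status);
                status &= status - 1;           /* clear lowest set bit */
                printf("counter index %d overflowed\n", bit);
        }

        /* Writing the saved mask to GLOBAL_OVF_CTRL would clear those bits. */
        printf("would ack 0x%llx\n", (unsigned long long)ack);
        return 0;
}
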
-
-static int amd_pmu_handle_irq(struct pt_regs *regs)
-{
- struct perf_sample_data data;
- struct cpu_hw_counters *cpuc;
- struct perf_counter *counter;
- struct hw_perf_counter *hwc;
- int idx, handled = 0;
- u64 val;
-
- data.regs = regs;
- data.addr = 0;
-
- cpuc = &__get_cpu_var(cpu_hw_counters);
-
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
- if (!test_bit(idx, cpuc->active_mask))
- continue;
-
- counter = cpuc->counters[idx];
- hwc = &counter->hw;
-
- val = x86_perf_counter_update(counter, hwc, idx);
- if (val & (1ULL << (x86_pmu.counter_bits - 1)))
- continue;
-
- /*
- * counter overflow
- */
- handled = 1;
- data.period = counter->hw.last_period;
-
- if (!x86_perf_counter_set_period(counter, hwc, idx))
- continue;
-
- if (perf_counter_overflow(counter, 1, &data))
- amd_pmu_disable_counter(hwc, idx);
- }
-
- if (handled)
- inc_irq_stat(apic_perf_irqs);
-
- return handled;
-}
-
-void smp_perf_pending_interrupt(struct pt_regs *regs)
-{
- irq_enter();
- ack_APIC_irq();
- inc_irq_stat(apic_pending_irqs);
- perf_counter_do_pending();
- irq_exit();
-}
-
-void set_perf_counter_pending(void)
-{
-#ifdef CONFIG_X86_LOCAL_APIC
- apic->send_IPI_self(LOCAL_PENDING_VECTOR);
-#endif
-}
-
-void perf_counters_lapic_init(void)
-{
-#ifdef CONFIG_X86_LOCAL_APIC
- if (!x86_pmu.apic || !x86_pmu_initialized())
- return;
-
- /*
- * Always use NMI for PMU
- */
- apic_write(APIC_LVTPC, APIC_DM_NMI);
-#endif
-}
-
-static int __kprobes
-perf_counter_nmi_handler(struct notifier_block *self,
- unsigned long cmd, void *__args)
-{
- struct die_args *args = __args;
- struct pt_regs *regs;
-
- if (!atomic_read(&active_counters))
- return NOTIFY_DONE;
-
- switch (cmd) {
- case DIE_NMI:
- case DIE_NMI_IPI:
- break;
-
- default:
- return NOTIFY_DONE;
- }
-
- regs = args->regs;
-
-#ifdef CONFIG_X86_LOCAL_APIC
- apic_write(APIC_LVTPC, APIC_DM_NMI);
-#endif
- /*
- * Can't rely on the handled return value to say it was our NMI, two
- * counters could trigger 'simultaneously' raising two back-to-back NMIs.
- *
- * If the first NMI handles both, the latter will be empty and daze
- * the CPU.
- */
- x86_pmu.handle_irq(regs);
-
- return NOTIFY_STOP;
-}
-
-static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
- .notifier_call = perf_counter_nmi_handler,
- .next = NULL,
- .priority = 1
-};
-
-static struct x86_pmu p6_pmu = {
- .name = "p6",
- .handle_irq = p6_pmu_handle_irq,
- .disable_all = p6_pmu_disable_all,
- .enable_all = p6_pmu_enable_all,
- .enable = p6_pmu_enable_counter,
- .disable = p6_pmu_disable_counter,
- .eventsel = MSR_P6_EVNTSEL0,
- .perfctr = MSR_P6_PERFCTR0,
- .event_map = p6_pmu_event_map,
- .raw_event = p6_pmu_raw_event,
- .max_events = ARRAY_SIZE(p6_perfmon_event_map),
- .apic = 1,
- .max_period = (1ULL << 31) - 1,
- .version = 0,
- .num_counters = 2,
- /*
- * Counters have 40 bits implemented. However they are designed such
- * that bits [32-39] are sign extensions of bit 31. As such the
- * effective width of a counter for P6-like PMU is 32 bits only.
- *
- * See IA-32 Intel Architecture Software developer manual Vol 3B
- */
- .counter_bits = 32,
- .counter_mask = (1ULL << 32) - 1,
-};
-
-static struct x86_pmu intel_pmu = {
- .name = "Intel",
- .handle_irq = intel_pmu_handle_irq,
- .disable_all = intel_pmu_disable_all,
- .enable_all = intel_pmu_enable_all,
- .enable = intel_pmu_enable_counter,
- .disable = intel_pmu_disable_counter,
- .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
- .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
- .event_map = intel_pmu_event_map,
- .raw_event = intel_pmu_raw_event,
- .max_events = ARRAY_SIZE(intel_perfmon_event_map),
- .apic = 1,
- /*
- * Intel PMCs cannot be accessed sanely above 32 bit width,
- * so we install an artificial 1<<31 period regardless of
- * the generic counter period:
- */
- .max_period = (1ULL << 31) - 1,
-};
-
-static struct x86_pmu amd_pmu = {
- .name = "AMD",
- .handle_irq = amd_pmu_handle_irq,
- .disable_all = amd_pmu_disable_all,
- .enable_all = amd_pmu_enable_all,
- .enable = amd_pmu_enable_counter,
- .disable = amd_pmu_disable_counter,
- .eventsel = MSR_K7_EVNTSEL0,
- .perfctr = MSR_K7_PERFCTR0,
- .event_map = amd_pmu_event_map,
- .raw_event = amd_pmu_raw_event,
- .max_events = ARRAY_SIZE(amd_perfmon_event_map),
- .num_counters = 4,
- .counter_bits = 48,
- .counter_mask = (1ULL << 48) - 1,
- .apic = 1,
- /* use highest bit to detect overflow */
- .max_period = (1ULL << 47) - 1,
-};
-
-static int p6_pmu_init(void)
-{
- switch (boot_cpu_data.x86_model) {
- case 1:
- case 3: /* Pentium Pro */
- case 5:
- case 6: /* Pentium II */
- case 7:
- case 8:
- case 11: /* Pentium III */
- break;
- case 9:
- case 13:
- /* Pentium M */
- break;
- default:
- pr_cont("unsupported p6 CPU model %d ",
- boot_cpu_data.x86_model);
- return -ENODEV;
- }
-
- x86_pmu = p6_pmu;
-
- if (!cpu_has_apic) {
- pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
- pr_info("no hardware sampling interrupt available.\n");
- x86_pmu.apic = 0;
- }
-
- return 0;
-}
-
-static int intel_pmu_init(void)
-{
- union cpuid10_edx edx;
- union cpuid10_eax eax;
- unsigned int unused;
- unsigned int ebx;
- int version;
-
- if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
- /* check for P6 processor family */
- if (boot_cpu_data.x86 == 6) {
- return p6_pmu_init();
- } else {
- return -ENODEV;
- }
- }
-
- /*
- * Check whether the Architectural PerfMon supports
- * Branch Misses Retired Event or not.
- */
- cpuid(10, &eax.full, &ebx, &unused, &edx.full);
- if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
- return -ENODEV;
-
- version = eax.split.version_id;
- if (version < 2)
- return -ENODEV;
-
- x86_pmu = intel_pmu;
- x86_pmu.version = version;
- x86_pmu.num_counters = eax.split.num_counters;
- x86_pmu.counter_bits = eax.split.bit_width;
- x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1;
-
- /*
- * Quirk: v2 perfmon does not report fixed-purpose counters, so
- * assume at least 3 counters:
- */
- x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
-
- /*
- * Install the hw-cache-events table:
- */
- switch (boot_cpu_data.x86_model) {
- case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
- case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
- case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
- case 29: /* six-core 45 nm xeon "Dunnington" */
- memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
- sizeof(hw_cache_event_ids));
-
- pr_cont("Core2 events, ");
- break;
- default:
- case 26:
- memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
- sizeof(hw_cache_event_ids));
-
- pr_cont("Nehalem/Corei7 events, ");
- break;
- case 28:
- memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
- sizeof(hw_cache_event_ids));
-
- pr_cont("Atom events, ");
- break;
- }
- return 0;
-}
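
For reference, a user-space sketch of the CPUID leaf 0xA probe that intel_pmu_init() relies on; this assumes GCC/Clang on x86 and uses <cpuid.h>:

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid(0xa, &eax, &ebx, &ecx, &edx)) {
                puts("no architectural perfmon leaf");
                return 1;
        }

        /* EAX: [7:0] version, [15:8] generic counters, [23:16] counter width,
         *      [31:24] length of the EBX event-availability mask.
         * EDX: [4:0] fixed counters, [12:5] fixed counter width. */
        printf("version     : %u\n", eax & 0xff);
        printf("gp counters : %u x %u bits\n", (eax >> 8) & 0xff, (eax >> 16) & 0xff);
        printf("fixed ctrs  : %u x %u bits\n", edx & 0x1f, (edx >> 5) & 0xff);
        return 0;
}
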
-
-static int amd_pmu_init(void)
-{
- /* Performance-monitoring supported from K7 and later: */
- if (boot_cpu_data.x86 < 6)
- return -ENODEV;
-
- x86_pmu = amd_pmu;
-
- /* Events are common for all AMDs */
- memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
- sizeof(hw_cache_event_ids));
-
- return 0;
-}
-
-void __init init_hw_perf_counters(void)
-{
- int err;
-
- pr_info("Performance Counters: ");
-
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_INTEL:
- err = intel_pmu_init();
- break;
- case X86_VENDOR_AMD:
- err = amd_pmu_init();
- break;
- default:
- return;
- }
- if (err != 0) {
- pr_cont("no PMU driver, software counters only.\n");
- return;
- }
-
- pr_cont("%s PMU driver.\n", x86_pmu.name);
-
- if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
- WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
- x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
- x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
- }
- perf_counter_mask = (1 << x86_pmu.num_counters) - 1;
- perf_max_counters = x86_pmu.num_counters;
-
- if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
- WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
- x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
- x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
- }
-
- perf_counter_mask |=
- ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
- x86_pmu.intel_ctrl = perf_counter_mask;
-
- perf_counters_lapic_init();
- register_die_notifier(&perf_counter_nmi_notifier);
-
- pr_info("... version: %d\n", x86_pmu.version);
- pr_info("... bit width: %d\n", x86_pmu.counter_bits);
- pr_info("... generic counters: %d\n", x86_pmu.num_counters);
- pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask);
- pr_info("... max period: %016Lx\n", x86_pmu.max_period);
- pr_info("... fixed-purpose counters: %d\n", x86_pmu.num_counters_fixed);
- pr_info("... counter mask: %016Lx\n", perf_counter_mask);
-}
-
-static inline void x86_pmu_read(struct perf_counter *counter)
-{
- x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
-}
-
-static const struct pmu pmu = {
- .enable = x86_pmu_enable,
- .disable = x86_pmu_disable,
- .read = x86_pmu_read,
- .unthrottle = x86_pmu_unthrottle,
-};
-
-const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
-{
- int err;
-
- err = __hw_perf_counter_init(counter);
- if (err)
- return ERR_PTR(err);
-
- return &pmu;
-}
-
-/*
- * callchain support
- */
-
-static inline
-void callchain_store(struct perf_callchain_entry *entry, u64 ip)
-{
- if (entry->nr < PERF_MAX_STACK_DEPTH)
- entry->ip[entry->nr++] = ip;
-}
-
-static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
-static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
-static DEFINE_PER_CPU(int, in_nmi_frame);
-
-
-static void
-backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
-{
- /* Ignore warnings */
-}
-
-static void backtrace_warning(void *data, char *msg)
-{
- /* Ignore warnings */
-}
-
-static int backtrace_stack(void *data, char *name)
-{
- per_cpu(in_nmi_frame, smp_processor_id()) =
- x86_is_stack_id(NMI_STACK, name);
-
- return 0;
-}
-
-static void backtrace_address(void *data, unsigned long addr, int reliable)
-{
- struct perf_callchain_entry *entry = data;
-
- if (per_cpu(in_nmi_frame, smp_processor_id()))
- return;
-
- if (reliable)
- callchain_store(entry, addr);
-}
-
-static const struct stacktrace_ops backtrace_ops = {
- .warning = backtrace_warning,
- .warning_symbol = backtrace_warning_symbol,
- .stack = backtrace_stack,
- .address = backtrace_address,
-};
-
-#include "../dumpstack.h"
-
-static void
-perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
-{
- callchain_store(entry, PERF_CONTEXT_KERNEL);
- callchain_store(entry, regs->ip);
-
- dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
-}
-
-/*
- * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
- */
-static unsigned long
-copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
-{
- unsigned long offset, addr = (unsigned long)from;
- int type = in_nmi() ? KM_NMI : KM_IRQ0;
- unsigned long size, len = 0;
- struct page *page;
- void *map;
- int ret;
-
- do {
- ret = __get_user_pages_fast(addr, 1, 0, &page);
- if (!ret)
- break;
-
- offset = addr & (PAGE_SIZE - 1);
- size = min(PAGE_SIZE - offset, n - len);
-
- map = kmap_atomic(page, type);
- memcpy(to, map+offset, size);
- kunmap_atomic(map, type);
- put_page(page);
-
- len += size;
- to += size;
- addr += size;
-
- } while (len < n);
-
- return len;
-}
-
-static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
-{
- unsigned long bytes;
-
- bytes = copy_from_user_nmi(frame, fp, sizeof(*frame));
-
- return bytes == sizeof(*frame);
-}
-
-static void
-perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
-{
- struct stack_frame frame;
- const void __user *fp;
-
- if (!user_mode(regs))
- regs = task_pt_regs(current);
-
- fp = (void __user *)regs->bp;
-
- callchain_store(entry, PERF_CONTEXT_USER);
- callchain_store(entry, regs->ip);
-
- while (entry->nr < PERF_MAX_STACK_DEPTH) {
- frame.next_frame = NULL;
- frame.return_address = 0;
-
- if (!copy_stack_frame(fp, &frame))
- break;
-
- if ((unsigned long)fp < regs->sp)
- break;
-
- callchain_store(entry, frame.return_address);
- fp = frame.next_frame;
- }
-}
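
The user-space walk above follows saved frame pointers. A toy equivalent that walks its own stack is sketched below; it assumes x86-64 built with -O0 -fno-omit-frame-pointer so the frame chain is intact, and is purely illustrative:

#include <stdio.h>
#include <stdint.h>

/* Layout assumed by the walk: [rbp] = caller's rbp, [rbp+8] = return address. */
struct stack_frame {
        struct stack_frame *next_frame;
        unsigned long return_address;
};

static void __attribute__((noinline)) show_backtrace(void)
{
        struct stack_frame *fp = __builtin_frame_address(0);
        int depth = 0;

        while (fp && depth < 16) {
                printf("#%d return address %p\n", depth, (void *)fp->return_address);
                /* Frames must move towards higher addresses; bail out otherwise,
                 * much like the regs->sp check in the kernel walker. */
                if (fp->next_frame <= fp)
                        break;
                fp = fp->next_frame;
                depth++;
        }
}

static void __attribute__((noinline)) a(void) { show_backtrace(); }
static void __attribute__((noinline)) b(void) { a(); }

int main(void)
{
        b();
        return 0;
}
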
-
-static void
-perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
-{
- int is_user;
-
- if (!regs)
- return;
-
- is_user = user_mode(regs);
-
- if (!current || current->pid == 0)
- return;
-
- if (is_user && current->state != TASK_RUNNING)
- return;
-
- if (!is_user)
- perf_callchain_kernel(regs, entry);
-
- if (current->mm)
- perf_callchain_user(regs, entry);
-}
-
-struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
-{
- struct perf_callchain_entry *entry;
-
- if (in_nmi())
- entry = &__get_cpu_var(nmi_entry);
- else
- entry = &__get_cpu_var(irq_entry);
-
- entry->nr = 0;
-
- perf_do_callchain(regs, entry);
-
- return entry;
-}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index e60ed740d2b3..7aecb2fc3186 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -1,8 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* local apic based NMI watchdog for various CPUs.
*
* This file also handles reservation of performance counters for coordination
- * with other users (like oprofile).
+ * with other users.
*
* Note that these events normally don't tick when the CPU idles. This means
* the frequency varies with CPU load.
@@ -12,35 +13,15 @@
*/
#include <linux/percpu.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/bitops.h>
#include <linux/smp.h>
-#include <linux/nmi.h>
+#include <asm/nmi.h>
#include <linux/kprobes.h>
#include <asm/apic.h>
-#include <asm/perf_counter.h>
-
-struct nmi_watchdog_ctlblk {
- unsigned int cccr_msr;
- unsigned int perfctr_msr; /* the MSR to reset in NMI handler */
- unsigned int evntsel_msr; /* the MSR to select the events to handle */
-};
-
-/* Interface defining a CPU specific perfctr watchdog */
-struct wd_ops {
- int (*reserve)(void);
- void (*unreserve)(void);
- int (*setup)(unsigned nmi_hz);
- void (*rearm)(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz);
- void (*stop)(void);
- unsigned perfctr;
- unsigned evntsel;
- u64 checkbit;
-};
-
-static const struct wd_ops *wd_ops;
+#include <asm/perf_event.h>
/*
* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
@@ -60,25 +41,32 @@ static const struct wd_ops *wd_ops;
static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS);
static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS);
-static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
-
/* converts an msr to an appropriate reservation bit */
static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
{
/* returns the bit offset of the performance counter register */
switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_HYGON:
case X86_VENDOR_AMD:
- return (msr - MSR_K7_PERFCTR0);
+ if (msr >= MSR_F15H_PERF_CTR)
+ return (msr - MSR_F15H_PERF_CTR) >> 1;
+ return msr - MSR_K7_PERFCTR0;
case X86_VENDOR_INTEL:
if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
- return (msr - MSR_ARCH_PERFMON_PERFCTR0);
+ return msr - MSR_ARCH_PERFMON_PERFCTR0;
switch (boot_cpu_data.x86) {
case 6:
- return (msr - MSR_P6_PERFCTR0);
+ return msr - MSR_P6_PERFCTR0;
+ case 11:
+ return msr - MSR_KNC_PERFCTR0;
case 15:
- return (msr - MSR_P4_BPU_PERFCTR0);
+ return msr - MSR_P4_BPU_PERFCTR0;
}
+ break;
+ case X86_VENDOR_ZHAOXIN:
+ case X86_VENDOR_CENTAUR:
+ return msr - MSR_ARCH_PERFMON_PERFCTR0;
}
return 0;
}
@@ -91,43 +79,32 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
{
/* returns the bit offset of the event selection register */
switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_HYGON:
case X86_VENDOR_AMD:
- return (msr - MSR_K7_EVNTSEL0);
+ if (msr >= MSR_F15H_PERF_CTL)
+ return (msr - MSR_F15H_PERF_CTL) >> 1;
+ return msr - MSR_K7_EVNTSEL0;
case X86_VENDOR_INTEL:
if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
- return (msr - MSR_ARCH_PERFMON_EVENTSEL0);
+ return msr - MSR_ARCH_PERFMON_EVENTSEL0;
switch (boot_cpu_data.x86) {
case 6:
- return (msr - MSR_P6_EVNTSEL0);
+ return msr - MSR_P6_EVNTSEL0;
+ case 11:
+ return msr - MSR_KNC_EVNTSEL0;
case 15:
- return (msr - MSR_P4_BSU_ESCR0);
+ return msr - MSR_P4_BSU_ESCR0;
}
+ break;
+ case X86_VENDOR_ZHAOXIN:
+ case X86_VENDOR_CENTAUR:
+ return msr - MSR_ARCH_PERFMON_EVENTSEL0;
}
return 0;
}
-/* checks for a bit availability (hack for oprofile) */
-int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
-{
- BUG_ON(counter > NMI_MAX_COUNTER_BITS);
-
- return (!test_bit(counter, perfctr_nmi_owner));
-}
-
-/* checks an msr for availability */
-int avail_to_resrv_perfctr_nmi(unsigned int msr)
-{
- unsigned int counter;
-
- counter = nmi_perfctr_msr_to_bit(msr);
- BUG_ON(counter > NMI_MAX_COUNTER_BITS);
-
- return (!test_bit(counter, perfctr_nmi_owner));
-}
-EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
-
int reserve_perfctr_nmi(unsigned int msr)
{
unsigned int counter;
@@ -183,623 +160,3 @@ void release_evntsel_nmi(unsigned int msr)
clear_bit(counter, evntsel_nmi_owner);
}
EXPORT_SYMBOL(release_evntsel_nmi);
-
-void disable_lapic_nmi_watchdog(void)
-{
- BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
-
- if (atomic_read(&nmi_active) <= 0)
- return;
-
- on_each_cpu(stop_apic_nmi_watchdog, NULL, 1);
-
- if (wd_ops)
- wd_ops->unreserve();
-
- BUG_ON(atomic_read(&nmi_active) != 0);
-}
-
-void enable_lapic_nmi_watchdog(void)
-{
- BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
-
- /* are we already enabled */
- if (atomic_read(&nmi_active) != 0)
- return;
-
- /* are we lapic aware */
- if (!wd_ops)
- return;
- if (!wd_ops->reserve()) {
- printk(KERN_ERR "NMI watchdog: cannot reserve perfctrs\n");
- return;
- }
-
- on_each_cpu(setup_apic_nmi_watchdog, NULL, 1);
- touch_nmi_watchdog();
-}
-
-/*
- * Activate the NMI watchdog via the local APIC.
- */
-
-static unsigned int adjust_for_32bit_ctr(unsigned int hz)
-{
- u64 counter_val;
- unsigned int retval = hz;
-
- /*
- * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter
- * are writable, with higher bits sign extending from bit 31.
-	 * So we can only program the counter with 31-bit values, and
-	 * bit 31 must be set so that bits 32 and above sign-extend to 1.
- * Find the appropriate nmi_hz
- */
- counter_val = (u64)cpu_khz * 1000;
- do_div(counter_val, retval);
- if (counter_val > 0x7fffffffULL) {
- u64 count = (u64)cpu_khz * 1000;
- do_div(count, 0x7fffffffUL);
- retval = count + 1;
- }
- return retval;
-}
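
A worked example of the clamping above: on an assumed 3 GHz CPU a 1 Hz watchdog period does not fit in the 31 programmable bits, so the rate is raised to 2 Hz:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t cpu_khz = 3000000;             /* assume a 3 GHz CPU */
        unsigned int nmi_hz = 1;                /* desired watchdog rate */

        /* 3e9 cycles between NMIs will not fit in a 31-bit negative count... */
        uint64_t counter_val = cpu_khz * 1000ULL / nmi_hz;

        if (counter_val > 0x7fffffffULL)
                /* ...so raise the rate until one period fits in 31 bits. */
                nmi_hz = cpu_khz * 1000ULL / 0x7fffffffULL + 1;

        printf("counter_val=%llu -> nmi_hz=%u\n",
               (unsigned long long)counter_val, nmi_hz);
        return 0;
}
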
-
-static void write_watchdog_counter(unsigned int perfctr_msr,
- const char *descr, unsigned nmi_hz)
-{
- u64 count = (u64)cpu_khz * 1000;
-
- do_div(count, nmi_hz);
- if(descr)
- pr_debug("setting %s to -0x%08Lx\n", descr, count);
- wrmsrl(perfctr_msr, 0 - count);
-}
-
-static void write_watchdog_counter32(unsigned int perfctr_msr,
- const char *descr, unsigned nmi_hz)
-{
- u64 count = (u64)cpu_khz * 1000;
-
- do_div(count, nmi_hz);
- if(descr)
- pr_debug("setting %s to -0x%08Lx\n", descr, count);
- wrmsr(perfctr_msr, (u32)(-count), 0);
-}
-
-/*
- * AMD K7/K8/Family10h/Family11h support.
- * AMD keeps this interface nicely stable so there is not much variety
- */
-#define K7_EVNTSEL_ENABLE (1 << 22)
-#define K7_EVNTSEL_INT (1 << 20)
-#define K7_EVNTSEL_OS (1 << 17)
-#define K7_EVNTSEL_USR (1 << 16)
-#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
-#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
-
-static int setup_k7_watchdog(unsigned nmi_hz)
-{
- unsigned int perfctr_msr, evntsel_msr;
- unsigned int evntsel;
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
- perfctr_msr = wd_ops->perfctr;
- evntsel_msr = wd_ops->evntsel;
-
- wrmsrl(perfctr_msr, 0UL);
-
- evntsel = K7_EVNTSEL_INT
- | K7_EVNTSEL_OS
- | K7_EVNTSEL_USR
- | K7_NMI_EVENT;
-
- /* setup the timer */
- wrmsr(evntsel_msr, evntsel, 0);
- write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz);
-
- /* initialize the wd struct before enabling */
- wd->perfctr_msr = perfctr_msr;
- wd->evntsel_msr = evntsel_msr;
- wd->cccr_msr = 0; /* unused */
-
- /* ok, everything is initialized, announce that we're set */
- cpu_nmi_set_wd_enabled();
-
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- evntsel |= K7_EVNTSEL_ENABLE;
- wrmsr(evntsel_msr, evntsel, 0);
-
- return 1;
-}
-
-static void single_msr_stop_watchdog(void)
-{
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
- wrmsr(wd->evntsel_msr, 0, 0);
-}
-
-static int single_msr_reserve(void)
-{
- if (!reserve_perfctr_nmi(wd_ops->perfctr))
- return 0;
-
- if (!reserve_evntsel_nmi(wd_ops->evntsel)) {
- release_perfctr_nmi(wd_ops->perfctr);
- return 0;
- }
- return 1;
-}
-
-static void single_msr_unreserve(void)
-{
- release_evntsel_nmi(wd_ops->evntsel);
- release_perfctr_nmi(wd_ops->perfctr);
-}
-
-static void __kprobes
-single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
-{
- /* start the cycle over again */
- write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
-}
-
-static const struct wd_ops k7_wd_ops = {
- .reserve = single_msr_reserve,
- .unreserve = single_msr_unreserve,
- .setup = setup_k7_watchdog,
- .rearm = single_msr_rearm,
- .stop = single_msr_stop_watchdog,
- .perfctr = MSR_K7_PERFCTR0,
- .evntsel = MSR_K7_EVNTSEL0,
- .checkbit = 1ULL << 47,
-};
-
-/*
- * Intel Model 6 (PPro+,P2,P3,P-M,Core1)
- */
-#define P6_EVNTSEL0_ENABLE (1 << 22)
-#define P6_EVNTSEL_INT (1 << 20)
-#define P6_EVNTSEL_OS (1 << 17)
-#define P6_EVNTSEL_USR (1 << 16)
-#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79
-#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED
-
-static int setup_p6_watchdog(unsigned nmi_hz)
-{
- unsigned int perfctr_msr, evntsel_msr;
- unsigned int evntsel;
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
- perfctr_msr = wd_ops->perfctr;
- evntsel_msr = wd_ops->evntsel;
-
- /* KVM doesn't implement this MSR */
- if (wrmsr_safe(perfctr_msr, 0, 0) < 0)
- return 0;
-
- evntsel = P6_EVNTSEL_INT
- | P6_EVNTSEL_OS
- | P6_EVNTSEL_USR
- | P6_NMI_EVENT;
-
- /* setup the timer */
- wrmsr(evntsel_msr, evntsel, 0);
- nmi_hz = adjust_for_32bit_ctr(nmi_hz);
- write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz);
-
- /* initialize the wd struct before enabling */
- wd->perfctr_msr = perfctr_msr;
- wd->evntsel_msr = evntsel_msr;
- wd->cccr_msr = 0; /* unused */
-
- /* ok, everything is initialized, announce that we're set */
- cpu_nmi_set_wd_enabled();
-
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- evntsel |= P6_EVNTSEL0_ENABLE;
- wrmsr(evntsel_msr, evntsel, 0);
-
- return 1;
-}
-
-static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
-{
- /*
-	 * P6-based Pentium M needs to re-unmask
-	 * the apic vector but it doesn't hurt
-	 * other P6 variants.
-	 * ArchPerfmon/Core Duo also needs this
- */
- apic_write(APIC_LVTPC, APIC_DM_NMI);
-
- /* P6/ARCH_PERFMON has 32 bit counter write */
- write_watchdog_counter32(wd->perfctr_msr, NULL,nmi_hz);
-}
-
-static const struct wd_ops p6_wd_ops = {
- .reserve = single_msr_reserve,
- .unreserve = single_msr_unreserve,
- .setup = setup_p6_watchdog,
- .rearm = p6_rearm,
- .stop = single_msr_stop_watchdog,
- .perfctr = MSR_P6_PERFCTR0,
- .evntsel = MSR_P6_EVNTSEL0,
- .checkbit = 1ULL << 39,
-};
-
-/*
- * Intel P4 performance counters.
- * By far the most complicated of all.
- */
-#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1 << 7)
-#define P4_ESCR_EVENT_SELECT(N) ((N) << 25)
-#define P4_ESCR_OS (1 << 3)
-#define P4_ESCR_USR (1 << 2)
-#define P4_CCCR_OVF_PMI0 (1 << 26)
-#define P4_CCCR_OVF_PMI1 (1 << 27)
-#define P4_CCCR_THRESHOLD(N) ((N) << 20)
-#define P4_CCCR_COMPLEMENT (1 << 19)
-#define P4_CCCR_COMPARE (1 << 18)
-#define P4_CCCR_REQUIRED (3 << 16)
-#define P4_CCCR_ESCR_SELECT(N) ((N) << 13)
-#define P4_CCCR_ENABLE (1 << 12)
-#define P4_CCCR_OVF (1 << 31)
-
-#define P4_CONTROLS 18
-static unsigned int p4_controls[18] = {
- MSR_P4_BPU_CCCR0,
- MSR_P4_BPU_CCCR1,
- MSR_P4_BPU_CCCR2,
- MSR_P4_BPU_CCCR3,
- MSR_P4_MS_CCCR0,
- MSR_P4_MS_CCCR1,
- MSR_P4_MS_CCCR2,
- MSR_P4_MS_CCCR3,
- MSR_P4_FLAME_CCCR0,
- MSR_P4_FLAME_CCCR1,
- MSR_P4_FLAME_CCCR2,
- MSR_P4_FLAME_CCCR3,
- MSR_P4_IQ_CCCR0,
- MSR_P4_IQ_CCCR1,
- MSR_P4_IQ_CCCR2,
- MSR_P4_IQ_CCCR3,
- MSR_P4_IQ_CCCR4,
- MSR_P4_IQ_CCCR5,
-};
-/*
- * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
- * CRU_ESCR0 (with any non-null event selector) through a complemented
- * max threshold. [IA32-Vol3, Section 14.9.9]
- */
-static int setup_p4_watchdog(unsigned nmi_hz)
-{
- unsigned int perfctr_msr, evntsel_msr, cccr_msr;
- unsigned int evntsel, cccr_val;
- unsigned int misc_enable, dummy;
- unsigned int ht_num;
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
- rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
- if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
- return 0;
-
-#ifdef CONFIG_SMP
- /* detect which hyperthread we are on */
- if (smp_num_siblings == 2) {
- unsigned int ebx, apicid;
-
- ebx = cpuid_ebx(1);
- apicid = (ebx >> 24) & 0xff;
- ht_num = apicid & 1;
- } else
-#endif
- ht_num = 0;
-
- /*
- * performance counters are shared resources
- * assign each hyperthread its own set
- * (re-use the ESCR0 register, seems safe
- * and keeps the cccr_val the same)
- */
- if (!ht_num) {
- /* logical cpu 0 */
- perfctr_msr = MSR_P4_IQ_PERFCTR0;
- evntsel_msr = MSR_P4_CRU_ESCR0;
- cccr_msr = MSR_P4_IQ_CCCR0;
- cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
-
- /*
- * If we're on the kdump kernel or other situation, we may
- * still have other performance counter registers set to
- * interrupt and they'll keep interrupting forever because
- * of the P4_CCCR_OVF quirk. So we need to ACK all the
- * pending interrupts and disable all the registers here,
- * before reenabling the NMI delivery. Refer to p4_rearm()
- * about the P4_CCCR_OVF quirk.
- */
- if (reset_devices) {
- unsigned int low, high;
- int i;
-
- for (i = 0; i < P4_CONTROLS; i++) {
- rdmsr(p4_controls[i], low, high);
- low &= ~(P4_CCCR_ENABLE | P4_CCCR_OVF);
- wrmsr(p4_controls[i], low, high);
- }
- }
- } else {
- /* logical cpu 1 */
- perfctr_msr = MSR_P4_IQ_PERFCTR1;
- evntsel_msr = MSR_P4_CRU_ESCR0;
- cccr_msr = MSR_P4_IQ_CCCR1;
-
- /* Pentium 4 D processors don't support P4_CCCR_OVF_PMI1 */
- if (boot_cpu_data.x86_model == 4 && boot_cpu_data.x86_mask == 4)
- cccr_val = P4_CCCR_OVF_PMI0;
- else
- cccr_val = P4_CCCR_OVF_PMI1;
- cccr_val |= P4_CCCR_ESCR_SELECT(4);
- }
-
- evntsel = P4_ESCR_EVENT_SELECT(0x3F)
- | P4_ESCR_OS
- | P4_ESCR_USR;
-
- cccr_val |= P4_CCCR_THRESHOLD(15)
- | P4_CCCR_COMPLEMENT
- | P4_CCCR_COMPARE
- | P4_CCCR_REQUIRED;
-
- wrmsr(evntsel_msr, evntsel, 0);
- wrmsr(cccr_msr, cccr_val, 0);
- write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz);
-
- wd->perfctr_msr = perfctr_msr;
- wd->evntsel_msr = evntsel_msr;
- wd->cccr_msr = cccr_msr;
-
- /* ok, everything is initialized, announce that we're set */
- cpu_nmi_set_wd_enabled();
-
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- cccr_val |= P4_CCCR_ENABLE;
- wrmsr(cccr_msr, cccr_val, 0);
- return 1;
-}
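
For illustration, the CCCR value that setup_p4_watchdog() composes for logical CPU 0 can be reproduced stand-alone from the bit definitions above; the complemented threshold-15 compare is what makes the counter tick every cycle:

#include <stdio.h>

/* Same bit definitions as in the watchdog code above. */
#define P4_CCCR_OVF_PMI0        (1 << 26)
#define P4_CCCR_THRESHOLD(N)    ((N) << 20)
#define P4_CCCR_COMPLEMENT      (1 << 19)
#define P4_CCCR_COMPARE         (1 << 18)
#define P4_CCCR_REQUIRED        (3 << 16)
#define P4_CCCR_ESCR_SELECT(N)  ((N) << 13)
#define P4_CCCR_ENABLE          (1 << 12)

int main(void)
{
        unsigned int cccr_val = P4_CCCR_OVF_PMI0
                              | P4_CCCR_ESCR_SELECT(4)
                              | P4_CCCR_THRESHOLD(15)
                              | P4_CCCR_COMPLEMENT
                              | P4_CCCR_COMPARE
                              | P4_CCCR_REQUIRED
                              | P4_CCCR_ENABLE;

        printf("IQ_CCCR0 = 0x%08x\n", cccr_val);
        return 0;
}
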
-
-static void stop_p4_watchdog(void)
-{
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
- wrmsr(wd->cccr_msr, 0, 0);
- wrmsr(wd->evntsel_msr, 0, 0);
-}
-
-static int p4_reserve(void)
-{
- if (!reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR0))
- return 0;
-#ifdef CONFIG_SMP
- if (smp_num_siblings > 1 && !reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR1))
- goto fail1;
-#endif
- if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0))
- goto fail2;
- /* RED-PEN why is ESCR1 not reserved here? */
- return 1;
- fail2:
-#ifdef CONFIG_SMP
- if (smp_num_siblings > 1)
- release_perfctr_nmi(MSR_P4_IQ_PERFCTR1);
- fail1:
-#endif
- release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
- return 0;
-}
-
-static void p4_unreserve(void)
-{
-#ifdef CONFIG_SMP
- if (smp_num_siblings > 1)
- release_perfctr_nmi(MSR_P4_IQ_PERFCTR1);
-#endif
- release_evntsel_nmi(MSR_P4_CRU_ESCR0);
- release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
-}
-
-static void __kprobes p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
-{
- unsigned dummy;
- /*
- * P4 quirks:
- * - An overflown perfctr will assert its interrupt
- * until the OVF flag in its CCCR is cleared.
- * - LVTPC is masked on interrupt and must be
- * unmasked by the LVTPC handler.
- */
- rdmsrl(wd->cccr_msr, dummy);
- dummy &= ~P4_CCCR_OVF;
- wrmsrl(wd->cccr_msr, dummy);
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- /* start the cycle over again */
- write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
-}
-
-static const struct wd_ops p4_wd_ops = {
- .reserve = p4_reserve,
- .unreserve = p4_unreserve,
- .setup = setup_p4_watchdog,
- .rearm = p4_rearm,
- .stop = stop_p4_watchdog,
- /* RED-PEN this is wrong for the other sibling */
- .perfctr = MSR_P4_BPU_PERFCTR0,
- .evntsel = MSR_P4_BSU_ESCR0,
- .checkbit = 1ULL << 39,
-};
-
-/*
- * Watchdog using the Intel architected PerfMon.
- * Used for Core2 and hopefully all future Intel CPUs.
- */
-#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
-#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
-
-static struct wd_ops intel_arch_wd_ops;
-
-static int setup_intel_arch_watchdog(unsigned nmi_hz)
-{
- unsigned int ebx;
- union cpuid10_eax eax;
- unsigned int unused;
- unsigned int perfctr_msr, evntsel_msr;
- unsigned int evntsel;
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
- /*
- * Check whether the Architectural PerfMon supports
- * Unhalted Core Cycles Event or not.
- * NOTE: Corresponding bit = 0 in ebx indicates event present.
- */
- cpuid(10, &(eax.full), &ebx, &unused, &unused);
- if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
- (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
- return 0;
-
- perfctr_msr = wd_ops->perfctr;
- evntsel_msr = wd_ops->evntsel;
-
- wrmsrl(perfctr_msr, 0UL);
-
- evntsel = ARCH_PERFMON_EVENTSEL_INT
- | ARCH_PERFMON_EVENTSEL_OS
- | ARCH_PERFMON_EVENTSEL_USR
- | ARCH_PERFMON_NMI_EVENT_SEL
- | ARCH_PERFMON_NMI_EVENT_UMASK;
-
- /* setup the timer */
- wrmsr(evntsel_msr, evntsel, 0);
- nmi_hz = adjust_for_32bit_ctr(nmi_hz);
- write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz);
-
- wd->perfctr_msr = perfctr_msr;
- wd->evntsel_msr = evntsel_msr;
- wd->cccr_msr = 0; /* unused */
-
- /* ok, everything is initialized, announce that we're set */
- cpu_nmi_set_wd_enabled();
-
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
- wrmsr(evntsel_msr, evntsel, 0);
- intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
- return 1;
-}
-
-static struct wd_ops intel_arch_wd_ops __read_mostly = {
- .reserve = single_msr_reserve,
- .unreserve = single_msr_unreserve,
- .setup = setup_intel_arch_watchdog,
- .rearm = p6_rearm,
- .stop = single_msr_stop_watchdog,
- .perfctr = MSR_ARCH_PERFMON_PERFCTR1,
- .evntsel = MSR_ARCH_PERFMON_EVENTSEL1,
-};
-
-static void probe_nmi_watchdog(void)
-{
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_AMD:
- if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 &&
- boot_cpu_data.x86 != 16)
- return;
- wd_ops = &k7_wd_ops;
- break;
- case X86_VENDOR_INTEL:
- /* Work around where perfctr1 doesn't have a working enable
- * bit as described in the following errata:
- * AE49 Core Duo and Intel Core Solo 65 nm
- * AN49 Intel Pentium Dual-Core
- * AF49 Dual-Core Intel Xeon Processor LV
- */
- if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) ||
- ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 15 &&
- boot_cpu_data.x86_mask == 4))) {
- intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0;
- intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0;
- }
- if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
- wd_ops = &intel_arch_wd_ops;
- break;
- }
- switch (boot_cpu_data.x86) {
- case 6:
- if (boot_cpu_data.x86_model > 13)
- return;
-
- wd_ops = &p6_wd_ops;
- break;
- case 15:
- wd_ops = &p4_wd_ops;
- break;
- default:
- return;
- }
- break;
- }
-}
-
-/* Interface to nmi.c */
-
-int lapic_watchdog_init(unsigned nmi_hz)
-{
- if (!wd_ops) {
- probe_nmi_watchdog();
- if (!wd_ops) {
- printk(KERN_INFO "NMI watchdog: CPU not supported\n");
- return -1;
- }
-
- if (!wd_ops->reserve()) {
- printk(KERN_ERR
- "NMI watchdog: cannot reserve perfctrs\n");
- return -1;
- }
- }
-
- if (!(wd_ops->setup(nmi_hz))) {
- printk(KERN_ERR "Cannot setup NMI watchdog on CPU %d\n",
- raw_smp_processor_id());
- return -1;
- }
-
- return 0;
-}
-
-void lapic_watchdog_stop(void)
-{
- if (wd_ops)
- wd_ops->stop();
-}
-
-unsigned lapic_adjust_nmi_hz(unsigned hz)
-{
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
- if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
- wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1)
- hz = adjust_for_32bit_ctr(hz);
- return hz;
-}
-
-int __kprobes lapic_wd_event(unsigned nmi_hz)
-{
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
- u64 ctr;
-
- rdmsrl(wd->perfctr_msr, ctr);
- if (ctr & wd_ops->checkbit) /* perfctr still running? */
- return 0;
-
- wd_ops->rearm(wd, nmi_hz);
- return 1;
-}
diff --git a/arch/x86/kernel/cpu/powerflags.c b/arch/x86/kernel/cpu/powerflags.c
index 5abbea297e0c..fd6ec2aa0303 100644
--- a/arch/x86/kernel/cpu/powerflags.c
+++ b/arch/x86/kernel/cpu/powerflags.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Strings for the various x86 power flags
*
@@ -11,10 +12,13 @@ const char *const x86_power_flags[32] = {
"fid", /* frequency id control */
"vid", /* voltage id control */
"ttp", /* thermal trip */
- "tm",
- "stc",
- "100mhzsteps",
- "hwpstate",
+ "tm", /* hardware thermal control */
+ "stc", /* software thermal control */
+ "100mhzsteps", /* 100 MHz multiplier control */
+ "hwpstate", /* hardware P-state control */
"", /* tsc invariant mapped to constant_tsc */
- /* nothing */
+ "cpb", /* core performance boost */
+ "eff_freq_ro", /* Readonly aperf/mperf */
+ "proc_feedback", /* processor feedback interface */
+ "acc_power", /* accumulated power mechanism */
};
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index d5e30397246b..6571d432cbe3 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -1,8 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/smp.h>
#include <linux/timex.h>
#include <linux/string.h>
#include <linux/seq_file.h>
#include <linux/cpufreq.h>
+#include <asm/prctl.h>
+#include <linux/proc_fs.h>
+
+#include "cpu.h"
+
+#ifdef CONFIG_X86_VMX_FEATURE_NAMES
+extern const char * const x86_vmx_flags[NVMXINTS*32];
+#endif
/*
* Get CPU information for use by the procfs.
@@ -11,43 +20,33 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
unsigned int cpu)
{
#ifdef CONFIG_SMP
- if (c->x86_max_cores * smp_num_siblings > 1) {
- seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
- seq_printf(m, "siblings\t: %d\n",
- cpumask_weight(cpu_core_mask(cpu)));
- seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
- seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
- seq_printf(m, "apicid\t\t: %d\n", c->apicid);
- seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid);
- }
+ seq_printf(m, "physical id\t: %d\n", c->topo.pkg_id);
+ seq_printf(m, "siblings\t: %d\n",
+ cpumask_weight(topology_core_cpumask(cpu)));
+ seq_printf(m, "core id\t\t: %d\n", c->topo.core_id);
+ seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
+ seq_printf(m, "apicid\t\t: %d\n", c->topo.apicid);
+ seq_printf(m, "initial apicid\t: %d\n", c->topo.initial_apicid);
#endif
}
#ifdef CONFIG_X86_32
static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
{
- /*
- * We use exception 16 if we have hardware math and we've either seen
- * it or the CPU claims it is internal
- */
- int fpu_exception = c->hard_math && (ignore_fpu_irq || cpu_has_fpu);
seq_printf(m,
"fdiv_bug\t: %s\n"
- "hlt_bug\t\t: %s\n"
"f00f_bug\t: %s\n"
"coma_bug\t: %s\n"
"fpu\t\t: %s\n"
"fpu_exception\t: %s\n"
"cpuid level\t: %d\n"
- "wp\t\t: %s\n",
- c->fdiv_bug ? "yes" : "no",
- c->hlt_works_ok ? "no" : "yes",
- c->f00f_bug ? "yes" : "no",
- c->coma_bug ? "yes" : "no",
- c->hard_math ? "yes" : "no",
- fpu_exception ? "yes" : "no",
- c->cpuid_level,
- c->wp_works_ok ? "yes" : "no");
+ "wp\t\t: yes\n",
+ str_yes_no(boot_cpu_has_bug(X86_BUG_FDIV)),
+ str_yes_no(boot_cpu_has_bug(X86_BUG_F00F)),
+ str_yes_no(boot_cpu_has_bug(X86_BUG_COMA)),
+ str_yes_no(boot_cpu_has(X86_FEATURE_FPU)),
+ str_yes_no(boot_cpu_has(X86_FEATURE_FPU)),
+ c->cpuid_level);
}
#else
static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
@@ -64,12 +63,10 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
static int show_cpuinfo(struct seq_file *m, void *v)
{
struct cpuinfo_x86 *c = v;
- unsigned int cpu = 0;
+ unsigned int cpu;
int i;
-#ifdef CONFIG_SMP
cpu = c->cpu_index;
-#endif
seq_printf(m, "processor\t: %u\n"
"vendor_id\t: %s\n"
"cpu family\t: %d\n"
@@ -81,32 +78,53 @@ static int show_cpuinfo(struct seq_file *m, void *v)
c->x86_model,
c->x86_model_id[0] ? c->x86_model_id : "unknown");
- if (c->x86_mask || c->cpuid_level >= 0)
- seq_printf(m, "stepping\t: %d\n", c->x86_mask);
+ if (c->x86_stepping || c->cpuid_level >= 0)
+ seq_printf(m, "stepping\t: %d\n", c->x86_stepping);
else
- seq_printf(m, "stepping\t: unknown\n");
+ seq_puts(m, "stepping\t: unknown\n");
+ if (c->microcode)
+ seq_printf(m, "microcode\t: 0x%x\n", c->microcode);
if (cpu_has(c, X86_FEATURE_TSC)) {
- unsigned int freq = cpufreq_quick_get(cpu);
+ int freq = arch_freq_get_on_cpu(cpu);
- if (!freq)
- freq = cpu_khz;
- seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
- freq / 1000, (freq % 1000));
+ if (freq < 0)
+ seq_puts(m, "cpu MHz\t\t: Unknown\n");
+ else
+ seq_printf(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, (freq % 1000));
}
/* Cache size */
- if (c->x86_cache_size >= 0)
- seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
+ if (c->x86_cache_size)
+ seq_printf(m, "cache size\t: %u KB\n", c->x86_cache_size);
show_cpuinfo_core(m, c, cpu);
show_cpuinfo_misc(m, c);
- seq_printf(m, "flags\t\t:");
+ seq_puts(m, "flags\t\t:");
for (i = 0; i < 32*NCAPINTS; i++)
if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
seq_printf(m, " %s", x86_cap_flags[i]);
+#ifdef CONFIG_X86_VMX_FEATURE_NAMES
+ if (cpu_has(c, X86_FEATURE_VMX) && c->vmx_capability[0]) {
+ seq_puts(m, "\nvmx flags\t:");
+ for (i = 0; i < 32*NVMXINTS; i++) {
+ if (test_bit(i, (unsigned long *)c->vmx_capability) &&
+ x86_vmx_flags[i] != NULL)
+ seq_printf(m, " %s", x86_vmx_flags[i]);
+ }
+ }
+#endif
+
+ seq_puts(m, "\nbugs\t\t:");
+ for (i = 0; i < 32*NBUGINTS; i++) {
+ unsigned int bug_bit = 32*NCAPINTS + i;
+
+ if (cpu_has_bug(c, bug_bit) && x86_bug_flags[i])
+ seq_printf(m, " %s", x86_bug_flags[i]);
+ }
+
seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
c->loops_per_jiffy/(500000/HZ),
(c->loops_per_jiffy/(5000/HZ)) % 100);
@@ -116,36 +134,31 @@ static int show_cpuinfo(struct seq_file *m, void *v)
seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
#endif
seq_printf(m, "clflush size\t: %u\n", c->x86_clflush_size);
-#ifdef CONFIG_X86_64
seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
c->x86_phys_bits, c->x86_virt_bits);
-#endif
- seq_printf(m, "power management:");
+ seq_puts(m, "power management:");
for (i = 0; i < 32; i++) {
if (c->x86_power & (1 << i)) {
if (i < ARRAY_SIZE(x86_power_flags) &&
x86_power_flags[i])
seq_printf(m, "%s%s",
- x86_power_flags[i][0]?" ":"",
+ x86_power_flags[i][0] ? " " : "",
x86_power_flags[i]);
else
seq_printf(m, " [%d]", i);
}
}
- seq_printf(m, "\n\n");
+ seq_puts(m, "\n\n");
return 0;
}
static void *c_start(struct seq_file *m, loff_t *pos)
{
- if (*pos == 0) /* just in case, cpu 0 is not the first */
- *pos = cpumask_first(cpu_online_mask);
- else
- *pos = cpumask_next(*pos - 1, cpu_online_mask);
+ *pos = cpumask_next(*pos - 1, cpu_online_mask);
if ((*pos) < nr_cpu_ids)
return &cpu_data(*pos);
return NULL;
@@ -167,3 +180,24 @@ const struct seq_operations cpuinfo_op = {
.stop = c_stop,
.show = show_cpuinfo,
};
+
+#ifdef CONFIG_X86_USER_SHADOW_STACK
+static void dump_x86_features(struct seq_file *m, unsigned long features)
+{
+ if (features & ARCH_SHSTK_SHSTK)
+ seq_puts(m, "shstk ");
+ if (features & ARCH_SHSTK_WRSS)
+ seq_puts(m, "wrss ");
+}
+
+void arch_proc_pid_thread_features(struct seq_file *m, struct task_struct *task)
+{
+ seq_puts(m, "x86_Thread_features:\t");
+ dump_x86_features(m, task->thread.features);
+ seq_putc(m, '\n');
+
+ seq_puts(m, "x86_Thread_features_locked:\t");
+ dump_x86_features(m, task->thread.features_locked);
+ seq_putc(m, '\n');
+}
+#endif /* CONFIG_X86_USER_SHADOW_STACK */
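
For reference, the "cpu MHz" line in show_cpuinfo() above formats a kHz reading with integer arithmetic only; a trivial sketch (hypothetical value, not from the patch) showing 2400123 kHz rendering as 2400.123:

#include <stdio.h>

int main(void)
{
	unsigned int freq = 2400123;	/* kHz; hypothetical reading */

	/* Same formatting as the seq_printf() above. */
	printf("cpu MHz\t\t: %u.%03u\n", freq / 1000, freq % 1000);
	return 0;
}
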
diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c
new file mode 100644
index 000000000000..eeac00d20926
--- /dev/null
+++ b/arch/x86/kernel/cpu/rdrand.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * This file is part of the Linux kernel.
+ *
+ * Copyright (c) 2011, Intel Corporation
+ * Authors: Fenghua Yu <fenghua.yu@intel.com>,
+ * H. Peter Anvin <hpa@linux.intel.com>
+ */
+#include <linux/printk.h>
+
+#include <asm/processor.h>
+#include <asm/archrandom.h>
+#include <asm/sections.h>
+
+/*
+ * RDRAND has Built-In-Self-Test (BIST) that runs on every invocation.
+ * Run the instruction a few times as a sanity check. Also make sure
+ * it's not outputting the same value over and over, which has happened
+ * as a result of past CPU bugs.
+ *
+ * If it fails, it is simple to disable RDRAND and RDSEED here.
+ */
+
+void x86_init_rdrand(struct cpuinfo_x86 *c)
+{
+ enum { SAMPLES = 8, MIN_CHANGE = 5 };
+ unsigned long sample, prev;
+ bool failure = false;
+ size_t i, changed;
+
+ if (!cpu_has(c, X86_FEATURE_RDRAND))
+ return;
+
+ for (changed = 0, i = 0; i < SAMPLES; ++i) {
+ if (!rdrand_long(&sample)) {
+ failure = true;
+ break;
+ }
+ changed += i && sample != prev;
+ prev = sample;
+ }
+ if (changed < MIN_CHANGE)
+ failure = true;
+
+ if (failure) {
+ clear_cpu_cap(c, X86_FEATURE_RDRAND);
+ clear_cpu_cap(c, X86_FEATURE_RDSEED);
+ pr_emerg("RDRAND is not reliable on this platform; disabling.\n");
+ }
+}
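
A rough user-space analogue of this self-test (a sketch, not from the patch) can be built on the compiler's _rdrand64_step() intrinsic from immintrin.h, compiled with -mrdrnd; unlike the kernel's rdrand_long(), the raw intrinsic does not retry on its own:

#include <stdio.h>
#include <stdbool.h>
#include <immintrin.h>		/* _rdrand64_step(); compile with -mrdrnd */

int main(void)
{
	enum { SAMPLES = 8, MIN_CHANGE = 5 };
	unsigned long long sample, prev = 0;
	bool failure = false;
	int i, changed = 0;

	for (i = 0; i < SAMPLES; i++) {
		if (!_rdrand64_step(&sample)) {	/* 0 means no random value was ready */
			failure = true;
			break;
		}
		changed += i && sample != prev;
		prev = sample;
	}
	if (changed < MIN_CHANGE)
		failure = true;

	printf("RDRAND looks %s\n", failure ? "unreliable" : "sane");
	return failure ? 1 : 0;
}
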
diff --git a/arch/x86/kernel/cpu/resctrl/Makefile b/arch/x86/kernel/cpu/resctrl/Makefile
new file mode 100644
index 000000000000..d8a04b195da2
--- /dev/null
+++ b/arch/x86/kernel/cpu/resctrl/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_X86_CPU_RESCTRL) += core.o rdtgroup.o monitor.o
+obj-$(CONFIG_X86_CPU_RESCTRL) += ctrlmondata.o
+obj-$(CONFIG_RESCTRL_FS_PSEUDO_LOCK) += pseudo_lock.o
+
+# To allow define_trace.h's recursive include:
+CFLAGS_pseudo_lock.o = -I$(src)
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
new file mode 100644
index 000000000000..3792ab4819dc
--- /dev/null
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -0,0 +1,1079 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Resource Director Technology(RDT)
+ * - Cache Allocation code.
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Authors:
+ * Fenghua Yu <fenghua.yu@intel.com>
+ * Tony Luck <tony.luck@intel.com>
+ * Vikas Shivappa <vikas.shivappa@intel.com>
+ *
+ * More information about RDT can be found in the Intel (R) x86 Architecture
+ * Software Developer Manual June 2016, volume 3, section 17.17.
+ */
+
+#define pr_fmt(fmt) "resctrl: " fmt
+
+#include <linux/cpu.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/cpuhotplug.h>
+
+#include <asm/cpu_device_id.h>
+#include <asm/msr.h>
+#include <asm/resctrl.h>
+#include "internal.h"
+
+/*
+ * rdt_domain structures are kfree()d when their last CPU goes offline,
+ * and allocated when the first CPU in a new domain comes online.
+ * The rdt_resource's domain list is updated when this happens. Readers of
+ * the domain list must either take cpus_read_lock(), or rely on an RCU
+ * read-side critical section, to avoid observing concurrent modification.
+ * All writers take this mutex:
+ */
+static DEFINE_MUTEX(domain_list_lock);
+
+/*
+ * The cached resctrl_pqr_state is strictly per CPU and can never be
+ * updated from a remote CPU. Functions which modify the state
+ * are called with interrupts disabled and no preemption, which
+ * is sufficient for the protection.
+ */
+DEFINE_PER_CPU(struct resctrl_pqr_state, pqr_state);
+
+/*
+ * Global boolean for rdt_alloc which is true if any
+ * resource allocation is enabled.
+ */
+bool rdt_alloc_capable;
+
+static void mba_wrmsr_intel(struct msr_param *m);
+static void cat_wrmsr(struct msr_param *m);
+static void mba_wrmsr_amd(struct msr_param *m);
+
+#define ctrl_domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.ctrl_domains)
+#define mon_domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.mon_domains)
+
+struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = {
+ [RDT_RESOURCE_L3] =
+ {
+ .r_resctrl = {
+ .name = "L3",
+ .ctrl_scope = RESCTRL_L3_CACHE,
+ .mon_scope = RESCTRL_L3_CACHE,
+ .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_L3),
+ .mon_domains = mon_domain_init(RDT_RESOURCE_L3),
+ .schema_fmt = RESCTRL_SCHEMA_BITMAP,
+ },
+ .msr_base = MSR_IA32_L3_CBM_BASE,
+ .msr_update = cat_wrmsr,
+ },
+ [RDT_RESOURCE_L2] =
+ {
+ .r_resctrl = {
+ .name = "L2",
+ .ctrl_scope = RESCTRL_L2_CACHE,
+ .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_L2),
+ .schema_fmt = RESCTRL_SCHEMA_BITMAP,
+ },
+ .msr_base = MSR_IA32_L2_CBM_BASE,
+ .msr_update = cat_wrmsr,
+ },
+ [RDT_RESOURCE_MBA] =
+ {
+ .r_resctrl = {
+ .name = "MB",
+ .ctrl_scope = RESCTRL_L3_CACHE,
+ .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_MBA),
+ .schema_fmt = RESCTRL_SCHEMA_RANGE,
+ },
+ },
+ [RDT_RESOURCE_SMBA] =
+ {
+ .r_resctrl = {
+ .name = "SMBA",
+ .ctrl_scope = RESCTRL_L3_CACHE,
+ .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_SMBA),
+ .schema_fmt = RESCTRL_SCHEMA_RANGE,
+ },
+ },
+};
+
+u32 resctrl_arch_system_num_rmid_idx(void)
+{
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+
+ /* RMIDs are independent numbers for x86. num_rmid_idx == num_rmid */
+ return r->mon.num_rmid;
+}
+
+struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l)
+{
+ if (l >= RDT_NUM_RESOURCES)
+ return NULL;
+
+ return &rdt_resources_all[l].r_resctrl;
+}
+
+/*
+ * cache_alloc_hsw_probe() - Have to probe for Intel Haswell server CPUs
+ * as they do not have CPUID enumeration support for Cache allocation.
+ * The check for Vendor/Family/Model is not enough to guarantee that
+ * the MSRs won't #GP fault because only the following SKUs support
+ * CAT:
+ * Intel(R) Xeon(R) CPU E5-2658 v3 @ 2.20GHz
+ * Intel(R) Xeon(R) CPU E5-2648L v3 @ 1.80GHz
+ * Intel(R) Xeon(R) CPU E5-2628L v3 @ 2.00GHz
+ * Intel(R) Xeon(R) CPU E5-2618L v3 @ 2.30GHz
+ * Intel(R) Xeon(R) CPU E5-2608L v3 @ 2.00GHz
+ * Intel(R) Xeon(R) CPU E5-2658A v3 @ 2.20GHz
+ *
+ * Probe by trying to write the first of the L3 cache mask registers
+ * and checking that the bits stick. Max CLOSids is always 4 and max cbm length
+ * is always 20 on hsw server parts. The minimum cache bitmask length
+ * allowed for HSW server is always 2 bits. Hardcode all of them.
+ */
+static inline void cache_alloc_hsw_probe(void)
+{
+ struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_L3];
+ struct rdt_resource *r = &hw_res->r_resctrl;
+ u64 max_cbm = BIT_ULL_MASK(20) - 1, l3_cbm_0;
+
+ if (wrmsrq_safe(MSR_IA32_L3_CBM_BASE, max_cbm))
+ return;
+
+ rdmsrq(MSR_IA32_L3_CBM_BASE, l3_cbm_0);
+
+ /* If all the bits were set in MSR, return success */
+ if (l3_cbm_0 != max_cbm)
+ return;
+
+ hw_res->num_closid = 4;
+ r->cache.cbm_len = 20;
+ r->cache.shareable_bits = 0xc0000;
+ r->cache.min_cbm_bits = 2;
+ r->cache.arch_has_sparse_bitmasks = false;
+ r->alloc_capable = true;
+
+ rdt_alloc_capable = true;
+}
+
+/*
+ * rdt_get_mb_table() - get a mapping of bandwidth (b/w) percentage values
+ * exposed to user interface and the h/w understandable delay values.
+ *
+ * The non-linear delay values have the granularity of power of two
+ * and also the h/w does not guarantee a curve for configured delay
+ * values vs. actual b/w enforced.
+ * Hence we need a mapping that is pre-calibrated so the user can
+ * express the memory b/w as a percentage value.
+ */
+static inline bool rdt_get_mb_table(struct rdt_resource *r)
+{
+ /*
+ * There are currently no Intel SKUs that support non-linear delay.
+ */
+ pr_info("MBA b/w map not implemented for cpu:%d, model:%d",
+ boot_cpu_data.x86, boot_cpu_data.x86_model);
+
+ return false;
+}
+
+static __init bool __get_mem_config_intel(struct rdt_resource *r)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ union cpuid_0x10_3_eax eax;
+ union cpuid_0x10_x_edx edx;
+ u32 ebx, ecx, max_delay;
+
+ cpuid_count(0x00000010, 3, &eax.full, &ebx, &ecx, &edx.full);
+ hw_res->num_closid = edx.split.cos_max + 1;
+ max_delay = eax.split.max_delay + 1;
+ r->membw.max_bw = MAX_MBA_BW;
+ r->membw.arch_needs_linear = true;
+ if (ecx & MBA_IS_LINEAR) {
+ r->membw.delay_linear = true;
+ r->membw.min_bw = MAX_MBA_BW - max_delay;
+ r->membw.bw_gran = MAX_MBA_BW - max_delay;
+ } else {
+ if (!rdt_get_mb_table(r))
+ return false;
+ r->membw.arch_needs_linear = false;
+ }
+
+ if (boot_cpu_has(X86_FEATURE_PER_THREAD_MBA))
+ r->membw.throttle_mode = THREAD_THROTTLE_PER_THREAD;
+ else
+ r->membw.throttle_mode = THREAD_THROTTLE_MAX;
+
+ r->alloc_capable = true;
+
+ return true;
+}
+
+static __init bool __rdt_get_mem_config_amd(struct rdt_resource *r)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ u32 eax, ebx, ecx, edx, subleaf;
+
+ /*
+ * Query CPUID_Fn80000020_EDX_x01 for MBA and
+ * CPUID_Fn80000020_EDX_x02 for SMBA
+ */
+ subleaf = (r->rid == RDT_RESOURCE_SMBA) ? 2 : 1;
+
+ cpuid_count(0x80000020, subleaf, &eax, &ebx, &ecx, &edx);
+ hw_res->num_closid = edx + 1;
+ r->membw.max_bw = 1 << eax;
+
+ /* AMD does not use delay */
+ r->membw.delay_linear = false;
+ r->membw.arch_needs_linear = false;
+
+ /*
+ * AMD does not use memory delay throttle model to control
+ * the allocation like Intel does.
+ */
+ r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED;
+ r->membw.min_bw = 0;
+ r->membw.bw_gran = 1;
+
+ r->alloc_capable = true;
+
+ return true;
+}
+
+static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ union cpuid_0x10_1_eax eax;
+ union cpuid_0x10_x_ecx ecx;
+ union cpuid_0x10_x_edx edx;
+ u32 ebx, default_ctrl;
+
+ cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx.full, &edx.full);
+ hw_res->num_closid = edx.split.cos_max + 1;
+ r->cache.cbm_len = eax.split.cbm_len + 1;
+ default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1;
+ r->cache.shareable_bits = ebx & default_ctrl;
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ r->cache.arch_has_sparse_bitmasks = ecx.split.noncont;
+ r->alloc_capable = true;
+}
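
As a worked example: if CPUID.0x10 subleaf 1 reports a cbm_len field of 19, the capacity bitmask is 20 bits wide and the default control value is 0xfffff; an EBX of 0xc0000 would then record bits 19 and 18 as shareable_bits.
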
+
+static void rdt_get_cdp_config(int level)
+{
+ /*
+ * By default, CDP is disabled. CDP can be enabled by mount parameter
+ * "cdp" during resctrl file system mount time.
+ */
+ rdt_resources_all[level].cdp_enabled = false;
+ rdt_resources_all[level].r_resctrl.cdp_capable = true;
+}
+
+static void rdt_set_io_alloc_capable(struct rdt_resource *r)
+{
+ r->cache.io_alloc_capable = true;
+}
+
+static void rdt_get_cdp_l3_config(void)
+{
+ rdt_get_cdp_config(RDT_RESOURCE_L3);
+}
+
+static void rdt_get_cdp_l2_config(void)
+{
+ rdt_get_cdp_config(RDT_RESOURCE_L2);
+}
+
+static void mba_wrmsr_amd(struct msr_param *m)
+{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
+ unsigned int i;
+
+ for (i = m->low; i < m->high; i++)
+ wrmsrq(hw_res->msr_base + i, hw_dom->ctrl_val[i]);
+}
+
+/*
+ * Map the memory b/w percentage value to delay values
+ * that can be written to QOS_MSRs.
+ * There are currently no SKUs which support non-linear delay values.
+ */
+static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
+{
+ if (r->membw.delay_linear)
+ return MAX_MBA_BW - bw;
+
+ pr_warn_once("Non Linear delay-bw map not supported but queried\n");
+ return MAX_MBA_BW;
+}
+
+static void mba_wrmsr_intel(struct msr_param *m)
+{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
+ unsigned int i;
+
+ /* Write the delay values for mba. */
+ for (i = m->low; i < m->high; i++)
+ wrmsrq(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], m->res));
+}
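
As a worked example (assuming MAX_MBA_BW is 100): on a linear-delay SKU, a schemata request of 70% bandwidth is mapped by delay_bw_map() to a throttle value of 100 - 70 = 30 and written to MSR_IA32_MBA_THRTL_BASE + closid by mba_wrmsr_intel(); requesting 100% writes 0, i.e. no throttling.
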
+
+static void cat_wrmsr(struct msr_param *m)
+{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
+ unsigned int i;
+
+ for (i = m->low; i < m->high; i++)
+ wrmsrq(hw_res->msr_base + i, hw_dom->ctrl_val[i]);
+}
+
+u32 resctrl_arch_get_num_closid(struct rdt_resource *r)
+{
+ return resctrl_to_arch_res(r)->num_closid;
+}
+
+void rdt_ctrl_update(void *arg)
+{
+ struct rdt_hw_resource *hw_res;
+ struct msr_param *m = arg;
+
+ hw_res = resctrl_to_arch_res(m->res);
+ hw_res->msr_update(m);
+}
+
+static void setup_default_ctrlval(struct rdt_resource *r, u32 *dc)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ int i;
+
+ /*
+ * Initialize the Control MSRs to having no control.
+ * For Cache Allocation: Set all bits in cbm
+ * For Memory Allocation: Set b/w requested to 100%
+ */
+ for (i = 0; i < hw_res->num_closid; i++, dc++)
+ *dc = resctrl_get_default_ctrl(r);
+}
+
+static void ctrl_domain_free(struct rdt_hw_ctrl_domain *hw_dom)
+{
+ kfree(hw_dom->ctrl_val);
+ kfree(hw_dom);
+}
+
+static void mon_domain_free(struct rdt_hw_mon_domain *hw_dom)
+{
+ int idx;
+
+ for_each_mbm_idx(idx)
+ kfree(hw_dom->arch_mbm_states[idx]);
+ kfree(hw_dom);
+}
+
+static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_ctrl_domain *d)
+{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ struct msr_param m;
+ u32 *dc;
+
+ dc = kmalloc_array(hw_res->num_closid, sizeof(*hw_dom->ctrl_val),
+ GFP_KERNEL);
+ if (!dc)
+ return -ENOMEM;
+
+ hw_dom->ctrl_val = dc;
+ setup_default_ctrlval(r, dc);
+
+ m.res = r;
+ m.dom = d;
+ m.low = 0;
+ m.high = hw_res->num_closid;
+ hw_res->msr_update(&m);
+ return 0;
+}
+
+/**
+ * arch_domain_mbm_alloc() - Allocate arch private storage for the MBM counters
+ * @num_rmid: The size of the MBM counter array
+ * @hw_dom: The domain that owns the allocated arrays
+ */
+static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_mon_domain *hw_dom)
+{
+ size_t tsize = sizeof(*hw_dom->arch_mbm_states[0]);
+ enum resctrl_event_id eventid;
+ int idx;
+
+ for_each_mbm_event_id(eventid) {
+ if (!resctrl_is_mon_event_enabled(eventid))
+ continue;
+ idx = MBM_STATE_IDX(eventid);
+ hw_dom->arch_mbm_states[idx] = kcalloc(num_rmid, tsize, GFP_KERNEL);
+ if (!hw_dom->arch_mbm_states[idx])
+ goto cleanup;
+ }
+
+ return 0;
+cleanup:
+ for_each_mbm_idx(idx) {
+ kfree(hw_dom->arch_mbm_states[idx]);
+ hw_dom->arch_mbm_states[idx] = NULL;
+ }
+
+ return -ENOMEM;
+}
+
+static int get_domain_id_from_scope(int cpu, enum resctrl_scope scope)
+{
+ switch (scope) {
+ case RESCTRL_L2_CACHE:
+ case RESCTRL_L3_CACHE:
+ return get_cpu_cacheinfo_id(cpu, scope);
+ case RESCTRL_L3_NODE:
+ return cpu_to_node(cpu);
+ default:
+ break;
+ }
+
+ return -EINVAL;
+}
+
+static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r)
+{
+ int id = get_domain_id_from_scope(cpu, r->ctrl_scope);
+ struct rdt_hw_ctrl_domain *hw_dom;
+ struct list_head *add_pos = NULL;
+ struct rdt_domain_hdr *hdr;
+ struct rdt_ctrl_domain *d;
+ int err;
+
+ lockdep_assert_held(&domain_list_lock);
+
+ if (id < 0) {
+ pr_warn_once("Can't find control domain id for CPU:%d scope:%d for resource %s\n",
+ cpu, r->ctrl_scope, r->name);
+ return;
+ }
+
+ hdr = resctrl_find_domain(&r->ctrl_domains, id, &add_pos);
+ if (hdr) {
+ if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN))
+ return;
+ d = container_of(hdr, struct rdt_ctrl_domain, hdr);
+
+ cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
+ if (r->cache.arch_has_per_cpu_cfg)
+ rdt_domain_reconfigure_cdp(r);
+ return;
+ }
+
+ hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu));
+ if (!hw_dom)
+ return;
+
+ d = &hw_dom->d_resctrl;
+ d->hdr.id = id;
+ d->hdr.type = RESCTRL_CTRL_DOMAIN;
+ cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
+
+ rdt_domain_reconfigure_cdp(r);
+
+ if (domain_setup_ctrlval(r, d)) {
+ ctrl_domain_free(hw_dom);
+ return;
+ }
+
+ list_add_tail_rcu(&d->hdr.list, add_pos);
+
+ err = resctrl_online_ctrl_domain(r, d);
+ if (err) {
+ list_del_rcu(&d->hdr.list);
+ synchronize_rcu();
+ ctrl_domain_free(hw_dom);
+ }
+}
+
+static void domain_add_cpu_mon(int cpu, struct rdt_resource *r)
+{
+ int id = get_domain_id_from_scope(cpu, r->mon_scope);
+ struct list_head *add_pos = NULL;
+ struct rdt_hw_mon_domain *hw_dom;
+ struct rdt_domain_hdr *hdr;
+ struct rdt_mon_domain *d;
+ struct cacheinfo *ci;
+ int err;
+
+ lockdep_assert_held(&domain_list_lock);
+
+ if (id < 0) {
+ pr_warn_once("Can't find monitor domain id for CPU:%d scope:%d for resource %s\n",
+ cpu, r->mon_scope, r->name);
+ return;
+ }
+
+ hdr = resctrl_find_domain(&r->mon_domains, id, &add_pos);
+ if (hdr) {
+ if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN))
+ return;
+ d = container_of(hdr, struct rdt_mon_domain, hdr);
+
+ cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
+ /* Update the mbm_assign_mode state for the CPU if supported */
+ if (r->mon.mbm_cntr_assignable)
+ resctrl_arch_mbm_cntr_assign_set_one(r);
+ return;
+ }
+
+ hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu));
+ if (!hw_dom)
+ return;
+
+ d = &hw_dom->d_resctrl;
+ d->hdr.id = id;
+ d->hdr.type = RESCTRL_MON_DOMAIN;
+ ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
+ if (!ci) {
+ pr_warn_once("Can't find L3 cache for CPU:%d resource %s\n", cpu, r->name);
+ mon_domain_free(hw_dom);
+ return;
+ }
+ d->ci_id = ci->id;
+ cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
+
+ /* Update the mbm_assign_mode state for the CPU if supported */
+ if (r->mon.mbm_cntr_assignable)
+ resctrl_arch_mbm_cntr_assign_set_one(r);
+
+ arch_mon_domain_online(r, d);
+
+ if (arch_domain_mbm_alloc(r->mon.num_rmid, hw_dom)) {
+ mon_domain_free(hw_dom);
+ return;
+ }
+
+ list_add_tail_rcu(&d->hdr.list, add_pos);
+
+ err = resctrl_online_mon_domain(r, d);
+ if (err) {
+ list_del_rcu(&d->hdr.list);
+ synchronize_rcu();
+ mon_domain_free(hw_dom);
+ }
+}
+
+static void domain_add_cpu(int cpu, struct rdt_resource *r)
+{
+ if (r->alloc_capable)
+ domain_add_cpu_ctrl(cpu, r);
+ if (r->mon_capable)
+ domain_add_cpu_mon(cpu, r);
+}
+
+static void domain_remove_cpu_ctrl(int cpu, struct rdt_resource *r)
+{
+ int id = get_domain_id_from_scope(cpu, r->ctrl_scope);
+ struct rdt_hw_ctrl_domain *hw_dom;
+ struct rdt_domain_hdr *hdr;
+ struct rdt_ctrl_domain *d;
+
+ lockdep_assert_held(&domain_list_lock);
+
+ if (id < 0) {
+ pr_warn_once("Can't find control domain id for CPU:%d scope:%d for resource %s\n",
+ cpu, r->ctrl_scope, r->name);
+ return;
+ }
+
+ hdr = resctrl_find_domain(&r->ctrl_domains, id, NULL);
+ if (!hdr) {
+ pr_warn("Can't find control domain for id=%d for CPU %d for resource %s\n",
+ id, cpu, r->name);
+ return;
+ }
+
+ if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN))
+ return;
+
+ d = container_of(hdr, struct rdt_ctrl_domain, hdr);
+ hw_dom = resctrl_to_arch_ctrl_dom(d);
+
+ cpumask_clear_cpu(cpu, &d->hdr.cpu_mask);
+ if (cpumask_empty(&d->hdr.cpu_mask)) {
+ resctrl_offline_ctrl_domain(r, d);
+ list_del_rcu(&d->hdr.list);
+ synchronize_rcu();
+
+ /*
+ * rdt_ctrl_domain "d" is going to be freed below, so clear
+ * its pointer from pseudo_lock_region struct.
+ */
+ if (d->plr)
+ d->plr->d = NULL;
+ ctrl_domain_free(hw_dom);
+
+ return;
+ }
+}
+
+static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r)
+{
+ int id = get_domain_id_from_scope(cpu, r->mon_scope);
+ struct rdt_hw_mon_domain *hw_dom;
+ struct rdt_domain_hdr *hdr;
+ struct rdt_mon_domain *d;
+
+ lockdep_assert_held(&domain_list_lock);
+
+ if (id < 0) {
+ pr_warn_once("Can't find monitor domain id for CPU:%d scope:%d for resource %s\n",
+ cpu, r->mon_scope, r->name);
+ return;
+ }
+
+ hdr = resctrl_find_domain(&r->mon_domains, id, NULL);
+ if (!hdr) {
+ pr_warn("Can't find monitor domain for id=%d for CPU %d for resource %s\n",
+ id, cpu, r->name);
+ return;
+ }
+
+ if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN))
+ return;
+
+ d = container_of(hdr, struct rdt_mon_domain, hdr);
+ hw_dom = resctrl_to_arch_mon_dom(d);
+
+ cpumask_clear_cpu(cpu, &d->hdr.cpu_mask);
+ if (cpumask_empty(&d->hdr.cpu_mask)) {
+ resctrl_offline_mon_domain(r, d);
+ list_del_rcu(&d->hdr.list);
+ synchronize_rcu();
+ mon_domain_free(hw_dom);
+
+ return;
+ }
+}
+
+static void domain_remove_cpu(int cpu, struct rdt_resource *r)
+{
+ if (r->alloc_capable)
+ domain_remove_cpu_ctrl(cpu, r);
+ if (r->mon_capable)
+ domain_remove_cpu_mon(cpu, r);
+}
+
+static void clear_closid_rmid(int cpu)
+{
+ struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state);
+
+ state->default_closid = RESCTRL_RESERVED_CLOSID;
+ state->default_rmid = RESCTRL_RESERVED_RMID;
+ state->cur_closid = RESCTRL_RESERVED_CLOSID;
+ state->cur_rmid = RESCTRL_RESERVED_RMID;
+ wrmsr(MSR_IA32_PQR_ASSOC, RESCTRL_RESERVED_RMID,
+ RESCTRL_RESERVED_CLOSID);
+}
+
+static int resctrl_arch_online_cpu(unsigned int cpu)
+{
+ struct rdt_resource *r;
+
+ mutex_lock(&domain_list_lock);
+ for_each_capable_rdt_resource(r)
+ domain_add_cpu(cpu, r);
+ mutex_unlock(&domain_list_lock);
+
+ clear_closid_rmid(cpu);
+ resctrl_online_cpu(cpu);
+
+ return 0;
+}
+
+static int resctrl_arch_offline_cpu(unsigned int cpu)
+{
+ struct rdt_resource *r;
+
+ resctrl_offline_cpu(cpu);
+
+ mutex_lock(&domain_list_lock);
+ for_each_capable_rdt_resource(r)
+ domain_remove_cpu(cpu, r);
+ mutex_unlock(&domain_list_lock);
+
+ clear_closid_rmid(cpu);
+
+ return 0;
+}
+
+enum {
+ RDT_FLAG_CMT,
+ RDT_FLAG_MBM_TOTAL,
+ RDT_FLAG_MBM_LOCAL,
+ RDT_FLAG_L3_CAT,
+ RDT_FLAG_L3_CDP,
+ RDT_FLAG_L2_CAT,
+ RDT_FLAG_L2_CDP,
+ RDT_FLAG_MBA,
+ RDT_FLAG_SMBA,
+ RDT_FLAG_BMEC,
+ RDT_FLAG_ABMC,
+ RDT_FLAG_SDCIAE,
+};
+
+#define RDT_OPT(idx, n, f) \
+[idx] = { \
+ .name = n, \
+ .flag = f \
+}
+
+struct rdt_options {
+ char *name;
+ int flag;
+ bool force_off, force_on;
+};
+
+static struct rdt_options rdt_options[] __ro_after_init = {
+ RDT_OPT(RDT_FLAG_CMT, "cmt", X86_FEATURE_CQM_OCCUP_LLC),
+ RDT_OPT(RDT_FLAG_MBM_TOTAL, "mbmtotal", X86_FEATURE_CQM_MBM_TOTAL),
+ RDT_OPT(RDT_FLAG_MBM_LOCAL, "mbmlocal", X86_FEATURE_CQM_MBM_LOCAL),
+ RDT_OPT(RDT_FLAG_L3_CAT, "l3cat", X86_FEATURE_CAT_L3),
+ RDT_OPT(RDT_FLAG_L3_CDP, "l3cdp", X86_FEATURE_CDP_L3),
+ RDT_OPT(RDT_FLAG_L2_CAT, "l2cat", X86_FEATURE_CAT_L2),
+ RDT_OPT(RDT_FLAG_L2_CDP, "l2cdp", X86_FEATURE_CDP_L2),
+ RDT_OPT(RDT_FLAG_MBA, "mba", X86_FEATURE_MBA),
+ RDT_OPT(RDT_FLAG_SMBA, "smba", X86_FEATURE_SMBA),
+ RDT_OPT(RDT_FLAG_BMEC, "bmec", X86_FEATURE_BMEC),
+ RDT_OPT(RDT_FLAG_ABMC, "abmc", X86_FEATURE_ABMC),
+ RDT_OPT(RDT_FLAG_SDCIAE, "sdciae", X86_FEATURE_SDCIAE),
+};
+#define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options)
+
+static int __init set_rdt_options(char *str)
+{
+ struct rdt_options *o;
+ bool force_off;
+ char *tok;
+
+ if (*str == '=')
+ str++;
+ while ((tok = strsep(&str, ",")) != NULL) {
+ force_off = *tok == '!';
+ if (force_off)
+ tok++;
+ for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) {
+ if (strcmp(tok, o->name) == 0) {
+ if (force_off)
+ o->force_off = true;
+ else
+ o->force_on = true;
+ break;
+ }
+ }
+ }
+ return 1;
+}
+__setup("rdt", set_rdt_options);
+
+bool rdt_cpu_has(int flag)
+{
+ bool ret = boot_cpu_has(flag);
+ struct rdt_options *o;
+
+ if (!ret)
+ return ret;
+
+ for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) {
+ if (flag == o->flag) {
+ if (o->force_off)
+ ret = false;
+ if (o->force_on)
+ ret = true;
+ break;
+ }
+ }
+ return ret;
+}
+
+bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt)
+{
+ if (!rdt_cpu_has(X86_FEATURE_BMEC))
+ return false;
+
+ switch (evt) {
+ case QOS_L3_MBM_TOTAL_EVENT_ID:
+ return rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL);
+ case QOS_L3_MBM_LOCAL_EVENT_ID:
+ return rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL);
+ default:
+ return false;
+ }
+}
+
+static __init bool get_mem_config(void)
+{
+ struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_MBA];
+
+ if (!rdt_cpu_has(X86_FEATURE_MBA))
+ return false;
+
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ return __get_mem_config_intel(&hw_res->r_resctrl);
+ else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ return __rdt_get_mem_config_amd(&hw_res->r_resctrl);
+
+ return false;
+}
+
+static __init bool get_slow_mem_config(void)
+{
+ struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_SMBA];
+
+ if (!rdt_cpu_has(X86_FEATURE_SMBA))
+ return false;
+
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ return __rdt_get_mem_config_amd(&hw_res->r_resctrl);
+
+ return false;
+}
+
+static __init bool get_rdt_alloc_resources(void)
+{
+ struct rdt_resource *r;
+ bool ret = false;
+
+ if (rdt_alloc_capable)
+ return true;
+
+ if (!boot_cpu_has(X86_FEATURE_RDT_A))
+ return false;
+
+ if (rdt_cpu_has(X86_FEATURE_CAT_L3)) {
+ r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+ rdt_get_cache_alloc_cfg(1, r);
+ if (rdt_cpu_has(X86_FEATURE_CDP_L3))
+ rdt_get_cdp_l3_config();
+ if (rdt_cpu_has(X86_FEATURE_SDCIAE))
+ rdt_set_io_alloc_capable(r);
+ ret = true;
+ }
+ if (rdt_cpu_has(X86_FEATURE_CAT_L2)) {
+ /* CPUID 0x10.2 fields are the same format as 0x10.1 */
+ r = &rdt_resources_all[RDT_RESOURCE_L2].r_resctrl;
+ rdt_get_cache_alloc_cfg(2, r);
+ if (rdt_cpu_has(X86_FEATURE_CDP_L2))
+ rdt_get_cdp_l2_config();
+ ret = true;
+ }
+
+ if (get_mem_config())
+ ret = true;
+
+ if (get_slow_mem_config())
+ ret = true;
+
+ return ret;
+}
+
+static __init bool get_rdt_mon_resources(void)
+{
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+ bool ret = false;
+
+ if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) {
+ resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID);
+ ret = true;
+ }
+ if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) {
+ resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID);
+ ret = true;
+ }
+ if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) {
+ resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID);
+ ret = true;
+ }
+ if (rdt_cpu_has(X86_FEATURE_ABMC))
+ ret = true;
+
+ if (!ret)
+ return false;
+
+ return !rdt_get_mon_l3_config(r);
+}
+
+static __init void __check_quirks_intel(void)
+{
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_HASWELL_X:
+ if (!rdt_options[RDT_FLAG_L3_CAT].force_off)
+ cache_alloc_hsw_probe();
+ break;
+ case INTEL_SKYLAKE_X:
+ if (boot_cpu_data.x86_stepping <= 4)
+ set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat");
+ else
+ set_rdt_options("!l3cat");
+ fallthrough;
+ case INTEL_BROADWELL_X:
+ intel_rdt_mbm_apply_quirk();
+ break;
+ }
+}
+
+static __init void check_quirks(void)
+{
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ __check_quirks_intel();
+}
+
+static __init bool get_rdt_resources(void)
+{
+ rdt_alloc_capable = get_rdt_alloc_resources();
+ rdt_mon_capable = get_rdt_mon_resources();
+
+ return (rdt_mon_capable || rdt_alloc_capable);
+}
+
+static __init void rdt_init_res_defs_intel(void)
+{
+ struct rdt_hw_resource *hw_res;
+ struct rdt_resource *r;
+
+ for_each_rdt_resource(r) {
+ hw_res = resctrl_to_arch_res(r);
+
+ if (r->rid == RDT_RESOURCE_L3 ||
+ r->rid == RDT_RESOURCE_L2) {
+ r->cache.arch_has_per_cpu_cfg = false;
+ r->cache.min_cbm_bits = 1;
+ } else if (r->rid == RDT_RESOURCE_MBA) {
+ hw_res->msr_base = MSR_IA32_MBA_THRTL_BASE;
+ hw_res->msr_update = mba_wrmsr_intel;
+ }
+ }
+}
+
+static __init void rdt_init_res_defs_amd(void)
+{
+ struct rdt_hw_resource *hw_res;
+ struct rdt_resource *r;
+
+ for_each_rdt_resource(r) {
+ hw_res = resctrl_to_arch_res(r);
+
+ if (r->rid == RDT_RESOURCE_L3 ||
+ r->rid == RDT_RESOURCE_L2) {
+ r->cache.arch_has_sparse_bitmasks = true;
+ r->cache.arch_has_per_cpu_cfg = true;
+ r->cache.min_cbm_bits = 0;
+ } else if (r->rid == RDT_RESOURCE_MBA) {
+ hw_res->msr_base = MSR_IA32_MBA_BW_BASE;
+ hw_res->msr_update = mba_wrmsr_amd;
+ } else if (r->rid == RDT_RESOURCE_SMBA) {
+ hw_res->msr_base = MSR_IA32_SMBA_BW_BASE;
+ hw_res->msr_update = mba_wrmsr_amd;
+ }
+ }
+}
+
+static __init void rdt_init_res_defs(void)
+{
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ rdt_init_res_defs_intel();
+ else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ rdt_init_res_defs_amd();
+}
+
+static enum cpuhp_state rdt_online;
+
+/* Runs once on the BSP during boot. */
+void resctrl_cpu_detect(struct cpuinfo_x86 *c)
+{
+ if (!cpu_has(c, X86_FEATURE_CQM_LLC) && !cpu_has(c, X86_FEATURE_ABMC)) {
+ c->x86_cache_max_rmid = -1;
+ c->x86_cache_occ_scale = -1;
+ c->x86_cache_mbm_width_offset = -1;
+ return;
+ }
+
+ /* will be overridden if occupancy monitoring exists */
+ c->x86_cache_max_rmid = cpuid_ebx(0xf);
+
+ if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) ||
+ cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) ||
+ cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL) ||
+ cpu_has(c, X86_FEATURE_ABMC)) {
+ u32 eax, ebx, ecx, edx;
+
+ /* QoS sub-leaf, EAX=0Fh, ECX=1 */
+ cpuid_count(0xf, 1, &eax, &ebx, &ecx, &edx);
+
+ c->x86_cache_max_rmid = ecx;
+ c->x86_cache_occ_scale = ebx;
+ c->x86_cache_mbm_width_offset = eax & 0xff;
+
+ if (c->x86_vendor == X86_VENDOR_AMD && !c->x86_cache_mbm_width_offset)
+ c->x86_cache_mbm_width_offset = MBM_CNTR_WIDTH_OFFSET_AMD;
+ }
+}
+
+static int __init resctrl_arch_late_init(void)
+{
+ struct rdt_resource *r;
+ int state, ret, i;
+
+ /* for_each_rdt_resource() requires all rid to be initialised. */
+ for (i = 0; i < RDT_NUM_RESOURCES; i++)
+ rdt_resources_all[i].r_resctrl.rid = i;
+
+ /*
+ * Initialize functions (or definitions) that are different
+ * between vendors here.
+ */
+ rdt_init_res_defs();
+
+ check_quirks();
+
+ if (!get_rdt_resources())
+ return -ENODEV;
+
+ state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
+ "x86/resctrl/cat:online:",
+ resctrl_arch_online_cpu,
+ resctrl_arch_offline_cpu);
+ if (state < 0)
+ return state;
+
+ ret = resctrl_init();
+ if (ret) {
+ cpuhp_remove_state(state);
+ return ret;
+ }
+ rdt_online = state;
+
+ for_each_alloc_capable_rdt_resource(r)
+ pr_info("%s allocation detected\n", r->name);
+
+ for_each_mon_capable_rdt_resource(r)
+ pr_info("%s monitoring detected\n", r->name);
+
+ return 0;
+}
+
+late_initcall(resctrl_arch_late_init);
+
+static void __exit resctrl_arch_exit(void)
+{
+ cpuhp_remove_state(rdt_online);
+
+ resctrl_exit();
+}
+
+__exitcall(resctrl_arch_exit);
diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
new file mode 100644
index 000000000000..b20e705606b8
--- /dev/null
+++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Resource Director Technology(RDT)
+ * - Cache Allocation code.
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Authors:
+ * Fenghua Yu <fenghua.yu@intel.com>
+ * Tony Luck <tony.luck@intel.com>
+ *
+ * More information about RDT can be found in the Intel (R) x86 Architecture
+ * Software Developer Manual June 2016, volume 3, section 17.17.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cpu.h>
+
+#include "internal.h"
+
+int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d,
+ u32 closid, enum resctrl_conf_type t, u32 cfg_val)
+{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ u32 idx = resctrl_get_config_index(closid, t);
+ struct msr_param msr_param;
+
+ if (!cpumask_test_cpu(smp_processor_id(), &d->hdr.cpu_mask))
+ return -EINVAL;
+
+ hw_dom->ctrl_val[idx] = cfg_val;
+
+ msr_param.res = r;
+ msr_param.dom = d;
+ msr_param.low = idx;
+ msr_param.high = idx + 1;
+ hw_res->msr_update(&msr_param);
+
+ return 0;
+}
+
+int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid)
+{
+ struct resctrl_staged_config *cfg;
+ struct rdt_hw_ctrl_domain *hw_dom;
+ struct msr_param msr_param;
+ struct rdt_ctrl_domain *d;
+ enum resctrl_conf_type t;
+ u32 idx;
+
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
+ hw_dom = resctrl_to_arch_ctrl_dom(d);
+ msr_param.res = NULL;
+ for (t = 0; t < CDP_NUM_TYPES; t++) {
+ cfg = &hw_dom->d_resctrl.staged_config[t];
+ if (!cfg->have_new_ctrl)
+ continue;
+
+ idx = resctrl_get_config_index(closid, t);
+ if (cfg->new_ctrl == hw_dom->ctrl_val[idx])
+ continue;
+ hw_dom->ctrl_val[idx] = cfg->new_ctrl;
+
+ if (!msr_param.res) {
+ msr_param.low = idx;
+ msr_param.high = msr_param.low + 1;
+ msr_param.res = r;
+ msr_param.dom = d;
+ } else {
+ msr_param.low = min(msr_param.low, idx);
+ msr_param.high = max(msr_param.high, idx + 1);
+ }
+ }
+ if (msr_param.res)
+ smp_call_function_any(&d->hdr.cpu_mask, rdt_ctrl_update, &msr_param, 1);
+ }
+
+ return 0;
+}
+
+u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d,
+ u32 closid, enum resctrl_conf_type type)
+{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d);
+ u32 idx = resctrl_get_config_index(closid, type);
+
+ return hw_dom->ctrl_val[idx];
+}
+
+bool resctrl_arch_get_io_alloc_enabled(struct rdt_resource *r)
+{
+ return resctrl_to_arch_res(r)->sdciae_enabled;
+}
+
+static void resctrl_sdciae_set_one_amd(void *arg)
+{
+ bool *enable = arg;
+
+ if (*enable)
+ msr_set_bit(MSR_IA32_L3_QOS_EXT_CFG, SDCIAE_ENABLE_BIT);
+ else
+ msr_clear_bit(MSR_IA32_L3_QOS_EXT_CFG, SDCIAE_ENABLE_BIT);
+}
+
+static void _resctrl_sdciae_enable(struct rdt_resource *r, bool enable)
+{
+ struct rdt_ctrl_domain *d;
+
+ /* Walking r->ctrl_domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
+ /* Update MSR_IA32_L3_QOS_EXT_CFG MSR on all the CPUs in all domains */
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list)
+ on_each_cpu_mask(&d->hdr.cpu_mask, resctrl_sdciae_set_one_amd, &enable, 1);
+}
+
+int resctrl_arch_io_alloc_enable(struct rdt_resource *r, bool enable)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+
+ if (hw_res->r_resctrl.cache.io_alloc_capable &&
+ hw_res->sdciae_enabled != enable) {
+ _resctrl_sdciae_enable(r, enable);
+ hw_res->sdciae_enabled = enable;
+ }
+
+ return 0;
+}
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
new file mode 100644
index 000000000000..4a916c84a322
--- /dev/null
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -0,0 +1,225 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_RESCTRL_INTERNAL_H
+#define _ASM_X86_RESCTRL_INTERNAL_H
+
+#include <linux/resctrl.h>
+
+#define L3_QOS_CDP_ENABLE 0x01ULL
+
+#define L2_QOS_CDP_ENABLE 0x01ULL
+
+#define MBM_CNTR_WIDTH_BASE 24
+
+#define MBA_IS_LINEAR 0x4
+
+#define MBM_CNTR_WIDTH_OFFSET_AMD 20
+
+#define RMID_VAL_ERROR BIT_ULL(63)
+
+#define RMID_VAL_UNAVAIL BIT_ULL(62)
+
+/*
+ * With the above fields in use 62 bits remain in MSR_IA32_QM_CTR for
+ * data to be returned. The counter width is discovered from the hardware
+ * as an offset from MBM_CNTR_WIDTH_BASE.
+ */
+#define MBM_CNTR_WIDTH_OFFSET_MAX (62 - MBM_CNTR_WIDTH_BASE)
+
+/**
+ * struct arch_mbm_state - values used to compute resctrl_arch_rmid_read()s
+ * return value.
+ * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes)
+ * @prev_msr: Value of IA32_QM_CTR last time it was read for the RMID used to
+ * find this struct.
+ */
+struct arch_mbm_state {
+ u64 chunks;
+ u64 prev_msr;
+};
+
+/* Setting bit 0 in L3_QOS_EXT_CFG enables the ABMC feature. */
+#define ABMC_ENABLE_BIT 0
+
+/*
+ * QoS Event Identifiers.
+ */
+#define ABMC_EXTENDED_EVT_ID BIT(31)
+#define ABMC_EVT_ID BIT(0)
+
+/* Setting bit 1 in MSR_IA32_L3_QOS_EXT_CFG enables the SDCIAE feature. */
+#define SDCIAE_ENABLE_BIT 1
+
+/**
+ * struct rdt_hw_ctrl_domain - Arch private attributes of a set of CPUs that share
+ * a resource for a control function
+ * @d_resctrl: Properties exposed to the resctrl file system
+ * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID)
+ *
+ * Members of this structure are accessed via helpers that provide abstraction.
+ */
+struct rdt_hw_ctrl_domain {
+ struct rdt_ctrl_domain d_resctrl;
+ u32 *ctrl_val;
+};
+
+/**
+ * struct rdt_hw_mon_domain - Arch private attributes of a set of CPUs that share
+ * a resource for a monitor function
+ * @d_resctrl: Properties exposed to the resctrl file system
+ * @arch_mbm_states: Per-event pointer to the MBM event's saved state.
+ * An MBM event's state is an array of struct arch_mbm_state
+ * indexed by RMID on x86.
+ *
+ * Members of this structure are accessed via helpers that provide abstraction.
+ */
+struct rdt_hw_mon_domain {
+ struct rdt_mon_domain d_resctrl;
+ struct arch_mbm_state *arch_mbm_states[QOS_NUM_L3_MBM_EVENTS];
+};
+
+static inline struct rdt_hw_ctrl_domain *resctrl_to_arch_ctrl_dom(struct rdt_ctrl_domain *r)
+{
+ return container_of(r, struct rdt_hw_ctrl_domain, d_resctrl);
+}
+
+static inline struct rdt_hw_mon_domain *resctrl_to_arch_mon_dom(struct rdt_mon_domain *r)
+{
+ return container_of(r, struct rdt_hw_mon_domain, d_resctrl);
+}
+
+/**
+ * struct msr_param - set a range of MSRs from a domain
+ * @res: The resource to use
+ * @dom: The domain to update
+ * @low: Beginning index from base MSR
+ * @high: End index
+ */
+struct msr_param {
+ struct rdt_resource *res;
+ struct rdt_ctrl_domain *dom;
+ u32 low;
+ u32 high;
+};
+
+/**
+ * struct rdt_hw_resource - arch private attributes of a resctrl resource
+ * @r_resctrl: Attributes of the resource used directly by resctrl.
+ * @num_closid: Maximum number of closid this hardware can support,
+ * regardless of CDP. This is exposed via
+ * resctrl_arch_get_num_closid() to avoid confusion
+ * with struct resctrl_schema's property of the same name,
+ * which has been corrected for features like CDP.
+ * @msr_base: Base MSR address for CBMs
+ * @msr_update: Function pointer to update QOS MSRs
+ * @mon_scale: cqm counter * mon_scale = occupancy in bytes
+ * @mbm_width: Monitor width, to detect and correct for overflow.
+ * @cdp_enabled: CDP state of this resource
+ * @mbm_cntr_assign_enabled: ABMC feature is enabled
+ * @sdciae_enabled: SDCIAE feature (backing "io_alloc") is enabled.
+ *
+ * Members of this structure are either private to the architecture
+ * e.g. mbm_width, or accessed via helpers that provide abstraction. e.g.
+ * msr_update and msr_base.
+ */
+struct rdt_hw_resource {
+ struct rdt_resource r_resctrl;
+ u32 num_closid;
+ unsigned int msr_base;
+ void (*msr_update)(struct msr_param *m);
+ unsigned int mon_scale;
+ unsigned int mbm_width;
+ bool cdp_enabled;
+ bool mbm_cntr_assign_enabled;
+ bool sdciae_enabled;
+};
+
+static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r)
+{
+ return container_of(r, struct rdt_hw_resource, r_resctrl);
+}
+
+extern struct rdt_hw_resource rdt_resources_all[];
+
+void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d);
+
+/* CPUID.(EAX=10H, ECX=ResID=1).EAX */
+union cpuid_0x10_1_eax {
+ struct {
+ unsigned int cbm_len:5;
+ } split;
+ unsigned int full;
+};
+
+/* CPUID.(EAX=10H, ECX=ResID=3).EAX */
+union cpuid_0x10_3_eax {
+ struct {
+ unsigned int max_delay:12;
+ } split;
+ unsigned int full;
+};
+
+/* CPUID.(EAX=10H, ECX=ResID).ECX */
+union cpuid_0x10_x_ecx {
+ struct {
+ unsigned int reserved:3;
+ unsigned int noncont:1;
+ } split;
+ unsigned int full;
+};
+
+/* CPUID.(EAX=10H, ECX=ResID).EDX */
+union cpuid_0x10_x_edx {
+ struct {
+ unsigned int cos_max:16;
+ } split;
+ unsigned int full;
+};
+
+/*
+ * ABMC counters are configured by writing to MSR_IA32_L3_QOS_ABMC_CFG.
+ *
+ * @bw_type : Event configuration that represents the memory
+ * transactions being tracked by the @cntr_id.
+ * @bw_src : Bandwidth source (RMID or CLOSID).
+ * @reserved1 : Reserved.
+ * @is_clos : @bw_src field is a CLOSID (not an RMID).
+ * @cntr_id : Counter identifier.
+ * @reserved : Reserved.
+ * @cntr_en : Counting enable bit.
+ * @cfg_en : Configuration enable bit.
+ *
+ * Configuration and counting:
+ * Counter can be configured across multiple writes to MSR. Configuration
+ * is applied only when @cfg_en = 1. Counter @cntr_id is reset when the
+ * configuration is applied.
+ * @cfg_en = 1, @cntr_en = 0 : Apply @cntr_id configuration but do not
+ * count events.
+ * @cfg_en = 1, @cntr_en = 1 : Apply @cntr_id configuration and start
+ * counting events.
+ */
+union l3_qos_abmc_cfg {
+ struct {
+ unsigned long bw_type :32,
+ bw_src :12,
+ reserved1: 3,
+ is_clos : 1,
+ cntr_id : 5,
+ reserved : 9,
+ cntr_en : 1,
+ cfg_en : 1;
+ } split;
+ unsigned long full;
+};
+
+void rdt_ctrl_update(void *arg);
+
+int rdt_get_mon_l3_config(struct rdt_resource *r);
+
+bool rdt_cpu_has(int flag);
+
+void __init intel_rdt_mbm_apply_quirk(void);
+
+void rdt_domain_reconfigure_cdp(struct rdt_resource *r);
+void resctrl_arch_mbm_cntr_assign_set_one(struct rdt_resource *r);
+
+#endif /* _ASM_X86_RESCTRL_INTERNAL_H */
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
new file mode 100644
index 000000000000..dffcc8307500
--- /dev/null
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -0,0 +1,583 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Resource Director Technology(RDT)
+ * - Monitoring code
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * Author:
+ * Vikas Shivappa <vikas.shivappa@intel.com>
+ *
+ * This replaces the perf-based cqm.c, but we reuse a lot of
+ * code and data structures originally from Peter Zijlstra and Matt Fleming.
+ *
+ * More information about RDT can be found in the Intel (R) x86 Architecture
+ * Software Developer Manual June 2016, volume 3, section 17.17.
+ */
+
+#define pr_fmt(fmt) "resctrl: " fmt
+
+#include <linux/cpu.h>
+#include <linux/resctrl.h>
+
+#include <asm/cpu_device_id.h>
+#include <asm/msr.h>
+
+#include "internal.h"
+
+/*
+ * Global boolean for rdt_monitor which is true if any
+ * resource monitoring is enabled.
+ */
+bool rdt_mon_capable;
+
+#define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5))
+
+static int snc_nodes_per_l3_cache = 1;
+
+/*
+ * The correction factor table is documented in Documentation/filesystems/resctrl.rst.
+ * If rmid > rmid threshold, MBM total and local values should be multiplied
+ * by the correction factor.
+ *
+ * The original table is modified for better code:
+ *
+ * 1. The threshold 0 is changed to rmid count - 1 so no correction is done
+ * for that case.
+ * 2. MBM total and local correction table indexed by core counter which is
+ * equal to (x86_cache_max_rmid + 1) / 8 - 1 and is from 0 up to 27.
+ * 3. The correction factor is normalized to 2^20 (1048576) so it's faster
+ * to calculate corrected value by shifting:
+ * corrected_value = (original_value * correction_factor) >> 20
+ */
+static const struct mbm_correction_factor_table {
+ u32 rmidthreshold;
+ u64 cf;
+} mbm_cf_table[] __initconst = {
+ {7, CF(1.000000)},
+ {15, CF(1.000000)},
+ {15, CF(0.969650)},
+ {31, CF(1.000000)},
+ {31, CF(1.066667)},
+ {31, CF(0.969650)},
+ {47, CF(1.142857)},
+ {63, CF(1.000000)},
+ {63, CF(1.185115)},
+ {63, CF(1.066553)},
+ {79, CF(1.454545)},
+ {95, CF(1.000000)},
+ {95, CF(1.230769)},
+ {95, CF(1.142857)},
+ {95, CF(1.066667)},
+ {127, CF(1.000000)},
+ {127, CF(1.254863)},
+ {127, CF(1.185255)},
+ {151, CF(1.000000)},
+ {127, CF(1.066667)},
+ {167, CF(1.000000)},
+ {159, CF(1.454334)},
+ {183, CF(1.000000)},
+ {127, CF(0.969744)},
+ {191, CF(1.280246)},
+ {191, CF(1.230921)},
+ {215, CF(1.000000)},
+ {191, CF(1.143118)},
+};
+
+static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX;
+
+static u64 mbm_cf __read_mostly;
+
+static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val)
+{
+ /* Correct MBM value. */
+ if (rmid > mbm_cf_rmidthreshold)
+ val = (val * mbm_cf) >> 20;
+
+ return val;
+}
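
A quick worked example of this fixed-point correction (values hypothetical): CF(1.066667) evaluates to 1118481, so for an RMID above the threshold a raw count of 1000000 chunks becomes (1000000 * 1118481) >> 20 = 1066666, i.e. scaled by roughly 1.066667 using integer arithmetic only:

#include <assert.h>
#include <stdio.h>

int main(void)
{
	/* Same fixed-point encoding as the CF() macro above. */
	unsigned long long cf = (unsigned long long)(1048576 * 1.066667 + 0.5);
	unsigned long long val = 1000000;

	assert(cf == 1118481);
	printf("corrected: %llu\n", (val * cf) >> 20);	/* prints 1066666 */
	return 0;
}
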
+
+/*
+ * When Sub-NUMA Cluster (SNC) mode is not enabled (as indicated by
+ * "snc_nodes_per_l3_cache == 1") no translation of the RMID value is
+ * needed. The physical RMID is the same as the logical RMID.
+ *
+ * On a platform with SNC mode enabled, Linux enables RMID sharing mode
+ * via MSR 0xCA0 (see the "RMID Sharing Mode" section in the "Intel
+ * Resource Director Technology Architecture Specification" for a full
+ * description of RMID sharing mode).
+ *
+ * In RMID sharing mode there are fewer "logical RMID" values available
+ * to accumulate data ("physical RMIDs" are divided evenly between SNC
+ * nodes that share an L3 cache). Linux creates an rdt_mon_domain for
+ * each SNC node.
+ *
+ * The value loaded into IA32_PQR_ASSOC is the "logical RMID".
+ *
+ * Data is collected independently on each SNC node and can be retrieved
+ * using the "physical RMID" value computed by this function and loaded
+ * into IA32_QM_EVTSEL. @cpu can be any CPU in the SNC node.
+ *
+ * The scope of the IA32_QM_EVTSEL and IA32_QM_CTR MSRs is at the L3
+ * cache. So a "physical RMID" may be read from any CPU that shares
+ * the L3 cache with the desired SNC node, not just from a CPU in
+ * the specific SNC node.
+ */
+static int logical_rmid_to_physical_rmid(int cpu, int lrmid)
+{
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+
+ if (snc_nodes_per_l3_cache == 1)
+ return lrmid;
+
+ return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->mon.num_rmid;
+}
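
As a worked example: with SNC-2, num_rmid has already been halved per node by rdt_get_mon_l3_config(); if num_rmid is 128 and the reading CPU sits in an odd-numbered NUMA node, logical RMID 5 maps to physical RMID 5 + 1 * 128 = 133, which is the value programmed into IA32_QM_EVTSEL.
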
+
+static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val)
+{
+ u64 msr_val;
+
+ /*
+ * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
+ * with a valid event code for supported resource type and the bits
+ * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID,
+ * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
+ * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
+ * are error bits.
+ */
+ wrmsr(MSR_IA32_QM_EVTSEL, eventid, prmid);
+ rdmsrq(MSR_IA32_QM_CTR, msr_val);
+
+ if (msr_val & RMID_VAL_ERROR)
+ return -EIO;
+ if (msr_val & RMID_VAL_UNAVAIL)
+ return -EINVAL;
+
+ *val = msr_val;
+ return 0;
+}
+
+static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_dom,
+ u32 rmid,
+ enum resctrl_event_id eventid)
+{
+ struct arch_mbm_state *state;
+
+ if (!resctrl_is_mbm_event(eventid))
+ return NULL;
+
+ state = hw_dom->arch_mbm_states[MBM_STATE_IDX(eventid)];
+
+ return state ? &state[rmid] : NULL;
+}
+
+void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d,
+ u32 unused, u32 rmid,
+ enum resctrl_event_id eventid)
+{
+ struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
+ int cpu = cpumask_any(&d->hdr.cpu_mask);
+ struct arch_mbm_state *am;
+ u32 prmid;
+
+ am = get_arch_mbm_state(hw_dom, rmid, eventid);
+ if (am) {
+ memset(am, 0, sizeof(*am));
+
+ prmid = logical_rmid_to_physical_rmid(cpu, rmid);
+ /* Record any initial, non-zero count value. */
+ __rmid_read_phys(prmid, eventid, &am->prev_msr);
+ }
+}
+
+/*
+ * Assumes that hardware counters are also reset and thus that there is
+ * no need to record initial non-zero counts.
+ */
+void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d)
+{
+ struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
+ enum resctrl_event_id eventid;
+ int idx;
+
+ for_each_mbm_event_id(eventid) {
+ if (!resctrl_is_mon_event_enabled(eventid))
+ continue;
+ idx = MBM_STATE_IDX(eventid);
+ memset(hw_dom->arch_mbm_states[idx], 0,
+ sizeof(*hw_dom->arch_mbm_states[0]) * r->mon.num_rmid);
+ }
+}
+
+static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
+{
+ u64 shift = 64 - width, chunks;
+
+ chunks = (cur_msr << shift) - (prev_msr << shift);
+ return chunks >> shift;
+}
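
The shifts make the unsigned subtraction wrap at the hardware counter width rather than at 64 bits. As a worked example with a 24-bit counter, a previous reading of 0xfffff0 followed by 0x000010 amounts to 0x20 chunks; a standalone check of the same arithmetic (a sketch, not from the patch):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Same arithmetic as mbm_overflow_count() above. */
static uint64_t overflow_count(uint64_t prev, uint64_t cur, unsigned int width)
{
	uint64_t shift = 64 - width;

	return ((cur << shift) - (prev << shift)) >> shift;
}

int main(void)
{
	assert(overflow_count(0xfffff0, 0x000010, 24) == 0x20);
	printf("ok\n");
	return 0;
}
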
+
+static u64 get_corrected_val(struct rdt_resource *r, struct rdt_mon_domain *d,
+ u32 rmid, enum resctrl_event_id eventid, u64 msr_val)
+{
+ struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ struct arch_mbm_state *am;
+ u64 chunks;
+
+ am = get_arch_mbm_state(hw_dom, rmid, eventid);
+ if (am) {
+ am->chunks += mbm_overflow_count(am->prev_msr, msr_val,
+ hw_res->mbm_width);
+ chunks = get_corrected_mbm_count(rmid, am->chunks);
+ am->prev_msr = msr_val;
+ } else {
+ chunks = msr_val;
+ }
+
+ return chunks * hw_res->mon_scale;
+}
+
+int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d,
+ u32 unused, u32 rmid, enum resctrl_event_id eventid,
+ u64 *val, void *ignored)
+{
+ struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
+ int cpu = cpumask_any(&d->hdr.cpu_mask);
+ struct arch_mbm_state *am;
+ u64 msr_val;
+ u32 prmid;
+ int ret;
+
+ resctrl_arch_rmid_read_context_check();
+
+ prmid = logical_rmid_to_physical_rmid(cpu, rmid);
+ ret = __rmid_read_phys(prmid, eventid, &msr_val);
+
+ if (!ret) {
+ *val = get_corrected_val(r, d, rmid, eventid, msr_val);
+ } else if (ret == -EINVAL) {
+ am = get_arch_mbm_state(hw_dom, rmid, eventid);
+ if (am)
+ am->prev_msr = 0;
+ }
+
+ return ret;
+}
+
+static int __cntr_id_read(u32 cntr_id, u64 *val)
+{
+ u64 msr_val;
+
+ /*
+ * QM_EVTSEL Register definition:
+ * =======================================================
+ * Bits Mnemonic Description
+ * =======================================================
+ * 63:44 -- Reserved
+ * 43:32 RMID RMID or counter ID in ABMC mode
+ * when reading an MBM event
+ * 31 ExtendedEvtID Extended Event Identifier
+ * 30:8 -- Reserved
+ * 7:0 EvtID Event Identifier
+ * =======================================================
+ * The contents of a specific counter can be read by setting the
+ * following fields in QM_EVTSEL.ExtendedEvtID(=1) and
+ * QM_EVTSEL.EvtID = L3CacheABMC (=1) and setting QM_EVTSEL.RMID
+ * to the desired counter ID. Reading the QM_CTR then returns the
+ * contents of the specified counter. The RMID_VAL_ERROR bit is set
+ * if the counter configuration is invalid, or if an invalid counter
+ * ID is set in the QM_EVTSEL.RMID field. The RMID_VAL_UNAVAIL bit
+ * is set if the counter data is unavailable.
+ */
+ wrmsr(MSR_IA32_QM_EVTSEL, ABMC_EXTENDED_EVT_ID | ABMC_EVT_ID, cntr_id);
+ rdmsrl(MSR_IA32_QM_CTR, msr_val);
+
+ if (msr_val & RMID_VAL_ERROR)
+ return -EIO;
+ if (msr_val & RMID_VAL_UNAVAIL)
+ return -EINVAL;
+
+ *val = msr_val;
+ return 0;
+}
+
+void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d,
+ u32 unused, u32 rmid, int cntr_id,
+ enum resctrl_event_id eventid)
+{
+ struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
+ struct arch_mbm_state *am;
+
+ am = get_arch_mbm_state(hw_dom, rmid, eventid);
+ if (am) {
+ memset(am, 0, sizeof(*am));
+
+ /* Record any initial, non-zero count value. */
+ __cntr_id_read(cntr_id, &am->prev_msr);
+ }
+}
+
+int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d,
+ u32 unused, u32 rmid, int cntr_id,
+ enum resctrl_event_id eventid, u64 *val)
+{
+ u64 msr_val;
+ int ret;
+
+ ret = __cntr_id_read(cntr_id, &msr_val);
+ if (ret)
+ return ret;
+
+ *val = get_corrected_val(r, d, rmid, eventid, msr_val);
+
+ return 0;
+}
+
+/*
+ * The power-on reset value of MSR_RMID_SNC_CONFIG is 0x1
+ * which indicates that RMIDs are configured in legacy mode.
+ * This mode is incompatible with Linux resctrl semantics
+ * as RMIDs are partitioned between SNC nodes, which requires
+ * a user to know which RMID is allocated to a task.
+ * Clearing bit 0 reconfigures the RMID counters for use
+ * in RMID sharing mode. This mode is better for Linux.
+ * The RMID space is divided between all SNC nodes with the
+ * RMIDs renumbered to start from zero in each node when
+ * counting operations from tasks. Code to read the counters
+ * must adjust RMID counter numbers based on SNC node. See
+ * logical_rmid_to_physical_rmid() for code that does this.
+ */
+void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d)
+{
+ if (snc_nodes_per_l3_cache > 1)
+ msr_clear_bit(MSR_RMID_SNC_CONFIG, 0);
+}
+
+/* CPU models that support MSR_RMID_SNC_CONFIG */
+static const struct x86_cpu_id snc_cpu_ids[] __initconst = {
+ X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
+ X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, 0),
+ X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, 0),
+ X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, 0),
+ X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, 0),
+ X86_MATCH_VFM(INTEL_ATOM_DARKMONT_X, 0),
+ {}
+};
+
+/*
+ * There isn't a simple hardware bit that indicates whether a CPU is running
+ * in Sub-NUMA Cluster (SNC) mode. Infer the state by comparing the
+ * number of CPUs sharing the L3 cache with CPU0 to the number of CPUs in
+ * the same NUMA node as CPU0.
+ * It is not possible to accurately determine SNC state if the system is
+ * booted with a maxcpus=N parameter. That distorts the ratio of SNC nodes
+ * to L3 caches. It will be OK if the system is booted with hyperthreading
+ * disabled (since this doesn't affect the ratio).
+ */
+static __init int snc_get_config(void)
+{
+ struct cacheinfo *ci = get_cpu_cacheinfo_level(0, RESCTRL_L3_CACHE);
+ const cpumask_t *node0_cpumask;
+ int cpus_per_node, cpus_per_l3;
+ int ret;
+
+ if (!x86_match_cpu(snc_cpu_ids) || !ci)
+ return 1;
+
+ cpus_read_lock();
+ if (num_online_cpus() != num_present_cpus())
+ pr_warn("Some CPUs offline, SNC detection may be incorrect\n");
+ cpus_read_unlock();
+
+ node0_cpumask = cpumask_of_node(cpu_to_node(0));
+
+ cpus_per_node = cpumask_weight(node0_cpumask);
+ cpus_per_l3 = cpumask_weight(&ci->shared_cpu_map);
+
+ if (!cpus_per_node || !cpus_per_l3)
+ return 1;
+
+ ret = cpus_per_l3 / cpus_per_node;
+
+ /* Sanity check: the only valid results are 1, 2, 3, 4 and 6 */
+ switch (ret) {
+ case 1:
+ break;
+ case 2 ... 4:
+ case 6:
+ pr_info("Sub-NUMA Cluster mode detected with %d nodes per L3 cache\n", ret);
+ rdt_resources_all[RDT_RESOURCE_L3].r_resctrl.mon_scope = RESCTRL_L3_NODE;
+ break;
+ default:
+ pr_warn("Ignore improbable SNC node count %d\n", ret);
+ ret = 1;
+ break;
+ }
+
+ return ret;
+}
+
+int __init rdt_get_mon_l3_config(struct rdt_resource *r)
+{
+ unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ unsigned int threshold;
+ u32 eax, ebx, ecx, edx;
+
+ snc_nodes_per_l3_cache = snc_get_config();
+
+ resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024;
+ hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache;
+ r->mon.num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache;
+ hw_res->mbm_width = MBM_CNTR_WIDTH_BASE;
+
+ if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX)
+ hw_res->mbm_width += mbm_offset;
+ else if (mbm_offset > MBM_CNTR_WIDTH_OFFSET_MAX)
+ pr_warn("Ignoring impossible MBM counter offset\n");
+
+ /*
+ * A reasonable upper limit on the max threshold is the number
+ * of lines tagged per RMID if all RMIDs have the same number of
+ * lines tagged in the LLC.
+ *
+ * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
+ */
+ threshold = resctrl_rmid_realloc_limit / r->mon.num_rmid;
+
+ /*
+ * Because num_rmid may not be a power of two, round the value
+ * to the nearest multiple of hw_res->mon_scale so it matches a
+ * value the hardware will measure. mon_scale may not be a power of 2.
+ */
+ resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold);
+
+ if (rdt_cpu_has(X86_FEATURE_BMEC) || rdt_cpu_has(X86_FEATURE_ABMC)) {
+ /* Detect list of bandwidth sources that can be tracked */
+ cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx);
+ r->mon.mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS;
+ }
+
+ /*
+ * resctrl assumes a system that supports assignable counters can
+ * switch to "default" mode. Ensure that there is a "default" mode
+ * to switch to. This enforces a dependency between the independent
+ * X86_FEATURE_ABMC and X86_FEATURE_CQM_MBM_TOTAL/X86_FEATURE_CQM_MBM_LOCAL
+ * hardware features.
+ */
+ if (rdt_cpu_has(X86_FEATURE_ABMC) &&
+ (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL) ||
+ rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL))) {
+ r->mon.mbm_cntr_assignable = true;
+ cpuid_count(0x80000020, 5, &eax, &ebx, &ecx, &edx);
+ r->mon.num_mbm_cntrs = (ebx & GENMASK(15, 0)) + 1;
+ hw_res->mbm_cntr_assign_enabled = true;
+ }
+
+ r->mon_capable = true;
+
+ return 0;
+}
+
+void __init intel_rdt_mbm_apply_quirk(void)
+{
+ int cf_index;
+
+ cf_index = (boot_cpu_data.x86_cache_max_rmid + 1) / 8 - 1;
+ if (cf_index >= ARRAY_SIZE(mbm_cf_table)) {
+ pr_info("No MBM correction factor available\n");
+ return;
+ }
+
+ mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold;
+ mbm_cf = mbm_cf_table[cf_index].cf;
+}
+
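+/* Set or clear the ABMC enable bit in MSR_IA32_L3_QOS_EXT_CFG on this CPU. */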
+static void resctrl_abmc_set_one_amd(void *arg)
+{
+ bool *enable = arg;
+
+ if (*enable)
+ msr_set_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT);
+ else
+ msr_clear_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT);
+}
+
+/*
+ * ABMC enable/disable requires update of L3_QOS_EXT_CFG MSR on all the CPUs
+ * associated with all monitor domains.
+ */
+static void _resctrl_abmc_enable(struct rdt_resource *r, bool enable)
+{
+ struct rdt_mon_domain *d;
+
+ lockdep_assert_cpus_held();
+
+ list_for_each_entry(d, &r->mon_domains, hdr.list) {
+ on_each_cpu_mask(&d->hdr.cpu_mask, resctrl_abmc_set_one_amd,
+ &enable, 1);
+ resctrl_arch_reset_rmid_all(r, d);
+ }
+}
+
+int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+
+ if (r->mon.mbm_cntr_assignable &&
+ hw_res->mbm_cntr_assign_enabled != enable) {
+ _resctrl_abmc_enable(r, enable);
+ hw_res->mbm_cntr_assign_enabled = enable;
+ }
+
+ return 0;
+}
+
+bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r)
+{
+ return resctrl_to_arch_res(r)->mbm_cntr_assign_enabled;
+}
+
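+/* Write the requested ABMC counter configuration to MSR_IA32_L3_QOS_ABMC_CFG. */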
+static void resctrl_abmc_config_one_amd(void *info)
+{
+ union l3_qos_abmc_cfg *abmc_cfg = info;
+
+ wrmsrl(MSR_IA32_L3_QOS_ABMC_CFG, abmc_cfg->full);
+}
+
+/*
+ * Send an IPI to the domain to assign the counter to the (RMID, event) pair.
+ */
+void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d,
+ enum resctrl_event_id evtid, u32 rmid, u32 closid,
+ u32 cntr_id, bool assign)
+{
+ struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
+ union l3_qos_abmc_cfg abmc_cfg = { 0 };
+ struct arch_mbm_state *am;
+
+ abmc_cfg.split.cfg_en = 1;
+ abmc_cfg.split.cntr_en = assign ? 1 : 0;
+ abmc_cfg.split.cntr_id = cntr_id;
+ abmc_cfg.split.bw_src = rmid;
+ if (assign)
+ abmc_cfg.split.bw_type = resctrl_get_mon_evt_cfg(evtid);
+
+ smp_call_function_any(&d->hdr.cpu_mask, resctrl_abmc_config_one_amd, &abmc_cfg, 1);
+
+ /*
+ * The hardware counter is reset (because cfg_en == 1) so there is no
+ * need to record initial non-zero counts.
+ */
+ am = get_arch_mbm_state(hw_dom, rmid, evtid);
+ if (am)
+ memset(am, 0, sizeof(*am));
+}
+
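+/* Apply the resource's current counter-assignment state to the CPU running this. */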
+void resctrl_arch_mbm_cntr_assign_set_one(struct rdt_resource *r)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+
+ resctrl_abmc_set_one_amd(&hw_res->mbm_cntr_assign_enabled);
+}
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
new file mode 100644
index 000000000000..de580eca3363
--- /dev/null
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -0,0 +1,517 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Resource Director Technology (RDT)
+ *
+ * Pseudo-locking support built on top of Cache Allocation Technology (CAT)
+ *
+ * Copyright (C) 2018 Intel Corporation
+ *
+ * Author: Reinette Chatre <reinette.chatre@intel.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cacheflush.h>
+#include <linux/cpu.h>
+#include <linux/perf_event.h>
+#include <linux/pm_qos.h>
+#include <linux/resctrl.h>
+
+#include <asm/cpu_device_id.h>
+#include <asm/perf_event.h>
+#include <asm/msr.h>
+
+#include "../../events/perf_event.h" /* For X86_CONFIG() */
+#include "internal.h"
+
+#define CREATE_TRACE_POINTS
+
+#include "pseudo_lock_trace.h"
+
+/*
+ * The bits needed to disable hardware prefetching varies based on the
+ * platform. During initialization we will discover which bits to use.
+ */
+static u64 prefetch_disable_bits;
+
+/**
+ * resctrl_arch_get_prefetch_disable_bits - prefetch disable bits of supported
+ * platforms
+ * @void: It takes no parameters.
+ *
+ * Capture the list of platforms that have been validated to support
+ * pseudo-locking. This includes testing to ensure pseudo-locked regions
+ * with low cache miss rates can be created under a variety of load
+ * conditions, and that these pseudo-locked regions can maintain their low
+ * cache miss rates under such conditions for significant lengths of time.
+ *
+ * After a platform has been validated to support pseudo-locking its
+ * hardware prefetch disable bits are included here as they are documented
+ * in the SDM.
+ *
+ * When adding a platform here, also add support for its cache events to
+ * resctrl_arch_measure_l*_residency().
+ *
+ * Return:
+ * If platform is supported, the bits to disable hardware prefetchers, 0
+ * if platform is not supported.
+ */
+u64 resctrl_arch_get_prefetch_disable_bits(void)
+{
+ prefetch_disable_bits = 0;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+ boot_cpu_data.x86 != 6)
+ return 0;
+
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_BROADWELL_X:
+ /*
+ * SDM defines bits of MSR_MISC_FEATURE_CONTROL register
+ * as:
+ * 0 L2 Hardware Prefetcher Disable (R/W)
+ * 1 L2 Adjacent Cache Line Prefetcher Disable (R/W)
+ * 2 DCU Hardware Prefetcher Disable (R/W)
+ * 3 DCU IP Prefetcher Disable (R/W)
+ * 63:4 Reserved
+ */
+ prefetch_disable_bits = 0xF;
+ break;
+ case INTEL_ATOM_GOLDMONT:
+ case INTEL_ATOM_GOLDMONT_PLUS:
+ /*
+ * SDM defines bits of MSR_MISC_FEATURE_CONTROL register
+ * as:
+ * 0 L2 Hardware Prefetcher Disable (R/W)
+ * 1 Reserved
+ * 2 DCU Hardware Prefetcher Disable (R/W)
+ * 63:3 Reserved
+ */
+ prefetch_disable_bits = 0x5;
+ break;
+ }
+
+ return prefetch_disable_bits;
+}
+
+/**
+ * resctrl_arch_pseudo_lock_fn - Load kernel memory into cache
+ * @_plr: the pseudo-lock region descriptor
+ *
+ * This is the core pseudo-locking flow.
+ *
+ * First we ensure that the kernel memory cannot be found in the cache.
+ * Then, while taking care that there will be as little interference as
+ * possible, the memory to be loaded is accessed while core is running
+ * with class of service set to the bitmask of the pseudo-locked region.
+ * After this is complete no future CAT allocations will be allowed to
+ * overlap with this bitmask.
+ *
+ * Local register variables are utilized to ensure that the memory region
+ * to be locked is the only memory access made during the critical locking
+ * loop.
+ *
+ * Return: 0. Waiter on waitqueue will be woken on completion.
+ */
+int resctrl_arch_pseudo_lock_fn(void *_plr)
+{
+ struct pseudo_lock_region *plr = _plr;
+ u32 rmid_p, closid_p;
+ unsigned long i;
+ u64 saved_msr;
+#ifdef CONFIG_KASAN
+ /*
+ * The registers used for local register variables are also used
+ * when KASAN is active. When KASAN is active we use a regular
+ * variable to ensure we always use a valid pointer, but the cost
+ * is that this variable will enter the cache through evicting the
+ * memory we are trying to lock into the cache. Thus expect a lower
+ * pseudo-locking success rate when KASAN is active.
+ */
+ unsigned int line_size;
+ unsigned int size;
+ void *mem_r;
+#else
+ register unsigned int line_size asm("esi");
+ register unsigned int size asm("edi");
+ register void *mem_r asm(_ASM_BX);
+#endif /* CONFIG_KASAN */
+
+ /*
+ * Make sure none of the allocated memory is cached. If it is, we
+ * will get a cache hit in the loop below from outside of the
+ * pseudo-locked region.
+ * wbinvd (as opposed to clflush/clflushopt) is required to increase
+ * the likelihood that the allocated cache portion will be filled
+ * with the associated memory.
+ */
+ wbinvd();
+
+ /*
+ * Always called with interrupts enabled. Disable interrupts to
+ * ensure that we will not be preempted during this critical section.
+ */
+ local_irq_disable();
+
+ /*
+ * Call wrmsr and rdmsr as directly as possible to avoid tracing
+ * clobbering local register variables or affecting cache accesses.
+ *
+ * Disable the hardware prefetcher so that when the end of the memory
+ * being pseudo-locked is reached the hardware will not read beyond
+ * the buffer and evict pseudo-locked memory read earlier from the
+ * cache.
+ */
+ saved_msr = native_rdmsrq(MSR_MISC_FEATURE_CONTROL);
+ native_wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits);
+ closid_p = this_cpu_read(pqr_state.cur_closid);
+ rmid_p = this_cpu_read(pqr_state.cur_rmid);
+ mem_r = plr->kmem;
+ size = plr->size;
+ line_size = plr->line_size;
+ /*
+ * Critical section begin: start by writing the closid associated
+ * with the capacity bitmask of the cache region being
+ * pseudo-locked followed by reading of kernel memory to load it
+ * into the cache.
+ */
+ native_wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, plr->closid);
+
+ /*
+ * Cache was flushed earlier. Now access kernel memory to read it
+ * into cache region associated with just activated plr->closid.
+ * Loop over data twice:
+ * - In first loop the cache region is shared with the page walker
+ * as it populates the paging structure caches (including TLB).
+ * - In the second loop the paging structure caches are used and
+ * cache region is populated with the memory being referenced.
+ */
+ for (i = 0; i < size; i += PAGE_SIZE) {
+ /*
+ * Add a barrier to prevent speculative execution of this
+ * loop reading beyond the end of the buffer.
+ */
+ rmb();
+ asm volatile("mov (%0,%1,1), %%eax\n\t"
+ :
+ : "r" (mem_r), "r" (i)
+ : "%eax", "memory");
+ }
+ for (i = 0; i < size; i += line_size) {
+ /*
+ * Add a barrier to prevent speculative execution of this
+ * loop reading beyond the end of the buffer.
+ */
+ rmb();
+ asm volatile("mov (%0,%1,1), %%eax\n\t"
+ :
+ : "r" (mem_r), "r" (i)
+ : "%eax", "memory");
+ }
+ /*
+ * Critical section end: restore closid with capacity bitmask that
+ * does not overlap with pseudo-locked region.
+ */
+ native_wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, closid_p);
+
+ /* Re-enable the hardware prefetcher(s) */
+ wrmsrq(MSR_MISC_FEATURE_CONTROL, saved_msr);
+ local_irq_enable();
+
+ plr->thread_done = 1;
+ wake_up_interruptible(&plr->lock_thread_wq);
+ return 0;
+}
+
+/**
+ * resctrl_arch_measure_cycles_lat_fn - Measure cycle latency to read
+ * pseudo-locked memory
+ * @_plr: pseudo-lock region to measure
+ *
+ * There is no deterministic way to test if a memory region is cached. One
+ * way is to measure how long it takes to read the memory; the speed of
+ * access is a good indicator of how close to the CPU the data is. Moreover,
+ * if the prefetcher is disabled and the memory is read at a stride
+ * of half the cache line, then a cache miss will be easy to spot since the
+ * read of the first half would be significantly slower than the read of
+ * the second half.
+ *
+ * Return: 0. Waiter on waitqueue will be woken on completion.
+ */
+int resctrl_arch_measure_cycles_lat_fn(void *_plr)
+{
+ struct pseudo_lock_region *plr = _plr;
+ u32 saved_low, saved_high;
+ unsigned long i;
+ u64 start, end;
+ void *mem_r;
+
+ local_irq_disable();
+ /*
+ * Disable hardware prefetchers.
+ */
+ rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
+ wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits);
+ mem_r = READ_ONCE(plr->kmem);
+ /*
+ * Dummy execution of the time measurement to load the needed
+ * instructions into the L1 instruction cache.
+ */
+ start = rdtsc_ordered();
+ for (i = 0; i < plr->size; i += 32) {
+ start = rdtsc_ordered();
+ asm volatile("mov (%0,%1,1), %%eax\n\t"
+ :
+ : "r" (mem_r), "r" (i)
+ : "%eax", "memory");
+ end = rdtsc_ordered();
+ trace_pseudo_lock_mem_latency((u32)(end - start));
+ }
+ wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
+ local_irq_enable();
+ plr->thread_done = 1;
+ wake_up_interruptible(&plr->lock_thread_wq);
+ return 0;
+}
+
+/*
+ * Create a perf_event_attr for the hit and miss perf events that will
+ * be used during the performance measurement. A perf_event maintains
+ * a pointer to its perf_event_attr so a unique attribute structure is
+ * created for each perf_event.
+ *
+ * The actual configuration of the event is set right before use in order
+ * to use the X86_CONFIG macro.
+ */
+static struct perf_event_attr perf_miss_attr = {
+ .type = PERF_TYPE_RAW,
+ .size = sizeof(struct perf_event_attr),
+ .pinned = 1,
+ .disabled = 0,
+ .exclude_user = 1,
+};
+
+static struct perf_event_attr perf_hit_attr = {
+ .type = PERF_TYPE_RAW,
+ .size = sizeof(struct perf_event_attr),
+ .pinned = 1,
+ .disabled = 0,
+ .exclude_user = 1,
+};
+
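+/* Hit/miss counter snapshots taken before and after the measurement loop. */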
+struct residency_counts {
+ u64 miss_before, hits_before;
+ u64 miss_after, hits_after;
+};
+
+static int measure_residency_fn(struct perf_event_attr *miss_attr,
+ struct perf_event_attr *hit_attr,
+ struct pseudo_lock_region *plr,
+ struct residency_counts *counts)
+{
+ u64 hits_before = 0, hits_after = 0, miss_before = 0, miss_after = 0;
+ struct perf_event *miss_event, *hit_event;
+ int hit_pmcnum, miss_pmcnum;
+ u32 saved_low, saved_high;
+ unsigned int line_size;
+ unsigned int size;
+ unsigned long i;
+ void *mem_r;
+ u64 tmp;
+
+ miss_event = perf_event_create_kernel_counter(miss_attr, plr->cpu,
+ NULL, NULL, NULL);
+ if (IS_ERR(miss_event))
+ goto out;
+
+ hit_event = perf_event_create_kernel_counter(hit_attr, plr->cpu,
+ NULL, NULL, NULL);
+ if (IS_ERR(hit_event))
+ goto out_miss;
+
+ local_irq_disable();
+ /*
+ * Check any possible error state of events used by performing
+ * one local read.
+ */
+ if (perf_event_read_local(miss_event, &tmp, NULL, NULL)) {
+ local_irq_enable();
+ goto out_hit;
+ }
+ if (perf_event_read_local(hit_event, &tmp, NULL, NULL)) {
+ local_irq_enable();
+ goto out_hit;
+ }
+
+ /*
+ * Disable hardware prefetchers.
+ */
+ rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
+ wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits);
+
+ /* Initialize rest of local variables */
+ /*
+ * Performance event has been validated right before this with
+ * interrupts disabled - it is thus safe to read the counter index.
+ */
+ miss_pmcnum = x86_perf_rdpmc_index(miss_event);
+ hit_pmcnum = x86_perf_rdpmc_index(hit_event);
+ line_size = READ_ONCE(plr->line_size);
+ mem_r = READ_ONCE(plr->kmem);
+ size = READ_ONCE(plr->size);
+
+ /*
+ * Read counter variables twice - first to load the instructions
+ * used into the L1 cache, second to capture an accurate value that
+ * does not include cache misses incurred because of instruction loads.
+ */
+ hits_before = rdpmc(hit_pmcnum);
+ miss_before = rdpmc(miss_pmcnum);
+ /*
+ * From the SDM: back-to-back fast reads are not guaranteed
+ * to be monotonic.
+ * Use LFENCE to ensure all previous instructions are retired
+ * before proceeding.
+ */
+ rmb();
+ hits_before = rdpmc(hit_pmcnum);
+ miss_before = rdpmc(miss_pmcnum);
+ /*
+ * Use LFENCE to ensure all previous instructions are retired
+ * before proceeding.
+ */
+ rmb();
+ for (i = 0; i < size; i += line_size) {
+ /*
+ * Add a barrier to prevent speculative execution of this
+ * loop reading beyond the end of the buffer.
+ */
+ rmb();
+ asm volatile("mov (%0,%1,1), %%eax\n\t"
+ :
+ : "r" (mem_r), "r" (i)
+ : "%eax", "memory");
+ }
+ /*
+ * Use LFENCE to ensure all previous instructions are retired
+ * before proceeding.
+ */
+ rmb();
+ hits_after = rdpmc(hit_pmcnum);
+ miss_after = rdpmc(miss_pmcnum);
+ /*
+ * Use LFENCE to ensure all previous instructions are retired
+ * before proceeding.
+ */
+ rmb();
+ /* Re-enable hardware prefetchers */
+ wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
+ local_irq_enable();
+out_hit:
+ perf_event_release_kernel(hit_event);
+out_miss:
+ perf_event_release_kernel(miss_event);
+out:
+ /*
+ * All counts will be zero on failure.
+ */
+ counts->miss_before = miss_before;
+ counts->hits_before = hits_before;
+ counts->miss_after = miss_after;
+ counts->hits_after = hits_after;
+ return 0;
+}
+
+int resctrl_arch_measure_l2_residency(void *_plr)
+{
+ struct pseudo_lock_region *plr = _plr;
+ struct residency_counts counts = {0};
+
+ /*
+ * Non-architectural event for the Goldmont Microarchitecture
+ * from Intel x86 Architecture Software Developer Manual (SDM):
+ * MEM_LOAD_UOPS_RETIRED D1H (event number)
+ * Umask values:
+ * L2_HIT 02H
+ * L2_MISS 10H
+ */
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_ATOM_GOLDMONT:
+ case INTEL_ATOM_GOLDMONT_PLUS:
+ perf_miss_attr.config = X86_CONFIG(.event = 0xd1,
+ .umask = 0x10);
+ perf_hit_attr.config = X86_CONFIG(.event = 0xd1,
+ .umask = 0x2);
+ break;
+ default:
+ goto out;
+ }
+
+ measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts);
+ /*
+ * If a failure prevented the measurements from succeeding,
+ * tracepoints will still be written and all counts will be zero.
+ */
+ trace_pseudo_lock_l2(counts.hits_after - counts.hits_before,
+ counts.miss_after - counts.miss_before);
+out:
+ plr->thread_done = 1;
+ wake_up_interruptible(&plr->lock_thread_wq);
+ return 0;
+}
+
+int resctrl_arch_measure_l3_residency(void *_plr)
+{
+ struct pseudo_lock_region *plr = _plr;
+ struct residency_counts counts = {0};
+
+ /*
+ * On Broadwell Microarchitecture the MEM_LOAD_UOPS_RETIRED event
+ * has two "no fix" errata associated with it: BDM35 and BDM100. On
+ * this platform the following events are used instead:
+ * LONGEST_LAT_CACHE 2EH (Documented in SDM)
+ * REFERENCE 4FH
+ * MISS 41H
+ */
+
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_BROADWELL_X:
+ /* On BDW the hit event counts references, not hits */
+ perf_hit_attr.config = X86_CONFIG(.event = 0x2e,
+ .umask = 0x4f);
+ perf_miss_attr.config = X86_CONFIG(.event = 0x2e,
+ .umask = 0x41);
+ break;
+ default:
+ goto out;
+ }
+
+ measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts);
+ /*
+ * If a failure prevented the measurements from succeeding,
+ * tracepoints will still be written and all counts will be zero.
+ */
+
+ counts.miss_after -= counts.miss_before;
+ if (boot_cpu_data.x86_vfm == INTEL_BROADWELL_X) {
+ /*
+ * On BDW references and misses are counted, need to adjust.
+ * Sometimes the "hits" counter is a bit more than the
+ * references, for example, x references but x + 1 hits.
+ * To not report invalid hit values in this case we treat
+ * that as misses equal to references.
+ */
+ /* First compute the number of cache references measured */
+ counts.hits_after -= counts.hits_before;
+ /* Next convert references to cache hits */
+ counts.hits_after -= min(counts.miss_after, counts.hits_after);
+ } else {
+ counts.hits_after -= counts.hits_before;
+ }
+
+ trace_pseudo_lock_l3(counts.hits_after, counts.miss_after);
+out:
+ plr->thread_done = 1;
+ wake_up_interruptible(&plr->lock_thread_wq);
+ return 0;
+}
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h b/arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h
new file mode 100644
index 000000000000..7c8aef08010f
--- /dev/null
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM resctrl
+
+#if !defined(_X86_RESCTRL_PSEUDO_LOCK_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _X86_RESCTRL_PSEUDO_LOCK_TRACE_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(pseudo_lock_mem_latency,
+ TP_PROTO(u32 latency),
+ TP_ARGS(latency),
+ TP_STRUCT__entry(__field(u32, latency)),
+ TP_fast_assign(__entry->latency = latency),
+ TP_printk("latency=%u", __entry->latency)
+ );
+
+TRACE_EVENT(pseudo_lock_l2,
+ TP_PROTO(u64 l2_hits, u64 l2_miss),
+ TP_ARGS(l2_hits, l2_miss),
+ TP_STRUCT__entry(__field(u64, l2_hits)
+ __field(u64, l2_miss)),
+ TP_fast_assign(__entry->l2_hits = l2_hits;
+ __entry->l2_miss = l2_miss;),
+ TP_printk("hits=%llu miss=%llu",
+ __entry->l2_hits, __entry->l2_miss));
+
+TRACE_EVENT(pseudo_lock_l3,
+ TP_PROTO(u64 l3_hits, u64 l3_miss),
+ TP_ARGS(l3_hits, l3_miss),
+ TP_STRUCT__entry(__field(u64, l3_hits)
+ __field(u64, l3_miss)),
+ TP_fast_assign(__entry->l3_hits = l3_hits;
+ __entry->l3_miss = l3_miss;),
+ TP_printk("hits=%llu miss=%llu",
+ __entry->l3_hits, __entry->l3_miss));
+
+#endif /* _X86_RESCTRL_PSEUDO_LOCK_TRACE_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+
+#define TRACE_INCLUDE_FILE pseudo_lock_trace
+
+#include <trace/define_trace.h>
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
new file mode 100644
index 000000000000..885026468440
--- /dev/null
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -0,0 +1,262 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * User interface for Resource Allocation in Resource Director Technology(RDT)
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Author: Fenghua Yu <fenghua.yu@intel.com>
+ *
+ * More information about RDT can be found in the Intel (R) x86 Architecture
+ * Software Developer Manual.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cpu.h>
+#include <linux/debugfs.h>
+#include <linux/fs.h>
+#include <linux/fs_parser.h>
+#include <linux/sysfs.h>
+#include <linux/kernfs.h>
+#include <linux/resctrl.h>
+#include <linux/seq_buf.h>
+#include <linux/seq_file.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/slab.h>
+#include <linux/task_work.h>
+#include <linux/user_namespace.h>
+
+#include <uapi/linux/magic.h>
+
+#include <asm/msr.h>
+#include "internal.h"
+
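+/* Static branch keys used to gate the resctrl hooks on hot paths such as context switch. */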
+DEFINE_STATIC_KEY_FALSE(rdt_enable_key);
+
+DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key);
+
+DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
+
+/*
+ * This is safe against resctrl_arch_sched_in() called from __switch_to()
+ * because __switch_to() is executed with interrupts disabled. A local call
+ * from update_closid_rmid() is protected against __switch_to() because
+ * preemption is disabled.
+ */
+void resctrl_arch_sync_cpu_closid_rmid(void *info)
+{
+ struct resctrl_cpu_defaults *r = info;
+
+ if (r) {
+ this_cpu_write(pqr_state.default_closid, r->closid);
+ this_cpu_write(pqr_state.default_rmid, r->rmid);
+ }
+
+ /*
+ * We cannot unconditionally write the MSR because the currently
+ * executing task might have its own closid selected. Just reuse
+ * the context switch code.
+ */
+ resctrl_arch_sched_in(current);
+}
+
+#define INVALID_CONFIG_INDEX UINT_MAX
+
+/**
+ * mon_event_config_index_get - get the hardware index for the
+ * configurable event
+ * @evtid: event id.
+ *
+ * Return: 0 for evtid == QOS_L3_MBM_TOTAL_EVENT_ID
+ * 1 for evtid == QOS_L3_MBM_LOCAL_EVENT_ID
+ * INVALID_CONFIG_INDEX for invalid evtid
+ */
+static inline unsigned int mon_event_config_index_get(u32 evtid)
+{
+ switch (evtid) {
+ case QOS_L3_MBM_TOTAL_EVENT_ID:
+ return 0;
+ case QOS_L3_MBM_LOCAL_EVENT_ID:
+ return 1;
+ default:
+ /* Should never reach here */
+ return INVALID_CONFIG_INDEX;
+ }
+}
+
+void resctrl_arch_mon_event_config_read(void *_config_info)
+{
+ struct resctrl_mon_config_info *config_info = _config_info;
+ unsigned int index;
+ u64 msrval;
+
+ index = mon_event_config_index_get(config_info->evtid);
+ if (index == INVALID_CONFIG_INDEX) {
+ pr_warn_once("Invalid event id %d\n", config_info->evtid);
+ return;
+ }
+ rdmsrq(MSR_IA32_EVT_CFG_BASE + index, msrval);
+
+ /* Report only the valid event configuration bits */
+ config_info->mon_config = msrval & MAX_EVT_CONFIG_BITS;
+}
+
+void resctrl_arch_mon_event_config_write(void *_config_info)
+{
+ struct resctrl_mon_config_info *config_info = _config_info;
+ unsigned int index;
+
+ index = mon_event_config_index_get(config_info->evtid);
+ if (index == INVALID_CONFIG_INDEX) {
+ pr_warn_once("Invalid event id %d\n", config_info->evtid);
+ return;
+ }
+ wrmsrq(MSR_IA32_EVT_CFG_BASE + index, config_info->mon_config);
+}
+
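+/* Program the CDP enable bit in the L3/L2 QOS_CFG MSR on the local CPU. */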
+static void l3_qos_cfg_update(void *arg)
+{
+ bool *enable = arg;
+
+ wrmsrq(MSR_IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL);
+}
+
+static void l2_qos_cfg_update(void *arg)
+{
+ bool *enable = arg;
+
+ wrmsrq(MSR_IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL);
+}
+
+static int set_cache_qos_cfg(int level, bool enable)
+{
+ void (*update)(void *arg);
+ struct rdt_ctrl_domain *d;
+ struct rdt_resource *r_l;
+ cpumask_var_t cpu_mask;
+ int cpu;
+
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
+ if (level == RDT_RESOURCE_L3)
+ update = l3_qos_cfg_update;
+ else if (level == RDT_RESOURCE_L2)
+ update = l2_qos_cfg_update;
+ else
+ return -EINVAL;
+
+ if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ r_l = &rdt_resources_all[level].r_resctrl;
+ list_for_each_entry(d, &r_l->ctrl_domains, hdr.list) {
+ if (r_l->cache.arch_has_per_cpu_cfg)
+ /* Pick all the CPUs in the domain instance */
+ for_each_cpu(cpu, &d->hdr.cpu_mask)
+ cpumask_set_cpu(cpu, cpu_mask);
+ else
+ /* Pick one CPU from each domain instance to update MSR */
+ cpumask_set_cpu(cpumask_any(&d->hdr.cpu_mask), cpu_mask);
+ }
+
+ /* Update QOS_CFG MSR on all the CPUs in cpu_mask */
+ on_each_cpu_mask(cpu_mask, update, &enable, 1);
+
+ free_cpumask_var(cpu_mask);
+
+ return 0;
+}
+
+/* Restore the qos cfg state when a domain comes online */
+void rdt_domain_reconfigure_cdp(struct rdt_resource *r)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+
+ if (!r->cdp_capable)
+ return;
+
+ if (r->rid == RDT_RESOURCE_L2)
+ l2_qos_cfg_update(&hw_res->cdp_enabled);
+
+ if (r->rid == RDT_RESOURCE_L3)
+ l3_qos_cfg_update(&hw_res->cdp_enabled);
+}
+
+static int cdp_enable(int level)
+{
+ struct rdt_resource *r_l = &rdt_resources_all[level].r_resctrl;
+ int ret;
+
+ if (!r_l->alloc_capable)
+ return -EINVAL;
+
+ ret = set_cache_qos_cfg(level, true);
+ if (!ret)
+ rdt_resources_all[level].cdp_enabled = true;
+
+ return ret;
+}
+
+static void cdp_disable(int level)
+{
+ struct rdt_hw_resource *r_hw = &rdt_resources_all[level];
+
+ if (r_hw->cdp_enabled) {
+ set_cache_qos_cfg(level, false);
+ r_hw->cdp_enabled = false;
+ }
+}
+
+int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable)
+{
+ struct rdt_hw_resource *hw_res = &rdt_resources_all[l];
+
+ if (!hw_res->r_resctrl.cdp_capable)
+ return -EINVAL;
+
+ if (enable)
+ return cdp_enable(l);
+
+ cdp_disable(l);
+
+ return 0;
+}
+
+bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l)
+{
+ return rdt_resources_all[l].cdp_enabled;
+}
+
+void resctrl_arch_reset_all_ctrls(struct rdt_resource *r)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ struct rdt_hw_ctrl_domain *hw_dom;
+ struct msr_param msr_param;
+ struct rdt_ctrl_domain *d;
+ int i;
+
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
+ msr_param.res = r;
+ msr_param.low = 0;
+ msr_param.high = hw_res->num_closid;
+
+ /*
+ * Disable resource control for this resource by setting all
+ * CBMs in all ctrl_domains to the maximum mask value. Pick one CPU
+ * from each domain to update the MSRs below.
+ */
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
+ hw_dom = resctrl_to_arch_ctrl_dom(d);
+
+ for (i = 0; i < hw_res->num_closid; i++)
+ hw_dom->ctrl_val[i] = resctrl_get_default_ctrl(r);
+ msr_param.dom = d;
+ smp_call_function_any(&d->hdr.cpu_mask, rdt_ctrl_update, &msr_param, 1);
+ }
+}
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
new file mode 100644
index 000000000000..42c7eac0c387
--- /dev/null
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -0,0 +1,93 @@
+/*
+ * Routines to identify additional cpu features that are scattered in
+ * cpuid space.
+ */
+#include <linux/cpu.h>
+
+#include <asm/memtype.h>
+#include <asm/apic.h>
+#include <asm/processor.h>
+
+#include "cpu.h"
+
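+/* Location of a scattered feature bit: CPUID leaf (level), sub-leaf, register and bit. */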
+struct cpuid_bit {
+ u16 feature;
+ u8 reg;
+ u8 bit;
+ u32 level;
+ u32 sub_leaf;
+};
+
+/*
+ * Please keep the entries sorted by cpuid_bit.level for faster search.
+ * X86_FEATURE_MBA is supported by both Intel and AMD. But the CPUID
+ * levels are different and there is a separate entry for each.
+ */
+static const struct cpuid_bit cpuid_bits[] = {
+ { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 },
+ { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 },
+ { X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 },
+ { X86_FEATURE_MSR_IMM, CPUID_ECX, 5, 0x00000007, 1 },
+ { X86_FEATURE_APX, CPUID_EDX, 21, 0x00000007, 1 },
+ { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 },
+ { X86_FEATURE_BHI_CTRL, CPUID_EDX, 4, 0x00000007, 2 },
+ { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 },
+ { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 },
+ { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 },
+ { X86_FEATURE_CQM_MBM_LOCAL, CPUID_EDX, 2, 0x0000000f, 1 },
+ { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 },
+ { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 },
+ { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 },
+ { X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 },
+ { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 },
+ { X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 },
+ { X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 },
+ { X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 },
+ { X86_FEATURE_SGX_EUPDATESVN, CPUID_EAX, 10, 0x00000012, 0 },
+ { X86_FEATURE_SGX_EDECCSSA, CPUID_EAX, 11, 0x00000012, 0 },
+ { X86_FEATURE_OVERFLOW_RECOV, CPUID_EBX, 0, 0x80000007, 0 },
+ { X86_FEATURE_SUCCOR, CPUID_EBX, 1, 0x80000007, 0 },
+ { X86_FEATURE_SMCA, CPUID_EBX, 3, 0x80000007, 0 },
+ { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 },
+ { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
+ { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
+ { X86_FEATURE_AMD_FAST_CPPC, CPUID_EDX, 15, 0x80000007, 0 },
+ { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 },
+ { X86_FEATURE_X2AVIC_EXT, CPUID_ECX, 6, 0x8000000a, 0 },
+ { X86_FEATURE_COHERENCY_SFW_NO, CPUID_EBX, 31, 0x8000001f, 0 },
+ { X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 },
+ { X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 },
+ { X86_FEATURE_ABMC, CPUID_EBX, 5, 0x80000020, 0 },
+ { X86_FEATURE_SDCIAE, CPUID_EBX, 6, 0x80000020, 0 },
+ { X86_FEATURE_TSA_SQ_NO, CPUID_ECX, 1, 0x80000021, 0 },
+ { X86_FEATURE_TSA_L1_NO, CPUID_ECX, 2, 0x80000021, 0 },
+ { X86_FEATURE_AMD_WORKLOAD_CLASS, CPUID_EAX, 22, 0x80000021, 0 },
+ { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 },
+ { X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 },
+ { X86_FEATURE_AMD_LBR_PMC_FREEZE, CPUID_EAX, 2, 0x80000022, 0 },
+ { X86_FEATURE_AMD_HTR_CORES, CPUID_EAX, 30, 0x80000026, 0 },
+ { 0, 0, 0, 0, 0 }
+};
+
+void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
+{
+ u32 max_level;
+ u32 regs[4];
+ const struct cpuid_bit *cb;
+
+ for (cb = cpuid_bits; cb->feature; cb++) {
+ /* Verify that the level is valid */
+ max_level = cpuid_eax(cb->level & 0xffff0000);
+ if (max_level < cb->level ||
+ max_level > (cb->level | 0xffff))
+ continue;
+
+ cpuid_count(cb->level, cb->sub_leaf, &regs[CPUID_EAX],
+ &regs[CPUID_EBX], &regs[CPUID_ECX],
+ &regs[CPUID_EDX]);
+
+ if (regs[cb->reg] & (1 << cb->bit))
+ set_cpu_cap(c, cb->feature);
+ }
+}
diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile
new file mode 100644
index 000000000000..9c1656779b2a
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/Makefile
@@ -0,0 +1,6 @@
+obj-y += \
+ driver.o \
+ encl.o \
+ ioctl.o \
+ main.o
+obj-$(CONFIG_X86_SGX_KVM) += virt.o
diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
new file mode 100644
index 000000000000..a42c7180900b
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/driver.c
@@ -0,0 +1,201 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2016-20 Intel Corporation. */
+
+#include <linux/acpi.h>
+#include <linux/miscdevice.h>
+#include <linux/mman.h>
+#include <linux/security.h>
+#include <linux/suspend.h>
+#include <asm/traps.h>
+#include "driver.h"
+#include "encl.h"
+
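+/* Masks of reserved bits in enclave attributes, XFRM and MISCSELECT, derived from CPUID in sgx_drv_init(). */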
+u64 sgx_attributes_reserved_mask;
+u64 sgx_xfrm_reserved_mask = ~0x3;
+u32 sgx_misc_reserved_mask;
+
+static int __sgx_open(struct inode *inode, struct file *file)
+{
+ struct sgx_encl *encl;
+ int ret;
+
+ encl = kzalloc(sizeof(*encl), GFP_KERNEL);
+ if (!encl)
+ return -ENOMEM;
+
+ kref_init(&encl->refcount);
+ xa_init(&encl->page_array);
+ mutex_init(&encl->lock);
+ INIT_LIST_HEAD(&encl->va_pages);
+ INIT_LIST_HEAD(&encl->mm_list);
+ spin_lock_init(&encl->mm_lock);
+
+ ret = init_srcu_struct(&encl->srcu);
+ if (ret) {
+ kfree(encl);
+ return ret;
+ }
+
+ file->private_data = encl;
+
+ return 0;
+}
+
+static int sgx_open(struct inode *inode, struct file *file)
+{
+ int ret;
+
+ ret = sgx_inc_usage_count();
+ if (ret)
+ return ret;
+
+ ret = __sgx_open(inode, file);
+ if (ret) {
+ sgx_dec_usage_count();
+ return ret;
+ }
+
+ return 0;
+}
+
+static int sgx_release(struct inode *inode, struct file *file)
+{
+ struct sgx_encl *encl = file->private_data;
+ struct sgx_encl_mm *encl_mm;
+
+ /*
+ * Drain the remaining mm_list entries. At this point the list contains
+ * entries for processes that have closed the enclave file but have
+ * not exited yet. Processes that have exited have already been removed
+ * from the list by sgx_mmu_notifier_release().
+ */
+ for ( ; ; ) {
+ spin_lock(&encl->mm_lock);
+
+ if (list_empty(&encl->mm_list)) {
+ encl_mm = NULL;
+ } else {
+ encl_mm = list_first_entry(&encl->mm_list,
+ struct sgx_encl_mm, list);
+ list_del_rcu(&encl_mm->list);
+ }
+
+ spin_unlock(&encl->mm_lock);
+
+ /* The enclave is no longer mapped by any mm. */
+ if (!encl_mm)
+ break;
+
+ synchronize_srcu(&encl->srcu);
+ mmu_notifier_unregister(&encl_mm->mmu_notifier, encl_mm->mm);
+ kfree(encl_mm);
+
+ /* 'encl_mm' is gone, put encl_mm->encl reference: */
+ kref_put(&encl->refcount, sgx_encl_release);
+ }
+
+ kref_put(&encl->refcount, sgx_encl_release);
+ return 0;
+}
+
+static int sgx_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct sgx_encl *encl = file->private_data;
+ int ret;
+
+ ret = sgx_encl_may_map(encl, vma->vm_start, vma->vm_end, vma->vm_flags);
+ if (ret)
+ return ret;
+
+ ret = sgx_encl_mm_add(encl, vma->vm_mm);
+ if (ret)
+ return ret;
+
+ vma->vm_ops = &sgx_vm_ops;
+ vm_flags_set(vma, VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO);
+ vma->vm_private_data = encl;
+
+ return 0;
+}
+
+static unsigned long sgx_get_unmapped_area(struct file *file,
+ unsigned long addr,
+ unsigned long len,
+ unsigned long pgoff,
+ unsigned long flags)
+{
+ if ((flags & MAP_TYPE) == MAP_PRIVATE)
+ return -EINVAL;
+
+ if (flags & MAP_FIXED)
+ return addr;
+
+ return mm_get_unmapped_area(file, addr, len, pgoff, flags);
+}
+
+#ifdef CONFIG_COMPAT
+static long sgx_compat_ioctl(struct file *filep, unsigned int cmd,
+ unsigned long arg)
+{
+ return sgx_ioctl(filep, cmd, arg);
+}
+#endif
+
+static const struct file_operations sgx_encl_fops = {
+ .owner = THIS_MODULE,
+ .open = sgx_open,
+ .release = sgx_release,
+ .unlocked_ioctl = sgx_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = sgx_compat_ioctl,
+#endif
+ .mmap = sgx_mmap,
+ .get_unmapped_area = sgx_get_unmapped_area,
+};
+
+static struct miscdevice sgx_dev_enclave = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "sgx_enclave",
+ .nodename = "sgx_enclave",
+ .fops = &sgx_encl_fops,
+};
+
+int __init sgx_drv_init(void)
+{
+ unsigned int eax, ebx, ecx, edx;
+ u64 attr_mask;
+ u64 xfrm_mask;
+ int ret;
+
+ if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) {
+ pr_info("SGX disabled: SGX launch control CPU feature is not available, /dev/sgx_enclave disabled.\n");
+ return -ENODEV;
+ }
+
+ cpuid_count(SGX_CPUID, 0, &eax, &ebx, &ecx, &edx);
+
+ if (!(eax & 1)) {
+ pr_info("SGX disabled: SGX1 instruction support not available, /dev/sgx_enclave disabled.\n");
+ return -ENODEV;
+ }
+
+ sgx_misc_reserved_mask = ~ebx | SGX_MISC_RESERVED_MASK;
+
+ cpuid_count(SGX_CPUID, 1, &eax, &ebx, &ecx, &edx);
+
+ attr_mask = (((u64)ebx) << 32) + (u64)eax;
+ sgx_attributes_reserved_mask = ~attr_mask | SGX_ATTR_RESERVED_MASK;
+
+ if (cpu_feature_enabled(X86_FEATURE_OSXSAVE)) {
+ xfrm_mask = (((u64)edx) << 32) + (u64)ecx;
+ sgx_xfrm_reserved_mask = ~xfrm_mask;
+ }
+
+ ret = misc_register(&sgx_dev_enclave);
+ if (ret) {
+ pr_info("SGX disabled: Unable to register the /dev/sgx_enclave driver (%d).\n", ret);
+ return ret;
+ }
+
+ return 0;
+}
diff --git a/arch/x86/kernel/cpu/sgx/driver.h b/arch/x86/kernel/cpu/sgx/driver.h
new file mode 100644
index 000000000000..30f39f92c98f
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/driver.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ARCH_SGX_DRIVER_H__
+#define __ARCH_SGX_DRIVER_H__
+
+#include <linux/kref.h>
+#include <linux/mmu_notifier.h>
+#include <linux/radix-tree.h>
+#include <linux/rwsem.h>
+#include <linux/sched.h>
+#include <linux/workqueue.h>
+#include <uapi/asm/sgx.h>
+#include "sgx.h"
+
+#define SGX_EINIT_SPIN_COUNT 20
+#define SGX_EINIT_SLEEP_COUNT 50
+#define SGX_EINIT_SLEEP_TIME 20
+
+extern u64 sgx_attributes_reserved_mask;
+extern u64 sgx_xfrm_reserved_mask;
+extern u32 sgx_misc_reserved_mask;
+
+extern const struct file_operations sgx_provision_fops;
+
+long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
+
+int sgx_drv_init(void);
+
+#endif /* __ARCH_SGX_DRIVER_H__ */
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
new file mode 100644
index 000000000000..cf149b9f4916
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -0,0 +1,1326 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2016-20 Intel Corporation. */
+
+#include <linux/lockdep.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/shmem_fs.h>
+#include <linux/suspend.h>
+#include <linux/sched/mm.h>
+#include <asm/sgx.h>
+#include "encl.h"
+#include "encls.h"
+#include "sgx.h"
+
+static int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index,
+ struct sgx_backing *backing);
+
+#define PCMDS_PER_PAGE (PAGE_SIZE / sizeof(struct sgx_pcmd))
+/*
+ * 32 PCMD entries share a PCMD page. PCMD_FIRST_MASK is used to
+ * determine the page index associated with the first PCMD entry
+ * within a PCMD page.
+ */
+#define PCMD_FIRST_MASK GENMASK(4, 0)
+
+/**
+ * reclaimer_writing_to_pcmd() - Query if any enclave page associated with
+ * a PCMD page is in process of being reclaimed.
+ * @encl: Enclave to which PCMD page belongs
+ * @start_addr: Address of enclave page using first entry within the PCMD page
+ *
+ * When an enclave page is reclaimed some Paging Crypto MetaData (PCMD) is
+ * stored. The PCMD data of a reclaimed enclave page contains enough
+ * information for the processor to verify the page at the time
+ * it is loaded back into the Enclave Page Cache (EPC).
+ *
+ * The backing storage to which enclave pages are reclaimed is laid out as
+ * follows:
+ * Encrypted enclave pages:SECS page:PCMD pages
+ *
+ * Each PCMD page contains the PCMD metadata of
+ * PAGE_SIZE/sizeof(struct sgx_pcmd) enclave pages.
+ *
+ * A PCMD page can only be truncated if it is (a) empty, and (b) not in the
+ * process of getting data (and thus soon being non-empty). (b) is tested with
+ * a check if an enclave page sharing the PCMD page is in the process of being
+ * reclaimed.
+ *
+ * The reclaimer sets the SGX_ENCL_PAGE_BEING_RECLAIMED flag when it
+ * intends to reclaim that enclave page - it means that the PCMD page
+ * associated with that enclave page is about to get some data and thus
+ * even if the PCMD page is empty, it should not be truncated.
+ *
+ * Context: Enclave mutex (&sgx_encl->lock) must be held.
+ * Return: 1 if the reclaimer is about to write to the PCMD page
+ * 0 if the reclaimer has no intention to write to the PCMD page
+ */
+static int reclaimer_writing_to_pcmd(struct sgx_encl *encl,
+ unsigned long start_addr)
+{
+ int reclaimed = 0;
+ int i;
+
+ /*
+ * PCMD_FIRST_MASK is based on number of PCMD entries within
+ * PCMD page being 32.
+ */
+ BUILD_BUG_ON(PCMDS_PER_PAGE != 32);
+
+ for (i = 0; i < PCMDS_PER_PAGE; i++) {
+ struct sgx_encl_page *entry;
+ unsigned long addr;
+
+ addr = start_addr + i * PAGE_SIZE;
+
+ /*
+ * Stop when reaching the SECS page - it does not
+ * have a page_array entry and its reclaim is
+ * started and completed with enclave mutex held so
+ * it does not use the SGX_ENCL_PAGE_BEING_RECLAIMED
+ * flag.
+ */
+ if (addr == encl->base + encl->size)
+ break;
+
+ entry = xa_load(&encl->page_array, PFN_DOWN(addr));
+ if (!entry)
+ continue;
+
+ /*
+ * The VA page slot ID uses the same bit as the flag, so it is important
+ * to ensure that the page is not already in the backing store.
+ */
+ if (entry->epc_page &&
+ (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)) {
+ reclaimed = 1;
+ break;
+ }
+ }
+
+ return reclaimed;
+}
+
+/*
+ * Calculate the byte offset of a PCMD struct associated with an enclave page. PCMDs
+ * follow right after the EPC data in the backing storage. In addition to the
+ * visible enclave pages, there's one extra page slot for SECS, before PCMD
+ * structs.
+ */
+static inline pgoff_t sgx_encl_get_backing_page_pcmd_offset(struct sgx_encl *encl,
+ unsigned long page_index)
+{
+ pgoff_t epc_end_off = encl->size + sizeof(struct sgx_secs);
+
+ return epc_end_off + page_index * sizeof(struct sgx_pcmd);
+}
+
+/*
+ * Free a page from the backing storage in the given page index.
+ */
+static inline void sgx_encl_truncate_backing_page(struct sgx_encl *encl, unsigned long page_index)
+{
+ struct inode *inode = file_inode(encl->backing);
+
+ shmem_truncate_range(inode, PFN_PHYS(page_index), PFN_PHYS(page_index) + PAGE_SIZE - 1);
+}
+
+/*
+ * ELDU: Load an EPC page as unblocked. For more info, see "OS Management of EPC
+ * Pages" in the SDM.
+ */
+static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
+ struct sgx_epc_page *epc_page,
+ struct sgx_epc_page *secs_page)
+{
+ unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
+ struct sgx_encl *encl = encl_page->encl;
+ pgoff_t page_index, page_pcmd_off;
+ unsigned long pcmd_first_page;
+ struct sgx_pageinfo pginfo;
+ struct sgx_backing b;
+ bool pcmd_page_empty;
+ u8 *pcmd_page;
+ int ret;
+
+ if (secs_page)
+ page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base);
+ else
+ page_index = PFN_DOWN(encl->size);
+
+ /*
+ * Address of enclave page using the first entry within the PCMD page.
+ */
+ pcmd_first_page = PFN_PHYS(page_index & ~PCMD_FIRST_MASK) + encl->base;
+
+ page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);
+
+ ret = sgx_encl_lookup_backing(encl, page_index, &b);
+ if (ret)
+ return ret;
+
+ pginfo.addr = encl_page->desc & PAGE_MASK;
+ pginfo.contents = (unsigned long)kmap_local_page(b.contents);
+ pcmd_page = kmap_local_page(b.pcmd);
+ pginfo.metadata = (unsigned long)pcmd_page + b.pcmd_offset;
+
+ if (secs_page)
+ pginfo.secs = (u64)sgx_get_epc_virt_addr(secs_page);
+ else
+ pginfo.secs = 0;
+
+ ret = __eldu(&pginfo, sgx_get_epc_virt_addr(epc_page),
+ sgx_get_epc_virt_addr(encl_page->va_page->epc_page) + va_offset);
+ if (ret) {
+ if (encls_failed(ret))
+ ENCLS_WARN(ret, "ELDU");
+
+ ret = -EFAULT;
+ }
+
+ memset(pcmd_page + b.pcmd_offset, 0, sizeof(struct sgx_pcmd));
+ set_page_dirty(b.pcmd);
+
+ /*
+ * The area for the PCMD in the page was zeroed above. Check if the
+ * whole page is now empty, meaning that all PCMDs have been zeroed:
+ */
+ pcmd_page_empty = !memchr_inv(pcmd_page, 0, PAGE_SIZE);
+
+ kunmap_local(pcmd_page);
+ kunmap_local((void *)(unsigned long)pginfo.contents);
+
+ get_page(b.pcmd);
+ sgx_encl_put_backing(&b);
+
+ sgx_encl_truncate_backing_page(encl, page_index);
+
+ if (pcmd_page_empty && !reclaimer_writing_to_pcmd(encl, pcmd_first_page)) {
+ sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off));
+ pcmd_page = kmap_local_page(b.pcmd);
+ if (memchr_inv(pcmd_page, 0, PAGE_SIZE))
+ pr_warn("PCMD page not empty after truncate.\n");
+ kunmap_local(pcmd_page);
+ }
+
+ put_page(b.pcmd);
+
+ return ret;
+}
+
+static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page,
+ struct sgx_epc_page *secs_page)
+{
+ unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
+ struct sgx_encl *encl = encl_page->encl;
+ struct sgx_epc_page *epc_page;
+ int ret;
+
+ epc_page = sgx_alloc_epc_page(encl_page, false);
+ if (IS_ERR(epc_page))
+ return epc_page;
+
+ ret = __sgx_encl_eldu(encl_page, epc_page, secs_page);
+ if (ret) {
+ sgx_encl_free_epc_page(epc_page);
+ return ERR_PTR(ret);
+ }
+
+ sgx_free_va_slot(encl_page->va_page, va_offset);
+ list_move(&encl_page->va_page->list, &encl->va_pages);
+ encl_page->desc &= ~SGX_ENCL_PAGE_VA_OFFSET_MASK;
+ encl_page->epc_page = epc_page;
+
+ return epc_page;
+}
+
+/*
+ * Ensure the SECS page is not swapped out. Must be called with encl->lock
+ * held to protect the enclave state, including the SECS, and to ensure the
+ * SECS page is not swapped out again while it is being used.
+ */
+static struct sgx_epc_page *sgx_encl_load_secs(struct sgx_encl *encl)
+{
+ struct sgx_epc_page *epc_page = encl->secs.epc_page;
+
+ if (!epc_page)
+ epc_page = sgx_encl_eldu(&encl->secs, NULL);
+
+ return epc_page;
+}
+
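+/* Load a reclaimed enclave page (and the SECS, if needed) back into the EPC, or return -EBUSY if it is being reclaimed. */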
+static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl,
+ struct sgx_encl_page *entry)
+{
+ struct sgx_epc_page *epc_page;
+
+ /* Entry successfully located. */
+ if (entry->epc_page) {
+ if (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)
+ return ERR_PTR(-EBUSY);
+
+ return entry;
+ }
+
+ epc_page = sgx_encl_load_secs(encl);
+ if (IS_ERR(epc_page))
+ return ERR_CAST(epc_page);
+
+ epc_page = sgx_encl_eldu(entry, encl->secs.epc_page);
+ if (IS_ERR(epc_page))
+ return ERR_CAST(epc_page);
+
+ encl->secs_child_cnt++;
+ sgx_mark_page_reclaimable(entry->epc_page);
+
+ return entry;
+}
+
+static struct sgx_encl_page *sgx_encl_load_page_in_vma(struct sgx_encl *encl,
+ unsigned long addr,
+ vm_flags_t vm_flags)
+{
+ unsigned long vm_prot_bits = vm_flags & VM_ACCESS_FLAGS;
+ struct sgx_encl_page *entry;
+
+ entry = xa_load(&encl->page_array, PFN_DOWN(addr));
+ if (!entry)
+ return ERR_PTR(-EFAULT);
+
+ /*
+ * Verify that the page has equal or higher build time
+ * permissions than the VMA permissions (i.e. the subset of {VM_READ,
+ * VM_WRITE, VM_EXECUTE} in vma->vm_flags).
+ */
+ if ((entry->vm_max_prot_bits & vm_prot_bits) != vm_prot_bits)
+ return ERR_PTR(-EFAULT);
+
+ return __sgx_encl_load_page(encl, entry);
+}
+
+struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl,
+ unsigned long addr)
+{
+ struct sgx_encl_page *entry;
+
+ entry = xa_load(&encl->page_array, PFN_DOWN(addr));
+ if (!entry)
+ return ERR_PTR(-EFAULT);
+
+ return __sgx_encl_load_page(encl, entry);
+}
+
+/**
+ * sgx_encl_eaug_page() - Dynamically add page to initialized enclave
+ * @vma: VMA obtained from fault info from where page is accessed
+ * @encl: enclave accessing the page
+ * @addr: address that triggered the page fault
+ *
+ * When an initialized enclave accesses a page with no backing EPC page
+ * on an SGX2 system, the EPC page can be added dynamically via the SGX2
+ * ENCLS[EAUG] instruction.
+ *
+ * Returns: Appropriate vm_fault_t: VM_FAULT_NOPAGE when PTE was installed
+ * successfully, VM_FAULT_SIGBUS or VM_FAULT_OOM as error otherwise.
+ */
+static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
+ struct sgx_encl *encl, unsigned long addr)
+{
+ vm_fault_t vmret = VM_FAULT_SIGBUS;
+ struct sgx_pageinfo pginfo = {0};
+ struct sgx_encl_page *encl_page;
+ struct sgx_epc_page *epc_page;
+ struct sgx_va_page *va_page;
+ unsigned long phys_addr;
+ u64 secinfo_flags;
+ int ret;
+
+ if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags))
+ return VM_FAULT_SIGBUS;
+
+ /*
+ * Ignore internal permission checking for dynamically added pages.
+ * They matter only for data added during the pre-initialization
+ * phase. The enclave decides the permissions by the means of
+ * EACCEPT, EACCEPTCOPY and EMODPE.
+ */
+ secinfo_flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_X;
+ encl_page = sgx_encl_page_alloc(encl, addr - encl->base, secinfo_flags);
+ if (IS_ERR(encl_page))
+ return VM_FAULT_OOM;
+
+ mutex_lock(&encl->lock);
+
+ epc_page = sgx_encl_load_secs(encl);
+ if (IS_ERR(epc_page)) {
+ if (PTR_ERR(epc_page) == -EBUSY)
+ vmret = VM_FAULT_NOPAGE;
+ goto err_out_unlock;
+ }
+
+ epc_page = sgx_alloc_epc_page(encl_page, false);
+ if (IS_ERR(epc_page)) {
+ if (PTR_ERR(epc_page) == -EBUSY)
+ vmret = VM_FAULT_NOPAGE;
+ goto err_out_unlock;
+ }
+
+ va_page = sgx_encl_grow(encl, false);
+ if (IS_ERR(va_page)) {
+ if (PTR_ERR(va_page) == -EBUSY)
+ vmret = VM_FAULT_NOPAGE;
+ goto err_out_epc;
+ }
+
+ if (va_page)
+ list_add(&va_page->list, &encl->va_pages);
+
+ ret = xa_insert(&encl->page_array, PFN_DOWN(encl_page->desc),
+ encl_page, GFP_KERNEL);
+ /*
+ * If ret == -EBUSY then the page was created in another flow while
+ * running without encl->lock.
+ */
+ if (ret)
+ goto err_out_shrink;
+
+ pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page);
+ pginfo.addr = encl_page->desc & PAGE_MASK;
+ pginfo.metadata = 0;
+
+ ret = __eaug(&pginfo, sgx_get_epc_virt_addr(epc_page));
+ if (ret)
+ goto err_out;
+
+ encl_page->encl = encl;
+ encl_page->epc_page = epc_page;
+ encl_page->type = SGX_PAGE_TYPE_REG;
+ encl->secs_child_cnt++;
+
+ sgx_mark_page_reclaimable(encl_page->epc_page);
+
+ phys_addr = sgx_get_epc_phys_addr(epc_page);
+ /*
+ * Do not undo everything when creating the PTE entry fails - the next
+ * #PF would find the page ready for a PTE.
+ */
+ vmret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr));
+ if (vmret != VM_FAULT_NOPAGE) {
+ mutex_unlock(&encl->lock);
+ return VM_FAULT_SIGBUS;
+ }
+ mutex_unlock(&encl->lock);
+ return VM_FAULT_NOPAGE;
+
+err_out:
+ xa_erase(&encl->page_array, PFN_DOWN(encl_page->desc));
+
+err_out_shrink:
+ sgx_encl_shrink(encl, va_page);
+err_out_epc:
+ sgx_encl_free_epc_page(epc_page);
+err_out_unlock:
+ mutex_unlock(&encl->lock);
+ kfree(encl_page);
+
+ return vmret;
+}
+
+static vm_fault_t sgx_vma_fault(struct vm_fault *vmf)
+{
+ unsigned long addr = (unsigned long)vmf->address;
+ struct vm_area_struct *vma = vmf->vma;
+ struct sgx_encl_page *entry;
+ unsigned long phys_addr;
+ struct sgx_encl *encl;
+ vm_fault_t ret;
+
+ encl = vma->vm_private_data;
+
+ /*
+ * It's very unlikely but possible that allocating memory for the
+ * mm_list entry of a forked process failed in sgx_vma_open(). When
+ * this happens, vm_private_data is set to NULL.
+ */
+ if (unlikely(!encl))
+ return VM_FAULT_SIGBUS;
+
+ /*
+ * The page_array keeps track of all enclave pages, whether they
+ * are swapped out or not. If there is no entry for this page and
+ * the system supports SGX2, then it is possible to dynamically add
+ * a new enclave page. This is only possible for an initialized
+ * enclave; that is checked for right away.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_SGX2) &&
+ (!xa_load(&encl->page_array, PFN_DOWN(addr))))
+ return sgx_encl_eaug_page(vma, encl, addr);
+
+ mutex_lock(&encl->lock);
+
+ entry = sgx_encl_load_page_in_vma(encl, addr, vma->vm_flags);
+ if (IS_ERR(entry)) {
+ mutex_unlock(&encl->lock);
+
+ if (PTR_ERR(entry) == -EBUSY)
+ return VM_FAULT_NOPAGE;
+
+ return VM_FAULT_SIGBUS;
+ }
+
+ phys_addr = sgx_get_epc_phys_addr(entry->epc_page);
+
+ ret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr));
+ if (ret != VM_FAULT_NOPAGE) {
+ mutex_unlock(&encl->lock);
+
+ return VM_FAULT_SIGBUS;
+ }
+
+ sgx_encl_test_and_clear_young(vma->vm_mm, entry);
+ mutex_unlock(&encl->lock);
+
+ return VM_FAULT_NOPAGE;
+}
+
+static void sgx_vma_open(struct vm_area_struct *vma)
+{
+ struct sgx_encl *encl = vma->vm_private_data;
+
+ /*
+ * It's possible but unlikely that vm_private_data is NULL. This can
+ * happen in a grandchild of a process, when sgx_encl_mm_add()
+ * failed to allocate memory in this callback.
+ */
+ if (unlikely(!encl))
+ return;
+
+ if (sgx_encl_mm_add(encl, vma->vm_mm))
+ vma->vm_private_data = NULL;
+}
+
+/**
+ * sgx_encl_may_map() - Check if a requested VMA mapping is allowed
+ * @encl: an enclave pointer
+ * @start: lower bound of the address range, inclusive
+ * @end: upper bound of the address range, exclusive
+ * @vm_flags: VMA flags
+ *
+ * Iterate through the enclave pages contained within [@start, @end) to verify
+ * that the permissions requested by a subset of {VM_READ, VM_WRITE, VM_EXEC}
+ * do not contain any permissions that are not contained in the build time
+ * permissions of any of the enclave pages within the given address range.
+ *
+ * An enclave creator must declare the strongest permissions that will be
+ * needed for each enclave page. This ensures that mappings have permissions
+ * identical to or weaker than the earlier declared permissions.
+ *
+ * Return: 0 on success, -EACCES otherwise
+ */
+int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
+ unsigned long end, vm_flags_t vm_flags)
+{
+ vm_flags_t vm_prot_bits = vm_flags & VM_ACCESS_FLAGS;
+ struct sgx_encl_page *page;
+ unsigned long count = 0;
+ int ret = 0;
+
+ XA_STATE(xas, &encl->page_array, PFN_DOWN(start));
+
+ /* Disallow mapping outside enclave's address range. */
+ if (test_bit(SGX_ENCL_INITIALIZED, &encl->flags) &&
+ (start < encl->base || end > encl->base + encl->size))
+ return -EACCES;
+
+ /*
+ * Disallow READ_IMPLIES_EXEC tasks as their VMA permissions might
+ * conflict with the enclave page permissions.
+ */
+ if (current->personality & READ_IMPLIES_EXEC)
+ return -EACCES;
+
+ mutex_lock(&encl->lock);
+ xas_lock(&xas);
+ xas_for_each(&xas, page, PFN_DOWN(end - 1)) {
+ if (~page->vm_max_prot_bits & vm_prot_bits) {
+ ret = -EACCES;
+ break;
+ }
+
+ /* Reschedule on every XA_CHECK_SCHED iteration. */
+ if (!(++count % XA_CHECK_SCHED)) {
+ xas_pause(&xas);
+ xas_unlock(&xas);
+ mutex_unlock(&encl->lock);
+
+ cond_resched();
+
+ mutex_lock(&encl->lock);
+ xas_lock(&xas);
+ }
+ }
+ xas_unlock(&xas);
+ mutex_unlock(&encl->lock);
+
+ return ret;
+}
+
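+/* mprotect() callback: verify the new permissions against the enclave page permissions. */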
+static int sgx_vma_mprotect(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, unsigned long newflags)
+{
+ return sgx_encl_may_map(vma->vm_private_data, start, end, newflags);
+}
+
+static int sgx_encl_debug_read(struct sgx_encl *encl, struct sgx_encl_page *page,
+ unsigned long addr, void *data)
+{
+ unsigned long offset = addr & ~PAGE_MASK;
+ int ret;
+
+ ret = __edbgrd(sgx_get_epc_virt_addr(page->epc_page) + offset, data);
+ if (ret)
+ return -EIO;
+
+ return 0;
+}
+
+static int sgx_encl_debug_write(struct sgx_encl *encl, struct sgx_encl_page *page,
+ unsigned long addr, void *data)
+{
+ unsigned long offset = addr & ~PAGE_MASK;
+ int ret;
+
+ ret = __edbgwr(sgx_get_epc_virt_addr(page->epc_page) + offset, data);
+ if (ret)
+ return -EIO;
+
+ return 0;
+}
+
+/*
+ * Load an enclave page to EPC if required, and take encl->lock.
+ */
+static struct sgx_encl_page *sgx_encl_reserve_page(struct sgx_encl *encl,
+ unsigned long addr,
+ vm_flags_t vm_flags)
+{
+ struct sgx_encl_page *entry;
+
+ for ( ; ; ) {
+ mutex_lock(&encl->lock);
+
+ entry = sgx_encl_load_page_in_vma(encl, addr, vm_flags);
+ if (PTR_ERR(entry) != -EBUSY)
+ break;
+
+ mutex_unlock(&encl->lock);
+ }
+
+ if (IS_ERR(entry))
+ mutex_unlock(&encl->lock);
+
+ return entry;
+}
+
+static int sgx_vma_access(struct vm_area_struct *vma, unsigned long addr,
+ void *buf, int len, int write)
+{
+ struct sgx_encl *encl = vma->vm_private_data;
+ struct sgx_encl_page *entry = NULL;
+ char data[sizeof(unsigned long)];
+ unsigned long align;
+ int offset;
+ int cnt;
+ int ret = 0;
+ int i;
+
+ /*
+ * If process was forked, VMA is still there but vm_private_data is set
+ * to NULL.
+ */
+ if (!encl)
+ return -EFAULT;
+
+ if (!test_bit(SGX_ENCL_DEBUG, &encl->flags))
+ return -EFAULT;
+
+ for (i = 0; i < len; i += cnt) {
+ entry = sgx_encl_reserve_page(encl, (addr + i) & PAGE_MASK,
+ vma->vm_flags);
+ if (IS_ERR(entry)) {
+ ret = PTR_ERR(entry);
+ break;
+ }
+
+ align = ALIGN_DOWN(addr + i, sizeof(unsigned long));
+ offset = (addr + i) & (sizeof(unsigned long) - 1);
+ cnt = sizeof(unsigned long) - offset;
+ cnt = min(cnt, len - i);
+
+ ret = sgx_encl_debug_read(encl, entry, align, data);
+ if (ret)
+ goto out;
+
+ if (write) {
+ memcpy(data + offset, buf + i, cnt);
+ ret = sgx_encl_debug_write(encl, entry, align, data);
+ if (ret)
+ goto out;
+ } else {
+ memcpy(buf + i, data + offset, cnt);
+ }
+
+out:
+ mutex_unlock(&encl->lock);
+
+ if (ret)
+ break;
+ }
+
+ return ret < 0 ? ret : i;
+}
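The .access hook is what lets a debugger peek and poke a debug (SGX_ATTR_DEBUG) enclave: access_process_vm() lands in sgx_vma_access() above. A hedged userspace sketch, not part of the patch, assuming 'pid' is a stopped tracee whose enclave mapping contains 'addr':

#include <errno.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>

/* Illustrative only: read one word from a debug enclave of a tracee. */
static long example_peek_enclave_word(pid_t pid, void *addr)
{
	long word;

	if (ptrace(PTRACE_ATTACH, pid, NULL, NULL))
		return -1;
	waitpid(pid, NULL, 0);

	errno = 0;
	/* Routed through vm_ops->access, i.e. sgx_vma_access(). */
	word = ptrace(PTRACE_PEEKDATA, pid, addr, NULL);
	if (word == -1 && errno)
		perror("PTRACE_PEEKDATA");

	ptrace(PTRACE_DETACH, pid, NULL, NULL);
	return word;
}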
+
+const struct vm_operations_struct sgx_vm_ops = {
+ .fault = sgx_vma_fault,
+ .mprotect = sgx_vma_mprotect,
+ .open = sgx_vma_open,
+ .access = sgx_vma_access,
+};
+
+/**
+ * sgx_encl_release - Destroy an enclave instance
+ * @ref: address of a kref inside &sgx_encl
+ *
+ * Used together with kref_put(). Frees all the resources associated with the
+ * enclave and the instance itself.
+ */
+void sgx_encl_release(struct kref *ref)
+{
+ struct sgx_encl *encl = container_of(ref, struct sgx_encl, refcount);
+ unsigned long max_page_index = PFN_DOWN(encl->base + encl->size - 1);
+ struct sgx_va_page *va_page;
+ struct sgx_encl_page *entry;
+ unsigned long count = 0;
+
+ XA_STATE(xas, &encl->page_array, PFN_DOWN(encl->base));
+
+ xas_lock(&xas);
+ xas_for_each(&xas, entry, max_page_index) {
+ if (entry->epc_page) {
+ /*
+ * The page and its radix tree entry cannot be freed
+ * if the page is being held by the reclaimer.
+ */
+ if (sgx_unmark_page_reclaimable(entry->epc_page))
+ continue;
+
+ sgx_encl_free_epc_page(entry->epc_page);
+ encl->secs_child_cnt--;
+ entry->epc_page = NULL;
+ }
+
+ kfree(entry);
+ /*
+ * Invoke scheduler on every XA_CHECK_SCHED iteration
+ * to prevent soft lockups.
+ */
+ if (!(++count % XA_CHECK_SCHED)) {
+ xas_pause(&xas);
+ xas_unlock(&xas);
+
+ cond_resched();
+
+ xas_lock(&xas);
+ }
+ }
+ xas_unlock(&xas);
+
+ xa_destroy(&encl->page_array);
+
+ if (!encl->secs_child_cnt && encl->secs.epc_page) {
+ sgx_encl_free_epc_page(encl->secs.epc_page);
+ encl->secs.epc_page = NULL;
+ }
+
+ while (!list_empty(&encl->va_pages)) {
+ va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
+ list);
+ list_del(&va_page->list);
+ sgx_encl_free_epc_page(va_page->epc_page);
+ kfree(va_page);
+ }
+
+ if (encl->backing)
+ fput(encl->backing);
+
+ cleanup_srcu_struct(&encl->srcu);
+
+ WARN_ON_ONCE(!list_empty(&encl->mm_list));
+
+ /* Detect EPC page leaks. */
+ WARN_ON_ONCE(encl->secs_child_cnt);
+ WARN_ON_ONCE(encl->secs.epc_page);
+
+ kfree(encl);
+ sgx_dec_usage_count();
+}
+
+/*
+ * 'mm' is exiting and no longer needs mmu notifications.
+ */
+static void sgx_mmu_notifier_release(struct mmu_notifier *mn,
+ struct mm_struct *mm)
+{
+ struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier);
+ struct sgx_encl_mm *tmp = NULL;
+ bool found = false;
+
+ /*
+ * The enclave itself can remove encl_mm. Note, objects can't be moved
+ * off an RCU protected list, but deletion is ok.
+ */
+ spin_lock(&encl_mm->encl->mm_lock);
+ list_for_each_entry(tmp, &encl_mm->encl->mm_list, list) {
+ if (tmp == encl_mm) {
+ list_del_rcu(&encl_mm->list);
+ found = true;
+ break;
+ }
+ }
+ spin_unlock(&encl_mm->encl->mm_lock);
+
+ if (found) {
+ synchronize_srcu(&encl_mm->encl->srcu);
+ mmu_notifier_put(mn);
+ }
+}
+
+static void sgx_mmu_notifier_free(struct mmu_notifier *mn)
+{
+ struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier);
+
+ /* 'encl_mm' is going away, put encl_mm->encl reference: */
+ kref_put(&encl_mm->encl->refcount, sgx_encl_release);
+
+ kfree(encl_mm);
+}
+
+static const struct mmu_notifier_ops sgx_mmu_notifier_ops = {
+ .release = sgx_mmu_notifier_release,
+ .free_notifier = sgx_mmu_notifier_free,
+};
+
+static struct sgx_encl_mm *sgx_encl_find_mm(struct sgx_encl *encl,
+ struct mm_struct *mm)
+{
+ struct sgx_encl_mm *encl_mm = NULL;
+ struct sgx_encl_mm *tmp;
+ int idx;
+
+ idx = srcu_read_lock(&encl->srcu);
+
+ list_for_each_entry_rcu(tmp, &encl->mm_list, list) {
+ if (tmp->mm == mm) {
+ encl_mm = tmp;
+ break;
+ }
+ }
+
+ srcu_read_unlock(&encl->srcu, idx);
+
+ return encl_mm;
+}
+
+int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm)
+{
+ struct sgx_encl_mm *encl_mm;
+ int ret;
+
+ /*
+ * Even though a single enclave may be mapped into an mm more than once,
+ * each 'mm' only appears once on encl->mm_list. This is guaranteed by
+ * holding the mm's mmap lock for write before an mm can be added to or
+ * removed from encl->mm_list.
+ */
+ mmap_assert_write_locked(mm);
+
+ /*
+ * It's possible that an entry already exists in the mm_list, because it
+ * is removed only on VFS release or process exit.
+ */
+ if (sgx_encl_find_mm(encl, mm))
+ return 0;
+
+ encl_mm = kzalloc(sizeof(*encl_mm), GFP_KERNEL);
+ if (!encl_mm)
+ return -ENOMEM;
+
+ /* Grab a refcount for the encl_mm->encl reference: */
+ kref_get(&encl->refcount);
+ encl_mm->encl = encl;
+ encl_mm->mm = mm;
+ encl_mm->mmu_notifier.ops = &sgx_mmu_notifier_ops;
+
+ ret = __mmu_notifier_register(&encl_mm->mmu_notifier, mm);
+ if (ret) {
+ kfree(encl_mm);
+ return ret;
+ }
+
+ spin_lock(&encl->mm_lock);
+ list_add_rcu(&encl_mm->list, &encl->mm_list);
+ /* Pairs with smp_rmb() in sgx_zap_enclave_ptes(). */
+ smp_wmb();
+ encl->mm_list_version++;
+ spin_unlock(&encl->mm_lock);
+
+ return 0;
+}
+
+/**
+ * sgx_encl_cpumask() - Query which CPUs might be accessing the enclave
+ * @encl: the enclave
+ *
+ * Some SGX functions require that no cached linear-to-physical address
+ * mappings are present before they can succeed. For example, ENCLS[EWB]
+ * copies a page from the enclave page cache to regular main memory but
+ * it fails if it cannot ensure that there are no cached
+ * linear-to-physical address mappings referring to the page.
+ *
+ * SGX hardware flushes all cached linear-to-physical mappings on a CPU
+ * when an enclave is exited via ENCLU[EEXIT] or an Asynchronous Enclave
+ * Exit (AEX). Exiting an enclave will thus ensure cached linear-to-physical
+ * address mappings are cleared but coordination with the tracking done within
+ * the SGX hardware is needed to support the SGX functions that depend on this
+ * cache clearing.
+ *
+ * When the ENCLS[ETRACK] function is issued on an enclave the hardware
+ * tracks threads operating inside the enclave at that time. The SGX
+ * hardware tracking requires that all the identified threads exit the
+ * enclave, thereby flushing their mappings, before a function such
+ * as ENCLS[EWB] is permitted.
+ *
+ * The following flow is used to support SGX functions that require that
+ * no cached linear-to-physical address mappings are present:
+ * 1) Execute ENCLS[ETRACK] to initiate hardware tracking.
+ * 2) Use this function (sgx_encl_cpumask()) to query which CPUs might be
+ * accessing the enclave.
+ * 3) Send IPI to identified CPUs, kicking them out of the enclave and
+ * thus flushing all locally cached linear-to-physical address mappings.
+ * 4) Execute SGX function.
+ *
+ * Context: It is required to call this function after ENCLS[ETRACK].
+ * This will ensure that if any new mm appears (racing with
+ * sgx_encl_mm_add()) then the new mm will enter into the
+ * enclave with fresh linear-to-physical address mappings.
+ *
+ * It is required that all IPIs are completed before a new
+ * ENCLS[ETRACK] is issued so be sure to protect steps 1 to 3
+ * of the above flow with the enclave's mutex.
+ *
+ * Return: cpumask of CPUs that might be accessing @encl
+ */
+const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl)
+{
+ cpumask_t *cpumask = &encl->cpumask;
+ struct sgx_encl_mm *encl_mm;
+ int idx;
+
+ cpumask_clear(cpumask);
+
+ idx = srcu_read_lock(&encl->srcu);
+
+ list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
+ if (!mmget_not_zero(encl_mm->mm))
+ continue;
+
+ cpumask_or(cpumask, cpumask, mm_cpumask(encl_mm->mm));
+
+ mmput_async(encl_mm->mm);
+ }
+
+ srcu_read_unlock(&encl->srcu, idx);
+
+ return cpumask;
+}
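A condensed kernel-side sketch of steps 1-3 from the comment above, essentially what sgx_enclave_etrack() in ioctl.c (added later in this patch) performs before an EWB/EMODPR/EMODT style operation. This is illustrative only, not part of the patch; the enclave's mutex is assumed to be held across the whole sequence, and sgx_ipi_cb() is the subsystem's empty IPI callback.

/* Illustrative only: the ETRACK + IPI dance guarded by encl->lock. */
static int example_flush_enclave_tlbs(struct sgx_encl *encl)
{
	int ret;

	/* 1) Start hardware tracking of threads inside the enclave. */
	ret = __etrack(sgx_get_epc_virt_addr(encl->secs.epc_page));
	if (ret)
		return -EFAULT;

	/* 2) + 3) Kick every CPU that might be running enclave code so it
	 * exits via AEX and drops its cached linear-to-physical mappings. */
	on_each_cpu_mask(sgx_encl_cpumask(encl), sgx_ipi_cb, NULL, 1);

	/* 4) The caller may now issue e.g. ENCLS[EWB] for this enclave. */
	return 0;
}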
+
+static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl,
+ pgoff_t index)
+{
+ struct address_space *mapping = encl->backing->f_mapping;
+ gfp_t gfpmask = mapping_gfp_mask(mapping);
+
+ return shmem_read_mapping_page_gfp(mapping, index, gfpmask);
+}
+
+/**
+ * __sgx_encl_get_backing() - Pin the backing storage
+ * @encl: an enclave pointer
+ * @page_index: enclave page index
+ * @backing: data for accessing backing storage for the page
+ *
+ * Pin the backing storage pages for storing the encrypted contents and Paging
+ * Crypto MetaData (PCMD) of an enclave page.
+ *
+ * Return:
+ * 0 on success,
+ * -errno otherwise.
+ */
+static int __sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
+ struct sgx_backing *backing)
+{
+ pgoff_t page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);
+ struct page *contents;
+ struct page *pcmd;
+
+ contents = sgx_encl_get_backing_page(encl, page_index);
+ if (IS_ERR(contents))
+ return PTR_ERR(contents);
+
+ pcmd = sgx_encl_get_backing_page(encl, PFN_DOWN(page_pcmd_off));
+ if (IS_ERR(pcmd)) {
+ put_page(contents);
+ return PTR_ERR(pcmd);
+ }
+
+ backing->contents = contents;
+ backing->pcmd = pcmd;
+ backing->pcmd_offset = page_pcmd_off & (PAGE_SIZE - 1);
+
+ return 0;
+}
+
+/*
+ * When called from ksgxd, returns the mem_cgroup of a struct mm stored
+ * in the enclave's mm_list. When not called from ksgxd, just returns
+ * the mem_cgroup of the current task.
+ */
+static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl)
+{
+ struct mem_cgroup *memcg = NULL;
+ struct sgx_encl_mm *encl_mm;
+ int idx;
+
+ /*
+ * If called from normal task context, return the mem_cgroup
+ * of the current task's mm. The remainder of the handling is for
+ * ksgxd.
+ */
+ if (!current_is_ksgxd())
+ return get_mem_cgroup_from_mm(current->mm);
+
+ /*
+ * Search the enclave's mm_list to find an mm associated with
+ * this enclave to charge the allocation to.
+ */
+ idx = srcu_read_lock(&encl->srcu);
+
+ list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
+ if (!mmget_not_zero(encl_mm->mm))
+ continue;
+
+ memcg = get_mem_cgroup_from_mm(encl_mm->mm);
+
+ mmput_async(encl_mm->mm);
+
+ break;
+ }
+
+ srcu_read_unlock(&encl->srcu, idx);
+
+ /*
+ * In the rare case that there isn't an mm associated with
+ * the enclave, set memcg to the current active mem_cgroup.
+ * This will be the root mem_cgroup if there is no active
+ * mem_cgroup.
+ */
+ if (!memcg)
+ return get_mem_cgroup_from_mm(NULL);
+
+ return memcg;
+}
+
+/**
+ * sgx_encl_alloc_backing() - create a new backing storage page
+ * @encl: an enclave pointer
+ * @page_index: enclave page index
+ * @backing: data for accessing backing storage for the page
+ *
+ * When called from ksgxd, sets the active memcg from one of the
+ * mms in the enclave's mm_list prior to any backing page allocation,
+ * in order to ensure that shmem page allocations are charged to the
+ * enclave. Create a backing page for loading data back into an EPC page with
+ * ELDU. This function takes a reference on a new backing page which
+ * must be dropped with a corresponding call to sgx_encl_put_backing().
+ *
+ * Return:
+ * 0 on success,
+ * -errno otherwise.
+ */
+int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index,
+ struct sgx_backing *backing)
+{
+ struct mem_cgroup *encl_memcg = sgx_encl_get_mem_cgroup(encl);
+ struct mem_cgroup *memcg = set_active_memcg(encl_memcg);
+ int ret;
+
+ ret = __sgx_encl_get_backing(encl, page_index, backing);
+
+ set_active_memcg(memcg);
+ mem_cgroup_put(encl_memcg);
+
+ return ret;
+}
+
+/**
+ * sgx_encl_lookup_backing() - retrieve an existing backing storage page
+ * @encl: an enclave pointer
+ * @page_index: enclave page index
+ * @backing: data for accessing backing storage for the page
+ *
+ * Retrieve a backing page for loading data back into an EPC page with ELDU.
+ * It is the caller's responsibility to ensure that it is appropriate to use
+ * sgx_encl_lookup_backing() rather than sgx_encl_alloc_backing(). If lookup is
+ * not used correctly, this will cause an allocation which is not accounted for.
+ * This function takes a reference on an existing backing page which must be
+ * dropped with a corresponding call to sgx_encl_put_backing().
+ *
+ * Return:
+ * 0 on success,
+ * -errno otherwise.
+ */
+static int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index,
+ struct sgx_backing *backing)
+{
+ return __sgx_encl_get_backing(encl, page_index, backing);
+}
+
+/**
+ * sgx_encl_put_backing() - Unpin the backing storage
+ * @backing: data for accessing backing storage for the page
+ */
+void sgx_encl_put_backing(struct sgx_backing *backing)
+{
+ put_page(backing->pcmd);
+ put_page(backing->contents);
+}
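A sketch of the expected pairing, not part of the patch: a reclaimer-style caller pins the backing store of one enclave page with sgx_encl_alloc_backing(), works on the contents, and always drops the pin with sgx_encl_put_backing(). The function name is illustrative.

/* Illustrative only: pin, use and unpin the backing store of one page. */
static int example_touch_backing(struct sgx_encl *encl, unsigned long page_index)
{
	struct sgx_backing backing;
	void *contents;
	int ret;

	ret = sgx_encl_alloc_backing(encl, page_index, &backing);
	if (ret)
		return ret;

	contents = kmap_local_page(backing.contents);
	/* ... encrypt/decrypt the page, consult backing.pcmd + pcmd_offset ... */
	kunmap_local(contents);

	sgx_encl_put_backing(&backing);
	return 0;
}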
+
+static int sgx_encl_test_and_clear_young_cb(pte_t *ptep, unsigned long addr,
+ void *data)
+{
+ pte_t pte;
+ int ret;
+
+ ret = pte_young(*ptep);
+ if (ret) {
+ pte = pte_mkold(*ptep);
+ set_pte_at((struct mm_struct *)data, addr, ptep, pte);
+ }
+
+ return ret;
+}
+
+/**
+ * sgx_encl_test_and_clear_young() - Test and reset the accessed bit
+ * @mm: mm_struct that is checked
+ * @page: enclave page to be tested for recent access
+ *
+ * Checks the Access (A) bit from the PTE corresponding to the enclave page and
+ * clears it.
+ *
+ * Return: 1 if the page has been recently accessed and 0 if not.
+ */
+int sgx_encl_test_and_clear_young(struct mm_struct *mm,
+ struct sgx_encl_page *page)
+{
+ unsigned long addr = page->desc & PAGE_MASK;
+ struct sgx_encl *encl = page->encl;
+ struct vm_area_struct *vma;
+ int ret;
+
+ ret = sgx_encl_find(mm, addr, &vma);
+ if (ret)
+ return 0;
+
+ if (encl != vma->vm_private_data)
+ return 0;
+
+ ret = apply_to_page_range(vma->vm_mm, addr, PAGE_SIZE,
+ sgx_encl_test_and_clear_young_cb, vma->vm_mm);
+ if (ret < 0)
+ return 0;
+
+ return ret;
+}
+
+struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl,
+ unsigned long offset,
+ u64 secinfo_flags)
+{
+ struct sgx_encl_page *encl_page;
+ unsigned long prot;
+
+ encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL);
+ if (!encl_page)
+ return ERR_PTR(-ENOMEM);
+
+ encl_page->desc = encl->base + offset;
+ encl_page->encl = encl;
+
+ prot = _calc_vm_trans(secinfo_flags, SGX_SECINFO_R, PROT_READ) |
+ _calc_vm_trans(secinfo_flags, SGX_SECINFO_W, PROT_WRITE) |
+ _calc_vm_trans(secinfo_flags, SGX_SECINFO_X, PROT_EXEC);
+
+ /*
+ * TCS pages must always have RW set for CPU access, while the SECINFO
+ * permissions are *always* zero - the CPU ignores the user-provided
+ * values and silently overwrites them with zero permissions.
+ */
+ if ((secinfo_flags & SGX_SECINFO_PAGE_TYPE_MASK) == SGX_SECINFO_TCS)
+ prot |= PROT_READ | PROT_WRITE;
+
+ /* Calculate maximum of the VM flags for the page. */
+ encl_page->vm_max_prot_bits = calc_vm_prot_bits(prot, 0);
+
+ return encl_page;
+}
+
+/**
+ * sgx_zap_enclave_ptes() - remove PTEs mapping the address from enclave
+ * @encl: the enclave
+ * @addr: page aligned pointer to single page for which PTEs will be removed
+ *
+ * Multiple VMAs may have an enclave page mapped. Remove the PTE mapping
+ * @addr from each VMA. Ensure that page fault handler is ready to handle
+ * new mappings of @addr before calling this function.
+ */
+void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr)
+{
+ unsigned long mm_list_version;
+ struct sgx_encl_mm *encl_mm;
+ struct vm_area_struct *vma;
+ int idx, ret;
+
+ do {
+ mm_list_version = encl->mm_list_version;
+
+ /* Pairs with smp_wmb() in sgx_encl_mm_add(). */
+ smp_rmb();
+
+ idx = srcu_read_lock(&encl->srcu);
+
+ list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
+ if (!mmget_not_zero(encl_mm->mm))
+ continue;
+
+ mmap_read_lock(encl_mm->mm);
+
+ ret = sgx_encl_find(encl_mm->mm, addr, &vma);
+ if (!ret && encl == vma->vm_private_data)
+ zap_vma_ptes(vma, addr, PAGE_SIZE);
+
+ mmap_read_unlock(encl_mm->mm);
+
+ mmput_async(encl_mm->mm);
+ }
+
+ srcu_read_unlock(&encl->srcu, idx);
+ } while (unlikely(encl->mm_list_version != mm_list_version));
+}
+
+/**
+ * sgx_alloc_va_page() - Allocate a Version Array (VA) page
+ * @reclaim: Reclaim EPC pages directly if none available. Enclave
+ * mutex should not be held if this is set.
+ *
+ * Allocate a free EPC page and convert it to a Version Array (VA) page.
+ *
+ * Return:
+ * a VA page,
+ * -errno otherwise
+ */
+struct sgx_epc_page *sgx_alloc_va_page(bool reclaim)
+{
+ struct sgx_epc_page *epc_page;
+ int ret;
+
+ epc_page = sgx_alloc_epc_page(NULL, reclaim);
+ if (IS_ERR(epc_page))
+ return ERR_CAST(epc_page);
+
+ ret = __epa(sgx_get_epc_virt_addr(epc_page));
+ if (ret) {
+ WARN_ONCE(1, "EPA returned %d (0x%x)", ret, ret);
+ sgx_encl_free_epc_page(epc_page);
+ return ERR_PTR(-EFAULT);
+ }
+
+ return epc_page;
+}
+
+/**
+ * sgx_alloc_va_slot - allocate a VA slot
+ * @va_page: a &struct sgx_va_page instance
+ *
+ * Allocates a slot from a &struct sgx_va_page instance.
+ *
+ * Return: offset of the slot inside the VA page
+ */
+unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page)
+{
+ int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT);
+
+ if (slot < SGX_VA_SLOT_COUNT)
+ set_bit(slot, va_page->slots);
+
+ return slot << 3;
+}
+
+/**
+ * sgx_free_va_slot - free a VA slot
+ * @va_page: a &struct sgx_va_page instance
+ * @offset: offset of the slot inside the VA page
+ *
+ * Frees a slot from a &struct sgx_va_page instance.
+ */
+void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset)
+{
+ clear_bit(offset >> 3, va_page->slots);
+}
+
+/**
+ * sgx_va_page_full - is the VA page full?
+ * @va_page: a &struct sgx_va_page instance
+ *
+ * Return: true if all slots have been taken
+ */
+bool sgx_va_page_full(struct sgx_va_page *va_page)
+{
+ int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT);
+
+ return slot == SGX_VA_SLOT_COUNT;
+}
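A sketch of how these helpers fit together, not part of the patch: a VA page holds 512 eight-byte slots, sgx_alloc_va_slot() hands out a byte offset (slot << 3) that the reclaimer records in the page descriptor for EWB, and the same offset is released with sgx_free_va_slot() once ELDU has verified the page again. The function name is illustrative.

/* Illustrative only: VA slot bookkeeping around a reclaim/reload cycle. */
static void example_va_slot_cycle(struct sgx_va_page *va_page)
{
	unsigned int offset;

	if (sgx_va_page_full(va_page))
		return;		/* caller must pick or allocate another VA page */

	offset = sgx_alloc_va_slot(va_page);	/* 0, 8, 16, ..., 4088 */

	/* ... EWB stores the page version at
	 * sgx_get_epc_virt_addr(va_page->epc_page) + offset,
	 * ELDU later verifies against the same slot ... */

	sgx_free_va_slot(va_page, offset);
}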
+
+/**
+ * sgx_encl_free_epc_page - free an EPC page assigned to an enclave
+ * @page: EPC page to be freed
+ *
+ * Free an EPC page assigned to an enclave. It does EREMOVE for the page, and
+ * only upon success puts the page back on the free page list. Otherwise, it
+ * emits a WARNING to indicate that the page is leaked.
+ */
+void sgx_encl_free_epc_page(struct sgx_epc_page *page)
+{
+ int ret;
+
+ WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED);
+
+ ret = __eremove(sgx_get_epc_virt_addr(page));
+ if (WARN_ONCE(ret, EREMOVE_ERROR_MESSAGE, ret, ret))
+ return;
+
+ sgx_free_epc_page(page);
+}
diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h
new file mode 100644
index 000000000000..8ff47f6652b9
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/encl.h
@@ -0,0 +1,129 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/**
+ * Copyright(c) 2016-20 Intel Corporation.
+ *
+ * Contains the software defined data structures for enclaves.
+ */
+#ifndef _X86_ENCL_H
+#define _X86_ENCL_H
+
+#include <linux/cpumask.h>
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/mm_types.h>
+#include <linux/mmu_notifier.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/srcu.h>
+#include <linux/workqueue.h>
+#include <linux/xarray.h>
+#include "sgx.h"
+
+/* 'desc' bits holding the offset in the VA (version array) page. */
+#define SGX_ENCL_PAGE_VA_OFFSET_MASK GENMASK_ULL(11, 3)
+
+/* 'desc' bit marking that the page is being reclaimed. */
+#define SGX_ENCL_PAGE_BEING_RECLAIMED BIT(3)
+
+struct sgx_encl_page {
+ unsigned long desc;
+ unsigned long vm_max_prot_bits:8;
+ enum sgx_page_type type:16;
+ struct sgx_epc_page *epc_page;
+ struct sgx_encl *encl;
+ struct sgx_va_page *va_page;
+};
+
+enum sgx_encl_flags {
+ SGX_ENCL_IOCTL = BIT(0),
+ SGX_ENCL_DEBUG = BIT(1),
+ SGX_ENCL_CREATED = BIT(2),
+ SGX_ENCL_INITIALIZED = BIT(3),
+};
+
+struct sgx_encl_mm {
+ struct sgx_encl *encl;
+ struct mm_struct *mm;
+ struct list_head list;
+ struct mmu_notifier mmu_notifier;
+};
+
+struct sgx_encl {
+ unsigned long base;
+ unsigned long size;
+ unsigned long flags;
+ unsigned int page_cnt;
+ unsigned int secs_child_cnt;
+ struct mutex lock;
+ struct xarray page_array;
+ struct sgx_encl_page secs;
+ unsigned long attributes;
+ unsigned long attributes_mask;
+
+ cpumask_t cpumask;
+ struct file *backing;
+ struct kref refcount;
+ struct list_head va_pages;
+ unsigned long mm_list_version;
+ struct list_head mm_list;
+ spinlock_t mm_lock;
+ struct srcu_struct srcu;
+};
+
+#define SGX_VA_SLOT_COUNT 512
+
+struct sgx_va_page {
+ struct sgx_epc_page *epc_page;
+ DECLARE_BITMAP(slots, SGX_VA_SLOT_COUNT);
+ struct list_head list;
+};
+
+struct sgx_backing {
+ struct page *contents;
+ struct page *pcmd;
+ unsigned long pcmd_offset;
+};
+
+extern const struct vm_operations_struct sgx_vm_ops;
+
+static inline int sgx_encl_find(struct mm_struct *mm, unsigned long addr,
+ struct vm_area_struct **vma)
+{
+ struct vm_area_struct *result;
+
+ result = vma_lookup(mm, addr);
+ if (!result || result->vm_ops != &sgx_vm_ops)
+ return -EINVAL;
+
+ *vma = result;
+
+ return 0;
+}
+
+int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
+ unsigned long end, vm_flags_t vm_flags);
+
+bool current_is_ksgxd(void);
+void sgx_encl_release(struct kref *ref);
+int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm);
+const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl);
+int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index,
+ struct sgx_backing *backing);
+void sgx_encl_put_backing(struct sgx_backing *backing);
+int sgx_encl_test_and_clear_young(struct mm_struct *mm,
+ struct sgx_encl_page *page);
+struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl,
+ unsigned long offset,
+ u64 secinfo_flags);
+void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr);
+struct sgx_epc_page *sgx_alloc_va_page(bool reclaim);
+unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page);
+void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset);
+bool sgx_va_page_full(struct sgx_va_page *va_page);
+void sgx_encl_free_epc_page(struct sgx_epc_page *page);
+struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl,
+ unsigned long addr);
+struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl, bool reclaim);
+void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page);
+
+#endif /* _X86_ENCL_H */
diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h
new file mode 100644
index 000000000000..74be751199a4
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/encls.h
@@ -0,0 +1,241 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _X86_ENCLS_H
+#define _X86_ENCLS_H
+
+#include <linux/bitops.h>
+#include <linux/err.h>
+#include <linux/io.h>
+#include <linux/rwsem.h>
+#include <linux/types.h>
+#include <asm/asm.h>
+#include <asm/traps.h>
+#include "sgx.h"
+
+/* Retrieve the encoded trapnr from the specified return code. */
+#define ENCLS_TRAPNR(r) ((r) & ~SGX_ENCLS_FAULT_FLAG)
+
+/* Issue a WARN() about an ENCLS function. */
+#define ENCLS_WARN(r, name) { \
+ do { \
+ int _r = (r); \
+ WARN_ONCE(_r, "%s returned %d (0x%x)\n", (name), _r, _r); \
+ } while (0); \
+}
+
+/*
+ * encls_faulted() - Check if an ENCLS leaf faulted given an error code
+ * @ret: the return value of an ENCLS leaf function call
+ *
+ * Return:
+ * - true: ENCLS leaf faulted.
+ * - false: Otherwise.
+ */
+static inline bool encls_faulted(int ret)
+{
+ return ret & SGX_ENCLS_FAULT_FLAG;
+}
+
+/**
+ * encls_failed() - Check if an ENCLS function failed
+ * @ret: the return value of an ENCLS function call
+ *
+ * Check if an ENCLS function failed. This happens when the function causes a
+ * fault that is not caused by an EPCM conflict or when the function returns a
+ * non-zero value.
+ */
+static inline bool encls_failed(int ret)
+{
+ if (encls_faulted(ret))
+ return ENCLS_TRAPNR(ret) != X86_TRAP_PF;
+
+ return !!ret;
+}
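A small sketch, not part of the patch, of how a caller is expected to decode the values produced by the wrappers below, assuming 'ret' came from an __encls_ret_N()-based leaf such as EWB or EINIT:

/* Illustrative only: fault vs. SGX error code vs. success. */
static int example_decode_encls(int ret)
{
	if (encls_faulted(ret)) {
		/* ENCLS_TRAPNR() recovers the exception vector. */
		return ENCLS_TRAPNR(ret) == X86_TRAP_PF ? -EFAULT : -EIO;
	}

	/* Non-zero, non-fault values are SGX error codes, e.g. SGX_UNMASKED_EVENT. */
	return ret ? -EIO : 0;
}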
+
+/**
+ * __encls_ret_N - encode an ENCLS function that returns an error code in EAX
+ * @rax: function number
+ * @inputs: asm inputs for the function
+ *
+ * Emit assembly for an ENCLS function that returns an error code, e.g. EREMOVE.
+ * And because SGX isn't complex enough as it is, functions that return an error
+ * code also modify flags.
+ *
+ * Return:
+ * 0 on success,
+ * SGX error code on failure
+ */
+#define __encls_ret_N(rax, inputs...) \
+ ({ \
+ int ret; \
+ asm volatile( \
+ "1: encls\n" \
+ "2:\n" \
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_SGX) \
+ : "=a"(ret) \
+ : "a"(rax), inputs \
+ : "memory", "cc"); \
+ ret; \
+ })
+
+#define __encls_ret_1(rax, rcx) \
+ ({ \
+ __encls_ret_N(rax, "c"(rcx)); \
+ })
+
+#define __encls_ret_2(rax, rbx, rcx) \
+ ({ \
+ __encls_ret_N(rax, "b"(rbx), "c"(rcx)); \
+ })
+
+#define __encls_ret_3(rax, rbx, rcx, rdx) \
+ ({ \
+ __encls_ret_N(rax, "b"(rbx), "c"(rcx), "d"(rdx)); \
+ })
+
+/**
+ * __encls_N - encode an ENCLS function that doesn't return an error code
+ * @rax: function number
+ * @rbx_out: optional output variable
+ * @inputs: asm inputs for the function
+ *
+ * Emit assembly for an ENCLS function that does not return an error code, e.g.
+ * ECREATE. Leaves without error codes either succeed or fault. @rbx_out is an
+ * optional parameter for use by EDGBRD, which returns the requested value in
+ * RBX.
+ *
+ * Return:
+ * 0 on success,
+ * trapnr with SGX_ENCLS_FAULT_FLAG set on fault
+ */
+#define __encls_N(rax, rbx_out, inputs...) \
+ ({ \
+ int ret; \
+ asm volatile( \
+ "1: encls\n\t" \
+ "xor %%eax,%%eax\n" \
+ "2:\n" \
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_SGX) \
+ : "=a"(ret), "=b"(rbx_out) \
+ : "a"(rax), inputs \
+ : "memory"); \
+ ret; \
+ })
+
+#define __encls_2(rax, rbx, rcx) \
+ ({ \
+ unsigned long ign_rbx_out; \
+ __encls_N(rax, ign_rbx_out, "b"(rbx), "c"(rcx)); \
+ })
+
+#define __encls_1_1(rax, data, rcx) \
+ ({ \
+ unsigned long rbx_out; \
+ int ret = __encls_N(rax, rbx_out, "c"(rcx)); \
+ if (!ret) \
+ data = rbx_out; \
+ ret; \
+ })
+
+/* Initialize an EPC page into an SGX Enclave Control Structure (SECS) page. */
+static inline int __ecreate(struct sgx_pageinfo *pginfo, void *secs)
+{
+ return __encls_2(ECREATE, pginfo, secs);
+}
+
+/* Hash a 256 byte region of an enclave page to SECS:MRENCLAVE. */
+static inline int __eextend(void *secs, void *addr)
+{
+ return __encls_2(EEXTEND, secs, addr);
+}
+
+/*
+ * Associate an EPC page to an enclave either as a REG or TCS page
+ * populated with the provided data.
+ */
+static inline int __eadd(struct sgx_pageinfo *pginfo, void *addr)
+{
+ return __encls_2(EADD, pginfo, addr);
+}
+
+/* Finalize enclave build, initialize enclave for user code execution. */
+static inline int __einit(void *sigstruct, void *token, void *secs)
+{
+ return __encls_ret_3(EINIT, sigstruct, secs, token);
+}
+
+/* Disassociate EPC page from its enclave and mark it as unused. */
+static inline int __eremove(void *addr)
+{
+ return __encls_ret_1(EREMOVE, addr);
+}
+
+/* Copy data to an EPC page belonging to a debug enclave. */
+static inline int __edbgwr(void *addr, unsigned long *data)
+{
+ return __encls_2(EDGBWR, *data, addr);
+}
+
+/* Copy data from an EPC page belonging to a debug enclave. */
+static inline int __edbgrd(void *addr, unsigned long *data)
+{
+ return __encls_1_1(EDGBRD, *data, addr);
+}
+
+/* Track that software has completed the required TLB address clears. */
+static inline int __etrack(void *addr)
+{
+ return __encls_ret_1(ETRACK, addr);
+}
+
+/* Load, verify, and unblock an EPC page. */
+static inline int __eldu(struct sgx_pageinfo *pginfo, void *addr,
+ void *va)
+{
+ return __encls_ret_3(ELDU, pginfo, addr, va);
+}
+
+/* Make EPC page inaccessible to enclave, ready to be written to memory. */
+static inline int __eblock(void *addr)
+{
+ return __encls_ret_1(EBLOCK, addr);
+}
+
+/* Initialize an EPC page into a Version Array (VA) page. */
+static inline int __epa(void *addr)
+{
+ unsigned long rbx = SGX_PAGE_TYPE_VA;
+
+ return __encls_2(EPA, rbx, addr);
+}
+
+/* Invalidate an EPC page and write it out to main memory. */
+static inline int __ewb(struct sgx_pageinfo *pginfo, void *addr,
+ void *va)
+{
+ return __encls_ret_3(EWB, pginfo, addr, va);
+}
+
+/* Restrict the EPCM permissions of an EPC page. */
+static inline int __emodpr(struct sgx_secinfo *secinfo, void *addr)
+{
+ return __encls_ret_2(EMODPR, secinfo, addr);
+}
+
+/* Change the type of an EPC page. */
+static inline int __emodt(struct sgx_secinfo *secinfo, void *addr)
+{
+ return __encls_ret_2(EMODT, secinfo, addr);
+}
+
+/* Zero a page of EPC memory and add it to an initialized enclave. */
+static inline int __eaug(struct sgx_pageinfo *pginfo, void *addr)
+{
+ return __encls_2(EAUG, pginfo, addr);
+}
+
+/* Attempt to update CPUSVN at runtime. */
+static inline int __eupdatesvn(void)
+{
+ return __encls_ret_1(EUPDATESVN, "");
+}
+#endif /* _X86_ENCLS_H */
diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
new file mode 100644
index 000000000000..66f1efa16fbb
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/ioctl.c
@@ -0,0 +1,1244 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2016-20 Intel Corporation. */
+
+#include <asm/mman.h>
+#include <asm/sgx.h>
+#include <crypto/sha2.h>
+#include <linux/mman.h>
+#include <linux/delay.h>
+#include <linux/file.h>
+#include <linux/hashtable.h>
+#include <linux/highmem.h>
+#include <linux/ratelimit.h>
+#include <linux/sched/signal.h>
+#include <linux/shmem_fs.h>
+#include <linux/slab.h>
+#include <linux/suspend.h>
+#include "driver.h"
+#include "encl.h"
+#include "encls.h"
+
+struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl, bool reclaim)
+{
+ struct sgx_va_page *va_page = NULL;
+ void *err;
+
+ BUILD_BUG_ON(SGX_VA_SLOT_COUNT !=
+ (SGX_ENCL_PAGE_VA_OFFSET_MASK >> 3) + 1);
+
+ if (!(encl->page_cnt % SGX_VA_SLOT_COUNT)) {
+ va_page = kzalloc(sizeof(*va_page), GFP_KERNEL);
+ if (!va_page)
+ return ERR_PTR(-ENOMEM);
+
+ va_page->epc_page = sgx_alloc_va_page(reclaim);
+ if (IS_ERR(va_page->epc_page)) {
+ err = ERR_CAST(va_page->epc_page);
+ kfree(va_page);
+ return err;
+ }
+
+ WARN_ON_ONCE(encl->page_cnt % SGX_VA_SLOT_COUNT);
+ }
+ encl->page_cnt++;
+ return va_page;
+}
+
+void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page)
+{
+ encl->page_cnt--;
+
+ if (va_page) {
+ sgx_encl_free_epc_page(va_page->epc_page);
+ list_del(&va_page->list);
+ kfree(va_page);
+ }
+}
+
+static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs)
+{
+ struct sgx_epc_page *secs_epc;
+ struct sgx_va_page *va_page;
+ struct sgx_pageinfo pginfo;
+ struct sgx_secinfo secinfo;
+ unsigned long encl_size;
+ struct file *backing;
+ long ret;
+
+ /*
+ * ECREATE would detect this too, but checking here also ensures
+ * that the 'encl_size' calculations below can never overflow.
+ */
+ if (!is_power_of_2(secs->size))
+ return -EINVAL;
+
+ va_page = sgx_encl_grow(encl, true);
+ if (IS_ERR(va_page))
+ return PTR_ERR(va_page);
+ else if (va_page)
+ list_add(&va_page->list, &encl->va_pages);
+ /* else the tail page of the VA page list had free slots. */
+
+ /* The extra page goes to SECS. */
+ encl_size = secs->size + PAGE_SIZE;
+
+ backing = shmem_file_setup("SGX backing", encl_size + (encl_size >> 5),
+ VM_NORESERVE);
+ if (IS_ERR(backing)) {
+ ret = PTR_ERR(backing);
+ goto err_out_shrink;
+ }
+
+ encl->backing = backing;
+
+ secs_epc = sgx_alloc_epc_page(&encl->secs, true);
+ if (IS_ERR(secs_epc)) {
+ ret = PTR_ERR(secs_epc);
+ goto err_out_backing;
+ }
+
+ encl->secs.epc_page = secs_epc;
+
+ pginfo.addr = 0;
+ pginfo.contents = (unsigned long)secs;
+ pginfo.metadata = (unsigned long)&secinfo;
+ pginfo.secs = 0;
+ memset(&secinfo, 0, sizeof(secinfo));
+
+ ret = __ecreate((void *)&pginfo, sgx_get_epc_virt_addr(secs_epc));
+ if (ret) {
+ ret = -EIO;
+ goto err_out;
+ }
+
+ if (secs->attributes & SGX_ATTR_DEBUG)
+ set_bit(SGX_ENCL_DEBUG, &encl->flags);
+
+ encl->secs.encl = encl;
+ encl->secs.type = SGX_PAGE_TYPE_SECS;
+ encl->base = secs->base;
+ encl->size = secs->size;
+ encl->attributes = secs->attributes;
+ encl->attributes_mask = SGX_ATTR_UNPRIV_MASK;
+
+ /* Set only after completion, as encl->lock has not been taken. */
+ set_bit(SGX_ENCL_CREATED, &encl->flags);
+
+ return 0;
+
+err_out:
+ sgx_encl_free_epc_page(encl->secs.epc_page);
+ encl->secs.epc_page = NULL;
+
+err_out_backing:
+ fput(encl->backing);
+ encl->backing = NULL;
+
+err_out_shrink:
+ sgx_encl_shrink(encl, va_page);
+
+ return ret;
+}
+
+/**
+ * sgx_ioc_enclave_create() - handler for %SGX_IOC_ENCLAVE_CREATE
+ * @encl: An enclave pointer.
+ * @arg: The ioctl argument.
+ *
+ * Allocate kernel data structures for the enclave and invoke ECREATE.
+ *
+ * Return:
+ * - 0: Success.
+ * - -EIO: ECREATE failed.
+ * - -errno: POSIX error.
+ */
+static long sgx_ioc_enclave_create(struct sgx_encl *encl, void __user *arg)
+{
+ struct sgx_enclave_create create_arg;
+ void *secs;
+ int ret;
+
+ if (test_bit(SGX_ENCL_CREATED, &encl->flags))
+ return -EINVAL;
+
+ if (copy_from_user(&create_arg, arg, sizeof(create_arg)))
+ return -EFAULT;
+
+ secs = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!secs)
+ return -ENOMEM;
+
+ if (copy_from_user(secs, (void __user *)create_arg.src, PAGE_SIZE))
+ ret = -EFAULT;
+ else
+ ret = sgx_encl_create(encl, secs);
+
+ kfree(secs);
+ return ret;
+}
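A hedged userspace sketch of driving this handler, not part of the patch: it assumes the uapi definitions from <asm/sgx.h> and a page-aligned, SDM-formatted SECS prepared elsewhere (base, size, attributes, xfrm, and so on); only the ioctl plumbing is shown.

#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <asm/sgx.h>	/* SGX_IOC_ENCLAVE_CREATE, struct sgx_enclave_create */

/* Illustrative only: 'secs' points to a page-aligned, SDM-formatted SECS. */
static int example_create_enclave(void *secs)
{
	struct sgx_enclave_create create = { .src = (unsigned long)secs };
	int fd;

	fd = open("/dev/sgx_enclave", O_RDWR);
	if (fd < 0)
		return -1;

	if (ioctl(fd, SGX_IOC_ENCLAVE_CREATE, &create)) {
		close(fd);
		return -1;
	}

	return fd;	/* keep the fd: it is the enclave handle */
}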
+
+static int sgx_validate_secinfo(struct sgx_secinfo *secinfo)
+{
+ u64 perm = secinfo->flags & SGX_SECINFO_PERMISSION_MASK;
+ u64 pt = secinfo->flags & SGX_SECINFO_PAGE_TYPE_MASK;
+
+ if (pt != SGX_SECINFO_REG && pt != SGX_SECINFO_TCS)
+ return -EINVAL;
+
+ if ((perm & SGX_SECINFO_W) && !(perm & SGX_SECINFO_R))
+ return -EINVAL;
+
+ /*
+ * CPU will silently overwrite the permissions as zero, which means
+ * that we need to validate it ourselves.
+ */
+ if (pt == SGX_SECINFO_TCS && perm)
+ return -EINVAL;
+
+ if (secinfo->flags & SGX_SECINFO_RESERVED_MASK)
+ return -EINVAL;
+
+ if (memchr_inv(secinfo->reserved, 0, sizeof(secinfo->reserved)))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int __sgx_encl_add_page(struct sgx_encl *encl,
+ struct sgx_encl_page *encl_page,
+ struct sgx_epc_page *epc_page,
+ struct sgx_secinfo *secinfo, unsigned long src)
+{
+ struct sgx_pageinfo pginfo;
+ struct vm_area_struct *vma;
+ struct page *src_page;
+ int ret;
+
+ /* Deny noexec. */
+ vma = find_vma(current->mm, src);
+ if (!vma)
+ return -EFAULT;
+
+ if (!(vma->vm_flags & VM_MAYEXEC))
+ return -EACCES;
+
+ ret = get_user_pages(src, 1, 0, &src_page);
+ if (ret < 1)
+ return -EFAULT;
+
+ pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page);
+ pginfo.addr = encl_page->desc & PAGE_MASK;
+ pginfo.metadata = (unsigned long)secinfo;
+ pginfo.contents = (unsigned long)kmap_local_page(src_page);
+
+ ret = __eadd(&pginfo, sgx_get_epc_virt_addr(epc_page));
+
+ kunmap_local((void *)pginfo.contents);
+ put_page(src_page);
+
+ return ret ? -EIO : 0;
+}
+
+/*
+ * If the caller requires measurement of the page as a proof for the content,
+ * use EEXTEND to add a measurement for 256 bytes of the page. Repeat this
+ * operation until the entire page is measured.
+ */
+static int __sgx_encl_extend(struct sgx_encl *encl,
+ struct sgx_epc_page *epc_page)
+{
+ unsigned long offset;
+ int ret;
+
+ for (offset = 0; offset < PAGE_SIZE; offset += SGX_EEXTEND_BLOCK_SIZE) {
+ ret = __eextend(sgx_get_epc_virt_addr(encl->secs.epc_page),
+ sgx_get_epc_virt_addr(epc_page) + offset);
+ if (ret) {
+ if (encls_failed(ret))
+ ENCLS_WARN(ret, "EEXTEND");
+
+ return -EIO;
+ }
+ }
+
+ return 0;
+}
+
+static int sgx_encl_add_page(struct sgx_encl *encl, unsigned long src,
+ unsigned long offset, struct sgx_secinfo *secinfo,
+ unsigned long flags)
+{
+ struct sgx_encl_page *encl_page;
+ struct sgx_epc_page *epc_page;
+ struct sgx_va_page *va_page;
+ int ret;
+
+ encl_page = sgx_encl_page_alloc(encl, offset, secinfo->flags);
+ if (IS_ERR(encl_page))
+ return PTR_ERR(encl_page);
+
+ epc_page = sgx_alloc_epc_page(encl_page, true);
+ if (IS_ERR(epc_page)) {
+ kfree(encl_page);
+ return PTR_ERR(epc_page);
+ }
+
+ va_page = sgx_encl_grow(encl, true);
+ if (IS_ERR(va_page)) {
+ ret = PTR_ERR(va_page);
+ goto err_out_free;
+ }
+
+ mmap_read_lock(current->mm);
+ mutex_lock(&encl->lock);
+
+ /*
+ * Adding to encl->va_pages must be done under encl->lock. Ditto for
+ * deleting (via sgx_encl_shrink()) in the error path.
+ */
+ if (va_page)
+ list_add(&va_page->list, &encl->va_pages);
+
+ /*
+ * Insert prior to EADD in case of OOM. EADD modifies MRENCLAVE, i.e.
+ * can't be gracefully unwound, while failure on EADD/EXTEND is limited
+ * to userspace errors (or kernel/hardware bugs).
+ */
+ ret = xa_insert(&encl->page_array, PFN_DOWN(encl_page->desc),
+ encl_page, GFP_KERNEL);
+ if (ret)
+ goto err_out_unlock;
+
+ ret = __sgx_encl_add_page(encl, encl_page, epc_page, secinfo,
+ src);
+ if (ret)
+ goto err_out;
+
+ /*
+ * Complete the "add" before doing the "extend" so that the "add"
+ * isn't in a half-baked state in the extremely unlikely scenario
+ * the enclave will be destroyed in response to EEXTEND failure.
+ */
+ encl_page->encl = encl;
+ encl_page->epc_page = epc_page;
+ encl_page->type = (secinfo->flags & SGX_SECINFO_PAGE_TYPE_MASK) >> 8;
+ encl->secs_child_cnt++;
+
+ if (flags & SGX_PAGE_MEASURE) {
+ ret = __sgx_encl_extend(encl, epc_page);
+ if (ret)
+ goto err_out;
+ }
+
+ sgx_mark_page_reclaimable(encl_page->epc_page);
+ mutex_unlock(&encl->lock);
+ mmap_read_unlock(current->mm);
+ return ret;
+
+err_out:
+ xa_erase(&encl->page_array, PFN_DOWN(encl_page->desc));
+
+err_out_unlock:
+ sgx_encl_shrink(encl, va_page);
+ mutex_unlock(&encl->lock);
+ mmap_read_unlock(current->mm);
+
+err_out_free:
+ sgx_encl_free_epc_page(epc_page);
+ kfree(encl_page);
+
+ return ret;
+}
+
+/*
+ * Ensure user provided offset and length values are valid for
+ * an enclave.
+ */
+static int sgx_validate_offset_length(struct sgx_encl *encl,
+ unsigned long offset,
+ unsigned long length)
+{
+ if (!IS_ALIGNED(offset, PAGE_SIZE))
+ return -EINVAL;
+
+ if (!length || !IS_ALIGNED(length, PAGE_SIZE))
+ return -EINVAL;
+
+ if (offset + length < offset)
+ return -EINVAL;
+
+ if (offset + length - PAGE_SIZE >= encl->size)
+ return -EINVAL;
+
+ return 0;
+}
+
+/**
+ * sgx_ioc_enclave_add_pages() - The handler for %SGX_IOC_ENCLAVE_ADD_PAGES
+ * @encl: an enclave pointer
+ * @arg: a user pointer to a struct sgx_enclave_add_pages instance
+ *
+ * Add one or more pages to an uninitialized enclave, and optionally extend the
+ * measurement with the contents of the page. The SECINFO and measurement mask
+ * are applied to all pages.
+ *
+ * A SECINFO for a TCS is required to always contain zero permissions because
+ * the CPU silently zeros them. Allowing anything else would cause a mismatch in
+ * the measurement.
+ *
+ * mmap()'s protection bits are capped by the page permissions. For each page
+ * address, the maximum protection bits are computed with the following
+ * heuristics:
+ *
+ * 1. A regular page: PROT_R, PROT_W and PROT_X match the SECINFO permissions.
+ * 2. A TCS page: PROT_R | PROT_W.
+ *
+ * mmap() is not allowed to surpass the minimum of the maximum protection bits
+ * within the given address range.
+ *
+ * The function deinitializes kernel data structures for the enclave and returns
+ * -EIO in any of the following conditions:
+ *
+ * - Enclave Page Cache (EPC), the physical memory holding enclaves, has
+ * been invalidated. This will cause EADD and EEXTEND to fail.
+ * - If the source address is corrupted somehow when executing EADD.
+ *
+ * Return:
+ * - 0: Success.
+ * - -EACCES: The source page is located in a noexec partition.
+ * - -ENOMEM: Out of EPC pages.
+ * - -EINTR: The call was interrupted before data was processed.
+ * - -EIO: Either EADD or EEXTEND failed because invalid source address
+ * or power cycle.
+ * - -errno: POSIX error.
+ */
+static long sgx_ioc_enclave_add_pages(struct sgx_encl *encl, void __user *arg)
+{
+ struct sgx_enclave_add_pages add_arg;
+ struct sgx_secinfo secinfo;
+ unsigned long c;
+ int ret;
+
+ if (!test_bit(SGX_ENCL_CREATED, &encl->flags) ||
+ test_bit(SGX_ENCL_INITIALIZED, &encl->flags))
+ return -EINVAL;
+
+ if (copy_from_user(&add_arg, arg, sizeof(add_arg)))
+ return -EFAULT;
+
+ if (!IS_ALIGNED(add_arg.src, PAGE_SIZE))
+ return -EINVAL;
+
+ if (sgx_validate_offset_length(encl, add_arg.offset, add_arg.length))
+ return -EINVAL;
+
+ if (copy_from_user(&secinfo, (void __user *)add_arg.secinfo,
+ sizeof(secinfo)))
+ return -EFAULT;
+
+ if (sgx_validate_secinfo(&secinfo))
+ return -EINVAL;
+
+ for (c = 0 ; c < add_arg.length; c += PAGE_SIZE) {
+ if (signal_pending(current)) {
+ if (!c)
+ ret = -ERESTARTSYS;
+
+ break;
+ }
+
+ if (need_resched())
+ cond_resched();
+
+ ret = sgx_encl_add_page(encl, add_arg.src + c, add_arg.offset + c,
+ &secinfo, add_arg.flags);
+ if (ret)
+ break;
+ }
+
+ add_arg.count = c;
+
+ if (copy_to_user(arg, &add_arg, sizeof(add_arg)))
+ return -EFAULT;
+
+ return ret;
+}
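A userspace sketch of adding measured, read/execute code pages through this handler, not part of the patch. The struct sgx_enclave_add_pages and SGX_PAGE_MEASURE names are assumed from the uapi <asm/sgx.h>; the SECINFO layout and flag values below follow the SDM and are spelled out locally only for illustration.

#include <sys/ioctl.h>
#include <asm/sgx.h>	/* SGX_IOC_ENCLAVE_ADD_PAGES, SGX_PAGE_MEASURE */

/* Illustrative SECINFO layout and flag values per the SDM (not uapi). */
struct ex_secinfo {
	unsigned long long flags;
	unsigned char reserved[56];
} __attribute__((aligned(64)));

#define EX_SECINFO_R	0x01ULL
#define EX_SECINFO_X	0x04ULL
#define EX_PT_REG	(2ULL << 8)	/* page type PT_REG */

/* Add 'length' bytes of measured R+X code from page-aligned 'src'. */
static int example_add_code(int enclave_fd, void *src, unsigned long offset,
			    unsigned long length)
{
	struct ex_secinfo secinfo = {
		.flags = EX_SECINFO_R | EX_SECINFO_X | EX_PT_REG,
	};
	struct sgx_enclave_add_pages add = {
		.src = (unsigned long)src,
		.offset = offset,
		.length = length,
		.secinfo = (unsigned long)&secinfo,
		.flags = SGX_PAGE_MEASURE,
	};

	if (ioctl(enclave_fd, SGX_IOC_ENCLAVE_ADD_PAGES, &add))
		return -1;

	return add.count == length ? 0 : -1;
}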
+
+static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
+ void *token)
+{
+ u64 mrsigner[4];
+ int i, j;
+ void *addr;
+ int ret;
+
+ /*
+ * Deny initializing enclaves with attributes (namely provisioning)
+ * that have not been explicitly allowed.
+ */
+ if (encl->attributes & ~encl->attributes_mask)
+ return -EACCES;
+
+ /*
+ * Attributes should not be enforced *only* against what's available on
+ * the platform (done in sgx_encl_create) but also checked and enforced
+ * against the enforcement mask in the sigstruct. For example, an enclave
+ * could opt to sign with the AVX bit in xfrm but still be loadable on a
+ * platform without it, if sigstruct->body.attributes_mask does not turn
+ * that bit on.
+ */
+ if (sigstruct->body.attributes & sigstruct->body.attributes_mask &
+ sgx_attributes_reserved_mask)
+ return -EINVAL;
+
+ if (sigstruct->body.miscselect & sigstruct->body.misc_mask &
+ sgx_misc_reserved_mask)
+ return -EINVAL;
+
+ if (sigstruct->body.xfrm & sigstruct->body.xfrm_mask &
+ sgx_xfrm_reserved_mask)
+ return -EINVAL;
+
+ sha256(sigstruct->modulus, SGX_MODULUS_SIZE, (u8 *)mrsigner);
+
+ mutex_lock(&encl->lock);
+
+ /*
+ * ENCLS[EINIT] is interruptible because it has such a high latency,
+ * e.g. 50k+ cycles on success. If an IRQ/NMI/SMI becomes pending,
+ * EINIT may fail with SGX_UNMASKED_EVENT so that the event can be
+ * serviced.
+ */
+ for (i = 0; i < SGX_EINIT_SLEEP_COUNT; i++) {
+ for (j = 0; j < SGX_EINIT_SPIN_COUNT; j++) {
+ addr = sgx_get_epc_virt_addr(encl->secs.epc_page);
+
+ preempt_disable();
+
+ sgx_update_lepubkeyhash(mrsigner);
+
+ ret = __einit(sigstruct, token, addr);
+
+ preempt_enable();
+
+ if (ret == SGX_UNMASKED_EVENT)
+ continue;
+ else
+ break;
+ }
+
+ if (ret != SGX_UNMASKED_EVENT)
+ break;
+
+ msleep_interruptible(SGX_EINIT_SLEEP_TIME);
+
+ if (signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ goto err_out;
+ }
+ }
+
+ if (encls_faulted(ret)) {
+ if (encls_failed(ret))
+ ENCLS_WARN(ret, "EINIT");
+
+ ret = -EIO;
+ } else if (ret) {
+ pr_debug("EINIT returned %d\n", ret);
+ ret = -EPERM;
+ } else {
+ set_bit(SGX_ENCL_INITIALIZED, &encl->flags);
+ }
+
+err_out:
+ mutex_unlock(&encl->lock);
+ return ret;
+}
+
+/**
+ * sgx_ioc_enclave_init() - handler for %SGX_IOC_ENCLAVE_INIT
+ * @encl: an enclave pointer
+ * @arg: userspace pointer to a struct sgx_enclave_init instance
+ *
+ * Flush any outstanding enqueued EADD operations and perform EINIT. The
+ * Launch Enclave Public Key Hash MSRs are rewritten as necessary to match
+ * the enclave's MRSIGNER, which is calculated from the provided sigstruct.
+ *
+ * Return:
+ * - 0: Success.
+ * - -EPERM: Invalid SIGSTRUCT.
+ * - -EIO: EINIT failed because of a power cycle.
+ * - -errno: POSIX error.
+ */
+static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg)
+{
+ struct sgx_sigstruct *sigstruct;
+ struct sgx_enclave_init init_arg;
+ void *token;
+ int ret;
+
+ if (!test_bit(SGX_ENCL_CREATED, &encl->flags) ||
+ test_bit(SGX_ENCL_INITIALIZED, &encl->flags))
+ return -EINVAL;
+
+ if (copy_from_user(&init_arg, arg, sizeof(init_arg)))
+ return -EFAULT;
+
+ /*
+ * 'sigstruct' must be on a page boundary and 'token' on a 512 byte
+ * boundary. kmalloc() will give this alignment when allocating
+ * PAGE_SIZE bytes.
+ */
+ sigstruct = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!sigstruct)
+ return -ENOMEM;
+
+ token = (void *)((unsigned long)sigstruct + PAGE_SIZE / 2);
+ memset(token, 0, SGX_LAUNCH_TOKEN_SIZE);
+
+ if (copy_from_user(sigstruct, (void __user *)init_arg.sigstruct,
+ sizeof(*sigstruct))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ /*
+ * 'vendor' is a legacy field used with Intel-signed enclaves. Its values
+ * used to distinguish regular and architectural enclaves. The CPU accepts
+ * only these values, but they have no other meaning.
+ *
+ * Thus, reject any other values.
+ */
+ if (sigstruct->header.vendor != 0x0000 &&
+ sigstruct->header.vendor != 0x8086) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = sgx_encl_init(encl, sigstruct, token);
+
+out:
+ kfree(sigstruct);
+ return ret;
+}
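And the matching userspace sketch for EINIT, again not part of the patch and assuming the uapi definitions; 'sigstruct' is an SDM-formatted, page-aligned SIGSTRUCT produced by the enclave signing tool.

#include <sys/ioctl.h>
#include <asm/sgx.h>	/* SGX_IOC_ENCLAVE_INIT, struct sgx_enclave_init */

/* Illustrative only: initialize the enclave once all pages have been added. */
static int example_init_enclave(int enclave_fd, void *sigstruct)
{
	struct sgx_enclave_init init = {
		.sigstruct = (unsigned long)sigstruct,
	};

	return ioctl(enclave_fd, SGX_IOC_ENCLAVE_INIT, &init);
}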
+
+/**
+ * sgx_ioc_enclave_provision() - handler for %SGX_IOC_ENCLAVE_PROVISION
+ * @encl: an enclave pointer
+ * @arg: userspace pointer to a struct sgx_enclave_provision instance
+ *
+ * Allow ATTRIBUTE.PROVISION_KEY for an enclave by providing a file handle to
+ * /dev/sgx_provision.
+ *
+ * Return:
+ * - 0: Success.
+ * - -errno: Otherwise.
+ */
+static long sgx_ioc_enclave_provision(struct sgx_encl *encl, void __user *arg)
+{
+ struct sgx_enclave_provision params;
+
+ if (copy_from_user(&params, arg, sizeof(params)))
+ return -EFAULT;
+
+ return sgx_set_attribute(&encl->attributes_mask, params.fd);
+}
+
+/*
+ * Ensure enclave is ready for SGX2 functions. Readiness is checked
+ * by ensuring the hardware supports SGX2 and the enclave is initialized
+ * and thus able to handle requests to modify pages within it.
+ */
+static int sgx_ioc_sgx2_ready(struct sgx_encl *encl)
+{
+ if (!(cpu_feature_enabled(X86_FEATURE_SGX2)))
+ return -ENODEV;
+
+ if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags))
+ return -EINVAL;
+
+ return 0;
+}
+
+/*
+ * Some SGX functions require that no cached linear-to-physical address
+ * mappings are present before they can succeed. Collaborate with
+ * hardware via ENCLS[ETRACK] to ensure that all cached
+ * linear-to-physical address mappings belonging to all threads of
+ * the enclave are cleared. See sgx_encl_cpumask() for details.
+ *
+ * Must be called with enclave's mutex held from the time the
+ * SGX function requiring that no cached linear-to-physical mappings
+ * are present is executed until this ETRACK flow is complete.
+ */
+static int sgx_enclave_etrack(struct sgx_encl *encl)
+{
+ void *epc_virt;
+ int ret;
+
+ epc_virt = sgx_get_epc_virt_addr(encl->secs.epc_page);
+ ret = __etrack(epc_virt);
+ if (ret) {
+ /*
+ * ETRACK only fails when there is an OS issue. For
+ * example, two consecutive ETRACKs were sent without
+ * a completed IPI in between.
+ */
+ pr_err_once("ETRACK returned %d (0x%x)", ret, ret);
+ /*
+ * Send IPIs to kick CPUs out of the enclave and
+ * try ETRACK again.
+ */
+ on_each_cpu_mask(sgx_encl_cpumask(encl), sgx_ipi_cb, NULL, 1);
+ ret = __etrack(epc_virt);
+ if (ret) {
+ pr_err_once("ETRACK repeat returned %d (0x%x)",
+ ret, ret);
+ return -EFAULT;
+ }
+ }
+ on_each_cpu_mask(sgx_encl_cpumask(encl), sgx_ipi_cb, NULL, 1);
+
+ return 0;
+}
+
+/**
+ * sgx_enclave_restrict_permissions() - Restrict EPCM permissions
+ * @encl: Enclave to which the pages belong.
+ * @modp: Checked parameters from user on which pages need modifying and
+ * their new permissions.
+ *
+ * Return:
+ * - 0: Success.
+ * - -errno: Otherwise.
+ */
+static long
+sgx_enclave_restrict_permissions(struct sgx_encl *encl,
+ struct sgx_enclave_restrict_permissions *modp)
+{
+ struct sgx_encl_page *entry;
+ struct sgx_secinfo secinfo;
+ unsigned long addr;
+ unsigned long c;
+ void *epc_virt;
+ int ret;
+
+ memset(&secinfo, 0, sizeof(secinfo));
+ secinfo.flags = modp->permissions & SGX_SECINFO_PERMISSION_MASK;
+
+ for (c = 0 ; c < modp->length; c += PAGE_SIZE) {
+ addr = encl->base + modp->offset + c;
+
+ sgx_reclaim_direct();
+
+ mutex_lock(&encl->lock);
+
+ entry = sgx_encl_load_page(encl, addr);
+ if (IS_ERR(entry)) {
+ ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT;
+ goto out_unlock;
+ }
+
+ /*
+ * Changing EPCM permissions is only supported on regular
+ * SGX pages. Attempting this change on other pages will
+ * result in #PF.
+ */
+ if (entry->type != SGX_PAGE_TYPE_REG) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ /*
+ * Apart from ensuring that read-access remains, do not verify
+ * the permission bits requested. The kernel has no control over
+ * how EPCM permissions can be relaxed from within the enclave.
+ * ENCLS[EMODPR] can only remove existing EPCM permissions,
+ * attempting to set new permissions will be ignored by the
+ * hardware.
+ */
+
+ /* Change EPCM permissions. */
+ epc_virt = sgx_get_epc_virt_addr(entry->epc_page);
+ ret = __emodpr(&secinfo, epc_virt);
+ if (encls_faulted(ret)) {
+ /*
+ * All possible faults should be avoidable:
+ * parameters have been checked, will only change
+ * permissions of a regular page, and no concurrent
+ * SGX1/SGX2 ENCLS instructions since these
+ * are protected with mutex.
+ */
+ pr_err_once("EMODPR encountered exception %d\n",
+ ENCLS_TRAPNR(ret));
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+ if (encls_failed(ret)) {
+ modp->result = ret;
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+
+ ret = sgx_enclave_etrack(encl);
+ if (ret) {
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+
+ mutex_unlock(&encl->lock);
+ }
+
+ ret = 0;
+ goto out;
+
+out_unlock:
+ mutex_unlock(&encl->lock);
+out:
+ modp->count = c;
+
+ return ret;
+}
+
+/**
+ * sgx_ioc_enclave_restrict_permissions() - handler for
+ * %SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS
+ * @encl: an enclave pointer
+ * @arg: userspace pointer to a &struct sgx_enclave_restrict_permissions
+ * instance
+ *
+ * SGX2 distinguishes between relaxing and restricting the enclave page
+ * permissions maintained by the hardware (EPCM permissions) of pages
+ * belonging to an initialized enclave (after SGX_IOC_ENCLAVE_INIT).
+ *
+ * EPCM permissions cannot be restricted from within the enclave; the enclave
+ * requires the kernel to run the privileged level 0 instructions ENCLS[EMODPR]
+ * and ENCLS[ETRACK]. An attempt to relax EPCM permissions with this call
+ * will be ignored by the hardware.
+ *
+ * Return:
+ * - 0: Success
+ * - -errno: Otherwise
+ */
+static long sgx_ioc_enclave_restrict_permissions(struct sgx_encl *encl,
+ void __user *arg)
+{
+ struct sgx_enclave_restrict_permissions params;
+ long ret;
+
+ ret = sgx_ioc_sgx2_ready(encl);
+ if (ret)
+ return ret;
+
+ if (copy_from_user(&params, arg, sizeof(params)))
+ return -EFAULT;
+
+ if (sgx_validate_offset_length(encl, params.offset, params.length))
+ return -EINVAL;
+
+ if (params.permissions & ~SGX_SECINFO_PERMISSION_MASK)
+ return -EINVAL;
+
+ /*
+ * Fail early if invalid permissions requested to prevent ENCLS[EMODPR]
+ * from faulting later when the CPU does the same check.
+ */
+ if ((params.permissions & SGX_SECINFO_W) &&
+ !(params.permissions & SGX_SECINFO_R))
+ return -EINVAL;
+
+ if (params.result || params.count)
+ return -EINVAL;
+
+ ret = sgx_enclave_restrict_permissions(encl, &params);
+
+ if (copy_to_user(arg, &params, sizeof(params)))
+ return -EFAULT;
+
+ return ret;
+}
+
+/**
+ * sgx_enclave_modify_types() - Modify type of SGX enclave pages
+ * @encl: Enclave to which the pages belong.
+ * @modt: Checked parameters from user about which pages need modifying
+ * and their new page type.
+ *
+ * Return:
+ * - 0: Success
+ * - -errno: Otherwise
+ */
+static long sgx_enclave_modify_types(struct sgx_encl *encl,
+ struct sgx_enclave_modify_types *modt)
+{
+ unsigned long max_prot_restore;
+ enum sgx_page_type page_type;
+ struct sgx_encl_page *entry;
+ struct sgx_secinfo secinfo;
+ unsigned long prot;
+ unsigned long addr;
+ unsigned long c;
+ void *epc_virt;
+ int ret;
+
+ page_type = modt->page_type & SGX_PAGE_TYPE_MASK;
+
+ /*
+ * The only new page types allowed by hardware are PT_TCS and PT_TRIM.
+ */
+ if (page_type != SGX_PAGE_TYPE_TCS && page_type != SGX_PAGE_TYPE_TRIM)
+ return -EINVAL;
+
+ memset(&secinfo, 0, sizeof(secinfo));
+
+ secinfo.flags = page_type << 8;
+
+ for (c = 0 ; c < modt->length; c += PAGE_SIZE) {
+ addr = encl->base + modt->offset + c;
+
+ sgx_reclaim_direct();
+
+ mutex_lock(&encl->lock);
+
+ entry = sgx_encl_load_page(encl, addr);
+ if (IS_ERR(entry)) {
+ ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT;
+ goto out_unlock;
+ }
+
+ /*
+ * Borrow the logic from the Intel SDM. Regular pages
+ * (SGX_PAGE_TYPE_REG) can change type to SGX_PAGE_TYPE_TCS
+ * or SGX_PAGE_TYPE_TRIM but TCS pages can only be trimmed.
+ * CET pages not supported yet.
+ */
+ if (!(entry->type == SGX_PAGE_TYPE_REG ||
+ (entry->type == SGX_PAGE_TYPE_TCS &&
+ page_type == SGX_PAGE_TYPE_TRIM))) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ max_prot_restore = entry->vm_max_prot_bits;
+
+ /*
+ * Once a regular page becomes a TCS page it cannot be
+ * changed back. So the maximum allowed protection reflects
+ * the TCS page that is always RW from the kernel's perspective but
+ * will be inaccessible from within the enclave. Before doing
+ * so, make sure that the new page type continues to
+ * respect the originally vetted page permissions.
+ */
+ if (entry->type == SGX_PAGE_TYPE_REG &&
+ page_type == SGX_PAGE_TYPE_TCS) {
+ if (~entry->vm_max_prot_bits & (VM_READ | VM_WRITE)) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+ prot = PROT_READ | PROT_WRITE;
+ entry->vm_max_prot_bits = calc_vm_prot_bits(prot, 0);
+
+ /*
+ * Prevent page from being reclaimed while mutex
+ * is released.
+ */
+ if (sgx_unmark_page_reclaimable(entry->epc_page)) {
+ ret = -EAGAIN;
+ goto out_entry_changed;
+ }
+
+ /*
+ * Do not keep encl->lock because of dependency on
+ * mmap_lock acquired in sgx_zap_enclave_ptes().
+ */
+ mutex_unlock(&encl->lock);
+
+ sgx_zap_enclave_ptes(encl, addr);
+
+ mutex_lock(&encl->lock);
+
+ sgx_mark_page_reclaimable(entry->epc_page);
+ }
+
+ /* Change EPC type */
+ epc_virt = sgx_get_epc_virt_addr(entry->epc_page);
+ ret = __emodt(&secinfo, epc_virt);
+ if (encls_faulted(ret)) {
+ /*
+ * All possible faults should be avoidable:
+ * parameters have been checked, will only change
+ * valid page types, and no concurrent
+ * SGX1/SGX2 ENCLS instructions since these are
+ * protected with mutex.
+ */
+ pr_err_once("EMODT encountered exception %d\n",
+ ENCLS_TRAPNR(ret));
+ ret = -EFAULT;
+ goto out_entry_changed;
+ }
+ if (encls_failed(ret)) {
+ modt->result = ret;
+ ret = -EFAULT;
+ goto out_entry_changed;
+ }
+
+ ret = sgx_enclave_etrack(encl);
+ if (ret) {
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+
+ entry->type = page_type;
+
+ mutex_unlock(&encl->lock);
+ }
+
+ ret = 0;
+ goto out;
+
+out_entry_changed:
+ entry->vm_max_prot_bits = max_prot_restore;
+out_unlock:
+ mutex_unlock(&encl->lock);
+out:
+ modt->count = c;
+
+ return ret;
+}
+
+/**
+ * sgx_ioc_enclave_modify_types() - handler for %SGX_IOC_ENCLAVE_MODIFY_TYPES
+ * @encl: an enclave pointer
+ * @arg: userspace pointer to a &struct sgx_enclave_modify_types instance
+ *
+ * Ability to change the enclave page type supports the following use cases:
+ *
+ * * It is possible to add TCS pages to an enclave by changing the type of
+ * regular pages (%SGX_PAGE_TYPE_REG) to TCS (%SGX_PAGE_TYPE_TCS) pages.
+ * With this support the number of threads supported by an initialized
+ * enclave can be increased dynamically.
+ *
+ * * Regular or TCS pages can dynamically be removed from an initialized
+ * enclave by changing the page type to %SGX_PAGE_TYPE_TRIM. Changing the
+ * page type to %SGX_PAGE_TYPE_TRIM marks the page for removal with actual
+ * removal done by handler of %SGX_IOC_ENCLAVE_REMOVE_PAGES ioctl() called
+ * after ENCLU[EACCEPT] is run on %SGX_PAGE_TYPE_TRIM page from within the
+ * enclave.
+ *
+ * Return:
+ * - 0: Success
+ * - -errno: Otherwise
+ */
+static long sgx_ioc_enclave_modify_types(struct sgx_encl *encl,
+ void __user *arg)
+{
+ struct sgx_enclave_modify_types params;
+ long ret;
+
+ ret = sgx_ioc_sgx2_ready(encl);
+ if (ret)
+ return ret;
+
+ if (copy_from_user(&params, arg, sizeof(params)))
+ return -EFAULT;
+
+ if (sgx_validate_offset_length(encl, params.offset, params.length))
+ return -EINVAL;
+
+ if (params.page_type & ~SGX_PAGE_TYPE_MASK)
+ return -EINVAL;
+
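+	/* 'result' and 'count' are output-only and must be zeroed by the caller. */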
+ if (params.result || params.count)
+ return -EINVAL;
+
+ ret = sgx_enclave_modify_types(encl, &params);
+
+ if (copy_to_user(arg, &params, sizeof(params)))
+ return -EFAULT;
+
+ return ret;
+}
+
+/**
+ * sgx_encl_remove_pages() - Remove trimmed pages from SGX enclave
+ * @encl: Enclave to which the pages belong
+ * @params: Checked parameters from user on which pages need to be removed
+ *
+ * Return:
+ * - 0: Success.
+ * - -errno: Otherwise.
+ */
+static long sgx_encl_remove_pages(struct sgx_encl *encl,
+ struct sgx_enclave_remove_pages *params)
+{
+ struct sgx_encl_page *entry;
+ struct sgx_secinfo secinfo;
+ unsigned long addr;
+ unsigned long c;
+ void *epc_virt;
+ int ret;
+
+ memset(&secinfo, 0, sizeof(secinfo));
+ secinfo.flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_X;
+
+ for (c = 0 ; c < params->length; c += PAGE_SIZE) {
+ addr = encl->base + params->offset + c;
+
+ sgx_reclaim_direct();
+
+ mutex_lock(&encl->lock);
+
+ entry = sgx_encl_load_page(encl, addr);
+ if (IS_ERR(entry)) {
+ ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT;
+ goto out_unlock;
+ }
+
+ if (entry->type != SGX_PAGE_TYPE_TRIM) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+
+ /*
+ * ENCLS[EMODPR] is used here as a no-op to detect whether
+ * ENCLU[EACCEPT] was run from within the enclave. If
+ * ENCLS[EMODPR] is run with RWX on a trimmed page that is
+ * not yet accepted, it returns %SGX_PAGE_NOT_MODIFIABLE;
+ * once the trimmed page has been accepted, the instruction
+ * encounters a page fault instead.
+ */
+ epc_virt = sgx_get_epc_virt_addr(entry->epc_page);
+ ret = __emodpr(&secinfo, epc_virt);
+ if (!encls_faulted(ret) || ENCLS_TRAPNR(ret) != X86_TRAP_PF) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+
+ if (sgx_unmark_page_reclaimable(entry->epc_page)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
+ /*
+ * Do not keep encl->lock because of dependency on
+ * mmap_lock acquired in sgx_zap_enclave_ptes().
+ */
+ mutex_unlock(&encl->lock);
+
+ sgx_zap_enclave_ptes(encl, addr);
+
+ mutex_lock(&encl->lock);
+
+ sgx_encl_free_epc_page(entry->epc_page);
+ encl->secs_child_cnt--;
+ entry->epc_page = NULL;
+ xa_erase(&encl->page_array, PFN_DOWN(entry->desc));
+ sgx_encl_shrink(encl, NULL);
+ kfree(entry);
+
+ mutex_unlock(&encl->lock);
+ }
+
+ ret = 0;
+ goto out;
+
+out_unlock:
+ mutex_unlock(&encl->lock);
+out:
+ params->count = c;
+
+ return ret;
+}
+
+/**
+ * sgx_ioc_enclave_remove_pages() - handler for %SGX_IOC_ENCLAVE_REMOVE_PAGES
+ * @encl: an enclave pointer
+ * @arg: userspace pointer to &struct sgx_enclave_remove_pages instance
+ *
+ * Final step of the flow removing pages from an initialized enclave. The
+ * complete flow is:
+ *
+ * 1) User changes the type of the pages to be removed to %SGX_PAGE_TYPE_TRIM
+ * using the %SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl().
+ * 2) User approves the page removal by running ENCLU[EACCEPT] from within
+ * the enclave.
+ * 3) User initiates actual page removal using the
+ * %SGX_IOC_ENCLAVE_REMOVE_PAGES ioctl() that is handled here.
+ *
+ * First remove any page table entries pointing to the page and then proceed
+ * with the actual removal of the enclave page and data in support of it.
+ *
+ * VA pages are not affected by this removal. It is thus possible that the
+ * enclave may end up with more VA pages than needed to support all its
+ * pages.
+ *
+ * Return:
+ * - 0: Success
+ * - -errno: Otherwise
+ */
+static long sgx_ioc_enclave_remove_pages(struct sgx_encl *encl,
+ void __user *arg)
+{
+ struct sgx_enclave_remove_pages params;
+ long ret;
+
+ ret = sgx_ioc_sgx2_ready(encl);
+ if (ret)
+ return ret;
+
+ if (copy_from_user(&params, arg, sizeof(params)))
+ return -EFAULT;
+
+ if (sgx_validate_offset_length(encl, params.offset, params.length))
+ return -EINVAL;
+
+ if (params.count)
+ return -EINVAL;
+
+ ret = sgx_encl_remove_pages(encl, &params);
+
+ if (copy_to_user(arg, &params, sizeof(params)))
+ return -EFAULT;
+
+ return ret;
+}
+
+long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
+{
+ struct sgx_encl *encl = filep->private_data;
+ int ret;
+
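+	/* Serialize ioctl()s: only one may operate on an enclave at a time. */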
+ if (test_and_set_bit(SGX_ENCL_IOCTL, &encl->flags))
+ return -EBUSY;
+
+ switch (cmd) {
+ case SGX_IOC_ENCLAVE_CREATE:
+ ret = sgx_ioc_enclave_create(encl, (void __user *)arg);
+ break;
+ case SGX_IOC_ENCLAVE_ADD_PAGES:
+ ret = sgx_ioc_enclave_add_pages(encl, (void __user *)arg);
+ break;
+ case SGX_IOC_ENCLAVE_INIT:
+ ret = sgx_ioc_enclave_init(encl, (void __user *)arg);
+ break;
+ case SGX_IOC_ENCLAVE_PROVISION:
+ ret = sgx_ioc_enclave_provision(encl, (void __user *)arg);
+ break;
+ case SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS:
+ ret = sgx_ioc_enclave_restrict_permissions(encl,
+ (void __user *)arg);
+ break;
+ case SGX_IOC_ENCLAVE_MODIFY_TYPES:
+ ret = sgx_ioc_enclave_modify_types(encl, (void __user *)arg);
+ break;
+ case SGX_IOC_ENCLAVE_REMOVE_PAGES:
+ ret = sgx_ioc_enclave_remove_pages(encl, (void __user *)arg);
+ break;
+ default:
+ ret = -ENOIOCTLCMD;
+ break;
+ }
+
+ clear_bit(SGX_ENCL_IOCTL, &encl->flags);
+ return ret;
+}
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
new file mode 100644
index 000000000000..dc73194416ac
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -0,0 +1,1072 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2016-20 Intel Corporation. */
+
+#include <linux/file.h>
+#include <linux/freezer.h>
+#include <linux/highmem.h>
+#include <linux/kthread.h>
+#include <linux/kvm_types.h>
+#include <linux/miscdevice.h>
+#include <linux/node.h>
+#include <linux/pagemap.h>
+#include <linux/ratelimit.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/slab.h>
+#include <linux/sysfs.h>
+#include <linux/vmalloc.h>
+#include <asm/msr.h>
+#include <asm/sgx.h>
+#include <asm/archrandom.h>
+#include "driver.h"
+#include "encl.h"
+#include "encls.h"
+
+struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
+static int sgx_nr_epc_sections;
+static struct task_struct *ksgxd_tsk;
+static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq);
+static DEFINE_XARRAY(sgx_epc_address_space);
+
+/*
+ * These variables are part of the state of the reclaimer, and must be accessed
+ * with sgx_reclaimer_lock acquired.
+ */
+static LIST_HEAD(sgx_active_page_list);
+static DEFINE_SPINLOCK(sgx_reclaimer_lock);
+
+static atomic_long_t sgx_nr_free_pages = ATOMIC_LONG_INIT(0);
+
+/* Nodes with one or more EPC sections. */
+static nodemask_t sgx_numa_mask;
+
+/*
+ * Array with one list_head for each possible NUMA node. Each
+ * list contains all the sgx_epc_section's which are on that
+ * node.
+ */
+static struct sgx_numa_node *sgx_numa_nodes;
+
+static LIST_HEAD(sgx_dirty_page_list);
+
+/*
+ * Reset post-kexec EPC pages to the uninitialized state. The pages are removed
+ * from the input list, and made available for the page allocator. SECS pages
+ * preceding their children in the input list are left intact.
+ *
+ * Return 0 when sanitization was successful or kthread was stopped, and the
+ * number of unsanitized pages otherwise.
+ */
+static unsigned long __sgx_sanitize_pages(struct list_head *dirty_page_list)
+{
+ unsigned long left_dirty = 0;
+ struct sgx_epc_page *page;
+ LIST_HEAD(dirty);
+ int ret;
+
+ /* dirty_page_list is thread-local, no need for a lock: */
+ while (!list_empty(dirty_page_list)) {
+ if (kthread_should_stop())
+ return 0;
+
+ page = list_first_entry(dirty_page_list, struct sgx_epc_page, list);
+
+ /*
+ * Checking page->poison without holding the node->lock
+ * is racy, but losing the race (i.e. poison is set just
+ * after the check) just means __eremove() will be uselessly
+ * called for a page that sgx_free_epc_page() will put onto
+ * the node->sgx_poison_page_list later.
+ */
+ if (page->poison) {
+ struct sgx_epc_section *section = &sgx_epc_sections[page->section];
+ struct sgx_numa_node *node = section->node;
+
+ spin_lock(&node->lock);
+ list_move(&page->list, &node->sgx_poison_page_list);
+ spin_unlock(&node->lock);
+
+ continue;
+ }
+
+ ret = __eremove(sgx_get_epc_virt_addr(page));
+ if (!ret) {
+ /*
+ * page is now sanitized. Make it available via the SGX
+ * page allocator:
+ */
+ list_del(&page->list);
+ sgx_free_epc_page(page);
+ } else {
+ /* The page is not yet clean - move to the dirty list. */
+ list_move_tail(&page->list, &dirty);
+ left_dirty++;
+ }
+
+ cond_resched();
+ }
+
+ list_splice(&dirty, dirty_page_list);
+ return left_dirty;
+}
+
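+/*
+ * Check whether @epc_page has been accessed since the last scan: walk every
+ * mm that maps the enclave and test-and-clear the accessed bit of the page's
+ * PTE. Returns true when the page was not recently accessed and is therefore
+ * a reclaim candidate.
+ */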
+static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page)
+{
+ struct sgx_encl_page *page = epc_page->owner;
+ struct sgx_encl *encl = page->encl;
+ struct sgx_encl_mm *encl_mm;
+ bool ret = true;
+ int idx;
+
+ idx = srcu_read_lock(&encl->srcu);
+
+ list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
+ if (!mmget_not_zero(encl_mm->mm))
+ continue;
+
+ mmap_read_lock(encl_mm->mm);
+ ret = !sgx_encl_test_and_clear_young(encl_mm->mm, page);
+ mmap_read_unlock(encl_mm->mm);
+
+ mmput_async(encl_mm->mm);
+
+ if (!ret)
+ break;
+ }
+
+ srcu_read_unlock(&encl->srcu, idx);
+
+ if (!ret)
+ return false;
+
+ return true;
+}
+
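+/*
+ * Remove all PTEs pointing to the page and mark it blocked with ENCLS[EBLOCK]
+ * so that no new TLB entries referencing it can be created.
+ */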
+static void sgx_reclaimer_block(struct sgx_epc_page *epc_page)
+{
+ struct sgx_encl_page *page = epc_page->owner;
+ unsigned long addr = page->desc & PAGE_MASK;
+ struct sgx_encl *encl = page->encl;
+ int ret;
+
+ sgx_zap_enclave_ptes(encl, addr);
+
+ mutex_lock(&encl->lock);
+
+ ret = __eblock(sgx_get_epc_virt_addr(epc_page));
+ if (encls_failed(ret))
+ ENCLS_WARN(ret, "EBLOCK");
+
+ mutex_unlock(&encl->lock);
+}
+
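+/*
+ * Write the encrypted page contents and PCMD metadata to the backing storage
+ * with ENCLS[EWB]; the page's anti-rollback version counter is stored in the
+ * given VA slot.
+ */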
+static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot,
+ struct sgx_backing *backing)
+{
+ struct sgx_pageinfo pginfo;
+ int ret;
+
+ pginfo.addr = 0;
+ pginfo.secs = 0;
+
+ pginfo.contents = (unsigned long)kmap_local_page(backing->contents);
+ pginfo.metadata = (unsigned long)kmap_local_page(backing->pcmd) +
+ backing->pcmd_offset;
+
+ ret = __ewb(&pginfo, sgx_get_epc_virt_addr(epc_page), va_slot);
+ set_page_dirty(backing->pcmd);
+ set_page_dirty(backing->contents);
+
+ kunmap_local((void *)(unsigned long)(pginfo.metadata -
+ backing->pcmd_offset));
+ kunmap_local((void *)(unsigned long)pginfo.contents);
+
+ return ret;
+}
+
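+/*
+ * Intentionally empty: the IPI itself forces any logical CPU executing inside
+ * the enclave to exit, which is all the EWB slow path needs.
+ */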
+void sgx_ipi_cb(void *info)
+{
+}
+
+/*
+ * Swap a page out to regular memory after it has been transformed to the
+ * blocked state with EBLOCK, which means it can no longer be referenced
+ * (no new TLB entries).
+ *
+ * The first attempt simply tries to write the page, assuming that some other
+ * thread has already reset the in-enclave thread count with ETRACK and the
+ * previous thread count has been zeroed out. The second attempt runs ETRACK
+ * before EWB. If that fails we kick all the HW threads out and then do EWB,
+ * which is then guaranteed to succeed.
+ */
+static void sgx_encl_ewb(struct sgx_epc_page *epc_page,
+ struct sgx_backing *backing)
+{
+ struct sgx_encl_page *encl_page = epc_page->owner;
+ struct sgx_encl *encl = encl_page->encl;
+ struct sgx_va_page *va_page;
+ unsigned int va_offset;
+ void *va_slot;
+ int ret;
+
+ encl_page->desc &= ~SGX_ENCL_PAGE_BEING_RECLAIMED;
+
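+	/*
+	 * Reserve a slot in the first VA page; if it becomes full, rotate it
+	 * to the tail so the next reclaim finds a VA page with free slots.
+	 */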
+ va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
+ list);
+ va_offset = sgx_alloc_va_slot(va_page);
+ va_slot = sgx_get_epc_virt_addr(va_page->epc_page) + va_offset;
+ if (sgx_va_page_full(va_page))
+ list_move_tail(&va_page->list, &encl->va_pages);
+
+ ret = __sgx_encl_ewb(epc_page, va_slot, backing);
+ if (ret == SGX_NOT_TRACKED) {
+ ret = __etrack(sgx_get_epc_virt_addr(encl->secs.epc_page));
+ if (ret) {
+ if (encls_failed(ret))
+ ENCLS_WARN(ret, "ETRACK");
+ }
+
+ ret = __sgx_encl_ewb(epc_page, va_slot, backing);
+ if (ret == SGX_NOT_TRACKED) {
+ /*
+ * Slow path, send IPIs to kick cpus out of the
+ * enclave. Note, it's imperative that the cpu
+ * mask is generated *after* ETRACK, else we'll
+ * miss cpus that entered the enclave between
+ * generating the mask and incrementing epoch.
+ */
+ on_each_cpu_mask(sgx_encl_cpumask(encl),
+ sgx_ipi_cb, NULL, 1);
+ ret = __sgx_encl_ewb(epc_page, va_slot, backing);
+ }
+ }
+
+ if (ret) {
+ if (encls_failed(ret))
+ ENCLS_WARN(ret, "EWB");
+
+ sgx_free_va_slot(va_page, va_offset);
+ } else {
+ encl_page->desc |= va_offset;
+ encl_page->va_page = va_page;
+ }
+}
+
+static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
+ struct sgx_backing *backing)
+{
+ struct sgx_encl_page *encl_page = epc_page->owner;
+ struct sgx_encl *encl = encl_page->encl;
+ struct sgx_backing secs_backing;
+ int ret;
+
+ mutex_lock(&encl->lock);
+
+ sgx_encl_ewb(epc_page, backing);
+ encl_page->epc_page = NULL;
+ encl->secs_child_cnt--;
+ sgx_encl_put_backing(backing);
+
+ if (!encl->secs_child_cnt && test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) {
+ ret = sgx_encl_alloc_backing(encl, PFN_DOWN(encl->size),
+ &secs_backing);
+ if (ret)
+ goto out;
+
+ sgx_encl_ewb(encl->secs.epc_page, &secs_backing);
+
+ sgx_encl_free_epc_page(encl->secs.epc_page);
+ encl->secs.epc_page = NULL;
+
+ sgx_encl_put_backing(&secs_backing);
+ }
+
+out:
+ mutex_unlock(&encl->lock);
+}
+
+/*
+ * Take a fixed number of pages from the head of the active page pool and
+ * reclaim them to the enclave's private shmem files. Skip pages that have
+ * been accessed since the last scan and move them to the tail of the active
+ * page pool so that pages get scanned in an LRU-like fashion.
+ *
+ * Batch-process a chunk of pages (at the moment 16) in order to reduce the
+ * number of IPIs and ETRACKs potentially required. sgx_encl_ewb() mitigates
+ * this somewhat with its three-stage EWB pipeline (EWB, ETRACK + EWB, and
+ * IPI + EWB) but not sufficiently. Reclaiming one page at a time would also
+ * be problematic as it would increase lock contention too much, which would
+ * halt forward progress.
+ */
+static void sgx_reclaim_pages(void)
+{
+ struct sgx_epc_page *chunk[SGX_NR_TO_SCAN];
+ struct sgx_backing backing[SGX_NR_TO_SCAN];
+ struct sgx_encl_page *encl_page;
+ struct sgx_epc_page *epc_page;
+ pgoff_t page_index;
+ int cnt = 0;
+ int ret;
+ int i;
+
+ spin_lock(&sgx_reclaimer_lock);
+ for (i = 0; i < SGX_NR_TO_SCAN; i++) {
+ if (list_empty(&sgx_active_page_list))
+ break;
+
+ epc_page = list_first_entry(&sgx_active_page_list,
+ struct sgx_epc_page, list);
+ list_del_init(&epc_page->list);
+ encl_page = epc_page->owner;
+
+ if (kref_get_unless_zero(&encl_page->encl->refcount) != 0)
+ chunk[cnt++] = epc_page;
+ else
+ /* The owner is freeing the page. No need to add the
+ * page back to the list of reclaimable pages.
+ */
+ epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
+ }
+ spin_unlock(&sgx_reclaimer_lock);
+
+ for (i = 0; i < cnt; i++) {
+ epc_page = chunk[i];
+ encl_page = epc_page->owner;
+
+ if (!sgx_reclaimer_age(epc_page))
+ goto skip;
+
+ page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base);
+
+ mutex_lock(&encl_page->encl->lock);
+ ret = sgx_encl_alloc_backing(encl_page->encl, page_index, &backing[i]);
+ if (ret) {
+ mutex_unlock(&encl_page->encl->lock);
+ goto skip;
+ }
+
+ encl_page->desc |= SGX_ENCL_PAGE_BEING_RECLAIMED;
+ mutex_unlock(&encl_page->encl->lock);
+ continue;
+
+skip:
+ spin_lock(&sgx_reclaimer_lock);
+ list_add_tail(&epc_page->list, &sgx_active_page_list);
+ spin_unlock(&sgx_reclaimer_lock);
+
+ kref_put(&encl_page->encl->refcount, sgx_encl_release);
+
+ chunk[i] = NULL;
+ }
+
+ for (i = 0; i < cnt; i++) {
+ epc_page = chunk[i];
+ if (epc_page)
+ sgx_reclaimer_block(epc_page);
+ }
+
+ for (i = 0; i < cnt; i++) {
+ epc_page = chunk[i];
+ if (!epc_page)
+ continue;
+
+ encl_page = epc_page->owner;
+ sgx_reclaimer_write(epc_page, &backing[i]);
+
+ kref_put(&encl_page->encl->refcount, sgx_encl_release);
+ epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
+
+ sgx_free_epc_page(epc_page);
+ }
+}
+
+static bool sgx_should_reclaim(unsigned long watermark)
+{
+ return atomic_long_read(&sgx_nr_free_pages) < watermark &&
+ !list_empty(&sgx_active_page_list);
+}
+
+/*
+ * sgx_reclaim_direct() should be called (without enclave's mutex held)
+ * in locations where SGX memory resources might be low and might be
+ * needed in order to make forward progress.
+ */
+void sgx_reclaim_direct(void)
+{
+ if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
+ sgx_reclaim_pages();
+}
+
+static int ksgxd(void *p)
+{
+ set_freezable();
+
+ /*
+ * Sanitize pages in order to recover from kexec(). The 2nd pass is
+ * required for SECS pages, whose child pages blocked EREMOVE.
+ */
+ __sgx_sanitize_pages(&sgx_dirty_page_list);
+ WARN_ON(__sgx_sanitize_pages(&sgx_dirty_page_list));
+
+ while (!kthread_should_stop()) {
+ if (try_to_freeze())
+ continue;
+
+ wait_event_freezable(ksgxd_waitq,
+ kthread_should_stop() ||
+ sgx_should_reclaim(SGX_NR_HIGH_PAGES));
+
+ if (sgx_should_reclaim(SGX_NR_HIGH_PAGES))
+ sgx_reclaim_pages();
+
+ cond_resched();
+ }
+
+ return 0;
+}
+
+static bool __init sgx_page_reclaimer_init(void)
+{
+ struct task_struct *tsk;
+
+ tsk = kthread_run(ksgxd, NULL, "ksgxd");
+ if (IS_ERR(tsk))
+ return false;
+
+ ksgxd_tsk = tsk;
+
+ return true;
+}
+
+bool current_is_ksgxd(void)
+{
+ return current == ksgxd_tsk;
+}
+
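+/* Take the first page off @nid's free list, or NULL if the node has none. */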
+static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
+{
+ struct sgx_numa_node *node = &sgx_numa_nodes[nid];
+ struct sgx_epc_page *page = NULL;
+
+ spin_lock(&node->lock);
+
+ if (list_empty(&node->free_page_list)) {
+ spin_unlock(&node->lock);
+ return NULL;
+ }
+
+ page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list);
+ list_del_init(&page->list);
+ page->flags = 0;
+
+ spin_unlock(&node->lock);
+ atomic_long_dec(&sgx_nr_free_pages);
+
+ return page;
+}
+
+/**
+ * __sgx_alloc_epc_page() - Allocate an EPC page
+ *
+ * Iterate through NUMA nodes and reserve a free EPC page for the caller. Start
+ * from the NUMA node where the caller is executing.
+ *
+ * Return:
+ * - an EPC page: Free EPC pages were available.
+ * - ERR_PTR(-ENOMEM): Out of EPC pages.
+ */
+struct sgx_epc_page *__sgx_alloc_epc_page(void)
+{
+ struct sgx_epc_page *page;
+ int nid_of_current = numa_node_id();
+ int nid_start, nid;
+
+ /*
+ * Try local node first. If it doesn't have an EPC section,
+ * fall back to the non-local NUMA nodes.
+ */
+ if (node_isset(nid_of_current, sgx_numa_mask))
+ nid_start = nid_of_current;
+ else
+ nid_start = next_node_in(nid_of_current, sgx_numa_mask);
+
+ nid = nid_start;
+ do {
+ page = __sgx_alloc_epc_page_from_node(nid);
+ if (page)
+ return page;
+
+ nid = next_node_in(nid, sgx_numa_mask);
+ } while (nid != nid_start);
+
+ return ERR_PTR(-ENOMEM);
+}
+
+/**
+ * sgx_mark_page_reclaimable() - Mark a page as reclaimable
+ * @page: EPC page
+ *
+ * Mark a page as reclaimable and add it to the active page list. Pages
+ * are automatically removed from the active list when freed.
+ */
+void sgx_mark_page_reclaimable(struct sgx_epc_page *page)
+{
+ spin_lock(&sgx_reclaimer_lock);
+ page->flags |= SGX_EPC_PAGE_RECLAIMER_TRACKED;
+ list_add_tail(&page->list, &sgx_active_page_list);
+ spin_unlock(&sgx_reclaimer_lock);
+}
+
+/**
+ * sgx_unmark_page_reclaimable() - Remove a page from the reclaim list
+ * @page: EPC page
+ *
+ * Clear the reclaimable flag and remove the page from the active page list.
+ *
+ * Return:
+ * 0 on success,
+ * -EBUSY if the page is in the process of being reclaimed
+ */
+int sgx_unmark_page_reclaimable(struct sgx_epc_page *page)
+{
+ spin_lock(&sgx_reclaimer_lock);
+ if (page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED) {
+ /* The page is being reclaimed. */
+ if (list_empty(&page->list)) {
+ spin_unlock(&sgx_reclaimer_lock);
+ return -EBUSY;
+ }
+
+ list_del(&page->list);
+ page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
+ }
+ spin_unlock(&sgx_reclaimer_lock);
+
+ return 0;
+}
+
+/**
+ * sgx_alloc_epc_page() - Allocate an EPC page
+ * @owner: the owner of the EPC page
+ * @reclaim: reclaim pages if necessary
+ *
+ * Iterate through EPC sections and borrow a free EPC page for the caller. When
+ * a page is no longer needed it must be released with sgx_free_epc_page(). If
+ * @reclaim is set to true, directly reclaim pages when we are out of them. No
+ * mm locks may be held when @reclaim is set to true.
+ *
+ * Finally, wake up ksgxd when the number of free pages goes below the
+ * watermark before returning to the caller.
+ *
+ * Return:
+ * an EPC page,
+ * -errno on error
+ */
+struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim)
+{
+ struct sgx_epc_page *page;
+
+ for ( ; ; ) {
+ page = __sgx_alloc_epc_page();
+ if (!IS_ERR(page)) {
+ page->owner = owner;
+ break;
+ }
+
+ if (list_empty(&sgx_active_page_list))
+ return ERR_PTR(-ENOMEM);
+
+ if (!reclaim) {
+ page = ERR_PTR(-EBUSY);
+ break;
+ }
+
+ if (signal_pending(current)) {
+ page = ERR_PTR(-ERESTARTSYS);
+ break;
+ }
+
+ sgx_reclaim_pages();
+ cond_resched();
+ }
+
+ if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
+ wake_up(&ksgxd_waitq);
+
+ return page;
+}
+
+/**
+ * sgx_free_epc_page() - Free an EPC page
+ * @page: an EPC page
+ *
+ * Put the EPC page back to the list of free pages. It's the caller's
+ * responsibility to make sure that the page is in uninitialized state. In other
+ * words, do EREMOVE, EWB or whatever operation is necessary before calling
+ * this function.
+ */
+void sgx_free_epc_page(struct sgx_epc_page *page)
+{
+ struct sgx_epc_section *section = &sgx_epc_sections[page->section];
+ struct sgx_numa_node *node = section->node;
+
+ spin_lock(&node->lock);
+
+ page->owner = NULL;
+ if (page->poison)
+ list_add(&page->list, &node->sgx_poison_page_list);
+ else
+ list_add_tail(&page->list, &node->free_page_list);
+ page->flags = SGX_EPC_PAGE_IS_FREE;
+
+ spin_unlock(&node->lock);
+ atomic_long_inc(&sgx_nr_free_pages);
+}
+
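+/*
+ * Map one EPC section into the kernel address space, allocate its page
+ * tracking array, record its physical range for later lookup and queue every
+ * page on the dirty list so that it is sanitized (EREMOVE'd) before first use.
+ */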
+static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
+ unsigned long index,
+ struct sgx_epc_section *section)
+{
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+ unsigned long i;
+
+ section->virt_addr = memremap(phys_addr, size, MEMREMAP_WB);
+ if (!section->virt_addr)
+ return false;
+
+ section->pages = vmalloc_array(nr_pages, sizeof(struct sgx_epc_page));
+ if (!section->pages) {
+ memunmap(section->virt_addr);
+ return false;
+ }
+
+ section->phys_addr = phys_addr;
+ xa_store_range(&sgx_epc_address_space, section->phys_addr,
+ phys_addr + size - 1, section, GFP_KERNEL);
+
+ for (i = 0; i < nr_pages; i++) {
+ section->pages[i].section = index;
+ section->pages[i].flags = 0;
+ section->pages[i].owner = NULL;
+ section->pages[i].poison = 0;
+ list_add_tail(&section->pages[i].list, &sgx_dirty_page_list);
+ }
+
+ return true;
+}
+
+bool arch_is_platform_page(u64 paddr)
+{
+ return !!xa_load(&sgx_epc_address_space, paddr);
+}
+EXPORT_SYMBOL_GPL(arch_is_platform_page);
+
+static struct sgx_epc_page *sgx_paddr_to_page(u64 paddr)
+{
+ struct sgx_epc_section *section;
+
+ section = xa_load(&sgx_epc_address_space, paddr);
+ if (!section)
+ return NULL;
+
+ return &section->pages[PFN_DOWN(paddr - section->phys_addr)];
+}
+
+/*
+ * Called in process context to handle a hardware reported
+ * error in an SGX EPC page.
+ * If the MF_ACTION_REQUIRED bit is set in flags, then the
+ * context is the task that consumed the poison data. Otherwise
+ * this is called from a kernel thread unrelated to the page.
+ */
+int arch_memory_failure(unsigned long pfn, int flags)
+{
+ struct sgx_epc_page *page = sgx_paddr_to_page(pfn << PAGE_SHIFT);
+ struct sgx_epc_section *section;
+ struct sgx_numa_node *node;
+
+ /*
+ * mm/memory-failure.c calls this routine for all errors
+ * where there isn't a "struct page" for the address. But that
+ * includes other address ranges besides SGX.
+ */
+ if (!page)
+ return -ENXIO;
+
+ /*
+ * If poison was consumed synchronously, send a SIGBUS to
+ * the task. Hardware has already exited the SGX enclave and
+ * will not allow re-entry to an enclave that has a memory
+ * error. The signal may help the task understand why the
+ * enclave is broken.
+ */
+ if (flags & MF_ACTION_REQUIRED)
+ force_sig(SIGBUS);
+
+ section = &sgx_epc_sections[page->section];
+ node = section->node;
+
+ spin_lock(&node->lock);
+
+ /* Already poisoned? Nothing more to do */
+ if (page->poison)
+ goto out;
+
+ page->poison = 1;
+
+ /*
+ * If the page is on a free list, move it to the per-node
+ * poison page list.
+ */
+ if (page->flags & SGX_EPC_PAGE_IS_FREE) {
+ list_move(&page->list, &node->sgx_poison_page_list);
+ goto out;
+ }
+
+ sgx_unmark_page_reclaimable(page);
+
+ /*
+ * TBD: Add additional plumbing to enable pre-emptive
+ * action for asynchronous poison notification. Until
+ * then just hope that the poison:
+ * a) is not accessed - sgx_free_epc_page() will deal with it
+ * when the user gives it back
+ * b) results in a recoverable machine check rather than
+ * a fatal one
+ */
+out:
+ spin_unlock(&node->lock);
+ return 0;
+}
+
+/*
+ * A section metric is concatenated in a way that @low bits 12-31 define the
+ * bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the
+ * metric.
+ */
+static inline u64 __init sgx_calc_section_metric(u64 low, u64 high)
+{
+ return (low & GENMASK_ULL(31, 12)) +
+ ((high & GENMASK_ULL(19, 0)) << 32);
+}
+
+#ifdef CONFIG_NUMA
+static ssize_t sgx_total_bytes_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lu\n", sgx_numa_nodes[dev->id].size);
+}
+static DEVICE_ATTR_RO(sgx_total_bytes);
+
+static umode_t arch_node_attr_is_visible(struct kobject *kobj,
+ struct attribute *attr, int idx)
+{
+ /* Make all x86/ attributes invisible when SGX is not initialized: */
+ if (nodes_empty(sgx_numa_mask))
+ return 0;
+
+ return attr->mode;
+}
+
+static struct attribute *arch_node_dev_attrs[] = {
+ &dev_attr_sgx_total_bytes.attr,
+ NULL,
+};
+
+const struct attribute_group arch_node_dev_group = {
+ .name = "x86",
+ .attrs = arch_node_dev_attrs,
+ .is_visible = arch_node_attr_is_visible,
+};
+
+static void __init arch_update_sysfs_visibility(int nid)
+{
+ struct node *node = node_devices[nid];
+ int ret;
+
+ ret = sysfs_update_group(&node->dev.kobj, &arch_node_dev_group);
+
+ if (ret)
+ pr_err("sysfs update failed (%d), files may be invisible", ret);
+}
+#else /* !CONFIG_NUMA */
+static void __init arch_update_sysfs_visibility(int nid) {}
+#endif
+
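+/*
+ * Enumerate EPC sections from the SGX CPUID leaf (sub-leaves starting at
+ * SGX_CPUID_EPC), map each one and attach it to the NUMA node that backs its
+ * physical address range.
+ */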
+static bool __init sgx_page_cache_init(void)
+{
+ u32 eax, ebx, ecx, edx, type;
+ u64 pa, size;
+ int nid;
+ int i;
+
+ sgx_numa_nodes = kmalloc_array(num_possible_nodes(), sizeof(*sgx_numa_nodes), GFP_KERNEL);
+ if (!sgx_numa_nodes)
+ return false;
+
+ for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) {
+ cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx);
+
+ type = eax & SGX_CPUID_EPC_MASK;
+ if (type == SGX_CPUID_EPC_INVALID)
+ break;
+
+ if (type != SGX_CPUID_EPC_SECTION) {
+ pr_err_once("Unknown EPC section type: %u\n", type);
+ break;
+ }
+
+ pa = sgx_calc_section_metric(eax, ebx);
+ size = sgx_calc_section_metric(ecx, edx);
+
+ pr_info("EPC section 0x%llx-0x%llx\n", pa, pa + size - 1);
+
+ if (!sgx_setup_epc_section(pa, size, i, &sgx_epc_sections[i])) {
+ pr_err("No free memory for an EPC section\n");
+ break;
+ }
+
+ nid = numa_map_to_online_node(phys_to_target_node(pa));
+ if (nid == NUMA_NO_NODE) {
+ /* The physical address is already printed above. */
+ pr_warn(FW_BUG "Unable to map EPC section to online node. Fallback to the NUMA node 0.\n");
+ nid = 0;
+ }
+
+ if (!node_isset(nid, sgx_numa_mask)) {
+ spin_lock_init(&sgx_numa_nodes[nid].lock);
+ INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list);
+ INIT_LIST_HEAD(&sgx_numa_nodes[nid].sgx_poison_page_list);
+ node_set(nid, sgx_numa_mask);
+ sgx_numa_nodes[nid].size = 0;
+
+ /* Make SGX-specific node sysfs files visible: */
+ arch_update_sysfs_visibility(nid);
+ }
+
+ sgx_epc_sections[i].node = &sgx_numa_nodes[nid];
+ sgx_numa_nodes[nid].size += size;
+
+ sgx_nr_epc_sections++;
+ }
+
+ if (!sgx_nr_epc_sections) {
+ pr_err("There are zero EPC sections.\n");
+ return false;
+ }
+
+ for_each_online_node(nid) {
+ if (!node_isset(nid, sgx_numa_mask) &&
+ node_state(nid, N_MEMORY) && node_state(nid, N_CPU))
+ pr_info("node%d has both CPUs and memory but doesn't have an EPC section\n",
+ nid);
+ }
+
+ return true;
+}
+
+/*
+ * Update the SGX_LEPUBKEYHASH MSRs to the values specified by caller.
+ * Bare-metal driver requires to update them to hash of enclave's signer
+ * before EINIT. KVM needs to update them to guest's virtual MSR values
+ * before doing EINIT from guest.
+ */
+void sgx_update_lepubkeyhash(u64 *lepubkeyhash)
+{
+ int i;
+
+ WARN_ON_ONCE(preemptible());
+
+ for (i = 0; i < 4; i++)
+ wrmsrq(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]);
+}
+
+const struct file_operations sgx_provision_fops = {
+ .owner = THIS_MODULE,
+};
+
+static struct miscdevice sgx_dev_provision = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "sgx_provision",
+ .nodename = "sgx_provision",
+ .fops = &sgx_provision_fops,
+};
+
+/**
+ * sgx_set_attribute() - Update allowed attributes given file descriptor
+ * @allowed_attributes: Pointer to allowed enclave attributes
+ * @attribute_fd: File descriptor for specific attribute
+ *
+ * Append enclave attribute indicated by file descriptor to allowed
+ * attributes. Currently only SGX_ATTR_PROVISIONKEY indicated by
+ * /dev/sgx_provision is supported.
+ *
+ * Return:
+ * - 0: SGX_ATTR_PROVISIONKEY is appended to allowed_attributes
+ * - -EINVAL: Invalid or unsupported file descriptor
+ */
+int sgx_set_attribute(unsigned long *allowed_attributes,
+ unsigned int attribute_fd)
+{
+ CLASS(fd, f)(attribute_fd);
+
+ if (fd_empty(f))
+ return -EINVAL;
+
+ if (fd_file(f)->f_op != &sgx_provision_fops)
+ return -EINVAL;
+
+ *allowed_attributes |= SGX_ATTR_PROVISIONKEY;
+ return 0;
+}
+EXPORT_SYMBOL_FOR_KVM(sgx_set_attribute);
+
+/* Counter to count the active SGX users */
+static int sgx_usage_count;
+
+/**
+ * sgx_update_svn() - Attempt to call ENCLS[EUPDATESVN].
+ *
+ * This instruction attempts to update CPUSVN to the
+ * currently loaded microcode update SVN and generate new
+ * cryptographic assets.
+ *
+ * Return:
+ * * %0: - Success or not supported
+ * * %-EAGAIN: - Can be safely retried; failure is due to lack of
+ *              entropy in the RNG
+ * * %-EIO: - Unexpected error, retries are not advisable
+ */
+static int sgx_update_svn(void)
+{
+ int ret;
+
+ /*
+ * If EUPDATESVN is not available, it is ok to
+ * silently skip it to comply with legacy behavior.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_SGX_EUPDATESVN))
+ return 0;
+
+ /*
+ * EPC is guaranteed to be empty when there are no users.
+ * Ensure we are on our first user before proceeding further.
+ */
+ WARN(sgx_usage_count, "Elevated usage count when calling EUPDATESVN\n");
+
+ for (int i = 0; i < RDRAND_RETRY_LOOPS; i++) {
+ ret = __eupdatesvn();
+
+ /* Stop on success or unexpected errors: */
+ if (ret != SGX_INSUFFICIENT_ENTROPY)
+ break;
+ }
+
+ switch (ret) {
+ case 0:
+ /*
+ * SVN successfully updated.
+ * Let users know when the update was successful.
+ */
+ pr_info("SVN updated successfully\n");
+ return 0;
+ case SGX_NO_UPDATE:
+ /*
+ * SVN update failed since the current SVN is
+ * not newer than CPUSVN. This is the most
+ * common case and indicates no harm.
+ */
+ return 0;
+ case SGX_INSUFFICIENT_ENTROPY:
+ /*
+ * SVN update failed due to lack of entropy in DRNG.
+ * Indicate to userspace that it should retry.
+ */
+ return -EAGAIN;
+ default:
+ break;
+ }
+
+ /*
+ * EUPDATESVN was called when EPC is empty, all other error
+ * codes are unexpected.
+ */
+ ENCLS_WARN(ret, "EUPDATESVN");
+ return -EIO;
+}
+
+/* Mutex to ensure no concurrent EPC accesses during EUPDATESVN */
+static DEFINE_MUTEX(sgx_svn_lock);
+
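+/*
+ * Take a reference on the global SGX usage count. The first user triggers an
+ * EUPDATESVN attempt while the EPC is still guaranteed to be empty.
+ */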
+int sgx_inc_usage_count(void)
+{
+ int ret;
+
+ guard(mutex)(&sgx_svn_lock);
+
+ if (!sgx_usage_count) {
+ ret = sgx_update_svn();
+ if (ret)
+ return ret;
+ }
+
+ sgx_usage_count++;
+
+ return 0;
+}
+
+void sgx_dec_usage_count(void)
+{
+ guard(mutex)(&sgx_svn_lock);
+ sgx_usage_count--;
+}
+
+static int __init sgx_init(void)
+{
+ int ret;
+ int i;
+
+ if (!cpu_feature_enabled(X86_FEATURE_SGX))
+ return -ENODEV;
+
+ if (!sgx_page_cache_init())
+ return -ENOMEM;
+
+ if (!sgx_page_reclaimer_init()) {
+ ret = -ENOMEM;
+ goto err_page_cache;
+ }
+
+ ret = misc_register(&sgx_dev_provision);
+ if (ret)
+ goto err_kthread;
+
+ /*
+ * Always try to initialize the native *and* KVM drivers.
+ * The KVM driver is less picky than the native one and
+ * can function if the native one is not supported on the
+ * current system or fails to initialize.
+ *
+ * Error out only if both fail to initialize.
+ */
+ ret = sgx_drv_init();
+
+ if (sgx_vepc_init() && ret)
+ goto err_provision;
+
+ return 0;
+
+err_provision:
+ misc_deregister(&sgx_dev_provision);
+
+err_kthread:
+ kthread_stop(ksgxd_tsk);
+
+err_page_cache:
+ for (i = 0; i < sgx_nr_epc_sections; i++) {
+ vfree(sgx_epc_sections[i].pages);
+ memunmap(sgx_epc_sections[i].virt_addr);
+ }
+
+ return ret;
+}
+
+device_initcall(sgx_init);
diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
new file mode 100644
index 000000000000..f5940393d9bd
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/sgx.h
@@ -0,0 +1,110 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _X86_SGX_H
+#define _X86_SGX_H
+
+#include <linux/bitops.h>
+#include <linux/err.h>
+#include <linux/io.h>
+#include <linux/rwsem.h>
+#include <linux/types.h>
+#include <asm/asm.h>
+#include <asm/sgx.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt) "sgx: " fmt
+
+#define EREMOVE_ERROR_MESSAGE \
+ "EREMOVE returned %d (0x%x) and an EPC page was leaked. SGX may become unusable. " \
+ "Refer to Documentation/arch/x86/sgx.rst for more information."
+
+#define SGX_MAX_EPC_SECTIONS 8
+#define SGX_EEXTEND_BLOCK_SIZE 256
+#define SGX_NR_TO_SCAN 16
+#define SGX_NR_LOW_PAGES 32
+#define SGX_NR_HIGH_PAGES 64
+
+/* Pages, which are being tracked by the page reclaimer. */
+#define SGX_EPC_PAGE_RECLAIMER_TRACKED BIT(0)
+
+/* Pages on free list */
+#define SGX_EPC_PAGE_IS_FREE BIT(1)
+
+struct sgx_epc_page {
+ unsigned int section;
+ u16 flags;
+ u16 poison;
+ struct sgx_encl_page *owner;
+ struct list_head list;
+};
+
+/*
+ * Contains the tracking data for NUMA nodes having EPC pages. Most importantly,
+ * the free page list local to the node is stored here.
+ */
+struct sgx_numa_node {
+ struct list_head free_page_list;
+ struct list_head sgx_poison_page_list;
+ unsigned long size;
+ spinlock_t lock;
+};
+
+/*
+ * The firmware can define multiple chunks of EPC in different areas of
+ * physical memory, e.g. for the memory areas of each node. This structure is
+ * used to store the EPC pages of one EPC section and the virtual memory area
+ * where the pages have been mapped.
+ */
+struct sgx_epc_section {
+ unsigned long phys_addr;
+ void *virt_addr;
+ struct sgx_epc_page *pages;
+ struct sgx_numa_node *node;
+};
+
+extern struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
+
+static inline unsigned long sgx_get_epc_phys_addr(struct sgx_epc_page *page)
+{
+ struct sgx_epc_section *section = &sgx_epc_sections[page->section];
+ unsigned long index;
+
+ index = ((unsigned long)page - (unsigned long)section->pages) / sizeof(*page);
+
+ return section->phys_addr + index * PAGE_SIZE;
+}
+
+static inline void *sgx_get_epc_virt_addr(struct sgx_epc_page *page)
+{
+ struct sgx_epc_section *section = &sgx_epc_sections[page->section];
+ unsigned long index;
+
+ index = ((unsigned long)page - (unsigned long)section->pages) / sizeof(*page);
+
+ return section->virt_addr + index * PAGE_SIZE;
+}
+
+struct sgx_epc_page *__sgx_alloc_epc_page(void);
+void sgx_free_epc_page(struct sgx_epc_page *page);
+
+void sgx_reclaim_direct(void);
+void sgx_mark_page_reclaimable(struct sgx_epc_page *page);
+int sgx_unmark_page_reclaimable(struct sgx_epc_page *page);
+struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim);
+
+void sgx_ipi_cb(void *info);
+
+#ifdef CONFIG_X86_SGX_KVM
+int __init sgx_vepc_init(void);
+#else
+static inline int __init sgx_vepc_init(void)
+{
+ return -ENODEV;
+}
+#endif
+
+int sgx_inc_usage_count(void);
+void sgx_dec_usage_count(void);
+
+void sgx_update_lepubkeyhash(u64 *lepubkeyhash);
+
+#endif /* _X86_SGX_H */
diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
new file mode 100644
index 000000000000..8de1f1a755f2
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/virt.c
@@ -0,0 +1,454 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Device driver to expose SGX enclave memory to KVM guests.
+ *
+ * Copyright(c) 2021 Intel Corporation.
+ */
+
+#include <linux/kvm_types.h>
+#include <linux/miscdevice.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/slab.h>
+#include <linux/xarray.h>
+#include <asm/sgx.h>
+#include <uapi/asm/sgx.h>
+
+#include "encls.h"
+#include "sgx.h"
+
+struct sgx_vepc {
+ struct xarray page_array;
+ struct mutex lock;
+};
+
+/*
+ * Temporary SECS pages that cannot be EREMOVE'd due to having children in
+ * other virtual EPC instances, and the lock protecting the list.
+ */
+static struct mutex zombie_secs_pages_lock;
+static struct list_head zombie_secs_pages;
+
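+/*
+ * Resolve a fault in a virtual EPC mapping: reuse the EPC page already
+ * recorded for this offset, or allocate a fresh one, store it in the
+ * per-instance xarray and insert its PFN into the VMA.
+ */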
+static int __sgx_vepc_fault(struct sgx_vepc *vepc,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ struct sgx_epc_page *epc_page;
+ unsigned long index, pfn;
+ int ret;
+
+ WARN_ON(!mutex_is_locked(&vepc->lock));
+
+ /* Calculate index of EPC page in virtual EPC's page_array */
+ index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start);
+
+ epc_page = xa_load(&vepc->page_array, index);
+ if (epc_page)
+ return 0;
+
+ epc_page = sgx_alloc_epc_page(vepc, false);
+ if (IS_ERR(epc_page))
+ return PTR_ERR(epc_page);
+
+ ret = xa_err(xa_store(&vepc->page_array, index, epc_page, GFP_KERNEL));
+ if (ret)
+ goto err_free;
+
+ pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page));
+
+ ret = vmf_insert_pfn(vma, addr, pfn);
+ if (ret != VM_FAULT_NOPAGE) {
+ ret = -EFAULT;
+ goto err_delete;
+ }
+
+ return 0;
+
+err_delete:
+ xa_erase(&vepc->page_array, index);
+err_free:
+ sgx_free_epc_page(epc_page);
+ return ret;
+}
+
+static vm_fault_t sgx_vepc_fault(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct sgx_vepc *vepc = vma->vm_private_data;
+ int ret;
+
+ mutex_lock(&vepc->lock);
+ ret = __sgx_vepc_fault(vepc, vma, vmf->address);
+ mutex_unlock(&vepc->lock);
+
+ if (!ret)
+ return VM_FAULT_NOPAGE;
+
+ if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) {
+ mmap_read_unlock(vma->vm_mm);
+ return VM_FAULT_RETRY;
+ }
+
+ return VM_FAULT_SIGBUS;
+}
+
+static const struct vm_operations_struct sgx_vepc_vm_ops = {
+ .fault = sgx_vepc_fault,
+};
+
+static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct sgx_vepc *vepc = file->private_data;
+
+ if (!(vma->vm_flags & VM_SHARED))
+ return -EINVAL;
+
+ vma->vm_ops = &sgx_vepc_vm_ops;
+ /* Don't copy VMA in fork() */
+ vm_flags_set(vma, VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY);
+ vma->vm_private_data = vepc;
+
+ return 0;
+}
+
+static int sgx_vepc_remove_page(struct sgx_epc_page *epc_page)
+{
+ /*
+ * Take a previously guest-owned EPC page and return it to the
+ * general EPC page pool.
+ *
+ * Guests can not be trusted to have left this page in a good
+ * state, so run EREMOVE on the page unconditionally. In the
+ * case that a guest properly EREMOVE'd this page, a superfluous
+ * EREMOVE is harmless.
+ */
+ return __eremove(sgx_get_epc_virt_addr(epc_page));
+}
+
+static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
+{
+ int ret = sgx_vepc_remove_page(epc_page);
+ if (ret) {
+ /*
+ * Only SGX_CHILD_PRESENT is expected, which happens when
+ * EREMOVE'ing an SECS page that still has children. That case
+ * is handled by EREMOVE'ing the SECS again after all pages in
+ * the virtual EPC have been EREMOVE'd. See comments below in
+ * sgx_vepc_release().
+ *
+ * The user of virtual EPC (KVM) needs to guarantee that no
+ * logical processor is still running in the enclave in the
+ * guest, otherwise EREMOVE will get SGX_ENCLAVE_ACT which
+ * cannot be handled here.
+ */
+ WARN_ONCE(ret != SGX_CHILD_PRESENT, EREMOVE_ERROR_MESSAGE,
+ ret, ret);
+ return ret;
+ }
+
+ sgx_free_epc_page(epc_page);
+ return 0;
+}
+
+static long sgx_vepc_remove_all(struct sgx_vepc *vepc)
+{
+ struct sgx_epc_page *entry;
+ unsigned long index;
+ long failures = 0;
+
+ xa_for_each(&vepc->page_array, index, entry) {
+ int ret = sgx_vepc_remove_page(entry);
+ if (ret) {
+ if (ret == SGX_CHILD_PRESENT) {
+ /* The page is a SECS, userspace will retry. */
+ failures++;
+ } else {
+ /*
+ * Report errors due to #GP or SGX_ENCLAVE_ACT; do not
+ * WARN, as userspace can induce said failures by
+ * calling the ioctl concurrently on multiple vEPCs or
+ * while one or more CPUs is running the enclave. Only
+ * a #PF on EREMOVE indicates a kernel/hardware issue.
+ */
+ WARN_ON_ONCE(encls_faulted(ret) &&
+ ENCLS_TRAPNR(ret) != X86_TRAP_GP);
+ return -EBUSY;
+ }
+ }
+ cond_resched();
+ }
+
+ /*
+ * Return the number of SECS pages that failed to be removed, so
+ * userspace knows that it has to retry.
+ */
+ return failures;
+}
+
+static int sgx_vepc_release(struct inode *inode, struct file *file)
+{
+ struct sgx_vepc *vepc = file->private_data;
+ struct sgx_epc_page *epc_page, *tmp, *entry;
+ unsigned long index;
+
+ LIST_HEAD(secs_pages);
+
+ xa_for_each(&vepc->page_array, index, entry) {
+ /*
+ * Remove all normal, child pages. sgx_vepc_free_page()
+ * will fail if EREMOVE fails, but this is OK and expected on
+ * SECS pages. Those can only be EREMOVE'd *after* all their
+ * child pages. Retries below will clean them up.
+ */
+ if (sgx_vepc_free_page(entry))
+ continue;
+
+ xa_erase(&vepc->page_array, index);
+ cond_resched();
+ }
+
+ /*
+ * Retry EREMOVE'ing pages. This will clean up any SECS pages that
+ * only had children in this 'epc' area.
+ */
+ xa_for_each(&vepc->page_array, index, entry) {
+ epc_page = entry;
+ /*
+ * An EREMOVE failure here means that the SECS page still
+ * has children. But, since all children in this 'sgx_vepc'
+ * have been removed, the SECS page must have a child on
+ * another instance.
+ */
+ if (sgx_vepc_free_page(epc_page))
+ list_add_tail(&epc_page->list, &secs_pages);
+
+ xa_erase(&vepc->page_array, index);
+ cond_resched();
+ }
+
+ /*
+ * SECS pages are "pinned" by child pages, and "unpinned" once all
+ * children have been EREMOVE'd. A child page in this instance
+ * may have pinned an SECS page encountered in an earlier release(),
+ * creating a zombie. Since some children were EREMOVE'd above,
+ * try to EREMOVE all zombies in the hopes that one was unpinned.
+ */
+ mutex_lock(&zombie_secs_pages_lock);
+ list_for_each_entry_safe(epc_page, tmp, &zombie_secs_pages, list) {
+ /*
+ * Speculatively remove the page from the list of zombies,
+ * if the page is successfully EREMOVE'd it will be added to
+ * the list of free pages. If EREMOVE fails, throw the page
+ * on the local list, which will be spliced on at the end.
+ */
+ list_del(&epc_page->list);
+
+ if (sgx_vepc_free_page(epc_page))
+ list_add_tail(&epc_page->list, &secs_pages);
+ cond_resched();
+ }
+
+ if (!list_empty(&secs_pages))
+ list_splice_tail(&secs_pages, &zombie_secs_pages);
+ mutex_unlock(&zombie_secs_pages_lock);
+
+ xa_destroy(&vepc->page_array);
+ kfree(vepc);
+
+ sgx_dec_usage_count();
+ return 0;
+}
+
+static int __sgx_vepc_open(struct inode *inode, struct file *file)
+{
+ struct sgx_vepc *vepc;
+
+ vepc = kzalloc(sizeof(struct sgx_vepc), GFP_KERNEL);
+ if (!vepc)
+ return -ENOMEM;
+ mutex_init(&vepc->lock);
+ xa_init(&vepc->page_array);
+
+ file->private_data = vepc;
+
+ return 0;
+}
+
+static int sgx_vepc_open(struct inode *inode, struct file *file)
+{
+ int ret;
+
+ ret = sgx_inc_usage_count();
+ if (ret)
+ return ret;
+
+ ret = __sgx_vepc_open(inode, file);
+ if (ret) {
+ sgx_dec_usage_count();
+ return ret;
+ }
+
+ return 0;
+}
+
+static long sgx_vepc_ioctl(struct file *file,
+ unsigned int cmd, unsigned long arg)
+{
+ struct sgx_vepc *vepc = file->private_data;
+
+ switch (cmd) {
+ case SGX_IOC_VEPC_REMOVE_ALL:
+ if (arg)
+ return -EINVAL;
+ return sgx_vepc_remove_all(vepc);
+
+ default:
+ return -ENOTTY;
+ }
+}
+
+static const struct file_operations sgx_vepc_fops = {
+ .owner = THIS_MODULE,
+ .open = sgx_vepc_open,
+ .unlocked_ioctl = sgx_vepc_ioctl,
+ .compat_ioctl = sgx_vepc_ioctl,
+ .release = sgx_vepc_release,
+ .mmap = sgx_vepc_mmap,
+};
+
+static struct miscdevice sgx_vepc_dev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "sgx_vepc",
+ .nodename = "sgx_vepc",
+ .fops = &sgx_vepc_fops,
+};
+
+int __init sgx_vepc_init(void)
+{
+ /* SGX virtualization requires KVM to work */
+ if (!cpu_feature_enabled(X86_FEATURE_VMX))
+ return -ENODEV;
+
+ INIT_LIST_HEAD(&zombie_secs_pages);
+ mutex_init(&zombie_secs_pages_lock);
+
+ return misc_register(&sgx_vepc_dev);
+}
+
+/**
+ * sgx_virt_ecreate() - Run ECREATE on behalf of guest
+ * @pageinfo: Pointer to PAGEINFO structure
+ * @secs: Userspace pointer to SECS page
+ * @trapnr: trap number injected to guest in case of ECREATE error
+ *
+ * Run ECREATE on behalf of guest after KVM traps ECREATE for the purpose
+ * of enforcing policies of guest's enclaves, and return the trap number
+ * which should be injected to guest in case of any ECREATE error.
+ *
+ * Return:
+ * - 0: ECREATE was successful.
+ * - <0: on error.
+ */
+int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,
+ int *trapnr)
+{
+ int ret;
+
+ /*
+ * @secs is an untrusted, userspace-provided address. It comes from
+ * KVM and is assumed to be a valid pointer which points somewhere in
+ * userspace. This can fault and invoke SGX or other fault handlers when
+ * the userspace mapping of @secs doesn't exist.
+ *
+ * Add a WARN() to make sure @secs is already valid userspace pointer
+ * from caller (KVM), who should already have handled invalid pointer
+ * case (for instance, made by malicious guest). All other checks,
+ * such as alignment of @secs, are deferred to ENCLS itself.
+ */
+ if (WARN_ON_ONCE(!access_ok(secs, PAGE_SIZE)))
+ return -EINVAL;
+
+ __uaccess_begin();
+ ret = __ecreate(pageinfo, (void *)secs);
+ __uaccess_end();
+
+ if (encls_faulted(ret)) {
+ *trapnr = ENCLS_TRAPNR(ret);
+ return -EFAULT;
+ }
+
+ /* ECREATE doesn't return an error code, it faults or succeeds. */
+ WARN_ON_ONCE(ret);
+ return 0;
+}
+EXPORT_SYMBOL_FOR_KVM(sgx_virt_ecreate);
+
+static int __sgx_virt_einit(void __user *sigstruct, void __user *token,
+ void __user *secs)
+{
+ int ret;
+
+ /*
+ * Make sure all userspace pointers from caller (KVM) are valid.
+ * All other checks deferred to ENCLS itself. Also see comment
+ * for @secs in sgx_virt_ecreate().
+ */
+#define SGX_EINITTOKEN_SIZE 304
+ if (WARN_ON_ONCE(!access_ok(sigstruct, sizeof(struct sgx_sigstruct)) ||
+ !access_ok(token, SGX_EINITTOKEN_SIZE) ||
+ !access_ok(secs, PAGE_SIZE)))
+ return -EINVAL;
+
+ __uaccess_begin();
+ ret = __einit((void *)sigstruct, (void *)token, (void *)secs);
+ __uaccess_end();
+
+ return ret;
+}
+
+/**
+ * sgx_virt_einit() - Run EINIT on behalf of guest
+ * @sigstruct: Userspace pointer to SIGSTRUCT structure
+ * @token: Userspace pointer to EINITTOKEN structure
+ * @secs: Userspace pointer to SECS page
+ * @lepubkeyhash: Pointer to guest's *virtual* SGX_LEPUBKEYHASH MSR values
+ * @trapnr: trap number injected to guest in case of EINIT error
+ *
+ * Run EINIT on behalf of the guest after KVM traps EINIT. If SGX_LC is
+ * available in the host, the SGX driver may rewrite the hardware values at
+ * will; therefore KVM needs to update the hardware values to the guest's
+ * virtual MSR values to ensure EINIT is executed with the expected values.
+ *
+ * Return:
+ * - 0: EINIT was successful.
+ * - <0: on error.
+ */
+int sgx_virt_einit(void __user *sigstruct, void __user *token,
+ void __user *secs, u64 *lepubkeyhash, int *trapnr)
+{
+ int ret;
+
+ if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) {
+ ret = __sgx_virt_einit(sigstruct, token, secs);
+ } else {
+ preempt_disable();
+
+ sgx_update_lepubkeyhash(lepubkeyhash);
+
+ ret = __sgx_virt_einit(sigstruct, token, secs);
+ preempt_enable();
+ }
+
+ /* Propagate up the error from the WARN_ON_ONCE in __sgx_virt_einit() */
+ if (ret == -EINVAL)
+ return ret;
+
+ if (encls_faulted(ret)) {
+ *trapnr = ENCLS_TRAPNR(ret);
+ return -EFAULT;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_FOR_KVM(sgx_virt_einit);
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
new file mode 100644
index 000000000000..f55ea3cdbf88
--- /dev/null
+++ b/arch/x86/kernel/cpu/topology.c
@@ -0,0 +1,581 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * CPU/APIC topology
+ *
+ * The APIC IDs describe the system topology in multiple domain levels.
+ * The CPUID topology parser provides the information which part of the
+ * APIC ID is associated to the individual levels:
+ *
+ * [PACKAGE][DIEGRP][DIE][TILE][MODULE][CORE][THREAD]
+ *
+ * The root space contains the package (socket) IDs.
+ *
+ * Levels that are not enumerated consume 0 bits of space, but conceptually
+ * they are always represented. If e.g. only the CORE and THREAD levels are
+ * enumerated then DIE, MODULE and TILE have the same physical ID as PACKAGE.
+ *
+ * If SMT is not supported, then the THREAD domain is still used. It then
+ * has the same physical ID as the CORE domain and is the only child of
+ * the core domain.
+ *
+ * This allows a unified view on the system independent of the enumerated
+ * domain levels without requiring any conditionals in the code.
+ */
+#define pr_fmt(fmt) "CPU topo: " fmt
+#include <linux/cpu.h>
+
+#include <xen/xen.h>
+
+#include <asm/apic.h>
+#include <asm/hypervisor.h>
+#include <asm/io_apic.h>
+#include <asm/mpspec.h>
+#include <asm/msr.h>
+#include <asm/smp.h>
+
+#include "cpu.h"
+
+/*
+ * Map cpu index to physical APIC ID
+ */
+DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_apicid, BAD_APICID);
+DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid, CPU_ACPIID_INVALID);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid);
+
+/* Bitmap of physically present CPUs. */
+DECLARE_BITMAP(phys_cpu_present_map, MAX_LOCAL_APIC) __read_mostly;
+
+/* Used for CPU number allocation and parallel CPU bringup */
+u32 cpuid_to_apicid[] __ro_after_init = { [0 ... NR_CPUS - 1] = BAD_APICID, };
+
+/* Bitmaps to mark registered APICs at each topology domain */
+static struct { DECLARE_BITMAP(map, MAX_LOCAL_APIC); } apic_maps[TOPO_MAX_DOMAIN] __ro_after_init;
+
+/*
+ * Keep track of assigned, disabled and rejected CPUs. nr_assigned_cpus is
+ * preset to 1 as CPU #0 is reserved for the boot CPU.
+ */
+static struct {
+ unsigned int nr_assigned_cpus;
+ unsigned int nr_disabled_cpus;
+ unsigned int nr_rejected_cpus;
+ u32 boot_cpu_apic_id;
+ u32 real_bsp_apic_id;
+} topo_info __ro_after_init = {
+ .nr_assigned_cpus = 1,
+ .boot_cpu_apic_id = BAD_APICID,
+ .real_bsp_apic_id = BAD_APICID,
+};
+
+#define domain_weight(_dom) bitmap_weight(apic_maps[_dom].map, MAX_LOCAL_APIC)
+
+bool arch_match_cpu_phys_id(int cpu, u64 phys_id)
+{
+ return phys_id == (u64)cpuid_to_apicid[cpu];
+}
+
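+/*
+ * A CPU is the primary thread of its core when the SMT bits of its APIC ID
+ * are all zero.
+ */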
+static void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid)
+{
+ if (!(apicid & (__max_threads_per_core - 1)))
+ cpumask_set_cpu(cpu, &__cpu_primary_thread_mask);
+}
+
+/*
+ * Convert the APIC ID to a domain level ID by masking out the low bits
+ * below the domain level @dom.
+ */
+static inline u32 topo_apicid(u32 apicid, enum x86_topology_domains dom)
+{
+ if (dom == TOPO_SMT_DOMAIN)
+ return apicid;
+ return apicid & (UINT_MAX << x86_topo_system.dom_shifts[dom - 1]);
+}
+
+static int topo_lookup_cpuid(u32 apic_id)
+{
+ int i;
+
+ /* CPU# to APICID mapping is persistent once it is established */
+ for (i = 0; i < topo_info.nr_assigned_cpus; i++) {
+ if (cpuid_to_apicid[i] == apic_id)
+ return i;
+ }
+ return -ENODEV;
+}
+
+static __init int topo_get_cpunr(u32 apic_id)
+{
+ int cpu = topo_lookup_cpuid(apic_id);
+
+ if (cpu >= 0)
+ return cpu;
+
+ return topo_info.nr_assigned_cpus++;
+}
+
+static void topo_set_cpuids(unsigned int cpu, u32 apic_id, u32 acpi_id)
+{
+#if defined(CONFIG_SMP) || defined(CONFIG_X86_64)
+ early_per_cpu(x86_cpu_to_apicid, cpu) = apic_id;
+ early_per_cpu(x86_cpu_to_acpiid, cpu) = acpi_id;
+#endif
+ set_cpu_present(cpu, true);
+}
+
+static __init bool check_for_real_bsp(u32 apic_id)
+{
+ bool is_bsp = false, has_apic_base = boot_cpu_data.x86 >= 6;
+ u64 msr;
+
+ /*
+ * There is no really good way to detect whether this is a kdump()
+ * kernel, but except on the Voyager SMP monstrosity, which is no
+ * longer supported, the real BSP APIC ID is the first one which is
+ * enumerated by firmware. That allows detecting whether the boot
+ * CPU is the real BSP. If it is not, then do not register the APIC
+ * because sending INIT to the real BSP would reset the whole
+ * system.
+ *
+ * The first APIC ID which is enumerated by firmware is detectable
+ * because the boot CPU APIC ID is registered before that without
+ * invoking this code.
+ */
+ if (topo_info.real_bsp_apic_id != BAD_APICID)
+ return false;
+
+ /*
+ * Check whether the enumeration order is broken by evaluating the
+ * BSP bit in the APICBASE MSR. If the CPU does not have the
+ * APICBASE MSR then the BSP detection is not possible and the
+ * kernel must rely on the firmware enumeration order.
+ */
+ if (has_apic_base) {
+ rdmsrq(MSR_IA32_APICBASE, msr);
+ is_bsp = !!(msr & MSR_IA32_APICBASE_BSP);
+ }
+
+ if (apic_id == topo_info.boot_cpu_apic_id) {
+ /*
+ * If the boot CPU has the APIC BSP bit set then the
+ * firmware enumeration is agreeing. If the CPU does not
+ * have the APICBASE MSR then the only choice is to trust
+ * the enumeration order.
+ */
+ if (is_bsp || !has_apic_base) {
+ topo_info.real_bsp_apic_id = apic_id;
+ return false;
+ }
+ /*
+ * If the boot APIC is enumerated first, but the APICBASE
+ * MSR does not have the BSP bit set, then there is no way
+ * to discover the real BSP here. Assume a crash kernel and
+ * limit the number of CPUs to 1 as an INIT to the real BSP
+ * would reset the machine.
+ */
+ pr_warn("Enumerated BSP APIC %x is not marked in APICBASE MSR\n", apic_id);
+ pr_warn("Assuming crash kernel. Limiting to one CPU to prevent machine INIT\n");
+ set_nr_cpu_ids(1);
+ goto fwbug;
+ }
+
+ pr_warn("Boot CPU APIC ID not the first enumerated APIC ID: %x != %x\n",
+ topo_info.boot_cpu_apic_id, apic_id);
+
+ if (is_bsp) {
+ /*
+ * The boot CPU has the APIC BSP bit set. Use it and complain
+ * about the broken firmware enumeration.
+ */
+ topo_info.real_bsp_apic_id = topo_info.boot_cpu_apic_id;
+ goto fwbug;
+ }
+
+ pr_warn("Crash kernel detected. Disabling real BSP to prevent machine INIT\n");
+
+ topo_info.real_bsp_apic_id = apic_id;
+ return true;
+
+fwbug:
+ pr_warn(FW_BUG "APIC enumeration order not specification compliant\n");
+ return false;
+}
+
+static unsigned int topo_unit_count(u32 lvlid, enum x86_topology_domains at_level,
+ unsigned long *map)
+{
+ unsigned int id, end, cnt = 0;
+
+ /* Calculate the exclusive end */
+ end = lvlid + (1U << x86_topo_system.dom_shifts[at_level]);
+
+ /* Unfortunately there is no bitmap_weight_range() */
+ for (id = find_next_bit(map, end, lvlid); id < end; id = find_next_bit(map, end, ++id))
+ cnt++;
+ return cnt;
+}
+
+static __init void topo_register_apic(u32 apic_id, u32 acpi_id, bool present)
+{
+ int cpu, dom;
+
+ if (present) {
+ set_bit(apic_id, phys_cpu_present_map);
+
+ /*
+ * Double registration is valid in case of the boot CPU
+ * APIC because that is registered before the enumeration
+ * of the APICs via firmware parsers or VM guest
+ * mechanisms.
+ */
+ if (apic_id == topo_info.boot_cpu_apic_id)
+ cpu = 0;
+ else
+ cpu = topo_get_cpunr(apic_id);
+
+ cpuid_to_apicid[cpu] = apic_id;
+ topo_set_cpuids(cpu, apic_id, acpi_id);
+ } else {
+ u32 pkgid = topo_apicid(apic_id, TOPO_PKG_DOMAIN);
+
+ /*
+ * Check for present APICs in the same package when running
+ * on bare metal. Allow the bogosity in a guest.
+ */
+ if (hypervisor_is_type(X86_HYPER_NATIVE) &&
+ topo_unit_count(pkgid, TOPO_PKG_DOMAIN, phys_cpu_present_map)) {
+ pr_info_once("Ignoring hot-pluggable APIC ID %x in present package.\n",
+ apic_id);
+ topo_info.nr_rejected_cpus++;
+ return;
+ }
+
+ topo_info.nr_disabled_cpus++;
+ }
+
+ /*
+ * Register present and possible CPUs in the domain
+ * maps. cpu_possible_map will be updated in
+ * topology_init_possible_cpus() after enumeration is done.
+ */
+ for (dom = TOPO_SMT_DOMAIN; dom < TOPO_MAX_DOMAIN; dom++)
+ set_bit(topo_apicid(apic_id, dom), apic_maps[dom].map);
+}
+
+/**
+ * topology_register_apic - Register an APIC in early topology maps
+ * @apic_id: The APIC ID to set up
+ * @acpi_id: The ACPI ID associated with the APIC
+ * @present: True if the corresponding CPU is present
+ */
+void __init topology_register_apic(u32 apic_id, u32 acpi_id, bool present)
+{
+ if (apic_id >= MAX_LOCAL_APIC) {
+ pr_err_once("APIC ID %x exceeds kernel limit of: %x\n", apic_id, MAX_LOCAL_APIC - 1);
+ topo_info.nr_rejected_cpus++;
+ return;
+ }
+
+ if (check_for_real_bsp(apic_id)) {
+ topo_info.nr_rejected_cpus++;
+ return;
+ }
+
+ /* CPU numbers exhausted? */
+ if (apic_id != topo_info.boot_cpu_apic_id && topo_info.nr_assigned_cpus >= nr_cpu_ids) {
+ pr_warn_once("CPU limit of %d reached. Ignoring further CPUs\n", nr_cpu_ids);
+ topo_info.nr_rejected_cpus++;
+ return;
+ }
+
+ topo_register_apic(apic_id, acpi_id, present);
+}
+
+/**
+ * topology_register_boot_apic - Register the boot CPU APIC
+ * @apic_id: The APIC ID to set up
+ *
+ * Separate so CPU #0 can be assigned
+ */
+void __init topology_register_boot_apic(u32 apic_id)
+{
+ WARN_ON_ONCE(topo_info.boot_cpu_apic_id != BAD_APICID);
+
+ topo_info.boot_cpu_apic_id = apic_id;
+ topo_register_apic(apic_id, CPU_ACPIID_INVALID, true);
+}
+
+/**
+ * topology_get_logical_id - Retrieve the logical ID at a given topology domain level
+ * @apicid: The APIC ID for which to lookup the logical ID
+ * @at_level: The topology domain level to use
+ *
+ * @apicid must be a full APIC ID, not the normalized variant. It's valid for
+ * all bits below the domain level specified by @at_level to be clear. So both
+ * real APIC IDs and backshifted normalized APIC IDs work correctly.
+ *
+ * Returns:
+ * - >= 0: The requested logical ID
+ * - -ERANGE: @apicid is out of range
+ * - -ENODEV: @apicid is not registered
+ */
+int topology_get_logical_id(u32 apicid, enum x86_topology_domains at_level)
+{
+ /* Remove the bits below @at_level to get the proper level ID of @apicid */
+ unsigned int lvlid = topo_apicid(apicid, at_level);
+
+ if (lvlid >= MAX_LOCAL_APIC)
+ return -ERANGE;
+ if (!test_bit(lvlid, apic_maps[at_level].map))
+ return -ENODEV;
+ /* Get the number of set bits before @lvlid. */
+ return bitmap_weight(apic_maps[at_level].map, lvlid);
+}
+EXPORT_SYMBOL_GPL(topology_get_logical_id);
+
+/**
+ * topology_unit_count - Retrieve the count of specified units at a given topology domain level
+ * @apicid: The APIC ID which specifies the search range
+ * @which_units: The domain level specifying the units to count
+ * @at_level: The domain level at which @which_units have to be counted
+ *
+ * This returns the number of possible units according to the enumerated
+ * information.
+ *
+ * E.g. topology_unit_count(apicid, TOPO_CORE_DOMAIN, TOPO_PKG_DOMAIN)
+ * counts the number of possible cores in the package to which @apicid
+ * belongs.
+ *
+ * @at_level must obviously be greater than @which_units to produce useful
+ * results. If @at_level is equal to @which_units the result is
+ * unsurprisingly 1. If @at_level is less than @which_units the result
+ * is by definition undefined and the function returns 0.
+ */
+unsigned int topology_unit_count(u32 apicid, enum x86_topology_domains which_units,
+ enum x86_topology_domains at_level)
+{
+ /* Remove the bits below @at_level to get the proper level ID of @apicid */
+ unsigned int lvlid = topo_apicid(apicid, at_level);
+
+ if (lvlid >= MAX_LOCAL_APIC)
+ return 0;
+ if (!test_bit(lvlid, apic_maps[at_level].map))
+ return 0;
+ if (which_units > at_level)
+ return 0;
+ if (which_units == at_level)
+ return 1;
+ return topo_unit_count(lvlid, at_level, apic_maps[which_units].map);
+}
+
+#ifdef CONFIG_SMP
+int topology_get_primary_thread(unsigned int cpu)
+{
+ u32 apic_id = cpuid_to_apicid[cpu];
+
+ /*
+ * Get the core domain level APIC id, which is the primary thread
+ * and return the CPU number assigned to it.
+ */
+ return topo_lookup_cpuid(topo_apicid(apic_id, TOPO_CORE_DOMAIN));
+}
+#endif
+
+#ifdef CONFIG_ACPI_HOTPLUG_CPU
+/**
+ * topology_hotplug_apic - Handle a physical hotplugged APIC after boot
+ * @apic_id: The APIC ID to set up
+ * @acpi_id: The ACPI ID associated to the APIC
+ */
+int topology_hotplug_apic(u32 apic_id, u32 acpi_id)
+{
+ int cpu;
+
+ if (apic_id >= MAX_LOCAL_APIC)
+ return -EINVAL;
+
+ /* Reject if the APIC ID was not registered during enumeration. */
+ if (!test_bit(apic_id, apic_maps[TOPO_SMT_DOMAIN].map))
+ return -ENODEV;
+
+ cpu = topo_lookup_cpuid(apic_id);
+ if (cpu < 0)
+ return -ENOSPC;
+
+ set_bit(apic_id, phys_cpu_present_map);
+ topo_set_cpuids(cpu, apic_id, acpi_id);
+ cpu_mark_primary_thread(cpu, apic_id);
+ return cpu;
+}
+
+/**
+ * topology_hotunplug_apic - Remove a physical hotplugged APIC after boot
+ * @cpu: The CPU number for which the APIC ID is removed
+ */
+void topology_hotunplug_apic(unsigned int cpu)
+{
+ u32 apic_id = cpuid_to_apicid[cpu];
+
+ if (apic_id == BAD_APICID)
+ return;
+
+ per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;
+ clear_bit(apic_id, phys_cpu_present_map);
+ set_cpu_present(cpu, false);
+}
+#endif
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static unsigned int max_possible_cpus __initdata = NR_CPUS;
+
+/**
+ * topology_apply_cmdline_limits_early - Apply topology command line limits early
+ *
+ * Ensure that command line limits are in effect before firmware parsing
+ * takes place.
+ */
+void __init topology_apply_cmdline_limits_early(void)
+{
+ unsigned int possible = nr_cpu_ids;
+
+ /* 'maxcpus=0' 'nosmp' 'nolapic' */
+ if (!setup_max_cpus || apic_is_disabled)
+ possible = 1;
+
+ /* 'possible_cpus=N' */
+ possible = min_t(unsigned int, max_possible_cpus, possible);
+
+ if (possible < nr_cpu_ids) {
+ pr_info("Limiting to %u possible CPUs\n", possible);
+ set_nr_cpu_ids(possible);
+ }
+}
+
+static __init bool restrict_to_up(void)
+{
+ if (!smp_found_config)
+ return true;
+ /*
+ * XEN PV is special as it does not advertise the local APIC
+ * properly, but provides a fake topology for it so that the
+ * infrastructure works. So don't apply the restrictions vs. APIC
+ * here.
+ */
+ if (xen_pv_domain())
+ return false;
+
+ return apic_is_disabled;
+}
+
+void __init topology_init_possible_cpus(void)
+{
+ unsigned int assigned = topo_info.nr_assigned_cpus;
+ unsigned int disabled = topo_info.nr_disabled_cpus;
+ unsigned int cnta, cntb, cpu, allowed = 1;
+ unsigned int total = assigned + disabled;
+ u32 apicid, firstid;
+
+ /*
+ * If there was no APIC registered, then fake one so that the
+ * topology bitmap is populated. That ensures that the code below
+ * is valid and the various query interfaces can be used
+ * unconditionally. This does not affect the actual APIC code in
+ * any way because either the local APIC address has not been
+ * registered or the local APIC was disabled on the command line.
+ */
+ if (topo_info.boot_cpu_apic_id == BAD_APICID)
+ topology_register_boot_apic(0);
+
+ if (!restrict_to_up()) {
+ if (WARN_ON_ONCE(assigned > nr_cpu_ids)) {
+ disabled += assigned - nr_cpu_ids;
+ assigned = nr_cpu_ids;
+ }
+ allowed = min_t(unsigned int, total, nr_cpu_ids);
+ }
+
+ if (total > allowed)
+ pr_warn("%u possible CPUs exceed the limit of %u\n", total, allowed);
+
+ assigned = min_t(unsigned int, allowed, assigned);
+ disabled = allowed - assigned;
+
+ topo_info.nr_assigned_cpus = assigned;
+ topo_info.nr_disabled_cpus = disabled;
+
+ total_cpus = allowed;
+ set_nr_cpu_ids(allowed);
+
+ cnta = domain_weight(TOPO_PKG_DOMAIN);
+ cntb = domain_weight(TOPO_DIE_DOMAIN);
+ __max_logical_packages = cnta;
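+ /* Max. dies per package: power-of-2 order difference of die and package counts */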
+ __max_dies_per_package = 1U << (get_count_order(cntb) - get_count_order(cnta));
+
+ pr_info("Max. logical packages: %3u\n", cnta);
+ pr_info("Max. logical dies: %3u\n", cntb);
+ pr_info("Max. dies per package: %3u\n", __max_dies_per_package);
+
+ cnta = domain_weight(TOPO_CORE_DOMAIN);
+ cntb = domain_weight(TOPO_SMT_DOMAIN);
+ /*
+ * Can't use the order delta here as order(cnta) can be equal to
+ * order(cntb) even if cnta != cntb.
+ */
+ __max_threads_per_core = DIV_ROUND_UP(cntb, cnta);
+ pr_info("Max. threads per core: %3u\n", __max_threads_per_core);
+
+ firstid = find_first_bit(apic_maps[TOPO_SMT_DOMAIN].map, MAX_LOCAL_APIC);
+ __num_cores_per_package = topology_unit_count(firstid, TOPO_CORE_DOMAIN, TOPO_PKG_DOMAIN);
+ pr_info("Num. cores per package: %3u\n", __num_cores_per_package);
+ __num_threads_per_package = topology_unit_count(firstid, TOPO_SMT_DOMAIN, TOPO_PKG_DOMAIN);
+ pr_info("Num. threads per package: %3u\n", __num_threads_per_package);
+
+ pr_info("Allowing %u present CPUs plus %u hotplug CPUs\n", assigned, disabled);
+ if (topo_info.nr_rejected_cpus)
+ pr_info("Rejected CPUs %u\n", topo_info.nr_rejected_cpus);
+
+ init_cpu_present(cpumask_of(0));
+ init_cpu_possible(cpumask_of(0));
+
+ /* Assign CPU numbers to non-present CPUs */
+ for (apicid = 0; disabled; disabled--, apicid++) {
+ apicid = find_next_andnot_bit(apic_maps[TOPO_SMT_DOMAIN].map, phys_cpu_present_map,
+ MAX_LOCAL_APIC, apicid);
+ if (apicid >= MAX_LOCAL_APIC)
+ break;
+ cpuid_to_apicid[topo_info.nr_assigned_cpus++] = apicid;
+ }
+
+ for (cpu = 0; cpu < allowed; cpu++) {
+ apicid = cpuid_to_apicid[cpu];
+
+ set_cpu_possible(cpu, true);
+
+ if (apicid == BAD_APICID)
+ continue;
+
+ cpu_mark_primary_thread(cpu, apicid);
+ set_cpu_present(cpu, test_bit(apicid, phys_cpu_present_map));
+ }
+}
+
+/*
+ * Late SMP disable after sizing CPU masks when APIC/IOAPIC setup failed.
+ */
+void __init topology_reset_possible_cpus_up(void)
+{
+ init_cpu_present(cpumask_of(0));
+ init_cpu_possible(cpumask_of(0));
+
+ bitmap_zero(phys_cpu_present_map, MAX_LOCAL_APIC);
+ if (topo_info.boot_cpu_apic_id != BAD_APICID)
+ set_bit(topo_info.boot_cpu_apic_id, phys_cpu_present_map);
+}
+
+static int __init setup_possible_cpus(char *str)
+{
+ get_option(&str, &max_possible_cpus);
+ return 0;
+}
+early_param("possible_cpus", setup_possible_cpus);
+#endif
diff --git a/arch/x86/kernel/cpu/topology.h b/arch/x86/kernel/cpu/topology.h
new file mode 100644
index 000000000000..37326297f80c
--- /dev/null
+++ b/arch/x86/kernel/cpu/topology.h
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ARCH_X86_TOPOLOGY_H
+#define ARCH_X86_TOPOLOGY_H
+
+struct topo_scan {
+ struct cpuinfo_x86 *c;
+ unsigned int dom_shifts[TOPO_MAX_DOMAIN];
+ unsigned int dom_ncpus[TOPO_MAX_DOMAIN];
+
+ /* Legacy CPUID[1]:EBX[23:16] number of logical processors */
+ unsigned int ebx1_nproc_shift;
+
+ /* AMD specific node ID which cannot be mapped into APIC space. */
+ u16 amd_nodes_per_pkg;
+ u16 amd_node_id;
+};
+
+void cpu_init_topology(struct cpuinfo_x86 *c);
+void cpu_parse_topology(struct cpuinfo_x86 *c);
+void topology_set_dom(struct topo_scan *tscan, enum x86_topology_domains dom,
+ unsigned int shift, unsigned int ncpus);
+bool cpu_parse_topology_ext(struct topo_scan *tscan);
+void cpu_parse_topology_amd(struct topo_scan *tscan);
+void cpu_topology_fixup_amd(struct topo_scan *tscan);
+
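+/* Strip the APIC ID bits of all domains below @dom, i.e. shift @apicid down to the @dom level */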
+static inline u32 topo_shift_apicid(u32 apicid, enum x86_topology_domains dom)
+{
+ if (dom == TOPO_SMT_DOMAIN)
+ return apicid;
+ return apicid >> x86_topo_system.dom_shifts[dom - 1];
+}
+
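+/* Return the domain-local ID of @apicid at domain level @dom */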
+static inline u32 topo_relative_domain_id(u32 apicid, enum x86_topology_domains dom)
+{
+ if (dom != TOPO_SMT_DOMAIN)
+ apicid >>= x86_topo_system.dom_shifts[dom - 1];
+ return apicid & (x86_topo_system.dom_size[dom] - 1);
+}
+
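+/* Mask covering all APIC ID bits at and below domain level @dom */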
+static inline u32 topo_domain_mask(enum x86_topology_domains dom)
+{
+ return (1U << x86_topo_system.dom_shifts[dom]) - 1;
+}
+
+/*
+ * Update a domain level after the fact without propagating. Used to fixup
+ * broken CPUID enumerations.
+ */
+static inline void topology_update_dom(struct topo_scan *tscan, enum x86_topology_domains dom,
+ unsigned int shift, unsigned int ncpus)
+{
+ tscan->dom_shifts[dom] = shift;
+ tscan->dom_ncpus[dom] = ncpus;
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+unsigned int topology_unit_count(u32 apicid, enum x86_topology_domains which_units,
+ enum x86_topology_domains at_level);
+#else
+static inline unsigned int topology_unit_count(u32 apicid, enum x86_topology_domains which_units,
+ enum x86_topology_domains at_level)
+{
+ return 1;
+}
+#endif
+
+#endif /* ARCH_X86_TOPOLOGY_H */
diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c
new file mode 100644
index 000000000000..6ac097e13106
--- /dev/null
+++ b/arch/x86/kernel/cpu/topology_amd.c
@@ -0,0 +1,227 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/cpu.h>
+
+#include <asm/apic.h>
+#include <asm/memtype.h>
+#include <asm/msr.h>
+#include <asm/processor.h>
+
+#include "cpu.h"
+
+static bool parse_8000_0008(struct topo_scan *tscan)
+{
+ struct {
+ // ecx
+ u32 cpu_nthreads : 8, // Number of physical threads - 1
+ : 4, // Reserved
+ apicid_coreid_len : 4, // Number of thread core ID bits (shift) in APIC ID
+ perf_tsc_len : 2, // Performance time-stamp counter size
+ : 14; // Reserved
+ } ecx;
+ unsigned int sft;
+
+ if (tscan->c->extended_cpuid_level < 0x80000008)
+ return false;
+
+ cpuid_leaf_reg(0x80000008, CPUID_ECX, &ecx);
+
+ /* If the thread bits are 0, then get the shift value from ecx.cpu_nthreads */
+ sft = ecx.apicid_coreid_len;
+ if (!sft)
+ sft = get_count_order(ecx.cpu_nthreads + 1);
+
+ /*
+ * cpu_nthreads describes the number of threads in the package
+ * sft is the number of APIC ID bits per package
+ *
+ * As the number of actual threads per core is not described in
+ * this leaf, just set the CORE domain shift and let the later
+ * parsers set SMT shift. Assume one thread per core by default
+ * which is correct if there are no other CPUID leafs to parse.
+ */
+ topology_update_dom(tscan, TOPO_SMT_DOMAIN, 0, 1);
+ topology_set_dom(tscan, TOPO_CORE_DOMAIN, sft, ecx.cpu_nthreads + 1);
+ return true;
+}
+
+static void store_node(struct topo_scan *tscan, u16 nr_nodes, u16 node_id)
+{
+ /*
+ * Starting with Fam 17h the DIE domain could probably be used to
+ * retrieve the node info on AMD/HYGON. Analysis of CPUID dumps
+ * suggests it's the topmost bit(s) of the CPU cores area, but
+ * that's guesswork and neither enumerated nor documented.
+ *
+ * Up to Fam 16h this does not work at all and the legacy node ID
+ * has to be used.
+ */
+ tscan->amd_nodes_per_pkg = nr_nodes;
+ tscan->amd_node_id = node_id;
+}
+
+static bool parse_8000_001e(struct topo_scan *tscan)
+{
+ struct {
+ // eax
+ u32 ext_apic_id : 32; // Extended APIC ID
+ // ebx
+ u32 core_id : 8, // Unique per-socket logical core unit ID
+ core_nthreads : 8, // #Threads per core (zero-based)
+ : 16; // Reserved
+ // ecx
+ u32 node_id : 8, // Node (die) ID of invoking logical CPU
+ nnodes_per_socket : 3, // #nodes in invoking logical CPU's package/socket
+ : 21; // Reserved
+ // edx
+ u32 : 32; // Reserved
+ } leaf;
+
+ if (!boot_cpu_has(X86_FEATURE_TOPOEXT))
+ return false;
+
+ cpuid_leaf(0x8000001e, &leaf);
+
+ /*
+ * If leaf 0xb/0x26 is available, then the APIC ID and the domain
+ * shifts are set already.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_XTOPOLOGY)) {
+ tscan->c->topo.initial_apicid = leaf.ext_apic_id;
+
+ /*
+ * Leaf 0x80000008 sets the CORE domain shift but not the
+ * SMT domain shift. On CPUs with family >= 0x17, there
+ * might be hyperthreads.
+ */
+ if (tscan->c->x86 >= 0x17) {
+ /* Update the SMT domain, but do not propagate it. */
+ unsigned int nthreads = leaf.core_nthreads + 1;
+
+ topology_update_dom(tscan, TOPO_SMT_DOMAIN,
+ get_count_order(nthreads), nthreads);
+ }
+ }
+
+ store_node(tscan, leaf.nnodes_per_socket + 1, leaf.node_id);
+
+ if (tscan->c->x86_vendor == X86_VENDOR_AMD) {
+ if (tscan->c->x86 == 0x15)
+ tscan->c->topo.cu_id = leaf.core_id;
+
+ cacheinfo_amd_init_llc_id(tscan->c, leaf.node_id);
+ } else {
+ /*
+ * Package ID is ApicId[6..] on certain Hygon CPUs. See
+ * commit e0ceeae708ce for explanation. The topology info
+ * is screwed up: The package shift is always 6 and the
+ * node ID is bit [4:5].
+ */
+ if (!boot_cpu_has(X86_FEATURE_HYPERVISOR) && tscan->c->x86_model <= 0x3) {
+ topology_set_dom(tscan, TOPO_CORE_DOMAIN, 6,
+ tscan->dom_ncpus[TOPO_CORE_DOMAIN]);
+ }
+ cacheinfo_hygon_init_llc_id(tscan->c);
+ }
+ return true;
+}
+
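+/* Retrieve the node ID and nodes per package from the AMD NODEID MSR if it is available */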
+static void parse_fam10h_node_id(struct topo_scan *tscan)
+{
+ union {
+ struct {
+ u64 node_id : 3,
+ nodes_per_pkg : 3,
+ unused : 58;
+ };
+ u64 msr;
+ } nid;
+
+ if (!boot_cpu_has(X86_FEATURE_NODEID_MSR))
+ return;
+
+ rdmsrq(MSR_FAM10H_NODE_ID, nid.msr);
+ store_node(tscan, nid.nodes_per_pkg + 1, nid.node_id);
+ tscan->c->topo.llc_id = nid.node_id;
+}
+
+static void legacy_set_llc(struct topo_scan *tscan)
+{
+ unsigned int apicid = tscan->c->topo.initial_apicid;
+
+ /* If none of the parsers set LLC ID then use the die ID for it. */
+ if (tscan->c->topo.llc_id == BAD_APICID)
+ tscan->c->topo.llc_id = apicid >> tscan->dom_shifts[TOPO_CORE_DOMAIN];
+}
+
+static void topoext_fixup(struct topo_scan *tscan)
+{
+ struct cpuinfo_x86 *c = tscan->c;
+ u64 msrval;
+
+ /* Try to re-enable TopologyExtensions if switched off by BIOS */
+ if (cpu_has(c, X86_FEATURE_TOPOEXT) || c->x86_vendor != X86_VENDOR_AMD ||
+ c->x86 != 0x15 || c->x86_model < 0x10 || c->x86_model > 0x6f)
+ return;
+
+ if (msr_set_bit(MSR_AMD64_CPUID_EXT_FEAT,
+ MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT_BIT) <= 0)
+ return;
+
+ rdmsrq(MSR_AMD64_CPUID_EXT_FEAT, msrval);
+ if (msrval & MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT) {
+ set_cpu_cap(c, X86_FEATURE_TOPOEXT);
+ pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n");
+ }
+}
+
+static void parse_topology_amd(struct topo_scan *tscan)
+{
+ if (cpu_feature_enabled(X86_FEATURE_AMD_HTR_CORES))
+ tscan->c->topo.cpu_type = cpuid_ebx(0x80000026);
+
+ /*
+ * Try to get SMT, CORE, TILE, and DIE shifts from extended
+ * CPUID leaf 0x8000_0026 on supported processors first. If
+ * extended CPUID leaf 0x8000_0026 is not supported, try to
+ * get SMT and CORE shift from leaf 0xb. If either leaf is
+ * available, cpu_parse_topology_ext() will return true.
+ *
+ * If XTOPOLOGY leaves (0x26/0xb) are not available, try to
+ * get the CORE shift from leaf 0x8000_0008 first.
+ */
+ if (!cpu_parse_topology_ext(tscan) && !parse_8000_0008(tscan))
+ return;
+
+ /*
+ * Prefer leaf 0x8000001e if available to get the SMT shift and
+ * the initial APIC ID if XTOPOLOGY leaves are not available.
+ */
+ if (parse_8000_001e(tscan))
+ return;
+
+ /* Try the NODEID MSR */
+ parse_fam10h_node_id(tscan);
+}
+
+void cpu_parse_topology_amd(struct topo_scan *tscan)
+{
+ tscan->amd_nodes_per_pkg = 1;
+ topoext_fixup(tscan);
+ parse_topology_amd(tscan);
+ legacy_set_llc(tscan);
+
+ if (tscan->amd_nodes_per_pkg > 1)
+ set_cpu_cap(tscan->c, X86_FEATURE_AMD_DCM);
+}
+
+void cpu_topology_fixup_amd(struct topo_scan *tscan)
+{
+ struct cpuinfo_x86 *c = tscan->c;
+
+ /*
+ * Adjust the core_id relative to the node when there is more than
+ * one node.
+ */
+ if (tscan->c->x86 < 0x17 && tscan->amd_nodes_per_pkg > 1)
+ c->topo.core_id %= tscan->dom_ncpus[TOPO_CORE_DOMAIN] / tscan->amd_nodes_per_pkg;
+}
diff --git a/arch/x86/kernel/cpu/topology_common.c b/arch/x86/kernel/cpu/topology_common.c
new file mode 100644
index 000000000000..71625795d711
--- /dev/null
+++ b/arch/x86/kernel/cpu/topology_common.c
@@ -0,0 +1,258 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/cpu.h>
+
+#include <xen/xen.h>
+
+#include <asm/intel-family.h>
+#include <asm/apic.h>
+#include <asm/processor.h>
+#include <asm/smp.h>
+
+#include "cpu.h"
+
+struct x86_topology_system x86_topo_system __ro_after_init;
+EXPORT_SYMBOL_GPL(x86_topo_system);
+
+unsigned int __amd_nodes_per_pkg __ro_after_init;
+EXPORT_SYMBOL_GPL(__amd_nodes_per_pkg);
+
+/* CPUs which are the primary SMT threads */
+struct cpumask __cpu_primary_thread_mask __read_mostly;
+
+void topology_set_dom(struct topo_scan *tscan, enum x86_topology_domains dom,
+ unsigned int shift, unsigned int ncpus)
+{
+ topology_update_dom(tscan, dom, shift, ncpus);
+
+ /* Propagate to the upper levels */
+ for (dom++; dom < TOPO_MAX_DOMAIN; dom++) {
+ tscan->dom_shifts[dom] = tscan->dom_shifts[dom - 1];
+ tscan->dom_ncpus[dom] = tscan->dom_ncpus[dom - 1];
+ }
+}
+
+enum x86_topology_cpu_type get_topology_cpu_type(struct cpuinfo_x86 *c)
+{
+ if (c->x86_vendor == X86_VENDOR_INTEL) {
+ switch (c->topo.intel_type) {
+ case INTEL_CPU_TYPE_ATOM: return TOPO_CPU_TYPE_EFFICIENCY;
+ case INTEL_CPU_TYPE_CORE: return TOPO_CPU_TYPE_PERFORMANCE;
+ }
+ }
+ if (c->x86_vendor == X86_VENDOR_AMD) {
+ switch (c->topo.amd_type) {
+ case 0: return TOPO_CPU_TYPE_PERFORMANCE;
+ case 1: return TOPO_CPU_TYPE_EFFICIENCY;
+ }
+ }
+
+ return TOPO_CPU_TYPE_UNKNOWN;
+}
+
+const char *get_topology_cpu_type_name(struct cpuinfo_x86 *c)
+{
+ switch (get_topology_cpu_type(c)) {
+ case TOPO_CPU_TYPE_PERFORMANCE:
+ return "performance";
+ case TOPO_CPU_TYPE_EFFICIENCY:
+ return "efficiency";
+ default:
+ return "unknown";
+ }
+}
+
+static unsigned int __maybe_unused parse_num_cores_legacy(struct cpuinfo_x86 *c)
+{
+ struct {
+ u32 cache_type : 5,
+ unused : 21,
+ ncores : 6;
+ } eax;
+
+ if (c->cpuid_level < 4)
+ return 1;
+
+ cpuid_subleaf_reg(4, 0, CPUID_EAX, &eax);
+ if (!eax.cache_type)
+ return 1;
+
+ return eax.ncores + 1;
+}
+
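+/* Derive the SMT and CORE shifts from CPUID leaf 1 and leaf 4 when no extended topology leaf is available */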
+static void parse_legacy(struct topo_scan *tscan)
+{
+ unsigned int cores, core_shift, smt_shift = 0;
+ struct cpuinfo_x86 *c = tscan->c;
+
+ cores = parse_num_cores_legacy(c);
+ core_shift = get_count_order(cores);
+
+ if (cpu_has(c, X86_FEATURE_HT)) {
+ if (!WARN_ON_ONCE(tscan->ebx1_nproc_shift < core_shift))
+ smt_shift = tscan->ebx1_nproc_shift - core_shift;
+ /*
+ * The parser expects leaf 0xb/0x1f format, which means
+ * the number of logical processors at core level is
+ * counting threads.
+ */
+ core_shift += smt_shift;
+ cores <<= smt_shift;
+ }
+
+ topology_set_dom(tscan, TOPO_SMT_DOMAIN, smt_shift, 1U << smt_shift);
+ topology_set_dom(tscan, TOPO_CORE_DOMAIN, core_shift, cores);
+}
+
+static bool fake_topology(struct topo_scan *tscan)
+{
+ * Preset the CORE level shift for CPUID-less systems and XEN_PV,
+ * Preset the CORE level shift for CPUID less systems and XEN_PV,
+ * which has useless CPUID information.
+ */
+ topology_set_dom(tscan, TOPO_SMT_DOMAIN, 0, 1);
+ topology_set_dom(tscan, TOPO_CORE_DOMAIN, 0, 1);
+
+ return tscan->c->cpuid_level < 1;
+}
+
+static void parse_topology(struct topo_scan *tscan, bool early)
+{
+ const struct cpuinfo_topology topo_defaults = {
+ .cu_id = 0xff,
+ .llc_id = BAD_APICID,
+ .l2c_id = BAD_APICID,
+ .cpu_type = TOPO_CPU_TYPE_UNKNOWN,
+ };
+ struct cpuinfo_x86 *c = tscan->c;
+ struct {
+ u32 unused0 : 16,
+ nproc : 8,
+ apicid : 8;
+ } ebx;
+
+ c->topo = topo_defaults;
+
+ if (fake_topology(tscan))
+ return;
+
+ /* Preset Initial APIC ID from CPUID leaf 1 */
+ cpuid_leaf_reg(1, CPUID_EBX, &ebx);
+ c->topo.initial_apicid = ebx.apicid;
+
+ /*
+ * The initial invocation from early_identify_cpu() happens before
+ * the APIC is mapped or X2APIC enabled. For establishing the
+ * topology, that's not required. Use the initial APIC ID.
+ */
+ if (early)
+ c->topo.apicid = c->topo.initial_apicid;
+ else
+ c->topo.apicid = read_apic_id();
+
+ /* The above is sufficient for UP */
+ if (!IS_ENABLED(CONFIG_SMP))
+ return;
+
+ tscan->ebx1_nproc_shift = get_count_order(ebx.nproc);
+
+ switch (c->x86_vendor) {
+ case X86_VENDOR_AMD:
+ if (IS_ENABLED(CONFIG_CPU_SUP_AMD))
+ cpu_parse_topology_amd(tscan);
+ break;
+ case X86_VENDOR_CENTAUR:
+ case X86_VENDOR_ZHAOXIN:
+ parse_legacy(tscan);
+ break;
+ case X86_VENDOR_INTEL:
+ if (!IS_ENABLED(CONFIG_CPU_SUP_INTEL) || !cpu_parse_topology_ext(tscan))
+ parse_legacy(tscan);
+ if (c->cpuid_level >= 0x1a)
+ c->topo.cpu_type = cpuid_eax(0x1a);
+ break;
+ case X86_VENDOR_HYGON:
+ if (IS_ENABLED(CONFIG_CPU_SUP_HYGON))
+ cpu_parse_topology_amd(tscan);
+ break;
+ }
+}
+
+static void topo_set_ids(struct topo_scan *tscan, bool early)
+{
+ struct cpuinfo_x86 *c = tscan->c;
+ u32 apicid = c->topo.apicid;
+
+ c->topo.pkg_id = topo_shift_apicid(apicid, TOPO_PKG_DOMAIN);
+ c->topo.die_id = topo_shift_apicid(apicid, TOPO_DIE_DOMAIN);
+
+ if (!early) {
+ c->topo.logical_pkg_id = topology_get_logical_id(apicid, TOPO_PKG_DOMAIN);
+ c->topo.logical_die_id = topology_get_logical_id(apicid, TOPO_DIE_DOMAIN);
+ c->topo.logical_core_id = topology_get_logical_id(apicid, TOPO_CORE_DOMAIN);
+ }
+
+ /* Package relative core ID */
+ c->topo.core_id = (apicid & topo_domain_mask(TOPO_PKG_DOMAIN)) >>
+ x86_topo_system.dom_shifts[TOPO_SMT_DOMAIN];
+
+ c->topo.amd_node_id = tscan->amd_node_id;
+
+ if (c->x86_vendor == X86_VENDOR_AMD)
+ cpu_topology_fixup_amd(tscan);
+}
+
+void cpu_parse_topology(struct cpuinfo_x86 *c)
+{
+ unsigned int dom, cpu = smp_processor_id();
+ struct topo_scan tscan = { .c = c, };
+
+ parse_topology(&tscan, false);
+
+ if (IS_ENABLED(CONFIG_X86_LOCAL_APIC)) {
+ if (c->topo.initial_apicid != c->topo.apicid) {
+ pr_err(FW_BUG "CPU%4u: APIC ID mismatch. CPUID: 0x%04x APIC: 0x%04x\n",
+ cpu, c->topo.initial_apicid, c->topo.apicid);
+ }
+
+ if (c->topo.apicid != cpuid_to_apicid[cpu]) {
+ pr_err(FW_BUG "CPU%4u: APIC ID mismatch. Firmware: 0x%04x APIC: 0x%04x\n",
+ cpu, cpuid_to_apicid[cpu], c->topo.apicid);
+ }
+ }
+
+ for (dom = TOPO_SMT_DOMAIN; dom < TOPO_MAX_DOMAIN; dom++) {
+ if (tscan.dom_shifts[dom] == x86_topo_system.dom_shifts[dom])
+ continue;
+ pr_err(FW_BUG "CPU%d: Topology domain %u shift %u != %u\n", cpu, dom,
+ tscan.dom_shifts[dom], x86_topo_system.dom_shifts[dom]);
+ }
+
+ topo_set_ids(&tscan, false);
+}
+
+void __init cpu_init_topology(struct cpuinfo_x86 *c)
+{
+ struct topo_scan tscan = { .c = c, };
+ unsigned int dom, sft;
+
+ parse_topology(&tscan, true);
+
+ /* Copy the shift values and calculate the unit sizes. */
+ memcpy(x86_topo_system.dom_shifts, tscan.dom_shifts, sizeof(x86_topo_system.dom_shifts));
+
+ dom = TOPO_SMT_DOMAIN;
+ x86_topo_system.dom_size[dom] = 1U << x86_topo_system.dom_shifts[dom];
+
+ for (dom++; dom < TOPO_MAX_DOMAIN; dom++) {
+ sft = x86_topo_system.dom_shifts[dom] - x86_topo_system.dom_shifts[dom - 1];
+ x86_topo_system.dom_size[dom] = 1U << sft;
+ }
+
+ topo_set_ids(&tscan, true);
+
+ /*
+ * AMD systems have Nodes per package which cannot be mapped to
+ * APIC ID.
+ */
+ __amd_nodes_per_pkg = tscan.amd_nodes_per_pkg;
+}
diff --git a/arch/x86/kernel/cpu/topology_ext.c b/arch/x86/kernel/cpu/topology_ext.c
new file mode 100644
index 000000000000..467b0326bf1a
--- /dev/null
+++ b/arch/x86/kernel/cpu/topology_ext.c
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/cpu.h>
+
+#include <asm/apic.h>
+#include <asm/memtype.h>
+#include <asm/processor.h>
+
+#include "cpu.h"
+
+enum topo_types {
+ INVALID_TYPE = 0,
+ SMT_TYPE = 1,
+ CORE_TYPE = 2,
+ MAX_TYPE_0B = 3,
+ MODULE_TYPE = 3,
+ AMD_CCD_TYPE = 3,
+ TILE_TYPE = 4,
+ AMD_SOCKET_TYPE = 4,
+ MAX_TYPE_80000026 = 5,
+ DIE_TYPE = 5,
+ DIEGRP_TYPE = 6,
+ MAX_TYPE_1F = 7,
+};
+
+/*
+ * Use a lookup table for the case that there are future types > 6 which
+ * describe an intermediate domain level which does not exist today.
+ */
+static const unsigned int topo_domain_map_0b_1f[MAX_TYPE_1F] = {
+ [SMT_TYPE] = TOPO_SMT_DOMAIN,
+ [CORE_TYPE] = TOPO_CORE_DOMAIN,
+ [MODULE_TYPE] = TOPO_MODULE_DOMAIN,
+ [TILE_TYPE] = TOPO_TILE_DOMAIN,
+ [DIE_TYPE] = TOPO_DIE_DOMAIN,
+ [DIEGRP_TYPE] = TOPO_DIEGRP_DOMAIN,
+};
+
+static const unsigned int topo_domain_map_80000026[MAX_TYPE_80000026] = {
+ [SMT_TYPE] = TOPO_SMT_DOMAIN,
+ [CORE_TYPE] = TOPO_CORE_DOMAIN,
+ [AMD_CCD_TYPE] = TOPO_TILE_DOMAIN,
+ [AMD_SOCKET_TYPE] = TOPO_DIE_DOMAIN,
+};
+
+static inline bool topo_subleaf(struct topo_scan *tscan, u32 leaf, u32 subleaf,
+ unsigned int *last_dom)
+{
+ unsigned int dom, maxtype;
+ const unsigned int *map;
+ struct {
+ // eax
+ u32 x2apic_shift : 5, // Number of bits to shift APIC ID right
+ // for the topology ID at the next level
+ : 27; // Reserved
+ // ebx
+ u32 num_processors : 16, // Number of processors at current level
+ : 16; // Reserved
+ // ecx
+ u32 level : 8, // Current topology level. Same as sub leaf number
+ type : 8, // Level type. If 0, invalid
+ : 16; // Reserved
+ // edx
+ u32 x2apic_id : 32; // X2APIC ID of the current logical processor
+ } sl;
+
+ switch (leaf) {
+ case 0x0b: maxtype = MAX_TYPE_0B; map = topo_domain_map_0b_1f; break;
+ case 0x1f: maxtype = MAX_TYPE_1F; map = topo_domain_map_0b_1f; break;
+ case 0x80000026: maxtype = MAX_TYPE_80000026; map = topo_domain_map_80000026; break;
+ default: return false;
+ }
+
+ cpuid_subleaf(leaf, subleaf, &sl);
+
+ if (!sl.num_processors || sl.type == INVALID_TYPE)
+ return false;
+
+ if (sl.type >= maxtype) {
+ pr_err_once("Topology: leaf 0x%x:%d Unknown domain type %u\n",
+ leaf, subleaf, sl.type);
+ /*
+ * It really would have been too obvious to make the domain
+ * type space sparse and leave a few reserved types between
+ * the points which might change instead of following the
+ * usual "this can be fixed in software" principle.
+ */
+ dom = *last_dom + 1;
+ } else {
+ dom = map[sl.type];
+ *last_dom = dom;
+ }
+
+ if (!dom) {
+ tscan->c->topo.initial_apicid = sl.x2apic_id;
+ } else if (tscan->c->topo.initial_apicid != sl.x2apic_id) {
+ pr_warn_once(FW_BUG "CPUID leaf 0x%x subleaf %d APIC ID mismatch %x != %x\n",
+ leaf, subleaf, tscan->c->topo.initial_apicid, sl.x2apic_id);
+ }
+
+ topology_set_dom(tscan, dom, sl.x2apic_shift, sl.num_processors);
+ return true;
+}
+
+static bool parse_topology_leaf(struct topo_scan *tscan, u32 leaf)
+{
+ unsigned int last_dom;
+ u32 subleaf;
+
+ /* Read all available subleafs and populate the levels */
+ for (subleaf = 0, last_dom = 0; topo_subleaf(tscan, leaf, subleaf, &last_dom); subleaf++);
+
+ /* If subleaf 0 failed to parse, give up */
+ if (!subleaf)
+ return false;
+
+ /*
+ * There are machines in the wild which have shift 0 in the subleaf
+ * 0, but advertise 2 logical processors at that level. They are
+ * truly SMT.
+ */
+ if (!tscan->dom_shifts[TOPO_SMT_DOMAIN] && tscan->dom_ncpus[TOPO_SMT_DOMAIN] > 1) {
+ unsigned int sft = get_count_order(tscan->dom_ncpus[TOPO_SMT_DOMAIN]);
+
+ pr_warn_once(FW_BUG "CPUID leaf 0x%x subleaf 0 has shift level 0 but %u CPUs. Fixing it up.\n",
+ leaf, tscan->dom_ncpus[TOPO_SMT_DOMAIN]);
+ topology_update_dom(tscan, TOPO_SMT_DOMAIN, sft, tscan->dom_ncpus[TOPO_SMT_DOMAIN]);
+ }
+
+ set_cpu_cap(tscan->c, X86_FEATURE_XTOPOLOGY);
+ return true;
+}
+
+bool cpu_parse_topology_ext(struct topo_scan *tscan)
+{
+ /* Intel: Try leaf 0x1F first. */
+ if (tscan->c->cpuid_level >= 0x1f && parse_topology_leaf(tscan, 0x1f))
+ return true;
+
+ /* AMD: Try leaf 0x80000026 first. */
+ if (tscan->c->extended_cpuid_level >= 0x80000026 && parse_topology_leaf(tscan, 0x80000026))
+ return true;
+
+ /* Intel/AMD: Fall back to leaf 0xB if available */
+ return tscan->c->cpuid_level >= 0x0b && parse_topology_leaf(tscan, 0x0b);
+}
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index bb62b3e5caad..42c939827621 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -1,11 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
#include <linux/mm.h>
-#include <linux/init.h>
-#include <asm/processor.h>
+#include <asm/cpufeature.h>
#include <asm/msr.h>
#include "cpu.h"
-static void __cpuinit early_init_transmeta(struct cpuinfo_x86 *c)
+static void early_init_transmeta(struct cpuinfo_x86 *c)
{
u32 xlvl;
@@ -13,11 +15,11 @@ static void __cpuinit early_init_transmeta(struct cpuinfo_x86 *c)
xlvl = cpuid_eax(0x80860000);
if ((xlvl & 0xffff0000) == 0x80860000) {
if (xlvl >= 0x80860001)
- c->x86_capability[2] = cpuid_edx(0x80860001);
+ c->x86_capability[CPUID_8086_0001_EDX] = cpuid_edx(0x80860001);
}
}
-static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
+static void init_transmeta(struct cpuinfo_x86 *c)
{
unsigned int cap_mask, uk, max, dummy;
unsigned int cms_rev1, cms_rev2;
@@ -26,7 +28,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
early_init_transmeta(c);
- display_cacheinfo(c);
+ cpu_detect_cache_sizes(c);
/* Print CMS and CPU revision */
max = cpuid_eax(0x80860000);
@@ -34,7 +36,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
if (max >= 0x80860001) {
cpuid(0x80860001, &dummy, &cpu_rev, &cpu_freq, &cpu_flags);
if (cpu_rev != 0x02000000) {
- printk(KERN_INFO "CPU: Processor revision %u.%u.%u.%u, %u MHz\n",
+ pr_info("CPU: Processor revision %u.%u.%u.%u, %u MHz\n",
(cpu_rev >> 24) & 0xff,
(cpu_rev >> 16) & 0xff,
(cpu_rev >> 8) & 0xff,
@@ -45,10 +47,10 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
if (max >= 0x80860002) {
cpuid(0x80860002, &new_cpu_rev, &cms_rev1, &cms_rev2, &dummy);
if (cpu_rev == 0x02000000) {
- printk(KERN_INFO "CPU: Processor revision %08X, %u MHz\n",
+ pr_info("CPU: Processor revision %08X, %u MHz\n",
new_cpu_rev, cpu_freq);
}
- printk(KERN_INFO "CPU: Code Morphing Software revision %u.%u.%u-%u-%u\n",
+ pr_info("CPU: Code Morphing Software revision %u.%u.%u-%u-%u\n",
(cms_rev1 >> 24) & 0xff,
(cms_rev1 >> 16) & 0xff,
(cms_rev1 >> 8) & 0xff,
@@ -77,13 +79,13 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
(void *)&cpu_info[56],
(void *)&cpu_info[60]);
cpu_info[64] = '\0';
- printk(KERN_INFO "CPU: %s\n", cpu_info);
+ pr_info("CPU: %s\n", cpu_info);
}
/* Unhide possibly hidden capability flags */
rdmsr(0x80860004, cap_mask, uk);
wrmsr(0x80860004, ~0, uk);
- c->x86_capability[0] = cpuid_edx(0x00000001);
+ c->x86_capability[CPUID_1_EDX] = cpuid_edx(0x00000001);
wrmsr(0x80860004, cap_mask, uk);
/* All Transmeta CPUs have a constant TSC */
@@ -98,7 +100,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
#endif
}
-static const struct cpu_dev __cpuinitconst transmeta_cpu_dev = {
+static const struct cpu_dev transmeta_cpu_dev = {
.c_vendor = "Transmeta",
.c_ident = { "GenuineTMx86", "TransmetaCPU" },
.c_early_init = early_init_transmeta,
diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c
new file mode 100644
index 000000000000..209b5a22d880
--- /dev/null
+++ b/arch/x86/kernel/cpu/tsx.c
@@ -0,0 +1,267 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel Transactional Synchronization Extensions (TSX) control.
+ *
+ * Copyright (C) 2019-2021 Intel Corporation
+ *
+ * Author:
+ * Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
+ */
+
+#include <linux/cpufeature.h>
+
+#include <asm/cmdline.h>
+#include <asm/cpu.h>
+#include <asm/msr.h>
+
+#include "cpu.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) "tsx: " fmt
+
+enum tsx_ctrl_states {
+ TSX_CTRL_AUTO,
+ TSX_CTRL_ENABLE,
+ TSX_CTRL_DISABLE,
+ TSX_CTRL_RTM_ALWAYS_ABORT,
+ TSX_CTRL_NOT_SUPPORTED,
+};
+
+static enum tsx_ctrl_states tsx_ctrl_state __ro_after_init =
+ IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_AUTO) ? TSX_CTRL_AUTO :
+ IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_OFF) ? TSX_CTRL_DISABLE : TSX_CTRL_ENABLE;
+
+static void tsx_disable(void)
+{
+ u64 tsx;
+
+ rdmsrq(MSR_IA32_TSX_CTRL, tsx);
+
+ /* Force all transactions to immediately abort */
+ tsx |= TSX_CTRL_RTM_DISABLE;
+
+ /*
+ * Ensure TSX support is not enumerated in CPUID.
+ * This is visible to userspace and will ensure they
+ * do not waste resources trying TSX transactions that
+ * will always abort.
+ */
+ tsx |= TSX_CTRL_CPUID_CLEAR;
+
+ wrmsrq(MSR_IA32_TSX_CTRL, tsx);
+}
+
+static void tsx_enable(void)
+{
+ u64 tsx;
+
+ rdmsrq(MSR_IA32_TSX_CTRL, tsx);
+
+ /* Enable the RTM feature in the cpu */
+ tsx &= ~TSX_CTRL_RTM_DISABLE;
+
+ /*
+ * Ensure TSX support is enumerated in CPUID.
+ * This is visible to userspace and will ensure they
+ * can enumerate and use the TSX feature.
+ */
+ tsx &= ~TSX_CTRL_CPUID_CLEAR;
+
+ wrmsrq(MSR_IA32_TSX_CTRL, tsx);
+}
+
+static enum tsx_ctrl_states x86_get_tsx_auto_mode(void)
+{
+ if (boot_cpu_has_bug(X86_BUG_TAA))
+ return TSX_CTRL_DISABLE;
+
+ return TSX_CTRL_ENABLE;
+}
+
+/*
+ * Disabling TSX is not a trivial business.
+ *
+ * First of all, there's a CPUID bit: X86_FEATURE_RTM_ALWAYS_ABORT
+ * which says that TSX is practically disabled (all transactions are
+ * aborted by default). When that bit is set, the kernel unconditionally
+ * disables TSX.
+ *
+ * In order to do that, however, it needs to dance a bit:
+ *
+ * 1. The first method to disable it is through MSR_TSX_FORCE_ABORT and
+ * the MSR is present only when *two* CPUID bits are set:
+ *
+ * - X86_FEATURE_RTM_ALWAYS_ABORT
+ * - X86_FEATURE_TSX_FORCE_ABORT
+ *
+ * 2. The second method is for CPUs which do not have the above-mentioned
+ * MSR: those use a different MSR - MSR_IA32_TSX_CTRL and disable TSX
+ * through that one. Those CPUs can also have the initially mentioned
+ * CPUID bit X86_FEATURE_RTM_ALWAYS_ABORT set and for those the same strategy
+ * applies: TSX gets disabled unconditionally.
+ *
+ * When either of the two methods are present, the kernel disables TSX and
+ * clears the respective RTM and HLE feature flags.
+ *
+ * An additional twist in the whole thing is late microcode loading, which,
+ * when done, may cause the X86_FEATURE_RTM_ALWAYS_ABORT CPUID bit to be
+ * set after the update.
+ *
+ * A subsequent hotplug operation on any logical CPU except the BSP will
+ * cause the supported CPUID feature bits to get re-detected. If RTM and
+ * HLE get cleared all of a sudden, but userspace consulted them before
+ * the update, then funny explosions will happen. Long story short: the
+ * kernel doesn't modify CPUID feature bits after booting.
+ *
+ * That's why, this function's call in init_intel() doesn't clear the
+ * feature flags.
+ */
+static void tsx_clear_cpuid(void)
+{
+ u64 msr;
+
+ /*
+ * MSR_TFA_TSX_CPUID_CLEAR bit is only present when both CPUID
+ * bits RTM_ALWAYS_ABORT and TSX_FORCE_ABORT are present.
+ */
+ if (boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT) &&
+ boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) {
+ rdmsrq(MSR_TSX_FORCE_ABORT, msr);
+ msr |= MSR_TFA_TSX_CPUID_CLEAR;
+ wrmsrq(MSR_TSX_FORCE_ABORT, msr);
+ } else if (cpu_feature_enabled(X86_FEATURE_MSR_TSX_CTRL)) {
+ rdmsrq(MSR_IA32_TSX_CTRL, msr);
+ msr |= TSX_CTRL_CPUID_CLEAR;
+ wrmsrq(MSR_IA32_TSX_CTRL, msr);
+ }
+}
+
+/*
+ * Disable TSX development mode
+ *
+ * When the microcode released in Feb 2022 is applied, TSX will be disabled by
+ * default on some processors. MSR 0x122 (TSX_CTRL) and MSR 0x123
+ * (IA32_MCU_OPT_CTRL) can be used to re-enable TSX for development, doing so is
+ * (IA32_MCU_OPT_CTRL) can be used to re-enable TSX for development, but doing so is
+ * flows for mitigation of the Intel TSX Asynchronous Abort (TAA) transient
+ * execution attack may not be effective on these processors when Intel TSX is
+ * enabled with updated microcode.
+ */
+static void tsx_dev_mode_disable(void)
+{
+ u64 mcu_opt_ctrl;
+
+ /* Check if RTM_ALLOW exists */
+ if (!boot_cpu_has_bug(X86_BUG_TAA) ||
+ !cpu_feature_enabled(X86_FEATURE_MSR_TSX_CTRL) ||
+ !cpu_feature_enabled(X86_FEATURE_SRBDS_CTRL))
+ return;
+
+ rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl);
+
+ if (mcu_opt_ctrl & RTM_ALLOW) {
+ mcu_opt_ctrl &= ~RTM_ALLOW;
+ wrmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl);
+ setup_force_cpu_cap(X86_FEATURE_RTM_ALWAYS_ABORT);
+ }
+}
+
+static int __init tsx_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "on")) {
+ tsx_ctrl_state = TSX_CTRL_ENABLE;
+ } else if (!strcmp(str, "off")) {
+ tsx_ctrl_state = TSX_CTRL_DISABLE;
+ } else if (!strcmp(str, "auto")) {
+ tsx_ctrl_state = TSX_CTRL_AUTO;
+ } else {
+ tsx_ctrl_state = TSX_CTRL_DISABLE;
+ pr_err("invalid option, defaulting to off\n");
+ }
+
+ return 0;
+}
+early_param("tsx", tsx_parse_cmdline);
+
+void __init tsx_init(void)
+{
+ tsx_dev_mode_disable();
+
+ /*
+ * Hardware will always abort a TSX transaction when the CPUID bit
+ * RTM_ALWAYS_ABORT is set. In this case, it is better not to enumerate
+ * CPUID.RTM and CPUID.HLE bits. Clear them here.
+ */
+ if (boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT)) {
+ tsx_ctrl_state = TSX_CTRL_RTM_ALWAYS_ABORT;
+ tsx_clear_cpuid();
+ setup_clear_cpu_cap(X86_FEATURE_RTM);
+ setup_clear_cpu_cap(X86_FEATURE_HLE);
+ return;
+ }
+
+ /*
+ * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this
+ * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES.
+ *
+ * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a
+ * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES
+ * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get
+ * MSR_IA32_TSX_CTRL support even after a microcode update. Thus,
+ * tsx= cmdline requests will do nothing on CPUs without
+ * MSR_IA32_TSX_CTRL support.
+ */
+ if (x86_read_arch_cap_msr() & ARCH_CAP_TSX_CTRL_MSR) {
+ setup_force_cpu_cap(X86_FEATURE_MSR_TSX_CTRL);
+ } else {
+ tsx_ctrl_state = TSX_CTRL_NOT_SUPPORTED;
+ return;
+ }
+
+ if (tsx_ctrl_state == TSX_CTRL_AUTO)
+ tsx_ctrl_state = x86_get_tsx_auto_mode();
+
+ if (tsx_ctrl_state == TSX_CTRL_DISABLE) {
+ tsx_disable();
+
+ /*
+ * tsx_disable() will change the state of the RTM and HLE CPUID
+ * bits. Clear them here since they are now expected to be not
+ * set.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_RTM);
+ setup_clear_cpu_cap(X86_FEATURE_HLE);
+ } else if (tsx_ctrl_state == TSX_CTRL_ENABLE) {
+
+ /*
+ * HW defaults TSX to be enabled at bootup.
+ * We may still need the TSX enable support
+ * during init for special cases like
+ * kexec after TSX is disabled.
+ */
+ tsx_enable();
+
+ /*
+ * tsx_enable() will change the state of the RTM and HLE CPUID
+ * bits. Force them here since they are now expected to be set.
+ */
+ setup_force_cpu_cap(X86_FEATURE_RTM);
+ setup_force_cpu_cap(X86_FEATURE_HLE);
+ }
+}
+
+void tsx_ap_init(void)
+{
+ tsx_dev_mode_disable();
+
+ if (tsx_ctrl_state == TSX_CTRL_ENABLE)
+ tsx_enable();
+ else if (tsx_ctrl_state == TSX_CTRL_DISABLE)
+ tsx_disable();
+ else if (tsx_ctrl_state == TSX_CTRL_RTM_ALWAYS_ABORT)
+ /* See comment over that function for more details. */
+ tsx_clear_cpuid();
+}
diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c
index fd2c37bf7acb..65a58a390fc3 100644
--- a/arch/x86/kernel/cpu/umc.c
+++ b/arch/x86/kernel/cpu/umc.c
@@ -1,5 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
-#include <linux/init.h>
#include <asm/processor.h>
#include "cpu.h"
@@ -8,11 +8,11 @@
* so no special init takes place.
*/
-static const struct cpu_dev __cpuinitconst umc_cpu_dev = {
+static const struct cpu_dev umc_cpu_dev = {
.c_vendor = "UMC",
.c_ident = { "UMC UMC UMC" },
- .c_models = {
- { .vendor = X86_VENDOR_UMC, .family = 4, .model_names =
+ .legacy_models = {
+ { .family = 4, .model_names =
{
[1] = "U5D",
[2] = "U5S",
diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c
new file mode 100644
index 000000000000..e4a31c536642
--- /dev/null
+++ b/arch/x86/kernel/cpu/umwait.c
@@ -0,0 +1,246 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/syscore_ops.h>
+#include <linux/suspend.h>
+#include <linux/cpu.h>
+
+#include <asm/msr.h>
+#include <asm/mwait.h>
+
+#define UMWAIT_C02_ENABLE 0
+
+#define UMWAIT_CTRL_VAL(max_time, c02_disable) \
+ (((max_time) & MSR_IA32_UMWAIT_CONTROL_TIME_MASK) | \
+ ((c02_disable) & MSR_IA32_UMWAIT_CONTROL_C02_DISABLE))
+
+/*
+ * Cache IA32_UMWAIT_CONTROL MSR. This is a systemwide control. By default,
+ * umwait max time is 100000 in TSC-quanta and C0.2 is enabled
+ */
+static u32 umwait_control_cached = UMWAIT_CTRL_VAL(100000, UMWAIT_C02_ENABLE);
+
+/*
+ * Cache the original IA32_UMWAIT_CONTROL MSR value which is configured by
+ * hardware or BIOS before kernel boot.
+ */
+static u32 orig_umwait_control_cached __ro_after_init;
+
+/*
+ * Serialize access to umwait_control_cached and IA32_UMWAIT_CONTROL MSR in
+ * the sysfs write functions.
+ */
+static DEFINE_MUTEX(umwait_lock);
+
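+/* Write the cached control value to this CPU's IA32_UMWAIT_CONTROL MSR */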
+static void umwait_update_control_msr(void * unused)
+{
+ lockdep_assert_irqs_disabled();
+ wrmsrq(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached));
+}
+
+/*
+ * The CPU hotplug callback sets the control MSR to the global control
+ * value.
+ *
+ * Disable interrupts so the read of umwait_control_cached and the WRMSR
+ * are protected against a concurrent sysfs write. Otherwise the sysfs
+ * write could update the cached value after it had been read on this CPU
+ * and issue the IPI before the old value had been written. The IPI would
+ * interrupt, write the new value and after return from IPI the previous
+ * value would be written by this CPU.
+ *
+ * With interrupts disabled the upcoming CPU either sees the new control
+ * value or the IPI is updating this CPU to the new control value after
+ * interrupts have been reenabled.
+ */
+static int umwait_cpu_online(unsigned int cpu)
+{
+ local_irq_disable();
+ umwait_update_control_msr(NULL);
+ local_irq_enable();
+ return 0;
+}
+
+/*
+ * The CPU hotplug callback sets the control MSR to the original control
+ * value.
+ */
+static int umwait_cpu_offline(unsigned int cpu)
+{
+ /*
+ * This code is protected by the CPU hotplug already and
+ * orig_umwait_control_cached is never changed after it caches
+ * the original control MSR value in umwait_init(). So there
+ * is no race condition here.
+ */
+ wrmsrq(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached);
+
+ return 0;
+}
+
+/*
+ * On resume, restore IA32_UMWAIT_CONTROL MSR on the boot processor which
+ * is the only active CPU at this time. The MSR is set up on the APs via the
+ * CPU hotplug callback.
+ *
+ * This function is invoked on resume from suspend and hibernation. On
+ * resume from suspend the restore should not be required, but we neither
+ * trust the firmware nor does it matter if the same value is written
+ * again.
+ */
+static void umwait_syscore_resume(void *data)
+{
+ umwait_update_control_msr(NULL);
+}
+
+static const struct syscore_ops umwait_syscore_ops = {
+ .resume = umwait_syscore_resume,
+};
+
+static struct syscore umwait_syscore = {
+ .ops = &umwait_syscore_ops,
+};
+
+/* sysfs interface */
+
+/*
+ * When bit 0 in IA32_UMWAIT_CONTROL MSR is 1, C0.2 is disabled.
+ * Otherwise, C0.2 is enabled.
+ */
+static inline bool umwait_ctrl_c02_enabled(u32 ctrl)
+{
+ return !(ctrl & MSR_IA32_UMWAIT_CONTROL_C02_DISABLE);
+}
+
+static inline u32 umwait_ctrl_max_time(u32 ctrl)
+{
+ return ctrl & MSR_IA32_UMWAIT_CONTROL_TIME_MASK;
+}
+
+static inline void umwait_update_control(u32 maxtime, bool c02_enable)
+{
+ u32 ctrl = maxtime & MSR_IA32_UMWAIT_CONTROL_TIME_MASK;
+
+ if (!c02_enable)
+ ctrl |= MSR_IA32_UMWAIT_CONTROL_C02_DISABLE;
+
+ WRITE_ONCE(umwait_control_cached, ctrl);
+ /* Propagate to all CPUs */
+ on_each_cpu(umwait_update_control_msr, NULL, 1);
+}
+
+static ssize_t
+enable_c02_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ u32 ctrl = READ_ONCE(umwait_control_cached);
+
+ return sprintf(buf, "%d\n", umwait_ctrl_c02_enabled(ctrl));
+}
+
+static ssize_t enable_c02_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ bool c02_enable;
+ u32 ctrl;
+ int ret;
+
+ ret = kstrtobool(buf, &c02_enable);
+ if (ret)
+ return ret;
+
+ mutex_lock(&umwait_lock);
+
+ ctrl = READ_ONCE(umwait_control_cached);
+ if (c02_enable != umwait_ctrl_c02_enabled(ctrl))
+ umwait_update_control(ctrl, c02_enable);
+
+ mutex_unlock(&umwait_lock);
+
+ return count;
+}
+static DEVICE_ATTR_RW(enable_c02);
+
+static ssize_t
+max_time_show(struct device *kobj, struct device_attribute *attr, char *buf)
+{
+ u32 ctrl = READ_ONCE(umwait_control_cached);
+
+ return sprintf(buf, "%u\n", umwait_ctrl_max_time(ctrl));
+}
+
+static ssize_t max_time_store(struct device *kobj,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ u32 max_time, ctrl;
+ int ret;
+
+ ret = kstrtou32(buf, 0, &max_time);
+ if (ret)
+ return ret;
+
+ /* bits[1:0] must be zero */
+ if (max_time & ~MSR_IA32_UMWAIT_CONTROL_TIME_MASK)
+ return -EINVAL;
+
+ mutex_lock(&umwait_lock);
+
+ ctrl = READ_ONCE(umwait_control_cached);
+ if (max_time != umwait_ctrl_max_time(ctrl))
+ umwait_update_control(max_time, umwait_ctrl_c02_enabled(ctrl));
+
+ mutex_unlock(&umwait_lock);
+
+ return count;
+}
+static DEVICE_ATTR_RW(max_time);
+
+static struct attribute *umwait_attrs[] = {
+ &dev_attr_enable_c02.attr,
+ &dev_attr_max_time.attr,
+ NULL
+};
+
+static struct attribute_group umwait_attr_group = {
+ .attrs = umwait_attrs,
+ .name = "umwait_control",
+};
+
+static int __init umwait_init(void)
+{
+ struct device *dev;
+ int ret;
+
+ if (!boot_cpu_has(X86_FEATURE_WAITPKG))
+ return -ENODEV;
+
+ /*
+ * Cache the original control MSR value before the control MSR is
+ * changed. This is the only place where orig_umwait_control_cached
+ * is modified.
+ */
+ rdmsrq(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached);
+
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "umwait:online",
+ umwait_cpu_online, umwait_cpu_offline);
+ if (ret < 0) {
+ /*
+ * On failure, the control MSR on all CPUs has the
+ * original control value.
+ */
+ return ret;
+ }
+
+ register_syscore(&umwait_syscore);
+
+ /*
+ * Add umwait control interface. Ignore failure, so at least the
+ * default values are set up in case the machine manages to boot.
+ */
+ dev = bus_get_dev_root(&cpu_subsys);
+ if (dev) {
+ ret = sysfs_create_group(&dev->kobj, &umwait_attr_group);
+ put_device(dev);
+ }
+ return ret;
+}
+device_initcall(umwait_init);
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 284c399e3234..cb3f900c46fc 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -22,76 +22,355 @@
*/
#include <linux/dmi.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/clocksource.h>
+#include <linux/cpu.h>
+#include <linux/efi.h>
+#include <linux/reboot.h>
+#include <linux/static_call.h>
#include <asm/div64.h>
+#include <asm/x86_init.h>
+#include <asm/hypervisor.h>
+#include <asm/timer.h>
+#include <asm/apic.h>
#include <asm/vmware.h>
+#include <asm/svm.h>
-#define CPUID_VMWARE_INFO_LEAF 0x40000000
-#define VMWARE_HYPERVISOR_MAGIC 0x564D5868
-#define VMWARE_HYPERVISOR_PORT 0x5658
+#undef pr_fmt
+#define pr_fmt(fmt) "vmware: " fmt
-#define VMWARE_PORT_CMD_GETVERSION 10
-#define VMWARE_PORT_CMD_GETHZ 45
+#define CPUID_VMWARE_INFO_LEAF 0x40000000
+#define CPUID_VMWARE_FEATURES_LEAF 0x40000010
-#define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \
- __asm__("inl (%%dx)" : \
- "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \
- "0"(VMWARE_HYPERVISOR_MAGIC), \
- "1"(VMWARE_PORT_CMD_##cmd), \
- "2"(VMWARE_HYPERVISOR_PORT), "3"(UINT_MAX) : \
- "memory");
+#define GETVCPU_INFO_LEGACY_X2APIC BIT(3)
+#define GETVCPU_INFO_VCPU_RESERVED BIT(31)
+
+#define STEALCLOCK_NOT_AVAILABLE (-1)
+#define STEALCLOCK_DISABLED 0
+#define STEALCLOCK_ENABLED 1
+
+struct vmware_steal_time {
+ union {
+ u64 clock; /* stolen time counter in units of vtsc */
+ struct {
+ /* only for little-endian */
+ u32 clock_low;
+ u32 clock_high;
+ };
+ };
+ u64 reserved[7];
+};
+
+static unsigned long vmware_tsc_khz __ro_after_init;
+static u8 vmware_hypercall_mode __ro_after_init;
+
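+/* Slow-path hypercall: issue VMCALL, VMMCALL or the legacy I/O port access depending on the detected mode */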
+unsigned long vmware_hypercall_slow(unsigned long cmd,
+ unsigned long in1, unsigned long in3,
+ unsigned long in4, unsigned long in5,
+ u32 *out1, u32 *out2, u32 *out3,
+ u32 *out4, u32 *out5)
+{
+ unsigned long out0, rbx, rcx, rdx, rsi, rdi;
+
+ switch (vmware_hypercall_mode) {
+ case CPUID_VMWARE_FEATURES_ECX_VMCALL:
+ asm_inline volatile ("vmcall"
+ : "=a" (out0), "=b" (rbx), "=c" (rcx),
+ "=d" (rdx), "=S" (rsi), "=D" (rdi)
+ : "a" (VMWARE_HYPERVISOR_MAGIC),
+ "b" (in1),
+ "c" (cmd),
+ "d" (in3),
+ "S" (in4),
+ "D" (in5)
+ : "cc", "memory");
+ break;
+ case CPUID_VMWARE_FEATURES_ECX_VMMCALL:
+ asm_inline volatile ("vmmcall"
+ : "=a" (out0), "=b" (rbx), "=c" (rcx),
+ "=d" (rdx), "=S" (rsi), "=D" (rdi)
+ : "a" (VMWARE_HYPERVISOR_MAGIC),
+ "b" (in1),
+ "c" (cmd),
+ "d" (in3),
+ "S" (in4),
+ "D" (in5)
+ : "cc", "memory");
+ break;
+ default:
+ asm_inline volatile ("movw %[port], %%dx; inl (%%dx), %%eax"
+ : "=a" (out0), "=b" (rbx), "=c" (rcx),
+ "=d" (rdx), "=S" (rsi), "=D" (rdi)
+ : [port] "i" (VMWARE_HYPERVISOR_PORT),
+ "a" (VMWARE_HYPERVISOR_MAGIC),
+ "b" (in1),
+ "c" (cmd),
+ "d" (in3),
+ "S" (in4),
+ "D" (in5)
+ : "cc", "memory");
+ break;
+ }
+
+ if (out1)
+ *out1 = rbx;
+ if (out2)
+ *out2 = rcx;
+ if (out3)
+ *out3 = rdx;
+ if (out4)
+ *out4 = rsi;
+ if (out5)
+ *out5 = rdi;
+
+ return out0;
+}
static inline int __vmware_platform(void)
{
- uint32_t eax, ebx, ecx, edx;
- VMWARE_PORT(GETVERSION, eax, ebx, ecx, edx);
- return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC;
+ u32 eax, ebx, ecx;
+
+ eax = vmware_hypercall3(VMWARE_CMD_GETVERSION, 0, &ebx, &ecx);
+ return eax != UINT_MAX && ebx == VMWARE_HYPERVISOR_MAGIC;
}
-static unsigned long __vmware_get_tsc_khz(void)
+static unsigned long vmware_get_tsc_khz(void)
{
- uint64_t tsc_hz;
- uint32_t eax, ebx, ecx, edx;
+ return vmware_tsc_khz;
+}
- VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
+#ifdef CONFIG_PARAVIRT
+static struct cyc2ns_data vmware_cyc2ns __ro_after_init;
+static bool vmw_sched_clock __initdata = true;
+static DEFINE_PER_CPU_DECRYPTED(struct vmware_steal_time, vmw_steal_time) __aligned(64);
+static bool has_steal_clock;
+static bool steal_acc __initdata = true; /* steal time accounting */
- if (ebx == UINT_MAX)
- return 0;
- tsc_hz = eax | (((uint64_t)ebx) << 32);
- do_div(tsc_hz, 1000);
- BUG_ON(tsc_hz >> 32);
- return tsc_hz;
+static __init int setup_vmw_sched_clock(char *s)
+{
+ vmw_sched_clock = false;
+ return 0;
}
+early_param("no-vmw-sched-clock", setup_vmw_sched_clock);
-/*
- * While checking the dmi string infomation, just checking the product
- * serial key should be enough, as this will always have a VMware
- * specific string when running under VMware hypervisor.
+static __init int parse_no_stealacc(char *arg)
+{
+ steal_acc = false;
+ return 0;
+}
+early_param("no-steal-acc", parse_no_stealacc);
+
+static noinstr u64 vmware_sched_clock(void)
+{
+ unsigned long long ns;
+
+ ns = mul_u64_u32_shr(rdtsc(), vmware_cyc2ns.cyc2ns_mul,
+ vmware_cyc2ns.cyc2ns_shift);
+ ns -= vmware_cyc2ns.cyc2ns_offset;
+ return ns;
+}
+
+static void __init vmware_cyc2ns_setup(void)
+{
+ struct cyc2ns_data *d = &vmware_cyc2ns;
+ unsigned long long tsc_now = rdtsc();
+
+ clocks_calc_mult_shift(&d->cyc2ns_mul, &d->cyc2ns_shift,
+ vmware_tsc_khz, NSEC_PER_MSEC, 0);
+ d->cyc2ns_offset = mul_u64_u32_shr(tsc_now, d->cyc2ns_mul,
+ d->cyc2ns_shift);
+
+ pr_info("using clock offset of %llu ns\n", d->cyc2ns_offset);
+}
+
+static int vmware_cmd_stealclock(u32 addr_hi, u32 addr_lo)
+{
+ u32 info;
+
+ return vmware_hypercall5(VMWARE_CMD_STEALCLOCK, 0, 0, addr_hi, addr_lo,
+ &info);
+}
+
+static bool stealclock_enable(phys_addr_t pa)
+{
+ return vmware_cmd_stealclock(upper_32_bits(pa),
+ lower_32_bits(pa)) == STEALCLOCK_ENABLED;
+}
+
+static int __stealclock_disable(void)
+{
+ return vmware_cmd_stealclock(0, 1);
+}
+
+static void stealclock_disable(void)
+{
+ __stealclock_disable();
+}
+
+static bool vmware_is_stealclock_available(void)
+{
+ return __stealclock_disable() != STEALCLOCK_NOT_AVAILABLE;
+}
+
+/**
+ * vmware_steal_clock() - read the per-cpu steal clock
+ * @cpu: the cpu number whose steal clock we want to read
+ *
+ * The function reads the steal clock if we are on a 64-bit system, otherwise
+ * reads it in parts, checking that the high part didn't change in the
+ * meantime.
+ *
+ * Return:
+ * The steal clock reading in ns.
*/
-int vmware_platform(void)
-{
- if (cpu_has_hypervisor) {
- unsigned int eax, ebx, ecx, edx;
- char hyper_vendor_id[13];
-
- cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &ebx, &ecx, &edx);
- memcpy(hyper_vendor_id + 0, &ebx, 4);
- memcpy(hyper_vendor_id + 4, &ecx, 4);
- memcpy(hyper_vendor_id + 8, &edx, 4);
- hyper_vendor_id[12] = '\0';
- if (!strcmp(hyper_vendor_id, "VMwareVMware"))
- return 1;
- } else if (dmi_available && dmi_name_in_serial("VMware") &&
- __vmware_platform())
- return 1;
+static u64 vmware_steal_clock(int cpu)
+{
+ struct vmware_steal_time *steal = &per_cpu(vmw_steal_time, cpu);
+ u64 clock;
+
+ if (IS_ENABLED(CONFIG_64BIT))
+ clock = READ_ONCE(steal->clock);
+ else {
+ u32 initial_high, low, high;
+
+ do {
+ initial_high = READ_ONCE(steal->clock_high);
+ /* Do not reorder initial_high and high readings */
+ virt_rmb();
+ low = READ_ONCE(steal->clock_low);
+ /* Keep low reading in between */
+ virt_rmb();
+ high = READ_ONCE(steal->clock_high);
+ } while (initial_high != high);
+
+ clock = ((u64)high << 32) | low;
+ }
+
+ return mul_u64_u32_shr(clock, vmware_cyc2ns.cyc2ns_mul,
+ vmware_cyc2ns.cyc2ns_shift);
+}
+
+static void vmware_register_steal_time(void)
+{
+ int cpu = smp_processor_id();
+ struct vmware_steal_time *st = &per_cpu(vmw_steal_time, cpu);
+
+ if (!has_steal_clock)
+ return;
+
+ if (!stealclock_enable(slow_virt_to_phys(st))) {
+ has_steal_clock = false;
+ return;
+ }
+
+ pr_info("vmware-stealtime: cpu %d, pa %llx\n",
+ cpu, (unsigned long long) slow_virt_to_phys(st));
+}
+
+static void vmware_disable_steal_time(void)
+{
+ if (!has_steal_clock)
+ return;
+
+ stealclock_disable();
+}
+
+static void vmware_guest_cpu_init(void)
+{
+ if (has_steal_clock)
+ vmware_register_steal_time();
+}
+
+static void vmware_pv_guest_cpu_reboot(void *unused)
+{
+ vmware_disable_steal_time();
+}
+
+static int vmware_pv_reboot_notify(struct notifier_block *nb,
+ unsigned long code, void *unused)
+{
+ if (code == SYS_RESTART)
+ on_each_cpu(vmware_pv_guest_cpu_reboot, NULL, 1);
+ return NOTIFY_DONE;
+}
+static struct notifier_block vmware_pv_reboot_nb = {
+ .notifier_call = vmware_pv_reboot_notify,
+};
+
+#ifdef CONFIG_SMP
+static void __init vmware_smp_prepare_boot_cpu(void)
+{
+ vmware_guest_cpu_init();
+ native_smp_prepare_boot_cpu();
+}
+
+static int vmware_cpu_online(unsigned int cpu)
+{
+ local_irq_disable();
+ vmware_guest_cpu_init();
+ local_irq_enable();
return 0;
}
-unsigned long vmware_get_tsc_khz(void)
+static int vmware_cpu_down_prepare(unsigned int cpu)
{
- BUG_ON(!vmware_platform());
- return __vmware_get_tsc_khz();
+ local_irq_disable();
+ vmware_disable_steal_time();
+ local_irq_enable();
+ return 0;
+}
+#endif
+
+static __init int activate_jump_labels(void)
+{
+ if (has_steal_clock) {
+ static_key_slow_inc(&paravirt_steal_enabled);
+ if (steal_acc)
+ static_key_slow_inc(&paravirt_steal_rq_enabled);
+ }
+
+ return 0;
+}
+arch_initcall(activate_jump_labels);
+
+static void __init vmware_paravirt_ops_setup(void)
+{
+ pv_info.name = "VMware hypervisor";
+ pv_ops.cpu.io_delay = paravirt_nop;
+
+ if (vmware_tsc_khz == 0)
+ return;
+
+ vmware_cyc2ns_setup();
+
+ if (vmw_sched_clock)
+ paravirt_set_sched_clock(vmware_sched_clock);
+
+ if (vmware_is_stealclock_available()) {
+ has_steal_clock = true;
+ static_call_update(pv_steal_clock, vmware_steal_clock);
+
+		/* We use the reboot notifier only to disable the steal clock */
+ register_reboot_notifier(&vmware_pv_reboot_nb);
+
+#ifdef CONFIG_SMP
+ smp_ops.smp_prepare_boot_cpu =
+ vmware_smp_prepare_boot_cpu;
+ if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+ "x86/vmware:online",
+ vmware_cpu_online,
+ vmware_cpu_down_prepare) < 0)
+ pr_err("vmware_guest: Failed to install cpu hotplug callbacks\n");
+#else
+ vmware_guest_cpu_init();
+#endif
+ }
}
+#else
+#define vmware_paravirt_ops_setup() do {} while (0)
+#endif
/*
* VMware hypervisor takes care of exporting a reliable TSC to the guest.
@@ -105,8 +384,210 @@ unsigned long vmware_get_tsc_khz(void)
* so that the kernel could just trust the hypervisor with providing a
* reliable virtual TSC that is suitable for timekeeping.
*/
-void __cpuinit vmware_set_feature_bits(struct cpuinfo_x86 *c)
+static void __init vmware_set_capabilities(void)
{
- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
- set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE);
+ setup_force_cpu_cap(X86_FEATURE_CONSTANT_TSC);
+ setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+ if (vmware_tsc_khz)
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+ if (vmware_hypercall_mode == CPUID_VMWARE_FEATURES_ECX_VMCALL)
+ setup_force_cpu_cap(X86_FEATURE_VMCALL);
+ else if (vmware_hypercall_mode == CPUID_VMWARE_FEATURES_ECX_VMMCALL)
+ setup_force_cpu_cap(X86_FEATURE_VMW_VMMCALL);
}
+
+static void __init vmware_platform_setup(void)
+{
+ u32 eax, ebx, ecx;
+ u64 lpj, tsc_khz;
+
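+	/*
+	 * GETHZ returns the TSC frequency in Hz as ebx:eax (ebx == UINT_MAX
+	 * means the query failed) and the host bus clock in Hz in ecx.
+	 */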
+ eax = vmware_hypercall3(VMWARE_CMD_GETHZ, UINT_MAX, &ebx, &ecx);
+
+ if (ebx != UINT_MAX) {
+ lpj = tsc_khz = eax | (((u64)ebx) << 32);
+ do_div(tsc_khz, 1000);
+ WARN_ON(tsc_khz >> 32);
+ pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n",
+ (unsigned long) tsc_khz / 1000,
+ (unsigned long) tsc_khz % 1000);
+
+ if (!preset_lpj) {
+ do_div(lpj, HZ);
+ preset_lpj = lpj;
+ }
+
+ vmware_tsc_khz = tsc_khz;
+ x86_platform.calibrate_tsc = vmware_get_tsc_khz;
+ x86_platform.calibrate_cpu = vmware_get_tsc_khz;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ /* Skip lapic calibration since we know the bus frequency. */
+ lapic_timer_period = ecx / HZ;
+ pr_info("Host bus clock speed read from hypervisor : %u Hz\n",
+ ecx);
+#endif
+ } else {
+ pr_warn("Failed to get TSC freq from the hypervisor\n");
+ }
+
+ if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP) && !efi_enabled(EFI_BOOT))
+ x86_init.mpparse.find_mptable = mpparse_find_mptable;
+
+ vmware_paravirt_ops_setup();
+
+#ifdef CONFIG_X86_IO_APIC
+ no_timer_check = 1;
+#endif
+
+ vmware_set_capabilities();
+}
+
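+/*
+ * Read the VMware features CPUID leaf and return the VMCALL/VMMCALL bits,
+ * i.e. which hypercall instruction the hypervisor expects the guest to use.
+ */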
+static u8 __init vmware_select_hypercall(void)
+{
+ int eax, ebx, ecx, edx;
+
+ cpuid(CPUID_VMWARE_FEATURES_LEAF, &eax, &ebx, &ecx, &edx);
+ return (ecx & (CPUID_VMWARE_FEATURES_ECX_VMMCALL |
+ CPUID_VMWARE_FEATURES_ECX_VMCALL));
+}
+
+/*
+ * When checking the DMI string information, checking just the product
+ * serial key should be enough, as it always contains a VMware-specific
+ * string when running under the VMware hypervisor.
+ * If !boot_cpu_has(X86_FEATURE_HYPERVISOR), vmware_hypercall_mode
+ * intentionally defaults to 0.
+ */
+static u32 __init vmware_platform(void)
+{
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+ unsigned int eax;
+ unsigned int hyper_vendor_id[3];
+
+ cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &hyper_vendor_id[0],
+ &hyper_vendor_id[1], &hyper_vendor_id[2]);
+ if (!memcmp(hyper_vendor_id, "VMwareVMware", 12)) {
+ if (eax >= CPUID_VMWARE_FEATURES_LEAF)
+ vmware_hypercall_mode =
+ vmware_select_hypercall();
+
+ pr_info("hypercall mode: 0x%02x\n",
+ (unsigned int) vmware_hypercall_mode);
+
+ return CPUID_VMWARE_INFO_LEAF;
+ }
+ } else if (dmi_available && dmi_name_in_serial("VMware") &&
+ __vmware_platform())
+ return 1;
+
+ return 0;
+}
+
+/* Checks if hypervisor supports x2apic without VT-D interrupt remapping. */
+static bool __init vmware_legacy_x2apic_available(void)
+{
+ u32 eax;
+
+ eax = vmware_hypercall1(VMWARE_CMD_GETVCPU_INFO, 0);
+ return !(eax & GETVCPU_INFO_VCPU_RESERVED) &&
+ (eax & GETVCPU_INFO_LEGACY_X2APIC);
+}
+
+#ifdef CONFIG_INTEL_TDX_GUEST
+/*
+ * TDCALL[TDG.VP.VMCALL] uses %rax (arg0) and %rcx (arg2). Therefore,
+ * we remap those registers to %r12 and %r13, respectively.
+ */
+unsigned long vmware_tdx_hypercall(unsigned long cmd,
+ unsigned long in1, unsigned long in3,
+ unsigned long in4, unsigned long in5,
+ u32 *out1, u32 *out2, u32 *out3,
+ u32 *out4, u32 *out5)
+{
+ struct tdx_module_args args = {};
+
+ if (!hypervisor_is_type(X86_HYPER_VMWARE)) {
+ pr_warn_once("Incorrect usage\n");
+ return ULONG_MAX;
+ }
+
+ if (cmd & ~VMWARE_CMD_MASK) {
+ pr_warn_once("Out of range command %lx\n", cmd);
+ return ULONG_MAX;
+ }
+
+ args.rbx = in1;
+ args.rdx = in3;
+ args.rsi = in4;
+ args.rdi = in5;
+ args.r10 = VMWARE_TDX_VENDOR_LEAF;
+ args.r11 = VMWARE_TDX_HCALL_FUNC;
+ args.r12 = VMWARE_HYPERVISOR_MAGIC;
+ args.r13 = cmd;
+ /* CPL */
+ args.r15 = 0;
+
+ __tdx_hypercall(&args);
+
+ if (out1)
+ *out1 = args.rbx;
+ if (out2)
+ *out2 = args.r13;
+ if (out3)
+ *out3 = args.rdx;
+ if (out4)
+ *out4 = args.rsi;
+ if (out5)
+ *out5 = args.rdi;
+
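+	/* %rax was remapped to %r12, so the hypercall result comes back there. */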
+ return args.r12;
+}
+EXPORT_SYMBOL_GPL(vmware_tdx_hypercall);
+#endif
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+static void vmware_sev_es_hcall_prepare(struct ghcb *ghcb,
+ struct pt_regs *regs)
+{
+	/* Copy the VMware-specific hypercall parameters to the GHCB */
+ ghcb_set_rip(ghcb, regs->ip);
+ ghcb_set_rbx(ghcb, regs->bx);
+ ghcb_set_rcx(ghcb, regs->cx);
+ ghcb_set_rdx(ghcb, regs->dx);
+ ghcb_set_rsi(ghcb, regs->si);
+ ghcb_set_rdi(ghcb, regs->di);
+ ghcb_set_rbp(ghcb, regs->bp);
+}
+
+static bool vmware_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs)
+{
+ if (!(ghcb_rbx_is_valid(ghcb) &&
+ ghcb_rcx_is_valid(ghcb) &&
+ ghcb_rdx_is_valid(ghcb) &&
+ ghcb_rsi_is_valid(ghcb) &&
+ ghcb_rdi_is_valid(ghcb) &&
+ ghcb_rbp_is_valid(ghcb)))
+ return false;
+
+ regs->bx = ghcb_get_rbx(ghcb);
+ regs->cx = ghcb_get_rcx(ghcb);
+ regs->dx = ghcb_get_rdx(ghcb);
+ regs->si = ghcb_get_rsi(ghcb);
+ regs->di = ghcb_get_rdi(ghcb);
+ regs->bp = ghcb_get_rbp(ghcb);
+
+ return true;
+}
+#endif
+
+const __initconst struct hypervisor_x86 x86_hyper_vmware = {
+ .name = "VMware",
+ .detect = vmware_platform,
+ .type = X86_HYPER_VMWARE,
+ .init.init_platform = vmware_platform_setup,
+ .init.x2apic_available = vmware_legacy_x2apic_available,
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ .runtime.sev_es_hcall_prepare = vmware_sev_es_hcall_prepare,
+ .runtime.sev_es_hcall_finish = vmware_sev_es_hcall_finish,
+#endif
+};
diff --git a/arch/x86/kernel/cpu/vortex.c b/arch/x86/kernel/cpu/vortex.c
new file mode 100644
index 000000000000..e2685470ba94
--- /dev/null
+++ b/arch/x86/kernel/cpu/vortex.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <asm/processor.h>
+#include "cpu.h"
+
+/*
+ * No special init required for Vortex processors.
+ */
+
+static const struct cpu_dev vortex_cpu_dev = {
+ .c_vendor = "Vortex",
+ .c_ident = { "Vortex86 SoC" },
+ .legacy_models = {
+ {
+ .family = 5,
+ .model_names = {
+ [2] = "Vortex86DX",
+ [8] = "Vortex86MX",
+ },
+ },
+ {
+ .family = 6,
+ .model_names = {
+ /*
+ * Both the Vortex86EX and the Vortex86EX2
+ * have the same family and model id.
+ *
+ * However, the -EX2 supports the product name
+ * CPUID call, so this name will only be used
+ * for the -EX, which does not.
+ */
+ [0] = "Vortex86EX",
+ },
+ },
+ },
+ .c_x86_vendor = X86_VENDOR_VORTEX,
+};
+
+cpu_dev_register(vortex_cpu_dev);
diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c
new file mode 100644
index 000000000000..89b1c8a70fe8
--- /dev/null
+++ b/arch/x86/kernel/cpu/zhaoxin.c
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+
+#include <asm/cpu.h>
+#include <asm/cpufeature.h>
+#include <asm/msr.h>
+
+#include "cpu.h"
+
+#define MSR_ZHAOXIN_FCR57 0x00001257
+
+#define ACE_PRESENT (1 << 6)
+#define ACE_ENABLED (1 << 7)
+#define ACE_FCR (1 << 7) /* MSR_ZHAOXIN_FCR */
+
+#define RNG_PRESENT (1 << 2)
+#define RNG_ENABLED (1 << 3)
+#define RNG_ENABLE (1 << 8) /* MSR_ZHAOXIN_RNG */
+
+static void init_zhaoxin_cap(struct cpuinfo_x86 *c)
+{
+ u32 lo, hi;
+
+ /* Test for Extended Feature Flags presence */
+ if (cpuid_eax(0xC0000000) >= 0xC0000001) {
+ u32 tmp = cpuid_edx(0xC0000001);
+
+ /* Enable ACE unit, if present and disabled */
+ if ((tmp & (ACE_PRESENT | ACE_ENABLED)) == ACE_PRESENT) {
+ rdmsr(MSR_ZHAOXIN_FCR57, lo, hi);
+ /* Enable ACE unit */
+ lo |= ACE_FCR;
+ wrmsr(MSR_ZHAOXIN_FCR57, lo, hi);
+ pr_info("CPU: Enabled ACE h/w crypto\n");
+ }
+
+ /* Enable RNG unit, if present and disabled */
+ if ((tmp & (RNG_PRESENT | RNG_ENABLED)) == RNG_PRESENT) {
+ rdmsr(MSR_ZHAOXIN_FCR57, lo, hi);
+ /* Enable RNG unit */
+ lo |= RNG_ENABLE;
+ wrmsr(MSR_ZHAOXIN_FCR57, lo, hi);
+ pr_info("CPU: Enabled h/w RNG\n");
+ }
+
+ /*
+ * Store Extended Feature Flags as word 5 of the CPU
+ * capability bit array
+ */
+ c->x86_capability[CPUID_C000_0001_EDX] = cpuid_edx(0xC0000001);
+ }
+
+ if (c->x86 >= 0x6)
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+}
+
+static void early_init_zhaoxin(struct cpuinfo_x86 *c)
+{
+ if (c->x86 >= 0x6)
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+#ifdef CONFIG_X86_64
+ set_cpu_cap(c, X86_FEATURE_SYSENTER32);
+#endif
+ if (c->x86_power & (1 << 8)) {
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+ set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
+ }
+}
+
+static void init_zhaoxin(struct cpuinfo_x86 *c)
+{
+ early_init_zhaoxin(c);
+ init_intel_cacheinfo(c);
+
+ if (c->cpuid_level > 9) {
+ unsigned int eax = cpuid_eax(10);
+
+ /*
+		 * Check the version and the number of counters:
+		 * the version (eax[7:0]) can't be 0 and the number of
+		 * counters (eax[15:8]) should be greater than 1.
+ */
+ if ((eax & 0xff) && (((eax >> 8) & 0xff) > 1))
+ set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
+ }
+
+ if (c->x86 >= 0x6)
+ init_zhaoxin_cap(c);
+#ifdef CONFIG_X86_64
+ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+#endif
+
+ init_ia32_feat_ctl(c);
+}
+
+#ifdef CONFIG_X86_32
+static unsigned int
+zhaoxin_size_cache(struct cpuinfo_x86 *c, unsigned int size)
+{
+ return size;
+}
+#endif
+
+static const struct cpu_dev zhaoxin_cpu_dev = {
+ .c_vendor = "zhaoxin",
+ .c_ident = { " Shanghai " },
+ .c_early_init = early_init_zhaoxin,
+ .c_init = init_zhaoxin,
+#ifdef CONFIG_X86_32
+ .legacy_cache_size = zhaoxin_size_cache,
+#endif
+ .c_x86_vendor = X86_VENDOR_ZHAOXIN,
+};
+
+cpu_dev_register(zhaoxin_cpu_dev);
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index b07af8861244..dae436253de4 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -1,13 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* ----------------------------------------------------------------------- *
*
* Copyright 2000-2008 H. Peter Anvin - All Rights Reserved
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139,
- * USA; either version 2 of the License, or (at your option) any later
- * version; incorporated herein by reference.
- *
* ----------------------------------------------------------------------- */
/*
@@ -33,60 +28,42 @@
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/smp.h>
-#include <linux/smp_lock.h>
#include <linux/major.h>
#include <linux/fs.h>
#include <linux/device.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/uaccess.h>
+#include <linux/gfp.h>
+#include <linux/completion.h>
#include <asm/processor.h>
#include <asm/msr.h>
-#include <asm/system.h>
-static struct class *cpuid_class;
+static enum cpuhp_state cpuhp_cpuid_state;
-struct cpuid_regs {
- u32 eax, ebx, ecx, edx;
+struct cpuid_regs_done {
+ struct cpuid_regs regs;
+ struct completion done;
};
static void cpuid_smp_cpuid(void *cmd_block)
{
- struct cpuid_regs *cmd = (struct cpuid_regs *)cmd_block;
+ struct cpuid_regs_done *cmd = cmd_block;
- cpuid_count(cmd->eax, cmd->ecx,
- &cmd->eax, &cmd->ebx, &cmd->ecx, &cmd->edx);
-}
+ cpuid_count(cmd->regs.eax, cmd->regs.ecx,
+ &cmd->regs.eax, &cmd->regs.ebx,
+ &cmd->regs.ecx, &cmd->regs.edx);
-static loff_t cpuid_seek(struct file *file, loff_t offset, int orig)
-{
- loff_t ret;
- struct inode *inode = file->f_mapping->host;
-
- mutex_lock(&inode->i_mutex);
- switch (orig) {
- case 0:
- file->f_pos = offset;
- ret = file->f_pos;
- break;
- case 1:
- file->f_pos += offset;
- ret = file->f_pos;
- break;
- default:
- ret = -EINVAL;
- }
- mutex_unlock(&inode->i_mutex);
- return ret;
+ complete(&cmd->done);
}
static ssize_t cpuid_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
char __user *tmp = buf;
- struct cpuid_regs cmd;
- int cpu = iminor(file->f_path.dentry->d_inode);
+ struct cpuid_regs_done cmd;
+ int cpu = iminor(file_inode(file));
u64 pos = *ppos;
ssize_t bytes = 0;
int err = 0;
@@ -94,19 +71,27 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
if (count % 16)
return -EINVAL; /* Invalid chunk size */
+ init_completion(&cmd.done);
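+	/*
+	 * The file position encodes the CPUID leaf in its low 32 bits and the
+	 * sub-leaf in its high 32 bits; each 16-byte chunk is the resulting
+	 * eax/ebx/ecx/edx read on the target CPU.
+	 */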
for (; count; count -= 16) {
- cmd.eax = pos;
- cmd.ecx = pos >> 32;
- err = smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1);
+ call_single_data_t csd;
+
+ INIT_CSD(&csd, cpuid_smp_cpuid, &cmd);
+
+ cmd.regs.eax = pos;
+ cmd.regs.ecx = pos >> 32;
+
+ err = smp_call_function_single_async(cpu, &csd);
if (err)
break;
- if (copy_to_user(tmp, &cmd, 16)) {
+ wait_for_completion(&cmd.done);
+ if (copy_to_user(tmp, &cmd.regs, 16)) {
err = -EFAULT;
break;
}
tmp += 16;
bytes += 16;
*ppos = ++pos;
+ reinit_completion(&cmd.done);
}
return bytes ? bytes : err;
@@ -116,21 +101,16 @@ static int cpuid_open(struct inode *inode, struct file *file)
{
unsigned int cpu;
struct cpuinfo_x86 *c;
- int ret = 0;
- lock_kernel();
+ cpu = iminor(file_inode(file));
+ if (cpu >= nr_cpu_ids || !cpu_online(cpu))
+ return -ENXIO; /* No such CPU */
- cpu = iminor(file->f_path.dentry->d_inode);
- if (cpu >= nr_cpu_ids || !cpu_online(cpu)) {
- ret = -ENXIO; /* No such CPU */
- goto out;
- }
c = &cpu_data(cpu);
if (c->cpuid_level < 0)
- ret = -EIO; /* CPUID not supported */
-out:
- unlock_kernel();
- return ret;
+ return -EIO; /* CPUID not supported */
+
+ return 0;
}
/*
@@ -138,106 +118,72 @@ out:
*/
static const struct file_operations cpuid_fops = {
.owner = THIS_MODULE,
- .llseek = cpuid_seek,
+ .llseek = no_seek_end_llseek,
.read = cpuid_read,
.open = cpuid_open,
};
-static __cpuinit int cpuid_device_create(int cpu)
+static char *cpuid_devnode(const struct device *dev, umode_t *mode)
{
- struct device *dev;
-
- dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL,
- "cpu%d", cpu);
- return IS_ERR(dev) ? PTR_ERR(dev) : 0;
+ return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt));
}
-static void cpuid_device_destroy(int cpu)
-{
- device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu));
-}
+static const struct class cpuid_class = {
+ .name = "cpuid",
+ .devnode = cpuid_devnode,
+};
-static int __cpuinit cpuid_class_cpu_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
+static int cpuid_device_create(unsigned int cpu)
{
- unsigned int cpu = (unsigned long)hcpu;
- int err = 0;
+ struct device *dev;
- switch (action) {
- case CPU_UP_PREPARE:
- err = cpuid_device_create(cpu);
- break;
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- case CPU_DEAD:
- cpuid_device_destroy(cpu);
- break;
- }
- return err ? NOTIFY_BAD : NOTIFY_OK;
+ dev = device_create(&cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL,
+ "cpu%d", cpu);
+ return PTR_ERR_OR_ZERO(dev);
}
-static struct notifier_block __refdata cpuid_class_cpu_notifier =
+static int cpuid_device_destroy(unsigned int cpu)
{
- .notifier_call = cpuid_class_cpu_callback,
-};
-
-static char *cpuid_nodename(struct device *dev)
-{
- return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt));
+ device_destroy(&cpuid_class, MKDEV(CPUID_MAJOR, cpu));
+ return 0;
}
static int __init cpuid_init(void)
{
- int i, err = 0;
- i = 0;
+ int err;
- if (register_chrdev(CPUID_MAJOR, "cpu/cpuid", &cpuid_fops)) {
+ if (__register_chrdev(CPUID_MAJOR, 0, NR_CPUS,
+ "cpu/cpuid", &cpuid_fops)) {
printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n",
CPUID_MAJOR);
- err = -EBUSY;
- goto out;
+ return -EBUSY;
}
- cpuid_class = class_create(THIS_MODULE, "cpuid");
- if (IS_ERR(cpuid_class)) {
- err = PTR_ERR(cpuid_class);
+ err = class_register(&cpuid_class);
+ if (err)
goto out_chrdev;
- }
- cpuid_class->nodename = cpuid_nodename;
- for_each_online_cpu(i) {
- err = cpuid_device_create(i);
- if (err != 0)
- goto out_class;
- }
- register_hotcpu_notifier(&cpuid_class_cpu_notifier);
- err = 0;
- goto out;
+ err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/cpuid:online",
+ cpuid_device_create, cpuid_device_destroy);
+ if (err < 0)
+ goto out_class;
+
+ cpuhp_cpuid_state = err;
+ return 0;
out_class:
- i = 0;
- for_each_online_cpu(i) {
- cpuid_device_destroy(i);
- }
- class_destroy(cpuid_class);
+ class_unregister(&cpuid_class);
out_chrdev:
- unregister_chrdev(CPUID_MAJOR, "cpu/cpuid");
-out:
+ __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
return err;
}
+module_init(cpuid_init);
static void __exit cpuid_exit(void)
{
- int cpu = 0;
-
- for_each_online_cpu(cpu)
- cpuid_device_destroy(cpu);
- class_destroy(cpuid_class);
- unregister_chrdev(CPUID_MAJOR, "cpu/cpuid");
- unregister_hotcpu_notifier(&cpuid_class_cpu_notifier);
+ cpuhp_remove_state(cpuhp_cpuid_state);
+ class_unregister(&cpuid_class);
+ __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
}
-
-module_init(cpuid_init);
module_exit(cpuid_exit);
MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>");
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 5e409dc298a4..335fd2ee9766 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -1,13 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Architecture specific (i386/x86_64) functions for kexec based crash dumps.
*
* Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
*
* Copyright (C) IBM Corporation, 2004. All rights reserved.
+ * Copyright (C) Red Hat Inc., 2014. All rights reserved.
+ * Authors:
+ * Vivek Goyal <vgoyal@redhat.com>
*
*/
-#include <linux/init.h>
+#define pr_fmt(fmt) "kexec: " fmt
+
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/smp.h>
@@ -16,60 +21,76 @@
#include <linux/delay.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/memblock.h>
+#include <asm/bootparam.h>
#include <asm/processor.h>
#include <asm/hardirq.h>
#include <asm/nmi.h>
#include <asm/hw_irq.h>
#include <asm/apic.h>
+#include <asm/e820/types.h>
+#include <asm/io_apic.h>
#include <asm/hpet.h>
#include <linux/kdebug.h>
#include <asm/cpu.h>
#include <asm/reboot.h>
-#include <asm/virtext.h>
-#include <asm/iommu.h>
+#include <asm/intel_pt.h>
+#include <asm/crash.h>
+#include <asm/cmdline.h>
+#include <asm/sev.h>
+/* Used while preparing memory map entries for second kernel */
+struct crash_memmap_data {
+ struct boot_params *params;
+ /* Type of memory */
+ unsigned int type;
+};
#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
-static void kdump_nmi_callback(int cpu, struct die_args *args)
+static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
{
- struct pt_regs *regs;
-#ifdef CONFIG_X86_32
- struct pt_regs fixed_regs;
-#endif
-
- regs = args->regs;
-
-#ifdef CONFIG_X86_32
- if (!user_mode_vm(regs)) {
- crash_fixup_ss_esp(&fixed_regs, regs);
- regs = &fixed_regs;
- }
-#endif
crash_save_cpu(regs, cpu);
- /* Disable VMX or SVM if needed.
- *
- * We need to disable virtualization on all CPUs.
- * Having VMX or SVM enabled on any CPU may break rebooting
- * after the kdump kernel has finished its task.
+ /*
+ * Disable Intel PT to stop its logging
*/
- cpu_emergency_vmxoff();
- cpu_emergency_svm_disable();
+ cpu_emergency_stop_pt();
+
+ kdump_sev_callback();
disable_local_APIC();
}
-static void kdump_nmi_shootdown_cpus(void)
+void kdump_nmi_shootdown_cpus(void)
{
nmi_shootdown_cpus(kdump_nmi_callback);
disable_local_APIC();
}
+/* Override the weak function in kernel/panic.c */
+void crash_smp_send_stop(void)
+{
+ static int cpus_stopped;
+
+ if (cpus_stopped)
+ return;
+
+ if (smp_ops.crash_stop_other_cpus)
+ smp_ops.crash_stop_other_cpus();
+ else
+ smp_send_stop();
+
+ cpus_stopped = 1;
+}
+
#else
-static void kdump_nmi_shootdown_cpus(void)
+void crash_smp_send_stop(void)
{
/* There are no cpus to shootdown */
}
@@ -88,26 +109,463 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
/* The kernel is broken so disable interrupts */
local_irq_disable();
- kdump_nmi_shootdown_cpus();
+ crash_smp_send_stop();
- /* Booting kdump kernel with VMX or SVM enabled won't work,
- * because (among other limitations) we can't disable paging
- * with the virt flags.
+ cpu_emergency_disable_virtualization();
+
+ /*
+ * Disable Intel PT to stop its logging
*/
- cpu_emergency_vmxoff();
- cpu_emergency_svm_disable();
+ cpu_emergency_stop_pt();
- lapic_shutdown();
-#if defined(CONFIG_X86_IO_APIC)
- disable_IO_APIC();
+#ifdef CONFIG_X86_IO_APIC
+ /* Prevent crash_kexec() from deadlocking on ioapic_lock. */
+ ioapic_zap_locks();
+ clear_IO_APIC();
#endif
+ lapic_shutdown();
+ restore_boot_irq_mode();
#ifdef CONFIG_HPET_TIMER
hpet_disable();
#endif
-#ifdef CONFIG_X86_64
- pci_iommu_shutdown();
+ /*
+ * Non-crash kexec calls enc_kexec_begin() while scheduling is still
+ * active. This allows the callback to wait until all in-flight
+ * shared<->private conversions are complete. In a crash scenario,
+ * enc_kexec_begin() gets called after all but one CPU have been shut
+ * down and interrupts have been disabled. This allows the callback to
+ * detect a race with the conversion and report it.
+ */
+ x86_platform.guest.enc_kexec_begin();
+ x86_platform.guest.enc_kexec_finish();
+
+ crash_save_cpu(regs, smp_processor_id());
+}
+
+#if defined(CONFIG_KEXEC_FILE) || defined(CONFIG_CRASH_HOTPLUG)
+static int get_nr_ram_ranges_callback(struct resource *res, void *arg)
+{
+ unsigned int *nr_ranges = arg;
+
+ (*nr_ranges)++;
+ return 0;
+}
+
+/* Gather all the required information to prepare elf headers for ram regions */
+static struct crash_mem *fill_up_crash_elf_data(void)
+{
+ unsigned int nr_ranges = 0;
+ struct crash_mem *cmem;
+
+ walk_system_ram_res(0, -1, &nr_ranges, get_nr_ram_ranges_callback);
+ if (!nr_ranges)
+ return NULL;
+
+ /*
+ * Exclusion of crash region, crashk_low_res and/or crashk_cma_ranges
+ * may cause range splits. So add extra slots here.
+ *
+	 * Exclusion of the low 1M may not cause another range split, because
+	 * the excluded range is [0, 1M] and a split only happens when the
+	 * start and end parameters both fall strictly inside an existing
+	 * region in cmem, i.e. neither may equal the region's start or end.
+	 * Obviously, the start of [0, 1M] cannot meet this condition.
+	 *
+	 * But in case the low 1M range is ever changed in the future
+	 * (e.g. to [start, 1M]), add an extra slot.
+ */
+ nr_ranges += 3 + crashk_cma_cnt;
+ cmem = vzalloc(struct_size(cmem, ranges, nr_ranges));
+ if (!cmem)
+ return NULL;
+
+ cmem->max_nr_ranges = nr_ranges;
+
+ return cmem;
+}
+
+/*
+ * Look for any unwanted ranges between mstart and mend and remove them. This
+ * might lead to splits; the resulting split ranges are put in the
+ * cmem->ranges[] array.
+ */
+static int elf_header_exclude_ranges(struct crash_mem *cmem)
+{
+ int ret = 0;
+ int i;
+
+ /* Exclude the low 1M because it is always reserved */
+ ret = crash_exclude_mem_range(cmem, 0, SZ_1M - 1);
+ if (ret)
+ return ret;
+
+ /* Exclude crashkernel region */
+ ret = crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end);
+ if (ret)
+ return ret;
+
+ if (crashk_low_res.end)
+ ret = crash_exclude_mem_range(cmem, crashk_low_res.start,
+ crashk_low_res.end);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < crashk_cma_cnt; ++i) {
+ ret = crash_exclude_mem_range(cmem, crashk_cma_ranges[i].start,
+ crashk_cma_ranges[i].end);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg)
+{
+ struct crash_mem *cmem = arg;
+
+ cmem->ranges[cmem->nr_ranges].start = res->start;
+ cmem->ranges[cmem->nr_ranges].end = res->end;
+ cmem->nr_ranges++;
+
+ return 0;
+}
+
+/* Prepare elf headers. Return addr and size */
+static int prepare_elf_headers(void **addr, unsigned long *sz,
+ unsigned long *nr_mem_ranges)
+{
+ struct crash_mem *cmem;
+ int ret;
+
+ cmem = fill_up_crash_elf_data();
+ if (!cmem)
+ return -ENOMEM;
+
+ ret = walk_system_ram_res(0, -1, cmem, prepare_elf64_ram_headers_callback);
+ if (ret)
+ goto out;
+
+ /* Exclude unwanted mem ranges */
+ ret = elf_header_exclude_ranges(cmem);
+ if (ret)
+ goto out;
+
+ /* Return the computed number of memory ranges, for hotplug usage */
+ *nr_mem_ranges = cmem->nr_ranges;
+
+ /* By default prepare 64bit headers */
+ ret = crash_prepare_elf64_headers(cmem, IS_ENABLED(CONFIG_X86_64), addr, sz);
+
+out:
+ vfree(cmem);
+ return ret;
+}
#endif
- crash_save_cpu(regs, safe_smp_processor_id());
+#ifdef CONFIG_KEXEC_FILE
+static int add_e820_entry(struct boot_params *params, struct e820_entry *entry)
+{
+ unsigned int nr_e820_entries;
+
+ nr_e820_entries = params->e820_entries;
+ if (nr_e820_entries >= E820_MAX_ENTRIES_ZEROPAGE)
+ return 1;
+
+ memcpy(&params->e820_table[nr_e820_entries], entry, sizeof(struct e820_entry));
+ params->e820_entries++;
+ return 0;
+}
+
+static int memmap_entry_callback(struct resource *res, void *arg)
+{
+ struct crash_memmap_data *cmd = arg;
+ struct boot_params *params = cmd->params;
+ struct e820_entry ei;
+
+ ei.addr = res->start;
+ ei.size = resource_size(res);
+ ei.type = cmd->type;
+ add_e820_entry(params, &ei);
+
+ return 0;
}
+
+static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem,
+ unsigned long long mstart,
+ unsigned long long mend)
+{
+ unsigned long start, end;
+ int ret;
+
+ cmem->ranges[0].start = mstart;
+ cmem->ranges[0].end = mend;
+ cmem->nr_ranges = 1;
+
+ /* Exclude elf header region */
+ start = image->elf_load_addr;
+ end = start + image->elf_headers_sz - 1;
+ ret = crash_exclude_mem_range(cmem, start, end);
+
+ if (ret)
+ return ret;
+
+ /* Exclude dm crypt keys region */
+ if (image->dm_crypt_keys_addr) {
+ start = image->dm_crypt_keys_addr;
+ end = start + image->dm_crypt_keys_sz - 1;
+ return crash_exclude_mem_range(cmem, start, end);
+ }
+
+ return ret;
+}
+
+/* Prepare memory map for crash dump kernel */
+int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
+{
+ unsigned int nr_ranges = 0;
+ int i, ret = 0;
+ unsigned long flags;
+ struct e820_entry ei;
+ struct crash_memmap_data cmd;
+ struct crash_mem *cmem;
+
+ /*
+	 * In the current x86 architecture code, the ELF header is always
+	 * allocated at crashk_res.start, but the needed slot count still
+	 * depends on where the header ends up within crashk_res. To avoid a
+	 * potential out-of-bounds access in the future, add an extra slot.
+	 *
+	 * Using a random kexec_buf for passing the dm-crypt keys may cause
+	 * a range split too, so add another extra slot here.
+ */
+ nr_ranges = 3;
+ cmem = vzalloc(struct_size(cmem, ranges, nr_ranges));
+ if (!cmem)
+ return -ENOMEM;
+
+ cmem->max_nr_ranges = nr_ranges;
+
+ memset(&cmd, 0, sizeof(struct crash_memmap_data));
+ cmd.params = params;
+
+ /* Add the low 1M */
+ cmd.type = E820_TYPE_RAM;
+ flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+ walk_iomem_res_desc(IORES_DESC_NONE, flags, 0, (1<<20)-1, &cmd,
+ memmap_entry_callback);
+
+ /* Add ACPI tables */
+ cmd.type = E820_TYPE_ACPI;
+ flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1, &cmd,
+ memmap_entry_callback);
+
+ /* Add ACPI Non-volatile Storage */
+ cmd.type = E820_TYPE_NVS;
+ walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, &cmd,
+ memmap_entry_callback);
+
+ /* Add e820 reserved ranges */
+ cmd.type = E820_TYPE_RESERVED;
+ flags = IORESOURCE_MEM;
+ walk_iomem_res_desc(IORES_DESC_RESERVED, flags, 0, -1, &cmd,
+ memmap_entry_callback);
+
+ /* Add crashk_low_res region */
+ if (crashk_low_res.end) {
+ ei.addr = crashk_low_res.start;
+ ei.size = resource_size(&crashk_low_res);
+ ei.type = E820_TYPE_RAM;
+ add_e820_entry(params, &ei);
+ }
+
+ /* Exclude some ranges from crashk_res and add rest to memmap */
+ ret = memmap_exclude_ranges(image, cmem, crashk_res.start, crashk_res.end);
+ if (ret)
+ goto out;
+
+ for (i = 0; i < cmem->nr_ranges; i++) {
+ ei.size = cmem->ranges[i].end - cmem->ranges[i].start + 1;
+
+ /* If entry is less than a page, skip it */
+ if (ei.size < PAGE_SIZE)
+ continue;
+ ei.addr = cmem->ranges[i].start;
+ ei.type = E820_TYPE_RAM;
+ add_e820_entry(params, &ei);
+ }
+
+ for (i = 0; i < crashk_cma_cnt; ++i) {
+ ei.addr = crashk_cma_ranges[i].start;
+ ei.size = crashk_cma_ranges[i].end -
+ crashk_cma_ranges[i].start + 1;
+ ei.type = E820_TYPE_RAM;
+ add_e820_entry(params, &ei);
+ }
+
+out:
+ vfree(cmem);
+ return ret;
+}
+
+int crash_load_segments(struct kimage *image)
+{
+ int ret;
+ unsigned long pnum = 0;
+ struct kexec_buf kbuf = { .image = image, .buf_min = 0,
+ .buf_max = ULONG_MAX, .top_down = false };
+
+ /* Prepare elf headers and add a segment */
+ ret = prepare_elf_headers(&kbuf.buffer, &kbuf.bufsz, &pnum);
+ if (ret)
+ return ret;
+
+ image->elf_headers = kbuf.buffer;
+ image->elf_headers_sz = kbuf.bufsz;
+ kbuf.memsz = kbuf.bufsz;
+
+#ifdef CONFIG_CRASH_HOTPLUG
+ /*
+ * The elfcorehdr segment size accounts for VMCOREINFO, kernel_map,
+ * maximum CPUs and maximum memory ranges.
+ */
+ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG))
+ pnum = 2 + CONFIG_NR_CPUS_DEFAULT + CONFIG_CRASH_MAX_MEMORY_RANGES;
+ else
+ pnum += 2 + CONFIG_NR_CPUS_DEFAULT;
+
+ if (pnum < (unsigned long)PN_XNUM) {
+ kbuf.memsz = pnum * sizeof(Elf64_Phdr);
+ kbuf.memsz += sizeof(Elf64_Ehdr);
+
+ image->elfcorehdr_index = image->nr_segments;
+
+		/* Mark as usable by the crash kernel, else the crash kernel fails on boot */
+ image->elf_headers_sz = kbuf.memsz;
+ } else {
+ pr_err("number of Phdrs %lu exceeds max\n", pnum);
+ }
+#endif
+
+ kbuf.buf_align = ELF_CORE_HEADER_ALIGN;
+ kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
+ ret = kexec_add_buffer(&kbuf);
+ if (ret)
+ return ret;
+ image->elf_load_addr = kbuf.mem;
+ kexec_dprintk("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ image->elf_load_addr, kbuf.bufsz, kbuf.memsz);
+
+ return ret;
+}
+#endif /* CONFIG_KEXEC_FILE */
+
+#ifdef CONFIG_CRASH_HOTPLUG
+
+#undef pr_fmt
+#define pr_fmt(fmt) "crash hp: " fmt
+
+int arch_crash_hotplug_support(struct kimage *image, unsigned long kexec_flags)
+{
+
+#ifdef CONFIG_KEXEC_FILE
+ if (image->file_mode)
+ return 1;
+#endif
+ /*
+ * Initially, crash hotplug support for kexec_load was added
+ * with the KEXEC_UPDATE_ELFCOREHDR flag. Later, this
+ * functionality was expanded to accommodate multiple kexec
+ * segment updates, leading to the introduction of the
+ * KEXEC_CRASH_HOTPLUG_SUPPORT kexec flag bit. Consequently,
+ * when the kexec tool sends either of these flags, it indicates
+ * that the required kexec segment (elfcorehdr) is excluded from
+ * the SHA calculation.
+ */
+ return (kexec_flags & KEXEC_UPDATE_ELFCOREHDR ||
+ kexec_flags & KEXEC_CRASH_HOTPLUG_SUPPORT);
+}
+
+unsigned int arch_crash_get_elfcorehdr_size(void)
+{
+ unsigned int sz;
+
+ /* kernel_map, VMCOREINFO and maximum CPUs */
+ sz = 2 + CONFIG_NR_CPUS_DEFAULT;
+ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG))
+ sz += CONFIG_CRASH_MAX_MEMORY_RANGES;
+ sz *= sizeof(Elf64_Phdr);
+ return sz;
+}
+
+/**
+ * arch_crash_handle_hotplug_event() - Handle hotplug elfcorehdr changes
+ * @image: a pointer to kexec_crash_image
+ * @arg: struct memory_notify handler for memory hotplug case and
+ * NULL for CPU hotplug case.
+ *
+ * Prepare the new elfcorehdr and replace the existing elfcorehdr.
+ */
+void arch_crash_handle_hotplug_event(struct kimage *image, void *arg)
+{
+ void *elfbuf = NULL, *old_elfcorehdr;
+ unsigned long nr_mem_ranges;
+ unsigned long mem, memsz;
+ unsigned long elfsz = 0;
+
+ /*
+ * As crash_prepare_elf64_headers() has already described all
+ * possible CPUs, there is no need to update the elfcorehdr
+ * for additional CPU changes.
+ */
+ if ((image->file_mode || image->elfcorehdr_updated) &&
+ ((image->hp_action == KEXEC_CRASH_HP_ADD_CPU) ||
+ (image->hp_action == KEXEC_CRASH_HP_REMOVE_CPU)))
+ return;
+
+ /*
+ * Create the new elfcorehdr reflecting the changes to CPU and/or
+ * memory resources.
+ */
+ if (prepare_elf_headers(&elfbuf, &elfsz, &nr_mem_ranges)) {
+ pr_err("unable to create new elfcorehdr");
+ goto out;
+ }
+
+ /*
+ * Obtain address and size of the elfcorehdr segment, and
+ * check it against the new elfcorehdr buffer.
+ */
+ mem = image->segment[image->elfcorehdr_index].mem;
+ memsz = image->segment[image->elfcorehdr_index].memsz;
+ if (elfsz > memsz) {
+ pr_err("update elfcorehdr elfsz %lu > memsz %lu",
+ elfsz, memsz);
+ goto out;
+ }
+
+ /*
+ * Copy new elfcorehdr over the old elfcorehdr at destination.
+ */
+ old_elfcorehdr = kmap_local_page(pfn_to_page(mem >> PAGE_SHIFT));
+ if (!old_elfcorehdr) {
+ pr_err("mapping elfcorehdr segment failed\n");
+ goto out;
+ }
+
+ /*
+ * Temporarily invalidate the crash image while the
+ * elfcorehdr is updated.
+ */
+ xchg(&kexec_crash_image, NULL);
+ memcpy_flushcache(old_elfcorehdr, elfbuf, elfsz);
+ xchg(&kexec_crash_image, image);
+ kunmap_local(old_elfcorehdr);
+ pr_debug("updated elfcorehdr\n");
+
+out:
+ vfree(elfbuf);
+}
+#endif
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index f7cdb3b457aa..5f4ae5476e19 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Memory preserving reboot related code.
*
@@ -5,74 +6,42 @@
* Copyright (C) IBM Corporation, 2004. All rights reserved
*/
+#include <linux/slab.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/crash_dump.h>
+#include <linux/uio.h>
-#include <asm/uaccess.h>
-
-static void *kdump_buf_page;
-
-/* Stores the physical address of elf header of crash image. */
-unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
+static inline bool is_crashed_pfn_valid(unsigned long pfn)
+{
+#ifndef CONFIG_X86_PAE
+ /*
+	 * A non-PAE kdump kernel executed from a PAE one will crop the high
+	 * pte bits and poke unwanted space, counting again from address 0;
+	 * we don't want that. The pte must fit into an unsigned long. In
+	 * fact the test checks that the high 12 bits are zero (the pfn will
+	 * be shifted left by PAGE_SHIFT).
+ */
+ return pte_pfn(pfn_pte(pfn, __pgprot(0))) == pfn;
+#else
+ return true;
+#endif
+}
-/**
- * copy_oldmem_page - copy one page from "oldmem"
- * @pfn: page frame number to be copied
- * @buf: target memory address for the copy; this can be in kernel address
- * space or user address space (see @userbuf)
- * @csize: number of bytes to copy
- * @offset: offset in bytes into the page (based on pfn) to begin the copy
- * @userbuf: if set, @buf is in user address space, use copy_to_user(),
- * otherwise @buf is in kernel address space, use memcpy().
- *
- * Copy a page from "oldmem". For this page, there is no pte mapped
- * in the current kernel. We stitch up a pte, similar to kmap_atomic.
- *
- * Calling copy_to_user() in atomic context is not desirable. Hence first
- * copying the data to a pre-allocated kernel page and then copying to user
- * space in non-atomic context.
- */
-ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
- size_t csize, unsigned long offset, int userbuf)
+ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, size_t csize,
+ unsigned long offset)
{
void *vaddr;
if (!csize)
return 0;
- vaddr = kmap_atomic_pfn(pfn, KM_PTE0);
+ if (!is_crashed_pfn_valid(pfn))
+ return -EFAULT;
- if (!userbuf) {
- memcpy(buf, (vaddr + offset), csize);
- kunmap_atomic(vaddr, KM_PTE0);
- } else {
- if (!kdump_buf_page) {
- printk(KERN_WARNING "Kdump: Kdump buffer page not"
- " allocated\n");
- kunmap_atomic(vaddr, KM_PTE0);
- return -EFAULT;
- }
- copy_page(kdump_buf_page, vaddr);
- kunmap_atomic(vaddr, KM_PTE0);
- if (copy_to_user(buf, (kdump_buf_page + offset), csize))
- return -EFAULT;
- }
+ vaddr = kmap_local_pfn(pfn);
+ csize = copy_to_iter(vaddr + offset, csize, iter);
+ kunmap_local(vaddr);
return csize;
}
-
-static int __init kdump_buf_page_init(void)
-{
- int ret = 0;
-
- kdump_buf_page = kmalloc(PAGE_SIZE, GFP_KERNEL);
- if (!kdump_buf_page) {
- printk(KERN_WARNING "Kdump: Failed to allocate kdump buffer"
- " page\n");
- ret = -ENOMEM;
- }
-
- return ret;
-}
-arch_initcall(kdump_buf_page_init);
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 045b36cada65..32d710f7eb84 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Memory preserving reboot related code.
*
@@ -7,45 +8,57 @@
#include <linux/errno.h>
#include <linux/crash_dump.h>
-#include <linux/uaccess.h>
+#include <linux/uio.h>
#include <linux/io.h>
+#include <linux/cc_platform.h>
-/* Stores the physical address of elf header of crash image. */
-unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
-
-/**
- * copy_oldmem_page - copy one page from "oldmem"
- * @pfn: page frame number to be copied
- * @buf: target memory address for the copy; this can be in kernel address
- * space or user address space (see @userbuf)
- * @csize: number of bytes to copy
- * @offset: offset in bytes into the page (based on pfn) to begin the copy
- * @userbuf: if set, @buf is in user address space, use copy_to_user(),
- * otherwise @buf is in kernel address space, use memcpy().
- *
- * Copy a page from "oldmem". For this page, there is no pte mapped
- * in the current kernel. We stitch up a pte, similar to kmap_atomic.
- */
-ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
- size_t csize, unsigned long offset, int userbuf)
+static ssize_t __copy_oldmem_page(struct iov_iter *iter, unsigned long pfn,
+ size_t csize, unsigned long offset,
+ bool encrypted)
{
void *vaddr;
if (!csize)
return 0;
- vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE);
+ if (encrypted)
+ vaddr = (__force void *)ioremap_encrypted(pfn << PAGE_SHIFT, PAGE_SIZE);
+ else
+ vaddr = (__force void *)ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
+
if (!vaddr)
return -ENOMEM;
- if (userbuf) {
- if (copy_to_user(buf, vaddr + offset, csize)) {
- iounmap(vaddr);
- return -EFAULT;
- }
- } else
- memcpy(buf, vaddr + offset, csize);
+ csize = copy_to_iter(vaddr + offset, csize, iter);
- iounmap(vaddr);
+ iounmap((void __iomem *)vaddr);
return csize;
}
+
+ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, size_t csize,
+ unsigned long offset)
+{
+ return __copy_oldmem_page(iter, pfn, csize, offset, false);
+}
+
+/*
+ * copy_oldmem_page_encrypted - same as copy_oldmem_page() above but ioremap the
+ * memory with the encryption mask set to accommodate kdump on SME-enabled
+ * machines.
+ */
+ssize_t copy_oldmem_page_encrypted(struct iov_iter *iter, unsigned long pfn,
+ size_t csize, unsigned long offset)
+{
+ return __copy_oldmem_page(iter, pfn, csize, offset, true);
+}
+
+ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos)
+{
+ struct kvec kvec = { .iov_base = buf, .iov_len = count };
+ struct iov_iter iter;
+
+ iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count);
+
+ return read_from_oldmem(&iter, count, ppos,
+ cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT));
+}
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
new file mode 100644
index 000000000000..dd8748c45529
--- /dev/null
+++ b/arch/x86/kernel/devicetree.c
@@ -0,0 +1,319 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Architecture specific OF callbacks.
+ */
+#include <linux/acpi.h>
+#include <linux/export.h>
+#include <linux/io.h>
+#include <linux/interrupt.h>
+#include <linux/list.h>
+#include <linux/of.h>
+#include <linux/of_fdt.h>
+#include <linux/of_address.h>
+#include <linux/of_platform.h>
+#include <linux/of_irq.h>
+#include <linux/libfdt.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/of_pci.h>
+#include <linux/initrd.h>
+
+#include <asm/irqdomain.h>
+#include <asm/hpet.h>
+#include <asm/apic.h>
+#include <asm/io_apic.h>
+#include <asm/pci_x86.h>
+#include <asm/setup.h>
+#include <asm/i8259.h>
+#include <asm/numa.h>
+#include <asm/prom.h>
+
+__initdata u64 initial_dtb;
+char __initdata cmd_line[COMMAND_LINE_SIZE];
+
+int __initdata of_ioapic;
+
+void __init add_dtb(u64 data)
+{
+ initial_dtb = data + offsetof(struct setup_data, data);
+}
+
+/*
+ * CE4100 ids. Will be moved to machine_device_initcall() once we have it.
+ */
+static struct of_device_id __initdata ce4100_ids[] = {
+ { .compatible = "intel,ce4100-cp", },
+ { .compatible = "isa", },
+ { .compatible = "pci", },
+ {},
+};
+
+static int __init add_bus_probe(void)
+{
+ if (!of_have_populated_dt())
+ return 0;
+
+ return of_platform_bus_probe(NULL, ce4100_ids, NULL);
+}
+device_initcall(add_bus_probe);
+
+#ifdef CONFIG_PCI
+struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus)
+{
+ struct device_node *np;
+
+ for_each_node_by_type(np, "pci") {
+ const void *prop;
+ unsigned int bus_min;
+
+ prop = of_get_property(np, "bus-range", NULL);
+ if (!prop)
+ continue;
+ bus_min = be32_to_cpup(prop);
+ if (bus->number == bus_min)
+ return np;
+ }
+ return NULL;
+}
+
+static int x86_of_pci_irq_enable(struct pci_dev *dev)
+{
+ u32 virq;
+ int ret;
+ u8 pin;
+
+ ret = pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
+ if (ret)
+ return pcibios_err_to_errno(ret);
+ if (!pin)
+ return 0;
+
+ virq = of_irq_parse_and_map_pci(dev, 0, 0);
+ if (virq == 0)
+ return -EINVAL;
+ dev->irq = virq;
+ return 0;
+}
+
+static void x86_of_pci_irq_disable(struct pci_dev *dev)
+{
+}
+
+void x86_of_pci_init(void)
+{
+ pcibios_enable_irq = x86_of_pci_irq_enable;
+ pcibios_disable_irq = x86_of_pci_irq_disable;
+}
+#endif
+
+static void __init dtb_setup_hpet(void)
+{
+#ifdef CONFIG_HPET_TIMER
+ struct device_node *dn;
+ struct resource r;
+ int ret;
+
+ dn = of_find_compatible_node(NULL, NULL, "intel,ce4100-hpet");
+ if (!dn)
+ return;
+ ret = of_address_to_resource(dn, 0, &r);
+ if (ret) {
+ WARN_ON(1);
+ return;
+ }
+ hpet_address = r.start;
+#endif
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+
+static void __init dtb_cpu_setup(void)
+{
+ struct device_node *dn;
+ u32 apic_id;
+
+ for_each_of_cpu_node(dn) {
+ apic_id = of_get_cpu_hwid(dn, 0);
+ if (apic_id == ~0U) {
+ pr_warn("%pOF: missing local APIC ID\n", dn);
+ continue;
+ }
+ topology_register_apic(apic_id, CPU_ACPIID_INVALID, true);
+ set_apicid_to_node(apic_id, of_node_to_nid(dn));
+ }
+}
+
+static void __init dtb_lapic_setup(void)
+{
+ struct device_node *dn;
+ struct resource r;
+ unsigned long lapic_addr = APIC_DEFAULT_PHYS_BASE;
+ int ret;
+
+ dn = of_find_compatible_node(NULL, NULL, "intel,ce4100-lapic");
+ if (dn) {
+ ret = of_address_to_resource(dn, 0, &r);
+ if (WARN_ON(ret))
+ return;
+ lapic_addr = r.start;
+ }
+
+	/* Did the boot loader set up the local APIC? */
+ if (!boot_cpu_has(X86_FEATURE_APIC)) {
+ /* Try force enabling, which registers the APIC address */
+ if (!apic_force_enable(lapic_addr))
+ return;
+ } else {
+ register_lapic_address(lapic_addr);
+ }
+ smp_found_config = 1;
+ pic_mode = !of_property_read_bool(dn, "intel,virtual-wire-mode");
+ pr_info("%s compatibility mode.\n", pic_mode ? "IMCR and PIC" : "Virtual Wire");
+}
+
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+#ifdef CONFIG_X86_IO_APIC
+static unsigned int ioapic_id;
+
+struct of_ioapic_type {
+ u32 out_type;
+ u32 is_level;
+ u32 active_low;
+};
+
+static struct of_ioapic_type of_ioapic_type[] =
+{
+ {
+ .out_type = IRQ_TYPE_EDGE_FALLING,
+ .is_level = 0,
+ .active_low = 1,
+ },
+ {
+ .out_type = IRQ_TYPE_LEVEL_HIGH,
+ .is_level = 1,
+ .active_low = 0,
+ },
+ {
+ .out_type = IRQ_TYPE_LEVEL_LOW,
+ .is_level = 1,
+ .active_low = 1,
+ },
+ {
+ .out_type = IRQ_TYPE_EDGE_RISING,
+ .is_level = 0,
+ .active_low = 0,
+ },
+};
+
+static int dt_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+ struct irq_fwspec *fwspec = (struct irq_fwspec *)arg;
+ struct of_ioapic_type *it;
+ struct irq_alloc_info tmp;
+ int type_index;
+
+ if (WARN_ON(fwspec->param_count < 2))
+ return -EINVAL;
+
+ type_index = fwspec->param[1];
+ if (type_index >= ARRAY_SIZE(of_ioapic_type))
+ return -EINVAL;
+
+ it = &of_ioapic_type[type_index];
+ ioapic_set_alloc_attr(&tmp, NUMA_NO_NODE, it->is_level, it->active_low);
+ tmp.devid = mpc_ioapic_id(mp_irqdomain_ioapic_idx(domain));
+ tmp.ioapic.pin = fwspec->param[0];
+
+ return mp_irqdomain_alloc(domain, virq, nr_irqs, &tmp);
+}
+
+static const struct irq_domain_ops ioapic_irq_domain_ops = {
+ .alloc = dt_irqdomain_alloc,
+ .free = mp_irqdomain_free,
+ .activate = mp_irqdomain_activate,
+ .deactivate = mp_irqdomain_deactivate,
+};
+
+static void __init dtb_add_ioapic(struct device_node *dn)
+{
+ struct resource r;
+ int ret;
+ struct ioapic_domain_cfg cfg = {
+ .type = IOAPIC_DOMAIN_DYNAMIC,
+ .ops = &ioapic_irq_domain_ops,
+ .dev = dn,
+ };
+
+ ret = of_address_to_resource(dn, 0, &r);
+ if (ret) {
+ pr_err("Can't obtain address from device node %pOF.\n", dn);
+ return;
+ }
+ mp_register_ioapic(++ioapic_id, r.start, gsi_top, &cfg);
+}
+
+static void __init dtb_ioapic_setup(void)
+{
+ struct device_node *dn;
+
+ for_each_compatible_node(dn, NULL, "intel,ce4100-ioapic")
+ dtb_add_ioapic(dn);
+
+ if (nr_ioapics) {
+ of_ioapic = 1;
+ return;
+ }
+ pr_err("Error: No information about IO-APIC in OF.\n");
+}
+#else
+static void __init dtb_ioapic_setup(void) {}
+#endif
+
+static void __init dtb_apic_setup(void)
+{
+#ifdef CONFIG_X86_LOCAL_APIC
+ dtb_lapic_setup();
+ dtb_cpu_setup();
+#endif
+ dtb_ioapic_setup();
+}
+
+static void __init x86_dtb_parse_smp_config(void)
+{
+ if (!of_have_populated_dt())
+ return;
+
+ dtb_setup_hpet();
+ dtb_apic_setup();
+}
+
+void __init x86_flattree_get_config(void)
+{
+#ifdef CONFIG_OF_EARLY_FLATTREE
+ u32 size, map_len;
+ void *dt;
+
+ if (initial_dtb) {
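+		/*
+		 * Map at least the FDT header first so fdt_totalsize() can be
+		 * read; if the blob is larger than this initial window, remap
+		 * the whole thing.
+		 */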
+ map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), (u64)128);
+
+ dt = early_memremap(initial_dtb, map_len);
+ size = fdt_totalsize(dt);
+ if (map_len < size) {
+ early_memunmap(dt, map_len);
+ dt = early_memremap(initial_dtb, size);
+ map_len = size;
+ }
+
+ early_init_dt_verify(dt, __pa(dt));
+ }
+
+ unflatten_and_copy_device_tree();
+
+ if (initial_dtb)
+ early_memunmap(dt, map_len);
+#endif
+ if (acpi_disabled && of_have_populated_dt())
+ x86_init.mpparse.parse_smp_cfg = x86_dtb_parse_smp_config;
+}
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c
index b4f14c6c09d9..6eaf9a6bc02f 100644
--- a/arch/x86/kernel/doublefault_32.c
+++ b/arch/x86/kernel/doublefault_32.c
@@ -1,71 +1,129 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/sched.h>
-#include <linux/init.h>
+#include <linux/sched/debug.h>
#include <linux/init_task.h>
#include <linux/fs.h>
-#include <asm/uaccess.h>
-#include <asm/pgtable.h>
+#include <linux/uaccess.h>
#include <asm/processor.h>
#include <asm/desc.h>
-
-#define DOUBLEFAULT_STACKSIZE (1024)
-static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE];
-#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE)
+#include <asm/traps.h>
+#include <asm/doublefault.h>
#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM)
-static void doublefault_fn(void)
+#define TSS(x) this_cpu_read(cpu_tss_rw.x86_tss.x)
+
+static void set_df_gdt_entry(unsigned int cpu);
+
+/*
+ * Called by double_fault with CR0.TS and EFLAGS.NT cleared. The CPU thinks
+ * we're running the doublefault task. Cannot return.
+ */
+asmlinkage noinstr void __noreturn doublefault_shim(void)
{
- struct desc_ptr gdt_desc = {0, 0};
- unsigned long gdt, tss;
+ unsigned long cr2;
+ struct pt_regs regs;
- store_gdt(&gdt_desc);
- gdt = gdt_desc.address;
+ BUILD_BUG_ON(sizeof(struct doublefault_stack) != PAGE_SIZE);
- printk(KERN_EMERG "PANIC: double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size);
+ cr2 = native_read_cr2();
- if (ptr_ok(gdt)) {
- gdt += GDT_ENTRY_TSS << 3;
- tss = *(u16 *)(gdt+2);
- tss += *(u8 *)(gdt+4) << 16;
- tss += *(u8 *)(gdt+7) << 24;
- printk(KERN_EMERG "double fault, tss at %08lx\n", tss);
+ /* Reset back to the normal kernel task. */
+ force_reload_TR();
+ set_df_gdt_entry(smp_processor_id());
- if (ptr_ok(tss)) {
- struct x86_hw_tss *t = (struct x86_hw_tss *)tss;
+ trace_hardirqs_off();
- printk(KERN_EMERG "eip = %08lx, esp = %08lx\n",
- t->ip, t->sp);
+ /*
+ * Fill in pt_regs. A downside of doing this in C is that the unwinder
+ * won't see it (no ENCODE_FRAME_POINTER), so a nested stack dump
+ * won't successfully unwind to the source of the double fault.
+ * The main dump from exc_double_fault() is fine, though, since it
+ * uses these regs directly.
+ *
+ * If anyone ever cares, this could be moved to asm.
+ */
+ regs.ss = TSS(ss);
+ regs.__ssh = 0;
+ regs.sp = TSS(sp);
+ regs.flags = TSS(flags);
+ regs.cs = TSS(cs);
+ /* We won't go through the entry asm, so we can leave __csh as 0. */
+ regs.__csh = 0;
+ regs.ip = TSS(ip);
+ regs.orig_ax = 0;
+ regs.gs = TSS(gs);
+ regs.__gsh = 0;
+ regs.fs = TSS(fs);
+ regs.__fsh = 0;
+ regs.es = TSS(es);
+ regs.__esh = 0;
+ regs.ds = TSS(ds);
+ regs.__dsh = 0;
+ regs.ax = TSS(ax);
+ regs.bp = TSS(bp);
+ regs.di = TSS(di);
+ regs.si = TSS(si);
+ regs.dx = TSS(dx);
+ regs.cx = TSS(cx);
+ regs.bx = TSS(bx);
- printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n",
- t->ax, t->bx, t->cx, t->dx);
- printk(KERN_EMERG "esi = %08lx, edi = %08lx\n",
- t->si, t->di);
- }
- }
+ exc_double_fault(&regs, 0, cr2);
- for (;;)
- cpu_relax();
+ /*
+ * x86_32 does not save the original CR3 anywhere on a task switch.
+ * This means that, even if we wanted to return, we would need to find
+ * some way to reconstruct CR3. We could make a credible guess based
+ * on cpu_tlbstate, but that would be racy and would not account for
+ * PTI.
+ */
+ panic("cannot return from double fault\n");
}
-struct tss_struct doublefault_tss __cacheline_aligned = {
- .x86_tss = {
- .sp0 = STACK_START,
- .ss0 = __KERNEL_DS,
+DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = {
+ .tss = {
+ /*
+ * No sp0 or ss0 -- we never run CPL != 0 with this TSS
+ * active. sp is filled in later.
+ */
.ldt = 0,
- .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
+ .io_bitmap_base = IO_BITMAP_OFFSET_INVALID,
- .ip = (unsigned long) doublefault_fn,
- /* 0x2 bit is always set */
- .flags = X86_EFLAGS_SF | 0x2,
- .sp = STACK_START,
+ .ip = (unsigned long) asm_exc_double_fault,
+ .flags = X86_EFLAGS_FIXED,
.es = __USER_DS,
.cs = __KERNEL_CS,
.ss = __KERNEL_DS,
.ds = __USER_DS,
.fs = __KERNEL_PERCPU,
+ .gs = 0,
.__cr3 = __pa_nodebug(swapper_pg_dir),
- }
+ },
};
+
+static void set_df_gdt_entry(unsigned int cpu)
+{
+ /* Set up doublefault TSS pointer in the GDT */
+ __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS,
+ &get_cpu_entry_area(cpu)->doublefault_stack.tss);
+
+}
+
+void doublefault_init_cpu_tss(void)
+{
+ unsigned int cpu = smp_processor_id();
+ struct cpu_entry_area *cea = get_cpu_entry_area(cpu);
+
+ /*
+ * The linker isn't smart enough to initialize percpu variables that
+ * point to other places in percpu space.
+ */
+ this_cpu_write(doublefault_stack.tss.sp,
+ (unsigned long)&cea->doublefault_stack.stack +
+ sizeof(doublefault_stack.stack));
+
+ set_df_gdt_entry(cpu);
+}
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
deleted file mode 100644
index 48bfe1386038..000000000000
--- a/arch/x86/kernel/ds.c
+++ /dev/null
@@ -1,1437 +0,0 @@
-/*
- * Debug Store support
- *
- * This provides a low-level interface to the hardware's Debug Store
- * feature that is used for branch trace store (BTS) and
- * precise-event based sampling (PEBS).
- *
- * It manages:
- * - DS and BTS hardware configuration
- * - buffer overflow handling (to be done)
- * - buffer access
- *
- * It does not do:
- * - security checking (is the caller allowed to trace the task)
- * - buffer allocation (memory accounting)
- *
- *
- * Copyright (C) 2007-2009 Intel Corporation.
- * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
- */
-
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/mm.h>
-#include <linux/trace_clock.h>
-
-#include <asm/ds.h>
-
-#include "ds_selftest.h"
-
-/*
- * The configuration for a particular DS hardware implementation:
- */
-struct ds_configuration {
- /* The name of the configuration: */
- const char *name;
-
- /* The size of pointer-typed fields in DS, BTS, and PEBS: */
- unsigned char sizeof_ptr_field;
-
- /* The size of a BTS/PEBS record in bytes: */
- unsigned char sizeof_rec[2];
-
- /* The number of pebs counter reset values in the DS structure. */
- unsigned char nr_counter_reset;
-
- /* Control bit-masks indexed by enum ds_feature: */
- unsigned long ctl[dsf_ctl_max];
-};
-static struct ds_configuration ds_cfg __read_mostly;
-
-
-/* Maximal size of a DS configuration: */
-#define MAX_SIZEOF_DS 0x80
-
-/* Maximal size of a BTS record: */
-#define MAX_SIZEOF_BTS (3 * 8)
-
-/* BTS and PEBS buffer alignment: */
-#define DS_ALIGNMENT (1 << 3)
-
-/* Number of buffer pointers in DS: */
-#define NUM_DS_PTR_FIELDS 8
-
-/* Size of a pebs reset value in DS: */
-#define PEBS_RESET_FIELD_SIZE 8
-
-/* Mask of control bits in the DS MSR register: */
-#define BTS_CONTROL \
- ( ds_cfg.ctl[dsf_bts] | \
- ds_cfg.ctl[dsf_bts_kernel] | \
- ds_cfg.ctl[dsf_bts_user] | \
- ds_cfg.ctl[dsf_bts_overflow] )
-
-/*
- * A BTS or PEBS tracer.
- *
- * This holds the configuration of the tracer and serves as a handle
- * to identify tracers.
- */
-struct ds_tracer {
- /* The DS context (partially) owned by this tracer. */
- struct ds_context *context;
- /* The buffer provided on ds_request() and its size in bytes. */
- void *buffer;
- size_t size;
-};
-
-struct bts_tracer {
- /* The common DS part: */
- struct ds_tracer ds;
-
- /* The trace including the DS configuration: */
- struct bts_trace trace;
-
- /* Buffer overflow notification function: */
- bts_ovfl_callback_t ovfl;
-
- /* Active flags affecting trace collection. */
- unsigned int flags;
-};
-
-struct pebs_tracer {
- /* The common DS part: */
- struct ds_tracer ds;
-
- /* The trace including the DS configuration: */
- struct pebs_trace trace;
-
- /* Buffer overflow notification function: */
- pebs_ovfl_callback_t ovfl;
-};
-
-/*
- * Debug Store (DS) save area configuration (see Intel64 and IA32
- * Architectures Software Developer's Manual, section 18.5)
- *
- * The DS configuration consists of the following fields; different
- * architetures vary in the size of those fields.
- *
- * - double-word aligned base linear address of the BTS buffer
- * - write pointer into the BTS buffer
- * - end linear address of the BTS buffer (one byte beyond the end of
- * the buffer)
- * - interrupt pointer into BTS buffer
- * (interrupt occurs when write pointer passes interrupt pointer)
- * - double-word aligned base linear address of the PEBS buffer
- * - write pointer into the PEBS buffer
- * - end linear address of the PEBS buffer (one byte beyond the end of
- * the buffer)
- * - interrupt pointer into PEBS buffer
- * (interrupt occurs when write pointer passes interrupt pointer)
- * - value to which counter is reset following counter overflow
- *
- * Later architectures use 64bit pointers throughout, whereas earlier
- * architectures use 32bit pointers in 32bit mode.
- *
- *
- * We compute the base address for the first 8 fields based on:
- * - the field size stored in the DS configuration
- * - the relative field position
- * - an offset giving the start of the respective region
- *
- * This offset is further used to index various arrays holding
- * information for BTS and PEBS at the respective index.
- *
- * On later 32bit processors, we only access the lower 32bit of the
- * 64bit pointer fields. The upper halves will be zeroed out.
- */
-
-enum ds_field {
- ds_buffer_base = 0,
- ds_index,
- ds_absolute_maximum,
- ds_interrupt_threshold,
-};
-
-enum ds_qualifier {
- ds_bts = 0,
- ds_pebs
-};
-
-static inline unsigned long
-ds_get(const unsigned char *base, enum ds_qualifier qual, enum ds_field field)
-{
- base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
- return *(unsigned long *)base;
-}
-
-static inline void
-ds_set(unsigned char *base, enum ds_qualifier qual, enum ds_field field,
- unsigned long value)
-{
- base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
- (*(unsigned long *)base) = value;
-}
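-
-/*
- * A worked example of the field addressing above (illustrative only;
- * it assumes a 64-bit layout with ds_cfg.sizeof_ptr_field == 8): the
- * PEBS index field lives at offset 8 * (ds_index + 4 * ds_pebs) ==
- * 8 * 5 == 40 bytes into the DS area, so
- *
- *	index = ds_get(context->ds, ds_pebs, ds_index);
- *
- * reads the 8-byte write pointer stored at context->ds + 40.
- */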
-
-
-/*
- * Locking is done only for allocating BTS or PEBS resources.
- */
-static DEFINE_SPINLOCK(ds_lock);
-
-/*
- * We either support (system-wide) per-cpu or per-thread allocation.
- * We distinguish the two based on the task_struct pointer, where a
- * NULL pointer indicates per-cpu allocation for the current cpu.
- *
- * Allocations are use-counted. As soon as resources are allocated,
- * further allocations must be of the same type (per-cpu or
- * per-thread). We model this by counting allocations (i.e. the number
- * of tracers of a certain type) for one type negatively:
- * =0 no tracers
- * >0 number of per-thread tracers
- * <0 number of per-cpu tracers
- *
- * The tracer count essentially gives the number of ds contexts for a
- * certain type of allocation.
- */
-static atomic_t tracers = ATOMIC_INIT(0);
-
-static inline int get_tracer(struct task_struct *task)
-{
- int error;
-
- spin_lock_irq(&ds_lock);
-
- if (task) {
- error = -EPERM;
- if (atomic_read(&tracers) < 0)
- goto out;
- atomic_inc(&tracers);
- } else {
- error = -EPERM;
- if (atomic_read(&tracers) > 0)
- goto out;
- atomic_dec(&tracers);
- }
-
- error = 0;
-out:
- spin_unlock_irq(&ds_lock);
- return error;
-}
-
-static inline void put_tracer(struct task_struct *task)
-{
- if (task)
- atomic_dec(&tracers);
- else
- atomic_inc(&tracers);
-}
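-
-/*
- * Example of the counting convention above (illustrative): two
- * per-thread tracers leave the count at +2, two per-cpu tracers leave
- * it at -2, and get_tracer() fails with -EPERM as soon as a request
- * would mix the two allocation types.
- */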
-
-/*
- * The DS context is either attached to a thread or to a cpu:
- * - in the former case, the thread_struct contains a pointer to the
- * attached context.
- * - in the latter case, we use a static array of per-cpu context
- * pointers.
- *
- * Contexts are use-counted. They are allocated on first access and
- * deallocated when the last user puts the context.
- */
-struct ds_context {
- /* The DS configuration; goes into MSR_IA32_DS_AREA: */
- unsigned char ds[MAX_SIZEOF_DS];
-
- /* The owner of the BTS and PEBS configuration, respectively: */
- struct bts_tracer *bts_master;
- struct pebs_tracer *pebs_master;
-
- /* Use count: */
- unsigned long count;
-
- /* Pointer to the context pointer field: */
- struct ds_context **this;
-
- /* The traced task; NULL for cpu tracing: */
- struct task_struct *task;
-
- /* The traced cpu; only valid if task is NULL: */
- int cpu;
-};
-
-static DEFINE_PER_CPU(struct ds_context *, cpu_context);
-
-
-static struct ds_context *ds_get_context(struct task_struct *task, int cpu)
-{
- struct ds_context **p_context =
- (task ? &task->thread.ds_ctx : &per_cpu(cpu_context, cpu));
- struct ds_context *context = NULL;
- struct ds_context *new_context = NULL;
-
- /* Chances are small that we already have a context. */
- new_context = kzalloc(sizeof(*new_context), GFP_KERNEL);
- if (!new_context)
- return NULL;
-
- spin_lock_irq(&ds_lock);
-
- context = *p_context;
- if (likely(!context)) {
- context = new_context;
-
- context->this = p_context;
- context->task = task;
- context->cpu = cpu;
- context->count = 0;
-
- *p_context = context;
- }
-
- context->count++;
-
- spin_unlock_irq(&ds_lock);
-
- if (context != new_context)
- kfree(new_context);
-
- return context;
-}
-
-static void ds_put_context(struct ds_context *context)
-{
- struct task_struct *task;
- unsigned long irq;
-
- if (!context)
- return;
-
- spin_lock_irqsave(&ds_lock, irq);
-
- if (--context->count) {
- spin_unlock_irqrestore(&ds_lock, irq);
- return;
- }
-
- *(context->this) = NULL;
-
- task = context->task;
-
- if (task)
- clear_tsk_thread_flag(task, TIF_DS_AREA_MSR);
-
- /*
- * We leave the (now dangling) pointer to the DS configuration in
- * the DS_AREA msr. This is as good or as bad as replacing it with
- * NULL - the hardware would crash if we enabled tracing.
- *
- * This saves us some problems with having to write an msr on a
- * different cpu while preventing others from doing the same for the
- * next context for that same cpu.
- */
-
- spin_unlock_irqrestore(&ds_lock, irq);
-
- /* The context might still be in use for context switching. */
- if (task && (task != current))
- wait_task_context_switch(task);
-
- kfree(context);
-}
-
-static void ds_install_ds_area(struct ds_context *context)
-{
- unsigned long ds;
-
- ds = (unsigned long)context->ds;
-
- /*
- * There is a race between the bts master and the pebs master.
- *
- * The thread/cpu access is synchronized via get/put_cpu() for
- * task tracing and via wrmsr_on_cpu for cpu tracing.
- *
- * If bts and pebs are collected for the same task or same cpu,
- * the same configuration is written twice.
- */
- if (context->task) {
- get_cpu();
- if (context->task == current)
- wrmsrl(MSR_IA32_DS_AREA, ds);
- set_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
- put_cpu();
- } else
- wrmsr_on_cpu(context->cpu, MSR_IA32_DS_AREA,
- (u32)((u64)ds), (u32)((u64)ds >> 32));
-}
-
-/*
- * Call the tracer's callback on a buffer overflow.
- *
- * context: the ds context
- * qual: the buffer type
- */
-static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
-{
- switch (qual) {
- case ds_bts:
- if (context->bts_master &&
- context->bts_master->ovfl)
- context->bts_master->ovfl(context->bts_master);
- break;
- case ds_pebs:
- if (context->pebs_master &&
- context->pebs_master->ovfl)
- context->pebs_master->ovfl(context->pebs_master);
- break;
- }
-}
-
-
-/*
- * Write raw data into the BTS or PEBS buffer.
- *
- * The remainder of any partially written record is zeroed out.
- *
- * context: the DS context
- * qual: the buffer type
- * record: the data to write
- * size: the size of the data
- */
-static int ds_write(struct ds_context *context, enum ds_qualifier qual,
- const void *record, size_t size)
-{
- int bytes_written = 0;
-
- if (!record)
- return -EINVAL;
-
- while (size) {
- unsigned long base, index, end, write_end, int_th;
- unsigned long write_size, adj_write_size;
-
- /*
- * Write as much as possible without producing an
- * overflow interrupt.
- *
- * Interrupt_threshold must either be
- * - bigger than absolute_maximum or
- * - point to a record between buffer_base and absolute_maximum
- *
- * Index points to a valid record.
- */
- base = ds_get(context->ds, qual, ds_buffer_base);
- index = ds_get(context->ds, qual, ds_index);
- end = ds_get(context->ds, qual, ds_absolute_maximum);
- int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
-
- write_end = min(end, int_th);
-
- /*
- * If we are already beyond the interrupt threshold,
- * we fill the entire buffer.
- */
- if (write_end <= index)
- write_end = end;
-
- if (write_end <= index)
- break;
-
- write_size = min((unsigned long) size, write_end - index);
- memcpy((void *)index, record, write_size);
-
- record = (const char *)record + write_size;
- size -= write_size;
- bytes_written += write_size;
-
- adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
- adj_write_size *= ds_cfg.sizeof_rec[qual];
-
- /* Zero out trailing bytes. */
- memset((char *)index + write_size, 0,
- adj_write_size - write_size);
- index += adj_write_size;
-
- if (index >= end)
- index = base;
- ds_set(context->ds, qual, ds_index, index);
-
- if (index >= int_th)
- ds_overflow(context, qual);
- }
-
- return bytes_written;
-}
-
-
-/*
- * Branch Trace Store (BTS) uses the following format. Different
- * architectures vary in the size of those fields.
- * - source linear address
- * - destination linear address
- * - flags
- *
- * Later architectures use 64bit pointers throughout, whereas earlier
- * architectures use 32bit pointers in 32bit mode.
- *
- * We compute the base address for the fields based on:
- * - the field size stored in the DS configuration
- * - the relative field position
- *
- * In order to store additional information in the BTS buffer, we use
- * a special source address to indicate that the record requires
- * special interpretation.
- *
- * Netburst indicated via a bit in the flags field whether the branch
- * was predicted; this is ignored.
- *
- * We use two levels of abstraction:
- * - the raw data level defined here
- * - an arch-independent level defined in ds.h
- */
-
-enum bts_field {
- bts_from,
- bts_to,
- bts_flags,
-
- bts_qual = bts_from,
- bts_clock = bts_to,
- bts_pid = bts_flags,
-
- bts_qual_mask = (bts_qual_max - 1),
- bts_escape = ((unsigned long)-1 & ~bts_qual_mask)
-};
-
-static inline unsigned long bts_get(const char *base, enum bts_field field)
-{
- base += (ds_cfg.sizeof_ptr_field * field);
- return *(unsigned long *)base;
-}
-
-static inline void bts_set(char *base, enum bts_field field, unsigned long val)
-{
-	base += (ds_cfg.sizeof_ptr_field * field);
- (*(unsigned long *)base) = val;
-}
-
-
-/*
- * The raw BTS data is architecture dependent.
- *
- * For higher-level users, we give an arch-independent view.
- * - ds.h defines struct bts_struct
- * - bts_read translates one raw bts record into a bts_struct
- * - bts_write translates one bts_struct into the raw format and
- * writes it into the top of the parameter tracer's buffer.
- *
- * return: bytes read/written on success; -Eerrno, otherwise
- */
-static int
-bts_read(struct bts_tracer *tracer, const void *at, struct bts_struct *out)
-{
- if (!tracer)
- return -EINVAL;
-
- if (at < tracer->trace.ds.begin)
- return -EINVAL;
-
- if (tracer->trace.ds.end < (at + tracer->trace.ds.size))
- return -EINVAL;
-
- memset(out, 0, sizeof(*out));
- if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) {
- out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask);
- out->variant.event.clock = bts_get(at, bts_clock);
- out->variant.event.pid = bts_get(at, bts_pid);
- } else {
- out->qualifier = bts_branch;
- out->variant.lbr.from = bts_get(at, bts_from);
- out->variant.lbr.to = bts_get(at, bts_to);
-
- if (!out->variant.lbr.from && !out->variant.lbr.to)
- out->qualifier = bts_invalid;
- }
-
- return ds_cfg.sizeof_rec[ds_bts];
-}
-
-static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in)
-{
- unsigned char raw[MAX_SIZEOF_BTS];
-
- if (!tracer)
- return -EINVAL;
-
- if (MAX_SIZEOF_BTS < ds_cfg.sizeof_rec[ds_bts])
- return -EOVERFLOW;
-
- switch (in->qualifier) {
- case bts_invalid:
- bts_set(raw, bts_from, 0);
- bts_set(raw, bts_to, 0);
- bts_set(raw, bts_flags, 0);
- break;
- case bts_branch:
- bts_set(raw, bts_from, in->variant.lbr.from);
- bts_set(raw, bts_to, in->variant.lbr.to);
- bts_set(raw, bts_flags, 0);
- break;
- case bts_task_arrives:
- case bts_task_departs:
- bts_set(raw, bts_qual, (bts_escape | in->qualifier));
- bts_set(raw, bts_clock, in->variant.event.clock);
- bts_set(raw, bts_pid, in->variant.event.pid);
- break;
- default:
- return -EINVAL;
- }
-
- return ds_write(tracer->ds.context, ds_bts, raw,
- ds_cfg.sizeof_rec[ds_bts]);
-}
-
-
-static void ds_write_config(struct ds_context *context,
- struct ds_trace *cfg, enum ds_qualifier qual)
-{
- unsigned char *ds = context->ds;
-
- ds_set(ds, qual, ds_buffer_base, (unsigned long)cfg->begin);
- ds_set(ds, qual, ds_index, (unsigned long)cfg->top);
- ds_set(ds, qual, ds_absolute_maximum, (unsigned long)cfg->end);
- ds_set(ds, qual, ds_interrupt_threshold, (unsigned long)cfg->ith);
-}
-
-static void ds_read_config(struct ds_context *context,
- struct ds_trace *cfg, enum ds_qualifier qual)
-{
- unsigned char *ds = context->ds;
-
- cfg->begin = (void *)ds_get(ds, qual, ds_buffer_base);
- cfg->top = (void *)ds_get(ds, qual, ds_index);
- cfg->end = (void *)ds_get(ds, qual, ds_absolute_maximum);
- cfg->ith = (void *)ds_get(ds, qual, ds_interrupt_threshold);
-}
-
-static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
- void *base, size_t size, size_t ith,
-			     unsigned int flags)
-{
- unsigned long buffer, adj;
-
- /*
- * Adjust the buffer address and size to meet alignment
- * constraints:
- * - buffer is double-word aligned
- * - size is multiple of record size
- *
- * We checked the size at the very beginning; we have enough
- * space to do the adjustment.
- */
- buffer = (unsigned long)base;
-
- adj = ALIGN(buffer, DS_ALIGNMENT) - buffer;
- buffer += adj;
- size -= adj;
-
- trace->n = size / ds_cfg.sizeof_rec[qual];
- trace->size = ds_cfg.sizeof_rec[qual];
-
- size = (trace->n * trace->size);
-
- trace->begin = (void *)buffer;
- trace->top = trace->begin;
- trace->end = (void *)(buffer + size);
- /*
- * The value for 'no threshold' is -1, which will set the
- * threshold outside of the buffer, just like we want it.
- */
- ith *= ds_cfg.sizeof_rec[qual];
- trace->ith = (void *)(buffer + size - ith);
-
- trace->flags = flags;
-}
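-
-/*
- * Worked example for the sizing above (illustrative, assuming a
- * 24-byte BTS record): an 8-byte-aligned 521-byte buffer gives
- * adj == 0, n == 521 / 24 == 21 records and an effective size of
- * 21 * 24 == 504 bytes; with ith == (size_t)-1 the computed threshold
- * lands beyond trace->end, so the interrupt threshold is effectively
- * disabled.
- */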
-
-
-static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
- enum ds_qualifier qual, struct task_struct *task,
- int cpu, void *base, size_t size, size_t th)
-{
- struct ds_context *context;
- int error;
- size_t req_size;
-
- error = -EOPNOTSUPP;
- if (!ds_cfg.sizeof_rec[qual])
- goto out;
-
- error = -EINVAL;
- if (!base)
- goto out;
-
- req_size = ds_cfg.sizeof_rec[qual];
- /* We might need space for alignment adjustments. */
- if (!IS_ALIGNED((unsigned long)base, DS_ALIGNMENT))
- req_size += DS_ALIGNMENT;
-
- error = -EINVAL;
- if (size < req_size)
- goto out;
-
- if (th != (size_t)-1) {
- th *= ds_cfg.sizeof_rec[qual];
-
- error = -EINVAL;
- if (size <= th)
- goto out;
- }
-
- tracer->buffer = base;
- tracer->size = size;
-
- error = -ENOMEM;
- context = ds_get_context(task, cpu);
- if (!context)
- goto out;
- tracer->context = context;
-
- /*
- * Defer any tracer-specific initialization work for the context until
- * context ownership has been clarified.
- */
-
- error = 0;
- out:
- return error;
-}
-
-static struct bts_tracer *ds_request_bts(struct task_struct *task, int cpu,
- void *base, size_t size,
- bts_ovfl_callback_t ovfl, size_t th,
- unsigned int flags)
-{
- struct bts_tracer *tracer;
- int error;
-
- /* Buffer overflow notification is not yet implemented. */
- error = -EOPNOTSUPP;
- if (ovfl)
- goto out;
-
- error = get_tracer(task);
- if (error < 0)
- goto out;
-
- error = -ENOMEM;
- tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
- if (!tracer)
- goto out_put_tracer;
- tracer->ovfl = ovfl;
-
- /* Do some more error checking and acquire a tracing context. */
- error = ds_request(&tracer->ds, &tracer->trace.ds,
- ds_bts, task, cpu, base, size, th);
- if (error < 0)
- goto out_tracer;
-
- /* Claim the bts part of the tracing context we acquired above. */
- spin_lock_irq(&ds_lock);
-
- error = -EPERM;
- if (tracer->ds.context->bts_master)
- goto out_unlock;
- tracer->ds.context->bts_master = tracer;
-
- spin_unlock_irq(&ds_lock);
-
- /*
- * Now that we own the bts part of the context, let's complete the
- * initialization for that part.
- */
- ds_init_ds_trace(&tracer->trace.ds, ds_bts, base, size, th, flags);
- ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
- ds_install_ds_area(tracer->ds.context);
-
- tracer->trace.read = bts_read;
- tracer->trace.write = bts_write;
-
- /* Start tracing. */
- ds_resume_bts(tracer);
-
- return tracer;
-
- out_unlock:
- spin_unlock_irq(&ds_lock);
- ds_put_context(tracer->ds.context);
- out_tracer:
- kfree(tracer);
- out_put_tracer:
- put_tracer(task);
- out:
- return ERR_PTR(error);
-}
-
-struct bts_tracer *ds_request_bts_task(struct task_struct *task,
- void *base, size_t size,
- bts_ovfl_callback_t ovfl,
- size_t th, unsigned int flags)
-{
- return ds_request_bts(task, 0, base, size, ovfl, th, flags);
-}
-
-struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size,
- bts_ovfl_callback_t ovfl,
- size_t th, unsigned int flags)
-{
- return ds_request_bts(NULL, cpu, base, size, ovfl, th, flags);
-}
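-
-/*
- * Minimal usage sketch for the BTS request API above, paired with
- * ds_release_bts() further below (illustrative only; error handling
- * trimmed, buffer and size provided by the caller):
- *
- *	struct bts_tracer *tracer;
- *
- *	tracer = ds_request_bts_task(current, buffer, size, NULL,
- *				     (size_t)-1, BTS_KERNEL);
- *	if (!IS_ERR(tracer)) {
- *		... collect and read trace via ds_read_bts() ...
- *		ds_release_bts(tracer);
- *	}
- */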
-
-static struct pebs_tracer *ds_request_pebs(struct task_struct *task, int cpu,
- void *base, size_t size,
- pebs_ovfl_callback_t ovfl, size_t th,
- unsigned int flags)
-{
- struct pebs_tracer *tracer;
- int error;
-
- /* Buffer overflow notification is not yet implemented. */
- error = -EOPNOTSUPP;
- if (ovfl)
- goto out;
-
- error = get_tracer(task);
- if (error < 0)
- goto out;
-
- error = -ENOMEM;
- tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
- if (!tracer)
- goto out_put_tracer;
- tracer->ovfl = ovfl;
-
- /* Do some more error checking and acquire a tracing context. */
- error = ds_request(&tracer->ds, &tracer->trace.ds,
- ds_pebs, task, cpu, base, size, th);
- if (error < 0)
- goto out_tracer;
-
- /* Claim the pebs part of the tracing context we acquired above. */
- spin_lock_irq(&ds_lock);
-
- error = -EPERM;
- if (tracer->ds.context->pebs_master)
- goto out_unlock;
- tracer->ds.context->pebs_master = tracer;
-
- spin_unlock_irq(&ds_lock);
-
- /*
- * Now that we own the pebs part of the context, let's complete the
- * initialization for that part.
- */
- ds_init_ds_trace(&tracer->trace.ds, ds_pebs, base, size, th, flags);
- ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
- ds_install_ds_area(tracer->ds.context);
-
- /* Start tracing. */
- ds_resume_pebs(tracer);
-
- return tracer;
-
- out_unlock:
- spin_unlock_irq(&ds_lock);
- ds_put_context(tracer->ds.context);
- out_tracer:
- kfree(tracer);
- out_put_tracer:
- put_tracer(task);
- out:
- return ERR_PTR(error);
-}
-
-struct pebs_tracer *ds_request_pebs_task(struct task_struct *task,
- void *base, size_t size,
- pebs_ovfl_callback_t ovfl,
- size_t th, unsigned int flags)
-{
- return ds_request_pebs(task, 0, base, size, ovfl, th, flags);
-}
-
-struct pebs_tracer *ds_request_pebs_cpu(int cpu, void *base, size_t size,
- pebs_ovfl_callback_t ovfl,
- size_t th, unsigned int flags)
-{
- return ds_request_pebs(NULL, cpu, base, size, ovfl, th, flags);
-}
-
-static void ds_free_bts(struct bts_tracer *tracer)
-{
- struct task_struct *task;
-
- task = tracer->ds.context->task;
-
- WARN_ON_ONCE(tracer->ds.context->bts_master != tracer);
- tracer->ds.context->bts_master = NULL;
-
- /* Make sure tracing stopped and the tracer is not in use. */
- if (task && (task != current))
- wait_task_context_switch(task);
-
- ds_put_context(tracer->ds.context);
- put_tracer(task);
-
- kfree(tracer);
-}
-
-void ds_release_bts(struct bts_tracer *tracer)
-{
- might_sleep();
-
- if (!tracer)
- return;
-
- ds_suspend_bts(tracer);
- ds_free_bts(tracer);
-}
-
-int ds_release_bts_noirq(struct bts_tracer *tracer)
-{
- struct task_struct *task;
- unsigned long irq;
- int error;
-
- if (!tracer)
- return 0;
-
- task = tracer->ds.context->task;
-
- local_irq_save(irq);
-
- error = -EPERM;
- if (!task &&
- (tracer->ds.context->cpu != smp_processor_id()))
- goto out;
-
- error = -EPERM;
- if (task && (task != current))
- goto out;
-
- ds_suspend_bts_noirq(tracer);
- ds_free_bts(tracer);
-
- error = 0;
- out:
- local_irq_restore(irq);
- return error;
-}
-
-static void update_task_debugctlmsr(struct task_struct *task,
- unsigned long debugctlmsr)
-{
- task->thread.debugctlmsr = debugctlmsr;
-
- get_cpu();
- if (task == current)
- update_debugctlmsr(debugctlmsr);
- put_cpu();
-}
-
-void ds_suspend_bts(struct bts_tracer *tracer)
-{
- struct task_struct *task;
- unsigned long debugctlmsr;
- int cpu;
-
- if (!tracer)
- return;
-
- tracer->flags = 0;
-
- task = tracer->ds.context->task;
- cpu = tracer->ds.context->cpu;
-
- WARN_ON(!task && irqs_disabled());
-
- debugctlmsr = (task ?
- task->thread.debugctlmsr :
- get_debugctlmsr_on_cpu(cpu));
- debugctlmsr &= ~BTS_CONTROL;
-
- if (task)
- update_task_debugctlmsr(task, debugctlmsr);
- else
- update_debugctlmsr_on_cpu(cpu, debugctlmsr);
-}
-
-int ds_suspend_bts_noirq(struct bts_tracer *tracer)
-{
- struct task_struct *task;
- unsigned long debugctlmsr, irq;
- int cpu, error = 0;
-
- if (!tracer)
- return 0;
-
- tracer->flags = 0;
-
- task = tracer->ds.context->task;
- cpu = tracer->ds.context->cpu;
-
- local_irq_save(irq);
-
- error = -EPERM;
- if (!task && (cpu != smp_processor_id()))
- goto out;
-
- debugctlmsr = (task ?
- task->thread.debugctlmsr :
- get_debugctlmsr());
- debugctlmsr &= ~BTS_CONTROL;
-
- if (task)
- update_task_debugctlmsr(task, debugctlmsr);
- else
- update_debugctlmsr(debugctlmsr);
-
- error = 0;
- out:
- local_irq_restore(irq);
- return error;
-}
-
-static unsigned long ds_bts_control(struct bts_tracer *tracer)
-{
- unsigned long control;
-
- control = ds_cfg.ctl[dsf_bts];
- if (!(tracer->trace.ds.flags & BTS_KERNEL))
- control |= ds_cfg.ctl[dsf_bts_kernel];
- if (!(tracer->trace.ds.flags & BTS_USER))
- control |= ds_cfg.ctl[dsf_bts_user];
-
- return control;
-}
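-
-/*
- * Illustrative example using the Core 2/Atom configuration defined
- * below: with both BTS_KERNEL and BTS_USER requested, only
- * ds_cfg.ctl[dsf_bts] ((1 << 6) | (1 << 7)) ends up in DEBUGCTL;
- * dropping BTS_USER additionally sets bit 10, which suppresses
- * user-mode branch records.
- */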
-
-void ds_resume_bts(struct bts_tracer *tracer)
-{
- struct task_struct *task;
- unsigned long debugctlmsr;
- int cpu;
-
- if (!tracer)
- return;
-
- tracer->flags = tracer->trace.ds.flags;
-
- task = tracer->ds.context->task;
- cpu = tracer->ds.context->cpu;
-
- WARN_ON(!task && irqs_disabled());
-
- debugctlmsr = (task ?
- task->thread.debugctlmsr :
- get_debugctlmsr_on_cpu(cpu));
- debugctlmsr |= ds_bts_control(tracer);
-
- if (task)
- update_task_debugctlmsr(task, debugctlmsr);
- else
- update_debugctlmsr_on_cpu(cpu, debugctlmsr);
-}
-
-int ds_resume_bts_noirq(struct bts_tracer *tracer)
-{
- struct task_struct *task;
- unsigned long debugctlmsr, irq;
- int cpu, error = 0;
-
- if (!tracer)
- return 0;
-
- tracer->flags = tracer->trace.ds.flags;
-
- task = tracer->ds.context->task;
- cpu = tracer->ds.context->cpu;
-
- local_irq_save(irq);
-
- error = -EPERM;
- if (!task && (cpu != smp_processor_id()))
- goto out;
-
- debugctlmsr = (task ?
- task->thread.debugctlmsr :
- get_debugctlmsr());
- debugctlmsr |= ds_bts_control(tracer);
-
- if (task)
- update_task_debugctlmsr(task, debugctlmsr);
- else
- update_debugctlmsr(debugctlmsr);
-
- error = 0;
- out:
- local_irq_restore(irq);
- return error;
-}
-
-static void ds_free_pebs(struct pebs_tracer *tracer)
-{
- struct task_struct *task;
-
- task = tracer->ds.context->task;
-
- WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer);
- tracer->ds.context->pebs_master = NULL;
-
- ds_put_context(tracer->ds.context);
- put_tracer(task);
-
- kfree(tracer);
-}
-
-void ds_release_pebs(struct pebs_tracer *tracer)
-{
- might_sleep();
-
- if (!tracer)
- return;
-
- ds_suspend_pebs(tracer);
- ds_free_pebs(tracer);
-}
-
-int ds_release_pebs_noirq(struct pebs_tracer *tracer)
-{
- struct task_struct *task;
- unsigned long irq;
- int error;
-
- if (!tracer)
- return 0;
-
- task = tracer->ds.context->task;
-
- local_irq_save(irq);
-
- error = -EPERM;
- if (!task &&
- (tracer->ds.context->cpu != smp_processor_id()))
- goto out;
-
- error = -EPERM;
- if (task && (task != current))
- goto out;
-
- ds_suspend_pebs_noirq(tracer);
- ds_free_pebs(tracer);
-
- error = 0;
- out:
- local_irq_restore(irq);
- return error;
-}
-
-void ds_suspend_pebs(struct pebs_tracer *tracer)
-{
-
-}
-
-int ds_suspend_pebs_noirq(struct pebs_tracer *tracer)
-{
- return 0;
-}
-
-void ds_resume_pebs(struct pebs_tracer *tracer)
-{
-
-}
-
-int ds_resume_pebs_noirq(struct pebs_tracer *tracer)
-{
- return 0;
-}
-
-const struct bts_trace *ds_read_bts(struct bts_tracer *tracer)
-{
- if (!tracer)
- return NULL;
-
- ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
- return &tracer->trace;
-}
-
-const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer)
-{
- if (!tracer)
- return NULL;
-
- ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
-
- tracer->trace.counters = ds_cfg.nr_counter_reset;
- memcpy(tracer->trace.counter_reset,
- tracer->ds.context->ds +
- (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field),
- ds_cfg.nr_counter_reset * PEBS_RESET_FIELD_SIZE);
-
- return &tracer->trace;
-}
-
-int ds_reset_bts(struct bts_tracer *tracer)
-{
- if (!tracer)
- return -EINVAL;
-
- tracer->trace.ds.top = tracer->trace.ds.begin;
-
- ds_set(tracer->ds.context->ds, ds_bts, ds_index,
- (unsigned long)tracer->trace.ds.top);
-
- return 0;
-}
-
-int ds_reset_pebs(struct pebs_tracer *tracer)
-{
- if (!tracer)
- return -EINVAL;
-
- tracer->trace.ds.top = tracer->trace.ds.begin;
-
- ds_set(tracer->ds.context->ds, ds_pebs, ds_index,
- (unsigned long)tracer->trace.ds.top);
-
- return 0;
-}
-
-int ds_set_pebs_reset(struct pebs_tracer *tracer,
- unsigned int counter, u64 value)
-{
- if (!tracer)
- return -EINVAL;
-
- if (ds_cfg.nr_counter_reset < counter)
- return -EINVAL;
-
- *(u64 *)(tracer->ds.context->ds +
- (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field) +
- (counter * PEBS_RESET_FIELD_SIZE)) = value;
-
- return 0;
-}
-
-static const struct ds_configuration ds_cfg_netburst = {
- .name = "Netburst",
- .ctl[dsf_bts] = (1 << 2) | (1 << 3),
- .ctl[dsf_bts_kernel] = (1 << 5),
- .ctl[dsf_bts_user] = (1 << 6),
- .nr_counter_reset = 1,
-};
-static const struct ds_configuration ds_cfg_pentium_m = {
- .name = "Pentium M",
- .ctl[dsf_bts] = (1 << 6) | (1 << 7),
- .nr_counter_reset = 1,
-};
-static const struct ds_configuration ds_cfg_core2_atom = {
- .name = "Core 2/Atom",
- .ctl[dsf_bts] = (1 << 6) | (1 << 7),
- .ctl[dsf_bts_kernel] = (1 << 9),
- .ctl[dsf_bts_user] = (1 << 10),
- .nr_counter_reset = 1,
-};
-static const struct ds_configuration ds_cfg_core_i7 = {
- .name = "Core i7",
- .ctl[dsf_bts] = (1 << 6) | (1 << 7),
- .ctl[dsf_bts_kernel] = (1 << 9),
- .ctl[dsf_bts_user] = (1 << 10),
- .nr_counter_reset = 4,
-};
-
-static void
-ds_configure(const struct ds_configuration *cfg,
- struct cpuinfo_x86 *cpu)
-{
- unsigned long nr_pebs_fields = 0;
-
- printk(KERN_INFO "[ds] using %s configuration\n", cfg->name);
-
-#ifdef __i386__
- nr_pebs_fields = 10;
-#else
- nr_pebs_fields = 18;
-#endif
-
- /*
- * Starting with version 2, architectural performance
- * monitoring supports a format specifier.
- */
- if ((cpuid_eax(0xa) & 0xff) > 1) {
- unsigned long perf_capabilities, format;
-
- rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_capabilities);
-
- format = (perf_capabilities >> 8) & 0xf;
-
- switch (format) {
- case 0:
- nr_pebs_fields = 18;
- break;
- case 1:
- nr_pebs_fields = 22;
- break;
- default:
- printk(KERN_INFO
- "[ds] unknown PEBS format: %lu\n", format);
- nr_pebs_fields = 0;
- break;
- }
- }
-
- memset(&ds_cfg, 0, sizeof(ds_cfg));
- ds_cfg = *cfg;
-
- ds_cfg.sizeof_ptr_field =
- (cpu_has(cpu, X86_FEATURE_DTES64) ? 8 : 4);
-
- ds_cfg.sizeof_rec[ds_bts] = ds_cfg.sizeof_ptr_field * 3;
- ds_cfg.sizeof_rec[ds_pebs] = ds_cfg.sizeof_ptr_field * nr_pebs_fields;
-
- if (!cpu_has(cpu, X86_FEATURE_BTS)) {
- ds_cfg.sizeof_rec[ds_bts] = 0;
- printk(KERN_INFO "[ds] bts not available\n");
- }
- if (!cpu_has(cpu, X86_FEATURE_PEBS)) {
- ds_cfg.sizeof_rec[ds_pebs] = 0;
- printk(KERN_INFO "[ds] pebs not available\n");
- }
-
- printk(KERN_INFO "[ds] sizes: address: %u bit, ",
- 8 * ds_cfg.sizeof_ptr_field);
- printk("bts/pebs record: %u/%u bytes\n",
- ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]);
-
- WARN_ON_ONCE(MAX_PEBS_COUNTERS < ds_cfg.nr_counter_reset);
-}
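-
-/*
- * Example of the resulting record sizes (illustrative, assuming a
- * DTES64-capable cpu and PEBS record format 0): sizeof_ptr_field == 8,
- * so a BTS record is 3 * 8 == 24 bytes and a PEBS record is
- * 18 * 8 == 144 bytes.
- */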
-
-void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
-{
- /* Only configure the first cpu. Others are identical. */
- if (ds_cfg.name)
- return;
-
- switch (c->x86) {
- case 0x6:
- switch (c->x86_model) {
- case 0x9:
- case 0xd: /* Pentium M */
- ds_configure(&ds_cfg_pentium_m, c);
- break;
- case 0xf:
- case 0x17: /* Core2 */
- case 0x1c: /* Atom */
- ds_configure(&ds_cfg_core2_atom, c);
- break;
- case 0x1a: /* Core i7 */
- ds_configure(&ds_cfg_core_i7, c);
- break;
- default:
- /* Sorry, don't know about them. */
- break;
- }
- break;
- case 0xf:
- switch (c->x86_model) {
- case 0x0:
- case 0x1:
- case 0x2: /* Netburst */
- ds_configure(&ds_cfg_netburst, c);
- break;
- default:
- /* Sorry, don't know about them. */
- break;
- }
- break;
- default:
- /* Sorry, don't know about them. */
- break;
- }
-}
-
-static inline void ds_take_timestamp(struct ds_context *context,
- enum bts_qualifier qualifier,
- struct task_struct *task)
-{
- struct bts_tracer *tracer = context->bts_master;
- struct bts_struct ts;
-
- /* Prevent compilers from reading the tracer pointer twice. */
- barrier();
-
- if (!tracer || !(tracer->flags & BTS_TIMESTAMPS))
- return;
-
- memset(&ts, 0, sizeof(ts));
- ts.qualifier = qualifier;
- ts.variant.event.clock = trace_clock_global();
- ts.variant.event.pid = task->pid;
-
- bts_write(tracer, &ts);
-}
-
-/*
- * Change the DS configuration from tracing prev to tracing next.
- */
-void ds_switch_to(struct task_struct *prev, struct task_struct *next)
-{
- struct ds_context *prev_ctx = prev->thread.ds_ctx;
- struct ds_context *next_ctx = next->thread.ds_ctx;
- unsigned long debugctlmsr = next->thread.debugctlmsr;
-
- /* Make sure all data is read before we start. */
- barrier();
-
- if (prev_ctx) {
- update_debugctlmsr(0);
-
- ds_take_timestamp(prev_ctx, bts_task_departs, prev);
- }
-
- if (next_ctx) {
- ds_take_timestamp(next_ctx, bts_task_arrives, next);
-
- wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds);
- }
-
- update_debugctlmsr(debugctlmsr);
-}
-
-static __init int ds_selftest(void)
-{
- if (ds_cfg.sizeof_rec[ds_bts]) {
- int error;
-
- error = ds_selftest_bts();
- if (error) {
- WARN(1, "[ds] selftest failed. disabling bts.\n");
- ds_cfg.sizeof_rec[ds_bts] = 0;
- }
- }
-
- if (ds_cfg.sizeof_rec[ds_pebs]) {
- int error;
-
- error = ds_selftest_pebs();
- if (error) {
- WARN(1, "[ds] selftest failed. disabling pebs.\n");
- ds_cfg.sizeof_rec[ds_pebs] = 0;
- }
- }
-
- return 0;
-}
-device_initcall(ds_selftest);
diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c
deleted file mode 100644
index 6bc7c199ab99..000000000000
--- a/arch/x86/kernel/ds_selftest.c
+++ /dev/null
@@ -1,408 +0,0 @@
-/*
- * Debug Store support - selftest
- *
- *
- * Copyright (C) 2009 Intel Corporation.
- * Markus Metzger <markus.t.metzger@intel.com>, 2009
- */
-
-#include "ds_selftest.h"
-
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/smp.h>
-#include <linux/cpu.h>
-
-#include <asm/ds.h>
-
-
-#define BUFFER_SIZE		521	/* Intentionally chosen odd size. */
-#define SMALL_BUFFER_SIZE 24 /* A single bts entry. */
-
-struct ds_selftest_bts_conf {
- struct bts_tracer *tracer;
- int error;
- int (*suspend)(struct bts_tracer *);
- int (*resume)(struct bts_tracer *);
-};
-
-static int ds_selftest_bts_consistency(const struct bts_trace *trace)
-{
- int error = 0;
-
- if (!trace) {
- printk(KERN_CONT "failed to access trace...");
- /* Bail out. Other tests are pointless. */
- return -1;
- }
-
- if (!trace->read) {
- printk(KERN_CONT "bts read not available...");
- error = -1;
- }
-
- /* Do some sanity checks on the trace configuration. */
- if (!trace->ds.n) {
- printk(KERN_CONT "empty bts buffer...");
- error = -1;
- }
- if (!trace->ds.size) {
- printk(KERN_CONT "bad bts trace setup...");
- error = -1;
- }
- if (trace->ds.end !=
- (char *)trace->ds.begin + (trace->ds.n * trace->ds.size)) {
- printk(KERN_CONT "bad bts buffer setup...");
- error = -1;
- }
- /*
-	 * We allow top in [begin; end], since it's not clear when the
- * overflow adjustment happens: after the increment or before the
- * write.
- */
- if ((trace->ds.top < trace->ds.begin) ||
- (trace->ds.end < trace->ds.top)) {
- printk(KERN_CONT "bts top out of bounds...");
- error = -1;
- }
-
- return error;
-}
-
-static int ds_selftest_bts_read(struct bts_tracer *tracer,
- const struct bts_trace *trace,
- const void *from, const void *to)
-{
- const unsigned char *at;
-
- /*
- * Check a few things which do not belong to this test.
- * They should be covered by other tests.
- */
- if (!trace)
- return -1;
-
- if (!trace->read)
- return -1;
-
- if (to < from)
- return -1;
-
- if (from < trace->ds.begin)
- return -1;
-
- if (trace->ds.end < to)
- return -1;
-
- if (!trace->ds.size)
- return -1;
-
- /* Now to the test itself. */
- for (at = from; (void *)at < to; at += trace->ds.size) {
- struct bts_struct bts;
- unsigned long index;
- int error;
-
- if (((void *)at - trace->ds.begin) % trace->ds.size) {
- printk(KERN_CONT
- "read from non-integer index...");
- return -1;
- }
- index = ((void *)at - trace->ds.begin) / trace->ds.size;
-
- memset(&bts, 0, sizeof(bts));
- error = trace->read(tracer, at, &bts);
- if (error < 0) {
- printk(KERN_CONT
- "error reading bts trace at [%lu] (0x%p)...",
- index, at);
- return error;
- }
-
- switch (bts.qualifier) {
- case BTS_BRANCH:
- break;
- default:
- printk(KERN_CONT
- "unexpected bts entry %llu at [%lu] (0x%p)...",
- bts.qualifier, index, at);
- return -1;
- }
- }
-
- return 0;
-}
-
-static void ds_selftest_bts_cpu(void *arg)
-{
- struct ds_selftest_bts_conf *conf = arg;
- const struct bts_trace *trace;
- void *top;
-
- if (IS_ERR(conf->tracer)) {
- conf->error = PTR_ERR(conf->tracer);
- conf->tracer = NULL;
-
- printk(KERN_CONT
- "initialization failed (err: %d)...", conf->error);
- return;
- }
-
- /* We should meanwhile have enough trace. */
- conf->error = conf->suspend(conf->tracer);
- if (conf->error < 0)
- return;
-
- /* Let's see if we can access the trace. */
- trace = ds_read_bts(conf->tracer);
-
- conf->error = ds_selftest_bts_consistency(trace);
- if (conf->error < 0)
- return;
-
- /* If everything went well, we should have a few trace entries. */
- if (trace->ds.top == trace->ds.begin) {
- /*
- * It is possible but highly unlikely that we got a
-		 * buffer overflow and ended up at exactly the same
- * position we started from.
- * Let's issue a warning, but continue.
- */
- printk(KERN_CONT "no trace/overflow...");
- }
-
- /* Let's try to read the trace we collected. */
- conf->error =
- ds_selftest_bts_read(conf->tracer, trace,
- trace->ds.begin, trace->ds.top);
- if (conf->error < 0)
- return;
-
- /*
- * Let's read the trace again.
- * Since we suspended tracing, we should get the same result.
- */
- top = trace->ds.top;
-
- trace = ds_read_bts(conf->tracer);
- conf->error = ds_selftest_bts_consistency(trace);
- if (conf->error < 0)
- return;
-
- if (top != trace->ds.top) {
- printk(KERN_CONT "suspend not working...");
- conf->error = -1;
- return;
- }
-
- /* Let's collect some more trace - see if resume is working. */
- conf->error = conf->resume(conf->tracer);
- if (conf->error < 0)
- return;
-
- conf->error = conf->suspend(conf->tracer);
- if (conf->error < 0)
- return;
-
- trace = ds_read_bts(conf->tracer);
-
- conf->error = ds_selftest_bts_consistency(trace);
- if (conf->error < 0)
- return;
-
- if (trace->ds.top == top) {
- /*
- * It is possible but highly unlikely that we got a
-		 * buffer overflow and ended up at exactly the same
- * position we started from.
- * Let's issue a warning and check the full trace.
- */
- printk(KERN_CONT
- "no resume progress/overflow...");
-
- conf->error =
- ds_selftest_bts_read(conf->tracer, trace,
- trace->ds.begin, trace->ds.end);
- } else if (trace->ds.top < top) {
- /*
- * We had a buffer overflow - the entire buffer should
- * contain trace records.
- */
- conf->error =
- ds_selftest_bts_read(conf->tracer, trace,
- trace->ds.begin, trace->ds.end);
- } else {
- /*
- * It is quite likely that the buffer did not overflow.
- * Let's just check the delta trace.
- */
- conf->error =
- ds_selftest_bts_read(conf->tracer, trace, top,
- trace->ds.top);
- }
- if (conf->error < 0)
- return;
-
- conf->error = 0;
-}
-
-static int ds_suspend_bts_wrap(struct bts_tracer *tracer)
-{
- ds_suspend_bts(tracer);
- return 0;
-}
-
-static int ds_resume_bts_wrap(struct bts_tracer *tracer)
-{
- ds_resume_bts(tracer);
- return 0;
-}
-
-static void ds_release_bts_noirq_wrap(void *tracer)
-{
- (void)ds_release_bts_noirq(tracer);
-}
-
-static int ds_selftest_bts_bad_release_noirq(int cpu,
- struct bts_tracer *tracer)
-{
- int error = -EPERM;
-
- /* Try to release the tracer on the wrong cpu. */
- get_cpu();
- if (cpu != smp_processor_id()) {
- error = ds_release_bts_noirq(tracer);
- if (error != -EPERM)
- printk(KERN_CONT "release on wrong cpu...");
- }
- put_cpu();
-
- return error ? 0 : -1;
-}
-
-static int ds_selftest_bts_bad_request_cpu(int cpu, void *buffer)
-{
- struct bts_tracer *tracer;
- int error;
-
- /* Try to request cpu tracing while task tracing is active. */
- tracer = ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, NULL,
- (size_t)-1, BTS_KERNEL);
- error = PTR_ERR(tracer);
- if (!IS_ERR(tracer)) {
- ds_release_bts(tracer);
- error = 0;
- }
-
- if (error != -EPERM)
- printk(KERN_CONT "cpu/task tracing overlap...");
-
- return error ? 0 : -1;
-}
-
-static int ds_selftest_bts_bad_request_task(void *buffer)
-{
- struct bts_tracer *tracer;
- int error;
-
-	/* Try to request task tracing while cpu tracing is active. */
- tracer = ds_request_bts_task(current, buffer, BUFFER_SIZE, NULL,
- (size_t)-1, BTS_KERNEL);
- error = PTR_ERR(tracer);
- if (!IS_ERR(tracer)) {
- error = 0;
- ds_release_bts(tracer);
- }
-
- if (error != -EPERM)
- printk(KERN_CONT "task/cpu tracing overlap...");
-
- return error ? 0 : -1;
-}
-
-int ds_selftest_bts(void)
-{
- struct ds_selftest_bts_conf conf;
- unsigned char buffer[BUFFER_SIZE], *small_buffer;
- unsigned long irq;
- int cpu;
-
- printk(KERN_INFO "[ds] bts selftest...");
- conf.error = 0;
-
- small_buffer = (unsigned char *)ALIGN((unsigned long)buffer, 8) + 8;
-
- get_online_cpus();
- for_each_online_cpu(cpu) {
- conf.suspend = ds_suspend_bts_wrap;
- conf.resume = ds_resume_bts_wrap;
- conf.tracer =
- ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
- NULL, (size_t)-1, BTS_KERNEL);
- ds_selftest_bts_cpu(&conf);
- if (conf.error >= 0)
- conf.error = ds_selftest_bts_bad_request_task(buffer);
- ds_release_bts(conf.tracer);
- if (conf.error < 0)
- goto out;
-
- conf.suspend = ds_suspend_bts_noirq;
- conf.resume = ds_resume_bts_noirq;
- conf.tracer =
- ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
- NULL, (size_t)-1, BTS_KERNEL);
- smp_call_function_single(cpu, ds_selftest_bts_cpu, &conf, 1);
- if (conf.error >= 0) {
- conf.error =
- ds_selftest_bts_bad_release_noirq(cpu,
- conf.tracer);
- /* We must not release the tracer twice. */
- if (conf.error < 0)
- conf.tracer = NULL;
- }
- if (conf.error >= 0)
- conf.error = ds_selftest_bts_bad_request_task(buffer);
- smp_call_function_single(cpu, ds_release_bts_noirq_wrap,
- conf.tracer, 1);
- if (conf.error < 0)
- goto out;
- }
-
- conf.suspend = ds_suspend_bts_wrap;
- conf.resume = ds_resume_bts_wrap;
- conf.tracer =
- ds_request_bts_task(current, buffer, BUFFER_SIZE,
- NULL, (size_t)-1, BTS_KERNEL);
- ds_selftest_bts_cpu(&conf);
- if (conf.error >= 0)
- conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
- ds_release_bts(conf.tracer);
- if (conf.error < 0)
- goto out;
-
- conf.suspend = ds_suspend_bts_noirq;
- conf.resume = ds_resume_bts_noirq;
- conf.tracer =
- ds_request_bts_task(current, small_buffer, SMALL_BUFFER_SIZE,
- NULL, (size_t)-1, BTS_KERNEL);
- local_irq_save(irq);
- ds_selftest_bts_cpu(&conf);
- if (conf.error >= 0)
- conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
- ds_release_bts_noirq(conf.tracer);
- local_irq_restore(irq);
- if (conf.error < 0)
- goto out;
-
- conf.error = 0;
- out:
- put_online_cpus();
- printk(KERN_CONT "%s.\n", (conf.error ? "failed" : "passed"));
-
- return conf.error;
-}
-
-int ds_selftest_pebs(void)
-{
- return 0;
-}
diff --git a/arch/x86/kernel/ds_selftest.h b/arch/x86/kernel/ds_selftest.h
deleted file mode 100644
index 2ba8745c6663..000000000000
--- a/arch/x86/kernel/ds_selftest.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/*
- * Debug Store support - selftest
- *
- *
- * Copyright (C) 2009 Intel Corporation.
- * Markus Metzger <markus.t.metzger@intel.com>, 2009
- */
-
-#ifdef CONFIG_X86_DS_SELFTEST
-extern int ds_selftest_bts(void);
-extern int ds_selftest_pebs(void);
-#else
-static inline int ds_selftest_bts(void) { return 0; }
-static inline int ds_selftest_pebs(void) { return 0; }
-#endif
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index c8405718a4c3..b10684dedc58 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -10,209 +10,357 @@
#include <linux/kdebug.h>
#include <linux/module.h>
#include <linux/ptrace.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/task_stack.h>
#include <linux/ftrace.h>
#include <linux/kexec.h>
#include <linux/bug.h>
#include <linux/nmi.h>
#include <linux/sysfs.h>
-#include <linux/ftrace.h>
+#include <linux/kasan.h>
+#include <asm/cpu_entry_area.h>
#include <asm/stacktrace.h>
+#include <asm/unwind.h>
-#include "dumpstack.h"
-
-int panic_on_unrecovered_nmi;
-int panic_on_io_nmi;
-unsigned int code_bytes = 64;
-int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
static int die_counter;
-void printk_address(unsigned long address, int reliable)
+static struct pt_regs exec_summary_regs;
+
+bool noinstr in_task_stack(unsigned long *stack, struct task_struct *task,
+ struct stack_info *info)
{
- printk(" [<%p>] %s%pS\n", (void *) address,
- reliable ? "" : "? ", (void *) address);
+ unsigned long *begin = task_stack_page(task);
+ unsigned long *end = task_stack_page(task) + THREAD_SIZE;
+
+ if (stack < begin || stack >= end)
+ return false;
+
+ info->type = STACK_TYPE_TASK;
+ info->begin = begin;
+ info->end = end;
+ info->next_sp = NULL;
+
+ return true;
}
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-static void
-print_ftrace_graph_addr(unsigned long addr, void *data,
- const struct stacktrace_ops *ops,
- struct thread_info *tinfo, int *graph)
+/* Called from get_stack_info_noinstr - so must be noinstr too */
+bool noinstr in_entry_stack(unsigned long *stack, struct stack_info *info)
{
- struct task_struct *task = tinfo->task;
- unsigned long ret_addr;
- int index = task->curr_ret_stack;
+ struct entry_stack *ss = cpu_entry_stack(smp_processor_id());
- if (addr != (unsigned long)return_to_handler)
- return;
+ void *begin = ss;
+ void *end = ss + 1;
- if (!task->ret_stack || index < *graph)
- return;
-
- index -= *graph;
- ret_addr = task->ret_stack[index].ret;
+ if ((void *)stack < begin || (void *)stack >= end)
+ return false;
- ops->address(data, ret_addr, 1);
+ info->type = STACK_TYPE_ENTRY;
+ info->begin = begin;
+ info->end = end;
+ info->next_sp = NULL;
- (*graph)++;
+ return true;
}
-#else
-static inline void
-print_ftrace_graph_addr(unsigned long addr, void *data,
- const struct stacktrace_ops *ops,
- struct thread_info *tinfo, int *graph)
-{ }
-#endif
-/*
- * x86-64 can have up to three kernel stacks:
- * process stack
- * interrupt stack
- * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
- */
+static void printk_stack_address(unsigned long address, int reliable,
+ const char *log_lvl)
+{
+ touch_nmi_watchdog();
+ printk("%s %s%pBb\n", log_lvl, reliable ? "" : "? ", (void *)address);
+}
-static inline int valid_stack_ptr(struct thread_info *tinfo,
- void *p, unsigned int size, void *end)
+static int copy_code(struct pt_regs *regs, u8 *buf, unsigned long src,
+ unsigned int nbytes)
{
- void *t = tinfo;
- if (end) {
- if (p < end && p >= (end-THREAD_SIZE))
- return 1;
- else
- return 0;
- }
- return p > t && p < t + THREAD_SIZE - size;
+ if (!user_mode(regs))
+ return copy_from_kernel_nofault(buf, (u8 *)src, nbytes);
+
+ /* The user space code from other tasks cannot be accessed. */
+ if (regs != task_pt_regs(current))
+ return -EPERM;
+
+ /*
+ * Even if named copy_from_user_nmi() this can be invoked from
+ * other contexts and will not try to resolve a pagefault, which is
+ * the correct thing to do here as this code can be called from any
+ * context.
+ */
+ return copy_from_user_nmi(buf, (void __user *)src, nbytes);
}
-unsigned long
-print_context_stack(struct thread_info *tinfo,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data,
- unsigned long *end, int *graph)
+/*
+ * There are a couple of reasons for the 2/3rd prologue, courtesy of Linus:
+ *
+ * In case where we don't have the exact kernel image (which, if we did, we can
+ * simply disassemble and navigate to the RIP), the purpose of the bigger
+ * prologue is to have more context and to be able to correlate the code from
+ * the different toolchains better.
+ *
+ * In addition, it helps in recreating the register allocation of the failing
+ * kernel and thus make sense of the register dump.
+ *
+ * What is more, the additional complication of a variable length insn arch like
+ * x86 warrants having longer byte sequence before rIP so that the disassembler
+ * can "sync" up properly and find instruction boundaries when decoding the
+ * opcode bytes.
+ *
+ * Thus, the 2/3rds prologue and 64 byte OPCODE_BUFSIZE are just a random
+ * guesstimate in an attempt to achieve all of the above.
+ */
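+/*
+ * Illustrative arithmetic for the buffer sizes chosen below:
+ * PROLOGUE_SIZE (42) + 1 byte for the faulting opcode + EPILOGUE_SIZE
+ * (21) adds up to a 64-byte window, i.e. roughly the 2/3rd prologue
+ * described above.
+ */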
+void show_opcodes(struct pt_regs *regs, const char *loglvl)
{
- struct stack_frame *frame = (struct stack_frame *)bp;
-
- while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
- unsigned long addr;
-
- addr = *stack;
- if (__kernel_text_address(addr)) {
- if ((unsigned long) stack == bp + sizeof(long)) {
- ops->address(data, addr, 1);
- frame = frame->next_frame;
- bp = (unsigned long) frame;
- } else {
- ops->address(data, addr, 0);
- }
- print_ftrace_graph_addr(addr, data, ops, tinfo, graph);
- }
- stack++;
+#define PROLOGUE_SIZE 42
+#define EPILOGUE_SIZE 21
+#define OPCODE_BUFSIZE (PROLOGUE_SIZE + 1 + EPILOGUE_SIZE)
+ u8 opcodes[OPCODE_BUFSIZE];
+ unsigned long prologue = regs->ip - PROLOGUE_SIZE;
+
+ switch (copy_code(regs, opcodes, prologue, sizeof(opcodes))) {
+ case 0:
+ printk("%sCode: %" __stringify(PROLOGUE_SIZE) "ph <%02x> %"
+ __stringify(EPILOGUE_SIZE) "ph\n", loglvl, opcodes,
+ opcodes[PROLOGUE_SIZE], opcodes + PROLOGUE_SIZE + 1);
+ break;
+ case -EPERM:
+ /* No access to the user space stack of other tasks. Ignore. */
+ break;
+ default:
+ printk("%sCode: Unable to access opcode bytes at 0x%lx.\n",
+ loglvl, prologue);
+ break;
}
- return bp;
}
-
-static void
-print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
+void show_ip(struct pt_regs *regs, const char *loglvl)
{
- printk(data);
- print_symbol(msg, symbol);
- printk("\n");
+#ifdef CONFIG_X86_32
+ printk("%sEIP: %pS\n", loglvl, (void *)regs->ip);
+#else
+ printk("%sRIP: %04x:%pS\n", loglvl, (int)regs->cs, (void *)regs->ip);
+#endif
+ show_opcodes(regs, loglvl);
}
-static void print_trace_warning(void *data, char *msg)
+void show_iret_regs(struct pt_regs *regs, const char *log_lvl)
{
- printk("%s%s\n", (char *)data, msg);
+ show_ip(regs, log_lvl);
+ printk("%sRSP: %04x:%016lx EFLAGS: %08lx", log_lvl, (int)regs->ss,
+ regs->sp, regs->flags);
}
-static int print_trace_stack(void *data, char *name)
+static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs,
+ bool partial, const char *log_lvl)
{
- printk("%s <%s> ", (char *)data, name);
- return 0;
+ /*
+ * These on_stack() checks aren't strictly necessary: the unwind code
+ * has already validated the 'regs' pointer. The checks are done for
+ * ordering reasons: if the registers are on the next stack, we don't
+ * want to print them out yet. Otherwise they'll be shown as part of
+ * the wrong stack. Later, when show_trace_log_lvl() switches to the
+ * next stack, this function will be called again with the same regs so
+ * they can be printed in the right context.
+ */
+ if (!partial && on_stack(info, regs, sizeof(*regs))) {
+ __show_regs(regs, SHOW_REGS_SHORT, log_lvl);
+
+ } else if (partial && on_stack(info, (void *)regs + IRET_FRAME_OFFSET,
+ IRET_FRAME_SIZE)) {
+ /*
+ * When an interrupt or exception occurs in entry code, the
+ * full pt_regs might not have been saved yet. In that case
+ * just print the iret frame.
+ */
+ show_iret_regs(regs, log_lvl);
+ }
}
/*
- * Print one address/symbol entries per line.
+ * This function reads pointers from the stack and dereferences them. The
+ * pointers may not have their KMSAN shadow set up properly, which may result
+ * in false positive reports. Disable instrumentation to avoid those.
*/
-static void print_trace_address(void *data, unsigned long addr, int reliable)
+__no_kmsan_checks
+static void __show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, const char *log_lvl)
{
- touch_nmi_watchdog();
- printk(data);
- printk_address(addr, reliable);
-}
-
-static const struct stacktrace_ops print_trace_ops = {
- .warning = print_trace_warning,
- .warning_symbol = print_trace_warning_symbol,
- .stack = print_trace_stack,
- .address = print_trace_address,
-};
+ struct unwind_state state;
+ struct stack_info stack_info = {0};
+ unsigned long visit_mask = 0;
+ int graph_idx = 0;
+ bool partial = false;
-void
-show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp, char *log_lvl)
-{
printk("%sCall Trace:\n", log_lvl);
- dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
-}
-void show_trace(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp)
-{
- show_trace_log_lvl(task, regs, stack, bp, "");
+ unwind_start(&state, task, regs, stack);
+ stack = stack ?: get_stack_pointer(task, regs);
+ regs = unwind_get_entry_regs(&state, &partial);
+
+ /*
+ * Iterate through the stacks, starting with the current stack pointer.
+ * Each stack has a pointer to the next one.
+ *
+ * x86-64 can have several stacks:
+ * - task stack
+ * - interrupt stack
+ * - HW exception stacks (double fault, nmi, debug, mce)
+ * - entry stack
+ *
+ * x86-32 can have up to four stacks:
+ * - task stack
+ * - softirq stack
+ * - hardirq stack
+ * - entry stack
+ */
+ for (; stack; stack = stack_info.next_sp) {
+ const char *stack_name;
+
+ stack = PTR_ALIGN(stack, sizeof(long));
+
+ if (get_stack_info(stack, task, &stack_info, &visit_mask)) {
+ /*
+ * We weren't on a valid stack. It's possible that
+ * we overflowed a valid stack into a guard page.
+ * See if the next page up is valid so that we can
+ * generate some kind of backtrace if this happens.
+ */
+ stack = (unsigned long *)PAGE_ALIGN((unsigned long)stack);
+ if (get_stack_info(stack, task, &stack_info, &visit_mask))
+ break;
+ }
+
+ stack_name = stack_type_name(stack_info.type);
+ if (stack_name)
+ printk("%s <%s>\n", log_lvl, stack_name);
+
+ if (regs)
+ show_regs_if_on_stack(&stack_info, regs, partial, log_lvl);
+
+ /*
+ * Scan the stack, printing any text addresses we find. At the
+ * same time, follow proper stack frames with the unwinder.
+ *
+ * Addresses found during the scan which are not reported by
+ * the unwinder are considered to be additional clues which are
+ * sometimes useful for debugging and are prefixed with '?'.
+ * This also serves as a failsafe option in case the unwinder
+ * goes off in the weeds.
+ */
+ for (; stack < stack_info.end; stack++) {
+ unsigned long real_addr;
+ int reliable = 0;
+ unsigned long addr = READ_ONCE_NOCHECK(*stack);
+ unsigned long *ret_addr_p =
+ unwind_get_return_address_ptr(&state);
+
+ if (!__kernel_text_address(addr))
+ continue;
+
+ /*
+ * Don't print regs->ip again if it was already printed
+ * by show_regs_if_on_stack().
+ */
+ if (regs && stack == &regs->ip)
+ goto next;
+
+ if (stack == ret_addr_p)
+ reliable = 1;
+
+ /*
+ * When function graph tracing is enabled for a
+ * function, its return address on the stack is
+ * replaced with the address of an ftrace handler
+ * (return_to_handler). In that case, before printing
+ * the "real" address, we want to print the handler
+ * address as an "unreliable" hint that function graph
+ * tracing was involved.
+ */
+ real_addr = ftrace_graph_ret_addr(task, &graph_idx,
+ addr, stack);
+ if (real_addr != addr)
+ printk_stack_address(addr, 0, log_lvl);
+ printk_stack_address(real_addr, reliable, log_lvl);
+
+ if (!reliable)
+ continue;
+
+next:
+ /*
+ * Get the next frame from the unwinder. No need to
+ * check for an error: if anything goes wrong, the rest
+ * of the addresses will just be printed as unreliable.
+ */
+ unwind_next_frame(&state);
+
+ /* if the frame has entry regs, print them */
+ regs = unwind_get_entry_regs(&state, &partial);
+ if (regs)
+ show_regs_if_on_stack(&stack_info, regs, partial, log_lvl);
+ }
+
+ if (stack_name)
+ printk("%s </%s>\n", log_lvl, stack_name);
+ }
}
-void show_stack(struct task_struct *task, unsigned long *sp)
+static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, const char *log_lvl)
{
- show_stack_log_lvl(task, NULL, sp, 0, "");
+ /*
+ * Disable KASAN to avoid false positives during walking another
+ * task's stacks, as values on these stacks may change concurrently
+ * with task execution.
+ */
+ bool disable_kasan = task && task != current;
+
+ if (disable_kasan)
+ kasan_disable_current();
+
+ __show_trace_log_lvl(task, regs, stack, log_lvl);
+
+ if (disable_kasan)
+ kasan_enable_current();
}
-/*
- * The architecture-independent dump_stack generator
- */
-void dump_stack(void)
+void show_stack(struct task_struct *task, unsigned long *sp,
+ const char *loglvl)
{
- unsigned long bp = 0;
- unsigned long stack;
+ task = task ? : current;
-#ifdef CONFIG_FRAME_POINTER
- if (!bp)
- get_bp(bp);
-#endif
+ /*
+ * Stack frames below this one aren't interesting. Don't show them
+ * if we're printing for %current.
+ */
+ if (!sp && task == current)
+ sp = get_stack_pointer(current, NULL);
- printk("Pid: %d, comm: %.20s %s %s %.*s\n",
- current->pid, current->comm, print_tainted(),
- init_utsname()->release,
- (int)strcspn(init_utsname()->version, " "),
- init_utsname()->version);
- show_trace(NULL, NULL, &stack, bp);
+ show_trace_log_lvl(task, NULL, sp, loglvl);
}
-EXPORT_SYMBOL(dump_stack);
-static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
+void show_stack_regs(struct pt_regs *regs)
+{
+ show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT);
+}
+
+static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED;
static int die_owner = -1;
static unsigned int die_nest_count;
-unsigned __kprobes long oops_begin(void)
+unsigned long oops_begin(void)
{
int cpu;
unsigned long flags;
- /* notify the hw-branch tracer so it may disable tracing and
- add the last trace to the trace buffer -
- the earlier this happens, the more useful the trace. */
- trace_hw_branch_oops();
-
oops_enter();
/* racy, but better than risking deadlock. */
raw_local_irq_save(flags);
cpu = smp_processor_id();
- if (!__raw_spin_trylock(&die_lock)) {
+ if (!arch_spin_trylock(&die_lock)) {
if (cpu == die_owner)
/* nested oops. should stop eventually */;
else
- __raw_spin_lock(&die_lock);
+ arch_spin_lock(&die_lock);
}
die_nest_count++;
die_owner = cpu;
@@ -220,72 +368,83 @@ unsigned __kprobes long oops_begin(void)
bust_spinlocks(1);
return flags;
}
+NOKPROBE_SYMBOL(oops_begin);
-void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
+void __noreturn rewind_stack_and_make_dead(int signr);
+
+void oops_end(unsigned long flags, struct pt_regs *regs, int signr)
{
if (regs && kexec_should_crash(current))
crash_kexec(regs);
bust_spinlocks(0);
die_owner = -1;
- add_taint(TAINT_DIE);
+ add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE);
die_nest_count--;
if (!die_nest_count)
/* Nest count reaches zero, release the lock. */
- __raw_spin_unlock(&die_lock);
+ arch_spin_unlock(&die_lock);
raw_local_irq_restore(flags);
oops_exit();
+ /* Executive summary in case the oops scrolled away */
+ __show_regs(&exec_summary_regs, SHOW_REGS_ALL, KERN_DEFAULT);
+
if (!signr)
return;
if (in_interrupt())
panic("Fatal exception in interrupt");
if (panic_on_oops)
panic("Fatal exception");
- do_exit(signr);
+
+ /*
+ * We're not going to return, but we might be on an IST stack or
+ * have very little stack space left. Rewind the stack and kill
+ * the task.
+ * Before we rewind the stack, we have to tell KASAN that we're going to
+ * reuse the task stack and that existing poisons are invalid.
+ */
+ kasan_unpoison_task_stack(current);
+ rewind_stack_and_make_dead(signr);
}
+NOKPROBE_SYMBOL(oops_end);
-int __kprobes __die(const char *str, struct pt_regs *regs, long err)
+static void __die_header(const char *str, struct pt_regs *regs, long err)
{
-#ifdef CONFIG_X86_32
- unsigned short ss;
- unsigned long sp;
-#endif
- printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
-#ifdef CONFIG_PREEMPT
- printk("PREEMPT ");
-#endif
-#ifdef CONFIG_SMP
- printk("SMP ");
-#endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
- printk("DEBUG_PAGEALLOC");
-#endif
- printk("\n");
- sysfs_printk_last_file();
+ /* Save the regs of the first oops for the executive summary later. */
+ if (!die_counter)
+ exec_summary_regs = *regs;
+
+ printk(KERN_DEFAULT
+ "Oops: %s: %04lx [#%d]%s%s%s%s\n", str, err & 0xffff,
+ ++die_counter,
+ IS_ENABLED(CONFIG_SMP) ? " SMP" : "",
+ debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "",
+ IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "",
+ IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION) ?
+ (boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : "");
+}
+NOKPROBE_SYMBOL(__die_header);
+
+static int __die_body(const char *str, struct pt_regs *regs, long err)
+{
+ show_regs(regs);
+ print_modules();
+
if (notify_die(DIE_OOPS, str, regs, err,
- current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
+ current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP)
return 1;
- show_registers(regs);
-#ifdef CONFIG_X86_32
- sp = (unsigned long) (&regs->sp);
- savesegment(ss, ss);
- if (user_mode(regs)) {
- sp = regs->sp;
- ss = regs->ss & 0xffff;
- }
- printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
- print_symbol("%s", regs->ip);
- printk(" SS:ESP %04x:%08lx\n", ss, sp);
-#else
- /* Executive summary in case the oops scrolled away */
- printk(KERN_ALERT "RIP ");
- printk_address(regs->ip, 1);
- printk(" RSP <%016lx>\n", regs->sp);
-#endif
return 0;
}
+NOKPROBE_SYMBOL(__die_body);
+
+int __die(const char *str, struct pt_regs *regs, long err)
+{
+ __die_header(str, regs, err);
+ return __die_body(str, regs, err);
+}
+NOKPROBE_SYMBOL(__die);
/*
* This is gone through when something in the kernel has done something bad
@@ -296,64 +455,36 @@ void die(const char *str, struct pt_regs *regs, long err)
unsigned long flags = oops_begin();
int sig = SIGSEGV;
- if (!user_mode_vm(regs))
- report_bug(regs->ip, regs);
-
if (__die(str, regs, err))
sig = 0;
oops_end(flags, regs, sig);
}
-void notrace __kprobes
-die_nmi(char *str, struct pt_regs *regs, int do_panic)
+void die_addr(const char *str, struct pt_regs *regs, long err, long gp_addr)
{
- unsigned long flags;
-
- if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
- return;
+ unsigned long flags = oops_begin();
+ int sig = SIGSEGV;
- /*
- * We are in trouble anyway, lets at least try
- * to get a message out.
- */
- flags = oops_begin();
- printk(KERN_EMERG "%s", str);
- printk(" on CPU%d, ip %08lx, registers:\n",
- smp_processor_id(), regs->ip);
- show_registers(regs);
- oops_end(flags, regs, 0);
- if (do_panic || panic_on_oops)
- panic("Non maskable interrupt");
- nmi_exit();
- local_irq_enable();
- do_exit(SIGBUS);
+ __die_header(str, regs, err);
+ if (gp_addr)
+ kasan_non_canonical_hook(gp_addr);
+ if (__die_body(str, regs, err))
+ sig = 0;
+ oops_end(flags, regs, sig);
}
-static int __init oops_setup(char *s)
+void show_regs(struct pt_regs *regs)
{
- if (!s)
- return -EINVAL;
- if (!strcmp(s, "panic"))
- panic_on_oops = 1;
- return 0;
-}
-early_param("oops", oops_setup);
+ enum show_regs_mode print_kernel_regs;
-static int __init kstack_setup(char *s)
-{
- if (!s)
- return -EINVAL;
- kstack_depth_to_print = simple_strtoul(s, NULL, 0);
- return 0;
-}
-early_param("kstack", kstack_setup);
+ show_regs_print_info(KERN_DEFAULT);
-static int __init code_bytes_setup(char *s)
-{
- code_bytes = simple_strtoul(s, NULL, 0);
- if (code_bytes > 8192)
- code_bytes = 8192;
+ print_kernel_regs = user_mode(regs) ? SHOW_REGS_USER : SHOW_REGS_ALL;
+ __show_regs(regs, print_kernel_regs, KERN_DEFAULT);
- return 1;
+ /*
+ * When in-kernel, we also print out the stack at the time of the fault..
+ */
+ if (!user_mode(regs))
+ show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT);
}
-__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
deleted file mode 100644
index 81086c227ab7..000000000000
--- a/arch/x86/kernel/dumpstack.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (C) 1991, 1992 Linus Torvalds
- * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
- */
-
-#ifndef DUMPSTACK_H
-#define DUMPSTACK_H
-
-#ifdef CONFIG_X86_32
-#define STACKSLOTS_PER_LINE 8
-#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
-#else
-#define STACKSLOTS_PER_LINE 4
-#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
-#endif
-
-extern unsigned long
-print_context_stack(struct thread_info *tinfo,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data,
- unsigned long *end, int *graph);
-
-extern void
-show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp, char *log_lvl);
-
-extern void
-show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, unsigned long bp, char *log_lvl);
-
-extern unsigned int code_bytes;
-
-/* The form of the top of the frame on the stack */
-struct stack_frame {
- struct stack_frame *next_frame;
- unsigned long return_address;
-};
-#endif
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index bca5fba91c9e..722fd712e1cf 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -1,160 +1,155 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
*/
+#include <linux/sched/debug.h>
#include <linux/kallsyms.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
-#include <linux/utsname.h>
#include <linux/hardirq.h>
#include <linux/kdebug.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/ptrace.h>
#include <linux/kexec.h>
+#include <linux/sysfs.h>
#include <linux/bug.h>
#include <linux/nmi.h>
-#include <linux/sysfs.h>
#include <asm/stacktrace.h>
-#include "dumpstack.h"
+const char *stack_type_name(enum stack_type type)
+{
+ if (type == STACK_TYPE_IRQ)
+ return "IRQ";
+
+ if (type == STACK_TYPE_SOFTIRQ)
+ return "SOFTIRQ";
+
+ if (type == STACK_TYPE_ENTRY)
+ return "ENTRY_TRAMPOLINE";
+
+ if (type == STACK_TYPE_EXCEPTION)
+ return "#DF";
-/* Just a stub for now */
-int x86_is_stack_id(int id, char *name)
+ return NULL;
+}
+
+static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info)
{
- return 0;
+ unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack_ptr);
+ unsigned long *end = begin + (THREAD_SIZE / sizeof(long));
+
+ /*
+ * This is a software stack, so 'end' can be a valid stack pointer.
+ * It just means the stack is empty.
+ */
+ if (stack < begin || stack > end)
+ return false;
+
+ info->type = STACK_TYPE_IRQ;
+ info->begin = begin;
+ info->end = end;
+
+ /*
+ * See irq_32.c -- the next stack pointer is stored at the beginning of
+ * the stack.
+ */
+ info->next_sp = (unsigned long *)*begin;
+
+ return true;
}
-void dump_trace(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data)
+static bool in_softirq_stack(unsigned long *stack, struct stack_info *info)
{
- int graph = 0;
+ unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack_ptr);
+ unsigned long *end = begin + (THREAD_SIZE / sizeof(long));
- if (!task)
- task = current;
+ /*
+ * This is a software stack, so 'end' can be a valid stack pointer.
+ * It just means the stack is empty.
+ */
+ if (stack < begin || stack > end)
+ return false;
- if (!stack) {
- unsigned long dummy;
- stack = &dummy;
- if (task && task != current)
- stack = (unsigned long *)task->thread.sp;
- }
+ info->type = STACK_TYPE_SOFTIRQ;
+ info->begin = begin;
+ info->end = end;
-#ifdef CONFIG_FRAME_POINTER
- if (!bp) {
- if (task == current) {
- /* Grab bp right from our regs */
- get_bp(bp);
- } else {
- /* bp is the last reg pushed by switch_to */
- bp = *(unsigned long *) task->thread.sp;
- }
- }
-#endif
-
- for (;;) {
- struct thread_info *context;
-
- context = (struct thread_info *)
- ((unsigned long)stack & (~(THREAD_SIZE - 1)));
- bp = print_context_stack(context, stack, bp, ops,
- data, NULL, &graph);
-
- stack = (unsigned long *)context->previous_esp;
- if (!stack)
- break;
- if (ops->stack(data, "IRQ") < 0)
- break;
- touch_nmi_watchdog();
- }
+ /*
+ * The next stack pointer is stored at the beginning of the stack.
+ * See irq_32.c.
+ */
+ info->next_sp = (unsigned long *)*begin;
+
+ return true;
}
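Both helpers above follow the same recipe: bounds-check the pointer against the per-CPU software stack (where 'end' itself is a legal, empty-stack value), fill in struct stack_info, and read the parent stack pointer from the first word of the stack, which irq_32.c stores there before switching. A standalone sketch of that classification, with invented stack bounds standing in for the per-CPU variables:

#include <stdbool.h>
#include <stdio.h>

enum stack_type { STACK_TYPE_UNKNOWN, STACK_TYPE_IRQ, STACK_TYPE_SOFTIRQ };

struct stack_info {
        enum stack_type type;
        unsigned long *begin, *end, *next_sp;
};

/* Classify @stack against one software stack [begin, end]; note the
 * inclusive upper bound, since 'end' just means the stack is empty. */
static bool in_sw_stack(unsigned long *stack, unsigned long *begin,
                        unsigned long *end, enum stack_type type,
                        struct stack_info *info)
{
        if (stack < begin || stack > end)
                return false;

        info->type = type;
        info->begin = begin;
        info->end = end;
        info->next_sp = (unsigned long *)*begin;        /* saved parent stack pointer */
        return true;
}

int main(void)
{
        unsigned long task_stack[512];
        unsigned long irq_stack[512];
        struct stack_info info = { 0 };

        irq_stack[0] = (unsigned long)&task_stack[400]; /* where the task stack left off */

        if (in_sw_stack(&irq_stack[100], irq_stack, &irq_stack[512],
                        STACK_TYPE_IRQ, &info))
                printf("IRQ stack, next_sp=%p\n", (void *)info.next_sp);
        return 0;
}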
-EXPORT_SYMBOL(dump_trace);
-void
-show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, unsigned long bp, char *log_lvl)
+static bool in_doublefault_stack(unsigned long *stack, struct stack_info *info)
{
- unsigned long *stack;
- int i;
-
- if (sp == NULL) {
- if (task)
- sp = (unsigned long *)task->thread.sp;
- else
- sp = (unsigned long *)&sp;
- }
+ struct cpu_entry_area *cea = get_cpu_entry_area(raw_smp_processor_id());
+ struct doublefault_stack *ss = &cea->doublefault_stack;
- stack = sp;
- for (i = 0; i < kstack_depth_to_print; i++) {
- if (kstack_end(stack))
- break;
- if (i && ((i % STACKSLOTS_PER_LINE) == 0))
- printk("\n%s", log_lvl);
- printk(" %08lx", *stack++);
- touch_nmi_watchdog();
- }
- printk("\n");
- show_trace_log_lvl(task, regs, sp, bp, log_lvl);
+ void *begin = ss->stack;
+ void *end = begin + sizeof(ss->stack);
+
+ if ((void *)stack < begin || (void *)stack >= end)
+ return false;
+
+ info->type = STACK_TYPE_EXCEPTION;
+ info->begin = begin;
+ info->end = end;
+ info->next_sp = (unsigned long *)this_cpu_read(cpu_tss_rw.x86_tss.sp);
+
+ return true;
}
-void show_registers(struct pt_regs *regs)
+int get_stack_info(unsigned long *stack, struct task_struct *task,
+ struct stack_info *info, unsigned long *visit_mask)
{
- int i;
+ if (!stack)
+ goto unknown;
+
+ task = task ? : current;
+
+ if (in_task_stack(stack, task, info))
+ goto recursion_check;
- print_modules();
- __show_regs(regs, 0);
+ if (task != current)
+ goto unknown;
- printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n",
- TASK_COMM_LEN, current->comm, task_pid_nr(current),
- current_thread_info(), current, task_thread_info(current));
+ if (in_entry_stack(stack, info))
+ goto recursion_check;
+
+ if (in_hardirq_stack(stack, info))
+ goto recursion_check;
+
+ if (in_softirq_stack(stack, info))
+ goto recursion_check;
+
+ if (in_doublefault_stack(stack, info))
+ goto recursion_check;
+
+ goto unknown;
+
+recursion_check:
/*
- * When in-kernel, we also print out the stack and code at the
- * time of the fault..
+ * Make sure we don't iterate through any given stack more than once.
+ * If it comes up a second time then there's something wrong going on:
+ * just break out and report an unknown stack type.
*/
- if (!user_mode_vm(regs)) {
- unsigned int code_prologue = code_bytes * 43 / 64;
- unsigned int code_len = code_bytes;
- unsigned char c;
- u8 *ip;
-
- printk(KERN_EMERG "Stack:\n");
- show_stack_log_lvl(NULL, regs, &regs->sp,
- 0, KERN_EMERG);
-
- printk(KERN_EMERG "Code: ");
-
- ip = (u8 *)regs->ip - code_prologue;
- if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
- /* try starting at IP */
- ip = (u8 *)regs->ip;
- code_len = code_len - code_prologue + 1;
- }
- for (i = 0; i < code_len; i++, ip++) {
- if (ip < (u8 *)PAGE_OFFSET ||
- probe_kernel_address(ip, c)) {
- printk(" Bad EIP value.");
- break;
- }
- if (ip == (u8 *)regs->ip)
- printk("<%02x> ", c);
- else
- printk("%02x ", c);
+ if (visit_mask) {
+ if (*visit_mask & (1UL << info->type)) {
+ printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type);
+ goto unknown;
}
+ *visit_mask |= 1UL << info->type;
}
- printk("\n");
-}
-
-int is_valid_bugaddr(unsigned long ip)
-{
- unsigned short ud2;
- if (ip < PAGE_OFFSET)
- return 0;
- if (probe_kernel_address((unsigned short *)ip, ud2))
- return 0;
+ return 0;
- return ud2 == 0x0b0f;
+unknown:
+ info->type = STACK_TYPE_UNKNOWN;
+ return -EINVAL;
}
-
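The visit_mask bookkeeping in get_stack_info() above is a one-bit-per-stack-type set: the unwinder marks each stack type as it crosses it and aborts if the same type shows up again, since a legitimate trace can never revisit a stack. A small sketch of just that guard, with a made-up walk sequence:

#include <stdbool.h>
#include <stdio.h>

enum stack_type { STACK_TYPE_TASK = 1, STACK_TYPE_IRQ, STACK_TYPE_SOFTIRQ };

/* Returns false if @type was already visited, mirroring the recursion check. */
static bool visit_once(unsigned long *visit_mask, int type)
{
        if (*visit_mask & (1UL << type)) {
                fprintf(stderr, "WARNING: stack recursion on stack type %d\n", type);
                return false;
        }
        *visit_mask |= 1UL << type;
        return true;
}

int main(void)
{
        unsigned long visit_mask = 0;
        int walk[] = { STACK_TYPE_IRQ, STACK_TYPE_TASK, STACK_TYPE_IRQ };

        for (int i = 0; i < 3; i++) {
                if (!visit_once(&visit_mask, walk[i])) {
                        printf("unwind aborted at step %d\n", i);
                        break;
                }
                printf("step %d: entered stack type %d\n", i, walk[i]);
        }
        return 0;
}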
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 54b0a3276766..6c5defd6569a 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -1,307 +1,221 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
*/
+#include <linux/sched/debug.h>
#include <linux/kallsyms.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
-#include <linux/utsname.h>
#include <linux/hardirq.h>
#include <linux/kdebug.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/ptrace.h>
#include <linux/kexec.h>
+#include <linux/sysfs.h>
#include <linux/bug.h>
#include <linux/nmi.h>
-#include <linux/sysfs.h>
+#include <asm/cpu_entry_area.h>
#include <asm/stacktrace.h>
-#include "dumpstack.h"
+static const char * const exception_stack_names[] = {
+ [ ESTACK_DF ] = "#DF",
+ [ ESTACK_NMI ] = "NMI",
+ [ ESTACK_DB ] = "#DB",
+ [ ESTACK_MCE ] = "#MC",
+ [ ESTACK_VC ] = "#VC",
+ [ ESTACK_VC2 ] = "#VC2",
+};
+const char *stack_type_name(enum stack_type type)
+{
+ BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
-static char x86_stack_ids[][8] = {
- [DEBUG_STACK - 1] = "#DB",
- [NMI_STACK - 1] = "NMI",
- [DOUBLEFAULT_STACK - 1] = "#DF",
- [STACKFAULT_STACK - 1] = "#SS",
- [MCE_STACK - 1] = "#MC",
-#if DEBUG_STKSZ > EXCEPTION_STKSZ
- [N_EXCEPTION_STACKS ...
- N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
-#endif
- };
+ if (type == STACK_TYPE_TASK)
+ return "TASK";
-int x86_is_stack_id(int id, char *name)
-{
- return x86_stack_ids[id - 1] == name;
-}
+ if (type == STACK_TYPE_IRQ)
+ return "IRQ";
-static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
- unsigned *usedp, char **idp)
-{
- unsigned k;
+ if (type == STACK_TYPE_SOFTIRQ)
+ return "SOFTIRQ";
- /*
- * Iterate over all exception stacks, and figure out whether
- * 'stack' is in one of them:
- */
- for (k = 0; k < N_EXCEPTION_STACKS; k++) {
- unsigned long end = per_cpu(orig_ist, cpu).ist[k];
- /*
- * Is 'stack' above this exception frame's end?
- * If yes then skip to the next frame.
- */
- if (stack >= end)
- continue;
- /*
- * Is 'stack' above this exception frame's start address?
- * If yes then we found the right frame.
- */
- if (stack >= end - EXCEPTION_STKSZ) {
- /*
- * Make sure we only iterate through an exception
- * stack once. If it comes up for the second time
- * then there's something wrong going on - just
- * break out and return NULL:
- */
- if (*usedp & (1U << k))
- break;
- *usedp |= 1U << k;
- *idp = x86_stack_ids[k];
- return (unsigned long *)end;
- }
+ if (type == STACK_TYPE_ENTRY) {
/*
- * If this is a debug stack, and if it has a larger size than
- * the usual exception stacks, then 'stack' might still
- * be within the lower portion of the debug stack:
+ * On 64-bit, we have a generic entry stack that we
+ * use for all the kernel entry points, including
+ * SYSENTER.
*/
-#if DEBUG_STKSZ > EXCEPTION_STKSZ
- if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
- unsigned j = N_EXCEPTION_STACKS - 1;
-
- /*
- * Black magic. A large debug stack is composed of
- * multiple exception stack entries, which we
- * iterate through now. Dont look:
- */
- do {
- ++j;
- end -= EXCEPTION_STKSZ;
- x86_stack_ids[j][4] = '1' +
- (j - N_EXCEPTION_STACKS);
- } while (stack < end - EXCEPTION_STKSZ);
- if (*usedp & (1U << j))
- break;
- *usedp |= 1U << j;
- *idp = x86_stack_ids[j];
- return (unsigned long *)end;
- }
-#endif
+ return "ENTRY_TRAMPOLINE";
}
+
+ if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST)
+ return exception_stack_names[type - STACK_TYPE_EXCEPTION];
+
return NULL;
}
+/**
+ * struct estack_pages - Page descriptor for exception stacks
+ * @offs: Offset from the start of the exception stack area
+ * @size: Size of the exception stack
+ * @type: Type to store in the stack_info struct
+ */
+struct estack_pages {
+ u32 offs;
+ u16 size;
+ u16 type;
+};
+
+#define EPAGERANGE(st) \
+ [PFN_DOWN(CEA_ESTACK_OFFS(st)) ... \
+ PFN_DOWN(CEA_ESTACK_OFFS(st) + CEA_ESTACK_SIZE(st) - 1)] = { \
+ .offs = CEA_ESTACK_OFFS(st), \
+ .size = CEA_ESTACK_SIZE(st), \
+ .type = STACK_TYPE_EXCEPTION + ESTACK_ ##st, }
+
/*
- * x86-64 can have up to three kernel stacks:
- * process stack
- * interrupt stack
- * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
+ * Array of exception stack page descriptors. If the stack is larger than
+ * PAGE_SIZE, all pages covering a particular stack will have the same
+ * info. The guard pages including the not mapped DB2 stack are zeroed
+ * out.
*/
+static const
+struct estack_pages estack_pages[CEA_ESTACK_PAGES] ____cacheline_aligned = {
+ EPAGERANGE(DF),
+ EPAGERANGE(NMI),
+ EPAGERANGE(DB),
+ EPAGERANGE(MCE),
+ EPAGERANGE(VC),
+ EPAGERANGE(VC2),
+};
+
+static __always_inline bool in_exception_stack(unsigned long *stack, struct stack_info *info)
+{
+ unsigned long begin, end, stk = (unsigned long)stack;
+ const struct estack_pages *ep;
+ struct pt_regs *regs;
+ unsigned int k;
+
+ BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
+
+ begin = (unsigned long)__this_cpu_read(cea_exception_stacks);
+ /*
+ * Handle the case where stack trace is collected _before_
+ * cea_exception_stacks had been initialized.
+ */
+ if (!begin)
+ return false;
+
+ end = begin + sizeof(struct cea_exception_stacks);
+ /* Bail if @stack is outside the exception stack area. */
+ if (stk < begin || stk >= end)
+ return false;
+
+ /* Calc page offset from start of exception stacks */
+ k = (stk - begin) >> PAGE_SHIFT;
+ /* Lookup the page descriptor */
+ ep = &estack_pages[k];
+ /* Guard page? */
+ if (!ep->size)
+ return false;
+
+ begin += (unsigned long)ep->offs;
+ end = begin + (unsigned long)ep->size;
+ regs = (struct pt_regs *)end - 1;
+
+ info->type = ep->type;
+ info->begin = (unsigned long *)begin;
+ info->end = (unsigned long *)end;
+ info->next_sp = (unsigned long *)regs->sp;
+ return true;
+}
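The estack_pages[] table above turns the exception-stack lookup into O(1) work: subtract the per-CPU area base, shift by PAGE_SHIFT to get a page index, and read that page's descriptor, whose zero size marks a guard page. A userspace sketch of the same lookup, with an invented layout standing in for CEA_ESTACK_OFFS()/CEA_ESTACK_SIZE():

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)

struct estack_page {
        uint32_t offs;          /* offset of the owning stack from the area base */
        uint16_t size;          /* stack size; 0 marks a guard page */
        uint16_t type;
};

/* Hypothetical area: [stack A: 2 pages][guard: 1 page][stack B: 1 page] */
static const struct estack_page pages[] = {
        { .offs = 0,             .size = 2 * PAGE_SIZE, .type = 1 },
        { .offs = 0,             .size = 2 * PAGE_SIZE, .type = 1 },
        { 0 },                                          /* guard page */
        { .offs = 3 * PAGE_SIZE, .size = PAGE_SIZE,     .type = 2 },
};

static bool lookup(uintptr_t base, uintptr_t addr, unsigned *type,
                   uintptr_t *begin, uintptr_t *end)
{
        unsigned long k;
        const struct estack_page *ep;

        if (addr < base)
                return false;
        k = (addr - base) >> PAGE_SHIFT;                /* page index in the area */
        if (k >= sizeof(pages) / sizeof(pages[0]))
                return false;

        ep = &pages[k];
        if (!ep->size)                                  /* landed on a guard page */
                return false;

        *type = ep->type;
        *begin = base + ep->offs;
        *end = *begin + ep->size;
        return true;
}

int main(void)
{
        uintptr_t base = 0x100000, begin, end;
        unsigned type;

        if (lookup(base, base + PAGE_SIZE + 64, &type, &begin, &end))
                printf("type %u, stack [%#lx-%#lx)\n", type,
                       (unsigned long)begin, (unsigned long)end);
        if (!lookup(base, base + 2 * PAGE_SIZE, &type, &begin, &end))
                printf("guard page hit\n");
        return 0;
}

Because every page covering a given stack carries identical offs/size/type values, the address never needs to be compared against individual stack bounds.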
-void dump_trace(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data)
+static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info *info)
{
- const unsigned cpu = get_cpu();
- unsigned long *irq_stack_end =
- (unsigned long *)per_cpu(irq_stack_ptr, cpu);
- unsigned used = 0;
- struct thread_info *tinfo;
- int graph = 0;
-
- if (!task)
- task = current;
-
- if (!stack) {
- unsigned long dummy;
- stack = &dummy;
- if (task && task != current)
- stack = (unsigned long *)task->thread.sp;
- }
+ unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr);
+ unsigned long *begin;
-#ifdef CONFIG_FRAME_POINTER
- if (!bp) {
- if (task == current) {
- /* Grab bp right from our regs */
- get_bp(bp);
- } else {
- /* bp is the last reg pushed by switch_to */
- bp = *(unsigned long *) task->thread.sp;
- }
- }
-#endif
+ /*
+ * @end points directly to the top most stack entry to avoid a -8
+ * adjustment in the stack switch hotpath. Adjust it back before
+ * calculating @begin.
+ */
+ end++;
+ begin = end - (IRQ_STACK_SIZE / sizeof(long));
/*
- * Print function call entries in all stacks, starting at the
- * current stack address. If the stacks consist of nested
- * exceptions
+ * Due to the switching logic RSP can never be == @end because the
+ * final operation is 'popq %rsp' which means after that RSP points
+ * to the original stack and not to @end.
*/
- tinfo = task_thread_info(task);
- for (;;) {
- char *id;
- unsigned long *estack_end;
- estack_end = in_exception_stack(cpu, (unsigned long)stack,
- &used, &id);
-
- if (estack_end) {
- if (ops->stack(data, id) < 0)
- break;
-
- bp = print_context_stack(tinfo, stack, bp, ops,
- data, estack_end, &graph);
- ops->stack(data, "<EOE>");
- /*
- * We link to the next stack via the
- * second-to-last pointer (index -2 to end) in the
- * exception stack:
- */
- stack = (unsigned long *) estack_end[-2];
- continue;
- }
- if (irq_stack_end) {
- unsigned long *irq_stack;
- irq_stack = irq_stack_end -
- (IRQ_STACK_SIZE - 64) / sizeof(*irq_stack);
-
- if (stack >= irq_stack && stack < irq_stack_end) {
- if (ops->stack(data, "IRQ") < 0)
- break;
- bp = print_context_stack(tinfo, stack, bp,
- ops, data, irq_stack_end, &graph);
- /*
- * We link to the next stack (which would be
- * the process stack normally) the last
- * pointer (index -1 to end) in the IRQ stack:
- */
- stack = (unsigned long *) (irq_stack_end[-1]);
- irq_stack_end = NULL;
- ops->stack(data, "EOI");
- continue;
- }
- }
- break;
- }
+ if (stack < begin || stack >= end)
+ return false;
+
+ info->type = STACK_TYPE_IRQ;
+ info->begin = begin;
+ info->end = end;
/*
- * This handles the process stack:
+ * The next stack pointer is stored at the top of the irq stack
+ * before switching to the irq stack. Actual stack entries are all
+ * below that.
*/
- bp = print_context_stack(tinfo, stack, bp, ops, data, NULL, &graph);
- put_cpu();
+ info->next_sp = (unsigned long *)*(end - 1);
+
+ return true;
}
-EXPORT_SYMBOL(dump_trace);
-void
-show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, unsigned long bp, char *log_lvl)
+bool noinstr get_stack_info_noinstr(unsigned long *stack, struct task_struct *task,
+ struct stack_info *info)
{
- unsigned long *stack;
- int i;
- const int cpu = smp_processor_id();
- unsigned long *irq_stack_end =
- (unsigned long *)(per_cpu(irq_stack_ptr, cpu));
- unsigned long *irq_stack =
- (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE);
+ if (in_task_stack(stack, task, info))
+ return true;
- /*
- * debugging aid: "show_stack(NULL, NULL);" prints the
- * back trace for this cpu.
- */
+ if (task != current)
+ return false;
- if (sp == NULL) {
- if (task)
- sp = (unsigned long *)task->thread.sp;
- else
- sp = (unsigned long *)&sp;
- }
+ if (in_exception_stack(stack, info))
+ return true;
- stack = sp;
- for (i = 0; i < kstack_depth_to_print; i++) {
- if (stack >= irq_stack && stack <= irq_stack_end) {
- if (stack == irq_stack_end) {
- stack = (unsigned long *) (irq_stack_end[-1]);
- printk(" <EOI> ");
- }
- } else {
- if (((long) stack & (THREAD_SIZE-1)) == 0)
- break;
- }
- if (i && ((i % STACKSLOTS_PER_LINE) == 0))
- printk("\n%s", log_lvl);
- printk(" %016lx", *stack++);
- touch_nmi_watchdog();
- }
- printk("\n");
- show_trace_log_lvl(task, regs, sp, bp, log_lvl);
+ if (in_irq_stack(stack, info))
+ return true;
+
+ if (in_entry_stack(stack, info))
+ return true;
+
+ return false;
}
-void show_registers(struct pt_regs *regs)
+int get_stack_info(unsigned long *stack, struct task_struct *task,
+ struct stack_info *info, unsigned long *visit_mask)
{
- int i;
- unsigned long sp;
- const int cpu = smp_processor_id();
- struct task_struct *cur = current;
+ task = task ? : current;
+
+ if (!stack)
+ goto unknown;
- sp = regs->sp;
- printk("CPU %d ", cpu);
- __show_regs(regs, 1);
- printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
- cur->comm, cur->pid, task_thread_info(cur), cur);
+ if (!get_stack_info_noinstr(stack, task, info))
+ goto unknown;
/*
- * When in-kernel, we also print out the stack and code at the
- * time of the fault..
+ * Make sure we don't iterate through any given stack more than once.
+ * If it comes up a second time then there's something wrong going on:
+ * just break out and report an unknown stack type.
*/
- if (!user_mode(regs)) {
- unsigned int code_prologue = code_bytes * 43 / 64;
- unsigned int code_len = code_bytes;
- unsigned char c;
- u8 *ip;
-
- printk(KERN_EMERG "Stack:\n");
- show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
- regs->bp, KERN_EMERG);
-
- printk(KERN_EMERG "Code: ");
-
- ip = (u8 *)regs->ip - code_prologue;
- if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
- /* try starting at IP */
- ip = (u8 *)regs->ip;
- code_len = code_len - code_prologue + 1;
- }
- for (i = 0; i < code_len; i++, ip++) {
- if (ip < (u8 *)PAGE_OFFSET ||
- probe_kernel_address(ip, c)) {
- printk(" Bad RIP value.");
- break;
- }
- if (ip == (u8 *)regs->ip)
- printk("<%02x> ", c);
- else
- printk("%02x ", c);
+ if (visit_mask) {
+ if (*visit_mask & (1UL << info->type)) {
+ if (task == current)
+ printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type);
+ goto unknown;
}
+ *visit_mask |= 1UL << info->type;
}
- printk("\n");
-}
-int is_valid_bugaddr(unsigned long ip)
-{
- unsigned short ud2;
+ return 0;
- if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2)))
- return 0;
-
- return ud2 == 0x0b0f;
+unknown:
+ info->type = STACK_TYPE_UNKNOWN;
+ return -EINVAL;
}
-
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 5cb5725b2bae..b15b97d3cb52 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1,49 +1,69 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * Handle the memory map.
- * The functions here do the job until bootmem takes over.
+ * Low level x86 E820 memory map handling functions.
*
- * Getting sanitize_e820_map() in sync with i386 version by applying change:
- * - Provisions for empty E820 memory regions (reported by certain BIOSes).
- * Alex Achenbach <xela@slit.de>, December 2002.
- * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+ * The firmware and bootloader pass us the "E820 table", which is the primary
+ * physical memory layout description available about x86 systems.
*
+ * The kernel takes the E820 memory layout and optionally modifies it with
+ * quirks and other tweaks, and feeds that into the generic Linux memory
+ * allocation code routines via a platform independent interface (memblock, etc.).
*/
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/bootmem.h>
-#include <linux/ioport.h>
-#include <linux/string.h>
-#include <linux/kexec.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/pfn.h>
+#include <linux/crash_dump.h>
+#include <linux/memblock.h>
#include <linux/suspend.h>
+#include <linux/acpi.h>
#include <linux/firmware-map.h>
+#include <linux/sort.h>
+#include <linux/memory_hotplug.h>
+#include <linux/kvm_types.h>
-#include <asm/pgtable.h>
-#include <asm/page.h>
-#include <asm/e820.h>
-#include <asm/proto.h>
+#include <asm/e820/api.h>
#include <asm/setup.h>
-#include <asm/trampoline.h>
/*
- * The e820 map is the map that gets modified e.g. with command line parameters
- * and that is also registered with modifications in the kernel resource tree
- * with the iomem_resource as parent.
+ * We organize the E820 table into three main data structures:
*
- * The e820_saved is directly saved after the BIOS-provided memory map is
- * copied. It doesn't get modified afterwards. It's registered for the
- * /sys/firmware/memmap interface.
+ * - 'e820_table_firmware': the original firmware version passed to us by the
+ * bootloader - not modified by the kernel. It is composed of two parts:
+ * the first 128 E820 memory entries in boot_params.e820_table and the remaining
+ * (if any) entries of the SETUP_E820_EXT nodes. We use this to:
*
- * That memory map is not modified and is used as base for kexec. The kexec'd
- * kernel should get the same memory map as the firmware provides. Then the
- * user can e.g. boot the original kernel with mem=1G while still booting the
- * next kernel with full memory.
+ * - the hibernation code uses it to generate a kernel-independent CRC32
+ * checksum of the physical memory layout of a system.
+ *
+ * - 'e820_table_kexec': a slightly modified (by the kernel) firmware version
+ * passed to us by the bootloader - the major difference between
+ * e820_table_firmware[] and this one is that e820_table_kexec[]
+ * might be modified by the kexec itself to fake an mptable.
+ * We use this to:
+ *
+ * - kexec, which is a bootloader in disguise, uses the original E820
+ * layout to pass to the kexec-ed kernel. This way the original kernel
+ * can have a restricted E820 map while the kexec()-ed kexec-kernel
+ * can have access to full memory - etc.
+ *
+ * Export the memory layout via /sys/firmware/memmap. kexec-tools uses
+ * the entries to create an E820 table for the kexec kernel.
+ *
+ * kexec_file_load in-kernel code uses the table for the kexec kernel.
+ *
+ * - 'e820_table': this is the main E820 table that is massaged by the
+ * low level x86 platform code, or modified by boot parameters, before
+ * passed on to higher level MM layers.
+ *
+ * Once the E820 map has been converted to the standard Linux memory layout
+ * information its role stops - modifying it has no effect and does not get
+ * re-propagated. So its main role is a temporary bootstrap storage of firmware
+ * specific memory layout data during early bootup.
*/
-struct e820map e820;
-struct e820map e820_saved;
+static struct e820_table e820_table_init __initdata;
+static struct e820_table e820_table_kexec_init __initdata;
+static struct e820_table e820_table_firmware_init __initdata;
+
+struct e820_table *e820_table __refdata = &e820_table_init;
+struct e820_table *e820_table_kexec __refdata = &e820_table_kexec_init;
+struct e820_table *e820_table_firmware __refdata = &e820_table_firmware_init;
/* For PCI or other memory-mapped resources */
unsigned long pci_mem_start = 0xaeedbabe;
@@ -55,141 +75,166 @@ EXPORT_SYMBOL(pci_mem_start);
* This function checks if any part of the range <start,end> is mapped
* with type.
*/
-int
-e820_any_mapped(u64 start, u64 end, unsigned type)
+static bool _e820__mapped_any(struct e820_table *table,
+ u64 start, u64 end, enum e820_type type)
{
int i;
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
+ for (i = 0; i < table->nr_entries; i++) {
+ struct e820_entry *entry = &table->entries[i];
- if (type && ei->type != type)
+ if (type && entry->type != type)
continue;
- if (ei->addr >= end || ei->addr + ei->size <= start)
+ if (entry->addr >= end || entry->addr + entry->size <= start)
continue;
- return 1;
+ return true;
}
- return 0;
+ return false;
+}
+
+bool e820__mapped_raw_any(u64 start, u64 end, enum e820_type type)
+{
+ return _e820__mapped_any(e820_table_firmware, start, end, type);
}
-EXPORT_SYMBOL_GPL(e820_any_mapped);
+EXPORT_SYMBOL_FOR_KVM(e820__mapped_raw_any);
+
+bool e820__mapped_any(u64 start, u64 end, enum e820_type type)
+{
+ return _e820__mapped_any(e820_table, start, end, type);
+}
+EXPORT_SYMBOL_GPL(e820__mapped_any);
/*
- * This function checks if the entire range <start,end> is mapped with type.
+ * This function checks if the entire <start,end> range is mapped with 'type'.
*
- * Note: this function only works correct if the e820 table is sorted and
- * not-overlapping, which is the case
+ * Note: this function only works correctly once the E820 table is sorted and
+ * not-overlapping (at least for the range specified), which is the case normally.
*/
-int __init e820_all_mapped(u64 start, u64 end, unsigned type)
+static struct e820_entry *__e820__mapped_all(u64 start, u64 end,
+ enum e820_type type)
{
int i;
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
+ for (i = 0; i < e820_table->nr_entries; i++) {
+ struct e820_entry *entry = &e820_table->entries[i];
- if (type && ei->type != type)
+ if (type && entry->type != type)
continue;
- /* is the region (part) in overlap with the current region ?*/
- if (ei->addr >= end || ei->addr + ei->size <= start)
+
+ /* Is the region (part) in overlap with the current region? */
+ if (entry->addr >= end || entry->addr + entry->size <= start)
continue;
- /* if the region is at the beginning of <start,end> we move
- * start to the end of the region since it's ok until there
+ /*
+ * If the region is at the beginning of <start,end> we move
+ * 'start' to the end of the region since it's ok until there
*/
- if (ei->addr <= start)
- start = ei->addr + ei->size;
+ if (entry->addr <= start)
+ start = entry->addr + entry->size;
+
/*
- * if start is now at or beyond end, we're done, full
- * coverage
+ * If 'start' is now at or beyond 'end', we're done, full
+ * coverage of the desired range exists:
*/
if (start >= end)
- return 1;
+ return entry;
}
- return 0;
+
+ return NULL;
}
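The full-coverage walk above works by sliding 'start' forward: each entry of the requested type that begins at or below 'start' extends the covered prefix to its end, and the walk succeeds the moment 'start' passes 'end'. A compact sketch over a toy, sorted table (layout and type values are invented):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct entry { uint64_t addr, size; int type; };

/* Assumes the table is sorted and non-overlapping, as the kernel's is. */
static bool mapped_all(const struct entry *tbl, int n,
                       uint64_t start, uint64_t end, int type)
{
        for (int i = 0; i < n; i++) {
                const struct entry *e = &tbl[i];

                if (type && e->type != type)
                        continue;
                if (e->addr >= end || e->addr + e->size <= start)
                        continue;                       /* no overlap with [start, end) */
                if (e->addr <= start)
                        start = e->addr + e->size;      /* coverage now reaches this far */
                if (start >= end)
                        return true;                    /* whole range covered */
        }
        return false;
}

int main(void)
{
        /* RAM (type 1) with a reserved (type 2) hole at 0x9000-0xa000. */
        struct entry tbl[] = {
                { 0x0000, 0x9000, 1 },
                { 0x9000, 0x1000, 2 },
                { 0xa000, 0x6000, 1 },
        };

        printf("0x1000-0x8000 all RAM? %d\n", mapped_all(tbl, 3, 0x1000, 0x8000, 1));
        printf("0x1000-0xb000 all RAM? %d\n", mapped_all(tbl, 3, 0x1000, 0xb000, 1));
        return 0;
}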
/*
- * Add a memory region to the kernel e820 map.
+ * This function checks if the entire range <start,end> is mapped with type.
*/
-static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
- int type)
+bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type)
{
- int x = e820x->nr_map;
+ return __e820__mapped_all(start, end, type);
+}
- if (x == ARRAY_SIZE(e820x->map)) {
- printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
+/*
+ * This function returns the type associated with the range <start,end>.
+ */
+int e820__get_entry_type(u64 start, u64 end)
+{
+ struct e820_entry *entry = __e820__mapped_all(start, end, 0);
+
+ return entry ? entry->type : -EINVAL;
+}
+
+/*
+ * Add a memory region to the kernel E820 map.
+ */
+static void __init __e820__range_add(struct e820_table *table, u64 start, u64 size, enum e820_type type)
+{
+ int x = table->nr_entries;
+
+ if (x >= ARRAY_SIZE(table->entries)) {
+ pr_err("too many entries; ignoring [mem %#010llx-%#010llx]\n",
+ start, start + size - 1);
return;
}
- e820x->map[x].addr = start;
- e820x->map[x].size = size;
- e820x->map[x].type = type;
- e820x->nr_map++;
+ table->entries[x].addr = start;
+ table->entries[x].size = size;
+ table->entries[x].type = type;
+ table->nr_entries++;
}
-void __init e820_add_region(u64 start, u64 size, int type)
+void __init e820__range_add(u64 start, u64 size, enum e820_type type)
{
- __e820_add_region(&e820, start, size, type);
+ __e820__range_add(e820_table, start, size, type);
}
-static void __init e820_print_type(u32 type)
+static void __init e820_print_type(enum e820_type type)
{
switch (type) {
- case E820_RAM:
- case E820_RESERVED_KERN:
- printk(KERN_CONT "(usable)");
- break;
- case E820_RESERVED:
- printk(KERN_CONT "(reserved)");
- break;
- case E820_ACPI:
- printk(KERN_CONT "(ACPI data)");
- break;
- case E820_NVS:
- printk(KERN_CONT "(ACPI NVS)");
- break;
- case E820_UNUSABLE:
- printk(KERN_CONT "(unusable)");
- break;
- default:
- printk(KERN_CONT "type %u", type);
- break;
+ case E820_TYPE_RAM: pr_cont("usable"); break;
+ case E820_TYPE_RESERVED: pr_cont("reserved"); break;
+ case E820_TYPE_SOFT_RESERVED: pr_cont("soft reserved"); break;
+ case E820_TYPE_ACPI: pr_cont("ACPI data"); break;
+ case E820_TYPE_NVS: pr_cont("ACPI NVS"); break;
+ case E820_TYPE_UNUSABLE: pr_cont("unusable"); break;
+ case E820_TYPE_PMEM: /* Fall through: */
+ case E820_TYPE_PRAM: pr_cont("persistent (type %u)", type); break;
+ default: pr_cont("type %u", type); break;
}
}
-void __init e820_print_map(char *who)
+void __init e820__print_table(char *who)
{
int i;
- for (i = 0; i < e820.nr_map; i++) {
- printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
- (unsigned long long) e820.map[i].addr,
- (unsigned long long)
- (e820.map[i].addr + e820.map[i].size));
- e820_print_type(e820.map[i].type);
- printk(KERN_CONT "\n");
+ for (i = 0; i < e820_table->nr_entries; i++) {
+ pr_info("%s: [mem %#018Lx-%#018Lx] ",
+ who,
+ e820_table->entries[i].addr,
+ e820_table->entries[i].addr + e820_table->entries[i].size - 1);
+
+ e820_print_type(e820_table->entries[i].type);
+ pr_cont("\n");
}
}
/*
- * Sanitize the BIOS e820 map.
+ * Sanitize an E820 map.
*
- * Some e820 responses include overlapping entries. The following
- * replaces the original e820 map with a new one, removing overlaps,
+ * Some E820 layouts include overlapping entries. The following
+ * replaces the original E820 map with a new one, removing overlaps,
* and resolving conflicting memory types in favor of highest
* numbered type.
*
- * The input parameter biosmap points to an array of 'struct
- * e820entry' which on entry has elements in the range [0, *pnr_map)
- * valid, and which has space for up to max_nr_map entries.
- * On return, the resulting sanitized e820 map entries will be in
- * overwritten in the same location, starting at biosmap.
+ * The input parameter 'entries' points to an array of 'struct
+ * e820_entry' which on entry has elements in the range [0, *nr_entries)
+ * valid, and which has space for up to max_nr_entries entries.
+ * On return, the resulting sanitized E820 map entries will be in
+ * overwritten in the same location, starting at 'entries'.
*
- * The integer pointed to by pnr_map must be valid on entry (the
- * current number of valid entries located at biosmap) and will
- * be updated on return, with the new number of valid entries
- * (something no more than max_nr_map.)
+ * The integer pointed to by nr_entries must be valid on entry (the
+ * current number of valid entries located at 'entries'). If the
+ * sanitizing succeeds the *nr_entries will be updated with the new
+ * number of valid entries (something no more than max_nr_entries).
*
- * The return value from sanitize_e820_map() is zero if it
+ * The return value from e820__update_table() is zero if it
* successfully 'sanitized' the map entries passed in, and is -1
* if it did nothing, which can happen if either of (1) it was
* only passed one map entry, or (2) any of the input map entries
@@ -231,188 +276,173 @@ void __init e820_print_map(char *who)
* ____________________33__
* ______________________4_
*/
+struct change_member {
+ /* Pointer to the original entry: */
+ struct e820_entry *entry;
+ /* Address for this change point: */
+ unsigned long long addr;
+};
+
+static struct change_member change_point_list[2*E820_MAX_ENTRIES] __initdata;
+static struct change_member *change_point[2*E820_MAX_ENTRIES] __initdata;
+static struct e820_entry *overlap_list[E820_MAX_ENTRIES] __initdata;
+static struct e820_entry new_entries[E820_MAX_ENTRIES] __initdata;
+
+static int __init cpcompare(const void *a, const void *b)
+{
+ struct change_member * const *app = a, * const *bpp = b;
+ const struct change_member *ap = *app, *bp = *bpp;
+
+ /*
+ * Inputs are pointers to two elements of change_point[]. If their
+ * addresses are not equal, their difference dominates. If the addresses
+ * are equal, then consider one that represents the end of its region
+ * to be greater than one that does not.
+ */
+ if (ap->addr != bp->addr)
+ return ap->addr > bp->addr ? 1 : -1;
+
+ return (ap->addr != ap->entry->addr) - (bp->addr != bp->entry->addr);
+}
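cpcompare() above encodes one deliberate tie-break for the sort() call that follows: change points are ordered by address, and when two share an address, an end-of-region point compares greater than a start-of-region point, so starts are processed first. A userspace sketch of the comparator driving qsort() over an array of pointers, the same shape the kernel sorts (region contents are invented):

#include <stdio.h>
#include <stdlib.h>

struct region { unsigned long long addr, size; };
struct change_point { struct region *entry; unsigned long long addr; };

static int cpcompare(const void *a, const void *b)
{
        const struct change_point *ap = *(struct change_point * const *)a;
        const struct change_point *bp = *(struct change_point * const *)b;

        if (ap->addr != bp->addr)
                return ap->addr > bp->addr ? 1 : -1;

        /* Equal addresses: an end point (addr != entry->addr) sorts last. */
        return (ap->addr != ap->entry->addr) - (bp->addr != bp->entry->addr);
}

int main(void)
{
        struct region r1 = { 0x1000, 0x1000 };          /* [0x1000, 0x2000) */
        struct region r2 = { 0x2000, 0x1000 };          /* [0x2000, 0x3000) */
        struct change_point cps[] = {
                { &r2, r2.addr + r2.size },             /* end of r2   */
                { &r1, r1.addr + r1.size },             /* end of r1   */
                { &r2, r2.addr },                       /* start of r2 */
                { &r1, r1.addr },                       /* start of r1 */
        };
        struct change_point *order[] = { &cps[0], &cps[1], &cps[2], &cps[3] };

        qsort(order, 4, sizeof(order[0]), cpcompare);

        for (int i = 0; i < 4; i++)
                printf("%#llx %s\n", order[i]->addr,
                       order[i]->addr == order[i]->entry->addr ? "start" : "end");
        return 0;
}

The adjacent regions above sort as 0x1000 start, 0x2000 start, 0x2000 end, 0x3000 end, so the sweep in e820__update_table() sees the second region begin before the first one ends and keeps adjacent regions of the same type merged rather than splitting them at the shared boundary.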
-int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
- u32 *pnr_map)
+static bool e820_nomerge(enum e820_type type)
{
- struct change_member {
- struct e820entry *pbios; /* pointer to original bios entry */
- unsigned long long addr; /* address for this change point */
- };
- static struct change_member change_point_list[2*E820_X_MAX] __initdata;
- static struct change_member *change_point[2*E820_X_MAX] __initdata;
- static struct e820entry *overlap_list[E820_X_MAX] __initdata;
- static struct e820entry new_bios[E820_X_MAX] __initdata;
- struct change_member *change_tmp;
- unsigned long current_type, last_type;
+ /*
+ * These types may indicate distinct platform ranges aligned to
+ * numa node, protection domain, performance domain, or other
+ * boundaries. Do not merge them.
+ */
+ if (type == E820_TYPE_PRAM)
+ return true;
+ if (type == E820_TYPE_SOFT_RESERVED)
+ return true;
+ return false;
+}
+
+int __init e820__update_table(struct e820_table *table)
+{
+ struct e820_entry *entries = table->entries;
+ u32 max_nr_entries = ARRAY_SIZE(table->entries);
+ enum e820_type current_type, last_type;
unsigned long long last_addr;
- int chgidx, still_changing;
- int overlap_entries;
- int new_bios_entry;
- int old_nr, new_nr, chg_nr;
- int i;
+ u32 new_nr_entries, overlap_entries;
+ u32 i, chg_idx, chg_nr;
- /* if there's only one memory region, don't bother */
- if (*pnr_map < 2)
+ /* If there's only one memory region, don't bother: */
+ if (table->nr_entries < 2)
return -1;
- old_nr = *pnr_map;
- BUG_ON(old_nr > max_nr_map);
+ BUG_ON(table->nr_entries > max_nr_entries);
- /* bail out if we find any unreasonable addresses in bios map */
- for (i = 0; i < old_nr; i++)
- if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
+ /* Bail out if we find any unreasonable addresses in the map: */
+ for (i = 0; i < table->nr_entries; i++) {
+ if (entries[i].addr + entries[i].size < entries[i].addr)
return -1;
+ }
- /* create pointers for initial change-point information (for sorting) */
- for (i = 0; i < 2 * old_nr; i++)
+ /* Create pointers for initial change-point information (for sorting): */
+ for (i = 0; i < 2 * table->nr_entries; i++)
change_point[i] = &change_point_list[i];
- /* record all known change-points (starting and ending addresses),
- omitting those that are for empty memory regions */
- chgidx = 0;
- for (i = 0; i < old_nr; i++) {
- if (biosmap[i].size != 0) {
- change_point[chgidx]->addr = biosmap[i].addr;
- change_point[chgidx++]->pbios = &biosmap[i];
- change_point[chgidx]->addr = biosmap[i].addr +
- biosmap[i].size;
- change_point[chgidx++]->pbios = &biosmap[i];
- }
- }
- chg_nr = chgidx;
-
- /* sort change-point list by memory addresses (low -> high) */
- still_changing = 1;
- while (still_changing) {
- still_changing = 0;
- for (i = 1; i < chg_nr; i++) {
- unsigned long long curaddr, lastaddr;
- unsigned long long curpbaddr, lastpbaddr;
-
- curaddr = change_point[i]->addr;
- lastaddr = change_point[i - 1]->addr;
- curpbaddr = change_point[i]->pbios->addr;
- lastpbaddr = change_point[i - 1]->pbios->addr;
-
- /*
- * swap entries, when:
- *
- * curaddr > lastaddr or
- * curaddr == lastaddr and curaddr == curpbaddr and
- * lastaddr != lastpbaddr
- */
- if (curaddr < lastaddr ||
- (curaddr == lastaddr && curaddr == curpbaddr &&
- lastaddr != lastpbaddr)) {
- change_tmp = change_point[i];
- change_point[i] = change_point[i-1];
- change_point[i-1] = change_tmp;
- still_changing = 1;
- }
+ /*
+ * Record all known change-points (starting and ending addresses),
+ * omitting empty memory regions:
+ */
+ chg_idx = 0;
+ for (i = 0; i < table->nr_entries; i++) {
+ if (entries[i].size != 0) {
+ change_point[chg_idx]->addr = entries[i].addr;
+ change_point[chg_idx++]->entry = &entries[i];
+ change_point[chg_idx]->addr = entries[i].addr + entries[i].size;
+ change_point[chg_idx++]->entry = &entries[i];
}
}
-
- /* create a new bios memory map, removing overlaps */
- overlap_entries = 0; /* number of entries in the overlap table */
- new_bios_entry = 0; /* index for creating new bios map entries */
- last_type = 0; /* start with undefined memory type */
- last_addr = 0; /* start with 0 as last starting address */
-
- /* loop through change-points, determining affect on the new bios map */
- for (chgidx = 0; chgidx < chg_nr; chgidx++) {
- /* keep track of all overlapping bios entries */
- if (change_point[chgidx]->addr ==
- change_point[chgidx]->pbios->addr) {
- /*
- * add map entry to overlap list (> 1 entry
- * implies an overlap)
- */
- overlap_list[overlap_entries++] =
- change_point[chgidx]->pbios;
+ chg_nr = chg_idx;
+
+ /* Sort change-point list by memory addresses (low -> high): */
+ sort(change_point, chg_nr, sizeof(*change_point), cpcompare, NULL);
+
+ /* Create a new memory map, removing overlaps: */
+ overlap_entries = 0; /* Number of entries in the overlap table */
+ new_nr_entries = 0; /* Index for creating new map entries */
+ last_type = 0; /* Start with undefined memory type */
+ last_addr = 0; /* Start with 0 as last starting address */
+
+ /* Loop through change-points, determining effect on the new map: */
+ for (chg_idx = 0; chg_idx < chg_nr; chg_idx++) {
+ /* Keep track of all overlapping entries */
+ if (change_point[chg_idx]->addr == change_point[chg_idx]->entry->addr) {
+ /* Add map entry to overlap list (> 1 entry implies an overlap) */
+ overlap_list[overlap_entries++] = change_point[chg_idx]->entry;
} else {
- /*
- * remove entry from list (order independent,
- * so swap with last)
- */
+ /* Remove entry from list (order independent, so swap with last): */
for (i = 0; i < overlap_entries; i++) {
- if (overlap_list[i] ==
- change_point[chgidx]->pbios)
- overlap_list[i] =
- overlap_list[overlap_entries-1];
+ if (overlap_list[i] == change_point[chg_idx]->entry)
+ overlap_list[i] = overlap_list[overlap_entries-1];
}
overlap_entries--;
}
/*
- * if there are overlapping entries, decide which
+ * If there are overlapping entries, decide which
* "type" to use (larger value takes precedence --
* 1=usable, 2,3,4,4+=unusable)
*/
current_type = 0;
- for (i = 0; i < overlap_entries; i++)
+ for (i = 0; i < overlap_entries; i++) {
if (overlap_list[i]->type > current_type)
current_type = overlap_list[i]->type;
- /*
- * continue building up new bios map based on this
- * information
- */
- if (current_type != last_type) {
- if (last_type != 0) {
- new_bios[new_bios_entry].size =
- change_point[chgidx]->addr - last_addr;
- /*
- * move forward only if the new size
- * was non-zero
- */
- if (new_bios[new_bios_entry].size != 0)
- /*
- * no more space left for new
- * bios entries ?
- */
- if (++new_bios_entry >= max_nr_map)
+ }
+
+ /* Continue building up new map based on this information: */
+ if (current_type != last_type || e820_nomerge(current_type)) {
+ if (last_type) {
+ new_entries[new_nr_entries].size = change_point[chg_idx]->addr - last_addr;
+ /* Move forward only if the new size was non-zero: */
+ if (new_entries[new_nr_entries].size != 0)
+ /* No more space left for new entries? */
+ if (++new_nr_entries >= max_nr_entries)
break;
}
- if (current_type != 0) {
- new_bios[new_bios_entry].addr =
- change_point[chgidx]->addr;
- new_bios[new_bios_entry].type = current_type;
- last_addr = change_point[chgidx]->addr;
+ if (current_type) {
+ new_entries[new_nr_entries].addr = change_point[chg_idx]->addr;
+ new_entries[new_nr_entries].type = current_type;
+ last_addr = change_point[chg_idx]->addr;
}
last_type = current_type;
}
}
- /* retain count for new bios entries */
- new_nr = new_bios_entry;
- /* copy new bios mapping into original location */
- memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
- *pnr_map = new_nr;
+ /* Copy the new entries into the original location: */
+ memcpy(entries, new_entries, new_nr_entries*sizeof(*entries));
+ table->nr_entries = new_nr_entries;
return 0;
}
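e820__update_table() above is a classic sweep: two change points per entry, sorted, then walked while an overlap list tracks which entries cover the current address; the highest type present wins, and a new output entry starts whenever that winning type changes. Below is a stripped-down userspace re-implementation of the same sweep (no nomerge types, no overflow or capacity checks; the table contents are invented) to show the mechanics end to end:

#include <stdio.h>
#include <stdlib.h>

#define MAX_ENTRIES 8

struct entry { unsigned long long addr, size; unsigned type; };
struct cpoint { struct entry *entry; unsigned long long addr; };

static int cpcompare(const void *a, const void *b)
{
        const struct cpoint *ap = a, *bp = b;

        if (ap->addr != bp->addr)
                return ap->addr > bp->addr ? 1 : -1;
        return (ap->addr != ap->entry->addr) - (bp->addr != bp->entry->addr);
}

static unsigned merge(struct entry *in, unsigned nr, struct entry *out)
{
        struct cpoint cp[2 * MAX_ENTRIES];
        struct entry *overlap[MAX_ENTRIES];
        unsigned nr_cp = 0, nr_overlap = 0, nr_out = 0, last_type = 0;
        unsigned long long last_addr = 0;

        /* Two change points per entry: where it starts and where it ends. */
        for (unsigned i = 0; i < nr; i++) {
                cp[nr_cp].addr = in[i].addr;
                cp[nr_cp++].entry = &in[i];
                cp[nr_cp].addr = in[i].addr + in[i].size;
                cp[nr_cp++].entry = &in[i];
        }
        qsort(cp, nr_cp, sizeof(cp[0]), cpcompare);

        for (unsigned i = 0; i < nr_cp; i++) {
                unsigned cur_type = 0;

                if (cp[i].addr == cp[i].entry->addr) {
                        overlap[nr_overlap++] = cp[i].entry;    /* region starts here */
                } else {
                        for (unsigned j = 0; j < nr_overlap; j++) {
                                if (overlap[j] == cp[i].entry) {        /* region ends here */
                                        overlap[j] = overlap[--nr_overlap];
                                        break;
                                }
                        }
                }

                /* The highest type among the overlapping regions wins. */
                for (unsigned j = 0; j < nr_overlap; j++)
                        if (overlap[j]->type > cur_type)
                                cur_type = overlap[j]->type;

                if (cur_type == last_type)
                        continue;
                if (last_type) {                        /* close the previous output entry */
                        out[nr_out].size = cp[i].addr - last_addr;
                        if (out[nr_out].size)
                                nr_out++;
                }
                if (cur_type) {                         /* open a new output entry */
                        out[nr_out].addr = cp[i].addr;
                        out[nr_out].type = cur_type;
                        last_addr = cp[i].addr;
                }
                last_type = cur_type;
        }
        return nr_out;
}

int main(void)
{
        struct entry in[] = {
                { 0x0000, 0x4000, 1 },  /* RAM      [0x0000-0x4000)           */
                { 0x3000, 0x1000, 2 },  /* reserved [0x3000-0x4000), overlaps */
                { 0x4000, 0x2000, 1 },  /* RAM      [0x4000-0x6000)           */
        };
        struct entry out[MAX_ENTRIES];
        unsigned n = merge(in, 3, out);

        for (unsigned i = 0; i < n; i++)
                printf("[mem %#llx-%#llx] type %u\n",
                       out[i].addr, out[i].addr + out[i].size - 1, out[i].type);
        return 0;
}

The reserved entry overrides the RAM it overlaps, so the output is three clean, non-overlapping entries: usable up to 0x3000, reserved to 0x4000, usable again to 0x6000.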
-static int __init __append_e820_map(struct e820entry *biosmap, int nr_map)
+static int __init __append_e820_table(struct boot_e820_entry *entries, u32 nr_entries)
{
- while (nr_map) {
- u64 start = biosmap->addr;
- u64 size = biosmap->size;
- u64 end = start + size;
- u32 type = biosmap->type;
-
- /* Overflow in 64 bits? Ignore the memory map. */
- if (start > end)
+ struct boot_e820_entry *entry = entries;
+
+ while (nr_entries) {
+ u64 start = entry->addr;
+ u64 size = entry->size;
+ u64 end = start + size - 1;
+ u32 type = entry->type;
+
+ /* Ignore the entry on 64-bit overflow: */
+ if (start > end && likely(size))
return -1;
- e820_add_region(start, size, type);
+ e820__range_add(start, size, type);
- biosmap++;
- nr_map--;
+ entry++;
+ nr_entries--;
}
return 0;
}
/*
- * Copy the BIOS e820 map into a safe place.
+ * Copy the BIOS E820 map into a safe place.
*
* Sanity-check it while we're at it..
*
@@ -420,18 +450,17 @@ static int __init __append_e820_map(struct e820entry *biosmap, int nr_map)
* will have given us a memory map that we can use to properly
* set up memory. If we aren't, we'll fake a memory map.
*/
-static int __init append_e820_map(struct e820entry *biosmap, int nr_map)
+static int __init append_e820_table(struct boot_e820_entry *entries, u32 nr_entries)
{
/* Only one memory region (or negative)? Ignore it */
- if (nr_map < 2)
+ if (nr_entries < 2)
return -1;
- return __append_e820_map(biosmap, nr_map);
+ return __append_e820_table(entries, nr_entries);
}
-static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
- u64 size, unsigned old_type,
- unsigned new_type)
+static u64 __init
+__e820__range_update(struct e820_table *table, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type)
{
u64 end;
unsigned int i;
@@ -443,156 +472,166 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
size = ULLONG_MAX - start;
end = start + size;
- printk(KERN_DEBUG "e820 update range: %016Lx - %016Lx ",
- (unsigned long long) start,
- (unsigned long long) end);
+ printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx] ", start, end - 1);
e820_print_type(old_type);
- printk(KERN_CONT " ==> ");
+ pr_cont(" ==> ");
e820_print_type(new_type);
- printk(KERN_CONT "\n");
+ pr_cont("\n");
- for (i = 0; i < e820x->nr_map; i++) {
- struct e820entry *ei = &e820x->map[i];
+ for (i = 0; i < table->nr_entries; i++) {
+ struct e820_entry *entry = &table->entries[i];
u64 final_start, final_end;
- u64 ei_end;
+ u64 entry_end;
- if (ei->type != old_type)
+ if (entry->type != old_type)
continue;
- ei_end = ei->addr + ei->size;
- /* totally covered by new range? */
- if (ei->addr >= start && ei_end <= end) {
- ei->type = new_type;
- real_updated_size += ei->size;
+ entry_end = entry->addr + entry->size;
+
+ /* Completely covered by new range? */
+ if (entry->addr >= start && entry_end <= end) {
+ entry->type = new_type;
+ real_updated_size += entry->size;
continue;
}
- /* new range is totally covered? */
- if (ei->addr < start && ei_end > end) {
- __e820_add_region(e820x, start, size, new_type);
- __e820_add_region(e820x, end, ei_end - end, ei->type);
- ei->size = start - ei->addr;
+ /* New range is completely covered? */
+ if (entry->addr < start && entry_end > end) {
+ __e820__range_add(table, start, size, new_type);
+ __e820__range_add(table, end, entry_end - end, entry->type);
+ entry->size = start - entry->addr;
real_updated_size += size;
continue;
}
- /* partially covered */
- final_start = max(start, ei->addr);
- final_end = min(end, ei_end);
+ /* Partially covered: */
+ final_start = max(start, entry->addr);
+ final_end = min(end, entry_end);
if (final_start >= final_end)
continue;
- __e820_add_region(e820x, final_start, final_end - final_start,
- new_type);
+ __e820__range_add(table, final_start, final_end - final_start, new_type);
real_updated_size += final_end - final_start;
/*
- * left range could be head or tail, so need to update
- * size at first.
+ * Left range could be head or tail, so need to update
+ * its size first:
*/
- ei->size -= final_end - final_start;
- if (ei->addr < final_start)
+ entry->size -= final_end - final_start;
+ if (entry->addr < final_start)
continue;
- ei->addr = final_end;
+
+ entry->addr = final_end;
}
return real_updated_size;
}
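The update loop above distinguishes exactly three overlap cases per entry: the entry is completely inside the update range (retype it whole), the update range is completely inside the entry (split the entry in two), or the two merely overlap at one end (retype the overlapping part and trim the entry). A tiny classifier sketch of those cases (the entry and ranges are invented):

#include <stdio.h>

struct entry { unsigned long long addr, size; };

static const char *classify(const struct entry *e,
                            unsigned long long start, unsigned long long end)
{
        unsigned long long e_end = e->addr + e->size;

        if (e->addr >= end || e_end <= start)
                return "no overlap";
        if (e->addr >= start && e_end <= end)
                return "entry completely covered: retype it in place";
        if (e->addr < start && e_end > end)
                return "update range inside the entry: split the entry";
        return "partial overlap: retype the overlap, trim the entry";
}

int main(void)
{
        struct entry e = { 0x2000, 0x4000 };            /* [0x2000, 0x6000) */

        printf("%s\n", classify(&e, 0x1000, 0x7000));
        printf("%s\n", classify(&e, 0x3000, 0x4000));
        printf("%s\n", classify(&e, 0x5000, 0x9000));
        printf("%s\n", classify(&e, 0x8000, 0x9000));
        return 0;
}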
-u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
- unsigned new_type)
+u64 __init e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type)
{
- return __e820_update_range(&e820, start, size, old_type, new_type);
+ return __e820__range_update(e820_table, start, size, old_type, new_type);
}
-static u64 __init e820_update_range_saved(u64 start, u64 size,
- unsigned old_type, unsigned new_type)
+u64 __init e820__range_update_table(struct e820_table *t, u64 start, u64 size,
+ enum e820_type old_type, enum e820_type new_type)
{
- return __e820_update_range(&e820_saved, start, size, old_type,
- new_type);
+ return __e820__range_update(t, start, size, old_type, new_type);
}
-/* make e820 not cover the range */
-u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
- int checktype)
+/* Remove a range of memory from the E820 table: */
+u64 __init e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type)
{
int i;
+ u64 end;
u64 real_removed_size = 0;
if (size > (ULLONG_MAX - start))
size = ULLONG_MAX - start;
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
+ end = start + size;
+ printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx] ", start, end - 1);
+ if (check_type)
+ e820_print_type(old_type);
+ pr_cont("\n");
+
+ for (i = 0; i < e820_table->nr_entries; i++) {
+ struct e820_entry *entry = &e820_table->entries[i];
u64 final_start, final_end;
+ u64 entry_end;
- if (checktype && ei->type != old_type)
+ if (check_type && entry->type != old_type)
continue;
- /* totally covered? */
- if (ei->addr >= start &&
- (ei->addr + ei->size) <= (start + size)) {
- real_removed_size += ei->size;
- memset(ei, 0, sizeof(struct e820entry));
+
+ entry_end = entry->addr + entry->size;
+
+ /* Completely covered? */
+ if (entry->addr >= start && entry_end <= end) {
+ real_removed_size += entry->size;
+ memset(entry, 0, sizeof(*entry));
continue;
}
- /* partially covered */
- final_start = max(start, ei->addr);
- final_end = min(start + size, ei->addr + ei->size);
+
+ /* Is the new range completely covered? */
+ if (entry->addr < start && entry_end > end) {
+ e820__range_add(end, entry_end - end, entry->type);
+ entry->size = start - entry->addr;
+ real_removed_size += size;
+ continue;
+ }
+
+ /* Partially covered: */
+ final_start = max(start, entry->addr);
+ final_end = min(end, entry_end);
if (final_start >= final_end)
continue;
+
real_removed_size += final_end - final_start;
- ei->size -= final_end - final_start;
- if (ei->addr < final_start)
+ /*
+ * Left range could be head or tail, so need to update
+ * the size first:
+ */
+ entry->size -= final_end - final_start;
+ if (entry->addr < final_start)
continue;
- ei->addr = final_end;
+
+ entry->addr = final_end;
}
return real_removed_size;
}
-void __init update_e820(void)
+void __init e820__update_table_print(void)
{
- u32 nr_map;
-
- nr_map = e820.nr_map;
- if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map))
+ if (e820__update_table(e820_table))
return;
- e820.nr_map = nr_map;
- printk(KERN_INFO "modified physical RAM map:\n");
- e820_print_map("modified");
+
+ pr_info("modified physical RAM map:\n");
+ e820__print_table("modified");
}
-static void __init update_e820_saved(void)
-{
- u32 nr_map;
- nr_map = e820_saved.nr_map;
- if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map))
- return;
- e820_saved.nr_map = nr_map;
+static void __init e820__update_table_kexec(void)
+{
+ e820__update_table(e820_table_kexec);
}
+
#define MAX_GAP_END 0x100000000ull
+
/*
- * Search for a gap in the e820 memory space from start_addr to end_addr.
+ * Search for a gap in the E820 memory space from 0 to MAX_GAP_END (4GB).
*/
-__init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
- unsigned long start_addr, unsigned long long end_addr)
+static int __init e820_search_gap(unsigned long *gapstart, unsigned long *gapsize)
{
- unsigned long long last;
- int i = e820.nr_map;
+ unsigned long long last = MAX_GAP_END;
+ int i = e820_table->nr_entries;
int found = 0;
- last = (end_addr && end_addr < MAX_GAP_END) ? end_addr : MAX_GAP_END;
-
while (--i >= 0) {
- unsigned long long start = e820.map[i].addr;
- unsigned long long end = start + e820.map[i].size;
-
- if (end < start_addr)
- continue;
+ unsigned long long start = e820_table->entries[i].addr;
+ unsigned long long end = start + e820_table->entries[i].size;
/*
* Since "last" is at most 4GB, we know we'll
- * fit in 32 bits if this condition is true
+ * fit in 32 bits if this condition is true:
*/
if (last > end) {
unsigned long gap = last - end;
@@ -610,488 +649,168 @@ __init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
}
/*
- * Search for the biggest gap in the low 32 bits of the e820
- * memory space. We pass this space to PCI to assign MMIO resources
- * for hotplug or unconfigured devices in.
+ * Search for the biggest gap in the low 32 bits of the E820
+ * memory space. We pass this space to the PCI subsystem, so
+ * that it can assign MMIO resources for hotplug or
+ * unconfigured devices in.
+ *
 * Hopefully the BIOS left enough space.
*/
-__init void e820_setup_gap(void)
+__init void e820__setup_pci_gap(void)
{
unsigned long gapstart, gapsize;
int found;
- gapstart = 0x10000000;
gapsize = 0x400000;
- found = e820_search_gap(&gapstart, &gapsize, 0, MAX_GAP_END);
+ found = e820_search_gap(&gapstart, &gapsize);
-#ifdef CONFIG_X86_64
if (!found) {
+#ifdef CONFIG_X86_64
gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024;
- printk(KERN_ERR
- "PCI: Warning: Cannot find a gap in the 32bit address range\n"
- "PCI: Unassigned devices with 32bit resource registers may break!\n");
- }
+ pr_err("Cannot find an available gap in the 32-bit address range\n");
+ pr_err("PCI devices with unassigned 32-bit BARs may not work!\n");
+#else
+ gapstart = 0x10000000;
#endif
+ }
/*
- * e820_reserve_resources_late protect stolen RAM already
+ * e820__reserve_resources_late() protects stolen RAM already:
*/
pci_mem_start = gapstart;
- printk(KERN_INFO
- "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
- pci_mem_start, gapstart, gapsize);
-}
-
-/**
- * Because of the size limitation of struct boot_params, only first
- * 128 E820 memory entries are passed to kernel via
- * boot_params.e820_map, others are passed via SETUP_E820_EXT node of
- * linked list of struct setup_data, which is parsed here.
- */
-void __init parse_e820_ext(struct setup_data *sdata, unsigned long pa_data)
-{
- u32 map_len;
- int entries;
- struct e820entry *extmap;
-
- entries = sdata->len / sizeof(struct e820entry);
- map_len = sdata->len + sizeof(struct setup_data);
- if (map_len > PAGE_SIZE)
- sdata = early_ioremap(pa_data, map_len);
- extmap = (struct e820entry *)(sdata->data);
- __append_e820_map(extmap, entries);
- sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
- if (map_len > PAGE_SIZE)
- early_iounmap(sdata, map_len);
- printk(KERN_INFO "extended physical RAM map:\n");
- e820_print_map("extended");
+ pr_info("[mem %#010lx-%#010lx] available for PCI devices\n",
+ gapstart, gapstart + gapsize - 1);
}
-#if defined(CONFIG_X86_64) || \
- (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
-/**
- * Find the ranges of physical addresses that do not correspond to
- * e820 RAM areas and mark the corresponding pages as nosave for
- * hibernation (32 bit) or software suspend and suspend to RAM (64 bit).
- *
- * This function requires the e820 map to be sorted and without any
- * overlapping entries and assumes the first e820 area to be RAM.
- */
-void __init e820_mark_nosave_regions(unsigned long limit_pfn)
-{
- int i;
- unsigned long pfn;
-
- pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
- for (i = 1; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
-
- if (pfn < PFN_UP(ei->addr))
- register_nosave_region(pfn, PFN_UP(ei->addr));
-
- pfn = PFN_DOWN(ei->addr + ei->size);
- if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
- register_nosave_region(PFN_UP(ei->addr), pfn);
-
- if (pfn >= limit_pfn)
- break;
- }
-}
-#endif
-
-#ifdef CONFIG_HIBERNATION
-/**
- * Mark ACPI NVS memory region, so that we can save/restore it during
- * hibernation and the subsequent resume.
- */
-static int __init e820_mark_nvs_memory(void)
-{
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
-
- if (ei->type == E820_NVS)
- hibernate_nvs_register(ei->addr, ei->size);
- }
-
- return 0;
-}
-core_initcall(e820_mark_nvs_memory);
-#endif
-
/*
- * Early reserved memory areas.
- */
-#define MAX_EARLY_RES 20
-
-struct early_res {
- u64 start, end;
- char name[16];
- char overlap_ok;
-};
-static struct early_res early_res[MAX_EARLY_RES] __initdata = {
- { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
- {}
-};
-
-static int __init find_overlapped_early(u64 start, u64 end)
-{
- int i;
- struct early_res *r;
-
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
- r = &early_res[i];
- if (end > r->start && start < r->end)
- break;
- }
-
- return i;
-}
-
-/*
- * Drop the i-th range from the early reservation map,
- * by copying any higher ranges down one over it, and
- * clearing what had been the last slot.
+ * Called late during init, in free_initmem().
+ *
+ * The initial e820_table, e820_table_kexec and e820_table_firmware are
+ * largish __initdata arrays.
+ *
+ * Copy them to a (usually much smaller) dynamically allocated area that is
+ * sized precisely after the number of e820 entries.
+ *
+ * This is done after we've performed all the fixes and tweaks to the tables.
+ * All functions which modify them are __init functions, which won't exist
+ * after free_initmem().
*/
-static void __init drop_range(int i)
+__init void e820__reallocate_tables(void)
{
- int j;
-
- for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
- ;
-
- memmove(&early_res[i], &early_res[i + 1],
- (j - 1 - i) * sizeof(struct early_res));
-
- early_res[j - 1].end = 0;
+ struct e820_table *n;
+ int size;
+
+ size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table->nr_entries;
+ n = kmemdup(e820_table, size, GFP_KERNEL);
+ BUG_ON(!n);
+ e820_table = n;
+
+ size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_kexec->nr_entries;
+ n = kmemdup(e820_table_kexec, size, GFP_KERNEL);
+ BUG_ON(!n);
+ e820_table_kexec = n;
+
+ size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_firmware->nr_entries;
+ n = kmemdup(e820_table_firmware, size, GFP_KERNEL);
+ BUG_ON(!n);
+ e820_table_firmware = n;
}
/*
- * Split any existing ranges that:
- * 1) are marked 'overlap_ok', and
- * 2) overlap with the stated range [start, end)
- * into whatever portion (if any) of the existing range is entirely
- * below or entirely above the stated range. Drop the portion
- * of the existing range that overlaps with the stated range,
- * which will allow the caller of this routine to then add that
- * stated range without conflicting with any existing range.
+ * Because of the small fixed size of struct boot_params, only the first
+ * 128 E820 memory entries are passed to the kernel via boot_params.e820_table;
+ * the remaining entries (if any) are passed via the SETUP_E820_EXT node of
+ * struct setup_data, which is parsed here.
*/
-static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
+void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len)
{
- int i;
- struct early_res *r;
- u64 lower_start, lower_end;
- u64 upper_start, upper_end;
- char name[16];
-
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
- r = &early_res[i];
-
- /* Continue past non-overlapping ranges */
- if (end <= r->start || start >= r->end)
- continue;
-
- /*
- * Leave non-ok overlaps as is; let caller
- * panic "Overlapping early reservations"
- * when it hits this overlap.
- */
- if (!r->overlap_ok)
- return;
-
- /*
- * We have an ok overlap. We will drop it from the early
- * reservation map, and add back in any non-overlapping
- * portions (lower or upper) as separate, overlap_ok,
- * non-overlapping ranges.
- */
+ int entries;
+ struct boot_e820_entry *extmap;
+ struct setup_data *sdata;
- /* 1. Note any non-overlapping (lower or upper) ranges. */
- strncpy(name, r->name, sizeof(name) - 1);
+ sdata = early_memremap(phys_addr, data_len);
+ entries = sdata->len / sizeof(*extmap);
+ extmap = (struct boot_e820_entry *)(sdata->data);
- lower_start = lower_end = 0;
- upper_start = upper_end = 0;
- if (r->start < start) {
- lower_start = r->start;
- lower_end = start;
- }
- if (r->end > end) {
- upper_start = end;
- upper_end = r->end;
- }
+ __append_e820_table(extmap, entries);
+ e820__update_table(e820_table);
- /* 2. Drop the original ok overlapping range */
- drop_range(i);
+ memcpy(e820_table_kexec, e820_table, sizeof(*e820_table_kexec));
+ memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware));
- i--; /* resume for-loop on copied down entry */
-
- /* 3. Add back in any non-overlapping ranges. */
- if (lower_end)
- reserve_early_overlap_ok(lower_start, lower_end, name);
- if (upper_end)
- reserve_early_overlap_ok(upper_start, upper_end, name);
- }
-}
-
-static void __init __reserve_early(u64 start, u64 end, char *name,
- int overlap_ok)
-{
- int i;
- struct early_res *r;
-
- i = find_overlapped_early(start, end);
- if (i >= MAX_EARLY_RES)
- panic("Too many early reservations");
- r = &early_res[i];
- if (r->end)
- panic("Overlapping early reservations "
- "%llx-%llx %s to %llx-%llx %s\n",
- start, end - 1, name?name:"", r->start,
- r->end - 1, r->name);
- r->start = start;
- r->end = end;
- r->overlap_ok = overlap_ok;
- if (name)
- strncpy(r->name, name, sizeof(r->name) - 1);
-}
-
-/*
- * A few early reservations come here.
- *
- * The 'overlap_ok' in the name of this routine does -not- mean it
- * is ok for these reservations to overlap an earlier reservation.
- * Rather it means that it is ok for subsequent reservations to
- * overlap this one.
- *
- * Use this entry point to reserve early ranges when you are doing
- * so out of "Paranoia", reserving perhaps more memory than you need,
- * just in case, and don't mind a subsequent overlapping reservation
- * that is known to be needed.
- *
- * The drop_overlaps_that_are_ok() call here isn't really needed.
- * It would be needed if we had two colliding 'overlap_ok'
- * reservations, so that the second such would not panic on the
- * overlap with the first. We don't have any such as of this
- * writing, but might as well tolerate such if it happens in
- * the future.
- */
-void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
-{
- drop_overlaps_that_are_ok(start, end);
- __reserve_early(start, end, name, 1);
+ early_memunmap(sdata, data_len);
+ pr_info("extended physical RAM map:\n");
+ e820__print_table("extended");
}
/*
- * Most early reservations come here.
+ * Find the ranges of physical addresses that do not correspond to
+ * E820 RAM areas and register the corresponding pages as 'nosave' for
+ * hibernation (32-bit) or software suspend and suspend to RAM (64-bit).
*
- * We first have drop_overlaps_that_are_ok() drop any pre-existing
- * 'overlap_ok' ranges, so that we can then reserve this memory
- * range without risk of panic'ing on an overlapping overlap_ok
- * early reservation.
+ * This function requires the E820 map to be sorted and without any
+ * overlapping entries.
*/
-void __init reserve_early(u64 start, u64 end, char *name)
+void __init e820__register_nosave_regions(unsigned long limit_pfn)
{
- if (start >= end)
- return;
-
- drop_overlaps_that_are_ok(start, end);
- __reserve_early(start, end, name, 0);
-}
-
-void __init free_early(u64 start, u64 end)
-{
- struct early_res *r;
int i;
+ u64 last_addr = 0;
- i = find_overlapped_early(start, end);
- r = &early_res[i];
- if (i >= MAX_EARLY_RES || r->end != end || r->start != start)
- panic("free_early on not reserved area: %llx-%llx!",
- start, end - 1);
+ for (i = 0; i < e820_table->nr_entries; i++) {
+ struct e820_entry *entry = &e820_table->entries[i];
- drop_range(i);
-}
-
-void __init early_res_to_bootmem(u64 start, u64 end)
-{
- int i, count;
- u64 final_start, final_end;
-
- count = 0;
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
- count++;
-
- printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
- count, start, end);
- for (i = 0; i < count; i++) {
- struct early_res *r = &early_res[i];
- printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
- r->start, r->end, r->name);
- final_start = max(start, r->start);
- final_end = min(end, r->end);
- if (final_start >= final_end) {
- printk(KERN_CONT "\n");
+ if (entry->type != E820_TYPE_RAM)
continue;
- }
- printk(KERN_CONT " ==> [%010llx - %010llx]\n",
- final_start, final_end);
- reserve_bootmem_generic(final_start, final_end - final_start,
- BOOTMEM_DEFAULT);
- }
-}
-/* Check for already reserved areas */
-static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
-{
- int i;
- u64 addr = *addrp;
- int changed = 0;
- struct early_res *r;
-again:
- i = find_overlapped_early(addr, addr + size);
- r = &early_res[i];
- if (i < MAX_EARLY_RES && r->end) {
- *addrp = addr = round_up(r->end, align);
- changed = 1;
- goto again;
- }
- return changed;
-}
+ if (last_addr < entry->addr)
+ register_nosave_region(PFN_DOWN(last_addr), PFN_UP(entry->addr));
-/* Check for already reserved areas */
-static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
-{
- int i;
- u64 addr = *addrp, last;
- u64 size = *sizep;
- int changed = 0;
-again:
- last = addr + size;
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
- struct early_res *r = &early_res[i];
- if (last > r->start && addr < r->start) {
- size = r->start - addr;
- changed = 1;
- goto again;
- }
- if (last > r->end && addr < r->end) {
- addr = round_up(r->end, align);
- size = last - addr;
- changed = 1;
- goto again;
- }
- if (last <= r->end && addr >= r->start) {
- (*sizep)++;
- return 0;
- }
+ last_addr = entry->addr + entry->size;
}
- if (changed) {
- *addrp = addr;
- *sizep = size;
- }
- return changed;
-}
-/*
- * Find a free area with specified alignment in a specific range.
- */
-u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
-{
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- u64 addr, last;
- u64 ei_last;
-
- if (ei->type != E820_RAM)
- continue;
- addr = round_up(ei->addr, align);
- ei_last = ei->addr + ei->size;
- if (addr < start)
- addr = round_up(start, align);
- if (addr >= ei_last)
- continue;
- while (bad_addr(&addr, size, align) && addr+size <= ei_last)
- ;
- last = addr + size;
- if (last > ei_last)
- continue;
- if (last > end)
- continue;
- return addr;
- }
- return -1ULL;
+ register_nosave_region(PFN_DOWN(last_addr), limit_pfn);
}
+#ifdef CONFIG_ACPI
/*
- * Find next free range after *start
+ * Register ACPI NVS memory regions, so that we can save/restore them during
+ * hibernation and the subsequent resume:
*/
-u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
+static int __init e820__register_nvs_regions(void)
{
int i;
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- u64 addr, last;
- u64 ei_last;
+ for (i = 0; i < e820_table->nr_entries; i++) {
+ struct e820_entry *entry = &e820_table->entries[i];
- if (ei->type != E820_RAM)
- continue;
- addr = round_up(ei->addr, align);
- ei_last = ei->addr + ei->size;
- if (addr < start)
- addr = round_up(start, align);
- if (addr >= ei_last)
- continue;
- *sizep = ei_last - addr;
- while (bad_addr_size(&addr, sizep, align) &&
- addr + *sizep <= ei_last)
- ;
- last = addr + *sizep;
- if (last > ei_last)
- continue;
- return addr;
+ if (entry->type == E820_TYPE_NVS)
+ acpi_nvs_register(entry->addr, entry->size);
}
- return -1ULL;
+ return 0;
}
+core_initcall(e820__register_nvs_regions);
+#endif
/*
- * pre allocated 4k and reserved it in e820
+ * Allocate the requested number of bytes with the requested alignment
+ * and return (the physical address) to the caller. Also register this
+ * range in the 'kexec' E820 table as a reserved range.
+ *
+ * This allows kexec to fake a new mptable, as if it came from the real
+ * system.
*/
-u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
+u64 __init e820__memblock_alloc_reserved(u64 size, u64 align)
{
- u64 size = 0;
u64 addr;
- u64 start;
- for (start = startt; ; start += size) {
- start = find_e820_area_size(start, &size, align);
- if (!(start + 1))
- return 0;
- if (size >= sizet)
- break;
+ addr = memblock_phys_alloc(size, align);
+ if (addr) {
+ e820__range_update_table(e820_table_kexec, addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
+ pr_info("update e820_table_kexec for e820__memblock_alloc_reserved()\n");
+ e820__update_table_kexec();
}
-#ifdef CONFIG_X86_32
- if (start >= MAXMEM)
- return 0;
- if (start + size > MAXMEM)
- size = MAXMEM - start;
-#endif
-
- addr = round_down(start + size - sizet, align);
- if (addr < start)
- return 0;
- e820_update_range(addr, sizet, E820_RAM, E820_RESERVED);
- e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
- printk(KERN_INFO "update e820 for early_reserve_e820\n");
- update_e820();
- update_e820_saved();
-
return addr;
}
@@ -1108,22 +827,23 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
/*
* Find the highest page frame number we have available
*/
-static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
+static unsigned long __init e820__end_ram_pfn(unsigned long limit_pfn)
{
int i;
unsigned long last_pfn = 0;
unsigned long max_arch_pfn = MAX_ARCH_PFN;
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
+ for (i = 0; i < e820_table->nr_entries; i++) {
+ struct e820_entry *entry = &e820_table->entries[i];
unsigned long start_pfn;
unsigned long end_pfn;
- if (ei->type != type)
+ if (entry->type != E820_TYPE_RAM &&
+ entry->type != E820_TYPE_ACPI)
continue;
- start_pfn = ei->addr >> PAGE_SHIFT;
- end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT;
+ start_pfn = entry->addr >> PAGE_SHIFT;
+ end_pfn = (entry->addr + entry->size) >> PAGE_SHIFT;
if (start_pfn >= limit_pfn)
continue;
@@ -1138,89 +858,22 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
if (last_pfn > max_arch_pfn)
last_pfn = max_arch_pfn;
- printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n",
- last_pfn, max_arch_pfn);
+ pr_info("last_pfn = %#lx max_arch_pfn = %#lx\n",
+ last_pfn, max_arch_pfn);
return last_pfn;
}
-unsigned long __init e820_end_of_ram_pfn(void)
-{
- return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
-}
-unsigned long __init e820_end_of_low_ram_pfn(void)
+unsigned long __init e820__end_of_ram_pfn(void)
{
- return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM);
-}
-/*
- * Finds an active region in the address range from start_pfn to last_pfn and
- * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
- */
-int __init e820_find_active_region(const struct e820entry *ei,
- unsigned long start_pfn,
- unsigned long last_pfn,
- unsigned long *ei_startpfn,
- unsigned long *ei_endpfn)
-{
- u64 align = PAGE_SIZE;
-
- *ei_startpfn = round_up(ei->addr, align) >> PAGE_SHIFT;
- *ei_endpfn = round_down(ei->addr + ei->size, align) >> PAGE_SHIFT;
-
- /* Skip map entries smaller than a page */
- if (*ei_startpfn >= *ei_endpfn)
- return 0;
-
- /* Skip if map is outside the node */
- if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
- *ei_startpfn >= last_pfn)
- return 0;
-
- /* Check for overlaps */
- if (*ei_startpfn < start_pfn)
- *ei_startpfn = start_pfn;
- if (*ei_endpfn > last_pfn)
- *ei_endpfn = last_pfn;
-
- return 1;
-}
-
-/* Walk the e820 map and register active regions within a node */
-void __init e820_register_active_regions(int nid, unsigned long start_pfn,
- unsigned long last_pfn)
-{
- unsigned long ei_startpfn;
- unsigned long ei_endpfn;
- int i;
-
- for (i = 0; i < e820.nr_map; i++)
- if (e820_find_active_region(&e820.map[i],
- start_pfn, last_pfn,
- &ei_startpfn, &ei_endpfn))
- add_active_range(nid, ei_startpfn, ei_endpfn);
+ return e820__end_ram_pfn(MAX_ARCH_PFN);
}
-/*
- * Find the hole size (in bytes) in the memory range.
- * @start: starting address of the memory range to scan
- * @end: ending address of the memory range to scan
- */
-u64 __init e820_hole_size(u64 start, u64 end)
+unsigned long __init e820__end_of_low_ram_pfn(void)
{
- unsigned long start_pfn = start >> PAGE_SHIFT;
- unsigned long last_pfn = end >> PAGE_SHIFT;
- unsigned long ei_startpfn, ei_endpfn, ram = 0;
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- if (e820_find_active_region(&e820.map[i],
- start_pfn, last_pfn,
- &ei_startpfn, &ei_endpfn))
- ram += ei_endpfn - ei_startpfn;
- }
- return end - start - ((u64)ram << PAGE_SHIFT);
+ return e820__end_ram_pfn(1UL << (32 - PAGE_SHIFT));
}
-static void early_panic(char *msg)
+static void __init early_panic(char *msg)
{
early_printk(msg);
panic(msg);
@@ -1228,7 +881,7 @@ static void early_panic(char *msg)
static int userdef __initdata;
-/* "mem=nopentium" disables the 4MB page tables. */
+/* The "mem=nopentium" boot option disables 4MB page tables on 32-bit kernels: */
static int __init parse_memopt(char *p)
{
u64 mem_size;
@@ -1236,22 +889,34 @@ static int __init parse_memopt(char *p)
if (!p)
return -EINVAL;
-#ifdef CONFIG_X86_32
if (!strcmp(p, "nopentium")) {
+#ifdef CONFIG_X86_32
setup_clear_cpu_cap(X86_FEATURE_PSE);
return 0;
- }
+#else
+ pr_warn("mem=nopentium ignored! (only supported on x86_32)\n");
+ return -EINVAL;
#endif
+ }
userdef = 1;
mem_size = memparse(p, &p);
- e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
+
+ /* Don't remove all memory when getting "mem={invalid}" parameter: */
+ if (mem_size == 0)
+ return -EINVAL;
+
+ e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1);
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+ max_mem_size = mem_size;
+#endif
return 0;
}
early_param("mem", parse_memopt);
-static int __init parse_memmap_opt(char *p)
+static int __init parse_memmap_one(char *p)
{
char *oldp;
u64 start_at, mem_size;
@@ -1260,15 +925,7 @@ static int __init parse_memmap_opt(char *p)
return -EINVAL;
if (!strncmp(p, "exactmap", 8)) {
-#ifdef CONFIG_CRASH_DUMP
- /*
- * If we are doing a crash dump, we still need to know
- * the real mem size before original memory map is
- * reset.
- */
- saved_max_pfn = e820_end_of_ram_pfn();
-#endif
- e820.nr_map = 0;
+ e820_table->nr_entries = 0;
userdef = 1;
return 0;
}
@@ -1281,92 +938,197 @@ static int __init parse_memmap_opt(char *p)
userdef = 1;
if (*p == '@') {
start_at = memparse(p+1, &p);
- e820_add_region(start_at, mem_size, E820_RAM);
+ e820__range_add(start_at, mem_size, E820_TYPE_RAM);
} else if (*p == '#') {
start_at = memparse(p+1, &p);
- e820_add_region(start_at, mem_size, E820_ACPI);
+ e820__range_add(start_at, mem_size, E820_TYPE_ACPI);
} else if (*p == '$') {
start_at = memparse(p+1, &p);
- e820_add_region(start_at, mem_size, E820_RESERVED);
- } else
- e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
+ e820__range_add(start_at, mem_size, E820_TYPE_RESERVED);
+ } else if (*p == '!') {
+ start_at = memparse(p+1, &p);
+ e820__range_add(start_at, mem_size, E820_TYPE_PRAM);
+ } else if (*p == '%') {
+ enum e820_type from = 0, to = 0;
+
+ start_at = memparse(p + 1, &p);
+ if (*p == '-')
+ from = simple_strtoull(p + 1, &p, 0);
+ if (*p == '+')
+ to = simple_strtoull(p + 1, &p, 0);
+ if (*p != '\0')
+ return -EINVAL;
+ if (from && to)
+ e820__range_update(start_at, mem_size, from, to);
+ else if (to)
+ e820__range_add(start_at, mem_size, to);
+ else if (from)
+ e820__range_remove(start_at, mem_size, from, 1);
+ else
+ e820__range_remove(start_at, mem_size, 0, 0);
+ } else {
+ e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1);
+ }
return *p == '\0' ? 0 : -EINVAL;
}
+
+static int __init parse_memmap_opt(char *str)
+{
+ while (str) {
+ char *k = strchr(str, ',');
+
+ if (k)
+ *k++ = 0;
+
+ parse_memmap_one(str);
+ str = k;
+ }
+
+ return 0;
+}
early_param("memmap", parse_memmap_opt);
-void __init finish_e820_parsing(void)
+/*
+ * Called after parse_early_param(), after early parameters (such as mem=)
+ * have been processed, in which case we already have an E820 table filled in
+ * via the parameter callback function(s), but it's not sorted and printed yet:
+ */
+void __init e820__finish_early_params(void)
{
if (userdef) {
- u32 nr = e820.nr_map;
-
- if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0)
+ if (e820__update_table(e820_table) < 0)
early_panic("Invalid user supplied memory map");
- e820.nr_map = nr;
- printk(KERN_INFO "user-defined physical RAM map:\n");
- e820_print_map("user");
+ pr_info("user-defined physical RAM map:\n");
+ e820__print_table("user");
+ }
+}
+
+static const char *__init e820_type_to_string(struct e820_entry *entry)
+{
+ switch (entry->type) {
+ case E820_TYPE_RAM: return "System RAM";
+ case E820_TYPE_ACPI: return "ACPI Tables";
+ case E820_TYPE_NVS: return "ACPI Non-volatile Storage";
+ case E820_TYPE_UNUSABLE: return "Unusable memory";
+ case E820_TYPE_PRAM: return "Persistent Memory (legacy)";
+ case E820_TYPE_PMEM: return "Persistent Memory";
+ case E820_TYPE_RESERVED: return "Reserved";
+ case E820_TYPE_SOFT_RESERVED: return "Soft Reserved";
+ default: return "Unknown E820 type";
+ }
+}
+
+static unsigned long __init e820_type_to_iomem_type(struct e820_entry *entry)
+{
+ switch (entry->type) {
+ case E820_TYPE_RAM: return IORESOURCE_SYSTEM_RAM;
+ case E820_TYPE_ACPI: /* Fall-through: */
+ case E820_TYPE_NVS: /* Fall-through: */
+ case E820_TYPE_UNUSABLE: /* Fall-through: */
+ case E820_TYPE_PRAM: /* Fall-through: */
+ case E820_TYPE_PMEM: /* Fall-through: */
+ case E820_TYPE_RESERVED: /* Fall-through: */
+ case E820_TYPE_SOFT_RESERVED: /* Fall-through: */
+ default: return IORESOURCE_MEM;
+ }
+}
+
+static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry)
+{
+ switch (entry->type) {
+ case E820_TYPE_ACPI: return IORES_DESC_ACPI_TABLES;
+ case E820_TYPE_NVS: return IORES_DESC_ACPI_NV_STORAGE;
+ case E820_TYPE_PMEM: return IORES_DESC_PERSISTENT_MEMORY;
+ case E820_TYPE_PRAM: return IORES_DESC_PERSISTENT_MEMORY_LEGACY;
+ case E820_TYPE_RESERVED: return IORES_DESC_RESERVED;
+ case E820_TYPE_SOFT_RESERVED: return IORES_DESC_SOFT_RESERVED;
+ case E820_TYPE_RAM: /* Fall-through: */
+ case E820_TYPE_UNUSABLE: /* Fall-through: */
+ default: return IORES_DESC_NONE;
}
}
-static inline const char *e820_type_to_string(int e820_type)
+static bool __init do_mark_busy(enum e820_type type, struct resource *res)
{
- switch (e820_type) {
- case E820_RESERVED_KERN:
- case E820_RAM: return "System RAM";
- case E820_ACPI: return "ACPI Tables";
- case E820_NVS: return "ACPI Non-volatile Storage";
- case E820_UNUSABLE: return "Unusable memory";
- default: return "reserved";
+ /* this is the legacy bios/dos rom-shadow + mmio region */
+ if (res->start < (1ULL<<20))
+ return true;
+
+ /*
+ * Treat persistent memory and other special memory ranges like
+	 * device memory, i.e. reserve them for the exclusive use of a driver.
+ */
+ switch (type) {
+ case E820_TYPE_RESERVED:
+ case E820_TYPE_SOFT_RESERVED:
+ case E820_TYPE_PRAM:
+ case E820_TYPE_PMEM:
+ return false;
+ case E820_TYPE_RAM:
+ case E820_TYPE_ACPI:
+ case E820_TYPE_NVS:
+ case E820_TYPE_UNUSABLE:
+ default:
+ return true;
}
}
/*
- * Mark e820 reserved areas as busy for the resource manager.
+ * Mark E820 reserved areas as busy for the resource manager:
*/
+
static struct resource __initdata *e820_res;
-void __init e820_reserve_resources(void)
+
+void __init e820__reserve_resources(void)
{
int i;
struct resource *res;
u64 end;
- res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map);
+ res = memblock_alloc_or_panic(sizeof(*res) * e820_table->nr_entries,
+ SMP_CACHE_BYTES);
e820_res = res;
- for (i = 0; i < e820.nr_map; i++) {
- end = e820.map[i].addr + e820.map[i].size - 1;
+
+ for (i = 0; i < e820_table->nr_entries; i++) {
+ struct e820_entry *entry = e820_table->entries + i;
+
+ end = entry->addr + entry->size - 1;
if (end != (resource_size_t)end) {
res++;
continue;
}
- res->name = e820_type_to_string(e820.map[i].type);
- res->start = e820.map[i].addr;
- res->end = end;
-
- res->flags = IORESOURCE_MEM;
+ res->start = entry->addr;
+ res->end = end;
+ res->name = e820_type_to_string(entry);
+ res->flags = e820_type_to_iomem_type(entry);
+ res->desc = e820_type_to_iores_desc(entry);
/*
- * don't register the region that could be conflicted with
- * pci device BAR resource and insert them later in
- * pcibios_resource_survey()
+		 * Don't register regions that could conflict with
+		 * PCI device BAR resources; insert them later in
+		 * pcibios_resource_survey():
*/
- if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20)) {
+ if (do_mark_busy(entry->type, res)) {
res->flags |= IORESOURCE_BUSY;
insert_resource(&iomem_resource, res);
}
res++;
}
- for (i = 0; i < e820_saved.nr_map; i++) {
- struct e820entry *entry = &e820_saved.map[i];
- firmware_map_add_early(entry->addr,
- entry->addr + entry->size - 1,
- e820_type_to_string(entry->type));
+	/* Expose the kexec E820 table via sysfs: */
+ for (i = 0; i < e820_table_kexec->nr_entries; i++) {
+ struct e820_entry *entry = e820_table_kexec->entries + i;
+
+ firmware_map_add_early(entry->addr, entry->addr + entry->size, e820_type_to_string(entry));
}
}
-/* How much should we pad RAM ending depending on where it is? */
-static unsigned long ram_alignment(resource_size_t pos)
+/*
+ * How much should we pad the end of RAM, depending on where it is?
+ */
+static unsigned long __init ram_alignment(resource_size_t pos)
{
unsigned long mb = pos >> 20;
@@ -1378,67 +1140,65 @@ static unsigned long ram_alignment(resource_size_t pos)
if (mb < 16)
return 1024*1024;
- /* To 32MB for anything above that */
- return 32*1024*1024;
+ /* To 64MB for anything above that */
+ return 64*1024*1024;
}
#define MAX_RESOURCE_SIZE ((resource_size_t)-1)
-void __init e820_reserve_resources_late(void)
+void __init e820__reserve_resources_late(void)
{
int i;
struct resource *res;
res = e820_res;
- for (i = 0; i < e820.nr_map; i++) {
+ for (i = 0; i < e820_table->nr_entries; i++) {
if (!res->parent && res->end)
insert_resource_expand_to_fit(&iomem_resource, res);
res++;
}
/*
- * Try to bump up RAM regions to reasonable boundaries to
+ * Try to bump up RAM regions to reasonable boundaries, to
* avoid stolen RAM:
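+	 *
+	 * For example, a RAM range ending at 1022MB gets a "RAM buffer"
+	 * resource reserved up to the next 64MB boundary at 1024MB.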
*/
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *entry = &e820.map[i];
+ for (i = 0; i < e820_table->nr_entries; i++) {
+ struct e820_entry *entry = &e820_table->entries[i];
u64 start, end;
- if (entry->type != E820_RAM)
+ if (entry->type != E820_TYPE_RAM)
continue;
+
start = entry->addr + entry->size;
end = round_up(start, ram_alignment(start)) - 1;
if (end > MAX_RESOURCE_SIZE)
end = MAX_RESOURCE_SIZE;
if (start >= end)
continue;
- reserve_region_with_split(&iomem_resource, start, end,
- "RAM buffer");
+
+ printk(KERN_DEBUG "e820: reserve RAM buffer [mem %#010llx-%#010llx]\n", start, end);
+ reserve_region_with_split(&iomem_resource, start, end, "RAM buffer");
}
}
-char *__init default_machine_specific_memory_setup(void)
+/*
+ * Pass the firmware (bootloader) E820 map to the kernel and process it:
+ */
+char *__init e820__memory_setup_default(void)
{
char *who = "BIOS-e820";
- u32 new_nr;
+
/*
* Try to copy the BIOS-supplied E820-map.
*
* Otherwise fake a memory map; one section from 0k->640k,
* the next section from 1mb->appropriate_mem_k
*/
- new_nr = boot_params.e820_entries;
- sanitize_e820_map(boot_params.e820_map,
- ARRAY_SIZE(boot_params.e820_map),
- &new_nr);
- boot_params.e820_entries = new_nr;
- if (append_e820_map(boot_params.e820_map, boot_params.e820_entries)
- < 0) {
+ if (append_e820_table(boot_params.e820_table, boot_params.e820_entries) < 0) {
u64 mem_size;
- /* compare results from other methods and take the greater */
- if (boot_params.alt_mem_k
- < boot_params.screen_info.ext_mem_k) {
+ /* Compare results from other methods and take the one that gives more RAM: */
+ if (boot_params.alt_mem_k < boot_params.screen_info.ext_mem_k) {
mem_size = boot_params.screen_info.ext_mem_k;
who = "BIOS-88";
} else {
@@ -1446,38 +1206,128 @@ char *__init default_machine_specific_memory_setup(void)
who = "BIOS-e801";
}
- e820.nr_map = 0;
- e820_add_region(0, LOWMEMSIZE(), E820_RAM);
- e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
+ e820_table->nr_entries = 0;
+ e820__range_add(0, LOWMEMSIZE(), E820_TYPE_RAM);
+ e820__range_add(HIGH_MEMORY, mem_size << 10, E820_TYPE_RAM);
}
- /* In case someone cares... */
+ /* We just appended a lot of ranges, sanitize the table: */
+ e820__update_table(e820_table);
+
return who;
}
-char *__init __attribute__((weak)) machine_specific_memory_setup(void)
+/*
+ * Calls e820__memory_setup_default() in essence to pick up the firmware/bootloader
+ * E820 map - with an optional platform quirk available for virtual platforms
+ * to override this method of boot environment processing:
+ */
+void __init e820__memory_setup(void)
{
- if (x86_quirks->arch_memory_setup) {
- char *who = x86_quirks->arch_memory_setup();
+ char *who;
- if (who)
- return who;
- }
- return default_machine_specific_memory_setup();
-}
+ /* This is a firmware interface ABI - make sure we don't break it: */
+ BUILD_BUG_ON(sizeof(struct boot_e820_entry) != 20);
-/* Overridden in paravirt.c if CONFIG_PARAVIRT */
-char * __init __attribute__((weak)) memory_setup(void)
-{
- return machine_specific_memory_setup();
+ who = x86_init.resources.memory_setup();
+
+ memcpy(e820_table_kexec, e820_table, sizeof(*e820_table_kexec));
+ memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware));
+
+ pr_info("BIOS-provided physical RAM map:\n");
+ e820__print_table(who);
}
-void __init setup_memory_map(void)
+void __init e820__memblock_setup(void)
{
- char *who;
+ int i;
+ u64 end;
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+ /*
+ * Memory used by the kernel cannot be hot-removed because Linux
+ * cannot migrate the kernel pages. When memory hotplug is
+ * enabled, we should prevent memblock from allocating memory
+ * for the kernel.
+ *
+ * ACPI SRAT records all hotpluggable memory ranges. But before
+ * SRAT is parsed, we don't know about it.
+ *
+	 * The kernel image is loaded into memory very early during boot and
+	 * we cannot prevent this anyway. So on NUMA systems, we set any
+	 * node the kernel resides in as un-hotpluggable.
+	 *
+	 * Since on modern servers one node could have tens of gigabytes of
+	 * memory, we can assume the memory around the kernel image is also
+	 * un-hotpluggable. So before SRAT is parsed, just allocate memory
+	 * near the kernel image to do our best to keep the kernel away from
+	 * hotpluggable memory.
+ */
+ if (movable_node_is_enabled())
+ memblock_set_bottom_up(true);
+#endif
+
+ /*
+	 * At this point only the first megabyte is mapped for sure; the
+	 * rest of the memory cannot be used for memblock resizing.
+ */
+ memblock_set_current_limit(ISA_END_ADDRESS);
+
+ /*
+ * The bootstrap memblock region count maximum is 128 entries
+ * (INIT_MEMBLOCK_REGIONS), but EFI might pass us more E820 entries
+ * than that - so allow memblock resizing.
+ *
+ * This is safe, because this call happens pretty late during x86 setup,
+ * so we know about reserved memory regions already. (This is important
+	 * so that memblock resizing does not stomp over reserved areas.)
+ */
+ memblock_allow_resize();
+
+ for (i = 0; i < e820_table->nr_entries; i++) {
+ struct e820_entry *entry = &e820_table->entries[i];
+
+ end = entry->addr + entry->size;
+ if (end != (resource_size_t)end)
+ continue;
+
+ if (entry->type == E820_TYPE_SOFT_RESERVED)
+ memblock_reserve(entry->addr, entry->size);
+
+ if (entry->type != E820_TYPE_RAM)
+ continue;
+
+ memblock_add(entry->addr, entry->size);
+ }
+
+ /*
+	 * At this point memblock is only allowed to allocate from memory
+	 * below 1M (aka ISA_END_ADDRESS), up until the direct map is
+	 * completely set up in init_mem_mapping().
+	 *
+	 * KHO kernels are special and use only scratch memory for memblock
+	 * allocations, but memory below 1M is ignored by the kernel after
+	 * early boot and cannot be naturally marked as scratch.
+	 *
+	 * To allow allocation of the real-mode trampoline and a few (if any)
+	 * other very early allocations from below 1M, forcibly mark the
+	 * memory below 1M as scratch.
+	 *
+	 * After the real-mode trampoline is allocated, we clear that scratch
+	 * marking.
+ */
+ memblock_mark_kho_scratch(0, SZ_1M);
+
+ /*
+	 * 32-bit systems are limited to 4GB of memory even with HIGHMEM and
+ * to even less without it.
+ * Discard memory after max_pfn - the actual limit detected at runtime.
+ */
+ if (IS_ENABLED(CONFIG_X86_32))
+ memblock_remove(PFN_PHYS(max_pfn), -1);
+
+ /* Throw away partial pages: */
+ memblock_trim_memory(PAGE_SIZE);
- who = memory_setup();
- memcpy(&e820_saved, &e820, sizeof(struct e820map));
- printk(KERN_INFO "BIOS-provided physical RAM map:\n");
- e820_print_map(who);
+ memblock_dump_all();
}
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index ebdb85cf2686..6b6f32f40cbe 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/* Various workarounds for chipset bugs.
This code runs very early and can't use the regular PCI subsystem
The entries are keyed to PCI bridges which usually identify chipsets
@@ -11,13 +12,22 @@
#include <linux/pci.h>
#include <linux/acpi.h>
+#include <linux/delay.h>
#include <linux/pci_ids.h>
+#include <linux/bcma/bcma.h>
+#include <linux/bcma/bcma_regs.h>
+#include <linux/platform_data/x86/apple.h>
+#include <drm/intel/i915_drm.h>
+#include <drm/intel/pciids.h>
#include <asm/pci-direct.h>
#include <asm/dma.h>
#include <asm/io_apic.h>
#include <asm/apic.h>
+#include <asm/hpet.h>
#include <asm/iommu.h>
#include <asm/gart.h>
+#include <asm/irq_remapping.h>
+#include <asm/early_ioremap.h>
static void __init fix_hypertransport_config(int num, int slot, int func)
{
@@ -73,6 +83,13 @@ static void __init nvidia_bugs(int num, int slot, int func)
#ifdef CONFIG_ACPI
#ifdef CONFIG_X86_IO_APIC
/*
+ * Only applies to Nvidia root ports (bus 0) and not to
+ * Nvidia graphics cards with PCI ports on secondary buses.
+ */
+ if (num)
+ return;
+
+ /*
* All timer overrides on Nvidia are
* wrong unless HPET is enabled.
* Unfortunately that's not true on many Asus boards.
@@ -97,7 +114,6 @@ static void __init nvidia_bugs(int num, int slot, int func)
}
#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
-#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
{
u32 d;
@@ -115,7 +131,6 @@ static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
d &= 0xff;
return d;
}
-#endif
static void __init ati_bugs(int num, int slot, int func)
{
@@ -145,15 +160,10 @@ static void __init ati_bugs(int num, int slot, int func)
static u32 __init ati_sbx00_rev(int num, int slot, int func)
{
- u32 old, d;
+ u32 d;
- d = read_pci_config(num, slot, func, 0x70);
- old = d;
- d &= ~(1<<8);
- write_pci_config(num, slot, func, 0x70, d);
d = read_pci_config(num, slot, func, 0x8);
d &= 0xff;
- write_pci_config(num, slot, func, 0x70, old);
return d;
}
@@ -162,11 +172,19 @@ static void __init ati_bugs_contd(int num, int slot, int func)
{
u32 d, rev;
- if (acpi_use_timer_override)
+ rev = ati_sbx00_rev(num, slot, func);
+ if (rev >= 0x40)
+ acpi_fix_pin2_polarity = 1;
+
+ /*
+ * SB600: revisions 0x11, 0x12, 0x13, 0x14, ...
+ * SB700: revisions 0x39, 0x3a, ...
+ * SB800: revisions 0x40, 0x41, ...
+ */
+ if (rev >= 0x39)
return;
- rev = ati_sbx00_rev(num, slot, func);
- if (rev > 0x13)
+ if (acpi_use_timer_override)
return;
/* check for IRQ0 interrupt swap */
@@ -191,6 +209,481 @@ static void __init ati_bugs_contd(int num, int slot, int func)
}
#endif
+static void __init intel_remapping_check(int num, int slot, int func)
+{
+ u8 revision;
+ u16 device;
+
+ device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID);
+ revision = read_pci_config_byte(num, slot, func, PCI_REVISION_ID);
+
+ /*
+	 * Revision <= 13 of all device IDs that trigger this quirk
+	 * have a problem draining interrupts when IRQ remapping is
+	 * enabled, and should be flagged as broken. Additionally,
+	 * revision 0x22 of device ID 0x3405 has this problem.
+ */
+ if (revision <= 0x13)
+ set_irq_remapping_broken();
+ else if (device == 0x3405 && revision == 0x22)
+ set_irq_remapping_broken();
+}
+
+/*
+ * Systems with Intel graphics controllers set aside memory exclusively
+ * for gfx driver use. This memory is not marked in the E820 as reserved
+ * or as RAM, and so is subject to overlap from E820 manipulation later
+ * in the boot process. On some systems, MMIO space is allocated on top,
+ * despite the efforts of the "RAM buffer" approach, which simply rounds
+ * memory boundaries up to 64M to try to catch space that may decode
+ * as RAM and so is not suitable for MMIO.
+ */
+
+#define KB(x) ((x) * 1024UL)
+#define MB(x) (KB (KB (x)))
+
+static resource_size_t __init i830_tseg_size(void)
+{
+ u8 esmramc = read_pci_config_byte(0, 0, 0, I830_ESMRAMC);
+
+ if (!(esmramc & TSEG_ENABLE))
+ return 0;
+
+ if (esmramc & I830_TSEG_SIZE_1M)
+ return MB(1);
+ else
+ return KB(512);
+}
+
+static resource_size_t __init i845_tseg_size(void)
+{
+ u8 esmramc = read_pci_config_byte(0, 0, 0, I845_ESMRAMC);
+ u8 tseg_size = esmramc & I845_TSEG_SIZE_MASK;
+
+ if (!(esmramc & TSEG_ENABLE))
+ return 0;
+
+ switch (tseg_size) {
+ case I845_TSEG_SIZE_512K: return KB(512);
+ case I845_TSEG_SIZE_1M: return MB(1);
+ default:
+ WARN(1, "Unknown ESMRAMC value: %x!\n", esmramc);
+ }
+ return 0;
+}
+
+static resource_size_t __init i85x_tseg_size(void)
+{
+ u8 esmramc = read_pci_config_byte(0, 0, 0, I85X_ESMRAMC);
+
+ if (!(esmramc & TSEG_ENABLE))
+ return 0;
+
+ return MB(1);
+}
+
+static resource_size_t __init i830_mem_size(void)
+{
+ return read_pci_config_byte(0, 0, 0, I830_DRB3) * MB(32);
+}
+
+static resource_size_t __init i85x_mem_size(void)
+{
+ return read_pci_config_byte(0, 0, 1, I85X_DRB3) * MB(32);
+}
+
+/*
+ * On 830/845/85x the stolen memory base isn't available in any
+ * register. We need to calculate it as TOM-TSEG_SIZE-stolen_size.
+ */
+static resource_size_t __init i830_stolen_base(int num, int slot, int func,
+ resource_size_t stolen_size)
+{
+ return i830_mem_size() - i830_tseg_size() - stolen_size;
+}
+
+static resource_size_t __init i845_stolen_base(int num, int slot, int func,
+ resource_size_t stolen_size)
+{
+ return i830_mem_size() - i845_tseg_size() - stolen_size;
+}
+
+static resource_size_t __init i85x_stolen_base(int num, int slot, int func,
+ resource_size_t stolen_size)
+{
+ return i85x_mem_size() - i85x_tseg_size() - stolen_size;
+}
+
+static resource_size_t __init i865_stolen_base(int num, int slot, int func,
+ resource_size_t stolen_size)
+{
+ u16 toud = 0;
+
+ toud = read_pci_config_16(0, 0, 0, I865_TOUD);
+
+ return toud * KB(64) + i845_tseg_size();
+}
+
+static resource_size_t __init gen3_stolen_base(int num, int slot, int func,
+ resource_size_t stolen_size)
+{
+ u32 bsm;
+
+ /* Almost universally we can find the Graphics Base of Stolen Memory
+ * at register BSM (0x5c) in the igfx configuration space. On a few
+ * (desktop) machines this is also mirrored in the bridge device at
+ * different locations, or in the MCHBAR.
+ */
+ bsm = read_pci_config(num, slot, func, INTEL_BSM);
+
+ return bsm & INTEL_BSM_MASK;
+}
+
+static resource_size_t __init gen11_stolen_base(int num, int slot, int func,
+ resource_size_t stolen_size)
+{
+ u64 bsm;
+
+ bsm = read_pci_config(num, slot, func, INTEL_GEN11_BSM_DW0);
+ bsm &= INTEL_BSM_MASK;
+ bsm |= (u64)read_pci_config(num, slot, func, INTEL_GEN11_BSM_DW1) << 32;
+
+ return bsm;
+}
+
+static resource_size_t __init i830_stolen_size(int num, int slot, int func)
+{
+ u16 gmch_ctrl;
+ u16 gms;
+
+ gmch_ctrl = read_pci_config_16(0, 0, 0, I830_GMCH_CTRL);
+ gms = gmch_ctrl & I830_GMCH_GMS_MASK;
+
+ switch (gms) {
+ case I830_GMCH_GMS_STOLEN_512: return KB(512);
+ case I830_GMCH_GMS_STOLEN_1024: return MB(1);
+ case I830_GMCH_GMS_STOLEN_8192: return MB(8);
+ /* local memory isn't part of the normal address space */
+ case I830_GMCH_GMS_LOCAL: return 0;
+ default:
+ WARN(1, "Unknown GMCH_CTRL value: %x!\n", gmch_ctrl);
+ }
+
+ return 0;
+}
+
+static resource_size_t __init gen3_stolen_size(int num, int slot, int func)
+{
+ u16 gmch_ctrl;
+ u16 gms;
+
+ gmch_ctrl = read_pci_config_16(0, 0, 0, I830_GMCH_CTRL);
+ gms = gmch_ctrl & I855_GMCH_GMS_MASK;
+
+ switch (gms) {
+ case I855_GMCH_GMS_STOLEN_1M: return MB(1);
+ case I855_GMCH_GMS_STOLEN_4M: return MB(4);
+ case I855_GMCH_GMS_STOLEN_8M: return MB(8);
+ case I855_GMCH_GMS_STOLEN_16M: return MB(16);
+ case I855_GMCH_GMS_STOLEN_32M: return MB(32);
+ case I915_GMCH_GMS_STOLEN_48M: return MB(48);
+ case I915_GMCH_GMS_STOLEN_64M: return MB(64);
+ case G33_GMCH_GMS_STOLEN_128M: return MB(128);
+ case G33_GMCH_GMS_STOLEN_256M: return MB(256);
+ case INTEL_GMCH_GMS_STOLEN_96M: return MB(96);
+ case INTEL_GMCH_GMS_STOLEN_160M:return MB(160);
+ case INTEL_GMCH_GMS_STOLEN_224M:return MB(224);
+ case INTEL_GMCH_GMS_STOLEN_352M:return MB(352);
+ default:
+ WARN(1, "Unknown GMCH_CTRL value: %x!\n", gmch_ctrl);
+ }
+
+ return 0;
+}
+
+static resource_size_t __init gen6_stolen_size(int num, int slot, int func)
+{
+ u16 gmch_ctrl;
+ u16 gms;
+
+ gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL);
+ gms = (gmch_ctrl >> SNB_GMCH_GMS_SHIFT) & SNB_GMCH_GMS_MASK;
+
+ return gms * MB(32);
+}
+
+static resource_size_t __init gen8_stolen_size(int num, int slot, int func)
+{
+ u16 gmch_ctrl;
+ u16 gms;
+
+ gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL);
+ gms = (gmch_ctrl >> BDW_GMCH_GMS_SHIFT) & BDW_GMCH_GMS_MASK;
+
+ return gms * MB(32);
+}
+
+static resource_size_t __init chv_stolen_size(int num, int slot, int func)
+{
+ u16 gmch_ctrl;
+ u16 gms;
+
+ gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL);
+ gms = (gmch_ctrl >> SNB_GMCH_GMS_SHIFT) & SNB_GMCH_GMS_MASK;
+
+ /*
+ * 0x0 to 0x10: 32MB increments starting at 0MB
+ * 0x11 to 0x16: 4MB increments starting at 8MB
+	 * 0x17 to 0x1d: 4MB increments starting at 36MB
+ */
+ if (gms < 0x11)
+ return gms * MB(32);
+ else if (gms < 0x17)
+ return (gms - 0x11) * MB(4) + MB(8);
+ else
+ return (gms - 0x17) * MB(4) + MB(36);
+}
+
+static resource_size_t __init gen9_stolen_size(int num, int slot, int func)
+{
+ u16 gmch_ctrl;
+ u16 gms;
+
+ gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL);
+ gms = (gmch_ctrl >> BDW_GMCH_GMS_SHIFT) & BDW_GMCH_GMS_MASK;
+
+ /* 0x0 to 0xef: 32MB increments starting at 0MB */
+ /* 0xf0 to 0xfe: 4MB increments starting at 4MB */
+ if (gms < 0xf0)
+ return gms * MB(32);
+ else
+ return (gms - 0xf0) * MB(4) + MB(4);
+}
+
+struct intel_early_ops {
+ resource_size_t (*stolen_size)(int num, int slot, int func);
+ resource_size_t (*stolen_base)(int num, int slot, int func,
+ resource_size_t size);
+};
+
+static const struct intel_early_ops i830_early_ops __initconst = {
+ .stolen_base = i830_stolen_base,
+ .stolen_size = i830_stolen_size,
+};
+
+static const struct intel_early_ops i845_early_ops __initconst = {
+ .stolen_base = i845_stolen_base,
+ .stolen_size = i830_stolen_size,
+};
+
+static const struct intel_early_ops i85x_early_ops __initconst = {
+ .stolen_base = i85x_stolen_base,
+ .stolen_size = gen3_stolen_size,
+};
+
+static const struct intel_early_ops i865_early_ops __initconst = {
+ .stolen_base = i865_stolen_base,
+ .stolen_size = gen3_stolen_size,
+};
+
+static const struct intel_early_ops gen3_early_ops __initconst = {
+ .stolen_base = gen3_stolen_base,
+ .stolen_size = gen3_stolen_size,
+};
+
+static const struct intel_early_ops gen6_early_ops __initconst = {
+ .stolen_base = gen3_stolen_base,
+ .stolen_size = gen6_stolen_size,
+};
+
+static const struct intel_early_ops gen8_early_ops __initconst = {
+ .stolen_base = gen3_stolen_base,
+ .stolen_size = gen8_stolen_size,
+};
+
+static const struct intel_early_ops gen9_early_ops __initconst = {
+ .stolen_base = gen3_stolen_base,
+ .stolen_size = gen9_stolen_size,
+};
+
+static const struct intel_early_ops chv_early_ops __initconst = {
+ .stolen_base = gen3_stolen_base,
+ .stolen_size = chv_stolen_size,
+};
+
+static const struct intel_early_ops gen11_early_ops __initconst = {
+ .stolen_base = gen11_stolen_base,
+ .stolen_size = gen9_stolen_size,
+};
+
+/* Intel integrated GPUs for which we need to reserve "stolen memory" */
+static const struct pci_device_id intel_early_ids[] __initconst = {
+ INTEL_I830_IDS(INTEL_VGA_DEVICE, &i830_early_ops),
+ INTEL_I845G_IDS(INTEL_VGA_DEVICE, &i845_early_ops),
+ INTEL_I85X_IDS(INTEL_VGA_DEVICE, &i85x_early_ops),
+ INTEL_I865G_IDS(INTEL_VGA_DEVICE, &i865_early_ops),
+ INTEL_I915G_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_I915GM_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_I945G_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_I945GM_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_VLV_IDS(INTEL_VGA_DEVICE, &gen6_early_ops),
+ INTEL_PNV_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_I965G_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_G33_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_I965GM_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_GM45_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_G45_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_ILK_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_SNB_IDS(INTEL_VGA_DEVICE, &gen6_early_ops),
+ INTEL_IVB_IDS(INTEL_VGA_DEVICE, &gen6_early_ops),
+ INTEL_HSW_IDS(INTEL_VGA_DEVICE, &gen6_early_ops),
+ INTEL_BDW_IDS(INTEL_VGA_DEVICE, &gen8_early_ops),
+ INTEL_CHV_IDS(INTEL_VGA_DEVICE, &chv_early_ops),
+ INTEL_SKL_IDS(INTEL_VGA_DEVICE, &gen9_early_ops),
+ INTEL_BXT_IDS(INTEL_VGA_DEVICE, &gen9_early_ops),
+ INTEL_KBL_IDS(INTEL_VGA_DEVICE, &gen9_early_ops),
+ INTEL_CFL_IDS(INTEL_VGA_DEVICE, &gen9_early_ops),
+ INTEL_WHL_IDS(INTEL_VGA_DEVICE, &gen9_early_ops),
+ INTEL_CML_IDS(INTEL_VGA_DEVICE, &gen9_early_ops),
+ INTEL_GLK_IDS(INTEL_VGA_DEVICE, &gen9_early_ops),
+ INTEL_CNL_IDS(INTEL_VGA_DEVICE, &gen9_early_ops),
+ INTEL_ICL_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+ INTEL_EHL_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+ INTEL_JSL_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+ INTEL_TGL_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+ INTEL_RKL_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+ INTEL_ADLS_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+ INTEL_ADLP_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+ INTEL_ADLN_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+ INTEL_RPLS_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+ INTEL_RPLU_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+ INTEL_RPLP_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+};
+
+struct resource intel_graphics_stolen_res __ro_after_init = DEFINE_RES_MEM(0, 0);
+EXPORT_SYMBOL(intel_graphics_stolen_res);
+
+static void __init
+intel_graphics_stolen(int num, int slot, int func,
+ const struct intel_early_ops *early_ops)
+{
+ resource_size_t base, size;
+ resource_size_t end;
+
+ size = early_ops->stolen_size(num, slot, func);
+ base = early_ops->stolen_base(num, slot, func, size);
+
+ if (!size || !base)
+ return;
+
+ end = base + size - 1;
+
+ intel_graphics_stolen_res.start = base;
+ intel_graphics_stolen_res.end = end;
+
+ printk(KERN_INFO "Reserving Intel graphics memory at %pR\n",
+ &intel_graphics_stolen_res);
+
+ /* Mark this space as reserved */
+ e820__range_add(base, size, E820_TYPE_RESERVED);
+ e820__update_table(e820_table);
+}
+
+static void __init intel_graphics_quirks(int num, int slot, int func)
+{
+ const struct intel_early_ops *early_ops;
+ u16 device;
+ int i;
+
+ /*
+ * Reserve "stolen memory" for an integrated GPU. If we've already
+ * found one, there's nothing to do for other (discrete) GPUs.
+ */
+ if (resource_size(&intel_graphics_stolen_res))
+ return;
+
+ device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID);
+
+ for (i = 0; i < ARRAY_SIZE(intel_early_ids); i++) {
+ kernel_ulong_t driver_data = intel_early_ids[i].driver_data;
+
+ if (intel_early_ids[i].device != device)
+ continue;
+
+ early_ops = (typeof(early_ops))driver_data;
+
+ intel_graphics_stolen(num, slot, func, early_ops);
+
+ return;
+ }
+}
+
+static void __init force_disable_hpet(int num, int slot, int func)
+{
+#ifdef CONFIG_HPET_TIMER
+ boot_hpet_disable = true;
+ pr_info("x86/hpet: Will disable the HPET for this platform because it's not reliable\n");
+#endif
+}
+
+#define BCM4331_MMIO_SIZE 16384
+#define BCM4331_PM_CAP 0x40
+#define bcma_aread32(reg) ioread32(mmio + 1 * BCMA_CORE_SIZE + reg)
+#define bcma_awrite32(reg, val) iowrite32(val, mmio + 1 * BCMA_CORE_SIZE + reg)
+
+static void __init apple_airport_reset(int bus, int slot, int func)
+{
+ void __iomem *mmio;
+ u16 pmcsr;
+ u64 addr;
+ int i;
+
+ if (!x86_apple_machine)
+ return;
+
+ /* Card may have been put into PCI_D3hot by grub quirk */
+ pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL);
+
+ if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) {
+ pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
+ write_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL, pmcsr);
+ mdelay(10);
+
+ pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL);
+ if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) {
+ pr_err("pci 0000:%02x:%02x.%d: Cannot power up Apple AirPort card\n",
+ bus, slot, func);
+ return;
+ }
+ }
+
+ addr = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0);
+ addr |= (u64)read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_1) << 32;
+ addr &= PCI_BASE_ADDRESS_MEM_MASK;
+
+ mmio = early_ioremap(addr, BCM4331_MMIO_SIZE);
+ if (!mmio) {
+ pr_err("pci 0000:%02x:%02x.%d: Cannot iomap Apple AirPort card\n",
+ bus, slot, func);
+ return;
+ }
+
+ pr_info("Resetting Apple AirPort card (left enabled by EFI)\n");
+
+ for (i = 0; bcma_aread32(BCMA_RESET_ST) && i < 30; i++)
+ udelay(10);
+
+ bcma_awrite32(BCMA_RESET_CTL, BCMA_RESET_CTL_RESET);
+ bcma_aread32(BCMA_RESET_CTL);
+ udelay(1);
+
+ bcma_awrite32(BCMA_RESET_CTL, 0);
+ bcma_aread32(BCMA_RESET_CTL);
+ udelay(10);
+
+ early_iounmap(mmio, BCM4331_MMIO_SIZE);
+}
+
#define QFLAG_APPLY_ONCE 0x1
#define QFLAG_APPLIED 0x2
#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED)
@@ -203,12 +696,6 @@ struct chipset {
void (*f)(int num, int slot, int func);
};
-/*
- * Only works for devices on the root bus. If you add any devices
- * not on bus 0 readd another loop level in early_quirks(). But
- * be careful because at least the Nvidia quirk here relies on
- * only matching on bus 0.
- */
static struct chipset early_qrk[] __initdata = {
{ PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID,
PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs },
@@ -220,9 +707,31 @@ static struct chipset early_qrk[] __initdata = {
PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs },
{ PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd },
+ { PCI_VENDOR_ID_INTEL, 0x3403, PCI_CLASS_BRIDGE_HOST,
+ PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check },
+ { PCI_VENDOR_ID_INTEL, 0x3405, PCI_CLASS_BRIDGE_HOST,
+ PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check },
+ { PCI_VENDOR_ID_INTEL, 0x3406, PCI_CLASS_BRIDGE_HOST,
+ PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check },
+ { PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID,
+ 0, intel_graphics_quirks },
+ /*
+ * HPET on the current version of the Baytrail platform has accuracy
+ * problems: it will halt in deep idle state - so we disable it.
+ *
+ * More details can be found in section 18.10.1.3 of the datasheet:
+ *
+ * http://www.intel.com/content/dam/www/public/us/en/documents/datasheets/atom-z8000-datasheet-vol-1.pdf
+ */
+ { PCI_VENDOR_ID_INTEL, 0x0f00,
+ PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
+ { PCI_VENDOR_ID_BROADCOM, 0x4331,
+ PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset},
{}
};
+static void __init early_pci_scan_bus(int bus);
+
/**
* check_dev_quirk - apply early quirks to a given PCI device
* @num: bus number
@@ -231,7 +740,7 @@ static struct chipset early_qrk[] __initdata = {
*
* Check the vendor & device ID against the early quirks table.
*
- * If the device is single function, let early_quirks() know so we don't
+ * If the device is single function, let early_pci_scan_bus() know so we don't
* poke at this device again.
*/
static int __init check_dev_quirk(int num, int slot, int func)
@@ -240,6 +749,7 @@ static int __init check_dev_quirk(int num, int slot, int func)
u16 vendor;
u16 device;
u8 type;
+ u8 sec;
int i;
class = read_pci_config_16(num, slot, func, PCI_CLASS_DEVICE);
@@ -267,25 +777,36 @@ static int __init check_dev_quirk(int num, int slot, int func)
type = read_pci_config_byte(num, slot, func,
PCI_HEADER_TYPE);
- if (!(type & 0x80))
+
+ if ((type & PCI_HEADER_TYPE_MASK) == PCI_HEADER_TYPE_BRIDGE) {
+ sec = read_pci_config_byte(num, slot, func, PCI_SECONDARY_BUS);
+ if (sec > num)
+ early_pci_scan_bus(sec);
+ }
+
+ if (!(type & PCI_HEADER_TYPE_MFD))
return -1;
return 0;
}
-void __init early_quirks(void)
+static void __init early_pci_scan_bus(int bus)
{
int slot, func;
- if (!early_pci_allowed())
- return;
-
/* Poor man's PCI discovery */
- /* Only scan the root bus */
for (slot = 0; slot < 32; slot++)
for (func = 0; func < 8; func++) {
/* Only probe function 0 on single fn devices */
- if (check_dev_quirk(0, slot, func))
+ if (check_dev_quirk(bus, slot, func))
break;
}
}
+
+void __init early_quirks(void)
+{
+ if (!early_pci_allowed())
+ return;
+
+ early_pci_scan_bus(0);
+}
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 335f049d110f..cba75306e5b6 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -1,5 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/console.h>
#include <linux/kernel.h>
+#include <linux/kexec.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/screen_info.h>
@@ -7,6 +9,7 @@
#include <linux/pci_regs.h>
#include <linux/pci_ids.h>
#include <linux/errno.h>
+#include <linux/pgtable.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/fcntl.h>
@@ -14,8 +17,10 @@
#include <xen/hvc-console.h>
#include <asm/pci-direct.h>
#include <asm/fixmap.h>
-#include <asm/pgtable.h>
#include <linux/usb/ehci_def.h>
+#include <linux/usb/xhci-dbgp.h>
+#include <asm/pci_x86.h>
+#include <linux/static_call.h>
/* Simple VGA output */
#define VGABASE (__ISA_IO_base + 0xb8000)
@@ -41,6 +46,14 @@ static void early_vga_write(struct console *con, const char *str, unsigned n)
writew(0x720, VGABASE + 2*(max_xpos*j + i));
current_ypos = max_ypos-1;
}
+#ifdef CONFIG_KGDB_KDB
+ if (c == '\b') {
+ if (current_xpos > 0)
+ current_xpos--;
+ } else if (c == '\r') {
+ current_xpos = 0;
+ } else
+#endif
if (c == '\n') {
current_xpos = 0;
current_ypos++;
@@ -65,7 +78,7 @@ static struct console early_vga_console = {
/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */
-static int early_serial_base = 0x3f8; /* ttyS0 */
+static unsigned long early_serial_base = 0x3f8; /* ttyS0 */
#define XMTRDY 0x20
@@ -83,13 +96,28 @@ static int early_serial_base = 0x3f8; /* ttyS0 */
#define DLL 0 /* Divisor Latch Low */
#define DLH 1 /* Divisor latch High */
+static __noendbr unsigned int io_serial_in(unsigned long addr, int offset)
+{
+ return inb(addr + offset);
+}
+ANNOTATE_NOENDBR_SYM(io_serial_in);
+
+static __noendbr void io_serial_out(unsigned long addr, int offset, int value)
+{
+ outb(value, addr + offset);
+}
+ANNOTATE_NOENDBR_SYM(io_serial_out);
+
+DEFINE_STATIC_CALL(serial_in, io_serial_in);
+DEFINE_STATIC_CALL(serial_out, io_serial_out);
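+
+/*
+ * Port I/O is the default; an MMIO-based UART (see mem32_serial_in/out
+ * further below) can be swapped in through these static calls.
+ */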
+
static int early_serial_putc(unsigned char ch)
{
unsigned timeout = 0xffff;
- while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
+ while ((static_call(serial_in)(early_serial_base, LSR) & XMTRDY) == 0 && --timeout)
cpu_relax();
- outb(ch, early_serial_base + TXR);
+ static_call(serial_out)(early_serial_base, TXR, ch);
return timeout ? 0 : -1;
}
@@ -103,13 +131,33 @@ static void early_serial_write(struct console *con, const char *s, unsigned n)
}
}
+static __init void early_serial_hw_init(unsigned divisor)
+{
+ unsigned char c;
+
+ static_call(serial_out)(early_serial_base, LCR, 0x3); /* 8n1 */
+ static_call(serial_out)(early_serial_base, IER, 0); /* no interrupt */
+ static_call(serial_out)(early_serial_base, FCR, 0); /* no fifo */
+ static_call(serial_out)(early_serial_base, MCR, 0x3); /* DTR + RTS */
+
+ c = static_call(serial_in)(early_serial_base, LCR);
+ static_call(serial_out)(early_serial_base, LCR, c | DLAB);
+ static_call(serial_out)(early_serial_base, DLL, divisor & 0xff);
+ static_call(serial_out)(early_serial_base, DLH, (divisor >> 8) & 0xff);
+ static_call(serial_out)(early_serial_base, LCR, c & ~DLAB);
+
+#if defined(CONFIG_KEXEC_CORE) && defined(CONFIG_X86_64)
+ if (static_call_query(serial_in) == io_serial_in)
+ kexec_debug_8250_port = early_serial_base;
+#endif
+}
+
#define DEFAULT_BAUD 9600
static __init void early_serial_init(char *s)
{
- unsigned char c;
unsigned divisor;
- unsigned baud = DEFAULT_BAUD;
+ unsigned long baud = DEFAULT_BAUD;
char *e;
if (*s == ',')
@@ -134,811 +182,267 @@ static __init void early_serial_init(char *s)
s++;
}
- outb(0x3, early_serial_base + LCR); /* 8n1 */
- outb(0, early_serial_base + IER); /* no interrupt */
- outb(0, early_serial_base + FCR); /* no fifo */
- outb(0x3, early_serial_base + MCR); /* DTR + RTS */
-
if (*s) {
- baud = simple_strtoul(s, &e, 0);
+ baud = simple_strtoull(s, &e, 0);
+
if (baud == 0 || s == e)
baud = DEFAULT_BAUD;
}
+ /* Convert from baud to divisor value */
divisor = 115200 / baud;
- c = inb(early_serial_base + LCR);
- outb(c | DLAB, early_serial_base + LCR);
- outb(divisor & 0xff, early_serial_base + DLL);
- outb((divisor >> 8) & 0xff, early_serial_base + DLH);
- outb(c & ~DLAB, early_serial_base + LCR);
-}
-
-static struct console early_serial_console = {
- .name = "earlyser",
- .write = early_serial_write,
- .flags = CON_PRINTBUFFER,
- .index = -1,
-};
-#ifdef CONFIG_EARLY_PRINTK_DBGP
-
-static struct ehci_caps __iomem *ehci_caps;
-static struct ehci_regs __iomem *ehci_regs;
-static struct ehci_dbg_port __iomem *ehci_debug;
-static unsigned int dbgp_endpoint_out;
-
-struct ehci_dev {
- u32 bus;
- u32 slot;
- u32 func;
-};
-
-static struct ehci_dev ehci_dev;
-
-#define USB_DEBUG_DEVNUM 127
-
-#define DBGP_DATA_TOGGLE 0x8800
+ /* Set up the HW */
+ early_serial_hw_init(divisor);
+}
-static inline u32 dbgp_pid_update(u32 x, u32 tok)
+static __noendbr void mem32_serial_out(unsigned long addr, int offset, int value)
{
- return ((x ^ DBGP_DATA_TOGGLE) & 0xffff00) | (tok & 0xff);
+ u32 __iomem *vaddr = (u32 __iomem *)addr;
+ /* shift implied by pointer type */
+ writel(value, vaddr + offset);
}
+ANNOTATE_NOENDBR_SYM(mem32_serial_out);
-static inline u32 dbgp_len_update(u32 x, u32 len)
+static __noendbr unsigned int mem32_serial_in(unsigned long addr, int offset)
{
- return (x & ~0x0f) | (len & 0x0f);
+ u32 __iomem *vaddr = (u32 __iomem *)addr;
+ /* shift implied by pointer type */
+ return readl(vaddr + offset);
}
+ANNOTATE_NOENDBR_SYM(mem32_serial_in);
/*
- * USB Packet IDs (PIDs)
+ * early_mmio_serial_init() - Initialize MMIO-based early serial console.
+ * @s: MMIO-based serial specification.
*/
-
-/* token */
-#define USB_PID_OUT 0xe1
-#define USB_PID_IN 0x69
-#define USB_PID_SOF 0xa5
-#define USB_PID_SETUP 0x2d
-/* handshake */
-#define USB_PID_ACK 0xd2
-#define USB_PID_NAK 0x5a
-#define USB_PID_STALL 0x1e
-#define USB_PID_NYET 0x96
-/* data */
-#define USB_PID_DATA0 0xc3
-#define USB_PID_DATA1 0x4b
-#define USB_PID_DATA2 0x87
-#define USB_PID_MDATA 0x0f
-/* Special */
-#define USB_PID_PREAMBLE 0x3c
-#define USB_PID_ERR 0x3c
-#define USB_PID_SPLIT 0x78
-#define USB_PID_PING 0xb4
-#define USB_PID_UNDEF_0 0xf0
-
-#define USB_PID_DATA_TOGGLE 0x88
-#define DBGP_CLAIM (DBGP_OWNER | DBGP_ENABLED | DBGP_INUSE)
-
-#define PCI_CAP_ID_EHCI_DEBUG 0xa
-
-#define HUB_ROOT_RESET_TIME 50 /* times are in msec */
-#define HUB_SHORT_RESET_TIME 10
-#define HUB_LONG_RESET_TIME 200
-#define HUB_RESET_TIMEOUT 500
-
-#define DBGP_MAX_PACKET 8
-
-static int dbgp_wait_until_complete(void)
-{
- u32 ctrl;
- int loop = 0x100000;
-
- do {
- ctrl = readl(&ehci_debug->control);
- /* Stop when the transaction is finished */
- if (ctrl & DBGP_DONE)
- break;
- } while (--loop > 0);
-
- if (!loop)
- return -1;
-
- /*
- * Now that we have observed the completed transaction,
- * clear the done bit.
- */
- writel(ctrl | DBGP_DONE, &ehci_debug->control);
- return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl);
-}
-
-static void __init dbgp_mdelay(int ms)
-{
- int i;
-
- while (ms--) {
- for (i = 0; i < 1000; i++)
- outb(0x1, 0x80);
- }
-}
-
-static void dbgp_breath(void)
-{
- /* Sleep to give the debug port a chance to breathe */
-}
-
-static int dbgp_wait_until_done(unsigned ctrl)
-{
- u32 pids, lpid;
- int ret;
- int loop = 3;
-
-retry:
- writel(ctrl | DBGP_GO, &ehci_debug->control);
- ret = dbgp_wait_until_complete();
- pids = readl(&ehci_debug->pids);
- lpid = DBGP_PID_GET(pids);
-
- if (ret < 0)
- return ret;
-
- /*
- * If the port is getting full or it has dropped data
- * start pacing ourselves, not necessary but it's friendly.
- */
- if ((lpid == USB_PID_NAK) || (lpid == USB_PID_NYET))
- dbgp_breath();
-
- /* If I get a NACK reissue the transmission */
- if (lpid == USB_PID_NAK) {
- if (--loop > 0)
- goto retry;
- }
-
- return ret;
-}
-
-static void dbgp_set_data(const void *buf, int size)
+static __init void early_mmio_serial_init(char *s)
{
- const unsigned char *bytes = buf;
- u32 lo, hi;
- int i;
-
- lo = hi = 0;
- for (i = 0; i < 4 && i < size; i++)
- lo |= bytes[i] << (8*i);
- for (; i < 8 && i < size; i++)
- hi |= bytes[i] << (8*(i - 4));
- writel(lo, &ehci_debug->data03);
- writel(hi, &ehci_debug->data47);
-}
-
-static void __init dbgp_get_data(void *buf, int size)
-{
- unsigned char *bytes = buf;
- u32 lo, hi;
- int i;
-
- lo = readl(&ehci_debug->data03);
- hi = readl(&ehci_debug->data47);
- for (i = 0; i < 4 && i < size; i++)
- bytes[i] = (lo >> (8*i)) & 0xff;
- for (; i < 8 && i < size; i++)
- bytes[i] = (hi >> (8*(i - 4))) & 0xff;
-}
-
-static int dbgp_bulk_write(unsigned devnum, unsigned endpoint,
- const char *bytes, int size)
-{
- u32 pids, addr, ctrl;
- int ret;
-
- if (size > DBGP_MAX_PACKET)
- return -1;
-
- addr = DBGP_EPADDR(devnum, endpoint);
-
- pids = readl(&ehci_debug->pids);
- pids = dbgp_pid_update(pids, USB_PID_OUT);
-
- ctrl = readl(&ehci_debug->control);
- ctrl = dbgp_len_update(ctrl, size);
- ctrl |= DBGP_OUT;
- ctrl |= DBGP_GO;
-
- dbgp_set_data(bytes, size);
- writel(addr, &ehci_debug->address);
- writel(pids, &ehci_debug->pids);
-
- ret = dbgp_wait_until_done(ctrl);
- if (ret < 0)
- return ret;
-
- return ret;
-}
-
-static int __init dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
- int size)
-{
- u32 pids, addr, ctrl;
- int ret;
-
- if (size > DBGP_MAX_PACKET)
- return -1;
-
- addr = DBGP_EPADDR(devnum, endpoint);
-
- pids = readl(&ehci_debug->pids);
- pids = dbgp_pid_update(pids, USB_PID_IN);
-
- ctrl = readl(&ehci_debug->control);
- ctrl = dbgp_len_update(ctrl, size);
- ctrl &= ~DBGP_OUT;
- ctrl |= DBGP_GO;
-
- writel(addr, &ehci_debug->address);
- writel(pids, &ehci_debug->pids);
- ret = dbgp_wait_until_done(ctrl);
- if (ret < 0)
- return ret;
-
- if (size > ret)
- size = ret;
- dbgp_get_data(data, size);
- return ret;
-}
-
-static int __init dbgp_control_msg(unsigned devnum, int requesttype,
- int request, int value, int index, void *data, int size)
-{
- u32 pids, addr, ctrl;
- struct usb_ctrlrequest req;
- int read;
- int ret;
-
- read = (requesttype & USB_DIR_IN) != 0;
- if (size > (read ? DBGP_MAX_PACKET:0))
- return -1;
-
- /* Compute the control message */
- req.bRequestType = requesttype;
- req.bRequest = request;
- req.wValue = cpu_to_le16(value);
- req.wIndex = cpu_to_le16(index);
- req.wLength = cpu_to_le16(size);
-
- pids = DBGP_PID_SET(USB_PID_DATA0, USB_PID_SETUP);
- addr = DBGP_EPADDR(devnum, 0);
-
- ctrl = readl(&ehci_debug->control);
- ctrl = dbgp_len_update(ctrl, sizeof(req));
- ctrl |= DBGP_OUT;
- ctrl |= DBGP_GO;
-
- /* Send the setup message */
- dbgp_set_data(&req, sizeof(req));
- writel(addr, &ehci_debug->address);
- writel(pids, &ehci_debug->pids);
- ret = dbgp_wait_until_done(ctrl);
- if (ret < 0)
- return ret;
-
- /* Read the result */
- return dbgp_bulk_read(devnum, 0, data, size);
-}
-
-
-/* Find a PCI capability */
-static u32 __init find_cap(u32 num, u32 slot, u32 func, int cap)
-{
- u8 pos;
- int bytes;
-
- if (!(read_pci_config_16(num, slot, func, PCI_STATUS) &
- PCI_STATUS_CAP_LIST))
- return 0;
-
- pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST);
- for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) {
- u8 id;
-
- pos &= ~3;
- id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID);
- if (id == 0xff)
- break;
- if (id == cap)
- return pos;
-
- pos = read_pci_config_byte(num, slot, func,
- pos+PCI_CAP_LIST_NEXT);
- }
- return 0;
-}
-
-static u32 __init __find_dbgp(u32 bus, u32 slot, u32 func)
-{
- u32 class;
-
- class = read_pci_config(bus, slot, func, PCI_CLASS_REVISION);
- if ((class >> 8) != PCI_CLASS_SERIAL_USB_EHCI)
- return 0;
-
- return find_cap(bus, slot, func, PCI_CAP_ID_EHCI_DEBUG);
-}
-
-static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc)
-{
- u32 bus, slot, func;
-
- for (bus = 0; bus < 256; bus++) {
- for (slot = 0; slot < 32; slot++) {
- for (func = 0; func < 8; func++) {
- unsigned cap;
-
- cap = __find_dbgp(bus, slot, func);
-
- if (!cap)
- continue;
- if (ehci_num-- != 0)
- continue;
- *rbus = bus;
- *rslot = slot;
- *rfunc = func;
- return cap;
- }
- }
- }
- return 0;
-}
+ unsigned long baudrate;
+ unsigned long membase;
+ char *e;
-static int __init ehci_reset_port(int port)
-{
- u32 portsc;
- u32 delay_time, delay;
- int loop;
-
- /* Reset the usb debug port */
- portsc = readl(&ehci_regs->port_status[port - 1]);
- portsc &= ~PORT_PE;
- portsc |= PORT_RESET;
- writel(portsc, &ehci_regs->port_status[port - 1]);
-
- delay = HUB_ROOT_RESET_TIME;
- for (delay_time = 0; delay_time < HUB_RESET_TIMEOUT;
- delay_time += delay) {
- dbgp_mdelay(delay);
-
- portsc = readl(&ehci_regs->port_status[port - 1]);
- if (portsc & PORT_RESET) {
- /* force reset to complete */
- loop = 2;
- writel(portsc & ~(PORT_RWC_BITS | PORT_RESET),
- &ehci_regs->port_status[port - 1]);
- do {
- portsc = readl(&ehci_regs->port_status[port-1]);
- } while ((portsc & PORT_RESET) && (--loop > 0));
- }
+ if (*s == ',')
+ s++;
- /* Device went away? */
- if (!(portsc & PORT_CONNECT))
- return -ENOTCONN;
+ if (!strncmp(s, "0x", 2)) {
+ /* NB: only 32-bit addresses are supported. */
+ membase = simple_strtoul(s, &e, 16);
+ early_serial_base = (unsigned long)early_ioremap(membase, PAGE_SIZE);
- /* bomb out completely if something weird happend */
- if ((portsc & PORT_CSC))
- return -EINVAL;
+ static_call_update(serial_in, mem32_serial_in);
+ static_call_update(serial_out, mem32_serial_out);
- /* If we've finished resetting, then break out of the loop */
- if (!(portsc & PORT_RESET) && (portsc & PORT_PE))
- return 0;
+ s += strcspn(s, ",");
+ if (*s == ',')
+ s++;
}
- return -EBUSY;
-}
-static int __init ehci_wait_for_port(int port)
-{
- u32 status;
- int ret, reps;
-
- for (reps = 0; reps < 3; reps++) {
- dbgp_mdelay(100);
- status = readl(&ehci_regs->status);
- if (status & STS_PCD) {
- ret = ehci_reset_port(port);
- if (ret == 0)
- return 0;
- }
+ if (!strncmp(s, "nocfg", 5)) {
+ baudrate = 0;
+ } else {
+ baudrate = simple_strtoul(s, &e, 0);
+ if (baudrate == 0 || s == e)
+ baudrate = DEFAULT_BAUD;
}
- return -ENOTCONN;
-}
-
-#ifdef DBGP_DEBUG
-# define dbgp_printk early_printk
-#else
-static inline void dbgp_printk(const char *fmt, ...) { }
-#endif
-
-typedef void (*set_debug_port_t)(int port);
-static void __init default_set_debug_port(int port)
-{
-}
-
-static set_debug_port_t __initdata set_debug_port = default_set_debug_port;
-
-static void __init nvidia_set_debug_port(int port)
-{
- u32 dword;
- dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
- 0x74);
- dword &= ~(0x0f<<12);
- dword |= ((port & 0x0f)<<12);
- write_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, 0x74,
- dword);
- dbgp_printk("set debug port to %d\n", port);
-}
-
-static void __init detect_set_debug_port(void)
-{
- u32 vendorid;
-
- vendorid = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
- 0x00);
-
- if ((vendorid & 0xffff) == 0x10de) {
- dbgp_printk("using nvidia set_debug_port\n");
- set_debug_port = nvidia_set_debug_port;
- }
+ if (baudrate)
+ early_serial_hw_init(115200 / baudrate);
}
-static int __init ehci_setup(void)
+#ifdef CONFIG_PCI
+/*
+ * early_pci_serial_init()
+ *
+ * This function is invoked when the early_printk param starts with "pciserial".
+ * The rest of the param should be "[force],B:D.F,baud", where B, D & F describe
+ * the location of a PCI device that must be a UART device. "force" is optional
+ * and overrides the use of a UART device with a wrong PCI class code.
+ */
+static __init void early_pci_serial_init(char *s)
{
- struct usb_debug_descriptor dbgp_desc;
- u32 cmd, ctrl, status, portsc, hcs_params;
- u32 debug_port, new_debug_port = 0, n_ports;
- u32 devnum;
- int ret, i;
- int loop;
- int port_map_tried;
- int playtimes = 3;
-
-try_next_time:
- port_map_tried = 0;
-
-try_next_port:
-
- hcs_params = readl(&ehci_caps->hcs_params);
- debug_port = HCS_DEBUG_PORT(hcs_params);
- n_ports = HCS_N_PORTS(hcs_params);
-
- dbgp_printk("debug_port: %d\n", debug_port);
- dbgp_printk("n_ports: %d\n", n_ports);
-
- for (i = 1; i <= n_ports; i++) {
- portsc = readl(&ehci_regs->port_status[i-1]);
- dbgp_printk("portstatus%d: %08x\n", i, portsc);
- }
-
- if (port_map_tried && (new_debug_port != debug_port)) {
- if (--playtimes) {
- set_debug_port(new_debug_port);
- goto try_next_time;
- }
- return -1;
- }
+ unsigned divisor;
+ unsigned long baud = DEFAULT_BAUD;
+ u8 bus, slot, func;
+ u32 classcode, bar0;
+ u16 cmdreg;
+ char *e;
+ int force = 0;
- loop = 10;
- /* Reset the EHCI controller */
- cmd = readl(&ehci_regs->command);
- cmd |= CMD_RESET;
- writel(cmd, &ehci_regs->command);
- do {
- cmd = readl(&ehci_regs->command);
- } while ((cmd & CMD_RESET) && (--loop > 0));
-
- if (!loop) {
- dbgp_printk("can not reset ehci\n");
- return -1;
- }
- dbgp_printk("ehci reset done\n");
-
- /* Claim ownership, but do not enable yet */
- ctrl = readl(&ehci_debug->control);
- ctrl |= DBGP_OWNER;
- ctrl &= ~(DBGP_ENABLED | DBGP_INUSE);
- writel(ctrl, &ehci_debug->control);
-
- /* Start the ehci running */
- cmd = readl(&ehci_regs->command);
- cmd &= ~(CMD_LRESET | CMD_IAAD | CMD_PSE | CMD_ASE | CMD_RESET);
- cmd |= CMD_RUN;
- writel(cmd, &ehci_regs->command);
-
- /* Ensure everything is routed to the EHCI */
- writel(FLAG_CF, &ehci_regs->configured_flag);
-
- /* Wait until the controller is no longer halted */
- loop = 10;
- do {
- status = readl(&ehci_regs->status);
- } while ((status & STS_HALT) && (--loop > 0));
-
- if (!loop) {
- dbgp_printk("ehci can be started\n");
- return -1;
- }
- dbgp_printk("ehci started\n");
+ if (*s == ',')
+ ++s;
- /* Wait for a device to show up in the debug port */
- ret = ehci_wait_for_port(debug_port);
- if (ret < 0) {
- dbgp_printk("No device found in debug port\n");
- goto next_debug_port;
- }
- dbgp_printk("ehci wait for port done\n");
-
- /* Enable the debug port */
- ctrl = readl(&ehci_debug->control);
- ctrl |= DBGP_CLAIM;
- writel(ctrl, &ehci_debug->control);
- ctrl = readl(&ehci_debug->control);
- if ((ctrl & DBGP_CLAIM) != DBGP_CLAIM) {
- dbgp_printk("No device in debug port\n");
- writel(ctrl & ~DBGP_CLAIM, &ehci_debug->control);
- goto err;
- }
- dbgp_printk("debug ported enabled\n");
-
- /* Completely transfer the debug device to the debug controller */
- portsc = readl(&ehci_regs->port_status[debug_port - 1]);
- portsc &= ~PORT_PE;
- writel(portsc, &ehci_regs->port_status[debug_port - 1]);
-
- dbgp_mdelay(100);
-
- /* Find the debug device and make it device number 127 */
- for (devnum = 0; devnum <= 127; devnum++) {
- ret = dbgp_control_msg(devnum,
- USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
- USB_REQ_GET_DESCRIPTOR, (USB_DT_DEBUG << 8), 0,
- &dbgp_desc, sizeof(dbgp_desc));
- if (ret > 0)
- break;
- }
- if (devnum > 127) {
- dbgp_printk("Could not find attached debug device\n");
- goto err;
- }
- if (ret < 0) {
- dbgp_printk("Attached device is not a debug device\n");
- goto err;
- }
- dbgp_endpoint_out = dbgp_desc.bDebugOutEndpoint;
-
- /* Move the device to 127 if it isn't already there */
- if (devnum != USB_DEBUG_DEVNUM) {
- ret = dbgp_control_msg(devnum,
- USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
- USB_REQ_SET_ADDRESS, USB_DEBUG_DEVNUM, 0, NULL, 0);
- if (ret < 0) {
- dbgp_printk("Could not move attached device to %d\n",
- USB_DEBUG_DEVNUM);
- goto err;
- }
- devnum = USB_DEBUG_DEVNUM;
- dbgp_printk("debug device renamed to 127\n");
- }
+ if (*s == 0)
+ return;
- /* Enable the debug interface */
- ret = dbgp_control_msg(USB_DEBUG_DEVNUM,
- USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
- USB_REQ_SET_FEATURE, USB_DEVICE_DEBUG_MODE, 0, NULL, 0);
- if (ret < 0) {
- dbgp_printk(" Could not enable the debug device\n");
- goto err;
+ /* Force the use of a UART device with a wrong class code */
+ if (!strncmp(s, "force,", 6)) {
+ force = 1;
+ s += 6;
}
- dbgp_printk("debug interface enabled\n");
- /* Perform a small write to get the even/odd data state in sync
+ /*
+ * Parse the param to get the BDF values
*/
- ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, dbgp_endpoint_out, " ", 1);
- if (ret < 0) {
- dbgp_printk("dbgp_bulk_write failed: %d\n", ret);
- goto err;
- }
- dbgp_printk("small write doned\n");
-
- return 0;
-err:
- /* Things didn't work so remove my claim */
- ctrl = readl(&ehci_debug->control);
- ctrl &= ~(DBGP_CLAIM | DBGP_OUT);
- writel(ctrl, &ehci_debug->control);
- return -1;
-
-next_debug_port:
- port_map_tried |= (1<<(debug_port - 1));
- new_debug_port = ((debug_port-1+1)%n_ports) + 1;
- if (port_map_tried != ((1<<n_ports) - 1)) {
- set_debug_port(new_debug_port);
- goto try_next_port;
- }
- if (--playtimes) {
- set_debug_port(new_debug_port);
- goto try_next_time;
- }
-
- return -1;
-}
-
-static int __init early_dbgp_init(char *s)
-{
- u32 debug_port, bar, offset;
- u32 bus, slot, func, cap;
- void __iomem *ehci_bar;
- u32 dbgp_num;
- u32 bar_val;
- char *e;
- int ret;
- u8 byte;
-
- if (!early_pci_allowed())
- return -1;
-
- dbgp_num = 0;
- if (*s)
- dbgp_num = simple_strtoul(s, &e, 10);
- dbgp_printk("dbgp_num: %d\n", dbgp_num);
-
- cap = find_dbgp(dbgp_num, &bus, &slot, &func);
- if (!cap)
- return -1;
-
- dbgp_printk("Found EHCI debug port on %02x:%02x.%1x\n", bus, slot,
- func);
-
- debug_port = read_pci_config(bus, slot, func, cap);
- bar = (debug_port >> 29) & 0x7;
- bar = (bar * 4) + 0xc;
- offset = (debug_port >> 16) & 0xfff;
- dbgp_printk("bar: %02x offset: %03x\n", bar, offset);
- if (bar != PCI_BASE_ADDRESS_0) {
- dbgp_printk("only debug ports on bar 1 handled.\n");
+ bus = (u8)simple_strtoul(s, &e, 16);
+ s = e;
+ if (*s != ':')
+ return;
+ ++s;
+ slot = (u8)simple_strtoul(s, &e, 16);
+ s = e;
+ if (*s != '.')
+ return;
+ ++s;
+ func = (u8)simple_strtoul(s, &e, 16);
+ s = e;
- return -1;
- }
+ /* A baud might be following */
+ if (*s == ',')
+ s++;
- bar_val = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0);
- dbgp_printk("bar_val: %02x offset: %03x\n", bar_val, offset);
- if (bar_val & ~PCI_BASE_ADDRESS_MEM_MASK) {
- dbgp_printk("only simple 32bit mmio bars supported\n");
+ /*
+ * Find the device from the BDF
+ */
+ cmdreg = read_pci_config(bus, slot, func, PCI_COMMAND);
+ classcode = read_pci_config(bus, slot, func, PCI_CLASS_REVISION);
+ bar0 = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0);
- return -1;
+ /*
+ * Verify it is a 16550-UART type device
+ */
+ if (((classcode >> 16 != PCI_CLASS_COMMUNICATION_MODEM) &&
+ (classcode >> 16 != PCI_CLASS_COMMUNICATION_SERIAL)) ||
+ (((classcode >> 8) & 0xff) != PCI_SERIAL_16550_COMPATIBLE)) {
+ if (!force)
+ return;
}
- /* double check if the mem space is enabled */
- byte = read_pci_config_byte(bus, slot, func, 0x04);
- if (!(byte & 0x2)) {
- byte |= 0x02;
- write_pci_config_byte(bus, slot, func, 0x04, byte);
- dbgp_printk("mmio for ehci enabled\n");
+ /*
+ * Determine if it is IO or memory mapped
+ */
+ if ((bar0 & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_IO) {
+ /* it is IO mapped */
+ early_serial_base = bar0 & PCI_BASE_ADDRESS_IO_MASK;
+ write_pci_config(bus, slot, func, PCI_COMMAND,
+ cmdreg|PCI_COMMAND_IO);
+ } else {
+ /* It is memory mapped - assume 32-bit alignment */
+ static_call_update(serial_in, mem32_serial_in);
+ static_call_update(serial_out, mem32_serial_out);
+ /* WARNING! assuming the address is always in the first 4G */
+ early_serial_base =
+ (unsigned long)early_ioremap(bar0 & PCI_BASE_ADDRESS_MEM_MASK, 0x10);
+#if defined(CONFIG_KEXEC_CORE) && defined(CONFIG_X86_64)
+ kexec_debug_8250_mmio32 = bar0 & PCI_BASE_ADDRESS_MEM_MASK;
+#endif
+ write_pci_config(bus, slot, func, PCI_COMMAND,
+ cmdreg|PCI_COMMAND_MEMORY);
}
/*
- * FIXME I don't have the bar size so just guess PAGE_SIZE is more
- * than enough. 1K is the biggest I have seen.
+ * Initialize the hardware
*/
- set_fixmap_nocache(FIX_DBGP_BASE, bar_val & PAGE_MASK);
- ehci_bar = (void __iomem *)__fix_to_virt(FIX_DBGP_BASE);
- ehci_bar += bar_val & ~PAGE_MASK;
- dbgp_printk("ehci_bar: %p\n", ehci_bar);
-
- ehci_caps = ehci_bar;
- ehci_regs = ehci_bar + HC_LENGTH(readl(&ehci_caps->hc_capbase));
- ehci_debug = ehci_bar + offset;
- ehci_dev.bus = bus;
- ehci_dev.slot = slot;
- ehci_dev.func = func;
-
- detect_set_debug_port();
-
- ret = ehci_setup();
- if (ret < 0) {
- dbgp_printk("ehci_setup failed\n");
- ehci_debug = NULL;
-
- return -1;
+ if (*s) {
+ if (strcmp(s, "nocfg") == 0)
+ /* Sometimes, we want to leave the UART alone
+ * and assume the BIOS has set it up correctly.
+ * "nocfg" tells us this is the case, and we
+ * should do no more setup.
+ */
+ return;
+ if (kstrtoul(s, 0, &baud) < 0 || baud == 0)
+ baud = DEFAULT_BAUD;
}
- return 0;
-}
-
-static void early_dbgp_write(struct console *con, const char *str, u32 n)
-{
- int chunk, ret;
+ /* Convert from baud to divisor value */
+ divisor = 115200 / baud;
- if (!ehci_debug)
- return;
- while (n > 0) {
- chunk = n;
- if (chunk > DBGP_MAX_PACKET)
- chunk = DBGP_MAX_PACKET;
- ret = dbgp_bulk_write(USB_DEBUG_DEVNUM,
- dbgp_endpoint_out, str, chunk);
- str += chunk;
- n -= chunk;
- }
+ /* Set up the HW */
+ early_serial_hw_init(divisor);
}
+#endif
-static struct console early_dbgp_console = {
- .name = "earlydbg",
- .write = early_dbgp_write,
+static struct console early_serial_console = {
+ .name = "earlyser",
+ .write = early_serial_write,
.flags = CON_PRINTBUFFER,
.index = -1,
};
-#endif
-
-/* Direct interface for emergencies */
-static struct console *early_console = &early_vga_console;
-static int __initdata early_console_initialized;
-asmlinkage void early_printk(const char *fmt, ...)
+static void early_console_register(struct console *con, int keep_early)
{
- char buf[512];
- int n;
- va_list ap;
-
- va_start(ap, fmt);
- n = vscnprintf(buf, sizeof(buf), fmt, ap);
- early_console->write(early_console, buf, n);
- va_end(ap);
+ if (con->index != -1) {
+ printk(KERN_CRIT "ERROR: earlyprintk= %s already used\n",
+ con->name);
+ return;
+ }
+ early_console = con;
+ if (keep_early)
+ early_console->flags &= ~CON_BOOT;
+ else
+ early_console->flags |= CON_BOOT;
+ register_console(early_console);
}
-
static int __init setup_early_printk(char *buf)
{
- int keep_early;
+ int keep;
if (!buf)
return 0;
- if (early_console_initialized)
+ if (early_console)
return 0;
- early_console_initialized = 1;
-
- keep_early = (strstr(buf, "keep") != NULL);
-
- if (!strncmp(buf, "serial", 6)) {
- early_serial_init(buf + 6);
- early_console = &early_serial_console;
- } else if (!strncmp(buf, "ttyS", 4)) {
- early_serial_init(buf);
- early_console = &early_serial_console;
- } else if (!strncmp(buf, "vga", 3)
- && boot_params.screen_info.orig_video_isVGA == 1) {
- max_xpos = boot_params.screen_info.orig_video_cols;
- max_ypos = boot_params.screen_info.orig_video_lines;
- current_ypos = boot_params.screen_info.orig_y;
- early_console = &early_vga_console;
+
+ keep = (strstr(buf, "keep") != NULL);
+
+ while (*buf != '\0') {
+ if (!strncmp(buf, "mmio32", 6)) {
+ buf += 6;
+ early_mmio_serial_init(buf);
+ early_console_register(&early_serial_console, keep);
+ }
+ if (!strncmp(buf, "serial", 6)) {
+ buf += 6;
+ early_serial_init(buf);
+ early_console_register(&early_serial_console, keep);
+ if (!strncmp(buf, ",ttyS", 5))
+ buf += 5;
+ }
+ if (!strncmp(buf, "ttyS", 4)) {
+ early_serial_init(buf + 4);
+ early_console_register(&early_serial_console, keep);
+ }
+#ifdef CONFIG_PCI
+ if (!strncmp(buf, "pciserial", 9)) {
+ buf += 9; /* Keep from matching the above "pciserial" */
+ early_pci_serial_init(buf);
+ early_console_register(&early_serial_console, keep);
+ }
+#endif
+ if (!strncmp(buf, "vga", 3) &&
+ boot_params.screen_info.orig_video_isVGA == 1) {
+ max_xpos = boot_params.screen_info.orig_video_cols;
+ max_ypos = boot_params.screen_info.orig_video_lines;
+ current_ypos = boot_params.screen_info.orig_y;
+ early_console_register(&early_vga_console, keep);
+ }
#ifdef CONFIG_EARLY_PRINTK_DBGP
- } else if (!strncmp(buf, "dbgp", 4)) {
- if (early_dbgp_init(buf+4) < 0)
- return 0;
- early_console = &early_dbgp_console;
- /*
- * usb subsys will reset ehci controller, so don't keep
- * that early console
- */
- keep_early = 0;
+ if (!strncmp(buf, "dbgp", 4) && !early_dbgp_init(buf + 4))
+ early_console_register(&early_dbgp_console, keep);
#endif
#ifdef CONFIG_HVC_XEN
- } else if (!strncmp(buf, "xen", 3)) {
- early_console = &xenboot_console;
+ if (!strncmp(buf, "xen", 3))
+ early_console_register(&xenboot_console, keep);
+#endif
+#ifdef CONFIG_EARLY_PRINTK_USB_XDBC
+ if (!strncmp(buf, "xdbc", 4))
+ early_xdbc_parse_parameter(buf + 4, keep);
#endif
- }
- if (keep_early)
- early_console->flags &= ~CON_BOOT;
- else
- early_console->flags |= CON_BOOT;
- register_console(early_console);
+ buf++;
+ }
return 0;
}
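The early_printk.c changes above route every UART register access through static_call(serial_in)/static_call(serial_out), so the same putc path serves port-I/O, MMIO32 and PCI UARTs, and the divisor is always computed as 115200 / baud. A minimal user-space sketch of that indirection, assuming plain function pointers in place of the kernel's DEFINE_STATIC_CALL()/static_call_update() machinery and a fake backend instead of real hardware:

#include <stdio.h>

typedef unsigned int (*serial_in_t)(unsigned long base, int offset);
typedef void (*serial_out_t)(unsigned long base, int offset, int value);

#define LSR	5
#define TXR	0
#define XMTRDY	0x20

/* Fake port-I/O backend: always reports the transmitter as ready and
 * "transmits" by printing the byte. mem32 variants would be swapped in
 * the same way static_call_update() does in the patch. */
static unsigned int fake_io_in(unsigned long base, int off)
{
	(void)base; (void)off;
	return XMTRDY;
}

static void fake_io_out(unsigned long base, int off, int value)
{
	(void)base; (void)off;
	putchar(value);
}

static serial_in_t  serial_in  = fake_io_in;
static serial_out_t serial_out = fake_io_out;

static int early_serial_putc(unsigned char ch)
{
	unsigned timeout = 0xffff;

	while ((serial_in(0x3f8, LSR) & XMTRDY) == 0 && --timeout)
		;
	serial_out(0x3f8, TXR, ch);
	return timeout ? 0 : -1;
}

int main(void)
{
	unsigned long baud = 115200;

	/* Same divisor computation as early_serial_hw_init()'s caller. */
	printf("divisor for %lu baud: %lu\n", baud, 115200 / baud);
	early_serial_putc('!');
	putchar('\n');
	return 0;
}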
diff --git a/arch/x86/kernel/ebda.c b/arch/x86/kernel/ebda.c
new file mode 100644
index 000000000000..38e7d597b660
--- /dev/null
+++ b/arch/x86/kernel/ebda.c
@@ -0,0 +1,98 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/memblock.h>
+
+#include <asm/setup.h>
+#include <asm/bios_ebda.h>
+
+/*
+ * This function reserves all conventional PC system BIOS related
+ * firmware memory areas (some of which are data, some of which
+ * are code), that must not be used by the kernel as available
+ * RAM.
+ *
+ * The BIOS places the EBDA/XBDA at the top of conventional
+ * memory, and usually decreases the reported amount of
+ * conventional memory (int 0x12) too.
+ *
+ * This means that as a first approximation on most systems we can
+ * guess the reserved BIOS area by looking at the low BIOS RAM size
+ * value and assume that everything above that value (up to 1MB) is
+ * reserved.
+ *
+ * But life in firmware country is not that simple:
+ *
+ * - This code also contains a quirk for Dell systems that neglect
+ * to reserve the EBDA area in the 'RAM size' value ...
+ *
+ * - The same quirk also avoids a problem with the AMD768MPX
+ * chipset: reserve a page before VGA to prevent PCI prefetch
+ * into it (errata #56). (Usually the page is reserved anyways,
+ * unless you have no PS/2 mouse plugged in.)
+ *
+ * - Plus paravirt systems don't have a reliable value in the
+ * 'BIOS RAM size' pointer we can rely on, so we must quirk
+ * them too.
+ *
+ * Due to those various problems this function is deliberately
+ * very conservative and tries to err on the side of reserving
+ * too much, to not risk reserving too little.
+ *
+ * Losing a small amount of memory in the bottom megabyte is
+ * rarely a problem, as long as we have enough memory to install
+ * the SMP bootup trampoline which *must* be in this area.
+ *
+ * Using memory that is in use by the BIOS or by some DMA device
+ * the BIOS didn't shut down *is* a big problem to the kernel,
+ * obviously.
+ */
+
+#define BIOS_RAM_SIZE_KB_PTR 0x413
+
+#define BIOS_START_MIN 0x20000U /* 128K, less than this is insane */
+#define BIOS_START_MAX 0x9f000U /* 640K, absolute maximum */
+
+void __init reserve_bios_regions(void)
+{
+ unsigned int bios_start, ebda_start;
+
+ /*
+ * NOTE: In a paravirtual environment the BIOS reserved
+ * area is absent. We'll just have to assume that the
+ * paravirt case can handle memory setup correctly,
+ * without our help.
+ */
+ if (!x86_platform.legacy.reserve_bios_regions)
+ return;
+
+ /*
+ * BIOS RAM size is encoded in kilobytes, convert it
+ * to bytes to get a first guess at where the BIOS
+ * firmware area starts:
+ */
+ bios_start = *(unsigned short *)__va(BIOS_RAM_SIZE_KB_PTR);
+ bios_start <<= 10;
+
+ /*
+ * If bios_start is less than 128K, assume it is bogus
+ * and bump it up to 640K. Similarly, if bios_start is above 640K,
+ * don't trust it.
+ */
+ if (bios_start < BIOS_START_MIN || bios_start > BIOS_START_MAX)
+ bios_start = BIOS_START_MAX;
+
+ /* Get the start address of the EBDA page: */
+ ebda_start = get_bios_ebda();
+
+ /*
+ * If the EBDA start address is sane and is below the BIOS region,
+ * then also reserve everything from the EBDA start address up to
+ * the BIOS region.
+ */
+ if (ebda_start >= BIOS_START_MIN && ebda_start < bios_start)
+ bios_start = ebda_start;
+
+ /* Reserve all memory between bios_start and the 1MB mark: */
+ memblock_reserve(bios_start, 0x100000 - bios_start);
+}
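The reservation logic in the new ebda.c boils down to a few clamps: the BIOS RAM size at 0x413 (in KB) is shifted to bytes, forced into the [128K, 640K] range, lowered to the EBDA start if that address is sane, and everything from there up to 1MB is reserved. A small sketch of just that arithmetic, using hypothetical input values rather than reads from real low memory:

#include <stdio.h>

#define BIOS_START_MIN	0x20000U	/* 128K */
#define BIOS_START_MAX	0x9f000U	/* 640K */

/* Returns the start of the region that would be reserved up to 1MB,
 * given the raw 0x413 value (in KB) and the EBDA start address. */
static unsigned int bios_reserve_start(unsigned short ram_size_kb,
				       unsigned int ebda_start)
{
	unsigned int bios_start = (unsigned int)ram_size_kb << 10;

	if (bios_start < BIOS_START_MIN || bios_start > BIOS_START_MAX)
		bios_start = BIOS_START_MAX;

	if (ebda_start >= BIOS_START_MIN && ebda_start < bios_start)
		bios_start = ebda_start;

	return bios_start;
}

int main(void)
{
	/* e.g. a BIOS reporting 639K of base RAM with the EBDA at 0x9fc00 */
	unsigned int start = bios_reserve_start(639, 0x9fc00);

	printf("reserve [0x%x, 0x100000)\n", start);
	return 0;
}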
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
deleted file mode 100644
index fe26ba3e3451..000000000000
--- a/arch/x86/kernel/efi.c
+++ /dev/null
@@ -1,606 +0,0 @@
-/*
- * Common EFI (Extensible Firmware Interface) support functions
- * Based on Extensible Firmware Interface Specification version 1.0
- *
- * Copyright (C) 1999 VA Linux Systems
- * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
- * Copyright (C) 1999-2002 Hewlett-Packard Co.
- * David Mosberger-Tang <davidm@hpl.hp.com>
- * Stephane Eranian <eranian@hpl.hp.com>
- * Copyright (C) 2005-2008 Intel Co.
- * Fenghua Yu <fenghua.yu@intel.com>
- * Bibo Mao <bibo.mao@intel.com>
- * Chandramouli Narayanan <mouli@linux.intel.com>
- * Huang Ying <ying.huang@intel.com>
- *
- * Copied from efi_32.c to eliminate the duplicated code between EFI
- * 32/64 support code. --ying 2007-10-26
- *
- * All EFI Runtime Services are not implemented yet as EFI only
- * supports physical mode addressing on SoftSDV. This is to be fixed
- * in a future version. --drummond 1999-07-20
- *
- * Implemented EFI runtime services and virtual mode calls. --davidm
- *
- * Goutham Rao: <goutham.rao@intel.com>
- * Skip non-WB memory and ignore empty memory ranges.
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/efi.h>
-#include <linux/bootmem.h>
-#include <linux/spinlock.h>
-#include <linux/uaccess.h>
-#include <linux/time.h>
-#include <linux/io.h>
-#include <linux/reboot.h>
-#include <linux/bcd.h>
-
-#include <asm/setup.h>
-#include <asm/efi.h>
-#include <asm/time.h>
-#include <asm/cacheflush.h>
-#include <asm/tlbflush.h>
-
-#define EFI_DEBUG 1
-#define PFX "EFI: "
-
-int efi_enabled;
-EXPORT_SYMBOL(efi_enabled);
-
-struct efi efi;
-EXPORT_SYMBOL(efi);
-
-struct efi_memory_map memmap;
-
-static struct efi efi_phys __initdata;
-static efi_system_table_t efi_systab __initdata;
-
-static int __init setup_noefi(char *arg)
-{
- efi_enabled = 0;
- return 0;
-}
-early_param("noefi", setup_noefi);
-
-int add_efi_memmap;
-EXPORT_SYMBOL(add_efi_memmap);
-
-static int __init setup_add_efi_memmap(char *arg)
-{
- add_efi_memmap = 1;
- return 0;
-}
-early_param("add_efi_memmap", setup_add_efi_memmap);
-
-
-static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
-{
- return efi_call_virt2(get_time, tm, tc);
-}
-
-static efi_status_t virt_efi_set_time(efi_time_t *tm)
-{
- return efi_call_virt1(set_time, tm);
-}
-
-static efi_status_t virt_efi_get_wakeup_time(efi_bool_t *enabled,
- efi_bool_t *pending,
- efi_time_t *tm)
-{
- return efi_call_virt3(get_wakeup_time,
- enabled, pending, tm);
-}
-
-static efi_status_t virt_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm)
-{
- return efi_call_virt2(set_wakeup_time,
- enabled, tm);
-}
-
-static efi_status_t virt_efi_get_variable(efi_char16_t *name,
- efi_guid_t *vendor,
- u32 *attr,
- unsigned long *data_size,
- void *data)
-{
- return efi_call_virt5(get_variable,
- name, vendor, attr,
- data_size, data);
-}
-
-static efi_status_t virt_efi_get_next_variable(unsigned long *name_size,
- efi_char16_t *name,
- efi_guid_t *vendor)
-{
- return efi_call_virt3(get_next_variable,
- name_size, name, vendor);
-}
-
-static efi_status_t virt_efi_set_variable(efi_char16_t *name,
- efi_guid_t *vendor,
- unsigned long attr,
- unsigned long data_size,
- void *data)
-{
- return efi_call_virt5(set_variable,
- name, vendor, attr,
- data_size, data);
-}
-
-static efi_status_t virt_efi_get_next_high_mono_count(u32 *count)
-{
- return efi_call_virt1(get_next_high_mono_count, count);
-}
-
-static void virt_efi_reset_system(int reset_type,
- efi_status_t status,
- unsigned long data_size,
- efi_char16_t *data)
-{
- efi_call_virt4(reset_system, reset_type, status,
- data_size, data);
-}
-
-static efi_status_t virt_efi_set_virtual_address_map(
- unsigned long memory_map_size,
- unsigned long descriptor_size,
- u32 descriptor_version,
- efi_memory_desc_t *virtual_map)
-{
- return efi_call_virt4(set_virtual_address_map,
- memory_map_size, descriptor_size,
- descriptor_version, virtual_map);
-}
-
-static efi_status_t __init phys_efi_set_virtual_address_map(
- unsigned long memory_map_size,
- unsigned long descriptor_size,
- u32 descriptor_version,
- efi_memory_desc_t *virtual_map)
-{
- efi_status_t status;
-
- efi_call_phys_prelog();
- status = efi_call_phys4(efi_phys.set_virtual_address_map,
- memory_map_size, descriptor_size,
- descriptor_version, virtual_map);
- efi_call_phys_epilog();
- return status;
-}
-
-static efi_status_t __init phys_efi_get_time(efi_time_t *tm,
- efi_time_cap_t *tc)
-{
- efi_status_t status;
-
- efi_call_phys_prelog();
- status = efi_call_phys2(efi_phys.get_time, tm, tc);
- efi_call_phys_epilog();
- return status;
-}
-
-int efi_set_rtc_mmss(unsigned long nowtime)
-{
- int real_seconds, real_minutes;
- efi_status_t status;
- efi_time_t eft;
- efi_time_cap_t cap;
-
- status = efi.get_time(&eft, &cap);
- if (status != EFI_SUCCESS) {
- printk(KERN_ERR "Oops: efitime: can't read time!\n");
- return -1;
- }
-
- real_seconds = nowtime % 60;
- real_minutes = nowtime / 60;
- if (((abs(real_minutes - eft.minute) + 15)/30) & 1)
- real_minutes += 30;
- real_minutes %= 60;
- eft.minute = real_minutes;
- eft.second = real_seconds;
-
- status = efi.set_time(&eft);
- if (status != EFI_SUCCESS) {
- printk(KERN_ERR "Oops: efitime: can't write time!\n");
- return -1;
- }
- return 0;
-}
-
-unsigned long efi_get_time(void)
-{
- efi_status_t status;
- efi_time_t eft;
- efi_time_cap_t cap;
-
- status = efi.get_time(&eft, &cap);
- if (status != EFI_SUCCESS)
- printk(KERN_ERR "Oops: efitime: can't read time!\n");
-
- return mktime(eft.year, eft.month, eft.day, eft.hour,
- eft.minute, eft.second);
-}
-
-/*
- * Tell the kernel about the EFI memory map. This might include
- * more than the max 128 entries that can fit in the e820 legacy
- * (zeropage) memory map.
- */
-
-static void __init do_add_efi_memmap(void)
-{
- void *p;
-
- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
- efi_memory_desc_t *md = p;
- unsigned long long start = md->phys_addr;
- unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
- int e820_type;
-
- switch (md->type) {
- case EFI_LOADER_CODE:
- case EFI_LOADER_DATA:
- case EFI_BOOT_SERVICES_CODE:
- case EFI_BOOT_SERVICES_DATA:
- case EFI_CONVENTIONAL_MEMORY:
- if (md->attribute & EFI_MEMORY_WB)
- e820_type = E820_RAM;
- else
- e820_type = E820_RESERVED;
- break;
- case EFI_ACPI_RECLAIM_MEMORY:
- e820_type = E820_ACPI;
- break;
- case EFI_ACPI_MEMORY_NVS:
- e820_type = E820_NVS;
- break;
- case EFI_UNUSABLE_MEMORY:
- e820_type = E820_UNUSABLE;
- break;
- default:
- /*
- * EFI_RESERVED_TYPE EFI_RUNTIME_SERVICES_CODE
- * EFI_RUNTIME_SERVICES_DATA EFI_MEMORY_MAPPED_IO
- * EFI_MEMORY_MAPPED_IO_PORT_SPACE EFI_PAL_CODE
- */
- e820_type = E820_RESERVED;
- break;
- }
- e820_add_region(start, size, e820_type);
- }
- sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
-}
-
-void __init efi_reserve_early(void)
-{
- unsigned long pmap;
-
-#ifdef CONFIG_X86_32
- pmap = boot_params.efi_info.efi_memmap;
-#else
- pmap = (boot_params.efi_info.efi_memmap |
- ((__u64)boot_params.efi_info.efi_memmap_hi<<32));
-#endif
- memmap.phys_map = (void *)pmap;
- memmap.nr_map = boot_params.efi_info.efi_memmap_size /
- boot_params.efi_info.efi_memdesc_size;
- memmap.desc_version = boot_params.efi_info.efi_memdesc_version;
- memmap.desc_size = boot_params.efi_info.efi_memdesc_size;
- reserve_early(pmap, pmap + memmap.nr_map * memmap.desc_size,
- "EFI memmap");
-}
-
-#if EFI_DEBUG
-static void __init print_efi_memmap(void)
-{
- efi_memory_desc_t *md;
- void *p;
- int i;
-
- for (p = memmap.map, i = 0;
- p < memmap.map_end;
- p += memmap.desc_size, i++) {
- md = p;
- printk(KERN_INFO PFX "mem%02u: type=%u, attr=0x%llx, "
- "range=[0x%016llx-0x%016llx) (%lluMB)\n",
- i, md->type, md->attribute, md->phys_addr,
- md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT),
- (md->num_pages >> (20 - EFI_PAGE_SHIFT)));
- }
-}
-#endif /* EFI_DEBUG */
-
-void __init efi_init(void)
-{
- efi_config_table_t *config_tables;
- efi_runtime_services_t *runtime;
- efi_char16_t *c16;
- char vendor[100] = "unknown";
- int i = 0;
- void *tmp;
-
-#ifdef CONFIG_X86_32
- efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab;
-#else
- efi_phys.systab = (efi_system_table_t *)
- (boot_params.efi_info.efi_systab |
- ((__u64)boot_params.efi_info.efi_systab_hi<<32));
-#endif
-
- efi.systab = early_ioremap((unsigned long)efi_phys.systab,
- sizeof(efi_system_table_t));
- if (efi.systab == NULL)
- printk(KERN_ERR "Couldn't map the EFI system table!\n");
- memcpy(&efi_systab, efi.systab, sizeof(efi_system_table_t));
- early_iounmap(efi.systab, sizeof(efi_system_table_t));
- efi.systab = &efi_systab;
-
- /*
- * Verify the EFI Table
- */
- if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
- printk(KERN_ERR "EFI system table signature incorrect!\n");
- if ((efi.systab->hdr.revision >> 16) == 0)
- printk(KERN_ERR "Warning: EFI system table version "
- "%d.%02d, expected 1.00 or greater!\n",
- efi.systab->hdr.revision >> 16,
- efi.systab->hdr.revision & 0xffff);
-
- /*
- * Show what we know for posterity
- */
- c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2);
- if (c16) {
- for (i = 0; i < sizeof(vendor) - 1 && *c16; ++i)
- vendor[i] = *c16++;
- vendor[i] = '\0';
- } else
- printk(KERN_ERR PFX "Could not map the firmware vendor!\n");
- early_iounmap(tmp, 2);
-
- printk(KERN_INFO "EFI v%u.%.02u by %s \n",
- efi.systab->hdr.revision >> 16,
- efi.systab->hdr.revision & 0xffff, vendor);
-
- /*
- * Let's see what config tables the firmware passed to us.
- */
- config_tables = early_ioremap(
- efi.systab->tables,
- efi.systab->nr_tables * sizeof(efi_config_table_t));
- if (config_tables == NULL)
- printk(KERN_ERR "Could not map EFI Configuration Table!\n");
-
- printk(KERN_INFO);
- for (i = 0; i < efi.systab->nr_tables; i++) {
- if (!efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID)) {
- efi.mps = config_tables[i].table;
- printk(" MPS=0x%lx ", config_tables[i].table);
- } else if (!efi_guidcmp(config_tables[i].guid,
- ACPI_20_TABLE_GUID)) {
- efi.acpi20 = config_tables[i].table;
- printk(" ACPI 2.0=0x%lx ", config_tables[i].table);
- } else if (!efi_guidcmp(config_tables[i].guid,
- ACPI_TABLE_GUID)) {
- efi.acpi = config_tables[i].table;
- printk(" ACPI=0x%lx ", config_tables[i].table);
- } else if (!efi_guidcmp(config_tables[i].guid,
- SMBIOS_TABLE_GUID)) {
- efi.smbios = config_tables[i].table;
- printk(" SMBIOS=0x%lx ", config_tables[i].table);
-#ifdef CONFIG_X86_UV
- } else if (!efi_guidcmp(config_tables[i].guid,
- UV_SYSTEM_TABLE_GUID)) {
- efi.uv_systab = config_tables[i].table;
- printk(" UVsystab=0x%lx ", config_tables[i].table);
-#endif
- } else if (!efi_guidcmp(config_tables[i].guid,
- HCDP_TABLE_GUID)) {
- efi.hcdp = config_tables[i].table;
- printk(" HCDP=0x%lx ", config_tables[i].table);
- } else if (!efi_guidcmp(config_tables[i].guid,
- UGA_IO_PROTOCOL_GUID)) {
- efi.uga = config_tables[i].table;
- printk(" UGA=0x%lx ", config_tables[i].table);
- }
- }
- printk("\n");
- early_iounmap(config_tables,
- efi.systab->nr_tables * sizeof(efi_config_table_t));
-
- /*
- * Check out the runtime services table. We need to map
- * the runtime services table so that we can grab the physical
- * address of several of the EFI runtime functions, needed to
- * set the firmware into virtual mode.
- */
- runtime = early_ioremap((unsigned long)efi.systab->runtime,
- sizeof(efi_runtime_services_t));
- if (runtime != NULL) {
- /*
- * We will only need *early* access to the following
- * two EFI runtime services before set_virtual_address_map
- * is invoked.
- */
- efi_phys.get_time = (efi_get_time_t *)runtime->get_time;
- efi_phys.set_virtual_address_map =
- (efi_set_virtual_address_map_t *)
- runtime->set_virtual_address_map;
- /*
- * Make efi_get_time can be called before entering
- * virtual mode.
- */
- efi.get_time = phys_efi_get_time;
- } else
- printk(KERN_ERR "Could not map the EFI runtime service "
- "table!\n");
- early_iounmap(runtime, sizeof(efi_runtime_services_t));
-
- /* Map the EFI memory map */
- memmap.map = early_ioremap((unsigned long)memmap.phys_map,
- memmap.nr_map * memmap.desc_size);
- if (memmap.map == NULL)
- printk(KERN_ERR "Could not map the EFI memory map!\n");
- memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
-
- if (memmap.desc_size != sizeof(efi_memory_desc_t))
- printk(KERN_WARNING
- "Kernel-defined memdesc doesn't match the one from EFI!\n");
-
- if (add_efi_memmap)
- do_add_efi_memmap();
-
- /* Setup for EFI runtime service */
- reboot_type = BOOT_EFI;
-
-#if EFI_DEBUG
- print_efi_memmap();
-#endif
-}
-
-static void __init runtime_code_page_mkexec(void)
-{
- efi_memory_desc_t *md;
- void *p;
- u64 addr, npages;
-
- /* Make EFI runtime service code area executable */
- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
- md = p;
-
- if (md->type != EFI_RUNTIME_SERVICES_CODE)
- continue;
-
- addr = md->virt_addr;
- npages = md->num_pages;
- memrange_efi_to_native(&addr, &npages);
- set_memory_x(addr, npages);
- }
-}
-
-/*
- * This function will switch the EFI runtime services to virtual mode.
- * Essentially, look through the EFI memmap and map every region that
- * has the runtime attribute bit set in its memory descriptor and update
- * that memory descriptor with the virtual address obtained from ioremap().
- * This enables the runtime services to be called without having to
- * thunk back into physical mode for every invocation.
- */
-void __init efi_enter_virtual_mode(void)
-{
- efi_memory_desc_t *md;
- efi_status_t status;
- unsigned long size;
- u64 end, systab, addr, npages, end_pfn;
- void *p, *va;
-
- efi.systab = NULL;
- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
- md = p;
- if (!(md->attribute & EFI_MEMORY_RUNTIME))
- continue;
-
- size = md->num_pages << EFI_PAGE_SHIFT;
- end = md->phys_addr + size;
-
- end_pfn = PFN_UP(end);
- if (end_pfn <= max_low_pfn_mapped
- || (end_pfn > (1UL << (32 - PAGE_SHIFT))
- && end_pfn <= max_pfn_mapped))
- va = __va(md->phys_addr);
- else
- va = efi_ioremap(md->phys_addr, size, md->type);
-
- md->virt_addr = (u64) (unsigned long) va;
-
- if (!va) {
- printk(KERN_ERR PFX "ioremap of 0x%llX failed!\n",
- (unsigned long long)md->phys_addr);
- continue;
- }
-
- if (!(md->attribute & EFI_MEMORY_WB)) {
- addr = md->virt_addr;
- npages = md->num_pages;
- memrange_efi_to_native(&addr, &npages);
- set_memory_uc(addr, npages);
- }
-
- systab = (u64) (unsigned long) efi_phys.systab;
- if (md->phys_addr <= systab && systab < end) {
- systab += md->virt_addr - md->phys_addr;
- efi.systab = (efi_system_table_t *) (unsigned long) systab;
- }
- }
-
- BUG_ON(!efi.systab);
-
- status = phys_efi_set_virtual_address_map(
- memmap.desc_size * memmap.nr_map,
- memmap.desc_size,
- memmap.desc_version,
- memmap.phys_map);
-
- if (status != EFI_SUCCESS) {
- printk(KERN_ALERT "Unable to switch EFI into virtual mode "
- "(status=%lx)!\n", status);
- panic("EFI call to SetVirtualAddressMap() failed!");
- }
-
- /*
- * Now that EFI is in virtual mode, update the function
- * pointers in the runtime service table to the new virtual addresses.
- *
- * Call EFI services through wrapper functions.
- */
- efi.get_time = virt_efi_get_time;
- efi.set_time = virt_efi_set_time;
- efi.get_wakeup_time = virt_efi_get_wakeup_time;
- efi.set_wakeup_time = virt_efi_set_wakeup_time;
- efi.get_variable = virt_efi_get_variable;
- efi.get_next_variable = virt_efi_get_next_variable;
- efi.set_variable = virt_efi_set_variable;
- efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count;
- efi.reset_system = virt_efi_reset_system;
- efi.set_virtual_address_map = virt_efi_set_virtual_address_map;
- if (__supported_pte_mask & _PAGE_NX)
- runtime_code_page_mkexec();
- early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size);
- memmap.map = NULL;
-}
-
-/*
- * Convenience functions to obtain memory types and attributes
- */
-u32 efi_mem_type(unsigned long phys_addr)
-{
- efi_memory_desc_t *md;
- void *p;
-
- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
- md = p;
- if ((md->phys_addr <= phys_addr) &&
- (phys_addr < (md->phys_addr +
- (md->num_pages << EFI_PAGE_SHIFT))))
- return md->type;
- }
- return 0;
-}
-
-u64 efi_mem_attributes(unsigned long phys_addr)
-{
- efi_memory_desc_t *md;
- void *p;
-
- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
- md = p;
- if ((md->phys_addr <= phys_addr) &&
- (phys_addr < (md->phys_addr +
- (md->num_pages << EFI_PAGE_SHIFT))))
- return md->attribute;
- }
- return 0;
-}
diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/kernel/efi_32.c
deleted file mode 100644
index 5cab48ee61a4..000000000000
--- a/arch/x86/kernel/efi_32.c
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Extensible Firmware Interface
- *
- * Based on Extensible Firmware Interface Specification version 1.0
- *
- * Copyright (C) 1999 VA Linux Systems
- * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
- * Copyright (C) 1999-2002 Hewlett-Packard Co.
- * David Mosberger-Tang <davidm@hpl.hp.com>
- * Stephane Eranian <eranian@hpl.hp.com>
- *
- * All EFI Runtime Services are not implemented yet as EFI only
- * supports physical mode addressing on SoftSDV. This is to be fixed
- * in a future version. --drummond 1999-07-20
- *
- * Implemented EFI runtime services and virtual mode calls. --davidm
- *
- * Goutham Rao: <goutham.rao@intel.com>
- * Skip non-WB memory and ignore empty memory ranges.
- */
-
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/ioport.h>
-#include <linux/efi.h>
-
-#include <asm/io.h>
-#include <asm/page.h>
-#include <asm/pgtable.h>
-#include <asm/tlbflush.h>
-#include <asm/efi.h>
-
-/*
- * To make EFI call EFI runtime service in physical addressing mode we need
- * prelog/epilog before/after the invocation to disable interrupt, to
- * claim EFI runtime service handler exclusively and to duplicate a memory in
- * low memory space say 0 - 3G.
- */
-
-static unsigned long efi_rt_eflags;
-static pgd_t efi_bak_pg_dir_pointer[2];
-
-void efi_call_phys_prelog(void)
-{
- unsigned long cr4;
- unsigned long temp;
- struct desc_ptr gdt_descr;
-
- local_irq_save(efi_rt_eflags);
-
- /*
- * If I don't have PAE, I should just duplicate two entries in page
- * directory. If I have PAE, I just need to duplicate one entry in
- * page directory.
- */
- cr4 = read_cr4_safe();
-
- if (cr4 & X86_CR4_PAE) {
- efi_bak_pg_dir_pointer[0].pgd =
- swapper_pg_dir[pgd_index(0)].pgd;
- swapper_pg_dir[0].pgd =
- swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd;
- } else {
- efi_bak_pg_dir_pointer[0].pgd =
- swapper_pg_dir[pgd_index(0)].pgd;
- efi_bak_pg_dir_pointer[1].pgd =
- swapper_pg_dir[pgd_index(0x400000)].pgd;
- swapper_pg_dir[pgd_index(0)].pgd =
- swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd;
- temp = PAGE_OFFSET + 0x400000;
- swapper_pg_dir[pgd_index(0x400000)].pgd =
- swapper_pg_dir[pgd_index(temp)].pgd;
- }
-
- /*
- * After the lock is released, the original page table is restored.
- */
- __flush_tlb_all();
-
- gdt_descr.address = __pa(get_cpu_gdt_table(0));
- gdt_descr.size = GDT_SIZE - 1;
- load_gdt(&gdt_descr);
-}
-
-void efi_call_phys_epilog(void)
-{
- unsigned long cr4;
- struct desc_ptr gdt_descr;
-
- gdt_descr.address = (unsigned long)get_cpu_gdt_table(0);
- gdt_descr.size = GDT_SIZE - 1;
- load_gdt(&gdt_descr);
-
- cr4 = read_cr4_safe();
-
- if (cr4 & X86_CR4_PAE) {
- swapper_pg_dir[pgd_index(0)].pgd =
- efi_bak_pg_dir_pointer[0].pgd;
- } else {
- swapper_pg_dir[pgd_index(0)].pgd =
- efi_bak_pg_dir_pointer[0].pgd;
- swapper_pg_dir[pgd_index(0x400000)].pgd =
- efi_bak_pg_dir_pointer[1].pgd;
- }
-
- /*
- * After the lock is released, the original page table is restored.
- */
- __flush_tlb_all();
-
- local_irq_restore(efi_rt_eflags);
-}
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
deleted file mode 100644
index ac0621a7ac3d..000000000000
--- a/arch/x86/kernel/efi_64.c
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * x86_64 specific EFI support functions
- * Based on Extensible Firmware Interface Specification version 1.0
- *
- * Copyright (C) 2005-2008 Intel Co.
- * Fenghua Yu <fenghua.yu@intel.com>
- * Bibo Mao <bibo.mao@intel.com>
- * Chandramouli Narayanan <mouli@linux.intel.com>
- * Huang Ying <ying.huang@intel.com>
- *
- * Code to convert EFI to E820 map has been implemented in elilo bootloader
- * based on a EFI patch by Edgar Hucek. Based on the E820 map, the page table
- * is setup appropriately for EFI runtime code.
- * - mouli 06/14/2007.
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/mm.h>
-#include <linux/types.h>
-#include <linux/spinlock.h>
-#include <linux/bootmem.h>
-#include <linux/ioport.h>
-#include <linux/module.h>
-#include <linux/efi.h>
-#include <linux/uaccess.h>
-#include <linux/io.h>
-#include <linux/reboot.h>
-
-#include <asm/setup.h>
-#include <asm/page.h>
-#include <asm/e820.h>
-#include <asm/pgtable.h>
-#include <asm/tlbflush.h>
-#include <asm/proto.h>
-#include <asm/efi.h>
-#include <asm/cacheflush.h>
-#include <asm/fixmap.h>
-
-static pgd_t save_pgd __initdata;
-static unsigned long efi_flags __initdata;
-
-static void __init early_mapping_set_exec(unsigned long start,
- unsigned long end,
- int executable)
-{
- unsigned long num_pages;
-
- start &= PMD_MASK;
- end = (end + PMD_SIZE - 1) & PMD_MASK;
- num_pages = (end - start) >> PAGE_SHIFT;
- if (executable)
- set_memory_x((unsigned long)__va(start), num_pages);
- else
- set_memory_nx((unsigned long)__va(start), num_pages);
-}
-
-static void __init early_runtime_code_mapping_set_exec(int executable)
-{
- efi_memory_desc_t *md;
- void *p;
-
- if (!(__supported_pte_mask & _PAGE_NX))
- return;
-
- /* Make EFI runtime service code area executable */
- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
- md = p;
- if (md->type == EFI_RUNTIME_SERVICES_CODE) {
- unsigned long end;
- end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT);
- early_mapping_set_exec(md->phys_addr, end, executable);
- }
- }
-}
-
-void __init efi_call_phys_prelog(void)
-{
- unsigned long vaddress;
-
- early_runtime_code_mapping_set_exec(1);
- local_irq_save(efi_flags);
- vaddress = (unsigned long)__va(0x0UL);
- save_pgd = *pgd_offset_k(0x0UL);
- set_pgd(pgd_offset_k(0x0UL), *pgd_offset_k(vaddress));
- __flush_tlb_all();
-}
-
-void __init efi_call_phys_epilog(void)
-{
- /*
- * After the lock is released, the original page table is restored.
- */
- set_pgd(pgd_offset_k(0x0UL), save_pgd);
- __flush_tlb_all();
- local_irq_restore(efi_flags);
- early_runtime_code_mapping_set_exec(0);
-}
-
-void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size,
- u32 type)
-{
- unsigned long last_map_pfn;
-
- if (type == EFI_MEMORY_MAPPED_IO)
- return ioremap(phys_addr, size);
-
- last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size);
- if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size)
- return NULL;
-
- return (void __iomem *)__va(phys_addr);
-}
diff --git a/arch/x86/kernel/efi_stub_32.S b/arch/x86/kernel/efi_stub_32.S
deleted file mode 100644
index fbe66e626c09..000000000000
--- a/arch/x86/kernel/efi_stub_32.S
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * EFI call stub for IA32.
- *
- * This stub allows us to make EFI calls in physical mode with interrupts
- * turned off.
- */
-
-#include <linux/linkage.h>
-#include <asm/page_types.h>
-
-/*
- * efi_call_phys(void *, ...) is a function with variable parameters.
- * All the callers of this function assure that all the parameters are 4-bytes.
- */
-
-/*
- * In gcc calling convention, EBX, ESP, EBP, ESI and EDI are all callee save.
- * So we'd better save all of them at the beginning of this function and restore
- * at the end no matter how many we use, because we can not assure EFI runtime
- * service functions will comply with gcc calling convention, too.
- */
-
-.text
-ENTRY(efi_call_phys)
- /*
- * 0. The function can only be called in Linux kernel. So CS has been
- * set to 0x0010, DS and SS have been set to 0x0018. In EFI, I found
- * the values of these registers are the same. And, the corresponding
- * GDT entries are identical. So I will do nothing about segment reg
- * and GDT, but change GDT base register in prelog and epilog.
- */
-
- /*
- * 1. Now I am running with EIP = <physical address> + PAGE_OFFSET.
- * But to make it smoothly switch from virtual mode to flat mode.
- * The mapping of lower virtual memory has been created in prelog and
- * epilog.
- */
- movl $1f, %edx
- subl $__PAGE_OFFSET, %edx
- jmp *%edx
-1:
-
- /*
- * 2. Now on the top of stack is the return
- * address in the caller of efi_call_phys(), then parameter 1,
- * parameter 2, ..., param n. To make things easy, we save the return
- * address of efi_call_phys in a global variable.
- */
- popl %edx
- movl %edx, saved_return_addr
- /* get the function pointer into ECX*/
- popl %ecx
- movl %ecx, efi_rt_function_ptr
- movl $2f, %edx
- subl $__PAGE_OFFSET, %edx
- pushl %edx
-
- /*
- * 3. Clear PG bit in %CR0.
- */
- movl %cr0, %edx
- andl $0x7fffffff, %edx
- movl %edx, %cr0
- jmp 1f
-1:
-
- /*
- * 4. Adjust stack pointer.
- */
- subl $__PAGE_OFFSET, %esp
-
- /*
- * 5. Call the physical function.
- */
- jmp *%ecx
-
-2:
- /*
- * 6. After EFI runtime service returns, control will return to
- * following instruction. We'd better readjust stack pointer first.
- */
- addl $__PAGE_OFFSET, %esp
-
- /*
- * 7. Restore PG bit
- */
- movl %cr0, %edx
- orl $0x80000000, %edx
- movl %edx, %cr0
- jmp 1f
-1:
- /*
- * 8. Now restore the virtual mode from flat mode by
- * adding EIP with PAGE_OFFSET.
- */
- movl $1f, %edx
- jmp *%edx
-1:
-
- /*
- * 9. Balance the stack. And because EAX contain the return value,
- * we'd better not clobber it.
- */
- leal efi_rt_function_ptr, %edx
- movl (%edx), %ecx
- pushl %ecx
-
- /*
- * 10. Push the saved return address onto the stack and return.
- */
- leal saved_return_addr, %edx
- movl (%edx), %ecx
- pushl %ecx
- ret
-ENDPROC(efi_call_phys)
-.previous
-
-.data
-saved_return_addr:
- .long 0
-efi_rt_function_ptr:
- .long 0
diff --git a/arch/x86/kernel/efi_stub_64.S b/arch/x86/kernel/efi_stub_64.S
deleted file mode 100644
index 4c07ccab8146..000000000000
--- a/arch/x86/kernel/efi_stub_64.S
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Function calling ABI conversion from Linux to EFI for x86_64
- *
- * Copyright (C) 2007 Intel Corp
- * Bibo Mao <bibo.mao@intel.com>
- * Huang Ying <ying.huang@intel.com>
- */
-
-#include <linux/linkage.h>
-
-#define SAVE_XMM \
- mov %rsp, %rax; \
- subq $0x70, %rsp; \
- and $~0xf, %rsp; \
- mov %rax, (%rsp); \
- mov %cr0, %rax; \
- clts; \
- mov %rax, 0x8(%rsp); \
- movaps %xmm0, 0x60(%rsp); \
- movaps %xmm1, 0x50(%rsp); \
- movaps %xmm2, 0x40(%rsp); \
- movaps %xmm3, 0x30(%rsp); \
- movaps %xmm4, 0x20(%rsp); \
- movaps %xmm5, 0x10(%rsp)
-
-#define RESTORE_XMM \
- movaps 0x60(%rsp), %xmm0; \
- movaps 0x50(%rsp), %xmm1; \
- movaps 0x40(%rsp), %xmm2; \
- movaps 0x30(%rsp), %xmm3; \
- movaps 0x20(%rsp), %xmm4; \
- movaps 0x10(%rsp), %xmm5; \
- mov 0x8(%rsp), %rsi; \
- mov %rsi, %cr0; \
- mov (%rsp), %rsp
-
-ENTRY(efi_call0)
- SAVE_XMM
- subq $32, %rsp
- call *%rdi
- addq $32, %rsp
- RESTORE_XMM
- ret
-ENDPROC(efi_call0)
-
-ENTRY(efi_call1)
- SAVE_XMM
- subq $32, %rsp
- mov %rsi, %rcx
- call *%rdi
- addq $32, %rsp
- RESTORE_XMM
- ret
-ENDPROC(efi_call1)
-
-ENTRY(efi_call2)
- SAVE_XMM
- subq $32, %rsp
- mov %rsi, %rcx
- call *%rdi
- addq $32, %rsp
- RESTORE_XMM
- ret
-ENDPROC(efi_call2)
-
-ENTRY(efi_call3)
- SAVE_XMM
- subq $32, %rsp
- mov %rcx, %r8
- mov %rsi, %rcx
- call *%rdi
- addq $32, %rsp
- RESTORE_XMM
- ret
-ENDPROC(efi_call3)
-
-ENTRY(efi_call4)
- SAVE_XMM
- subq $32, %rsp
- mov %r8, %r9
- mov %rcx, %r8
- mov %rsi, %rcx
- call *%rdi
- addq $32, %rsp
- RESTORE_XMM
- ret
-ENDPROC(efi_call4)
-
-ENTRY(efi_call5)
- SAVE_XMM
- subq $48, %rsp
- mov %r9, 32(%rsp)
- mov %r8, %r9
- mov %rcx, %r8
- mov %rsi, %rcx
- call *%rdi
- addq $48, %rsp
- RESTORE_XMM
- ret
-ENDPROC(efi_call5)
-
-ENTRY(efi_call6)
- SAVE_XMM
- mov (%rsp), %rax
- mov 8(%rax), %rax
- subq $48, %rsp
- mov %r9, 32(%rsp)
- mov %rax, 40(%rsp)
- mov %r8, %r9
- mov %rcx, %r8
- mov %rsi, %rcx
- call *%rdi
- addq $48, %rsp
- RESTORE_XMM
- ret
-ENDPROC(efi_call6)
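The efi_callN stubs above exist to bridge calling conventions: the kernel calls them with the System V AMD64 convention, while EFI runtime services expect the Microsoft x64 one (first four arguments in rcx/rdx/r8/r9, further arguments on the stack above a 32-byte shadow area). Below is a sketch of the register mapping plus a hypothetical caller; the efi_call2 prototype and the service/argument names are assumptions for illustration only.

/*
 * Register shuffle performed by the stubs (illustration, not a drop-in):
 *
 *   System V (kernel caller)       Microsoft x64 (EFI service)
 *   rdi = service pointer     ->   indirect call target
 *   rsi = arg1                ->   rcx
 *   rdx = arg2                ->   rdx (already in place)
 *   rcx = arg3                ->   r8
 *   r8  = arg4                ->   r9
 *   r9  = arg5                ->   first stack slot above the shadow space
 *   first stack arg (arg6)    ->   next stack slot
 */
extern unsigned long efi_call2(void *fn, unsigned long a1, unsigned long a2);	/* assumed prototype */

static unsigned long example_efi_get_time(void *get_time_service, void *efi_time_buf)
{
	/* Hypothetical use: a two-argument runtime service, second argument unused. */
	return efi_call2(get_time_service, (unsigned long)efi_time_buf, 0);
}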
diff --git a/arch/x86/kernel/eisa.c b/arch/x86/kernel/eisa.c
new file mode 100644
index 000000000000..9535a6507db7
--- /dev/null
+++ b/arch/x86/kernel/eisa.c
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * EISA specific code
+ */
+#include <linux/cc_platform.h>
+#include <linux/ioport.h>
+#include <linux/eisa.h>
+#include <linux/io.h>
+
+#include <xen/xen.h>
+
+static __init int eisa_bus_probe(void)
+{
+ u32 *p;
+
+ if ((xen_pv_domain() && !xen_initial_domain()) || cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ return 0;
+
+ p = memremap(0x0FFFD9, 4, MEMREMAP_WB);
+ if (p && *p == 'E' + ('I' << 8) + ('S' << 16) + ('A' << 24))
+ EISA_bus = 1;
+ memunmap(p);
+ return 0;
+}
+subsys_initcall(eisa_bus_probe);
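The signature test in eisa_bus_probe() is a single 32-bit compare: on a little-endian machine, 'E' + ('I' << 8) + ('S' << 16) + ('A' << 24) is exactly the value a u32 load returns when the four bytes "EISA" sit at the probed address. A standalone userspace sketch of the same check, for illustration only:

#include <stdint.h>
#include <string.h>

/* Returns non-zero if the four bytes at 'sig' spell "EISA" (little-endian load). */
static int example_looks_like_eisa(const void *sig)
{
	uint32_t v;

	memcpy(&v, sig, sizeof(v));
	return v == ('E' + ('I' << 8) + ('S' << 16) + ('A' << 24));
}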
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
deleted file mode 100644
index c097e7d607c6..000000000000
--- a/arch/x86/kernel/entry_32.S
+++ /dev/null
@@ -1,1425 +0,0 @@
-/*
- *
- * Copyright (C) 1991, 1992 Linus Torvalds
- */
-
-/*
- * entry.S contains the system-call and fault low-level handling routines.
- * This also contains the timer-interrupt handler, as well as all interrupts
- * and faults that can result in a task-switch.
- *
- * NOTE: This code handles signal-recognition, which happens every time
- * after a timer-interrupt and after each system call.
- *
- * I changed all the .align's to 4 (16 byte alignment), as that's faster
- * on a 486.
- *
- * Stack layout in 'syscall_exit':
- * ptrace needs to have all regs on the stack.
- * if the order here is changed, it needs to be
- * updated in fork.c:copy_process, signal.c:do_signal,
- * ptrace.c and ptrace.h
- *
- * 0(%esp) - %ebx
- * 4(%esp) - %ecx
- * 8(%esp) - %edx
- * C(%esp) - %esi
- * 10(%esp) - %edi
- * 14(%esp) - %ebp
- * 18(%esp) - %eax
- * 1C(%esp) - %ds
- * 20(%esp) - %es
- * 24(%esp) - %fs
- * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS
- * 2C(%esp) - orig_eax
- * 30(%esp) - %eip
- * 34(%esp) - %cs
- * 38(%esp) - %eflags
- * 3C(%esp) - %oldesp
- * 40(%esp) - %oldss
- *
- * "current" is in register %ebx during any slow entries.
- */
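For orientation, the offsets listed above correspond to the following layout when written as a C struct; every slot is 4 bytes on a 32-bit build, and the field names here are illustrative rather than the kernel's exact pt_regs spellings.

struct example_entry_frame {		/* offsets assume a 32-bit build */
	unsigned long ebx;		/* 0x00 */
	unsigned long ecx;		/* 0x04 */
	unsigned long edx;		/* 0x08 */
	unsigned long esi;		/* 0x0c */
	unsigned long edi;		/* 0x10 */
	unsigned long ebp;		/* 0x14 */
	unsigned long eax;		/* 0x18 */
	unsigned long ds;		/* 0x1c */
	unsigned long es;		/* 0x20 */
	unsigned long fs;		/* 0x24 */
	unsigned long gs;		/* 0x28, saved iff !CONFIG_X86_32_LAZY_GS */
	unsigned long orig_eax;		/* 0x2c */
	unsigned long eip;		/* 0x30 */
	unsigned long cs;		/* 0x34 */
	unsigned long eflags;		/* 0x38 */
	unsigned long oldesp;		/* 0x3c */
	unsigned long oldss;		/* 0x40 */
};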
-
-#include <linux/linkage.h>
-#include <asm/thread_info.h>
-#include <asm/irqflags.h>
-#include <asm/errno.h>
-#include <asm/segment.h>
-#include <asm/smp.h>
-#include <asm/page_types.h>
-#include <asm/percpu.h>
-#include <asm/dwarf2.h>
-#include <asm/processor-flags.h>
-#include <asm/ftrace.h>
-#include <asm/irq_vectors.h>
-
-/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
-#include <linux/elf-em.h>
-#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
-#define __AUDIT_ARCH_LE 0x40000000
-
-#ifndef CONFIG_AUDITSYSCALL
-#define sysenter_audit syscall_trace_entry
-#define sysexit_audit syscall_exit_work
-#endif
-
-/*
- * We use macros for low-level operations which need to be overridden
- * for paravirtualization. The following will never clobber any registers:
- * INTERRUPT_RETURN (aka. "iret")
- * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
- * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
- *
- * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
- * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
- * Allowing a register to be clobbered can shrink the paravirt replacement
- * enough to patch inline, increasing performance.
- */
-
-#define nr_syscalls ((syscall_table_size)/4)
-
-#ifdef CONFIG_PREEMPT
-#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
-#else
-#define preempt_stop(clobbers)
-#define resume_kernel restore_all
-#endif
-
-.macro TRACE_IRQS_IRET
-#ifdef CONFIG_TRACE_IRQFLAGS
- testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off?
- jz 1f
- TRACE_IRQS_ON
-1:
-#endif
-.endm
-
-#ifdef CONFIG_VM86
-#define resume_userspace_sig check_userspace
-#else
-#define resume_userspace_sig resume_userspace
-#endif
-
-/*
- * User gs save/restore
- *
- * %gs is used for userland TLS and kernel only uses it for stack
- * canary which is required to be at %gs:20 by gcc. Read the comment
- * at the top of stackprotector.h for more info.
- *
- * Local labels 98 and 99 are used.
- */
-#ifdef CONFIG_X86_32_LAZY_GS
-
- /* unfortunately push/pop can't be no-op */
-.macro PUSH_GS
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
-.endm
-.macro POP_GS pop=0
- addl $(4 + \pop), %esp
- CFI_ADJUST_CFA_OFFSET -(4 + \pop)
-.endm
-.macro POP_GS_EX
-.endm
-
- /* all the rest are no-op */
-.macro PTGS_TO_GS
-.endm
-.macro PTGS_TO_GS_EX
-.endm
-.macro GS_TO_REG reg
-.endm
-.macro REG_TO_PTGS reg
-.endm
-.macro SET_KERNEL_GS reg
-.endm
-
-#else /* CONFIG_X86_32_LAZY_GS */
-
-.macro PUSH_GS
- pushl %gs
- CFI_ADJUST_CFA_OFFSET 4
- /*CFI_REL_OFFSET gs, 0*/
-.endm
-
-.macro POP_GS pop=0
-98: popl %gs
- CFI_ADJUST_CFA_OFFSET -4
- /*CFI_RESTORE gs*/
- .if \pop <> 0
- add $\pop, %esp
- CFI_ADJUST_CFA_OFFSET -\pop
- .endif
-.endm
-.macro POP_GS_EX
-.pushsection .fixup, "ax"
-99: movl $0, (%esp)
- jmp 98b
-.section __ex_table, "a"
- .align 4
- .long 98b, 99b
-.popsection
-.endm
-
-.macro PTGS_TO_GS
-98: mov PT_GS(%esp), %gs
-.endm
-.macro PTGS_TO_GS_EX
-.pushsection .fixup, "ax"
-99: movl $0, PT_GS(%esp)
- jmp 98b
-.section __ex_table, "a"
- .align 4
- .long 98b, 99b
-.popsection
-.endm
-
-.macro GS_TO_REG reg
- movl %gs, \reg
- /*CFI_REGISTER gs, \reg*/
-.endm
-.macro REG_TO_PTGS reg
- movl \reg, PT_GS(%esp)
- /*CFI_REL_OFFSET gs, PT_GS*/
-.endm
-.macro SET_KERNEL_GS reg
- movl $(__KERNEL_STACK_CANARY), \reg
- movl \reg, %gs
-.endm
-
-#endif /* CONFIG_X86_32_LAZY_GS */
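As the "User gs save/restore" comment above notes, gcc's stack protector expects the canary at %gs:20 on 32-bit. A minimal sketch of what such a read looks like when written by hand with inline assembly; this is illustrative only, real kernel code goes through the stackprotector.h helpers.

/* Illustration of a canary load from %gs:20 on 32-bit x86. */
static inline unsigned long example_read_stack_canary(void)
{
	unsigned long canary;

	asm("movl %%gs:20, %0" : "=r" (canary));
	return canary;
}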
-
-.macro SAVE_ALL
- cld
- PUSH_GS
- pushl %fs
- CFI_ADJUST_CFA_OFFSET 4
- /*CFI_REL_OFFSET fs, 0;*/
- pushl %es
- CFI_ADJUST_CFA_OFFSET 4
- /*CFI_REL_OFFSET es, 0;*/
- pushl %ds
- CFI_ADJUST_CFA_OFFSET 4
- /*CFI_REL_OFFSET ds, 0;*/
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET eax, 0
- pushl %ebp
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ebp, 0
- pushl %edi
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET edi, 0
- pushl %esi
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET esi, 0
- pushl %edx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET edx, 0
- pushl %ecx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ecx, 0
- pushl %ebx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ebx, 0
- movl $(__USER_DS), %edx
- movl %edx, %ds
- movl %edx, %es
- movl $(__KERNEL_PERCPU), %edx
- movl %edx, %fs
- SET_KERNEL_GS %edx
-.endm
-
-.macro RESTORE_INT_REGS
- popl %ebx
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE ebx
- popl %ecx
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE ecx
- popl %edx
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE edx
- popl %esi
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE esi
- popl %edi
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE edi
- popl %ebp
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE ebp
- popl %eax
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE eax
-.endm
-
-.macro RESTORE_REGS pop=0
- RESTORE_INT_REGS
-1: popl %ds
- CFI_ADJUST_CFA_OFFSET -4
- /*CFI_RESTORE ds;*/
-2: popl %es
- CFI_ADJUST_CFA_OFFSET -4
- /*CFI_RESTORE es;*/
-3: popl %fs
- CFI_ADJUST_CFA_OFFSET -4
- /*CFI_RESTORE fs;*/
- POP_GS \pop
-.pushsection .fixup, "ax"
-4: movl $0, (%esp)
- jmp 1b
-5: movl $0, (%esp)
- jmp 2b
-6: movl $0, (%esp)
- jmp 3b
-.section __ex_table, "a"
- .align 4
- .long 1b, 4b
- .long 2b, 5b
- .long 3b, 6b
-.popsection
- POP_GS_EX
-.endm
-
-.macro RING0_INT_FRAME
- CFI_STARTPROC simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA esp, 3*4
- /*CFI_OFFSET cs, -2*4;*/
- CFI_OFFSET eip, -3*4
-.endm
-
-.macro RING0_EC_FRAME
- CFI_STARTPROC simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA esp, 4*4
- /*CFI_OFFSET cs, -2*4;*/
- CFI_OFFSET eip, -3*4
-.endm
-
-.macro RING0_PTREGS_FRAME
- CFI_STARTPROC simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA esp, PT_OLDESP-PT_EBX
- /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/
- CFI_OFFSET eip, PT_EIP-PT_OLDESP
- /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/
- /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/
- CFI_OFFSET eax, PT_EAX-PT_OLDESP
- CFI_OFFSET ebp, PT_EBP-PT_OLDESP
- CFI_OFFSET edi, PT_EDI-PT_OLDESP
- CFI_OFFSET esi, PT_ESI-PT_OLDESP
- CFI_OFFSET edx, PT_EDX-PT_OLDESP
- CFI_OFFSET ecx, PT_ECX-PT_OLDESP
- CFI_OFFSET ebx, PT_EBX-PT_OLDESP
-.endm
-
-ENTRY(ret_from_fork)
- CFI_STARTPROC
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- call schedule_tail
- GET_THREAD_INFO(%ebp)
- popl %eax
- CFI_ADJUST_CFA_OFFSET -4
- pushl $0x0202 # Reset kernel eflags
- CFI_ADJUST_CFA_OFFSET 4
- popfl
- CFI_ADJUST_CFA_OFFSET -4
- jmp syscall_exit
- CFI_ENDPROC
-END(ret_from_fork)
-
-/*
- * Return to user mode is not as complex as all this looks,
- * but we want the default path for a system call return to
- * go as quickly as possible which is why some of this is
- * less clear than it otherwise should be.
- */
-
- # userspace resumption stub bypassing syscall exit tracing
- ALIGN
- RING0_PTREGS_FRAME
-ret_from_exception:
- preempt_stop(CLBR_ANY)
-ret_from_intr:
- GET_THREAD_INFO(%ebp)
-check_userspace:
- movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
- movb PT_CS(%esp), %al
- andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
- cmpl $USER_RPL, %eax
- jb resume_kernel # not returning to v8086 or userspace
-
-ENTRY(resume_userspace)
- LOCKDEP_SYS_EXIT
- DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
- # setting need_resched or sigpending
- # between sampling and the iret
- TRACE_IRQS_OFF
- movl TI_flags(%ebp), %ecx
- andl $_TIF_WORK_MASK, %ecx # is there any work to be done on
- # int/exception return?
- jne work_pending
- jmp restore_all
-END(ret_from_exception)
-
-#ifdef CONFIG_PREEMPT
-ENTRY(resume_kernel)
- DISABLE_INTERRUPTS(CLBR_ANY)
- cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
- jnz restore_all
-need_resched:
- movl TI_flags(%ebp), %ecx # need_resched set ?
- testb $_TIF_NEED_RESCHED, %cl
- jz restore_all
- testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ?
- jz restore_all
- call preempt_schedule_irq
- jmp need_resched
-END(resume_kernel)
-#endif
- CFI_ENDPROC
-
-/* SYSENTER_RETURN points to after the "sysenter" instruction in
- the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
-
- # sysenter call handler stub
-ENTRY(ia32_sysenter_target)
- CFI_STARTPROC simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA esp, 0
- CFI_REGISTER esp, ebp
- movl TSS_sysenter_sp0(%esp),%esp
-sysenter_past_esp:
- /*
- * Interrupts are disabled here, but we can't trace that until
- * enough kernel state has been set up for TRACE_IRQS_OFF to be
- * callable - and we immediately enable interrupts at that point anyway.
- */
- pushl $(__USER_DS)
- CFI_ADJUST_CFA_OFFSET 4
- /*CFI_REL_OFFSET ss, 0*/
- pushl %ebp
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET esp, 0
- pushfl
- orl $X86_EFLAGS_IF, (%esp)
- CFI_ADJUST_CFA_OFFSET 4
- pushl $(__USER_CS)
- CFI_ADJUST_CFA_OFFSET 4
- /*CFI_REL_OFFSET cs, 0*/
- /*
- * Push current_thread_info()->sysenter_return to the stack.
- * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
- * pushed above; +8 corresponds to copy_thread's esp0 setting.
- */
- pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET eip, 0
-
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
- ENABLE_INTERRUPTS(CLBR_NONE)
-
-/*
- * Load the potential sixth argument from user stack.
- * Careful about security.
- */
- cmpl $__PAGE_OFFSET-3,%ebp
- jae syscall_fault
-1: movl (%ebp),%ebp
- movl %ebp,PT_EBP(%esp)
-.section __ex_table,"a"
- .align 4
- .long 1b,syscall_fault
-.previous
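The bounds check guarding that user-stack read works because a 4-byte load at %ebp touches %ebp through %ebp+3; it stays entirely below the kernel boundary only when %ebp < __PAGE_OFFSET - 3, which is what the cmpl/jae pair enforces. A small sketch of the same predicate:

/* Non-zero if a 4-byte read starting at 'ebp' stays below 'page_offset'. */
static int example_sixth_arg_read_ok(unsigned long ebp, unsigned long page_offset)
{
	return ebp < page_offset - 3;	/* i.e. ebp + 3 <= page_offset - 1 */
}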
-
- GET_THREAD_INFO(%ebp)
-
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
- jnz sysenter_audit
-sysenter_do_call:
- cmpl $(nr_syscalls), %eax
- jae syscall_badsys
- call *sys_call_table(,%eax,4)
- movl %eax,PT_EAX(%esp)
- LOCKDEP_SYS_EXIT
- DISABLE_INTERRUPTS(CLBR_ANY)
- TRACE_IRQS_OFF
- movl TI_flags(%ebp), %ecx
- testl $_TIF_ALLWORK_MASK, %ecx
- jne sysexit_audit
-sysenter_exit:
-/* if something modifies registers it must also disable sysexit */
- movl PT_EIP(%esp), %edx
- movl PT_OLDESP(%esp), %ecx
- xorl %ebp,%ebp
- TRACE_IRQS_ON
-1: mov PT_FS(%esp), %fs
- PTGS_TO_GS
- ENABLE_INTERRUPTS_SYSEXIT
-
-#ifdef CONFIG_AUDITSYSCALL
-sysenter_audit:
- testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
- jnz syscall_trace_entry
- addl $4,%esp
- CFI_ADJUST_CFA_OFFSET -4
- /* %esi already in 8(%esp) 6th arg: 4th syscall arg */
- /* %edx already in 4(%esp) 5th arg: 3rd syscall arg */
- /* %ecx already in 0(%esp) 4th arg: 2nd syscall arg */
- movl %ebx,%ecx /* 3rd arg: 1st syscall arg */
- movl %eax,%edx /* 2nd arg: syscall number */
- movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */
- call audit_syscall_entry
- pushl %ebx
- CFI_ADJUST_CFA_OFFSET 4
- movl PT_EAX(%esp),%eax /* reload syscall number */
- jmp sysenter_do_call
-
-sysexit_audit:
- testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
- jne syscall_exit_work
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_ANY)
- movl %eax,%edx /* second arg, syscall return value */
- cmpl $0,%eax /* is it < 0? */
- setl %al /* 1 if so, 0 if not */
- movzbl %al,%eax /* zero-extend that */
- inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
- call audit_syscall_exit
- DISABLE_INTERRUPTS(CLBR_ANY)
- TRACE_IRQS_OFF
- movl TI_flags(%ebp), %ecx
- testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
- jne syscall_exit_work
- movl PT_EAX(%esp),%eax /* reload syscall return value */
- jmp sysenter_exit
-#endif
-
- CFI_ENDPROC
-.pushsection .fixup,"ax"
-2: movl $0,PT_FS(%esp)
- jmp 1b
-.section __ex_table,"a"
- .align 4
- .long 1b,2b
-.popsection
- PTGS_TO_GS_EX
-ENDPROC(ia32_sysenter_target)
-
- # system call handler stub
-ENTRY(system_call)
- RING0_INT_FRAME # can't unwind into user space anyway
- pushl %eax # save orig_eax
- CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
- GET_THREAD_INFO(%ebp)
- # system call tracing in operation / emulation
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
- jnz syscall_trace_entry
- cmpl $(nr_syscalls), %eax
- jae syscall_badsys
-syscall_call:
- call *sys_call_table(,%eax,4)
- movl %eax,PT_EAX(%esp) # store the return value
-syscall_exit:
- LOCKDEP_SYS_EXIT
- DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
- # setting need_resched or sigpending
- # between sampling and the iret
- TRACE_IRQS_OFF
- movl TI_flags(%ebp), %ecx
- testl $_TIF_ALLWORK_MASK, %ecx # current->work
- jne syscall_exit_work
-
-restore_all:
- TRACE_IRQS_IRET
-restore_all_notrace:
- movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
- # Warning: PT_OLDSS(%esp) contains the wrong/random values if we
- # are returning to the kernel.
- # See comments in process.c:copy_thread() for details.
- movb PT_OLDSS(%esp), %ah
- movb PT_CS(%esp), %al
- andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
- cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
- CFI_REMEMBER_STATE
- je ldt_ss # returning to user-space with LDT SS
-restore_nocheck:
- RESTORE_REGS 4 # skip orig_eax/error_code
- CFI_ADJUST_CFA_OFFSET -4
-irq_return:
- INTERRUPT_RETURN
-.section .fixup,"ax"
-ENTRY(iret_exc)
- pushl $0 # no error code
- pushl $do_iret_error
- jmp error_code
-.previous
-.section __ex_table,"a"
- .align 4
- .long irq_return,iret_exc
-.previous
-
- CFI_RESTORE_STATE
-ldt_ss:
- larl PT_OLDSS(%esp), %eax
- jnz restore_nocheck
- testl $0x00400000, %eax # returning to 32bit stack?
- jnz restore_nocheck # allright, normal return
-
-#ifdef CONFIG_PARAVIRT
- /*
- * The kernel can't run on a non-flat stack if paravirt mode
- * is active. Rather than try to fixup the high bits of
- * ESP, bypass this code entirely. This may break DOSemu
- * and/or Wine support in a paravirt VM, although the option
- * is still available to implement the setting of the high
- * 16-bits in the INTERRUPT_RETURN paravirt-op.
- */
- cmpl $0, pv_info+PARAVIRT_enabled
- jne restore_nocheck
-#endif
-
-/*
- * Setup and switch to ESPFIX stack
- *
- * We're returning to userspace with a 16-bit stack. The CPU will not
- * restore the high word of ESP for us on executing iret... This is an
- * "official" bug of all the x86-compatible CPUs, which we can work
- * around to make dosemu and wine happy. We do this by preloading the
- * high word of ESP with the high word of the userspace ESP while
- * compensating for the offset by changing to the ESPFIX segment with
- * a base address that accounts for the difference.
- */
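Concretely, the code below builds a new ESP whose high word comes from the userspace ESP and whose low word comes from the kernel ESP, then programs the ESPFIX_SS segment base with the difference. A sketch of that arithmetic, assuming the low 16 bits of the segment base start out as zero:

/* Returns the ESPFIX segment base so that base + new_esp == kernel_esp. */
static unsigned long example_espfix_base(unsigned long kernel_esp, unsigned long user_esp)
{
	unsigned long new_esp = (user_esp & 0xffff0000UL) | (kernel_esp & 0xffffUL);

	return kernel_esp - new_esp;	/* low word is zero by construction */
}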
- mov %esp, %edx /* load kernel esp */
- mov PT_OLDESP(%esp), %eax /* load userspace esp */
- mov %dx, %ax /* eax: new kernel esp */
- sub %eax, %edx /* offset (low word is 0) */
- PER_CPU(gdt_page, %ebx)
- shr $16, %edx
- mov %dl, GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx) /* bits 16..23 */
- mov %dh, GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx) /* bits 24..31 */
- pushl $__ESPFIX_SS
- CFI_ADJUST_CFA_OFFSET 4
- push %eax /* new kernel esp */
- CFI_ADJUST_CFA_OFFSET 4
- /* Disable interrupts, but do not irqtrace this section: we
- * will soon execute iret and the tracer was already set to
- * the irqstate after the iret */
- DISABLE_INTERRUPTS(CLBR_EAX)
- lss (%esp), %esp /* switch to espfix segment */
- CFI_ADJUST_CFA_OFFSET -8
- jmp restore_nocheck
- CFI_ENDPROC
-ENDPROC(system_call)
-
- # perform work that needs to be done immediately before resumption
- ALIGN
- RING0_PTREGS_FRAME # can't unwind into user space anyway
-work_pending:
- testb $_TIF_NEED_RESCHED, %cl
- jz work_notifysig
-work_resched:
- call schedule
- LOCKDEP_SYS_EXIT
- DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
- # setting need_resched or sigpending
- # between sampling and the iret
- TRACE_IRQS_OFF
- movl TI_flags(%ebp), %ecx
- andl $_TIF_WORK_MASK, %ecx # is there any work to be done other
- # than syscall tracing?
- jz restore_all
- testb $_TIF_NEED_RESCHED, %cl
- jnz work_resched
-
-work_notifysig: # deal with pending signals and
- # notify-resume requests
-#ifdef CONFIG_VM86
- testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
- movl %esp, %eax
- jne work_notifysig_v86 # returning to kernel-space or
- # vm86-space
- xorl %edx, %edx
- call do_notify_resume
- jmp resume_userspace_sig
-
- ALIGN
-work_notifysig_v86:
- pushl %ecx # save ti_flags for do_notify_resume
- CFI_ADJUST_CFA_OFFSET 4
- call save_v86_state # %eax contains pt_regs pointer
- popl %ecx
- CFI_ADJUST_CFA_OFFSET -4
- movl %eax, %esp
-#else
- movl %esp, %eax
-#endif
- xorl %edx, %edx
- call do_notify_resume
- jmp resume_userspace_sig
-END(work_pending)
-
- # perform syscall entry tracing
- ALIGN
-syscall_trace_entry:
- movl $-ENOSYS,PT_EAX(%esp)
- movl %esp, %eax
- call syscall_trace_enter
- /* What it returned is what we'll actually use. */
- cmpl $(nr_syscalls), %eax
- jnae syscall_call
- jmp syscall_exit
-END(syscall_trace_entry)
-
- # perform syscall exit tracing
- ALIGN
-syscall_exit_work:
- testl $_TIF_WORK_SYSCALL_EXIT, %ecx
- jz work_pending
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
- # schedule() instead
- movl %esp, %eax
- call syscall_trace_leave
- jmp resume_userspace
-END(syscall_exit_work)
- CFI_ENDPROC
-
- RING0_INT_FRAME # can't unwind into user space anyway
-syscall_fault:
- GET_THREAD_INFO(%ebp)
- movl $-EFAULT,PT_EAX(%esp)
- jmp resume_userspace
-END(syscall_fault)
-
-syscall_badsys:
- movl $-ENOSYS,PT_EAX(%esp)
- jmp resume_userspace
-END(syscall_badsys)
- CFI_ENDPROC
-
-/*
- * System calls that need a pt_regs pointer.
- */
-#define PTREGSCALL(name) \
- ALIGN; \
-ptregs_##name: \
- leal 4(%esp),%eax; \
- jmp sys_##name;
-
-PTREGSCALL(iopl)
-PTREGSCALL(fork)
-PTREGSCALL(clone)
-PTREGSCALL(vfork)
-PTREGSCALL(execve)
-PTREGSCALL(sigaltstack)
-PTREGSCALL(sigreturn)
-PTREGSCALL(rt_sigreturn)
-PTREGSCALL(vm86)
-PTREGSCALL(vm86old)
-
-.macro FIXUP_ESPFIX_STACK
-/*
- * Switch back from the ESPFIX stack to the normal zero-based stack
- *
- * We can't call C functions using the ESPFIX stack. This code reads
- * the high word of the segment base from the GDT and switches to the
- * normal stack and adjusts ESP by the matching offset.
- */
- /* fixup the stack */
- PER_CPU(gdt_page, %ebx)
- mov GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx), %al /* bits 16..23 */
- mov GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx), %ah /* bits 24..31 */
- shl $16, %eax
- addl %esp, %eax /* the adjusted stack pointer */
- pushl $__KERNEL_DS
- CFI_ADJUST_CFA_OFFSET 4
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- lss (%esp), %esp /* switch to the normal stack segment */
- CFI_ADJUST_CFA_OFFSET -8
-.endm
-.macro UNWIND_ESPFIX_STACK
- movl %ss, %eax
- /* see if on espfix stack */
- cmpw $__ESPFIX_SS, %ax
- jne 27f
- movl $__KERNEL_DS, %eax
- movl %eax, %ds
- movl %eax, %es
- /* switch to normal stack */
- FIXUP_ESPFIX_STACK
-27:
-.endm
-
-/*
- * Build the entry stubs and pointer table with some assembler magic.
- * We pack 7 stubs into a single 32-byte chunk, which will fit in a
- * single cache line on all modern x86 implementations.
- */
-.section .init.rodata,"a"
-ENTRY(interrupt)
-.text
- .p2align 5
- .p2align CONFIG_X86_L1_CACHE_SHIFT
-ENTRY(irq_entries_start)
- RING0_INT_FRAME
-vector=FIRST_EXTERNAL_VECTOR
-.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7
- .balign 32
- .rept 7
- .if vector < NR_VECTORS
- .if vector <> FIRST_EXTERNAL_VECTOR
- CFI_ADJUST_CFA_OFFSET -4
- .endif
-1: pushl $(~vector+0x80) /* Note: always in signed byte range */
- CFI_ADJUST_CFA_OFFSET 4
- .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
- jmp 2f
- .endif
- .previous
- .long 1b
- .text
-vector=vector+1
- .endif
- .endr
-2: jmp common_interrupt
-.endr
-END(irq_entries_start)
-
-.previous
-END(interrupt)
-.previous
-
-/*
- * the CPU automatically disables interrupts when executing an IRQ vector,
- * so IRQ-flags tracing has to follow that:
- */
- .p2align CONFIG_X86_L1_CACHE_SHIFT
-common_interrupt:
- addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */
- SAVE_ALL
- TRACE_IRQS_OFF
- movl %esp,%eax
- call do_IRQ
- jmp ret_from_intr
-ENDPROC(common_interrupt)
- CFI_ENDPROC
-
-#define BUILD_INTERRUPT3(name, nr, fn) \
-ENTRY(name) \
- RING0_INT_FRAME; \
- pushl $~(nr); \
- CFI_ADJUST_CFA_OFFSET 4; \
- SAVE_ALL; \
- TRACE_IRQS_OFF \
- movl %esp,%eax; \
- call fn; \
- jmp ret_from_intr; \
- CFI_ENDPROC; \
-ENDPROC(name)
-
-#define BUILD_INTERRUPT(name, nr) BUILD_INTERRUPT3(name, nr, smp_##name)
-
-/* The include is where all of the SMP etc. interrupts come from */
-#include <asm/entry_arch.h>
-
-ENTRY(coprocessor_error)
- RING0_INT_FRAME
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_coprocessor_error
- CFI_ADJUST_CFA_OFFSET 4
- jmp error_code
- CFI_ENDPROC
-END(coprocessor_error)
-
-ENTRY(simd_coprocessor_error)
- RING0_INT_FRAME
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_simd_coprocessor_error
- CFI_ADJUST_CFA_OFFSET 4
- jmp error_code
- CFI_ENDPROC
-END(simd_coprocessor_error)
-
-ENTRY(device_not_available)
- RING0_INT_FRAME
- pushl $-1 # mark this as an int
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_device_not_available
- CFI_ADJUST_CFA_OFFSET 4
- jmp error_code
- CFI_ENDPROC
-END(device_not_available)
-
-#ifdef CONFIG_PARAVIRT
-ENTRY(native_iret)
- iret
-.section __ex_table,"a"
- .align 4
- .long native_iret, iret_exc
-.previous
-END(native_iret)
-
-ENTRY(native_irq_enable_sysexit)
- sti
- sysexit
-END(native_irq_enable_sysexit)
-#endif
-
-ENTRY(overflow)
- RING0_INT_FRAME
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_overflow
- CFI_ADJUST_CFA_OFFSET 4
- jmp error_code
- CFI_ENDPROC
-END(overflow)
-
-ENTRY(bounds)
- RING0_INT_FRAME
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_bounds
- CFI_ADJUST_CFA_OFFSET 4
- jmp error_code
- CFI_ENDPROC
-END(bounds)
-
-ENTRY(invalid_op)
- RING0_INT_FRAME
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_invalid_op
- CFI_ADJUST_CFA_OFFSET 4
- jmp error_code
- CFI_ENDPROC
-END(invalid_op)
-
-ENTRY(coprocessor_segment_overrun)
- RING0_INT_FRAME
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_coprocessor_segment_overrun
- CFI_ADJUST_CFA_OFFSET 4
- jmp error_code
- CFI_ENDPROC
-END(coprocessor_segment_overrun)
-
-ENTRY(invalid_TSS)
- RING0_EC_FRAME
- pushl $do_invalid_TSS
- CFI_ADJUST_CFA_OFFSET 4
- jmp error_code
- CFI_ENDPROC
-END(invalid_TSS)
-
-ENTRY(segment_not_present)
- RING0_EC_FRAME
- pushl $do_segment_not_present
- CFI_ADJUST_CFA_OFFSET 4
- jmp error_code
- CFI_ENDPROC
-END(segment_not_present)
-
-ENTRY(stack_segment)
- RING0_EC_FRAME
- pushl $do_stack_segment
- CFI_ADJUST_CFA_OFFSET 4
- jmp error_code
- CFI_ENDPROC
-END(stack_segment)
-
-ENTRY(alignment_check)
- RING0_EC_FRAME
- pushl $do_alignment_check
- CFI_ADJUST_CFA_OFFSET 4
- jmp error_code
- CFI_ENDPROC
-END(alignment_check)
-
-ENTRY(divide_error)
- RING0_INT_FRAME
- pushl $0 # no error code
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_divide_error
- CFI_ADJUST_CFA_OFFSET 4
- jmp error_code
- CFI_ENDPROC
-END(divide_error)
-
-#ifdef CONFIG_X86_MCE
-ENTRY(machine_check)
- RING0_INT_FRAME
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
- pushl machine_check_vector
- CFI_ADJUST_CFA_OFFSET 4
- jmp error_code
- CFI_ENDPROC
-END(machine_check)
-#endif
-
-ENTRY(spurious_interrupt_bug)
- RING0_INT_FRAME
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_spurious_interrupt_bug
- CFI_ADJUST_CFA_OFFSET 4
- jmp error_code
- CFI_ENDPROC
-END(spurious_interrupt_bug)
-
-ENTRY(kernel_thread_helper)
- pushl $0 # fake return address for unwinder
- CFI_STARTPROC
- movl %edx,%eax
- push %edx
- CFI_ADJUST_CFA_OFFSET 4
- call *%ebx
- push %eax
- CFI_ADJUST_CFA_OFFSET 4
- call do_exit
- ud2 # padding for call trace
- CFI_ENDPROC
-ENDPROC(kernel_thread_helper)
-
-#ifdef CONFIG_XEN
-/* Xen doesn't set %esp to be precisely what the normal sysenter
- entrypoint expects, so fix it up before using the normal path. */
-ENTRY(xen_sysenter_target)
- RING0_INT_FRAME
- addl $5*4, %esp /* remove xen-provided frame */
- CFI_ADJUST_CFA_OFFSET -5*4
- jmp sysenter_past_esp
- CFI_ENDPROC
-
-ENTRY(xen_hypervisor_callback)
- CFI_STARTPROC
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
- TRACE_IRQS_OFF
-
- /* Check to see if we got the event in the critical
- region in xen_iret_direct, after we've reenabled
- events and checked for pending events. This simulates
- iret instruction's behaviour where it delivers a
- pending interrupt when enabling interrupts. */
- movl PT_EIP(%esp),%eax
- cmpl $xen_iret_start_crit,%eax
- jb 1f
- cmpl $xen_iret_end_crit,%eax
- jae 1f
-
- jmp xen_iret_crit_fixup
-
-ENTRY(xen_do_upcall)
-1: mov %esp, %eax
- call xen_evtchn_do_upcall
- jmp ret_from_intr
- CFI_ENDPROC
-ENDPROC(xen_hypervisor_callback)
-
-# Hypervisor uses this for application faults while it executes.
-# We get here for two reasons:
-# 1. Fault while reloading DS, ES, FS or GS
-# 2. Fault while executing IRET
-# Category 1 we fix up by reattempting the load, and zeroing the segment
-# register if the load fails.
-# Category 2 we fix up by jumping to do_iret_error. We cannot use the
-# normal Linux return path in this case because if we use the IRET hypercall
-# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
-# We distinguish between categories by maintaining a status value in EAX.
-ENTRY(xen_failsafe_callback)
- CFI_STARTPROC
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- movl $1,%eax
-1: mov 4(%esp),%ds
-2: mov 8(%esp),%es
-3: mov 12(%esp),%fs
-4: mov 16(%esp),%gs
- testl %eax,%eax
- popl %eax
- CFI_ADJUST_CFA_OFFSET -4
- lea 16(%esp),%esp
- CFI_ADJUST_CFA_OFFSET -16
- jz 5f
- addl $16,%esp
- jmp iret_exc # EAX != 0 => Category 2 (Bad IRET)
-5: pushl $0 # EAX == 0 => Category 1 (Bad segment)
- CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
- jmp ret_from_exception
- CFI_ENDPROC
-
-.section .fixup,"ax"
-6: xorl %eax,%eax
- movl %eax,4(%esp)
- jmp 1b
-7: xorl %eax,%eax
- movl %eax,8(%esp)
- jmp 2b
-8: xorl %eax,%eax
- movl %eax,12(%esp)
- jmp 3b
-9: xorl %eax,%eax
- movl %eax,16(%esp)
- jmp 4b
-.previous
-.section __ex_table,"a"
- .align 4
- .long 1b,6b
- .long 2b,7b
- .long 3b,8b
- .long 4b,9b
-.previous
-ENDPROC(xen_failsafe_callback)
-
-#endif /* CONFIG_XEN */
-
-#ifdef CONFIG_FUNCTION_TRACER
-#ifdef CONFIG_DYNAMIC_FTRACE
-
-ENTRY(mcount)
- ret
-END(mcount)
-
-ENTRY(ftrace_caller)
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
- pushl %eax
- pushl %ecx
- pushl %edx
- movl 0xc(%esp), %eax
- movl 0x4(%ebp), %edx
- subl $MCOUNT_INSN_SIZE, %eax
-
-.globl ftrace_call
-ftrace_call:
- call ftrace_stub
-
- popl %edx
- popl %ecx
- popl %eax
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-.globl ftrace_graph_call
-ftrace_graph_call:
- jmp ftrace_stub
-#endif
-
-.globl ftrace_stub
-ftrace_stub:
- ret
-END(ftrace_caller)
-
-#else /* ! CONFIG_DYNAMIC_FTRACE */
-
-ENTRY(mcount)
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
- cmpl $ftrace_stub, ftrace_trace_function
- jnz trace
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- cmpl $ftrace_stub, ftrace_graph_return
- jnz ftrace_graph_caller
-
- cmpl $ftrace_graph_entry_stub, ftrace_graph_entry
- jnz ftrace_graph_caller
-#endif
-.globl ftrace_stub
-ftrace_stub:
- ret
-
- /* taken from glibc */
-trace:
- pushl %eax
- pushl %ecx
- pushl %edx
- movl 0xc(%esp), %eax
- movl 0x4(%ebp), %edx
- subl $MCOUNT_INSN_SIZE, %eax
-
- call *ftrace_trace_function
-
- popl %edx
- popl %ecx
- popl %eax
- jmp ftrace_stub
-END(mcount)
-#endif /* CONFIG_DYNAMIC_FTRACE */
-#endif /* CONFIG_FUNCTION_TRACER */
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-ENTRY(ftrace_graph_caller)
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
- pushl %eax
- pushl %ecx
- pushl %edx
- movl 0xc(%esp), %edx
- lea 0x4(%ebp), %eax
- movl (%ebp), %ecx
- subl $MCOUNT_INSN_SIZE, %edx
- call prepare_ftrace_return
- popl %edx
- popl %ecx
- popl %eax
- ret
-END(ftrace_graph_caller)
-
-.globl return_to_handler
-return_to_handler:
- pushl $0
- pushl %eax
- pushl %ecx
- pushl %edx
- movl %ebp, %eax
- call ftrace_return_to_handler
- movl %eax, 0xc(%esp)
- popl %edx
- popl %ecx
- popl %eax
- ret
-#endif
-
-.section .rodata,"a"
-#include "syscall_table_32.S"
-
-syscall_table_size=(.-sys_call_table)
-
-/*
- * Some functions should be protected against kprobes
- */
- .pushsection .kprobes.text, "ax"
-
-ENTRY(page_fault)
- RING0_EC_FRAME
- pushl $do_page_fault
- CFI_ADJUST_CFA_OFFSET 4
- ALIGN
-error_code:
- /* the function address is in %gs's slot on the stack */
- pushl %fs
- CFI_ADJUST_CFA_OFFSET 4
- /*CFI_REL_OFFSET fs, 0*/
- pushl %es
- CFI_ADJUST_CFA_OFFSET 4
- /*CFI_REL_OFFSET es, 0*/
- pushl %ds
- CFI_ADJUST_CFA_OFFSET 4
- /*CFI_REL_OFFSET ds, 0*/
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET eax, 0
- pushl %ebp
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ebp, 0
- pushl %edi
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET edi, 0
- pushl %esi
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET esi, 0
- pushl %edx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET edx, 0
- pushl %ecx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ecx, 0
- pushl %ebx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ebx, 0
- cld
- movl $(__KERNEL_PERCPU), %ecx
- movl %ecx, %fs
- UNWIND_ESPFIX_STACK
- GS_TO_REG %ecx
- movl PT_GS(%esp), %edi # get the function address
- movl PT_ORIG_EAX(%esp), %edx # get the error code
- movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
- REG_TO_PTGS %ecx
- SET_KERNEL_GS %ecx
- movl $(__USER_DS), %ecx
- movl %ecx, %ds
- movl %ecx, %es
- TRACE_IRQS_OFF
- movl %esp,%eax # pt_regs pointer
- call *%edi
- jmp ret_from_exception
- CFI_ENDPROC
-END(page_fault)
-
-/*
- * Debug traps and NMI can happen at the one SYSENTER instruction
- * that sets up the real kernel stack. Check here, since we can't
- * allow the wrong stack to be used.
- *
- * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
- * already pushed 3 words if it hits on the sysenter instruction:
- * eflags, cs and eip.
- *
- * We just load the right stack, and push the three (known) values
- * by hand onto the new stack - while updating the return eip past
- * the instruction that would have done it for sysenter.
- */
-.macro FIX_STACK offset ok label
- cmpw $__KERNEL_CS, 4(%esp)
- jne \ok
-\label:
- movl TSS_sysenter_sp0 + \offset(%esp), %esp
- CFI_DEF_CFA esp, 0
- CFI_UNDEFINED eip
- pushfl
- CFI_ADJUST_CFA_OFFSET 4
- pushl $__KERNEL_CS
- CFI_ADJUST_CFA_OFFSET 4
- pushl $sysenter_past_esp
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET eip, 0
-.endm
-
-ENTRY(debug)
- RING0_INT_FRAME
- cmpl $ia32_sysenter_target,(%esp)
- jne debug_stack_correct
- FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
-debug_stack_correct:
- pushl $-1 # mark this as an int
- CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
- TRACE_IRQS_OFF
- xorl %edx,%edx # error code 0
- movl %esp,%eax # pt_regs pointer
- call do_debug
- jmp ret_from_exception
- CFI_ENDPROC
-END(debug)
-
-/*
- * NMI is doubly nasty. It can happen _while_ we're handling
- * a debug fault, and the debug fault hasn't yet been able to
- * clear up the stack. So we first check whether we got an
- * NMI on the sysenter entry path, but after that we need to
- * check whether we got an NMI on the debug path where the debug
- * fault happened on the sysenter path.
- */
-ENTRY(nmi)
- RING0_INT_FRAME
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- movl %ss, %eax
- cmpw $__ESPFIX_SS, %ax
- popl %eax
- CFI_ADJUST_CFA_OFFSET -4
- je nmi_espfix_stack
- cmpl $ia32_sysenter_target,(%esp)
- je nmi_stack_fixup
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- movl %esp,%eax
- /* Do not access memory above the end of our stack page,
- * it might not exist.
- */
- andl $(THREAD_SIZE-1),%eax
- cmpl $(THREAD_SIZE-20),%eax
- popl %eax
- CFI_ADJUST_CFA_OFFSET -4
- jae nmi_stack_correct
- cmpl $ia32_sysenter_target,12(%esp)
- je nmi_debug_stack_check
-nmi_stack_correct:
- /* We have a RING0_INT_FRAME here */
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
- xorl %edx,%edx # zero error code
- movl %esp,%eax # pt_regs pointer
- call do_nmi
- jmp restore_all_notrace
- CFI_ENDPROC
-
-nmi_stack_fixup:
- RING0_INT_FRAME
- FIX_STACK 12, nmi_stack_correct, 1
- jmp nmi_stack_correct
-
-nmi_debug_stack_check:
- /* We have a RING0_INT_FRAME here */
- cmpw $__KERNEL_CS,16(%esp)
- jne nmi_stack_correct
- cmpl $debug,(%esp)
- jb nmi_stack_correct
- cmpl $debug_esp_fix_insn,(%esp)
- ja nmi_stack_correct
- FIX_STACK 24, nmi_stack_correct, 1
- jmp nmi_stack_correct
-
-nmi_espfix_stack:
- /* We have a RING0_INT_FRAME here.
- *
- * create the pointer to lss back
- */
- pushl %ss
- CFI_ADJUST_CFA_OFFSET 4
- pushl %esp
- CFI_ADJUST_CFA_OFFSET 4
- addl $4, (%esp)
- /* copy the iret frame of 12 bytes */
- .rept 3
- pushl 16(%esp)
- CFI_ADJUST_CFA_OFFSET 4
- .endr
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
- FIXUP_ESPFIX_STACK # %eax == %esp
- xorl %edx,%edx # zero error code
- call do_nmi
- RESTORE_REGS
- lss 12+4(%esp), %esp # back to espfix stack
- CFI_ADJUST_CFA_OFFSET -24
- jmp irq_return
- CFI_ENDPROC
-END(nmi)
-
-ENTRY(int3)
- RING0_INT_FRAME
- pushl $-1 # mark this as an int
- CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
- TRACE_IRQS_OFF
- xorl %edx,%edx # zero error code
- movl %esp,%eax # pt_regs pointer
- call do_int3
- jmp ret_from_exception
- CFI_ENDPROC
-END(int3)
-
-ENTRY(general_protection)
- RING0_EC_FRAME
- pushl $do_general_protection
- CFI_ADJUST_CFA_OFFSET 4
- jmp error_code
- CFI_ENDPROC
-END(general_protection)
-
-/*
- * End of kprobes section
- */
- .popsection
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
deleted file mode 100644
index c251be745107..000000000000
--- a/arch/x86/kernel/entry_64.S
+++ /dev/null
@@ -1,1594 +0,0 @@
-/*
- * linux/arch/x86_64/entry.S
- *
- * Copyright (C) 1991, 1992 Linus Torvalds
- * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
- * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
- */
-
-/*
- * entry.S contains the system-call and fault low-level handling routines.
- *
- * NOTE: This code handles signal-recognition, which happens every time
- * after an interrupt and after each system call.
- *
- * Normal syscalls and interrupts don't save a full stack frame; this is
- * only done for syscall tracing, signals or fork/exec et al.
- *
- * A note on terminology:
- * - top of stack: architecture-defined interrupt frame from SS to RIP
- * at the top of the kernel process stack.
- * - partial stack frame: partially saved registers up to R11.
- * - full stack frame: like the partial stack frame, but with all registers saved.
- *
- * Some macro usage:
- * - CFI macros are used to generate dwarf2 unwind information for better
- * backtraces. They don't change any code.
- * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
- * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
- * There are unfortunately lots of special cases where some registers are
- * not touched. The macro is a big mess that should be cleaned up.
- * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
- * Gives a full stack frame.
- * - ENTRY/END Define functions in the symbol table.
- * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
- * frame that is otherwise undefined after a SYSCALL
- * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
- * - errorentry/paranoidentry/zeroentry - Define exception entry points.
- */
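For orientation, the partial/full frame distinction above maps onto two register sets, as can be read off save_args and save_rest further down; the lists below are a sketch derived from that code, not an authoritative definition.

/* Registers captured by the partial frame (SAVE_ARGS / save_args). */
static const char * const example_partial_frame_regs[] = {
	"rdi", "rsi", "rdx", "rcx", "rax", "r8", "r9", "r10", "r11",
};

/* Registers added by SAVE_REST / save_rest to complete the full frame. */
static const char * const example_rest_frame_regs[] = {
	"rbx", "rbp", "r12", "r13", "r14", "r15",
};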
-
-#include <linux/linkage.h>
-#include <asm/segment.h>
-#include <asm/cache.h>
-#include <asm/errno.h>
-#include <asm/dwarf2.h>
-#include <asm/calling.h>
-#include <asm/asm-offsets.h>
-#include <asm/msr.h>
-#include <asm/unistd.h>
-#include <asm/thread_info.h>
-#include <asm/hw_irq.h>
-#include <asm/page_types.h>
-#include <asm/irqflags.h>
-#include <asm/paravirt.h>
-#include <asm/ftrace.h>
-#include <asm/percpu.h>
-
-/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
-#include <linux/elf-em.h>
-#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
-#define __AUDIT_ARCH_64BIT 0x80000000
-#define __AUDIT_ARCH_LE 0x40000000
-
- .code64
-#ifdef CONFIG_FUNCTION_TRACER
-#ifdef CONFIG_DYNAMIC_FTRACE
-ENTRY(mcount)
- retq
-END(mcount)
-
-ENTRY(ftrace_caller)
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
- MCOUNT_SAVE_FRAME
-
- movq 0x38(%rsp), %rdi
- movq 8(%rbp), %rsi
- subq $MCOUNT_INSN_SIZE, %rdi
-
-GLOBAL(ftrace_call)
- call ftrace_stub
-
- MCOUNT_RESTORE_FRAME
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-GLOBAL(ftrace_graph_call)
- jmp ftrace_stub
-#endif
-
-GLOBAL(ftrace_stub)
- retq
-END(ftrace_caller)
-
-#else /* ! CONFIG_DYNAMIC_FTRACE */
-ENTRY(mcount)
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
- cmpq $ftrace_stub, ftrace_trace_function
- jnz trace
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- cmpq $ftrace_stub, ftrace_graph_return
- jnz ftrace_graph_caller
-
- cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
- jnz ftrace_graph_caller
-#endif
-
-GLOBAL(ftrace_stub)
- retq
-
-trace:
- MCOUNT_SAVE_FRAME
-
- movq 0x38(%rsp), %rdi
- movq 8(%rbp), %rsi
- subq $MCOUNT_INSN_SIZE, %rdi
-
- call *ftrace_trace_function
-
- MCOUNT_RESTORE_FRAME
-
- jmp ftrace_stub
-END(mcount)
-#endif /* CONFIG_DYNAMIC_FTRACE */
-#endif /* CONFIG_FUNCTION_TRACER */
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-ENTRY(ftrace_graph_caller)
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
- MCOUNT_SAVE_FRAME
-
- leaq 8(%rbp), %rdi
- movq 0x38(%rsp), %rsi
- movq (%rbp), %rdx
- subq $MCOUNT_INSN_SIZE, %rsi
-
- call prepare_ftrace_return
-
- MCOUNT_RESTORE_FRAME
-
- retq
-END(ftrace_graph_caller)
-
-GLOBAL(return_to_handler)
- subq $80, %rsp
-
- /* Save the return values */
- movq %rax, (%rsp)
- movq %rdx, 8(%rsp)
- movq %rbp, %rdi
-
- call ftrace_return_to_handler
-
- movq %rax, 72(%rsp)
- movq 8(%rsp), %rdx
- movq (%rsp), %rax
- addq $72, %rsp
- retq
-#endif
-
-
-#ifndef CONFIG_PREEMPT
-#define retint_kernel retint_restore_args
-#endif
-
-#ifdef CONFIG_PARAVIRT
-ENTRY(native_usergs_sysret64)
- swapgs
- sysretq
-ENDPROC(native_usergs_sysret64)
-#endif /* CONFIG_PARAVIRT */
-
-
-.macro TRACE_IRQS_IRETQ offset=ARGOFFSET
-#ifdef CONFIG_TRACE_IRQFLAGS
- bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */
- jnc 1f
- TRACE_IRQS_ON
-1:
-#endif
-.endm
-
-/*
- * C code is not supposed to know about the undefined top of stack. Every time
- * a C function with a pt_regs argument is called from the SYSCALL-based
- * fast path, FIXUP_TOP_OF_STACK is needed.
- * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
- * manipulation.
- */
-
- /* %rsp:at FRAMEEND */
- .macro FIXUP_TOP_OF_STACK tmp offset=0
- movq PER_CPU_VAR(old_rsp),\tmp
- movq \tmp,RSP+\offset(%rsp)
- movq $__USER_DS,SS+\offset(%rsp)
- movq $__USER_CS,CS+\offset(%rsp)
- movq $-1,RCX+\offset(%rsp)
- movq R11+\offset(%rsp),\tmp /* get eflags */
- movq \tmp,EFLAGS+\offset(%rsp)
- .endm
-
- .macro RESTORE_TOP_OF_STACK tmp offset=0
- movq RSP+\offset(%rsp),\tmp
- movq \tmp,PER_CPU_VAR(old_rsp)
- movq EFLAGS+\offset(%rsp),\tmp
- movq \tmp,R11+\offset(%rsp)
- .endm
-
- .macro FAKE_STACK_FRAME child_rip
- /* push in order ss, rsp, eflags, cs, rip */
- xorl %eax, %eax
- pushq $__KERNEL_DS /* ss */
- CFI_ADJUST_CFA_OFFSET 8
- /*CFI_REL_OFFSET ss,0*/
- pushq %rax /* rsp */
- CFI_ADJUST_CFA_OFFSET 8
- CFI_REL_OFFSET rsp,0
- pushq $X86_EFLAGS_IF /* eflags - interrupts on */
- CFI_ADJUST_CFA_OFFSET 8
- /*CFI_REL_OFFSET rflags,0*/
- pushq $__KERNEL_CS /* cs */
- CFI_ADJUST_CFA_OFFSET 8
- /*CFI_REL_OFFSET cs,0*/
- pushq \child_rip /* rip */
- CFI_ADJUST_CFA_OFFSET 8
- CFI_REL_OFFSET rip,0
- pushq %rax /* orig rax */
- CFI_ADJUST_CFA_OFFSET 8
- .endm
-
- .macro UNFAKE_STACK_FRAME
- addq $8*6, %rsp
- CFI_ADJUST_CFA_OFFSET -(6*8)
- .endm
-
-/*
- * initial frame state for interrupts (and exceptions without error code)
- */
- .macro EMPTY_FRAME start=1 offset=0
- .if \start
- CFI_STARTPROC simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,8+\offset
- .else
- CFI_DEF_CFA_OFFSET 8+\offset
- .endif
- .endm
-
-/*
- * initial frame state for interrupts (and exceptions without error code)
- */
- .macro INTR_FRAME start=1 offset=0
- EMPTY_FRAME \start, SS+8+\offset-RIP
- /*CFI_REL_OFFSET ss, SS+\offset-RIP*/
- CFI_REL_OFFSET rsp, RSP+\offset-RIP
- /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/
- /*CFI_REL_OFFSET cs, CS+\offset-RIP*/
- CFI_REL_OFFSET rip, RIP+\offset-RIP
- .endm
-
-/*
- * initial frame state for exceptions with error code (and interrupts
- * with vector already pushed)
- */
- .macro XCPT_FRAME start=1 offset=0
- INTR_FRAME \start, RIP+\offset-ORIG_RAX
- /*CFI_REL_OFFSET orig_rax, ORIG_RAX-ORIG_RAX*/
- .endm
-
-/*
- * frame that enables calling into C.
- */
- .macro PARTIAL_FRAME start=1 offset=0
- XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET
- CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET
- CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET
- CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET
- CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET
- CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET
- CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET
- CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET
- CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET
- CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET
- .endm
-
-/*
- * frame that enables passing a complete pt_regs to a C function.
- */
- .macro DEFAULT_FRAME start=1 offset=0
- PARTIAL_FRAME \start, R11+\offset-R15
- CFI_REL_OFFSET rbx, RBX+\offset
- CFI_REL_OFFSET rbp, RBP+\offset
- CFI_REL_OFFSET r12, R12+\offset
- CFI_REL_OFFSET r13, R13+\offset
- CFI_REL_OFFSET r14, R14+\offset
- CFI_REL_OFFSET r15, R15+\offset
- .endm
-
-/* save partial stack frame */
-ENTRY(save_args)
- XCPT_FRAME
- cld
- movq_cfi rdi, RDI+16-ARGOFFSET
- movq_cfi rsi, RSI+16-ARGOFFSET
- movq_cfi rdx, RDX+16-ARGOFFSET
- movq_cfi rcx, RCX+16-ARGOFFSET
- movq_cfi rax, RAX+16-ARGOFFSET
- movq_cfi r8, R8+16-ARGOFFSET
- movq_cfi r9, R9+16-ARGOFFSET
- movq_cfi r10, R10+16-ARGOFFSET
- movq_cfi r11, R11+16-ARGOFFSET
-
- leaq -ARGOFFSET+16(%rsp),%rdi /* arg1 for handler */
- movq_cfi rbp, 8 /* push %rbp */
- leaq 8(%rsp), %rbp /* mov %rsp, %ebp */
- testl $3, CS(%rdi)
- je 1f
- SWAPGS
- /*
- * irq_count is used to check if a CPU is already on an interrupt stack
- * or not. While this is essentially redundant with preempt_count it is
- * a little cheaper to use a separate counter in the PDA (short of
- * moving irq_enter into assembly, which would be too much work)
- */
-1: incl PER_CPU_VAR(irq_count)
- jne 2f
- popq_cfi %rax /* move return address... */
- mov PER_CPU_VAR(irq_stack_ptr),%rsp
- EMPTY_FRAME 0
- pushq_cfi %rbp /* backlink for unwinder */
- pushq_cfi %rax /* ... to the new stack */
- /*
- * We entered an interrupt context - irqs are off:
- */
-2: TRACE_IRQS_OFF
- ret
- CFI_ENDPROC
-END(save_args)
-
-ENTRY(save_rest)
- PARTIAL_FRAME 1 REST_SKIP+8
- movq 5*8+16(%rsp), %r11 /* save return address */
- movq_cfi rbx, RBX+16
- movq_cfi rbp, RBP+16
- movq_cfi r12, R12+16
- movq_cfi r13, R13+16
- movq_cfi r14, R14+16
- movq_cfi r15, R15+16
- movq %r11, 8(%rsp) /* return address */
- FIXUP_TOP_OF_STACK %r11, 16
- ret
- CFI_ENDPROC
-END(save_rest)
-
-/* save complete stack frame */
- .pushsection .kprobes.text, "ax"
-ENTRY(save_paranoid)
- XCPT_FRAME 1 RDI+8
- cld
- movq_cfi rdi, RDI+8
- movq_cfi rsi, RSI+8
- movq_cfi rdx, RDX+8
- movq_cfi rcx, RCX+8
- movq_cfi rax, RAX+8
- movq_cfi r8, R8+8
- movq_cfi r9, R9+8
- movq_cfi r10, R10+8
- movq_cfi r11, R11+8
- movq_cfi rbx, RBX+8
- movq_cfi rbp, RBP+8
- movq_cfi r12, R12+8
- movq_cfi r13, R13+8
- movq_cfi r14, R14+8
- movq_cfi r15, R15+8
- movl $1,%ebx
- movl $MSR_GS_BASE,%ecx
- rdmsr
- testl %edx,%edx
- js 1f /* negative -> in kernel */
- SWAPGS
- xorl %ebx,%ebx
-1: ret
- CFI_ENDPROC
-END(save_paranoid)
- .popsection
-
-/*
- * A newly forked process directly context switches into this address.
- *
- * rdi: prev task we switched from
- */
-ENTRY(ret_from_fork)
- DEFAULT_FRAME
-
- LOCK ; btr $TIF_FORK,TI_flags(%r8)
-
- push kernel_eflags(%rip)
- CFI_ADJUST_CFA_OFFSET 8
- popf # reset kernel eflags
- CFI_ADJUST_CFA_OFFSET -8
-
- call schedule_tail # rdi: 'prev' task parameter
-
- GET_THREAD_INFO(%rcx)
-
- RESTORE_REST
-
- testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
- je int_ret_from_sys_call
-
- testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET
- jnz int_ret_from_sys_call
-
- RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
- jmp ret_from_sys_call # go to the SYSRET fastpath
-
- CFI_ENDPROC
-END(ret_from_fork)
-
-/*
- * System call entry. Up to 6 arguments in registers are supported.
- *
- * SYSCALL does not save anything on the stack and does not change the
- * stack pointer.
- */
-
-/*
- * Register setup:
- * rax system call number
- * rdi arg0
- * rcx return address for syscall/sysret, C arg3
- * rsi arg1
- * rdx arg2
- * r10 arg3 (--> moved to rcx for C)
- * r8 arg4
- * r9 arg5
- * r11 eflags for syscall/sysret, temporary for C
- * r12-r15,rbp,rbx saved by C code, not touched.
- *
- * Interrupts are off on entry.
- * Only called from user space.
- *
- * XXX	if we had a free scratch register we could save the RSP into the stack frame
- *      and report it properly in ps. Unfortunately we don't have one.
- *
- * When the user can change the frame, always force IRET: it copes with
- * non-canonical addresses better. SYSRET has trouble with them due to
- * bugs in both AMD and Intel CPUs.
- */
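Seen from userspace, the same convention looks like this: the syscall number goes in rax, arguments in rdi/rsi/rdx/r10/r8/r9, and the SYSCALL instruction itself clobbers rcx (return address) and r11 (rflags), which is exactly why the entry code above treats those two registers specially. A minimal raw three-argument syscall sketch, for illustration:

/* Raw three-argument system call, illustrating the register convention. */
static long example_raw_syscall3(long nr, long a0, long a1, long a2)
{
	long ret;

	asm volatile ("syscall"
		      : "=a" (ret)
		      : "a" (nr), "D" (a0), "S" (a1), "d" (a2)
		      : "rcx", "r11", "memory");
	return ret;
}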
-
-ENTRY(system_call)
- CFI_STARTPROC simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET
- CFI_REGISTER rip,rcx
- /*CFI_REGISTER rflags,r11*/
- SWAPGS_UNSAFE_STACK
- /*
- * A hypervisor implementation might want to use a label
- * after the swapgs, so that it can do the swapgs
- * for the guest and jump here on syscall.
- */
-ENTRY(system_call_after_swapgs)
-
- movq %rsp,PER_CPU_VAR(old_rsp)
- movq PER_CPU_VAR(kernel_stack),%rsp
- /*
- * No need to follow this irqs off/on section - it's straight
- * and short:
- */
- ENABLE_INTERRUPTS(CLBR_NONE)
- SAVE_ARGS 8,1
- movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
- movq %rcx,RIP-ARGOFFSET(%rsp)
- CFI_REL_OFFSET rip,RIP-ARGOFFSET
- GET_THREAD_INFO(%rcx)
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
- jnz tracesys
-system_call_fastpath:
- cmpq $__NR_syscall_max,%rax
- ja badsys
- movq %r10,%rcx
- call *sys_call_table(,%rax,8) # XXX: rip relative
- movq %rax,RAX-ARGOFFSET(%rsp)
-/*
- * Syscall return path ending with SYSRET (fast path)
- * Has incomplete stack frame and undefined top of stack.
- */
-ret_from_sys_call:
- movl $_TIF_ALLWORK_MASK,%edi
- /* edi: flagmask */
-sysret_check:
- LOCKDEP_SYS_EXIT
- GET_THREAD_INFO(%rcx)
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- movl TI_flags(%rcx),%edx
- andl %edi,%edx
- jnz sysret_careful
- CFI_REMEMBER_STATE
- /*
- * sysretq will re-enable interrupts:
- */
- TRACE_IRQS_ON
- movq RIP-ARGOFFSET(%rsp),%rcx
- CFI_REGISTER rip,rcx
- RESTORE_ARGS 0,-ARG_SKIP,1
- /*CFI_REGISTER rflags,r11*/
- movq PER_CPU_VAR(old_rsp), %rsp
- USERGS_SYSRET64
-
- CFI_RESTORE_STATE
- /* Handle reschedules */
- /* edx: work, edi: workmask */
-sysret_careful:
- bt $TIF_NEED_RESCHED,%edx
- jnc sysret_signal
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
- pushq %rdi
- CFI_ADJUST_CFA_OFFSET 8
- call schedule
- popq %rdi
- CFI_ADJUST_CFA_OFFSET -8
- jmp sysret_check
-
- /* Handle a signal */
-sysret_signal:
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
-#ifdef CONFIG_AUDITSYSCALL
- bt $TIF_SYSCALL_AUDIT,%edx
- jc sysret_audit
-#endif
- /* edx: work flags (arg3) */
- leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
- xorl %esi,%esi # oldset -> arg2
- SAVE_REST
- FIXUP_TOP_OF_STACK %r11
- call do_notify_resume
- RESTORE_TOP_OF_STACK %r11
- RESTORE_REST
- movl $_TIF_WORK_MASK,%edi
- /* Use IRET because user could have changed frame. This
- works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- jmp int_with_check
-
-badsys:
- movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
- jmp ret_from_sys_call
-
-#ifdef CONFIG_AUDITSYSCALL
- /*
- * Fast path for syscall audit without full syscall trace.
- * We just call audit_syscall_entry() directly, and then
- * jump back to the normal fast path.
- */
-auditsys:
- movq %r10,%r9 /* 6th arg: 4th syscall arg */
- movq %rdx,%r8 /* 5th arg: 3rd syscall arg */
- movq %rsi,%rcx /* 4th arg: 2nd syscall arg */
- movq %rdi,%rdx /* 3rd arg: 1st syscall arg */
- movq %rax,%rsi /* 2nd arg: syscall number */
- movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */
- call audit_syscall_entry
- LOAD_ARGS 0 /* reload call-clobbered registers */
- jmp system_call_fastpath
-
- /*
- * Return fast path for syscall audit. Call audit_syscall_exit()
- * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
- * masked off.
- */
-sysret_audit:
- movq %rax,%rsi /* second arg, syscall return value */
- cmpq $0,%rax /* is it < 0? */
- setl %al /* 1 if so, 0 if not */
- movzbl %al,%edi /* zero-extend that into %edi */
- inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
- call audit_syscall_exit
- movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
- jmp sysret_check
-#endif /* CONFIG_AUDITSYSCALL */
-
- /* Do syscall tracing */
-tracesys:
-#ifdef CONFIG_AUDITSYSCALL
- testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
- jz auditsys
-#endif
- SAVE_REST
- movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
- FIXUP_TOP_OF_STACK %rdi
- movq %rsp,%rdi
- call syscall_trace_enter
- /*
- * Reload arg registers from stack in case ptrace changed them.
- * We don't reload %rax because syscall_trace_enter() returned
- * the value it wants us to use in the table lookup.
- */
- LOAD_ARGS ARGOFFSET, 1
- RESTORE_REST
- cmpq $__NR_syscall_max,%rax
- ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
- movq %r10,%rcx /* fixup for C */
- call *sys_call_table(,%rax,8)
- movq %rax,RAX-ARGOFFSET(%rsp)
- /* Use IRET because user could have changed frame */
-
-/*
- * Syscall return path ending with IRET.
- * Has correct top of stack, but partial stack frame.
- */
-GLOBAL(int_ret_from_sys_call)
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- testl $3,CS-ARGOFFSET(%rsp)
- je retint_restore_args
- movl $_TIF_ALLWORK_MASK,%edi
- /* edi: mask to check */
-GLOBAL(int_with_check)
- LOCKDEP_SYS_EXIT_IRQ
- GET_THREAD_INFO(%rcx)
- movl TI_flags(%rcx),%edx
- andl %edi,%edx
- jnz int_careful
- andl $~TS_COMPAT,TI_status(%rcx)
- jmp retint_swapgs
-
- /* Either reschedule or signal or syscall exit tracking needed. */
- /* First do a reschedule test. */
- /* edx: work, edi: workmask */
-int_careful:
- bt $TIF_NEED_RESCHED,%edx
- jnc int_very_careful
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
- pushq %rdi
- CFI_ADJUST_CFA_OFFSET 8
- call schedule
- popq %rdi
- CFI_ADJUST_CFA_OFFSET -8
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- jmp int_with_check
-
- /* handle signals and tracing -- both require a full stack frame */
-int_very_careful:
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
- SAVE_REST
- /* Check for syscall exit trace */
- testl $_TIF_WORK_SYSCALL_EXIT,%edx
- jz int_signal
- pushq %rdi
- CFI_ADJUST_CFA_OFFSET 8
- leaq 8(%rsp),%rdi # &ptregs -> arg1
- call syscall_trace_leave
- popq %rdi
- CFI_ADJUST_CFA_OFFSET -8
- andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
- jmp int_restore_rest
-
-int_signal:
- testl $_TIF_DO_NOTIFY_MASK,%edx
- jz 1f
- movq %rsp,%rdi # &ptregs -> arg1
- xorl %esi,%esi # oldset -> arg2
- call do_notify_resume
-1: movl $_TIF_WORK_MASK,%edi
-int_restore_rest:
- RESTORE_REST
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- jmp int_with_check
- CFI_ENDPROC
-END(system_call)
-
-/*
- * Certain special system calls that need to save a complete full stack frame.
- */
- .macro PTREGSCALL label,func,arg
-ENTRY(\label)
- PARTIAL_FRAME 1 8 /* offset 8: return address */
- subq $REST_SKIP, %rsp
- CFI_ADJUST_CFA_OFFSET REST_SKIP
- call save_rest
- DEFAULT_FRAME 0 8 /* offset 8: return address */
- leaq 8(%rsp), \arg /* pt_regs pointer */
- call \func
- jmp ptregscall_common
- CFI_ENDPROC
-END(\label)
- .endm
-
- PTREGSCALL stub_clone, sys_clone, %r8
- PTREGSCALL stub_fork, sys_fork, %rdi
- PTREGSCALL stub_vfork, sys_vfork, %rdi
- PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
- PTREGSCALL stub_iopl, sys_iopl, %rsi
-
-ENTRY(ptregscall_common)
- DEFAULT_FRAME 1 8 /* offset 8: return address */
- RESTORE_TOP_OF_STACK %r11, 8
- movq_cfi_restore R15+8, r15
- movq_cfi_restore R14+8, r14
- movq_cfi_restore R13+8, r13
- movq_cfi_restore R12+8, r12
- movq_cfi_restore RBP+8, rbp
- movq_cfi_restore RBX+8, rbx
- ret $REST_SKIP /* pop extended registers */
- CFI_ENDPROC
-END(ptregscall_common)
-
-ENTRY(stub_execve)
- CFI_STARTPROC
- popq %r11
- CFI_ADJUST_CFA_OFFSET -8
- CFI_REGISTER rip, r11
- SAVE_REST
- FIXUP_TOP_OF_STACK %r11
- movq %rsp, %rcx
- call sys_execve
- RESTORE_TOP_OF_STACK %r11
- movq %rax,RAX(%rsp)
- RESTORE_REST
- jmp int_ret_from_sys_call
- CFI_ENDPROC
-END(stub_execve)
-
-/*
- * sigreturn is special because it needs to restore all registers on return.
- * This cannot be done with SYSRET, so use the IRET return path instead.
- */
-ENTRY(stub_rt_sigreturn)
- CFI_STARTPROC
- addq $8, %rsp
- CFI_ADJUST_CFA_OFFSET -8
- SAVE_REST
- movq %rsp,%rdi
- FIXUP_TOP_OF_STACK %r11
- call sys_rt_sigreturn
- movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
- RESTORE_REST
- jmp int_ret_from_sys_call
- CFI_ENDPROC
-END(stub_rt_sigreturn)
-
-/*
- * Build the entry stubs and pointer table with some assembler magic.
- * We pack 7 stubs into a single 32-byte chunk, which will fit in a
- * single cache line on all modern x86 implementations.
- */
- .section .init.rodata,"a"
-ENTRY(interrupt)
- .text
- .p2align 5
- .p2align CONFIG_X86_L1_CACHE_SHIFT
-ENTRY(irq_entries_start)
- INTR_FRAME
-vector=FIRST_EXTERNAL_VECTOR
-.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7
- .balign 32
- .rept 7
- .if vector < NR_VECTORS
- .if vector <> FIRST_EXTERNAL_VECTOR
- CFI_ADJUST_CFA_OFFSET -8
- .endif
-1: pushq $(~vector+0x80) /* Note: always in signed byte range */
- CFI_ADJUST_CFA_OFFSET 8
- .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
- jmp 2f
- .endif
- .previous
- .quad 1b
- .text
-vector=vector+1
- .endif
- .endr
-2: jmp common_interrupt
-.endr
- CFI_ENDPROC
-END(irq_entries_start)
-
-.previous
-END(interrupt)
-.previous
-
-/*
- * Interrupt entry/exit.
- *
- * Interrupt entry points save only callee-clobbered registers in the fast path.
- *
- * Entry runs with interrupts off.
- */
-
-/* 0(%rsp): ~(interrupt number) */
- .macro interrupt func
- subq $10*8, %rsp
- CFI_ADJUST_CFA_OFFSET 10*8
- call save_args
- PARTIAL_FRAME 0
- call \func
- .endm
-
- /*
- * The interrupt stubs push (~vector+0x80) onto the stack and
- * then jump to common_interrupt.
- */
- .p2align CONFIG_X86_L1_CACHE_SHIFT
-common_interrupt:
- XCPT_FRAME
- addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */
- interrupt do_IRQ
- /* 0(%rsp): old_rsp-ARGOFFSET */
-ret_from_intr:
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- decl PER_CPU_VAR(irq_count)
- leaveq
- CFI_DEF_CFA_REGISTER rsp
- CFI_ADJUST_CFA_OFFSET -8
-exit_intr:
- GET_THREAD_INFO(%rcx)
- testl $3,CS-ARGOFFSET(%rsp)
- je retint_kernel
-
- /* Interrupt came from user space */
- /*
- * Has a correct top of stack, but a partial stack frame
- * %rcx: thread info. Interrupts off.
- */
-retint_with_reschedule:
- movl $_TIF_WORK_MASK,%edi
-retint_check:
- LOCKDEP_SYS_EXIT_IRQ
- movl TI_flags(%rcx),%edx
- andl %edi,%edx
- CFI_REMEMBER_STATE
- jnz retint_careful
-
-retint_swapgs: /* return to user-space */
- /*
- * The iretq could re-enable interrupts:
- */
- DISABLE_INTERRUPTS(CLBR_ANY)
- TRACE_IRQS_IRETQ
- SWAPGS
- jmp restore_args
-
-retint_restore_args: /* return to kernel space */
- DISABLE_INTERRUPTS(CLBR_ANY)
- /*
- * The iretq could re-enable interrupts:
- */
- TRACE_IRQS_IRETQ
-restore_args:
- RESTORE_ARGS 0,8,0
-
-irq_return:
- INTERRUPT_RETURN
-
- .section __ex_table, "a"
- .quad irq_return, bad_iret
- .previous
-
-#ifdef CONFIG_PARAVIRT
-ENTRY(native_iret)
- iretq
-
- .section __ex_table,"a"
- .quad native_iret, bad_iret
- .previous
-#endif
-
- .section .fixup,"ax"
-bad_iret:
- /*
- * The iret traps when the %cs or %ss being restored is bogus.
- * We've lost the original trap vector and error code.
- * #GPF is the most likely one to get for an invalid selector.
- * So pretend we completed the iret and took the #GPF in user mode.
- *
- * We are now running with the kernel GS after exception recovery.
- * But error_entry expects us to have user GS to match the user %cs,
- * so swap back.
- */
- pushq $0
-
- SWAPGS
- jmp general_protection
-
- .previous
-
- /* edi: workmask, edx: work */
-retint_careful:
- CFI_RESTORE_STATE
- bt $TIF_NEED_RESCHED,%edx
- jnc retint_signal
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
- pushq %rdi
- CFI_ADJUST_CFA_OFFSET 8
- call schedule
- popq %rdi
- CFI_ADJUST_CFA_OFFSET -8
- GET_THREAD_INFO(%rcx)
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- jmp retint_check
-
-retint_signal:
- testl $_TIF_DO_NOTIFY_MASK,%edx
- jz retint_swapgs
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
- SAVE_REST
- movq $-1,ORIG_RAX(%rsp)
- xorl %esi,%esi # oldset
- movq %rsp,%rdi # &pt_regs
- call do_notify_resume
- RESTORE_REST
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- GET_THREAD_INFO(%rcx)
- jmp retint_with_reschedule
-
-#ifdef CONFIG_PREEMPT
- /* Returning to kernel space. Check if we need preemption */
- /* rcx: threadinfo. interrupts off. */
-ENTRY(retint_kernel)
- cmpl $0,TI_preempt_count(%rcx)
- jnz retint_restore_args
- bt $TIF_NEED_RESCHED,TI_flags(%rcx)
- jnc retint_restore_args
- bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
- jnc retint_restore_args
- call preempt_schedule_irq
- jmp exit_intr
-#endif
-
- CFI_ENDPROC
-END(common_interrupt)
-
-/*
- * APIC interrupts.
- */
-.macro apicinterrupt num sym do_sym
-ENTRY(\sym)
- INTR_FRAME
- pushq $~(\num)
- CFI_ADJUST_CFA_OFFSET 8
- interrupt \do_sym
- jmp ret_from_intr
- CFI_ENDPROC
-END(\sym)
-.endm
-
-#ifdef CONFIG_SMP
-apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
- irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
-apicinterrupt REBOOT_VECTOR \
- reboot_interrupt smp_reboot_interrupt
-#endif
-
-#ifdef CONFIG_X86_UV
-apicinterrupt UV_BAU_MESSAGE \
- uv_bau_message_intr1 uv_bau_message_interrupt
-#endif
-apicinterrupt LOCAL_TIMER_VECTOR \
- apic_timer_interrupt smp_apic_timer_interrupt
-apicinterrupt GENERIC_INTERRUPT_VECTOR \
- generic_interrupt smp_generic_interrupt
-
-#ifdef CONFIG_SMP
-apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
- invalidate_interrupt0 smp_invalidate_interrupt
-apicinterrupt INVALIDATE_TLB_VECTOR_START+1 \
- invalidate_interrupt1 smp_invalidate_interrupt
-apicinterrupt INVALIDATE_TLB_VECTOR_START+2 \
- invalidate_interrupt2 smp_invalidate_interrupt
-apicinterrupt INVALIDATE_TLB_VECTOR_START+3 \
- invalidate_interrupt3 smp_invalidate_interrupt
-apicinterrupt INVALIDATE_TLB_VECTOR_START+4 \
- invalidate_interrupt4 smp_invalidate_interrupt
-apicinterrupt INVALIDATE_TLB_VECTOR_START+5 \
- invalidate_interrupt5 smp_invalidate_interrupt
-apicinterrupt INVALIDATE_TLB_VECTOR_START+6 \
- invalidate_interrupt6 smp_invalidate_interrupt
-apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \
- invalidate_interrupt7 smp_invalidate_interrupt
-#endif
-
-apicinterrupt THRESHOLD_APIC_VECTOR \
- threshold_interrupt smp_threshold_interrupt
-apicinterrupt THERMAL_APIC_VECTOR \
- thermal_interrupt smp_thermal_interrupt
-
-#ifdef CONFIG_X86_MCE
-apicinterrupt MCE_SELF_VECTOR \
- mce_self_interrupt smp_mce_self_interrupt
-#endif
-
-#ifdef CONFIG_SMP
-apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
- call_function_single_interrupt smp_call_function_single_interrupt
-apicinterrupt CALL_FUNCTION_VECTOR \
- call_function_interrupt smp_call_function_interrupt
-apicinterrupt RESCHEDULE_VECTOR \
- reschedule_interrupt smp_reschedule_interrupt
-#endif
-
-apicinterrupt ERROR_APIC_VECTOR \
- error_interrupt smp_error_interrupt
-apicinterrupt SPURIOUS_APIC_VECTOR \
- spurious_interrupt smp_spurious_interrupt
-
-#ifdef CONFIG_PERF_COUNTERS
-apicinterrupt LOCAL_PENDING_VECTOR \
- perf_pending_interrupt smp_perf_pending_interrupt
-#endif
-
-/*
- * Exception entry points.
- */
-.macro zeroentry sym do_sym
-ENTRY(\sym)
- INTR_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
- subq $15*8,%rsp
- CFI_ADJUST_CFA_OFFSET 15*8
- call error_entry
- DEFAULT_FRAME 0
- movq %rsp,%rdi /* pt_regs pointer */
- xorl %esi,%esi /* no error code */
- call \do_sym
- jmp error_exit /* %ebx: no swapgs flag */
- CFI_ENDPROC
-END(\sym)
-.endm
-
-.macro paranoidzeroentry sym do_sym
-ENTRY(\sym)
- INTR_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq $-1 /* ORIG_RAX: no syscall to restart */
- CFI_ADJUST_CFA_OFFSET 8
- subq $15*8, %rsp
- call save_paranoid
- TRACE_IRQS_OFF
- movq %rsp,%rdi /* pt_regs pointer */
- xorl %esi,%esi /* no error code */
- call \do_sym
- jmp paranoid_exit /* %ebx: no swapgs flag */
- CFI_ENDPROC
-END(\sym)
-.endm
-
-.macro paranoidzeroentry_ist sym do_sym ist
-ENTRY(\sym)
- INTR_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq $-1 /* ORIG_RAX: no syscall to restart */
- CFI_ADJUST_CFA_OFFSET 8
- subq $15*8, %rsp
- call save_paranoid
- TRACE_IRQS_OFF
- movq %rsp,%rdi /* pt_regs pointer */
- xorl %esi,%esi /* no error code */
- PER_CPU(init_tss, %rbp)
- subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp)
- call \do_sym
- addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp)
- jmp paranoid_exit /* %ebx: no swapgs flag */
- CFI_ENDPROC
-END(\sym)
-.endm
-
-.macro errorentry sym do_sym
-ENTRY(\sym)
- XCPT_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- subq $15*8,%rsp
- CFI_ADJUST_CFA_OFFSET 15*8
- call error_entry
- DEFAULT_FRAME 0
- movq %rsp,%rdi /* pt_regs pointer */
- movq ORIG_RAX(%rsp),%rsi /* get error code */
- movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
- call \do_sym
- jmp error_exit /* %ebx: no swapgs flag */
- CFI_ENDPROC
-END(\sym)
-.endm
-
- /* error code is on the stack already */
-.macro paranoiderrorentry sym do_sym
-ENTRY(\sym)
- XCPT_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- subq $15*8,%rsp
- CFI_ADJUST_CFA_OFFSET 15*8
- call save_paranoid
- DEFAULT_FRAME 0
- TRACE_IRQS_OFF
- movq %rsp,%rdi /* pt_regs pointer */
- movq ORIG_RAX(%rsp),%rsi /* get error code */
- movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
- call \do_sym
- jmp paranoid_exit /* %ebx: no swapgs flag */
- CFI_ENDPROC
-END(\sym)
-.endm
-
-zeroentry divide_error do_divide_error
-zeroentry overflow do_overflow
-zeroentry bounds do_bounds
-zeroentry invalid_op do_invalid_op
-zeroentry device_not_available do_device_not_available
-paranoiderrorentry double_fault do_double_fault
-zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun
-errorentry invalid_TSS do_invalid_TSS
-errorentry segment_not_present do_segment_not_present
-zeroentry spurious_interrupt_bug do_spurious_interrupt_bug
-zeroentry coprocessor_error do_coprocessor_error
-errorentry alignment_check do_alignment_check
-zeroentry simd_coprocessor_error do_simd_coprocessor_error
-
- /* Reload gs selector with exception handling */
- /* edi: new selector */
-ENTRY(native_load_gs_index)
- CFI_STARTPROC
- pushf
- CFI_ADJUST_CFA_OFFSET 8
- DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
- SWAPGS
-gs_change:
- movl %edi,%gs
-2: mfence /* workaround */
- SWAPGS
- popf
- CFI_ADJUST_CFA_OFFSET -8
- ret
- CFI_ENDPROC
-END(native_load_gs_index)
-
- .section __ex_table,"a"
- .align 8
- .quad gs_change,bad_gs
- .previous
- .section .fixup,"ax"
- /* running with kernelgs */
-bad_gs:
- SWAPGS /* switch back to user gs */
- xorl %eax,%eax
- movl %eax,%gs
- jmp 2b
- .previous
-
-/*
- * Create a kernel thread.
- *
- * C extern interface:
- * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
- *
- * asm input arguments:
- * rdi: fn, rsi: arg, rdx: flags
- */
-ENTRY(kernel_thread)
- CFI_STARTPROC
- FAKE_STACK_FRAME $child_rip
- SAVE_ALL
-
- # rdi: flags, rsi: usp, rdx: will be &pt_regs
- movq %rdx,%rdi
- orq kernel_thread_flags(%rip),%rdi
- movq $-1, %rsi
- movq %rsp, %rdx
-
- xorl %r8d,%r8d
- xorl %r9d,%r9d
-
- # clone now
- call do_fork
- movq %rax,RAX(%rsp)
- xorl %edi,%edi
-
- /*
-	 * It isn't worth checking for a reschedule here, so internally to
-	 * the x86_64 port you can rely on kernel_thread() not rescheduling
-	 * the child before returning; this avoids the need for hacks, for
-	 * example to fork off the per-CPU idle tasks.
- * [Hopefully no generic code relies on the reschedule -AK]
- */
- RESTORE_ALL
- UNFAKE_STACK_FRAME
- ret
- CFI_ENDPROC
-END(kernel_thread)
-
-ENTRY(child_rip)
- pushq $0 # fake return address
- CFI_STARTPROC
- /*
- * Here we are in the child and the registers are set as they were
- * at kernel_thread() invocation in the parent.
- */
- movq %rdi, %rax
- movq %rsi, %rdi
- call *%rax
- # exit
- mov %eax, %edi
- call do_exit
- ud2 # padding for call trace
- CFI_ENDPROC
-END(child_rip)
-
-/*
- * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
- *
- * C extern interface:
- * extern long execve(char *name, char **argv, char **envp)
- *
- * asm input arguments:
- * rdi: name, rsi: argv, rdx: envp
- *
- * We want to fall back into:
- * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs)
- *
- * do_sys_execve asm fallback arguments:
- * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
- */
-ENTRY(kernel_execve)
- CFI_STARTPROC
- FAKE_STACK_FRAME $0
- SAVE_ALL
- movq %rsp,%rcx
- call sys_execve
- movq %rax, RAX(%rsp)
- RESTORE_REST
- testq %rax,%rax
- je int_ret_from_sys_call
- RESTORE_ARGS
- UNFAKE_STACK_FRAME
- ret
- CFI_ENDPROC
-END(kernel_execve)
-
-/* Call softirq on interrupt stack. Interrupts are off. */
-ENTRY(call_softirq)
- CFI_STARTPROC
- push %rbp
- CFI_ADJUST_CFA_OFFSET 8
- CFI_REL_OFFSET rbp,0
- mov %rsp,%rbp
- CFI_DEF_CFA_REGISTER rbp
- incl PER_CPU_VAR(irq_count)
- cmove PER_CPU_VAR(irq_stack_ptr),%rsp
- push %rbp # backlink for old unwinder
- call __do_softirq
- leaveq
- CFI_DEF_CFA_REGISTER rsp
- CFI_ADJUST_CFA_OFFSET -8
- decl PER_CPU_VAR(irq_count)
- ret
- CFI_ENDPROC
-END(call_softirq)
-
-#ifdef CONFIG_XEN
-zeroentry xen_hypervisor_callback xen_do_hypervisor_callback
-
-/*
- * A note on the "critical region" in our callback handler.
- * We want to avoid stacking callback handlers due to events occurring
- * during handling of the last event. To do this, we keep events disabled
- * until we've done all processing. HOWEVER, we must enable events before
- * popping the stack frame (can't be done atomically) and so it would still
- * be possible to get enough handler activations to overflow the stack.
- * Although unlikely, bugs of that kind are hard to track down, so we'd
- * like to avoid the possibility.
- * So, on entry to the handler we detect whether we interrupted an
- * existing activation in its critical region -- if so, we pop the current
- * activation and restart the handler using the previous one.
- */
-ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
- CFI_STARTPROC
-/*
- * Since we don't modify %rdi, xen_evtchn_do_upcall(struct pt_regs *) will
- * see the correct pointer to the pt_regs
- */
- movq %rdi, %rsp # we don't return, adjust the stack frame
- CFI_ENDPROC
- DEFAULT_FRAME
-11: incl PER_CPU_VAR(irq_count)
- movq %rsp,%rbp
- CFI_DEF_CFA_REGISTER rbp
- cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
- pushq %rbp # backlink for old unwinder
- call xen_evtchn_do_upcall
- popq %rsp
- CFI_DEF_CFA_REGISTER rsp
- decl PER_CPU_VAR(irq_count)
- jmp error_exit
- CFI_ENDPROC
-END(do_hypervisor_callback)
-
-/*
- * Hypervisor uses this for application faults while it executes.
- * We get here for two reasons:
- * 1. Fault while reloading DS, ES, FS or GS
- * 2. Fault while executing IRET
- * Category 1 we do not need to fix up as Xen has already reloaded all segment
- * registers that could be reloaded and zeroed the others.
- * Category 2 we fix up by killing the current process. We cannot use the
- * normal Linux return path in this case because if we use the IRET hypercall
- * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
- * We distinguish between categories by comparing each saved segment register
- * with its current contents: any discrepancy means we are in category 1.
- */
-ENTRY(xen_failsafe_callback)
- INTR_FRAME 1 (6*8)
- /*CFI_REL_OFFSET gs,GS*/
- /*CFI_REL_OFFSET fs,FS*/
- /*CFI_REL_OFFSET es,ES*/
- /*CFI_REL_OFFSET ds,DS*/
- CFI_REL_OFFSET r11,8
- CFI_REL_OFFSET rcx,0
- movw %ds,%cx
- cmpw %cx,0x10(%rsp)
- CFI_REMEMBER_STATE
- jne 1f
- movw %es,%cx
- cmpw %cx,0x18(%rsp)
- jne 1f
- movw %fs,%cx
- cmpw %cx,0x20(%rsp)
- jne 1f
- movw %gs,%cx
- cmpw %cx,0x28(%rsp)
- jne 1f
- /* All segments match their saved values => Category 2 (Bad IRET). */
- movq (%rsp),%rcx
- CFI_RESTORE rcx
- movq 8(%rsp),%r11
- CFI_RESTORE r11
- addq $0x30,%rsp
- CFI_ADJUST_CFA_OFFSET -0x30
- pushq_cfi $0 /* RIP */
- pushq_cfi %r11
- pushq_cfi %rcx
- jmp general_protection
- CFI_RESTORE_STATE
-1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
- movq (%rsp),%rcx
- CFI_RESTORE rcx
- movq 8(%rsp),%r11
- CFI_RESTORE r11
- addq $0x30,%rsp
- CFI_ADJUST_CFA_OFFSET -0x30
- pushq_cfi $0
- SAVE_ALL
- jmp error_exit
- CFI_ENDPROC
-END(xen_failsafe_callback)
-
-#endif /* CONFIG_XEN */
-
-/*
- * Some functions should be protected against kprobes
- */
- .pushsection .kprobes.text, "ax"
-
-paranoidzeroentry_ist debug do_debug DEBUG_STACK
-paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
-paranoiderrorentry stack_segment do_stack_segment
-#ifdef CONFIG_XEN
-zeroentry xen_debug do_debug
-zeroentry xen_int3 do_int3
-errorentry xen_stack_segment do_stack_segment
-#endif
-errorentry general_protection do_general_protection
-errorentry page_fault do_page_fault
-#ifdef CONFIG_X86_MCE
-paranoidzeroentry machine_check *machine_check_vector(%rip)
-#endif
-
- /*
- * "Paranoid" exit path from exception stack.
- * Paranoid because this is used by NMIs and cannot take
- * any kernel state for granted.
- * We don't do kernel preemption checks here, because only
- * NMI should be common and it does not enable IRQs and
- * cannot get reschedule ticks.
- *
- * "trace" is 0 for the NMI handler only, because irq-tracing
- * is fundamentally NMI-unsafe. (we cannot change the soft and
- * hard flags at once, atomically)
- */
-
- /* ebx: no swapgs flag */
-ENTRY(paranoid_exit)
- INTR_FRAME
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- testl %ebx,%ebx /* swapgs needed? */
- jnz paranoid_restore
- testl $3,CS(%rsp)
- jnz paranoid_userspace
-paranoid_swapgs:
- TRACE_IRQS_IRETQ 0
- SWAPGS_UNSAFE_STACK
- RESTORE_ALL 8
- jmp irq_return
-paranoid_restore:
- TRACE_IRQS_IRETQ 0
- RESTORE_ALL 8
- jmp irq_return
-paranoid_userspace:
- GET_THREAD_INFO(%rcx)
- movl TI_flags(%rcx),%ebx
- andl $_TIF_WORK_MASK,%ebx
- jz paranoid_swapgs
- movq %rsp,%rdi /* &pt_regs */
- call sync_regs
- movq %rax,%rsp /* switch stack for scheduling */
- testl $_TIF_NEED_RESCHED,%ebx
- jnz paranoid_schedule
- movl %ebx,%edx /* arg3: thread flags */
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
- xorl %esi,%esi /* arg2: oldset */
- movq %rsp,%rdi /* arg1: &pt_regs */
- call do_notify_resume
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- jmp paranoid_userspace
-paranoid_schedule:
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_ANY)
- call schedule
- DISABLE_INTERRUPTS(CLBR_ANY)
- TRACE_IRQS_OFF
- jmp paranoid_userspace
- CFI_ENDPROC
-END(paranoid_exit)
-
-/*
- * Exception entry point. This expects an error code/orig_rax on the stack.
- * Returns the "no swapgs" flag in %ebx.
- */
-ENTRY(error_entry)
- XCPT_FRAME
- CFI_ADJUST_CFA_OFFSET 15*8
- /* oldrax contains error code */
- cld
- movq_cfi rdi, RDI+8
- movq_cfi rsi, RSI+8
- movq_cfi rdx, RDX+8
- movq_cfi rcx, RCX+8
- movq_cfi rax, RAX+8
- movq_cfi r8, R8+8
- movq_cfi r9, R9+8
- movq_cfi r10, R10+8
- movq_cfi r11, R11+8
- movq_cfi rbx, RBX+8
- movq_cfi rbp, RBP+8
- movq_cfi r12, R12+8
- movq_cfi r13, R13+8
- movq_cfi r14, R14+8
- movq_cfi r15, R15+8
- xorl %ebx,%ebx
- testl $3,CS+8(%rsp)
- je error_kernelspace
-error_swapgs:
- SWAPGS
-error_sti:
- TRACE_IRQS_OFF
- ret
- CFI_ENDPROC
-
-/*
- * There are two places in the kernel that can potentially fault with
- * usergs. Handle them here. The exception handlers after iret run with
- * kernel gs again, so don't set the user space flag. B stepping K8s
- * sometimes report a truncated RIP for IRET exceptions returning to
- * compat mode. Check for these here too.
- */
-error_kernelspace:
- incl %ebx
- leaq irq_return(%rip),%rcx
- cmpq %rcx,RIP+8(%rsp)
- je error_swapgs
- movl %ecx,%ecx /* zero extend */
- cmpq %rcx,RIP+8(%rsp)
- je error_swapgs
- cmpq $gs_change,RIP+8(%rsp)
- je error_swapgs
- jmp error_sti
-END(error_entry)
-
-
-/* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
-ENTRY(error_exit)
- DEFAULT_FRAME
- movl %ebx,%eax
- RESTORE_REST
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- GET_THREAD_INFO(%rcx)
- testl %eax,%eax
- jne retint_kernel
- LOCKDEP_SYS_EXIT_IRQ
- movl TI_flags(%rcx),%edx
- movl $_TIF_WORK_MASK,%edi
- andl %edi,%edx
- jnz retint_careful
- jmp retint_swapgs
- CFI_ENDPROC
-END(error_exit)
-
-
- /* runs on exception stack */
-ENTRY(nmi)
- INTR_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq_cfi $-1
- subq $15*8, %rsp
- CFI_ADJUST_CFA_OFFSET 15*8
- call save_paranoid
- DEFAULT_FRAME 0
- /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
- movq %rsp,%rdi
- movq $-1,%rsi
- call do_nmi
-#ifdef CONFIG_TRACE_IRQFLAGS
- /* paranoidexit; without TRACE_IRQS_OFF */
- /* ebx: no swapgs flag */
- DISABLE_INTERRUPTS(CLBR_NONE)
- testl %ebx,%ebx /* swapgs needed? */
- jnz nmi_restore
- testl $3,CS(%rsp)
- jnz nmi_userspace
-nmi_swapgs:
- SWAPGS_UNSAFE_STACK
-nmi_restore:
- RESTORE_ALL 8
- jmp irq_return
-nmi_userspace:
- GET_THREAD_INFO(%rcx)
- movl TI_flags(%rcx),%ebx
- andl $_TIF_WORK_MASK,%ebx
- jz nmi_swapgs
- movq %rsp,%rdi /* &pt_regs */
- call sync_regs
- movq %rax,%rsp /* switch stack for scheduling */
- testl $_TIF_NEED_RESCHED,%ebx
- jnz nmi_schedule
- movl %ebx,%edx /* arg3: thread flags */
- ENABLE_INTERRUPTS(CLBR_NONE)
- xorl %esi,%esi /* arg2: oldset */
- movq %rsp,%rdi /* arg1: &pt_regs */
- call do_notify_resume
- DISABLE_INTERRUPTS(CLBR_NONE)
- jmp nmi_userspace
-nmi_schedule:
- ENABLE_INTERRUPTS(CLBR_ANY)
- call schedule
- DISABLE_INTERRUPTS(CLBR_ANY)
- jmp nmi_userspace
- CFI_ENDPROC
-#else
- jmp paranoid_exit
- CFI_ENDPROC
-#endif
-END(nmi)
-
-ENTRY(ignore_sysret)
- CFI_STARTPROC
- mov $-ENOSYS,%eax
- sysret
- CFI_ENDPROC
-END(ignore_sysret)
-
-/*
- * End of kprobes section
- */
- .popsection
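
The interrupt stubs in the listing above push $(~vector + 0x80) so that the immediate always fits in a signed byte, and common_interrupt then adds -0x80 to bring the value into the [-256,-1] range. The following stand-alone C sketch (an editorial illustration, not part of the patch) just walks through that arithmetic; the only assumption is the 0..255 vector range used above.

/*
 * Stand-alone illustration of the vector encoding used by the stubs above:
 * ~vector + 0x80 == 127 - vector stays within [-128, 127] for any vector
 * 0..255, so a one-byte push immediate suffices; adding -0x80 in
 * common_interrupt yields ~vector, i.e. a value in [-256, -1], from which
 * the handler can recover the original vector as ~value.
 */
#include <stdio.h>

int main(void)
{
	for (int vector = 0; vector < 256; vector += 85) {
		int pushed    = ~vector + 0x80;	/* what the stub pushes */
		int adjusted  = pushed - 0x80;	/* after addq $-0x80,(%rsp) */
		int recovered = ~adjusted;	/* what the C handler sees */

		printf("vector %3d: pushed %4d adjusted %4d recovered %3d\n",
		       vector, pushed, adjusted, recovered);
	}
	return 0;
}
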
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
new file mode 100644
index 000000000000..6726e0473d0b
--- /dev/null
+++ b/arch/x86/kernel/espfix_64.c
@@ -0,0 +1,205 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* ----------------------------------------------------------------------- *
+ *
+ * Copyright 2014 Intel Corporation; author: H. Peter Anvin
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * The IRET instruction, when returning to a 16-bit segment, only
+ * restores the bottom 16 bits of the user space stack pointer. This
+ * causes some 16-bit software to break, but it also leaks kernel state
+ * to user space.
+ *
+ * We work around this by creating percpu "ministacks", each of which
+ * is mapped 2^16 times, 64K apart. When we detect that the return SS is
+ * on the LDT, we copy the IRET frame to the ministack and use the
+ * relevant alias to return to userspace. The ministacks are mapped
+ * read-only, so if the IRET faults we promote #GP to #DF, which is an IST
+ * vector and thus has its own stack; we then do the fixup in the #DF
+ * handler.
+ *
+ * This file sets up the ministacks and the related page tables. The
+ * actual ministack invocation is in entry_64.S.
+ */
+
+#include <linux/init.h>
+#include <linux/init_task.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/gfp.h>
+#include <linux/random.h>
+#include <linux/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/setup.h>
+#include <asm/espfix.h>
+
+/*
+ * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
+ * it up to a cache line to avoid unnecessary sharing.
+ */
+#define ESPFIX_STACK_SIZE (8*8UL)
+#define ESPFIX_STACKS_PER_PAGE (PAGE_SIZE/ESPFIX_STACK_SIZE)
+
+/* There is address space for how many espfix pages? */
+#define ESPFIX_PAGE_SPACE (1UL << (P4D_SHIFT-PAGE_SHIFT-16))
+
+#define ESPFIX_MAX_CPUS (ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE)
+#if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS
+# error "Need more virtual address space for the ESPFIX hack"
+#endif
+
+#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO)
+
+/* This contains the *bottom* address of the espfix stack */
+DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
+DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
+
+/* Initialization mutex - should this be a spinlock? */
+static DEFINE_MUTEX(espfix_init_mutex);
+
+/* Page allocation bitmap - each page serves ESPFIX_STACKS_PER_PAGE CPUs */
+#define ESPFIX_MAX_PAGES DIV_ROUND_UP(CONFIG_NR_CPUS, ESPFIX_STACKS_PER_PAGE)
+static void *espfix_pages[ESPFIX_MAX_PAGES];
+
+static __page_aligned_bss pud_t espfix_pud_page[PTRS_PER_PUD]
+ __aligned(PAGE_SIZE);
+
+static unsigned int page_random, slot_random;
+
+/*
+ * This returns the bottom address of the espfix stack for a specific CPU.
+ * The math allows for a non-power-of-two ESPFIX_STACK_SIZE, in which case
+ * we have to account for some amount of padding at the end of each page.
+ */
+static inline unsigned long espfix_base_addr(unsigned int cpu)
+{
+ unsigned long page, slot;
+ unsigned long addr;
+
+ page = (cpu / ESPFIX_STACKS_PER_PAGE) ^ page_random;
+ slot = (cpu + slot_random) % ESPFIX_STACKS_PER_PAGE;
+ addr = (page << PAGE_SHIFT) + (slot * ESPFIX_STACK_SIZE);
+ addr = (addr & 0xffffUL) | ((addr & ~0xffffUL) << 16);
+ addr += ESPFIX_BASE_ADDR;
+ return addr;
+}
+
+#define PTE_STRIDE (65536/PAGE_SIZE)
+#define ESPFIX_PTE_CLONES (PTRS_PER_PTE/PTE_STRIDE)
+#define ESPFIX_PMD_CLONES PTRS_PER_PMD
+#define ESPFIX_PUD_CLONES (65536/(ESPFIX_PTE_CLONES*ESPFIX_PMD_CLONES))
+
+#define PGTABLE_PROT ((_KERNPG_TABLE & ~_PAGE_RW) | _PAGE_NX)
+
+static void init_espfix_random(void)
+{
+ unsigned long rand = get_random_long();
+
+ slot_random = rand % ESPFIX_STACKS_PER_PAGE;
+ page_random = (rand / ESPFIX_STACKS_PER_PAGE)
+ & (ESPFIX_PAGE_SPACE - 1);
+}
+
+void __init init_espfix_bsp(void)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+
+ /* FRED systems always restore the full value of %rsp */
+ if (cpu_feature_enabled(X86_FEATURE_FRED))
+ return;
+
+ /* Install the espfix pud into the kernel page directory */
+ pgd = &init_top_pgt[pgd_index(ESPFIX_BASE_ADDR)];
+ p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR);
+ p4d_populate(&init_mm, p4d, espfix_pud_page);
+
+ /* Randomize the locations */
+ init_espfix_random();
+
+ /* The rest is the same as for any other processor */
+ init_espfix_ap(0);
+}
+
+void init_espfix_ap(int cpu)
+{
+ unsigned int page;
+ unsigned long addr;
+ pud_t pud, *pud_p;
+ pmd_t pmd, *pmd_p;
+ pte_t pte, *pte_p;
+ int n, node;
+ void *stack_page;
+ pteval_t ptemask;
+
+ /* FRED systems always restore the full value of %rsp */
+ if (cpu_feature_enabled(X86_FEATURE_FRED))
+ return;
+
+ /* We only have to do this once... */
+ if (likely(per_cpu(espfix_stack, cpu)))
+ return; /* Already initialized */
+
+ addr = espfix_base_addr(cpu);
+ page = cpu/ESPFIX_STACKS_PER_PAGE;
+
+ /* Did another CPU already set this up? */
+ stack_page = READ_ONCE(espfix_pages[page]);
+ if (likely(stack_page))
+ goto done;
+
+ mutex_lock(&espfix_init_mutex);
+
+ /* Did we race on the lock? */
+ stack_page = READ_ONCE(espfix_pages[page]);
+ if (stack_page)
+ goto unlock_done;
+
+ node = cpu_to_node(cpu);
+ ptemask = __supported_pte_mask;
+
+ pud_p = &espfix_pud_page[pud_index(addr)];
+ pud = *pud_p;
+ if (!pud_present(pud)) {
+ struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0);
+
+ pmd_p = (pmd_t *)page_address(page);
+ pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask));
+ paravirt_alloc_pmd(&init_mm, __pa(pmd_p) >> PAGE_SHIFT);
+ for (n = 0; n < ESPFIX_PUD_CLONES; n++)
+ set_pud(&pud_p[n], pud);
+ }
+
+ pmd_p = pmd_offset(&pud, addr);
+ pmd = *pmd_p;
+ if (!pmd_present(pmd)) {
+ struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0);
+
+ pte_p = (pte_t *)page_address(page);
+ pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask));
+ paravirt_alloc_pte(&init_mm, __pa(pte_p) >> PAGE_SHIFT);
+ for (n = 0; n < ESPFIX_PMD_CLONES; n++)
+ set_pmd(&pmd_p[n], pmd);
+ }
+
+ pte_p = pte_offset_kernel(&pmd, addr);
+ stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0));
+ /*
+ * __PAGE_KERNEL_* includes _PAGE_GLOBAL, which we want since
+ * this is mapped to userspace.
+ */
+ pte = __pte(__pa(stack_page) | ((__PAGE_KERNEL_RO | _PAGE_ENC) & ptemask));
+ for (n = 0; n < ESPFIX_PTE_CLONES; n++)
+ set_pte(&pte_p[n*PTE_STRIDE], pte);
+
+ /* Job is done for this CPU and any CPU which shares this page */
+ WRITE_ONCE(espfix_pages[page], stack_page);
+
+unlock_done:
+ mutex_unlock(&espfix_init_mutex);
+done:
+ per_cpu(espfix_stack, cpu) = addr;
+ per_cpu(espfix_waddr, cpu) = (unsigned long)stack_page
+ + (addr & ~PAGE_MASK);
+}
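
The address math in espfix_base_addr() above keeps the low 16 bits of the compact per-CPU slot offset and shifts everything else up by 16 bits, leaving a 16-bit hole that the page tables fill with the 2^16 aliases mapped 64K apart. A small user-space sketch of that computation follows (illustrative only; randomization is disabled here and only the offset from ESPFIX_BASE_ADDR is printed, so no base address is assumed):

/*
 * User-space sketch of the espfix_base_addr() math above (illustration
 * only): the low 16 bits of the compact offset stay in place and the rest
 * is shifted up by 16, leaving room for the 64K-spaced aliases.
 */
#include <stdio.h>

#define PAGE_SHIFT		12
#define PAGE_SIZE		(1UL << PAGE_SHIFT)
#define ESPFIX_STACK_SIZE	(8 * 8UL)
#define ESPFIX_STACKS_PER_PAGE	(PAGE_SIZE / ESPFIX_STACK_SIZE)

static unsigned long espfix_offset_demo(unsigned int cpu)
{
	unsigned long page = cpu / ESPFIX_STACKS_PER_PAGE;	/* randomization disabled */
	unsigned long slot = cpu % ESPFIX_STACKS_PER_PAGE;
	unsigned long addr = (page << PAGE_SHIFT) + slot * ESPFIX_STACK_SIZE;

	/* Keep bits 0..15, move the rest up by 16 to make room for aliases. */
	addr = (addr & 0xffffUL) | ((addr & ~0xffffUL) << 16);
	return addr;	/* offset relative to ESPFIX_BASE_ADDR */
}

int main(void)
{
	unsigned int cpus[] = { 0, 1, 63, 64, 1024, 4096 };

	for (unsigned int i = 0; i < sizeof(cpus) / sizeof(cpus[0]); i++)
		printf("cpu %4u -> offset %#018lx\n", cpus[i], espfix_offset_demo(cpus[i]));
	return 0;
}
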
diff --git a/arch/x86/kernel/fpu/Makefile b/arch/x86/kernel/fpu/Makefile
new file mode 100644
index 000000000000..78c5621457d4
--- /dev/null
+++ b/arch/x86/kernel/fpu/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Build rules for the FPU support code:
+#
+
+obj-y += init.o bugs.o core.o regset.o signal.o xstate.o
diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c
new file mode 100644
index 000000000000..edbafc5940e3
--- /dev/null
+++ b/arch/x86/kernel/fpu/bugs.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * x86 FPU bug checks:
+ */
+#include <linux/printk.h>
+
+#include <asm/cpufeature.h>
+#include <asm/fpu/api.h>
+
+/*
+ * Boot time CPU/FPU FDIV bug detection code:
+ */
+
+static double __initdata x = 4195835.0;
+static double __initdata y = 3145727.0;
+
+/*
+ * This used to check for exceptions.
+ * However, it turns out that to support that,
+ * the XMM trap handlers basically had to
+ * be buggy. So let's have a correct XMM trap
+ * handler, and forget about printing out
+ * some status at boot.
+ *
+ * We should really only care about bugs here
+ * anyway. Not features.
+ */
+void __init fpu__init_check_bugs(void)
+{
+ s32 fdiv_bug;
+
+ /* kernel_fpu_begin/end() relies on patched alternative instructions. */
+ if (!boot_cpu_has(X86_FEATURE_FPU))
+ return;
+
+ kernel_fpu_begin();
+
+ /*
+ * trap_init() enabled FXSR and company _before_ testing for FP
+ * problems here.
+ *
+ * Test for the divl bug: http://en.wikipedia.org/wiki/Fdiv_bug
+ */
+ __asm__("fninit\n\t"
+ "fldl %1\n\t"
+ "fdivl %2\n\t"
+ "fmull %2\n\t"
+ "fldl %1\n\t"
+ "fsubp %%st,%%st(1)\n\t"
+ "fistpl %0\n\t"
+ "fwait\n\t"
+ "fninit"
+ : "=m" (*&fdiv_bug)
+ : "m" (*&x), "m" (*&y));
+
+ kernel_fpu_end();
+
+ if (fdiv_bug) {
+ set_cpu_bug(&boot_cpu_data, X86_BUG_FDIV);
+ pr_warn("Hmm, FPU with FDIV bug\n");
+ }
+}
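
fpu__init_check_bugs() above computes x - (x/y)*y for x = 4195835 and y = 3145727 entirely in x87 code; a correct FPU rounds the residue to 0, while the flawed original Pentium FDIV yielded 256. A plain-C restatement of the same arithmetic (an editorial sketch, not the kernel's method, and it naturally cannot reproduce the hardware bug on a correct FPU):

/* Plain-C restatement of the FDIV check above (illustration only). */
#include <math.h>
#include <stdio.h>

int main(void)
{
	volatile double x = 4195835.0;
	volatile double y = 3145727.0;
	long residue = lrint(x - (x / y) * y);	/* the asm rounds via fistpl */

	/* 0 on a correct FPU; the flawed Pentium FDIV produced 256 here. */
	printf("FDIV check residue: %ld\n", residue);
	return 0;
}
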
diff --git a/arch/x86/kernel/fpu/context.h b/arch/x86/kernel/fpu/context.h
new file mode 100644
index 000000000000..10d0a720659c
--- /dev/null
+++ b/arch/x86/kernel/fpu/context.h
@@ -0,0 +1,82 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __X86_KERNEL_FPU_CONTEXT_H
+#define __X86_KERNEL_FPU_CONTEXT_H
+
+#include <asm/fpu/xstate.h>
+#include <asm/trace/fpu.h>
+
+/* Functions related to FPU context tracking */
+
+/*
+ * The in-register FPU state for an FPU context on a CPU is assumed to be
+ * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx
+ * matches the FPU.
+ *
+ * If the FPU register state is valid, the kernel can skip restoring the
+ * FPU state from memory.
+ *
+ * Any code that clobbers the FPU registers or updates the in-memory
+ * FPU state for a task MUST let the rest of the kernel know that the
+ * FPU registers are no longer valid for this task.
+ *
+ * Invalidate a resource you control: CPU if using the CPU for something else
+ * (with preemption disabled), FPU for the current task, or a task that
+ * is prevented from running by the current task.
+ */
+static inline void __cpu_invalidate_fpregs_state(void)
+{
+ __this_cpu_write(fpu_fpregs_owner_ctx, NULL);
+}
+
+static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu)
+{
+ fpu->last_cpu = -1;
+}
+
+static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu)
+{
+ return fpu == this_cpu_read(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu;
+}
+
+static inline void fpregs_deactivate(struct fpu *fpu)
+{
+ __this_cpu_write(fpu_fpregs_owner_ctx, NULL);
+ trace_x86_fpu_regs_deactivated(fpu);
+}
+
+static inline void fpregs_activate(struct fpu *fpu)
+{
+ __this_cpu_write(fpu_fpregs_owner_ctx, fpu);
+ trace_x86_fpu_regs_activated(fpu);
+}
+
+/* Internal helper for switch_fpu_return() and signal frame setup */
+static inline void fpregs_restore_userregs(void)
+{
+ struct fpu *fpu = x86_task_fpu(current);
+ int cpu = smp_processor_id();
+
+ if (WARN_ON_ONCE(current->flags & (PF_KTHREAD | PF_USER_WORKER)))
+ return;
+
+ if (!fpregs_state_valid(fpu, cpu)) {
+ /*
+ * This restores _all_ xstate which has not been
+ * established yet.
+ *
+ * If PKRU is enabled, then the PKRU value is already
+ * correct because it was either set in switch_to() or in
+ * flush_thread(). So it is excluded because it might be
+ * not up to date in current->thread.fpu->xsave state.
+ *
+ * XFD state is handled in restore_fpregs_from_fpstate().
+ */
+ restore_fpregs_from_fpstate(fpu->fpstate, XFEATURE_MASK_FPSTATE);
+
+ fpregs_activate(fpu);
+ fpu->last_cpu = cpu;
+ }
+ clear_thread_flag(TIF_NEED_FPU_LOAD);
+}
+
+#endif
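
The validity rule documented in context.h above has two independent parts: the per-CPU fpu_fpregs_owner_ctx pointer and fpu->last_cpu must both still match. The following stand-alone model (an editorial sketch with simplified names, not kernel code) walks a context through the two invalidation paths described in the comment:

/*
 * Simplified model of the validity check above: registers are trusted only
 * when the per-CPU owner pointer and last_cpu both still match.
 */
#include <stdio.h>

struct fpu_model { int last_cpu; };

static struct fpu_model *owner_ctx[2];	/* stands in for fpu_fpregs_owner_ctx */

static int fpregs_valid(struct fpu_model *fpu, int cpu)
{
	return owner_ctx[cpu] == fpu && fpu->last_cpu == cpu;
}

int main(void)
{
	struct fpu_model task = { .last_cpu = 0 };

	owner_ctx[0] = &task;			/* task's state loaded on CPU 0 */
	printf("on cpu0: %d\n", fpregs_valid(&task, 0));	/* 1: restore can be skipped */

	owner_ctx[0] = NULL;			/* CPU 0 used for something else */
	printf("after cpu invalidation: %d\n", fpregs_valid(&task, 0));	/* 0 */

	owner_ctx[1] = &task;			/* migrated, but last_cpu still 0 */
	printf("on cpu1: %d\n", fpregs_valid(&task, 1));	/* 0: must restore */
	return 0;
}
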
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
new file mode 100644
index 000000000000..da233f20ae6f
--- /dev/null
+++ b/arch/x86/kernel/fpu/core.c
@@ -0,0 +1,989 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 1994 Linus Torvalds
+ *
+ * Pentium III FXSR, SSE support
+ * General FPU state handling cleanups
+ * Gareth Hughes <gareth@valinux.com>, May 2000
+ */
+#include <asm/fpu/api.h>
+#include <asm/fpu/regset.h>
+#include <asm/fpu/sched.h>
+#include <asm/fpu/signal.h>
+#include <asm/fpu/types.h>
+#include <asm/msr.h>
+#include <asm/traps.h>
+#include <asm/irq_regs.h>
+
+#include <uapi/asm/kvm.h>
+
+#include <linux/hardirq.h>
+#include <linux/kvm_types.h>
+#include <linux/pkeys.h>
+#include <linux/vmalloc.h>
+
+#include "context.h"
+#include "internal.h"
+#include "legacy.h"
+#include "xstate.h"
+
+#define CREATE_TRACE_POINTS
+#include <asm/trace/fpu.h>
+
+#ifdef CONFIG_X86_64
+DEFINE_STATIC_KEY_FALSE(__fpu_state_size_dynamic);
+DEFINE_PER_CPU(u64, xfd_state);
+#endif
+
+/* The FPU state configuration data for kernel and user space */
+struct fpu_state_config fpu_kernel_cfg __ro_after_init;
+struct fpu_state_config fpu_user_cfg __ro_after_init;
+struct vcpu_fpu_config guest_default_cfg __ro_after_init;
+
+/*
+ * Represents the initial FPU state. It's mostly (but not completely) zeroes,
+ * depending on the FPU hardware format:
+ */
+struct fpstate init_fpstate __ro_after_init;
+
+/*
+ * Track FPU initialization and kernel-mode usage. 'true' means the FPU is
+ * initialized and is not currently being used by the kernel:
+ */
+DEFINE_PER_CPU(bool, kernel_fpu_allowed);
+
+/*
+ * Track which context is using the FPU on the CPU:
+ */
+DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);
+
+#ifdef CONFIG_X86_DEBUG_FPU
+struct fpu *x86_task_fpu(struct task_struct *task)
+{
+ if (WARN_ON_ONCE(task->flags & PF_KTHREAD))
+ return NULL;
+
+ return (void *)task + sizeof(*task);
+}
+#endif
+
+/*
+ * Can we use the FPU in kernel mode with the
+ * whole "kernel_fpu_begin/end()" sequence?
+ */
+bool irq_fpu_usable(void)
+{
+ if (WARN_ON_ONCE(in_nmi()))
+ return false;
+
+ /*
+ * Return false in the following cases:
+ *
+ * - FPU is not yet initialized. This can happen only when the call is
+ * coming from CPU onlining, for example for microcode checksumming.
+ * - The kernel is already using the FPU, either because of explicit
+ * nesting (which should never be done), or because of implicit
+ * nesting when a hardirq interrupted a kernel-mode FPU section.
+ *
+ * The single boolean check below handles both cases:
+ */
+ if (!this_cpu_read(kernel_fpu_allowed))
+ return false;
+
+ /*
+ * When not in NMI or hard interrupt context, FPU can be used in:
+ *
+ * - Task context except from within fpregs_lock()'ed critical
+ * regions.
+ *
+ * - Soft interrupt processing context which cannot happen
+ * while in a fpregs_lock()'ed critical region.
+ */
+ if (!in_hardirq())
+ return true;
+
+ /*
+ * In hard interrupt context it's safe when soft interrupts
+ * are enabled, which means the interrupt did not hit in
+ * a fpregs_lock()'ed critical region.
+ */
+ return !softirq_count();
+}
+EXPORT_SYMBOL(irq_fpu_usable);
+
+/*
+ * Track AVX512 state use because it is known to slow the max clock
+ * speed of the core.
+ */
+static void update_avx_timestamp(struct fpu *fpu)
+{
+
+#define AVX512_TRACKING_MASK (XFEATURE_MASK_ZMM_Hi256 | XFEATURE_MASK_Hi16_ZMM)
+
+ if (fpu->fpstate->regs.xsave.header.xfeatures & AVX512_TRACKING_MASK)
+ fpu->avx512_timestamp = jiffies;
+}
+
+/*
+ * Save the FPU register state in fpu->fpstate->regs. The register state is
+ * preserved.
+ *
+ * Must be called with fpregs_lock() held.
+ *
+ * The legacy FNSAVE instruction clears all FPU state unconditionally, so
+ * register state has to be reloaded. That might be a pointless exercise
+ * when the FPU is going to be used by another task right after that. But
+ * this only affects 20+ year old 32-bit systems and avoids conditionals all
+ * over the place.
+ *
+ * FXSAVE and all XSAVE variants preserve the FPU register state.
+ */
+void save_fpregs_to_fpstate(struct fpu *fpu)
+{
+ if (likely(use_xsave())) {
+ os_xsave(fpu->fpstate);
+ update_avx_timestamp(fpu);
+ return;
+ }
+
+ if (likely(use_fxsr())) {
+ fxsave(&fpu->fpstate->regs.fxsave);
+ return;
+ }
+
+ /*
+ * Legacy FPU register saving, FNSAVE always clears FPU registers,
+ * so we have to reload them from the memory state.
+ */
+ asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->fpstate->regs.fsave));
+ frstor(&fpu->fpstate->regs.fsave);
+}
+
+void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask)
+{
+ /*
+ * AMD K7/K8 and later CPUs up to Zen don't save/restore
+ * FDP/FIP/FOP unless an exception is pending. Clear the x87 state
+ * here by setting it to fixed values. "m" is a random variable
+ * that should be in L1.
+ */
+ if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) {
+ asm volatile(
+ "fnclex\n\t"
+ "emms\n\t"
+ "fildl %[addr]" /* set F?P to defined value */
+ : : [addr] "m" (*fpstate));
+ }
+
+ if (use_xsave()) {
+ /*
+ * Dynamically enabled features are enabled in XCR0, but
+	 * usage also requires that the corresponding bits in XFD are
+	 * cleared. If the bits are set, then using a related instruction
+	 * raises #NM. This allows the larger FPU buffer to be allocated
+	 * lazily from the #NM handler, or the task to be killed if it has
+	 * no permission; had the feature been disabled in XCR0 instead,
+	 * the access would raise #UD.
+ *
+	 * XFD state follows the same lifetime rules as XSTATE. To restore
+	 * state correctly, XFD has to be updated before XRSTORS; otherwise
+	 * the component would stay in or go into init state even if the
+	 * bits are set in fpstate::regs::xsave::xfeatures.
+ */
+ xfd_update_state(fpstate);
+
+ /*
+ * Restoring state always needs to modify all features
+ * which are in @mask even if the current task cannot use
+ * extended features.
+ *
+ * So fpstate->xfeatures cannot be used here, because then
+ * a feature for which the task has no permission but was
+ * used by the previous task would not go into init state.
+ */
+ mask = fpu_kernel_cfg.max_features & mask;
+
+ os_xrstor(fpstate, mask);
+ } else {
+ if (use_fxsr())
+ fxrstor(&fpstate->regs.fxsave);
+ else
+ frstor(&fpstate->regs.fsave);
+ }
+}
+
+void fpu_reset_from_exception_fixup(void)
+{
+ restore_fpregs_from_fpstate(&init_fpstate, XFEATURE_MASK_FPSTATE);
+}
+
+#if IS_ENABLED(CONFIG_KVM)
+static void __fpstate_reset(struct fpstate *fpstate);
+
+static void fpu_lock_guest_permissions(void)
+{
+ struct fpu_state_perm *fpuperm;
+ u64 perm;
+
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return;
+
+ spin_lock_irq(&current->sighand->siglock);
+ fpuperm = &x86_task_fpu(current->group_leader)->guest_perm;
+ perm = fpuperm->__state_perm;
+
+ /* First fpstate allocation locks down permissions. */
+ WRITE_ONCE(fpuperm->__state_perm, perm | FPU_GUEST_PERM_LOCKED);
+
+ spin_unlock_irq(&current->sighand->siglock);
+}
+
+bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu)
+{
+ struct fpstate *fpstate;
+ unsigned int size;
+
+ size = guest_default_cfg.size + ALIGN(offsetof(struct fpstate, regs), 64);
+
+ fpstate = vzalloc(size);
+ if (!fpstate)
+ return false;
+
+ /* Initialize indicators to reflect properties of the fpstate */
+ fpstate->is_valloc = true;
+ fpstate->is_guest = true;
+
+ __fpstate_reset(fpstate);
+ fpstate_init_user(fpstate);
+
+ gfpu->fpstate = fpstate;
+ gfpu->xfeatures = guest_default_cfg.features;
+
+ /*
+ * KVM sets the FP+SSE bits in the XSAVE header when copying FPU state
+ * to userspace, even when XSAVE is unsupported, so that restoring FPU
+ * state on a different CPU that does support XSAVE can cleanly load
+ * the incoming state using its natural XSAVE. In other words, KVM's
+ * uABI size may be larger than this host's default size. Conversely,
+ * the default size should never be larger than KVM's base uABI size;
+ * all features that can expand the uABI size must be opt-in.
+ */
+ gfpu->uabi_size = sizeof(struct kvm_xsave);
+ if (WARN_ON_ONCE(fpu_user_cfg.default_size > gfpu->uabi_size))
+ gfpu->uabi_size = fpu_user_cfg.default_size;
+
+ fpu_lock_guest_permissions();
+
+ return true;
+}
+EXPORT_SYMBOL_FOR_KVM(fpu_alloc_guest_fpstate);
+
+void fpu_free_guest_fpstate(struct fpu_guest *gfpu)
+{
+ struct fpstate *fpstate = gfpu->fpstate;
+
+ if (!fpstate)
+ return;
+
+ if (WARN_ON_ONCE(!fpstate->is_valloc || !fpstate->is_guest || fpstate->in_use))
+ return;
+
+ gfpu->fpstate = NULL;
+ vfree(fpstate);
+}
+EXPORT_SYMBOL_FOR_KVM(fpu_free_guest_fpstate);
+
+/*
+ * fpu_enable_guest_xfd_features - Check xfeatures against guest perm and enable
+ * @guest_fpu: Pointer to the guest FPU container
+ * @xfeatures: Features requested by guest CPUID
+ *
+ * Enable all dynamic xfeatures according to guest perm and requested CPUID.
+ *
+ * Return: 0 on success, error code otherwise
+ */
+int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures)
+{
+ lockdep_assert_preemption_enabled();
+
+ /* Nothing to do if all requested features are already enabled. */
+ xfeatures &= ~guest_fpu->xfeatures;
+ if (!xfeatures)
+ return 0;
+
+ return __xfd_enable_feature(xfeatures, guest_fpu);
+}
+EXPORT_SYMBOL_FOR_KVM(fpu_enable_guest_xfd_features);
+
+#ifdef CONFIG_X86_64
+void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd)
+{
+ fpregs_lock();
+ guest_fpu->fpstate->xfd = xfd;
+ if (guest_fpu->fpstate->in_use)
+ xfd_update_state(guest_fpu->fpstate);
+ fpregs_unlock();
+}
+EXPORT_SYMBOL_FOR_KVM(fpu_update_guest_xfd);
+
+/**
+ * fpu_sync_guest_vmexit_xfd_state - Synchronize XFD MSR and software state
+ *
+ * Must be invoked from KVM after a VMEXIT before enabling interrupts when
+ * XFD write emulation is disabled. This is required because the guest can
+ * freely modify XFD and the state at VMEXIT is not guaranteed to be the
+ * same as the state on VMENTER. So software state has to be updated before
+ * any operation which depends on it can take place.
+ *
+ * Note: It can be invoked unconditionally even when write emulation is
+ * enabled, at the price of a pointless MSR read in that case.
+ */
+void fpu_sync_guest_vmexit_xfd_state(void)
+{
+ struct fpstate *fpstate = x86_task_fpu(current)->fpstate;
+
+ lockdep_assert_irqs_disabled();
+ if (fpu_state_size_dynamic()) {
+ rdmsrq(MSR_IA32_XFD, fpstate->xfd);
+ __this_cpu_write(xfd_state, fpstate->xfd);
+ }
+}
+EXPORT_SYMBOL_FOR_KVM(fpu_sync_guest_vmexit_xfd_state);
+#endif /* CONFIG_X86_64 */
+
+int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest)
+{
+ struct fpstate *guest_fps = guest_fpu->fpstate;
+ struct fpu *fpu = x86_task_fpu(current);
+ struct fpstate *cur_fps = fpu->fpstate;
+
+ fpregs_lock();
+ if (!cur_fps->is_confidential && !test_thread_flag(TIF_NEED_FPU_LOAD))
+ save_fpregs_to_fpstate(fpu);
+
+ /* Swap fpstate */
+ if (enter_guest) {
+ fpu->__task_fpstate = cur_fps;
+ fpu->fpstate = guest_fps;
+ guest_fps->in_use = true;
+ } else {
+ guest_fps->in_use = false;
+ fpu->fpstate = fpu->__task_fpstate;
+ fpu->__task_fpstate = NULL;
+ }
+
+ cur_fps = fpu->fpstate;
+
+ if (!cur_fps->is_confidential) {
+ /* Includes XFD update */
+ restore_fpregs_from_fpstate(cur_fps, XFEATURE_MASK_FPSTATE);
+ } else {
+ /*
+ * XSTATE is restored by firmware from encrypted
+ * memory. Make sure XFD state is correct while
+ * running with guest fpstate
+ */
+ xfd_update_state(cur_fps);
+ }
+
+ fpregs_mark_activate();
+ fpregs_unlock();
+ return 0;
+}
+EXPORT_SYMBOL_FOR_KVM(fpu_swap_kvm_fpstate);
+
+void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf,
+ unsigned int size, u64 xfeatures, u32 pkru)
+{
+ struct fpstate *kstate = gfpu->fpstate;
+ union fpregs_state *ustate = buf;
+ struct membuf mb = { .p = buf, .left = size };
+
+ if (cpu_feature_enabled(X86_FEATURE_XSAVE)) {
+ __copy_xstate_to_uabi_buf(mb, kstate, xfeatures, pkru,
+ XSTATE_COPY_XSAVE);
+ } else {
+ memcpy(&ustate->fxsave, &kstate->regs.fxsave,
+ sizeof(ustate->fxsave));
+ /* Make it restorable on a XSAVE enabled host */
+ ustate->xsave.header.xfeatures = XFEATURE_MASK_FPSSE;
+ }
+}
+EXPORT_SYMBOL_FOR_KVM(fpu_copy_guest_fpstate_to_uabi);
+
+int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf,
+ u64 xcr0, u32 *vpkru)
+{
+ struct fpstate *kstate = gfpu->fpstate;
+ const union fpregs_state *ustate = buf;
+
+ if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) {
+ if (ustate->xsave.header.xfeatures & ~XFEATURE_MASK_FPSSE)
+ return -EINVAL;
+ if (ustate->fxsave.mxcsr & ~mxcsr_feature_mask)
+ return -EINVAL;
+ memcpy(&kstate->regs.fxsave, &ustate->fxsave, sizeof(ustate->fxsave));
+ return 0;
+ }
+
+ if (ustate->xsave.header.xfeatures & ~xcr0)
+ return -EINVAL;
+
+ /*
+ * Nullify @vpkru to preserve its current value if PKRU's bit isn't set
+ * in the header. KVM's odd ABI is to leave PKRU untouched in this
+ * case (all other components are eventually re-initialized).
+ */
+ if (!(ustate->xsave.header.xfeatures & XFEATURE_MASK_PKRU))
+ vpkru = NULL;
+
+ return copy_uabi_from_kernel_to_xstate(kstate, ustate, vpkru);
+}
+EXPORT_SYMBOL_FOR_KVM(fpu_copy_uabi_to_guest_fpstate);
+#endif /* CONFIG_KVM */
+
+void kernel_fpu_begin_mask(unsigned int kfpu_mask)
+{
+ if (!irqs_disabled())
+ fpregs_lock();
+
+ WARN_ON_FPU(!irq_fpu_usable());
+
+ /* Toggle kernel_fpu_allowed to false: */
+ WARN_ON_FPU(!this_cpu_read(kernel_fpu_allowed));
+ this_cpu_write(kernel_fpu_allowed, false);
+
+ if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER)) &&
+ !test_thread_flag(TIF_NEED_FPU_LOAD)) {
+ set_thread_flag(TIF_NEED_FPU_LOAD);
+ save_fpregs_to_fpstate(x86_task_fpu(current));
+ }
+ __cpu_invalidate_fpregs_state();
+
+ /* Put sane initial values into the control registers. */
+ if (likely(kfpu_mask & KFPU_MXCSR) && boot_cpu_has(X86_FEATURE_XMM))
+ ldmxcsr(MXCSR_DEFAULT);
+
+ if (unlikely(kfpu_mask & KFPU_387) && boot_cpu_has(X86_FEATURE_FPU))
+ asm volatile ("fninit");
+}
+EXPORT_SYMBOL_GPL(kernel_fpu_begin_mask);
+
+void kernel_fpu_end(void)
+{
+ /* Toggle kernel_fpu_allowed back to true: */
+ WARN_ON_FPU(this_cpu_read(kernel_fpu_allowed));
+ this_cpu_write(kernel_fpu_allowed, true);
+
+ if (!irqs_disabled())
+ fpregs_unlock();
+}
+EXPORT_SYMBOL_GPL(kernel_fpu_end);
+
+/*
+ * Sync the FPU register state to current's in-memory FPU state when the
+ * current task owns the FPU. The hardware register state is preserved.
+ */
+void fpu_sync_fpstate(struct fpu *fpu)
+{
+ WARN_ON_FPU(fpu != x86_task_fpu(current));
+
+ fpregs_lock();
+ trace_x86_fpu_before_save(fpu);
+
+ if (!test_thread_flag(TIF_NEED_FPU_LOAD))
+ save_fpregs_to_fpstate(fpu);
+
+ trace_x86_fpu_after_save(fpu);
+ fpregs_unlock();
+}
+
+static inline unsigned int init_fpstate_copy_size(void)
+{
+ if (!use_xsave())
+ return fpu_kernel_cfg.default_size;
+
+ /* XSAVE(S) just needs the legacy and the xstate header part */
+ return sizeof(init_fpstate.regs.xsave);
+}
+
+static inline void fpstate_init_fxstate(struct fpstate *fpstate)
+{
+ fpstate->regs.fxsave.cwd = 0x37f;
+ fpstate->regs.fxsave.mxcsr = MXCSR_DEFAULT;
+}
+
+/*
+ * Legacy x87 fpstate state init:
+ */
+static inline void fpstate_init_fstate(struct fpstate *fpstate)
+{
+ fpstate->regs.fsave.cwd = 0xffff037fu;
+ fpstate->regs.fsave.swd = 0xffff0000u;
+ fpstate->regs.fsave.twd = 0xffffffffu;
+ fpstate->regs.fsave.fos = 0xffff0000u;
+}
+
+/*
+ * Used in two places:
+ * 1) Early boot to setup init_fpstate for non XSAVE systems
+ * 2) fpu_alloc_guest_fpstate() which is invoked from KVM
+ */
+void fpstate_init_user(struct fpstate *fpstate)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_FPU)) {
+ fpstate_init_soft(&fpstate->regs.soft);
+ return;
+ }
+
+ xstate_init_xcomp_bv(&fpstate->regs.xsave, fpstate->xfeatures);
+
+ if (cpu_feature_enabled(X86_FEATURE_FXSR))
+ fpstate_init_fxstate(fpstate);
+ else
+ fpstate_init_fstate(fpstate);
+}
+
+static void __fpstate_reset(struct fpstate *fpstate)
+{
+ /*
+ * Supervisor features (and thus sizes) may diverge between guest
+ * FPUs and host FPUs, as some supervisor features are supported
+ * for guests despite not being utilized by the host. User
+ * features and sizes are always identical, which allows for
+ * common guest and userspace ABI.
+ *
+ * For the host, set XFD to the kernel's desired initialization
+ * value. For guests, set XFD to its architectural RESET value.
+ */
+ if (fpstate->is_guest) {
+ fpstate->size = guest_default_cfg.size;
+ fpstate->xfeatures = guest_default_cfg.features;
+ fpstate->xfd = 0;
+ } else {
+ fpstate->size = fpu_kernel_cfg.default_size;
+ fpstate->xfeatures = fpu_kernel_cfg.default_features;
+ fpstate->xfd = init_fpstate.xfd;
+ }
+
+ fpstate->user_size = fpu_user_cfg.default_size;
+ fpstate->user_xfeatures = fpu_user_cfg.default_features;
+}
+
+void fpstate_reset(struct fpu *fpu)
+{
+ /* Set the fpstate pointer to the default fpstate */
+ fpu->fpstate = &fpu->__fpstate;
+ __fpstate_reset(fpu->fpstate);
+
+ /* Initialize the permission related info in fpu */
+ fpu->perm.__state_perm = fpu_kernel_cfg.default_features;
+ fpu->perm.__state_size = fpu_kernel_cfg.default_size;
+ fpu->perm.__user_state_size = fpu_user_cfg.default_size;
+
+ fpu->guest_perm.__state_perm = guest_default_cfg.features;
+ fpu->guest_perm.__state_size = guest_default_cfg.size;
+ /*
+ * User features and sizes are always identical between host and
+ * guest FPUs, which allows for common guest and userspace ABI.
+ */
+ fpu->guest_perm.__user_state_size = fpu_user_cfg.default_size;
+}
+
+static inline void fpu_inherit_perms(struct fpu *dst_fpu)
+{
+ if (fpu_state_size_dynamic()) {
+ struct fpu *src_fpu = x86_task_fpu(current->group_leader);
+
+ spin_lock_irq(&current->sighand->siglock);
+ /* Fork also inherits the permissions of the parent */
+ dst_fpu->perm = src_fpu->perm;
+ dst_fpu->guest_perm = src_fpu->guest_perm;
+ spin_unlock_irq(&current->sighand->siglock);
+ }
+}
+
+/* A passed ssp of zero will not cause any update */
+static int update_fpu_shstk(struct task_struct *dst, unsigned long ssp)
+{
+#ifdef CONFIG_X86_USER_SHADOW_STACK
+ struct cet_user_state *xstate;
+
+	/* No ssp update needed. */
+ if (!ssp)
+ return 0;
+
+ xstate = get_xsave_addr(&x86_task_fpu(dst)->fpstate->regs.xsave,
+ XFEATURE_CET_USER);
+
+ /*
+ * If there is a non-zero ssp, then 'dst' must be configured with a shadow
+ * stack and the fpu state should be up to date since it was just copied
+ * from the parent in fpu_clone(). So there must be a valid non-init CET
+ * state location in the buffer.
+ */
+ if (WARN_ON_ONCE(!xstate))
+ return 1;
+
+ xstate->user_ssp = (u64)ssp;
+#endif
+ return 0;
+}
+
+/* Clone current's FPU state on fork */
+int fpu_clone(struct task_struct *dst, u64 clone_flags, bool minimal,
+ unsigned long ssp)
+{
+ /*
+ * We allocate the new FPU structure right after the end of the task struct.
+	 * The task allocation size already takes this into account.
+ *
+ * This is safe because task_struct size is a multiple of cacheline size,
+ * thus x86_task_fpu() will always be cacheline aligned as well.
+ */
+ struct fpu *dst_fpu = (void *)dst + sizeof(*dst);
+
+ BUILD_BUG_ON(sizeof(*dst) % SMP_CACHE_BYTES != 0);
+
+ /* The new task's FPU state cannot be valid in the hardware. */
+ dst_fpu->last_cpu = -1;
+
+ fpstate_reset(dst_fpu);
+
+ if (!cpu_feature_enabled(X86_FEATURE_FPU))
+ return 0;
+
+ /*
+ * Enforce reload for user space tasks and prevent kernel threads
+ * from trying to save the FPU registers on context switch.
+ */
+ set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD);
+
+ /*
+ * No FPU state inheritance for kernel threads and IO
+ * worker threads.
+ */
+ if (minimal) {
+ /* Clear out the minimal state */
+ memcpy(&dst_fpu->fpstate->regs, &init_fpstate.regs,
+ init_fpstate_copy_size());
+ return 0;
+ }
+
+ /*
+ * If a new feature is added, ensure all dynamic features are
+ * caller-saved from here!
+ */
+ BUILD_BUG_ON(XFEATURE_MASK_USER_DYNAMIC != XFEATURE_MASK_XTILE_DATA);
+
+ /*
+ * Save the default portion of the current FPU state into the
+ * clone. Assume all dynamic features to be defined as caller-
+ * saved, which enables skipping both the expansion of fpstate
+ * and the copying of any dynamic state.
+ *
+ * Do not use memcpy() when TIF_NEED_FPU_LOAD is set because
+ * copying is not valid when current uses non-default states.
+ */
+ fpregs_lock();
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+ fpregs_restore_userregs();
+ save_fpregs_to_fpstate(dst_fpu);
+ fpregs_unlock();
+ if (!(clone_flags & CLONE_THREAD))
+ fpu_inherit_perms(dst_fpu);
+
+ /*
+ * Children never inherit PASID state.
+ * Force it to have its init value:
+ */
+ if (use_xsave())
+ dst_fpu->fpstate->regs.xsave.header.xfeatures &= ~XFEATURE_MASK_PASID;
+
+ /*
+ * Update shadow stack pointer, in case it changed during clone.
+ */
+ if (update_fpu_shstk(dst, ssp))
+ return 1;
+
+ trace_x86_fpu_copy_dst(dst_fpu);
+
+ return 0;
+}
+
+/*
+ * While struct fpu is no longer part of struct thread_struct, it is still
+ * allocated after struct task_struct in the "task_struct" kmem cache. But
+ * since FPU is expected to be part of struct thread_struct, we have to
+ * adjust for it here.
+ */
+void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size)
+{
+ /* The allocation follows struct task_struct. */
+ *offset = sizeof(struct task_struct) - offsetof(struct task_struct, thread);
+ *offset += offsetof(struct fpu, __fpstate.regs);
+ *size = fpu_kernel_cfg.default_size;
+}
+
+/*
+ * Drops current FPU state: deactivates the fpregs and
+ * the fpstate. NOTE: it still leaves previous contents
+ * in the fpregs in the eager-FPU case.
+ *
+ * This function can be used in cases where we know that
+ * a state-restore is coming: either an explicit one,
+ * or a reschedule.
+ */
+void fpu__drop(struct task_struct *tsk)
+{
+ struct fpu *fpu;
+
+ if (test_tsk_thread_flag(tsk, TIF_NEED_FPU_LOAD))
+ return;
+
+ fpu = x86_task_fpu(tsk);
+
+ preempt_disable();
+
+ if (fpu == x86_task_fpu(current)) {
+ /* Ignore delayed exceptions from user space */
+ asm volatile("1: fwait\n"
+ "2:\n"
+ _ASM_EXTABLE(1b, 2b));
+ fpregs_deactivate(fpu);
+ }
+
+ trace_x86_fpu_dropped(fpu);
+
+ preempt_enable();
+}
+
+/*
+ * Clear FPU registers by setting them up from the init fpstate.
+ * Caller must do fpregs_[un]lock() around it.
+ */
+static inline void restore_fpregs_from_init_fpstate(u64 features_mask)
+{
+ if (use_xsave())
+ os_xrstor(&init_fpstate, features_mask);
+ else if (use_fxsr())
+ fxrstor(&init_fpstate.regs.fxsave);
+ else
+ frstor(&init_fpstate.regs.fsave);
+
+ pkru_write_default();
+}
+
+/*
+ * Reset current->fpu memory state to the init values.
+ */
+static void fpu_reset_fpstate_regs(void)
+{
+ struct fpu *fpu = x86_task_fpu(current);
+
+ fpregs_lock();
+ __fpu_invalidate_fpregs_state(fpu);
+ /*
+ * This does not change the actual hardware registers. It just
+ * resets the memory image and sets TIF_NEED_FPU_LOAD so a
+ * subsequent return to usermode will reload the registers from the
+ * task's memory image.
+ *
+ * Do not use fpstate_init() here. Just copy init_fpstate which has
+ * the correct content already except for PKRU.
+ *
+ * PKRU handling does not rely on the xstate when restoring for
+ * user space as PKRU is eagerly written in switch_to() and
+ * flush_thread().
+ */
+ memcpy(&fpu->fpstate->regs, &init_fpstate.regs, init_fpstate_copy_size());
+ set_thread_flag(TIF_NEED_FPU_LOAD);
+ fpregs_unlock();
+}
+
+/*
+ * Reset current's user FPU states to the init states. current's
+ * supervisor states, if any, are not modified by this function. The
+ * caller guarantees that the XSTATE header in memory is intact.
+ */
+void fpu__clear_user_states(struct fpu *fpu)
+{
+ WARN_ON_FPU(fpu != x86_task_fpu(current));
+
+ fpregs_lock();
+ if (!cpu_feature_enabled(X86_FEATURE_FPU)) {
+ fpu_reset_fpstate_regs();
+ fpregs_unlock();
+ return;
+ }
+
+ /*
+ * Ensure that current's supervisor states are loaded into their
+ * corresponding registers.
+ */
+ if (xfeatures_mask_supervisor() &&
+ !fpregs_state_valid(fpu, smp_processor_id()))
+ os_xrstor_supervisor(fpu->fpstate);
+
+ /* Ensure XFD state is in sync before reloading XSTATE */
+ xfd_update_state(fpu->fpstate);
+
+ /* Reset user states in registers. */
+ restore_fpregs_from_init_fpstate(XFEATURE_MASK_USER_RESTORE);
+
+ /*
+ * Now all FPU registers have their desired values. Inform the FPU
+ * state machine that current's FPU registers are in the hardware
+ * registers. The memory image does not need to be updated because
+ * any operation relying on it has to save the registers first when
+ * current's FPU is marked active.
+ */
+ fpregs_mark_activate();
+ fpregs_unlock();
+}
+
+void fpu_flush_thread(void)
+{
+ fpstate_reset(x86_task_fpu(current));
+ fpu_reset_fpstate_regs();
+}
+/*
+ * Load FPU context before returning to userspace.
+ */
+void switch_fpu_return(void)
+{
+ if (!static_cpu_has(X86_FEATURE_FPU))
+ return;
+
+ fpregs_restore_userregs();
+}
+EXPORT_SYMBOL_FOR_KVM(switch_fpu_return);
+
+void fpregs_lock_and_load(void)
+{
+ /*
+ * fpregs_lock() only disables preemption (mostly). So modifying state
+	 * in an interrupt could screw up an in-progress fpregs operation.
+ * Warn about it.
+ */
+ WARN_ON_ONCE(!irq_fpu_usable());
+ WARN_ON_ONCE(current->flags & PF_KTHREAD);
+
+ fpregs_lock();
+
+ fpregs_assert_state_consistent();
+
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+ fpregs_restore_userregs();
+}
+
+#ifdef CONFIG_X86_DEBUG_FPU
+/*
+ * If current FPU state according to its tracking (loaded FPU context on this
+ * CPU) is not valid then we must have TIF_NEED_FPU_LOAD set so the context is
+ * loaded on return to userland.
+ */
+void fpregs_assert_state_consistent(void)
+{
+ struct fpu *fpu = x86_task_fpu(current);
+
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+ return;
+
+ WARN_ON_FPU(!fpregs_state_valid(fpu, smp_processor_id()));
+}
+EXPORT_SYMBOL_FOR_KVM(fpregs_assert_state_consistent);
+#endif
+
+void fpregs_mark_activate(void)
+{
+ struct fpu *fpu = x86_task_fpu(current);
+
+ fpregs_activate(fpu);
+ fpu->last_cpu = smp_processor_id();
+ clear_thread_flag(TIF_NEED_FPU_LOAD);
+}
+
+/*
+ * x87 math exception handling:
+ */
+
+int fpu__exception_code(struct fpu *fpu, int trap_nr)
+{
+ int err;
+
+ if (trap_nr == X86_TRAP_MF) {
+ unsigned short cwd, swd;
+ /*
+ * (~cwd & swd) will mask out exceptions that are not set to unmasked
+ * status. 0x3f is the exception bits in these regs, 0x200 is the
+ * C1 bit needed in case of a stack fault, 0x040 is the stack
+ * fault bit. We should only be taking one exception at a time,
+ * so if this combination doesn't produce any single exception,
+ * then we have a bad program that isn't synchronizing its FPU usage
+ * and it will suffer the consequences since we won't be able to
+ * fully reproduce the context of the exception.
+ */
+ if (boot_cpu_has(X86_FEATURE_FXSR)) {
+ cwd = fpu->fpstate->regs.fxsave.cwd;
+ swd = fpu->fpstate->regs.fxsave.swd;
+ } else {
+ cwd = (unsigned short)fpu->fpstate->regs.fsave.cwd;
+ swd = (unsigned short)fpu->fpstate->regs.fsave.swd;
+ }
+
+ err = swd & ~cwd;
+ } else {
+ /*
+ * The SIMD FPU exceptions are handled a little differently, as there
+ * is only a single status/control register. Thus, to determine which
+ * unmasked exception was caught we must mask the exception mask bits
+ * at 0x1f80, and then use these to mask the exception bits at 0x3f.
+ */
+ unsigned short mxcsr = MXCSR_DEFAULT;
+
+ if (boot_cpu_has(X86_FEATURE_XMM))
+ mxcsr = fpu->fpstate->regs.fxsave.mxcsr;
+
+ err = ~(mxcsr >> 7) & mxcsr;
+ }
+
+ if (err & 0x001) { /* Invalid op */
+ /*
+ * swd & 0x240 == 0x040: Stack Underflow
+ * swd & 0x240 == 0x240: Stack Overflow
+ * User must clear the SF bit (0x40) if set
+ */
+ return FPE_FLTINV;
+ } else if (err & 0x004) { /* Divide by Zero */
+ return FPE_FLTDIV;
+ } else if (err & 0x008) { /* Overflow */
+ return FPE_FLTOVF;
+ } else if (err & 0x012) { /* Denormal, Underflow */
+ return FPE_FLTUND;
+ } else if (err & 0x020) { /* Precision */
+ return FPE_FLTRES;
+ }
+
+ /*
+ * If we're using IRQ 13, or supposedly even some trap
+ * X86_TRAP_MF implementations, it's possible
+ * we get a spurious trap, which is not an error.
+ */
+ return 0;
+}
+
+/*
+ * Initialize the register state that may prevent the CPU from entering low-power idle.
+ * This function will be invoked from the cpuidle driver only when needed.
+ */
+noinstr void fpu_idle_fpregs(void)
+{
+ /* Note: AMX_TILE being enabled implies XGETBV1 support */
+ if (cpu_feature_enabled(X86_FEATURE_AMX_TILE) &&
+ (xfeatures_in_use() & XFEATURE_MASK_XTILE)) {
+ tile_release();
+ __this_cpu_write(fpu_fpregs_owner_ctx, NULL);
+ }
+}
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
new file mode 100644
index 000000000000..ff988b9ea39f
--- /dev/null
+++ b/arch/x86/kernel/fpu/init.c
@@ -0,0 +1,229 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * x86 FPU boot time init code:
+ */
+#include <asm/fpu/api.h>
+#include <asm/tlbflush.h>
+#include <asm/setup.h>
+
+#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/init.h>
+
+#include "internal.h"
+#include "legacy.h"
+#include "xstate.h"
+
+/*
+ * Initialize the registers found in all CPUs, CR0 and CR4:
+ */
+static void fpu__init_cpu_generic(void)
+{
+ unsigned long cr0;
+ unsigned long cr4_mask = 0;
+
+ if (boot_cpu_has(X86_FEATURE_FXSR))
+ cr4_mask |= X86_CR4_OSFXSR;
+ if (boot_cpu_has(X86_FEATURE_XMM))
+ cr4_mask |= X86_CR4_OSXMMEXCPT;
+ if (cr4_mask)
+ cr4_set_bits(cr4_mask);
+
+ cr0 = read_cr0();
+ cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */
+ if (!boot_cpu_has(X86_FEATURE_FPU))
+ cr0 |= X86_CR0_EM;
+ write_cr0(cr0);
+
+ /* Flush out any pending x87 state: */
+#ifdef CONFIG_MATH_EMULATION
+ if (!boot_cpu_has(X86_FEATURE_FPU))
+ ;
+ else
+#endif
+ asm volatile ("fninit");
+}
+
+/*
+ * Enable all supported FPU features. Called when a CPU is brought online:
+ */
+void fpu__init_cpu(void)
+{
+ fpu__init_cpu_generic();
+ fpu__init_cpu_xstate();
+
+ /* Start allowing kernel-mode FPU: */
+ this_cpu_write(kernel_fpu_allowed, true);
+}
+
+static bool __init fpu__probe_without_cpuid(void)
+{
+ unsigned long cr0;
+ u16 fsw, fcw;
+
+ fsw = fcw = 0xffff;
+
+ cr0 = read_cr0();
+ cr0 &= ~(X86_CR0_TS | X86_CR0_EM);
+ write_cr0(cr0);
+
+ asm volatile("fninit ; fnstsw %0 ; fnstcw %1" : "+m" (fsw), "+m" (fcw));
+
+ pr_info("x86/fpu: Probing for FPU: FSW=0x%04hx FCW=0x%04hx\n", fsw, fcw);
+
+ return fsw == 0 && (fcw & 0x103f) == 0x003f;
+}
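+
+/*
+ * Background for the check above (illustrative): after FNINIT a working
+ * 387-compatible FPU reports FSW == 0 and FCW == 0x037f, so
+ * (fcw & 0x103f) == 0x003f. Without an FPU the stores never happen and
+ * both values keep the 0xffff they were seeded with, failing the check.
+ */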
+
+static void __init fpu__init_system_early_generic(void)
+{
+ set_thread_flag(TIF_NEED_FPU_LOAD);
+
+ if (!boot_cpu_has(X86_FEATURE_CPUID) &&
+ !test_bit(X86_FEATURE_FPU, (unsigned long *)cpu_caps_cleared)) {
+ if (fpu__probe_without_cpuid())
+ setup_force_cpu_cap(X86_FEATURE_FPU);
+ else
+ setup_clear_cpu_cap(X86_FEATURE_FPU);
+ }
+
+#ifndef CONFIG_MATH_EMULATION
+ if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_FPU)) {
+ pr_emerg("x86/fpu: Giving up, no FPU found and no math emulation present\n");
+ for (;;)
+ asm volatile("hlt");
+ }
+#endif
+}
+
+/*
+ * Boot time FPU feature detection code:
+ */
+unsigned int mxcsr_feature_mask __ro_after_init = 0xffffffffu;
+
+static void __init fpu__init_system_mxcsr(void)
+{
+ unsigned int mask = 0;
+
+ if (boot_cpu_has(X86_FEATURE_FXSR)) {
+ /* Static because GCC does not get 16-byte stack alignment right: */
+ static struct fxregs_state fxregs __initdata;
+
+ asm volatile("fxsave %0" : "+m" (fxregs));
+
+ mask = fxregs.mxcsr_mask;
+
+ /*
+ * If zero then use the default features mask,
+ * which has all features set, except the
+ * denormals-are-zero feature bit:
+ */
+ if (mask == 0)
+ mask = 0x0000ffbf;
+ }
+ mxcsr_feature_mask &= mask;
+}
+
+/*
+ * Once per bootup FPU initialization sequences that will run on most x86 CPUs:
+ */
+static void __init fpu__init_system_generic(void)
+{
+ /*
+ * Set up the legacy init FPU context. Will be updated when the
+ * CPU supports XSAVE[S].
+ */
+ fpstate_init_user(&init_fpstate);
+
+ fpu__init_system_mxcsr();
+}
+
+/*
+ * Enforce that 'MEMBER' is the last field of 'TYPE'.
+ *
+ * Align the computed size with alignment of the TYPE,
+ * because that's how C aligns structs.
+ */
+#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \
+ BUILD_BUG_ON(sizeof(TYPE) != \
+ ALIGN(offsetofend(TYPE, MEMBER), _Alignof(TYPE)))
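+
+/*
+ * For example, CHECK_MEMBER_AT_END_OF(struct fpu, __fpstate) (used below)
+ * expands to a BUILD_BUG_ON() that fires at compile time if anything is
+ * ever added after '__fpstate' in 'struct fpu'.
+ */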
+
+/*
+ * We append the 'struct fpu' to the task_struct:
+ */
+static void __init fpu__init_task_struct_size(void)
+{
+ int task_size = sizeof(struct task_struct);
+
+ task_size += sizeof(struct fpu);
+
+ /*
+ * Subtract off the static size of the register state.
+ * It potentially has a bunch of padding.
+ */
+ task_size -= sizeof(union fpregs_state);
+
+ /*
+ * Add back the dynamically-calculated register state
+ * size.
+ */
+ task_size += fpu_kernel_cfg.default_size;
+
+ /*
+ * We dynamically size 'struct fpu', so we require that
+ * '__fpstate' be at the end of it:
+ */
+ CHECK_MEMBER_AT_END_OF(struct fpu, __fpstate);
+
+ arch_task_struct_size = task_size;
+}
+
+/*
+ * Set up the user and kernel xstate sizes based on the legacy FPU context size.
+ *
+ * We set this up first, and later it will be overwritten by
+ * fpu__init_system_xstate() if the CPU knows about xstates.
+ */
+static void __init fpu__init_system_xstate_size_legacy(void)
+{
+ unsigned int size;
+
+ /*
+ * Note that the size configuration might be overwritten later
+ * during fpu__init_system_xstate().
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_FPU)) {
+ size = sizeof(struct swregs_state);
+ } else if (cpu_feature_enabled(X86_FEATURE_FXSR)) {
+ size = sizeof(struct fxregs_state);
+ fpu_user_cfg.legacy_features = XFEATURE_MASK_FPSSE;
+ } else {
+ size = sizeof(struct fregs_state);
+ fpu_user_cfg.legacy_features = XFEATURE_MASK_FP;
+ }
+
+ fpu_kernel_cfg.max_size = size;
+ fpu_kernel_cfg.default_size = size;
+ fpu_user_cfg.max_size = size;
+ fpu_user_cfg.default_size = size;
+ guest_default_cfg.size = size;
+}
+
+/*
+ * Called on the boot CPU once per system bootup, to set up the initial
+ * FPU state that is later cloned into all processes:
+ */
+void __init fpu__init_system(void)
+{
+ fpu__init_system_early_generic();
+
+ /*
+ * The FPU has to be operational for some of the
+ * later FPU init activities:
+ */
+ fpu__init_cpu();
+
+ fpu__init_system_generic();
+ fpu__init_system_xstate_size_legacy();
+ fpu__init_system_xstate(fpu_kernel_cfg.max_size);
+ fpu__init_task_struct_size();
+}
diff --git a/arch/x86/kernel/fpu/internal.h b/arch/x86/kernel/fpu/internal.h
new file mode 100644
index 000000000000..975de070c9c9
--- /dev/null
+++ b/arch/x86/kernel/fpu/internal.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __X86_KERNEL_FPU_INTERNAL_H
+#define __X86_KERNEL_FPU_INTERNAL_H
+
+extern struct fpstate init_fpstate;
+
+/* CPU feature check wrappers */
+static __always_inline __pure bool use_xsave(void)
+{
+ return cpu_feature_enabled(X86_FEATURE_XSAVE);
+}
+
+static __always_inline __pure bool use_fxsr(void)
+{
+ return cpu_feature_enabled(X86_FEATURE_FXSR);
+}
+
+#ifdef CONFIG_X86_DEBUG_FPU
+# define WARN_ON_FPU(x) WARN_ON_ONCE(x)
+#else
+# define WARN_ON_FPU(x) ({ BUILD_BUG_ON_INVALID(x); 0; })
+#endif
+
+/* Used in init.c */
+extern void fpstate_init_user(struct fpstate *fpstate);
+extern void fpstate_reset(struct fpu *fpu);
+
+#endif
diff --git a/arch/x86/kernel/fpu/legacy.h b/arch/x86/kernel/fpu/legacy.h
new file mode 100644
index 000000000000..098f367bb8a7
--- /dev/null
+++ b/arch/x86/kernel/fpu/legacy.h
@@ -0,0 +1,111 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __X86_KERNEL_FPU_LEGACY_H
+#define __X86_KERNEL_FPU_LEGACY_H
+
+#include <asm/fpu/types.h>
+
+extern unsigned int mxcsr_feature_mask;
+
+static inline void ldmxcsr(u32 mxcsr)
+{
+ asm volatile("ldmxcsr %0" :: "m" (mxcsr));
+}
+
+/*
+ * Returns 0 on success or the trap number when the operation raises an
+ * exception.
+ */
+#define user_insn(insn, output, input...) \
+({ \
+ int err; \
+ \
+ might_fault(); \
+ \
+ asm volatile(ASM_STAC "\n" \
+ "1: " #insn "\n" \
+ "2: " ASM_CLAC "\n" \
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \
+ : [err] "=a" (err), output \
+ : "0"(0), input); \
+ err; \
+})
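+
+/*
+ * Example of how the wrappers below use this (illustrative):
+ *
+ *	user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx))
+ *
+ * 'err' starts out as 0 in a register; if the instruction faults, the
+ * exception fixup rewrites it with the trap number (e.g. X86_TRAP_PF),
+ * which the caller can handle or treat as fatal.
+ */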
+
+#define kernel_insn_err(insn, output, input...) \
+({ \
+ int err; \
+ asm volatile("1:" #insn "\n\t" \
+ "2:\n" \
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %[err]) \
+ : [err] "=r" (err), output \
+ : "0"(0), input); \
+ err; \
+})
+
+#define kernel_insn(insn, output, input...) \
+ asm volatile("1:" #insn "\n\t" \
+ "2:\n" \
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FPU_RESTORE) \
+ : output : input)
+
+static inline int fnsave_to_user_sigframe(struct fregs_state __user *fx)
+{
+ return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx));
+}
+
+static inline int fxsave_to_user_sigframe(struct fxregs_state __user *fx)
+{
+ if (IS_ENABLED(CONFIG_X86_32))
+ return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx));
+ else
+ return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx));
+
+}
+
+static inline void fxrstor(struct fxregs_state *fx)
+{
+ if (IS_ENABLED(CONFIG_X86_32))
+ kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+ else
+ kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
+}
+
+static inline int fxrstor_safe(struct fxregs_state *fx)
+{
+ if (IS_ENABLED(CONFIG_X86_32))
+ return kernel_insn_err(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+ else
+ return kernel_insn_err(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
+}
+
+static inline int fxrstor_from_user_sigframe(struct fxregs_state __user *fx)
+{
+ if (IS_ENABLED(CONFIG_X86_32))
+ return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+ else
+ return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
+}
+
+static inline void frstor(struct fregs_state *fx)
+{
+ kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+}
+
+static inline int frstor_safe(struct fregs_state *fx)
+{
+ return kernel_insn_err(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+}
+
+static inline int frstor_from_user_sigframe(struct fregs_state __user *fx)
+{
+ return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+}
+
+static inline void fxsave(struct fxregs_state *fx)
+{
+ if (IS_ENABLED(CONFIG_X86_32))
+ asm volatile( "fxsave %[fx]" : [fx] "=m" (*fx));
+ else
+ asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx));
+}
+
+#endif
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
new file mode 100644
index 000000000000..0986c2200adc
--- /dev/null
+++ b/arch/x86/kernel/fpu/regset.c
@@ -0,0 +1,468 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * FPU registers' regset abstraction, for ptrace, core dumps, etc.
+ */
+#include <linux/sched/task_stack.h>
+#include <linux/vmalloc.h>
+
+#include <asm/fpu/api.h>
+#include <asm/fpu/signal.h>
+#include <asm/fpu/regset.h>
+#include <asm/prctl.h>
+
+#include "context.h"
+#include "internal.h"
+#include "legacy.h"
+#include "xstate.h"
+
+/*
+ * The xstateregs_active() routine is the same as the regset_fpregs_active() routine,
+ * since "regset->n" for the xstate regset is updated based on the feature
+ * capabilities supported by XSAVE.
+ */
+int regset_fpregs_active(struct task_struct *target, const struct user_regset *regset)
+{
+ return regset->n;
+}
+
+int regset_xregset_fpregs_active(struct task_struct *target, const struct user_regset *regset)
+{
+ if (boot_cpu_has(X86_FEATURE_FXSR))
+ return regset->n;
+ else
+ return 0;
+}
+
+/*
+ * The regset get() functions are invoked from:
+ *
+ * - coredump to dump the current task's fpstate. If the current task
+ * owns the FPU then the memory state has to be synchronized and the
+ * FPU register state preserved. Otherwise fpstate is already in sync.
+ *
+ * - ptrace to dump fpstate of a stopped task, in which case the registers
+ * have already been saved to fpstate on context switch.
+ */
+static void sync_fpstate(struct fpu *fpu)
+{
+ if (fpu == x86_task_fpu(current))
+ fpu_sync_fpstate(fpu);
+}
+
+/*
+ * Invalidate cached FPU registers before modifying the stopped target
+ * task's fpstate.
+ *
+ * This forces the target task on resume to restore the FPU registers from
+ * modified fpstate. Otherwise the task might skip the restore and operate
+ * with the cached FPU registers which discards the modifications.
+ */
+static void fpu_force_restore(struct fpu *fpu)
+{
+ /*
+ * Only stopped child tasks can be used to modify the FPU
+ * state in the fpstate buffer:
+ */
+ WARN_ON_FPU(fpu == x86_task_fpu(current));
+
+ __fpu_invalidate_fpregs_state(fpu);
+}
+
+int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
+ struct membuf to)
+{
+ struct fpu *fpu = x86_task_fpu(target);
+
+ if (!cpu_feature_enabled(X86_FEATURE_FXSR))
+ return -ENODEV;
+
+ sync_fpstate(fpu);
+
+ if (!use_xsave()) {
+ return membuf_write(&to, &fpu->fpstate->regs.fxsave,
+ sizeof(fpu->fpstate->regs.fxsave));
+ }
+
+ copy_xstate_to_uabi_buf(to, target, XSTATE_COPY_FX);
+ return 0;
+}
+
+int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ const void *kbuf, const void __user *ubuf)
+{
+ struct fpu *fpu = x86_task_fpu(target);
+ struct fxregs_state newstate;
+ int ret;
+
+ if (!cpu_feature_enabled(X86_FEATURE_FXSR))
+ return -ENODEV;
+
+ /* No funny business with partial or oversized writes is permitted. */
+ if (pos != 0 || count != sizeof(newstate))
+ return -EINVAL;
+
+ ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &newstate, 0, -1);
+ if (ret)
+ return ret;
+
+ /* Do not allow an invalid MXCSR value. */
+ if (newstate.mxcsr & ~mxcsr_feature_mask)
+ return -EINVAL;
+
+ fpu_force_restore(fpu);
+
+ /* Copy the state */
+ memcpy(&fpu->fpstate->regs.fxsave, &newstate, sizeof(newstate));
+
+ /* Clear xmm8..15 for 32-bit callers */
+ BUILD_BUG_ON(sizeof(fpu->__fpstate.regs.fxsave.xmm_space) != 16 * 16);
+ if (in_ia32_syscall())
+ memset(&fpu->fpstate->regs.fxsave.xmm_space[8*4], 0, 8 * 16);
+
+ /* Mark FP and SSE as in use when XSAVE is enabled */
+ if (use_xsave())
+ fpu->fpstate->regs.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE;
+
+ return 0;
+}
+
+int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
+ struct membuf to)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_XSAVE))
+ return -ENODEV;
+
+ sync_fpstate(x86_task_fpu(target));
+
+ copy_xstate_to_uabi_buf(to, target, XSTATE_COPY_XSAVE);
+ return 0;
+}
+
+int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ const void *kbuf, const void __user *ubuf)
+{
+ struct fpu *fpu = x86_task_fpu(target);
+ struct xregs_state *tmpbuf = NULL;
+ int ret;
+
+ if (!cpu_feature_enabled(X86_FEATURE_XSAVE))
+ return -ENODEV;
+
+ /*
+ * A whole standard-format XSAVE buffer is needed:
+ */
+ if (pos != 0 || count != fpu_user_cfg.max_size)
+ return -EFAULT;
+
+ if (!kbuf) {
+ tmpbuf = vmalloc(count);
+ if (!tmpbuf)
+ return -ENOMEM;
+
+ if (copy_from_user(tmpbuf, ubuf, count)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ }
+
+ fpu_force_restore(fpu);
+ ret = copy_uabi_from_kernel_to_xstate(fpu->fpstate, kbuf ?: tmpbuf, &target->thread.pkru);
+
+out:
+ vfree(tmpbuf);
+ return ret;
+}
+
+#ifdef CONFIG_X86_USER_SHADOW_STACK
+int ssp_active(struct task_struct *target, const struct user_regset *regset)
+{
+ if (target->thread.features & ARCH_SHSTK_SHSTK)
+ return regset->n;
+
+ return 0;
+}
+
+int ssp_get(struct task_struct *target, const struct user_regset *regset,
+ struct membuf to)
+{
+ struct fpu *fpu = x86_task_fpu(target);
+ struct cet_user_state *cetregs;
+
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
+ !ssp_active(target, regset))
+ return -ENODEV;
+
+ sync_fpstate(fpu);
+ cetregs = get_xsave_addr(&fpu->fpstate->regs.xsave, XFEATURE_CET_USER);
+ if (WARN_ON(!cetregs)) {
+ /*
+ * This shouldn't ever be NULL because shadow stack was
+ * verified to be enabled above. This means
+ * MSR_IA32_U_CET.CET_SHSTK_EN should be 1 and so
+ * XFEATURE_CET_USER should not be in the init state.
+ */
+ return -ENODEV;
+ }
+
+ return membuf_write(&to, (unsigned long *)&cetregs->user_ssp,
+ sizeof(cetregs->user_ssp));
+}
+
+int ssp_set(struct task_struct *target, const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ const void *kbuf, const void __user *ubuf)
+{
+ struct fpu *fpu = x86_task_fpu(target);
+ struct xregs_state *xsave = &fpu->fpstate->regs.xsave;
+ struct cet_user_state *cetregs;
+ unsigned long user_ssp;
+ int r;
+
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
+ !ssp_active(target, regset))
+ return -ENODEV;
+
+ if (pos != 0 || count != sizeof(user_ssp))
+ return -EINVAL;
+
+ r = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &user_ssp, 0, -1);
+ if (r)
+ return r;
+
+ /*
+ * Some kernel instructions (IRET, etc) can cause exceptions in the case
+ * of disallowed CET register values. Just prevent invalid values.
+ */
+ if (user_ssp >= TASK_SIZE_MAX || !IS_ALIGNED(user_ssp, 8))
+ return -EINVAL;
+
+ fpu_force_restore(fpu);
+
+ cetregs = get_xsave_addr(xsave, XFEATURE_CET_USER);
+ if (WARN_ON(!cetregs)) {
+ /*
+ * This shouldn't ever be NULL because shadow stack was
+ * verified to be enabled above. This means
+ * MSR_IA32_U_CET.CET_SHSTK_EN should be 1 and so
+ * XFEATURE_CET_USER should not be in the init state.
+ */
+ return -ENODEV;
+ }
+
+ cetregs->user_ssp = user_ssp;
+ return 0;
+}
+#endif /* CONFIG_X86_USER_SHADOW_STACK */
+
+#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
+
+/*
+ * FPU tag word conversions.
+ */
+
+static inline unsigned short twd_i387_to_fxsr(unsigned short twd)
+{
+ unsigned int tmp; /* to avoid 16 bit prefixes in the code */
+
+ /* Transform each pair of bits into 01 (valid) or 00 (empty) */
+ tmp = ~twd;
+ tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */
+ /* and move the valid bits to the lower byte. */
+ tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */
+ tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */
+ tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */
+
+ return tmp;
+}
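+
+/*
+ * Worked example (illustrative): an i387 tag word of 0xffff (all registers
+ * tagged empty) yields 0x00, while 0x0000 (all registers tagged valid)
+ * yields 0xff - one "non-empty" bit per register, which is exactly the
+ * abbreviated FXSR tag format.
+ */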
+
+#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16)
+#define FP_EXP_TAG_VALID 0
+#define FP_EXP_TAG_ZERO 1
+#define FP_EXP_TAG_SPECIAL 2
+#define FP_EXP_TAG_EMPTY 3
+
+static inline u32 twd_fxsr_to_i387(struct fxregs_state *fxsave)
+{
+ struct _fpxreg *st;
+ u32 tos = (fxsave->swd >> 11) & 7;
+ u32 twd = (unsigned long) fxsave->twd;
+ u32 tag;
+ u32 ret = 0xffff0000u;
+ int i;
+
+ for (i = 0; i < 8; i++, twd >>= 1) {
+ if (twd & 0x1) {
+ st = FPREG_ADDR(fxsave, (i - tos) & 7);
+
+ switch (st->exponent & 0x7fff) {
+ case 0x7fff:
+ tag = FP_EXP_TAG_SPECIAL;
+ break;
+ case 0x0000:
+ if (!st->significand[0] &&
+ !st->significand[1] &&
+ !st->significand[2] &&
+ !st->significand[3])
+ tag = FP_EXP_TAG_ZERO;
+ else
+ tag = FP_EXP_TAG_SPECIAL;
+ break;
+ default:
+ if (st->significand[3] & 0x8000)
+ tag = FP_EXP_TAG_VALID;
+ else
+ tag = FP_EXP_TAG_SPECIAL;
+ break;
+ }
+ } else {
+ tag = FP_EXP_TAG_EMPTY;
+ }
+ ret |= tag << (2 * i);
+ }
+ return ret;
+}
+
+/*
+ * FXSR floating point environment conversions.
+ */
+
+static void __convert_from_fxsr(struct user_i387_ia32_struct *env,
+ struct task_struct *tsk,
+ struct fxregs_state *fxsave)
+{
+ struct _fpreg *to = (struct _fpreg *) &env->st_space[0];
+ struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0];
+ int i;
+
+ env->cwd = fxsave->cwd | 0xffff0000u;
+ env->swd = fxsave->swd | 0xffff0000u;
+ env->twd = twd_fxsr_to_i387(fxsave);
+
+#ifdef CONFIG_X86_64
+ env->fip = fxsave->rip;
+ env->foo = fxsave->rdp;
+ /*
+ * These should actually be ds/cs at FPU exception time, but
+ * that information is not available in 64-bit mode.
+ */
+ env->fcs = task_pt_regs(tsk)->cs;
+ if (tsk == current) {
+ savesegment(ds, env->fos);
+ } else {
+ env->fos = tsk->thread.ds;
+ }
+ env->fos |= 0xffff0000;
+#else
+ env->fip = fxsave->fip;
+ env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16);
+ env->foo = fxsave->foo;
+ env->fos = fxsave->fos;
+#endif
+
+ for (i = 0; i < 8; ++i)
+ memcpy(&to[i], &from[i], sizeof(to[0]));
+}
+
+void
+convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
+{
+ __convert_from_fxsr(env, tsk, &x86_task_fpu(tsk)->fpstate->regs.fxsave);
+}
+
+void convert_to_fxsr(struct fxregs_state *fxsave,
+ const struct user_i387_ia32_struct *env)
+{
+ struct _fpreg *from = (struct _fpreg *) &env->st_space[0];
+ struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0];
+ int i;
+
+ fxsave->cwd = env->cwd;
+ fxsave->swd = env->swd;
+ fxsave->twd = twd_i387_to_fxsr(env->twd);
+ fxsave->fop = (u16) ((u32) env->fcs >> 16);
+#ifdef CONFIG_X86_64
+ fxsave->rip = env->fip;
+ fxsave->rdp = env->foo;
+ /* cs and ds ignored */
+#else
+ fxsave->fip = env->fip;
+ fxsave->fcs = (env->fcs & 0xffff);
+ fxsave->foo = env->foo;
+ fxsave->fos = env->fos;
+#endif
+
+ for (i = 0; i < 8; ++i)
+ memcpy(&to[i], &from[i], sizeof(from[0]));
+}
+
+int fpregs_get(struct task_struct *target, const struct user_regset *regset,
+ struct membuf to)
+{
+ struct fpu *fpu = x86_task_fpu(target);
+ struct user_i387_ia32_struct env;
+ struct fxregs_state fxsave, *fx;
+
+ sync_fpstate(fpu);
+
+ if (!cpu_feature_enabled(X86_FEATURE_FPU))
+ return fpregs_soft_get(target, regset, to);
+
+ if (!cpu_feature_enabled(X86_FEATURE_FXSR)) {
+ return membuf_write(&to, &fpu->fpstate->regs.fsave,
+ sizeof(struct fregs_state));
+ }
+
+ if (use_xsave()) {
+ struct membuf mb = { .p = &fxsave, .left = sizeof(fxsave) };
+
+ /* Handle init state optimized xstate correctly */
+ copy_xstate_to_uabi_buf(mb, target, XSTATE_COPY_FP);
+ fx = &fxsave;
+ } else {
+ fx = &fpu->fpstate->regs.fxsave;
+ }
+
+ __convert_from_fxsr(&env, target, fx);
+ return membuf_write(&to, &env, sizeof(env));
+}
+
+int fpregs_set(struct task_struct *target, const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ const void *kbuf, const void __user *ubuf)
+{
+ struct fpu *fpu = x86_task_fpu(target);
+ struct user_i387_ia32_struct env;
+ int ret;
+
+ /* No funny business with partial or oversized writes is permitted. */
+ if (pos != 0 || count != sizeof(struct user_i387_ia32_struct))
+ return -EINVAL;
+
+ if (!cpu_feature_enabled(X86_FEATURE_FPU))
+ return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
+
+ ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1);
+ if (ret)
+ return ret;
+
+ fpu_force_restore(fpu);
+
+ if (cpu_feature_enabled(X86_FEATURE_FXSR))
+ convert_to_fxsr(&fpu->fpstate->regs.fxsave, &env);
+ else
+ memcpy(&fpu->fpstate->regs.fsave, &env, sizeof(env));
+
+ /*
+ * Update the header bit in the xsave header, indicating the
+ * presence of FP.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_XSAVE))
+ fpu->fpstate->regs.xsave.header.xfeatures |= XFEATURE_MASK_FP;
+
+ return 0;
+}
+
+#endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
new file mode 100644
index 000000000000..c3ec2512f2bb
--- /dev/null
+++ b/arch/x86/kernel/fpu/signal.c
@@ -0,0 +1,526 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * FPU signal frame handling routines.
+ */
+
+#include <linux/compat.h>
+#include <linux/cpu.h>
+#include <linux/pagemap.h>
+
+#include <asm/fpu/signal.h>
+#include <asm/fpu/regset.h>
+#include <asm/fpu/xstate.h>
+
+#include <asm/sigframe.h>
+#include <asm/trapnr.h>
+#include <asm/trace/fpu.h>
+
+#include "context.h"
+#include "internal.h"
+#include "legacy.h"
+#include "xstate.h"
+
+/*
+ * Check for the presence of extended state information in the
+ * user fpstate pointer in the sigcontext.
+ */
+static inline bool check_xstate_in_sigframe(struct fxregs_state __user *fxbuf,
+ struct _fpx_sw_bytes *fx_sw)
+{
+ void __user *fpstate = fxbuf;
+ unsigned int magic2;
+
+ if (__copy_from_user(fx_sw, &fxbuf->sw_reserved[0], sizeof(*fx_sw)))
+ return false;
+
+ /* Check for the first magic field */
+ if (fx_sw->magic1 != FP_XSTATE_MAGIC1)
+ goto setfx;
+
+ /*
+ * Check for the presence of second magic word at the end of memory
+ * layout. This detects the case where the user just copied the legacy
+ * fpstate layout without copying the extended state information
+ * in the memory layout.
+ */
+ if (__get_user(magic2, (__u32 __user *)(fpstate + x86_task_fpu(current)->fpstate->user_size)))
+ return false;
+
+ if (likely(magic2 == FP_XSTATE_MAGIC2))
+ return true;
+setfx:
+ trace_x86_fpu_xstate_check_failed(x86_task_fpu(current));
+
+ /* Set the parameters for fx only state */
+ fx_sw->magic1 = 0;
+ fx_sw->xstate_size = sizeof(struct fxregs_state);
+ fx_sw->xfeatures = XFEATURE_MASK_FPSSE;
+ return true;
+}
+
+/*
+ * Signal frame handlers.
+ */
+static inline bool save_fsave_header(struct task_struct *tsk, void __user *buf)
+{
+ if (use_fxsr()) {
+ struct xregs_state *xsave = &x86_task_fpu(tsk)->fpstate->regs.xsave;
+ struct user_i387_ia32_struct env;
+ struct _fpstate_32 __user *fp = buf;
+
+ fpregs_lock();
+ if (!test_thread_flag(TIF_NEED_FPU_LOAD))
+ fxsave(&x86_task_fpu(tsk)->fpstate->regs.fxsave);
+ fpregs_unlock();
+
+ convert_from_fxsr(&env, tsk);
+
+ if (__copy_to_user(buf, &env, sizeof(env)) ||
+ __put_user(xsave->i387.swd, &fp->status) ||
+ __put_user(X86_FXSR_MAGIC, &fp->magic))
+ return false;
+ } else {
+ struct fregs_state __user *fp = buf;
+ u32 swd;
+
+ if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status))
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Prepare the SW reserved portion of the fxsave memory layout, indicating
+ * the presence of the extended state information in the memory layout
+ * pointed to by the fpstate pointer in the sigcontext.
+ * This is saved whenever the FP and extended state context is
+ * saved on the user stack during the signal handler delivery to the user.
+ */
+static inline void save_sw_bytes(struct _fpx_sw_bytes *sw_bytes, bool ia32_frame,
+ struct fpstate *fpstate)
+{
+ sw_bytes->magic1 = FP_XSTATE_MAGIC1;
+ sw_bytes->extended_size = fpstate->user_size + FP_XSTATE_MAGIC2_SIZE;
+ sw_bytes->xfeatures = fpstate->user_xfeatures;
+ sw_bytes->xstate_size = fpstate->user_size;
+
+ if (ia32_frame)
+ sw_bytes->extended_size += sizeof(struct fregs_state);
+}
+
+static inline bool save_xstate_epilog(void __user *buf, int ia32_frame,
+ struct fpstate *fpstate)
+{
+ struct xregs_state __user *x = buf;
+ struct _fpx_sw_bytes sw_bytes = {};
+ int err;
+
+ /* Setup the bytes not touched by the [f]xsave and reserved for SW. */
+ save_sw_bytes(&sw_bytes, ia32_frame, fpstate);
+ err = __copy_to_user(&x->i387.sw_reserved, &sw_bytes, sizeof(sw_bytes));
+
+ if (!use_xsave())
+ return !err;
+
+ err |= __put_user(FP_XSTATE_MAGIC2,
+ (__u32 __user *)(buf + fpstate->user_size));
+
+ /*
+ * For legacy compatibility, we always set the FP/SSE bits in the bit
+ * vector while saving the state to the user context. This lets us
+ * capture any changes (during sigreturn) to the FP/SSE bits made by
+ * legacy applications which don't touch xfeatures in the xsave
+ * header.
+ *
+ * XSAVE-aware apps can change the xfeatures in the xsave header as
+ * well as change any contents in the memory layout. XRSTOR as part
+ * of sigreturn will capture all the changes.
+ */
+ err |= set_xfeature_in_sigframe(x, XFEATURE_MASK_FPSSE);
+
+ return !err;
+}
+
+static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf, u32 pkru)
+{
+ if (use_xsave())
+ return xsave_to_user_sigframe(buf, pkru);
+
+ if (use_fxsr())
+ return fxsave_to_user_sigframe((struct fxregs_state __user *) buf);
+ else
+ return fnsave_to_user_sigframe((struct fregs_state __user *) buf);
+}
+
+/*
+ * Save the fpu, extended register state to the user signal frame.
+ *
+ * 'buf_fx' is the 64-byte aligned pointer at which the [f|fx|x]save
+ * state is copied.
+ * 'buf' points to the 'buf_fx' or to the fsave header followed by 'buf_fx'.
+ *
+ * buf == buf_fx for 64-bit frames and 32-bit fsave frame.
+ * buf != buf_fx for 32-bit frames with fxstate.
+ *
+ * Save it directly to the user frame with the page fault handler disabled. If
+ * that faults, clear the frame (which resolves the page fault) and retry.
+ *
+ * If this is a 32-bit frame with fxstate, put a fsave header before
+ * the aligned state at 'buf_fx'.
+ *
+ * For [f]xsave state, update the SW reserved fields in the [f]xsave frame
+ * indicating the absence/presence of the extended state to the user.
+ */
+bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size, u32 pkru)
+{
+ struct task_struct *tsk = current;
+ struct fpstate *fpstate = x86_task_fpu(tsk)->fpstate;
+ bool ia32_fxstate = (buf != buf_fx);
+ int ret;
+
+ ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) ||
+ IS_ENABLED(CONFIG_IA32_EMULATION));
+
+ if (!static_cpu_has(X86_FEATURE_FPU)) {
+ struct user_i387_ia32_struct fp;
+
+ fpregs_soft_get(current, NULL, (struct membuf){.p = &fp,
+ .left = sizeof(fp)});
+ return !copy_to_user(buf, &fp, sizeof(fp));
+ }
+
+ if (!access_ok(buf, size))
+ return false;
+
+ if (use_xsave()) {
+ struct xregs_state __user *xbuf = buf_fx;
+
+ /*
+ * Clear the xsave header first, so that reserved fields are
+ * initialized to zero.
+ */
+ if (__clear_user(&xbuf->header, sizeof(xbuf->header)))
+ return false;
+ }
+retry:
+ /*
+ * Load the FPU registers if they are not valid for the current task.
+ * With a valid FPU state we can attempt to save the state directly to
+ * userland's stack frame which will likely succeed. If it does not,
+ * resolve the fault in the user memory and try again.
+ */
+ fpregs_lock();
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+ fpregs_restore_userregs();
+
+ pagefault_disable();
+ ret = copy_fpregs_to_sigframe(buf_fx, pkru);
+ pagefault_enable();
+ fpregs_unlock();
+
+ if (ret) {
+ if (!__clear_user(buf_fx, fpstate->user_size))
+ goto retry;
+ return false;
+ }
+
+ /* Save the fsave header for the 32-bit frames. */
+ if ((ia32_fxstate || !use_fxsr()) && !save_fsave_header(tsk, buf))
+ return false;
+
+ if (use_fxsr() && !save_xstate_epilog(buf_fx, ia32_fxstate, fpstate))
+ return false;
+
+ return true;
+}
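+
+/*
+ * Rough layout of the resulting signal frame (a sketch; sizes depend on the
+ * enabled features):
+ *
+ *	buf    -> struct fregs_state    (legacy fsave header, ia32_fxstate only)
+ *	buf_fx -> [f]xsave image        (64-byte aligned, fpstate->user_size bytes)
+ *	          FP_XSTATE_MAGIC2      (appended only when XSAVE is in use)
+ */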
+
+static int __restore_fpregs_from_user(void __user *buf, u64 ufeatures,
+ u64 xrestore, bool fx_only)
+{
+ if (use_xsave()) {
+ u64 init_bv = ufeatures & ~xrestore;
+ int ret;
+
+ if (likely(!fx_only))
+ ret = xrstor_from_user_sigframe(buf, xrestore);
+ else
+ ret = fxrstor_from_user_sigframe(buf);
+
+ if (!ret && unlikely(init_bv))
+ os_xrstor(&init_fpstate, init_bv);
+ return ret;
+ } else if (use_fxsr()) {
+ return fxrstor_from_user_sigframe(buf);
+ } else {
+ return frstor_from_user_sigframe(buf);
+ }
+}
+
+/*
+ * Attempt to restore the FPU registers directly from user memory.
+ * Pagefaults are handled and any errors returned are fatal.
+ */
+static bool restore_fpregs_from_user(void __user *buf, u64 xrestore, bool fx_only)
+{
+ struct fpu *fpu = x86_task_fpu(current);
+ int ret;
+
+ /* Restore enabled features only. */
+ xrestore &= fpu->fpstate->user_xfeatures;
+retry:
+ fpregs_lock();
+ /* Ensure that XFD is up to date */
+ xfd_update_state(fpu->fpstate);
+ pagefault_disable();
+ ret = __restore_fpregs_from_user(buf, fpu->fpstate->user_xfeatures,
+ xrestore, fx_only);
+ pagefault_enable();
+
+ if (unlikely(ret)) {
+ /*
+ * The above did an FPU restore operation, restricted to
+ * the user portion of the registers, and failed, but the
+ * microcode might have modified the FPU registers
+ * nevertheless.
+ *
+ * If the FPU registers do not belong to current, then
+ * invalidate the FPU register state otherwise the task
+ * might preempt current and return to user space with
+ * corrupted FPU registers.
+ */
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+ __cpu_invalidate_fpregs_state();
+ fpregs_unlock();
+
+ /* Try to handle #PF, but anything else is fatal. */
+ if (ret != X86_TRAP_PF)
+ return false;
+
+ if (!fault_in_readable(buf, fpu->fpstate->user_size))
+ goto retry;
+ return false;
+ }
+
+ /*
+ * Restore supervisor states: previous context switch etc has done
+ * XSAVES and saved the supervisor states in the kernel buffer from
+ * which they can be restored now.
+ *
+ * It would be optimal to handle this with a single XRSTORS, but
+ * this does not work because the rest of the FPU registers have
+ * been restored from a user buffer directly.
+ */
+ if (test_thread_flag(TIF_NEED_FPU_LOAD) && xfeatures_mask_supervisor())
+ os_xrstor_supervisor(fpu->fpstate);
+
+ fpregs_mark_activate();
+ fpregs_unlock();
+ return true;
+}
+
+static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx,
+ bool ia32_fxstate)
+{
+ struct task_struct *tsk = current;
+ struct fpu *fpu = x86_task_fpu(tsk);
+ struct user_i387_ia32_struct env;
+ bool success, fx_only = false;
+ union fpregs_state *fpregs;
+ u64 user_xfeatures = 0;
+
+ if (use_xsave()) {
+ struct _fpx_sw_bytes fx_sw_user;
+
+ if (!check_xstate_in_sigframe(buf_fx, &fx_sw_user))
+ return false;
+
+ fx_only = !fx_sw_user.magic1;
+ user_xfeatures = fx_sw_user.xfeatures;
+ } else {
+ user_xfeatures = XFEATURE_MASK_FPSSE;
+ }
+
+ if (likely(!ia32_fxstate)) {
+ /* Restore the FPU registers directly from user memory. */
+ return restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only);
+ }
+
+ /*
+ * Copy the legacy state because the FP portion of the FX frame has
+ * to be ignored for histerical raisins. The legacy state is folded
+ * in once the larger state has been copied.
+ */
+ if (__copy_from_user(&env, buf, sizeof(env)))
+ return false;
+
+ /*
+ * By setting TIF_NEED_FPU_LOAD it is ensured that our xstate is
+ * not modified on context switch and that the xstate is considered
+ * to be loaded again on return to userland (overriding last_cpu avoids
+ * the optimisation).
+ */
+ fpregs_lock();
+ if (!test_thread_flag(TIF_NEED_FPU_LOAD)) {
+ /*
+ * If supervisor states are available then save the
+ * hardware state in current's fpstate so that the
+ * supervisor state is preserved. Save the full state for
+ * simplicity. There is no point in optimizing this by only
+ * saving the supervisor states and then shuffle them to
+ * the right place in memory. It's ia32 mode. Shrug.
+ */
+ if (xfeatures_mask_supervisor())
+ os_xsave(fpu->fpstate);
+ set_thread_flag(TIF_NEED_FPU_LOAD);
+ }
+ __fpu_invalidate_fpregs_state(fpu);
+ __cpu_invalidate_fpregs_state();
+ fpregs_unlock();
+
+ fpregs = &fpu->fpstate->regs;
+ if (use_xsave() && !fx_only) {
+ if (copy_sigframe_from_user_to_xstate(tsk, buf_fx))
+ return false;
+ } else {
+ if (__copy_from_user(&fpregs->fxsave, buf_fx,
+ sizeof(fpregs->fxsave)))
+ return false;
+
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ /* Reject invalid MXCSR values. */
+ if (fpregs->fxsave.mxcsr & ~mxcsr_feature_mask)
+ return false;
+ } else {
+ /* Mask invalid bits out for historical reasons (broken hardware). */
+ fpregs->fxsave.mxcsr &= mxcsr_feature_mask;
+ }
+
+ /* Enforce XFEATURE_MASK_FPSSE when XSAVE is enabled */
+ if (use_xsave())
+ fpregs->xsave.header.xfeatures |= XFEATURE_MASK_FPSSE;
+ }
+
+ /* Fold the legacy FP storage */
+ convert_to_fxsr(&fpregs->fxsave, &env);
+
+ fpregs_lock();
+ if (use_xsave()) {
+ /*
+ * Remove all UABI feature bits not set in user_xfeatures
+ * from the memory xstate header which makes the full
+ * restore below bring them into init state. This works for
+ * fx_only mode as well because that has only FP and SSE
+ * set in user_xfeatures.
+ *
+ * Preserve supervisor states!
+ */
+ u64 mask = user_xfeatures | xfeatures_mask_supervisor();
+
+ fpregs->xsave.header.xfeatures &= mask;
+ success = !os_xrstor_safe(fpu->fpstate,
+ fpu_kernel_cfg.max_features);
+ } else {
+ success = !fxrstor_safe(&fpregs->fxsave);
+ }
+
+ if (likely(success))
+ fpregs_mark_activate();
+
+ fpregs_unlock();
+ return success;
+}
+
+static inline unsigned int xstate_sigframe_size(struct fpstate *fpstate)
+{
+ unsigned int size = fpstate->user_size;
+
+ return use_xsave() ? size + FP_XSTATE_MAGIC2_SIZE : size;
+}
+
+/*
+ * Restore FPU state from a sigframe:
+ */
+bool fpu__restore_sig(void __user *buf, int ia32_frame)
+{
+ struct fpu *fpu = x86_task_fpu(current);
+ void __user *buf_fx = buf;
+ bool ia32_fxstate = false;
+ bool success = false;
+ unsigned int size;
+
+ if (unlikely(!buf)) {
+ fpu__clear_user_states(fpu);
+ return true;
+ }
+
+ size = xstate_sigframe_size(fpu->fpstate);
+
+ ia32_frame &= (IS_ENABLED(CONFIG_X86_32) ||
+ IS_ENABLED(CONFIG_IA32_EMULATION));
+
+ /*
+ * Only FXSR enabled systems need the FX state quirk.
+ * FRSTOR does not need it and can use the fast path.
+ */
+ if (ia32_frame && use_fxsr()) {
+ buf_fx = buf + sizeof(struct fregs_state);
+ size += sizeof(struct fregs_state);
+ ia32_fxstate = true;
+ }
+
+ if (!access_ok(buf, size))
+ goto out;
+
+ if (!IS_ENABLED(CONFIG_X86_64) && !cpu_feature_enabled(X86_FEATURE_FPU)) {
+ success = !fpregs_soft_set(current, NULL, 0,
+ sizeof(struct user_i387_ia32_struct),
+ NULL, buf);
+ } else {
+ success = __fpu_restore_sig(buf, buf_fx, ia32_fxstate);
+ }
+
+out:
+ if (unlikely(!success))
+ fpu__clear_user_states(fpu);
+ return success;
+}
+
+unsigned long
+fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
+ unsigned long *buf_fx, unsigned long *size)
+{
+ unsigned long frame_size = xstate_sigframe_size(x86_task_fpu(current)->fpstate);
+
+ *buf_fx = sp = round_down(sp - frame_size, 64);
+ if (ia32_frame && use_fxsr()) {
+ frame_size += sizeof(struct fregs_state);
+ sp -= sizeof(struct fregs_state);
+ }
+
+ *size = frame_size;
+
+ return sp;
+}
+
+unsigned long __init fpu__get_fpstate_size(void)
+{
+ unsigned long ret = fpu_user_cfg.max_size;
+
+ if (use_xsave())
+ ret += FP_XSTATE_MAGIC2_SIZE;
+
+ /*
+ * This space is needed on (most) 32-bit kernels, or when a 32-bit
+ * app is running on a 64-bit kernel. To keep things simple, just
+ * assume the worst case and always include space for 'freg_state',
+ * even for 64-bit apps on 64-bit kernels. This wastes a bit of
+ * space, but keeps the code simple.
+ */
+ if ((IS_ENABLED(CONFIG_IA32_EMULATION) ||
+ IS_ENABLED(CONFIG_X86_32)) && use_fxsr())
+ ret += sizeof(struct fregs_state);
+
+ return ret;
+}
+
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
new file mode 100644
index 000000000000..48113c5193aa
--- /dev/null
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -0,0 +1,2012 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * xsave/xrstor support.
+ *
+ * Author: Suresh Siddha <suresh.b.siddha@intel.com>
+ */
+#include <linux/bitops.h>
+#include <linux/compat.h>
+#include <linux/cpu.h>
+#include <linux/mman.h>
+#include <linux/kvm_types.h>
+#include <linux/nospec.h>
+#include <linux/pkeys.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+#include <linux/vmalloc.h>
+#include <linux/coredump.h>
+#include <linux/sort.h>
+
+#include <asm/fpu/api.h>
+#include <asm/fpu/regset.h>
+#include <asm/fpu/signal.h>
+#include <asm/fpu/xcr.h>
+
+#include <asm/cpuid/api.h>
+#include <asm/msr.h>
+#include <asm/tlbflush.h>
+#include <asm/prctl.h>
+#include <asm/elf.h>
+
+#include <uapi/asm/elf.h>
+
+#include "context.h"
+#include "internal.h"
+#include "legacy.h"
+#include "xstate.h"
+
+#define for_each_extended_xfeature(bit, mask) \
+ (bit) = FIRST_EXTENDED_XFEATURE; \
+ for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask))
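+
+/*
+ * Typical use (illustrative): walk every enabled extended component and
+ * query its CPUID leaf, e.g.
+ *
+ *	for_each_extended_xfeature(i, fpu_kernel_cfg.max_features)
+ *		pr_info("xfeature %d is %d bytes\n", i, xfeature_size(i));
+ *
+ * Iteration starts at FIRST_EXTENDED_XFEATURE, so the legacy FP and SSE
+ * states are never visited.
+ */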
+
+/*
+ * Although we spell it out in here, the Processor Trace
+ * xfeature is completely unused. We use other mechanisms
+ * to save/restore PT state in Linux.
+ */
+static const char *xfeature_names[] =
+{
+ "x87 floating point registers",
+ "SSE registers",
+ "AVX registers",
+ "MPX bounds registers",
+ "MPX CSR",
+ "AVX-512 opmask",
+ "AVX-512 Hi256",
+ "AVX-512 ZMM_Hi256",
+ "Processor Trace (unused)",
+ "Protection Keys User registers",
+ "PASID state",
+ "Control-flow User registers",
+ "Control-flow Kernel registers (KVM only)",
+ "unknown xstate feature",
+ "unknown xstate feature",
+ "unknown xstate feature",
+ "unknown xstate feature",
+ "AMX Tile config",
+ "AMX Tile data",
+ "APX registers",
+ "unknown xstate feature",
+};
+
+static unsigned short xsave_cpuid_features[] __initdata = {
+ [XFEATURE_FP] = X86_FEATURE_FPU,
+ [XFEATURE_SSE] = X86_FEATURE_XMM,
+ [XFEATURE_YMM] = X86_FEATURE_AVX,
+ [XFEATURE_BNDREGS] = X86_FEATURE_MPX,
+ [XFEATURE_BNDCSR] = X86_FEATURE_MPX,
+ [XFEATURE_OPMASK] = X86_FEATURE_AVX512F,
+ [XFEATURE_ZMM_Hi256] = X86_FEATURE_AVX512F,
+ [XFEATURE_Hi16_ZMM] = X86_FEATURE_AVX512F,
+ [XFEATURE_PT_UNIMPLEMENTED_SO_FAR] = X86_FEATURE_INTEL_PT,
+ [XFEATURE_PKRU] = X86_FEATURE_OSPKE,
+ [XFEATURE_PASID] = X86_FEATURE_ENQCMD,
+ [XFEATURE_CET_USER] = X86_FEATURE_SHSTK,
+ [XFEATURE_CET_KERNEL] = X86_FEATURE_SHSTK,
+ [XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE,
+ [XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE,
+ [XFEATURE_APX] = X86_FEATURE_APX,
+};
+
+static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init =
+ { [ 0 ... XFEATURE_MAX - 1] = -1};
+static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init =
+ { [ 0 ... XFEATURE_MAX - 1] = -1};
+static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init;
+
+/*
+ * Ordering of xstate components in uncompacted format: The xfeature
+ * number does not necessarily indicate its position in the XSAVE buffer.
+ * This array defines the traversal order of xstate features.
+ */
+static unsigned int xfeature_uncompact_order[XFEATURE_MAX] __ro_after_init =
+ { [ 0 ... XFEATURE_MAX - 1] = -1};
+
+static inline unsigned int next_xfeature_order(unsigned int i, u64 mask)
+{
+ for (; xfeature_uncompact_order[i] != -1; i++) {
+ if (mask & BIT_ULL(xfeature_uncompact_order[i]))
+ break;
+ }
+
+ return i;
+}
+
+/* Iterate xstate features in uncompacted order: */
+#define for_each_extended_xfeature_in_order(i, mask) \
+ for (i = 0; \
+ i = next_xfeature_order(i, mask), \
+ xfeature_uncompact_order[i] != -1; \
+ i++)
+
+#define XSTATE_FLAG_SUPERVISOR BIT(0)
+#define XSTATE_FLAG_ALIGNED64 BIT(1)
+
+/*
+ * Return whether the system supports a given xfeature.
+ *
+ * Also return the name of the (most advanced) feature that the caller requested:
+ */
+int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
+{
+ u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features;
+
+ if (unlikely(feature_name)) {
+ long xfeature_idx, max_idx;
+ u64 xfeatures_print;
+ /*
+ * We use fls64() here so we can print the most advanced
+ * feature that was requested but is missing. So if a driver
+ * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the
+ * missing AVX feature - this is the most informative message
+ * to users:
+ */
+ if (xfeatures_missing)
+ xfeatures_print = xfeatures_missing;
+ else
+ xfeatures_print = xfeatures_needed;
+
+ xfeature_idx = fls64(xfeatures_print)-1;
+ max_idx = ARRAY_SIZE(xfeature_names)-1;
+ xfeature_idx = min(xfeature_idx, max_idx);
+
+ *feature_name = xfeature_names[xfeature_idx];
+ }
+
+ if (xfeatures_missing)
+ return 0;
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(cpu_has_xfeatures);
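+
+/*
+ * Typical use by a driver (illustrative):
+ *
+ *	const char *name;
+ *
+ *	if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, &name))
+ *		pr_warn("AVX state ('%s') not supported\n", name);
+ */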
+
+static bool xfeature_is_aligned64(int xfeature_nr)
+{
+ return xstate_flags[xfeature_nr] & XSTATE_FLAG_ALIGNED64;
+}
+
+static bool xfeature_is_supervisor(int xfeature_nr)
+{
+ return xstate_flags[xfeature_nr] & XSTATE_FLAG_SUPERVISOR;
+}
+
+static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature)
+{
+ unsigned int offs, i;
+
+ /*
+ * Non-compacted format and legacy features use the cached fixed
+ * offsets.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED) ||
+ xfeature <= XFEATURE_SSE)
+ return xstate_offsets[xfeature];
+
+ /*
+ * Compacted format offsets depend on the actual content of the
+ * compacted xsave area which is determined by the xcomp_bv header
+ * field.
+ */
+ offs = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+ for_each_extended_xfeature(i, xcomp_bv) {
+ if (xfeature_is_aligned64(i))
+ offs = ALIGN(offs, 64);
+ if (i == xfeature)
+ break;
+ offs += xstate_sizes[i];
+ }
+ return offs;
+}
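+
+/*
+ * Example (illustrative): in the compacted format the first extended
+ * component always starts at FXSAVE_SIZE + XSAVE_HDR_SIZE = 576 bytes; a
+ * later component's offset is that base plus the sizes of all enabled
+ * components before it, rounded up to 64 bytes for components with the
+ * ALIGNED64 attribute.
+ */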
+
+/*
+ * Enable the extended processor state save/restore feature.
+ * Called once per CPU onlining.
+ */
+void fpu__init_cpu_xstate(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features)
+ return;
+
+ cr4_set_bits(X86_CR4_OSXSAVE);
+
+ /*
+ * Must happen after CR4 setup and before xsetbv() to allow KVM
+ * lazy passthrough. Write independent of the dynamic state static
+ * key as that does not work on the boot CPU. This also ensures
+ * that any stale state is wiped out from XFD. Reset the per CPU
+ * xfd cache too.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_XFD))
+ xfd_set_state(init_fpstate.xfd);
+
+ /*
+ * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features
+ * managed by XSAVE{C, OPT, S} and XRSTOR{S}. Only XSAVE user
+ * states can be set here.
+ */
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);
+
+ /*
+ * MSR_IA32_XSS sets supervisor states managed by XSAVES.
+ */
+ if (boot_cpu_has(X86_FEATURE_XSAVES)) {
+ wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor() |
+ xfeatures_mask_independent());
+ }
+}
+
+static bool xfeature_enabled(enum xfeature xfeature)
+{
+ return fpu_kernel_cfg.max_features & BIT_ULL(xfeature);
+}
+
+static int compare_xstate_offsets(const void *xfeature1, const void *xfeature2)
+{
+ return xstate_offsets[*(unsigned int *)xfeature1] -
+ xstate_offsets[*(unsigned int *)xfeature2];
+}
+
+/*
+ * Record the offsets and sizes of various xstates contained
+ * in the XSAVE state memory layout. Also, create an ordered
+ * list of xfeatures for handling out-of-order offsets.
+ */
+static void __init setup_xstate_cache(void)
+{
+ u32 eax, ebx, ecx, edx, xfeature, i = 0;
+ /*
+ * The FP and SSE xstates are legacy states. They are always at
+ * fixed offsets in the xsave area, in either compacted or
+ * standard form.
+ */
+ xstate_offsets[XFEATURE_FP] = 0;
+ xstate_sizes[XFEATURE_FP] = offsetof(struct fxregs_state,
+ xmm_space);
+
+ xstate_offsets[XFEATURE_SSE] = xstate_sizes[XFEATURE_FP];
+ xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state,
+ xmm_space);
+
+ for_each_extended_xfeature(xfeature, fpu_kernel_cfg.max_features) {
+ cpuid_count(CPUID_LEAF_XSTATE, xfeature, &eax, &ebx, &ecx, &edx);
+
+ xstate_sizes[xfeature] = eax;
+ xstate_flags[xfeature] = ecx;
+
+ /*
+ * If an xfeature is supervisor state, the offset in EBX is
+ * invalid; leave it at -1.
+ */
+ if (xfeature_is_supervisor(xfeature))
+ continue;
+
+ xstate_offsets[xfeature] = ebx;
+
+ /* Populate the list of xfeatures before sorting */
+ xfeature_uncompact_order[i++] = xfeature;
+ }
+
+ /*
+ * Sort xfeatures by their offsets to support out-of-order
+ * offsets in the uncompacted format.
+ */
+ sort(xfeature_uncompact_order, i, sizeof(unsigned int), compare_xstate_offsets, NULL);
+}
+
+/*
+ * Print out all the supported xstate features:
+ */
+static void __init print_xstate_features(void)
+{
+ int i;
+
+ for (i = 0; i < XFEATURE_MAX; i++) {
+ u64 mask = BIT_ULL(i);
+ const char *name;
+
+ if (cpu_has_xfeatures(mask, &name))
+ pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", mask, name);
+ }
+}
+
+/*
+ * This check is important because it is easy to get XSTATE_*
+ * confused with XSTATE_BIT_*.
+ */
+#define CHECK_XFEATURE(nr) do { \
+ WARN_ON(nr < FIRST_EXTENDED_XFEATURE); \
+ WARN_ON(nr >= XFEATURE_MAX); \
+} while (0)
+
+/*
+ * Print out xstate component offsets and sizes
+ */
+static void __init print_xstate_offset_size(void)
+{
+ int i;
+
+ for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
+ pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n",
+ i, xfeature_get_offset(fpu_kernel_cfg.max_features, i),
+ i, xstate_sizes[i]);
+ }
+}
+
+/*
+ * This function is called only during boot time when x86 caps are not set
+ * up and alternatives cannot be used yet.
+ */
+static __init void os_xrstor_booting(struct xregs_state *xstate)
+{
+ u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE;
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+ int err;
+
+ if (cpu_feature_enabled(X86_FEATURE_XSAVES))
+ XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
+ else
+ XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
+
+ /*
+ * We should never fault when copying from a kernel buffer, and the FPU
+ * state we set at boot time should be valid.
+ */
+ WARN_ON_FPU(err);
+}
+
+/*
+ * All supported features have either init state all zeros or are
+ * handled in setup_init_fpu_buf() individually. This is an explicit
+ * feature list and does not use XFEATURE_MASK*SUPPORTED to catch
+ * newly added supported features at build time and make people
+ * actually look at the init state for the new feature.
+ */
+#define XFEATURES_INIT_FPSTATE_HANDLED \
+ (XFEATURE_MASK_FP | \
+ XFEATURE_MASK_SSE | \
+ XFEATURE_MASK_YMM | \
+ XFEATURE_MASK_OPMASK | \
+ XFEATURE_MASK_ZMM_Hi256 | \
+ XFEATURE_MASK_Hi16_ZMM | \
+ XFEATURE_MASK_PKRU | \
+ XFEATURE_MASK_BNDREGS | \
+ XFEATURE_MASK_BNDCSR | \
+ XFEATURE_MASK_PASID | \
+ XFEATURE_MASK_CET_USER | \
+ XFEATURE_MASK_CET_KERNEL | \
+ XFEATURE_MASK_XTILE | \
+ XFEATURE_MASK_APX)
+
+/*
+ * setup the xstate image representing the init state
+ */
+static void __init setup_init_fpu_buf(void)
+{
+ BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED |
+ XFEATURE_MASK_SUPERVISOR_SUPPORTED) !=
+ XFEATURES_INIT_FPSTATE_HANDLED);
+
+ if (!boot_cpu_has(X86_FEATURE_XSAVE))
+ return;
+
+ print_xstate_features();
+
+ xstate_init_xcomp_bv(&init_fpstate.regs.xsave, init_fpstate.xfeatures);
+
+ /*
+ * Init all the features state with header.xfeatures being 0x0
+ */
+ os_xrstor_booting(&init_fpstate.regs.xsave);
+
+ /*
+ * All components are now in init state. Read the state back so
+ * that init_fpstate contains all non-zero init state. This only
+ * works with XSAVE, but not with XSAVEOPT and XSAVEC/S because
+ * those use the init optimization which skips writing data for
+ * components in init state.
+ *
+ * XSAVE could be used, but that would require reshuffling the
+ * data when XSAVEC/S is available because XSAVEC/S uses xstate
+ * compaction. But doing so is a pointless exercise because most
+ * components have an all zeros init state except for the legacy
+ * ones (FP and SSE). Those can be saved with FXSAVE into the
+ * legacy area. Adding new features requires ensuring that their
+ * init state is all zeroes, or adding the necessary handling
+ * here if it is not.
+ */
+ fxsave(&init_fpstate.regs.fxsave);
+}
+
+int xfeature_size(int xfeature_nr)
+{
+ u32 eax, ebx, ecx, edx;
+
+ CHECK_XFEATURE(xfeature_nr);
+ cpuid_count(CPUID_LEAF_XSTATE, xfeature_nr, &eax, &ebx, &ecx, &edx);
+ return eax;
+}
+
+/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
+static int validate_user_xstate_header(const struct xstate_header *hdr,
+ struct fpstate *fpstate)
+{
+ /* No unknown or supervisor features may be set */
+ if (hdr->xfeatures & ~fpstate->user_xfeatures)
+ return -EINVAL;
+
+ /* Userspace must use the uncompacted format */
+ if (hdr->xcomp_bv)
+ return -EINVAL;
+
+ /*
+ * If 'reserved' is shrunk to add a new field, make sure to validate
+ * that new field here!
+ */
+ BUILD_BUG_ON(sizeof(hdr->reserved) != 48);
+
+ /* No reserved bits may be set */
+ if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved)))
+ return -EINVAL;
+
+ return 0;
+}
+
+static void __init __xstate_dump_leaves(void)
+{
+ int i;
+ u32 eax, ebx, ecx, edx;
+ static int should_dump = 1;
+
+ if (!should_dump)
+ return;
+ should_dump = 0;
+ /*
+ * Dump out a few leaves past the ones that we support
+ * just in case there are some goodies up there
+ */
+ for (i = 0; i < XFEATURE_MAX + 10; i++) {
+ cpuid_count(CPUID_LEAF_XSTATE, i, &eax, &ebx, &ecx, &edx);
+ pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
+ CPUID_LEAF_XSTATE, i, eax, ebx, ecx, edx);
+ }
+}
+
+#define XSTATE_WARN_ON(x, fmt, ...) do { \
+ if (WARN_ONCE(x, "XSAVE consistency problem: " fmt, ##__VA_ARGS__)) { \
+ __xstate_dump_leaves(); \
+ } \
+} while (0)
+
+#define XCHECK_SZ(sz, nr, __struct) ({ \
+ if (WARN_ONCE(sz != sizeof(__struct), \
+ "[%s]: struct is %zu bytes, cpu state %d bytes\n", \
+ xfeature_names[nr], sizeof(__struct), sz)) { \
+ __xstate_dump_leaves(); \
+ } \
+ true; \
+})
+
+
+/**
+ * check_xtile_data_against_struct - Check tile data state size.
+ *
+ * Calculate the state size by multiplying the single tile size, which is
+ * recorded in a C struct, by the number of tiles that the CPU reports.
+ * Compare the provided size with the calculation.
+ *
+ * @size: The tile data state size
+ *
+ * Returns: 0 on success, -EINVAL on mismatch.
+ */
+static int __init check_xtile_data_against_struct(int size)
+{
+ u32 max_palid, palid, state_size;
+ u32 eax, ebx, ecx, edx;
+ u16 max_tile;
+
+ /*
+ * Check the maximum palette id:
+ * eax: the highest numbered palette subleaf.
+ */
+ cpuid_count(CPUID_LEAF_TILE, 0, &max_palid, &ebx, &ecx, &edx);
+
+ /*
+ * Cross-check each tile size and find the maximum number of
+ * supported tiles.
+ */
+ for (palid = 1, max_tile = 0; palid <= max_palid; palid++) {
+ u16 tile_size, max;
+
+ /*
+ * Check the tile size info:
+ * eax[31:16]: bytes per tile
+ * ebx[31:16]: the max names (or max number of tiles)
+ */
+ cpuid_count(CPUID_LEAF_TILE, palid, &eax, &ebx, &edx, &edx);
+ tile_size = eax >> 16;
+ max = ebx >> 16;
+
+ if (tile_size != sizeof(struct xtile_data)) {
+ pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n",
+ __stringify(XFEATURE_XTILE_DATA),
+ sizeof(struct xtile_data), tile_size);
+ __xstate_dump_leaves();
+ return -EINVAL;
+ }
+
+ if (max > max_tile)
+ max_tile = max;
+ }
+
+ state_size = sizeof(struct xtile_data) * max_tile;
+ if (size != state_size) {
+ pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n",
+ __stringify(XFEATURE_XTILE_DATA), state_size, size);
+ __xstate_dump_leaves();
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/*
+ * We have a C struct for each 'xstate'. We need to ensure
+ * that our software representation matches what the CPU
+ * tells us about the state's size.
+ */
+static bool __init check_xstate_against_struct(int nr)
+{
+ /*
+ * Ask the CPU for the size of the state.
+ */
+ int sz = xfeature_size(nr);
+
+ /*
+ * Match each CPU state with the corresponding software
+ * structure.
+ */
+ switch (nr) {
+ case XFEATURE_YMM: return XCHECK_SZ(sz, nr, struct ymmh_struct);
+ case XFEATURE_BNDREGS: return XCHECK_SZ(sz, nr, struct mpx_bndreg_state);
+ case XFEATURE_BNDCSR: return XCHECK_SZ(sz, nr, struct mpx_bndcsr_state);
+ case XFEATURE_OPMASK: return XCHECK_SZ(sz, nr, struct avx_512_opmask_state);
+ case XFEATURE_ZMM_Hi256: return XCHECK_SZ(sz, nr, struct avx_512_zmm_uppers_state);
+ case XFEATURE_Hi16_ZMM: return XCHECK_SZ(sz, nr, struct avx_512_hi16_state);
+ case XFEATURE_PKRU: return XCHECK_SZ(sz, nr, struct pkru_state);
+ case XFEATURE_PASID: return XCHECK_SZ(sz, nr, struct ia32_pasid_state);
+ case XFEATURE_XTILE_CFG: return XCHECK_SZ(sz, nr, struct xtile_cfg);
+ case XFEATURE_CET_USER: return XCHECK_SZ(sz, nr, struct cet_user_state);
+ case XFEATURE_CET_KERNEL: return XCHECK_SZ(sz, nr, struct cet_supervisor_state);
+ case XFEATURE_APX: return XCHECK_SZ(sz, nr, struct apx_state);
+ case XFEATURE_XTILE_DATA: check_xtile_data_against_struct(sz); return true;
+ default:
+ XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr);
+ return false;
+ }
+
+ return true;
+}
+
+static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted)
+{
+ unsigned int topmost = fls64(xfeatures) - 1;
+ unsigned int offset, i;
+
+ if (topmost <= XFEATURE_SSE)
+ return sizeof(struct xregs_state);
+
+ if (compacted) {
+ offset = xfeature_get_offset(xfeatures, topmost);
+ } else {
+ /* Walk through the xfeature order to pick the last */
+ for_each_extended_xfeature_in_order(i, xfeatures)
+ topmost = xfeature_uncompact_order[i];
+ offset = xstate_offsets[topmost];
+ }
+
+ return offset + xstate_sizes[topmost];
+}
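
As a hedged illustration of the non-compacted branch (offset of the highest enabled component plus its size), the userspace sketch below recomputes the standard-format buffer size from the per-component CPUID 0xD subleaves. It is not part of this patch, uses the CPU's supported user-feature bitmap rather than the kernel's enabled mask, and assumes GCC's <cpuid.h>; CPUID.0xD.0:ECX reports the maximum size directly and serves as a cross-check.

/* Userspace sketch: standard-format XSAVE size from CPUID leaf 0xD. */
#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx, i, max_size, end = 512 + 64;	/* legacy area + header */
	unsigned long long features;

	/* Subleaf 0: EAX/EDX = supported user features, ECX = max size. */
	__get_cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
	features = ((unsigned long long)edx << 32) | eax;
	max_size = ecx;

	for (i = 2; i < 63; i++) {
		if (!(features & (1ULL << i)))
			continue;
		/* Subleaf i: EAX = component size, EBX = standard-format offset. */
		__get_cpuid_count(0xd, i, &eax, &ebx, &ecx, &edx);
		if (ebx + eax > end)
			end = ebx + eax;
	}
	printf("standard-format size: %u bytes (CPUID max: %u)\n", end, max_size);
	return 0;
}
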
+
+/*
+ * This essentially double-checks what the cpu told us about
+ * how large the XSAVE buffer needs to be. We are recalculating
+ * it to be safe.
+ *
+ * Independent XSAVE features allocate their own buffers and are not
+ * covered by these checks. Only the size of the buffer for task->fpu
+ * is checked here.
+ */
+static bool __init paranoid_xstate_size_valid(unsigned int kernel_size)
+{
+ bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
+ bool xsaves = cpu_feature_enabled(X86_FEATURE_XSAVES);
+ unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+ int i;
+
+ for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
+ if (!check_xstate_against_struct(i))
+ return false;
+ /*
+ * Supervisor state components can be managed only by
+ * XSAVES.
+ */
+ if (!xsaves && xfeature_is_supervisor(i)) {
+ XSTATE_WARN_ON(1, "Got supervisor feature %d, but XSAVES not advertised\n", i);
+ return false;
+ }
+ }
+ size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted);
+ XSTATE_WARN_ON(size != kernel_size,
+ "size %u != kernel_size %u\n", size, kernel_size);
+ return size == kernel_size;
+}
+
+/*
+ * Get total size of enabled xstates in XCR0 | IA32_XSS.
+ *
+ * Note the SDM's wording here. "sub-function 0" only enumerates
+ * the size of the *user* states. If we use it to size a buffer
+ * that we use 'XSAVES' on, we could potentially overflow the
+ * buffer because 'XSAVES' saves system states too.
+ *
+ * This also takes compaction into account. So this works for
+ * XSAVEC as well.
+ */
+static unsigned int __init get_compacted_size(void)
+{
+ unsigned int eax, ebx, ecx, edx;
+ /*
+ * - CPUID function 0DH, sub-function 1:
+ * EBX enumerates the size (in bytes) required by
+ * the XSAVES instruction for an XSAVE area
+ * containing all the state components
+ * corresponding to bits currently set in
+ * XCR0 | IA32_XSS.
+ *
+ * When XSAVES is not available but XSAVEC is (virt), then there
+ * are no supervisor states, but XSAVEC still uses compacted
+ * format.
+ */
+ cpuid_count(CPUID_LEAF_XSTATE, 1, &eax, &ebx, &ecx, &edx);
+ return ebx;
+}
+
+/*
+ * Get the total size of the enabled xstates without the independent supervisor
+ * features.
+ */
+static unsigned int __init get_xsave_compacted_size(void)
+{
+ u64 mask = xfeatures_mask_independent();
+ unsigned int size;
+
+ if (!mask)
+ return get_compacted_size();
+
+ /* Disable independent features. */
+ wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor());
+
+ /*
+ * Ask the hardware what size is required of the buffer.
+ * This is the size required for the task->fpu buffer.
+ */
+ size = get_compacted_size();
+
+ /* Re-enable independent features so XSAVES will work on them again. */
+ wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask);
+
+ return size;
+}
+
+static unsigned int __init get_xsave_size_user(void)
+{
+ unsigned int eax, ebx, ecx, edx;
+ /*
+ * - CPUID function 0DH, sub-function 0:
+ * EBX enumerates the size (in bytes) required by
+ * the XSAVE instruction for an XSAVE area
+ * containing all the *user* state components
+ * corresponding to bits currently set in XCR0.
+ */
+ cpuid_count(CPUID_LEAF_XSTATE, 0, &eax, &ebx, &ecx, &edx);
+ return ebx;
+}
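
The difference between the two enumerations is easy to observe from userspace; the sketch below (illustrative only, assuming GCC's <cpuid.h> and an XSAVE-capable CPU) prints CPUID.0xD.0:EBX next to CPUID.0xD.1:EBX.

/* Userspace sketch: user (XCR0) size vs. compacted (XCR0 | IA32_XSS) size. */
#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx_user, ebx_comp, ecx, edx;

	__get_cpuid_count(0xd, 0, &eax, &ebx_user, &ecx, &edx);
	__get_cpuid_count(0xd, 1, &eax, &ebx_comp, &ecx, &edx);

	printf("XSAVE  size for XCR0            : %u bytes\n", ebx_user);
	printf("XSAVES size for XCR0 | IA32_XSS : %u bytes\n", ebx_comp);
	return 0;
}
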
+
+static int __init init_xstate_size(void)
+{
+ /* Recompute the context size for enabled features: */
+ unsigned int user_size, kernel_size, kernel_default_size;
+ bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
+
+ /* Uncompacted user space size */
+ user_size = get_xsave_size_user();
+
+ /*
+ * XSAVES kernel size includes supervisor states and uses compacted
+ * format. XSAVEC uses compacted format, but does not save
+ * supervisor states.
+ *
+ * XSAVE[OPT] do not support supervisor states so kernel and user
+ * size is identical.
+ */
+ if (compacted)
+ kernel_size = get_xsave_compacted_size();
+ else
+ kernel_size = user_size;
+
+ kernel_default_size =
+ xstate_calculate_size(fpu_kernel_cfg.default_features, compacted);
+
+ if (!paranoid_xstate_size_valid(kernel_size))
+ return -EINVAL;
+
+ fpu_kernel_cfg.max_size = kernel_size;
+ fpu_user_cfg.max_size = user_size;
+
+ fpu_kernel_cfg.default_size = kernel_default_size;
+ fpu_user_cfg.default_size =
+ xstate_calculate_size(fpu_user_cfg.default_features, false);
+
+ guest_default_cfg.size =
+ xstate_calculate_size(guest_default_cfg.features, compacted);
+
+ return 0;
+}
+
+/*
+ * We enabled the XSAVE hardware, but something went wrong and
+ * we can not use it. Disable it.
+ */
+static void __init fpu__init_disable_system_xstate(unsigned int legacy_size)
+{
+ pr_info("x86/fpu: XSAVE disabled\n");
+
+ fpu_kernel_cfg.max_features = 0;
+ cr4_clear_bits(X86_CR4_OSXSAVE);
+ setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+
+ /* Restore the legacy size.*/
+ fpu_kernel_cfg.max_size = legacy_size;
+ fpu_kernel_cfg.default_size = legacy_size;
+ fpu_user_cfg.max_size = legacy_size;
+ fpu_user_cfg.default_size = legacy_size;
+ guest_default_cfg.size = legacy_size;
+
+ /*
+ * Prevent enabling the static branch which enables writes to the
+ * XFD MSR.
+ */
+ init_fpstate.xfd = 0;
+
+ fpstate_reset(x86_task_fpu(current));
+}
+
+static u64 __init host_default_mask(void)
+{
+ /*
+ * Exclude dynamic features (require userspace opt-in) and features
+ * that are supported only for KVM guests.
+ */
+ return ~((u64)XFEATURE_MASK_USER_DYNAMIC | XFEATURE_MASK_GUEST_SUPERVISOR);
+}
+
+static u64 __init guest_default_mask(void)
+{
+ /*
+ * Exclude dynamic features, which require userspace opt-in even
+ * for KVM guests.
+ */
+ return ~(u64)XFEATURE_MASK_USER_DYNAMIC;
+}
+
+/*
+ * Enable and initialize the xsave feature.
+ * Called once per system bootup.
+ */
+void __init fpu__init_system_xstate(unsigned int legacy_size)
+{
+ unsigned int eax, ebx, ecx, edx;
+ u64 xfeatures;
+ int err;
+ int i;
+
+ if (!boot_cpu_has(X86_FEATURE_FPU)) {
+ pr_info("x86/fpu: No FPU detected\n");
+ return;
+ }
+
+ if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
+ pr_info("x86/fpu: x87 FPU will use %s\n",
+ boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE");
+ return;
+ }
+
+ /*
+ * Find user xstates supported by the processor.
+ */
+ cpuid_count(CPUID_LEAF_XSTATE, 0, &eax, &ebx, &ecx, &edx);
+ fpu_kernel_cfg.max_features = eax + ((u64)edx << 32);
+
+ /*
+ * Find supervisor xstates supported by the processor.
+ */
+ cpuid_count(CPUID_LEAF_XSTATE, 1, &eax, &ebx, &ecx, &edx);
+ fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32);
+
+ if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
+ /*
+ * This indicates that something really unexpected happened
+ * with the enumeration. Disable XSAVE and try to continue
+ * booting without it. This is too early to BUG().
+ */
+ pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n",
+ fpu_kernel_cfg.max_features);
+ goto out_disable;
+ }
+
+ if (fpu_kernel_cfg.max_features & XFEATURE_MASK_APX &&
+ fpu_kernel_cfg.max_features & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)) {
+ /*
+ * This is a problematic CPU configuration where two
+ * conflicting state components are both enumerated.
+ */
+ pr_err("x86/fpu: Both APX/MPX present in the CPU's xstate features: 0x%llx.\n",
+ fpu_kernel_cfg.max_features);
+ goto out_disable;
+ }
+
+ fpu_kernel_cfg.independent_features = fpu_kernel_cfg.max_features &
+ XFEATURE_MASK_INDEPENDENT;
+
+ /*
+ * Clear XSAVE features that are disabled in the normal CPUID.
+ */
+ for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
+ unsigned short cid = xsave_cpuid_features[i];
+
+ /* Careful: X86_FEATURE_FPU is 0! */
+ if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid))
+ fpu_kernel_cfg.max_features &= ~BIT_ULL(i);
+ }
+
+ if (!cpu_feature_enabled(X86_FEATURE_XFD))
+ fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC;
+
+ if (!cpu_feature_enabled(X86_FEATURE_XSAVES))
+ fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
+ else
+ fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED |
+ XFEATURE_MASK_SUPERVISOR_SUPPORTED;
+
+ fpu_user_cfg.max_features = fpu_kernel_cfg.max_features;
+ fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
+
+ /*
+ * Now, given maximum feature set, determine default values by
+ * applying default masks.
+ */
+ fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features & host_default_mask();
+ fpu_user_cfg.default_features = fpu_user_cfg.max_features & host_default_mask();
+ guest_default_cfg.features = fpu_kernel_cfg.max_features & guest_default_mask();
+
+ /* Store it for paranoia check at the end */
+ xfeatures = fpu_kernel_cfg.max_features;
+
+ /*
+ * Initialize the default XFD state in initfp_state and enable the
+ * dynamic sizing mechanism if dynamic states are available. The
+ * static key cannot be enabled here because this runs before
+ * jump_label_init(). This is delayed to an initcall.
+ */
+ init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC;
+
+ /* Set up compaction feature bit */
+ if (cpu_feature_enabled(X86_FEATURE_XSAVEC) ||
+ cpu_feature_enabled(X86_FEATURE_XSAVES))
+ setup_force_cpu_cap(X86_FEATURE_XCOMPACTED);
+
+ /* Enable xstate instructions to be able to continue with initialization: */
+ fpu__init_cpu_xstate();
+
+ /* Cache size, offset and flags for initialization */
+ setup_xstate_cache();
+
+ err = init_xstate_size();
+ if (err)
+ goto out_disable;
+
+ /*
+ * Update info used for ptrace frames; use standard-format size and no
+ * supervisor xstates:
+ */
+ update_regset_xstate_info(fpu_user_cfg.max_size,
+ fpu_user_cfg.max_features);
+
+ /*
+ * init_fpstate excludes dynamic states as they are large but init
+ * state is zero.
+ */
+ init_fpstate.size = fpu_kernel_cfg.default_size;
+ init_fpstate.xfeatures = fpu_kernel_cfg.default_features;
+
+ if (init_fpstate.size > sizeof(init_fpstate.regs)) {
+ pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d)\n",
+ sizeof(init_fpstate.regs), init_fpstate.size);
+ goto out_disable;
+ }
+
+ setup_init_fpu_buf();
+
+ /*
+ * Paranoia check whether something in the setup modified the
+ * xfeatures mask.
+ */
+ if (xfeatures != fpu_kernel_cfg.max_features) {
+ pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init\n",
+ xfeatures, fpu_kernel_cfg.max_features);
+ goto out_disable;
+ }
+
+ /*
+ * CPU capabilities initialization runs before FPU init. So
+ * X86_FEATURE_OSXSAVE is not set. Now that XSAVE is completely
+ * functional, set the feature bit so that dependent code works.
+ */
+ setup_force_cpu_cap(X86_FEATURE_OSXSAVE);
+
+ print_xstate_offset_size();
+ pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
+ fpu_kernel_cfg.max_features,
+ fpu_kernel_cfg.max_size,
+ boot_cpu_has(X86_FEATURE_XCOMPACTED) ? "compacted" : "standard");
+ return;
+
+out_disable:
+ /* something went wrong, try to boot without any XSAVE support */
+ fpu__init_disable_system_xstate(legacy_size);
+}
+
+/*
+ * Restore minimal FPU state after suspend:
+ */
+void fpu__resume_cpu(void)
+{
+ /*
+ * Restore XCR0 on xsave capable CPUs:
+ */
+ if (cpu_feature_enabled(X86_FEATURE_XSAVE))
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);
+
+ /*
+ * Restore IA32_XSS. The same CPUID bit enumerates support
+ * of XSAVES and MSR_IA32_XSS.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_XSAVES)) {
+ wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor() |
+ xfeatures_mask_independent());
+ }
+
+ if (fpu_state_size_dynamic())
+ wrmsrq(MSR_IA32_XFD, x86_task_fpu(current)->fpstate->xfd);
+}
+
+/*
+ * Given an xstate feature nr, calculate where in the xsave
+ * buffer the state is. Callers should ensure that the buffer
+ * is valid.
+ */
+static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
+{
+ u64 xcomp_bv = xsave->header.xcomp_bv;
+
+ if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
+ return NULL;
+
+ if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) {
+ if (WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr))))
+ return NULL;
+ }
+
+ return (void *)xsave + xfeature_get_offset(xcomp_bv, xfeature_nr);
+}
+
+/*
+ * Given the xsave area and a state inside, this function returns the
+ * address of the state.
+ *
+ * This is the API that is called to get xstate address in either
+ * standard format or compacted format of xsave area.
+ *
+ * Note that if there is no data for the field in the xsave buffer
+ * this will return NULL.
+ *
+ * Inputs:
+ * xstate: the thread's storage area for all FPU data
+ * xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP,
+ * XFEATURE_SSE, etc...)
+ * Output:
+ * address of the state in the xsave area, or NULL if the
+ * field is not present in the xsave buffer.
+ */
+void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
+{
+ /*
+ * Do we even *have* xsave state?
+ */
+ if (!boot_cpu_has(X86_FEATURE_XSAVE))
+ return NULL;
+
+ /*
+ * We should not ever be requesting features that we
+ * have not enabled.
+ */
+ if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
+ return NULL;
+
+ /*
+ * This assumes the last 'xsave*' instruction to
+ * have requested that 'xfeature_nr' be saved.
+ * If it did not, we might be seeing an old value
+ * of the field in the buffer.
+ *
+ * This can happen because the last 'xsave' did not
+ * request that this feature be saved (unlikely)
+ * or because the "init optimization" caused it
+ * to not be saved.
+ */
+ if (!(xsave->header.xfeatures & BIT_ULL(xfeature_nr)))
+ return NULL;
+
+ return __raw_xsave_addr(xsave, xfeature_nr);
+}
+EXPORT_SYMBOL_FOR_KVM(get_xsave_addr);
+
+/*
+ * Given an xstate feature nr, calculate where in the xsave buffer the state is.
+ * The xsave buffer should be in standard format, not compacted (e.g. user mode
+ * signal frames).
+ */
+void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr)
+{
+ if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
+ return NULL;
+
+ return (void __user *)xsave + xstate_offsets[xfeature_nr];
+}
+
+#ifdef CONFIG_ARCH_HAS_PKEYS
+
+/*
+ * This will go out and modify PKRU register to set the access
+ * rights for @pkey to @init_val.
+ */
+int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
+ unsigned long init_val)
+{
+ u32 old_pkru, new_pkru_bits = 0;
+ int pkey_shift;
+
+ /*
+ * This check implies XSAVE support. OSPKE only gets
+ * set if we enable XSAVE and we enable PKU in XCR0.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
+ return -EINVAL;
+
+ /*
+ * This code should only be called with valid 'pkey'
+ * values originating from in-kernel users. Complain
+ * if a bad value is observed.
+ */
+ if (WARN_ON_ONCE(pkey >= arch_max_pkey()))
+ return -EINVAL;
+
+ /* Set the bits we need in PKRU: */
+ if (init_val & PKEY_DISABLE_ACCESS)
+ new_pkru_bits |= PKRU_AD_BIT;
+ if (init_val & PKEY_DISABLE_WRITE)
+ new_pkru_bits |= PKRU_WD_BIT;
+
+ /* Shift the bits in to the correct place in PKRU for pkey: */
+ pkey_shift = pkey * PKRU_BITS_PER_PKEY;
+ new_pkru_bits <<= pkey_shift;
+
+ /* Get old PKRU and mask off any old bits in place: */
+ old_pkru = read_pkru();
+ old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);
+
+ /* Write old part along with new part: */
+ write_pkru(old_pkru | new_pkru_bits);
+
+ return 0;
+}
+#endif /* CONFIG_ARCH_HAS_PKEYS */
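
The PKRU bit layout used above (two bits per key, AD in bit 0 and WD in bit 1 of each pair) can be shown with plain integer arithmetic; the snippet below is a standalone userspace sketch, not kernel code.

/* Userspace sketch of the PKRU update: clear the key's two bits, then merge the new ones. */
#include <stdio.h>
#include <stdint.h>

#define PKRU_AD_BIT		0x1u
#define PKRU_WD_BIT		0x2u
#define PKRU_BITS_PER_PKEY	2

static uint32_t set_pkey_rights(uint32_t pkru, int pkey, int disable_access, int disable_write)
{
	int shift = pkey * PKRU_BITS_PER_PKEY;
	uint32_t bits = 0;

	if (disable_access)
		bits |= PKRU_AD_BIT;
	if (disable_write)
		bits |= PKRU_WD_BIT;

	pkru &= ~((PKRU_AD_BIT | PKRU_WD_BIT) << shift);	/* mask off the old bits */
	return pkru | (bits << shift);				/* merge in the new bits */
}

int main(void)
{
	/* Write-disable key 5: the WD bit lands at bit 11, i.e. 0x800. */
	printf("0x%x\n", set_pkey_rights(0, 5, 0, 1));
	return 0;
}
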
+
+static void copy_feature(bool from_xstate, struct membuf *to, void *xstate,
+ void *init_xstate, unsigned int size)
+{
+ membuf_write(to, from_xstate ? xstate : init_xstate, size);
+}
+
+/**
+ * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
+ * @to: membuf descriptor
+ * @fpstate: The fpstate buffer from which to copy
+ * @xfeatures: The mask of xfeatures to save (XSAVE mode only)
+ * @pkru_val: The PKRU value to store in the PKRU component
+ * @copy_mode: The requested copy mode
+ *
+ * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
+ * format, i.e. from the kernel internal hardware dependent storage format
+ * to the requested @copy_mode. UABI XSTATE is always uncompacted!
+ *
+ * It supports partial copy but @to.pos always starts from zero.
+ */
+void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
+ u64 xfeatures, u32 pkru_val,
+ enum xstate_copy_mode copy_mode)
+{
+ const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr);
+ struct xregs_state *xinit = &init_fpstate.regs.xsave;
+ struct xregs_state *xsave = &fpstate->regs.xsave;
+ unsigned int zerofrom, i, xfeature;
+ struct xstate_header header;
+ u64 mask;
+
+ memset(&header, 0, sizeof(header));
+ header.xfeatures = xsave->header.xfeatures;
+
+ /* Mask out the feature bits depending on copy mode */
+ switch (copy_mode) {
+ case XSTATE_COPY_FP:
+ header.xfeatures &= XFEATURE_MASK_FP;
+ break;
+
+ case XSTATE_COPY_FX:
+ header.xfeatures &= XFEATURE_MASK_FP | XFEATURE_MASK_SSE;
+ break;
+
+ case XSTATE_COPY_XSAVE:
+ header.xfeatures &= fpstate->user_xfeatures & xfeatures;
+ break;
+ }
+
+ /* Copy FP state up to MXCSR */
+ copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387,
+ &xinit->i387, off_mxcsr);
+
+ /* Copy MXCSR when SSE or YMM are set in the feature mask */
+ copy_feature(header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM),
+ &to, &xsave->i387.mxcsr, &xinit->i387.mxcsr,
+ MXCSR_AND_FLAGS_SIZE);
+
+ /* Copy the remaining FP state */
+ copy_feature(header.xfeatures & XFEATURE_MASK_FP,
+ &to, &xsave->i387.st_space, &xinit->i387.st_space,
+ sizeof(xsave->i387.st_space));
+
+ /* Copy the SSE state - shared with YMM, but independently managed */
+ copy_feature(header.xfeatures & XFEATURE_MASK_SSE,
+ &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space,
+ sizeof(xsave->i387.xmm_space));
+
+ if (copy_mode != XSTATE_COPY_XSAVE)
+ goto out;
+
+ /* Zero the padding area */
+ membuf_zero(&to, sizeof(xsave->i387.padding));
+
+ /* Copy xsave->i387.sw_reserved */
+ membuf_write(&to, xstate_fx_sw_bytes, sizeof(xsave->i387.sw_reserved));
+
+ /* Copy the user space relevant state of @xsave->header */
+ membuf_write(&to, &header, sizeof(header));
+
+ zerofrom = offsetof(struct xregs_state, extended_state_area);
+
+ /*
+ * This 'mask' indicates which states to copy from fpstate.
+ * Those extended states that are not present in fpstate are
+ * either disabled or initialized:
+ *
+ * In non-compacted format, disabled features still occupy
+ * state space but there is no state to copy from in the
+ * compacted init_fpstate. The gap tracking will zero these
+ * states.
+ *
+ * The extended features have an all zeroes init state. Thus,
+ * remove them from 'mask' to zero those features in the user
+ * buffer instead of retrieving them from init_fpstate.
+ */
+ mask = header.xfeatures;
+
+ for_each_extended_xfeature_in_order(i, mask) {
+ xfeature = xfeature_uncompact_order[i];
+ /*
+ * If there was a feature or alignment gap, zero the space
+ * in the destination buffer.
+ */
+ if (zerofrom < xstate_offsets[xfeature])
+ membuf_zero(&to, xstate_offsets[xfeature] - zerofrom);
+
+ if (xfeature == XFEATURE_PKRU) {
+ struct pkru_state pkru = {0};
+ /*
+ * PKRU is not necessarily up to date in the
+ * XSAVE buffer. Use the provided value.
+ */
+ pkru.pkru = pkru_val;
+ membuf_write(&to, &pkru, sizeof(pkru));
+ } else {
+ membuf_write(&to,
+ __raw_xsave_addr(xsave, xfeature),
+ xstate_sizes[xfeature]);
+ }
+ /*
+ * Keep track of the last copied state in the non-compacted
+ * target buffer for gap zeroing.
+ */
+ zerofrom = xstate_offsets[xfeature] + xstate_sizes[xfeature];
+ }
+
+out:
+ if (to.left)
+ membuf_zero(&to, to.left);
+}
+
+/**
+ * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
+ * @to: membuf descriptor
+ * @tsk: The task from which to copy the saved xstate
+ * @copy_mode: The requested copy mode
+ *
+ * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
+ * format, i.e. from the kernel internal hardware dependent storage format
+ * to the requested @copy_mode. UABI XSTATE is always uncompacted!
+ *
+ * It supports partial copy but @to.pos always starts from zero.
+ */
+void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
+ enum xstate_copy_mode copy_mode)
+{
+ __copy_xstate_to_uabi_buf(to, x86_task_fpu(tsk)->fpstate,
+ x86_task_fpu(tsk)->fpstate->user_xfeatures,
+ tsk->thread.pkru, copy_mode);
+}
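
This function backs the NT_X86_XSTATE regset, so the UABI buffer it produces is what a debugger sees via PTRACE_GETREGSET. The fragment below is a hedged userspace sketch (error handling elided, buffer size is an assumption) showing how a tracer could fetch it from an already-stopped tracee.

/* Userspace sketch: read a stopped tracee's UABI xstate buffer. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <elf.h>

int dump_xstate_bv(pid_t pid)
{
	static uint8_t buf[16384];	/* assumed large enough for current CPUs */
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	uint64_t xstate_bv;

	if (ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov) != 0)
		return -1;

	/* The XSAVE header starts at offset 512; XSTATE_BV is its first u64. */
	memcpy(&xstate_bv, buf + 512, sizeof(xstate_bv));
	printf("xstate size %zu, XSTATE_BV 0x%llx\n", iov.iov_len,
	       (unsigned long long)xstate_bv);
	return 0;
}
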
+
+static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size,
+ const void *kbuf, const void __user *ubuf)
+{
+ if (kbuf) {
+ memcpy(dst, kbuf + offset, size);
+ } else {
+ if (copy_from_user(dst, ubuf + offset, size))
+ return -EFAULT;
+ }
+ return 0;
+}
+
+
+/**
+ * copy_uabi_to_xstate - Copy a UABI format buffer to the kernel xstate
+ * @fpstate: The fpstate buffer to copy to
+ * @kbuf: The UABI format buffer, if it comes from the kernel
+ * @ubuf: The UABI format buffer, if it comes from userspace
+ * @pkru: The location to write the PKRU value to
+ *
+ * Converts from the UABI format into the kernel internal hardware
+ * dependent format.
+ *
+ * This function ultimately has three different callers with distinct PKRU
+ * behavior.
+ * 1. When called from sigreturn the PKRU register will be restored from
+ * @fpstate via an XRSTOR. Correctly copying the UABI format buffer to
+ * @fpstate is sufficient to cover this case, but the caller will also
+ * pass a pointer to the thread_struct's pkru field in @pkru and updating
+ * it is harmless.
+ * 2. When called from ptrace the PKRU register will be restored from the
+ * thread_struct's pkru field. A pointer to that is passed in @pkru.
+ * The kernel will restore it manually, so the XRSTOR behavior that resets
+ * the PKRU register to the hardware init value (0) if the corresponding
+ * xfeatures bit is not set is emulated here.
+ * 3. When called from KVM the PKRU register will be restored from the vcpu's
+ * pkru field. A pointer to that is passed in @pkru. KVM hasn't used
+ * XRSTOR and hasn't had the PKRU resetting behavior described above. To
+ * preserve that KVM behavior, it passes NULL for @pkru if the xfeatures
+ * bit is not set.
+ */
+static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
+ const void __user *ubuf, u32 *pkru)
+{
+ struct xregs_state *xsave = &fpstate->regs.xsave;
+ unsigned int offset, size;
+ struct xstate_header hdr;
+ u64 mask;
+ int i;
+
+ offset = offsetof(struct xregs_state, header);
+ if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf))
+ return -EFAULT;
+
+ if (validate_user_xstate_header(&hdr, fpstate))
+ return -EINVAL;
+
+ /* Validate MXCSR when any of the related features is in use */
+ mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM;
+ if (hdr.xfeatures & mask) {
+ u32 mxcsr[2];
+
+ offset = offsetof(struct fxregs_state, mxcsr);
+ if (copy_from_buffer(mxcsr, offset, sizeof(mxcsr), kbuf, ubuf))
+ return -EFAULT;
+
+ /* Reserved bits in MXCSR must be zero. */
+ if (mxcsr[0] & ~mxcsr_feature_mask)
+ return -EINVAL;
+
+ /* SSE and YMM require MXCSR even when FP is not in use. */
+ if (!(hdr.xfeatures & XFEATURE_MASK_FP)) {
+ xsave->i387.mxcsr = mxcsr[0];
+ xsave->i387.mxcsr_mask = mxcsr[1];
+ }
+ }
+
+ for (i = 0; i < XFEATURE_MAX; i++) {
+ mask = BIT_ULL(i);
+
+ if (hdr.xfeatures & mask) {
+ void *dst = __raw_xsave_addr(xsave, i);
+
+ offset = xstate_offsets[i];
+ size = xstate_sizes[i];
+
+ if (copy_from_buffer(dst, offset, size, kbuf, ubuf))
+ return -EFAULT;
+ }
+ }
+
+ if (hdr.xfeatures & XFEATURE_MASK_PKRU) {
+ struct pkru_state *xpkru;
+
+ xpkru = __raw_xsave_addr(xsave, XFEATURE_PKRU);
+ *pkru = xpkru->pkru;
+ } else {
+ /*
+ * KVM may pass NULL here to indicate that it does not need
+ * PKRU updated.
+ */
+ if (pkru)
+ *pkru = 0;
+ }
+
+ /*
+ * The state that came in from userspace was user-state only.
+ * Mask all the user states out of 'xfeatures':
+ */
+ xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL;
+
+ /*
+ * Add back in the features that came in from userspace:
+ */
+ xsave->header.xfeatures |= hdr.xfeatures;
+
+ return 0;
+}
+
+/*
+ * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S]
+ * format and copy to the target thread. Used by ptrace and KVM.
+ */
+int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru)
+{
+ return copy_uabi_to_xstate(fpstate, kbuf, NULL, pkru);
+}
+
+/*
+ * Convert from a sigreturn standard-format user-space buffer to kernel
+ * XSAVE[S] format and copy to the target thread. This is called from the
+ * sigreturn() and rt_sigreturn() system calls.
+ */
+int copy_sigframe_from_user_to_xstate(struct task_struct *tsk,
+ const void __user *ubuf)
+{
+ return copy_uabi_to_xstate(x86_task_fpu(tsk)->fpstate, NULL, ubuf, &tsk->thread.pkru);
+}
+
+static bool validate_independent_components(u64 mask)
+{
+ u64 xchk;
+
+ if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES)))
+ return false;
+
+ xchk = ~xfeatures_mask_independent();
+
+ if (WARN_ON_ONCE(!mask || mask & xchk))
+ return false;
+
+ return true;
+}
+
+/**
+ * xsaves - Save selected components to a kernel xstate buffer
+ * @xstate: Pointer to the buffer
+ * @mask: Feature mask to select the components to save
+ *
+ * The @xstate buffer must be 64 byte aligned and correctly initialized as
+ * XSAVES does not write the full xstate header. Before first use the
+ * buffer should be zeroed otherwise a consecutive XRSTORS from that buffer
+ * can #GP.
+ *
+ * The feature mask must be a subset of the independent features.
+ */
+void xsaves(struct xregs_state *xstate, u64 mask)
+{
+ int err;
+
+ if (!validate_independent_components(mask))
+ return;
+
+ XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err);
+ WARN_ON_ONCE(err);
+}
+
+/**
+ * xrstors - Restore selected components from a kernel xstate buffer
+ * @xstate: Pointer to the buffer
+ * @mask: Feature mask to select the components to restore
+ *
+ * The @xstate buffer must be 64 byte aligned and correctly initialized
+ * otherwise XRSTORS from that buffer can #GP.
+ *
+ * Proper usage is to restore the state which was saved with
+ * xsaves() into @xstate.
+ *
+ * The feature mask must be a subset of the independent features.
+ */
+void xrstors(struct xregs_state *xstate, u64 mask)
+{
+ int err;
+
+ if (!validate_independent_components(mask))
+ return;
+
+ XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err);
+ WARN_ON_ONCE(err);
+}
+
+#if IS_ENABLED(CONFIG_KVM)
+void fpstate_clear_xstate_component(struct fpstate *fpstate, unsigned int xfeature)
+{
+ void *addr = get_xsave_addr(&fpstate->regs.xsave, xfeature);
+
+ if (addr)
+ memset(addr, 0, xstate_sizes[xfeature]);
+}
+EXPORT_SYMBOL_FOR_KVM(fpstate_clear_xstate_component);
+#endif
+
+#ifdef CONFIG_X86_64
+
+#ifdef CONFIG_X86_DEBUG_FPU
+/*
+ * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask
+ * can safely operate on the @fpstate buffer.
+ */
+static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor)
+{
+ u64 xfd = __this_cpu_read(xfd_state);
+
+ if (fpstate->xfd == xfd)
+ return true;
+
+ /*
+ * The XFD MSR does not match fpstate->xfd. That's invalid when
+ * the passed in fpstate is current's fpstate.
+ */
+ if (fpstate->xfd == x86_task_fpu(current)->fpstate->xfd)
+ return false;
+
+ /*
+ * XRSTOR(S) from init_fpstate are always correct as it will just
+ * bring all components into init state and not read from the
+ * buffer. XSAVE(S) raises #PF after init.
+ */
+ if (fpstate == &init_fpstate)
+ return rstor;
+
+ /*
+ * XSAVE(S): clone(), fpu_swap_kvm_fpstate()
+ * XRSTORS(S): fpu_swap_kvm_fpstate()
+ */
+
+ /*
+ * No XSAVE/XRSTOR instructions (except XSAVE itself) touch
+ * the buffer area for XFD-disabled state components.
+ */
+ mask &= ~xfd;
+
+ /*
+ * Remove features which are valid in fpstate. They
+ * have space allocated in fpstate.
+ */
+ mask &= ~fpstate->xfeatures;
+
+ /*
+ * Any remaining state components in 'mask' might be written
+ * by XSAVE/XRSTOR. Fail validation if any are found.
+ */
+ return !mask;
+}
+
+void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor)
+{
+ WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor));
+}
+#endif /* CONFIG_X86_DEBUG_FPU */
+
+static int __init xfd_update_static_branch(void)
+{
+ /*
+ * If init_fpstate.xfd has bits set then dynamic features are
+ * available and the dynamic sizing must be enabled.
+ */
+ if (init_fpstate.xfd)
+ static_branch_enable(&__fpu_state_size_dynamic);
+ return 0;
+}
+arch_initcall(xfd_update_static_branch)
+
+void fpstate_free(struct fpu *fpu)
+{
+ if (fpu->fpstate && fpu->fpstate != &fpu->__fpstate)
+ vfree(fpu->fpstate);
+}
+
+/**
+ * fpstate_realloc - Reallocate struct fpstate for the requested new features
+ *
+ * @xfeatures: A bitmap of xstate features which extend the enabled features
+ * of that task
+ * @ksize: The required size for the kernel buffer
+ * @usize: The required size for user space buffers
+ * @guest_fpu: Pointer to a guest FPU container. NULL for host allocations
+ *
+ * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer
+ * terminates quickly, vfree()-induced IPIs may be a concern, but tasks
+ * with large states are likely to live longer.
+ *
+ * Returns: 0 on success, -ENOMEM on allocation error.
+ */
+static int fpstate_realloc(u64 xfeatures, unsigned int ksize,
+ unsigned int usize, struct fpu_guest *guest_fpu)
+{
+ struct fpu *fpu = x86_task_fpu(current);
+ struct fpstate *curfps, *newfps = NULL;
+ unsigned int fpsize;
+ bool in_use;
+
+ fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64);
+
+ newfps = vzalloc(fpsize);
+ if (!newfps)
+ return -ENOMEM;
+ newfps->size = ksize;
+ newfps->user_size = usize;
+ newfps->is_valloc = true;
+
+ /*
+ * When a guest FPU is supplied, use @guest_fpu->fpstate as the
+ * reference, independent of whether it is in use or not.
+ */
+ curfps = guest_fpu ? guest_fpu->fpstate : fpu->fpstate;
+
+ /* Determine whether @curfps is the active fpstate */
+ in_use = fpu->fpstate == curfps;
+
+ if (guest_fpu) {
+ newfps->is_guest = true;
+ newfps->is_confidential = curfps->is_confidential;
+ newfps->in_use = curfps->in_use;
+ guest_fpu->xfeatures |= xfeatures;
+ guest_fpu->uabi_size = usize;
+ }
+
+ fpregs_lock();
+ /*
+ * If @curfps is in use, ensure that the current state is in the
+ * registers before swapping fpstate as that might invalidate it
+ * due to layout changes.
+ */
+ if (in_use && test_thread_flag(TIF_NEED_FPU_LOAD))
+ fpregs_restore_userregs();
+
+ newfps->xfeatures = curfps->xfeatures | xfeatures;
+ newfps->user_xfeatures = curfps->user_xfeatures | xfeatures;
+ newfps->xfd = curfps->xfd & ~xfeatures;
+
+ /* Do the final updates within the locked region */
+ xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures);
+
+ if (guest_fpu) {
+ guest_fpu->fpstate = newfps;
+ /* If curfps is active, update the FPU fpstate pointer */
+ if (in_use)
+ fpu->fpstate = newfps;
+ } else {
+ fpu->fpstate = newfps;
+ }
+
+ if (in_use)
+ xfd_update_state(fpu->fpstate);
+ fpregs_unlock();
+
+ /* Only free valloc'ed state */
+ if (curfps && curfps->is_valloc)
+ vfree(curfps);
+
+ return 0;
+}
+
+static int validate_sigaltstack(unsigned int usize)
+{
+ struct task_struct *thread, *leader = current->group_leader;
+ unsigned long framesize = get_sigframe_size();
+
+ lockdep_assert_held(&current->sighand->siglock);
+
+ /* get_sigframe_size() is based on fpu_user_cfg.max_size */
+ framesize -= fpu_user_cfg.max_size;
+ framesize += usize;
+ for_each_thread(leader, thread) {
+ if (thread->sas_ss_size && thread->sas_ss_size < framesize)
+ return -ENOSPC;
+ }
+ return 0;
+}
+
+static int __xstate_request_perm(u64 permitted, u64 requested, bool guest)
+{
+ /*
+ * This deliberately does not exclude !XSAVES as we still might
+ * decide to optionally context switch XCR0 or talk the silicon
+ * vendors into extending XFD for the pre AMX states, especially
+ * AVX512.
+ */
+ bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
+ struct fpu *fpu = x86_task_fpu(current->group_leader);
+ struct fpu_state_perm *perm;
+ unsigned int ksize, usize;
+ u64 mask;
+ int ret = 0;
+
+ /* Check whether fully enabled */
+ if ((permitted & requested) == requested)
+ return 0;
+
+ /*
+ * Calculate the resulting kernel state size. Note, @permitted also
+ * contains supervisor xfeatures even though supervisor are always
+ * permitted for kernel and guest FPUs, and never permitted for user
+ * FPUs.
+ */
+ mask = permitted | requested;
+ ksize = xstate_calculate_size(mask, compacted);
+
+ /*
+ * Calculate the resulting user state size. Take care not to clobber
+ * the supervisor xfeatures in the new mask!
+ */
+ usize = xstate_calculate_size(mask & XFEATURE_MASK_USER_SUPPORTED, false);
+
+ if (!guest) {
+ ret = validate_sigaltstack(usize);
+ if (ret)
+ return ret;
+ }
+
+ perm = guest ? &fpu->guest_perm : &fpu->perm;
+ /* Pairs with the READ_ONCE() in xstate_get_group_perm() */
+ WRITE_ONCE(perm->__state_perm, mask);
+ /* Protected by sighand lock */
+ perm->__state_size = ksize;
+ perm->__user_state_size = usize;
+ return ret;
+}
+
+/*
+ * Permissions array to map facilities with more than one component
+ */
+static const u64 xstate_prctl_req[XFEATURE_MAX] = {
+ [XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA,
+};
+
+static int xstate_request_perm(unsigned long idx, bool guest)
+{
+ u64 permitted, requested;
+ int ret;
+
+ if (idx >= XFEATURE_MAX)
+ return -EINVAL;
+
+ /*
+ * Look up the facility mask which can require more than
+ * one xstate component.
+ */
+ idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req));
+ requested = xstate_prctl_req[idx];
+ if (!requested)
+ return -EOPNOTSUPP;
+
+ if ((fpu_user_cfg.max_features & requested) != requested)
+ return -EOPNOTSUPP;
+
+ /* Lockless quick check */
+ permitted = xstate_get_group_perm(guest);
+ if ((permitted & requested) == requested)
+ return 0;
+
+ /* Protect against concurrent modifications */
+ spin_lock_irq(&current->sighand->siglock);
+ permitted = xstate_get_group_perm(guest);
+
+ /* First vCPU allocation locks the permissions. */
+ if (guest && (permitted & FPU_GUEST_PERM_LOCKED))
+ ret = -EBUSY;
+ else
+ ret = __xstate_request_perm(permitted, requested, guest);
+ spin_unlock_irq(&current->sighand->siglock);
+ return ret;
+}
+
+int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu)
+{
+ u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC;
+ struct fpu_state_perm *perm;
+ unsigned int ksize, usize;
+ struct fpu *fpu;
+
+ if (!xfd_event) {
+ if (!guest_fpu)
+ pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err);
+ return 0;
+ }
+
+ /* Protect against concurrent modifications */
+ spin_lock_irq(&current->sighand->siglock);
+
+ /* If not permitted let it die */
+ if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) {
+ spin_unlock_irq(&current->sighand->siglock);
+ return -EPERM;
+ }
+
+ fpu = x86_task_fpu(current->group_leader);
+ perm = guest_fpu ? &fpu->guest_perm : &fpu->perm;
+ ksize = perm->__state_size;
+ usize = perm->__user_state_size;
+
+ /*
+ * The feature is permitted and the state size is sufficient. Dropping
+ * the lock is safe here: even if more features are added from another
+ * task, the retrieved buffer sizes remain valid for the currently
+ * requested feature(s).
+ */
+ spin_unlock_irq(&current->sighand->siglock);
+
+ /*
+ * Try to allocate a new fpstate. If that fails there is no way
+ * out.
+ */
+ if (fpstate_realloc(xfd_event, ksize, usize, guest_fpu))
+ return -EFAULT;
+ return 0;
+}
+
+int xfd_enable_feature(u64 xfd_err)
+{
+ return __xfd_enable_feature(xfd_err, NULL);
+}
+
+#else /* CONFIG_X86_64 */
+static inline int xstate_request_perm(unsigned long idx, bool guest)
+{
+ return -EPERM;
+}
+#endif /* !CONFIG_X86_64 */
+
+u64 xstate_get_guest_group_perm(void)
+{
+ return xstate_get_group_perm(true);
+}
+EXPORT_SYMBOL_FOR_KVM(xstate_get_guest_group_perm);
+
+/**
+ * fpu_xstate_prctl - xstate permission operations
+ * @option: A subfunction of arch_prctl()
+ * @arg2: option argument
+ * Return: 0 if successful; otherwise, an error code
+ *
+ * Option arguments:
+ *
+ * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info
+ * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info
+ * ARCH_REQ_XCOMP_PERM: Facility number requested
+ *
+ * For facilities which require more than one XSTATE component, the request
+ * must be the highest state component number related to that facility,
+ * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and
+ * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18).
+ */
+long fpu_xstate_prctl(int option, unsigned long arg2)
+{
+ u64 __user *uptr = (u64 __user *)arg2;
+ u64 permitted, supported;
+ unsigned long idx = arg2;
+ bool guest = false;
+
+ switch (option) {
+ case ARCH_GET_XCOMP_SUPP:
+ supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features;
+ return put_user(supported, uptr);
+
+ case ARCH_GET_XCOMP_PERM:
+ /*
+ * Lockless snapshot as it can also change right after
+ * dropping the lock.
+ */
+ permitted = xstate_get_host_group_perm();
+ permitted &= XFEATURE_MASK_USER_SUPPORTED;
+ return put_user(permitted, uptr);
+
+ case ARCH_GET_XCOMP_GUEST_PERM:
+ permitted = xstate_get_guest_group_perm();
+ permitted &= XFEATURE_MASK_USER_SUPPORTED;
+ return put_user(permitted, uptr);
+
+ case ARCH_REQ_XCOMP_GUEST_PERM:
+ guest = true;
+ fallthrough;
+
+ case ARCH_REQ_XCOMP_PERM:
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return -EOPNOTSUPP;
+
+ return xstate_request_perm(idx, guest);
+
+ default:
+ return -EINVAL;
+ }
+}
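
From userspace the interface looks like the sketch below: request AMX tile data permission, then read back the supported and permitted masks. The numeric constants mirror the kernel UAPI (<asm/prctl.h>, XFEATURE_XTILE_DATA = 18); this is an illustrative program, not part of the patch.

/* Userspace sketch: request AMX permission via arch_prctl(). */
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#define ARCH_GET_XCOMP_SUPP	0x1021
#define ARCH_GET_XCOMP_PERM	0x1022
#define ARCH_REQ_XCOMP_PERM	0x1023
#define XFEATURE_XTILE_DATA	18

int main(void)
{
	unsigned long long supp = 0, perm = 0;

	syscall(SYS_arch_prctl, ARCH_GET_XCOMP_SUPP, &supp);
	if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILE_DATA))
		perror("ARCH_REQ_XCOMP_PERM");
	syscall(SYS_arch_prctl, ARCH_GET_XCOMP_PERM, &perm);

	printf("supported 0x%llx, permitted 0x%llx\n", supp, perm);
	return 0;
}
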
+
+#ifdef CONFIG_PROC_PID_ARCH_STATUS
+/*
+ * Report the amount of time elapsed in milliseconds since the last
+ * AVX-512 use in the task. Report -1 if AVX-512 has not been used.
+ */
+static void avx512_status(struct seq_file *m, struct task_struct *task)
+{
+ unsigned long timestamp;
+ long delta = -1;
+
+ /* AVX-512 usage is not tracked for kernel threads. Don't report anything. */
+ if (task->flags & (PF_KTHREAD | PF_USER_WORKER))
+ return;
+
+ timestamp = READ_ONCE(x86_task_fpu(task)->avx512_timestamp);
+
+ if (timestamp) {
+ delta = (long)(jiffies - timestamp);
+ /*
+ * Cap to LONG_MAX if time difference > LONG_MAX
+ */
+ if (delta < 0)
+ delta = LONG_MAX;
+ delta = jiffies_to_msecs(delta);
+ }
+
+ seq_put_decimal_ll(m, "AVX512_elapsed_ms:\t", delta);
+ seq_putc(m, '\n');
+}
+
+/*
+ * Report architecture specific information
+ */
+int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ /*
+ * Report AVX-512 state if both the processor and the build option support it.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_AVX512F))
+ avx512_status(m, task);
+
+ return 0;
+}
+#endif /* CONFIG_PROC_PID_ARCH_STATUS */
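
The value emitted above can be read back from /proc/<pid>/arch_status (only present with CONFIG_PROC_PID_ARCH_STATUS=y); a minimal reader might look like this sketch.

/* Userspace sketch: print the AVX512_elapsed_ms line for the current task. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[128];
	FILE *f = fopen("/proc/self/arch_status", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "AVX512_elapsed_ms", 17))
			fputs(line, stdout);
	fclose(f);
	return 0;
}
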
+
+#ifdef CONFIG_COREDUMP
+static const char owner_name[] = "LINUX";
+
+/*
+ * Dump type, size, offset and flag values for every xfeature that is present.
+ */
+static int dump_xsave_layout_desc(struct coredump_params *cprm)
+{
+ int num_records = 0;
+ int i;
+
+ for_each_extended_xfeature(i, fpu_user_cfg.max_features) {
+ struct x86_xfeat_component xc = {
+ .type = i,
+ .size = xstate_sizes[i],
+ .offset = xstate_offsets[i],
+ /* reserved for future use */
+ .flags = 0,
+ };
+
+ if (!dump_emit(cprm, &xc, sizeof(xc)))
+ return 0;
+
+ num_records++;
+ }
+ return num_records;
+}
+
+static u32 get_xsave_desc_size(void)
+{
+ u32 cnt = 0;
+ u32 i;
+
+ for_each_extended_xfeature(i, fpu_user_cfg.max_features)
+ cnt++;
+
+ return cnt * (sizeof(struct x86_xfeat_component));
+}
+
+int elf_coredump_extra_notes_write(struct coredump_params *cprm)
+{
+ int num_records = 0;
+ struct elf_note en;
+
+ if (!fpu_user_cfg.max_features)
+ return 0;
+
+ en.n_namesz = sizeof(owner_name);
+ en.n_descsz = get_xsave_desc_size();
+ en.n_type = NT_X86_XSAVE_LAYOUT;
+
+ if (!dump_emit(cprm, &en, sizeof(en)))
+ return 1;
+ if (!dump_emit(cprm, owner_name, en.n_namesz))
+ return 1;
+ if (!dump_align(cprm, 4))
+ return 1;
+
+ num_records = dump_xsave_layout_desc(cprm);
+ if (!num_records)
+ return 1;
+
+ /* Total size should equal the number of records times the record size */
+ if ((sizeof(struct x86_xfeat_component) * num_records) != en.n_descsz)
+ return 1;
+
+ return 0;
+}
+
+int elf_coredump_extra_notes_size(void)
+{
+ int size;
+
+ if (!fpu_user_cfg.max_features)
+ return 0;
+
+ /* .note header */
+ size = sizeof(struct elf_note);
+ /* Name plus alignment to 4 bytes */
+ size += roundup(sizeof(owner_name), 4);
+ size += get_xsave_desc_size();
+
+ return size;
+}
+#endif /* CONFIG_COREDUMP */
diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h
new file mode 100644
index 000000000000..52ce19289989
--- /dev/null
+++ b/arch/x86/kernel/fpu/xstate.h
@@ -0,0 +1,368 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __X86_KERNEL_FPU_XSTATE_H
+#define __X86_KERNEL_FPU_XSTATE_H
+
+#include <asm/cpufeature.h>
+#include <asm/fpu/xstate.h>
+#include <asm/fpu/xcr.h>
+#include <asm/msr.h>
+
+#ifdef CONFIG_X86_64
+DECLARE_PER_CPU(u64, xfd_state);
+#endif
+
+static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask)
+{
+ /*
+ * XRSTORS requires these bits set in xcomp_bv, or it will
+ * trigger #GP:
+ */
+ if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED))
+ xsave->header.xcomp_bv = mask | XCOMP_BV_COMPACTED_FORMAT;
+}
+
+static inline u64 xstate_get_group_perm(bool guest)
+{
+ struct fpu *fpu = x86_task_fpu(current->group_leader);
+ struct fpu_state_perm *perm;
+
+ /* Pairs with WRITE_ONCE() in xstate_request_perm() */
+ perm = guest ? &fpu->guest_perm : &fpu->perm;
+ return READ_ONCE(perm->__state_perm);
+}
+
+static inline u64 xstate_get_host_group_perm(void)
+{
+ return xstate_get_group_perm(false);
+}
+
+enum xstate_copy_mode {
+ XSTATE_COPY_FP,
+ XSTATE_COPY_FX,
+ XSTATE_COPY_XSAVE,
+};
+
+struct membuf;
+extern void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
+ u64 xfeatures, u32 pkru_val,
+ enum xstate_copy_mode copy_mode);
+extern void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
+ enum xstate_copy_mode mode);
+extern int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru);
+extern int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void __user *ubuf);
+
+
+extern void fpu__init_cpu_xstate(void);
+extern void fpu__init_system_xstate(unsigned int legacy_size);
+
+extern void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr);
+
+static inline u64 xfeatures_mask_supervisor(void)
+{
+ return fpu_kernel_cfg.max_features & XFEATURE_MASK_SUPERVISOR_SUPPORTED;
+}
+
+static inline u64 xfeatures_mask_independent(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR))
+ return fpu_kernel_cfg.independent_features & ~XFEATURE_MASK_LBR;
+
+ return fpu_kernel_cfg.independent_features;
+}
+
+static inline int set_xfeature_in_sigframe(struct xregs_state __user *xbuf, u64 mask)
+{
+ u64 xfeatures;
+ int err;
+
+ /* Read the xfeatures value already saved in the user buffer */
+ err = __get_user(xfeatures, &xbuf->header.xfeatures);
+ xfeatures |= mask;
+ err |= __put_user(xfeatures, &xbuf->header.xfeatures);
+
+ return err;
+}
+
+/*
+ * Update the value of PKRU register that was already pushed onto the signal frame.
+ */
+static inline int update_pkru_in_sigframe(struct xregs_state __user *buf, u32 pkru)
+{
+ int err;
+
+ if (unlikely(!cpu_feature_enabled(X86_FEATURE_OSPKE)))
+ return 0;
+
+ /* Mark PKRU as in-use so that it is restored correctly. */
+ err = set_xfeature_in_sigframe(buf, XFEATURE_MASK_PKRU);
+ if (err)
+ return err;
+
+ /* Update PKRU value in the userspace xsave buffer. */
+ return __put_user(pkru, (unsigned int __user *)get_xsave_addr_user(buf, XFEATURE_PKRU));
+}
+
+/* XSAVE/XRSTOR wrapper functions */
+
+#ifdef CONFIG_X86_64
+#define REX_SUFFIX "64"
+#else
+#define REX_SUFFIX
+#endif
+
+#define XSAVE "xsave" REX_SUFFIX " %[xa]"
+#define XSAVEOPT "xsaveopt" REX_SUFFIX " %[xa]"
+#define XSAVEC "xsavec" REX_SUFFIX " %[xa]"
+#define XSAVES "xsaves" REX_SUFFIX " %[xa]"
+#define XRSTOR "xrstor" REX_SUFFIX " %[xa]"
+#define XRSTORS "xrstors" REX_SUFFIX " %[xa]"
+
+/*
+ * After this @err contains 0 on success or the trap number when the
+ * operation raises an exception.
+ *
+ * The [xa] input parameter below represents the struct xregs_state pointer
+ * and the asm symbolic name for the argument used in the XSAVE/XRSTOR insns
+ * above.
+ */
+#define XSTATE_OP(op, st, lmask, hmask, err) \
+ asm volatile("1:" op "\n\t" \
+ "xor %[err], %[err]\n" \
+ "2:\n" \
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \
+ : [err] "=a" (err) \
+ : [xa] "m" (*(st)), "a" (lmask), "d" (hmask) \
+ : "memory")
+
+/*
+ * If XSAVES is enabled, it replaces XSAVEC because it additionally
+ * supports supervisor states.
+ *
+ * Otherwise, if XSAVEC is enabled, it replaces XSAVEOPT because it
+ * additionally supports the compacted storage format.
+ *
+ * Otherwise, if XSAVEOPT is enabled, it replaces XSAVE because XSAVEOPT
+ * supports the modified optimization, which XSAVE does not.
+ *
+ * Use XSAVE as a fallback.
+ */
+#define XSTATE_XSAVE(st, lmask, hmask, err) \
+ asm volatile("1: " ALTERNATIVE_3(XSAVE, \
+ XSAVEOPT, X86_FEATURE_XSAVEOPT, \
+ XSAVEC, X86_FEATURE_XSAVEC, \
+ XSAVES, X86_FEATURE_XSAVES) \
+ "\n\t" \
+ "xor %[err], %[err]\n" \
+ "3:\n" \
+ _ASM_EXTABLE_TYPE_REG(1b, 3b, EX_TYPE_EFAULT_REG, %[err]) \
+ : [err] "=r" (err) \
+ : [xa] "m" (*(st)), "a" (lmask), "d" (hmask) \
+ : "memory")
+
+/*
+ * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact
+ * XSAVE area format.
+ */
+#define XSTATE_XRESTORE(st, lmask, hmask) \
+ asm volatile("1: " ALTERNATIVE(XRSTOR, \
+ XRSTORS, X86_FEATURE_XSAVES) \
+ "\n" \
+ "3:\n" \
+ _ASM_EXTABLE_TYPE(1b, 3b, EX_TYPE_FPU_RESTORE) \
+ : \
+ : [xa] "m" (*(st)), "a" (lmask), "d" (hmask) \
+ : "memory")
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_X86_DEBUG_FPU)
+extern void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor);
+#else
+static inline void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) { }
+#endif
+
+#ifdef CONFIG_X86_64
+static inline void xfd_set_state(u64 xfd)
+{
+ wrmsrq(MSR_IA32_XFD, xfd);
+ __this_cpu_write(xfd_state, xfd);
+}
+
+static inline void xfd_update_state(struct fpstate *fpstate)
+{
+ if (fpu_state_size_dynamic()) {
+ u64 xfd = fpstate->xfd;
+
+ if (__this_cpu_read(xfd_state) != xfd)
+ xfd_set_state(xfd);
+ }
+}
+
+extern int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu);
+#else
+static inline void xfd_set_state(u64 xfd) { }
+
+static inline void xfd_update_state(struct fpstate *fpstate) { }
+
+static inline int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu)
+{
+ return -EPERM;
+}
+#endif
+
+/*
+ * Save processor xstate to xsave area.
+ *
+ * Uses either XSAVE or XSAVEOPT or XSAVES depending on the CPU features
+ * and command line options. The choice is permanent until the next reboot.
+ */
+static inline void os_xsave(struct fpstate *fpstate)
+{
+ u64 mask = fpstate->xfeatures;
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+ int err;
+
+ WARN_ON_FPU(!alternatives_patched);
+ xfd_validate_state(fpstate, mask, false);
+
+ XSTATE_XSAVE(&fpstate->regs.xsave, lmask, hmask, err);
+
+ /* We should never fault when copying to a kernel buffer: */
+ WARN_ON_FPU(err);
+}
+
+/*
+ * Restore processor xstate from xsave area.
+ *
+ * Uses XRSTORS when XSAVES is used, XRSTOR otherwise.
+ */
+static inline void os_xrstor(struct fpstate *fpstate, u64 mask)
+{
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+
+ xfd_validate_state(fpstate, mask, true);
+ XSTATE_XRESTORE(&fpstate->regs.xsave, lmask, hmask);
+}
+
+/* Restore of supervisor state. Does not require XFD */
+static inline void os_xrstor_supervisor(struct fpstate *fpstate)
+{
+ u64 mask = xfeatures_mask_supervisor();
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+
+ XSTATE_XRESTORE(&fpstate->regs.xsave, lmask, hmask);
+}
+
+/*
+ * XSAVE itself always writes all requested xfeatures. Removing features
+ * from the request bitmap reduces the features which are written.
+ * Generate a mask of features which must be written to a sigframe. The
+ * unset features can be optimized away and not written.
+ *
+ * This optimization is user-visible. Only use for states where
+ * uninitialized sigframe contents are tolerable, like dynamic features.
+ *
+ * Users of buffers produced with this optimization must check XSTATE_BV
+ * to determine which features have been optimized out.
+ */
+static inline u64 xfeatures_need_sigframe_write(void)
+{
+ u64 xfeatures_to_write;
+
+ /* In-use features must be written: */
+ xfeatures_to_write = xfeatures_in_use();
+
+ /* Also write all non-optimizable sigframe features: */
+ xfeatures_to_write |= XFEATURE_MASK_USER_SUPPORTED &
+ ~XFEATURE_MASK_SIGFRAME_INITOPT;
+
+ return xfeatures_to_write;
+}
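
A consumer-side illustration: in a signal handler the XSAVE area sits behind uc_mcontext.fpregs, with the header (and thus XSTATE_BV) at offset 512, so a feature that was optimized out can be detected before its frame contents are trusted. This is a hedged userspace sketch for x86-64/glibc; a robust version would first verify FP_XSTATE_MAGIC1 in the frame's sw_reserved area.

/* Userspace sketch: check XSTATE_BV in the signal frame before using a component. */
#define _GNU_SOURCE
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <ucontext.h>

static void handler(int sig, siginfo_t *si, void *ctx)
{
	ucontext_t *uc = ctx;
	uint64_t xstate_bv;

	(void)sig; (void)si;
	memcpy(&xstate_bv, (char *)uc->uc_mcontext.fpregs + 512, sizeof(xstate_bv));

	if (!(xstate_bv & (1ULL << 18)))
		fprintf(stderr, "XTILE_DATA was optimized out of this frame\n");
}

int main(void)
{
	struct sigaction sa = { .sa_sigaction = handler, .sa_flags = SA_SIGINFO };

	sigaction(SIGUSR1, &sa, NULL);
	raise(SIGUSR1);
	return 0;
}
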
+
+/*
+ * Save xstate to user space xsave area.
+ *
+ * We don't use modified optimization because xrstor/xrstors might track
+ * a different application.
+ *
+ * We don't use compacted format xsave area for backward compatibility for
+ * old applications which don't understand the compacted format of the
+ * xsave area.
+ *
+ * The caller has to zero buf::header before calling this because XSAVE*
+ * does not touch the reserved fields in the header.
+ */
+static inline int xsave_to_user_sigframe(struct xregs_state __user *buf, u32 pkru)
+{
+ /*
+ * Include the features which are not xsaved/rstored by the kernel
+ * internally, e.g. PKRU. That's user space ABI and also required
+ * to allow the signal handler to modify PKRU.
+ */
+ struct fpstate *fpstate = x86_task_fpu(current)->fpstate;
+ u64 mask = fpstate->user_xfeatures;
+ u32 lmask;
+ u32 hmask;
+ int err;
+
+ /* Optimize away writing unnecessary xfeatures: */
+ if (fpu_state_size_dynamic())
+ mask &= xfeatures_need_sigframe_write();
+
+ lmask = mask;
+ hmask = mask >> 32;
+ xfd_validate_state(fpstate, mask, false);
+
+ stac();
+ XSTATE_OP(XSAVE, buf, lmask, hmask, err);
+ clac();
+
+ if (!err)
+ err = update_pkru_in_sigframe(buf, pkru);
+
+ return err;
+}
+
+/*
+ * Restore xstate from user space xsave area.
+ */
+static inline int xrstor_from_user_sigframe(struct xregs_state __user *buf, u64 mask)
+{
+ struct xregs_state *xstate = ((__force struct xregs_state *)buf);
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+ int err;
+
+ xfd_validate_state(x86_task_fpu(current)->fpstate, mask, true);
+
+ stac();
+ XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
+ clac();
+
+ return err;
+}
+
+/*
+ * Restore xstate from kernel space xsave area, return an error code instead of
+ * an exception.
+ */
+static inline int os_xrstor_safe(struct fpstate *fpstate, u64 mask)
+{
+ struct xregs_state *xstate = &fpstate->regs.xsave;
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+ int err;
+
+ /* Ensure that XFD is up to date */
+ xfd_update_state(fpstate);
+
+ if (cpu_feature_enabled(X86_FEATURE_XSAVES))
+ XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
+ else
+ XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
+
+ return err;
+}
+
+
+#endif
diff --git a/arch/x86/kernel/fred.c b/arch/x86/kernel/fred.c
new file mode 100644
index 000000000000..816187da3a47
--- /dev/null
+++ b/arch/x86/kernel/fred.c
@@ -0,0 +1,93 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/kernel.h>
+
+#include <asm/desc.h>
+#include <asm/fred.h>
+#include <asm/msr.h>
+#include <asm/tlbflush.h>
+#include <asm/traps.h>
+
+/* #DB in the kernel would imply the use of a kernel debugger. */
+#define FRED_DB_STACK_LEVEL 1UL
+#define FRED_NMI_STACK_LEVEL 2UL
+#define FRED_MC_STACK_LEVEL 2UL
+/*
+ * #DF is the highest level because a #DF means "something went wrong
+ * *while delivering an exception*." The number of cases for which that
+ * can happen with FRED is drastically reduced and basically amounts to
+ * "the stack you pointed me to is broken." Thus, always change stacks
+ * on #DF, which means it should be at the highest level.
+ */
+#define FRED_DF_STACK_LEVEL 3UL
+
+#define FRED_STKLVL(vector, lvl) ((lvl) << (2 * (vector)))
+
+DEFINE_PER_CPU(unsigned long, fred_rsp0);
+EXPORT_PER_CPU_SYMBOL(fred_rsp0);
+
+void cpu_init_fred_exceptions(void)
+{
+ /* When FRED is enabled by default, remove this log message */
+ pr_info("Initialize FRED on CPU%d\n", smp_processor_id());
+
+ /*
+ * If a kernel event is delivered before a CPU goes to user level for
+ * the first time, its SS is NULL thus NULL is pushed into the SS field
+ * of the FRED stack frame. But before ERETS is executed, the CPU may
+ * context switch to another task and go to user level. Then when the
+ * CPU comes back to kernel mode, SS is changed to __KERNEL_DS. Later
+ * when ERETS is executed to return from the kernel event handler, a #GP
+ * fault is generated because SS doesn't match the SS saved in the FRED
+ * stack frame.
+ *
+ * Initialize SS to __KERNEL_DS when enabling FRED to avoid such #GPs.
+ */
+ loadsegment(ss, __KERNEL_DS);
+
+ wrmsrq(MSR_IA32_FRED_CONFIG,
+ /* Reserve for CALL emulation */
+ FRED_CONFIG_REDZONE |
+ FRED_CONFIG_INT_STKLVL(0) |
+ FRED_CONFIG_ENTRYPOINT(asm_fred_entrypoint_user));
+
+ wrmsrq(MSR_IA32_FRED_STKLVLS, 0);
+
+ /*
+ * After a CPU offline/online cycle, the FRED RSP0 MSR should be
+ * resynchronized with its per-CPU cache.
+ */
+ wrmsrq(MSR_IA32_FRED_RSP0, __this_cpu_read(fred_rsp0));
+
+ wrmsrq(MSR_IA32_FRED_RSP1, 0);
+ wrmsrq(MSR_IA32_FRED_RSP2, 0);
+ wrmsrq(MSR_IA32_FRED_RSP3, 0);
+
+ /* Enable FRED */
+ cr4_set_bits(X86_CR4_FRED);
+ /* Any further IDT use is a bug */
+ idt_invalidate();
+
+ /* Use int $0x80 for 32-bit system calls in FRED mode */
+ setup_clear_cpu_cap(X86_FEATURE_SYSENTER32);
+ setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
+}
+
+/* Must be called after setup_cpu_entry_areas() */
+void cpu_init_fred_rsps(void)
+{
+ /*
+ * The purpose of separate stacks for NMI, #DB and #MC *in the kernel*
+ * (remember that user space faults are always taken on stack level 0)
+ * is to avoid overflowing the kernel stack.
+ */
+ wrmsrq(MSR_IA32_FRED_STKLVLS,
+ FRED_STKLVL(X86_TRAP_DB, FRED_DB_STACK_LEVEL) |
+ FRED_STKLVL(X86_TRAP_NMI, FRED_NMI_STACK_LEVEL) |
+ FRED_STKLVL(X86_TRAP_MC, FRED_MC_STACK_LEVEL) |
+ FRED_STKLVL(X86_TRAP_DF, FRED_DF_STACK_LEVEL));
+
+ /* The FRED equivalents to IST stacks... */
+ wrmsrq(MSR_IA32_FRED_RSP1, __this_cpu_ist_top_va(DB));
+ wrmsrq(MSR_IA32_FRED_RSP2, __this_cpu_ist_top_va(NMI));
+ wrmsrq(MSR_IA32_FRED_RSP3, __this_cpu_ist_top_va(DF));
+}
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index d94e1ea3b9fe..0543b57f54ee 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * Code for replacing ftrace calls with jumps.
+ * Dynamic function tracing support.
*
* Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
*
@@ -9,526 +10,660 @@
* the dangers of modifying code on the run.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/spinlock.h>
#include <linux/hardirq.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>
#include <linux/percpu.h>
#include <linux/sched.h>
+#include <linux/slab.h>
#include <linux/init.h>
#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/memory.h>
+#include <linux/vmalloc.h>
+#include <linux/set_memory.h>
+#include <linux/execmem.h>
#include <trace/syscall.h>
-#include <asm/cacheflush.h>
+#include <asm/kprobes.h>
#include <asm/ftrace.h>
#include <asm/nops.h>
-#include <asm/nmi.h>
-
+#include <asm/text-patching.h>
#ifdef CONFIG_DYNAMIC_FTRACE
-int ftrace_arch_code_modify_prepare(void)
+static int ftrace_poke_late = 0;
+
+void ftrace_arch_code_modify_prepare(void)
+ __acquires(&text_mutex)
{
- set_kernel_text_rw();
- return 0;
+ /*
+ * Need to grab text_mutex to prevent a race from module loading
+ * and live kernel patching from changing the text permissions while
+ * ftrace has it set to "read/write".
+ */
+ mutex_lock(&text_mutex);
+ ftrace_poke_late = 1;
}
-int ftrace_arch_code_modify_post_process(void)
+void ftrace_arch_code_modify_post_process(void)
+ __releases(&text_mutex)
{
- set_kernel_text_ro();
- return 0;
+ /*
+ * ftrace_make_{call,nop}() may be called during
+ * module load, and we need to finish the smp_text_poke_batch_add()
+ * that they do, here.
+ */
+ smp_text_poke_batch_finish();
+ ftrace_poke_late = 0;
+ mutex_unlock(&text_mutex);
}
-union ftrace_code_union {
- char code[MCOUNT_INSN_SIZE];
- struct {
- char e8;
- int offset;
- } __attribute__((packed));
-};
-
-static int ftrace_calc_offset(long ip, long addr)
+static const char *ftrace_nop_replace(void)
{
- return (int)(addr - ip);
+ return x86_nops[5];
}
-static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
+static const char *ftrace_call_replace(unsigned long ip, unsigned long addr)
{
- static union ftrace_code_union calc;
+ /*
+ * No need to translate into a callthunk. The trampoline does
+ * the depth accounting itself.
+ */
+ if (ftrace_is_jmp(addr)) {
+ addr = ftrace_jmp_get(addr);
+ return text_gen_insn(JMP32_INSN_OPCODE, (void *)ip, (void *)addr);
+ } else {
+ return text_gen_insn(CALL_INSN_OPCODE, (void *)ip, (void *)addr);
+ }
+}
- calc.e8 = 0xe8;
- calc.offset = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr);
+static int ftrace_verify_code(unsigned long ip, const char *old_code)
+{
+ char cur_code[MCOUNT_INSN_SIZE];
/*
- * No locking needed, this must be called via kstop_machine
- * which in essence is like running on a uniprocessor machine.
+ * Note:
+ * We are paranoid about modifying text: if a bug were to happen, it
+ * could cause us to read or write to someplace that could cause harm.
+ * Carefully read and modify the code with copy_from_kernel_nofault(), and make
+ * sure what we read is what we expected it to be before modifying it.
*/
- return calc.code;
+ /* read the text we want to modify */
+ if (copy_from_kernel_nofault(cur_code, (void *)ip, MCOUNT_INSN_SIZE)) {
+ WARN_ON(1);
+ return -EFAULT;
+ }
+
+ /* Make sure it is what we expect it to be */
+ if (memcmp(cur_code, old_code, MCOUNT_INSN_SIZE) != 0) {
+ ftrace_expected = old_code;
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ return 0;
}
/*
- * Modifying code must take extra care. On an SMP machine, if
- * the code being modified is also being executed on another CPU
- * that CPU will have undefined results and possibly take a GPF.
- * We use kstop_machine to stop other CPUS from exectuing code.
- * But this does not stop NMIs from happening. We still need
- * to protect against that. We separate out the modification of
- * the code to take care of this.
- *
- * Two buffers are added: An IP buffer and a "code" buffer.
- *
- * 1) Put the instruction pointer into the IP buffer
- * and the new code into the "code" buffer.
- * 2) Wait for any running NMIs to finish and set a flag that says
- * we are modifying code, it is done in an atomic operation.
- * 3) Write the code
- * 4) clear the flag.
- * 5) Wait for any running NMIs to finish.
- *
- * If an NMI is executed, the first thing it does is to call
- * "ftrace_nmi_enter". This will check if the flag is set to write
- * and if it is, it will write what is in the IP and "code" buffers.
- *
- * The trick is, it does not matter if everyone is writing the same
- * content to the code location. Also, if a CPU is executing code
- * it is OK to write to that code location if the contents being written
- * are the same as what exists.
+ * Marked __ref because it calls text_poke_early() which is .init.text. That is
+ * ok because that call will happen early, during boot, when .init sections are
+ * still present.
*/
-
-#define MOD_CODE_WRITE_FLAG (1 << 31) /* set when NMI should do the write */
-static atomic_t nmi_running = ATOMIC_INIT(0);
-static int mod_code_status; /* holds return value of text write */
-static void *mod_code_ip; /* holds the IP to write to */
-static void *mod_code_newcode; /* holds the text to write to the IP */
-
-static unsigned nmi_wait_count;
-static atomic_t nmi_update_count = ATOMIC_INIT(0);
-
-int ftrace_arch_read_dyn_info(char *buf, int size)
+static int __ref
+ftrace_modify_code_direct(unsigned long ip, const char *old_code,
+ const char *new_code)
{
- int r;
+ int ret = ftrace_verify_code(ip, old_code);
+ if (ret)
+ return ret;
- r = snprintf(buf, size, "%u %u",
- nmi_wait_count,
- atomic_read(&nmi_update_count));
- return r;
+ /* replace the text with the new text */
+ if (ftrace_poke_late)
+ smp_text_poke_batch_add((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL);
+ else
+ text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE);
+ return 0;
}
-static void clear_mod_flag(void)
+int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr)
{
- int old = atomic_read(&nmi_running);
-
- for (;;) {
- int new = old & ~MOD_CODE_WRITE_FLAG;
-
- if (old == new)
- break;
+ unsigned long ip = rec->ip;
+ const char *new, *old;
- old = atomic_cmpxchg(&nmi_running, old, new);
- }
-}
+ old = ftrace_call_replace(ip, addr);
+ new = ftrace_nop_replace();
-static void ftrace_mod_code(void)
-{
/*
- * Yes, more than one CPU process can be writing to mod_code_status.
- * (and the code itself)
- * But if one were to fail, then they all should, and if one were
- * to succeed, then they all should.
+ * On boot up, and when modules are loaded, the MCOUNT_ADDR
+ * is converted to a nop, and will never become MCOUNT_ADDR
+ * again. This code is either running before SMP (on boot up)
+ * or before the code will ever be executed (module load).
+ * We do not want to use the breakpoint version in this case,
+ * just modify the code directly.
*/
- mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode,
- MCOUNT_INSN_SIZE);
+ if (addr == MCOUNT_ADDR)
+ return ftrace_modify_code_direct(ip, old, new);
- /* if we fail, then kill any new writers */
- if (mod_code_status)
- clear_mod_flag();
+ /*
+ * x86 overrides ftrace_replace_code -- this function will never be used
+ * in this case.
+ */
+ WARN_ONCE(1, "invalid use of ftrace_make_nop");
+ return -EINVAL;
}
-void ftrace_nmi_enter(void)
+int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
{
- if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) {
- smp_rmb();
- ftrace_mod_code();
- atomic_inc(&nmi_update_count);
- }
- /* Must have previous changes seen before executions */
- smp_mb();
+ unsigned long ip = rec->ip;
+ const char *new, *old;
+
+ old = ftrace_nop_replace();
+ new = ftrace_call_replace(ip, addr);
+
+ /* Should only be called when module is loaded */
+ return ftrace_modify_code_direct(rec->ip, old, new);
}
-void ftrace_nmi_exit(void)
+/*
+ * Should never be called:
+ * As it is only called by __ftrace_replace_code() which is called by
+ * ftrace_replace_code() that x86 overrides, and by ftrace_update_code()
+ * which is called to turn mcount into nops or nops into function calls
+ * but not to convert a function from not using regs to one that uses
+ * regs, which ftrace_modify_call() is for.
+ */
+int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
+ unsigned long addr)
{
- /* Finish all executions before clearing nmi_running */
- smp_mb();
- atomic_dec(&nmi_running);
+ WARN_ON(1);
+ return -EINVAL;
}
-static void wait_for_nmi_and_set_mod_flag(void)
+int ftrace_update_ftrace_func(ftrace_func_t func)
{
- if (!atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG))
- return;
+ unsigned long ip;
+ const char *new;
- do {
- cpu_relax();
- } while (atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG));
+ ip = (unsigned long)(&ftrace_call);
+ new = ftrace_call_replace(ip, (unsigned long)func);
+ smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
+
+ ip = (unsigned long)(&ftrace_regs_call);
+ new = ftrace_call_replace(ip, (unsigned long)func);
+ smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
- nmi_wait_count++;
+ return 0;
}
-static void wait_for_nmi(void)
+void ftrace_replace_code(int enable)
{
- if (!atomic_read(&nmi_running))
- return;
+ struct ftrace_rec_iter *iter;
+ struct dyn_ftrace *rec;
+ const char *new, *old;
+ int ret;
- do {
- cpu_relax();
- } while (atomic_read(&nmi_running));
+ for_ftrace_rec_iter(iter) {
+ rec = ftrace_rec_iter_record(iter);
- nmi_wait_count++;
-}
+ switch (ftrace_test_record(rec, enable)) {
+ case FTRACE_UPDATE_IGNORE:
+ default:
+ continue;
-static int
-do_ftrace_mod_code(unsigned long ip, void *new_code)
-{
- mod_code_ip = (void *)ip;
- mod_code_newcode = new_code;
+ case FTRACE_UPDATE_MAKE_CALL:
+ old = ftrace_nop_replace();
+ break;
- /* The buffers need to be visible before we let NMIs write them */
- smp_mb();
+ case FTRACE_UPDATE_MODIFY_CALL:
+ case FTRACE_UPDATE_MAKE_NOP:
+ old = ftrace_call_replace(rec->ip, ftrace_get_addr_curr(rec));
+ break;
+ }
+
+ ret = ftrace_verify_code(rec->ip, old);
+ if (ret) {
+ ftrace_expected = old;
+ ftrace_bug(ret, rec);
+ ftrace_expected = NULL;
+ return;
+ }
+ }
+
+ for_ftrace_rec_iter(iter) {
+ rec = ftrace_rec_iter_record(iter);
- wait_for_nmi_and_set_mod_flag();
+ switch (ftrace_test_record(rec, enable)) {
+ case FTRACE_UPDATE_IGNORE:
+ default:
+ continue;
- /* Make sure all running NMIs have finished before we write the code */
- smp_mb();
+ case FTRACE_UPDATE_MAKE_CALL:
+ case FTRACE_UPDATE_MODIFY_CALL:
+ new = ftrace_call_replace(rec->ip, ftrace_get_addr_new(rec));
+ break;
+
+ case FTRACE_UPDATE_MAKE_NOP:
+ new = ftrace_nop_replace();
+ break;
+ }
- ftrace_mod_code();
+ smp_text_poke_batch_add((void *)rec->ip, new, MCOUNT_INSN_SIZE, NULL);
+ ftrace_update_record(rec, enable);
+ }
+ smp_text_poke_batch_finish();
+}
- /* Make sure the write happens before clearing the bit */
- smp_mb();
+void arch_ftrace_update_code(int command)
+{
+ ftrace_modify_all_code(command);
+}
- clear_mod_flag();
- wait_for_nmi();
+/* Currently only x86_64 supports dynamic trampolines */
+#ifdef CONFIG_X86_64
- return mod_code_status;
+static inline void *alloc_tramp(unsigned long size)
+{
+ return execmem_alloc_rw(EXECMEM_FTRACE, size);
+}
+static inline void tramp_free(void *tramp)
+{
+ execmem_free(tramp);
}
+/* Defined as markers to the end of the ftrace default trampolines */
+extern void ftrace_regs_caller_end(void);
+extern void ftrace_caller_end(void);
+extern void ftrace_caller_op_ptr(void);
+extern void ftrace_regs_caller_op_ptr(void);
+extern void ftrace_regs_caller_jmp(void);
+/* movq function_trace_op(%rip), %rdx */
+/* 0x48 0x8b 0x15 <offset-to-function_trace_op (4 bytes)> */
+#define OP_REF_SIZE 7
+/*
+ * The ftrace_ops is passed to the function callback. Since the
+ * trampoline only services a single ftrace_ops, we can pass in
+ * that ops directly.
+ *
+ * The ftrace_op_code_union is used to create a pointer to the
+ * ftrace_ops that will be passed to the callback function.
+ */
+union ftrace_op_code_union {
+ char code[OP_REF_SIZE];
+ struct {
+ char op[3];
+ int offset;
+ } __attribute__((packed));
+};
-static unsigned char ftrace_nop[MCOUNT_INSN_SIZE];
+#define RET_SIZE \
+ (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) ? 5 : 1 + IS_ENABLED(CONFIG_MITIGATION_SLS))
-static unsigned char *ftrace_nop_replace(void)
+static unsigned long
+create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
{
- return ftrace_nop;
-}
+ unsigned long start_offset;
+ unsigned long end_offset;
+ unsigned long op_offset;
+ unsigned long call_offset;
+ unsigned long jmp_offset;
+ unsigned long offset;
+ unsigned long npages;
+ unsigned long size;
+ unsigned long *ptr;
+ void *trampoline;
+ void *ip, *dest;
+ /* 48 8b 15 <offset> is movq <offset>(%rip), %rdx */
+ unsigned const char op_ref[] = { 0x48, 0x8b, 0x15 };
+ unsigned const char retq[] = { RET_INSN_OPCODE, INT3_INSN_OPCODE };
+ union ftrace_op_code_union op_ptr;
+ int ret;
-static int
-ftrace_modify_code(unsigned long ip, unsigned char *old_code,
- unsigned char *new_code)
-{
- unsigned char replaced[MCOUNT_INSN_SIZE];
+ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
+ start_offset = (unsigned long)ftrace_regs_caller;
+ end_offset = (unsigned long)ftrace_regs_caller_end;
+ op_offset = (unsigned long)ftrace_regs_caller_op_ptr;
+ call_offset = (unsigned long)ftrace_regs_call;
+ jmp_offset = (unsigned long)ftrace_regs_caller_jmp;
+ } else {
+ start_offset = (unsigned long)ftrace_caller;
+ end_offset = (unsigned long)ftrace_caller_end;
+ op_offset = (unsigned long)ftrace_caller_op_ptr;
+ call_offset = (unsigned long)ftrace_call;
+ jmp_offset = 0;
+ }
+
+ size = end_offset - start_offset;
/*
- * Note: Due to modules and __init, code can
- * disappear and change, we need to protect against faulting
- * as well as code changing. We do this by using the
- * probe_kernel_* functions.
- *
- * No real locking needed, this code is run through
- * kstop_machine, or before SMP starts.
+ * Allocate enough size to store the ftrace_caller code,
+ * the return instruction, as well as the address of the ftrace_ops this
+ * trampoline is used for.
*/
+ trampoline = alloc_tramp(size + RET_SIZE + sizeof(void *));
+ if (!trampoline)
+ return 0;
+
+ *tramp_size = size + RET_SIZE + sizeof(void *);
+ npages = DIV_ROUND_UP(*tramp_size, PAGE_SIZE);
+
+ /* Copy ftrace_caller onto the trampoline memory */
+ ret = copy_from_kernel_nofault(trampoline, (void *)start_offset, size);
+ if (WARN_ON(ret < 0))
+ goto fail;
+
+ ip = trampoline + size;
+ if (cpu_wants_rethunk_at(ip))
+ __text_gen_insn(ip, JMP32_INSN_OPCODE, ip, x86_return_thunk, JMP32_INSN_SIZE);
+ else
+ memcpy(ip, retq, sizeof(retq));
+
+ /* No need to test direct calls on created trampolines */
+ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
+ /* NOP the jnz 1f; but make sure it's a 2 byte jnz */
+ ip = trampoline + (jmp_offset - start_offset);
+ if (WARN_ON(*(char *)ip != 0x75))
+ goto fail;
+ ret = copy_from_kernel_nofault(ip, x86_nops[2], 2);
+ if (ret < 0)
+ goto fail;
+ }
- /* read the text we want to modify */
- if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
- return -EFAULT;
+ /*
+ * The address of the ftrace_ops that is used for this trampoline
+ * is stored at the end of the trampoline. This will be used to
+ * load the third parameter for the callback. Basically, that
+ * location at the end of the trampoline takes the place of
+ * the global function_trace_op variable.
+ */
- /* Make sure it is what we expect it to be */
- if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
- return -EINVAL;
+ ptr = (unsigned long *)(trampoline + size + RET_SIZE);
+ *ptr = (unsigned long)ops;
- /* replace the text with the new text */
- if (do_ftrace_mod_code(ip, new_code))
- return -EPERM;
+ op_offset -= start_offset;
+ memcpy(&op_ptr, trampoline + op_offset, OP_REF_SIZE);
+
+ /* Are we pointing to the reference? */
+ if (WARN_ON(memcmp(op_ptr.op, op_ref, 3) != 0))
+ goto fail;
- sync_core();
+ /* Load the contents of ptr into the callback parameter */
+ offset = (unsigned long)ptr;
+ offset -= (unsigned long)trampoline + op_offset + OP_REF_SIZE;
+ op_ptr.offset = offset;
+
+ /* put in the new offset to the ftrace_ops */
+ memcpy(trampoline + op_offset, &op_ptr, OP_REF_SIZE);
+
+ /* put in the call to the function */
+ mutex_lock(&text_mutex);
+ call_offset -= start_offset;
+ /*
+ * No need to translate into a callthunk. The trampoline does
+ * the depth accounting before the call already.
+ */
+ dest = ftrace_ops_get_func(ops);
+ memcpy(trampoline + call_offset,
+ text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, dest),
+ CALL_INSN_SIZE);
+ mutex_unlock(&text_mutex);
+
+ /* The ALLOC_TRAMP flag lets us know we created it */
+ ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
+
+ set_memory_rox((unsigned long)trampoline, npages);
+ return (unsigned long)trampoline;
+fail:
+ tramp_free(trampoline);
return 0;
}
-int ftrace_make_nop(struct module *mod,
- struct dyn_ftrace *rec, unsigned long addr)
+void set_ftrace_ops_ro(void)
{
- unsigned char *new, *old;
- unsigned long ip = rec->ip;
+ struct ftrace_ops *ops;
+ unsigned long start_offset;
+ unsigned long end_offset;
+ unsigned long npages;
+ unsigned long size;
+
+ do_for_each_ftrace_op(ops, ftrace_ops_list) {
+ if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
+ continue;
+
+ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
+ start_offset = (unsigned long)ftrace_regs_caller;
+ end_offset = (unsigned long)ftrace_regs_caller_end;
+ } else {
+ start_offset = (unsigned long)ftrace_caller;
+ end_offset = (unsigned long)ftrace_caller_end;
+ }
+ size = end_offset - start_offset;
+ size = size + RET_SIZE + sizeof(void *);
+ npages = DIV_ROUND_UP(size, PAGE_SIZE);
+ set_memory_ro((unsigned long)ops->trampoline, npages);
+ } while_for_each_ftrace_op(ops);
+}
- old = ftrace_call_replace(ip, addr);
- new = ftrace_nop_replace();
+static unsigned long calc_trampoline_call_offset(bool save_regs)
+{
+ unsigned long start_offset;
+ unsigned long call_offset;
+
+ if (save_regs) {
+ start_offset = (unsigned long)ftrace_regs_caller;
+ call_offset = (unsigned long)ftrace_regs_call;
+ } else {
+ start_offset = (unsigned long)ftrace_caller;
+ call_offset = (unsigned long)ftrace_call;
+ }
- return ftrace_modify_code(rec->ip, old, new);
+ return call_offset - start_offset;
}
-int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
+void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
{
- unsigned char *new, *old;
- unsigned long ip = rec->ip;
+ ftrace_func_t func;
+ unsigned long offset;
+ unsigned long ip;
+ unsigned int size;
+ const char *new;
+
+ if (!ops->trampoline) {
+ ops->trampoline = create_trampoline(ops, &size);
+ if (!ops->trampoline)
+ return;
+ ops->trampoline_size = size;
+ return;
+ }
- old = ftrace_nop_replace();
- new = ftrace_call_replace(ip, addr);
+ /*
+ * The ftrace_ops caller may set up its own trampoline.
+ * In such a case, this code must not modify it.
+ */
+ if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
+ return;
- return ftrace_modify_code(rec->ip, old, new);
+ offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS);
+ ip = ops->trampoline + offset;
+ func = ftrace_ops_get_func(ops);
+
+ mutex_lock(&text_mutex);
+ /* Do a safe modify in case the trampoline is executing */
+ new = ftrace_call_replace(ip, (unsigned long)func);
+ smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
+ mutex_unlock(&text_mutex);
}
-int ftrace_update_ftrace_func(ftrace_func_t func)
+/* Return the address of the function the trampoline calls */
+static void *addr_from_call(void *ptr)
{
- unsigned long ip = (unsigned long)(&ftrace_call);
- unsigned char old[MCOUNT_INSN_SIZE], *new;
+ union text_poke_insn call;
int ret;
- memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE);
- new = ftrace_call_replace(ip, (unsigned long)func);
- ret = ftrace_modify_code(ip, old, new);
+ ret = copy_from_kernel_nofault(&call, ptr, CALL_INSN_SIZE);
+ if (WARN_ON_ONCE(ret < 0))
+ return NULL;
- return ret;
+ /* Make sure this is a call */
+ if (WARN_ON_ONCE(call.opcode != CALL_INSN_OPCODE)) {
+ pr_warn("Expected E8, got %x\n", call.opcode);
+ return NULL;
+ }
+
+ return ptr + CALL_INSN_SIZE + call.disp;
}
-int __init ftrace_dyn_arch_init(void *data)
+/*
+ * If the ops->trampoline was not allocated, then it probably
+ * has a static trampoline func, or is the ftrace caller itself.
+ */
+static void *static_tramp_func(struct ftrace_ops *ops, struct dyn_ftrace *rec)
{
- extern const unsigned char ftrace_test_p6nop[];
- extern const unsigned char ftrace_test_nop5[];
- extern const unsigned char ftrace_test_jmp[];
- int faulted = 0;
-
- /*
- * There is no good nop for all x86 archs.
- * We will default to using the P6_NOP5, but first we
- * will test to make sure that the nop will actually
- * work on this CPU. If it faults, we will then
- * go to a lesser efficient 5 byte nop. If that fails
- * we then just use a jmp as our nop. This isn't the most
- * efficient nop, but we can not use a multi part nop
- * since we would then risk being preempted in the middle
- * of that nop, and if we enabled tracing then, it might
- * cause a system crash.
- *
- * TODO: check the cpuid to determine the best nop.
- */
- asm volatile (
- "ftrace_test_jmp:"
- "jmp ftrace_test_p6nop\n"
- "nop\n"
- "nop\n"
- "nop\n" /* 2 byte jmp + 3 bytes */
- "ftrace_test_p6nop:"
- P6_NOP5
- "jmp 1f\n"
- "ftrace_test_nop5:"
- ".byte 0x66,0x66,0x66,0x66,0x90\n"
- "1:"
- ".section .fixup, \"ax\"\n"
- "2: movl $1, %0\n"
- " jmp ftrace_test_nop5\n"
- "3: movl $2, %0\n"
- " jmp 1b\n"
- ".previous\n"
- _ASM_EXTABLE(ftrace_test_p6nop, 2b)
- _ASM_EXTABLE(ftrace_test_nop5, 3b)
- : "=r"(faulted) : "0" (faulted));
-
- switch (faulted) {
- case 0:
- pr_info("ftrace: converting mcount calls to 0f 1f 44 00 00\n");
- memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE);
- break;
- case 1:
- pr_info("ftrace: converting mcount calls to 66 66 66 66 90\n");
- memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE);
- break;
- case 2:
- pr_info("ftrace: converting mcount calls to jmp . + 5\n");
- memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE);
- break;
+ unsigned long offset;
+ bool save_regs = rec->flags & FTRACE_FL_REGS_EN;
+ void *ptr;
+
+ if (ops && ops->trampoline) {
+#if !defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS) && \
+ defined(CONFIG_FUNCTION_GRAPH_TRACER)
+ /*
+ * We only know about the function graph tracer setting a static
+ * trampoline.
+ */
+ if (ops->trampoline == FTRACE_GRAPH_ADDR)
+ return (void *)prepare_ftrace_return;
+#endif
+ return NULL;
}
- /* The return code is retured via data */
- *(unsigned long *)data = 0;
+ offset = calc_trampoline_call_offset(save_regs);
- return 0;
+ if (save_regs)
+ ptr = (void *)FTRACE_REGS_ADDR + offset;
+ else
+ ptr = (void *)FTRACE_ADDR + offset;
+
+ return addr_from_call(ptr);
}
-#endif
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+void *arch_ftrace_trampoline_func(struct ftrace_ops *ops, struct dyn_ftrace *rec)
+{
+ unsigned long offset;
-#ifdef CONFIG_DYNAMIC_FTRACE
-extern void ftrace_graph_call(void);
+ /* If we didn't allocate this trampoline, consider it static */
+ if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
+ return static_tramp_func(ops, rec);
+
+ offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS);
+ return addr_from_call((void *)ops->trampoline + offset);
+}
-static int ftrace_mod_jmp(unsigned long ip,
- int old_offset, int new_offset)
+void arch_ftrace_trampoline_free(struct ftrace_ops *ops)
{
- unsigned char code[MCOUNT_INSN_SIZE];
+ if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
+ return;
- if (probe_kernel_read(code, (void *)ip, MCOUNT_INSN_SIZE))
- return -EFAULT;
+ tramp_free((void *)ops->trampoline);
+ ops->trampoline = 0;
+}
- if (code[0] != 0xe9 || old_offset != *(int *)(&code[1]))
- return -EINVAL;
+#endif /* CONFIG_X86_64 */
+#endif /* CONFIG_DYNAMIC_FTRACE */
- *(int *)(&code[1]) = new_offset;
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- if (do_ftrace_mod_code(ip, &code))
- return -EPERM;
+#if defined(CONFIG_DYNAMIC_FTRACE) && !defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS)
+extern void ftrace_graph_call(void);
+static const char *ftrace_jmp_replace(unsigned long ip, unsigned long addr)
+{
+ return text_gen_insn(JMP32_INSN_OPCODE, (void *)ip, (void *)addr);
+}
+
+static int ftrace_mod_jmp(unsigned long ip, void *func)
+{
+ const char *new;
+ new = ftrace_jmp_replace(ip, (unsigned long)func);
+ smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
return 0;
}
int ftrace_enable_ftrace_graph_caller(void)
{
unsigned long ip = (unsigned long)(&ftrace_graph_call);
- int old_offset, new_offset;
- old_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
- new_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
-
- return ftrace_mod_jmp(ip, old_offset, new_offset);
+ return ftrace_mod_jmp(ip, &ftrace_graph_caller);
}
int ftrace_disable_ftrace_graph_caller(void)
{
unsigned long ip = (unsigned long)(&ftrace_graph_call);
- int old_offset, new_offset;
-
- old_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
- new_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
- return ftrace_mod_jmp(ip, old_offset, new_offset);
+ return ftrace_mod_jmp(ip, &ftrace_stub);
}
+#endif /* CONFIG_DYNAMIC_FTRACE && !CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */
-#endif /* !CONFIG_DYNAMIC_FTRACE */
-
-/*
- * Hook the return address and push it in the stack of return addrs
- * in current thread info.
- */
-void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
- unsigned long frame_pointer)
+static inline bool skip_ftrace_return(void)
{
- unsigned long old;
- int faulted;
- struct ftrace_graph_ent trace;
- unsigned long return_hooker = (unsigned long)
- &return_to_handler;
-
- /* Nmi's are currently unsupported */
- if (unlikely(in_nmi()))
- return;
-
- if (unlikely(atomic_read(&current->tracing_graph_pause)))
- return;
-
/*
- * Protect against fault, even if it shouldn't
- * happen. This tool is too much intrusive to
- * ignore such a protection.
+ * When resuming from suspend-to-ram, this function can be indirectly
+ * called from early CPU startup code while the CPU is in real mode,
+ * which would fail miserably. Make sure the stack pointer is a
+ * virtual address.
+ *
+ * This check isn't as accurate as virt_addr_valid(), but it should be
+ * good enough for this purpose, and it's fast.
*/
- asm volatile(
- "1: " _ASM_MOV " (%[parent]), %[old]\n"
- "2: " _ASM_MOV " %[return_hooker], (%[parent])\n"
- " movl $0, %[faulted]\n"
- "3:\n"
-
- ".section .fixup, \"ax\"\n"
- "4: movl $1, %[faulted]\n"
- " jmp 3b\n"
- ".previous\n"
-
- _ASM_EXTABLE(1b, 4b)
- _ASM_EXTABLE(2b, 4b)
-
- : [old] "=&r" (old), [faulted] "=r" (faulted)
- : [parent] "r" (parent), [return_hooker] "r" (return_hooker)
- : "memory"
- );
-
- if (unlikely(faulted)) {
- ftrace_graph_stop();
- WARN_ON(1);
- return;
- }
-
- if (ftrace_push_return_trace(old, self_addr, &trace.depth,
- frame_pointer) == -EBUSY) {
- *parent = old;
- return;
- }
+ if ((long)__builtin_frame_address(0) >= 0)
+ return true;
- trace.func = self_addr;
+ if (ftrace_graph_is_dead())
+ return true;
- /* Only trace if the calling function expects to */
- if (!ftrace_graph_entry(&trace)) {
- current->curr_ret_stack--;
- *parent = old;
- }
+ if (atomic_read(&current->tracing_graph_pause))
+ return true;
+ return false;
}
-#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
-
-#ifdef CONFIG_FTRACE_SYSCALLS
-extern unsigned long __start_syscalls_metadata[];
-extern unsigned long __stop_syscalls_metadata[];
-extern unsigned long *sys_call_table;
-
-static struct syscall_metadata **syscalls_metadata;
-
-static struct syscall_metadata *find_syscall_meta(unsigned long *syscall)
+/*
+ * Hook the return address and push it in the stack of return addrs
+ * in current thread info.
+ */
+void prepare_ftrace_return(unsigned long ip, unsigned long *parent,
+ unsigned long frame_pointer)
{
- struct syscall_metadata *start;
- struct syscall_metadata *stop;
- char str[KSYM_SYMBOL_LEN];
-
-
- start = (struct syscall_metadata *)__start_syscalls_metadata;
- stop = (struct syscall_metadata *)__stop_syscalls_metadata;
- kallsyms_lookup((unsigned long) syscall, NULL, NULL, NULL, str);
+ unsigned long return_hooker = (unsigned long)&return_to_handler;
- for ( ; start < stop; start++) {
- if (start->name && !strcmp(start->name, str))
- return start;
- }
- return NULL;
-}
-
-struct syscall_metadata *syscall_nr_to_meta(int nr)
-{
- if (!syscalls_metadata || nr >= FTRACE_SYSCALL_MAX || nr < 0)
- return NULL;
+ if (unlikely(skip_ftrace_return()))
+ return;
- return syscalls_metadata[nr];
+ if (!function_graph_enter(*parent, ip, frame_pointer, parent))
+ *parent = return_hooker;
}
-void arch_init_ftrace_syscalls(void)
+#ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
+void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct ftrace_regs *fregs)
{
- int i;
- struct syscall_metadata *meta;
- unsigned long **psys_syscall_table = &sys_call_table;
- static atomic_t refs;
+ struct pt_regs *regs = &arch_ftrace_regs(fregs)->regs;
+ unsigned long *stack = (unsigned long *)kernel_stack_pointer(regs);
+ unsigned long return_hooker = (unsigned long)&return_to_handler;
+ unsigned long *parent = (unsigned long *)stack;
- if (atomic_inc_return(&refs) != 1)
- goto end;
-
- syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
- FTRACE_SYSCALL_MAX, GFP_KERNEL);
- if (!syscalls_metadata) {
- WARN_ON(1);
+ if (unlikely(skip_ftrace_return()))
return;
- }
- for (i = 0; i < FTRACE_SYSCALL_MAX; i++) {
- meta = find_syscall_meta(psys_syscall_table[i]);
- syscalls_metadata[i] = meta;
- }
- return;
- /* Paranoid: avoid overflow */
-end:
- atomic_dec(&refs);
+ if (!function_graph_enter_regs(*parent, ip, 0, parent, fregs))
+ *parent = return_hooker;
}
#endif
+
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
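ftrace_call_replace() and addr_from_call() above are the two directions of the same arithmetic: a 5-byte x86 CALL/JMP is one opcode byte plus a rel32 displacement measured from the end of the instruction. A minimal sketch under that assumption (the sketch_* helpers are illustrative, not kernel APIs):

    #include <stdint.h>
    #include <string.h>

    /* build a 5-byte CALL rel32 (opcode 0xe8; 0xe9 would give JMP rel32) */
    static void sketch_gen_call(uint8_t insn[5], unsigned long ip, unsigned long dest)
    {
            int32_t rel = (int32_t)(dest - (ip + 5));

            insn[0] = 0xe8;
            memcpy(&insn[1], &rel, sizeof(rel));
    }

    /* recover the destination from a patched call site, as addr_from_call() does */
    static unsigned long sketch_call_dest(const uint8_t insn[5], unsigned long ip)
    {
            int32_t rel;

            memcpy(&rel, &insn[1], sizeof(rel));
            return ip + 5 + rel;
    }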
diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S
new file mode 100644
index 000000000000..f4e0c3361234
--- /dev/null
+++ b/arch/x86/kernel/ftrace_32.S
@@ -0,0 +1,201 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2017 Steven Rostedt, VMware Inc.
+ */
+
+#include <linux/export.h>
+#include <linux/linkage.h>
+#include <asm/page_types.h>
+#include <asm/segment.h>
+#include <asm/ftrace.h>
+#include <asm/nospec-branch.h>
+#include <asm/frame.h>
+#include <asm/asm-offsets.h>
+
+#ifdef CONFIG_FRAME_POINTER
+# define MCOUNT_FRAME 1 /* using frame = true */
+#else
+# define MCOUNT_FRAME 0 /* using frame = false */
+#endif
+
+SYM_FUNC_START(__fentry__)
+ RET
+SYM_FUNC_END(__fentry__)
+EXPORT_SYMBOL(__fentry__)
+
+SYM_CODE_START(ftrace_caller)
+
+#ifdef CONFIG_FRAME_POINTER
+ /*
+ * Frame pointers are of ip followed by bp.
+ * Since fentry is an immediate jump, we are left with
+ * parent-ip, function-ip. We need to add a frame with
+ * parent-ip followed by ebp.
+ */
+ pushl 4(%esp) /* parent ip */
+ pushl %ebp
+ movl %esp, %ebp
+ pushl 2*4(%esp) /* function ip */
+
+ /* For mcount, the function ip is directly above */
+ pushl %ebp
+ movl %esp, %ebp
+#endif
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ pushl $0 /* Pass NULL as regs pointer */
+
+#ifdef CONFIG_FRAME_POINTER
+ /* Load parent ebp into edx */
+ movl 4*4(%esp), %edx
+#else
+ /* There's no frame pointer, load the appropriate stack addr instead */
+ lea 4*4(%esp), %edx
+#endif
+
+ movl (MCOUNT_FRAME+4)*4(%esp), %eax /* load the rip */
+ /* Get the parent ip */
+ movl 4(%edx), %edx /* edx has ebp */
+
+ movl function_trace_op, %ecx
+ subl $MCOUNT_INSN_SIZE, %eax
+
+.globl ftrace_call
+ftrace_call:
+ call ftrace_stub
+
+ addl $4, %esp /* skip NULL pointer */
+ popl %edx
+ popl %ecx
+ popl %eax
+#ifdef CONFIG_FRAME_POINTER
+ popl %ebp
+ addl $4,%esp /* skip function ip */
+ popl %ebp /* this is the orig bp */
+ addl $4, %esp /* skip parent ip */
+#endif
+.Lftrace_ret:
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+ jmp ftrace_stub
+#endif
+
+/* This is weak to keep gas from relaxing the jumps */
+SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK)
+ RET
+SYM_CODE_END(ftrace_caller)
+
+SYM_CODE_START(ftrace_regs_caller)
+ /*
+ * We're here from an mcount/fentry CALL, and the stack frame looks like:
+ *
+ * <previous context>
+ * RET-IP
+ *
+ * The purpose of this function is to call out in an emulated INT3
+ * environment with a stack frame like:
+ *
+ * <previous context>
+ * gap / RET-IP
+ * gap
+ * gap
+ * gap
+ * pt_regs
+ *
+ * We do _NOT_ restore: ss, flags, cs, gs, fs, es, ds
+ */
+ subl $3*4, %esp # RET-IP + 3 gaps
+ pushl %ss # ss
+ pushl %esp # points at ss
+ addl $5*4, (%esp) # make it point at <previous context>
+ pushfl # flags
+ pushl $__KERNEL_CS # cs
+ pushl 7*4(%esp) # ip <- RET-IP
+ pushl $0 # orig_eax
+
+ pushl %gs
+ pushl %fs
+ pushl %es
+ pushl %ds
+
+ pushl %eax
+ pushl %ebp
+ pushl %edi
+ pushl %esi
+ pushl %edx
+ pushl %ecx
+ pushl %ebx
+
+ ENCODE_FRAME_POINTER
+
+ movl PT_EIP(%esp), %eax # 1st argument: IP
+ subl $MCOUNT_INSN_SIZE, %eax
+ movl 21*4(%esp), %edx # 2nd argument: parent ip
+ movl function_trace_op, %ecx # 3rd argument: ftrace_ops
+ pushl %esp # 4th argument: pt_regs
+
+SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL)
+ call ftrace_stub
+
+ addl $4, %esp # skip 4th argument
+
+ /* place IP below the new SP */
+ movl PT_OLDESP(%esp), %eax
+ movl PT_EIP(%esp), %ecx
+ movl %ecx, -4(%eax)
+
+ /* place EAX below that */
+ movl PT_EAX(%esp), %ecx
+ movl %ecx, -8(%eax)
+
+ popl %ebx
+ popl %ecx
+ popl %edx
+ popl %esi
+ popl %edi
+ popl %ebp
+
+ lea -8(%eax), %esp
+ popl %eax
+
+ jmp .Lftrace_ret
+SYM_CODE_END(ftrace_regs_caller)
+
+SYM_FUNC_START(ftrace_stub_direct_tramp)
+ CALL_DEPTH_ACCOUNT
+ RET
+SYM_FUNC_END(ftrace_stub_direct_tramp)
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+SYM_CODE_START(ftrace_graph_caller)
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ movl 3*4(%esp), %eax
+ /* Even with frame pointers, fentry doesn't have one here */
+ lea 4*4(%esp), %edx
+ movl $0, %ecx
+ subl $MCOUNT_INSN_SIZE, %eax
+ call prepare_ftrace_return
+ popl %edx
+ popl %ecx
+ popl %eax
+ RET
+SYM_CODE_END(ftrace_graph_caller)
+
+.globl return_to_handler
+return_to_handler:
+ subl $(PTREGS_SIZE), %esp
+ movl $0, PT_EBP(%esp)
+ movl %edx, PT_EDX(%esp)
+ movl %eax, PT_EAX(%esp)
+ movl %esp, %eax
+ call ftrace_return_to_handler
+ movl %eax, %ecx
+ movl PT_EAX(%esp), %eax
+ movl PT_EDX(%esp), %edx
+ addl $(PTREGS_SIZE), %esp
+ JMP_NOSPEC ecx
+#endif
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
new file mode 100644
index 000000000000..a132608265f6
--- /dev/null
+++ b/arch/x86/kernel/ftrace_64.S
@@ -0,0 +1,402 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2014 Steven Rostedt, Red Hat Inc
+ */
+
+#include <linux/export.h>
+#include <linux/cfi_types.h>
+#include <linux/linkage.h>
+#include <asm/asm-offsets.h>
+#include <asm/ptrace.h>
+#include <asm/ftrace.h>
+#include <asm/nospec-branch.h>
+#include <asm/unwind_hints.h>
+#include <asm/frame.h>
+
+ .code64
+ .section .text, "ax"
+
+#ifdef CONFIG_FRAME_POINTER
+/* Save parent and function stack frames (rip and rbp) */
+# define MCOUNT_FRAME_SIZE (8+16*2)
+#else
+/* No need to save a stack frame */
+# define MCOUNT_FRAME_SIZE 0
+#endif /* CONFIG_FRAME_POINTER */
+
+/* Size of stack used to save mcount regs in save_mcount_regs */
+#define MCOUNT_REG_SIZE (FRAME_SIZE + MCOUNT_FRAME_SIZE)
+
+/*
+ * gcc -pg option adds a call to 'mcount' in most functions.
+ * When -mfentry is used, the call is to 'fentry' and not 'mcount'
+ * and is done before the function's stack frame is set up.
+ * They both require a set of regs to be saved before calling
+ * any C code and restored before returning back to the function.
+ *
+ * On boot up, all these calls are converted into nops. When tracing
+ * is enabled, the call can jump to either ftrace_caller or
+ * ftrace_regs_caller. Callbacks (tracing functions) that require
+ * ftrace_regs_caller (like kprobes) need to have pt_regs passed to
+ * it. For this reason, the size of the pt_regs structure will be
+ * allocated on the stack and the required mcount registers will
+ * be saved in the locations that pt_regs has them in.
+ */
+
+/*
+ * @added: the amount of stack added before calling this
+ *
+ * After this is called, the following registers contain:
+ *
+ * %rdi - holds the address that called the trampoline
+ * %rsi - holds the parent function (traced function's return address)
+ * %rdx - holds the original %rbp
+ */
+.macro save_mcount_regs added=0
+
+#ifdef CONFIG_FRAME_POINTER
+ /* Save the original rbp */
+ pushq %rbp
+
+ /*
+ * Stack traces will stop at the ftrace trampoline if the frame pointer
+ * is not set up properly. If fentry is used, we need to save a frame
+ * pointer for the parent as well as the function traced, because the
+ * fentry is called before the stack frame is set up, whereas mcount
+ * is called afterward.
+ */
+
+ /* Save the parent pointer (skip orig rbp and our return address) */
+ pushq \added+8*2(%rsp)
+ pushq %rbp
+ movq %rsp, %rbp
+ /* Save the return address (now skip orig rbp, rbp and parent) */
+ pushq \added+8*3(%rsp)
+ pushq %rbp
+ movq %rsp, %rbp
+#endif /* CONFIG_FRAME_POINTER */
+
+ /*
+ * We add enough stack to save all regs.
+ */
+ subq $(FRAME_SIZE), %rsp
+ movq %rax, RAX(%rsp)
+ movq %rcx, RCX(%rsp)
+ movq %rdx, RDX(%rsp)
+ movq %rsi, RSI(%rsp)
+ movq %rdi, RDI(%rsp)
+ movq %r8, R8(%rsp)
+ movq %r9, R9(%rsp)
+ movq $0, ORIG_RAX(%rsp)
+ /*
+ * Save the original RBP. Even though the mcount ABI does not
+ * require this, it helps out callers.
+ */
+#ifdef CONFIG_FRAME_POINTER
+ movq MCOUNT_REG_SIZE-8(%rsp), %rdx
+#else
+ movq %rbp, %rdx
+#endif
+ movq %rdx, RBP(%rsp)
+
+ /* Copy the parent address into %rsi (second parameter) */
+ movq MCOUNT_REG_SIZE+8+\added(%rsp), %rsi
+
+ /* Move RIP to its proper location */
+ movq MCOUNT_REG_SIZE+\added(%rsp), %rdi
+ movq %rdi, RIP(%rsp)
+
+ /*
+ * Now %rdi (the first parameter) has the return address of
+ * where ftrace_call returns. But the callbacks expect the
+ * address of the call itself.
+ */
+ subq $MCOUNT_INSN_SIZE, %rdi
+ .endm
+
+.macro restore_mcount_regs save=0
+
+ /* ftrace_regs_caller or frame pointers require this */
+ movq RBP(%rsp), %rbp
+
+ movq R9(%rsp), %r9
+ movq R8(%rsp), %r8
+ movq RDI(%rsp), %rdi
+ movq RSI(%rsp), %rsi
+ movq RDX(%rsp), %rdx
+ movq RCX(%rsp), %rcx
+ movq RAX(%rsp), %rax
+
+ addq $MCOUNT_REG_SIZE-\save, %rsp
+
+ .endm
+
+SYM_TYPED_FUNC_START(ftrace_stub)
+ CALL_DEPTH_ACCOUNT
+ RET
+SYM_FUNC_END(ftrace_stub)
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+SYM_TYPED_FUNC_START(ftrace_stub_graph)
+ CALL_DEPTH_ACCOUNT
+ RET
+SYM_FUNC_END(ftrace_stub_graph)
+#endif
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+SYM_FUNC_START(__fentry__)
+ ANNOTATE_NOENDBR
+ CALL_DEPTH_ACCOUNT
+ RET
+SYM_FUNC_END(__fentry__)
+EXPORT_SYMBOL(__fentry__)
+
+SYM_FUNC_START(ftrace_caller)
+ ANNOTATE_NOENDBR
+ /* save_mcount_regs fills in first two parameters */
+ save_mcount_regs
+
+ CALL_DEPTH_ACCOUNT
+
+ /* Stack - skipping return address of ftrace_caller */
+ leaq MCOUNT_REG_SIZE+8(%rsp), %rcx
+ movq %rcx, RSP(%rsp)
+
+SYM_INNER_LABEL(ftrace_caller_op_ptr, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
+ /* Load the ftrace_ops into the 3rd parameter */
+ movq function_trace_op(%rip), %rdx
+
+ /* regs go into 4th parameter */
+ leaq (%rsp), %rcx
+
+ /* Only ops with REGS flag set should have CS register set */
+ movq $0, CS(%rsp)
+
+ /* Account for the function call below */
+ CALL_DEPTH_ACCOUNT
+
+SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
+ call ftrace_stub
+
+ /* Handlers can change the RIP */
+ movq RIP(%rsp), %rax
+ movq %rax, MCOUNT_REG_SIZE(%rsp)
+
+ restore_mcount_regs
+
+ /*
+ * The code up to this label is copied into trampolines so
+ * think twice before adding any new code or changing the
+ * layout here.
+ */
+SYM_INNER_LABEL(ftrace_caller_end, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
+ RET
+SYM_FUNC_END(ftrace_caller);
+STACK_FRAME_NON_STANDARD_FP(ftrace_caller)
+
+SYM_FUNC_START(ftrace_regs_caller)
+ ANNOTATE_NOENDBR
+ /* Save the current flags before any operations that can change them */
+ pushfq
+
+ /* added 8 bytes to save flags */
+ save_mcount_regs 8
+ /* save_mcount_regs fills in first two parameters */
+
+ CALL_DEPTH_ACCOUNT
+
+SYM_INNER_LABEL(ftrace_regs_caller_op_ptr, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
+ /* Load the ftrace_ops into the 3rd parameter */
+ movq function_trace_op(%rip), %rdx
+
+ /* Save the rest of pt_regs */
+ movq %r15, R15(%rsp)
+ movq %r14, R14(%rsp)
+ movq %r13, R13(%rsp)
+ movq %r12, R12(%rsp)
+ movq %r11, R11(%rsp)
+ movq %r10, R10(%rsp)
+ movq %rbx, RBX(%rsp)
+ /* Copy saved flags */
+ movq MCOUNT_REG_SIZE(%rsp), %rcx
+ movq %rcx, EFLAGS(%rsp)
+ /* Kernel segments */
+ movq $__KERNEL_DS, %rcx
+ movq %rcx, SS(%rsp)
+ movq $__KERNEL_CS, %rcx
+ movq %rcx, CS(%rsp)
+ /* Stack - skipping return address and flags */
+ leaq MCOUNT_REG_SIZE+8*2(%rsp), %rcx
+ movq %rcx, RSP(%rsp)
+
+ ENCODE_FRAME_POINTER
+
+ /* regs go into 4th parameter */
+ leaq (%rsp), %rcx
+
+ /* Account for the function call below */
+ CALL_DEPTH_ACCOUNT
+
+SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
+ call ftrace_stub
+
+ /* Copy flags back to SS, to restore them */
+ movq EFLAGS(%rsp), %rax
+ movq %rax, MCOUNT_REG_SIZE(%rsp)
+
+ /* Handlers can change the RIP */
+ movq RIP(%rsp), %rax
+ movq %rax, MCOUNT_REG_SIZE+8(%rsp)
+
+ /* restore the rest of pt_regs */
+ movq R15(%rsp), %r15
+ movq R14(%rsp), %r14
+ movq R13(%rsp), %r13
+ movq R12(%rsp), %r12
+ movq R10(%rsp), %r10
+ movq RBX(%rsp), %rbx
+
+ movq ORIG_RAX(%rsp), %rax
+ movq %rax, MCOUNT_REG_SIZE-8(%rsp)
+
+ /*
+ * If ORIG_RAX is anything but zero, make this a call to that.
+ * See arch_ftrace_set_direct_caller().
+ */
+ testq %rax, %rax
+SYM_INNER_LABEL(ftrace_regs_caller_jmp, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
+ jnz 1f
+
+ restore_mcount_regs
+ /* Restore flags */
+ popfq
+
+ /*
+ * The trampoline will add the return.
+ */
+SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
+ RET
+
+1:
+ testb $1, %al
+ jz 2f
+ andq $0xfffffffffffffffe, %rax
+ movq %rax, MCOUNT_REG_SIZE+8(%rsp)
+ restore_mcount_regs
+ /* Restore flags */
+ popfq
+ RET
+
+ /* Swap the flags with orig_rax */
+2: movq MCOUNT_REG_SIZE(%rsp), %rdi
+ movq %rdi, MCOUNT_REG_SIZE-8(%rsp)
+ movq %rax, MCOUNT_REG_SIZE(%rsp)
+
+ restore_mcount_regs 8
+ /* Restore flags */
+ popfq
+ UNWIND_HINT_FUNC
+
+ /*
+ * The above left an extra return value on the stack; effectively
+ * doing a tail-call without using a register. This PUSH;RET
+ * pattern unbalances the RSB; inject a pointless CALL to rebalance.
+ */
+ ANNOTATE_INTRA_FUNCTION_CALL
+ CALL .Ldo_rebalance
+ int3
+.Ldo_rebalance:
+ add $8, %rsp
+ ALTERNATIVE __stringify(RET), \
+ __stringify(ANNOTATE_UNRET_SAFE; ret; int3), \
+ X86_FEATURE_CALL_DEPTH
+
+SYM_FUNC_END(ftrace_regs_caller)
+STACK_FRAME_NON_STANDARD_FP(ftrace_regs_caller)
+
+SYM_FUNC_START(ftrace_stub_direct_tramp)
+ ANNOTATE_NOENDBR
+ CALL_DEPTH_ACCOUNT
+ RET
+SYM_FUNC_END(ftrace_stub_direct_tramp)
+
+#else /* ! CONFIG_DYNAMIC_FTRACE */
+
+SYM_FUNC_START(__fentry__)
+ ANNOTATE_NOENDBR
+ CALL_DEPTH_ACCOUNT
+
+ cmpq $ftrace_stub, ftrace_trace_function
+ jnz trace
+ RET
+
+trace:
+ /* save_mcount_regs fills in first two parameters */
+ save_mcount_regs
+
+ /*
+ * When DYNAMIC_FTRACE is not defined, ARCH_SUPPORTS_FTRACE_OPS is not
+ * set (see include/asm/ftrace.h and include/linux/ftrace.h). Only the
+ * ip and parent ip are used and the list function is called when
+ * function tracing is enabled.
+ */
+ movq ftrace_trace_function, %r8
+ CALL_NOSPEC r8
+ restore_mcount_regs
+
+ jmp ftrace_stub
+SYM_FUNC_END(__fentry__)
+EXPORT_SYMBOL(__fentry__)
+STACK_FRAME_NON_STANDARD_FP(__fentry__)
+
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+SYM_CODE_START(return_to_handler)
+ UNWIND_HINT_UNDEFINED
+ ANNOTATE_NOENDBR
+
+ /* Restore return_to_handler value that got eaten by previous ret instruction. */
+ subq $8, %rsp
+ UNWIND_HINT_FUNC
+
+ /* Save ftrace_regs for function exit context */
+ subq $(FRAME_SIZE), %rsp
+
+ movq %rax, RAX(%rsp)
+ movq %rdx, RDX(%rsp)
+ movq %rbp, RBP(%rsp)
+ movq %rsp, RSP(%rsp)
+ movq %rsp, %rdi
+
+ call ftrace_return_to_handler
+
+ movq %rax, %rdi
+ movq RDX(%rsp), %rdx
+ movq RAX(%rsp), %rax
+
+ addq $(FRAME_SIZE) + 8, %rsp
+
+ /*
+ * Jump back to the old return address. This cannot be JMP_NOSPEC rdi
+ * since IBT would demand that it contain ENDBR, which simply isn't so for
+ * return addresses. Use a retpoline here to keep the RSB balanced.
+ */
+ ANNOTATE_INTRA_FUNCTION_CALL
+ call .Ldo_rop
+ int3
+.Ldo_rop:
+ mov %rdi, (%rsp)
+ ALTERNATIVE __stringify(RET), \
+ __stringify(ANNOTATE_UNRET_SAFE; ret; int3), \
+ X86_FEATURE_CALL_DEPTH
+SYM_CODE_END(return_to_handler)
+#endif
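The ftrace_caller_op_ptr / ftrace_regs_caller_op_ptr sites above ("movq function_trace_op(%rip), %rdx") are what create_trampoline() re-points at the ftrace_ops pointer stored behind each trampoline. Since a RIP-relative disp32 is measured from the end of the 7-byte instruction, the patch only recomputes that displacement; a rough sketch (names are illustrative only):

    #include <stdint.h>
    #include <string.h>

    #define SKETCH_OP_REF_SIZE 7                 /* 48 8b 15 <disp32> */

    /* re-point a copied "movq sym(%rip), %rdx" at new_target */
    static void sketch_repoint_riprel(uint8_t *insn, unsigned long new_target)
    {
            int32_t disp = (int32_t)(new_target -
                                     ((unsigned long)insn + SKETCH_OP_REF_SIZE));

            memcpy(insn + 3, &disp, sizeof(disp)); /* bytes 3..6 hold the disp32 */
    }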
diff --git a/arch/x86/kernel/geode_32.c b/arch/x86/kernel/geode_32.c
deleted file mode 100644
index 9b08e852fd1a..000000000000
--- a/arch/x86/kernel/geode_32.c
+++ /dev/null
@@ -1,196 +0,0 @@
-/*
- * AMD Geode southbridge support code
- * Copyright (C) 2006, Advanced Micro Devices, Inc.
- * Copyright (C) 2007, Andres Salomon <dilinger@debian.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public License
- * as published by the Free Software Foundation.
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/ioport.h>
-#include <linux/io.h>
-#include <asm/msr.h>
-#include <asm/geode.h>
-
-static struct {
- char *name;
- u32 msr;
- int size;
- u32 base;
-} lbars[] = {
- { "geode-pms", MSR_LBAR_PMS, LBAR_PMS_SIZE, 0 },
- { "geode-acpi", MSR_LBAR_ACPI, LBAR_ACPI_SIZE, 0 },
- { "geode-gpio", MSR_LBAR_GPIO, LBAR_GPIO_SIZE, 0 },
- { "geode-mfgpt", MSR_LBAR_MFGPT, LBAR_MFGPT_SIZE, 0 }
-};
-
-static void __init init_lbars(void)
-{
- u32 lo, hi;
- int i;
-
- for (i = 0; i < ARRAY_SIZE(lbars); i++) {
- rdmsr(lbars[i].msr, lo, hi);
- if (hi & 0x01)
- lbars[i].base = lo & 0x0000ffff;
-
- if (lbars[i].base == 0)
- printk(KERN_ERR "geode: Couldn't initialize '%s'\n",
- lbars[i].name);
- }
-}
-
-int geode_get_dev_base(unsigned int dev)
-{
- BUG_ON(dev >= ARRAY_SIZE(lbars));
- return lbars[dev].base;
-}
-EXPORT_SYMBOL_GPL(geode_get_dev_base);
-
-/* === GPIO API === */
-
-void geode_gpio_set(u32 gpio, unsigned int reg)
-{
- u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
-
- if (!base)
- return;
-
- /* low bank register */
- if (gpio & 0xFFFF)
- outl(gpio & 0xFFFF, base + reg);
- /* high bank register */
- gpio >>= 16;
- if (gpio)
- outl(gpio, base + 0x80 + reg);
-}
-EXPORT_SYMBOL_GPL(geode_gpio_set);
-
-void geode_gpio_clear(u32 gpio, unsigned int reg)
-{
- u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
-
- if (!base)
- return;
-
- /* low bank register */
- if (gpio & 0xFFFF)
- outl((gpio & 0xFFFF) << 16, base + reg);
- /* high bank register */
- gpio &= (0xFFFF << 16);
- if (gpio)
- outl(gpio, base + 0x80 + reg);
-}
-EXPORT_SYMBOL_GPL(geode_gpio_clear);
-
-int geode_gpio_isset(u32 gpio, unsigned int reg)
-{
- u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
- u32 val;
-
- if (!base)
- return 0;
-
- /* low bank register */
- if (gpio & 0xFFFF) {
- val = inl(base + reg) & (gpio & 0xFFFF);
- if ((gpio & 0xFFFF) == val)
- return 1;
- }
- /* high bank register */
- gpio >>= 16;
- if (gpio) {
- val = inl(base + 0x80 + reg) & gpio;
- if (gpio == val)
- return 1;
- }
- return 0;
-}
-EXPORT_SYMBOL_GPL(geode_gpio_isset);
-
-void geode_gpio_set_irq(unsigned int group, unsigned int irq)
-{
- u32 lo, hi;
-
- if (group > 7 || irq > 15)
- return;
-
- rdmsr(MSR_PIC_ZSEL_HIGH, lo, hi);
-
- lo &= ~(0xF << (group * 4));
- lo |= (irq & 0xF) << (group * 4);
-
- wrmsr(MSR_PIC_ZSEL_HIGH, lo, hi);
-}
-EXPORT_SYMBOL_GPL(geode_gpio_set_irq);
-
-void geode_gpio_setup_event(unsigned int gpio, int pair, int pme)
-{
- u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
- u32 offset, shift, val;
-
- if (gpio >= 24)
- offset = GPIO_MAP_W;
- else if (gpio >= 16)
- offset = GPIO_MAP_Z;
- else if (gpio >= 8)
- offset = GPIO_MAP_Y;
- else
- offset = GPIO_MAP_X;
-
- shift = (gpio % 8) * 4;
-
- val = inl(base + offset);
-
- /* Clear whatever was there before */
- val &= ~(0xF << shift);
-
- /* And set the new value */
-
- val |= ((pair & 7) << shift);
-
- /* Set the PME bit if this is a PME event */
-
- if (pme)
- val |= (1 << (shift + 3));
-
- outl(val, base + offset);
-}
-EXPORT_SYMBOL_GPL(geode_gpio_setup_event);
-
-int geode_has_vsa2(void)
-{
- static int has_vsa2 = -1;
-
- if (has_vsa2 == -1) {
- u16 val;
-
- /*
- * The VSA has virtual registers that we can query for a
- * signature.
- */
- outw(VSA_VR_UNLOCK, VSA_VRC_INDEX);
- outw(VSA_VR_SIGNATURE, VSA_VRC_INDEX);
-
- val = inw(VSA_VRC_DATA);
- has_vsa2 = (val == AMD_VSA_SIG || val == GSW_VSA_SIG);
- }
-
- return has_vsa2;
-}
-EXPORT_SYMBOL_GPL(geode_has_vsa2);
-
-static int __init geode_southbridge_init(void)
-{
- if (!is_geode())
- return -ENODEV;
-
- init_lbars();
- (void) mfgpt_timer_setup();
- return 0;
-}
-
-postcore_initcall(geode_southbridge_init);
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c
deleted file mode 100644
index 3e66bd364a9d..000000000000
--- a/arch/x86/kernel/head.c
+++ /dev/null
@@ -1,55 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/init.h>
-
-#include <asm/setup.h>
-#include <asm/bios_ebda.h>
-
-#define BIOS_LOWMEM_KILOBYTES 0x413
-
-/*
- * The BIOS places the EBDA/XBDA at the top of conventional
- * memory, and usually decreases the reported amount of
- * conventional memory (int 0x12) too. This also contains a
- * workaround for Dell systems that neglect to reserve EBDA.
- * The same workaround also avoids a problem with the AMD768MPX
- * chipset: reserve a page before VGA to prevent PCI prefetch
- * into it (errata #56). Usually the page is reserved anyways,
- * unless you have no PS/2 mouse plugged in.
- */
-void __init reserve_ebda_region(void)
-{
- unsigned int lowmem, ebda_addr;
-
- /* To determine the position of the EBDA and the */
- /* end of conventional memory, we need to look at */
- /* the BIOS data area. In a paravirtual environment */
- /* that area is absent. We'll just have to assume */
- /* that the paravirt case can handle memory setup */
- /* correctly, without our help. */
- if (paravirt_enabled())
- return;
-
- /* end of low (conventional) memory */
- lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
- lowmem <<= 10;
-
- /* start of EBDA area */
- ebda_addr = get_bios_ebda();
-
- /* Fixup: bios puts an EBDA in the top 64K segment */
- /* of conventional memory, but does not adjust lowmem. */
- if ((lowmem - ebda_addr) <= 0x10000)
- lowmem = ebda_addr;
-
- /* Fixup: bios does not report an EBDA at all. */
- /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
- if ((ebda_addr == 0) && (lowmem >= 0x9f000))
- lowmem = 0x9f000;
-
- /* Paranoia: should never happen, but... */
- if ((lowmem == 0) || (lowmem >= 0x100000))
- lowmem = 0x9f000;
-
- /* reserve all memory between lowmem and the 1MB mark */
- reserve_early_overlap_ok(lowmem, 0x100000, "BIOS reserved");
-}
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 3f8579f8d42c..375f2d7f1762 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/arch/i386/kernel/head32.c -- prepare to run common code
*
@@ -7,35 +8,163 @@
#include <linux/init.h>
#include <linux/start_kernel.h>
+#include <linux/mm.h>
+#include <linux/memblock.h>
+#include <asm/desc.h>
#include <asm/setup.h>
#include <asm/sections.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
+#include <asm/page.h>
+#include <asm/apic.h>
+#include <asm/io_apic.h>
#include <asm/bios_ebda.h>
-#include <asm/trampoline.h>
+#include <asm/microcode.h>
+#include <asm/tlbflush.h>
+#include <asm/bootparam_utils.h>
-void __init i386_start_kernel(void)
+static void __init i386_default_early_setup(void)
{
- reserve_trampoline_memory();
+ /* Initialize 32bit specific setup functions */
+ x86_init.resources.reserve_resources = i386_reserve_resources;
+ x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc;
+}
+
+#ifdef CONFIG_MICROCODE_INITRD32
+unsigned long __initdata initrd_start_early;
+static pte_t __initdata *initrd_pl2p_start, *initrd_pl2p_end;
- reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
+static void zap_early_initrd_mapping(void)
+{
+ pte_t *pl2p = initrd_pl2p_start;
-#ifdef CONFIG_BLK_DEV_INITRD
- /* Reserve INITRD */
- if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
- u64 ramdisk_image = boot_params.hdr.ramdisk_image;
- u64 ramdisk_size = boot_params.hdr.ramdisk_size;
- u64 ramdisk_end = ramdisk_image + ramdisk_size;
- reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
+ for (; pl2p < initrd_pl2p_end; pl2p++) {
+ *pl2p = (pte_t){ .pte = 0 };
+
+ if (!IS_ENABLED(CONFIG_X86_PAE))
+ *(pl2p + ((PAGE_OFFSET >> PGDIR_SHIFT))) = (pte_t) {.pte = 0};
}
+}
+#else
+static inline void zap_early_initrd_mapping(void) { }
#endif
- reserve_ebda_region();
- /*
- * At this point everything still needed from the boot loader
- * or BIOS or kernel text should be early reserved or marked not
- * RAM in e820. All other memory is free game.
- */
+asmlinkage __visible void __init __noreturn i386_start_kernel(void)
+{
+ /* Make sure IDT is set up before any exception happens */
+ idt_setup_early_handler();
+
+ load_ucode_bsp();
+ zap_early_initrd_mapping();
+
+ cr4_init_shadow();
+
+ sanitize_boot_params(&boot_params);
+
+ x86_early_init_platform_quirks();
+
+ /* Call the subarch specific early setup function */
+ switch (boot_params.hdr.hardware_subarch) {
+ case X86_SUBARCH_INTEL_MID:
+ x86_intel_mid_early_setup();
+ break;
+ case X86_SUBARCH_CE4100:
+ x86_ce4100_early_setup();
+ break;
+ default:
+ i386_default_early_setup();
+ break;
+ }
start_kernel();
}
+
+/*
+ * Initialize page tables. This creates a PDE and a set of page
+ * tables, which are located immediately beyond __brk_base. The variable
+ * _brk_end is set up to point to the first "safe" location.
+ * Mappings are created both at virtual address 0 (identity mapping)
+ * and PAGE_OFFSET for up to _end.
+ *
+ * In PAE mode initial_page_table is statically defined to contain
+ * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3
+ * entries). The identity mapping is handled by pointing two PGD entries
+ * to the first kernel PMD. Note the upper half of each PMD or PTE is
+ * always zero at this stage.
+ */
+#ifdef CONFIG_X86_PAE
+typedef pmd_t pl2_t;
+#define pl2_base initial_pg_pmd
+#define SET_PL2(val) { .pmd = (val), }
+#else
+typedef pgd_t pl2_t;
+#define pl2_base initial_page_table
+#define SET_PL2(val) { .pgd = (val), }
+#endif
+
+static __init __no_stack_protector pte_t init_map(pte_t pte, pte_t **ptep, pl2_t **pl2p,
+ const unsigned long limit)
+{
+ while ((pte.pte & PTE_PFN_MASK) < limit) {
+ pl2_t pl2 = SET_PL2((unsigned long)*ptep | PDE_IDENT_ATTR);
+ int i;
+
+ **pl2p = pl2;
+ if (!IS_ENABLED(CONFIG_X86_PAE)) {
+ /* Kernel PDE entry */
+ *(*pl2p + ((PAGE_OFFSET >> PGDIR_SHIFT))) = pl2;
+ }
+
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ **ptep = pte;
+ pte.pte += PAGE_SIZE;
+ (*ptep)++;
+ }
+ (*pl2p)++;
+ }
+ return pte;
+}
+
+void __init __no_stack_protector mk_early_pgtbl_32(void)
+{
+ /* Enough space to fit pagetables for the low memory linear map */
+ unsigned long limit = __pa_nodebug(_end) + (PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT);
+ pte_t pte, *ptep = (pte_t *)__pa_nodebug(__brk_base);
+ struct boot_params __maybe_unused *params;
+ pl2_t *pl2p = (pl2_t *)__pa_nodebug(pl2_base);
+ unsigned long *ptr;
+
+ pte.pte = PTE_IDENT_ATTR;
+ pte = init_map(pte, &ptep, &pl2p, limit);
+
+ ptr = (unsigned long *)__pa_nodebug(&max_pfn_mapped);
+ /* Can't use pte_pfn() since it's a call with CONFIG_PARAVIRT */
+ *ptr = (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT;
+
+ ptr = (unsigned long *)__pa_nodebug(&_brk_end);
+ *ptr = (unsigned long)ptep + PAGE_OFFSET;
+
+#ifdef CONFIG_MICROCODE_INITRD32
+ params = (struct boot_params *)__pa_nodebug(&boot_params);
+ if (!params->hdr.ramdisk_size || !params->hdr.ramdisk_image)
+ return;
+
+ /* Save the virtual start address */
+ ptr = (unsigned long *)__pa_nodebug(&initrd_start_early);
+ *ptr = (pte.pte & PTE_PFN_MASK) + PAGE_OFFSET;
+ *ptr += ((unsigned long)params->hdr.ramdisk_image) & ~PAGE_MASK;
+
+ /* Save pl2p for cleanup */
+ ptr = (unsigned long *)__pa_nodebug(&initrd_pl2p_start);
+ *ptr = (unsigned long)pl2p + PAGE_OFFSET;
+
+ limit = (unsigned long)params->hdr.ramdisk_image;
+ pte.pte = PTE_IDENT_ATTR | PFN_ALIGN(limit);
+ limit = (unsigned long)params->hdr.ramdisk_image + params->hdr.ramdisk_size;
+
+ init_map(pte, &ptep, &pl2p, limit);
+
+ ptr = (unsigned long *)__pa_nodebug(&initrd_pl2p_end);
+ *ptr = (unsigned long)pl2p + PAGE_OFFSET;
+#endif
+}
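mk_early_pgtbl_32() above sizes the early mapping by how many page tables the low-memory linear map needs; each page table is one page and maps PTRS_PER_PTE pages. A back-of-the-envelope sketch (the 896 MiB lowmem figure and the non-PAE PTRS_PER_PTE value are assumptions for illustration):

    #include <stdio.h>

    int main(void)
    {
            unsigned long page_size    = 4096;
            unsigned long ptrs_per_pte = 1024;            /* non-PAE; 512 with PAE */
            unsigned long lowmem       = 896UL << 20;     /* example lowmem size */
            unsigned long lowmem_pages = lowmem / page_size;
            unsigned long page_tables  = (lowmem_pages + ptrs_per_pte - 1) / ptrs_per_pte;

            /* each page table is carved out of the __brk_base area */
            printf("%lu page tables, %lu KiB of brk space\n",
                   page_tables, page_tables * page_size / 1024);
            return 0;
    }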
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 70eaa852c732..fd28b53dbac5 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -1,9 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* prepare to run common code
*
* Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
*/
+/* cpu_feature_enabled() cannot be used this early */
+#define USE_EARLY_PGTABLE_L5
+
#include <linux/init.h>
#include <linux/linkage.h>
#include <linux/types.h>
@@ -12,113 +16,308 @@
#include <linux/percpu.h>
#include <linux/start_kernel.h>
#include <linux/io.h>
+#include <linux/memblock.h>
+#include <linux/cc_platform.h>
+#include <linux/pgtable.h>
+#include <asm/asm.h>
+#include <asm/page_64.h>
#include <asm/processor.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/setup.h>
#include <asm/desc.h>
-#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/kdebug.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
#include <asm/bios_ebda.h>
-#include <asm/trampoline.h>
+#include <asm/bootparam_utils.h>
+#include <asm/microcode.h>
+#include <asm/kasan.h>
+#include <asm/fixmap.h>
+#include <asm/realmode.h>
+#include <asm/extable.h>
+#include <asm/trapnr.h>
+#include <asm/sev.h>
+#include <asm/tdx.h>
+#include <asm/init.h>
+
+/*
+ * Manage page tables very early on.
+ */
+extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
+unsigned int __initdata next_early_pgt;
+SYM_PIC_ALIAS(next_early_pgt);
+pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
+
+unsigned int __pgtable_l5_enabled __ro_after_init;
+SYM_PIC_ALIAS(__pgtable_l5_enabled);
+unsigned int pgdir_shift __ro_after_init = 39;
+EXPORT_SYMBOL(pgdir_shift);
+SYM_PIC_ALIAS(pgdir_shift);
+unsigned int ptrs_per_p4d __ro_after_init = 1;
+EXPORT_SYMBOL(ptrs_per_p4d);
+SYM_PIC_ALIAS(ptrs_per_p4d);
+
+unsigned long page_offset_base __ro_after_init = __PAGE_OFFSET_BASE_L4;
+EXPORT_SYMBOL(page_offset_base);
+unsigned long vmalloc_base __ro_after_init = __VMALLOC_BASE_L4;
+EXPORT_SYMBOL(vmalloc_base);
+unsigned long vmemmap_base __ro_after_init = __VMEMMAP_BASE_L4;
+EXPORT_SYMBOL(vmemmap_base);
+
+/* Wipe all early page tables except for the kernel symbol map */
+static void __init reset_early_page_tables(void)
+{
+ memset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1));
+ next_early_pgt = 0;
+ write_cr3(__sme_pa_nodebug(early_top_pgt));
+}
+
+/* Create a new PMD entry */
+bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd)
+{
+ unsigned long physaddr = address - __PAGE_OFFSET;
+ pgdval_t pgd, *pgd_p;
+ p4dval_t p4d, *p4d_p;
+ pudval_t pud, *pud_p;
+ pmdval_t *pmd_p;
+
+	/* Invalid address or early pgt is done? */
+ if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt))
+ return false;
+
+again:
+ pgd_p = &early_top_pgt[pgd_index(address)].pgd;
+ pgd = *pgd_p;
+
+ /*
+ * The use of __START_KERNEL_map rather than __PAGE_OFFSET here is
+ * critical -- __PAGE_OFFSET would point us back into the dynamic
+ * range and we might end up looping forever...
+ */
+ if (!pgtable_l5_enabled())
+ p4d_p = pgd_p;
+ else if (pgd)
+ p4d_p = (p4dval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
+ else {
+ if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
+ reset_early_page_tables();
+ goto again;
+ }
+
+ p4d_p = (p4dval_t *)early_dynamic_pgts[next_early_pgt++];
+ memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D);
+ *pgd_p = (pgdval_t)p4d_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+ }
+ p4d_p += p4d_index(address);
+ p4d = *p4d_p;
+
+ if (p4d)
+ pud_p = (pudval_t *)((p4d & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
+ else {
+ if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
+ reset_early_page_tables();
+ goto again;
+ }
-static void __init zap_identity_mappings(void)
+ pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++];
+ memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
+ *p4d_p = (p4dval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+ }
+ pud_p += pud_index(address);
+ pud = *pud_p;
+
+ if (pud)
+ pmd_p = (pmdval_t *)((pud & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
+ else {
+ if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
+ reset_early_page_tables();
+ goto again;
+ }
+
+ pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++];
+ memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
+ *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+ }
+ pmd_p[pmd_index(address)] = pmd;
+
+ return true;
+}
+
+static bool __init early_make_pgtable(unsigned long address)
{
- pgd_t *pgd = pgd_offset_k(0UL);
- pgd_clear(pgd);
- __flush_tlb_all();
+ unsigned long physaddr = address - __PAGE_OFFSET;
+ pmdval_t pmd;
+
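+	/* Map the whole 2M (PMD-sized) region containing the faulting address. */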
+ pmd = (physaddr & PMD_MASK) + early_pmd_flags;
+
+ return __early_make_pgtable(address, pmd);
+}
+
+void __init do_early_exception(struct pt_regs *regs, int trapnr)
+{
+ if (trapnr == X86_TRAP_PF &&
+ early_make_pgtable(native_read_cr2()))
+ return;
+
+ if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT) &&
+ trapnr == X86_TRAP_VC && handle_vc_boot_ghcb(regs))
+ return;
+
+ if (trapnr == X86_TRAP_VE && tdx_early_handle_ve(regs))
+ return;
+
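+	/* Nothing above handled it: fall back to the regular early exception fixup path. */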
+ early_fixup_exception(regs, trapnr);
}
/* Don't add a printk in here. printk relies on the PDA which is not initialized
   yet. */
-static void __init clear_bss(void)
+void __init clear_bss(void)
{
memset(__bss_start, 0,
(unsigned long) __bss_stop - (unsigned long) __bss_start);
+ memset(__brk_base, 0,
+ (unsigned long) __brk_limit - (unsigned long) __brk_base);
+}
+
+static unsigned long get_cmd_line_ptr(void)
+{
+ unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr;
+
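+	/* ext_cmd_line_ptr supplies the upper 32 bits for command lines placed above 4 GiB. */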
+ cmd_line_ptr |= (u64)boot_params.ext_cmd_line_ptr << 32;
+
+ return cmd_line_ptr;
}
static void __init copy_bootdata(char *real_mode_data)
{
char * command_line;
+ unsigned long cmd_line_ptr;
- memcpy(&boot_params, real_mode_data, sizeof boot_params);
- if (boot_params.hdr.cmd_line_ptr) {
- command_line = __va(boot_params.hdr.cmd_line_ptr);
+ /*
+ * If SME is active, this will create decrypted mappings of the
+ * boot data in advance of the copy operations.
+ */
+ sme_map_bootdata(real_mode_data);
+
+ memcpy(&boot_params, real_mode_data, sizeof(boot_params));
+ sanitize_boot_params(&boot_params);
+ cmd_line_ptr = get_cmd_line_ptr();
+ if (cmd_line_ptr) {
+ command_line = __va(cmd_line_ptr);
memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
}
+
+ /*
+ * The old boot data is no longer needed and won't be reserved,
+ * freeing up that memory for use by the system. If SME is active,
+ * we need to remove the mappings that were created so that the
+ * memory doesn't remain mapped as decrypted.
+ */
+ sme_unmap_bootdata(real_mode_data);
}
-void __init x86_64_start_kernel(char * real_mode_data)
+asmlinkage __visible void __init __noreturn x86_64_start_kernel(char * real_mode_data)
{
- int i;
-
/*
* Build-time sanity checks on the kernel image and module
* area mappings. (these are purely build-time and produce no code)
*/
- BUILD_BUG_ON(MODULES_VADDR < KERNEL_IMAGE_START);
- BUILD_BUG_ON(MODULES_VADDR-KERNEL_IMAGE_START < KERNEL_IMAGE_SIZE);
+ BUILD_BUG_ON(MODULES_VADDR < __START_KERNEL_map);
+ BUILD_BUG_ON(MODULES_VADDR - __START_KERNEL_map < KERNEL_IMAGE_SIZE);
BUILD_BUG_ON(MODULES_LEN + KERNEL_IMAGE_SIZE > 2*PUD_SIZE);
- BUILD_BUG_ON((KERNEL_IMAGE_START & ~PMD_MASK) != 0);
+ BUILD_BUG_ON((__START_KERNEL_map & ~PMD_MASK) != 0);
BUILD_BUG_ON((MODULES_VADDR & ~PMD_MASK) != 0);
BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
- BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
+ MAYBE_BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
(__START_KERNEL & PGDIR_MASK)));
BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
- /* clear bss before set_intr_gate with early_idt_handler */
+ cr4_init_shadow();
+
+ /* Kill off the identity-map trampoline */
+ reset_early_page_tables();
+
+ if (pgtable_l5_enabled()) {
+ page_offset_base = __PAGE_OFFSET_BASE_L5;
+ vmalloc_base = __VMALLOC_BASE_L5;
+ vmemmap_base = __VMEMMAP_BASE_L5;
+ }
+
clear_bss();
- /* Make NULL pointers segfault */
- zap_identity_mappings();
+ /*
+	 * This needs to happen *before* kasan_early_init() because the latter
+	 * maps stuff into that page.
+ */
+ clear_page(init_top_pgt);
- /* Cleanup the over mapped high alias */
- cleanup_highmap();
+ /*
+ * SME support may update early_pmd_flags to include the memory
+ * encryption mask, so it needs to be called before anything
+ * that may generate a page fault.
+ */
+ sme_early_init();
- for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
-#ifdef CONFIG_EARLY_PRINTK
- set_intr_gate(i, &early_idt_handlers[i]);
-#else
- set_intr_gate(i, early_idt_handler);
-#endif
- }
- load_idt((const struct desc_ptr *)&idt_descr);
+ kasan_early_init();
- if (console_loglevel == 10)
- early_printk("Kernel alive\n");
+ /*
+ * Flush global TLB entries which could be left over from the trampoline page
+ * table.
+ *
+ * This needs to happen *after* kasan_early_init() as KASAN-enabled .configs
+ * instrument native_write_cr4() so KASAN must be initialized for that
+ * instrumentation to work.
+ */
+ __native_tlb_flush_global(this_cpu_read(cpu_tlbstate.cr4));
+
+ idt_setup_early_handler();
+
+ /* Needed before cc_platform_has() can be used for TDX */
+ tdx_early_init();
+
+ copy_bootdata(__va(real_mode_data));
+
+ /*
+ * Load microcode early on BSP.
+ */
+ load_ucode_bsp();
+
+	/* Set up the init_top_pgt kernel high mapping */
+ init_top_pgt[511] = early_top_pgt[511];
x86_64_start_reservations(real_mode_data);
}
-void __init x86_64_start_reservations(char *real_mode_data)
+void __init __noreturn x86_64_start_reservations(char *real_mode_data)
{
- copy_bootdata(__va(real_mode_data));
+	/* The version field is always non-zero once the boot data has been copied */
+ if (!boot_params.hdr.version)
+ copy_bootdata(__va(real_mode_data));
- reserve_trampoline_memory();
+ x86_early_init_platform_quirks();
- reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
-
-#ifdef CONFIG_BLK_DEV_INITRD
- /* Reserve INITRD */
- if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
- unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
- unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
- unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
- reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
+ switch (boot_params.hdr.hardware_subarch) {
+ case X86_SUBARCH_INTEL_MID:
+ x86_intel_mid_early_setup();
+ break;
+ default:
+ break;
}
-#endif
- reserve_ebda_region();
+ start_kernel();
+}
- /*
- * At this point everything still needed from the boot loader
- * or BIOS or kernel text should be early reserved or marked not
- * RAM in e820. All other memory is free game.
- */
+void early_setup_idt(void)
+{
+ void *handler = NULL;
- start_kernel();
+ if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) {
+ setup_ghcb();
+ handler = vc_boot_ghcb;
+ }
+
+ __pi_startup_64_load_idt(handler);
}
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 0d98a01cbdb2..80ef5d386b03 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
*
* Copyright (C) 1991, 1992 Linus Torvalds
@@ -7,6 +8,7 @@
*/
.text
+#include <linux/export.h>
#include <linux/threads.h>
#include <linux/init.h>
#include <linux/linkage.h>
@@ -18,7 +20,13 @@
#include <asm/asm-offsets.h>
#include <asm/setup.h>
#include <asm/processor-flags.h>
+#include <asm/msr-index.h>
+#include <asm/cpufeatures.h>
#include <asm/percpu.h>
+#include <asm/nops.h>
+#include <asm/nospec-branch.h>
+#include <asm/bootparam.h>
+#include <asm/pgtable_32.h>
/* Physical address */
#define pa(X) ((X) - __PAGE_OFFSET)
@@ -30,46 +38,20 @@
#define X86 new_cpu_data+CPUINFO_x86
#define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor
#define X86_MODEL new_cpu_data+CPUINFO_x86_model
-#define X86_MASK new_cpu_data+CPUINFO_x86_mask
+#define X86_STEPPING new_cpu_data+CPUINFO_x86_stepping
#define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math
#define X86_CPUID new_cpu_data+CPUINFO_cpuid_level
#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability
#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id
/*
- * This is how much memory in addition to the memory covered up to
- * and including _end we need mapped initially.
- * We need:
- * (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE)
- * (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE)
- *
- * Modulo rounding, each megabyte assigned here requires a kilobyte of
- * memory, which is currently unreclaimed.
- *
- * This should be a multiple of a page.
- *
- * KERNEL_IMAGE_SIZE should be greater than pa(_end)
- * and small than max_low_pfn, otherwise will waste some page table entries
- */
-
-#if PTRS_PER_PMD > 1
-#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD)
-#else
-#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
-#endif
-
-/* Enough space to fit pagetables for the low memory linear map */
-MAPPING_BEYOND_END = \
- PAGE_TABLE_SIZE(((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) << PAGE_SHIFT
-
-/*
* Worst-case size of the kernel mapping we need to make:
- * the worst-case size of the kernel itself, plus the extra we need
- * to map for the linear map.
+ * a relocatable kernel can live anywhere in lowmem, so we need to be able
+ * to map all of lowmem.
*/
-KERNEL_PAGES = (KERNEL_IMAGE_SIZE + MAPPING_BEYOND_END)>>PAGE_SHIFT
+KERNEL_PAGES = LOWMEM_PAGES
-INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE_asm
+INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE
RESERVE_BRK(pagetables, INIT_MAP_SIZE)
/*
@@ -79,13 +61,10 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE)
* any particular GDT layout, because we load our own as soon as we
* can.
*/
-.section .text.head,"ax",@progbits
-ENTRY(startup_32)
- /* test KEEP_SEGMENTS flag to see if the bootloader is asking
- us to not reload segments */
- testb $(1<<6), BP_loadflags(%esi)
- jnz 2f
-
+ __INIT
+SYM_CODE_START(startup_32)
+ movl pa(initial_stack),%ecx
+
/*
* Set segments to known values.
*/
@@ -95,7 +74,8 @@ ENTRY(startup_32)
movl %eax,%es
movl %eax,%fs
movl %eax,%gs
-2:
+ movl %eax,%ss
+ leal -__PAGE_OFFSET(%ecx),%esp
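+
+/*
+ * The stack pointer set up above is the physical alias of initial_stack
+ * (which holds a virtual address), since paging is not enabled yet.
+ */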
/*
* Clear BSS first so that there are no surprises...
@@ -106,7 +86,7 @@ ENTRY(startup_32)
movl $pa(__bss_stop),%ecx
subl %edi,%ecx
shrl $2,%ecx
- rep ; stosl
+ rep stosl
/*
* Copy bootup parameters out of the way.
* Note: %esi still has the pointer to the real-mode data.
@@ -118,140 +98,36 @@ ENTRY(startup_32)
movl $pa(boot_params),%edi
movl $(PARAM_SIZE/4),%ecx
cld
- rep
- movsl
+ rep movsl
movl pa(boot_params) + NEW_CL_POINTER,%esi
andl %esi,%esi
- jz 1f # No comand line
+ jz 1f # No command line
movl $pa(boot_command_line),%edi
movl $(COMMAND_LINE_SIZE/4),%ecx
- rep
- movsl
+ rep movsl
1:
-#ifdef CONFIG_PARAVIRT
- /* This is can only trip for a broken bootloader... */
- cmpw $0x207, pa(boot_params + BP_version)
- jb default_entry
-
- /* Paravirt-compatible boot parameters. Look to see what architecture
- we're booting under. */
- movl pa(boot_params + BP_hardware_subarch), %eax
- cmpl $num_subarch_entries, %eax
- jae bad_subarch
-
- movl pa(subarch_entries)(,%eax,4), %eax
- subl $__PAGE_OFFSET, %eax
- jmp *%eax
-
-bad_subarch:
-WEAK(lguest_entry)
-WEAK(xen_entry)
- /* Unknown implementation; there's really
- nothing we can do at this point. */
- ud2a
-
- __INITDATA
-
-subarch_entries:
- .long default_entry /* normal x86/PC */
- .long lguest_entry /* lguest hypervisor */
- .long xen_entry /* Xen hypervisor */
-num_subarch_entries = (. - subarch_entries) / 4
-.previous
-#endif /* CONFIG_PARAVIRT */
-
-/*
- * Initialize page tables. This creates a PDE and a set of page
- * tables, which are located immediately beyond __brk_base. The variable
- * _brk_end is set up to point to the first "safe" location.
- * Mappings are created both at virtual address 0 (identity mapping)
- * and PAGE_OFFSET for up to _end.
- *
- * Note that the stack is not yet set up!
- */
-default_entry:
-#ifdef CONFIG_X86_PAE
+#ifdef CONFIG_OLPC
+ /* save OFW's pgdir table for later use when calling into OFW */
+ movl %cr3, %eax
+ movl %eax, pa(olpc_ofw_pgd)
+#endif
- /*
- * In PAE mode swapper_pg_dir is statically defined to contain enough
- * entries to cover the VMSPLIT option (that is the top 1, 2 or 3
- * entries). The identity mapping is handled by pointing two PGD
- * entries to the first kernel PMD.
- *
- * Note the upper half of each PMD or PTE are always zero at
- * this stage.
- */
+ /* Create early pagetables. */
+ call mk_early_pgtbl_32
+ /* Do early initialization of the fixmap area */
+ movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
+#ifdef CONFIG_X86_PAE
#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */
+ movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8)
+#else
+ movl %eax,pa(initial_page_table+0xffc)
+#endif
- xorl %ebx,%ebx /* %ebx is kept at zero */
-
- movl $pa(__brk_base), %edi
- movl $pa(swapper_pg_pmd), %edx
- movl $PTE_IDENT_ATTR, %eax
-10:
- leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */
- movl %ecx,(%edx) /* Store PMD entry */
- /* Upper half already zero */
- addl $8,%edx
- movl $512,%ecx
-11:
- stosl
- xchgl %eax,%ebx
- stosl
- xchgl %eax,%ebx
- addl $0x1000,%eax
- loop 11b
-
- /*
- * End condition: we must map up to the end + MAPPING_BEYOND_END.
- */
- movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
- cmpl %ebp,%eax
- jb 10b
-1:
- addl $__PAGE_OFFSET, %edi
- movl %edi, pa(_brk_end)
- shrl $12, %eax
- movl %eax, pa(max_pfn_mapped)
-
- /* Do early initialization of the fixmap area */
- movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
- movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8)
-#else /* Not PAE */
-
-page_pde_offset = (__PAGE_OFFSET >> 20);
-
- movl $pa(__brk_base), %edi
- movl $pa(swapper_pg_dir), %edx
- movl $PTE_IDENT_ATTR, %eax
-10:
- leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */
- movl %ecx,(%edx) /* Store identity PDE entry */
- movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */
- addl $4,%edx
- movl $1024, %ecx
-11:
- stosl
- addl $0x1000,%eax
- loop 11b
- /*
- * End condition: we must map up to the end + MAPPING_BEYOND_END.
- */
- movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
- cmpl %ebp,%eax
- jb 10b
- addl $__PAGE_OFFSET, %edi
- movl %edi, pa(_brk_end)
- shrl $12, %eax
- movl %eax, pa(max_pfn_mapped)
+ jmp .Ldefault_entry
+SYM_CODE_END(startup_32)
- /* Do early initialization of the fixmap area */
- movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
- movl %eax,pa(swapper_pg_dir+0xffc)
-#endif
- jmp 3f
/*
* Non-boot CPU entry point; entered from trampoline.S
* We can't lgdt here, because lgdt itself uses a data segment, but
@@ -260,131 +136,110 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
* If cpu hotplug is not supported then this code can go in init section
* which will be freed later
*/
-
-#ifndef CONFIG_HOTPLUG_CPU
-.section .init.text,"ax",@progbits
+#ifdef CONFIG_HOTPLUG_CPU
+ .text
#endif
-
-#ifdef CONFIG_SMP
-ENTRY(startup_32_smp)
+SYM_FUNC_START(startup_32_smp)
cld
movl $(__BOOT_DS),%eax
movl %eax,%ds
movl %eax,%es
movl %eax,%fs
movl %eax,%gs
-#endif /* CONFIG_SMP */
-3:
+ movl pa(initial_stack),%ecx
+ movl %eax,%ss
+ leal -__PAGE_OFFSET(%ecx),%esp
+
+.Ldefault_entry:
+ movl $(CR0_STATE & ~X86_CR0_PG),%eax
+ movl %eax,%cr0
/*
- * New page tables may be in 4Mbyte page mode and may
- * be using the global pages.
- *
- * NOTE! If we are on a 486 we may have no cr4 at all!
- * So we do not try to touch it unless we really have
- * some bits in it to set. This won't work if the BSP
- * implements cr4 but this AP does not -- very unlikely
- * but be warned! The same applies to the pse feature
- * if not equally supported. --macro
+ * We want to start out with EFLAGS unambiguously cleared. Some BIOSes leave
+ * bits like NT set. This would confuse the debugger if this code is traced. So
+ * initialize them properly now before switching to protected mode. That means
+ * DF in particular (even though we have cleared it earlier after copying the
+ * command line) because GCC expects it.
+ */
+ pushl $0
+ popfl
+
+/*
+ * New page tables may be in 4Mbyte page mode and may be using the global pages.
*
- * NOTE! We have to correct for the fact that we're
- * not yet offset PAGE_OFFSET..
+ * NOTE! If we are on a 486 we may have no cr4 at all! Specifically, cr4 exists
+ * if and only if CPUID exists and has flags other than the FPU flag set.
*/
-#define cr4_bits pa(mmu_cr4_features)
- movl cr4_bits,%edx
- andl %edx,%edx
- jz 6f
- movl %cr4,%eax # Turn on paging options (PSE,PAE,..)
- orl %edx,%eax
+ movl $-1,pa(X86_CPUID) # preset CPUID level
+ movl $X86_EFLAGS_ID,%ecx
+ pushl %ecx
+ popfl # set EFLAGS=ID
+ pushfl
+ popl %eax # get EFLAGS
+	testl $X86_EFLAGS_ID,%eax	# did EFLAGS.ID remain set?
+ jz .Lenable_paging # hw disallowed setting of ID bit
+ # which means no CPUID and no CR4
+
+ xorl %eax,%eax
+ cpuid
+ movl %eax,pa(X86_CPUID) # save largest std CPUID function
+
+ movl $1,%eax
+ cpuid
+ andl $~1,%edx # Ignore CPUID.FPU
+ jz .Lenable_paging # No flags or only CPUID.FPU = no CR4
+
+ movl pa(mmu_cr4_features),%eax
movl %eax,%cr4
- btl $5, %eax # check if PAE is enabled
- jnc 6f
+ testb $X86_CR4_PAE, %al # check if PAE is enabled
+ jz .Lenable_paging
/* Check if extended functions are implemented */
movl $0x80000000, %eax
cpuid
- cmpl $0x80000000, %eax
- jbe 6f
+ /* Value must be in the range 0x80000001 to 0x8000ffff */
+ subl $0x80000001, %eax
+ cmpl $(0x8000ffff-0x80000001), %eax
+ ja .Lenable_paging
+
+ /* Clear bogus XD_DISABLE bits */
+ call verify_cpu
+
mov $0x80000001, %eax
cpuid
/* Execute Disable bit supported? */
- btl $20, %edx
- jnc 6f
+ btl $(X86_FEATURE_NX & 31), %edx
+ jnc .Lenable_paging
/* Setup EFER (Extended Feature Enable Register) */
- movl $0xc0000080, %ecx
+ movl $MSR_EFER, %ecx
rdmsr
- btsl $11, %eax
+ btsl $_EFER_NX, %eax
/* Make changes effective */
wrmsr
-6:
+.Lenable_paging:
/*
* Enable paging
*/
- movl $pa(swapper_pg_dir),%eax
+ movl $pa(initial_page_table), %eax
movl %eax,%cr3 /* set the page table pointer.. */
- movl %cr0,%eax
- orl $X86_CR0_PG,%eax
+ movl $CR0_STATE,%eax
movl %eax,%cr0 /* ..and set paging (PG) bit */
ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */
1:
- /* Set up the stack pointer */
- lss stack_start,%esp
+ /* Shift the stack pointer to a virtual address */
+ addl $__PAGE_OFFSET, %esp
/*
- * Initialize eflags. Some BIOS's leave bits like NT set. This would
- * confuse the debugger if this code is traced.
- * XXX - best to initialize before switching to protected mode.
+ * Check if it is 486
*/
- pushl $0
- popfl
-
-#ifdef CONFIG_SMP
- cmpb $0, ready
- jz 1f /* Initial CPU cleans BSS */
- jmp checkCPUtype
-1:
-#endif /* CONFIG_SMP */
-
-/*
- * start system 32-bit setup. We need to re-do some of the things done
- * in 16-bit mode for the "real" operations.
- */
- call setup_idt
-
-checkCPUtype:
-
- movl $-1,X86_CPUID # -1 for no CPUID initially
-
-/* check if it is 486 or 386. */
-/*
- * XXX - this does a lot of unnecessary setup. Alignment checks don't
- * apply at our cpl of 0 and the stack ought to be aligned already, and
- * we don't need to preserve eflags.
- */
-
- movb $3,X86 # at least 386
- pushfl # push EFLAGS
- popl %eax # get EFLAGS
- movl %eax,%ecx # save original EFLAGS
- xorl $0x240000,%eax # flip AC and ID bits in EFLAGS
- pushl %eax # copy to EFLAGS
- popfl # set EFLAGS
- pushfl # get new EFLAGS
- popl %eax # put it in eax
- xorl %ecx,%eax # change in flags
- pushl %ecx # restore original EFLAGS
- popfl
- testl $0x40000,%eax # check if AC bit changed
- je is386
-
- movb $4,X86 # at least 486
- testl $0x200000,%eax # check if ID bit changed
- je is486
+ movb $4,X86 # at least 486
+ cmpl $-1,X86_CPUID
+ je .Lis486
/* get vendor info */
xorl %eax,%eax # call CPUID with 0 -> return vendor ID
@@ -395,7 +250,7 @@ checkCPUtype:
movl %ecx,X86_VENDOR_ID+8 # last 4 chars
orl %eax,%eax # do we have processor info as well?
- je is486
+ je .Lis486
movl $1,%eax # Use the CPUID instruction to get CPU type
cpuid
@@ -406,21 +261,17 @@ checkCPUtype:
shrb $4,%al
movb %al,X86_MODEL
	andb $0x0f,%cl			# mask the stepping/revision
- movb %cl,X86_MASK
+ movb %cl,X86_STEPPING
movl %edx,X86_CAPABILITY
-is486: movl $0x50022,%ecx # set AM, WP, NE and MP
- jmp 2f
-
-is386: movl $2,%ecx # set MP
-2: movl %cr0,%eax
+.Lis486:
+ movl $0x50022,%ecx # set AM, WP, NE and MP
+ movl %cr0,%eax
andl $0x80000011,%eax # Save PG,PE,ET
orl %ecx,%eax
movl %eax,%cr0
- call check_x87
lgdt early_gdt_descr
- lidt idt_descr
ljmp $(__KERNEL_CS),$1f
1: movl $(__KERNEL_DS),%eax # reload all the segment registers
movl %eax,%ss # after changing gdt.
@@ -432,145 +283,91 @@ is386: movl $2,%ecx # set MP
movl $(__KERNEL_PERCPU), %eax
movl %eax,%fs # set this cpu's percpu
-#ifdef CONFIG_CC_STACKPROTECTOR
- /*
- * The linker can't handle this by relocation. Manually set
- * base address in stack canary segment descriptor.
- */
- cmpb $0,ready
- jne 1f
- movl $per_cpu__gdt_page,%eax
- movl $per_cpu__stack_canary,%ecx
- subl $20, %ecx
- movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
- shrl $16, %ecx
- movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
- movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax)
-1:
-#endif
- movl $(__KERNEL_STACK_CANARY),%eax
- movl %eax,%gs
+ xorl %eax,%eax
+ movl %eax,%gs # clear possible garbage in %gs
xorl %eax,%eax # Clear LDT
lldt %ax
- cld # gcc2 wants the direction flag cleared at all times
- pushl $0 # fake return address for unwinder
-#ifdef CONFIG_SMP
- movb ready, %cl
- movb $1, ready
- cmpb $0,%cl # the first CPU calls start_kernel
- je 1f
- movl (stack_start), %esp
-1:
-#endif /* CONFIG_SMP */
- jmp *(initial_code)
+ call *(initial_code)
+1: jmp 1b
+SYM_FUNC_END(startup_32_smp)
+
+#include "verify_cpu.S"
+
+__INIT
+SYM_FUNC_START(early_idt_handler_array)
+ # 36(%esp) %eflags
+ # 32(%esp) %cs
+ # 28(%esp) %eip
+	# 24(%esp)	error code
+ i = 0
+ .rept NUM_EXCEPTION_VECTORS
+ .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0
+ pushl $0 # Dummy error code, to make stack frame uniform
+ .endif
+ pushl $i # 20(%esp) Vector number
+ jmp early_idt_handler_common
+ i = i + 1
+ .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc
+ .endr
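+	/*
+	 * Each stub above is padded to EARLY_IDT_HANDLER_SIZE bytes with 0xcc
+	 * (int3) so the array can be indexed by vector number.
+	 */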
+SYM_FUNC_END(early_idt_handler_array)
+
+SYM_CODE_START_LOCAL(early_idt_handler_common)
+ /*
+ * The stack is the hardware frame, an error code or zero, and the
+ * vector number.
+ */
+ cld
-/*
- * We depend on ET to be correct. This checks for 287/387.
- */
-check_x87:
- movb $0,X86_HARD_MATH
- clts
- fninit
- fstsw %ax
- cmpb $0,%al
- je 1f
- movl %cr0,%eax /* no coprocessor: have to set bits */
- xorl $4,%eax /* set EM */
- movl %eax,%cr0
- ret
- ALIGN
-1: movb $1,X86_HARD_MATH
- .byte 0xDB,0xE4 /* fsetpm for 287, ignored by 387 */
- ret
+ incl %ss:early_recursion_flag
+
+ /* The vector number is in pt_regs->gs */
-/*
- * setup_idt
- *
- * sets up a idt with 256 entries pointing to
- * ignore_int, interrupt gates. It doesn't actually load
- * idt - that can be done only after paging has been enabled
- * and the kernel moved to PAGE_OFFSET. Interrupts
- * are enabled elsewhere, when we can be relatively
- * sure everything is ok.
- *
- * Warning: %esi is live across this function.
- */
-setup_idt:
- lea ignore_int,%edx
- movl $(__KERNEL_CS << 16),%eax
- movw %dx,%ax /* selector = 0x0010 = cs */
- movw $0x8E00,%dx /* interrupt gate - dpl=0, present */
-
- lea idt_table,%edi
- mov $256,%ecx
-rp_sidt:
- movl %eax,(%edi)
- movl %edx,4(%edi)
- addl $8,%edi
- dec %ecx
- jne rp_sidt
-
-.macro set_early_handler handler,trapno
- lea \handler,%edx
- movl $(__KERNEL_CS << 16),%eax
- movw %dx,%ax
- movw $0x8E00,%dx /* interrupt gate - dpl=0, present */
- lea idt_table,%edi
- movl %eax,8*\trapno(%edi)
- movl %edx,8*\trapno+4(%edi)
-.endm
-
- set_early_handler handler=early_divide_err,trapno=0
- set_early_handler handler=early_illegal_opcode,trapno=6
- set_early_handler handler=early_protection_fault,trapno=13
- set_early_handler handler=early_page_fault,trapno=14
-
- ret
-
-early_divide_err:
- xor %edx,%edx
- pushl $0 /* fake errcode */
- jmp early_fault
-
-early_illegal_opcode:
- movl $6,%edx
- pushl $0 /* fake errcode */
- jmp early_fault
-
-early_protection_fault:
- movl $13,%edx
- jmp early_fault
-
-early_page_fault:
- movl $14,%edx
- jmp early_fault
-
-early_fault:
cld
-#ifdef CONFIG_PRINTK
- pusha
- movl $(__KERNEL_DS),%eax
- movl %eax,%ds
- movl %eax,%es
- cmpl $2,early_recursion_flag
- je hlt_loop
- incl early_recursion_flag
- movl %cr2,%eax
- pushl %eax
- pushl %edx /* trapno */
- pushl $fault_msg
- call printk
-#endif
- call dump_stack
-hlt_loop:
- hlt
- jmp hlt_loop
+ pushl %fs /* pt_regs->fs (__fsh varies by model) */
+ pushl %es /* pt_regs->es (__esh varies by model) */
+ pushl %ds /* pt_regs->ds (__dsh varies by model) */
+ pushl %eax /* pt_regs->ax */
+ pushl %ebp /* pt_regs->bp */
+ pushl %edi /* pt_regs->di */
+ pushl %esi /* pt_regs->si */
+ pushl %edx /* pt_regs->dx */
+ pushl %ecx /* pt_regs->cx */
+ pushl %ebx /* pt_regs->bx */
+
+ /* Fix up DS and ES */
+ movl $(__KERNEL_DS), %ecx
+ movl %ecx, %ds
+ movl %ecx, %es
+
+ /* Load the vector number into EDX */
+ movl PT_GS(%esp), %edx
+
+ /* Load GS into pt_regs->gs (and maybe clobber __gsh) */
+ movw %gs, PT_GS(%esp)
+
+ movl %esp, %eax /* args are pt_regs (EAX), trapnr (EDX) */
+ call early_fixup_exception
+
+ popl %ebx /* pt_regs->bx */
+ popl %ecx /* pt_regs->cx */
+ popl %edx /* pt_regs->dx */
+ popl %esi /* pt_regs->si */
+ popl %edi /* pt_regs->di */
+ popl %ebp /* pt_regs->bp */
+ popl %eax /* pt_regs->ax */
+ popl %ds /* pt_regs->ds (always ignores __dsh) */
+ popl %es /* pt_regs->es (always ignores __esh) */
+ popl %fs /* pt_regs->fs (always ignores __fsh) */
+ popl %gs /* pt_regs->gs (always ignores __gsh) */
+ decl %ss:early_recursion_flag
+ addl $4, %esp /* pop pt_regs->orig_ax */
+ iret
+SYM_CODE_END(early_idt_handler_common)
/* This is the default interrupt "handler" :-) */
- ALIGN
-ignore_int:
+SYM_FUNC_START(early_ignore_irq)
cld
#ifdef CONFIG_PRINTK
pushl %eax
@@ -589,7 +386,7 @@ ignore_int:
pushl 32(%esp)
pushl 40(%esp)
pushl $int_msg
- call printk
+ call _printk
call dump_stack
@@ -602,85 +399,100 @@ ignore_int:
#endif
iret
-#ifndef CONFIG_HOTPLUG_CPU
- __CPUINITDATA
+hlt_loop:
+ hlt
+ jmp hlt_loop
+SYM_FUNC_END(early_ignore_irq)
+
+__INITDATA
+ .align 4
+SYM_DATA(early_recursion_flag, .long 0)
+
+__REFDATA
+ .align 4
+SYM_DATA(initial_code, .long i386_start_kernel)
+
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
+#define PGD_ALIGN (2 * PAGE_SIZE)
+#define PTI_USER_PGD_FILL 1024
#else
- __REFDATA
+#define PGD_ALIGN (PAGE_SIZE)
+#define PTI_USER_PGD_FILL 0
#endif
-.align 4
-ENTRY(initial_code)
- .long i386_start_kernel
-
/*
* BSS section
*/
-.section ".bss.page_aligned","wa"
- .align PAGE_SIZE_asm
+__PAGE_ALIGNED_BSS
+ .align PGD_ALIGN
#ifdef CONFIG_X86_PAE
-swapper_pg_pmd:
+.globl initial_pg_pmd
+initial_pg_pmd:
.fill 1024*KPMDS,4,0
#else
-ENTRY(swapper_pg_dir)
+.globl initial_page_table
+initial_page_table:
.fill 1024,4,0
#endif
-swapper_pg_fixmap:
+ .align PGD_ALIGN
+initial_pg_fixmap:
.fill 1024,4,0
-ENTRY(empty_zero_page)
+.globl swapper_pg_dir
+ .align PGD_ALIGN
+swapper_pg_dir:
+ .fill 1024,4,0
+ .fill PTI_USER_PGD_FILL,4,0
+.globl empty_zero_page
+empty_zero_page:
.fill 4096,1,0
+EXPORT_SYMBOL(empty_zero_page)
/*
* This starts the data section.
*/
#ifdef CONFIG_X86_PAE
-.section ".data.page_aligned","wa"
+__PAGE_ALIGNED_DATA
/* Page-aligned for the benefit of paravirt? */
- .align PAGE_SIZE_asm
-ENTRY(swapper_pg_dir)
- .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */
+ .align PGD_ALIGN
+SYM_DATA_START(initial_page_table)
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */
# if KPMDS == 3
- .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
- .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0
- .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x2000),0
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x2000),0
# elif KPMDS == 2
.long 0,0
- .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
- .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
# elif KPMDS == 1
.long 0,0
.long 0,0
- .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0
# else
# error "Kernel PMDs should be 1, 2 or 3"
# endif
- .align PAGE_SIZE_asm /* needs to be page-sized too */
-#endif
+ .align PAGE_SIZE /* needs to be page-sized too */
-.data
-ENTRY(stack_start)
- .long init_thread_union+THREAD_SIZE
- .long __BOOT_DS
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
+ /*
+ * PTI needs another page so sync_initial_pagetable() works correctly
+ * and does not scribble over the data which is placed behind the
+ * actual initial_page_table. See clone_pgd_range().
+ */
+ .fill 1024, 4, 0
+#endif
-ready: .byte 0
+SYM_DATA_END(initial_page_table)
+#endif
-early_recursion_flag:
- .long 0
+.data
+.balign 4
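+/*
+ * Loaded by startup_32/startup_32_smp, which convert it to a physical address
+ * because paging is still off at that point.
+ */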
+SYM_DATA(initial_stack, .long __top_init_kernel_stack)
+__INITRODATA
int_msg:
.asciz "Unknown interrupt or fault at: %p %p %p\n"
-fault_msg:
-/* fault info: */
- .ascii "BUG: Int %d: CR2 %p\n"
-/* pusha regs: */
- .ascii " EDI %p ESI %p EBP %p ESP %p\n"
- .ascii " EBX %p EDX %p ECX %p EAX %p\n"
-/* fault frame: */
- .ascii " err %p EIP %p CS %p flg %p\n"
- .ascii "Stack: %p %p %p %p %p %p %p %p\n"
- .ascii " %p %p %p %p %p %p %p %p\n"
- .asciz " %p %p %p %p %p %p %p %p\n"
-
-#include "../../x86/xen/xen-head.S"
+#include "../xen/xen-head.S"
/*
* The IDT and GDT 'descriptors' are a strange 48-bit object
@@ -689,33 +501,29 @@ fault_msg:
* segment size, and 32-bit linear address value:
*/
-.globl boot_gdt_descr
-.globl idt_descr
-
+ .data
ALIGN
# early boot GDT descriptor (must use 1:1 address mapping)
.word 0 # 32 bit align gdt_desc.address
-boot_gdt_descr:
+SYM_DATA_START_LOCAL(boot_gdt_descr)
.word __BOOT_DS+7
.long boot_gdt - __PAGE_OFFSET
-
- .word 0 # 32-bit align idt_desc.address
-idt_descr:
- .word IDT_ENTRIES*8-1 # idt contains 256 entries
- .long idt_table
+SYM_DATA_END(boot_gdt_descr)
# boot GDT descriptor (later on used by CPU#0):
.word 0 # 32 bit align gdt_desc.address
-ENTRY(early_gdt_descr)
+SYM_DATA_START(early_gdt_descr)
.word GDT_ENTRIES*8-1
- .long per_cpu__gdt_page /* Overwritten for secondary CPUs */
+ .long gdt_page /* Overwritten for secondary CPUs */
+SYM_DATA_END(early_gdt_descr)
/*
* The boot_gdt must mirror the equivalent in setup.S and is
* used only for booting.
*/
.align L1_CACHE_BYTES
-ENTRY(boot_gdt)
+SYM_DATA_START(boot_gdt)
.fill GDT_ENTRY_BOOT_CS,8,0
.quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */
.quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */
+SYM_DATA_END(boot_gdt)
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index fa54f78e2a05..21816b48537c 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
- * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit
+ * linux/arch/x86/kernel/head_64.S -- start in 32bit and switch to 64bit
*
* Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
* Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
@@ -8,53 +9,46 @@
* Copyright (C) 2005 Eric Biederman <ebiederm@xmission.com>
*/
-
+#include <linux/export.h>
#include <linux/linkage.h>
#include <linux/threads.h>
#include <linux/init.h>
+#include <linux/pgtable.h>
#include <asm/segment.h>
-#include <asm/pgtable.h>
#include <asm/page.h>
#include <asm/msr.h>
#include <asm/cache.h>
#include <asm/processor-flags.h>
#include <asm/percpu.h>
+#include <asm/nops.h>
+#include "../entry/calling.h"
+#include <asm/nospec-branch.h>
+#include <asm/apicdef.h>
+#include <asm/fixmap.h>
+#include <asm/smp.h>
+#include <asm/thread_info.h>
-#ifdef CONFIG_PARAVIRT
-#include <asm/asm-offsets.h>
-#include <asm/paravirt.h>
-#else
-#define GET_CR2_INTO_RCX movq %cr2, %rcx
-#endif
-
-/* we are not able to switch in one step to the final KERNEL ADRESS SPACE
+/*
+ * We are not able to switch in one step to the final KERNEL ADDRESS SPACE
* because we need identity-mapped pages.
- *
*/
-#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
-
-L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET)
-L3_PAGE_OFFSET = pud_index(__PAGE_OFFSET)
-L4_START_KERNEL = pgd_index(__START_KERNEL_map)
-L3_START_KERNEL = pud_index(__START_KERNEL_map)
-
- .text
- .section .text.head
+ __INIT
.code64
- .globl startup_64
-startup_64:
-
+SYM_CODE_START_NOALIGN(startup_64)
+ UNWIND_HINT_END_OF_STACK
/*
- * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
+ * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
* and someone has loaded an identity mapped page table
* for us. These identity mapped page tables map all of the
* kernel pages and possibly all of memory.
*
- * %esi holds a physical pointer to real_mode_data.
+ * %RSI holds the physical address of the boot_params structure
+ * provided by the bootloader. Preserve it in %R15 so C function calls
+ * will not clobber it.
*
* We come here either directly from a 64bit bootloader, or from
- * arch/x86_64/boot/compressed/head.S.
+ * arch/x86/boot/compressed/head_64.S.
*
 * We only come here initially at boot, nothing else comes here.
*
@@ -62,97 +56,101 @@ startup_64:
* compiled to run at we first fixup the physical addresses in our page
* tables and then reload them.
*/
+ mov %rsi, %r15
- /* Compute the delta between the address I am compiled to run at and the
- * address I am actually running at.
- */
- leaq _text(%rip), %rbp
- subq $_text - __START_KERNEL_map, %rbp
-
- /* Is the address not 2M aligned? */
- movq %rbp, %rax
- andl $~PMD_PAGE_MASK, %eax
- testl %eax, %eax
- jnz bad_address
-
- /* Is the address too large? */
- leaq _text(%rip), %rdx
- movq $PGDIR_SIZE, %rax
- cmpq %rax, %rdx
- jae bad_address
-
- /* Fixup the physical addresses in the page table
- */
- addq %rbp, init_level4_pgt + 0(%rip)
- addq %rbp, init_level4_pgt + (L4_PAGE_OFFSET*8)(%rip)
- addq %rbp, init_level4_pgt + (L4_START_KERNEL*8)(%rip)
+ /* Set up the stack for verify_cpu() */
+ leaq __top_init_kernel_stack(%rip), %rsp
- addq %rbp, level3_ident_pgt + 0(%rip)
+ /*
+ * Set up GSBASE.
+ * Note that on SMP the boot CPU uses the init data section until
+ * the per-CPU areas are set up.
+ */
+ movl $MSR_GS_BASE, %ecx
+ xorl %eax, %eax
+ xorl %edx, %edx
+ wrmsr
- addq %rbp, level3_kernel_pgt + (510*8)(%rip)
- addq %rbp, level3_kernel_pgt + (511*8)(%rip)
+ call __pi_startup_64_setup_gdt_idt
- addq %rbp, level2_fixmap_pgt + (506*8)(%rip)
+ /* Now switch to __KERNEL_CS so IRET works reliably */
+ pushq $__KERNEL_CS
+ leaq .Lon_kernel_cs(%rip), %rax
+ pushq %rax
+ lretq
- /* Add an Identity mapping if I am above 1G */
- leaq _text(%rip), %rdi
- andq $PMD_PAGE_MASK, %rdi
+.Lon_kernel_cs:
+ ANNOTATE_NOENDBR
+ UNWIND_HINT_END_OF_STACK
- movq %rdi, %rax
- shrq $PUD_SHIFT, %rax
- andq $(PTRS_PER_PUD - 1), %rax
- jz ident_complete
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ /*
+ * Activate SEV/SME memory encryption if supported/enabled. This needs to
+ * be done now, since this also includes setup of the SEV-SNP CPUID table,
+ * which needs to be done before any CPUID instructions are executed in
+ * subsequent code. Pass the boot_params pointer as the first argument.
+ */
+ movq %r15, %rdi
+ call __pi_sme_enable
+#endif
- leaq (level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
- leaq level3_ident_pgt(%rip), %rbx
- movq %rdx, 0(%rbx, %rax, 8)
+ /* Sanitize CPU configuration */
+ call verify_cpu
- movq %rdi, %rax
- shrq $PMD_SHIFT, %rax
- andq $(PTRS_PER_PMD - 1), %rax
- leaq __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx
- leaq level2_spare_pgt(%rip), %rbx
- movq %rdx, 0(%rbx, %rax, 8)
-ident_complete:
+ /*
+ * Derive the kernel's physical-to-virtual offset from the physical and
+ * virtual addresses of common_startup_64().
+ */
+ leaq common_startup_64(%rip), %rdi
+ subq .Lcommon_startup_64(%rip), %rdi
/*
- * Fixup the kernel text+data virtual addresses. Note that
- * we might write invalid pmds, when the kernel is relocated
- * cleanup_highmap() fixes this up along with the mappings
- * beyond _end.
+ * Perform pagetable fixups. Additionally, if SME is active, encrypt
+ * the kernel and retrieve the modifier (SME encryption mask if SME
+ * is active) to be added to the initial pgdir entry that will be
+ * programmed into CR3.
*/
+ movq %r15, %rsi
+ call __pi___startup_64
+
+ /* Form the CR3 value being sure to include the CR3 modifier */
+ leaq early_top_pgt(%rip), %rcx
+ addq %rcx, %rax
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ mov %rax, %rdi
- leaq level2_kernel_pgt(%rip), %rdi
- leaq 4096(%rdi), %r8
- /* See if it is a valid page table entry */
-1: testq $1, 0(%rdi)
- jz 2f
- addq %rbp, 0(%rdi)
- /* Go to the next page */
-2: addq $8, %rdi
- cmp %r8, %rdi
- jne 1b
-
- /* Fixup phys_base */
- addq %rbp, phys_base(%rip)
-
-#ifdef CONFIG_X86_TRAMPOLINE
- addq %rbp, trampoline_level4_pgt + 0(%rip)
- addq %rbp, trampoline_level4_pgt + (511*8)(%rip)
+ /*
+ * For SEV guests: Verify that the C-bit is correct. A malicious
+ * hypervisor could lie about the C-bit position to perform a ROP
+ * attack on the guest by writing to the unencrypted stack and wait for
+ * the next RET instruction.
+ */
+ call sev_verify_cbit
#endif
- /* Due to ENTRY(), sometimes the empty space gets filled with
- * zeros. Better take a jmp than relying on empty space being
- * filled with 0x90 (nop)
+ /*
+ * Switch to early_top_pgt which still has the identity mappings
+ * present.
*/
- jmp secondary_startup_64
-ENTRY(secondary_startup_64)
+ movq %rax, %cr3
+
+ /* Branch to the common startup code at its kernel virtual address */
+ ANNOTATE_RETPOLINE_SAFE
+ jmp *.Lcommon_startup_64(%rip)
+SYM_CODE_END(startup_64)
+
+ __INITRODATA
+SYM_DATA_LOCAL(.Lcommon_startup_64, .quad common_startup_64)
+
+ .text
+SYM_CODE_START(secondary_startup_64)
+ UNWIND_HINT_END_OF_STACK
+ ANNOTATE_NOENDBR
/*
- * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
+ * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
* and someone has loaded a mapped page table.
*
- * %esi holds a physical pointer to real_mode_data.
- *
* We come here either from startup_64 (using physical addresses)
* or from trampoline.S (using virtual addresses).
*
@@ -161,59 +159,197 @@ ENTRY(secondary_startup_64)
* after the boot processor executes this code.
*/
- /* Enable PAE mode and PGE */
- movl $(X86_CR4_PAE | X86_CR4_PGE), %eax
- movq %rax, %cr4
+ /* Sanitize CPU configuration */
+ call verify_cpu
- /* Setup early boot stage 4 level pagetables. */
- movq $(init_level4_pgt - __START_KERNEL_map), %rax
- addq phys_base(%rip), %rax
+ /*
+ * The secondary_startup_64_no_verify entry point is only used by
+ * SEV-ES guests. In those guests the call to verify_cpu() would cause
+ * #VC exceptions which can not be handled at this stage of secondary
+ * CPU bringup.
+ *
+ * All non SEV-ES systems, especially Intel systems, need to execute
+ * verify_cpu() above to make sure NX is enabled.
+ */
+SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
+ UNWIND_HINT_END_OF_STACK
+ ANNOTATE_NOENDBR
+
+ /* Clear %R15 which holds the boot_params pointer on the boot CPU */
+ xorl %r15d, %r15d
+
+ /* Derive the runtime physical address of init_top_pgt[] */
+ movq phys_base(%rip), %rax
+ addq $(init_top_pgt - __START_KERNEL_map), %rax
+
+ /*
+ * Retrieve the modifier (SME encryption mask if SME is active) to be
+ * added to the initial pgdir entry that will be programmed into CR3.
+ */
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ addq sme_me_mask(%rip), %rax
+#endif
+ /*
+ * Switch to the init_top_pgt here, away from the trampoline_pgd and
+ * unmap the identity mapped ranges.
+ */
movq %rax, %cr3
- /* Ensure I am executing from virtual addresses */
- movq $1f, %rax
- jmp *%rax
-1:
+SYM_INNER_LABEL(common_startup_64, SYM_L_LOCAL)
+ UNWIND_HINT_END_OF_STACK
+ ANNOTATE_NOENDBR
- /* Check if nx is implemented */
- movl $0x80000001, %eax
- cpuid
- movl %edx,%edi
+ /*
+ * Create a mask of CR4 bits to preserve. Omit PGE in order to flush
+ * global 1:1 translations from the TLBs.
+ *
+ * From the SDM:
+ * "If CR4.PGE is changing from 0 to 1, there were no global TLB
+ * entries before the execution; if CR4.PGE is changing from 1 to 0,
+ * there will be no global TLB entries after the execution."
+ */
+ movl $(X86_CR4_PAE | X86_CR4_LA57), %edx
+#ifdef CONFIG_X86_MCE
+ /*
+ * Preserve CR4.MCE if the kernel will enable #MC support.
+ * Clearing MCE may fault in some environments (that also force #MC
+ * support). Any machine check that occurs before #MC support is fully
+ * configured will crash the system regardless of the CR4.MCE value set
+ * here.
+ */
+ orl $X86_CR4_MCE, %edx
+#endif
+ movq %cr4, %rcx
+ andl %edx, %ecx
- /* Setup EFER (Extended Feature Enable Register) */
- movl $MSR_EFER, %ecx
+ /* Even if ignored in long mode, set PSE uniformly on all logical CPUs. */
+ btsl $X86_CR4_PSE_BIT, %ecx
+ movq %rcx, %cr4
+
+ /*
+ * Set CR4.PGE to re-enable global translations.
+ */
+ btsl $X86_CR4_PGE_BIT, %ecx
+ movq %rcx, %cr4
+
+#ifdef CONFIG_SMP
+ /*
+ * For parallel boot, the APIC ID is read from the APIC, and then
+ * used to look up the CPU number. For booting a single CPU, the
+ * CPU number is encoded in smpboot_control.
+ *
+ * Bit 31 STARTUP_READ_APICID (Read APICID from APIC)
+ * Bit 0-23 CPU# if STARTUP_xx flags are not set
+ */
+ movl smpboot_control(%rip), %ecx
+ testl $STARTUP_READ_APICID, %ecx
+ jnz .Lread_apicid
+ /*
+ * No control bit set, single CPU bringup. CPU number is provided
+ * in bit 0-23. This is also the boot CPU case (CPU number 0).
+ */
+ andl $(~STARTUP_PARALLEL_MASK), %ecx
+ jmp .Lsetup_cpu
+
+.Lread_apicid:
+ /* Check whether X2APIC mode is already enabled */
+ mov $MSR_IA32_APICBASE, %ecx
rdmsr
- btsl $_EFER_SCE, %eax /* Enable System Call */
- btl $20,%edi /* No Execute supported? */
- jnc 1f
- btsl $_EFER_NX, %eax
-1: wrmsr /* Make changes effective */
+ testl $X2APIC_ENABLE, %eax
+ jnz .Lread_apicid_msr
- /* Setup cr0 */
-#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
- X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
- X86_CR0_PG)
- movl $CR0_STATE, %eax
- /* Make changes effective */
- movq %rax, %cr0
+#ifdef CONFIG_X86_X2APIC
+ /*
+ * If system is in X2APIC mode then MMIO base might not be
+ * mapped causing the MMIO read below to fault. Faults can't
+ * be handled at that point.
+ */
+ cmpl $0, x2apic_mode(%rip)
+ jz .Lread_apicid_mmio
- /* Setup a boot time stack */
- movq stack_start(%rip),%rsp
+ /* Force the AP into X2APIC mode. */
+ orl $X2APIC_ENABLE, %eax
+ wrmsr
+ jmp .Lread_apicid_msr
+#endif
- /* zero EFLAGS after setting rsp */
- pushq $0
- popfq
+.Lread_apicid_mmio:
+ /* Read the APIC ID from the fix-mapped MMIO space. */
+ movq apic_mmio_base(%rip), %rcx
+ addq $APIC_ID, %rcx
+ movl (%rcx), %eax
+ shr $24, %eax
+ jmp .Llookup_AP
+
+.Lread_apicid_msr:
+ mov $APIC_X2APIC_ID_MSR, %ecx
+ rdmsr
+
+.Llookup_AP:
+ /* EAX contains the APIC ID of the current CPU */
+ xorl %ecx, %ecx
+ leaq cpuid_to_apicid(%rip), %rbx
+
+.Lfind_cpunr:
+ cmpl (%rbx,%rcx,4), %eax
+ jz .Lsetup_cpu
+ inc %ecx
+#ifdef CONFIG_FORCE_NR_CPUS
+ cmpl $NR_CPUS, %ecx
+#else
+ cmpl nr_cpu_ids(%rip), %ecx
+#endif
+ jb .Lfind_cpunr
+
+ /* APIC ID not found in the table. Drop the trampoline lock and bail. */
+ movq trampoline_lock(%rip), %rax
+ movl $0, (%rax)
+
+1: cli
+ hlt
+ jmp 1b
+
+.Lsetup_cpu:
+ /* Get the per cpu offset for the given CPU# which is in ECX */
+ movq __per_cpu_offset(,%rcx,8), %rdx
+#else
+ xorl %edx, %edx /* zero-extended to clear all of RDX */
+#endif /* CONFIG_SMP */
+
+ /*
+ * Setup a boot time stack - Any secondary CPU will have lost its stack
+ * by now because the cr3-switch above unmaps the real-mode stack.
+ *
+ * RDX contains the per-cpu offset
+ */
+ movq current_task(%rdx), %rax
+ movq TASK_threadsp(%rax), %rsp
+
+ /*
+ * Now that this CPU is running on its own stack, drop the realmode
+ * protection. For the boot CPU the pointer is NULL!
+ */
+ movq trampoline_lock(%rip), %rax
+ testq %rax, %rax
+ jz .Lsetup_gdt
+ movl $0, (%rax)
+.Lsetup_gdt:
/*
* We must switch to a new descriptor in kernel space for the GDT
* because soon the kernel won't have access anymore to the userspace
* addresses where we're currently running on. We have to do that here
* because in 32bit we couldn't load a 64bit linear address.
*/
- lgdt early_gdt_descr(%rip)
-
- /* set up data segments. actually 0 would do too */
- movl $__KERNEL_DS,%eax
+ subq $16, %rsp
+ movw $(GDT_SIZE-1), (%rsp)
+ leaq gdt_page(%rdx), %rax
+ movq %rax, 2(%rsp)
+ lgdt (%rsp)
+ addq $16, %rsp
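+
+	/*
+	 * The 10-byte GDT descriptor (2-byte limit plus 8-byte base pointing
+	 * at this CPU's gdt_page) was built on the stack above and is
+	 * discarded again right after lgdt.
+	 */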
+
+ /* set up data segments */
+ xorl %eax,%eax
movl %eax,%ds
movl %eax,%ss
movl %eax,%es
@@ -226,199 +362,360 @@ ENTRY(secondary_startup_64)
movl %eax,%fs
movl %eax,%gs
- /* Set up %gs.
- *
- * The base of %gs always points to the bottom of the irqstack
- * union. If the stack protector canary is enabled, it is
- * located at %gs:40. Note that, on SMP, the boot cpu uses
- * init data section till per cpu areas are set up.
+ /*
+ * Set up GSBASE.
+ * Note that, on SMP, the boot cpu uses init data section until
+ * the per cpu areas are set up.
*/
movl $MSR_GS_BASE,%ecx
- movq initial_gs(%rip),%rax
- movq %rax,%rdx
- shrq $32,%rdx
- wrmsr
-
- /* esi is pointer to real mode structure with interesting info.
- pass it to C */
- movl %esi, %edi
-
- /* Finally jump to run C code and to be on real kernel address
- * Since we are running on identity-mapped space we have to jump
- * to the full 64bit address, this is only possible as indirect
- * jump. In addition we need to ensure %cs is set so we make this
- * a far return.
+ movl %edx, %eax
+ shrq $32, %rdx
+ wrmsr
+
+ /* Setup and Load IDT */
+ call early_setup_idt
+
+ /* Check if nx is implemented */
+ movl $0x80000001, %eax
+ cpuid
+ movl %edx,%edi
+
+ /* Setup EFER (Extended Feature Enable Register) */
+ movl $MSR_EFER, %ecx
+ rdmsr
+ /*
+ * Preserve current value of EFER for comparison and to skip
+ * EFER writes if no change was made (for TDX guest)
*/
- movq initial_code(%rip),%rax
- pushq $0 # fake return address to stop unwinder
- pushq $__KERNEL_CS # set correct cs
- pushq %rax # target address in negative space
- lretq
+ movl %eax, %edx
+ btsl $_EFER_SCE, %eax /* Enable System Call */
+ btl $20,%edi /* No Execute supported? */
+ jnc 1f
+ btsl $_EFER_NX, %eax
+ btsq $_PAGE_BIT_NX,early_pmd_flags(%rip)
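+	/*
+	 * NX is also set in early_pmd_flags so that mappings created later by
+	 * the early page-fault handler are non-executable.
+	 */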
- /* SMP bootup changes these two */
- __REFDATA
- .align 8
- ENTRY(initial_code)
- .quad x86_64_start_kernel
- ENTRY(initial_gs)
- .quad INIT_PER_CPU_VAR(irq_stack_union)
- __FINITDATA
+ /* Avoid writing EFER if no change was made (for TDX guest) */
+1: cmpl %edx, %eax
+ je 1f
+ xor %edx, %edx
+ wrmsr /* Make changes effective */
+1:
+ /* Setup cr0 */
+ movl $CR0_STATE, %eax
+ /* Make changes effective */
+ movq %rax, %cr0
+
+ /* zero EFLAGS after setting rsp */
+ pushq $0
+ popfq
+
+ /* Pass the boot_params pointer as first argument */
+ movq %r15, %rdi
+
+.Ljump_to_C_code:
+ xorl %ebp, %ebp # clear frame pointer
+ ANNOTATE_RETPOLINE_SAFE
+ callq *initial_code(%rip)
+ ud2
+SYM_CODE_END(secondary_startup_64)
- ENTRY(stack_start)
- .quad init_thread_union+THREAD_SIZE-8
- .word 0
+#include "verify_cpu.S"
+#include "sev_verify_cbit.S"
+
+#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_AMD_MEM_ENCRYPT)
+/*
+ * Entry point for soft restart of a CPU. Invoked from xxx_play_dead() for
+ * restarting the boot CPU or for restarting SEV guest CPUs after CPU hot
+ * unplug. Everything is set up already except the stack.
+ */
+SYM_CODE_START(soft_restart_cpu)
+ ANNOTATE_NOENDBR
+ UNWIND_HINT_END_OF_STACK
+
+ /* Find the idle task stack */
+ movq PER_CPU_VAR(current_task), %rcx
+ movq TASK_threadsp(%rcx), %rsp
+
+ jmp .Ljump_to_C_code
+SYM_CODE_END(soft_restart_cpu)
+#endif
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+/*
+ * VC Exception handler used during early boot when running on kernel
+ * addresses, but before the switch to the idt_table can be made.
+ * The early_idt_handler_array can't be used here because it calls into a lot
+ * of __init code and this handler is also used during CPU offlining/onlining.
+ * Therefore this handler ends up in the .text section so that it stays around
+ * when .init.text is freed.
+ */
+SYM_CODE_START_NOALIGN(vc_boot_ghcb)
+ UNWIND_HINT_IRET_REGS offset=8
+ ENDBR
-bad_address:
- jmp bad_address
+ /* Build pt_regs */
+ PUSH_AND_CLEAR_REGS
- .section ".init.text","ax"
-#ifdef CONFIG_EARLY_PRINTK
- .globl early_idt_handlers
-early_idt_handlers:
+ /* Call C handler */
+ movq %rsp, %rdi
+ movq ORIG_RAX(%rsp), %rsi
+ movq initial_vc_handler(%rip), %rax
+ ANNOTATE_RETPOLINE_SAFE
+ call *%rax
+
+ /* Unwind pt_regs */
+ POP_REGS
+
+ /* Remove Error Code */
+ addq $8, %rsp
+
+ iretq
+SYM_CODE_END(vc_boot_ghcb)
+#endif
+
+ /* Both SMP bootup and ACPI suspend change these variables */
+ __REFDATA
+ .balign 8
+SYM_DATA(initial_code, .quad x86_64_start_kernel)
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+SYM_DATA(initial_vc_handler, .quad handle_vc_boot_ghcb)
+#endif
+
+SYM_DATA(trampoline_lock, .quad 0);
+ __FINITDATA
+
+ __INIT
+SYM_CODE_START(early_idt_handler_array)
i = 0
.rept NUM_EXCEPTION_VECTORS
- movl $i, %esi
- jmp early_idt_handler
+ .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0
+ UNWIND_HINT_IRET_REGS
+ ENDBR
+ pushq $0 # Dummy error code, to make stack frame uniform
+ .else
+ UNWIND_HINT_IRET_REGS offset=8
+ ENDBR
+ .endif
+ pushq $i # 72(%rsp) Vector number
+ jmp early_idt_handler_common
+ UNWIND_HINT_IRET_REGS
i = i + 1
+ .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc
.endr
-#endif
+SYM_CODE_END(early_idt_handler_array)
+ ANNOTATE_NOENDBR // early_idt_handler_array[NUM_EXCEPTION_VECTORS]
+
+SYM_CODE_START_LOCAL(early_idt_handler_common)
+ UNWIND_HINT_IRET_REGS offset=16
+ /*
+ * The stack is the hardware frame, an error code or zero, and the
+ * vector number.
+ */
+ cld
-ENTRY(early_idt_handler)
-#ifdef CONFIG_EARLY_PRINTK
- cmpl $2,early_recursion_flag(%rip)
- jz 1f
incl early_recursion_flag(%rip)
- GET_CR2_INTO_RCX
- movq %rcx,%r9
- xorl %r8d,%r8d # zero for error code
- movl %esi,%ecx # get vector number
- # Test %ecx against mask of vectors that push error code.
- cmpl $31,%ecx
- ja 0f
- movl $1,%eax
- salq %cl,%rax
- testl $0x27d00,%eax
- je 0f
- popq %r8 # get error code
-0: movq 0(%rsp),%rcx # get ip
- movq 8(%rsp),%rdx # get cs
- xorl %eax,%eax
- leaq early_idt_msg(%rip),%rdi
- call early_printk
- cmpl $2,early_recursion_flag(%rip)
- jz 1f
- call dump_stack
-#ifdef CONFIG_KALLSYMS
- leaq early_idt_ripmsg(%rip),%rdi
- movq 0(%rsp),%rsi # get rip again
- call __print_symbol
+
+ /* The vector number is currently in the pt_regs->di slot. */
+ pushq %rsi /* pt_regs->si */
+ movq 8(%rsp), %rsi /* RSI = vector number */
+ movq %rdi, 8(%rsp) /* pt_regs->di = RDI */
+ pushq %rdx /* pt_regs->dx */
+ pushq %rcx /* pt_regs->cx */
+ pushq %rax /* pt_regs->ax */
+ pushq %r8 /* pt_regs->r8 */
+ pushq %r9 /* pt_regs->r9 */
+ pushq %r10 /* pt_regs->r10 */
+ pushq %r11 /* pt_regs->r11 */
+ pushq %rbx /* pt_regs->bx */
+ pushq %rbp /* pt_regs->bp */
+ pushq %r12 /* pt_regs->r12 */
+ pushq %r13 /* pt_regs->r13 */
+ pushq %r14 /* pt_regs->r14 */
+ pushq %r15 /* pt_regs->r15 */
+ UNWIND_HINT_REGS
+
+ movq %rsp,%rdi /* RDI = pt_regs; RSI is already trapnr */
+ call do_early_exception
+
+ decl early_recursion_flag(%rip)
+ jmp restore_regs_and_return_to_kernel
+SYM_CODE_END(early_idt_handler_common)
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+/*
+ * VC Exception handler used during very early boot. The
+ * early_idt_handler_array can't be used because it returns via the
+ * paravirtualized INTERRUPT_RETURN and pv-ops don't work that early.
+ *
+ * XXX it does, fix this.
+ *
+ * This handler will end up in the .init.text section and not be
+ * available to boot secondary CPUs.
+ */
+SYM_CODE_START_NOALIGN(vc_no_ghcb)
+ UNWIND_HINT_IRET_REGS offset=8
+ ENDBR
+
+ /* Build pt_regs */
+ PUSH_AND_CLEAR_REGS
+
+ /* Call C handler */
+ movq %rsp, %rdi
+ movq ORIG_RAX(%rsp), %rsi
+ call __pi_do_vc_no_ghcb
+
+ /* Unwind pt_regs */
+ POP_REGS
+
+ /* Remove Error Code */
+ addq $8, %rsp
+
+ /* Pure iret required here - don't use INTERRUPT_RETURN */
+ iretq
+SYM_CODE_END(vc_no_ghcb)
+SYM_PIC_ALIAS(vc_no_ghcb);
#endif
-#endif /* EARLY_PRINTK */
-1: hlt
- jmp 1b
-
-#ifdef CONFIG_EARLY_PRINTK
-early_recursion_flag:
- .long 0
-
-early_idt_msg:
- .asciz "PANIC: early exception %02lx rip %lx:%lx error %lx cr2 %lx\n"
-early_idt_ripmsg:
- .asciz "RIP %s\n"
-#endif /* CONFIG_EARLY_PRINTK */
- .previous
-
-#define NEXT_PAGE(name) \
- .balign PAGE_SIZE; \
-ENTRY(name)
-
-/* Automate the creation of 1 to 1 mapping pmd entries */
-#define PMDS(START, PERM, COUNT) \
- i = 0 ; \
- .rept (COUNT) ; \
- .quad (START) + (i << PMD_SHIFT) + (PERM) ; \
- i = i + 1 ; \
- .endr
- /*
- * This default setting generates an ident mapping at address 0x100000
- * and a mapping for the kernel that precisely maps virtual address
- * 0xffffffff80000000 to physical address 0x000000. (always using
- * 2Mbyte large pages provided by PAE mode)
- */
-NEXT_PAGE(init_level4_pgt)
- .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .org init_level4_pgt + L4_PAGE_OFFSET*8, 0
- .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .org init_level4_pgt + L4_START_KERNEL*8, 0
- /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
- .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
+/*
+ * Each PGD needs to be 8k long and 8k aligned. We do not
+ * ever go out to userspace with these, so we do not
+ * strictly *need* the second page, but this allows us to
+ * have a single set_pgd() implementation that does not
+ * need to worry about whether it has 4k or 8k to work
+ * with.
+ *
+ * This ensures PGDs are 8k long:
+ */
+#define PTI_USER_PGD_FILL 512
+/* This ensures they are 8k-aligned: */
+#define SYM_DATA_START_PTI_ALIGNED(name) \
+ SYM_START(name, SYM_L_GLOBAL, .balign 2 * PAGE_SIZE)
+#else
+#define SYM_DATA_START_PTI_ALIGNED(name) \
+ SYM_DATA_START_PAGE_ALIGNED(name)
+#define PTI_USER_PGD_FILL 0
+#endif
-NEXT_PAGE(level3_ident_pgt)
- .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+ __INITDATA
+ .balign 4
+
+SYM_DATA_START_PTI_ALIGNED(early_top_pgt)
.fill 511,8,0
+ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
+ .fill PTI_USER_PGD_FILL,8,0
+SYM_DATA_END(early_top_pgt)
+SYM_PIC_ALIAS(early_top_pgt)
-NEXT_PAGE(level3_kernel_pgt)
- .fill L3_START_KERNEL,8,0
- /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
- .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
+SYM_DATA_START_PAGE_ALIGNED(early_dynamic_pgts)
+ .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
+SYM_DATA_END(early_dynamic_pgts)
+SYM_PIC_ALIAS(early_dynamic_pgts);
-NEXT_PAGE(level2_fixmap_pgt)
- .fill 506,8,0
- .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
- /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
- .fill 5,8,0
+SYM_DATA(early_recursion_flag, .long 0)
-NEXT_PAGE(level1_fixmap_pgt)
- .fill 512,8,0
+ .data
-NEXT_PAGE(level2_ident_pgt)
- /* Since I easily can, map the first 1G.
+#if defined(CONFIG_XEN_PV) || defined(CONFIG_PVH)
+SYM_DATA_START_PTI_ALIGNED(init_top_pgt)
+ .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
+ .org init_top_pgt + L4_PAGE_OFFSET*8, 0
+ .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
+ .org init_top_pgt + L4_START_KERNEL*8, 0
+ /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
+ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
+ .fill PTI_USER_PGD_FILL,8,0
+SYM_DATA_END(init_top_pgt)
+
+SYM_DATA_START_PAGE_ALIGNED(level3_ident_pgt)
+ .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
+ .fill 511, 8, 0
+SYM_DATA_END(level3_ident_pgt)
+SYM_DATA_START_PAGE_ALIGNED(level2_ident_pgt)
+ /*
+ * Since I easily can, map the first 1G.
* Don't set NX because code runs from these pages.
+ *
+	 * Note: This sets _PAGE_GLOBAL regardless of whether
+	 * the CPU supports it or whether it is enabled. But
+	 * the CPU should ignore the bit.
*/
PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
+SYM_DATA_END(level2_ident_pgt)
+#else
+SYM_DATA_START_PTI_ALIGNED(init_top_pgt)
+ .fill 512,8,0
+ .fill PTI_USER_PGD_FILL,8,0
+SYM_DATA_END(init_top_pgt)
+#endif
+
+SYM_DATA_START_PAGE_ALIGNED(level4_kernel_pgt)
+ .fill 511,8,0
+ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
+SYM_DATA_END(level4_kernel_pgt)
+SYM_PIC_ALIAS(level4_kernel_pgt)
+
+SYM_DATA_START_PAGE_ALIGNED(level3_kernel_pgt)
+ .fill L3_START_KERNEL,8,0
+ /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
+ .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
+ .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
+SYM_DATA_END(level3_kernel_pgt)
+SYM_PIC_ALIAS(level3_kernel_pgt)
-NEXT_PAGE(level2_kernel_pgt)
+SYM_DATA_START_PAGE_ALIGNED(level2_kernel_pgt)
/*
- * 512 MB kernel mapping. We spend a full page on this pagetable
- * anyway.
+ * Kernel high mapping.
+ *
+ * The kernel code+data+bss must be located below KERNEL_IMAGE_SIZE in
+ * virtual address space, which is 1 GiB if RANDOMIZE_BASE is enabled,
+ * 512 MiB otherwise.
*
- * The kernel code+data+bss must not be bigger than that.
+ * (NOTE: after that starts the module area, see MODULES_VADDR.)
*
- * (NOTE: at +512MB starts the module area, see MODULES_VADDR.
- * If you want to increase this then increase MODULES_VADDR
- * too.)
+ * This table is eventually used by the kernel during normal runtime.
+ * Care must be taken to clear out undesired bits later, like _PAGE_RW
+ * or _PAGE_GLOBAL in some cases.
*/
- PMDS(0, __PAGE_KERNEL_LARGE_EXEC,
- KERNEL_IMAGE_SIZE/PMD_SIZE)
-
-NEXT_PAGE(level2_spare_pgt)
- .fill 512, 8, 0
+ PMDS(0, __PAGE_KERNEL_LARGE_EXEC, KERNEL_IMAGE_SIZE/PMD_SIZE)
+SYM_DATA_END(level2_kernel_pgt)
+SYM_PIC_ALIAS(level2_kernel_pgt)
+
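The PMDS() invocation above is easier to follow as arithmetic: each entry covers one 2 MiB large page, so a 512 MiB KERNEL_IMAGE_SIZE needs 512 MiB / 2 MiB = 256 entries, and the 1 GiB RANDOMIZE_BASE limit fills the whole 512-entry PMD page. A C rendering of what the assembler macro emits (illustrative only):

#include <stdint.h>

#define PMD_SHIFT	21		/* 2 MiB large pages */

static void fill_pmds(uint64_t *pmd, uint64_t start, uint64_t perm,
		      unsigned int count)
{
	unsigned int i;

	/* Mirrors: .quad (START) + (i << PMD_SHIFT) + (PERM), COUNT times */
	for (i = 0; i < count; i++)
		pmd[i] = start + ((uint64_t)i << PMD_SHIFT) + perm;
}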
+SYM_DATA_START_PAGE_ALIGNED(level2_fixmap_pgt)
+ .fill (512 - 4 - FIXMAP_PMD_NUM),8,0
+ pgtno = 0
+ .rept (FIXMAP_PMD_NUM)
+ .quad level1_fixmap_pgt + (pgtno << PAGE_SHIFT) - __START_KERNEL_map \
+ + _PAGE_TABLE_NOENC;
+ pgtno = pgtno + 1
+ .endr
+ /* 6 MB reserved space + a 2MB hole */
+ .fill 4,8,0
+SYM_DATA_END(level2_fixmap_pgt)
+SYM_PIC_ALIAS(level2_fixmap_pgt)
-#undef PMDS
-#undef NEXT_PAGE
+SYM_DATA_START_PAGE_ALIGNED(level1_fixmap_pgt)
+ .rept (FIXMAP_PMD_NUM)
+ .fill 512,8,0
+ .endr
+SYM_DATA_END(level1_fixmap_pgt)
.data
.align 16
- .globl early_gdt_descr
-early_gdt_descr:
- .word GDT_ENTRIES*8-1
-early_gdt_descr_base:
- .quad INIT_PER_CPU_VAR(gdt_page)
-
-ENTRY(phys_base)
- /* This must match the first entry in level2_kernel_pgt */
- .quad 0x0000000000000000
-
-#include "../../x86/xen/xen-head.S"
-
- .section .bss, "aw", @nobits
- .align L1_CACHE_BYTES
-ENTRY(idt_table)
- .skip IDT_ENTRIES * 16
-
- .section .bss.page_aligned, "aw", @nobits
- .align PAGE_SIZE
-ENTRY(empty_zero_page)
+
+SYM_DATA(smpboot_control, .long 0)
+
+ .align 16
+/* This must match the first entry in level2_kernel_pgt */
+SYM_DATA(phys_base, .quad 0x0)
+SYM_PIC_ALIAS(phys_base);
+EXPORT_SYMBOL(phys_base)
+
+#include "../xen/xen-head.S"
+
+ __PAGE_ALIGNED_BSS
+SYM_DATA_START_PAGE_ALIGNED(empty_zero_page)
.skip PAGE_SIZE
+SYM_DATA_END(empty_zero_page)
+EXPORT_SYMBOL(empty_zero_page)
+
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index dedc2bddf7a5..d6387dde3ff9 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -1,72 +1,94 @@
-#include <linux/clocksource.h>
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/clockchips.h>
#include <linux/interrupt.h>
-#include <linux/sysdev.h>
+#include <linux/export.h>
#include <linux/delay.h>
-#include <linux/errno.h>
#include <linux/hpet.h>
-#include <linux/init.h>
#include <linux/cpu.h>
-#include <linux/pm.h>
-#include <linux/io.h>
+#include <linux/irq.h>
-#include <asm/fixmap.h>
-#include <asm/i8253.h>
+#include <asm/cpuid/api.h>
+#include <asm/irq_remapping.h>
#include <asm/hpet.h>
+#include <asm/time.h>
+#include <asm/mwait.h>
+#include <asm/msr.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt) "hpet: " fmt
+
+enum hpet_mode {
+ HPET_MODE_UNUSED,
+ HPET_MODE_LEGACY,
+ HPET_MODE_CLOCKEVT,
+ HPET_MODE_DEVICE,
+};
-#define HPET_MASK CLOCKSOURCE_MASK(32)
-#define HPET_SHIFT 22
+struct hpet_channel {
+ struct clock_event_device evt;
+ unsigned int num;
+ unsigned int cpu;
+ unsigned int irq;
+ unsigned int in_use;
+ enum hpet_mode mode;
+ unsigned int boot_cfg;
+ char name[10];
+};
-/* FSEC = 10^-15
- NSEC = 10^-9 */
-#define FSEC_PER_NSEC 1000000L
+struct hpet_base {
+ unsigned int nr_channels;
+ unsigned int nr_clockevents;
+ unsigned int boot_cfg;
+ struct hpet_channel *channels;
+};
-#define HPET_DEV_USED_BIT 2
-#define HPET_DEV_USED (1 << HPET_DEV_USED_BIT)
-#define HPET_DEV_VALID 0x8
-#define HPET_DEV_FSB_CAP 0x1000
-#define HPET_DEV_PERI_CAP 0x2000
+#define HPET_MASK CLOCKSOURCE_MASK(32)
-#define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt)
+#define HPET_MIN_CYCLES 128
+#define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1))
/*
* HPET address is set in acpi/boot.c, when an ACPI entry exists
*/
unsigned long hpet_address;
-#ifdef CONFIG_PCI_MSI
-static unsigned long hpet_num_timers;
+u8 hpet_blockid; /* OS timer block num */
+bool hpet_msi_disable;
+
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_GENERIC_MSI_IRQ)
+static DEFINE_PER_CPU(struct hpet_channel *, cpu_hpet_channel);
+static struct irq_domain *hpet_domain;
#endif
+
static void __iomem *hpet_virt_address;
-struct hpet_dev {
- struct clock_event_device evt;
- unsigned int num;
- int cpu;
- unsigned int irq;
- unsigned int flags;
- char name[10];
-};
+static struct hpet_base hpet_base;
+
+static bool hpet_legacy_int_enabled;
+static unsigned long hpet_freq;
-unsigned long hpet_readl(unsigned long a)
+bool boot_hpet_disable;
+bool hpet_force_user;
+static bool hpet_verbose;
+
+static inline
+struct hpet_channel *clockevent_to_channel(struct clock_event_device *evt)
+{
+ return container_of(evt, struct hpet_channel, evt);
+}
+
+inline unsigned int hpet_readl(unsigned int a)
{
return readl(hpet_virt_address + a);
}
-static inline void hpet_writel(unsigned long d, unsigned long a)
+static inline void hpet_writel(unsigned int d, unsigned int a)
{
writel(d, hpet_virt_address + a);
}
-#ifdef CONFIG_X86_64
-#include <asm/pgtable.h>
-#endif
-
static inline void hpet_set_mapping(void)
{
- hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
-#ifdef CONFIG_X86_64
- __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE);
-#endif
+ hpet_virt_address = ioremap(hpet_address, HPET_MMAP_SIZE);
}
static inline void hpet_clear_mapping(void)
@@ -78,19 +100,20 @@ static inline void hpet_clear_mapping(void)
/*
* HPET command line enable / disable
*/
-static int boot_hpet_disable;
-int hpet_force_user;
-static int hpet_verbose;
-
static int __init hpet_setup(char *str)
{
- if (str) {
+ while (str) {
+ char *next = strchr(str, ',');
+
+ if (next)
+ *next++ = 0;
if (!strncmp("disable", str, 7))
- boot_hpet_disable = 1;
+ boot_hpet_disable = true;
if (!strncmp("force", str, 5))
- hpet_force_user = 1;
+ hpet_force_user = true;
if (!strncmp("verbose", str, 7))
- hpet_verbose = 1;
+ hpet_verbose = true;
+ str = next;
}
return 1;
}
@@ -98,7 +121,7 @@ __setup("hpet=", hpet_setup);
static int __init disable_hpet(char *str)
{
- boot_hpet_disable = 1;
+ boot_hpet_disable = true;
return 1;
}
__setup("nohpet", disable_hpet);
@@ -108,13 +131,8 @@ static inline int is_hpet_capable(void)
return !boot_hpet_disable && hpet_address;
}
-/*
- * HPET timer interrupt enable / disable
- */
-static int hpet_legacy_int_enabled;
-
/**
- * is_hpet_enabled - check whether the hpet timer interrupt is enabled
+ * is_hpet_enabled - Check whether the legacy HPET timer interrupt is enabled
*/
int is_hpet_enabled(void)
{
@@ -124,67 +142,60 @@ EXPORT_SYMBOL_GPL(is_hpet_enabled);
static void _hpet_print_config(const char *function, int line)
{
- u32 i, timers, l, h;
- printk(KERN_INFO "hpet: %s(%d):\n", function, line);
- l = hpet_readl(HPET_ID);
- h = hpet_readl(HPET_PERIOD);
- timers = ((l & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
- printk(KERN_INFO "hpet: ID: 0x%x, PERIOD: 0x%x\n", l, h);
- l = hpet_readl(HPET_CFG);
- h = hpet_readl(HPET_STATUS);
- printk(KERN_INFO "hpet: CFG: 0x%x, STATUS: 0x%x\n", l, h);
+ u32 i, id, period, cfg, status, channels, l, h;
+
+ pr_info("%s(%d):\n", function, line);
+
+ id = hpet_readl(HPET_ID);
+ period = hpet_readl(HPET_PERIOD);
+ pr_info("ID: 0x%x, PERIOD: 0x%x\n", id, period);
+
+ cfg = hpet_readl(HPET_CFG);
+ status = hpet_readl(HPET_STATUS);
+ pr_info("CFG: 0x%x, STATUS: 0x%x\n", cfg, status);
+
l = hpet_readl(HPET_COUNTER);
h = hpet_readl(HPET_COUNTER+4);
- printk(KERN_INFO "hpet: COUNTER_l: 0x%x, COUNTER_h: 0x%x\n", l, h);
+ pr_info("COUNTER_l: 0x%x, COUNTER_h: 0x%x\n", l, h);
+
+ channels = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
- for (i = 0; i < timers; i++) {
+ for (i = 0; i < channels; i++) {
l = hpet_readl(HPET_Tn_CFG(i));
h = hpet_readl(HPET_Tn_CFG(i)+4);
- printk(KERN_INFO "hpet: T%d: CFG_l: 0x%x, CFG_h: 0x%x\n",
- i, l, h);
+ pr_info("T%d: CFG_l: 0x%x, CFG_h: 0x%x\n", i, l, h);
+
l = hpet_readl(HPET_Tn_CMP(i));
h = hpet_readl(HPET_Tn_CMP(i)+4);
- printk(KERN_INFO "hpet: T%d: CMP_l: 0x%x, CMP_h: 0x%x\n",
- i, l, h);
+ pr_info("T%d: CMP_l: 0x%x, CMP_h: 0x%x\n", i, l, h);
+
l = hpet_readl(HPET_Tn_ROUTE(i));
h = hpet_readl(HPET_Tn_ROUTE(i)+4);
- printk(KERN_INFO "hpet: T%d ROUTE_l: 0x%x, ROUTE_h: 0x%x\n",
- i, l, h);
+ pr_info("T%d ROUTE_l: 0x%x, ROUTE_h: 0x%x\n", i, l, h);
}
}
#define hpet_print_config() \
do { \
if (hpet_verbose) \
- _hpet_print_config(__FUNCTION__, __LINE__); \
+ _hpet_print_config(__func__, __LINE__); \
} while (0)
/*
- * When the hpet driver (/dev/hpet) is enabled, we need to reserve
+ * When the HPET driver (/dev/hpet) is enabled, we need to reserve
* timer 0 and timer 1 in case of RTC emulation.
*/
#ifdef CONFIG_HPET
-static void hpet_reserve_msi_timers(struct hpet_data *hd);
-
-static void hpet_reserve_platform_timers(unsigned long id)
+static void __init hpet_reserve_platform_timers(void)
{
- struct hpet __iomem *hpet = hpet_virt_address;
- struct hpet_timer __iomem *timer = &hpet->hpet_timers[2];
- unsigned int nrtimers, i;
struct hpet_data hd;
-
- nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
+ unsigned int i;
memset(&hd, 0, sizeof(hd));
hd.hd_phys_address = hpet_address;
- hd.hd_address = hpet;
- hd.hd_nirqs = nrtimers;
- hpet_reserve_timer(&hd, 0);
-
-#ifdef CONFIG_HPET_EMULATE_RTC
- hpet_reserve_timer(&hd, 1);
-#endif
+ hd.hd_address = hpet_virt_address;
+ hd.hd_nirqs = hpet_base.nr_channels;
/*
* NOTE that hd_irq[] reflects IOAPIC input pins (LEGACY_8254
@@ -194,46 +205,52 @@ static void hpet_reserve_platform_timers(unsigned long id)
hd.hd_irq[0] = HPET_LEGACY_8254;
hd.hd_irq[1] = HPET_LEGACY_RTC;
- for (i = 2; i < nrtimers; timer++, i++) {
- hd.hd_irq[i] = (readl(&timer->hpet_config) &
- Tn_INT_ROUTE_CNF_MASK) >> Tn_INT_ROUTE_CNF_SHIFT;
- }
+ for (i = 0; i < hpet_base.nr_channels; i++) {
+ struct hpet_channel *hc = hpet_base.channels + i;
- hpet_reserve_msi_timers(&hd);
+ if (i >= 2)
+ hd.hd_irq[i] = hc->irq;
- hpet_alloc(&hd);
+ switch (hc->mode) {
+ case HPET_MODE_UNUSED:
+ case HPET_MODE_DEVICE:
+ hc->mode = HPET_MODE_DEVICE;
+ break;
+ case HPET_MODE_CLOCKEVT:
+ case HPET_MODE_LEGACY:
+ hpet_reserve_timer(&hd, hc->num);
+ break;
+ }
+ }
+ hpet_alloc(&hd);
}
-#else
-static void hpet_reserve_platform_timers(unsigned long id) { }
-#endif
-/*
- * Common hpet info
- */
-static unsigned long hpet_period;
+static void __init hpet_select_device_channel(void)
+{
+ int i;
-static void hpet_legacy_set_mode(enum clock_event_mode mode,
- struct clock_event_device *evt);
-static int hpet_legacy_next_event(unsigned long delta,
- struct clock_event_device *evt);
+ for (i = 0; i < hpet_base.nr_channels; i++) {
+ struct hpet_channel *hc = hpet_base.channels + i;
-/*
- * The hpet clock event device
- */
-static struct clock_event_device hpet_clockevent = {
- .name = "hpet",
- .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
- .set_mode = hpet_legacy_set_mode,
- .set_next_event = hpet_legacy_next_event,
- .shift = 32,
- .irq = 0,
- .rating = 50,
-};
+ /* Associate the first unused channel to /dev/hpet */
+ if (hc->mode == HPET_MODE_UNUSED) {
+ hc->mode = HPET_MODE_DEVICE;
+ return;
+ }
+ }
+}
+
+#else
+static inline void hpet_reserve_platform_timers(void) { }
+static inline void hpet_select_device_channel(void) {}
+#endif
+/* Common HPET functions */
static void hpet_stop_counter(void)
{
- unsigned long cfg = hpet_readl(HPET_CFG);
+ u32 cfg = hpet_readl(HPET_CFG);
+
cfg &= ~HPET_CFG_ENABLE;
hpet_writel(cfg, HPET_CFG);
}
@@ -246,7 +263,8 @@ static void hpet_reset_counter(void)
static void hpet_start_counter(void)
{
- unsigned long cfg = hpet_readl(HPET_CFG);
+ unsigned int cfg = hpet_readl(HPET_CFG);
+
cfg |= HPET_CFG_ENABLE;
hpet_writel(cfg, HPET_CFG);
}
@@ -263,7 +281,7 @@ static void hpet_resume_device(void)
force_hpet_resume();
}
-static void hpet_resume_counter(void)
+static void hpet_resume_counter(struct clocksource *cs)
{
hpet_resume_device();
hpet_restart_counter();
@@ -271,483 +289,563 @@ static void hpet_resume_counter(void)
static void hpet_enable_legacy_int(void)
{
- unsigned long cfg = hpet_readl(HPET_CFG);
+ unsigned int cfg = hpet_readl(HPET_CFG);
cfg |= HPET_CFG_LEGACY;
hpet_writel(cfg, HPET_CFG);
- hpet_legacy_int_enabled = 1;
+ hpet_legacy_int_enabled = true;
}
-static void hpet_legacy_clockevent_register(void)
+static int hpet_clkevt_set_state_periodic(struct clock_event_device *evt)
{
- /* Start HPET legacy interrupts */
- hpet_enable_legacy_int();
+ unsigned int channel = clockevent_to_channel(evt)->num;
+ unsigned int cfg, cmp, now;
+ uint64_t delta;
+ hpet_stop_counter();
+ delta = ((uint64_t)(NSEC_PER_SEC / HZ)) * evt->mult;
+ delta >>= evt->shift;
+ now = hpet_readl(HPET_COUNTER);
+ cmp = now + (unsigned int)delta;
+ cfg = hpet_readl(HPET_Tn_CFG(channel));
+ cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL |
+ HPET_TN_32BIT;
+ hpet_writel(cfg, HPET_Tn_CFG(channel));
+ hpet_writel(cmp, HPET_Tn_CMP(channel));
+ udelay(1);
/*
- * The mult factor is defined as (include/linux/clockchips.h)
- * mult/2^shift = cyc/ns (in contrast to ns/cyc in clocksource.h)
- * hpet_period is in units of femtoseconds (per cycle), so
- * mult/2^shift = cyc/ns = 10^6/hpet_period
- * mult = (10^6 * 2^shift)/hpet_period
- * mult = (FSEC_PER_NSEC << hpet_clockevent.shift)/hpet_period
+ * HPET on AMD 81xx needs a second write (with HPET_TN_SETVAL
+ * cleared) to T0_CMP to set the period. The HPET_TN_SETVAL
+ * bit is automatically cleared after the first write.
+ * (See AMD-8111 HyperTransport I/O Hub Data Sheet,
+ * Publication # 24674)
*/
- hpet_clockevent.mult = div_sc((unsigned long) FSEC_PER_NSEC,
- hpet_period, hpet_clockevent.shift);
- /* Calculate the min / max delta */
- hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
- &hpet_clockevent);
- /* 5 usec minimum reprogramming delta. */
- hpet_clockevent.min_delta_ns = 5000;
+ hpet_writel((unsigned int)delta, HPET_Tn_CMP(channel));
+ hpet_start_counter();
+ hpet_print_config();
- /*
- * Start hpet with the boot cpu mask and make it
- * global after the IO_APIC has been initialized.
- */
- hpet_clockevent.cpumask = cpumask_of(smp_processor_id());
- clockevents_register_device(&hpet_clockevent);
- global_clock_event = &hpet_clockevent;
- printk(KERN_DEBUG "hpet clockevent registered\n");
+ return 0;
}
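The delta computation above is the standard clockevents nanoseconds-to-cycles conversion: mult/2^shift approximates HPET cycles per nanosecond. A small sketch, with example numbers rather than real calibration data:

#include <stdint.h>

/* ticks = ns * mult >> shift; e.g. for a 14.318 MHz HPET and HZ=250 the
 * 4 ms tick period comes out to roughly 57273 HPET cycles. */
static uint32_t ns_to_hpet_cycles(uint64_t ns, uint32_t mult, uint32_t shift)
{
	return (uint32_t)((ns * mult) >> shift);
}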
-static int hpet_setup_msi_irq(unsigned int irq);
+static int hpet_clkevt_set_state_oneshot(struct clock_event_device *evt)
+{
+ unsigned int channel = clockevent_to_channel(evt)->num;
+ unsigned int cfg;
+
+ cfg = hpet_readl(HPET_Tn_CFG(channel));
+ cfg &= ~HPET_TN_PERIODIC;
+ cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
+ hpet_writel(cfg, HPET_Tn_CFG(channel));
+
+ return 0;
+}
-static void hpet_set_mode(enum clock_event_mode mode,
- struct clock_event_device *evt, int timer)
+static int hpet_clkevt_set_state_shutdown(struct clock_event_device *evt)
{
- unsigned long cfg, cmp, now;
- uint64_t delta;
+ unsigned int channel = clockevent_to_channel(evt)->num;
+ unsigned int cfg;
- switch (mode) {
- case CLOCK_EVT_MODE_PERIODIC:
- hpet_stop_counter();
- delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult;
- delta >>= evt->shift;
- now = hpet_readl(HPET_COUNTER);
- cmp = now + (unsigned long) delta;
- cfg = hpet_readl(HPET_Tn_CFG(timer));
- /* Make sure we use edge triggered interrupts */
- cfg &= ~HPET_TN_LEVEL;
- cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
- HPET_TN_SETVAL | HPET_TN_32BIT;
- hpet_writel(cfg, HPET_Tn_CFG(timer));
- hpet_writel(cmp, HPET_Tn_CMP(timer));
- udelay(1);
- /*
- * HPET on AMD 81xx needs a second write (with HPET_TN_SETVAL
- * cleared) to T0_CMP to set the period. The HPET_TN_SETVAL
- * bit is automatically cleared after the first write.
- * (See AMD-8111 HyperTransport I/O Hub Data Sheet,
- * Publication # 24674)
- */
- hpet_writel((unsigned long) delta, HPET_Tn_CMP(timer));
- hpet_start_counter();
- hpet_print_config();
- break;
-
- case CLOCK_EVT_MODE_ONESHOT:
- cfg = hpet_readl(HPET_Tn_CFG(timer));
- cfg &= ~HPET_TN_PERIODIC;
- cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
- hpet_writel(cfg, HPET_Tn_CFG(timer));
- break;
-
- case CLOCK_EVT_MODE_UNUSED:
- case CLOCK_EVT_MODE_SHUTDOWN:
- cfg = hpet_readl(HPET_Tn_CFG(timer));
- cfg &= ~HPET_TN_ENABLE;
- hpet_writel(cfg, HPET_Tn_CFG(timer));
- break;
-
- case CLOCK_EVT_MODE_RESUME:
- if (timer == 0) {
- hpet_enable_legacy_int();
- } else {
- struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
- hpet_setup_msi_irq(hdev->irq);
- disable_irq(hdev->irq);
- irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu));
- enable_irq(hdev->irq);
- }
- hpet_print_config();
- break;
- }
+ cfg = hpet_readl(HPET_Tn_CFG(channel));
+ cfg &= ~HPET_TN_ENABLE;
+ hpet_writel(cfg, HPET_Tn_CFG(channel));
+
+ return 0;
+}
+
+static int hpet_clkevt_legacy_resume(struct clock_event_device *evt)
+{
+ hpet_enable_legacy_int();
+ hpet_print_config();
+ return 0;
}
-static int hpet_next_event(unsigned long delta,
- struct clock_event_device *evt, int timer)
+static int
+hpet_clkevt_set_next_event(unsigned long delta, struct clock_event_device *evt)
{
+ unsigned int channel = clockevent_to_channel(evt)->num;
u32 cnt;
+ s32 res;
cnt = hpet_readl(HPET_COUNTER);
cnt += (u32) delta;
- hpet_writel(cnt, HPET_Tn_CMP(timer));
+ hpet_writel(cnt, HPET_Tn_CMP(channel));
/*
- * We need to read back the CMP register to make sure that
- * what we wrote hit the chip before we compare it to the
- * counter.
+ * HPETs are a complete disaster. The compare register is
+	 * based on an equal comparison and neither provides a less
+ * than or equal functionality (which would require to take
+ * the wraparound into account) nor a simple count down event
+ * mode. Further the write to the comparator register is
+ * delayed internally up to two HPET clock cycles in certain
+ * chipsets (ATI, ICH9,10). Some newer AMD chipsets have even
+ * longer delays. We worked around that by reading back the
+ * compare register, but that required another workaround for
+ * ICH9,10 chips where the first readout after write can
+ * return the old stale value. We already had a minimum
+	 * programming delta of 5us enforced, but an NMI or SMI hitting
+ * between the counter readout and the comparator write can
+ * move us behind that point easily. Now instead of reading
+ * the compare register back several times, we make the ETIME
+ * decision based on the following: Return ETIME if the
+ * counter value after the write is less than HPET_MIN_CYCLES
+ * away from the event or if the counter is already ahead of
+ * the event. The minimum programming delta for the generic
+ * clockevents code is set to 1.5 * HPET_MIN_CYCLES.
*/
- WARN_ON_ONCE((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt);
+ res = (s32)(cnt - hpet_readl(HPET_COUNTER));
- return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
+ return res < HPET_MIN_CYCLES ? -ETIME : 0;
}
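The -ETIME decision above leans on signed 32-bit wraparound: after writing the comparator, (s32)(cmp - counter) is negative when the event has already been missed and smaller than HPET_MIN_CYCLES when it is too close to be guaranteed to fire. A self-contained sketch of the same test (illustrative only):

#include <stdint.h>

#define HPET_MIN_CYCLES	128

static int event_would_be_missed(uint32_t cmp, uint32_t counter)
{
	return (int32_t)(cmp - counter) < HPET_MIN_CYCLES;
}

/* Example: cmp = 0x00000010 just after the counter wrapped to 0xfffffff0
 * still yields +0x20 (32 cycles), which is below 128, so report -ETIME. */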
-static void hpet_legacy_set_mode(enum clock_event_mode mode,
- struct clock_event_device *evt)
+static void hpet_init_clockevent(struct hpet_channel *hc, unsigned int rating)
{
- hpet_set_mode(mode, evt, 0);
+ struct clock_event_device *evt = &hc->evt;
+
+ evt->rating = rating;
+ evt->irq = hc->irq;
+ evt->name = hc->name;
+ evt->cpumask = cpumask_of(hc->cpu);
+ evt->set_state_oneshot = hpet_clkevt_set_state_oneshot;
+ evt->set_next_event = hpet_clkevt_set_next_event;
+ evt->set_state_shutdown = hpet_clkevt_set_state_shutdown;
+
+ evt->features = CLOCK_EVT_FEAT_ONESHOT;
+ if (hc->boot_cfg & HPET_TN_PERIODIC) {
+ evt->features |= CLOCK_EVT_FEAT_PERIODIC;
+ evt->set_state_periodic = hpet_clkevt_set_state_periodic;
+ }
}
-static int hpet_legacy_next_event(unsigned long delta,
- struct clock_event_device *evt)
+static void __init hpet_legacy_clockevent_register(struct hpet_channel *hc)
{
- return hpet_next_event(delta, evt, 0);
-}
+ /*
+ * Start HPET with the boot CPU's cpumask and make it global after
+ * the IO_APIC has been initialized.
+ */
+ hc->cpu = boot_cpu_data.cpu_index;
+ strscpy(hc->name, "hpet", sizeof(hc->name));
+ hpet_init_clockevent(hc, 50);
-/*
- * HPET MSI Support
- */
-#ifdef CONFIG_PCI_MSI
+ hc->evt.tick_resume = hpet_clkevt_legacy_resume;
-static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev);
-static struct hpet_dev *hpet_devs;
+ /*
+ * Legacy horrors and sins from the past. HPET used periodic mode
+ * unconditionally forever on the legacy channel 0. Removing the
+ * below hack and using the conditional in hpet_init_clockevent()
+ * makes at least Qemu and one hardware machine fail to boot.
+ * There are two issues which cause the boot failure:
+ *
+ * #1 After the timer delivery test in IOAPIC and the IOAPIC setup
+ * the next interrupt is not delivered despite the HPET channel
+ * being programmed correctly. Reprogramming the HPET after
+ * switching to IOAPIC makes it work again. After fixing this,
+ * the next issue surfaces:
+ *
+ * #2 Due to the unconditional periodic mode availability the Local
+ * APIC timer calibration can hijack the global clockevents
+ * event handler without causing damage. Using oneshot at this
+	 * stage makes it hang because the HPET does not get
+ * reprogrammed due to the handler hijacking. Duh, stupid me!
+ *
+	 * Both issues require major surgery, and especially kicking the
+	 * HPET again after enabling the IOAPIC results in really nasty
+	 * hackery.
+ * This 'assume periodic works' magic has survived since HPET
+ * support got added, so it's questionable whether this should be
+ * fixed. Both Qemu and the failing hardware machine support
+ * periodic mode despite the fact that both don't advertise it in
+ * the configuration register and both need that extra kick after
+ * switching to IOAPIC. Seems to be a feature...
+ */
+ hc->evt.features |= CLOCK_EVT_FEAT_PERIODIC;
+ hc->evt.set_state_periodic = hpet_clkevt_set_state_periodic;
-void hpet_msi_unmask(unsigned int irq)
-{
- struct hpet_dev *hdev = get_irq_data(irq);
- unsigned long cfg;
+ /* Start HPET legacy interrupts */
+ hpet_enable_legacy_int();
- /* unmask it */
- cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
- cfg |= HPET_TN_FSB;
- hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
+ clockevents_config_and_register(&hc->evt, hpet_freq,
+ HPET_MIN_PROG_DELTA, 0x7FFFFFFF);
+ global_clock_event = &hc->evt;
+ pr_debug("Clockevent registered\n");
}
-void hpet_msi_mask(unsigned int irq)
+/*
+ * HPET MSI Support
+ */
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_GENERIC_MSI_IRQ)
+static void hpet_msi_unmask(struct irq_data *data)
{
- unsigned long cfg;
- struct hpet_dev *hdev = get_irq_data(irq);
+ struct hpet_channel *hc = irq_data_get_irq_handler_data(data);
+ unsigned int cfg;
- /* mask it */
- cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
- cfg &= ~HPET_TN_FSB;
- hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
+ cfg = hpet_readl(HPET_Tn_CFG(hc->num));
+ cfg |= HPET_TN_ENABLE | HPET_TN_FSB;
+ hpet_writel(cfg, HPET_Tn_CFG(hc->num));
}
-void hpet_msi_write(unsigned int irq, struct msi_msg *msg)
+static void hpet_msi_mask(struct irq_data *data)
{
- struct hpet_dev *hdev = get_irq_data(irq);
+ struct hpet_channel *hc = irq_data_get_irq_handler_data(data);
+ unsigned int cfg;
- hpet_writel(msg->data, HPET_Tn_ROUTE(hdev->num));
- hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hdev->num) + 4);
+ cfg = hpet_readl(HPET_Tn_CFG(hc->num));
+ cfg &= ~(HPET_TN_ENABLE | HPET_TN_FSB);
+ hpet_writel(cfg, HPET_Tn_CFG(hc->num));
}
-void hpet_msi_read(unsigned int irq, struct msi_msg *msg)
+static void hpet_msi_write(struct hpet_channel *hc, struct msi_msg *msg)
{
- struct hpet_dev *hdev = get_irq_data(irq);
-
- msg->data = hpet_readl(HPET_Tn_ROUTE(hdev->num));
- msg->address_lo = hpet_readl(HPET_Tn_ROUTE(hdev->num) + 4);
- msg->address_hi = 0;
+ hpet_writel(msg->data, HPET_Tn_ROUTE(hc->num));
+ hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hc->num) + 4);
}
-static void hpet_msi_set_mode(enum clock_event_mode mode,
- struct clock_event_device *evt)
+static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
{
- struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
- hpet_set_mode(mode, evt, hdev->num);
+ hpet_msi_write(irq_data_get_irq_handler_data(data), msg);
}
-static int hpet_msi_next_event(unsigned long delta,
- struct clock_event_device *evt)
+static struct irq_chip hpet_msi_controller __ro_after_init = {
+ .name = "HPET-MSI",
+ .irq_unmask = hpet_msi_unmask,
+ .irq_mask = hpet_msi_mask,
+ .irq_ack = irq_chip_ack_parent,
+ .irq_set_affinity = msi_domain_set_affinity,
+ .irq_retrigger = irq_chip_retrigger_hierarchy,
+ .irq_write_msi_msg = hpet_msi_write_msg,
+ .flags = IRQCHIP_SKIP_SET_WAKE | IRQCHIP_AFFINITY_PRE_STARTUP,
+};
+
+static int hpet_msi_init(struct irq_domain *domain,
+ struct msi_domain_info *info, unsigned int virq,
+ irq_hw_number_t hwirq, msi_alloc_info_t *arg)
{
- struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
- return hpet_next_event(delta, evt, hdev->num);
+ irq_domain_set_info(domain, virq, arg->hwirq, info->chip, NULL,
+ handle_edge_irq, arg->data, "edge");
+
+ return 0;
}
-static int hpet_setup_msi_irq(unsigned int irq)
+static struct msi_domain_ops hpet_msi_domain_ops = {
+ .msi_init = hpet_msi_init,
+};
+
+static struct msi_domain_info hpet_msi_domain_info = {
+ .ops = &hpet_msi_domain_ops,
+ .chip = &hpet_msi_controller,
+ .flags = MSI_FLAG_USE_DEF_DOM_OPS,
+};
+
+static struct irq_domain *hpet_create_irq_domain(int hpet_id)
{
- if (arch_setup_hpet_msi(irq)) {
- destroy_irq(irq);
- return -EINVAL;
+ struct msi_domain_info *domain_info;
+ struct irq_domain *parent, *d;
+ struct fwnode_handle *fn;
+ struct irq_fwspec fwspec;
+
+ if (x86_vector_domain == NULL)
+ return NULL;
+
+ domain_info = kzalloc(sizeof(*domain_info), GFP_KERNEL);
+ if (!domain_info)
+ return NULL;
+
+ *domain_info = hpet_msi_domain_info;
+ domain_info->data = (void *)(long)hpet_id;
+
+ fn = irq_domain_alloc_named_id_fwnode(hpet_msi_controller.name,
+ hpet_id);
+ if (!fn) {
+ kfree(domain_info);
+ return NULL;
}
- return 0;
+
+ fwspec.fwnode = fn;
+ fwspec.param_count = 1;
+ fwspec.param[0] = hpet_id;
+
+ parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_GENERIC_MSI);
+ if (!parent) {
+ irq_domain_free_fwnode(fn);
+ kfree(domain_info);
+ return NULL;
+ }
+ if (parent != x86_vector_domain)
+ hpet_msi_controller.name = "IR-HPET-MSI";
+
+ d = msi_create_irq_domain(fn, domain_info, parent);
+ if (!d) {
+ irq_domain_free_fwnode(fn);
+ kfree(domain_info);
+ }
+ return d;
}
-static int hpet_assign_irq(struct hpet_dev *dev)
+static inline int hpet_dev_id(struct irq_domain *domain)
{
- unsigned int irq;
+ struct msi_domain_info *info = msi_get_domain_info(domain);
- irq = create_irq();
- if (!irq)
- return -EINVAL;
+ return (int)(long)info->data;
+}
- set_irq_data(irq, dev);
+static int hpet_assign_irq(struct irq_domain *domain, struct hpet_channel *hc,
+ int dev_num)
+{
+ struct irq_alloc_info info;
- if (hpet_setup_msi_irq(irq))
- return -EINVAL;
+ init_irq_alloc_info(&info, NULL);
+ info.type = X86_IRQ_ALLOC_TYPE_HPET;
+ info.data = hc;
+ info.devid = hpet_dev_id(domain);
+ info.hwirq = dev_num;
- dev->irq = irq;
+ return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info);
+}
+
+static int hpet_clkevt_msi_resume(struct clock_event_device *evt)
+{
+ struct hpet_channel *hc = clockevent_to_channel(evt);
+ struct irq_data *data = irq_get_irq_data(hc->irq);
+ struct msi_msg msg;
+
+ /* Restore the MSI msg and unmask the interrupt */
+ irq_chip_compose_msi_msg(data, &msg);
+ hpet_msi_write(hc, &msg);
+ hpet_msi_unmask(data);
return 0;
}
-static irqreturn_t hpet_interrupt_handler(int irq, void *data)
+static irqreturn_t hpet_msi_interrupt_handler(int irq, void *data)
{
- struct hpet_dev *dev = (struct hpet_dev *)data;
- struct clock_event_device *hevt = &dev->evt;
+ struct hpet_channel *hc = data;
+ struct clock_event_device *evt = &hc->evt;
- if (!hevt->event_handler) {
- printk(KERN_INFO "Spurious HPET timer interrupt on HPET timer %d\n",
- dev->num);
+ if (!evt->event_handler) {
+ pr_info("Spurious interrupt HPET channel %d\n", hc->num);
return IRQ_HANDLED;
}
- hevt->event_handler(hevt);
+ evt->event_handler(evt);
return IRQ_HANDLED;
}
-static int hpet_setup_irq(struct hpet_dev *dev)
+static int hpet_setup_msi_irq(struct hpet_channel *hc)
{
-
- if (request_irq(dev->irq, hpet_interrupt_handler,
- IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
- dev->name, dev))
+ if (request_irq(hc->irq, hpet_msi_interrupt_handler,
+ IRQF_TIMER | IRQF_NOBALANCING,
+ hc->name, hc))
return -1;
- disable_irq(dev->irq);
- irq_set_affinity(dev->irq, cpumask_of(dev->cpu));
- enable_irq(dev->irq);
+ disable_irq(hc->irq);
+ irq_set_affinity(hc->irq, cpumask_of(hc->cpu));
+ enable_irq(hc->irq);
- printk(KERN_DEBUG "hpet: %s irq %d for MSI\n",
- dev->name, dev->irq);
+ pr_debug("%s irq %u for MSI\n", hc->name, hc->irq);
return 0;
}
-/* This should be called in specific @cpu */
-static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
+/* Invoked from the hotplug callback on @cpu */
+static void init_one_hpet_msi_clockevent(struct hpet_channel *hc, int cpu)
{
- struct clock_event_device *evt = &hdev->evt;
- uint64_t hpet_freq;
+ struct clock_event_device *evt = &hc->evt;
- WARN_ON(cpu != smp_processor_id());
- if (!(hdev->flags & HPET_DEV_VALID))
- return;
+ hc->cpu = cpu;
+ per_cpu(cpu_hpet_channel, cpu) = hc;
+ hpet_setup_msi_irq(hc);
- if (hpet_setup_msi_irq(hdev->irq))
- return;
+ hpet_init_clockevent(hc, 110);
+ evt->tick_resume = hpet_clkevt_msi_resume;
- hdev->cpu = cpu;
- per_cpu(cpu_hpet_dev, cpu) = hdev;
- evt->name = hdev->name;
- hpet_setup_irq(hdev);
- evt->irq = hdev->irq;
+ clockevents_config_and_register(evt, hpet_freq, HPET_MIN_PROG_DELTA,
+ 0x7FFFFFFF);
+}
- evt->rating = 110;
- evt->features = CLOCK_EVT_FEAT_ONESHOT;
- if (hdev->flags & HPET_DEV_PERI_CAP)
- evt->features |= CLOCK_EVT_FEAT_PERIODIC;
+static struct hpet_channel *hpet_get_unused_clockevent(void)
+{
+ int i;
- evt->set_mode = hpet_msi_set_mode;
- evt->set_next_event = hpet_msi_next_event;
- evt->shift = 32;
+ for (i = 0; i < hpet_base.nr_channels; i++) {
+ struct hpet_channel *hc = hpet_base.channels + i;
- /*
- * The period is a femto seconds value. We need to calculate the
- * scaled math multiplication factor for nanosecond to hpet tick
- * conversion.
- */
- hpet_freq = 1000000000000000ULL;
- do_div(hpet_freq, hpet_period);
- evt->mult = div_sc((unsigned long) hpet_freq,
- NSEC_PER_SEC, evt->shift);
- /* Calculate the max delta */
- evt->max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, evt);
- /* 5 usec minimum reprogramming delta. */
- evt->min_delta_ns = 5000;
-
- evt->cpumask = cpumask_of(hdev->cpu);
- clockevents_register_device(evt);
+ if (hc->mode != HPET_MODE_CLOCKEVT || hc->in_use)
+ continue;
+ hc->in_use = 1;
+ return hc;
+ }
+ return NULL;
}
-#ifdef CONFIG_HPET
-/* Reserve at least one timer for userspace (/dev/hpet) */
-#define RESERVE_TIMERS 1
-#else
-#define RESERVE_TIMERS 0
-#endif
+static int hpet_cpuhp_online(unsigned int cpu)
+{
+ struct hpet_channel *hc = hpet_get_unused_clockevent();
-static void hpet_msi_capability_lookup(unsigned int start_timer)
+ if (hc)
+ init_one_hpet_msi_clockevent(hc, cpu);
+ return 0;
+}
+
+static int hpet_cpuhp_dead(unsigned int cpu)
{
- unsigned int id;
- unsigned int num_timers;
- unsigned int num_timers_used = 0;
- int i;
+ struct hpet_channel *hc = per_cpu(cpu_hpet_channel, cpu);
- id = hpet_readl(HPET_ID);
+ if (!hc)
+ return 0;
+ free_irq(hc->irq, hc);
+ hc->in_use = 0;
+ per_cpu(cpu_hpet_channel, cpu) = NULL;
+ return 0;
+}
+
+static void __init hpet_select_clockevents(void)
+{
+ unsigned int i;
+
+ hpet_base.nr_clockevents = 0;
+
+ /* No point if MSI is disabled or CPU has an Always Running APIC Timer */
+ if (hpet_msi_disable || boot_cpu_has(X86_FEATURE_ARAT))
+ return;
- num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT);
- num_timers++; /* Value read out starts from 0 */
hpet_print_config();
- hpet_devs = kzalloc(sizeof(struct hpet_dev) * num_timers, GFP_KERNEL);
- if (!hpet_devs)
+ hpet_domain = hpet_create_irq_domain(hpet_blockid);
+ if (!hpet_domain)
return;
- hpet_num_timers = num_timers;
+ for (i = 0; i < hpet_base.nr_channels; i++) {
+ struct hpet_channel *hc = hpet_base.channels + i;
+ int irq;
- for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) {
- struct hpet_dev *hdev = &hpet_devs[num_timers_used];
- unsigned long cfg = hpet_readl(HPET_Tn_CFG(i));
+ if (hc->mode != HPET_MODE_UNUSED)
+ continue;
- /* Only consider HPET timer with MSI support */
- if (!(cfg & HPET_TN_FSB_CAP))
+ /* Only consider HPET channel with MSI support */
+ if (!(hc->boot_cfg & HPET_TN_FSB_CAP))
continue;
- hdev->flags = 0;
- if (cfg & HPET_TN_PERIODIC_CAP)
- hdev->flags |= HPET_DEV_PERI_CAP;
- hdev->num = i;
+ sprintf(hc->name, "hpet%d", i);
- sprintf(hdev->name, "hpet%d", i);
- if (hpet_assign_irq(hdev))
+ irq = hpet_assign_irq(hpet_domain, hc, hc->num);
+ if (irq <= 0)
continue;
- hdev->flags |= HPET_DEV_FSB_CAP;
- hdev->flags |= HPET_DEV_VALID;
- num_timers_used++;
- if (num_timers_used == num_possible_cpus())
+ hc->irq = irq;
+ hc->mode = HPET_MODE_CLOCKEVT;
+
+ if (++hpet_base.nr_clockevents == num_possible_cpus())
break;
}
- printk(KERN_INFO "HPET: %d timers in total, %d timers will be used for per-cpu timer\n",
- num_timers, num_timers_used);
+ pr_info("%d channels of %d reserved for per-cpu timers\n",
+ hpet_base.nr_channels, hpet_base.nr_clockevents);
}
-#ifdef CONFIG_HPET
-static void hpet_reserve_msi_timers(struct hpet_data *hd)
-{
- int i;
+#else
- if (!hpet_devs)
- return;
+static inline void hpet_select_clockevents(void) { }
- for (i = 0; i < hpet_num_timers; i++) {
- struct hpet_dev *hdev = &hpet_devs[i];
+#define hpet_cpuhp_online NULL
+#define hpet_cpuhp_dead NULL
- if (!(hdev->flags & HPET_DEV_VALID))
- continue;
-
- hd->hd_irq[hdev->num] = hdev->irq;
- hpet_reserve_timer(hd, hdev->num);
- }
-}
#endif
-static struct hpet_dev *hpet_get_unused_timer(void)
-{
- int i;
-
- if (!hpet_devs)
- return NULL;
-
- for (i = 0; i < hpet_num_timers; i++) {
- struct hpet_dev *hdev = &hpet_devs[i];
-
- if (!(hdev->flags & HPET_DEV_VALID))
- continue;
- if (test_and_set_bit(HPET_DEV_USED_BIT,
- (unsigned long *)&hdev->flags))
- continue;
- return hdev;
- }
- return NULL;
-}
+/*
+ * Clock source related code
+ */
+#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
+/*
+ * Reading the HPET counter is a very slow operation. If a large number of
+ * CPUs are trying to access the HPET counter simultaneously, it can cause
+ * massive delays and slow down system performance dramatically. This may
+ * happen when HPET is the default clock source instead of TSC. For a
+ * really large system with hundreds of CPUs, the slowdown may be so
+ * severe, that it can actually crash the system because of a NMI watchdog
+ * severe that it can actually crash the system because of an NMI watchdog
+ *
+ * If multiple CPUs are trying to access the HPET counter at the same time,
+ * we don't actually need to read the counter multiple times. Instead, the
+ * other CPUs can use the counter value read by the first CPU in the group.
+ *
+ * This special feature is only enabled on x86-64 systems. It is unlikely
+ * that 32-bit x86 systems will have enough CPUs to require this feature
+ * with its associated locking overhead. We also need a 64-bit atomic read.
+ *
+ * The lock and the HPET value are stored together and can be read in a
+ * single atomic 64-bit read. It is explicitly assumed that arch_spinlock_t
+ * is 32 bits in size.
+ */
+union hpet_lock {
+ struct {
+ arch_spinlock_t lock;
+ u32 value;
+ };
+ u64 lockval;
+};
-struct hpet_work_struct {
- struct delayed_work work;
- struct completion complete;
+static union hpet_lock hpet __cacheline_aligned = {
+ { .lock = __ARCH_SPIN_LOCK_UNLOCKED, },
};
-static void hpet_work(struct work_struct *w)
+static u64 read_hpet(struct clocksource *cs)
{
- struct hpet_dev *hdev;
- int cpu = smp_processor_id();
- struct hpet_work_struct *hpet_work;
+ unsigned long flags;
+ union hpet_lock old, new;
- hpet_work = container_of(w, struct hpet_work_struct, work.work);
+ BUILD_BUG_ON(sizeof(union hpet_lock) != 8);
- hdev = hpet_get_unused_timer();
- if (hdev)
- init_one_hpet_msi_clockevent(hdev, cpu);
+ /*
+ * Read HPET directly if in NMI.
+ */
+ if (in_nmi())
+ return (u64)hpet_readl(HPET_COUNTER);
- complete(&hpet_work->complete);
-}
+ /*
+ * Read the current state of the lock and HPET value atomically.
+ */
+ old.lockval = READ_ONCE(hpet.lockval);
-static int hpet_cpuhp_notify(struct notifier_block *n,
- unsigned long action, void *hcpu)
-{
- unsigned long cpu = (unsigned long)hcpu;
- struct hpet_work_struct work;
- struct hpet_dev *hdev = per_cpu(cpu_hpet_dev, cpu);
-
- switch (action & 0xf) {
- case CPU_ONLINE:
- INIT_DELAYED_WORK_ON_STACK(&work.work, hpet_work);
- init_completion(&work.complete);
- /* FIXME: add schedule_work_on() */
- schedule_delayed_work_on(cpu, &work.work, 0);
- wait_for_completion(&work.complete);
- destroy_timer_on_stack(&work.work.timer);
- break;
- case CPU_DEAD:
- if (hdev) {
- free_irq(hdev->irq, hdev);
- hdev->flags &= ~HPET_DEV_USED;
- per_cpu(cpu_hpet_dev, cpu) = NULL;
- }
- break;
- }
- return NOTIFY_OK;
-}
-#else
+ if (arch_spin_is_locked(&old.lock))
+ goto contended;
-static int hpet_setup_msi_irq(unsigned int irq)
-{
- return 0;
-}
-static void hpet_msi_capability_lookup(unsigned int start_timer)
-{
- return;
-}
+ local_irq_save(flags);
+ if (arch_spin_trylock(&hpet.lock)) {
+ new.value = hpet_readl(HPET_COUNTER);
+ /*
+ * Use WRITE_ONCE() to prevent store tearing.
+ */
+ WRITE_ONCE(hpet.value, new.value);
+ arch_spin_unlock(&hpet.lock);
+ local_irq_restore(flags);
+ return (u64)new.value;
+ }
+ local_irq_restore(flags);
-#ifdef CONFIG_HPET
-static void hpet_reserve_msi_timers(struct hpet_data *hd)
-{
- return;
-}
-#endif
+contended:
+ /*
+ * Contended case
+ * --------------
+	 * Wait until the HPET value changes or the lock is released, which
+	 * indicates that its value is up to date.
+ *
+	 * It is possible that old.value already contains the latest
+ * HPET value while the lock holder was in the process of releasing
+ * the lock. Checking for lock state change will enable us to return
+ * the value immediately instead of waiting for the next HPET reader
+ * to come along.
+ */
+ do {
+ cpu_relax();
+ new.lockval = READ_ONCE(hpet.lockval);
+ } while ((new.value == old.value) && arch_spin_is_locked(&new.lock));
-static int hpet_cpuhp_notify(struct notifier_block *n,
- unsigned long action, void *hcpu)
-{
- return NOTIFY_OK;
+ return (u64)new.value;
}
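The union above packs the lock and the cached counter value into a single 64-bit word precisely so that one aligned load observes both consistently; a contended reader can then tell "lock still held, value stale" apart from "value just published" without ever taking the lock itself. A minimal standalone sketch of that layout (illustrative only):

#include <stdint.h>
#include <assert.h>

union cached_counter {
	struct {
		uint32_t lock;		/* stand-in for arch_spinlock_t */
		uint32_t value;		/* last value read from the hardware */
	};
	uint64_t lockval;		/* both fields in one 64-bit load */
};

static_assert(sizeof(union cached_counter) == 8,
	      "lock + value must fit in a single 64-bit load");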
-
-#endif
-
+#else
/*
- * Clock source related code
+ * For UP or 32-bit.
*/
-static cycle_t read_hpet(struct clocksource *cs)
-{
- return (cycle_t)hpet_readl(HPET_COUNTER);
-}
-
-#ifdef CONFIG_X86_64
-static cycle_t __vsyscall_fn vread_hpet(void)
+static u64 read_hpet(struct clocksource *cs)
{
- return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
+ return (u64)hpet_readl(HPET_COUNTER);
}
#endif
@@ -756,25 +854,46 @@ static struct clocksource clocksource_hpet = {
.rating = 250,
.read = read_hpet,
.mask = HPET_MASK,
- .shift = HPET_SHIFT,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
.resume = hpet_resume_counter,
-#ifdef CONFIG_X86_64
- .vread = vread_hpet,
-#endif
};
-static int hpet_clocksource_register(void)
+/*
+ * AMD SB700 based systems with spread spectrum enabled use a SMM based
+ * HPET emulation to provide proper frequency setting.
+ *
+ * On such systems the SMM code is initialized with the first HPET register
+ * access and takes some time to complete. During this time the config
+ * register reads 0xffffffff. We check up to 1000 times whether the
+ * config register reads a non-0xffffffff value to make sure that the
+ * HPET is up and running before we proceed any further.
+ *
+ * A counting loop is safe, as the HPET access takes thousands of CPU cycles.
+ *
+ * On non-SB700 based machines this check is only done once and has no
+ * side effects.
+ */
+static bool __init hpet_cfg_working(void)
{
- u64 start, now;
- cycle_t t1;
+ int i;
+
+ for (i = 0; i < 1000; i++) {
+ if (hpet_readl(HPET_CFG) != 0xFFFFFFFF)
+ return true;
+ }
+
+ pr_warn("Config register invalid. Disabling HPET\n");
+ return false;
+}
+
+static bool __init hpet_counting(void)
+{
+ u64 start, now, t1;
- /* Start the counter */
hpet_restart_counter();
- /* Verify whether hpet counter works */
t1 = hpet_readl(HPET_COUNTER);
- rdtscll(start);
+ start = rdtsc();
/*
* We don't know the TSC frequency yet, but waiting for
@@ -783,29 +902,87 @@ static int hpet_clocksource_register(void)
* 1 GHz == 200us
*/
do {
- rep_nop();
- rdtscll(now);
+ if (t1 != hpet_readl(HPET_COUNTER))
+ return true;
+ now = rdtsc();
} while ((now - start) < 200000UL);
- if (t1 == hpet_readl(HPET_COUNTER)) {
- printk(KERN_WARNING
- "HPET counter not counting. HPET disabled\n");
- return -ENODEV;
- }
+ pr_warn("Counter not counting. HPET disabled\n");
+ return false;
+}
- /*
- * The definition of mult is (include/linux/clocksource.h)
- * mult/2^shift = ns/cyc and hpet_period is in units of fsec/cyc
- * so we first need to convert hpet_period to ns/cyc units:
- * mult/2^shift = ns/cyc = hpet_period/10^6
- * mult = (hpet_period * 2^shift)/10^6
- * mult = (hpet_period << shift)/FSEC_PER_NSEC
- */
- clocksource_hpet.mult = div_sc(hpet_period, FSEC_PER_NSEC, HPET_SHIFT);
+static bool __init mwait_pc10_supported(void)
+{
+ unsigned int eax, ebx, ecx, mwait_substates;
- clocksource_register(&clocksource_hpet);
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return false;
- return 0;
+ if (!cpu_feature_enabled(X86_FEATURE_MWAIT))
+ return false;
+
+ cpuid(CPUID_LEAF_MWAIT, &eax, &ebx, &ecx, &mwait_substates);
+
+ return (ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) &&
+ (ecx & CPUID5_ECX_INTERRUPT_BREAK) &&
+ (mwait_substates & (0xF << 28));
+}
+
+/*
+ * Check whether the system supports PC10. If so force disable HPET as that
+ * stops counting in PC10. This check is overbroad as it does not take any
+ * of the following into account:
+ *
+ * - ACPI tables
+ * - Enablement of intel_idle
+ * - Command line arguments which limit intel_idle C-state support
+ *
+ * That's perfectly fine. HPET is a piece of hardware designed by committee
+ * and the only reason why it is still in use on modern systems is the
+ * fact that it is impossible to reliably query TSC and CPU frequency via
+ * CPUID or firmware.
+ *
+ * If HPET is functional it is useful for calibrating TSC, but this can be
+ * done via PMTIMER as well which seems to be the last remaining timer on
+ * X86/INTEL platforms that has not been completely wrecked by feature
+ * creep.
+ *
+ * In theory HPET support should be removed altogether, but there are older
+ * systems out there which depend on it because TSC and APIC timer are
+ * dysfunctional in deeper C-states.
+ *
+ * It's only 20 years now that hardware people have been asked to provide
+ * reliable and discoverable facilities which can be used for timekeeping
+ * and per CPU timer interrupts.
+ *
+ * The probability that this problem is going to be solved in the
+ * foreseeable future is close to zero, so the kernel has to be cluttered
+ * with heuristics to keep up with the ever growing amount of hardware and
+ * firmware trainwrecks. Hopefully some day hardware people will understand
+ * that the approach of "This can be fixed in software" is not sustainable.
+ * Hope dies last...
+ */
+static bool __init hpet_is_pc10_damaged(void)
+{
+ unsigned long long pcfg;
+
+ /* Check whether PC10 substates are supported */
+ if (!mwait_pc10_supported())
+ return false;
+
+ /* Check whether PC10 is enabled in PKG C-state limit */
+ rdmsrq(MSR_PKG_CST_CONFIG_CONTROL, pcfg);
+ if ((pcfg & 0xF) < 8)
+ return false;
+
+ if (hpet_force_user) {
+ pr_warn("HPET force enabled via command line, but dysfunctional in PC10.\n");
+ return false;
+ }
+
+ pr_info("HPET dysfunctional in PC10. Force disabled.\n");
+ boot_hpet_disable = true;
+ return true;
}
/**
@@ -813,44 +990,37 @@ static int hpet_clocksource_register(void)
*/
int __init hpet_enable(void)
{
- unsigned long id;
- int i;
+ u32 hpet_period, cfg, id, irq;
+ unsigned int i, channels;
+ struct hpet_channel *hc;
+ u64 freq;
if (!is_hpet_capable())
return 0;
+ if (hpet_is_pc10_damaged())
+ return 0;
+
hpet_set_mapping();
+ if (!hpet_virt_address)
+ return 0;
+
+ /* Validate that the config register is working */
+ if (!hpet_cfg_working())
+ goto out_nohpet;
/*
* Read the period and check for a sane value:
*/
hpet_period = hpet_readl(HPET_PERIOD);
-
- /*
- * AMD SB700 based systems with spread spectrum enabled use a
- * SMM based HPET emulation to provide proper frequency
- * setting. The SMM code is initialized with the first HPET
- * register access and takes some time to complete. During
- * this time the config register reads 0xffffffff. We check
- * for max. 1000 loops whether the config register reads a non
- * 0xffffffff value to make sure that HPET is up and running
- * before we go further. A counting loop is safe, as the HPET
- * access takes thousands of CPU cycles. On non SB700 based
- * machines this check is only done once and has no side
- * effects.
- */
- for (i = 0; hpet_readl(HPET_CFG) == 0xFFFFFFFF; i++) {
- if (i == 1000) {
- printk(KERN_WARNING
- "HPET config register value = 0xFFFFFFFF. "
- "Disabling HPET\n");
- goto out_nohpet;
- }
- }
-
if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD)
goto out_nohpet;
+ /* The period is a femtoseconds value. Convert it to a frequency. */
+ freq = FSEC_PER_SEC;
+ do_div(freq, hpet_period);
+ hpet_freq = freq;
+
/*
* Read the HPET ID register to retrieve the IRQ routing
* information and the number of channels
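The period-to-frequency conversion in the hunk above is easy to check by hand: HPET_PERIOD is femtoseconds per cycle, so the frequency is 10^15 divided by it. A worked example with an illustrative period value:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t fsec_per_sec = 1000000000000000ULL;	/* FSEC_PER_SEC */
	uint32_t hpet_period  = 69841279;		/* fs/cycle, a typical value */

	/* 10^15 / 69841279 ~= 14318179 Hz, i.e. the classic 14.318 MHz HPET */
	printf("%llu Hz\n", (unsigned long long)(fsec_per_sec / hpet_period));
	return 0;
}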
@@ -858,44 +1028,99 @@ int __init hpet_enable(void)
id = hpet_readl(HPET_ID);
hpet_print_config();
-#ifdef CONFIG_HPET_EMULATE_RTC
+	/* HPET_ID_NUMBER is the zero-based index of the last channel, so add 1 for the count */
+ channels = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
+
/*
* The legacy routing mode needs at least two channels, tick timer
* and the rtc emulation channel.
*/
- if (!(id & HPET_ID_NUMBER))
+ if (IS_ENABLED(CONFIG_HPET_EMULATE_RTC) && channels < 2)
goto out_nohpet;
-#endif
- if (hpet_clocksource_register())
+ hc = kcalloc(channels, sizeof(*hc), GFP_KERNEL);
+ if (!hc) {
+ pr_warn("Disabling HPET.\n");
goto out_nohpet;
+ }
+ hpet_base.channels = hc;
+ hpet_base.nr_channels = channels;
+
+ /* Read, store and sanitize the global configuration */
+ cfg = hpet_readl(HPET_CFG);
+ hpet_base.boot_cfg = cfg;
+ cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY);
+ hpet_writel(cfg, HPET_CFG);
+ if (cfg)
+ pr_warn("Global config: Unknown bits %#x\n", cfg);
+
+ /* Read, store and sanitize the per channel configuration */
+ for (i = 0; i < channels; i++, hc++) {
+ hc->num = i;
+
+ cfg = hpet_readl(HPET_Tn_CFG(i));
+ hc->boot_cfg = cfg;
+ irq = (cfg & Tn_INT_ROUTE_CNF_MASK) >> Tn_INT_ROUTE_CNF_SHIFT;
+ hc->irq = irq;
+
+ cfg &= ~(HPET_TN_ENABLE | HPET_TN_LEVEL | HPET_TN_FSB);
+ hpet_writel(cfg, HPET_Tn_CFG(i));
+
+ cfg &= ~(HPET_TN_PERIODIC | HPET_TN_PERIODIC_CAP
+ | HPET_TN_64BIT_CAP | HPET_TN_32BIT | HPET_TN_ROUTE
+ | HPET_TN_FSB | HPET_TN_FSB_CAP);
+ if (cfg)
+ pr_warn("Channel #%u config: Unknown bits %#x\n", i, cfg);
+ }
+ hpet_print_config();
+
+ /*
+ * Validate that the counter is counting. This needs to be done
+ * after sanitizing the config registers to properly deal with
+ * force enabled HPETs.
+ */
+ if (!hpet_counting())
+ goto out_nohpet;
+
+ if (tsc_clocksource_watchdog_disabled())
+ clocksource_hpet.flags |= CLOCK_SOURCE_MUST_VERIFY;
+ clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq);
if (id & HPET_ID_LEGSUP) {
- hpet_legacy_clockevent_register();
- hpet_msi_capability_lookup(2);
+ hpet_legacy_clockevent_register(&hpet_base.channels[0]);
+ hpet_base.channels[0].mode = HPET_MODE_LEGACY;
+ if (IS_ENABLED(CONFIG_HPET_EMULATE_RTC))
+ hpet_base.channels[1].mode = HPET_MODE_LEGACY;
return 1;
}
- hpet_msi_capability_lookup(0);
return 0;
out_nohpet:
+ kfree(hpet_base.channels);
+ hpet_base.channels = NULL;
+ hpet_base.nr_channels = 0;
hpet_clear_mapping();
hpet_address = 0;
return 0;
}
/*
- * Needs to be late, as the reserve_timer code calls kalloc !
+ * The late initialization runs after the PCI quirks have been invoked
+ * which might have detected a system on which the HPET can be enforced.
+ *
+ * Also, the MSI machinery is not working yet when the HPET is initialized
+ * early.
*
- * Not a problem on i386 as hpet_enable is called from late_time_init,
- * but on x86_64 it is necessary !
+ * If the HPET is enabled, then:
+ *
+ * 1) Reserve one channel for /dev/hpet if CONFIG_HPET=y
+ * 2) Reserve up to num_possible_cpus() channels as per CPU clockevents
+ * 3) Setup /dev/hpet if CONFIG_HPET=y
+ * 4) Register hotplug callbacks when clockevents are available
*/
static __init int hpet_late_init(void)
{
- int cpu;
-
- if (boot_hpet_disable)
- return -ENODEV;
+ int ret;
if (!hpet_address) {
if (!force_hpet_address)
@@ -908,53 +1133,76 @@ static __init int hpet_late_init(void)
if (!hpet_virt_address)
return -ENODEV;
- hpet_reserve_platform_timers(hpet_readl(HPET_ID));
+ hpet_select_device_channel();
+ hpet_select_clockevents();
+ hpet_reserve_platform_timers();
hpet_print_config();
- for_each_online_cpu(cpu) {
- hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu);
- }
-
- /* This notifier should be called after workqueue is ready */
- hotcpu_notifier(hpet_cpuhp_notify, -20);
+ if (!hpet_base.nr_clockevents)
+ return 0;
+ ret = cpuhp_setup_state(CPUHP_AP_X86_HPET_ONLINE, "x86/hpet:online",
+ hpet_cpuhp_online, NULL);
+ if (ret)
+ return ret;
+ ret = cpuhp_setup_state(CPUHP_X86_HPET_DEAD, "x86/hpet:dead", NULL,
+ hpet_cpuhp_dead);
+ if (ret)
+ goto err_cpuhp;
return 0;
+
+err_cpuhp:
+ cpuhp_remove_state(CPUHP_AP_X86_HPET_ONLINE);
+ return ret;
}
fs_initcall(hpet_late_init);
void hpet_disable(void)
{
- if (is_hpet_capable()) {
- unsigned long cfg = hpet_readl(HPET_CFG);
+ unsigned int i;
+ u32 cfg;
- if (hpet_legacy_int_enabled) {
- cfg &= ~HPET_CFG_LEGACY;
- hpet_legacy_int_enabled = 0;
- }
- cfg &= ~HPET_CFG_ENABLE;
- hpet_writel(cfg, HPET_CFG);
- }
+ if (!is_hpet_capable() || !hpet_virt_address)
+ return;
+
+ /* Restore boot configuration with the enable bit cleared */
+ cfg = hpet_base.boot_cfg;
+ cfg &= ~HPET_CFG_ENABLE;
+ hpet_writel(cfg, HPET_CFG);
+
+ /* Restore the channel boot configuration */
+ for (i = 0; i < hpet_base.nr_channels; i++)
+ hpet_writel(hpet_base.channels[i].boot_cfg, HPET_Tn_CFG(i));
+
+ /* If the HPET was enabled at boot time, reenable it */
+ if (hpet_base.boot_cfg & HPET_CFG_ENABLE)
+ hpet_writel(hpet_base.boot_cfg, HPET_CFG);
}
#ifdef CONFIG_HPET_EMULATE_RTC
-/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET
+/*
+ * HPET in LegacyReplacement mode eats up the RTC interrupt line. When HPET
* is enabled, we support RTC interrupt functionality in software.
+ *
* RTC has 3 kinds of interrupts:
- * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock
- * is updated
- * 2) Alarm Interrupt - generate an interrupt at a specific time of day
- * 3) Periodic Interrupt - generate periodic interrupt, with frequencies
- * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2)
- * (1) and (2) above are implemented using polling at a frequency of
- * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt
- * overhead. (DEFAULT_RTC_INT_FREQ)
- * For (3), we use interrupts at 64Hz or user specified periodic
- * frequency, whichever is higher.
+ *
+ * 1) Update Interrupt - generate an interrupt, every second, when the
+ * RTC clock is updated
+ * 2) Alarm Interrupt - generate an interrupt at a specific time of day
+ * 3) Periodic Interrupt - generate periodic interrupt, with frequencies
+ * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all frequencies in powers of 2)
+ *
+ * (1) and (2) above are implemented using polling at a frequency of 64 Hz:
+ * DEFAULT_RTC_INT_FREQ.
+ *
+ * The exact frequency is a tradeoff between accuracy and interrupt overhead.
+ *
+ * For (3), we use interrupts at 64 Hz, or the user specified periodic frequency,
+ * if it's higher.
*/
#include <linux/mc146818rtc.h>
#include <linux/rtc.h>
-#include <asm/rtc.h>
#define DEFAULT_RTC_INT_FREQ 64
#define DEFAULT_RTC_SHIFT 6
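With DEFAULT_RTC_SHIFT of 6, the default RTC emulation delta computed further down (evt->mult * NSEC_PER_SEC >> (evt->shift + DEFAULT_RTC_SHIFT)) is simply 1/64 of a second expressed in HPET cycles, e.g. about 14318180 / 64, roughly 223721 cycles on a 14.318 MHz HPET. A one-line sketch of that relationship (illustrative only):

#include <stdint.h>

/* 64 Hz polling: the reprogramming delta is just hpet_hz >> 6 cycles. */
static uint32_t rtc_poll_delta_cycles(uint32_t hpet_hz)
{
	return hpet_hz >> 6;	/* DEFAULT_RTC_SHIFT */
}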
@@ -965,14 +1213,14 @@ static int hpet_prev_update_sec;
static struct rtc_time hpet_alarm_time;
static unsigned long hpet_pie_count;
static u32 hpet_t1_cmp;
-static unsigned long hpet_default_delta;
-static unsigned long hpet_pie_delta;
+static u32 hpet_default_delta;
+static u32 hpet_pie_delta;
static unsigned long hpet_pie_limit;
static rtc_irq_handler irq_handler;
/*
- * Check that the hpet counter c1 is ahead of the c2
+ * Check that the HPET counter c1 is ahead of c2
*/
static inline int hpet_cnt_ahead(u32 c1, u32 c2)
{
@@ -1010,24 +1258,26 @@ void hpet_unregister_irq_handler(rtc_irq_handler handler)
EXPORT_SYMBOL_GPL(hpet_unregister_irq_handler);
/*
- * Timer 1 for RTC emulation. We use one shot mode, as periodic mode
- * is not supported by all HPET implementations for timer 1.
+ * Channel 1 for RTC emulation. We use one shot mode, as periodic mode
+ * is not supported by all HPET implementations for channel 1.
*
* hpet_rtc_timer_init() is called when the rtc is initialized.
*/
int hpet_rtc_timer_init(void)
{
- unsigned long cfg, cnt, delta, flags;
+ unsigned int cfg, cnt, delta;
+ unsigned long flags;
if (!is_hpet_enabled())
return 0;
if (!hpet_default_delta) {
+ struct clock_event_device *evt = &hpet_base.channels[0].evt;
uint64_t clc;
- clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
- clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT;
- hpet_default_delta = (unsigned long) clc;
+ clc = (uint64_t) evt->mult * NSEC_PER_SEC;
+ clc >>= evt->shift + DEFAULT_RTC_SHIFT;
+ hpet_default_delta = clc;
}
if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
@@ -1052,6 +1302,14 @@ int hpet_rtc_timer_init(void)
}
EXPORT_SYMBOL_GPL(hpet_rtc_timer_init);
+static void hpet_disable_rtc_channel(void)
+{
+ u32 cfg = hpet_readl(HPET_T1_CFG);
+
+ cfg &= ~HPET_TN_ENABLE;
+ hpet_writel(cfg, HPET_T1_CFG);
+}
+
/*
* The functions below are called from rtc driver.
* Return 0 if HPET is not being used.
@@ -1063,6 +1321,9 @@ int hpet_mask_rtc_irq_bit(unsigned long bit_mask)
return 0;
hpet_rtc_flags &= ~bit_mask;
+ if (unlikely(!hpet_rtc_flags))
+ hpet_disable_rtc_channel();
+
return 1;
}
EXPORT_SYMBOL_GPL(hpet_mask_rtc_irq_bit);
@@ -1086,8 +1347,7 @@ int hpet_set_rtc_irq_bit(unsigned long bit_mask)
}
EXPORT_SYMBOL_GPL(hpet_set_rtc_irq_bit);
-int hpet_set_alarm_time(unsigned char hrs, unsigned char min,
- unsigned char sec)
+int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec)
{
if (!is_hpet_enabled())
return 0;
@@ -1107,35 +1367,29 @@ int hpet_set_periodic_freq(unsigned long freq)
if (!is_hpet_enabled())
return 0;
- if (freq <= DEFAULT_RTC_INT_FREQ)
+ if (freq <= DEFAULT_RTC_INT_FREQ) {
hpet_pie_limit = DEFAULT_RTC_INT_FREQ / freq;
- else {
- clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
+ } else {
+ struct clock_event_device *evt = &hpet_base.channels[0].evt;
+
+ clc = (uint64_t) evt->mult * NSEC_PER_SEC;
do_div(clc, freq);
- clc >>= hpet_clockevent.shift;
- hpet_pie_delta = (unsigned long) clc;
+ clc >>= evt->shift;
+ hpet_pie_delta = clc;
+ hpet_pie_limit = 0;
}
+
return 1;
}
EXPORT_SYMBOL_GPL(hpet_set_periodic_freq);
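A similar hedged sketch of the two branches in hpet_set_periodic_freq() above: rates at or below 64 Hz are emulated by skipping polls (the hpet_pie_limit divider), while higher rates get their own comparator delta. The clockevent factors are again placeholders for illustration only:

#include <stdint.h>
#include <stdio.h>

#define DEFAULT_RTC_INT_FREQ	64
#define NSEC_PER_SEC		1000000000ULL

/* Placeholder clockevent factors, not real hardware values. */
#define EVT_MULT	178956971u
#define EVT_SHIFT	32

static void show(unsigned long freq)
{
	if (freq <= DEFAULT_RTC_INT_FREQ) {
		/* Slow rates: fire on every Nth 64 Hz poll. */
		printf("%4lu Hz -> every %lu polls\n",
		       freq, DEFAULT_RTC_INT_FREQ / freq);
	} else {
		/* Fast rates: program a dedicated comparator delta. */
		uint64_t clc = (uint64_t)EVT_MULT * NSEC_PER_SEC / freq;

		printf("%4lu Hz -> delta %llu ticks\n",
		       freq, (unsigned long long)(clc >> EVT_SHIFT));
	}
}

int main(void)
{
	show(2);	/* every 32nd poll */
	show(64);	/* every poll */
	show(1024);	/* dedicated delta */
	return 0;
}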
-int hpet_rtc_dropped_irq(void)
-{
- return is_hpet_enabled();
-}
-EXPORT_SYMBOL_GPL(hpet_rtc_dropped_irq);
-
static void hpet_rtc_timer_reinit(void)
{
- unsigned long cfg, delta;
+ unsigned int delta;
int lost_ints = -1;
- if (unlikely(!hpet_rtc_flags)) {
- cfg = hpet_readl(HPET_T1_CFG);
- cfg &= ~HPET_TN_ENABLE;
- hpet_writel(cfg, HPET_T1_CFG);
- return;
- }
+ if (unlikely(!hpet_rtc_flags))
+ hpet_disable_rtc_channel();
if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
delta = hpet_default_delta;
@@ -1156,8 +1410,7 @@ static void hpet_rtc_timer_reinit(void)
if (hpet_rtc_flags & RTC_PIE)
hpet_pie_count += lost_ints;
if (printk_ratelimit())
- printk(KERN_WARNING "hpet1: lost %d rtc interrupts\n",
- lost_ints);
+ pr_warn("Lost %d RTC interrupts\n", lost_ints);
}
}
@@ -1169,8 +1422,12 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
hpet_rtc_timer_reinit();
memset(&curr_time, 0, sizeof(struct rtc_time));
- if (hpet_rtc_flags & (RTC_UIE | RTC_AIE))
- get_rtc_time(&curr_time);
+ if (hpet_rtc_flags & (RTC_UIE | RTC_AIE)) {
+ if (unlikely(mc146818_get_time(&curr_time, 10) < 0)) {
+ pr_err_ratelimited("unable to read current time from RTC\n");
+ return IRQ_HANDLED;
+ }
+ }
if (hpet_rtc_flags & RTC_UIE &&
curr_time.tm_sec != hpet_prev_update_sec) {
@@ -1179,8 +1436,7 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
hpet_prev_update_sec = curr_time.tm_sec;
}
- if (hpet_rtc_flags & RTC_PIE &&
- ++hpet_pie_count >= hpet_pie_limit) {
+ if (hpet_rtc_flags & RTC_PIE && ++hpet_pie_count >= hpet_pie_limit) {
rtc_int_flag |= RTC_PF;
hpet_pie_count = 0;
}
@@ -1189,7 +1445,7 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
(curr_time.tm_sec == hpet_alarm_time.tm_sec) &&
(curr_time.tm_min == hpet_alarm_time.tm_min) &&
(curr_time.tm_hour == hpet_alarm_time.tm_hour))
- rtc_int_flag |= RTC_AF;
+ rtc_int_flag |= RTC_AF;
if (rtc_int_flag) {
rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8));
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
new file mode 100644
index 000000000000..f846c15f21ca
--- /dev/null
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -0,0 +1,593 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ *
+ * Copyright (C) 2007 Alan Stern
+ * Copyright (C) 2009 IBM Corporation
+ * Copyright (C) 2009 Frederic Weisbecker <fweisbec@gmail.com>
+ *
+ * Authors: Alan Stern <stern@rowland.harvard.edu>
+ * K.Prasad <prasad@linux.vnet.ibm.com>
+ * Frederic Weisbecker <fweisbec@gmail.com>
+ */
+
+/*
+ * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
+ * using the CPU's debug registers.
+ */
+
+#include <linux/perf_event.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/irqflags.h>
+#include <linux/notifier.h>
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/percpu.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/kvm_types.h>
+#include <linux/export.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+
+#include <asm/hw_breakpoint.h>
+#include <asm/processor.h>
+#include <asm/debugreg.h>
+#include <asm/user.h>
+#include <asm/desc.h>
+#include <asm/tlbflush.h>
+
+/* Per cpu debug control register value */
+DEFINE_PER_CPU(unsigned long, cpu_dr7);
+EXPORT_PER_CPU_SYMBOL(cpu_dr7);
+
+/* Per-CPU debug address register values */
+static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]);
+
+/*
+ * Stores the breakpoints currently in use on each breakpoint address
+ * register for each CPU
+ */
+static DEFINE_PER_CPU(struct perf_event *, bp_per_reg[HBP_NUM]);
+
+
+static inline unsigned long
+__encode_dr7(int drnum, unsigned int len, unsigned int type)
+{
+ unsigned long bp_info;
+
+ bp_info = (len | type) & 0xf;
+ bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE);
+ bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE));
+
+ return bp_info;
+}
+
+/*
+ * Encode the length, type, Exact, and Enable bits for a particular breakpoint
+ * as stored in debug register 7.
+ */
+unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type)
+{
+ return __encode_dr7(drnum, len, type) | DR_GLOBAL_SLOWDOWN;
+}
+
+/*
+ * Decode the length and type bits for a particular breakpoint as
+ * stored in debug register 7. Return the "enabled" status.
+ */
+int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type)
+{
+ int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE);
+
+ *len = (bp_info & 0xc) | 0x40;
+ *type = (bp_info & 0x3) | 0x80;
+
+ return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3;
+}
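For readers unfamiliar with the DR7 layout these two helpers encode and decode, a minimal stand-alone sketch of the bit packing follows. The field offsets are the conventional x86 values (4-bit control nibbles starting at bit 16, 2-bit enable pairs at bit 0) and are assumed here rather than taken from asm/debugreg.h:

#include <stdio.h>

/* Assumed x86 DR7 field layout (conventional values, for illustration). */
#define DR_CONTROL_SHIFT	16	/* first R/W+LEN nibble starts at bit 16 */
#define DR_CONTROL_SIZE		4	/* 4 control bits per breakpoint */
#define DR_ENABLE_SIZE		2	/* local/global enable pair per breakpoint */
#define DR_GLOBAL_ENABLE	0x2	/* G<n> bit within that pair */

static unsigned long pack_dr7(int drnum, unsigned int len, unsigned int type)
{
	unsigned long bp_info = (len | type) & 0xf;

	bp_info <<= DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE;
	bp_info |= (unsigned long)DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE);

	return bp_info;
}

int main(void)
{
	/*
	 * 4-byte data-write breakpoint in debug register 1:
	 * LEN = 0b11 (4 bytes), R/W = 0b01 (write), per the SDM encoding.
	 */
	unsigned long dr7 = pack_dr7(1, 0xc, 0x1);

	printf("dr7 = %#lx\n", dr7);	/* expect 0xd00008 */
	return 0;
}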
+
+/*
+ * Install a perf counter breakpoint.
+ *
+ * We seek a free debug address register and use it for this
+ * breakpoint. Eventually we enable it in the debug control register.
+ *
+ * Atomic: we hold the counter->ctx->lock and we only handle variables
+ * and registers local to this cpu.
+ */
+int arch_install_hw_breakpoint(struct perf_event *bp)
+{
+ struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+ unsigned long *dr7;
+ int i;
+
+ lockdep_assert_irqs_disabled();
+
+ for (i = 0; i < HBP_NUM; i++) {
+ struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]);
+
+ if (!*slot) {
+ *slot = bp;
+ break;
+ }
+ }
+
+ if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
+ return -EBUSY;
+
+ set_debugreg(info->address, i);
+ __this_cpu_write(cpu_debugreg[i], info->address);
+
+ dr7 = this_cpu_ptr(&cpu_dr7);
+ *dr7 |= encode_dr7(i, info->len, info->type);
+
+ /*
+ * Ensure we first write cpu_dr7 before we set the DR7 register.
+ * This ensures an NMI never sees cpu_dr7 as 0 when DR7 is not.
+ */
+ barrier();
+
+ set_debugreg(*dr7, 7);
+ if (info->mask)
+ amd_set_dr_addr_mask(info->mask, i);
+
+ return 0;
+}
+
+/*
+ * Uninstall the breakpoint contained in the given counter.
+ *
+ * First we search the debug address register it uses and then we disable
+ * it.
+ *
+ * Atomic: we hold the counter->ctx->lock and we only handle variables
+ * and registers local to this cpu.
+ */
+void arch_uninstall_hw_breakpoint(struct perf_event *bp)
+{
+ struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+ unsigned long dr7;
+ int i;
+
+ lockdep_assert_irqs_disabled();
+
+ for (i = 0; i < HBP_NUM; i++) {
+ struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]);
+
+ if (*slot == bp) {
+ *slot = NULL;
+ break;
+ }
+ }
+
+ if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
+ return;
+
+ dr7 = this_cpu_read(cpu_dr7);
+ dr7 &= ~__encode_dr7(i, info->len, info->type);
+
+ set_debugreg(dr7, 7);
+ if (info->mask)
+ amd_set_dr_addr_mask(0, i);
+
+ /*
+ * Ensure the write to cpu_dr7 is after we've set the DR7 register.
+ * This ensures an NMI never sees cpu_dr7 as 0 when DR7 is not.
+ */
+ barrier();
+
+ this_cpu_write(cpu_dr7, dr7);
+}
+
+static int arch_bp_generic_len(int x86_len)
+{
+ switch (x86_len) {
+ case X86_BREAKPOINT_LEN_1:
+ return HW_BREAKPOINT_LEN_1;
+ case X86_BREAKPOINT_LEN_2:
+ return HW_BREAKPOINT_LEN_2;
+ case X86_BREAKPOINT_LEN_4:
+ return HW_BREAKPOINT_LEN_4;
+#ifdef CONFIG_X86_64
+ case X86_BREAKPOINT_LEN_8:
+ return HW_BREAKPOINT_LEN_8;
+#endif
+ default:
+ return -EINVAL;
+ }
+}
+
+int arch_bp_generic_fields(int x86_len, int x86_type,
+ int *gen_len, int *gen_type)
+{
+ int len;
+
+ /* Type */
+ switch (x86_type) {
+ case X86_BREAKPOINT_EXECUTE:
+ if (x86_len != X86_BREAKPOINT_LEN_X)
+ return -EINVAL;
+
+ *gen_type = HW_BREAKPOINT_X;
+ *gen_len = sizeof(long);
+ return 0;
+ case X86_BREAKPOINT_WRITE:
+ *gen_type = HW_BREAKPOINT_W;
+ break;
+ case X86_BREAKPOINT_RW:
+ *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ /* Len */
+ len = arch_bp_generic_len(x86_len);
+ if (len < 0)
+ return -EINVAL;
+ *gen_len = len;
+
+ return 0;
+}
+
+/*
+ * Check for virtual address in kernel space.
+ */
+int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw)
+{
+ unsigned long va;
+ int len;
+
+ va = hw->address;
+ len = arch_bp_generic_len(hw->len);
+ WARN_ON_ONCE(len < 0);
+
+ /*
+ * We don't need to worry about va + len - 1 overflowing:
+ * we already require that va is aligned to a multiple of len.
+ */
+ return (va >= TASK_SIZE_MAX) || ((va + len - 1) >= TASK_SIZE_MAX);
+}
+
+/*
+ * Checks whether the range [addr, end] overlaps the area [base, base + size).
+ */
+static inline bool within_area(unsigned long addr, unsigned long end,
+ unsigned long base, unsigned long size)
+{
+ return end >= base && addr < (base + size);
+}
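A tiny, purely hypothetical usage sketch of the overlap test above (an inclusive [addr, end] range checked against a half-open [base, base + size) area):

#include <stdbool.h>
#include <stdio.h>

/* Mirror of the overlap check above, for illustration. */
static bool within_area(unsigned long addr, unsigned long end,
			unsigned long base, unsigned long size)
{
	return end >= base && addr < (base + size);
}

int main(void)
{
	/* A 1-byte watchpoint at 0x1fff overlaps a page at 0x1000... */
	printf("%d\n", within_area(0x1fff, 0x1fff, 0x1000, 0x1000)); /* 1 */
	/* ...but the next page does not. */
	printf("%d\n", within_area(0x2000, 0x2007, 0x1000, 0x1000)); /* 0 */
	return 0;
}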
+
+/*
+ * Checks whether the range from addr to end, inclusive, overlaps the fixed
+ * mapped CPU entry area range or other ranges used for CPU entry.
+ */
+static inline bool within_cpu_entry(unsigned long addr, unsigned long end)
+{
+ int cpu;
+
+ /* CPU entry area is always used for CPU entry */
+ if (within_area(addr, end, CPU_ENTRY_AREA_BASE,
+ CPU_ENTRY_AREA_MAP_SIZE))
+ return true;
+
+ /*
+ * When FSGSBASE is enabled, paranoid_entry() fetches the per-CPU
+ * GSBASE value via __per_cpu_offset or pcpu_unit_offsets.
+ */
+#ifdef CONFIG_SMP
+ if (within_area(addr, end, (unsigned long)__per_cpu_offset,
+ sizeof(unsigned long) * nr_cpu_ids))
+ return true;
+#else
+ if (within_area(addr, end, (unsigned long)&pcpu_unit_offsets,
+ sizeof(pcpu_unit_offsets)))
+ return true;
+#endif
+
+ for_each_possible_cpu(cpu) {
+ /* The original rw GDT is being used after load_direct_gdt() */
+ if (within_area(addr, end, (unsigned long)get_cpu_gdt_rw(cpu),
+ GDT_SIZE))
+ return true;
+
+ /*
+ * cpu_tss_rw is not directly referenced by hardware, but it is
+ * also used in the CPU entry code.
+ */
+ if (within_area(addr, end,
+ (unsigned long)&per_cpu(cpu_tss_rw, cpu),
+ sizeof(struct tss_struct)))
+ return true;
+
+ /*
+ * cpu_tlbstate.user_pcid_flush_mask is used for CPU entry.
+ * If a data breakpoint is set on it, it will cause an unwanted #DB.
+ * Protect the full cpu_tlbstate structure to be sure.
+ */
+ if (within_area(addr, end,
+ (unsigned long)&per_cpu(cpu_tlbstate, cpu),
+ sizeof(struct tlb_state)))
+ return true;
+
+ /*
+ * When in guest (X86_FEATURE_HYPERVISOR), local_db_save()
+ * will read the per-CPU cpu_dr7 before clearing the DR7 register.
+ */
+ if (within_area(addr, end, (unsigned long)&per_cpu(cpu_dr7, cpu),
+ sizeof(cpu_dr7)))
+ return true;
+ }
+
+ return false;
+}
+
+static int arch_build_bp_info(struct perf_event *bp,
+ const struct perf_event_attr *attr,
+ struct arch_hw_breakpoint *hw)
+{
+ unsigned long bp_end;
+
+ bp_end = attr->bp_addr + attr->bp_len - 1;
+ if (bp_end < attr->bp_addr)
+ return -EINVAL;
+
+ /*
+ * Prevent any breakpoint of any type that overlaps the CPU
+ * entry area and data. This protects the IST stacks and also
+ * reduces the chance that we ever find out what happens if
+ * there's a data breakpoint on the GDT, IDT, or TSS.
+ */
+ if (within_cpu_entry(attr->bp_addr, bp_end))
+ return -EINVAL;
+
+ hw->address = attr->bp_addr;
+ hw->mask = 0;
+
+ /* Type */
+ switch (attr->bp_type) {
+ case HW_BREAKPOINT_W:
+ hw->type = X86_BREAKPOINT_WRITE;
+ break;
+ case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
+ hw->type = X86_BREAKPOINT_RW;
+ break;
+ case HW_BREAKPOINT_X:
+ /*
+ * We don't allow kernel breakpoints in places that are not
+ * acceptable for kprobes. On non-kprobes kernels, we don't
+ * allow kernel breakpoints at all.
+ */
+ if (attr->bp_addr >= TASK_SIZE_MAX) {
+ if (within_kprobe_blacklist(attr->bp_addr))
+ return -EINVAL;
+ }
+
+ hw->type = X86_BREAKPOINT_EXECUTE;
+ /*
+ * x86 instruction breakpoints need to have a specific, undefined length.
+ * But we still need to check that userspace is not trying to set up
+ * an unsupported length, to get a range breakpoint for example.
+ */
+ if (attr->bp_len == sizeof(long)) {
+ hw->len = X86_BREAKPOINT_LEN_X;
+ return 0;
+ }
+ fallthrough;
+ default:
+ return -EINVAL;
+ }
+
+ /* Len */
+ switch (attr->bp_len) {
+ case HW_BREAKPOINT_LEN_1:
+ hw->len = X86_BREAKPOINT_LEN_1;
+ break;
+ case HW_BREAKPOINT_LEN_2:
+ hw->len = X86_BREAKPOINT_LEN_2;
+ break;
+ case HW_BREAKPOINT_LEN_4:
+ hw->len = X86_BREAKPOINT_LEN_4;
+ break;
+#ifdef CONFIG_X86_64
+ case HW_BREAKPOINT_LEN_8:
+ hw->len = X86_BREAKPOINT_LEN_8;
+ break;
+#endif
+ default:
+ /* AMD range breakpoint */
+ if (!is_power_of_2(attr->bp_len))
+ return -EINVAL;
+ if (attr->bp_addr & (attr->bp_len - 1))
+ return -EINVAL;
+
+ if (!boot_cpu_has(X86_FEATURE_BPEXT))
+ return -EOPNOTSUPP;
+
+ /*
+ * It's impossible to use a range breakpoint to fake out
+ * user vs kernel detection because bp_len - 1 can't
+ * have the high bit set. If we ever allow range instruction
+ * breakpoints, then we'll have to check for kprobe-blacklisted
+ * addresses anywhere in the range.
+ */
+ hw->mask = attr->bp_len - 1;
+ hw->len = X86_BREAKPOINT_LEN_1;
+ }
+
+ return 0;
+}
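A hedged sketch of the AMD range-breakpoint fallback handled in the default case above: a power-of-two bp_len larger than the architectural 1/2/4/8-byte sizes becomes a 1-byte breakpoint plus an address mask of bp_len - 1, provided the address is aligned to the length. The helper below is illustrative only, not the kernel's API:

#include <stdbool.h>
#include <stdio.h>

static bool is_power_of_2(unsigned long n)
{
	return n != 0 && (n & (n - 1)) == 0;
}

/* Map a wide watch request to (mask, 1-byte breakpoint), as above. */
static int build_range_bp(unsigned long addr, unsigned long len,
			  unsigned long *mask)
{
	if (!is_power_of_2(len))
		return -1;
	if (addr & (len - 1))
		return -1;

	*mask = len - 1;
	return 0;
}

int main(void)
{
	unsigned long mask;

	if (!build_range_bp(0x402000, 32, &mask))
		printf("watch 32 bytes: mask=%#lx\n", mask);	/* 0x1f */

	if (build_range_bp(0x402004, 32, &mask))
		printf("misaligned 32-byte request rejected\n");
	return 0;
}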
+
+/*
+ * Validate the arch-specific HW Breakpoint register settings
+ */
+int hw_breakpoint_arch_parse(struct perf_event *bp,
+ const struct perf_event_attr *attr,
+ struct arch_hw_breakpoint *hw)
+{
+ unsigned int align;
+ int ret;
+
+
+ ret = arch_build_bp_info(bp, attr, hw);
+ if (ret)
+ return ret;
+
+ switch (hw->len) {
+ case X86_BREAKPOINT_LEN_1:
+ align = 0;
+ if (hw->mask)
+ align = hw->mask;
+ break;
+ case X86_BREAKPOINT_LEN_2:
+ align = 1;
+ break;
+ case X86_BREAKPOINT_LEN_4:
+ align = 3;
+ break;
+#ifdef CONFIG_X86_64
+ case X86_BREAKPOINT_LEN_8:
+ align = 7;
+ break;
+#endif
+ default:
+ WARN_ON_ONCE(1);
+ return -EINVAL;
+ }
+
+ /*
+ * Check that the low-order bits of the address are appropriate
+ * for the alignment implied by len.
+ */
+ if (hw->address & align)
+ return -EINVAL;
+
+ return 0;
+}
+
+/*
+ * Release the user breakpoints used by ptrace
+ */
+void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
+{
+ int i;
+ struct thread_struct *t = &tsk->thread;
+
+ for (i = 0; i < HBP_NUM; i++) {
+ unregister_hw_breakpoint(t->ptrace_bps[i]);
+ t->ptrace_bps[i] = NULL;
+ }
+
+ t->virtual_dr6 = 0;
+ t->ptrace_dr7 = 0;
+}
+
+void hw_breakpoint_restore(void)
+{
+ set_debugreg(__this_cpu_read(cpu_debugreg[0]), 0);
+ set_debugreg(__this_cpu_read(cpu_debugreg[1]), 1);
+ set_debugreg(__this_cpu_read(cpu_debugreg[2]), 2);
+ set_debugreg(__this_cpu_read(cpu_debugreg[3]), 3);
+ set_debugreg(DR6_RESERVED, 6);
+ set_debugreg(__this_cpu_read(cpu_dr7), 7);
+}
+EXPORT_SYMBOL_FOR_KVM(hw_breakpoint_restore);
+
+/*
+ * Handle debug exception notifications.
+ *
+ * Return value is either NOTIFY_STOP or NOTIFY_DONE as explained below.
+ *
+ * NOTIFY_DONE is returned if one of the following conditions is true:
+ * i) When the causative address is from user-space and the exception
+ * is a valid one, i.e. not triggered as a result of lazy debug register
+ * switching
+ * ii) When there are more bits than trap<n> set in the DR6 register (such
+ * as BD, BS or BT), indicating that more than one debug condition is
+ * met and requires some more action in do_debug().
+ *
+ * NOTIFY_STOP is returned in all other cases.
+ *
+ */
+static int hw_breakpoint_handler(struct die_args *args)
+{
+ int i, rc = NOTIFY_STOP;
+ struct perf_event *bp;
+ unsigned long *dr6_p;
+ unsigned long dr6;
+ bool bpx;
+
+ /* The DR6 value is pointed to by args->err */
+ dr6_p = (unsigned long *)ERR_PTR(args->err);
+ dr6 = *dr6_p;
+
+ /* Do an early return if no trap bits are set in DR6 */
+ if ((dr6 & DR_TRAP_BITS) == 0)
+ return NOTIFY_DONE;
+
+ /* Handle all the breakpoints that were triggered */
+ for (i = 0; i < HBP_NUM; ++i) {
+ if (likely(!(dr6 & (DR_TRAP0 << i))))
+ continue;
+
+ bp = this_cpu_read(bp_per_reg[i]);
+ if (!bp)
+ continue;
+
+ bpx = bp->hw.info.type == X86_BREAKPOINT_EXECUTE;
+
+ /*
+ * TF and data breakpoints are traps and can be merged, however
+ * instruction breakpoints are faults and will be raised
+ * separately.
+ *
+ * However DR6 can indicate both TF and instruction
+ * breakpoints. In that case take TF as that has precedence and
+ * delay the instruction breakpoint for the next exception.
+ */
+ if (bpx && (dr6 & DR_STEP))
+ continue;
+
+ /*
+ * Reset the 'i'th TRAP bit in dr6 to denote completion of
+ * exception handling
+ */
+ (*dr6_p) &= ~(DR_TRAP0 << i);
+
+ perf_bp_event(bp, args->regs);
+
+ /*
+ * Set up resume flag to avoid breakpoint recursion when
+ * returning back to origin.
+ */
+ if (bpx)
+ args->regs->flags |= X86_EFLAGS_RF;
+ }
+
+ /*
+ * Further processing in do_debug() is needed for a) user-space
+ * breakpoints (to generate signals) and b) when the system has
+ * taken an exception due to multiple causes
+ */
+ if ((current->thread.virtual_dr6 & DR_TRAP_BITS) ||
+ (dr6 & (~DR_TRAP_BITS)))
+ rc = NOTIFY_DONE;
+
+ return rc;
+}
+
+/*
+ * Handle debug exception notifications.
+ */
+int hw_breakpoint_exceptions_notify(
+ struct notifier_block *unused, unsigned long val, void *data)
+{
+ if (val != DIE_DEBUG)
+ return NOTIFY_DONE;
+
+ return hw_breakpoint_handler(data);
+}
+
+void hw_breakpoint_pmu_read(struct perf_event *bp)
+{
+ /* TODO */
+}
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
deleted file mode 100644
index 43cec6bdda63..000000000000
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ /dev/null
@@ -1,28 +0,0 @@
-#include <linux/module.h>
-
-#include <asm/checksum.h>
-#include <asm/pgtable.h>
-#include <asm/desc.h>
-#include <asm/ftrace.h>
-
-#ifdef CONFIG_FUNCTION_TRACER
-/* mcount is defined in assembly */
-EXPORT_SYMBOL(mcount);
-#endif
-
-/* Networking helper routines. */
-EXPORT_SYMBOL(csum_partial_copy_generic);
-
-EXPORT_SYMBOL(__get_user_1);
-EXPORT_SYMBOL(__get_user_2);
-EXPORT_SYMBOL(__get_user_4);
-
-EXPORT_SYMBOL(__put_user_1);
-EXPORT_SYMBOL(__put_user_2);
-EXPORT_SYMBOL(__put_user_4);
-EXPORT_SYMBOL(__put_user_8);
-
-EXPORT_SYMBOL(strstr);
-
-EXPORT_SYMBOL(csum_partial);
-EXPORT_SYMBOL(empty_zero_page);
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
deleted file mode 100644
index f2f8540a7f3d..000000000000
--- a/arch/x86/kernel/i387.c
+++ /dev/null
@@ -1,669 +0,0 @@
-/*
- * Copyright (C) 1994 Linus Torvalds
- *
- * Pentium III FXSR, SSE support
- * General FPU state handling cleanups
- * Gareth Hughes <gareth@valinux.com>, May 2000
- */
-#include <linux/module.h>
-#include <linux/regset.h>
-#include <linux/sched.h>
-
-#include <asm/sigcontext.h>
-#include <asm/processor.h>
-#include <asm/math_emu.h>
-#include <asm/uaccess.h>
-#include <asm/ptrace.h>
-#include <asm/i387.h>
-#include <asm/user.h>
-
-#ifdef CONFIG_X86_64
-# include <asm/sigcontext32.h>
-# include <asm/user32.h>
-#else
-# define save_i387_xstate_ia32 save_i387_xstate
-# define restore_i387_xstate_ia32 restore_i387_xstate
-# define _fpstate_ia32 _fpstate
-# define _xstate_ia32 _xstate
-# define sig_xstate_ia32_size sig_xstate_size
-# define fx_sw_reserved_ia32 fx_sw_reserved
-# define user_i387_ia32_struct user_i387_struct
-# define user32_fxsr_struct user_fxsr_struct
-#endif
-
-#ifdef CONFIG_MATH_EMULATION
-# define HAVE_HWFP (boot_cpu_data.hard_math)
-#else
-# define HAVE_HWFP 1
-#endif
-
-static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu;
-unsigned int xstate_size;
-unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32);
-static struct i387_fxsave_struct fx_scratch __cpuinitdata;
-
-void __cpuinit mxcsr_feature_mask_init(void)
-{
- unsigned long mask = 0;
-
- clts();
- if (cpu_has_fxsr) {
- memset(&fx_scratch, 0, sizeof(struct i387_fxsave_struct));
- asm volatile("fxsave %0" : : "m" (fx_scratch));
- mask = fx_scratch.mxcsr_mask;
- if (mask == 0)
- mask = 0x0000ffbf;
- }
- mxcsr_feature_mask &= mask;
- stts();
-}
-
-void __cpuinit init_thread_xstate(void)
-{
- if (!HAVE_HWFP) {
- xstate_size = sizeof(struct i387_soft_struct);
- return;
- }
-
- if (cpu_has_xsave) {
- xsave_cntxt_init();
- return;
- }
-
- if (cpu_has_fxsr)
- xstate_size = sizeof(struct i387_fxsave_struct);
-#ifdef CONFIG_X86_32
- else
- xstate_size = sizeof(struct i387_fsave_struct);
-#endif
-}
-
-#ifdef CONFIG_X86_64
-/*
- * Called at bootup to set up the initial FPU state that is later cloned
- * into all processes.
- */
-void __cpuinit fpu_init(void)
-{
- unsigned long oldcr0 = read_cr0();
-
- set_in_cr4(X86_CR4_OSFXSR);
- set_in_cr4(X86_CR4_OSXMMEXCPT);
-
- write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */
-
- /*
- * Boot processor to setup the FP and extended state context info.
- */
- if (!smp_processor_id())
- init_thread_xstate();
- xsave_init();
-
- mxcsr_feature_mask_init();
- /* clean state in init */
- if (cpu_has_xsave)
- current_thread_info()->status = TS_XSAVE;
- else
- current_thread_info()->status = 0;
- clear_used_math();
-}
-#endif /* CONFIG_X86_64 */
-
-/*
- * The _current_ task is using the FPU for the first time
- * so initialize it and set the mxcsr to its default
- * value at reset if we support XMM instructions and then
- * remeber the current task has used the FPU.
- */
-int init_fpu(struct task_struct *tsk)
-{
- if (tsk_used_math(tsk)) {
- if (HAVE_HWFP && tsk == current)
- unlazy_fpu(tsk);
- return 0;
- }
-
- /*
- * Memory allocation at the first usage of the FPU and other state.
- */
- if (!tsk->thread.xstate) {
- tsk->thread.xstate = kmem_cache_alloc(task_xstate_cachep,
- GFP_KERNEL);
- if (!tsk->thread.xstate)
- return -ENOMEM;
- }
-
-#ifdef CONFIG_X86_32
- if (!HAVE_HWFP) {
- memset(tsk->thread.xstate, 0, xstate_size);
- finit_task(tsk);
- set_stopped_child_used_math(tsk);
- return 0;
- }
-#endif
-
- if (cpu_has_fxsr) {
- struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave;
-
- memset(fx, 0, xstate_size);
- fx->cwd = 0x37f;
- if (cpu_has_xmm)
- fx->mxcsr = MXCSR_DEFAULT;
- } else {
- struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave;
- memset(fp, 0, xstate_size);
- fp->cwd = 0xffff037fu;
- fp->swd = 0xffff0000u;
- fp->twd = 0xffffffffu;
- fp->fos = 0xffff0000u;
- }
- /*
- * Only the device not available exception or ptrace can call init_fpu.
- */
- set_stopped_child_used_math(tsk);
- return 0;
-}
-
-int fpregs_active(struct task_struct *target, const struct user_regset *regset)
-{
- return tsk_used_math(target) ? regset->n : 0;
-}
-
-int xfpregs_active(struct task_struct *target, const struct user_regset *regset)
-{
- return (cpu_has_fxsr && tsk_used_math(target)) ? regset->n : 0;
-}
-
-int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
-{
- int ret;
-
- if (!cpu_has_fxsr)
- return -ENODEV;
-
- ret = init_fpu(target);
- if (ret)
- return ret;
-
- return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- &target->thread.xstate->fxsave, 0, -1);
-}
-
-int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- const void *kbuf, const void __user *ubuf)
-{
- int ret;
-
- if (!cpu_has_fxsr)
- return -ENODEV;
-
- ret = init_fpu(target);
- if (ret)
- return ret;
-
- set_stopped_child_used_math(target);
-
- ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
- &target->thread.xstate->fxsave, 0, -1);
-
- /*
- * mxcsr reserved bits must be masked to zero for security reasons.
- */
- target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
-
- /*
- * update the header bits in the xsave header, indicating the
- * presence of FP and SSE state.
- */
- if (cpu_has_xsave)
- target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
-
- return ret;
-}
-
-#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
-
-/*
- * FPU tag word conversions.
- */
-
-static inline unsigned short twd_i387_to_fxsr(unsigned short twd)
-{
- unsigned int tmp; /* to avoid 16 bit prefixes in the code */
-
- /* Transform each pair of bits into 01 (valid) or 00 (empty) */
- tmp = ~twd;
- tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */
- /* and move the valid bits to the lower byte. */
- tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */
- tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */
- tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */
-
- return tmp;
-}
-
-#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16);
-#define FP_EXP_TAG_VALID 0
-#define FP_EXP_TAG_ZERO 1
-#define FP_EXP_TAG_SPECIAL 2
-#define FP_EXP_TAG_EMPTY 3
-
-static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave)
-{
- struct _fpxreg *st;
- u32 tos = (fxsave->swd >> 11) & 7;
- u32 twd = (unsigned long) fxsave->twd;
- u32 tag;
- u32 ret = 0xffff0000u;
- int i;
-
- for (i = 0; i < 8; i++, twd >>= 1) {
- if (twd & 0x1) {
- st = FPREG_ADDR(fxsave, (i - tos) & 7);
-
- switch (st->exponent & 0x7fff) {
- case 0x7fff:
- tag = FP_EXP_TAG_SPECIAL;
- break;
- case 0x0000:
- if (!st->significand[0] &&
- !st->significand[1] &&
- !st->significand[2] &&
- !st->significand[3])
- tag = FP_EXP_TAG_ZERO;
- else
- tag = FP_EXP_TAG_SPECIAL;
- break;
- default:
- if (st->significand[3] & 0x8000)
- tag = FP_EXP_TAG_VALID;
- else
- tag = FP_EXP_TAG_SPECIAL;
- break;
- }
- } else {
- tag = FP_EXP_TAG_EMPTY;
- }
- ret |= tag << (2 * i);
- }
- return ret;
-}
-
-/*
- * FXSR floating point environment conversions.
- */
-
-static void
-convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
-{
- struct i387_fxsave_struct *fxsave = &tsk->thread.xstate->fxsave;
- struct _fpreg *to = (struct _fpreg *) &env->st_space[0];
- struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0];
- int i;
-
- env->cwd = fxsave->cwd | 0xffff0000u;
- env->swd = fxsave->swd | 0xffff0000u;
- env->twd = twd_fxsr_to_i387(fxsave);
-
-#ifdef CONFIG_X86_64
- env->fip = fxsave->rip;
- env->foo = fxsave->rdp;
- if (tsk == current) {
- /*
- * should be actually ds/cs at fpu exception time, but
- * that information is not available in 64bit mode.
- */
- asm("mov %%ds, %[fos]" : [fos] "=r" (env->fos));
- asm("mov %%cs, %[fcs]" : [fcs] "=r" (env->fcs));
- } else {
- struct pt_regs *regs = task_pt_regs(tsk);
-
- env->fos = 0xffff0000 | tsk->thread.ds;
- env->fcs = regs->cs;
- }
-#else
- env->fip = fxsave->fip;
- env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16);
- env->foo = fxsave->foo;
- env->fos = fxsave->fos;
-#endif
-
- for (i = 0; i < 8; ++i)
- memcpy(&to[i], &from[i], sizeof(to[0]));
-}
-
-static void convert_to_fxsr(struct task_struct *tsk,
- const struct user_i387_ia32_struct *env)
-
-{
- struct i387_fxsave_struct *fxsave = &tsk->thread.xstate->fxsave;
- struct _fpreg *from = (struct _fpreg *) &env->st_space[0];
- struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0];
- int i;
-
- fxsave->cwd = env->cwd;
- fxsave->swd = env->swd;
- fxsave->twd = twd_i387_to_fxsr(env->twd);
- fxsave->fop = (u16) ((u32) env->fcs >> 16);
-#ifdef CONFIG_X86_64
- fxsave->rip = env->fip;
- fxsave->rdp = env->foo;
- /* cs and ds ignored */
-#else
- fxsave->fip = env->fip;
- fxsave->fcs = (env->fcs & 0xffff);
- fxsave->foo = env->foo;
- fxsave->fos = env->fos;
-#endif
-
- for (i = 0; i < 8; ++i)
- memcpy(&to[i], &from[i], sizeof(from[0]));
-}
-
-int fpregs_get(struct task_struct *target, const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
-{
- struct user_i387_ia32_struct env;
- int ret;
-
- ret = init_fpu(target);
- if (ret)
- return ret;
-
- if (!HAVE_HWFP)
- return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf);
-
- if (!cpu_has_fxsr) {
- return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- &target->thread.xstate->fsave, 0,
- -1);
- }
-
- if (kbuf && pos == 0 && count == sizeof(env)) {
- convert_from_fxsr(kbuf, target);
- return 0;
- }
-
- convert_from_fxsr(&env, target);
-
- return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &env, 0, -1);
-}
-
-int fpregs_set(struct task_struct *target, const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- const void *kbuf, const void __user *ubuf)
-{
- struct user_i387_ia32_struct env;
- int ret;
-
- ret = init_fpu(target);
- if (ret)
- return ret;
-
- set_stopped_child_used_math(target);
-
- if (!HAVE_HWFP)
- return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
-
- if (!cpu_has_fxsr) {
- return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
- &target->thread.xstate->fsave, 0, -1);
- }
-
- if (pos > 0 || count < sizeof(env))
- convert_from_fxsr(&env, target);
-
- ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1);
- if (!ret)
- convert_to_fxsr(target, &env);
-
- /*
- * update the header bit in the xsave header, indicating the
- * presence of FP.
- */
- if (cpu_has_xsave)
- target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FP;
- return ret;
-}
-
-/*
- * Signal frame handlers.
- */
-
-static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf)
-{
- struct task_struct *tsk = current;
- struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave;
-
- fp->status = fp->swd;
- if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct)))
- return -1;
- return 1;
-}
-
-static int save_i387_fxsave(struct _fpstate_ia32 __user *buf)
-{
- struct task_struct *tsk = current;
- struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave;
- struct user_i387_ia32_struct env;
- int err = 0;
-
- convert_from_fxsr(&env, tsk);
- if (__copy_to_user(buf, &env, sizeof(env)))
- return -1;
-
- err |= __put_user(fx->swd, &buf->status);
- err |= __put_user(X86_FXSR_MAGIC, &buf->magic);
- if (err)
- return -1;
-
- if (__copy_to_user(&buf->_fxsr_env[0], fx, xstate_size))
- return -1;
- return 1;
-}
-
-static int save_i387_xsave(void __user *buf)
-{
- struct task_struct *tsk = current;
- struct _fpstate_ia32 __user *fx = buf;
- int err = 0;
-
- /*
- * For legacy compatible, we always set FP/SSE bits in the bit
- * vector while saving the state to the user context.
- * This will enable us capturing any changes(during sigreturn) to
- * the FP/SSE bits by the legacy applications which don't touch
- * xstate_bv in the xsave header.
- *
- * xsave aware applications can change the xstate_bv in the xsave
- * header as well as change any contents in the memory layout.
- * xrestore as part of sigreturn will capture all the changes.
- */
- tsk->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
-
- if (save_i387_fxsave(fx) < 0)
- return -1;
-
- err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved_ia32,
- sizeof(struct _fpx_sw_bytes));
- err |= __put_user(FP_XSTATE_MAGIC2,
- (__u32 __user *) (buf + sig_xstate_ia32_size
- - FP_XSTATE_MAGIC2_SIZE));
- if (err)
- return -1;
-
- return 1;
-}
-
-int save_i387_xstate_ia32(void __user *buf)
-{
- struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf;
- struct task_struct *tsk = current;
-
- if (!used_math())
- return 0;
-
- if (!access_ok(VERIFY_WRITE, buf, sig_xstate_ia32_size))
- return -EACCES;
- /*
- * This will cause a "finit" to be triggered by the next
- * attempted FPU operation by the 'current' process.
- */
- clear_used_math();
-
- if (!HAVE_HWFP) {
- return fpregs_soft_get(current, NULL,
- 0, sizeof(struct user_i387_ia32_struct),
- NULL, fp) ? -1 : 1;
- }
-
- unlazy_fpu(tsk);
-
- if (cpu_has_xsave)
- return save_i387_xsave(fp);
- if (cpu_has_fxsr)
- return save_i387_fxsave(fp);
- else
- return save_i387_fsave(fp);
-}
-
-static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf)
-{
- struct task_struct *tsk = current;
-
- return __copy_from_user(&tsk->thread.xstate->fsave, buf,
- sizeof(struct i387_fsave_struct));
-}
-
-static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf,
- unsigned int size)
-{
- struct task_struct *tsk = current;
- struct user_i387_ia32_struct env;
- int err;
-
- err = __copy_from_user(&tsk->thread.xstate->fxsave, &buf->_fxsr_env[0],
- size);
- /* mxcsr reserved bits must be masked to zero for security reasons */
- tsk->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
- if (err || __copy_from_user(&env, buf, sizeof(env)))
- return 1;
- convert_to_fxsr(tsk, &env);
-
- return 0;
-}
-
-static int restore_i387_xsave(void __user *buf)
-{
- struct _fpx_sw_bytes fx_sw_user;
- struct _fpstate_ia32 __user *fx_user =
- ((struct _fpstate_ia32 __user *) buf);
- struct i387_fxsave_struct __user *fx =
- (struct i387_fxsave_struct __user *) &fx_user->_fxsr_env[0];
- struct xsave_hdr_struct *xsave_hdr =
- &current->thread.xstate->xsave.xsave_hdr;
- u64 mask;
- int err;
-
- if (check_for_xstate(fx, buf, &fx_sw_user))
- goto fx_only;
-
- mask = fx_sw_user.xstate_bv;
-
- err = restore_i387_fxsave(buf, fx_sw_user.xstate_size);
-
- xsave_hdr->xstate_bv &= pcntxt_mask;
- /*
- * These bits must be zero.
- */
- xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0;
-
- /*
- * Init the state that is not present in the memory layout
- * and enabled by the OS.
- */
- mask = ~(pcntxt_mask & ~mask);
- xsave_hdr->xstate_bv &= mask;
-
- return err;
-fx_only:
- /*
- * Couldn't find the extended state information in the memory
- * layout. Restore the FP/SSE and init the other extended state
- * enabled by the OS.
- */
- xsave_hdr->xstate_bv = XSTATE_FPSSE;
- return restore_i387_fxsave(buf, sizeof(struct i387_fxsave_struct));
-}
-
-int restore_i387_xstate_ia32(void __user *buf)
-{
- int err;
- struct task_struct *tsk = current;
- struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf;
-
- if (HAVE_HWFP)
- clear_fpu(tsk);
-
- if (!buf) {
- if (used_math()) {
- clear_fpu(tsk);
- clear_used_math();
- }
-
- return 0;
- } else
- if (!access_ok(VERIFY_READ, buf, sig_xstate_ia32_size))
- return -EACCES;
-
- if (!used_math()) {
- err = init_fpu(tsk);
- if (err)
- return err;
- }
-
- if (HAVE_HWFP) {
- if (cpu_has_xsave)
- err = restore_i387_xsave(buf);
- else if (cpu_has_fxsr)
- err = restore_i387_fxsave(fp, sizeof(struct
- i387_fxsave_struct));
- else
- err = restore_i387_fsave(fp);
- } else {
- err = fpregs_soft_set(current, NULL,
- 0, sizeof(struct user_i387_ia32_struct),
- NULL, fp) != 0;
- }
- set_used_math();
-
- return err;
-}
-
-/*
- * FPU state for core dumps.
- * This is only used for a.out dumps now.
- * It is declared generically using elf_fpregset_t (which is
- * struct user_i387_struct) but is in fact only used for 32-bit
- * dumps, so on 64-bit it is really struct user_i387_ia32_struct.
- */
-int dump_fpu(struct pt_regs *regs, struct user_i387_struct *fpu)
-{
- struct task_struct *tsk = current;
- int fpvalid;
-
- fpvalid = !!used_math();
- if (fpvalid)
- fpvalid = !fpregs_get(tsk, NULL,
- 0, sizeof(struct user_i387_ia32_struct),
- fpu, NULL);
-
- return fpvalid;
-}
-EXPORT_SYMBOL(dump_fpu);
-
-#endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */
diff --git a/arch/x86/kernel/i8237.c b/arch/x86/kernel/i8237.c
index b42ca694dc68..896d46b44284 100644
--- a/arch/x86/kernel/i8237.c
+++ b/arch/x86/kernel/i8237.c
@@ -1,18 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* 8237A DMA controller suspend functions.
*
* Written by Pierre Ossman, 2005.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or (at
- * your option) any later version.
*/
+#include <linux/dmi.h>
#include <linux/init.h>
-#include <linux/sysdev.h>
+#include <linux/syscore_ops.h>
#include <asm/dma.h>
+#include <asm/x86_init.h>
/*
* This module just handles suspend/resume issues with the
@@ -21,7 +19,7 @@
* in asm/dma.h.
*/
-static int i8237A_resume(struct sys_device *dev)
+static void i8237A_resume(void *data)
{
unsigned long flags;
int i;
@@ -41,31 +39,42 @@ static int i8237A_resume(struct sys_device *dev)
enable_dma(4);
release_dma_lock(flags);
-
- return 0;
-}
-
-static int i8237A_suspend(struct sys_device *dev, pm_message_t state)
-{
- return 0;
}
-static struct sysdev_class i8237_sysdev_class = {
- .name = "i8237",
- .suspend = i8237A_suspend,
+static const struct syscore_ops i8237_syscore_ops = {
.resume = i8237A_resume,
};
-static struct sys_device device_i8237A = {
- .id = 0,
- .cls = &i8237_sysdev_class,
+static struct syscore i8237_syscore = {
+ .ops = &i8237_syscore_ops,
};
-static int __init i8237A_init_sysfs(void)
+static int __init i8237A_init_ops(void)
{
- int error = sysdev_class_register(&i8237_sysdev_class);
- if (!error)
- error = sysdev_register(&device_i8237A);
- return error;
+ /*
+ * From the SKL PCH onwards, the legacy DMA device is removed, and the
+ * I/O ports (81h-83h, 87h, 89h-8Bh, 8Fh) related to it are removed
+ * as well. All removed ports must return 0xff for an inb() request.
+ *
+ * Note: DMA_PAGE_2 (port 0x81) should not be checked for detecting
+ * the presence of the DMA device, since it may be used by the BIOS to
+ * decode LPC traffic for POST codes. Original LPC only decodes one byte
+ * at port 0x80, but some BIOSes may choose to enhance PCH LPC port 0x8x
+ * decoding.
+ */
+ if (dma_inb(DMA_PAGE_0) == 0xFF)
+ return -ENODEV;
+
+ /*
+ * It is not required to load this driver, as newer SoCs may not
+ * support 8237 DMA or bus mastering from LPC. Platform firmware
+ * must announce support for such legacy devices via the
+ * ACPI_FADT_LEGACY_DEVICES field in the FADT table.
+ */
+ if (x86_pnpbios_disabled() && dmi_get_bios_year() >= 2017)
+ return -ENODEV;
+
+ register_syscore(&i8237_syscore);
+ return 0;
}
-device_initcall(i8237A_init_sysfs);
+device_initcall(i8237A_init_ops);
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 5cf36c053ac4..cb9852ad6098 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -1,30 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* 8253/PIT functions
*
*/
#include <linux/clockchips.h>
-#include <linux/interrupt.h>
-#include <linux/spinlock.h>
-#include <linux/jiffies.h>
-#include <linux/module.h>
-#include <linux/timex.h>
-#include <linux/delay.h>
#include <linux/init.h>
-#include <linux/io.h>
+#include <linux/timex.h>
+#include <linux/i8253.h>
-#include <asm/i8253.h>
+#include <asm/hypervisor.h>
+#include <asm/apic.h>
#include <asm/hpet.h>
+#include <asm/time.h>
#include <asm/smp.h>
-DEFINE_SPINLOCK(i8253_lock);
-EXPORT_SYMBOL(i8253_lock);
-
-#ifdef CONFIG_X86_32
-static void pit_disable_clocksource(void);
-#else
-static inline void pit_disable_clocksource(void) { }
-#endif
-
/*
* HPET replaces the PIT, when enabled. So we need to know, which of
* the two timers is used
@@ -32,185 +21,41 @@ static inline void pit_disable_clocksource(void) { }
struct clock_event_device *global_clock_event;
/*
- * Initialize the PIT timer.
- *
- * This is also called after resume to bring the PIT into operation again.
- */
-static void init_pit_timer(enum clock_event_mode mode,
- struct clock_event_device *evt)
-{
- spin_lock(&i8253_lock);
-
- switch (mode) {
- case CLOCK_EVT_MODE_PERIODIC:
- /* binary, mode 2, LSB/MSB, ch 0 */
- outb_pit(0x34, PIT_MODE);
- outb_pit(LATCH & 0xff , PIT_CH0); /* LSB */
- outb_pit(LATCH >> 8 , PIT_CH0); /* MSB */
- break;
-
- case CLOCK_EVT_MODE_SHUTDOWN:
- case CLOCK_EVT_MODE_UNUSED:
- if (evt->mode == CLOCK_EVT_MODE_PERIODIC ||
- evt->mode == CLOCK_EVT_MODE_ONESHOT) {
- outb_pit(0x30, PIT_MODE);
- outb_pit(0, PIT_CH0);
- outb_pit(0, PIT_CH0);
- }
- pit_disable_clocksource();
- break;
-
- case CLOCK_EVT_MODE_ONESHOT:
- /* One shot setup */
- pit_disable_clocksource();
- outb_pit(0x38, PIT_MODE);
- break;
-
- case CLOCK_EVT_MODE_RESUME:
- /* Nothing to do here */
- break;
- }
- spin_unlock(&i8253_lock);
-}
-
-/*
- * Program the next event in oneshot mode
+ * Modern chipsets can disable the PIT clock which makes it unusable. It
+ * would be possible to enable the clock but the registers are chipset
+ * specific and not discoverable. Avoid the whack-a-mole game.
*
- * Delta is given in PIT ticks
+ * These platforms have discoverable TSC/CPU frequencies, but this also
+ * requires knowing the local APIC timer frequency, as it is normally
+ * calibrated against the PIT interrupt.
*/
-static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
+static bool __init use_pit(void)
{
- spin_lock(&i8253_lock);
- outb_pit(delta & 0xff , PIT_CH0); /* LSB */
- outb_pit(delta >> 8 , PIT_CH0); /* MSB */
- spin_unlock(&i8253_lock);
+ if (!IS_ENABLED(CONFIG_X86_TSC) || !boot_cpu_has(X86_FEATURE_TSC))
+ return true;
- return 0;
+ /* This also returns true when APIC is disabled */
+ return apic_needs_pit();
}
-/*
- * On UP the PIT can serve all of the possible timer functions. On SMP systems
- * it can be solely used for the global tick.
- *
- * The profiling and update capabilities are switched off once the local apic is
- * registered. This mechanism replaces the previous #ifdef LOCAL_APIC -
- * !using_apic_timer decisions in do_timer_interrupt_hook()
- */
-static struct clock_event_device pit_ce = {
- .name = "pit",
- .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
- .set_mode = init_pit_timer,
- .set_next_event = pit_next_event,
- .shift = 32,
- .irq = 0,
-};
-
-/*
- * Initialize the conversion factor and the min/max deltas of the clock event
- * structure and register the clock event source with the framework.
- */
-void __init setup_pit_timer(void)
+bool __init pit_timer_init(void)
{
- /*
- * Start pit with the boot cpu mask and make it global after the
- * IO_APIC has been initialized.
- */
- pit_ce.cpumask = cpumask_of(smp_processor_id());
- pit_ce.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, pit_ce.shift);
- pit_ce.max_delta_ns = clockevent_delta2ns(0x7FFF, &pit_ce);
- pit_ce.min_delta_ns = clockevent_delta2ns(0xF, &pit_ce);
-
- clockevents_register_device(&pit_ce);
- global_clock_event = &pit_ce;
-}
-
-#ifndef CONFIG_X86_64
-/*
- * Since the PIT overflows every tick, its not very useful
- * to just read by itself. So use jiffies to emulate a free
- * running counter:
- */
-static cycle_t pit_read(struct clocksource *cs)
-{
- static int old_count;
- static u32 old_jifs;
- unsigned long flags;
- int count;
- u32 jifs;
-
- spin_lock_irqsave(&i8253_lock, flags);
- /*
- * Although our caller may have the read side of xtime_lock,
- * this is now a seqlock, and we are cheating in this routine
- * by having side effects on state that we cannot undo if
- * there is a collision on the seqlock and our caller has to
- * retry. (Namely, old_jifs and old_count.) So we must treat
- * jiffies as volatile despite the lock. We read jiffies
- * before latching the timer count to guarantee that although
- * the jiffies value might be older than the count (that is,
- * the counter may underflow between the last point where
- * jiffies was incremented and the point where we latch the
- * count), it cannot be newer.
- */
- jifs = jiffies;
- outb_pit(0x00, PIT_MODE); /* latch the count ASAP */
- count = inb_pit(PIT_CH0); /* read the latched count */
- count |= inb_pit(PIT_CH0) << 8;
-
- /* VIA686a test code... reset the latch if count > max + 1 */
- if (count > LATCH) {
- outb_pit(0x34, PIT_MODE);
- outb_pit(LATCH & 0xff, PIT_CH0);
- outb_pit(LATCH >> 8, PIT_CH0);
- count = LATCH - 1;
- }
-
- /*
- * It's possible for count to appear to go the wrong way for a
- * couple of reasons:
- *
- * 1. The timer counter underflows, but we haven't handled the
- * resulting interrupt and incremented jiffies yet.
- * 2. Hardware problem with the timer, not giving us continuous time,
- * the counter does small "jumps" upwards on some Pentium systems,
- * (see c't 95/10 page 335 for Neptun bug.)
- *
- * Previous attempts to handle these cases intelligently were
- * buggy, so we just do the simple thing now.
- */
- if (count > old_count && jifs == old_jifs)
- count = old_count;
-
- old_count = count;
- old_jifs = jifs;
-
- spin_unlock_irqrestore(&i8253_lock, flags);
-
- count = (LATCH - 1) - count;
-
- return (cycle_t)(jifs * LATCH) + count;
-}
-
-static struct clocksource pit_cs = {
- .name = "pit",
- .rating = 110,
- .read = pit_read,
- .mask = CLOCKSOURCE_MASK(32),
- .mult = 0,
- .shift = 20,
-};
-
-static void pit_disable_clocksource(void)
-{
- /*
- * Use mult to check whether it is registered or not
- */
- if (pit_cs.mult) {
- clocksource_unregister(&pit_cs);
- pit_cs.mult = 0;
+ if (!use_pit()) {
+ /*
+ * Don't just ignore the PIT. Ensure it's stopped, because
+ * VMMs otherwise steal CPU time just to pointlessly waggle
+ * the (masked) IRQ.
+ */
+ scoped_guard(irq)
+ clockevent_i8253_disable();
+ return false;
}
+ clockevent_i8253_init(true);
+ global_clock_event = &i8253_clockevent;
+ return true;
}
+#ifndef CONFIG_X86_64
static int __init init_pit_clocksource(void)
{
/*
@@ -221,13 +66,10 @@ static int __init init_pit_clocksource(void)
* - when local APIC timer is active (PIT is switched off)
*/
if (num_possible_cpus() > 1 || is_hpet_enabled() ||
- pit_ce.mode != CLOCK_EVT_MODE_PERIODIC)
+ !clockevent_state_periodic(&i8253_clockevent))
return 0;
- pit_cs.mult = clocksource_hz2mult(CLOCK_TICK_RATE, pit_cs.shift);
-
- return clocksource_register(&pit_cs);
+ return clocksource_i8253_init();
}
arch_initcall(init_pit_clocksource);
-
#endif /* !CONFIG_X86_64 */
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index df89102bef80..f67063df6723 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -1,28 +1,29 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/linkage.h>
#include <linux/errno.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/ioport.h>
#include <linux/interrupt.h>
+#include <linux/irq.h>
#include <linux/timex.h>
-#include <linux/slab.h>
#include <linux/random.h>
#include <linux/init.h>
#include <linux/kernel_stat.h>
-#include <linux/sysdev.h>
+#include <linux/syscore_ops.h>
#include <linux/bitops.h>
#include <linux/acpi.h>
#include <linux/io.h>
#include <linux/delay.h>
+#include <linux/pgtable.h>
-#include <asm/atomic.h>
-#include <asm/system.h>
+#include <linux/atomic.h>
#include <asm/timer.h>
#include <asm/hw_irq.h>
-#include <asm/pgtable.h>
#include <asm/desc.h>
#include <asm/apic.h>
#include <asm/i8259.h>
+#include <asm/io_apic.h>
/*
* This is the 'legacy' 8259A Programmable Interrupt Controller,
@@ -30,18 +31,11 @@
* plus some generic x86 specific things if generic specifics makes
* any sense at all.
*/
+static void init_8259A(int auto_eoi);
+static bool pcat_compat __ro_after_init;
static int i8259A_auto_eoi;
-DEFINE_SPINLOCK(i8259A_lock);
-static void mask_and_ack_8259A(unsigned int);
-
-struct irq_chip i8259A_chip = {
- .name = "XT-PIC",
- .mask = disable_8259A_irq,
- .disable = disable_8259A_irq,
- .unmask = enable_8259A_irq,
- .mask_ack = mask_and_ack_8259A,
-};
+DEFINE_RAW_SPINLOCK(i8259A_lock);
/*
* 8259A PIC functions to handle ISA devices:
@@ -63,57 +57,68 @@ unsigned int cached_irq_mask = 0xffff;
*/
unsigned long io_apic_irqs;
-void disable_8259A_irq(unsigned int irq)
+static void mask_8259A_irq(unsigned int irq)
{
unsigned int mask = 1 << irq;
unsigned long flags;
- spin_lock_irqsave(&i8259A_lock, flags);
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
cached_irq_mask |= mask;
if (irq & 8)
outb(cached_slave_mask, PIC_SLAVE_IMR);
else
outb(cached_master_mask, PIC_MASTER_IMR);
- spin_unlock_irqrestore(&i8259A_lock, flags);
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
}
-void enable_8259A_irq(unsigned int irq)
+static void disable_8259A_irq(struct irq_data *data)
+{
+ mask_8259A_irq(data->irq);
+}
+
+static void unmask_8259A_irq(unsigned int irq)
{
unsigned int mask = ~(1 << irq);
unsigned long flags;
- spin_lock_irqsave(&i8259A_lock, flags);
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
cached_irq_mask &= mask;
if (irq & 8)
outb(cached_slave_mask, PIC_SLAVE_IMR);
else
outb(cached_master_mask, PIC_MASTER_IMR);
- spin_unlock_irqrestore(&i8259A_lock, flags);
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+static void enable_8259A_irq(struct irq_data *data)
+{
+ unmask_8259A_irq(data->irq);
}
-int i8259A_irq_pending(unsigned int irq)
+static int i8259A_irq_pending(unsigned int irq)
{
unsigned int mask = 1<<irq;
unsigned long flags;
int ret;
- spin_lock_irqsave(&i8259A_lock, flags);
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
if (irq < 8)
ret = inb(PIC_MASTER_CMD) & mask;
else
ret = inb(PIC_SLAVE_CMD) & (mask >> 8);
- spin_unlock_irqrestore(&i8259A_lock, flags);
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
return ret;
}
-void make_8259A_irq(unsigned int irq)
+static void make_8259A_irq(unsigned int irq)
{
disable_irq_nosync(irq);
io_apic_irqs &= ~(1<<irq);
- set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
- "XT");
+ irq_set_chip_and_handler(irq, &i8259A_chip, handle_level_irq);
+ irq_set_status_flags(irq, IRQ_LEVEL);
enable_irq(irq);
+ lapic_assign_legacy_vector(irq, true);
}
/*
@@ -145,12 +150,13 @@ static inline int i8259A_irq_real(unsigned int irq)
* first, _then_ send the EOI, and the order of EOI
* to the two 8259s is important!
*/
-static void mask_and_ack_8259A(unsigned int irq)
+static void mask_and_ack_8259A(struct irq_data *data)
{
+ unsigned int irq = data->irq;
unsigned int irqmask = 1 << irq;
unsigned long flags;
- spin_lock_irqsave(&i8259A_lock, flags);
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
/*
* Lightweight spurious IRQ detection. We do not want
* to overdo spurious IRQ handling - it's usually a sign
@@ -183,7 +189,7 @@ handle_real_irq:
outb(cached_master_mask, PIC_MASTER_IMR);
outb(0x60+irq, PIC_MASTER_CMD); /* 'Specific EOI to master */
}
- spin_unlock_irqrestore(&i8259A_lock, flags);
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
return;
spurious_8259A_irq:
@@ -204,7 +210,7 @@ spurious_8259A_irq:
* lets ACK and report it. [once per IRQ]
*/
if (!(spurious_irq_mask & irqmask)) {
- printk(KERN_DEBUG
+ printk_deferred(KERN_DEBUG
"spurious 8259A interrupt: IRQ%d.\n", irq);
spurious_irq_mask |= irqmask;
}
@@ -218,112 +224,145 @@ spurious_8259A_irq:
}
}
+struct irq_chip i8259A_chip = {
+ .name = "XT-PIC",
+ .irq_mask = disable_8259A_irq,
+ .irq_disable = disable_8259A_irq,
+ .irq_unmask = enable_8259A_irq,
+ .irq_mask_ack = mask_and_ack_8259A,
+};
+
static char irq_trigger[2];
-/**
- * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
- */
+/* ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ */
static void restore_ELCR(char *trigger)
{
- outb(trigger[0], 0x4d0);
- outb(trigger[1], 0x4d1);
+ outb(trigger[0], PIC_ELCR1);
+ outb(trigger[1], PIC_ELCR2);
}
static void save_ELCR(char *trigger)
{
/* IRQ 0,1,2,8,13 are marked as reserved */
- trigger[0] = inb(0x4d0) & 0xF8;
- trigger[1] = inb(0x4d1) & 0xDE;
+ trigger[0] = inb(PIC_ELCR1) & 0xF8;
+ trigger[1] = inb(PIC_ELCR2) & 0xDE;
}
-static int i8259A_resume(struct sys_device *dev)
+static void i8259A_resume(void *data)
{
init_8259A(i8259A_auto_eoi);
restore_ELCR(irq_trigger);
- return 0;
}
-static int i8259A_suspend(struct sys_device *dev, pm_message_t state)
+static int i8259A_suspend(void *data)
{
save_ELCR(irq_trigger);
return 0;
}
-static int i8259A_shutdown(struct sys_device *dev)
+static void i8259A_shutdown(void *data)
{
/* Put the i8259A into a quiescent state that
* the kernel initialization code can get it
* out of.
*/
outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
- outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */
- return 0;
+ outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
}
-static struct sysdev_class i8259_sysdev_class = {
- .name = "i8259",
+static const struct syscore_ops i8259_syscore_ops = {
.suspend = i8259A_suspend,
.resume = i8259A_resume,
.shutdown = i8259A_shutdown,
};
-static struct sys_device device_i8259A = {
- .id = 0,
- .cls = &i8259_sysdev_class,
+static struct syscore i8259_syscore = {
+ .ops = &i8259_syscore_ops,
};
-static int __init i8259A_init_sysfs(void)
-{
- int error = sysdev_class_register(&i8259_sysdev_class);
- if (!error)
- error = sysdev_register(&device_i8259A);
- return error;
-}
-
-device_initcall(i8259A_init_sysfs);
-
-void mask_8259A(void)
+static void mask_8259A(void)
{
unsigned long flags;
- spin_lock_irqsave(&i8259A_lock, flags);
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
- spin_unlock_irqrestore(&i8259A_lock, flags);
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
}
-void unmask_8259A(void)
+static void unmask_8259A(void)
{
unsigned long flags;
- spin_lock_irqsave(&i8259A_lock, flags);
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
- spin_unlock_irqrestore(&i8259A_lock, flags);
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+static int probe_8259A(void)
+{
+ unsigned char new_val, probe_val = ~(1 << PIC_CASCADE_IR);
+ unsigned long flags;
+
+ /*
+ * If MADT has the PCAT_COMPAT flag set, then do not bother probing
+ * for the PIC. Some BIOSes leave the PIC uninitialized and probing
+ * fails.
+ *
+ * Right now this causes problems, as quite a lot of code depends on
+ * nr_legacy_irqs() > 0 or has_legacy_pic() == true. This is silly
+ * when the system has an IO/APIC, because then the PIC is not required
+ * at all, except for really old machines where the timer interrupt
+ * must be routed through the PIC. So just pretend that the PIC is
+ * there and let legacy_pic->init() initialize it for nothing.
+ *
+ * Alternatively this could just try to initialize the PIC and
+ * repeat the probe, but for cases where there is no PIC that's
+ * just pointless.
+ */
+ if (pcat_compat)
+ return nr_legacy_irqs();
+
+ /*
+ * Check to see if we have a PIC. Mask all except the cascade and
+ * read back the value we just wrote. If we don't have a PIC, we
+ * will read 0xff as opposed to the value we wrote.
+ */
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
+
+ outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
+ outb(probe_val, PIC_MASTER_IMR);
+ new_val = inb(PIC_MASTER_IMR);
+ if (new_val != probe_val) {
+ printk(KERN_INFO "Using NULL legacy PIC\n");
+ legacy_pic = &null_legacy_pic;
+ }
+
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
+ return nr_legacy_irqs();
}
-void init_8259A(int auto_eoi)
+static void init_8259A(int auto_eoi)
{
unsigned long flags;
i8259A_auto_eoi = auto_eoi;
- spin_lock_irqsave(&i8259A_lock, flags);
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
- outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
/*
* outb_pic - this has to work on a wide range of PC hardware.
*/
outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */
- /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 on x86-64,
- to 0x20-0x27 on i386 */
- outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR);
+ /* ICW2: 8259A-1 IR0-7 mapped to ISA_IRQ_VECTOR(0) */
+ outb_pic(ISA_IRQ_VECTOR(0), PIC_MASTER_IMR);
/* 8259A-1 (the master) has a slave on IR2 */
outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR);
@@ -335,8 +374,8 @@ void init_8259A(int auto_eoi)
outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */
- /* ICW2: 8259A-2 IR0-7 mapped to IRQ8_VECTOR */
- outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR);
+ /* ICW2: 8259A-2 IR0-7 mapped to ISA_IRQ_VECTOR(8) */
+ outb_pic(ISA_IRQ_VECTOR(8), PIC_SLAVE_IMR);
/* 8259A-2 is a slave on master's IR2 */
outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR);
/* (slave's support for AEOI in flat mode is to be investigated) */
@@ -347,14 +386,75 @@ void init_8259A(int auto_eoi)
* In AEOI mode we just have to mask the interrupt
* when acking.
*/
- i8259A_chip.mask_ack = disable_8259A_irq;
+ i8259A_chip.irq_mask_ack = disable_8259A_irq;
else
- i8259A_chip.mask_ack = mask_and_ack_8259A;
+ i8259A_chip.irq_mask_ack = mask_and_ack_8259A;
udelay(100); /* wait for 8259A to initialize */
outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
- spin_unlock_irqrestore(&i8259A_lock, flags);
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+/*
+ * make i8259 a driver so that we can select pic functions at run time. the goal
+ * is to make x86 binary compatible among pc compatible and non-pc compatible
+ * platforms, such as x86 MID.
+ */
+
+static void legacy_pic_noop(void) { };
+static void legacy_pic_uint_noop(unsigned int unused) { };
+static void legacy_pic_int_noop(int unused) { };
+static int legacy_pic_irq_pending_noop(unsigned int irq)
+{
+ return 0;
+}
+static int legacy_pic_probe(void)
+{
+ return 0;
+}
+
+struct legacy_pic null_legacy_pic = {
+ .nr_legacy_irqs = 0,
+ .chip = &dummy_irq_chip,
+ .mask = legacy_pic_uint_noop,
+ .unmask = legacy_pic_uint_noop,
+ .mask_all = legacy_pic_noop,
+ .restore_mask = legacy_pic_noop,
+ .init = legacy_pic_int_noop,
+ .probe = legacy_pic_probe,
+ .irq_pending = legacy_pic_irq_pending_noop,
+ .make_irq = legacy_pic_uint_noop,
+};
+
+static struct legacy_pic default_legacy_pic = {
+ .nr_legacy_irqs = NR_IRQS_LEGACY,
+ .chip = &i8259A_chip,
+ .mask = mask_8259A_irq,
+ .unmask = unmask_8259A_irq,
+ .mask_all = mask_8259A,
+ .restore_mask = unmask_8259A,
+ .init = init_8259A,
+ .probe = probe_8259A,
+ .irq_pending = i8259A_irq_pending,
+ .make_irq = make_8259A_irq,
+};
+
+struct legacy_pic *legacy_pic = &default_legacy_pic;
+EXPORT_SYMBOL(legacy_pic);
+
+static int __init i8259A_init_ops(void)
+{
+ if (legacy_pic == &default_legacy_pic)
+ register_syscore_ops(&i8259_syscore);
+
+ return 0;
+}
+device_initcall(i8259A_init_ops);
+
+void __init legacy_pic_pcat_compat(void)
+{
+ pcat_compat = true;
}
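
Aside (not part of the patch): the legacy_pic/null_legacy_pic switch above is an ops-table pattern -- a probe picks an implementation once and every caller goes through the pointer afterwards. A minimal, self-contained userspace C model of that shape, with purely illustrative names:

#include <stdio.h>

/* Illustrative ops table, loosely modelled on struct legacy_pic. */
struct pic_ops {
    int  nr_irqs;
    void (*mask_all)(void);
    int  (*probe)(void);
};

static void real_mask_all(void) { puts("masking all PIC lines"); }
static int  real_probe(void)    { return 1; }  /* pretend the readback matched */

static void noop_mask_all(void) { }
static int  noop_probe(void)    { return 0; }

static struct pic_ops real_pic = { 16, real_mask_all, real_probe };
static struct pic_ops null_pic = {  0, noop_mask_all, noop_probe };

/* Selected once at probe time; every caller goes through the pointer. */
static struct pic_ops *pic = &real_pic;

int main(void)
{
    if (!pic->probe())      /* readback failed -> no PIC on this platform */
        pic = &null_pic;

    pic->mask_all();        /* callers never care which implementation runs */
    printf("legacy irqs: %d\n", pic->nr_irqs);
    return 0;
}

The kernel version works the same way: probe_8259A() above writes a scratch value to the master IMR, reads it back, and points legacy_pic at null_legacy_pic when the readback fails.
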
diff --git a/arch/x86/kernel/ibt_selftest.S b/arch/x86/kernel/ibt_selftest.S
new file mode 100644
index 000000000000..c43c4ed28a9c
--- /dev/null
+++ b/arch/x86/kernel/ibt_selftest.S
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <linux/objtool.h>
+#include <asm/nospec-branch.h>
+
+SYM_CODE_START(ibt_selftest_noendbr)
+ ANNOTATE_NOENDBR
+ UNWIND_HINT_FUNC
+ /* #CP handler sets %ax to 0 */
+ RET
+SYM_CODE_END(ibt_selftest_noendbr)
+
+SYM_FUNC_START(ibt_selftest)
+ lea ibt_selftest_noendbr(%rip), %rax
+ ANNOTATE_RETPOLINE_SAFE
+ jmp *%rax
+SYM_FUNC_END(ibt_selftest)
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
new file mode 100644
index 000000000000..f445bec516a0
--- /dev/null
+++ b/arch/x86/kernel/idt.c
@@ -0,0 +1,353 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Interrupt descriptor table related code
+ */
+#include <linux/interrupt.h>
+
+#include <asm/cpu_entry_area.h>
+#include <asm/set_memory.h>
+#include <asm/traps.h>
+#include <asm/proto.h>
+#include <asm/desc.h>
+#include <asm/hw_irq.h>
+#include <asm/ia32.h>
+#include <asm/idtentry.h>
+
+#define DPL0 0x0
+#define DPL3 0x3
+
+#define DEFAULT_STACK 0
+
+#define G(_vector, _addr, _ist, _type, _dpl, _segment) \
+ { \
+ .vector = _vector, \
+ .bits.ist = _ist, \
+ .bits.type = _type, \
+ .bits.dpl = _dpl, \
+ .bits.p = 1, \
+ .addr = _addr, \
+ .segment = _segment, \
+ }
+
+/* Interrupt gate */
+#define INTG(_vector, _addr) \
+ G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL0, __KERNEL_CS)
+
+/* System interrupt gate */
+#define SYSG(_vector, _addr) \
+ G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL3, __KERNEL_CS)
+
+#ifdef CONFIG_X86_64
+/*
+ * Interrupt gate with interrupt stack. The _ist index is the index in
+ * the tss.ist[] array, but for the descriptor it needs to start at 1.
+ */
+#define ISTG(_vector, _addr, _ist) \
+ G(_vector, _addr, _ist + 1, GATE_INTERRUPT, DPL0, __KERNEL_CS)
+#else
+#define ISTG(_vector, _addr, _ist) INTG(_vector, _addr)
+#endif
+
+/* Task gate */
+#define TSKG(_vector, _gdt) \
+ G(_vector, NULL, DEFAULT_STACK, GATE_TASK, DPL0, _gdt << 3)
+
+#define IDT_TABLE_SIZE (IDT_ENTRIES * sizeof(gate_desc))
+
+static bool idt_setup_done __initdata;
+
+/*
+ * Early traps running on the DEFAULT_STACK because the other interrupt
+ * stacks work only after cpu_init().
+ */
+static const __initconst struct idt_data early_idts[] = {
+ INTG(X86_TRAP_DB, asm_exc_debug),
+ SYSG(X86_TRAP_BP, asm_exc_int3),
+
+#ifdef CONFIG_X86_32
+ /*
+ * Not possible on 64-bit. See idt_setup_early_pf() for details.
+ */
+ INTG(X86_TRAP_PF, asm_exc_page_fault),
+#endif
+#ifdef CONFIG_INTEL_TDX_GUEST
+ INTG(X86_TRAP_VE, asm_exc_virtualization_exception),
+#endif
+};
+
+/*
+ * The default IDT entries which are set up in trap_init() before
+ * cpu_init() is invoked. Interrupt stacks cannot be used at that point and
+ * the traps which use them are reinitialized with IST after cpu_init() has
+ * set up TSS.
+ */
+static const __initconst struct idt_data def_idts[] = {
+ INTG(X86_TRAP_DE, asm_exc_divide_error),
+ ISTG(X86_TRAP_NMI, asm_exc_nmi, IST_INDEX_NMI),
+ INTG(X86_TRAP_BR, asm_exc_bounds),
+ INTG(X86_TRAP_UD, asm_exc_invalid_op),
+ INTG(X86_TRAP_NM, asm_exc_device_not_available),
+ INTG(X86_TRAP_OLD_MF, asm_exc_coproc_segment_overrun),
+ INTG(X86_TRAP_TS, asm_exc_invalid_tss),
+ INTG(X86_TRAP_NP, asm_exc_segment_not_present),
+ INTG(X86_TRAP_SS, asm_exc_stack_segment),
+ INTG(X86_TRAP_GP, asm_exc_general_protection),
+ INTG(X86_TRAP_SPURIOUS, asm_exc_spurious_interrupt_bug),
+ INTG(X86_TRAP_MF, asm_exc_coprocessor_error),
+ INTG(X86_TRAP_AC, asm_exc_alignment_check),
+ INTG(X86_TRAP_XF, asm_exc_simd_coprocessor_error),
+
+#ifdef CONFIG_X86_32
+ TSKG(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS),
+#else
+ ISTG(X86_TRAP_DF, asm_exc_double_fault, IST_INDEX_DF),
+#endif
+ ISTG(X86_TRAP_DB, asm_exc_debug, IST_INDEX_DB),
+
+#ifdef CONFIG_X86_MCE
+ ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE),
+#endif
+
+#ifdef CONFIG_X86_CET
+ INTG(X86_TRAP_CP, asm_exc_control_protection),
+#endif
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ ISTG(X86_TRAP_VC, asm_exc_vmm_communication, IST_INDEX_VC),
+#endif
+
+ SYSG(X86_TRAP_OF, asm_exc_overflow),
+};
+
+static const struct idt_data ia32_idt[] __initconst = {
+#if defined(CONFIG_IA32_EMULATION)
+ SYSG(IA32_SYSCALL_VECTOR, asm_int80_emulation),
+#elif defined(CONFIG_X86_32)
+ SYSG(IA32_SYSCALL_VECTOR, entry_INT80_32),
+#endif
+};
+
+/*
+ * The APIC and SMP idt entries
+ */
+static const __initconst struct idt_data apic_idts[] = {
+#ifdef CONFIG_SMP
+ INTG(RESCHEDULE_VECTOR, asm_sysvec_reschedule_ipi),
+ INTG(CALL_FUNCTION_VECTOR, asm_sysvec_call_function),
+ INTG(CALL_FUNCTION_SINGLE_VECTOR, asm_sysvec_call_function_single),
+ INTG(REBOOT_VECTOR, asm_sysvec_reboot),
+#endif
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+ INTG(THERMAL_APIC_VECTOR, asm_sysvec_thermal),
+#endif
+
+#ifdef CONFIG_X86_MCE_THRESHOLD
+ INTG(THRESHOLD_APIC_VECTOR, asm_sysvec_threshold),
+#endif
+
+#ifdef CONFIG_X86_MCE_AMD
+ INTG(DEFERRED_ERROR_VECTOR, asm_sysvec_deferred_error),
+#endif
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ INTG(LOCAL_TIMER_VECTOR, asm_sysvec_apic_timer_interrupt),
+ INTG(X86_PLATFORM_IPI_VECTOR, asm_sysvec_x86_platform_ipi),
+# if IS_ENABLED(CONFIG_KVM)
+ INTG(POSTED_INTR_VECTOR, asm_sysvec_kvm_posted_intr_ipi),
+ INTG(POSTED_INTR_WAKEUP_VECTOR, asm_sysvec_kvm_posted_intr_wakeup_ipi),
+ INTG(POSTED_INTR_NESTED_VECTOR, asm_sysvec_kvm_posted_intr_nested_ipi),
+# endif
+# ifdef CONFIG_IRQ_WORK
+ INTG(IRQ_WORK_VECTOR, asm_sysvec_irq_work),
+# endif
+ INTG(SPURIOUS_APIC_VECTOR, asm_sysvec_spurious_apic_interrupt),
+ INTG(ERROR_APIC_VECTOR, asm_sysvec_error_interrupt),
+# ifdef CONFIG_X86_POSTED_MSI
+ INTG(POSTED_MSI_NOTIFICATION_VECTOR, asm_sysvec_posted_msi_notification),
+# endif
+#endif
+};
+
+/* Must be page-aligned because the real IDT is used in the cpu entry area */
+static gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss;
+
+static struct desc_ptr idt_descr __ro_after_init = {
+ .size = IDT_TABLE_SIZE - 1,
+ .address = (unsigned long) idt_table,
+};
+
+void load_current_idt(void)
+{
+ lockdep_assert_irqs_disabled();
+ load_idt(&idt_descr);
+}
+
+#ifdef CONFIG_X86_F00F_BUG
+bool idt_is_f00f_address(unsigned long address)
+{
+ return ((address - idt_descr.address) >> 3) == 6;
+}
+#endif
+
+static __init void
+idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sys)
+{
+ gate_desc desc;
+
+ for (; size > 0; t++, size--) {
+ idt_init_desc(&desc, t);
+ write_idt_entry(idt, t->vector, &desc);
+ if (sys)
+ set_bit(t->vector, system_vectors);
+ }
+}
+
+static __init void set_intr_gate(unsigned int n, const void *addr)
+{
+ struct idt_data data;
+
+ init_idt_data(&data, n, addr);
+
+ idt_setup_from_table(idt_table, &data, 1, false);
+}
+
+/**
+ * idt_setup_early_traps - Initialize the idt table with early traps
+ *
+ * On x86_64 these traps do not use interrupt stacks as they can't work
+ * before cpu_init() is invoked and sets up TSS. The IST variants are
+ * installed after that.
+ */
+void __init idt_setup_early_traps(void)
+{
+ idt_setup_from_table(idt_table, early_idts, ARRAY_SIZE(early_idts),
+ true);
+ load_idt(&idt_descr);
+}
+
+/**
+ * idt_setup_traps - Initialize the idt table with default traps
+ */
+void __init idt_setup_traps(void)
+{
+ idt_setup_from_table(idt_table, def_idts, ARRAY_SIZE(def_idts), true);
+
+ if (ia32_enabled())
+ idt_setup_from_table(idt_table, ia32_idt, ARRAY_SIZE(ia32_idt), true);
+}
+
+#ifdef CONFIG_X86_64
+/*
+ * Early traps running on the DEFAULT_STACK because the other interrupt
+ * stacks work only after cpu_init().
+ */
+static const __initconst struct idt_data early_pf_idts[] = {
+ INTG(X86_TRAP_PF, asm_exc_page_fault),
+};
+
+/**
+ * idt_setup_early_pf - Initialize the idt table with early pagefault handler
+ *
+ * On x86_64 this does not use interrupt stacks as they can't work before
+ * cpu_init() is invoked and sets up TSS. The IST variant is installed
+ * after that.
+ *
+ * Note, that X86_64 cannot install the real #PF handler in
+ * idt_setup_early_traps() because the memory initialization needs the #PF
+ * handler from the early_idt_handler_array to initialize the early page
+ * tables.
+ */
+void __init idt_setup_early_pf(void)
+{
+ idt_setup_from_table(idt_table, early_pf_idts,
+ ARRAY_SIZE(early_pf_idts), true);
+}
+#endif
+
+static void __init idt_map_in_cea(void)
+{
+ /*
+ * Set the IDT descriptor to a fixed read-only location in the cpu
+ * entry area, so that the "sidt" instruction will not leak the
+ * location of the kernel, and to defend the IDT against arbitrary
+ * memory write vulnerabilities.
+ */
+ cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table),
+ PAGE_KERNEL_RO);
+ idt_descr.address = CPU_ENTRY_AREA_RO_IDT;
+}
+
+/**
+ * idt_setup_apic_and_irq_gates - Setup APIC/SMP and normal interrupt gates
+ */
+void __init idt_setup_apic_and_irq_gates(void)
+{
+ int i = FIRST_EXTERNAL_VECTOR;
+ void *entry;
+
+ idt_setup_from_table(idt_table, apic_idts, ARRAY_SIZE(apic_idts), true);
+
+ for_each_clear_bit_from(i, system_vectors, FIRST_SYSTEM_VECTOR) {
+ entry = irq_entries_start + IDT_ALIGN * (i - FIRST_EXTERNAL_VECTOR);
+ set_intr_gate(i, entry);
+ }
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ for_each_clear_bit_from(i, system_vectors, NR_VECTORS) {
+ /*
+ * Don't set the non-assigned system vectors in the
+ * system_vectors bitmap. Otherwise they show up in
+ * /proc/interrupts.
+ */
+ entry = spurious_entries_start + IDT_ALIGN * (i - FIRST_SYSTEM_VECTOR);
+ set_intr_gate(i, entry);
+ }
+#endif
+ /* Map IDT into CPU entry area and reload it. */
+ idt_map_in_cea();
+ load_idt(&idt_descr);
+
+ /* Make the IDT table read only */
+ set_memory_ro((unsigned long)&idt_table, 1);
+
+ idt_setup_done = true;
+}
+
+/**
+ * idt_setup_early_handler - Initializes the idt table with early handlers
+ */
+void __init idt_setup_early_handler(void)
+{
+ int i;
+
+ for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
+ set_intr_gate(i, early_idt_handler_array[i]);
+#ifdef CONFIG_X86_32
+ for ( ; i < NR_VECTORS; i++)
+ set_intr_gate(i, early_ignore_irq);
+#endif
+ load_idt(&idt_descr);
+}
+
+/**
+ * idt_invalidate - Invalidate interrupt descriptor table
+ */
+void idt_invalidate(void)
+{
+ static const struct desc_ptr idt = { .address = 0, .size = 0 };
+
+ load_idt(&idt);
+}
+
+void __init idt_install_sysvec(unsigned int n, const void *function)
+{
+ if (WARN_ON(n < FIRST_SYSTEM_VECTOR))
+ return;
+
+ if (WARN_ON(idt_setup_done))
+ return;
+
+ if (!WARN_ON(test_and_set_bit(n, system_vectors)))
+ set_intr_gate(n, function);
+}
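
Aside (not part of the patch): the G()/INTG()/SYSG()/ISTG() initializers above only fill a struct idt_data; packing that data into the 16-byte hardware gate is done by the kernel's idt_init_desc() helper, which idt_setup_from_table() calls. A rough standalone C sketch of the x86-64 gate layout such packing has to produce (the struct, field and helper names here are illustrative, not the kernel's exact code):

#include <stdint.h>
#include <stdio.h>

/* Illustrative x86-64 interrupt-gate layout: 16 bytes per IDT entry. */
struct gate64 {
    uint16_t offset_low;
    uint16_t segment;
    uint16_t ist  : 3,   /* IST slot, 0 = no stack switch */
             zero : 5,
             type : 5,   /* 0xE = interrupt gate, 0xF = trap gate */
             dpl  : 2,
             p    : 1;
    uint16_t offset_middle;
    uint32_t offset_high;
    uint32_t reserved;
};

static void pack_gate(struct gate64 *g, uint64_t handler,
                      uint16_t segment, unsigned int ist, unsigned int dpl)
{
    g->offset_low    = handler & 0xffff;
    g->offset_middle = (handler >> 16) & 0xffff;
    g->offset_high   = handler >> 32;
    g->segment  = segment;
    g->ist      = ist;
    g->zero     = 0;
    g->type     = 0xE;   /* interrupt gate */
    g->dpl      = dpl;
    g->p        = 1;
    g->reserved = 0;
}

int main(void)
{
    struct gate64 g;

    /* 0x10 stands in for the kernel code segment selector. */
    pack_gate(&g, 0xffffffff81234567ull, 0x10, 0, 0);
    printf("one IDT entry is %zu bytes\n", sizeof(g));
    return 0;
}

The DPL distinction above maps directly onto this layout: INTG() builds a dpl=0 gate, SYSG() a dpl=3 gate so int3/int 0x80 can be raised from user mode, and ISTG() additionally selects a non-zero IST slot.
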
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
deleted file mode 100644
index 270ff83efc11..000000000000
--- a/arch/x86/kernel/init_task.c
+++ /dev/null
@@ -1,43 +0,0 @@
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/init_task.h>
-#include <linux/fs.h>
-#include <linux/mqueue.h>
-
-#include <asm/uaccess.h>
-#include <asm/pgtable.h>
-#include <asm/desc.h>
-
-static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
-static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
-
-/*
- * Initial thread structure.
- *
- * We need to make sure that this is THREAD_SIZE aligned due to the
- * way process stacks are handled. This is done by having a special
- * "init_task" linker map entry..
- */
-union thread_union init_thread_union
- __attribute__((__section__(".data.init_task"))) =
- { INIT_THREAD_INFO(init_task) };
-
-/*
- * Initial task structure.
- *
- * All other task structs will be allocated on slabs in fork.c
- */
-struct task_struct init_task = INIT_TASK(init_task);
-EXPORT_SYMBOL(init_task);
-
-/*
- * per-CPU TSS segments. Threads are completely 'soft' on Linux,
- * no more per-task TSS's. The TSS size is kept cacheline-aligned
- * so they are allowed to end up in the .data.cacheline_aligned
- * section. Since TSS's are completely CPU-local, we want them
- * on exact cacheline boundaries, to eliminate cacheline ping-pong.
- */
-DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
-
diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c
index a979b5bd2fc0..fdb6506ceaaa 100644
--- a/arch/x86/kernel/io_delay.c
+++ b/arch/x86/kernel/io_delay.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* I/O delay strategies for inb_p/outb_p
*
@@ -6,13 +7,28 @@
* outb_p/inb_p API uses.
*/
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/delay.h>
#include <linux/init.h>
#include <linux/dmi.h>
#include <linux/io.h>
-int io_delay_type __read_mostly = CONFIG_DEFAULT_IO_DELAY_TYPE;
+#define IO_DELAY_TYPE_0X80 0
+#define IO_DELAY_TYPE_0XED 1
+#define IO_DELAY_TYPE_UDELAY 2
+#define IO_DELAY_TYPE_NONE 3
+
+#if defined(CONFIG_IO_DELAY_0X80)
+#define DEFAULT_IO_DELAY_TYPE IO_DELAY_TYPE_0X80
+#elif defined(CONFIG_IO_DELAY_0XED)
+#define DEFAULT_IO_DELAY_TYPE IO_DELAY_TYPE_0XED
+#elif defined(CONFIG_IO_DELAY_UDELAY)
+#define DEFAULT_IO_DELAY_TYPE IO_DELAY_TYPE_UDELAY
+#elif defined(CONFIG_IO_DELAY_NONE)
+#define DEFAULT_IO_DELAY_TYPE IO_DELAY_TYPE_NONE
+#endif
+
+int io_delay_type __read_mostly = DEFAULT_IO_DELAY_TYPE;
static int __initdata io_delay_override;
@@ -23,13 +39,13 @@ void native_io_delay(void)
{
switch (io_delay_type) {
default:
- case CONFIG_IO_DELAY_TYPE_0X80:
+ case IO_DELAY_TYPE_0X80:
asm volatile ("outb %al, $0x80");
break;
- case CONFIG_IO_DELAY_TYPE_0XED:
+ case IO_DELAY_TYPE_0XED:
asm volatile ("outb %al, $0xed");
break;
- case CONFIG_IO_DELAY_TYPE_UDELAY:
+ case IO_DELAY_TYPE_UDELAY:
/*
* 2 usecs is an upper-bound for the outb delay but
* note that udelay doesn't have the bus-level
@@ -38,7 +54,8 @@ void native_io_delay(void)
* are shorter until calibrated):
*/
udelay(2);
- case CONFIG_IO_DELAY_TYPE_NONE:
+ break;
+ case IO_DELAY_TYPE_NONE:
break;
}
}
@@ -46,9 +63,9 @@ EXPORT_SYMBOL(native_io_delay);
static int __init dmi_io_delay_0xed_port(const struct dmi_system_id *id)
{
- if (io_delay_type == CONFIG_IO_DELAY_TYPE_0X80) {
+ if (io_delay_type == IO_DELAY_TYPE_0X80) {
pr_notice("%s: using 0xed I/O delay port\n", id->ident);
- io_delay_type = CONFIG_IO_DELAY_TYPE_0XED;
+ io_delay_type = IO_DELAY_TYPE_0XED;
}
return 0;
@@ -58,7 +75,7 @@ static int __init dmi_io_delay_0xed_port(const struct dmi_system_id *id)
* Quirk table for systems that misbehave (lock up, etc.) if port
* 0x80 is used:
*/
-static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = {
+static const struct dmi_system_id io_delay_0xed_port_dmi_table[] __initconst = {
{
.callback = dmi_io_delay_0xed_port,
.ident = "Compaq Presario V6000",
@@ -114,13 +131,13 @@ static int __init io_delay_param(char *s)
return -EINVAL;
if (!strcmp(s, "0x80"))
- io_delay_type = CONFIG_IO_DELAY_TYPE_0X80;
+ io_delay_type = IO_DELAY_TYPE_0X80;
else if (!strcmp(s, "0xed"))
- io_delay_type = CONFIG_IO_DELAY_TYPE_0XED;
+ io_delay_type = IO_DELAY_TYPE_0XED;
else if (!strcmp(s, "udelay"))
- io_delay_type = CONFIG_IO_DELAY_TYPE_UDELAY;
+ io_delay_type = IO_DELAY_TYPE_UDELAY;
else if (!strcmp(s, "none"))
- io_delay_type = CONFIG_IO_DELAY_TYPE_NONE;
+ io_delay_type = IO_DELAY_TYPE_NONE;
else
return -EINVAL;
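
Aside (not part of the patch): io_delay_param() above parses the long-standing io_delay= boot option; assuming the usual early_param("io_delay", io_delay_param) registration (not visible in this hunk), the compile-time default can be overridden on the kernel command line, e.g.:

    io_delay=0x80     port 0x80 write (the traditional default)
    io_delay=0xed     port 0xed write, for machines that misbehave on 0x80
    io_delay=udelay   ~2us busy wait instead of a port access
    io_delay=none     no delay at all
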
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 99c4d308f16b..ff40f09ad911 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -1,47 +1,83 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* This contains the io-permission bitmap code - written by obz, with changes
* by Linus. 32/64 bits code unification by Miguel Botón.
*/
-
-#include <linux/sched.h>
-#include <linux/kernel.h>
#include <linux/capability.h>
-#include <linux/errno.h>
-#include <linux/types.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <linux/bitmap.h>
#include <linux/ioport.h>
-#include <linux/smp.h>
-#include <linux/stddef.h>
+#include <linux/sched.h>
#include <linux/slab.h>
-#include <linux/thread_info.h>
-#include <linux/syscalls.h>
+
+#include <asm/io_bitmap.h>
+#include <asm/desc.h>
#include <asm/syscalls.h>
-/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
-static void set_bitmap(unsigned long *bitmap, unsigned int base,
- unsigned int extent, int new_value)
+#ifdef CONFIG_X86_IOPL_IOPERM
+
+static atomic64_t io_bitmap_sequence;
+
+void io_bitmap_share(struct task_struct *tsk)
{
- unsigned int i;
+ /* Can be NULL when current->thread.iopl_emul == 3 */
+ if (current->thread.io_bitmap) {
+ /*
+ * Take a refcount on current's bitmap. It can be used by
+ * both tasks as long as none of them changes the bitmap.
+ */
+ refcount_inc(&current->thread.io_bitmap->refcnt);
+ tsk->thread.io_bitmap = current->thread.io_bitmap;
+ }
+ set_tsk_thread_flag(tsk, TIF_IO_BITMAP);
+}
- for (i = base; i < base + extent; i++) {
- if (new_value)
- __set_bit(i, bitmap);
- else
- __clear_bit(i, bitmap);
+static void task_update_io_bitmap(void)
+{
+ struct task_struct *tsk = current;
+ struct thread_struct *t = &tsk->thread;
+
+ if (t->iopl_emul == 3 || t->io_bitmap) {
+ /* TSS update is handled on exit to user space */
+ set_tsk_thread_flag(tsk, TIF_IO_BITMAP);
+ } else {
+ clear_tsk_thread_flag(tsk, TIF_IO_BITMAP);
+ /* Invalidate TSS */
+ preempt_disable();
+ tss_update_io_bitmap();
+ preempt_enable();
}
}
+void io_bitmap_exit(struct task_struct *tsk)
+{
+ struct io_bitmap *iobm = tsk->thread.io_bitmap;
+
+ tsk->thread.io_bitmap = NULL;
+ /*
+ * Don't touch the TSS when invoked on a failed fork(). TSS
+ * reflects the state of @current and not the state of @tsk.
+ */
+ if (tsk == current)
+ task_update_io_bitmap();
+ if (iobm && refcount_dec_and_test(&iobm->refcnt))
+ kfree(iobm);
+}
+
/*
- * this changes the io permissions bitmap in the current task.
+ * This changes the io permissions bitmap in the current task.
*/
-asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
+long ksys_ioperm(unsigned long from, unsigned long num, int turn_on)
{
struct thread_struct *t = &current->thread;
- struct tss_struct *tss;
- unsigned int i, max_long, bytes, bytes_updated;
+ unsigned int i, max_long;
+ struct io_bitmap *iobm;
if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
return -EINVAL;
- if (turn_on && !capable(CAP_SYS_RAWIO))
+ if (turn_on && (!capable(CAP_SYS_RAWIO) ||
+ security_locked_down(LOCKDOWN_IOPORT)))
return -EPERM;
/*
@@ -49,95 +85,136 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
* IO bitmap up. ioperm() is much less timing critical than clone(),
* this is why we delay this operation until now:
*/
- if (!t->io_bitmap_ptr) {
- unsigned long *bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
-
- if (!bitmap)
+ iobm = t->io_bitmap;
+ if (!iobm) {
+ /* No point in allocating a bitmap just to clear permissions */
+ if (!turn_on)
+ return 0;
+ iobm = kmalloc(sizeof(*iobm), GFP_KERNEL);
+ if (!iobm)
return -ENOMEM;
- memset(bitmap, 0xff, IO_BITMAP_BYTES);
- t->io_bitmap_ptr = bitmap;
- set_thread_flag(TIF_IO_BITMAP);
+ memset(iobm->bitmap, 0xff, sizeof(iobm->bitmap));
+ refcount_set(&iobm->refcnt, 1);
}
/*
- * do it in the per-thread copy and in the TSS ...
- *
- * Disable preemption via get_cpu() - we must not switch away
- * because the ->io_bitmap_max value must match the bitmap
- * contents:
+ * If the bitmap is not shared, then nothing can take a refcount as
+ * current can obviously not fork at the same time. If it's shared
+ * duplicate it and drop the refcount on the original one.
*/
- tss = &per_cpu(init_tss, get_cpu());
+ if (refcount_read(&iobm->refcnt) > 1) {
+ iobm = kmemdup(iobm, sizeof(*iobm), GFP_KERNEL);
+ if (!iobm)
+ return -ENOMEM;
+ refcount_set(&iobm->refcnt, 1);
+ io_bitmap_exit(current);
+ }
+
+ /*
+ * Store the bitmap pointer (might be the same if the task already
+ * had one). Must be done here so freeing the bitmap when all
+ * permissions are dropped has the pointer set up.
+ */
+ t->io_bitmap = iobm;
+ /* Mark it active for context switching and exit to user mode */
+ set_thread_flag(TIF_IO_BITMAP);
- set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
+ /*
+ * Update the task's bitmap. The update of the TSS bitmap happens on
+ * exit to user mode. So this needs no protection.
+ */
+ if (turn_on)
+ bitmap_clear(iobm->bitmap, from, num);
+ else
+ bitmap_set(iobm->bitmap, from, num);
/*
* Search for a (possibly new) maximum. This is simple and stupid,
* to keep it obviously correct:
*/
- max_long = 0;
- for (i = 0; i < IO_BITMAP_LONGS; i++)
- if (t->io_bitmap_ptr[i] != ~0UL)
+ max_long = UINT_MAX;
+ for (i = 0; i < IO_BITMAP_LONGS; i++) {
+ if (iobm->bitmap[i] != ~0UL)
max_long = i;
+ }
+ /* All permissions dropped? */
+ if (max_long == UINT_MAX) {
+ io_bitmap_exit(current);
+ return 0;
+ }
- bytes = (max_long + 1) * sizeof(unsigned long);
- bytes_updated = max(bytes, t->io_bitmap_max);
-
- t->io_bitmap_max = bytes;
-
- /* Update the TSS: */
- memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated);
+ iobm->max = (max_long + 1) * sizeof(unsigned long);
- put_cpu();
+ /*
+ * Update the sequence number to force a TSS update on return to
+ * user mode.
+ */
+ iobm->sequence = atomic64_inc_return(&io_bitmap_sequence);
return 0;
}
+SYSCALL_DEFINE3(ioperm, unsigned long, from, unsigned long, num, int, turn_on)
+{
+ return ksys_ioperm(from, num, turn_on);
+}
+
/*
- * sys_iopl has to be used when you want to access the IO ports
- * beyond the 0x3ff range: to get the full 65536 ports bitmapped
- * you'd need 8kB of bitmaps/process, which is a bit excessive.
+ * The sys_iopl functionality depends on the level argument, which if
+ * granted for the task is used to enable access to all 65536 I/O ports.
+ *
+ * This does not use the IOPL mechanism provided by the CPU as that would
+ * also allow the user space task to use the CLI/STI instructions.
+ *
+ * Disabling interrupts in a user space task is dangerous as it might lock
+ * up the machine and the semantics vs. syscalls and exceptions is
+ * undefined.
*
- * Here we just change the flags value on the stack: we allow
- * only the super-user to do it. This depends on the stack-layout
- * on system-call entry - see also fork() and the signal handling
- * code.
+ * Setting IOPL to level 0-2 disables I/O permissions. Level 3
+ * enables them.
+ *
+ * IOPL is strictly per thread and inherited on fork.
*/
-static int do_iopl(unsigned int level, struct pt_regs *regs)
+SYSCALL_DEFINE1(iopl, unsigned int, level)
{
- unsigned int old = (regs->flags >> 12) & 3;
+ struct thread_struct *t = &current->thread;
+ unsigned int old;
if (level > 3)
return -EINVAL;
+
+ old = t->iopl_emul;
+
+ /* No point in going further if nothing changes */
+ if (level == old)
+ return 0;
+
/* Trying to gain more privileges? */
if (level > old) {
- if (!capable(CAP_SYS_RAWIO))
+ if (!capable(CAP_SYS_RAWIO) ||
+ security_locked_down(LOCKDOWN_IOPORT))
return -EPERM;
}
- regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12);
+ t->iopl_emul = level;
+ task_update_io_bitmap();
return 0;
}
-#ifdef CONFIG_X86_32
-long sys_iopl(struct pt_regs *regs)
-{
- unsigned int level = regs->bx;
- struct thread_struct *t = &current->thread;
- int rc;
-
- rc = do_iopl(level, regs);
- if (rc < 0)
- goto out;
+#else /* CONFIG_X86_IOPL_IOPERM */
- t->iopl = level << 12;
- set_iopl_mask(t->iopl);
-out:
- return rc;
+long ksys_ioperm(unsigned long from, unsigned long num, int turn_on)
+{
+ return -ENOSYS;
}
-#else
-asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
+SYSCALL_DEFINE3(ioperm, unsigned long, from, unsigned long, num, int, turn_on)
+{
+ return -ENOSYS;
+}
+
+SYSCALL_DEFINE1(iopl, unsigned int, level)
{
- return do_iopl(level, regs);
+ return -ENOSYS;
}
#endif
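
Aside (not part of the patch): the rewritten ioperm()/iopl() above still back the usual userspace interface; only the mechanism changed (per-thread bitmap plus iopl emulation instead of EFLAGS.IOPL). A minimal runnable userspace example; it needs CAP_SYS_RAWIO and will get -EPERM under kernel lockdown, matching the security_locked_down() checks added above:

/* gcc -O2 -o port80 port80.c && sudo ./port80 */
#include <stdio.h>
#include <stdlib.h>
#include <sys/io.h>   /* ioperm(), outb() in x86 glibc */

int main(void)
{
    /* Request access to a single port: 0x80, the POST/debug port. */
    if (ioperm(0x80, 1, 1) < 0) {
        perror("ioperm");   /* EPERM without CAP_SYS_RAWIO or under lockdown */
        return EXIT_FAILURE;
    }

    outb(0x42, 0x80);       /* harmless write, traditionally used as a delay */

    /* Drop the permission; the kernel frees the bitmap once it is empty. */
    ioperm(0x80, 1, 0);
    return 0;
}

iopl(3) is the coarse alternative -- access to all 65536 ports at once -- which the patch now emulates per thread instead of letting user space flip EFLAGS.IOPL (and with it gain CLI/STI).
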
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index b0cdde6932f5..86f4e574de02 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -1,24 +1,45 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Common interrupt code for 32 and 64 bit
*/
#include <linux/cpu.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
+#include <linux/of.h>
#include <linux/seq_file.h>
#include <linux/smp.h>
#include <linux/ftrace.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/irq.h>
+#include <linux/kvm_types.h>
+#include <asm/irq_stack.h>
#include <asm/apic.h>
#include <asm/io_apic.h>
#include <asm/irq.h>
-#include <asm/idle.h>
#include <asm/mce.h>
#include <asm/hw_irq.h>
+#include <asm/desc.h>
+#include <asm/traps.h>
+#include <asm/thermal.h>
+#include <asm/posted_intr.h>
+#include <asm/irq_remapping.h>
+
+#if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_X86_THERMAL_VECTOR)
+#define CREATE_TRACE_POINTS
+#include <asm/trace/irq_vectors.h>
+#endif
-atomic_t irq_err_count;
+DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
+EXPORT_PER_CPU_SYMBOL(irq_stat);
-/* Function pointer for generic interrupt vector handling */
-void (*generic_interrupt_extension)(void) = NULL;
+DEFINE_PER_CPU_CACHE_HOT(u16, __softirq_pending);
+EXPORT_PER_CPU_SYMBOL(__softirq_pending);
+
+DEFINE_PER_CPU_CACHE_HOT(struct irq_stack *, hardirq_stack_ptr);
+
+atomic_t irq_err_count;
/*
* 'what should we do if we get a hw irq event on an illegal vector'.
@@ -38,139 +59,146 @@ void ack_bad_irq(unsigned int irq)
* completely.
* But only ack when the APIC is enabled -AK
*/
- ack_APIC_irq();
+ apic_eoi();
}
#define irq_stats(x) (&per_cpu(irq_stat, x))
/*
- * /proc/interrupts printing:
+ * /proc/interrupts printing for arch specific interrupts
*/
-static int show_other_interrupts(struct seq_file *p, int prec)
+int arch_show_interrupts(struct seq_file *p, int prec)
{
int j;
seq_printf(p, "%*s: ", prec, "NMI");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->__nmi_count);
- seq_printf(p, " Non-maskable interrupts\n");
+ seq_puts(p, " Non-maskable interrupts\n");
#ifdef CONFIG_X86_LOCAL_APIC
seq_printf(p, "%*s: ", prec, "LOC");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
- seq_printf(p, " Local timer interrupts\n");
+ seq_puts(p, " Local timer interrupts\n");
seq_printf(p, "%*s: ", prec, "SPU");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
- seq_printf(p, " Spurious interrupts\n");
- seq_printf(p, "%*s: ", prec, "CNT");
+ seq_puts(p, " Spurious interrupts\n");
+ seq_printf(p, "%*s: ", prec, "PMI");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
- seq_printf(p, " Performance counter interrupts\n");
- seq_printf(p, "%*s: ", prec, "PND");
+ seq_puts(p, " Performance monitoring interrupts\n");
+ seq_printf(p, "%*s: ", prec, "IWI");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
- seq_printf(p, " Performance pending work\n");
-#endif
- if (generic_interrupt_extension) {
+ seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs);
+ seq_puts(p, " IRQ work interrupts\n");
+ seq_printf(p, "%*s: ", prec, "RTR");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count);
+ seq_puts(p, " APIC ICR read retries\n");
+ if (x86_platform_ipi_callback) {
seq_printf(p, "%*s: ", prec, "PLT");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", irq_stats(j)->generic_irqs);
- seq_printf(p, " Platform interrupts\n");
+ seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis);
+ seq_puts(p, " Platform interrupts\n");
}
+#endif
#ifdef CONFIG_SMP
seq_printf(p, "%*s: ", prec, "RES");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count);
- seq_printf(p, " Rescheduling interrupts\n");
+ seq_puts(p, " Rescheduling interrupts\n");
seq_printf(p, "%*s: ", prec, "CAL");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
- seq_printf(p, " Function call interrupts\n");
+ seq_puts(p, " Function call interrupts\n");
seq_printf(p, "%*s: ", prec, "TLB");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
- seq_printf(p, " TLB shootdowns\n");
+ seq_puts(p, " TLB shootdowns\n");
#endif
-#ifdef CONFIG_X86_MCE
+#ifdef CONFIG_X86_THERMAL_VECTOR
seq_printf(p, "%*s: ", prec, "TRM");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
- seq_printf(p, " Thermal event interrupts\n");
-# ifdef CONFIG_X86_MCE_THRESHOLD
+ seq_puts(p, " Thermal event interrupts\n");
+#endif
+#ifdef CONFIG_X86_MCE_THRESHOLD
seq_printf(p, "%*s: ", prec, "THR");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
- seq_printf(p, " Threshold APIC interrupts\n");
-# endif
+ seq_puts(p, " Threshold APIC interrupts\n");
#endif
-#ifdef CONFIG_X86_NEW_MCE
+#ifdef CONFIG_X86_MCE_AMD
+ seq_printf(p, "%*s: ", prec, "DFR");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_deferred_error_count);
+ seq_puts(p, " Deferred Error APIC interrupts\n");
+#endif
+#ifdef CONFIG_X86_MCE
seq_printf(p, "%*s: ", prec, "MCE");
for_each_online_cpu(j)
seq_printf(p, "%10u ", per_cpu(mce_exception_count, j));
- seq_printf(p, " Machine check exceptions\n");
+ seq_puts(p, " Machine check exceptions\n");
seq_printf(p, "%*s: ", prec, "MCP");
for_each_online_cpu(j)
seq_printf(p, "%10u ", per_cpu(mce_poll_count, j));
- seq_printf(p, " Machine check polls\n");
+ seq_puts(p, " Machine check polls\n");
+#endif
+#ifdef CONFIG_X86_HV_CALLBACK_VECTOR
+ if (test_bit(HYPERVISOR_CALLBACK_VECTOR, system_vectors)) {
+ seq_printf(p, "%*s: ", prec, "HYP");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ",
+ irq_stats(j)->irq_hv_callback_count);
+ seq_puts(p, " Hypervisor callback interrupts\n");
+ }
+#endif
+#if IS_ENABLED(CONFIG_HYPERV)
+ if (test_bit(HYPERV_REENLIGHTENMENT_VECTOR, system_vectors)) {
+ seq_printf(p, "%*s: ", prec, "HRE");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ",
+ irq_stats(j)->irq_hv_reenlightenment_count);
+ seq_puts(p, " Hyper-V reenlightenment interrupts\n");
+ }
+ if (test_bit(HYPERV_STIMER0_VECTOR, system_vectors)) {
+ seq_printf(p, "%*s: ", prec, "HVS");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ",
+ irq_stats(j)->hyperv_stimer0_count);
+ seq_puts(p, " Hyper-V stimer0 interrupts\n");
+ }
#endif
seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
#if defined(CONFIG_X86_IO_APIC)
seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
#endif
- return 0;
-}
-
-int show_interrupts(struct seq_file *p, void *v)
-{
- unsigned long flags, any_count = 0;
- int i = *(loff_t *) v, j, prec;
- struct irqaction *action;
- struct irq_desc *desc;
-
- if (i > nr_irqs)
- return 0;
-
- for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
- j *= 10;
-
- if (i == nr_irqs)
- return show_other_interrupts(p, prec);
-
- /* print header */
- if (i == 0) {
- seq_printf(p, "%*s", prec + 8, "");
- for_each_online_cpu(j)
- seq_printf(p, "CPU%-8d", j);
- seq_putc(p, '\n');
- }
-
- desc = irq_to_desc(i);
- if (!desc)
- return 0;
-
- spin_lock_irqsave(&desc->lock, flags);
+#if IS_ENABLED(CONFIG_KVM)
+ seq_printf(p, "%*s: ", prec, "PIN");
for_each_online_cpu(j)
- any_count |= kstat_irqs_cpu(i, j);
- action = desc->action;
- if (!action && !any_count)
- goto out;
+ seq_printf(p, "%10u ", irq_stats(j)->kvm_posted_intr_ipis);
+ seq_puts(p, " Posted-interrupt notification event\n");
- seq_printf(p, "%*d: ", prec, i);
+ seq_printf(p, "%*s: ", prec, "NPI");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
- seq_printf(p, " %8s", desc->chip->name);
- seq_printf(p, "-%-8s", desc->name);
-
- if (action) {
- seq_printf(p, " %s", action->name);
- while ((action = action->next) != NULL)
- seq_printf(p, ", %s", action->name);
- }
+ seq_printf(p, "%10u ",
+ irq_stats(j)->kvm_posted_intr_nested_ipis);
+ seq_puts(p, " Nested posted-interrupt event\n");
- seq_putc(p, '\n');
-out:
- spin_unlock_irqrestore(&desc->lock, flags);
+ seq_printf(p, "%*s: ", prec, "PIW");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ",
+ irq_stats(j)->kvm_posted_intr_wakeup_ipis);
+ seq_puts(p, " Posted-interrupt wakeup event\n");
+#endif
+#ifdef CONFIG_X86_POSTED_MSI
+ seq_printf(p, "%*s: ", prec, "PMN");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ",
+ irq_stats(j)->posted_msi_notification_count);
+ seq_puts(p, " Posted MSI notification event\n");
+#endif
return 0;
}
@@ -185,22 +213,29 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
sum += irq_stats(cpu)->apic_timer_irqs;
sum += irq_stats(cpu)->irq_spurious_count;
sum += irq_stats(cpu)->apic_perf_irqs;
- sum += irq_stats(cpu)->apic_pending_irqs;
+ sum += irq_stats(cpu)->apic_irq_work_irqs;
+ sum += irq_stats(cpu)->icr_read_retry_count;
+ if (x86_platform_ipi_callback)
+ sum += irq_stats(cpu)->x86_platform_ipis;
#endif
- if (generic_interrupt_extension)
- sum += irq_stats(cpu)->generic_irqs;
#ifdef CONFIG_SMP
sum += irq_stats(cpu)->irq_resched_count;
sum += irq_stats(cpu)->irq_call_count;
- sum += irq_stats(cpu)->irq_tlb_count;
#endif
-#ifdef CONFIG_X86_MCE
+#ifdef CONFIG_X86_THERMAL_VECTOR
sum += irq_stats(cpu)->irq_thermal_count;
-# ifdef CONFIG_X86_MCE_THRESHOLD
+#endif
+#ifdef CONFIG_X86_MCE_THRESHOLD
sum += irq_stats(cpu)->irq_threshold_count;
-# endif
#endif
-#ifdef CONFIG_X86_NEW_MCE
+#ifdef CONFIG_X86_HV_CALLBACK_VECTOR
+ sum += irq_stats(cpu)->irq_hv_callback_count;
+#endif
+#if IS_ENABLED(CONFIG_HYPERV)
+ sum += irq_stats(cpu)->irq_hv_reenlightenment_count;
+ sum += irq_stats(cpu)->hyperv_stimer0_count;
+#endif
+#ifdef CONFIG_X86_MCE
sum += per_cpu(mce_exception_count, cpu);
sum += per_cpu(mce_poll_count, cpu);
#endif
@@ -210,67 +245,304 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
u64 arch_irq_stat(void)
{
u64 sum = atomic_read(&irq_err_count);
-
-#ifdef CONFIG_X86_IO_APIC
- sum += atomic_read(&irq_mis_count);
-#endif
return sum;
}
+static __always_inline void handle_irq(struct irq_desc *desc,
+ struct pt_regs *regs)
+{
+ if (IS_ENABLED(CONFIG_X86_64))
+ generic_handle_irq_desc(desc);
+ else
+ __handle_irq(desc, regs);
+}
+
+static struct irq_desc *reevaluate_vector(int vector)
+{
+ struct irq_desc *desc = __this_cpu_read(vector_irq[vector]);
+
+ if (!IS_ERR_OR_NULL(desc))
+ return desc;
+
+ if (desc == VECTOR_UNUSED)
+ pr_emerg_ratelimited("No irq handler for %d.%u\n", smp_processor_id(), vector);
+ else
+ __this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
+ return NULL;
+}
+
+static __always_inline bool call_irq_handler(int vector, struct pt_regs *regs)
+{
+ struct irq_desc *desc = __this_cpu_read(vector_irq[vector]);
+
+ if (likely(!IS_ERR_OR_NULL(desc))) {
+ handle_irq(desc, regs);
+ return true;
+ }
+
+ /*
+ * Reevaluate with vector_lock held to prevent a race against
+ * request_irq() setting up the vector:
+ *
+ * CPU0 CPU1
+ * interrupt is raised in APIC IRR
+ * but not handled
+ * free_irq()
+ * per_cpu(vector_irq, CPU1)[vector] = VECTOR_SHUTDOWN;
+ *
+ * request_irq() common_interrupt()
+ * d = this_cpu_read(vector_irq[vector]);
+ *
+ * per_cpu(vector_irq, CPU1)[vector] = desc;
+ *
+ * if (d == VECTOR_SHUTDOWN)
+ * this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
+ *
+ * This requires that the same vector on the same target CPU is
+ * handed out or that a spurious interrupt hits that CPU/vector.
+ */
+ lock_vector_lock();
+ desc = reevaluate_vector(vector);
+ unlock_vector_lock();
+
+ if (!desc)
+ return false;
+
+ handle_irq(desc, regs);
+ return true;
+}
/*
- * do_IRQ handles all normal device IRQ's (the special
- * SMP cross-CPU interrupts have their own specific
- * handlers).
+ * common_interrupt() handles all normal device IRQ's (the special SMP
+ * cross-CPU interrupts have their own entry points).
*/
-unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
+DEFINE_IDTENTRY_IRQ(common_interrupt)
{
struct pt_regs *old_regs = set_irq_regs(regs);
- /* high bit used in ret_from_ code */
- unsigned vector = ~regs->orig_ax;
- unsigned irq;
+ /* entry code tells RCU that we're not quiescent. Check it. */
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU");
- exit_idle();
- irq_enter();
+ if (unlikely(!call_irq_handler(vector, regs)))
+ apic_eoi();
+
+ set_irq_regs(old_regs);
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+/* Function pointer for generic interrupt vector handling */
+void (*x86_platform_ipi_callback)(void) = NULL;
+/*
+ * Handler for X86_PLATFORM_IPI_VECTOR.
+ */
+DEFINE_IDTENTRY_SYSVEC(sysvec_x86_platform_ipi)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
- irq = __get_cpu_var(vector_irq)[vector];
+ apic_eoi();
+ trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR);
+ inc_irq_stat(x86_platform_ipis);
+ if (x86_platform_ipi_callback)
+ x86_platform_ipi_callback();
+ trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR);
+ set_irq_regs(old_regs);
+}
+#endif
- if (!handle_irq(irq, regs)) {
- ack_APIC_irq();
+#if IS_ENABLED(CONFIG_KVM)
+static void dummy_handler(void) {}
+static void (*kvm_posted_intr_wakeup_handler)(void) = dummy_handler;
- if (printk_ratelimit())
- pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n",
- __func__, smp_processor_id(), vector, irq);
+void kvm_set_posted_intr_wakeup_handler(void (*handler)(void))
+{
+ if (handler)
+ kvm_posted_intr_wakeup_handler = handler;
+ else {
+ kvm_posted_intr_wakeup_handler = dummy_handler;
+ synchronize_rcu();
}
+}
+EXPORT_SYMBOL_FOR_KVM(kvm_set_posted_intr_wakeup_handler);
- irq_exit();
+/*
+ * Handler for POSTED_INTERRUPT_VECTOR.
+ */
+DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_ipi)
+{
+ apic_eoi();
+ inc_irq_stat(kvm_posted_intr_ipis);
+}
- set_irq_regs(old_regs);
- return 1;
+/*
+ * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
+ */
+DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_posted_intr_wakeup_ipi)
+{
+ apic_eoi();
+ inc_irq_stat(kvm_posted_intr_wakeup_ipis);
+ kvm_posted_intr_wakeup_handler();
}
/*
- * Handler for GENERIC_INTERRUPT_VECTOR.
+ * Handler for POSTED_INTERRUPT_NESTED_VECTOR.
*/
-void smp_generic_interrupt(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_nested_ipi)
{
- struct pt_regs *old_regs = set_irq_regs(regs);
+ apic_eoi();
+ inc_irq_stat(kvm_posted_intr_nested_ipis);
+}
+#endif
+
+#ifdef CONFIG_X86_POSTED_MSI
+
+/* Posted Interrupt Descriptors for coalesced MSIs to be posted */
+DEFINE_PER_CPU_ALIGNED(struct pi_desc, posted_msi_pi_desc);
+
+void intel_posted_msi_init(void)
+{
+ u32 destination;
+ u32 apic_id;
+
+ this_cpu_write(posted_msi_pi_desc.nv, POSTED_MSI_NOTIFICATION_VECTOR);
+
+ /*
+ * APIC destination ID is stored in bits 8:15 while in XAPIC mode.
+ * VT-d spec. CH 9.11
+ */
+ apic_id = this_cpu_read(x86_cpu_to_apicid);
+ destination = x2apic_enabled() ? apic_id : apic_id << 8;
+ this_cpu_write(posted_msi_pi_desc.ndst, destination);
+}
+
+static __always_inline bool handle_pending_pir(unsigned long *pir, struct pt_regs *regs)
+{
+ unsigned long pir_copy[NR_PIR_WORDS];
+ int vec = FIRST_EXTERNAL_VECTOR;
+
+ if (!pi_harvest_pir(pir, pir_copy))
+ return false;
+
+ for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR)
+ call_irq_handler(vec, regs);
+
+ return true;
+}
- ack_APIC_irq();
+/*
+ * Performance data shows that 3 is good enough to harvest 90+% of the benefit
+ * on high IRQ rate workload.
+ */
+#define MAX_POSTED_MSI_COALESCING_LOOP 3
- exit_idle();
+/*
+ * For MSIs that are delivered as posted interrupts, the CPU notifications
+ * can be coalesced if the MSIs arrive in high frequency bursts.
+ */
+DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+ struct pi_desc *pid;
+ int i = 0;
+ pid = this_cpu_ptr(&posted_msi_pi_desc);
+
+ inc_irq_stat(posted_msi_notification_count);
irq_enter();
- inc_irq_stat(generic_irqs);
+ /*
+ * Max coalescing count includes the extra round of handle_pending_pir
+ * after clearing the outstanding notification bit. Hence, at most
+ * MAX_POSTED_MSI_COALESCING_LOOP - 1 loops are executed here.
+ */
+ while (++i < MAX_POSTED_MSI_COALESCING_LOOP) {
+ if (!handle_pending_pir(pid->pir, regs))
+ break;
+ }
+
+ /*
+ * Clear outstanding notification bit to allow new IRQ notifications,
+ * do this last to maximize the window of interrupt coalescing.
+ */
+ pi_clear_on(pid);
- if (generic_interrupt_extension)
- generic_interrupt_extension();
+ /*
+ * There could be a race of PI notification and the clearing of ON bit,
+ * process PIR bits one last time so that handling of the new interrupts
+ * is not delayed until the next IRQ.
+ */
+ handle_pending_pir(pid->pir, regs);
+ apic_eoi();
irq_exit();
-
set_irq_regs(old_regs);
}
+#endif /* X86_POSTED_MSI */
+
+#ifdef CONFIG_HOTPLUG_CPU
+/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
+void fixup_irqs(void)
+{
+ unsigned int vector;
+ struct irq_desc *desc;
+ struct irq_data *data;
+ struct irq_chip *chip;
+
+ irq_migrate_all_off_this_cpu();
+
+ /*
+ * We can remove mdelay() and then send spurious interrupts to
+ * new cpu targets for all the irqs that were handled previously by
+ * this cpu. While it works, I have seen spurious interrupt messages
+ * (nothing wrong but still...).
+ *
+ * So for now, retain mdelay(1) and check the IRR and then send those
+ * interrupts to new targets as this cpu is already offlined...
+ */
+ mdelay(1);
+
+ /*
+ * We can walk the vector array of this cpu without holding
+ * vector_lock because the cpu is already marked !online, so
+ * nothing else will touch it.
+ */
+ for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
+ if (IS_ERR_OR_NULL(__this_cpu_read(vector_irq[vector])))
+ continue;
+
+ if (is_vector_pending(vector)) {
+ desc = __this_cpu_read(vector_irq[vector]);
+
+ raw_spin_lock(&desc->lock);
+ data = irq_desc_get_irq_data(desc);
+ chip = irq_data_get_irq_chip(data);
+ if (chip->irq_retrigger) {
+ chip->irq_retrigger(data);
+ __this_cpu_write(vector_irq[vector], VECTOR_RETRIGGERED);
+ }
+ raw_spin_unlock(&desc->lock);
+ }
+ if (__this_cpu_read(vector_irq[vector]) != VECTOR_RETRIGGERED)
+ __this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
+ }
+}
+#endif
-EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
+#ifdef CONFIG_X86_THERMAL_VECTOR
+static void smp_thermal_vector(void)
+{
+ if (x86_thermal_enabled())
+ intel_thermal_interrupt();
+ else
+ pr_err("CPU%d: Unexpected LVT thermal interrupt!\n",
+ smp_processor_id());
+}
+
+DEFINE_IDTENTRY_SYSVEC(sysvec_thermal)
+{
+ trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
+ inc_irq_stat(irq_thermal_count);
+ smp_thermal_vector();
+ trace_thermal_apic_exit(THERMAL_APIC_VECTOR);
+ apic_eoi();
+}
+#endif
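
Aside (not part of the patch): sysvec_posted_msi_notification() bounds its coalescing work -- harvest pending vectors at most MAX_POSTED_MSI_COALESCING_LOOP - 1 times, clear the outstanding-notification bit, then harvest once more to close the race with a late posting. A small standalone C model of that shape (the atomics stand in for the PIR bitmap and the ON bit; all names are illustrative):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define MAX_LOOPS 3   /* mirrors MAX_POSTED_MSI_COALESCING_LOOP */

static _Atomic unsigned long pir;   /* stands in for the posted-interrupt bitmap */
static atomic_bool on;              /* stands in for the outstanding-notification bit */

static bool handle_pending(void)
{
    /* Grab-and-clear the pending bits atomically, like pi_harvest_pir(). */
    unsigned long pending = atomic_exchange(&pir, 0);

    if (!pending)
        return false;
    for (int vec = 0; vec < 64; vec++)
        if (pending & (1UL << vec))
            printf("dispatch vector %d\n", vec);
    return true;
}

static void notification_handler(void)
{
    int i = 0;

    /* At most MAX_LOOPS - 1 rounds while notifications stay suppressed. */
    while (++i < MAX_LOOPS)
        if (!handle_pending())
            break;

    /* Re-allow notifications as late as possible to maximize coalescing... */
    atomic_store(&on, false);
    /* ...then drain once more to catch bits posted during that window. */
    handle_pending();
}

int main(void)
{
    atomic_store(&on, true);
    atomic_fetch_or(&pir, (1UL << 33) | (1UL << 40));
    notification_handler();
    return 0;
}
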
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 3b09634a5153..c7a5d2960d57 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
*
@@ -8,32 +9,29 @@
* io_apic.c.)
*/
-#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/interrupt.h>
+#include <linux/irq.h>
#include <linux/kernel_stat.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/delay.h>
#include <linux/uaccess.h>
#include <linux/percpu.h>
+#include <linux/mm.h>
#include <asm/apic.h>
+#include <asm/nospec-branch.h>
+#include <asm/softirq_stack.h>
-DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
-EXPORT_PER_CPU_SYMBOL(irq_stat);
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
-DEFINE_PER_CPU(struct pt_regs *, irq_regs);
-EXPORT_PER_CPU_SYMBOL(irq_regs);
+int sysctl_panic_on_stackoverflow __read_mostly;
-#ifdef CONFIG_DEBUG_STACKOVERFLOW
/* Debugging check for stack overflow: is there less than 1KB free? */
-static int check_stack_overflow(void)
+static bool check_stack_overflow(void)
{
- long sp;
-
- __asm__ __volatile__("andl %%esp,%0" :
- "=r" (sp) : "0" (THREAD_SIZE - 1));
+ unsigned long sp = current_stack_pointer & (THREAD_SIZE - 1);
return sp < (sizeof(struct thread_info) + STACK_WARN);
}
@@ -42,47 +40,39 @@ static void print_stack_overflow(void)
{
printk(KERN_WARNING "low stack detected by irq handler\n");
dump_stack();
+ if (sysctl_panic_on_stackoverflow)
+ panic("low stack detected by irq handler - check messages\n");
}
#else
-static inline int check_stack_overflow(void) { return 0; }
+static inline bool check_stack_overflow(void) { return false; }
static inline void print_stack_overflow(void) { }
#endif
-#ifdef CONFIG_4KSTACKS
-/*
- * per-CPU IRQ handling contexts (thread information and stack)
- */
-union irq_ctx {
- struct thread_info tinfo;
- u32 stack[THREAD_SIZE/sizeof(u32)];
-} __attribute__((aligned(PAGE_SIZE)));
-
-static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx);
-static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx);
-
-static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, hardirq_stack);
-static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, softirq_stack);
+DEFINE_PER_CPU_CACHE_HOT(struct irq_stack *, softirq_stack_ptr);
static void call_on_stack(void *func, void *stack)
{
- asm volatile("xchgl %%ebx,%%esp \n"
- "call *%%edi \n"
- "movl %%ebx,%%esp \n"
- : "=b" (stack)
- : "0" (stack),
- "D"(func)
+ asm volatile("xchgl %[sp], %%esp\n"
+ CALL_NOSPEC
+ "movl %[sp], %%esp"
+ : [sp] "+b" (stack)
+ : [thunk_target] "D" (func)
: "memory", "cc", "edx", "ecx", "eax");
}
-static inline int
-execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
+static inline void *current_stack(void)
+{
+ return (void *)(current_stack_pointer & ~(THREAD_SIZE - 1));
+}
+
+static inline bool execute_on_irq_stack(bool overflow, struct irq_desc *desc)
{
- union irq_ctx *curctx, *irqctx;
- u32 *isp, arg1, arg2;
+ struct irq_stack *curstk, *irqstk;
+ u32 *isp, *prev_esp;
- curctx = (union irq_ctx *) current_thread_info();
- irqctx = __get_cpu_var(hardirq_ctx);
+ curstk = (struct irq_stack *) current_stack();
+ irqstk = __this_cpu_read(hardirq_stack_ptr);
/*
* this is where we switch to the IRQ stack. However, if we are
@@ -90,170 +80,78 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
* handler) we can't do that and just have to keep using the
* current stack (which is the irq stack already after all)
*/
- if (unlikely(curctx == irqctx))
- return 0;
+ if (unlikely(curstk == irqstk))
+ return false;
- /* build the stack frame on the IRQ stack */
- isp = (u32 *) ((char *)irqctx + sizeof(*irqctx));
- irqctx->tinfo.task = curctx->tinfo.task;
- irqctx->tinfo.previous_esp = current_stack_pointer;
+ isp = (u32 *) ((char *)irqstk + sizeof(*irqstk));
- /*
- * Copy the softirq bits in preempt_count so that the
- * softirq checks work in the hardirq context.
- */
- irqctx->tinfo.preempt_count =
- (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
- (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
+ /* Save the next esp at the bottom of the stack */
+ prev_esp = (u32 *)irqstk;
+ *prev_esp = current_stack_pointer;
if (unlikely(overflow))
call_on_stack(print_stack_overflow, isp);
- asm volatile("xchgl %%ebx,%%esp \n"
- "call *%%edi \n"
- "movl %%ebx,%%esp \n"
- : "=a" (arg1), "=d" (arg2), "=b" (isp)
- : "0" (irq), "1" (desc), "2" (isp),
- "D" (desc->handle_irq)
- : "memory", "cc", "ecx");
- return 1;
+ asm volatile("xchgl %[sp], %%esp\n"
+ CALL_NOSPEC
+ "movl %[sp], %%esp"
+ : "+a" (desc), [sp] "+b" (isp)
+ : [thunk_target] "D" (desc->handle_irq)
+ : "memory", "cc", "edx", "ecx");
+ return true;
}
/*
- * allocate per-cpu stacks for hardirq and for softirq processing
+ * Allocate per-cpu stacks for hardirq and softirq processing
*/
-void __cpuinit irq_ctx_init(int cpu)
+int irq_init_percpu_irqstack(unsigned int cpu)
{
- union irq_ctx *irqctx;
-
- if (per_cpu(hardirq_ctx, cpu))
- return;
-
- irqctx = &per_cpu(hardirq_stack, cpu);
- irqctx->tinfo.task = NULL;
- irqctx->tinfo.exec_domain = NULL;
- irqctx->tinfo.cpu = cpu;
- irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
- irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
-
- per_cpu(hardirq_ctx, cpu) = irqctx;
+ int node = cpu_to_node(cpu);
+ struct page *ph, *ps;
- irqctx = &per_cpu(softirq_stack, cpu);
- irqctx->tinfo.task = NULL;
- irqctx->tinfo.exec_domain = NULL;
- irqctx->tinfo.cpu = cpu;
- irqctx->tinfo.preempt_count = 0;
- irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
-
- per_cpu(softirq_ctx, cpu) = irqctx;
+ if (per_cpu(hardirq_stack_ptr, cpu))
+ return 0;
- printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
- cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu));
-}
+ ph = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER);
+ if (!ph)
+ return -ENOMEM;
+ ps = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER);
+ if (!ps) {
+ __free_pages(ph, THREAD_SIZE_ORDER);
+ return -ENOMEM;
+ }
-void irq_ctx_exit(int cpu)
-{
- per_cpu(hardirq_ctx, cpu) = NULL;
+ per_cpu(hardirq_stack_ptr, cpu) = page_address(ph);
+ per_cpu(softirq_stack_ptr, cpu) = page_address(ps);
+ return 0;
}
-asmlinkage void do_softirq(void)
+#ifdef CONFIG_SOFTIRQ_ON_OWN_STACK
+void do_softirq_own_stack(void)
{
- unsigned long flags;
- struct thread_info *curctx;
- union irq_ctx *irqctx;
- u32 *isp;
+ struct irq_stack *irqstk;
+ u32 *isp, *prev_esp;
- if (in_interrupt())
- return;
+ irqstk = __this_cpu_read(softirq_stack_ptr);
- local_irq_save(flags);
+ /* build the stack frame on the softirq stack */
+ isp = (u32 *) ((char *)irqstk + sizeof(*irqstk));
- if (local_softirq_pending()) {
- curctx = current_thread_info();
- irqctx = __get_cpu_var(softirq_ctx);
- irqctx->tinfo.task = curctx->task;
- irqctx->tinfo.previous_esp = current_stack_pointer;
+ /* Push the previous esp onto the stack */
+ prev_esp = (u32 *)irqstk;
+ *prev_esp = current_stack_pointer;
- /* build the stack frame on the softirq stack */
- isp = (u32 *) ((char *)irqctx + sizeof(*irqctx));
-
- call_on_stack(__do_softirq, isp);
- /*
- * Shouldnt happen, we returned above if in_interrupt():
- */
- WARN_ON_ONCE(softirq_count());
- }
-
- local_irq_restore(flags);
+ call_on_stack(__do_softirq, isp);
}
-
-#else
-static inline int
-execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; }
#endif
-bool handle_irq(unsigned irq, struct pt_regs *regs)
+void __handle_irq(struct irq_desc *desc, struct pt_regs *regs)
{
- struct irq_desc *desc;
- int overflow;
-
- overflow = check_stack_overflow();
+ bool overflow = check_stack_overflow();
- desc = irq_to_desc(irq);
- if (unlikely(!desc))
- return false;
-
- if (!execute_on_irq_stack(overflow, desc, irq)) {
+ if (user_mode(regs) || !execute_on_irq_stack(overflow, desc)) {
if (unlikely(overflow))
print_stack_overflow();
- desc->handle_irq(irq, desc);
+ generic_handle_irq_desc(desc);
}
-
- return true;
}
-
-#ifdef CONFIG_HOTPLUG_CPU
-
-/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
-void fixup_irqs(void)
-{
- unsigned int irq;
- static int warned;
- struct irq_desc *desc;
-
- for_each_irq_desc(irq, desc) {
- const struct cpumask *affinity;
-
- if (!desc)
- continue;
- if (irq == 2)
- continue;
-
- affinity = desc->affinity;
- if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
- printk("Breaking affinity for irq %i\n", irq);
- affinity = cpu_all_mask;
- }
- if (desc->chip->set_affinity)
- desc->chip->set_affinity(irq, affinity);
- else if (desc->action && !(warned++))
- printk("Cannot set affinity for irq %i\n", irq);
- }
-
-#if 0
- barrier();
- /* Ingo Molnar says: "after the IO-APIC masks have been redirected
- [note the nop - the interrupt-enable boundary on x86 is two
- instructions from sti] - to flush out pending hardirqs and
- IPIs. After this point nothing is supposed to reach this CPU." */
- __asm__ __volatile__("sti; nop; cli");
- barrier();
-#else
- /* That doesn't seem sufficient. Give it 1ms. */
- local_irq_enable();
- mdelay(1);
- local_irq_disable();
-#endif
-}
-#endif
-
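
Aside (not part of the patch): check_stack_overflow() above works because 32-bit kernel stacks are THREAD_SIZE-aligned, so masking the stack pointer with THREAD_SIZE - 1 directly gives the number of bytes still free below the current frame. A tiny standalone illustration of that arithmetic (the constants and the sample pointer are made up):

#include <stdio.h>

#define THREAD_SIZE (8 * 1024)          /* 32-bit kernels: two 4K pages */
#define STACK_WARN  (THREAD_SIZE / 8)   /* warn when roughly 1KB or less remains */

int main(void)
{
    /* Pretend this is the stack pointer somewhere inside an aligned stack. */
    unsigned long sp = 0xc1572f40UL;

    unsigned long free_below = sp & (THREAD_SIZE - 1);

    printf("bytes left on this stack: %lu\n", free_below);
    if (free_below < STACK_WARN)
        puts("low stack detected");
    return 0;
}
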
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 977d8b43a0dd..ca78dce39361 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
*
@@ -10,133 +11,67 @@
#include <linux/kernel_stat.h>
#include <linux/interrupt.h>
+#include <linux/irq.h>
#include <linux/seq_file.h>
-#include <linux/module.h>
#include <linux/delay.h>
#include <linux/ftrace.h>
#include <linux/uaccess.h>
#include <linux/smp.h>
+#include <linux/sched/task_stack.h>
+#include <linux/vmalloc.h>
+
+#include <asm/cpu_entry_area.h>
+#include <asm/softirq_stack.h>
+#include <asm/irq_stack.h>
#include <asm/io_apic.h>
-#include <asm/idle.h>
#include <asm/apic.h>
-DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
-EXPORT_PER_CPU_SYMBOL(irq_stat);
-
-DEFINE_PER_CPU(struct pt_regs *, irq_regs);
-EXPORT_PER_CPU_SYMBOL(irq_regs);
+DEFINE_PER_CPU_CACHE_HOT(bool, hardirq_stack_inuse);
+DEFINE_PER_CPU_PAGE_ALIGNED(struct irq_stack, irq_stack_backing_store) __visible;
+#ifdef CONFIG_VMAP_STACK
/*
- * Probabilistic stack overflow check:
- *
- * Only check the stack in process context, because everything else
- * runs on the big interrupt stacks. Checking reliably is too expensive,
- * so we just check from interrupts.
+ * VMAP the backing store with guard pages
*/
-static inline void stack_overflow_check(struct pt_regs *regs)
+static int map_irq_stack(unsigned int cpu)
{
-#ifdef CONFIG_DEBUG_STACKOVERFLOW
- u64 curbase = (u64)task_stack_page(current);
-
- WARN_ONCE(regs->sp >= curbase &&
- regs->sp <= curbase + THREAD_SIZE &&
- regs->sp < curbase + sizeof(struct thread_info) +
- sizeof(struct pt_regs) + 128,
-
- "do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n",
- current->comm, curbase, regs->sp);
-#endif
-}
+ char *stack = (char *)per_cpu_ptr(&irq_stack_backing_store, cpu);
+ struct page *pages[IRQ_STACK_SIZE / PAGE_SIZE];
+ void *va;
+ int i;
-bool handle_irq(unsigned irq, struct pt_regs *regs)
-{
- struct irq_desc *desc;
+ for (i = 0; i < IRQ_STACK_SIZE / PAGE_SIZE; i++) {
+ phys_addr_t pa = per_cpu_ptr_to_phys(stack + (i << PAGE_SHIFT));
- stack_overflow_check(regs);
+ pages[i] = pfn_to_page(pa >> PAGE_SHIFT);
+ }
- desc = irq_to_desc(irq);
- if (unlikely(!desc))
- return false;
+ va = vmap(pages, IRQ_STACK_SIZE / PAGE_SIZE, VM_MAP, PAGE_KERNEL);
+ if (!va)
+ return -ENOMEM;
- generic_handle_irq_desc(irq, desc);
- return true;
+ /* Store actual TOS to avoid adjustment in the hotpath */
+ per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8;
+ return 0;
}
-
-#ifdef CONFIG_HOTPLUG_CPU
-/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
-void fixup_irqs(void)
+#else
+/*
+ * If VMAP stacks are disabled due to KASAN, just use the per cpu
+ * backing store without guard pages.
+ */
+static int map_irq_stack(unsigned int cpu)
{
- unsigned int irq;
- static int warned;
- struct irq_desc *desc;
-
- for_each_irq_desc(irq, desc) {
- int break_affinity = 0;
- int set_affinity = 1;
- const struct cpumask *affinity;
-
- if (!desc)
- continue;
- if (irq == 2)
- continue;
+ void *va = per_cpu_ptr(&irq_stack_backing_store, cpu);
- /* interrupt's are disabled at this point */
- spin_lock(&desc->lock);
-
- affinity = desc->affinity;
- if (!irq_has_action(irq) ||
- cpumask_equal(affinity, cpu_online_mask)) {
- spin_unlock(&desc->lock);
- continue;
- }
-
- if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
- break_affinity = 1;
- affinity = cpu_all_mask;
- }
-
- if (desc->chip->mask)
- desc->chip->mask(irq);
-
- if (desc->chip->set_affinity)
- desc->chip->set_affinity(irq, affinity);
- else if (!(warned++))
- set_affinity = 0;
-
- if (desc->chip->unmask)
- desc->chip->unmask(irq);
-
- spin_unlock(&desc->lock);
-
- if (break_affinity && set_affinity)
- printk("Broke affinity for irq %i\n", irq);
- else if (!set_affinity)
- printk("Cannot set affinity for irq %i\n", irq);
- }
-
- /* That doesn't seem sufficient. Give it 1ms. */
- local_irq_enable();
- mdelay(1);
- local_irq_disable();
+ /* Store actual TOS to avoid adjustment in the hotpath */
+ per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8;
+ return 0;
}
#endif
-extern void call_softirq(void);
-
-asmlinkage void do_softirq(void)
+int irq_init_percpu_irqstack(unsigned int cpu)
{
- __u32 pending;
- unsigned long flags;
-
- if (in_interrupt())
- return;
-
- local_irq_save(flags);
- pending = local_softirq_pending();
- /* Switch to interrupt stack */
- if (pending) {
- call_softirq();
- WARN_ON_ONCE(softirq_count());
- }
- local_irq_restore(flags);
+ if (per_cpu(hardirq_stack_ptr, cpu))
+ return 0;
+ return map_irq_stack(cpu);
}
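
Aside (not part of the patch): in the VMAP_STACK case map_irq_stack() gets guard pages for free, since vmap() leaves unmapped holes around each mapping, so an IRQ-stack overflow faults instead of silently corrupting the adjacent per-cpu data. A userspace analogue of the same idea using mmap() plus a PROT_NONE guard page (purely illustrative, not kernel code):

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
    long page = sysconf(_SC_PAGESIZE);
    size_t stack_sz = 4 * (size_t)page;

    /* One extra page below the stack acts as the guard. */
    char *region = mmap(NULL, stack_sz + page, PROT_READ | PROT_WRITE,
                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (region == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    if (mprotect(region, page, PROT_NONE) < 0) {   /* guard page */
        perror("mprotect");
        return 1;
    }

    printf("usable stack: %p..%p, guard page below\n",
           (void *)(region + page), (void *)(region + page + stack_sz));

    /* Writing to region[0] here would SIGSEGV instead of corrupting memory. */
    munmap(region, stack_sz + page);
    return 0;
}
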
diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c
new file mode 100644
index 000000000000..b0a24deab4a1
--- /dev/null
+++ b/arch/x86/kernel/irq_work.c
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * x86 specific code for irq_work
+ *
+ * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra
+ */
+
+#include <linux/kernel.h>
+#include <linux/irq_work.h>
+#include <linux/hardirq.h>
+#include <asm/apic.h>
+#include <asm/idtentry.h>
+#include <asm/trace/irq_vectors.h>
+#include <linux/interrupt.h>
+
+#ifdef CONFIG_X86_LOCAL_APIC
+DEFINE_IDTENTRY_SYSVEC(sysvec_irq_work)
+{
+ apic_eoi();
+ trace_irq_work_entry(IRQ_WORK_VECTOR);
+ inc_irq_stat(apic_irq_work_irqs);
+ irq_work_run();
+ trace_irq_work_exit(IRQ_WORK_VECTOR);
+}
+
+void arch_irq_work_raise(void)
+{
+ if (!arch_irq_work_has_interrupt())
+ return;
+
+ __apic_send_IPI_self(IRQ_WORK_VECTOR);
+ apic_wait_icr_idle();
+}
+#endif
diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S
new file mode 100644
index 000000000000..fdabd5dda154
--- /dev/null
+++ b/arch/x86/kernel/irqflags.S
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <asm/asm.h>
+#include <linux/export.h>
+#include <linux/linkage.h>
+
+/*
+ * unsigned long native_save_fl(void)
+ */
+.pushsection .noinstr.text, "ax"
+SYM_FUNC_START(native_save_fl)
+ ENDBR
+ pushf
+ pop %_ASM_AX
+ RET
+SYM_FUNC_END(native_save_fl)
+.popsection
+EXPORT_SYMBOL(native_save_fl)
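
native_save_fl() is nothing more than pushing the FLAGS register and popping it into the return register. The same pushf/pop sequence is legal from user space too; a minimal illustrative program (assuming x86-64 and GCC-style inline asm, not kernel code) looks like this:

#include <stdio.h>

static unsigned long save_fl(void)
{
	unsigned long flags;

	/* Same idea as native_save_fl(): pushf, then pop into a register. */
	asm volatile("pushf\n\t"
		     "pop %0"
		     : "=r" (flags)
		     :
		     : "memory");
	return flags;
}

int main(void)
{
	unsigned long flags = save_fl();

	/* Bit 9 is IF (interrupt enable); bit 0 is CF (carry). */
	printf("flags = %#lx, IF=%lu, CF=%lu\n",
	       flags, (flags >> 9) & 1, flags & 1);
	return 0;
}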
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 92b7703d3d58..6ab9eac64670 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -1,31 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/linkage.h>
#include <linux/errno.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/ioport.h>
#include <linux/interrupt.h>
+#include <linux/irq.h>
#include <linux/timex.h>
-#include <linux/slab.h>
#include <linux/random.h>
#include <linux/kprobes.h>
#include <linux/init.h>
#include <linux/kernel_stat.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
#include <linux/bitops.h>
#include <linux/acpi.h>
#include <linux/io.h>
#include <linux/delay.h>
+#include <linux/pgtable.h>
-#include <asm/atomic.h>
-#include <asm/system.h>
+#include <linux/atomic.h>
#include <asm/timer.h>
#include <asm/hw_irq.h>
-#include <asm/pgtable.h>
#include <asm/desc.h>
+#include <asm/io_apic.h>
+#include <asm/acpi.h>
#include <asm/apic.h>
#include <asm/setup.h>
#include <asm/i8259.h>
#include <asm/traps.h>
+#include <asm/fred.h>
+#include <asm/prom.h>
/*
* ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
@@ -43,233 +47,68 @@
* (these are usually mapped into the 0x30-0xff vector range)
*/
-#ifdef CONFIG_X86_32
-/*
- * Note that on a 486, we don't want to do a SIGFPE on an irq13
- * as the irq is unreliable, and exception 16 works correctly
- * (ie as explained in the intel literature). On a 386, you
- * can't use exception 16 due to bad IBM design, so we have to
- * rely on the less exact irq13.
- *
- * Careful.. Not only is IRQ13 unreliable, but it is also
- * leads to races. IBM designers who came up with it should
- * be shot.
- */
-
-static irqreturn_t math_error_irq(int cpl, void *dev_id)
-{
- outb(0, 0xF0);
- if (ignore_fpu_irq || !boot_cpu_data.hard_math)
- return IRQ_NONE;
- math_error((void __user *)get_irq_regs()->ip);
- return IRQ_HANDLED;
-}
-
-/*
- * New motherboards sometimes make IRQ 13 be a PCI interrupt,
- * so allow interrupt sharing.
- */
-static struct irqaction fpu_irq = {
- .handler = math_error_irq,
- .name = "fpu",
-};
-#endif
-
-/*
- * IRQ2 is cascade interrupt to second interrupt controller
- */
-static struct irqaction irq2 = {
- .handler = no_action,
- .name = "cascade",
-};
-
DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
- [0 ... IRQ0_VECTOR - 1] = -1,
- [IRQ0_VECTOR] = 0,
- [IRQ1_VECTOR] = 1,
- [IRQ2_VECTOR] = 2,
- [IRQ3_VECTOR] = 3,
- [IRQ4_VECTOR] = 4,
- [IRQ5_VECTOR] = 5,
- [IRQ6_VECTOR] = 6,
- [IRQ7_VECTOR] = 7,
- [IRQ8_VECTOR] = 8,
- [IRQ9_VECTOR] = 9,
- [IRQ10_VECTOR] = 10,
- [IRQ11_VECTOR] = 11,
- [IRQ12_VECTOR] = 12,
- [IRQ13_VECTOR] = 13,
- [IRQ14_VECTOR] = 14,
- [IRQ15_VECTOR] = 15,
- [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
+ [0 ... NR_VECTORS - 1] = VECTOR_UNUSED,
};
-int vector_used_by_percpu_irq(unsigned int vector)
-{
- int cpu;
-
- for_each_online_cpu(cpu) {
- if (per_cpu(vector_irq, cpu)[vector] != -1)
- return 1;
- }
-
- return 0;
-}
-
-static void __init init_ISA_irqs(void)
+void __init init_ISA_irqs(void)
{
+ struct irq_chip *chip = legacy_pic->chip;
int i;
-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
- init_bsp_APIC();
-#endif
- init_8259A(0);
-
/*
- * 16 old-style INTA-cycle interrupts:
+ * Try to set up the through-local-APIC virtual wire mode earlier.
+ *
+ * On some 32-bit UP machines, whose APIC has been disabled by BIOS
+ * and then got re-enabled by "lapic", it hangs at boot time without this.
*/
- for (i = 0; i < NR_IRQS_LEGACY; i++) {
- struct irq_desc *desc = irq_to_desc(i);
+ init_bsp_APIC();
- desc->status = IRQ_DISABLED;
- desc->action = NULL;
- desc->depth = 1;
+ legacy_pic->init(0);
- set_irq_chip_and_handler_name(i, &i8259A_chip,
- handle_level_irq, "XT");
+ for (i = 0; i < nr_legacy_irqs(); i++) {
+ irq_set_chip_and_handler(i, chip, handle_level_irq);
+ irq_set_status_flags(i, IRQ_LEVEL);
}
}
-/* Overridden in paravirt.c */
-void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
-
-static void __init smp_intr_init(void)
+void __init init_IRQ(void)
{
-#ifdef CONFIG_SMP
-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
+ int i;
+
/*
- * The reschedule interrupt is a CPU-to-CPU reschedule-helper
- * IPI, driven by wakeup.
+	 * On CPU 0, assign ISA_IRQ_VECTOR(irq) to IRQs 0..15.
+	 * If these IRQs are handled by legacy interrupt controllers like the PIC,
+	 * then this configuration will likely remain static after boot. If
+	 * these IRQs are handled by more modern controllers like the IO-APIC,
+	 * then this vector space can be freed and re-used dynamically as the
+	 * IRQs migrate etc.
*/
- alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
-
- /* IPIs for invalidation */
- alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
- alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
- alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
- alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
- alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
- alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
- alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
- alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
+ for (i = 0; i < nr_legacy_irqs(); i++)
+ per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = irq_to_desc(i);
- /* IPI for generic function call */
- alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
+ BUG_ON(irq_init_percpu_irqstack(smp_processor_id()));
- /* IPI for generic single function call */
- alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
- call_function_single_interrupt);
-
- /* Low priority IPI to cleanup after moving an irq */
- set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
- set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
-
- /* IPI used for rebooting/stopping */
- alloc_intr_gate(REBOOT_VECTOR, reboot_interrupt);
-#endif
-#endif /* CONFIG_SMP */
-}
-
-static void __init apic_intr_init(void)
-{
- smp_intr_init();
-
-#ifdef CONFIG_X86_THERMAL_VECTOR
- alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
-#endif
-#ifdef CONFIG_X86_MCE_THRESHOLD
- alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
-#endif
-#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC)
- alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt);
-#endif
-
-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
- /* self generated IPI for local APIC timer */
- alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
-
- /* generic IPI for platform specific use */
- alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
-
- /* IPI vectors for APIC spurious and error interrupts */
- alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
- alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
-
- /* Performance monitoring interrupts: */
-# ifdef CONFIG_PERF_COUNTERS
- alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
-# endif
-
-#endif
-}
-
-/**
- * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
- *
- * Description:
- * Perform any necessary interrupt initialisation prior to setting up
- * the "ordinary" interrupt call gates. For legacy reasons, the ISA
- * interrupts should be initialised here if the machine emulates a PC
- * in any way.
- **/
-static void __init x86_quirk_pre_intr_init(void)
-{
-#ifdef CONFIG_X86_32
- if (x86_quirks->arch_pre_intr_init) {
- if (x86_quirks->arch_pre_intr_init())
- return;
- }
-#endif
- init_ISA_irqs();
+ x86_init.irqs.intr_init();
}
void __init native_init_IRQ(void)
{
- int i;
-
/* Execute any quirks before the call gates are initialised: */
- x86_quirk_pre_intr_init();
-
- apic_intr_init();
-
- /*
- * Cover the whole vector space, no vector can escape
- * us. (some of these will be overridden and become
- * 'special' SMP interrupts)
- */
- for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
- /* IA32_SYSCALL_VECTOR could be used in trap_init already. */
- if (!test_bit(i, used_vectors))
- set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
- }
+ x86_init.irqs.pre_vector_init();
- if (!acpi_ioapic)
- setup_irq(2, &irq2);
+ /* FRED's IRQ path may be used even if FRED isn't fully enabled. */
+ if (IS_ENABLED(CONFIG_X86_FRED))
+ fred_complete_exception_setup();
-#ifdef CONFIG_X86_32
- /*
- * Call quirks after call gates are initialised (usually add in
- * the architecture specific gates):
- */
- x86_quirk_intr_init();
+ if (!cpu_feature_enabled(X86_FEATURE_FRED))
+ idt_setup_apic_and_irq_gates();
- /*
- * External FPU? Set up irq13 if so, for
- * original braindamaged IBM FERR coupling.
- */
- if (boot_cpu_data.hard_math && !cpu_has_fpu)
- setup_irq(FPU_IRQ, &fpu_irq);
+ lapic_assign_system_vectors();
- irq_ctx_init(smp_processor_id());
-#endif
+ if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs()) {
+ /* IRQ2 is cascade interrupt to second interrupt controller */
+ if (request_irq(2, no_action, IRQF_NO_THREAD, "cascade", NULL))
+ pr_err("%s: request_irq() failed\n", "cascade");
+ }
}
diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c
new file mode 100644
index 000000000000..243a769fdd97
--- /dev/null
+++ b/arch/x86/kernel/itmt.c
@@ -0,0 +1,190 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * itmt.c: Support Intel Turbo Boost Max Technology 3.0
+ *
+ * (C) Copyright 2016 Intel Corporation
+ * Author: Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * On platforms supporting Intel Turbo Boost Max Technology 3.0 (ITMT),
+ * the maximum turbo frequencies of some cores in a CPU package may be
+ * higher than for the other cores in the same package. In that case,
+ * better performance can be achieved by making the scheduler prefer
+ * to run tasks on the CPUs with higher max turbo frequencies.
+ *
+ * This file provides functions and data structures for enabling the
+ * scheduler to favor scheduling on cores that can be boosted to a higher
+ * frequency under ITMT.
+ */
+
+#include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <linux/cpuset.h>
+#include <linux/debugfs.h>
+#include <linux/mutex.h>
+#include <linux/sysctl.h>
+#include <linux/nodemask.h>
+
+static DEFINE_MUTEX(itmt_update_mutex);
+DEFINE_PER_CPU_READ_MOSTLY(int, sched_core_priority);
+
+/* Boolean to track if system has ITMT capabilities */
+static bool __read_mostly sched_itmt_capable;
+
+/*
+ * Boolean to control whether we want to move processes to CPUs capable
+ * of a higher turbo frequency on systems supporting Intel Turbo Boost Max
+ * Technology 3.0.
+ *
+ * It can be set via /sys/kernel/debug/x86/sched_itmt_enabled
+ */
+bool __read_mostly sysctl_sched_itmt_enabled;
+
+static ssize_t sched_itmt_enabled_write(struct file *filp,
+ const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ ssize_t result;
+ bool orig;
+
+ guard(mutex)(&itmt_update_mutex);
+
+ orig = sysctl_sched_itmt_enabled;
+ result = debugfs_write_file_bool(filp, ubuf, cnt, ppos);
+
+ if (sysctl_sched_itmt_enabled != orig) {
+ x86_topology_update = true;
+ rebuild_sched_domains();
+ }
+
+ return result;
+}
+
+static int sched_core_priority_show(struct seq_file *s, void *unused)
+{
+ int cpu;
+
+ seq_puts(s, "CPU #\tPriority\n");
+ for_each_possible_cpu(cpu)
+ seq_printf(s, "%d\t%d\n", cpu, arch_asym_cpu_priority(cpu));
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(sched_core_priority);
+
+static const struct file_operations dfs_sched_itmt_fops = {
+ .read = debugfs_read_file_bool,
+ .write = sched_itmt_enabled_write,
+ .open = simple_open,
+ .llseek = default_llseek,
+};
+
+static struct dentry *dfs_sched_itmt;
+static struct dentry *dfs_sched_core_prio;
+
+/**
+ * sched_set_itmt_support() - Indicate platform supports ITMT
+ *
+ * This function is used by the OS to indicate to scheduler that the platform
+ * is capable of supporting the ITMT feature.
+ *
+ * The current scheme has the pstate driver detect whether the system
+ * is ITMT capable and call sched_set_itmt_support().
+ *
+ * This must be done only after sched_set_itmt_core_prio
+ * has been called to set the cpus' priorities.
+ * It must not be called with the CPU hotplug lock
+ * held, as we need to acquire the lock to rebuild sched domains
+ * later.
+ *
+ * Return: 0 on success
+ */
+int sched_set_itmt_support(void)
+{
+ guard(mutex)(&itmt_update_mutex);
+
+ if (sched_itmt_capable)
+ return 0;
+
+ dfs_sched_itmt = debugfs_create_file_unsafe("sched_itmt_enabled",
+ 0644,
+ arch_debugfs_dir,
+ &sysctl_sched_itmt_enabled,
+ &dfs_sched_itmt_fops);
+ if (IS_ERR_OR_NULL(dfs_sched_itmt)) {
+ dfs_sched_itmt = NULL;
+ return -ENOMEM;
+ }
+
+ dfs_sched_core_prio = debugfs_create_file("sched_core_priority", 0644,
+ arch_debugfs_dir, NULL,
+ &sched_core_priority_fops);
+ if (IS_ERR_OR_NULL(dfs_sched_core_prio)) {
+ dfs_sched_core_prio = NULL;
+ return -ENOMEM;
+ }
+
+ sched_itmt_capable = true;
+
+ sysctl_sched_itmt_enabled = 1;
+
+ x86_topology_update = true;
+ rebuild_sched_domains();
+
+ return 0;
+}
+
+/**
+ * sched_clear_itmt_support() - Revoke platform's support of ITMT
+ *
+ * This function is used by the OS to indicate that it has
+ * revoked the platform's support of ITMT feature.
+ *
+ * It must not be called with the CPU hotplug lock
+ * held, as we need to acquire the lock to rebuild sched domains
+ * later.
+ */
+void sched_clear_itmt_support(void)
+{
+ guard(mutex)(&itmt_update_mutex);
+
+ if (!sched_itmt_capable)
+ return;
+
+ sched_itmt_capable = false;
+
+ debugfs_remove(dfs_sched_itmt);
+ dfs_sched_itmt = NULL;
+ debugfs_remove(dfs_sched_core_prio);
+ dfs_sched_core_prio = NULL;
+
+ if (sysctl_sched_itmt_enabled) {
+ /* disable sched_itmt if we are no longer ITMT capable */
+ sysctl_sched_itmt_enabled = 0;
+ x86_topology_update = true;
+ rebuild_sched_domains();
+ }
+}
+
+int arch_asym_cpu_priority(int cpu)
+{
+ return per_cpu(sched_core_priority, cpu);
+}
+
+/**
+ * sched_set_itmt_core_prio() - Set CPU priority based on ITMT
+ * @prio: Priority of @cpu
+ * @cpu: The CPU number
+ *
+ * The pstate driver will find out the max boost frequency
+ * and call this function to set a priority proportional
+ * to the max boost frequency. CPUs with higher boost
+ * frequency will receive higher priority.
+ *
+ * No need to rebuild sched domain after updating
+ * the CPU priorities. The sched domains have no
+ * dependency on CPU priorities.
+ */
+void sched_set_itmt_core_prio(int prio, int cpu)
+{
+ per_cpu(sched_core_priority, cpu) = prio;
+}
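
The priority stored per CPU is an opaque integer that arch_asym_cpu_priority() hands back to the scheduler; deriving it from each core's maximum boost frequency is left to the cpufreq/pstate driver. The sketch below is a hypothetical illustration of that mapping — the names and scaling are invented for the example, and the real intel_pstate logic differs:

#include <stdio.h>

#define NR_CPUS 8

/* Stand-in for the per-CPU variable written by sched_set_itmt_core_prio(). */
static int sched_core_priority[NR_CPUS];

/* Hypothetical helper: higher max boost frequency => higher priority. */
static void assign_itmt_priorities(const unsigned int max_freq_khz[], int nr_cpus)
{
	for (int cpu = 0; cpu < nr_cpus; cpu++) {
		/* Scale so priorities stay small but keep their ordering. */
		sched_core_priority[cpu] = max_freq_khz[cpu] / 100000;
	}
}

int main(void)
{
	/* Two "favored" cores with a higher boost ceiling than the rest. */
	unsigned int max_freq_khz[NR_CPUS] = {
		5200000, 5200000, 4800000, 4800000,
		4800000, 4800000, 4800000, 4800000,
	};

	assign_itmt_priorities(max_freq_khz, NR_CPUS);
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		printf("CPU%d priority %d\n", cpu, sched_core_priority[cpu]);
	return 0;
}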
diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c
new file mode 100644
index 000000000000..9e9a591a5fec
--- /dev/null
+++ b/arch/x86/kernel/jailhouse.c
@@ -0,0 +1,296 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Jailhouse paravirt_ops implementation
+ *
+ * Copyright (c) Siemens AG, 2015-2017
+ *
+ * Authors:
+ * Jan Kiszka <jan.kiszka@siemens.com>
+ */
+
+#include <linux/acpi_pmtmr.h>
+#include <linux/kernel.h>
+#include <linux/reboot.h>
+#include <linux/serial_8250.h>
+#include <linux/acpi.h>
+#include <asm/apic.h>
+#include <asm/io_apic.h>
+#include <asm/acpi.h>
+#include <asm/cpu.h>
+#include <asm/hypervisor.h>
+#include <asm/i8259.h>
+#include <asm/irqdomain.h>
+#include <asm/pci_x86.h>
+#include <asm/reboot.h>
+#include <asm/setup.h>
+#include <asm/jailhouse_para.h>
+
+static struct jailhouse_setup_data setup_data;
+#define SETUP_DATA_V1_LEN (sizeof(setup_data.hdr) + sizeof(setup_data.v1))
+#define SETUP_DATA_V2_LEN (SETUP_DATA_V1_LEN + sizeof(setup_data.v2))
+
+static unsigned int precalibrated_tsc_khz;
+
+static void jailhouse_setup_irq(unsigned int irq)
+{
+ struct mpc_intsrc mp_irq = {
+ .type = MP_INTSRC,
+ .irqtype = mp_INT,
+ .irqflag = MP_IRQPOL_ACTIVE_HIGH | MP_IRQTRIG_EDGE,
+ .srcbusirq = irq,
+ .dstirq = irq,
+ };
+ mp_save_irq(&mp_irq);
+}
+
+static uint32_t jailhouse_cpuid_base(void)
+{
+ if (boot_cpu_data.cpuid_level < 0 ||
+ !boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return 0;
+
+ return cpuid_base_hypervisor("Jailhouse\0\0\0", 0);
+}
+
+static uint32_t __init jailhouse_detect(void)
+{
+ return jailhouse_cpuid_base();
+}
+
+static void jailhouse_get_wallclock(struct timespec64 *now)
+{
+ memset(now, 0, sizeof(*now));
+}
+
+static void __init jailhouse_timer_init(void)
+{
+ lapic_timer_period = setup_data.v1.apic_khz * (1000 / HZ);
+}
+
+static unsigned long jailhouse_get_tsc(void)
+{
+ return precalibrated_tsc_khz;
+}
+
+static void __init jailhouse_x2apic_init(void)
+{
+#ifdef CONFIG_X86_X2APIC
+ if (!x2apic_enabled())
+ return;
+ /*
+ * We do not have access to IR inside Jailhouse non-root cells. So
+ * we have to run in physical mode.
+ */
+ x2apic_phys = 1;
+ /*
+ * This will trigger the switch to apic_x2apic_phys. Empty OEM IDs
+ * ensure that only this APIC driver picks up the call.
+ */
+ default_acpi_madt_oem_check("", "");
+#endif
+}
+
+static void __init jailhouse_parse_smp_config(void)
+{
+ struct ioapic_domain_cfg ioapic_cfg = {
+ .type = IOAPIC_DOMAIN_STRICT,
+ .ops = &mp_ioapic_irqdomain_ops,
+ };
+ unsigned int cpu;
+
+ jailhouse_x2apic_init();
+
+ register_lapic_address(0xfee00000);
+
+ for (cpu = 0; cpu < setup_data.v1.num_cpus; cpu++)
+ topology_register_apic(setup_data.v1.cpu_ids[cpu], CPU_ACPIID_INVALID, true);
+
+ smp_found_config = 1;
+
+ if (setup_data.v1.standard_ioapic) {
+ mp_register_ioapic(0, 0xfec00000, gsi_top, &ioapic_cfg);
+
+ if (IS_ENABLED(CONFIG_SERIAL_8250) &&
+ setup_data.hdr.version < 2) {
+ /* Register 1:1 mapping for legacy UART IRQs 3 and 4 */
+ jailhouse_setup_irq(3);
+ jailhouse_setup_irq(4);
+ }
+ }
+}
+
+static void jailhouse_no_restart(void)
+{
+ pr_notice("Jailhouse: Restart not supported, halting\n");
+ machine_halt();
+}
+
+static int __init jailhouse_pci_arch_init(void)
+{
+ pci_direct_init(1);
+
+ /*
+ * There are no bridges on the virtual PCI root bus under Jailhouse,
+ * thus no other way to discover all devices than a full scan.
+ * Respect any overrides via the command line, though.
+ */
+ if (pcibios_last_bus < 0)
+ pcibios_last_bus = 0xff;
+
+#ifdef CONFIG_PCI_MMCONFIG
+ if (setup_data.v1.pci_mmconfig_base) {
+ pci_mmconfig_add(0, 0, pcibios_last_bus,
+ setup_data.v1.pci_mmconfig_base);
+ pci_mmcfg_arch_init();
+ }
+#endif
+
+ return 0;
+}
+
+#ifdef CONFIG_SERIAL_8250
+static inline bool jailhouse_uart_enabled(unsigned int uart_nr)
+{
+ return setup_data.v2.flags & BIT(uart_nr);
+}
+
+static void jailhouse_serial_fixup(int port, struct uart_port *up,
+ u32 *capabilities)
+{
+ static const u16 pcuart_base[] = {0x3f8, 0x2f8, 0x3e8, 0x2e8};
+ unsigned int n;
+
+ for (n = 0; n < ARRAY_SIZE(pcuart_base); n++) {
+ if (pcuart_base[n] != up->iobase)
+ continue;
+
+ if (jailhouse_uart_enabled(n)) {
+ pr_info("Enabling UART%u (port 0x%lx)\n", n,
+ up->iobase);
+ jailhouse_setup_irq(up->irq);
+ } else {
+ /* Deactivate UART if access isn't allowed */
+ up->iobase = 0;
+ }
+ break;
+ }
+}
+
+static void __init jailhouse_serial_workaround(void)
+{
+ /*
+ * There are flags inside setup_data that indicate availability of
+ * platform UARTs since setup data version 2.
+ *
+ * In case of version 1, we don't know which UARTs belong to Linux. In
+ * this case, unconditionally register 1:1 mapping for legacy UART IRQs
+ * 3 and 4.
+ */
+ if (setup_data.hdr.version > 1)
+ serial8250_set_isa_configurator(jailhouse_serial_fixup);
+}
+#else /* !CONFIG_SERIAL_8250 */
+static inline void jailhouse_serial_workaround(void)
+{
+}
+#endif /* CONFIG_SERIAL_8250 */
+
+static void __init jailhouse_init_platform(void)
+{
+ u64 pa_data = boot_params.hdr.setup_data;
+ unsigned long setup_data_len;
+ struct setup_data header;
+ void *mapping;
+
+ x86_init.irqs.pre_vector_init = x86_init_noop;
+ x86_init.timers.timer_init = jailhouse_timer_init;
+ x86_init.mpparse.find_mptable = x86_init_noop;
+ x86_init.mpparse.early_parse_smp_cfg = x86_init_noop;
+ x86_init.mpparse.parse_smp_cfg = jailhouse_parse_smp_config;
+ x86_init.pci.arch_init = jailhouse_pci_arch_init;
+
+ x86_platform.calibrate_cpu = jailhouse_get_tsc;
+ x86_platform.calibrate_tsc = jailhouse_get_tsc;
+ x86_platform.get_wallclock = jailhouse_get_wallclock;
+ x86_platform.legacy.rtc = 0;
+ x86_platform.legacy.warm_reset = 0;
+ x86_platform.legacy.i8042 = X86_LEGACY_I8042_PLATFORM_ABSENT;
+
+ legacy_pic = &null_legacy_pic;
+
+ machine_ops.emergency_restart = jailhouse_no_restart;
+
+ while (pa_data) {
+ mapping = early_memremap(pa_data, sizeof(header));
+ memcpy(&header, mapping, sizeof(header));
+ early_memunmap(mapping, sizeof(header));
+
+ if (header.type == SETUP_JAILHOUSE)
+ break;
+
+ pa_data = header.next;
+ }
+
+ if (!pa_data)
+ panic("Jailhouse: No valid setup data found");
+
+ /* setup data must at least contain the header */
+ if (header.len < sizeof(setup_data.hdr))
+ goto unsupported;
+
+ pa_data += offsetof(struct setup_data, data);
+ setup_data_len = min_t(unsigned long, sizeof(setup_data),
+ (unsigned long)header.len);
+ mapping = early_memremap(pa_data, setup_data_len);
+ memcpy(&setup_data, mapping, setup_data_len);
+ early_memunmap(mapping, setup_data_len);
+
+ if (setup_data.hdr.version == 0 ||
+ setup_data.hdr.compatible_version !=
+ JAILHOUSE_SETUP_REQUIRED_VERSION ||
+ (setup_data.hdr.version == 1 && header.len < SETUP_DATA_V1_LEN) ||
+ (setup_data.hdr.version >= 2 && header.len < SETUP_DATA_V2_LEN))
+ goto unsupported;
+
+ pmtmr_ioport = setup_data.v1.pm_timer_address;
+ pr_debug("Jailhouse: PM-Timer IO Port: %#x\n", pmtmr_ioport);
+
+ precalibrated_tsc_khz = setup_data.v1.tsc_khz;
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+
+ pci_probe = 0;
+
+ /*
+ * Avoid that the kernel complains about missing ACPI tables - there
+ * are none in a non-root cell.
+ */
+ disable_acpi();
+
+ jailhouse_serial_workaround();
+ return;
+
+unsupported:
+ panic("Jailhouse: Unsupported setup data structure");
+}
+
+bool jailhouse_paravirt(void)
+{
+ return jailhouse_cpuid_base() != 0;
+}
+
+static bool __init jailhouse_x2apic_available(void)
+{
+ /*
+ * The x2APIC is only available if the root cell enabled it. Jailhouse
+ * does not support switching between xAPIC and x2APIC.
+ */
+ return x2apic_enabled();
+}
+
+const struct hypervisor_x86 x86_hyper_jailhouse __refconst = {
+ .name = "Jailhouse",
+ .detect = jailhouse_detect,
+ .init.init_platform = jailhouse_init_platform,
+ .init.x2apic_available = jailhouse_x2apic_available,
+ .ignore_nopv = true,
+};
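
jailhouse_init_platform() locates its configuration by walking the boot_params.hdr.setup_data chain: each element stores the physical address of the next, and only the small header is mapped per step until a SETUP_JAILHOUSE entry turns up. A simplified user-space model of that walk is sketched below; buffer offsets stand in for physical addresses and the type constant is a stand-in chosen for the demo:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define SETUP_JAILHOUSE_DEMO 6	/* stand-in type value for the demo */

struct setup_data_hdr {
	uint64_t next;	/* "physical address" (here: buffer offset) of next entry */
	uint32_t type;
	uint32_t len;
};

static uint64_t find_setup_data(const uint8_t *buf, uint64_t first, uint32_t want)
{
	uint64_t pa = first;

	while (pa) {
		struct setup_data_hdr hdr;

		memcpy(&hdr, buf + pa, sizeof(hdr));	/* mimics early_memremap() of the header */
		if (hdr.type == want)
			return pa;
		pa = hdr.next;
	}
	return 0;
}

int main(void)
{
	uint8_t buf[256] = {};
	struct setup_data_hdr a = { .next = 128, .type = 1, .len = 0 };
	struct setup_data_hdr b = { .next = 0, .type = SETUP_JAILHOUSE_DEMO, .len = 0 };

	memcpy(buf + 64, &a, sizeof(a));
	memcpy(buf + 128, &b, sizeof(b));

	printf("found entry at offset %llu\n",
	       (unsigned long long)find_setup_data(buf, 64, SETUP_JAILHOUSE_DEMO));
	return 0;
}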
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
new file mode 100644
index 000000000000..a7949a54a0ff
--- /dev/null
+++ b/arch/x86/kernel/jump_label.c
@@ -0,0 +1,148 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * jump label x86 support
+ *
+ * Copyright (C) 2009 Jason Baron <jbaron@redhat.com>
+ *
+ */
+#include <linux/jump_label.h>
+#include <linux/memory.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+#include <linux/cpu.h>
+#include <asm/kprobes.h>
+#include <asm/alternative.h>
+#include <asm/text-patching.h>
+#include <asm/insn.h>
+
+int arch_jump_entry_size(struct jump_entry *entry)
+{
+ struct insn insn = {};
+
+ insn_decode_kernel(&insn, (void *)jump_entry_code(entry));
+ BUG_ON(insn.length != 2 && insn.length != 5);
+
+ return insn.length;
+}
+
+struct jump_label_patch {
+ const void *code;
+ int size;
+};
+
+static struct jump_label_patch
+__jump_label_patch(struct jump_entry *entry, enum jump_label_type type)
+{
+ const void *expect, *code, *nop;
+ const void *addr, *dest;
+ int size;
+
+ addr = (void *)jump_entry_code(entry);
+ dest = (void *)jump_entry_target(entry);
+
+ size = arch_jump_entry_size(entry);
+ switch (size) {
+ case JMP8_INSN_SIZE:
+ code = text_gen_insn(JMP8_INSN_OPCODE, addr, dest);
+ nop = x86_nops[size];
+ break;
+
+ case JMP32_INSN_SIZE:
+ code = text_gen_insn(JMP32_INSN_OPCODE, addr, dest);
+ nop = x86_nops[size];
+ break;
+
+ default: BUG();
+ }
+
+ if (type == JUMP_LABEL_JMP)
+ expect = nop;
+ else
+ expect = code;
+
+ if (memcmp(addr, expect, size)) {
+ /*
+ * The location is not an op that we were expecting.
+ * Something went wrong. Crash the box, as something could be
+ * corrupting the kernel.
+ */
+ pr_crit("jump_label: Fatal kernel bug, unexpected op at %pS [%p] (%5ph != %5ph)) size:%d type:%d\n",
+ addr, addr, addr, expect, size, type);
+ BUG();
+ }
+
+ if (type == JUMP_LABEL_NOP)
+ code = nop;
+
+ return (struct jump_label_patch){.code = code, .size = size};
+}
+
+static __always_inline void
+__jump_label_transform(struct jump_entry *entry,
+ enum jump_label_type type,
+ int init)
+{
+ const struct jump_label_patch jlp = __jump_label_patch(entry, type);
+
+ /*
+ * As long as only a single processor is running and the code is still
+ * not marked as RO, text_poke_early() can be used; Checking that
+ * system_state is SYSTEM_BOOTING guarantees it. It will be set to
+ * SYSTEM_SCHEDULING before other cores are awakened and before the
+ * code is write-protected.
+ *
+ * At the time the change is being done, just ignore whether we
+ * are doing a nop -> jump or a jump -> nop transition, and always
+ * assume nop is the 'currently valid' instruction.
+ */
+ if (init || system_state == SYSTEM_BOOTING) {
+ text_poke_early((void *)jump_entry_code(entry), jlp.code, jlp.size);
+ return;
+ }
+
+ smp_text_poke_single((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL);
+}
+
+static void __ref jump_label_transform(struct jump_entry *entry,
+ enum jump_label_type type,
+ int init)
+{
+ mutex_lock(&text_mutex);
+ __jump_label_transform(entry, type, init);
+ mutex_unlock(&text_mutex);
+}
+
+void arch_jump_label_transform(struct jump_entry *entry,
+ enum jump_label_type type)
+{
+ jump_label_transform(entry, type, 0);
+}
+
+bool arch_jump_label_transform_queue(struct jump_entry *entry,
+ enum jump_label_type type)
+{
+ struct jump_label_patch jlp;
+
+ if (system_state == SYSTEM_BOOTING) {
+ /*
+ * Fallback to the non-batching mode.
+ */
+ arch_jump_label_transform(entry, type);
+ return true;
+ }
+
+ mutex_lock(&text_mutex);
+ jlp = __jump_label_patch(entry, type);
+ smp_text_poke_batch_add((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL);
+ mutex_unlock(&text_mutex);
+ return true;
+}
+
+void arch_jump_label_transform_apply(void)
+{
+ mutex_lock(&text_mutex);
+ smp_text_poke_batch_finish();
+ mutex_unlock(&text_mutex);
+}
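
For the 5-byte case, the generated instruction is an E9 near jump whose rel32 displacement is taken from the end of the instruction, i.e. dest - (addr + 5); the 2-byte variant is an EB short jump with an 8-bit displacement of dest - (addr + 2). A stand-alone sketch of just that encoding arithmetic (not the kernel's text_gen_insn()/text-poking machinery) is:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Encode "jmp rel32" (opcode 0xE9) placed at 'addr' and targeting 'dest'. */
static void gen_jmp32(uint8_t insn[5], uint64_t addr, uint64_t dest)
{
	int32_t rel = (int32_t)(dest - (addr + 5));

	insn[0] = 0xE9;
	memcpy(&insn[1], &rel, sizeof(rel));
}

int main(void)
{
	uint8_t insn[5];
	uint64_t addr = 0xffffffff81000000ull;
	uint64_t dest = 0xffffffff81000040ull;

	gen_jmp32(insn, addr, dest);
	printf("%02x %02x %02x %02x %02x\n",
	       insn[0], insn[1], insn[2], insn[3], insn[4]);
	/* Prints: e9 3b 00 00 00  (0x40 - 5 = 0x3b, little-endian rel32) */
	return 0;
}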
diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c
deleted file mode 100644
index cbc4332a77b2..000000000000
--- a/arch/x86/kernel/k8.c
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Shared support code for AMD K8 northbridges and derivates.
- * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2.
- */
-#include <linux/gfp.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/errno.h>
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <asm/k8.h>
-
-int num_k8_northbridges;
-EXPORT_SYMBOL(num_k8_northbridges);
-
-static u32 *flush_words;
-
-struct pci_device_id k8_nb_ids[] = {
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
- {}
-};
-EXPORT_SYMBOL(k8_nb_ids);
-
-struct pci_dev **k8_northbridges;
-EXPORT_SYMBOL(k8_northbridges);
-
-static struct pci_dev *next_k8_northbridge(struct pci_dev *dev)
-{
- do {
- dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
- if (!dev)
- break;
- } while (!pci_match_id(&k8_nb_ids[0], dev));
- return dev;
-}
-
-int cache_k8_northbridges(void)
-{
- int i;
- struct pci_dev *dev;
-
- if (num_k8_northbridges)
- return 0;
-
- dev = NULL;
- while ((dev = next_k8_northbridge(dev)) != NULL)
- num_k8_northbridges++;
-
- k8_northbridges = kmalloc((num_k8_northbridges + 1) * sizeof(void *),
- GFP_KERNEL);
- if (!k8_northbridges)
- return -ENOMEM;
-
- if (!num_k8_northbridges) {
- k8_northbridges[0] = NULL;
- return 0;
- }
-
- flush_words = kmalloc(num_k8_northbridges * sizeof(u32), GFP_KERNEL);
- if (!flush_words) {
- kfree(k8_northbridges);
- return -ENOMEM;
- }
-
- dev = NULL;
- i = 0;
- while ((dev = next_k8_northbridge(dev)) != NULL) {
- k8_northbridges[i] = dev;
- pci_read_config_dword(dev, 0x9c, &flush_words[i++]);
- }
- k8_northbridges[i] = NULL;
- return 0;
-}
-EXPORT_SYMBOL_GPL(cache_k8_northbridges);
-
-/* Ignores subdevice/subvendor but as far as I can figure out
- they're useless anyways */
-int __init early_is_k8_nb(u32 device)
-{
- struct pci_device_id *id;
- u32 vendor = device & 0xffff;
- device >>= 16;
- for (id = k8_nb_ids; id->vendor; id++)
- if (vendor == id->vendor && device == id->device)
- return 1;
- return 0;
-}
-
-void k8_flush_garts(void)
-{
- int flushed, i;
- unsigned long flags;
- static DEFINE_SPINLOCK(gart_lock);
-
- /* Avoid races between AGP and IOMMU. In theory it's not needed
- but I'm not sure if the hardware won't lose flush requests
- when another is pending. This whole thing is so expensive anyways
- that it doesn't matter to serialize more. -AK */
- spin_lock_irqsave(&gart_lock, flags);
- flushed = 0;
- for (i = 0; i < num_k8_northbridges; i++) {
- pci_write_config_dword(k8_northbridges[i], 0x9c,
- flush_words[i]|1);
- flushed++;
- }
- for (i = 0; i < num_k8_northbridges; i++) {
- u32 w;
- /* Make sure the hardware actually executed the flush*/
- for (;;) {
- pci_read_config_dword(k8_northbridges[i],
- 0x9c, &w);
- if (!(w & 1))
- break;
- cpu_relax();
- }
- }
- spin_unlock_irqrestore(&gart_lock, flags);
- if (!flushed)
- printk("nothing to flush?\n");
-}
-EXPORT_SYMBOL_GPL(k8_flush_garts);
-
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index e444357375ce..e2e89bebcbc3 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -1,14 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Architecture specific debugfs files
*
* Copyright (C) 2007, Intel Corp.
* Huang Ying <ying.huang@intel.com>
- *
- * This file is released under the GPLv2.
*/
#include <linux/debugfs.h>
#include <linux/uaccess.h>
-#include <linux/module.h>
+#include <linux/export.h>
+#include <linux/slab.h>
#include <linux/init.h>
#include <linux/stat.h>
#include <linux/io.h>
@@ -32,7 +32,6 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf,
struct setup_data_node *node = file->private_data;
unsigned long remain;
loff_t pos = *ppos;
- struct page *pg;
void *p;
u64 pa;
@@ -45,19 +44,19 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf,
if (count > node->len - pos)
count = node->len - pos;
- pa = node->paddr + sizeof(struct setup_data) + pos;
- pg = pfn_to_page((pa + count - 1) >> PAGE_SHIFT);
- if (PageHighMem(pg)) {
- p = ioremap_cache(pa, count);
- if (!p)
- return -ENXIO;
- } else
- p = __va(pa);
+ pa = node->paddr + pos;
+
+ /* Is it direct data or an invalid indirect entry? */
+ if (!(node->type & SETUP_INDIRECT) || node->type == SETUP_INDIRECT)
+ pa += sizeof(struct setup_data);
+
+ p = memremap(pa, count, MEMREMAP_WB);
+ if (!p)
+ return -ENOMEM;
remain = copy_to_user(user_buf, p, count);
- if (PageHighMem(pg))
- iounmap(p);
+ memunmap(p);
if (remain)
return -EFAULT;
@@ -67,96 +66,94 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf,
return count;
}
-static int setup_data_open(struct inode *inode, struct file *file)
-{
- file->private_data = inode->i_private;
-
- return 0;
-}
-
static const struct file_operations fops_setup_data = {
.read = setup_data_read,
- .open = setup_data_open,
+ .open = simple_open,
+ .llseek = default_llseek,
};
-static int __init
+static void __init
create_setup_data_node(struct dentry *parent, int no,
struct setup_data_node *node)
{
- struct dentry *d, *type, *data;
+ struct dentry *d;
char buf[16];
sprintf(buf, "%d", no);
d = debugfs_create_dir(buf, parent);
- if (!d)
- return -ENOMEM;
-
- type = debugfs_create_x32("type", S_IRUGO, d, &node->type);
- if (!type)
- goto err_dir;
- data = debugfs_create_file("data", S_IRUGO, d, node, &fops_setup_data);
- if (!data)
- goto err_type;
-
- return 0;
-
-err_type:
- debugfs_remove(type);
-err_dir:
- debugfs_remove(d);
- return -ENOMEM;
+ debugfs_create_x32("type", S_IRUGO, d, &node->type);
+ debugfs_create_file("data", S_IRUGO, d, node, &fops_setup_data);
}
static int __init create_setup_data_nodes(struct dentry *parent)
{
+ struct setup_indirect *indirect;
struct setup_data_node *node;
struct setup_data *data;
- int error = -ENOMEM;
+ u64 pa_data, pa_next;
struct dentry *d;
- struct page *pg;
- u64 pa_data;
+ int error;
+ u32 len;
int no = 0;
d = debugfs_create_dir("setup_data", parent);
- if (!d)
- return -ENOMEM;
pa_data = boot_params.hdr.setup_data;
while (pa_data) {
node = kmalloc(sizeof(*node), GFP_KERNEL);
- if (!node)
+ if (!node) {
+ error = -ENOMEM;
+ goto err_dir;
+ }
+
+ data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
+ if (!data) {
+ kfree(node);
+ error = -ENOMEM;
goto err_dir;
+ }
+ pa_next = data->next;
- pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT);
- if (PageHighMem(pg)) {
- data = ioremap_cache(pa_data, sizeof(*data));
+ if (data->type == SETUP_INDIRECT) {
+ len = sizeof(*data) + data->len;
+ memunmap(data);
+ data = memremap(pa_data, len, MEMREMAP_WB);
if (!data) {
kfree(node);
- error = -ENXIO;
+ error = -ENOMEM;
goto err_dir;
}
- } else
- data = __va(pa_data);
-
- node->paddr = pa_data;
- node->type = data->type;
- node->len = data->len;
- error = create_setup_data_node(d, no, node);
- pa_data = data->next;
-
- if (PageHighMem(pg))
- iounmap(data);
- if (error)
- goto err_dir;
+
+ indirect = (struct setup_indirect *)data->data;
+
+ if (indirect->type != SETUP_INDIRECT) {
+ node->paddr = indirect->addr;
+ node->type = indirect->type;
+ node->len = indirect->len;
+ } else {
+ node->paddr = pa_data;
+ node->type = data->type;
+ node->len = data->len;
+ }
+ } else {
+ node->paddr = pa_data;
+ node->type = data->type;
+ node->len = data->len;
+ }
+
+ create_setup_data_node(d, no, node);
+ pa_data = pa_next;
+
+ memunmap(data);
no++;
}
return 0;
err_dir:
- debugfs_remove(d);
+ debugfs_remove_recursive(d);
return error;
}
@@ -167,35 +164,18 @@ static struct debugfs_blob_wrapper boot_params_blob = {
static int __init boot_params_kdebugfs_init(void)
{
- struct dentry *dbp, *version, *data;
- int error = -ENOMEM;
-
- dbp = debugfs_create_dir("boot_params", NULL);
- if (!dbp)
- return -ENOMEM;
+ struct dentry *dbp;
+ int error;
- version = debugfs_create_x16("version", S_IRUGO, dbp,
- &boot_params.hdr.version);
- if (!version)
- goto err_dir;
+ dbp = debugfs_create_dir("boot_params", arch_debugfs_dir);
- data = debugfs_create_blob("data", S_IRUGO, dbp,
- &boot_params_blob);
- if (!data)
- goto err_version;
+ debugfs_create_x16("version", S_IRUGO, dbp, &boot_params.hdr.version);
+ debugfs_create_blob("data", S_IRUGO, dbp, &boot_params_blob);
error = create_setup_data_nodes(dbp);
if (error)
- goto err_data;
+ debugfs_remove_recursive(dbp);
- return 0;
-
-err_data:
- debugfs_remove(data);
-err_version:
- debugfs_remove(version);
-err_dir:
- debugfs_remove(dbp);
return error;
}
#endif /* CONFIG_DEBUG_BOOT_PARAMS */
@@ -205,8 +185,6 @@ static int __init arch_kdebugfs_init(void)
int error = 0;
arch_debugfs_dir = debugfs_create_dir("x86", NULL);
- if (!arch_debugfs_dir)
- return -ENOMEM;
#ifdef CONFIG_DEBUG_BOOT_PARAMS
error = boot_params_kdebugfs_init();
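
With CONFIG_DEBUG_BOOT_PARAMS enabled, the nodes created above appear under /sys/kernel/debug/x86/boot_params/ once debugfs is mounted. A tiny user-space reader for the "version" node (the path is inferred from the code above; reading it needs root and a mounted debugfs) might look like:

#include <stdio.h>

int main(void)
{
	char buf[32];
	FILE *f = fopen("/sys/kernel/debug/x86/boot_params/version", "r");

	if (!f) {
		perror("open");	/* needs root and a mounted debugfs */
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("boot protocol version: %s", buf);
	fclose(f);
	return 0;
}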
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
new file mode 100644
index 000000000000..c3244ac680d1
--- /dev/null
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -0,0 +1,716 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kexec bzImage loader
+ *
+ * Copyright (C) 2014 Red Hat Inc.
+ * Authors:
+ * Vivek Goyal <vgoyal@redhat.com>
+ */
+
+#define pr_fmt(fmt) "kexec-bzImage64: " fmt
+
+#include <linux/string.h>
+#include <linux/printk.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/kexec.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/libfdt.h>
+#include <linux/of_fdt.h>
+#include <linux/efi.h>
+#include <linux/random.h>
+
+#include <asm/bootparam.h>
+#include <asm/setup.h>
+#include <asm/crash.h>
+#include <asm/efi.h>
+#include <asm/e820/api.h>
+#include <asm/kexec-bzimage64.h>
+
+#define MAX_ELFCOREHDR_STR_LEN 30 /* elfcorehdr=0x<64bit-value> */
+#define MAX_DMCRYPTKEYS_STR_LEN 31 /* dmcryptkeys=0x<64bit-value> */
+
+
+/*
+ * Defines the lowest physical address for various segments. Not sure where
+ * exactly these limits came from. The current bzimage64 loader in kexec-tools
+ * uses these, so I am retaining them. They can be changed over time as we gain
+ * more insight.
+ */
+#define MIN_PURGATORY_ADDR 0x3000
+#define MIN_BOOTPARAM_ADDR 0x3000
+#define MIN_KERNEL_LOAD_ADDR 0x100000
+#define MIN_INITRD_LOAD_ADDR 0x1000000
+
+/*
+ * This is a placeholder for all bootloader-specific data structures which
+ * get allocated in one call but get freed much later during cleanup
+ * time. Right now there is only one field, but it can grow as needed.
+ */
+struct bzimage64_data {
+ /*
+ * Temporary buffer to hold bootparams buffer. This should be
+ * freed once the bootparam segment has been loaded.
+ */
+ void *bootparams_buf;
+};
+
+static int setup_initrd(struct boot_params *params,
+ unsigned long initrd_load_addr, unsigned long initrd_len)
+{
+ params->hdr.ramdisk_image = initrd_load_addr & 0xffffffffUL;
+ params->hdr.ramdisk_size = initrd_len & 0xffffffffUL;
+
+ params->ext_ramdisk_image = initrd_load_addr >> 32;
+ params->ext_ramdisk_size = initrd_len >> 32;
+
+ return 0;
+}
+
+static int setup_cmdline(struct kimage *image, struct boot_params *params,
+ unsigned long bootparams_load_addr,
+ unsigned long cmdline_offset, char *cmdline,
+ unsigned long cmdline_len)
+{
+ char *cmdline_ptr = ((char *)params) + cmdline_offset;
+ unsigned long cmdline_ptr_phys, len = 0;
+ uint32_t cmdline_low_32, cmdline_ext_32;
+
+ if (image->type == KEXEC_TYPE_CRASH) {
+ len = sprintf(cmdline_ptr,
+ "elfcorehdr=0x%lx ", image->elf_load_addr);
+
+ if (image->dm_crypt_keys_addr != 0)
+ len += sprintf(cmdline_ptr + len,
+ "dmcryptkeys=0x%lx ", image->dm_crypt_keys_addr);
+ }
+ memcpy(cmdline_ptr + len, cmdline, cmdline_len);
+ cmdline_len += len;
+
+ cmdline_ptr[cmdline_len - 1] = '\0';
+
+ kexec_dprintk("Final command line is: %s\n", cmdline_ptr);
+ cmdline_ptr_phys = bootparams_load_addr + cmdline_offset;
+ cmdline_low_32 = cmdline_ptr_phys & 0xffffffffUL;
+ cmdline_ext_32 = cmdline_ptr_phys >> 32;
+
+ params->hdr.cmd_line_ptr = cmdline_low_32;
+ if (cmdline_ext_32)
+ params->ext_cmd_line_ptr = cmdline_ext_32;
+
+ return 0;
+}
+
+static int setup_e820_entries(struct boot_params *params)
+{
+ unsigned int nr_e820_entries;
+
+ nr_e820_entries = e820_table_kexec->nr_entries;
+
+ /* TODO: Pass entries more than E820_MAX_ENTRIES_ZEROPAGE in bootparams setup data */
+ if (nr_e820_entries > E820_MAX_ENTRIES_ZEROPAGE)
+ nr_e820_entries = E820_MAX_ENTRIES_ZEROPAGE;
+
+ params->e820_entries = nr_e820_entries;
+ memcpy(&params->e820_table, &e820_table_kexec->entries, nr_e820_entries*sizeof(struct e820_entry));
+
+ return 0;
+}
+
+enum { RNG_SEED_LENGTH = 32 };
+
+static void
+setup_rng_seed(struct boot_params *params, unsigned long params_load_addr,
+ unsigned int rng_seed_setup_data_offset)
+{
+ struct setup_data *sd = (void *)params + rng_seed_setup_data_offset;
+ unsigned long setup_data_phys;
+
+ if (!rng_is_initialized())
+ return;
+
+ sd->type = SETUP_RNG_SEED;
+ sd->len = RNG_SEED_LENGTH;
+ get_random_bytes(sd->data, RNG_SEED_LENGTH);
+ setup_data_phys = params_load_addr + rng_seed_setup_data_offset;
+ sd->next = params->hdr.setup_data;
+ params->hdr.setup_data = setup_data_phys;
+}
+
+#ifdef CONFIG_EFI
+static int setup_efi_info_memmap(struct boot_params *params,
+ unsigned long params_load_addr,
+ unsigned int efi_map_offset,
+ unsigned int efi_map_sz)
+{
+ void *efi_map = (void *)params + efi_map_offset;
+ unsigned long efi_map_phys_addr = params_load_addr + efi_map_offset;
+ struct efi_info *ei = &params->efi_info;
+
+ if (!efi_map_sz)
+ return 0;
+
+ efi_runtime_map_copy(efi_map, efi_map_sz);
+
+ ei->efi_memmap = efi_map_phys_addr & 0xffffffff;
+ ei->efi_memmap_hi = efi_map_phys_addr >> 32;
+ ei->efi_memmap_size = efi_map_sz;
+
+ return 0;
+}
+
+static int
+prepare_add_efi_setup_data(struct boot_params *params,
+ unsigned long params_load_addr,
+ unsigned int efi_setup_data_offset)
+{
+ unsigned long setup_data_phys;
+ struct setup_data *sd = (void *)params + efi_setup_data_offset;
+ struct efi_setup_data *esd = (void *)sd + sizeof(struct setup_data);
+
+ esd->fw_vendor = efi_fw_vendor;
+ esd->tables = efi_config_table;
+ esd->smbios = efi.smbios;
+
+ sd->type = SETUP_EFI;
+ sd->len = sizeof(struct efi_setup_data);
+
+ /* Add setup data */
+ setup_data_phys = params_load_addr + efi_setup_data_offset;
+ sd->next = params->hdr.setup_data;
+ params->hdr.setup_data = setup_data_phys;
+
+ return 0;
+}
+
+static int
+setup_efi_state(struct boot_params *params, unsigned long params_load_addr,
+ unsigned int efi_map_offset, unsigned int efi_map_sz,
+ unsigned int efi_setup_data_offset)
+{
+ struct efi_info *current_ei = &boot_params.efi_info;
+ struct efi_info *ei = &params->efi_info;
+
+ if (!efi_enabled(EFI_RUNTIME_SERVICES))
+ return 0;
+
+ if (!current_ei->efi_memmap_size)
+ return 0;
+
+ params->secure_boot = boot_params.secure_boot;
+ ei->efi_loader_signature = current_ei->efi_loader_signature;
+ ei->efi_systab = current_ei->efi_systab;
+ ei->efi_systab_hi = current_ei->efi_systab_hi;
+
+ ei->efi_memdesc_version = current_ei->efi_memdesc_version;
+ ei->efi_memdesc_size = efi_get_runtime_map_desc_size();
+
+ setup_efi_info_memmap(params, params_load_addr, efi_map_offset,
+ efi_map_sz);
+ prepare_add_efi_setup_data(params, params_load_addr,
+ efi_setup_data_offset);
+ return 0;
+}
+#endif /* CONFIG_EFI */
+
+#ifdef CONFIG_OF_FLATTREE
+static void setup_dtb(struct boot_params *params,
+ unsigned long params_load_addr,
+ unsigned int dtb_setup_data_offset)
+{
+ struct setup_data *sd = (void *)params + dtb_setup_data_offset;
+ unsigned long setup_data_phys, dtb_len;
+
+ dtb_len = fdt_totalsize(initial_boot_params);
+ sd->type = SETUP_DTB;
+ sd->len = dtb_len;
+
+ /* Carry over current boot DTB with setup_data */
+ memcpy(sd->data, initial_boot_params, dtb_len);
+
+ /* Add setup data */
+ setup_data_phys = params_load_addr + dtb_setup_data_offset;
+ sd->next = params->hdr.setup_data;
+ params->hdr.setup_data = setup_data_phys;
+}
+#endif /* CONFIG_OF_FLATTREE */
+
+static void
+setup_ima_state(const struct kimage *image, struct boot_params *params,
+ unsigned long params_load_addr,
+ unsigned int ima_setup_data_offset)
+{
+#ifdef CONFIG_IMA_KEXEC
+ struct setup_data *sd = (void *)params + ima_setup_data_offset;
+ unsigned long setup_data_phys;
+ struct ima_setup_data *ima;
+
+ if (!image->ima_buffer_size)
+ return;
+
+ sd->type = SETUP_IMA;
+ sd->len = sizeof(*ima);
+
+ ima = (void *)sd + sizeof(struct setup_data);
+ ima->addr = image->ima_buffer_addr;
+ ima->size = image->ima_buffer_size;
+
+ /* Add setup data */
+ setup_data_phys = params_load_addr + ima_setup_data_offset;
+ sd->next = params->hdr.setup_data;
+ params->hdr.setup_data = setup_data_phys;
+#endif /* CONFIG_IMA_KEXEC */
+}
+
+static void setup_kho(const struct kimage *image, struct boot_params *params,
+ unsigned long params_load_addr,
+ unsigned int setup_data_offset)
+{
+ struct setup_data *sd = (void *)params + setup_data_offset;
+ struct kho_data *kho = (void *)sd + sizeof(*sd);
+
+ if (!IS_ENABLED(CONFIG_KEXEC_HANDOVER))
+ return;
+
+ sd->type = SETUP_KEXEC_KHO;
+ sd->len = sizeof(struct kho_data);
+
+ /* Only add if we have all KHO images in place */
+ if (!image->kho.fdt || !image->kho.scratch)
+ return;
+
+ /* Add setup data */
+ kho->fdt_addr = image->kho.fdt;
+ kho->fdt_size = PAGE_SIZE;
+ kho->scratch_addr = image->kho.scratch->mem;
+ kho->scratch_size = image->kho.scratch->bufsz;
+ sd->next = params->hdr.setup_data;
+ params->hdr.setup_data = params_load_addr + setup_data_offset;
+}
+
+static int
+setup_boot_parameters(struct kimage *image, struct boot_params *params,
+ unsigned long params_load_addr,
+ unsigned int efi_map_offset, unsigned int efi_map_sz,
+ unsigned int setup_data_offset)
+{
+ unsigned int nr_e820_entries;
+ unsigned long long mem_k, start, end;
+ int i, ret = 0;
+
+ /* Get subarch from existing bootparams */
+ params->hdr.hardware_subarch = boot_params.hdr.hardware_subarch;
+
+ /* Copying screen_info will do? */
+ memcpy(&params->screen_info, &screen_info, sizeof(struct screen_info));
+
+ /* Fill in memsize later */
+ params->screen_info.ext_mem_k = 0;
+ params->alt_mem_k = 0;
+
+ /* Always fill in RSDP: it is either 0 or a valid value */
+ params->acpi_rsdp_addr = boot_params.acpi_rsdp_addr;
+
+ /* Default APM info */
+ memset(&params->apm_bios_info, 0, sizeof(params->apm_bios_info));
+
+ /* Default drive info */
+ memset(&params->hd0_info, 0, sizeof(params->hd0_info));
+ memset(&params->hd1_info, 0, sizeof(params->hd1_info));
+
+#ifdef CONFIG_CRASH_DUMP
+ if (image->type == KEXEC_TYPE_CRASH) {
+ ret = crash_setup_memmap_entries(image, params);
+ if (ret)
+ return ret;
+ } else
+#endif
+ setup_e820_entries(params);
+
+ nr_e820_entries = params->e820_entries;
+
+ kexec_dprintk("E820 memmap:\n");
+ for (i = 0; i < nr_e820_entries; i++) {
+ kexec_dprintk("%016llx-%016llx (%d)\n",
+ params->e820_table[i].addr,
+ params->e820_table[i].addr + params->e820_table[i].size - 1,
+ params->e820_table[i].type);
+ if (params->e820_table[i].type != E820_TYPE_RAM)
+ continue;
+ start = params->e820_table[i].addr;
+ end = params->e820_table[i].addr + params->e820_table[i].size - 1;
+
+ if ((start <= 0x100000) && end > 0x100000) {
+ mem_k = (end >> 10) - (0x100000 >> 10);
+ params->screen_info.ext_mem_k = mem_k;
+ params->alt_mem_k = mem_k;
+ if (mem_k > 0xfc00)
+ params->screen_info.ext_mem_k = 0xfc00; /* 64M*/
+ if (mem_k > 0xffffffff)
+ params->alt_mem_k = 0xffffffff;
+ }
+ }
+
+#ifdef CONFIG_EFI
+ /* Setup EFI state */
+ setup_efi_state(params, params_load_addr, efi_map_offset, efi_map_sz,
+ setup_data_offset);
+ setup_data_offset += sizeof(struct setup_data) +
+ sizeof(struct efi_setup_data);
+#endif
+
+#ifdef CONFIG_OF_FLATTREE
+ if (image->force_dtb && initial_boot_params) {
+ setup_dtb(params, params_load_addr, setup_data_offset);
+ setup_data_offset += sizeof(struct setup_data) +
+ fdt_totalsize(initial_boot_params);
+ } else {
+ pr_debug("Not carrying over DTB, force_dtb = %d\n",
+ image->force_dtb);
+ }
+#endif
+
+ if (IS_ENABLED(CONFIG_IMA_KEXEC)) {
+ /* Setup IMA log buffer state */
+ setup_ima_state(image, params, params_load_addr,
+ setup_data_offset);
+ setup_data_offset += sizeof(struct setup_data) +
+ sizeof(struct ima_setup_data);
+ }
+
+ if (IS_ENABLED(CONFIG_KEXEC_HANDOVER)) {
+ /* Setup space to store preservation metadata */
+ setup_kho(image, params, params_load_addr, setup_data_offset);
+ setup_data_offset += sizeof(struct setup_data) +
+ sizeof(struct kho_data);
+ }
+
+ /* Setup RNG seed */
+ setup_rng_seed(params, params_load_addr, setup_data_offset);
+
+ /* Setup EDD info */
+ memcpy(params->eddbuf, boot_params.eddbuf,
+ EDDMAXNR * sizeof(struct edd_info));
+ params->eddbuf_entries = boot_params.eddbuf_entries;
+
+ memcpy(params->edd_mbr_sig_buffer, boot_params.edd_mbr_sig_buffer,
+ EDD_MBR_SIG_MAX * sizeof(unsigned int));
+
+ return ret;
+}
+
+static int bzImage64_probe(const char *buf, unsigned long len)
+{
+ int ret = -ENOEXEC;
+ struct setup_header *header;
+
+ /* kernel should be at least two sectors long */
+ if (len < 2 * 512) {
+ pr_err("File is too short to be a bzImage\n");
+ return ret;
+ }
+
+ header = (struct setup_header *)(buf + offsetof(struct boot_params, hdr));
+ if (memcmp((char *)&header->header, "HdrS", 4) != 0) {
+ pr_err("Not a bzImage\n");
+ return ret;
+ }
+
+ if (header->boot_flag != 0xAA55) {
+ pr_err("No x86 boot sector present\n");
+ return ret;
+ }
+
+ if (header->version < 0x020C) {
+ pr_err("Must be at least protocol version 2.12\n");
+ return ret;
+ }
+
+ if (!(header->loadflags & LOADED_HIGH)) {
+ pr_err("zImage not a bzImage\n");
+ return ret;
+ }
+
+ if (!(header->xloadflags & XLF_KERNEL_64)) {
+ pr_err("Not a bzImage64. XLF_KERNEL_64 is not set.\n");
+ return ret;
+ }
+
+ if (!(header->xloadflags & XLF_CAN_BE_LOADED_ABOVE_4G)) {
+ pr_err("XLF_CAN_BE_LOADED_ABOVE_4G is not set.\n");
+ return ret;
+ }
+
+ /*
+ * Can't handle 32-bit EFI as it does not allow loading the kernel
+ * above 4G. This should be handled by a 32-bit bzImage loader.
+ */
+ if (efi_enabled(EFI_RUNTIME_SERVICES) && !efi_enabled(EFI_64BIT)) {
+ pr_debug("EFI is 32 bit. Can't load kernel above 4G.\n");
+ return ret;
+ }
+
+ if (!(header->xloadflags & XLF_5LEVEL) && pgtable_l5_enabled()) {
+ pr_err("bzImage cannot handle 5-level paging mode.\n");
+ return ret;
+ }
+
+ /* I've got a bzImage */
+ pr_debug("It's a relocatable bzImage64\n");
+ ret = 0;
+
+ return ret;
+}
+
+static void *bzImage64_load(struct kimage *image, char *kernel,
+ unsigned long kernel_len, char *initrd,
+ unsigned long initrd_len, char *cmdline,
+ unsigned long cmdline_len)
+{
+
+ struct setup_header *header;
+ int setup_sects, kern16_size, ret = 0;
+ unsigned long setup_header_size, params_cmdline_sz;
+ struct boot_params *params;
+ unsigned long bootparam_load_addr, kernel_load_addr, initrd_load_addr;
+ struct bzimage64_data *ldata;
+ struct kexec_entry64_regs regs64;
+ void *stack;
+ unsigned int setup_hdr_offset = offsetof(struct boot_params, hdr);
+ unsigned int efi_map_offset, efi_map_sz, efi_setup_data_offset;
+ struct kexec_buf kbuf = { .image = image, .buf_max = ULONG_MAX,
+ .top_down = true };
+ struct kexec_buf pbuf = { .image = image, .buf_min = MIN_PURGATORY_ADDR,
+ .buf_max = ULONG_MAX, .top_down = true };
+
+ header = (struct setup_header *)(kernel + setup_hdr_offset);
+ setup_sects = header->setup_sects;
+ if (setup_sects == 0)
+ setup_sects = 4;
+
+ kern16_size = (setup_sects + 1) * 512;
+ if (kernel_len < kern16_size) {
+ pr_err("bzImage truncated\n");
+ return ERR_PTR(-ENOEXEC);
+ }
+
+ if (cmdline_len > header->cmdline_size) {
+ pr_err("Kernel command line too long\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ /*
+ * In case of crash dump, we will append elfcorehdr=<addr> to
+ * command line. Make sure it does not overflow
+ */
+ if (cmdline_len + MAX_ELFCOREHDR_STR_LEN > header->cmdline_size) {
+ pr_err("Appending elfcorehdr=<addr> to command line exceeds maximum allowed length\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+#ifdef CONFIG_CRASH_DUMP
+ /* Allocate and load backup region */
+ if (image->type == KEXEC_TYPE_CRASH) {
+ ret = crash_load_segments(image);
+ if (ret)
+ return ERR_PTR(ret);
+ ret = crash_load_dm_crypt_keys(image);
+ if (ret == -ENOENT) {
+ kexec_dprintk("No dm crypt key to load\n");
+ } else if (ret) {
+ pr_err("Failed to load dm crypt keys\n");
+ return ERR_PTR(ret);
+ }
+ if (image->dm_crypt_keys_addr &&
+ cmdline_len + MAX_ELFCOREHDR_STR_LEN + MAX_DMCRYPTKEYS_STR_LEN >
+ header->cmdline_size) {
+ pr_err("Appending dmcryptkeys=<addr> to command line exceeds maximum allowed length\n");
+ return ERR_PTR(-EINVAL);
+ }
+ }
+#endif
+
+ /*
+ * Load purgatory. For 64bit entry point, purgatory code can be
+ * anywhere.
+ */
+ ret = kexec_load_purgatory(image, &pbuf);
+ if (ret) {
+ pr_err("Loading purgatory failed\n");
+ return ERR_PTR(ret);
+ }
+
+ kexec_dprintk("Loaded purgatory at 0x%lx\n", pbuf.mem);
+
+
+ /*
+ * Load Bootparams and cmdline and space for efi stuff.
+ *
+ * Allocate memory together for multiple data structures so
+ * that they can all go in a single area/segment and we don't
+ * have to create a separate segment for each. Keeps things a
+ * little bit simpler.
+ */
+ efi_map_sz = efi_get_runtime_map_size();
+ params_cmdline_sz = sizeof(struct boot_params) + cmdline_len +
+ MAX_ELFCOREHDR_STR_LEN;
+ if (image->dm_crypt_keys_addr)
+ params_cmdline_sz += MAX_DMCRYPTKEYS_STR_LEN;
+ params_cmdline_sz = ALIGN(params_cmdline_sz, 16);
+ kbuf.bufsz = params_cmdline_sz + ALIGN(efi_map_sz, 16) +
+ sizeof(struct setup_data) +
+ sizeof(struct efi_setup_data) +
+ sizeof(struct setup_data) +
+ RNG_SEED_LENGTH;
+
+#ifdef CONFIG_OF_FLATTREE
+ if (image->force_dtb && initial_boot_params)
+ kbuf.bufsz += sizeof(struct setup_data) +
+ fdt_totalsize(initial_boot_params);
+#endif
+
+ if (IS_ENABLED(CONFIG_IMA_KEXEC))
+ kbuf.bufsz += sizeof(struct setup_data) +
+ sizeof(struct ima_setup_data);
+
+ if (IS_ENABLED(CONFIG_KEXEC_HANDOVER))
+ kbuf.bufsz += sizeof(struct setup_data) +
+ sizeof(struct kho_data);
+
+ params = kvzalloc(kbuf.bufsz, GFP_KERNEL);
+ if (!params)
+ return ERR_PTR(-ENOMEM);
+ efi_map_offset = params_cmdline_sz;
+ efi_setup_data_offset = efi_map_offset + ALIGN(efi_map_sz, 16);
+
+ /* Copy setup header onto bootparams. Documentation/arch/x86/boot.rst */
+ setup_header_size = 0x0202 + kernel[0x0201] - setup_hdr_offset;
+
+ /* Is there a limit on setup header size? */
+ memcpy(&params->hdr, (kernel + setup_hdr_offset), setup_header_size);
+
+ kbuf.buffer = params;
+ kbuf.memsz = kbuf.bufsz;
+ kbuf.buf_align = 16;
+ kbuf.buf_min = MIN_BOOTPARAM_ADDR;
+ ret = kexec_add_buffer(&kbuf);
+ if (ret)
+ goto out_free_params;
+ bootparam_load_addr = kbuf.mem;
+ kexec_dprintk("Loaded boot_param, command line and misc at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ bootparam_load_addr, kbuf.bufsz, kbuf.memsz);
+
+ /* Load kernel */
+ kbuf.buffer = kernel + kern16_size;
+ kbuf.bufsz = kernel_len - kern16_size;
+ kbuf.memsz = PAGE_ALIGN(header->init_size);
+ kbuf.buf_align = header->kernel_alignment;
+ if (header->pref_address < MIN_KERNEL_LOAD_ADDR)
+ kbuf.buf_min = MIN_KERNEL_LOAD_ADDR;
+ else
+ kbuf.buf_min = header->pref_address;
+ kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
+ ret = kexec_add_buffer(&kbuf);
+ if (ret)
+ goto out_free_params;
+ kernel_load_addr = kbuf.mem;
+
+ kexec_dprintk("Loaded 64bit kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ kernel_load_addr, kbuf.bufsz, kbuf.memsz);
+
+ /* Load initrd high */
+ if (initrd) {
+ kbuf.buffer = initrd;
+ kbuf.bufsz = kbuf.memsz = initrd_len;
+ kbuf.buf_align = PAGE_SIZE;
+ kbuf.buf_min = MIN_INITRD_LOAD_ADDR;
+ kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
+ ret = kexec_add_buffer(&kbuf);
+ if (ret)
+ goto out_free_params;
+ initrd_load_addr = kbuf.mem;
+
+ kexec_dprintk("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ initrd_load_addr, initrd_len, initrd_len);
+
+ setup_initrd(params, initrd_load_addr, initrd_len);
+ }
+
+ setup_cmdline(image, params, bootparam_load_addr,
+ sizeof(struct boot_params), cmdline, cmdline_len);
+
+ /* bootloader info. Do we need a separate ID for kexec kernel loader? */
+ params->hdr.type_of_loader = 0x0D << 4;
+ params->hdr.loadflags = 0;
+
+ /* Setup purgatory regs for entry */
+ ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", &regs64,
+ sizeof(regs64), 1);
+ if (ret)
+ goto out_free_params;
+
+ regs64.rbx = 0; /* Bootstrap Processor */
+ regs64.rsi = bootparam_load_addr;
+ regs64.rip = kernel_load_addr + 0x200;
+ stack = kexec_purgatory_get_symbol_addr(image, "stack_end");
+ if (IS_ERR(stack)) {
+ pr_err("Could not find address of symbol stack_end\n");
+ ret = -EINVAL;
+ goto out_free_params;
+ }
+
+ regs64.rsp = (unsigned long)stack;
+ ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", &regs64,
+ sizeof(regs64), 0);
+ if (ret)
+ goto out_free_params;
+
+ ret = setup_boot_parameters(image, params, bootparam_load_addr,
+ efi_map_offset, efi_map_sz,
+ efi_setup_data_offset);
+ if (ret)
+ goto out_free_params;
+
+ /* Allocate loader specific data */
+ ldata = kzalloc(sizeof(struct bzimage64_data), GFP_KERNEL);
+ if (!ldata) {
+ ret = -ENOMEM;
+ goto out_free_params;
+ }
+
+ /*
+ * Store a pointer to params so that it can be freed after the
+ * params segment has been loaded and its contents have been copied
+ * somewhere else.
+ */
+ ldata->bootparams_buf = params;
+ return ldata;
+
+out_free_params:
+ kvfree(params);
+ return ERR_PTR(ret);
+}
+
+/* This cleanup function is called after various segments have been loaded */
+static int bzImage64_cleanup(void *loader_data)
+{
+ struct bzimage64_data *ldata = loader_data;
+
+ if (!ldata)
+ return 0;
+
+ kvfree(ldata->bootparams_buf);
+ ldata->bootparams_buf = NULL;
+
+ return 0;
+}
+
+const struct kexec_file_ops kexec_bzImage64_ops = {
+ .probe = bzImage64_probe,
+ .load = bzImage64_load,
+ .cleanup = bzImage64_cleanup,
+#ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG
+ .verify_sig = kexec_kernel_verify_pe_sig,
+#endif
+};
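
bzImage64_probe() relies entirely on fields of the x86 boot protocol header embedded in the image: the 0xAA55 boot-sector signature, the "HdrS" magic, the protocol version and the load flags. The same sanity checks can be reproduced from user space; the sketch below reads those fields at their documented offsets (per Documentation/arch/x86/boot.rst) from a bzImage file:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(int argc, char **argv)
{
	uint8_t hdr[1024];
	uint16_t boot_flag, version;
	FILE *f;

	if (argc != 2)
		return 1;
	f = fopen(argv[1], "rb");
	if (!f || fread(hdr, 1, sizeof(hdr), f) != sizeof(hdr))
		return 1;
	fclose(f);

	memcpy(&boot_flag, hdr + 0x1fe, 2);	/* boot sector signature, must be 0xAA55 */
	memcpy(&version, hdr + 0x206, 2);	/* boot protocol version */

	if (memcmp(hdr + 0x202, "HdrS", 4) || boot_flag != 0xAA55) {
		fprintf(stderr, "not a bzImage\n");
		return 1;
	}
	printf("bzImage, boot protocol %u.%02u, loadflags=%#x\n",
	       version >> 8, version & 0xff, hdr[0x211]);
	return 0;
}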
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 8d82a77a3f3b..8b1a9733d13e 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -1,14 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2, or (at your option) any
- * later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
*/
/*
@@ -26,7 +17,7 @@
* Updated by: Tom Rini <trini@kernel.crashing.org>
* Updated by: Jason Wessel <jason.wessel@windriver.com>
* Modified for 386 by Jim Kingdon, Cygnus Support.
- * Origianl kgdb, compatibility with 2.1.xx kernel by
+ * Original kgdb, compatibility with 2.1.xx kernel by
* David Grothe <dave@gcom.com>
* Integrated into 2.2.5 kernel by Tigran Aivazian <tigran@sco.com>
* X86_64 changes from Andi Kleen's patch merged by Jim Houston
@@ -39,70 +30,101 @@
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/kgdb.h>
-#include <linux/init.h>
#include <linux/smp.h>
#include <linux/nmi.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/uaccess.h>
+#include <linux/memory.h>
+#include <asm/text-patching.h>
+#include <asm/debugreg.h>
#include <asm/apicdef.h>
-#include <asm/system.h>
-
#include <asm/apic.h>
+#include <asm/nmi.h>
+#include <asm/switch_to.h>
-/*
- * Put the error code here just in case the user cares:
- */
-static int gdb_x86errcode;
-
-/*
- * Likewise, the vector number here (since GDB only gets the signal
- * number through the usual means, and that's not very specific):
- */
-static int gdb_x86vector = -1;
+struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
+{
+#ifdef CONFIG_X86_32
+ { "ax", 4, offsetof(struct pt_regs, ax) },
+ { "cx", 4, offsetof(struct pt_regs, cx) },
+ { "dx", 4, offsetof(struct pt_regs, dx) },
+ { "bx", 4, offsetof(struct pt_regs, bx) },
+ { "sp", 4, offsetof(struct pt_regs, sp) },
+ { "bp", 4, offsetof(struct pt_regs, bp) },
+ { "si", 4, offsetof(struct pt_regs, si) },
+ { "di", 4, offsetof(struct pt_regs, di) },
+ { "ip", 4, offsetof(struct pt_regs, ip) },
+ { "flags", 4, offsetof(struct pt_regs, flags) },
+ { "cs", 4, offsetof(struct pt_regs, cs) },
+ { "ss", 4, offsetof(struct pt_regs, ss) },
+ { "ds", 4, offsetof(struct pt_regs, ds) },
+ { "es", 4, offsetof(struct pt_regs, es) },
+#else
+ { "ax", 8, offsetof(struct pt_regs, ax) },
+ { "bx", 8, offsetof(struct pt_regs, bx) },
+ { "cx", 8, offsetof(struct pt_regs, cx) },
+ { "dx", 8, offsetof(struct pt_regs, dx) },
+ { "si", 8, offsetof(struct pt_regs, si) },
+ { "di", 8, offsetof(struct pt_regs, di) },
+ { "bp", 8, offsetof(struct pt_regs, bp) },
+ { "sp", 8, offsetof(struct pt_regs, sp) },
+ { "r8", 8, offsetof(struct pt_regs, r8) },
+ { "r9", 8, offsetof(struct pt_regs, r9) },
+ { "r10", 8, offsetof(struct pt_regs, r10) },
+ { "r11", 8, offsetof(struct pt_regs, r11) },
+ { "r12", 8, offsetof(struct pt_regs, r12) },
+ { "r13", 8, offsetof(struct pt_regs, r13) },
+ { "r14", 8, offsetof(struct pt_regs, r14) },
+ { "r15", 8, offsetof(struct pt_regs, r15) },
+ { "ip", 8, offsetof(struct pt_regs, ip) },
+ { "flags", 4, offsetof(struct pt_regs, flags) },
+ { "cs", 4, offsetof(struct pt_regs, cs) },
+ { "ss", 4, offsetof(struct pt_regs, ss) },
+ { "ds", 4, -1 },
+ { "es", 4, -1 },
+#endif
+ { "fs", 4, -1 },
+ { "gs", 4, -1 },
+};
-/**
- * pt_regs_to_gdb_regs - Convert ptrace regs to GDB regs
- * @gdb_regs: A pointer to hold the registers in the order GDB wants.
- * @regs: The &struct pt_regs of the current process.
- *
- * Convert the pt_regs in @regs into the format for registers that
- * GDB expects, stored in @gdb_regs.
- */
-void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
+int dbg_set_reg(int regno, void *mem, struct pt_regs *regs)
{
-#ifndef CONFIG_X86_32
- u32 *gdb_regs32 = (u32 *)gdb_regs;
+ if (
+#ifdef CONFIG_X86_32
+ regno == GDB_SS || regno == GDB_FS || regno == GDB_GS ||
#endif
- gdb_regs[GDB_AX] = regs->ax;
- gdb_regs[GDB_BX] = regs->bx;
- gdb_regs[GDB_CX] = regs->cx;
- gdb_regs[GDB_DX] = regs->dx;
- gdb_regs[GDB_SI] = regs->si;
- gdb_regs[GDB_DI] = regs->di;
- gdb_regs[GDB_BP] = regs->bp;
- gdb_regs[GDB_PC] = regs->ip;
+ regno == GDB_SP || regno == GDB_ORIG_AX)
+ return 0;
+
+ if (dbg_reg_def[regno].offset != -1)
+ memcpy((void *)regs + dbg_reg_def[regno].offset, mem,
+ dbg_reg_def[regno].size);
+ return 0;
+}
+
+char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
+{
+ if (regno == GDB_ORIG_AX) {
+ memcpy(mem, &regs->orig_ax, sizeof(regs->orig_ax));
+ return "orig_ax";
+ }
+ if (regno >= DBG_MAX_REG_NUM || regno < 0)
+ return NULL;
+
+ if (dbg_reg_def[regno].offset != -1)
+ memcpy(mem, (void *)regs + dbg_reg_def[regno].offset,
+ dbg_reg_def[regno].size);
+
#ifdef CONFIG_X86_32
- gdb_regs[GDB_PS] = regs->flags;
- gdb_regs[GDB_DS] = regs->ds;
- gdb_regs[GDB_ES] = regs->es;
- gdb_regs[GDB_CS] = regs->cs;
- gdb_regs[GDB_SS] = __KERNEL_DS;
- gdb_regs[GDB_FS] = 0xFFFF;
- gdb_regs[GDB_GS] = 0xFFFF;
- gdb_regs[GDB_SP] = (int)&regs->sp;
-#else
- gdb_regs[GDB_R8] = regs->r8;
- gdb_regs[GDB_R9] = regs->r9;
- gdb_regs[GDB_R10] = regs->r10;
- gdb_regs[GDB_R11] = regs->r11;
- gdb_regs[GDB_R12] = regs->r12;
- gdb_regs[GDB_R13] = regs->r13;
- gdb_regs[GDB_R14] = regs->r14;
- gdb_regs[GDB_R15] = regs->r15;
- gdb_regs32[GDB_PS] = regs->flags;
- gdb_regs32[GDB_CS] = regs->cs;
- gdb_regs32[GDB_SS] = regs->ss;
- gdb_regs[GDB_SP] = regs->sp;
+ switch (regno) {
+ case GDB_GS:
+ case GDB_FS:
+ *(unsigned long *)mem = 0xFFFF;
+ break;
+ }
#endif
+ return dbg_reg_def[regno].name;
}
/**
@@ -128,21 +150,19 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
gdb_regs[GDB_DX] = 0;
gdb_regs[GDB_SI] = 0;
gdb_regs[GDB_DI] = 0;
- gdb_regs[GDB_BP] = *(unsigned long *)p->thread.sp;
+ gdb_regs[GDB_BP] = ((struct inactive_task_frame *)p->thread.sp)->bp;
#ifdef CONFIG_X86_32
gdb_regs[GDB_DS] = __KERNEL_DS;
gdb_regs[GDB_ES] = __KERNEL_DS;
gdb_regs[GDB_PS] = 0;
gdb_regs[GDB_CS] = __KERNEL_CS;
- gdb_regs[GDB_PC] = p->thread.ip;
gdb_regs[GDB_SS] = __KERNEL_DS;
gdb_regs[GDB_FS] = 0xFFFF;
gdb_regs[GDB_GS] = 0xFFFF;
#else
- gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8);
+ gdb_regs32[GDB_PS] = 0;
gdb_regs32[GDB_CS] = __KERNEL_CS;
gdb_regs32[GDB_SS] = __KERNEL_DS;
- gdb_regs[GDB_PC] = 0;
gdb_regs[GDB_R8] = 0;
gdb_regs[GDB_R9] = 0;
gdb_regs[GDB_R10] = 0;
@@ -152,87 +172,104 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
gdb_regs[GDB_R14] = 0;
gdb_regs[GDB_R15] = 0;
#endif
+ gdb_regs[GDB_PC] = 0;
gdb_regs[GDB_SP] = p->thread.sp;
}
-/**
- * gdb_regs_to_pt_regs - Convert GDB regs to ptrace regs.
- * @gdb_regs: A pointer to hold the registers we've received from GDB.
- * @regs: A pointer to a &struct pt_regs to hold these values in.
- *
- * Convert the GDB regs in @gdb_regs into the pt_regs, and store them
- * in @regs.
- */
-void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
-{
-#ifndef CONFIG_X86_32
- u32 *gdb_regs32 = (u32 *)gdb_regs;
-#endif
- regs->ax = gdb_regs[GDB_AX];
- regs->bx = gdb_regs[GDB_BX];
- regs->cx = gdb_regs[GDB_CX];
- regs->dx = gdb_regs[GDB_DX];
- regs->si = gdb_regs[GDB_SI];
- regs->di = gdb_regs[GDB_DI];
- regs->bp = gdb_regs[GDB_BP];
- regs->ip = gdb_regs[GDB_PC];
-#ifdef CONFIG_X86_32
- regs->flags = gdb_regs[GDB_PS];
- regs->ds = gdb_regs[GDB_DS];
- regs->es = gdb_regs[GDB_ES];
- regs->cs = gdb_regs[GDB_CS];
-#else
- regs->r8 = gdb_regs[GDB_R8];
- regs->r9 = gdb_regs[GDB_R9];
- regs->r10 = gdb_regs[GDB_R10];
- regs->r11 = gdb_regs[GDB_R11];
- regs->r12 = gdb_regs[GDB_R12];
- regs->r13 = gdb_regs[GDB_R13];
- regs->r14 = gdb_regs[GDB_R14];
- regs->r15 = gdb_regs[GDB_R15];
- regs->flags = gdb_regs32[GDB_PS];
- regs->cs = gdb_regs32[GDB_CS];
- regs->ss = gdb_regs32[GDB_SS];
-#endif
-}
-
static struct hw_breakpoint {
unsigned enabled;
- unsigned type;
- unsigned len;
unsigned long addr;
-} breakinfo[4];
+ int len;
+ int type;
+ struct perf_event * __percpu *pev;
+} breakinfo[HBP_NUM];
+
+static unsigned long early_dr7;
static void kgdb_correct_hw_break(void)
{
- unsigned long dr7;
- int correctit = 0;
- int breakbit;
int breakno;
- get_debugreg(dr7, 7);
- for (breakno = 0; breakno < 4; breakno++) {
- breakbit = 2 << (breakno << 1);
- if (!(dr7 & breakbit) && breakinfo[breakno].enabled) {
- correctit = 1;
- dr7 |= breakbit;
- dr7 &= ~(0xf0000 << (breakno << 2));
- dr7 |= ((breakinfo[breakno].len << 2) |
- breakinfo[breakno].type) <<
- ((breakno << 2) + 16);
- if (breakno >= 0 && breakno <= 3)
- set_debugreg(breakinfo[breakno].addr, breakno);
-
- } else {
- if ((dr7 & breakbit) && !breakinfo[breakno].enabled) {
- correctit = 1;
- dr7 &= ~breakbit;
- dr7 &= ~(0xf0000 << (breakno << 2));
- }
+ for (breakno = 0; breakno < HBP_NUM; breakno++) {
+ struct perf_event *bp;
+ struct arch_hw_breakpoint *info;
+ int val;
+ int cpu = raw_smp_processor_id();
+ if (!breakinfo[breakno].enabled)
+ continue;
+ if (dbg_is_early) {
+ set_debugreg(breakinfo[breakno].addr, breakno);
+ early_dr7 |= encode_dr7(breakno,
+ breakinfo[breakno].len,
+ breakinfo[breakno].type);
+ set_debugreg(early_dr7, 7);
+ continue;
}
+ bp = *per_cpu_ptr(breakinfo[breakno].pev, cpu);
+ info = counter_arch_bp(bp);
+ if (bp->attr.disabled != 1)
+ continue;
+ bp->attr.bp_addr = breakinfo[breakno].addr;
+ bp->attr.bp_len = breakinfo[breakno].len;
+ bp->attr.bp_type = breakinfo[breakno].type;
+ info->address = breakinfo[breakno].addr;
+ info->len = breakinfo[breakno].len;
+ info->type = breakinfo[breakno].type;
+ val = arch_install_hw_breakpoint(bp);
+ if (!val)
+ bp->attr.disabled = 0;
}
- if (correctit)
- set_debugreg(dr7, 7);
+ if (!dbg_is_early)
+ hw_breakpoint_restore();
+}
+
+static int hw_break_reserve_slot(int breakno)
+{
+ int cpu;
+ int cnt = 0;
+ struct perf_event **pevent;
+
+ if (dbg_is_early)
+ return 0;
+
+ for_each_online_cpu(cpu) {
+ cnt++;
+ pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
+ if (dbg_reserve_bp_slot(*pevent))
+ goto fail;
+ }
+
+ return 0;
+
+fail:
+ for_each_online_cpu(cpu) {
+ cnt--;
+ if (!cnt)
+ break;
+ pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
+ dbg_release_bp_slot(*pevent);
+ }
+ return -1;
+}
+
+static int hw_break_release_slot(int breakno)
+{
+ struct perf_event **pevent;
+ int cpu;
+
+ if (dbg_is_early)
+ return 0;
+
+ for_each_online_cpu(cpu) {
+ pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
+ if (dbg_release_bp_slot(*pevent))
+ /*
+			 * The debugger is responsible for handling the retry on
+ * remove failure.
+ */
+ return -1;
+ }
+ return 0;
}
static int
@@ -240,12 +277,16 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
{
int i;
- for (i = 0; i < 4; i++)
+ for (i = 0; i < HBP_NUM; i++)
if (breakinfo[i].addr == addr && breakinfo[i].enabled)
break;
- if (i == 4)
+ if (i == HBP_NUM)
return -1;
+ if (hw_break_release_slot(i)) {
+ printk(KERN_ERR "Cannot remove hw breakpoint at %lx\n", addr);
+ return -1;
+ }
breakinfo[i].enabled = 0;
return 0;
@@ -254,46 +295,77 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
static void kgdb_remove_all_hw_break(void)
{
int i;
+ int cpu = raw_smp_processor_id();
+ struct perf_event *bp;
- for (i = 0; i < 4; i++)
- memset(&breakinfo[i], 0, sizeof(struct hw_breakpoint));
+ for (i = 0; i < HBP_NUM; i++) {
+ if (!breakinfo[i].enabled)
+ continue;
+ bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
+ if (!bp->attr.disabled) {
+ arch_uninstall_hw_breakpoint(bp);
+ bp->attr.disabled = 1;
+ continue;
+ }
+ if (dbg_is_early)
+ early_dr7 &= ~encode_dr7(i, breakinfo[i].len,
+ breakinfo[i].type);
+ else if (hw_break_release_slot(i))
+ printk(KERN_ERR "KGDB: hw bpt remove failed %lx\n",
+ breakinfo[i].addr);
+ breakinfo[i].enabled = 0;
+ }
}
static int
kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
{
- unsigned type;
int i;
- for (i = 0; i < 4; i++)
+ for (i = 0; i < HBP_NUM; i++)
if (!breakinfo[i].enabled)
break;
- if (i == 4)
+ if (i == HBP_NUM)
return -1;
switch (bptype) {
case BP_HARDWARE_BREAKPOINT:
- type = 0;
- len = 1;
+ len = 1;
+ breakinfo[i].type = X86_BREAKPOINT_EXECUTE;
break;
case BP_WRITE_WATCHPOINT:
- type = 1;
+ breakinfo[i].type = X86_BREAKPOINT_WRITE;
break;
case BP_ACCESS_WATCHPOINT:
- type = 3;
+ breakinfo[i].type = X86_BREAKPOINT_RW;
break;
default:
return -1;
}
-
- if (len == 1 || len == 2 || len == 4)
- breakinfo[i].len = len - 1;
- else
+ switch (len) {
+ case 1:
+ breakinfo[i].len = X86_BREAKPOINT_LEN_1;
+ break;
+ case 2:
+ breakinfo[i].len = X86_BREAKPOINT_LEN_2;
+ break;
+ case 4:
+ breakinfo[i].len = X86_BREAKPOINT_LEN_4;
+ break;
+#ifdef CONFIG_X86_64
+ case 8:
+ breakinfo[i].len = X86_BREAKPOINT_LEN_8;
+ break;
+#endif
+ default:
return -1;
-
- breakinfo[i].enabled = 1;
+ }
breakinfo[i].addr = addr;
- breakinfo[i].type = type;
+ if (hw_break_reserve_slot(i)) {
+ breakinfo[i].addr = 0;
+ return -1;
+ }
+ breakinfo[i].enabled = 1;
return 0;
}
@@ -306,60 +378,56 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
* disable hardware debugging while it is processing gdb packets or
* handling exception.
*/
-void kgdb_disable_hw_debug(struct pt_regs *regs)
+static void kgdb_disable_hw_debug(struct pt_regs *regs)
{
- /* Disable hardware debugging while we are in kgdb: */
- set_debugreg(0UL, 7);
-}
+ int i;
+ int cpu = raw_smp_processor_id();
+ struct perf_event *bp;
-/**
- * kgdb_post_primary_code - Save error vector/code numbers.
- * @regs: Original pt_regs.
- * @e_vector: Original error vector.
- * @err_code: Original error code.
- *
- * This is needed on architectures which support SMP and KGDB.
- * This function is called after all the slave cpus have been put
- * to a know spin state and the primary CPU has control over KGDB.
- */
-void kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code)
-{
- /* primary processor is completely in the debugger */
- gdb_x86vector = e_vector;
- gdb_x86errcode = err_code;
+ /* Disable hardware debugging while we are in kgdb: */
+ set_debugreg(DR7_FIXED_1, 7);
+ for (i = 0; i < HBP_NUM; i++) {
+ if (!breakinfo[i].enabled)
+ continue;
+ if (dbg_is_early) {
+ early_dr7 &= ~encode_dr7(i, breakinfo[i].len,
+ breakinfo[i].type);
+ continue;
+ }
+ bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
+ if (bp->attr.disabled == 1)
+ continue;
+ arch_uninstall_hw_breakpoint(bp);
+ bp->attr.disabled = 1;
+ }
}
#ifdef CONFIG_SMP
/**
* kgdb_roundup_cpus - Get other CPUs into a holding pattern
- * @flags: Current IRQ state
*
* On SMP systems, we need to get the attention of the other CPUs
* and get them be in a known state. This should do what is needed
* to get the other CPUs to call kgdb_wait(). Note that on some arches,
* the NMI approach is not used for rounding up all the CPUs. For example,
- * in case of MIPS, smp_call_function() is used to roundup CPUs. In
- * this case, we have to make sure that interrupts are enabled before
- * calling smp_call_function(). The argument to this function is
- * the flags that will be used when restoring the interrupts. There is
- * local_irq_save() call before kgdb_roundup_cpus().
+ * in case of MIPS, smp_call_function() is used to round up CPUs.
*
* On non-SMP systems, this is not called.
*/
-void kgdb_roundup_cpus(unsigned long flags)
+void kgdb_roundup_cpus(void)
{
- apic->send_IPI_allbutself(APIC_DM_NMI);
+ apic_send_IPI_allbutself(NMI_VECTOR);
}
#endif
/**
* kgdb_arch_handle_exception - Handle architecture specific GDB packets.
- * @vector: The error vector of the exception that happened.
+ * @e_vector: The error vector of the exception that happened.
* @signo: The signal number of the exception that happened.
* @err_code: The error code of the exception that happened.
- * @remcom_in_buffer: The buffer of the packet we have read.
- * @remcom_out_buffer: The buffer of %BUFMAX bytes to write a packet into.
- * @regs: The &struct pt_regs of the current process.
+ * @remcomInBuffer: The buffer of the packet we have read.
+ * @remcomOutBuffer: The buffer of %BUFMAX bytes to write a packet into.
+ * @linux_regs: The &struct pt_regs of the current process.
*
* This function MUST handle the 'c' and 's' command packets,
* as well packets to set / remove a hardware breakpoint, if used.
@@ -373,9 +441,7 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
struct pt_regs *linux_regs)
{
unsigned long addr;
- unsigned long dr6;
char *ptr;
- int newPC;
switch (remcomInBuffer[0]) {
case 'c':
@@ -384,10 +450,9 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
ptr = &remcomInBuffer[1];
if (kgdb_hex2long(&ptr, &addr))
linux_regs->ip = addr;
+ fallthrough;
case 'D':
case 'k':
- newPC = linux_regs->ip;
-
/* clear the trace bit */
linux_regs->flags &= ~X86_EFLAGS_TF;
atomic_set(&kgdb_cpu_doing_single_step, -1);
@@ -395,27 +460,10 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
/* set the trace bit if we're stepping */
if (remcomInBuffer[0] == 's') {
linux_regs->flags |= X86_EFLAGS_TF;
- kgdb_single_step = 1;
atomic_set(&kgdb_cpu_doing_single_step,
raw_smp_processor_id());
}
- get_debugreg(dr6, 6);
- if (!(dr6 & 0x4000)) {
- int breakno;
-
- for (breakno = 0; breakno < 4; breakno++) {
- if (dr6 & (1 << breakno) &&
- breakinfo[breakno].type == 0) {
- /* Set restore flag: */
- linux_regs->flags |= X86_EFLAGS_RF;
- break;
- }
- }
- }
- set_debugreg(0UL, 6);
- kgdb_correct_hw_break();
-
return 0;
}
@@ -434,50 +482,55 @@ single_step_cont(struct pt_regs *regs, struct die_args *args)
"resuming...\n");
kgdb_arch_handle_exception(args->trapnr, args->signr,
args->err, "c", "", regs);
+ /*
+	 * Reset the BS bit in dr6 (pointed to by args->err) to
+ * denote completion of processing
+ */
+ (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
return NOTIFY_STOP;
}
-static int was_in_debug_nmi[NR_CPUS];
+static DECLARE_BITMAP(was_in_debug_nmi, NR_CPUS);
-static int __kgdb_notify(struct die_args *args, unsigned long cmd)
+static int kgdb_nmi_handler(unsigned int cmd, struct pt_regs *regs)
{
- struct pt_regs *regs = args->regs;
+ int cpu;
switch (cmd) {
- case DIE_NMI:
+ case NMI_LOCAL:
if (atomic_read(&kgdb_active) != -1) {
/* KGDB CPU roundup */
- kgdb_nmicallback(raw_smp_processor_id(), regs);
- was_in_debug_nmi[raw_smp_processor_id()] = 1;
+ cpu = raw_smp_processor_id();
+ kgdb_nmicallback(cpu, regs);
+ set_bit(cpu, was_in_debug_nmi);
touch_nmi_watchdog();
- return NOTIFY_STOP;
+
+ return NMI_HANDLED;
}
- return NOTIFY_DONE;
+ break;
- case DIE_NMI_IPI:
- /* Just ignore, we will handle the roundup on DIE_NMI. */
- return NOTIFY_DONE;
+ case NMI_UNKNOWN:
+ cpu = raw_smp_processor_id();
- case DIE_NMIUNKNOWN:
- if (was_in_debug_nmi[raw_smp_processor_id()]) {
- was_in_debug_nmi[raw_smp_processor_id()] = 0;
- return NOTIFY_STOP;
- }
- return NOTIFY_DONE;
+ if (__test_and_clear_bit(cpu, was_in_debug_nmi))
+ return NMI_HANDLED;
- case DIE_NMIWATCHDOG:
- if (atomic_read(&kgdb_active) != -1) {
- /* KGDB CPU roundup: */
- kgdb_nmicallback(raw_smp_processor_id(), regs);
- return NOTIFY_STOP;
- }
- /* Enter debugger: */
break;
+ default:
+ /* do nothing */
+ break;
+ }
+ return NMI_DONE;
+}
+
+static int __kgdb_notify(struct die_args *args, unsigned long cmd)
+{
+ struct pt_regs *regs = args->regs;
+ switch (cmd) {
case DIE_DEBUG:
- if (atomic_read(&kgdb_cpu_doing_single_step) ==
- raw_smp_processor_id()) {
+ if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
if (user_mode(regs))
return single_step_cont(regs, args);
break;
@@ -486,13 +539,13 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
* a system call which should be ignored
*/
return NOTIFY_DONE;
- /* fall through */
+ fallthrough;
default:
if (user_mode(regs))
return NOTIFY_DONE;
}
- if (kgdb_handle_exception(args->trapnr, args->signr, args->err, regs))
+ if (kgdb_handle_exception(args->trapnr, args->signr, cmd, regs))
return NOTIFY_DONE;
/* Must touch watchdog before return to normal operation */
@@ -500,6 +553,24 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
return NOTIFY_STOP;
}
+int kgdb_ll_trap(int cmd, const char *str,
+ struct pt_regs *regs, long err, int trap, int sig)
+{
+ struct die_args args = {
+ .regs = regs,
+ .str = str,
+ .err = err,
+ .trapnr = trap,
+ .signr = sig,
+
+ };
+
+ if (!kgdb_io_module_registered)
+ return NOTIFY_DONE;
+
+ return __kgdb_notify(&args, cmd);
+}
+
static int
kgdb_notify(struct notifier_block *self, unsigned long cmd, void *ptr)
{
@@ -515,22 +586,92 @@ kgdb_notify(struct notifier_block *self, unsigned long cmd, void *ptr)
static struct notifier_block kgdb_notifier = {
.notifier_call = kgdb_notify,
-
- /*
- * Lowest-prio notifier priority, we want to be notified last:
- */
- .priority = -INT_MAX,
};
/**
- * kgdb_arch_init - Perform any architecture specific initalization.
+ * kgdb_arch_init - Perform any architecture specific initialization.
*
- * This function will handle the initalization of any architecture
+ * This function will handle the initialization of any architecture
* specific callbacks.
*/
int kgdb_arch_init(void)
{
- return register_die_notifier(&kgdb_notifier);
+ int retval;
+
+ retval = register_die_notifier(&kgdb_notifier);
+ if (retval)
+ goto out;
+
+ retval = register_nmi_handler(NMI_LOCAL, kgdb_nmi_handler,
+ 0, "kgdb");
+ if (retval)
+ goto out1;
+
+ retval = register_nmi_handler(NMI_UNKNOWN, kgdb_nmi_handler,
+ 0, "kgdb");
+
+ if (retval)
+ goto out2;
+
+ return retval;
+
+out2:
+ unregister_nmi_handler(NMI_LOCAL, "kgdb");
+out1:
+ unregister_die_notifier(&kgdb_notifier);
+out:
+ return retval;
+}
+
+static void kgdb_hw_overflow_handler(struct perf_event *event,
+ struct perf_sample_data *data, struct pt_regs *regs)
+{
+ struct task_struct *tsk = current;
+ int i;
+
+ for (i = 0; i < 4; i++) {
+ if (breakinfo[i].enabled)
+ tsk->thread.virtual_dr6 |= (DR_TRAP0 << i);
+ }
+}
+
+void kgdb_arch_late(void)
+{
+ int i, cpu;
+ struct perf_event_attr attr;
+ struct perf_event **pevent;
+
+ /*
+ * Pre-allocate the hw breakpoint instructions in the non-atomic
+	 * portion of kgdb because this operation requires mutexes to
+ * complete.
+ */
+ hw_breakpoint_init(&attr);
+ attr.bp_addr = (unsigned long)kgdb_arch_init;
+ attr.bp_len = HW_BREAKPOINT_LEN_1;
+ attr.bp_type = HW_BREAKPOINT_W;
+ attr.disabled = 1;
+ for (i = 0; i < HBP_NUM; i++) {
+ if (breakinfo[i].pev)
+ continue;
+ breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL, NULL);
+ if (IS_ERR_PCPU(breakinfo[i].pev)) {
+ printk(KERN_ERR "kgdb: Could not allocate hw"
+ "breakpoints\nDisabling the kernel debugger\n");
+ breakinfo[i].pev = NULL;
+ kgdb_arch_exit();
+ return;
+ }
+ for_each_online_cpu(cpu) {
+ pevent = per_cpu_ptr(breakinfo[i].pev, cpu);
+ pevent[0]->hw.sample_period = 1;
+ pevent[0]->overflow_handler = kgdb_hw_overflow_handler;
+ if (pevent[0]->destroy != NULL) {
+ pevent[0]->destroy = NULL;
+ release_bp_slot(*pevent);
+ }
+ }
+ }
}
/**
@@ -541,11 +682,19 @@ int kgdb_arch_init(void)
*/
void kgdb_arch_exit(void)
{
+ int i;
+ for (i = 0; i < 4; i++) {
+ if (breakinfo[i].pev) {
+ unregister_wide_hw_breakpoint(breakinfo[i].pev);
+ breakinfo[i].pev = NULL;
+ }
+ }
+ unregister_nmi_handler(NMI_UNKNOWN, "kgdb");
+ unregister_nmi_handler(NMI_LOCAL, "kgdb");
unregister_die_notifier(&kgdb_notifier);
}
/**
- *
* kgdb_skipexception - Bail out of KGDB when we've been triggered.
* @exception: Exception vector number
* @regs: Current &struct pt_regs.
@@ -573,12 +722,63 @@ unsigned long kgdb_arch_pc(int exception, struct pt_regs *regs)
return instruction_pointer(regs);
}
-struct kgdb_arch arch_kgdb_ops = {
+void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip)
+{
+ regs->ip = ip;
+}
+
+int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
+{
+ int err;
+
+ bpt->type = BP_BREAKPOINT;
+ err = copy_from_kernel_nofault(bpt->saved_instr, (char *)bpt->bpt_addr,
+ BREAK_INSTR_SIZE);
+ if (err)
+ return err;
+ err = copy_to_kernel_nofault((char *)bpt->bpt_addr,
+ arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE);
+ if (!err)
+ return err;
+ /*
+ * It is safe to call text_poke_kgdb() because normal kernel execution
+ * is stopped on all cores, so long as the text_mutex is not locked.
+ */
+ if (mutex_is_locked(&text_mutex))
+ return -EBUSY;
+ text_poke_kgdb((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr,
+ BREAK_INSTR_SIZE);
+ bpt->type = BP_POKE_BREAKPOINT;
+
+ return 0;
+}
+
+int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
+{
+ if (bpt->type != BP_POKE_BREAKPOINT)
+ goto knl_write;
+ /*
+ * It is safe to call text_poke_kgdb() because normal kernel execution
+ * is stopped on all cores, so long as the text_mutex is not locked.
+ */
+ if (mutex_is_locked(&text_mutex))
+ goto knl_write;
+ text_poke_kgdb((void *)bpt->bpt_addr, bpt->saved_instr,
+ BREAK_INSTR_SIZE);
+ return 0;
+
+knl_write:
+ return copy_to_kernel_nofault((char *)bpt->bpt_addr,
+ (char *)bpt->saved_instr, BREAK_INSTR_SIZE);
+}
+
+const struct kgdb_arch arch_kgdb_ops = {
/* Breakpoint instruction: */
.gdb_bpt_instr = { 0xcc },
.flags = KGDB_HW_BREAKPOINT,
.set_hw_breakpoint = kgdb_set_hw_break,
.remove_hw_breakpoint = kgdb_remove_hw_break,
+ .disable_hw_break = kgdb_disable_hw_debug,
.remove_all_hw_break = kgdb_remove_all_hw_break,
.correct_hw_break = kgdb_correct_hw_break,
};
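
The kgdb_arch_init() hunk above registers a die notifier and two NMI handlers, unwinding the earlier registrations in reverse order when a later one fails. A generic sketch of that register-then-unwind pattern, with hypothetical setup_a/setup_b/setup_c and teardown helpers standing in for the real registrations:

	/* Sketch of the error-unwinding pattern used by kgdb_arch_init();
	 * setup_*()/teardown_*() are placeholders, not real kernel APIs. */
	static int example_init(void)
	{
		int ret;

		ret = setup_a();		/* e.g. register_die_notifier() */
		if (ret)
			goto out;

		ret = setup_b();		/* e.g. register_nmi_handler(NMI_LOCAL, ...) */
		if (ret)
			goto undo_a;

		ret = setup_c();		/* e.g. register_nmi_handler(NMI_UNKNOWN, ...) */
		if (ret)
			goto undo_b;

		return 0;

	undo_b:
		teardown_b();
	undo_a:
		teardown_a();
	out:
		return ret;
	}
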
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
deleted file mode 100644
index 7b5169d2b000..000000000000
--- a/arch/x86/kernel/kprobes.c
+++ /dev/null
@@ -1,1068 +0,0 @@
-/*
- * Kernel Probes (KProbes)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- * Copyright (C) IBM Corporation, 2002, 2004
- *
- * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
- * Probes initial implementation ( includes contributions from
- * Rusty Russell).
- * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
- * interface to access function arguments.
- * 2004-Oct Jim Keniston <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
- * <prasanna@in.ibm.com> adapted for x86_64 from i386.
- * 2005-Mar Roland McGrath <roland@redhat.com>
- * Fixed to handle %rip-relative addressing mode correctly.
- * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston
- * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
- * <prasanna@in.ibm.com> added function-return probes.
- * 2005-May Rusty Lynch <rusty.lynch@intel.com>
- * Added function return probes functionality
- * 2006-Feb Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp> added
- * kprobe-booster and kretprobe-booster for i386.
- * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com> added kprobe-booster
- * and kretprobe-booster for x86-64
- * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com>, Arjan van de Ven
- * <arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com>
- * unified x86 kprobes code.
- */
-
-#include <linux/kprobes.h>
-#include <linux/ptrace.h>
-#include <linux/string.h>
-#include <linux/slab.h>
-#include <linux/hardirq.h>
-#include <linux/preempt.h>
-#include <linux/module.h>
-#include <linux/kdebug.h>
-
-#include <asm/cacheflush.h>
-#include <asm/desc.h>
-#include <asm/pgtable.h>
-#include <asm/uaccess.h>
-#include <asm/alternative.h>
-
-void jprobe_return_end(void);
-
-DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
-DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
-
-#ifdef CONFIG_X86_64
-#define stack_addr(regs) ((unsigned long *)regs->sp)
-#else
-/*
- * "&regs->sp" looks wrong, but it's correct for x86_32. x86_32 CPUs
- * don't save the ss and esp registers if the CPU is already in kernel
- * mode when it traps. So for kprobes, regs->sp and regs->ss are not
- * the [nonexistent] saved stack pointer and ss register, but rather
- * the top 8 bytes of the pre-int3 stack. So &regs->sp happens to
- * point to the top of the pre-int3 stack.
- */
-#define stack_addr(regs) ((unsigned long *)&regs->sp)
-#endif
-
-#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
- (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
- (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
- (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
- (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
- << (row % 32))
- /*
- * Undefined/reserved opcodes, conditional jump, Opcode Extension
- * Groups, and some special opcodes can not boost.
- */
-static const u32 twobyte_is_boostable[256 / 32] = {
- /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
- /* ---------------------------------------------- */
- W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) | /* 00 */
- W(0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 10 */
- W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 20 */
- W(0x30, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */
- W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
- W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */
- W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1) | /* 60 */
- W(0x70, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */
- W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 80 */
- W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
- W(0xa0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* a0 */
- W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) , /* b0 */
- W(0xc0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
- W(0xd0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) , /* d0 */
- W(0xe0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* e0 */
- W(0xf0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0) /* f0 */
- /* ----------------------------------------------- */
- /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
-};
-static const u32 onebyte_has_modrm[256 / 32] = {
- /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
- /* ----------------------------------------------- */
- W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 00 */
- W(0x10, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 10 */
- W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 20 */
- W(0x30, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 30 */
- W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */
- W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */
- W(0x60, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0) | /* 60 */
- W(0x70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 70 */
- W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
- W(0x90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 90 */
- W(0xa0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* a0 */
- W(0xb0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* b0 */
- W(0xc0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* c0 */
- W(0xd0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
- W(0xe0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* e0 */
- W(0xf0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) /* f0 */
- /* ----------------------------------------------- */
- /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
-};
-static const u32 twobyte_has_modrm[256 / 32] = {
- /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
- /* ----------------------------------------------- */
- W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1) | /* 0f */
- W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0) , /* 1f */
- W(0x20, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 2f */
- W(0x30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 3f */
- W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 4f */
- W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 5f */
- W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 6f */
- W(0x70, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1) , /* 7f */
- W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 8f */
- W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 9f */
- W(0xa0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) | /* af */
- W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* bf */
- W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* cf */
- W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* df */
- W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* ef */
- W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* ff */
- /* ----------------------------------------------- */
- /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
-};
-#undef W
-
-struct kretprobe_blackpoint kretprobe_blacklist[] = {
- {"__switch_to", }, /* This function switches only current task, but
- doesn't switch kernel stack.*/
- {NULL, NULL} /* Terminator */
-};
-const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
-
-/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
-static void __kprobes set_jmp_op(void *from, void *to)
-{
- struct __arch_jmp_op {
- char op;
- s32 raddr;
- } __attribute__((packed)) * jop;
- jop = (struct __arch_jmp_op *)from;
- jop->raddr = (s32)((long)(to) - ((long)(from) + 5));
- jop->op = RELATIVEJUMP_INSTRUCTION;
-}
-
-/*
- * Check for the REX prefix which can only exist on X86_64
- * X86_32 always returns 0
- */
-static int __kprobes is_REX_prefix(kprobe_opcode_t *insn)
-{
-#ifdef CONFIG_X86_64
- if ((*insn & 0xf0) == 0x40)
- return 1;
-#endif
- return 0;
-}
-
-/*
- * Returns non-zero if opcode is boostable.
- * RIP relative instructions are adjusted at copying time in 64 bits mode
- */
-static int __kprobes can_boost(kprobe_opcode_t *opcodes)
-{
- kprobe_opcode_t opcode;
- kprobe_opcode_t *orig_opcodes = opcodes;
-
- if (search_exception_tables((unsigned long)opcodes))
- return 0; /* Page fault may occur on this address. */
-
-retry:
- if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
- return 0;
- opcode = *(opcodes++);
-
- /* 2nd-byte opcode */
- if (opcode == 0x0f) {
- if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
- return 0;
- return test_bit(*opcodes,
- (unsigned long *)twobyte_is_boostable);
- }
-
- switch (opcode & 0xf0) {
-#ifdef CONFIG_X86_64
- case 0x40:
- goto retry; /* REX prefix is boostable */
-#endif
- case 0x60:
- if (0x63 < opcode && opcode < 0x67)
- goto retry; /* prefixes */
- /* can't boost Address-size override and bound */
- return (opcode != 0x62 && opcode != 0x67);
- case 0x70:
- return 0; /* can't boost conditional jump */
- case 0xc0:
- /* can't boost software-interruptions */
- return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf;
- case 0xd0:
- /* can boost AA* and XLAT */
- return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7);
- case 0xe0:
- /* can boost in/out and absolute jmps */
- return ((opcode & 0x04) || opcode == 0xea);
- case 0xf0:
- if ((opcode & 0x0c) == 0 && opcode != 0xf1)
- goto retry; /* lock/rep(ne) prefix */
- /* clear and set flags are boostable */
- return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe));
- default:
- /* segment override prefixes are boostable */
- if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e)
- goto retry; /* prefixes */
- /* CS override prefix and call are not boostable */
- return (opcode != 0x2e && opcode != 0x9a);
- }
-}
-
-/*
- * Returns non-zero if opcode modifies the interrupt flag.
- */
-static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
-{
- switch (*insn) {
- case 0xfa: /* cli */
- case 0xfb: /* sti */
- case 0xcf: /* iret/iretd */
- case 0x9d: /* popf/popfd */
- return 1;
- }
-
- /*
- * on X86_64, 0x40-0x4f are REX prefixes so we need to look
- * at the next byte instead.. but of course not recurse infinitely
- */
- if (is_REX_prefix(insn))
- return is_IF_modifier(++insn);
-
- return 0;
-}
-
-/*
- * Adjust the displacement if the instruction uses the %rip-relative
- * addressing mode.
- * If it does, Return the address of the 32-bit displacement word.
- * If not, return null.
- * Only applicable to 64-bit x86.
- */
-static void __kprobes fix_riprel(struct kprobe *p)
-{
-#ifdef CONFIG_X86_64
- u8 *insn = p->ainsn.insn;
- s64 disp;
- int need_modrm;
-
- /* Skip legacy instruction prefixes. */
- while (1) {
- switch (*insn) {
- case 0x66:
- case 0x67:
- case 0x2e:
- case 0x3e:
- case 0x26:
- case 0x64:
- case 0x65:
- case 0x36:
- case 0xf0:
- case 0xf3:
- case 0xf2:
- ++insn;
- continue;
- }
- break;
- }
-
- /* Skip REX instruction prefix. */
- if (is_REX_prefix(insn))
- ++insn;
-
- if (*insn == 0x0f) {
- /* Two-byte opcode. */
- ++insn;
- need_modrm = test_bit(*insn,
- (unsigned long *)twobyte_has_modrm);
- } else
- /* One-byte opcode. */
- need_modrm = test_bit(*insn,
- (unsigned long *)onebyte_has_modrm);
-
- if (need_modrm) {
- u8 modrm = *++insn;
- if ((modrm & 0xc7) == 0x05) {
- /* %rip+disp32 addressing mode */
- /* Displacement follows ModRM byte. */
- ++insn;
- /*
- * The copied instruction uses the %rip-relative
- * addressing mode. Adjust the displacement for the
- * difference between the original location of this
- * instruction and the location of the copy that will
- * actually be run. The tricky bit here is making sure
- * that the sign extension happens correctly in this
- * calculation, since we need a signed 32-bit result to
- * be sign-extended to 64 bits when it's added to the
- * %rip value and yield the same 64-bit result that the
- * sign-extension of the original signed 32-bit
- * displacement would have given.
- */
- disp = (u8 *) p->addr + *((s32 *) insn) -
- (u8 *) p->ainsn.insn;
- BUG_ON((s64) (s32) disp != disp); /* Sanity check. */
- *(s32 *)insn = (s32) disp;
- }
- }
-#endif
-}
-
-static void __kprobes arch_copy_kprobe(struct kprobe *p)
-{
- memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
-
- fix_riprel(p);
-
- if (can_boost(p->addr))
- p->ainsn.boostable = 0;
- else
- p->ainsn.boostable = -1;
-
- p->opcode = *p->addr;
-}
-
-int __kprobes arch_prepare_kprobe(struct kprobe *p)
-{
- /* insn: must be on special executable page on x86. */
- p->ainsn.insn = get_insn_slot();
- if (!p->ainsn.insn)
- return -ENOMEM;
- arch_copy_kprobe(p);
- return 0;
-}
-
-void __kprobes arch_arm_kprobe(struct kprobe *p)
-{
- text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
-}
-
-void __kprobes arch_disarm_kprobe(struct kprobe *p)
-{
- text_poke(p->addr, &p->opcode, 1);
-}
-
-void __kprobes arch_remove_kprobe(struct kprobe *p)
-{
- if (p->ainsn.insn) {
- free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1));
- p->ainsn.insn = NULL;
- }
-}
-
-static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
-{
- kcb->prev_kprobe.kp = kprobe_running();
- kcb->prev_kprobe.status = kcb->kprobe_status;
- kcb->prev_kprobe.old_flags = kcb->kprobe_old_flags;
- kcb->prev_kprobe.saved_flags = kcb->kprobe_saved_flags;
-}
-
-static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
-{
- __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
- kcb->kprobe_status = kcb->prev_kprobe.status;
- kcb->kprobe_old_flags = kcb->prev_kprobe.old_flags;
- kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags;
-}
-
-static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
- struct kprobe_ctlblk *kcb)
-{
- __get_cpu_var(current_kprobe) = p;
- kcb->kprobe_saved_flags = kcb->kprobe_old_flags
- = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
- if (is_IF_modifier(p->ainsn.insn))
- kcb->kprobe_saved_flags &= ~X86_EFLAGS_IF;
-}
-
-static void __kprobes clear_btf(void)
-{
- if (test_thread_flag(TIF_DEBUGCTLMSR))
- update_debugctlmsr(0);
-}
-
-static void __kprobes restore_btf(void)
-{
- if (test_thread_flag(TIF_DEBUGCTLMSR))
- update_debugctlmsr(current->thread.debugctlmsr);
-}
-
-static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
-{
- clear_btf();
- regs->flags |= X86_EFLAGS_TF;
- regs->flags &= ~X86_EFLAGS_IF;
- /* single step inline if the instruction is an int3 */
- if (p->opcode == BREAKPOINT_INSTRUCTION)
- regs->ip = (unsigned long)p->addr;
- else
- regs->ip = (unsigned long)p->ainsn.insn;
-}
-
-void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
- struct pt_regs *regs)
-{
- unsigned long *sara = stack_addr(regs);
-
- ri->ret_addr = (kprobe_opcode_t *) *sara;
-
- /* Replace the return addr with trampoline addr */
- *sara = (unsigned long) &kretprobe_trampoline;
-}
-
-static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
- struct kprobe_ctlblk *kcb)
-{
-#if !defined(CONFIG_PREEMPT) || defined(CONFIG_FREEZER)
- if (p->ainsn.boostable == 1 && !p->post_handler) {
- /* Boost up -- we can execute copied instructions directly */
- reset_current_kprobe();
- regs->ip = (unsigned long)p->ainsn.insn;
- preempt_enable_no_resched();
- return;
- }
-#endif
- prepare_singlestep(p, regs);
- kcb->kprobe_status = KPROBE_HIT_SS;
-}
-
-/*
- * We have reentered the kprobe_handler(), since another probe was hit while
- * within the handler. We save the original kprobes variables and just single
- * step on the instruction of the new probe without calling any user handlers.
- */
-static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
- struct kprobe_ctlblk *kcb)
-{
- switch (kcb->kprobe_status) {
- case KPROBE_HIT_SSDONE:
-#ifdef CONFIG_X86_64
- /* TODO: Provide re-entrancy from post_kprobes_handler() and
- * avoid exception stack corruption while single-stepping on
- * the instruction of the new probe.
- */
- arch_disarm_kprobe(p);
- regs->ip = (unsigned long)p->addr;
- reset_current_kprobe();
- preempt_enable_no_resched();
- break;
-#endif
- case KPROBE_HIT_ACTIVE:
- save_previous_kprobe(kcb);
- set_current_kprobe(p, regs, kcb);
- kprobes_inc_nmissed_count(p);
- prepare_singlestep(p, regs);
- kcb->kprobe_status = KPROBE_REENTER;
- break;
- case KPROBE_HIT_SS:
- if (p == kprobe_running()) {
- regs->flags &= ~X86_EFLAGS_TF;
- regs->flags |= kcb->kprobe_saved_flags;
- return 0;
- } else {
- /* A probe has been hit in the codepath leading up
- * to, or just after, single-stepping of a probed
- * instruction. This entire codepath should strictly
- * reside in .kprobes.text section. Raise a warning
- * to highlight this peculiar case.
- */
- }
- default:
- /* impossible cases */
- WARN_ON(1);
- return 0;
- }
-
- return 1;
-}
-
-/*
- * Interrupts are disabled on entry as trap3 is an interrupt gate and they
- * remain disabled thorough out this function.
- */
-static int __kprobes kprobe_handler(struct pt_regs *regs)
-{
- kprobe_opcode_t *addr;
- struct kprobe *p;
- struct kprobe_ctlblk *kcb;
-
- addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
- if (*addr != BREAKPOINT_INSTRUCTION) {
- /*
- * The breakpoint instruction was removed right
- * after we hit it. Another cpu has removed
- * either a probepoint or a debugger breakpoint
- * at this address. In either case, no further
- * handling of this interrupt is appropriate.
- * Back up over the (now missing) int3 and run
- * the original instruction.
- */
- regs->ip = (unsigned long)addr;
- return 1;
- }
-
- /*
- * We don't want to be preempted for the entire
- * duration of kprobe processing. We conditionally
- * re-enable preemption at the end of this function,
- * and also in reenter_kprobe() and setup_singlestep().
- */
- preempt_disable();
-
- kcb = get_kprobe_ctlblk();
- p = get_kprobe(addr);
-
- if (p) {
- if (kprobe_running()) {
- if (reenter_kprobe(p, regs, kcb))
- return 1;
- } else {
- set_current_kprobe(p, regs, kcb);
- kcb->kprobe_status = KPROBE_HIT_ACTIVE;
-
- /*
- * If we have no pre-handler or it returned 0, we
- * continue with normal processing. If we have a
- * pre-handler and it returned non-zero, it prepped
- * for calling the break_handler below on re-entry
- * for jprobe processing, so get out doing nothing
- * more here.
- */
- if (!p->pre_handler || !p->pre_handler(p, regs))
- setup_singlestep(p, regs, kcb);
- return 1;
- }
- } else if (kprobe_running()) {
- p = __get_cpu_var(current_kprobe);
- if (p->break_handler && p->break_handler(p, regs)) {
- setup_singlestep(p, regs, kcb);
- return 1;
- }
- } /* else: not a kprobe fault; let the kernel handle it */
-
- preempt_enable_no_resched();
- return 0;
-}
-
-/*
- * When a retprobed function returns, this code saves registers and
- * calls trampoline_handler() runs, which calls the kretprobe's handler.
- */
-static void __used __kprobes kretprobe_trampoline_holder(void)
-{
- asm volatile (
- ".global kretprobe_trampoline\n"
- "kretprobe_trampoline: \n"
-#ifdef CONFIG_X86_64
- /* We don't bother saving the ss register */
- " pushq %rsp\n"
- " pushfq\n"
- /*
- * Skip cs, ip, orig_ax.
- * trampoline_handler() will plug in these values
- */
- " subq $24, %rsp\n"
- " pushq %rdi\n"
- " pushq %rsi\n"
- " pushq %rdx\n"
- " pushq %rcx\n"
- " pushq %rax\n"
- " pushq %r8\n"
- " pushq %r9\n"
- " pushq %r10\n"
- " pushq %r11\n"
- " pushq %rbx\n"
- " pushq %rbp\n"
- " pushq %r12\n"
- " pushq %r13\n"
- " pushq %r14\n"
- " pushq %r15\n"
- " movq %rsp, %rdi\n"
- " call trampoline_handler\n"
- /* Replace saved sp with true return address. */
- " movq %rax, 152(%rsp)\n"
- " popq %r15\n"
- " popq %r14\n"
- " popq %r13\n"
- " popq %r12\n"
- " popq %rbp\n"
- " popq %rbx\n"
- " popq %r11\n"
- " popq %r10\n"
- " popq %r9\n"
- " popq %r8\n"
- " popq %rax\n"
- " popq %rcx\n"
- " popq %rdx\n"
- " popq %rsi\n"
- " popq %rdi\n"
- /* Skip orig_ax, ip, cs */
- " addq $24, %rsp\n"
- " popfq\n"
-#else
- " pushf\n"
- /*
- * Skip cs, ip, orig_ax and gs.
- * trampoline_handler() will plug in these values
- */
- " subl $16, %esp\n"
- " pushl %fs\n"
- " pushl %es\n"
- " pushl %ds\n"
- " pushl %eax\n"
- " pushl %ebp\n"
- " pushl %edi\n"
- " pushl %esi\n"
- " pushl %edx\n"
- " pushl %ecx\n"
- " pushl %ebx\n"
- " movl %esp, %eax\n"
- " call trampoline_handler\n"
- /* Move flags to cs */
- " movl 56(%esp), %edx\n"
- " movl %edx, 52(%esp)\n"
- /* Replace saved flags with true return address. */
- " movl %eax, 56(%esp)\n"
- " popl %ebx\n"
- " popl %ecx\n"
- " popl %edx\n"
- " popl %esi\n"
- " popl %edi\n"
- " popl %ebp\n"
- " popl %eax\n"
- /* Skip ds, es, fs, gs, orig_ax and ip */
- " addl $24, %esp\n"
- " popf\n"
-#endif
- " ret\n");
-}
-
-/*
- * Called from kretprobe_trampoline
- */
-static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
-{
- struct kretprobe_instance *ri = NULL;
- struct hlist_head *head, empty_rp;
- struct hlist_node *node, *tmp;
- unsigned long flags, orig_ret_address = 0;
- unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline;
-
- INIT_HLIST_HEAD(&empty_rp);
- kretprobe_hash_lock(current, &head, &flags);
- /* fixup registers */
-#ifdef CONFIG_X86_64
- regs->cs = __KERNEL_CS;
-#else
- regs->cs = __KERNEL_CS | get_kernel_rpl();
- regs->gs = 0;
-#endif
- regs->ip = trampoline_address;
- regs->orig_ax = ~0UL;
-
- /*
- * It is possible to have multiple instances associated with a given
- * task either because multiple functions in the call path have
- * return probes installed on them, and/or more than one
- * return probe was registered for a target function.
- *
- * We can handle this because:
- * - instances are always pushed into the head of the list
- * - when multiple return probes are registered for the same
- * function, the (chronologically) first instance's ret_addr
- * will be the real return address, and all the rest will
- * point to kretprobe_trampoline.
- */
- hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
- if (ri->task != current)
- /* another task is sharing our hash bucket */
- continue;
-
- if (ri->rp && ri->rp->handler) {
- __get_cpu_var(current_kprobe) = &ri->rp->kp;
- get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
- ri->rp->handler(ri, regs);
- __get_cpu_var(current_kprobe) = NULL;
- }
-
- orig_ret_address = (unsigned long)ri->ret_addr;
- recycle_rp_inst(ri, &empty_rp);
-
- if (orig_ret_address != trampoline_address)
- /*
- * This is the real return address. Any other
- * instances associated with this task are for
- * other calls deeper on the call stack
- */
- break;
- }
-
- kretprobe_assert(ri, orig_ret_address, trampoline_address);
-
- kretprobe_hash_unlock(current, &flags);
-
- hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
- hlist_del(&ri->hlist);
- kfree(ri);
- }
- return (void *)orig_ret_address;
-}
-
-/*
- * Called after single-stepping. p->addr is the address of the
- * instruction whose first byte has been replaced by the "int 3"
- * instruction. To avoid the SMP problems that can occur when we
- * temporarily put back the original opcode to single-step, we
- * single-stepped a copy of the instruction. The address of this
- * copy is p->ainsn.insn.
- *
- * This function prepares to return from the post-single-step
- * interrupt. We have to fix up the stack as follows:
- *
- * 0) Except in the case of absolute or indirect jump or call instructions,
- * the new ip is relative to the copied instruction. We need to make
- * it relative to the original instruction.
- *
- * 1) If the single-stepped instruction was pushfl, then the TF and IF
- * flags are set in the just-pushed flags, and may need to be cleared.
- *
- * 2) If the single-stepped instruction was a call, the return address
- * that is atop the stack is the address following the copied instruction.
- * We need to make it the address following the original instruction.
- *
- * If this is the first time we've single-stepped the instruction at
- * this probepoint, and the instruction is boostable, boost it: add a
- * jump instruction after the copied instruction, that jumps to the next
- * instruction after the probepoint.
- */
-static void __kprobes resume_execution(struct kprobe *p,
- struct pt_regs *regs, struct kprobe_ctlblk *kcb)
-{
- unsigned long *tos = stack_addr(regs);
- unsigned long copy_ip = (unsigned long)p->ainsn.insn;
- unsigned long orig_ip = (unsigned long)p->addr;
- kprobe_opcode_t *insn = p->ainsn.insn;
-
- /*skip the REX prefix*/
- if (is_REX_prefix(insn))
- insn++;
-
- regs->flags &= ~X86_EFLAGS_TF;
- switch (*insn) {
- case 0x9c: /* pushfl */
- *tos &= ~(X86_EFLAGS_TF | X86_EFLAGS_IF);
- *tos |= kcb->kprobe_old_flags;
- break;
- case 0xc2: /* iret/ret/lret */
- case 0xc3:
- case 0xca:
- case 0xcb:
- case 0xcf:
- case 0xea: /* jmp absolute -- ip is correct */
- /* ip is already adjusted, no more changes required */
- p->ainsn.boostable = 1;
- goto no_change;
- case 0xe8: /* call relative - Fix return addr */
- *tos = orig_ip + (*tos - copy_ip);
- break;
-#ifdef CONFIG_X86_32
- case 0x9a: /* call absolute -- same as call absolute, indirect */
- *tos = orig_ip + (*tos - copy_ip);
- goto no_change;
-#endif
- case 0xff:
- if ((insn[1] & 0x30) == 0x10) {
- /*
- * call absolute, indirect
- * Fix return addr; ip is correct.
- * But this is not boostable
- */
- *tos = orig_ip + (*tos - copy_ip);
- goto no_change;
- } else if (((insn[1] & 0x31) == 0x20) ||
- ((insn[1] & 0x31) == 0x21)) {
- /*
- * jmp near and far, absolute indirect
- * ip is correct. And this is boostable
- */
- p->ainsn.boostable = 1;
- goto no_change;
- }
- default:
- break;
- }
-
- if (p->ainsn.boostable == 0) {
- if ((regs->ip > copy_ip) &&
- (regs->ip - copy_ip) + 5 < MAX_INSN_SIZE) {
- /*
- * These instructions can be executed directly if it
- * jumps back to correct address.
- */
- set_jmp_op((void *)regs->ip,
- (void *)orig_ip + (regs->ip - copy_ip));
- p->ainsn.boostable = 1;
- } else {
- p->ainsn.boostable = -1;
- }
- }
-
- regs->ip += orig_ip - copy_ip;
-
-no_change:
- restore_btf();
-}
-
-/*
- * Interrupts are disabled on entry as trap1 is an interrupt gate and they
- * remain disabled thoroughout this function.
- */
-static int __kprobes post_kprobe_handler(struct pt_regs *regs)
-{
- struct kprobe *cur = kprobe_running();
- struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-
- if (!cur)
- return 0;
-
- resume_execution(cur, regs, kcb);
- regs->flags |= kcb->kprobe_saved_flags;
-
- if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
- kcb->kprobe_status = KPROBE_HIT_SSDONE;
- cur->post_handler(cur, regs, 0);
- }
-
- /* Restore back the original saved kprobes variables and continue. */
- if (kcb->kprobe_status == KPROBE_REENTER) {
- restore_previous_kprobe(kcb);
- goto out;
- }
- reset_current_kprobe();
-out:
- preempt_enable_no_resched();
-
- /*
- * if somebody else is singlestepping across a probe point, flags
- * will have TF set, in which case, continue the remaining processing
- * of do_debug, as if this is not a probe hit.
- */
- if (regs->flags & X86_EFLAGS_TF)
- return 0;
-
- return 1;
-}
-
-int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
-{
- struct kprobe *cur = kprobe_running();
- struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-
- switch (kcb->kprobe_status) {
- case KPROBE_HIT_SS:
- case KPROBE_REENTER:
- /*
- * We are here because the instruction being single
- * stepped caused a page fault. We reset the current
- * kprobe and the ip points back to the probe address
- * and allow the page fault handler to continue as a
- * normal page fault.
- */
- regs->ip = (unsigned long)cur->addr;
- regs->flags |= kcb->kprobe_old_flags;
- if (kcb->kprobe_status == KPROBE_REENTER)
- restore_previous_kprobe(kcb);
- else
- reset_current_kprobe();
- preempt_enable_no_resched();
- break;
- case KPROBE_HIT_ACTIVE:
- case KPROBE_HIT_SSDONE:
- /*
- * We increment the nmissed count for accounting,
- * we can also use npre/npostfault count for accounting
- * these specific fault cases.
- */
- kprobes_inc_nmissed_count(cur);
-
- /*
- * We come here because instructions in the pre/post
- * handler caused the page_fault, this could happen
- * if handler tries to access user space by
- * copy_from_user(), get_user() etc. Let the
- * user-specified handler try to fix it first.
- */
- if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
- return 1;
-
- /*
- * In case the user-specified fault handler returned
- * zero, try to fix up.
- */
- if (fixup_exception(regs))
- return 1;
-
- /*
- * fixup routine could not handle it,
- * Let do_page_fault() fix it.
- */
- break;
- default:
- break;
- }
- return 0;
-}
-
-/*
- * Wrapper routine for handling exceptions.
- */
-int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
- unsigned long val, void *data)
-{
- struct die_args *args = data;
- int ret = NOTIFY_DONE;
-
- if (args->regs && user_mode_vm(args->regs))
- return ret;
-
- switch (val) {
- case DIE_INT3:
- if (kprobe_handler(args->regs))
- ret = NOTIFY_STOP;
- break;
- case DIE_DEBUG:
- if (post_kprobe_handler(args->regs))
- ret = NOTIFY_STOP;
- break;
- case DIE_GPF:
- /*
- * To be potentially processing a kprobe fault and to
- * trust the result from kprobe_running(), we have
- * be non-preemptible.
- */
- if (!preemptible() && kprobe_running() &&
- kprobe_fault_handler(args->regs, args->trapnr))
- ret = NOTIFY_STOP;
- break;
- default:
- break;
- }
- return ret;
-}
-
-int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
-{
- struct jprobe *jp = container_of(p, struct jprobe, kp);
- unsigned long addr;
- struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-
- kcb->jprobe_saved_regs = *regs;
- kcb->jprobe_saved_sp = stack_addr(regs);
- addr = (unsigned long)(kcb->jprobe_saved_sp);
-
- /*
- * As Linus pointed out, gcc assumes that the callee
- * owns the argument space and could overwrite it, e.g.
- * tailcall optimization. So, to be absolutely safe
- * we also save and restore enough stack bytes to cover
- * the argument area.
- */
- memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr,
- MIN_STACK_SIZE(addr));
- regs->flags &= ~X86_EFLAGS_IF;
- trace_hardirqs_off();
- regs->ip = (unsigned long)(jp->entry);
- return 1;
-}
-
-void __kprobes jprobe_return(void)
-{
- struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-
- asm volatile (
-#ifdef CONFIG_X86_64
- " xchg %%rbx,%%rsp \n"
-#else
- " xchgl %%ebx,%%esp \n"
-#endif
- " int3 \n"
- " .globl jprobe_return_end\n"
- " jprobe_return_end: \n"
- " nop \n"::"b"
- (kcb->jprobe_saved_sp):"memory");
-}
-
-int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
-{
- struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
- u8 *addr = (u8 *) (regs->ip - 1);
- struct jprobe *jp = container_of(p, struct jprobe, kp);
-
- if ((addr > (u8 *) jprobe_return) &&
- (addr < (u8 *) jprobe_return_end)) {
- if (stack_addr(regs) != kcb->jprobe_saved_sp) {
- struct pt_regs *saved_regs = &kcb->jprobe_saved_regs;
- printk(KERN_ERR
- "current sp %p does not match saved sp %p\n",
- stack_addr(regs), kcb->jprobe_saved_sp);
- printk(KERN_ERR "Saved registers for jprobe %p\n", jp);
- show_registers(saved_regs);
- printk(KERN_ERR "Current registers\n");
- show_registers(regs);
- BUG();
- }
- *regs = kcb->jprobe_saved_regs;
- memcpy((kprobe_opcode_t *)(kcb->jprobe_saved_sp),
- kcb->jprobes_stack,
- MIN_STACK_SIZE(kcb->jprobe_saved_sp));
- preempt_enable_no_resched();
- return 1;
- }
- return 0;
-}
-
-int __init arch_init_kprobes(void)
-{
- return 0;
-}
-
-int __kprobes arch_trampoline_kprobe(struct kprobe *p)
-{
- return 0;
-}
diff --git a/arch/x86/kernel/kprobes/Makefile b/arch/x86/kernel/kprobes/Makefile
new file mode 100644
index 000000000000..8a753432b2d4
--- /dev/null
+++ b/arch/x86/kernel/kprobes/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Makefile for kernel probes
+#
+
+obj-$(CONFIG_KPROBES) += core.o
+obj-$(CONFIG_OPTPROBES) += opt.o
+obj-$(CONFIG_KPROBES_ON_FTRACE) += ftrace.o
diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h
new file mode 100644
index 000000000000..e772276f5aa9
--- /dev/null
+++ b/arch/x86/kernel/kprobes/common.h
@@ -0,0 +1,109 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __X86_KERNEL_KPROBES_COMMON_H
+#define __X86_KERNEL_KPROBES_COMMON_H
+
+/* Kprobes and Optprobes common header */
+
+#include <asm/asm.h>
+#include <asm/frame.h>
+#include <asm/insn.h>
+
+#ifdef CONFIG_X86_64
+
+#define SAVE_REGS_STRING \
+ /* Skip cs, ip, orig_ax. */ \
+ " subq $24, %rsp\n" \
+ " pushq %rdi\n" \
+ " pushq %rsi\n" \
+ " pushq %rdx\n" \
+ " pushq %rcx\n" \
+ " pushq %rax\n" \
+ " pushq %r8\n" \
+ " pushq %r9\n" \
+ " pushq %r10\n" \
+ " pushq %r11\n" \
+ " pushq %rbx\n" \
+ " pushq %rbp\n" \
+ " pushq %r12\n" \
+ " pushq %r13\n" \
+ " pushq %r14\n" \
+ " pushq %r15\n" \
+ ENCODE_FRAME_POINTER
+
+#define RESTORE_REGS_STRING \
+ " popq %r15\n" \
+ " popq %r14\n" \
+ " popq %r13\n" \
+ " popq %r12\n" \
+ " popq %rbp\n" \
+ " popq %rbx\n" \
+ " popq %r11\n" \
+ " popq %r10\n" \
+ " popq %r9\n" \
+ " popq %r8\n" \
+ " popq %rax\n" \
+ " popq %rcx\n" \
+ " popq %rdx\n" \
+ " popq %rsi\n" \
+ " popq %rdi\n" \
+ /* Skip orig_ax, ip, cs */ \
+ " addq $24, %rsp\n"
+#else
+
+#define SAVE_REGS_STRING \
+ /* Skip cs, ip, orig_ax and gs. */ \
+ " subl $4*4, %esp\n" \
+ " pushl %fs\n" \
+ " pushl %es\n" \
+ " pushl %ds\n" \
+ " pushl %eax\n" \
+ " pushl %ebp\n" \
+ " pushl %edi\n" \
+ " pushl %esi\n" \
+ " pushl %edx\n" \
+ " pushl %ecx\n" \
+ " pushl %ebx\n" \
+ ENCODE_FRAME_POINTER
+
+#define RESTORE_REGS_STRING \
+ " popl %ebx\n" \
+ " popl %ecx\n" \
+ " popl %edx\n" \
+ " popl %esi\n" \
+ " popl %edi\n" \
+ " popl %ebp\n" \
+ " popl %eax\n" \
+ /* Skip ds, es, fs, gs, orig_ax, ip, and cs. */\
+ " addl $7*4, %esp\n"
+#endif
+
+/* Check whether the instruction can be boosted */
+extern bool can_boost(struct insn *insn, void *orig_addr);
+/* Recover the instruction if the given address is probed */
+extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf,
+ unsigned long addr);
+/*
+ * Copy an instruction and adjust the displacement if the instruction
+ * uses the %rip-relative addressing mode.
+ */
+extern int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn);
+
+/* Generate a relative-jump/call instruction */
+extern void synthesize_reljump(void *dest, void *from, void *to);
+extern void synthesize_relcall(void *dest, void *from, void *to);
+
+#ifdef CONFIG_OPTPROBES
+extern int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter);
+extern unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr);
+#else /* !CONFIG_OPTPROBES */
+static inline int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
+{
+ return 0;
+}
+static inline unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr)
+{
+ return addr;
+}
+#endif
+
+#endif
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
new file mode 100644
index 000000000000..c1fac3a9fecc
--- /dev/null
+++ b/arch/x86/kernel/kprobes/core.c
@@ -0,0 +1,1081 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Kernel Probes (KProbes)
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ *
+ * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
+ * Probes initial implementation ( includes contributions from
+ * Rusty Russell).
+ * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
+ * interface to access function arguments.
+ * 2004-Oct Jim Keniston <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
+ * <prasanna@in.ibm.com> adapted for x86_64 from i386.
+ * 2005-Mar Roland McGrath <roland@redhat.com>
+ * Fixed to handle %rip-relative addressing mode correctly.
+ * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston
+ * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
+ * <prasanna@in.ibm.com> added function-return probes.
+ * 2005-May Rusty Lynch <rusty.lynch@intel.com>
+ * Added function return probes functionality
+ * 2006-Feb Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp> added
+ * kprobe-booster and kretprobe-booster for i386.
+ * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com> added kprobe-booster
+ * and kretprobe-booster for x86-64
+ * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com>, Arjan van de Ven
+ * <arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com>
+ * unified x86 kprobes code.
+ */
+#include <linux/kprobes.h>
+#include <linux/ptrace.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/hardirq.h>
+#include <linux/preempt.h>
+#include <linux/sched/debug.h>
+#include <linux/perf_event.h>
+#include <linux/extable.h>
+#include <linux/kdebug.h>
+#include <linux/kallsyms.h>
+#include <linux/kgdb.h>
+#include <linux/ftrace.h>
+#include <linux/kasan.h>
+#include <linux/objtool.h>
+#include <linux/vmalloc.h>
+#include <linux/pgtable.h>
+#include <linux/set_memory.h>
+#include <linux/cfi.h>
+#include <linux/execmem.h>
+
+#include <asm/text-patching.h>
+#include <asm/cacheflush.h>
+#include <asm/desc.h>
+#include <linux/uaccess.h>
+#include <asm/alternative.h>
+#include <asm/insn.h>
+#include <asm/debugreg.h>
+#include <asm/ibt.h>
+
+#include "common.h"
+
+DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
+DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
+
+#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
+ (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
+ (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
+ (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
+ (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
+ << (row % 32))
+ /*
+ * Undefined/reserved opcodes, conditional jumps, Opcode Extension
+ * Groups, and some special opcodes cannot be boosted.
+ * This is non-const and volatile to keep gcc from statically
+ * optimizing it out, as variable_test_bit makes gcc think only
+ * *(unsigned long*) is used.
+ */
+static volatile u32 twobyte_is_boostable[256 / 32] = {
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* ---------------------------------------------- */
+ W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) | /* 00 */
+ W(0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1) , /* 10 */
+ W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 20 */
+ W(0x30, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */
+ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
+ W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */
+ W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1) | /* 60 */
+ W(0x70, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */
+ W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 80 */
+ W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
+ W(0xa0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* a0 */
+ W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) , /* b0 */
+ W(0xc0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
+ W(0xd0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) , /* d0 */
+ W(0xe0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* e0 */
+ W(0xf0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0) /* f0 */
+ /* ----------------------------------------------- */
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+};
+#undef W
+
+struct kretprobe_blackpoint kretprobe_blacklist[] = {
+ {"__switch_to", }, /* This function switches only current task, but
+ doesn't switch kernel stack.*/
+ {NULL, NULL} /* Terminator */
+};
+
+const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
+
+static nokprobe_inline void
+__synthesize_relative_insn(void *dest, void *from, void *to, u8 op)
+{
+ struct __arch_relative_insn {
+ u8 op;
+ s32 raddr;
+ } __packed *insn;
+
+ insn = (struct __arch_relative_insn *)dest;
+ insn->raddr = (s32)((long)(to) - ((long)(from) + 5));
+ insn->op = op;
+}
+
+/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
+void synthesize_reljump(void *dest, void *from, void *to)
+{
+ __synthesize_relative_insn(dest, from, to, JMP32_INSN_OPCODE);
+}
+NOKPROBE_SYMBOL(synthesize_reljump);
+
+/* Insert a call instruction at address 'from', which calls address 'to'.*/
+void synthesize_relcall(void *dest, void *from, void *to)
+{
+ __synthesize_relative_insn(dest, from, to, CALL_INSN_OPCODE);
+}
+NOKPROBE_SYMBOL(synthesize_relcall);
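As an illustration (made-up addresses, not part of the patch): the rel32 written by __synthesize_relative_insn() is the standard x86 relative displacement, i.e. the target minus the address of the byte following the 5-byte JMP/CALL. A minimal standalone sketch of the same arithmetic:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            /* Hypothetical addresses; mirrors the raddr computation above. */
            int64_t from = 0x81000100LL;    /* address of the new jmp/call opcode byte */
            int64_t to   = 0x81000200LL;    /* branch target                           */
            int32_t raddr = (int32_t)(to - (from + 5));

            printf("rel32 = %d (0x%x)\n", raddr, (unsigned int)raddr);  /* 251 (0xfb) */
            return 0;
    }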
+
+/*
+ * Returns true if INSN is boostable.
+ * RIP-relative instructions are adjusted at copy time in 64-bit mode.
+ */
+bool can_boost(struct insn *insn, void *addr)
+{
+ kprobe_opcode_t opcode;
+ insn_byte_t prefix;
+
+ if (search_exception_tables((unsigned long)addr))
+ return false; /* Page fault may occur on this address. */
+
+ /* 2nd-byte opcode */
+ if (insn->opcode.nbytes == 2)
+ return test_bit(insn->opcode.bytes[1],
+ (unsigned long *)twobyte_is_boostable);
+
+ if (insn->opcode.nbytes != 1)
+ return false;
+
+ for_each_insn_prefix(insn, prefix) {
+ insn_attr_t attr;
+
+ attr = inat_get_opcode_attribute(prefix);
+ /* Can't boost Address-size override prefix and CS override prefix */
+ if (prefix == 0x2e || inat_is_address_size_prefix(attr))
+ return false;
+ }
+
+ opcode = insn->opcode.bytes[0];
+
+ switch (opcode) {
+ case 0x62: /* bound */
+ case 0x70 ... 0x7f: /* Conditional jumps */
+ case 0x9a: /* Call far */
+ case 0xcc ... 0xce: /* software exceptions */
+ case 0xd6: /* (UD) */
+ case 0xd8 ... 0xdf: /* ESC */
+ case 0xe0 ... 0xe3: /* LOOP*, JCXZ */
+ case 0xe8 ... 0xe9: /* near Call, JMP */
+ case 0xeb: /* Short JMP */
+ case 0xf0 ... 0xf4: /* LOCK/REP, HLT */
+ /* ... are not boostable */
+ return false;
+ case 0xc0 ... 0xc1: /* Grp2 */
+ case 0xd0 ... 0xd3: /* Grp2 */
+ /*
+ * AMD uses nnn == 110 as SHL/SAL, but Intel makes it reserved.
+ */
+ return X86_MODRM_REG(insn->modrm.bytes[0]) != 0b110;
+ case 0xf6 ... 0xf7: /* Grp3 */
+ /* AMD uses nnn == 001 as TEST, but Intel makes it reserved. */
+ return X86_MODRM_REG(insn->modrm.bytes[0]) != 0b001;
+ case 0xfe: /* Grp4 */
+ /* Only INC and DEC are boostable */
+ return X86_MODRM_REG(insn->modrm.bytes[0]) == 0b000 ||
+ X86_MODRM_REG(insn->modrm.bytes[0]) == 0b001;
+ case 0xff: /* Grp5 */
+ /* Only INC, DEC, and indirect JMP are boostable */
+ return X86_MODRM_REG(insn->modrm.bytes[0]) == 0b000 ||
+ X86_MODRM_REG(insn->modrm.bytes[0]) == 0b001 ||
+ X86_MODRM_REG(insn->modrm.bytes[0]) == 0b100;
+ default:
+ return true;
+ }
+}
+
+static unsigned long
+__recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr)
+{
+ struct kprobe *kp;
+ bool faddr;
+
+ kp = get_kprobe((void *)addr);
+ faddr = ftrace_location(addr) == addr;
+ /*
+ * Use the current code if it is not modified by Kprobe
+ * and it cannot be modified by ftrace.
+ */
+ if (!kp && !faddr)
+ return addr;
+
+ /*
+ * Basically, kp->ainsn.insn holds the original instruction.
+ * However, a RIP-relative instruction cannot be single-stepped at a
+ * different place, so __copy_instruction() tweaks the displacement of
+ * that instruction. In that case, we can't recover the instruction
+ * from kp->ainsn.insn.
+ *
+ * On the other hand, in the case of a normal kprobe, kp->opcode has a
+ * copy of the first byte of the probed instruction, which is
+ * overwritten by int3. Since the instruction at kp->addr is not
+ * modified by kprobes except for that first byte, we can recover the
+ * original instruction from it and kp->opcode.
+ *
+ * In the case of kprobes using ftrace, we do not have a copy of
+ * the original instruction. In fact, the ftrace location might
+ * be modified at any time and could even be in an inconsistent state.
+ * Fortunately, we know that the original code is the ideal 5-byte
+ * long NOP.
+ */
+ if (copy_from_kernel_nofault(buf, (void *)addr,
+ MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
+ return 0UL;
+
+ if (faddr)
+ memcpy(buf, x86_nops[5], 5);
+ else
+ buf[0] = kp->opcode;
+ return (unsigned long)buf;
+}
+
+/*
+ * Recover the probed instruction at addr for further analysis.
+ * The caller must hold kprobe_mutex, or disable preemption, to prevent
+ * the referenced kprobes from being released.
+ * Returns zero if the instruction cannot be recovered (or the access failed).
+ */
+unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
+{
+ unsigned long __addr;
+
+ __addr = __recover_optprobed_insn(buf, addr);
+ if (__addr != addr)
+ return __addr;
+
+ return __recover_probed_insn(buf, addr);
+}
+
+/* Check if insn is INT or UD */
+static inline bool is_exception_insn(struct insn *insn)
+{
+ /* UD uses 0f escape */
+ if (insn->opcode.bytes[0] == 0x0f) {
+ /* UD0 / UD1 / UD2 */
+ return insn->opcode.bytes[1] == 0xff ||
+ insn->opcode.bytes[1] == 0xb9 ||
+ insn->opcode.bytes[1] == 0x0b;
+ }
+
+ /* INT3 / INT n / INTO / INT1 */
+ return insn->opcode.bytes[0] == 0xcc ||
+ insn->opcode.bytes[0] == 0xcd ||
+ insn->opcode.bytes[0] == 0xce ||
+ insn->opcode.bytes[0] == 0xf1;
+}
+
+/*
+ * Check if paddr is at an instruction boundary and that instruction can
+ * be probed
+ */
+static bool can_probe(unsigned long paddr)
+{
+ unsigned long addr, __addr, offset = 0;
+ struct insn insn;
+ kprobe_opcode_t buf[MAX_INSN_SIZE];
+
+ if (!kallsyms_lookup_size_offset(paddr, NULL, &offset))
+ return false;
+
+ /* Decode instructions */
+ addr = paddr - offset;
+ while (addr < paddr) {
+ /*
+ * Check if the instruction has been modified by another
+ * kprobe, in which case we replace the breakpoint with the
+ * original instruction in our buffer.
+ * Also, jump optimization will change the breakpoint to a
+ * relative jump. Since a relative jump is also a normal
+ * instruction, we simply continue if there is no kprobe here.
+ */
+ __addr = recover_probed_instruction(buf, addr);
+ if (!__addr)
+ return false;
+
+ if (insn_decode_kernel(&insn, (void *)__addr) < 0)
+ return false;
+
+#ifdef CONFIG_KGDB
+ /*
+ * If there is a dynamically installed kgdb sw breakpoint,
+ * this function should not be probed.
+ */
+ if (insn.opcode.bytes[0] == INT3_INSN_OPCODE &&
+ kgdb_has_hit_break(addr))
+ return false;
+#endif
+ addr += insn.length;
+ }
+
+ /* Check if paddr is at an instruction boundary */
+ if (addr != paddr)
+ return false;
+
+ __addr = recover_probed_instruction(buf, addr);
+ if (!__addr)
+ return false;
+
+ if (insn_decode_kernel(&insn, (void *)__addr) < 0)
+ return false;
+
+ /* INT and UD are special and should not be kprobed */
+ if (is_exception_insn(&insn))
+ return false;
+
+ if (IS_ENABLED(CONFIG_CFI)) {
+ /*
+ * The compiler generates the following instruction sequence
+ * for indirect call checks, and cfi.c decodes this:
+ *
+ * movl -<id>, %r10d ; 6 bytes
+ * addl -4(%reg), %r10d ; 4 bytes
+ * je .Ltmp1 ; 2 bytes
+ * ud2 ; <- regs->ip
+ * .Ltmp1:
+ *
+ * Also, these movl and addl are used to show the expected
+ * type, so they must not be touched.
+ */
+ if (insn.opcode.value == 0xBA)
+ offset = 12;
+ else if (insn.opcode.value == 0x3)
+ offset = 6;
+ else
+ goto out;
+
+ /* This movl/addl is used for decoding CFI. */
+ if (is_cfi_trap(addr + offset))
+ return false;
+ }
+
+out:
+ return true;
+}
+
+/* If x86 supports IBT (ENDBR) it must be skipped. */
+kprobe_opcode_t *arch_adjust_kprobe_addr(unsigned long addr, unsigned long offset,
+ bool *on_func_entry)
+{
+ if (is_endbr((u32 *)addr)) {
+ *on_func_entry = !offset || offset == 4;
+ if (*on_func_entry)
+ offset = 4;
+
+ } else {
+ *on_func_entry = !offset;
+ }
+
+ return (kprobe_opcode_t *)(addr + offset);
+}
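As a hedged illustration of the adjustment above (not part of the patch): on an IBT-enabled kernel a function usually begins with a 4-byte ENDBR, so a probe requested at offset 0 or 4 is treated as a function-entry probe and placed right after the ENDBR. A standalone sketch of that decision, where has_endbr stands in for is_endbr():

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    static uintptr_t adjust_probe_addr(uintptr_t addr, uintptr_t offset,
                                       bool has_endbr, bool *on_func_entry)
    {
            if (has_endbr) {
                    /* Offsets 0 and 4 both mean "function entry"; skip the ENDBR. */
                    *on_func_entry = (offset == 0 || offset == 4);
                    if (*on_func_entry)
                            offset = 4;
            } else {
                    *on_func_entry = (offset == 0);
            }
            return addr + offset;
    }

    int main(void)
    {
            bool entry;
            uintptr_t p = adjust_probe_addr(0x1000, 0, true, &entry);

            printf("probe at %#lx, entry=%d\n", (unsigned long)p, entry); /* 0x1004, 1 */
            return 0;
    }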
+
+/*
+ * Copy an instruction, recovering it if it has been modified by kprobes,
+ * and adjust the displacement if the instruction uses the %rip-relative
+ * addressing mode. Note that since @real will be the final location of the
+ * copied instruction, the displacement must be adjusted against @real, not @dest.
+ * This returns the length of the copied instruction, or 0 on error.
+ */
+int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn)
+{
+ kprobe_opcode_t buf[MAX_INSN_SIZE];
+ unsigned long recovered_insn = recover_probed_instruction(buf, (unsigned long)src);
+ int ret;
+
+ if (!recovered_insn || !insn)
+ return 0;
+
+ /* This can access kernel text if given address is not recovered */
+ if (copy_from_kernel_nofault(dest, (void *)recovered_insn,
+ MAX_INSN_SIZE))
+ return 0;
+
+ ret = insn_decode_kernel(insn, dest);
+ if (ret < 0)
+ return 0;
+
+ /* We cannot probe an instruction that carries an emulation-forcing prefix */
+ if (insn_has_emulate_prefix(insn))
+ return 0;
+
+ /* Another subsystem puts a breakpoint, failed to recover */
+ if (insn->opcode.bytes[0] == INT3_INSN_OPCODE)
+ return 0;
+
+ /* We should not singlestep on the exception masking instructions */
+ if (insn_masking_exception(insn))
+ return 0;
+
+#ifdef CONFIG_X86_64
+ /* Only x86_64 has RIP relative instructions */
+ if (insn_rip_relative(insn)) {
+ s64 newdisp;
+ u8 *disp;
+ /*
+ * The copied instruction uses the %rip-relative addressing
+ * mode. Adjust the displacement for the difference between
+ * the original location of this instruction and the location
+ * of the copy that will actually be run. The tricky bit here
+ * is making sure that the sign extension happens correctly in
+ * this calculation, since we need a signed 32-bit result to
+ * be sign-extended to 64 bits when it's added to the %rip
+ * value and yield the same 64-bit result that the sign-
+ * extension of the original signed 32-bit displacement would
+ * have given.
+ */
+ newdisp = (u8 *) src + (s64) insn->displacement.value
+ - (u8 *) real;
+ if ((s64) (s32) newdisp != newdisp) {
+ pr_err("Kprobes error: new displacement does not fit into s32 (%llx)\n", newdisp);
+ return 0;
+ }
+ disp = (u8 *) dest + insn_offset_displacement(insn);
+ *(s32 *) disp = (s32) newdisp;
+ }
+#endif
+ return insn->length;
+}
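A worked example of the displacement fix-up above, with hypothetical addresses (the formula is the one used in __copy_instruction(): newdisp = src + disp - real, so the out-of-line copy resolves to the same absolute target):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            /* Hypothetical addresses, for illustration only. */
            int64_t src     = 0x0000000081234560LL;  /* original instruction             */
            int64_t real    = 0x00000000a0001000LL;  /* out-of-line copy (insn slot)     */
            int64_t len     = 7;                     /* length of the copied instruction */
            int32_t disp    = -0x1000;               /* original disp32                  */
            int64_t newdisp = src + disp - real;     /* same formula as above            */

            /* Both resolve to the same target: next-instruction address + displacement. */
            printf("target(orig) = %#llx\n", (long long)(src + len + disp));
            printf("target(copy) = %#llx\n", (long long)(real + len + newdisp));
            /* The copy is rejected when newdisp does not fit in an s32. */
            printf("fits in s32: %d\n", (int64_t)(int32_t)newdisp == newdisp);
            return 0;
    }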
+
+/* Prepare reljump or int3 right after instruction */
+static int prepare_singlestep(kprobe_opcode_t *buf, struct kprobe *p,
+ struct insn *insn)
+{
+ int len = insn->length;
+
+ if (!IS_ENABLED(CONFIG_PREEMPTION) &&
+ !p->post_handler && can_boost(insn, p->addr) &&
+ MAX_INSN_SIZE - len >= JMP32_INSN_SIZE) {
+ /*
+ * These instructions can be executed directly if they
+ * jump back to the correct address.
+ */
+ synthesize_reljump(buf + len, p->ainsn.insn + len,
+ p->addr + insn->length);
+ len += JMP32_INSN_SIZE;
+ p->ainsn.boostable = 1;
+ } else {
+ /* Otherwise, put an int3 for trapping singlestep */
+ if (MAX_INSN_SIZE - len < INT3_INSN_SIZE)
+ return -ENOSPC;
+
+ buf[len] = INT3_INSN_OPCODE;
+ len += INT3_INSN_SIZE;
+ }
+
+ return len;
+}
+
+/* Kprobe x86 instruction emulation - only regs->ip or IF flag modifiers */
+
+static void kprobe_emulate_ifmodifiers(struct kprobe *p, struct pt_regs *regs)
+{
+ switch (p->ainsn.opcode) {
+ case 0xfa: /* cli */
+ regs->flags &= ~(X86_EFLAGS_IF);
+ break;
+ case 0xfb: /* sti */
+ regs->flags |= X86_EFLAGS_IF;
+ break;
+ case 0x9c: /* pushf */
+ int3_emulate_push(regs, regs->flags);
+ break;
+ case 0x9d: /* popf */
+ regs->flags = int3_emulate_pop(regs);
+ break;
+ }
+ regs->ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size;
+}
+NOKPROBE_SYMBOL(kprobe_emulate_ifmodifiers);
+
+static void kprobe_emulate_ret(struct kprobe *p, struct pt_regs *regs)
+{
+ int3_emulate_ret(regs);
+}
+NOKPROBE_SYMBOL(kprobe_emulate_ret);
+
+static void kprobe_emulate_call(struct kprobe *p, struct pt_regs *regs)
+{
+ unsigned long func = regs->ip - INT3_INSN_SIZE + p->ainsn.size;
+
+ func += p->ainsn.rel32;
+ int3_emulate_call(regs, func);
+}
+NOKPROBE_SYMBOL(kprobe_emulate_call);
+
+static void kprobe_emulate_jmp(struct kprobe *p, struct pt_regs *regs)
+{
+ unsigned long ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size;
+
+ ip += p->ainsn.rel32;
+ int3_emulate_jmp(regs, ip);
+}
+NOKPROBE_SYMBOL(kprobe_emulate_jmp);
+
+static void kprobe_emulate_jcc(struct kprobe *p, struct pt_regs *regs)
+{
+ unsigned long ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size;
+
+ int3_emulate_jcc(regs, p->ainsn.jcc.type, ip, p->ainsn.rel32);
+}
+NOKPROBE_SYMBOL(kprobe_emulate_jcc);
+
+static void kprobe_emulate_loop(struct kprobe *p, struct pt_regs *regs)
+{
+ unsigned long ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size;
+ bool match;
+
+ if (p->ainsn.loop.type != 3) { /* LOOP* */
+ if (p->ainsn.loop.asize == 32)
+ match = ((*(u32 *)&regs->cx)--) != 0;
+#ifdef CONFIG_X86_64
+ else if (p->ainsn.loop.asize == 64)
+ match = ((*(u64 *)&regs->cx)--) != 0;
+#endif
+ else
+ match = ((*(u16 *)&regs->cx)--) != 0;
+ } else { /* JCXZ */
+ if (p->ainsn.loop.asize == 32)
+ match = *(u32 *)(&regs->cx) == 0;
+#ifdef CONFIG_X86_64
+ else if (p->ainsn.loop.asize == 64)
+ match = *(u64 *)(&regs->cx) == 0;
+#endif
+ else
+ match = *(u16 *)(&regs->cx) == 0;
+ }
+
+ if (p->ainsn.loop.type == 0) /* LOOPNE */
+ match = match && !(regs->flags & X86_EFLAGS_ZF);
+ else if (p->ainsn.loop.type == 1) /* LOOPE */
+ match = match && (regs->flags & X86_EFLAGS_ZF);
+
+ if (match)
+ ip += p->ainsn.rel32;
+ int3_emulate_jmp(regs, ip);
+}
+NOKPROBE_SYMBOL(kprobe_emulate_loop);
+
+static const int addrmode_regoffs[] = {
+ offsetof(struct pt_regs, ax),
+ offsetof(struct pt_regs, cx),
+ offsetof(struct pt_regs, dx),
+ offsetof(struct pt_regs, bx),
+ offsetof(struct pt_regs, sp),
+ offsetof(struct pt_regs, bp),
+ offsetof(struct pt_regs, si),
+ offsetof(struct pt_regs, di),
+#ifdef CONFIG_X86_64
+ offsetof(struct pt_regs, r8),
+ offsetof(struct pt_regs, r9),
+ offsetof(struct pt_regs, r10),
+ offsetof(struct pt_regs, r11),
+ offsetof(struct pt_regs, r12),
+ offsetof(struct pt_regs, r13),
+ offsetof(struct pt_regs, r14),
+ offsetof(struct pt_regs, r15),
+#endif
+};
+
+static void kprobe_emulate_call_indirect(struct kprobe *p, struct pt_regs *regs)
+{
+ unsigned long offs = addrmode_regoffs[p->ainsn.indirect.reg];
+
+ int3_emulate_push(regs, regs->ip - INT3_INSN_SIZE + p->ainsn.size);
+ int3_emulate_jmp(regs, regs_get_register(regs, offs));
+}
+NOKPROBE_SYMBOL(kprobe_emulate_call_indirect);
+
+static void kprobe_emulate_jmp_indirect(struct kprobe *p, struct pt_regs *regs)
+{
+ unsigned long offs = addrmode_regoffs[p->ainsn.indirect.reg];
+
+ int3_emulate_jmp(regs, regs_get_register(regs, offs));
+}
+NOKPROBE_SYMBOL(kprobe_emulate_jmp_indirect);
+
+static int prepare_emulation(struct kprobe *p, struct insn *insn)
+{
+ insn_byte_t opcode = insn->opcode.bytes[0];
+
+ switch (opcode) {
+ case 0xfa: /* cli */
+ case 0xfb: /* sti */
+ case 0x9c: /* pushfl */
+ case 0x9d: /* popf/popfd */
+ /*
+ * IF modifiers must be emulated since they would enable interrupts
+ * during int3 single-stepping.
+ */
+ p->ainsn.emulate_op = kprobe_emulate_ifmodifiers;
+ p->ainsn.opcode = opcode;
+ break;
+ case 0xc2: /* ret/lret */
+ case 0xc3:
+ case 0xca:
+ case 0xcb:
+ p->ainsn.emulate_op = kprobe_emulate_ret;
+ break;
+ case 0x9a: /* far call absolute -- segment is not supported */
+ case 0xea: /* far jmp absolute -- segment is not supported */
+ case 0xcc: /* int3 */
+ case 0xcf: /* iret -- in-kernel IRET is not supported */
+ return -EOPNOTSUPP;
+ break;
+ case 0xe8: /* near call relative */
+ p->ainsn.emulate_op = kprobe_emulate_call;
+ if (insn->immediate.nbytes == 2)
+ p->ainsn.rel32 = *(s16 *)&insn->immediate.value;
+ else
+ p->ainsn.rel32 = *(s32 *)&insn->immediate.value;
+ break;
+ case 0xeb: /* short jump relative */
+ case 0xe9: /* near jump relative */
+ p->ainsn.emulate_op = kprobe_emulate_jmp;
+ if (insn->immediate.nbytes == 1)
+ p->ainsn.rel32 = *(s8 *)&insn->immediate.value;
+ else if (insn->immediate.nbytes == 2)
+ p->ainsn.rel32 = *(s16 *)&insn->immediate.value;
+ else
+ p->ainsn.rel32 = *(s32 *)&insn->immediate.value;
+ break;
+ case 0x70 ... 0x7f:
+ /* 1 byte conditional jump */
+ p->ainsn.emulate_op = kprobe_emulate_jcc;
+ p->ainsn.jcc.type = opcode & 0xf;
+ p->ainsn.rel32 = insn->immediate.value;
+ break;
+ case 0x0f:
+ opcode = insn->opcode.bytes[1];
+ if ((opcode & 0xf0) == 0x80) {
+ /* 2 bytes Conditional Jump */
+ p->ainsn.emulate_op = kprobe_emulate_jcc;
+ p->ainsn.jcc.type = opcode & 0xf;
+ if (insn->immediate.nbytes == 2)
+ p->ainsn.rel32 = *(s16 *)&insn->immediate.value;
+ else
+ p->ainsn.rel32 = *(s32 *)&insn->immediate.value;
+ } else if (opcode == 0x01 &&
+ X86_MODRM_REG(insn->modrm.bytes[0]) == 0 &&
+ X86_MODRM_MOD(insn->modrm.bytes[0]) == 3) {
+ /* VM extensions - not supported */
+ return -EOPNOTSUPP;
+ }
+ break;
+ case 0xe0: /* Loop NZ */
+ case 0xe1: /* Loop */
+ case 0xe2: /* Loop */
+ case 0xe3: /* J*CXZ */
+ p->ainsn.emulate_op = kprobe_emulate_loop;
+ p->ainsn.loop.type = opcode & 0x3;
+ p->ainsn.loop.asize = insn->addr_bytes * 8;
+ p->ainsn.rel32 = *(s8 *)&insn->immediate.value;
+ break;
+ case 0xff:
+ /*
+ * Since the 0xff is an extended group opcode, the instruction
+ * is determined by the MOD/RM byte.
+ */
+ opcode = insn->modrm.bytes[0];
+ switch (X86_MODRM_REG(opcode)) {
+ case 0b010: /* FF /2, call near, absolute indirect */
+ p->ainsn.emulate_op = kprobe_emulate_call_indirect;
+ break;
+ case 0b100: /* FF /4, jmp near, absolute indirect */
+ p->ainsn.emulate_op = kprobe_emulate_jmp_indirect;
+ break;
+ case 0b011: /* FF /3, call far, absolute indirect */
+ case 0b101: /* FF /5, jmp far, absolute indirect */
+ return -EOPNOTSUPP;
+ }
+
+ if (!p->ainsn.emulate_op)
+ break;
+
+ if (insn->addr_bytes != sizeof(unsigned long))
+ return -EOPNOTSUPP; /* Don't support different size */
+ if (X86_MODRM_MOD(opcode) != 3)
+ return -EOPNOTSUPP; /* TODO: support memory addressing */
+
+ p->ainsn.indirect.reg = X86_MODRM_RM(opcode);
+#ifdef CONFIG_X86_64
+ if (X86_REX_B(insn->rex_prefix.value))
+ p->ainsn.indirect.reg += 8;
+#endif
+ break;
+ default:
+ break;
+ }
+ p->ainsn.size = insn->length;
+
+ return 0;
+}
+
+static int arch_copy_kprobe(struct kprobe *p)
+{
+ struct insn insn;
+ kprobe_opcode_t buf[MAX_INSN_SIZE];
+ int ret, len;
+
+ /* Copy the instruction, recovering it if another optprobe has modified it. */
+ len = __copy_instruction(buf, p->addr, p->ainsn.insn, &insn);
+ if (!len)
+ return -EINVAL;
+
+ /* Analyze the opcode and setup emulate functions */
+ ret = prepare_emulation(p, &insn);
+ if (ret < 0)
+ return ret;
+
+ /* Add int3 for single-step or booster jmp */
+ len = prepare_singlestep(buf, p, &insn);
+ if (len < 0)
+ return len;
+
+ /* Also, displacement change doesn't affect the first byte */
+ p->opcode = buf[0];
+
+ p->ainsn.tp_len = len;
+ perf_event_text_poke(p->ainsn.insn, NULL, 0, buf, len);
+
+ /* OK, write back the instruction(s) into ROX insn buffer */
+ text_poke(p->ainsn.insn, buf, len);
+
+ return 0;
+}
+
+int arch_prepare_kprobe(struct kprobe *p)
+{
+ int ret;
+
+ if (alternatives_text_reserved(p->addr, p->addr))
+ return -EINVAL;
+
+ if (!can_probe((unsigned long)p->addr))
+ return -EILSEQ;
+
+ memset(&p->ainsn, 0, sizeof(p->ainsn));
+
+ /* insn: must be on special executable page on x86. */
+ p->ainsn.insn = get_insn_slot();
+ if (!p->ainsn.insn)
+ return -ENOMEM;
+
+ ret = arch_copy_kprobe(p);
+ if (ret) {
+ free_insn_slot(p->ainsn.insn, 0);
+ p->ainsn.insn = NULL;
+ }
+
+ return ret;
+}
+
+void arch_arm_kprobe(struct kprobe *p)
+{
+ u8 int3 = INT3_INSN_OPCODE;
+
+ text_poke(p->addr, &int3, 1);
+ smp_text_poke_sync_each_cpu();
+ perf_event_text_poke(p->addr, &p->opcode, 1, &int3, 1);
+}
+
+void arch_disarm_kprobe(struct kprobe *p)
+{
+ u8 int3 = INT3_INSN_OPCODE;
+
+ perf_event_text_poke(p->addr, &int3, 1, &p->opcode, 1);
+ text_poke(p->addr, &p->opcode, 1);
+ smp_text_poke_sync_each_cpu();
+}
+
+void arch_remove_kprobe(struct kprobe *p)
+{
+ if (p->ainsn.insn) {
+ /* Record the perf event before freeing the slot */
+ perf_event_text_poke(p->ainsn.insn, p->ainsn.insn,
+ p->ainsn.tp_len, NULL, 0);
+ free_insn_slot(p->ainsn.insn, p->ainsn.boostable);
+ p->ainsn.insn = NULL;
+ }
+}
+
+static nokprobe_inline void
+save_previous_kprobe(struct kprobe_ctlblk *kcb)
+{
+ kcb->prev_kprobe.kp = kprobe_running();
+ kcb->prev_kprobe.status = kcb->kprobe_status;
+ kcb->prev_kprobe.old_flags = kcb->kprobe_old_flags;
+ kcb->prev_kprobe.saved_flags = kcb->kprobe_saved_flags;
+}
+
+static nokprobe_inline void
+restore_previous_kprobe(struct kprobe_ctlblk *kcb)
+{
+ __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
+ kcb->kprobe_status = kcb->prev_kprobe.status;
+ kcb->kprobe_old_flags = kcb->prev_kprobe.old_flags;
+ kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags;
+}
+
+static nokprobe_inline void
+set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
+{
+ __this_cpu_write(current_kprobe, p);
+ kcb->kprobe_saved_flags = kcb->kprobe_old_flags
+ = (regs->flags & X86_EFLAGS_IF);
+}
+
+static void kprobe_post_process(struct kprobe *cur, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
+{
+ /* Restore back the original saved kprobes variables and continue. */
+ if (kcb->kprobe_status == KPROBE_REENTER) {
+ /* This will restore both kcb and current_kprobe */
+ restore_previous_kprobe(kcb);
+ } else {
+ /*
+ * Always update the kcb status because
+ * reset_current_kprobe() doesn't update kcb.
+ */
+ kcb->kprobe_status = KPROBE_HIT_SSDONE;
+ if (cur->post_handler)
+ cur->post_handler(cur, regs, 0);
+ reset_current_kprobe();
+ }
+}
+NOKPROBE_SYMBOL(kprobe_post_process);
+
+static void setup_singlestep(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb, int reenter)
+{
+ if (setup_detour_execution(p, regs, reenter))
+ return;
+
+#if !defined(CONFIG_PREEMPTION)
+ if (p->ainsn.boostable) {
+ /* Boost up -- we can execute copied instructions directly */
+ if (!reenter)
+ reset_current_kprobe();
+ /*
+ * Reentering boosted probe doesn't reset current_kprobe,
+ * nor set current_kprobe, because it doesn't use single
+ * stepping.
+ */
+ regs->ip = (unsigned long)p->ainsn.insn;
+ return;
+ }
+#endif
+ if (reenter) {
+ save_previous_kprobe(kcb);
+ set_current_kprobe(p, regs, kcb);
+ kcb->kprobe_status = KPROBE_REENTER;
+ } else
+ kcb->kprobe_status = KPROBE_HIT_SS;
+
+ if (p->ainsn.emulate_op) {
+ p->ainsn.emulate_op(p, regs);
+ kprobe_post_process(p, regs, kcb);
+ return;
+ }
+
+ /* Disable interrupt, and set ip register on trampoline */
+ regs->flags &= ~X86_EFLAGS_IF;
+ regs->ip = (unsigned long)p->ainsn.insn;
+}
+NOKPROBE_SYMBOL(setup_singlestep);
+
+/*
+ * Called after single-stepping. p->addr is the address of the
+ * instruction whose first byte has been replaced by the "int3"
+ * instruction. To avoid the SMP problems that can occur when we
+ * temporarily put back the original opcode to single-step, we
+ * single-stepped a copy of the instruction. The address of this
+ * copy is p->ainsn.insn. Instead of the trap flag, we place another "int3"
+ * right after the copied instruction.
+ * Unlike trap-based single-stepping, "int3" single-stepping cannot handle
+ * instructions that change the ip register, e.g. jmp, call and conditional
+ * jmp, nor instructions that change the IF flag, because interrupts must be
+ * disabled around the single-stepping.
+ * Such instructions are emulated in software; the others are single-stepped
+ * using "int3".
+ *
+ * When the second "int3" is handled, regs->ip and regs->flags need to
+ * be adjusted so that we can resume execution at the correct code.
+ */
+static void resume_singlestep(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
+{
+ unsigned long copy_ip = (unsigned long)p->ainsn.insn;
+ unsigned long orig_ip = (unsigned long)p->addr;
+
+ /* Restore saved interrupt flag and ip register */
+ regs->flags |= kcb->kprobe_saved_flags;
+ /* Note that regs->ip points past the executed int3, so step it back */
+ regs->ip += (orig_ip - copy_ip) - INT3_INSN_SIZE;
+}
+NOKPROBE_SYMBOL(resume_singlestep);
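A quick sanity check of the ip fix-up above, with made-up numbers: when the trailing int3 traps, regs->ip sits just past it (copy_ip + insn_len + 1), and adding (orig_ip - copy_ip) - 1 lands exactly on orig_ip + insn_len, the instruction following the probed one.

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            /* Hypothetical values, for illustration only. */
            uint64_t orig_ip  = 0x1000;  /* p->addr: probed instruction      */
            uint64_t copy_ip  = 0x9000;  /* p->ainsn.insn: single-step copy  */
            uint64_t insn_len = 3;       /* length of the copied instruction */
            uint64_t ip       = copy_ip + insn_len + 1;  /* after the trailing int3 */

            ip += (orig_ip - copy_ip) - 1;  /* the resume_singlestep() adjustment */
            printf("resumes at %#llx (expected %#llx)\n",
                   (unsigned long long)ip, (unsigned long long)(orig_ip + insn_len));
            return 0;
    }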
+
+/*
+ * We have reentered the kprobe_handler(), since another probe was hit while
+ * within the handler. We save the original kprobes variables and just single
+ * step on the instruction of the new probe without calling any user handlers.
+ */
+static int reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
+{
+ switch (kcb->kprobe_status) {
+ case KPROBE_HIT_SSDONE:
+ case KPROBE_HIT_ACTIVE:
+ case KPROBE_HIT_SS:
+ kprobes_inc_nmissed_count(p);
+ setup_singlestep(p, regs, kcb, 1);
+ break;
+ case KPROBE_REENTER:
+ /* A probe has been hit in the codepath leading up to, or just
+ * after, single-stepping of a probed instruction. This entire
+ * codepath should strictly reside in the .kprobes.text section.
+ * Raise a BUG, or we'll continue in an endless reentry loop
+ * and eventually overflow the stack.
+ */
+ pr_err("Unrecoverable kprobe detected.\n");
+ dump_kprobe(p);
+ BUG();
+ default:
+ /* impossible cases */
+ WARN_ON(1);
+ return 0;
+ }
+
+ return 1;
+}
+NOKPROBE_SYMBOL(reenter_kprobe);
+
+static nokprobe_inline int kprobe_is_ss(struct kprobe_ctlblk *kcb)
+{
+ return (kcb->kprobe_status == KPROBE_HIT_SS ||
+ kcb->kprobe_status == KPROBE_REENTER);
+}
+
+/*
+ * Interrupts are disabled on entry as trap3 is an interrupt gate and they
+ * remain disabled throughout this function.
+ */
+int kprobe_int3_handler(struct pt_regs *regs)
+{
+ kprobe_opcode_t *addr;
+ struct kprobe *p;
+ struct kprobe_ctlblk *kcb;
+
+ if (user_mode(regs))
+ return 0;
+
+ addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
+ /*
+ * We don't want to be preempted for the entire duration of kprobe
+ * processing. Since the int3 and debug traps disable irqs and we clear
+ * IF while single-stepping, this code must not be preemptible.
+ */
+
+ kcb = get_kprobe_ctlblk();
+ p = get_kprobe(addr);
+
+ if (p) {
+ if (kprobe_running()) {
+ if (reenter_kprobe(p, regs, kcb))
+ return 1;
+ } else {
+ set_current_kprobe(p, regs, kcb);
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+
+ /*
+ * If we have no pre-handler or it returned 0, we
+ * continue with normal processing. If we have a
+ * pre-handler and it returned non-zero, that means the
+ * user handler set up the registers to exit to another
+ * instruction, so we must skip the single stepping.
+ */
+ if (!p->pre_handler || !p->pre_handler(p, regs))
+ setup_singlestep(p, regs, kcb, 0);
+ else
+ reset_current_kprobe();
+ return 1;
+ }
+ } else if (kprobe_is_ss(kcb)) {
+ p = kprobe_running();
+ if ((unsigned long)p->ainsn.insn < regs->ip &&
+ (unsigned long)p->ainsn.insn + MAX_INSN_SIZE > regs->ip) {
+ /* Most probably this is the second int3 for the single step */
+ resume_singlestep(p, regs, kcb);
+ kprobe_post_process(p, regs, kcb);
+ return 1;
+ }
+ } /* else: not a kprobe fault; let the kernel handle it */
+
+ return 0;
+}
+NOKPROBE_SYMBOL(kprobe_int3_handler);
+
+int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
+{
+ struct kprobe *cur = kprobe_running();
+ struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+ if (unlikely(regs->ip == (unsigned long)cur->ainsn.insn)) {
+ /* This must happen on single-stepping */
+ WARN_ON(kcb->kprobe_status != KPROBE_HIT_SS &&
+ kcb->kprobe_status != KPROBE_REENTER);
+ /*
+ * We are here because the instruction being single
+ * stepped caused a page fault. We reset the current
+ * kprobe, point the ip back to the probe address, and
+ * allow the page fault handler to continue as a
+ * normal page fault.
+ */
+ regs->ip = (unsigned long)cur->addr;
+
+ /*
+ * If the IF flag was set before the kprobe hit,
+ * don't touch it:
+ */
+ regs->flags |= kcb->kprobe_old_flags;
+
+ if (kcb->kprobe_status == KPROBE_REENTER)
+ restore_previous_kprobe(kcb);
+ else
+ reset_current_kprobe();
+ }
+
+ return 0;
+}
+NOKPROBE_SYMBOL(kprobe_fault_handler);
+
+int __init arch_populate_kprobe_blacklist(void)
+{
+ return kprobe_add_area_blacklist((unsigned long)__entry_text_start,
+ (unsigned long)__entry_text_end);
+}
+
+int __init arch_init_kprobes(void)
+{
+ return 0;
+}
+
+int arch_trampoline_kprobe(struct kprobe *p)
+{
+ return 0;
+}
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
new file mode 100644
index 000000000000..2be55ec3f392
--- /dev/null
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Dynamic Ftrace based Kprobes Optimization
+ *
+ * Copyright (C) Hitachi Ltd., 2012
+ */
+#include <linux/kprobes.h>
+#include <linux/ptrace.h>
+#include <linux/hardirq.h>
+#include <linux/preempt.h>
+#include <linux/ftrace.h>
+#include <asm/text-patching.h>
+
+#include "common.h"
+
+/* Ftrace callback handler for kprobes -- called under preempt disabled */
+void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ops, struct ftrace_regs *fregs)
+{
+ struct pt_regs *regs = ftrace_get_regs(fregs);
+ struct kprobe *p;
+ struct kprobe_ctlblk *kcb;
+ int bit;
+
+ if (unlikely(kprobe_ftrace_disabled))
+ return;
+
+ bit = ftrace_test_recursion_trylock(ip, parent_ip);
+ if (bit < 0)
+ return;
+
+ p = get_kprobe((kprobe_opcode_t *)ip);
+ if (unlikely(!p) || kprobe_disabled(p))
+ goto out;
+
+ kcb = get_kprobe_ctlblk();
+ if (kprobe_running()) {
+ kprobes_inc_nmissed_count(p);
+ } else {
+ unsigned long orig_ip = instruction_pointer(regs);
+
+ /* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */
+ instruction_pointer_set(regs, ip + INT3_INSN_SIZE);
+
+ __this_cpu_write(current_kprobe, p);
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+ if (!p->pre_handler || !p->pre_handler(p, regs)) {
+ if (unlikely(p->post_handler)) {
+ /*
+ * Emulate singlestep (and also recover regs->ip)
+ * as if there were a 5-byte NOP
+ */
+ instruction_pointer_set(regs, ip + MCOUNT_INSN_SIZE);
+ kcb->kprobe_status = KPROBE_HIT_SSDONE;
+ p->post_handler(p, regs, 0);
+ }
+ /* Recover IP address */
+ instruction_pointer_set(regs, orig_ip);
+ }
+ /*
+ * If pre_handler returns !0, it changes regs->ip. We have to
+ * skip emulating post_handler.
+ */
+ __this_cpu_write(current_kprobe, NULL);
+ }
+out:
+ ftrace_test_recursion_unlock(bit);
+}
+NOKPROBE_SYMBOL(kprobe_ftrace_handler);
+
+int arch_prepare_kprobe_ftrace(struct kprobe *p)
+{
+ p->ainsn.insn = NULL;
+ p->ainsn.boostable = false;
+ return 0;
+}
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
new file mode 100644
index 000000000000..6f826a00eca2
--- /dev/null
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -0,0 +1,551 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Kernel Probes Jump Optimization (Optprobes)
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ * Copyright (C) Hitachi Ltd., 2012
+ */
+#include <linux/kprobes.h>
+#include <linux/perf_event.h>
+#include <linux/ptrace.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/hardirq.h>
+#include <linux/preempt.h>
+#include <linux/extable.h>
+#include <linux/kdebug.h>
+#include <linux/kallsyms.h>
+#include <linux/kgdb.h>
+#include <linux/ftrace.h>
+#include <linux/objtool.h>
+#include <linux/pgtable.h>
+#include <linux/static_call.h>
+
+#include <asm/text-patching.h>
+#include <asm/cacheflush.h>
+#include <asm/desc.h>
+#include <linux/uaccess.h>
+#include <asm/alternative.h>
+#include <asm/insn.h>
+#include <asm/debugreg.h>
+#include <asm/set_memory.h>
+#include <asm/sections.h>
+#include <asm/nospec-branch.h>
+
+#include "common.h"
+
+unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr)
+{
+ struct optimized_kprobe *op;
+ struct kprobe *kp;
+ long offs;
+ int i;
+
+ for (i = 0; i < JMP32_INSN_SIZE; i++) {
+ kp = get_kprobe((void *)addr - i);
+ /* This function only handles jump-optimized kprobe */
+ if (kp && kprobe_optimized(kp)) {
+ op = container_of(kp, struct optimized_kprobe, kp);
+ /* If op is optimized or under unoptimizing */
+ if (list_empty(&op->list) || optprobe_queued_unopt(op))
+ goto found;
+ }
+ }
+
+ return addr;
+found:
+ /*
+ * If the kprobe can be optimized, the original bytes may have been
+ * overwritten by the jump destination address. In this case, the original
+ * bytes must be recovered from the op->optinsn.copied_insn buffer.
+ */
+ if (copy_from_kernel_nofault(buf, (void *)addr,
+ MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
+ return 0UL;
+
+ if (addr == (unsigned long)kp->addr) {
+ buf[0] = kp->opcode;
+ memcpy(buf + 1, op->optinsn.copied_insn, DISP32_SIZE);
+ } else {
+ offs = addr - (unsigned long)kp->addr - 1;
+ memcpy(buf, op->optinsn.copied_insn + offs, DISP32_SIZE - offs);
+ }
+
+ return (unsigned long)buf;
+}
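To make the recovery above concrete, a small illustrative sketch with made-up bytes: a jump-optimized probe site holds a 5-byte JMP.d32, the original first byte lives in kp->opcode, and the original bytes 1..4 live in op->optinsn.copied_insn.

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            /* Hypothetical bytes, for illustration only. */
            uint8_t opcode    = 0x55;                        /* kp->opcode: original byte 0         */
            uint8_t copied[4] = { 0x48, 0x89, 0xe5, 0x41 };  /* op->optinsn.copied_insn: bytes 1..4 */
            uint8_t buf[5];

            /* Recover the original 5 bytes that the JMP.d32 overwrote. */
            buf[0] = opcode;
            memcpy(buf + 1, copied, sizeof(copied));

            for (int i = 0; i < 5; i++)
                    printf("%02x ", buf[i]);
            printf("\n");                                    /* 55 48 89 e5 41 */
            return 0;
    }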
+
+static void synthesize_clac(kprobe_opcode_t *addr)
+{
+ /*
+ * Can't be static_cpu_has() due to how objtool treats this feature bit.
+ * This isn't a fast path anyway.
+ */
+ if (!boot_cpu_has(X86_FEATURE_SMAP))
+ return;
+
+ /* Replace the NOP3 with CLAC */
+ addr[0] = 0x0f;
+ addr[1] = 0x01;
+ addr[2] = 0xca;
+}
+
+/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
+static void synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val)
+{
+#ifdef CONFIG_X86_64
+ *addr++ = 0x48;
+ *addr++ = 0xbf;
+#else
+ *addr++ = 0xb8;
+#endif
+ *(unsigned long *)addr = val;
+}
+
+asm (
+ ".pushsection .rodata\n"
+ ".global optprobe_template_entry\n"
+ "optprobe_template_entry:\n"
+#ifdef CONFIG_X86_64
+ " pushq $" __stringify(__KERNEL_DS) "\n"
+ /* Save the 'sp - 8', this will be fixed later. */
+ " pushq %rsp\n"
+ " pushfq\n"
+ ".global optprobe_template_clac\n"
+ "optprobe_template_clac:\n"
+ ASM_NOP3
+ SAVE_REGS_STRING
+ " movq %rsp, %rsi\n"
+ ".global optprobe_template_val\n"
+ "optprobe_template_val:\n"
+ ASM_NOP5
+ ASM_NOP5
+ ".global optprobe_template_call\n"
+ "optprobe_template_call:\n"
+ ASM_NOP5
+ /* Copy 'regs->flags' into 'regs->ss'. */
+ " movq 18*8(%rsp), %rdx\n"
+ " movq %rdx, 20*8(%rsp)\n"
+ RESTORE_REGS_STRING
+ /* Skip 'regs->flags' and 'regs->sp'. */
+ " addq $16, %rsp\n"
+ /* And pop flags register from 'regs->ss'. */
+ " popfq\n"
+#else /* CONFIG_X86_32 */
+ " pushl %ss\n"
+ /* Save the 'sp - 4', this will be fixed later. */
+ " pushl %esp\n"
+ " pushfl\n"
+ ".global optprobe_template_clac\n"
+ "optprobe_template_clac:\n"
+ ASM_NOP3
+ SAVE_REGS_STRING
+ " movl %esp, %edx\n"
+ ".global optprobe_template_val\n"
+ "optprobe_template_val:\n"
+ ASM_NOP5
+ ".global optprobe_template_call\n"
+ "optprobe_template_call:\n"
+ ASM_NOP5
+ /* Copy 'regs->flags' into 'regs->ss'. */
+ " movl 14*4(%esp), %edx\n"
+ " movl %edx, 16*4(%esp)\n"
+ RESTORE_REGS_STRING
+ /* Skip 'regs->flags' and 'regs->sp'. */
+ " addl $8, %esp\n"
+ /* And pop flags register from 'regs->ss'. */
+ " popfl\n"
+#endif
+ ".global optprobe_template_end\n"
+ "optprobe_template_end:\n"
+ ".popsection\n");
+
+#define TMPL_CLAC_IDX \
+ ((long)optprobe_template_clac - (long)optprobe_template_entry)
+#define TMPL_MOVE_IDX \
+ ((long)optprobe_template_val - (long)optprobe_template_entry)
+#define TMPL_CALL_IDX \
+ ((long)optprobe_template_call - (long)optprobe_template_entry)
+#define TMPL_END_IDX \
+ ((long)optprobe_template_end - (long)optprobe_template_entry)
+
+/* Optimized kprobe callback function: called from optinsn */
+static void
+optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
+{
+ /* This is possible if op is under delayed unoptimizing */
+ if (kprobe_disabled(&op->kp))
+ return;
+
+ preempt_disable();
+ if (kprobe_running()) {
+ kprobes_inc_nmissed_count(&op->kp);
+ } else {
+ struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+ /* Adjust stack pointer */
+ regs->sp += sizeof(long);
+ /* Save skipped registers */
+ regs->cs = __KERNEL_CS;
+#ifdef CONFIG_X86_32
+ regs->gs = 0;
+#endif
+ regs->ip = (unsigned long)op->kp.addr + INT3_INSN_SIZE;
+ regs->orig_ax = ~0UL;
+
+ __this_cpu_write(current_kprobe, &op->kp);
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+ opt_pre_handler(&op->kp, regs);
+ __this_cpu_write(current_kprobe, NULL);
+ }
+ preempt_enable();
+}
+NOKPROBE_SYMBOL(optimized_callback);
+
+static int copy_optimized_instructions(u8 *dest, u8 *src, u8 *real)
+{
+ struct insn insn;
+ int len = 0, ret;
+
+ while (len < JMP32_INSN_SIZE) {
+ ret = __copy_instruction(dest + len, src + len, real + len, &insn);
+ if (!ret || !can_boost(&insn, src + len))
+ return -EINVAL;
+ len += ret;
+ }
+ /* Check whether the address range is reserved */
+ if (ftrace_text_reserved(src, src + len - 1) ||
+ alternatives_text_reserved(src, src + len - 1) ||
+ jump_label_text_reserved(src, src + len - 1) ||
+ static_call_text_reserved(src, src + len - 1))
+ return -EBUSY;
+
+ return len;
+}
+
+/* Check whether insn is indirect jump */
+static int insn_is_indirect_jump(struct insn *insn)
+{
+ return ((insn->opcode.bytes[0] == 0xff &&
+ (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
+ insn->opcode.bytes[0] == 0xea); /* Segment based jump */
+}
+
+/* Check whether insn jumps into specified address range */
+static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
+{
+ unsigned long target = 0;
+
+ switch (insn->opcode.bytes[0]) {
+ case 0xe0: /* loopne */
+ case 0xe1: /* loope */
+ case 0xe2: /* loop */
+ case 0xe3: /* jcxz */
+ case 0xe9: /* near relative jump */
+ case 0xeb: /* short relative jump */
+ break;
+ case 0x0f:
+ if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */
+ break;
+ return 0;
+ default:
+ if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */
+ break;
+ return 0;
+ }
+ target = (unsigned long)insn->next_byte + insn->immediate.value;
+
+ return (start <= target && target <= start + len);
+}
+
+/* Decode the whole function to ensure no instruction jumps into the target */
+static int can_optimize(unsigned long paddr)
+{
+ unsigned long addr, size = 0, offset = 0;
+ struct insn insn;
+ kprobe_opcode_t buf[MAX_INSN_SIZE];
+
+ /* Lookup symbol including addr */
+ if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
+ return 0;
+
+ /*
+ * Do not optimize in the entry code due to the unstable
+ * stack handling and register setup.
+ */
+ if (((paddr >= (unsigned long)__entry_text_start) &&
+ (paddr < (unsigned long)__entry_text_end)))
+ return 0;
+
+ /* Check there is enough space for a relative jump. */
+ if (size - offset < JMP32_INSN_SIZE)
+ return 0;
+
+ /* Decode instructions */
+ addr = paddr - offset;
+ while (addr < paddr - offset + size) { /* Decode until function end */
+ unsigned long recovered_insn;
+ int ret;
+
+ if (search_exception_tables(addr))
+ /*
+ * Since some fixup code will jump into this function,
+ * we can't optimize a kprobe in this function.
+ */
+ return 0;
+ recovered_insn = recover_probed_instruction(buf, addr);
+ if (!recovered_insn)
+ return 0;
+
+ ret = insn_decode_kernel(&insn, (void *)recovered_insn);
+ if (ret < 0)
+ return 0;
+#ifdef CONFIG_KGDB
+ /*
+ * If there is a dynamically installed kgdb sw breakpoint,
+ * this function should not be probed.
+ */
+ if (insn.opcode.bytes[0] == INT3_INSN_OPCODE &&
+ kgdb_has_hit_break(addr))
+ return 0;
+#endif
+ /* Recover address */
+ insn.kaddr = (void *)addr;
+ insn.next_byte = (void *)(addr + insn.length);
+ /*
+ * Check that no instruction jumps into the target, either
+ * directly or indirectly.
+ *
+ * The indirect case is present to handle code with jump
+ * tables. When the kernel uses retpolines, the check should in
+ * theory additionally look for jumps to indirect thunks.
+ * However, the kernel built with retpolines or IBT has jump
+ * tables disabled so the check can be skipped altogether.
+ */
+ if (!IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) &&
+ !IS_ENABLED(CONFIG_X86_KERNEL_IBT) &&
+ insn_is_indirect_jump(&insn))
+ return 0;
+ if (insn_jump_into_range(&insn, paddr + INT3_INSN_SIZE,
+ DISP32_SIZE))
+ return 0;
+ addr += insn.length;
+ }
+
+ return 1;
+}
+
+/* Check optimized_kprobe can actually be optimized. */
+int arch_check_optimized_kprobe(struct optimized_kprobe *op)
+{
+ int i;
+ struct kprobe *p;
+
+ for (i = 1; i < op->optinsn.size; i++) {
+ p = get_kprobe(op->kp.addr + i);
+ if (p && !kprobe_disarmed(p))
+ return -EEXIST;
+ }
+
+ return 0;
+}
+
+/* Check the addr is within the optimized instructions. */
+int arch_within_optimized_kprobe(struct optimized_kprobe *op,
+ kprobe_opcode_t *addr)
+{
+ return (op->kp.addr <= addr &&
+ op->kp.addr + op->optinsn.size > addr);
+}
+
+/* Free optimized instruction slot */
+static
+void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
+{
+ u8 *slot = op->optinsn.insn;
+ if (slot) {
+ int len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE;
+
+ /* Record the perf event before freeing the slot */
+ if (dirty)
+ perf_event_text_poke(slot, slot, len, NULL, 0);
+
+ free_optinsn_slot(slot, dirty);
+ op->optinsn.insn = NULL;
+ op->optinsn.size = 0;
+ }
+}
+
+void arch_remove_optimized_kprobe(struct optimized_kprobe *op)
+{
+ __arch_remove_optimized_kprobe(op, 1);
+}
+
+/*
+ * Copy the target instructions that will be replaced.
+ * Target instructions MUST be relocatable (checked inside).
+ * This is called when a new aggr(opt)probe is allocated or reused.
+ */
+int arch_prepare_optimized_kprobe(struct optimized_kprobe *op,
+ struct kprobe *__unused)
+{
+ u8 *buf = NULL, *slot;
+ int ret, len;
+ long rel;
+
+ if (!can_optimize((unsigned long)op->kp.addr))
+ return -EILSEQ;
+
+ buf = kzalloc(MAX_OPTINSN_SIZE, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ op->optinsn.insn = slot = get_optinsn_slot();
+ if (!slot) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * Verify that the address gap is within the +/-2GB range, because
+ * this uses a relative jump.
+ */
+ rel = (long)slot - (long)op->kp.addr + JMP32_INSN_SIZE;
+ if (abs(rel) > 0x7fffffff) {
+ ret = -ERANGE;
+ goto err;
+ }
+
+ /* Copy arch-dep-instance from template */
+ memcpy(buf, optprobe_template_entry, TMPL_END_IDX);
+
+ /* Copy instructions into the out-of-line buffer */
+ ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr,
+ slot + TMPL_END_IDX);
+ if (ret < 0)
+ goto err;
+ op->optinsn.size = ret;
+ len = TMPL_END_IDX + op->optinsn.size;
+
+ synthesize_clac(buf + TMPL_CLAC_IDX);
+
+ /* Set probe information */
+ synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);
+
+ /* Set probe function call */
+ synthesize_relcall(buf + TMPL_CALL_IDX,
+ slot + TMPL_CALL_IDX, optimized_callback);
+
+ /* Set returning jmp instruction at the tail of out-of-line buffer */
+ synthesize_reljump(buf + len, slot + len,
+ (u8 *)op->kp.addr + op->optinsn.size);
+ len += JMP32_INSN_SIZE;
+
+ /*
+ * Note len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE is also
+ * used in __arch_remove_optimized_kprobe().
+ */
+
+ /* We have to use text_poke() for instruction buffer because it is RO */
+ perf_event_text_poke(slot, NULL, 0, buf, len);
+ text_poke(slot, buf, len);
+
+ ret = 0;
+out:
+ kfree(buf);
+ return ret;
+
+err:
+ __arch_remove_optimized_kprobe(op, 0);
+ goto out;
+}
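As an aside (illustrative values only): the rel range check above exists because the detour into the optinsn slot is a 5-byte JMP.d32 whose displacement is a signed 32-bit immediate, so the slot and the probe address must be within roughly 2GB of each other.

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
            /* Hypothetical addresses, for illustration only. */
            int64_t kp_addr = 0x0000000081000000LL;  /* probed address */
            int64_t slot    = 0x00000000c0000000LL;  /* optinsn slot   */
            int64_t rel     = slot - kp_addr + 5;    /* JMP32_INSN_SIZE == 5 */

            /* The displacement must fit in a signed 32-bit immediate. */
            printf("rel = %lld, in range: %d\n",
                   (long long)rel, llabs(rel) <= 0x7fffffffLL);
            return 0;
    }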
+
+/*
+ * Replace breakpoints (INT3) with relative jumps (JMP.d32).
+ * The caller must hold kprobe_mutex and text_mutex.
+ *
+ * The caller will have installed a regular kprobe and after that issued
+ * synchronize_rcu_tasks(); this ensures that the instruction(s) that live in
+ * the 4 bytes after the INT3 are unused and can now be overwritten.
+ */
+void arch_optimize_kprobes(struct list_head *oplist)
+{
+ struct optimized_kprobe *op, *tmp;
+ u8 insn_buff[JMP32_INSN_SIZE];
+
+ list_for_each_entry_safe(op, tmp, oplist, list) {
+ s32 rel = (s32)((long)op->optinsn.insn -
+ ((long)op->kp.addr + JMP32_INSN_SIZE));
+
+ WARN_ON(kprobe_disabled(&op->kp));
+
+ /* Backup instructions which will be replaced by jump address */
+ memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_INSN_SIZE,
+ DISP32_SIZE);
+
+ insn_buff[0] = JMP32_INSN_OPCODE;
+ *(s32 *)(&insn_buff[1]) = rel;
+
+ smp_text_poke_single(op->kp.addr, insn_buff, JMP32_INSN_SIZE, NULL);
+
+ list_del_init(&op->list);
+ }
+}
+
+/*
+ * Replace a relative jump (JMP.d32) with a breakpoint (INT3).
+ *
+ * After that, we can restore the 4 bytes after the INT3 to undo what
+ * arch_optimize_kprobes() scribbled. This is safe since those bytes will be
+ * unused once the INT3 lands.
+ */
+void arch_unoptimize_kprobe(struct optimized_kprobe *op)
+{
+ u8 new[JMP32_INSN_SIZE] = { INT3_INSN_OPCODE, };
+ u8 old[JMP32_INSN_SIZE];
+ u8 *addr = op->kp.addr;
+
+ memcpy(old, op->kp.addr, JMP32_INSN_SIZE);
+ memcpy(new + INT3_INSN_SIZE,
+ op->optinsn.copied_insn,
+ JMP32_INSN_SIZE - INT3_INSN_SIZE);
+
+ text_poke(addr, new, INT3_INSN_SIZE);
+ smp_text_poke_sync_each_cpu();
+ text_poke(addr + INT3_INSN_SIZE,
+ new + INT3_INSN_SIZE,
+ JMP32_INSN_SIZE - INT3_INSN_SIZE);
+ smp_text_poke_sync_each_cpu();
+
+ perf_event_text_poke(op->kp.addr, old, JMP32_INSN_SIZE, new, JMP32_INSN_SIZE);
+}
+
+/*
+ * Recover original instructions and breakpoints from relative jumps.
+ * The caller must hold kprobe_mutex.
+ */
+extern void arch_unoptimize_kprobes(struct list_head *oplist,
+ struct list_head *done_list)
+{
+ struct optimized_kprobe *op, *tmp;
+
+ list_for_each_entry_safe(op, tmp, oplist, list) {
+ arch_unoptimize_kprobe(op);
+ list_move(&op->list, done_list);
+ }
+}
+
+int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
+{
+ struct optimized_kprobe *op;
+
+ if (p->flags & KPROBE_FLAG_OPTIMIZED) {
+ /* This kprobe is really able to run optimized path. */
+ op = container_of(p, struct optimized_kprobe, kp);
+ /* Detour through copied instructions */
+ regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
+ if (!reenter)
+ reset_current_kprobe();
+ return 1;
+ }
+ return 0;
+}
+NOKPROBE_SYMBOL(setup_detour_execution);
diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c
new file mode 100644
index 000000000000..d547de9b3ed8
--- /dev/null
+++ b/arch/x86/kernel/ksysfs.c
@@ -0,0 +1,401 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Architecture specific sysfs attributes in /sys/kernel
+ *
+ * Copyright (C) 2007, Intel Corp.
+ * Huang Ying <ying.huang@intel.com>
+ * Copyright (C) 2013, 2013 Red Hat, Inc.
+ * Dave Young <dyoung@redhat.com>
+ */
+
+#include <linux/kobject.h>
+#include <linux/string.h>
+#include <linux/sysfs.h>
+#include <linux/init.h>
+#include <linux/stat.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/io.h>
+
+#include <asm/setup.h>
+
+static ssize_t version_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "0x%04x\n", boot_params.hdr.version);
+}
+
+static struct kobj_attribute boot_params_version_attr = __ATTR_RO(version);
+
+static ssize_t boot_params_data_read(struct file *fp, struct kobject *kobj,
+ const struct bin_attribute *bin_attr,
+ char *buf, loff_t off, size_t count)
+{
+ memcpy(buf, (void *)&boot_params + off, count);
+ return count;
+}
+
+static const struct bin_attribute boot_params_data_attr = {
+ .attr = {
+ .name = "data",
+ .mode = S_IRUGO,
+ },
+ .read = boot_params_data_read,
+ .size = sizeof(boot_params),
+};
+
+static struct attribute *boot_params_version_attrs[] = {
+ &boot_params_version_attr.attr,
+ NULL,
+};
+
+static const struct bin_attribute *const boot_params_data_attrs[] = {
+ &boot_params_data_attr,
+ NULL,
+};
+
+static const struct attribute_group boot_params_attr_group = {
+ .attrs = boot_params_version_attrs,
+ .bin_attrs = boot_params_data_attrs,
+};
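For what it's worth, a userspace reader for the 'version' attribute registered above might look like the sketch below; the /sys/kernel/boot_params path is assumed from how this attribute group is typically wired up and is not shown in this hunk.

    #include <stdio.h>

    int main(void)
    {
            /* Assumed sysfs path; adjust if the group is registered elsewhere. */
            FILE *f = fopen("/sys/kernel/boot_params/version", "r");
            char buf[32];

            if (f && fgets(buf, sizeof(buf), f))
                    printf("boot protocol version: %s", buf);
            if (f)
                    fclose(f);
            return 0;
    }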
+
+static int kobj_to_setup_data_nr(struct kobject *kobj, int *nr)
+{
+ const char *name;
+
+ name = kobject_name(kobj);
+ return kstrtoint(name, 10, nr);
+}
+
+static int get_setup_data_paddr(int nr, u64 *paddr)
+{
+ int i = 0;
+ struct setup_data *data;
+ u64 pa_data = boot_params.hdr.setup_data;
+
+ while (pa_data) {
+ if (nr == i) {
+ *paddr = pa_data;
+ return 0;
+ }
+ data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
+ if (!data)
+ return -ENOMEM;
+
+ pa_data = data->next;
+ memunmap(data);
+ i++;
+ }
+ return -EINVAL;
+}
+
+static int __init get_setup_data_size(int nr, size_t *size)
+{
+ u64 pa_data = boot_params.hdr.setup_data, pa_next;
+ struct setup_indirect *indirect;
+ struct setup_data *data;
+ int i = 0;
+ u32 len;
+
+ while (pa_data) {
+ data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
+ if (!data)
+ return -ENOMEM;
+ pa_next = data->next;
+
+ if (nr == i) {
+ if (data->type == SETUP_INDIRECT) {
+ len = sizeof(*data) + data->len;
+ memunmap(data);
+ data = memremap(pa_data, len, MEMREMAP_WB);
+ if (!data)
+ return -ENOMEM;
+
+ indirect = (struct setup_indirect *)data->data;
+
+ if (indirect->type != SETUP_INDIRECT)
+ *size = indirect->len;
+ else
+ *size = data->len;
+ } else {
+ *size = data->len;
+ }
+
+ memunmap(data);
+ return 0;
+ }
+
+ pa_data = pa_next;
+ memunmap(data);
+ i++;
+ }
+ return -EINVAL;
+}
+
+static ssize_t type_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct setup_indirect *indirect;
+ struct setup_data *data;
+ int nr, ret;
+ u64 paddr;
+ u32 len;
+
+ ret = kobj_to_setup_data_nr(kobj, &nr);
+ if (ret)
+ return ret;
+
+ ret = get_setup_data_paddr(nr, &paddr);
+ if (ret)
+ return ret;
+ data = memremap(paddr, sizeof(*data), MEMREMAP_WB);
+ if (!data)
+ return -ENOMEM;
+
+ if (data->type == SETUP_INDIRECT) {
+ len = sizeof(*data) + data->len;
+ memunmap(data);
+ data = memremap(paddr, len, MEMREMAP_WB);
+ if (!data)
+ return -ENOMEM;
+
+ indirect = (struct setup_indirect *)data->data;
+
+ ret = sprintf(buf, "0x%x\n", indirect->type);
+ } else {
+ ret = sprintf(buf, "0x%x\n", data->type);
+ }
+
+ memunmap(data);
+ return ret;
+}
+
+static ssize_t setup_data_data_read(struct file *fp,
+ struct kobject *kobj,
+ const struct bin_attribute *bin_attr,
+ char *buf,
+ loff_t off, size_t count)
+{
+ struct setup_indirect *indirect;
+ struct setup_data *data;
+ int nr, ret = 0;
+ u64 paddr, len;
+ void *p;
+
+ ret = kobj_to_setup_data_nr(kobj, &nr);
+ if (ret)
+ return ret;
+
+ ret = get_setup_data_paddr(nr, &paddr);
+ if (ret)
+ return ret;
+ data = memremap(paddr, sizeof(*data), MEMREMAP_WB);
+ if (!data)
+ return -ENOMEM;
+
+ if (data->type == SETUP_INDIRECT) {
+ len = sizeof(*data) + data->len;
+ memunmap(data);
+ data = memremap(paddr, len, MEMREMAP_WB);
+ if (!data)
+ return -ENOMEM;
+
+ indirect = (struct setup_indirect *)data->data;
+
+ if (indirect->type != SETUP_INDIRECT) {
+ paddr = indirect->addr;
+ len = indirect->len;
+ } else {
+ /*
+ * Even though this is technically undefined, return
+ * the data as though it is a normal setup_data struct.
+ * This will at least allow it to be inspected.
+ */
+ paddr += sizeof(*data);
+ len = data->len;
+ }
+ } else {
+ paddr += sizeof(*data);
+ len = data->len;
+ }
+
+ if (off > len) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (count > len - off)
+ count = len - off;
+
+ if (!count)
+ goto out;
+
+ ret = count;
+ p = memremap(paddr, len, MEMREMAP_WB);
+ if (!p) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ memcpy(buf, p + off, count);
+ memunmap(p);
+out:
+ memunmap(data);
+ return ret;
+}
+
+static struct kobj_attribute type_attr = __ATTR_RO(type);
+
+static struct bin_attribute data_attr __ro_after_init = {
+ .attr = {
+ .name = "data",
+ .mode = S_IRUGO,
+ },
+ .read = setup_data_data_read,
+};
+
+static struct attribute *setup_data_type_attrs[] = {
+ &type_attr.attr,
+ NULL,
+};
+
+static const struct bin_attribute *const setup_data_data_attrs[] = {
+ &data_attr,
+ NULL,
+};
+
+static const struct attribute_group setup_data_attr_group = {
+ .attrs = setup_data_type_attrs,
+ .bin_attrs = setup_data_data_attrs,
+};
+
+static int __init create_setup_data_node(struct kobject *parent,
+ struct kobject **kobjp, int nr)
+{
+ int ret = 0;
+ size_t size;
+ struct kobject *kobj;
+	char name[16]; /* should be enough for setup_data node numbers */
+
+	snprintf(name, 16, "%d", nr);
+
+ kobj = kobject_create_and_add(name, parent);
+ if (!kobj)
+ return -ENOMEM;
+
+ ret = get_setup_data_size(nr, &size);
+ if (ret)
+ goto out_kobj;
+
+ data_attr.size = size;
+ ret = sysfs_create_group(kobj, &setup_data_attr_group);
+ if (ret)
+ goto out_kobj;
+ *kobjp = kobj;
+
+ return 0;
+out_kobj:
+ kobject_put(kobj);
+ return ret;
+}
+
+static void __init cleanup_setup_data_node(struct kobject *kobj)
+{
+ sysfs_remove_group(kobj, &setup_data_attr_group);
+ kobject_put(kobj);
+}
+
+static int __init get_setup_data_total_num(u64 pa_data, int *nr)
+{
+ int ret = 0;
+ struct setup_data *data;
+
+ *nr = 0;
+ while (pa_data) {
+ *nr += 1;
+ data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
+ if (!data) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ pa_data = data->next;
+ memunmap(data);
+ }
+
+out:
+ return ret;
+}
+
+static int __init create_setup_data_nodes(struct kobject *parent)
+{
+ struct kobject *setup_data_kobj, **kobjp;
+ u64 pa_data;
+ int i, j, nr, ret = 0;
+
+ pa_data = boot_params.hdr.setup_data;
+ if (!pa_data)
+ return 0;
+
+ setup_data_kobj = kobject_create_and_add("setup_data", parent);
+ if (!setup_data_kobj) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = get_setup_data_total_num(pa_data, &nr);
+ if (ret)
+ goto out_setup_data_kobj;
+
+ kobjp = kmalloc_array(nr, sizeof(*kobjp), GFP_KERNEL);
+ if (!kobjp) {
+ ret = -ENOMEM;
+ goto out_setup_data_kobj;
+ }
+
+ for (i = 0; i < nr; i++) {
+ ret = create_setup_data_node(setup_data_kobj, kobjp + i, i);
+ if (ret)
+ goto out_clean_nodes;
+ }
+
+ kfree(kobjp);
+ return 0;
+
+out_clean_nodes:
+ for (j = i - 1; j >= 0; j--)
+ cleanup_setup_data_node(*(kobjp + j));
+ kfree(kobjp);
+out_setup_data_kobj:
+ kobject_put(setup_data_kobj);
+out:
+ return ret;
+}
+
+static int __init boot_params_ksysfs_init(void)
+{
+ int ret;
+ struct kobject *boot_params_kobj;
+
+ boot_params_kobj = kobject_create_and_add("boot_params",
+ kernel_kobj);
+ if (!boot_params_kobj) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = sysfs_create_group(boot_params_kobj, &boot_params_attr_group);
+ if (ret)
+ goto out_boot_params_kobj;
+
+ ret = create_setup_data_nodes(boot_params_kobj);
+ if (ret)
+ goto out_create_group;
+
+ return 0;
+out_create_group:
+ sysfs_remove_group(boot_params_kobj, &boot_params_attr_group);
+out_boot_params_kobj:
+ kobject_put(boot_params_kobj);
+out:
+ return ret;
+}
+
+arch_initcall(boot_params_ksysfs_init);
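The hierarchy registered above shows up to userspace as /sys/kernel/boot_params/version, the raw /sys/kernel/boot_params/data blob, and one numbered /sys/kernel/boot_params/setup_data/<N>/ directory per setup_data entry. For reference, a minimal, illustrative reader (not part of the patch) that prints the boot protocol version and each entry's type:

#include <dirent.h>
#include <stdio.h>

static void print_file(const char *path)
{
	char buf[64];
	FILE *f = fopen(path, "r");

	if (!f)
		return;
	if (fgets(buf, sizeof(buf), f))
		printf("%s: %s", path, buf);
	fclose(f);
}

int main(void)
{
	char path[128];
	struct dirent *de;
	DIR *d;

	/* Exposed by boot_params_ksysfs_init() above. */
	print_file("/sys/kernel/boot_params/version");

	/* One directory per setup_data list entry, named 0, 1, 2, ... */
	d = opendir("/sys/kernel/boot_params/setup_data");
	if (!d)
		return 0;
	while ((de = readdir(d))) {
		if (de->d_name[0] == '.')
			continue;
		snprintf(path, sizeof(path),
			 "/sys/kernel/boot_params/setup_data/%s/type", de->d_name);
		print_file(path);
	}
	closedir(d);
	return 0;
}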
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index c664d515f613..df78ddee0abb 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -1,49 +1,79 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* KVM paravirt_ops implementation
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
* Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
* Copyright IBM Corporation, 2007
* Authors: Anthony Liguori <aliguori@us.ibm.com>
*/
-#include <linux/module.h>
+#define pr_fmt(fmt) "kvm-guest: " fmt
+
+#include <linux/context_tracking.h>
+#include <linux/init.h>
+#include <linux/irq.h>
#include <linux/kernel.h>
#include <linux/kvm_para.h>
#include <linux/cpu.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hardirq.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
+#include <linux/hash.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/kprobes.h>
+#include <linux/nmi.h>
+#include <linux/swait.h>
+#include <linux/syscore_ops.h>
+#include <linux/cc_platform.h>
+#include <linux/efi.h>
+#include <linux/kvm_types.h>
#include <asm/timer.h>
+#include <asm/cpu.h>
+#include <asm/traps.h>
+#include <asm/desc.h>
+#include <asm/tlbflush.h>
+#include <asm/apic.h>
+#include <asm/apicdef.h>
+#include <asm/hypervisor.h>
+#include <asm/mtrr.h>
+#include <asm/tlb.h>
+#include <asm/cpuidle_haltpoll.h>
+#include <asm/msr.h>
+#include <asm/ptrace.h>
+#include <asm/reboot.h>
+#include <asm/svm.h>
+#include <asm/e820/api.h>
-#define MMU_QUEUE_SIZE 1024
+DEFINE_STATIC_KEY_FALSE_RO(kvm_async_pf_enabled);
-struct kvm_para_state {
- u8 mmu_queue[MMU_QUEUE_SIZE];
- int mmu_queue_len;
- enum paravirt_lazy_mode mode;
-};
+static int kvmapf = 1;
-static DEFINE_PER_CPU(struct kvm_para_state, para_state);
+static int __init parse_no_kvmapf(char *arg)
+{
+ kvmapf = 0;
+ return 0;
+}
+
+early_param("no-kvmapf", parse_no_kvmapf);
-static struct kvm_para_state *kvm_para_state(void)
+static int steal_acc = 1;
+static int __init parse_no_stealacc(char *arg)
{
- return &per_cpu(para_state, raw_smp_processor_id());
+ steal_acc = 0;
+ return 0;
}
+early_param("no-steal-acc", parse_no_stealacc);
+
+static DEFINE_PER_CPU_READ_MOSTLY(bool, async_pf_enabled);
+static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
+DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible;
+static int has_steal_clock = 0;
+
+static int has_guest_poll = 0;
/*
* No need for any "IO delay" on KVM
*/
@@ -51,195 +81,1099 @@ static void kvm_io_delay(void)
{
}
-static void kvm_mmu_op(void *buffer, unsigned len)
+#define KVM_TASK_SLEEP_HASHBITS 8
+#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)
+
+struct kvm_task_sleep_node {
+ struct hlist_node link;
+ struct swait_queue_head wq;
+ u32 token;
+ int cpu;
+};
+
+static struct kvm_task_sleep_head {
+ raw_spinlock_t lock;
+ struct hlist_head list;
+} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
+
+static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
+ u32 token)
{
- int r;
- unsigned long a1, a2;
+ struct hlist_node *p;
- do {
- a1 = __pa(buffer);
- a2 = 0; /* on i386 __pa() always returns <4G */
- r = kvm_hypercall3(KVM_HC_MMU_OP, len, a1, a2);
- buffer += r;
- len -= r;
- } while (len);
+ hlist_for_each(p, &b->list) {
+ struct kvm_task_sleep_node *n =
+ hlist_entry(p, typeof(*n), link);
+ if (n->token == token)
+ return n;
+ }
+
+ return NULL;
}
-static void mmu_queue_flush(struct kvm_para_state *state)
+static bool kvm_async_pf_queue_task(u32 token, struct kvm_task_sleep_node *n)
{
- if (state->mmu_queue_len) {
- kvm_mmu_op(state->mmu_queue, state->mmu_queue_len);
- state->mmu_queue_len = 0;
+ u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
+ struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
+ struct kvm_task_sleep_node *e;
+
+ raw_spin_lock(&b->lock);
+ e = _find_apf_task(b, token);
+ if (e) {
+		/* dummy entry exists -> wake-up was delivered ahead of PF */
+ hlist_del(&e->link);
+ raw_spin_unlock(&b->lock);
+ kfree(e);
+ return false;
}
+
+ n->token = token;
+ n->cpu = smp_processor_id();
+ init_swait_queue_head(&n->wq);
+ hlist_add_head(&n->link, &b->list);
+ raw_spin_unlock(&b->lock);
+ return true;
}
-static void kvm_deferred_mmu_op(void *buffer, int len)
+/*
+ * kvm_async_pf_task_wait_schedule - Wait for pagefault to be handled
+ * @token: Token to identify the sleep node entry
+ *
+ * Invoked from the async pagefault handling code or from the VM exit page
+ * fault handler. In both cases RCU is watching.
+ */
+void kvm_async_pf_task_wait_schedule(u32 token)
{
- struct kvm_para_state *state = kvm_para_state();
+ struct kvm_task_sleep_node n;
+ DECLARE_SWAITQUEUE(wait);
- if (state->mode != PARAVIRT_LAZY_MMU) {
- kvm_mmu_op(buffer, len);
+ lockdep_assert_irqs_disabled();
+
+ if (!kvm_async_pf_queue_task(token, &n))
return;
+
+ for (;;) {
+ prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
+ if (hlist_unhashed(&n.link))
+ break;
+
+ local_irq_enable();
+ schedule();
+ local_irq_disable();
}
- if (state->mmu_queue_len + len > sizeof state->mmu_queue)
- mmu_queue_flush(state);
- memcpy(state->mmu_queue + state->mmu_queue_len, buffer, len);
- state->mmu_queue_len += len;
+ finish_swait(&n.wq, &wait);
}
+EXPORT_SYMBOL_FOR_KVM(kvm_async_pf_task_wait_schedule);
-static void kvm_mmu_write(void *dest, u64 val)
+static void apf_task_wake_one(struct kvm_task_sleep_node *n)
{
- __u64 pte_phys;
- struct kvm_mmu_op_write_pte wpte;
+ hlist_del_init(&n->link);
+ if (swq_has_sleeper(&n->wq))
+ swake_up_one(&n->wq);
+}
-#ifdef CONFIG_HIGHPTE
- struct page *page;
- unsigned long dst = (unsigned long) dest;
+static void apf_task_wake_all(void)
+{
+ int i;
- page = kmap_atomic_to_page(dest);
- pte_phys = page_to_pfn(page);
- pte_phys <<= PAGE_SHIFT;
- pte_phys += (dst & ~(PAGE_MASK));
-#else
- pte_phys = (unsigned long)__pa(dest);
+ for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
+ struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
+ struct kvm_task_sleep_node *n;
+ struct hlist_node *p, *next;
+
+ raw_spin_lock(&b->lock);
+ hlist_for_each_safe(p, next, &b->list) {
+ n = hlist_entry(p, typeof(*n), link);
+ if (n->cpu == smp_processor_id())
+ apf_task_wake_one(n);
+ }
+ raw_spin_unlock(&b->lock);
+ }
+}
+
+static void kvm_async_pf_task_wake(u32 token)
+{
+ u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
+ struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
+ struct kvm_task_sleep_node *n, *dummy = NULL;
+
+ if (token == ~0) {
+ apf_task_wake_all();
+ return;
+ }
+
+again:
+ raw_spin_lock(&b->lock);
+ n = _find_apf_task(b, token);
+ if (!n) {
+ /*
+ * Async #PF not yet handled, add a dummy entry for the token.
+		 * Allocating the token must be done outside of the raw lock
+ * as the allocator is preemptible on PREEMPT_RT kernels.
+ */
+ if (!dummy) {
+ raw_spin_unlock(&b->lock);
+ dummy = kzalloc(sizeof(*dummy), GFP_ATOMIC);
+
+ /*
+ * Continue looping on allocation failure, eventually
+ * the async #PF will be handled and allocating a new
+ * node will be unnecessary.
+ */
+ if (!dummy)
+ cpu_relax();
+
+ /*
+ * Recheck for async #PF completion before enqueueing
+ * the dummy token to avoid duplicate list entries.
+ */
+ goto again;
+ }
+ dummy->token = token;
+ dummy->cpu = smp_processor_id();
+ init_swait_queue_head(&dummy->wq);
+ hlist_add_head(&dummy->link, &b->list);
+ dummy = NULL;
+ } else {
+ apf_task_wake_one(n);
+ }
+ raw_spin_unlock(&b->lock);
+
+ /* A dummy token might be allocated and ultimately not used. */
+ kfree(dummy);
+}
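The wait/wake pair above is a hashed sleeper table: a 32-bit token hashes into one of 256 buckets, each with its own lock and list, and a wake-up that arrives before the waiter leaves a dummy node behind so the late waiter can detect it. Below is a simplified userspace analogue of the bucket lookup/insert step only (illustrative; the multiplicative hash and pthread mutex stand in for hash_32() and the raw spinlocks, and the GCC range initializer sets up the locks):

#include <pthread.h>
#include <stdlib.h>

#define NBUCKETS 256				/* mirrors KVM_TASK_SLEEP_HASHSIZE */

struct node {
	struct node *next;
	unsigned int token;
};

static struct {
	pthread_mutex_t lock;
	struct node *head;
} bucket[NBUCKETS] = { [0 ... NBUCKETS - 1] = { PTHREAD_MUTEX_INITIALIZER, NULL } };

static unsigned int hash_token(unsigned int token)
{
	return (token * 2654435761u) >> 24;	/* cheap stand-in for hash_32(token, 8) */
}

/* Find the node for @token in its bucket, or insert a "dummy" if absent. */
struct node *find_or_add(unsigned int token)
{
	unsigned int h = hash_token(token);
	struct node *n;

	pthread_mutex_lock(&bucket[h].lock);
	for (n = bucket[h].head; n; n = n->next)
		if (n->token == token)
			break;
	if (!n) {
		n = calloc(1, sizeof(*n));
		if (n) {
			n->token = token;
			n->next = bucket[h].head;
			bucket[h].head = n;
		}
	}
	pthread_mutex_unlock(&bucket[h].lock);
	return n;
}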
+
+noinstr u32 kvm_read_and_reset_apf_flags(void)
+{
+ u32 flags = 0;
+
+ if (__this_cpu_read(async_pf_enabled)) {
+ flags = __this_cpu_read(apf_reason.flags);
+ __this_cpu_write(apf_reason.flags, 0);
+ }
+
+ return flags;
+}
+EXPORT_SYMBOL_FOR_KVM(kvm_read_and_reset_apf_flags);
+
+noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
+{
+ u32 flags = kvm_read_and_reset_apf_flags();
+ irqentry_state_t state;
+
+ if (!flags)
+ return false;
+
+ state = irqentry_enter(regs);
+ instrumentation_begin();
+
+ /*
+ * If the host managed to inject an async #PF into an interrupt
+ * disabled region, then die hard as this is not going to end well
+ * and the host side is seriously broken.
+ */
+ if (unlikely(!(regs->flags & X86_EFLAGS_IF)))
+ panic("Host injected async #PF in interrupt disabled region\n");
+
+ if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
+ if (unlikely(!(user_mode(regs))))
+ panic("Host injected async #PF in kernel mode\n");
+ /* Page is swapped out by the host. */
+ kvm_async_pf_task_wait_schedule(token);
+ } else {
+ WARN_ONCE(1, "Unexpected async PF flags: %x\n", flags);
+ }
+
+ instrumentation_end();
+ irqentry_exit(regs, state);
+ return true;
+}
+
+DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+ u32 token;
+
+ apic_eoi();
+
+ inc_irq_stat(irq_hv_callback_count);
+
+ if (__this_cpu_read(async_pf_enabled)) {
+ token = __this_cpu_read(apf_reason.token);
+ kvm_async_pf_task_wake(token);
+ __this_cpu_write(apf_reason.token, 0);
+ wrmsrq(MSR_KVM_ASYNC_PF_ACK, 1);
+ }
+
+ set_irq_regs(old_regs);
+}
+
+static void __init paravirt_ops_setup(void)
+{
+ pv_info.name = "KVM";
+
+ if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
+ pv_ops.cpu.io_delay = kvm_io_delay;
+
+#ifdef CONFIG_X86_IO_APIC
+ no_timer_check = 1;
#endif
- wpte.header.op = KVM_MMU_OP_WRITE_PTE;
- wpte.pte_val = val;
- wpte.pte_phys = pte_phys;
+}
+
+static void kvm_register_steal_time(void)
+{
+ int cpu = smp_processor_id();
+ struct kvm_steal_time *st = &per_cpu(steal_time, cpu);
+
+ if (!has_steal_clock)
+ return;
+
+ wrmsrq(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
+ pr_debug("stealtime: cpu %d, msr %llx\n", cpu,
+ (unsigned long long) slow_virt_to_phys(st));
+}
+
+static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
+
+static notrace __maybe_unused void kvm_guest_apic_eoi_write(void)
+{
+ /**
+ * This relies on __test_and_clear_bit to modify the memory
+ * in a way that is atomic with respect to the local CPU.
+ * The hypervisor only accesses this memory from the local CPU so
+ * there's no need for lock or memory barriers.
+ * An optimization barrier is implied in apic write.
+ */
+ if (__test_and_clear_bit(KVM_PV_EOI_BIT, this_cpu_ptr(&kvm_apic_eoi)))
+ return;
+ apic_native_eoi();
+}
- kvm_deferred_mmu_op(&wpte, sizeof wpte);
+static void kvm_guest_cpu_init(void)
+{
+ if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) {
+ u64 pa;
+
+ WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled));
+
+ pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
+ pa |= KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;
+
+ if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT))
+ pa |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
+
+ wrmsrq(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR);
+
+ wrmsrq(MSR_KVM_ASYNC_PF_EN, pa);
+ __this_cpu_write(async_pf_enabled, true);
+ pr_debug("setup async PF for cpu %d\n", smp_processor_id());
+ }
+
+ if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
+ unsigned long pa;
+
+ /* Size alignment is implied but just to make it explicit. */
+ BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4);
+ __this_cpu_write(kvm_apic_eoi, 0);
+ pa = slow_virt_to_phys(this_cpu_ptr(&kvm_apic_eoi))
+ | KVM_MSR_ENABLED;
+ wrmsrq(MSR_KVM_PV_EOI_EN, pa);
+ }
+
+ if (has_steal_clock)
+ kvm_register_steal_time();
+}
+
+static void kvm_pv_disable_apf(void)
+{
+ if (!__this_cpu_read(async_pf_enabled))
+ return;
+
+ wrmsrq(MSR_KVM_ASYNC_PF_EN, 0);
+ __this_cpu_write(async_pf_enabled, false);
+
+ pr_debug("disable async PF for cpu %d\n", smp_processor_id());
+}
+
+static void kvm_disable_steal_time(void)
+{
+ if (!has_steal_clock)
+ return;
+
+ wrmsrq(MSR_KVM_STEAL_TIME, 0);
+}
+
+static u64 kvm_steal_clock(int cpu)
+{
+ u64 steal;
+ struct kvm_steal_time *src;
+ int version;
+
+ src = &per_cpu(steal_time, cpu);
+ do {
+ version = src->version;
+ virt_rmb();
+ steal = src->steal;
+ virt_rmb();
+ } while ((version & 1) || (version != src->version));
+
+ return steal;
+}
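The retry loop above is the pvclock/steal_time version protocol: the producer makes the version odd while it is updating and even again when it is done, so the consumer simply retries whenever it sees an odd or changed version. A standalone sketch of both sides with C11 atomics (illustrative only; the real guest side relies on virt_rmb(), and the producer here plays the hypervisor's role):

#include <stdatomic.h>
#include <stdint.h>

struct shared {
	_Atomic uint32_t version;	/* odd => update in progress */
	_Atomic uint64_t value;
};

/* Producer: version goes odd, data is written, version goes even. */
void shared_write(struct shared *s, uint64_t v)
{
	uint32_t ver = atomic_load_explicit(&s->version, memory_order_relaxed);

	atomic_store_explicit(&s->version, ver + 1, memory_order_release);
	atomic_store_explicit(&s->value, v, memory_order_release);
	atomic_store_explicit(&s->version, ver + 2, memory_order_release);
}

/* Consumer: same retry rule as kvm_steal_clock() above. */
uint64_t shared_read(struct shared *s)
{
	uint32_t ver;
	uint64_t v;

	do {
		ver = atomic_load_explicit(&s->version, memory_order_acquire);
		v = atomic_load_explicit(&s->value, memory_order_acquire);
	} while ((ver & 1) ||
		 ver != atomic_load_explicit(&s->version, memory_order_acquire));

	return v;
}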
+
+static inline __init void __set_percpu_decrypted(void *ptr, unsigned long size)
+{
+ early_set_memory_decrypted((unsigned long) ptr, size);
}
/*
- * We only need to hook operations that are MMU writes. We hook these so that
- * we can use lazy MMU mode to batch these operations. We could probably
- * improve the performance of the host code if we used some of the information
- * here to simplify processing of batched writes.
+ * Iterate through all possible CPUs and map the memory region pointed
+ * by apf_reason, steal_time and kvm_apic_eoi as decrypted at once.
+ *
+ * Note: we iterate through all possible CPUs to ensure that CPUs
+ * hotplugged will have their per-cpu variable already mapped as
+ * decrypted.
*/
-static void kvm_set_pte(pte_t *ptep, pte_t pte)
+static void __init sev_map_percpu_data(void)
{
- kvm_mmu_write(ptep, pte_val(pte));
+ int cpu;
+
+ if (cc_vendor != CC_VENDOR_AMD ||
+ !cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
+ return;
+
+ for_each_possible_cpu(cpu) {
+ __set_percpu_decrypted(&per_cpu(apf_reason, cpu), sizeof(apf_reason));
+ __set_percpu_decrypted(&per_cpu(steal_time, cpu), sizeof(steal_time));
+ __set_percpu_decrypted(&per_cpu(kvm_apic_eoi, cpu), sizeof(kvm_apic_eoi));
+ }
}
-static void kvm_set_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pte)
+static void kvm_guest_cpu_offline(bool shutdown)
{
- kvm_mmu_write(ptep, pte_val(pte));
+ kvm_disable_steal_time();
+ if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
+ wrmsrq(MSR_KVM_PV_EOI_EN, 0);
+ if (kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL))
+ wrmsrq(MSR_KVM_MIGRATION_CONTROL, 0);
+ kvm_pv_disable_apf();
+ if (!shutdown)
+ apf_task_wake_all();
+ kvmclock_disable();
}
-static void kvm_set_pmd(pmd_t *pmdp, pmd_t pmd)
+static int kvm_cpu_online(unsigned int cpu)
{
- kvm_mmu_write(pmdp, pmd_val(pmd));
+ unsigned long flags;
+
+ local_irq_save(flags);
+ kvm_guest_cpu_init();
+ local_irq_restore(flags);
+ return 0;
}
-#if PAGETABLE_LEVELS >= 3
-#ifdef CONFIG_X86_PAE
-static void kvm_set_pte_atomic(pte_t *ptep, pte_t pte)
+#ifdef CONFIG_SMP
+
+static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask);
+
+static bool pv_tlb_flush_supported(void)
{
- kvm_mmu_write(ptep, pte_val(pte));
+ return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
+ !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
+ kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) &&
+ !boot_cpu_has(X86_FEATURE_MWAIT) &&
+ (num_possible_cpus() != 1));
}
-static void kvm_pte_clear(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep)
+static bool pv_ipi_supported(void)
{
- kvm_mmu_write(ptep, 0);
+ return (kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI) &&
+ (num_possible_cpus() != 1));
}
-static void kvm_pmd_clear(pmd_t *pmdp)
+static bool pv_sched_yield_supported(void)
{
- kvm_mmu_write(pmdp, 0);
+ return (kvm_para_has_feature(KVM_FEATURE_PV_SCHED_YIELD) &&
+ !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
+ kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) &&
+ !boot_cpu_has(X86_FEATURE_MWAIT) &&
+ (num_possible_cpus() != 1));
}
+
+#define KVM_IPI_CLUSTER_SIZE (2 * BITS_PER_LONG)
+
+static void __send_ipi_mask(const struct cpumask *mask, int vector)
+{
+ unsigned long flags;
+ int cpu, min = 0, max = 0;
+#ifdef CONFIG_X86_64
+ __uint128_t ipi_bitmap = 0;
+#else
+ u64 ipi_bitmap = 0;
#endif
+ u32 apic_id, icr;
+ long ret;
-static void kvm_set_pud(pud_t *pudp, pud_t pud)
+ if (cpumask_empty(mask))
+ return;
+
+ local_irq_save(flags);
+
+ switch (vector) {
+ default:
+ icr = APIC_DM_FIXED | vector;
+ break;
+ case NMI_VECTOR:
+ icr = APIC_DM_NMI;
+ break;
+ }
+
+ for_each_cpu(cpu, mask) {
+ apic_id = per_cpu(x86_cpu_to_apicid, cpu);
+ if (!ipi_bitmap) {
+ min = max = apic_id;
+ } else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) {
+ ipi_bitmap <<= min - apic_id;
+ min = apic_id;
+ } else if (apic_id > min && apic_id < min + KVM_IPI_CLUSTER_SIZE) {
+ max = apic_id < max ? max : apic_id;
+ } else {
+ ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
+ (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
+ WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld",
+ ret);
+ min = max = apic_id;
+ ipi_bitmap = 0;
+ }
+ __set_bit(apic_id - min, (unsigned long *)&ipi_bitmap);
+ }
+
+ if (ipi_bitmap) {
+ ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
+ (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
+ WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld",
+ ret);
+ }
+
+ local_irq_restore(flags);
+}
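The loop above packs destination APIC IDs into a bitmap whose bit 0 corresponds to the lowest APIC ID of the current cluster and issues one KVM_HC_SEND_IPI hypercall per KVM_IPI_CLUSTER_SIZE window. A simplified sketch of the same packing, assuming the IDs are visited in ascending order (the real code also re-bases when a lower ID shows up); the 128-bit type mirrors the __uint128_t used above and send_cluster() is a stand-in for the hypercall:

#include <stdint.h>
#include <stdio.h>

#define CLUSTER_SIZE 128			/* 2 * BITS_PER_LONG on x86-64 */

/* Stand-in for kvm_hypercall4(KVM_HC_SEND_IPI, ...). */
void send_cluster(uint32_t base, unsigned __int128 bitmap)
{
	printf("IPI cluster: base apic id %u, bitmap %#llx:%#llx\n", base,
	       (unsigned long long)(bitmap >> 64), (unsigned long long)bitmap);
}

void send_ipi_ids(const uint32_t *ids, int n)
{
	unsigned __int128 bitmap = 0;
	uint32_t base = 0;

	for (int i = 0; i < n; i++) {
		if (!bitmap) {
			base = ids[i];
		} else if (ids[i] - base >= CLUSTER_SIZE) {
			send_cluster(base, bitmap);	/* flush the full window */
			bitmap = 0;
			base = ids[i];
		}
		bitmap |= (unsigned __int128)1 << (ids[i] - base);
	}
	if (bitmap)
		send_cluster(base, bitmap);
}

int main(void)
{
	const uint32_t ids[] = { 1, 5, 64, 300, 301 };

	send_ipi_ids(ids, 5);
	return 0;
}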
+
+static void kvm_send_ipi_mask(const struct cpumask *mask, int vector)
{
- kvm_mmu_write(pudp, pud_val(pud));
+ __send_ipi_mask(mask, vector);
}
-#if PAGETABLE_LEVELS == 4
-static void kvm_set_pgd(pgd_t *pgdp, pgd_t pgd)
+static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector)
{
- kvm_mmu_write(pgdp, pgd_val(pgd));
+ unsigned int this_cpu = smp_processor_id();
+ struct cpumask *new_mask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
+ const struct cpumask *local_mask;
+
+ cpumask_copy(new_mask, mask);
+ cpumask_clear_cpu(this_cpu, new_mask);
+ local_mask = new_mask;
+ __send_ipi_mask(local_mask, vector);
}
+
+static int __init setup_efi_kvm_sev_migration(void)
+{
+ efi_char16_t efi_sev_live_migration_enabled[] = L"SevLiveMigrationEnabled";
+ efi_guid_t efi_variable_guid = AMD_SEV_MEM_ENCRYPT_GUID;
+ efi_status_t status;
+ unsigned long size;
+ bool enabled;
+
+ if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) ||
+ !kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL))
+ return 0;
+
+ if (!efi_enabled(EFI_BOOT))
+ return 0;
+
+ if (!efi_enabled(EFI_RUNTIME_SERVICES)) {
+ pr_info("%s : EFI runtime services are not enabled\n", __func__);
+ return 0;
+ }
+
+ size = sizeof(enabled);
+
+ /* Get variable contents into buffer */
+ status = efi.get_variable(efi_sev_live_migration_enabled,
+ &efi_variable_guid, NULL, &size, &enabled);
+
+ if (status == EFI_NOT_FOUND) {
+ pr_info("%s : EFI live migration variable not found\n", __func__);
+ return 0;
+ }
+
+ if (status != EFI_SUCCESS) {
+ pr_info("%s : EFI variable retrieval failed\n", __func__);
+ return 0;
+ }
+
+ if (enabled == 0) {
+ pr_info("%s: live migration disabled in EFI\n", __func__);
+ return 0;
+ }
+
+ pr_info("%s : live migration enabled in EFI\n", __func__);
+ wrmsrq(MSR_KVM_MIGRATION_CONTROL, KVM_MIGRATION_READY);
+
+ return 1;
+}
+
+late_initcall(setup_efi_kvm_sev_migration);
+
+/*
+ * Set the IPI entry points
+ */
+static __init void kvm_setup_pv_ipi(void)
+{
+ apic_update_callback(send_IPI_mask, kvm_send_ipi_mask);
+ apic_update_callback(send_IPI_mask_allbutself, kvm_send_ipi_mask_allbutself);
+ pr_info("setup PV IPIs\n");
+}
+
+static void kvm_smp_send_call_func_ipi(const struct cpumask *mask)
+{
+ int cpu;
+
+ native_send_call_func_ipi(mask);
+
+ /* Make sure other vCPUs get a chance to run if they need to. */
+ for_each_cpu(cpu, mask) {
+ if (!idle_cpu(cpu) && vcpu_is_preempted(cpu)) {
+ kvm_hypercall1(KVM_HC_SCHED_YIELD, per_cpu(x86_cpu_to_apicid, cpu));
+ break;
+ }
+ }
+}
+
+static void kvm_flush_tlb_multi(const struct cpumask *cpumask,
+ const struct flush_tlb_info *info)
+{
+ u8 state;
+ int cpu;
+ struct kvm_steal_time *src;
+ struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
+
+ cpumask_copy(flushmask, cpumask);
+ /*
+	 * We have to flush only online vCPUs, and queue flush_on_enter
+	 * for preempted vCPUs.
+ */
+ for_each_cpu(cpu, flushmask) {
+ /*
+ * The local vCPU is never preempted, so we do not explicitly
+		 * skip the check for the local vCPU - it will never be cleared from
+ * flushmask.
+ */
+ src = &per_cpu(steal_time, cpu);
+ state = READ_ONCE(src->preempted);
+ if ((state & KVM_VCPU_PREEMPTED)) {
+ if (try_cmpxchg(&src->preempted, &state,
+ state | KVM_VCPU_FLUSH_TLB))
+ __cpumask_clear_cpu(cpu, flushmask);
+ }
+ }
+
+ native_flush_tlb_multi(flushmask, info);
+}
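The flush path above piggybacks on steal_time.preempted: if a destination vCPU is currently preempted, the guest atomically sets KVM_VCPU_FLUSH_TLB instead of IPIing it and lets the hypervisor flush on the next VM-entry. The compare-and-exchange matters because the bit may only be set while the vCPU is still marked preempted. A standalone sketch of that update rule with C11 atomics (illustrative; not the kernel's try_cmpxchg() and hypothetical flag names):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define VCPU_PREEMPTED	(1 << 0)
#define VCPU_FLUSH_TLB	(1 << 1)

/*
 * Ask the hypervisor to flush on re-entry, but only if the vCPU is still
 * preempted; if it started running again, a real flush IPI is needed.
 */
bool defer_flush_if_preempted(_Atomic uint8_t *preempted)
{
	uint8_t state = atomic_load_explicit(preempted, memory_order_acquire);

	while (state & VCPU_PREEMPTED) {
		if (atomic_compare_exchange_weak_explicit(preempted, &state,
							  state | VCPU_FLUSH_TLB,
							  memory_order_acq_rel,
							  memory_order_acquire))
			return true;	/* flush deferred, skip the IPI */
	}
	return false;			/* vCPU is (or became) running */
}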
+
+static __init int kvm_alloc_cpumask(void)
+{
+ int cpu;
+
+ if (!kvm_para_available() || nopv)
+ return 0;
+
+ if (pv_tlb_flush_supported() || pv_ipi_supported())
+ for_each_possible_cpu(cpu) {
+ zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu),
+ GFP_KERNEL, cpu_to_node(cpu));
+ }
+
+ return 0;
+}
+arch_initcall(kvm_alloc_cpumask);
+
+static void __init kvm_smp_prepare_boot_cpu(void)
+{
+ /*
+ * Map the per-cpu variables as decrypted before kvm_guest_cpu_init()
+ * shares the guest physical address with the hypervisor.
+ */
+ sev_map_percpu_data();
+
+ kvm_guest_cpu_init();
+ native_smp_prepare_boot_cpu();
+ kvm_spinlock_init();
+}
+
+static int kvm_cpu_down_prepare(unsigned int cpu)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ kvm_guest_cpu_offline(false);
+ local_irq_restore(flags);
+ return 0;
+}
+
#endif
-#endif /* PAGETABLE_LEVELS >= 3 */
-static void kvm_flush_tlb(void)
+static int kvm_suspend(void *data)
{
- struct kvm_mmu_op_flush_tlb ftlb = {
- .header.op = KVM_MMU_OP_FLUSH_TLB,
- };
+ u64 val = 0;
- kvm_deferred_mmu_op(&ftlb, sizeof ftlb);
+ kvm_guest_cpu_offline(false);
+
+#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
+ if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
+ rdmsrq(MSR_KVM_POLL_CONTROL, val);
+ has_guest_poll = !(val & 1);
+#endif
+ return 0;
}
-static void kvm_release_pt(unsigned long pfn)
+static void kvm_resume(void *data)
{
- struct kvm_mmu_op_release_pt rpt = {
- .header.op = KVM_MMU_OP_RELEASE_PT,
- .pt_phys = (u64)pfn << PAGE_SHIFT,
- };
+ kvm_cpu_online(raw_smp_processor_id());
- kvm_mmu_op(&rpt, sizeof rpt);
+#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
+ if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL) && has_guest_poll)
+ wrmsrq(MSR_KVM_POLL_CONTROL, 0);
+#endif
}
-static void kvm_enter_lazy_mmu(void)
+static const struct syscore_ops kvm_syscore_ops = {
+ .suspend = kvm_suspend,
+ .resume = kvm_resume,
+};
+
+static struct syscore kvm_syscore = {
+ .ops = &kvm_syscore_ops,
+};
+
+static void kvm_pv_guest_cpu_reboot(void *unused)
{
- struct kvm_para_state *state = kvm_para_state();
+ kvm_guest_cpu_offline(true);
+}
- paravirt_enter_lazy_mmu();
- state->mode = paravirt_get_lazy_mode();
+static int kvm_pv_reboot_notify(struct notifier_block *nb,
+ unsigned long code, void *unused)
+{
+ if (code == SYS_RESTART)
+ on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1);
+ return NOTIFY_DONE;
}
-static void kvm_leave_lazy_mmu(void)
+static struct notifier_block kvm_pv_reboot_nb = {
+ .notifier_call = kvm_pv_reboot_notify,
+};
+
+/*
+ * After a PV feature is registered, the host will keep writing to the
+ * registered memory location. If the guest happens to shutdown, this memory
+ * won't be valid. In cases like kexec, in which you install a new kernel, this
+ * means a random memory location will keep being written to.
+ */
+#ifdef CONFIG_CRASH_DUMP
+static void kvm_crash_shutdown(struct pt_regs *regs)
+{
+ kvm_guest_cpu_offline(true);
+ native_machine_crash_shutdown(regs);
+}
+#endif
+
+#if defined(CONFIG_X86_32) || !defined(CONFIG_SMP)
+bool __kvm_vcpu_is_preempted(long cpu);
+
+__visible bool __kvm_vcpu_is_preempted(long cpu)
{
- struct kvm_para_state *state = kvm_para_state();
+ struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
- mmu_queue_flush(state);
- paravirt_leave_lazy_mmu();
- state->mode = paravirt_get_lazy_mode();
+ return !!(src->preempted & KVM_VCPU_PREEMPTED);
}
+PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
-static void __init paravirt_ops_setup(void)
+#else
+
+#include <asm/asm-offsets.h>
+
+extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);
+
+/*
+ * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and
+ * restoring to/from the stack.
+ */
+#define PV_VCPU_PREEMPTED_ASM \
+ "movq __per_cpu_offset(,%rdi,8), %rax\n\t" \
+ "cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax)\n\t" \
+ "setne %al\n\t"
+
+DEFINE_ASM_FUNC(__raw_callee_save___kvm_vcpu_is_preempted,
+ PV_VCPU_PREEMPTED_ASM, .text);
+#endif
+
+static void __init kvm_guest_init(void)
{
- pv_info.name = "KVM";
- pv_info.paravirt_enabled = 1;
+ int i;
- if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
- pv_cpu_ops.io_delay = kvm_io_delay;
-
- if (kvm_para_has_feature(KVM_FEATURE_MMU_OP)) {
- pv_mmu_ops.set_pte = kvm_set_pte;
- pv_mmu_ops.set_pte_at = kvm_set_pte_at;
- pv_mmu_ops.set_pmd = kvm_set_pmd;
-#if PAGETABLE_LEVELS >= 3
-#ifdef CONFIG_X86_PAE
- pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic;
- pv_mmu_ops.pte_clear = kvm_pte_clear;
- pv_mmu_ops.pmd_clear = kvm_pmd_clear;
+ paravirt_ops_setup();
+ register_reboot_notifier(&kvm_pv_reboot_nb);
+ for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
+ raw_spin_lock_init(&async_pf_sleepers[i].lock);
+
+ if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
+ has_steal_clock = 1;
+ static_call_update(pv_steal_clock, kvm_steal_clock);
+
+ pv_ops.lock.vcpu_is_preempted =
+ PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
+ }
+
+ if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
+ apic_update_callback(eoi, kvm_guest_apic_eoi_write);
+
+ if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) {
+ static_branch_enable(&kvm_async_pf_enabled);
+ sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_kvm_asyncpf_interrupt);
+ }
+
+#ifdef CONFIG_SMP
+ if (pv_tlb_flush_supported()) {
+ pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi;
+ pr_info("KVM setup pv remote TLB flush\n");
+ }
+
+ smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
+ if (pv_sched_yield_supported()) {
+ smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi;
+ pr_info("setup PV sched yield\n");
+ }
+ if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online",
+ kvm_cpu_online, kvm_cpu_down_prepare) < 0)
+ pr_err("failed to install cpu hotplug callbacks\n");
+#else
+ sev_map_percpu_data();
+ kvm_guest_cpu_init();
#endif
- pv_mmu_ops.set_pud = kvm_set_pud;
-#if PAGETABLE_LEVELS == 4
- pv_mmu_ops.set_pgd = kvm_set_pgd;
+
+#ifdef CONFIG_CRASH_DUMP
+ machine_ops.crash_shutdown = kvm_crash_shutdown;
#endif
+
+ register_syscore(&kvm_syscore);
+
+ /*
+ * Hard lockup detection is enabled by default. Disable it, as guests
+ * can get false positives too easily, for example if the host is
+ * overcommitted.
+ */
+ hardlockup_detector_disable();
+}
+
+static noinline uint32_t __kvm_cpuid_base(void)
+{
+ if (boot_cpu_data.cpuid_level < 0)
+ return 0; /* So we don't blow up on old processors */
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return cpuid_base_hypervisor(KVM_SIGNATURE, 0);
+
+ return 0;
+}
+
+static inline uint32_t kvm_cpuid_base(void)
+{
+ static int kvm_cpuid_base = -1;
+
+ if (kvm_cpuid_base == -1)
+ kvm_cpuid_base = __kvm_cpuid_base();
+
+ return kvm_cpuid_base;
+}
+
+bool kvm_para_available(void)
+{
+ return kvm_cpuid_base() != 0;
+}
+EXPORT_SYMBOL_GPL(kvm_para_available);
+
+unsigned int kvm_arch_para_features(void)
+{
+ return cpuid_eax(kvm_cpuid_base() | KVM_CPUID_FEATURES);
+}
+
+unsigned int kvm_arch_para_hints(void)
+{
+ return cpuid_edx(kvm_cpuid_base() | KVM_CPUID_FEATURES);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_para_hints);
+
+static uint32_t __init kvm_detect(void)
+{
+ return kvm_cpuid_base();
+}
+
+static void __init kvm_apic_init(void)
+{
+#ifdef CONFIG_SMP
+ if (pv_ipi_supported())
+ kvm_setup_pv_ipi();
#endif
- pv_mmu_ops.flush_tlb_user = kvm_flush_tlb;
- pv_mmu_ops.release_pte = kvm_release_pt;
- pv_mmu_ops.release_pmd = kvm_release_pt;
- pv_mmu_ops.release_pud = kvm_release_pt;
+}
+
+static bool __init kvm_msi_ext_dest_id(void)
+{
+ return kvm_para_has_feature(KVM_FEATURE_MSI_EXT_DEST_ID);
+}
+
+static void kvm_sev_hc_page_enc_status(unsigned long pfn, int npages, bool enc)
+{
+ kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, pfn << PAGE_SHIFT, npages,
+ KVM_MAP_GPA_RANGE_ENC_STAT(enc) | KVM_MAP_GPA_RANGE_PAGE_SZ_4K);
+}
+
+static void __init kvm_init_platform(void)
+{
+ u64 tolud = PFN_PHYS(e820__end_of_low_ram_pfn());
+ /*
+ * Note, hardware requires variable MTRR ranges to be power-of-2 sized
+ * and naturally aligned. But when forcing guest MTRR state, Linux
+ * doesn't program the forced ranges into hardware. Don't bother doing
+ * the math to generate a technically-legal range.
+ */
+ struct mtrr_var_range pci_hole = {
+ .base_lo = tolud | X86_MEMTYPE_UC,
+ .mask_lo = (u32)(~(SZ_4G - tolud - 1)) | MTRR_PHYSMASK_V,
+ .mask_hi = (BIT_ULL(boot_cpu_data.x86_phys_bits) - 1) >> 32,
+ };
+
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) &&
+ kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) {
+ unsigned long nr_pages;
+ int i;
+
+ pv_ops.mmu.notify_page_enc_status_changed =
+ kvm_sev_hc_page_enc_status;
+
+ /*
+ * Reset the host's shared pages list related to kernel
+ * specific page encryption status settings before we load a
+ * new kernel by kexec. Reset the page encryption status
+ * during early boot instead of just before kexec to avoid SMP
+ * races during kvm_pv_guest_cpu_reboot().
+ * NOTE: We cannot reset the complete shared pages list
+ * here as we need to retain the UEFI/OVMF firmware
+ * specific settings.
+ */
+
+ for (i = 0; i < e820_table->nr_entries; i++) {
+ struct e820_entry *entry = &e820_table->entries[i];
+
+ if (entry->type != E820_TYPE_RAM)
+ continue;
+
+ nr_pages = DIV_ROUND_UP(entry->size, PAGE_SIZE);
+
+ kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, entry->addr,
+ nr_pages,
+ KVM_MAP_GPA_RANGE_ENCRYPTED | KVM_MAP_GPA_RANGE_PAGE_SZ_4K);
+ }
- pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu;
- pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu;
+ /*
+ * Ensure that _bss_decrypted section is marked as decrypted in the
+ * shared pages list.
+ */
+ early_set_mem_enc_dec_hypercall((unsigned long)__start_bss_decrypted,
+ __end_bss_decrypted - __start_bss_decrypted, 0);
+
+ /*
+ * If not booted using EFI, enable Live migration support.
+ */
+ if (!efi_enabled(EFI_BOOT))
+ wrmsrq(MSR_KVM_MIGRATION_CONTROL,
+ KVM_MIGRATION_READY);
}
-#ifdef CONFIG_X86_IO_APIC
- no_timer_check = 1;
+ kvmclock_init();
+ x86_platform.apic_post_init = kvm_apic_init;
+
+ /*
+ * Set WB as the default cache mode for SEV-SNP and TDX, with a single
+ * UC range for the legacy PCI hole, e.g. so that devices that expect
+ * to get UC/WC mappings don't get surprised with WB.
+ */
+ guest_force_mtrr_state(&pci_hole, 1, MTRR_TYPE_WRBACK);
+}
+
+#if defined(CONFIG_AMD_MEM_ENCRYPT)
+static void kvm_sev_es_hcall_prepare(struct ghcb *ghcb, struct pt_regs *regs)
+{
+ /* RAX and CPL are already in the GHCB */
+ ghcb_set_rbx(ghcb, regs->bx);
+ ghcb_set_rcx(ghcb, regs->cx);
+ ghcb_set_rdx(ghcb, regs->dx);
+ ghcb_set_rsi(ghcb, regs->si);
+}
+
+static bool kvm_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs)
+{
+ /* No checking of the return state needed */
+ return true;
+}
#endif
+
+const __initconst struct hypervisor_x86 x86_hyper_kvm = {
+ .name = "KVM",
+ .detect = kvm_detect,
+ .type = X86_HYPER_KVM,
+ .init.guest_late_init = kvm_guest_init,
+ .init.x2apic_available = kvm_para_available,
+ .init.msi_ext_dest_id = kvm_msi_ext_dest_id,
+ .init.init_platform = kvm_init_platform,
+#if defined(CONFIG_AMD_MEM_ENCRYPT)
+ .runtime.sev_es_hcall_prepare = kvm_sev_es_hcall_prepare,
+ .runtime.sev_es_hcall_finish = kvm_sev_es_hcall_finish,
+#endif
+};
+
+static __init int activate_jump_labels(void)
+{
+ if (has_steal_clock) {
+ static_key_slow_inc(&paravirt_steal_enabled);
+ if (steal_acc)
+ static_key_slow_inc(&paravirt_steal_rq_enabled);
+ }
+
+ return 0;
}
+arch_initcall(activate_jump_labels);
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
-void __init kvm_guest_init(void)
+/* Kick a cpu by its apicid. Used to wake up a halted vcpu */
+static void kvm_kick_cpu(int cpu)
{
- if (!kvm_para_available())
+ unsigned long flags = 0;
+ u32 apicid;
+
+ apicid = per_cpu(x86_cpu_to_apicid, cpu);
+ kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
+}
+
+#include <asm/qspinlock.h>
+
+static void kvm_wait(u8 *ptr, u8 val)
+{
+ if (in_nmi())
return;
- paravirt_ops_setup();
+ /*
+	 * Halt until it's our turn and we are kicked. Note that we do a safe
+	 * halt for the IRQ-enabled case to avoid hanging if the lock info is
+	 * overwritten in the IRQ spinlock slowpath and no spurious interrupt
+	 * occurs to save us.
+ */
+ if (irqs_disabled()) {
+ if (READ_ONCE(*ptr) == val)
+ halt();
+ } else {
+ local_irq_disable();
+
+ /* safe_halt() will enable IRQ */
+ if (READ_ONCE(*ptr) == val)
+ safe_halt();
+ else
+ local_irq_enable();
+ }
+}
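The IRQ juggling in kvm_wait() above closes the classic lost-wakeup window: the "are we still waiting" check and the actual sleep must be indivisible with respect to the kick, which safe_halt() achieves by enabling interrupts and halting in one step. As a rough userspace analogue (illustrative, not derived from the kernel code), the same rule is the condition-variable predicate loop, where the check happens under the lock the waker also takes:

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool kicked;

void wait_for_kick(void)
{
	pthread_mutex_lock(&lock);
	while (!kicked)			/* re-check before every sleep */
		pthread_cond_wait(&cond, &lock);
	kicked = false;
	pthread_mutex_unlock(&lock);
}

void kick(void)
{
	pthread_mutex_lock(&lock);
	kicked = true;
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);
}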
+
+/*
+ * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
+ */
+void __init kvm_spinlock_init(void)
+{
+ /*
+ * Disable PV spinlocks and use native qspinlock when dedicated pCPUs
+ * are available.
+ */
+ if (kvm_para_has_hint(KVM_HINTS_REALTIME)) {
+ pr_info("PV spinlocks disabled with KVM_HINTS_REALTIME hints\n");
+ goto out;
+ }
+
+ if (num_possible_cpus() == 1) {
+ pr_info("PV spinlocks disabled, single CPU\n");
+ goto out;
+ }
+
+ if (nopvspin) {
+ pr_info("PV spinlocks disabled, forced by \"nopvspin\" parameter\n");
+ goto out;
+ }
+
+ /*
+ * In case host doesn't support KVM_FEATURE_PV_UNHALT there is still an
+ * advantage of keeping virt_spin_lock_key enabled: virt_spin_lock() is
+ * preferred over native qspinlock when vCPU is preempted.
+ */
+ if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) {
+ pr_info("PV spinlocks disabled, no host support\n");
+ return;
+ }
+
+ pr_info("PV spinlocks enabled\n");
+
+ __pv_init_lock_hash();
+ pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
+ pv_ops.lock.queued_spin_unlock =
+ PV_CALLEE_SAVE(__pv_queued_spin_unlock);
+ pv_ops.lock.wait = kvm_wait;
+ pv_ops.lock.kick = kvm_kick_cpu;
+
+ /*
+ * When PV spinlock is enabled which is preferred over
+ * virt_spin_lock(), virt_spin_lock_key's value is meaningless.
+ * Just disable it anyway.
+ */
+out:
+ static_branch_disable(&virt_spin_lock_key);
+}
+
+#endif /* CONFIG_PARAVIRT_SPINLOCKS */
+
+#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
+
+static void kvm_disable_host_haltpoll(void *i)
+{
+ wrmsrq(MSR_KVM_POLL_CONTROL, 0);
}
+
+static void kvm_enable_host_haltpoll(void *i)
+{
+ wrmsrq(MSR_KVM_POLL_CONTROL, 1);
+}
+
+void arch_haltpoll_enable(unsigned int cpu)
+{
+ if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) {
+ pr_err_once("host does not support poll control\n");
+ pr_err_once("host upgrade recommended\n");
+ return;
+ }
+
+ /* Enable guest halt poll disables host halt poll */
+ smp_call_function_single(cpu, kvm_disable_host_haltpoll, NULL, 1);
+}
+EXPORT_SYMBOL_GPL(arch_haltpoll_enable);
+
+void arch_haltpoll_disable(unsigned int cpu)
+{
+ if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
+ return;
+
+ /* Disable guest halt poll enables host halt poll */
+ smp_call_function_single(cpu, kvm_enable_host_haltpoll, NULL, 1);
+}
+EXPORT_SYMBOL_GPL(arch_haltpoll_disable);
+#endif
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 223af43f1526..ca0a49eeac4a 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* KVM paravirtual clock driver. A clocksource implementation
Copyright (C) 2008 Glauber de Oliveira Costa, Red Hat Inc.
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <linux/clocksource.h>
@@ -22,66 +9,102 @@
#include <asm/msr.h>
#include <asm/apic.h>
#include <linux/percpu.h>
-#include <asm/reboot.h>
+#include <linux/hardirq.h>
+#include <linux/cpuhotplug.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/set_memory.h>
+#include <linux/cc_platform.h>
-#define KVM_SCALE 22
+#include <asm/hypervisor.h>
+#include <asm/x86_init.h>
+#include <asm/kvmclock.h>
-static int kvmclock = 1;
+static int kvmclock __initdata = 1;
+static int kvmclock_vsyscall __initdata = 1;
+static int msr_kvm_system_time __ro_after_init;
+static int msr_kvm_wall_clock __ro_after_init;
+static u64 kvm_sched_clock_offset __ro_after_init;
-static int parse_no_kvmclock(char *arg)
+static int __init parse_no_kvmclock(char *arg)
{
kvmclock = 0;
return 0;
}
early_param("no-kvmclock", parse_no_kvmclock);
-/* The hypervisor will put information about time periodically here */
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock);
-static struct pvclock_wall_clock wall_clock;
+static int __init parse_no_kvmclock_vsyscall(char *arg)
+{
+ kvmclock_vsyscall = 0;
+ return 0;
+}
+early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
+
+/* Aligned to page sizes to match what's mapped via vsyscalls to userspace */
+#define HVC_BOOT_ARRAY_SIZE \
+ (PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info))
+
+static struct pvclock_vsyscall_time_info
+ hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __bss_decrypted __aligned(PAGE_SIZE);
+static struct pvclock_wall_clock wall_clock __bss_decrypted;
+static struct pvclock_vsyscall_time_info *hvclock_mem;
+DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
+EXPORT_PER_CPU_SYMBOL_GPL(hv_clock_per_cpu);
/*
* The wallclock is the time of day when we booted. Since then, some time may
* have elapsed since the hypervisor wrote the data. So we try to account for
* that with system time
*/
-static unsigned long kvm_get_wallclock(void)
+static void kvm_get_wallclock(struct timespec64 *now)
{
- struct pvclock_vcpu_time_info *vcpu_time;
- struct timespec ts;
- int low, high;
-
- low = (int)__pa(&wall_clock);
- high = ((u64)__pa(&wall_clock) >> 32);
- native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
-
- vcpu_time = &get_cpu_var(hv_clock);
- pvclock_read_wallclock(&wall_clock, vcpu_time, &ts);
- put_cpu_var(hv_clock);
-
- return ts.tv_sec;
+ wrmsrq(msr_kvm_wall_clock, slow_virt_to_phys(&wall_clock));
+ preempt_disable();
+ pvclock_read_wallclock(&wall_clock, this_cpu_pvti(), now);
+ preempt_enable();
}
-static int kvm_set_wallclock(unsigned long now)
+static int kvm_set_wallclock(const struct timespec64 *now)
{
- return -1;
+ return -ENODEV;
}
-static cycle_t kvm_clock_read(void)
+static u64 kvm_clock_read(void)
{
- struct pvclock_vcpu_time_info *src;
- cycle_t ret;
+ u64 ret;
- src = &get_cpu_var(hv_clock);
- ret = pvclock_clocksource_read(src);
- put_cpu_var(hv_clock);
+ preempt_disable_notrace();
+ ret = pvclock_clocksource_read_nowd(this_cpu_pvti());
+ preempt_enable_notrace();
return ret;
}
-static cycle_t kvm_clock_get_cycles(struct clocksource *cs)
+static u64 kvm_clock_get_cycles(struct clocksource *cs)
{
return kvm_clock_read();
}
+static noinstr u64 kvm_sched_clock_read(void)
+{
+ return pvclock_clocksource_read_nowd(this_cpu_pvti()) - kvm_sched_clock_offset;
+}
+
+static inline void kvm_sched_clock_init(bool stable)
+{
+ if (!stable)
+ clear_sched_clock_stable();
+ kvm_sched_clock_offset = kvm_clock_read();
+ paravirt_set_sched_clock(kvm_sched_clock_read);
+
+ pr_info("kvm-clock: using sched offset of %llu cycles",
+ kvm_sched_clock_offset);
+
+ BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) >
+ sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time));
+}
+
/*
* If we don't do that, there is the possibility that the guest
* will calibrate under heavy load - thus, getting a lower lpj -
@@ -93,12 +116,11 @@ static cycle_t kvm_clock_get_cycles(struct clocksource *cs)
*/
static unsigned long kvm_get_tsc_khz(void)
{
- struct pvclock_vcpu_time_info *src;
- src = &per_cpu(hv_clock, 0);
- return pvclock_tsc_khz(src);
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+ return pvclock_tsc_khz(this_cpu_pvti());
}
-static void kvm_get_preset_lpj(void)
+static void __init kvm_get_preset_lpj(void)
{
unsigned long khz;
u64 lpj;
@@ -110,95 +132,218 @@ static void kvm_get_preset_lpj(void)
preset_lpj = lpj;
}
+bool kvm_check_and_clear_guest_paused(void)
+{
+ struct pvclock_vsyscall_time_info *src = this_cpu_hvclock();
+ bool ret = false;
+
+ if (!src)
+ return ret;
+
+ if ((src->pvti.flags & PVCLOCK_GUEST_STOPPED) != 0) {
+ src->pvti.flags &= ~PVCLOCK_GUEST_STOPPED;
+ pvclock_touch_watchdogs();
+ ret = true;
+ }
+ return ret;
+}
+
+static int kvm_cs_enable(struct clocksource *cs)
+{
+ vclocks_set_used(VDSO_CLOCKMODE_PVCLOCK);
+ return 0;
+}
+
static struct clocksource kvm_clock = {
- .name = "kvm-clock",
- .read = kvm_clock_get_cycles,
- .rating = 400,
- .mask = CLOCKSOURCE_MASK(64),
- .mult = 1 << KVM_SCALE,
- .shift = KVM_SCALE,
- .flags = CLOCK_SOURCE_IS_CONTINUOUS,
+ .name = "kvm-clock",
+ .read = kvm_clock_get_cycles,
+ .rating = 400,
+ .mask = CLOCKSOURCE_MASK(64),
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS,
+ .id = CSID_X86_KVM_CLK,
+ .enable = kvm_cs_enable,
};
-static int kvm_register_clock(char *txt)
+static void kvm_register_clock(char *txt)
{
- int cpu = smp_processor_id();
- int low, high;
- low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;
- high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
- printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
- cpu, high, low, txt);
- return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high);
+ struct pvclock_vsyscall_time_info *src = this_cpu_hvclock();
+ u64 pa;
+
+ if (!src)
+ return;
+
+ pa = slow_virt_to_phys(&src->pvti) | 0x01ULL;
+ wrmsrq(msr_kvm_system_time, pa);
+ pr_debug("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt);
+}
+
+static void kvm_save_sched_clock_state(void)
+{
+}
+
+static void kvm_restore_sched_clock_state(void)
+{
+ kvm_register_clock("primary cpu clock, resume");
}
#ifdef CONFIG_X86_LOCAL_APIC
-static void __cpuinit kvm_setup_secondary_clock(void)
+static void kvm_setup_secondary_clock(void)
{
- /*
- * Now that the first cpu already had this clocksource initialized,
- * we shouldn't fail.
- */
- WARN_ON(kvm_register_clock("secondary cpu clock"));
- /* ok, done with our trickery, call native */
- setup_secondary_APIC_clock();
+ kvm_register_clock("secondary cpu clock");
}
#endif
-#ifdef CONFIG_SMP
-static void __init kvm_smp_prepare_boot_cpu(void)
+void kvmclock_disable(void)
{
- WARN_ON(kvm_register_clock("primary cpu clock"));
- native_smp_prepare_boot_cpu();
+ if (msr_kvm_system_time)
+ native_write_msr(msr_kvm_system_time, 0);
}
-#endif
-/*
- * After the clock is registered, the host will keep writing to the
- * registered memory location. If the guest happens to shutdown, this memory
- * won't be valid. In cases like kexec, in which you install a new kernel, this
- * means a random memory location will be kept being written. So before any
- * kind of shutdown from our side, we unregister the clock by writting anything
- * that does not have the 'enable' bit set in the msr
- */
-#ifdef CONFIG_KEXEC
-static void kvm_crash_shutdown(struct pt_regs *regs)
+static void __init kvmclock_init_mem(void)
{
- native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0);
- native_machine_crash_shutdown(regs);
+ unsigned long ncpus;
+ unsigned int order;
+ struct page *p;
+ int r;
+
+ if (HVC_BOOT_ARRAY_SIZE >= num_possible_cpus())
+ return;
+
+ ncpus = num_possible_cpus() - HVC_BOOT_ARRAY_SIZE;
+ order = get_order(ncpus * sizeof(*hvclock_mem));
+
+ p = alloc_pages(GFP_KERNEL, order);
+ if (!p) {
+ pr_warn("%s: failed to alloc %d pages", __func__, (1U << order));
+ return;
+ }
+
+ hvclock_mem = page_address(p);
+
+ /*
+ * hvclock is shared between the guest and the hypervisor, must
+ * be mapped decrypted.
+ */
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
+ r = set_memory_decrypted((unsigned long) hvclock_mem,
+ 1UL << order);
+ if (r) {
+ __free_pages(p, order);
+ hvclock_mem = NULL;
+ pr_warn("kvmclock: set_memory_decrypted() failed. Disabling\n");
+ return;
+ }
+ }
+
+ memset(hvclock_mem, 0, PAGE_SIZE << order);
}
+
+static int __init kvm_setup_vsyscall_timeinfo(void)
+{
+ if (!kvm_para_available() || !kvmclock || nopv)
+ return 0;
+
+ kvmclock_init_mem();
+
+#ifdef CONFIG_X86_64
+ if (per_cpu(hv_clock_per_cpu, 0) && kvmclock_vsyscall) {
+ u8 flags;
+
+ flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
+ if (!(flags & PVCLOCK_TSC_STABLE_BIT))
+ return 0;
+
+ kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK;
+ }
#endif
-static void kvm_shutdown(void)
+ return 0;
+}
+early_initcall(kvm_setup_vsyscall_timeinfo);
+
+static int kvmclock_setup_percpu(unsigned int cpu)
{
- native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0);
- native_machine_shutdown();
+ struct pvclock_vsyscall_time_info *p = per_cpu(hv_clock_per_cpu, cpu);
+
+ /*
+ * The per cpu area setup replicates CPU0 data to all cpu
+ * pointers. So carefully check. CPU0 has been set up in init
+ * already.
+ */
+ if (!cpu || (p && p != per_cpu(hv_clock_per_cpu, 0)))
+ return 0;
+
+ /* Use the static page for the first CPUs, allocate otherwise */
+ if (cpu < HVC_BOOT_ARRAY_SIZE)
+ p = &hv_clock_boot[cpu];
+ else if (hvclock_mem)
+ p = hvclock_mem + cpu - HVC_BOOT_ARRAY_SIZE;
+ else
+ return -ENOMEM;
+
+ per_cpu(hv_clock_per_cpu, cpu) = p;
+ return p ? 0 : -ENOMEM;
}
void __init kvmclock_init(void)
{
- if (!kvm_para_available())
+ u8 flags;
+
+ if (!kvm_para_available() || !kvmclock)
return;
- if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
- if (kvm_register_clock("boot clock"))
- return;
- pv_time_ops.get_wallclock = kvm_get_wallclock;
- pv_time_ops.set_wallclock = kvm_set_wallclock;
- pv_time_ops.sched_clock = kvm_clock_read;
- pv_time_ops.get_tsc_khz = kvm_get_tsc_khz;
+ if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) {
+ msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
+ msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
+ } else if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
+ msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
+ msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
+ } else {
+ return;
+ }
+
+ if (cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "kvmclock:setup_percpu",
+ kvmclock_setup_percpu, NULL) < 0) {
+ return;
+ }
+
+ pr_info("kvm-clock: Using msrs %x and %x",
+ msr_kvm_system_time, msr_kvm_wall_clock);
+
+ this_cpu_write(hv_clock_per_cpu, &hv_clock_boot[0]);
+ kvm_register_clock("primary cpu clock");
+ pvclock_set_pvti_cpu0_va(hv_clock_boot);
+
+ if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
+ pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
+
+ flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
+ kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);
+
+ x86_platform.calibrate_tsc = kvm_get_tsc_khz;
+ x86_platform.calibrate_cpu = kvm_get_tsc_khz;
+ x86_platform.get_wallclock = kvm_get_wallclock;
+ x86_platform.set_wallclock = kvm_set_wallclock;
#ifdef CONFIG_X86_LOCAL_APIC
- pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
-#endif
-#ifdef CONFIG_SMP
- smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
-#endif
- machine_ops.shutdown = kvm_shutdown;
-#ifdef CONFIG_KEXEC
- machine_ops.crash_shutdown = kvm_crash_shutdown;
+ x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock;
#endif
- kvm_get_preset_lpj();
- clocksource_register(&kvm_clock);
- pv_info.paravirt_enabled = 1;
- pv_info.name = "KVM";
- }
+ x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
+ x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
+ kvm_get_preset_lpj();
+
+ /*
+	 * X86_FEATURE_NONSTOP_TSC means the TSC runs at a constant rate
+	 * across P/T-state changes and does not stop in deep C-states.
+	 *
+	 * An invariant TSC exposed by the host means kvmclock is not
+	 * necessary: the TSC can be used as the clocksource.
+ */
+ if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
+ boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
+ !check_tsc_unstable())
+ kvm_clock.rating = 299;
+
+ clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
+ pv_info.name = "KVM";
}
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 71f1d99a635d..0f19ef355f5f 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -1,120 +1,485 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
* Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
* Copyright (C) 2002 Andi Kleen
*
* This handles calls from both 32bit and 64bit mode.
+ *
+ * Lock order:
+ * context.ldt_usr_sem
+ * mmap_lock
+ * context.lock
*/
#include <linux/errno.h>
+#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/smp.h>
+#include <linux/syscalls.h>
+#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
-#include <asm/system.h>
#include <asm/ldt.h>
+#include <asm/tlb.h>
#include <asm/desc.h>
#include <asm/mmu_context.h>
-#include <asm/syscalls.h>
+#include <asm/pgtable_areas.h>
+
+#include <xen/xen.h>
+
+/* This is a multiple of PAGE_SIZE. */
+#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)
-#ifdef CONFIG_SMP
-static void flush_ldt(void *current_mm)
+static inline void *ldt_slot_va(int slot)
{
- if (current->active_mm == current_mm)
- load_LDT(&current->active_mm->context);
+ return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
}
-#endif
-static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
+void load_mm_ldt(struct mm_struct *mm)
{
- void *oldldt, *newldt;
- int oldsize;
+ struct ldt_struct *ldt;
- if (mincount <= pc->size)
- return 0;
- oldsize = pc->size;
- mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) &
- (~(PAGE_SIZE / LDT_ENTRY_SIZE - 1));
- if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE)
- newldt = vmalloc(mincount * LDT_ENTRY_SIZE);
- else
- newldt = (void *)__get_free_page(GFP_KERNEL);
+ /* READ_ONCE synchronizes with smp_store_release */
+ ldt = READ_ONCE(mm->context.ldt);
- if (!newldt)
- return -ENOMEM;
+ /*
+ * Any change to mm->context.ldt is followed by an IPI to all
+ * CPUs with the mm active. The LDT will not be freed until
+ * after the IPI is handled by all such CPUs. This means that
+ * if the ldt_struct changes before we return, the values we see
+ * will be safe, and the new values will be loaded before we run
+ * any user code.
+ *
+ * NB: don't try to convert this to use RCU without extreme care.
+ * We would still need IRQs off, because we don't want to change
+ * the local LDT after an IPI loaded a newer value than the one
+ * that we can see.
+ */
- if (oldsize)
- memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE);
- oldldt = pc->ldt;
- memset(newldt + oldsize * LDT_ENTRY_SIZE, 0,
- (mincount - oldsize) * LDT_ENTRY_SIZE);
+ if (unlikely(ldt)) {
+ if (static_cpu_has(X86_FEATURE_PTI)) {
+ if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
+ /*
+ * Whoops -- either the new LDT isn't mapped
+ * (if slot == -1) or is mapped into a bogus
+ * slot (if slot > 1).
+ */
+ clear_LDT();
+ return;
+ }
- paravirt_alloc_ldt(newldt, mincount);
+ /*
+ * If page table isolation is enabled, ldt->entries
+ * will not be mapped in the userspace pagetables.
+ * Tell the CPU to access the LDT through the alias
+ * at ldt_slot_va(ldt->slot).
+ */
+ set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
+ } else {
+ set_ldt(ldt->entries, ldt->nr_entries);
+ }
+ } else {
+ clear_LDT();
+ }
+}
+
+void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
+{
+ /*
+ * Load the LDT if either the old or new mm had an LDT.
+ *
+ * An mm will never go from having an LDT to not having an LDT. Two
+ * mms never share an LDT, so we don't gain anything by checking to
+ * see whether the LDT changed. There's also no guarantee that
+ * prev->context.ldt actually matches LDTR, but, if LDTR is non-NULL,
+ * then prev->context.ldt will also be non-NULL.
+ *
+ * If we really cared, we could optimize the case where prev == next
+ * and we're exiting lazy mode. Most of the time, if this happens,
+ * we don't actually need to reload LDTR, but modify_ldt() is mostly
+ * used by legacy code and emulators where we don't need this level of
+ * performance.
+ *
+ * This uses | instead of || because it generates better code.
+ */
+ if (unlikely((unsigned long)prev->context.ldt |
+ (unsigned long)next->context.ldt))
+ load_mm_ldt(next);
+ DEBUG_LOCKS_WARN_ON(preemptible());
+}
+
+static void refresh_ldt_segments(void)
+{
#ifdef CONFIG_X86_64
- /* CHECKME: Do we really need this ? */
- wmb();
-#endif
- pc->ldt = newldt;
- wmb();
- pc->size = mincount;
- wmb();
-
- if (reload) {
-#ifdef CONFIG_SMP
- preempt_disable();
- load_LDT(pc);
- if (!cpus_equal(current->mm->cpu_vm_mask,
- cpumask_of_cpu(smp_processor_id())))
- smp_call_function(flush_ldt, current->mm, 1);
- preempt_enable();
-#else
- load_LDT(pc);
+ unsigned short sel;
+
+ /*
+ * Make sure that the cached DS and ES descriptors match the updated
+ * LDT.
+ */
+ savesegment(ds, sel);
+ if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT)
+ loadsegment(ds, sel);
+
+ savesegment(es, sel);
+ if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT)
+ loadsegment(es, sel);
#endif
+}
+
+/* context.lock is held by the task which issued the smp function call */
+static void flush_ldt(void *__mm)
+{
+ struct mm_struct *mm = __mm;
+
+ if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
+ return;
+
+ load_mm_ldt(mm);
+
+ refresh_ldt_segments();
+}
+
+/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
+static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
+{
+ struct ldt_struct *new_ldt;
+ unsigned int alloc_size;
+
+ if (num_entries > LDT_ENTRIES)
+ return NULL;
+
+ new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL_ACCOUNT);
+ if (!new_ldt)
+ return NULL;
+
+ BUILD_BUG_ON(LDT_ENTRY_SIZE != sizeof(struct desc_struct));
+ alloc_size = num_entries * LDT_ENTRY_SIZE;
+
+ /*
+ * Xen is very picky: it requires a page-aligned LDT that has no
+ * trailing nonzero bytes in any page that contains LDT descriptors.
+ * Keep it simple: zero the whole allocation and never allocate less
+ * than PAGE_SIZE.
+ */
+ if (alloc_size > PAGE_SIZE)
+ new_ldt->entries = __vmalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ else
+ new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+
+ if (!new_ldt->entries) {
+ kfree(new_ldt);
+ return NULL;
+ }
+
+ /* The new LDT isn't aliased for PTI yet. */
+ new_ldt->slot = -1;
+
+ new_ldt->nr_entries = num_entries;
+ return new_ldt;
+}
+
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
+
+static void do_sanity_check(struct mm_struct *mm,
+ bool had_kernel_mapping,
+ bool had_user_mapping)
+{
+ if (mm->context.ldt) {
+ /*
+ * We already had an LDT. The top-level entry should already
+ * have been allocated and synchronized with the usermode
+ * tables.
+ */
+ WARN_ON(!had_kernel_mapping);
+ if (boot_cpu_has(X86_FEATURE_PTI))
+ WARN_ON(!had_user_mapping);
+ } else {
+ /*
+ * This is the first time we're mapping an LDT for this process.
+ * Sync the pgd to the usermode tables.
+ */
+ WARN_ON(had_kernel_mapping);
+ if (boot_cpu_has(X86_FEATURE_PTI))
+ WARN_ON(had_user_mapping);
}
- if (oldsize) {
- paravirt_free_ldt(oldldt, oldsize);
- if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
- vfree(oldldt);
- else
- put_page(virt_to_page(oldldt));
+}
+
+#ifdef CONFIG_X86_PAE
+
+static pmd_t *pgd_to_pmd_walk(pgd_t *pgd, unsigned long va)
+{
+ p4d_t *p4d;
+ pud_t *pud;
+
+ if (pgd->pgd == 0)
+ return NULL;
+
+ p4d = p4d_offset(pgd, va);
+ if (p4d_none(*p4d))
+ return NULL;
+
+ pud = pud_offset(p4d, va);
+ if (pud_none(*pud))
+ return NULL;
+
+ return pmd_offset(pud, va);
+}
+
+static void map_ldt_struct_to_user(struct mm_struct *mm)
+{
+ pgd_t *k_pgd = pgd_offset(mm, LDT_BASE_ADDR);
+ pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
+ pmd_t *k_pmd, *u_pmd;
+
+ k_pmd = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR);
+ u_pmd = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR);
+
+ if (boot_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt)
+ set_pmd(u_pmd, *k_pmd);
+}
+
+static void sanity_check_ldt_mapping(struct mm_struct *mm)
+{
+ pgd_t *k_pgd = pgd_offset(mm, LDT_BASE_ADDR);
+ pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
+ bool had_kernel, had_user;
+ pmd_t *k_pmd, *u_pmd;
+
+ k_pmd = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR);
+ u_pmd = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR);
+ had_kernel = (k_pmd->pmd != 0);
+ had_user = (u_pmd->pmd != 0);
+
+ do_sanity_check(mm, had_kernel, had_user);
+}
+
+#else /* !CONFIG_X86_PAE */
+
+static void map_ldt_struct_to_user(struct mm_struct *mm)
+{
+ pgd_t *pgd = pgd_offset(mm, LDT_BASE_ADDR);
+
+ if (boot_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt)
+ set_pgd(kernel_to_user_pgdp(pgd), *pgd);
+}
+
+static void sanity_check_ldt_mapping(struct mm_struct *mm)
+{
+ pgd_t *pgd = pgd_offset(mm, LDT_BASE_ADDR);
+ bool had_kernel = (pgd->pgd != 0);
+ bool had_user = (kernel_to_user_pgdp(pgd)->pgd != 0);
+
+ do_sanity_check(mm, had_kernel, had_user);
+}
+
+#endif /* CONFIG_X86_PAE */
+
+/*
+ * If PTI is enabled, this maps the LDT into the kernelmode and
+ * usermode tables for the given mm.
+ */
+static int
+map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
+{
+ unsigned long va;
+ bool is_vmalloc;
+ spinlock_t *ptl;
+ int i, nr_pages;
+
+ if (!boot_cpu_has(X86_FEATURE_PTI))
+ return 0;
+
+ /*
+ * Any given ldt_struct should have map_ldt_struct() called at most
+ * once.
+ */
+ WARN_ON(ldt->slot != -1);
+
+ /* Check if the current mappings are sane */
+ sanity_check_ldt_mapping(mm);
+
+ is_vmalloc = is_vmalloc_addr(ldt->entries);
+
+ nr_pages = DIV_ROUND_UP(ldt->nr_entries * LDT_ENTRY_SIZE, PAGE_SIZE);
+
+ for (i = 0; i < nr_pages; i++) {
+ unsigned long offset = i << PAGE_SHIFT;
+ const void *src = (char *)ldt->entries + offset;
+ unsigned long pfn;
+ pgprot_t pte_prot;
+ pte_t pte, *ptep;
+
+ va = (unsigned long)ldt_slot_va(slot) + offset;
+ pfn = is_vmalloc ? vmalloc_to_pfn(src) :
+ page_to_pfn(virt_to_page(src));
+ /*
+ * Treat the PTI LDT range as a *userspace* range.
+ * get_locked_pte() will allocate all needed pagetables
+ * and account for them in this mm.
+ */
+ ptep = get_locked_pte(mm, va, &ptl);
+ if (!ptep)
+ return -ENOMEM;
+ /*
+ * Map it read-only so that this easy-to-find address is not a
+ * primary target via some kernel interface which misses a
+ * permission check.
+ */
+ pte_prot = __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL);
+ /* Filter out unsupported __PAGE_KERNEL* bits: */
+ pgprot_val(pte_prot) &= __supported_pte_mask;
+ pte = pfn_pte(pfn, pte_prot);
+ set_pte_at(mm, va, ptep, pte);
+ pte_unmap_unlock(ptep, ptl);
}
+
+ /* Propagate LDT mapping to the user page-table */
+ map_ldt_struct_to_user(mm);
+
+ ldt->slot = slot;
return 0;
}
-static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
+static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt)
{
- int err = alloc_ldt(new, old->size, 0);
- int i;
+ unsigned long va;
+ int i, nr_pages;
+
+ if (!ldt)
+ return;
- if (err < 0)
- return err;
+ /* LDT map/unmap is only required for PTI */
+ if (!boot_cpu_has(X86_FEATURE_PTI))
+ return;
- for (i = 0; i < old->size; i++)
- write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE);
+ nr_pages = DIV_ROUND_UP(ldt->nr_entries * LDT_ENTRY_SIZE, PAGE_SIZE);
+
+ for (i = 0; i < nr_pages; i++) {
+ unsigned long offset = i << PAGE_SHIFT;
+ spinlock_t *ptl;
+ pte_t *ptep;
+
+ va = (unsigned long)ldt_slot_va(ldt->slot) + offset;
+ ptep = get_locked_pte(mm, va, &ptl);
+ if (!WARN_ON_ONCE(!ptep)) {
+ pte_clear(mm, va, ptep);
+ pte_unmap_unlock(ptep, ptl);
+ }
+ }
+
+ va = (unsigned long)ldt_slot_va(ldt->slot);
+ flush_tlb_mm_range(mm, va, va + nr_pages * PAGE_SIZE, PAGE_SHIFT, false);
+}
+
+#else /* !CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */
+
+static int
+map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
+{
return 0;
}
+static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt)
+{
+}
+#endif /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */
+
+static void free_ldt_pgtables(struct mm_struct *mm)
+{
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
+ struct mmu_gather tlb;
+ unsigned long start = LDT_BASE_ADDR;
+ unsigned long end = LDT_END_ADDR;
+
+ if (!boot_cpu_has(X86_FEATURE_PTI))
+ return;
+
+ /*
+ * Although free_pgd_range() is intended for freeing user
+ * page-tables, it also works out for kernel mappings on x86.
+ * We use tlb_gather_mmu_fullmm() to avoid confusing the
+ * range-tracking logic in __tlb_adjust_range().
+ */
+ tlb_gather_mmu_fullmm(&tlb, mm);
+ free_pgd_range(&tlb, start, end, start, end);
+ tlb_finish_mmu(&tlb);
+#endif
+}
+
+/* After calling this, the LDT is immutable. */
+static void finalize_ldt_struct(struct ldt_struct *ldt)
+{
+ paravirt_alloc_ldt(ldt->entries, ldt->nr_entries);
+}
+
+static void install_ldt(struct mm_struct *mm, struct ldt_struct *ldt)
+{
+ mutex_lock(&mm->context.lock);
+
+ /* Synchronizes with READ_ONCE in load_mm_ldt. */
+ smp_store_release(&mm->context.ldt, ldt);
+
+ /* Activate the LDT for all CPUs using current's mm. */
+ on_each_cpu_mask(mm_cpumask(mm), flush_ldt, mm, true);
+
+ mutex_unlock(&mm->context.lock);
+}
+
+static void free_ldt_struct(struct ldt_struct *ldt)
+{
+ if (likely(!ldt))
+ return;
+
+ paravirt_free_ldt(ldt->entries, ldt->nr_entries);
+ if (ldt->nr_entries * LDT_ENTRY_SIZE > PAGE_SIZE)
+ vfree_atomic(ldt->entries);
+ else
+ free_page((unsigned long)ldt->entries);
+ kfree(ldt);
+}
+
/*
- * we do not have to muck with descriptors here, that is
- * done in switch_mm() as needed.
+ * Called on fork from arch_dup_mmap(). Just copy the current LDT state;
+ * the new task is not running, so nothing can be installed.
*/
-int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm)
{
- struct mm_struct *old_mm;
+ struct ldt_struct *new_ldt;
int retval = 0;
- mutex_init(&mm->context.lock);
- mm->context.size = 0;
- old_mm = current->mm;
- if (old_mm && old_mm->context.size > 0) {
- mutex_lock(&old_mm->context.lock);
- retval = copy_ldt(&mm->context, &old_mm->context);
- mutex_unlock(&old_mm->context.lock);
+ if (!old_mm)
+ return 0;
+
+ mutex_lock(&old_mm->context.lock);
+ if (!old_mm->context.ldt)
+ goto out_unlock;
+
+ new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries);
+ if (!new_ldt) {
+ retval = -ENOMEM;
+ goto out_unlock;
}
+
+ memcpy(new_ldt->entries, old_mm->context.ldt->entries,
+ new_ldt->nr_entries * LDT_ENTRY_SIZE);
+ finalize_ldt_struct(new_ldt);
+
+ retval = map_ldt_struct(mm, new_ldt, 0);
+ if (retval) {
+ free_ldt_pgtables(mm);
+ free_ldt_struct(new_ldt);
+ goto out_unlock;
+ }
+ mm->context.ldt = new_ldt;
+
+out_unlock:
+ mutex_unlock(&old_mm->context.lock);
return retval;
}
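The load_mm_ldt()/install_ldt() pairing above is a release/acquire publication pattern: install_ldt() publishes the new ldt_struct with smp_store_release() and then IPIs every CPU in mm_cpumask() so flush_ldt() reloads it immediately. A toy userspace sketch of the publication half follows, with C11 atomics standing in for the kernel primitives; struct table, current_table, and the polling reader are invented for the example.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

/* Stand-in for struct ldt_struct; only the entry count matters here. */
struct table { int nr_entries; };

static _Atomic(struct table *) current_table;

/* Reader side: acquire load, playing the role of READ_ONCE() in load_mm_ldt(). */
static void *reader(void *arg)
{
	struct table *t;

	(void)arg;
	do {
		t = atomic_load_explicit(&current_table, memory_order_acquire);
	} while (!t);
	printf("reader sees %d entries\n", t->nr_entries);
	return NULL;
}

int main(void)
{
	static struct table t = { .nr_entries = 16 };
	pthread_t tid;

	pthread_create(&tid, NULL, reader, NULL);
	/* Publish: plays the role of smp_store_release() in install_ldt().
	 * The kernel then IPIs the mm's CPUs; this toy just lets the reader poll. */
	atomic_store_explicit(&current_table, &t, memory_order_release);
	pthread_join(tid, NULL);
	return 0;
}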
@@ -123,55 +488,54 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
*
* 64bit: Don't touch the LDT register - we're already in the next thread.
*/
-void destroy_context(struct mm_struct *mm)
+void destroy_context_ldt(struct mm_struct *mm)
{
- if (mm->context.size) {
-#ifdef CONFIG_X86_32
- /* CHECKME: Can this ever happen ? */
- if (mm == current->active_mm)
- clear_LDT();
-#endif
- paravirt_free_ldt(mm->context.ldt, mm->context.size);
- if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE)
- vfree(mm->context.ldt);
- else
- put_page(virt_to_page(mm->context.ldt));
- mm->context.size = 0;
- }
+ free_ldt_struct(mm->context.ldt);
+ mm->context.ldt = NULL;
+}
+
+void ldt_arch_exit_mmap(struct mm_struct *mm)
+{
+ free_ldt_pgtables(mm);
}
static int read_ldt(void __user *ptr, unsigned long bytecount)
{
- int err;
- unsigned long size;
struct mm_struct *mm = current->mm;
+ unsigned long entries_size;
+ int retval;
+
+ down_read(&mm->context.ldt_usr_sem);
+
+ if (!mm->context.ldt) {
+ retval = 0;
+ goto out_unlock;
+ }
- if (!mm->context.size)
- return 0;
if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;
- mutex_lock(&mm->context.lock);
- size = mm->context.size * LDT_ENTRY_SIZE;
- if (size > bytecount)
- size = bytecount;
+ entries_size = mm->context.ldt->nr_entries * LDT_ENTRY_SIZE;
+ if (entries_size > bytecount)
+ entries_size = bytecount;
- err = 0;
- if (copy_to_user(ptr, mm->context.ldt, size))
- err = -EFAULT;
- mutex_unlock(&mm->context.lock);
- if (err < 0)
- goto error_return;
- if (size != bytecount) {
- /* zero-fill the rest */
- if (clear_user(ptr + size, bytecount - size) != 0) {
- err = -EFAULT;
- goto error_return;
+ if (copy_to_user(ptr, mm->context.ldt->entries, entries_size)) {
+ retval = -EFAULT;
+ goto out_unlock;
+ }
+
+ if (entries_size != bytecount) {
+ /* Zero-fill the rest and pretend we read bytecount bytes. */
+ if (clear_user(ptr + entries_size, bytecount - entries_size)) {
+ retval = -EFAULT;
+ goto out_unlock;
}
}
- return bytecount;
-error_return:
- return err;
+ retval = bytecount;
+
+out_unlock:
+ up_read(&mm->context.ldt_usr_sem);
+ return retval;
}
static int read_default_ldt(void __user *ptr, unsigned long bytecount)
@@ -189,12 +553,36 @@ static int read_default_ldt(void __user *ptr, unsigned long bytecount)
return bytecount;
}
+static bool allow_16bit_segments(void)
+{
+ if (!IS_ENABLED(CONFIG_X86_16BIT))
+ return false;
+
+#ifdef CONFIG_XEN_PV
+ /*
+ * Xen PV does not implement ESPFIX64, which means that 16-bit
+ * segments will not work correctly. Until Xen PV implements
+ * ESPFIX64 and can signal this fact to the guest, or someone
+ * provides compelling evidence that allowing broken 16-bit segments
+ * is worthwhile, disallow 16-bit segments under Xen PV.
+ */
+ if (xen_pv_domain()) {
+ pr_info_once("Warning: 16-bit segments do not work correctly in a Xen PV guest\n");
+ return false;
+ }
+#endif
+
+ return true;
+}
+
static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
{
struct mm_struct *mm = current->mm;
+ struct ldt_struct *new_ldt, *old_ldt;
+ unsigned int old_nr_entries, new_nr_entries;
+ struct user_desc ldt_info;
struct desc_struct ldt;
int error;
- struct user_desc ldt_info;
error = -EINVAL;
if (bytecount != sizeof(ldt_info))
@@ -213,39 +601,71 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
goto out;
}
- mutex_lock(&mm->context.lock);
- if (ldt_info.entry_number >= mm->context.size) {
- error = alloc_ldt(&current->mm->context,
- ldt_info.entry_number + 1, 1);
- if (error < 0)
- goto out_unlock;
- }
-
- /* Allow LDTs to be cleared by the user. */
- if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
- if (oldmode || LDT_empty(&ldt_info)) {
- memset(&ldt, 0, sizeof(ldt));
- goto install;
+ if ((oldmode && !ldt_info.base_addr && !ldt_info.limit) ||
+ LDT_empty(&ldt_info)) {
+ /* The user wants to clear the entry. */
+ memset(&ldt, 0, sizeof(ldt));
+ } else {
+ if (!ldt_info.seg_32bit && !allow_16bit_segments()) {
+ error = -EINVAL;
+ goto out;
}
+
+ fill_ldt(&ldt, &ldt_info);
+ if (oldmode)
+ ldt.avl = 0;
}
- fill_ldt(&ldt, &ldt_info);
- if (oldmode)
- ldt.avl = 0;
+ if (down_write_killable(&mm->context.ldt_usr_sem))
+ return -EINTR;
+
+ old_ldt = mm->context.ldt;
+ old_nr_entries = old_ldt ? old_ldt->nr_entries : 0;
+ new_nr_entries = max(ldt_info.entry_number + 1, old_nr_entries);
+
+ error = -ENOMEM;
+ new_ldt = alloc_ldt_struct(new_nr_entries);
+ if (!new_ldt)
+ goto out_unlock;
- /* Install the new entry ... */
-install:
- write_ldt_entry(mm->context.ldt, ldt_info.entry_number, &ldt);
+ if (old_ldt)
+ memcpy(new_ldt->entries, old_ldt->entries, old_nr_entries * LDT_ENTRY_SIZE);
+
+ new_ldt->entries[ldt_info.entry_number] = ldt;
+ finalize_ldt_struct(new_ldt);
+
+ /*
+ * If we are using PTI, map the new LDT into the userspace pagetables.
+ * If there is already an LDT, use the other slot so that other CPUs
+ * will continue to use the old LDT until install_ldt() switches
+ * them over to the new LDT.
+ */
+ error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0);
+ if (error) {
+ /*
+ * This can only fail for the first LDT setup. If an LDT is
+ * already installed then the PTE page is already
+ * populated. Mop up a half-populated page table.
+ */
+ if (!WARN_ON_ONCE(old_ldt))
+ free_ldt_pgtables(mm);
+ free_ldt_struct(new_ldt);
+ goto out_unlock;
+ }
+
+ install_ldt(mm, new_ldt);
+ unmap_ldt_struct(mm, old_ldt);
+ free_ldt_struct(old_ldt);
error = 0;
out_unlock:
- mutex_unlock(&mm->context.lock);
+ up_write(&mm->context.ldt_usr_sem);
out:
return error;
}
-asmlinkage int sys_modify_ldt(int func, void __user *ptr,
- unsigned long bytecount)
+SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr ,
+ unsigned long , bytecount)
{
int ret = -ENOSYS;
@@ -263,5 +683,14 @@ asmlinkage int sys_modify_ldt(int func, void __user *ptr,
ret = write_ldt(ptr, bytecount, 0);
break;
}
- return ret;
+ /*
+ * The SYSCALL_DEFINE() macros give us an 'unsigned long'
+ * return type, but the ABI for sys_modify_ldt() expects
+ * 'int'. This cast gives us an int-sized value in %rax
+ * for the return code. The 'unsigned' is necessary so
+ * the compiler does not try to sign-extend the negative
+ * return codes into the high half of the register when
+ * taking the value from int->long.
+ */
+ return (unsigned int)ret;
}
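For reference, the write_ldt() path above is reached from userspace through the modify_ldt(2) syscall. A minimal x86-Linux-only caller that installs a 32-bit data segment in LDT entry 0 might look like the sketch below; struct user_desc and SYS_modify_ldt are the real uAPI, while the chosen field values are just one plausible configuration and error handling is reduced to a perror().

#include <asm/ldt.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct user_desc d;

	memset(&d, 0, sizeof(d));
	d.entry_number   = 0;
	d.base_addr      = 0;
	d.limit          = 0xfffff;	/* with limit_in_pages: ~4 GiB */
	d.seg_32bit      = 1;		/* 16-bit segments may be refused, see allow_16bit_segments() */
	d.limit_in_pages = 1;
	d.useable        = 1;

	/* func 0x11 selects write_ldt() with oldmode == 0 */
	if (syscall(SYS_modify_ldt, 0x11, &d, sizeof(d)) != 0) {
		perror("modify_ldt");
		return 1;
	}
	puts("LDT entry 0 installed");
	return 0;
}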
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index c1c429d00130..1f325304c4a8 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -1,53 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* handle transition of Linux booting another kernel
* Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
- *
- * This source code is licensed under the GNU General Public License,
- * Version 2. See the file COPYING for more details.
*/
#include <linux/mm.h>
#include <linux/kexec.h>
#include <linux/delay.h>
-#include <linux/init.h>
#include <linux/numa.h>
#include <linux/ftrace.h>
#include <linux/suspend.h>
#include <linux/gfp.h>
#include <linux/io.h>
-#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/apic.h>
+#include <asm/io_apic.h>
#include <asm/cpufeature.h>
#include <asm/desc.h>
-#include <asm/system.h>
-#include <asm/cacheflush.h>
-
-static void set_idt(void *newidt, __u16 limit)
-{
- struct desc_ptr curidt;
-
- /* ia32 supports unaligned loads & stores */
- curidt.size = limit;
- curidt.address = (unsigned long)newidt;
-
- load_idt(&curidt);
-}
-
-
-static void set_gdt(void *newgdt, __u16 limit)
-{
- struct desc_ptr curgdt;
-
- /* ia32 supports unaligned loads & stores */
- curgdt.size = limit;
- curgdt.address = (unsigned long)newgdt;
-
- load_gdt(&curgdt);
-}
+#include <asm/set_memory.h>
+#include <asm/debugreg.h>
static void load_segments(void)
{
@@ -60,8 +34,6 @@ static void load_segments(void)
"\tmovl $"STR(__KERNEL_DS)",%%eax\n"
"\tmovl %%eax,%%ds\n"
"\tmovl %%eax,%%es\n"
- "\tmovl %%eax,%%fs\n"
- "\tmovl %%eax,%%gs\n"
"\tmovl %%eax,%%ss\n"
: : : "eax", "memory");
#undef STR
@@ -70,18 +42,24 @@ static void load_segments(void)
static void machine_kexec_free_page_tables(struct kimage *image)
{
- free_page((unsigned long)image->arch.pgd);
+ free_pages((unsigned long)image->arch.pgd, pgd_allocation_order());
+ image->arch.pgd = NULL;
#ifdef CONFIG_X86_PAE
free_page((unsigned long)image->arch.pmd0);
+ image->arch.pmd0 = NULL;
free_page((unsigned long)image->arch.pmd1);
+ image->arch.pmd1 = NULL;
#endif
free_page((unsigned long)image->arch.pte0);
+ image->arch.pte0 = NULL;
free_page((unsigned long)image->arch.pte1);
+ image->arch.pte1 = NULL;
}
static int machine_kexec_alloc_page_tables(struct kimage *image)
{
- image->arch.pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL);
+ image->arch.pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ pgd_allocation_order());
#ifdef CONFIG_X86_PAE
image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
@@ -93,7 +71,6 @@ static int machine_kexec_alloc_page_tables(struct kimage *image)
!image->arch.pmd0 || !image->arch.pmd1 ||
#endif
!image->arch.pte0 || !image->arch.pte1) {
- machine_kexec_free_page_tables(image);
return -ENOMEM;
}
return 0;
@@ -103,6 +80,7 @@ static void machine_kexec_page_table_set_one(
pgd_t *pgd, pmd_t *pmd, pte_t *pte,
unsigned long vaddr, unsigned long paddr)
{
+ p4d_t *p4d;
pud_t *pud;
pgd += pgd_index(vaddr);
@@ -110,7 +88,8 @@ static void machine_kexec_page_table_set_one(
if (!(pgd_val(*pgd) & _PAGE_PRESENT))
set_pgd(pgd, __pgd(__pa(pmd) | _PAGE_PRESENT));
#endif
- pud = pud_offset(pgd, vaddr);
+ p4d = p4d_offset(pgd, vaddr);
+ pud = pud_offset(p4d, vaddr);
pmd = pmd_offset(pud, vaddr);
if (!(pmd_val(*pmd) & _PAGE_PRESENT))
set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
@@ -157,8 +136,7 @@ int machine_kexec_prepare(struct kimage *image)
{
int error;
- if (nx_enabled)
- set_pages_x(image->control_code_page, 1);
+ set_memory_x((unsigned long)page_address(image->control_code_page), 1);
error = machine_kexec_alloc_page_tables(image);
if (error)
return error;
@@ -172,8 +150,7 @@ int machine_kexec_prepare(struct kimage *image)
*/
void machine_kexec_cleanup(struct kimage *image)
{
- if (nx_enabled)
- set_pages_nx(image->control_code_page, 1);
+ set_memory_nx((unsigned long)page_address(image->control_code_page), 1);
machine_kexec_free_page_tables(image);
}
@@ -183,15 +160,10 @@ void machine_kexec_cleanup(struct kimage *image)
*/
void machine_kexec(struct kimage *image)
{
+ relocate_kernel_fn *relocate_kernel_ptr;
unsigned long page_list[PAGES_NR];
void *control_page;
int save_ftrace_enabled;
- asmlinkage unsigned long
- (*relocate_kernel_ptr)(unsigned long indirection_page,
- unsigned long control_page,
- unsigned long start_address,
- unsigned int has_pae,
- unsigned int preserve_context);
#ifdef CONFIG_KEXEC_JUMP
if (image->preserve_context)
@@ -202,17 +174,18 @@ void machine_kexec(struct kimage *image)
/* Interrupts aren't acceptable while we reboot */
local_irq_disable();
+ hw_breakpoint_disable();
if (image->preserve_context) {
#ifdef CONFIG_X86_IO_APIC
/*
* We need to put APICs in legacy mode so that we can
* get timer interrupts in second kernel. kexec/kdump
- * paths already have calls to disable_IO_APIC() in
- * one form or other. kexec jump path also need
- * one.
+ * paths already have calls to restore_boot_irq_mode()
+ * in one form or another. The kexec jump path also needs one.
*/
- disable_IO_APIC();
+ clear_IO_APIC();
+ restore_boot_irq_mode();
#endif
}
@@ -243,13 +216,14 @@ void machine_kexec(struct kimage *image)
* The gdt & idt are now invalid.
* If you want to load them you must set up your own idt & gdt.
*/
- set_gdt(phys_to_virt(0), 0);
- set_idt(phys_to_virt(0), 0);
+ native_idt_invalidate();
+ native_gdt_invalidate();
/* now call it */
image->start = relocate_kernel_ptr((unsigned long)image->head,
(unsigned long)page_list,
- image->start, cpu_has_pae,
+ image->start,
+ boot_cpu_has(X86_FEATURE_PAE),
image->preserve_context);
#ifdef CONFIG_KEXEC_JUMP
@@ -259,15 +233,3 @@ void machine_kexec(struct kimage *image)
__ftrace_enabled_restore(save_ftrace_enabled);
}
-
-void arch_crash_save_vmcoreinfo(void)
-{
-#ifdef CONFIG_NUMA
- VMCOREINFO_SYMBOL(node_data);
- VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
-#endif
-#ifdef CONFIG_X86_PAE
- VMCOREINFO_CONFIG(X86_PAE);
-#endif
-}
-
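The 32-bit kexec hunks above thread a p4d level into the page-table walk (pgd -> p4d -> pud -> pmd) so the same code works whether or not the extra level exists; when it does not, p4d_offset() folds into the pgd. A rough userspace illustration of how each level's index is carved out of a virtual address on x86-64 follows; idx() and the shift constants are the conventional layout values, not taken from this patch.

#include <stdint.h>
#include <stdio.h>

/* Conventional x86-64 shifts; with 4-level paging the P4D folds into the
 * PGD, so PGDIR_SHIFT and P4D_SHIFT coincide at 39.  5-level paging (LA57)
 * moves the PGD up to bit 48. */
#define PTE_SHIFT	12
#define PMD_SHIFT	21
#define PUD_SHIFT	30
#define P4D_SHIFT	39
#define PGDIR_SHIFT	48

static unsigned int idx(uint64_t va, unsigned int shift)
{
	return (va >> shift) & 0x1ff;	/* 512 entries per table level */
}

int main(void)
{
	uint64_t va = 0xffffffff81000000ULL;	/* a typical kernel-text address */

	printf("pgd=%u p4d=%u pud=%u pmd=%u pte=%u\n",
	       idx(va, PGDIR_SHIFT), idx(va, P4D_SHIFT), idx(va, PUD_SHIFT),
	       idx(va, PMD_SHIFT), idx(va, PTE_SHIFT));
	return 0;
}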
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 84c3bf209e98..201137b98fb8 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -1,164 +1,200 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* handle transition of Linux booting another kernel
* Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
- *
- * This source code is licensed under the GNU General Public License,
- * Version 2. See the file COPYING for more details.
*/
+#define pr_fmt(fmt) "kexec: " fmt
+
#include <linux/mm.h>
#include <linux/kexec.h>
#include <linux/string.h>
+#include <linux/gfp.h>
#include <linux/reboot.h>
#include <linux/numa.h>
#include <linux/ftrace.h>
#include <linux/io.h>
#include <linux/suspend.h>
+#include <linux/vmalloc.h>
+#include <linux/efi.h>
+#include <linux/cc_platform.h>
-#include <asm/pgtable.h>
+#include <asm/init.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
+#include <asm/io_apic.h>
+#include <asm/debugreg.h>
+#include <asm/kexec-bzimage64.h>
+#include <asm/setup.h>
+#include <asm/set_memory.h>
+#include <asm/cpu.h>
+#include <asm/efi.h>
+#include <asm/processor.h>
+
+#ifdef CONFIG_ACPI
+/*
+ * Used while adding mappings for ACPI tables.
+ * Can be reused when other iomem regions need to be mapped.
+ */
+struct init_pgtable_data {
+ struct x86_mapping_info *info;
+ pgd_t *level4p;
+};
-static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
- unsigned long addr)
+static int mem_region_callback(struct resource *res, void *arg)
{
- pud_t *pud;
- pmd_t *pmd;
- struct page *page;
- int result = -ENOMEM;
+ struct init_pgtable_data *data = arg;
- addr &= PMD_MASK;
- pgd += pgd_index(addr);
- if (!pgd_present(*pgd)) {
- page = kimage_alloc_control_pages(image, 0);
- if (!page)
- goto out;
- pud = (pud_t *)page_address(page);
- memset(pud, 0, PAGE_SIZE);
- set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
- }
- pud = pud_offset(pgd, addr);
- if (!pud_present(*pud)) {
- page = kimage_alloc_control_pages(image, 0);
- if (!page)
- goto out;
- pmd = (pmd_t *)page_address(page);
- memset(pmd, 0, PAGE_SIZE);
- set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
- }
- pmd = pmd_offset(pud, addr);
- if (!pmd_present(*pmd))
- set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
- result = 0;
-out:
- return result;
+ return kernel_ident_mapping_init(data->info, data->level4p,
+ res->start, res->end + 1);
}
-static void init_level2_page(pmd_t *level2p, unsigned long addr)
+static int
+map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p)
{
- unsigned long end_addr;
+ struct init_pgtable_data data;
+ unsigned long flags;
+ int ret;
- addr &= PAGE_MASK;
- end_addr = addr + PUD_SIZE;
- while (addr < end_addr) {
- set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
- addr += PMD_SIZE;
- }
+ data.info = info;
+ data.level4p = level4p;
+ flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+
+ ret = walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1,
+ &data, mem_region_callback);
+ if (ret && ret != -EINVAL)
+ return ret;
+
+ /* ACPI tables could be located in ACPI Non-volatile Storage region */
+ ret = walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1,
+ &data, mem_region_callback);
+ if (ret && ret != -EINVAL)
+ return ret;
+
+ return 0;
}
+#else
+static int map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p) { return 0; }
+#endif
-static int init_level3_page(struct kimage *image, pud_t *level3p,
- unsigned long addr, unsigned long last_addr)
+static int map_mmio_serial(struct x86_mapping_info *info, pgd_t *level4p)
{
- unsigned long end_addr;
- int result;
+ unsigned long mstart, mend;
- result = 0;
- addr &= PAGE_MASK;
- end_addr = addr + PGDIR_SIZE;
- while ((addr < last_addr) && (addr < end_addr)) {
- struct page *page;
- pmd_t *level2p;
-
- page = kimage_alloc_control_pages(image, 0);
- if (!page) {
- result = -ENOMEM;
- goto out;
- }
- level2p = (pmd_t *)page_address(page);
- init_level2_page(level2p, addr);
- set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
- addr += PUD_SIZE;
- }
- /* clear the unused entries */
- while (addr < end_addr) {
- pud_clear(level3p++);
- addr += PUD_SIZE;
- }
-out:
- return result;
+ if (!kexec_debug_8250_mmio32)
+ return 0;
+
+ mstart = kexec_debug_8250_mmio32 & PAGE_MASK;
+ mend = (kexec_debug_8250_mmio32 + PAGE_SIZE + 23) & PAGE_MASK;
+ pr_info("Map PCI serial at %lx - %lx\n", mstart, mend);
+ return kernel_ident_mapping_init(info, level4p, mstart, mend);
}
+#ifdef CONFIG_KEXEC_FILE
+const struct kexec_file_ops * const kexec_file_loaders[] = {
+ &kexec_bzImage64_ops,
+ NULL
+};
+#endif
-static int init_level4_page(struct kimage *image, pgd_t *level4p,
- unsigned long addr, unsigned long last_addr)
+static int
+map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p)
{
- unsigned long end_addr;
- int result;
+#ifdef CONFIG_EFI
+ unsigned long mstart, mend;
+ void *kaddr;
+ int ret;
- result = 0;
- addr &= PAGE_MASK;
- end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE);
- while ((addr < last_addr) && (addr < end_addr)) {
- struct page *page;
- pud_t *level3p;
-
- page = kimage_alloc_control_pages(image, 0);
- if (!page) {
- result = -ENOMEM;
- goto out;
- }
- level3p = (pud_t *)page_address(page);
- result = init_level3_page(image, level3p, addr, last_addr);
- if (result)
- goto out;
- set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
- addr += PGDIR_SIZE;
+ if (!efi_enabled(EFI_BOOT))
+ return 0;
+
+ mstart = (boot_params.efi_info.efi_systab |
+ ((u64)boot_params.efi_info.efi_systab_hi<<32));
+
+ if (efi_enabled(EFI_64BIT))
+ mend = mstart + sizeof(efi_system_table_64_t);
+ else
+ mend = mstart + sizeof(efi_system_table_32_t);
+
+ if (!mstart)
+ return 0;
+
+ ret = kernel_ident_mapping_init(info, level4p, mstart, mend);
+ if (ret)
+ return ret;
+
+ kaddr = memremap(mstart, mend - mstart, MEMREMAP_WB);
+ if (!kaddr) {
+ pr_err("Could not map UEFI system table\n");
+ return -ENOMEM;
}
- /* clear the unused entries */
- while (addr < end_addr) {
- pgd_clear(level4p++);
- addr += PGDIR_SIZE;
+
+ mstart = efi_config_table;
+
+ if (efi_enabled(EFI_64BIT)) {
+ efi_system_table_64_t *stbl = (efi_system_table_64_t *)kaddr;
+
+ mend = mstart + sizeof(efi_config_table_64_t) * stbl->nr_tables;
+ } else {
+ efi_system_table_32_t *stbl = (efi_system_table_32_t *)kaddr;
+
+ mend = mstart + sizeof(efi_config_table_32_t) * stbl->nr_tables;
}
-out:
- return result;
+
+ memunmap(kaddr);
+
+ return kernel_ident_mapping_init(info, level4p, mstart, mend);
+#endif
+ return 0;
}
static void free_transition_pgtable(struct kimage *image)
{
+ free_page((unsigned long)image->arch.p4d);
+ image->arch.p4d = NULL;
free_page((unsigned long)image->arch.pud);
+ image->arch.pud = NULL;
free_page((unsigned long)image->arch.pmd);
+ image->arch.pmd = NULL;
free_page((unsigned long)image->arch.pte);
+ image->arch.pte = NULL;
}
-static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
+static int init_transition_pgtable(struct kimage *image, pgd_t *pgd,
+ unsigned long control_page)
{
+ pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
+ unsigned long vaddr, paddr;
+ int result = -ENOMEM;
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
- unsigned long vaddr, paddr;
- int result = -ENOMEM;
- vaddr = (unsigned long)relocate_kernel;
- paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE);
+ /*
+ * For the transition to the identity mapped page tables, the control
+ * code page also needs to be mapped at the virtual address it starts
+ * off running from.
+ */
+ vaddr = (unsigned long)__va(control_page);
+ paddr = control_page;
pgd += pgd_index(vaddr);
if (!pgd_present(*pgd)) {
+ p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL);
+ if (!p4d)
+ goto err;
+ image->arch.p4d = p4d;
+ set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
+ }
+ p4d = p4d_offset(pgd, vaddr);
+ if (!p4d_present(*p4d)) {
pud = (pud_t *)get_zeroed_page(GFP_KERNEL);
if (!pud)
goto err;
image->arch.pud = pud;
- set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
+ set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
}
- pud = pud_offset(pgd, vaddr);
+ pud = pud_offset(p4d, vaddr);
if (!pud_present(*pud)) {
pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
if (!pmd)
@@ -175,60 +211,104 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
}
pte = pte_offset_kernel(pmd, vaddr);
- set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
+
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
+ prot = PAGE_KERNEL_EXEC;
+
+ set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
return 0;
err:
- free_transition_pgtable(image);
return result;
}
+static void *alloc_pgt_page(void *data)
+{
+ struct kimage *image = (struct kimage *)data;
+ struct page *page;
+ void *p = NULL;
+
+ page = kimage_alloc_control_pages(image, 0);
+ if (page) {
+ p = page_address(page);
+ clear_page(p);
+ }
-static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
+ return p;
+}
+
+static int init_pgtable(struct kimage *image, unsigned long control_page)
{
- pgd_t *level4p;
+ struct x86_mapping_info info = {
+ .alloc_pgt_page = alloc_pgt_page,
+ .context = image,
+ .page_flag = __PAGE_KERNEL_LARGE_EXEC,
+ .kernpg_flag = _KERNPG_TABLE_NOENC,
+ };
+ unsigned long mstart, mend;
int result;
- level4p = (pgd_t *)__va(start_pgtable);
- result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT);
- if (result)
- return result;
+ int i;
+
+ image->arch.pgd = alloc_pgt_page(image);
+ if (!image->arch.pgd)
+ return -ENOMEM;
+
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
+ info.page_flag |= _PAGE_ENC;
+ info.kernpg_flag |= _PAGE_ENC;
+ }
+
+ if (direct_gbpages)
+ info.direct_gbpages = true;
+
+ for (i = 0; i < nr_pfn_mapped; i++) {
+ mstart = pfn_mapped[i].start << PAGE_SHIFT;
+ mend = pfn_mapped[i].end << PAGE_SHIFT;
+
+ result = kernel_ident_mapping_init(&info, image->arch.pgd,
+ mstart, mend);
+ if (result)
+ return result;
+ }
+
/*
- * image->start may be outside 0 ~ max_pfn, for example when
- * jump back to original kernel from kexeced kernel
+ * segments's mem ranges could be outside 0 ~ max_pfn,
+ * for example when jump back to original kernel from kexeced kernel.
+ * or first kernel is booted with user mem map, and second kernel
+ * could be loaded out of that range.
*/
- result = init_one_level2_page(image, level4p, image->start);
- if (result)
- return result;
- return init_transition_pgtable(image, level4p);
-}
-
-static void set_idt(void *newidt, u16 limit)
-{
- struct desc_ptr curidt;
+ for (i = 0; i < image->nr_segments; i++) {
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz;
- /* x86-64 supports unaligned loads & stores */
- curidt.size = limit;
- curidt.address = (unsigned long)newidt;
+ result = kernel_ident_mapping_init(&info, image->arch.pgd,
+ mstart, mend);
- __asm__ __volatile__ (
- "lidtq %0\n"
- : : "m" (curidt)
- );
-};
+ if (result)
+ return result;
+ }
+ /*
+ * Prepare EFI systab and ACPI tables for kexec kernel since they are
+ * not covered by pfn_mapped.
+ */
+ result = map_efi_systab(&info, image->arch.pgd);
+ if (result)
+ return result;
-static void set_gdt(void *newgdt, u16 limit)
-{
- struct desc_ptr curgdt;
+ result = map_acpi_tables(&info, image->arch.pgd);
+ if (result)
+ return result;
- /* x86-64 supports unaligned loads & stores */
- curgdt.size = limit;
- curgdt.address = (unsigned long)newgdt;
+ result = map_mmio_serial(&info, image->arch.pgd);
+ if (result)
+ return result;
- __asm__ __volatile__ (
- "lgdtq %0\n"
- : : "m" (curgdt)
- );
-};
+ /*
+ * This must be last because the intermediate page table pages it
+ * allocates will not be control pages and may overlap the image.
+ */
+ return init_transition_pgtable(image, image->arch.pgd, control_page);
+}
static void load_segments(void)
{
@@ -242,24 +322,74 @@ static void load_segments(void)
);
}
+static void prepare_debug_idt(unsigned long control_page, unsigned long vec_ofs)
+{
+ gate_desc idtentry = { 0 };
+ int i;
+
+ idtentry.bits.p = 1;
+ idtentry.bits.type = GATE_TRAP;
+ idtentry.segment = __KERNEL_CS;
+ idtentry.offset_low = (control_page & 0xFFFF) + vec_ofs;
+ idtentry.offset_middle = (control_page >> 16) & 0xFFFF;
+ idtentry.offset_high = control_page >> 32;
+
+ for (i = 0; i < 16; i++) {
+ kexec_debug_idt[i] = idtentry;
+ idtentry.offset_low += KEXEC_DEBUG_EXC_HANDLER_SIZE;
+ }
+}
+
int machine_kexec_prepare(struct kimage *image)
{
- unsigned long start_pgtable;
+ void *control_page = page_address(image->control_code_page);
+ unsigned long reloc_start = (unsigned long)__relocate_kernel_start;
+ unsigned long reloc_end = (unsigned long)__relocate_kernel_end;
int result;
- /* Calculate the offsets */
- start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+ /*
+ * Some early TDX-capable platforms have an erratum. A kernel
+ * partial write (a write transaction of less than a cacheline
+ * that lands at the memory controller) to TDX private memory
+ * poisons that memory, and a subsequent read triggers a machine
+ * check.
+ *
+ * On those platforms the old kernel must reset TDX private
+ * memory before jumping to the new kernel, otherwise the new
+ * kernel may see unexpected machine checks. For simplicity,
+ * just fail kexec/kdump on those platforms.
+ */
+ if (boot_cpu_has_bug(X86_BUG_TDX_PW_MCE)) {
+ pr_info_once("Not allowed on platform with tdx_pw_mce bug\n");
+ return -EOPNOTSUPP;
+ }
/* Setup the identity mapped 64bit page table */
- result = init_pgtable(image, start_pgtable);
+ result = init_pgtable(image, __pa(control_page));
if (result)
return result;
+ kexec_va_control_page = (unsigned long)control_page;
+ kexec_pa_table_page = (unsigned long)__pa(image->arch.pgd);
+
+ if (image->type == KEXEC_TYPE_DEFAULT)
+ kexec_pa_swap_page = page_to_pfn(image->swap_page) << PAGE_SHIFT;
+
+ prepare_debug_idt((unsigned long)__pa(control_page),
+ (unsigned long)kexec_debug_exc_vectors - reloc_start);
+
+ __memcpy(control_page, __relocate_kernel_start, reloc_end - reloc_start);
+
+ set_memory_rox((unsigned long)control_page, 1);
return 0;
}
void machine_kexec_cleanup(struct kimage *image)
{
+ void *control_page = page_address(image->control_code_page);
+
+ set_memory_nx((unsigned long)control_page, 1);
+ set_memory_rw((unsigned long)control_page, 1);
+
free_transition_pgtable(image);
}
@@ -267,11 +397,13 @@ void machine_kexec_cleanup(struct kimage *image)
* Do not allocate memory (or fail in any way) in machine_kexec().
* We are past the point of no return, committed to rebooting now.
*/
-void machine_kexec(struct kimage *image)
+void __nocfi machine_kexec(struct kimage *image)
{
- unsigned long page_list[PAGES_NR];
- void *control_page;
+ unsigned long reloc_start = (unsigned long)__relocate_kernel_start;
+ relocate_kernel_fn *relocate_kernel_ptr;
+ unsigned int relocate_kernel_flags;
int save_ftrace_enabled;
+ void *control_page;
#ifdef CONFIG_KEXEC_JUMP
if (image->preserve_context)
@@ -282,31 +414,40 @@ void machine_kexec(struct kimage *image)
/* Interrupts aren't acceptable while we reboot */
local_irq_disable();
+ hw_breakpoint_disable();
+ cet_disable();
if (image->preserve_context) {
#ifdef CONFIG_X86_IO_APIC
/*
* We need to put APICs in legacy mode so that we can
* get timer interrupts in second kernel. kexec/kdump
- * paths already have calls to disable_IO_APIC() in
- * one form or other. kexec jump path also need
- * one.
+ * paths already have calls to restore_boot_irq_mode()
+ * in one form or another. The kexec jump path also needs one.
*/
- disable_IO_APIC();
+ clear_IO_APIC();
+ restore_boot_irq_mode();
#endif
}
- control_page = page_address(image->control_code_page) + PAGE_SIZE;
- memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
+ control_page = page_address(image->control_code_page);
- page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
- page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
- page_list[PA_TABLE_PAGE] =
- (unsigned long)__pa(page_address(image->control_code_page));
+ /*
+ * Allow for the possibility that relocate_kernel might not be at
+ * the very start of the page.
+ */
+ relocate_kernel_ptr = control_page + (unsigned long)relocate_kernel - reloc_start;
- if (image->type == KEXEC_TYPE_DEFAULT)
- page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
- << PAGE_SHIFT);
+ relocate_kernel_flags = 0;
+ if (image->preserve_context)
+ relocate_kernel_flags |= RELOC_KERNEL_PRESERVE_CONTEXT;
+
+ /*
+ * This must be done before load_segments() since it resets
+ * GS to 0 and percpu data needs the correct GS to work.
+ */
+ if (this_cpu_read(cache_state_incoherent))
+ relocate_kernel_flags |= RELOC_KERNEL_CACHE_INCOHERENT;
/*
* The segment registers are funny things, they have both a
@@ -315,22 +456,21 @@ void machine_kexec(struct kimage *image)
* with from a table in memory. At no other time is the
* descriptor table in memory accessed.
*
- * I take advantage of this here by force loading the
- * segments, before I zap the gdt with an invalid value.
+ * Take advantage of this here by force loading the segments,
+ * before the GDT is zapped with an invalid value.
+ *
+ * load_segments() resets GS to 0. Don't make any function call
+ * after here since call depth tracking uses percpu variables to
+ * operate (relocate_kernel() is explicitly ignored by call depth
+ * tracking).
*/
load_segments();
- /*
- * The gdt & idt are now invalid.
- * If you want to load them you must set up your own idt & gdt.
- */
- set_gdt(phys_to_virt(0), 0);
- set_idt(phys_to_virt(0), 0);
/* now call it */
- image->start = relocate_kernel((unsigned long)image->head,
- (unsigned long)page_list,
- image->start,
- image->preserve_context);
+ image->start = relocate_kernel_ptr((unsigned long)image->head,
+ virt_to_phys(control_page),
+ image->start,
+ relocate_kernel_flags);
#ifdef CONFIG_KEXEC_JUMP
if (image->preserve_context)
@@ -339,15 +479,250 @@ void machine_kexec(struct kimage *image)
__ftrace_enabled_restore(save_ftrace_enabled);
}
+/*
+ * Handover to the next kernel, no CFI concern.
+ */
+ANNOTATE_NOCFI_SYM(machine_kexec);
+
+/* arch-dependent functionality related to kexec file-based syscall */
+
+#ifdef CONFIG_KEXEC_FILE
+/*
+ * Apply purgatory relocations.
+ *
+ * @pi: Purgatory to be relocated.
+ * @section: Section the relocations apply to.
+ * @relsec: Section containing RELAs.
+ * @symtabsec: Corresponding symtab.
+ *
+ * TODO: Some of this code belongs in generic code. Move it into kexec.c.
+ */
+int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
+ Elf_Shdr *section, const Elf_Shdr *relsec,
+ const Elf_Shdr *symtabsec)
+{
+ unsigned int i;
+ Elf64_Rela *rel;
+ Elf64_Sym *sym;
+ void *location;
+ unsigned long address, sec_base, value;
+ const char *strtab, *name, *shstrtab;
+ const Elf_Shdr *sechdrs;
+
+ /* String & section header string table */
+ sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff;
+ strtab = (char *)pi->ehdr + sechdrs[symtabsec->sh_link].sh_offset;
+ shstrtab = (char *)pi->ehdr + sechdrs[pi->ehdr->e_shstrndx].sh_offset;
+
+ rel = (void *)pi->ehdr + relsec->sh_offset;
+
+ pr_debug("Applying relocate section %s to %u\n",
+ shstrtab + relsec->sh_name, relsec->sh_info);
+
+ for (i = 0; i < relsec->sh_size / sizeof(*rel); i++) {
+
+ /*
+ * rel[i].r_offset contains the byte offset from the beginning
+ * of the section to the storage unit affected.
+ *
+ * This is the location to update, inside the temporary buffer
+ * where the section is currently loaded. The section will
+ * finally be loaded to a different address, pointed to by
+ * ->sh_addr; kexec takes care of moving it
+ * (kexec_load_segment()).
+ */
+ location = pi->purgatory_buf;
+ location += section->sh_offset;
+ location += rel[i].r_offset;
+
+ /* Final address of the location */
+ address = section->sh_addr + rel[i].r_offset;
+
+ /*
+ * rel[i].r_info contains information about symbol table index
+ * w.r.t which relocation must be made and type of relocation
+ * to apply. ELF64_R_SYM() and ELF64_R_TYPE() macros get
+ * these respectively.
+ */
+ sym = (void *)pi->ehdr + symtabsec->sh_offset;
+ sym += ELF64_R_SYM(rel[i].r_info);
+
+ if (sym->st_name)
+ name = strtab + sym->st_name;
+ else
+ name = shstrtab + sechdrs[sym->st_shndx].sh_name;
+
+ pr_debug("Symbol: %s info: %02x shndx: %02x value=%llx size: %llx\n",
+ name, sym->st_info, sym->st_shndx, sym->st_value,
+ sym->st_size);
+
+ if (sym->st_shndx == SHN_UNDEF) {
+ pr_err("Undefined symbol: %s\n", name);
+ return -ENOEXEC;
+ }
+
+ if (sym->st_shndx == SHN_COMMON) {
+ pr_err("symbol '%s' in common section\n", name);
+ return -ENOEXEC;
+ }
+
+ if (sym->st_shndx == SHN_ABS)
+ sec_base = 0;
+ else if (sym->st_shndx >= pi->ehdr->e_shnum) {
+ pr_err("Invalid section %d for symbol %s\n",
+ sym->st_shndx, name);
+ return -ENOEXEC;
+ } else
+ sec_base = pi->sechdrs[sym->st_shndx].sh_addr;
+
+ value = sym->st_value;
+ value += sec_base;
+ value += rel[i].r_addend;
+
+ switch (ELF64_R_TYPE(rel[i].r_info)) {
+ case R_X86_64_NONE:
+ break;
+ case R_X86_64_64:
+ *(u64 *)location = value;
+ break;
+ case R_X86_64_32:
+ *(u32 *)location = value;
+ if (value != *(u32 *)location)
+ goto overflow;
+ break;
+ case R_X86_64_32S:
+ *(s32 *)location = value;
+ if ((s64)value != *(s32 *)location)
+ goto overflow;
+ break;
+ case R_X86_64_PC32:
+ case R_X86_64_PLT32:
+ value -= (u64)address;
+ *(u32 *)location = value;
+ break;
+ default:
+ pr_err("Unknown rela relocation: %llu\n",
+ ELF64_R_TYPE(rel[i].r_info));
+ return -ENOEXEC;
+ }
+ }
+ return 0;
+
+overflow:
+ pr_err("Overflow in relocation type %d value 0x%lx\n",
+ (int)ELF64_R_TYPE(rel[i].r_info), value);
+ return -ENOEXEC;
+}
-void arch_crash_save_vmcoreinfo(void)
+int arch_kimage_file_post_load_cleanup(struct kimage *image)
{
- VMCOREINFO_SYMBOL(phys_base);
- VMCOREINFO_SYMBOL(init_level4_pgt);
+ vfree(image->elf_headers);
+ image->elf_headers = NULL;
+ image->elf_headers_sz = 0;
+
+ return kexec_image_post_load_cleanup_default(image);
+}
+#endif /* CONFIG_KEXEC_FILE */
+
+#ifdef CONFIG_CRASH_DUMP
+
+static int
+kexec_mark_range(unsigned long start, unsigned long end, bool protect)
+{
+ struct page *page;
+ unsigned int nr_pages;
+
+ /*
+ * For physical range: [start, end]. We must skip the unassigned
+ * crashk resource with zero-valued "end" member.
+ */
+ if (!end || start > end)
+ return 0;
+
+ page = pfn_to_page(start >> PAGE_SHIFT);
+ nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
+ if (protect)
+ return set_pages_ro(page, nr_pages);
+ else
+ return set_pages_rw(page, nr_pages);
+}
+
+static void kexec_mark_crashkres(bool protect)
+{
+ unsigned long control;
+
+ kexec_mark_range(crashk_low_res.start, crashk_low_res.end, protect);
-#ifdef CONFIG_NUMA
- VMCOREINFO_SYMBOL(node_data);
- VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
+ /* Don't touch the control code page used in crash_kexec(). */
+ control = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page));
+ kexec_mark_range(crashk_res.start, control - 1, protect);
+ control += KEXEC_CONTROL_PAGE_SIZE;
+ kexec_mark_range(control, crashk_res.end, protect);
+}
+
+/* Make the memory storing dm-crypt keys inaccessible or accessible. */
+static void kexec_mark_dm_crypt_keys(bool protect)
+{
+ unsigned long start_paddr, end_paddr;
+ unsigned int nr_pages;
+
+ if (kexec_crash_image->dm_crypt_keys_addr) {
+ start_paddr = kexec_crash_image->dm_crypt_keys_addr;
+ end_paddr = start_paddr + kexec_crash_image->dm_crypt_keys_sz - 1;
+ nr_pages = (PAGE_ALIGN(end_paddr) - PAGE_ALIGN_DOWN(start_paddr))/PAGE_SIZE;
+ if (protect)
+ set_memory_np((unsigned long)phys_to_virt(start_paddr), nr_pages);
+ else
+ __set_memory_prot(
+ (unsigned long)phys_to_virt(start_paddr),
+ nr_pages,
+ __pgprot(_PAGE_PRESENT | _PAGE_NX | _PAGE_RW));
+ }
+}
+
+void arch_kexec_protect_crashkres(void)
+{
+ kexec_mark_crashkres(true);
+ kexec_mark_dm_crypt_keys(true);
+}
+
+void arch_kexec_unprotect_crashkres(void)
+{
+ kexec_mark_dm_crypt_keys(false);
+ kexec_mark_crashkres(false);
+}
#endif
+
+/*
+ * During a traditional boot under SME, SME will encrypt the kernel,
+ * so the SME kexec kernel also needs to be un-encrypted in order to
+ * replicate a normal SME boot.
+ *
+ * During a traditional boot under SEV, the kernel has already been
+ * loaded encrypted, so the SEV kexec kernel needs to be encrypted in
+ * order to replicate a normal SEV boot.
+ */
+int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp)
+{
+ if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
+ return 0;
+
+ /*
+ * If host memory encryption is active we need to be sure that kexec
+ * pages are not encrypted because when we boot to the new kernel the
+ * pages won't be accessed encrypted (initially).
+ */
+ return set_memory_decrypted((unsigned long)vaddr, pages);
}
+void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
+{
+ if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
+ return;
+
+ /*
+ * If host memory encryption is active we need to reset the pages back
+ * to being an encrypted mapping before freeing them.
+ */
+ set_memory_encrypted((unsigned long)vaddr, pages);
+}
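arch_kexec_apply_relocations_add() above computes S + A for the absolute relocation types and S + A - P for R_X86_64_PC32/PLT32, truncating the result to 32 bits. A toy, userspace-runnable model of the PC-relative case follows; apply_pc32() and its arguments are invented for the demonstration, and the overflow check mirrors the R_X86_64_32S case above rather than the kernel's PC32 branch, which stores without checking.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* value = S + A - P, truncated to 32 bits, as for R_X86_64_PC32/PLT32. */
static int apply_pc32(uint8_t *buf, uint64_t section_addr, uint64_t offset,
		      uint64_t sym_value, int64_t addend)
{
	uint64_t value = sym_value + addend - (section_addr + offset);
	uint32_t trunc = (uint32_t)value;

	/* Overflow check in the style of the R_X86_64_32S case above. */
	if ((int64_t)value != (int32_t)trunc)
		return -1;
	memcpy(buf + offset, &trunc, sizeof(trunc));
	return 0;
}

int main(void)
{
	uint8_t section[16] = { 0 };
	int32_t disp;

	/* Symbol at 0x1000, addend -4, patch site at section base 0x2000 + 8. */
	if (apply_pc32(section, 0x2000, 8, 0x1000, -4)) {
		puts("overflow");
		return 1;
	}
	memcpy(&disp, section + 8, sizeof(disp));
	printf("stored displacement: %d\n", disp);	/* -4108 */
	return 0;
}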
diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c
deleted file mode 100644
index 845d80ce1ef1..000000000000
--- a/arch/x86/kernel/mca_32.c
+++ /dev/null
@@ -1,476 +0,0 @@
-/*
- * Written by Martin Kolinek, February 1996
- *
- * Changes:
- *
- * Chris Beauregard July 28th, 1996
- * - Fixed up integrated SCSI detection
- *
- * Chris Beauregard August 3rd, 1996
- * - Made mca_info local
- * - Made integrated registers accessible through standard function calls
- * - Added name field
- * - More sanity checking
- *
- * Chris Beauregard August 9th, 1996
- * - Rewrote /proc/mca
- *
- * Chris Beauregard January 7th, 1997
- * - Added basic NMI-processing
- * - Added more information to mca_info structure
- *
- * David Weinehall October 12th, 1998
- * - Made a lot of cleaning up in the source
- * - Added use of save_flags / restore_flags
- * - Added the 'driver_loaded' flag in MCA_adapter
- * - Added an alternative implemention of ZP Gu's mca_find_unused_adapter
- *
- * David Weinehall March 24th, 1999
- * - Fixed the output of 'Driver Installed' in /proc/mca/pos
- * - Made the Integrated Video & SCSI show up even if they have id 0000
- *
- * Alexander Viro November 9th, 1999
- * - Switched to regular procfs methods
- *
- * Alfred Arnold & David Weinehall August 23rd, 2000
- * - Added support for Planar POS-registers
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/mca.h>
-#include <linux/kprobes.h>
-#include <asm/system.h>
-#include <asm/io.h>
-#include <linux/proc_fs.h>
-#include <linux/mman.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/ioport.h>
-#include <asm/uaccess.h>
-#include <linux/init.h>
-
-static unsigned char which_scsi;
-
-int MCA_bus;
-EXPORT_SYMBOL(MCA_bus);
-
-/*
- * Motherboard register spinlock. Untested on SMP at the moment, but
- * are there any MCA SMP boxes?
- *
- * Yes - Alan
- */
-static DEFINE_SPINLOCK(mca_lock);
-
-/* Build the status info for the adapter */
-
-static void mca_configure_adapter_status(struct mca_device *mca_dev)
-{
- mca_dev->status = MCA_ADAPTER_NONE;
-
- mca_dev->pos_id = mca_dev->pos[0]
- + (mca_dev->pos[1] << 8);
-
- if (!mca_dev->pos_id && mca_dev->slot < MCA_MAX_SLOT_NR) {
-
- /*
- * id = 0x0000 usually indicates hardware failure,
- * however, ZP Gu (zpg@castle.net> reports that his 9556
- * has 0x0000 as id and everything still works. There
- * also seem to be an adapter with id = 0x0000; the
- * NCR Parallel Bus Memory Card. Until this is confirmed,
- * however, this code will stay.
- */
-
- mca_dev->status = MCA_ADAPTER_ERROR;
-
- return;
- } else if (mca_dev->pos_id != 0xffff) {
-
- /*
- * 0xffff usually indicates that there's no adapter,
- * however, some integrated adapters may have 0xffff as
- * their id and still be valid. Examples are on-board
- * VGA of the 55sx, the integrated SCSI of the 56 & 57,
- * and possibly also the 95 ULTIMEDIA.
- */
-
- mca_dev->status = MCA_ADAPTER_NORMAL;
- }
-
- if ((mca_dev->pos_id == 0xffff ||
- mca_dev->pos_id == 0x0000) && mca_dev->slot >= MCA_MAX_SLOT_NR) {
- int j;
-
- for (j = 2; j < 8; j++) {
- if (mca_dev->pos[j] != 0xff) {
- mca_dev->status = MCA_ADAPTER_NORMAL;
- break;
- }
- }
- }
-
- if (!(mca_dev->pos[2] & MCA_ENABLED)) {
-
- /* enabled bit is in POS 2 */
-
- mca_dev->status = MCA_ADAPTER_DISABLED;
- }
-} /* mca_configure_adapter_status */
-
-/*--------------------------------------------------------------------*/
-
-static struct resource mca_standard_resources[] = {
- { .start = 0x60, .end = 0x60, .name = "system control port B (MCA)" },
- { .start = 0x90, .end = 0x90, .name = "arbitration (MCA)" },
- { .start = 0x91, .end = 0x91, .name = "card Select Feedback (MCA)" },
- { .start = 0x92, .end = 0x92, .name = "system Control port A (MCA)" },
- { .start = 0x94, .end = 0x94, .name = "system board setup (MCA)" },
- { .start = 0x96, .end = 0x97, .name = "POS (MCA)" },
- { .start = 0x100, .end = 0x107, .name = "POS (MCA)" }
-};
-
-#define MCA_STANDARD_RESOURCES ARRAY_SIZE(mca_standard_resources)
-
-/*
- * mca_read_and_store_pos - read the POS registers into a memory buffer
- * @pos: a char pointer to 8 bytes, contains the POS register value on
- * successful return
- *
- * Returns 1 if a card actually exists (i.e. the pos isn't
- * all 0xff) or 0 otherwise
- */
-static int mca_read_and_store_pos(unsigned char *pos)
-{
- int j;
- int found = 0;
-
- for (j = 0; j < 8; j++) {
- pos[j] = inb_p(MCA_POS_REG(j));
- if (pos[j] != 0xff) {
- /* 0xff all across means no device. 0x00 means
- * something's broken, but a device is
- * probably there. However, if you get 0x00
- * from a motherboard register it won't matter
- * what we find. For the record, on the
- * 57SLC, the integrated SCSI adapter has
- * 0xffff for the adapter ID, but nonzero for
- * other registers. */
-
- found = 1;
- }
- }
- return found;
-}
-
-static unsigned char mca_pc_read_pos(struct mca_device *mca_dev, int reg)
-{
- unsigned char byte;
- unsigned long flags;
-
- if (reg < 0 || reg >= 8)
- return 0;
-
- spin_lock_irqsave(&mca_lock, flags);
- if (mca_dev->pos_register) {
- /* Disable adapter setup, enable motherboard setup */
-
- outb_p(0, MCA_ADAPTER_SETUP_REG);
- outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
-
- byte = inb_p(MCA_POS_REG(reg));
- outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
- } else {
-
- /* Make sure motherboard setup is off */
-
- outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
-
- /* Read the appropriate register */
-
- outb_p(0x8|(mca_dev->slot & 0xf), MCA_ADAPTER_SETUP_REG);
- byte = inb_p(MCA_POS_REG(reg));
- outb_p(0, MCA_ADAPTER_SETUP_REG);
- }
- spin_unlock_irqrestore(&mca_lock, flags);
-
- mca_dev->pos[reg] = byte;
-
- return byte;
-}
-
-static void mca_pc_write_pos(struct mca_device *mca_dev, int reg,
- unsigned char byte)
-{
- unsigned long flags;
-
- if (reg < 0 || reg >= 8)
- return;
-
- spin_lock_irqsave(&mca_lock, flags);
-
- /* Make sure motherboard setup is off */
-
- outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
-
- /* Read in the appropriate register */
-
- outb_p(0x8|(mca_dev->slot&0xf), MCA_ADAPTER_SETUP_REG);
- outb_p(byte, MCA_POS_REG(reg));
- outb_p(0, MCA_ADAPTER_SETUP_REG);
-
- spin_unlock_irqrestore(&mca_lock, flags);
-
- /* Update the global register list, while we have the byte */
-
- mca_dev->pos[reg] = byte;
-
-}
-
-/* for the primary MCA bus, we have identity transforms */
-static int mca_dummy_transform_irq(struct mca_device *mca_dev, int irq)
-{
- return irq;
-}
-
-static int mca_dummy_transform_ioport(struct mca_device *mca_dev, int port)
-{
- return port;
-}
-
-static void *mca_dummy_transform_memory(struct mca_device *mca_dev, void *mem)
-{
- return mem;
-}
-
-
-static int __init mca_init(void)
-{
- unsigned int i, j;
- struct mca_device *mca_dev;
- unsigned char pos[8];
- short mca_builtin_scsi_ports[] = {0xf7, 0xfd, 0x00};
- struct mca_bus *bus;
-
- /*
- * WARNING: Be careful when making changes here. Putting an adapter
- * and the motherboard simultaneously into setup mode may result in
- * damage to chips (according to The Indispensible PC Hardware Book
- * by Hans-Peter Messmer). Also, we disable system interrupts (so
- * that we are not disturbed in the middle of this).
- */
-
- /* Make sure the MCA bus is present */
-
- if (mca_system_init()) {
- printk(KERN_ERR "MCA bus system initialisation failed\n");
- return -ENODEV;
- }
-
- if (!MCA_bus)
- return -ENODEV;
-
- printk(KERN_INFO "Micro Channel bus detected.\n");
-
- /* All MCA systems have at least a primary bus */
- bus = mca_attach_bus(MCA_PRIMARY_BUS);
- if (!bus)
- goto out_nomem;
- bus->default_dma_mask = 0xffffffffLL;
- bus->f.mca_write_pos = mca_pc_write_pos;
- bus->f.mca_read_pos = mca_pc_read_pos;
- bus->f.mca_transform_irq = mca_dummy_transform_irq;
- bus->f.mca_transform_ioport = mca_dummy_transform_ioport;
- bus->f.mca_transform_memory = mca_dummy_transform_memory;
-
- /* get the motherboard device */
- mca_dev = kzalloc(sizeof(struct mca_device), GFP_KERNEL);
- if (unlikely(!mca_dev))
- goto out_nomem;
-
- /*
- * We do not expect many MCA interrupts during initialization,
- * but let us be safe:
- */
- spin_lock_irq(&mca_lock);
-
- /* Make sure adapter setup is off */
-
- outb_p(0, MCA_ADAPTER_SETUP_REG);
-
- /* Read motherboard POS registers */
-
- mca_dev->pos_register = 0x7f;
- outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
- mca_dev->name[0] = 0;
- mca_read_and_store_pos(mca_dev->pos);
- mca_configure_adapter_status(mca_dev);
- /* fake POS and slot for a motherboard */
- mca_dev->pos_id = MCA_MOTHERBOARD_POS;
- mca_dev->slot = MCA_MOTHERBOARD;
- mca_register_device(MCA_PRIMARY_BUS, mca_dev);
-
- mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
- if (unlikely(!mca_dev))
- goto out_unlock_nomem;
-
- /* Put motherboard into video setup mode, read integrated video
- * POS registers, and turn motherboard setup off.
- */
-
- mca_dev->pos_register = 0xdf;
- outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
- mca_dev->name[0] = 0;
- mca_read_and_store_pos(mca_dev->pos);
- mca_configure_adapter_status(mca_dev);
- /* fake POS and slot for the integrated video */
- mca_dev->pos_id = MCA_INTEGVIDEO_POS;
- mca_dev->slot = MCA_INTEGVIDEO;
- mca_register_device(MCA_PRIMARY_BUS, mca_dev);
-
- /*
- * Put motherboard into scsi setup mode, read integrated scsi
- * POS registers, and turn motherboard setup off.
- *
- * It seems there are two possible SCSI registers. Martin says that
- * for the 56 and 57, 0xf7 is the one, but it fails on the 76.
- * Alfredo (apena@vnet.ibm.com) says
- * 0xfd works on his machine. We'll try both of them. I figure it's
- * a good bet that only one could be valid at a time. This could
- * screw up though if one is used for something else on the other
- * machine.
- */
-
- for (i = 0; (which_scsi = mca_builtin_scsi_ports[i]) != 0; i++) {
- outb_p(which_scsi, MCA_MOTHERBOARD_SETUP_REG);
- if (mca_read_and_store_pos(pos))
- break;
- }
- if (which_scsi) {
- /* found a scsi card */
- mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
- if (unlikely(!mca_dev))
- goto out_unlock_nomem;
-
- for (j = 0; j < 8; j++)
- mca_dev->pos[j] = pos[j];
-
- mca_configure_adapter_status(mca_dev);
- /* fake POS and slot for integrated SCSI controller */
- mca_dev->pos_id = MCA_INTEGSCSI_POS;
- mca_dev->slot = MCA_INTEGSCSI;
- mca_dev->pos_register = which_scsi;
- mca_register_device(MCA_PRIMARY_BUS, mca_dev);
- }
-
- /* Turn off motherboard setup */
-
- outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
-
- /*
- * Now loop over MCA slots: put each adapter into setup mode, and
- * read its POS registers. Then put adapter setup off.
- */
-
- for (i = 0; i < MCA_MAX_SLOT_NR; i++) {
- outb_p(0x8|(i&0xf), MCA_ADAPTER_SETUP_REG);
- if (!mca_read_and_store_pos(pos))
- continue;
-
- mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
- if (unlikely(!mca_dev))
- goto out_unlock_nomem;
-
- for (j = 0; j < 8; j++)
- mca_dev->pos[j] = pos[j];
-
- mca_dev->driver_loaded = 0;
- mca_dev->slot = i;
- mca_dev->pos_register = 0;
- mca_configure_adapter_status(mca_dev);
- mca_register_device(MCA_PRIMARY_BUS, mca_dev);
- }
- outb_p(0, MCA_ADAPTER_SETUP_REG);
-
- /* Enable interrupts and return memory start */
- spin_unlock_irq(&mca_lock);
-
- for (i = 0; i < MCA_STANDARD_RESOURCES; i++)
- request_resource(&ioport_resource, mca_standard_resources + i);
-
- mca_do_proc_init();
-
- return 0;
-
- out_unlock_nomem:
- spin_unlock_irq(&mca_lock);
- out_nomem:
- printk(KERN_EMERG "Failed memory allocation in MCA setup!\n");
- return -ENOMEM;
-}
-
-subsys_initcall(mca_init);
-
-/*--------------------------------------------------------------------*/
-
-static __kprobes void
-mca_handle_nmi_device(struct mca_device *mca_dev, int check_flag)
-{
- int slot = mca_dev->slot;
-
- if (slot == MCA_INTEGSCSI) {
- printk(KERN_CRIT "NMI: caused by MCA integrated SCSI adapter (%s)\n",
- mca_dev->name);
- } else if (slot == MCA_INTEGVIDEO) {
- printk(KERN_CRIT "NMI: caused by MCA integrated video adapter (%s)\n",
- mca_dev->name);
- } else if (slot == MCA_MOTHERBOARD) {
- printk(KERN_CRIT "NMI: caused by motherboard (%s)\n",
- mca_dev->name);
- }
-
- /* More info available in POS 6 and 7? */
-
- if (check_flag) {
- unsigned char pos6, pos7;
-
- pos6 = mca_device_read_pos(mca_dev, 6);
- pos7 = mca_device_read_pos(mca_dev, 7);
-
- printk(KERN_CRIT "NMI: POS 6 = 0x%x, POS 7 = 0x%x\n", pos6, pos7);
- }
-
-} /* mca_handle_nmi_slot */
-
-/*--------------------------------------------------------------------*/
-
-static int __kprobes mca_handle_nmi_callback(struct device *dev, void *data)
-{
- struct mca_device *mca_dev = to_mca_device(dev);
- unsigned char pos5;
-
- pos5 = mca_device_read_pos(mca_dev, 5);
-
- if (!(pos5 & 0x80)) {
- /*
- * Bit 7 of POS 5 is reset when this adapter has a hardware
- * error. Bit 6 is reset if there's error information
- * available in POS 6 and 7.
- */
- mca_handle_nmi_device(mca_dev, !(pos5 & 0x40));
- return 1;
- }
- return 0;
-}
-
-void __kprobes mca_handle_nmi(void)
-{
- /*
- * First try - scan the various adapters and see if a specific
- * adapter was responsible for the error.
- */
- bus_for_each_dev(&mca_bus_type, NULL, NULL, mca_handle_nmi_callback);
-}
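As an aside on the POS decoding used by the NMI path removed above: bit 7 of POS register 5 is cleared when an adapter reports a hardware error, and bit 6 is cleared when POS 6 and 7 carry extra detail. A minimal userspace sketch of that decoding (the function and sample values are hypothetical, not part of the deleted driver):

    #include <stdio.h>

    /* Decode MCA POS register 5 the way the removed NMI callback did. */
    static void decode_pos5(unsigned char pos5)
    {
            if (pos5 & 0x80) {              /* bit 7 set: no error from this adapter */
                    printf("adapter healthy\n");
                    return;
            }
            printf("adapter raised an NMI%s\n",
                   (pos5 & 0x40) ? "" : ", details in POS 6 and 7");
    }

    int main(void)
    {
            decode_pos5(0x3f);      /* error flagged, POS 6/7 info available */
            decode_pos5(0xff);      /* no error */
            return 0;
    }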
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
deleted file mode 100644
index 2a62d843f015..000000000000
--- a/arch/x86/kernel/mfgpt_32.c
+++ /dev/null
@@ -1,410 +0,0 @@
-/*
- * Driver/API for AMD Geode Multi-Function General Purpose Timers (MFGPT)
- *
- * Copyright (C) 2006, Advanced Micro Devices, Inc.
- * Copyright (C) 2007, Andres Salomon <dilinger@debian.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public License
- * as published by the Free Software Foundation.
- *
- * The MFGPTs are documented in AMD Geode CS5536 Companion Device Data Book.
- */
-
-/*
- * We are using the 32.768kHz input clock - it's the only one that has the
- * ranges we find desirable. The following table lists the suitable
- * divisors and the associated Hz, minimum interval and the maximum interval:
- *
- * Divisor Hz Min Delta (s) Max Delta (s)
- * 1 32768 .00048828125 2.000
- * 2 16384 .0009765625 4.000
- * 4 8192 .001953125 8.000
- * 8 4096 .00390625 16.000
- * 16 2048 .0078125 32.000
- * 32 1024 .015625 64.000
- * 64 512 .03125 128.000
- * 128 256 .0625 256.000
- * 256 128 .125 512.000
- */
-
-#include <linux/kernel.h>
-#include <linux/interrupt.h>
-#include <linux/module.h>
-#include <asm/geode.h>
-
-#define MFGPT_DEFAULT_IRQ 7
-
-static struct mfgpt_timer_t {
- unsigned int avail:1;
-} mfgpt_timers[MFGPT_MAX_TIMERS];
-
-/* Selected from the table above */
-
-#define MFGPT_DIVISOR 16
-#define MFGPT_SCALE 4 /* divisor = 2^(scale) */
-#define MFGPT_HZ (32768 / MFGPT_DIVISOR)
-#define MFGPT_PERIODIC (MFGPT_HZ / HZ)
-
-/* Allow for disabling of MFGPTs */
-static int disable;
-static int __init mfgpt_disable(char *s)
-{
- disable = 1;
- return 1;
-}
-__setup("nomfgpt", mfgpt_disable);
-
-/* Reset the MFGPT timers. This is required by some broken BIOSes which already
- * do the same and leave the system in an unstable state. TinyBIOS 0.98 is
- * affected at least (0.99 is OK with MFGPT workaround left to off).
- */
-static int __init mfgpt_fix(char *s)
-{
- u32 val, dummy;
-
- /* The following undocumented bit resets the MFGPT timers */
- val = 0xFF; dummy = 0;
- wrmsr(MSR_MFGPT_SETUP, val, dummy);
- return 1;
-}
-__setup("mfgptfix", mfgpt_fix);
-
-/*
- * Check whether any MFGPTs are available for the kernel to use. In most
- * cases, firmware that uses AMD's VSA code will claim all timers during
- * bootup; we certainly don't want to take them if they're already in use.
- * In other cases (such as with VSAless OpenFirmware), the system firmware
- * leaves timers available for us to use.
- */
-
-
-static int timers = -1;
-
-static void geode_mfgpt_detect(void)
-{
- int i;
- u16 val;
-
- timers = 0;
-
- if (disable) {
- printk(KERN_INFO "geode-mfgpt: MFGPT support is disabled\n");
- goto done;
- }
-
- if (!geode_get_dev_base(GEODE_DEV_MFGPT)) {
- printk(KERN_INFO "geode-mfgpt: MFGPT LBAR is not set up\n");
- goto done;
- }
-
- for (i = 0; i < MFGPT_MAX_TIMERS; i++) {
- val = geode_mfgpt_read(i, MFGPT_REG_SETUP);
- if (!(val & MFGPT_SETUP_SETUP)) {
- mfgpt_timers[i].avail = 1;
- timers++;
- }
- }
-
-done:
- printk(KERN_INFO "geode-mfgpt: %d MFGPT timers available.\n", timers);
-}
-
-int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable)
-{
- u32 msr, mask, value, dummy;
- int shift = (cmp == MFGPT_CMP1) ? 0 : 8;
-
- if (timer < 0 || timer >= MFGPT_MAX_TIMERS)
- return -EIO;
-
- /*
- * The register maps for these are described in sections 6.17.1.x of
- * the AMD Geode CS5536 Companion Device Data Book.
- */
- switch (event) {
- case MFGPT_EVENT_RESET:
- /*
- * XXX: According to the docs, we cannot reset timers above
- * 6; that is, resets for 7 and 8 will be ignored. Is this
- * a problem? -dilinger
- */
- msr = MSR_MFGPT_NR;
- mask = 1 << (timer + 24);
- break;
-
- case MFGPT_EVENT_NMI:
- msr = MSR_MFGPT_NR;
- mask = 1 << (timer + shift);
- break;
-
- case MFGPT_EVENT_IRQ:
- msr = MSR_MFGPT_IRQ;
- mask = 1 << (timer + shift);
- break;
-
- default:
- return -EIO;
- }
-
- rdmsr(msr, value, dummy);
-
- if (enable)
- value |= mask;
- else
- value &= ~mask;
-
- wrmsr(msr, value, dummy);
- return 0;
-}
-EXPORT_SYMBOL_GPL(geode_mfgpt_toggle_event);
-
-int geode_mfgpt_set_irq(int timer, int cmp, int *irq, int enable)
-{
- u32 zsel, lpc, dummy;
- int shift;
-
- if (timer < 0 || timer >= MFGPT_MAX_TIMERS)
- return -EIO;
-
- /*
- * Unfortunately, MFGPTs come in pairs sharing their IRQ lines. If VSA
- * is using the same CMP of the timer's Siamese twin, the IRQ is set to
- * 2, and we must not use or change it.
- * XXX: Likewise, 2 Linux drivers might clash if the 2nd overwrites the
- * IRQ of the 1st. This can only happen when forcing an IRQ; calling this
- * with *irq==0 is safe. Currently there _are_ no 2 drivers.
- */
- rdmsr(MSR_PIC_ZSEL_LOW, zsel, dummy);
- shift = ((cmp == MFGPT_CMP1 ? 0 : 4) + timer % 4) * 4;
- if (((zsel >> shift) & 0xF) == 2)
- return -EIO;
-
- /* Choose IRQ: if none supplied, keep IRQ already set or use default */
- if (!*irq)
- *irq = (zsel >> shift) & 0xF;
- if (!*irq)
- *irq = MFGPT_DEFAULT_IRQ;
-
- /* Can't use IRQ if it's 0 (=disabled), 2, or routed to LPC */
- if (*irq < 1 || *irq == 2 || *irq > 15)
- return -EIO;
- rdmsr(MSR_PIC_IRQM_LPC, lpc, dummy);
- if (lpc & (1 << *irq))
- return -EIO;
-
- /* All chosen and checked - go for it */
- if (geode_mfgpt_toggle_event(timer, cmp, MFGPT_EVENT_IRQ, enable))
- return -EIO;
- if (enable) {
- zsel = (zsel & ~(0xF << shift)) | (*irq << shift);
- wrmsr(MSR_PIC_ZSEL_LOW, zsel, dummy);
- }
-
- return 0;
-}
-
-static int mfgpt_get(int timer)
-{
- mfgpt_timers[timer].avail = 0;
- printk(KERN_INFO "geode-mfgpt: Registered timer %d\n", timer);
- return timer;
-}
-
-int geode_mfgpt_alloc_timer(int timer, int domain)
-{
- int i;
-
- if (timers == -1) {
- /* timers haven't been detected yet */
- geode_mfgpt_detect();
- }
-
- if (!timers)
- return -1;
-
- if (timer >= MFGPT_MAX_TIMERS)
- return -1;
-
- if (timer < 0) {
- /* Try to find an available timer */
- for (i = 0; i < MFGPT_MAX_TIMERS; i++) {
- if (mfgpt_timers[i].avail)
- return mfgpt_get(i);
-
- if (i == 5 && domain == MFGPT_DOMAIN_WORKING)
- break;
- }
- } else {
- /* If they requested a specific timer, try to honor that */
- if (mfgpt_timers[timer].avail)
- return mfgpt_get(timer);
- }
-
- /* No timers available - too bad */
- return -1;
-}
-EXPORT_SYMBOL_GPL(geode_mfgpt_alloc_timer);
-
-
-#ifdef CONFIG_GEODE_MFGPT_TIMER
-
-/*
- * The MFGPT timers on the CS5536 provide us with suitable timers to use
- * as clock event sources - not as good as a HPET or APIC, but certainly
- * better than the PIT. This isn't a general purpose MFGPT driver, but
- * a simplified one designed specifically to act as a clock event source.
- * For full details about the MFGPT, please consult the CS5536 data sheet.
- */
-
-#include <linux/clocksource.h>
-#include <linux/clockchips.h>
-
-static unsigned int mfgpt_tick_mode = CLOCK_EVT_MODE_SHUTDOWN;
-static u16 mfgpt_event_clock;
-
-static int irq;
-static int __init mfgpt_setup(char *str)
-{
- get_option(&str, &irq);
- return 1;
-}
-__setup("mfgpt_irq=", mfgpt_setup);
-
-static void mfgpt_disable_timer(u16 clock)
-{
- /* avoid races by clearing CMP1 and CMP2 unconditionally */
- geode_mfgpt_write(clock, MFGPT_REG_SETUP, (u16) ~MFGPT_SETUP_CNTEN |
- MFGPT_SETUP_CMP1 | MFGPT_SETUP_CMP2);
-}
-
-static int mfgpt_next_event(unsigned long, struct clock_event_device *);
-static void mfgpt_set_mode(enum clock_event_mode, struct clock_event_device *);
-
-static struct clock_event_device mfgpt_clockevent = {
- .name = "mfgpt-timer",
- .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
- .set_mode = mfgpt_set_mode,
- .set_next_event = mfgpt_next_event,
- .rating = 250,
- .cpumask = cpu_all_mask,
- .shift = 32
-};
-
-static void mfgpt_start_timer(u16 delta)
-{
- geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_CMP2, (u16) delta);
- geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_COUNTER, 0);
-
- geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_SETUP,
- MFGPT_SETUP_CNTEN | MFGPT_SETUP_CMP2);
-}
-
-static void mfgpt_set_mode(enum clock_event_mode mode,
- struct clock_event_device *evt)
-{
- mfgpt_disable_timer(mfgpt_event_clock);
-
- if (mode == CLOCK_EVT_MODE_PERIODIC)
- mfgpt_start_timer(MFGPT_PERIODIC);
-
- mfgpt_tick_mode = mode;
-}
-
-static int mfgpt_next_event(unsigned long delta, struct clock_event_device *evt)
-{
- mfgpt_start_timer(delta);
- return 0;
-}
-
-static irqreturn_t mfgpt_tick(int irq, void *dev_id)
-{
- u16 val = geode_mfgpt_read(mfgpt_event_clock, MFGPT_REG_SETUP);
-
- /* See if the interrupt was for us */
- if (!(val & (MFGPT_SETUP_SETUP | MFGPT_SETUP_CMP2 | MFGPT_SETUP_CMP1)))
- return IRQ_NONE;
-
- /* Turn off the clock (and clear the event) */
- mfgpt_disable_timer(mfgpt_event_clock);
-
- if (mfgpt_tick_mode == CLOCK_EVT_MODE_SHUTDOWN)
- return IRQ_HANDLED;
-
- /* Clear the counter */
- geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_COUNTER, 0);
-
- /* Restart the clock in periodic mode */
-
- if (mfgpt_tick_mode == CLOCK_EVT_MODE_PERIODIC) {
- geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_SETUP,
- MFGPT_SETUP_CNTEN | MFGPT_SETUP_CMP2);
- }
-
- mfgpt_clockevent.event_handler(&mfgpt_clockevent);
- return IRQ_HANDLED;
-}
-
-static struct irqaction mfgptirq = {
- .handler = mfgpt_tick,
- .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER,
- .name = "mfgpt-timer"
-};
-
-int __init mfgpt_timer_setup(void)
-{
- int timer, ret;
- u16 val;
-
- timer = geode_mfgpt_alloc_timer(MFGPT_TIMER_ANY, MFGPT_DOMAIN_WORKING);
- if (timer < 0) {
- printk(KERN_ERR
- "mfgpt-timer: Could not allocate a MFPGT timer\n");
- return -ENODEV;
- }
-
- mfgpt_event_clock = timer;
-
- /* Set up the IRQ on the MFGPT side */
- if (geode_mfgpt_setup_irq(mfgpt_event_clock, MFGPT_CMP2, &irq)) {
- printk(KERN_ERR "mfgpt-timer: Could not set up IRQ %d\n", irq);
- return -EIO;
- }
-
- /* And register it with the kernel */
- ret = setup_irq(irq, &mfgptirq);
-
- if (ret) {
- printk(KERN_ERR
- "mfgpt-timer: Unable to set up the interrupt.\n");
- goto err;
- }
-
- /* Set the clock scale and enable the event mode for CMP2 */
- val = MFGPT_SCALE | (3 << 8);
-
- geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_SETUP, val);
-
- /* Set up the clock event */
- mfgpt_clockevent.mult = div_sc(MFGPT_HZ, NSEC_PER_SEC,
- mfgpt_clockevent.shift);
- mfgpt_clockevent.min_delta_ns = clockevent_delta2ns(0xF,
- &mfgpt_clockevent);
- mfgpt_clockevent.max_delta_ns = clockevent_delta2ns(0xFFFE,
- &mfgpt_clockevent);
-
- printk(KERN_INFO
- "mfgpt-timer: Registering MFGPT timer %d as a clock event, using IRQ %d\n",
- timer, irq);
- clockevents_register_device(&mfgpt_clockevent);
-
- return 0;
-
-err:
- geode_mfgpt_release_irq(mfgpt_event_clock, MFGPT_CMP2, &irq);
- printk(KERN_ERR
- "mfgpt-timer: Unable to set up the MFGPT clock source\n");
- return -EIO;
-}
-
-#endif
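For context on the divisor table at the top of the file removed above: the rates follow directly from the 32.768 kHz input clock, and the min/max deltas from the roughly 0xF-count floor and 16-bit compare range used by the clockevent setup. A standalone sketch (userspace C; the 16-tick floor and 16-bit range are inferred from the driver's min_delta/max_delta setup, not taken from the data book) that reproduces the table:

    #include <stdio.h>

    #define MFGPT_INPUT_HZ    32768.0   /* 32.768 kHz input clock */
    #define MFGPT_COUNTER_MAX 65536.0   /* 16-bit counter range */

    int main(void)
    {
            for (int divisor = 1; divisor <= 256; divisor <<= 1) {
                    double hz  = MFGPT_INPUT_HZ / divisor;
                    double min = 16.0 / hz;                 /* ~0xF-tick minimum delta */
                    double max = MFGPT_COUNTER_MAX / hz;    /* full 16-bit compare range */
                    printf("%7d %8.0f %14.11f %10.3f\n", divisor, hz, min, max);
            }
            return 0;
    }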
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
deleted file mode 100644
index 366baa179913..000000000000
--- a/arch/x86/kernel/microcode_amd.c
+++ /dev/null
@@ -1,354 +0,0 @@
-/*
- * AMD CPU Microcode Update Driver for Linux
- * Copyright (C) 2008 Advanced Micro Devices Inc.
- *
- * Author: Peter Oruba <peter.oruba@amd.com>
- *
- * Based on work by:
- * Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
- *
- * This driver allows upgrading microcode on AMD
- * family 0x10 and 0x11 processors.
- *
- * Licensed under the terms of the GNU General Public
- * License version 2. See file COPYING for details.
- */
-#include <linux/firmware.h>
-#include <linux/pci_ids.h>
-#include <linux/uaccess.h>
-#include <linux/vmalloc.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/pci.h>
-
-#include <asm/microcode.h>
-#include <asm/processor.h>
-#include <asm/msr.h>
-
-MODULE_DESCRIPTION("AMD Microcode Update Driver");
-MODULE_AUTHOR("Peter Oruba");
-MODULE_LICENSE("GPL v2");
-
-#define UCODE_MAGIC 0x00414d44
-#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000
-#define UCODE_UCODE_TYPE 0x00000001
-
-struct equiv_cpu_entry {
- u32 installed_cpu;
- u32 fixed_errata_mask;
- u32 fixed_errata_compare;
- u16 equiv_cpu;
- u16 res;
-} __attribute__((packed));
-
-struct microcode_header_amd {
- u32 data_code;
- u32 patch_id;
- u16 mc_patch_data_id;
- u8 mc_patch_data_len;
- u8 init_flag;
- u32 mc_patch_data_checksum;
- u32 nb_dev_id;
- u32 sb_dev_id;
- u16 processor_rev_id;
- u8 nb_rev_id;
- u8 sb_rev_id;
- u8 bios_api_rev;
- u8 reserved1[3];
- u32 match_reg[8];
-} __attribute__((packed));
-
-struct microcode_amd {
- struct microcode_header_amd hdr;
- unsigned int mpb[0];
-};
-
-#define UCODE_MAX_SIZE 2048
-#define UCODE_CONTAINER_SECTION_HDR 8
-#define UCODE_CONTAINER_HEADER_SIZE 12
-
-static struct equiv_cpu_entry *equiv_cpu_table;
-
-static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
-{
- struct cpuinfo_x86 *c = &cpu_data(cpu);
- u32 dummy;
-
- memset(csig, 0, sizeof(*csig));
- if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
- printk(KERN_WARNING "microcode: CPU%d: AMD CPU family 0x%x not "
- "supported\n", cpu, c->x86);
- return -1;
- }
- rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy);
- printk(KERN_INFO "microcode: CPU%d: patch_level=0x%x\n", cpu, csig->rev);
- return 0;
-}
-
-static int get_matching_microcode(int cpu, void *mc, int rev)
-{
- struct microcode_header_amd *mc_header = mc;
- unsigned int current_cpu_id;
- u16 equiv_cpu_id = 0;
- unsigned int i = 0;
-
- BUG_ON(equiv_cpu_table == NULL);
- current_cpu_id = cpuid_eax(0x00000001);
-
- while (equiv_cpu_table[i].installed_cpu != 0) {
- if (current_cpu_id == equiv_cpu_table[i].installed_cpu) {
- equiv_cpu_id = equiv_cpu_table[i].equiv_cpu;
- break;
- }
- i++;
- }
-
- if (!equiv_cpu_id) {
- printk(KERN_WARNING "microcode: CPU%d: cpu revision "
- "not listed in equivalent cpu table\n", cpu);
- return 0;
- }
-
- if (mc_header->processor_rev_id != equiv_cpu_id) {
- printk(KERN_ERR "microcode: CPU%d: patch mismatch "
- "(processor_rev_id: %x, equiv_cpu_id: %x)\n",
- cpu, mc_header->processor_rev_id, equiv_cpu_id);
- return 0;
- }
-
- /* ucode might be chipset specific -- currently we don't support this */
- if (mc_header->nb_dev_id || mc_header->sb_dev_id) {
- printk(KERN_ERR "microcode: CPU%d: loading of chipset "
- "specific code not yet supported\n", cpu);
- return 0;
- }
-
- if (mc_header->patch_id <= rev)
- return 0;
-
- return 1;
-}
-
-static int apply_microcode_amd(int cpu)
-{
- u32 rev, dummy;
- int cpu_num = raw_smp_processor_id();
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
- struct microcode_amd *mc_amd = uci->mc;
-
- /* We should bind the task to the CPU */
- BUG_ON(cpu_num != cpu);
-
- if (mc_amd == NULL)
- return 0;
-
- wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
- /* get patch id after patching */
- rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
-
- /* check current patch id and patch's id for match */
- if (rev != mc_amd->hdr.patch_id) {
- printk(KERN_ERR "microcode: CPU%d: update failed "
- "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id);
- return -1;
- }
-
- printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n",
- cpu, rev);
-
- uci->cpu_sig.rev = rev;
-
- return 0;
-}
-
-static int get_ucode_data(void *to, const u8 *from, size_t n)
-{
- memcpy(to, from, n);
- return 0;
-}
-
-static void *
-get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
-{
- unsigned int total_size;
- u8 section_hdr[UCODE_CONTAINER_SECTION_HDR];
- void *mc;
-
- if (get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR))
- return NULL;
-
- if (section_hdr[0] != UCODE_UCODE_TYPE) {
- printk(KERN_ERR "microcode: error: invalid type field in "
- "container file section header\n");
- return NULL;
- }
-
- total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8));
-
- printk(KERN_DEBUG "microcode: size %u, total_size %u\n",
- size, total_size);
-
- if (total_size > size || total_size > UCODE_MAX_SIZE) {
- printk(KERN_ERR "microcode: error: size mismatch\n");
- return NULL;
- }
-
- mc = vmalloc(UCODE_MAX_SIZE);
- if (mc) {
- memset(mc, 0, UCODE_MAX_SIZE);
- if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR,
- total_size)) {
- vfree(mc);
- mc = NULL;
- } else
- *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR;
- }
- return mc;
-}
-
-static int install_equiv_cpu_table(const u8 *buf)
-{
- u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE];
- unsigned int *buf_pos = (unsigned int *)container_hdr;
- unsigned long size;
-
- if (get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE))
- return 0;
-
- size = buf_pos[2];
-
- if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
- printk(KERN_ERR "microcode: error: invalid type field in "
- "container file section header\n");
- return 0;
- }
-
- equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size);
- if (!equiv_cpu_table) {
- printk(KERN_ERR "microcode: failed to allocate "
- "equivalent CPU table\n");
- return 0;
- }
-
- buf += UCODE_CONTAINER_HEADER_SIZE;
- if (get_ucode_data(equiv_cpu_table, buf, size)) {
- vfree(equiv_cpu_table);
- return 0;
- }
-
- return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */
-}
-
-static void free_equiv_cpu_table(void)
-{
- vfree(equiv_cpu_table);
- equiv_cpu_table = NULL;
-}
-
-static enum ucode_state
-generic_load_microcode(int cpu, const u8 *data, size_t size)
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- const u8 *ucode_ptr = data;
- void *new_mc = NULL;
- void *mc;
- int new_rev = uci->cpu_sig.rev;
- unsigned int leftover;
- unsigned long offset;
- enum ucode_state state = UCODE_OK;
-
- offset = install_equiv_cpu_table(ucode_ptr);
- if (!offset) {
- printk(KERN_ERR "microcode: failed to create "
- "equivalent cpu table\n");
- return UCODE_ERROR;
- }
-
- ucode_ptr += offset;
- leftover = size - offset;
-
- while (leftover) {
- unsigned int uninitialized_var(mc_size);
- struct microcode_header_amd *mc_header;
-
- mc = get_next_ucode(ucode_ptr, leftover, &mc_size);
- if (!mc)
- break;
-
- mc_header = (struct microcode_header_amd *)mc;
- if (get_matching_microcode(cpu, mc, new_rev)) {
- vfree(new_mc);
- new_rev = mc_header->patch_id;
- new_mc = mc;
- } else
- vfree(mc);
-
- ucode_ptr += mc_size;
- leftover -= mc_size;
- }
-
- if (new_mc) {
- if (!leftover) {
- vfree(uci->mc);
- uci->mc = new_mc;
- pr_debug("microcode: CPU%d found a matching microcode "
- "update with version 0x%x (current=0x%x)\n",
- cpu, new_rev, uci->cpu_sig.rev);
- } else {
- vfree(new_mc);
- state = UCODE_ERROR;
- }
- } else
- state = UCODE_NFOUND;
-
- free_equiv_cpu_table();
-
- return state;
-}
-
-static enum ucode_state request_microcode_fw(int cpu, struct device *device)
-{
- const char *fw_name = "amd-ucode/microcode_amd.bin";
- const struct firmware *firmware;
- enum ucode_state ret;
-
- if (request_firmware(&firmware, fw_name, device)) {
- printk(KERN_ERR "microcode: failed to load file %s\n", fw_name);
- return UCODE_NFOUND;
- }
-
- ret = generic_load_microcode(cpu, firmware->data, firmware->size);
-
- release_firmware(firmware);
-
- return ret;
-}
-
-static enum ucode_state
-request_microcode_user(int cpu, const void __user *buf, size_t size)
-{
- printk(KERN_INFO "microcode: AMD microcode update via "
- "/dev/cpu/microcode not supported\n");
- return UCODE_ERROR;
-}
-
-static void microcode_fini_cpu_amd(int cpu)
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
- vfree(uci->mc);
- uci->mc = NULL;
-}
-
-static struct microcode_ops microcode_amd_ops = {
- .request_microcode_user = request_microcode_user,
- .request_microcode_fw = request_microcode_fw,
- .collect_cpu_info = collect_cpu_info_amd,
- .apply_microcode = apply_microcode_amd,
- .microcode_fini_cpu = microcode_fini_cpu_amd,
-};
-
-struct microcode_ops * __init init_amd_microcode(void)
-{
- return &microcode_amd_ops;
-}
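One detail worth keeping in mind from the removed AMD loader: get_next_ucode() took the patch-section size from bytes 4 and 5 of the 8-byte section header, little-endian. A small userspace sketch of that parsing (the header bytes below are made up for illustration):

    #include <stdio.h>

    #define UCODE_UCODE_TYPE 0x01

    /* Size field as read by the removed get_next_ucode(): bytes 4-5, little-endian. */
    static unsigned int section_size(const unsigned char *hdr)
    {
            return hdr[4] | (hdr[5] << 8);
    }

    int main(void)
    {
            /* hypothetical section header: type 1, 0x0800-byte payload */
            unsigned char hdr[8] = { UCODE_UCODE_TYPE, 0, 0, 0, 0x00, 0x08, 0, 0 };

            if (hdr[0] != UCODE_UCODE_TYPE)
                    return 1;
            printf("patch section payload: %u bytes\n", section_size(hdr));
            return 0;
    }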
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
deleted file mode 100644
index 9371448290ac..000000000000
--- a/arch/x86/kernel/microcode_core.c
+++ /dev/null
@@ -1,570 +0,0 @@
-/*
- * Intel CPU Microcode Update Driver for Linux
- *
- * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
- * 2006 Shaohua Li <shaohua.li@intel.com>
- *
- * This driver allows upgrading microcode on Intel processors
- * belonging to the IA-32 family - PentiumPro, Pentium II,
- * Pentium III, Xeon, Pentium 4, etc.
- *
- * Reference: Section 8.11 of Volume 3a, IA-32 Intel® Architecture
- * Software Developer's Manual
- * Order Number 253668 or free download from:
- *
- * http://developer.intel.com/design/pentium4/manuals/253668.htm
- *
- * For more information, go to http://www.urbanmyth.org/microcode
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com>
- * Initial release.
- * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com>
- * Added read() support + cleanups.
- * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com>
- * Added 'device trimming' support. open(O_WRONLY) zeroes
- * and frees the saved copy of applied microcode.
- * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com>
- * Made to use devfs (/dev/cpu/microcode) + cleanups.
- * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com>
- * Added misc device support (now uses both devfs and misc).
- * Added MICROCODE_IOCFREE ioctl to clear memory.
- * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com>
- * Messages for error cases (non Intel & no suitable microcode).
- * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
- * Removed ->release(). Removed exclusive open and status bitmap.
- * Added microcode_rwsem to serialize read()/write()/ioctl().
- * Removed global kernel lock usage.
- * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
- * Write 0 to 0x8B msr and then cpuid before reading revision,
- * so that it works even if there were no update done by the
- * BIOS. Otherwise, reading from 0x8B gives junk (which happened
- * to be 0 on my machine which is why it worked even when I
- * disabled update by the BIOS)
- * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
- * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
- * Tigran Aivazian <tigran@veritas.com>
- * Intel Pentium 4 processor support and bugfixes.
- * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
- * Bugfix for HT (Hyper-Threading) enabled processors
- * whereby processor resources are shared by all logical processors
- * in a single CPU package.
- * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
- * Tigran Aivazian <tigran@veritas.com>,
- * Serialize updates as required on HT processors due to
- * speculative nature of implementation.
- * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
- * Fix the panic when writing zero-length microcode chunk.
- * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
- * Jun Nakajima <jun.nakajima@intel.com>
- * Support for the microcode updates in the new format.
- * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
- * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
- * because we no longer hold a copy of applied microcode
- * in kernel memory.
- * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
- * Fix sigmatch() macro to handle old CPUs with pf == 0.
- * Thanks to Stuart Swales for pointing out this bug.
- */
-#include <linux/platform_device.h>
-#include <linux/miscdevice.h>
-#include <linux/capability.h>
-#include <linux/smp_lock.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/cpu.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-
-#include <asm/microcode.h>
-#include <asm/processor.h>
-
-MODULE_DESCRIPTION("Microcode Update Driver");
-MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
-MODULE_LICENSE("GPL");
-
-#define MICROCODE_VERSION "2.00"
-
-static struct microcode_ops *microcode_ops;
-
-/*
- * Synchronization.
- *
- * All non cpu-hotplug-callback call sites use:
- *
- * - microcode_mutex to synchronize with each other;
- * - get/put_online_cpus() to synchronize with
- * the cpu-hotplug-callback call sites.
- *
- * We guarantee that only a single cpu is being
- * updated at any given time.
- */
-static DEFINE_MUTEX(microcode_mutex);
-
-struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
-EXPORT_SYMBOL_GPL(ucode_cpu_info);
-
-/*
- * Operations that are run on a target cpu:
- */
-
-struct cpu_info_ctx {
- struct cpu_signature *cpu_sig;
- int err;
-};
-
-static void collect_cpu_info_local(void *arg)
-{
- struct cpu_info_ctx *ctx = arg;
-
- ctx->err = microcode_ops->collect_cpu_info(smp_processor_id(),
- ctx->cpu_sig);
-}
-
-static int collect_cpu_info_on_target(int cpu, struct cpu_signature *cpu_sig)
-{
- struct cpu_info_ctx ctx = { .cpu_sig = cpu_sig, .err = 0 };
- int ret;
-
- ret = smp_call_function_single(cpu, collect_cpu_info_local, &ctx, 1);
- if (!ret)
- ret = ctx.err;
-
- return ret;
-}
-
-static int collect_cpu_info(int cpu)
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- int ret;
-
- memset(uci, 0, sizeof(*uci));
-
- ret = collect_cpu_info_on_target(cpu, &uci->cpu_sig);
- if (!ret)
- uci->valid = 1;
-
- return ret;
-}
-
-struct apply_microcode_ctx {
- int err;
-};
-
-static void apply_microcode_local(void *arg)
-{
- struct apply_microcode_ctx *ctx = arg;
-
- ctx->err = microcode_ops->apply_microcode(smp_processor_id());
-}
-
-static int apply_microcode_on_target(int cpu)
-{
- struct apply_microcode_ctx ctx = { .err = 0 };
- int ret;
-
- ret = smp_call_function_single(cpu, apply_microcode_local, &ctx, 1);
- if (!ret)
- ret = ctx.err;
-
- return ret;
-}
-
-#ifdef CONFIG_MICROCODE_OLD_INTERFACE
-static int do_microcode_update(const void __user *buf, size_t size)
-{
- int error = 0;
- int cpu;
-
- for_each_online_cpu(cpu) {
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- enum ucode_state ustate;
-
- if (!uci->valid)
- continue;
-
- ustate = microcode_ops->request_microcode_user(cpu, buf, size);
- if (ustate == UCODE_ERROR) {
- error = -1;
- break;
- } else if (ustate == UCODE_OK)
- apply_microcode_on_target(cpu);
- }
-
- return error;
-}
-
-static int microcode_open(struct inode *unused1, struct file *unused2)
-{
- cycle_kernel_lock();
- return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
-}
-
-static ssize_t microcode_write(struct file *file, const char __user *buf,
- size_t len, loff_t *ppos)
-{
- ssize_t ret = -EINVAL;
-
- if ((len >> PAGE_SHIFT) > num_physpages) {
- pr_err("microcode: too much data (max %ld pages)\n", num_physpages);
- return ret;
- }
-
- get_online_cpus();
- mutex_lock(&microcode_mutex);
-
- if (do_microcode_update(buf, len) == 0)
- ret = (ssize_t)len;
-
- mutex_unlock(&microcode_mutex);
- put_online_cpus();
-
- return ret;
-}
-
-static const struct file_operations microcode_fops = {
- .owner = THIS_MODULE,
- .write = microcode_write,
- .open = microcode_open,
-};
-
-static struct miscdevice microcode_dev = {
- .minor = MICROCODE_MINOR,
- .name = "microcode",
- .devnode = "cpu/microcode",
- .fops = &microcode_fops,
-};
-
-static int __init microcode_dev_init(void)
-{
- int error;
-
- error = misc_register(&microcode_dev);
- if (error) {
- pr_err("microcode: can't misc_register on minor=%d\n", MICROCODE_MINOR);
- return error;
- }
-
- return 0;
-}
-
-static void microcode_dev_exit(void)
-{
- misc_deregister(&microcode_dev);
-}
-
-MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
-#else
-#define microcode_dev_init() 0
-#define microcode_dev_exit() do { } while (0)
-#endif
-
-/* fake device for request_firmware */
-static struct platform_device *microcode_pdev;
-
-static int reload_for_cpu(int cpu)
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- int err = 0;
-
- mutex_lock(&microcode_mutex);
- if (uci->valid) {
- enum ucode_state ustate;
-
- ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
- if (ustate == UCODE_OK)
- apply_microcode_on_target(cpu);
- else
- if (ustate == UCODE_ERROR)
- err = -EINVAL;
- }
- mutex_unlock(&microcode_mutex);
-
- return err;
-}
-
-static ssize_t reload_store(struct sys_device *dev,
- struct sysdev_attribute *attr,
- const char *buf, size_t size)
-{
- unsigned long val;
- int cpu = dev->id;
- int ret = 0;
- char *end;
-
- val = simple_strtoul(buf, &end, 0);
- if (end == buf)
- return -EINVAL;
-
- if (val == 1) {
- get_online_cpus();
- if (cpu_online(cpu))
- ret = reload_for_cpu(cpu);
- put_online_cpus();
- }
-
- if (!ret)
- ret = size;
-
- return ret;
-}
-
-static ssize_t version_show(struct sys_device *dev,
- struct sysdev_attribute *attr, char *buf)
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
-
- return sprintf(buf, "0x%x\n", uci->cpu_sig.rev);
-}
-
-static ssize_t pf_show(struct sys_device *dev,
- struct sysdev_attribute *attr, char *buf)
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
-
- return sprintf(buf, "0x%x\n", uci->cpu_sig.pf);
-}
-
-static SYSDEV_ATTR(reload, 0200, NULL, reload_store);
-static SYSDEV_ATTR(version, 0400, version_show, NULL);
-static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL);
-
-static struct attribute *mc_default_attrs[] = {
- &attr_reload.attr,
- &attr_version.attr,
- &attr_processor_flags.attr,
- NULL
-};
-
-static struct attribute_group mc_attr_group = {
- .attrs = mc_default_attrs,
- .name = "microcode",
-};
-
-static void microcode_fini_cpu(int cpu)
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
- microcode_ops->microcode_fini_cpu(cpu);
- uci->valid = 0;
-}
-
-static enum ucode_state microcode_resume_cpu(int cpu)
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
- if (!uci->mc)
- return UCODE_NFOUND;
-
- pr_debug("microcode: CPU%d updated upon resume\n", cpu);
- apply_microcode_on_target(cpu);
-
- return UCODE_OK;
-}
-
-static enum ucode_state microcode_init_cpu(int cpu)
-{
- enum ucode_state ustate;
-
- if (collect_cpu_info(cpu))
- return UCODE_ERROR;
-
- /* --dimm. Trigger a delayed update? */
- if (system_state != SYSTEM_RUNNING)
- return UCODE_NFOUND;
-
- ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
-
- if (ustate == UCODE_OK) {
- pr_debug("microcode: CPU%d updated upon init\n", cpu);
- apply_microcode_on_target(cpu);
- }
-
- return ustate;
-}
-
-static enum ucode_state microcode_update_cpu(int cpu)
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- enum ucode_state ustate;
-
- if (uci->valid)
- ustate = microcode_resume_cpu(cpu);
- else
- ustate = microcode_init_cpu(cpu);
-
- return ustate;
-}
-
-static int mc_sysdev_add(struct sys_device *sys_dev)
-{
- int err, cpu = sys_dev->id;
-
- if (!cpu_online(cpu))
- return 0;
-
- pr_debug("microcode: CPU%d added\n", cpu);
-
- err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
- if (err)
- return err;
-
- if (microcode_init_cpu(cpu) == UCODE_ERROR)
- err = -EINVAL;
-
- return err;
-}
-
-static int mc_sysdev_remove(struct sys_device *sys_dev)
-{
- int cpu = sys_dev->id;
-
- if (!cpu_online(cpu))
- return 0;
-
- pr_debug("microcode: CPU%d removed\n", cpu);
- microcode_fini_cpu(cpu);
- sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
- return 0;
-}
-
-static int mc_sysdev_resume(struct sys_device *dev)
-{
- int cpu = dev->id;
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
- if (!cpu_online(cpu))
- return 0;
-
- /*
- * All non-bootup cpus are still disabled,
- * so only CPU 0 will apply ucode here.
- *
- * Moreover, there can be no concurrent
- * updates from any other places at this point.
- */
- WARN_ON(cpu != 0);
-
- if (uci->valid && uci->mc)
- microcode_ops->apply_microcode(cpu);
-
- return 0;
-}
-
-static struct sysdev_driver mc_sysdev_driver = {
- .add = mc_sysdev_add,
- .remove = mc_sysdev_remove,
- .resume = mc_sysdev_resume,
-};
-
-static __cpuinit int
-mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
-{
- unsigned int cpu = (unsigned long)hcpu;
- struct sys_device *sys_dev;
-
- sys_dev = get_cpu_sysdev(cpu);
- switch (action) {
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- microcode_update_cpu(cpu);
- case CPU_DOWN_FAILED:
- case CPU_DOWN_FAILED_FROZEN:
- pr_debug("microcode: CPU%d added\n", cpu);
- if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
- pr_err("microcode: Failed to create group for CPU%d\n", cpu);
- break;
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- /* Suspend is in progress, only remove the interface */
- sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
- pr_debug("microcode: CPU%d removed\n", cpu);
- break;
- case CPU_DEAD:
- case CPU_UP_CANCELED_FROZEN:
- /* The CPU refused to come up during a system resume */
- microcode_fini_cpu(cpu);
- break;
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block __refdata mc_cpu_notifier = {
- .notifier_call = mc_cpu_callback,
-};
-
-static int __init microcode_init(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
- int error;
-
- if (c->x86_vendor == X86_VENDOR_INTEL)
- microcode_ops = init_intel_microcode();
- else if (c->x86_vendor == X86_VENDOR_AMD)
- microcode_ops = init_amd_microcode();
-
- if (!microcode_ops) {
- pr_err("microcode: no support for this CPU vendor\n");
- return -ENODEV;
- }
-
- microcode_pdev = platform_device_register_simple("microcode", -1,
- NULL, 0);
- if (IS_ERR(microcode_pdev)) {
- microcode_dev_exit();
- return PTR_ERR(microcode_pdev);
- }
-
- get_online_cpus();
- mutex_lock(&microcode_mutex);
-
- error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
-
- mutex_unlock(&microcode_mutex);
- put_online_cpus();
-
- if (error) {
- platform_device_unregister(microcode_pdev);
- return error;
- }
-
- error = microcode_dev_init();
- if (error)
- return error;
-
- register_hotcpu_notifier(&mc_cpu_notifier);
-
- pr_info("Microcode Update Driver: v" MICROCODE_VERSION
- " <tigran@aivazian.fsnet.co.uk>,"
- " Peter Oruba\n");
-
- return 0;
-}
-module_init(microcode_init);
-
-static void __exit microcode_exit(void)
-{
- microcode_dev_exit();
-
- unregister_hotcpu_notifier(&mc_cpu_notifier);
-
- get_online_cpus();
- mutex_lock(&microcode_mutex);
-
- sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
-
- mutex_unlock(&microcode_mutex);
- put_online_cpus();
-
- platform_device_unregister(microcode_pdev);
-
- microcode_ops = NULL;
-
- pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
-}
-module_exit(microcode_exit);
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
deleted file mode 100644
index 0d334ddd0a96..000000000000
--- a/arch/x86/kernel/microcode_intel.c
+++ /dev/null
@@ -1,475 +0,0 @@
-/*
- * Intel CPU Microcode Update Driver for Linux
- *
- * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
- * 2006 Shaohua Li <shaohua.li@intel.com>
- *
- * This driver allows upgrading microcode on Intel processors
- * belonging to the IA-32 family - PentiumPro, Pentium II,
- * Pentium III, Xeon, Pentium 4, etc.
- *
- * Reference: Section 8.11 of Volume 3a, IA-32 Intel® Architecture
- * Software Developer's Manual
- * Order Number 253668 or free download from:
- *
- * http://developer.intel.com/design/pentium4/manuals/253668.htm
- *
- * For more information, go to http://www.urbanmyth.org/microcode
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com>
- * Initial release.
- * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com>
- * Added read() support + cleanups.
- * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com>
- * Added 'device trimming' support. open(O_WRONLY) zeroes
- * and frees the saved copy of applied microcode.
- * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com>
- * Made to use devfs (/dev/cpu/microcode) + cleanups.
- * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com>
- * Added misc device support (now uses both devfs and misc).
- * Added MICROCODE_IOCFREE ioctl to clear memory.
- * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com>
- * Messages for error cases (non Intel & no suitable microcode).
- * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
- * Removed ->release(). Removed exclusive open and status bitmap.
- * Added microcode_rwsem to serialize read()/write()/ioctl().
- * Removed global kernel lock usage.
- * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
- * Write 0 to 0x8B msr and then cpuid before reading revision,
- * so that it works even if there were no update done by the
- * BIOS. Otherwise, reading from 0x8B gives junk (which happened
- * to be 0 on my machine which is why it worked even when I
- * disabled update by the BIOS)
- * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
- * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
- * Tigran Aivazian <tigran@veritas.com>
- * Intel Pentium 4 processor support and bugfixes.
- * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
- * Bugfix for HT (Hyper-Threading) enabled processors
- * whereby processor resources are shared by all logical processors
- * in a single CPU package.
- * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
- * Tigran Aivazian <tigran@veritas.com>,
- * Serialize updates as required on HT processors due to
- * speculative nature of implementation.
- * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
- * Fix the panic when writing zero-length microcode chunk.
- * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
- * Jun Nakajima <jun.nakajima@intel.com>
- * Support for the microcode updates in the new format.
- * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
- * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
- * because we no longer hold a copy of applied microcode
- * in kernel memory.
- * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
- * Fix sigmatch() macro to handle old CPUs with pf == 0.
- * Thanks to Stuart Swales for pointing out this bug.
- */
-#include <linux/firmware.h>
-#include <linux/uaccess.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/vmalloc.h>
-
-#include <asm/microcode.h>
-#include <asm/processor.h>
-#include <asm/msr.h>
-
-MODULE_DESCRIPTION("Microcode Update Driver");
-MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
-MODULE_LICENSE("GPL");
-
-struct microcode_header_intel {
- unsigned int hdrver;
- unsigned int rev;
- unsigned int date;
- unsigned int sig;
- unsigned int cksum;
- unsigned int ldrver;
- unsigned int pf;
- unsigned int datasize;
- unsigned int totalsize;
- unsigned int reserved[3];
-};
-
-struct microcode_intel {
- struct microcode_header_intel hdr;
- unsigned int bits[0];
-};
-
-/* microcode format is extended from prescott processors */
-struct extended_signature {
- unsigned int sig;
- unsigned int pf;
- unsigned int cksum;
-};
-
-struct extended_sigtable {
- unsigned int count;
- unsigned int cksum;
- unsigned int reserved[3];
- struct extended_signature sigs[0];
-};
-
-#define DEFAULT_UCODE_DATASIZE (2000)
-#define MC_HEADER_SIZE (sizeof(struct microcode_header_intel))
-#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
-#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable))
-#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature))
-#define DWSIZE (sizeof(u32))
-
-#define get_totalsize(mc) \
- (((struct microcode_intel *)mc)->hdr.totalsize ? \
- ((struct microcode_intel *)mc)->hdr.totalsize : \
- DEFAULT_UCODE_TOTALSIZE)
-
-#define get_datasize(mc) \
- (((struct microcode_intel *)mc)->hdr.datasize ? \
- ((struct microcode_intel *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
-
-#define sigmatch(s1, s2, p1, p2) \
- (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
-
-#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
-
-static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
-{
- struct cpuinfo_x86 *c = &cpu_data(cpu_num);
- unsigned int val[2];
-
- memset(csig, 0, sizeof(*csig));
-
- if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
- cpu_has(c, X86_FEATURE_IA64)) {
- printk(KERN_ERR "microcode: CPU%d not a capable Intel "
- "processor\n", cpu_num);
- return -1;
- }
-
- csig->sig = cpuid_eax(0x00000001);
-
- if ((c->x86_model >= 5) || (c->x86 > 6)) {
- /* get processor flags from MSR 0x17 */
- rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
- csig->pf = 1 << ((val[1] >> 18) & 7);
- }
-
- wrmsr(MSR_IA32_UCODE_REV, 0, 0);
- /* see notes above for revision 1.07. Apparent chip bug */
- sync_core();
- /* get the current revision from MSR 0x8B */
- rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev);
-
- printk(KERN_INFO "microcode: CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n",
- cpu_num, csig->sig, csig->pf, csig->rev);
-
- return 0;
-}
-
-static inline int update_match_cpu(struct cpu_signature *csig, int sig, int pf)
-{
- return (!sigmatch(sig, csig->sig, pf, csig->pf)) ? 0 : 1;
-}
-
-static inline int
-update_match_revision(struct microcode_header_intel *mc_header, int rev)
-{
- return (mc_header->rev <= rev) ? 0 : 1;
-}
-
-static int microcode_sanity_check(void *mc)
-{
- unsigned long total_size, data_size, ext_table_size;
- struct microcode_header_intel *mc_header = mc;
- struct extended_sigtable *ext_header = NULL;
- int sum, orig_sum, ext_sigcount = 0, i;
- struct extended_signature *ext_sig;
-
- total_size = get_totalsize(mc_header);
- data_size = get_datasize(mc_header);
-
- if (data_size + MC_HEADER_SIZE > total_size) {
- printk(KERN_ERR "microcode: error! "
- "Bad data size in microcode data file\n");
- return -EINVAL;
- }
-
- if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
- printk(KERN_ERR "microcode: error! "
- "Unknown microcode update format\n");
- return -EINVAL;
- }
- ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
- if (ext_table_size) {
- if ((ext_table_size < EXT_HEADER_SIZE)
- || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
- printk(KERN_ERR "microcode: error! "
- "Small exttable size in microcode data file\n");
- return -EINVAL;
- }
- ext_header = mc + MC_HEADER_SIZE + data_size;
- if (ext_table_size != exttable_size(ext_header)) {
- printk(KERN_ERR "microcode: error! "
- "Bad exttable size in microcode data file\n");
- return -EFAULT;
- }
- ext_sigcount = ext_header->count;
- }
-
- /* check extended table checksum */
- if (ext_table_size) {
- int ext_table_sum = 0;
- int *ext_tablep = (int *)ext_header;
-
- i = ext_table_size / DWSIZE;
- while (i--)
- ext_table_sum += ext_tablep[i];
- if (ext_table_sum) {
- printk(KERN_WARNING "microcode: aborting, "
- "bad extended signature table checksum\n");
- return -EINVAL;
- }
- }
-
- /* calculate the checksum */
- orig_sum = 0;
- i = (MC_HEADER_SIZE + data_size) / DWSIZE;
- while (i--)
- orig_sum += ((int *)mc)[i];
- if (orig_sum) {
- printk(KERN_ERR "microcode: aborting, bad checksum\n");
- return -EINVAL;
- }
- if (!ext_table_size)
- return 0;
- /* check extended signature checksum */
- for (i = 0; i < ext_sigcount; i++) {
- ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
- EXT_SIGNATURE_SIZE * i;
- sum = orig_sum
- - (mc_header->sig + mc_header->pf + mc_header->cksum)
- + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
- if (sum) {
- printk(KERN_ERR "microcode: aborting, bad checksum\n");
- return -EINVAL;
- }
- }
- return 0;
-}
-
-/*
- * return 0 - no update found
- * return 1 - found update
- */
-static int
-get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev)
-{
- struct microcode_header_intel *mc_header = mc;
- struct extended_sigtable *ext_header;
- unsigned long total_size = get_totalsize(mc_header);
- int ext_sigcount, i;
- struct extended_signature *ext_sig;
-
- if (!update_match_revision(mc_header, rev))
- return 0;
-
- if (update_match_cpu(cpu_sig, mc_header->sig, mc_header->pf))
- return 1;
-
- /* Look for ext. headers: */
- if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
- return 0;
-
- ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE;
- ext_sigcount = ext_header->count;
- ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
-
- for (i = 0; i < ext_sigcount; i++) {
- if (update_match_cpu(cpu_sig, ext_sig->sig, ext_sig->pf))
- return 1;
- ext_sig++;
- }
- return 0;
-}
-
-static int apply_microcode(int cpu)
-{
- struct microcode_intel *mc_intel;
- struct ucode_cpu_info *uci;
- unsigned int val[2];
- int cpu_num;
-
- cpu_num = raw_smp_processor_id();
- uci = ucode_cpu_info + cpu;
- mc_intel = uci->mc;
-
- /* We should bind the task to the CPU */
- BUG_ON(cpu_num != cpu);
-
- if (mc_intel == NULL)
- return 0;
-
- /* write microcode via MSR 0x79 */
- wrmsr(MSR_IA32_UCODE_WRITE,
- (unsigned long) mc_intel->bits,
- (unsigned long) mc_intel->bits >> 16 >> 16);
- wrmsr(MSR_IA32_UCODE_REV, 0, 0);
-
- /* see notes above for revision 1.07. Apparent chip bug */
- sync_core();
-
- /* get the current revision from MSR 0x8B */
- rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
-
- if (val[1] != mc_intel->hdr.rev) {
- printk(KERN_ERR "microcode: CPU%d update "
- "to revision 0x%x failed\n",
- cpu_num, mc_intel->hdr.rev);
- return -1;
- }
- printk(KERN_INFO "microcode: CPU%d updated to revision "
- "0x%x, date = %04x-%02x-%02x \n",
- cpu_num, val[1],
- mc_intel->hdr.date & 0xffff,
- mc_intel->hdr.date >> 24,
- (mc_intel->hdr.date >> 16) & 0xff);
-
- uci->cpu_sig.rev = val[1];
-
- return 0;
-}
-
-static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
- int (*get_ucode_data)(void *, const void *, size_t))
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- u8 *ucode_ptr = data, *new_mc = NULL, *mc;
- int new_rev = uci->cpu_sig.rev;
- unsigned int leftover = size;
- enum ucode_state state = UCODE_OK;
-
- while (leftover) {
- struct microcode_header_intel mc_header;
- unsigned int mc_size;
-
- if (get_ucode_data(&mc_header, ucode_ptr, sizeof(mc_header)))
- break;
-
- mc_size = get_totalsize(&mc_header);
- if (!mc_size || mc_size > leftover) {
- printk(KERN_ERR "microcode: error!"
- "Bad data in microcode data file\n");
- break;
- }
-
- mc = vmalloc(mc_size);
- if (!mc)
- break;
-
- if (get_ucode_data(mc, ucode_ptr, mc_size) ||
- microcode_sanity_check(mc) < 0) {
- vfree(mc);
- break;
- }
-
- if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) {
- if (new_mc)
- vfree(new_mc);
- new_rev = mc_header.rev;
- new_mc = mc;
- } else
- vfree(mc);
-
- ucode_ptr += mc_size;
- leftover -= mc_size;
- }
-
- if (leftover) {
- if (new_mc)
- vfree(new_mc);
- state = UCODE_ERROR;
- goto out;
- }
-
- if (!new_mc) {
- state = UCODE_NFOUND;
- goto out;
- }
-
- if (uci->mc)
- vfree(uci->mc);
- uci->mc = (struct microcode_intel *)new_mc;
-
- pr_debug("microcode: CPU%d found a matching microcode update with"
- " version 0x%x (current=0x%x)\n",
- cpu, new_rev, uci->cpu_sig.rev);
-out:
- return state;
-}
-
-static int get_ucode_fw(void *to, const void *from, size_t n)
-{
- memcpy(to, from, n);
- return 0;
-}
-
-static enum ucode_state request_microcode_fw(int cpu, struct device *device)
-{
- char name[30];
- struct cpuinfo_x86 *c = &cpu_data(cpu);
- const struct firmware *firmware;
- enum ucode_state ret;
-
- sprintf(name, "intel-ucode/%02x-%02x-%02x",
- c->x86, c->x86_model, c->x86_mask);
-
- if (request_firmware(&firmware, name, device)) {
- pr_debug("microcode: data file %s load failed\n", name);
- return UCODE_NFOUND;
- }
-
- ret = generic_load_microcode(cpu, (void *)firmware->data,
- firmware->size, &get_ucode_fw);
-
- release_firmware(firmware);
-
- return ret;
-}
-
-static int get_ucode_user(void *to, const void *from, size_t n)
-{
- return copy_from_user(to, from, n);
-}
-
-static enum ucode_state
-request_microcode_user(int cpu, const void __user *buf, size_t size)
-{
- return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user);
-}
-
-static void microcode_fini_cpu(int cpu)
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
- vfree(uci->mc);
- uci->mc = NULL;
-}
-
-static struct microcode_ops microcode_intel_ops = {
- .request_microcode_user = request_microcode_user,
- .request_microcode_fw = request_microcode_fw,
- .collect_cpu_info = collect_cpu_info,
- .apply_microcode = apply_microcode,
- .microcode_fini_cpu = microcode_fini_cpu,
-};
-
-struct microcode_ops * __init init_intel_microcode(void)
-{
- return &microcode_intel_ops;
-}
-
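A note on the sigmatch() rule from the removed Intel loader: a microcode patch applies when the CPU signatures are equal and the platform-flag bitmasks overlap, with pf == 0 on both sides accepted for old CPUs. A standalone sketch of that rule (the signature and flag values below are hypothetical):

    #include <stdio.h>

    /* Matching rule of the removed sigmatch() macro, written out as a function. */
    static int sigmatch(unsigned int s1, unsigned int s2,
                        unsigned int p1, unsigned int p2)
    {
            if (s1 != s2)
                    return 0;
            return (p1 & p2) || (p1 == 0 && p2 == 0);
    }

    int main(void)
    {
            printf("%d\n", sigmatch(0x106a5, 0x106a5, 0x01, 0x01));  /* 1: flags overlap */
            printf("%d\n", sigmatch(0x106a5, 0x106a5, 0x01, 0x02));  /* 0: no overlap */
            printf("%d\n", sigmatch(0x106a5, 0x106a5, 0x00, 0x00));  /* 1: legacy pf == 0 */
            return 0;
    }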
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index 712d15fdc416..ef6104e7cc72 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* AMD Family 10h mmconfig enablement
*/
@@ -7,6 +8,9 @@
#include <linux/string.h>
#include <linux/pci.h>
#include <linux/dmi.h>
+#include <linux/range.h>
+#include <linux/acpi.h>
+
#include <asm/pci-direct.h>
#include <linux/sort.h>
#include <asm/io.h>
@@ -22,20 +26,14 @@ struct pci_hostbridge_probe {
u32 device;
};
-static u64 __cpuinitdata fam10h_pci_mmconf_base;
-static int __cpuinitdata fam10h_pci_mmconf_base_status;
+static u64 fam10h_pci_mmconf_base;
-static struct pci_hostbridge_probe pci_probes[] __cpuinitdata = {
+static struct pci_hostbridge_probe pci_probes[] = {
{ 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 },
{ 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 },
};
-struct range {
- u64 start;
- u64 end;
-};
-
-static int __cpuinit cmp_range(const void *x1, const void *x2)
+static int cmp_range(const void *x1, const void *x2)
{
const struct range *r1 = x1;
const struct range *r2 = x2;
@@ -47,11 +45,13 @@ static int __cpuinit cmp_range(const void *x1, const void *x2)
return start1 - start2;
}
-/*[47:0] */
-/* need to avoid (0xfd<<32) and (0xfe<<32), ht used space */
+#define MMCONF_UNIT (1ULL << FAM10H_MMIO_CONF_BASE_SHIFT)
+#define MMCONF_MASK (~(MMCONF_UNIT - 1))
+#define MMCONF_SIZE (MMCONF_UNIT << 8)
+/* need to avoid (0xfd<<32), (0xfe<<32), and (0xff<<32), ht used space */
#define FAM10H_PCI_MMCONF_BASE (0xfcULL<<32)
-#define BASE_VALID(b) ((b != (0xfdULL << 32)) && (b != (0xfeULL << 32)))
-static void __cpuinit get_fam10h_pci_mmconf_base(void)
+#define BASE_VALID(b) ((b) + MMCONF_SIZE <= (0xfdULL<<32) || (b) >= (1ULL<<40))
+static void get_fam10h_pci_mmconf_base(void)
{
int i;
unsigned bus;
@@ -67,12 +67,11 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
struct range range[8];
/* only try to get setting from BSP */
- /* -1 or 1 */
- if (fam10h_pci_mmconf_base_status)
+ if (fam10h_pci_mmconf_base)
return;
if (!early_pci_allowed())
- goto fail;
+ return;
found = 0;
for (i = 0; i < ARRAY_SIZE(pci_probes); i++) {
@@ -94,24 +93,24 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
}
if (!found)
- goto fail;
+ return;
/* SYS_CFG */
- address = MSR_K8_SYSCFG;
- rdmsrl(address, val);
+ address = MSR_AMD64_SYSCFG;
+ rdmsrq(address, val);
/* TOP_MEM2 is not enabled? */
if (!(val & (1<<21))) {
- tom2 = 0;
+ tom2 = 1ULL << 32;
} else {
/* TOP_MEM2 */
address = MSR_K8_TOP_MEM2;
- rdmsrl(address, val);
- tom2 = val & (0xffffULL<<32);
+ rdmsrq(address, val);
+ tom2 = max(val & 0xffffff800000ULL, 1ULL << 32);
}
if (base <= tom2)
- base = tom2 + (1ULL<<32);
+ base = (tom2 + 2 * MMCONF_UNIT - 1) & MMCONF_MASK;
/*
* need to check if the range is in the high mmio range that is
@@ -126,11 +125,11 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
if (!(reg & 3))
continue;
- start = (((u64)reg) << 8) & (0xffULL << 32); /* 39:16 on 31:8*/
+ start = (u64)(reg & 0xffffff00) << 8; /* 39:16 on 31:8*/
reg = read_pci_config(bus, slot, 1, 0x84 + (i << 3));
- end = (((u64)reg) << 8) & (0xffULL << 32); /* 39:16 on 31:8*/
+ end = ((u64)(reg & 0xffffff00) << 8) | 0xffff; /* 39:16 on 31:8*/
- if (!end)
+ if (end < tom2)
continue;
range[hi_mmio_num].start = start;
@@ -146,35 +145,30 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
if (range[hi_mmio_num - 1].end < base)
goto out;
- if (range[0].start > base)
+ if (range[0].start > base + MMCONF_SIZE)
goto out;
/* need to find one window */
- base = range[0].start - (1ULL << 32);
+ base = (range[0].start & MMCONF_MASK) - MMCONF_UNIT;
if ((base > tom2) && BASE_VALID(base))
goto out;
- base = range[hi_mmio_num - 1].end + (1ULL << 32);
- if ((base > tom2) && BASE_VALID(base))
+ base = (range[hi_mmio_num - 1].end + MMCONF_UNIT) & MMCONF_MASK;
+ if (BASE_VALID(base))
goto out;
/* need to find window between ranges */
- if (hi_mmio_num > 1)
- for (i = 0; i < hi_mmio_num - 1; i++) {
- if (range[i + 1].start > (range[i].end + (1ULL << 32))) {
- base = range[i].end + (1ULL << 32);
- if ((base > tom2) && BASE_VALID(base))
- goto out;
- }
+ for (i = 1; i < hi_mmio_num; i++) {
+ base = (range[i - 1].end + MMCONF_UNIT) & MMCONF_MASK;
+ val = range[i].start & MMCONF_MASK;
+ if (val >= base + MMCONF_SIZE && BASE_VALID(base))
+ goto out;
}
-
-fail:
- fam10h_pci_mmconf_base_status = -1;
return;
+
out:
fam10h_pci_mmconf_base = base;
- fam10h_pci_mmconf_base_status = 1;
}
-void __cpuinit fam10h_check_enable_mmcfg(void)
+void fam10h_check_enable_mmcfg(void)
{
u64 val;
u32 address;
@@ -183,7 +177,7 @@ void __cpuinit fam10h_check_enable_mmcfg(void)
return;
address = MSR_FAM10H_MMIO_CONF_BASE;
- rdmsrl(address, val);
+ rdmsrq(address, val);
/* try to make sure that AP's setting is identical to BSP setting */
if (val & FAM10H_MMIO_CONF_ENABLE) {
@@ -193,11 +187,10 @@ void __cpuinit fam10h_check_enable_mmcfg(void)
/* only trust the one handle 256 buses, if acpi=off */
if (!acpi_pci_disabled || busnbits >= 8) {
- u64 base;
- base = val & (0xffffULL << 32);
- if (fam10h_pci_mmconf_base_status <= 0) {
+ u64 base = val & MMCONF_MASK;
+
+ if (!fam10h_pci_mmconf_base) {
fam10h_pci_mmconf_base = base;
- fam10h_pci_mmconf_base_status = 1;
return;
} else if (fam10h_pci_mmconf_base == base)
return;
@@ -209,24 +202,26 @@ void __cpuinit fam10h_check_enable_mmcfg(void)
* with 256 buses
*/
get_fam10h_pci_mmconf_base();
- if (fam10h_pci_mmconf_base_status <= 0)
+ if (!fam10h_pci_mmconf_base) {
+ pci_probe &= ~PCI_CHECK_ENABLE_AMD_MMCONF;
return;
+ }
printk(KERN_INFO "Enable MMCONFIG on AMD Family 10h\n");
val &= ~((FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT) |
(FAM10H_MMIO_CONF_BUSRANGE_MASK<<FAM10H_MMIO_CONF_BUSRANGE_SHIFT));
val |= fam10h_pci_mmconf_base | (8 << FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
FAM10H_MMIO_CONF_ENABLE;
- wrmsrl(address, val);
+ wrmsrq(address, val);
}
-static int __devinit set_check_enable_amd_mmconf(const struct dmi_system_id *d)
+static int __init set_check_enable_amd_mmconf(const struct dmi_system_id *d)
{
pci_probe |= PCI_CHECK_ENABLE_AMD_MMCONF;
return 0;
}
-static const struct dmi_system_id __cpuinitconst mmconf_dmi_table[] = {
+static const struct dmi_system_id __initconst mmconf_dmi_table[] = {
{
.callback = set_check_enable_amd_mmconf,
.ident = "Sun Microsystems Machine",
@@ -237,7 +232,8 @@ static const struct dmi_system_id __cpuinitconst mmconf_dmi_table[] = {
{}
};
-void __cpuinit check_enable_amd_mmconf_dmi(void)
+/* Called from a non __init function, but only on the BSP. */
+void __ref check_enable_amd_mmconf_dmi(void)
{
dmi_check_system(mmconf_dmi_table);
}
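The new MMCONF_UNIT/MMCONF_MASK/MMCONF_SIZE macros reduce the base selection to simple alignment arithmetic: round the candidate up to the next unit above TOM2 and reject anything that would overlap the HyperTransport hole. A standalone sketch of that arithmetic, assuming FAM10H_MMIO_CONF_BASE_SHIFT is 20 (a 1 MiB unit and a 256 MiB window), is:

/*
 * User-space sketch of the MMCONF base arithmetic, not kernel code.
 * The shift value of 20 is an assumption taken for illustration.
 */
#include <stdio.h>
#include <stdint.h>

#define FAM10H_MMIO_CONF_BASE_SHIFT 20                 /* assumed value */
#define MMCONF_UNIT (1ULL << FAM10H_MMIO_CONF_BASE_SHIFT)
#define MMCONF_MASK (~(MMCONF_UNIT - 1))
#define MMCONF_SIZE (MMCONF_UNIT << 8)                 /* 256 buses */

/* The window must not overlap the HT hole at 0xfd<<32 .. 0xff<<32. */
#define BASE_VALID(b) ((b) + MMCONF_SIZE <= (0xfdULL << 32) || (b) >= (1ULL << 40))

int main(void)
{
	uint64_t tom2 = 0x140000000ULL;                /* example TOP_MEM2: 5 GiB */

	/* Round up to the next MMCONF unit strictly above TOM2. */
	uint64_t base = (tom2 + 2 * MMCONF_UNIT - 1) & MMCONF_MASK;

	printf("candidate base %#llx, window size %#llx, valid=%d\n",
	       (unsigned long long)base, (unsigned long long)MMCONF_SIZE,
	       BASE_VALID(base) ? 1 : 0);
	return 0;
}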
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 89f386f044e4..11c45ce42694 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -1,72 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* Kernel module help for x86.
Copyright (C) 2001 Rusty Russell.
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
+*/
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-*/
#include <linux/moduleloader.h>
#include <linux/elf.h>
#include <linux/vmalloc.h>
#include <linux/fs.h>
#include <linux/string.h>
#include <linux/kernel.h>
+#include <linux/kasan.h>
#include <linux/bug.h>
#include <linux/mm.h>
+#include <linux/gfp.h>
+#include <linux/jump_label.h>
+#include <linux/random.h>
+#include <linux/memory.h>
+#include <linux/stackprotector.h>
-#include <asm/system.h>
+#include <asm/text-patching.h>
#include <asm/page.h>
-#include <asm/pgtable.h>
+#include <asm/setup.h>
+#include <asm/unwind.h>
#if 0
-#define DEBUGP printk
+#define DEBUGP(fmt, ...) \
+ printk(KERN_DEBUG fmt, ##__VA_ARGS__)
#else
-#define DEBUGP(fmt...)
+#define DEBUGP(fmt, ...) \
+do { \
+ if (0) \
+ printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
+} while (0)
#endif
-void *module_alloc(unsigned long size)
-{
- struct vm_struct *area;
-
- if (!size)
- return NULL;
- size = PAGE_ALIGN(size);
- if (size > MODULES_LEN)
- return NULL;
-
- area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END);
- if (!area)
- return NULL;
-
- return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM,
- PAGE_KERNEL_EXEC);
-}
-
-/* Free memory returned from module_alloc */
-void module_free(struct module *mod, void *module_region)
-{
- vfree(module_region);
-}
-
-/* We don't need anything special. */
-int module_frob_arch_sections(Elf_Ehdr *hdr,
- Elf_Shdr *sechdrs,
- char *secstrings,
- struct module *mod)
-{
- return 0;
-}
-
#ifdef CONFIG_X86_32
int apply_relocate(Elf32_Shdr *sechdrs,
const char *strtab,
@@ -79,8 +49,8 @@ int apply_relocate(Elf32_Shdr *sechdrs,
Elf32_Sym *sym;
uint32_t *location;
- DEBUGP("Applying relocate section %u to %u\n", relsec,
- sechdrs[relsec].sh_info);
+ DEBUGP("Applying relocate section %u to %u\n",
+ relsec, sechdrs[relsec].sh_info);
for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
/* This is where to make the change */
location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
@@ -96,44 +66,41 @@ int apply_relocate(Elf32_Shdr *sechdrs,
*location += sym->st_value;
break;
case R_386_PC32:
- /* Add the value, subtract its postition */
+ case R_386_PLT32:
+ /* Add the value, subtract its position */
*location += sym->st_value - (uint32_t)location;
break;
default:
- printk(KERN_ERR "module %s: Unknown relocation: %u\n",
+ pr_err("%s: Unknown relocation: %u\n",
me->name, ELF32_R_TYPE(rel[i].r_info));
return -ENOEXEC;
}
}
return 0;
}
-
-int apply_relocate_add(Elf32_Shdr *sechdrs,
- const char *strtab,
- unsigned int symindex,
- unsigned int relsec,
- struct module *me)
-{
- printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n",
- me->name);
- return -ENOEXEC;
-}
#else /*X86_64*/
-int apply_relocate_add(Elf64_Shdr *sechdrs,
+static int __write_relocate_add(Elf64_Shdr *sechdrs,
const char *strtab,
unsigned int symindex,
unsigned int relsec,
- struct module *me)
+ struct module *me,
+ void *(*write)(void *dest, const void *src, size_t len),
+ bool apply)
{
unsigned int i;
Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr;
Elf64_Sym *sym;
void *loc;
u64 val;
+ u64 zero = 0ULL;
+
+ DEBUGP("%s relocate section %u to %u\n",
+ apply ? "Applying" : "Clearing",
+ relsec, sechdrs[relsec].sh_info);
- DEBUGP("Applying relocate section %u to %u\n", relsec,
- sechdrs[relsec].sh_info);
for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
+ size_t size;
+
/* This is where to make the change */
loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
+ rel[i].r_offset;
@@ -144,60 +111,130 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
+ ELF64_R_SYM(rel[i].r_info);
DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n",
- (int)ELF64_R_TYPE(rel[i].r_info),
- sym->st_value, rel[i].r_addend, (u64)loc);
+ (int)ELF64_R_TYPE(rel[i].r_info),
+ sym->st_value, rel[i].r_addend, (u64)loc);
val = sym->st_value + rel[i].r_addend;
switch (ELF64_R_TYPE(rel[i].r_info)) {
case R_X86_64_NONE:
- break;
+ continue; /* nothing to write */
case R_X86_64_64:
- *(u64 *)loc = val;
+ size = 8;
break;
case R_X86_64_32:
- *(u32 *)loc = val;
- if (val != *(u32 *)loc)
+ if (val != *(u32 *)&val)
goto overflow;
+ size = 4;
break;
case R_X86_64_32S:
- *(s32 *)loc = val;
- if ((s64)val != *(s32 *)loc)
+ if ((s64)val != *(s32 *)&val)
goto overflow;
+ size = 4;
break;
+#if defined(CONFIG_STACKPROTECTOR) && \
+ defined(CONFIG_CC_IS_CLANG) && CONFIG_CLANG_VERSION < 170000
+ case R_X86_64_REX_GOTPCRELX: {
+ static unsigned long __percpu *const addr = &__stack_chk_guard;
+
+ if (sym->st_value != (u64)addr) {
+ pr_err("%s: Unsupported GOTPCREL relocation\n", me->name);
+ return -ENOEXEC;
+ }
+
+ val = (u64)&addr + rel[i].r_addend;
+ fallthrough;
+ }
+#endif
case R_X86_64_PC32:
+ case R_X86_64_PLT32:
val -= (u64)loc;
- *(u32 *)loc = val;
-#if 0
- if ((s64)val != *(s32 *)loc)
- goto overflow;
-#endif
+ size = 4;
+ break;
+ case R_X86_64_PC64:
+ val -= (u64)loc;
+ size = 8;
break;
default:
- printk(KERN_ERR "module %s: Unknown rela relocation: %llu\n",
+ pr_err("%s: Unknown rela relocation: %llu\n",
me->name, ELF64_R_TYPE(rel[i].r_info));
return -ENOEXEC;
}
+
+ if (apply) {
+ if (memcmp(loc, &zero, size)) {
+ pr_err("x86/modules: Invalid relocation target, existing value is nonzero for sec %u, idx %u, type %d, loc %lx, val %llx\n",
+ relsec, i, (int)ELF64_R_TYPE(rel[i].r_info),
+ (unsigned long)loc, val);
+ return -ENOEXEC;
+ }
+ write(loc, &val, size);
+ } else {
+ if (memcmp(loc, &val, size)) {
+ pr_warn("x86/modules: Invalid relocation target, existing value does not match expected value for sec %u, idx %u, type %d, loc %lx, val %llx\n",
+ relsec, i, (int)ELF64_R_TYPE(rel[i].r_info),
+ (unsigned long)loc, val);
+ return -ENOEXEC;
+ }
+ write(loc, &zero, size);
+ }
}
return 0;
overflow:
- printk(KERN_ERR "overflow in relocation type %d val %Lx\n",
- (int)ELF64_R_TYPE(rel[i].r_info), val);
- printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n",
+ pr_err("overflow in relocation type %d val %llx sec %u idx %d\n",
+ (int)ELF64_R_TYPE(rel[i].r_info), val, relsec, i);
+ pr_err("`%s' likely not compiled with -mcmodel=kernel\n",
me->name);
return -ENOEXEC;
}
-int apply_relocate(Elf_Shdr *sechdrs,
+static int write_relocate_add(Elf64_Shdr *sechdrs,
+ const char *strtab,
+ unsigned int symindex,
+ unsigned int relsec,
+ struct module *me,
+ bool apply)
+{
+ int ret;
+ bool early = me->state == MODULE_STATE_UNFORMED;
+ void *(*write)(void *, const void *, size_t) = memcpy;
+
+ if (!early) {
+ write = text_poke;
+ mutex_lock(&text_mutex);
+ }
+
+ ret = __write_relocate_add(sechdrs, strtab, symindex, relsec, me,
+ write, apply);
+
+ if (!early) {
+ smp_text_poke_sync_each_cpu();
+ mutex_unlock(&text_mutex);
+ }
+
+ return ret;
+}
+
+int apply_relocate_add(Elf64_Shdr *sechdrs,
const char *strtab,
unsigned int symindex,
unsigned int relsec,
struct module *me)
{
- printk(KERN_ERR "non add relocation not supported\n");
- return -ENOSYS;
+ return write_relocate_add(sechdrs, strtab, symindex, relsec, me, true);
+}
+
+#ifdef CONFIG_LIVEPATCH
+void clear_relocate_add(Elf64_Shdr *sechdrs,
+ const char *strtab,
+ unsigned int symindex,
+ unsigned int relsec,
+ struct module *me)
+{
+ write_relocate_add(sechdrs, strtab, symindex, relsec, me, false);
}
+#endif
#endif
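The apply/clear split above boils down to one write primitive: compute the relocation value, verify the current bytes (zero before applying, the expected value before clearing), then write either the value or zero. A minimal little-endian user-space sketch of that pattern, with illustrative names rather than the kernel's:

/*
 * Sketch of the apply/clear relocation write used above: check the
 * target bytes, then memcpy either the value (apply) or zero (clear).
 * Little-endian is assumed so the low bytes of val land in place.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

static int write_reloc(void *loc, uint64_t val, size_t size, int apply)
{
	uint64_t zero = 0;

	if (apply) {
		if (memcmp(loc, &zero, size))
			return -1;              /* target must still be zero */
		memcpy(loc, &val, size);
	} else {
		if (memcmp(loc, &val, size))
			return -1;              /* must match what was applied */
		memcpy(loc, &zero, size);
	}
	return 0;
}

int main(void)
{
	uint8_t text[16] = { 0 };
	uint8_t *loc = &text[4];
	uint64_t sym = 0x1000, addend = (uint64_t)-4;

	/* PC-relative 32-bit relocation: S + A - P, truncated to 4 bytes. */
	uint64_t val = sym + addend - (uint64_t)(uintptr_t)loc;

	printf("apply: %d\n", write_reloc(loc, val, 4, 1));
	printf("clear: %d\n", write_reloc(loc, val, 4, 0));
	return 0;
}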
@@ -205,44 +242,97 @@ int module_finalize(const Elf_Ehdr *hdr,
const Elf_Shdr *sechdrs,
struct module *me)
{
- const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
- *para = NULL;
+ const Elf_Shdr *s, *alt = NULL, *locks = NULL,
+ *orc = NULL, *orc_ip = NULL,
+ *retpolines = NULL, *returns = NULL, *ibt_endbr = NULL,
+ *calls = NULL, *cfi = NULL;
char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
- if (!strcmp(".text", secstrings + s->sh_name))
- text = s;
if (!strcmp(".altinstructions", secstrings + s->sh_name))
alt = s;
if (!strcmp(".smp_locks", secstrings + s->sh_name))
locks = s;
- if (!strcmp(".parainstructions", secstrings + s->sh_name))
- para = s;
+ if (!strcmp(".orc_unwind", secstrings + s->sh_name))
+ orc = s;
+ if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name))
+ orc_ip = s;
+ if (!strcmp(".retpoline_sites", secstrings + s->sh_name))
+ retpolines = s;
+ if (!strcmp(".return_sites", secstrings + s->sh_name))
+ returns = s;
+ if (!strcmp(".call_sites", secstrings + s->sh_name))
+ calls = s;
+ if (!strcmp(".cfi_sites", secstrings + s->sh_name))
+ cfi = s;
+ if (!strcmp(".ibt_endbr_seal", secstrings + s->sh_name))
+ ibt_endbr = s;
}
+ its_init_mod(me);
+
+ if (retpolines || cfi) {
+ void *rseg = NULL, *cseg = NULL;
+ unsigned int rsize = 0, csize = 0;
+
+ if (retpolines) {
+ rseg = (void *)retpolines->sh_addr;
+ rsize = retpolines->sh_size;
+ }
+
+ if (cfi) {
+ cseg = (void *)cfi->sh_addr;
+ csize = cfi->sh_size;
+ }
+
+ apply_fineibt(rseg, rseg + rsize, cseg, cseg + csize);
+ }
+ if (retpolines) {
+ void *rseg = (void *)retpolines->sh_addr;
+ apply_retpolines(rseg, rseg + retpolines->sh_size);
+ }
+
+ its_fini_mod(me);
+
+ if (returns) {
+ void *rseg = (void *)returns->sh_addr;
+ apply_returns(rseg, rseg + returns->sh_size);
+ }
+ if (calls) {
+ struct callthunk_sites cs = {};
+
+ cs.call_start = (void *)calls->sh_addr;
+ cs.call_end = (void *)calls->sh_addr + calls->sh_size;
+
+ callthunks_patch_module_calls(&cs, me);
+ }
if (alt) {
/* patch .altinstructions */
void *aseg = (void *)alt->sh_addr;
apply_alternatives(aseg, aseg + alt->sh_size);
}
- if (locks && text) {
+ if (ibt_endbr) {
+ void *iseg = (void *)ibt_endbr->sh_addr;
+ apply_seal_endbr(iseg, iseg + ibt_endbr->sh_size);
+ }
+ if (locks) {
void *lseg = (void *)locks->sh_addr;
- void *tseg = (void *)text->sh_addr;
+ void *text = me->mem[MOD_TEXT].base;
+ void *text_end = text + me->mem[MOD_TEXT].size;
alternatives_smp_module_add(me, me->name,
lseg, lseg + locks->sh_size,
- tseg, tseg + text->sh_size);
+ text, text_end);
}
- if (para) {
- void *pseg = (void *)para->sh_addr;
- apply_paravirt(pseg, pseg + para->sh_size);
- }
+ if (orc && orc_ip)
+ unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size,
+ (void *)orc->sh_addr, orc->sh_size);
- return module_bug_finalize(hdr, sechdrs, me);
+ return 0;
}
void module_arch_cleanup(struct module *mod)
{
alternatives_smp_module_del(mod);
- module_bug_cleanup(mod);
+ its_free_mod(mod);
}
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 651c93b28862..4a1b1b28abf9 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Intel Multiprocessor Specification 1.1 and 1.4
* compliant MP-table parsing routines.
@@ -10,23 +11,23 @@
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/delay.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/kernel_stat.h>
#include <linux/mc146818rtc.h>
#include <linux/bitops.h>
#include <linux/acpi.h>
-#include <linux/module.h>
#include <linux/smp.h>
#include <linux/pci.h>
+#include <asm/i8259.h>
+#include <asm/io_apic.h>
+#include <asm/acpi.h>
+#include <asm/irqdomain.h>
#include <asm/mtrr.h>
#include <asm/mpspec.h>
-#include <asm/pgalloc.h>
-#include <asm/io_apic.h>
#include <asm/proto.h>
#include <asm/bios_ebda.h>
-#include <asm/e820.h>
-#include <asm/trampoline.h>
+#include <asm/e820/api.h>
#include <asm/setup.h>
#include <asm/smp.h>
@@ -35,6 +36,8 @@
* Checksum an MP configuration block.
*/
+static unsigned int num_procs __initdata;
+
static int __init mpf_checksum(unsigned char *mp, int len)
{
int sum = 0;
@@ -47,192 +50,86 @@ static int __init mpf_checksum(unsigned char *mp, int len)
static void __init MP_processor_info(struct mpc_cpu *m)
{
- int apicid;
char *bootup_cpu = "";
- if (!(m->cpuflag & CPU_ENABLED)) {
- disabled_cpus++;
+ topology_register_apic(m->apicid, CPU_ACPIID_INVALID, m->cpuflag & CPU_ENABLED);
+ if (!(m->cpuflag & CPU_ENABLED))
return;
- }
- if (x86_quirks->mpc_apic_id)
- apicid = x86_quirks->mpc_apic_id(m);
- else
- apicid = m->apicid;
-
- if (m->cpuflag & CPU_BOOTPROCESSOR) {
+ if (m->cpuflag & CPU_BOOTPROCESSOR)
bootup_cpu = " (Bootup-CPU)";
- boot_cpu_physical_apicid = m->apicid;
- }
- printk(KERN_INFO "Processor #%d%s\n", m->apicid, bootup_cpu);
- generic_processor_info(apicid, m->apicver);
+ pr_info("Processor #%d%s\n", m->apicid, bootup_cpu);
+ num_procs++;
}
#ifdef CONFIG_X86_IO_APIC
-static void __init MP_bus_info(struct mpc_bus *m)
+static void __init mpc_oem_bus_info(struct mpc_bus *m, char *str)
{
- char str[7];
memcpy(str, m->bustype, 6);
str[6] = 0;
+ apic_pr_verbose("Bus #%d is %s\n", m->busid, str);
+}
- if (x86_quirks->mpc_oem_bus_info)
- x86_quirks->mpc_oem_bus_info(m, str);
- else
- apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str);
+static void __init MP_bus_info(struct mpc_bus *m)
+{
+ char str[7];
+
+ mpc_oem_bus_info(m, str);
#if MAX_MP_BUSSES < 256
if (m->busid >= MAX_MP_BUSSES) {
- printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
- " is too large, max. supported is %d\n",
- m->busid, str, MAX_MP_BUSSES - 1);
+ pr_warn("MP table busid value (%d) for bustype %s is too large, max. supported is %d\n",
+ m->busid, str, MAX_MP_BUSSES - 1);
return;
}
#endif
+ set_bit(m->busid, mp_bus_not_pci);
if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) {
- set_bit(m->busid, mp_bus_not_pci);
-#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+#ifdef CONFIG_EISA
mp_bus_id_to_type[m->busid] = MP_BUS_ISA;
#endif
} else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
- if (x86_quirks->mpc_oem_pci_bus)
- x86_quirks->mpc_oem_pci_bus(m);
-
clear_bit(m->busid, mp_bus_not_pci);
-#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+#ifdef CONFIG_EISA
mp_bus_id_to_type[m->busid] = MP_BUS_PCI;
} else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
mp_bus_id_to_type[m->busid] = MP_BUS_EISA;
- } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA) - 1) == 0) {
- mp_bus_id_to_type[m->busid] = MP_BUS_MCA;
#endif
} else
- printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
-}
-
-static int bad_ioapic(unsigned long address)
-{
- if (nr_ioapics >= MAX_IO_APICS) {
- printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
- "(found %d)\n", MAX_IO_APICS, nr_ioapics);
- panic("Recompile kernel with bigger MAX_IO_APICS!\n");
- }
- if (!address) {
- printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
- " found in table, skipping!\n");
- return 1;
- }
- return 0;
+ pr_warn("Unknown bustype %s - ignoring\n", str);
}
static void __init MP_ioapic_info(struct mpc_ioapic *m)
{
- if (!(m->flags & MPC_APIC_USABLE))
- return;
-
- printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
- m->apicid, m->apicver, m->apicaddr);
+ struct ioapic_domain_cfg cfg = {
+ .type = IOAPIC_DOMAIN_LEGACY,
+ .ops = &mp_ioapic_irqdomain_ops,
+ };
- if (bad_ioapic(m->apicaddr))
- return;
-
- mp_ioapics[nr_ioapics].apicaddr = m->apicaddr;
- mp_ioapics[nr_ioapics].apicid = m->apicid;
- mp_ioapics[nr_ioapics].type = m->type;
- mp_ioapics[nr_ioapics].apicver = m->apicver;
- mp_ioapics[nr_ioapics].flags = m->flags;
- nr_ioapics++;
-}
-
-static void print_MP_intsrc_info(struct mpc_intsrc *m)
-{
- apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
- " IRQ %02x, APIC ID %x, APIC INT %02x\n",
- m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus,
- m->srcbusirq, m->dstapic, m->dstirq);
+ if (m->flags & MPC_APIC_USABLE)
+ mp_register_ioapic(m->apicid, m->apicaddr, gsi_top, &cfg);
}
static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq)
{
- apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
- " IRQ %02x, APIC ID %x, APIC INT %02x\n",
+ apic_printk(APIC_VERBOSE,
+ "Int: type %d, pol %d, trig %d, bus %02x, IRQ %02x, APIC ID %x, APIC INT %02x\n",
mp_irq->irqtype, mp_irq->irqflag & 3,
(mp_irq->irqflag >> 2) & 3, mp_irq->srcbus,
mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq);
}
-static void __init assign_to_mp_irq(struct mpc_intsrc *m,
- struct mpc_intsrc *mp_irq)
-{
- mp_irq->dstapic = m->dstapic;
- mp_irq->type = m->type;
- mp_irq->irqtype = m->irqtype;
- mp_irq->irqflag = m->irqflag;
- mp_irq->srcbus = m->srcbus;
- mp_irq->srcbusirq = m->srcbusirq;
- mp_irq->dstirq = m->dstirq;
-}
-
-static void __init assign_to_mpc_intsrc(struct mpc_intsrc *mp_irq,
- struct mpc_intsrc *m)
-{
- m->dstapic = mp_irq->dstapic;
- m->type = mp_irq->type;
- m->irqtype = mp_irq->irqtype;
- m->irqflag = mp_irq->irqflag;
- m->srcbus = mp_irq->srcbus;
- m->srcbusirq = mp_irq->srcbusirq;
- m->dstirq = mp_irq->dstirq;
-}
-
-static int __init mp_irq_mpc_intsrc_cmp(struct mpc_intsrc *mp_irq,
- struct mpc_intsrc *m)
-{
- if (mp_irq->dstapic != m->dstapic)
- return 1;
- if (mp_irq->type != m->type)
- return 2;
- if (mp_irq->irqtype != m->irqtype)
- return 3;
- if (mp_irq->irqflag != m->irqflag)
- return 4;
- if (mp_irq->srcbus != m->srcbus)
- return 5;
- if (mp_irq->srcbusirq != m->srcbusirq)
- return 6;
- if (mp_irq->dstirq != m->dstirq)
- return 7;
-
- return 0;
-}
-
-static void __init MP_intsrc_info(struct mpc_intsrc *m)
-{
- int i;
-
- print_MP_intsrc_info(m);
-
- for (i = 0; i < mp_irq_entries; i++) {
- if (!mp_irq_mpc_intsrc_cmp(&mp_irqs[i], m))
- return;
- }
-
- assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
- if (++mp_irq_entries == MAX_IRQ_SOURCES)
- panic("Max # of irq sources exceeded!!\n");
-}
#else /* CONFIG_X86_IO_APIC */
static inline void __init MP_bus_info(struct mpc_bus *m) {}
static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {}
-static inline void __init MP_intsrc_info(struct mpc_intsrc *m) {}
#endif /* CONFIG_X86_IO_APIC */
-
static void __init MP_lintsrc_info(struct mpc_lintsrc *m)
{
- apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x,"
- " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
+ apic_printk(APIC_VERBOSE,
+ "Lint: type %d, pol %d, trig %d, bus %02x, IRQ %02x, APIC ID %x, APIC LINT %02x\n",
m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbusid,
m->srcbusirq, m->destapic, m->destapiclint);
}
@@ -240,39 +137,37 @@ static void __init MP_lintsrc_info(struct mpc_lintsrc *m)
/*
* Read/parse the MPC
*/
-
static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str)
{
if (memcmp(mpc->signature, MPC_SIGNATURE, 4)) {
- printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n",
+ pr_err("MPTABLE: bad signature [%c%c%c%c]!\n",
mpc->signature[0], mpc->signature[1],
mpc->signature[2], mpc->signature[3]);
return 0;
}
if (mpf_checksum((unsigned char *)mpc, mpc->length)) {
- printk(KERN_ERR "MPTABLE: checksum error!\n");
+ pr_err("MPTABLE: checksum error!\n");
return 0;
}
if (mpc->spec != 0x01 && mpc->spec != 0x04) {
- printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n",
- mpc->spec);
+ pr_err("MPTABLE: bad table version (%d)!!\n", mpc->spec);
return 0;
}
if (!mpc->lapic) {
- printk(KERN_ERR "MPTABLE: null local APIC address!\n");
+ pr_err("MPTABLE: null local APIC address!\n");
return 0;
}
memcpy(oem, mpc->oem, 8);
oem[8] = 0;
- printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem);
+ pr_info("MPTABLE: OEM ID: %s\n", oem);
memcpy(str, mpc->productid, 12);
str[12] = 0;
- printk(KERN_INFO "MPTABLE: Product ID: %s\n", str);
+ pr_info("MPTABLE: Product ID: %s\n", str);
- printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->lapic);
+ pr_info("MPTABLE: APIC at: 0x%X\n", mpc->lapic);
return 1;
}
@@ -285,8 +180,8 @@ static void skip_entry(unsigned char **ptr, int *count, int size)
static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt)
{
- printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"
- "type %x\n", *mpt);
+ pr_err("Your mptable is wrong, contact your HW vendor!\n");
+ pr_cont("type %x\n", *mpt);
print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
1, mpc, mpc->length, 1);
}
@@ -302,27 +197,14 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
if (!smp_check_mpc(mpc, oem, str))
return 0;
-#ifdef CONFIG_X86_32
- generic_mps_oem_check(mpc, oem, str);
-#endif
- /* save the local APIC address, it might be non-default */
- if (!acpi_lapic)
- mp_lapic_addr = mpc->lapic;
-
- if (early)
+ if (early) {
+ /* Initialize the lapic mapping */
+ if (!acpi_lapic)
+ register_lapic_address(mpc->lapic);
return 1;
-
- if (mpc->oemptr && x86_quirks->smp_read_mpc_oem) {
- struct mpc_oemtable *oem_table = (void *)(long)mpc->oemptr;
- x86_quirks->smp_read_mpc_oem(oem_table, mpc->oemsize);
}
- /*
- * Now process the configuration blocks.
- */
- if (x86_quirks->mpc_record)
- *x86_quirks->mpc_record = 0;
-
+ /* Now process the configuration blocks. */
while (count < mpc->length) {
switch (*mpt) {
case MP_PROCESSOR:
@@ -340,7 +222,7 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
skip_entry(&mpt, &count, sizeof(struct mpc_ioapic));
break;
case MP_INTSRC:
- MP_intsrc_info((struct mpc_intsrc *)mpt);
+ mp_save_irq((struct mpc_intsrc *)mpt);
skip_entry(&mpt, &count, sizeof(struct mpc_intsrc));
break;
case MP_LINTSRC:
@@ -353,20 +235,11 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
count = mpc->length;
break;
}
- if (x86_quirks->mpc_record)
- (*x86_quirks->mpc_record)++;
}
-#ifdef CONFIG_X86_BIGSMP
- generic_bigsmp_probe();
-#endif
-
- if (apic->setup_apic_routing)
- apic->setup_apic_routing();
-
- if (!num_processors)
- printk(KERN_ERR "MPTABLE: no processors registered!\n");
- return num_processors;
+ if (!num_procs && !acpi_lapic)
+ pr_err("MPTABLE: no processors registered!\n");
+ return num_procs || acpi_lapic;
}
#ifdef CONFIG_X86_IO_APIC
@@ -375,7 +248,7 @@ static int __init ELCR_trigger(unsigned int irq)
{
unsigned int port;
- port = 0x4d0 + (irq >> 3);
+ port = PIC_ELCR1 + (irq >> 3);
return (inb(port) >> (irq & 7)) & 1;
}
@@ -386,9 +259,9 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
int ELCR_fallback = 0;
intsrc.type = MP_INTSRC;
- intsrc.irqflag = 0; /* conforming */
+ intsrc.irqflag = MP_IRQTRIG_DEFAULT | MP_IRQPOL_DEFAULT;
intsrc.srcbus = 0;
- intsrc.dstapic = mp_ioapics[0].apicid;
+ intsrc.dstapic = mpc_ioapic_id(0);
intsrc.irqtype = mp_INT;
@@ -401,16 +274,13 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
* If it does, we assume it's valid.
*/
if (mpc_default_type == 5) {
- printk(KERN_INFO "ISA/PCI bus type with no IRQ information... "
- "falling back to ELCR\n");
+ pr_info("ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) ||
ELCR_trigger(13))
- printk(KERN_ERR "ELCR contains invalid data... "
- "not using ELCR\n");
+ pr_err("ELCR contains invalid data... not using ELCR\n");
else {
- printk(KERN_INFO
- "Using ELCR to identify PCI interrupts\n");
+ pr_info("Using ELCR to identify PCI interrupts\n");
ELCR_fallback = 1;
}
}
@@ -420,7 +290,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
case 2:
if (i == 0 || i == 13)
continue; /* IRQ0 & IRQ13 not connected */
- /* fall through */
+ fallthrough;
default:
if (i == 2)
continue; /* IRQ2 is never connected */
@@ -432,21 +302,24 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
* copy that information over to the MP table in the
* irqflag field (level sensitive, active high polarity).
*/
- if (ELCR_trigger(i))
- intsrc.irqflag = 13;
- else
- intsrc.irqflag = 0;
+ if (ELCR_trigger(i)) {
+ intsrc.irqflag = MP_IRQTRIG_LEVEL |
+ MP_IRQPOL_ACTIVE_HIGH;
+ } else {
+ intsrc.irqflag = MP_IRQTRIG_DEFAULT |
+ MP_IRQPOL_DEFAULT;
+ }
}
intsrc.srcbusirq = i;
intsrc.dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
- MP_intsrc_info(&intsrc);
+ mp_save_irq(&intsrc);
}
intsrc.irqtype = mp_ExtINT;
intsrc.srcbusirq = 0;
intsrc.dstirq = 0; /* 8259A to INTIN0 */
- MP_intsrc_info(&intsrc);
+ mp_save_irq(&intsrc);
}
@@ -459,9 +332,9 @@ static void __init construct_ioapic_table(int mpc_default_type)
bus.busid = 0;
switch (mpc_default_type) {
default:
- printk(KERN_ERR "???\nUnknown standard configuration %d\n",
+ pr_err("???\nUnknown standard configuration %d\n",
mpc_default_type);
- /* fall through */
+ fallthrough;
case 1:
case 5:
memcpy(bus.bustype, "ISA ", 6);
@@ -471,9 +344,6 @@ static void __init construct_ioapic_table(int mpc_default_type)
case 3:
memcpy(bus.bustype, "EISA ", 6);
break;
- case 4:
- case 7:
- memcpy(bus.bustype, "MCA ", 6);
}
MP_bus_info(&bus);
if (mpc_default_type > 4) {
@@ -482,11 +352,11 @@ static void __init construct_ioapic_table(int mpc_default_type)
MP_bus_info(&bus);
}
- ioapic.type = MP_IOAPIC;
- ioapic.apicid = 2;
- ioapic.apicver = mpc_default_type > 4 ? 0x10 : 0x01;
- ioapic.flags = MPC_APIC_USABLE;
- ioapic.apicaddr = 0xFEC00000;
+ ioapic.type = MP_IOAPIC;
+ ioapic.apicid = 2;
+ ioapic.apicver = mpc_default_type > 4 ? 0x10 : 0x01;
+ ioapic.flags = MPC_APIC_USABLE;
+ ioapic.apicaddr = IO_APIC_DEFAULT_PHYS_BASE;
MP_ioapic_info(&ioapic);
/*
@@ -506,11 +376,6 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
int i;
/*
- * local APIC has default address
- */
- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-
- /*
* 2 CPUs, numbered 0 & 1.
*/
processor.type = MP_PROCESSOR;
@@ -518,8 +383,8 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
processor.apicver = mpc_default_type > 4 ? 0x10 : 0x01;
processor.cpuflag = CPU_ENABLED;
processor.cpufeature = (boot_cpu_data.x86 << 8) |
- (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
- processor.featureflag = boot_cpu_data.x86_capability[0];
+ (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_stepping;
+ processor.featureflag = boot_cpu_data.x86_capability[CPUID_1_EDX];
processor.reserved[0] = 0;
processor.reserved[1] = 0;
for (i = 0; i < 2; i++) {
@@ -530,7 +395,7 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
construct_ioapic_table(mpc_default_type);
lintsrc.type = MP_LINTSRC;
- lintsrc.irqflag = 0; /* conforming */
+ lintsrc.irqflag = MP_IRQTRIG_DEFAULT | MP_IRQPOL_DEFAULT;
lintsrc.srcbusid = 0;
lintsrc.srcbusirq = 0;
lintsrc.destapic = MP_APIC_ALL;
@@ -541,17 +406,18 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
}
}
-static struct mpf_intel *mpf_found;
+static unsigned long mpf_base;
+static bool mpf_found;
static unsigned long __init get_mpc_size(unsigned long physptr)
{
struct mpc_table *mpc;
unsigned long size;
- mpc = early_ioremap(physptr, PAGE_SIZE);
+ mpc = early_memremap(physptr, PAGE_SIZE);
size = mpc->length;
- early_iounmap(mpc, PAGE_SIZE);
- apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size);
+ early_memunmap(mpc, PAGE_SIZE);
+ apic_pr_verbose(" mpc: %lx-%lx\n", physptr, physptr + size);
return size;
}
@@ -562,7 +428,8 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
unsigned long size;
size = get_mpc_size(mpf->physptr);
- mpc = early_ioremap(mpf->physptr, size);
+ mpc = early_memremap(mpf->physptr, size);
+
/*
* Read the physical hardware table. Anything here will
* override the defaults.
@@ -571,12 +438,12 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
#ifdef CONFIG_X86_LOCAL_APIC
smp_found_config = 0;
#endif
- printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"
- "... disabling SMP support. (tell your hw vendor)\n");
- early_iounmap(mpc, size);
+ pr_err("BIOS bug, MP table errors detected!...\n");
+ pr_cont("... disabling SMP support. (tell your hw vendor)\n");
+ early_memunmap(mpc, size);
return -1;
}
- early_iounmap(mpc, size);
+ early_memunmap(mpc, size);
if (early)
return -1;
@@ -590,8 +457,7 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
if (!mp_irq_entries) {
struct mpc_bus bus;
- printk(KERN_ERR "BIOS bug, no explicit IRQ entries, "
- "using default mptable. (tell your hw vendor)\n");
+ pr_err("BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
bus.type = MP_BUS;
bus.busid = 0;
@@ -608,11 +474,14 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
/*
* Scan the memory blocks for an SMP configuration block.
*/
-static void __init __get_smp_config(unsigned int early)
+static __init void mpparse_get_smp_config(unsigned int early)
{
- struct mpf_intel *mpf = mpf_found;
+ struct mpf_intel *mpf;
+
+ if (!smp_found_config)
+ return;
- if (!mpf)
+ if (!mpf_found)
return;
if (acpi_lapic && early)
@@ -625,97 +494,77 @@ static void __init __get_smp_config(unsigned int early)
if (acpi_lapic && acpi_ioapic)
return;
- if (x86_quirks->mach_get_smp_config) {
- if (x86_quirks->mach_get_smp_config(early))
- return;
+ mpf = early_memremap(mpf_base, sizeof(*mpf));
+ if (!mpf) {
+ pr_err("MPTABLE: error mapping MP table\n");
+ return;
}
- printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
- mpf->specification);
+ pr_info("Intel MultiProcessor Specification v1.%d\n",
+ mpf->specification);
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
if (mpf->feature2 & (1 << 7)) {
- printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
+ pr_info(" IMCR and PIC compatibility mode.\n");
pic_mode = 1;
} else {
- printk(KERN_INFO " Virtual Wire compatibility mode.\n");
+ pr_info(" Virtual Wire compatibility mode.\n");
pic_mode = 0;
}
#endif
/*
* Now see if we need to read further.
*/
- if (mpf->feature1 != 0) {
+ if (mpf->feature1) {
if (early) {
- /*
- * local APIC has default address
- */
- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
- return;
+ /* Local APIC has default address */
+ register_lapic_address(APIC_DEFAULT_PHYS_BASE);
+ goto out;
}
- printk(KERN_INFO "Default MP configuration #%d\n",
- mpf->feature1);
+ pr_info("Default MP configuration #%d\n", mpf->feature1);
construct_default_ISA_mptable(mpf->feature1);
} else if (mpf->physptr) {
if (check_physptr(mpf, early))
- return;
+ goto out;
} else
BUG();
- if (!early)
- printk(KERN_INFO "Processors: %d\n", num_processors);
+ if (!early && !acpi_lapic)
+ pr_info("Processors: %d\n", num_procs);
/*
* Only use the first configuration found.
*/
+out:
+ early_memunmap(mpf, sizeof(*mpf));
}
-void __init early_get_smp_config(void)
+void __init mpparse_parse_early_smp_config(void)
{
- __get_smp_config(1);
+ mpparse_get_smp_config(true);
}
-void __init get_smp_config(void)
+void __init mpparse_parse_smp_config(void)
{
- __get_smp_config(0);
+ mpparse_get_smp_config(false);
}
-static void __init smp_reserve_bootmem(struct mpf_intel *mpf)
+static void __init smp_reserve_memory(struct mpf_intel *mpf)
{
- unsigned long size = get_mpc_size(mpf->physptr);
-#ifdef CONFIG_X86_32
- /*
- * We cannot access to MPC table to compute table size yet,
- * as only few megabytes from the bottom is mapped now.
- * PC-9800's MPC table places on the very last of physical
- * memory; so that simply reserving PAGE_SIZE from mpf->physptr
- * yields BUG() in reserve_bootmem.
- * also need to make sure physptr is below than max_low_pfn
- * we don't need reserve the area above max_low_pfn
- */
- unsigned long end = max_low_pfn * PAGE_SIZE;
-
- if (mpf->physptr < end) {
- if (mpf->physptr + size > end)
- size = end - mpf->physptr;
- reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
- }
-#else
- reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
-#endif
+ memblock_reserve(mpf->physptr, get_mpc_size(mpf->physptr));
}
-static int __init smp_scan_config(unsigned long base, unsigned long length,
- unsigned reserve)
+static int __init smp_scan_config(unsigned long base, unsigned long length)
{
- unsigned int *bp = phys_to_virt(base);
+ unsigned int *bp;
struct mpf_intel *mpf;
+ int ret = 0;
- apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
- bp, length);
+ apic_pr_verbose("Scan for SMP in [mem %#010lx-%#010lx]\n", base, base + length - 1);
BUILD_BUG_ON(sizeof(*mpf) != 16);
while (length > 0) {
+ bp = early_memremap(base, length);
mpf = (struct mpf_intel *)bp;
if ((*bp == SMP_MAGIC_IDENT) &&
(mpf->length == 1) &&
@@ -725,34 +574,33 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
#ifdef CONFIG_X86_LOCAL_APIC
smp_found_config = 1;
#endif
- mpf_found = mpf;
+ mpf_base = base;
+ mpf_found = true;
- printk(KERN_INFO "found SMP MP-table at [%p] %llx\n",
- mpf, (u64)virt_to_phys(mpf));
+ pr_info("found SMP MP-table at [mem %#010lx-%#010lx]\n",
+ base, base + sizeof(*mpf) - 1);
- if (!reserve)
- return 1;
- reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf),
- BOOTMEM_DEFAULT);
+ memblock_reserve(base, sizeof(*mpf));
if (mpf->physptr)
- smp_reserve_bootmem(mpf);
+ smp_reserve_memory(mpf);
- return 1;
+ ret = 1;
}
- bp += 4;
+ early_memunmap(bp, length);
+
+ if (ret)
+ break;
+
+ base += 16;
length -= 16;
}
- return 0;
+ return ret;
}
-static void __init __find_smp_config(unsigned int reserve)
+void __init mpparse_find_mptable(void)
{
unsigned int address;
- if (x86_quirks->mach_find_smp_config) {
- if (x86_quirks->mach_find_smp_config(reserve))
- return;
- }
/*
* FIXME: Linux assumes you have 640K of base ram..
* this continues the error...
@@ -761,13 +609,13 @@ static void __init __find_smp_config(unsigned int reserve)
* 2) Scan the top 1K of base RAM
* 3) Scan the 64K of bios
*/
- if (smp_scan_config(0x0, 0x400, reserve) ||
- smp_scan_config(639 * 0x400, 0x400, reserve) ||
- smp_scan_config(0xF0000, 0x10000, reserve))
+ if (smp_scan_config(0x0, 0x400) ||
+ smp_scan_config(639 * 0x400, 0x400) ||
+ smp_scan_config(0xF0000, 0x10000))
return;
/*
* If it is an SMP machine we should know now, unless the
- * configuration is in an EISA/MCA bus machine with an
+ * configuration is in an EISA bus machine with an
* extended bios data area.
*
* there is a real-mode segmented pointer pointing to the
@@ -784,17 +632,7 @@ static void __init __find_smp_config(unsigned int reserve)
address = get_bios_ebda();
if (address)
- smp_scan_config(address, 0x400, reserve);
-}
-
-void __init early_find_smp_config(void)
-{
- __find_smp_config(0);
-}
-
-void __init find_smp_config(void)
-{
- __find_smp_config(1);
+ smp_scan_config(address, 0x400);
}
#ifdef CONFIG_X86_IO_APIC
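The scan above walks candidate regions in 16-byte steps looking for the "_MP_" magic and a structure whose bytes sum to zero modulo 256. A standalone sketch over a synthetic buffer (the kernel maps physical memory with early_memremap() and additionally checks the length and specification fields):

/*
 * User-space sketch of the MP floating-pointer scan: 16-byte strides,
 * match the magic, verify the 16-byte checksum.  The planted entry is
 * synthetic and does not model the full struct mpf_intel layout.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define SMP_MAGIC_IDENT (('_' << 24) | ('P' << 16) | ('M' << 8) | '_')

static int checksum_ok(const uint8_t *p, size_t len)
{
	unsigned int sum = 0;

	while (len--)
		sum += *p++;
	return (sum & 0xff) == 0;
}

int main(void)
{
	uint8_t mem[4096] = { 0 };
	uint32_t magic = SMP_MAGIC_IDENT;
	unsigned int sum = 0;

	/* Plant a fake 16-byte entry at offset 0x400: the magic plus a
	 * final byte chosen so the block sums to 0 mod 256. */
	memcpy(&mem[0x400], &magic, sizeof(magic));
	for (int i = 0; i < 15; i++)
		sum += mem[0x400 + i];
	mem[0x400 + 15] = (uint8_t)(0x100 - (sum & 0xff));

	for (size_t off = 0; off + 16 <= sizeof(mem); off += 16) {
		uint32_t sig;

		memcpy(&sig, &mem[off], sizeof(sig));
		if (sig == SMP_MAGIC_IDENT && checksum_ok(&mem[off], 16)) {
			printf("found MP-table pointer at offset %#zx\n", off);
			break;
		}
	}
	return 0;
}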
@@ -807,7 +645,7 @@ static int __init get_MP_intsrc_index(struct mpc_intsrc *m)
if (m->irqtype != mp_INT)
return 0;
- if (m->irqflag != 0x0f)
+ if (m->irqflag != (MP_IRQTRIG_LEVEL | MP_IRQPOL_ACTIVE_LOW))
return 0;
/* not legacy */
@@ -816,7 +654,8 @@ static int __init get_MP_intsrc_index(struct mpc_intsrc *m)
if (mp_irqs[i].irqtype != mp_INT)
continue;
- if (mp_irqs[i].irqflag != 0x0f)
+ if (mp_irqs[i].irqflag != (MP_IRQTRIG_LEVEL |
+ MP_IRQPOL_ACTIVE_LOW))
continue;
if (mp_irqs[i].srcbus != m->srcbus)
@@ -843,13 +682,13 @@ static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
{
int i;
- apic_printk(APIC_VERBOSE, "OLD ");
- print_MP_intsrc_info(m);
+ apic_pr_verbose("OLD ");
+ print_mp_irq_info(m);
i = get_MP_intsrc_index(m);
if (i > 0) {
- assign_to_mpc_intsrc(&mp_irqs[i], m);
- apic_printk(APIC_VERBOSE, "NEW ");
+ memcpy(m, &mp_irqs[i], sizeof(*m));
+ apic_pr_verbose("NEW ");
print_mp_irq_info(&mp_irqs[i]);
return;
}
@@ -866,23 +705,21 @@ static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
*nr_m_spare += 1;
}
}
-#else /* CONFIG_X86_IO_APIC */
-static
-inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
-#endif /* CONFIG_X86_IO_APIC */
-static int
+static int __init
check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count)
{
- int ret = 0;
-
if (!mpc_new_phys || count <= mpc_new_length) {
WARN(1, "update_mptable: No spare slots (length: %x)\n", count);
return -1;
}
- return ret;
+ return 0;
}
+#else /* CONFIG_X86_IO_APIC */
+static
+inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
+#endif /* CONFIG_X86_IO_APIC */
static int __init replace_intsrc_all(struct mpc_table *mpc,
unsigned long mpc_new_phys,
@@ -895,7 +732,7 @@ static int __init replace_intsrc_all(struct mpc_table *mpc,
int nr_m_spare = 0;
unsigned char *mpt = ((unsigned char *)mpc) + count;
- printk(KERN_INFO "mpc_length %x\n", mpc->length);
+ pr_info("mpc_length %x\n", mpc->length);
while (count < mpc->length) {
switch (*mpt) {
case MP_PROCESSOR:
@@ -929,20 +766,21 @@ static int __init replace_intsrc_all(struct mpc_table *mpc,
if (mp_irqs[i].irqtype != mp_INT)
continue;
- if (mp_irqs[i].irqflag != 0x0f)
+ if (mp_irqs[i].irqflag != (MP_IRQTRIG_LEVEL |
+ MP_IRQPOL_ACTIVE_LOW))
continue;
if (nr_m_spare > 0) {
- apic_printk(APIC_VERBOSE, "*NEW* found\n");
+ apic_pr_verbose("*NEW* found\n");
nr_m_spare--;
- assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]);
+ memcpy(m_spare[nr_m_spare], &mp_irqs[i], sizeof(mp_irqs[i]));
m_spare[nr_m_spare] = NULL;
} else {
struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
count += sizeof(struct mpc_intsrc);
if (check_slot(mpc_new_phys, mpc_new_length, count) < 0)
goto out;
- assign_to_mpc_intsrc(&mp_irqs[i], m);
+ memcpy(m, &mp_irqs[i], sizeof(*m));
mpc->length = count;
mpt += sizeof(struct mpc_intsrc);
}
@@ -988,15 +826,10 @@ static int __init parse_alloc_mptable_opt(char *p)
}
early_param("alloc_mptable", parse_alloc_mptable_opt);
-void __init early_reserve_e820_mpc_new(void)
+void __init e820__memblock_alloc_reserved_mpc_new(void)
{
- if (enable_update_mptable && alloc_mptable) {
- u64 startt = 0;
-#ifdef CONFIG_X86_TRAMPOLINE
- startt = TRAMPOLINE_BASE;
-#endif
- mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4);
- }
+ if (enable_update_mptable && alloc_mptable)
+ mpc_new_phys = e820__memblock_alloc_reserved(mpc_new_length, 4);
}
static int __init update_mp_table(void)
@@ -1005,67 +838,89 @@ static int __init update_mp_table(void)
char oem[10];
struct mpf_intel *mpf;
struct mpc_table *mpc, *mpc_new;
+ unsigned long size;
if (!enable_update_mptable)
return 0;
- mpf = mpf_found;
- if (!mpf)
+ if (!mpf_found)
return 0;
+ mpf = early_memremap(mpf_base, sizeof(*mpf));
+ if (!mpf) {
+ pr_err("MPTABLE: mpf early_memremap() failed\n");
+ return 0;
+ }
+
/*
* Now see if we need to go further.
*/
- if (mpf->feature1 != 0)
- return 0;
+ if (mpf->feature1)
+ goto do_unmap_mpf;
if (!mpf->physptr)
- return 0;
+ goto do_unmap_mpf;
- mpc = phys_to_virt(mpf->physptr);
+ size = get_mpc_size(mpf->physptr);
+ mpc = early_memremap(mpf->physptr, size);
+ if (!mpc) {
+ pr_err("MPTABLE: mpc early_memremap() failed\n");
+ goto do_unmap_mpf;
+ }
if (!smp_check_mpc(mpc, oem, str))
- return 0;
+ goto do_unmap_mpc;
- printk(KERN_INFO "mpf: %llx\n", (u64)virt_to_phys(mpf));
- printk(KERN_INFO "physptr: %x\n", mpf->physptr);
+ pr_info("mpf: %llx\n", (u64)mpf_base);
+ pr_info("physptr: %x\n", mpf->physptr);
if (mpc_new_phys && mpc->length > mpc_new_length) {
mpc_new_phys = 0;
- printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n",
- mpc_new_length);
+ pr_info("mpc_new_length is %ld, please use alloc_mptable=8k\n",
+ mpc_new_length);
}
if (!mpc_new_phys) {
unsigned char old, new;
- /* check if we can change the postion */
+ /* check if we can change the position */
mpc->checksum = 0;
old = mpf_checksum((unsigned char *)mpc, mpc->length);
mpc->checksum = 0xff;
new = mpf_checksum((unsigned char *)mpc, mpc->length);
if (old == new) {
- printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n");
- return 0;
+ pr_info("mpc is readonly, please try alloc_mptable instead\n");
+ goto do_unmap_mpc;
}
- printk(KERN_INFO "use in-positon replacing\n");
+ pr_info("use in-position replacing\n");
} else {
+ mpc_new = early_memremap(mpc_new_phys, mpc_new_length);
+ if (!mpc_new) {
+ pr_err("MPTABLE: new mpc early_memremap() failed\n");
+ goto do_unmap_mpc;
+ }
mpf->physptr = mpc_new_phys;
- mpc_new = phys_to_virt(mpc_new_phys);
memcpy(mpc_new, mpc, mpc->length);
+ early_memunmap(mpc, size);
mpc = mpc_new;
+ size = mpc_new_length;
/* check if we can modify that */
if (mpc_new_phys - mpf->physptr) {
struct mpf_intel *mpf_new;
/* steal 16 bytes from [0, 1k) */
- printk(KERN_INFO "mpf new: %x\n", 0x400 - 16);
- mpf_new = phys_to_virt(0x400 - 16);
+ mpf_new = early_memremap(0x400 - 16, sizeof(*mpf_new));
+ if (!mpf_new) {
+ pr_err("MPTABLE: new mpf early_memremap() failed\n");
+ goto do_unmap_mpc;
+ }
+ pr_info("mpf new: %x\n", 0x400 - 16);
memcpy(mpf_new, mpf, 16);
+ early_memunmap(mpf, sizeof(*mpf));
mpf = mpf_new;
mpf->physptr = mpc_new_phys;
}
mpf->checksum = 0;
mpf->checksum -= mpf_checksum((unsigned char *)mpf, 16);
- printk(KERN_INFO "physptr new: %x\n", mpf->physptr);
+ pr_info("physptr new: %x\n", mpf->physptr);
}
/*
@@ -1076,6 +931,12 @@ static int __init update_mp_table(void)
*/
replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length);
+do_unmap_mpc:
+ early_memunmap(mpc, size);
+
+do_unmap_mpf:
+ early_memunmap(mpf, sizeof(*mpf));
+
return 0;
}
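update_mp_table() probes whether the MP table can be edited in place by writing two different values into the checksum byte and recomputing the sum; if both passes agree, the write never landed and the table is read-only. A user-space sketch of that probe over ordinary (writable) memory:

/*
 * Sketch of the in-place writability probe: store 0x00 and then 0xff
 * into the checksum byte and compare the recomputed sums.  Here the
 * buffer is plain writable memory, so the probe reports "writable".
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

static unsigned int block_checksum(const uint8_t *p, size_t len)
{
	unsigned int sum = 0;

	while (len--)
		sum += *p++;
	return sum & 0xff;
}

int main(void)
{
	uint8_t table[64];
	uint8_t *checksum = &table[7];          /* illustrative offset */
	unsigned int old, cur;

	memset(table, 0xAB, sizeof(table));

	*checksum = 0x00;
	old = block_checksum(table, sizeof(table));
	*checksum = 0xff;
	cur = block_checksum(table, sizeof(table));

	if (old == cur)
		printf("table is read-only, cannot update in place\n");
	else
		printf("table is writable, fix checksum and update in place\n");
	return 0;
}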
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 98fd6cd4e3a4..4469c784eaa0 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* ----------------------------------------------------------------------- *
*
* Copyright 2000-2008 H. Peter Anvin - All Rights Reserved
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139,
- * USA; either version 2 of the License, or (at your option) any later
- * version; incorporated herein by reference.
+ * Copyright 2009 Intel Corporation; author: H. Peter Anvin
*
* ----------------------------------------------------------------------- */
@@ -21,6 +17,8 @@
* an SMP box will direct the access to CPU %d.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/types.h>
@@ -29,41 +27,27 @@
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/smp.h>
-#include <linux/smp_lock.h>
#include <linux/major.h>
#include <linux/fs.h>
#include <linux/device.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/uaccess.h>
+#include <linux/gfp.h>
+#include <linux/security.h>
-#include <asm/processor.h>
+#include <asm/cpufeature.h>
#include <asm/msr.h>
-#include <asm/system.h>
-static struct class *msr_class;
+static enum cpuhp_state cpuhp_msr_state;
-static loff_t msr_seek(struct file *file, loff_t offset, int orig)
-{
- loff_t ret;
- struct inode *inode = file->f_mapping->host;
-
- mutex_lock(&inode->i_mutex);
- switch (orig) {
- case 0:
- file->f_pos = offset;
- ret = file->f_pos;
- break;
- case 1:
- file->f_pos += offset;
- ret = file->f_pos;
- break;
- default:
- ret = -EINVAL;
- }
- mutex_unlock(&inode->i_mutex);
- return ret;
-}
+enum allow_write_msrs {
+ MSR_WRITES_ON,
+ MSR_WRITES_OFF,
+ MSR_WRITES_DEFAULT,
+};
+
+static enum allow_write_msrs allow_writes = MSR_WRITES_DEFAULT;
static ssize_t msr_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
@@ -71,7 +55,7 @@ static ssize_t msr_read(struct file *file, char __user *buf,
u32 __user *tmp = (u32 __user *) buf;
u32 data[2];
u32 reg = *ppos;
- int cpu = iminor(file->f_path.dentry->d_inode);
+ int cpu = iminor(file_inode(file));
int err = 0;
ssize_t bytes = 0;
@@ -80,11 +64,8 @@ static ssize_t msr_read(struct file *file, char __user *buf,
for (; count; count -= 8) {
err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]);
- if (err) {
- if (err == -EFAULT) /* Fix idiotic error code */
- err = -EIO;
+ if (err)
break;
- }
if (copy_to_user(tmp, &data, 8)) {
err = -EFAULT;
break;
@@ -96,16 +77,52 @@ static ssize_t msr_read(struct file *file, char __user *buf,
return bytes ? bytes : err;
}
+static int filter_write(u32 reg)
+{
+ /*
+ * MSRs writes usually happen all at once, and can easily saturate kmsg.
+ * Only allow one message every 30 seconds.
+ *
+ * It's possible to be smarter here and do it (for example) per-MSR, but
+ * it would certainly be more complex, and this is enough at least to
+ * avoid saturating the ring buffer.
+ */
+ static DEFINE_RATELIMIT_STATE(fw_rs, 30 * HZ, 1);
+
+ switch (allow_writes) {
+ case MSR_WRITES_ON: return 0;
+ case MSR_WRITES_OFF: return -EPERM;
+ default: break;
+ }
+
+ if (!__ratelimit(&fw_rs))
+ return 0;
+
+ pr_warn("Write to unrecognized MSR 0x%x by %s (pid: %d), tainting CPU_OUT_OF_SPEC.\n",
+ reg, current->comm, current->pid);
+ pr_warn("See https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/about for details.\n");
+
+ return 0;
+}
+
static ssize_t msr_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
const u32 __user *tmp = (const u32 __user *)buf;
u32 data[2];
u32 reg = *ppos;
- int cpu = iminor(file->f_path.dentry->d_inode);
+ int cpu = iminor(file_inode(file));
int err = 0;
ssize_t bytes = 0;
+ err = security_locked_down(LOCKDOWN_MSR);
+ if (err)
+ return err;
+
+ err = filter_write(reg);
+ if (err)
+ return err;
+
if (count % 8)
return -EINVAL; /* Invalid chunk size */
@@ -114,12 +131,13 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
err = -EFAULT;
break;
}
+
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+
err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]);
- if (err) {
- if (err == -EFAULT) /* Fix idiotic error code */
- err = -EIO;
+ if (err)
break;
- }
+
tmp += 2;
bytes += 8;
}
@@ -127,25 +145,80 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
return bytes ? bytes : err;
}
-static int msr_open(struct inode *inode, struct file *file)
+static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg)
{
- unsigned int cpu = iminor(file->f_path.dentry->d_inode);
- struct cpuinfo_x86 *c = &cpu_data(cpu);
- int ret = 0;
+ u32 __user *uregs = (u32 __user *)arg;
+ u32 regs[8];
+ int cpu = iminor(file_inode(file));
+ int err;
+
+ switch (ioc) {
+ case X86_IOC_RDMSR_REGS:
+ if (!(file->f_mode & FMODE_READ)) {
+ err = -EBADF;
+ break;
+ }
+ if (copy_from_user(&regs, uregs, sizeof(regs))) {
+ err = -EFAULT;
+ break;
+ }
+ err = rdmsr_safe_regs_on_cpu(cpu, regs);
+ if (err)
+ break;
+ if (copy_to_user(uregs, &regs, sizeof(regs)))
+ err = -EFAULT;
+ break;
- lock_kernel();
- cpu = iminor(file->f_path.dentry->d_inode);
+ case X86_IOC_WRMSR_REGS:
+ if (!(file->f_mode & FMODE_WRITE)) {
+ err = -EBADF;
+ break;
+ }
+ if (copy_from_user(&regs, uregs, sizeof(regs))) {
+ err = -EFAULT;
+ break;
+ }
+ err = security_locked_down(LOCKDOWN_MSR);
+ if (err)
+ break;
- if (cpu >= nr_cpu_ids || !cpu_online(cpu)) {
- ret = -ENXIO; /* No such CPU */
- goto out;
+ err = filter_write(regs[1]);
+ if (err)
+ return err;
+
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+
+ err = wrmsr_safe_regs_on_cpu(cpu, regs);
+ if (err)
+ break;
+ if (copy_to_user(uregs, &regs, sizeof(regs)))
+ err = -EFAULT;
+ break;
+
+ default:
+ err = -ENOTTY;
+ break;
}
+
+ return err;
+}
+
+static int msr_open(struct inode *inode, struct file *file)
+{
+ unsigned int cpu = iminor(file_inode(file));
+ struct cpuinfo_x86 *c;
+
+ if (!capable(CAP_SYS_RAWIO))
+ return -EPERM;
+
+ if (cpu >= nr_cpu_ids || !cpu_online(cpu))
+ return -ENXIO; /* No such CPU */
+
c = &cpu_data(cpu);
if (!cpu_has(c, X86_FEATURE_MSR))
- ret = -EIO; /* MSR not supported */
-out:
- unlock_kernel();
- return ret;
+ return -EIO; /* MSR not supported */
+
+ return 0;
}
/*
@@ -153,105 +226,109 @@ out:
*/
static const struct file_operations msr_fops = {
.owner = THIS_MODULE,
- .llseek = msr_seek,
+ .llseek = no_seek_end_llseek,
.read = msr_read,
.write = msr_write,
.open = msr_open,
+ .unlocked_ioctl = msr_ioctl,
+ .compat_ioctl = msr_ioctl,
};
-static int __cpuinit msr_device_create(int cpu)
+static char *msr_devnode(const struct device *dev, umode_t *mode)
{
- struct device *dev;
-
- dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), NULL,
- "msr%d", cpu);
- return IS_ERR(dev) ? PTR_ERR(dev) : 0;
+ return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));
}
-static void msr_device_destroy(int cpu)
-{
- device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu));
-}
+static const struct class msr_class = {
+ .name = "msr",
+ .devnode = msr_devnode,
+};
-static int __cpuinit msr_class_cpu_callback(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
+static int msr_device_create(unsigned int cpu)
{
- unsigned int cpu = (unsigned long)hcpu;
- int err = 0;
+ struct device *dev;
- switch (action) {
- case CPU_UP_PREPARE:
- err = msr_device_create(cpu);
- break;
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- case CPU_DEAD:
- msr_device_destroy(cpu);
- break;
- }
- return err ? NOTIFY_BAD : NOTIFY_OK;
+ dev = device_create(&msr_class, NULL, MKDEV(MSR_MAJOR, cpu), NULL,
+ "msr%d", cpu);
+ return PTR_ERR_OR_ZERO(dev);
}
-static struct notifier_block __refdata msr_class_cpu_notifier = {
- .notifier_call = msr_class_cpu_callback,
-};
-
-static char *msr_nodename(struct device *dev)
+static int msr_device_destroy(unsigned int cpu)
{
- return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));
+ device_destroy(&msr_class, MKDEV(MSR_MAJOR, cpu));
+ return 0;
}
static int __init msr_init(void)
{
- int i, err = 0;
- i = 0;
-
- if (register_chrdev(MSR_MAJOR, "cpu/msr", &msr_fops)) {
- printk(KERN_ERR "msr: unable to get major %d for msr\n",
- MSR_MAJOR);
- err = -EBUSY;
- goto out;
+ int err;
+
+ if (__register_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr", &msr_fops)) {
+ pr_err("unable to get major %d for msr\n", MSR_MAJOR);
+ return -EBUSY;
}
- msr_class = class_create(THIS_MODULE, "msr");
- if (IS_ERR(msr_class)) {
- err = PTR_ERR(msr_class);
+ err = class_register(&msr_class);
+ if (err)
goto out_chrdev;
- }
- msr_class->nodename = msr_nodename;
- for_each_online_cpu(i) {
- err = msr_device_create(i);
- if (err != 0)
- goto out_class;
- }
- register_hotcpu_notifier(&msr_class_cpu_notifier);
- err = 0;
- goto out;
+ err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/msr:online",
+ msr_device_create, msr_device_destroy);
+ if (err < 0)
+ goto out_class;
+ cpuhp_msr_state = err;
+ return 0;
out_class:
- i = 0;
- for_each_online_cpu(i)
- msr_device_destroy(i);
- class_destroy(msr_class);
+ class_unregister(&msr_class);
out_chrdev:
- unregister_chrdev(MSR_MAJOR, "cpu/msr");
-out:
+ __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
return err;
}
+module_init(msr_init);
static void __exit msr_exit(void)
{
- int cpu = 0;
- for_each_online_cpu(cpu)
- msr_device_destroy(cpu);
- class_destroy(msr_class);
- unregister_chrdev(MSR_MAJOR, "cpu/msr");
- unregister_hotcpu_notifier(&msr_class_cpu_notifier);
+ cpuhp_remove_state(cpuhp_msr_state);
+ class_unregister(&msr_class);
+ __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
}
-
-module_init(msr_init);
module_exit(msr_exit)
+static int set_allow_writes(const char *val, const struct kernel_param *cp)
+{
+ /* val is NUL-terminated, see kernfs_fop_write() */
+ char *s = strstrip((char *)val);
+
+ if (!strcmp(s, "on"))
+ allow_writes = MSR_WRITES_ON;
+ else if (!strcmp(s, "off"))
+ allow_writes = MSR_WRITES_OFF;
+ else
+ allow_writes = MSR_WRITES_DEFAULT;
+
+ return 0;
+}
+
+static int get_allow_writes(char *buf, const struct kernel_param *kp)
+{
+ const char *res;
+
+ switch (allow_writes) {
+ case MSR_WRITES_ON: res = "on"; break;
+ case MSR_WRITES_OFF: res = "off"; break;
+ default: res = "default"; break;
+ }
+
+ return sprintf(buf, "%s\n", res);
+}
+
+static const struct kernel_param_ops allow_writes_ops = {
+ .set = set_allow_writes,
+ .get = get_allow_writes
+};
+
+module_param_cb(allow_writes, &allow_writes_ops, NULL, 0600);
+
MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>");
MODULE_DESCRIPTION("x86 generic MSR driver");
MODULE_LICENSE("GPL");
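The allow_writes parameter added above is a tristate: "on" always permits writes, "off" rejects them, and the default permits them while emitting a rate-limited warning (and tainting the kernel). A user-space sketch of that policy, using a plain timestamp check in place of __ratelimit():

/*
 * Sketch of the allow_writes write filter: parse the tristate, reject
 * writes when "off", and in the default case allow the write but warn
 * at most once every 30 seconds.
 */
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <errno.h>

enum allow_write_msrs { MSR_WRITES_ON, MSR_WRITES_OFF, MSR_WRITES_DEFAULT };

static enum allow_write_msrs allow_writes = MSR_WRITES_DEFAULT;

static void set_allow_writes(const char *val)
{
	if (!strcmp(val, "on"))
		allow_writes = MSR_WRITES_ON;
	else if (!strcmp(val, "off"))
		allow_writes = MSR_WRITES_OFF;
	else
		allow_writes = MSR_WRITES_DEFAULT;
}

static int filter_write(unsigned int reg)
{
	static time_t last_warning;
	time_t now = time(NULL);

	switch (allow_writes) {
	case MSR_WRITES_ON:  return 0;
	case MSR_WRITES_OFF: return -EPERM;
	default:             break;
	}

	/* Default policy: allow, but warn at most once every 30 seconds. */
	if (now - last_warning >= 30) {
		fprintf(stderr, "write to unrecognized MSR 0x%x\n", reg);
		last_warning = now;
	}
	return 0;
}

int main(void)
{
	set_allow_writes("off");
	printf("off:     %d\n", filter_write(0x10));
	set_allow_writes("default");
	printf("default: %d\n", filter_write(0x10));
	return 0;
}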
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
new file mode 100644
index 000000000000..3d239ed12744
--- /dev/null
+++ b/arch/x86/kernel/nmi.c
@@ -0,0 +1,752 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
+ * Copyright (C) 2011 Don Zickus Red Hat, Inc.
+ *
+ * Pentium III FXSR, SSE support
+ * Gareth Hughes <gareth@valinux.com>, May 2000
+ */
+
+/*
+ * Handle hardware traps and faults.
+ */
+#include <linux/spinlock.h>
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
+#include <linux/sched/debug.h>
+#include <linux/nmi.h>
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/hardirq.h>
+#include <linux/ratelimit.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/atomic.h>
+#include <linux/sched/clock.h>
+#include <linux/kvm_types.h>
+
+#include <asm/cpu_entry_area.h>
+#include <asm/traps.h>
+#include <asm/mach_traps.h>
+#include <asm/nmi.h>
+#include <asm/x86_init.h>
+#include <asm/reboot.h>
+#include <asm/cache.h>
+#include <asm/nospec-branch.h>
+#include <asm/microcode.h>
+#include <asm/sev.h>
+#include <asm/fred.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/nmi.h>
+
+/*
+ * An emergency handler can be set in any context including NMI
+ */
+struct nmi_desc {
+ raw_spinlock_t lock;
+ nmi_handler_t emerg_handler;
+ struct list_head head;
+};
+
+#define NMI_DESC_INIT(type) { \
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[type].lock), \
+ .head = LIST_HEAD_INIT(nmi_desc[type].head), \
+}
+
+static struct nmi_desc nmi_desc[NMI_MAX] = {
+ NMI_DESC_INIT(NMI_LOCAL),
+ NMI_DESC_INIT(NMI_UNKNOWN),
+ NMI_DESC_INIT(NMI_SERR),
+ NMI_DESC_INIT(NMI_IO_CHECK),
+};
+
+#define nmi_to_desc(type) (&nmi_desc[type])
+
+struct nmi_stats {
+ unsigned int normal;
+ unsigned int unknown;
+ unsigned int external;
+ unsigned int swallow;
+ unsigned long recv_jiffies;
+ unsigned long idt_seq;
+ unsigned long idt_nmi_seq;
+ unsigned long idt_ignored;
+ atomic_long_t idt_calls;
+ unsigned long idt_seq_snap;
+ unsigned long idt_nmi_seq_snap;
+ unsigned long idt_ignored_snap;
+ long idt_calls_snap;
+};
+
+static DEFINE_PER_CPU(struct nmi_stats, nmi_stats);
+
+static int ignore_nmis __read_mostly;
+
+int unknown_nmi_panic;
+int panic_on_unrecovered_nmi;
+int panic_on_io_nmi;
+
+/*
+ * Prevent the NMI reason port (0x61) from being accessed simultaneously;
+ * this lock can only be used in the NMI handler.
+ */
+static DEFINE_RAW_SPINLOCK(nmi_reason_lock);
+
+static int __init setup_unknown_nmi_panic(char *str)
+{
+ unknown_nmi_panic = 1;
+ return 1;
+}
+__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
+
+static u64 nmi_longest_ns = 1 * NSEC_PER_MSEC;
+
+static int __init nmi_warning_debugfs(void)
+{
+ debugfs_create_u64("nmi_longest_ns", 0644,
+ arch_debugfs_dir, &nmi_longest_ns);
+ return 0;
+}
+fs_initcall(nmi_warning_debugfs);
+
+static void nmi_check_duration(struct nmiaction *action, u64 duration)
+{
+ int remainder_ns, decimal_msecs;
+
+ if (duration < nmi_longest_ns || duration < action->max_duration)
+ return;
+
+ action->max_duration = duration;
+
+ /* Convert duration from nsec to msec */
+ remainder_ns = do_div(duration, NSEC_PER_MSEC);
+ decimal_msecs = remainder_ns / NSEC_PER_USEC;
+
+ pr_info_ratelimited("INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n",
+ action->handler, duration, decimal_msecs);
+}
+
+static int nmi_handle(unsigned int type, struct pt_regs *regs)
+{
+ struct nmi_desc *desc = nmi_to_desc(type);
+ nmi_handler_t ehandler;
+ struct nmiaction *a;
+ int handled = 0;
+
+ /*
+ * Call the emergency handler, if set
+ *
+ * In the case of the crash_nmi_callback() emergency handler, it will
+ * return on the crashing CPU so that CPU can complete the remaining
+ * crash actions ASAP. The other handlers in the linked list won't
+ * need to be run.
+ */
+ ehandler = desc->emerg_handler;
+ if (ehandler)
+ return ehandler(type, regs);
+
+ rcu_read_lock();
+
+ /*
+ * NMIs are edge-triggered, which means if you have enough
+ * of them concurrently, you can lose some because only one
+ * can be latched at any given time. Walk the whole list
+ * to handle those situations.
+ */
+ list_for_each_entry_rcu(a, &desc->head, list) {
+ int thishandled;
+ u64 delta;
+
+ delta = sched_clock();
+ thishandled = a->handler(type, regs);
+ handled += thishandled;
+ delta = sched_clock() - delta;
+ trace_nmi_handler(a->handler, (int)delta, thishandled);
+
+ nmi_check_duration(a, delta);
+ }
+
+ rcu_read_unlock();
+
+ /* return total number of NMI events handled */
+ return handled;
+}
+NOKPROBE_SYMBOL(nmi_handle);
+
+int __register_nmi_handler(unsigned int type, struct nmiaction *action)
+{
+ struct nmi_desc *desc = nmi_to_desc(type);
+ unsigned long flags;
+
+ if (WARN_ON_ONCE(!action->handler || !list_empty(&action->list)))
+ return -EINVAL;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+
+ /*
+ * Indicate if there are multiple registrations on the
+ * internal NMI handler call chains (SERR and IO_CHECK).
+ */
+ WARN_ON_ONCE(type == NMI_SERR && !list_empty(&desc->head));
+ WARN_ON_ONCE(type == NMI_IO_CHECK && !list_empty(&desc->head));
+
+ /*
+ * Some handlers need to be executed first, otherwise a fake
+ * event confuses some handlers (kdump uses this flag).
+ */
+ if (action->flags & NMI_FLAG_FIRST)
+ list_add_rcu(&action->list, &desc->head);
+ else
+ list_add_tail_rcu(&action->list, &desc->head);
+
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+ return 0;
+}
+EXPORT_SYMBOL(__register_nmi_handler);
+
+void unregister_nmi_handler(unsigned int type, const char *name)
+{
+ struct nmi_desc *desc = nmi_to_desc(type);
+ struct nmiaction *n, *found = NULL;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+
+ list_for_each_entry_rcu(n, &desc->head, list) {
+ /*
+ * The name passed in to describe the NMI handler
+ * is used as the lookup key.
+ */
+ if (!strcmp(n->name, name)) {
+ WARN(in_nmi(),
+ "Trying to free NMI (%s) from NMI context!\n", n->name);
+ list_del_rcu(&n->list);
+ found = n;
+ break;
+ }
+ }
+
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+ if (found) {
+ synchronize_rcu();
+ INIT_LIST_HEAD(&found->list);
+ }
+}
+EXPORT_SYMBOL_GPL(unregister_nmi_handler);
+
+/**
+ * set_emergency_nmi_handler - Set emergency handler
+ * @type: NMI type
+ * @handler: the emergency handler to be stored
+ *
+ * Set an emergency NMI handler which, if set, will preempt all the other
+ * handlers in the linked list. If a NULL handler is passed in, it will clear
+ * it. It is expected that concurrent calls to this function will not happen
+ * or the system is screwed beyond repair.
+ */
+void set_emergency_nmi_handler(unsigned int type, nmi_handler_t handler)
+{
+ struct nmi_desc *desc = nmi_to_desc(type);
+
+ if (WARN_ON_ONCE(desc->emerg_handler == handler))
+ return;
+ desc->emerg_handler = handler;
+
+ /*
+ * Ensure the emergency handler is visible to other CPUs before
+ * function return
+ */
+ smp_wmb();
+}
+
+static void
+pci_serr_error(unsigned char reason, struct pt_regs *regs)
+{
+ /* check to see if anyone registered against these types of errors */
+ if (nmi_handle(NMI_SERR, regs))
+ return;
+
+ pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
+ reason, smp_processor_id());
+
+ if (panic_on_unrecovered_nmi)
+ nmi_panic(regs, "NMI: Not continuing");
+
+ pr_emerg("Dazed and confused, but trying to continue\n");
+
+ /* Clear and disable the PCI SERR error line. */
+ reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR;
+ outb(reason, NMI_REASON_PORT);
+}
+NOKPROBE_SYMBOL(pci_serr_error);
+
+static void
+io_check_error(unsigned char reason, struct pt_regs *regs)
+{
+ unsigned long i;
+
+ /* check to see if anyone registered against these types of errors */
+ if (nmi_handle(NMI_IO_CHECK, regs))
+ return;
+
+ pr_emerg(
+ "NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n",
+ reason, smp_processor_id());
+ show_regs(regs);
+
+ if (panic_on_io_nmi) {
+ nmi_panic(regs, "NMI IOCK error: Not continuing");
+
+ /*
+ * If we end up here, it means we have received an NMI while
+ * processing panic(). Simply return without delaying and
+ * re-enabling NMIs.
+ */
+ return;
+ }
+
+ /* Re-enable the IOCK line, wait for a few seconds */
+ reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
+ outb(reason, NMI_REASON_PORT);
+
+ i = 20000;
+ while (--i) {
+ touch_nmi_watchdog();
+ udelay(100);
+ }
+
+ reason &= ~NMI_REASON_CLEAR_IOCHK;
+ outb(reason, NMI_REASON_PORT);
+}
+NOKPROBE_SYMBOL(io_check_error);
+
+static void
+unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
+{
+ int handled;
+
+ /*
+ * As a last resort, let the "unknown" handlers make a
+ * best-effort attempt to figure out if they can claim
+ * responsibility for this Unknown NMI.
+ */
+ handled = nmi_handle(NMI_UNKNOWN, regs);
+ if (handled) {
+ __this_cpu_add(nmi_stats.unknown, handled);
+ return;
+ }
+
+ __this_cpu_add(nmi_stats.unknown, 1);
+
+ pr_emerg_ratelimited("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
+ reason, smp_processor_id());
+
+ if (unknown_nmi_panic || panic_on_unrecovered_nmi)
+ nmi_panic(regs, "NMI: Not continuing");
+
+ pr_emerg_ratelimited("Dazed and confused, but trying to continue\n");
+}
+NOKPROBE_SYMBOL(unknown_nmi_error);
+
+static DEFINE_PER_CPU(bool, swallow_nmi);
+static DEFINE_PER_CPU(unsigned long, last_nmi_rip);
+
+static noinstr void default_do_nmi(struct pt_regs *regs)
+{
+ unsigned char reason = 0;
+ int handled;
+ bool b2b = false;
+
+ /*
+ * Back-to-back NMIs are detected by comparing the RIP of the
+ * current NMI with that of the previous NMI. If it is the same,
+ * it is assumed that the CPU did not have a chance to jump back
+ * into a non-NMI context and execute code in between the two
+ * NMIs.
+ *
+ * They are interesting because even if there are more than two,
+ * only a maximum of two can be detected (anything over two is
+ * dropped due to NMI being edge-triggered). If this is the
+ * second half of the back-to-back NMI, assume we dropped things
+ * and process more handlers. Otherwise, reset the 'swallow' NMI
+ * behavior.
+ */
+ if (regs->ip == __this_cpu_read(last_nmi_rip))
+ b2b = true;
+ else
+ __this_cpu_write(swallow_nmi, false);
+
+ __this_cpu_write(last_nmi_rip, regs->ip);
+
+ instrumentation_begin();
+
+ if (microcode_nmi_handler_enabled() && microcode_nmi_handler())
+ goto out;
+
+ /*
+ * CPU-specific NMI must be processed before non-CPU-specific
+ * NMI, otherwise we may lose it, because the CPU-specific
+ * NMI cannot be detected/processed on other CPUs.
+ */
+ handled = nmi_handle(NMI_LOCAL, regs);
+ __this_cpu_add(nmi_stats.normal, handled);
+ if (handled) {
+ /*
+ * There are cases when an NMI handler handles multiple
+ * events in the current NMI. One of these events may
+ * be queued to fire as the next NMI. Because the event is
+ * already handled, the next NMI will result in an unknown
+ * NMI. Instead, let's flag this for a potential NMI to
+ * swallow.
+ */
+ if (handled > 1)
+ __this_cpu_write(swallow_nmi, true);
+ goto out;
+ }
+
+ /*
+ * Non-CPU-specific NMI: NMI sources can be processed on any CPU.
+ *
+ * Another CPU may be processing panic routines while holding
+ * nmi_reason_lock. Check if the CPU issued the IPI for crash dumping,
+ * and if so, call its callback directly. If there is no CPU preparing
+ * crash dump, we simply loop here.
+ */
+ while (!raw_spin_trylock(&nmi_reason_lock)) {
+ run_crash_ipi_callback(regs);
+ cpu_relax();
+ }
+
+ reason = x86_platform.get_nmi_reason();
+
+ if (reason & NMI_REASON_MASK) {
+ if (reason & NMI_REASON_SERR)
+ pci_serr_error(reason, regs);
+ else if (reason & NMI_REASON_IOCHK)
+ io_check_error(reason, regs);
+
+ /*
+ * Reassert NMI in case it became active
+ * meanwhile as it's edge-triggered:
+ */
+ if (IS_ENABLED(CONFIG_X86_32))
+ reassert_nmi();
+
+ __this_cpu_add(nmi_stats.external, 1);
+ raw_spin_unlock(&nmi_reason_lock);
+ goto out;
+ }
+ raw_spin_unlock(&nmi_reason_lock);
+
+ /*
+ * Only one NMI can be latched at a time. To handle
+ * this, we may process multiple NMI handlers at once to
+ * cover the case where an NMI is dropped. The downside
+ * to this approach is that we may process an NMI prematurely,
+ * while its real NMI is sitting latched. This will cause
+ * an unknown NMI on the next run of the NMI processing.
+ *
+ * We tried to flag that condition above, by setting the
+ * swallow_nmi flag when we process more than one event.
+ * This condition is also only present on the second half
+ * of a back-to-back NMI, so we flag that condition too.
+ *
+ * If both are true, we assume we already processed this
+ * NMI previously and we swallow it. Otherwise we reset
+ * the logic.
+ *
+ * There are scenarios where we may accidentally swallow
+ * a 'real' unknown NMI. For example, while processing
+ * a perf NMI another perf NMI comes in along with a
+ * 'real' unknown NMI. These two NMIs get combined into
+ * one (as described above). When the next NMI gets
+ * processed, it will be flagged by perf as handled, but
+ * no one will know that there was a 'real' unknown NMI sent
+ * also. As a result it gets swallowed. Or if the first
+ * perf NMI returns two events handled then the second
+ * NMI will get eaten by the logic below, again losing a
+ * 'real' unknown NMI. But this is the best we can do
+ * for now.
+ */
+ if (b2b && __this_cpu_read(swallow_nmi))
+ __this_cpu_add(nmi_stats.swallow, 1);
+ else
+ unknown_nmi_error(reason, regs);
+
+out:
+ instrumentation_end();
+}
+
+/*
+ * An NMI can page fault or hit a breakpoint, which causes the CPU to lose
+ * its NMI context when the breakpoint or page fault handler does an IRET.
+ *
+ * As a result, NMIs can nest if NMIs get unmasked due to an IRET during
+ * NMI processing. On x86_64, the asm glue protects us from nested NMIs
+ * if the outer NMI came from kernel mode, but we can still nest if the
+ * outer NMI came from user mode.
+ *
+ * To handle these nested NMIs, we have three states:
+ *
+ * 1) not running
+ * 2) executing
+ * 3) latched
+ *
+ * When no NMI is in progress, it is in the "not running" state.
+ * When an NMI comes in, it goes into the "executing" state.
+ * Normally, if another NMI is triggered, it does not interrupt
+ * the running NMI and the HW will simply latch it so that when
+ * the first NMI finishes, it will restart the second NMI.
+ * (Note, the latch is binary, thus multiple NMIs triggering,
+ * when one is running, are ignored. Only one NMI is restarted.)
+ *
+ * If an NMI executes an iret, another NMI can preempt it. We do not
+ * want to allow this new NMI to run, but we want to execute it when the
+ * first one finishes. We set the state to "latched", and the exit of
+ * the first NMI will perform a dec_return, if the result is zero
+ * (NOT_RUNNING), then it will simply exit the NMI handler. If not, the
+ * dec_return would have set the state to NMI_EXECUTING (what we want it
+ * to be when we are running). In this case, we simply jump back to
+ * rerun the NMI handler again, and restart the 'latched' NMI.
+ *
+ * No trap (breakpoint or page fault) should be hit before nmi_restart,
+ * thus there is no race between the first check of state for NOT_RUNNING
+ * and setting it to NMI_EXECUTING. The HW will prevent nested NMIs
+ * at this point.
+ *
+ * In case the NMI takes a page fault, we need to save off the CR2
+ * because the NMI could have preempted another page fault and corrupt
+ * the CR2 that is about to be read. As nested NMIs must be restarted
+ * and they can not take breakpoints or page faults, the update of the
+ * CR2 must be done before converting the nmi state back to NOT_RUNNING.
+ * Otherwise, there would be a race of another nested NMI coming in
+ * after setting state to NOT_RUNNING but before updating the nmi_cr2.
+ */
+enum nmi_states {
+ NMI_NOT_RUNNING = 0,
+ NMI_EXECUTING,
+ NMI_LATCHED,
+};
+static DEFINE_PER_CPU(enum nmi_states, nmi_state);
+static DEFINE_PER_CPU(unsigned long, nmi_cr2);
+static DEFINE_PER_CPU(unsigned long, nmi_dr7);
+
+DEFINE_IDTENTRY_RAW(exc_nmi)
+{
+ irqentry_state_t irq_state;
+ struct nmi_stats *nsp = this_cpu_ptr(&nmi_stats);
+
+ /*
+ * Re-enable NMIs right here when running as an SEV-ES guest. This might
+ * cause nested NMIs, but those can be handled safely.
+ */
+ sev_es_nmi_complete();
+ if (IS_ENABLED(CONFIG_NMI_CHECK_CPU))
+ raw_atomic_long_inc(&nsp->idt_calls);
+
+ if (arch_cpu_is_offline(smp_processor_id())) {
+ if (microcode_nmi_handler_enabled())
+ microcode_offline_nmi_handler();
+ return;
+ }
+
+ if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) {
+ this_cpu_write(nmi_state, NMI_LATCHED);
+ return;
+ }
+ this_cpu_write(nmi_state, NMI_EXECUTING);
+ this_cpu_write(nmi_cr2, read_cr2());
+
+nmi_restart:
+ if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
+ WRITE_ONCE(nsp->idt_seq, nsp->idt_seq + 1);
+ WARN_ON_ONCE(!(nsp->idt_seq & 0x1));
+ WRITE_ONCE(nsp->recv_jiffies, jiffies);
+ }
+
+ /*
+ * Needs to happen before DR7 is accessed, because the hypervisor can
+ * intercept DR7 reads/writes, turning those into #VC exceptions.
+ */
+ sev_es_ist_enter(regs);
+
+ this_cpu_write(nmi_dr7, local_db_save());
+
+ irq_state = irqentry_nmi_enter(regs);
+
+ inc_irq_stat(__nmi_count);
+
+ if (IS_ENABLED(CONFIG_NMI_CHECK_CPU) && ignore_nmis) {
+ WRITE_ONCE(nsp->idt_ignored, nsp->idt_ignored + 1);
+ } else if (!ignore_nmis) {
+ if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
+ WRITE_ONCE(nsp->idt_nmi_seq, nsp->idt_nmi_seq + 1);
+ WARN_ON_ONCE(!(nsp->idt_nmi_seq & 0x1));
+ }
+ default_do_nmi(regs);
+ if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
+ WRITE_ONCE(nsp->idt_nmi_seq, nsp->idt_nmi_seq + 1);
+ WARN_ON_ONCE(nsp->idt_nmi_seq & 0x1);
+ }
+ }
+
+ irqentry_nmi_exit(regs, irq_state);
+
+ local_db_restore(this_cpu_read(nmi_dr7));
+
+ sev_es_ist_exit();
+
+ if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
+ write_cr2(this_cpu_read(nmi_cr2));
+ if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
+ WRITE_ONCE(nsp->idt_seq, nsp->idt_seq + 1);
+ WARN_ON_ONCE(nsp->idt_seq & 0x1);
+ WRITE_ONCE(nsp->recv_jiffies, jiffies);
+ }
+ if (this_cpu_dec_return(nmi_state))
+ goto nmi_restart;
+}
+
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+DEFINE_IDTENTRY_RAW(exc_nmi_kvm_vmx)
+{
+ exc_nmi(regs);
+}
+EXPORT_SYMBOL_FOR_KVM(asm_exc_nmi_kvm_vmx);
+#endif
+
+#ifdef CONFIG_NMI_CHECK_CPU
+
+static char *nmi_check_stall_msg[] = {
+/* */
+/* +--------- nmi_seq & 0x1: CPU is currently in NMI handler. */
+/* | +------ cpu_is_offline(cpu) */
+/* | | +--- nsp->idt_calls_snap != atomic_long_read(&nsp->idt_calls): */
+/* | | | NMI handler has been invoked. */
+/* | | | */
+/* V V V */
+/* 0 0 0 */ "NMIs are not reaching exc_nmi() handler",
+/* 0 0 1 */ "exc_nmi() handler is ignoring NMIs",
+/* 0 1 0 */ "CPU is offline and NMIs are not reaching exc_nmi() handler",
+/* 0 1 1 */ "CPU is offline and exc_nmi() handler is legitimately ignoring NMIs",
+/* 1 0 0 */ "CPU is in exc_nmi() handler and no further NMIs are reaching handler",
+/* 1 0 1 */ "CPU is in exc_nmi() handler which is legitimately ignoring NMIs",
+/* 1 1 0 */ "CPU is offline in exc_nmi() handler and no more NMIs are reaching exc_nmi() handler",
+/* 1 1 1 */ "CPU is offline in exc_nmi() handler which is legitimately ignoring NMIs",
+};
+
+void nmi_backtrace_stall_snap(const struct cpumask *btp)
+{
+ int cpu;
+ struct nmi_stats *nsp;
+
+ for_each_cpu(cpu, btp) {
+ nsp = per_cpu_ptr(&nmi_stats, cpu);
+ nsp->idt_seq_snap = READ_ONCE(nsp->idt_seq);
+ nsp->idt_nmi_seq_snap = READ_ONCE(nsp->idt_nmi_seq);
+ nsp->idt_ignored_snap = READ_ONCE(nsp->idt_ignored);
+ nsp->idt_calls_snap = atomic_long_read(&nsp->idt_calls);
+ }
+}
+
+void nmi_backtrace_stall_check(const struct cpumask *btp)
+{
+ int cpu;
+ int idx;
+ unsigned long nmi_seq;
+ unsigned long j = jiffies;
+ char *modp;
+ char *msgp;
+ char *msghp;
+ struct nmi_stats *nsp;
+
+ for_each_cpu(cpu, btp) {
+ nsp = per_cpu_ptr(&nmi_stats, cpu);
+ modp = "";
+ msghp = "";
+ nmi_seq = READ_ONCE(nsp->idt_nmi_seq);
+ if (nsp->idt_nmi_seq_snap + 1 == nmi_seq && (nmi_seq & 0x1)) {
+ msgp = "CPU entered NMI handler function, but has not exited";
+ } else if (nsp->idt_nmi_seq_snap == nmi_seq ||
+ nsp->idt_nmi_seq_snap + 1 == nmi_seq) {
+ idx = ((nmi_seq & 0x1) << 2) |
+ (cpu_is_offline(cpu) << 1) |
+ (nsp->idt_calls_snap != atomic_long_read(&nsp->idt_calls));
+ msgp = nmi_check_stall_msg[idx];
+ if (nsp->idt_ignored_snap != READ_ONCE(nsp->idt_ignored) && (idx & 0x1))
+ modp = ", but OK because ignore_nmis was set";
+ if (nsp->idt_nmi_seq_snap + 1 == nmi_seq)
+ msghp = " (CPU exited one NMI handler function)";
+ else if (nmi_seq & 0x1)
+ msghp = " (CPU currently in NMI handler function)";
+ else
+ msghp = " (CPU was never in an NMI handler function)";
+ } else {
+ msgp = "CPU is handling NMIs";
+ }
+ pr_alert("%s: CPU %d: %s%s%s\n", __func__, cpu, msgp, modp, msghp);
+ pr_alert("%s: last activity: %lu jiffies ago.\n",
+ __func__, j - READ_ONCE(nsp->recv_jiffies));
+ }
+}
+
+#endif
+
+#ifdef CONFIG_X86_FRED
+/*
+ * With FRED, CR2/DR6 are pushed onto the #PF/#DB stack frame during FRED
+ * event delivery, i.e., there is no problem of transient states.
+ * And NMI unblocking only happens when the stack frame indicates
+ * that it should happen.
+ *
+ * Thus, the NMI entry stub for FRED is really straightforward and
+ * as simple as most exception handlers. As such, #DB is allowed
+ * during NMI handling.
+ */
+DEFINE_FREDENTRY_NMI(exc_nmi)
+{
+ irqentry_state_t irq_state;
+
+ if (arch_cpu_is_offline(smp_processor_id())) {
+ if (microcode_nmi_handler_enabled())
+ microcode_offline_nmi_handler();
+ return;
+ }
+
+ /*
+ * Save CR2 for eventual restore to cover the case where the NMI
+ * hits the VMENTER/VMEXIT region where guest CR2 is live. This
+ * prevents guest state corruption in case the NMI handler
+ * takes a page fault.
+ */
+ this_cpu_write(nmi_cr2, read_cr2());
+
+ irq_state = irqentry_nmi_enter(regs);
+
+ inc_irq_stat(__nmi_count);
+ default_do_nmi(regs);
+
+ irqentry_nmi_exit(regs, irq_state);
+
+ if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
+ write_cr2(this_cpu_read(nmi_cr2));
+}
+#endif
+
+void stop_nmi(void)
+{
+ ignore_nmis++;
+}
+
+void restart_nmi(void)
+{
+ ignore_nmis--;
+}
+
+/* reset the back-to-back NMI logic */
+void local_touch_nmi(void)
+{
+ __this_cpu_write(last_nmi_rip, 0);
+}
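
The list-based chains above are what external users reach through the register_nmi_handler() wrapper around __register_nmi_handler() (the selftest below uses the same call). A minimal hedged sketch of a driver-local handler on the NMI_LOCAL chain; the my_* names are hypothetical:

/* Hedged sketch of a driver-local NMI handler; my_* names are hypothetical. */
#include <linux/module.h>
#include <asm/nmi.h>

static int my_nmi_handler(unsigned int type, struct pt_regs *regs)
{
	/*
	 * A real handler would check its own hardware here and return
	 * NMI_HANDLED only when it actually claimed the event, so the
	 * walk in nmi_handle() keeps an accurate handled count.
	 */
	return NMI_DONE;
}

static int __init my_nmi_example_init(void)
{
	return register_nmi_handler(NMI_LOCAL, my_nmi_handler, 0, "my_nmi");
}

static void __exit my_nmi_example_exit(void)
{
	/* Must not run in NMI context, see the WARN in unregister_nmi_handler(). */
	unregister_nmi_handler(NMI_LOCAL, "my_nmi");
}

module_init(my_nmi_example_init);
module_exit(my_nmi_example_exit);
MODULE_LICENSE("GPL");
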
diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
new file mode 100644
index 000000000000..a010e9d062bf
--- /dev/null
+++ b/arch/x86/kernel/nmi_selftest.c
@@ -0,0 +1,164 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Testsuite for NMI: IPIs
+ *
+ * Started by Don Zickus:
+ * (using lib/locking-selftest.c as a guide)
+ *
+ * Copyright (C) 2011 Red Hat, Inc., Don Zickus <dzickus@redhat.com>
+ */
+
+#include <linux/smp.h>
+#include <linux/cpumask.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/percpu.h>
+
+#include <asm/apic.h>
+#include <asm/nmi.h>
+
+#define SUCCESS 0
+#define FAILURE 1
+#define TIMEOUT 2
+
+static int __initdata nmi_fail;
+
+/* check to see if NMI IPIs work on this machine */
+static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __initdata;
+
+static int __initdata testcase_total;
+static int __initdata testcase_successes;
+static int __initdata unexpected_testcase_failures;
+static int __initdata unexpected_testcase_unknowns;
+
+static int __init nmi_unk_cb(unsigned int val, struct pt_regs *regs)
+{
+ unexpected_testcase_unknowns++;
+ return NMI_HANDLED;
+}
+
+static void __init init_nmi_testsuite(void)
+{
+ /* trap all the unknown NMIs we may generate */
+ register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk",
+ __initdata);
+}
+
+static void __init cleanup_nmi_testsuite(void)
+{
+ unregister_nmi_handler(NMI_UNKNOWN, "nmi_selftest_unk");
+}
+
+static int __init test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs)
+{
+ int cpu = raw_smp_processor_id();
+
+ if (cpumask_test_and_clear_cpu(cpu, to_cpumask(nmi_ipi_mask)))
+ return NMI_HANDLED;
+
+ return NMI_DONE;
+}
+
+static void __init test_nmi_ipi(struct cpumask *mask)
+{
+ unsigned long timeout;
+
+ if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback,
+ NMI_FLAG_FIRST, "nmi_selftest", __initdata)) {
+ nmi_fail = FAILURE;
+ return;
+ }
+
+ /* sync above data before sending NMI */
+ wmb();
+
+ __apic_send_IPI_mask(mask, NMI_VECTOR);
+
+ /* Don't wait longer than a second */
+ timeout = USEC_PER_SEC;
+ while (!cpumask_empty(mask) && --timeout)
+ udelay(1);
+
+ /* What happens if we timeout, do we still unregister?? */
+ unregister_nmi_handler(NMI_LOCAL, "nmi_selftest");
+
+ if (!timeout)
+ nmi_fail = TIMEOUT;
+ return;
+}
+
+static void __init remote_ipi(void)
+{
+ cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask);
+ cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
+ if (!cpumask_empty(to_cpumask(nmi_ipi_mask)))
+ test_nmi_ipi(to_cpumask(nmi_ipi_mask));
+}
+
+static void __init local_ipi(void)
+{
+ cpumask_clear(to_cpumask(nmi_ipi_mask));
+ cpumask_set_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
+ test_nmi_ipi(to_cpumask(nmi_ipi_mask));
+}
+
+static void __init reset_nmi(void)
+{
+ nmi_fail = 0;
+}
+
+static void __init dotest(void (*testcase_fn)(void), int expected)
+{
+ testcase_fn();
+ /*
+ * Filter out expected failures:
+ */
+ if (nmi_fail != expected) {
+ unexpected_testcase_failures++;
+
+ if (nmi_fail == FAILURE)
+ pr_cont("FAILED |");
+ else if (nmi_fail == TIMEOUT)
+ pr_cont("TIMEOUT|");
+ else
+ pr_cont("ERROR |");
+ dump_stack();
+ } else {
+ testcase_successes++;
+ pr_cont(" ok |");
+ }
+ pr_cont("\n");
+
+ testcase_total++;
+ reset_nmi();
+}
+
+void __init nmi_selftest(void)
+{
+ init_nmi_testsuite();
+
+ /*
+ * Run the testsuite:
+ */
+ pr_info("----------------\n");
+ pr_info("| NMI testsuite:\n");
+ pr_info("--------------------\n");
+
+ pr_info("%12s:", "remote IPI");
+ dotest(remote_ipi, SUCCESS);
+
+ pr_info("%12s:", "local IPI");
+ dotest(local_ipi, SUCCESS);
+
+ cleanup_nmi_testsuite();
+
+ pr_info("--------------------\n");
+ if (unexpected_testcase_failures) {
+ pr_info("BUG: %3d unexpected failures (out of %3d) - debugging disabled! |\n",
+ unexpected_testcase_failures, testcase_total);
+ } else {
+ pr_info("Good, all %3d testcases passed! |\n",
+ testcase_successes);
+ }
+ pr_info("-----------------------------------------------------------------\n");
+}
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 3a7c5a44082e..9e1ea99ad9df 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -1,28 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Split spinlock implementation out into its own file, so it can be
* compiled in a FTRACE-compatible way.
*/
#include <linux/spinlock.h>
-#include <linux/module.h>
+#include <linux/export.h>
+#include <linux/jump_label.h>
#include <asm/paravirt.h>
-static inline void
-default_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags)
+__visible void __native_queued_spin_unlock(struct qspinlock *lock)
{
- __raw_spin_lock(lock);
+ native_queued_spin_unlock(lock);
}
+PV_CALLEE_SAVE_REGS_THUNK(__native_queued_spin_unlock);
-struct pv_lock_ops pv_lock_ops = {
-#ifdef CONFIG_SMP
- .spin_is_locked = __ticket_spin_is_locked,
- .spin_is_contended = __ticket_spin_is_contended,
+bool pv_is_native_spin_unlock(void)
+{
+ return pv_ops.lock.queued_spin_unlock.func ==
+ __raw_callee_save___native_queued_spin_unlock;
+}
+
+__visible bool __native_vcpu_is_preempted(long cpu)
+{
+ return false;
+}
+PV_CALLEE_SAVE_REGS_THUNK(__native_vcpu_is_preempted);
- .spin_lock = __ticket_spin_lock,
- .spin_lock_flags = default_spin_lock_flags,
- .spin_trylock = __ticket_spin_trylock,
- .spin_unlock = __ticket_spin_unlock,
-#endif
-};
-EXPORT_SYMBOL(pv_lock_ops);
+bool pv_is_native_vcpu_is_preempted(void)
+{
+ return pv_ops.lock.vcpu_is_preempted.func ==
+ __raw_callee_save___native_vcpu_is_preempted;
+}
+void __init paravirt_set_cap(void)
+{
+ if (!pv_is_native_spin_unlock())
+ setup_force_cpu_cap(X86_FEATURE_PVUNLOCK);
+
+ if (!pv_is_native_vcpu_is_preempted())
+ setup_force_cpu_cap(X86_FEATURE_VCPUPREEMPT);
+}
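
For contrast with the native setup above: a paravirtualized guest replaces these pv_ops.lock entries with its own callee-save thunks, which is what makes pv_is_native_spin_unlock() return false and lets paravirt_set_cap() set X86_FEATURE_PVUNLOCK. A hedged sketch of such an override, using only the PV_CALLEE_SAVE machinery already shown in this file; the my_* names are hypothetical and would live in guest init code built with CONFIG_PARAVIRT_SPINLOCKS:

/* Hedged sketch: guest-side override of the queued-spinlock unlock.
 * A real guest (Xen, KVM) would kick or notify the hypervisor here
 * instead of just falling back to the native op. */
#include <asm/paravirt.h>
#include <asm/qspinlock.h>

__visible void my_pv_queued_spin_unlock(struct qspinlock *lock)
{
	native_queued_spin_unlock(lock);	/* placeholder for a PV kick */
}
PV_CALLEE_SAVE_REGS_THUNK(my_pv_queued_spin_unlock);

void __init my_pv_spinlock_init(void)
{
	pv_ops.lock.queued_spin_unlock =
		PV_CALLEE_SAVE(my_pv_queued_spin_unlock);
}
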
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 70ec9b951d76..ab3e172dcc69 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -1,34 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* Paravirtualization interfaces
Copyright (C) 2006 Rusty Russell IBM Corporation
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
2007 - x86_64 support added by Glauber de Oliveira Costa, Red Hat Inc
*/
#include <linux/errno.h>
-#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/export.h>
#include <linux/efi.h>
#include <linux/bcd.h>
#include <linux/highmem.h>
+#include <linux/kprobes.h>
+#include <linux/pgtable.h>
+#include <linux/static_call.h>
#include <asm/bug.h>
#include <asm/paravirt.h>
+#include <asm/debugreg.h>
#include <asm/desc.h>
#include <asm/setup.h>
-#include <asm/pgtable.h>
#include <asm/time.h>
#include <asm/pgalloc.h>
#include <asm/irq.h>
@@ -37,474 +29,227 @@
#include <asm/apic.h>
#include <asm/tlbflush.h>
#include <asm/timer.h>
+#include <asm/special_insns.h>
+#include <asm/tlb.h>
+#include <asm/io_bitmap.h>
+#include <asm/gsseg.h>
+#include <asm/msr.h>
-/* nop stub */
-void _paravirt_nop(void)
-{
-}
-
-/* identity function, which can be inlined */
-u32 _paravirt_ident_32(u32 x)
-{
- return x;
-}
+/* stub always returning 0. */
+DEFINE_ASM_FUNC(paravirt_ret0, "xor %eax,%eax", .entry.text);
-u64 _paravirt_ident_64(u64 x)
-{
- return x;
-}
-
-static void __init default_banner(void)
+void __init default_banner(void)
{
printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
pv_info.name);
}
-char *memory_setup(void)
-{
- return pv_init_ops.memory_setup();
-}
-
-/* Simple instruction patching code. */
-#define DEF_NATIVE(ops, name, code) \
- extern const char start_##ops##_##name[], end_##ops##_##name[]; \
- asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":")
-
-/* Undefined instruction for dealing with missing ops pointers. */
-static const unsigned char ud2a[] = { 0x0f, 0x0b };
-
-unsigned paravirt_patch_nop(void)
-{
- return 0;
-}
-
-unsigned paravirt_patch_ignore(unsigned len)
-{
- return len;
-}
-
-struct branch {
- unsigned char opcode;
- u32 delta;
-} __attribute__((packed));
-
-unsigned paravirt_patch_call(void *insnbuf,
- const void *target, u16 tgt_clobbers,
- unsigned long addr, u16 site_clobbers,
- unsigned len)
-{
- struct branch *b = insnbuf;
- unsigned long delta = (unsigned long)target - (addr+5);
-
- if (tgt_clobbers & ~site_clobbers)
- return len; /* target would clobber too much for this site */
- if (len < 5)
- return len; /* call too long for patch site */
-
- b->opcode = 0xe8; /* call */
- b->delta = delta;
- BUILD_BUG_ON(sizeof(*b) != 5);
-
- return 5;
-}
-
-unsigned paravirt_patch_jmp(void *insnbuf, const void *target,
- unsigned long addr, unsigned len)
-{
- struct branch *b = insnbuf;
- unsigned long delta = (unsigned long)target - (addr+5);
-
- if (len < 5)
- return len; /* call too long for patch site */
-
- b->opcode = 0xe9; /* jmp */
- b->delta = delta;
-
- return 5;
-}
-
-/* Neat trick to map patch type back to the call within the
- * corresponding structure. */
-static void *get_call_destination(u8 type)
-{
- struct paravirt_patch_template tmpl = {
- .pv_init_ops = pv_init_ops,
- .pv_time_ops = pv_time_ops,
- .pv_cpu_ops = pv_cpu_ops,
- .pv_irq_ops = pv_irq_ops,
- .pv_apic_ops = pv_apic_ops,
- .pv_mmu_ops = pv_mmu_ops,
-#ifdef CONFIG_PARAVIRT_SPINLOCKS
- .pv_lock_ops = pv_lock_ops,
+#ifdef CONFIG_PARAVIRT_XXL
+DEFINE_ASM_FUNC(_paravirt_ident_64, "mov %rdi, %rax", .text);
+DEFINE_ASM_FUNC(pv_native_save_fl, "pushf; pop %rax", .noinstr.text);
+DEFINE_ASM_FUNC(pv_native_irq_disable, "cli", .noinstr.text);
+DEFINE_ASM_FUNC(pv_native_irq_enable, "sti", .noinstr.text);
+DEFINE_ASM_FUNC(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text);
#endif
- };
- return *((void **)&tmpl + type);
-}
-unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
- unsigned long addr, unsigned len)
-{
- void *opfunc = get_call_destination(type);
- unsigned ret;
-
- if (opfunc == NULL)
- /* If there's no function, patch it with a ud2a (BUG) */
- ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a));
- else if (opfunc == _paravirt_nop)
- /* If the operation is a nop, then nop the callsite */
- ret = paravirt_patch_nop();
-
- /* identity functions just return their single argument */
- else if (opfunc == _paravirt_ident_32)
- ret = paravirt_patch_ident_32(insnbuf, len);
- else if (opfunc == _paravirt_ident_64)
- ret = paravirt_patch_ident_64(insnbuf, len);
-
- else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
- type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) ||
- type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) ||
- type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret64))
- /* If operation requires a jmp, then jmp */
- ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len);
- else
- /* Otherwise call the function; assume target could
- clobber any caller-save reg */
- ret = paravirt_patch_call(insnbuf, opfunc, CLBR_ANY,
- addr, clobbers, len);
-
- return ret;
-}
-
-unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
- const char *start, const char *end)
-{
- unsigned insn_len = end - start;
-
- if (insn_len > len || start == NULL)
- insn_len = len;
- else
- memcpy(insnbuf, start, insn_len);
-
- return insn_len;
-}
-
-void init_IRQ(void)
-{
- pv_irq_ops.init_IRQ();
-}
+DEFINE_STATIC_KEY_FALSE(virt_spin_lock_key);
-static void native_flush_tlb(void)
+void __init native_pv_lock_init(void)
{
- __native_flush_tlb();
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ static_branch_enable(&virt_spin_lock_key);
}
-/*
- * Global pages have to be flushed a bit differently. Not a real
- * performance problem because this does not happen often.
- */
-static void native_flush_tlb_global(void)
-{
- __native_flush_tlb_global();
-}
+struct static_key paravirt_steal_enabled;
+struct static_key paravirt_steal_rq_enabled;
-static void native_flush_tlb_single(unsigned long addr)
+static u64 native_steal_clock(int cpu)
{
- __native_flush_tlb_single(addr);
-}
-
-/* These are in entry.S */
-extern void native_iret(void);
-extern void native_irq_enable_sysexit(void);
-extern void native_usergs_sysret32(void);
-extern void native_usergs_sysret64(void);
-
-static int __init print_banner(void)
-{
- pv_init_ops.banner();
return 0;
}
-core_initcall(print_banner);
-
-static struct resource reserve_ioports = {
- .start = 0,
- .end = IO_SPACE_LIMIT,
- .name = "paravirt-ioport",
- .flags = IORESOURCE_IO | IORESOURCE_BUSY,
-};
-
-/*
- * Reserve the whole legacy IO space to prevent any legacy drivers
- * from wasting time probing for their hardware. This is a fairly
- * brute-force approach to disabling all non-virtual drivers.
- *
- * Note that this must be called very early to have any effect.
- */
-int paravirt_disable_iospace(void)
-{
- return request_resource(&ioport_resource, &reserve_ioports);
-}
-
-static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LAZY_NONE;
-
-static inline void enter_lazy(enum paravirt_lazy_mode mode)
-{
- BUG_ON(percpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
- percpu_write(paravirt_lazy_mode, mode);
-}
+DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock);
+DEFINE_STATIC_CALL(pv_sched_clock, native_sched_clock);
-static void leave_lazy(enum paravirt_lazy_mode mode)
+void paravirt_set_sched_clock(u64 (*func)(void))
{
- BUG_ON(percpu_read(paravirt_lazy_mode) != mode);
-
- percpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
+ static_call_update(pv_sched_clock, func);
}
-void paravirt_enter_lazy_mmu(void)
+static noinstr void pv_native_safe_halt(void)
{
- enter_lazy(PARAVIRT_LAZY_MMU);
+ native_safe_halt();
}
-void paravirt_leave_lazy_mmu(void)
+#ifdef CONFIG_PARAVIRT_XXL
+static noinstr void pv_native_write_cr2(unsigned long val)
{
- leave_lazy(PARAVIRT_LAZY_MMU);
+ native_write_cr2(val);
}
-void paravirt_start_context_switch(struct task_struct *prev)
+static noinstr unsigned long pv_native_read_cr3(void)
{
- BUG_ON(preemptible());
-
- if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {
- arch_leave_lazy_mmu_mode();
- set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES);
- }
- enter_lazy(PARAVIRT_LAZY_CPU);
+ return __native_read_cr3();
}
-void paravirt_end_context_switch(struct task_struct *next)
+static noinstr void pv_native_write_cr3(unsigned long cr3)
{
- BUG_ON(preemptible());
-
- leave_lazy(PARAVIRT_LAZY_CPU);
-
- if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES))
- arch_enter_lazy_mmu_mode();
+ native_write_cr3(cr3);
}
-enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
+static noinstr unsigned long pv_native_get_debugreg(int regno)
{
- if (in_interrupt())
- return PARAVIRT_LAZY_NONE;
-
- return percpu_read(paravirt_lazy_mode);
+ return native_get_debugreg(regno);
}
-void arch_flush_lazy_mmu_mode(void)
+static noinstr void pv_native_set_debugreg(int regno, unsigned long val)
{
- preempt_disable();
-
- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
- arch_leave_lazy_mmu_mode();
- arch_enter_lazy_mmu_mode();
- }
-
- preempt_enable();
+ native_set_debugreg(regno, val);
}
+#endif
struct pv_info pv_info = {
.name = "bare hardware",
- .paravirt_enabled = 0,
- .kernel_rpl = 0,
- .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */
-};
-
-struct pv_init_ops pv_init_ops = {
- .patch = native_patch,
- .banner = default_banner,
- .arch_setup = paravirt_nop,
- .memory_setup = machine_specific_memory_setup,
+#ifdef CONFIG_PARAVIRT_XXL
+ .extra_user_64bit_cs = __USER_CS,
+#endif
};
-struct pv_time_ops pv_time_ops = {
- .time_init = hpet_time_init,
- .get_wallclock = native_get_wallclock,
- .set_wallclock = native_set_wallclock,
- .sched_clock = native_sched_clock,
- .get_tsc_khz = native_calibrate_tsc,
-};
+/* 64-bit pagetable entries */
+#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64)
-struct pv_irq_ops pv_irq_ops = {
- .init_IRQ = native_init_IRQ,
- .save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
- .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
- .irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable),
- .irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable),
- .safe_halt = native_safe_halt,
- .halt = native_halt,
-#ifdef CONFIG_X86_64
- .adjust_exception_frame = paravirt_nop,
+struct paravirt_patch_template pv_ops = {
+ /* Cpu ops. */
+ .cpu.io_delay = native_io_delay,
+
+#ifdef CONFIG_PARAVIRT_XXL
+ .cpu.cpuid = native_cpuid,
+ .cpu.get_debugreg = pv_native_get_debugreg,
+ .cpu.set_debugreg = pv_native_set_debugreg,
+ .cpu.read_cr0 = native_read_cr0,
+ .cpu.write_cr0 = native_write_cr0,
+ .cpu.write_cr4 = native_write_cr4,
+ .cpu.read_msr = native_read_msr,
+ .cpu.write_msr = native_write_msr,
+ .cpu.read_msr_safe = native_read_msr_safe,
+ .cpu.write_msr_safe = native_write_msr_safe,
+ .cpu.read_pmc = native_read_pmc,
+ .cpu.load_tr_desc = native_load_tr_desc,
+ .cpu.set_ldt = native_set_ldt,
+ .cpu.load_gdt = native_load_gdt,
+ .cpu.load_idt = native_load_idt,
+ .cpu.store_tr = native_store_tr,
+ .cpu.load_tls = native_load_tls,
+ .cpu.load_gs_index = native_load_gs_index,
+ .cpu.write_ldt_entry = native_write_ldt_entry,
+ .cpu.write_gdt_entry = native_write_gdt_entry,
+ .cpu.write_idt_entry = native_write_idt_entry,
+
+ .cpu.alloc_ldt = paravirt_nop,
+ .cpu.free_ldt = paravirt_nop,
+
+ .cpu.load_sp0 = native_load_sp0,
+
+#ifdef CONFIG_X86_IOPL_IOPERM
+ .cpu.invalidate_io_bitmap = native_tss_invalidate_io_bitmap,
+ .cpu.update_io_bitmap = native_tss_update_io_bitmap,
#endif
-};
-struct pv_cpu_ops pv_cpu_ops = {
- .cpuid = native_cpuid,
- .get_debugreg = native_get_debugreg,
- .set_debugreg = native_set_debugreg,
- .clts = native_clts,
- .read_cr0 = native_read_cr0,
- .write_cr0 = native_write_cr0,
- .read_cr4 = native_read_cr4,
- .read_cr4_safe = native_read_cr4_safe,
- .write_cr4 = native_write_cr4,
-#ifdef CONFIG_X86_64
- .read_cr8 = native_read_cr8,
- .write_cr8 = native_write_cr8,
-#endif
- .wbinvd = native_wbinvd,
- .read_msr = native_read_msr_safe,
- .read_msr_amd = native_read_msr_amd_safe,
- .write_msr = native_write_msr_safe,
- .read_tsc = native_read_tsc,
- .read_pmc = native_read_pmc,
- .read_tscp = native_read_tscp,
- .load_tr_desc = native_load_tr_desc,
- .set_ldt = native_set_ldt,
- .load_gdt = native_load_gdt,
- .load_idt = native_load_idt,
- .store_gdt = native_store_gdt,
- .store_idt = native_store_idt,
- .store_tr = native_store_tr,
- .load_tls = native_load_tls,
-#ifdef CONFIG_X86_64
- .load_gs_index = native_load_gs_index,
-#endif
- .write_ldt_entry = native_write_ldt_entry,
- .write_gdt_entry = native_write_gdt_entry,
- .write_idt_entry = native_write_idt_entry,
+ .cpu.start_context_switch = paravirt_nop,
+ .cpu.end_context_switch = paravirt_nop,
- .alloc_ldt = paravirt_nop,
- .free_ldt = paravirt_nop,
+ /* Irq ops. */
+ .irq.save_fl = __PV_IS_CALLEE_SAVE(pv_native_save_fl),
+ .irq.irq_disable = __PV_IS_CALLEE_SAVE(pv_native_irq_disable),
+ .irq.irq_enable = __PV_IS_CALLEE_SAVE(pv_native_irq_enable),
+#endif /* CONFIG_PARAVIRT_XXL */
- .load_sp0 = native_load_sp0,
+ /* Irq HLT ops. */
+ .irq.safe_halt = pv_native_safe_halt,
+ .irq.halt = native_halt,
-#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
- .irq_enable_sysexit = native_irq_enable_sysexit,
-#endif
-#ifdef CONFIG_X86_64
-#ifdef CONFIG_IA32_EMULATION
- .usergs_sysret32 = native_usergs_sysret32,
-#endif
- .usergs_sysret64 = native_usergs_sysret64,
-#endif
- .iret = native_iret,
- .swapgs = native_swapgs,
+ /* Mmu ops. */
+ .mmu.flush_tlb_user = native_flush_tlb_local,
+ .mmu.flush_tlb_kernel = native_flush_tlb_global,
+ .mmu.flush_tlb_one_user = native_flush_tlb_one_user,
+ .mmu.flush_tlb_multi = native_flush_tlb_multi,
- .set_iopl_mask = native_set_iopl_mask,
- .io_delay = native_io_delay,
+ .mmu.exit_mmap = paravirt_nop,
+ .mmu.notify_page_enc_status_changed = paravirt_nop,
- .start_context_switch = paravirt_nop,
- .end_context_switch = paravirt_nop,
-};
+#ifdef CONFIG_PARAVIRT_XXL
+ .mmu.read_cr2 = __PV_IS_CALLEE_SAVE(pv_native_read_cr2),
+ .mmu.write_cr2 = pv_native_write_cr2,
+ .mmu.read_cr3 = pv_native_read_cr3,
+ .mmu.write_cr3 = pv_native_write_cr3,
-struct pv_apic_ops pv_apic_ops = {
-#ifdef CONFIG_X86_LOCAL_APIC
- .setup_boot_clock = setup_boot_APIC_clock,
- .setup_secondary_clock = setup_secondary_APIC_clock,
- .startup_ipi_hook = paravirt_nop,
-#endif
-};
+ .mmu.pgd_alloc = __paravirt_pgd_alloc,
+ .mmu.pgd_free = paravirt_nop,
-#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE)
-/* 32-bit pagetable entries */
-#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_32)
-#else
-/* 64-bit pagetable entries */
-#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64)
-#endif
+ .mmu.alloc_pte = paravirt_nop,
+ .mmu.alloc_pmd = paravirt_nop,
+ .mmu.alloc_pud = paravirt_nop,
+ .mmu.alloc_p4d = paravirt_nop,
+ .mmu.release_pte = paravirt_nop,
+ .mmu.release_pmd = paravirt_nop,
+ .mmu.release_pud = paravirt_nop,
+ .mmu.release_p4d = paravirt_nop,
-struct pv_mmu_ops pv_mmu_ops = {
-#ifndef CONFIG_X86_64
- .pagetable_setup_start = native_pagetable_setup_start,
- .pagetable_setup_done = native_pagetable_setup_done,
-#else
- .pagetable_setup_start = paravirt_nop,
- .pagetable_setup_done = paravirt_nop,
-#endif
+ .mmu.set_pte = native_set_pte,
+ .mmu.set_pmd = native_set_pmd,
- .read_cr2 = native_read_cr2,
- .write_cr2 = native_write_cr2,
- .read_cr3 = native_read_cr3,
- .write_cr3 = native_write_cr3,
-
- .flush_tlb_user = native_flush_tlb,
- .flush_tlb_kernel = native_flush_tlb_global,
- .flush_tlb_single = native_flush_tlb_single,
- .flush_tlb_others = native_flush_tlb_others,
-
- .pgd_alloc = __paravirt_pgd_alloc,
- .pgd_free = paravirt_nop,
-
- .alloc_pte = paravirt_nop,
- .alloc_pmd = paravirt_nop,
- .alloc_pmd_clone = paravirt_nop,
- .alloc_pud = paravirt_nop,
- .release_pte = paravirt_nop,
- .release_pmd = paravirt_nop,
- .release_pud = paravirt_nop,
-
- .set_pte = native_set_pte,
- .set_pte_at = native_set_pte_at,
- .set_pmd = native_set_pmd,
- .pte_update = paravirt_nop,
- .pte_update_defer = paravirt_nop,
-
- .ptep_modify_prot_start = __ptep_modify_prot_start,
- .ptep_modify_prot_commit = __ptep_modify_prot_commit,
-
-#ifdef CONFIG_HIGHPTE
- .kmap_atomic_pte = kmap_atomic,
-#endif
+ .mmu.ptep_modify_prot_start = __ptep_modify_prot_start,
+ .mmu.ptep_modify_prot_commit = __ptep_modify_prot_commit,
-#if PAGETABLE_LEVELS >= 3
-#ifdef CONFIG_X86_PAE
- .set_pte_atomic = native_set_pte_atomic,
- .pte_clear = native_pte_clear,
- .pmd_clear = native_pmd_clear,
-#endif
- .set_pud = native_set_pud,
+ .mmu.set_pud = native_set_pud,
- .pmd_val = PTE_IDENT,
- .make_pmd = PTE_IDENT,
+ .mmu.pmd_val = PTE_IDENT,
+ .mmu.make_pmd = PTE_IDENT,
-#if PAGETABLE_LEVELS == 4
- .pud_val = PTE_IDENT,
- .make_pud = PTE_IDENT,
+ .mmu.pud_val = PTE_IDENT,
+ .mmu.make_pud = PTE_IDENT,
- .set_pgd = native_set_pgd,
-#endif
-#endif /* PAGETABLE_LEVELS >= 3 */
+ .mmu.set_p4d = native_set_p4d,
+
+ .mmu.p4d_val = PTE_IDENT,
+ .mmu.make_p4d = PTE_IDENT,
+
+ .mmu.set_pgd = native_set_pgd,
- .pte_val = PTE_IDENT,
- .pgd_val = PTE_IDENT,
+ .mmu.pte_val = PTE_IDENT,
+ .mmu.pgd_val = PTE_IDENT,
- .make_pte = PTE_IDENT,
- .make_pgd = PTE_IDENT,
+ .mmu.make_pte = PTE_IDENT,
+ .mmu.make_pgd = PTE_IDENT,
- .dup_mmap = paravirt_nop,
- .exit_mmap = paravirt_nop,
- .activate_mm = paravirt_nop,
+ .mmu.enter_mmap = paravirt_nop,
- .lazy_mode = {
- .enter = paravirt_nop,
- .leave = paravirt_nop,
+ .mmu.lazy_mode = {
+ .enter = paravirt_nop,
+ .leave = paravirt_nop,
+ .flush = paravirt_nop,
},
- .set_fixmap = native_set_fixmap,
+ .mmu.set_fixmap = native_set_fixmap,
+#endif /* CONFIG_PARAVIRT_XXL */
+
+#if defined(CONFIG_PARAVIRT_SPINLOCKS)
+ /* Lock ops. */
+#ifdef CONFIG_SMP
+ .lock.queued_spin_lock_slowpath = native_queued_spin_lock_slowpath,
+ .lock.queued_spin_unlock =
+ PV_CALLEE_SAVE(__native_queued_spin_unlock),
+ .lock.wait = paravirt_nop,
+ .lock.kick = paravirt_nop,
+ .lock.vcpu_is_preempted =
+ PV_CALLEE_SAVE(__native_vcpu_is_preempted),
+#endif /* SMP */
+#endif
};
-EXPORT_SYMBOL_GPL(pv_time_ops);
-EXPORT_SYMBOL (pv_cpu_ops);
-EXPORT_SYMBOL (pv_mmu_ops);
-EXPORT_SYMBOL_GPL(pv_apic_ops);
+#ifdef CONFIG_PARAVIRT_XXL
+NOKPROBE_SYMBOL(native_load_idt);
+#endif
+
+EXPORT_SYMBOL(pv_ops);
EXPORT_SYMBOL_GPL(pv_info);
-EXPORT_SYMBOL (pv_irq_ops);
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
deleted file mode 100644
index d9f32e6d6ab6..000000000000
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ /dev/null
@@ -1,61 +0,0 @@
-#include <asm/paravirt.h>
-
-DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
-DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
-DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf");
-DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax");
-DEF_NATIVE(pv_cpu_ops, iret, "iret");
-DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "sti; sysexit");
-DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
-DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
-DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
-DEF_NATIVE(pv_cpu_ops, clts, "clts");
-DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
-
-unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
-{
- /* arg in %eax, return in %eax */
- return 0;
-}
-
-unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
-{
- /* arg in %edx:%eax, return in %edx:%eax */
- return 0;
-}
-
-unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
- unsigned long addr, unsigned len)
-{
- const unsigned char *start, *end;
- unsigned ret;
-
-#define PATCH_SITE(ops, x) \
- case PARAVIRT_PATCH(ops.x): \
- start = start_##ops##_##x; \
- end = end_##ops##_##x; \
- goto patch_site
- switch (type) {
- PATCH_SITE(pv_irq_ops, irq_disable);
- PATCH_SITE(pv_irq_ops, irq_enable);
- PATCH_SITE(pv_irq_ops, restore_fl);
- PATCH_SITE(pv_irq_ops, save_fl);
- PATCH_SITE(pv_cpu_ops, iret);
- PATCH_SITE(pv_cpu_ops, irq_enable_sysexit);
- PATCH_SITE(pv_mmu_ops, read_cr2);
- PATCH_SITE(pv_mmu_ops, read_cr3);
- PATCH_SITE(pv_mmu_ops, write_cr3);
- PATCH_SITE(pv_cpu_ops, clts);
- PATCH_SITE(pv_cpu_ops, read_tsc);
-
- patch_site:
- ret = paravirt_patch_insns(ibuf, len, start, end);
- break;
-
- default:
- ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
- break;
- }
-#undef PATCH_SITE
- return ret;
-}
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
deleted file mode 100644
index 3f08f34f93eb..000000000000
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ /dev/null
@@ -1,75 +0,0 @@
-#include <asm/paravirt.h>
-#include <asm/asm-offsets.h>
-#include <linux/stringify.h>
-
-DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
-DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
-DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq");
-DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
-DEF_NATIVE(pv_cpu_ops, iret, "iretq");
-DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
-DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
-DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
-DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
-DEF_NATIVE(pv_cpu_ops, clts, "clts");
-DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
-
-DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "swapgs; sti; sysexit");
-DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
-DEF_NATIVE(pv_cpu_ops, usergs_sysret32, "swapgs; sysretl");
-DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");
-
-DEF_NATIVE(, mov32, "mov %edi, %eax");
-DEF_NATIVE(, mov64, "mov %rdi, %rax");
-
-unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
-{
- return paravirt_patch_insns(insnbuf, len,
- start__mov32, end__mov32);
-}
-
-unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
-{
- return paravirt_patch_insns(insnbuf, len,
- start__mov64, end__mov64);
-}
-
-unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
- unsigned long addr, unsigned len)
-{
- const unsigned char *start, *end;
- unsigned ret;
-
-#define PATCH_SITE(ops, x) \
- case PARAVIRT_PATCH(ops.x): \
- start = start_##ops##_##x; \
- end = end_##ops##_##x; \
- goto patch_site
- switch(type) {
- PATCH_SITE(pv_irq_ops, restore_fl);
- PATCH_SITE(pv_irq_ops, save_fl);
- PATCH_SITE(pv_irq_ops, irq_enable);
- PATCH_SITE(pv_irq_ops, irq_disable);
- PATCH_SITE(pv_cpu_ops, iret);
- PATCH_SITE(pv_cpu_ops, irq_enable_sysexit);
- PATCH_SITE(pv_cpu_ops, usergs_sysret32);
- PATCH_SITE(pv_cpu_ops, usergs_sysret64);
- PATCH_SITE(pv_cpu_ops, swapgs);
- PATCH_SITE(pv_mmu_ops, read_cr2);
- PATCH_SITE(pv_mmu_ops, read_cr3);
- PATCH_SITE(pv_mmu_ops, write_cr3);
- PATCH_SITE(pv_cpu_ops, clts);
- PATCH_SITE(pv_mmu_ops, flush_tlb_single);
- PATCH_SITE(pv_cpu_ops, wbinvd);
-
- patch_site:
- ret = paravirt_patch_insns(ibuf, len, start, end);
- break;
-
- default:
- ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
- break;
- }
-#undef PATCH_SITE
- return ret;
-}
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
deleted file mode 100644
index 971a3bec47a8..000000000000
--- a/arch/x86/kernel/pci-calgary_64.c
+++ /dev/null
@@ -1,1607 +0,0 @@
-/*
- * Derived from arch/powerpc/kernel/iommu.c
- *
- * Copyright IBM Corporation, 2006-2007
- * Copyright (C) 2006 Jon Mason <jdmason@kudzu.us>
- *
- * Author: Jon Mason <jdmason@kudzu.us>
- * Author: Muli Ben-Yehuda <muli@il.ibm.com>
-
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/mm.h>
-#include <linux/spinlock.h>
-#include <linux/string.h>
-#include <linux/crash_dump.h>
-#include <linux/dma-mapping.h>
-#include <linux/bitops.h>
-#include <linux/pci_ids.h>
-#include <linux/pci.h>
-#include <linux/delay.h>
-#include <linux/scatterlist.h>
-#include <linux/iommu-helper.h>
-
-#include <asm/iommu.h>
-#include <asm/calgary.h>
-#include <asm/tce.h>
-#include <asm/pci-direct.h>
-#include <asm/system.h>
-#include <asm/dma.h>
-#include <asm/rio.h>
-#include <asm/bios_ebda.h>
-
-#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT
-int use_calgary __read_mostly = 1;
-#else
-int use_calgary __read_mostly = 0;
-#endif /* CONFIG_CALGARY_DEFAULT_ENABLED */
-
-#define PCI_DEVICE_ID_IBM_CALGARY 0x02a1
-#define PCI_DEVICE_ID_IBM_CALIOC2 0x0308
-
-/* register offsets inside the host bridge space */
-#define CALGARY_CONFIG_REG 0x0108
-#define PHB_CSR_OFFSET 0x0110 /* Channel Status */
-#define PHB_PLSSR_OFFSET 0x0120
-#define PHB_CONFIG_RW_OFFSET 0x0160
-#define PHB_IOBASE_BAR_LOW 0x0170
-#define PHB_IOBASE_BAR_HIGH 0x0180
-#define PHB_MEM_1_LOW 0x0190
-#define PHB_MEM_1_HIGH 0x01A0
-#define PHB_IO_ADDR_SIZE 0x01B0
-#define PHB_MEM_1_SIZE 0x01C0
-#define PHB_MEM_ST_OFFSET 0x01D0
-#define PHB_AER_OFFSET 0x0200
-#define PHB_CONFIG_0_HIGH 0x0220
-#define PHB_CONFIG_0_LOW 0x0230
-#define PHB_CONFIG_0_END 0x0240
-#define PHB_MEM_2_LOW 0x02B0
-#define PHB_MEM_2_HIGH 0x02C0
-#define PHB_MEM_2_SIZE_HIGH 0x02D0
-#define PHB_MEM_2_SIZE_LOW 0x02E0
-#define PHB_DOSHOLE_OFFSET 0x08E0
-
-/* CalIOC2 specific */
-#define PHB_SAVIOR_L2 0x0DB0
-#define PHB_PAGE_MIG_CTRL 0x0DA8
-#define PHB_PAGE_MIG_DEBUG 0x0DA0
-#define PHB_ROOT_COMPLEX_STATUS 0x0CB0
-
-/* PHB_CONFIG_RW */
-#define PHB_TCE_ENABLE 0x20000000
-#define PHB_SLOT_DISABLE 0x1C000000
-#define PHB_DAC_DISABLE 0x01000000
-#define PHB_MEM2_ENABLE 0x00400000
-#define PHB_MCSR_ENABLE 0x00100000
-/* TAR (Table Address Register) */
-#define TAR_SW_BITS 0x0000ffffffff800fUL
-#define TAR_VALID 0x0000000000000008UL
-/* CSR (Channel/DMA Status Register) */
-#define CSR_AGENT_MASK 0xffe0ffff
-/* CCR (Calgary Configuration Register) */
-#define CCR_2SEC_TIMEOUT 0x000000000000000EUL
-/* PMCR/PMDR (Page Migration Control/Debug Registers */
-#define PMR_SOFTSTOP 0x80000000
-#define PMR_SOFTSTOPFAULT 0x40000000
-#define PMR_HARDSTOP 0x20000000
-
-#define MAX_NUM_OF_PHBS 8 /* how many PHBs in total? */
-#define MAX_NUM_CHASSIS 8 /* max number of chassis */
-/* MAX_PHB_BUS_NUM is the maximal possible dev->bus->number */
-#define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * MAX_NUM_CHASSIS * 2)
-#define PHBS_PER_CALGARY 4
-
-/* register offsets in Calgary's internal register space */
-static const unsigned long tar_offsets[] = {
- 0x0580 /* TAR0 */,
- 0x0588 /* TAR1 */,
- 0x0590 /* TAR2 */,
- 0x0598 /* TAR3 */
-};
-
-static const unsigned long split_queue_offsets[] = {
- 0x4870 /* SPLIT QUEUE 0 */,
- 0x5870 /* SPLIT QUEUE 1 */,
- 0x6870 /* SPLIT QUEUE 2 */,
- 0x7870 /* SPLIT QUEUE 3 */
-};
-
-static const unsigned long phb_offsets[] = {
- 0x8000 /* PHB0 */,
- 0x9000 /* PHB1 */,
- 0xA000 /* PHB2 */,
- 0xB000 /* PHB3 */
-};
-
-/* PHB debug registers */
-
-static const unsigned long phb_debug_offsets[] = {
- 0x4000 /* PHB 0 DEBUG */,
- 0x5000 /* PHB 1 DEBUG */,
- 0x6000 /* PHB 2 DEBUG */,
- 0x7000 /* PHB 3 DEBUG */
-};
-
-/*
- * STUFF register for each debug PHB,
- * byte 1 = start bus number, byte 2 = end bus number
- */
-
-#define PHB_DEBUG_STUFF_OFFSET 0x0020
-
-#define EMERGENCY_PAGES 32 /* = 128KB */
-
-unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED;
-static int translate_empty_slots __read_mostly = 0;
-static int calgary_detected __read_mostly = 0;
-
-static struct rio_table_hdr *rio_table_hdr __initdata;
-static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata;
-static struct rio_detail *rio_devs[MAX_NUMNODES * 4] __initdata;
-
-struct calgary_bus_info {
- void *tce_space;
- unsigned char translation_disabled;
- signed char phbid;
- void __iomem *bbar;
-};
-
-static void calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev);
-static void calgary_tce_cache_blast(struct iommu_table *tbl);
-static void calgary_dump_error_regs(struct iommu_table *tbl);
-static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev);
-static void calioc2_tce_cache_blast(struct iommu_table *tbl);
-static void calioc2_dump_error_regs(struct iommu_table *tbl);
-static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl);
-static void get_tce_space_from_tar(void);
-
-static struct cal_chipset_ops calgary_chip_ops = {
- .handle_quirks = calgary_handle_quirks,
- .tce_cache_blast = calgary_tce_cache_blast,
- .dump_error_regs = calgary_dump_error_regs
-};
-
-static struct cal_chipset_ops calioc2_chip_ops = {
- .handle_quirks = calioc2_handle_quirks,
- .tce_cache_blast = calioc2_tce_cache_blast,
- .dump_error_regs = calioc2_dump_error_regs
-};
-
-static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, };
-
-static inline int translation_enabled(struct iommu_table *tbl)
-{
- /* only PHBs with translation enabled have an IOMMU table */
- return (tbl != NULL);
-}
-
-static void iommu_range_reserve(struct iommu_table *tbl,
- unsigned long start_addr, unsigned int npages)
-{
- unsigned long index;
- unsigned long end;
- unsigned long flags;
-
- index = start_addr >> PAGE_SHIFT;
-
- /* bail out if we're asked to reserve a region we don't cover */
- if (index >= tbl->it_size)
- return;
-
- end = index + npages;
- if (end > tbl->it_size) /* don't go off the table */
- end = tbl->it_size;
-
- spin_lock_irqsave(&tbl->it_lock, flags);
-
- iommu_area_reserve(tbl->it_map, index, npages);
-
- spin_unlock_irqrestore(&tbl->it_lock, flags);
-}
-
-static unsigned long iommu_range_alloc(struct device *dev,
- struct iommu_table *tbl,
- unsigned int npages)
-{
- unsigned long flags;
- unsigned long offset;
- unsigned long boundary_size;
-
- boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
- PAGE_SIZE) >> PAGE_SHIFT;
-
- BUG_ON(npages == 0);
-
- spin_lock_irqsave(&tbl->it_lock, flags);
-
- offset = iommu_area_alloc(tbl->it_map, tbl->it_size, tbl->it_hint,
- npages, 0, boundary_size, 0);
- if (offset == ~0UL) {
- tbl->chip_ops->tce_cache_blast(tbl);
-
- offset = iommu_area_alloc(tbl->it_map, tbl->it_size, 0,
- npages, 0, boundary_size, 0);
- if (offset == ~0UL) {
- printk(KERN_WARNING "Calgary: IOMMU full.\n");
- spin_unlock_irqrestore(&tbl->it_lock, flags);
- if (panic_on_overflow)
- panic("Calgary: fix the allocator.\n");
- else
- return bad_dma_address;
- }
- }
-
- tbl->it_hint = offset + npages;
- BUG_ON(tbl->it_hint > tbl->it_size);
-
- spin_unlock_irqrestore(&tbl->it_lock, flags);
-
- return offset;
-}
-
-static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
- void *vaddr, unsigned int npages, int direction)
-{
- unsigned long entry;
- dma_addr_t ret = bad_dma_address;
-
- entry = iommu_range_alloc(dev, tbl, npages);
-
- if (unlikely(entry == bad_dma_address))
- goto error;
-
- /* set the return dma address */
- ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK);
-
- /* put the TCEs in the HW table */
- tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK,
- direction);
-
- return ret;
-
-error:
- printk(KERN_WARNING "Calgary: failed to allocate %u pages in "
- "iommu %p\n", npages, tbl);
- return bad_dma_address;
-}
-
-static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
- unsigned int npages)
-{
- unsigned long entry;
- unsigned long badend;
- unsigned long flags;
-
- /* were we called with bad_dma_address? */
- badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE);
- if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) {
- WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA "
- "address 0x%Lx\n", dma_addr);
- return;
- }
-
- entry = dma_addr >> PAGE_SHIFT;
-
- BUG_ON(entry + npages > tbl->it_size);
-
- tce_free(tbl, entry, npages);
-
- spin_lock_irqsave(&tbl->it_lock, flags);
-
- iommu_area_free(tbl->it_map, entry, npages);
-
- spin_unlock_irqrestore(&tbl->it_lock, flags);
-}
-
-static inline struct iommu_table *find_iommu_table(struct device *dev)
-{
- struct pci_dev *pdev;
- struct pci_bus *pbus;
- struct iommu_table *tbl;
-
- pdev = to_pci_dev(dev);
-
- pbus = pdev->bus;
-
- /* is the device behind a bridge? Look for the root bus */
- while (pbus->parent)
- pbus = pbus->parent;
-
- tbl = pci_iommu(pbus);
-
- BUG_ON(tbl && (tbl->it_busno != pbus->number));
-
- return tbl;
-}
-
-static void calgary_unmap_sg(struct device *dev, struct scatterlist *sglist,
- int nelems,enum dma_data_direction dir,
- struct dma_attrs *attrs)
-{
- struct iommu_table *tbl = find_iommu_table(dev);
- struct scatterlist *s;
- int i;
-
- if (!translation_enabled(tbl))
- return;
-
- for_each_sg(sglist, s, nelems, i) {
- unsigned int npages;
- dma_addr_t dma = s->dma_address;
- unsigned int dmalen = s->dma_length;
-
- if (dmalen == 0)
- break;
-
- npages = iommu_num_pages(dma, dmalen, PAGE_SIZE);
- iommu_free(tbl, dma, npages);
- }
-}
-
-static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
- int nelems, enum dma_data_direction dir,
- struct dma_attrs *attrs)
-{
- struct iommu_table *tbl = find_iommu_table(dev);
- struct scatterlist *s;
- unsigned long vaddr;
- unsigned int npages;
- unsigned long entry;
- int i;
-
- for_each_sg(sg, s, nelems, i) {
- BUG_ON(!sg_page(s));
-
- vaddr = (unsigned long) sg_virt(s);
- npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE);
-
- entry = iommu_range_alloc(dev, tbl, npages);
- if (entry == bad_dma_address) {
- /* makes sure unmap knows to stop */
- s->dma_length = 0;
- goto error;
- }
-
- s->dma_address = (entry << PAGE_SHIFT) | s->offset;
-
- /* insert into HW table */
- tce_build(tbl, entry, npages, vaddr & PAGE_MASK, dir);
-
- s->dma_length = s->length;
- }
-
- return nelems;
-error:
- calgary_unmap_sg(dev, sg, nelems, dir, NULL);
- for_each_sg(sg, s, nelems, i) {
- sg->dma_address = bad_dma_address;
- sg->dma_length = 0;
- }
- return 0;
-}
-
-static dma_addr_t calgary_map_page(struct device *dev, struct page *page,
- unsigned long offset, size_t size,
- enum dma_data_direction dir,
- struct dma_attrs *attrs)
-{
- void *vaddr = page_address(page) + offset;
- unsigned long uaddr;
- unsigned int npages;
- struct iommu_table *tbl = find_iommu_table(dev);
-
- uaddr = (unsigned long)vaddr;
- npages = iommu_num_pages(uaddr, size, PAGE_SIZE);
-
- return iommu_alloc(dev, tbl, vaddr, npages, dir);
-}
-
-static void calgary_unmap_page(struct device *dev, dma_addr_t dma_addr,
- size_t size, enum dma_data_direction dir,
- struct dma_attrs *attrs)
-{
- struct iommu_table *tbl = find_iommu_table(dev);
- unsigned int npages;
-
- npages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
- iommu_free(tbl, dma_addr, npages);
-}
-
-static void* calgary_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t *dma_handle, gfp_t flag)
-{
- void *ret = NULL;
- dma_addr_t mapping;
- unsigned int npages, order;
- struct iommu_table *tbl = find_iommu_table(dev);
-
- size = PAGE_ALIGN(size); /* size rounded up to full pages */
- npages = size >> PAGE_SHIFT;
- order = get_order(size);
-
- flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
-
- /* alloc enough pages (and possibly more) */
- ret = (void *)__get_free_pages(flag, order);
- if (!ret)
- goto error;
- memset(ret, 0, size);
-
- /* set up tces to cover the allocated range */
- mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
- if (mapping == bad_dma_address)
- goto free;
- *dma_handle = mapping;
- return ret;
-free:
- free_pages((unsigned long)ret, get_order(size));
- ret = NULL;
-error:
- return ret;
-}
-
-static void calgary_free_coherent(struct device *dev, size_t size,
- void *vaddr, dma_addr_t dma_handle)
-{
- unsigned int npages;
- struct iommu_table *tbl = find_iommu_table(dev);
-
- size = PAGE_ALIGN(size);
- npages = size >> PAGE_SHIFT;
-
- iommu_free(tbl, dma_handle, npages);
- free_pages((unsigned long)vaddr, get_order(size));
-}
-
-static struct dma_map_ops calgary_dma_ops = {
- .alloc_coherent = calgary_alloc_coherent,
- .free_coherent = calgary_free_coherent,
- .map_sg = calgary_map_sg,
- .unmap_sg = calgary_unmap_sg,
- .map_page = calgary_map_page,
- .unmap_page = calgary_unmap_page,
-};
-
-static inline void __iomem * busno_to_bbar(unsigned char num)
-{
- return bus_info[num].bbar;
-}
-
-static inline int busno_to_phbid(unsigned char num)
-{
- return bus_info[num].phbid;
-}
-
-static inline unsigned long split_queue_offset(unsigned char num)
-{
- size_t idx = busno_to_phbid(num);
-
- return split_queue_offsets[idx];
-}
-
-static inline unsigned long tar_offset(unsigned char num)
-{
- size_t idx = busno_to_phbid(num);
-
- return tar_offsets[idx];
-}
-
-static inline unsigned long phb_offset(unsigned char num)
-{
- size_t idx = busno_to_phbid(num);
-
- return phb_offsets[idx];
-}
-
-static inline void __iomem* calgary_reg(void __iomem *bar, unsigned long offset)
-{
- unsigned long target = ((unsigned long)bar) | offset;
- return (void __iomem*)target;
-}
-
-static inline int is_calioc2(unsigned short device)
-{
- return (device == PCI_DEVICE_ID_IBM_CALIOC2);
-}
-
-static inline int is_calgary(unsigned short device)
-{
- return (device == PCI_DEVICE_ID_IBM_CALGARY);
-}
-
-static inline int is_cal_pci_dev(unsigned short device)
-{
- return (is_calgary(device) || is_calioc2(device));
-}
-
-static void calgary_tce_cache_blast(struct iommu_table *tbl)
-{
- u64 val;
- u32 aer;
- int i = 0;
- void __iomem *bbar = tbl->bbar;
- void __iomem *target;
-
- /* disable arbitration on the bus */
- target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET);
- aer = readl(target);
- writel(0, target);
-
- /* read plssr to ensure it got there */
- target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET);
- val = readl(target);
-
- /* poll split queues until all DMA activity is done */
- target = calgary_reg(bbar, split_queue_offset(tbl->it_busno));
- do {
- val = readq(target);
- i++;
- } while ((val & 0xff) != 0xff && i < 100);
- if (i == 100)
- printk(KERN_WARNING "Calgary: PCI bus not quiesced, "
- "continuing anyway\n");
-
- /* invalidate TCE cache */
- target = calgary_reg(bbar, tar_offset(tbl->it_busno));
- writeq(tbl->tar_val, target);
-
- /* enable arbitration */
- target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET);
- writel(aer, target);
- (void)readl(target); /* flush */
-}
-
-static void calioc2_tce_cache_blast(struct iommu_table *tbl)
-{
- void __iomem *bbar = tbl->bbar;
- void __iomem *target;
- u64 val64;
- u32 val;
- int i = 0;
- int count = 1;
- unsigned char bus = tbl->it_busno;
-
-begin:
- printk(KERN_DEBUG "Calgary: CalIOC2 bus 0x%x entering tce cache blast "
- "sequence - count %d\n", bus, count);
-
- /* 1. using the Page Migration Control reg set SoftStop */
- target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
- val = be32_to_cpu(readl(target));
- printk(KERN_DEBUG "1a. read 0x%x [LE] from %p\n", val, target);
- val |= PMR_SOFTSTOP;
- printk(KERN_DEBUG "1b. writing 0x%x [LE] to %p\n", val, target);
- writel(cpu_to_be32(val), target);
-
- /* 2. poll split queues until all DMA activity is done */
- printk(KERN_DEBUG "2a. starting to poll split queues\n");
- target = calgary_reg(bbar, split_queue_offset(bus));
- do {
- val64 = readq(target);
- i++;
- } while ((val64 & 0xff) != 0xff && i < 100);
- if (i == 100)
- printk(KERN_WARNING "CalIOC2: PCI bus not quiesced, "
- "continuing anyway\n");
-
- /* 3. poll Page Migration DEBUG for SoftStopFault */
- target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG);
- val = be32_to_cpu(readl(target));
- printk(KERN_DEBUG "3. read 0x%x [LE] from %p\n", val, target);
-
- /* 4. if SoftStopFault - goto (1) */
- if (val & PMR_SOFTSTOPFAULT) {
- if (++count < 100)
- goto begin;
- else {
- printk(KERN_WARNING "CalIOC2: too many SoftStopFaults, "
- "aborting TCE cache flush sequence!\n");
- return; /* pray for the best */
- }
- }
-
- /* 5. Slam into HardStop by reading PHB_PAGE_MIG_CTRL */
- target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
- printk(KERN_DEBUG "5a. slamming into HardStop by reading %p\n", target);
- val = be32_to_cpu(readl(target));
- printk(KERN_DEBUG "5b. read 0x%x [LE] from %p\n", val, target);
- target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG);
- val = be32_to_cpu(readl(target));
- printk(KERN_DEBUG "5c. read 0x%x [LE] from %p (debug)\n", val, target);
-
- /* 6. invalidate TCE cache */
- printk(KERN_DEBUG "6. invalidating TCE cache\n");
- target = calgary_reg(bbar, tar_offset(bus));
- writeq(tbl->tar_val, target);
-
- /* 7. Re-read PMCR */
- printk(KERN_DEBUG "7a. Re-reading PMCR\n");
- target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
- val = be32_to_cpu(readl(target));
- printk(KERN_DEBUG "7b. read 0x%x [LE] from %p\n", val, target);
-
- /* 8. Remove HardStop */
- printk(KERN_DEBUG "8a. removing HardStop from PMCR\n");
- target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
- val = 0;
- printk(KERN_DEBUG "8b. writing 0x%x [LE] to %p\n", val, target);
- writel(cpu_to_be32(val), target);
- val = be32_to_cpu(readl(target));
- printk(KERN_DEBUG "8c. read 0x%x [LE] from %p\n", val, target);
-}
-
-static void __init calgary_reserve_mem_region(struct pci_dev *dev, u64 start,
- u64 limit)
-{
- unsigned int numpages;
-
- limit = limit | 0xfffff;
- limit++;
-
- numpages = ((limit - start) >> PAGE_SHIFT);
- iommu_range_reserve(pci_iommu(dev->bus), start, numpages);
-}
-
-static void __init calgary_reserve_peripheral_mem_1(struct pci_dev *dev)
-{
- void __iomem *target;
- u64 low, high, sizelow;
- u64 start, limit;
- struct iommu_table *tbl = pci_iommu(dev->bus);
- unsigned char busnum = dev->bus->number;
- void __iomem *bbar = tbl->bbar;
-
- /* peripheral MEM_1 region */
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_LOW);
- low = be32_to_cpu(readl(target));
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_HIGH);
- high = be32_to_cpu(readl(target));
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_SIZE);
- sizelow = be32_to_cpu(readl(target));
-
- start = (high << 32) | low;
- limit = sizelow;
-
- calgary_reserve_mem_region(dev, start, limit);
-}
-
-static void __init calgary_reserve_peripheral_mem_2(struct pci_dev *dev)
-{
- void __iomem *target;
- u32 val32;
- u64 low, high, sizelow, sizehigh;
- u64 start, limit;
- struct iommu_table *tbl = pci_iommu(dev->bus);
- unsigned char busnum = dev->bus->number;
- void __iomem *bbar = tbl->bbar;
-
- /* is it enabled? */
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET);
- val32 = be32_to_cpu(readl(target));
- if (!(val32 & PHB_MEM2_ENABLE))
- return;
-
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_LOW);
- low = be32_to_cpu(readl(target));
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_HIGH);
- high = be32_to_cpu(readl(target));
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_LOW);
- sizelow = be32_to_cpu(readl(target));
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_HIGH);
- sizehigh = be32_to_cpu(readl(target));
-
- start = (high << 32) | low;
- limit = (sizehigh << 32) | sizelow;
-
- calgary_reserve_mem_region(dev, start, limit);
-}
-
-/*
- * some regions of the IO address space do not get translated, so we
- * must not give devices IO addresses in those regions. The regions
- * are the 640KB-1MB region and the two PCI peripheral memory holes.
- * Reserve all of them in the IOMMU bitmap to avoid giving them out
- * later.
- */
-static void __init calgary_reserve_regions(struct pci_dev *dev)
-{
- unsigned int npages;
- u64 start;
- struct iommu_table *tbl = pci_iommu(dev->bus);
-
- /* reserve EMERGENCY_PAGES from bad_dma_address and up */
- iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES);
-
- /* avoid the BIOS/VGA first 640KB-1MB region */
- /* for CalIOC2 - avoid the entire first MB */
- if (is_calgary(dev->device)) {
- start = (640 * 1024);
- npages = ((1024 - 640) * 1024) >> PAGE_SHIFT;
- } else { /* calioc2 */
- start = 0;
- npages = (1 * 1024 * 1024) >> PAGE_SHIFT;
- }
- iommu_range_reserve(tbl, start, npages);
-
- /* reserve the two PCI peripheral memory regions in IO space */
- calgary_reserve_peripheral_mem_1(dev);
- calgary_reserve_peripheral_mem_2(dev);
-}
-
-static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar)
-{
- u64 val64;
- u64 table_phys;
- void __iomem *target;
- int ret;
- struct iommu_table *tbl;
-
- /* build TCE tables for each PHB */
- ret = build_tce_table(dev, bbar);
- if (ret)
- return ret;
-
- tbl = pci_iommu(dev->bus);
- tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space;
-
- if (is_kdump_kernel())
- calgary_init_bitmap_from_tce_table(tbl);
- else
- tce_free(tbl, 0, tbl->it_size);
-
- if (is_calgary(dev->device))
- tbl->chip_ops = &calgary_chip_ops;
- else if (is_calioc2(dev->device))
- tbl->chip_ops = &calioc2_chip_ops;
- else
- BUG();
-
- calgary_reserve_regions(dev);
-
- /* set TARs for each PHB */
- target = calgary_reg(bbar, tar_offset(dev->bus->number));
- val64 = be64_to_cpu(readq(target));
-
- /* zero out all TAR bits under sw control */
- val64 &= ~TAR_SW_BITS;
- table_phys = (u64)__pa(tbl->it_base);
-
- val64 |= table_phys;
-
- BUG_ON(specified_table_size > TCE_TABLE_SIZE_8M);
- val64 |= (u64) specified_table_size;
-
- tbl->tar_val = cpu_to_be64(val64);
-
- writeq(tbl->tar_val, target);
- readq(target); /* flush */
-
- return 0;
-}
-
-static void __init calgary_free_bus(struct pci_dev *dev)
-{
- u64 val64;
- struct iommu_table *tbl = pci_iommu(dev->bus);
- void __iomem *target;
- unsigned int bitmapsz;
-
- target = calgary_reg(tbl->bbar, tar_offset(dev->bus->number));
- val64 = be64_to_cpu(readq(target));
- val64 &= ~TAR_SW_BITS;
- writeq(cpu_to_be64(val64), target);
- readq(target); /* flush */
-
- bitmapsz = tbl->it_size / BITS_PER_BYTE;
- free_pages((unsigned long)tbl->it_map, get_order(bitmapsz));
- tbl->it_map = NULL;
-
- kfree(tbl);
-
- set_pci_iommu(dev->bus, NULL);
-
- /* Can't free bootmem allocated memory after system is up :-( */
- bus_info[dev->bus->number].tce_space = NULL;
-}
-
-static void calgary_dump_error_regs(struct iommu_table *tbl)
-{
- void __iomem *bbar = tbl->bbar;
- void __iomem *target;
- u32 csr, plssr;
-
- target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET);
- csr = be32_to_cpu(readl(target));
-
- target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET);
- plssr = be32_to_cpu(readl(target));
-
- /* If no error, the agent ID in the CSR is not valid */
- printk(KERN_EMERG "Calgary: DMA error on Calgary PHB 0x%x, "
- "0x%08x@CSR 0x%08x@PLSSR\n", tbl->it_busno, csr, plssr);
-}
-
-static void calioc2_dump_error_regs(struct iommu_table *tbl)
-{
- void __iomem *bbar = tbl->bbar;
- u32 csr, csmr, plssr, mck, rcstat;
- void __iomem *target;
- unsigned long phboff = phb_offset(tbl->it_busno);
- unsigned long erroff;
- u32 errregs[7];
- int i;
-
- /* dump CSR */
- target = calgary_reg(bbar, phboff | PHB_CSR_OFFSET);
- csr = be32_to_cpu(readl(target));
- /* dump PLSSR */
- target = calgary_reg(bbar, phboff | PHB_PLSSR_OFFSET);
- plssr = be32_to_cpu(readl(target));
- /* dump CSMR */
- target = calgary_reg(bbar, phboff | 0x290);
- csmr = be32_to_cpu(readl(target));
- /* dump mck */
- target = calgary_reg(bbar, phboff | 0x800);
- mck = be32_to_cpu(readl(target));
-
- printk(KERN_EMERG "Calgary: DMA error on CalIOC2 PHB 0x%x\n",
- tbl->it_busno);
-
- printk(KERN_EMERG "Calgary: 0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n",
- csr, plssr, csmr, mck);
-
- /* dump rest of error regs */
- printk(KERN_EMERG "Calgary: ");
- for (i = 0; i < ARRAY_SIZE(errregs); i++) {
- /* err regs are at 0x810 - 0x870 */
- erroff = (0x810 + (i * 0x10));
- target = calgary_reg(bbar, phboff | erroff);
- errregs[i] = be32_to_cpu(readl(target));
- printk("0x%08x@0x%lx ", errregs[i], erroff);
- }
- printk("\n");
-
- /* root complex status */
- target = calgary_reg(bbar, phboff | PHB_ROOT_COMPLEX_STATUS);
- rcstat = be32_to_cpu(readl(target));
- printk(KERN_EMERG "Calgary: 0x%08x@0x%x\n", rcstat,
- PHB_ROOT_COMPLEX_STATUS);
-}
-
-static void calgary_watchdog(unsigned long data)
-{
- struct pci_dev *dev = (struct pci_dev *)data;
- struct iommu_table *tbl = pci_iommu(dev->bus);
- void __iomem *bbar = tbl->bbar;
- u32 val32;
- void __iomem *target;
-
- target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET);
- val32 = be32_to_cpu(readl(target));
-
- /* If no error, the agent ID in the CSR is not valid */
- if (val32 & CSR_AGENT_MASK) {
- tbl->chip_ops->dump_error_regs(tbl);
-
- /* reset error */
- writel(0, target);
-
- /* Disable bus that caused the error */
- target = calgary_reg(bbar, phb_offset(tbl->it_busno) |
- PHB_CONFIG_RW_OFFSET);
- val32 = be32_to_cpu(readl(target));
- val32 |= PHB_SLOT_DISABLE;
- writel(cpu_to_be32(val32), target);
- readl(target); /* flush */
- } else {
- /* Reset the timer */
- mod_timer(&tbl->watchdog_timer, jiffies + 2 * HZ);
- }
-}
-
-static void __init calgary_set_split_completion_timeout(void __iomem *bbar,
- unsigned char busnum, unsigned long timeout)
-{
- u64 val64;
- void __iomem *target;
- unsigned int phb_shift = ~0; /* silence gcc */
- u64 mask;
-
- switch (busno_to_phbid(busnum)) {
- case 0: phb_shift = (63 - 19);
- break;
- case 1: phb_shift = (63 - 23);
- break;
- case 2: phb_shift = (63 - 27);
- break;
- case 3: phb_shift = (63 - 35);
- break;
- default:
- BUG_ON(busno_to_phbid(busnum));
- }
-
- target = calgary_reg(bbar, CALGARY_CONFIG_REG);
- val64 = be64_to_cpu(readq(target));
-
- /* zero out this PHB's timer bits */
- mask = ~(0xFUL << phb_shift);
- val64 &= mask;
- val64 |= (timeout << phb_shift);
- writeq(cpu_to_be64(val64), target);
- readq(target); /* flush */
-}
-
-static void __init calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev)
-{
- unsigned char busnum = dev->bus->number;
- void __iomem *bbar = tbl->bbar;
- void __iomem *target;
- u32 val;
-
- /*
- * CalIOC2 designers recommend setting bit 8 in 0xnDB0 to 1
- */
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_SAVIOR_L2);
- val = cpu_to_be32(readl(target));
- val |= 0x00800000;
- writel(cpu_to_be32(val), target);
-}
-
-static void __init calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev)
-{
- unsigned char busnum = dev->bus->number;
-
- /*
- * Give split completion a longer timeout on bus 1 for aic94xx
- * http://bugzilla.kernel.org/show_bug.cgi?id=7180
- */
- if (is_calgary(dev->device) && (busnum == 1))
- calgary_set_split_completion_timeout(tbl->bbar, busnum,
- CCR_2SEC_TIMEOUT);
-}
-
-static void __init calgary_enable_translation(struct pci_dev *dev)
-{
- u32 val32;
- unsigned char busnum;
- void __iomem *target;
- void __iomem *bbar;
- struct iommu_table *tbl;
-
- busnum = dev->bus->number;
- tbl = pci_iommu(dev->bus);
- bbar = tbl->bbar;
-
- /* enable TCE in PHB Config Register */
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET);
- val32 = be32_to_cpu(readl(target));
- val32 |= PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE;
-
- printk(KERN_INFO "Calgary: enabling translation on %s PHB %#x\n",
- (dev->device == PCI_DEVICE_ID_IBM_CALGARY) ?
- "Calgary" : "CalIOC2", busnum);
- printk(KERN_INFO "Calgary: errant DMAs will now be prevented on this "
- "bus.\n");
-
- writel(cpu_to_be32(val32), target);
- readl(target); /* flush */
-
- init_timer(&tbl->watchdog_timer);
- tbl->watchdog_timer.function = &calgary_watchdog;
- tbl->watchdog_timer.data = (unsigned long)dev;
- mod_timer(&tbl->watchdog_timer, jiffies);
-}
-
-static void __init calgary_disable_translation(struct pci_dev *dev)
-{
- u32 val32;
- unsigned char busnum;
- void __iomem *target;
- void __iomem *bbar;
- struct iommu_table *tbl;
-
- busnum = dev->bus->number;
- tbl = pci_iommu(dev->bus);
- bbar = tbl->bbar;
-
- /* disable TCE in PHB Config Register */
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET);
- val32 = be32_to_cpu(readl(target));
- val32 &= ~(PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE);
-
- printk(KERN_INFO "Calgary: disabling translation on PHB %#x!\n", busnum);
- writel(cpu_to_be32(val32), target);
- readl(target); /* flush */
-
- del_timer_sync(&tbl->watchdog_timer);
-}
-
-static void __init calgary_init_one_nontraslated(struct pci_dev *dev)
-{
- pci_dev_get(dev);
- set_pci_iommu(dev->bus, NULL);
-
- /* is the device behind a bridge? */
- if (dev->bus->parent)
- dev->bus->parent->self = dev;
- else
- dev->bus->self = dev;
-}
-
-static int __init calgary_init_one(struct pci_dev *dev)
-{
- void __iomem *bbar;
- struct iommu_table *tbl;
- int ret;
-
- BUG_ON(dev->bus->number >= MAX_PHB_BUS_NUM);
-
- bbar = busno_to_bbar(dev->bus->number);
- ret = calgary_setup_tar(dev, bbar);
- if (ret)
- goto done;
-
- pci_dev_get(dev);
-
- if (dev->bus->parent) {
- if (dev->bus->parent->self)
- printk(KERN_WARNING "Calgary: IEEEE, dev %p has "
- "bus->parent->self!\n", dev);
- dev->bus->parent->self = dev;
- } else
- dev->bus->self = dev;
-
- tbl = pci_iommu(dev->bus);
- tbl->chip_ops->handle_quirks(tbl, dev);
-
- calgary_enable_translation(dev);
-
- return 0;
-
-done:
- return ret;
-}
-
-static int __init calgary_locate_bbars(void)
-{
- int ret;
- int rioidx, phb, bus;
- void __iomem *bbar;
- void __iomem *target;
- unsigned long offset;
- u8 start_bus, end_bus;
- u32 val;
-
- ret = -ENODATA;
- for (rioidx = 0; rioidx < rio_table_hdr->num_rio_dev; rioidx++) {
- struct rio_detail *rio = rio_devs[rioidx];
-
- if ((rio->type != COMPAT_CALGARY) && (rio->type != ALT_CALGARY))
- continue;
-
- /* map entire 1MB of Calgary config space */
- bbar = ioremap_nocache(rio->BBAR, 1024 * 1024);
- if (!bbar)
- goto error;
-
- for (phb = 0; phb < PHBS_PER_CALGARY; phb++) {
- offset = phb_debug_offsets[phb] | PHB_DEBUG_STUFF_OFFSET;
- target = calgary_reg(bbar, offset);
-
- val = be32_to_cpu(readl(target));
-
- start_bus = (u8)((val & 0x00FF0000) >> 16);
- end_bus = (u8)((val & 0x0000FF00) >> 8);
-
- if (end_bus) {
- for (bus = start_bus; bus <= end_bus; bus++) {
- bus_info[bus].bbar = bbar;
- bus_info[bus].phbid = phb;
- }
- } else {
- bus_info[start_bus].bbar = bbar;
- bus_info[start_bus].phbid = phb;
- }
- }
- }
-
- return 0;
-
-error:
- /* scan bus_info and iounmap any bbars we previously ioremap'd */
- for (bus = 0; bus < ARRAY_SIZE(bus_info); bus++)
- if (bus_info[bus].bbar)
- iounmap(bus_info[bus].bbar);
-
- return ret;
-}
-
-static int __init calgary_init(void)
-{
- int ret;
- struct pci_dev *dev = NULL;
- struct calgary_bus_info *info;
-
- ret = calgary_locate_bbars();
- if (ret)
- return ret;
-
- /* Purely for kdump kernel case */
- if (is_kdump_kernel())
- get_tce_space_from_tar();
-
- do {
- dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev);
- if (!dev)
- break;
- if (!is_cal_pci_dev(dev->device))
- continue;
-
- info = &bus_info[dev->bus->number];
- if (info->translation_disabled) {
- calgary_init_one_nontraslated(dev);
- continue;
- }
-
- if (!info->tce_space && !translate_empty_slots)
- continue;
-
- ret = calgary_init_one(dev);
- if (ret)
- goto error;
- } while (1);
-
- dev = NULL;
- for_each_pci_dev(dev) {
- struct iommu_table *tbl;
-
- tbl = find_iommu_table(&dev->dev);
-
- if (translation_enabled(tbl))
- dev->dev.archdata.dma_ops = &calgary_dma_ops;
- }
-
- return ret;
-
-error:
- do {
- dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev);
- if (!dev)
- break;
- if (!is_cal_pci_dev(dev->device))
- continue;
-
- info = &bus_info[dev->bus->number];
- if (info->translation_disabled) {
- pci_dev_put(dev);
- continue;
- }
- if (!info->tce_space && !translate_empty_slots)
- continue;
-
- calgary_disable_translation(dev);
- calgary_free_bus(dev);
- pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */
- dev->dev.archdata.dma_ops = NULL;
- } while (1);
-
- return ret;
-}
-
-static inline int __init determine_tce_table_size(u64 ram)
-{
- int ret;
-
- if (specified_table_size != TCE_TABLE_SIZE_UNSPECIFIED)
- return specified_table_size;
-
- /*
- * Table sizes are from 0 to 7 (TCE_TABLE_SIZE_64K to
- * TCE_TABLE_SIZE_8M). Table size 0 has 8K entries and each
- * larger table size has twice as many entries, so shift the
- * max ram address by 13 to divide by 8K and then look at the
- * order of the result to choose between 0-7.
- */
- ret = get_order(ram >> 13);
- if (ret > TCE_TABLE_SIZE_8M)
- ret = TCE_TABLE_SIZE_8M;
-
- return ret;
-}
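The comment above compresses the sizing rule into a single get_order() call; as a standalone illustration of the same arithmetic (plain user-space C with made-up helper names, not kernel code), the mapping from installed RAM to the 0..7 table-size index looks roughly like this, assuming 4 KiB pages and 8K entries in the smallest (64 KiB) table:

#include <stdio.h>

/* Sketch only: index 0 covers 32 MiB (8K entries * 4 KiB); each step doubles it. */
static int tce_table_size_for(unsigned long long ram_bytes)
{
	int idx = 0;

	while (idx < 7 && (32ULL << 20 << idx) < ram_bytes)
		idx++;
	return idx;	/* 0 == 64K table ... 7 == 8M table */
}

int main(void)
{
	/* e.g. 4 GiB of RAM maps to index 7, i.e. the 8M table */
	printf("%d\n", tce_table_size_for(4ULL << 30));
	return 0;
}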
-
-static int __init build_detail_arrays(void)
-{
- unsigned long ptr;
- unsigned numnodes, i;
- int scal_detail_size, rio_detail_size;
-
- numnodes = rio_table_hdr->num_scal_dev;
- if (numnodes > MAX_NUMNODES){
- printk(KERN_WARNING
- "Calgary: MAX_NUMNODES too low! Defined as %d, "
- "but system has %d nodes.\n",
- MAX_NUMNODES, numnodes);
- return -ENODEV;
- }
-
- switch (rio_table_hdr->version){
- case 2:
- scal_detail_size = 11;
- rio_detail_size = 13;
- break;
- case 3:
- scal_detail_size = 12;
- rio_detail_size = 15;
- break;
- default:
- printk(KERN_WARNING
- "Calgary: Invalid Rio Grande Table Version: %d\n",
- rio_table_hdr->version);
- return -EPROTO;
- }
-
- ptr = ((unsigned long)rio_table_hdr) + 3;
- for (i = 0; i < numnodes; i++, ptr += scal_detail_size)
- scal_devs[i] = (struct scal_detail *)ptr;
-
- for (i = 0; i < rio_table_hdr->num_rio_dev;
- i++, ptr += rio_detail_size)
- rio_devs[i] = (struct rio_detail *)ptr;
-
- return 0;
-}
-
-static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev)
-{
- int dev;
- u32 val;
-
- if (pci_dev == PCI_DEVICE_ID_IBM_CALIOC2) {
- /*
-		 * FIXME: properly scan for devices across the
- * PCI-to-PCI bridge on every CalIOC2 port.
- */
- return 1;
- }
-
- for (dev = 1; dev < 8; dev++) {
- val = read_pci_config(bus, dev, 0, 0);
- if (val != 0xffffffff)
- break;
- }
- return (val != 0xffffffff);
-}
-
-/*
- * calgary_init_bitmap_from_tce_table():
- * Function for the kdump case. In the second (kdump) kernel, initialize
- * the bitmap based on the TCE table entries obtained from the first kernel.
- */
-static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl)
-{
- u64 *tp;
- unsigned int index;
- tp = ((u64 *)tbl->it_base);
- for (index = 0 ; index < tbl->it_size; index++) {
- if (*tp != 0x0)
- set_bit(index, tbl->it_map);
- tp++;
- }
-}
-
-/*
- * get_tce_space_from_tar():
- * Function for the kdump case. Get the TCE tables from the first kernel
- * by reading the contents of the Calgary IOMMU's table address register (TAR).
- */
-static void __init get_tce_space_from_tar(void)
-{
- int bus;
- void __iomem *target;
- unsigned long tce_space;
-
- for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
- struct calgary_bus_info *info = &bus_info[bus];
- unsigned short pci_device;
- u32 val;
-
- val = read_pci_config(bus, 0, 0, 0);
- pci_device = (val & 0xFFFF0000) >> 16;
-
- if (!is_cal_pci_dev(pci_device))
- continue;
- if (info->translation_disabled)
- continue;
-
- if (calgary_bus_has_devices(bus, pci_device) ||
- translate_empty_slots) {
- target = calgary_reg(bus_info[bus].bbar,
- tar_offset(bus));
- tce_space = be64_to_cpu(readq(target));
- tce_space = tce_space & TAR_SW_BITS;
-
- tce_space = tce_space & (~specified_table_size);
- info->tce_space = (u64 *)__va(tce_space);
- }
- }
- return;
-}
-
-void __init detect_calgary(void)
-{
- int bus;
- void *tbl;
- int calgary_found = 0;
- unsigned long ptr;
- unsigned int offset, prev_offset;
- int ret;
-
- /*
- * if the user specified iommu=off or iommu=soft or we found
- * another HW IOMMU already, bail out.
- */
- if (swiotlb || no_iommu || iommu_detected)
- return;
-
- if (!use_calgary)
- return;
-
- if (!early_pci_allowed())
- return;
-
- printk(KERN_DEBUG "Calgary: detecting Calgary via BIOS EBDA area\n");
-
- ptr = (unsigned long)phys_to_virt(get_bios_ebda());
-
- rio_table_hdr = NULL;
- prev_offset = 0;
- offset = 0x180;
- /*
- * The next offset is stored in the 1st word.
-	 * Keep parsing only while the offset keeps increasing:
- */
- while (offset > prev_offset) {
- /* The block id is stored in the 2nd word */
- if (*((unsigned short *)(ptr + offset + 2)) == 0x4752){
- /* set the pointer past the offset & block id */
- rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4);
- break;
- }
- prev_offset = offset;
- offset = *((unsigned short *)(ptr + offset));
- }
- if (!rio_table_hdr) {
- printk(KERN_DEBUG "Calgary: Unable to locate Rio Grande table "
- "in EBDA - bailing!\n");
- return;
- }
-
- ret = build_detail_arrays();
- if (ret) {
- printk(KERN_DEBUG "Calgary: build_detail_arrays ret %d\n", ret);
- return;
- }
-
- specified_table_size = determine_tce_table_size((is_kdump_kernel() ?
- saved_max_pfn : max_pfn) * PAGE_SIZE);
-
- for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
- struct calgary_bus_info *info = &bus_info[bus];
- unsigned short pci_device;
- u32 val;
-
- val = read_pci_config(bus, 0, 0, 0);
- pci_device = (val & 0xFFFF0000) >> 16;
-
- if (!is_cal_pci_dev(pci_device))
- continue;
-
- if (info->translation_disabled)
- continue;
-
- if (calgary_bus_has_devices(bus, pci_device) ||
- translate_empty_slots) {
- /*
-			 * If this is a kdump kernel, find and reuse the TCE tables
-			 * from the first kernel; otherwise allocate them here.
- */
- if (!is_kdump_kernel()) {
- tbl = alloc_tce_table();
- if (!tbl)
- goto cleanup;
- info->tce_space = tbl;
- }
- calgary_found = 1;
- }
- }
-
- printk(KERN_DEBUG "Calgary: finished detection, Calgary %s\n",
- calgary_found ? "found" : "not found");
-
- if (calgary_found) {
- iommu_detected = 1;
- calgary_detected = 1;
- printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n");
- printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n",
- specified_table_size);
-
- /* swiotlb for devices that aren't behind the Calgary. */
- if (max_pfn > MAX_DMA32_PFN)
- swiotlb = 1;
- }
- return;
-
-cleanup:
- for (--bus; bus >= 0; --bus) {
- struct calgary_bus_info *info = &bus_info[bus];
-
- if (info->tce_space)
- free_tce_table(info->tce_space);
- }
-}
-
-int __init calgary_iommu_init(void)
-{
- int ret;
-
- if (no_iommu || (swiotlb && !calgary_detected))
- return -ENODEV;
-
- if (!calgary_detected)
- return -ENODEV;
-
- /* ok, we're trying to use Calgary - let's roll */
- printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");
-
- ret = calgary_init();
- if (ret) {
- printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
- "falling back to no_iommu\n", ret);
- return ret;
- }
-
- force_iommu = 1;
- bad_dma_address = 0x0;
- /* dma_ops is set to swiotlb or nommu */
- if (!dma_ops)
- dma_ops = &nommu_dma_ops;
-
- return 0;
-}
-
-static int __init calgary_parse_options(char *p)
-{
- unsigned int bridge;
- size_t len;
- char* endp;
-
- while (*p) {
- if (!strncmp(p, "64k", 3))
- specified_table_size = TCE_TABLE_SIZE_64K;
- else if (!strncmp(p, "128k", 4))
- specified_table_size = TCE_TABLE_SIZE_128K;
- else if (!strncmp(p, "256k", 4))
- specified_table_size = TCE_TABLE_SIZE_256K;
- else if (!strncmp(p, "512k", 4))
- specified_table_size = TCE_TABLE_SIZE_512K;
- else if (!strncmp(p, "1M", 2))
- specified_table_size = TCE_TABLE_SIZE_1M;
- else if (!strncmp(p, "2M", 2))
- specified_table_size = TCE_TABLE_SIZE_2M;
- else if (!strncmp(p, "4M", 2))
- specified_table_size = TCE_TABLE_SIZE_4M;
- else if (!strncmp(p, "8M", 2))
- specified_table_size = TCE_TABLE_SIZE_8M;
-
- len = strlen("translate_empty_slots");
- if (!strncmp(p, "translate_empty_slots", len))
- translate_empty_slots = 1;
-
- len = strlen("disable");
- if (!strncmp(p, "disable", len)) {
- p += len;
- if (*p == '=')
- ++p;
- if (*p == '\0')
- break;
- bridge = simple_strtoul(p, &endp, 0);
- if (p == endp)
- break;
-
- if (bridge < MAX_PHB_BUS_NUM) {
- printk(KERN_INFO "Calgary: disabling "
- "translation for PHB %#x\n", bridge);
- bus_info[bridge].translation_disabled = 1;
- }
- }
-
- p = strpbrk(p, ",");
- if (!p)
- break;
-
- p++; /* skip ',' */
- }
- return 1;
-}
-__setup("calgary=", calgary_parse_options);
-
-static void __init calgary_fixup_one_tce_space(struct pci_dev *dev)
-{
- struct iommu_table *tbl;
- unsigned int npages;
- int i;
-
- tbl = pci_iommu(dev->bus);
-
- for (i = 0; i < 4; i++) {
- struct resource *r = &dev->resource[PCI_BRIDGE_RESOURCES + i];
-
- /* Don't give out TCEs that map MEM resources */
- if (!(r->flags & IORESOURCE_MEM))
- continue;
-
- /* 0-based? we reserve the whole 1st MB anyway */
- if (!r->start)
- continue;
-
- /* cover the whole region */
- npages = (r->end - r->start) >> PAGE_SHIFT;
- npages++;
-
- iommu_range_reserve(tbl, r->start, npages);
- }
-}
-
-static int __init calgary_fixup_tce_spaces(void)
-{
- struct pci_dev *dev = NULL;
- struct calgary_bus_info *info;
-
- if (no_iommu || swiotlb || !calgary_detected)
- return -ENODEV;
-
- printk(KERN_DEBUG "Calgary: fixing up tce spaces\n");
-
- do {
- dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev);
- if (!dev)
- break;
- if (!is_cal_pci_dev(dev->device))
- continue;
-
- info = &bus_info[dev->bus->number];
- if (info->translation_disabled)
- continue;
-
- if (!info->tce_space)
- continue;
-
- calgary_fixup_one_tce_space(dev);
-
- } while (1);
-
- return 0;
-}
-
-/*
- * We need to be called after pcibios_assign_resources (fs_initcall level)
- * and before device_initcall.
- */
-rootfs_initcall(calgary_fixup_tce_spaces);
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 1a041bcf506b..6267363e0189 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -1,22 +1,27 @@
-#include <linux/dma-mapping.h>
-#include <linux/dma-debug.h>
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/dma-map-ops.h>
+#include <linux/dma-direct.h>
+#include <linux/iommu.h>
#include <linux/dmar.h>
-#include <linux/bootmem.h>
+#include <linux/export.h>
+#include <linux/memblock.h>
+#include <linux/gfp.h>
#include <linux/pci.h>
+#include <linux/amd-iommu.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/iommu.h>
#include <asm/gart.h>
-#include <asm/calgary.h>
-#include <asm/amd_iommu.h>
+#include <asm/x86_init.h>
-static int forbid_dac __read_mostly;
+#include <xen/xen.h>
+#include <xen/swiotlb-xen.h>
-struct dma_map_ops *dma_ops;
-EXPORT_SYMBOL(dma_ops);
+static bool disable_dac_quirk __read_mostly;
-static int iommu_sac_force __read_mostly;
+const struct dma_map_ops *dma_ops;
+EXPORT_SYMBOL(dma_ops);
#ifdef CONFIG_IOMMU_DEBUG
int panic_on_overflow __read_mostly = 1;
@@ -32,140 +37,77 @@ int no_iommu __read_mostly;
/* Set this to 1 if there is a HW IOMMU in the system */
int iommu_detected __read_mostly = 0;
-int iommu_pass_through;
-
-dma_addr_t bad_dma_address __read_mostly = 0;
-EXPORT_SYMBOL(bad_dma_address);
-
-/* Dummy device used for NULL arguments (normally ISA). Better would
- be probably a smaller DMA mask, but this is bug-to-bug compatible
- to older i386. */
-struct device x86_dma_fallback_dev = {
- .init_name = "fallback device",
- .coherent_dma_mask = DMA_BIT_MASK(32),
- .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask,
-};
-EXPORT_SYMBOL(x86_dma_fallback_dev);
-
-/* Number of entries preallocated for DMA-API debugging */
-#define PREALLOC_DMA_DEBUG_ENTRIES 32768
+#ifdef CONFIG_SWIOTLB
+bool x86_swiotlb_enable;
+static unsigned int x86_swiotlb_flags;
-int dma_set_mask(struct device *dev, u64 mask)
+static void __init pci_swiotlb_detect(void)
{
- if (!dev->dma_mask || !dma_supported(dev, mask))
- return -EIO;
+ /* don't initialize swiotlb if iommu=off (no_iommu=1) */
+ if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN)
+ x86_swiotlb_enable = true;
- *dev->dma_mask = mask;
+ /*
+ * Set swiotlb to 1 so that bounce buffers are allocated and used for
+ * devices that can't support DMA to encrypted memory.
+ */
+ if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
+ x86_swiotlb_enable = true;
- return 0;
+ /*
+ * Guest with guest memory encryption currently perform all DMA through
+ * bounce buffers as the hypervisor can't access arbitrary VM memory
+ * that is not explicitly shared with it.
+ */
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
+ x86_swiotlb_enable = true;
+ x86_swiotlb_flags |= SWIOTLB_FORCE;
+ }
}
-EXPORT_SYMBOL(dma_set_mask);
-
-#ifdef CONFIG_X86_64
-static __initdata void *dma32_bootmem_ptr;
-static unsigned long dma32_bootmem_size __initdata = (128ULL<<20);
+#else
+static inline void __init pci_swiotlb_detect(void)
+{
+}
+#define x86_swiotlb_flags 0
+#endif /* CONFIG_SWIOTLB */
-static int __init parse_dma32_size_opt(char *p)
+#ifdef CONFIG_SWIOTLB_XEN
+static bool xen_swiotlb_enabled(void)
{
- if (!p)
- return -EINVAL;
- dma32_bootmem_size = memparse(p, &p);
- return 0;
+ return xen_initial_domain() || x86_swiotlb_enable ||
+ (IS_ENABLED(CONFIG_XEN_PCIDEV_FRONTEND) && xen_pv_pci_possible);
}
-early_param("dma32_size", parse_dma32_size_opt);
-void __init dma32_reserve_bootmem(void)
+static void __init pci_xen_swiotlb_init(void)
{
- unsigned long size, align;
- if (max_pfn <= MAX_DMA32_PFN)
+ if (!xen_swiotlb_enabled())
return;
-
- /*
- * check aperture_64.c allocate_aperture() for reason about
- * using 512M as goal
- */
- align = 64ULL<<20;
- size = roundup(dma32_bootmem_size, align);
- dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
- 512ULL<<20);
- if (dma32_bootmem_ptr)
- dma32_bootmem_size = size;
- else
- dma32_bootmem_size = 0;
+ x86_swiotlb_enable = true;
+ x86_swiotlb_flags |= SWIOTLB_ANY;
+ swiotlb_init_remap(true, x86_swiotlb_flags, xen_swiotlb_fixup);
+ dma_ops = &xen_swiotlb_dma_ops;
+ if (IS_ENABLED(CONFIG_PCI))
+ pci_request_acs();
}
-static void __init dma32_free_bootmem(void)
+#else
+static inline void __init pci_xen_swiotlb_init(void)
{
-
- if (max_pfn <= MAX_DMA32_PFN)
- return;
-
- if (!dma32_bootmem_ptr)
- return;
-
- free_bootmem(__pa(dma32_bootmem_ptr), dma32_bootmem_size);
-
- dma32_bootmem_ptr = NULL;
- dma32_bootmem_size = 0;
}
-#endif
+#endif /* CONFIG_SWIOTLB_XEN */
void __init pci_iommu_alloc(void)
{
-#ifdef CONFIG_X86_64
- /* free the range so iommu could get some range less than 4G */
- dma32_free_bootmem();
-#endif
-
- /*
- * The order of these functions is important for
- * fall-back/fail-over reasons
- */
+ if (xen_pv_domain()) {
+ pci_xen_swiotlb_init();
+ return;
+ }
+ pci_swiotlb_detect();
gart_iommu_hole_init();
-
- detect_calgary();
-
- detect_intel_iommu();
-
amd_iommu_detect();
-
- pci_swiotlb_init();
-}
-
-void *dma_generic_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t *dma_addr, gfp_t flag)
-{
- unsigned long dma_mask;
- struct page *page;
- dma_addr_t addr;
-
- dma_mask = dma_alloc_coherent_mask(dev, flag);
-
- flag |= __GFP_ZERO;
-again:
- page = alloc_pages_node(dev_to_node(dev), flag, get_order(size));
- if (!page)
- return NULL;
-
- addr = page_to_phys(page);
- if (!is_buffer_dma_capable(dma_mask, addr, size)) {
- __free_pages(page, get_order(size));
-
- if (dma_mask < DMA_BIT_MASK(32) && !(flag & GFP_DMA)) {
- flag = (flag & ~GFP_DMA32) | GFP_DMA;
- goto again;
- }
-
- return NULL;
- }
-
- *dma_addr = addr;
- return page_address(page);
+ detect_intel_iommu();
+ swiotlb_init(x86_swiotlb_enable, x86_swiotlb_flags);
}
-/*
- * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
- * documentation.
- */
static __init int iommu_setup(char *p)
{
iommu_merge = 1;
@@ -199,31 +141,26 @@ static __init int iommu_setup(char *p)
if (!strncmp(p, "nomerge", 7))
iommu_merge = 0;
if (!strncmp(p, "forcesac", 8))
- iommu_sac_force = 1;
+ pr_warn("forcesac option ignored.\n");
if (!strncmp(p, "allowdac", 8))
- forbid_dac = 0;
+ pr_warn("allowdac option ignored.\n");
if (!strncmp(p, "nodac", 5))
- forbid_dac = -1;
+ pr_warn("nodac option ignored.\n");
if (!strncmp(p, "usedac", 6)) {
- forbid_dac = -1;
+ disable_dac_quirk = true;
return 1;
}
#ifdef CONFIG_SWIOTLB
if (!strncmp(p, "soft", 4))
- swiotlb = 1;
+ x86_swiotlb_enable = true;
#endif
- if (!strncmp(p, "pt", 2)) {
- iommu_pass_through = 1;
- return 1;
- }
+ if (!strncmp(p, "pt", 2))
+ iommu_set_default_passthrough(true);
+ if (!strncmp(p, "nopt", 4))
+ iommu_set_default_translated(true);
gart_parse_options(p);
-#ifdef CONFIG_CALGARY_IOMMU
- if (!strncmp(p, "calgary", 7))
- use_calgary = 1;
-#endif /* CONFIG_CALGARY_IOMMU */
-
p += strcspn(p, ",");
if (*p == ',')
++p;
@@ -232,85 +169,41 @@ static __init int iommu_setup(char *p)
}
early_param("iommu", iommu_setup);
-int dma_supported(struct device *dev, u64 mask)
-{
- struct dma_map_ops *ops = get_dma_ops(dev);
-
-#ifdef CONFIG_PCI
- if (mask > 0xffffffff && forbid_dac > 0) {
- dev_info(dev, "PCI: Disallowing DAC for device\n");
- return 0;
- }
-#endif
-
- if (ops->dma_supported)
- return ops->dma_supported(dev, mask);
-
- /* Copied from i386. Doesn't make much sense, because it will
- only work for pci_alloc_coherent.
- The caller just has to use GFP_DMA in this case. */
- if (mask < DMA_BIT_MASK(24))
- return 0;
-
- /* Tell the device to use SAC when IOMMU force is on. This
- allows the driver to use cheaper accesses in some cases.
-
- Problem with this is that if we overflow the IOMMU area and
- return DAC as fallback address the device may not handle it
- correctly.
-
- As a special case some controllers have a 39bit address
- mode that is as efficient as 32bit (aic79xx). Don't force
- SAC for these. Assume all masks <= 40 bits are of this
- type. Normally this doesn't make any difference, but gives
- more gentle handling of IOMMU overflow. */
- if (iommu_sac_force && (mask >= DMA_BIT_MASK(40))) {
- dev_info(dev, "Force SAC with mask %Lx\n", mask);
- return 0;
- }
-
- return 1;
-}
-EXPORT_SYMBOL(dma_supported);
-
static int __init pci_iommu_init(void)
{
- dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
+ x86_init.iommu.iommu_init();
-#ifdef CONFIG_PCI
- dma_debug_add_bus(&pci_bus_type);
+#ifdef CONFIG_SWIOTLB
+ /* An IOMMU turned us off. */
+ if (x86_swiotlb_enable) {
+ pr_info("PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
+ swiotlb_print_info();
+ } else {
+ swiotlb_exit();
+ }
#endif
- calgary_iommu_init();
-
- intel_iommu_init();
-
- amd_iommu_init();
-
- gart_iommu_init();
-
- no_iommu_init();
return 0;
}
-
-void pci_iommu_shutdown(void)
-{
- gart_iommu_shutdown();
-
- amd_iommu_shutdown();
-}
/* Must execute after PCI subsystem */
-fs_initcall(pci_iommu_init);
+rootfs_initcall(pci_iommu_init);
#ifdef CONFIG_PCI
/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
-static __devinit void via_no_dac(struct pci_dev *dev)
+static int via_no_dac_cb(struct pci_dev *pdev, void *data)
+{
+ pdev->dev.bus_dma_limit = DMA_BIT_MASK(32);
+ return 0;
+}
+
+static void via_no_dac(struct pci_dev *dev)
{
- if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
+ if (!disable_dac_quirk) {
dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n");
- forbid_dac = 1;
+ pci_walk_bus(dev->subordinate, via_no_dac_cb, NULL);
}
}
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID,
+ PCI_CLASS_BRIDGE_PCI, 8, via_no_dac);
#endif
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
deleted file mode 100644
index 71d412a09f30..000000000000
--- a/arch/x86/kernel/pci-nommu.c
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Fallback functions when the main IOMMU code is not compiled in. This
- code is roughly equivalent to i386. */
-#include <linux/dma-mapping.h>
-#include <linux/scatterlist.h>
-#include <linux/string.h>
-#include <linux/init.h>
-#include <linux/pci.h>
-#include <linux/mm.h>
-
-#include <asm/processor.h>
-#include <asm/iommu.h>
-#include <asm/dma.h>
-
-static int
-check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)
-{
- if (hwdev && !is_buffer_dma_capable(*hwdev->dma_mask, bus, size)) {
- if (*hwdev->dma_mask >= DMA_BIT_MASK(32))
- printk(KERN_ERR
- "nommu_%s: overflow %Lx+%zu of device mask %Lx\n",
- name, (long long)bus, size,
- (long long)*hwdev->dma_mask);
- return 0;
- }
- return 1;
-}
-
-static dma_addr_t nommu_map_page(struct device *dev, struct page *page,
- unsigned long offset, size_t size,
- enum dma_data_direction dir,
- struct dma_attrs *attrs)
-{
- dma_addr_t bus = page_to_phys(page) + offset;
- WARN_ON(size == 0);
- if (!check_addr("map_single", dev, bus, size))
- return bad_dma_address;
- flush_write_buffers();
- return bus;
-}
-
-/* Map a set of buffers described by scatterlist in streaming
- * mode for DMA. This is the scatter-gather version of the
- * above pci_map_single interface. Here the scatter gather list
- * elements are each tagged with the appropriate dma address
- * and length. They are obtained via sg_dma_{address,length}(SG).
- *
- * NOTE: An implementation may be able to use a smaller number of
- * DMA address/length pairs than there are SG table elements.
- * (for example via virtual mapping capabilities)
- * The routine returns the number of addr/length pairs actually
- * used, at most nents.
- *
- * Device ownership issues as mentioned above for pci_map_single are
- * the same here.
- */
-static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
- int nents, enum dma_data_direction dir,
- struct dma_attrs *attrs)
-{
- struct scatterlist *s;
- int i;
-
- WARN_ON(nents == 0 || sg[0].length == 0);
-
- for_each_sg(sg, s, nents, i) {
- BUG_ON(!sg_page(s));
- s->dma_address = sg_phys(s);
- if (!check_addr("map_sg", hwdev, s->dma_address, s->length))
- return 0;
- s->dma_length = s->length;
- }
- flush_write_buffers();
- return nents;
-}
-
-static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr,
- dma_addr_t dma_addr)
-{
- free_pages((unsigned long)vaddr, get_order(size));
-}
-
-struct dma_map_ops nommu_dma_ops = {
- .alloc_coherent = dma_generic_alloc_coherent,
- .free_coherent = nommu_free_coherent,
- .map_sg = nommu_map_sg,
- .map_page = nommu_map_page,
- .is_phys = 1,
-};
-
-void __init no_iommu_init(void)
-{
- if (dma_ops)
- return;
-
- force_iommu = 0; /* no HW IOMMU */
- dma_ops = &nommu_dma_ops;
-}
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
deleted file mode 100644
index 6af96ee44200..000000000000
--- a/arch/x86/kernel/pci-swiotlb.c
+++ /dev/null
@@ -1,85 +0,0 @@
-/* Glue code to lib/swiotlb.c */
-
-#include <linux/pci.h>
-#include <linux/cache.h>
-#include <linux/module.h>
-#include <linux/swiotlb.h>
-#include <linux/bootmem.h>
-#include <linux/dma-mapping.h>
-
-#include <asm/iommu.h>
-#include <asm/swiotlb.h>
-#include <asm/dma.h>
-
-int swiotlb __read_mostly;
-
-void * __init swiotlb_alloc_boot(size_t size, unsigned long nslabs)
-{
- return alloc_bootmem_low_pages(size);
-}
-
-void *swiotlb_alloc(unsigned order, unsigned long nslabs)
-{
- return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order);
-}
-
-dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr)
-{
- return paddr;
-}
-
-phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr)
-{
- return baddr;
-}
-
-int __weak swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size)
-{
- return 0;
-}
-
-static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
- dma_addr_t *dma_handle, gfp_t flags)
-{
- void *vaddr;
-
- vaddr = dma_generic_alloc_coherent(hwdev, size, dma_handle, flags);
- if (vaddr)
- return vaddr;
-
- return swiotlb_alloc_coherent(hwdev, size, dma_handle, flags);
-}
-
-static struct dma_map_ops swiotlb_dma_ops = {
- .mapping_error = swiotlb_dma_mapping_error,
- .alloc_coherent = x86_swiotlb_alloc_coherent,
- .free_coherent = swiotlb_free_coherent,
- .sync_single_for_cpu = swiotlb_sync_single_for_cpu,
- .sync_single_for_device = swiotlb_sync_single_for_device,
- .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu,
- .sync_single_range_for_device = swiotlb_sync_single_range_for_device,
- .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
- .sync_sg_for_device = swiotlb_sync_sg_for_device,
- .map_sg = swiotlb_map_sg_attrs,
- .unmap_sg = swiotlb_unmap_sg_attrs,
- .map_page = swiotlb_map_page,
- .unmap_page = swiotlb_unmap_page,
- .dma_supported = NULL,
-};
-
-void __init pci_swiotlb_init(void)
-{
- /* don't initialize swiotlb if iommu=off (no_iommu=1) */
-#ifdef CONFIG_X86_64
- if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) ||
- iommu_pass_through)
- swiotlb = 1;
-#endif
- if (swiotlb_force)
- swiotlb = 1;
- if (swiotlb) {
- printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
- swiotlb_init();
- dma_ops = &swiotlb_dma_ops;
- }
-}
diff --git a/arch/x86/kernel/pcspeaker.c b/arch/x86/kernel/pcspeaker.c
index a311ffcaad16..4a710ffffd9a 100644
--- a/arch/x86/kernel/pcspeaker.c
+++ b/arch/x86/kernel/pcspeaker.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/platform_device.h>
#include <linux/err.h>
#include <linux/init.h>
@@ -8,6 +9,6 @@ static __init int add_pcspkr(void)
pd = platform_device_register_simple("pcspkr", -1, NULL, 0);
- return IS_ERR(pd) ? PTR_ERR(pd) : 0;
+ return PTR_ERR_OR_ZERO(pd);
}
device_initcall(add_pcspkr);
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
new file mode 100644
index 000000000000..624703af80a1
--- /dev/null
+++ b/arch/x86/kernel/perf_regs.c
@@ -0,0 +1,202 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/perf_event.h>
+#include <linux/bug.h>
+#include <linux/stddef.h>
+#include <asm/perf_regs.h>
+#include <asm/ptrace.h>
+
+#ifdef CONFIG_X86_32
+#define PERF_REG_X86_MAX PERF_REG_X86_32_MAX
+#else
+#define PERF_REG_X86_MAX PERF_REG_X86_64_MAX
+#endif
+
+#define PT_REGS_OFFSET(id, r) [id] = offsetof(struct pt_regs, r)
+
+static unsigned int pt_regs_offset[PERF_REG_X86_MAX] = {
+ PT_REGS_OFFSET(PERF_REG_X86_AX, ax),
+ PT_REGS_OFFSET(PERF_REG_X86_BX, bx),
+ PT_REGS_OFFSET(PERF_REG_X86_CX, cx),
+ PT_REGS_OFFSET(PERF_REG_X86_DX, dx),
+ PT_REGS_OFFSET(PERF_REG_X86_SI, si),
+ PT_REGS_OFFSET(PERF_REG_X86_DI, di),
+ PT_REGS_OFFSET(PERF_REG_X86_BP, bp),
+ PT_REGS_OFFSET(PERF_REG_X86_SP, sp),
+ PT_REGS_OFFSET(PERF_REG_X86_IP, ip),
+ PT_REGS_OFFSET(PERF_REG_X86_FLAGS, flags),
+ PT_REGS_OFFSET(PERF_REG_X86_CS, cs),
+ PT_REGS_OFFSET(PERF_REG_X86_SS, ss),
+#ifdef CONFIG_X86_32
+ PT_REGS_OFFSET(PERF_REG_X86_DS, ds),
+ PT_REGS_OFFSET(PERF_REG_X86_ES, es),
+ PT_REGS_OFFSET(PERF_REG_X86_FS, fs),
+ PT_REGS_OFFSET(PERF_REG_X86_GS, gs),
+#else
+ /*
+ * The pt_regs struct does not store
+ * ds, es, fs, gs in 64 bit mode.
+ */
+ (unsigned int) -1,
+ (unsigned int) -1,
+ (unsigned int) -1,
+ (unsigned int) -1,
+#endif
+#ifdef CONFIG_X86_64
+ PT_REGS_OFFSET(PERF_REG_X86_R8, r8),
+ PT_REGS_OFFSET(PERF_REG_X86_R9, r9),
+ PT_REGS_OFFSET(PERF_REG_X86_R10, r10),
+ PT_REGS_OFFSET(PERF_REG_X86_R11, r11),
+ PT_REGS_OFFSET(PERF_REG_X86_R12, r12),
+ PT_REGS_OFFSET(PERF_REG_X86_R13, r13),
+ PT_REGS_OFFSET(PERF_REG_X86_R14, r14),
+ PT_REGS_OFFSET(PERF_REG_X86_R15, r15),
+#endif
+};
+
+u64 perf_reg_value(struct pt_regs *regs, int idx)
+{
+ struct x86_perf_regs *perf_regs;
+
+ if (idx >= PERF_REG_X86_XMM0 && idx < PERF_REG_X86_XMM_MAX) {
+ perf_regs = container_of(regs, struct x86_perf_regs, regs);
+ if (!perf_regs->xmm_regs)
+ return 0;
+ return perf_regs->xmm_regs[idx - PERF_REG_X86_XMM0];
+ }
+
+ if (WARN_ON_ONCE(idx >= ARRAY_SIZE(pt_regs_offset)))
+ return 0;
+
+ return regs_get_register(regs, pt_regs_offset[idx]);
+}
+
+#define PERF_REG_X86_RESERVED (((1ULL << PERF_REG_X86_XMM0) - 1) & \
+ ~((1ULL << PERF_REG_X86_MAX) - 1))
+
+#ifdef CONFIG_X86_32
+#define REG_NOSUPPORT ((1ULL << PERF_REG_X86_R8) | \
+ (1ULL << PERF_REG_X86_R9) | \
+ (1ULL << PERF_REG_X86_R10) | \
+ (1ULL << PERF_REG_X86_R11) | \
+ (1ULL << PERF_REG_X86_R12) | \
+ (1ULL << PERF_REG_X86_R13) | \
+ (1ULL << PERF_REG_X86_R14) | \
+ (1ULL << PERF_REG_X86_R15))
+
+int perf_reg_validate(u64 mask)
+{
+ if (!mask || (mask & (REG_NOSUPPORT | PERF_REG_X86_RESERVED)))
+ return -EINVAL;
+
+ return 0;
+}
+
+u64 perf_reg_abi(struct task_struct *task)
+{
+ return PERF_SAMPLE_REGS_ABI_32;
+}
+
+void perf_get_regs_user(struct perf_regs *regs_user,
+ struct pt_regs *regs)
+{
+ regs_user->regs = task_pt_regs(current);
+ regs_user->abi = perf_reg_abi(current);
+}
+#else /* CONFIG_X86_64 */
+#define REG_NOSUPPORT ((1ULL << PERF_REG_X86_DS) | \
+ (1ULL << PERF_REG_X86_ES) | \
+ (1ULL << PERF_REG_X86_FS) | \
+ (1ULL << PERF_REG_X86_GS))
+
+int perf_reg_validate(u64 mask)
+{
+ if (!mask || (mask & (REG_NOSUPPORT | PERF_REG_X86_RESERVED)))
+ return -EINVAL;
+
+ return 0;
+}
+
+u64 perf_reg_abi(struct task_struct *task)
+{
+ if (!user_64bit_mode(task_pt_regs(task)))
+ return PERF_SAMPLE_REGS_ABI_32;
+ else
+ return PERF_SAMPLE_REGS_ABI_64;
+}
+
+static DEFINE_PER_CPU(struct pt_regs, nmi_user_regs);
+
+void perf_get_regs_user(struct perf_regs *regs_user,
+ struct pt_regs *regs)
+{
+ struct pt_regs *regs_user_copy = this_cpu_ptr(&nmi_user_regs);
+ struct pt_regs *user_regs = task_pt_regs(current);
+
+ if (!in_nmi()) {
+ regs_user->regs = user_regs;
+ regs_user->abi = perf_reg_abi(current);
+ return;
+ }
+
+ /*
+ * If we're in an NMI that interrupted task_pt_regs setup, then
+ * we can't sample user regs at all. This check isn't really
+ * sufficient, though, as we could be in an NMI inside an interrupt
+ * that happened during task_pt_regs setup.
+ */
+ if (regs->sp > (unsigned long)&user_regs->r11 &&
+ regs->sp <= (unsigned long)(user_regs + 1)) {
+ regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
+ regs_user->regs = NULL;
+ return;
+ }
+
+ /*
+ * These registers are always saved on 64-bit syscall entry.
+ * On 32-bit entry points, they are saved too except r8..r11.
+ */
+ regs_user_copy->ip = user_regs->ip;
+ regs_user_copy->ax = user_regs->ax;
+ regs_user_copy->cx = user_regs->cx;
+ regs_user_copy->dx = user_regs->dx;
+ regs_user_copy->si = user_regs->si;
+ regs_user_copy->di = user_regs->di;
+ regs_user_copy->r8 = user_regs->r8;
+ regs_user_copy->r9 = user_regs->r9;
+ regs_user_copy->r10 = user_regs->r10;
+ regs_user_copy->r11 = user_regs->r11;
+ regs_user_copy->orig_ax = user_regs->orig_ax;
+ regs_user_copy->flags = user_regs->flags;
+ regs_user_copy->sp = user_regs->sp;
+ regs_user_copy->cs = user_regs->cs;
+ regs_user_copy->ss = user_regs->ss;
+ /*
+ * Store user space frame-pointer value on sample
+ * to facilitate stack unwinding for cases when
+ * user space executable code has such support
+ * enabled at compile time:
+ */
+ regs_user_copy->bp = user_regs->bp;
+
+ regs_user_copy->bx = -1;
+ regs_user_copy->r12 = -1;
+ regs_user_copy->r13 = -1;
+ regs_user_copy->r14 = -1;
+ regs_user_copy->r15 = -1;
+ /*
+ * For this to be at all useful, we need a reasonable guess for
+ * the ABI. Be careful: we're in NMI context, and we're
+ * considering current to be the current task, so we should
+ * be careful not to look at any other percpu variables that might
+ * change during context switches.
+ */
+ regs_user->abi = user_64bit_mode(user_regs) ?
+ PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32;
+
+ regs_user->regs = regs_user_copy;
+}
+#endif /* CONFIG_X86_32 */
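
The perf_regs.c code above is the kernel side of PERF_SAMPLE_REGS_USER: perf_reg_value() fetches the sampled registers and perf_reg_validate() rejects masks containing reserved or unsupported bits. A minimal user-space sketch of the consumer side follows; the helper name and the chosen register mask are illustrative, only the uapi symbols (perf_event_attr, PERF_SAMPLE_REGS_USER, PERF_REG_X86_*) are taken as given.

#include <linux/perf_event.h>
#include <asm/perf_regs.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Illustrative helper: sample user-space IP/SP/BP on every counter overflow. */
static int open_cycles_with_user_regs(void)
{
	struct perf_event_attr attr = {
		.type		= PERF_TYPE_HARDWARE,
		.config		= PERF_COUNT_HW_CPU_CYCLES,
		.size		= sizeof(attr),
		.sample_period	= 100000,
		.sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_REGS_USER,
		/* Mask bits use enum perf_event_x86_regs; checked by perf_reg_validate(). */
		.sample_regs_user = (1ULL << PERF_REG_X86_IP) |
				    (1ULL << PERF_REG_X86_SP) |
				    (1ULL << PERF_REG_X86_BP),
		.exclude_kernel	= 1,
	};

	/* pid 0 = this task, cpu -1 = any CPU, no group, no flags. */
	return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
}
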
diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c
new file mode 100644
index 000000000000..b525fe6d6657
--- /dev/null
+++ b/arch/x86/kernel/platform-quirks.c
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/pnp.h>
+
+#include <asm/setup.h>
+#include <asm/bios_ebda.h>
+
+void __init x86_early_init_platform_quirks(void)
+{
+ x86_platform.legacy.i8042 = X86_LEGACY_I8042_EXPECTED_PRESENT;
+ x86_platform.legacy.rtc = 1;
+ x86_platform.legacy.warm_reset = 1;
+ x86_platform.legacy.reserve_bios_regions = 0;
+ x86_platform.legacy.devices.pnpbios = 1;
+
+ switch (boot_params.hdr.hardware_subarch) {
+ case X86_SUBARCH_PC:
+ x86_platform.legacy.reserve_bios_regions = 1;
+ break;
+ case X86_SUBARCH_XEN:
+ x86_platform.legacy.devices.pnpbios = 0;
+ x86_platform.legacy.rtc = 0;
+ break;
+ case X86_SUBARCH_INTEL_MID:
+ case X86_SUBARCH_CE4100:
+ x86_platform.legacy.devices.pnpbios = 0;
+ x86_platform.legacy.rtc = 0;
+ x86_platform.legacy.i8042 = X86_LEGACY_I8042_PLATFORM_ABSENT;
+ break;
+ }
+
+ if (x86_platform.set_legacy_features)
+ x86_platform.set_legacy_features();
+}
+
+bool __init x86_pnpbios_disabled(void)
+{
+ return x86_platform.legacy.devices.pnpbios == 0;
+}
+
+#if defined(CONFIG_PNPBIOS)
+bool __init arch_pnpbios_disabled(void)
+{
+ return x86_pnpbios_disabled();
+}
+#endif
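
x86_early_init_platform_quirks() only records what the boot subarch promises about legacy hardware; drivers are expected to consult x86_platform.legacy before probing. A hedged sketch of such a consumer follows; the function name is hypothetical, while the flag check mirrors what the real i8042 probe does.

#include <linux/errno.h>
#include <linux/init.h>
#include <asm/x86_init.h>

static int __init example_serio_platform_init(void)
{
	/* The subarch declared there is no i8042 controller at all: don't probe it. */
	if (x86_platform.legacy.i8042 == X86_LEGACY_I8042_PLATFORM_ABSENT)
		return -ENODEV;

	return 0;
}
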
diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c
new file mode 100644
index 000000000000..23154d24b117
--- /dev/null
+++ b/arch/x86/kernel/pmem.c
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2015, Christoph Hellwig.
+ * Copyright (c) 2015, Intel Corporation.
+ */
+#include <linux/platform_device.h>
+#include <linux/init.h>
+#include <linux/ioport.h>
+
+static int found(struct resource *res, void *data)
+{
+ return 1;
+}
+
+static __init int register_e820_pmem(void)
+{
+ struct platform_device *pdev;
+ int rc;
+
+ rc = walk_iomem_res_desc(IORES_DESC_PERSISTENT_MEMORY_LEGACY,
+ IORESOURCE_MEM, 0, -1, NULL, found);
+ if (rc <= 0)
+ return 0;
+
+ /*
+ * See drivers/nvdimm/e820.c for the implementation; this is
+ * simply here to trigger the module to load on demand.
+ */
+ pdev = platform_device_alloc("e820_pmem", -1);
+
+ rc = platform_device_add(pdev);
+ if (rc)
+ platform_device_put(pdev);
+
+ return rc;
+}
+device_initcall(register_e820_pmem);
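
register_e820_pmem() relies on the walk_iomem_res_desc() callback contract: when the callback returns non-zero the walk stops and that value is propagated back (which is how `found` above turns "at least one legacy pmem range exists" into rc > 0), while returning zero keeps iterating. The sketch below, with a purely hypothetical printing callback, shows the other style that visits every matching range.

#include <linux/ioport.h>
#include <linux/printk.h>
#include <linux/init.h>

/* Return 0 to keep walking; a non-zero value would stop the walk. */
static int print_legacy_pmem(struct resource *res, void *data)
{
	pr_info("legacy pmem range: %pR\n", res);
	return 0;
}

static void __init example_dump_legacy_pmem(void)
{
	walk_iomem_res_desc(IORES_DESC_PERSISTENT_MEMORY_LEGACY,
			    IORESOURCE_MEM, 0, -1, NULL, print_legacy_pmem);
}
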
diff --git a/arch/x86/kernel/pmtimer_64.c b/arch/x86/kernel/pmtimer_64.c
deleted file mode 100644
index b112406f1996..000000000000
--- a/arch/x86/kernel/pmtimer_64.c
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Ported over from i386 by AK, original copyright was:
- *
- * (C) Dominik Brodowski <linux@brodo.de> 2003
- *
- * Driver to use the Power Management Timer (PMTMR) available in some
- * southbridges as primary timing source for the Linux kernel.
- *
- * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c,
- * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4.
- *
- * This file is licensed under the GPL v2.
- *
- * Dropped all the hardware bug workarounds for now. Hopefully they
- * are not needed on 64bit chipsets.
- */
-
-#include <linux/jiffies.h>
-#include <linux/kernel.h>
-#include <linux/time.h>
-#include <linux/init.h>
-#include <linux/cpumask.h>
-#include <linux/acpi_pmtmr.h>
-
-#include <asm/io.h>
-#include <asm/proto.h>
-#include <asm/msr.h>
-#include <asm/vsyscall.h>
-
-static inline u32 cyc2us(u32 cycles)
-{
- /* The Power Management Timer ticks at 3.579545 ticks per microsecond.
- * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%]
- *
- * Even with HZ = 100, delta is at maximum 35796 ticks, so it can
- * easily be multiplied with 286 (=0x11E) without having to fear
- * u32 overflows.
- */
- cycles *= 286;
- return (cycles >> 10);
-}
-
-static unsigned pmtimer_wait_tick(void)
-{
- u32 a, b;
- for (a = b = inl(pmtmr_ioport) & ACPI_PM_MASK;
- a == b;
- b = inl(pmtmr_ioport) & ACPI_PM_MASK)
- cpu_relax();
- return b;
-}
-
-/* note: wait time is rounded up to one tick */
-void pmtimer_wait(unsigned us)
-{
- u32 a, b;
- a = pmtimer_wait_tick();
- do {
- b = inl(pmtmr_ioport);
- cpu_relax();
- } while (cyc2us(b - a) < us);
-}
-
-static int __init nopmtimer_setup(char *s)
-{
- pmtmr_ioport = 0;
- return 1;
-}
-
-__setup("nopmtimer", nopmtimer_setup);
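
The removed cyc2us() relied on the approximation quoted in its comment: one ACPI PM timer tick is 1/3.579545 us, which it replaced by 286/1024 so the conversion becomes a multiply and a shift. A stand-alone user-space check of that arithmetic (not kernel code) is sketched below; it reproduces the ~0.024% error figure.

#include <stdio.h>

int main(void)
{
	double exact  = 1.0 / 3.579545;	/* microseconds per PM timer tick */
	double approx = 286.0 / 1024.0;	/* the multiply-and-shift used by cyc2us() */

	printf("exact=%.6f approx=%.6f error=%.3f%%\n",
	       exact, approx, 100.0 * (exact - approx) / exact);
	return 0;
}
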
diff --git a/arch/x86/kernel/probe_roms_32.c b/arch/x86/kernel/probe_roms.c
index 071e7fea42e5..cc2c34ba7228 100644
--- a/arch/x86/kernel/probe_roms_32.c
+++ b/arch/x86/kernel/probe_roms.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
@@ -10,15 +11,17 @@
#include <linux/dmi.h>
#include <linux/pfn.h>
#include <linux/pci.h>
-#include <asm/pci-direct.h>
-
+#include <linux/export.h>
-#include <asm/e820.h>
+#include <asm/probe_roms.h>
+#include <asm/pci-direct.h>
+#include <asm/e820/api.h>
#include <asm/mmzone.h>
#include <asm/setup.h>
#include <asm/sections.h>
#include <asm/io.h>
#include <asm/setup_arch.h>
+#include <asm/sev.h>
static struct resource system_rom_resource = {
.name = "System ROM",
@@ -73,6 +76,107 @@ static struct resource video_rom_resource = {
.flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
};
+/* does this oprom support the given pci device, or any of the devices
+ * that the driver supports?
+ */
+static bool match_id(struct pci_dev *pdev, unsigned short vendor, unsigned short device)
+{
+ struct pci_driver *drv = to_pci_driver(pdev->dev.driver);
+ const struct pci_device_id *id;
+
+ if (pdev->vendor == vendor && pdev->device == device)
+ return true;
+
+ for (id = drv ? drv->id_table : NULL; id && id->vendor; id++)
+ if (id->vendor == vendor && id->device == device)
+ break;
+
+ return id && id->vendor;
+}
+
+static bool probe_list(struct pci_dev *pdev, unsigned short vendor,
+ const void *rom_list)
+{
+ unsigned short device;
+
+ do {
+ if (get_kernel_nofault(device, rom_list) != 0)
+ device = 0;
+
+ if (device && match_id(pdev, vendor, device))
+ break;
+
+ rom_list += 2;
+ } while (device);
+
+ return !!device;
+}
+
+static struct resource *find_oprom(struct pci_dev *pdev)
+{
+ struct resource *oprom = NULL;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(adapter_rom_resources); i++) {
+ struct resource *res = &adapter_rom_resources[i];
+ unsigned short offset, vendor, device, list, rev;
+ const void *rom;
+
+ if (res->end == 0)
+ break;
+
+ rom = isa_bus_to_virt(res->start);
+ if (get_kernel_nofault(offset, rom + 0x18) != 0)
+ continue;
+
+ if (get_kernel_nofault(vendor, rom + offset + 0x4) != 0)
+ continue;
+
+ if (get_kernel_nofault(device, rom + offset + 0x6) != 0)
+ continue;
+
+ if (match_id(pdev, vendor, device)) {
+ oprom = res;
+ break;
+ }
+
+ if (get_kernel_nofault(list, rom + offset + 0x8) == 0 &&
+ get_kernel_nofault(rev, rom + offset + 0xc) == 0 &&
+ rev >= 3 && list &&
+ probe_list(pdev, vendor, rom + offset + list)) {
+ oprom = res;
+ break;
+ }
+ }
+
+ return oprom;
+}
+
+void __iomem *pci_map_biosrom(struct pci_dev *pdev)
+{
+ struct resource *oprom = find_oprom(pdev);
+
+ if (!oprom)
+ return NULL;
+
+ return ioremap(oprom->start, resource_size(oprom));
+}
+EXPORT_SYMBOL(pci_map_biosrom);
+
+void pci_unmap_biosrom(void __iomem *image)
+{
+ iounmap(image);
+}
+EXPORT_SYMBOL(pci_unmap_biosrom);
+
+size_t pci_biosrom_size(struct pci_dev *pdev)
+{
+ struct resource *oprom = find_oprom(pdev);
+
+ return oprom ? resource_size(oprom) : 0;
+}
+EXPORT_SYMBOL(pci_biosrom_size);
+
#define ROMSIGNATURE 0xaa55
static int __init romsignature(const unsigned char *rom)
@@ -80,22 +184,22 @@ static int __init romsignature(const unsigned char *rom)
const unsigned short * const ptr = (const unsigned short *)rom;
unsigned short sig;
- return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
+ return get_kernel_nofault(sig, ptr) == 0 && sig == ROMSIGNATURE;
}
static int __init romchecksum(const unsigned char *rom, unsigned long length)
{
unsigned char sum, c;
- for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
+ for (sum = 0; length && get_kernel_nofault(c, rom++) == 0; length--)
sum += c;
return !length && !sum;
}
void __init probe_roms(void)
{
- const unsigned char *rom;
unsigned long start, length, upper;
+ const unsigned char *rom;
unsigned char c;
int i;
@@ -108,7 +212,7 @@ void __init probe_roms(void)
video_rom_resource.start = start;
- if (probe_kernel_address(rom + 2, c) != 0)
+ if (get_kernel_nofault(c, rom + 2) != 0)
continue;
/* 0 < length <= 0x7f * 512, historically */
@@ -133,7 +237,7 @@ void __init probe_roms(void)
/* check for extension rom (ignore length byte!) */
rom = isa_bus_to_virt(extension_rom_resource.start);
if (romsignature(rom)) {
- length = extension_rom_resource.end - extension_rom_resource.start + 1;
+ length = resource_size(&extension_rom_resource);
if (romchecksum(rom, length)) {
request_resource(&iomem_resource, &extension_rom_resource);
upper = extension_rom_resource.start;
@@ -146,7 +250,7 @@ void __init probe_roms(void)
if (!romsignature(rom))
continue;
- if (probe_kernel_address(rom + 2, c) != 0)
+ if (get_kernel_nofault(c, rom + 2) != 0)
continue;
/* 0 < length <= 0x7f * 512, historically */
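
pci_map_biosrom(), pci_unmap_biosrom() and pci_biosrom_size() added above let a PCI driver read the adapter ROM that probe_roms() discovered at boot. A hedged usage sketch follows; the helper name and the printed fields are illustrative only.

#include <linux/pci.h>
#include <linux/io.h>
#include <asm/probe_roms.h>

static int example_read_oprom(struct pci_dev *pdev)
{
	size_t len = pci_biosrom_size(pdev);
	void __iomem *rom;

	if (!len)
		return -ENODEV;

	rom = pci_map_biosrom(pdev);
	if (!rom)
		return -ENODEV;

	/* The first word of a valid option ROM is the 0xaa55 signature. */
	pci_info(pdev, "option ROM: %zu bytes, signature %#06x\n",
		 len, readw(rom));

	pci_unmap_biosrom(rom);
	return 0;
}
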
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 994dd6a4a2a0..4c718f8adc59 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -1,131 +1,295 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/smp.h>
+#include <linux/cpu.h>
#include <linux/prctl.h>
#include <linux/slab.h>
#include <linux/sched.h>
-#include <linux/module.h>
+#include <linux/sched/idle.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
+#include <linux/init.h>
+#include <linux/export.h>
#include <linux/pm.h>
-#include <linux/clockchips.h>
+#include <linux/tick.h>
#include <linux/random.h>
-#include <trace/power.h>
-#include <asm/system.h>
+#include <linux/user-return-notifier.h>
+#include <linux/dmi.h>
+#include <linux/utsname.h>
+#include <linux/stackprotector.h>
+#include <linux/cpuidle.h>
+#include <linux/acpi.h>
+#include <linux/elf-randomize.h>
+#include <linux/static_call.h>
+#include <trace/events/power.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/entry-common.h>
+#include <asm/cpu.h>
+#include <asm/cpuid/api.h>
#include <asm/apic.h>
-#include <asm/syscalls.h>
-#include <asm/idle.h>
-#include <asm/uaccess.h>
-#include <asm/i387.h>
-#include <asm/ds.h>
+#include <linux/uaccess.h>
+#include <asm/mwait.h>
+#include <asm/fpu/api.h>
+#include <asm/fpu/sched.h>
+#include <asm/fpu/xstate.h>
+#include <asm/debugreg.h>
+#include <asm/nmi.h>
+#include <asm/tlbflush.h>
+#include <asm/mce.h>
+#include <asm/vm86.h>
+#include <asm/switch_to.h>
+#include <asm/desc.h>
+#include <asm/prctl.h>
+#include <asm/spec-ctrl.h>
+#include <asm/io_bitmap.h>
+#include <asm/proto.h>
+#include <asm/frame.h>
+#include <asm/unwind.h>
+#include <asm/tdx.h>
+#include <asm/mmu_context.h>
+#include <asm/msr.h>
+#include <asm/shstk.h>
+
+#include "process.h"
+
+/*
+ * per-CPU TSS segments. Threads are completely 'soft' on Linux,
+ * no more per-task TSS's. The TSS size is kept cacheline-aligned
+ * so they are allowed to end up in the .data..cacheline_aligned
+ * section. Since TSS's are completely CPU-local, we want them
+ * on exact cacheline boundaries, to eliminate cacheline ping-pong.
+ */
+__visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = {
+ .x86_tss = {
+ /*
+ * .sp0 is only used when entering ring 0 from a lower
+ * privilege level. Since the init task never runs anything
+ * but ring 0 code, there is no need for a valid value here.
+ * Poison it.
+ */
+ .sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
-unsigned long idle_halt;
-EXPORT_SYMBOL(idle_halt);
-unsigned long idle_nomwait;
-EXPORT_SYMBOL(idle_nomwait);
+#ifdef CONFIG_X86_32
+ .sp1 = TOP_OF_INIT_STACK,
-struct kmem_cache *task_xstate_cachep;
+ .ss0 = __KERNEL_DS,
+ .ss1 = __KERNEL_CS,
+#endif
+ .io_bitmap_base = IO_BITMAP_OFFSET_INVALID,
+ },
+};
+EXPORT_PER_CPU_SYMBOL(cpu_tss_rw);
-DEFINE_TRACE(power_start);
-DEFINE_TRACE(power_end);
+DEFINE_PER_CPU(bool, __tss_limit_invalid);
+EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid);
+
+/*
+ * The cache may be in an incoherent state and needs flushing during kexec.
+ * E.g., on SME/TDX platforms, dirty cacheline aliases with and without
+ * encryption bit(s) can coexist and the cache needs to be flushed before
+ * booting to the new kernel to avoid silent memory corruption due to
+ * dirty cachelines with different encryption properties being written back
+ * to memory.
+ */
+DEFINE_PER_CPU(bool, cache_state_incoherent);
+/*
+ * this gets called so that we can store lazy state into memory and copy the
+ * current task into the new thread.
+ */
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
- *dst = *src;
- if (src->thread.xstate) {
- dst->thread.xstate = kmem_cache_alloc(task_xstate_cachep,
- GFP_KERNEL);
- if (!dst->thread.xstate)
- return -ENOMEM;
- WARN_ON((unsigned long)dst->thread.xstate & 15);
- memcpy(dst->thread.xstate, src->thread.xstate, xstate_size);
- }
+ /* fpu_clone() will initialize the "dst_fpu" memory */
+ memcpy_and_pad(dst, arch_task_struct_size, src, sizeof(*dst), 0);
+
+#ifdef CONFIG_VM86
+ dst->thread.vm86 = NULL;
+#endif
+
return 0;
}
-void free_thread_xstate(struct task_struct *tsk)
+#ifdef CONFIG_X86_64
+void arch_release_task_struct(struct task_struct *tsk)
{
- if (tsk->thread.xstate) {
- kmem_cache_free(task_xstate_cachep, tsk->thread.xstate);
- tsk->thread.xstate = NULL;
- }
-
- WARN(tsk->thread.ds_ctx, "leaking DS context\n");
+ if (fpu_state_size_dynamic() && !(tsk->flags & (PF_KTHREAD | PF_USER_WORKER)))
+ fpstate_free(x86_task_fpu(tsk));
}
+#endif
-void free_thread_info(struct thread_info *ti)
+/*
+ * Free thread data structures etc..
+ */
+void exit_thread(struct task_struct *tsk)
{
- free_thread_xstate(ti->task);
- free_pages((unsigned long)ti, get_order(THREAD_SIZE));
+ struct thread_struct *t = &tsk->thread;
+
+ if (test_thread_flag(TIF_IO_BITMAP))
+ io_bitmap_exit(tsk);
+
+ free_vm86(t);
+
+ shstk_free(tsk);
+ fpu__drop(tsk);
}
-void arch_task_cache_init(void)
+static int set_new_tls(struct task_struct *p, unsigned long tls)
{
- task_xstate_cachep =
- kmem_cache_create("task_xstate", xstate_size,
- __alignof__(union thread_xstate),
- SLAB_PANIC | SLAB_NOTRACK, NULL);
+ struct user_desc __user *utls = (struct user_desc __user *)tls;
+
+ if (in_ia32_syscall())
+ return do_set_thread_area(p, -1, utls, 0);
+ else
+ return do_set_thread_area_64(p, ARCH_SET_FS, tls);
}
-/*
- * Free current thread data structures etc..
- */
-void exit_thread(void)
+__visible void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
+ int (*fn)(void *), void *fn_arg)
{
- struct task_struct *me = current;
- struct thread_struct *t = &me->thread;
- unsigned long *bp = t->io_bitmap_ptr;
+ schedule_tail(prev);
- if (bp) {
- struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
-
- t->io_bitmap_ptr = NULL;
- clear_thread_flag(TIF_IO_BITMAP);
+ /* Is this a kernel thread? */
+ if (unlikely(fn)) {
+ fn(fn_arg);
/*
- * Careful, clear this in the TSS too:
+ * A kernel thread is allowed to return here after successfully
+ * calling kernel_execve(). Exit to userspace to complete the
+ * execve() syscall.
*/
- memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
- t->io_bitmap_max = 0;
- put_cpu();
- kfree(bp);
+ regs->ax = 0;
}
+
+ syscall_exit_to_user_mode(regs);
}
-void flush_thread(void)
+int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
{
- struct task_struct *tsk = current;
+ u64 clone_flags = args->flags;
+ unsigned long sp = args->stack;
+ unsigned long tls = args->tls;
+ struct inactive_task_frame *frame;
+ struct fork_frame *fork_frame;
+ struct pt_regs *childregs;
+ unsigned long new_ssp;
+ int ret = 0;
+
+ childregs = task_pt_regs(p);
+ fork_frame = container_of(childregs, struct fork_frame, regs);
+ frame = &fork_frame->frame;
+
+ frame->bp = encode_frame_pointer(childregs);
+ frame->ret_addr = (unsigned long) ret_from_fork_asm;
+ p->thread.sp = (unsigned long) fork_frame;
+ p->thread.io_bitmap = NULL;
+ clear_tsk_thread_flag(p, TIF_IO_BITMAP);
+ p->thread.iopl_warn = 0;
+ memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
#ifdef CONFIG_X86_64
- if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
- clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
- if (test_tsk_thread_flag(tsk, TIF_IA32)) {
- clear_tsk_thread_flag(tsk, TIF_IA32);
- } else {
- set_tsk_thread_flag(tsk, TIF_IA32);
- current_thread_info()->status |= TS_COMPAT;
- }
- }
+ current_save_fsgs();
+ p->thread.fsindex = current->thread.fsindex;
+ p->thread.fsbase = current->thread.fsbase;
+ p->thread.gsindex = current->thread.gsindex;
+ p->thread.gsbase = current->thread.gsbase;
+
+ savesegment(es, p->thread.es);
+ savesegment(ds, p->thread.ds);
+
+ if (p->mm && (clone_flags & (CLONE_VM | CLONE_VFORK)) == CLONE_VM)
+ set_bit(MM_CONTEXT_LOCK_LAM, &p->mm->context.flags);
+#else
+ p->thread.sp0 = (unsigned long) (childregs + 1);
+ savesegment(gs, p->thread.gs);
+ /*
+ * Clear all status flags including IF and set fixed bit. 64bit
+ * does not have this initialization as the frame does not contain
+ * flags. The flags consistency (especially vs. AC) is ensured
+ * there via objtool, which lacks 32bit support.
+ */
+ frame->flags = X86_EFLAGS_FIXED;
#endif
- clear_tsk_thread_flag(tsk, TIF_DEBUG);
+ /*
+ * Allocate a new shadow stack for thread if needed. If shadow stack,
+ * is disabled, new_ssp will remain 0, and fpu_clone() will know not to
+ * update it.
+ */
+ new_ssp = shstk_alloc_thread_stack(p, clone_flags, args->stack_size);
+ if (IS_ERR_VALUE(new_ssp))
+ return PTR_ERR((void *)new_ssp);
- tsk->thread.debugreg0 = 0;
- tsk->thread.debugreg1 = 0;
- tsk->thread.debugreg2 = 0;
- tsk->thread.debugreg3 = 0;
- tsk->thread.debugreg6 = 0;
- tsk->thread.debugreg7 = 0;
- memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
+ fpu_clone(p, clone_flags, args->fn, new_ssp);
+
+ /* Kernel thread ? */
+ if (unlikely(p->flags & PF_KTHREAD)) {
+ p->thread.pkru = pkru_get_init_value();
+ memset(childregs, 0, sizeof(struct pt_regs));
+ kthread_frame_init(frame, args->fn, args->fn_arg);
+ return 0;
+ }
+
+ /*
+ * Clone current's PKRU value from hardware. tsk->thread.pkru
+ * is only valid when scheduled out.
+ */
+ p->thread.pkru = read_pkru();
+
+ frame->bx = 0;
+ *childregs = *current_pt_regs();
+ childregs->ax = 0;
+ if (sp)
+ childregs->sp = sp;
+
+ if (unlikely(args->fn)) {
+ /*
+ * A user space thread, but it doesn't return to
+ * ret_after_fork().
+ *
+ * In order to indicate that to tools like gdb,
+ * we reset the stack and instruction pointers.
+ *
+ * It does the same kernel frame setup to return to a kernel
+ * function that a kernel thread does.
+ */
+ childregs->sp = 0;
+ childregs->ip = 0;
+ kthread_frame_init(frame, args->fn, args->fn_arg);
+ return 0;
+ }
+
+ /* Set a new TLS for the child thread? */
+ if (clone_flags & CLONE_SETTLS)
+ ret = set_new_tls(p, tls);
+
+ if (!ret && unlikely(test_tsk_thread_flag(current, TIF_IO_BITMAP)))
+ io_bitmap_share(p);
+
+ return ret;
+}
+
+static void pkru_flush_thread(void)
+{
/*
- * Forget coprocessor state..
+ * If PKRU is enabled the default PKRU value has to be loaded into
+ * the hardware right here (similar to context switch).
*/
- tsk->fpu_counter = 0;
- clear_fpu(tsk);
- clear_used_math();
+ pkru_write_default();
}
-static void hard_disable_TSC(void)
+void flush_thread(void)
{
- write_cr4(read_cr4() | X86_CR4_TSD);
+ struct task_struct *tsk = current;
+
+ flush_ptrace_hw_breakpoint(tsk);
+ memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
+
+ fpu_flush_thread();
+ pkru_flush_thread();
}
void disable_TSC(void)
@@ -136,15 +300,10 @@ void disable_TSC(void)
* Must flip the CPU state synchronously with
* TIF_NOTSC in the current running context.
*/
- hard_disable_TSC();
+ cr4_set_bits(X86_CR4_TSD);
preempt_enable();
}
-static void hard_enable_TSC(void)
-{
- write_cr4(read_cr4() & ~X86_CR4_TSD);
-}
-
static void enable_TSC(void)
{
preempt_disable();
@@ -153,7 +312,7 @@ static void enable_TSC(void)
* Must flip the CPU state synchronously with
* TIF_NOTSC in the current running context.
*/
- hard_enable_TSC();
+ cr4_clear_bits(X86_CR4_TSD);
preempt_enable();
}
@@ -181,437 +340,678 @@ int set_tsc_mode(unsigned int val)
return 0;
}
-void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
- struct tss_struct *tss)
-{
- struct thread_struct *prev, *next;
-
- prev = &prev_p->thread;
- next = &next_p->thread;
-
- if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
- test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
- ds_switch_to(prev_p, next_p);
- else if (next->debugctlmsr != prev->debugctlmsr)
- update_debugctlmsr(next->debugctlmsr);
+DEFINE_PER_CPU(u64, msr_misc_features_shadow);
- if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
- set_debugreg(next->debugreg0, 0);
- set_debugreg(next->debugreg1, 1);
- set_debugreg(next->debugreg2, 2);
- set_debugreg(next->debugreg3, 3);
- /* no 4 and 5 */
- set_debugreg(next->debugreg6, 6);
- set_debugreg(next->debugreg7, 7);
- }
+static void set_cpuid_faulting(bool on)
+{
- if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
- test_tsk_thread_flag(next_p, TIF_NOTSC)) {
- /* prev and next are different */
- if (test_tsk_thread_flag(next_p, TIF_NOTSC))
- hard_disable_TSC();
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
+ u64 msrval;
+
+ msrval = this_cpu_read(msr_misc_features_shadow);
+ msrval &= ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT;
+ msrval |= (on << MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT);
+ this_cpu_write(msr_misc_features_shadow, msrval);
+ wrmsrq(MSR_MISC_FEATURES_ENABLES, msrval);
+ } else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+ if (on)
+ msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_CPUID_USER_DIS_BIT);
else
- hard_enable_TSC();
+ msr_clear_bit(MSR_K7_HWCR, MSR_K7_HWCR_CPUID_USER_DIS_BIT);
}
+}
- if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
- /*
- * Copy the relevant range of the IO bitmap.
- * Normally this is 128 bytes or less:
- */
- memcpy(tss->io_bitmap, next->io_bitmap_ptr,
- max(prev->io_bitmap_max, next->io_bitmap_max));
- } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
+static void disable_cpuid(void)
+{
+ preempt_disable();
+ if (!test_and_set_thread_flag(TIF_NOCPUID)) {
/*
- * Clear any possible leftover bits:
+ * Must flip the CPU state synchronously with
+ * TIF_NOCPUID in the current running context.
*/
- memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
+ set_cpuid_faulting(true);
}
+ preempt_enable();
}
-int sys_fork(struct pt_regs *regs)
+static void enable_cpuid(void)
{
- return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
+ preempt_disable();
+ if (test_and_clear_thread_flag(TIF_NOCPUID)) {
+ /*
+ * Must flip the CPU state synchronously with
+ * TIF_NOCPUID in the current running context.
+ */
+ set_cpuid_faulting(false);
+ }
+ preempt_enable();
}
-/*
- * This is trivial, and on the face of it looks like it
- * could equally well be done in user mode.
- *
- * Not so, for quite unobvious reasons - register pressure.
- * In user mode vfork() cannot have a stack frame, and if
- * done by calling the "clone()" system call directly, you
- * do not have enough call-clobbered registers to hold all
- * the information you need.
- */
-int sys_vfork(struct pt_regs *regs)
+static int get_cpuid_mode(void)
{
- return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
- NULL, NULL);
+ return !test_thread_flag(TIF_NOCPUID);
}
+static int set_cpuid_mode(unsigned long cpuid_enabled)
+{
+ if (!boot_cpu_has(X86_FEATURE_CPUID_FAULT))
+ return -ENODEV;
-/*
- * Idle related variables and functions
- */
-unsigned long boot_option_idle_override = 0;
-EXPORT_SYMBOL(boot_option_idle_override);
+ if (cpuid_enabled)
+ enable_cpuid();
+ else
+ disable_cpuid();
-/*
- * Powermanagement idle function, if any..
- */
-void (*pm_idle)(void);
-EXPORT_SYMBOL(pm_idle);
+ return 0;
+}
-#ifdef CONFIG_X86_32
/*
- * This halt magic was a workaround for ancient floppy DMA
- * wreckage. It should be safe to remove.
+ * Called immediately after a successful exec.
*/
-static int hlt_counter;
-void disable_hlt(void)
+void arch_setup_new_exec(void)
{
- hlt_counter++;
+ /* If cpuid was previously disabled for this task, re-enable it. */
+ if (test_thread_flag(TIF_NOCPUID))
+ enable_cpuid();
+
+ /*
+ * Don't inherit TIF_SSBD across exec boundary when
+ * PR_SPEC_DISABLE_NOEXEC is used.
+ */
+ if (test_thread_flag(TIF_SSBD) &&
+ task_spec_ssb_noexec(current)) {
+ clear_thread_flag(TIF_SSBD);
+ task_clear_spec_ssb_disable(current);
+ task_clear_spec_ssb_noexec(current);
+ speculation_ctrl_update(read_thread_flags());
+ }
+
+ mm_reset_untag_mask(current->mm);
}
-EXPORT_SYMBOL(disable_hlt);
-void enable_hlt(void)
+#ifdef CONFIG_X86_IOPL_IOPERM
+static inline void switch_to_bitmap(unsigned long tifp)
{
- hlt_counter--;
+ /*
+ * Invalidate I/O bitmap if the previous task used it. This prevents
+ * any possible leakage of an active I/O bitmap.
+ *
+ * If the next task has an I/O bitmap it will handle it on exit to
+ * user mode.
+ */
+ if (tifp & _TIF_IO_BITMAP)
+ tss_invalidate_io_bitmap();
}
-EXPORT_SYMBOL(enable_hlt);
-static inline int hlt_use_halt(void)
+static void tss_copy_io_bitmap(struct tss_struct *tss, struct io_bitmap *iobm)
{
- return (!hlt_counter && boot_cpu_data.hlt_works_ok);
+ /*
+ * Copy at least the byte range of the incoming task's bitmap which
+ * covers the permitted I/O ports.
+ *
+ * If the previous task which used an I/O bitmap had more bits
+ * permitted, then the copy needs to cover those as well so they
+ * get turned off.
+ */
+ memcpy(tss->io_bitmap.bitmap, iobm->bitmap,
+ max(tss->io_bitmap.prev_max, iobm->max));
+
+ /*
+ * Store the new max and the sequence number of this bitmap
+ * and a pointer to the bitmap itself.
+ */
+ tss->io_bitmap.prev_max = iobm->max;
+ tss->io_bitmap.prev_sequence = iobm->sequence;
}
-#else
-static inline int hlt_use_halt(void)
+
+/**
+ * native_tss_update_io_bitmap - Update I/O bitmap before exiting to user mode
+ */
+void native_tss_update_io_bitmap(void)
{
- return 1;
+ struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
+ struct thread_struct *t = &current->thread;
+ u16 *base = &tss->x86_tss.io_bitmap_base;
+
+ if (!test_thread_flag(TIF_IO_BITMAP)) {
+ native_tss_invalidate_io_bitmap();
+ return;
+ }
+
+ if (IS_ENABLED(CONFIG_X86_IOPL_IOPERM) && t->iopl_emul == 3) {
+ *base = IO_BITMAP_OFFSET_VALID_ALL;
+ } else {
+ struct io_bitmap *iobm = t->io_bitmap;
+
+ if (WARN_ON_ONCE(!iobm)) {
+ clear_thread_flag(TIF_IO_BITMAP);
+ native_tss_invalidate_io_bitmap();
+ }
+
+ /*
+ * Only copy bitmap data when the sequence number differs. The
+ * update time is accounted to the incoming task.
+ */
+ if (tss->io_bitmap.prev_sequence != iobm->sequence)
+ tss_copy_io_bitmap(tss, iobm);
+
+ /* Enable the bitmap */
+ *base = IO_BITMAP_OFFSET_VALID_MAP;
+ }
+
+ /*
+ * Make sure that the TSS limit is covering the IO bitmap. It might have
+ * been cut down by a VMEXIT to 0x67 which would cause a subsequent I/O
+ * access from user space to trigger a #GP because the bitmap is outside
+ * the TSS limit.
+ */
+ refresh_tss_limit();
}
+#else /* CONFIG_X86_IOPL_IOPERM */
+static inline void switch_to_bitmap(unsigned long tifp) { }
#endif
+#ifdef CONFIG_SMP
+
+struct ssb_state {
+ struct ssb_state *shared_state;
+ raw_spinlock_t lock;
+ unsigned int disable_state;
+ unsigned long local_state;
+};
+
+#define LSTATE_SSB 0
+
+static DEFINE_PER_CPU(struct ssb_state, ssb_state);
+
+void speculative_store_bypass_ht_init(void)
+{
+ struct ssb_state *st = this_cpu_ptr(&ssb_state);
+ unsigned int this_cpu = smp_processor_id();
+ unsigned int cpu;
+
+ st->local_state = 0;
+
+ /*
+ * Shared state setup happens once on the first bringup
+ * of the CPU. It's not destroyed on CPU hotunplug.
+ */
+ if (st->shared_state)
+ return;
+
+ raw_spin_lock_init(&st->lock);
+
+ /*
+ * Go over HT siblings and check whether one of them has set up the
+ * shared state pointer already.
+ */
+ for_each_cpu(cpu, topology_sibling_cpumask(this_cpu)) {
+ if (cpu == this_cpu)
+ continue;
+
+ if (!per_cpu(ssb_state, cpu).shared_state)
+ continue;
+
+ /* Link it to the state of the sibling: */
+ st->shared_state = per_cpu(ssb_state, cpu).shared_state;
+ return;
+ }
+
+ /*
+ * First HT sibling to come up on the core. Link shared state of
+ * the first HT sibling to itself. The siblings on the same core
+ * which come up later will see the shared state pointer and link
+ * themselves to the state of this CPU.
+ */
+ st->shared_state = st;
+}
+
/*
- * We use this if we don't have any better
- * idle routine..
+ * Logic is: First HT sibling enables SSBD for both siblings in the core
+ * and the last sibling to disable it disables it for the whole core. This is how
+ * MSR_SPEC_CTRL works in "hardware":
+ *
+ * CORE_SPEC_CTRL = THREAD0_SPEC_CTRL | THREAD1_SPEC_CTRL
*/
-void default_idle(void)
+static __always_inline void amd_set_core_ssb_state(unsigned long tifn)
{
- if (hlt_use_halt()) {
- struct power_trace it;
+ struct ssb_state *st = this_cpu_ptr(&ssb_state);
+ u64 msr = x86_amd_ls_cfg_base;
+
+ if (!static_cpu_has(X86_FEATURE_ZEN)) {
+ msr |= ssbd_tif_to_amd_ls_cfg(tifn);
+ wrmsrq(MSR_AMD64_LS_CFG, msr);
+ return;
+ }
- trace_power_start(&it, POWER_CSTATE, 1);
- current_thread_info()->status &= ~TS_POLLING;
+ if (tifn & _TIF_SSBD) {
/*
- * TS_POLLING-cleared state must be visible before we
- * test NEED_RESCHED:
+ * Since this can race with prctl(), block reentry on the
+ * same CPU.
*/
- smp_mb();
+ if (__test_and_set_bit(LSTATE_SSB, &st->local_state))
+ return;
- if (!need_resched())
- safe_halt(); /* enables interrupts racelessly */
- else
- local_irq_enable();
- current_thread_info()->status |= TS_POLLING;
- trace_power_end(&it);
+ msr |= x86_amd_ls_cfg_ssbd_mask;
+
+ raw_spin_lock(&st->shared_state->lock);
+ /* First sibling enables SSBD: */
+ if (!st->shared_state->disable_state)
+ wrmsrq(MSR_AMD64_LS_CFG, msr);
+ st->shared_state->disable_state++;
+ raw_spin_unlock(&st->shared_state->lock);
} else {
- local_irq_enable();
- /* loop is done by the caller */
- cpu_relax();
+ if (!__test_and_clear_bit(LSTATE_SSB, &st->local_state))
+ return;
+
+ raw_spin_lock(&st->shared_state->lock);
+ st->shared_state->disable_state--;
+ if (!st->shared_state->disable_state)
+ wrmsrq(MSR_AMD64_LS_CFG, msr);
+ raw_spin_unlock(&st->shared_state->lock);
}
}
-#ifdef CONFIG_APM_MODULE
-EXPORT_SYMBOL(default_idle);
+#else
+static __always_inline void amd_set_core_ssb_state(unsigned long tifn)
+{
+ u64 msr = x86_amd_ls_cfg_base | ssbd_tif_to_amd_ls_cfg(tifn);
+
+ wrmsrq(MSR_AMD64_LS_CFG, msr);
+}
#endif
-void stop_this_cpu(void *dummy)
+static __always_inline void amd_set_ssb_virt_state(unsigned long tifn)
{
- local_irq_disable();
/*
- * Remove this CPU:
+ * SSBD has the same definition in SPEC_CTRL and VIRT_SPEC_CTRL,
+ * so ssbd_tif_to_spec_ctrl() just works.
*/
- set_cpu_online(smp_processor_id(), false);
- disable_local_APIC();
+ wrmsrq(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn));
+}
- for (;;) {
- if (hlt_works(smp_processor_id()))
- halt();
+/*
+ * Update the MSRs managing speculation control, during context switch.
+ *
+ * tifp: Previous task's thread flags
+ * tifn: Next task's thread flags
+ */
+static __always_inline void __speculation_ctrl_update(unsigned long tifp,
+ unsigned long tifn)
+{
+ unsigned long tif_diff = tifp ^ tifn;
+ u64 msr = x86_spec_ctrl_base;
+ bool updmsr = false;
+
+ lockdep_assert_irqs_disabled();
+
+ /* Handle change of TIF_SSBD depending on the mitigation method. */
+ if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) {
+ if (tif_diff & _TIF_SSBD)
+ amd_set_ssb_virt_state(tifn);
+ } else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) {
+ if (tif_diff & _TIF_SSBD)
+ amd_set_core_ssb_state(tifn);
+ } else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
+ static_cpu_has(X86_FEATURE_AMD_SSBD)) {
+ updmsr |= !!(tif_diff & _TIF_SSBD);
+ msr |= ssbd_tif_to_spec_ctrl(tifn);
+ }
+
+ /* Only evaluate TIF_SPEC_IB if conditional STIBP is enabled. */
+ if (IS_ENABLED(CONFIG_SMP) &&
+ static_branch_unlikely(&switch_to_cond_stibp)) {
+ updmsr |= !!(tif_diff & _TIF_SPEC_IB);
+ msr |= stibp_tif_to_spec_ctrl(tifn);
}
+
+ if (updmsr)
+ update_spec_ctrl_cond(msr);
}
-static void do_nothing(void *unused)
+static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk)
{
+ if (test_and_clear_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE)) {
+ if (task_spec_ssb_disable(tsk))
+ set_tsk_thread_flag(tsk, TIF_SSBD);
+ else
+ clear_tsk_thread_flag(tsk, TIF_SSBD);
+
+ if (task_spec_ib_disable(tsk))
+ set_tsk_thread_flag(tsk, TIF_SPEC_IB);
+ else
+ clear_tsk_thread_flag(tsk, TIF_SPEC_IB);
+ }
+ /* Return the updated threadinfo flags */
+ return read_task_thread_flags(tsk);
}
-/*
- * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
- * pm_idle and update to new pm_idle value. Required while changing pm_idle
- * handler on SMP systems.
- *
- * Caller must have changed pm_idle to the new value before the call. Old
- * pm_idle value will not be used by any CPU after the return of this function.
- */
-void cpu_idle_wait(void)
+void speculation_ctrl_update(unsigned long tif)
{
- smp_mb();
- /* kick all the CPUs so that they exit out of pm_idle */
- smp_call_function(do_nothing, NULL, 1);
+ unsigned long flags;
+
+ /* Forced update. Make sure all relevant TIF flags are different */
+ local_irq_save(flags);
+ __speculation_ctrl_update(~tif, tif);
+ local_irq_restore(flags);
}
-EXPORT_SYMBOL_GPL(cpu_idle_wait);
-/*
- * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
- * which can obviate IPI to trigger checking of need_resched.
- * We execute MONITOR against need_resched and enter optimized wait state
- * through MWAIT. Whenever someone changes need_resched, we would be woken
- * up from MWAIT (without an IPI).
- *
- * New with Core Duo processors, MWAIT can take some hints based on CPU
- * capability.
- */
-void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
+/* Called from seccomp/prctl update */
+void speculation_ctrl_update_current(void)
{
- struct power_trace it;
+ preempt_disable();
+ speculation_ctrl_update(speculation_ctrl_update_tif(current));
+ preempt_enable();
+}
- trace_power_start(&it, POWER_CSTATE, (ax>>4)+1);
- if (!need_resched()) {
- if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
- clflush((void *)&current_thread_info()->flags);
+static inline void cr4_toggle_bits_irqsoff(unsigned long mask)
+{
+ unsigned long newval, cr4 = this_cpu_read(cpu_tlbstate.cr4);
- __monitor((void *)&current_thread_info()->flags, 0, 0);
- smp_mb();
- if (!need_resched())
- __mwait(ax, cx);
+ newval = cr4 ^ mask;
+ if (newval != cr4) {
+ this_cpu_write(cpu_tlbstate.cr4, newval);
+ __write_cr4(newval);
}
- trace_power_end(&it);
}
-/* Default MONITOR/MWAIT with no hints, used for default C1 state */
-static void mwait_idle(void)
+void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
{
- struct power_trace it;
- if (!need_resched()) {
- trace_power_start(&it, POWER_CSTATE, 1);
- if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
- clflush((void *)&current_thread_info()->flags);
+ unsigned long tifp, tifn;
- __monitor((void *)&current_thread_info()->flags, 0, 0);
- smp_mb();
- if (!need_resched())
- __sti_mwait(0, 0);
- else
- local_irq_enable();
- trace_power_end(&it);
- } else
- local_irq_enable();
+ tifn = read_task_thread_flags(next_p);
+ tifp = read_task_thread_flags(prev_p);
+
+ switch_to_bitmap(tifp);
+
+ propagate_user_return_notify(prev_p, next_p);
+
+ if ((tifp & _TIF_BLOCKSTEP || tifn & _TIF_BLOCKSTEP) &&
+ arch_has_block_step()) {
+ unsigned long debugctl, msk;
+
+ rdmsrq(MSR_IA32_DEBUGCTLMSR, debugctl);
+ debugctl &= ~DEBUGCTLMSR_BTF;
+ msk = tifn & _TIF_BLOCKSTEP;
+ debugctl |= (msk >> TIF_BLOCKSTEP) << DEBUGCTLMSR_BTF_SHIFT;
+ wrmsrq(MSR_IA32_DEBUGCTLMSR, debugctl);
+ }
+
+ if ((tifp ^ tifn) & _TIF_NOTSC)
+ cr4_toggle_bits_irqsoff(X86_CR4_TSD);
+
+ if ((tifp ^ tifn) & _TIF_NOCPUID)
+ set_cpuid_faulting(!!(tifn & _TIF_NOCPUID));
+
+ if (likely(!((tifp | tifn) & _TIF_SPEC_FORCE_UPDATE))) {
+ __speculation_ctrl_update(tifp, tifn);
+ } else {
+ speculation_ctrl_update_tif(prev_p);
+ tifn = speculation_ctrl_update_tif(next_p);
+
+ /* Enforce MSR update to ensure consistent state */
+ __speculation_ctrl_update(~tifn, tifn);
+ }
}
/*
- * On SMP it's slightly faster (but much more power-consuming!)
- * to poll the ->work.need_resched flag instead of waiting for the
- * cross-CPU IPI to arrive. Use this option with caution.
+ * Idle related variables and functions
+ */
+unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
+EXPORT_SYMBOL(boot_option_idle_override);
+
+/*
+ * We use this if we don't have any better idle routine..
*/
-static void poll_idle(void)
+void __cpuidle default_idle(void)
+{
+ raw_safe_halt();
+ raw_local_irq_disable();
+}
+#if defined(CONFIG_APM_MODULE) || defined(CONFIG_HALTPOLL_CPUIDLE_MODULE)
+EXPORT_SYMBOL(default_idle);
+#endif
+
+DEFINE_STATIC_CALL_NULL(x86_idle, default_idle);
+
+static bool x86_idle_set(void)
+{
+ return !!static_call_query(x86_idle);
+}
+
+#ifndef CONFIG_SMP
+static inline void __noreturn play_dead(void)
+{
+ BUG();
+}
+#endif
+
+void arch_cpu_idle_enter(void)
{
- struct power_trace it;
+ tsc_verify_tsc_adjust(false);
+ local_touch_nmi();
+}
- trace_power_start(&it, POWER_CSTATE, 0);
- local_irq_enable();
- while (!need_resched())
- cpu_relax();
- trace_power_end(&it);
+void __noreturn arch_cpu_idle_dead(void)
+{
+ play_dead();
}
/*
- * mwait selection logic:
- *
- * It depends on the CPU. For AMD CPUs that support MWAIT this is
- * wrong. Family 0x10 and 0x11 CPUs will enter C1 on HLT. Powersavings
- * then depend on a clock divisor and current Pstate of the core. If
- * all cores of a processor are in halt state (C1) the processor can
- * enter the C1E (C1 enhanced) state. If mwait is used this will never
- * happen.
- *
- * idle=mwait overrides this decision and forces the usage of mwait.
+ * Called from the generic idle code.
*/
-static int __cpuinitdata force_mwait;
+void __cpuidle arch_cpu_idle(void)
+{
+ static_call(x86_idle)();
+}
+EXPORT_SYMBOL_GPL(arch_cpu_idle);
+
+#ifdef CONFIG_XEN
+bool xen_set_default_idle(void)
+{
+ bool ret = x86_idle_set();
+
+ static_call_update(x86_idle, default_idle);
+
+ return ret;
+}
+#endif
-#define MWAIT_INFO 0x05
-#define MWAIT_ECX_EXTENDED_INFO 0x01
-#define MWAIT_EDX_C1 0xf0
+struct cpumask cpus_stop_mask;
-static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
+void __noreturn stop_this_cpu(void *dummy)
{
- u32 eax, ebx, ecx, edx;
+ struct cpuinfo_x86 *c = this_cpu_ptr(&cpu_info);
+ unsigned int cpu = smp_processor_id();
- if (force_mwait)
- return 1;
+ local_irq_disable();
- if (c->cpuid_level < MWAIT_INFO)
- return 0;
+ /*
+ * Remove this CPU from the online mask and disable it
+ * unconditionally. This might be redundant in case that the reboot
+ * vector was handled late and stop_other_cpus() sent an NMI.
+ *
+ * According to SDM and APM NMIs can be accepted even after soft
+ * disabling the local APIC.
+ */
+ set_cpu_online(cpu, false);
+ disable_local_APIC();
+ mcheck_cpu_clear(c);
- cpuid(MWAIT_INFO, &eax, &ebx, &ecx, &edx);
- /* Check, whether EDX has extended info about MWAIT */
- if (!(ecx & MWAIT_ECX_EXTENDED_INFO))
- return 1;
+ if (this_cpu_read(cache_state_incoherent))
+ wbinvd();
/*
- * edx enumeratios MONITOR/MWAIT extensions. Check, whether
- * C1 supports MWAIT
+ * This brings a cache line back and dirties it, but
+ * native_stop_other_cpus() will overwrite cpus_stop_mask after it
+ * observed that all CPUs reported stop. This write will invalidate
+ * the related cache line on this CPU.
*/
- return (edx & MWAIT_EDX_C1);
+ cpumask_clear_cpu(cpu, &cpus_stop_mask);
+
+#ifdef CONFIG_SMP
+ if (smp_ops.stop_this_cpu) {
+ smp_ops.stop_this_cpu();
+ BUG();
+ }
+#endif
+
+ for (;;) {
+ /*
+ * Use native_halt() so that memory contents don't change
+ * (stack usage and variables) after possibly issuing the
+ * wbinvd() above.
+ */
+ native_halt();
+ }
}
/*
- * Check for AMD CPUs, which have potentially C1E support
+ * Prefer MWAIT over HALT if MWAIT is supported, MWAIT_CPUID leaf
+ * exists and whenever MONITOR/MWAIT extensions are present there is at
+ * least one C1 substate.
+ *
+ * Do not prefer MWAIT if the MONITOR instruction has a bug or idle=nomwait
+ * is passed on the kernel command line.
*/
-static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
+static __init bool prefer_mwait_c1_over_halt(void)
{
- if (c->x86_vendor != X86_VENDOR_AMD)
- return 0;
+ const struct cpuinfo_x86 *c = &boot_cpu_data;
+ u32 eax, ebx, ecx, edx;
- if (c->x86 < 0x0F)
- return 0;
+ /* If override is enforced on the command line, fall back to HALT. */
+ if (boot_option_idle_override != IDLE_NO_OVERRIDE)
+ return false;
- /* Family 0x0f models < rev F do not have C1E */
- if (c->x86 == 0x0f && c->x86_model < 0x40)
- return 0;
+ /* MWAIT is not supported on this platform. Fall back to HALT */
+ if (!cpu_has(c, X86_FEATURE_MWAIT))
+ return false;
- return 1;
-}
+ /* Monitor has a bug or APIC stops in C1E. Fall back to HALT */
+ if (boot_cpu_has_bug(X86_BUG_MONITOR) || boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E))
+ return false;
-static cpumask_var_t c1e_mask;
-static int c1e_detected;
+ cpuid(CPUID_LEAF_MWAIT, &eax, &ebx, &ecx, &edx);
-void c1e_remove_cpu(int cpu)
-{
- if (c1e_mask != NULL)
- cpumask_clear_cpu(cpu, c1e_mask);
+ /*
+ * If MWAIT extensions are not available, it is safe to use MWAIT
+ * with EAX=0, ECX=0.
+ */
+ if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED))
+ return true;
+
+ /*
+ * If MWAIT extensions are available, there should be at least one
+ * MWAIT C1 substate present.
+ */
+ return !!(edx & MWAIT_C1_SUBSTATE_MASK);
}
/*
- * C1E aware idle routine. We check for C1E active in the interrupt
- * pending message MSR. If we detect C1E, then we handle it the same
- * way as C3 power states (local apic timer and TSC stop)
+ * MONITOR/MWAIT with no hints, used for default C1 state. This invokes MWAIT
+ * with interrupts enabled and no flags, which is backwards compatible with the
+ * original MWAIT implementation.
*/
-static void c1e_idle(void)
+static __cpuidle void mwait_idle(void)
{
if (need_resched())
return;
- if (!c1e_detected) {
- u32 lo, hi;
+ x86_idle_clear_cpu_buffers();
- rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
- if (lo & K8_INTP_C1E_ACTIVE_MASK) {
- c1e_detected = 1;
- if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
- mark_tsc_unstable("TSC halt in AMD C1E");
- printk(KERN_INFO "System has AMD C1E enabled\n");
- set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
- }
- }
+ if (!current_set_polling_and_test()) {
+ const void *addr = &current_thread_info()->flags;
- if (c1e_detected) {
- int cpu = smp_processor_id();
-
- if (!cpumask_test_cpu(cpu, c1e_mask)) {
- cpumask_set_cpu(cpu, c1e_mask);
- /*
- * Force broadcast so ACPI can not interfere. Needs
- * to run with interrupts enabled as it uses
- * smp_function_call.
- */
- local_irq_enable();
- clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
- &cpu);
- printk(KERN_INFO "Switch to broadcast mode on CPU%d\n",
- cpu);
- local_irq_disable();
- }
- clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
+ alternative_input("", "clflush (%[addr])", X86_BUG_CLFLUSH_MONITOR, [addr] "a" (addr));
+ __monitor(addr, 0, 0);
+ if (need_resched())
+ goto out;
- default_idle();
+ __sti_mwait(0, 0);
+ raw_local_irq_disable();
+ }
- /*
- * The switch back from broadcast mode needs to be
- * called with interrupts disabled.
- */
- local_irq_disable();
- clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
- local_irq_enable();
- } else
- default_idle();
+out:
+ __current_clr_polling();
}
-void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
+void __init select_idle_routine(void)
{
-#ifdef CONFIG_SMP
- if (pm_idle == poll_idle && smp_num_siblings > 1) {
- printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
- " performance may degrade.\n");
+ if (boot_option_idle_override == IDLE_POLL) {
+ if (IS_ENABLED(CONFIG_SMP) && __max_threads_per_core > 1)
+ pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
+ return;
}
-#endif
- if (pm_idle)
+
+ /* Required to guard against xen_set_default_idle() */
+ if (x86_idle_set())
return;
- if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
- /*
- * One CPU supports mwait => All CPUs supports mwait
- */
- printk(KERN_INFO "using mwait in idle threads.\n");
- pm_idle = mwait_idle;
- } else if (check_c1e_idle(c)) {
- printk(KERN_INFO "using C1E aware idle routine\n");
- pm_idle = c1e_idle;
- } else
- pm_idle = default_idle;
+ if (prefer_mwait_c1_over_halt()) {
+ pr_info("using mwait in idle threads\n");
+ static_call_update(x86_idle, mwait_idle);
+ } else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) {
+ pr_info("using TDX aware idle routine\n");
+ static_call_update(x86_idle, tdx_halt);
+ } else {
+ static_call_update(x86_idle, default_idle);
+ }
}
-void __init init_c1e_mask(void)
+void amd_e400_c1e_apic_setup(void)
{
- /* If we're using c1e_idle, we need to allocate c1e_mask. */
- if (pm_idle == c1e_idle) {
- alloc_cpumask_var(&c1e_mask, GFP_KERNEL);
- cpumask_clear(c1e_mask);
+ if (boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) {
+ pr_info("Switch to broadcast mode on CPU%d\n", smp_processor_id());
+ local_irq_disable();
+ tick_broadcast_force();
+ local_irq_enable();
}
}
+void __init arch_post_acpi_subsys_init(void)
+{
+ u32 lo, hi;
+
+ if (!boot_cpu_has_bug(X86_BUG_AMD_E400))
+ return;
+
+ /*
+ * AMD E400 detection needs to happen after ACPI has been enabled. If
+ * the machine is affected, K8_INTP_C1E_ACTIVE_MASK bits are set in
+ * MSR_K8_INT_PENDING_MSG.
+ */
+ rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
+ if (!(lo & K8_INTP_C1E_ACTIVE_MASK))
+ return;
+
+ boot_cpu_set_bug(X86_BUG_AMD_APIC_C1E);
+
+ if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
+ mark_tsc_unstable("TSC halt in AMD C1E");
+
+ if (IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE))
+ static_branch_enable(&arch_needs_tick_broadcast);
+ pr_info("System has AMD C1E erratum E400. Workaround enabled.\n");
+}
+
static int __init idle_setup(char *str)
{
if (!str)
return -EINVAL;
if (!strcmp(str, "poll")) {
- printk("using polling idle threads.\n");
- pm_idle = poll_idle;
- } else if (!strcmp(str, "mwait"))
- force_mwait = 1;
- else if (!strcmp(str, "halt")) {
- /*
- * When the boot option of idle=halt is added, halt is
- * forced to be used for CPU idle. In such case CPU C2/C3
- * won't be used again.
- * To continue to load the CPU idle driver, don't touch
- * the boot_option_idle_override.
- */
- pm_idle = default_idle;
- idle_halt = 1;
- return 0;
+ pr_info("using polling idle threads\n");
+ boot_option_idle_override = IDLE_POLL;
+ cpu_idle_poll_ctrl(true);
+ } else if (!strcmp(str, "halt")) {
+ /* 'idle=halt' HALT for idle. C-states are disabled. */
+ boot_option_idle_override = IDLE_HALT;
} else if (!strcmp(str, "nomwait")) {
- /*
- * If the boot option of "idle=nomwait" is added,
- * it means that mwait will be disabled for CPU C2/C3
- * states. In such case it won't touch the variable
- * of boot_option_idle_override.
- */
- idle_nomwait = 1;
- return 0;
- } else
- return -1;
+ /* 'idle=nomwait' disables MWAIT for idle */
+ boot_option_idle_override = IDLE_NOMWAIT;
+ } else {
+ return -EINVAL;
+ }
- boot_option_idle_override = 1;
return 0;
}
early_param("idle", idle_setup);
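
The TIF_IO_BITMAP/native_tss_update_io_bitmap() machinery earlier in this hunk is driven from user space by ioperm()/iopl(). A minimal user-space sketch is shown below; it needs CAP_SYS_RAWIO and the COM1 port range is only an example.

#include <sys/io.h>
#include <stdio.h>

int main(void)
{
	/* Request direct access to the eight legacy COM1 ports. */
	if (ioperm(0x3f8, 8, 1)) {
		perror("ioperm");
		return 1;
	}

	outb(0x00, 0x3f8 + 1);	/* for example, mask the UART interrupts */
	ioperm(0x3f8, 8, 0);	/* drop the permission again */
	return 0;
}
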
@@ -619,13 +1019,69 @@ early_param("idle", idle_setup);
unsigned long arch_align_stack(unsigned long sp)
{
if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
- sp -= get_random_int() % 8192;
+ sp -= get_random_u32_below(8192);
return sp & ~0xf;
}
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
- unsigned long range_end = mm->brk + 0x02000000;
- return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
+ if (mmap_is_ia32())
+ return randomize_page(mm->brk, SZ_32M);
+
+ return randomize_page(mm->brk, SZ_1G);
+}
+
+/*
+ * Called from fs/proc with a reference on @p to find the function
+ * which called into schedule(). This needs to be done carefully
+ * because the task might wake up and we might look at a stack
+ * changing under us.
+ */
+unsigned long __get_wchan(struct task_struct *p)
+{
+ struct unwind_state state;
+ unsigned long addr = 0;
+
+ if (!try_get_task_stack(p))
+ return 0;
+
+ for (unwind_start(&state, p, NULL, NULL); !unwind_done(&state);
+ unwind_next_frame(&state)) {
+ addr = unwind_get_return_address(&state);
+ if (!addr)
+ break;
+ if (in_sched_functions(addr))
+ continue;
+ break;
+ }
+
+ put_task_stack(p);
+
+ return addr;
+}
+
+SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
+{
+ switch (option) {
+ case ARCH_GET_CPUID:
+ return get_cpuid_mode();
+ case ARCH_SET_CPUID:
+ return set_cpuid_mode(arg2);
+ case ARCH_GET_XCOMP_SUPP:
+ case ARCH_GET_XCOMP_PERM:
+ case ARCH_REQ_XCOMP_PERM:
+ case ARCH_GET_XCOMP_GUEST_PERM:
+ case ARCH_REQ_XCOMP_GUEST_PERM:
+ return fpu_xstate_prctl(option, arg2);
+ }
+
+ if (!in_ia32_syscall())
+ return do_arch_prctl_64(current, option, arg2);
+
+ return -EINVAL;
}
+SYSCALL_DEFINE0(ni_syscall)
+{
+ return -ENOSYS;
+}
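
The ARCH_GET_CPUID/ARCH_SET_CPUID cases of the arch_prctl() syscall above toggle per-task CPUID faulting via get_cpuid_mode()/set_cpuid_mode(). A user-space sketch follows; it assumes the uapi ARCH_* constants and a CPU with X86_FEATURE_CPUID_FAULT, and the error handling is illustrative only.

#include <asm/prctl.h>
#include <sys/syscall.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	long enabled = syscall(SYS_arch_prctl, ARCH_GET_CPUID, 0);

	printf("CPUID is currently %s\n", enabled == 1 ? "enabled" : "faulting");

	/* Make CPUID raise SIGSEGV in this task from now on. */
	if (syscall(SYS_arch_prctl, ARCH_SET_CPUID, 0) != 0)
		perror("ARCH_SET_CPUID");

	return 0;
}
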
diff --git a/arch/x86/kernel/process.h b/arch/x86/kernel/process.h
new file mode 100644
index 000000000000..76b547b83232
--- /dev/null
+++ b/arch/x86/kernel/process.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+//
+// Code shared between 32 and 64 bit
+
+#include <asm/spec-ctrl.h>
+
+void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p);
+
+/*
+ * This needs to be inline to optimize for the common case where no extra
+ * work needs to be done.
+ */
+static inline void switch_to_extra(struct task_struct *prev,
+ struct task_struct *next)
+{
+ unsigned long next_tif = read_task_thread_flags(next);
+ unsigned long prev_tif = read_task_thread_flags(prev);
+
+ if (IS_ENABLED(CONFIG_SMP)) {
+ /*
+ * Avoid __switch_to_xtra() invocation when conditional
+ * STIBP is disabled and the only different bit is
+ * TIF_SPEC_IB. For CONFIG_SMP=n TIF_SPEC_IB is not
+ * in the TIF_WORK_CTXSW masks.
+ */
+ if (!static_branch_likely(&switch_to_cond_stibp)) {
+ prev_tif &= ~_TIF_SPEC_IB;
+ next_tif &= ~_TIF_SPEC_IB;
+ }
+ }
+
+ /*
+ * __switch_to_xtra() handles debug registers, i/o bitmaps,
+ * speculation mitigations etc.
+ */
+ if (unlikely(next_tif & _TIF_WORK_CTXSW_NEXT ||
+ prev_tif & _TIF_WORK_CTXSW_PREV))
+ __switch_to_xtra(prev, next);
+}
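
switch_to_extra() and __switch_to_xtra() react to the TIF_SSBD/TIF_SPEC_IB flags that user space sets through the speculation-control prctl(). A hedged user-space sketch of that interface follows; whether the calls succeed depends on the mitigations the running kernel actually exposes.

#include <linux/prctl.h>
#include <sys/prctl.h>
#include <stdio.h>

int main(void)
{
	/* Opt this task into the speculative store bypass mitigation. */
	if (prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS,
		  PR_SPEC_DISABLE, 0, 0))
		perror("PR_SPEC_STORE_BYPASS");

	/* Query the resulting state: a bitmask of PR_SPEC_* flags. */
	printf("ssb state: %#x\n",
	       (unsigned int)prctl(PR_GET_SPECULATION_CTRL,
				   PR_SPEC_STORE_BYPASS, 0, 0, 0));
	return 0;
}
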
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 59f4524984af..3ef15c2f152f 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -9,10 +9,11 @@
* This file handles the architecture-dependent parts of process handling..
*/
-#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
@@ -23,209 +24,83 @@
#include <linux/vmalloc.h>
#include <linux/user.h>
#include <linux/interrupt.h>
-#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/reboot.h>
-#include <linux/init.h>
#include <linux/mc146818rtc.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/kallsyms.h>
#include <linux/ptrace.h>
#include <linux/personality.h>
-#include <linux/tick.h>
#include <linux/percpu.h>
#include <linux/prctl.h>
-#include <linux/dmi.h>
#include <linux/ftrace.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/kdebug.h>
+#include <linux/syscalls.h>
-#include <asm/pgtable.h>
-#include <asm/system.h>
#include <asm/ldt.h>
#include <asm/processor.h>
-#include <asm/i387.h>
+#include <asm/fpu/sched.h>
#include <asm/desc.h>
-#ifdef CONFIG_MATH_EMULATION
-#include <asm/math_emu.h>
-#endif
#include <linux/err.h>
#include <asm/tlbflush.h>
#include <asm/cpu.h>
-#include <asm/idle.h>
-#include <asm/syscalls.h>
-#include <asm/ds.h>
+#include <asm/debugreg.h>
+#include <asm/switch_to.h>
+#include <asm/vm86.h>
+#include <asm/resctrl.h>
+#include <asm/proto.h>
-asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
+#include "process.h"
-DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
-EXPORT_PER_CPU_SYMBOL(current_task);
-
-/*
- * Return saved PC of a blocked thread.
- */
-unsigned long thread_saved_pc(struct task_struct *tsk)
-{
- return ((unsigned long *)tsk->thread.sp)[3];
-}
-
-#ifndef CONFIG_SMP
-static inline void play_dead(void)
-{
- BUG();
-}
-#endif
-
-/*
- * The idle thread. There's no useful work to be
- * done, so just try to conserve power and have a
- * low exit latency (ie sit in a loop waiting for
- * somebody to say that they'd like to reschedule)
- */
-void cpu_idle(void)
-{
- int cpu = smp_processor_id();
-
- /*
- * If we're the non-boot CPU, nothing set the stack canary up
- * for us. CPU0 already has it initialized but no harm in
- * doing it again. This is a good place for updating it, as
- * we wont ever return from this function (so the invalid
- * canaries already on the stack wont ever trigger).
- */
- boot_init_stack_canary();
-
- current_thread_info()->status |= TS_POLLING;
-
- /* endless idle loop with no priority at all */
- while (1) {
- tick_nohz_stop_sched_tick(1);
- while (!need_resched()) {
-
- check_pgt_cache();
- rmb();
-
- if (cpu_is_offline(cpu))
- play_dead();
-
- local_irq_disable();
- /* Don't trace irqs off for idle */
- stop_critical_timings();
- pm_idle();
- start_critical_timings();
- }
- tick_nohz_restart_sched_tick();
- preempt_enable_no_resched();
- schedule();
- preempt_disable();
- }
-}
-
-void __show_regs(struct pt_regs *regs, int all)
+void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
+ const char *log_lvl)
{
unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
unsigned long d0, d1, d2, d3, d6, d7;
- unsigned long sp;
- unsigned short ss, gs;
- const char *board;
+ unsigned short gs;
- if (user_mode_vm(regs)) {
- sp = regs->sp;
- ss = regs->ss & 0xffff;
- gs = get_user_gs(regs);
- } else {
- sp = (unsigned long) (&regs->sp);
- savesegment(ss, ss);
- savesegment(gs, gs);
- }
+ savesegment(gs, gs);
- printk("\n");
+ show_ip(regs, log_lvl);
- board = dmi_get_system_info(DMI_PRODUCT_NAME);
- if (!board)
- board = "";
- printk("Pid: %d, comm: %s %s (%s %.*s) %s\n",
- task_pid_nr(current), current->comm,
- print_tainted(), init_utsname()->release,
- (int)strcspn(init_utsname()->version, " "),
- init_utsname()->version, board);
+ printk("%sEAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
+ log_lvl, regs->ax, regs->bx, regs->cx, regs->dx);
+ printk("%sESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
+ log_lvl, regs->si, regs->di, regs->bp, regs->sp);
+ printk("%sDS: %04x ES: %04x FS: %04x GS: %04x SS: %04x EFLAGS: %08lx\n",
+ log_lvl, (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, regs->ss, regs->flags);
- printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
- (u16)regs->cs, regs->ip, regs->flags,
- smp_processor_id());
- print_symbol("EIP is at %s\n", regs->ip);
-
- printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
- regs->ax, regs->bx, regs->cx, regs->dx);
- printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
- regs->si, regs->di, regs->bp, sp);
- printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
- (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss);
-
- if (!all)
+ if (mode != SHOW_REGS_ALL)
return;
cr0 = read_cr0();
cr2 = read_cr2();
- cr3 = read_cr3();
- cr4 = read_cr4_safe();
- printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
- cr0, cr2, cr3, cr4);
+ cr3 = __read_cr3();
+ cr4 = __read_cr4();
+ printk("%sCR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
+ log_lvl, cr0, cr2, cr3, cr4);
get_debugreg(d0, 0);
get_debugreg(d1, 1);
get_debugreg(d2, 2);
get_debugreg(d3, 3);
- printk("DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n",
- d0, d1, d2, d3);
-
get_debugreg(d6, 6);
get_debugreg(d7, 7);
- printk("DR6: %08lx DR7: %08lx\n",
- d6, d7);
-}
-
-void show_regs(struct pt_regs *regs)
-{
- __show_regs(regs, 1);
- show_trace(NULL, regs, &regs->sp, regs->bp);
-}
-
-/*
- * This gets run with %bx containing the
- * function to call, and %dx containing
- * the "args".
- */
-extern void kernel_thread_helper(void);
-
-/*
- * Create a kernel thread
- */
-int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
-{
- struct pt_regs regs;
- memset(&regs, 0, sizeof(regs));
-
- regs.bx = (unsigned long) fn;
- regs.dx = (unsigned long) arg;
-
- regs.ds = __USER_DS;
- regs.es = __USER_DS;
- regs.fs = __KERNEL_PERCPU;
- regs.gs = __KERNEL_STACK_CANARY;
- regs.orig_ax = -1;
- regs.ip = (unsigned long) kernel_thread_helper;
- regs.cs = __KERNEL_CS | get_kernel_rpl();
- regs.flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
+ /* Only print out debug registers if they are in their non-default state. */
+ if ((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
+ (d6 == DR6_RESERVED) && (d7 == DR7_FIXED_1))
+ return;
- /* Ok, create the new process.. */
- return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
+ printk("%sDR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n",
+ log_lvl, d0, d1, d2, d3);
+ printk("%sDR6: %08lx DR7: %08lx\n",
+ log_lvl, d6, d7);
}
-EXPORT_SYMBOL(kernel_thread);
void release_thread(struct task_struct *dead_task)
{
@@ -233,91 +108,24 @@ void release_thread(struct task_struct *dead_task)
release_vm86_irqs(dead_task);
}
-/*
- * This gets called before we allocate a new thread and copy
- * the current task into it.
- */
-void prepare_to_copy(struct task_struct *tsk)
-{
- unlazy_fpu(tsk);
-}
-
-int copy_thread(unsigned long clone_flags, unsigned long sp,
- unsigned long unused,
- struct task_struct *p, struct pt_regs *regs)
-{
- struct pt_regs *childregs;
- struct task_struct *tsk;
- int err;
-
- childregs = task_pt_regs(p);
- *childregs = *regs;
- childregs->ax = 0;
- childregs->sp = sp;
-
- p->thread.sp = (unsigned long) childregs;
- p->thread.sp0 = (unsigned long) (childregs+1);
-
- p->thread.ip = (unsigned long) ret_from_fork;
-
- task_user_gs(p) = get_user_gs(regs);
-
- tsk = current;
- if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
- p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
- IO_BITMAP_BYTES, GFP_KERNEL);
- if (!p->thread.io_bitmap_ptr) {
- p->thread.io_bitmap_max = 0;
- return -ENOMEM;
- }
- set_tsk_thread_flag(p, TIF_IO_BITMAP);
- }
-
- err = 0;
-
- /*
- * Set a new TLS for the child thread?
- */
- if (clone_flags & CLONE_SETTLS)
- err = do_set_thread_area(p, -1,
- (struct user_desc __user *)childregs->si, 0);
-
- if (err && p->thread.io_bitmap_ptr) {
- kfree(p->thread.io_bitmap_ptr);
- p->thread.io_bitmap_max = 0;
- }
-
- clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
- p->thread.ds_ctx = NULL;
-
- clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
- p->thread.debugctlmsr = 0;
-
- return err;
-}
-
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
- set_user_gs(regs, 0);
+ loadsegment(gs, 0);
regs->fs = 0;
- set_fs(USER_DS);
regs->ds = __USER_DS;
regs->es = __USER_DS;
regs->ss = __USER_DS;
regs->cs = __USER_CS;
regs->ip = new_ip;
regs->sp = new_sp;
- /*
- * Free the old FP and other extended state
- */
- free_thread_xstate(current);
+ regs->flags = X86_EFLAGS_IF;
}
EXPORT_SYMBOL_GPL(start_thread);
/*
- * switch_to(x,yn) should switch tasks from x to y.
+ * switch_to(x,y) should switch tasks from x to y.
*
* We fsave/fwait so that an exception goes off at the right time
* (as a call from the fsave or fwait in effect) rather than to
@@ -343,27 +151,16 @@ EXPORT_SYMBOL_GPL(start_thread);
* the task-switch, and shows up in ret_from_fork in entry.S,
* for example.
*/
-__notrace_funcgraph struct task_struct *
+__visible __notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
struct thread_struct *prev = &prev_p->thread,
- *next = &next_p->thread;
+ *next = &next_p->thread;
int cpu = smp_processor_id();
- struct tss_struct *tss = &per_cpu(init_tss, cpu);
/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
- __unlazy_fpu(prev_p);
-
-
- /* we're going to use this soon, after a few expensive things */
- if (next_p->fpu_counter > 5)
- prefetch(next->xstate);
-
- /*
- * Reload esp0.
- */
- load_sp0(tss, next);
+ switch_fpu(prev_p, cpu);
/*
* Save away %gs. No need to save %fs, as it was saved on the
@@ -375,123 +172,43 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
* used %fs or %gs (it does not today), or if the kernel is
* running inside of a hypervisor layer.
*/
- lazy_save_gs(prev->gs);
+ savesegment(gs, prev->gs);
/*
* Load the per-thread Thread-Local Storage descriptor.
*/
load_TLS(next, cpu);
- /*
- * Restore IOPL if needed. In normal use, the flags restore
- * in the switch assembly will handle this. But if the kernel
- * is running virtualized at a non-zero CPL, the popf will
- * not restore flags, so it must be done in a separate step.
- */
- if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
- set_iopl_mask(next->iopl);
-
- /*
- * Now maybe handle debug registers and/or IO bitmaps
- */
- if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
- task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
- __switch_to_xtra(prev_p, next_p, tss);
+ switch_to_extra(prev_p, next_p);
/*
* Leave lazy mode, flushing any hypercalls made here.
* This must be done before restoring TLS segments so
- * the GDT and LDT are properly updated, and must be
- * done before math_state_restore, so the TS bit is up
- * to date.
+ * the GDT and LDT are properly updated.
*/
arch_end_context_switch(next_p);
- /* If the task has used fpu the last 5 timeslices, just do a full
- * restore of the math state immediately to avoid the trap; the
- * chances of needing FPU soon are obviously high now
- *
- * tsk_used_math() checks prevent calling math_state_restore(),
- * which can sleep in the case of !tsk_used_math()
+ /*
+ * Reload esp0 and cpu_current_top_of_stack. This changes
+ * current_thread_info(). Refresh the SYSENTER configuration in
+ * case prev or next is vm86.
*/
- if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
- math_state_restore();
+ update_task_stack(next_p);
+ refresh_sysenter_cs(next);
+ this_cpu_write(cpu_current_top_of_stack,
+ (unsigned long)task_stack_page(next_p) +
+ THREAD_SIZE);
/*
* Restore %gs if needed (which is common)
*/
if (prev->gs | next->gs)
- lazy_load_gs(next->gs);
-
- percpu_write(current_task, next_p);
-
- return prev_p;
-}
-
-int sys_clone(struct pt_regs *regs)
-{
- unsigned long clone_flags;
- unsigned long newsp;
- int __user *parent_tidptr, *child_tidptr;
+ loadsegment(gs, next->gs);
- clone_flags = regs->bx;
- newsp = regs->cx;
- parent_tidptr = (int __user *)regs->dx;
- child_tidptr = (int __user *)regs->di;
- if (!newsp)
- newsp = regs->sp;
- return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, child_tidptr);
-}
+ raw_cpu_write(current_task, next_p);
-/*
- * sys_execve() executes a new program.
- */
-int sys_execve(struct pt_regs *regs)
-{
- int error;
- char *filename;
+ /* Load the Intel cache allocation PQR MSR. */
+ resctrl_arch_sched_in(next_p);
- filename = getname((char __user *) regs->bx);
- error = PTR_ERR(filename);
- if (IS_ERR(filename))
- goto out;
- error = do_execve(filename,
- (char __user * __user *) regs->cx,
- (char __user * __user *) regs->dx,
- regs);
- if (error == 0) {
- /* Make sure we don't return using sysenter.. */
- set_thread_flag(TIF_IRET);
- }
- putname(filename);
-out:
- return error;
-}
-
-#define top_esp (THREAD_SIZE - sizeof(unsigned long))
-#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long))
-
-unsigned long get_wchan(struct task_struct *p)
-{
- unsigned long bp, sp, ip;
- unsigned long stack_page;
- int count = 0;
- if (!p || p == current || p->state == TASK_RUNNING)
- return 0;
- stack_page = (unsigned long)task_stack_page(p);
- sp = p->thread.sp;
- if (!stack_page || sp < stack_page || sp > top_esp+stack_page)
- return 0;
- /* include/asm-i386/system.h:switch_to() pushes bp last. */
- bp = *(unsigned long *) sp;
- do {
- if (bp < stack_page || bp > top_ebp+stack_page)
- return 0;
- ip = *(unsigned long *) (bp+4);
- if (!in_sched_functions(ip))
- return ip;
- bp = *(unsigned long *) bp;
- } while (count++ < 16);
- return 0;
+ return prev_p;
}
-
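The reworked 32-bit __show_regs() above takes a show_regs_mode and a printk
log-level prefix in place of the old "int all" flag. A minimal sketch of how a
caller is expected to drive it (the real caller lives in dumpstack.c; the
function name below is only illustrative):

/* Illustrative only: dump registers at the default log level. */
static void example_dump_regs(struct pt_regs *regs)
{
	/* Full dump including control registers for kernel state. */
	enum show_regs_mode mode = user_mode(regs) ? SHOW_REGS_USER : SHOW_REGS_ALL;

	__show_regs(regs, mode, KERN_DEFAULT);
}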
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ebefb5407b9d..432c0a004c60 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 1995 Linus Torvalds
*
@@ -14,10 +15,11 @@
* This file handles the architecture-dependent parts of process handling..
*/
-#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
@@ -26,348 +28,573 @@
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
-#include <linux/utsname.h>
#include <linux/delay.h>
-#include <linux/module.h>
+#include <linux/export.h>
+#include <linux/kvm_types.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
-#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>
-#include <linux/dmi.h>
+#include <linux/syscalls.h>
+#include <linux/iommu.h>
-#include <asm/pgtable.h>
-#include <asm/system.h>
#include <asm/processor.h>
-#include <asm/i387.h>
+#include <asm/pkru.h>
+#include <asm/fpu/sched.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
-#include <asm/idle.h>
-#include <asm/syscalls.h>
-#include <asm/ds.h>
+#include <asm/debugreg.h>
+#include <asm/switch_to.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/vdso.h>
+#include <asm/resctrl.h>
+#include <asm/unistd.h>
+#include <asm/fsgsbase.h>
+#include <asm/fred.h>
+#include <asm/msr.h>
+#ifdef CONFIG_IA32_EMULATION
+/* Not included via unistd.h */
+#include <asm/unistd_32_ia32.h>
+#endif
+
+#include "process.h"
-asmlinkage extern void ret_from_fork(void);
+/* Prints also some state that isn't saved in the pt_regs */
+void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
+ const char *log_lvl)
+{
+ unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
+ unsigned long d0, d1, d2, d3, d6, d7;
+ unsigned int fsindex, gsindex;
+ unsigned int ds, es;
+
+ show_iret_regs(regs, log_lvl);
+
+ if (regs->orig_ax != -1)
+ pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
+ else
+ pr_cont("\n");
+
+ printk("%sRAX: %016lx RBX: %016lx RCX: %016lx\n",
+ log_lvl, regs->ax, regs->bx, regs->cx);
+ printk("%sRDX: %016lx RSI: %016lx RDI: %016lx\n",
+ log_lvl, regs->dx, regs->si, regs->di);
+ printk("%sRBP: %016lx R08: %016lx R09: %016lx\n",
+ log_lvl, regs->bp, regs->r8, regs->r9);
+ printk("%sR10: %016lx R11: %016lx R12: %016lx\n",
+ log_lvl, regs->r10, regs->r11, regs->r12);
+ printk("%sR13: %016lx R14: %016lx R15: %016lx\n",
+ log_lvl, regs->r13, regs->r14, regs->r15);
+
+ if (mode == SHOW_REGS_SHORT)
+ return;
-DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
-EXPORT_PER_CPU_SYMBOL(current_task);
+ if (mode == SHOW_REGS_USER) {
+ rdmsrq(MSR_FS_BASE, fs);
+ rdmsrq(MSR_KERNEL_GS_BASE, shadowgs);
+ printk("%sFS: %016lx GS: %016lx\n",
+ log_lvl, fs, shadowgs);
+ return;
+ }
-DEFINE_PER_CPU(unsigned long, old_rsp);
-static DEFINE_PER_CPU(unsigned char, is_idle);
+ asm("movl %%ds,%0" : "=r" (ds));
+ asm("movl %%es,%0" : "=r" (es));
+ asm("movl %%fs,%0" : "=r" (fsindex));
+ asm("movl %%gs,%0" : "=r" (gsindex));
-unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
+ rdmsrq(MSR_FS_BASE, fs);
+ rdmsrq(MSR_GS_BASE, gs);
+ rdmsrq(MSR_KERNEL_GS_BASE, shadowgs);
-static ATOMIC_NOTIFIER_HEAD(idle_notifier);
+ cr0 = read_cr0();
+ cr2 = read_cr2();
+ cr3 = __read_cr3();
+ cr4 = __read_cr4();
-void idle_notifier_register(struct notifier_block *n)
-{
- atomic_notifier_chain_register(&idle_notifier, n);
+ printk("%sFS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
+ log_lvl, fs, fsindex, gs, gsindex, shadowgs);
+ printk("%sCS: %04x DS: %04x ES: %04x CR0: %016lx\n",
+ log_lvl, regs->cs, ds, es, cr0);
+ printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n",
+ log_lvl, cr2, cr3, cr4);
+
+ get_debugreg(d0, 0);
+ get_debugreg(d1, 1);
+ get_debugreg(d2, 2);
+ get_debugreg(d3, 3);
+ get_debugreg(d6, 6);
+ get_debugreg(d7, 7);
+
+ /* Only print out debug registers if they are in their non-default state. */
+ if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
+ (d6 == DR6_RESERVED) && (d7 == DR7_FIXED_1))) {
+ printk("%sDR0: %016lx DR1: %016lx DR2: %016lx\n",
+ log_lvl, d0, d1, d2);
+ printk("%sDR3: %016lx DR6: %016lx DR7: %016lx\n",
+ log_lvl, d3, d6, d7);
+ }
+
+ if (cr4 & X86_CR4_PKE)
+ printk("%sPKRU: %08x\n", log_lvl, read_pkru());
}
-EXPORT_SYMBOL_GPL(idle_notifier_register);
-void idle_notifier_unregister(struct notifier_block *n)
+void release_thread(struct task_struct *dead_task)
{
- atomic_notifier_chain_unregister(&idle_notifier, n);
+ WARN_ON(dead_task->mm);
}
-EXPORT_SYMBOL_GPL(idle_notifier_unregister);
-void enter_idle(void)
+enum which_selector {
+ FS,
+ GS
+};
+
+/*
+ * Out of line to be protected from kprobes and tracing. If this would be
+ * traced or probed than any access to a per CPU variable happens with
+ * the wrong GS.
+ *
+ * It is not used on Xen paravirt. When paravirt support is needed, it
+ * needs to be renamed with native_ prefix.
+ */
+static noinstr unsigned long __rdgsbase_inactive(void)
{
- percpu_write(is_idle, 1);
- atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
+ unsigned long gsbase;
+
+ lockdep_assert_irqs_disabled();
+
+ /*
+ * SWAPGS is no longer needed thus NOT allowed with FRED because
+ * FRED transitions ensure that an operating system can _always_
+ * operate with its own GS base address:
+ * - For events that occur in ring 3, FRED event delivery swaps
+ * the GS base address with the IA32_KERNEL_GS_BASE MSR.
+ * - ERETU (the FRED transition that returns to ring 3) also swaps
+ * the GS base address with the IA32_KERNEL_GS_BASE MSR.
+ *
+ * And the operating system can still setup the GS segment for a
+ * user thread without the need of loading a user thread GS with:
+ * - Using LKGS, available with FRED, to modify other attributes
+ * of the GS segment without compromising its ability always to
+ * operate with its own GS base address.
+ * - Accessing the GS segment base address for a user thread as
+ * before using RDMSR or WRMSR on the IA32_KERNEL_GS_BASE MSR.
+ *
+ * Note, LKGS loads the GS base address into the IA32_KERNEL_GS_BASE
+ * MSR instead of the GS segment’s descriptor cache. As such, the
+ * operating system never changes its runtime GS base address.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_FRED) &&
+ !cpu_feature_enabled(X86_FEATURE_XENPV)) {
+ native_swapgs();
+ gsbase = rdgsbase();
+ native_swapgs();
+ } else {
+ instrumentation_begin();
+ rdmsrq(MSR_KERNEL_GS_BASE, gsbase);
+ instrumentation_end();
+ }
+
+ return gsbase;
}
-static void __exit_idle(void)
+/*
+ * Out of line to be protected from kprobes and tracing. If this would be
+ * traced or probed, any access to a per CPU variable would happen with
+ * the wrong GS.
+ *
+ * It is not used on Xen paravirt. When paravirt support is needed, it
+ * needs to be renamed with native_ prefix.
+ */
+static noinstr void __wrgsbase_inactive(unsigned long gsbase)
{
- if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
- return;
- atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
+ lockdep_assert_irqs_disabled();
+
+ if (!cpu_feature_enabled(X86_FEATURE_FRED) &&
+ !cpu_feature_enabled(X86_FEATURE_XENPV)) {
+ native_swapgs();
+ wrgsbase(gsbase);
+ native_swapgs();
+ } else {
+ instrumentation_begin();
+ wrmsrq(MSR_KERNEL_GS_BASE, gsbase);
+ instrumentation_end();
+ }
}
-/* Called from interrupts to signify idle end */
-void exit_idle(void)
+/*
+ * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
+ * not available. The goal is to be reasonably fast on non-FSGSBASE systems.
+ * It's forcibly inlined because it'll generate better code and this function
+ * is hot.
+ */
+static __always_inline void save_base_legacy(struct task_struct *prev_p,
+ unsigned short selector,
+ enum which_selector which)
{
- /* idle loop has pid 0 */
- if (current->pid)
- return;
- __exit_idle();
+ if (likely(selector == 0)) {
+ /*
+ * On Intel (without X86_BUG_NULL_SEG), the segment base could
+ * be the pre-existing saved base or it could be zero. On AMD
+ * (with X86_BUG_NULL_SEG), the segment base could be almost
+ * anything.
+ *
+ * This branch is very hot (it's hit twice on almost every
+ * context switch between 64-bit programs), and avoiding
+ * the RDMSR helps a lot, so we just assume that whatever
+ * value is already saved is correct. This matches historical
+ * Linux behavior, so it won't break existing applications.
+ *
+ * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
+ * report that the base is zero, it needs to actually be zero:
+ * see the corresponding logic in load_seg_legacy.
+ */
+ } else {
+ /*
+ * If the selector is 1, 2, or 3, then the base is zero on
+ * !X86_BUG_NULL_SEG CPUs and could be anything on
+ * X86_BUG_NULL_SEG CPUs. In the latter case, Linux
+ * has never attempted to preserve the base across context
+ * switches.
+ *
+ * If selector > 3, then it refers to a real segment, and
+ * saving the base isn't necessary.
+ */
+ if (which == FS)
+ prev_p->thread.fsbase = 0;
+ else
+ prev_p->thread.gsbase = 0;
+ }
}
-#ifndef CONFIG_SMP
-static inline void play_dead(void)
+static __always_inline void save_fsgs(struct task_struct *task)
{
- BUG();
+ savesegment(fs, task->thread.fsindex);
+ savesegment(gs, task->thread.gsindex);
+ if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
+ /*
+ * If FSGSBASE is enabled, we can't make any useful guesses
+ * about the base, and user code expects us to save the current
+ * value. Fortunately, reading the base directly is efficient.
+ */
+ task->thread.fsbase = rdfsbase();
+ task->thread.gsbase = __rdgsbase_inactive();
+ } else {
+ save_base_legacy(task, task->thread.fsindex, FS);
+ save_base_legacy(task, task->thread.gsindex, GS);
+ }
}
-#endif
/*
- * The idle thread. There's no useful work to be
- * done, so just try to conserve power and have a
- * low exit latency (ie sit in a loop waiting for
- * somebody to say that they'd like to reschedule)
+ * While a process is running, current->thread.fsbase and current->thread.gsbase
+ * may not match the corresponding CPU registers (see save_base_legacy()).
*/
-void cpu_idle(void)
+void current_save_fsgs(void)
{
- current_thread_info()->status |= TS_POLLING;
+ unsigned long flags;
- /*
- * If we're the non-boot CPU, nothing set the stack canary up
- * for us. CPU0 already has it initialized but no harm in
- * doing it again. This is a good place for updating it, as
- * we wont ever return from this function (so the invalid
- * canaries already on the stack wont ever trigger).
- */
- boot_init_stack_canary();
-
- /* endless idle loop with no priority at all */
- while (1) {
- tick_nohz_stop_sched_tick(1);
- while (!need_resched()) {
+ /* Interrupts need to be off for FSGSBASE */
+ local_irq_save(flags);
+ save_fsgs(current);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_FOR_KVM(current_save_fsgs);
- rmb();
+static __always_inline void loadseg(enum which_selector which,
+ unsigned short sel)
+{
+ if (which == FS)
+ loadsegment(fs, sel);
+ else
+ load_gs_index(sel);
+}
- if (cpu_is_offline(smp_processor_id()))
- play_dead();
+static __always_inline void load_seg_legacy(unsigned short prev_index,
+ unsigned long prev_base,
+ unsigned short next_index,
+ unsigned long next_base,
+ enum which_selector which)
+{
+ if (likely(next_index <= 3)) {
+ /*
+ * The next task is using 64-bit TLS, is not using this
+ * segment at all, or is having fun with arcane CPU features.
+ */
+ if (next_base == 0) {
/*
- * Idle routines should keep interrupts disabled
- * from here on, until they go to idle.
- * Otherwise, idle callbacks can misfire.
+ * Nasty case: on AMD CPUs, we need to forcibly zero
+ * the base.
*/
- local_irq_disable();
- enter_idle();
- /* Don't trace irqs off for idle */
- stop_critical_timings();
- pm_idle();
- start_critical_timings();
- /* In many cases the interrupt that ended idle
- has already called exit_idle. But some idle
- loops can be woken up without interrupt. */
- __exit_idle();
+ if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
+ loadseg(which, __USER_DS);
+ loadseg(which, next_index);
+ } else {
+ /*
+ * We could try to exhaustively detect cases
+ * under which we can skip the segment load,
+ * but there's really only one case that matters
+ * for performance: if both the previous and
+ * next states are fully zeroed, we can skip
+ * the load.
+ *
+ * (This assumes that prev_base == 0 has no
+ * false positives. This is the case on
+ * Intel-style CPUs.)
+ */
+ if (likely(prev_index | next_index | prev_base))
+ loadseg(which, next_index);
+ }
+ } else {
+ if (prev_index != next_index)
+ loadseg(which, next_index);
+ wrmsrq(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
+ next_base);
}
-
- tick_nohz_restart_sched_tick();
- preempt_enable_no_resched();
- schedule();
- preempt_disable();
+ } else {
+ /*
+ * The next task is using a real segment. Loading the selector
+ * is sufficient.
+ */
+ loadseg(which, next_index);
}
}
-/* Prints also some state that isn't saved in the pt_regs */
-void __show_regs(struct pt_regs *regs, int all)
+/*
+ * Store prev's PKRU value and load next's PKRU value if they differ. PKRU
+ * is not XSTATE managed on context switch because that would require a
+ * lookup in the task's FPU xsave buffer and require to keep that updated
+ * in various places.
+ */
+static __always_inline void x86_pkru_load(struct thread_struct *prev,
+ struct thread_struct *next)
{
- unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
- unsigned long d0, d1, d2, d3, d6, d7;
- unsigned int fsindex, gsindex;
- unsigned int ds, cs, es;
- const char *board;
-
- printk("\n");
- print_modules();
- board = dmi_get_system_info(DMI_PRODUCT_NAME);
- if (!board)
- board = "";
- printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n",
- current->pid, current->comm, print_tainted(),
- init_utsname()->release,
- (int)strcspn(init_utsname()->version, " "),
- init_utsname()->version, board);
- printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
- printk_address(regs->ip, 1);
- printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
- regs->sp, regs->flags);
- printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
- regs->ax, regs->bx, regs->cx);
- printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
- regs->dx, regs->si, regs->di);
- printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
- regs->bp, regs->r8, regs->r9);
- printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
- regs->r10, regs->r11, regs->r12);
- printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
- regs->r13, regs->r14, regs->r15);
+ if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
+ return;
- asm("movl %%ds,%0" : "=r" (ds));
- asm("movl %%cs,%0" : "=r" (cs));
- asm("movl %%es,%0" : "=r" (es));
- asm("movl %%fs,%0" : "=r" (fsindex));
- asm("movl %%gs,%0" : "=r" (gsindex));
+ /* Stash the prev task's value: */
+ prev->pkru = rdpkru();
- rdmsrl(MSR_FS_BASE, fs);
- rdmsrl(MSR_GS_BASE, gs);
- rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
+ /*
+ * PKRU writes are slightly expensive. Avoid them when not
+ * strictly necessary:
+ */
+ if (prev->pkru != next->pkru)
+ wrpkru(next->pkru);
+}
- if (!all)
- return;
+static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
+ struct thread_struct *next)
+{
+ if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
+ /* Update the FS and GS selectors if they could have changed. */
+ if (unlikely(prev->fsindex || next->fsindex))
+ loadseg(FS, next->fsindex);
+ if (unlikely(prev->gsindex || next->gsindex))
+ loadseg(GS, next->gsindex);
+
+ /* Update the bases. */
+ wrfsbase(next->fsbase);
+ __wrgsbase_inactive(next->gsbase);
+ } else {
+ load_seg_legacy(prev->fsindex, prev->fsbase,
+ next->fsindex, next->fsbase, FS);
+ load_seg_legacy(prev->gsindex, prev->gsbase,
+ next->gsindex, next->gsbase, GS);
+ }
+}
- cr0 = read_cr0();
- cr2 = read_cr2();
- cr3 = read_cr3();
- cr4 = read_cr4();
+unsigned long x86_fsgsbase_read_task(struct task_struct *task,
+ unsigned short selector)
+{
+ unsigned short idx = selector >> 3;
+ unsigned long base;
- printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
- fs, fsindex, gs, gsindex, shadowgs);
- printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
- es, cr0);
- printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
- cr4);
+ if (likely((selector & SEGMENT_TI_MASK) == 0)) {
+ if (unlikely(idx >= GDT_ENTRIES))
+ return 0;
- get_debugreg(d0, 0);
- get_debugreg(d1, 1);
- get_debugreg(d2, 2);
- printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
- get_debugreg(d3, 3);
- get_debugreg(d6, 6);
- get_debugreg(d7, 7);
- printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
+ /*
+ * There are no user segments in the GDT with nonzero bases
+ * other than the TLS segments.
+ */
+ if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
+ return 0;
+
+ idx -= GDT_ENTRY_TLS_MIN;
+ base = get_desc_base(&task->thread.tls_array[idx]);
+ } else {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ struct ldt_struct *ldt;
+
+ /*
+ * If performance here mattered, we could protect the LDT
+ * with RCU. This is a slow path, though, so we can just
+ * take the mutex.
+ */
+ mutex_lock(&task->mm->context.lock);
+ ldt = task->mm->context.ldt;
+ if (unlikely(!ldt || idx >= ldt->nr_entries))
+ base = 0;
+ else
+ base = get_desc_base(ldt->entries + idx);
+ mutex_unlock(&task->mm->context.lock);
+#else
+ base = 0;
+#endif
+ }
+
+ return base;
}
-void show_regs(struct pt_regs *regs)
+unsigned long x86_gsbase_read_cpu_inactive(void)
{
- printk(KERN_INFO "CPU %d:", smp_processor_id());
- __show_regs(regs, 1);
- show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
+ unsigned long gsbase;
+
+ if (boot_cpu_has(X86_FEATURE_FSGSBASE)) {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ gsbase = __rdgsbase_inactive();
+ local_irq_restore(flags);
+ } else {
+ rdmsrq(MSR_KERNEL_GS_BASE, gsbase);
+ }
+
+ return gsbase;
}
-void release_thread(struct task_struct *dead_task)
+void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
{
- if (dead_task->mm) {
- if (dead_task->mm->context.size) {
- printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
- dead_task->comm,
- dead_task->mm->context.ldt,
- dead_task->mm->context.size);
- BUG();
- }
+ if (boot_cpu_has(X86_FEATURE_FSGSBASE)) {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __wrgsbase_inactive(gsbase);
+ local_irq_restore(flags);
+ } else {
+ wrmsrq(MSR_KERNEL_GS_BASE, gsbase);
}
}
-static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
+unsigned long x86_fsbase_read_task(struct task_struct *task)
{
- struct user_desc ud = {
- .base_addr = addr,
- .limit = 0xfffff,
- .seg_32bit = 1,
- .limit_in_pages = 1,
- .useable = 1,
- };
- struct desc_struct *desc = t->thread.tls_array;
- desc += tls;
- fill_ldt(desc, &ud);
+ unsigned long fsbase;
+
+ if (task == current)
+ fsbase = x86_fsbase_read_cpu();
+ else if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
+ (task->thread.fsindex == 0))
+ fsbase = task->thread.fsbase;
+ else
+ fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex);
+
+ return fsbase;
}
-static inline u32 read_32bit_tls(struct task_struct *t, int tls)
+unsigned long x86_gsbase_read_task(struct task_struct *task)
{
- return get_desc_base(&t->thread.tls_array[tls]);
+ unsigned long gsbase;
+
+ if (task == current)
+ gsbase = x86_gsbase_read_cpu_inactive();
+ else if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
+ (task->thread.gsindex == 0))
+ gsbase = task->thread.gsbase;
+ else
+ gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex);
+
+ return gsbase;
}
-/*
- * This gets called before we allocate a new thread and copy
- * the current task into it.
- */
-void prepare_to_copy(struct task_struct *tsk)
+void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase)
{
- unlazy_fpu(tsk);
+ WARN_ON_ONCE(task == current);
+
+ task->thread.fsbase = fsbase;
}
-int copy_thread(unsigned long clone_flags, unsigned long sp,
- unsigned long unused,
- struct task_struct *p, struct pt_regs *regs)
+void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
{
- int err;
- struct pt_regs *childregs;
- struct task_struct *me = current;
-
- childregs = ((struct pt_regs *)
- (THREAD_SIZE + task_stack_page(p))) - 1;
- *childregs = *regs;
-
- childregs->ax = 0;
- childregs->sp = sp;
- if (sp == ~0UL)
- childregs->sp = (unsigned long)childregs;
-
- p->thread.sp = (unsigned long) childregs;
- p->thread.sp0 = (unsigned long) (childregs+1);
- p->thread.usersp = me->thread.usersp;
-
- set_tsk_thread_flag(p, TIF_FORK);
-
- p->thread.fs = me->thread.fs;
- p->thread.gs = me->thread.gs;
-
- savesegment(gs, p->thread.gsindex);
- savesegment(fs, p->thread.fsindex);
- savesegment(es, p->thread.es);
- savesegment(ds, p->thread.ds);
-
- if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
- p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
- if (!p->thread.io_bitmap_ptr) {
- p->thread.io_bitmap_max = 0;
- return -ENOMEM;
- }
- memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
- IO_BITMAP_BYTES);
- set_tsk_thread_flag(p, TIF_IO_BITMAP);
- }
+ WARN_ON_ONCE(task == current);
- /*
- * Set a new TLS for the child thread?
- */
- if (clone_flags & CLONE_SETTLS) {
-#ifdef CONFIG_IA32_EMULATION
- if (test_thread_flag(TIF_IA32))
- err = do_set_thread_area(p, -1,
- (struct user_desc __user *)childregs->si, 0);
- else
-#endif
- err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
- if (err)
- goto out;
+ task->thread.gsbase = gsbase;
+}
+
+static void
+start_thread_common(struct pt_regs *regs, unsigned long new_ip,
+ unsigned long new_sp,
+ u16 _cs, u16 _ss, u16 _ds)
+{
+ WARN_ON_ONCE(regs != current_pt_regs());
+
+ if (static_cpu_has(X86_BUG_NULL_SEG)) {
+ /* Loading zero below won't clear the base. */
+ loadsegment(fs, __USER_DS);
+ load_gs_index(__USER_DS);
}
- clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
- p->thread.ds_ctx = NULL;
+ reset_thread_features();
- clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
- p->thread.debugctlmsr = 0;
+ loadsegment(fs, 0);
+ loadsegment(es, _ds);
+ loadsegment(ds, _ds);
+ load_gs_index(0);
- err = 0;
-out:
- if (err && p->thread.io_bitmap_ptr) {
- kfree(p->thread.io_bitmap_ptr);
- p->thread.io_bitmap_max = 0;
+ regs->ip = new_ip;
+ regs->sp = new_sp;
+ regs->csx = _cs;
+ regs->ssx = _ss;
+ /*
+ * Allow single-step trap and NMI when starting a new task, thus
+ * once the new task enters user space, single-step trap and NMI
+ * are both enabled immediately.
+ *
+ * Entering a new task is logically speaking a return from a
+ * system call (exec, fork, clone, etc.). As such, if ptrace
+ * enables single stepping a single step exception should be
+ * allowed to trigger immediately upon entering user space.
+ * This is not optional.
+ *
+ * NMI should *never* be disabled in user space. As such, this
+ * is an optional, opportunistic way to catch errors.
+ *
+ * Paranoia: High-order 48 bits above the lowest 16 bit SS are
+ * discarded by the legacy IRET instruction on all Intel, AMD,
+ * and Cyrix/Centaur/VIA CPUs, thus can be set unconditionally,
+ * even when FRED is not enabled. But we choose the safer side
+ * to use these bits only when FRED is enabled.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_FRED)) {
+ regs->fred_ss.swevent = true;
+ regs->fred_ss.nmi = true;
}
- return err;
+
+ regs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
}
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
- loadsegment(fs, 0);
- loadsegment(es, 0);
- loadsegment(ds, 0);
- load_gs_index(0);
- regs->ip = new_ip;
- regs->sp = new_sp;
- percpu_write(old_rsp, new_sp);
- regs->cs = __USER_CS;
- regs->ss = __USER_DS;
- regs->flags = 0x200;
- set_fs(USER_DS);
- /*
- * Free the old FP and other extended state
- */
- free_thread_xstate(current);
+ start_thread_common(regs, new_ip, new_sp,
+ __USER_CS, __USER_DS, 0);
}
EXPORT_SYMBOL_GPL(start_thread);
+#ifdef CONFIG_COMPAT
+void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32)
+{
+ start_thread_common(regs, new_ip, new_sp,
+ x32 ? __USER_CS : __USER32_CS,
+ __USER_DS, __USER_DS);
+}
+#endif
+
/*
* switch_to(x,y) should switch tasks from x to y.
*
@@ -378,27 +605,52 @@ EXPORT_SYMBOL_GPL(start_thread);
* Kprobes not supported here. Set the probe on schedule instead.
* Function graph tracer not supported too.
*/
-__notrace_funcgraph struct task_struct *
+__no_kmsan_checks
+__visible __notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
struct thread_struct *prev = &prev_p->thread;
struct thread_struct *next = &next_p->thread;
int cpu = smp_processor_id();
- struct tss_struct *tss = &per_cpu(init_tss, cpu);
- unsigned fsindex, gsindex;
- /* we're going to use this soon, after a few expensive things */
- if (next_p->fpu_counter > 5)
- prefetch(next->xstate);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
+ this_cpu_read(hardirq_stack_inuse));
+
+ switch_fpu(prev_p, cpu);
+
+ /* We must save %fs and %gs before load_TLS() because
+ * %fs and %gs may be cleared by load_TLS().
+ *
+ * (e.g. xen_load_tls())
+ */
+ save_fsgs(prev_p);
/*
- * Reload esp0, LDT and the page table pointer:
+ * Load TLS before restoring any segments so that segment loads
+ * reference the correct GDT entries.
*/
- load_sp0(tss, next);
+ load_TLS(next, cpu);
/*
- * Switch DS and ES.
- * This won't pick up thread selector changes, but I guess that is ok.
+ * Leave lazy mode, flushing any hypercalls made here. This
+ * must be done after loading TLS entries in the GDT but before
+ * loading segments that might reference them.
+ */
+ arch_end_context_switch(next_p);
+
+ /* Switch DS and ES.
+ *
+ * Reading them only returns the selectors, but writing them (if
+ * nonzero) loads the full descriptor from the GDT or LDT. The
+ * LDT for next is loaded in switch_mm, and the GDT is loaded
+ * above.
+ *
+ * We therefore need to write new values to the segment
+ * registers on every context switch unless both the new and old
+ * values are zero.
+ *
+ * Note that we don't need to do anything for CS and SS, as
+ * those are saved and restored as part of pt_regs.
*/
savesegment(es, prev->es);
if (unlikely(next->es | prev->es))
@@ -408,243 +660,319 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
if (unlikely(next->ds | prev->ds))
loadsegment(ds, next->ds);
+ x86_fsgsbase_load(prev, next);
- /* We must save %fs and %gs before load_TLS() because
- * %fs and %gs may be cleared by load_TLS().
- *
- * (e.g. xen_load_tls())
- */
- savesegment(fs, fsindex);
- savesegment(gs, gsindex);
-
- load_TLS(next, cpu);
+ x86_pkru_load(prev, next);
/*
- * Leave lazy mode, flushing any hypercalls made here.
- * This must be done before restoring TLS segments so
- * the GDT and LDT are properly updated, and must be
- * done before math_state_restore, so the TS bit is up
- * to date.
+ * Switch the PDA and FPU contexts.
*/
- arch_end_context_switch(next_p);
+ raw_cpu_write(current_task, next_p);
+ raw_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
- /*
- * Switch FS and GS.
- *
- * Segment register != 0 always requires a reload. Also
- * reload when it has changed. When prev process used 64bit
- * base always reload to avoid an information leak.
- */
- if (unlikely(fsindex | next->fsindex | prev->fs)) {
- loadsegment(fs, next->fsindex);
+ /* Reload sp0. */
+ update_task_stack(next_p);
+
+ switch_to_extra(prev_p, next_p);
+
+ if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
/*
- * Check if the user used a selector != 0; if yes
- * clear 64bit base, since overloaded base is always
- * mapped to the Null selector
+ * AMD CPUs have a misfeature: SYSRET sets the SS selector but
+ * does not update the cached descriptor. As a result, if we
+ * do SYSRET while SS is NULL, we'll end up in user mode with
+ * SS apparently equal to __USER_DS but actually unusable.
+ *
+ * The straightforward workaround would be to fix it up just
+ * before SYSRET, but that would slow down the system call
+ * fast paths. Instead, we ensure that SS is never NULL in
+ * system call context. We do this by replacing NULL SS
+ * selectors at every context switch. SYSCALL sets up a valid
+ * SS, so the only way to get NULL is to re-enter the kernel
+ * from CPL 3 through an interrupt. Since that can't happen
+ * in the same task as a running syscall, we are guaranteed to
+ * context switch between every interrupt vector entry and a
+ * subsequent SYSRET.
+ *
+ * We read SS first because SS reads are much faster than
+ * writes. Out of caution, we force SS to __KERNEL_DS even if
+ * it previously had a different non-NULL value.
*/
- if (fsindex)
- prev->fs = 0;
+ unsigned short ss_sel;
+ savesegment(ss, ss_sel);
+ if (ss_sel != __KERNEL_DS)
+ loadsegment(ss, __KERNEL_DS);
}
- /* when next process has a 64bit base use it */
- if (next->fs)
- wrmsrl(MSR_FS_BASE, next->fs);
- prev->fsindex = fsindex;
-
- if (unlikely(gsindex | next->gsindex | prev->gs)) {
- load_gs_index(next->gsindex);
- if (gsindex)
- prev->gs = 0;
- }
- if (next->gs)
- wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
- prev->gsindex = gsindex;
-
- /* Must be after DS reload */
- unlazy_fpu(prev_p);
-
- /*
- * Switch the PDA and FPU contexts.
- */
- prev->usersp = percpu_read(old_rsp);
- percpu_write(old_rsp, next->usersp);
- percpu_write(current_task, next_p);
- percpu_write(kernel_stack,
- (unsigned long)task_stack_page(next_p) +
- THREAD_SIZE - KERNEL_STACK_OFFSET);
+ /* Load the Intel cache allocation PQR MSR. */
+ resctrl_arch_sched_in(next_p);
- /*
- * Now maybe reload the debug registers and handle I/O bitmaps
- */
- if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
- task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
- __switch_to_xtra(prev_p, next_p, tss);
+ /* Reset hw history on AMD CPUs */
+ if (cpu_feature_enabled(X86_FEATURE_AMD_WORKLOAD_CLASS))
+ wrmsrl(MSR_AMD_WORKLOAD_HRST, 0x1);
- /* If the task has used fpu the last 5 timeslices, just do a full
- * restore of the math state immediately to avoid the trap; the
- * chances of needing FPU soon are obviously high now
- *
- * tsk_used_math() checks prevent calling math_state_restore(),
- * which can sleep in the case of !tsk_used_math()
- */
- if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
- math_state_restore();
return prev_p;
}
-/*
- * sys_execve() executes a new program.
- */
-asmlinkage
-long sys_execve(char __user *name, char __user * __user *argv,
- char __user * __user *envp, struct pt_regs *regs)
-{
- long error;
- char *filename;
-
- filename = getname(name);
- error = PTR_ERR(filename);
- if (IS_ERR(filename))
- return error;
- error = do_execve(filename, argv, envp, regs);
- putname(filename);
- return error;
-}
-
void set_personality_64bit(void)
{
/* inherit personality from parent */
/* Make sure to be in 64bit mode */
- clear_thread_flag(TIF_IA32);
+ clear_thread_flag(TIF_ADDR32);
+ /* Pretend that this comes from a 64bit execve */
+ task_pt_regs(current)->orig_ax = __NR_execve;
+ current_thread_info()->status &= ~TS_COMPAT;
+ if (current->mm)
+ __set_bit(MM_CONTEXT_HAS_VSYSCALL, &current->mm->context.flags);
/* TBD: overwrites user setup. Should have two bits.
But 64bit processes have always behaved this way,
so it's not too bad. The main problem is just that
- 32bit childs are affected again. */
+ 32bit children are affected again. */
current->personality &= ~READ_IMPLIES_EXEC;
}
-asmlinkage long
-sys_clone(unsigned long clone_flags, unsigned long newsp,
- void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
+static void __set_personality_x32(void)
{
- if (!newsp)
- newsp = regs->sp;
- return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
+#ifdef CONFIG_X86_X32_ABI
+ if (current->mm)
+ current->mm->context.flags = 0;
+
+ current->personality &= ~READ_IMPLIES_EXEC;
+ /*
+ * in_32bit_syscall() uses the presence of the x32 syscall bit
+ * flag to determine compat status. The x86 mmap() code relies on
+ * the syscall bitness so set x32 syscall bit right here to make
+ * in_32bit_syscall() work during exec().
+ *
+ * Pretend to come from a x32 execve.
+ */
+ task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
+ current_thread_info()->status &= ~TS_COMPAT;
+#endif
}
-unsigned long get_wchan(struct task_struct *p)
+static void __set_personality_ia32(void)
{
- unsigned long stack;
- u64 fp, ip;
- int count = 0;
+#ifdef CONFIG_IA32_EMULATION
+ if (current->mm) {
+ /*
+ * uprobes applied to this MM need to know this and
+ * cannot use user_64bit_mode() at that time.
+ */
+ __set_bit(MM_CONTEXT_UPROBE_IA32, &current->mm->context.flags);
+ }
+
+ current->personality |= force_personality32;
+ /* Prepare the first "return" to user space */
+ task_pt_regs(current)->orig_ax = __NR_ia32_execve;
+ current_thread_info()->status |= TS_COMPAT;
+#endif
+}
+
+void set_personality_ia32(bool x32)
+{
+ /* Make sure to be in 32bit mode */
+ set_thread_flag(TIF_ADDR32);
+
+ if (x32)
+ __set_personality_x32();
+ else
+ __set_personality_ia32();
+}
+EXPORT_SYMBOL_GPL(set_personality_ia32);
+
+#ifdef CONFIG_CHECKPOINT_RESTORE
+static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
+{
+ int ret;
+
+ ret = map_vdso_once(image, addr);
+ if (ret)
+ return ret;
+
+ return (long)image->size;
+}
+#endif
+
+#ifdef CONFIG_ADDRESS_MASKING
+
+#define LAM_U57_BITS 6
+
+static void enable_lam_func(void *__mm)
+{
+ struct mm_struct *mm = __mm;
+ unsigned long lam;
+
+ if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm) {
+ lam = mm_lam_cr3_mask(mm);
+ write_cr3(__read_cr3() | lam);
+ cpu_tlbstate_update_lam(lam, mm_untag_mask(mm));
+ }
+}
+
+static void mm_enable_lam(struct mm_struct *mm)
+{
+ mm->context.lam_cr3_mask = X86_CR3_LAM_U57;
+ mm->context.untag_mask = ~GENMASK(62, 57);
+
+ /*
+ * Even though the process must still be single-threaded at this
+ * point, kernel threads may be using the mm. IPI those kernel
+ * threads if they exist.
+ */
+ on_each_cpu_mask(mm_cpumask(mm), enable_lam_func, mm, true);
+ set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags);
+}
+
+static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_LAM))
+ return -ENODEV;
+
+ /* PTRACE_ARCH_PRCTL */
+ if (current->mm != mm)
+ return -EINVAL;
+
+ if (mm_valid_pasid(mm) &&
+ !test_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &mm->context.flags))
+ return -EINVAL;
+
+ if (mmap_write_lock_killable(mm))
+ return -EINTR;
+
+ /*
+ * MM_CONTEXT_LOCK_LAM is set on clone. Prevent LAM from
+ * being enabled unless the process is single threaded:
+ */
+ if (test_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags)) {
+ mmap_write_unlock(mm);
+ return -EBUSY;
+ }
+
+ if (!nr_bits || nr_bits > LAM_U57_BITS) {
+ mmap_write_unlock(mm);
+ return -EINVAL;
+ }
+
+ mm_enable_lam(mm);
+
+ mmap_write_unlock(mm);
- if (!p || p == current || p->state == TASK_RUNNING)
- return 0;
- stack = (unsigned long)task_stack_page(p);
- if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
- return 0;
- fp = *(u64 *)(p->thread.sp);
- do {
- if (fp < (unsigned long)stack ||
- fp >= (unsigned long)stack+THREAD_SIZE)
- return 0;
- ip = *(u64 *)(fp+8);
- if (!in_sched_functions(ip))
- return ip;
- fp = *(u64 *)fp;
- } while (count++ < 16);
return 0;
}
+#endif
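/*
 * Hypothetical user-space sketch (not part of this patch): how the tagged
 * address prctls above are exercised.  Assumes <asm/prctl.h> from kernel
 * headers new enough to define the ARCH_* LAM constants.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/prctl.h>

int main(void)
{
	unsigned long max_bits = 0;

	/* 0 means this CPU/kernel combination offers no tag bits. */
	if (syscall(SYS_arch_prctl, ARCH_GET_MAX_TAG_BITS, &max_bits) || !max_bits)
		return 1;

	/* Must run while the process is still single-threaded (see -EBUSY above). */
	if (syscall(SYS_arch_prctl, ARCH_ENABLE_TAGGED_ADDR, max_bits))
		return 1;

	printf("LAM enabled with %lu tag bits\n", max_bits);
	return 0;
}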
-long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
+long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
{
int ret = 0;
- int doit = task == current;
- int cpu;
- switch (code) {
- case ARCH_SET_GS:
- if (addr >= TASK_SIZE_OF(task))
+ switch (option) {
+ case ARCH_SET_GS: {
+ if (unlikely(arg2 >= TASK_SIZE_MAX))
return -EPERM;
- cpu = get_cpu();
- /* handle small bases via the GDT because that's faster to
- switch. */
- if (addr <= 0xffffffff) {
- set_32bit_tls(task, GS_TLS, addr);
- if (doit) {
- load_TLS(&task->thread, cpu);
- load_gs_index(GS_TLS_SEL);
- }
- task->thread.gsindex = GS_TLS_SEL;
- task->thread.gs = 0;
+
+ preempt_disable();
+ /*
+ * ARCH_SET_GS has always overwritten the index
+ * and the base. Zero is the most sensible value
+ * to put in the index, and is the only value that
+ * makes any sense if FSGSBASE is unavailable.
+ */
+ if (task == current) {
+ loadseg(GS, 0);
+ x86_gsbase_write_cpu_inactive(arg2);
+
+ /*
+ * On non-FSGSBASE systems, save_base_legacy() expects
+ * that we also fill in thread.gsbase.
+ */
+ task->thread.gsbase = arg2;
+
} else {
task->thread.gsindex = 0;
- task->thread.gs = addr;
- if (doit) {
- load_gs_index(0);
- ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
- }
+ x86_gsbase_write_task(task, arg2);
}
- put_cpu();
+ preempt_enable();
break;
- case ARCH_SET_FS:
- /* Not strictly needed for fs, but do it for symmetry
- with gs */
- if (addr >= TASK_SIZE_OF(task))
+ }
+ case ARCH_SET_FS: {
+ /*
+ * Not strictly needed for %fs, but do it for symmetry
+ * with %gs
+ */
+ if (unlikely(arg2 >= TASK_SIZE_MAX))
return -EPERM;
- cpu = get_cpu();
- /* handle small bases via the GDT because that's faster to
- switch. */
- if (addr <= 0xffffffff) {
- set_32bit_tls(task, FS_TLS, addr);
- if (doit) {
- load_TLS(&task->thread, cpu);
- loadsegment(fs, FS_TLS_SEL);
- }
- task->thread.fsindex = FS_TLS_SEL;
- task->thread.fs = 0;
+
+ preempt_disable();
+ /*
+ * Set the selector to 0 for the same reason
+ * as %gs above.
+ */
+ if (task == current) {
+ loadseg(FS, 0);
+ x86_fsbase_write_cpu(arg2);
+
+ /*
+ * On non-FSGSBASE systems, save_base_legacy() expects
+ * that we also fill in thread.fsbase.
+ */
+ task->thread.fsbase = arg2;
} else {
task->thread.fsindex = 0;
- task->thread.fs = addr;
- if (doit) {
- /* set the selector to 0 to not confuse
- __switch_to */
- loadsegment(fs, 0);
- ret = checking_wrmsrl(MSR_FS_BASE, addr);
- }
+ x86_fsbase_write_task(task, arg2);
}
- put_cpu();
+ preempt_enable();
break;
+ }
case ARCH_GET_FS: {
- unsigned long base;
- if (task->thread.fsindex == FS_TLS_SEL)
- base = read_32bit_tls(task, FS_TLS);
- else if (doit)
- rdmsrl(MSR_FS_BASE, base);
- else
- base = task->thread.fs;
- ret = put_user(base, (unsigned long __user *)addr);
+ unsigned long base = x86_fsbase_read_task(task);
+
+ ret = put_user(base, (unsigned long __user *)arg2);
break;
}
case ARCH_GET_GS: {
- unsigned long base;
- unsigned gsindex;
- if (task->thread.gsindex == GS_TLS_SEL)
- base = read_32bit_tls(task, GS_TLS);
- else if (doit) {
- savesegment(gs, gsindex);
- if (gsindex)
- rdmsrl(MSR_KERNEL_GS_BASE, base);
- else
- base = task->thread.gs;
- } else
- base = task->thread.gs;
- ret = put_user(base, (unsigned long __user *)addr);
+ unsigned long base = x86_gsbase_read_task(task);
+
+ ret = put_user(base, (unsigned long __user *)arg2);
break;
}
+#ifdef CONFIG_CHECKPOINT_RESTORE
+# ifdef CONFIG_X86_X32_ABI
+ case ARCH_MAP_VDSO_X32:
+ return prctl_map_vdso(&vdso_image_x32, arg2);
+# endif
+# ifdef CONFIG_IA32_EMULATION
+ case ARCH_MAP_VDSO_32:
+ return prctl_map_vdso(&vdso_image_32, arg2);
+# endif
+ case ARCH_MAP_VDSO_64:
+ return prctl_map_vdso(&vdso_image_64, arg2);
+#endif
+#ifdef CONFIG_ADDRESS_MASKING
+ case ARCH_GET_UNTAG_MASK:
+ return put_user(task->mm->context.untag_mask,
+ (unsigned long __user *)arg2);
+ case ARCH_ENABLE_TAGGED_ADDR:
+ return prctl_enable_tagged_addr(task->mm, arg2);
+ case ARCH_FORCE_TAGGED_SVA:
+ if (current != task)
+ return -EINVAL;
+ set_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &task->mm->context.flags);
+ return 0;
+ case ARCH_GET_MAX_TAG_BITS:
+ if (!cpu_feature_enabled(X86_FEATURE_LAM))
+ return put_user(0, (unsigned long __user *)arg2);
+ else
+ return put_user(LAM_U57_BITS, (unsigned long __user *)arg2);
+#endif
+ case ARCH_SHSTK_ENABLE:
+ case ARCH_SHSTK_DISABLE:
+ case ARCH_SHSTK_LOCK:
+ case ARCH_SHSTK_UNLOCK:
+ case ARCH_SHSTK_STATUS:
+ return shstk_prctl(task, option, arg2);
default:
ret = -EINVAL;
break;
@@ -652,9 +980,3 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
return ret;
}
-
-long sys_arch_prctl(int code, unsigned long addr)
-{
- return do_arch_prctl(current, code, addr);
-}
-
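For reference, the ARCH_SET_GS/ARCH_GET_GS pair handled by do_arch_prctl_64()
above can be driven directly from user space. A hedged sketch (GS rather than
FS is used because glibc keeps its TLS base in FS):

#include <assert.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/prctl.h>

int main(void)
{
	unsigned long dummy = 0;
	unsigned long base = (unsigned long)&dummy;	/* any canonical user address */
	unsigned long readback = 0;

	assert(syscall(SYS_arch_prctl, ARCH_SET_GS, base) == 0);
	assert(syscall(SYS_arch_prctl, ARCH_GET_GS, &readback) == 0);
	assert(readback == base);
	return 0;
}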
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 09ecbde91c13..3dcadc13f09a 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1,53 +1,152 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* By Ross Biro 1/23/92 */
/*
* Pentium III FXSR, SSE support
* Gareth Hughes <gareth@valinux.com>, May 2000
- *
- * BTS tracing
- * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007
*/
#include <linux/kernel.h>
#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/errno.h>
+#include <linux/slab.h>
#include <linux/ptrace.h>
-#include <linux/regset.h>
-#include <linux/tracehook.h>
#include <linux/user.h>
#include <linux/elf.h>
#include <linux/security.h>
#include <linux/audit.h>
#include <linux/seccomp.h>
#include <linux/signal.h>
-#include <linux/workqueue.h>
-
-#include <asm/uaccess.h>
-#include <asm/pgtable.h>
-#include <asm/system.h>
+#include <linux/perf_event.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/rcupdate.h>
+#include <linux/export.h>
+#include <linux/context_tracking.h>
+#include <linux/nospec.h>
+
+#include <linux/uaccess.h>
#include <asm/processor.h>
-#include <asm/i387.h>
+#include <asm/fpu/signal.h>
+#include <asm/fpu/regset.h>
+#include <asm/fpu/xstate.h>
#include <asm/debugreg.h>
#include <asm/ldt.h>
#include <asm/desc.h>
#include <asm/prctl.h>
#include <asm/proto.h>
-#include <asm/ds.h>
-
-#include <trace/syscall.h>
+#include <asm/hw_breakpoint.h>
+#include <asm/traps.h>
+#include <asm/syscall.h>
+#include <asm/fsgsbase.h>
+#include <asm/io_bitmap.h>
#include "tls.h"
-enum x86_regset {
- REGSET_GENERAL,
- REGSET_FP,
- REGSET_XFP,
- REGSET_IOPERM64 = REGSET_XFP,
- REGSET_TLS,
- REGSET_IOPERM32,
+enum x86_regset_32 {
+ REGSET32_GENERAL,
+ REGSET32_FP,
+ REGSET32_XFP,
+ REGSET32_XSTATE,
+ REGSET32_TLS,
+ REGSET32_IOPERM,
+};
+
+enum x86_regset_64 {
+ REGSET64_GENERAL,
+ REGSET64_FP,
+ REGSET64_IOPERM,
+ REGSET64_XSTATE,
+ REGSET64_SSP,
+};
+
+#define REGSET_GENERAL \
+({ \
+ BUILD_BUG_ON((int)REGSET32_GENERAL != (int)REGSET64_GENERAL); \
+ REGSET32_GENERAL; \
+})
+
+#define REGSET_FP \
+({ \
+ BUILD_BUG_ON((int)REGSET32_FP != (int)REGSET64_FP); \
+ REGSET32_FP; \
+})
+
+
+struct pt_regs_offset {
+ const char *name;
+ int offset;
+};
+
+#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)}
+#define REG_OFFSET_END {.name = NULL, .offset = 0}
+
+static const struct pt_regs_offset regoffset_table[] = {
+#ifdef CONFIG_X86_64
+ REG_OFFSET_NAME(r15),
+ REG_OFFSET_NAME(r14),
+ REG_OFFSET_NAME(r13),
+ REG_OFFSET_NAME(r12),
+ REG_OFFSET_NAME(r11),
+ REG_OFFSET_NAME(r10),
+ REG_OFFSET_NAME(r9),
+ REG_OFFSET_NAME(r8),
+#endif
+ REG_OFFSET_NAME(bx),
+ REG_OFFSET_NAME(cx),
+ REG_OFFSET_NAME(dx),
+ REG_OFFSET_NAME(si),
+ REG_OFFSET_NAME(di),
+ REG_OFFSET_NAME(bp),
+ REG_OFFSET_NAME(ax),
+#ifdef CONFIG_X86_32
+ REG_OFFSET_NAME(ds),
+ REG_OFFSET_NAME(es),
+ REG_OFFSET_NAME(fs),
+ REG_OFFSET_NAME(gs),
+#endif
+ REG_OFFSET_NAME(orig_ax),
+ REG_OFFSET_NAME(ip),
+ REG_OFFSET_NAME(cs),
+ REG_OFFSET_NAME(flags),
+ REG_OFFSET_NAME(sp),
+ REG_OFFSET_NAME(ss),
+ REG_OFFSET_END,
};
+/**
+ * regs_query_register_offset() - query register offset from its name
+ * @name: the name of a register
+ *
+ * regs_query_register_offset() returns the offset of a register in struct
+ * pt_regs from its name. If the name is invalid, this returns -EINVAL;
+ */
+int regs_query_register_offset(const char *name)
+{
+ const struct pt_regs_offset *roff;
+ for (roff = regoffset_table; roff->name != NULL; roff++)
+ if (!strcmp(roff->name, name))
+ return roff->offset;
+ return -EINVAL;
+}
+
+/**
+ * regs_query_register_name() - query register name from its offset
+ * @offset: the offset of a register in struct pt_regs.
+ *
+ * regs_query_register_name() returns the name of a register from its
+ * offset in struct pt_regs. If the @offset is invalid, this returns NULL.
+ */
+const char *regs_query_register_name(unsigned int offset)
+{
+ const struct pt_regs_offset *roff;
+ for (roff = regoffset_table; roff->name != NULL; roff++)
+ if (roff->offset == offset)
+ return roff->name;
+ return NULL;
+}
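/*
 * Illustrative only (not part of this patch): a tracer can combine the
 * lookup helpers above with regs_get_register() from <asm/ptrace.h> to
 * read a register by name, e.g. "ax" or "sp".
 */
static unsigned long example_read_named_reg(struct pt_regs *regs, const char *name)
{
	int offs = regs_query_register_offset(name);

	if (offs < 0)
		return 0;	/* unknown register name */

	return regs_get_register(regs, (unsigned int)offs);
}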
+
/*
* does not yet catch signals sent when the child dies.
* in exit.c or in signal.c.
@@ -91,9 +190,9 @@ static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
retval = *pt_regs_access(task_pt_regs(task), offset);
else {
if (task == current)
- retval = get_user_gs(task_pt_regs(task));
+ savesegment(gs, retval);
else
- retval = task_user_gs(task);
+ retval = task->thread.gs;
}
return retval;
}
@@ -101,6 +200,9 @@ static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
static int set_segment_reg(struct task_struct *task,
unsigned long offset, u16 value)
{
+ if (WARN_ON_ONCE(task == current))
+ return -EIO;
+
/*
* The value argument was already truncated to 16 bits.
*/
@@ -121,26 +223,19 @@ static int set_segment_reg(struct task_struct *task,
case offsetof(struct user_regs_struct, ss):
if (unlikely(value == 0))
return -EIO;
+ fallthrough;
default:
*pt_regs_access(task_pt_regs(task), offset) = value;
break;
case offsetof(struct user_regs_struct, gs):
- if (task == current)
- set_user_gs(task_pt_regs(task), value);
- else
- task_user_gs(task) = value;
+ task->thread.gs = value;
}
return 0;
}
-static unsigned long debugreg_addr_limit(struct task_struct *task)
-{
- return TASK_SIZE - 3;
-}
-
#else /* CONFIG_X86_64 */
#define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT)
@@ -195,50 +290,33 @@ static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
static int set_segment_reg(struct task_struct *task,
unsigned long offset, u16 value)
{
+ if (WARN_ON_ONCE(task == current))
+ return -EIO;
+
/*
* The value argument was already truncated to 16 bits.
*/
if (invalid_selector(value))
return -EIO;
+ /*
+ * Writes to FS and GS will change the stored selector. Whether
+ * this changes the segment base as well depends on whether
+ * FSGSBASE is enabled.
+ */
+
switch (offset) {
case offsetof(struct user_regs_struct,fs):
- /*
- * If this is setting fs as for normal 64-bit use but
- * setting fs_base has implicitly changed it, leave it.
- */
- if ((value == FS_TLS_SEL && task->thread.fsindex == 0 &&
- task->thread.fs != 0) ||
- (value == 0 && task->thread.fsindex == FS_TLS_SEL &&
- task->thread.fs == 0))
- break;
task->thread.fsindex = value;
- if (task == current)
- loadsegment(fs, task->thread.fsindex);
break;
case offsetof(struct user_regs_struct,gs):
- /*
- * If this is setting gs as for normal 64-bit use but
- * setting gs_base has implicitly changed it, leave it.
- */
- if ((value == GS_TLS_SEL && task->thread.gsindex == 0 &&
- task->thread.gs != 0) ||
- (value == 0 && task->thread.gsindex == GS_TLS_SEL &&
- task->thread.gs == 0))
- break;
task->thread.gsindex = value;
- if (task == current)
- load_gs_index(task->thread.gsindex);
break;
case offsetof(struct user_regs_struct,ds):
task->thread.ds = value;
- if (task == current)
- loadsegment(ds, task->thread.ds);
break;
case offsetof(struct user_regs_struct,es):
task->thread.es = value;
- if (task == current)
- loadsegment(es, task->thread.es);
break;
/*
@@ -247,33 +325,18 @@ static int set_segment_reg(struct task_struct *task,
case offsetof(struct user_regs_struct,cs):
if (unlikely(value == 0))
return -EIO;
-#ifdef CONFIG_IA32_EMULATION
- if (test_tsk_thread_flag(task, TIF_IA32))
- task_pt_regs(task)->cs = value;
-#endif
+ task_pt_regs(task)->cs = value;
break;
case offsetof(struct user_regs_struct,ss):
if (unlikely(value == 0))
return -EIO;
-#ifdef CONFIG_IA32_EMULATION
- if (test_tsk_thread_flag(task, TIF_IA32))
- task_pt_regs(task)->ss = value;
-#endif
+ task_pt_regs(task)->ss = value;
break;
}
return 0;
}
-static unsigned long debugreg_addr_limit(struct task_struct *task)
-{
-#ifdef CONFIG_IA32_EMULATION
- if (test_tsk_thread_flag(task, TIF_IA32))
- return IA32_PAGE_OFFSET - 3;
-#endif
- return TASK_SIZE_MAX - 7;
-}
-
#endif /* CONFIG_X86_32 */
static unsigned long get_flags(struct task_struct *task)
@@ -324,35 +387,15 @@ static int putreg(struct task_struct *child,
return set_flags(child, value);
#ifdef CONFIG_X86_64
- /*
- * Orig_ax is really just a flag with small positive and
- * negative values, so make sure to always sign-extend it
- * from 32 bits so that it works correctly regardless of
- * whether we come from a 32-bit environment or not.
- */
- case offsetof(struct user_regs_struct, orig_ax):
- value = (long) (s32) value;
- break;
-
case offsetof(struct user_regs_struct,fs_base):
- if (value >= TASK_SIZE_OF(child))
+ if (value >= TASK_SIZE_MAX)
return -EIO;
- /*
- * When changing the segment base, use do_arch_prctl
- * to set either thread.fs or thread.fsindex and the
- * corresponding GDT slot.
- */
- if (child->thread.fs != value)
- return do_arch_prctl(child, ARCH_SET_FS, value);
+ x86_fsbase_write_task(child, value);
return 0;
case offsetof(struct user_regs_struct,gs_base):
- /*
- * Exactly the same here as the %fs handling above.
- */
- if (value >= TASK_SIZE_OF(child))
+ if (value >= TASK_SIZE_MAX)
return -EIO;
- if (child->thread.gs != value)
- return do_arch_prctl(child, ARCH_SET_GS, value);
+ x86_gsbase_write_task(child, value);
return 0;
#endif
}
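
/*
 * Illustrative tracer-side counterpart to the fs_base/gs_base handling above:
 * a minimal userspace sketch (the helper name and error handling are
 * hypothetical, not part of this patch). PTRACE_POKEUSER takes a byte offset
 * into struct user; values at or above TASK_SIZE_MAX are rejected with -EIO.
 */
#include <stddef.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/user.h>

static int poke_fs_base(pid_t pid, unsigned long base)
{
	unsigned long off = offsetof(struct user, regs.fs_base);

	/* The tracee must already be stopped under ptrace. */
	if (ptrace(PTRACE_POKEUSER, pid, (void *)off, (void *)base) == -1) {
		perror("PTRACE_POKEUSER fs_base");
		return -1;
	}
	return 0;
}
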
@@ -376,34 +419,10 @@ static unsigned long getreg(struct task_struct *task, unsigned long offset)
return get_flags(task);
#ifdef CONFIG_X86_64
- case offsetof(struct user_regs_struct, fs_base): {
- /*
- * do_arch_prctl may have used a GDT slot instead of
- * the MSR. To userland, it appears the same either
- * way, except the %fs segment selector might not be 0.
- */
- unsigned int seg = task->thread.fsindex;
- if (task->thread.fs != 0)
- return task->thread.fs;
- if (task == current)
- asm("movl %%fs,%0" : "=r" (seg));
- if (seg != FS_TLS_SEL)
- return 0;
- return get_desc_base(&task->thread.tls_array[FS_TLS]);
- }
- case offsetof(struct user_regs_struct, gs_base): {
- /*
- * Exactly the same here as the %fs handling above.
- */
- unsigned int seg = task->thread.gsindex;
- if (task->thread.gs != 0)
- return task->thread.gs;
- if (task == current)
- asm("movl %%gs,%0" : "=r" (seg));
- if (seg != GS_TLS_SEL)
- return 0;
- return get_desc_base(&task->thread.tls_array[GS_TLS]);
- }
+ case offsetof(struct user_regs_struct, fs_base):
+ return x86_fsbase_read_task(task);
+ case offsetof(struct user_regs_struct, gs_base):
+ return x86_gsbase_read_task(task);
#endif
}
@@ -412,26 +431,12 @@ static unsigned long getreg(struct task_struct *task, unsigned long offset)
static int genregs_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
- if (kbuf) {
- unsigned long *k = kbuf;
- while (count > 0) {
- *k++ = getreg(target, pos);
- count -= sizeof(*k);
- pos += sizeof(*k);
- }
- } else {
- unsigned long __user *u = ubuf;
- while (count > 0) {
- if (__put_user(getreg(target, pos), u++))
- return -EFAULT;
- count -= sizeof(*u);
- pos += sizeof(*u);
- }
- }
+ int reg;
+ for (reg = 0; to.left; reg++)
+ membuf_store(&to, getreg(target, reg * sizeof(unsigned long)));
return 0;
}
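
/*
 * Simplified model of the membuf helpers that the rewritten genregs_get()
 * relies on; the real definitions live in include/linux/regset.h and may
 * differ in detail. It shows why "for (reg = 0; to.left; reg++)" terminates:
 * every store shrinks to.left until the caller's buffer is full.
 */
#include <stddef.h>
#include <string.h>

struct membuf {
	void *p;	/* next byte to fill in the caller's buffer */
	size_t left;	/* bytes still available */
};

static int membuf_write(struct membuf *s, const void *v, size_t size)
{
	size_t amount = size < s->left ? size : s->left;

	memcpy(s->p, v, amount);
	s->p = (char *)s->p + amount;
	s->left -= amount;
	return s->left;
}

/* membuf_store() copies one scalar value; modeled here for unsigned long. */
static int membuf_store_ulong(struct membuf *s, unsigned long v)
{
	return membuf_write(s, &v, sizeof(v));
}
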
@@ -443,14 +448,14 @@ static int genregs_set(struct task_struct *target,
int ret = 0;
if (kbuf) {
const unsigned long *k = kbuf;
- while (count > 0 && !ret) {
+ while (count >= sizeof(*k) && !ret) {
ret = putreg(target, pos, *k++);
count -= sizeof(*k);
pos += sizeof(*k);
}
} else {
const unsigned long __user *u = ubuf;
- while (count > 0 && !ret) {
+ while (count >= sizeof(*u) && !ret) {
unsigned long word;
ret = __get_user(word, u++);
if (ret)
@@ -463,458 +468,249 @@ static int genregs_set(struct task_struct *target,
return ret;
}
-/*
- * This function is trivial and will be inlined by the compiler.
- * Having it separates the implementation details of debug
- * registers from the interface details of ptrace.
- */
-static unsigned long ptrace_get_debugreg(struct task_struct *child, int n)
-{
- switch (n) {
- case 0: return child->thread.debugreg0;
- case 1: return child->thread.debugreg1;
- case 2: return child->thread.debugreg2;
- case 3: return child->thread.debugreg3;
- case 6: return child->thread.debugreg6;
- case 7: return child->thread.debugreg7;
- }
- return 0;
-}
-
-static int ptrace_set_debugreg(struct task_struct *child,
- int n, unsigned long data)
+static void ptrace_triggered(struct perf_event *bp,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
{
int i;
+ struct thread_struct *thread = &(current->thread);
- if (unlikely(n == 4 || n == 5))
- return -EIO;
-
- if (n < 4 && unlikely(data >= debugreg_addr_limit(child)))
- return -EIO;
-
- switch (n) {
- case 0: child->thread.debugreg0 = data; break;
- case 1: child->thread.debugreg1 = data; break;
- case 2: child->thread.debugreg2 = data; break;
- case 3: child->thread.debugreg3 = data; break;
-
- case 6:
- if ((data & ~0xffffffffUL) != 0)
- return -EIO;
- child->thread.debugreg6 = data;
- break;
-
- case 7:
- /*
- * Sanity-check data. Take one half-byte at once with
- * check = (val >> (16 + 4*i)) & 0xf. It contains the
- * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits
- * 2 and 3 are LENi. Given a list of invalid values,
- * we do mask |= 1 << invalid_value, so that
- * (mask >> check) & 1 is a correct test for invalid
- * values.
- *
- * R/Wi contains the type of the breakpoint /
- * watchpoint, LENi contains the length of the watched
- * data in the watchpoint case.
- *
- * The invalid values are:
- * - LENi == 0x10 (undefined), so mask |= 0x0f00. [32-bit]
- * - R/Wi == 0x10 (break on I/O reads or writes), so
- * mask |= 0x4444.
- * - R/Wi == 0x00 && LENi != 0x00, so we have mask |=
- * 0x1110.
- *
- * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54.
- *
- * See the Intel Manual "System Programming Guide",
- * 15.2.4
- *
- * Note that LENi == 0x10 is defined on x86_64 in long
- * mode (i.e. even for 32-bit userspace software, but
- * 64-bit kernel), so the x86_64 mask value is 0x5454.
- * See the AMD manual no. 24593 (AMD64 System Programming)
- */
-#ifdef CONFIG_X86_32
-#define DR7_MASK 0x5f54
-#else
-#define DR7_MASK 0x5554
-#endif
- data &= ~DR_CONTROL_RESERVED;
- for (i = 0; i < 4; i++)
- if ((DR7_MASK >> ((data >> (16 + 4*i)) & 0xf)) & 1)
- return -EIO;
- child->thread.debugreg7 = data;
- if (data)
- set_tsk_thread_flag(child, TIF_DEBUG);
- else
- clear_tsk_thread_flag(child, TIF_DEBUG);
- break;
+ /*
+ * Store in the virtual DR6 register the fact that the breakpoint
+ * was hit so the thread's debugger will see it.
+ */
+ for (i = 0; i < HBP_NUM; i++) {
+ if (thread->ptrace_bps[i] == bp)
+ break;
}
- return 0;
+ thread->virtual_dr6 |= (DR_TRAP0 << i);
}
/*
- * These access the current or another (stopped) task's io permission
- * bitmap for debugging or core dump.
+ * Walk through every ptrace breakpoint for this thread and
+ * build the dr7 value on top of their attributes.
+ *
*/
-static int ioperm_active(struct task_struct *target,
- const struct user_regset *regset)
+static unsigned long ptrace_get_dr7(struct perf_event *bp[])
{
- return target->thread.io_bitmap_max / regset->size;
-}
+ int i;
+ int dr7 = 0;
+ struct arch_hw_breakpoint *info;
-static int ioperm_get(struct task_struct *target,
- const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
-{
- if (!target->thread.io_bitmap_ptr)
- return -ENXIO;
+ for (i = 0; i < HBP_NUM; i++) {
+ if (bp[i] && !bp[i]->attr.disabled) {
+ info = counter_arch_bp(bp[i]);
+ dr7 |= encode_dr7(i, info->len, info->type);
+ }
+ }
- return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- target->thread.io_bitmap_ptr,
- 0, IO_BITMAP_BYTES);
+ return dr7;
}
-#ifdef CONFIG_X86_PTRACE_BTS
-/*
- * A branch trace store context.
- *
- * Contexts may only be installed by ptrace_bts_config() and only for
- * ptraced tasks.
- *
- * Contexts are destroyed when the tracee is detached from the tracer.
- * The actual destruction work requires interrupts enabled, so the
- * work is deferred and will be scheduled during __ptrace_unlink().
- *
- * Contexts hold an additional task_struct reference on the traced
- * task, as well as a reference on the tracer's mm.
- *
- * Ptrace already holds a task_struct for the duration of ptrace operations,
- * but since destruction is deferred, it may be executed after both
- * tracer and tracee exited.
- */
-struct bts_context {
- /* The branch trace handle. */
- struct bts_tracer *tracer;
-
- /* The buffer used to store the branch trace and its size. */
- void *buffer;
- unsigned int size;
-
- /* The mm that paid for the above buffer. */
- struct mm_struct *mm;
-
- /* The task this context belongs to. */
- struct task_struct *task;
-
- /* The signal to send on a bts buffer overflow. */
- unsigned int bts_ovfl_signal;
-
- /* The work struct to destroy a context. */
- struct work_struct work;
-};
-
-static int alloc_bts_buffer(struct bts_context *context, unsigned int size)
+static int ptrace_fill_bp_fields(struct perf_event_attr *attr,
+ int len, int type, bool disabled)
{
- void *buffer = NULL;
- int err = -ENOMEM;
-
- err = account_locked_memory(current->mm, current->signal->rlim, size);
- if (err < 0)
- return err;
-
- buffer = kzalloc(size, GFP_KERNEL);
- if (!buffer)
- goto out_refund;
-
- context->buffer = buffer;
- context->size = size;
- context->mm = get_task_mm(current);
+ int err, bp_len, bp_type;
- return 0;
+ err = arch_bp_generic_fields(len, type, &bp_len, &bp_type);
+ if (!err) {
+ attr->bp_len = bp_len;
+ attr->bp_type = bp_type;
+ attr->disabled = disabled;
+ }
- out_refund:
- refund_locked_memory(current->mm, size);
return err;
}
-static inline void free_bts_buffer(struct bts_context *context)
+static struct perf_event *
+ptrace_register_breakpoint(struct task_struct *tsk, int len, int type,
+ unsigned long addr, bool disabled)
{
- if (!context->buffer)
- return;
+ struct perf_event_attr attr;
+ int err;
- kfree(context->buffer);
- context->buffer = NULL;
+ ptrace_breakpoint_init(&attr);
+ attr.bp_addr = addr;
- refund_locked_memory(context->mm, context->size);
- context->size = 0;
+ err = ptrace_fill_bp_fields(&attr, len, type, disabled);
+ if (err)
+ return ERR_PTR(err);
- mmput(context->mm);
- context->mm = NULL;
+ return register_user_hw_breakpoint(&attr, ptrace_triggered,
+ NULL, tsk);
}
-static void free_bts_context_work(struct work_struct *w)
+static int ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
+ int disabled)
{
- struct bts_context *context;
+ struct perf_event_attr attr = bp->attr;
+ int err;
- context = container_of(w, struct bts_context, work);
+ err = ptrace_fill_bp_fields(&attr, len, type, disabled);
+ if (err)
+ return err;
- ds_release_bts(context->tracer);
- put_task_struct(context->task);
- free_bts_buffer(context);
- kfree(context);
+ return modify_user_hw_breakpoint(bp, &attr);
}
-static inline void free_bts_context(struct bts_context *context)
+/*
+ * Handle ptrace writes to debug register 7.
+ */
+static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
{
- INIT_WORK(&context->work, free_bts_context_work);
- schedule_work(&context->work);
-}
+ struct thread_struct *thread = &tsk->thread;
+ unsigned long old_dr7;
+ bool second_pass = false;
+ int i, rc, ret = 0;
+
+ data &= ~DR_CONTROL_RESERVED;
+ old_dr7 = ptrace_get_dr7(thread->ptrace_bps);
+
+restore:
+ rc = 0;
+ for (i = 0; i < HBP_NUM; i++) {
+ unsigned len, type;
+ bool disabled = !decode_dr7(data, i, &len, &type);
+ struct perf_event *bp = thread->ptrace_bps[i];
+
+ if (!bp) {
+ if (disabled)
+ continue;
+
+ bp = ptrace_register_breakpoint(tsk,
+ len, type, 0, disabled);
+ if (IS_ERR(bp)) {
+ rc = PTR_ERR(bp);
+ break;
+ }
-static inline struct bts_context *alloc_bts_context(struct task_struct *task)
-{
- struct bts_context *context = kzalloc(sizeof(*context), GFP_KERNEL);
- if (context) {
- context->task = task;
- task->bts = context;
+ thread->ptrace_bps[i] = bp;
+ continue;
+ }
- get_task_struct(task);
+ rc = ptrace_modify_breakpoint(bp, len, type, disabled);
+ if (rc)
+ break;
}
- return context;
-}
-
-static int ptrace_bts_read_record(struct task_struct *child, size_t index,
- struct bts_struct __user *out)
-{
- struct bts_context *context;
- const struct bts_trace *trace;
- struct bts_struct bts;
- const unsigned char *at;
- int error;
-
- context = child->bts;
- if (!context)
- return -ESRCH;
-
- trace = ds_read_bts(context->tracer);
- if (!trace)
- return -ESRCH;
-
- at = trace->ds.top - ((index + 1) * trace->ds.size);
- if ((void *)at < trace->ds.begin)
- at += (trace->ds.n * trace->ds.size);
-
- if (!trace->read)
- return -EOPNOTSUPP;
-
- error = trace->read(context->tracer, at, &bts);
- if (error < 0)
- return error;
-
- if (copy_to_user(out, &bts, sizeof(bts)))
- return -EFAULT;
+ /* Restore if the first pass failed, second_pass shouldn't fail. */
+ if (rc && !WARN_ON(second_pass)) {
+ ret = rc;
+ data = old_dr7;
+ second_pass = true;
+ goto restore;
+ }
- return sizeof(bts);
+ return ret;
}
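
/*
 * What ptrace_write_dr7() sees in practice: a hedged userspace sketch of a
 * tracer arming a 4-byte write watchpoint in debug slot 0 (the helper name
 * and the DR7 constants spelled out here are illustrative; see the SDM for
 * the full encoding). Clearing u_debugreg[7] later disables it again.
 */
#include <stddef.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/user.h>

static int set_write_watchpoint(pid_t pid, unsigned long addr)
{
	/* DR7: local-enable slot 0, R/W0 = 01 (write), LEN0 = 11 (4 bytes) */
	unsigned long dr7 = (1UL << 0) | (0x1UL << 16) | (0x3UL << 18);

	if (ptrace(PTRACE_POKEUSER, pid,
		   (void *)offsetof(struct user, u_debugreg[0]),
		   (void *)addr) == -1)
		return -1;

	return ptrace(PTRACE_POKEUSER, pid,
		      (void *)offsetof(struct user, u_debugreg[7]),
		      (void *)dr7) == -1 ? -1 : 0;
}
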
-static int ptrace_bts_drain(struct task_struct *child,
- long size,
- struct bts_struct __user *out)
+/*
+ * Handle PTRACE_PEEKUSR calls for the debug register area.
+ */
+static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
{
- struct bts_context *context;
- const struct bts_trace *trace;
- const unsigned char *at;
- int error, drained = 0;
-
- context = child->bts;
- if (!context)
- return -ESRCH;
-
- trace = ds_read_bts(context->tracer);
- if (!trace)
- return -ESRCH;
-
- if (!trace->read)
- return -EOPNOTSUPP;
-
- if (size < (trace->ds.top - trace->ds.begin))
- return -EIO;
-
- for (at = trace->ds.begin; (void *)at < trace->ds.top;
- out++, drained++, at += trace->ds.size) {
- struct bts_struct bts;
-
- error = trace->read(context->tracer, at, &bts);
- if (error < 0)
- return error;
-
- if (copy_to_user(out, &bts, sizeof(bts)))
- return -EFAULT;
+ struct thread_struct *thread = &tsk->thread;
+ unsigned long val = 0;
+
+ if (n < HBP_NUM) {
+ int index = array_index_nospec(n, HBP_NUM);
+ struct perf_event *bp = thread->ptrace_bps[index];
+
+ if (bp)
+ val = bp->hw.info.address;
+ } else if (n == 6) {
+ val = thread->virtual_dr6 ^ DR6_RESERVED; /* Flip back to arch polarity */
+ } else if (n == 7) {
+ val = thread->ptrace_dr7;
}
-
- memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
-
- error = ds_reset_bts(context->tracer);
- if (error < 0)
- return error;
-
- return drained;
+ return val;
}
-static int ptrace_bts_config(struct task_struct *child,
- long cfg_size,
- const struct ptrace_bts_config __user *ucfg)
+static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
+ unsigned long addr)
{
- struct bts_context *context;
- struct ptrace_bts_config cfg;
- unsigned int flags = 0;
-
- if (cfg_size < sizeof(cfg))
- return -EIO;
-
- if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
- return -EFAULT;
-
- context = child->bts;
- if (!context)
- context = alloc_bts_context(child);
- if (!context)
- return -ENOMEM;
-
- if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
- if (!cfg.signal)
- return -EINVAL;
+ struct thread_struct *t = &tsk->thread;
+ struct perf_event *bp = t->ptrace_bps[nr];
+ int err = 0;
- return -EOPNOTSUPP;
- context->bts_ovfl_signal = cfg.signal;
- }
-
- ds_release_bts(context->tracer);
- context->tracer = NULL;
-
- if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) {
- int err;
-
- free_bts_buffer(context);
- if (!cfg.size)
- return 0;
-
- err = alloc_bts_buffer(context, cfg.size);
- if (err < 0)
- return err;
- }
-
- if (cfg.flags & PTRACE_BTS_O_TRACE)
- flags |= BTS_USER;
-
- if (cfg.flags & PTRACE_BTS_O_SCHED)
- flags |= BTS_TIMESTAMPS;
-
- context->tracer =
- ds_request_bts_task(child, context->buffer, context->size,
- NULL, (size_t)-1, flags);
- if (unlikely(IS_ERR(context->tracer))) {
- int error = PTR_ERR(context->tracer);
+ if (!bp) {
+ /*
+ * Put stub len and type to create an inactive but correct bp.
+ *
+ * CHECKME: the previous code returned -EIO if the addr wasn't
+ * a valid task virtual addr. The new one will return -EINVAL in
+ * this case.
+ * -EINVAL may be what we want for in-kernel breakpoint users,
+ * but -EIO looks better for ptrace, since we are refusing a
+ * register write from the user. It also matches the previous
+ * behaviour.
+ */
+ bp = ptrace_register_breakpoint(tsk,
+ X86_BREAKPOINT_LEN_1, X86_BREAKPOINT_WRITE,
+ addr, true);
+ if (IS_ERR(bp))
+ err = PTR_ERR(bp);
+ else
+ t->ptrace_bps[nr] = bp;
+ } else {
+ struct perf_event_attr attr = bp->attr;
- free_bts_buffer(context);
- context->tracer = NULL;
- return error;
+ attr.bp_addr = addr;
+ err = modify_user_hw_breakpoint(bp, &attr);
}
- return sizeof(cfg);
+ return err;
}
-static int ptrace_bts_status(struct task_struct *child,
- long cfg_size,
- struct ptrace_bts_config __user *ucfg)
+/*
+ * Handle PTRACE_POKEUSR calls for the debug register area.
+ */
+static int ptrace_set_debugreg(struct task_struct *tsk, int n,
+ unsigned long val)
{
- struct bts_context *context;
- const struct bts_trace *trace;
- struct ptrace_bts_config cfg;
-
- context = child->bts;
- if (!context)
- return -ESRCH;
-
- if (cfg_size < sizeof(cfg))
- return -EIO;
-
- trace = ds_read_bts(context->tracer);
- if (!trace)
- return -ESRCH;
-
- memset(&cfg, 0, sizeof(cfg));
- cfg.size = trace->ds.end - trace->ds.begin;
- cfg.signal = context->bts_ovfl_signal;
- cfg.bts_size = sizeof(struct bts_struct);
-
- if (cfg.signal)
- cfg.flags |= PTRACE_BTS_O_SIGNAL;
-
- if (trace->ds.flags & BTS_USER)
- cfg.flags |= PTRACE_BTS_O_TRACE;
-
- if (trace->ds.flags & BTS_TIMESTAMPS)
- cfg.flags |= PTRACE_BTS_O_SCHED;
-
- if (copy_to_user(ucfg, &cfg, sizeof(cfg)))
- return -EFAULT;
-
- return sizeof(cfg);
+ struct thread_struct *thread = &tsk->thread;
+ /* There are no DR4 or DR5 registers */
+ int rc = -EIO;
+
+ if (n < HBP_NUM) {
+ rc = ptrace_set_breakpoint_addr(tsk, n, val);
+ } else if (n == 6) {
+ thread->virtual_dr6 = val ^ DR6_RESERVED; /* Flip to positive polarity */
+ rc = 0;
+ } else if (n == 7) {
+ rc = ptrace_write_dr7(tsk, val);
+ if (!rc)
+ thread->ptrace_dr7 = val;
+ }
+ return rc;
}
-static int ptrace_bts_clear(struct task_struct *child)
+/*
+ * These access the current or another (stopped) task's io permission
+ * bitmap for debugging or core dump.
+ */
+static int ioperm_active(struct task_struct *target,
+ const struct user_regset *regset)
{
- struct bts_context *context;
- const struct bts_trace *trace;
-
- context = child->bts;
- if (!context)
- return -ESRCH;
+ struct io_bitmap *iobm = target->thread.io_bitmap;
- trace = ds_read_bts(context->tracer);
- if (!trace)
- return -ESRCH;
-
- memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
-
- return ds_reset_bts(context->tracer);
+ return iobm ? DIV_ROUND_UP(iobm->max, regset->size) : 0;
}
-static int ptrace_bts_size(struct task_struct *child)
+static int ioperm_get(struct task_struct *target,
+ const struct user_regset *regset,
+ struct membuf to)
{
- struct bts_context *context;
- const struct bts_trace *trace;
-
- context = child->bts;
- if (!context)
- return -ESRCH;
+ struct io_bitmap *iobm = target->thread.io_bitmap;
- trace = ds_read_bts(context->tracer);
- if (!trace)
- return -ESRCH;
+ if (!iobm)
+ return -ENXIO;
- return (trace->ds.top - trace->ds.begin) / trace->ds.size;
+ return membuf_write(&to, iobm->bitmap, IO_BITMAP_BYTES);
}
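
/*
 * Userspace view of the ioperm regset above, sketched with PTRACE_GETREGSET
 * and the NT_386_IOPERM note type (the helper name is hypothetical). The call
 * fails with ENXIO when the tracee never used ioperm()/iopl(), matching the
 * -ENXIO return in ioperm_get().
 */
#include <elf.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/uio.h>

static long read_io_bitmap(pid_t pid, unsigned char *buf, size_t len)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };

	if (ptrace(PTRACE_GETREGSET, pid, (void *)NT_386_IOPERM, &iov) == -1) {
		perror("PTRACE_GETREGSET NT_386_IOPERM");
		return -1;
	}
	return (long)iov.iov_len;	/* bytes actually filled in by the kernel */
}
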
/*
- * Called from __ptrace_unlink() after the child has been moved back
- * to its original parent.
- */
-void ptrace_bts_untrace(struct task_struct *child)
-{
- if (unlikely(child->bts)) {
- free_bts_context(child->bts);
- child->bts = NULL;
- }
-}
-#endif /* CONFIG_X86_PTRACE_BTS */
-
-/*
* Called by kernel/ptrace.c when detaching..
*
* Make sure the single step bit is not set.
@@ -922,28 +718,36 @@ void ptrace_bts_untrace(struct task_struct *child)
void ptrace_disable(struct task_struct *child)
{
user_disable_single_step(child);
-#ifdef TIF_SYSCALL_EMU
- clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
-#endif
}
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
static const struct user_regset_view user_x86_32_view; /* Initialized below. */
#endif
+#ifdef CONFIG_X86_64
+static const struct user_regset_view user_x86_64_view; /* Initialized below. */
+#endif
-long arch_ptrace(struct task_struct *child, long request, long addr, long data)
+long arch_ptrace(struct task_struct *child, long request,
+ unsigned long addr, unsigned long data)
{
int ret;
unsigned long __user *datap = (unsigned long __user *)data;
+#ifdef CONFIG_X86_64
+ /* This is native 64-bit ptrace() */
+ const struct user_regset_view *regset_view = &user_x86_64_view;
+#else
+ /* This is native 32-bit ptrace() */
+ const struct user_regset_view *regset_view = &user_x86_32_view;
+#endif
+
switch (request) {
/* read the word at location addr in the USER area. */
case PTRACE_PEEKUSR: {
unsigned long tmp;
ret = -EIO;
- if ((addr & (sizeof(data) - 1)) || addr < 0 ||
- addr >= sizeof(struct user))
+ if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user))
break;
tmp = 0; /* Default return condition */
@@ -960,8 +764,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
ret = -EIO;
- if ((addr & (sizeof(data) - 1)) || addr < 0 ||
- addr >= sizeof(struct user))
+ if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user))
break;
if (addr < sizeof(struct user_regs_struct))
@@ -976,28 +779,28 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
case PTRACE_GETREGS: /* Get all gp regs from the child. */
return copy_regset_to_user(child,
- task_user_regset_view(current),
+ regset_view,
REGSET_GENERAL,
0, sizeof(struct user_regs_struct),
datap);
case PTRACE_SETREGS: /* Set all gp regs in the child. */
return copy_regset_from_user(child,
- task_user_regset_view(current),
+ regset_view,
REGSET_GENERAL,
0, sizeof(struct user_regs_struct),
datap);
case PTRACE_GETFPREGS: /* Get the child FPU state. */
return copy_regset_to_user(child,
- task_user_regset_view(current),
+ regset_view,
REGSET_FP,
0, sizeof(struct user_i387_struct),
datap);
case PTRACE_SETFPREGS: /* Set the child FPU state. */
return copy_regset_from_user(child,
- task_user_regset_view(current),
+ regset_view,
REGSET_FP,
0, sizeof(struct user_i387_struct),
datap);
@@ -1005,30 +808,30 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
#ifdef CONFIG_X86_32
case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */
return copy_regset_to_user(child, &user_x86_32_view,
- REGSET_XFP,
+ REGSET32_XFP,
0, sizeof(struct user_fxsr_struct),
datap) ? -EIO : 0;
case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */
return copy_regset_from_user(child, &user_x86_32_view,
- REGSET_XFP,
+ REGSET32_XFP,
0, sizeof(struct user_fxsr_struct),
datap) ? -EIO : 0;
#endif
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
case PTRACE_GET_THREAD_AREA:
- if (addr < 0)
+ if ((int) addr < 0)
return -EIO;
ret = do_get_thread_area(child, addr,
- (struct user_desc __user *) data);
+ (struct user_desc __user *)data);
break;
case PTRACE_SET_THREAD_AREA:
- if (addr < 0)
+ if ((int) addr < 0)
return -EIO;
ret = do_set_thread_area(child, addr,
- (struct user_desc __user *) data, 0);
+ (struct user_desc __user *)data, 0);
break;
#endif
@@ -1037,43 +840,10 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
Works just like arch_prctl, except that the arguments
are reversed. */
case PTRACE_ARCH_PRCTL:
- ret = do_arch_prctl(child, data, addr);
+ ret = do_arch_prctl_64(child, data, addr);
break;
#endif
- /*
- * These bits need more cooking - not enabled yet:
- */
-#ifdef CONFIG_X86_PTRACE_BTS
- case PTRACE_BTS_CONFIG:
- ret = ptrace_bts_config
- (child, data, (struct ptrace_bts_config __user *)addr);
- break;
-
- case PTRACE_BTS_STATUS:
- ret = ptrace_bts_status
- (child, data, (struct ptrace_bts_config __user *)addr);
- break;
-
- case PTRACE_BTS_SIZE:
- ret = ptrace_bts_size(child);
- break;
-
- case PTRACE_BTS_GET:
- ret = ptrace_bts_read_record
- (child, data, (struct bts_struct __user *) addr);
- break;
-
- case PTRACE_BTS_CLEAR:
- ret = ptrace_bts_clear(child);
- break;
-
- case PTRACE_BTS_DRAIN:
- ret = ptrace_bts_drain
- (child, data, (struct bts_struct __user *) addr);
- break;
-#endif /* CONFIG_X86_PTRACE_BTS */
-
default:
ret = ptrace_request(child, request, addr, data);
break;
@@ -1103,14 +873,39 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
static int putreg32(struct task_struct *child, unsigned regno, u32 value)
{
struct pt_regs *regs = task_pt_regs(child);
+ int ret;
switch (regno) {
SEG32(cs);
SEG32(ds);
SEG32(es);
- SEG32(fs);
- SEG32(gs);
+
+ /*
+ * A 32-bit ptracer on a 64-bit kernel expects that writing
+ * FS or GS will also update the base. This is needed for
+ * operations like PTRACE_SETREGS to fully restore a saved
+ * CPU state.
+ */
+
+ case offsetof(struct user32, regs.fs):
+ ret = set_segment_reg(child,
+ offsetof(struct user_regs_struct, fs),
+ value);
+ if (ret == 0)
+ child->thread.fsbase =
+ x86_fsgsbase_read_task(child, value);
+ return ret;
+
+ case offsetof(struct user32, regs.gs):
+ ret = set_segment_reg(child,
+ offsetof(struct user_regs_struct, gs),
+ value);
+ if (ret == 0)
+ child->thread.gsbase =
+ x86_fsgsbase_read_task(child, value);
+ return ret;
+
SEG32(ss);
R32(ebx, bx);
@@ -1125,10 +920,18 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value)
case offsetof(struct user32, regs.orig_eax):
/*
- * Sign-extend the value so that orig_eax = -1
- * causes (long)orig_ax < 0 tests to fire correctly.
+ * Warning: bizarre corner case fixup here. A 32-bit
+ * debugger setting orig_eax to -1 wants to disable
+ * syscall restart. Make sure that the syscall
+ * restart code sign-extends orig_ax. Also make sure
+ * we interpret the -ERESTART* codes correctly if
+ * loaded into regs->ax in case the task is not
+ * actually still sitting at the exit from a 32-bit
+ * syscall with TS_COMPAT still set.
*/
- regs->orig_ax = (long) (s32) value;
+ regs->orig_ax = value;
+ if (syscall_get_nr(child, regs) != -1)
+ child->thread_info.status |= TS_I386_REGS_POKED;
break;
case offsetof(struct user32, regs.eflags):
@@ -1218,28 +1021,15 @@ static int getreg32(struct task_struct *child, unsigned regno, u32 *val)
static int genregs32_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
- if (kbuf) {
- compat_ulong_t *k = kbuf;
- while (count > 0) {
- getreg32(target, pos, k++);
- count -= sizeof(*k);
- pos += sizeof(*k);
- }
- } else {
- compat_ulong_t __user *u = ubuf;
- while (count > 0) {
- compat_ulong_t word;
- getreg32(target, pos, &word);
- if (__put_user(word, u++))
- return -EFAULT;
- count -= sizeof(*u);
- pos += sizeof(*u);
- }
- }
+ int reg;
+ for (reg = 0; to.left; reg++) {
+ u32 val;
+ getreg32(target, reg * 4, &val);
+ membuf_store(&to, val);
+ }
return 0;
}
@@ -1251,14 +1041,14 @@ static int genregs32_set(struct task_struct *target,
int ret = 0;
if (kbuf) {
const compat_ulong_t *k = kbuf;
- while (count > 0 && !ret) {
+ while (count >= sizeof(*k) && !ret) {
ret = putreg32(target, pos, *k++);
count -= sizeof(*k);
pos += sizeof(*k);
}
} else {
const compat_ulong_t __user *u = ubuf;
- while (count > 0 && !ret) {
+ while (count >= sizeof(*u) && !ret) {
compat_ulong_t word;
ret = __get_user(word, u++);
if (ret)
@@ -1271,8 +1061,8 @@ static int genregs32_set(struct task_struct *target,
return ret;
}
-long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
- compat_ulong_t caddr, compat_ulong_t cdata)
+static long ia32_arch_ptrace(struct task_struct *child, compat_long_t request,
+ compat_ulong_t caddr, compat_ulong_t cdata)
{
unsigned long addr = caddr;
unsigned long data = cdata;
@@ -1316,26 +1106,18 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */
return copy_regset_to_user(child, &user_x86_32_view,
- REGSET_XFP, 0,
+ REGSET32_XFP, 0,
sizeof(struct user32_fxsr_struct),
datap);
case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */
return copy_regset_from_user(child, &user_x86_32_view,
- REGSET_XFP, 0,
+ REGSET32_XFP, 0,
sizeof(struct user32_fxsr_struct),
datap);
case PTRACE_GET_THREAD_AREA:
case PTRACE_SET_THREAD_AREA:
-#ifdef CONFIG_X86_PTRACE_BTS
- case PTRACE_BTS_CONFIG:
- case PTRACE_BTS_STATUS:
- case PTRACE_BTS_SIZE:
- case PTRACE_BTS_GET:
- case PTRACE_BTS_CLEAR:
- case PTRACE_BTS_DRAIN:
-#endif /* CONFIG_X86_PTRACE_BTS */
return arch_ptrace(child, request, addr, data);
default:
@@ -1344,30 +1126,159 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
return ret;
}
+#endif /* CONFIG_IA32_EMULATION */
+
+#ifdef CONFIG_X86_X32_ABI
+static long x32_arch_ptrace(struct task_struct *child,
+ compat_long_t request, compat_ulong_t caddr,
+ compat_ulong_t cdata)
+{
+ unsigned long addr = caddr;
+ unsigned long data = cdata;
+ void __user *datap = compat_ptr(data);
+ int ret;
+
+ switch (request) {
+ /* Read 32 bits at location addr in the USER area. Only the lower
+ 32 bits of segment and debug registers are returned. */
+ case PTRACE_PEEKUSR: {
+ u32 tmp;
+
+ ret = -EIO;
+ if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user) ||
+ addr < offsetof(struct user_regs_struct, cs))
+ break;
+
+ tmp = 0; /* Default return condition */
+ if (addr < sizeof(struct user_regs_struct))
+ tmp = getreg(child, addr);
+ else if (addr >= offsetof(struct user, u_debugreg[0]) &&
+ addr <= offsetof(struct user, u_debugreg[7])) {
+ addr -= offsetof(struct user, u_debugreg[0]);
+ tmp = ptrace_get_debugreg(child, addr / sizeof(data));
+ }
+ ret = put_user(tmp, (__u32 __user *)datap);
+ break;
+ }
+
+ /* Write the word at location addr in the USER area. Only segment
+ and debug registers may be updated, and the 32-bit value is
+ zero-extended. */
+ case PTRACE_POKEUSR:
+ ret = -EIO;
+ if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user) ||
+ addr < offsetof(struct user_regs_struct, cs))
+ break;
+
+ if (addr < sizeof(struct user_regs_struct))
+ ret = putreg(child, addr, data);
+ else if (addr >= offsetof(struct user, u_debugreg[0]) &&
+ addr <= offsetof(struct user, u_debugreg[7])) {
+ addr -= offsetof(struct user, u_debugreg[0]);
+ ret = ptrace_set_debugreg(child,
+ addr / sizeof(data), data);
+ }
+ break;
+
+ case PTRACE_GETREGS: /* Get all gp regs from the child. */
+ return copy_regset_to_user(child,
+ &user_x86_64_view,
+ REGSET_GENERAL,
+ 0, sizeof(struct user_regs_struct),
+ datap);
+
+ case PTRACE_SETREGS: /* Set all gp regs in the child. */
+ return copy_regset_from_user(child,
+ &user_x86_64_view,
+ REGSET_GENERAL,
+ 0, sizeof(struct user_regs_struct),
+ datap);
+
+ case PTRACE_GETFPREGS: /* Get the child FPU state. */
+ return copy_regset_to_user(child,
+ &user_x86_64_view,
+ REGSET_FP,
+ 0, sizeof(struct user_i387_struct),
+ datap);
+
+ case PTRACE_SETFPREGS: /* Set the child FPU state. */
+ return copy_regset_from_user(child,
+ &user_x86_64_view,
+ REGSET_FP,
+ 0, sizeof(struct user_i387_struct),
+ datap);
+
+ default:
+ return compat_ptrace_request(child, request, addr, data);
+ }
+
+ return ret;
+}
+#endif
-#endif /* CONFIG_IA32_EMULATION */
+#ifdef CONFIG_COMPAT
+long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
+ compat_ulong_t caddr, compat_ulong_t cdata)
+{
+#ifdef CONFIG_X86_X32_ABI
+ if (!in_ia32_syscall())
+ return x32_arch_ptrace(child, request, caddr, cdata);
+#endif
+#ifdef CONFIG_IA32_EMULATION
+ return ia32_arch_ptrace(child, request, caddr, cdata);
+#else
+ return 0;
+#endif
+}
+#endif /* CONFIG_COMPAT */
#ifdef CONFIG_X86_64
-static const struct user_regset x86_64_regsets[] = {
- [REGSET_GENERAL] = {
- .core_note_type = NT_PRSTATUS,
- .n = sizeof(struct user_regs_struct) / sizeof(long),
- .size = sizeof(long), .align = sizeof(long),
- .get = genregs_get, .set = genregs_set
+static struct user_regset x86_64_regsets[] __ro_after_init = {
+ [REGSET64_GENERAL] = {
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
+ .n = sizeof(struct user_regs_struct) / sizeof(long),
+ .size = sizeof(long),
+ .align = sizeof(long),
+ .regset_get = genregs_get,
+ .set = genregs_set
+ },
+ [REGSET64_FP] = {
+ USER_REGSET_NOTE_TYPE(PRFPREG),
+ .n = sizeof(struct fxregs_state) / sizeof(long),
+ .size = sizeof(long),
+ .align = sizeof(long),
+ .active = regset_xregset_fpregs_active,
+ .regset_get = xfpregs_get,
+ .set = xfpregs_set
},
- [REGSET_FP] = {
- .core_note_type = NT_PRFPREG,
- .n = sizeof(struct user_i387_struct) / sizeof(long),
- .size = sizeof(long), .align = sizeof(long),
- .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
+ [REGSET64_XSTATE] = {
+ USER_REGSET_NOTE_TYPE(X86_XSTATE),
+ .size = sizeof(u64),
+ .align = sizeof(u64),
+ .active = xstateregs_active,
+ .regset_get = xstateregs_get,
+ .set = xstateregs_set
},
- [REGSET_IOPERM64] = {
- .core_note_type = NT_386_IOPERM,
- .n = IO_BITMAP_LONGS,
- .size = sizeof(long), .align = sizeof(long),
- .active = ioperm_active, .get = ioperm_get
+ [REGSET64_IOPERM] = {
+ USER_REGSET_NOTE_TYPE(386_IOPERM),
+ .n = IO_BITMAP_LONGS,
+ .size = sizeof(long),
+ .align = sizeof(long),
+ .active = ioperm_active,
+ .regset_get = ioperm_get
},
+#ifdef CONFIG_X86_USER_SHADOW_STACK
+ [REGSET64_SSP] = {
+ USER_REGSET_NOTE_TYPE(X86_SHSTK),
+ .n = 1,
+ .size = sizeof(u64),
+ .align = sizeof(u64),
+ .active = ssp_active,
+ .regset_get = ssp_get,
+ .set = ssp_set
+ },
+#endif
};
static const struct user_regset_view user_x86_64_view = {
@@ -1381,44 +1292,61 @@ static const struct user_regset_view user_x86_64_view = {
#define genregs32_get genregs_get
#define genregs32_set genregs_set
-#define user_i387_ia32_struct user_i387_struct
-#define user32_fxsr_struct user_fxsr_struct
-
#endif /* CONFIG_X86_64 */
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
-static const struct user_regset x86_32_regsets[] = {
- [REGSET_GENERAL] = {
- .core_note_type = NT_PRSTATUS,
- .n = sizeof(struct user_regs_struct32) / sizeof(u32),
- .size = sizeof(u32), .align = sizeof(u32),
- .get = genregs32_get, .set = genregs32_set
+static struct user_regset x86_32_regsets[] __ro_after_init = {
+ [REGSET32_GENERAL] = {
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
+ .n = sizeof(struct user_regs_struct32) / sizeof(u32),
+ .size = sizeof(u32),
+ .align = sizeof(u32),
+ .regset_get = genregs32_get,
+ .set = genregs32_set
+ },
+ [REGSET32_FP] = {
+ USER_REGSET_NOTE_TYPE(PRFPREG),
+ .n = sizeof(struct user_i387_ia32_struct) / sizeof(u32),
+ .size = sizeof(u32),
+ .align = sizeof(u32),
+ .active = regset_fpregs_active,
+ .regset_get = fpregs_get,
+ .set = fpregs_set
},
- [REGSET_FP] = {
- .core_note_type = NT_PRFPREG,
- .n = sizeof(struct user_i387_ia32_struct) / sizeof(u32),
- .size = sizeof(u32), .align = sizeof(u32),
- .active = fpregs_active, .get = fpregs_get, .set = fpregs_set
+ [REGSET32_XFP] = {
+ USER_REGSET_NOTE_TYPE(PRXFPREG),
+ .n = sizeof(struct fxregs_state) / sizeof(u32),
+ .size = sizeof(u32),
+ .align = sizeof(u32),
+ .active = regset_xregset_fpregs_active,
+ .regset_get = xfpregs_get,
+ .set = xfpregs_set
},
- [REGSET_XFP] = {
- .core_note_type = NT_PRXFPREG,
- .n = sizeof(struct user32_fxsr_struct) / sizeof(u32),
- .size = sizeof(u32), .align = sizeof(u32),
- .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
+ [REGSET32_XSTATE] = {
+ USER_REGSET_NOTE_TYPE(X86_XSTATE),
+ .size = sizeof(u64),
+ .align = sizeof(u64),
+ .active = xstateregs_active,
+ .regset_get = xstateregs_get,
+ .set = xstateregs_set
},
- [REGSET_TLS] = {
- .core_note_type = NT_386_TLS,
- .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN,
- .size = sizeof(struct user_desc),
- .align = sizeof(struct user_desc),
- .active = regset_tls_active,
- .get = regset_tls_get, .set = regset_tls_set
+ [REGSET32_TLS] = {
+ USER_REGSET_NOTE_TYPE(386_TLS),
+ .n = GDT_ENTRY_TLS_ENTRIES,
+ .bias = GDT_ENTRY_TLS_MIN,
+ .size = sizeof(struct user_desc),
+ .align = sizeof(struct user_desc),
+ .active = regset_tls_active,
+ .regset_get = regset_tls_get,
+ .set = regset_tls_set
},
- [REGSET_IOPERM32] = {
- .core_note_type = NT_386_IOPERM,
- .n = IO_BITMAP_BYTES / sizeof(u32),
- .size = sizeof(u32), .align = sizeof(u32),
- .active = ioperm_active, .get = ioperm_get
+ [REGSET32_IOPERM] = {
+ USER_REGSET_NOTE_TYPE(386_IOPERM),
+ .n = IO_BITMAP_BYTES / sizeof(u32),
+ .size = sizeof(u32),
+ .align = sizeof(u32),
+ .active = ioperm_active,
+ .regset_get = ioperm_get
},
};
@@ -1428,10 +1356,46 @@ static const struct user_regset_view user_x86_32_view = {
};
#endif
+/*
+ * This represents bytes 464..511 in the memory layout exported through
+ * the REGSET_XSTATE interface.
+ */
+u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
+
+void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask)
+{
+#ifdef CONFIG_X86_64
+ x86_64_regsets[REGSET64_XSTATE].n = size / sizeof(u64);
+#endif
+#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
+ x86_32_regsets[REGSET32_XSTATE].n = size / sizeof(u64);
+#endif
+ xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask;
+}
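
/*
 * Hedged sketch of how a tracer consumes the regset whose size is patched in
 * update_regset_xstate_info(): read NT_X86_XSTATE and pull the XCR0 mask out
 * of the software-usable bytes 464..511 of the legacy FXSAVE area. The buffer
 * size below is an illustrative assumption.
 */
#include <elf.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/uio.h>

static int read_xcr0(pid_t pid, uint64_t *xcr0)
{
	unsigned char buf[16384];	/* assumed large enough for the xstate image */
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };

	if (ptrace(PTRACE_GETREGSET, pid, (void *)NT_X86_XSTATE, &iov) == -1) {
		perror("PTRACE_GETREGSET NT_X86_XSTATE");
		return -1;
	}
	/* Word 0 of the sw-usable region (byte offset 464) carries the XCR0 mask. */
	memcpy(xcr0, buf + 464, sizeof(*xcr0));
	return 0;
}
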
+
+/*
+ * This is used by the core dump code to decide which regset to dump. The
+ * core dump code writes out the resulting .e_machine and the corresponding
+ * regsets. This is suboptimal if the task is messing around with its CS.L
+ * field, but at worst the core dump will end up missing some information.
+ *
+ * Unfortunately, it is also used by the broken PTRACE_GETREGSET and
+ * PTRACE_SETREGSET APIs. These APIs look at the .regsets field but have
+ * no way to make sure that the e_machine they use matches the caller's
+ * expectations. The result is that the data format returned by
+ * PTRACE_GETREGSET depends on the returned CS field (and even the offset
+ * of the returned CS field depends on its value!) and the data format
+ * accepted by PTRACE_SETREGSET is determined by the old CS value. The
+ * upshot is that it is basically impossible to use these APIs correctly.
+ *
+ * The best way to fix it in the long run would probably be to add new
+ * improved ptrace() APIs to read and write registers reliably, possibly by
+ * allowing userspace to select the ELF e_machine variant that they expect.
+ */
const struct user_regset_view *task_user_regset_view(struct task_struct *task)
{
#ifdef CONFIG_IA32_EMULATION
- if (test_tsk_thread_flag(task, TIF_IA32))
+ if (!user_64bit_mode(task_pt_regs(task)))
#endif
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
return &user_x86_32_view;
@@ -1441,108 +1405,19 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task)
#endif
}
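
/*
 * The usual tracer-side workaround for the ambiguity described above:
 * request NT_PRSTATUS into a union of both layouts and use the returned
 * iov_len to tell which one the kernel actually filled in. Names below are
 * illustrative; the 32-bit layout mirrors struct user_regs_struct32.
 */
#include <elf.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <sys/user.h>

struct i386_user_regs {		/* 17 * 4 = 68 bytes, as in user_regs_struct32 */
	unsigned int ebx, ecx, edx, esi, edi, ebp, eax;
	unsigned int ds, es, fs, gs, orig_eax;
	unsigned int eip, cs, eflags, esp, ss;
};

static int get_gp_regs(pid_t pid)
{
	union {
		struct user_regs_struct native;	/* 64-bit layout */
		struct i386_user_regs compat;	/* 32-bit layout */
	} regs;
	struct iovec iov = { .iov_base = &regs, .iov_len = sizeof(regs) };

	if (ptrace(PTRACE_GETREGSET, pid, (void *)NT_PRSTATUS, &iov) == -1) {
		perror("PTRACE_GETREGSET NT_PRSTATUS");
		return -1;
	}
	if (iov.iov_len == sizeof(regs.compat))
		printf("32-bit tracee, eip=%#x\n", regs.compat.eip);
	else
		printf("64-bit tracee, rip=%#llx\n",
		       (unsigned long long)regs.native.rip);
	return 0;
}
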
-void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
- int error_code, int si_code)
+void send_sigtrap(struct pt_regs *regs, int error_code, int si_code)
{
- struct siginfo info;
+ struct task_struct *tsk = current;
- tsk->thread.trap_no = 1;
+ tsk->thread.trap_nr = X86_TRAP_DB;
tsk->thread.error_code = error_code;
- memset(&info, 0, sizeof(info));
- info.si_signo = SIGTRAP;
- info.si_code = si_code;
-
- /* User-mode ip? */
- info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL;
-
/* Send us the fake SIGTRAP */
- force_sig_info(SIGTRAP, &info, tsk);
+ force_sig_fault(SIGTRAP, si_code,
+ user_mode(regs) ? (void __user *)regs->ip : NULL);
}
-
-#ifdef CONFIG_X86_32
-# define IS_IA32 1
-#elif defined CONFIG_IA32_EMULATION
-# define IS_IA32 is_compat_task()
-#else
-# define IS_IA32 0
-#endif
-
-/*
- * We must return the syscall number to actually look up in the table.
- * This can be -1L to skip running any syscall at all.
- */
-asmregparm long syscall_trace_enter(struct pt_regs *regs)
+void user_single_step_report(struct pt_regs *regs)
{
- long ret = 0;
-
- /*
- * If we stepped into a sysenter/syscall insn, it trapped in
- * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
- * If user-mode had set TF itself, then it's still clear from
- * do_debug() and we need to set it again to restore the user
- * state. If we entered on the slow path, TF was already set.
- */
- if (test_thread_flag(TIF_SINGLESTEP))
- regs->flags |= X86_EFLAGS_TF;
-
- /* do the secure computing check first */
- secure_computing(regs->orig_ax);
-
- if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
- ret = -1L;
-
- if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
- tracehook_report_syscall_entry(regs))
- ret = -1L;
-
- if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE)))
- ftrace_syscall_enter(regs);
-
- if (unlikely(current->audit_context)) {
- if (IS_IA32)
- audit_syscall_entry(AUDIT_ARCH_I386,
- regs->orig_ax,
- regs->bx, regs->cx,
- regs->dx, regs->si);
-#ifdef CONFIG_X86_64
- else
- audit_syscall_entry(AUDIT_ARCH_X86_64,
- regs->orig_ax,
- regs->di, regs->si,
- regs->dx, regs->r10);
-#endif
- }
-
- return ret ?: regs->orig_ax;
-}
-
-asmregparm void syscall_trace_leave(struct pt_regs *regs)
-{
- if (unlikely(current->audit_context))
- audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
-
- if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE)))
- ftrace_syscall_exit(regs);
-
- if (test_thread_flag(TIF_SYSCALL_TRACE))
- tracehook_report_syscall_exit(regs, 0);
-
- /*
- * If TIF_SYSCALL_EMU is set, we only get here because of
- * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
- * We already reported this syscall instruction in
- * syscall_trace_enter(), so don't do any more now.
- */
- if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
- return;
-
- /*
- * If we are single-stepping, synthesize a trap to follow the
- * system call instruction.
- */
- if (test_thread_flag(TIF_SINGLESTEP) &&
- tracehook_consider_fatal_signal(current, SIGTRAP))
- send_sigtrap(current, regs, 0, TRAP_BRKPT);
+ send_sigtrap(regs, 0, TRAP_BRKPT);
}
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 03801f2f761f..b3f81379c2fc 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -1,100 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* paravirtual clock -- common code used by kvm/xen
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#include <linux/clocksource.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
-#include <asm/pvclock.h>
-
-/*
- * These are perodically updated
- * xen: magic shared_info page
- * kvm: gpa registered via msr
- * and then copied here.
- */
-struct pvclock_shadow_time {
- u64 tsc_timestamp; /* TSC at last update of time vals. */
- u64 system_timestamp; /* Time, in nanosecs, since boot. */
- u32 tsc_to_nsec_mul;
- int tsc_shift;
- u32 version;
-};
+#include <linux/notifier.h>
+#include <linux/sched.h>
+#include <linux/gfp.h>
+#include <linux/memblock.h>
+#include <linux/nmi.h>
-/*
- * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
- * yielding a 64-bit result.
- */
-static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
-{
- u64 product;
-#ifdef __i386__
- u32 tmp1, tmp2;
-#endif
-
- if (shift < 0)
- delta >>= -shift;
- else
- delta <<= shift;
-
-#ifdef __i386__
- __asm__ (
- "mul %5 ; "
- "mov %4,%%eax ; "
- "mov %%edx,%4 ; "
- "mul %5 ; "
- "xor %5,%5 ; "
- "add %4,%%eax ; "
- "adc %5,%%edx ; "
- : "=A" (product), "=r" (tmp1), "=r" (tmp2)
- : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
-#elif defined(__x86_64__)
- __asm__ (
- "mul %%rdx ; shrd $32,%%rdx,%%rax"
- : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
-#else
-#error implement me!
-#endif
-
- return product;
-}
+#include <asm/fixmap.h>
+#include <asm/pvclock.h>
+#include <asm/vgtod.h>
-static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
-{
- u64 delta = native_read_tsc() - shadow->tsc_timestamp;
- return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
-}
+static u8 valid_flags __read_mostly = 0;
+static struct pvclock_vsyscall_time_info *pvti_cpu0_va __read_mostly;
-/*
- * Reads a consistent set of time-base values from hypervisor,
- * into a shadow data area.
- */
-static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
- struct pvclock_vcpu_time_info *src)
+void pvclock_set_flags(u8 flags)
{
- do {
- dst->version = src->version;
- rmb(); /* fetch version before data */
- dst->tsc_timestamp = src->tsc_timestamp;
- dst->system_timestamp = src->system_time;
- dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
- dst->tsc_shift = src->tsc_shift;
- rmb(); /* test version after fetching data */
- } while ((src->version & 1) || (dst->version != src->version));
-
- return dst->version;
+ valid_flags = flags;
}
unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
@@ -109,45 +36,131 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
return pv_tsc_khz;
}
-cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
+void pvclock_touch_watchdogs(void)
+{
+ touch_softlockup_watchdog_sync();
+ clocksource_touch_watchdog();
+ rcu_cpu_stall_reset();
+ reset_hung_task_detector();
+}
+
+static atomic64_t last_value = ATOMIC64_INIT(0);
+
+void pvclock_resume(void)
+{
+ atomic64_set(&last_value, 0);
+}
+
+u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src)
{
- struct pvclock_shadow_time shadow;
unsigned version;
- cycle_t ret, offset;
+ u8 flags;
do {
- version = pvclock_get_time_values(&shadow, src);
- barrier();
- offset = pvclock_get_nsec_offset(&shadow);
- ret = shadow.system_timestamp + offset;
- barrier();
- } while (version != src->version);
+ version = pvclock_read_begin(src);
+ flags = src->flags;
+ } while (pvclock_read_retry(src, version));
+
+ return flags & valid_flags;
+}
+
+static __always_inline
+u64 __pvclock_clocksource_read(struct pvclock_vcpu_time_info *src, bool dowd)
+{
+ unsigned version;
+ u64 ret;
+ u64 last;
+ u8 flags;
+
+ do {
+ version = pvclock_read_begin(src);
+ ret = __pvclock_read_cycles(src, rdtsc_ordered());
+ flags = src->flags;
+ } while (pvclock_read_retry(src, version));
+
+ if (dowd && unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) {
+ src->flags &= ~PVCLOCK_GUEST_STOPPED;
+ pvclock_touch_watchdogs();
+ }
+
+ if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
+ (flags & PVCLOCK_TSC_STABLE_BIT))
+ return ret;
+
+ /*
+ * The assumption here is that last_value, a global accumulator, always
+ * goes forward. If our reading is below it, it should only be below by a
+ * small error margin, so clamping to last_value does not sacrifice
+ * accuracy.
+ *
+ * For reads: global may have changed between test and return,
+ * but this means someone else poked the clock at a later time.
+ * We just need to make sure we are not seeing a backwards event.
+ *
+ * For updates: last_value = ret is not enough, since two vcpus could be
+ * updating at the same time, and one of them could be slightly behind,
+ * which breaks the assumption that last_value always goes forward.
+ */
+ last = raw_atomic64_read(&last_value);
+ do {
+ if (ret <= last)
+ return last;
+ } while (!raw_atomic64_try_cmpxchg(&last_value, &last, ret));
return ret;
}
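
/*
 * Userspace model of the last_value clamp above, with C11 atomics standing in
 * for raw_atomic64_try_cmpxchg(). Readers never return a value smaller than
 * the largest one already observed, which is the backwards-event protection
 * the comment describes. This is a sketch, not the kernel implementation.
 */
#include <stdatomic.h>
#include <stdint.h>

static _Atomic uint64_t last_value;

static uint64_t clamp_monotonic(uint64_t ret)
{
	uint64_t last = atomic_load(&last_value);

	do {
		if (ret <= last)
			return last;	/* someone already published a later time */
	} while (!atomic_compare_exchange_weak(&last_value, &last, ret));

	return ret;
}
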
+u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
+{
+ return __pvclock_clocksource_read(src, true);
+}
+
+noinstr u64 pvclock_clocksource_read_nowd(struct pvclock_vcpu_time_info *src)
+{
+ return __pvclock_clocksource_read(src, false);
+}
+
void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
struct pvclock_vcpu_time_info *vcpu_time,
- struct timespec *ts)
+ struct timespec64 *ts)
{
u32 version;
u64 delta;
- struct timespec now;
+ struct timespec64 now;
/* get wallclock at system boot */
do {
version = wall_clock->version;
rmb(); /* fetch version before time */
+ /*
+ * Note: wall_clock->sec is a u32 value, so it can
+ * only store dates between 1970 and 2106. To allow
+ * times beyond that, we need to create a new hypercall
+ * interface with an extended pvclock_wall_clock structure
+ * like ARM has.
+ */
now.tv_sec = wall_clock->sec;
now.tv_nsec = wall_clock->nsec;
rmb(); /* fetch time before checking version */
} while ((wall_clock->version & 1) || (version != wall_clock->version));
delta = pvclock_clocksource_read(vcpu_time); /* time since system boot */
- delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
+ delta += now.tv_sec * NSEC_PER_SEC + now.tv_nsec;
now.tv_nsec = do_div(delta, NSEC_PER_SEC);
now.tv_sec = delta;
- set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
+ set_normalized_timespec64(ts, now.tv_sec, now.tv_nsec);
+}
+
+void pvclock_set_pvti_cpu0_va(struct pvclock_vsyscall_time_info *pvti)
+{
+ WARN_ON(vclock_was_used(VDSO_CLOCKMODE_PVCLOCK));
+ pvti_cpu0_va = pvti;
+}
+
+struct pvclock_vsyscall_time_info *pvclock_get_pvti_cpu0_va(void)
+{
+ return pvti_cpu0_va;
}
+EXPORT_SYMBOL_GPL(pvclock_get_pvti_cpu0_va);
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index af71d06624bf..a92f18db9610 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -1,16 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* This file contains work-arounds for x86 and x86_64 platform bugs.
*/
+#include <linux/dmi.h>
#include <linux/pci.h>
#include <linux/irq.h>
#include <asm/hpet.h>
+#include <asm/setup.h>
+#include <asm/mce.h>
+
+#include <linux/platform_data/x86/apple.h>
#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
-static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
+static void quirk_intel_irqbalance(struct pci_dev *dev)
{
- u8 config, rev;
+ u8 config;
u16 word;
/* BIOS may enable hardware IRQ balancing for
@@ -18,8 +24,7 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
* based platforms.
* Disable SW irqbalance/affinity on those platforms.
*/
- pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
- if (rev > 0x9)
+ if (dev->revision > 0x9)
return;
/* enable access to config space*/
@@ -88,14 +93,12 @@ static void ich_force_hpet_resume(void)
BUG();
else
printk(KERN_DEBUG "Force enabled HPET at resume\n");
-
- return;
}
static void ich_force_enable_hpet(struct pci_dev *dev)
{
u32 val;
- u32 uninitialized_var(rcba);
+ u32 rcba;
int err = 0;
if (hpet_address || force_hpet_address)
@@ -110,7 +113,7 @@ static void ich_force_enable_hpet(struct pci_dev *dev)
}
/* use bits 31:14, 16 kB aligned */
- rcba_base = ioremap_nocache(rcba, 0x4000);
+ rcba_base = ioremap(rcba, 0x4000);
if (rcba_base == NULL) {
dev_printk(KERN_DEBUG, &dev->dev, "ioremap failed; "
"cannot force enable HPET\n");
@@ -185,7 +188,7 @@ static void hpet_print_force_info(void)
static void old_ich_force_hpet_resume(void)
{
u32 val;
- u32 uninitialized_var(gen_cntl);
+ u32 gen_cntl;
if (!force_hpet_address || !cached_dev)
return;
@@ -207,7 +210,7 @@ static void old_ich_force_hpet_resume(void)
static void old_ich_force_enable_hpet(struct pci_dev *dev)
{
u32 val;
- u32 uninitialized_var(gen_cntl);
+ u32 gen_cntl;
if (hpet_address || force_hpet_address)
return;
@@ -298,7 +301,7 @@ static void vt8237_force_hpet_resume(void)
static void vt8237_force_enable_hpet(struct pci_dev *dev)
{
- u32 uninitialized_var(val);
+ u32 val;
if (hpet_address || force_hpet_address)
return;
@@ -344,6 +347,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235,
vt8237_force_enable_hpet);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237,
vt8237_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_CX700,
+ vt8237_force_enable_hpet);
static void ati_force_hpet_resume(void)
{
@@ -353,18 +358,22 @@ static void ati_force_hpet_resume(void)
static u32 ati_ixp4x0_rev(struct pci_dev *dev)
{
- u32 d;
- u8 b;
+ int err = 0;
+ u32 d = 0;
+ u8 b = 0;
- pci_read_config_byte(dev, 0xac, &b);
+ err = pci_read_config_byte(dev, 0xac, &b);
b &= ~(1<<5);
- pci_write_config_byte(dev, 0xac, b);
- pci_read_config_dword(dev, 0x70, &d);
+ err |= pci_write_config_byte(dev, 0xac, b);
+ err |= pci_read_config_dword(dev, 0x70, &d);
d |= 1<<8;
- pci_write_config_dword(dev, 0x70, d);
- pci_read_config_dword(dev, 0x8, &d);
+ err |= pci_write_config_dword(dev, 0x70, d);
+ err |= pci_read_config_dword(dev, 0x8, &d);
d &= 0xff;
dev_printk(KERN_DEBUG, &dev->dev, "SB4X0 revision 0x%x\n", d);
+
+ WARN_ON_ONCE(err);
+
return d;
}
@@ -423,7 +432,7 @@ static void nvidia_force_hpet_resume(void)
static void nvidia_force_enable_hpet(struct pci_dev *dev)
{
- u32 uninitialized_var(val);
+ u32 val;
if (hpet_address || force_hpet_address)
return;
@@ -440,7 +449,6 @@ static void nvidia_force_enable_hpet(struct pci_dev *dev)
dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n",
force_hpet_address);
cached_dev = dev;
- return;
}
/* ISA Bridges */
@@ -491,14 +499,48 @@ void force_hpet_resume(void)
break;
}
}
+
+/*
+ * According to the datasheet, e6xx systems have the HPET hardwired to
+ * 0xfed00000.
+ */
+static void e6xx_force_enable_hpet(struct pci_dev *dev)
+{
+ if (hpet_address || force_hpet_address)
+ return;
+
+ force_hpet_address = 0xFED00000;
+ force_hpet_resume_type = NONE_FORCE_HPET_RESUME;
+ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
+ "0x%lx\n", force_hpet_address);
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E6XX_CU,
+ e6xx_force_enable_hpet);
+
+/*
+ * HPET MSI on some boards (ATI SB700/SB800) has a side effect on
+ * floppy DMA. Disable HPET MSI on such platforms.
+ * See erratum #27 (Misinterpreted MSI Requests May Result in
+ * Corrupted LPC DMA Data) in AMD Publication #46837,
+ * "SB700 Family Product Errata", Rev. 1.0, March 2010.
+ */
+static void force_disable_hpet_msi(struct pci_dev *unused)
+{
+ hpet_msi_disable = true;
+}
+
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
+ force_disable_hpet_msi);
+
#endif
#if defined(CONFIG_PCI) && defined(CONFIG_NUMA)
/* Set correct numa_node information for AMD NB functions */
-static void __init quirk_amd_nb_node(struct pci_dev *dev)
+static void quirk_amd_nb_node(struct pci_dev *dev)
{
struct pci_dev *nb_ht;
unsigned int devfn;
+ u32 node;
u32 val;
devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0);
@@ -507,8 +549,14 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev)
return;
pci_read_config_dword(nb_ht, 0x60, &val);
- set_dev_node(&dev->dev, val & 7);
- pci_dev_put(dev);
+ node = pcibus_to_node(dev->bus) | (val & 7);
+ /*
+ * Some hardware may return an invalid node ID,
+ * so check it first:
+ */
+ if (node_online(node))
+ set_dev_node(&dev->dev, node);
+ pci_dev_put(nb_ht);
}
DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
@@ -529,4 +577,95 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC,
quirk_amd_nb_node);
DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK,
quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F0,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F1,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F2,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F5,
+ quirk_amd_nb_node);
+
+#endif
+
+#ifdef CONFIG_PCI
+/*
+ * The processor does not ensure that the DRAM scrub read/write sequence
+ * is atomic with respect to accesses to the CC6 save state area. Therefore,
+ * if a concurrent scrub read/write access hits the same address, the
+ * entry may appear as if it was not written. This quirk
+ * applies to Fam16h models 00h-0Fh.
+ *
+ * See "Revision Guide" for AMD F16h models 00h-0fh,
+ * document 51810 rev. 3.04, Nov 2013
+ */
+static void amd_disable_seq_and_redirect_scrub(struct pci_dev *dev)
+{
+ u32 val;
+
+ /*
+ * Suggested workaround:
+ * set D18F3x58[4:0] = 00h and set D18F3x5C[0] = 0b
+ */
+ pci_read_config_dword(dev, 0x58, &val);
+ if (val & 0x1F) {
+ val &= ~(0x1F);
+ pci_write_config_dword(dev, 0x58, val);
+ }
+
+ pci_read_config_dword(dev, 0x5C, &val);
+ if (val & BIT(0)) {
+ val &= ~BIT(0);
+ pci_write_config_dword(dev, 0x5c, val);
+ }
+}
+
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3,
+ amd_disable_seq_and_redirect_scrub);
+
+/* Ivy Bridge, Haswell, Broadwell */
+static void quirk_intel_brickland_xeon_ras_cap(struct pci_dev *pdev)
+{
+ u32 capid0;
+
+ pci_read_config_dword(pdev, 0x84, &capid0);
+
+ if (capid0 & 0x10)
+ enable_copy_mc_fragile();
+}
+
+/* Skylake */
+static void quirk_intel_purley_xeon_ras_cap(struct pci_dev *pdev)
+{
+ u32 capid0, capid5;
+
+ pci_read_config_dword(pdev, 0x84, &capid0);
+ pci_read_config_dword(pdev, 0x98, &capid5);
+
+ /*
+ * CAPID0{7:6} indicate whether this is an advanced RAS SKU
+ * CAPID5{8:5} indicate that various NVDIMM usage modes are
+ * enabled, so memory machine check recovery is also enabled.
+ */
+ if ((capid0 & 0xc0) == 0xc0 || (capid5 & 0x1e0))
+ enable_copy_mc_fragile();
+
+}
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x0ec3, quirk_intel_brickland_xeon_ras_cap);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, quirk_intel_brickland_xeon_ras_cap);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, quirk_intel_brickland_xeon_ras_cap);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2083, quirk_intel_purley_xeon_ras_cap);
#endif
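For context, a hedged sketch of the kind of consumer that enable_copy_mc_fragile() feeds: a copy helper that only takes the exception-aware ("fragile") path when one of the RAS quirks above fired. The flag and helper names below are assumptions for illustration, not definitions from this patch.

/* Hypothetical sketch -- flag and helper names are assumptions. */
#include <linux/types.h>
#include <linux/string.h>

static bool example_copy_mc_fragile;	/* would be set by enable_copy_mc_fragile() */

static unsigned long example_copy_mc(void *dst, const void *src, unsigned int len)
{
	if (example_copy_mc_fragile) {
		/* take the slower copy variant that can recover from #MC here */
		return 0;
	}

	memcpy(dst, src, len);
	return 0;
}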
+
+bool x86_apple_machine;
+EXPORT_SYMBOL(x86_apple_machine);
+
+void __init early_platform_quirks(void)
+{
+ x86_apple_machine = dmi_match(DMI_SYS_VENDOR, "Apple Inc.") ||
+ dmi_match(DMI_SYS_VENDOR, "Apple Computer, Inc.");
+}
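As a reading aid, a minimal sketch of how later platform code might key off the x86_apple_machine flag initialised above; the consuming function is hypothetical, only the flag comes from this patch.

/* Hypothetical consumer sketch -- only x86_apple_machine is real here. */
#include <linux/printk.h>

extern bool x86_apple_machine;

static void example_apply_apple_quirk(void)
{
	/* Apple boards frequently need the CF9 reboot path; see the DMI table below. */
	if (x86_apple_machine)
		pr_info("Apple machine detected, applying platform quirks\n");
}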
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index a06e8d101844..6032fa9ec753 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -1,28 +1,38 @@
-#include <linux/module.h>
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/export.h>
#include <linux/reboot.h>
#include <linux/init.h>
#include <linux/pm.h>
#include <linux/efi.h>
#include <linux/dmi.h>
+#include <linux/sched.h>
+#include <linux/tboot.h>
+#include <linux/delay.h>
+#include <linux/objtool.h>
+#include <linux/pgtable.h>
+#include <linux/kexec.h>
+#include <linux/kvm_types.h>
#include <acpi/reboot.h>
#include <asm/io.h>
#include <asm/apic.h>
+#include <asm/io_apic.h>
#include <asm/desc.h>
#include <asm/hpet.h>
-#include <asm/pgtable.h>
#include <asm/proto.h>
#include <asm/reboot_fixups.h>
#include <asm/reboot.h>
#include <asm/pci_x86.h>
-#include <asm/virtext.h>
#include <asm/cpu.h>
+#include <asm/nmi.h>
+#include <asm/smp.h>
-#ifdef CONFIG_X86_32
-# include <linux/ctype.h>
-# include <linux/mc146818rtc.h>
-#else
-# include <asm/iommu.h>
-#endif
+#include <linux/ctype.h>
+#include <linux/mc146818rtc.h>
+#include <asm/realmode.h>
+#include <asm/x86_init.h>
+#include <asm/efi.h>
/*
* Power off function, if any
@@ -30,16 +40,8 @@
void (*pm_power_off)(void);
EXPORT_SYMBOL(pm_power_off);
-static const struct desc_ptr no_idt = {};
-static int reboot_mode;
-enum reboot_type reboot_type = BOOT_KBD;
-int reboot_force;
-
-#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
-static int reboot_cpu = -1;
-#endif
-
-/* This is set if we need to go through the 'emergency' path.
+/*
+ * This is set if we need to go through the 'emergency' path.
 * When machine_emergency_restart() is called, we may be in
 * an inconsistent state and won't be able to do a clean cleanup
*/
@@ -48,91 +50,242 @@ static int reboot_emergency;
/* This is set by the PCI code if either type 1 or type 2 PCI is detected */
bool port_cf9_safe = false;
-/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci]
- warm Don't set the cold reboot flag
- cold Set the cold reboot flag
- bios Reboot by jumping through the BIOS (only for X86_32)
- smp Reboot by executing reset on BSP or other CPU (only for X86_32)
- triple Force a triple fault (init)
- kbd Use the keyboard controller. cold reset (default)
- acpi Use the RESET_REG in the FADT
- efi Use efi reset_system runtime service
- pci Use the so-called "PCI reset register", CF9
- force Avoid anything that could hang.
+/*
+ * Reboot options and system auto-detection code provided by
+ * Dell Inc. so their systems "just work". :-)
*/
-static int __init reboot_setup(char *str)
-{
- for (;;) {
- switch (*str) {
- case 'w':
- reboot_mode = 0x1234;
- break;
- case 'c':
- reboot_mode = 0;
- break;
-
-#ifdef CONFIG_X86_32
-#ifdef CONFIG_SMP
- case 's':
- if (isdigit(*(str+1))) {
- reboot_cpu = (int) (*(str+1) - '0');
- if (isdigit(*(str+2)))
- reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0');
- }
- /* we will leave sorting out the final value
- when we are ready to reboot, since we might not
- have set up boot_cpu_id or smp_num_cpu */
- break;
-#endif /* CONFIG_SMP */
-
- case 'b':
-#endif
- case 'a':
- case 'k':
- case 't':
- case 'e':
- case 'p':
- reboot_type = *str;
- break;
+/*
+ * Some machines require the "reboot=a" command line option
+ */
+static int __init set_acpi_reboot(const struct dmi_system_id *d)
+{
+ if (reboot_type != BOOT_ACPI) {
+ reboot_type = BOOT_ACPI;
+ pr_info("%s series board detected. Selecting %s-method for reboots.\n",
+ d->ident, "ACPI");
+ }
+ return 0;
+}
- case 'f':
- reboot_force = 1;
- break;
- }
+/*
+ * Some machines require the "reboot=b" or "reboot=k" commandline options,
+ * this quirk makes that automatic.
+ */
+static int __init set_bios_reboot(const struct dmi_system_id *d)
+{
+ if (reboot_type != BOOT_BIOS) {
+ reboot_type = BOOT_BIOS;
+ pr_info("%s series board detected. Selecting %s-method for reboots.\n",
+ d->ident, "BIOS");
+ }
+ return 0;
+}
- str = strchr(str, ',');
- if (str)
- str++;
- else
- break;
+/*
+ * Some machines don't handle the default ACPI reboot method and
+ * require the EFI reboot method:
+ */
+static int __init set_efi_reboot(const struct dmi_system_id *d)
+{
+ if (reboot_type != BOOT_EFI && !efi_runtime_disabled()) {
+ reboot_type = BOOT_EFI;
+ pr_info("%s series board detected. Selecting EFI-method for reboot.\n", d->ident);
}
- return 1;
+ return 0;
}
-__setup("reboot=", reboot_setup);
+void __noreturn machine_real_restart(unsigned int type)
+{
+ local_irq_disable();
+
+ /*
+ * Write zero to CMOS register number 0x0f, which the BIOS POST
+ * routine will recognize as telling it to do a proper reboot. (Well
+ * that's what this book in front of me says -- it may only apply to
+ * the Phoenix BIOS though, it's not clear). At the same time,
+ * disable NMIs by setting the top bit in the CMOS address register,
+ * as we're about to do peculiar things to the CPU. I'm not sure if
+ * `outb_p' is needed instead of just `outb'. Use it to be on the
+ * safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.)
+ */
+ spin_lock(&rtc_lock);
+ CMOS_WRITE(0x00, 0x8f);
+ spin_unlock(&rtc_lock);
+ /*
+ * Switch to the trampoline page table.
+ */
+ load_trampoline_pgtable();
+ /* Jump to the identity-mapped low memory code */
#ifdef CONFIG_X86_32
-/*
- * Reboot options and system auto-detection code provided by
- * Dell Inc. so their systems "just work". :-)
- */
+ asm volatile("jmpl *%0" : :
+ "rm" (real_mode_header->machine_real_restart_asm),
+ "a" (type));
+#else
+ asm volatile("ljmpl *%0" : :
+ "m" (real_mode_header->machine_real_restart_asm),
+ "D" (type));
+#endif
+ unreachable();
+}
+#ifdef CONFIG_APM_MODULE
+EXPORT_SYMBOL(machine_real_restart);
+#endif
+STACK_FRAME_NON_STANDARD(machine_real_restart);
/*
- * Some machines require the "reboot=b" commandline option,
- * this quirk makes that automatic.
+ * Some Apple MacBook and MacBookPro models need reboot=p to be able to reboot
*/
-static int __init set_bios_reboot(const struct dmi_system_id *d)
+static int __init set_pci_reboot(const struct dmi_system_id *d)
{
- if (reboot_type != BOOT_BIOS) {
- reboot_type = BOOT_BIOS;
- printk(KERN_INFO "%s series board detected. Selecting BIOS-method for reboots.\n", d->ident);
+ if (reboot_type != BOOT_CF9_FORCE) {
+ reboot_type = BOOT_CF9_FORCE;
+ pr_info("%s series board detected. Selecting %s-method for reboots.\n",
+ d->ident, "PCI");
}
return 0;
}
-static struct dmi_system_id __initdata reboot_dmi_table[] = {
+static int __init set_kbd_reboot(const struct dmi_system_id *d)
+{
+ if (reboot_type != BOOT_KBD) {
+ reboot_type = BOOT_KBD;
+ pr_info("%s series board detected. Selecting %s-method for reboot.\n",
+ d->ident, "KBD");
+ }
+ return 0;
+}
+
+/*
+ * This is a single dmi_table handling all reboot quirks.
+ */
+static const struct dmi_system_id reboot_dmi_table[] __initconst = {
+
+ /* Acer */
+ { /* Handle reboot issue on Acer Aspire one */
+ .callback = set_kbd_reboot,
+ .ident = "Acer Aspire One A110",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"),
+ },
+ },
+ { /* Handle reboot issue on Acer TravelMate X514-51T */
+ .callback = set_efi_reboot,
+ .ident = "Acer TravelMate X514-51T",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate X514-51T"),
+ },
+ },
+
+ /* Apple */
+ { /* Handle problems with rebooting on Apple MacBook5 */
+ .callback = set_pci_reboot,
+ .ident = "Apple MacBook5",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"),
+ },
+ },
+ { /* Handle problems with rebooting on Apple MacBook6,1 */
+ .callback = set_pci_reboot,
+ .ident = "Apple MacBook6,1",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "MacBook6,1"),
+ },
+ },
+ { /* Handle problems with rebooting on Apple MacBookPro5 */
+ .callback = set_pci_reboot,
+ .ident = "Apple MacBookPro5",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"),
+ },
+ },
+ { /* Handle problems with rebooting on Apple Macmini3,1 */
+ .callback = set_pci_reboot,
+ .ident = "Apple Macmini3,1",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"),
+ },
+ },
+ { /* Handle problems with rebooting on the iMac9,1. */
+ .callback = set_pci_reboot,
+ .ident = "Apple iMac9,1",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"),
+ },
+ },
+ { /* Handle problems with rebooting on the iMac10,1. */
+ .callback = set_pci_reboot,
+ .ident = "Apple iMac10,1",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "iMac10,1"),
+ },
+ },
+
+ /* ASRock */
+ { /* Handle problems with rebooting on ASRock Q1900DC-ITX */
+ .callback = set_pci_reboot,
+ .ident = "ASRock Q1900DC-ITX",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "ASRock"),
+ DMI_MATCH(DMI_BOARD_NAME, "Q1900DC-ITX"),
+ },
+ },
+
+ /* ASUS */
+ { /* Handle problems with rebooting on ASUS P4S800 */
+ .callback = set_bios_reboot,
+ .ident = "ASUS P4S800",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
+ DMI_MATCH(DMI_BOARD_NAME, "P4S800"),
+ },
+ },
+ { /* Handle problems with rebooting on ASUS EeeBook X205TA */
+ .callback = set_acpi_reboot,
+ .ident = "ASUS EeeBook X205TA",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "X205TA"),
+ },
+ },
+ { /* Handle problems with rebooting on ASUS EeeBook X205TAW */
+ .callback = set_acpi_reboot,
+ .ident = "ASUS EeeBook X205TAW",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "X205TAW"),
+ },
+ },
+
+ /* Certec */
+ { /* Handle problems with rebooting on Certec BPC600 */
+ .callback = set_pci_reboot,
+ .ident = "Certec BPC600",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Certec"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "BPC600"),
+ },
+ },
+
+ /* Dell */
+ { /* Handle problems with rebooting on Dell DXP061 */
+ .callback = set_bios_reboot,
+ .ident = "Dell DXP061",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Dell DXP061"),
+ },
+ },
{ /* Handle problems with rebooting on Dell E520's */
.callback = set_bios_reboot,
.ident = "Dell E520",
@@ -141,23 +294,57 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "Dell DM061"),
},
},
- { /* Handle problems with rebooting on Dell 1300's */
+ { /* Handle problems with rebooting on the Latitude E5410. */
+ .callback = set_pci_reboot,
+ .ident = "Dell Latitude E5410",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5410"),
+ },
+ },
+ { /* Handle problems with rebooting on the Latitude E5420. */
+ .callback = set_pci_reboot,
+ .ident = "Dell Latitude E5420",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5420"),
+ },
+ },
+ { /* Handle problems with rebooting on the Latitude E6320. */
+ .callback = set_pci_reboot,
+ .ident = "Dell Latitude E6320",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6320"),
+ },
+ },
+ { /* Handle problems with rebooting on the Latitude E6420. */
+ .callback = set_pci_reboot,
+ .ident = "Dell Latitude E6420",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"),
+ },
+ },
+ { /* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */
.callback = set_bios_reboot,
- .ident = "Dell PowerEdge 1300",
+ .ident = "Dell OptiPlex 330",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
- DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1300/"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 330"),
+ DMI_MATCH(DMI_BOARD_NAME, "0KP561"),
},
},
- { /* Handle problems with rebooting on Dell 300's */
+ { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */
.callback = set_bios_reboot,
- .ident = "Dell PowerEdge 300",
+ .ident = "Dell OptiPlex 360",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
- DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 360"),
+ DMI_MATCH(DMI_BOARD_NAME, "0T656F"),
},
},
- { /* Handle problems with rebooting on Dell Optiplex 745's SFF*/
+ { /* Handle problems with rebooting on Dell Optiplex 745's SFF */
.callback = set_bios_reboot,
.ident = "Dell OptiPlex 745",
.matches = {
@@ -165,7 +352,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
},
},
- { /* Handle problems with rebooting on Dell Optiplex 745's DFF*/
+ { /* Handle problems with rebooting on Dell Optiplex 745's DFF */
.callback = set_bios_reboot,
.ident = "Dell OptiPlex 745",
.matches = {
@@ -174,7 +361,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_BOARD_NAME, "0MM599"),
},
},
- { /* Handle problems with rebooting on Dell Optiplex 745 with 0KW626 */
+ { /* Handle problems with rebooting on Dell Optiplex 745 with 0KW626 */
.callback = set_bios_reboot,
.ident = "Dell OptiPlex 745",
.matches = {
@@ -183,22 +370,38 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_BOARD_NAME, "0KW626"),
},
},
- { /* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */
+ { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G */
.callback = set_bios_reboot,
- .ident = "Dell OptiPlex 330",
+ .ident = "Dell OptiPlex 760",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 330"),
- DMI_MATCH(DMI_BOARD_NAME, "0KP561"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 760"),
+ DMI_MATCH(DMI_BOARD_NAME, "0G919G"),
},
},
- { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */
- .callback = set_bios_reboot,
- .ident = "Dell OptiPlex 360",
+ { /* Handle problems with rebooting on the OptiPlex 990. */
+ .callback = set_pci_reboot,
+ .ident = "Dell OptiPlex 990 BIOS A0x",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 360"),
- DMI_MATCH(DMI_BOARD_NAME, "0T656F"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"),
+ DMI_MATCH(DMI_BIOS_VERSION, "A0"),
+ },
+ },
+ { /* Handle problems with rebooting on Dell 300's */
+ .callback = set_bios_reboot,
+ .ident = "Dell PowerEdge 300",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"),
+ },
+ },
+ { /* Handle problems with rebooting on Dell 1300's */
+ .callback = set_bios_reboot,
+ .ident = "Dell PowerEdge 1300",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1300/"),
},
},
{ /* Handle problems with rebooting on Dell 2400's */
@@ -209,6 +412,22 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"),
},
},
+ { /* Handle problems with rebooting on the Dell PowerEdge C6100. */
+ .callback = set_pci_reboot,
+ .ident = "Dell PowerEdge C6100",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "C6100"),
+ },
+ },
+ { /* Handle problems with rebooting on the Precision M6600. */
+ .callback = set_pci_reboot,
+ .ident = "Dell Precision M6600",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Precision M6600"),
+ },
+ },
{ /* Handle problems with rebooting on Dell T5400's */
.callback = set_bios_reboot,
.ident = "Dell Precision T5400",
@@ -217,12 +436,12 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"),
},
},
- { /* Handle problems with rebooting on HP laptops */
+ { /* Handle problems with rebooting on Dell T7400's */
.callback = set_bios_reboot,
- .ident = "HP Compaq Laptop",
+ .ident = "Dell Precision T7400",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
- DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T7400"),
},
},
{ /* Handle problems with rebooting on Dell XPS710 */
@@ -233,216 +452,71 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"),
},
},
- { /* Handle problems with rebooting on Dell DXP061 */
- .callback = set_bios_reboot,
- .ident = "Dell DXP061",
+ { /* Handle problems with rebooting on Dell Optiplex 7450 AIO */
+ .callback = set_acpi_reboot,
+ .ident = "Dell OptiPlex 7450 AIO",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "Dell DXP061"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 7450 AIO"),
},
},
- { /* Handle problems with rebooting on Sony VGN-Z540N */
+
+ /* Hewlett-Packard */
+ { /* Handle problems with rebooting on HP laptops */
.callback = set_bios_reboot,
- .ident = "Sony VGN-Z540N",
+ .ident = "HP Compaq Laptop",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Sony Corporation"),
- DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"),
},
},
- { /* Handle problems with rebooting on CompuLab SBC-FITPC2 */
+
+ { /* PCIe Wifi card isn't detected after reboot otherwise */
+ .callback = set_pci_reboot,
+ .ident = "Zotac ZBOX CI327 nano",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "NA"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "ZBOX-CI327NANO-GS-01"),
+ },
+ },
+
+ /* Sony */
+ { /* Handle problems with rebooting on Sony VGN-Z540N */
.callback = set_bios_reboot,
- .ident = "CompuLab SBC-FITPC2",
+ .ident = "Sony VGN-Z540N",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "CompuLab"),
- DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Sony Corporation"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"),
},
},
+
{ }
};
static int __init reboot_init(void)
{
- dmi_check_system(reboot_dmi_table);
- return 0;
-}
-core_initcall(reboot_init);
-
-/* The following code and data reboots the machine by switching to real
- mode and jumping to the BIOS reset entry point, as if the CPU has
- really been reset. The previous version asked the keyboard
- controller to pulse the CPU reset line, which is more thorough, but
- doesn't work with at least one type of 486 motherboard. It is easy
- to stop this code working; hence the copious comments. */
-static const unsigned long long
-real_mode_gdt_entries [3] =
-{
- 0x0000000000000000ULL, /* Null descriptor */
- 0x00009b000000ffffULL, /* 16-bit real-mode 64k code at 0x00000000 */
- 0x000093000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */
-};
-
-static const struct desc_ptr
-real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries },
-real_mode_idt = { 0x3ff, 0 };
-
-/* This is 16-bit protected mode code to disable paging and the cache,
- switch to real mode and jump to the BIOS reset code.
-
- The instruction that switches to real mode by writing to CR0 must be
- followed immediately by a far jump instruction, which set CS to a
- valid value for real mode, and flushes the prefetch queue to avoid
- running instructions that have already been decoded in protected
- mode.
-
- Clears all the flags except ET, especially PG (paging), PE
- (protected-mode enable) and TS (task switch for coprocessor state
- save). Flushes the TLB after paging has been disabled. Sets CD and
- NW, to disable the cache on a 486, and invalidates the cache. This
- is more like the state of a 486 after reset. I don't know if
- something else should be done for other chips.
-
- More could be done here to set up the registers as if a CPU reset had
- occurred; hopefully real BIOSs don't assume much. */
-static const unsigned char real_mode_switch [] =
-{
- 0x66, 0x0f, 0x20, 0xc0, /* movl %cr0,%eax */
- 0x66, 0x83, 0xe0, 0x11, /* andl $0x00000011,%eax */
- 0x66, 0x0d, 0x00, 0x00, 0x00, 0x60, /* orl $0x60000000,%eax */
- 0x66, 0x0f, 0x22, 0xc0, /* movl %eax,%cr0 */
- 0x66, 0x0f, 0x22, 0xd8, /* movl %eax,%cr3 */
- 0x66, 0x0f, 0x20, 0xc3, /* movl %cr0,%ebx */
- 0x66, 0x81, 0xe3, 0x00, 0x00, 0x00, 0x60, /* andl $0x60000000,%ebx */
- 0x74, 0x02, /* jz f */
- 0x0f, 0x09, /* wbinvd */
- 0x24, 0x10, /* f: andb $0x10,al */
- 0x66, 0x0f, 0x22, 0xc0 /* movl %eax,%cr0 */
-};
-static const unsigned char jump_to_bios [] =
-{
- 0xea, 0x00, 0x00, 0xff, 0xff /* ljmp $0xffff,$0x0000 */
-};
+ int rv;
-/*
- * Switch to real mode and then execute the code
- * specified by the code and length parameters.
- * We assume that length will aways be less that 100!
- */
-void machine_real_restart(const unsigned char *code, int length)
-{
- local_irq_disable();
-
- /* Write zero to CMOS register number 0x0f, which the BIOS POST
- routine will recognize as telling it to do a proper reboot. (Well
- that's what this book in front of me says -- it may only apply to
- the Phoenix BIOS though, it's not clear). At the same time,
- disable NMIs by setting the top bit in the CMOS address register,
- as we're about to do peculiar things to the CPU. I'm not sure if
- `outb_p' is needed instead of just `outb'. Use it to be on the
- safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.)
+ /*
+ * Only do the DMI check if reboot_type hasn't been overridden
+ * on the command line
*/
- spin_lock(&rtc_lock);
- CMOS_WRITE(0x00, 0x8f);
- spin_unlock(&rtc_lock);
-
- /* Remap the kernel at virtual address zero, as well as offset zero
- from the kernel segment. This assumes the kernel segment starts at
- virtual address PAGE_OFFSET. */
- memcpy(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY,
- sizeof(swapper_pg_dir [0]) * KERNEL_PGD_PTRS);
+ if (!reboot_default)
+ return 0;
/*
- * Use `swapper_pg_dir' as our page directory.
+ * The DMI quirks table takes precedence. If no quirks entry
+ * matches and the ACPI Hardware Reduced bit is set and EFI
+ * runtime services are enabled, force EFI reboot.
*/
- load_cr3(swapper_pg_dir);
-
- /* Write 0x1234 to absolute memory location 0x472. The BIOS reads
- this on booting to tell it to "Bypass memory test (also warm
- boot)". This seems like a fairly standard thing that gets set by
- REBOOT.COM programs, and the previous reset routine did this
- too. */
- *((unsigned short *)0x472) = reboot_mode;
-
- /* For the switch to real mode, copy some code to low memory. It has
- to be in the first 64k because it is running in 16-bit mode, and it
- has to have the same physical and virtual address, because it turns
- off paging. Copy it near the end of the first page, out of the way
- of BIOS variables. */
- memcpy((void *)(0x1000 - sizeof(real_mode_switch) - 100),
- real_mode_switch, sizeof (real_mode_switch));
- memcpy((void *)(0x1000 - 100), code, length);
-
- /* Set up the IDT for real mode. */
- load_idt(&real_mode_idt);
-
- /* Set up a GDT from which we can load segment descriptors for real
- mode. The GDT is not used in real mode; it is just needed here to
- prepare the descriptors. */
- load_gdt(&real_mode_gdt);
-
- /* Load the data segment registers, and thus the descriptors ready for
- real mode. The base address of each segment is 0x100, 16 times the
- selector value being loaded here. This is so that the segment
- registers don't have to be reloaded after switching to real mode:
- the values are consistent for real mode operation already. */
- __asm__ __volatile__ ("movl $0x0010,%%eax\n"
- "\tmovl %%eax,%%ds\n"
- "\tmovl %%eax,%%es\n"
- "\tmovl %%eax,%%fs\n"
- "\tmovl %%eax,%%gs\n"
- "\tmovl %%eax,%%ss" : : : "eax");
-
- /* Jump to the 16-bit code that we copied earlier. It disables paging
- and the cache, switches to real mode, and jumps to the BIOS reset
- entry point. */
- __asm__ __volatile__ ("ljmp $0x0008,%0"
- :
- : "i" ((void *)(0x1000 - sizeof (real_mode_switch) - 100)));
-}
-#ifdef CONFIG_APM_MODULE
-EXPORT_SYMBOL(machine_real_restart);
-#endif
+ rv = dmi_check_system(reboot_dmi_table);
-#endif /* CONFIG_X86_32 */
+ if (!rv && efi_reboot_required() && !efi_runtime_disabled())
+ reboot_type = BOOT_EFI;
-/*
- * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot
- */
-static int __init set_pci_reboot(const struct dmi_system_id *d)
-{
- if (reboot_type != BOOT_CF9) {
- reboot_type = BOOT_CF9;
- printk(KERN_INFO "%s series board detected. "
- "Selecting PCI-method for reboots.\n", d->ident);
- }
return 0;
}
-
-static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
- { /* Handle problems with rebooting on Apple MacBook5 */
- .callback = set_pci_reboot,
- .ident = "Apple MacBook5",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"),
- },
- },
- { /* Handle problems with rebooting on Apple MacBookPro5 */
- .callback = set_pci_reboot,
- .ident = "Apple MacBookPro5",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"),
- },
- },
- { }
-};
-
-static int __init pci_reboot_init(void)
-{
- dmi_check_system(pci_reboot_dmi_table);
- return 0;
-}
-core_initcall(pci_reboot_init);
+core_initcall(reboot_init);
static inline void kb_wait(void)
{
@@ -455,116 +529,187 @@ static inline void kb_wait(void)
}
}
-static void vmxoff_nmi(int cpu, struct die_args *args)
+static inline void nmi_shootdown_cpus_on_restart(void);
+
+#if IS_ENABLED(CONFIG_KVM_X86)
+/* RCU-protected callback to disable virtualization prior to reboot. */
+static cpu_emergency_virt_cb __rcu *cpu_emergency_virt_callback;
+
+void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback)
+{
+ if (WARN_ON_ONCE(rcu_access_pointer(cpu_emergency_virt_callback)))
+ return;
+
+ rcu_assign_pointer(cpu_emergency_virt_callback, callback);
+}
+EXPORT_SYMBOL_FOR_KVM(cpu_emergency_register_virt_callback);
+
+void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback)
{
- cpu_emergency_vmxoff();
+ if (WARN_ON_ONCE(rcu_access_pointer(cpu_emergency_virt_callback) != callback))
+ return;
+
+ rcu_assign_pointer(cpu_emergency_virt_callback, NULL);
+ synchronize_rcu();
}
+EXPORT_SYMBOL_FOR_KVM(cpu_emergency_unregister_virt_callback);
-/* Use NMIs as IPIs to tell all CPUs to disable virtualization
+/*
+ * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during
+ * reboot. VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if
+ * GIF=0, i.e. if the crash occurred between CLGI and STGI.
*/
-static void emergency_vmx_disable_all(void)
+void cpu_emergency_disable_virtualization(void)
+{
+ cpu_emergency_virt_cb *callback;
+
+ /*
+ * IRQs must be disabled as KVM enables virtualization in hardware via
+ * function call IPIs, i.e. IRQs need to be disabled to guarantee
+ * virtualization stays disabled.
+ */
+ lockdep_assert_irqs_disabled();
+
+ rcu_read_lock();
+ callback = rcu_dereference(cpu_emergency_virt_callback);
+ if (callback)
+ callback();
+ rcu_read_unlock();
+}
+
+static void emergency_reboot_disable_virtualization(void)
{
- /* Just make sure we won't change CPUs while doing this */
local_irq_disable();
- /* We need to disable VMX on all CPUs before rebooting, otherwise
- * we risk hanging up the machine, because the CPU ignore INIT
- * signals when VMX is enabled.
- *
- * We can't take any locks and we may be on an inconsistent
- * state, so we use NMIs as IPIs to tell the other CPUs to disable
- * VMX and halt.
+ /*
+ * Disable virtualization on all CPUs before rebooting to avoid hanging
+ * the system, as VMX and SVM block INIT when running in the host.
*
- * For safety, we will avoid running the nmi_shootdown_cpus()
- * stuff unnecessarily, but we don't have a way to check
- * if other CPUs have VMX enabled. So we will call it only if the
- * CPU we are running on has VMX enabled.
+ * We can't take any locks and we may be in an inconsistent state, so
+ * use NMIs as IPIs to tell the other CPUs to disable VMX/SVM and halt.
*
- * We will miss cases where VMX is not enabled on all CPUs. This
- * shouldn't do much harm because KVM always enable VMX on all
- * CPUs anyway. But we can miss it on the small window where KVM
- * is still enabling VMX.
+ * Do the NMI shootdown even if virtualization is off on _this_ CPU, as
+ * other CPUs may have virtualization enabled.
*/
- if (cpu_has_vmx() && cpu_vmx_enabled()) {
- /* Disable VMX on this CPU.
- */
- cpu_vmxoff();
-
- /* Halt and disable VMX on the other CPUs */
- nmi_shootdown_cpus(vmxoff_nmi);
+ if (rcu_access_pointer(cpu_emergency_virt_callback)) {
+ /* Safely force _this_ CPU out of VMX/SVM operation. */
+ cpu_emergency_disable_virtualization();
+ /* Disable VMX/SVM and halt on other CPUs. */
+ nmi_shootdown_cpus_on_restart();
}
}
-
+#else
+static void emergency_reboot_disable_virtualization(void) { }
+#endif /* CONFIG_KVM_X86 */
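To show the registration side of the RCU-protected hook above, a hedged sketch of how a hypervisor module might install and remove its emergency callback around hardware setup and teardown; the handler body and the example_* wrappers are placeholder assumptions.

/* Hypothetical caller sketch -- the handler body is a placeholder. */
static void example_emergency_virt_disable(void)
{
	/* e.g. execute VMXOFF or clear GIF so that INIT is recognized on reboot */
}

static void example_hv_hardware_setup(void)
{
	cpu_emergency_register_virt_callback(example_emergency_virt_disable);
}

static void example_hv_hardware_unsetup(void)
{
	cpu_emergency_unregister_virt_callback(example_emergency_virt_disable);
}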
void __attribute__((weak)) mach_reboot_fixups(void)
{
}
+/*
+ * To the best of our knowledge Windows compatible x86 hardware expects
+ * the following on reboot:
+ *
+ * 1) If the FADT has the ACPI reboot register flag set, try it
+ * 2) If still alive, write to the keyboard controller
+ * 3) If still alive, write to the ACPI reboot register again
+ * 4) If still alive, write to the keyboard controller again
+ * 5) If still alive, call the EFI runtime service to reboot
+ * 6) If no EFI runtime service, call the BIOS to do a reboot
+ *
+ * We default to following the same pattern. We also have
+ * two other reboot methods: 'triple fault' and 'PCI', which
+ * can be triggered via the reboot= kernel boot option or
+ * via quirks.
+ *
+ * This means that this function can never return: at worst it can
+ * misbehave by failing to reboot properly and hanging.
+ */
static void native_machine_emergency_restart(void)
{
int i;
+ int attempt = 0;
+ int orig_reboot_type = reboot_type;
+ unsigned short mode;
if (reboot_emergency)
- emergency_vmx_disable_all();
+ emergency_reboot_disable_virtualization();
+
+ tboot_shutdown(TB_SHUTDOWN_REBOOT);
/* Tell the BIOS if we want cold or warm reboot */
- *((unsigned short *)__va(0x472)) = reboot_mode;
+ mode = reboot_mode == REBOOT_WARM ? 0x1234 : 0;
+ *((unsigned short *)__va(0x472)) = mode;
+
+ /*
+ * If an EFI capsule has been registered with the firmware then
+ * override the reboot= parameter.
+ */
+ if (efi_capsule_pending(NULL)) {
+ pr_info("EFI capsule is pending, forcing EFI reboot.\n");
+ reboot_type = BOOT_EFI;
+ }
for (;;) {
/* Could also try the reset bit in the Hammer NB */
switch (reboot_type) {
+ case BOOT_ACPI:
+ acpi_reboot();
+ reboot_type = BOOT_KBD;
+ break;
+
case BOOT_KBD:
- mach_reboot_fixups(); /* for board specific fixups */
+ mach_reboot_fixups(); /* For board specific fixups */
for (i = 0; i < 10; i++) {
kb_wait();
udelay(50);
- outb(0xfe, 0x64); /* pulse reset low */
+ outb(0xfe, 0x64); /* Pulse reset low */
udelay(50);
}
-
- case BOOT_TRIPLE:
- load_idt(&no_idt);
- __asm__ __volatile__("int3");
-
- reboot_type = BOOT_KBD;
+ if (attempt == 0 && orig_reboot_type == BOOT_ACPI) {
+ attempt = 1;
+ reboot_type = BOOT_ACPI;
+ } else {
+ reboot_type = BOOT_EFI;
+ }
break;
-#ifdef CONFIG_X86_32
- case BOOT_BIOS:
- machine_real_restart(jump_to_bios, sizeof(jump_to_bios));
-
- reboot_type = BOOT_KBD;
+ case BOOT_EFI:
+ efi_reboot(reboot_mode, NULL);
+ reboot_type = BOOT_BIOS;
break;
-#endif
- case BOOT_ACPI:
- acpi_reboot();
- reboot_type = BOOT_KBD;
- break;
+ case BOOT_BIOS:
+ machine_real_restart(MRR_BIOS);
- case BOOT_EFI:
- if (efi_enabled)
- efi.reset_system(reboot_mode ?
- EFI_RESET_WARM :
- EFI_RESET_COLD,
- EFI_SUCCESS, 0, NULL);
- reboot_type = BOOT_KBD;
+ /* We're probably dead after this, but... */
+ reboot_type = BOOT_CF9_SAFE;
break;
- case BOOT_CF9:
+ case BOOT_CF9_FORCE:
port_cf9_safe = true;
- /* fall through */
+ fallthrough;
- case BOOT_CF9_COND:
+ case BOOT_CF9_SAFE:
if (port_cf9_safe) {
- u8 cf9 = inb(0xcf9) & ~6;
+ u8 reboot_code = reboot_mode == REBOOT_WARM ? 0x06 : 0x0E;
+ u8 cf9 = inb(0xcf9) & ~reboot_code;
outb(cf9|2, 0xcf9); /* Request hard reset */
udelay(50);
- outb(cf9|6, 0xcf9); /* Actually do the reset */
+ /* Actually do the reset */
+ outb(cf9|reboot_code, 0xcf9);
udelay(50);
}
+ reboot_type = BOOT_TRIPLE;
+ break;
+
+ case BOOT_TRIPLE:
+ idt_invalidate();
+ __asm__ __volatile__("int3");
+
+ /* We're probably dead after this, but... */
reboot_type = BOOT_KBD;
break;
}
@@ -573,45 +718,53 @@ static void native_machine_emergency_restart(void)
void native_machine_shutdown(void)
{
- /* Stop the cpus and apics */
-#ifdef CONFIG_SMP
-
- /* The boot cpu is always logical cpu 0 */
- int reboot_cpu_id = 0;
+ /*
+ * Call enc_kexec_begin() while all CPUs are still active and
+ * interrupts are enabled. This will allow all in-flight memory
+ * conversions to finish cleanly.
+ */
+ if (kexec_in_progress)
+ x86_platform.guest.enc_kexec_begin();
-#ifdef CONFIG_X86_32
- /* See if there has been given a command line override */
- if ((reboot_cpu != -1) && (reboot_cpu < nr_cpu_ids) &&
- cpu_online(reboot_cpu))
- reboot_cpu_id = reboot_cpu;
+ /* Stop the cpus and apics */
+#ifdef CONFIG_X86_IO_APIC
+ /*
+ * Disabling IO APIC before local APIC is a workaround for
+ * erratum AVR31 in "Intel Atom Processor C2000 Product Family
+ * Specification Update". In this situation, interrupts that target
+ * a Logical Processor whose Local APIC is either in the process of
+ * being hardware disabled or software disabled are neither delivered
+ * nor discarded. When this erratum occurs, the processor may hang.
+ *
+ * Even without the erratum, it still makes sense to quiet IO APIC
+ * before disabling Local APIC.
+ */
+ clear_IO_APIC();
#endif
- /* Make certain the cpu I'm about to reboot on is online */
- if (!cpu_online(reboot_cpu_id))
- reboot_cpu_id = smp_processor_id();
-
- /* Make certain I only run on the appropriate processor */
- set_cpus_allowed_ptr(current, cpumask_of(reboot_cpu_id));
-
- /* O.K Now that I'm on the appropriate processor,
- * stop all of the others.
+#ifdef CONFIG_SMP
+ /*
+ * Stop all of the others. Also disable local IRQs so that the
+ * per-CPU timer interrupt is not received, since it may trigger
+ * the scheduler's load balancing.
*/
- smp_send_stop();
+ local_irq_disable();
+ stop_other_cpus();
#endif
lapic_shutdown();
-
-#ifdef CONFIG_X86_IO_APIC
- disable_IO_APIC();
-#endif
+ restore_boot_irq_mode();
#ifdef CONFIG_HPET_TIMER
hpet_disable();
#endif
#ifdef CONFIG_X86_64
- pci_iommu_shutdown();
+ x86_platform.iommu_shutdown();
#endif
+
+ if (kexec_in_progress)
+ x86_platform.guest.enc_kexec_finish();
}
static void __machine_emergency_restart(int emergency)
@@ -622,7 +775,7 @@ static void __machine_emergency_restart(int emergency)
static void native_machine_restart(char *__unused)
{
- printk("machine restart\n");
+ pr_notice("machine restart\n");
if (!reboot_force)
machine_shutdown();
@@ -631,29 +784,32 @@ static void native_machine_restart(char *__unused)
static void native_machine_halt(void)
{
- /* stop other cpus and apics */
+ /* Stop other cpus and apics */
machine_shutdown();
- /* stop this cpu */
+ tboot_shutdown(TB_SHUTDOWN_HALT);
+
stop_this_cpu(NULL);
}
static void native_machine_power_off(void)
{
- if (pm_power_off) {
+ if (kernel_can_power_off()) {
if (!reboot_force)
machine_shutdown();
- pm_power_off();
+ do_kernel_power_off();
}
+ /* A fallback in case there is no PM info available */
+ tboot_shutdown(TB_SHUTDOWN_HALT);
}
-struct machine_ops machine_ops = {
+struct machine_ops machine_ops __ro_after_init = {
.power_off = native_machine_power_off,
.shutdown = native_machine_shutdown,
.emergency_restart = native_machine_emergency_restart,
.restart = native_machine_restart,
.halt = native_machine_halt,
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_CRASH_DUMP
.crash_shutdown = native_machine_crash_shutdown,
#endif
};
@@ -683,86 +839,104 @@ void machine_halt(void)
machine_ops.halt();
}
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_CRASH_DUMP
void machine_crash_shutdown(struct pt_regs *regs)
{
machine_ops.crash_shutdown(regs);
}
#endif
+/* This is the CPU performing the emergency shutdown work. */
+int crashing_cpu = -1;
#if defined(CONFIG_SMP)
-/* This keeps a track of which one is crashing cpu. */
-static int crashing_cpu;
static nmi_shootdown_cb shootdown_callback;
static atomic_t waiting_for_crash_ipi;
+static int crash_ipi_issued;
-static int crash_nmi_callback(struct notifier_block *self,
- unsigned long val, void *data)
+static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
{
int cpu;
- if (val != DIE_NMI_IPI)
- return NOTIFY_OK;
-
cpu = raw_smp_processor_id();
- /* Don't do anything if this handler is invoked on crashing cpu.
+ /*
+ * Don't do anything if this handler is invoked on the crashing CPU.
 * Otherwise, the system will completely hang. The crashing CPU can get
 * an NMI if the system was initially booted with the nmi_watchdog parameter.
*/
if (cpu == crashing_cpu)
- return NOTIFY_STOP;
+ return NMI_HANDLED;
local_irq_disable();
- shootdown_callback(cpu, (struct die_args *)data);
+ if (shootdown_callback)
+ shootdown_callback(cpu, regs);
+
+ /*
+ * Prepare the CPU for reboot _after_ invoking the callback so that the
+ * callback can safely use virtualization instructions, e.g. VMCLEAR.
+ */
+ cpu_emergency_disable_virtualization();
atomic_dec(&waiting_for_crash_ipi);
+
+ if (smp_ops.stop_this_cpu) {
+ smp_ops.stop_this_cpu();
+ BUG();
+ }
+
/* Assume hlt works */
halt();
for (;;)
cpu_relax();
- return 1;
+ return NMI_HANDLED;
}
-static void smp_send_nmi_allbutself(void)
-{
- apic->send_IPI_allbutself(NMI_VECTOR);
-}
-
-static struct notifier_block crash_nmi_nb = {
- .notifier_call = crash_nmi_callback,
-};
-
-/* Halt all other CPUs, calling the specified function on each of them
+/**
+ * nmi_shootdown_cpus - Stop other CPUs via NMI
+ * @callback: Optional callback to be invoked from the NMI handler
+ *
+ * The NMI handler on the remote CPUs invokes @callback, if not
+ * NULL, first and then disables virtualization to ensure that
+ * INIT is recognized during reboot.
*
- * This function can be used to halt all other CPUs on crash
- * or emergency reboot time. The function passed as parameter
- * will be called inside a NMI handler on all CPUs.
+ * nmi_shootdown_cpus() can only be invoked once. After the first
+ * invocation all other CPUs are stuck in crash_nmi_callback() and
+ * cannot respond to a second NMI.
*/
void nmi_shootdown_cpus(nmi_shootdown_cb callback)
{
unsigned long msecs;
+
local_irq_disable();
- /* Make a note of crashing cpu. Will be used in NMI callback.*/
- crashing_cpu = safe_smp_processor_id();
+ /*
+ * Avoid certain doom if a shootdown already occurred; re-registering
+ * the NMI handler will cause list corruption, modifying the callback
+ * will do who knows what, etc...
+ */
+ if (WARN_ON_ONCE(crash_ipi_issued))
+ return;
+
+ /* Make a note of crashing cpu. Will be used in NMI callback. */
+ crashing_cpu = smp_processor_id();
shootdown_callback = callback;
atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
- /* Would it be better to replace the trap vector here? */
- if (register_die_notifier(&crash_nmi_nb))
- return; /* return what? */
- /* Ensure the new callback function is set before sending
- * out the NMI
+
+ /*
+ * Set emergency handler to preempt other handlers.
*/
- wmb();
+ set_emergency_nmi_handler(NMI_LOCAL, crash_nmi_callback);
+
+ apic_send_IPI_allbutself(NMI_VECTOR);
- smp_send_nmi_allbutself();
+ /* Kick CPUs looping in NMI context. */
+ WRITE_ONCE(crash_ipi_issued, 1);
msecs = 1000; /* Wait at most a second for the other cpus to stop */
while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
@@ -770,11 +944,49 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
msecs--;
}
- /* Leave the nmi callback set */
+ /*
+ * Leave the nmi callback set, shootdown is a one-time thing. Clearing
+ * the callback could result in a NULL pointer dereference if a CPU
+ * (finally) responds after the timeout expires.
+ */
+}
+
+static inline void nmi_shootdown_cpus_on_restart(void)
+{
+ if (!crash_ipi_issued)
+ nmi_shootdown_cpus(NULL);
+}
+
+/*
+ * Check if the crash dumping IPI got issued and if so, call its callback
+ * directly. This function is used when we have already been in NMI handler.
+ * It doesn't return.
+ */
+void run_crash_ipi_callback(struct pt_regs *regs)
+{
+ if (crash_ipi_issued)
+ crash_nmi_callback(0, regs);
}
+
+/* Override the weak function in kernel/panic.c */
+void __noreturn nmi_panic_self_stop(struct pt_regs *regs)
+{
+ while (1) {
+ /* If no CPU is preparing crash dump, we simply loop here. */
+ run_crash_ipi_callback(regs);
+ cpu_relax();
+ }
+}
+
#else /* !CONFIG_SMP */
void nmi_shootdown_cpus(nmi_shootdown_cb callback)
{
/* No other CPUs to shoot down */
}
+
+static inline void nmi_shootdown_cpus_on_restart(void) { }
+
+void run_crash_ipi_callback(struct pt_regs *regs)
+{
+}
#endif
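For orientation, a hedged sketch of how a crash-kexec style caller might use nmi_shootdown_cpus(); the callback body is a placeholder assumption, only the one-shot calling convention comes from the code above.

/* Hypothetical caller sketch -- the callback body is a placeholder. */
static void example_crash_save_cpu(int cpu, struct pt_regs *regs)
{
	/*
	 * Save @regs for @cpu into the crash notes and return; the NMI
	 * handler then disables virtualization and halts this CPU.
	 */
}

static void example_crash_stop_other_cpus(void)
{
	nmi_shootdown_cpus(example_crash_save_cpu);
	/* From here on, only the crashing CPU is still running. */
}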
diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c
index 61a837743fe5..4679ac0a03eb 100644
--- a/arch/x86/kernel/reboot_fixups_32.c
+++ b/arch/x86/kernel/reboot_fixups_32.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* This is a good place to put board specific reboot fixups.
*
@@ -12,7 +13,7 @@
#include <linux/interrupt.h>
#include <asm/reboot_fixups.h>
#include <asm/msr.h>
-#include <asm/geode.h>
+#include <linux/cs5535.h>
static void cs5530a_warm_reset(struct pci_dev *dev)
{
@@ -26,7 +27,7 @@ static void cs5530a_warm_reset(struct pci_dev *dev)
static void cs5536_warm_reset(struct pci_dev *dev)
{
/* writing 1 to the LSB of this MSR causes a hard reset */
- wrmsrl(MSR_DIVIL_SOFT_RESET, 1ULL);
+ wrmsrq(MSR_DIVIL_SOFT_RESET, 1ULL);
udelay(50); /* shouldn't get here but be safe and spin a while */
}
@@ -43,17 +44,33 @@ static void rdc321x_reset(struct pci_dev *dev)
outb(1, 0x92);
}
+static void ce4100_reset(struct pci_dev *dev)
+{
+ int i;
+
+ for (i = 0; i < 10; i++) {
+ outb(0x2, 0xcf9);
+ udelay(50);
+ }
+}
+
struct device_fixup {
unsigned int vendor;
unsigned int device;
void (*reboot_fixup)(struct pci_dev *);
};
+/*
+ * PCI ids solely used for fixups_table go here
+ */
+#define PCI_DEVICE_ID_INTEL_CE4100 0x0708
+
static const struct device_fixup fixups_table[] = {
{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset },
{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset },
{ PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset },
{ PCI_VENDOR_ID_RDC, PCI_DEVICE_ID_RDC_R6030, rdc321x_reset },
+{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CE4100, ce4100_reset },
};
/*
@@ -80,6 +97,7 @@ void mach_reboot_fixups(void)
continue;
cur->reboot_fixup(dev);
+ pci_dev_put(dev);
}
}
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index 41235531b11c..57276f134d12 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -1,18 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* relocate_kernel.S - put the kernel image in place to boot
* Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
- *
- * This source code is licensed under the GNU General Public License,
- * Version 2. See the file COPYING for more details.
*/
#include <linux/linkage.h>
#include <asm/page_types.h>
#include <asm/kexec.h>
+#include <asm/nospec-branch.h>
#include <asm/processor-flags.h>
/*
- * Must be relocatable PIC code callable as a C function
+ * Must be relocatable PIC code callable as a C function, in particular
+ * there must be a plain RET and not jump to return thunk.
*/
#define PTR(x) (x << 2)
@@ -37,8 +37,7 @@
#define CP_PA_BACKUP_PAGES_MAP DATA(0x1c)
.text
- .globl relocate_kernel
-relocate_kernel:
+SYM_CODE_START_NOALIGN(relocate_kernel)
/* Save the CPU context, used for jumping back */
pushl %ebx
@@ -94,9 +93,14 @@ relocate_kernel:
movl %edi, %eax
addl $(identity_mapped - relocate_kernel), %eax
pushl %eax
+ ANNOTATE_UNRET_SAFE
ret
+ int3
+SYM_CODE_END(relocate_kernel)
-identity_mapped:
+SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
+ /* set return address to 0 if not preserving context */
+ pushl $0
/* store the start address on the stack */
pushl %edx
@@ -107,7 +111,7 @@ identity_mapped:
* - Write protect disabled
* - No task switch
* - Don't do FP software emulation.
- * - Proctected mode enabled
+ * - Protected mode enabled
*/
movl %cr0, %eax
andl $~(X86_CR0_PG | X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %eax
@@ -159,12 +163,15 @@ identity_mapped:
xorl %edx, %edx
xorl %esi, %esi
xorl %ebp, %ebp
+ ANNOTATE_UNRET_SAFE
ret
+ int3
1:
popl %edx
movl CP_PA_SWAP_PAGE(%edi), %esp
addl $PAGE_SIZE, %esp
2:
+ ANNOTATE_RETPOLINE_SAFE
call *%edx
/* get the re-entry point of the peer system */
@@ -184,15 +191,18 @@ identity_mapped:
movl CP_PA_PGD(%ebx), %eax
movl %eax, %cr3
movl %cr0, %eax
- orl $(1<<31), %eax
+ orl $X86_CR0_PG, %eax
movl %eax, %cr0
lea PAGE_SIZE(%edi), %esp
movl %edi, %eax
addl $(virtual_mapped - relocate_kernel), %eax
pushl %eax
+ ANNOTATE_UNRET_SAFE
ret
+ int3
+SYM_CODE_END(identity_mapped)
-virtual_mapped:
+SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped)
movl CR4(%edi), %eax
movl %eax, %cr4
movl CR3(%edi), %eax
@@ -207,10 +217,13 @@ virtual_mapped:
popl %edi
popl %esi
popl %ebx
+ ANNOTATE_UNRET_SAFE
ret
+ int3
+SYM_CODE_END(virtual_mapped)
/* Do the copies */
-swap_pages:
+SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
movl 8(%esp), %edx
movl 4(%esp), %ecx
pushl %ebp
@@ -224,23 +237,23 @@ swap_pages:
movl (%ebx), %ecx
addl $4, %ebx
1:
- testl $0x1, %ecx /* is it a destination page */
+ testb $0x1, %cl /* is it a destination page */
jz 2f
movl %ecx, %edi
andl $0xfffff000, %edi
jmp 0b
2:
- testl $0x2, %ecx /* is it an indirection page */
+ testb $0x2, %cl /* is it an indirection page */
jz 2f
movl %ecx, %ebx
andl $0xfffff000, %ebx
jmp 0b
2:
- testl $0x4, %ecx /* is it the done indicator */
+ testb $0x4, %cl /* is it the done indicator */
jz 2f
jmp 3f
2:
- testl $0x8, %ecx /* is it the source indicator */
+ testb $0x8, %cl /* is it the source indicator */
jz 0b /* Ignore it otherwise */
movl %ecx, %esi /* For every source page do a copy */
andl $0xfffff000, %esi
@@ -250,17 +263,17 @@ swap_pages:
movl %edx, %edi
movl $1024, %ecx
- rep ; movsl
+ rep movsl
movl %ebp, %edi
movl %eax, %esi
movl $1024, %ecx
- rep ; movsl
+ rep movsl
movl %eax, %edi
movl %edx, %esi
movl $1024, %ecx
- rep ; movsl
+ rep movsl
lea PAGE_SIZE(%ebp), %esi
jmp 0b
@@ -269,7 +282,10 @@ swap_pages:
popl %edi
popl %ebx
popl %ebp
+ ANNOTATE_UNRET_SAFE
ret
+ int3
+SYM_CODE_END(swap_pages)
.globl kexec_control_code_size
.set kexec_control_code_size, . - relocate_kernel
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 4de8f5b3d476..4ffba68dc57b 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -1,52 +1,72 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* relocate_kernel.S - put the kernel image in place to boot
* Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
- *
- * This source code is licensed under the GNU General Public License,
- * Version 2. See the file COPYING for more details.
*/
#include <linux/linkage.h>
+#include <linux/stringify.h>
+#include <asm/alternative.h>
#include <asm/page_types.h>
#include <asm/kexec.h>
#include <asm/processor-flags.h>
#include <asm/pgtable_types.h>
+#include <asm/nospec-branch.h>
+#include <asm/unwind_hints.h>
+#include <asm/asm-offsets.h>
/*
- * Must be relocatable PIC code callable as a C function
+ * Must be relocatable PIC code callable as a C function, in particular
+ * there must be a plain RET and not jump to return thunk.
*/
#define PTR(x) (x << 3)
#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
/*
- * control_page + KEXEC_CONTROL_CODE_MAX_SIZE
- * ~ control_page + PAGE_SIZE are used as data storage and stack for
- * jumping back
+ * The .text..relocate_kernel and .data..relocate_kernel sections are copied
+ * into the control page, and the remainder of the page is used as the stack.
*/
-#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset))
+ .section .data..relocate_kernel,"a";
/* Minimal CPU state */
-#define RSP DATA(0x0)
-#define CR0 DATA(0x8)
-#define CR3 DATA(0x10)
-#define CR4 DATA(0x18)
-
-/* other data */
-#define CP_PA_TABLE_PAGE DATA(0x20)
-#define CP_PA_SWAP_PAGE DATA(0x28)
-#define CP_PA_BACKUP_PAGES_MAP DATA(0x30)
-
- .text
- .align PAGE_SIZE
+SYM_DATA_LOCAL(saved_rsp, .quad 0)
+SYM_DATA_LOCAL(saved_cr0, .quad 0)
+SYM_DATA_LOCAL(saved_cr3, .quad 0)
+SYM_DATA_LOCAL(saved_cr4, .quad 0)
+ /* other data */
+SYM_DATA(kexec_va_control_page, .quad 0)
+SYM_DATA(kexec_pa_table_page, .quad 0)
+SYM_DATA(kexec_pa_swap_page, .quad 0)
+SYM_DATA_LOCAL(pa_backup_pages_map, .quad 0)
+SYM_DATA(kexec_debug_8250_mmio32, .quad 0)
+SYM_DATA(kexec_debug_8250_port, .word 0)
+
+ .balign 16
+SYM_DATA_START_LOCAL(kexec_debug_gdt)
+ .word kexec_debug_gdt_end - kexec_debug_gdt - 1
+ .long 0
+ .word 0
+ .quad 0x00cf9a000000ffff /* __KERNEL32_CS */
+ .quad 0x00af9a000000ffff /* __KERNEL_CS */
+ .quad 0x00cf92000000ffff /* __KERNEL_DS */
+SYM_DATA_END_LABEL(kexec_debug_gdt, SYM_L_LOCAL, kexec_debug_gdt_end)
+
+ .balign 8
+SYM_DATA_START(kexec_debug_idt)
+ .skip 0x100, 0x00
+SYM_DATA_END(kexec_debug_idt)
+
+ .section .text..relocate_kernel,"ax";
.code64
- .globl relocate_kernel
-relocate_kernel:
+SYM_CODE_START_NOALIGN(relocate_kernel)
+ UNWIND_HINT_END_OF_STACK
+ ANNOTATE_NOENDBR
/*
* %rdi indirection_page
- * %rsi page_list
+ * %rsi pa_control_page
* %rdx start address
- * %rcx preserve_context
+ * %rcx flags: RELOC_KERNEL_*
*/
/* Save the CPU context, used for jumping back */
@@ -58,51 +78,97 @@ relocate_kernel:
pushq %r15
pushf
- movq PTR(VA_CONTROL_PAGE)(%rsi), %r11
- movq %rsp, RSP(%r11)
- movq %cr0, %rax
- movq %rax, CR0(%r11)
- movq %cr3, %rax
- movq %rax, CR3(%r11)
- movq %cr4, %rax
- movq %rax, CR4(%r11)
+ /* Invalidate GDT/IDT, zero out flags */
+ pushq $0
+ pushq $0
- /* zero out flags, and disable interrupts */
- pushq $0
+ lidt (%rsp)
+ lgdt (%rsp)
+ addq $8, %rsp
popfq
- /*
- * get physical address of control page now
- * this is impossible after page table switch
- */
- movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
+ /* Switch to the identity mapped page tables */
+ movq %cr3, %rax
+ movq kexec_pa_table_page(%rip), %r9
+ movq %r9, %cr3
- /* get physical address of page table now too */
- movq PTR(PA_TABLE_PAGE)(%rsi), %r9
+ /* Leave CR4 in %r13 to enable the right paging mode later. */
+ movq %cr4, %r13
- /* get physical address of swap page now */
- movq PTR(PA_SWAP_PAGE)(%rsi), %r10
+ /*
+ * Disable global pages immediately to ensure this mapping is RWX.
+ * Disable LASS before jumping to the identity mapped page.
+ */
+ movq %r13, %r12
+ andq $~(X86_CR4_PGE | X86_CR4_LASS), %r12
+ movq %r12, %cr4
+
+ /* Save %rsp and CRs. */
+ movq %r13, saved_cr4(%rip)
+ movq %rsp, saved_rsp(%rip)
+ movq %rax, saved_cr3(%rip)
+ movq %cr0, %rax
+ movq %rax, saved_cr0(%rip)
- /* save some information for jumping back */
- movq %r9, CP_PA_TABLE_PAGE(%r11)
- movq %r10, CP_PA_SWAP_PAGE(%r11)
- movq %rdi, CP_PA_BACKUP_PAGES_MAP(%r11)
+ /* save indirection list for jumping back */
+ movq %rdi, pa_backup_pages_map(%rip)
- /* Switch to the identity mapped page tables */
- movq %r9, %cr3
+ /* Save the flags to %r11 as swap_pages clobbers %rcx. */
+ movq %rcx, %r11
/* setup a new stack at the end of the physical control page */
- lea PAGE_SIZE(%r8), %rsp
+ lea PAGE_SIZE(%rsi), %rsp
/* jump to identity mapped page */
- addq $(identity_mapped - relocate_kernel), %r8
- pushq %r8
- ret
+0: addq $identity_mapped - 0b, %rsi
+ subq $__relocate_kernel_start - 0b, %rsi
+ ANNOTATE_RETPOLINE_SAFE
+ jmp *%rsi
+SYM_CODE_END(relocate_kernel)
+
+SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
+ UNWIND_HINT_END_OF_STACK
+ /*
+ * %rdi indirection page
+ * %rdx start address
+ * %r9 page table page
+ * %r11 flags: RELOC_KERNEL_*
+ * %r13 original CR4 when relocate_kernel() was invoked
+ */
-identity_mapped:
/* store the start address on the stack */
pushq %rdx
+ /* Create a GDTR (16 bits limit, 64 bits addr) on stack */
+ leaq kexec_debug_gdt(%rip), %rax
+ pushq %rax
+ pushw (%rax)
+
+ /* Load the GDT, put the stack back */
+ lgdt (%rsp)
+ addq $10, %rsp
+
+ /* Test that we can load segments */
+ movq %ds, %rax
+ movq %rax, %ds
+
+ /* Now an IDTR on the stack to load the IDT the kernel created */
+ leaq kexec_debug_idt(%rip), %rsi
+ pushq %rsi
+ pushw $0xff
+ lidt (%rsp)
+ addq $10, %rsp
+
+ //int3
+
+ /*
+ * Clear X86_CR4_CET (if it was set) such that we can clear CR0_WP
+ * below.
+ */
+ movq %cr4, %rax
+ andq $~(X86_CR4_CET), %rax
+ movq %rax, %cr4
+
/*
* Set cr0 to a known state:
* - Paging enabled
@@ -110,7 +176,7 @@ identity_mapped:
* - Write protect disabled
* - No task switch
* - Don't do FP software emulation.
- * - Proctected mode enabled
+ * - Protected mode enabled
*/
movq %cr0, %rax
andq $~(X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %rax
@@ -120,17 +186,37 @@ identity_mapped:
/*
* Set cr4 to a known state:
* - physical address extension enabled
+ * - 5-level paging, if it was enabled before
+ * - Machine check exception on TDX guest, if it was enabled before.
+ * Clearing MCE might not be allowed in TDX guests, depending on setup.
+ *
+ * Use R13 that contains the original CR4 value, read in relocate_kernel().
+ * PAE is always set in the original CR4.
*/
- movq $X86_CR4_PAE, %rax
- movq %rax, %cr4
-
- jmp 1f
-1:
+ andl $(X86_CR4_PAE | X86_CR4_LA57), %r13d
+ ALTERNATIVE "", __stringify(orl $X86_CR4_MCE, %r13d), X86_FEATURE_TDX_GUEST
+ movq %r13, %cr4
/* Flush the TLB (needed?) */
movq %r9, %cr3
- movq %rcx, %r11
+ /*
+ * If the memory cache is in an incoherent state, e.g., due to
+ * memory encryption, do WBINVD to flush cache.
+ *
+ * If SME is active, there could be old encrypted cache line
+ * entries that will conflict with the now unencrypted memory
+ * used by kexec. Flush the caches before copying the kernel.
+ *
+ * Note that SME sets this flag to true when the platform supports
+ * SME, so the WBINVD is performed even if SME is not activated
+ * by the kernel. But this does no harm.
+ */
+ testb $RELOC_KERNEL_CACHE_INCOHERENT, %r11b
+ jz .Lnowbinvd
+ wbinvd
+.Lnowbinvd:
+
call swap_pages
/*
@@ -142,60 +228,94 @@ identity_mapped:
movq %cr3, %rax
movq %rax, %cr3
+ testb $RELOC_KERNEL_PRESERVE_CONTEXT, %r11b
+ jnz .Lrelocate
+
/*
* set all of the registers to known values
* leave %rsp alone
*/
- testq %r11, %r11
- jnz 1f
- xorq %rax, %rax
- xorq %rbx, %rbx
- xorq %rcx, %rcx
- xorq %rdx, %rdx
- xorq %rsi, %rsi
- xorq %rdi, %rdi
- xorq %rbp, %rbp
- xorq %r8, %r8
- xorq %r9, %r9
- xorq %r10, %r9
- xorq %r11, %r11
- xorq %r12, %r12
- xorq %r13, %r13
- xorq %r14, %r14
- xorq %r15, %r15
-
+ xorl %eax, %eax
+ xorl %ebx, %ebx
+ xorl %ecx, %ecx
+ xorl %edx, %edx
+ xorl %esi, %esi
+ xorl %edi, %edi
+ xorl %ebp, %ebp
+ xorl %r8d, %r8d
+ xorl %r9d, %r9d
+ xorl %r10d, %r10d
+ xorl %r11d, %r11d
+ xorl %r12d, %r12d
+ xorl %r13d, %r13d
+ xorl %r14d, %r14d
+ xorl %r15d, %r15d
+
+ ANNOTATE_UNRET_SAFE
ret
+ int3
-1:
+.Lrelocate:
popq %rdx
+
+ /* Use the swap page for the callee's stack */
+ movq kexec_pa_swap_page(%rip), %r10
leaq PAGE_SIZE(%r10), %rsp
+
+ /* push the existing entry point onto the callee's stack */
+ pushq %rdx
+
+ ANNOTATE_RETPOLINE_SAFE
call *%rdx
/* get the re-entry point of the peer system */
- movq 0(%rsp), %rbp
- call 1f
-1:
- popq %r8
- subq $(1b - relocate_kernel), %r8
- movq CP_PA_SWAP_PAGE(%r8), %r10
- movq CP_PA_BACKUP_PAGES_MAP(%r8), %rdi
- movq CP_PA_TABLE_PAGE(%r8), %rax
+ popq %rbp
+ movq kexec_pa_swap_page(%rip), %r10
+ movq pa_backup_pages_map(%rip), %rdi
+ movq kexec_pa_table_page(%rip), %rax
movq %rax, %cr3
+
+ /* Find start (and end) of this physical mapping of control page */
+ leaq (%rip), %r8
+ ANNOTATE_NOENDBR
+ andq $PAGE_MASK, %r8
lea PAGE_SIZE(%r8), %rsp
+ /*
+ * Ensure RELOC_KERNEL_PRESERVE_CONTEXT flag is set so that
+ * swap_pages() can swap pages correctly. Note all other
+ * RELOC_KERNEL_* flags passed to relocate_kernel() are not
+ * restored.
+ */
+ movl $RELOC_KERNEL_PRESERVE_CONTEXT, %r11d
call swap_pages
- movq $virtual_mapped, %rax
+ movq kexec_va_control_page(%rip), %rax
+0: addq $virtual_mapped - 0b, %rax
+ subq $__relocate_kernel_start - 0b, %rax
pushq %rax
+ ANNOTATE_UNRET_SAFE
ret
-
-virtual_mapped:
- movq RSP(%r8), %rsp
- movq CR4(%r8), %rax
+ int3
+SYM_CODE_END(identity_mapped)
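
Editorial note: in C terms, the CR4 handling in identity_mapped above reduces to the following sketch. It is illustrative only and not part of the patch; the helper name is made up, and X86_CR4_* are the usual CR4 bit masks.

/* Illustrative sketch of the CR4 masking done in identity_mapped above. */
static unsigned long kexec_target_cr4(unsigned long saved_cr4, bool tdx_guest)
{
	/* Keep PAE (always set in the saved value) and LA57 if it was on. */
	unsigned long cr4 = saved_cr4 & (X86_CR4_PAE | X86_CR4_LA57);

	/* TDX guests may not be allowed to clear CR4.MCE, so keep it set. */
	if (tdx_guest)
		cr4 |= X86_CR4_MCE;

	return cr4;
}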
+
+SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped)
+ UNWIND_HINT_END_OF_STACK
+ ANNOTATE_NOENDBR // RET target, above
+ movq saved_rsp(%rip), %rsp
+ movq saved_cr4(%rip), %rax
movq %rax, %cr4
- movq CR3(%r8), %rax
- movq CR0(%r8), %r8
+ movq saved_cr3(%rip), %rax
+ movq saved_cr0(%rip), %r8
movq %rax, %cr3
movq %r8, %cr0
+
+#ifdef CONFIG_KEXEC_JUMP
+ /* Saved in save_processor_state. */
+ movq $saved_context, %rax
+ lgdt saved_context_gdt_desc(%rax)
+#endif
+
+ /* relocate_kernel() returns the re-entry point for next time */
movq %rbp, %rax
popf
@@ -205,62 +325,297 @@ virtual_mapped:
popq %r12
popq %rbp
popq %rbx
+ ANNOTATE_UNRET_SAFE
ret
+ int3
+SYM_CODE_END(virtual_mapped)
/* Do the copies */
-swap_pages:
- movq %rdi, %rcx /* Put the page_list in %rcx */
- xorq %rdi, %rdi
- xorq %rsi, %rsi
- jmp 1f
+SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
+ UNWIND_HINT_END_OF_STACK
+ /*
+ * %rdi indirection page
+ * %r11 flags: RELOC_KERNEL_*
+ */
+ movq %rdi, %rcx /* Put the indirection_page in %rcx */
+ xorl %edi, %edi
+ xorl %esi, %esi
+ jmp .Lstart /* Should start with an indirection record */
-0: /* top, read another word for the indirection page */
+.Lloop: /* top, read another word for the indirection page */
movq (%rbx), %rcx
addq $8, %rbx
-1:
- testq $0x1, %rcx /* is it a destination page? */
- jz 2f
+.Lstart:
+ testb $0x1, %cl /* is it a destination page? */
+ jz .Lnotdest
movq %rcx, %rdi
andq $0xfffffffffffff000, %rdi
- jmp 0b
-2:
- testq $0x2, %rcx /* is it an indirection page? */
- jz 2f
+ jmp .Lloop
+.Lnotdest:
+ testb $0x2, %cl /* is it an indirection page? */
+ jz .Lnotind
movq %rcx, %rbx
andq $0xfffffffffffff000, %rbx
- jmp 0b
-2:
- testq $0x4, %rcx /* is it the done indicator? */
- jz 2f
- jmp 3f
-2:
- testq $0x8, %rcx /* is it the source indicator? */
- jz 0b /* Ignore it otherwise */
+ jmp .Lloop
+.Lnotind:
+ testb $0x4, %cl /* is it the done indicator? */
+ jz .Lnotdone
+ jmp .Ldone
+.Lnotdone:
+ testb $0x8, %cl /* is it the source indicator? */
+ jz .Lloop /* Ignore it otherwise */
 	movq %rcx, %rsi /* For every source page do a copy */
andq $0xfffffffffffff000, %rsi
- movq %rdi, %rdx
- movq %rsi, %rax
+ movq %rdi, %rdx /* Save destination page to %rdx */
+ movq %rsi, %rax /* Save source page to %rax */
+
+ /* Only actually swap for ::preserve_context */
+ testb $RELOC_KERNEL_PRESERVE_CONTEXT, %r11b
+ jz .Lnoswap
- movq %r10, %rdi
- movq $512, %rcx
- rep ; movsq
+ /* copy source page to swap page */
+ movq kexec_pa_swap_page(%rip), %rdi
+ movl $512, %ecx
+ rep movsq
+ /* copy destination page to source page */
movq %rax, %rdi
movq %rdx, %rsi
- movq $512, %rcx
- rep ; movsq
+ movl $512, %ecx
+ rep movsq
+ /* copy swap page to destination page */
movq %rdx, %rdi
- movq %r10, %rsi
- movq $512, %rcx
- rep ; movsq
+ movq kexec_pa_swap_page(%rip), %rsi
+.Lnoswap:
+ movl $512, %ecx
+ rep movsq
lea PAGE_SIZE(%rax), %rsi
- jmp 0b
-3:
+ jmp .Lloop
+.Ldone:
+ ANNOTATE_UNRET_SAFE
ret
+ int3
+SYM_CODE_END(swap_pages)
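
Editorial note: the 0x1/0x2/0x4/0x8 bits tested above correspond to the kernel's IND_DESTINATION, IND_INDIRECTION, IND_DONE and IND_SOURCE markers in the indirection page. For the ::preserve_context case, each page is exchanged through the swap page in a three-way copy; a minimal C sketch of one iteration follows (illustrative only, not part of the patch; the helper name is made up).

/* Sketch of one iteration of the copy loop in swap_pages above. */
static void kexec_copy_one_page(void *dest, void *source, void *swap_page,
				bool preserve_context)
{
	if (preserve_context) {
		memcpy(swap_page, source, PAGE_SIZE);	/* save the source page        */
		memcpy(source, dest, PAGE_SIZE);	/* old destination -> source   */
		memcpy(dest, swap_page, PAGE_SIZE);	/* saved source -> destination */
	} else {
		memcpy(dest, source, PAGE_SIZE);	/* plain one-way copy */
	}
}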
+
+/*
+ * Generic 'print character' routine
+ * - %al: Character to be printed (may clobber %rax)
+ * - %rdx: MMIO address or port.
+ */
+#define XMTRDY 0x20
+
+#define TXR 0 /* Transmit register (WRITE) */
+#define LSR 5 /* Line Status */
+
+SYM_CODE_START_LOCAL_NOALIGN(pr_char_8250)
+ UNWIND_HINT_FUNC
+ ANNOTATE_NOENDBR
+ addw $LSR, %dx
+ xchg %al, %ah
+.Lxmtrdy_loop:
+ inb %dx, %al
+ testb $XMTRDY, %al
+ jnz .Lready
+ pause
+ jmp .Lxmtrdy_loop
+
+.Lready:
+ subw $LSR, %dx
+ xchg %al, %ah
+ outb %al, %dx
+pr_char_null:
+ ANNOTATE_NOENDBR
+
+ ANNOTATE_UNRET_SAFE
+ ret
+SYM_CODE_END(pr_char_8250)
+
+SYM_CODE_START_LOCAL_NOALIGN(pr_char_8250_mmio32)
+ UNWIND_HINT_FUNC
+ ANNOTATE_NOENDBR
+.Lxmtrdy_loop_mmio:
+ movb (LSR*4)(%rdx), %ah
+ testb $XMTRDY, %ah
+ jnz .Lready_mmio
+ pause
+ jmp .Lxmtrdy_loop_mmio
+
+.Lready_mmio:
+ movb %al, (%rdx)
+ ANNOTATE_UNRET_SAFE
+ ret
+SYM_CODE_END(pr_char_8250_mmio32)
+
+/*
+ * Load pr_char function pointer into %rsi and load %rdx with whatever
+ * that function wants to see there (typically port/MMIO address).
+ */
+.macro pr_setup
+ leaq pr_char_8250(%rip), %rsi
+ movw kexec_debug_8250_port(%rip), %dx
+ testw %dx, %dx
+ jnz 1f
+
+ leaq pr_char_8250_mmio32(%rip), %rsi
+ movq kexec_debug_8250_mmio32(%rip), %rdx
+ testq %rdx, %rdx
+ jnz 1f
+
+ leaq pr_char_null(%rip), %rsi
+1:
+.endm
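
Editorial note: the macro above selects an output routine with a simple fallback chain. In C it would read roughly as below (illustrative only; the function-pointer variable and its signature are hypothetical, since the assembly routines take the character in %al and the port or MMIO address in %rdx).

/* C-level sketch of the pr_setup fallback chain above. */
static void (*pr_char)(char c);

static void pr_setup_sketch(void)
{
	if (kexec_debug_8250_port)		/* legacy port-I/O 8250 UART */
		pr_char = pr_char_8250;
	else if (kexec_debug_8250_mmio32)	/* MMIO-mapped 8250 UART */
		pr_char = pr_char_8250_mmio32;
	else
		pr_char = pr_char_null;		/* no console: discard output */
}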
+
+/* Print the nybble in %bl, clobber %rax */
+SYM_CODE_START_LOCAL_NOALIGN(pr_nybble)
+ UNWIND_HINT_FUNC
+ movb %bl, %al
+ nop
+ andb $0x0f, %al
+ addb $0x30, %al
+ cmpb $0x3a, %al
+ jb 1f
+ addb $('a' - '0' - 10), %al
+ ANNOTATE_RETPOLINE_SAFE
+1: jmp *%rsi
+SYM_CODE_END(pr_nybble)
+
+SYM_CODE_START_LOCAL_NOALIGN(pr_qword)
+ UNWIND_HINT_FUNC
+ movq $16, %rcx
+1: rolq $4, %rbx
+ call pr_nybble
+ loop 1b
+ movb $'\n', %al
+ ANNOTATE_RETPOLINE_SAFE
+ jmp *%rsi
+SYM_CODE_END(pr_qword)
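
Editorial note: pr_qword prints a 64-bit value by rotating the most significant nybble into the low four bits sixteen times and converting each nybble to ASCII hex. Roughly, in C (illustrative only, not part of the patch):

/* Rough C equivalent of pr_qword/pr_nybble above. */
static void pr_qword_sketch(void (*pr_char)(char c), unsigned long long val)
{
	int i;

	for (i = 0; i < 16; i++) {
		unsigned char nybble;

		val = (val << 4) | (val >> 60);		/* rolq $4 */
		nybble = val & 0x0f;
		pr_char(nybble < 10 ? '0' + nybble : 'a' + nybble - 10);
	}
	pr_char('\n');
}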
+
+.macro print_reg a, b, c, d, r
+ movb $\a, %al
+ ANNOTATE_RETPOLINE_SAFE
+ call *%rsi
+ movb $\b, %al
+ ANNOTATE_RETPOLINE_SAFE
+ call *%rsi
+ movb $\c, %al
+ ANNOTATE_RETPOLINE_SAFE
+ call *%rsi
+ movb $\d, %al
+ ANNOTATE_RETPOLINE_SAFE
+ call *%rsi
+ movq \r, %rbx
+ call pr_qword
+.endm
+
+SYM_CODE_START_NOALIGN(kexec_debug_exc_vectors)
+ /* Each of these is 6 bytes. */
+.macro vec_err exc
+ UNWIND_HINT_ENTRY
+ . = kexec_debug_exc_vectors + (\exc * KEXEC_DEBUG_EXC_HANDLER_SIZE)
+ nop
+ nop
+ pushq $\exc
+ jmp exc_handler
+.endm
+
+.macro vec_noerr exc
+ UNWIND_HINT_ENTRY
+ . = kexec_debug_exc_vectors + (\exc * KEXEC_DEBUG_EXC_HANDLER_SIZE)
+ pushq $0
+ pushq $\exc
+ jmp exc_handler
+.endm
+
+ ANNOTATE_NOENDBR
+ vec_noerr 0 // #DE
+ vec_noerr 1 // #DB
+ vec_noerr 2 // #NMI
+ vec_noerr 3 // #BP
+ vec_noerr 4 // #OF
+ vec_noerr 5 // #BR
+ vec_noerr 6 // #UD
+ vec_noerr 7 // #NM
+ vec_err 8 // #DF
+ vec_noerr 9
+ vec_err 10 // #TS
+ vec_err 11 // #NP
+ vec_err 12 // #SS
+ vec_err 13 // #GP
+ vec_err 14 // #PF
+ vec_noerr 15
+SYM_CODE_END(kexec_debug_exc_vectors)
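
Editorial note: because every stub is padded to KEXEC_DEBUG_EXC_HANDLER_SIZE bytes, the entry point for a vector can be computed by simple arithmetic when the temporary IDT is built; conceptually (illustrative only, the helper is made up):

/* Illustrative: address of the debug stub for a given exception vector. */
static void *kexec_debug_vector_entry(unsigned int vector)
{
	extern char kexec_debug_exc_vectors[];

	return kexec_debug_exc_vectors +
	       vector * KEXEC_DEBUG_EXC_HANDLER_SIZE;
}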
+
+SYM_CODE_START_LOCAL_NOALIGN(exc_handler)
+ /* No need for RET mitigations during kexec */
+ VALIDATE_UNRET_END
+
+ pushq %rax
+ pushq %rbx
+ pushq %rcx
+ pushq %rdx
+ pushq %rsi
+
+ /* Stack frame */
+#define EXC_SS 0x58 /* Architectural... */
+#define EXC_RSP 0x50
+#define EXC_EFLAGS 0x48
+#define EXC_CS 0x40
+#define EXC_RIP 0x38
+#define EXC_ERRORCODE 0x30 /* Either architectural or zero pushed by handler */
+#define EXC_EXCEPTION 0x28 /* Pushed by handler entry point */
+#define EXC_RAX 0x20 /* Pushed just above in exc_handler */
+#define EXC_RBX 0x18
+#define EXC_RCX 0x10
+#define EXC_RDX 0x08
+#define EXC_RSI 0x00
+
+ /* Set up %rdx/%rsi for debug output */
+ pr_setup
+
+ /* rip and exception info */
+ print_reg 'E', 'x', 'c', ':', EXC_EXCEPTION(%rsp)
+ print_reg 'E', 'r', 'r', ':', EXC_ERRORCODE(%rsp)
+ print_reg 'r', 'i', 'p', ':', EXC_RIP(%rsp)
+ print_reg 'r', 's', 'p', ':', EXC_RSP(%rsp)
+
+ /* We spilled these to the stack */
+ print_reg 'r', 'a', 'x', ':', EXC_RAX(%rsp)
+ print_reg 'r', 'b', 'x', ':', EXC_RBX(%rsp)
+ print_reg 'r', 'c', 'x', ':', EXC_RCX(%rsp)
+ print_reg 'r', 'd', 'x', ':', EXC_RDX(%rsp)
+ print_reg 'r', 's', 'i', ':', EXC_RSI(%rsp)
+
+ /* Other registers untouched */
+ print_reg 'r', 'd', 'i', ':', %rdi
+ print_reg 'r', '8', ' ', ':', %r8
+ print_reg 'r', '9', ' ', ':', %r9
+ print_reg 'r', '1', '0', ':', %r10
+ print_reg 'r', '1', '1', ':', %r11
+ print_reg 'r', '1', '2', ':', %r12
+ print_reg 'r', '1', '3', ':', %r13
+ print_reg 'r', '1', '4', ':', %r14
+ print_reg 'r', '1', '5', ':', %r15
+ print_reg 'c', 'r', '2', ':', %cr2
+
+ /* Only return from INT3 */
+ cmpq $3, EXC_EXCEPTION(%rsp)
+ jne .Ldie
+
+ popq %rsi
+ popq %rdx
+ popq %rcx
+ popq %rbx
+ popq %rax
+
+ addq $16, %rsp
+ iretq
+
+.Ldie:
+ hlt
+ jmp .Ldie
- .globl kexec_control_code_size
-.set kexec_control_code_size, . - relocate_kernel
+SYM_CODE_END(exc_handler)
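
Editorial note: the EXC_* offsets above describe the stack layout seen by exc_handler after its five pushes; expressed as a C struct, lowest address first (illustrative only, the struct name is made up):

/* Sketch of the frame addressed by the EXC_* offsets in exc_handler. */
struct kexec_debug_exc_frame {
	unsigned long rsi;		/* 0x00: pushed by exc_handler */
	unsigned long rdx;		/* 0x08 */
	unsigned long rcx;		/* 0x10 */
	unsigned long rbx;		/* 0x18 */
	unsigned long rax;		/* 0x20 */
	unsigned long exception;	/* 0x28: vector, pushed by the stub */
	unsigned long error_code;	/* 0x30: architectural or zero from the stub */
	unsigned long rip;		/* 0x38: architectural frame from here up */
	unsigned long cs;		/* 0x40 */
	unsigned long eflags;		/* 0x48 */
	unsigned long rsp;		/* 0x50 */
	unsigned long ss;		/* 0x58 */
};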
diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c
new file mode 100644
index 000000000000..79bc8a97a083
--- /dev/null
+++ b/arch/x86/kernel/resource.c
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ioport.h>
+#include <linux/printk.h>
+#include <asm/e820/api.h>
+#include <asm/pci_x86.h>
+
+static void resource_clip(struct resource *res, resource_size_t start,
+ resource_size_t end)
+{
+ resource_size_t low = 0, high = 0;
+
+ if (res->end < start || res->start > end)
+ return; /* no conflict */
+
+ if (res->start < start)
+ low = start - res->start;
+
+ if (res->end > end)
+ high = res->end - end;
+
+ /* Keep the area above or below the conflict, whichever is larger */
+ if (low > high)
+ res->end = start - 1;
+ else
+ res->start = end + 1;
+}
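
Editorial note: clipping an available window against a conflicting range keeps whichever remainder is larger. A hypothetical use of resource_clip() with made-up values:

struct resource avail = {
	.start = 0x1000,
	.end   = 0xffff,
	.flags = IORESOURCE_MEM,
};

resource_clip(&avail, 0xc000, 0xcfff);
/*
 * low  = 0xc000 - 0x1000 = 0xb000 bytes of room below the conflict
 * high = 0xffff - 0xcfff = 0x3000 bytes of room above it
 * low > high, so the window is trimmed to [0x1000, 0xbfff].
 */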
+
+static void remove_e820_regions(struct resource *avail)
+{
+ int i;
+ struct e820_entry *entry;
+ u64 e820_start, e820_end;
+ struct resource orig = *avail;
+
+ if (!pci_use_e820)
+ return;
+
+ for (i = 0; i < e820_table->nr_entries; i++) {
+ entry = &e820_table->entries[i];
+ e820_start = entry->addr;
+ e820_end = entry->addr + entry->size - 1;
+
+ resource_clip(avail, e820_start, e820_end);
+ if (orig.start != avail->start || orig.end != avail->end) {
+ pr_info("resource: avoiding allocation from e820 entry [mem %#010Lx-%#010Lx]\n",
+ e820_start, e820_end);
+ if (avail->end > avail->start)
+ /*
+ * Use %pa instead of %pR because "avail"
+ * is typically IORESOURCE_UNSET, so %pR
+ * shows the size instead of addresses.
+ */
+ pr_info("resource: remaining [mem %pa-%pa] available\n",
+ &avail->start, &avail->end);
+ orig = *avail;
+ }
+ }
+}
+
+void arch_remove_reservations(struct resource *avail)
+{
+ /*
+ * Trim out BIOS area (high 2MB) and E820 regions. We do not remove
+ * the low 1MB unconditionally, as this area is needed for some ISA
+ * cards requiring a memory range, e.g. the i82365 PCMCIA controller.
+ */
+ if (avail->flags & IORESOURCE_MEM) {
+ resource_clip(avail, BIOS_ROM_BASE, BIOS_ROM_END);
+
+ remove_e820_regions(avail);
+ }
+}
diff --git a/arch/x86/kernel/rethook.c b/arch/x86/kernel/rethook.c
new file mode 100644
index 000000000000..85e2f2d16a90
--- /dev/null
+++ b/arch/x86/kernel/rethook.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * x86 implementation of rethook. Mostly copied from arch/x86/kernel/kprobes/core.c.
+ */
+#include <linux/bug.h>
+#include <linux/rethook.h>
+#include <linux/kprobes.h>
+#include <linux/objtool.h>
+
+#include "kprobes/common.h"
+
+__visible void arch_rethook_trampoline_callback(struct pt_regs *regs);
+
+#ifndef ANNOTATE_NOENDBR
+#define ANNOTATE_NOENDBR
+#endif
+
+/*
+ * When a target function returns, this code saves registers and calls
+ * arch_rethook_trampoline_callback(), which calls the rethook handler.
+ */
+asm(
+ ".text\n"
+ ".global arch_rethook_trampoline\n"
+ ".type arch_rethook_trampoline, @function\n"
+ "arch_rethook_trampoline:\n"
+#ifdef CONFIG_X86_64
+ ANNOTATE_NOENDBR "\n" /* This is only jumped from ret instruction */
+ /* Push a fake return address to tell the unwinder it's a rethook. */
+ " pushq $arch_rethook_trampoline\n"
+ UNWIND_HINT_FUNC
+ " pushq $" __stringify(__KERNEL_DS) "\n"
+ /* Save the 'sp - 16', this will be fixed later. */
+ " pushq %rsp\n"
+ " pushfq\n"
+ SAVE_REGS_STRING
+ " movq %rsp, %rdi\n"
+ " call arch_rethook_trampoline_callback\n"
+ RESTORE_REGS_STRING
+ /* In the callback function, 'regs->flags' is copied to 'regs->ss'. */
+ " addq $16, %rsp\n"
+ " popfq\n"
+#else
+ /* Push a fake return address to tell the unwinder it's a rethook. */
+ " pushl $arch_rethook_trampoline\n"
+ UNWIND_HINT_FUNC
+ " pushl %ss\n"
+ /* Save the 'sp - 8', this will be fixed later. */
+ " pushl %esp\n"
+ " pushfl\n"
+ SAVE_REGS_STRING
+ " movl %esp, %eax\n"
+ " call arch_rethook_trampoline_callback\n"
+ RESTORE_REGS_STRING
+ /* In the callback function, 'regs->flags' is copied to 'regs->ss'. */
+ " addl $8, %esp\n"
+ " popfl\n"
+#endif
+ ASM_RET
+ ".size arch_rethook_trampoline, .-arch_rethook_trampoline\n"
+);
+NOKPROBE_SYMBOL(arch_rethook_trampoline);
+
+/*
+ * Called from arch_rethook_trampoline
+ */
+__used __visible void arch_rethook_trampoline_callback(struct pt_regs *regs)
+{
+ unsigned long *frame_pointer;
+
+ /* fixup registers */
+ regs->cs = __KERNEL_CS;
+#ifdef CONFIG_X86_32
+ regs->gs = 0;
+#endif
+ regs->ip = (unsigned long)&arch_rethook_trampoline;
+ regs->orig_ax = ~0UL;
+ regs->sp += 2*sizeof(long);
+ frame_pointer = (long *)(regs + 1);
+
+ /*
+	 * The return address at 'frame_pointer' is recovered by
+	 * arch_rethook_fixup_return(), which is called from
+	 * rethook_trampoline_handler().
+ */
+ rethook_trampoline_handler(regs, (unsigned long)frame_pointer);
+
+ /*
+	 * Copy FLAGS to 'pt_regs::ss' so that arch_rethook_trampoline()
+ * can do RET right after POPF.
+ */
+ *(unsigned long *)&regs->ss = regs->flags;
+}
+NOKPROBE_SYMBOL(arch_rethook_trampoline_callback);
+
+/*
+ * arch_rethook_trampoline() skips updating the frame pointer. The frame
+ * pointer saved in arch_rethook_trampoline_callback() points to the real
+ * caller function's frame pointer. Thus arch_rethook_trampoline() doesn't
+ * have a standard stack frame with CONFIG_FRAME_POINTER=y.
+ * Mark it as a non-standard function; the FP unwinder can still unwind
+ * correctly without the hint.
+ */
+STACK_FRAME_NON_STANDARD_FP(arch_rethook_trampoline);
+
+/* This is called from rethook_trampoline_handler(). */
+void arch_rethook_fixup_return(struct pt_regs *regs,
+ unsigned long correct_ret_addr)
+{
+ unsigned long *frame_pointer = (void *)(regs + 1);
+
+ /* Replace fake return address with real one. */
+ *frame_pointer = correct_ret_addr;
+}
+NOKPROBE_SYMBOL(arch_rethook_fixup_return);
+
+void arch_rethook_prepare(struct rethook_node *rh, struct pt_regs *regs, bool mcount)
+{
+ unsigned long *stack = (unsigned long *)regs->sp;
+
+ rh->ret_addr = stack[0];
+ rh->frame = regs->sp;
+
+ /* Replace the return addr with trampoline addr */
+ stack[0] = (unsigned long) arch_rethook_trampoline;
+}
+NOKPROBE_SYMBOL(arch_rethook_prepare);
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 5d465b207e72..51a849a79c98 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -1,145 +1,79 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* RTC related functions
*/
#include <linux/platform_device.h>
#include <linux/mc146818rtc.h>
-#include <linux/acpi.h>
-#include <linux/bcd.h>
+#include <linux/export.h>
#include <linux/pnp.h>
#include <asm/vsyscall.h>
+#include <asm/x86_init.h>
#include <asm/time.h>
+#include <asm/setup.h>
#ifdef CONFIG_X86_32
/*
* This is a special lock that is owned by the CPU and holds the index
* register we are working with. It is required for NMI access to the
- * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details.
+ * CMOS/RTC registers. See arch/x86/include/asm/mc146818rtc.h for details.
*/
volatile unsigned long cmos_lock;
EXPORT_SYMBOL(cmos_lock);
#endif /* CONFIG_X86_32 */
-/* For two digit years assume time is always after that */
-#define CMOS_YEARS_OFFS 2000
-
DEFINE_SPINLOCK(rtc_lock);
EXPORT_SYMBOL(rtc_lock);
/*
- * In order to set the CMOS clock precisely, set_rtc_mmss has to be
+ * In order to set the CMOS clock precisely, mach_set_cmos_time has to be
* called 500 ms after the second nowtime has started, because when
* nowtime is written into the registers of the CMOS clock, it will
* jump to the next second precisely 500 ms later. Check the Motorola
* MC146818A or Dallas DS12887 data sheet for details.
- *
- * BUG: This routine does not handle hour overflow properly; it just
- * sets the minutes. Usually you'll only notice that after reboot!
*/
-int mach_set_rtc_mmss(unsigned long nowtime)
+int mach_set_cmos_time(const struct timespec64 *now)
{
- int real_seconds, real_minutes, cmos_minutes;
- unsigned char save_control, save_freq_select;
+ unsigned long long nowtime = now->tv_sec;
+ struct rtc_time tm;
int retval = 0;
- /* tell the clock it's being set */
- save_control = CMOS_READ(RTC_CONTROL);
- CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL);
-
- /* stop and reset prescaler */
- save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
- CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT);
-
- cmos_minutes = CMOS_READ(RTC_MINUTES);
- if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
- cmos_minutes = bcd2bin(cmos_minutes);
-
- /*
- * since we're only adjusting minutes and seconds,
- * don't interfere with hour overflow. This avoids
- * messing with unknown time zones but requires your
- * RTC not to be off by more than 15 minutes
- */
- real_seconds = nowtime % 60;
- real_minutes = nowtime / 60;
- /* correct for half hour time zone */
- if (((abs(real_minutes - cmos_minutes) + 15)/30) & 1)
- real_minutes += 30;
- real_minutes %= 60;
-
- if (abs(real_minutes - cmos_minutes) < 30) {
- if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
- real_seconds = bin2bcd(real_seconds);
- real_minutes = bin2bcd(real_minutes);
- }
- CMOS_WRITE(real_seconds, RTC_SECONDS);
- CMOS_WRITE(real_minutes, RTC_MINUTES);
+ rtc_time64_to_tm(nowtime, &tm);
+ if (!rtc_valid_tm(&tm)) {
+ retval = mc146818_set_time(&tm);
+ if (retval)
+ printk(KERN_ERR "%s: RTC write failed with error %d\n",
+ __func__, retval);
} else {
- printk(KERN_WARNING
- "set_rtc_mmss: can't update from %d to %d\n",
- cmos_minutes, real_minutes);
- retval = -1;
+ printk(KERN_ERR
+ "%s: Invalid RTC value: write of %llx to RTC failed\n",
+ __func__, nowtime);
+ retval = -EINVAL;
}
-
- /* The following flags have to be released exactly in this order,
- * otherwise the DS12887 (popular MC146818A clone with integrated
- * battery and quartz) will not reset the oscillator and will not
- * update precisely 500 ms later. You won't find this mentioned in
- * the Dallas Semiconductor data sheets, but who believes data
- * sheets anyway ... -- Markus Kuhn
- */
- CMOS_WRITE(save_control, RTC_CONTROL);
- CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
-
return retval;
}
-unsigned long mach_get_cmos_time(void)
+void mach_get_cmos_time(struct timespec64 *now)
{
- unsigned int status, year, mon, day, hour, min, sec, century = 0;
+ struct rtc_time tm;
/*
- * If UIP is clear, then we have >= 244 microseconds before
- * RTC registers will be updated. Spec sheet says that this
- * is the reliable way to read RTC - registers. If UIP is set
- * then the register access might be invalid.
+ * If pm_trace abused the RTC as storage, set the timespec to 0,
+ * which tells the caller that this RTC value is unusable.
*/
- while ((CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP))
- cpu_relax();
-
- sec = CMOS_READ(RTC_SECONDS);
- min = CMOS_READ(RTC_MINUTES);
- hour = CMOS_READ(RTC_HOURS);
- day = CMOS_READ(RTC_DAY_OF_MONTH);
- mon = CMOS_READ(RTC_MONTH);
- year = CMOS_READ(RTC_YEAR);
-
-#ifdef CONFIG_ACPI
- if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
- acpi_gbl_FADT.century)
- century = CMOS_READ(acpi_gbl_FADT.century);
-#endif
-
- status = CMOS_READ(RTC_CONTROL);
- WARN_ON_ONCE(RTC_ALWAYS_BCD && (status & RTC_DM_BINARY));
-
- if (RTC_ALWAYS_BCD || !(status & RTC_DM_BINARY)) {
- sec = bcd2bin(sec);
- min = bcd2bin(min);
- hour = bcd2bin(hour);
- day = bcd2bin(day);
- mon = bcd2bin(mon);
- year = bcd2bin(year);
+ if (!pm_trace_rtc_valid()) {
+ now->tv_sec = now->tv_nsec = 0;
+ return;
}
- if (century) {
- century = bcd2bin(century);
- year += century * 100;
- printk(KERN_INFO "Extended CMOS year: %d\n", century * 100);
- } else
- year += CMOS_YEARS_OFFS;
+ if (mc146818_get_time(&tm, 1000)) {
+ pr_err("Unable to read current time from RTC\n");
+ now->tv_sec = now->tv_nsec = 0;
+ return;
+ }
- return mktime(year, mon, day, hour, min, sec);
+ now->tv_sec = rtc_tm_to_time64(&tm);
+ now->tv_nsec = 0;
}
/* Routines for accessing the CMOS RAM/RTC. */
@@ -165,40 +99,16 @@ void rtc_cmos_write(unsigned char val, unsigned char addr)
}
EXPORT_SYMBOL(rtc_cmos_write);
-static int set_rtc_mmss(unsigned long nowtime)
+int update_persistent_clock64(struct timespec64 now)
{
- unsigned long flags;
- int retval;
-
- spin_lock_irqsave(&rtc_lock, flags);
- retval = set_wallclock(nowtime);
- spin_unlock_irqrestore(&rtc_lock, flags);
-
- return retval;
+ return x86_platform.set_wallclock(&now);
}
/* not static: needed by APM */
-unsigned long read_persistent_clock(void)
-{
- unsigned long retval, flags;
-
- spin_lock_irqsave(&rtc_lock, flags);
- retval = get_wallclock();
- spin_unlock_irqrestore(&rtc_lock, flags);
-
- return retval;
-}
-
-int update_persistent_clock(struct timespec now)
-{
- return set_rtc_mmss(now.tv_sec);
-}
-
-unsigned long long native_read_tsc(void)
+void read_persistent_clock64(struct timespec64 *ts)
{
- return __native_read_tsc();
+ x86_platform.get_wallclock(ts);
}
-EXPORT_SYMBOL(native_read_tsc);
static struct resource rtc_resources[] = {
@@ -224,21 +134,20 @@ static struct platform_device rtc_device = {
static __init int add_rtc_cmos(void)
{
#ifdef CONFIG_PNP
- static const char *ids[] __initconst =
+ static const char * const ids[] __initconst =
{ "PNP0b00", "PNP0b01", "PNP0b02", };
struct pnp_dev *dev;
- struct pnp_id *id;
int i;
pnp_for_each_dev(dev) {
- for (id = dev->id; id; id = id->next) {
- for (i = 0; i < ARRAY_SIZE(ids); i++) {
- if (compare_pnp_id(id, ids[i]) != 0)
- return 0;
- }
+ for (i = 0; i < ARRAY_SIZE(ids); i++) {
+ if (compare_pnp_id(dev->id, ids[i]) != 0)
+ return 0;
}
}
#endif
+ if (!x86_platform.legacy.rtc)
+ return -ENODEV;
platform_device_register(&rtc_device);
dev_info(&rtc_device.dev,
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 63f32d220ef2..1b2edd07a3e1 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1,201 +1,120 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 1995 Linus Torvalds
*
- * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
- *
- * Memory region support
- * David Parsons <orc@pell.chi.il.us>, July-August 1999
- *
- * Added E820 sanitization routine (removes overlapping memory regions);
- * Brian Moyle <bmoyle@mvista.com>, February 2001
- *
- * Moved CPU detection code to cpu/${cpu}.c
- * Patrick Mochel <mochel@osdl.org>, March 2002
- *
- * Provisions for empty E820 memory regions (reported by certain BIOSes).
- * Alex Achenbach <xela@slit.de>, December 2002.
- *
- */
-
-/*
- * This file handles the architecture-dependent parts of initialization
+ * This file contains the setup_arch() code, which handles the architecture-dependent
+ * parts of early kernel initialization.
*/
-
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/mmzone.h>
-#include <linux/screen_info.h>
-#include <linux/ioport.h>
#include <linux/acpi.h>
-#include <linux/apm_bios.h>
-#include <linux/initrd.h>
-#include <linux/bootmem.h>
-#include <linux/seq_file.h>
#include <linux/console.h>
-#include <linux/mca.h>
-#include <linux/root_dev.h>
-#include <linux/highmem.h>
-#include <linux/module.h>
+#include <linux/cpu.h>
+#include <linux/crash_dump.h>
+#include <linux/dma-map-ops.h>
#include <linux/efi.h>
-#include <linux/init.h>
-#include <linux/edd.h>
+#include <linux/hugetlb.h>
+#include <linux/ima.h>
+#include <linux/init_ohci1394_dma.h>
+#include <linux/initrd.h>
#include <linux/iscsi_ibft.h>
-#include <linux/nodemask.h>
-#include <linux/kexec.h>
-#include <linux/dmi.h>
-#include <linux/pfn.h>
+#include <linux/memblock.h>
+#include <linux/panic_notifier.h>
#include <linux/pci.h>
-#include <asm/pci-direct.h>
-#include <linux/init_ohci1394_dma.h>
-#include <linux/kvm_para.h>
-
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/stddef.h>
-#include <linux/unistd.h>
-#include <linux/ptrace.h>
-#include <linux/slab.h>
-#include <linux/user.h>
-#include <linux/delay.h>
-
-#include <linux/kallsyms.h>
-#include <linux/cpufreq.h>
-#include <linux/dma-mapping.h>
-#include <linux/ctype.h>
-#include <linux/uaccess.h>
-
-#include <linux/percpu.h>
-#include <linux/crash_dump.h>
+#include <linux/random.h>
+#include <linux/root_dev.h>
+#include <linux/static_call.h>
+#include <linux/swiotlb.h>
+#include <linux/tboot.h>
+#include <linux/usb/xhci-dbgp.h>
+#include <linux/vmalloc.h>
-#include <video/edid.h>
+#include <uapi/linux/mount.h>
+
+#include <xen/xen.h>
-#include <asm/mtrr.h>
#include <asm/apic.h>
-#include <asm/e820.h>
-#include <asm/mpspec.h>
-#include <asm/setup.h>
-#include <asm/efi.h>
-#include <asm/timer.h>
-#include <asm/i8259.h>
-#include <asm/sections.h>
-#include <asm/dmi.h>
-#include <asm/io_apic.h>
-#include <asm/ist.h>
-#include <asm/vmi.h>
-#include <asm/setup_arch.h>
#include <asm/bios_ebda.h>
-#include <asm/cacheflush.h>
-#include <asm/processor.h>
#include <asm/bugs.h>
-
-#include <asm/system.h>
-#include <asm/vsyscall.h>
+#include <asm/cacheinfo.h>
+#include <asm/coco.h>
#include <asm/cpu.h>
-#include <asm/desc.h>
-#include <asm/dma.h>
-#include <asm/iommu.h>
+#include <asm/efi.h>
#include <asm/gart.h>
-#include <asm/mmu_context.h>
-#include <asm/proto.h>
-
-#include <asm/paravirt.h>
#include <asm/hypervisor.h>
-
-#include <asm/percpu.h>
-#include <asm/topology.h>
-#include <asm/apicdef.h>
-#ifdef CONFIG_X86_64
-#include <asm/numa_64.h>
-#endif
-
-#ifndef ARCH_SETUP
-#define ARCH_SETUP
-#endif
+#include <asm/io_apic.h>
+#include <asm/kasan.h>
+#include <asm/kaslr.h>
+#include <asm/mce.h>
+#include <asm/memtype.h>
+#include <asm/mtrr.h>
+#include <asm/nmi.h>
+#include <asm/numa.h>
+#include <asm/olpc_ofw.h>
+#include <asm/pci-direct.h>
+#include <asm/prom.h>
+#include <asm/proto.h>
+#include <asm/realmode.h>
+#include <asm/thermal.h>
+#include <asm/unwind.h>
+#include <asm/vsyscall.h>
/*
- * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
- * The direct mapping extends to max_pfn_mapped, so that we can directly access
- * apertures, ACPI and other tables without having to play with fixmaps.
+ * max_low_pfn_mapped: highest directly mapped pfn < 4 GB
+ * max_pfn_mapped: highest directly mapped pfn > 4 GB
+ *
+ * The direct mapping only covers E820_TYPE_RAM regions, so the ranges and gaps are
+ * represented by pfn_mapped[].
*/
unsigned long max_low_pfn_mapped;
unsigned long max_pfn_mapped;
+#ifdef CONFIG_DMI
RESERVE_BRK(dmi_alloc, 65536);
+#endif
-unsigned int boot_cpu_id __read_mostly;
-
-static __initdata unsigned long _brk_start = (unsigned long)__brk_base;
-unsigned long _brk_end = (unsigned long)__brk_base;
-
-#ifdef CONFIG_X86_64
-int default_cpu_present_to_apicid(int mps_cpu)
-{
- return __default_cpu_present_to_apicid(mps_cpu);
-}
-int default_check_phys_apicid_present(int boot_cpu_physical_apicid)
-{
- return __default_check_phys_apicid_present(boot_cpu_physical_apicid);
-}
-#endif
+unsigned long _brk_start = (unsigned long)__brk_base;
+unsigned long _brk_end = (unsigned long)__brk_base;
-#ifndef CONFIG_DEBUG_BOOT_PARAMS
-struct boot_params __initdata boot_params;
-#else
struct boot_params boot_params;
-#endif
/*
- * Machine setup..
+ * These are the four main kernel memory regions; we put them into
+ * the resource tree so that kdump tools and other debugging tools
+ * can recover them:
*/
+
+static struct resource rodata_resource = {
+ .name = "Kernel rodata",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
+};
+
static struct resource data_resource = {
.name = "Kernel data",
.start = 0,
.end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+ .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
};
static struct resource code_resource = {
.name = "Kernel code",
.start = 0,
.end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+ .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
};
static struct resource bss_resource = {
.name = "Kernel bss",
.start = 0,
.end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+ .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
};
#ifdef CONFIG_X86_32
-static struct resource video_ram_resource = {
- .name = "Video RAM area",
- .start = 0xa0000,
- .end = 0xbffff,
- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-/* cpu data as detected by the assembly code in head.S */
-struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1};
-/* common cpu data for all cpus */
-struct cpuinfo_x86 boot_cpu_data __read_mostly = {0, 0, 0, 0, -1, 1, 0, 0, -1};
-EXPORT_SYMBOL(boot_cpu_data);
-static void set_mca_bus(int x)
-{
-#ifdef CONFIG_MCA
- MCA_bus = x;
-#endif
-}
-
-unsigned int def_to_bigsmp;
-
-/* for MCA, but anyone else can use it if they want */
-unsigned int machine_id;
-unsigned int machine_submodel_id;
-unsigned int BIOS_revision;
+/* CPU data as detected by the assembly code in head_32.S */
+struct cpuinfo_x86 new_cpu_data;
struct apm_info apm_info;
EXPORT_SYMBOL(apm_info);
@@ -208,30 +127,96 @@ EXPORT_SYMBOL(ist_info);
struct ist_info ist_info;
#endif
-#else
-struct cpuinfo_x86 boot_cpu_data __read_mostly = {
- .x86_phys_bits = MAX_PHYSMEM_BITS,
-};
-EXPORT_SYMBOL(boot_cpu_data);
#endif
+struct cpuinfo_x86 boot_cpu_data __read_mostly;
+EXPORT_SYMBOL(boot_cpu_data);
+SYM_PIC_ALIAS(boot_cpu_data);
#if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
-unsigned long mmu_cr4_features;
+__visible unsigned long mmu_cr4_features __ro_after_init;
#else
-unsigned long mmu_cr4_features = X86_CR4_PAE;
+__visible unsigned long mmu_cr4_features __ro_after_init = X86_CR4_PAE;
+#endif
+
+#ifdef CONFIG_IMA
+static phys_addr_t ima_kexec_buffer_phys;
+static size_t ima_kexec_buffer_size;
#endif
/* Boot loader ID and version as integers, for the benefit of proc_dointvec */
int bootloader_type, bootloader_version;
+static const struct ctl_table x86_sysctl_table[] = {
+ {
+ .procname = "unknown_nmi_panic",
+ .data = &unknown_nmi_panic,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "panic_on_unrecovered_nmi",
+ .data = &panic_on_unrecovered_nmi,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "panic_on_io_nmi",
+ .data = &panic_on_io_nmi,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "bootloader_type",
+ .data = &bootloader_type,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "bootloader_version",
+ .data = &bootloader_version,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "io_delay_type",
+ .data = &io_delay_type,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#if defined(CONFIG_ACPI_SLEEP)
+ {
+ .procname = "acpi_video_flags",
+ .data = &acpi_realmode_flags,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+#endif
+};
+
+static int __init init_x86_sysctl(void)
+{
+ register_sysctl_init("kernel", x86_sysctl_table);
+ return 0;
+}
+arch_initcall(init_x86_sysctl);
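
Editorial note: these entries are registered under /proc/sys/kernel/. For instance, the bootloader type exported above can be read back from userspace; a hypothetical standalone example, not part of the patch:

#include <stdio.h>

int main(void)
{
	int type = 0;
	FILE *f = fopen("/proc/sys/kernel/bootloader_type", "r");

	if (!f)
		return 1;
	if (fscanf(f, "%d", &type) == 1)
		printf("bootloader_type = %d\n", type);	/* decoded type/version word */
	fclose(f);
	return 0;
}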
+
/*
* Setup options
*/
struct screen_info screen_info;
EXPORT_SYMBOL(screen_info);
+#if defined(CONFIG_FIRMWARE_EDID)
struct edid_info edid_info;
EXPORT_SYMBOL_GPL(edid_info);
+#endif
extern int root_mountflags;
@@ -243,7 +228,8 @@ unsigned long saved_video_mode;
static char __initdata command_line[COMMAND_LINE_SIZE];
#ifdef CONFIG_CMDLINE_BOOL
-static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE;
+char builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE;
+bool builtin_cmdline_added __ro_after_init;
#endif
#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
@@ -256,7 +242,7 @@ EXPORT_SYMBOL(edd);
* from boot_params into a safe place.
*
*/
-static inline void copy_edd(void)
+static inline void __init copy_edd(void)
{
memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
sizeof(edd.mbr_signature));
@@ -265,7 +251,7 @@ static inline void copy_edd(void)
edd.edd_info_nr = boot_params.eddbuf_entries;
}
#else
-static inline void copy_edd(void)
+static inline void __init copy_edd(void)
{
}
#endif
@@ -289,16 +275,8 @@ void * __init extend_brk(size_t size, size_t align)
return ret;
}
-#ifdef CONFIG_X86_64
-static void __init init_gbpages(void)
-{
- if (direct_gbpages && cpu_has_gbpages)
- printk(KERN_INFO "Using GB pages for direct mapping\n");
- else
- direct_gbpages = 0;
-}
-#else
-static inline void init_gbpages(void)
+#ifdef CONFIG_X86_32
+static void __init cleanup_highmap(void)
{
}
#endif
@@ -306,7 +284,8 @@ static inline void init_gbpages(void)
static void __init reserve_brk(void)
{
if (_brk_end > _brk_start)
- reserve_early(__pa(_brk_start), __pa(_brk_end), "BRK");
+ memblock_reserve_kern(__pa_symbol(_brk_start),
+ _brk_end - _brk_start);
/* Mark brk area as locked down and no longer taking any
new allocations */
@@ -315,99 +294,92 @@ static void __init reserve_brk(void)
#ifdef CONFIG_BLK_DEV_INITRD
-#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
-static void __init relocate_initrd(void)
+static u64 __init get_ramdisk_image(void)
{
-
u64 ramdisk_image = boot_params.hdr.ramdisk_image;
- u64 ramdisk_size = boot_params.hdr.ramdisk_size;
- u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
- u64 ramdisk_here;
- unsigned long slop, clen, mapaddr;
- char *p, *q;
- /* We need to move the initrd down into lowmem */
- ramdisk_here = find_e820_area(0, end_of_lowmem, ramdisk_size,
- PAGE_SIZE);
+ ramdisk_image |= (u64)boot_params.ext_ramdisk_image << 32;
+
+ if (ramdisk_image == 0)
+ ramdisk_image = phys_initrd_start;
+
+ return ramdisk_image;
+}
+static u64 __init get_ramdisk_size(void)
+{
+ u64 ramdisk_size = boot_params.hdr.ramdisk_size;
+
+ ramdisk_size |= (u64)boot_params.ext_ramdisk_size << 32;
+
+ if (ramdisk_size == 0)
+ ramdisk_size = phys_initrd_size;
+
+ return ramdisk_size;
+}
- if (ramdisk_here == -1ULL)
+static void __init relocate_initrd(void)
+{
+ /* Assume only end is not page aligned */
+ u64 ramdisk_image = get_ramdisk_image();
+ u64 ramdisk_size = get_ramdisk_size();
+ u64 area_size = PAGE_ALIGN(ramdisk_size);
+ int ret = 0;
+
+ /* We need to move the initrd down into directly mapped mem */
+ u64 relocated_ramdisk = memblock_phys_alloc_range(area_size, PAGE_SIZE, 0,
+ PFN_PHYS(max_pfn_mapped));
+ if (!relocated_ramdisk)
panic("Cannot find place for new RAMDISK of size %lld\n",
- ramdisk_size);
+ ramdisk_size);
- /* Note: this includes all the lowmem currently occupied by
- the initrd, we rely on that fact to keep the data intact. */
- reserve_early(ramdisk_here, ramdisk_here + ramdisk_size,
- "NEW RAMDISK");
- initrd_start = ramdisk_here + PAGE_OFFSET;
+ initrd_start = relocated_ramdisk + PAGE_OFFSET;
initrd_end = initrd_start + ramdisk_size;
- printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
- ramdisk_here, ramdisk_here + ramdisk_size);
-
- q = (char *)initrd_start;
-
- /* Copy any lowmem portion of the initrd */
- if (ramdisk_image < end_of_lowmem) {
- clen = end_of_lowmem - ramdisk_image;
- p = (char *)__va(ramdisk_image);
- memcpy(q, p, clen);
- q += clen;
- ramdisk_image += clen;
- ramdisk_size -= clen;
- }
+ printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n",
+ relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1);
- /* Copy the highmem portion of the initrd */
- while (ramdisk_size) {
- slop = ramdisk_image & ~PAGE_MASK;
- clen = ramdisk_size;
- if (clen > MAX_MAP_CHUNK-slop)
- clen = MAX_MAP_CHUNK-slop;
- mapaddr = ramdisk_image & PAGE_MASK;
- p = early_memremap(mapaddr, clen+slop);
- memcpy(q, p+slop, clen);
- early_iounmap(p, clen+slop);
- q += clen;
- ramdisk_image += clen;
- ramdisk_size -= clen;
- }
- /* high pages is not converted by early_res_to_bootmem */
- ramdisk_image = boot_params.hdr.ramdisk_image;
- ramdisk_size = boot_params.hdr.ramdisk_size;
- printk(KERN_INFO "Move RAMDISK from %016llx - %016llx to"
- " %08llx - %08llx\n",
+ ret = copy_from_early_mem((void *)initrd_start, ramdisk_image, ramdisk_size);
+ if (ret)
+ panic("Copy RAMDISK failed\n");
+
+ printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
+ " [mem %#010llx-%#010llx]\n",
ramdisk_image, ramdisk_image + ramdisk_size - 1,
- ramdisk_here, ramdisk_here + ramdisk_size - 1);
+ relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1);
}
-static void __init reserve_initrd(void)
+static void __init early_reserve_initrd(void)
{
- u64 ramdisk_image = boot_params.hdr.ramdisk_image;
- u64 ramdisk_size = boot_params.hdr.ramdisk_size;
- u64 ramdisk_end = ramdisk_image + ramdisk_size;
- u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
+ /* Assume only end is not page aligned */
+ u64 ramdisk_image = get_ramdisk_image();
+ u64 ramdisk_size = get_ramdisk_size();
+ u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
if (!boot_params.hdr.type_of_loader ||
!ramdisk_image || !ramdisk_size)
return; /* No initrd provided by bootloader */
- initrd_start = 0;
+ memblock_reserve_kern(ramdisk_image, ramdisk_end - ramdisk_image);
+}
- if (ramdisk_size >= (end_of_lowmem>>1)) {
- free_early(ramdisk_image, ramdisk_end);
- printk(KERN_ERR "initrd too large to handle, "
- "disabling initrd\n");
- return;
- }
+static void __init reserve_initrd(void)
+{
+ /* Assume only end is not page aligned */
+ u64 ramdisk_image = get_ramdisk_image();
+ u64 ramdisk_size = get_ramdisk_size();
+ u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
- printk(KERN_INFO "RAMDISK: %08llx - %08llx\n", ramdisk_image,
- ramdisk_end);
+ if (!boot_params.hdr.type_of_loader ||
+ !ramdisk_image || !ramdisk_size)
+ return; /* No initrd provided by bootloader */
+
+ initrd_start = 0;
+ printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image,
+ ramdisk_end - 1);
- if (ramdisk_end <= end_of_lowmem) {
- /* All in lowmem, easy case */
- /*
- * don't need to reserve again, already reserved early
- * in i386_start_kernel
- */
+ if (pfn_range_is_mapped(PFN_DOWN(ramdisk_image),
+ PFN_DOWN(ramdisk_end))) {
+ /* All are mapped, easy case */
initrd_start = ramdisk_image + PAGE_OFFSET;
initrd_end = initrd_start + ramdisk_size;
return;
@@ -415,172 +387,243 @@ static void __init reserve_initrd(void)
relocate_initrd();
- free_early(ramdisk_image, ramdisk_end);
+ memblock_phys_free(ramdisk_image, ramdisk_end - ramdisk_image);
}
+
#else
+static void __init early_reserve_initrd(void)
+{
+}
static void __init reserve_initrd(void)
{
}
#endif /* CONFIG_BLK_DEV_INITRD */
-static void __init parse_setup_data(void)
+static void __init add_early_ima_buffer(u64 phys_addr)
{
- struct setup_data *data;
- u64 pa_data;
+#ifdef CONFIG_IMA
+ struct ima_setup_data *data;
- if (boot_params.hdr.version < 0x0209)
+ data = early_memremap(phys_addr + sizeof(struct setup_data), sizeof(*data));
+ if (!data) {
+ pr_warn("setup: failed to memremap ima_setup_data entry\n");
return;
- pa_data = boot_params.hdr.setup_data;
- while (pa_data) {
- data = early_memremap(pa_data, PAGE_SIZE);
- switch (data->type) {
- case SETUP_E820_EXT:
- parse_e820_ext(data, pa_data);
- break;
- default:
- break;
- }
- pa_data = data->next;
- early_iounmap(data, PAGE_SIZE);
}
+
+ if (data->size) {
+ memblock_reserve_kern(data->addr, data->size);
+ ima_kexec_buffer_phys = data->addr;
+ ima_kexec_buffer_size = data->size;
+ }
+
+ early_memunmap(data, sizeof(*data));
+#else
+ pr_warn("Passed IMA kexec data, but CONFIG_IMA not set. Ignoring.\n");
+#endif
}
-static void __init e820_reserve_setup_data(void)
+#if defined(CONFIG_HAVE_IMA_KEXEC) && !defined(CONFIG_OF_FLATTREE)
+int __init ima_free_kexec_buffer(void)
{
- struct setup_data *data;
- u64 pa_data;
- int found = 0;
+ if (!ima_kexec_buffer_size)
+ return -ENOENT;
+
+ memblock_free_late(ima_kexec_buffer_phys,
+ ima_kexec_buffer_size);
+
+ ima_kexec_buffer_phys = 0;
+ ima_kexec_buffer_size = 0;
+
+ return 0;
+}
+
+int __init ima_get_kexec_buffer(void **addr, size_t *size)
+{
+ if (!ima_kexec_buffer_size)
+ return -ENOENT;
- if (boot_params.hdr.version < 0x0209)
+ *addr = __va(ima_kexec_buffer_phys);
+ *size = ima_kexec_buffer_size;
+
+ return 0;
+}
+#endif
+
+static void __init add_kho(u64 phys_addr, u32 data_len)
+{
+ struct kho_data *kho;
+ u64 addr = phys_addr + sizeof(struct setup_data);
+ u64 size = data_len - sizeof(struct setup_data);
+
+ if (!IS_ENABLED(CONFIG_KEXEC_HANDOVER)) {
+ pr_warn("Passed KHO data, but CONFIG_KEXEC_HANDOVER not set. Ignoring.\n");
return;
- pa_data = boot_params.hdr.setup_data;
- while (pa_data) {
- data = early_memremap(pa_data, sizeof(*data));
- e820_update_range(pa_data, sizeof(*data)+data->len,
- E820_RAM, E820_RESERVED_KERN);
- found = 1;
- pa_data = data->next;
- early_iounmap(data, sizeof(*data));
}
- if (!found)
+
+ kho = early_memremap(addr, size);
+ if (!kho) {
+ pr_warn("setup: failed to memremap kho data (0x%llx, 0x%llx)\n",
+ addr, size);
return;
+ }
- sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
- memcpy(&e820_saved, &e820, sizeof(struct e820map));
- printk(KERN_INFO "extended physical RAM map:\n");
- e820_print_map("reserve setup_data");
+ kho_populate(kho->fdt_addr, kho->fdt_size, kho->scratch_addr, kho->scratch_size);
+
+ early_memunmap(kho, size);
}
-static void __init reserve_early_setup_data(void)
+static void __init parse_setup_data(void)
{
struct setup_data *data;
- u64 pa_data;
- char buf[32];
+ u64 pa_data, pa_next;
- if (boot_params.hdr.version < 0x0209)
- return;
pa_data = boot_params.hdr.setup_data;
while (pa_data) {
+ u32 data_len, data_type;
+
data = early_memremap(pa_data, sizeof(*data));
- sprintf(buf, "setup data %x", data->type);
- reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
- pa_data = data->next;
- early_iounmap(data, sizeof(*data));
+ data_len = data->len + sizeof(struct setup_data);
+ data_type = data->type;
+ pa_next = data->next;
+ early_memunmap(data, sizeof(*data));
+
+ switch (data_type) {
+ case SETUP_E820_EXT:
+ e820__memory_setup_extended(pa_data, data_len);
+ break;
+ case SETUP_DTB:
+ add_dtb(pa_data);
+ break;
+ case SETUP_EFI:
+ parse_efi_setup(pa_data, data_len);
+ break;
+ case SETUP_IMA:
+ add_early_ima_buffer(pa_data);
+ break;
+ case SETUP_KEXEC_KHO:
+ add_kho(pa_data, data_len);
+ break;
+ case SETUP_RNG_SEED:
+ data = early_memremap(pa_data, data_len);
+ add_bootloader_randomness(data->data, data->len);
+ /* Zero seed for forward secrecy. */
+ memzero_explicit(data->data, data->len);
+ /* Zero length in case we find ourselves back here by accident. */
+ memzero_explicit(&data->len, sizeof(data->len));
+ early_memunmap(data, data_len);
+ break;
+ default:
+ break;
+ }
+ pa_data = pa_next;
}
}
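
Editorial note: the pa_data chain walked above is a singly linked list of physical addresses handed over by the boot loader; each node has the layout of struct setup_data from the x86 boot protocol (arch/x86/include/uapi/asm/bootparam.h), shown here for reference:

/* Layout of one setup_data node, as defined by the x86 boot protocol. */
struct setup_data {
	__u64 next;	/* physical address of the next node, 0 terminates */
	__u32 type;	/* SETUP_E820_EXT, SETUP_EFI, SETUP_IMA, ...       */
	__u32 len;	/* length of data[] in bytes                       */
	__u8  data[];
};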
/*
- * --------- Crashkernel reservation ------------------------------
+ * Translate the fields of 'struct boot_param' into global variables
+ * representing these parameters.
*/
+static void __init parse_boot_params(void)
+{
+ ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
+ screen_info = boot_params.screen_info;
+#if defined(CONFIG_FIRMWARE_EDID)
+ edid_info = boot_params.edid_info;
+#endif
+#ifdef CONFIG_X86_32
+ apm_info.bios = boot_params.apm_bios_info;
+ ist_info = boot_params.ist_info;
+#endif
+ saved_video_mode = boot_params.hdr.vid_mode;
+ bootloader_type = boot_params.hdr.type_of_loader;
+ if ((bootloader_type >> 4) == 0xe) {
+ bootloader_type &= 0xf;
+ bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4;
+ }
+ bootloader_version = bootloader_type & 0xf;
+ bootloader_version |= boot_params.hdr.ext_loader_ver << 4;
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_BLK_DEV_RAM
+ rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
+#endif
+#ifdef CONFIG_EFI
+ if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
+ EFI32_LOADER_SIGNATURE, 4)) {
+ set_bit(EFI_BOOT, &efi.flags);
+ } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
+ EFI64_LOADER_SIGNATURE, 4)) {
+ set_bit(EFI_BOOT, &efi.flags);
+ set_bit(EFI_64BIT, &efi.flags);
+ }
+#endif
-/**
- * Reserve @size bytes of crashkernel memory at any suitable offset.
- *
- * @size: Size of the crashkernel memory to reserve.
- * Returns the base address on success, and -1ULL on failure.
- */
-static
-unsigned long long __init find_and_reserve_crashkernel(unsigned long long size)
+ if (!boot_params.hdr.root_flags)
+ root_mountflags &= ~MS_RDONLY;
+}
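
Editorial note: a worked example of the extended-loader decoding above, with made-up field values (standalone userspace C, illustrative only):

#include <stdio.h>

int main(void)
{
	unsigned int type_of_loader  = 0xe4;	/* high nybble 0xe => use ext fields */
	unsigned int ext_loader_type = 0x05;
	unsigned int ext_loader_ver  = 0x23;
	unsigned int bootloader_type, bootloader_version;

	bootloader_type = type_of_loader;
	if ((bootloader_type >> 4) == 0xe) {
		bootloader_type &= 0xf;
		bootloader_type |= (ext_loader_type + 0x10) << 4;
	}
	bootloader_version  = bootloader_type & 0xf;
	bootloader_version |= ext_loader_ver << 4;

	printf("type=0x%x version=0x%x\n", bootloader_type, bootloader_version);
	/* prints: type=0x154 version=0x234 */
	return 0;
}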
+
+static void __init memblock_x86_reserve_range_setup_data(void)
{
- const unsigned long long alignment = 16<<20; /* 16M */
- unsigned long long start = 0LL;
+ struct setup_indirect *indirect;
+ struct setup_data *data;
+ u64 pa_data, pa_next;
+ u32 len;
- while (1) {
- int ret;
+ pa_data = boot_params.hdr.setup_data;
+ while (pa_data) {
+ data = early_memremap(pa_data, sizeof(*data));
+ if (!data) {
+ pr_warn("setup: failed to memremap setup_data entry\n");
+ return;
+ }
- start = find_e820_area(start, ULONG_MAX, size, alignment);
- if (start == -1ULL)
- return start;
+ len = sizeof(*data);
+ pa_next = data->next;
- /* try to reserve it */
- ret = reserve_bootmem_generic(start, size, BOOTMEM_EXCLUSIVE);
- if (ret >= 0)
- return start;
+ memblock_reserve_kern(pa_data, sizeof(*data) + data->len);
- start += alignment;
- }
-}
+ if (data->type == SETUP_INDIRECT) {
+ len += data->len;
+ early_memunmap(data, sizeof(*data));
+ data = early_memremap(pa_data, len);
+ if (!data) {
+ pr_warn("setup: failed to memremap indirect setup_data\n");
+ return;
+ }
-static inline unsigned long long get_total_mem(void)
-{
- unsigned long long total;
+ indirect = (struct setup_indirect *)data->data;
- total = max_low_pfn - min_low_pfn;
-#ifdef CONFIG_HIGHMEM
- total += highend_pfn - highstart_pfn;
-#endif
+ if (indirect->type != SETUP_INDIRECT)
+ memblock_reserve_kern(indirect->addr, indirect->len);
+ }
- return total << PAGE_SHIFT;
+ pa_data = pa_next;
+ early_memunmap(data, len);
+ }
}
-static void __init reserve_crashkernel(void)
+static void __init arch_reserve_crashkernel(void)
{
- unsigned long long total_mem;
- unsigned long long crash_size, crash_base;
+ unsigned long long crash_base, crash_size, low_size = 0, cma_size = 0;
+ bool high = false;
int ret;
- total_mem = get_total_mem();
+ if (!IS_ENABLED(CONFIG_CRASH_RESERVE))
+ return;
- ret = parse_crashkernel(boot_command_line, total_mem,
- &crash_size, &crash_base);
- if (ret != 0 || crash_size <= 0)
+ ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
+ &crash_size, &crash_base,
+ &low_size, &cma_size, &high);
+ if (ret)
return;
- /* 0 means: find the address automatically */
- if (crash_base <= 0) {
- crash_base = find_and_reserve_crashkernel(crash_size);
- if (crash_base == -1ULL) {
- pr_info("crashkernel reservation failed. "
- "No suitable area found.\n");
- return;
- }
- } else {
- ret = reserve_bootmem_generic(crash_base, crash_size,
- BOOTMEM_EXCLUSIVE);
- if (ret < 0) {
- pr_info("crashkernel reservation failed - "
- "memory is in use\n");
- return;
- }
+ if (xen_pv_domain()) {
+ pr_info("Ignoring crashkernel for a Xen PV domain\n");
+ return;
}
- printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
- "for crashkernel (System RAM: %ldMB)\n",
- (unsigned long)(crash_size >> 20),
- (unsigned long)(crash_base >> 20),
- (unsigned long)(total_mem >> 20));
-
- crashk_res.start = crash_base;
- crashk_res.end = crash_base + crash_size - 1;
- insert_resource(&iomem_resource, &crashk_res);
+ reserve_crashkernel_generic(crash_size, crash_base, low_size, high);
+ reserve_crashkernel_cma(cma_size);
}
-#else
-static void __init reserve_crashkernel(void)
-{
-}
-#endif
static struct resource standard_io_resources[] = {
{ .name = "dma1", .start = 0x00, .end = 0x1f,
@@ -605,7 +648,7 @@ static struct resource standard_io_resources[] = {
.flags = IORESOURCE_BUSY | IORESOURCE_IO }
};
-static void __init reserve_standard_io_resources(void)
+void __init reserve_standard_io_resources(void)
{
int i;
@@ -615,79 +658,211 @@ static void __init reserve_standard_io_resources(void)
}
-/*
- * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
- * is_kdump_kernel() to determine if we are booting after a panic. Hence
- * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
- */
+static void __init setup_kernel_resources(void)
+{
+ code_resource.start = __pa_symbol(_text);
+ code_resource.end = __pa_symbol(_etext)-1;
+ rodata_resource.start = __pa_symbol(__start_rodata);
+ rodata_resource.end = __pa_symbol(__end_rodata)-1;
+ data_resource.start = __pa_symbol(_sdata);
+ data_resource.end = __pa_symbol(_edata)-1;
+ bss_resource.start = __pa_symbol(__bss_start);
+ bss_resource.end = __pa_symbol(__bss_stop)-1;
+
+ insert_resource(&iomem_resource, &code_resource);
+ insert_resource(&iomem_resource, &rodata_resource);
+ insert_resource(&iomem_resource, &data_resource);
+ insert_resource(&iomem_resource, &bss_resource);
+}
+
+static bool __init snb_gfx_workaround_needed(void)
+{
+#ifdef CONFIG_PCI
+ int i;
+ u16 vendor, devid;
+ static const __initconst u16 snb_ids[] = {
+ 0x0102,
+ 0x0112,
+ 0x0122,
+ 0x0106,
+ 0x0116,
+ 0x0126,
+ 0x010a,
+ };
+
+ /* Assume no if something weird is going on with PCI */
+ if (!early_pci_allowed())
+ return false;
+
+ vendor = read_pci_config_16(0, 2, 0, PCI_VENDOR_ID);
+ if (vendor != 0x8086)
+ return false;
+
+ devid = read_pci_config_16(0, 2, 0, PCI_DEVICE_ID);
+ for (i = 0; i < ARRAY_SIZE(snb_ids); i++)
+ if (devid == snb_ids[i])
+ return true;
+#endif
+
+ return false;
+}
-#ifdef CONFIG_CRASH_DUMP
-/* elfcorehdr= specifies the location of elf core header
- * stored by the crashed kernel. This option will be passed
- * by kexec loader to the capture kernel.
+/*
+ * Sandy Bridge graphics has trouble with certain ranges, exclude
+ * them from allocation.
*/
-static int __init setup_elfcorehdr(char *arg)
+static void __init trim_snb_memory(void)
{
- char *end;
- if (!arg)
- return -EINVAL;
- elfcorehdr_addr = memparse(arg, &end);
- return end > arg ? 0 : -EINVAL;
+ static const __initconst unsigned long bad_pages[] = {
+ 0x20050000,
+ 0x20110000,
+ 0x20130000,
+ 0x20138000,
+ 0x40004000,
+ };
+ int i;
+
+ if (!snb_gfx_workaround_needed())
+ return;
+
+ printk(KERN_DEBUG "reserving inaccessible SNB gfx pages\n");
+
+ /*
+ * SandyBridge integrated graphics devices have a bug that prevents
+ * them from accessing certain memory ranges, namely anything below
+ * 1M and in the pages listed in bad_pages[] above.
+ *
+	 * To avoid these pages ever being accessed by SNB gfx devices,
+	 * reserve the bad_pages that have not already been reserved at
+	 * boot time. All memory below the 1 MB mark is reserved later
+	 * during setup_arch() anyway, so there is no need to reserve it here.
+ */
+
+ for (i = 0; i < ARRAY_SIZE(bad_pages); i++) {
+ if (memblock_reserve(bad_pages[i], PAGE_SIZE))
+ printk(KERN_WARNING "failed to reserve 0x%08lx\n",
+ bad_pages[i]);
+ }
}
-early_param("elfcorehdr", setup_elfcorehdr);
-#endif
-static struct x86_quirks default_x86_quirks __initdata;
+static void __init trim_bios_range(void)
+{
+ /*
+ * A special case is the first 4Kb of memory;
+ * This is a BIOS owned area, not kernel ram, but generally
+ * not listed as such in the E820 table.
+ *
+ * This typically reserves additional memory (64KiB by default)
+ * since some BIOSes are known to corrupt low memory. See the
+ * Kconfig help text for X86_RESERVE_LOW.
+ */
+ e820__range_update(0, PAGE_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
+
+ /*
+	 * Special case: some BIOSes report the PC BIOS
+	 * area (640 KiB -> 1 MiB) as RAM even though it is not.
+	 * Take it out.
+ */
+ e820__range_remove(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_TYPE_RAM, 1);
-struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
+ e820__update_table(e820_table);
+}
-#ifdef CONFIG_X86_RESERVE_LOW_64K
-static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
+/* called before trim_bios_range() to spare extra sanitize */
+static void __init e820_add_kernel_range(void)
{
- printk(KERN_NOTICE
- "%s detected: BIOS may corrupt low RAM, working around it.\n",
- d->ident);
+ u64 start = __pa_symbol(_text);
+ u64 size = __pa_symbol(_end) - start;
- e820_update_range(0, 0x10000, E820_RAM, E820_RESERVED);
- sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+ /*
+	 * Complain if .text, .data and .bss are not marked as E820_TYPE_RAM and
+	 * attempt to fix it by adding the range. We may have a confused BIOS,
+	 * or the user may have used memmap=exactmap or memmap=xxM$yyM to
+	 * exclude the kernel range. If we really are running on top of non-RAM,
+	 * we will crash later anyway.
+ */
+ if (e820__mapped_all(start, start + size, E820_TYPE_RAM))
+ return;
- return 0;
+ pr_warn(".text .data .bss are not marked as E820_TYPE_RAM!\n");
+ e820__range_remove(start, size, E820_TYPE_RAM, 0);
+ e820__range_add(start, size, E820_TYPE_RAM);
}
-#endif
-/* List of systems that have known low memory corruption BIOS problems */
-static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
-#ifdef CONFIG_X86_RESERVE_LOW_64K
- {
- .callback = dmi_low_memory_corruption,
- .ident = "AMI BIOS",
- .matches = {
- DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
- },
- },
- {
- .callback = dmi_low_memory_corruption,
- .ident = "Phoenix BIOS",
- .matches = {
- DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"),
- },
- },
- {
+static void __init early_reserve_memory(void)
+{
/*
- * AMI BIOS with low memory corruption was found on Intel DG45ID board.
- * It hase different DMI_BIOS_VENDOR = "Intel Corp.", for now we will
- * match only DMI_BOARD_NAME and see if there is more bad products
- * with this vendor.
+ * Reserve the memory occupied by the kernel between _text and
+ * __end_of_kernel_reserve symbols. Any kernel sections after the
+ * __end_of_kernel_reserve symbol must be explicitly reserved with a
+ * separate memblock_reserve() or they will be discarded.
*/
- .callback = dmi_low_memory_corruption,
- .ident = "AMI BIOS",
- .matches = {
- DMI_MATCH(DMI_BOARD_NAME, "DG45ID"),
- },
- },
+ memblock_reserve_kern(__pa_symbol(_text),
+ (unsigned long)__end_of_kernel_reserve - (unsigned long)_text);
+
+ /*
+ * The first 4Kb of memory is a BIOS owned area, but generally it is
+ * not listed as such in the E820 table.
+ *
+ * Reserve the first 64K of memory since some BIOSes are known to
+ * corrupt low memory. After the real mode trampoline is allocated the
+ * rest of the memory below 640k is reserved.
+ *
+ * In addition, make sure page 0 is always reserved because on
+ * systems with L1TF its contents can be leaked to user processes.
+ */
+ memblock_reserve(0, SZ_64K);
+
+ early_reserve_initrd();
+
+ memblock_x86_reserve_range_setup_data();
+
+ reserve_bios_regions();
+ trim_snb_memory();
+}
+
+/*
+ * Dump out kernel offset information on panic.
+ */
+static int
+dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
+{
+ if (kaslr_enabled()) {
+ pr_emerg("Kernel Offset: 0x%lx from 0x%lx (relocation range: 0x%lx-0x%lx)\n",
+ kaslr_offset(),
+ __START_KERNEL,
+ __START_KERNEL_map,
+ MODULES_VADDR-1);
+ } else {
+ pr_emerg("Kernel Offset: disabled\n");
+ }
+
+ return 0;
+}
+
+void x86_configure_nx(void)
+{
+ if (boot_cpu_has(X86_FEATURE_NX))
+ __supported_pte_mask |= _PAGE_NX;
+ else
+ __supported_pte_mask &= ~_PAGE_NX;
+}
+
+static void __init x86_report_nx(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_NX)) {
+ printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
+ "missing in CPU!\n");
+ } else {
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+ printk(KERN_INFO "NX (Execute Disable) protection: active\n");
+#else
+ /* 32bit non-PAE kernel, NX cannot be used */
+ printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
+ "cannot be enabled: non-PAE kernel!\n");
#endif
- {}
-};
+ }
+}
/*
* Determine if we were loaded by an EFI loader. If so, then we have also been
@@ -706,152 +881,145 @@ void __init setup_arch(char **cmdline_p)
{
#ifdef CONFIG_X86_32
memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
- visws_early_detect();
-#else
- printk(KERN_INFO "Command line: %s\n", boot_command_line);
-#endif
-
- /* VMI may relocate the fixmap; do this before touching ioremap area */
- vmi_init();
-
- early_cpu_init();
- early_ioremap_init();
- ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
- screen_info = boot_params.screen_info;
- edid_info = boot_params.edid_info;
-#ifdef CONFIG_X86_32
- apm_info.bios = boot_params.apm_bios_info;
- ist_info = boot_params.ist_info;
- if (boot_params.sys_desc_table.length != 0) {
- set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
- machine_id = boot_params.sys_desc_table.table[0];
- machine_submodel_id = boot_params.sys_desc_table.table[1];
- BIOS_revision = boot_params.sys_desc_table.table[2];
- }
-#endif
- saved_video_mode = boot_params.hdr.vid_mode;
- bootloader_type = boot_params.hdr.type_of_loader;
- if ((bootloader_type >> 4) == 0xe) {
- bootloader_type &= 0xf;
- bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4;
- }
- bootloader_version = bootloader_type & 0xf;
- bootloader_version |= boot_params.hdr.ext_loader_ver << 4;
+ /*
+ * copy kernel address range established so far and switch
+ * to the proper swapper page table
+ */
+ clone_pgd_range(swapper_pg_dir + KERNEL_PGD_BOUNDARY,
+ initial_page_table + KERNEL_PGD_BOUNDARY,
+ KERNEL_PGD_PTRS);
-#ifdef CONFIG_BLK_DEV_RAM
- rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
- rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
- rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
-#endif
-#ifdef CONFIG_EFI
- if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
-#ifdef CONFIG_X86_32
- "EL32",
+ load_cr3(swapper_pg_dir);
+ /*
+ * Note: Quark X1000 CPUs advertise PGE incorrectly and require
+ * a cr3 based tlb flush, so the following __flush_tlb_all()
+ * will not flush anything because the CPU quirk which clears
+ * X86_FEATURE_PGE has not been invoked yet. Though due to the
+ * load_cr3() above the TLB has been flushed already. The
+ * quirk is invoked before subsequent calls to __flush_tlb_all()
+ * so proper operation is guaranteed.
+ */
+ __flush_tlb_all();
#else
- "EL64",
-#endif
- 4)) {
- efi_enabled = 1;
- efi_reserve_early();
- }
+ printk(KERN_INFO "Command line: %s\n", boot_command_line);
+ boot_cpu_data.x86_phys_bits = MAX_PHYSMEM_BITS;
#endif
- ARCH_SETUP
-
- setup_memory_map();
- parse_setup_data();
- /* update the e820_saved too */
- e820_reserve_setup_data();
-
- copy_edd();
-
- if (!boot_params.hdr.root_flags)
- root_mountflags &= ~MS_RDONLY;
- init_mm.start_code = (unsigned long) _text;
- init_mm.end_code = (unsigned long) _etext;
- init_mm.end_data = (unsigned long) _edata;
- init_mm.brk = _brk_end;
-
- code_resource.start = virt_to_phys(_text);
- code_resource.end = virt_to_phys(_etext)-1;
- data_resource.start = virt_to_phys(_etext);
- data_resource.end = virt_to_phys(_edata)-1;
- bss_resource.start = virt_to_phys(&__bss_start);
- bss_resource.end = virt_to_phys(&__bss_stop)-1;
-
#ifdef CONFIG_CMDLINE_BOOL
#ifdef CONFIG_CMDLINE_OVERRIDE
- strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
+ strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
#else
if (builtin_cmdline[0]) {
/* append boot loader cmdline to builtin */
strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
- strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
+ strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
}
#endif
+ builtin_cmdline_added = true;
#endif
- strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
+ strscpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
*cmdline_p = command_line;
+ /*
+ * If we have OLPC OFW, we might end up relocating the fixmap due to
+ * reserve_top(), so do this before touching the ioremap area.
+ */
+ olpc_ofw_detect();
+
+ idt_setup_early_traps();
+ early_cpu_init();
+ jump_label_init();
+ static_call_init();
+ early_ioremap_init();
+
+ setup_olpc_ofw_pgd();
+
+ parse_boot_params();
+
+ x86_init.oem.arch_setup();
+
+ /*
+ * Do some memory reservations *before* memory is added to memblock, so
+ * memblock allocations won't overwrite it.
+ *
+ * After this point, everything still needed from the boot loader or
+ * firmware or kernel text should be early reserved or marked not RAM in
+	 * e820. All other memory is fair game.
+	 *
+	 * This call needs to happen before e820__memory_setup(), which calls
+	 * xen_memory_setup() on Xen dom0, which in turn relies on those early
+	 * reservations having happened already.
+ */
+ early_reserve_memory();
+
+ iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1;
+ e820__memory_setup();
+ parse_setup_data();
+
+ copy_edd();
+
+ setup_initial_init_mm(_text, _etext, _edata, (void *)_brk_end);
+
+ /*
+ * x86_configure_nx() is called before parse_early_param() to detect
+ * whether hardware doesn't support NX (so that the early EHCI debug
+ * console setup can safely call set_fixmap()).
+ */
+ x86_configure_nx();
+
parse_early_param();
-#ifdef CONFIG_X86_64
- check_efer();
-#endif
+ if (efi_enabled(EFI_BOOT))
+ efi_memblock_x86_reserve_range();
- /* Must be before kernel pagetables are setup */
- vmi_activate();
+ x86_report_nx();
- /* after early param, so could get panic from serial */
- reserve_early_setup_data();
+ apic_setup_apic_calls();
if (acpi_mps_check()) {
#ifdef CONFIG_X86_LOCAL_APIC
- disable_apic = 1;
+ apic_is_disabled = true;
#endif
setup_clear_cpu_cap(X86_FEATURE_APIC);
}
-#ifdef CONFIG_PCI
- if (pci_early_dump_regs)
- early_dump_pci_devices();
-#endif
-
- finish_e820_parsing();
+ e820__finish_early_params();
- if (efi_enabled)
+ if (efi_enabled(EFI_BOOT))
efi_init();
- dmi_scan_machine();
-
- dmi_check_system(bad_bios_dmi_table);
+ reserve_ibft_region();
+ x86_init.resources.dmi_setup();
/*
* VMware detection requires dmi to be available, so this
- * needs to be done after dmi_scan_machine, for the BP.
+ * needs to be done after dmi_setup(), for the boot CPU.
+ * For some guest types (Xen PV, SEV-SNP, TDX) it is required to be
+ * called before cache_bp_init() for setting up MTRR state.
*/
- init_hypervisor(&boot_cpu_data);
-
-#ifdef CONFIG_X86_32
- probe_roms();
-#endif
+ init_hypervisor_platform();
- /* after parse_early_param, so could debug it */
- insert_resource(&iomem_resource, &code_resource);
- insert_resource(&iomem_resource, &data_resource);
- insert_resource(&iomem_resource, &bss_resource);
+ tsc_early_init();
+ x86_init.resources.probe_roms();
+ /*
+ * Add resources for kernel text and data to the iomem_resource.
+ * Do it after parse_early_param, so it can be debugged.
+ */
+ setup_kernel_resources();
+ e820_add_kernel_range();
+ trim_bios_range();
#ifdef CONFIG_X86_32
if (ppro_with_ram_bug()) {
- e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM,
- E820_RESERVED);
- sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+ e820__range_update(0x70000000ULL, 0x40000ULL, E820_TYPE_RAM,
+ E820_TYPE_RESERVED);
+ e820__update_table(e820_table);
printk(KERN_INFO "fixed physical RAM map:\n");
- e820_print_map("bad_ppro");
+ e820__print_table("bad_ppro");
}
#else
early_gart_iommu_check();
@@ -861,57 +1029,119 @@ void __init setup_arch(char **cmdline_p)
* partially used pages are not usable - thus
* we are rounding upwards:
*/
- max_pfn = e820_end_of_ram_pfn();
+ max_pfn = e820__end_of_ram_pfn();
- /* preallocate 4k for mptable mpc */
- early_reserve_e820_mpc_new();
/* update e820 for memory not covered by WB MTRRs */
- mtrr_bp_init();
+ cache_bp_init();
if (mtrr_trim_uncached_memory(max_pfn))
- max_pfn = e820_end_of_ram_pfn();
+ max_pfn = e820__end_of_ram_pfn();
+
+ max_possible_pfn = max_pfn;
+
+ /*
+ * Define random base addresses for memory sections after max_pfn is
+ * defined and before each memory section base is used.
+ */
+ kernel_randomize_memory();
#ifdef CONFIG_X86_32
/* max_low_pfn get updated here */
find_low_pfn_range();
#else
- num_physpages = max_pfn;
-
check_x2apic();
/* How many end-of-memory variables you have, grandma! */
/* need this before calling reserve_initrd */
if (max_pfn > (1UL<<(32 - PAGE_SHIFT)))
- max_low_pfn = e820_end_of_low_ram_pfn();
+ max_low_pfn = e820__end_of_low_ram_pfn();
else
max_low_pfn = max_pfn;
-
- high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
- max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
#endif
+ /* Find and reserve MPTABLE area */
+ x86_init.mpparse.find_mptable();
+
+ early_alloc_pgt_buf();
+
+ /*
+	 * Conclude the brk area before e820__memblock_setup(): it could use
+	 * memblock_find_in_range(), which could overlap with the brk area.
+ */
+ reserve_brk();
+
+ cleanup_highmap();
+
+ e820__memblock_setup();
+
+ /*
+ * Needs to run after memblock setup because it needs the physical
+ * memory size.
+ */
+ mem_encrypt_setup_arch();
+ cc_random_init();
+
+ efi_find_mirror();
+ efi_esrt_init();
+ efi_mokvar_table_init();
+
+ /*
+ * The EFI specification says that boot service code won't be
+ * called after ExitBootServices(). This is, in fact, a lie.
+ */
+ efi_reserve_boot_services();
+
+ /* preallocate 4k for mptable mpc */
+ e820__memblock_alloc_reserved_mpc_new();
+
#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
setup_bios_corruption_check();
#endif
- printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
- max_pfn_mapped<<PAGE_SHIFT);
+#ifdef CONFIG_X86_32
+ printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n",
+ (max_pfn_mapped<<PAGE_SHIFT) - 1);
+#endif
- reserve_brk();
+ /*
+	 * Find free memory for the real mode trampoline and place it there. If
+	 * there is not enough free memory under 1M, on EFI-enabled systems an
+	 * additional attempt to reclaim memory for the real mode trampoline is
+	 * made at efi_free_boot_services().
+	 *
+	 * Unconditionally reserve the entire first 1M of RAM because BIOSes
+	 * are known to corrupt low memory, and several hundred kilobytes are
+	 * not worth the complexity of detecting which memory gets clobbered.
+	 * Windows does the same thing for very similar reasons.
+	 *
+	 * Moreover, on machines with SandyBridge graphics or in setups that use
+	 * crashkernel the entire 1M is reserved anyway.
+	 *
+	 * Note that a TDX host kernel also requires the first 1MB to be reserved.
+ */
+ x86_platform.realmode_reserve();
- init_gbpages();
+ init_mem_mapping();
+
+ /*
+ * init_mem_mapping() relies on the early IDT page fault handling.
+ * Now either enable FRED or install the real page fault handler
+ * for 64-bit in the IDT.
+ */
+ cpu_init_replace_early_idt();
- /* max_pfn_mapped is updated here */
- max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
- max_pfn_mapped = max_low_pfn_mapped;
+ /*
+ * Update mmu_cr4_features (and, indirectly, trampoline_cr4_features)
+ * with the current CR4 value. This may not be necessary, but
+ * auditing all the early-boot CR4 manipulation would be needed to
+ * rule it out.
+ *
+ * Mask off features that don't work outside long mode (just
+ * PCIDE for now).
+ */
+ mmu_cr4_features = __read_cr4() & ~X86_CR4_PCIDE;
-#ifdef CONFIG_X86_64
- if (max_pfn > max_low_pfn) {
- max_pfn_mapped = init_memory_mapping(1UL<<32,
- max_pfn<<PAGE_SHIFT);
- /* can we preseve max_low_pfn ?*/
- max_low_pfn = max_pfn;
- }
-#endif
+ memblock_set_current_limit(get_max_mapped());
/*
* NOTE: On x86-32, only from this point on, fixmaps are ready for use.
@@ -921,187 +1151,168 @@ void __init setup_arch(char **cmdline_p)
if (init_ohci1394_dma_early)
init_ohci1394_dma_on_all_controllers();
#endif
+ /* Allocate bigger log buffer */
+ setup_log_buf(1);
+
+ if (efi_enabled(EFI_BOOT)) {
+ switch (boot_params.secure_boot) {
+ case efi_secureboot_mode_disabled:
+ pr_info("Secure boot disabled\n");
+ break;
+ case efi_secureboot_mode_enabled:
+ pr_info("Secure boot enabled\n");
+ break;
+ default:
+ pr_info("Secure boot could not be determined\n");
+ break;
+ }
+ }
reserve_initrd();
+ acpi_table_upgrade();
+ /* Look for ACPI tables and reserve memory occupied by them. */
+ acpi_boot_table_init();
+
vsmp_init();
io_delay_init();
- /*
- * Parse the ACPI tables for possible boot-time SMP configuration.
- */
- acpi_boot_table_init();
+ early_platform_quirks();
+ /* Some platforms need the APIC registered for NUMA configuration */
early_acpi_boot_init();
+ x86_init.mpparse.early_parse_smp_cfg();
-#ifdef CONFIG_ACPI_NUMA
- /*
- * Parse SRAT to discover nodes.
- */
- acpi_numa_init();
-#endif
+ x86_flattree_get_config();
- initmem_init(0, max_pfn);
+ initmem_init();
+ dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT);
+
+ if (boot_cpu_has(X86_FEATURE_GBPAGES)) {
+ hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT);
+ hugetlb_bootmem_alloc();
+ }
-#ifdef CONFIG_ACPI_SLEEP
- /*
- * Reserve low memory region for sleep support.
- */
- acpi_reserve_bootmem();
-#endif
/*
- * Find and reserve possible boot-time SMP configuration:
+ * Reserve memory for crash kernel after SRAT is parsed so that it
+ * won't consume hotpluggable memory.
*/
- find_smp_config();
+ arch_reserve_crashkernel();
- reserve_crashkernel();
+ if (!early_xdbc_setup_hardware())
+ early_xdbc_register_console();
-#ifdef CONFIG_X86_64
- /*
- * dma32_reserve_bootmem() allocates bootmem which may conflict
- * with the crashkernel command line, so do that after
- * reserve_crashkernel()
- */
- dma32_reserve_bootmem();
-#endif
+ x86_init.paging.pagetable_init();
- reserve_ibft_region();
+ kasan_init();
-#ifdef CONFIG_KVM_CLOCK
- kvmclock_init();
-#endif
+ /*
+ * Sync back kernel address range.
+ *
+ * FIXME: Can the later sync in setup_cpu_entry_areas() replace
+ * this call?
+ */
+ sync_initial_page_table();
- paravirt_pagetable_setup_start(swapper_pg_dir);
- paging_init();
- paravirt_pagetable_setup_done(swapper_pg_dir);
- paravirt_post_allocator_init();
+ tboot_probe();
-#ifdef CONFIG_X86_64
map_vsyscall();
-#endif
- generic_apic_probe();
+ x86_32_probe_apic();
early_quirks();
+ topology_apply_cmdline_limits_early();
+
/*
- * Read APIC and some other early information from ACPI tables.
+ * Parse SMP configuration. Try ACPI first and then the platform
+ * specific parser.
*/
acpi_boot_init();
+ x86_init.mpparse.parse_smp_cfg();
-#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS)
- /*
- * get boot-time SMP configuration:
- */
- if (smp_found_config)
- get_smp_config();
-#endif
+ /* Last opportunity to detect and map the local APIC */
+ init_apic_mappings();
- prefill_possible_map();
+ topology_init_possible_cpus();
-#ifdef CONFIG_X86_64
init_cpu_to_node();
-#endif
-
- init_apic_mappings();
- ioapic_init_mappings();
+ init_gi_nodes();
- /* need to wait for io_apic is mapped */
- probe_nr_irqs_gsi();
+ io_apic_init_mappings();
- kvm_guest_init();
+ x86_init.hyper.guest_late_init();
- e820_reserve_resources();
- e820_mark_nosave_regions(max_low_pfn);
+ e820__reserve_resources();
+ e820__register_nosave_regions(max_pfn);
-#ifdef CONFIG_X86_32
- request_resource(&iomem_resource, &video_ram_resource);
-#endif
- reserve_standard_io_resources();
+ x86_init.resources.reserve_resources();
- e820_setup_gap();
+ e820__setup_pci_gap();
#ifdef CONFIG_VT
#if defined(CONFIG_VGA_CONSOLE)
- if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
- conswitchp = &vga_con;
-#elif defined(CONFIG_DUMMY_CONSOLE)
- conswitchp = &dummy_con;
+ if (!efi_enabled(EFI_BOOT) || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
+ vgacon_register_screen(&screen_info);
+#endif
#endif
+ x86_init.oem.banner();
+
+ x86_init.timers.wallclock_init();
+
+ /*
+ * This needs to run before setup_local_APIC() which soft-disables the
+ * local APIC temporarily and that masks the thermal LVT interrupt,
+ * leading to softlockups on machines which have configured SMI
+ * interrupt delivery.
+ */
+ therm_lvt_init();
+
+ mcheck_init();
+
+ register_refined_jiffies(CLOCK_TICK_RATE);
+
+#ifdef CONFIG_EFI
+ if (efi_enabled(EFI_BOOT))
+ efi_apply_memmap_quirks();
#endif
+
+ unwind_init();
}
#ifdef CONFIG_X86_32
-/**
- * x86_quirk_intr_init - post gate setup interrupt initialisation
- *
- * Description:
- * Fill in any interrupts that may have been left out by the general
- * init_IRQ() routine. interrupts having to do with the machine rather
- * than the devices on the I/O bus (like APIC interrupts in intel MP
- * systems) are started here.
- **/
-void __init x86_quirk_intr_init(void)
-{
- if (x86_quirks->arch_intr_init) {
- if (x86_quirks->arch_intr_init())
- return;
- }
-}
+static struct resource video_ram_resource = {
+ .name = "Video RAM area",
+ .start = 0xa0000,
+ .end = 0xbffff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
-/**
- * x86_quirk_trap_init - initialise system specific traps
- *
- * Description:
- * Called as the final act of trap_init(). Used in VISWS to initialise
- * the various board specific APIC traps.
- **/
-void __init x86_quirk_trap_init(void)
+void __init i386_reserve_resources(void)
{
- if (x86_quirks->arch_trap_init) {
- if (x86_quirks->arch_trap_init())
- return;
- }
+ request_resource(&iomem_resource, &video_ram_resource);
+ reserve_standard_io_resources();
}
-static struct irqaction irq0 = {
- .handler = timer_interrupt,
- .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER,
- .name = "timer"
+#endif /* CONFIG_X86_32 */
+
+static struct notifier_block kernel_offset_notifier = {
+ .notifier_call = dump_kernel_offset
};
-/**
- * x86_quirk_pre_time_init - do any specific initialisations before.
- *
- **/
-void __init x86_quirk_pre_time_init(void)
+static int __init register_kernel_offset_dumper(void)
{
- if (x86_quirks->arch_pre_time_init)
- x86_quirks->arch_pre_time_init();
+ atomic_notifier_chain_register(&panic_notifier_list,
+ &kernel_offset_notifier);
+ return 0;
}
+__initcall(register_kernel_offset_dumper);
-/**
- * x86_quirk_time_init - do any specific initialisations for the system timer.
- *
- * Description:
- * Must plug the system timer interrupt source at HZ into the IRQ listed
- * in irq_vectors.h:TIMER_IRQ
- **/
-void __init x86_quirk_time_init(void)
+#ifdef CONFIG_HOTPLUG_CPU
+bool arch_cpu_is_hotpluggable(int cpu)
{
- if (x86_quirks->arch_time_init) {
- /*
- * A nonzero return code does not mean failure, it means
- * that the architecture quirk does not want any
- * generic (timer) setup to be performed after this:
- */
- if (x86_quirks->arch_time_init())
- return;
- }
-
- irq0.mask = cpumask_of_cpu(0);
- setup_irq(0, &irq0);
+ return cpu > 0;
}
-#endif /* CONFIG_X86_32 */
+#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 29a3eef7cf4a..bfa48e7a32a2 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -1,15 +1,20 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/init.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/percpu.h>
#include <linux/kexec.h>
#include <linux/crash_dump.h>
#include <linux/smp.h>
#include <linux/topology.h>
#include <linux/pfn.h>
+#include <linux/stackprotector.h>
#include <asm/sections.h>
#include <asm/processor.h>
+#include <asm/desc.h>
#include <asm/setup.h>
#include <asm/mpspec.h>
#include <asm/apicdef.h>
@@ -17,29 +22,14 @@
#include <asm/proto.h>
#include <asm/cpumask.h>
#include <asm/cpu.h>
-#include <asm/stackprotector.h>
-#ifdef CONFIG_DEBUG_PER_CPU_MAPS
-# define DBG(x...) printk(KERN_DEBUG x)
-#else
-# define DBG(x...)
-#endif
-
-DEFINE_PER_CPU(int, cpu_number);
+DEFINE_PER_CPU_CACHE_HOT(int, cpu_number);
EXPORT_PER_CPU_SYMBOL(cpu_number);
-#ifdef CONFIG_X86_64
-#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load)
-#else
-#define BOOT_PERCPU_OFFSET 0
-#endif
-
-DEFINE_PER_CPU(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET;
+DEFINE_PER_CPU_CACHE_HOT(unsigned long, this_cpu_off);
EXPORT_PER_CPU_SYMBOL(this_cpu_off);
-unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
- [0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET,
-};
+unsigned long __per_cpu_offset[NR_CPUS] __ro_after_init;
EXPORT_SYMBOL(__per_cpu_offset);
/*
@@ -55,6 +45,7 @@ EXPORT_SYMBOL(__per_cpu_offset);
#define PERCPU_FIRST_CHUNK_RESERVE 0
#endif
+#ifdef CONFIG_X86_32
/**
* pcpu_need_numa - determine percpu allocation needs to consider NUMA
*
@@ -67,7 +58,7 @@ EXPORT_SYMBOL(__per_cpu_offset);
*/
static bool __init pcpu_need_numa(void)
{
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
pg_data_t *last = NULL;
unsigned int cpu;
@@ -83,414 +74,98 @@ static bool __init pcpu_need_numa(void)
#endif
return false;
}
-
-/**
- * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
- * @cpu: cpu to allocate for
- * @size: size allocation in bytes
- * @align: alignment
- *
- * Allocate @size bytes aligned at @align for cpu @cpu. This wrapper
- * does the right thing for NUMA regardless of the current
- * configuration.
- *
- * RETURNS:
- * Pointer to the allocated area on success, NULL on failure.
- */
-static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
- unsigned long align)
-{
- const unsigned long goal = __pa(MAX_DMA_ADDRESS);
-#ifdef CONFIG_NEED_MULTIPLE_NODES
- int node = early_cpu_to_node(cpu);
- void *ptr;
-
- if (!node_online(node) || !NODE_DATA(node)) {
- ptr = __alloc_bootmem_nopanic(size, align, goal);
- pr_info("cpu %d has no node %d or node-local memory\n",
- cpu, node);
- pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
- cpu, size, __pa(ptr));
- } else {
- ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
- size, align, goal);
- pr_debug("per cpu data for cpu%d %lu bytes on node%d at "
- "%016lx\n", cpu, size, node, __pa(ptr));
- }
- return ptr;
-#else
- return __alloc_bootmem_nopanic(size, align, goal);
#endif
-}
-
-/*
- * Large page remap allocator
- *
- * This allocator uses PMD page as unit. A PMD page is allocated for
- * each cpu and each is remapped into vmalloc area using PMD mapping.
- * As PMD page is quite large, only part of it is used for the first
- * chunk. Unused part is returned to the bootmem allocator.
- *
- * So, the PMD pages are mapped twice - once to the physical mapping
- * and to the vmalloc area for the first percpu chunk. The double
- * mapping does add one more PMD TLB entry pressure but still is much
- * better than only using 4k mappings while still being NUMA friendly.
- */
-#ifdef CONFIG_NEED_MULTIPLE_NODES
-struct pcpul_ent {
- unsigned int cpu;
- void *ptr;
-};
-
-static size_t pcpul_size;
-static struct pcpul_ent *pcpul_map;
-static struct vm_struct pcpul_vm;
-
-static struct page * __init pcpul_get_page(unsigned int cpu, int pageno)
-{
- size_t off = (size_t)pageno << PAGE_SHIFT;
- if (off >= pcpul_size)
- return NULL;
-
- return virt_to_page(pcpul_map[cpu].ptr + off);
-}
-
-static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
-{
- size_t map_size, dyn_size;
- unsigned int cpu;
- int i, j;
- ssize_t ret;
-
- if (!chosen) {
- size_t vm_size = VMALLOC_END - VMALLOC_START;
- size_t tot_size = num_possible_cpus() * PMD_SIZE;
-
- /* on non-NUMA, embedding is better */
- if (!pcpu_need_numa())
- return -EINVAL;
-
- /* don't consume more than 20% of vmalloc area */
- if (tot_size > vm_size / 5) {
- pr_info("PERCPU: too large chunk size %zuMB for "
- "large page remap\n", tot_size >> 20);
- return -EINVAL;
- }
- }
-
- /* need PSE */
- if (!cpu_has_pse) {
- pr_warning("PERCPU: lpage allocator requires PSE\n");
- return -EINVAL;
- }
-
- /*
- * Currently supports only single page. Supporting multiple
- * pages won't be too difficult if it ever becomes necessary.
- */
- pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
- PERCPU_DYNAMIC_RESERVE);
- if (pcpul_size > PMD_SIZE) {
- pr_warning("PERCPU: static data is larger than large page, "
- "can't use large page\n");
- return -EINVAL;
- }
- dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
-
- /* allocate pointer array and alloc large pages */
- map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0]));
- pcpul_map = alloc_bootmem(map_size);
-
- for_each_possible_cpu(cpu) {
- pcpul_map[cpu].cpu = cpu;
- pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE,
- PMD_SIZE);
- if (!pcpul_map[cpu].ptr) {
- pr_warning("PERCPU: failed to allocate large page "
- "for cpu%u\n", cpu);
- goto enomem;
- }
-
- /*
- * Only use pcpul_size bytes and give back the rest.
- *
- * Ingo: The 2MB up-rounding bootmem is needed to make
- * sure the partial 2MB page is still fully RAM - it's
- * not well-specified to have a PAT-incompatible area
- * (unmapped RAM, device memory, etc.) in that hole.
- */
- free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size),
- PMD_SIZE - pcpul_size);
-
- memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size);
- }
-
- /* allocate address and map */
- pcpul_vm.flags = VM_ALLOC;
- pcpul_vm.size = num_possible_cpus() * PMD_SIZE;
- vm_area_register_early(&pcpul_vm, PMD_SIZE);
-
- for_each_possible_cpu(cpu) {
- pmd_t *pmd, pmd_v;
-
- pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr +
- cpu * PMD_SIZE);
- pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)),
- PAGE_KERNEL_LARGE);
- set_pmd(pmd, pmd_v);
- }
-
- /* we're ready, commit */
- pr_info("PERCPU: Remapped at %p with large pages, static data "
- "%zu bytes\n", pcpul_vm.addr, static_size);
-
- ret = pcpu_setup_first_chunk(pcpul_get_page, static_size,
- PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
- PMD_SIZE, pcpul_vm.addr, NULL);
-
- /* sort pcpul_map array for pcpu_lpage_remapped() */
- for (i = 0; i < num_possible_cpus() - 1; i++)
- for (j = i + 1; j < num_possible_cpus(); j++)
- if (pcpul_map[i].ptr > pcpul_map[j].ptr) {
- struct pcpul_ent tmp = pcpul_map[i];
- pcpul_map[i] = pcpul_map[j];
- pcpul_map[j] = tmp;
- }
-
- return ret;
-
-enomem:
- for_each_possible_cpu(cpu)
- if (pcpul_map[cpu].ptr)
- free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size);
- free_bootmem(__pa(pcpul_map), map_size);
- return -ENOMEM;
-}
-
-/**
- * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
- * @kaddr: the kernel address in question
- *
- * Determine whether @kaddr falls in the pcpul recycled area. This is
- * used by pageattr to detect VM aliases and break up the pcpu PMD
- * mapping such that the same physical page is not mapped under
- * different attributes.
- *
- * The recycled area is always at the tail of a partially used PMD
- * page.
- *
- * RETURNS:
- * Address of corresponding remapped pcpu address if match is found;
- * otherwise, NULL.
- */
-void *pcpu_lpage_remapped(void *kaddr)
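+/*
+ * CPU distance callback passed to pcpu_embed_first_chunk():
+ * LOCAL_DISTANCE within the same NUMA node, REMOTE_DISTANCE otherwise.
+ */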
+static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
{
- void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK);
- unsigned long offset = (unsigned long)kaddr & ~PMD_MASK;
- int left = 0, right = num_possible_cpus() - 1;
- int pos;
-
- /* pcpul in use at all? */
- if (!pcpul_map)
- return NULL;
-
- /* okay, perform binary search */
- while (left <= right) {
- pos = (left + right) / 2;
-
- if (pcpul_map[pos].ptr < pmd_addr)
- left = pos + 1;
- else if (pcpul_map[pos].ptr > pmd_addr)
- right = pos - 1;
- else {
- /* it shouldn't be in the area for the first chunk */
- WARN_ON(offset < pcpul_size);
-
- return pcpul_vm.addr +
- pcpul_map[pos].cpu * PMD_SIZE + offset;
- }
- }
-
- return NULL;
-}
+#ifdef CONFIG_NUMA
+ if (early_cpu_to_node(from) == early_cpu_to_node(to))
+ return LOCAL_DISTANCE;
+ else
+ return REMOTE_DISTANCE;
#else
-static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
-{
- return -EINVAL;
-}
+ return LOCAL_DISTANCE;
#endif
-
-/*
- * Embedding allocator
- *
- * The first chunk is sized to just contain the static area plus
- * module and dynamic reserves and embedded into linear physical
- * mapping so that it can use PMD mapping without additional TLB
- * pressure.
- */
-static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen)
-{
- size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
-
- /*
- * If large page isn't supported, there's no benefit in doing
- * this. Also, embedding allocation doesn't play well with
- * NUMA.
- */
- if (!chosen && (!cpu_has_pse || pcpu_need_numa()))
- return -EINVAL;
-
- return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
- reserve - PERCPU_FIRST_CHUNK_RESERVE, -1);
}
-/*
- * 4k page allocator
- *
- * This is the basic allocator. Static percpu area is allocated
- * page-by-page and most of initialization is done by the generic
- * setup function.
- */
-static struct page **pcpu4k_pages __initdata;
-static int pcpu4k_nr_static_pages __initdata;
-
-static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
+static int __init pcpu_cpu_to_node(int cpu)
{
- if (pageno < pcpu4k_nr_static_pages)
- return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno];
- return NULL;
+ return early_cpu_to_node(cpu);
}
-static void __init pcpu4k_populate_pte(unsigned long addr)
+void __init pcpu_populate_pte(unsigned long addr)
{
populate_extra_pte(addr);
}
-static ssize_t __init setup_pcpu_4k(size_t static_size)
-{
- size_t pages_size;
- unsigned int cpu;
- int i, j;
- ssize_t ret;
-
- pcpu4k_nr_static_pages = PFN_UP(static_size);
-
- /* unaligned allocations can't be freed, round up to page size */
- pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus()
- * sizeof(pcpu4k_pages[0]));
- pcpu4k_pages = alloc_bootmem(pages_size);
-
- /* allocate and copy */
- j = 0;
- for_each_possible_cpu(cpu)
- for (i = 0; i < pcpu4k_nr_static_pages; i++) {
- void *ptr;
-
- ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
- if (!ptr) {
- pr_warning("PERCPU: failed to allocate "
- "4k page for cpu%u\n", cpu);
- goto enomem;
- }
-
- memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
- pcpu4k_pages[j++] = virt_to_page(ptr);
- }
-
- /* we're ready, commit */
- pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
- pcpu4k_nr_static_pages, static_size);
-
- ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
- PERCPU_FIRST_CHUNK_RESERVE, -1,
- -1, NULL, pcpu4k_populate_pte);
- goto out_free_ar;
-
-enomem:
- while (--j >= 0)
- free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE);
- ret = -ENOMEM;
-out_free_ar:
- free_bootmem(__pa(pcpu4k_pages), pages_size);
- return ret;
-}
-
-/* for explicit first chunk allocator selection */
-static char pcpu_chosen_alloc[16] __initdata;
-
-static int __init percpu_alloc_setup(char *str)
-{
- strncpy(pcpu_chosen_alloc, str, sizeof(pcpu_chosen_alloc) - 1);
- return 0;
-}
-early_param("percpu_alloc", percpu_alloc_setup);
-
static inline void setup_percpu_segment(int cpu)
{
#ifdef CONFIG_X86_32
- struct desc_struct gdt;
+ struct desc_struct d = GDT_ENTRY_INIT(DESC_DATA32,
+ per_cpu_offset(cpu), 0xFFFFF);
- pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF,
- 0x2 | DESCTYPE_S, 0x8);
- gdt.s = 1;
- write_gdt_entry(get_cpu_gdt_table(cpu),
- GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
+ write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PERCPU, &d, DESCTYPE_S);
#endif
}
void __init setup_per_cpu_areas(void)
{
- size_t static_size = __per_cpu_end - __per_cpu_start;
unsigned int cpu;
unsigned long delta;
- size_t pcpu_unit_size;
- ssize_t ret;
+ int rc;
- pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
+ pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%u nr_node_ids:%u\n",
NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
/*
- * Allocate percpu area. If PSE is supported, try to make use
- * of large page mappings. Please read comments on top of
- * each allocator for details.
+ * Allocate percpu area. Embedding allocator is our favorite;
+ * however, on NUMA configurations, it can result in very
+ * sparse unit mapping and vmalloc area isn't spacious enough
+ * on 32bit. Use page in that case.
*/
- ret = -EINVAL;
- if (strlen(pcpu_chosen_alloc)) {
- if (strcmp(pcpu_chosen_alloc, "4k")) {
- if (!strcmp(pcpu_chosen_alloc, "lpage"))
- ret = setup_pcpu_lpage(static_size, true);
- else if (!strcmp(pcpu_chosen_alloc, "embed"))
- ret = setup_pcpu_embed(static_size, true);
- else
- pr_warning("PERCPU: unknown allocator %s "
- "specified\n", pcpu_chosen_alloc);
- if (ret < 0)
- pr_warning("PERCPU: %s allocator failed (%zd), "
- "falling back to 4k\n",
- pcpu_chosen_alloc, ret);
- }
- } else {
- ret = setup_pcpu_lpage(static_size, false);
- if (ret < 0)
- ret = setup_pcpu_embed(static_size, false);
- }
- if (ret < 0)
- ret = setup_pcpu_4k(static_size);
- if (ret < 0)
- panic("cannot allocate static percpu area (%zu bytes, err=%zd)",
- static_size, ret);
+#ifdef CONFIG_X86_32
+ if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa())
+ pcpu_chosen_fc = PCPU_FC_PAGE;
+#endif
+ rc = -EINVAL;
+ if (pcpu_chosen_fc != PCPU_FC_PAGE) {
+ const size_t dyn_size = PERCPU_MODULE_RESERVE +
+ PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;
+ size_t atom_size;
- pcpu_unit_size = ret;
+ /*
+ * On 64bit, use PMD_SIZE for atom_size so that embedded
+ * percpu areas are aligned to PMD. This, in the future,
+ * can also allow using PMD mappings in vmalloc area. Use
+ * PAGE_SIZE on 32bit as vmalloc space is highly contended
+ * and large vmalloc area allocs can easily fail.
+ */
+#ifdef CONFIG_X86_64
+ atom_size = PMD_SIZE;
+#else
+ atom_size = PAGE_SIZE;
+#endif
+ rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
+ dyn_size, atom_size,
+ pcpu_cpu_distance,
+ pcpu_cpu_to_node);
+ if (rc < 0)
+ pr_warn("%s allocator failed (%d), falling back to page size\n",
+ pcpu_fc_names[pcpu_chosen_fc], rc);
+ }
+ if (rc < 0)
+ rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
+ pcpu_cpu_to_node);
+ if (rc < 0)
+ panic("cannot initialize percpu area (err=%d)", rc);
/* alrighty, percpu areas up and running */
delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
for_each_possible_cpu(cpu) {
- per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size;
+ per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu];
per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
per_cpu(cpu_number, cpu) = cpu;
setup_percpu_segment(cpu);
- setup_stack_canary_segment(cpu);
/*
* Copy data used in early init routines from the
* initial arrays to the per cpu data areas. These
@@ -501,46 +176,54 @@ void __init setup_per_cpu_areas(void)
#ifdef CONFIG_X86_LOCAL_APIC
per_cpu(x86_cpu_to_apicid, cpu) =
early_per_cpu_map(x86_cpu_to_apicid, cpu);
- per_cpu(x86_bios_cpu_apicid, cpu) =
- early_per_cpu_map(x86_bios_cpu_apicid, cpu);
+ per_cpu(x86_cpu_to_acpiid, cpu) =
+ early_per_cpu_map(x86_cpu_to_acpiid, cpu);
#endif
-#ifdef CONFIG_X86_64
- per_cpu(irq_stack_ptr, cpu) =
- per_cpu(irq_stack_union.irq_stack, cpu) +
- IRQ_STACK_SIZE - 64;
#ifdef CONFIG_NUMA
per_cpu(x86_cpu_to_node_map, cpu) =
early_per_cpu_map(x86_cpu_to_node_map, cpu);
-#endif
+ /*
+ * Ensure that the boot cpu numa_node is correct when the boot
+ * cpu is on a node that doesn't have memory installed.
+		 * Also, when MEMORY_HOTPLUG is defined, cpu_up() calls
+		 * cpu_to_node() for APs before per_cpu(numa_node) is set up
+		 * later by c_init (i.e. intel_init()/amd_init()).
+		 * So set it for all of them (boot cpu and all APs).
+ */
+ set_cpu_numa_node(cpu, early_cpu_to_node(cpu));
#endif
/*
- * Up to this point, the boot CPU has been using .data.init
+ * Up to this point, the boot CPU has been using .init.data
* area. Reload any changed state for the boot CPU.
*/
- if (cpu == boot_cpu_id)
- switch_to_new_gdt(cpu);
+ if (!cpu)
+ switch_gdt_and_percpu_base(cpu);
}
/* indicate the early static arrays will soon be gone */
#ifdef CONFIG_X86_LOCAL_APIC
early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
- early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
+ early_per_cpu_ptr(x86_cpu_to_acpiid) = NULL;
#endif
-#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
+#ifdef CONFIG_NUMA
early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
#endif
-#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
- /*
- * make sure boot cpu node_number is right, when boot cpu is on the
- * node that doesn't have mem installed
- */
- per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id);
-#endif
-
/* Setup node to cpumask map */
setup_node_to_cpumask_map();
/* Setup cpu initialized, callin, callout masks */
setup_cpu_local_masks();
+
+ /*
+ * Sync back kernel address range again. We already did this in
+ * setup_arch(), but percpu data also needs to be available in
+ * the smpboot asm and arch_sync_kernel_mappings() doesn't sync to
+ * swapper_pg_dir on 32-bit. The per-cpu mappings need to be available
+ * there too.
+ *
+ * FIXME: Can the later sync in setup_cpu_entry_areas() replace
+ * this call?
+ */
+ sync_initial_page_table();
}
diff --git a/arch/x86/kernel/sev_verify_cbit.S b/arch/x86/kernel/sev_verify_cbit.S
new file mode 100644
index 000000000000..1ab65f6c6ae7
--- /dev/null
+++ b/arch/x86/kernel/sev_verify_cbit.S
@@ -0,0 +1,89 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * sev_verify_cbit.S - Code for verification of the C-bit position reported
+ * by the Hypervisor when running with SEV enabled.
+ *
+ * Copyright (c) 2020 Joerg Roedel (jroedel@suse.de)
+ *
+ * sev_verify_cbit() is called before switching to a new long-mode page-table
+ * at boot.
+ *
+ * It verifies that the C-bit position is correct by writing a random value
+ * to an encrypted memory location while on the current page-table. Then it
+ * switches to the new page-table and verifies that the memory content is
+ * still the same. After that it switches back to the current page-table and,
+ * if the check succeeded, returns. If the check failed, the code invalidates
+ * the stack pointer and goes into a hlt loop. The stack pointer is
+ * invalidated to make sure no interrupt or exception can get the CPU out of
+ * the hlt loop.
+ *
+ * New page-table pointer is expected in %rdi (first parameter)
+ *
+ */
+SYM_FUNC_START(sev_verify_cbit)
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ /* First check if a C-bit was detected */
+ movq sme_me_mask(%rip), %rsi
+ testq %rsi, %rsi
+ jz 3f
+
+	/* sme_me_mask != 0 could mean SME or SEV - also check for SEV */
+ movq sev_status(%rip), %rsi
+ testq %rsi, %rsi
+ jz 3f
+
+ /* Save CR4 in %rsi */
+ movq %cr4, %rsi
+
+ /* Disable Global Pages */
+ movq %rsi, %rdx
+ andq $(~X86_CR4_PGE), %rdx
+ movq %rdx, %cr4
+
+ /*
+	 * Verified that we are running under SEV - now get a random value using
+ * RDRAND. This instruction is mandatory when running as an SEV guest.
+ *
+ * Don't bail out of the loop if RDRAND returns errors. It is better to
+ * prevent forward progress than to work with a non-random value here.
+ */
+1: rdrand %rdx
+ jnc 1b
+
+ /* Store value to memory and keep it in %rdx */
+ movq %rdx, sev_check_data(%rip)
+
+ /* Backup current %cr3 value to restore it later */
+ movq %cr3, %rcx
+
+ /* Switch to new %cr3 - This might unmap the stack */
+ movq %rdi, %cr3
+
+ /*
+ * Compare value in %rdx with memory location. If C-bit is incorrect
+ * this would read the encrypted data and make the check fail.
+ */
+ cmpq %rdx, sev_check_data(%rip)
+
+ /* Restore old %cr3 */
+ movq %rcx, %cr3
+
+ /* Restore previous CR4 */
+ movq %rsi, %cr4
+
+ /* Check CMPQ result */
+ je 3f
+
+ /*
+	 * The check failed. Block any forward progress to prevent ROP
+	 * attacks: invalidate the stack and go into a hlt loop.
+ */
+ xorl %esp, %esp
+ subq $0x1000, %rsp
+2: hlt
+ jmp 2b
+3:
+#endif
+ /* Return page-table pointer */
+ movq %rdi, %rax
+ RET
+SYM_FUNC_END(sev_verify_cbit)
diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
new file mode 100644
index 000000000000..978232b6d48d
--- /dev/null
+++ b/arch/x86/kernel/shstk.c
@@ -0,0 +1,635 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * shstk.c - Intel shadow stack support
+ *
+ * Copyright (c) 2021, Intel Corporation.
+ * Yu-cheng Yu <yu-cheng.yu@intel.com>
+ */
+
+#include <linux/sched.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/sched/signal.h>
+#include <linux/compat.h>
+#include <linux/sizes.h>
+#include <linux/user.h>
+#include <linux/syscalls.h>
+#include <asm/msr.h>
+#include <asm/fpu/xstate.h>
+#include <asm/fpu/types.h>
+#include <asm/shstk.h>
+#include <asm/special_insns.h>
+#include <asm/fpu/api.h>
+#include <asm/prctl.h>
+
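+/* A shadow stack entry (return address or token) is always 8 bytes. */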
+#define SS_FRAME_SIZE 8
+
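+/* Helpers for the shadow stack feature bits in current->thread.features. */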
+static bool features_enabled(unsigned long features)
+{
+ return current->thread.features & features;
+}
+
+static void features_set(unsigned long features)
+{
+ current->thread.features |= features;
+}
+
+static void features_clr(unsigned long features)
+{
+ current->thread.features &= ~features;
+}
+
+/*
+ * Create a restore token on the shadow stack. A token is always 8-byte
+ * and aligned to 8.
+ */
+static int create_rstor_token(unsigned long ssp, unsigned long *token_addr)
+{
+ unsigned long addr;
+
+ /* Token must be aligned */
+ if (!IS_ALIGNED(ssp, 8))
+ return -EINVAL;
+
+ addr = ssp - SS_FRAME_SIZE;
+
+ /*
+	 * SSP is aligned, so the reserved bits and the mode bit are zero; just mark
+ * the token 64-bit.
+ */
+ ssp |= BIT(0);
+
+ if (write_user_shstk_64((u64 __user *)addr, (u64)ssp))
+ return -EFAULT;
+
+ if (token_addr)
+ *token_addr = addr;
+
+ return 0;
+}
+
+/*
+ * VM_SHADOW_STACK will have a guard page. This helps userspace protect
+ * itself from attacks. The reasoning is as follows:
+ *
+ * The shadow stack pointer (SSP) is moved by CALL, RET, and INCSSPQ. The
+ * INCSSP instruction can increment the shadow stack pointer. It is the
+ * shadow stack analog of an instruction like:
+ *
+ * addq $0x80, %rsp
+ *
+ * However, there is one important difference between an ADD on %rsp
+ * and INCSSP. In addition to modifying SSP, INCSSP also reads from the
+ * memory of the first and last elements that were "popped". It can be
+ * thought of as acting like this:
+ *
+ * READ_ONCE(ssp); // read+discard top element on stack
+ * ssp += nr_to_pop * 8; // move the shadow stack
+ * READ_ONCE(ssp-8); // read+discard last popped stack element
+ *
+ * The maximum distance INCSSP can move the SSP is 2040 bytes, before
+ * it would read the memory. Therefore a single page gap will be enough
+ * to prevent any operation from shifting the SSP to an adjacent stack,
+ * since it would have to land in the gap at least once, causing a
+ * fault.
+ */
+static unsigned long alloc_shstk(unsigned long addr, unsigned long size,
+ unsigned long token_offset, bool set_res_tok)
+{
+ int flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_ABOVE4G;
+ struct mm_struct *mm = current->mm;
+ unsigned long mapped_addr, unused;
+
+ if (addr)
+ flags |= MAP_FIXED_NOREPLACE;
+
+ mmap_write_lock(mm);
+ mapped_addr = do_mmap(NULL, addr, size, PROT_READ, flags,
+ VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL);
+ mmap_write_unlock(mm);
+
+ if (!set_res_tok || IS_ERR_VALUE(mapped_addr))
+ goto out;
+
+ if (create_rstor_token(mapped_addr + token_offset, NULL)) {
+ vm_munmap(mapped_addr, size);
+ return -EINVAL;
+ }
+
+out:
+ return mapped_addr;
+}
+
+static unsigned long adjust_shstk_size(unsigned long size)
+{
+ if (size)
+ return PAGE_ALIGN(size);
+
+ return PAGE_ALIGN(min_t(unsigned long long, rlimit(RLIMIT_STACK), SZ_4G));
+}
+
+static void unmap_shadow_stack(u64 base, u64 size)
+{
+ int r;
+
+ r = vm_munmap(base, size);
+
+ /*
+ * mmap_write_lock_killable() failed with -EINTR. This means
+	 * the process is about to die and have its MM cleaned up.
+ * This task shouldn't ever make it back to userspace. In this
+ * case it is ok to leak a shadow stack, so just exit out.
+ */
+ if (r == -EINTR)
+ return;
+
+ /*
+ * For all other types of vm_munmap() failure, either the
+	 * system is out of memory or there is a bug.
+ */
+ WARN_ON_ONCE(r);
+}
+
+static int shstk_setup(void)
+{
+ struct thread_shstk *shstk = &current->thread.shstk;
+ unsigned long addr, size;
+
+ /* Already enabled */
+ if (features_enabled(ARCH_SHSTK_SHSTK))
+ return 0;
+
+ /* Also not supported for 32 bit */
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || in_ia32_syscall())
+ return -EOPNOTSUPP;
+
+ size = adjust_shstk_size(0);
+ addr = alloc_shstk(0, size, 0, false);
+ if (IS_ERR_VALUE(addr))
+ return PTR_ERR((void *)addr);
+
+ fpregs_lock_and_load();
+ wrmsrq(MSR_IA32_PL3_SSP, addr + size);
+ wrmsrq(MSR_IA32_U_CET, CET_SHSTK_EN);
+ fpregs_unlock();
+
+ shstk->base = addr;
+ shstk->size = size;
+ features_set(ARCH_SHSTK_SHSTK);
+
+ return 0;
+}
+
+void reset_thread_features(void)
+{
+ memset(&current->thread.shstk, 0, sizeof(struct thread_shstk));
+ current->thread.features = 0;
+ current->thread.features_locked = 0;
+}
+
+unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, u64 clone_flags,
+ unsigned long stack_size)
+{
+ struct thread_shstk *shstk = &tsk->thread.shstk;
+ unsigned long addr, size;
+
+ /*
+ * If shadow stack is not enabled on the new thread, skip any
+ * switch to a new shadow stack.
+ */
+ if (!features_enabled(ARCH_SHSTK_SHSTK))
+ return 0;
+
+ /*
+	 * For CLONE_VFORK the child will share the parent's shadow stack.
+	 * Make sure to clear the internal tracking of the thread shadow
+	 * stack so the freeing logic that runs for the child knows to
+	 * leave it alone.
+ */
+ if (clone_flags & CLONE_VFORK) {
+ shstk->base = 0;
+ shstk->size = 0;
+ return 0;
+ }
+
+ /*
+	 * For !CLONE_VM the child will use a copy of the parent's shadow
+ * stack.
+ */
+ if (!(clone_flags & CLONE_VM))
+ return 0;
+
+ size = adjust_shstk_size(stack_size);
+ addr = alloc_shstk(0, size, 0, false);
+ if (IS_ERR_VALUE(addr))
+ return addr;
+
+ shstk->base = addr;
+ shstk->size = size;
+
+ return addr + size;
+}
+
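+/* Read the current task's user shadow stack pointer from MSR_IA32_PL3_SSP. */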
+static unsigned long get_user_shstk_addr(void)
+{
+ unsigned long long ssp;
+
+ fpregs_lock_and_load();
+
+ rdmsrq(MSR_IA32_PL3_SSP, ssp);
+
+ fpregs_unlock();
+
+ return ssp;
+}
+
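+/* Pop the topmost 8-byte entry from the current task's shadow stack into *val. */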
+int shstk_pop(u64 *val)
+{
+ int ret = 0;
+ u64 ssp;
+
+ if (!features_enabled(ARCH_SHSTK_SHSTK))
+ return -ENOTSUPP;
+
+ fpregs_lock_and_load();
+
+ rdmsrq(MSR_IA32_PL3_SSP, ssp);
+ if (val && get_user(*val, (__user u64 *)ssp))
+ ret = -EFAULT;
+ else
+ wrmsrq(MSR_IA32_PL3_SSP, ssp + SS_FRAME_SIZE);
+ fpregs_unlock();
+
+ return ret;
+}
+
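+/* Push @val as a new 8-byte entry onto the current task's shadow stack. */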
+int shstk_push(u64 val)
+{
+ u64 ssp;
+ int ret;
+
+ if (!features_enabled(ARCH_SHSTK_SHSTK))
+ return -ENOTSUPP;
+
+ fpregs_lock_and_load();
+
+ rdmsrq(MSR_IA32_PL3_SSP, ssp);
+ ssp -= SS_FRAME_SIZE;
+ ret = write_user_shstk_64((__user void *)ssp, val);
+ if (!ret)
+ wrmsrq(MSR_IA32_PL3_SSP, ssp);
+ fpregs_unlock();
+
+ return ret;
+}
+
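+/*
+ * Bit 63 marks a shadow stack entry as data (e.g. a sigframe token)
+ * rather than a return address.
+ */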
+#define SHSTK_DATA_BIT BIT(63)
+
+static int put_shstk_data(u64 __user *addr, u64 data)
+{
+ if (WARN_ON_ONCE(data & SHSTK_DATA_BIT))
+ return -EINVAL;
+
+ /*
+ * Mark the high bit so that the sigframe can't be processed as a
+ * return address.
+ */
+ if (write_user_shstk_64(addr, data | SHSTK_DATA_BIT))
+ return -EFAULT;
+ return 0;
+}
+
+static int get_shstk_data(unsigned long *data, unsigned long __user *addr)
+{
+ unsigned long ldata;
+
+ if (unlikely(get_user(ldata, addr)))
+ return -EFAULT;
+
+ if (!(ldata & SHSTK_DATA_BIT))
+ return -EINVAL;
+
+ *data = ldata & ~SHSTK_DATA_BIT;
+
+ return 0;
+}
+
+static int shstk_push_sigframe(unsigned long *ssp)
+{
+ unsigned long target_ssp = *ssp;
+
+ /* Token must be aligned */
+ if (!IS_ALIGNED(target_ssp, 8))
+ return -EINVAL;
+
+ *ssp -= SS_FRAME_SIZE;
+ if (put_shstk_data((void __user *)*ssp, target_ssp))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int shstk_pop_sigframe(unsigned long *ssp)
+{
+ struct vm_area_struct *vma;
+ unsigned long token_addr;
+ bool need_to_check_vma;
+ int err = 1;
+
+ /*
+ * It is possible for the SSP to be off the end of a shadow stack by 4
+ * or 8 bytes. If the shadow stack is at the start of a page or 4 bytes
+ * before it, it might be this case, so check that the address being
+ * read is actually shadow stack.
+ */
+ if (!IS_ALIGNED(*ssp, 8))
+ return -EINVAL;
+
+ need_to_check_vma = PAGE_ALIGN(*ssp) == *ssp;
+
+ if (need_to_check_vma)
+ mmap_read_lock_killable(current->mm);
+
+ err = get_shstk_data(&token_addr, (unsigned long __user *)*ssp);
+ if (unlikely(err))
+ goto out_err;
+
+ if (need_to_check_vma) {
+ vma = find_vma(current->mm, *ssp);
+ if (!vma || !(vma->vm_flags & VM_SHADOW_STACK)) {
+ err = -EFAULT;
+ goto out_err;
+ }
+
+ mmap_read_unlock(current->mm);
+ }
+
+ /* Restore SSP aligned? */
+ if (unlikely(!IS_ALIGNED(token_addr, 8)))
+ return -EINVAL;
+
+ /* SSP in userspace? */
+ if (unlikely(token_addr >= TASK_SIZE_MAX))
+ return -EINVAL;
+
+ *ssp = token_addr;
+
+ return 0;
+out_err:
+ if (need_to_check_vma)
+ mmap_read_unlock(current->mm);
+ return err;
+}
+
+int setup_signal_shadow_stack(struct ksignal *ksig)
+{
+ void __user *restorer = ksig->ka.sa.sa_restorer;
+ unsigned long ssp;
+ int err;
+
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
+ !features_enabled(ARCH_SHSTK_SHSTK))
+ return 0;
+
+ if (!restorer)
+ return -EINVAL;
+
+ ssp = get_user_shstk_addr();
+ if (unlikely(!ssp))
+ return -EINVAL;
+
+ err = shstk_push_sigframe(&ssp);
+ if (unlikely(err))
+ return err;
+
+ /* Push restorer address */
+ ssp -= SS_FRAME_SIZE;
+ err = write_user_shstk_64((u64 __user *)ssp, (u64)restorer);
+ if (unlikely(err))
+ return -EFAULT;
+
+ fpregs_lock_and_load();
+ wrmsrq(MSR_IA32_PL3_SSP, ssp);
+ fpregs_unlock();
+
+ return 0;
+}
+
+int restore_signal_shadow_stack(void)
+{
+ unsigned long ssp;
+ int err;
+
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
+ !features_enabled(ARCH_SHSTK_SHSTK))
+ return 0;
+
+ ssp = get_user_shstk_addr();
+ if (unlikely(!ssp))
+ return -EINVAL;
+
+ err = shstk_pop_sigframe(&ssp);
+ if (unlikely(err))
+ return err;
+
+ fpregs_lock_and_load();
+ wrmsrq(MSR_IA32_PL3_SSP, ssp);
+ fpregs_unlock();
+
+ return 0;
+}
+
+void shstk_free(struct task_struct *tsk)
+{
+ struct thread_shstk *shstk = &tsk->thread.shstk;
+
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
+ !features_enabled(ARCH_SHSTK_SHSTK))
+ return;
+
+ /*
+ * When fork() with CLONE_VM fails, the child (tsk) already has a
+ * shadow stack allocated, and exit_thread() calls this function to
+ * free it. In this case the parent (current) and the child share
+ * the same mm struct.
+ */
+ if (!tsk->mm || tsk->mm != current->mm)
+ return;
+
+ /*
+ * If shstk->base is NULL, then this task is not managing its
+ * own shadow stack (CLONE_VFORK). So skip freeing it.
+ */
+ if (!shstk->base)
+ return;
+
+ /*
+	 * shstk->base being NULL is normal for CLONE_VFORK child tasks.
+	 * But size == 0 with a non-NULL shstk->base is not normal and
+	 * indicates an attempt to free the thread shadow stack twice.
+	 * Warn about it.
+ */
+ if (WARN_ON(!shstk->size))
+ return;
+
+ unmap_shadow_stack(shstk->base, shstk->size);
+
+ shstk->size = 0;
+}
+
+static int wrss_control(bool enable)
+{
+ u64 msrval;
+
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
+ return -EOPNOTSUPP;
+
+ /*
+ * Only enable WRSS if shadow stack is enabled. If shadow stack is not
+ * enabled, WRSS will already be disabled, so don't bother clearing it
+ * when disabling.
+ */
+ if (!features_enabled(ARCH_SHSTK_SHSTK))
+ return -EPERM;
+
+ /* Already enabled/disabled? */
+ if (features_enabled(ARCH_SHSTK_WRSS) == enable)
+ return 0;
+
+ fpregs_lock_and_load();
+ rdmsrq(MSR_IA32_U_CET, msrval);
+
+ if (enable) {
+ features_set(ARCH_SHSTK_WRSS);
+ msrval |= CET_WRSS_EN;
+ } else {
+ features_clr(ARCH_SHSTK_WRSS);
+ if (!(msrval & CET_WRSS_EN))
+ goto unlock;
+
+ msrval &= ~CET_WRSS_EN;
+ }
+
+ wrmsrq(MSR_IA32_U_CET, msrval);
+
+unlock:
+ fpregs_unlock();
+
+ return 0;
+}
+
+static int shstk_disable(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
+ return -EOPNOTSUPP;
+
+ /* Already disabled? */
+ if (!features_enabled(ARCH_SHSTK_SHSTK))
+ return 0;
+
+ fpregs_lock_and_load();
+ /* Disable WRSS too when disabling shadow stack */
+ wrmsrq(MSR_IA32_U_CET, 0);
+ wrmsrq(MSR_IA32_PL3_SSP, 0);
+ fpregs_unlock();
+
+ shstk_free(current);
+ features_clr(ARCH_SHSTK_SHSTK | ARCH_SHSTK_WRSS);
+
+ return 0;
+}
+
+SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags)
+{
+ bool set_tok = flags & SHADOW_STACK_SET_TOKEN;
+ unsigned long aligned_size;
+
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
+ return -EOPNOTSUPP;
+
+ if (flags & ~SHADOW_STACK_SET_TOKEN)
+ return -EINVAL;
+
+ /* If there isn't space for a token */
+ if (set_tok && size < 8)
+ return -ENOSPC;
+
+ if (addr && addr < SZ_4G)
+ return -ERANGE;
+
+ /*
+ * An overflow would result in attempting to write the restore token
+ * to the wrong location. Not catastrophic, but just return the right
+ * error code and block it.
+ */
+ aligned_size = PAGE_ALIGN(size);
+ if (aligned_size < size)
+ return -EOVERFLOW;
+
+ return alloc_shstk(addr, aligned_size, size, set_tok);
+}
+
+long shstk_prctl(struct task_struct *task, int option, unsigned long arg2)
+{
+ unsigned long features = arg2;
+
+ if (option == ARCH_SHSTK_STATUS) {
+ return put_user(task->thread.features, (unsigned long __user *)arg2);
+ }
+
+ if (option == ARCH_SHSTK_LOCK) {
+ task->thread.features_locked |= features;
+ return 0;
+ }
+
+ /* Only allow via ptrace */
+ if (task != current) {
+ if (option == ARCH_SHSTK_UNLOCK && IS_ENABLED(CONFIG_CHECKPOINT_RESTORE)) {
+ task->thread.features_locked &= ~features;
+ return 0;
+ }
+ return -EINVAL;
+ }
+
+	/* Do not allow changing locked features */
+ if (features & task->thread.features_locked)
+ return -EPERM;
+
+ /* Only support enabling/disabling one feature at a time. */
+ if (hweight_long(features) > 1)
+ return -EINVAL;
+
+ if (option == ARCH_SHSTK_DISABLE) {
+ if (features & ARCH_SHSTK_WRSS)
+ return wrss_control(false);
+ if (features & ARCH_SHSTK_SHSTK)
+ return shstk_disable();
+ return -EINVAL;
+ }
+
+ /* Handle ARCH_SHSTK_ENABLE */
+ if (features & ARCH_SHSTK_SHSTK)
+ return shstk_setup();
+ if (features & ARCH_SHSTK_WRSS)
+ return wrss_control(true);
+ return -EINVAL;
+}
+
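+/* Rewrite the topmost entry on the current task's shadow stack with @val. */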
+int shstk_update_last_frame(unsigned long val)
+{
+ unsigned long ssp;
+
+ if (!features_enabled(ARCH_SHSTK_SHSTK))
+ return 0;
+
+ ssp = get_user_shstk_addr();
+ return write_user_shstk_64((u64 __user *)ssp, (u64)val);
+}
+
+bool shstk_is_enabled(void)
+{
+ return features_enabled(ARCH_SHSTK_SHSTK);
+}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 4c578751e94e..2404233336ab 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
@@ -6,695 +7,261 @@
* 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
* 2000-2002 x86-64 support by Andi Kleen
*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/kernel.h>
-#include <linux/signal.h>
+#include <linux/kstrtox.h>
#include <linux/errno.h>
#include <linux/wait.h>
-#include <linux/ptrace.h>
-#include <linux/tracehook.h>
#include <linux/unistd.h>
#include <linux/stddef.h>
#include <linux/personality.h>
#include <linux/uaccess.h>
+#include <linux/user-return-notifier.h>
+#include <linux/uprobes.h>
+#include <linux/context_tracking.h>
+#include <linux/entry-common.h>
+#include <linux/syscalls.h>
+#include <linux/rseq.h>
#include <asm/processor.h>
#include <asm/ucontext.h>
-#include <asm/i387.h>
+#include <asm/fpu/signal.h>
+#include <asm/fpu/xstate.h>
#include <asm/vdso.h>
#include <asm/mce.h>
-
-#ifdef CONFIG_X86_64
-#include <asm/proto.h>
-#include <asm/ia32_unistd.h>
-#endif /* CONFIG_X86_64 */
+#include <asm/sighandling.h>
+#include <asm/vm86.h>
#include <asm/syscall.h>
-#include <asm/syscalls.h>
-
#include <asm/sigframe.h>
+#include <asm/signal.h>
+#include <asm/shstk.h>
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
-#define __FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \
- X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \
- X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \
- X86_EFLAGS_CF)
-
-#ifdef CONFIG_X86_32
-# define FIX_EFLAGS (__FIX_EFLAGS | X86_EFLAGS_RF)
-#else
-# define FIX_EFLAGS __FIX_EFLAGS
-#endif
-
-#define COPY(x) do { \
- get_user_ex(regs->x, &sc->x); \
-} while (0)
-
-#define GET_SEG(seg) ({ \
- unsigned short tmp; \
- get_user_ex(tmp, &sc->seg); \
- tmp; \
-})
-
-#define COPY_SEG(seg) do { \
- regs->seg = GET_SEG(seg); \
-} while (0)
+static inline int is_ia32_compat_frame(struct ksignal *ksig)
+{
+ return IS_ENABLED(CONFIG_IA32_EMULATION) &&
+ ksig->ka.sa.sa_flags & SA_IA32_ABI;
+}
-#define COPY_SEG_CPL3(seg) do { \
- regs->seg = GET_SEG(seg) | 3; \
-} while (0)
+static inline int is_ia32_frame(struct ksignal *ksig)
+{
+ return IS_ENABLED(CONFIG_X86_32) || is_ia32_compat_frame(ksig);
+}
-static int
-restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
- unsigned long *pax)
+static inline int is_x32_frame(struct ksignal *ksig)
{
- void __user *buf;
- unsigned int tmpflags;
- unsigned int err = 0;
-
- /* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
-
- get_user_try {
-
-#ifdef CONFIG_X86_32
- set_user_gs(regs, GET_SEG(gs));
- COPY_SEG(fs);
- COPY_SEG(es);
- COPY_SEG(ds);
-#endif /* CONFIG_X86_32 */
-
- COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
- COPY(dx); COPY(cx); COPY(ip);
-
-#ifdef CONFIG_X86_64
- COPY(r8);
- COPY(r9);
- COPY(r10);
- COPY(r11);
- COPY(r12);
- COPY(r13);
- COPY(r14);
- COPY(r15);
-#endif /* CONFIG_X86_64 */
-
-#ifdef CONFIG_X86_32
- COPY_SEG_CPL3(cs);
- COPY_SEG_CPL3(ss);
-#else /* !CONFIG_X86_32 */
- /* Kernel saves and restores only the CS segment register on signals,
- * which is the bare minimum needed to allow mixed 32/64-bit code.
- * App's signal handler can save/restore other segments if needed. */
- COPY_SEG_CPL3(cs);
-#endif /* CONFIG_X86_32 */
-
- get_user_ex(tmpflags, &sc->flags);
- regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
- regs->orig_ax = -1; /* disable syscall checks */
-
- get_user_ex(buf, &sc->fpstate);
- err |= restore_i387_xstate(buf);
-
- get_user_ex(*pax, &sc->ax);
- } get_user_catch(err);
-
- return err;
+ return IS_ENABLED(CONFIG_X86_X32_ABI) &&
+ ksig->ka.sa.sa_flags & SA_X32_ABI;
}
-static int
-setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
- struct pt_regs *regs, unsigned long mask)
+/*
+ * Enable all pkeys temporarily, so as to ensure that both the current
+ * execution stack as well as the alternate signal stack are writeable.
+ * The application can use any of the available pkeys to protect the
+ * alternate signal stack, and we don't know which one it is, so enable
+ * all. The PKRU register will be reset to init_pkru later in the flow,
+ * in fpu__clear_user_states(), and it is the application's responsibility
+ * to enable the appropriate pkey as the first step in the signal handler
+ * so that the handler does not segfault.
+ */
+static inline u32 sig_prepare_pkru(void)
{
- int err = 0;
-
- put_user_try {
-
-#ifdef CONFIG_X86_32
- put_user_ex(get_user_gs(regs), (unsigned int __user *)&sc->gs);
- put_user_ex(regs->fs, (unsigned int __user *)&sc->fs);
- put_user_ex(regs->es, (unsigned int __user *)&sc->es);
- put_user_ex(regs->ds, (unsigned int __user *)&sc->ds);
-#endif /* CONFIG_X86_32 */
-
- put_user_ex(regs->di, &sc->di);
- put_user_ex(regs->si, &sc->si);
- put_user_ex(regs->bp, &sc->bp);
- put_user_ex(regs->sp, &sc->sp);
- put_user_ex(regs->bx, &sc->bx);
- put_user_ex(regs->dx, &sc->dx);
- put_user_ex(regs->cx, &sc->cx);
- put_user_ex(regs->ax, &sc->ax);
-#ifdef CONFIG_X86_64
- put_user_ex(regs->r8, &sc->r8);
- put_user_ex(regs->r9, &sc->r9);
- put_user_ex(regs->r10, &sc->r10);
- put_user_ex(regs->r11, &sc->r11);
- put_user_ex(regs->r12, &sc->r12);
- put_user_ex(regs->r13, &sc->r13);
- put_user_ex(regs->r14, &sc->r14);
- put_user_ex(regs->r15, &sc->r15);
-#endif /* CONFIG_X86_64 */
-
- put_user_ex(current->thread.trap_no, &sc->trapno);
- put_user_ex(current->thread.error_code, &sc->err);
- put_user_ex(regs->ip, &sc->ip);
-#ifdef CONFIG_X86_32
- put_user_ex(regs->cs, (unsigned int __user *)&sc->cs);
- put_user_ex(regs->flags, &sc->flags);
- put_user_ex(regs->sp, &sc->sp_at_signal);
- put_user_ex(regs->ss, (unsigned int __user *)&sc->ss);
-#else /* !CONFIG_X86_32 */
- put_user_ex(regs->flags, &sc->flags);
- put_user_ex(regs->cs, &sc->cs);
- put_user_ex(0, &sc->gs);
- put_user_ex(0, &sc->fs);
-#endif /* CONFIG_X86_32 */
-
- put_user_ex(fpstate, &sc->fpstate);
-
- /* non-iBCS2 extensions.. */
- put_user_ex(mask, &sc->oldmask);
- put_user_ex(current->thread.cr2, &sc->cr2);
- } put_user_catch(err);
-
- return err;
+ u32 orig_pkru = read_pkru();
+
+ write_pkru(0);
+ return orig_pkru;
}
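
For illustration, a minimal userspace sketch (not part of this patch; it assumes glibc >= 2.27 pkey wrappers and a CPU with protection keys) of the PKRU mechanics the comment above relies on: a page covered by a non-default pkey becomes accessible or inaccessible purely by rewriting PKRU, which is why the kernel briefly opens all keys before writing the signal frame onto a possibly pkey-protected alternate stack.

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	/* Allocate a protection key and a page guarded by it. */
	int key = pkey_alloc(0, 0);
	if (key < 0) {
		perror("pkey_alloc");	/* no pku support or keys exhausted */
		return 1;
	}
	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	pkey_mprotect(p, 4096, PROT_READ | PROT_WRITE, key);

	p[0] = 1;				/* allowed: the key's PKRU bits are 0 */

	pkey_set(key, PKEY_DISABLE_ACCESS);	/* set the key's access-disable bit in PKRU */
	/* p[0] = 2; would now fault with SIGSEGV (si_code SEGV_PKUERR) */

	pkey_set(key, 0);			/* re-enable, as write_pkru(0) does for every key */
	printf("key %d restored, p[0] = %d\n", key, p[0]);

	pkey_free(key);
	return 0;
}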
/*
* Set up a signal frame.
*/
+/* x86 ABI requires 16-byte alignment */
+#define FRAME_ALIGNMENT 16UL
+
+#define MAX_FRAME_PADDING (FRAME_ALIGNMENT - 1)
+
/*
* Determine which stack to use..
*/
-static unsigned long align_sigframe(unsigned long sp)
-{
-#ifdef CONFIG_X86_32
- /*
- * Align the stack pointer according to the i386 ABI,
- * i.e. so that on function entry ((sp + 4) & 15) == 0.
- */
- sp = ((sp + 4) & -16ul) - 4;
-#else /* !CONFIG_X86_32 */
- sp = round_down(sp, 16) - 8;
-#endif
- return sp;
-}
-
-static inline void __user *
-get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
+void __user *
+get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size,
void __user **fpstate)
{
+ struct k_sigaction *ka = &ksig->ka;
+ int ia32_frame = is_ia32_frame(ksig);
/* Default to using normal stack */
+ bool nested_altstack = on_sig_stack(regs->sp);
+ bool entering_altstack = false;
+ unsigned long math_size = 0;
unsigned long sp = regs->sp;
- int onsigstack = on_sig_stack(sp);
+ unsigned long buf_fx = 0;
+ u32 pkru;
-#ifdef CONFIG_X86_64
/* redzone */
- sp -= 128;
-#endif /* CONFIG_X86_64 */
-
- if (!onsigstack) {
- /* This is the X/Open sanctioned signal stack switching. */
- if (ka->sa.sa_flags & SA_ONSTACK) {
- if (current->sas_ss_size)
- sp = current->sas_ss_sp + current->sas_ss_size;
- } else {
-#ifdef CONFIG_X86_32
- /* This is the legacy signal stack switching. */
- if ((regs->ss & 0xffff) != __USER_DS &&
- !(ka->sa.sa_flags & SA_RESTORER) &&
- ka->sa.sa_restorer)
- sp = (unsigned long) ka->sa.sa_restorer;
-#endif /* CONFIG_X86_32 */
+ if (!ia32_frame)
+ sp -= 128;
+
+ /* This is the X/Open sanctioned signal stack switching. */
+ if (ka->sa.sa_flags & SA_ONSTACK) {
+ /*
+ * This checks nested_altstack via sas_ss_flags(). Sensible
+ * programs use SS_AUTODISARM, which disables that check, and
+		 * programs that don't use SS_AUTODISARM get compatible behavior.
+ */
+ if (sas_ss_flags(sp) == 0) {
+ sp = current->sas_ss_sp + current->sas_ss_size;
+ entering_altstack = true;
}
+ } else if (ia32_frame &&
+ !nested_altstack &&
+ regs->ss != __USER_DS &&
+ !(ka->sa.sa_flags & SA_RESTORER) &&
+ ka->sa.sa_restorer) {
+ /* This is the legacy signal stack switching. */
+ sp = (unsigned long) ka->sa.sa_restorer;
+ entering_altstack = true;
}
- if (used_math()) {
- sp -= sig_xstate_size;
-#ifdef CONFIG_X86_64
- sp = round_down(sp, 64);
-#endif /* CONFIG_X86_64 */
- *fpstate = (void __user *)sp;
- }
+ sp = fpu__alloc_mathframe(sp, ia32_frame, &buf_fx, &math_size);
+ *fpstate = (void __user *)sp;
- sp = align_sigframe(sp - frame_size);
+ sp -= frame_size;
+
+ if (ia32_frame)
+ /*
+ * Align the stack pointer according to the i386 ABI,
+ * i.e. so that on function entry ((sp + 4) & 15) == 0.
+ */
+ sp = ((sp + 4) & -FRAME_ALIGNMENT) - 4;
+ else
+ sp = round_down(sp, FRAME_ALIGNMENT) - 8;
/*
* If we are on the alternate signal stack and would overflow it, don't.
* Return an always-bogus address instead so we will die with SIGSEGV.
*/
- if (onsigstack && !likely(on_sig_stack(sp)))
- return (void __user *)-1L;
-
- /* save i387 state */
- if (used_math() && save_i387_xstate(*fpstate) < 0)
- return (void __user *)-1L;
+ if (unlikely((nested_altstack || entering_altstack) &&
+ !__on_sig_stack(sp))) {
- return (void __user *)sp;
-}
-
-#ifdef CONFIG_X86_32
-static const struct {
- u16 poplmovl;
- u32 val;
- u16 int80;
-} __attribute__((packed)) retcode = {
- 0xb858, /* popl %eax; movl $..., %eax */
- __NR_sigreturn,
- 0x80cd, /* int $0x80 */
-};
-
-static const struct {
- u8 movl;
- u32 val;
- u16 int80;
- u8 pad;
-} __attribute__((packed)) rt_retcode = {
- 0xb8, /* movl $..., %eax */
- __NR_rt_sigreturn,
- 0x80cd, /* int $0x80 */
- 0
-};
-
-static int
-__setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
- struct pt_regs *regs)
-{
- struct sigframe __user *frame;
- void __user *restorer;
- int err = 0;
- void __user *fpstate = NULL;
-
- frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
+ if (show_unhandled_signals && printk_ratelimit())
+ pr_info("%s[%d] overflowed sigaltstack\n",
+ current->comm, task_pid_nr(current));
- if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
- return -EFAULT;
-
- if (__put_user(sig, &frame->sig))
- return -EFAULT;
-
- if (setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]))
- return -EFAULT;
-
- if (_NSIG_WORDS > 1) {
- if (__copy_to_user(&frame->extramask, &set->sig[1],
- sizeof(frame->extramask)))
- return -EFAULT;
+ return (void __user *)-1L;
}
- if (current->mm->context.vdso)
- restorer = VDSO32_SYMBOL(current->mm->context.vdso, sigreturn);
- else
- restorer = &frame->retcode;
- if (ka->sa.sa_flags & SA_RESTORER)
- restorer = ka->sa.sa_restorer;
-
- /* Set up to return from userspace. */
- err |= __put_user(restorer, &frame->pretcode);
-
- /*
- * This is popl %eax ; movl $__NR_sigreturn, %eax ; int $0x80
- *
- * WE DO NOT USE IT ANY MORE! It's only left here for historical
- * reasons and because gdb uses it as a signature to notice
- * signal handler stack frames.
- */
- err |= __put_user(*((u64 *)&retcode), (u64 *)frame->retcode);
-
- if (err)
- return -EFAULT;
-
- /* Set up registers for signal handler */
- regs->sp = (unsigned long)frame;
- regs->ip = (unsigned long)ka->sa.sa_handler;
- regs->ax = (unsigned long)sig;
- regs->dx = 0;
- regs->cx = 0;
-
- regs->ds = __USER_DS;
- regs->es = __USER_DS;
- regs->ss = __USER_DS;
- regs->cs = __USER_CS;
-
- return 0;
-}
-
-static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
- sigset_t *set, struct pt_regs *regs)
-{
- struct rt_sigframe __user *frame;
- void __user *restorer;
- int err = 0;
- void __user *fpstate = NULL;
-
- frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
-
- if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
- return -EFAULT;
-
- put_user_try {
- put_user_ex(sig, &frame->sig);
- put_user_ex(&frame->info, &frame->pinfo);
- put_user_ex(&frame->uc, &frame->puc);
- err |= copy_siginfo_to_user(&frame->info, info);
-
- /* Create the ucontext. */
- if (cpu_has_xsave)
- put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags);
- else
- put_user_ex(0, &frame->uc.uc_flags);
- put_user_ex(0, &frame->uc.uc_link);
- put_user_ex(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
- put_user_ex(sas_ss_flags(regs->sp),
- &frame->uc.uc_stack.ss_flags);
- put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
- err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
- regs, set->sig[0]);
- err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
-
- /* Set up to return from userspace. */
- restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn);
- if (ka->sa.sa_flags & SA_RESTORER)
- restorer = ka->sa.sa_restorer;
- put_user_ex(restorer, &frame->pretcode);
-
+ /* Update PKRU to enable access to the alternate signal stack. */
+ pkru = sig_prepare_pkru();
+ /* save i387 and extended state */
+ if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size, pkru)) {
/*
- * This is movl $__NR_rt_sigreturn, %ax ; int $0x80
- *
- * WE DO NOT USE IT ANY MORE! It's only left here for historical
- * reasons and because gdb uses it as a signature to notice
- * signal handler stack frames.
+ * Restore PKRU to the original, user-defined value; disable
+ * extra pkeys enabled for the alternate signal stack, if any.
*/
- put_user_ex(*((u64 *)&rt_retcode), (u64 *)frame->retcode);
- } put_user_catch(err);
-
- if (err)
- return -EFAULT;
-
- /* Set up registers for signal handler */
- regs->sp = (unsigned long)frame;
- regs->ip = (unsigned long)ka->sa.sa_handler;
- regs->ax = (unsigned long)sig;
- regs->dx = (unsigned long)&frame->info;
- regs->cx = (unsigned long)&frame->uc;
-
- regs->ds = __USER_DS;
- regs->es = __USER_DS;
- regs->ss = __USER_DS;
- regs->cs = __USER_CS;
-
- return 0;
-}
-#else /* !CONFIG_X86_32 */
-static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
- sigset_t *set, struct pt_regs *regs)
-{
- struct rt_sigframe __user *frame;
- void __user *fp = NULL;
- int err = 0;
- struct task_struct *me = current;
-
- frame = get_sigframe(ka, regs, sizeof(struct rt_sigframe), &fp);
-
- if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
- return -EFAULT;
-
- if (ka->sa.sa_flags & SA_SIGINFO) {
- if (copy_siginfo_to_user(&frame->info, info))
- return -EFAULT;
+ write_pkru(pkru);
+ return (void __user *)-1L;
}
- put_user_try {
- /* Create the ucontext. */
- if (cpu_has_xsave)
- put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags);
- else
- put_user_ex(0, &frame->uc.uc_flags);
- put_user_ex(0, &frame->uc.uc_link);
- put_user_ex(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
- put_user_ex(sas_ss_flags(regs->sp),
- &frame->uc.uc_stack.ss_flags);
- put_user_ex(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
- err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]);
- err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
-
- /* Set up to return from userspace. If provided, use a stub
- already in userspace. */
- /* x86-64 should always use SA_RESTORER. */
- if (ka->sa.sa_flags & SA_RESTORER) {
- put_user_ex(ka->sa.sa_restorer, &frame->pretcode);
- } else {
- /* could use a vstub here */
- err |= -EFAULT;
- }
- } put_user_catch(err);
-
- if (err)
- return -EFAULT;
-
- /* Set up registers for signal handler */
- regs->di = sig;
- /* In case the signal handler was declared without prototypes */
- regs->ax = 0;
-
- /* This also works for non SA_SIGINFO handlers because they expect the
- next argument after the signal number on the stack. */
- regs->si = (unsigned long)&frame->info;
- regs->dx = (unsigned long)&frame->uc;
- regs->ip = (unsigned long) ka->sa.sa_handler;
-
- regs->sp = (unsigned long)frame;
-
- /* Set up the CS register to run signal handlers in 64-bit mode,
- even if the handler happens to be interrupting 32-bit code. */
- regs->cs = __USER_CS;
-
- return 0;
+ return (void __user *)sp;
}
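
For illustration, a minimal userspace sketch (not part of this patch) of the SS_AUTODISARM behaviour that the stack-selection code above prefers; the SS_AUTODISARM fallback definition is the Linux UAPI value in case the libc headers do not expose it.

#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>

#ifndef SS_AUTODISARM
# define SS_AUTODISARM	(1U << 31)	/* Linux UAPI value */
#endif

static void handler(int sig)
{
	stack_t cur;

	sigaltstack(NULL, &cur);	/* query only */
	/*
	 * With SS_AUTODISARM the kernel disarmed the altstack on entry,
	 * so this reports SS_DISABLE; it is re-armed again on sigreturn.
	 * (printf is not async-signal-safe; fine for a throwaway demo.)
	 */
	printf("in handler: ss_flags=%d\n", cur.ss_flags);
}

int main(void)
{
	stack_t ss = {
		.ss_sp	  = malloc(SIGSTKSZ),
		.ss_size  = SIGSTKSZ,
		.ss_flags = SS_AUTODISARM,
	};
	struct sigaction sa = { .sa_handler = handler, .sa_flags = SA_ONSTACK };

	if (sigaltstack(&ss, NULL)) {
		perror("sigaltstack");	/* old kernels reject SS_AUTODISARM */
		return 1;
	}
	sigaction(SIGUSR1, &sa, NULL);
	raise(SIGUSR1);
	return 0;
}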
-#endif /* CONFIG_X86_32 */
-#ifdef CONFIG_X86_32
/*
- * Atomically swap in the new signal mask, and wait for a signal.
+ * There are four different struct types for signal frame: sigframe_ia32,
+ * rt_sigframe_ia32, rt_sigframe_x32, and rt_sigframe. Use the worst case
+ * -- the largest size. It means the size for 64-bit apps is a bit more
+ * than needed, but this keeps the code simple.
*/
-asmlinkage int
-sys_sigsuspend(int history0, int history1, old_sigset_t mask)
-{
- mask &= _BLOCKABLE;
- spin_lock_irq(&current->sighand->siglock);
- current->saved_sigmask = current->blocked;
- siginitset(&current->blocked, mask);
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
-
- current->state = TASK_INTERRUPTIBLE;
- schedule();
- set_restore_sigmask();
-
- return -ERESTARTNOHAND;
-}
-
-asmlinkage int
-sys_sigaction(int sig, const struct old_sigaction __user *act,
- struct old_sigaction __user *oact)
-{
- struct k_sigaction new_ka, old_ka;
- int ret = 0;
-
- if (act) {
- old_sigset_t mask;
-
- if (!access_ok(VERIFY_READ, act, sizeof(*act)))
- return -EFAULT;
-
- get_user_try {
- get_user_ex(new_ka.sa.sa_handler, &act->sa_handler);
- get_user_ex(new_ka.sa.sa_flags, &act->sa_flags);
- get_user_ex(mask, &act->sa_mask);
- get_user_ex(new_ka.sa.sa_restorer, &act->sa_restorer);
- } get_user_catch(ret);
-
- if (ret)
- return -EFAULT;
- siginitset(&new_ka.sa.sa_mask, mask);
- }
-
- ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
-
- if (!ret && oact) {
- if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)))
- return -EFAULT;
-
- put_user_try {
- put_user_ex(old_ka.sa.sa_handler, &oact->sa_handler);
- put_user_ex(old_ka.sa.sa_flags, &oact->sa_flags);
- put_user_ex(old_ka.sa.sa_mask.sig[0], &oact->sa_mask);
- put_user_ex(old_ka.sa.sa_restorer, &oact->sa_restorer);
- } put_user_catch(ret);
-
- if (ret)
- return -EFAULT;
- }
-
- return ret;
-}
-#endif /* CONFIG_X86_32 */
-
-#ifdef CONFIG_X86_32
-int sys_sigaltstack(struct pt_regs *regs)
-{
- const stack_t __user *uss = (const stack_t __user *)regs->bx;
- stack_t __user *uoss = (stack_t __user *)regs->cx;
-
- return do_sigaltstack(uss, uoss, regs->sp);
-}
-#else /* !CONFIG_X86_32 */
-asmlinkage long
-sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
- struct pt_regs *regs)
-{
- return do_sigaltstack(uss, uoss, regs->sp);
-}
-#endif /* CONFIG_X86_32 */
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+# define MAX_FRAME_SIGINFO_UCTXT_SIZE sizeof(struct sigframe_ia32)
+#else
+# define MAX_FRAME_SIGINFO_UCTXT_SIZE sizeof(struct rt_sigframe)
+#endif
/*
- * Do a signal return; undo the signal stack.
+ * The FP state frame contains an XSAVE buffer which must be 64-byte aligned.
+ * If a signal frame starts at an unaligned address, extra space is required.
+ * This is the max alignment padding, conservatively.
*/
-#ifdef CONFIG_X86_32
-unsigned long sys_sigreturn(struct pt_regs *regs)
-{
- struct sigframe __user *frame;
- unsigned long ax;
- sigset_t set;
-
- frame = (struct sigframe __user *)(regs->sp - 8);
-
- if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
- goto badframe;
- if (__get_user(set.sig[0], &frame->sc.oldmask) || (_NSIG_WORDS > 1
- && __copy_from_user(&set.sig[1], &frame->extramask,
- sizeof(frame->extramask))))
- goto badframe;
-
- sigdelsetmask(&set, ~_BLOCKABLE);
- spin_lock_irq(&current->sighand->siglock);
- current->blocked = set;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
+#define MAX_XSAVE_PADDING 63UL
- if (restore_sigcontext(regs, &frame->sc, &ax))
- goto badframe;
- return ax;
-
-badframe:
- signal_fault(regs, frame, "sigreturn");
+/*
+ * The frame data is composed of the following areas and laid out as:
+ *
+ * -------------------------
+ * | alignment padding |
+ * -------------------------
+ * | (f)xsave frame |
+ * -------------------------
+ * | fsave header |
+ * -------------------------
+ * | alignment padding |
+ * -------------------------
+ * | siginfo + ucontext |
+ * -------------------------
+ */
- return 0;
-}
-#endif /* CONFIG_X86_32 */
+/* max_frame_size tells userspace the worst case signal stack size. */
+static unsigned long __ro_after_init max_frame_size;
+static unsigned int __ro_after_init fpu_default_state_size;
-long sys_rt_sigreturn(struct pt_regs *regs)
+static int __init init_sigframe_size(void)
{
- struct rt_sigframe __user *frame;
- unsigned long ax;
- sigset_t set;
+ fpu_default_state_size = fpu__get_fpstate_size();
- frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
- if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
- goto badframe;
- if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
- goto badframe;
+ max_frame_size = MAX_FRAME_SIGINFO_UCTXT_SIZE + MAX_FRAME_PADDING;
- sigdelsetmask(&set, ~_BLOCKABLE);
- spin_lock_irq(&current->sighand->siglock);
- current->blocked = set;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
+ max_frame_size += fpu_default_state_size + MAX_XSAVE_PADDING;
- if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
- goto badframe;
+ /* Userspace expects an aligned size. */
+ max_frame_size = round_up(max_frame_size, FRAME_ALIGNMENT);
- if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT)
- goto badframe;
-
- return ax;
-
-badframe:
- signal_fault(regs, frame, "rt_sigreturn");
+ pr_info("max sigframe size: %lu\n", max_frame_size);
return 0;
}
+early_initcall(init_sigframe_size);
-/*
- * OK, we're invoking a handler:
- */
-static int signr_convert(int sig)
+unsigned long get_sigframe_size(void)
{
-#ifdef CONFIG_X86_32
- struct thread_info *info = current_thread_info();
-
- if (info->exec_domain && info->exec_domain->signal_invmap && sig < 32)
- return info->exec_domain->signal_invmap[sig];
-#endif /* CONFIG_X86_32 */
- return sig;
+ return max_frame_size;
}
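
For illustration, a minimal userspace sketch (not part of this patch) of consuming this value: on x86 the AT_MINSIGSTKSZ auxiliary-vector entry is derived from get_sigframe_size(), so an alternate signal stack can be sized against the real frame size instead of the fixed SIGSTKSZ; the AT_MINSIGSTKSZ fallback definition is the Linux UAPI value.

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/auxv.h>

#ifndef AT_MINSIGSTKSZ
# define AT_MINSIGSTKSZ	51		/* Linux UAPI value */
#endif

int main(void)
{
	unsigned long min = getauxval(AT_MINSIGSTKSZ);	   /* 0 on old kernels */
	size_t size = (min ? min : SIGSTKSZ) + 64 * 1024;  /* headroom for the handler */
	stack_t ss = { .ss_sp = malloc(size), .ss_size = size, .ss_flags = 0 };

	if (sigaltstack(&ss, NULL)) {
		perror("sigaltstack");
		return 1;
	}
	printf("AT_MINSIGSTKSZ=%lu, alternate stack is %zu bytes\n", min, size);
	return 0;
}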
-#ifdef CONFIG_X86_32
-
-#define is_ia32 1
-#define ia32_setup_frame __setup_frame
-#define ia32_setup_rt_frame __setup_rt_frame
-
-#else /* !CONFIG_X86_32 */
-
-#ifdef CONFIG_IA32_EMULATION
-#define is_ia32 test_thread_flag(TIF_IA32)
-#else /* !CONFIG_IA32_EMULATION */
-#define is_ia32 0
-#endif /* CONFIG_IA32_EMULATION */
-
-int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
- sigset_t *set, struct pt_regs *regs);
-int ia32_setup_frame(int sig, struct k_sigaction *ka,
- sigset_t *set, struct pt_regs *regs);
-
-#endif /* CONFIG_X86_32 */
-
static int
-setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
- sigset_t *set, struct pt_regs *regs)
+setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
{
- int usig = signr_convert(sig);
- int ret;
+ /* Perform fixup for the pre-signal frame. */
+ rseq_signal_deliver(ksig, regs);
/* Set up the stack frame */
- if (is_ia32) {
- if (ka->sa.sa_flags & SA_SIGINFO)
- ret = ia32_setup_rt_frame(usig, ka, info, set, regs);
+ if (is_ia32_frame(ksig)) {
+ if (ksig->ka.sa.sa_flags & SA_SIGINFO)
+ return ia32_setup_rt_frame(ksig, regs);
else
- ret = ia32_setup_frame(usig, ka, set, regs);
- } else
- ret = __setup_rt_frame(sig, ka, info, set, regs);
-
- if (ret) {
- force_sigsegv(sig, current);
- return -EFAULT;
+ return ia32_setup_frame(ksig, regs);
+ } else if (is_x32_frame(ksig)) {
+ return x32_setup_rt_frame(ksig, regs);
+ } else {
+ return x64_setup_rt_frame(ksig, regs);
}
-
- return ret;
}
-static int
-handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
- sigset_t *oldset, struct pt_regs *regs)
+static void
+handle_signal(struct ksignal *ksig, struct pt_regs *regs)
{
- int ret;
+ bool stepping, failed;
+ struct fpu *fpu = x86_task_fpu(current);
+
+ if (v8086_mode(regs))
+ save_v86_state((struct kernel_vm86_regs *) regs, VM86_SIGNAL);
/* Are we from a system call? */
- if (syscall_get_nr(current, regs) >= 0) {
+ if (syscall_get_nr(current, regs) != -1) {
/* If so, check system call restarting.. */
switch (syscall_get_error(current, regs)) {
case -ERESTART_RESTARTBLOCK:
@@ -703,11 +270,11 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
break;
case -ERESTARTSYS:
- if (!(ka->sa.sa_flags & SA_RESTART)) {
+ if (!(ksig->ka.sa.sa_flags & SA_RESTART)) {
regs->ax = -EINTR;
break;
}
- /* fallthrough */
+ fallthrough;
case -ERESTARTNOINTR:
regs->ax = regs->orig_ax;
regs->ip -= 2;
@@ -716,113 +283,65 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
}
/*
- * If TF is set due to a debugger (TIF_FORCED_TF), clear the TF
- * flag so that register information in the sigcontext is correct.
+ * If TF is set due to a debugger (TIF_FORCED_TF), clear TF now
+ * so that register information in the sigcontext is correct and
+ * then notify the tracer before entering the signal handler.
*/
- if (unlikely(regs->flags & X86_EFLAGS_TF) &&
- likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
- regs->flags &= ~X86_EFLAGS_TF;
+ stepping = test_thread_flag(TIF_SINGLESTEP);
+ if (stepping)
+ user_disable_single_step(current);
- ret = setup_rt_frame(sig, ka, info, oldset, regs);
-
- if (ret)
- return ret;
+ failed = (setup_rt_frame(ksig, regs) < 0);
+ if (!failed) {
+ /*
+ * Clear the direction flag as per the ABI for function entry.
+ *
+ * Clear RF when entering the signal handler, because
+ * it might disable possible debug exception from the
+ * signal handler.
+ *
+ * Clear TF for the case when it wasn't set by debugger to
+ * avoid the recursive send_sigtrap() in SIGTRAP handler.
+ */
+ regs->flags &= ~(X86_EFLAGS_DF|X86_EFLAGS_RF|X86_EFLAGS_TF);
+ /*
+ * Ensure the signal handler starts with the new fpu state.
+ */
+ fpu__clear_user_states(fpu);
+ }
+ signal_setup_done(failed, ksig, stepping);
+}
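
For illustration, a minimal userspace sketch (not part of this patch) of the -ERESTARTSYS handling above: without SA_RESTART an interrupted read() fails with EINTR, while with SA_RESTART the kernel rewinds the syscall instruction and the read resumes transparently. Run it with any argument to set SA_RESTART.

#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static void on_alarm(int sig)
{
	(void)sig;	/* having a handler (not SIG_IGN) is what matters */
}

int main(int argc, char **argv)
{
	struct sigaction sa = { .sa_handler = on_alarm };
	char buf[64];
	ssize_t n;

	if (argc > 1)
		sa.sa_flags |= SA_RESTART;
	sigaction(SIGALRM, &sa, NULL);

	alarm(1);			/* interrupt the blocking read below */
	n = read(STDIN_FILENO, buf, sizeof(buf));
	if (n < 0 && errno == EINTR)
		puts("read interrupted: -ERESTARTSYS became -EINTR (no SA_RESTART)");
	else
		printf("read returned %zd\n", n);
	return 0;
}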
-#ifdef CONFIG_X86_64
- /*
- * This has nothing to do with segment registers,
- * despite the name. This magic affects uaccess.h
- * macros' behavior. Reset it to the normal setting.
- */
- set_fs(USER_DS);
+static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs)
+{
+#ifdef CONFIG_IA32_EMULATION
+ if (current->restart_block.arch_data & TS_COMPAT)
+ return __NR_ia32_restart_syscall;
+#endif
+#ifdef CONFIG_X86_X32_ABI
+ return __NR_restart_syscall | (regs->orig_ax & __X32_SYSCALL_BIT);
+#else
+ return __NR_restart_syscall;
#endif
-
- /*
- * Clear the direction flag as per the ABI for function entry.
- */
- regs->flags &= ~X86_EFLAGS_DF;
-
- /*
- * Clear TF when entering the signal handler, but
- * notify any tracer that was single-stepping it.
- * The tracer may want to single-step inside the
- * handler too.
- */
- regs->flags &= ~X86_EFLAGS_TF;
-
- spin_lock_irq(&current->sighand->siglock);
- sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
- if (!(ka->sa.sa_flags & SA_NODEFER))
- sigaddset(&current->blocked, sig);
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
-
- tracehook_signal_handler(sig, info, ka, regs,
- test_thread_flag(TIF_SINGLESTEP));
-
- return 0;
}
-#ifdef CONFIG_X86_32
-#define NR_restart_syscall __NR_restart_syscall
-#else /* !CONFIG_X86_32 */
-#define NR_restart_syscall \
- test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall
-#endif /* CONFIG_X86_32 */
-
/*
* Note that 'init' is a special process: it doesn't get signals it doesn't
* want to handle. Thus you cannot kill init even with a SIGKILL even by
* mistake.
*/
-static void do_signal(struct pt_regs *regs)
+void arch_do_signal_or_restart(struct pt_regs *regs)
{
- struct k_sigaction ka;
- siginfo_t info;
- int signr;
- sigset_t *oldset;
-
- /*
- * We want the common case to go fast, which is why we may in certain
- * cases get here from kernel mode. Just return without doing anything
- * if so.
- * X86_32: vm86 regs switched out by assembly code before reaching
- * here, so testing against kernel CS suffices.
- */
- if (!user_mode(regs))
- return;
-
- if (current_thread_info()->status & TS_RESTORE_SIGMASK)
- oldset = &current->saved_sigmask;
- else
- oldset = &current->blocked;
-
- signr = get_signal_to_deliver(&info, &ka, regs, NULL);
- if (signr > 0) {
- /*
- * Re-enable any watchpoints before delivering the
- * signal to user space. The processor register will
- * have been cleared if the watchpoint triggered
- * inside the kernel.
- */
- if (current->thread.debugreg7)
- set_debugreg(current->thread.debugreg7, 7);
+ struct ksignal ksig;
+ if (get_signal(&ksig)) {
/* Whee! Actually deliver the signal. */
- if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
- /*
- * A signal was successfully delivered; the saved
- * sigmask will have been stored in the signal frame,
- * and will be restored by sigreturn, so we can simply
- * clear the TS_RESTORE_SIGMASK flag.
- */
- current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
- }
+ handle_signal(&ksig, regs);
return;
}
/* Did we come from a system call? */
- if (syscall_get_nr(current, regs) >= 0) {
+ if (syscall_get_nr(current, regs) != -1) {
/* Restart the system call - no handlers present */
switch (syscall_get_error(current, regs)) {
case -ERESTARTNOHAND:
@@ -833,7 +352,7 @@ static void do_signal(struct pt_regs *regs)
break;
case -ERESTART_RESTARTBLOCK:
- regs->ax = NR_restart_syscall;
+ regs->ax = get_nr_restart_syscall(regs);
regs->ip -= 2;
break;
}
@@ -843,37 +362,7 @@ static void do_signal(struct pt_regs *regs)
* If there's no signal to deliver, we just put the saved sigmask
* back.
*/
- if (current_thread_info()->status & TS_RESTORE_SIGMASK) {
- current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
- sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
- }
-}
-
-/*
- * notification of userspace execution resumption
- * - triggered by the TIF_WORK_MASK flags
- */
-void
-do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
-{
-#ifdef CONFIG_X86_NEW_MCE
- /* notify userspace of pending MCEs */
- if (thread_info_flags & _TIF_MCE_NOTIFY)
- mce_notify_process();
-#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
-
- /* deal with pending signal delivery */
- if (thread_info_flags & _TIF_SIGPENDING)
- do_signal(regs);
-
- if (thread_info_flags & _TIF_NOTIFY_RESUME) {
- clear_thread_flag(TIF_NOTIFY_RESUME);
- tracehook_notify_resume(regs);
- }
-
-#ifdef CONFIG_X86_32
- clear_thread_flag(TIF_IRET);
-#endif /* CONFIG_X86_32 */
+ restore_saved_sigmask();
}
void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
@@ -886,9 +375,65 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG,
me->comm, me->pid, where, frame,
regs->ip, regs->sp, regs->orig_ax);
- print_vma_addr(" in ", regs->ip);
- printk(KERN_CONT "\n");
+ print_vma_addr(KERN_CONT " in ", regs->ip);
+ pr_cont("\n");
}
- force_sig(SIGSEGV, me);
+ force_sig(SIGSEGV);
+}
+
+#ifdef CONFIG_DYNAMIC_SIGFRAME
+#ifdef CONFIG_STRICT_SIGALTSTACK_SIZE
+static bool strict_sigaltstack_size __ro_after_init = true;
+#else
+static bool strict_sigaltstack_size __ro_after_init = false;
+#endif
+
+static int __init strict_sas_size(char *arg)
+{
+ return kstrtobool(arg, &strict_sigaltstack_size) == 0;
+}
+__setup("strict_sas_size", strict_sas_size);
+
+/*
+ * MINSIGSTKSZ is 2048 and can't be changed despite the fact that AVX512
+ * exceeds that size already. As such programs might never use the
+ * sigaltstack they just continued to work. While always checking against
+ * the real size would be correct, this might be considered a regression.
+ *
+ * Therefore avoid the sanity check, unless enforced by kernel
+ * configuration or command line option.
+ *
+ * When dynamic FPU features are supported, the check is also enforced when
+ * the task has permissions to use dynamic features. Tasks which have no
+ * permission are checked against the size of the non-dynamic feature set
+ * if strict checking is enabled. This avoids forcing all tasks on the
+ * system to allocate large sigaltstacks even if they are never going
+ * to use a dynamic feature. As this is serialized via sighand::siglock
+ * any permission request for a dynamic feature either happened already
+ * or will see the newly installed sigaltstack size in the permission checks.
+ */
+bool sigaltstack_size_valid(size_t ss_size)
+{
+ unsigned long fsize = max_frame_size - fpu_default_state_size;
+ u64 mask;
+
+ lockdep_assert_held(&current->sighand->siglock);
+
+ if (!fpu_state_size_dynamic() && !strict_sigaltstack_size)
+ return true;
+
+ fsize += x86_task_fpu(current->group_leader)->perm.__user_state_size;
+ if (likely(ss_size > fsize))
+ return true;
+
+ if (strict_sigaltstack_size)
+ return ss_size > fsize;
+
+ mask = x86_task_fpu(current->group_leader)->perm.__state_perm;
+ if (mask & XFEATURE_MASK_USER_DYNAMIC)
+ return ss_size > fsize;
+
+ return true;
}
+#endif /* CONFIG_DYNAMIC_SIGFRAME */
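
For illustration, a minimal userspace sketch (not part of this patch) of what this check means for applications: probing sigaltstack() with growing sizes shows the smallest ss_size the kernel accepts, which rises above the historical MINSIGSTKSZ once strict checking or dynamic XSTATE permissions are in effect.

#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	char *buf = malloc(1 << 20);
	size_t size;

	for (size = 2048; size <= (1 << 20); size *= 2) {
		stack_t ss = { .ss_sp = buf, .ss_size = size, .ss_flags = 0 };

		if (sigaltstack(&ss, NULL) == 0) {
			printf("ss_size=%zu accepted\n", size);
			return 0;
		}
		printf("ss_size=%zu rejected: %s\n", size, strerror(errno));
	}
	return 1;
}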
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
new file mode 100644
index 000000000000..42bbc42bd350
--- /dev/null
+++ b/arch/x86/kernel/signal_32.c
@@ -0,0 +1,535 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
+ * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
+ * 2000-12-* x86-64 compatibility mode signal handling by Andi Kleen
+ */
+
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/wait.h>
+#include <linux/unistd.h>
+#include <linux/stddef.h>
+#include <linux/personality.h>
+#include <linux/compat.h>
+#include <linux/binfmts.h>
+#include <linux/syscalls.h>
+#include <asm/ucontext.h>
+#include <linux/uaccess.h>
+#include <asm/fpu/signal.h>
+#include <asm/ptrace.h>
+#include <asm/user32.h>
+#include <uapi/asm/sigcontext.h>
+#include <asm/proto.h>
+#include <asm/vdso.h>
+#include <asm/sigframe.h>
+#include <asm/sighandling.h>
+#include <asm/smap.h>
+#include <asm/gsseg.h>
+
+/*
+ * The first GDT descriptor is reserved as 'NULL descriptor'. As bits 0
+ * and 1 of a segment selector, i.e., the RPL bits, are NOT used to index
+ * GDT, selector values 0~3 all point to the NULL descriptor, thus values
+ * 0, 1, 2 and 3 are all valid NULL selector values.
+ *
+ * However IRET zeros ES, FS, GS, and DS segment registers if any of them
+ * is found to have any nonzero NULL selector value, which can be used by
+ * userspace in pre-FRED systems to spot any interrupt/exception by loading
+ * a nonzero NULL selector and waiting for it to become zero. Before FRED
+ * there was nothing software could do to prevent such an information leak.
+ *
+ * ERETU, the only legit instruction to return to userspace from kernel
+ * under FRED, by design does NOT zero any segment register to avoid this
+ * problem behavior.
+ *
+ * As such, leave NULL selector values 0~3 unchanged.
+ */
+static inline u16 fixup_rpl(u16 sel)
+{
+ return sel <= 3 ? sel : sel | 3;
+}
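
For illustration, a standalone sketch (not part of this patch) of the mapping fixup_rpl() performs, shown on a few representative selector values: the NULL selectors 0-3 pass through unchanged, everything else has its RPL forced to 3.

#include <stdint.h>
#include <stdio.h>

static uint16_t fixup_rpl(uint16_t sel)
{
	return sel <= 3 ? sel : sel | 3;
}

int main(void)
{
	const uint16_t samples[] = { 0x0000, 0x0002, 0x0023, 0x0028, 0x002b };
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("0x%04x -> 0x%04x\n", samples[i], fixup_rpl(samples[i]));
	return 0;
}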
+
+#ifdef CONFIG_IA32_EMULATION
+#include <asm/unistd_32_ia32.h>
+
+static inline void reload_segments(struct sigcontext_32 *sc)
+{
+ u16 cur;
+
+ /*
+ * Reload fs and gs if they have changed in the signal
+ * handler. This does not handle long fs/gs base changes in
+ * the handler, but does not clobber them at least in the
+ * normal case.
+ */
+ savesegment(gs, cur);
+ if (fixup_rpl(sc->gs) != cur)
+ load_gs_index(fixup_rpl(sc->gs));
+ savesegment(fs, cur);
+ if (fixup_rpl(sc->fs) != cur)
+ loadsegment(fs, fixup_rpl(sc->fs));
+
+ savesegment(ds, cur);
+ if (fixup_rpl(sc->ds) != cur)
+ loadsegment(ds, fixup_rpl(sc->ds));
+ savesegment(es, cur);
+ if (fixup_rpl(sc->es) != cur)
+ loadsegment(es, fixup_rpl(sc->es));
+}
+
+#define sigset32_t compat_sigset_t
+#define siginfo32_t compat_siginfo_t
+#define restore_altstack32 compat_restore_altstack
+#define unsafe_save_altstack32 unsafe_compat_save_altstack
+
+#else
+
+#define sigset32_t sigset_t
+#define siginfo32_t siginfo_t
+#define __NR_ia32_sigreturn __NR_sigreturn
+#define __NR_ia32_rt_sigreturn __NR_rt_sigreturn
+#define restore_altstack32 restore_altstack
+#define unsafe_save_altstack32 unsafe_save_altstack
+#define __copy_siginfo_to_user32 copy_siginfo_to_user
+
+#endif
+
+/*
+ * Do a signal return; undo the signal stack.
+ */
+static bool ia32_restore_sigcontext(struct pt_regs *regs,
+ struct sigcontext_32 __user *usc)
+{
+ struct sigcontext_32 sc;
+
+ /* Always make any pending restarted system calls return -EINTR */
+ current->restart_block.fn = do_no_restart_syscall;
+
+ if (unlikely(copy_from_user(&sc, usc, sizeof(sc))))
+ return false;
+
+ /* Get only the ia32 registers. */
+ regs->bx = sc.bx;
+ regs->cx = sc.cx;
+ regs->dx = sc.dx;
+ regs->si = sc.si;
+ regs->di = sc.di;
+ regs->bp = sc.bp;
+ regs->ax = sc.ax;
+ regs->sp = sc.sp;
+ regs->ip = sc.ip;
+
+ /* Get CS/SS and force CPL3 */
+ regs->cs = sc.cs | 0x03;
+ regs->ss = sc.ss | 0x03;
+
+ regs->flags = (regs->flags & ~FIX_EFLAGS) | (sc.flags & FIX_EFLAGS);
+ /* disable syscall checks */
+ regs->orig_ax = -1;
+
+#ifdef CONFIG_IA32_EMULATION
+ reload_segments(&sc);
+#else
+ loadsegment(gs, fixup_rpl(sc.gs));
+ regs->fs = fixup_rpl(sc.fs);
+ regs->es = fixup_rpl(sc.es);
+ regs->ds = fixup_rpl(sc.ds);
+#endif
+
+ return fpu__restore_sig(compat_ptr(sc.fpstate), 1);
+}
+
+SYSCALL32_DEFINE0(sigreturn)
+{
+ struct pt_regs *regs = current_pt_regs();
+ struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8);
+ sigset_t set;
+
+ prevent_single_step_upon_eretu(regs);
+
+ if (!access_ok(frame, sizeof(*frame)))
+ goto badframe;
+ if (__get_user(set.sig[0], &frame->sc.oldmask)
+ || __get_user(((__u32 *)&set)[1], &frame->extramask[0]))
+ goto badframe;
+
+ set_current_blocked(&set);
+
+ if (!ia32_restore_sigcontext(regs, &frame->sc))
+ goto badframe;
+ return regs->ax;
+
+badframe:
+ signal_fault(regs, frame, "32bit sigreturn");
+ return 0;
+}
+
+SYSCALL32_DEFINE0(rt_sigreturn)
+{
+ struct pt_regs *regs = current_pt_regs();
+ struct rt_sigframe_ia32 __user *frame;
+ sigset_t set;
+
+ prevent_single_step_upon_eretu(regs);
+
+ frame = (struct rt_sigframe_ia32 __user *)(regs->sp - 4);
+
+ if (!access_ok(frame, sizeof(*frame)))
+ goto badframe;
+ if (__get_user(*(__u64 *)&set, (__u64 __user *)&frame->uc.uc_sigmask))
+ goto badframe;
+
+ set_current_blocked(&set);
+
+ if (!ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext))
+ goto badframe;
+
+ if (restore_altstack32(&frame->uc.uc_stack))
+ goto badframe;
+
+ return regs->ax;
+
+badframe:
+ signal_fault(regs, frame, "32bit rt sigreturn");
+ return 0;
+}
+
+/*
+ * Set up a signal frame.
+ */
+
+#define get_user_seg(seg) ({ unsigned int v; savesegment(seg, v); v; })
+
+static __always_inline int
+__unsafe_setup_sigcontext32(struct sigcontext_32 __user *sc,
+ void __user *fpstate,
+ struct pt_regs *regs, unsigned int mask)
+{
+ unsafe_put_user(get_user_seg(gs), (unsigned int __user *)&sc->gs, Efault);
+#ifdef CONFIG_IA32_EMULATION
+ unsafe_put_user(get_user_seg(fs), (unsigned int __user *)&sc->fs, Efault);
+ unsafe_put_user(get_user_seg(ds), (unsigned int __user *)&sc->ds, Efault);
+ unsafe_put_user(get_user_seg(es), (unsigned int __user *)&sc->es, Efault);
+#else
+ unsafe_put_user(regs->fs, (unsigned int __user *)&sc->fs, Efault);
+ unsafe_put_user(regs->es, (unsigned int __user *)&sc->es, Efault);
+ unsafe_put_user(regs->ds, (unsigned int __user *)&sc->ds, Efault);
+#endif
+
+ unsafe_put_user(regs->di, &sc->di, Efault);
+ unsafe_put_user(regs->si, &sc->si, Efault);
+ unsafe_put_user(regs->bp, &sc->bp, Efault);
+ unsafe_put_user(regs->sp, &sc->sp, Efault);
+ unsafe_put_user(regs->bx, &sc->bx, Efault);
+ unsafe_put_user(regs->dx, &sc->dx, Efault);
+ unsafe_put_user(regs->cx, &sc->cx, Efault);
+ unsafe_put_user(regs->ax, &sc->ax, Efault);
+ unsafe_put_user(current->thread.trap_nr, &sc->trapno, Efault);
+ unsafe_put_user(current->thread.error_code, &sc->err, Efault);
+ unsafe_put_user(regs->ip, &sc->ip, Efault);
+ unsafe_put_user(regs->cs, (unsigned int __user *)&sc->cs, Efault);
+ unsafe_put_user(regs->flags, &sc->flags, Efault);
+ unsafe_put_user(regs->sp, &sc->sp_at_signal, Efault);
+ unsafe_put_user(regs->ss, (unsigned int __user *)&sc->ss, Efault);
+
+ unsafe_put_user(ptr_to_compat(fpstate), &sc->fpstate, Efault);
+
+ /* non-iBCS2 extensions.. */
+ unsafe_put_user(mask, &sc->oldmask, Efault);
+ unsafe_put_user(current->thread.cr2, &sc->cr2, Efault);
+ return 0;
+
+Efault:
+ return -EFAULT;
+}
+
+#define unsafe_put_sigcontext32(sc, fp, regs, set, label) \
+do { \
+ if (__unsafe_setup_sigcontext32(sc, fp, regs, set->sig[0])) \
+ goto label; \
+} while(0)
+
+int ia32_setup_frame(struct ksignal *ksig, struct pt_regs *regs)
+{
+ sigset32_t *set = (sigset32_t *) sigmask_to_save();
+ struct sigframe_ia32 __user *frame;
+ void __user *restorer;
+ void __user *fp = NULL;
+
+ /* copy_to_user optimizes that into a single 8 byte store */
+ static const struct {
+ u16 poplmovl;
+ u32 val;
+ u16 int80;
+ } __attribute__((packed)) code = {
+ 0xb858, /* popl %eax ; movl $...,%eax */
+ __NR_ia32_sigreturn,
+ 0x80cd, /* int $0x80 */
+ };
+
+ frame = get_sigframe(ksig, regs, sizeof(*frame), &fp);
+
+ if (ksig->ka.sa.sa_flags & SA_RESTORER) {
+ restorer = ksig->ka.sa.sa_restorer;
+ } else {
+ /* Return stub is in 32bit vsyscall page */
+ if (current->mm->context.vdso)
+ restorer = current->mm->context.vdso +
+ vdso_image_32.sym___kernel_sigreturn;
+ else
+ restorer = &frame->retcode;
+ }
+
+ if (!user_access_begin(frame, sizeof(*frame)))
+ return -EFAULT;
+
+ unsafe_put_user(ksig->sig, &frame->sig, Efault);
+ unsafe_put_sigcontext32(&frame->sc, fp, regs, set, Efault);
+ unsafe_put_user(set->sig[1], &frame->extramask[0], Efault);
+ unsafe_put_user(ptr_to_compat(restorer), &frame->pretcode, Efault);
+ /*
+ * These are actually not used anymore, but left because some
+ * gdb versions depend on them as a marker.
+ */
+ unsafe_put_user(*((u64 *)&code), (u64 __user *)frame->retcode, Efault);
+ user_access_end();
+
+ /* Set up registers for signal handler */
+ regs->sp = (unsigned long) frame;
+ regs->ip = (unsigned long) ksig->ka.sa.sa_handler;
+
+ /* Make -mregparm=3 work */
+ regs->ax = ksig->sig;
+ regs->dx = 0;
+ regs->cx = 0;
+
+#ifdef CONFIG_IA32_EMULATION
+ loadsegment(ds, __USER_DS);
+ loadsegment(es, __USER_DS);
+#else
+ regs->ds = __USER_DS;
+ regs->es = __USER_DS;
+#endif
+
+ regs->cs = __USER32_CS;
+ regs->ss = __USER_DS;
+
+ return 0;
+Efault:
+ user_access_end();
+ return -EFAULT;
+}
+
+int ia32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
+{
+ sigset32_t *set = (sigset32_t *) sigmask_to_save();
+ struct rt_sigframe_ia32 __user *frame;
+ void __user *restorer;
+ void __user *fp = NULL;
+
+ /* unsafe_put_user optimizes that into a single 8 byte store */
+ static const struct {
+ u8 movl;
+ u32 val;
+ u16 int80;
+ u8 pad;
+ } __attribute__((packed)) code = {
+ 0xb8,
+ __NR_ia32_rt_sigreturn,
+ 0x80cd,
+ 0,
+ };
+
+ frame = get_sigframe(ksig, regs, sizeof(*frame), &fp);
+
+ if (!user_access_begin(frame, sizeof(*frame)))
+ return -EFAULT;
+
+ unsafe_put_user(ksig->sig, &frame->sig, Efault);
+ unsafe_put_user(ptr_to_compat(&frame->info), &frame->pinfo, Efault);
+ unsafe_put_user(ptr_to_compat(&frame->uc), &frame->puc, Efault);
+
+ /* Create the ucontext. */
+ if (static_cpu_has(X86_FEATURE_XSAVE))
+ unsafe_put_user(UC_FP_XSTATE, &frame->uc.uc_flags, Efault);
+ else
+ unsafe_put_user(0, &frame->uc.uc_flags, Efault);
+ unsafe_put_user(0, &frame->uc.uc_link, Efault);
+ unsafe_save_altstack32(&frame->uc.uc_stack, regs->sp, Efault);
+
+ if (ksig->ka.sa.sa_flags & SA_RESTORER)
+ restorer = ksig->ka.sa.sa_restorer;
+ else
+ restorer = current->mm->context.vdso +
+ vdso_image_32.sym___kernel_rt_sigreturn;
+ unsafe_put_user(ptr_to_compat(restorer), &frame->pretcode, Efault);
+
+ /*
+ * Not actually used anymore, but left because some gdb
+ * versions need it.
+ */
+ unsafe_put_user(*((u64 *)&code), (u64 __user *)frame->retcode, Efault);
+ unsafe_put_sigcontext32(&frame->uc.uc_mcontext, fp, regs, set, Efault);
+ unsafe_put_user(*(__u64 *)set, (__u64 __user *)&frame->uc.uc_sigmask, Efault);
+ user_access_end();
+
+ if (__copy_siginfo_to_user32(&frame->info, &ksig->info))
+ return -EFAULT;
+
+ /* Set up registers for signal handler */
+ regs->sp = (unsigned long) frame;
+ regs->ip = (unsigned long) ksig->ka.sa.sa_handler;
+
+ /* Make -mregparm=3 work */
+ regs->ax = ksig->sig;
+ regs->dx = (unsigned long) &frame->info;
+ regs->cx = (unsigned long) &frame->uc;
+
+#ifdef CONFIG_IA32_EMULATION
+ loadsegment(ds, __USER_DS);
+ loadsegment(es, __USER_DS);
+#else
+ regs->ds = __USER_DS;
+ regs->es = __USER_DS;
+#endif
+
+ regs->cs = __USER32_CS;
+ regs->ss = __USER_DS;
+
+ return 0;
+Efault:
+ user_access_end();
+ return -EFAULT;
+}
+
+/*
+ * The siginfo_t structure and handling code is very easy
+ * to break in several ways. It must always be updated when
+ * changes are made to the main siginfo_t, and
+ * copy_siginfo_to_user32() must be updated when the
+ * (arch-independent) copy_siginfo_to_user() is updated.
+ *
+ * It is also easy to put a new member in the siginfo_t
+ * which has implicit alignment which can move internal structure
+ * alignment around breaking the ABI. This can happen if you,
+ * for instance, put a plain 64-bit value in there.
+ */
+
+/*
+* If adding a new si_code, there is probably new data in
+* the siginfo. Make sure folks bumping the si_code
+* limits also have to look at this code. Make sure any
+* new fields are handled in copy_siginfo_to_user32()!
+*/
+static_assert(NSIGILL == 11);
+static_assert(NSIGFPE == 15);
+static_assert(NSIGSEGV == 10);
+static_assert(NSIGBUS == 5);
+static_assert(NSIGTRAP == 6);
+static_assert(NSIGCHLD == 6);
+static_assert(NSIGSYS == 2);
+
+/* This is part of the ABI and can never change in size: */
+static_assert(sizeof(siginfo32_t) == 128);
+
+/* This is a part of the ABI and can never change in alignment */
+static_assert(__alignof__(siginfo32_t) == 4);
+
+/*
+* The offsets of all the (unioned) si_fields are fixed
+* in the ABI, of course. Make sure none of them ever
+* move and are always at the beginning:
+*/
+static_assert(offsetof(siginfo32_t, _sifields) == 3 * sizeof(int));
+
+static_assert(offsetof(siginfo32_t, si_signo) == 0);
+static_assert(offsetof(siginfo32_t, si_errno) == 4);
+static_assert(offsetof(siginfo32_t, si_code) == 8);
+
+/*
+* Ensure that the size of each si_field never changes.
+* If it does, it is a sign that the
+* copy_siginfo_to_user32() code below needs to be updated
+* along with the size in the CHECK_SI_SIZE().
+*
+* We repeat this check for both the generic and compat
+* siginfos.
+*
+* Note: it is OK for these to grow as long as the whole
+* structure stays within the padding size (checked
+* above).
+*/
+
+#define CHECK_SI_OFFSET(name) \
+ static_assert(offsetof(siginfo32_t, _sifields) == \
+ offsetof(siginfo32_t, _sifields.name))
+
+#define CHECK_SI_SIZE(name, size) \
+ static_assert(sizeof_field(siginfo32_t, _sifields.name) == size)
+
+CHECK_SI_OFFSET(_kill);
+CHECK_SI_SIZE (_kill, 2*sizeof(int));
+static_assert(offsetof(siginfo32_t, si_pid) == 0xC);
+static_assert(offsetof(siginfo32_t, si_uid) == 0x10);
+
+CHECK_SI_OFFSET(_timer);
+#ifdef CONFIG_COMPAT
+/* compat_siginfo_t doesn't have si_sys_private */
+CHECK_SI_SIZE (_timer, 3*sizeof(int));
+#else
+CHECK_SI_SIZE (_timer, 4*sizeof(int));
+#endif
+static_assert(offsetof(siginfo32_t, si_tid) == 0x0C);
+static_assert(offsetof(siginfo32_t, si_overrun) == 0x10);
+static_assert(offsetof(siginfo32_t, si_value) == 0x14);
+
+CHECK_SI_OFFSET(_rt);
+CHECK_SI_SIZE (_rt, 3*sizeof(int));
+static_assert(offsetof(siginfo32_t, si_pid) == 0x0C);
+static_assert(offsetof(siginfo32_t, si_uid) == 0x10);
+static_assert(offsetof(siginfo32_t, si_value) == 0x14);
+
+CHECK_SI_OFFSET(_sigchld);
+CHECK_SI_SIZE (_sigchld, 5*sizeof(int));
+static_assert(offsetof(siginfo32_t, si_pid) == 0x0C);
+static_assert(offsetof(siginfo32_t, si_uid) == 0x10);
+static_assert(offsetof(siginfo32_t, si_status) == 0x14);
+static_assert(offsetof(siginfo32_t, si_utime) == 0x18);
+static_assert(offsetof(siginfo32_t, si_stime) == 0x1C);
+
+CHECK_SI_OFFSET(_sigfault);
+CHECK_SI_SIZE (_sigfault, 4*sizeof(int));
+static_assert(offsetof(siginfo32_t, si_addr) == 0x0C);
+
+static_assert(offsetof(siginfo32_t, si_trapno) == 0x10);
+
+static_assert(offsetof(siginfo32_t, si_addr_lsb) == 0x10);
+
+static_assert(offsetof(siginfo32_t, si_lower) == 0x14);
+static_assert(offsetof(siginfo32_t, si_upper) == 0x18);
+
+static_assert(offsetof(siginfo32_t, si_pkey) == 0x14);
+
+static_assert(offsetof(siginfo32_t, si_perf_data) == 0x10);
+static_assert(offsetof(siginfo32_t, si_perf_type) == 0x14);
+static_assert(offsetof(siginfo32_t, si_perf_flags) == 0x18);
+
+CHECK_SI_OFFSET(_sigpoll);
+CHECK_SI_SIZE (_sigpoll, 2*sizeof(int));
+static_assert(offsetof(siginfo32_t, si_band) == 0x0C);
+static_assert(offsetof(siginfo32_t, si_fd) == 0x10);
+
+CHECK_SI_OFFSET(_sigsys);
+CHECK_SI_SIZE (_sigsys, 3*sizeof(int));
+static_assert(offsetof(siginfo32_t, si_call_addr) == 0x0C);
+static_assert(offsetof(siginfo32_t, si_syscall) == 0x10);
+static_assert(offsetof(siginfo32_t, si_arch) == 0x14);
+
+/* any new si_fields should be added here */
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
new file mode 100644
index 000000000000..d483b585c6c6
--- /dev/null
+++ b/arch/x86/kernel/signal_64.c
@@ -0,0 +1,526 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/uaccess.h>
+#include <linux/syscalls.h>
+
+#include <asm/ucontext.h>
+#include <asm/fpu/signal.h>
+#include <asm/sighandling.h>
+
+#include <asm/syscall.h>
+#include <asm/sigframe.h>
+#include <asm/signal.h>
+
+/*
+ * If regs->ss will cause an IRET fault, change it. Otherwise leave it
+ * alone. Using this generally makes no sense unless
+ * user_64bit_mode(regs) would return true.
+ */
+static void force_valid_ss(struct pt_regs *regs)
+{
+ u32 ar;
+ asm volatile ("lar %[old_ss], %[ar]\n\t"
+ "jz 1f\n\t" /* If invalid: */
+ "xorl %[ar], %[ar]\n\t" /* set ar = 0 */
+ "1:"
+ : [ar] "=r" (ar)
+ : [old_ss] "rm" ((u16)regs->ss));
+
+ /*
+ * For a valid 64-bit user context, we need DPL 3, type
+ * read-write data or read-write exp-down data, and S and P
+ * set. We can't use VERW because VERW doesn't check the
+ * P bit.
+ */
+ ar &= AR_DPL_MASK | AR_S | AR_P | AR_TYPE_MASK;
+ if (ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA) &&
+ ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA_EXPDOWN))
+ regs->ss = __USER_DS;
+}
+
+static bool restore_sigcontext(struct pt_regs *regs,
+ struct sigcontext __user *usc,
+ unsigned long uc_flags)
+{
+ struct sigcontext sc;
+
+ /* Always make any pending restarted system calls return -EINTR */
+ current->restart_block.fn = do_no_restart_syscall;
+
+ if (copy_from_user(&sc, usc, offsetof(struct sigcontext, reserved1)))
+ return false;
+
+ regs->bx = sc.bx;
+ regs->cx = sc.cx;
+ regs->dx = sc.dx;
+ regs->si = sc.si;
+ regs->di = sc.di;
+ regs->bp = sc.bp;
+ regs->ax = sc.ax;
+ regs->sp = sc.sp;
+ regs->ip = sc.ip;
+ regs->r8 = sc.r8;
+ regs->r9 = sc.r9;
+ regs->r10 = sc.r10;
+ regs->r11 = sc.r11;
+ regs->r12 = sc.r12;
+ regs->r13 = sc.r13;
+ regs->r14 = sc.r14;
+ regs->r15 = sc.r15;
+
+ /* Get CS/SS and force CPL3 */
+ regs->cs = sc.cs | 0x03;
+ regs->ss = sc.ss | 0x03;
+
+ regs->flags = (regs->flags & ~FIX_EFLAGS) | (sc.flags & FIX_EFLAGS);
+ /* disable syscall checks */
+ regs->orig_ax = -1;
+
+ /*
+ * Fix up SS if needed for the benefit of old DOSEMU and
+ * CRIU.
+ */
+ if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) && user_64bit_mode(regs)))
+ force_valid_ss(regs);
+
+ return fpu__restore_sig((void __user *)sc.fpstate, 0);
+}
+
+static __always_inline int
+__unsafe_setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
+ struct pt_regs *regs, unsigned long mask)
+{
+ unsafe_put_user(regs->di, &sc->di, Efault);
+ unsafe_put_user(regs->si, &sc->si, Efault);
+ unsafe_put_user(regs->bp, &sc->bp, Efault);
+ unsafe_put_user(regs->sp, &sc->sp, Efault);
+ unsafe_put_user(regs->bx, &sc->bx, Efault);
+ unsafe_put_user(regs->dx, &sc->dx, Efault);
+ unsafe_put_user(regs->cx, &sc->cx, Efault);
+ unsafe_put_user(regs->ax, &sc->ax, Efault);
+ unsafe_put_user(regs->r8, &sc->r8, Efault);
+ unsafe_put_user(regs->r9, &sc->r9, Efault);
+ unsafe_put_user(regs->r10, &sc->r10, Efault);
+ unsafe_put_user(regs->r11, &sc->r11, Efault);
+ unsafe_put_user(regs->r12, &sc->r12, Efault);
+ unsafe_put_user(regs->r13, &sc->r13, Efault);
+ unsafe_put_user(regs->r14, &sc->r14, Efault);
+ unsafe_put_user(regs->r15, &sc->r15, Efault);
+
+ unsafe_put_user(current->thread.trap_nr, &sc->trapno, Efault);
+ unsafe_put_user(current->thread.error_code, &sc->err, Efault);
+ unsafe_put_user(regs->ip, &sc->ip, Efault);
+ unsafe_put_user(regs->flags, &sc->flags, Efault);
+ unsafe_put_user(regs->cs, &sc->cs, Efault);
+ unsafe_put_user(0, &sc->gs, Efault);
+ unsafe_put_user(0, &sc->fs, Efault);
+ unsafe_put_user(regs->ss, &sc->ss, Efault);
+
+ unsafe_put_user(fpstate, (unsigned long __user *)&sc->fpstate, Efault);
+
+ /* non-iBCS2 extensions.. */
+ unsafe_put_user(mask, &sc->oldmask, Efault);
+ unsafe_put_user(current->thread.cr2, &sc->cr2, Efault);
+ return 0;
+Efault:
+ return -EFAULT;
+}
+
+#define unsafe_put_sigcontext(sc, fp, regs, set, label) \
+do { \
+ if (__unsafe_setup_sigcontext(sc, fp, regs, set->sig[0])) \
+ goto label; \
+} while(0);
+
+#define unsafe_put_sigmask(set, frame, label) \
+ unsafe_put_user(*(__u64 *)(set), \
+ (__u64 __user *)&(frame)->uc.uc_sigmask, \
+ label)
+
+static unsigned long frame_uc_flags(struct pt_regs *regs)
+{
+ unsigned long flags;
+
+ if (boot_cpu_has(X86_FEATURE_XSAVE))
+ flags = UC_FP_XSTATE | UC_SIGCONTEXT_SS;
+ else
+ flags = UC_SIGCONTEXT_SS;
+
+ if (likely(user_64bit_mode(regs)))
+ flags |= UC_STRICT_RESTORE_SS;
+
+ return flags;
+}
+
+int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
+{
+ sigset_t *set = sigmask_to_save();
+ struct rt_sigframe __user *frame;
+ void __user *fp = NULL;
+ unsigned long uc_flags;
+
+ /* x86-64 should always use SA_RESTORER. */
+ if (!(ksig->ka.sa.sa_flags & SA_RESTORER))
+ return -EFAULT;
+
+ frame = get_sigframe(ksig, regs, sizeof(struct rt_sigframe), &fp);
+ uc_flags = frame_uc_flags(regs);
+
+ if (!user_access_begin(frame, sizeof(*frame)))
+ return -EFAULT;
+
+ /* Create the ucontext. */
+ unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault);
+ unsafe_put_user(0, &frame->uc.uc_link, Efault);
+ unsafe_save_altstack(&frame->uc.uc_stack, regs->sp, Efault);
+
+ /* Set up to return from userspace. If provided, use a stub
+ already in userspace. */
+ unsafe_put_user(ksig->ka.sa.sa_restorer, &frame->pretcode, Efault);
+ unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault);
+ unsafe_put_sigmask(set, frame, Efault);
+ user_access_end();
+
+ if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
+ if (copy_siginfo_to_user(&frame->info, &ksig->info))
+ return -EFAULT;
+ }
+
+ if (setup_signal_shadow_stack(ksig))
+ return -EFAULT;
+
+ /* Set up registers for signal handler */
+ regs->di = ksig->sig;
+ /* In case the signal handler was declared without prototypes */
+ regs->ax = 0;
+
+ /* This also works for non SA_SIGINFO handlers because they expect the
+ next argument after the signal number on the stack. */
+ regs->si = (unsigned long)&frame->info;
+ regs->dx = (unsigned long)&frame->uc;
+ regs->ip = (unsigned long) ksig->ka.sa.sa_handler;
+
+ regs->sp = (unsigned long)frame;
+
+ /*
+ * Set up the CS and SS registers to run signal handlers in
+ * 64-bit mode, even if the handler happens to be interrupting
+ * 32-bit or 16-bit code.
+ *
+ * SS is subtle. In 64-bit mode, we don't need any particular
+ * SS descriptor, but we do need SS to be valid. It's possible
+ * that the old SS is entirely bogus -- this can happen if the
+ * signal we're trying to deliver is #GP or #SS caused by a bad
+ * SS value. We also have a compatibility issue here: DOSEMU
+ * relies on the contents of the SS register indicating the
+ * SS value at the time of the signal, even though that code in
+ * DOSEMU predates sigreturn's ability to restore SS. (DOSEMU
+ * avoids relying on sigreturn to restore SS; instead it uses
+ * a trampoline.) So we do our best: if the old SS was valid,
+ * we keep it. Otherwise we replace it.
+ */
+ regs->cs = __USER_CS;
+
+ if (unlikely(regs->ss != __USER_DS))
+ force_valid_ss(regs);
+
+ return 0;
+
+Efault:
+ user_access_end();
+ return -EFAULT;
+}
+
+/*
+ * Do a signal return; undo the signal stack.
+ */
+SYSCALL_DEFINE0(rt_sigreturn)
+{
+ struct pt_regs *regs = current_pt_regs();
+ struct rt_sigframe __user *frame;
+ sigset_t set;
+ unsigned long uc_flags;
+
+ prevent_single_step_upon_eretu(regs);
+
+ frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
+ if (!access_ok(frame, sizeof(*frame)))
+ goto badframe;
+ if (__get_user(*(__u64 *)&set, (__u64 __user *)&frame->uc.uc_sigmask))
+ goto badframe;
+ if (__get_user(uc_flags, &frame->uc.uc_flags))
+ goto badframe;
+
+ set_current_blocked(&set);
+
+ if (restore_altstack(&frame->uc.uc_stack))
+ goto badframe;
+
+ if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
+ goto badframe;
+
+ if (restore_signal_shadow_stack())
+ goto badframe;
+
+ return regs->ax;
+
+badframe:
+ signal_fault(regs, frame, "rt_sigreturn");
+ return 0;
+}
+
+#ifdef CONFIG_X86_X32_ABI
+static int x32_copy_siginfo_to_user(struct compat_siginfo __user *to,
+ const struct kernel_siginfo *from)
+{
+ struct compat_siginfo new;
+
+ copy_siginfo_to_external32(&new, from);
+ if (from->si_signo == SIGCHLD) {
+ new._sifields._sigchld_x32._utime = from->si_utime;
+ new._sifields._sigchld_x32._stime = from->si_stime;
+ }
+ if (copy_to_user(to, &new, sizeof(struct compat_siginfo)))
+ return -EFAULT;
+ return 0;
+}
+
+int copy_siginfo_to_user32(struct compat_siginfo __user *to,
+ const struct kernel_siginfo *from)
+{
+ if (in_x32_syscall())
+ return x32_copy_siginfo_to_user(to, from);
+ return __copy_siginfo_to_user32(to, from);
+}
+
+int x32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
+{
+ compat_sigset_t *set = (compat_sigset_t *) sigmask_to_save();
+ struct rt_sigframe_x32 __user *frame;
+ unsigned long uc_flags;
+ void __user *restorer;
+ void __user *fp = NULL;
+
+ if (!(ksig->ka.sa.sa_flags & SA_RESTORER))
+ return -EFAULT;
+
+ frame = get_sigframe(ksig, regs, sizeof(*frame), &fp);
+
+ uc_flags = frame_uc_flags(regs);
+
+ if (setup_signal_shadow_stack(ksig))
+ return -EFAULT;
+
+ if (!user_access_begin(frame, sizeof(*frame)))
+ return -EFAULT;
+
+ /* Create the ucontext. */
+ unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault);
+ unsafe_put_user(0, &frame->uc.uc_link, Efault);
+ unsafe_compat_save_altstack(&frame->uc.uc_stack, regs->sp, Efault);
+ unsafe_put_user(0, &frame->uc.uc__pad0, Efault);
+ restorer = ksig->ka.sa.sa_restorer;
+ unsafe_put_user(restorer, (unsigned long __user *)&frame->pretcode, Efault);
+ unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault);
+ unsafe_put_sigmask(set, frame, Efault);
+ user_access_end();
+
+ if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
+ if (x32_copy_siginfo_to_user(&frame->info, &ksig->info))
+ return -EFAULT;
+ }
+
+ /* Set up registers for signal handler */
+ regs->sp = (unsigned long) frame;
+ regs->ip = (unsigned long) ksig->ka.sa.sa_handler;
+
+ /* We use the x32 calling convention here... */
+ regs->di = ksig->sig;
+ regs->si = (unsigned long) &frame->info;
+ regs->dx = (unsigned long) &frame->uc;
+
+ loadsegment(ds, __USER_DS);
+ loadsegment(es, __USER_DS);
+
+ regs->cs = __USER_CS;
+ regs->ss = __USER_DS;
+
+ return 0;
+
+Efault:
+ user_access_end();
+ return -EFAULT;
+}
+
+COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn)
+{
+ struct pt_regs *regs = current_pt_regs();
+ struct rt_sigframe_x32 __user *frame;
+ sigset_t set;
+ unsigned long uc_flags;
+
+ prevent_single_step_upon_eretu(regs);
+
+ frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8);
+
+ if (!access_ok(frame, sizeof(*frame)))
+ goto badframe;
+ if (__get_user(set.sig[0], (__u64 __user *)&frame->uc.uc_sigmask))
+ goto badframe;
+ if (__get_user(uc_flags, &frame->uc.uc_flags))
+ goto badframe;
+
+ set_current_blocked(&set);
+
+ if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
+ goto badframe;
+
+ if (restore_signal_shadow_stack())
+ goto badframe;
+
+ if (compat_restore_altstack(&frame->uc.uc_stack))
+ goto badframe;
+
+ return regs->ax;
+
+badframe:
+ signal_fault(regs, frame, "x32 rt_sigreturn");
+ return 0;
+}
+#endif /* CONFIG_X86_X32_ABI */
+
+#ifdef CONFIG_COMPAT
+void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact)
+{
+ if (!act)
+ return;
+
+ if (in_ia32_syscall())
+ act->sa.sa_flags |= SA_IA32_ABI;
+ if (in_x32_syscall())
+ act->sa.sa_flags |= SA_X32_ABI;
+}
+#endif /* CONFIG_COMPAT */
+
+/*
+* If adding a new si_code, there is probably new data in
+* the siginfo. Make sure folks bumping the si_code
+* limits also have to look at this code. Make sure any
+* new fields are handled in copy_siginfo_to_user32()!
+*/
+static_assert(NSIGILL == 11);
+static_assert(NSIGFPE == 15);
+static_assert(NSIGSEGV == 10);
+static_assert(NSIGBUS == 5);
+static_assert(NSIGTRAP == 6);
+static_assert(NSIGCHLD == 6);
+static_assert(NSIGSYS == 2);
+
+/* This is part of the ABI and can never change in size: */
+static_assert(sizeof(siginfo_t) == 128);
+
+/* This is a part of the ABI and can never change in alignment */
+static_assert(__alignof__(siginfo_t) == 8);
+
+/*
+ * The offsets of all the (unioned) si_fields are fixed
+ * in the ABI, of course. Make sure none of them ever
+ * move and are always at the beginning:
+ */
+static_assert(offsetof(siginfo_t, si_signo) == 0);
+static_assert(offsetof(siginfo_t, si_errno) == 4);
+static_assert(offsetof(siginfo_t, si_code) == 8);
+
+/*
+ * Ensure that the size of each si_field never changes.
+ * If it does, it is a sign that the
+ * copy_siginfo_to_user32() code below needs to be updated
+ * along with the size in the CHECK_SI_SIZE().
+ *
+ * We repeat this check for both the generic and compat
+ * siginfos.
+ *
+ * Note: it is OK for these to grow as long as the whole
+ * structure stays within the padding size (checked
+ * above).
+ */
+
+#define CHECK_SI_OFFSET(name) \
+ static_assert(offsetof(siginfo_t, _sifields) == \
+ offsetof(siginfo_t, _sifields.name))
+#define CHECK_SI_SIZE(name, size) \
+ static_assert(sizeof_field(siginfo_t, _sifields.name) == size)
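
Editor's note: CHECK_SI_OFFSET() pins every union member to the start of _sifields and CHECK_SI_SIZE() pins its size, so an ABI-visible layout change fails at compile time. A small standalone sketch of the same technique follows; the struct and its fields are illustrative only, chosen so the 0x10 offset mirrors the si_pid assert below, not the real siginfo definition.

#include <stddef.h>	/* offsetof */

/* Illustrative layout: a 16-byte header, then a union of payload variants. */
struct demo_info {
	int signo, errno_, code;
	int _pad;		/* explicit pad so the union starts at 0x10, as on x86-64 */
	union {
		struct { int pid, uid; } kill;
		struct { int tid, overrun; long value; } timer;
	} fields;
};

#define CHECK_OFFSET(name) \
	_Static_assert(offsetof(struct demo_info, fields) == \
		       offsetof(struct demo_info, fields.name), \
		       "union member " #name " must start at the union")
#define CHECK_SIZE(name, size) \
	_Static_assert(sizeof(((struct demo_info *)0)->fields.name) == (size), \
		       "union member " #name " changed size")

CHECK_OFFSET(kill);
CHECK_SIZE(kill, 2 * sizeof(int));
_Static_assert(offsetof(struct demo_info, fields.kill.pid) == 0x10,
	       "pid must stay at 0x10, like si_pid in the real ABI");

int main(void) { return 0; }
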
+
+CHECK_SI_OFFSET(_kill);
+CHECK_SI_SIZE (_kill, 2*sizeof(int));
+static_assert(offsetof(siginfo_t, si_pid) == 0x10);
+static_assert(offsetof(siginfo_t, si_uid) == 0x14);
+
+CHECK_SI_OFFSET(_timer);
+CHECK_SI_SIZE (_timer, 6*sizeof(int));
+static_assert(offsetof(siginfo_t, si_tid) == 0x10);
+static_assert(offsetof(siginfo_t, si_overrun) == 0x14);
+static_assert(offsetof(siginfo_t, si_value) == 0x18);
+
+CHECK_SI_OFFSET(_rt);
+CHECK_SI_SIZE (_rt, 4*sizeof(int));
+static_assert(offsetof(siginfo_t, si_pid) == 0x10);
+static_assert(offsetof(siginfo_t, si_uid) == 0x14);
+static_assert(offsetof(siginfo_t, si_value) == 0x18);
+
+CHECK_SI_OFFSET(_sigchld);
+CHECK_SI_SIZE (_sigchld, 8*sizeof(int));
+static_assert(offsetof(siginfo_t, si_pid) == 0x10);
+static_assert(offsetof(siginfo_t, si_uid) == 0x14);
+static_assert(offsetof(siginfo_t, si_status) == 0x18);
+static_assert(offsetof(siginfo_t, si_utime) == 0x20);
+static_assert(offsetof(siginfo_t, si_stime) == 0x28);
+
+#ifdef CONFIG_X86_X32_ABI
+/* no _sigchld_x32 in the generic siginfo_t */
+static_assert(sizeof_field(compat_siginfo_t, _sifields._sigchld_x32) ==
+ 7*sizeof(int));
+static_assert(offsetof(compat_siginfo_t, _sifields) ==
+ offsetof(compat_siginfo_t, _sifields._sigchld_x32));
+static_assert(offsetof(compat_siginfo_t, _sifields._sigchld_x32._utime) == 0x18);
+static_assert(offsetof(compat_siginfo_t, _sifields._sigchld_x32._stime) == 0x20);
+#endif
+
+CHECK_SI_OFFSET(_sigfault);
+CHECK_SI_SIZE (_sigfault, 8*sizeof(int));
+static_assert(offsetof(siginfo_t, si_addr) == 0x10);
+
+static_assert(offsetof(siginfo_t, si_trapno) == 0x18);
+
+static_assert(offsetof(siginfo_t, si_addr_lsb) == 0x18);
+
+static_assert(offsetof(siginfo_t, si_lower) == 0x20);
+static_assert(offsetof(siginfo_t, si_upper) == 0x28);
+
+static_assert(offsetof(siginfo_t, si_pkey) == 0x20);
+
+static_assert(offsetof(siginfo_t, si_perf_data) == 0x18);
+static_assert(offsetof(siginfo_t, si_perf_type) == 0x20);
+static_assert(offsetof(siginfo_t, si_perf_flags) == 0x24);
+
+CHECK_SI_OFFSET(_sigpoll);
+CHECK_SI_SIZE (_sigpoll, 4*sizeof(int));
+static_assert(offsetof(siginfo_t, si_band) == 0x10);
+static_assert(offsetof(siginfo_t, si_fd) == 0x18);
+
+CHECK_SI_OFFSET(_sigsys);
+CHECK_SI_SIZE (_sigsys, 4*sizeof(int));
+static_assert(offsetof(siginfo_t, si_call_addr) == 0x10);
+static_assert(offsetof(siginfo_t, si_syscall) == 0x18);
+static_assert(offsetof(siginfo_t, si_arch) == 0x1C);
+
+/* any new si_fields should be added here */
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index ec1de97600e7..b014e6d229f9 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Intel SMP support routines.
*
@@ -6,9 +7,6 @@
* (c) 2002,2003 Andi Kleen, SuSE Labs.
*
* i386 and x86_64 integration by Glauber Costa <gcosta@redhat.com>
- *
- * This code is released under the GNU General Public License version 2 or
- * later.
*/
#include <linux/init.h>
@@ -16,17 +14,28 @@
#include <linux/mm.h>
#include <linux/delay.h>
#include <linux/spinlock.h>
+#include <linux/export.h>
#include <linux/kernel_stat.h>
#include <linux/mc146818rtc.h>
#include <linux/cache.h>
#include <linux/interrupt.h>
#include <linux/cpu.h>
+#include <linux/gfp.h>
+#include <linux/kexec.h>
#include <asm/mtrr.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/apic.h>
+#include <asm/cpu.h>
+#include <asm/idtentry.h>
+#include <asm/nmi.h>
+#include <asm/mce.h>
+#include <asm/trace/irq_vectors.h>
+#include <asm/kexec.h>
+#include <asm/reboot.h>
+
/*
* Some notes on x86 processor bugs affecting SMP operation:
*
@@ -60,7 +69,7 @@
* 5AP. symmetric IO mode (normal Linux operation) not affected.
* 'noapic' mode has vector 0xf filled out properly.
* 6AP. 'noapic' mode might be affected - fixed in later steppings
- * 7AP. We do not assume writes to the LVT deassering IRQs
+ * 7AP. We do not assume writes to the LVT deasserting IRQs
* 8AP. We do not enable low power mode (deep sleep) during MP bootup
* 9AP. We do not use mixed mode
*
@@ -106,131 +115,183 @@
* about nothing of note with C stepping upwards.
*/
-/*
- * this function sends a 'reschedule' IPI to another CPU.
- * it goes straight through and wastes no time serializing
- * anything. Worst case is that we lose a reschedule ...
- */
-static void native_smp_send_reschedule(int cpu)
-{
- if (unlikely(cpu_is_offline(cpu))) {
- WARN_ON(1);
- return;
- }
- apic->send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR);
-}
-
-void native_send_call_func_single_ipi(int cpu)
-{
- apic->send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR);
-}
+static atomic_t stopping_cpu = ATOMIC_INIT(-1);
+static bool smp_no_nmi_ipi = false;
-void native_send_call_func_ipi(const struct cpumask *mask)
+static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
{
- cpumask_var_t allbutself;
-
- if (!alloc_cpumask_var(&allbutself, GFP_ATOMIC)) {
- apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
- return;
- }
-
- cpumask_copy(allbutself, cpu_online_mask);
- cpumask_clear_cpu(smp_processor_id(), allbutself);
+ /* We are registered on stopping cpu too, avoid spurious NMI */
+ if (raw_smp_processor_id() == atomic_read(&stopping_cpu))
+ return NMI_HANDLED;
- if (cpumask_equal(mask, allbutself) &&
- cpumask_equal(cpu_online_mask, cpu_callout_mask))
- apic->send_IPI_allbutself(CALL_FUNCTION_VECTOR);
- else
- apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
+ cpu_emergency_disable_virtualization();
+ stop_this_cpu(NULL);
- free_cpumask_var(allbutself);
+ return NMI_HANDLED;
}
/*
* this function calls the 'stop' function on all other CPUs in the system.
*/
-
-asmlinkage void smp_reboot_interrupt(void)
+DEFINE_IDTENTRY_SYSVEC(sysvec_reboot)
{
- ack_APIC_irq();
- irq_enter();
+ apic_eoi();
+ cpu_emergency_disable_virtualization();
stop_this_cpu(NULL);
- irq_exit();
}
-static void native_smp_send_stop(void)
+static int register_stop_handler(void)
+{
+ return register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback,
+ NMI_FLAG_FIRST, "smp_stop");
+}
+
+static void native_stop_other_cpus(int wait)
{
- unsigned long flags;
- unsigned long wait;
+ unsigned int old_cpu, this_cpu;
+ unsigned long flags, timeout;
if (reboot_force)
return;
+ /* Only proceed if this is the first CPU to reach this code */
+ old_cpu = -1;
+ this_cpu = smp_processor_id();
+ if (!atomic_try_cmpxchg(&stopping_cpu, &old_cpu, this_cpu))
+ return;
+
+ /* For kexec, ensure that offline CPUs are out of MWAIT and in HLT */
+ if (kexec_in_progress)
+ smp_kick_mwait_play_dead();
+
/*
- * Use an own vector here because smp_call_function
- * does lots of things not suitable in a panic situation.
- * On most systems we could also use an NMI here,
- * but there are a few systems around where NMI
- * is problematic so stay with an non NMI for now
- * (this implies we cannot stop CPUs spinning with irq off
- * currently)
+ * 1) Send an IPI on the reboot vector to all other CPUs.
+ *
+ * The other CPUs should react on it after leaving critical
+ * sections and re-enabling interrupts. They might still hold
+ * locks, but there is nothing which can be done about that.
+ *
+ * 2) Wait for all other CPUs to report that they reached the
+ * HLT loop in stop_this_cpu()
+ *
+ * 3) If #2 timed out send an NMI to the CPUs which did not
+ * yet report
+ *
+ * 4) Wait for all other CPUs to report that they reached the
+ * HLT loop in stop_this_cpu()
+ *
+ * #3 can obviously race against a CPU reaching the HLT loop late.
+ * That CPU will have reported already and the "have all CPUs
+ * reached HLT" condition will be true despite the fact that the
+ * other CPU is still handling the NMI. Again, there is no
+ * protection against that as "disabled" APICs still respond to
+ * NMIs.
*/
- if (num_online_cpus() > 1) {
- apic->send_IPI_allbutself(REBOOT_VECTOR);
+ cpumask_copy(&cpus_stop_mask, cpu_online_mask);
+ cpumask_clear_cpu(this_cpu, &cpus_stop_mask);
- /* Don't wait longer than a second */
- wait = USEC_PER_SEC;
- while (num_online_cpus() > 1 && wait--)
+ if (!cpumask_empty(&cpus_stop_mask)) {
+ apic_send_IPI_allbutself(REBOOT_VECTOR);
+
+ /*
+ * Don't wait longer than a second for IPI completion. The
+ * wait request is not checked here because that would
+ * prevent an NMI shutdown attempt in case that not all
+ * CPUs reach shutdown state.
+ */
+ timeout = USEC_PER_SEC;
+ while (!cpumask_empty(&cpus_stop_mask) && timeout--)
+ udelay(1);
+ }
+
+ /* if the REBOOT_VECTOR didn't work, try with the NMI */
+ if (!cpumask_empty(&cpus_stop_mask)) {
+ /*
+ * If NMI IPI is enabled, try to register the stop handler
+ * and send the IPI. In any case try to wait for the other
+ * CPUs to stop.
+ */
+ if (!smp_no_nmi_ipi && !register_stop_handler()) {
+ unsigned int cpu;
+
+ pr_emerg("Shutting down cpus with NMI\n");
+
+ for_each_cpu(cpu, &cpus_stop_mask)
+ __apic_send_IPI(cpu, NMI_VECTOR);
+ }
+ /*
+ * Don't wait longer than 10 ms if the caller didn't
+ * request it. If wait is true, the machine hangs here if
+ * one or more CPUs do not reach shutdown state.
+ */
+ timeout = USEC_PER_MSEC * 10;
+ while (!cpumask_empty(&cpus_stop_mask) && (wait || timeout--))
udelay(1);
}
local_irq_save(flags);
disable_local_APIC();
+ mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
local_irq_restore(flags);
+
+ /*
+ * Ensure that the cpus_stop_mask cache lines are invalidated on
+ * the other CPUs. See comment vs. SME in stop_this_cpu().
+ */
+ cpumask_clear(&cpus_stop_mask);
}
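
Editor's note: native_stop_other_cpus() above is an escalation protocol, matching the numbered comment: claim the stopper role once, send the reboot vector, wait a bounded time, then fall back to NMI and wait again (indefinitely only if the caller asked to wait). A compressed userspace simulation of that control flow follows; the bitmask and the two "send" helpers are stand-ins, not kernel interfaces.

#include <stdio.h>
#include <stdbool.h>

static unsigned int pending;	/* bit set per CPU still running */

/* Simulated IPIs: the reboot vector stops CPUs 1-2 here, CPU3 stays stuck. */
static void send_reboot_ipi(void) { pending &= 0x8; }
static void send_nmi_ipi(void)    { pending  = 0;   }	/* the NMI gets the straggler */

static bool wait_for_stop(unsigned long usecs)
{
	while (pending && usecs--)
		;			/* stand-in for udelay(1) polling */
	return pending == 0;
}

static void stop_other_cpus(bool wait)
{
	pending = 0xf & ~0x1;		/* CPUs 1-3 online, CPU0 is us */

	send_reboot_ipi();		/* step 1: REBOOT_VECTOR to everyone else */
	if (wait_for_stop(1000000))	/* step 2: wait up to ~1s */
		goto done;

	send_nmi_ipi();			/* step 3: NMI the CPUs that did not report */
	if (wait)			/* step 4: short wait, or forever if requested */
		while (pending)
			;
	else
		wait_for_stop(10000);
done:
	printf("still pending: %#x\n", pending);
}

int main(void)
{
	stop_other_cpus(false);
	return 0;
}
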
/*
- * Reschedule call back. Nothing to do,
- * all the work is done automatically when
- * we return from the interrupt.
+ * Reschedule call back. KVM uses this interrupt to force a cpu out of
+ * guest mode.
*/
-void smp_reschedule_interrupt(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_reschedule_ipi)
{
- ack_APIC_irq();
+ apic_eoi();
+ trace_reschedule_entry(RESCHEDULE_VECTOR);
inc_irq_stat(irq_resched_count);
- /*
- * KVM uses this interrupt to force a cpu out of guest mode
- */
+ scheduler_ipi();
+ trace_reschedule_exit(RESCHEDULE_VECTOR);
}
-void smp_call_function_interrupt(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_call_function)
{
- ack_APIC_irq();
- irq_enter();
- generic_smp_call_function_interrupt();
+ apic_eoi();
+ trace_call_function_entry(CALL_FUNCTION_VECTOR);
inc_irq_stat(irq_call_count);
- irq_exit();
+ generic_smp_call_function_interrupt();
+ trace_call_function_exit(CALL_FUNCTION_VECTOR);
}
-void smp_call_function_single_interrupt(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_call_function_single)
{
- ack_APIC_irq();
- irq_enter();
- generic_smp_call_function_single_interrupt();
+ apic_eoi();
+ trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR);
inc_irq_stat(irq_call_count);
- irq_exit();
+ generic_smp_call_function_single_interrupt();
+ trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR);
}
+static int __init nonmi_ipi_setup(char *str)
+{
+ smp_no_nmi_ipi = true;
+ return 1;
+}
+
+__setup("nonmi_ipi", nonmi_ipi_setup);
+
struct smp_ops smp_ops = {
.smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
.smp_prepare_cpus = native_smp_prepare_cpus,
.smp_cpus_done = native_smp_cpus_done,
- .smp_send_stop = native_smp_send_stop,
+ .stop_other_cpus = native_stop_other_cpus,
+#if defined(CONFIG_CRASH_DUMP)
+ .crash_stop_other_cpus = kdump_nmi_shootdown_cpus,
+#endif
.smp_send_reschedule = native_smp_send_reschedule,
- .cpu_up = native_cpu_up,
- .cpu_die = native_cpu_die,
+ .kick_ap_alive = native_kick_ap,
.cpu_disable = native_cpu_disable,
.play_dead = native_play_dead,
@@ -238,3 +299,27 @@ struct smp_ops smp_ops = {
.send_call_func_single_ipi = native_send_call_func_single_ipi,
};
EXPORT_SYMBOL_GPL(smp_ops);
+
+int arch_cpu_rescan_dead_smt_siblings(void)
+{
+ enum cpuhp_smt_control old = cpu_smt_control;
+ int ret;
+
+ /*
+ * If SMT has been disabled and SMT siblings are in HLT, bring them back
+ * online and offline them again so that they end up in MWAIT proper.
+ *
+ * Called with hotplug enabled.
+ */
+ if (old != CPU_SMT_DISABLED && old != CPU_SMT_FORCE_DISABLED)
+ return 0;
+
+ ret = cpuhp_smt_enable();
+ if (ret)
+ return ret;
+
+ ret = cpuhp_smt_disable(old);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(arch_cpu_rescan_dead_smt_siblings);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 2fecda69ee64..5cd6950ab672 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1,4 +1,5 @@
-/*
+// SPDX-License-Identifier: GPL-2.0-or-later
+ /*
* x86 SMP booting functions
*
* (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
@@ -12,9 +13,6 @@
* Pentium Pro and Pentium-II/Xeon MP machines.
* Original development of Linux SMP code supported by Caldera.
*
- * This code is released under the GNU General Public License version 2 or
- * later.
- *
* Fixes
* Felix Koop : NR_CPUS used properly
* Jose Renau : Handle single CPU case.
@@ -39,380 +37,617 @@
* Glauber Costa : i386 and x86_64 integration
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/init.h>
#include <linux/smp.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/sched.h>
+#include <linux/sched/topology.h>
+#include <linux/sched/hotplug.h>
+#include <linux/sched/task_stack.h>
#include <linux/percpu.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/err.h>
#include <linux/nmi.h>
+#include <linux/tboot.h>
+#include <linux/gfp.h>
+#include <linux/cpuidle.h>
+#include <linux/kexec.h>
+#include <linux/numa.h>
+#include <linux/pgtable.h>
+#include <linux/overflow.h>
+#include <linux/stackprotector.h>
+#include <linux/cpuhotplug.h>
+#include <linux/mc146818rtc.h>
+#include <linux/acpi.h>
#include <asm/acpi.h>
+#include <asm/cacheinfo.h>
+#include <asm/cpuid/api.h>
#include <asm/desc.h>
#include <asm/nmi.h>
#include <asm/irq.h>
-#include <asm/idle.h>
-#include <asm/trampoline.h>
+#include <asm/realmode.h>
#include <asm/cpu.h>
#include <asm/numa.h>
-#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mtrr.h>
-#include <asm/vmi.h>
+#include <asm/mwait.h>
#include <asm/apic.h>
+#include <asm/io_apic.h>
+#include <asm/fpu/api.h>
#include <asm/setup.h>
#include <asm/uv/uv.h>
-#include <linux/mc146818rtc.h>
-
-#include <asm/smpboot_hooks.h>
-
-#ifdef CONFIG_X86_32
-u8 apicid_2_node[MAX_APICID];
-static int low_mappings;
-#endif
-
-/* State of each CPU */
-DEFINE_PER_CPU(int, cpu_state) = { 0 };
-
-/* Store all idle threads, this can be reused instead of creating
-* a new thread. Also avoids complicated thread destroy functionality
-* for idle threads.
-*/
-#ifdef CONFIG_HOTPLUG_CPU
-/*
- * Needed only for CONFIG_HOTPLUG_CPU because __cpuinitdata is
- * removed after init for !CONFIG_HOTPLUG_CPU.
- */
-static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
-#define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x))
-#define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p))
-#else
-static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
-#define get_idle_for_cpu(x) (idle_thread_array[(x)])
-#define set_idle_for_cpu(x, p) (idle_thread_array[(x)] = (p))
-#endif
-
-/* Number of siblings per CPU package */
-int smp_num_siblings = 1;
-EXPORT_SYMBOL(smp_num_siblings);
-
-/* Last level cache ID of each logical CPU */
-DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID;
+#include <asm/microcode.h>
+#include <asm/i8259.h>
+#include <asm/misc.h>
+#include <asm/qspinlock.h>
+#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
+#include <asm/spec-ctrl.h>
+#include <asm/hw_irq.h>
+#include <asm/stackprotector.h>
+#include <asm/sev.h>
+#include <asm/spec-ctrl.h>
/* representing HT siblings of each logical CPU */
-DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
/* representing HT and core siblings of each logical CPU */
-DEFINE_PER_CPU(cpumask_var_t, cpu_core_map);
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
EXPORT_PER_CPU_SYMBOL(cpu_core_map);
-/* Per CPU bogomips and other parameters */
-DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
-EXPORT_PER_CPU_SYMBOL(cpu_info);
-
-atomic_t init_deasserted;
+/* representing HT, core, and die siblings of each logical CPU */
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map);
+EXPORT_PER_CPU_SYMBOL(cpu_die_map);
-#if defined(CONFIG_NUMA) && defined(CONFIG_X86_32)
-/* which node each logical CPU is on */
-int cpu_to_node_map[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 };
-EXPORT_SYMBOL(cpu_to_node_map);
+/* Representing CPUs for which sibling maps can be computed */
+static cpumask_var_t cpu_sibling_setup_mask;
-/* set up a mapping between cpu and node. */
-static void map_cpu_to_node(int cpu, int node)
-{
- printk(KERN_INFO "Mapping cpu %d to node %d\n", cpu, node);
- cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
- cpu_to_node_map[cpu] = node;
-}
+struct mwait_cpu_dead {
+ unsigned int control;
+ unsigned int status;
+};
-/* undo a mapping between cpu and node. */
-static void unmap_cpu_to_node(int cpu)
-{
- int node;
+#define CPUDEAD_MWAIT_WAIT 0xDEADBEEF
+#define CPUDEAD_MWAIT_KEXEC_HLT 0x4A17DEAD
- printk(KERN_INFO "Unmapping cpu %d from all nodes\n", cpu);
- for (node = 0; node < MAX_NUMNODES; node++)
- cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
- cpu_to_node_map[cpu] = 0;
-}
-#else /* !(CONFIG_NUMA && CONFIG_X86_32) */
-#define map_cpu_to_node(cpu, node) ({})
-#define unmap_cpu_to_node(cpu) ({})
-#endif
+/*
+ * Cache line aligned data for mwait_play_dead(). Separate on purpose so
+ * that it's unlikely to be touched by other CPUs.
+ */
+static DEFINE_PER_CPU_ALIGNED(struct mwait_cpu_dead, mwait_cpu_dead);
-#ifdef CONFIG_X86_32
-static int boot_cpu_logical_apicid;
+/* Maximum number of SMT threads on any online core */
+int __read_mostly __max_smt_threads = 1;
-u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly =
- { [0 ... NR_CPUS-1] = BAD_APICID };
+/* Flag to indicate if a complete sched domain rebuild is required */
+bool x86_topology_update;
-static void map_cpu_to_logical_apicid(void)
+int arch_update_cpu_topology(void)
{
- int cpu = smp_processor_id();
- int apicid = logical_smp_processor_id();
- int node = apic->apicid_to_node(apicid);
+ int retval = x86_topology_update;
- if (!node_online(node))
- node = first_online_node;
-
- cpu_2_logical_apicid[cpu] = apicid;
- map_cpu_to_node(cpu, node);
+ x86_topology_update = false;
+ return retval;
}
-void numa_remove_cpu(int cpu)
+static unsigned int smpboot_warm_reset_vector_count;
+
+static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
{
- cpu_2_logical_apicid[cpu] = BAD_APICID;
- unmap_cpu_to_node(cpu);
+ unsigned long flags;
+
+ spin_lock_irqsave(&rtc_lock, flags);
+ if (!smpboot_warm_reset_vector_count++) {
+ CMOS_WRITE(0xa, 0xf);
+ *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = start_eip >> 4;
+ *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = start_eip & 0xf;
+ }
+ spin_unlock_irqrestore(&rtc_lock, flags);
}
-#else
-#define map_cpu_to_logical_apicid() do {} while (0)
-#endif
-/*
- * Report back to the Boot Processor.
- * Running on AP.
- */
-static void __cpuinit smp_callin(void)
+static inline void smpboot_restore_warm_reset_vector(void)
{
- int cpuid, phys_id;
- unsigned long timeout;
-
- /*
- * If waken up by an INIT in an 82489DX configuration
- * we may get here before an INIT-deassert IPI reaches
- * our local APIC. We have to wait for the IPI or we'll
- * lock up on an APIC access.
- */
- if (apic->wait_for_init_deassert)
- apic->wait_for_init_deassert(&init_deasserted);
+ unsigned long flags;
/*
- * (This works even if the APIC is not enabled.)
+ * Paranoid: Set warm reset code and vector here back
+ * to default values.
*/
- phys_id = read_apic_id();
- cpuid = smp_processor_id();
- if (cpumask_test_cpu(cpuid, cpu_callin_mask)) {
- panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__,
- phys_id, cpuid);
+ spin_lock_irqsave(&rtc_lock, flags);
+ if (!--smpboot_warm_reset_vector_count) {
+ CMOS_WRITE(0, 0xf);
+ *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
}
- pr_debug("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
+ spin_unlock_irqrestore(&rtc_lock, flags);
- /*
- * STARTUP IPIs are fragile beasts as they might sometimes
- * trigger some glue motherboard logic. Complete APIC bus
- * silence for 1 second, this overestimates the time the
- * boot CPU is spending to send the up to 2 STARTUP IPIs
- * by a factor of two. This should be enough.
- */
+}
- /*
- * Waiting 2s total for startup (udelay is not yet working)
- */
- timeout = jiffies + 2*HZ;
- while (time_before(jiffies, timeout)) {
- /*
- * Has the boot CPU finished it's STARTUP sequence?
- */
- if (cpumask_test_cpu(cpuid, cpu_callout_mask))
- break;
- cpu_relax();
- }
+/* Run the next set of setup steps for the upcoming CPU */
+static void ap_starting(void)
+{
+ int cpuid = smp_processor_id();
- if (!time_before(jiffies, timeout)) {
- panic("%s: CPU%d started up but did not get a callout!\n",
- __func__, cpuid);
- }
+ /* Mop up eventual mwait_play_dead() wreckage */
+ this_cpu_write(mwait_cpu_dead.status, 0);
+ this_cpu_write(mwait_cpu_dead.control, 0);
/*
- * the boot CPU has finished the init stage and is spinning
- * on callin_map until we finish. We are free to set up this
- * CPU, first the APIC. (this is probably redundant on most
- * boards)
+ * If woken up by an INIT in an 82489DX configuration the alive
+ * synchronization guarantees that the CPU does not reach this
+ * point before an INIT_deassert IPI reaches the local APIC, so it
+ * is now safe to touch the local APIC.
+ *
+ * Set up this CPU, first the APIC, which is probably redundant on
+ * most boards.
*/
+ apic_ap_setup();
- pr_debug("CALLIN, before setup_local_APIC().\n");
- if (apic->smp_callin_clear_local_apic)
- apic->smp_callin_clear_local_apic();
- setup_local_APIC();
- end_local_APIC_setup();
- map_cpu_to_logical_apicid();
+ /* Save the processor parameters. */
+ identify_secondary_cpu(cpuid);
- notify_cpu_starting(cpuid);
/*
- * Get our bogomips.
- *
- * Need to enable IRQs because it can take longer and then
- * the NMI watchdog might kill us.
+ * The topology information must be up to date before
+ * notify_cpu_starting().
*/
- local_irq_enable();
- calibrate_delay();
- local_irq_disable();
+ set_cpu_sibling_map(cpuid);
+
+ ap_init_aperfmperf();
+
pr_debug("Stack at about %p\n", &cpuid);
+ wmb();
+
/*
- * Save our processor parameters
+ * This runs the AP through all the cpuhp states to its target
+ * state CPUHP_ONLINE.
*/
- smp_store_cpu_info(cpuid);
+ notify_cpu_starting(cpuid);
+}
+static void ap_calibrate_delay(void)
+{
/*
- * Allow the master to continue.
+ * Calibrate the delay loop and update loops_per_jiffy in cpu_data.
+ * identify_secondary_cpu() stored a value that is close but not as
+ * accurate as the value just calculated.
+ *
+ * As this is invoked after the TSC synchronization check,
+ * calibrate_delay_is_known() will skip the calibration routine
+ * when TSC is synchronized across sockets.
*/
- cpumask_set_cpu(cpuid, cpu_callin_mask);
+ calibrate_delay();
+ cpu_data(smp_processor_id()).loops_per_jiffy = loops_per_jiffy;
}
/*
* Activate a secondary processor.
*/
-notrace static void __cpuinit start_secondary(void *unused)
+static void notrace __noendbr start_secondary(void *unused)
{
/*
- * Don't put *anything* before cpu_init(), SMP booting is too
- * fragile that we want to limit the things done here to the
- * most necessary things.
+ * Don't put *anything* except direct CPU state initialization
+ * before cpu_init(), SMP booting is too fragile that we want to
+ * limit the things done here to the most necessary things.
*/
- vmi_bringup();
- cpu_init();
- preempt_disable();
- smp_callin();
+ cr4_init();
- /* otherwise gcc will move up smp_processor_id before the cpu_init */
- barrier();
/*
- * Check TSC synchronization with the BP:
+ * 32-bit specific. 64-bit reaches this code with the correct page
+ * table established. Yet another historical divergence.
*/
- check_tsc_sync_target();
-
- if (nmi_watchdog == NMI_IO_APIC) {
- disable_8259A_irq(0);
- enable_NMI_through_LVT0();
- enable_8259A_irq(0);
+ if (IS_ENABLED(CONFIG_X86_32)) {
+ /* switch away from the initial page table */
+ load_cr3(swapper_pg_dir);
+ __flush_tlb_all();
}
-#ifdef CONFIG_X86_32
- while (low_mappings)
- cpu_relax();
- __flush_tlb_all();
-#endif
-
- /* This must be done before setting cpu_online_mask */
- set_cpu_sibling_map(raw_smp_processor_id());
- wmb();
+ cpu_init_exception_handling(false);
/*
- * We need to hold call_lock, so there is no inconsistency
- * between the time smp_call_function() determines number of
- * IPI recipients, and the time when the determination is made
- * for which cpus receive the IPI. Holding this
- * lock helps us to not include this cpu in a currently in progress
- * smp_call_function().
+ * Load the microcode before reaching the AP alive synchronization
+ * point below so it is not part of the full per CPU serialized
+ * bringup part when "parallel" bringup is enabled.
+ *
+ * That's even safe when hyperthreading is enabled in the CPU as
+ * the core code starts the primary threads first and leaves the
+ * secondary threads waiting for SIPI. Loading microcode on
+ * physical cores concurrently is a safe operation.
*
- * We need to hold vector_lock so there the set of online cpus
- * does not change while we are assigning vectors to cpus. Holding
- * this lock ensures we don't half assign or remove an irq from a cpu.
+ * This covers both the Intel specific issue that concurrent
+ * microcode loading on SMT siblings must be prohibited and the
+ * vendor independent issue that microcode loading which changes
+ * CPUID, MSRs etc. must be strictly serialized to maintain
+ * software state correctness.
+ */
+ load_ucode_ap();
+
+ /*
+ * Synchronization point with the hotplug core. Sets this CPUs
+ * synchronization state to ALIVE and spin-waits for the control CPU to
+ * release this CPU for further bringup.
+ */
+ cpuhp_ap_sync_alive();
+
+ cpu_init();
+ fpu__init_cpu();
+ rcutree_report_cpu_starting(raw_smp_processor_id());
+ x86_cpuinit.early_percpu_clock_init();
+
+ ap_starting();
+
+ /* Check TSC synchronization with the control CPU. */
+ check_tsc_sync_target();
+
+ /*
+ * Calibrate the delay loop after the TSC synchronization check.
+ * This allows to skip the calibration when TSC is synchronized
+ * across sockets.
+ */
+ ap_calibrate_delay();
+
+ speculative_store_bypass_ht_init();
+
+ /*
+ * Lock vector_lock, set CPU online and bring the vector
+ * allocator online. Online must be set with vector_lock held
+ * to prevent a concurrent irq setup/teardown from seeing a
+ * half valid vector space.
*/
- ipi_call_lock();
lock_vector_lock();
- __setup_vector_irq(smp_processor_id());
set_cpu_online(smp_processor_id(), true);
+ lapic_online();
unlock_vector_lock();
- ipi_call_unlock();
- per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+ x86_platform.nmi_init();
/* enable local interrupts */
local_irq_enable();
- setup_secondary_clock();
+ x86_cpuinit.setup_percpu_clockev();
wmb();
- cpu_idle();
+ cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
+}
+ANNOTATE_NOENDBR_SYM(start_secondary);
+
+static bool
+topology_same_node(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+ int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+ return (cpu_to_node(cpu1) == cpu_to_node(cpu2));
+}
+
+static bool
+topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name)
+{
+ int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+ return !WARN_ONCE(!topology_same_node(c, o),
+ "sched: CPU #%d's %s-sibling CPU #%d is not on the same node! "
+ "[node: %d != %d]. Ignoring dependency.\n",
+ cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2));
+}
+
+#define link_mask(mfunc, c1, c2) \
+do { \
+ cpumask_set_cpu((c1), mfunc(c2)); \
+ cpumask_set_cpu((c2), mfunc(c1)); \
+} while (0)
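
Editor's note: link_mask() records a sibling relationship in both directions, so topology_sibling_cpumask(a) contains b exactly when the reverse holds. A tiny standalone analogue using plain bitmasks follows, purely to show the symmetric update; real cpumask_set_cpu() calls behave the same way.

#include <stdio.h>

static unsigned int sibling_mask[4];	/* one bitmask per "CPU" */

/* Symmetric link, mirroring link_mask(topology_sibling_cpumask, c1, c2). */
#define link_mask(mask, c1, c2)			\
do {						\
	(mask)[(c2)] |= 1u << (c1);		\
	(mask)[(c1)] |= 1u << (c2);		\
} while (0)

int main(void)
{
	int cpu;

	link_mask(sibling_mask, 0, 1);	/* CPUs 0 and 1 are SMT siblings */
	link_mask(sibling_mask, 2, 3);	/* CPUs 2 and 3 are SMT siblings */

	for (cpu = 0; cpu < 4; cpu++)
		printf("cpu%d siblings: %#x\n", cpu, sibling_mask[cpu]);
	return 0;
}
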
+
+static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+ if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
+ int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+ if (c->topo.pkg_id == o->topo.pkg_id &&
+ c->topo.die_id == o->topo.die_id &&
+ c->topo.amd_node_id == o->topo.amd_node_id &&
+ per_cpu_llc_id(cpu1) == per_cpu_llc_id(cpu2)) {
+ if (c->topo.core_id == o->topo.core_id)
+ return topology_sane(c, o, "smt");
+
+ if ((c->topo.cu_id != 0xff) &&
+ (o->topo.cu_id != 0xff) &&
+ (c->topo.cu_id == o->topo.cu_id))
+ return topology_sane(c, o, "smt");
+ }
+
+ } else if (c->topo.pkg_id == o->topo.pkg_id &&
+ c->topo.die_id == o->topo.die_id &&
+ c->topo.core_id == o->topo.core_id) {
+ return topology_sane(c, o, "smt");
+ }
+
+ return false;
}
-#ifdef CONFIG_CPUMASK_OFFSTACK
-/* In this case, llc_shared_map is a pointer to a cpumask. */
-static inline void copy_cpuinfo_x86(struct cpuinfo_x86 *dst,
- const struct cpuinfo_x86 *src)
+static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
{
- struct cpumask *llc = dst->llc_shared_map;
- *dst = *src;
- dst->llc_shared_map = llc;
+ if (c->topo.pkg_id != o->topo.pkg_id || c->topo.die_id != o->topo.die_id)
+ return false;
+
+ if (cpu_feature_enabled(X86_FEATURE_TOPOEXT) && topology_amd_nodes_per_pkg() > 1)
+ return c->topo.amd_node_id == o->topo.amd_node_id;
+
+ return true;
}
-#else
-static inline void copy_cpuinfo_x86(struct cpuinfo_x86 *dst,
- const struct cpuinfo_x86 *src)
+
+static bool match_l2c(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
{
- *dst = *src;
+ int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+ /* If the arch didn't set up l2c_id, fall back to SMT */
+ if (per_cpu_l2c_id(cpu1) == BAD_APICID)
+ return match_smt(c, o);
+
+ /* Do not match if L2 cache id does not match: */
+ if (per_cpu_l2c_id(cpu1) != per_cpu_l2c_id(cpu2))
+ return false;
+
+ return topology_sane(c, o, "l2c");
}
-#endif /* CONFIG_CPUMASK_OFFSTACK */
/*
- * The bootstrap kernel entry code has set these up. Save them for
- * a given CPU
+ * Unlike the other levels, we do not enforce keeping a
+ * multicore group inside a NUMA node. If this happens, we will
+ * discard the MC level of the topology later.
*/
+static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+ if (c->topo.pkg_id == o->topo.pkg_id)
+ return true;
+ return false;
+}
+
+/*
+ * Define intel_cod_cpu[] for Intel COD (Cluster-on-Die) CPUs.
+ *
+ * Any Intel CPU that has multiple nodes per package and does not
+ * match intel_cod_cpu[] has the SNC (Sub-NUMA Cluster) topology.
+ *
+ * When in SNC mode, these CPUs enumerate an LLC that is shared
+ * by multiple NUMA nodes. The LLC is shared for off-package data
+ * access but private to the NUMA node (half of the package) for
+ * on-package access. CPUID (the source of the information about
+ * the LLC) can only enumerate the cache as shared or unshared,
+ * but not this particular configuration.
+ */
+
+static const struct x86_cpu_id intel_cod_cpu[] = {
+ X86_MATCH_VFM(INTEL_HASWELL_X, 0), /* COD */
+ X86_MATCH_VFM(INTEL_BROADWELL_X, 0), /* COD */
+ X86_MATCH_VFM(INTEL_ANY, 1), /* SNC */
+ {}
+};
-void __cpuinit smp_store_cpu_info(int id)
+static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
{
- struct cpuinfo_x86 *c = &cpu_data(id);
+ const struct x86_cpu_id *id = x86_match_cpu(intel_cod_cpu);
+ int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+ bool intel_snc = id && id->driver_data;
- copy_cpuinfo_x86(c, &boot_cpu_data);
- c->cpu_index = id;
- if (id != 0)
- identify_secondary_cpu(c);
+ /* Do not match if we do not have a valid APICID for cpu: */
+ if (per_cpu_llc_id(cpu1) == BAD_APICID)
+ return false;
+
+ /* Do not match if LLC id does not match: */
+ if (per_cpu_llc_id(cpu1) != per_cpu_llc_id(cpu2))
+ return false;
+
+ /*
+ * Allow the SNC topology without warning. Return of false
+ * means 'c' does not share the LLC of 'o'. This will be
+ * reflected to userspace.
+ */
+ if (match_pkg(c, o) && !topology_same_node(c, o) && intel_snc)
+ return false;
+
+ return topology_sane(c, o, "llc");
}
-void __cpuinit set_cpu_sibling_map(int cpu)
+static inline int x86_sched_itmt_flags(void)
{
- int i;
- struct cpuinfo_x86 *c = &cpu_data(cpu);
+ return sysctl_sched_itmt_enabled ? SD_ASYM_PACKING : 0;
+}
- cpumask_set_cpu(cpu, cpu_sibling_setup_mask);
+#ifdef CONFIG_SCHED_MC
+static int x86_core_flags(void)
+{
+ return cpu_core_flags() | x86_sched_itmt_flags();
+}
+#endif
+#ifdef CONFIG_SCHED_CLUSTER
+static int x86_cluster_flags(void)
+{
+ return cpu_cluster_flags() | x86_sched_itmt_flags();
+}
+#endif
+
+/*
+ * Set if a package/die has multiple NUMA nodes inside.
+ * AMD Magny-Cours, Intel Cluster-on-Die, and Intel
+ * Sub-NUMA Clustering have this.
+ */
+static bool x86_has_numa_in_package;
+
+static struct sched_domain_topology_level x86_topology[] = {
+ SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT),
+#ifdef CONFIG_SCHED_CLUSTER
+ SDTL_INIT(tl_cls_mask, x86_cluster_flags, CLS),
+#endif
+#ifdef CONFIG_SCHED_MC
+ SDTL_INIT(tl_mc_mask, x86_core_flags, MC),
+#endif
+ SDTL_INIT(tl_pkg_mask, x86_sched_itmt_flags, PKG),
+ { NULL },
+};
+
+static void __init build_sched_topology(void)
+{
+ struct sched_domain_topology_level *topology = x86_topology;
+
+ /*
+ * When there is NUMA topology inside the package invalidate the
+ * PKG domain since the NUMA domains will auto-magically create the
+ * right spanning domains based on the SLIT.
+ */
+ if (x86_has_numa_in_package) {
+ unsigned int pkgdom = ARRAY_SIZE(x86_topology) - 2;
+
+ memset(&x86_topology[pkgdom], 0, sizeof(x86_topology[pkgdom]));
+ }
+
+ /*
+ * Drop the SMT domains if there is only one thread per-core
+ * since it'll get degenerated by the scheduler anyways.
+ */
+ if (cpu_smt_num_threads <= 1)
+ ++topology;
+
+ set_sched_topology(topology);
+}
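
Editor's note: build_sched_topology() edits the static level table in place: it zeroes the PKG entry when NUMA nodes live inside a package (the NUMA domains span it instead) and skips the SMT entry when there is only one thread per core. A hedged sketch of that table-trimming idea, with made-up level names, follows.

#include <stdio.h>
#include <string.h>

struct level { const char *name; };	/* stand-in for sched_domain_topology_level */

static struct level levels[] = {
	{ "SMT" }, { "CLS" }, { "MC" }, { "PKG" }, { NULL },
};

static void build_topology(int numa_in_package, int smt_threads)
{
	struct level *topology = levels;

	/* NUMA-in-package: drop PKG, the NUMA domains will cover it. */
	if (numa_in_package)
		memset(&levels[3], 0, sizeof(levels[3]));

	/* One thread per core: start past SMT, it would degenerate anyway. */
	if (smt_threads <= 1)
		++topology;

	for (; topology->name; topology++)
		printf("%s ", topology->name);
	printf("\n");
}

int main(void)
{
	build_topology(0, 2);	/* prints: SMT CLS MC PKG */
	build_topology(1, 1);	/* prints: CLS MC (SMT skipped, PKG dropped) */
	return 0;
}
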
+
+#ifdef CONFIG_NUMA
+static int sched_avg_remote_distance;
+static int avg_remote_numa_distance(void)
+{
+ int i, j;
+ int distance, nr_remote, total_distance;
+
+ if (sched_avg_remote_distance > 0)
+ return sched_avg_remote_distance;
- if (smp_num_siblings > 1) {
- for_each_cpu(i, cpu_sibling_setup_mask) {
- struct cpuinfo_x86 *o = &cpu_data(i);
-
- if (c->phys_proc_id == o->phys_proc_id &&
- c->cpu_core_id == o->cpu_core_id) {
- cpumask_set_cpu(i, cpu_sibling_mask(cpu));
- cpumask_set_cpu(cpu, cpu_sibling_mask(i));
- cpumask_set_cpu(i, cpu_core_mask(cpu));
- cpumask_set_cpu(cpu, cpu_core_mask(i));
- cpumask_set_cpu(i, c->llc_shared_map);
- cpumask_set_cpu(cpu, o->llc_shared_map);
+ nr_remote = 0;
+ total_distance = 0;
+ for_each_node_state(i, N_CPU) {
+ for_each_node_state(j, N_CPU) {
+ distance = node_distance(i, j);
+
+ if (distance >= REMOTE_DISTANCE) {
+ nr_remote++;
+ total_distance += distance;
}
}
- } else {
- cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
}
+ if (nr_remote)
+ sched_avg_remote_distance = total_distance / nr_remote;
+ else
+ sched_avg_remote_distance = REMOTE_DISTANCE;
+
+ return sched_avg_remote_distance;
+}
+
+int arch_sched_node_distance(int from, int to)
+{
+ int d = node_distance(from, to);
+
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_GRANITERAPIDS_X:
+ case INTEL_ATOM_DARKMONT_X:
+
+ if (!x86_has_numa_in_package || topology_max_packages() == 1 ||
+ d < REMOTE_DISTANCE)
+ return d;
+
+ /*
+ * With SNC enabled, there could be too many levels of remote
+ * NUMA node distances, creating NUMA domain levels
+ * including local nodes and partial remote nodes.
+ *
+ * Trim finer distance tuning for NUMA nodes in remote package
+ * for the purpose of building sched domains. Group NUMA nodes
+ * in the remote package in the same sched group.
+ * Simplify NUMA domains and avoid extra NUMA levels including
+ * different remote NUMA nodes and local nodes.
+ *
+ * GNR and CWF don't expect systems with more than 2 packages
+ * and more than 2 hops between packages. Single average remote
+ * distance won't be appropriate if there are more than 2
+ * packages as average distance to different remote packages
+ * could be different.
+ */
+ WARN_ONCE(topology_max_packages() > 2,
+ "sched: Expect only up to 2 packages for GNR or CWF, "
+ "but saw %d packages when building sched domains.",
+ topology_max_packages());
+
+ d = avg_remote_numa_distance();
+ }
+ return d;
+}
+#endif /* CONFIG_NUMA */
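
Editor's note: avg_remote_numa_distance() collapses all remote distances into one average so that, on the GNR/CWF parts named above, every remote SNC node falls into a single NUMA sched-domain level. A worked standalone version follows; the 4-node distance table is invented for illustration (two packages, two SNC nodes each), while REMOTE_DISTANCE is 30 as in the kernel.

#include <stdio.h>

#define REMOTE_DISTANCE	30	/* same threshold the kernel uses */

/* Invented SLIT for 2 packages x 2 SNC nodes; only the shape matters here. */
static const int dist[4][4] = {
	{ 10, 12, 38, 42 },
	{ 12, 10, 42, 38 },
	{ 38, 42, 10, 12 },
	{ 42, 38, 12, 10 },
};

static int avg_remote_distance(void)
{
	int i, j, nr_remote = 0, total = 0;

	for (i = 0; i < 4; i++)
		for (j = 0; j < 4; j++)
			if (dist[i][j] >= REMOTE_DISTANCE) {
				nr_remote++;
				total += dist[i][j];
			}
	return nr_remote ? total / nr_remote : REMOTE_DISTANCE;
}

static int sched_node_distance(int from, int to)
{
	int d = dist[from][to];

	/* Local and same-package distances pass through; remote ones are flattened. */
	return d < REMOTE_DISTANCE ? d : avg_remote_distance();
}

int main(void)
{
	/* 38 and 42 both report as (4*38 + 4*42) / 8 = 40. */
	printf("0->2: %d, 0->3: %d, 0->1: %d\n",
	       sched_node_distance(0, 2), sched_node_distance(0, 3),
	       sched_node_distance(0, 1));
	return 0;
}
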
- cpumask_set_cpu(cpu, c->llc_shared_map);
+void set_cpu_sibling_map(int cpu)
+{
+ bool has_smt = __max_threads_per_core > 1;
+ bool has_mp = has_smt || topology_num_cores_per_package() > 1;
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ struct cpuinfo_x86 *o;
+ int i, threads;
- if (current_cpu_data.x86_max_cores == 1) {
- cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu));
+ cpumask_set_cpu(cpu, cpu_sibling_setup_mask);
+
+ if (!has_mp) {
+ cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu));
+ cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
+ cpumask_set_cpu(cpu, cpu_l2c_shared_mask(cpu));
+ cpumask_set_cpu(cpu, topology_core_cpumask(cpu));
+ cpumask_set_cpu(cpu, topology_die_cpumask(cpu));
c->booted_cores = 1;
return;
}
for_each_cpu(i, cpu_sibling_setup_mask) {
- if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
- per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
- cpumask_set_cpu(i, c->llc_shared_map);
- cpumask_set_cpu(cpu, cpu_data(i).llc_shared_map);
- }
- if (c->phys_proc_id == cpu_data(i).phys_proc_id) {
- cpumask_set_cpu(i, cpu_core_mask(cpu));
- cpumask_set_cpu(cpu, cpu_core_mask(i));
+ o = &cpu_data(i);
+
+ if (match_pkg(c, o) && !topology_same_node(c, o))
+ x86_has_numa_in_package = true;
+
+ if ((i == cpu) || (has_smt && match_smt(c, o)))
+ link_mask(topology_sibling_cpumask, cpu, i);
+
+ if ((i == cpu) || (has_mp && match_llc(c, o)))
+ link_mask(cpu_llc_shared_mask, cpu, i);
+
+ if ((i == cpu) || (has_mp && match_l2c(c, o)))
+ link_mask(cpu_l2c_shared_mask, cpu, i);
+
+ if ((i == cpu) || (has_mp && match_die(c, o)))
+ link_mask(topology_die_cpumask, cpu, i);
+ }
+
+ threads = cpumask_weight(topology_sibling_cpumask(cpu));
+ if (threads > __max_smt_threads)
+ __max_smt_threads = threads;
+
+ for_each_cpu(i, topology_sibling_cpumask(cpu))
+ cpu_data(i).smt_active = threads > 1;
+
+ /*
+ * This needs a separate iteration over the cpus because we rely on all
+ * topology_sibling_cpumask links to be set-up.
+ */
+ for_each_cpu(i, cpu_sibling_setup_mask) {
+ o = &cpu_data(i);
+
+ if ((i == cpu) || (has_mp && match_pkg(c, o))) {
+ link_mask(topology_core_cpumask, cpu, i);
+
/*
* Does this new cpu bringup a new core?
*/
- if (cpumask_weight(cpu_sibling_mask(cpu)) == 1) {
+ if (threads == 1) {
/*
* for each core in package, increment
* the booted_cores for this new cpu
*/
- if (cpumask_first(cpu_sibling_mask(i)) == i)
+ if (cpumask_first(
+ topology_sibling_cpumask(i)) == i)
c->booted_cores++;
/*
* increment the core count for all
@@ -429,16 +664,14 @@ void __cpuinit set_cpu_sibling_map(int cpu)
/* maps the cpu to the sched domain representing multi-core */
const struct cpumask *cpu_coregroup_mask(int cpu)
{
- struct cpuinfo_x86 *c = &cpu_data(cpu);
- /*
- * For perf, we return last level cache shared map.
- * And for power savings, we return cpu_core_map
- */
- if (sched_mc_power_savings || sched_smt_power_savings)
- return cpu_core_mask(cpu);
- else
- return c->llc_shared_map;
+ return cpu_llc_shared_mask(cpu);
+}
+
+const struct cpumask *cpu_clustergroup_mask(int cpu)
+{
+ return cpu_l2c_shared_mask(cpu);
}
+EXPORT_SYMBOL_GPL(cpu_clustergroup_mask);
static void impress_friends(void)
{
@@ -447,141 +680,95 @@ static void impress_friends(void)
/*
* Allow the user to impress friends.
*/
- pr_debug("Before bogomips.\n");
- for_each_possible_cpu(cpu)
- if (cpumask_test_cpu(cpu, cpu_callout_mask))
- bogosum += cpu_data(cpu).loops_per_jiffy;
- printk(KERN_INFO
- "Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
+ pr_debug("Before bogomips\n");
+ for_each_online_cpu(cpu)
+ bogosum += cpu_data(cpu).loops_per_jiffy;
+
+ pr_info("Total of %d processors activated (%lu.%02lu BogoMIPS)\n",
num_online_cpus(),
bogosum/(500000/HZ),
(bogosum/(5000/HZ))%100);
- pr_debug("Before bogocount - setting activated=1.\n");
+ pr_debug("Before bogocount - setting activated=1\n");
}
-void __inquire_remote_apic(int apicid)
-{
- unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
- char *names[] = { "ID", "VERSION", "SPIV" };
- int timeout;
- u32 status;
+/*
+ * The Multiprocessor Specification 1.4 (1997) example code suggests
+ * that there should be a 10ms delay between the BSP asserting INIT
+ * and de-asserting INIT, when starting a remote processor.
+ * But that slows boot and resume on modern processors, which include
+ * many cores and don't require that delay.
+ *
+ * Cmdline "cpu_init_udelay=" is available to override this delay.
+ */
+#define UDELAY_10MS_LEGACY 10000
- printk(KERN_INFO "Inquiring remote APIC 0x%x...\n", apicid);
+static unsigned int init_udelay = UINT_MAX;
- for (i = 0; i < ARRAY_SIZE(regs); i++) {
- printk(KERN_INFO "... APIC 0x%x %s: ", apicid, names[i]);
+static int __init cpu_init_udelay(char *str)
+{
+ get_option(&str, &init_udelay);
- /*
- * Wait for idle.
- */
- status = safe_apic_wait_icr_idle();
- if (status)
- printk(KERN_CONT
- "a previous APIC delivery may have failed\n");
-
- apic_icr_write(APIC_DM_REMRD | regs[i], apicid);
-
- timeout = 0;
- do {
- udelay(100);
- status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK;
- } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000);
-
- switch (status) {
- case APIC_ICR_RR_VALID:
- status = apic_read(APIC_RRR);
- printk(KERN_CONT "%08x\n", status);
- break;
- default:
- printk(KERN_CONT "failed\n");
- }
- }
+ return 0;
}
+early_param("cpu_init_udelay", cpu_init_udelay);
-/*
- * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
- * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
- * won't ... remember to clear down the APIC, etc later.
- */
-int __cpuinit
-wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
+static void __init smp_set_init_udelay(void)
{
- unsigned long send_status, accept_status = 0;
- int maxlvt;
-
- /* Target chip */
- /* Boot on the stack */
- /* Kick the second */
- apic_icr_write(APIC_DM_NMI | apic->dest_logical, logical_apicid);
-
- pr_debug("Waiting for send to finish...\n");
- send_status = safe_apic_wait_icr_idle();
+ /* if cmdline changed it from default, leave it alone */
+ if (init_udelay != UINT_MAX)
+ return;
- /*
- * Give the other CPU some time to accept the IPI.
- */
- udelay(200);
- if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
- maxlvt = lapic_get_maxlvt();
- if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
- apic_write(APIC_ESR, 0);
- accept_status = (apic_read(APIC_ESR) & 0xEF);
+ /* if modern processor, use no delay */
+ if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && boot_cpu_data.x86_vfm >= INTEL_PENTIUM_PRO) ||
+ (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON && boot_cpu_data.x86 >= 0x18) ||
+ (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && boot_cpu_data.x86 >= 0xF)) {
+ init_udelay = 0;
+ return;
}
- pr_debug("NMI sent.\n");
-
- if (send_status)
- printk(KERN_ERR "APIC never delivered???\n");
- if (accept_status)
- printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status);
-
- return (send_status | accept_status);
+ /* else, use legacy delay */
+ init_udelay = UDELAY_10MS_LEGACY;
}
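
Editor's note: the INIT deassert delay defaults to the MP-spec 10 ms only on old parts; an explicit cpu_init_udelay= value from the command line always wins, and modern Intel/AMD/Hygon CPUs get 0. A compact sketch of that three-way selection follows; the vendor check is reduced to a boolean here, whereas the real code compares vendor and family data.

#include <stdio.h>
#include <limits.h>
#include <stdbool.h>

#define UDELAY_10MS_LEGACY	10000
#define NOT_SET			UINT_MAX	/* mirrors init_udelay = UINT_MAX */

static unsigned int pick_init_udelay(unsigned int cmdline, bool modern_cpu)
{
	if (cmdline != NOT_SET)		/* explicit cpu_init_udelay= wins */
		return cmdline;
	if (modern_cpu)			/* modern parts need no delay */
		return 0;
	return UDELAY_10MS_LEGACY;	/* MP spec 1.4 suggestion otherwise */
}

int main(void)
{
	printf("%u %u %u\n",
	       pick_init_udelay(NOT_SET, true),	  /* 0 */
	       pick_init_udelay(NOT_SET, false),  /* 10000 */
	       pick_init_udelay(5000, true));	  /* 5000, user override */
	return 0;
}
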
-static int __cpuinit
-wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
+/*
+ * Wake up AP by INIT, INIT, STARTUP sequence.
+ */
+static void send_init_sequence(u32 phys_apicid)
{
- unsigned long send_status, accept_status = 0;
- int maxlvt, num_starts, j;
+ int maxlvt = lapic_get_maxlvt();
- maxlvt = lapic_get_maxlvt();
-
- /*
- * Be paranoid about clearing APIC errors.
- */
- if (APIC_INTEGRATED(apic_version[phys_apicid])) {
- if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
+ /* Be paranoid about clearing APIC errors. */
+ if (APIC_INTEGRATED(boot_cpu_apic_version)) {
+ /* Due to the Pentium erratum 3AP. */
+ if (maxlvt > 3)
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
}
- pr_debug("Asserting INIT.\n");
+ /* Assert INIT on the target CPU */
+ apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT, phys_apicid);
+ safe_apic_wait_icr_idle();
- /*
- * Turn INIT on target chip
- */
- /*
- * Send IPI
- */
- apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT,
- phys_apicid);
-
- pr_debug("Waiting for send to finish...\n");
- send_status = safe_apic_wait_icr_idle();
-
- mdelay(10);
-
- pr_debug("Deasserting INIT.\n");
+ udelay(init_udelay);
- /* Target chip */
- /* Send IPI */
+ /* Deassert INIT on the target CPU */
apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid);
+ safe_apic_wait_icr_idle();
+}
- pr_debug("Waiting for send to finish...\n");
- send_status = safe_apic_wait_icr_idle();
+/*
+ * Wake up AP by INIT, INIT, STARTUP sequence.
+ */
+static int wakeup_secondary_cpu_via_init(u32 phys_apicid, unsigned long start_eip, unsigned int cpu)
+{
+ unsigned long send_status = 0, accept_status = 0;
+ int num_starts, j, maxlvt;
+
+ preempt_disable();
+ maxlvt = lapic_get_maxlvt();
+ send_init_sequence(phys_apicid);
mb();
- atomic_set(&init_deasserted, 1);
/*
* Should we send STARTUP IPIs ?
@@ -589,29 +776,22 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
* Determine this based on the APIC version.
* If we don't have an integrated APIC, don't send the STARTUP IPIs.
*/
- if (APIC_INTEGRATED(apic_version[phys_apicid]))
+ if (APIC_INTEGRATED(boot_cpu_apic_version))
num_starts = 2;
else
num_starts = 0;
/*
- * Paravirt / VMI wants a startup IPI hook here to set up the
- * target processor state.
- */
- startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
- (unsigned long)stack_start.sp);
-
- /*
* Run STARTUP IPI loop.
*/
- pr_debug("#startup loops: %d.\n", num_starts);
+ pr_debug("#startup loops: %d\n", num_starts);
for (j = 1; j <= num_starts; j++) {
- pr_debug("Sending STARTUP #%d.\n", j);
+ pr_debug("Sending STARTUP #%d\n", j);
if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
- pr_debug("After apic_write.\n");
+ pr_debug("After apic_write\n");
/*
* STARTUP IPI
@@ -626,9 +806,12 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
/*
* Give the other CPU some time to accept the IPI.
*/
- udelay(300);
+ if (init_udelay == 0)
+ udelay(10);
+ else
+ udelay(300);
- pr_debug("Startup point 1.\n");
+ pr_debug("Startup point 1\n");
pr_debug("Waiting for send to finish...\n");
send_status = safe_apic_wait_icr_idle();
@@ -636,116 +819,124 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
/*
* Give the other CPU some time to accept the IPI.
*/
- udelay(200);
+ if (init_udelay == 0)
+ udelay(10);
+ else
+ udelay(200);
+
if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
apic_write(APIC_ESR, 0);
accept_status = (apic_read(APIC_ESR) & 0xEF);
if (send_status || accept_status)
break;
}
- pr_debug("After Startup.\n");
+ pr_debug("After Startup\n");
if (send_status)
- printk(KERN_ERR "APIC never delivered???\n");
+ pr_err("APIC never delivered???\n");
if (accept_status)
- printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status);
+ pr_err("APIC delivery error (%lx)\n", accept_status);
+ preempt_enable();
return (send_status | accept_status);
}
-struct create_idle {
- struct work_struct work;
- struct task_struct *idle;
- struct completion done;
- int cpu;
-};
-
-static void __cpuinit do_fork_idle(struct work_struct *work)
+/* reduce the number of lines printed when booting a large cpu count system */
+static void announce_cpu(int cpu, int apicid)
{
- struct create_idle *c_idle =
- container_of(work, struct create_idle, work);
+ static int width, node_width, first = 1;
+ static int current_node = NUMA_NO_NODE;
+ int node = early_cpu_to_node(cpu);
- c_idle->idle = fork_idle(c_idle->cpu);
- complete(&c_idle->done);
-}
+ if (!width)
+ width = num_digits(num_possible_cpus()) + 1; /* + '#' sign */
-/*
- * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
- * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
- * Returns zero if CPU booted OK, else error code from
- * ->wakeup_secondary_cpu.
- */
-static int __cpuinit do_boot_cpu(int apicid, int cpu)
-{
- unsigned long boot_error = 0;
- unsigned long start_ip;
- int timeout;
- struct create_idle c_idle = {
- .cpu = cpu,
- .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
- };
+ if (!node_width)
+ node_width = num_digits(num_possible_nodes()) + 1; /* + '#' */
- INIT_WORK(&c_idle.work, do_fork_idle);
+ if (system_state < SYSTEM_RUNNING) {
+ if (first)
+ pr_info("x86: Booting SMP configuration:\n");
- alternatives_smp_switch(1);
+ if (node != current_node) {
+ if (current_node > (-1))
+ pr_cont("\n");
+ current_node = node;
- c_idle.idle = get_idle_for_cpu(cpu);
+ printk(KERN_INFO ".... node %*s#%d, CPUs: ",
+ node_width - num_digits(node), " ", node);
+ }
- /*
- * We can't use kernel_thread since we must avoid to
- * reschedule the child.
- */
- if (c_idle.idle) {
- c_idle.idle->thread.sp = (unsigned long) (((struct pt_regs *)
- (THREAD_SIZE + task_stack_page(c_idle.idle))) - 1);
- init_idle(c_idle.idle, cpu);
- goto do_rest;
- }
+ /* Add padding for the BSP */
+ if (first)
+ pr_cont("%*s", width + 1, " ");
+ first = 0;
- if (!keventd_up() || current_is_keventd())
- c_idle.work.func(&c_idle.work);
- else {
- schedule_work(&c_idle.work);
- wait_for_completion(&c_idle.done);
- }
+ pr_cont("%*s#%d", width - num_digits(cpu), " ", cpu);
+ } else
+ pr_info("Booting Node %d Processor %d APIC 0x%x\n",
+ node, cpu, apicid);
+}
- if (IS_ERR(c_idle.idle)) {
- printk("failed fork for CPU %d\n", cpu);
- return PTR_ERR(c_idle.idle);
- }
+int common_cpu_up(unsigned int cpu, struct task_struct *idle)
+{
+ int ret;
+
+ /* Just in case we booted with a single CPU. */
+ alternatives_enable_smp();
+
+ per_cpu(current_task, cpu) = idle;
+ cpu_init_stack_canary(cpu, idle);
+
+ /* Initialize the interrupt stack(s) */
+ ret = irq_init_percpu_irqstack(cpu);
+ if (ret)
+ return ret;
- set_idle_for_cpu(cpu, c_idle.idle);
-do_rest:
- per_cpu(current_task, cpu) = c_idle.idle;
#ifdef CONFIG_X86_32
/* Stack for startup_32 can be just as for start_secondary onwards */
- irq_ctx_init(cpu);
-#else
- clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
- initial_gs = per_cpu_offset(cpu);
- per_cpu(kernel_stack, cpu) =
- (unsigned long)task_stack_page(c_idle.idle) -
- KERNEL_STACK_OFFSET + THREAD_SIZE;
+ per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle);
#endif
- early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
+ return 0;
+}
+
+/*
+ * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
+ * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
+ * Returns zero if startup was successfully sent, else error code from
+ * ->wakeup_secondary_cpu.
+ */
+static int do_boot_cpu(u32 apicid, unsigned int cpu, struct task_struct *idle)
+{
+ unsigned long start_ip = real_mode_header->trampoline_start;
+ int ret;
+
+#ifdef CONFIG_X86_64
+ /* If 64-bit wakeup method exists, use the 64-bit mode trampoline IP */
+ if (apic->wakeup_secondary_cpu_64)
+ start_ip = real_mode_header->trampoline_start64;
+#endif
+ idle->thread.sp = (unsigned long)task_pt_regs(idle);
initial_code = (unsigned long)start_secondary;
- stack_start.sp = (void *) c_idle.idle->thread.sp;
- /* start_ip had better be page-aligned! */
- start_ip = setup_trampoline();
+ if (IS_ENABLED(CONFIG_X86_32)) {
+ early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
+ initial_stack = idle->thread.sp;
+ } else if (!(smpboot_control & STARTUP_PARALLEL_MASK)) {
+ smpboot_control = cpu;
+ }
+
+ /* Enable the espfix hack for this CPU */
+ init_espfix_ap(cpu);
- /* So we see what's up */
- printk(KERN_INFO "Booting processor %d APIC 0x%x ip 0x%lx\n",
- cpu, apicid, start_ip);
+ /* So we see what's up */
+ announce_cpu(cpu, apicid);
/*
* This grunge runs the startup process for
* the targeted processor.
*/
-
- atomic_set(&init_deasserted, 0);
-
- if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
+ if (x86_platform.legacy.warm_reset) {
pr_debug("Setting warm reset code and vector.\n");
@@ -753,107 +944,51 @@ do_rest:
/*
* Be paranoid about clearing APIC errors.
*/
- if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
+ if (APIC_INTEGRATED(boot_cpu_apic_version)) {
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
}
}
+ smp_mb();
+
/*
- * Kick the secondary CPU. Use the method in the APIC driver
- * if it's defined - or use an INIT boot APIC message otherwise:
+ * Wake up a CPU in different cases:
+ * - Use a method from the APIC driver if one defined, with wakeup
+ * straight to 64-bit mode preferred over wakeup to RM.
+ * Otherwise,
+ * - Use an INIT boot APIC message
*/
- if (apic->wakeup_secondary_cpu)
- boot_error = apic->wakeup_secondary_cpu(apicid, start_ip);
+ if (apic->wakeup_secondary_cpu_64)
+ ret = apic->wakeup_secondary_cpu_64(apicid, start_ip, cpu);
+ else if (apic->wakeup_secondary_cpu)
+ ret = apic->wakeup_secondary_cpu(apicid, start_ip, cpu);
else
- boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip);
-
- if (!boot_error) {
- /*
- * allow APs to start initializing.
- */
- pr_debug("Before Callout %d.\n", cpu);
- cpumask_set_cpu(cpu, cpu_callout_mask);
- pr_debug("After Callout %d.\n", cpu);
-
- /*
- * Wait 5s total for a response
- */
- for (timeout = 0; timeout < 50000; timeout++) {
- if (cpumask_test_cpu(cpu, cpu_callin_mask))
- break; /* It has booted */
- udelay(100);
- }
-
- if (cpumask_test_cpu(cpu, cpu_callin_mask)) {
- /* number CPUs logically, starting from 1 (BSP is 0) */
- pr_debug("OK.\n");
- printk(KERN_INFO "CPU%d: ", cpu);
- print_cpu_info(&cpu_data(cpu));
- pr_debug("CPU has booted.\n");
- } else {
- boot_error = 1;
- if (*((volatile unsigned char *)trampoline_base)
- == 0xA5)
- /* trampoline started but...? */
- printk(KERN_ERR "Stuck ??\n");
- else
- /* trampoline code not run */
- printk(KERN_ERR "Not responding.\n");
- if (apic->inquire_remote_apic)
- apic->inquire_remote_apic(apicid);
- }
- }
-
- if (boot_error) {
- /* Try to put things back the way they were before ... */
- numa_remove_cpu(cpu); /* was set by numa_add_cpu */
-
- /* was set by do_boot_cpu() */
- cpumask_clear_cpu(cpu, cpu_callout_mask);
-
- /* was set by cpu_init() */
- cpumask_clear_cpu(cpu, cpu_initialized_mask);
+ ret = wakeup_secondary_cpu_via_init(apicid, start_ip, cpu);
- set_cpu_present(cpu, false);
- per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;
- }
-
- /* mark "stuck" area as not stuck */
- *((volatile unsigned long *)trampoline_base) = 0;
-
- if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
- /*
- * Cleanup possible dangling ends...
- */
- smpboot_restore_warm_reset_vector();
- }
-
- return boot_error;
+ /* If the wakeup mechanism failed, cleanup the warm reset vector */
+ if (ret)
+ arch_cpuhp_cleanup_kick_cpu(cpu);
+ return ret;
}
-int __cpuinit native_cpu_up(unsigned int cpu)
+int native_kick_ap(unsigned int cpu, struct task_struct *tidle)
{
- int apicid = apic->cpu_present_to_apicid(cpu);
- unsigned long flags;
+ u32 apicid = apic->cpu_present_to_apicid(cpu);
int err;
- WARN_ON(irqs_disabled());
+ lockdep_assert_irqs_enabled();
pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu);
- if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid ||
- !physid_isset(apicid, phys_cpu_present_map)) {
- printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu);
+ if (apicid == BAD_APICID || !apic_id_valid(apicid)) {
+ pr_err("CPU %u has invalid APIC ID %x. Aborting bringup\n", cpu, apicid);
return -EINVAL;
}
- /*
- * Already booted CPU?
- */
- if (cpumask_test_cpu(cpu, cpu_callin_mask)) {
- pr_debug("do_boot_cpu %d Already started\n", cpu);
- return -ENOSYS;
+ if (!test_bit(apicid, phys_cpu_present_map)) {
+ pr_err("CPU %u APIC ID %x is not present. Aborting bringup\n", cpu, apicid);
+ return -EINVAL;
}
/*
@@ -862,462 +997,415 @@ int __cpuinit native_cpu_up(unsigned int cpu)
*/
mtrr_save_state();
- per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
-
-#ifdef CONFIG_X86_32
- /* init low mem mapping */
- clone_pgd_range(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY,
- min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY));
- flush_tlb_all();
- low_mappings = 1;
-
- err = do_boot_cpu(apicid, cpu);
-
- zap_low_mappings(false);
- low_mappings = 0;
-#else
- err = do_boot_cpu(apicid, cpu);
-#endif
- if (err) {
- pr_debug("do_boot_cpu failed %d\n", err);
- return -EIO;
- }
+ /* the FPU context is blank, nobody can own it */
+ per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL;
- /*
- * Check TSC synchronization with the AP (keep irqs disabled
- * while doing so):
- */
- local_irq_save(flags);
- check_tsc_sync_source(cpu);
- local_irq_restore(flags);
+ err = common_cpu_up(cpu, tidle);
+ if (err)
+ return err;
- while (!cpu_online(cpu)) {
- cpu_relax();
- touch_nmi_watchdog();
- }
+ err = do_boot_cpu(apicid, cpu, tidle);
+ if (err)
+ pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu);
- return 0;
+ return err;
}
-/*
- * Fall back to non SMP mode after errors.
- *
- * RED-PEN audit/test this more. I bet there is more state messed up here.
- */
-static __init void disable_smp(void)
+int arch_cpuhp_kick_ap_alive(unsigned int cpu, struct task_struct *tidle)
{
- init_cpu_present(cpumask_of(0));
- init_cpu_possible(cpumask_of(0));
- smpboot_clear_io_apic_irqs();
-
- if (smp_found_config)
- physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
- else
- physid_set_mask_of_physid(0, &phys_cpu_present_map);
- map_cpu_to_logical_apicid();
- cpumask_set_cpu(0, cpu_sibling_mask(0));
- cpumask_set_cpu(0, cpu_core_mask(0));
+ return smp_ops.kick_ap_alive(cpu, tidle);
}
-/*
- * Various sanity checks.
- */
-static int __init smp_sanity_check(unsigned max_cpus)
+void arch_cpuhp_cleanup_kick_cpu(unsigned int cpu)
{
- preempt_disable();
-
-#if !defined(CONFIG_X86_BIGSMP) && defined(CONFIG_X86_32)
- if (def_to_bigsmp && nr_cpu_ids > 8) {
- unsigned int cpu;
- unsigned nr;
+ /* Cleanup possible dangling ends... */
+ if (smp_ops.kick_ap_alive == native_kick_ap && x86_platform.legacy.warm_reset)
+ smpboot_restore_warm_reset_vector();
+}
- printk(KERN_WARNING
- "More than 8 CPUs detected - skipping them.\n"
- "Use CONFIG_X86_BIGSMP.\n");
+void arch_cpuhp_cleanup_dead_cpu(unsigned int cpu)
+{
+ if (smp_ops.cleanup_dead_cpu)
+ smp_ops.cleanup_dead_cpu(cpu);
- nr = 0;
- for_each_present_cpu(cpu) {
- if (nr >= 8)
- set_cpu_present(cpu, false);
- nr++;
- }
+ if (system_state == SYSTEM_RUNNING)
+ pr_info("CPU %u is now offline\n", cpu);
+}
- nr = 0;
- for_each_possible_cpu(cpu) {
- if (nr >= 8)
- set_cpu_possible(cpu, false);
- nr++;
- }
+void arch_cpuhp_sync_state_poll(void)
+{
+ if (smp_ops.poll_sync_state)
+ smp_ops.poll_sync_state();
+}
- nr_cpu_ids = 8;
- }
-#endif
+/**
+ * arch_disable_smp_support() - Disables SMP support for x86 at boottime
+ */
+void __init arch_disable_smp_support(void)
+{
+ disable_ioapic_support();
+}
- if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
- printk(KERN_WARNING
- "weird, boot CPU (#%d) not listed by the BIOS.\n",
- hard_smp_processor_id());
+/*
+ * Fall back to non SMP mode after errors.
+ *
+ * RED-PEN audit/test this more. I bet there is more state messed up here.
+ */
+static __init void disable_smp(void)
+{
+ pr_info("SMP disabled\n");
- physid_set(hard_smp_processor_id(), phys_cpu_present_map);
- }
+ disable_ioapic_support();
+ topology_reset_possible_cpus_up();
- /*
- * If we couldn't find an SMP configuration at boot time,
- * get out of here now!
- */
- if (!smp_found_config && !acpi_lapic) {
- preempt_enable();
- printk(KERN_NOTICE "SMP motherboard not detected.\n");
- disable_smp();
- if (APIC_init_uniprocessor())
- printk(KERN_NOTICE "Local APIC not detected."
- " Using dummy APIC emulation.\n");
- return -1;
- }
+ cpumask_set_cpu(0, topology_sibling_cpumask(0));
+ cpumask_set_cpu(0, topology_core_cpumask(0));
+ cpumask_set_cpu(0, topology_die_cpumask(0));
+}
- /*
- * Should not be necessary because the MP table should list the boot
- * CPU too, but we do it for the sake of robustness anyway.
- */
- if (!apic->check_phys_apicid_present(boot_cpu_physical_apicid)) {
- printk(KERN_NOTICE
- "weird, boot CPU (#%d) not listed by the BIOS.\n",
- boot_cpu_physical_apicid);
- physid_set(hard_smp_processor_id(), phys_cpu_present_map);
- }
- preempt_enable();
+void __init smp_prepare_cpus_common(void)
+{
+ unsigned int cpu, node;
- /*
- * If we couldn't find a local APIC, then get out of here now!
- */
- if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) &&
- !cpu_has_apic) {
- if (!disable_apic) {
- pr_err("BIOS bug, local APIC #%d not detected!...\n",
- boot_cpu_physical_apicid);
- pr_err("... forcing use of dummy APIC emulation."
- "(tell your hw vendor)\n");
- }
- smpboot_clear_io_apic();
- arch_disable_smp_support();
- return -1;
+ /* Mark all except the boot CPU as hotpluggable */
+ for_each_possible_cpu(cpu) {
+ if (cpu)
+ per_cpu(cpu_info.cpu_index, cpu) = nr_cpu_ids;
}
- verify_local_APIC();
+ for_each_possible_cpu(cpu) {
+ node = cpu_to_node(cpu);
- /*
- * If SMP should be disabled, then really disable it!
- */
- if (!max_cpus) {
- printk(KERN_INFO "SMP mode deactivated.\n");
- smpboot_clear_io_apic();
-
- localise_nmi_watchdog();
-
- connect_bsp_APIC();
- setup_local_APIC();
- end_local_APIC_setup();
- return -1;
+ zalloc_cpumask_var_node(&per_cpu(cpu_sibling_map, cpu), GFP_KERNEL, node);
+ zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu), GFP_KERNEL, node);
+ zalloc_cpumask_var_node(&per_cpu(cpu_die_map, cpu), GFP_KERNEL, node);
+ zalloc_cpumask_var_node(&per_cpu(cpu_llc_shared_map, cpu), GFP_KERNEL, node);
+ zalloc_cpumask_var_node(&per_cpu(cpu_l2c_shared_map, cpu), GFP_KERNEL, node);
}
- return 0;
+ set_cpu_sibling_map(0);
}
-static void __init smp_cpu_index_default(void)
+void __init smp_prepare_boot_cpu(void)
{
- int i;
- struct cpuinfo_x86 *c;
+ smp_ops.smp_prepare_boot_cpu();
+}
- for_each_possible_cpu(i) {
- c = &cpu_data(i);
- /* mark all to hotplug */
- c->cpu_index = nr_cpu_ids;
+#ifdef CONFIG_X86_64
+/* Establish whether parallel bringup can be supported. */
+bool __init arch_cpuhp_init_parallel_bringup(void)
+{
+ if (!x86_cpuinit.parallel_bringup) {
+ pr_info("Parallel CPU startup disabled by the platform\n");
+ return false;
}
+
+ smpboot_control = STARTUP_READ_APICID;
+ pr_debug("Parallel CPU startup enabled: 0x%08x\n", smpboot_control);
+ return true;
}
+#endif
/*
- * Prepare for SMP bootup. The MP table or ACPI has been read
- * earlier. Just do some sanity checking here and enable APIC mode.
+ * Prepare for SMP bootup.
+ * @max_cpus: configured maximum number of CPUs. It is a legacy parameter
+ * for common interface support.
*/
void __init native_smp_prepare_cpus(unsigned int max_cpus)
{
- unsigned int i;
+ smp_prepare_cpus_common();
- preempt_disable();
- smp_cpu_index_default();
- current_cpu_data = boot_cpu_data;
- cpumask_copy(cpu_callin_mask, cpumask_of(0));
- mb();
- /*
- * Setup boot CPU information
- */
- smp_store_cpu_info(0); /* Final full version of the data */
-#ifdef CONFIG_X86_32
- boot_cpu_logical_apicid = logical_smp_processor_id();
-#endif
- current_thread_info()->cpu = 0; /* needed? */
- for_each_possible_cpu(i) {
- alloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
- alloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
- alloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL);
- cpumask_clear(per_cpu(cpu_core_map, i));
- cpumask_clear(per_cpu(cpu_sibling_map, i));
- cpumask_clear(cpu_data(i).llc_shared_map);
- }
- set_cpu_sibling_map(0);
-
- enable_IR_x2apic();
-#ifdef CONFIG_X86_64
- default_setup_apic_routing();
-#endif
-
- if (smp_sanity_check(max_cpus) < 0) {
- printk(KERN_INFO "SMP disabled\n");
+ switch (apic_intr_mode) {
+ case APIC_PIC:
+ case APIC_VIRTUAL_WIRE_NO_CONFIG:
disable_smp();
- goto out;
- }
-
- preempt_disable();
- if (read_apic_id() != boot_cpu_physical_apicid) {
- panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
- read_apic_id(), boot_cpu_physical_apicid);
- /* Or can we switch back to PIC here? */
+ return;
+ case APIC_SYMMETRIC_IO_NO_ROUTING:
+ disable_smp();
+ /* Setup local timer */
+ x86_init.timers.setup_percpu_clockev();
+ return;
+ case APIC_VIRTUAL_WIRE:
+ case APIC_SYMMETRIC_IO:
+ break;
}
- preempt_enable();
- connect_bsp_APIC();
+ /* Setup local timer */
+ x86_init.timers.setup_percpu_clockev();
- /*
- * Switch from PIC to APIC mode.
- */
- setup_local_APIC();
-
- /*
- * Enable IO APIC before setting up error vector
- */
- if (!skip_ioapic_setup && nr_ioapics)
- enable_IO_APIC();
+ pr_info("CPU0: ");
+ print_cpu_info(&cpu_data(0));
- end_local_APIC_setup();
+ uv_system_init();
- map_cpu_to_logical_apicid();
+ smp_set_init_udelay();
- if (apic->setup_portio_remap)
- apic->setup_portio_remap();
+ speculative_store_bypass_ht_init();
- smpboot_setup_io_apic();
- /*
- * Set up local APIC timer on boot CPU.
- */
+ snp_set_wakeup_secondary_cpu();
+}
- printk(KERN_INFO "CPU%d: ", 0);
- print_cpu_info(&cpu_data(0));
- setup_boot_clock();
+void arch_thaw_secondary_cpus_begin(void)
+{
+ set_cache_aps_delayed_init(true);
+}
- if (is_uv_system())
- uv_system_init();
-out:
- preempt_enable();
+void arch_thaw_secondary_cpus_end(void)
+{
+ cache_aps_init();
}
+
/*
* Early setup to make printk work.
*/
void __init native_smp_prepare_boot_cpu(void)
{
int me = smp_processor_id();
- switch_to_new_gdt(me);
- /* already set me in cpu_online_mask in boot_cpu_init() */
- cpumask_set_cpu(me, cpu_callout_mask);
- per_cpu(cpu_state, me) = CPU_ONLINE;
+
+ /* SMP handles this from setup_per_cpu_areas() */
+ if (!IS_ENABLED(CONFIG_SMP))
+ switch_gdt_and_percpu_base(me);
+
+ native_pv_lock_init();
}
void __init native_smp_cpus_done(unsigned int max_cpus)
{
- pr_debug("Boot done.\n");
+ pr_debug("Boot done\n");
+ build_sched_topology();
+ nmi_selftest();
impress_friends();
-#ifdef CONFIG_X86_IO_APIC
- setup_ioapic_dest();
-#endif
- check_nmi_watchdog();
+ cache_aps_init();
}
-static int __initdata setup_possible_cpus = -1;
-static int __init _setup_possible_cpus(char *str)
+/* correctly size the local cpu masks */
+void __init setup_cpu_local_masks(void)
{
- get_option(&str, &setup_possible_cpus);
- return 0;
+ alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
}
-early_param("possible_cpus", _setup_possible_cpus);
+#ifdef CONFIG_HOTPLUG_CPU
-/*
- * cpu_possible_mask should be static, it cannot change as cpu's
- * are onlined, or offlined. The reason is per-cpu data-structures
- * are allocated by some modules at init time, and dont expect to
- * do this dynamically on cpu arrival/departure.
- * cpu_present_mask on the other hand can change dynamically.
- * In case when cpu_hotplug is not compiled, then we resort to current
- * behaviour, which is cpu_possible == cpu_present.
- * - Ashok Raj
- *
- * Three ways to find out the number of additional hotplug CPUs:
- * - If the BIOS specified disabled CPUs in ACPI/mptables use that.
- * - The user can overwrite it with possible_cpus=NUM
- * - Otherwise don't reserve additional CPUs.
- * We do this because additional CPUs waste a lot of memory.
- * -AK
- */
-__init void prefill_possible_map(void)
+/* Recompute SMT state for all CPUs on offline */
+static void recompute_smt_state(void)
{
- int i, possible;
-
- /* no processor from mptable or madt */
- if (!num_processors)
- num_processors = 1;
+ int max_threads, cpu;
- if (setup_possible_cpus == -1)
- possible = num_processors + disabled_cpus;
- else
- possible = setup_possible_cpus;
-
- total_cpus = max_t(int, possible, num_processors + disabled_cpus);
+ max_threads = 0;
+	for_each_online_cpu(cpu) {
+ int threads = cpumask_weight(topology_sibling_cpumask(cpu));
- if (possible > CONFIG_NR_CPUS) {
- printk(KERN_WARNING
- "%d Processors exceeds NR_CPUS limit of %d\n",
- possible, CONFIG_NR_CPUS);
- possible = CONFIG_NR_CPUS;
+ if (threads > max_threads)
+ max_threads = threads;
}
-
- printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
- possible, max_t(int, possible - num_processors, 0));
-
- for (i = 0; i < possible; i++)
- set_cpu_possible(i, true);
-
- nr_cpu_ids = possible;
+ __max_smt_threads = max_threads;
}
-#ifdef CONFIG_HOTPLUG_CPU
-
static void remove_siblinginfo(int cpu)
{
int sibling;
struct cpuinfo_x86 *c = &cpu_data(cpu);
- for_each_cpu(sibling, cpu_core_mask(cpu)) {
- cpumask_clear_cpu(cpu, cpu_core_mask(sibling));
+ for_each_cpu(sibling, topology_core_cpumask(cpu)) {
+ cpumask_clear_cpu(cpu, topology_core_cpumask(sibling));
		/*
* last thread sibling in this cpu core going down
*/
- if (cpumask_weight(cpu_sibling_mask(cpu)) == 1)
+ if (cpumask_weight(topology_sibling_cpumask(cpu)) == 1)
cpu_data(sibling).booted_cores--;
}
- for_each_cpu(sibling, cpu_sibling_mask(cpu))
- cpumask_clear_cpu(cpu, cpu_sibling_mask(sibling));
- cpumask_clear(cpu_sibling_mask(cpu));
- cpumask_clear(cpu_core_mask(cpu));
- c->phys_proc_id = 0;
- c->cpu_core_id = 0;
+ for_each_cpu(sibling, topology_die_cpumask(cpu))
+ cpumask_clear_cpu(cpu, topology_die_cpumask(sibling));
+
+ for_each_cpu(sibling, topology_sibling_cpumask(cpu)) {
+ cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling));
+ if (cpumask_weight(topology_sibling_cpumask(sibling)) == 1)
+ cpu_data(sibling).smt_active = false;
+ }
+
+ for_each_cpu(sibling, cpu_llc_shared_mask(cpu))
+ cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling));
+ for_each_cpu(sibling, cpu_l2c_shared_mask(cpu))
+ cpumask_clear_cpu(cpu, cpu_l2c_shared_mask(sibling));
+ cpumask_clear(cpu_llc_shared_mask(cpu));
+ cpumask_clear(cpu_l2c_shared_mask(cpu));
+ cpumask_clear(topology_sibling_cpumask(cpu));
+ cpumask_clear(topology_core_cpumask(cpu));
+ cpumask_clear(topology_die_cpumask(cpu));
+ c->topo.core_id = 0;
+ c->booted_cores = 0;
cpumask_clear_cpu(cpu, cpu_sibling_setup_mask);
+ recompute_smt_state();
}
-static void __ref remove_cpu_from_maps(int cpu)
+static void remove_cpu_from_maps(int cpu)
{
set_cpu_online(cpu, false);
- cpumask_clear_cpu(cpu, cpu_callout_mask);
- cpumask_clear_cpu(cpu, cpu_callin_mask);
- /* was set by cpu_init() */
- cpumask_clear_cpu(cpu, cpu_initialized_mask);
numa_remove_cpu(cpu);
}
void cpu_disable_common(void)
{
int cpu = smp_processor_id();
- /*
- * HACK:
- * Allow any queued timer interrupts to get serviced
- * This is only a temporary solution until we cleanup
- * fixup_irqs as we do for IA64.
- */
- local_irq_enable();
- mdelay(1);
- local_irq_disable();
remove_siblinginfo(cpu);
+ /*
+ * Stop allowing kernel-mode FPU. This is needed so that if the CPU is
+ * brought online again, the initial state is not allowed:
+ */
+ this_cpu_write(kernel_fpu_allowed, false);
+
/* It's now safe to remove this processor from the online map */
lock_vector_lock();
remove_cpu_from_maps(cpu);
unlock_vector_lock();
fixup_irqs();
+ lapic_offline();
}
int native_cpu_disable(void)
{
- int cpu = smp_processor_id();
-
- /*
- * Perhaps use cpufreq to drop frequency, but that could go
- * into generic code.
- *
- * We won't take down the boot processor on i386 due to some
- * interrupts only being able to be serviced by the BSP.
- * Especially so if we're not using an IOAPIC -zwane
- */
- if (cpu == 0)
- return -EBUSY;
+ int ret;
- if (nmi_watchdog == NMI_LOCAL_APIC)
- stop_apic_nmi_watchdog(NULL);
- clear_local_APIC();
+ ret = lapic_can_unplug_cpu();
+ if (ret)
+ return ret;
cpu_disable_common();
+
+ /*
+ * Disable the local APIC. Otherwise IPI broadcasts will reach
+ * it. It still responds normally to INIT, NMI, SMI, and SIPI
+ * messages.
+ *
+ * Disabling the APIC must happen after cpu_disable_common()
+ * which invokes fixup_irqs().
+ *
+ * Disabling the APIC preserves already set bits in IRR, but
+ * an interrupt arriving after disabling the local APIC does not
+ * set the corresponding IRR bit.
+ *
+ * fixup_irqs() scans IRR for set bits so it can raise a not
+ * yet handled interrupt on the new destination CPU via an IPI
+ * but obviously it can't do so for IRR bits which are not set.
+ * IOW, interrupts arriving after disabling the local APIC will
+ * be lost.
+ */
+ apic_soft_disable();
+
return 0;
}
-void native_cpu_die(unsigned int cpu)
+void play_dead_common(void)
+{
+ idle_task_exit();
+
+ cpuhp_ap_report_dead();
+
+ local_irq_disable();
+}
+
+/*
+ * We need to flush the caches before going to sleep, lest we have
+ * dirty data in our caches when we come back up.
+ */
+void __noreturn mwait_play_dead(unsigned int eax_hint)
{
- /* We don't do anything here: idle task is faking death itself. */
- unsigned int i;
-
- for (i = 0; i < 10; i++) {
- /* They ack this in play_dead by setting CPU_DEAD */
- if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
- printk(KERN_INFO "CPU %d is now offline\n", cpu);
- if (1 == num_online_cpus())
- alternatives_smp_switch(0);
- return;
+ struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead);
+
+ /* Set up state for the kexec() hack below */
+ md->status = CPUDEAD_MWAIT_WAIT;
+ md->control = CPUDEAD_MWAIT_WAIT;
+
+ wbinvd();
+
+ while (1) {
+ /*
+ * The CLFLUSH is a workaround for erratum AAI65 for
+ * the Xeon 7400 series. It's not clear it is actually
+ * needed, but it should be harmless in either case.
+ * The WBINVD is insufficient due to the spurious-wakeup
+ * case where we return around the loop.
+ */
+ mb();
+ clflush(md);
+ mb();
+ __monitor(md, 0, 0);
+ mb();
+ __mwait(eax_hint, 0);
+
+ if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) {
+ /*
+ * Kexec is about to happen. Don't go back into mwait() as
+ * the kexec kernel might overwrite text and data including
+ * page tables and stack. So mwait() would resume when the
+ * monitor cache line is written to and then the CPU goes
+ * south due to overwritten text, page tables and stack.
+ *
+ * Note: This does _NOT_ protect against a stray MCE, NMI,
+ * SMI. They will resume execution at the instruction
+ * following the HLT instruction and run into the problem
+ * which this is trying to prevent.
+ */
+ WRITE_ONCE(md->status, CPUDEAD_MWAIT_KEXEC_HLT);
+			while (1)
+ native_halt();
}
- msleep(100);
}
- printk(KERN_ERR "CPU %u didn't die...\n", cpu);
}
-void play_dead_common(void)
+/*
+ * Kick all "offline" CPUs out of mwait on kexec(). See comment in
+ * mwait_play_dead().
+ */
+void smp_kick_mwait_play_dead(void)
{
- idle_task_exit();
- reset_lazy_tlbstate();
- irq_ctx_exit(raw_smp_processor_id());
- c1e_remove_cpu(raw_smp_processor_id());
+ u32 newstate = CPUDEAD_MWAIT_KEXEC_HLT;
+ struct mwait_cpu_dead *md;
+ unsigned int cpu, i;
+
+ for_each_cpu_andnot(cpu, cpu_present_mask, cpu_online_mask) {
+ md = per_cpu_ptr(&mwait_cpu_dead, cpu);
+
+ /* Does it sit in mwait_play_dead() ? */
+ if (READ_ONCE(md->status) != CPUDEAD_MWAIT_WAIT)
+ continue;
+
+ /* Wait up to 5ms */
+ for (i = 0; READ_ONCE(md->status) != newstate && i < 1000; i++) {
+ /* Bring it out of mwait */
+ WRITE_ONCE(md->control, newstate);
+ udelay(5);
+ }
- mb();
- /* Ack it */
- __get_cpu_var(cpu_state) = CPU_DEAD;
+ if (READ_ONCE(md->status) != newstate)
+ pr_err_once("CPU%u is stuck in mwait_play_dead()\n", cpu);
+ }
+}
- /*
- * With physical CPU hotplug, we should halt the cpu
- */
- local_irq_disable();
+void __noreturn hlt_play_dead(void)
+{
+ if (__this_cpu_read(cpu_info.x86) >= 4)
+ wbinvd();
+
+ while (1)
+ native_halt();
}
-void native_play_dead(void)
+void __noreturn native_play_dead(void)
{
+ if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS))
+ __update_spec_ctrl(0);
+
play_dead_common();
- wbinvd_halt();
+ tboot_shutdown(TB_SHUTDOWN_WFS);
+
+ /* Below returns only on error. */
+ cpuidle_play_dead();
+ hlt_play_dead();
}
#else /* ... !CONFIG_HOTPLUG_CPU */
@@ -1326,13 +1414,7 @@ int native_cpu_disable(void)
return -ENOSYS;
}
-void native_cpu_die(unsigned int cpu)
-{
- /* We said "no" in __cpu_disable */
- BUG();
-}
-
-void native_play_dead(void)
+void __noreturn native_play_dead(void)
{
BUG();
}
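
For readers of the mwait_play_dead()/smp_kick_mwait_play_dead() hunks above: the two sides talk through a per-CPU pair of words, md->status (written by the parked CPU) and md->control (written by the kexec path). The parked CPU publishes CPUDEAD_MWAIT_WAIT and monitors md->control; the kexec path writes CPUDEAD_MWAIT_KEXEC_HLT there to break the MWAIT and waits for the same value to come back in md->status. The C sketch below is only a user-space illustration of that handshake (build with -pthread); the names and values are stand-ins, not kernel code.

/* Hedged sketch: user-space analogue of the status/control handshake,
 * one thread per side.  MWAIT_WAIT/MWAIT_KEXEC_HLT are stand-in values;
 * the spin loops stand in for MONITOR/MWAIT and HLT. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

enum { MWAIT_WAIT = 1, MWAIT_KEXEC_HLT = 2 };

static struct {
	_Atomic unsigned int control;
	_Atomic unsigned int status;
} md;

static void *parked_cpu(void *arg)
{
	(void)arg;
	/* mwait_play_dead(): advertise that we are parked, wait for a kick */
	atomic_store(&md.status, MWAIT_WAIT);
	while (atomic_load(&md.control) != MWAIT_KEXEC_HLT)
		;				/* monitor/mwait in the kernel */
	/* acknowledge; the kernel would sit in HLT from here on */
	atomic_store(&md.status, MWAIT_KEXEC_HLT);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, parked_cpu, NULL);

	/* smp_kick_mwait_play_dead(): wait for the park, kick, wait for ack */
	while (atomic_load(&md.status) != MWAIT_WAIT)
		;
	atomic_store(&md.control, MWAIT_KEXEC_HLT);
	while (atomic_load(&md.status) != MWAIT_KEXEC_HLT)
		;
	puts("parked side acknowledged the kexec kick");
	return pthread_join(t, NULL);
}
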
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index c3eb207181fe..ee117fcf46ed 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -4,127 +4,115 @@
* Copyright (C) 2006-2009 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
*/
#include <linux/sched.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/task_stack.h>
#include <linux/stacktrace.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/uaccess.h>
#include <asm/stacktrace.h>
+#include <asm/unwind.h>
-static void save_stack_warning(void *data, char *msg)
+void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie,
+ struct task_struct *task, struct pt_regs *regs)
{
-}
+ struct unwind_state state;
+ unsigned long addr;
-static void
-save_stack_warning_symbol(void *data, char *msg, unsigned long symbol)
-{
-}
-
-static int save_stack_stack(void *data, char *name)
-{
- return 0;
-}
-
-static void save_stack_address(void *data, unsigned long addr, int reliable)
-{
- struct stack_trace *trace = data;
- if (!reliable)
- return;
- if (trace->skip > 0) {
- trace->skip--;
+ if (regs && !consume_entry(cookie, regs->ip))
return;
+
+ for (unwind_start(&state, task, regs, NULL); !unwind_done(&state);
+ unwind_next_frame(&state)) {
+ addr = unwind_get_return_address(&state);
+ if (!addr || !consume_entry(cookie, addr))
+ break;
}
- if (trace->nr_entries < trace->max_entries)
- trace->entries[trace->nr_entries++] = addr;
}
-static void
-save_stack_address_nosched(void *data, unsigned long addr, int reliable)
+int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry,
+ void *cookie, struct task_struct *task)
{
- struct stack_trace *trace = (struct stack_trace *)data;
- if (!reliable)
- return;
- if (in_sched_functions(addr))
- return;
- if (trace->skip > 0) {
- trace->skip--;
- return;
- }
- if (trace->nr_entries < trace->max_entries)
- trace->entries[trace->nr_entries++] = addr;
-}
+ struct unwind_state state;
+ struct pt_regs *regs;
+ unsigned long addr;
+
+ for (unwind_start(&state, task, NULL, NULL);
+ !unwind_done(&state) && !unwind_error(&state);
+ unwind_next_frame(&state)) {
+
+ regs = unwind_get_entry_regs(&state, NULL);
+ if (regs) {
+ /* Success path for user tasks */
+ if (user_mode(regs))
+ return 0;
+
+ /*
+ * Kernel mode registers on the stack indicate an
+ * in-kernel interrupt or exception (e.g., preemption
+ * or a page fault), which can make frame pointers
+ * unreliable.
+ */
+ if (IS_ENABLED(CONFIG_FRAME_POINTER))
+ return -EINVAL;
+ }
-static const struct stacktrace_ops save_stack_ops = {
- .warning = save_stack_warning,
- .warning_symbol = save_stack_warning_symbol,
- .stack = save_stack_stack,
- .address = save_stack_address,
-};
+ addr = unwind_get_return_address(&state);
-static const struct stacktrace_ops save_stack_ops_nosched = {
- .warning = save_stack_warning,
- .warning_symbol = save_stack_warning_symbol,
- .stack = save_stack_stack,
- .address = save_stack_address_nosched,
-};
+ /*
+ * A NULL or invalid return address probably means there's some
+ * generated code which __kernel_text_address() doesn't know
+ * about.
+ */
+ if (!addr)
+ return -EINVAL;
-/*
- * Save stack-backtrace addresses into a stack_trace buffer.
- */
-void save_stack_trace(struct stack_trace *trace)
-{
- dump_trace(current, NULL, NULL, 0, &save_stack_ops, trace);
- if (trace->nr_entries < trace->max_entries)
- trace->entries[trace->nr_entries++] = ULONG_MAX;
-}
-EXPORT_SYMBOL_GPL(save_stack_trace);
+ if (!consume_entry(cookie, addr))
+ return -EINVAL;
+ }
-void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp)
-{
- dump_trace(current, NULL, NULL, bp, &save_stack_ops, trace);
- if (trace->nr_entries < trace->max_entries)
- trace->entries[trace->nr_entries++] = ULONG_MAX;
-}
+ /* Check for stack corruption */
+ if (unwind_error(&state))
+ return -EINVAL;
-void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
-{
- dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace);
- if (trace->nr_entries < trace->max_entries)
- trace->entries[trace->nr_entries++] = ULONG_MAX;
+ return 0;
}
-EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */
-struct stack_frame {
+struct stack_frame_user {
const void __user *next_fp;
unsigned long ret_addr;
};
-static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
+static int
+copy_stack_frame(const struct stack_frame_user __user *fp,
+ struct stack_frame_user *frame)
{
int ret;
- if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
+ if (!__access_ok(fp, sizeof(*frame)))
return 0;
ret = 1;
pagefault_disable();
- if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
+ if (__get_user(frame->next_fp, &fp->next_fp) ||
+ __get_user(frame->ret_addr, &fp->ret_addr))
ret = 0;
pagefault_enable();
return ret;
}
-static inline void __save_stack_trace_user(struct stack_trace *trace)
+void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie,
+ const struct pt_regs *regs)
{
- const struct pt_regs *regs = task_pt_regs(current);
const void __user *fp = (const void __user *)regs->bp;
- if (trace->nr_entries < trace->max_entries)
- trace->entries[trace->nr_entries++] = regs->ip;
+ if (!consume_entry(cookie, regs->ip))
+ return;
- while (trace->nr_entries < trace->max_entries) {
- struct stack_frame frame;
+ while (1) {
+ struct stack_frame_user frame;
frame.next_fp = NULL;
frame.ret_addr = 0;
@@ -132,25 +120,11 @@ static inline void __save_stack_trace_user(struct stack_trace *trace)
break;
if ((unsigned long)fp < regs->sp)
break;
- if (frame.ret_addr) {
- trace->entries[trace->nr_entries++] =
- frame.ret_addr;
- }
- if (fp == frame.next_fp)
+ if (!frame.ret_addr)
+ break;
+ if (!consume_entry(cookie, frame.ret_addr))
break;
fp = frame.next_fp;
}
}
-void save_stack_trace_user(struct stack_trace *trace)
-{
- /*
- * Trace user stack if we are not a kernel thread
- */
- if (current->mm) {
- __save_stack_trace_user(trace);
- }
- if (trace->nr_entries < trace->max_entries)
- trace->entries[trace->nr_entries++] = ULONG_MAX;
-}
-
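
The new arch_stack_walk_user() above follows the classic frame-pointer chain: each frame is a pair of words, the saved frame pointer (next_fp) followed by the return address (ret_addr), and the walk stops when the chain no longer moves up the stack. As an illustration only, the sketch below does the same two-word walk on the current process's own stack; it assumes the compiler keeps frame pointers (build with -O0 -fno-omit-frame-pointer) and is not the kernel code.

/* Hedged sketch: the same next_fp/ret_addr walk, done on our own stack. */
#include <stdio.h>

struct stack_frame_user {		/* mirrors the layout used above */
	const void *next_fp;		/* saved frame pointer of the caller */
	unsigned long ret_addr;		/* return address pushed by call */
};

__attribute__((noinline)) static void walk(void)
{
	const struct stack_frame_user *fp = __builtin_frame_address(0);
	int depth = 0;

	while (fp && depth++ < 16) {
		if (!fp->ret_addr)
			break;
		printf("frame %2d: return address %#lx\n", depth, fp->ret_addr);
		if (!fp->next_fp ||
		    (const struct stack_frame_user *)fp->next_fp <= fp)
			break;		/* end of chain or not moving up */
		fp = fp->next_fp;
	}
}

__attribute__((noinline)) static void b(void) { walk(); }
__attribute__((noinline)) static void a(void) { b(); }

int main(void) { a(); return 0; }
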
diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c
new file mode 100644
index 000000000000..61592e41a6b1
--- /dev/null
+++ b/arch/x86/kernel/static_call.c
@@ -0,0 +1,231 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/static_call.h>
+#include <linux/memory.h>
+#include <linux/bug.h>
+#include <asm/text-patching.h>
+
+enum insn_type {
+ CALL = 0, /* site call */
+ NOP = 1, /* site cond-call */
+ JMP = 2, /* tramp / site tail-call */
+ RET = 3, /* tramp / site cond-tail-call */
+ JCC = 4,
+};
+
+/*
+ * ud1 %esp, %ecx - a 3 byte #UD that is unique to trampolines, chosen such
+ * that there is no false-positive trampoline identification while also being a
+ * speculation stop.
+ */
+static const u8 tramp_ud[] = { 0x0f, 0xb9, 0xcc };
+
+/*
+ * cs cs cs xorl %eax, %eax - a single 5 byte instruction that clears %[er]ax
+ */
+static const u8 xor5rax[] = { 0x2e, 0x2e, 0x2e, 0x31, 0xc0 };
+
+static const u8 retinsn[] = { RET_INSN_OPCODE, 0xcc, 0xcc, 0xcc, 0xcc };
+
+/*
+ * ud1 (%edx),%rdi -- see __WARN_trap() / decode_bug()
+ */
+static const u8 warninsn[] = { 0x67, 0x48, 0x0f, 0xb9, 0x3a };
+
+static u8 __is_Jcc(u8 *insn) /* Jcc.d32 */
+{
+ u8 ret = 0;
+
+ if (insn[0] == 0x0f) {
+ u8 tmp = insn[1];
+ if ((tmp & 0xf0) == 0x80)
+ ret = tmp;
+ }
+
+ return ret;
+}
+
+extern void __static_call_return(void);
+
+asm (".global __static_call_return\n\t"
+ ".type __static_call_return, @function\n\t"
+ ASM_FUNC_ALIGN "\n\t"
+ "__static_call_return:\n\t"
+ ANNOTATE_NOENDBR "\n\t"
+ ANNOTATE_RETPOLINE_SAFE "\n\t"
+ "ret; int3\n\t"
+ ".size __static_call_return, . - __static_call_return \n\t");
+
+static void __ref __static_call_transform(void *insn, enum insn_type type,
+ void *func, bool modinit)
+{
+ const void *emulate = NULL;
+ int size = CALL_INSN_SIZE;
+ const void *code;
+ u8 op, buf[6];
+
+ if ((type == JMP || type == RET) && (op = __is_Jcc(insn)))
+ type = JCC;
+
+ switch (type) {
+ case CALL:
+ func = callthunks_translate_call_dest(func);
+ code = text_gen_insn(CALL_INSN_OPCODE, insn, func);
+ if (func == &__static_call_return0) {
+ emulate = code;
+ code = &xor5rax;
+ }
+ if (func == &__WARN_trap) {
+ emulate = code;
+ code = &warninsn;
+ }
+ break;
+
+ case NOP:
+ code = x86_nops[5];
+ break;
+
+ case JMP:
+ code = text_gen_insn(JMP32_INSN_OPCODE, insn, func);
+ break;
+
+ case RET:
+ if (cpu_wants_rethunk_at(insn))
+ code = text_gen_insn(JMP32_INSN_OPCODE, insn, x86_return_thunk);
+ else
+ code = &retinsn;
+ break;
+
+ case JCC:
+ if (!func) {
+ func = __static_call_return;
+ if (cpu_wants_rethunk())
+ func = x86_return_thunk;
+ }
+
+ buf[0] = 0x0f;
+ __text_gen_insn(buf+1, op, insn+1, func, 5);
+ code = buf;
+ size = 6;
+
+ break;
+ }
+
+ if (memcmp(insn, code, size) == 0)
+ return;
+
+ if (system_state == SYSTEM_BOOTING || modinit)
+ return text_poke_early(insn, code, size);
+
+ smp_text_poke_single(insn, code, size, emulate);
+}
+
+static void __static_call_validate(u8 *insn, bool tail, bool tramp)
+{
+ u8 opcode = insn[0];
+
+ if (tramp && memcmp(insn+5, tramp_ud, 3)) {
+ pr_err("trampoline signature fail");
+ BUG();
+ }
+
+ if (tail) {
+ if (opcode == JMP32_INSN_OPCODE ||
+ opcode == RET_INSN_OPCODE ||
+ __is_Jcc(insn))
+ return;
+ } else {
+ if (opcode == CALL_INSN_OPCODE ||
+ !memcmp(insn, x86_nops[5], 5) ||
+ !memcmp(insn, xor5rax, 5) ||
+ !memcmp(insn, warninsn, 5))
+ return;
+ }
+
+ /*
+ * If we ever trigger this, our text is corrupt, we'll probably not live long.
+ */
+ pr_err("unexpected static_call insn opcode 0x%x at %pS\n", opcode, insn);
+ BUG();
+}
+
+static inline enum insn_type __sc_insn(bool null, bool tail)
+{
+ /*
+ * Encode the following table without branches:
+ *
+ * tail null insn
+ * -----+-------+------
+ * 0 | 0 | CALL
+ * 0 | 1 | NOP
+ * 1 | 0 | JMP
+ * 1 | 1 | RET
+ */
+ return 2*tail + null;
+}
+
+void arch_static_call_transform(void *site, void *tramp, void *func, bool tail)
+{
+ mutex_lock(&text_mutex);
+
+ if (tramp && !site) {
+ __static_call_validate(tramp, true, true);
+ __static_call_transform(tramp, __sc_insn(!func, true), func, false);
+ }
+
+ if (IS_ENABLED(CONFIG_HAVE_STATIC_CALL_INLINE) && site) {
+ __static_call_validate(site, tail, false);
+ __static_call_transform(site, __sc_insn(!func, tail), func, false);
+ }
+
+ mutex_unlock(&text_mutex);
+}
+EXPORT_SYMBOL_GPL(arch_static_call_transform);
+
+noinstr void __static_call_update_early(void *tramp, void *func)
+{
+ BUG_ON(system_state != SYSTEM_BOOTING);
+ BUG_ON(static_call_initialized);
+ __text_gen_insn(tramp, JMP32_INSN_OPCODE, tramp, func, JMP32_INSN_SIZE);
+ sync_core();
+}
+
+#ifdef CONFIG_MITIGATION_RETHUNK
+/*
+ * This is called by apply_returns() to fix up static call trampolines,
+ * specifically ARCH_DEFINE_STATIC_CALL_NULL_TRAMP which is recorded as
+ * having a return trampoline.
+ *
+ * The problem is that static_call() is available before determining
+ * X86_FEATURE_RETHUNK and, by implication, running alternatives.
+ *
+ * This means that __static_call_transform() above can have overwritten the
+ * return trampoline and we now need to fix things up to be consistent.
+ */
+bool __static_call_fixup(void *tramp, u8 op, void *dest)
+{
+ unsigned long addr = (unsigned long)tramp;
+ /*
+ * Not all .return_sites are a static_call trampoline (most are not).
+ * Check if the 3 bytes after the return are still kernel text, if not,
+ * then this definitely is not a trampoline and we need not worry
+ * further.
+ *
+ * This avoids the memcmp() below tripping over pagefaults etc..
+ */
+ if (((addr >> PAGE_SHIFT) != ((addr + 7) >> PAGE_SHIFT)) &&
+ !kernel_text_address(addr + 7))
+ return false;
+
+ if (memcmp(tramp+5, tramp_ud, 3)) {
+ /* Not a trampoline site, not our problem. */
+ return false;
+ }
+
+ mutex_lock(&text_mutex);
+ if (op == RET_INSN_OPCODE || dest == &__x86_return_thunk)
+ __static_call_transform(tramp, RET, NULL, true);
+ mutex_unlock(&text_mutex);
+
+ return true;
+}
+#endif
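
A note on __sc_insn() in the new static_call.c above: the (tail, null) pair is folded into the insn_type enum without branches as 2*tail + null, which reproduces the four-row table in its comment. The sketch below simply re-states that mapping with asserts; the enum values mirror the ones at the top of the new file, and the code is an illustration, not part of the patch.

/* Hedged sketch: check that 2*tail + null matches the __sc_insn() table. */
#include <assert.h>
#include <stdbool.h>

enum insn_type { CALL = 0, NOP = 1, JMP = 2, RET = 3 };

static enum insn_type sc_insn(bool null, bool tail)
{
	return 2 * tail + null;
}

int main(void)
{
	assert(sc_insn(false, false) == CALL);	/* site call          */
	assert(sc_insn(true,  false) == NOP);	/* site cond-call      */
	assert(sc_insn(false, true)  == JMP);	/* site tail-call      */
	assert(sc_insn(true,  true)  == RET);	/* site cond-tail-call */
	return 0;
}
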
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index e8b9863ef8c4..3e2952679b88 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -1,21 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* x86 single-step support code, common to 32-bit and 64-bit.
*/
#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
#include <linux/mm.h>
#include <linux/ptrace.h>
+#include <asm/desc.h>
+#include <asm/debugreg.h>
+#include <asm/mmu_context.h>
+
unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs)
{
unsigned long addr, seg;
addr = regs->ip;
- seg = regs->cs & 0xffff;
+ seg = regs->cs;
if (v8086_mode(regs)) {
addr = (addr & 0xffff) + (seg << 4);
return addr;
}
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
/*
* We'll assume that the code segments in the GDT
* are all zero-based. That is largely true: the
@@ -23,27 +30,27 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
* and APM bios ones we just ignore here.
*/
if ((seg & SEGMENT_TI_MASK) == SEGMENT_LDT) {
- u32 *desc;
+ struct desc_struct *desc;
unsigned long base;
- seg &= ~7UL;
+ seg >>= 3;
mutex_lock(&child->mm->context.lock);
- if (unlikely((seg >> 3) >= child->mm->context.size))
+ if (unlikely(!child->mm->context.ldt ||
+ seg >= child->mm->context.ldt->nr_entries))
addr = -1L; /* bogus selector, access would fault */
else {
- desc = child->mm->context.ldt + seg;
- base = ((desc[0] >> 16) |
- ((desc[1] & 0xff) << 16) |
- (desc[1] & 0xff000000));
+ desc = &child->mm->context.ldt->entries[seg];
+ base = get_desc_base(desc);
/* 16-bit code segment? */
- if (!((desc[1] >> 22) & 1))
+ if (!desc->d)
addr &= 0xffff;
addr += base;
}
mutex_unlock(&child->mm->context.lock);
}
+#endif
return addr;
}
@@ -54,7 +61,8 @@ static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
unsigned char opcode[15];
unsigned long addr = convert_ip_to_linear(child, regs);
- copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0);
+ copied = access_process_vm(child, addr, opcode, sizeof(opcode),
+ FOLL_FORCE);
for (i = 0; i < copied; i++) {
switch (opcode[i]) {
/* popf and iret */
@@ -75,7 +83,7 @@ static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
#ifdef CONFIG_X86_64
case 0x40 ... 0x4f:
- if (regs->cs != __USER_CS)
+ if (!user_64bit_mode(regs))
/* 32-bit mode: register increment */
return 0;
/* 64-bit mode: REX prefix */
@@ -121,12 +129,17 @@ static int enable_single_step(struct task_struct *child)
regs->flags |= X86_EFLAGS_TF;
/*
- * Always set TIF_SINGLESTEP - this guarantees that
- * we single-step system calls etc.. This will also
+ * Always set TIF_SINGLESTEP. This will also
* cause us to set TF when returning to user mode.
*/
set_tsk_thread_flag(child, TIF_SINGLESTEP);
+ /*
+ * Ensure that a trap is triggered once stepping out of a system
+ * call prior to executing any user instruction.
+ */
+ set_task_syscall_work(child, SYSCALL_EXIT_TRAP);
+
oflags = regs->flags;
/* Set TF on the kernel stack.. */
@@ -158,20 +171,31 @@ static int enable_single_step(struct task_struct *child)
return 1;
}
-/*
- * Install this value in MSR_IA32_DEBUGCTLMSR whenever child is running.
- */
-static void write_debugctlmsr(struct task_struct *child, unsigned long val)
+void set_task_blockstep(struct task_struct *task, bool on)
{
- if (child->thread.debugctlmsr == val)
- return;
-
- child->thread.debugctlmsr = val;
-
- if (child != current)
- return;
+ unsigned long debugctl;
- update_debugctlmsr(val);
+ /*
+ * Ensure irq/preemption can't change debugctl in between.
+ * Note also that both TIF_BLOCKSTEP and debugctl should
+ * be changed atomically wrt preemption.
+ *
+ * NOTE: this means that set/clear TIF_BLOCKSTEP is only safe if
+ * task is current or it can't be running, otherwise we can race
+ * with __switch_to_xtra(). We rely on ptrace_freeze_traced().
+ */
+ local_irq_disable();
+ debugctl = get_debugctlmsr();
+ if (on) {
+ debugctl |= DEBUGCTLMSR_BTF;
+ set_tsk_thread_flag(task, TIF_BLOCKSTEP);
+ } else {
+ debugctl &= ~DEBUGCTLMSR_BTF;
+ clear_tsk_thread_flag(task, TIF_BLOCKSTEP);
+ }
+ if (task == current)
+ update_debugctlmsr(debugctl);
+ local_irq_enable();
}
/*
@@ -183,20 +207,13 @@ static void enable_step(struct task_struct *child, bool block)
* Make sure block stepping (BTF) is not enabled unless it should be.
* Note that we don't try to worry about any is_setting_trap_flag()
* instructions after the first when using block stepping.
- * So noone should try to use debugger block stepping in a program
+ * So no one should try to use debugger block stepping in a program
* that uses user-mode single stepping itself.
*/
- if (enable_single_step(child) && block) {
- set_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
- write_debugctlmsr(child,
- child->thread.debugctlmsr | DEBUGCTLMSR_BTF);
- } else {
- write_debugctlmsr(child,
- child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF);
-
- if (!child->thread.debugctlmsr)
- clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
- }
+ if (enable_single_step(child) && block)
+ set_task_blockstep(child, true);
+ else if (test_tsk_thread_flag(child, TIF_BLOCKSTEP))
+ set_task_blockstep(child, false);
}
void user_enable_single_step(struct task_struct *child)
@@ -214,14 +231,12 @@ void user_disable_single_step(struct task_struct *child)
/*
* Make sure block stepping (BTF) is disabled.
*/
- write_debugctlmsr(child,
- child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF);
-
- if (!child->thread.debugctlmsr)
- clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
+ if (test_tsk_thread_flag(child, TIF_BLOCKSTEP))
+ set_task_blockstep(child, false);
/* Always clear TIF_SINGLESTEP... */
clear_tsk_thread_flag(child, TIF_SINGLESTEP);
+ clear_task_syscall_work(child, SYSCALL_EXIT_TRAP);
/* But touch TF only if it was set by us.. */
if (test_and_clear_tsk_thread_flag(child, TIF_FORCED_TF))
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
deleted file mode 100644
index 1884a8d12bfa..000000000000
--- a/arch/x86/kernel/sys_i386_32.c
+++ /dev/null
@@ -1,248 +0,0 @@
-/*
- * This file contains various random system calls that
- * have a non-standard calling sequence on the Linux/i386
- * platform.
- */
-
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/fs.h>
-#include <linux/smp.h>
-#include <linux/sem.h>
-#include <linux/msg.h>
-#include <linux/shm.h>
-#include <linux/stat.h>
-#include <linux/syscalls.h>
-#include <linux/mman.h>
-#include <linux/file.h>
-#include <linux/utsname.h>
-#include <linux/ipc.h>
-
-#include <linux/uaccess.h>
-#include <linux/unistd.h>
-
-#include <asm/syscalls.h>
-
-asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
- unsigned long prot, unsigned long flags,
- unsigned long fd, unsigned long pgoff)
-{
- int error = -EBADF;
- struct file *file = NULL;
- struct mm_struct *mm = current->mm;
-
- flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
- if (!(flags & MAP_ANONYMOUS)) {
- file = fget(fd);
- if (!file)
- goto out;
- }
-
- down_write(&mm->mmap_sem);
- error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
- up_write(&mm->mmap_sem);
-
- if (file)
- fput(file);
-out:
- return error;
-}
-
-/*
- * Perform the select(nd, in, out, ex, tv) and mmap() system
- * calls. Linux/i386 didn't use to be able to handle more than
- * 4 system call parameters, so these system calls used a memory
- * block for parameter passing..
- */
-
-struct mmap_arg_struct {
- unsigned long addr;
- unsigned long len;
- unsigned long prot;
- unsigned long flags;
- unsigned long fd;
- unsigned long offset;
-};
-
-asmlinkage int old_mmap(struct mmap_arg_struct __user *arg)
-{
- struct mmap_arg_struct a;
- int err = -EFAULT;
-
- if (copy_from_user(&a, arg, sizeof(a)))
- goto out;
-
- err = -EINVAL;
- if (a.offset & ~PAGE_MASK)
- goto out;
-
- err = sys_mmap2(a.addr, a.len, a.prot, a.flags,
- a.fd, a.offset >> PAGE_SHIFT);
-out:
- return err;
-}
-
-
-struct sel_arg_struct {
- unsigned long n;
- fd_set __user *inp, *outp, *exp;
- struct timeval __user *tvp;
-};
-
-asmlinkage int old_select(struct sel_arg_struct __user *arg)
-{
- struct sel_arg_struct a;
-
- if (copy_from_user(&a, arg, sizeof(a)))
- return -EFAULT;
- /* sys_select() does the appropriate kernel locking */
- return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp);
-}
-
-/*
- * sys_ipc() is the de-multiplexer for the SysV IPC calls..
- *
- * This is really horribly ugly.
- */
-asmlinkage int sys_ipc(uint call, int first, int second,
- int third, void __user *ptr, long fifth)
-{
- int version, ret;
-
- version = call >> 16; /* hack for backward compatibility */
- call &= 0xffff;
-
- switch (call) {
- case SEMOP:
- return sys_semtimedop(first, (struct sembuf __user *)ptr, second, NULL);
- case SEMTIMEDOP:
- return sys_semtimedop(first, (struct sembuf __user *)ptr, second,
- (const struct timespec __user *)fifth);
-
- case SEMGET:
- return sys_semget(first, second, third);
- case SEMCTL: {
- union semun fourth;
- if (!ptr)
- return -EINVAL;
- if (get_user(fourth.__pad, (void __user * __user *) ptr))
- return -EFAULT;
- return sys_semctl(first, second, third, fourth);
- }
-
- case MSGSND:
- return sys_msgsnd(first, (struct msgbuf __user *) ptr,
- second, third);
- case MSGRCV:
- switch (version) {
- case 0: {
- struct ipc_kludge tmp;
- if (!ptr)
- return -EINVAL;
-
- if (copy_from_user(&tmp,
- (struct ipc_kludge __user *) ptr,
- sizeof(tmp)))
- return -EFAULT;
- return sys_msgrcv(first, tmp.msgp, second,
- tmp.msgtyp, third);
- }
- default:
- return sys_msgrcv(first,
- (struct msgbuf __user *) ptr,
- second, fifth, third);
- }
- case MSGGET:
- return sys_msgget((key_t) first, second);
- case MSGCTL:
- return sys_msgctl(first, second, (struct msqid_ds __user *) ptr);
-
- case SHMAT:
- switch (version) {
- default: {
- ulong raddr;
- ret = do_shmat(first, (char __user *) ptr, second, &raddr);
- if (ret)
- return ret;
- return put_user(raddr, (ulong __user *) third);
- }
- case 1: /* iBCS2 emulator entry point */
- if (!segment_eq(get_fs(), get_ds()))
- return -EINVAL;
- /* The "(ulong *) third" is valid _only_ because of the kernel segment thing */
- return do_shmat(first, (char __user *) ptr, second, (ulong *) third);
- }
- case SHMDT:
- return sys_shmdt((char __user *)ptr);
- case SHMGET:
- return sys_shmget(first, second, third);
- case SHMCTL:
- return sys_shmctl(first, second,
- (struct shmid_ds __user *) ptr);
- default:
- return -ENOSYS;
- }
-}
-
-/*
- * Old cruft
- */
-asmlinkage int sys_uname(struct old_utsname __user *name)
-{
- int err;
- if (!name)
- return -EFAULT;
- down_read(&uts_sem);
- err = copy_to_user(name, utsname(), sizeof(*name));
- up_read(&uts_sem);
- return err? -EFAULT:0;
-}
-
-asmlinkage int sys_olduname(struct oldold_utsname __user *name)
-{
- int error;
-
- if (!name)
- return -EFAULT;
- if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
- return -EFAULT;
-
- down_read(&uts_sem);
-
- error = __copy_to_user(&name->sysname, &utsname()->sysname,
- __OLD_UTS_LEN);
- error |= __put_user(0, name->sysname + __OLD_UTS_LEN);
- error |= __copy_to_user(&name->nodename, &utsname()->nodename,
- __OLD_UTS_LEN);
- error |= __put_user(0, name->nodename + __OLD_UTS_LEN);
- error |= __copy_to_user(&name->release, &utsname()->release,
- __OLD_UTS_LEN);
- error |= __put_user(0, name->release + __OLD_UTS_LEN);
- error |= __copy_to_user(&name->version, &utsname()->version,
- __OLD_UTS_LEN);
- error |= __put_user(0, name->version + __OLD_UTS_LEN);
- error |= __copy_to_user(&name->machine, &utsname()->machine,
- __OLD_UTS_LEN);
- error |= __put_user(0, name->machine + __OLD_UTS_LEN);
-
- up_read(&uts_sem);
-
- error = error ? -EFAULT : 0;
-
- return error;
-}
-
-
-/*
- * Do a system call from kernel instead of calling sys_execve so we
- * end up with proper pt_regs.
- */
-int kernel_execve(const char *filename, char *const argv[], char *const envp[])
-{
- long __res;
- asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx"
- : "=a" (__res)
- : "0" (__NR_execve), "ri" (filename), "c" (argv), "d" (envp) : "memory");
- return __res;
-}
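
The deleted sys_i386_32.c above carried the old i386 convention in which mmap() and select() took a single pointer to a memory block holding all of their arguments (old_mmap()/old_select()), because the original syscall path could not pass that many parameters directly. Purely as an illustration, the sketch below packs an argument block with the same layout as the removed mmap_arg_struct; it does not issue the legacy syscall.

/* Hedged sketch: the argument block the removed old_mmap() expected.  The
 * struct mirrors mmap_arg_struct from the deleted file; no syscall is made,
 * this only shows how the six arguments were packed into one block. */
#include <stdio.h>
#include <sys/mman.h>

struct mmap_arg_struct {
	unsigned long addr, len, prot, flags, fd, offset;
};

int main(void)
{
	struct mmap_arg_struct a = {
		.addr	= 0,
		.len	= 4096,
		.prot	= PROT_READ | PROT_WRITE,
		.flags	= MAP_PRIVATE | MAP_ANONYMOUS,
		.fd	= (unsigned long)-1,
		.offset	= 0,
	};

	/* old_mmap() received a single pointer to this block and did the
	 * copy_from_user() itself; the modern path is mmap2()/ksys_mmap_pgoff(). */
	printf("arg block at %p: len=%lu prot=%#lx flags=%#lx\n",
	       (void *)&a, a.len, a.prot, a.flags);
	return 0;
}
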
diff --git a/arch/x86/kernel/sys_ia32.c b/arch/x86/kernel/sys_ia32.c
new file mode 100644
index 000000000000..6cf65397d225
--- /dev/null
+++ b/arch/x86/kernel/sys_ia32.c
@@ -0,0 +1,256 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * sys_ia32.c: Conversion between 32bit and 64bit native syscalls. Based on
+ * sys_sparc32
+ *
+ * Copyright (C) 2000 VA Linux Co
+ * Copyright (C) 2000 Don Dugger <n0ano@valinux.com>
+ * Copyright (C) 1999 Arun Sharma <arun.sharma@intel.com>
+ * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
+ * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
+ * Copyright (C) 2000 Hewlett-Packard Co.
+ * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com>
+ * Copyright (C) 2000,2001,2002 Andi Kleen, SuSE Labs (x86-64 port)
+ *
+ * These routines maintain argument size conversion between 32bit and 64bit
+ * environment. In 2.5 most of this should be moved to a generic directory.
+ *
+ * This file assumes that there is a hole at the end of user address space.
+ *
+ * Some of the functions are LE specific currently. These are
+ * hopefully all marked. This should be fixed.
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/signal.h>
+#include <linux/syscalls.h>
+#include <linux/times.h>
+#include <linux/utsname.h>
+#include <linux/mm.h>
+#include <linux/uio.h>
+#include <linux/poll.h>
+#include <linux/personality.h>
+#include <linux/stat.h>
+#include <linux/rwsem.h>
+#include <linux/compat.h>
+#include <linux/vfs.h>
+#include <linux/ptrace.h>
+#include <linux/highuid.h>
+#include <linux/sysctl.h>
+#include <linux/slab.h>
+#include <linux/sched/task.h>
+#include <asm/mman.h>
+#include <asm/types.h>
+#include <linux/uaccess.h>
+#include <linux/atomic.h>
+#include <asm/vgtod.h>
+#include <asm/ia32.h>
+
+#define AA(__x) ((unsigned long)(__x))
+
+SYSCALL_DEFINE3(ia32_truncate64, const char __user *, filename,
+ unsigned long, offset_low, unsigned long, offset_high)
+{
+ return ksys_truncate(filename,
+ ((loff_t) offset_high << 32) | offset_low);
+}
+
+SYSCALL_DEFINE3(ia32_ftruncate64, unsigned int, fd,
+ unsigned long, offset_low, unsigned long, offset_high)
+{
+ return ksys_ftruncate(fd, ((loff_t) offset_high << 32) | offset_low);
+}
+
+/* warning: next two assume little endian */
+SYSCALL_DEFINE5(ia32_pread64, unsigned int, fd, char __user *, ubuf,
+ u32, count, u32, poslo, u32, poshi)
+{
+ return ksys_pread64(fd, ubuf, count,
+ ((loff_t)AA(poshi) << 32) | AA(poslo));
+}
+
+SYSCALL_DEFINE5(ia32_pwrite64, unsigned int, fd, const char __user *, ubuf,
+ u32, count, u32, poslo, u32, poshi)
+{
+ return ksys_pwrite64(fd, ubuf, count,
+ ((loff_t)AA(poshi) << 32) | AA(poslo));
+}
+
+
+/*
+ * Some system calls that need sign extended arguments. This could be
+ * done by a generic wrapper.
+ */
+SYSCALL_DEFINE6(ia32_fadvise64_64, int, fd, __u32, offset_low,
+ __u32, offset_high, __u32, len_low, __u32, len_high,
+ int, advice)
+{
+ return ksys_fadvise64_64(fd,
+ (((u64)offset_high)<<32) | offset_low,
+ (((u64)len_high)<<32) | len_low,
+ advice);
+}
+
+SYSCALL_DEFINE4(ia32_readahead, int, fd, unsigned int, off_lo,
+ unsigned int, off_hi, size_t, count)
+{
+ return ksys_readahead(fd, ((u64)off_hi << 32) | off_lo, count);
+}
+
+SYSCALL_DEFINE6(ia32_sync_file_range, int, fd, unsigned int, off_low,
+ unsigned int, off_hi, unsigned int, n_low,
+ unsigned int, n_hi, int, flags)
+{
+ return ksys_sync_file_range(fd,
+ ((u64)off_hi << 32) | off_low,
+ ((u64)n_hi << 32) | n_low, flags);
+}
+
+SYSCALL_DEFINE5(ia32_fadvise64, int, fd, unsigned int, offset_lo,
+ unsigned int, offset_hi, size_t, len, int, advice)
+{
+ return ksys_fadvise64_64(fd, ((u64)offset_hi << 32) | offset_lo,
+ len, advice);
+}
+
+SYSCALL_DEFINE6(ia32_fallocate, int, fd, int, mode,
+ unsigned int, offset_lo, unsigned int, offset_hi,
+ unsigned int, len_lo, unsigned int, len_hi)
+{
+ return ksys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo,
+ ((u64)len_hi << 32) | len_lo);
+}
+
+#ifdef CONFIG_IA32_EMULATION
+/*
+ * Another set for IA32/LFS -- x86_64 struct stat is different due to
+ * support for 64bit inode numbers.
+ */
+static int cp_stat64(struct stat64 __user *ubuf, struct kstat *stat)
+{
+ typeof(ubuf->st_uid) uid = 0;
+ typeof(ubuf->st_gid) gid = 0;
+ SET_UID(uid, from_kuid_munged(current_user_ns(), stat->uid));
+ SET_GID(gid, from_kgid_munged(current_user_ns(), stat->gid));
+ if (!user_write_access_begin(ubuf, sizeof(struct stat64)))
+ return -EFAULT;
+ unsafe_put_user(huge_encode_dev(stat->dev), &ubuf->st_dev, Efault);
+ unsafe_put_user(stat->ino, &ubuf->__st_ino, Efault);
+ unsafe_put_user(stat->ino, &ubuf->st_ino, Efault);
+ unsafe_put_user(stat->mode, &ubuf->st_mode, Efault);
+ unsafe_put_user(stat->nlink, &ubuf->st_nlink, Efault);
+ unsafe_put_user(uid, &ubuf->st_uid, Efault);
+ unsafe_put_user(gid, &ubuf->st_gid, Efault);
+ unsafe_put_user(huge_encode_dev(stat->rdev), &ubuf->st_rdev, Efault);
+ unsafe_put_user(stat->size, &ubuf->st_size, Efault);
+ unsafe_put_user(stat->atime.tv_sec, &ubuf->st_atime, Efault);
+ unsafe_put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec, Efault);
+ unsafe_put_user(stat->mtime.tv_sec, &ubuf->st_mtime, Efault);
+ unsafe_put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec, Efault);
+ unsafe_put_user(stat->ctime.tv_sec, &ubuf->st_ctime, Efault);
+ unsafe_put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec, Efault);
+ unsafe_put_user(stat->blksize, &ubuf->st_blksize, Efault);
+ unsafe_put_user(stat->blocks, &ubuf->st_blocks, Efault);
+ user_access_end();
+ return 0;
+Efault:
+ user_write_access_end();
+ return -EFAULT;
+}
+
+COMPAT_SYSCALL_DEFINE2(ia32_stat64, const char __user *, filename,
+ struct stat64 __user *, statbuf)
+{
+ struct kstat stat;
+ int ret = vfs_stat(filename, &stat);
+
+ if (!ret)
+ ret = cp_stat64(statbuf, &stat);
+ return ret;
+}
+
+COMPAT_SYSCALL_DEFINE2(ia32_lstat64, const char __user *, filename,
+ struct stat64 __user *, statbuf)
+{
+ struct kstat stat;
+ int ret = vfs_lstat(filename, &stat);
+ if (!ret)
+ ret = cp_stat64(statbuf, &stat);
+ return ret;
+}
+
+COMPAT_SYSCALL_DEFINE2(ia32_fstat64, unsigned int, fd,
+ struct stat64 __user *, statbuf)
+{
+ struct kstat stat;
+ int ret = vfs_fstat(fd, &stat);
+ if (!ret)
+ ret = cp_stat64(statbuf, &stat);
+ return ret;
+}
+
+COMPAT_SYSCALL_DEFINE4(ia32_fstatat64, unsigned int, dfd,
+ const char __user *, filename,
+ struct stat64 __user *, statbuf, int, flag)
+{
+ struct kstat stat;
+ int error;
+
+ error = vfs_fstatat(dfd, filename, &stat, flag);
+ if (error)
+ return error;
+ return cp_stat64(statbuf, &stat);
+}
+
+/*
+ * Linux/i386 didn't use to be able to handle more than
+ * 4 system call parameters, so these system calls used a memory
+ * block for parameter passing..
+ */
+
+struct mmap_arg_struct32 {
+ unsigned int addr;
+ unsigned int len;
+ unsigned int prot;
+ unsigned int flags;
+ unsigned int fd;
+ unsigned int offset;
+};
+
+COMPAT_SYSCALL_DEFINE1(ia32_mmap, struct mmap_arg_struct32 __user *, arg)
+{
+ struct mmap_arg_struct32 a;
+
+ if (copy_from_user(&a, arg, sizeof(a)))
+ return -EFAULT;
+
+ if (a.offset & ~PAGE_MASK)
+ return -EINVAL;
+
+ return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
+ a.offset>>PAGE_SHIFT);
+}
+
+/*
+ * The 32-bit clone ABI is CONFIG_CLONE_BACKWARDS
+ */
+COMPAT_SYSCALL_DEFINE5(ia32_clone, unsigned long, clone_flags,
+ unsigned long, newsp, int __user *, parent_tidptr,
+ unsigned long, tls_val, int __user *, child_tidptr)
+{
+ struct kernel_clone_args args = {
+ .flags = (clone_flags & ~CSIGNAL),
+ .pidfd = parent_tidptr,
+ .child_tid = child_tidptr,
+ .parent_tid = parent_tidptr,
+ .exit_signal = (clone_flags & CSIGNAL),
+ .stack = newsp,
+ .tls = tls_val,
+ };
+
+ return kernel_clone(&args);
+}
+#endif /* CONFIG_IA32_EMULATION */
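
Most of the wrappers in the new sys_ia32.c above exist only to stitch a 64-bit value (a file offset or length) back together from two 32-bit syscall arguments with ((u64)hi << 32) | lo, which is presumably what the file's "assume little endian" warning refers to: the low half is passed before the high half. As an illustration only, the sketch below shows that split-and-rejoin round-tripping a 64-bit offset; it is not kernel code.

/* Hedged sketch: the ((u64)hi << 32) | lo recombination used by the compat
 * wrappers above, shown round-tripping a 64-bit file offset. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static int64_t join(uint32_t lo, uint32_t hi)
{
	return (int64_t)(((uint64_t)hi << 32) | lo);
}

int main(void)
{
	int64_t off = 0x123456789abcdef0LL;		/* arbitrary offset */
	uint32_t lo = (uint32_t)off;			/* low 32 bits, passed first */
	uint32_t hi = (uint32_t)((uint64_t)off >> 32);	/* high 32 bits */

	assert(join(lo, hi) == off);
	printf("lo=%#x hi=%#x -> %#llx\n",
	       lo, hi, (unsigned long long)join(lo, hi));
	return 0;
}
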
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 6bc211accf08..776ae6fa7f2d 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -1,5 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/compat.h>
#include <linux/errno.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/syscalls.h>
#include <linux/mm.h>
#include <linux/fs.h>
@@ -14,44 +17,82 @@
#include <linux/personality.h>
#include <linux/random.h>
#include <linux/uaccess.h>
+#include <linux/elf.h>
+#include <linux/hugetlb.h>
+#include <asm/elf.h>
#include <asm/ia32.h>
-#include <asm/syscalls.h>
-asmlinkage long sys_mmap(unsigned long addr, unsigned long len,
- unsigned long prot, unsigned long flags,
- unsigned long fd, unsigned long off)
+/*
+ * Align a virtual address to avoid aliasing in the I$ on AMD F15h.
+ */
+static unsigned long get_align_mask(struct file *filp)
{
- long error;
- struct file *file;
+ if (filp && is_file_hugepages(filp))
+ return huge_page_mask_align(filp);
+ /* handle 32- and 64-bit case with a single conditional */
+ if (va_align.flags < 0 || !(va_align.flags & (2 - mmap_is_ia32())))
+ return 0;
- error = -EINVAL;
+ if (!(current->flags & PF_RANDOMIZE))
+ return 0;
+
+ return va_align.mask;
+}
+
+/*
+ * To avoid aliasing in the I$ on AMD F15h, the bits defined by the
+ * va_align.bits, [12:upper_bit), are set to a random value instead of
+ * zeroing them. This random value is computed once per boot. This form
+ * of ASLR is known as "per-boot ASLR".
+ *
+ * To achieve this, the random value is added to the info.align_offset
+ * value before calling vm_unmapped_area() or ORed directly to the
+ * address.
+ */
+static unsigned long get_align_bits(void)
+{
+ return va_align.bits & get_align_mask(NULL);
+}
+
+static int __init control_va_addr_alignment(char *str)
+{
+ /* guard against enabling this on other CPU families */
+ if (va_align.flags < 0)
+ return 1;
+
+ if (*str == 0)
+ return 1;
+
+ if (!strcmp(str, "32"))
+ va_align.flags = ALIGN_VA_32;
+ else if (!strcmp(str, "64"))
+ va_align.flags = ALIGN_VA_64;
+ else if (!strcmp(str, "off"))
+ va_align.flags = 0;
+ else if (!strcmp(str, "on"))
+ va_align.flags = ALIGN_VA_32 | ALIGN_VA_64;
+ else
+ pr_warn("invalid option value: 'align_va_addr=%s'\n", str);
+
+ return 1;
+}
+__setup("align_va_addr=", control_va_addr_alignment);
+
+SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
+ unsigned long, prot, unsigned long, flags,
+ unsigned long, fd, unsigned long, off)
+{
if (off & ~PAGE_MASK)
- goto out;
-
- error = -EBADF;
- file = NULL;
- flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
- if (!(flags & MAP_ANONYMOUS)) {
- file = fget(fd);
- if (!file)
- goto out;
- }
- down_write(&current->mm->mmap_sem);
- error = do_mmap_pgoff(file, addr, len, prot, flags, off >> PAGE_SHIFT);
- up_write(&current->mm->mmap_sem);
-
- if (file)
- fput(file);
-out:
- return error;
+ return -EINVAL;
+
+ return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
}
-static void find_start_end(unsigned long flags, unsigned long *begin,
- unsigned long *end)
+static void find_start_end(unsigned long addr, unsigned long flags,
+ unsigned long *begin, unsigned long *end)
{
- if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) {
- unsigned long new_begin;
+ if (!in_32bit_syscall() && (flags & MAP_32BIT)) {
		/* This is usually needed to map code in small
model, so it needs to be in the first 31bit. Limit
it to that. This means we need to move the
@@ -62,29 +103,39 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
*begin = 0x40000000;
*end = 0x80000000;
if (current->flags & PF_RANDOMIZE) {
- new_begin = randomize_range(*begin, *begin + 0x02000000, 0);
- if (new_begin)
- *begin = new_begin;
+ *begin = randomize_page(*begin, 0x02000000);
}
- } else {
- *begin = TASK_UNMAPPED_BASE;
- *end = TASK_SIZE;
+ return;
}
+
+ *begin = get_mmap_base(1);
+ if (in_32bit_syscall())
+ *end = task_size_32bit();
+ else
+ *end = task_size_64bit(addr > DEFAULT_MAP_WINDOW);
+}
+
+static inline unsigned long stack_guard_placement(vm_flags_t vm_flags)
+{
+ if (vm_flags & VM_SHADOW_STACK)
+ return PAGE_SIZE;
+
+ return 0;
}
unsigned long
-arch_get_unmapped_area(struct file *filp, unsigned long addr,
- unsigned long len, unsigned long pgoff, unsigned long flags)
+arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len,
+ unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
- unsigned long start_addr;
+ struct vm_unmapped_area_info info = {};
unsigned long begin, end;
if (flags & MAP_FIXED)
return addr;
- find_start_end(flags, &begin, &end);
+ find_start_end(addr, flags, &begin, &end);
if (len > end)
return -ENOMEM;
@@ -93,118 +144,90 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
if (end - len >= addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
- if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32))
- && len <= mm->cached_hole_size) {
- mm->cached_hole_size = 0;
- mm->free_area_cache = begin;
- }
- addr = mm->free_area_cache;
- if (addr < begin)
- addr = begin;
- start_addr = addr;
-
-full_search:
- for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
- /* At this point: (!vma || addr < vma->vm_end). */
- if (end - len < addr) {
- /*
- * Start a new search - just in case we missed
- * some holes.
- */
- if (start_addr != begin) {
- start_addr = addr = begin;
- mm->cached_hole_size = 0;
- goto full_search;
- }
- return -ENOMEM;
- }
- if (!vma || addr + len <= vma->vm_start) {
- /*
- * Remember the place where we stopped the search:
- */
- mm->free_area_cache = addr + len;
- return addr;
- }
- if (addr + mm->cached_hole_size < vma->vm_start)
- mm->cached_hole_size = vma->vm_start - addr;
- addr = vma->vm_end;
+ info.length = len;
+ info.low_limit = begin;
+ info.high_limit = end;
+ if (!(filp && is_file_hugepages(filp))) {
+ info.align_offset = pgoff << PAGE_SHIFT;
+ info.start_gap = stack_guard_placement(vm_flags);
+ }
+ if (filp) {
+ info.align_mask = get_align_mask(filp);
+ info.align_offset += get_align_bits();
}
-}
+ return vm_unmapped_area(&info);
+}
unsigned long
-arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
- const unsigned long len, const unsigned long pgoff,
- const unsigned long flags)
+arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr0,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags, vm_flags_t vm_flags)
{
struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
unsigned long addr = addr0;
+ struct vm_unmapped_area_info info = {};
/* requested length too big for entire address space */
if (len > TASK_SIZE)
return -ENOMEM;
+ /* No address checking. See comment at mmap_address_hint_valid() */
if (flags & MAP_FIXED)
return addr;
- /* for MAP_32BIT mappings we force the legact mmap base */
- if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT))
+ /* for MAP_32BIT mappings we force the legacy mmap base */
+ if (!in_32bit_syscall() && (flags & MAP_32BIT))
goto bottomup;
/* requesting a specific address */
if (addr) {
- addr = PAGE_ALIGN(addr);
+ addr &= PAGE_MASK;
+ if (!mmap_address_hint_valid(addr, len))
+ goto get_unmapped_area;
+
vma = find_vma(mm, addr);
- if (TASK_SIZE - len >= addr &&
- (!vma || addr + len <= vma->vm_start))
+ if (!vma || addr + len <= vm_start_gap(vma))
return addr;
}
-
- /* check if free_area_cache is useful for us */
- if (len <= mm->cached_hole_size) {
- mm->cached_hole_size = 0;
- mm->free_area_cache = mm->mmap_base;
+get_unmapped_area:
+
+ info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+ info.length = len;
+ if (!in_32bit_syscall() && (flags & MAP_ABOVE4G))
+ info.low_limit = SZ_4G;
+ else
+ info.low_limit = PAGE_SIZE;
+
+ info.high_limit = get_mmap_base(0);
+ if (!(filp && is_file_hugepages(filp))) {
+ info.start_gap = stack_guard_placement(vm_flags);
+ info.align_offset = pgoff << PAGE_SHIFT;
}
- /* either no address requested or can't fit in requested address hole */
- addr = mm->free_area_cache;
+ /*
+ * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
+ * in the full address space.
+ *
+ * !in_32bit_syscall() check to avoid high addresses for x32
+ * (and make it no op on native i386).
+ */
+ if (addr > DEFAULT_MAP_WINDOW && !in_32bit_syscall())
+ info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW;
- /* make sure it can fit in the remaining address space */
- if (addr > len) {
- vma = find_vma(mm, addr-len);
- if (!vma || addr <= vma->vm_start)
- /* remember the address as a hint for next time */
- return mm->free_area_cache = addr-len;
+ if (filp) {
+ info.align_mask = get_align_mask(filp);
+ info.align_offset += get_align_bits();
}
-
- if (mm->mmap_base < len)
- goto bottomup;
-
- addr = mm->mmap_base-len;
-
- do {
- /*
- * Lookup failure means no vma is above this address,
- * else if new region fits below vma->vm_start,
- * return with success:
- */
- vma = find_vma(mm, addr);
- if (!vma || addr+len <= vma->vm_start)
- /* remember the address as a hint for next time */
- return mm->free_area_cache = addr;
-
- /* remember the largest hole we saw so far */
- if (addr + mm->cached_hole_size < vma->vm_start)
- mm->cached_hole_size = vma->vm_start - addr;
-
- /* try just below the current vma->vm_start */
- addr = vma->vm_start-len;
- } while (len < vma->vm_start);
+ addr = vm_unmapped_area(&info);
+ if (!(addr & ~PAGE_MASK))
+ return addr;
+ VM_BUG_ON(addr != -ENOMEM);
bottomup:
/*
@@ -213,26 +236,5 @@ bottomup:
* can happen with large stack limits and large mmap()
* allocations.
*/
- mm->cached_hole_size = ~0UL;
- mm->free_area_cache = TASK_UNMAPPED_BASE;
- addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
- /*
- * Restore the topdown base:
- */
- mm->free_area_cache = mm->mmap_base;
- mm->cached_hole_size = ~0UL;
-
- return addr;
-}
-
-
-asmlinkage long sys_uname(struct new_utsname __user *name)
-{
- int err;
- down_read(&uts_sem);
- err = copy_to_user(name, utsname(), sizeof(*name));
- up_read(&uts_sem);
- if (personality(current->personality) == PER_LINUX32)
- err |= copy_to_user(&name->machine, "i686", 5);
- return err ? -EFAULT : 0;
+ return arch_get_unmapped_area(filp, addr0, len, pgoff, flags, 0);
}
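
The DEFAULT_MAP_WINDOW handling in arch_get_unmapped_area_topdown() above is an opt-in; below is a hedged userspace sketch, assuming a kernel with 5-level paging, where a hint above the default 47-bit window lets the search extend towards TASK_SIZE_MAX (the hint itself may still not be honored exactly).

	#include <sys/mman.h>

	static void *map_above_default_window(size_t len)
	{
		void *hint = (void *)(1UL << 48);	/* above DEFAULT_MAP_WINDOW */

		return mmap(hint, len, PROT_READ | PROT_WRITE,
			    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	}
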
diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c
deleted file mode 100644
index de87d6008295..000000000000
--- a/arch/x86/kernel/syscall_64.c
+++ /dev/null
@@ -1,29 +0,0 @@
-/* System call table for x86-64. */
-
-#include <linux/linkage.h>
-#include <linux/sys.h>
-#include <linux/cache.h>
-#include <asm/asm-offsets.h>
-
-#define __NO_STUBS
-
-#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ;
-#undef _ASM_X86_UNISTD_64_H
-#include <asm/unistd_64.h>
-
-#undef __SYSCALL
-#define __SYSCALL(nr, sym) [nr] = sym,
-#undef _ASM_X86_UNISTD_64_H
-
-typedef void (*sys_call_ptr_t)(void);
-
-extern void sys_ni_syscall(void);
-
-const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
- /*
- *Smells like a like a compiler bug -- it doesn't work
- *when the & below is removed.
- */
- [0 ... __NR_syscall_max] = &sys_ni_syscall,
-#include <asm/unistd_64.h>
-};
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
deleted file mode 100644
index d51321ddafda..000000000000
--- a/arch/x86/kernel/syscall_table_32.S
+++ /dev/null
@@ -1,338 +0,0 @@
-ENTRY(sys_call_table)
- .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */
- .long sys_exit
- .long ptregs_fork
- .long sys_read
- .long sys_write
- .long sys_open /* 5 */
- .long sys_close
- .long sys_waitpid
- .long sys_creat
- .long sys_link
- .long sys_unlink /* 10 */
- .long ptregs_execve
- .long sys_chdir
- .long sys_time
- .long sys_mknod
- .long sys_chmod /* 15 */
- .long sys_lchown16
- .long sys_ni_syscall /* old break syscall holder */
- .long sys_stat
- .long sys_lseek
- .long sys_getpid /* 20 */
- .long sys_mount
- .long sys_oldumount
- .long sys_setuid16
- .long sys_getuid16
- .long sys_stime /* 25 */
- .long sys_ptrace
- .long sys_alarm
- .long sys_fstat
- .long sys_pause
- .long sys_utime /* 30 */
- .long sys_ni_syscall /* old stty syscall holder */
- .long sys_ni_syscall /* old gtty syscall holder */
- .long sys_access
- .long sys_nice
- .long sys_ni_syscall /* 35 - old ftime syscall holder */
- .long sys_sync
- .long sys_kill
- .long sys_rename
- .long sys_mkdir
- .long sys_rmdir /* 40 */
- .long sys_dup
- .long sys_pipe
- .long sys_times
- .long sys_ni_syscall /* old prof syscall holder */
- .long sys_brk /* 45 */
- .long sys_setgid16
- .long sys_getgid16
- .long sys_signal
- .long sys_geteuid16
- .long sys_getegid16 /* 50 */
- .long sys_acct
- .long sys_umount /* recycled never used phys() */
- .long sys_ni_syscall /* old lock syscall holder */
- .long sys_ioctl
- .long sys_fcntl /* 55 */
- .long sys_ni_syscall /* old mpx syscall holder */
- .long sys_setpgid
- .long sys_ni_syscall /* old ulimit syscall holder */
- .long sys_olduname
- .long sys_umask /* 60 */
- .long sys_chroot
- .long sys_ustat
- .long sys_dup2
- .long sys_getppid
- .long sys_getpgrp /* 65 */
- .long sys_setsid
- .long sys_sigaction
- .long sys_sgetmask
- .long sys_ssetmask
- .long sys_setreuid16 /* 70 */
- .long sys_setregid16
- .long sys_sigsuspend
- .long sys_sigpending
- .long sys_sethostname
- .long sys_setrlimit /* 75 */
- .long sys_old_getrlimit
- .long sys_getrusage
- .long sys_gettimeofday
- .long sys_settimeofday
- .long sys_getgroups16 /* 80 */
- .long sys_setgroups16
- .long old_select
- .long sys_symlink
- .long sys_lstat
- .long sys_readlink /* 85 */
- .long sys_uselib
- .long sys_swapon
- .long sys_reboot
- .long sys_old_readdir
- .long old_mmap /* 90 */
- .long sys_munmap
- .long sys_truncate
- .long sys_ftruncate
- .long sys_fchmod
- .long sys_fchown16 /* 95 */
- .long sys_getpriority
- .long sys_setpriority
- .long sys_ni_syscall /* old profil syscall holder */
- .long sys_statfs
- .long sys_fstatfs /* 100 */
- .long sys_ioperm
- .long sys_socketcall
- .long sys_syslog
- .long sys_setitimer
- .long sys_getitimer /* 105 */
- .long sys_newstat
- .long sys_newlstat
- .long sys_newfstat
- .long sys_uname
- .long ptregs_iopl /* 110 */
- .long sys_vhangup
- .long sys_ni_syscall /* old "idle" system call */
- .long ptregs_vm86old
- .long sys_wait4
- .long sys_swapoff /* 115 */
- .long sys_sysinfo
- .long sys_ipc
- .long sys_fsync
- .long ptregs_sigreturn
- .long ptregs_clone /* 120 */
- .long sys_setdomainname
- .long sys_newuname
- .long sys_modify_ldt
- .long sys_adjtimex
- .long sys_mprotect /* 125 */
- .long sys_sigprocmask
- .long sys_ni_syscall /* old "create_module" */
- .long sys_init_module
- .long sys_delete_module
- .long sys_ni_syscall /* 130: old "get_kernel_syms" */
- .long sys_quotactl
- .long sys_getpgid
- .long sys_fchdir
- .long sys_bdflush
- .long sys_sysfs /* 135 */
- .long sys_personality
- .long sys_ni_syscall /* reserved for afs_syscall */
- .long sys_setfsuid16
- .long sys_setfsgid16
- .long sys_llseek /* 140 */
- .long sys_getdents
- .long sys_select
- .long sys_flock
- .long sys_msync
- .long sys_readv /* 145 */
- .long sys_writev
- .long sys_getsid
- .long sys_fdatasync
- .long sys_sysctl
- .long sys_mlock /* 150 */
- .long sys_munlock
- .long sys_mlockall
- .long sys_munlockall
- .long sys_sched_setparam
- .long sys_sched_getparam /* 155 */
- .long sys_sched_setscheduler
- .long sys_sched_getscheduler
- .long sys_sched_yield
- .long sys_sched_get_priority_max
- .long sys_sched_get_priority_min /* 160 */
- .long sys_sched_rr_get_interval
- .long sys_nanosleep
- .long sys_mremap
- .long sys_setresuid16
- .long sys_getresuid16 /* 165 */
- .long ptregs_vm86
- .long sys_ni_syscall /* Old sys_query_module */
- .long sys_poll
- .long sys_nfsservctl
- .long sys_setresgid16 /* 170 */
- .long sys_getresgid16
- .long sys_prctl
- .long ptregs_rt_sigreturn
- .long sys_rt_sigaction
- .long sys_rt_sigprocmask /* 175 */
- .long sys_rt_sigpending
- .long sys_rt_sigtimedwait
- .long sys_rt_sigqueueinfo
- .long sys_rt_sigsuspend
- .long sys_pread64 /* 180 */
- .long sys_pwrite64
- .long sys_chown16
- .long sys_getcwd
- .long sys_capget
- .long sys_capset /* 185 */
- .long ptregs_sigaltstack
- .long sys_sendfile
- .long sys_ni_syscall /* reserved for streams1 */
- .long sys_ni_syscall /* reserved for streams2 */
- .long ptregs_vfork /* 190 */
- .long sys_getrlimit
- .long sys_mmap2
- .long sys_truncate64
- .long sys_ftruncate64
- .long sys_stat64 /* 195 */
- .long sys_lstat64
- .long sys_fstat64
- .long sys_lchown
- .long sys_getuid
- .long sys_getgid /* 200 */
- .long sys_geteuid
- .long sys_getegid
- .long sys_setreuid
- .long sys_setregid
- .long sys_getgroups /* 205 */
- .long sys_setgroups
- .long sys_fchown
- .long sys_setresuid
- .long sys_getresuid
- .long sys_setresgid /* 210 */
- .long sys_getresgid
- .long sys_chown
- .long sys_setuid
- .long sys_setgid
- .long sys_setfsuid /* 215 */
- .long sys_setfsgid
- .long sys_pivot_root
- .long sys_mincore
- .long sys_madvise
- .long sys_getdents64 /* 220 */
- .long sys_fcntl64
- .long sys_ni_syscall /* reserved for TUX */
- .long sys_ni_syscall
- .long sys_gettid
- .long sys_readahead /* 225 */
- .long sys_setxattr
- .long sys_lsetxattr
- .long sys_fsetxattr
- .long sys_getxattr
- .long sys_lgetxattr /* 230 */
- .long sys_fgetxattr
- .long sys_listxattr
- .long sys_llistxattr
- .long sys_flistxattr
- .long sys_removexattr /* 235 */
- .long sys_lremovexattr
- .long sys_fremovexattr
- .long sys_tkill
- .long sys_sendfile64
- .long sys_futex /* 240 */
- .long sys_sched_setaffinity
- .long sys_sched_getaffinity
- .long sys_set_thread_area
- .long sys_get_thread_area
- .long sys_io_setup /* 245 */
- .long sys_io_destroy
- .long sys_io_getevents
- .long sys_io_submit
- .long sys_io_cancel
- .long sys_fadvise64 /* 250 */
- .long sys_ni_syscall
- .long sys_exit_group
- .long sys_lookup_dcookie
- .long sys_epoll_create
- .long sys_epoll_ctl /* 255 */
- .long sys_epoll_wait
- .long sys_remap_file_pages
- .long sys_set_tid_address
- .long sys_timer_create
- .long sys_timer_settime /* 260 */
- .long sys_timer_gettime
- .long sys_timer_getoverrun
- .long sys_timer_delete
- .long sys_clock_settime
- .long sys_clock_gettime /* 265 */
- .long sys_clock_getres
- .long sys_clock_nanosleep
- .long sys_statfs64
- .long sys_fstatfs64
- .long sys_tgkill /* 270 */
- .long sys_utimes
- .long sys_fadvise64_64
- .long sys_ni_syscall /* sys_vserver */
- .long sys_mbind
- .long sys_get_mempolicy
- .long sys_set_mempolicy
- .long sys_mq_open
- .long sys_mq_unlink
- .long sys_mq_timedsend
- .long sys_mq_timedreceive /* 280 */
- .long sys_mq_notify
- .long sys_mq_getsetattr
- .long sys_kexec_load
- .long sys_waitid
- .long sys_ni_syscall /* 285 */ /* available */
- .long sys_add_key
- .long sys_request_key
- .long sys_keyctl
- .long sys_ioprio_set
- .long sys_ioprio_get /* 290 */
- .long sys_inotify_init
- .long sys_inotify_add_watch
- .long sys_inotify_rm_watch
- .long sys_migrate_pages
- .long sys_openat /* 295 */
- .long sys_mkdirat
- .long sys_mknodat
- .long sys_fchownat
- .long sys_futimesat
- .long sys_fstatat64 /* 300 */
- .long sys_unlinkat
- .long sys_renameat
- .long sys_linkat
- .long sys_symlinkat
- .long sys_readlinkat /* 305 */
- .long sys_fchmodat
- .long sys_faccessat
- .long sys_pselect6
- .long sys_ppoll
- .long sys_unshare /* 310 */
- .long sys_set_robust_list
- .long sys_get_robust_list
- .long sys_splice
- .long sys_sync_file_range
- .long sys_tee /* 315 */
- .long sys_vmsplice
- .long sys_move_pages
- .long sys_getcpu
- .long sys_epoll_pwait
- .long sys_utimensat /* 320 */
- .long sys_signalfd
- .long sys_timerfd_create
- .long sys_eventfd
- .long sys_fallocate
- .long sys_timerfd_settime /* 325 */
- .long sys_timerfd_gettime
- .long sys_signalfd4
- .long sys_eventfd2
- .long sys_epoll_create1
- .long sys_dup3 /* 330 */
- .long sys_pipe2
- .long sys_inotify_init1
- .long sys_preadv
- .long sys_pwritev
- .long sys_rt_tgsigqueueinfo /* 335 */
- .long sys_perf_counter_open
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
new file mode 100644
index 000000000000..46b8f1f16676
--- /dev/null
+++ b/arch/x86/kernel/tboot.c
@@ -0,0 +1,516 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * tboot.c: main implementation of helper functions used by kernel for
+ * runtime support of Intel(R) Trusted Execution Technology
+ *
+ * Copyright (c) 2006-2009, Intel Corporation
+ */
+
+#include <linux/init_task.h>
+#include <linux/spinlock.h>
+#include <linux/export.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/dmar.h>
+#include <linux/cpu.h>
+#include <linux/pfn.h>
+#include <linux/mm.h>
+#include <linux/tboot.h>
+#include <linux/debugfs.h>
+
+#include <asm/realmode.h>
+#include <asm/processor.h>
+#include <asm/bootparam.h>
+#include <asm/pgalloc.h>
+#include <asm/fixmap.h>
+#include <asm/proto.h>
+#include <asm/setup.h>
+#include <asm/e820/api.h>
+#include <asm/io.h>
+
+#include "../realmode/rm/wakeup.h"
+
+/* Global pointer to shared data; NULL means no measured launch. */
+static struct tboot *tboot __read_mostly;
+
+/* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */
+#define AP_WAIT_TIMEOUT 1
+
+#undef pr_fmt
+#define pr_fmt(fmt) "tboot: " fmt
+
+static u8 tboot_uuid[16] __initdata = TBOOT_UUID;
+
+bool tboot_enabled(void)
+{
+ return tboot != NULL;
+}
+
+/* noinline to prevent gcc from warning about dereferencing constant fixaddr */
+static noinline __init bool check_tboot_version(void)
+{
+ if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) {
+ pr_warn("tboot at 0x%llx is invalid\n", boot_params.tboot_addr);
+ return false;
+ }
+
+ if (tboot->version < 5) {
+ pr_warn("tboot version is invalid: %u\n", tboot->version);
+ return false;
+ }
+
+ pr_info("found shared page at phys addr 0x%llx:\n",
+ boot_params.tboot_addr);
+ pr_debug("version: %d\n", tboot->version);
+ pr_debug("log_addr: 0x%08x\n", tboot->log_addr);
+ pr_debug("shutdown_entry: 0x%x\n", tboot->shutdown_entry);
+ pr_debug("tboot_base: 0x%08x\n", tboot->tboot_base);
+ pr_debug("tboot_size: 0x%x\n", tboot->tboot_size);
+
+ return true;
+}
+
+void __init tboot_probe(void)
+{
+ /* Look for valid page-aligned address for shared page. */
+ if (!boot_params.tboot_addr)
+ return;
+ /*
+ * also verify that it is mapped as we expect it before calling
+ * set_fixmap(), to reduce chance of garbage value causing crash
+ */
+ if (!e820__mapped_any(boot_params.tboot_addr,
+ boot_params.tboot_addr, E820_TYPE_RESERVED)) {
+ pr_warn("non-0 tboot_addr but it is not of type E820_TYPE_RESERVED\n");
+ return;
+ }
+
+ /* Map and check for tboot UUID. */
+ set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr);
+ tboot = (void *)fix_to_virt(FIX_TBOOT_BASE);
+ if (!check_tboot_version())
+ tboot = NULL;
+}
+
+static pgd_t *tboot_pg_dir;
+static struct mm_struct tboot_mm = {
+ .mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, tboot_mm.mmap_lock),
+ .pgd = swapper_pg_dir,
+ .mm_users = ATOMIC_INIT(2),
+ .mm_count = ATOMIC_INIT(1),
+ .write_protect_seq = SEQCNT_ZERO(tboot_mm.write_protect_seq),
+ MMAP_LOCK_INITIALIZER(init_mm)
+ .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
+ .mmlist = LIST_HEAD_INIT(init_mm.mmlist),
+};
+
+static inline void switch_to_tboot_pt(void)
+{
+ write_cr3(virt_to_phys(tboot_pg_dir));
+}
+
+static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
+ pgprot_t prot)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = pgd_offset(&tboot_mm, vaddr);
+ p4d = p4d_alloc(&tboot_mm, pgd, vaddr);
+ if (!p4d)
+ return -1;
+ pud = pud_alloc(&tboot_mm, p4d, vaddr);
+ if (!pud)
+ return -1;
+ pmd = pmd_alloc(&tboot_mm, pud, vaddr);
+ if (!pmd)
+ return -1;
+ pte = pte_alloc_map(&tboot_mm, pmd, vaddr);
+ if (!pte)
+ return -1;
+ set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));
+ pte_unmap(pte);
+
+ /*
+ * PTI poisons low addresses in the kernel page tables in the
+ * name of making them unusable for userspace. To execute
+ * code at such a low address, the poison must be cleared.
+ *
+ * Note: 'pgd' actually gets set in p4d_alloc() _or_
+ * pud_alloc() depending on 4/5-level paging.
+ */
+ pgd->pgd &= ~_PAGE_NX;
+
+ return 0;
+}
+
+static int map_tboot_pages(unsigned long vaddr, unsigned long start_pfn,
+ unsigned long nr)
+{
+ /* Reuse the original kernel mapping */
+ tboot_pg_dir = pgd_alloc(&tboot_mm);
+ if (!tboot_pg_dir)
+ return -1;
+
+ for (; nr > 0; nr--, vaddr += PAGE_SIZE, start_pfn++) {
+ if (map_tboot_page(vaddr, start_pfn, PAGE_KERNEL_EXEC))
+ return -1;
+ }
+
+ return 0;
+}
+
+static void tboot_create_trampoline(void)
+{
+ u32 map_base, map_size;
+
+ /* Create identity map for tboot shutdown code. */
+ map_base = PFN_DOWN(tboot->tboot_base);
+ map_size = PFN_UP(tboot->tboot_size);
+ if (map_tboot_pages(map_base << PAGE_SHIFT, map_base, map_size))
+ panic("tboot: Error mapping tboot pages (mfns) @ 0x%x, 0x%x\n",
+ map_base, map_size);
+}
+
+#ifdef CONFIG_ACPI_SLEEP
+
+static void add_mac_region(phys_addr_t start, unsigned long size)
+{
+ struct tboot_mac_region *mr;
+ phys_addr_t end = start + size;
+
+ if (tboot->num_mac_regions >= MAX_TB_MAC_REGIONS)
+ panic("tboot: Too many MAC regions\n");
+
+ if (start && size) {
+ mr = &tboot->mac_regions[tboot->num_mac_regions++];
+ mr->start = round_down(start, PAGE_SIZE);
+ mr->size = round_up(end, PAGE_SIZE) - mr->start;
+ }
+}
+
+static int tboot_setup_sleep(void)
+{
+ int i;
+
+ tboot->num_mac_regions = 0;
+
+ for (i = 0; i < e820_table->nr_entries; i++) {
+ if (e820_table->entries[i].type != E820_TYPE_RAM)
+ continue;
+
+ add_mac_region(e820_table->entries[i].addr, e820_table->entries[i].size);
+ }
+
+ tboot->acpi_sinfo.kernel_s3_resume_vector =
+ real_mode_header->wakeup_start;
+
+ return 0;
+}
+
+#else /* no CONFIG_ACPI_SLEEP */
+
+static int tboot_setup_sleep(void)
+{
+ /* S3 shutdown requested, but S3 not supported by the kernel... */
+ BUG();
+ return -1;
+}
+
+#endif
+
+void tboot_shutdown(u32 shutdown_type)
+{
+ void (*shutdown)(void);
+
+ if (!tboot_enabled())
+ return;
+
+ /*
+ * if we're being called before the 1:1 mapping is set up then just
+ * return and let the normal shutdown happen; this should only be
+ * due to very early panic()
+ */
+ if (!tboot_pg_dir)
+ return;
+
+ /* if this is S3 then set regions to MAC */
+ if (shutdown_type == TB_SHUTDOWN_S3)
+ if (tboot_setup_sleep())
+ return;
+
+ tboot->shutdown_type = shutdown_type;
+
+ switch_to_tboot_pt();
+
+ shutdown = (void(*)(void))(unsigned long)tboot->shutdown_entry;
+ shutdown();
+
+ /* should not reach here */
+ while (1)
+ halt();
+}
+
+static void tboot_copy_fadt(const struct acpi_table_fadt *fadt)
+{
+#define TB_COPY_GAS(tbg, g) \
+ tbg.space_id = g.space_id; \
+ tbg.bit_width = g.bit_width; \
+ tbg.bit_offset = g.bit_offset; \
+ tbg.access_width = g.access_width; \
+ tbg.address = g.address;
+
+ TB_COPY_GAS(tboot->acpi_sinfo.pm1a_cnt_blk, fadt->xpm1a_control_block);
+ TB_COPY_GAS(tboot->acpi_sinfo.pm1b_cnt_blk, fadt->xpm1b_control_block);
+ TB_COPY_GAS(tboot->acpi_sinfo.pm1a_evt_blk, fadt->xpm1a_event_block);
+ TB_COPY_GAS(tboot->acpi_sinfo.pm1b_evt_blk, fadt->xpm1b_event_block);
+
+ /*
+ * We need phys addr of waking vector, but can't use virt_to_phys() on
+ * &acpi_gbl_FACS because it is ioremap'ed, so calc from FACS phys
+ * addr.
+ */
+ tboot->acpi_sinfo.wakeup_vector = fadt->facs +
+ offsetof(struct acpi_table_facs, firmware_waking_vector);
+}
+
+static int tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
+{
+ static u32 acpi_shutdown_map[ACPI_S_STATE_COUNT] = {
+ /* S0,1,2: */ -1, -1, -1,
+ /* S3: */ TB_SHUTDOWN_S3,
+ /* S4: */ TB_SHUTDOWN_S4,
+ /* S5: */ TB_SHUTDOWN_S5 };
+
+ if (!tboot_enabled())
+ return 0;
+
+ tboot_copy_fadt(&acpi_gbl_FADT);
+ tboot->acpi_sinfo.pm1a_cnt_val = pm1a_control;
+ tboot->acpi_sinfo.pm1b_cnt_val = pm1b_control;
+ /* we always use the 32b wakeup vector */
+ tboot->acpi_sinfo.vector_width = 32;
+
+ if (sleep_state >= ACPI_S_STATE_COUNT ||
+ acpi_shutdown_map[sleep_state] == -1) {
+ pr_warn("unsupported sleep state 0x%x\n", sleep_state);
+ return -1;
+ }
+
+ tboot_shutdown(acpi_shutdown_map[sleep_state]);
+ return 0;
+}
+
+static int tboot_extended_sleep(u8 sleep_state, u32 val_a, u32 val_b)
+{
+ if (!tboot_enabled())
+ return 0;
+
+ pr_warn("tboot is not able to suspend on platforms with reduced hardware sleep (ACPIv5)");
+ return -ENODEV;
+}
+
+static atomic_t ap_wfs_count;
+
+static int tboot_wait_for_aps(int num_aps)
+{
+ unsigned long timeout;
+
+ timeout = AP_WAIT_TIMEOUT*HZ;
+ while (atomic_read((atomic_t *)&tboot->num_in_wfs) != num_aps &&
+ timeout) {
+ mdelay(1);
+ timeout--;
+ }
+
+ if (timeout)
+ pr_warn("tboot wait for APs timeout\n");
+
+ return !(atomic_read((atomic_t *)&tboot->num_in_wfs) == num_aps);
+}
+
+static int tboot_dying_cpu(unsigned int cpu)
+{
+ atomic_inc(&ap_wfs_count);
+ if (num_online_cpus() == 1) {
+ if (tboot_wait_for_aps(atomic_read(&ap_wfs_count)))
+ return -EBUSY;
+ }
+ return 0;
+}
+
+#ifdef CONFIG_DEBUG_FS
+
+#define TBOOT_LOG_UUID { 0x26, 0x25, 0x19, 0xc0, 0x30, 0x6b, 0xb4, 0x4d, \
+ 0x4c, 0x84, 0xa3, 0xe9, 0x53, 0xb8, 0x81, 0x74 }
+
+#define TBOOT_SERIAL_LOG_ADDR 0x60000
+#define TBOOT_SERIAL_LOG_SIZE 0x08000
+#define LOG_MAX_SIZE_OFF 16
+#define LOG_BUF_OFF 24
+
+static uint8_t tboot_log_uuid[16] = TBOOT_LOG_UUID;
+
+static ssize_t tboot_log_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos)
+{
+ void __iomem *log_base;
+ u8 log_uuid[16];
+ u32 max_size;
+ void *kbuf;
+ int ret = -EFAULT;
+
+ log_base = ioremap(TBOOT_SERIAL_LOG_ADDR, TBOOT_SERIAL_LOG_SIZE);
+ if (!log_base)
+ return ret;
+
+ memcpy_fromio(log_uuid, log_base, sizeof(log_uuid));
+ if (memcmp(&tboot_log_uuid, log_uuid, sizeof(log_uuid)))
+ goto err_iounmap;
+
+ max_size = readl(log_base + LOG_MAX_SIZE_OFF);
+ if (*ppos >= max_size) {
+ ret = 0;
+ goto err_iounmap;
+ }
+
+ if (*ppos + count > max_size)
+ count = max_size - *ppos;
+
+ kbuf = kmalloc(count, GFP_KERNEL);
+ if (!kbuf) {
+ ret = -ENOMEM;
+ goto err_iounmap;
+ }
+
+ memcpy_fromio(kbuf, log_base + LOG_BUF_OFF + *ppos, count);
+ if (copy_to_user(user_buf, kbuf, count))
+ goto err_kfree;
+
+ *ppos += count;
+
+ ret = count;
+
+err_kfree:
+ kfree(kbuf);
+
+err_iounmap:
+ iounmap(log_base);
+
+ return ret;
+}
+
+static const struct file_operations tboot_log_fops = {
+ .read = tboot_log_read,
+ .llseek = default_llseek,
+};
+
+#endif /* CONFIG_DEBUG_FS */
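
Usage note (path assumed from arch_debugfs_dir, the x86 directory under debugfs): once tboot_late_init() below has created the file, the log served by tboot_log_read() can typically be dumped with "cat /sys/kernel/debug/x86/tboot_log", provided debugfs is mounted in its usual location.
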
+
+static __init int tboot_late_init(void)
+{
+ if (!tboot_enabled())
+ return 0;
+
+ tboot_create_trampoline();
+
+ atomic_set(&ap_wfs_count, 0);
+ cpuhp_setup_state(CPUHP_AP_X86_TBOOT_DYING, "x86/tboot:dying", NULL,
+ tboot_dying_cpu);
+#ifdef CONFIG_DEBUG_FS
+ debugfs_create_file("tboot_log", S_IRUSR,
+ arch_debugfs_dir, NULL, &tboot_log_fops);
+#endif
+
+ acpi_os_set_prepare_sleep(&tboot_sleep);
+ acpi_os_set_prepare_extended_sleep(&tboot_extended_sleep);
+ return 0;
+}
+
+late_initcall(tboot_late_init);
+
+/*
+ * TXT configuration registers (offsets from TXT_{PUB, PRIV}_CONFIG_REGS_BASE)
+ */
+
+#define TXT_PUB_CONFIG_REGS_BASE 0xfed30000
+#define TXT_PRIV_CONFIG_REGS_BASE 0xfed20000
+
+/* # pages for each config regs space - used by fixmap */
+#define NR_TXT_CONFIG_PAGES ((TXT_PUB_CONFIG_REGS_BASE - \
+ TXT_PRIV_CONFIG_REGS_BASE) >> PAGE_SHIFT)
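
For reference, with 4 KiB pages this works out to (0xfed30000 - 0xfed20000) >> 12 = 16 pages, i.e. a 64 KiB window; tboot_get_dmar_table() below ioremaps the same amount for the public register space.
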
+
+/* offsets from pub/priv config space */
+#define TXTCR_HEAP_BASE 0x0300
+#define TXTCR_HEAP_SIZE 0x0308
+
+#define SHA1_SIZE 20
+
+struct sha1_hash {
+ u8 hash[SHA1_SIZE];
+};
+
+struct sinit_mle_data {
+ u32 version; /* currently 6 */
+ struct sha1_hash bios_acm_id;
+ u32 edx_senter_flags;
+ u64 mseg_valid;
+ struct sha1_hash sinit_hash;
+ struct sha1_hash mle_hash;
+ struct sha1_hash stm_hash;
+ struct sha1_hash lcp_policy_hash;
+ u32 lcp_policy_control;
+ u32 rlp_wakeup_addr;
+ u32 reserved;
+ u32 num_mdrs;
+ u32 mdrs_off;
+ u32 num_vtd_dmars;
+ u32 vtd_dmars_off;
+} __packed;
+
+struct acpi_table_header *tboot_get_dmar_table(struct acpi_table_header *dmar_tbl)
+{
+ void *heap_base, *heap_ptr, *config;
+
+ if (!tboot_enabled())
+ return dmar_tbl;
+
+ /*
+ * ACPI tables may not be DMA protected by tboot, so use the DMAR
+ * copy that SINIT saved in SinitMleData in the TXT heap (which is
+ * DMA protected)
+ */
+
+ /* map config space in order to get heap addr */
+ config = ioremap(TXT_PUB_CONFIG_REGS_BASE, NR_TXT_CONFIG_PAGES *
+ PAGE_SIZE);
+ if (!config)
+ return NULL;
+
+ /* now map TXT heap */
+ heap_base = ioremap(*(u64 *)(config + TXTCR_HEAP_BASE),
+ *(u64 *)(config + TXTCR_HEAP_SIZE));
+ iounmap(config);
+ if (!heap_base)
+ return NULL;
+
+ /* walk heap to SinitMleData */
+ /* skip BiosData */
+ heap_ptr = heap_base + *(u64 *)heap_base;
+ /* skip OsMleData */
+ heap_ptr += *(u64 *)heap_ptr;
+ /* skip OsSinitData */
+ heap_ptr += *(u64 *)heap_ptr;
+ /* now points to SinitMleDataSize; set to SinitMleData */
+ heap_ptr += sizeof(u64);
+ /* get addr of DMAR table */
+ dmar_tbl = (struct acpi_table_header *)(heap_ptr +
+ ((struct sinit_mle_data *)heap_ptr)->vtd_dmars_off -
+ sizeof(u64));
+
+ /* don't unmap heap because dmar.c needs access to this */
+
+ return dmar_tbl;
+}
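
The heap walk above assumes the usual TXT heap layout: BiosData, OsMleData, OsSinitData and SinitMleData regions laid back to back, each preceded by a u64 size field that includes the size field itself. A minimal sketch of the same stepping (txt_heap_next_region is a hypothetical helper, not a kernel API):

	/* the first u64 of a TXT heap region is its total size in bytes */
	static void *txt_heap_next_region(void *region)
	{
		return region + *(u64 *)region;
	}

Stepping over three regions this way and then past the final u64 size field is how heap_ptr ends up pointing at struct sinit_mle_data.
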
diff --git a/arch/x86/kernel/tce_64.c b/arch/x86/kernel/tce_64.c
deleted file mode 100644
index 9e540fee7009..000000000000
--- a/arch/x86/kernel/tce_64.c
+++ /dev/null
@@ -1,189 +0,0 @@
-/*
- * This file manages the translation entries for the IBM Calgary IOMMU.
- *
- * Derived from arch/powerpc/platforms/pseries/iommu.c
- *
- * Copyright (C) IBM Corporation, 2006
- *
- * Author: Jon Mason <jdmason@us.ibm.com>
- * Author: Muli Ben-Yehuda <muli@il.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/mm.h>
-#include <linux/spinlock.h>
-#include <linux/string.h>
-#include <linux/pci.h>
-#include <linux/dma-mapping.h>
-#include <linux/bootmem.h>
-#include <asm/tce.h>
-#include <asm/calgary.h>
-#include <asm/proto.h>
-
-/* flush a tce at 'tceaddr' to main memory */
-static inline void flush_tce(void* tceaddr)
-{
- /* a single tce can't cross a cache line */
- if (cpu_has_clflush)
- clflush(tceaddr);
- else
- wbinvd();
-}
-
-void tce_build(struct iommu_table *tbl, unsigned long index,
- unsigned int npages, unsigned long uaddr, int direction)
-{
- u64* tp;
- u64 t;
- u64 rpn;
-
- t = (1 << TCE_READ_SHIFT);
- if (direction != DMA_TO_DEVICE)
- t |= (1 << TCE_WRITE_SHIFT);
-
- tp = ((u64*)tbl->it_base) + index;
-
- while (npages--) {
- rpn = (virt_to_bus((void*)uaddr)) >> PAGE_SHIFT;
- t &= ~TCE_RPN_MASK;
- t |= (rpn << TCE_RPN_SHIFT);
-
- *tp = cpu_to_be64(t);
- flush_tce(tp);
-
- uaddr += PAGE_SIZE;
- tp++;
- }
-}
-
-void tce_free(struct iommu_table *tbl, long index, unsigned int npages)
-{
- u64* tp;
-
- tp = ((u64*)tbl->it_base) + index;
-
- while (npages--) {
- *tp = cpu_to_be64(0);
- flush_tce(tp);
- tp++;
- }
-}
-
-static inline unsigned int table_size_to_number_of_entries(unsigned char size)
-{
- /*
- * size is the order of the table, 0-7
- * smallest table is 8K entries, so shift result by 13 to
- * multiply by 8K
- */
- return (1 << size) << 13;
-}
-
-static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl)
-{
- unsigned int bitmapsz;
- unsigned long bmppages;
- int ret;
-
- tbl->it_busno = dev->bus->number;
-
- /* set the tce table size - measured in entries */
- tbl->it_size = table_size_to_number_of_entries(specified_table_size);
-
- /*
- * number of bytes needed for the bitmap size in number of
- * entries; we need one bit per entry
- */
- bitmapsz = tbl->it_size / BITS_PER_BYTE;
- bmppages = __get_free_pages(GFP_KERNEL, get_order(bitmapsz));
- if (!bmppages) {
- printk(KERN_ERR "Calgary: cannot allocate bitmap\n");
- ret = -ENOMEM;
- goto done;
- }
-
- tbl->it_map = (unsigned long*)bmppages;
-
- memset(tbl->it_map, 0, bitmapsz);
-
- tbl->it_hint = 0;
-
- spin_lock_init(&tbl->it_lock);
-
- return 0;
-
-done:
- return ret;
-}
-
-int __init build_tce_table(struct pci_dev *dev, void __iomem *bbar)
-{
- struct iommu_table *tbl;
- int ret;
-
- if (pci_iommu(dev->bus)) {
- printk(KERN_ERR "Calgary: dev %p has sysdata->iommu %p\n",
- dev, pci_iommu(dev->bus));
- BUG();
- }
-
- tbl = kzalloc(sizeof(struct iommu_table), GFP_KERNEL);
- if (!tbl) {
- printk(KERN_ERR "Calgary: error allocating iommu_table\n");
- ret = -ENOMEM;
- goto done;
- }
-
- ret = tce_table_setparms(dev, tbl);
- if (ret)
- goto free_tbl;
-
- tbl->bbar = bbar;
-
- set_pci_iommu(dev->bus, tbl);
-
- return 0;
-
-free_tbl:
- kfree(tbl);
-done:
- return ret;
-}
-
-void * __init alloc_tce_table(void)
-{
- unsigned int size;
-
- size = table_size_to_number_of_entries(specified_table_size);
- size *= TCE_ENTRY_SIZE;
-
- return __alloc_bootmem_low(size, size, 0);
-}
-
-void __init free_tce_table(void *tbl)
-{
- unsigned int size;
-
- if (!tbl)
- return;
-
- size = table_size_to_number_of_entries(specified_table_size);
- size *= TCE_ENTRY_SIZE;
-
- free_bootmem(__pa(tbl), size);
-}
diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c
deleted file mode 100644
index 787a5e499dd1..000000000000
--- a/arch/x86/kernel/test_nx.c
+++ /dev/null
@@ -1,175 +0,0 @@
-/*
- * test_nx.c: functional test for NX functionality
- *
- * (C) Copyright 2008 Intel Corporation
- * Author: Arjan van de Ven <arjan@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- */
-#include <linux/module.h>
-#include <linux/sort.h>
-#include <linux/slab.h>
-
-#include <asm/uaccess.h>
-#include <asm/asm.h>
-
-extern int rodata_test_data;
-
-/*
- * This file checks 4 things:
- * 1) Check if the stack is not executable
- * 2) Check if kmalloc memory is not executable
- * 3) Check if the .rodata section is not executable
- * 4) Check if the .data section of a module is not executable
- *
- * To do this, the test code tries to execute memory in stack/kmalloc/etc,
- * and then checks if the expected trap happens.
- *
- * Sadly, this implies having a dynamic exception handling table entry.
- * ... which can be done (and will make Rusty cry)... but it can only
- * be done in a stand-alone module with only 1 entry total.
- * (otherwise we'd have to sort and that's just too messy)
- */
-
-
-
-/*
- * We want to set up an exception handling point on our stack,
- * which means a variable value. This function is rather dirty
- * and walks the exception table of the module, looking for a magic
- * marker and replaces it with a specific function.
- */
-static void fudze_exception_table(void *marker, void *new)
-{
- struct module *mod = THIS_MODULE;
- struct exception_table_entry *extable;
-
- /*
- * Note: This module has only 1 exception table entry,
- * so searching and sorting is not needed. If that changes,
- * this would be the place to search and re-sort the exception
- * table.
- */
- if (mod->num_exentries > 1) {
- printk(KERN_ERR "test_nx: too many exception table entries!\n");
- printk(KERN_ERR "test_nx: test results are not reliable.\n");
- return;
- }
- extable = (struct exception_table_entry *)mod->extable;
- extable[0].insn = (unsigned long)new;
-}
-
-
-/*
- * exception tables get their symbols translated so we need
- * to use a fake function to put in there, which we can then
- * replace at runtime.
- */
-void foo_label(void);
-
-/*
- * returns 0 for not-executable, negative for executable
- *
- * Note: we cannot allow this function to be inlined, because
- * that would give us more than 1 exception table entry.
- * This in turn would break the assumptions above.
- */
-static noinline int test_address(void *address)
-{
- unsigned long result;
-
- /* Set up an exception table entry for our address */
- fudze_exception_table(&foo_label, address);
- result = 1;
- asm volatile(
- "foo_label:\n"
- "0: call *%[fake_code]\n"
- "1:\n"
- ".section .fixup,\"ax\"\n"
- "2: mov %[zero], %[rslt]\n"
- " ret\n"
- ".previous\n"
- _ASM_EXTABLE(0b,2b)
- : [rslt] "=r" (result)
- : [fake_code] "r" (address), [zero] "r" (0UL), "0" (result)
- );
- /* change the exception table back for the next round */
- fudze_exception_table(address, &foo_label);
-
- if (result)
- return -ENODEV;
- return 0;
-}
-
-static unsigned char test_data = 0xC3; /* 0xC3 is the opcode for "ret" */
-
-static int test_NX(void)
-{
- int ret = 0;
- /* 0xC3 is the opcode for "ret" */
- char stackcode[] = {0xC3, 0x90, 0 };
- char *heap;
-
- test_data = 0xC3;
-
- printk(KERN_INFO "Testing NX protection\n");
-
- /* Test 1: check if the stack is not executable */
- if (test_address(&stackcode)) {
- printk(KERN_ERR "test_nx: stack was executable\n");
- ret = -ENODEV;
- }
-
-
- /* Test 2: Check if the heap is executable */
- heap = kmalloc(64, GFP_KERNEL);
- if (!heap)
- return -ENOMEM;
- heap[0] = 0xC3; /* opcode for "ret" */
-
- if (test_address(heap)) {
- printk(KERN_ERR "test_nx: heap was executable\n");
- ret = -ENODEV;
- }
- kfree(heap);
-
- /*
- * The following 2 tests currently fail, this needs to get fixed
- * Until then, don't run them to avoid too many people getting scared
- * by the error message
- */
-
-#ifdef CONFIG_DEBUG_RODATA
- /* Test 3: Check if the .rodata section is executable */
- if (rodata_test_data != 0xC3) {
- printk(KERN_ERR "test_nx: .rodata marker has invalid value\n");
- ret = -ENODEV;
- } else if (test_address(&rodata_test_data)) {
- printk(KERN_ERR "test_nx: .rodata section is executable\n");
- ret = -ENODEV;
- }
-#endif
-
-#if 0
- /* Test 4: Check if the .data section of a module is executable */
- if (test_address(&test_data)) {
- printk(KERN_ERR "test_nx: .data section is executable\n");
- ret = -ENODEV;
- }
-
-#endif
- return 0;
-}
-
-static void test_exit(void)
-{
-}
-
-module_init(test_NX);
-module_exit(test_exit);
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Testcase for the NX infrastructure");
-MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c
deleted file mode 100644
index c29e235792af..000000000000
--- a/arch/x86/kernel/test_rodata.c
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * test_rodata.c: functional test for mark_rodata_ro function
- *
- * (C) Copyright 2008 Intel Corporation
- * Author: Arjan van de Ven <arjan@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- */
-#include <linux/module.h>
-#include <asm/cacheflush.h>
-#include <asm/sections.h>
-
-int rodata_test(void)
-{
- unsigned long result;
- unsigned long start, end;
-
- /* test 1: read the value */
- /* If this test fails, some previous testrun has clobbered the state */
- if (!rodata_test_data) {
- printk(KERN_ERR "rodata_test: test 1 fails (start data)\n");
- return -ENODEV;
- }
-
- /* test 2: write to the variable; this should fault */
- /*
- * If this test fails, we managed to overwrite the data
- *
- * This is written in assembly to be able to catch the
- * exception that is supposed to happen in the correct
- * case
- */
-
- result = 1;
- asm volatile(
- "0: mov %[zero],(%[rodata_test])\n"
- " mov %[zero], %[rslt]\n"
- "1:\n"
- ".section .fixup,\"ax\"\n"
- "2: jmp 1b\n"
- ".previous\n"
- ".section __ex_table,\"a\"\n"
- " .align 16\n"
-#ifdef CONFIG_X86_32
- " .long 0b,2b\n"
-#else
- " .quad 0b,2b\n"
-#endif
- ".previous"
- : [rslt] "=r" (result)
- : [rodata_test] "r" (&rodata_test_data), [zero] "r" (0UL)
- );
-
-
- if (!result) {
- printk(KERN_ERR "rodata_test: test data was not read only\n");
- return -ENODEV;
- }
-
- /* test 3: check the value hasn't changed */
- /* If this test fails, we managed to overwrite the data */
- if (!rodata_test_data) {
- printk(KERN_ERR "rodata_test: Test 3 failes (end data)\n");
- return -ENODEV;
- }
- /* test 4: check if the rodata section is 4Kb aligned */
- start = (unsigned long)__start_rodata;
- end = (unsigned long)__end_rodata;
- if (start & (PAGE_SIZE - 1)) {
- printk(KERN_ERR "rodata_test: .rodata is not 4k aligned\n");
- return -ENODEV;
- }
- if (end & (PAGE_SIZE - 1)) {
- printk(KERN_ERR "rodata_test: .rodata end is not 4k aligned\n");
- return -ENODEV;
- }
-
- return 0;
-}
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Testcase for the DEBUG_RODATA infrastructure");
-MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
new file mode 100644
index 000000000000..52e1f3f0b361
--- /dev/null
+++ b/arch/x86/kernel/time.c
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 1991,1992,1995 Linus Torvalds
+ * Copyright (c) 1994 Alan Modra
+ * Copyright (c) 1995 Markus Kuhn
+ * Copyright (c) 1996 Ingo Molnar
+ * Copyright (c) 1998 Andrea Arcangeli
+ * Copyright (c) 2002,2006 Vojtech Pavlik
+ * Copyright (c) 2003 Andi Kleen
+ *
+ */
+
+#include <linux/clocksource.h>
+#include <linux/clockchips.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/i8253.h>
+#include <linux/time.h>
+#include <linux/export.h>
+
+#include <asm/vsyscall.h>
+#include <asm/x86_init.h>
+#include <asm/i8259.h>
+#include <asm/timer.h>
+#include <asm/hpet.h>
+#include <asm/time.h>
+
+unsigned long profile_pc(struct pt_regs *regs)
+{
+ return instruction_pointer(regs);
+}
+EXPORT_SYMBOL(profile_pc);
+
+/*
+ * Default timer interrupt handler for PIT/HPET
+ */
+static irqreturn_t timer_interrupt(int irq, void *dev_id)
+{
+ global_clock_event->event_handler(global_clock_event);
+ return IRQ_HANDLED;
+}
+
+static void __init setup_default_timer_irq(void)
+{
+ unsigned long flags = IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER;
+
+ /*
+ * Unconditionally register the legacy timer interrupt; even
+ * without legacy PIC/PIT we need this for the HPET0 in legacy
+ * replacement mode.
+ */
+ if (request_irq(0, timer_interrupt, flags, "timer", NULL))
+ pr_info("Failed to register legacy timer interrupt\n");
+}
+
+/* Default timer init function */
+void __init hpet_time_init(void)
+{
+ if (!hpet_enable()) {
+ if (!pit_timer_init())
+ return;
+ }
+
+ setup_default_timer_irq();
+}
+
+static __init void x86_late_time_init(void)
+{
+ /*
+ * Before PIT/HPET init, select the interrupt mode. This is required
+ * to correctly decide whether the PIT should be initialized.
+ */
+ x86_init.irqs.intr_mode_select();
+
+ /* Setup the legacy timers */
+ x86_init.timers.timer_init();
+
+ /*
+ * After PIT/HPET timers init, set up the final interrupt mode for
+ * delivering IRQs.
+ */
+ x86_init.irqs.intr_mode_init();
+ tsc_init();
+
+ if (static_cpu_has(X86_FEATURE_WAITPKG))
+ use_tpause_delay();
+}
+
+/*
+ * Initialize TSC and delay the periodic timer init to
+ * late x86_late_time_init() so ioremap works.
+ */
+void __init time_init(void)
+{
+ late_time_init = x86_late_time_init;
+}
+
+/*
+ * Sanity check the vdso related archdata content.
+ */
+void clocksource_arch_init(struct clocksource *cs)
+{
+ if (cs->vdso_clock_mode == VDSO_CLOCKMODE_NONE)
+ return;
+
+ if (cs->mask != CLOCKSOURCE_MASK(64)) {
+ pr_warn("clocksource %s registered with invalid mask %016llx for VDSO. Disabling VDSO support.\n",
+ cs->name, cs->mask);
+ cs->vdso_clock_mode = VDSO_CLOCKMODE_NONE;
+ }
+}
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
deleted file mode 100644
index 5c5d87f0b2e1..000000000000
--- a/arch/x86/kernel/time_32.c
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Copyright (C) 1991, 1992, 1995 Linus Torvalds
- *
- * This file contains the PC-specific time handling details:
- * reading the RTC at bootup, etc..
- * 1994-07-02 Alan Modra
- * fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
- * 1995-03-26 Markus Kuhn
- * fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
- * precision CMOS clock update
- * 1996-05-03 Ingo Molnar
- * fixed time warps in do_[slow|fast]_gettimeoffset()
- * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
- * "A Kernel Model for Precision Timekeeping" by Dave Mills
- * 1998-09-05 (Various)
- * More robust do_fast_gettimeoffset() algorithm implemented
- * (works with APM, Cyrix 6x86MX and Centaur C6),
- * monotonic gettimeofday() with fast_get_timeoffset(),
- * drift-proof precision TSC calibration on boot
- * (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D.
- * Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>;
- * ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>).
- * 1998-12-16 Andrea Arcangeli
- * Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy
- * because was not accounting lost_ticks.
- * 1998-12-24 Copyright (C) 1998 Andrea Arcangeli
- * Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
- * serialize accesses to xtime/lost_ticks).
- */
-
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/time.h>
-#include <linux/mca.h>
-
-#include <asm/setup.h>
-#include <asm/hpet.h>
-#include <asm/time.h>
-#include <asm/timer.h>
-
-#include <asm/do_timer.h>
-
-int timer_ack;
-
-unsigned long profile_pc(struct pt_regs *regs)
-{
- unsigned long pc = instruction_pointer(regs);
-
-#ifdef CONFIG_SMP
- if (!user_mode_vm(regs) && in_lock_functions(pc)) {
-#ifdef CONFIG_FRAME_POINTER
- return *(unsigned long *)(regs->bp + sizeof(long));
-#else
- unsigned long *sp = (unsigned long *)&regs->sp;
-
- /* Return address is either directly at stack pointer
- or above a saved flags. Eflags has bits 22-31 zero,
- kernel addresses don't. */
- if (sp[0] >> 22)
- return sp[0];
- if (sp[1] >> 22)
- return sp[1];
-#endif
- }
-#endif
- return pc;
-}
-EXPORT_SYMBOL(profile_pc);
-
-/*
- * This is the same as the above, except we _also_ save the current
- * Time Stamp Counter value at the time of the timer interrupt, so that
- * we later on can estimate the time of day more exactly.
- */
-irqreturn_t timer_interrupt(int irq, void *dev_id)
-{
- /* Keep nmi watchdog up to date */
- inc_irq_stat(irq0_irqs);
-
-#ifdef CONFIG_X86_IO_APIC
- if (timer_ack) {
- /*
- * Subtle, when I/O APICs are used we have to ack timer IRQ
- * manually to deassert NMI lines for the watchdog if run
- * on an 82489DX-based system.
- */
- spin_lock(&i8259A_lock);
- outb(0x0c, PIC_MASTER_OCW3);
- /* Ack the IRQ; AEOI will end it automatically. */
- inb(PIC_MASTER_POLL);
- spin_unlock(&i8259A_lock);
- }
-#endif
-
- do_timer_interrupt_hook();
-
-#ifdef CONFIG_MCA
- if (MCA_bus) {
- /* The PS/2 uses level-triggered interrupts. You can't
- turn them off, nor would you want to (any attempt to
- enable edge-triggered interrupts usually gets intercepted by a
- special hardware circuit). Hence we have to acknowledge
- the timer interrupt. Through some incredibly stupid
- design idea, the reset for IRQ 0 is done by setting the
- high bit of the PPI port B (0x61). Note that some PS/2s,
- notably the 55SX, work fine if this is removed. */
-
- u8 irq_v = inb_p(0x61); /* read the current state */
- outb_p(irq_v | 0x80, 0x61); /* reset the IRQ */
- }
-#endif
-
- return IRQ_HANDLED;
-}
-
-/* Duplicate of time_init() below, with hpet_enable part added */
-void __init hpet_time_init(void)
-{
- if (!hpet_enable())
- setup_pit_timer();
- x86_quirk_time_init();
-}
-
-/*
- * This is called directly from init code; we must delay timer setup in the
- * HPET case as we can't make the decision to turn on HPET this early in the
- * boot process.
- *
- * The chosen time_init function will usually be hpet_time_init, above, but
- * in the case of virtual hardware, an alternative function may be substituted.
- */
-void __init time_init(void)
-{
- x86_quirk_pre_time_init();
- tsc_init();
- late_time_init = choose_time_init();
-}
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
deleted file mode 100644
index 5ba343e61844..000000000000
--- a/arch/x86/kernel/time_64.c
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * "High Precision Event Timer" based timekeeping.
- *
- * Copyright (c) 1991,1992,1995 Linus Torvalds
- * Copyright (c) 1994 Alan Modra
- * Copyright (c) 1995 Markus Kuhn
- * Copyright (c) 1996 Ingo Molnar
- * Copyright (c) 1998 Andrea Arcangeli
- * Copyright (c) 2002,2006 Vojtech Pavlik
- * Copyright (c) 2003 Andi Kleen
- * RTC support code taken from arch/i386/kernel/timers/time_hpet.c
- */
-
-#include <linux/clockchips.h>
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/module.h>
-#include <linux/time.h>
-#include <linux/mca.h>
-#include <linux/nmi.h>
-
-#include <asm/i8253.h>
-#include <asm/hpet.h>
-#include <asm/vgtod.h>
-#include <asm/time.h>
-#include <asm/timer.h>
-
-volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
-
-unsigned long profile_pc(struct pt_regs *regs)
-{
- unsigned long pc = instruction_pointer(regs);
-
- /* Assume the lock function has either no stack frame or a copy
- of flags from PUSHF
- Eflags always has bits 22 and up cleared unlike kernel addresses. */
- if (!user_mode_vm(regs) && in_lock_functions(pc)) {
-#ifdef CONFIG_FRAME_POINTER
- return *(unsigned long *)(regs->bp + sizeof(long));
-#else
- unsigned long *sp = (unsigned long *)regs->sp;
- if (sp[0] >> 22)
- return sp[0];
- if (sp[1] >> 22)
- return sp[1];
-#endif
- }
- return pc;
-}
-EXPORT_SYMBOL(profile_pc);
-
-static irqreturn_t timer_interrupt(int irq, void *dev_id)
-{
- inc_irq_stat(irq0_irqs);
-
- global_clock_event->event_handler(global_clock_event);
-
-#ifdef CONFIG_MCA
- if (MCA_bus) {
- u8 irq_v = inb_p(0x61); /* read the current state */
- outb_p(irq_v|0x80, 0x61); /* reset the IRQ */
- }
-#endif
-
- return IRQ_HANDLED;
-}
-
-/* calibrate_cpu is used on systems with fixed rate TSCs to determine
- * processor frequency */
-#define TICK_COUNT 100000000
-unsigned long __init calibrate_cpu(void)
-{
- int tsc_start, tsc_now;
- int i, no_ctr_free;
- unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0;
- unsigned long flags;
-
- for (i = 0; i < 4; i++)
- if (avail_to_resrv_perfctr_nmi_bit(i))
- break;
- no_ctr_free = (i == 4);
- if (no_ctr_free) {
- WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... "
- "cpu_khz value may be incorrect.\n");
- i = 3;
- rdmsrl(MSR_K7_EVNTSEL3, evntsel3);
- wrmsrl(MSR_K7_EVNTSEL3, 0);
- rdmsrl(MSR_K7_PERFCTR3, pmc3);
- } else {
- reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i);
- reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
- }
- local_irq_save(flags);
- /* start measuring cycles, incrementing from 0 */
- wrmsrl(MSR_K7_PERFCTR0 + i, 0);
- wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76);
- rdtscl(tsc_start);
- do {
- rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now);
- tsc_now = get_cycles();
- } while ((tsc_now - tsc_start) < TICK_COUNT);
-
- local_irq_restore(flags);
- if (no_ctr_free) {
- wrmsrl(MSR_K7_EVNTSEL3, 0);
- wrmsrl(MSR_K7_PERFCTR3, pmc3);
- wrmsrl(MSR_K7_EVNTSEL3, evntsel3);
- } else {
- release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
- release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
- }
-
- return pmc_now * tsc_khz / (tsc_now - tsc_start);
-}
-
-static struct irqaction irq0 = {
- .handler = timer_interrupt,
- .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING | IRQF_TIMER,
- .name = "timer"
-};
-
-void __init hpet_time_init(void)
-{
- if (!hpet_enable())
- setup_pit_timer();
-
- setup_irq(0, &irq0);
-}
-
-void __init time_init(void)
-{
- tsc_init();
-
- late_time_init = choose_time_init();
-}
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
deleted file mode 100644
index 8ccabb8a2f6a..000000000000
--- a/arch/x86/kernel/tlb_uv.c
+++ /dev/null
@@ -1,868 +0,0 @@
-/*
- * SGI UltraViolet TLB flush routines.
- *
- * (c) 2008 Cliff Wickman <cpw@sgi.com>, SGI.
- *
- * This code is released under the GNU General Public License version 2 or
- * later.
- */
-#include <linux/seq_file.h>
-#include <linux/proc_fs.h>
-#include <linux/kernel.h>
-
-#include <asm/mmu_context.h>
-#include <asm/uv/uv.h>
-#include <asm/uv/uv_mmrs.h>
-#include <asm/uv/uv_hub.h>
-#include <asm/uv/uv_bau.h>
-#include <asm/apic.h>
-#include <asm/idle.h>
-#include <asm/tsc.h>
-#include <asm/irq_vectors.h>
-
-static struct bau_control **uv_bau_table_bases __read_mostly;
-static int uv_bau_retry_limit __read_mostly;
-
-/* position of pnode (which is nasid>>1): */
-static int uv_nshift __read_mostly;
-/* base pnode in this partition */
-static int uv_partition_base_pnode __read_mostly;
-
-static unsigned long uv_mmask __read_mostly;
-
-static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
-static DEFINE_PER_CPU(struct bau_control, bau_control);
-
-/*
- * Determine the first node on a blade.
- */
-static int __init blade_to_first_node(int blade)
-{
- int node, b;
-
- for_each_online_node(node) {
- b = uv_node_to_blade_id(node);
- if (blade == b)
- return node;
- }
- return -1; /* shouldn't happen */
-}
-
-/*
- * Determine the apicid of the first cpu on a blade.
- */
-static int __init blade_to_first_apicid(int blade)
-{
- int cpu;
-
- for_each_present_cpu(cpu)
- if (blade == uv_cpu_to_blade_id(cpu))
- return per_cpu(x86_cpu_to_apicid, cpu);
- return -1;
-}
-
-/*
- * Free a software acknowledge hardware resource by clearing its Pending
- * bit. This will return a reply to the sender.
- * If the message has timed out, a reply has already been sent by the
- * hardware but the resource has not been released. In that case our
- * clear of the Timeout bit (as well) will free the resource. No reply will
- * be sent (the hardware will only do one reply per message).
- */
-static void uv_reply_to_message(int resource,
- struct bau_payload_queue_entry *msg,
- struct bau_msg_status *msp)
-{
- unsigned long dw;
-
- dw = (1 << (resource + UV_SW_ACK_NPENDING)) | (1 << resource);
- msg->replied_to = 1;
- msg->sw_ack_vector = 0;
- if (msp)
- msp->seen_by.bits = 0;
- uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw);
-}
-
-/*
- * Do all the things a cpu should do for a TLB shootdown message.
- * Other cpu's may come here at the same time for this message.
- */
-static void uv_bau_process_message(struct bau_payload_queue_entry *msg,
- int msg_slot, int sw_ack_slot)
-{
- unsigned long this_cpu_mask;
- struct bau_msg_status *msp;
- int cpu;
-
- msp = __get_cpu_var(bau_control).msg_statuses + msg_slot;
- cpu = uv_blade_processor_id();
- msg->number_of_cpus =
- uv_blade_nr_online_cpus(uv_node_to_blade_id(numa_node_id()));
- this_cpu_mask = 1UL << cpu;
- if (msp->seen_by.bits & this_cpu_mask)
- return;
- atomic_or_long(&msp->seen_by.bits, this_cpu_mask);
-
- if (msg->replied_to == 1)
- return;
-
- if (msg->address == TLB_FLUSH_ALL) {
- local_flush_tlb();
- __get_cpu_var(ptcstats).alltlb++;
- } else {
- __flush_tlb_one(msg->address);
- __get_cpu_var(ptcstats).onetlb++;
- }
-
- __get_cpu_var(ptcstats).requestee++;
-
- atomic_inc_short(&msg->acknowledge_count);
- if (msg->number_of_cpus == msg->acknowledge_count)
- uv_reply_to_message(sw_ack_slot, msg, msp);
-}
-
-/*
- * Examine the payload queue on one distribution node to see
- * which messages have not been seen, and which cpu(s) have not seen them.
- *
- * Returns the number of cpu's that have not responded.
- */
-static int uv_examine_destination(struct bau_control *bau_tablesp, int sender)
-{
- struct bau_payload_queue_entry *msg;
- struct bau_msg_status *msp;
- int count = 0;
- int i;
- int j;
-
- for (msg = bau_tablesp->va_queue_first, i = 0; i < DEST_Q_SIZE;
- msg++, i++) {
- if ((msg->sending_cpu == sender) && (!msg->replied_to)) {
- msp = bau_tablesp->msg_statuses + i;
- printk(KERN_DEBUG
- "blade %d: address:%#lx %d of %d, not cpu(s): ",
- i, msg->address, msg->acknowledge_count,
- msg->number_of_cpus);
- for (j = 0; j < msg->number_of_cpus; j++) {
- if (!((1L << j) & msp->seen_by.bits)) {
- count++;
- printk("%d ", j);
- }
- }
- printk("\n");
- }
- }
- return count;
-}
-
-/*
- * Examine the payload queue on all the distribution nodes to see
- * which messages have not been seen, and which cpu(s) have not seen them.
- *
- * Returns the number of cpu's that have not responded.
- */
-static int uv_examine_destinations(struct bau_target_nodemask *distribution)
-{
- int sender;
- int i;
- int count = 0;
-
- sender = smp_processor_id();
- for (i = 0; i < sizeof(struct bau_target_nodemask) * BITSPERBYTE; i++) {
- if (!bau_node_isset(i, distribution))
- continue;
- count += uv_examine_destination(uv_bau_table_bases[i], sender);
- }
- return count;
-}
-
-/*
- * wait for completion of a broadcast message
- *
- * return COMPLETE, RETRY or GIVEUP
- */
-static int uv_wait_completion(struct bau_desc *bau_desc,
- unsigned long mmr_offset, int right_shift)
-{
- int exams = 0;
- long destination_timeouts = 0;
- long source_timeouts = 0;
- unsigned long descriptor_status;
-
- while ((descriptor_status = (((unsigned long)
- uv_read_local_mmr(mmr_offset) >>
- right_shift) & UV_ACT_STATUS_MASK)) !=
- DESC_STATUS_IDLE) {
- if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) {
- source_timeouts++;
- if (source_timeouts > SOURCE_TIMEOUT_LIMIT)
- source_timeouts = 0;
- __get_cpu_var(ptcstats).s_retry++;
- return FLUSH_RETRY;
- }
- /*
- * spin here looking for progress at the destinations
- */
- if (descriptor_status == DESC_STATUS_DESTINATION_TIMEOUT) {
- destination_timeouts++;
- if (destination_timeouts > DESTINATION_TIMEOUT_LIMIT) {
- /*
- * returns number of cpus not responding
- */
- if (uv_examine_destinations
- (&bau_desc->distribution) == 0) {
- __get_cpu_var(ptcstats).d_retry++;
- return FLUSH_RETRY;
- }
- exams++;
- if (exams >= uv_bau_retry_limit) {
- printk(KERN_DEBUG
- "uv_flush_tlb_others");
- printk("giving up on cpu %d\n",
- smp_processor_id());
- return FLUSH_GIVEUP;
- }
- /*
- * delays can hang the simulator
- udelay(1000);
- */
- destination_timeouts = 0;
- }
- }
- cpu_relax();
- }
- return FLUSH_COMPLETE;
-}
-
-/**
- * uv_flush_send_and_wait
- *
- * Send a broadcast and wait for a broadcast message to complete.
- *
- * The flush_mask contains the cpus the broadcast was sent to.
- *
- * Returns NULL if all remote flushing was done. The mask is zeroed.
- * Returns @flush_mask if some remote flushing remains to be done. The
- * mask will have some bits still set.
- */
-const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode,
- struct bau_desc *bau_desc,
- struct cpumask *flush_mask)
-{
- int completion_status = 0;
- int right_shift;
- int tries = 0;
- int pnode;
- int bit;
- unsigned long mmr_offset;
- unsigned long index;
- cycles_t time1;
- cycles_t time2;
-
- if (cpu < UV_CPUS_PER_ACT_STATUS) {
- mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
- right_shift = cpu * UV_ACT_STATUS_SIZE;
- } else {
- mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
- right_shift =
- ((cpu - UV_CPUS_PER_ACT_STATUS) * UV_ACT_STATUS_SIZE);
- }
- time1 = get_cycles();
- do {
- tries++;
- index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) |
- cpu;
- uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
- completion_status = uv_wait_completion(bau_desc, mmr_offset,
- right_shift);
- } while (completion_status == FLUSH_RETRY);
- time2 = get_cycles();
- __get_cpu_var(ptcstats).sflush += (time2 - time1);
- if (tries > 1)
- __get_cpu_var(ptcstats).retriesok++;
-
- if (completion_status == FLUSH_GIVEUP) {
- /*
- * Cause the caller to do an IPI-style TLB shootdown on
- * the cpu's, all of which are still in the mask.
- */
- __get_cpu_var(ptcstats).ptc_i++;
- return flush_mask;
- }
-
- /*
- * Success, so clear the remote cpu's from the mask so we don't
- * use the IPI method of shootdown on them.
- */
- for_each_cpu(bit, flush_mask) {
- pnode = uv_cpu_to_pnode(bit);
- if (pnode == this_pnode)
- continue;
- cpumask_clear_cpu(bit, flush_mask);
- }
- if (!cpumask_empty(flush_mask))
- return flush_mask;
- return NULL;
-}
-
-static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
-
-/**
- * uv_flush_tlb_others - globally purge translation cache of a virtual
- * address or all TLB's
- * @cpumask: mask of all cpu's in which the address is to be removed
- * @mm: mm_struct containing virtual address range
- * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu)
- * @cpu: the current cpu
- *
- * This is the entry point for initiating any UV global TLB shootdown.
- *
- * Purges the translation caches of all specified processors of the given
- * virtual address, or purges all TLB's on specified processors.
- *
- * The caller has derived the cpumask from the mm_struct. This function
- * is called only if there are bits set in the mask. (e.g. flush_tlb_page())
- *
- * The cpumask is converted into a nodemask of the nodes containing
- * the cpus.
- *
- * Note that this function should be called with preemption disabled.
- *
- * Returns NULL if all remote flushing was done.
- * Returns pointer to cpumask if some remote flushing remains to be
- * done. The returned pointer is valid till preemption is re-enabled.
- */
-const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
- struct mm_struct *mm,
- unsigned long va, unsigned int cpu)
-{
- struct cpumask *flush_mask = __get_cpu_var(uv_flush_tlb_mask);
- int i;
- int bit;
- int pnode;
- int uv_cpu;
- int this_pnode;
- int locals = 0;
- struct bau_desc *bau_desc;
-
- cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
-
- uv_cpu = uv_blade_processor_id();
- this_pnode = uv_hub_info->pnode;
- bau_desc = __get_cpu_var(bau_control).descriptor_base;
- bau_desc += UV_ITEMS_PER_DESCRIPTOR * uv_cpu;
-
- bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
-
- i = 0;
- for_each_cpu(bit, flush_mask) {
- pnode = uv_cpu_to_pnode(bit);
- BUG_ON(pnode > (UV_DISTRIBUTION_SIZE - 1));
- if (pnode == this_pnode) {
- locals++;
- continue;
- }
- bau_node_set(pnode - uv_partition_base_pnode,
- &bau_desc->distribution);
- i++;
- }
- if (i == 0) {
- /*
- * no off_node flushing; return status for local node
- */
- if (locals)
- return flush_mask;
- else
- return NULL;
- }
- __get_cpu_var(ptcstats).requestor++;
- __get_cpu_var(ptcstats).ntargeted += i;
-
- bau_desc->payload.address = va;
- bau_desc->payload.sending_cpu = cpu;
-
- return uv_flush_send_and_wait(uv_cpu, this_pnode, bau_desc, flush_mask);
-}
-
-/*
- * The BAU message interrupt comes here. (registered by set_intr_gate)
- * See entry_64.S
- *
- * We received a broadcast assist message.
- *
- * Interrupts may have been disabled; this interrupt could represent
- * the receipt of several messages.
- *
- * All cores/threads on this node get this interrupt.
- * The last one to see it does the s/w ack.
- * (the resource will not be freed until non-interruptible cpus see this
- * interrupt; hardware will time out the s/w ack and reply ERROR)
- */
-void uv_bau_message_interrupt(struct pt_regs *regs)
-{
- struct bau_payload_queue_entry *va_queue_first;
- struct bau_payload_queue_entry *va_queue_last;
- struct bau_payload_queue_entry *msg;
- struct pt_regs *old_regs = set_irq_regs(regs);
- cycles_t time1;
- cycles_t time2;
- int msg_slot;
- int sw_ack_slot;
- int fw;
- int count = 0;
- unsigned long local_pnode;
-
- ack_APIC_irq();
- exit_idle();
- irq_enter();
-
- time1 = get_cycles();
-
- local_pnode = uv_blade_to_pnode(uv_numa_blade_id());
-
- va_queue_first = __get_cpu_var(bau_control).va_queue_first;
- va_queue_last = __get_cpu_var(bau_control).va_queue_last;
-
- msg = __get_cpu_var(bau_control).bau_msg_head;
- while (msg->sw_ack_vector) {
- count++;
- fw = msg->sw_ack_vector;
- msg_slot = msg - va_queue_first;
- sw_ack_slot = ffs(fw) - 1;
-
- uv_bau_process_message(msg, msg_slot, sw_ack_slot);
-
- msg++;
- if (msg > va_queue_last)
- msg = va_queue_first;
- __get_cpu_var(bau_control).bau_msg_head = msg;
- }
- if (!count)
- __get_cpu_var(ptcstats).nomsg++;
- else if (count > 1)
- __get_cpu_var(ptcstats).multmsg++;
-
- time2 = get_cycles();
- __get_cpu_var(ptcstats).dflush += (time2 - time1);
-
- irq_exit();
- set_irq_regs(old_regs);
-}
-
-/*
- * uv_enable_timeouts
- *
- * Each target blade (i.e. blades that have cpu's) needs to have
- * shootdown message timeouts enabled. The timeout does not cause
- * an interrupt, but causes an error message to be returned to
- * the sender.
- */
-static void uv_enable_timeouts(void)
-{
- int blade;
- int nblades;
- int pnode;
- unsigned long mmr_image;
-
- nblades = uv_num_possible_blades();
-
- for (blade = 0; blade < nblades; blade++) {
- if (!uv_blade_nr_possible_cpus(blade))
- continue;
-
- pnode = uv_blade_to_pnode(blade);
- mmr_image =
- uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL);
- /*
- * Set the timeout period and then lock it in, in three
- * steps; captures and locks in the period.
- *
- * To program the period, the SOFT_ACK_MODE must be off.
- */
- mmr_image &= ~((unsigned long)1 <<
- UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT);
- uv_write_global_mmr64
- (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
- /*
- * Set the 4-bit period.
- */
- mmr_image &= ~((unsigned long)0xf <<
- UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT);
- mmr_image |= (UV_INTD_SOFT_ACK_TIMEOUT_PERIOD <<
- UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT);
- uv_write_global_mmr64
- (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
- /*
- * Subsequent reversals of the timebase bit (3) cause an
- * immediate timeout of one or all INTD resources as
- * indicated in bits 2:0 (7 causes all of them to timeout).
- */
- mmr_image |= ((unsigned long)1 <<
- UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT);
- uv_write_global_mmr64
- (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
- }
-}
-
-static void *uv_ptc_seq_start(struct seq_file *file, loff_t *offset)
-{
- if (*offset < num_possible_cpus())
- return offset;
- return NULL;
-}
-
-static void *uv_ptc_seq_next(struct seq_file *file, void *data, loff_t *offset)
-{
- (*offset)++;
- if (*offset < num_possible_cpus())
- return offset;
- return NULL;
-}
-
-static void uv_ptc_seq_stop(struct seq_file *file, void *data)
-{
-}
-
-/*
- * Display the statistics thru /proc
- * data points to the cpu number
- */
-static int uv_ptc_seq_show(struct seq_file *file, void *data)
-{
- struct ptc_stats *stat;
- int cpu;
-
- cpu = *(loff_t *)data;
-
- if (!cpu) {
- seq_printf(file,
- "# cpu requestor requestee one all sretry dretry ptc_i ");
- seq_printf(file,
- "sw_ack sflush dflush sok dnomsg dmult starget\n");
- }
- if (cpu < num_possible_cpus() && cpu_online(cpu)) {
- stat = &per_cpu(ptcstats, cpu);
- seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld ",
- cpu, stat->requestor,
- stat->requestee, stat->onetlb, stat->alltlb,
- stat->s_retry, stat->d_retry, stat->ptc_i);
- seq_printf(file, "%lx %ld %ld %ld %ld %ld %ld\n",
- uv_read_global_mmr64(uv_cpu_to_pnode(cpu),
- UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE),
- stat->sflush, stat->dflush,
- stat->retriesok, stat->nomsg,
- stat->multmsg, stat->ntargeted);
- }
-
- return 0;
-}
-
-/*
- * 0: display meaning of the statistics
- * >0: retry limit
- */
-static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
- size_t count, loff_t *data)
-{
- long newmode;
- char optstr[64];
-
- if (count == 0 || count > sizeof(optstr))
- return -EINVAL;
- if (copy_from_user(optstr, user, count))
- return -EFAULT;
- optstr[count - 1] = '\0';
- if (strict_strtoul(optstr, 10, &newmode) < 0) {
- printk(KERN_DEBUG "%s is invalid\n", optstr);
- return -EINVAL;
- }
-
- if (newmode == 0) {
- printk(KERN_DEBUG "# cpu: cpu number\n");
- printk(KERN_DEBUG
- "requestor: times this cpu was the flush requestor\n");
- printk(KERN_DEBUG
- "requestee: times this cpu was requested to flush its TLBs\n");
- printk(KERN_DEBUG
- "one: times requested to flush a single address\n");
- printk(KERN_DEBUG
- "all: times requested to flush all TLB's\n");
- printk(KERN_DEBUG
- "sretry: number of retries of source-side timeouts\n");
- printk(KERN_DEBUG
- "dretry: number of retries of destination-side timeouts\n");
- printk(KERN_DEBUG
- "ptc_i: times UV fell through to IPI-style flushes\n");
- printk(KERN_DEBUG
- "sw_ack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n");
- printk(KERN_DEBUG
- "sflush_us: cycles spent in uv_flush_tlb_others()\n");
- printk(KERN_DEBUG
- "dflush_us: cycles spent in handling flush requests\n");
- printk(KERN_DEBUG "sok: successes on retry\n");
- printk(KERN_DEBUG "dnomsg: interrupts with no message\n");
- printk(KERN_DEBUG
- "dmult: interrupts with multiple messages\n");
- printk(KERN_DEBUG "starget: nodes targeted\n");
- } else {
- uv_bau_retry_limit = newmode;
- printk(KERN_DEBUG "timeout retry limit:%d\n",
- uv_bau_retry_limit);
- }
-
- return count;
-}
-
-static const struct seq_operations uv_ptc_seq_ops = {
- .start = uv_ptc_seq_start,
- .next = uv_ptc_seq_next,
- .stop = uv_ptc_seq_stop,
- .show = uv_ptc_seq_show
-};
-
-static int uv_ptc_proc_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &uv_ptc_seq_ops);
-}
-
-static const struct file_operations proc_uv_ptc_operations = {
- .open = uv_ptc_proc_open,
- .read = seq_read,
- .write = uv_ptc_proc_write,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
-static int __init uv_ptc_init(void)
-{
- struct proc_dir_entry *proc_uv_ptc;
-
- if (!is_uv_system())
- return 0;
-
- proc_uv_ptc = create_proc_entry(UV_PTC_BASENAME, 0444, NULL);
- if (!proc_uv_ptc) {
- printk(KERN_ERR "unable to create %s proc entry\n",
- UV_PTC_BASENAME);
- return -EINVAL;
- }
- proc_uv_ptc->proc_fops = &proc_uv_ptc_operations;
- return 0;
-}
-
-/*
- * begin the initialization of the per-blade control structures
- */
-static struct bau_control * __init uv_table_bases_init(int blade, int node)
-{
- int i;
- struct bau_msg_status *msp;
- struct bau_control *bau_tabp;
-
- bau_tabp =
- kmalloc_node(sizeof(struct bau_control), GFP_KERNEL, node);
- BUG_ON(!bau_tabp);
-
- bau_tabp->msg_statuses =
- kmalloc_node(sizeof(struct bau_msg_status) *
- DEST_Q_SIZE, GFP_KERNEL, node);
- BUG_ON(!bau_tabp->msg_statuses);
-
- for (i = 0, msp = bau_tabp->msg_statuses; i < DEST_Q_SIZE; i++, msp++)
- bau_cpubits_clear(&msp->seen_by, (int)
- uv_blade_nr_possible_cpus(blade));
-
- uv_bau_table_bases[blade] = bau_tabp;
-
- return bau_tabp;
-}
-
-/*
- * finish the initialization of the per-blade control structures
- */
-static void __init
-uv_table_bases_finish(int blade,
- struct bau_control *bau_tablesp,
- struct bau_desc *adp)
-{
- struct bau_control *bcp;
- int cpu;
-
- for_each_present_cpu(cpu) {
- if (blade != uv_cpu_to_blade_id(cpu))
- continue;
-
- bcp = (struct bau_control *)&per_cpu(bau_control, cpu);
- bcp->bau_msg_head = bau_tablesp->va_queue_first;
- bcp->va_queue_first = bau_tablesp->va_queue_first;
- bcp->va_queue_last = bau_tablesp->va_queue_last;
- bcp->msg_statuses = bau_tablesp->msg_statuses;
- bcp->descriptor_base = adp;
- }
-}
-
-/*
- * initialize the sending side's sending buffers
- */
-static struct bau_desc * __init
-uv_activation_descriptor_init(int node, int pnode)
-{
- int i;
- unsigned long pa;
- unsigned long m;
- unsigned long n;
- struct bau_desc *adp;
- struct bau_desc *ad2;
-
- /*
- * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR)
- * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per blade
- */
- adp = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)*
- UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
- BUG_ON(!adp);
-
- pa = uv_gpa(adp); /* need the real nasid*/
- n = pa >> uv_nshift;
- m = pa & uv_mmask;
-
- uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE,
- (n << UV_DESC_BASE_PNODE_SHIFT | m));
-
- /*
- * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
- * cpu even though we only use the first one; one descriptor can
- * describe a broadcast to 256 nodes.
- */
- for (i = 0, ad2 = adp; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR);
- i++, ad2++) {
- memset(ad2, 0, sizeof(struct bau_desc));
- ad2->header.sw_ack_flag = 1;
- /*
- * base_dest_nodeid is the first node in the partition, so
- * the bit map will indicate partition-relative node numbers.
- * note that base_dest_nodeid is actually a nasid.
- */
- ad2->header.base_dest_nodeid = uv_partition_base_pnode << 1;
- ad2->header.command = UV_NET_ENDPOINT_INTD;
- ad2->header.int_both = 1;
- /*
- * all others need to be set to zero:
- * fairness chaining multilevel count replied_to
- */
- }
- return adp;
-}
-
-/*
- * initialize the destination side's receiving buffers
- */
-static struct bau_payload_queue_entry * __init
-uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp)
-{
- struct bau_payload_queue_entry *pqp;
- unsigned long pa;
- int pn;
- char *cp;
-
- pqp = (struct bau_payload_queue_entry *) kmalloc_node(
- (DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry),
- GFP_KERNEL, node);
- BUG_ON(!pqp);
-
- cp = (char *)pqp + 31;
- pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5);
- bau_tablesp->va_queue_first = pqp;
- /*
- * need the pnode of where the memory was really allocated
- */
- pa = uv_gpa(pqp);
- pn = pa >> uv_nshift;
- uv_write_global_mmr64(pnode,
- UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST,
- ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) |
- uv_physnodeaddr(pqp));
- uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL,
- uv_physnodeaddr(pqp));
- bau_tablesp->va_queue_last = pqp + (DEST_Q_SIZE - 1);
- uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST,
- (unsigned long)
- uv_physnodeaddr(bau_tablesp->va_queue_last));
- memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE);
-
- return pqp;
-}
-
-/*
- * Initialization of each UV blade's structures
- */
-static int __init uv_init_blade(int blade)
-{
- int node;
- int pnode;
- unsigned long pa;
- unsigned long apicid;
- struct bau_desc *adp;
- struct bau_payload_queue_entry *pqp;
- struct bau_control *bau_tablesp;
-
- node = blade_to_first_node(blade);
- bau_tablesp = uv_table_bases_init(blade, node);
- pnode = uv_blade_to_pnode(blade);
- adp = uv_activation_descriptor_init(node, pnode);
- pqp = uv_payload_queue_init(node, pnode, bau_tablesp);
- uv_table_bases_finish(blade, bau_tablesp, adp);
- /*
- * the below initialization can't be in firmware because the
- * messaging IRQ will be determined by the OS
- */
- apicid = blade_to_first_apicid(blade);
- pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG);
- if ((pa & 0xff) != UV_BAU_MESSAGE) {
- uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
- ((apicid << 32) | UV_BAU_MESSAGE));
- }
- return 0;
-}
-
-/*
- * Initialization of BAU-related structures
- */
-static int __init uv_bau_init(void)
-{
- int blade;
- int nblades;
- int cur_cpu;
-
- if (!is_uv_system())
- return 0;
-
- for_each_possible_cpu(cur_cpu)
- zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
- GFP_KERNEL, cpu_to_node(cur_cpu));
-
- uv_bau_retry_limit = 1;
- uv_nshift = uv_hub_info->n_val;
- uv_mmask = (1UL << uv_hub_info->n_val) - 1;
- nblades = uv_num_possible_blades();
-
- uv_bau_table_bases = (struct bau_control **)
- kmalloc(nblades * sizeof(struct bau_control *), GFP_KERNEL);
- BUG_ON(!uv_bau_table_bases);
-
- uv_partition_base_pnode = 0x7fffffff;
- for (blade = 0; blade < nblades; blade++)
- if (uv_blade_nr_possible_cpus(blade) &&
- (uv_blade_to_pnode(blade) < uv_partition_base_pnode))
- uv_partition_base_pnode = uv_blade_to_pnode(blade);
- for (blade = 0; blade < nblades; blade++)
- if (uv_blade_nr_possible_cpus(blade))
- uv_init_blade(blade);
-
- alloc_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1);
- uv_enable_timeouts();
-
- return 0;
-}
-__initcall(uv_bau_init);
-__initcall(uv_ptc_init);
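The receive path deleted above follows a simple idiom: every CPU on the blade that takes the BAU interrupt sets its own bit in the per-message seen_by mask, bumps the acknowledge count, and whichever CPU brings the count up to number_of_cpus performs the software ack. A minimal, self-contained userspace sketch of that idiom using C11 atomics; NCPUS, msg_seen and msg_ack are illustrative names, not UV code:

/*
 * Sketch of the "every CPU marks the message as seen, the last
 * acknowledger replies" idiom. Userspace C11, not kernel code.
 */
#include <stdatomic.h>
#include <stdio.h>

#define NCPUS 4

static _Atomic unsigned long msg_seen;	/* bitmask of CPUs that saw the message */
static _Atomic int msg_ack;		/* number of CPUs that acknowledged it   */

/* Returns 1 if this caller was the last responder and must send the reply. */
static int process_message(int cpu)
{
	unsigned long bit = 1UL << cpu;

	/* Only the first visit by this CPU counts. */
	if (atomic_fetch_or(&msg_seen, bit) & bit)
		return 0;

	/* ...the per-CPU work (a TLB flush in the real code) goes here... */

	return atomic_fetch_add(&msg_ack, 1) + 1 == NCPUS;
}

int main(void)
{
	for (int cpu = 0; cpu < NCPUS; cpu++)
		if (process_message(cpu))
			printf("cpu %d sends the software ack\n", cpu);
	return 0;
}

The real uv_bau_process_message() additionally bails out early once replied_to is set, so CPUs that arrive after the reply do no further work.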
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index 6bb7b8579e70..3ffbab0081f4 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -1,16 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/user.h>
#include <linux/regset.h>
+#include <linux/syscalls.h>
+#include <linux/nospec.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/desc.h>
-#include <asm/system.h>
#include <asm/ldt.h>
#include <asm/processor.h>
#include <asm/proto.h>
-#include <asm/syscalls.h>
+#include <asm/gsseg.h>
#include "tls.h"
@@ -28,6 +30,58 @@ static int get_free_idx(void)
return -ESRCH;
}
+static bool tls_desc_okay(const struct user_desc *info)
+{
+ /*
+ * For historical reasons (i.e. no one ever documented how any
+ * of the segmentation APIs work), user programs can and do
+ * assume that a struct user_desc that's all zeros except for
+ * entry_number means "no segment at all". This never actually
+ * worked. In fact, up to Linux 3.19, a struct user_desc like
+ * this would create a 16-bit read-write segment with base and
+ * limit both equal to zero.
+ *
+ * That was close enough to "no segment at all" until we
+ * hardened this function to disallow 16-bit TLS segments. Fix
+ * it up by interpreting these zeroed segments the way that they
+ * were almost certainly intended to be interpreted.
+ *
+ * The correct way to ask for "no segment at all" is to specify
+ * a user_desc that satisfies LDT_empty. To keep everything
+ * working, we accept both.
+ *
+ * Note that there's a similar kludge in modify_ldt -- look at
+ * the distinction between modes 1 and 0x11.
+ */
+ if (LDT_empty(info) || LDT_zero(info))
+ return true;
+
+ /*
+ * espfix is required for 16-bit data segments, but espfix
+ * only works for LDT segments.
+ */
+ if (!info->seg_32bit)
+ return false;
+
+ /* Only allow data segments in the TLS array. */
+ if (info->contents > 1)
+ return false;
+
+ /*
+ * Non-present segments with DPL 3 present an interesting attack
+ * surface. The kernel should handle such segments correctly,
+ * but TLS is very difficult to protect in a sandbox, so prevent
+ * such segments from being created.
+ *
+ * If userspace needs to remove a TLS entry, it can still delete
+ * it outright.
+ */
+ if (info->seg_not_present)
+ return false;
+
+ return true;
+}
+
static void set_tls_desc(struct task_struct *p, int idx,
const struct user_desc *info, int n)
{
@@ -41,8 +95,8 @@ static void set_tls_desc(struct task_struct *p, int idx,
cpu = get_cpu();
while (n-- > 0) {
- if (LDT_empty(info))
- desc->a = desc->b = 0;
+ if (LDT_empty(info) || LDT_zero(info))
+ memset(desc, 0, sizeof(*desc));
else
fill_ldt(desc, info);
++info;
@@ -63,10 +117,14 @@ int do_set_thread_area(struct task_struct *p, int idx,
int can_allocate)
{
struct user_desc info;
+ unsigned short __maybe_unused sel, modified_sel;
if (copy_from_user(&info, u_info, sizeof(info)))
return -EFAULT;
+ if (!tls_desc_okay(&info))
+ return -EINVAL;
+
if (idx == -1)
idx = info.entry_number;
@@ -87,14 +145,47 @@ int do_set_thread_area(struct task_struct *p, int idx,
set_tls_desc(p, idx, &info, 1);
+ /*
+ * If DS, ES, FS, or GS points to the modified segment, forcibly
+ * refresh it. Only needed on x86_64 because x86_32 reloads them
+ * on return to user mode.
+ */
+ modified_sel = (idx << 3) | 3;
+
+ if (p == current) {
+#ifdef CONFIG_X86_64
+ savesegment(ds, sel);
+ if (sel == modified_sel)
+ loadsegment(ds, sel);
+
+ savesegment(es, sel);
+ if (sel == modified_sel)
+ loadsegment(es, sel);
+
+ savesegment(fs, sel);
+ if (sel == modified_sel)
+ loadsegment(fs, sel);
+#endif
+
+ savesegment(gs, sel);
+ if (sel == modified_sel)
+ load_gs_index(sel);
+ } else {
+#ifdef CONFIG_X86_64
+ if (p->thread.fsindex == modified_sel)
+ p->thread.fsbase = info.base_addr;
+
+ if (p->thread.gsindex == modified_sel)
+ p->thread.gsbase = info.base_addr;
+#endif
+ }
+
return 0;
}
-asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
+SYSCALL_DEFINE1(set_thread_area, struct user_desc __user *, u_info)
{
- int ret = do_set_thread_area(current, -1, u_info, 1);
- asmlinkage_protect(1, ret, u_info);
- return ret;
+ return do_set_thread_area(current, -1, u_info, 1);
}
@@ -125,6 +216,7 @@ int do_get_thread_area(struct task_struct *p, int idx,
struct user_desc __user *u_info)
{
struct user_desc info;
+ int index;
if (idx == -1 && get_user(idx, &u_info->entry_number))
return -EFAULT;
@@ -132,19 +224,20 @@ int do_get_thread_area(struct task_struct *p, int idx,
if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
return -EINVAL;
- fill_user_desc(&info, idx,
- &p->thread.tls_array[idx - GDT_ENTRY_TLS_MIN]);
+ index = idx - GDT_ENTRY_TLS_MIN;
+ index = array_index_nospec(index,
+ GDT_ENTRY_TLS_MAX - GDT_ENTRY_TLS_MIN + 1);
+
+ fill_user_desc(&info, idx, &p->thread.tls_array[index]);
if (copy_to_user(u_info, &info, sizeof(info)))
return -EFAULT;
return 0;
}
-asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
+SYSCALL_DEFINE1(get_thread_area, struct user_desc __user *, u_info)
{
- int ret = do_get_thread_area(current, -1, u_info);
- asmlinkage_protect(1, ret, u_info);
- return ret;
+ return do_get_thread_area(current, -1, u_info);
}
int regset_tls_active(struct task_struct *target,
@@ -158,36 +251,16 @@ int regset_tls_active(struct task_struct *target,
}
int regset_tls_get(struct task_struct *target, const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
const struct desc_struct *tls;
+ struct user_desc v;
+ int pos;
- if (pos > GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
- (pos % sizeof(struct user_desc)) != 0 ||
- (count % sizeof(struct user_desc)) != 0)
- return -EINVAL;
-
- pos /= sizeof(struct user_desc);
- count /= sizeof(struct user_desc);
-
- tls = &target->thread.tls_array[pos];
-
- if (kbuf) {
- struct user_desc *info = kbuf;
- while (count-- > 0)
- fill_user_desc(info++, GDT_ENTRY_TLS_MIN + pos++,
- tls++);
- } else {
- struct user_desc __user *u_info = ubuf;
- while (count-- > 0) {
- struct user_desc info;
- fill_user_desc(&info, GDT_ENTRY_TLS_MIN + pos++, tls++);
- if (__copy_to_user(u_info++, &info, sizeof(info)))
- return -EFAULT;
- }
+ for (pos = 0, tls = target->thread.tls_array; to.left; pos++, tls++) {
+ fill_user_desc(&v, GDT_ENTRY_TLS_MIN + pos, tls);
+ membuf_write(&to, &v, sizeof(v));
}
-
return 0;
}
@@ -197,8 +270,9 @@ int regset_tls_set(struct task_struct *target, const struct user_regset *regset,
{
struct user_desc infobuf[GDT_ENTRY_TLS_ENTRIES];
const struct user_desc *info;
+ int i;
- if (pos > GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
+ if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
(pos % sizeof(struct user_desc)) != 0 ||
(count % sizeof(struct user_desc)) != 0)
return -EINVAL;
@@ -210,6 +284,10 @@ int regset_tls_set(struct task_struct *target, const struct user_regset *regset,
else
info = infobuf;
+ for (i = 0; i < count / sizeof(struct user_desc); i++)
+ if (!tls_desc_okay(info + i))
+ return -EINVAL;
+
set_tls_desc(target,
GDT_ENTRY_TLS_MIN + (pos / sizeof(struct user_desc)),
info, count / sizeof(struct user_desc));
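For context on the interface being hardened here: userspace reaches these GDT TLS slots through the set_thread_area()/get_thread_area() syscalls with a struct user_desc from <asm/ldt.h>. A hedged sketch of the usual calling pattern, written for a 32-bit style x86 build where SYS_set_thread_area is available; entry_number = -1 asks the kernel to allocate a free slot, and a descriptor that is all zero except entry_number is the "clear this slot" form that tls_desc_okay() explicitly accepts:

/* Userspace sketch, not kernel code. */
#include <asm/ldt.h>		/* struct user_desc */
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>

static char tls_block[4096];

int main(void)
{
	struct user_desc desc;
	unsigned int slot;

	memset(&desc, 0, sizeof(desc));
	desc.entry_number = -1;		/* let the kernel pick a free GDT TLS slot */
	desc.base_addr = (unsigned int)(unsigned long)tls_block;
	desc.limit = sizeof(tls_block) - 1;
	desc.seg_32bit = 1;		/* 16-bit segments are rejected by tls_desc_okay() */
	desc.useable = 1;

	if (syscall(SYS_set_thread_area, &desc) != 0) {
		perror("set_thread_area");
		return 1;
	}
	printf("TLS descriptor installed in entry %u\n", desc.entry_number);

	/* All zero except entry_number: interpreted as "no segment at all". */
	slot = desc.entry_number;
	memset(&desc, 0, sizeof(desc));
	desc.entry_number = slot;
	return syscall(SYS_set_thread_area, &desc) != 0;
}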
diff --git a/arch/x86/kernel/tls.h b/arch/x86/kernel/tls.h
index 2f083a2fe216..fc39447a0c1a 100644
--- a/arch/x86/kernel/tls.h
+++ b/arch/x86/kernel/tls.h
@@ -1,12 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Internal declarations for x86 TLS implementation functions.
*
* Copyright (C) 2007 Red Hat, Inc. All rights reserved.
*
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License v.2.
- *
* Red Hat Author: Roland McGrath.
*/
@@ -15,7 +12,7 @@
#include <linux/regset.h>
extern user_regset_active_fn regset_tls_active;
-extern user_regset_get_fn regset_tls_get;
+extern user_regset_get2_fn regset_tls_get;
extern user_regset_set_fn regset_tls_set;
#endif /* _ARCH_X86_KERNEL_TLS_H */
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
deleted file mode 100644
index 7e4515957a1c..000000000000
--- a/arch/x86/kernel/topology.c
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Populate sysfs with topology information
- *
- * Written by: Matthew Dobson, IBM Corporation
- * Original Code: Paul Dorwin, IBM Corporation, Patrick Mochel, OSDL
- *
- * Copyright (C) 2002, IBM Corp.
- *
- * All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to <colpatch@us.ibm.com>
- */
-#include <linux/nodemask.h>
-#include <linux/mmzone.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <asm/cpu.h>
-
-static DEFINE_PER_CPU(struct x86_cpu, cpu_devices);
-
-#ifdef CONFIG_HOTPLUG_CPU
-int __ref arch_register_cpu(int num)
-{
- /*
- * CPU0 cannot be offlined due to several
- * restrictions and assumptions in kernel. This basically
- * doesn't add a control file, so one cannot attempt to offline
- * the BSP.
- *
- * Also certain PCI quirks require not to enable hotplug control
- * for all CPU's.
- */
- if (num)
- per_cpu(cpu_devices, num).cpu.hotpluggable = 1;
-
- return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
-}
-EXPORT_SYMBOL(arch_register_cpu);
-
-void arch_unregister_cpu(int num)
-{
- unregister_cpu(&per_cpu(cpu_devices, num).cpu);
-}
-EXPORT_SYMBOL(arch_unregister_cpu);
-#else /* CONFIG_HOTPLUG_CPU */
-
-static int __init arch_register_cpu(int num)
-{
- return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
-}
-#endif /* CONFIG_HOTPLUG_CPU */
-
-static int __init topology_init(void)
-{
- int i;
-
-#ifdef CONFIG_NUMA
- for_each_online_node(i)
- register_one_node(i);
-#endif
-
- for_each_present_cpu(i)
- arch_register_cpu(i);
-
- return 0;
-}
-subsys_initcall(topology_init);
diff --git a/arch/x86/kernel/trace.c b/arch/x86/kernel/trace.c
new file mode 100644
index 000000000000..8322e8352777
--- /dev/null
+++ b/arch/x86/kernel/trace.c
@@ -0,0 +1,234 @@
+#include <asm/trace/irq_vectors.h>
+#include <linux/trace.h>
+
+#if defined(CONFIG_OSNOISE_TRACER) && defined(CONFIG_X86_LOCAL_APIC)
+/*
+ * trace_intel_irq_entry - record intel specific IRQ entry
+ */
+static void trace_intel_irq_entry(void *data, int vector)
+{
+ osnoise_trace_irq_entry(vector);
+}
+
+/*
+ * trace_intel_irq_exit - record intel specific IRQ exit
+ */
+static void trace_intel_irq_exit(void *data, int vector)
+{
+ char *vector_desc = (char *) data;
+
+ osnoise_trace_irq_exit(vector, vector_desc);
+}
+
+/*
+ * register_intel_irq_tp - Register intel specific IRQ entry tracepoints
+ */
+int osnoise_arch_register(void)
+{
+ int ret;
+
+ ret = register_trace_local_timer_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_err;
+
+ ret = register_trace_local_timer_exit(trace_intel_irq_exit, "local_timer");
+ if (ret)
+ goto out_timer_entry;
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+ ret = register_trace_thermal_apic_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_timer_exit;
+
+ ret = register_trace_thermal_apic_exit(trace_intel_irq_exit, "thermal_apic");
+ if (ret)
+ goto out_thermal_entry;
+#endif /* CONFIG_X86_THERMAL_VECTOR */
+
+#ifdef CONFIG_X86_MCE_AMD
+ ret = register_trace_deferred_error_apic_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_thermal_exit;
+
+ ret = register_trace_deferred_error_apic_exit(trace_intel_irq_exit, "deferred_error");
+ if (ret)
+ goto out_deferred_entry;
+#endif
+
+#ifdef CONFIG_X86_MCE_THRESHOLD
+ ret = register_trace_threshold_apic_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_deferred_exit;
+
+ ret = register_trace_threshold_apic_exit(trace_intel_irq_exit, "threshold_apic");
+ if (ret)
+ goto out_threshold_entry;
+#endif /* CONFIG_X86_MCE_THRESHOLD */
+
+#ifdef CONFIG_SMP
+ ret = register_trace_call_function_single_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_threshold_exit;
+
+ ret = register_trace_call_function_single_exit(trace_intel_irq_exit,
+ "call_function_single");
+ if (ret)
+ goto out_call_function_single_entry;
+
+ ret = register_trace_call_function_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_call_function_single_exit;
+
+ ret = register_trace_call_function_exit(trace_intel_irq_exit, "call_function");
+ if (ret)
+ goto out_call_function_entry;
+
+ ret = register_trace_reschedule_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_call_function_exit;
+
+ ret = register_trace_reschedule_exit(trace_intel_irq_exit, "reschedule");
+ if (ret)
+ goto out_reschedule_entry;
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_IRQ_WORK
+ ret = register_trace_irq_work_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_reschedule_exit;
+
+ ret = register_trace_irq_work_exit(trace_intel_irq_exit, "irq_work");
+ if (ret)
+ goto out_irq_work_entry;
+#endif
+
+ ret = register_trace_x86_platform_ipi_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_irq_work_exit;
+
+ ret = register_trace_x86_platform_ipi_exit(trace_intel_irq_exit, "x86_platform_ipi");
+ if (ret)
+ goto out_x86_ipi_entry;
+
+ ret = register_trace_error_apic_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_x86_ipi_exit;
+
+ ret = register_trace_error_apic_exit(trace_intel_irq_exit, "error_apic");
+ if (ret)
+ goto out_error_apic_entry;
+
+ ret = register_trace_spurious_apic_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_error_apic_exit;
+
+ ret = register_trace_spurious_apic_exit(trace_intel_irq_exit, "spurious_apic");
+ if (ret)
+ goto out_spurious_apic_entry;
+
+ return 0;
+
+out_spurious_apic_entry:
+ unregister_trace_spurious_apic_entry(trace_intel_irq_entry, NULL);
+out_error_apic_exit:
+ unregister_trace_error_apic_exit(trace_intel_irq_exit, "error_apic");
+out_error_apic_entry:
+ unregister_trace_error_apic_entry(trace_intel_irq_entry, NULL);
+out_x86_ipi_exit:
+ unregister_trace_x86_platform_ipi_exit(trace_intel_irq_exit, "x86_platform_ipi");
+out_x86_ipi_entry:
+ unregister_trace_x86_platform_ipi_entry(trace_intel_irq_entry, NULL);
+out_irq_work_exit:
+
+#ifdef CONFIG_IRQ_WORK
+ unregister_trace_irq_work_exit(trace_intel_irq_exit, "irq_work");
+out_irq_work_entry:
+ unregister_trace_irq_work_entry(trace_intel_irq_entry, NULL);
+out_reschedule_exit:
+#endif
+
+#ifdef CONFIG_SMP
+ unregister_trace_reschedule_exit(trace_intel_irq_exit, "reschedule");
+out_reschedule_entry:
+ unregister_trace_reschedule_entry(trace_intel_irq_entry, NULL);
+out_call_function_exit:
+ unregister_trace_call_function_exit(trace_intel_irq_exit, "call_function");
+out_call_function_entry:
+ unregister_trace_call_function_entry(trace_intel_irq_entry, NULL);
+out_call_function_single_exit:
+ unregister_trace_call_function_single_exit(trace_intel_irq_exit, "call_function_single");
+out_call_function_single_entry:
+ unregister_trace_call_function_single_entry(trace_intel_irq_entry, NULL);
+out_threshold_exit:
+#endif
+
+#ifdef CONFIG_X86_MCE_THRESHOLD
+ unregister_trace_threshold_apic_exit(trace_intel_irq_exit, "threshold_apic");
+out_threshold_entry:
+ unregister_trace_threshold_apic_entry(trace_intel_irq_entry, NULL);
+out_deferred_exit:
+#endif
+
+#ifdef CONFIG_X86_MCE_AMD
+ unregister_trace_deferred_error_apic_exit(trace_intel_irq_exit, "deferred_error");
+out_deferred_entry:
+ unregister_trace_deferred_error_apic_entry(trace_intel_irq_entry, NULL);
+out_thermal_exit:
+#endif /* CONFIG_X86_MCE_AMD */
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+ unregister_trace_thermal_apic_exit(trace_intel_irq_exit, "thermal_apic");
+out_thermal_entry:
+ unregister_trace_thermal_apic_entry(trace_intel_irq_entry, NULL);
+out_timer_exit:
+#endif /* CONFIG_X86_THERMAL_VECTOR */
+
+ unregister_trace_local_timer_exit(trace_intel_irq_exit, "local_timer");
+out_timer_entry:
+ unregister_trace_local_timer_entry(trace_intel_irq_entry, NULL);
+out_err:
+ return -EINVAL;
+}
+
+void osnoise_arch_unregister(void)
+{
+ unregister_trace_spurious_apic_exit(trace_intel_irq_exit, "spurious_apic");
+ unregister_trace_spurious_apic_entry(trace_intel_irq_entry, NULL);
+ unregister_trace_error_apic_exit(trace_intel_irq_exit, "error_apic");
+ unregister_trace_error_apic_entry(trace_intel_irq_entry, NULL);
+ unregister_trace_x86_platform_ipi_exit(trace_intel_irq_exit, "x86_platform_ipi");
+ unregister_trace_x86_platform_ipi_entry(trace_intel_irq_entry, NULL);
+
+#ifdef CONFIG_IRQ_WORK
+ unregister_trace_irq_work_exit(trace_intel_irq_exit, "irq_work");
+ unregister_trace_irq_work_entry(trace_intel_irq_entry, NULL);
+#endif
+
+#ifdef CONFIG_SMP
+ unregister_trace_reschedule_exit(trace_intel_irq_exit, "reschedule");
+ unregister_trace_reschedule_entry(trace_intel_irq_entry, NULL);
+ unregister_trace_call_function_exit(trace_intel_irq_exit, "call_function");
+ unregister_trace_call_function_entry(trace_intel_irq_entry, NULL);
+ unregister_trace_call_function_single_exit(trace_intel_irq_exit, "call_function_single");
+ unregister_trace_call_function_single_entry(trace_intel_irq_entry, NULL);
+#endif
+
+#ifdef CONFIG_X86_MCE_THRESHOLD
+ unregister_trace_threshold_apic_exit(trace_intel_irq_exit, "threshold_apic");
+ unregister_trace_threshold_apic_entry(trace_intel_irq_entry, NULL);
+#endif
+
+#ifdef CONFIG_X86_MCE_AMD
+ unregister_trace_deferred_error_apic_exit(trace_intel_irq_exit, "deferred_error");
+ unregister_trace_deferred_error_apic_entry(trace_intel_irq_entry, NULL);
+#endif
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+ unregister_trace_thermal_apic_exit(trace_intel_irq_exit, "thermal_apic");
+ unregister_trace_thermal_apic_entry(trace_intel_irq_entry, NULL);
+#endif /* CONFIG_X86_THERMAL_VECTOR */
+
+ unregister_trace_local_timer_exit(trace_intel_irq_exit, "local_timer");
+ unregister_trace_local_timer_entry(trace_intel_irq_entry, NULL);
+}
+#endif /* CONFIG_OSNOISE_TRACER && CONFIG_X86_LOCAL_APIC */
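osnoise_arch_register() above registers a long chain of tracepoints and, on any failure, unwinds exactly the registrations that already succeeded by falling through a ladder of goto labels in reverse order. A self-contained sketch of that unwind idiom; register_a/register_b/register_c and their unregister counterparts are hypothetical stand-ins, not kernel APIs:

#include <stdio.h>

/* Hypothetical resources standing in for the individual tracepoints. */
static int register_a(void) { return 0; }
static int register_b(void) { return 0; }
static int register_c(void) { return -1; }	/* simulate a failure */
static void unregister_a(void) { puts("undo a"); }
static void unregister_b(void) { puts("undo b"); }

static int register_all(void)
{
	int ret;

	ret = register_a();
	if (ret)
		goto out_err;

	ret = register_b();
	if (ret)
		goto out_a;

	ret = register_c();
	if (ret)
		goto out_b;

	return 0;			/* everything registered */

	/* Unwind in strict reverse order of registration. */
out_b:
	unregister_b();
out_a:
	unregister_a();
out_err:
	return ret;
}

int main(void)
{
	return register_all() ? 1 : 0;
}

Each label undoes only what is already known to have succeeded, which is why the labels in the kernel function sit inside the same #ifdef blocks as the registrations they revert.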
diff --git a/arch/x86/kernel/trace_clock.c b/arch/x86/kernel/trace_clock.c
new file mode 100644
index 000000000000..708d61743d15
--- /dev/null
+++ b/arch/x86/kernel/trace_clock.c
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * X86 trace clocks
+ */
+#include <asm/trace_clock.h>
+#include <asm/barrier.h>
+#include <asm/tsc.h>
+
+/*
+ * trace_clock_x86_tsc(): A clock that is just the cycle counter.
+ *
+ * Unlike the other clocks, this is not in nanoseconds.
+ */
+u64 notrace trace_clock_x86_tsc(void)
+{
+ return rdtsc_ordered();
+}
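trace_clock_x86_tsc() deliberately returns raw, serialized cycle counts rather than nanoseconds. A rough userspace analogue, assuming GCC or Clang on x86 where the __rdtsc() and _mm_lfence() intrinsics are available (the kernel's rdtsc_ordered() provides stronger ordering guarantees than this sketch):

/* Userspace sketch, not kernel code. */
#include <x86intrin.h>
#include <stdint.h>
#include <stdio.h>

static inline uint64_t tsc_ordered(void)
{
	_mm_lfence();		/* order the TSC read against earlier instructions */
	return __rdtsc();
}

int main(void)
{
	uint64_t t0 = tsc_ordered();
	uint64_t t1 = tsc_ordered();

	printf("delta: %llu cycles\n", (unsigned long long)(t1 - t0));
	return 0;
}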
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
deleted file mode 100644
index 808031a5ba19..000000000000
--- a/arch/x86/kernel/trampoline.c
+++ /dev/null
@@ -1,33 +0,0 @@
-#include <linux/io.h>
-
-#include <asm/trampoline.h>
-#include <asm/e820.h>
-
-/* ready for x86_64 and x86 */
-unsigned char *trampoline_base = __va(TRAMPOLINE_BASE);
-
-void __init reserve_trampoline_memory(void)
-{
-#ifdef CONFIG_X86_32
- /*
- * But first pinch a few for the stack/trampoline stuff
- * FIXME: Don't need the extra page at 4K, but need to fix
- * trampoline before removing it. (see the GDT stuff)
- */
- reserve_early(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE");
-#endif
- /* Has to be in very low memory so we can execute real-mode AP code. */
- reserve_early(TRAMPOLINE_BASE, TRAMPOLINE_BASE + TRAMPOLINE_SIZE,
- "TRAMPOLINE");
-}
-
-/*
- * Currently trivial. Write the real->protected mode
- * bootstrap into the page concerned. The caller
- * has made sure it's suitably aligned.
- */
-unsigned long setup_trampoline(void)
-{
- memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE);
- return virt_to_phys(trampoline_base);
-}
diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S
deleted file mode 100644
index 66d874e5404c..000000000000
--- a/arch/x86/kernel/trampoline_32.S
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- *
- * Trampoline.S Derived from Setup.S by Linus Torvalds
- *
- * 4 Jan 1997 Michael Chastain: changed to gnu as.
- *
- * This is only used for booting secondary CPUs in SMP machine
- *
- * Entry: CS:IP point to the start of our code, we are
- * in real mode with no stack, but the rest of the
- * trampoline page to make our stack and everything else
- * is a mystery.
- *
- * We jump into arch/x86/kernel/head_32.S.
- *
- * On entry to trampoline_data, the processor is in real mode
- * with 16-bit addressing and 16-bit data. CS has some value
- * and IP is zero. Thus, data addresses need to be absolute
- * (no relocation) and are taken with regard to r_base.
- *
- * If you work on this file, check the object module with
- * objdump --reloc to make sure there are no relocation
- * entries except for:
- *
- * TYPE VALUE
- * R_386_32 startup_32_smp
- * R_386_32 boot_gdt
- */
-
-#include <linux/linkage.h>
-#include <asm/segment.h>
-#include <asm/page_types.h>
-
-/* We can free up trampoline after bootup if cpu hotplug is not supported. */
-#ifndef CONFIG_HOTPLUG_CPU
-.section ".cpuinit.data","aw",@progbits
-#else
-.section .rodata,"a",@progbits
-#endif
-
-.code16
-
-ENTRY(trampoline_data)
-r_base = .
- wbinvd # Needed for NUMA-Q should be harmless for others
- mov %cs, %ax # Code and data in the same place
- mov %ax, %ds
-
- cli # We should be safe anyway
-
- movl $0xA5A5A5A5, trampoline_data - r_base
- # write marker for master knows we're running
-
- /* GDT tables in non default location kernel can be beyond 16MB and
- * lgdt will not be able to load the address as in real mode default
- * operand size is 16bit. Use lgdtl instead to force operand size
- * to 32 bit.
- */
-
- lidtl boot_idt_descr - r_base # load idt with 0, 0
- lgdtl boot_gdt_descr - r_base # load gdt with whatever is appropriate
-
- xor %ax, %ax
- inc %ax # protected mode (PE) bit
- lmsw %ax # into protected mode
- # flush prefetch and jump to startup_32_smp in arch/i386/kernel/head.S
- ljmpl $__BOOT_CS, $(startup_32_smp-__PAGE_OFFSET)
-
- # These need to be in the same 64K segment as the above;
- # hence we don't use the boot_gdt_descr defined in head.S
-boot_gdt_descr:
- .word __BOOT_DS + 7 # gdt limit
- .long boot_gdt - __PAGE_OFFSET # gdt base
-
-boot_idt_descr:
- .word 0 # idt limit = 0
- .long 0 # idt base = 0L
-
-.globl trampoline_end
-trampoline_end:
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S
deleted file mode 100644
index cddfb8d386b9..000000000000
--- a/arch/x86/kernel/trampoline_64.S
+++ /dev/null
@@ -1,162 +0,0 @@
-/*
- *
- * Trampoline.S Derived from Setup.S by Linus Torvalds
- *
- * 4 Jan 1997 Michael Chastain: changed to gnu as.
- * 15 Sept 2005 Eric Biederman: 64bit PIC support
- *
- * Entry: CS:IP point to the start of our code, we are
- * in real mode with no stack, but the rest of the
- * trampoline page to make our stack and everything else
- * is a mystery.
- *
- * On entry to trampoline_data, the processor is in real mode
- * with 16-bit addressing and 16-bit data. CS has some value
- * and IP is zero. Thus, data addresses need to be absolute
- * (no relocation) and are taken with regard to r_base.
- *
- * With the addition of trampoline_level4_pgt this code can
- * now enter a 64bit kernel that lives at arbitrary 64bit
- * physical addresses.
- *
- * If you work on this file, check the object module with objdump
- * --full-contents --reloc to make sure there are no relocation
- * entries.
- */
-
-#include <linux/linkage.h>
-#include <asm/pgtable_types.h>
-#include <asm/page_types.h>
-#include <asm/msr.h>
-#include <asm/segment.h>
-#include <asm/processor-flags.h>
-
-.section .rodata, "a", @progbits
-
-.code16
-
-ENTRY(trampoline_data)
-r_base = .
- cli # We should be safe anyway
- wbinvd
- mov %cs, %ax # Code and data in the same place
- mov %ax, %ds
- mov %ax, %es
- mov %ax, %ss
-
-
- movl $0xA5A5A5A5, trampoline_data - r_base
- # write marker for master knows we're running
-
- # Setup stack
- movw $(trampoline_stack_end - r_base), %sp
-
- call verify_cpu # Verify the cpu supports long mode
- testl %eax, %eax # Check for return code
- jnz no_longmode
-
- mov %cs, %ax
- movzx %ax, %esi # Find the 32bit trampoline location
- shll $4, %esi
-
- # Fixup the vectors
- addl %esi, startup_32_vector - r_base
- addl %esi, startup_64_vector - r_base
- addl %esi, tgdt + 2 - r_base # Fixup the gdt pointer
-
- /*
- * GDT tables in non default location kernel can be beyond 16MB and
- * lgdt will not be able to load the address as in real mode default
- * operand size is 16bit. Use lgdtl instead to force operand size
- * to 32 bit.
- */
-
- lidtl tidt - r_base # load idt with 0, 0
- lgdtl tgdt - r_base # load gdt with whatever is appropriate
-
- mov $X86_CR0_PE, %ax # protected mode (PE) bit
- lmsw %ax # into protected mode
-
- # flush prefetch and jump to startup_32
- ljmpl *(startup_32_vector - r_base)
-
- .code32
- .balign 4
-startup_32:
- movl $__KERNEL_DS, %eax # Initialize the %ds segment register
- movl %eax, %ds
-
- movl $X86_CR4_PAE, %eax
- movl %eax, %cr4 # Enable PAE mode
-
- # Setup trampoline 4 level pagetables
- leal (trampoline_level4_pgt - r_base)(%esi), %eax
- movl %eax, %cr3
-
- movl $MSR_EFER, %ecx
- movl $(1 << _EFER_LME), %eax # Enable Long Mode
- xorl %edx, %edx
- wrmsr
-
- # Enable paging and in turn activate Long Mode
- # Enable protected mode
- movl $(X86_CR0_PG | X86_CR0_PE), %eax
- movl %eax, %cr0
-
- /*
- * At this point we're in long mode but in 32bit compatibility mode
- * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn
- * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use
- * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
- */
- ljmp *(startup_64_vector - r_base)(%esi)
-
- .code64
- .balign 4
-startup_64:
- # Now jump into the kernel using virtual addresses
- movq $secondary_startup_64, %rax
- jmp *%rax
-
- .code16
-no_longmode:
- hlt
- jmp no_longmode
-#include "verify_cpu_64.S"
-
- # Careful these need to be in the same 64K segment as the above;
-tidt:
- .word 0 # idt limit = 0
- .word 0, 0 # idt base = 0L
-
- # Duplicate the global descriptor table
- # so the kernel can live anywhere
- .balign 4
-tgdt:
- .short tgdt_end - tgdt # gdt limit
- .long tgdt - r_base
- .short 0
- .quad 0x00cf9b000000ffff # __KERNEL32_CS
- .quad 0x00af9b000000ffff # __KERNEL_CS
- .quad 0x00cf93000000ffff # __KERNEL_DS
-tgdt_end:
-
- .balign 4
-startup_32_vector:
- .long startup_32 - r_base
- .word __KERNEL32_CS, 0
-
- .balign 4
-startup_64_vector:
- .long startup_64 - r_base
- .word __KERNEL_CS, 0
-
-trampoline_stack:
- .org 0x1000
-trampoline_stack_end:
-ENTRY(trampoline_level4_pgt)
- .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .fill 510,8,0
- .quad level3_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
-
-ENTRY(trampoline_end)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 5204332f475d..bcf1dedc1d00 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -9,502 +9,1168 @@
/*
* Handle hardware traps and faults.
*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/context_tracking.h>
#include <linux/interrupt.h>
#include <linux/kallsyms.h>
+#include <linux/kmsan.h>
#include <linux/spinlock.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
-#include <linux/utsname.h>
#include <linux/kdebug.h>
+#include <linux/kgdb.h>
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/ptrace.h>
+#include <linux/uprobes.h>
#include <linux/string.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/kexec.h>
#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/static_call.h>
#include <linux/timer.h>
#include <linux/init.h>
#include <linux/bug.h>
#include <linux/nmi.h>
#include <linux/mm.h>
#include <linux/smp.h>
+#include <linux/cpu.h>
#include <linux/io.h>
+#include <linux/hardirq.h>
+#include <linux/atomic.h>
+#include <linux/iommu.h>
+#include <linux/ubsan.h>
-#ifdef CONFIG_EISA
-#include <linux/ioport.h>
-#include <linux/eisa.h>
-#endif
-
-#ifdef CONFIG_MCA
-#include <linux/mca.h>
-#endif
-
-#if defined(CONFIG_EDAC)
-#include <linux/edac.h>
-#endif
-
-#include <asm/kmemcheck.h>
#include <asm/stacktrace.h>
#include <asm/processor.h>
#include <asm/debugreg.h>
-#include <asm/atomic.h>
-#include <asm/system.h>
+#include <asm/realmode.h>
+#include <asm/text-patching.h>
+#include <asm/ftrace.h>
#include <asm/traps.h>
#include <asm/desc.h>
-#include <asm/i387.h>
+#include <asm/fred.h>
+#include <asm/fpu/api.h>
+#include <asm/cpu.h>
+#include <asm/cpu_entry_area.h>
#include <asm/mce.h>
-
+#include <asm/fixmap.h>
#include <asm/mach_traps.h>
+#include <asm/alternative.h>
+#include <asm/fpu/xstate.h>
+#include <asm/vm86.h>
+#include <asm/umip.h>
+#include <asm/insn.h>
+#include <asm/insn-eval.h>
+#include <asm/vdso.h>
+#include <asm/tdx.h>
+#include <asm/cfi.h>
+#include <asm/msr.h>
#ifdef CONFIG_X86_64
-#include <asm/pgalloc.h>
-#include <asm/proto.h>
+#include <asm/x86_init.h>
#else
#include <asm/processor-flags.h>
#include <asm/setup.h>
-#include <asm/traps.h>
+#endif
+
+#include <asm/proto.h>
-asmlinkage int system_call(void);
+DECLARE_BITMAP(system_vectors, NR_VECTORS);
-/* Do we ignore FPU interrupts ? */
-char ignore_fpu_irq;
+__always_inline int is_valid_bugaddr(unsigned long addr)
+{
+ if (addr < TASK_SIZE_MAX)
+ return 0;
+
+ /*
+ * We got #UD, if the text isn't readable we'd have gotten
+ * a different exception.
+ */
+ return *(unsigned short *)addr == INSN_UD2;
+}
/*
- * The IDT has to be page-aligned to simplify the Pentium
- * F0 0F bug workaround.. We have a special link segment
- * for this.
+ * Check for UD1 or UD2, accounting for Address Size Override Prefixes.
+ * If it's a UD1, further decode to determine its use:
+ *
+ * FineIBT: d6 udb
+ * FineIBT: f0 75 f9 lock jne . - 6
+ * UBSan{0}: 67 0f b9 00 ud1 (%eax),%eax
+ * UBSan{10}: 67 0f b9 40 10 ud1 0x10(%eax),%eax
+ * static_call: 0f b9 cc ud1 %esp,%ecx
+ * __WARN_trap: 67 48 0f b9 3a ud1 (%edx),%reg
+ *
+ * Notable, since __WARN_trap can use all registers, the distinction between
+ * UD1 users is through R/M.
*/
-gate_desc idt_table[256]
- __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
-#endif
+__always_inline int decode_bug(unsigned long addr, s32 *imm, int *len)
+{
+ unsigned long start = addr;
+ u8 v, reg, rm, rex = 0;
+ int type = BUG_UD1;
+ bool lock = false;
+
+ if (addr < TASK_SIZE_MAX)
+ return BUG_NONE;
+
+ for (;;) {
+ v = *(u8 *)(addr++);
+ if (v == INSN_ASOP)
+ continue;
+
+ if (v == INSN_LOCK) {
+ lock = true;
+ continue;
+ }
-DECLARE_BITMAP(used_vectors, NR_VECTORS);
-EXPORT_SYMBOL_GPL(used_vectors);
+ if ((v & 0xf0) == 0x40) {
+ rex = v;
+ continue;
+ }
-static int ignore_nmis;
+ break;
+ }
-static inline void conditional_sti(struct pt_regs *regs)
-{
- if (regs->flags & X86_EFLAGS_IF)
- local_irq_enable();
-}
+ switch (v) {
+ case 0x70 ... 0x7f: /* Jcc.d8 */
+ addr += 1; /* d8 */
+ *len = addr - start;
+ WARN_ON_ONCE(!lock);
+ return BUG_LOCK;
-static inline void preempt_conditional_sti(struct pt_regs *regs)
-{
- inc_preempt_count();
- if (regs->flags & X86_EFLAGS_IF)
- local_irq_enable();
-}
+ case 0xd6:
+ *len = addr - start;
+ return BUG_UDB;
-static inline void conditional_cli(struct pt_regs *regs)
-{
- if (regs->flags & X86_EFLAGS_IF)
- local_irq_disable();
+ case OPCODE_ESCAPE:
+ break;
+
+ default:
+ return BUG_NONE;
+ }
+
+ v = *(u8 *)(addr++);
+ if (v == SECOND_BYTE_OPCODE_UD2) {
+ *len = addr - start;
+ return BUG_UD2;
+ }
+
+ if (v != SECOND_BYTE_OPCODE_UD1)
+ return BUG_NONE;
+
+ *imm = 0;
+ v = *(u8 *)(addr++); /* ModRM */
+
+ if (X86_MODRM_MOD(v) != 3 && X86_MODRM_RM(v) == 4)
+ addr++; /* SIB */
+
+ reg = X86_MODRM_REG(v) + 8*!!X86_REX_R(rex);
+ rm = X86_MODRM_RM(v) + 8*!!X86_REX_B(rex);
+
+ /* Decode immediate, if present */
+ switch (X86_MODRM_MOD(v)) {
+ case 0: if (X86_MODRM_RM(v) == 5)
+ addr += 4; /* RIP + disp32 */
+
+ if (rm == 0) /* (%eax) */
+ type = BUG_UD1_UBSAN;
+
+ if (rm == 2) { /* (%edx) */
+ *imm = reg;
+ type = BUG_UD1_WARN;
+ }
+ break;
+
+ case 1: *imm = *(s8 *)addr;
+ addr += 1;
+ if (rm == 0) /* (%eax) */
+ type = BUG_UD1_UBSAN;
+ break;
+
+ case 2: *imm = *(s32 *)addr;
+ addr += 4;
+ if (rm == 0) /* (%eax) */
+ type = BUG_UD1_UBSAN;
+ break;
+
+ case 3: break;
+ }
+
+ /* record instruction length */
+ *len = addr - start;
+
+ return type;
}
-static inline void preempt_conditional_cli(struct pt_regs *regs)
+static inline unsigned long pt_regs_val(struct pt_regs *regs, int nr)
{
- if (regs->flags & X86_EFLAGS_IF)
- local_irq_disable();
- dec_preempt_count();
+ int offset = pt_regs_offset(regs, nr);
+ if (WARN_ON_ONCE(offset < 0))
+ return 0;
+ return *((unsigned long *)((void *)regs + offset));
}
-#ifdef CONFIG_X86_32
-static inline void
-die_if_kernel(const char *str, struct pt_regs *regs, long err)
+#ifdef HAVE_ARCH_BUG_FORMAT_ARGS
+DEFINE_STATIC_CALL(WARN_trap, __WARN_trap);
+EXPORT_STATIC_CALL_TRAMP(WARN_trap);
+
+/*
+ * Create a va_list from an exception context.
+ */
+void *__warn_args(struct arch_va_list *args, struct pt_regs *regs)
{
- if (!user_mode_vm(regs))
- die(str, regs, err);
+ /*
+ * Register save area; populate with function call argument registers
+ */
+ args->regs[0] = regs->di;
+ args->regs[1] = regs->si;
+ args->regs[2] = regs->dx;
+ args->regs[3] = regs->cx;
+ args->regs[4] = regs->r8;
+ args->regs[5] = regs->r9;
+
+ /*
+ * From the ABI document:
+ *
+ * @gp_offset - the element holds the offset in bytes from
+ * reg_save_area to the place where the next available general purpose
+ * argument register is saved. In case all argument registers have
+ * been exhausted, it is set to the value 48 (6*8).
+ *
+ * @fp_offset - the element holds the offset in bytes from
+ * reg_save_area to the place where the next available floating point
+ * argument is saved. In case all argument registers have been
+ * exhausted, it is set to the value 176 (6*8 + 8*16)
+ *
+ * @overflow_arg_area - this pointer is used to fetch arguments passed
+ * on the stack. It is initialized with the address of the first
+ * argument passed on the stack, if any, and then always updated to
+ * point to the start of the next argument on the stack.
+ *
+ * @reg_save_area - the element points to the start of the register
+ * save area.
+ *
+ * Notably the vararg starts with the second argument and there are no
+ * floating point arguments in the kernel.
+ */
+ args->args.gp_offset = 1*8;
+ args->args.fp_offset = 6*8 + 8*16;
+ args->args.reg_save_area = &args->regs;
+ args->args.overflow_arg_area = (void *)regs->sp;
+
+ /*
+ * If the exception came from __WARN_trap, there is a return
+ * address on the stack, skip that. This is why any __WARN_trap()
+ * caller must inhibit tail-call optimization.
+ */
+ if ((void *)regs->ip == &__WARN_trap)
+ args->args.overflow_arg_area += 8;
+
+ return &args->args;
}
-#endif
+#endif /* HAVE_ARCH_BUG_FORMAT_ARGS */
-static void __kprobes
-do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
- long error_code, siginfo_t *info)
+static nokprobe_inline int
+do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str,
+ struct pt_regs *regs, long error_code)
{
- struct task_struct *tsk = current;
-
-#ifdef CONFIG_X86_32
- if (regs->flags & X86_VM_MASK) {
+ if (v8086_mode(regs)) {
/*
- * traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
+ * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
* On nmi (interrupt 2), do_trap should not be called.
*/
- if (trapnr < 6)
- goto vm86_trap;
- goto trap_signal;
- }
-#endif
+ if (trapnr < X86_TRAP_UD) {
+ if (!handle_vm86_trap((struct kernel_vm86_regs *) regs,
+ error_code, trapnr))
+ return 0;
+ }
+ } else if (!user_mode(regs)) {
+ if (fixup_exception(regs, trapnr, error_code, 0))
+ return 0;
- if (!user_mode(regs))
- goto kernel_trap;
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_nr = trapnr;
+ die(str, regs, error_code);
+ } else {
+ if (fixup_vdso_exception(regs, trapnr, error_code, 0))
+ return 0;
+ }
-#ifdef CONFIG_X86_32
-trap_signal:
-#endif
/*
- * We want error_code and trap_no set for userspace faults and
+ * We want error_code and trap_nr set for userspace faults and
* kernelspace faults which result in die(), but not
* kernelspace faults which are fixed up. die() gives the
* process no chance to handle the signal and notice the
* kernel fault information, so that won't result in polluting
* the information about previously queued, but not yet
- * delivered, faults. See also do_general_protection below.
+ * delivered, faults. See also exc_general_protection below.
*/
tsk->thread.error_code = error_code;
- tsk->thread.trap_no = trapnr;
+ tsk->thread.trap_nr = trapnr;
-#ifdef CONFIG_X86_64
+ return -1;
+}
+
+static void show_signal(struct task_struct *tsk, int signr,
+ const char *type, const char *desc,
+ struct pt_regs *regs, long error_code)
+{
if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
printk_ratelimit()) {
- printk(KERN_INFO
- "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
- tsk->comm, tsk->pid, str,
- regs->ip, regs->sp, error_code);
- print_vma_addr(" in ", regs->ip);
- printk("\n");
+ pr_info("%s[%d] %s%s ip:%lx sp:%lx error:%lx",
+ tsk->comm, task_pid_nr(tsk), type, desc,
+ regs->ip, regs->sp, error_code);
+ print_vma_addr(KERN_CONT " in ", regs->ip);
+ pr_cont("\n");
}
-#endif
+}
+
+static void
+do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
+ long error_code, int sicode, void __user *addr)
+{
+ struct task_struct *tsk = current;
+
+ if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code))
+ return;
+
+ show_signal(tsk, signr, "trap ", str, regs, error_code);
- if (info)
- force_sig_info(signr, info, tsk);
+ if (!sicode)
+ force_sig(signr);
else
- force_sig(signr, tsk);
- return;
+ force_sig_fault(signr, sicode, addr);
+}
+NOKPROBE_SYMBOL(do_trap);
-kernel_trap:
- if (!fixup_exception(regs)) {
- tsk->thread.error_code = error_code;
- tsk->thread.trap_no = trapnr;
- die(str, regs, error_code);
+static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
+ unsigned long trapnr, int signr, int sicode, void __user *addr)
+{
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
+
+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=
+ NOTIFY_STOP) {
+ cond_local_irq_enable(regs);
+ do_trap(trapnr, signr, str, regs, error_code, sicode, addr);
+ cond_local_irq_disable(regs);
}
- return;
+}
-#ifdef CONFIG_X86_32
-vm86_trap:
- if (handle_vm86_trap((struct kernel_vm86_regs *) regs,
- error_code, trapnr))
- goto trap_signal;
- return;
-#endif
+/*
+ * POSIX requires the address of the faulting instruction to be provided for
+ * SIGILL (#UD) and SIGFPE (#DE) in the si_addr member of siginfo_t.
+ *
+ * This address is usually regs->ip, but when an uprobe moved the code out
+ * of line then regs->ip points to the XOL code which would confuse
+ * anything which analyzes the fault address vs. the unmodified binary. If
+ * a trap happened in XOL code then uprobe maps regs->ip back to the
+ * original instruction address.
+ */
+static __always_inline void __user *error_get_trap_addr(struct pt_regs *regs)
+{
+ return (void __user *)uprobe_get_trap_addr(regs);
}
-#define DO_ERROR(trapnr, signr, str, name) \
-dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
-{ \
- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
- return; \
- conditional_sti(regs); \
- do_trap(trapnr, signr, str, regs, error_code, NULL); \
-}
-
-#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
-dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
-{ \
- siginfo_t info; \
- info.si_signo = signr; \
- info.si_errno = 0; \
- info.si_code = sicode; \
- info.si_addr = (void __user *)siaddr; \
- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
- return; \
- conditional_sti(regs); \
- do_trap(trapnr, signr, str, regs, error_code, &info); \
-}
-
-DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
-DO_ERROR(4, SIGSEGV, "overflow", overflow)
-DO_ERROR(5, SIGSEGV, "bounds", bounds)
-DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
-DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
-DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
-DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
-#ifdef CONFIG_X86_32
-DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
-#endif
-DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
+DEFINE_IDTENTRY(exc_divide_error)
+{
+ do_error_trap(regs, 0, "divide error", X86_TRAP_DE, SIGFPE,
+ FPE_INTDIV, error_get_trap_addr(regs));
+}
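As a sketch of where that FPE_INTDIV / si_addr pair ends up, here is a hypothetical user-space test program (not part of this patch; it assumes nothing beyond POSIX signal handling, and the divide-by-zero is deliberately undefined C used only to provoke #DE on x86):

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>

static void fpe_handler(int sig, siginfo_t *si, void *uc)
{
	/* si_addr carries the (uprobe-corrected) address of the faulting DIV */
	printf("SIGFPE si_code=%d (FPE_INTDIV=%d), faulting insn at %p\n",
	       si->si_code, FPE_INTDIV, si->si_addr);
	/* printf()/exit() are not async-signal-safe; fine for a throwaway test */
	exit(0);
}

int main(void)
{
	struct sigaction sa = {
		.sa_sigaction	= fpe_handler,
		.sa_flags	= SA_SIGINFO,
	};
	volatile int zero = 0;

	sigaction(SIGFPE, &sa, NULL);
	return 1 / zero;	/* #DE -> exc_divide_error() -> SIGFPE */
}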
-#ifdef CONFIG_X86_64
-/* Runs on IST stack */
-dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code)
+DEFINE_IDTENTRY(exc_overflow)
{
- if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
- 12, SIGBUS) == NOTIFY_STOP)
- return;
- preempt_conditional_sti(regs);
- do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL);
- preempt_conditional_cli(regs);
+ do_error_trap(regs, 0, "overflow", X86_TRAP_OF, SIGSEGV, 0, NULL);
}
-dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
+#ifdef CONFIG_X86_F00F_BUG
+void handle_invalid_op(struct pt_regs *regs)
+#else
+static inline void handle_invalid_op(struct pt_regs *regs)
+#endif
{
- static const char str[] = "double fault";
- struct task_struct *tsk = current;
+ do_error_trap(regs, 0, "invalid opcode", X86_TRAP_UD, SIGILL,
+ ILL_ILLOPN, error_get_trap_addr(regs));
+}
- /* Return not checked because double check cannot be ignored */
- notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
+static noinstr bool handle_bug(struct pt_regs *regs)
+{
+ unsigned long addr = regs->ip;
+ bool handled = false;
+ int ud_type, ud_len;
+ s32 ud_imm;
- tsk->thread.error_code = error_code;
- tsk->thread.trap_no = 8;
+ ud_type = decode_bug(addr, &ud_imm, &ud_len);
+ if (ud_type == BUG_NONE)
+ return handled;
/*
- * This is always a kernel trap and never fixable (and thus must
- * never return).
+ * All lies, just get the WARN/BUG out.
*/
- for (;;)
- die(str, regs, error_code);
-}
-#endif
+ instrumentation_begin();
+ /*
+ * Normally @regs are unpoisoned by irqentry_enter(), but handle_bug()
+ * is a rare case that uses @regs without passing them to
+ * irqentry_enter().
+ */
+ kmsan_unpoison_entry_regs(regs);
+ /*
+ * Since we're emulating a CALL with exceptions, restore the interrupt
+ * state to what it was at the exception site.
+ */
+ if (regs->flags & X86_EFLAGS_IF)
+ raw_local_irq_enable();
-dotraplinkage void __kprobes
-do_general_protection(struct pt_regs *regs, long error_code)
-{
- struct task_struct *tsk;
+ switch (ud_type) {
+ case BUG_UD1_WARN:
+ if (report_bug_entry((void *)pt_regs_val(regs, ud_imm), regs) == BUG_TRAP_TYPE_WARN)
+ handled = true;
+ break;
- conditional_sti(regs);
+ case BUG_UD2:
+ if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) {
+ handled = true;
+ break;
+ }
+ fallthrough;
-#ifdef CONFIG_X86_32
- if (regs->flags & X86_VM_MASK)
- goto gp_in_vm86;
-#endif
+ case BUG_UDB:
+ case BUG_LOCK:
+ if (handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) {
+ handled = true;
+ break;
+ }
+ break;
- tsk = current;
- if (!user_mode(regs))
- goto gp_in_kernel;
+ case BUG_UD1_UBSAN:
+ if (IS_ENABLED(CONFIG_UBSAN_TRAP)) {
+ pr_crit("%s at %pS\n",
+ report_ubsan_failure(ud_imm),
+ (void *)regs->ip);
+ }
+ break;
- tsk->thread.error_code = error_code;
- tsk->thread.trap_no = 13;
+ default:
+ break;
+ }
- if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
- printk_ratelimit()) {
- printk(KERN_INFO
- "%s[%d] general protection ip:%lx sp:%lx error:%lx",
- tsk->comm, task_pid_nr(tsk),
- regs->ip, regs->sp, error_code);
- print_vma_addr(" in ", regs->ip);
- printk("\n");
+ /*
+ * When continuing, and regs->ip hasn't changed, move it to the next
+ * instruction. When not continuing execution, restore the instruction
+ * pointer.
+ */
+ if (handled) {
+ if (regs->ip == addr)
+ regs->ip += ud_len;
+ } else {
+ regs->ip = addr;
}
- force_sig(SIGSEGV, tsk);
- return;
+ if (regs->flags & X86_EFLAGS_IF)
+ raw_local_irq_disable();
+ instrumentation_end();
+
+ return handled;
+}
-#ifdef CONFIG_X86_32
-gp_in_vm86:
- local_irq_enable();
- handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
- return;
-#endif
+DEFINE_IDTENTRY_RAW(exc_invalid_op)
+{
+ irqentry_state_t state;
-gp_in_kernel:
- if (fixup_exception(regs))
+ /*
+ * We use UD2 as a short encoding for 'CALL __WARN', as such
+ * handle it before exception entry to avoid recursive WARN
+ * in case exception entry is the one triggering WARNs.
+ */
+ if (!user_mode(regs) && handle_bug(regs))
return;
- tsk->thread.error_code = error_code;
- tsk->thread.trap_no = 13;
- if (notify_die(DIE_GPF, "general protection fault", regs,
- error_code, 13, SIGSEGV) == NOTIFY_STOP)
- return;
- die("general protection fault", regs, error_code);
+ state = irqentry_enter(regs);
+ instrumentation_begin();
+ handle_invalid_op(regs);
+ instrumentation_end();
+ irqentry_exit(regs, state);
}
-static notrace __kprobes void
-mem_parity_error(unsigned char reason, struct pt_regs *regs)
+DEFINE_IDTENTRY(exc_coproc_segment_overrun)
{
- printk(KERN_EMERG
- "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
- reason, smp_processor_id());
+ do_error_trap(regs, 0, "coprocessor segment overrun",
+ X86_TRAP_OLD_MF, SIGFPE, 0, NULL);
+}
- printk(KERN_EMERG
- "You have some hardware problem, likely on the PCI bus.\n");
+DEFINE_IDTENTRY_ERRORCODE(exc_invalid_tss)
+{
+ do_error_trap(regs, error_code, "invalid TSS", X86_TRAP_TS, SIGSEGV,
+ 0, NULL);
+}
-#if defined(CONFIG_EDAC)
- if (edac_handler_set()) {
- edac_atomic_assert_error();
+DEFINE_IDTENTRY_ERRORCODE(exc_segment_not_present)
+{
+ do_error_trap(regs, error_code, "segment not present", X86_TRAP_NP,
+ SIGBUS, 0, NULL);
+}
+
+DEFINE_IDTENTRY_ERRORCODE(exc_stack_segment)
+{
+ do_error_trap(regs, error_code, "stack segment", X86_TRAP_SS, SIGBUS,
+ 0, NULL);
+}
+
+DEFINE_IDTENTRY_ERRORCODE(exc_alignment_check)
+{
+ char *str = "alignment check";
+
+ if (notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_AC, SIGBUS) == NOTIFY_STOP)
return;
- }
-#endif
- if (panic_on_unrecovered_nmi)
- panic("NMI: Not continuing");
+ if (!user_mode(regs))
+ die("Split lock detected\n", regs, error_code);
- printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
+ local_irq_enable();
- /* Clear and disable the memory parity error line. */
- reason = (reason & 0xf) | 4;
- outb(reason, 0x61);
-}
+ if (handle_user_split_lock(regs, error_code))
+ goto out;
-static notrace __kprobes void
-io_check_error(unsigned char reason, struct pt_regs *regs)
-{
- unsigned long i;
+ do_trap(X86_TRAP_AC, SIGBUS, "alignment check", regs,
+ error_code, BUS_ADRALN, NULL);
- printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
- show_registers(regs);
+out:
+ local_irq_disable();
+}
- if (panic_on_io_nmi)
- panic("NMI IOCK error: Not continuing");
+#ifdef CONFIG_VMAP_STACK
+__visible void __noreturn handle_stack_overflow(struct pt_regs *regs,
+ unsigned long fault_address,
+ struct stack_info *info)
+{
+ const char *name = stack_type_name(info->type);
- /* Re-enable the IOCK line, wait for a few seconds */
- reason = (reason & 0xf) | 8;
- outb(reason, 0x61);
+ printk(KERN_EMERG "BUG: %s stack guard page was hit at %p (stack is %p..%p)\n",
+ name, (void *)fault_address, info->begin, info->end);
- i = 2000;
- while (--i)
- udelay(1000);
+ die("stack guard page", regs, 0);
- reason &= ~8;
- outb(reason, 0x61);
+ /* Be absolutely certain we don't return. */
+ panic("%s stack guard hit", name);
}
+#endif
+
+/*
+ * Prevent the compiler and/or objtool from marking the !CONFIG_X86_ESPFIX64
+ * version of exc_double_fault() as noreturn. Otherwise the noreturn mismatch
+ * between configs triggers objtool warnings.
+ *
+ * This is a temporary hack until we have compiler or plugin support for
+ * annotating noreturns.
+ */
+#ifdef CONFIG_X86_ESPFIX64
+#define always_true() true
+#else
+bool always_true(void);
+bool __weak always_true(void) { return true; }
+#endif
-static notrace __kprobes void
-unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
+/*
+ * Runs on an IST stack for x86_64 and on a special task stack for x86_32.
+ *
+ * On x86_64, this is more or less a normal kernel entry. Notwithstanding the
+ * SDM's warnings about double faults being unrecoverable, returning works as
+ * expected. Presumably what the SDM actually means is that the CPU may get
+ * the register state wrong on entry, so returning could be a bad idea.
+ *
+ * Various CPU engineers have promised that double faults due to an IRET fault
+ * while the stack is read-only are, in fact, recoverable.
+ *
+ * On x86_32, this is entered through a task gate, and regs are synthesized
+ * from the TSS. Returning is, in principle, okay, but changes to regs will
+ * be lost. If, for some reason, we need to return to a context with modified
+ * regs, the shim code could be adjusted to synchronize the registers.
+ *
+ * The 32bit #DF shim provides CR2 already as an argument. On 64bit it needs
+ * to be read before doing anything else.
+ */
+DEFINE_IDTENTRY_DF(exc_double_fault)
{
- if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) ==
- NOTIFY_STOP)
- return;
-#ifdef CONFIG_MCA
+ static const char str[] = "double fault";
+ struct task_struct *tsk = current;
+
+#ifdef CONFIG_VMAP_STACK
+ unsigned long address = read_cr2();
+ struct stack_info info;
+#endif
+
+#ifdef CONFIG_X86_ESPFIX64
+ extern unsigned char native_irq_return_iret[];
+
/*
- * Might actually be able to figure out what the guilty party
- * is:
+ * If IRET takes a non-IST fault on the espfix64 stack, then we
+ * end up promoting it to a doublefault. In that case, take
+ * advantage of the fact that we're not using the normal (TSS.sp0)
+ * stack right now. We can write a fake #GP(0) frame at TSS.sp0
+ * and then modify our own IRET frame so that, when we return,
+ * we land directly at the #GP(0) vector with the stack already
+ * set up according to its expectations.
+ *
+ * The net result is that our #GP handler will think that we
+ * entered from usermode with the bad user context.
+ *
+ * No need for nmi_enter() here because we don't use RCU.
*/
- if (MCA_bus) {
- mca_handle_nmi();
+ if (((long)regs->sp >> P4D_SHIFT) == ESPFIX_PGD_ENTRY &&
+ regs->cs == __KERNEL_CS &&
+ regs->ip == (unsigned long)native_irq_return_iret)
+ {
+ struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
+ unsigned long *p = (unsigned long *)regs->sp;
+
+ /*
+ * regs->sp points to the failing IRET frame on the
+ * ESPFIX64 stack. Copy it to the entry stack. This fills
+ * in gpregs->ss through gpregs->ip.
+ *
+ */
+ gpregs->ip = p[0];
+ gpregs->cs = p[1];
+ gpregs->flags = p[2];
+ gpregs->sp = p[3];
+ gpregs->ss = p[4];
+ gpregs->orig_ax = 0; /* Missing (lost) #GP error code */
+
+ /*
+ * Adjust our frame so that we return straight to the #GP
+ * vector with the expected RSP value. This is safe because
+ * we won't enable interrupts or schedule before we invoke
+ * general_protection, so nothing will clobber the stack
+ * frame we just set up.
+ *
+ * We will enter general_protection with kernel GSBASE,
+ * which is what the stub expects, given that the faulting
+ * RIP will be the IRET instruction.
+ */
+ regs->ip = (unsigned long)asm_exc_general_protection;
+ regs->sp = (unsigned long)&gpregs->orig_ax;
+
return;
}
#endif
- printk(KERN_EMERG
- "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
- reason, smp_processor_id());
- printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
- if (panic_on_unrecovered_nmi)
- panic("NMI: Not continuing");
+ irqentry_nmi_enter(regs);
+ instrumentation_begin();
+ notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
+
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_nr = X86_TRAP_DF;
+
+#ifdef CONFIG_VMAP_STACK
+ /*
+ * If we overflow the stack into a guard page, the CPU will fail
+ * to deliver #PF and will send #DF instead. Similarly, if we
+ * take any non-IST exception while too close to the bottom of
+ * the stack, the processor will get a page fault while
+ * delivering the exception and will generate a double fault.
+ *
+ * According to the SDM (footnote in 6.15 under "Interrupt 14 -
+	 * Page-Fault Exception (#PF)"):
+ *
+ * Processors update CR2 whenever a page fault is detected. If a
+ * second page fault occurs while an earlier page fault is being
+ * delivered, the faulting linear address of the second fault will
+ * overwrite the contents of CR2 (replacing the previous
+ * address). These updates to CR2 occur even if the page fault
+ * results in a double fault or occurs during the delivery of a
+ * double fault.
+ *
+ * The logic below has a small possibility of incorrectly diagnosing
+ * some errors as stack overflows. For example, if the IDT or GDT
+ * gets corrupted such that #GP delivery fails due to a bad descriptor
+ * causing #GP and we hit this condition while CR2 coincidentally
+ * points to the stack guard page, we'll think we overflowed the
+ * stack. Given that we're going to panic one way or another
+ * if this happens, this isn't necessarily worth fixing.
+ *
+ * If necessary, we could improve the test by only diagnosing
+ * a stack overflow if the saved RSP points within 47 bytes of
+ * the bottom of the stack: if RSP == tsk_stack + 48 and we
+ * take an exception, the stack is already aligned and there
+	 * will be enough room for SS, RSP, RFLAGS, CS, RIP, and a
+ * possible error code, so a stack overflow would *not* double
+ * fault. With any less space left, exception delivery could
+ * fail, and, as a practical matter, we've overflowed the
+ * stack even if the actual trigger for the double fault was
+ * something else.
+ */
+ if (get_stack_guard_info((void *)address, &info))
+ handle_stack_overflow(regs, address, &info);
+#endif
- printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
+ pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code);
+ die("double fault", regs, error_code);
+ if (always_true())
+ panic("Machine halted.");
+ instrumentation_end();
}
-static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
+DEFINE_IDTENTRY(exc_bounds)
{
- unsigned char reason = 0;
- int cpu;
+ if (notify_die(DIE_TRAP, "bounds", regs, 0,
+ X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP)
+ return;
+ cond_local_irq_enable(regs);
- cpu = smp_processor_id();
+ if (!user_mode(regs))
+ die("bounds", regs, 0);
- /* Only the BSP gets external NMIs from the system. */
- if (!cpu)
- reason = get_nmi_reason();
+ do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, 0, 0, NULL);
- if (!(reason & 0xc0)) {
- if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
- == NOTIFY_STOP)
- return;
-#ifdef CONFIG_X86_LOCAL_APIC
- /*
- * Ok, so this is none of the documented NMI sources,
- * so it must be the NMI watchdog.
- */
- if (nmi_watchdog_tick(regs, reason))
- return;
- if (!do_nmi_callback(regs, cpu))
- unknown_nmi_error(reason, regs);
-#else
- unknown_nmi_error(reason, regs);
+ cond_local_irq_disable(regs);
+}
+
+enum kernel_gp_hint {
+ GP_NO_HINT,
+ GP_NON_CANONICAL,
+ GP_CANONICAL,
+ GP_LASS_VIOLATION,
+ GP_NULL_POINTER,
+};
+
+static const char * const kernel_gp_hint_help[] = {
+ [GP_NON_CANONICAL] = "probably for non-canonical address",
+ [GP_CANONICAL] = "maybe for address",
+ [GP_LASS_VIOLATION] = "probably LASS violation for address",
+ [GP_NULL_POINTER] = "kernel NULL pointer dereference",
+};
+
+/*
+ * When an uncaught #GP occurs, try to determine the memory address accessed by
+ * the instruction and return that address to the caller. Also, try to figure
+ * out whether any part of the access to that address was non-canonical or
+ * across privilege levels.
+ */
+static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs,
+ unsigned long *addr)
+{
+ u8 insn_buf[MAX_INSN_SIZE];
+ struct insn insn;
+ int ret;
+
+ if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip,
+ MAX_INSN_SIZE))
+ return GP_NO_HINT;
+
+ ret = insn_decode_kernel(&insn, insn_buf);
+ if (ret < 0)
+ return GP_NO_HINT;
+
+ *addr = (unsigned long)insn_get_addr_ref(&insn, regs);
+ if (*addr == -1UL)
+ return GP_NO_HINT;
+
+#ifdef CONFIG_X86_64
+ /* Operand is in the kernel half */
+ if (*addr >= ~__VIRTUAL_MASK)
+ return GP_CANONICAL;
+
+ /* The last byte of the operand is not in the user canonical half */
+ if (*addr + insn.opnd_bytes - 1 > __VIRTUAL_MASK)
+ return GP_NON_CANONICAL;
+
+ /*
+ * A NULL pointer dereference usually causes a #PF. However, it
+ * can result in a #GP when LASS is active. Provide the same
+ * hint in the rare case that the condition is hit without LASS.
+ */
+ if (*addr < PAGE_SIZE)
+ return GP_NULL_POINTER;
+
+ /*
+ * Assume that LASS caused the exception, because the address is
+ * canonical and in the user half.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_LASS))
+ return GP_LASS_VIOLATION;
#endif
- return;
+ return GP_CANONICAL;
+}
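To make the canonicality checks above concrete, here is a stand-alone user-space sketch. It assumes 4-level paging, where the kernel's __VIRTUAL_MASK covers bits 0-46; VIRTUAL_MASK below is a local stand-in for that symbol, not the kernel definition itself:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define VIRTUAL_MASK	((1UL << 47) - 1)	/* user half: 0 .. 0x00007fffffffffff */

static bool is_kernel_half(uint64_t addr)
{
	return addr >= ~VIRTUAL_MASK;		/* 0xffff800000000000 and up */
}

static bool is_non_canonical(uint64_t addr, unsigned int opnd_bytes)
{
	/* last byte of the operand falls outside the user canonical half */
	return addr + opnd_bytes - 1 > VIRTUAL_MASK;
}

int main(void)
{
	printf("%d\n", is_non_canonical(0x00007ffffffffffeUL, 4));	/* 1: straddles the hole */
	printf("%d\n", is_non_canonical(0x0000400000000000UL, 8));	/* 0: canonical user address */
	printf("%d\n", is_kernel_half(0xffffffff81000000UL));		/* 1: typical kernel text */
	return 0;
}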
+
+#define GPFSTR "general protection fault"
+
+static bool fixup_iopl_exception(struct pt_regs *regs)
+{
+ struct thread_struct *t = &current->thread;
+ unsigned char byte;
+ unsigned long ip;
+
+ if (!IS_ENABLED(CONFIG_X86_IOPL_IOPERM) || t->iopl_emul != 3)
+ return false;
+
+ if (insn_get_effective_ip(regs, &ip))
+ return false;
+
+ if (get_user(byte, (const char __user *)ip))
+ return false;
+
+ if (byte != 0xfa && byte != 0xfb)
+ return false;
+
+ if (!t->iopl_warn && printk_ratelimit()) {
+ pr_err("%s[%d] attempts to use CLI/STI, pretending it's a NOP, ip:%lx",
+ current->comm, task_pid_nr(current), ip);
+ print_vma_addr(KERN_CONT " in ", ip);
+ pr_cont("\n");
+ t->iopl_warn = 1;
}
- if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
- return;
- /* AK: following checks seem to be broken on modern chipsets. FIXME */
- if (reason & 0x80)
- mem_parity_error(reason, regs);
- if (reason & 0x40)
- io_check_error(reason, regs);
-#ifdef CONFIG_X86_32
+ regs->ip += 1;
+ return true;
+}
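A hypothetical user-space demonstration of this path, assuming a kernel built with CONFIG_X86_IOPL_IOPERM=y (where iopl(3) only arms the emulation and leaves EFLAGS.IOPL alone, so CLI/STI still fault):

#include <stdio.h>
#include <sys/io.h>

int main(void)
{
	if (iopl(3)) {			/* needs CAP_SYS_RAWIO */
		perror("iopl");
		return 1;
	}
	asm volatile("cli");		/* opcode 0xfa: #GP, emulated as a NOP above */
	asm volatile("sti");		/* opcode 0xfb: likewise */
	puts("survived CLI/STI");
	return 0;
}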
+
+/*
+ * The unprivileged ENQCMD instruction generates #GPs if the
+ * IA32_PASID MSR has not been populated. If possible, populate
+ * the MSR from a PASID previously allocated to the mm.
+ */
+static bool try_fixup_enqcmd_gp(void)
+{
+#ifdef CONFIG_ARCH_HAS_CPU_PASID
+ u32 pasid;
+
/*
- * Reassert NMI in case it became active meanwhile
- * as it's edge-triggered:
+ * MSR_IA32_PASID is managed using XSAVE. Directly
+ * writing to the MSR is only possible when fpregs
+ * are valid and the fpstate is not. This is
+ * guaranteed when handling a userspace exception
+	 * *before* interrupts are re-enabled.
*/
- reassert_nmi();
+ lockdep_assert_irqs_disabled();
+
+ /*
+ * Hardware without ENQCMD will not generate
+ * #GPs that can be fixed up here.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_ENQCMD))
+ return false;
+
+ /*
+ * If the mm has not been allocated a
+ * PASID, the #GP can not be fixed up.
+ */
+ if (!mm_valid_pasid(current->mm))
+ return false;
+
+ pasid = mm_get_enqcmd_pasid(current->mm);
+
+ /*
+ * Did this thread already have its PASID activated?
+ * If so, the #GP must be from something else.
+ */
+ if (current->pasid_activated)
+ return false;
+
+ wrmsrq(MSR_IA32_PASID, pasid | MSR_IA32_PASID_VALID);
+ current->pasid_activated = 1;
+
+ return true;
+#else
+ return false;
#endif
}
-dotraplinkage notrace __kprobes void
-do_nmi(struct pt_regs *regs, long error_code)
+static bool gp_try_fixup_and_notify(struct pt_regs *regs, int trapnr,
+ unsigned long error_code, const char *str,
+ unsigned long address)
{
- nmi_enter();
+ if (fixup_exception(regs, trapnr, error_code, address))
+ return true;
- inc_irq_stat(__nmi_count);
+ current->thread.error_code = error_code;
+ current->thread.trap_nr = trapnr;
- if (!ignore_nmis)
- default_do_nmi(regs);
+ /*
+ * To be potentially processing a kprobe fault and to trust the result
+ * from kprobe_running(), we have to be non-preemptible.
+ */
+ if (!preemptible() && kprobe_running() &&
+ kprobe_fault_handler(regs, trapnr))
+ return true;
- nmi_exit();
+ return notify_die(DIE_GPF, str, regs, error_code, trapnr, SIGSEGV) == NOTIFY_STOP;
}
-void stop_nmi(void)
+static void gp_user_force_sig_segv(struct pt_regs *regs, int trapnr,
+ unsigned long error_code, const char *str)
{
- acpi_nmi_disable();
- ignore_nmis++;
+ current->thread.error_code = error_code;
+ current->thread.trap_nr = trapnr;
+ show_signal(current, SIGSEGV, "", str, regs, error_code);
+ force_sig(SIGSEGV);
}
-void restart_nmi(void)
+DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
{
- ignore_nmis--;
- acpi_nmi_enable();
+ char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR;
+ enum kernel_gp_hint hint = GP_NO_HINT;
+ unsigned long gp_addr;
+
+ if (user_mode(regs) && try_fixup_enqcmd_gp())
+ return;
+
+ cond_local_irq_enable(regs);
+
+ if (static_cpu_has(X86_FEATURE_UMIP)) {
+ if (user_mode(regs) && fixup_umip_exception(regs))
+ goto exit;
+ }
+
+ if (v8086_mode(regs)) {
+ local_irq_enable();
+ handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
+ local_irq_disable();
+ return;
+ }
+
+ if (user_mode(regs)) {
+ if (fixup_iopl_exception(regs))
+ goto exit;
+
+ if (fixup_vdso_exception(regs, X86_TRAP_GP, error_code, 0))
+ goto exit;
+
+ gp_user_force_sig_segv(regs, X86_TRAP_GP, error_code, desc);
+ goto exit;
+ }
+
+ if (gp_try_fixup_and_notify(regs, X86_TRAP_GP, error_code, desc, 0))
+ goto exit;
+
+ if (error_code)
+ snprintf(desc, sizeof(desc), "segment-related " GPFSTR);
+ else
+ hint = get_kernel_gp_address(regs, &gp_addr);
+
+ if (hint != GP_NO_HINT)
+ snprintf(desc, sizeof(desc), GPFSTR ", %s 0x%lx",
+ kernel_gp_hint_help[hint], gp_addr);
+
+ /*
+ * KASAN is interested only in the non-canonical case, clear it
+ * otherwise.
+ */
+ if (hint != GP_NON_CANONICAL)
+ gp_addr = 0;
+
+ die_addr(desc, regs, error_code, gp_addr);
+
+exit:
+ cond_local_irq_disable(regs);
}
-/* May run on IST stack. */
-dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
+static bool do_int3(struct pt_regs *regs)
{
+ int res;
+
+#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
+ if (kgdb_ll_trap(DIE_INT3, "int3", regs, 0, X86_TRAP_BP,
+ SIGTRAP) == NOTIFY_STOP)
+ return true;
+#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
+
#ifdef CONFIG_KPROBES
- if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
- == NOTIFY_STOP)
+ if (kprobe_int3_handler(regs))
+ return true;
+#endif
+ res = notify_die(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, SIGTRAP);
+
+ return res == NOTIFY_STOP;
+}
+NOKPROBE_SYMBOL(do_int3);
+
+static void do_int3_user(struct pt_regs *regs)
+{
+ if (do_int3(regs))
return;
-#else
- if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP)
- == NOTIFY_STOP)
+
+ cond_local_irq_enable(regs);
+ do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, 0, 0, NULL);
+ cond_local_irq_disable(regs);
+}
+
+DEFINE_IDTENTRY_RAW(exc_int3)
+{
+ /*
+	 * smp_text_poke_int3_handler() is completely self-contained code; it does (and
+	 * must) *NOT* call out to anything, lest it hit upon yet another
+ * INT3.
+ */
+ if (smp_text_poke_int3_handler(regs))
return;
-#endif
- preempt_conditional_sti(regs);
- do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
- preempt_conditional_cli(regs);
+ /*
+ * irqentry_enter_from_user_mode() uses static_branch_{,un}likely()
+ * and therefore can trigger INT3, hence smp_text_poke_int3_handler() must
+ * be done before. If the entry came from kernel mode, then use
+ * nmi_enter() because the INT3 could have been hit in any context
+ * including NMI.
+ */
+ if (user_mode(regs)) {
+ irqentry_enter_from_user_mode(regs);
+ instrumentation_begin();
+ do_int3_user(regs);
+ instrumentation_end();
+ irqentry_exit_to_user_mode(regs);
+ } else {
+ irqentry_state_t irq_state = irqentry_nmi_enter(regs);
+
+ instrumentation_begin();
+ if (!do_int3(regs))
+ die("int3", regs, 0);
+ instrumentation_end();
+ irqentry_nmi_exit(regs, irq_state);
+ }
}
#ifdef CONFIG_X86_64
/*
- * Help handler running on IST stack to switch back to user stack
- * for scheduling or signal handling. The actual stack switch is done in
- * entry.S
+ * Help handler running on a per-cpu (IST or entry trampoline) stack
+ * to switch to the normal thread stack if the interrupted code was in
+ * user mode. The actual stack switch is done in entry_64.S
*/
-asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
-{
- struct pt_regs *regs = eregs;
- /* Did already sync */
- if (eregs == (struct pt_regs *)eregs->sp)
- ;
- /* Exception from user space */
- else if (user_mode(eregs))
- regs = task_pt_regs(current);
- /*
- * Exception from kernel and interrupts are enabled. Move to
- * kernel process stack.
- */
- else if (eregs->flags & X86_EFLAGS_IF)
- regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
- if (eregs != regs)
+asmlinkage __visible noinstr struct pt_regs *sync_regs(struct pt_regs *eregs)
+{
+ struct pt_regs *regs = (struct pt_regs *)current_top_of_stack() - 1;
+ if (regs != eregs)
*regs = *eregs;
return regs;
}
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *regs)
+{
+ unsigned long sp, *stack;
+ struct stack_info info;
+ struct pt_regs *regs_ret;
+
+ /*
+ * In the SYSCALL entry path the RSP value comes from user-space - don't
+ * trust it and switch to the current kernel stack
+ */
+ if (ip_within_syscall_gap(regs)) {
+ sp = current_top_of_stack();
+ goto sync;
+ }
+
+ /*
+ * From here on the RSP value is trusted. Now check whether entry
+ * happened from a safe stack. Not safe are the entry or unknown stacks,
+ * use the fall-back stack instead in this case.
+ */
+ sp = regs->sp;
+ stack = (unsigned long *)sp;
+
+ if (!get_stack_info_noinstr(stack, current, &info) || info.type == STACK_TYPE_ENTRY ||
+ info.type > STACK_TYPE_EXCEPTION_LAST)
+ sp = __this_cpu_ist_top_va(VC2);
+
+sync:
+ /*
+ * Found a safe stack - switch to it as if the entry didn't happen via
+ * IST stack. The code below only copies pt_regs, the real switch happens
+ * in assembly code.
+ */
+ sp = ALIGN_DOWN(sp, 8) - sizeof(*regs_ret);
+
+ regs_ret = (struct pt_regs *)sp;
+ *regs_ret = *regs;
+
+ return regs_ret;
+}
#endif
+asmlinkage __visible noinstr struct pt_regs *fixup_bad_iret(struct pt_regs *bad_regs)
+{
+ struct pt_regs tmp, *new_stack;
+
+ /*
+ * This is called from entry_64.S early in handling a fault
+ * caused by a bad iret to user mode. To handle the fault
+ * correctly, we want to move our stack frame to where it would
+ * be had we entered directly on the entry stack (rather than
+ * just below the IRET frame) and we want to pretend that the
+ * exception came from the IRET target.
+ */
+ new_stack = (struct pt_regs *)__this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
+
+ /* Copy the IRET target to the temporary storage. */
+ __memcpy(&tmp.ip, (void *)bad_regs->sp, 5*8);
+
+ /* Copy the remainder of the stack from the current stack. */
+ __memcpy(&tmp, bad_regs, offsetof(struct pt_regs, ip));
+
+ /* Update the entry stack */
+ __memcpy(new_stack, &tmp, sizeof(tmp));
+
+ BUG_ON(!user_mode(new_stack));
+ return new_stack;
+}
+#endif
+
+static bool is_sysenter_singlestep(struct pt_regs *regs)
+{
+ /*
+ * We don't try for precision here. If we're anywhere in the region of
+ * code that can be single-stepped in the SYSENTER entry path, then
+ * assume that this is a useless single-step trap due to SYSENTER
+ * being invoked with TF set. (We don't know in advance exactly
+ * which instructions will be hit because BTF could plausibly
+ * be set.)
+ */
+#ifdef CONFIG_X86_32
+ return (regs->ip - (unsigned long)__begin_SYSENTER_singlestep_region) <
+ (unsigned long)__end_SYSENTER_singlestep_region -
+ (unsigned long)__begin_SYSENTER_singlestep_region;
+#elif defined(CONFIG_IA32_EMULATION)
+ return (regs->ip - (unsigned long)entry_SYSENTER_compat) <
+ (unsigned long)__end_entry_SYSENTER_compat -
+ (unsigned long)entry_SYSENTER_compat;
+#else
+ return false;
+#endif
+}
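The bounds test above folds the "ip >= begin" and "ip < end" comparisons into a single unsigned compare by relying on wrap-around; a stand-alone illustration:

#include <stdbool.h>
#include <stdio.h>

static bool in_range(unsigned long ip, unsigned long begin, unsigned long end)
{
	/*
	 * If ip < begin, (ip - begin) wraps to a huge value and the test
	 * fails; if ip >= end, the difference is >= (end - begin) and it
	 * also fails.  Only begin <= ip < end passes.
	 */
	return (ip - begin) < (end - begin);
}

int main(void)
{
	printf("%d %d %d\n",
	       in_range(0x1005, 0x1000, 0x1010),	/* 1: inside */
	       in_range(0x0fff, 0x1000, 0x1010),	/* 0: below, wraps */
	       in_range(0x1010, 0x1000, 0x1010));	/* 0: one past the end */
	return 0;
}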
+
+static __always_inline unsigned long debug_read_reset_dr6(void)
+{
+ unsigned long dr6;
+
+ get_debugreg(dr6, 6);
+ dr6 ^= DR6_RESERVED; /* Flip to positive polarity */
+
+ /*
+ * The Intel SDM says:
+ *
+ * Certain debug exceptions may clear bits 0-3 of DR6.
+ *
+ * BLD induced #DB clears DR6.BLD and any other debug
+ * exception doesn't modify DR6.BLD.
+ *
+ * RTM induced #DB clears DR6.RTM and any other debug
+ * exception sets DR6.RTM.
+ *
+ * To avoid confusion in identifying debug exceptions,
+ * debug handlers should set DR6.BLD and DR6.RTM, and
+ * clear other DR6 bits before returning.
+ *
+ * Keep it simple: write DR6 with its architectural reset
+ * value 0xFFFF0FF0, defined as DR6_RESERVED, immediately.
+ */
+ set_debugreg(DR6_RESERVED, 6);
+
+ return dr6;
+}
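A worked example of the polarity flip (stand-alone sketch; the constants below are redefined locally and mirror the kernel's DR6_RESERVED and DR_STEP values):

#include <stdio.h>

#define DR6_RESERVED	0xFFFF0FF0UL	/* architectural DR6 reset value */
#define DR_STEP		0x4000UL	/* BS: single-step, reads active high after the flip */
#define DR_TRAP0	0x0001UL	/* B0: breakpoint 0 */

int main(void)
{
	unsigned long raw = 0xFFFF4FF0UL;	/* hardware DR6 after a single-step #DB */
	unsigned long dr6 = raw ^ DR6_RESERVED;	/* flip to positive polarity */

	printf("dr6=%#lx step=%d bp0=%d\n",
	       dr6, !!(dr6 & DR_STEP), !!(dr6 & DR_TRAP0));
	/* prints: dr6=0x4000 step=1 bp0=0 */
	return 0;
}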
+
/*
* Our handling of the processor debug registers is non-trivial.
* We do not clear them on entry and exit from the kernel. Therefore
@@ -529,458 +1195,502 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
*
* May run on IST stack.
*/
-dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
-{
- struct task_struct *tsk = current;
- unsigned long condition;
- int si_code;
- get_debugreg(condition, 6);
+static bool notify_debug(struct pt_regs *regs, unsigned long *dr6)
+{
+ /*
+ * Notifiers will clear bits in @dr6 to indicate the event has been
+ * consumed - hw_breakpoint_handler(), single_stop_cont().
+ *
+ * Notifiers will set bits in @virtual_dr6 to indicate the desire
+ * for signals - ptrace_triggered(), kgdb_hw_overflow_handler().
+ */
+ if (notify_die(DIE_DEBUG, "debug", regs, (long)dr6, 0, SIGTRAP) == NOTIFY_STOP)
+ return true;
- /* Catch kmemcheck conditions first of all! */
- if (condition & DR_STEP && kmemcheck_trap(regs))
- return;
+ return false;
+}
+static noinstr void exc_debug_kernel(struct pt_regs *regs, unsigned long dr6)
+{
/*
- * The processor cleared BTF, so don't mark that we need it set.
+ * Disable breakpoints during exception handling; recursive exceptions
+ * are exceedingly 'fun'.
+ *
+ * Since this function is NOKPROBE, and that also applies to
+ * HW_BREAKPOINT_X, we can't hit a breakpoint before this (XXX except a
+ * HW_BREAKPOINT_W on our stack)
+ *
+	 * Entry text is excluded for HW_BP_X, and cpu_entry_area, which
+	 * includes the entry stack, is excluded for everything.
+ *
+ * For FRED, nested #DB should just work fine. But when a watchpoint or
+	 * breakpoint is set in the code path which is executed by the #DB handler,
+ * it results in an endless recursion and stack overflow. Thus we stay
+ * with the IDT approach, i.e., save DR7 and disable #DB.
*/
- clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
- tsk->thread.debugctlmsr = 0;
+ unsigned long dr7 = local_db_save();
+ irqentry_state_t irq_state = irqentry_nmi_enter(regs);
+ instrumentation_begin();
- if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
- SIGTRAP) == NOTIFY_STOP)
- return;
+ /*
+ * If something gets miswired and we end up here for a user mode
+ * #DB, we will malfunction.
+ */
+ WARN_ON_ONCE(user_mode(regs));
- /* It's safe to allow irq's after DR6 has been saved */
- preempt_conditional_sti(regs);
+ if (test_thread_flag(TIF_BLOCKSTEP)) {
+ /*
+ * The SDM says "The processor clears the BTF flag when it
+		 * generates a debug exception", but PTRACE_BLOCKSTEP requested
+		 * it for userspace; since we just took a kernel #DB, re-set
+ * BTF.
+ */
+ unsigned long debugctl;
- /* Mask out spurious debug traps due to lazy DR7 setting */
- if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
- if (!tsk->thread.debugreg7)
- goto clear_dr7;
+ rdmsrq(MSR_IA32_DEBUGCTLMSR, debugctl);
+ debugctl |= DEBUGCTLMSR_BTF;
+ wrmsrq(MSR_IA32_DEBUGCTLMSR, debugctl);
}
-#ifdef CONFIG_X86_32
- if (regs->flags & X86_VM_MASK)
- goto debug_vm86;
-#endif
-
- /* Save debug status register where ptrace can see it */
- tsk->thread.debugreg6 = condition;
+ /*
+ * Catch SYSENTER with TF set and clear DR_STEP. If this hit a
+ * watchpoint at the same time then that will still be handled.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_FRED) &&
+ (dr6 & DR_STEP) && is_sysenter_singlestep(regs))
+ dr6 &= ~DR_STEP;
/*
- * Single-stepping through TF: make sure we ignore any events in
- * kernel space (but re-enable TF when returning to user mode).
+ * The kernel doesn't use INT1
*/
- if (condition & DR_STEP) {
- if (!user_mode(regs))
- goto clear_TF_reenable;
- }
+ if (!dr6)
+ goto out;
- si_code = get_si_code(condition);
- /* Ok, finally something we can handle */
- send_sigtrap(tsk, regs, error_code, si_code);
+ if (notify_debug(regs, &dr6))
+ goto out;
/*
- * Disable additional traps. They'll be re-enabled when
- * the signal is delivered.
+ * The kernel doesn't use TF single-step outside of:
+ *
+ * - Kprobes, consumed through kprobe_debug_handler()
+ * - KGDB, consumed through notify_debug()
+ *
+ * So if we get here with DR_STEP set, something is wonky.
+ *
+ * A known way to trigger this is through QEMU's GDB stub,
+ * which leaks #DB into the guest and causes IST recursion.
*/
-clear_dr7:
- set_debugreg(0, 7);
- preempt_conditional_cli(regs);
- return;
+ if (WARN_ON_ONCE(dr6 & DR_STEP))
+ regs->flags &= ~X86_EFLAGS_TF;
+out:
+ instrumentation_end();
+ irqentry_nmi_exit(regs, irq_state);
-#ifdef CONFIG_X86_32
-debug_vm86:
- /* reenable preemption: handle_vm86_trap() might sleep */
- dec_preempt_count();
- handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
- conditional_cli(regs);
- return;
-#endif
-
-clear_TF_reenable:
- set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
- regs->flags &= ~X86_EFLAGS_TF;
- preempt_conditional_cli(regs);
- return;
+ local_db_restore(dr7);
}
-#ifdef CONFIG_X86_64
-static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
+static noinstr void exc_debug_user(struct pt_regs *regs, unsigned long dr6)
{
- if (fixup_exception(regs))
- return 1;
+ bool icebp;
- notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
- /* Illegal floating point operation in the kernel */
- current->thread.trap_no = trapnr;
- die(str, regs, 0);
- return 0;
-}
-#endif
+ /*
+ * If something gets miswired and we end up here for a kernel mode
+ * #DB, we will malfunction.
+ */
+ WARN_ON_ONCE(!user_mode(regs));
-/*
- * Note that we play around with the 'TS' bit in an attempt to get
- * the correct behaviour even in the presence of the asynchronous
- * IRQ13 behaviour
- */
-void math_error(void __user *ip)
-{
- struct task_struct *task;
- siginfo_t info;
- unsigned short cwd, swd, err;
+ /*
+ * NB: We can't easily clear DR7 here because
+ * irqentry_exit_to_usermode() can invoke ptrace, schedule, access
+ * user memory, etc. This means that a recursive #DB is possible. If
+ * this happens, that #DB will hit exc_debug_kernel() and clear DR7.
+ * Since we're not on the IST stack right now, everything will be
+ * fine.
+ */
+
+ irqentry_enter_from_user_mode(regs);
+ instrumentation_begin();
/*
- * Save the info for the exception handler and clear the error.
+ * Start the virtual/ptrace DR6 value with just the DR_STEP mask
+ * of the real DR6. ptrace_triggered() will set the DR_TRAPn bits.
+ *
+ * Userspace expects DR_STEP to be visible in ptrace_get_debugreg(6)
+ * even if it is not the result of PTRACE_SINGLESTEP.
*/
- task = current;
- save_init_fpu(task);
- task->thread.trap_no = 16;
- task->thread.error_code = 0;
- info.si_signo = SIGFPE;
- info.si_errno = 0;
- info.si_addr = ip;
+ current->thread.virtual_dr6 = (dr6 & DR_STEP);
+
/*
- * (~cwd & swd) will mask out exceptions that are not set to unmasked
- * status. 0x3f is the exception bits in these regs, 0x200 is the
- * C1 reg you need in case of a stack fault, 0x040 is the stack
- * fault bit. We should only be taking one exception at a time,
- * so if this combination doesn't produce any single exception,
- * then we have a bad program that isn't synchronizing its FPU usage
- * and it will suffer the consequences since we won't be able to
- * fully reproduce the context of the exception
+ * The SDM says "The processor clears the BTF flag when it
+ * generates a debug exception." Clear TIF_BLOCKSTEP to keep
+ * TIF_BLOCKSTEP in sync with the hardware BTF flag.
*/
- cwd = get_fpu_cwd(task);
- swd = get_fpu_swd(task);
+ clear_thread_flag(TIF_BLOCKSTEP);
- err = swd & ~cwd;
+ /*
+	 * If dr6 gives us no indication of the origin of this trap, then it
+	 * is very likely the result of an icebp/int01 trap.  Userspace wants
+	 * a SIGTRAP for that.
+ */
+ icebp = !dr6;
- if (err & 0x001) { /* Invalid op */
- /*
- * swd & 0x240 == 0x040: Stack Underflow
- * swd & 0x240 == 0x240: Stack Overflow
- * User must clear the SF bit (0x40) if set
- */
- info.si_code = FPE_FLTINV;
- } else if (err & 0x004) { /* Divide by Zero */
- info.si_code = FPE_FLTDIV;
- } else if (err & 0x008) { /* Overflow */
- info.si_code = FPE_FLTOVF;
- } else if (err & 0x012) { /* Denormal, Underflow */
- info.si_code = FPE_FLTUND;
- } else if (err & 0x020) { /* Precision */
- info.si_code = FPE_FLTRES;
- } else {
- /*
- * If we're using IRQ 13, or supposedly even some trap 16
- * implementations, it's possible we get a spurious trap...
- */
- return; /* Spurious trap, no error */
+ if (notify_debug(regs, &dr6))
+ goto out;
+
+ /* It's safe to allow irq's after DR6 has been saved */
+ local_irq_enable();
+
+ if (v8086_mode(regs)) {
+ handle_vm86_trap((struct kernel_vm86_regs *)regs, 0, X86_TRAP_DB);
+ goto out_irq;
}
- force_sig_info(SIGFPE, &info, task);
-}
-dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
-{
- conditional_sti(regs);
+ /* #DB for bus lock can only be triggered from userspace. */
+ if (dr6 & DR_BUS_LOCK)
+ handle_bus_lock(regs);
-#ifdef CONFIG_X86_32
- ignore_fpu_irq = 1;
-#else
- if (!user_mode(regs) &&
- kernel_math_error(regs, "kernel x87 math error", 16))
- return;
-#endif
+ /* Add the virtual_dr6 bits for signals. */
+ dr6 |= current->thread.virtual_dr6;
+ if (dr6 & (DR_STEP | DR_TRAP_BITS) || icebp)
+ send_sigtrap(regs, 0, get_si_code(dr6));
- math_error((void __user *)regs->ip);
+out_irq:
+ local_irq_disable();
+out:
+ instrumentation_end();
+ irqentry_exit_to_user_mode(regs);
}
-static void simd_math_error(void __user *ip)
+#ifdef CONFIG_X86_64
+/* IST stack entry */
+DEFINE_IDTENTRY_DEBUG(exc_debug)
{
- struct task_struct *task;
- siginfo_t info;
- unsigned short mxcsr;
-
- /*
- * Save the info for the exception handler and clear the error.
- */
- task = current;
- save_init_fpu(task);
- task->thread.trap_no = 19;
- task->thread.error_code = 0;
- info.si_signo = SIGFPE;
- info.si_errno = 0;
- info.si_code = __SI_FAULT;
- info.si_addr = ip;
- /*
- * The SIMD FPU exceptions are handled a little differently, as there
- * is only a single status/control register. Thus, to determine which
- * unmasked exception was caught we must mask the exception mask bits
- * at 0x1f80, and then use these to mask the exception bits at 0x3f.
- */
- mxcsr = get_fpu_mxcsr(task);
- switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
- case 0x000:
- default:
- break;
- case 0x001: /* Invalid Op */
- info.si_code = FPE_FLTINV;
- break;
- case 0x002: /* Denormalize */
- case 0x010: /* Underflow */
- info.si_code = FPE_FLTUND;
- break;
- case 0x004: /* Zero Divide */
- info.si_code = FPE_FLTDIV;
- break;
- case 0x008: /* Overflow */
- info.si_code = FPE_FLTOVF;
- break;
- case 0x020: /* Precision */
- info.si_code = FPE_FLTRES;
- break;
- }
- force_sig_info(SIGFPE, &info, task);
+ exc_debug_kernel(regs, debug_read_reset_dr6());
}
-dotraplinkage void
-do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
+/* User entry, runs on regular task stack */
+DEFINE_IDTENTRY_DEBUG_USER(exc_debug)
{
- conditional_sti(regs);
+ exc_debug_user(regs, debug_read_reset_dr6());
+}
-#ifdef CONFIG_X86_32
- if (cpu_has_xmm) {
- /* Handle SIMD FPU exceptions on PIII+ processors. */
- ignore_fpu_irq = 1;
- simd_math_error((void __user *)regs->ip);
- return;
- }
+#ifdef CONFIG_X86_FRED
+/*
+ * Depending on the ring level it occurred at, i.e., user or kernel
+ * context, #DB needs to be handled on a different stack: user #DB on
+ * the current task stack, kernel #DB on a dedicated stack.
+ *
+ * This is exactly how FRED event delivery invokes an exception
+ * handler: ring 3 event on level 0 stack, i.e., current task stack;
+ * ring 0 event on the #DB dedicated stack specified in the
+ * IA32_FRED_STKLVLS MSR. So unlike IDT, the FRED debug exception
+ * entry stub doesn't do stack switch.
+ */
+DEFINE_FREDENTRY_DEBUG(exc_debug)
+{
/*
- * Handle strange cache flush from user space exception
- * in all other cases. This is undocumented behaviour.
+ * FRED #DB stores DR6 on the stack in the format which
+ * debug_read_reset_dr6() returns for the IDT entry points.
*/
- if (regs->flags & X86_VM_MASK) {
- handle_vm86_fault((struct kernel_vm86_regs *)regs, error_code);
- return;
- }
- current->thread.trap_no = 19;
- current->thread.error_code = error_code;
- die_if_kernel("cache flush denied", regs, error_code);
- force_sig(SIGSEGV, current);
-#else
- if (!user_mode(regs) &&
- kernel_math_error(regs, "kernel simd math error", 19))
- return;
- simd_math_error((void __user *)regs->ip);
-#endif
-}
+ unsigned long dr6 = fred_event_data(regs);
-dotraplinkage void
-do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
-{
- conditional_sti(regs);
-#if 0
- /* No need to warn about this any longer. */
- printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
-#endif
+ if (user_mode(regs))
+ exc_debug_user(regs, dr6);
+ else
+ exc_debug_kernel(regs, dr6);
}
+#endif /* CONFIG_X86_FRED */
-#ifdef CONFIG_X86_32
-unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
+#else
+/* 32 bit does not have separate entry points. */
+DEFINE_IDTENTRY_RAW(exc_debug)
{
- struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id());
- unsigned long base = (kesp - uesp) & -THREAD_SIZE;
- unsigned long new_kesp = kesp - base;
- unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
- __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS];
-
- /* Set up base for espfix segment */
- desc &= 0x00f0ff0000000000ULL;
- desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) |
- ((((__u64)base) << 32) & 0xff00000000000000ULL) |
- ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) |
- (lim_pages & 0xffff);
- *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc;
+ unsigned long dr6 = debug_read_reset_dr6();
- return new_kesp;
+ if (user_mode(regs))
+ exc_debug_user(regs, dr6);
+ else
+ exc_debug_kernel(regs, dr6);
}
#endif
-asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
+/*
+ * Note that we play around with the 'TS' bit in an attempt to get
+ * the correct behaviour even in the presence of the asynchronous
+ * IRQ13 behaviour
+ */
+static void math_error(struct pt_regs *regs, int trapnr)
{
+ struct task_struct *task = current;
+ struct fpu *fpu = x86_task_fpu(task);
+ int si_code;
+ char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" :
+ "simd exception";
+
+ cond_local_irq_enable(regs);
+
+ if (!user_mode(regs)) {
+ if (fixup_exception(regs, trapnr, 0, 0))
+ goto exit;
+
+ task->thread.error_code = 0;
+ task->thread.trap_nr = trapnr;
+
+ if (notify_die(DIE_TRAP, str, regs, 0, trapnr,
+ SIGFPE) != NOTIFY_STOP)
+ die(str, regs, 0);
+ goto exit;
+ }
+
+ /*
+ * Synchronize the FPU register state to the memory register state
+ * if necessary. This allows the exception handler to inspect it.
+ */
+ fpu_sync_fpstate(fpu);
+
+ task->thread.trap_nr = trapnr;
+ task->thread.error_code = 0;
+
+ si_code = fpu__exception_code(fpu, trapnr);
+ /* Retry when we get spurious exceptions: */
+ if (!si_code)
+ goto exit;
+
+ if (fixup_vdso_exception(regs, trapnr, 0, 0))
+ goto exit;
+
+ force_sig_fault(SIGFPE, si_code,
+ (void __user *)uprobe_get_trap_addr(regs));
+exit:
+ cond_local_irq_disable(regs);
}
-asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void)
+DEFINE_IDTENTRY(exc_coprocessor_error)
{
+ math_error(regs, X86_TRAP_MF);
}
-/*
- * 'math_state_restore()' saves the current math information in the
- * old math state array, and gets the new ones from the current task
- *
- * Careful.. There are problems with IBM-designed IRQ13 behaviour.
- * Don't touch unless you *really* know how it works.
- *
- * Must be called with kernel preemption disabled (in this case,
- * local interrupts are disabled at the call-site in entry.S).
- */
-asmlinkage void math_state_restore(void)
+DEFINE_IDTENTRY(exc_simd_coprocessor_error)
{
- struct thread_info *thread = current_thread_info();
- struct task_struct *tsk = thread->task;
-
- if (!tsk_used_math(tsk)) {
- local_irq_enable();
- /*
- * does a slab alloc which can sleep
- */
- if (init_fpu(tsk)) {
- /*
- * ran out of memory!
- */
- do_group_exit(SIGKILL);
+ if (IS_ENABLED(CONFIG_X86_INVD_BUG)) {
+ /* AMD 486 bug: INVD in CPL 0 raises #XF instead of #GP */
+ if (!static_cpu_has(X86_FEATURE_XMM)) {
+ __exc_general_protection(regs, 0);
return;
}
- local_irq_disable();
}
+ math_error(regs, X86_TRAP_XF);
+}
- clts(); /* Allow maths ops (or we recurse) */
+DEFINE_IDTENTRY(exc_spurious_interrupt_bug)
+{
/*
- * Paranoid restore. send a SIGSEGV if we fail to restore the state.
+ * This addresses a Pentium Pro Erratum:
+ *
+ * PROBLEM: If the APIC subsystem is configured in mixed mode with
+ * Virtual Wire mode implemented through the local APIC, an
+ * interrupt vector of 0Fh (Intel reserved encoding) may be
+ * generated by the local APIC (Int 15). This vector may be
+ * generated upon receipt of a spurious interrupt (an interrupt
+ * which is removed before the system receives the INTA sequence)
+ * instead of the programmed 8259 spurious interrupt vector.
+ *
+ * IMPLICATION: The spurious interrupt vector programmed in the
+ * 8259 is normally handled by an operating system's spurious
+ * interrupt handler. However, a vector of 0Fh is unknown to some
+ * operating systems, which would crash if this erratum occurred.
+ *
+ * In theory this could be limited to 32bit, but the handler is not
+ * hurting and who knows which other CPUs suffer from this.
*/
- if (unlikely(restore_fpu_checking(tsk))) {
- stts();
- force_sig(SIGSEGV, tsk);
- return;
- }
-
- thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
- tsk->fpu_counter++;
}
-EXPORT_SYMBOL_GPL(math_state_restore);
-#ifndef CONFIG_MATH_EMULATION
-void math_emulate(struct math_emu_info *info)
+static bool handle_xfd_event(struct pt_regs *regs)
{
- printk(KERN_EMERG
- "math-emulation not enabled and no coprocessor found.\n");
- printk(KERN_EMERG "killing %s.\n", current->comm);
- force_sig(SIGFPE, current);
- schedule();
+ u64 xfd_err;
+ int err;
+
+ if (!IS_ENABLED(CONFIG_X86_64) || !cpu_feature_enabled(X86_FEATURE_XFD))
+ return false;
+
+ rdmsrq(MSR_IA32_XFD_ERR, xfd_err);
+ if (!xfd_err)
+ return false;
+
+ wrmsrq(MSR_IA32_XFD_ERR, 0);
+
+ /* Die if that happens in kernel space */
+ if (WARN_ON(!user_mode(regs)))
+ return false;
+
+ local_irq_enable();
+
+ err = xfd_enable_feature(xfd_err);
+
+ switch (err) {
+ case -EPERM:
+ force_sig_fault(SIGILL, ILL_ILLOPC, error_get_trap_addr(regs));
+ break;
+ case -EFAULT:
+ force_sig(SIGSEGV);
+ break;
+ }
+
+ local_irq_disable();
+ return true;
}
-#endif /* CONFIG_MATH_EMULATION */
-dotraplinkage void __kprobes
-do_device_not_available(struct pt_regs *regs, long error_code)
+DEFINE_IDTENTRY(exc_device_not_available)
{
-#ifdef CONFIG_X86_32
- if (read_cr0() & X86_CR0_EM) {
+ unsigned long cr0 = read_cr0();
+
+ if (handle_xfd_event(regs))
+ return;
+
+#ifdef CONFIG_MATH_EMULATION
+ if (!boot_cpu_has(X86_FEATURE_FPU) && (cr0 & X86_CR0_EM)) {
struct math_emu_info info = { };
- conditional_sti(regs);
+ cond_local_irq_enable(regs);
info.regs = regs;
math_emulate(&info);
- } else {
- math_state_restore(); /* interrupts still off */
- conditional_sti(regs);
+
+ cond_local_irq_disable(regs);
+ return;
}
-#else
- math_state_restore();
#endif
+
+ /* This should not happen. */
+ if (WARN(cr0 & X86_CR0_TS, "CR0.TS was set")) {
+ /* Try to fix it up and carry on. */
+ write_cr0(cr0 & ~X86_CR0_TS);
+ } else {
+ /*
+ * Something terrible happened, and we're better off trying
+ * to kill the task than getting stuck in a never-ending
+ * loop of #NM faults.
+ */
+ die("unexpected #NM exception", regs, 0);
+ }
}
-#ifdef CONFIG_X86_32
-dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
+#ifdef CONFIG_INTEL_TDX_GUEST
+
+#define VE_FAULT_STR "VE fault"
+
+static void ve_raise_fault(struct pt_regs *regs, long error_code,
+ unsigned long address)
{
- siginfo_t info;
- local_irq_enable();
+ if (user_mode(regs)) {
+ gp_user_force_sig_segv(regs, X86_TRAP_VE, error_code, VE_FAULT_STR);
+ return;
+ }
- info.si_signo = SIGILL;
- info.si_errno = 0;
- info.si_code = ILL_BADSTK;
- info.si_addr = NULL;
- if (notify_die(DIE_TRAP, "iret exception",
- regs, error_code, 32, SIGILL) == NOTIFY_STOP)
+ if (gp_try_fixup_and_notify(regs, X86_TRAP_VE, error_code,
+ VE_FAULT_STR, address)) {
return;
- do_trap(32, SIGILL, "iret exception", regs, error_code, &info);
+ }
+
+ die_addr(VE_FAULT_STR, regs, error_code, address);
}
-#endif
-void __init trap_init(void)
+/*
+ * Virtualization Exceptions (#VE) are delivered to TDX guests due to
+ * specific guest actions which may happen in either user space or the
+ * kernel:
+ *
+ * * Specific instructions (WBINVD, for example)
+ * * Specific MSR accesses
+ * * Specific CPUID leaf accesses
+ * * Access to specific guest physical addresses
+ *
+ * In the settings that Linux will run in, virtualization exceptions are
+ * never generated on accesses to normal, TD-private memory that has been
+ * accepted (by BIOS or with tdx_enc_status_changed()).
+ *
+ * Syscall entry code has a critical window where the kernel stack is not
+ * yet set up. Any exception in this window leads to hard to debug issues
+ * and can be exploited for privilege escalation. Exceptions in the NMI
+ * entry code also cause issues. Returning from the exception handler with
+ * IRET will re-enable NMIs and nested NMI will corrupt the NMI stack.
+ *
+ * For these reasons, the kernel avoids #VEs during the syscall gap and
+ * the NMI entry code. Entry code paths do not access TD-shared memory,
+ * MMIO regions, use #VE triggering MSRs, instructions, or CPUID leaves
+ * that might generate #VE. VMM can remove memory from TD at any point,
+ * but access to unaccepted (or missing) private memory leads to VM
+ * termination, not to #VE.
+ *
+ * Similarly to page faults and breakpoints, #VEs are allowed in NMI
+ * handlers once the kernel is ready to deal with nested NMIs.
+ *
+ * During #VE delivery, all interrupts, including NMIs, are blocked until
+ * TDGETVEINFO is called. It prevents #VE nesting until the kernel reads
+ * the VE info.
+ *
+ * If a guest kernel action which would normally cause a #VE occurs in
+ * the interrupt-disabled region before TDGETVEINFO, a #DF (double fault)
+ * is delivered to the guest, which will result in an oops.
+ *
+ * The entry code has been audited carefully to follow these expectations.
+ * Changes in the entry code have to be audited for correctness vs. this
+ * aspect. Similarly to #PF, #VE in these places will expose the kernel to
+ * privilege escalation or may lead to random crashes.
+ */
+DEFINE_IDTENTRY(exc_virtualization_exception)
{
- int i;
+ struct ve_info ve;
-#ifdef CONFIG_EISA
- void __iomem *p = early_ioremap(0x0FFFD9, 4);
+ /*
+ * NMIs/Machine-checks/Interrupts will be in a disabled state
+ * till TDGETVEINFO TDCALL is executed. This ensures that VE
+ * info cannot be overwritten by a nested #VE.
+ */
+ tdx_get_ve_info(&ve);
- if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24))
- EISA_bus = 1;
- early_iounmap(p, 4);
-#endif
+ cond_local_irq_enable(regs);
- set_intr_gate(0, &divide_error);
- set_intr_gate_ist(1, &debug, DEBUG_STACK);
- set_intr_gate_ist(2, &nmi, NMI_STACK);
- /* int3 can be called from all */
- set_system_intr_gate_ist(3, &int3, DEBUG_STACK);
- /* int4 can be called from all */
- set_system_intr_gate(4, &overflow);
- set_intr_gate(5, &bounds);
- set_intr_gate(6, &invalid_op);
- set_intr_gate(7, &device_not_available);
-#ifdef CONFIG_X86_32
- set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS);
-#else
- set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK);
-#endif
- set_intr_gate(9, &coprocessor_segment_overrun);
- set_intr_gate(10, &invalid_TSS);
- set_intr_gate(11, &segment_not_present);
- set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK);
- set_intr_gate(13, &general_protection);
- set_intr_gate(14, &page_fault);
- set_intr_gate(15, &spurious_interrupt_bug);
- set_intr_gate(16, &coprocessor_error);
- set_intr_gate(17, &alignment_check);
-#ifdef CONFIG_X86_MCE
- set_intr_gate_ist(18, &machine_check, MCE_STACK);
-#endif
- set_intr_gate(19, &simd_coprocessor_error);
+ /*
+ * If tdx_handle_virt_exception() could not process
+ * it successfully, treat it as #GP(0) and handle it.
+ */
+ if (!tdx_handle_virt_exception(regs, &ve))
+ ve_raise_fault(regs, 0, ve.gla);
- /* Reserve all the builtin and the syscall vector: */
- for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
- set_bit(i, used_vectors);
+ cond_local_irq_disable(regs);
+}
-#ifdef CONFIG_IA32_EMULATION
- set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
- set_bit(IA32_SYSCALL_VECTOR, used_vectors);
#endif
#ifdef CONFIG_X86_32
- if (cpu_has_fxsr) {
- printk(KERN_INFO "Enabling fast FPU save and restore... ");
- set_in_cr4(X86_CR4_OSFXSR);
- printk("done.\n");
- }
- if (cpu_has_xmm) {
- printk(KERN_INFO
- "Enabling unmasked SIMD FPU exception support... ");
- set_in_cr4(X86_CR4_OSXMMEXCPT);
- printk("done.\n");
+DEFINE_IDTENTRY_SW(iret_error)
+{
+ local_irq_enable();
+ if (notify_die(DIE_TRAP, "iret exception", regs, 0,
+ X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) {
+ do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, 0,
+ ILL_BADSTK, (void __user *)NULL);
}
-
- set_system_trap_gate(SYSCALL_VECTOR, &system_call);
- set_bit(SYSCALL_VECTOR, used_vectors);
+ local_irq_disable();
+}
#endif
- /*
- * Should be a barrier for any external CPU state:
- */
- cpu_init();
+void __init trap_init(void)
+{
+ /* Init cpu_entry_area before IST entries are set up */
+ setup_cpu_entry_areas();
-#ifdef CONFIG_X86_32
- x86_quirk_trap_init();
-#endif
+ /* Init GHCB memory pages when running as an SEV-ES guest */
+ sev_es_init_vc_handling();
+
+ /* Initialize TSS before setting up traps so ISTs work */
+ cpu_init_exception_handling(true);
+
+ /* Setup traps as cpu_init() might #GP */
+ if (!cpu_feature_enabled(X86_FEATURE_FRED))
+ idt_setup_traps();
+
+ cpu_init();
}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 71f4368b357e..7d3e13e14eab 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1,22 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/sched.h>
+#include <linux/sched/clock.h>
#include <linux/init.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/timer.h>
#include <linux/acpi_pmtmr.h>
#include <linux/cpufreq.h>
-#include <linux/dmi.h>
#include <linux/delay.h>
#include <linux/clocksource.h>
+#include <linux/kvm_types.h>
#include <linux/percpu.h>
#include <linux/timex.h>
+#include <linux/static_key.h>
+#include <linux/static_call.h>
+#include <asm/cpuid/api.h>
#include <asm/hpet.h>
#include <asm/timer.h>
#include <asm/vgtod.h>
#include <asm/time.h>
#include <asm/delay.h>
#include <asm/hypervisor.h>
+#include <asm/nmi.h>
+#include <asm/x86_init.h>
+#include <asm/geode.h>
+#include <asm/apic.h>
+#include <asm/cpu_device_id.h>
+#include <asm/i8259.h>
+#include <asm/msr.h>
+#include <asm/topology.h>
+#include <asm/uv/uv.h>
+#include <asm/sev.h>
unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */
EXPORT_SYMBOL(cpu_khz);
@@ -24,23 +41,207 @@ EXPORT_SYMBOL(cpu_khz);
unsigned int __read_mostly tsc_khz;
EXPORT_SYMBOL(tsc_khz);
+#define KHZ 1000
+
/*
* TSC can be unstable due to cpufreq or due to unsynced TSCs
*/
static int __read_mostly tsc_unstable;
+static unsigned int __initdata tsc_early_khz;
+
+static DEFINE_STATIC_KEY_FALSE_RO(__use_tsc);
+
+int tsc_clocksource_reliable;
+
+static int __read_mostly tsc_force_recalibrate;
+
+static struct clocksource_base art_base_clk = {
+ .id = CSID_X86_ART,
+};
+static bool have_art;
+
+struct cyc2ns {
+ struct cyc2ns_data data[2]; /* 0 + 2*16 = 32 */
+ seqcount_latch_t seq; /* 32 + 4 = 36 */
+
+}; /* fits one cacheline */
+
+static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
+
+static int __init tsc_early_khz_setup(char *buf)
+{
+ return kstrtouint(buf, 0, &tsc_early_khz);
+}
+early_param("tsc_early_khz", tsc_early_khz_setup);
-/* native_sched_clock() is called before tsc_init(), so
- we must start with the TSC soft disabled to prevent
- erroneous rdtsc usage on !cpu_has_tsc processors */
-static int __read_mostly tsc_disabled = -1;
+__always_inline void __cyc2ns_read(struct cyc2ns_data *data)
+{
+ int seq, idx;
+
+ do {
+ seq = this_cpu_read(cyc2ns.seq.seqcount.sequence);
+ idx = seq & 1;
+
+ data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset);
+ data->cyc2ns_mul = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul);
+ data->cyc2ns_shift = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift);
+
+ } while (unlikely(seq != this_cpu_read(cyc2ns.seq.seqcount.sequence)));
+}
+
+__always_inline void cyc2ns_read_begin(struct cyc2ns_data *data)
+{
+ preempt_disable_notrace();
+ __cyc2ns_read(data);
+}
+
+__always_inline void cyc2ns_read_end(void)
+{
+ preempt_enable_notrace();
+}
+
+/*
+ * Accelerators for sched_clock()
+ * convert from cycles(64bits) => nanoseconds (64bits)
+ * basic equation:
+ * ns = cycles / (freq / ns_per_sec)
+ * ns = cycles * (ns_per_sec / freq)
+ * ns = cycles * (10^9 / (cpu_khz * 10^3))
+ * ns = cycles * (10^6 / cpu_khz)
+ *
+ * Then we use scaling math (suggested by george@mvista.com) to get:
+ * ns = cycles * (10^6 * SC / cpu_khz) / SC
+ * ns = cycles * cyc2ns_scale / SC
+ *
+ * And since SC is a constant power of two, we can convert the div
+ * into a shift. The larger SC is, the more accurate the conversion, but
+ * cyc2ns_scale needs to be a 32-bit value so that 32-bit multiplication
+ * (64-bit result) can be used.
+ *
+ * We can use khz divisor instead of mhz to keep a better precision.
+ * (mathieu.desnoyers@polymtl.ca)
+ *
+ * -johnstul@us.ibm.com "math is hard, lets go shopping!"
+ */
+
+static __always_inline unsigned long long __cycles_2_ns(unsigned long long cyc)
+{
+ struct cyc2ns_data data;
+ unsigned long long ns;
+
+ __cyc2ns_read(&data);
+
+ ns = data.cyc2ns_offset;
+ ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift);
+
+ return ns;
+}
+
+static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc)
+{
+ unsigned long long ns;
+ preempt_disable_notrace();
+ ns = __cycles_2_ns(cyc);
+ preempt_enable_notrace();
+ return ns;
+}
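/*
 * Editor's note (illustrative sketch, not part of this patch): a worked
 * example of the mult/shift scaling described in the comment above. The
 * inputs are assumptions picked for illustration (a hypothetical 2400000 kHz
 * TSC and the shift value capped at 31); the kernel derives the real values
 * via clocks_calc_mult_shift() and mul_u64_u32_shr().
 */
#include <stdio.h>

int main(void)
{
	unsigned long long khz = 2400000;		/* assumed 2.4 GHz TSC */
	unsigned int shift = 31;			/* capped as in __set_cyc2ns_scale() */
	unsigned int mul = (unsigned int)((1000000ULL << shift) / khz);
	unsigned long long cycles = 2400000000ULL;	/* one second worth of cycles */
	/* a 64-bit intermediate is sufficient for this small example */
	unsigned long long ns = (cycles * mul) >> shift;

	printf("mul=%u shift=%u -> %llu ns (expect ~10^9)\n", mul, shift, ns);
	return 0;
}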
+
+static void __set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now)
+{
+ unsigned long long ns_now;
+ struct cyc2ns_data data;
+ struct cyc2ns *c2n;
+
+ ns_now = cycles_2_ns(tsc_now);
+
+ /*
+ * Compute a new multiplier as per the above comment and ensure our
+ * time function is continuous; see the comment near struct
+ * cyc2ns_data.
+ */
+ clocks_calc_mult_shift(&data.cyc2ns_mul, &data.cyc2ns_shift, khz,
+ NSEC_PER_MSEC, 0);
+
+ /*
+	 * cyc2ns_shift is exported via arch_perf_update_userpage() where it is
+	 * not expected to be greater than 31, because the originally published
+	 * conversion algorithm shifted a 32-bit value (it now specifies a 64-bit
+	 * value); see the perf_event_mmap_page documentation in perf_event.h.
+ */
+ if (data.cyc2ns_shift == 32) {
+ data.cyc2ns_shift = 31;
+ data.cyc2ns_mul >>= 1;
+ }
+
+ data.cyc2ns_offset = ns_now -
+ mul_u64_u32_shr(tsc_now, data.cyc2ns_mul, data.cyc2ns_shift);
+
+ c2n = per_cpu_ptr(&cyc2ns, cpu);
+
+ write_seqcount_latch_begin(&c2n->seq);
+ c2n->data[0] = data;
+ write_seqcount_latch(&c2n->seq);
+ c2n->data[1] = data;
+ write_seqcount_latch_end(&c2n->seq);
+}
+
+static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ sched_clock_idle_sleep_event();
+
+ if (khz)
+ __set_cyc2ns_scale(khz, cpu, tsc_now);
+
+ sched_clock_idle_wakeup_event();
+ local_irq_restore(flags);
+}
+
+/*
+ * Initialize cyc2ns for boot cpu
+ */
+static void __init cyc2ns_init_boot_cpu(void)
+{
+ struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns);
+
+ seqcount_latch_init(&c2n->seq);
+ __set_cyc2ns_scale(tsc_khz, smp_processor_id(), rdtsc());
+}
+
+/*
+ * Secondary CPUs do not run through tsc_init(), so set up
+ * all the scale factors for all CPUs, assuming the same
+ * speed as the bootup CPU.
+ */
+static void __init cyc2ns_init_secondary_cpus(void)
+{
+ unsigned int cpu, this_cpu = smp_processor_id();
+ struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns);
+ struct cyc2ns_data *data = c2n->data;
+
+ for_each_possible_cpu(cpu) {
+ if (cpu != this_cpu) {
+ seqcount_latch_init(&c2n->seq);
+ c2n = per_cpu_ptr(&cyc2ns, cpu);
+ c2n->data[0] = data[0];
+ c2n->data[1] = data[1];
+ }
+ }
+}
-static int tsc_clocksource_reliable;
/*
* Scheduler clock - returns current time in nanosec units.
*/
-u64 native_sched_clock(void)
+noinstr u64 native_sched_clock(void)
{
- u64 this_offset;
+ if (static_branch_likely(&__use_tsc)) {
+ u64 tsc_now = rdtsc();
+
+ /* return the value in ns */
+ return __cycles_2_ns(tsc_now);
+ }
/*
* Fall back to jiffies if there's no TSC available:
@@ -48,32 +249,48 @@ u64 native_sched_clock(void)
* unstable. We do this because unlike Time Of Day,
* the scheduler clock tolerates small errors and it's
* very important for it to be as fast as the platform
- * can achive it. )
+ * can achieve it. )
*/
- if (unlikely(tsc_disabled)) {
- /* No locking but a rare wrong value is not a big deal: */
- return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
- }
- /* read the Time Stamp Counter: */
- rdtscll(this_offset);
+ /* No locking but a rare wrong value is not a big deal: */
+ return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
+}
- /* return the value in ns */
- return __cycles_2_ns(this_offset);
+/*
+ * Generate a sched_clock if you already have a TSC value.
+ */
+u64 native_sched_clock_from_tsc(u64 tsc)
+{
+ return cycles_2_ns(tsc);
}
/* We need to define a real function for sched_clock, to override the
weak default version */
#ifdef CONFIG_PARAVIRT
-unsigned long long sched_clock(void)
+noinstr u64 sched_clock_noinstr(void)
{
return paravirt_sched_clock();
}
+
+bool using_native_sched_clock(void)
+{
+ return static_call_query(pv_sched_clock) == native_sched_clock;
+}
#else
-unsigned long long
-sched_clock(void) __attribute__((alias("native_sched_clock")));
+u64 sched_clock_noinstr(void) __attribute__((alias("native_sched_clock")));
+
+bool using_native_sched_clock(void) { return true; }
#endif
+notrace u64 sched_clock(void)
+{
+ u64 now;
+ preempt_disable_notrace();
+ now = sched_clock_noinstr();
+ preempt_enable_notrace();
+ return now;
+}
+
int check_tsc_unstable(void)
{
return tsc_unstable;
@@ -83,9 +300,7 @@ EXPORT_SYMBOL_GPL(check_tsc_unstable);
#ifdef CONFIG_X86_TSC
int __init notsc_setup(char *str)
{
- printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
- "cannot disable TSC completely.\n");
- tsc_disabled = 1;
+ mark_tsc_unstable("boot parameter notsc");
return 1;
}
#else
@@ -102,24 +317,49 @@ int __init notsc_setup(char *str)
__setup("notsc", notsc_setup);
+static int no_sched_irq_time;
+static int no_tsc_watchdog;
+static int tsc_as_watchdog;
+
static int __init tsc_setup(char *str)
{
if (!strcmp(str, "reliable"))
tsc_clocksource_reliable = 1;
+ if (!strncmp(str, "noirqtime", 9))
+ no_sched_irq_time = 1;
+ if (!strcmp(str, "unstable"))
+ mark_tsc_unstable("boot parameter");
+ if (!strcmp(str, "nowatchdog")) {
+ no_tsc_watchdog = 1;
+ if (tsc_as_watchdog)
+ pr_alert("%s: Overriding earlier tsc=watchdog with tsc=nowatchdog\n",
+ __func__);
+ tsc_as_watchdog = 0;
+ }
+ if (!strcmp(str, "recalibrate"))
+ tsc_force_recalibrate = 1;
+ if (!strcmp(str, "watchdog")) {
+ if (no_tsc_watchdog)
+ pr_alert("%s: tsc=watchdog overridden by earlier tsc=nowatchdog\n",
+ __func__);
+ else
+ tsc_as_watchdog = 1;
+ }
return 1;
}
__setup("tsc=", tsc_setup);
-#define MAX_RETRIES 5
-#define SMI_TRESHOLD 50000
+#define MAX_RETRIES 5
+#define TSC_DEFAULT_THRESHOLD 0x20000
/*
- * Read TSC and the reference counters. Take care of SMI disturbance
+ * Read TSC and the reference counters. Take care of any disturbances
*/
static u64 tsc_read_refs(u64 *p, int hpet)
{
u64 t1, t2;
+ u64 thresh = tsc_khz ? tsc_khz >> 5 : TSC_DEFAULT_THRESHOLD;
int i;
for (i = 0; i < MAX_RETRIES; i++) {
@@ -129,7 +369,7 @@ static u64 tsc_read_refs(u64 *p, int hpet)
else
*p = acpi_pm_read_early();
t2 = get_cycles();
- if ((t2 - t1) < SMI_TRESHOLD)
+ if ((t2 - t1) < thresh)
return t2;
}
return ULLONG_MAX;
@@ -147,7 +387,7 @@ static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2)
hpet2 -= hpet1;
tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
do_div(tmp, 1000000);
- do_div(deltatsc, tmp);
+ deltatsc = div64_u64(deltatsc, tmp);
return (unsigned long) deltatsc;
}
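/*
 * Editor's note (illustrative sketch, not part of this patch): the unit
 * conversion calc_hpet_ref() performs, with assumed example numbers
 * (a 14.318 MHz HPET whose period is ~69841279 fs, roughly 50 ms elapsed,
 * and a hypothetical 3 GHz TSC). The caller pre-multiplies the TSC delta
 * by 10^6, so the result comes out in kHz.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long hpet_period_fs = 69841279;	/* fs per HPET tick */
	unsigned long long hpet_ticks = 715909;		/* assumed ticks (~50 ms) */
	unsigned long long deltatsc = 150000000ULL * 1000000; /* cycles * 10^6 */

	unsigned long long elapsed_ns = hpet_ticks * hpet_period_fs / 1000000;
	unsigned long long tsc_khz = deltatsc / elapsed_ns;	/* cycles*10^6 / ns */

	printf("elapsed %llu ns, TSC %llu kHz\n", elapsed_ns, tsc_khz);
	return 0;
}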
@@ -173,11 +413,11 @@ static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2)
}
#define CAL_MS 10
-#define CAL_LATCH (CLOCK_TICK_RATE / (1000 / CAL_MS))
+#define CAL_LATCH (PIT_TICK_RATE / (1000 / CAL_MS))
#define CAL_PIT_LOOPS 1000
#define CAL2_MS 50
-#define CAL2_LATCH (CLOCK_TICK_RATE / (1000 / CAL2_MS))
+#define CAL2_LATCH (PIT_TICK_RATE / (1000 / CAL2_MS))
#define CAL2_PIT_LOOPS 5000
@@ -194,6 +434,20 @@ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
unsigned long tscmin, tscmax;
int pitcnt;
+ if (!has_legacy_pic()) {
+ /*
+		 * Relies on tsc_early_delay_calibrate() to have given us a
+		 * semi-usable udelay(); wait for the same 50ms we would have
+		 * spent in the PIT loop below.
+ */
+ udelay(10 * USEC_PER_MSEC);
+ udelay(10 * USEC_PER_MSEC);
+ udelay(10 * USEC_PER_MSEC);
+ udelay(10 * USEC_PER_MSEC);
+ udelay(10 * USEC_PER_MSEC);
+ return ULONG_MAX;
+ }
+
/* Set the Gate high, disable speaker */
outb((inb(0x61) & ~0x02) | 0x01, 0x61);
@@ -273,7 +527,7 @@ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
* transition from one expected value to another with a fairly
* high accuracy, and we didn't miss any events. We can thus
* use the TSC value at the transitions to calculate a pretty
- * good value for the TSC frequencty.
+ * good value for the TSC frequency.
*/
static inline int pit_verify_msb(unsigned char val)
{
@@ -285,14 +539,15 @@ static inline int pit_verify_msb(unsigned char val)
static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap)
{
int count;
- u64 tsc = 0;
+ u64 tsc = 0, prev_tsc = 0;
for (count = 0; count < 50000; count++) {
if (!pit_verify_msb(val))
break;
+ prev_tsc = tsc;
tsc = get_cycles();
}
- *deltap = get_cycles() - tsc;
+ *deltap = get_cycles() - prev_tsc;
*tscp = tsc;
/*
@@ -306,9 +561,9 @@ static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *de
* How many MSB values do we want to see? We aim for
* a maximum error rate of 500ppm (in practice the
* real error is much smaller), but refuse to spend
- * more than 25ms on it.
+ * more than 50ms on it.
*/
-#define MAX_QUICK_PIT_MS 25
+#define MAX_QUICK_PIT_MS 50
#define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
static unsigned long quick_pit_calibrate(void)
@@ -317,6 +572,9 @@ static unsigned long quick_pit_calibrate(void)
u64 tsc, delta;
unsigned long d1, d2;
+ if (!has_legacy_pic())
+ return 0;
+
/* Set the Gate high, disable speaker */
outb((inb(0x61) & ~0x02) | 0x01, 0x61);
@@ -348,10 +606,19 @@ static unsigned long quick_pit_calibrate(void)
if (!pit_expect_msb(0xff-i, &delta, &d2))
break;
+ delta -= tsc;
+
+ /*
+ * Extrapolate the error and fail fast if the error will
+ * never be below 500 ppm.
+ */
+ if (i == 1 &&
+ d1 + d2 >= (delta * MAX_QUICK_PIT_ITERATIONS) >> 11)
+ return 0;
+
/*
* Iterate until the error is less than 500 ppm
*/
- delta -= tsc;
if (d1+d2 >= delta >> 11)
continue;
@@ -367,7 +634,7 @@ static unsigned long quick_pit_calibrate(void)
goto success;
}
}
- printk("Fast TSC calibration failed\n");
+ pr_info("Fast TSC calibration failed\n");
return 0;
success:
@@ -378,42 +645,123 @@ success:
*
* As a result, we can depend on there not being
* any odd delays anywhere, and the TSC reads are
- * reliable (within the error). We also adjust the
- * delta to the middle of the error bars, just
- * because it looks nicer.
+ * reliable (within the error).
*
* kHz = ticks / time-in-seconds / 1000;
* kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000
* kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000)
*/
- delta += (long)(d2 - d1)/2;
delta *= PIT_TICK_RATE;
do_div(delta, i*256*1000);
- printk("Fast TSC calibration using PIT\n");
+ pr_info("Fast TSC calibration using PIT\n");
return delta;
}
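/*
 * Editor's note (illustrative sketch, not part of this patch): the kHz
 * formula from the success path above, evaluated with assumed numbers
 * (roughly a 3 GHz TSC observed over i = 233 PIT MSB periods, i.e. about
 * 50 ms). PIT_TICK_RATE is 1193182 Hz; the delta value is made up.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long pit_tick_rate = 1193182;	/* PIT_TICK_RATE, Hz */
	unsigned long long i = 233;			/* observed MSB periods */
	unsigned long long delta = 149985000;		/* assumed TSC cycles */

	/* kHz = (delta * PIT_TICK_RATE) / (i * 256 * 1000), as in the code above */
	unsigned long long khz = delta * pit_tick_rate / (i * 256 * 1000);

	printf("calibrated TSC: %llu kHz (~3 GHz)\n", khz);
	return 0;
}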
/**
- * native_calibrate_tsc - calibrate the tsc on boot
+ * native_calibrate_tsc - determine TSC frequency
+ * Determine TSC frequency via CPUID, else return 0.
*/
unsigned long native_calibrate_tsc(void)
{
- u64 tsc1, tsc2, delta, ref1, ref2;
- unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
- unsigned long flags, latch, ms, fast_calibrate, hv_tsc_khz;
- int hpet = is_hpet_enabled(), i, loopmin;
+ unsigned int eax_denominator, ebx_numerator, ecx_hz, edx;
+ unsigned int crystal_khz;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return 0;
+
+ if (boot_cpu_data.cpuid_level < CPUID_LEAF_TSC)
+ return 0;
- hv_tsc_khz = get_hypervisor_tsc_freq();
- if (hv_tsc_khz) {
- printk(KERN_INFO "TSC: Frequency read from the hypervisor\n");
- return hv_tsc_khz;
+ eax_denominator = ebx_numerator = ecx_hz = edx = 0;
+
+ /* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */
+ cpuid(CPUID_LEAF_TSC, &eax_denominator, &ebx_numerator, &ecx_hz, &edx);
+
+ if (ebx_numerator == 0 || eax_denominator == 0)
+ return 0;
+
+ crystal_khz = ecx_hz / 1000;
+
+ /*
+	 * Denverton SoCs don't report the crystal clock, and also don't support
+	 * CPUID_LEAF_FREQ for the calculation below, so hardcode the 25 MHz
+ * crystal clock.
+ */
+ if (crystal_khz == 0 &&
+ boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT_D)
+ crystal_khz = 25000;
+
+ /*
+	 * The TSC frequency reported directly by CPUID is a "hardware reported"
+	 * frequency and is the most accurate one we have so far. This
+	 * is considered a known frequency.
+ */
+ if (crystal_khz != 0)
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+
+ /*
+ * Some Intel SoCs like Skylake and Kabylake don't report the crystal
+ * clock, but we can easily calculate it to a high degree of accuracy
+ * by considering the crystal ratio and the CPU speed.
+ */
+ if (crystal_khz == 0 && boot_cpu_data.cpuid_level >= CPUID_LEAF_FREQ) {
+ unsigned int eax_base_mhz, ebx, ecx, edx;
+
+ cpuid(CPUID_LEAF_FREQ, &eax_base_mhz, &ebx, &ecx, &edx);
+ crystal_khz = eax_base_mhz * 1000 *
+ eax_denominator / ebx_numerator;
}
- local_irq_save(flags);
- fast_calibrate = quick_pit_calibrate();
- local_irq_restore(flags);
- if (fast_calibrate)
- return fast_calibrate;
+ if (crystal_khz == 0)
+ return 0;
+
+ /*
+ * For Atom SoCs TSC is the only reliable clocksource.
+ * Mark TSC reliable so no watchdog on it.
+ */
+ if (boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT)
+ setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ /*
+ * The local APIC appears to be fed by the core crystal clock
+ * (which sounds entirely sensible). We can set the global
+ * lapic_timer_period here to avoid having to calibrate the APIC
+ * timer later.
+ */
+ lapic_timer_period = crystal_khz * 1000 / HZ;
+#endif
+
+ return crystal_khz * ebx_numerator / eax_denominator;
+}
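/*
 * Editor's note (illustrative sketch, not part of this patch): how the
 * CPUID leaf 0x15 values combine into a TSC frequency. The register values
 * are assumptions (denominator 2, numerator 250, 24 MHz crystal, roughly
 * what a Skylake-class part might report); this is not enumeration code.
 */
#include <stdio.h>

int main(void)
{
	unsigned int eax_denominator = 2;	/* assumed CPUID.15H:EAX */
	unsigned int ebx_numerator = 250;	/* assumed CPUID.15H:EBX */
	unsigned int crystal_khz = 24000;	/* assumed 24 MHz crystal */

	/* TSC kHz = crystal kHz * numerator / denominator, as in the code above */
	unsigned int tsc_khz = crystal_khz * ebx_numerator / eax_denominator;

	printf("TSC: %u kHz (i.e. 3 GHz)\n", tsc_khz);
	return 0;
}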
+
+static unsigned long cpu_khz_from_cpuid(void)
+{
+ unsigned int eax_base_mhz, ebx_max_mhz, ecx_bus_mhz, edx;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return 0;
+
+ if (boot_cpu_data.cpuid_level < CPUID_LEAF_FREQ)
+ return 0;
+
+ eax_base_mhz = ebx_max_mhz = ecx_bus_mhz = edx = 0;
+
+ cpuid(CPUID_LEAF_FREQ, &eax_base_mhz, &ebx_max_mhz, &ecx_bus_mhz, &edx);
+
+ return eax_base_mhz * 1000;
+}
+
+/*
 + * Calibrate the CPU using the PIT, HPET, and ACPI PM timer. These are
 + * available later in boot, after ACPI is initialized.
+ */
+static unsigned long pit_hpet_ptimer_calibrate_cpu(void)
+{
+ u64 tsc1, tsc2, delta, ref1, ref2;
+ unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
+ unsigned long flags, latch, ms;
+ int hpet = is_hpet_enabled(), i, loopmin;
/*
* Run 5 calibration loops to get the lowest frequency value
@@ -426,15 +774,15 @@ unsigned long native_calibrate_tsc(void)
* zero. In each wait loop iteration we read the TSC and check
* the delta to the previous read. We keep track of the min
* and max values of that delta. The delta is mostly defined
- * by the IO time of the PIT access, so we can detect when a
- * SMI/SMM disturbance happend between the two reads. If the
+ * by the IO time of the PIT access, so we can detect when
+ * any disturbance happened between the two reads. If the
* maximum time is significantly larger than the minimum time,
* then we discard the result and have another try.
*
* 2) Reference counter. If available we use the HPET or the
* PMTIMER as a reference to check the sanity of that value.
* We use separate TSC readouts and check inside of the
- * reference read for a SMI/SMM disturbance. We dicard
+ * reference read for any possible disturbance. We discard
* disturbed values here as well. We do that around the PIT
* calibration delay loop as we have to wait for a certain
* amount of time anyway.
@@ -464,10 +812,10 @@ unsigned long native_calibrate_tsc(void)
tsc_pit_min = min(tsc_pit_min, tsc_pit_khz);
/* hpet or pmtimer available ? */
- if (!hpet && !ref1 && !ref2)
+ if (ref1 == ref2)
continue;
- /* Check, whether the sampling was disturbed by an SMI */
+ /* Check, whether the sampling was disturbed */
if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX)
continue;
@@ -490,9 +838,8 @@ unsigned long native_calibrate_tsc(void)
* use the reference value, as it is more precise.
*/
if (delta >= 90 && delta <= 110) {
- printk(KERN_INFO
- "TSC: PIT calibration matches %s. %d loops\n",
- hpet ? "HPET" : "PMTIMER", i + 1);
+ pr_info("PIT calibration matches %s. %d loops\n",
+ hpet ? "HPET" : "PMTIMER", i + 1);
return tsc_ref_min;
}
@@ -514,38 +861,36 @@ unsigned long native_calibrate_tsc(void)
*/
if (tsc_pit_min == ULONG_MAX) {
/* PIT gave no useful value */
- printk(KERN_WARNING "TSC: Unable to calibrate against PIT\n");
+ pr_warn("Unable to calibrate against PIT\n");
/* We don't have an alternative source, disable TSC */
if (!hpet && !ref1 && !ref2) {
- printk("TSC: No reference (HPET/PMTIMER) available\n");
+ pr_notice("No reference (HPET/PMTIMER) available\n");
return 0;
}
/* The alternative source failed as well, disable TSC */
if (tsc_ref_min == ULONG_MAX) {
- printk(KERN_WARNING "TSC: HPET/PMTIMER calibration "
- "failed.\n");
+ pr_warn("HPET/PMTIMER calibration failed\n");
return 0;
}
/* Use the alternative source */
- printk(KERN_INFO "TSC: using %s reference calibration\n",
- hpet ? "HPET" : "PMTIMER");
+ pr_info("using %s reference calibration\n",
+ hpet ? "HPET" : "PMTIMER");
return tsc_ref_min;
}
/* We don't have an alternative source, use the PIT calibration value */
if (!hpet && !ref1 && !ref2) {
- printk(KERN_INFO "TSC: Using PIT calibration value\n");
+ pr_info("Using PIT calibration value\n");
return tsc_pit_min;
}
/* The alternative source failed, use the PIT calibration value */
if (tsc_ref_min == ULONG_MAX) {
- printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed. "
- "Using PIT calibration\n");
+ pr_warn("HPET/PMTIMER calibration failed. Using PIT calibration.\n");
return tsc_pit_min;
}
@@ -554,90 +899,119 @@ unsigned long native_calibrate_tsc(void)
* the PIT value as we know that there are PMTIMERs around
* running at double speed. At least we let the user know:
*/
- printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n",
- hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
- printk(KERN_INFO "TSC: Using PIT calibration value\n");
+ pr_warn("PIT calibration deviates from %s: %lu %lu\n",
+ hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
+ pr_info("Using PIT calibration value\n");
return tsc_pit_min;
}
-int recalibrate_cpu_khz(void)
+/**
+ * native_calibrate_cpu_early - can calibrate the cpu early in boot
+ */
+unsigned long native_calibrate_cpu_early(void)
+{
+ unsigned long flags, fast_calibrate = cpu_khz_from_cpuid();
+
+ if (!fast_calibrate)
+ fast_calibrate = cpu_khz_from_msr();
+ if (!fast_calibrate) {
+ local_irq_save(flags);
+ fast_calibrate = quick_pit_calibrate();
+ local_irq_restore(flags);
+ }
+ return fast_calibrate;
+}
+
+
+/**
+ * native_calibrate_cpu - calibrate the cpu
+ */
+static unsigned long native_calibrate_cpu(void)
+{
+ unsigned long tsc_freq = native_calibrate_cpu_early();
+
+ if (!tsc_freq)
+ tsc_freq = pit_hpet_ptimer_calibrate_cpu();
+
+ return tsc_freq;
+}
+
+void recalibrate_cpu_khz(void)
{
#ifndef CONFIG_SMP
unsigned long cpu_khz_old = cpu_khz;
- if (cpu_has_tsc) {
- tsc_khz = calibrate_tsc();
+ if (!boot_cpu_has(X86_FEATURE_TSC))
+ return;
+
+ cpu_khz = x86_platform.calibrate_cpu();
+ tsc_khz = x86_platform.calibrate_tsc();
+ if (tsc_khz == 0)
+ tsc_khz = cpu_khz;
+ else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz)
cpu_khz = tsc_khz;
- cpu_data(0).loops_per_jiffy =
- cpufreq_scale(cpu_data(0).loops_per_jiffy,
- cpu_khz_old, cpu_khz);
- return 0;
- } else
- return -ENODEV;
-#else
- return -ENODEV;
+ cpu_data(0).loops_per_jiffy = cpufreq_scale(cpu_data(0).loops_per_jiffy,
+ cpu_khz_old, cpu_khz);
#endif
}
+EXPORT_SYMBOL_GPL(recalibrate_cpu_khz);
-EXPORT_SYMBOL(recalibrate_cpu_khz);
+static unsigned long long cyc2ns_suspend;
-/* Accelerators for sched_clock()
- * convert from cycles(64bits) => nanoseconds (64bits)
- * basic equation:
- * ns = cycles / (freq / ns_per_sec)
- * ns = cycles * (ns_per_sec / freq)
- * ns = cycles * (10^9 / (cpu_khz * 10^3))
- * ns = cycles * (10^6 / cpu_khz)
- *
- * Then we use scaling math (suggested by george@mvista.com) to get:
- * ns = cycles * (10^6 * SC / cpu_khz) / SC
- * ns = cycles * cyc2ns_scale / SC
- *
- * And since SC is a constant power of two, we can convert the div
- * into a shift.
- *
- * We can use khz divisor instead of mhz to keep a better precision, since
- * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
- * (mathieu.desnoyers@polymtl.ca)
- *
- * -johnstul@us.ibm.com "math is hard, lets go shopping!"
- */
+void tsc_save_sched_clock_state(void)
+{
+ if (!static_branch_likely(&__use_tsc) && !sched_clock_stable())
+ return;
-DEFINE_PER_CPU(unsigned long, cyc2ns);
-DEFINE_PER_CPU(unsigned long long, cyc2ns_offset);
+ cyc2ns_suspend = sched_clock();
+}
-static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
+/*
+ * Even on processors with invariant TSC, the TSC gets reset in some of the
+ * ACPI system sleep states. And in some systems the BIOS seems to reinit the
+ * TSC to an arbitrary value (still sync'd across CPUs) during resume from such
+ * sleep states. To cope with this, recompute the cyc2ns_offset for each CPU so
+ * that sched_clock() continues from the point where it left off during
+ * suspend.
+ */
+void tsc_restore_sched_clock_state(void)
{
- unsigned long long tsc_now, ns_now, *offset;
- unsigned long flags, *scale;
+ unsigned long long offset;
+ unsigned long flags;
+ int cpu;
+
+ if (!static_branch_likely(&__use_tsc) && !sched_clock_stable())
+ return;
local_irq_save(flags);
- sched_clock_idle_sleep_event();
- scale = &per_cpu(cyc2ns, cpu);
- offset = &per_cpu(cyc2ns_offset, cpu);
+ /*
+ * We're coming out of suspend, there's no concurrency yet; don't
+ * bother being nice about the RCU stuff, just write to both
+ * data fields.
+ */
- rdtscll(tsc_now);
- ns_now = __cycles_2_ns(tsc_now);
+ this_cpu_write(cyc2ns.data[0].cyc2ns_offset, 0);
+ this_cpu_write(cyc2ns.data[1].cyc2ns_offset, 0);
- if (cpu_khz) {
- *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
- *offset = ns_now - (tsc_now * *scale >> CYC2NS_SCALE_FACTOR);
+ offset = cyc2ns_suspend - sched_clock();
+
+ for_each_possible_cpu(cpu) {
+ per_cpu(cyc2ns.data[0].cyc2ns_offset, cpu) = offset;
+ per_cpu(cyc2ns.data[1].cyc2ns_offset, cpu) = offset;
}
- sched_clock_idle_wakeup_event(0);
local_irq_restore(flags);
}
#ifdef CONFIG_CPU_FREQ
-
-/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
+/*
+ * Frequency scaling support. Adjust the TSC based timer when the CPU frequency
* changes.
*
- * RED-PEN: On SMP we assume all CPUs run with the same frequency. It's
- * not that important because current Opteron setups do not support
- * scaling on SMP anyroads.
+ * NOTE: On SMP the situation is not fixable in general, so simply mark the TSC
+ * as unstable and give up in those cases.
*
* Should fix up last_tsc too. Currently gettimeofday in the
* first tick after the change will be slightly wrong.
@@ -651,33 +1025,29 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
struct cpufreq_freqs *freq = data;
- unsigned long *lpj;
- if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC))
+ if (num_online_cpus() > 1) {
+ mark_tsc_unstable("cpufreq changes on SMP");
return 0;
-
- lpj = &boot_cpu_data.loops_per_jiffy;
-#ifdef CONFIG_SMP
- if (!(freq->flags & CPUFREQ_CONST_LOOPS))
- lpj = &cpu_data(freq->cpu).loops_per_jiffy;
-#endif
+ }
if (!ref_freq) {
ref_freq = freq->old;
- loops_per_jiffy_ref = *lpj;
+ loops_per_jiffy_ref = boot_cpu_data.loops_per_jiffy;
tsc_khz_ref = tsc_khz;
}
+
if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
- (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
- (val == CPUFREQ_RESUMECHANGE)) {
- *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
+ (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
+ boot_cpu_data.loops_per_jiffy =
+ cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
if (!(freq->flags & CPUFREQ_CONST_LOOPS))
mark_tsc_unstable("cpufreq changes");
- }
- set_cyc2ns_scale(tsc_khz, freq->cpu);
+ set_cyc2ns_scale(tsc_khz, freq->policy->cpu, rdtsc());
+ }
return 0;
}
@@ -686,9 +1056,9 @@ static struct notifier_block time_cpufreq_notifier_block = {
.notifier_call = time_cpufreq_notifier
};
-static int __init cpufreq_tsc(void)
+static int __init cpufreq_register_tsc_scaling(void)
{
- if (!cpu_has_tsc)
+ if (!boot_cpu_has(X86_FEATURE_TSC))
return 0;
if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
return 0;
@@ -697,125 +1067,212 @@ static int __init cpufreq_tsc(void)
return 0;
}
-core_initcall(cpufreq_tsc);
+core_initcall(cpufreq_register_tsc_scaling);
#endif /* CONFIG_CPU_FREQ */
+#define ART_MIN_DENOMINATOR (1)
+
+/*
+ * If ART is present, detect the numerator:denominator ratio used to convert ART to TSC
+ */
+static void __init detect_art(void)
+{
+ unsigned int unused;
+
+ if (boot_cpu_data.cpuid_level < CPUID_LEAF_TSC)
+ return;
+
+ /*
+	 * Don't enable ART in a VM; non-stop TSC and TSC_ADJUST are required,
+	 * and TSC counter resets must not occur asynchronously.
+ */
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR) ||
+ !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) ||
+ !boot_cpu_has(X86_FEATURE_TSC_ADJUST) ||
+ tsc_async_resets)
+ return;
+
+ cpuid(CPUID_LEAF_TSC, &art_base_clk.denominator,
+ &art_base_clk.numerator, &art_base_clk.freq_khz, &unused);
+
+ art_base_clk.freq_khz /= KHZ;
+ if (art_base_clk.denominator < ART_MIN_DENOMINATOR)
+ return;
+
+ rdmsrq(MSR_IA32_TSC_ADJUST, art_base_clk.offset);
+
+ /* Make this sticky over multiple CPU init calls */
+ setup_force_cpu_cap(X86_FEATURE_ART);
+}
+
+
/* clocksource code */
-static struct clocksource clocksource_tsc;
+static void tsc_resume(struct clocksource *cs)
+{
+ tsc_verify_tsc_adjust(true);
+}
/*
- * We compare the TSC to the cycle_last value in the clocksource
+ * We used to compare the TSC to the cycle_last value in the clocksource
* structure to avoid a nasty time-warp. This can be observed in a
* very small window right after one CPU updated cycle_last under
* xtime/vsyscall_gtod lock and the other CPU reads a TSC value which
* is smaller than the cycle_last reference value due to a TSC which
- * is slighty behind. This delta is nowhere else observable, but in
+ * is slightly behind. This delta is nowhere else observable, but in
* that case it results in a forward time jump in the range of hours
* due to the unsigned delta calculation of the time keeping core
* code, which is necessary to support wrapping clocksources like pm
* timer.
+ *
 + * This sanity check is now done in the core timekeeping code, by
 + * checking the result of read_tsc() - cycle_last for being negative.
+ * That works because CLOCKSOURCE_MASK(64) does not mask out any bit.
*/
-static cycle_t read_tsc(struct clocksource *cs)
+static u64 read_tsc(struct clocksource *cs)
{
- cycle_t ret = (cycle_t)get_cycles();
+ return (u64)rdtsc_ordered();
+}
- return ret >= clocksource_tsc.cycle_last ?
- ret : clocksource_tsc.cycle_last;
+static void tsc_cs_mark_unstable(struct clocksource *cs)
+{
+ if (tsc_unstable)
+ return;
+
+ tsc_unstable = 1;
+ if (using_native_sched_clock())
+ clear_sched_clock_stable();
+ disable_sched_clock_irqtime();
+ pr_info("Marking TSC unstable due to clocksource watchdog\n");
}
-#ifdef CONFIG_X86_64
-static cycle_t __vsyscall_fn vread_tsc(void)
+static void tsc_cs_tick_stable(struct clocksource *cs)
{
- cycle_t ret;
+ if (tsc_unstable)
+ return;
- /*
- * Surround the RDTSC by barriers, to make sure it's not
- * speculated to outside the seqlock critical section and
- * does not cause time warps:
- */
- rdtsc_barrier();
- ret = (cycle_t)vget_cycles();
- rdtsc_barrier();
+ if (using_native_sched_clock())
+ sched_clock_tick_stable();
+}
- return ret >= __vsyscall_gtod_data.clock.cycle_last ?
- ret : __vsyscall_gtod_data.clock.cycle_last;
+static int tsc_cs_enable(struct clocksource *cs)
+{
+ vclocks_set_used(VDSO_CLOCKMODE_TSC);
+ return 0;
}
-#endif
-static struct clocksource clocksource_tsc = {
- .name = "tsc",
- .rating = 300,
- .read = read_tsc,
- .mask = CLOCKSOURCE_MASK(64),
- .shift = 22,
- .flags = CLOCK_SOURCE_IS_CONTINUOUS |
+/*
+ * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc()
+ */
+static struct clocksource clocksource_tsc_early = {
+ .name = "tsc-early",
+ .rating = 299,
+ .uncertainty_margin = 32 * NSEC_PER_MSEC,
+ .read = read_tsc,
+ .mask = CLOCKSOURCE_MASK(64),
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS |
CLOCK_SOURCE_MUST_VERIFY,
-#ifdef CONFIG_X86_64
- .vread = vread_tsc,
-#endif
+ .id = CSID_X86_TSC_EARLY,
+ .vdso_clock_mode = VDSO_CLOCKMODE_TSC,
+ .enable = tsc_cs_enable,
+ .resume = tsc_resume,
+ .mark_unstable = tsc_cs_mark_unstable,
+ .tick_stable = tsc_cs_tick_stable,
+ .list = LIST_HEAD_INIT(clocksource_tsc_early.list),
+};
+
+/*
+ * Must mark VALID_FOR_HRES early such that when we unregister tsc_early
+ * this one will immediately take over. We will only register if TSC has
+ * been found good.
+ */
+static struct clocksource clocksource_tsc = {
+ .name = "tsc",
+ .rating = 300,
+ .read = read_tsc,
+ .mask = CLOCKSOURCE_MASK(64),
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS |
+ CLOCK_SOURCE_VALID_FOR_HRES |
+ CLOCK_SOURCE_MUST_VERIFY |
+ CLOCK_SOURCE_VERIFY_PERCPU,
+ .id = CSID_X86_TSC,
+ .vdso_clock_mode = VDSO_CLOCKMODE_TSC,
+ .enable = tsc_cs_enable,
+ .resume = tsc_resume,
+ .mark_unstable = tsc_cs_mark_unstable,
+ .tick_stable = tsc_cs_tick_stable,
+ .list = LIST_HEAD_INIT(clocksource_tsc.list),
};
void mark_tsc_unstable(char *reason)
{
- if (!tsc_unstable) {
- tsc_unstable = 1;
- printk("Marking TSC unstable due to %s\n", reason);
- /* Change only the rating, when not registered */
- if (clocksource_tsc.mult)
- clocksource_change_rating(&clocksource_tsc, 0);
- else
- clocksource_tsc.rating = 0;
- }
+ if (tsc_unstable)
+ return;
+
+ tsc_unstable = 1;
+ if (using_native_sched_clock())
+ clear_sched_clock_stable();
+ disable_sched_clock_irqtime();
+ pr_info("Marking TSC unstable due to %s\n", reason);
+
+ clocksource_mark_unstable(&clocksource_tsc_early);
+ clocksource_mark_unstable(&clocksource_tsc);
}
EXPORT_SYMBOL_GPL(mark_tsc_unstable);
-static int __init dmi_mark_tsc_unstable(const struct dmi_system_id *d)
+static void __init tsc_disable_clocksource_watchdog(void)
{
- printk(KERN_NOTICE "%s detected: marking TSC unstable.\n",
- d->ident);
- tsc_unstable = 1;
- return 0;
+ clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+ clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
}
-/* List of systems that have known TSC problems */
-static struct dmi_system_id __initdata bad_tsc_dmi_table[] = {
- {
- .callback = dmi_mark_tsc_unstable,
- .ident = "IBM Thinkpad 380XD",
- .matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
- DMI_MATCH(DMI_BOARD_NAME, "2635FA0"),
- },
- },
- {}
-};
+bool tsc_clocksource_watchdog_disabled(void)
+{
+ return !(clocksource_tsc.flags & CLOCK_SOURCE_MUST_VERIFY) &&
+ tsc_as_watchdog && !no_tsc_watchdog;
+}
static void __init check_system_tsc_reliable(void)
{
-#ifdef CONFIG_MGEODE_LX
- /* RTSC counts during suspend */
+#if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC)
+ if (is_geode_lx()) {
+ /* RTSC counts during suspend */
#define RTSC_SUSP 0x100
- unsigned long res_low, res_high;
+ unsigned long res_low, res_high;
- rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
- /* Geode_LX - the OLPC CPU has a possibly a very reliable TSC */
- if (res_low & RTSC_SUSP)
- tsc_clocksource_reliable = 1;
+ rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
+ /* Geode_LX - the OLPC CPU has a very reliable TSC */
+ if (res_low & RTSC_SUSP)
+ tsc_clocksource_reliable = 1;
+ }
#endif
if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
tsc_clocksource_reliable = 1;
+
+ /*
+ * Disable the clocksource watchdog when the system has:
+ * - TSC running at constant frequency
+ * - TSC which does not stop in C-States
+	 *  - the TSC_ADJUST register, which allows detecting even minimal
+ * modifications
+ * - not more than four packages
+ */
+ if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
+ boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
+ boot_cpu_has(X86_FEATURE_TSC_ADJUST) &&
+ topology_max_packages() <= 4)
+ tsc_disable_clocksource_watchdog();
}
/*
* Make an educated guess if the TSC is trustworthy and synchronized
* over all CPUs.
*/
-__cpuinit int unsynchronized_tsc(void)
+int unsynchronized_tsc(void)
{
- if (!cpu_has_tsc || tsc_unstable)
+ if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_unstable)
return 1;
#ifdef CONFIG_SMP
@@ -825,86 +1282,325 @@ __cpuinit int unsynchronized_tsc(void)
if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
return 0;
+
+ if (tsc_clocksource_reliable)
+ return 0;
/*
* Intel systems are normally all synchronized.
* Exceptions must mark TSC as unstable:
*/
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
/* assume multi socket systems are not synchronized: */
- if (num_possible_cpus() > 1)
- tsc_unstable = 1;
+ if (topology_max_packages() > 1)
+ return 1;
}
- return tsc_unstable;
+ return 0;
}
-static void __init init_tsc_clocksource(void)
+static void tsc_refine_calibration_work(struct work_struct *work);
+static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work);
+/**
+ * tsc_refine_calibration_work - Further refine tsc freq calibration
+ * @work: ignored.
+ *
 + * This function uses delayed work over a period of a
 + * second to further refine the TSC freq value. Since this is
 + * timer based, instead of loop based, we don't block the boot
 + * process while this longer calibration is done.
 + *
 + * If there are any calibration anomalies (too many SMIs, etc),
 + * or the refined calibration is off by more than 1% from the fast
 + * early calibration, we throw out the new calibration and use the
 + * early calibration.
+ */
+static void tsc_refine_calibration_work(struct work_struct *work)
{
- clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
- clocksource_tsc.shift);
- if (tsc_clocksource_reliable)
- clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
- /* lower the rating if we already know its unstable: */
- if (check_tsc_unstable()) {
- clocksource_tsc.rating = 0;
- clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
+ static u64 tsc_start = ULLONG_MAX, ref_start;
+ static int hpet;
+ u64 tsc_stop, ref_stop, delta;
+ unsigned long freq;
+ int cpu;
+
+ /* Don't bother refining TSC on unstable systems */
+ if (tsc_unstable)
+ goto unreg;
+
+ /*
+ * Since the work is started early in boot, we may be
+ * delayed the first time we expire. So set the workqueue
+ * again once we know timers are working.
+ */
+ if (tsc_start == ULLONG_MAX) {
+restart:
+ /*
+ * Only set hpet once, to avoid mixing hardware
+ * if the hpet becomes enabled later.
+ */
+ hpet = is_hpet_enabled();
+ tsc_start = tsc_read_refs(&ref_start, hpet);
+ schedule_delayed_work(&tsc_irqwork, HZ);
+ return;
+ }
+
+ tsc_stop = tsc_read_refs(&ref_stop, hpet);
+
+ /* hpet or pmtimer available ? */
+ if (ref_start == ref_stop)
+ goto out;
+
+ /* Check, whether the sampling was disturbed */
+ if (tsc_stop == ULLONG_MAX)
+ goto restart;
+
+ delta = tsc_stop - tsc_start;
+ delta *= 1000000LL;
+ if (hpet)
+ freq = calc_hpet_ref(delta, ref_start, ref_stop);
+ else
+ freq = calc_pmtimer_ref(delta, ref_start, ref_stop);
+
+ /* Will hit this only if tsc_force_recalibrate has been set */
+ if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) {
+
+ /* Warn if the deviation exceeds 500 ppm */
+ if (abs(tsc_khz - freq) > (tsc_khz >> 11)) {
+ pr_warn("Warning: TSC freq calibrated by CPUID/MSR differs from what is calibrated by HW timer, please check with vendor!!\n");
+ pr_info("Previous calibrated TSC freq:\t %lu.%03lu MHz\n",
+ (unsigned long)tsc_khz / 1000,
+ (unsigned long)tsc_khz % 1000);
+ }
+
+ pr_info("TSC freq recalibrated by [%s]:\t %lu.%03lu MHz\n",
+ hpet ? "HPET" : "PM_TIMER",
+ (unsigned long)freq / 1000,
+ (unsigned long)freq % 1000);
+
+ return;
}
- clocksource_register(&clocksource_tsc);
+
+ /* Make sure we're within 1% */
+ if (abs(tsc_khz - freq) > tsc_khz/100)
+ goto out;
+
+ tsc_khz = freq;
+ pr_info("Refined TSC clocksource calibration: %lu.%03lu MHz\n",
+ (unsigned long)tsc_khz / 1000,
+ (unsigned long)tsc_khz % 1000);
+
+ /* Inform the TSC deadline clockevent devices about the recalibration */
+ lapic_update_tsc_freq();
+
+ /* Update the sched_clock() rate to match the clocksource one */
+ for_each_possible_cpu(cpu)
+ set_cyc2ns_scale(tsc_khz, cpu, tsc_stop);
+
+out:
+ if (tsc_unstable)
+ goto unreg;
+
+ if (boot_cpu_has(X86_FEATURE_ART)) {
+ have_art = true;
+ clocksource_tsc.base = &art_base_clk;
+ }
+ clocksource_register_khz(&clocksource_tsc, tsc_khz);
+unreg:
+ clocksource_unregister(&clocksource_tsc_early);
}
-void __init tsc_init(void)
+
+static int __init init_tsc_clocksource(void)
{
- u64 lpj;
- int cpu;
+ if (!boot_cpu_has(X86_FEATURE_TSC) || !tsc_khz)
+ return 0;
- if (!cpu_has_tsc)
- return;
+ if (tsc_unstable) {
+ clocksource_unregister(&clocksource_tsc_early);
+ return 0;
+ }
- tsc_khz = calibrate_tsc();
- cpu_khz = tsc_khz;
+ if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3))
+ clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP;
- if (!tsc_khz) {
- mark_tsc_unstable("could not calculate TSC khz");
- return;
+ /*
+ * When TSC frequency is known (retrieved via MSR or CPUID), we skip
+ * the refined calibration and directly register it as a clocksource.
+ */
+ if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) {
+ if (boot_cpu_has(X86_FEATURE_ART)) {
+ have_art = true;
+ clocksource_tsc.base = &art_base_clk;
+ }
+ clocksource_register_khz(&clocksource_tsc, tsc_khz);
+ clocksource_unregister(&clocksource_tsc_early);
+
+ if (!tsc_force_recalibrate)
+ return 0;
}
-#ifdef CONFIG_X86_64
- if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
- (boot_cpu_data.x86_vendor == X86_VENDOR_AMD))
- cpu_khz = calibrate_cpu();
-#endif
+ schedule_delayed_work(&tsc_irqwork, 0);
+ return 0;
+}
+/*
+ * We use device_initcall here, to ensure we run after the hpet
+ * is fully initialized, which may occur at fs_initcall time.
+ */
+device_initcall(init_tsc_clocksource);
- printk("Detected %lu.%03lu MHz processor.\n",
- (unsigned long)cpu_khz / 1000,
- (unsigned long)cpu_khz % 1000);
+static bool __init determine_cpu_tsc_frequencies(bool early)
+{
+ /* Make sure that cpu and tsc are not already calibrated */
+ WARN_ON(cpu_khz || tsc_khz);
+
+ if (early) {
+ cpu_khz = x86_platform.calibrate_cpu();
+ if (tsc_early_khz) {
+ tsc_khz = tsc_early_khz;
+ } else {
+ tsc_khz = x86_platform.calibrate_tsc();
+ clocksource_tsc.freq_khz = tsc_khz;
+ }
+ } else {
+ /* We should not be here with non-native cpu calibration */
+ WARN_ON(x86_platform.calibrate_cpu != native_calibrate_cpu);
+ cpu_khz = pit_hpet_ptimer_calibrate_cpu();
+ }
/*
- * Secondary CPUs do not run through tsc_init(), so set up
- * all the scale factors for all CPUs, assuming the same
- * speed as the bootup CPU. (cpufreq notifiers will fix this
- * up if their speed diverges)
+ * Trust non-zero tsc_khz as authoritative,
+ * and use it to sanity check cpu_khz,
+	 * which will be off if the system timer is off.
*/
- for_each_possible_cpu(cpu)
- set_cyc2ns_scale(cpu_khz, cpu);
+ if (tsc_khz == 0)
+ tsc_khz = cpu_khz;
+ else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz)
+ cpu_khz = tsc_khz;
- if (tsc_disabled > 0)
- return;
+ if (tsc_khz == 0)
+ return false;
+
+ pr_info("Detected %lu.%03lu MHz processor\n",
+ (unsigned long)cpu_khz / KHZ,
+ (unsigned long)cpu_khz % KHZ);
+
+ if (cpu_khz != tsc_khz) {
+ pr_info("Detected %lu.%03lu MHz TSC",
+ (unsigned long)tsc_khz / KHZ,
+ (unsigned long)tsc_khz % KHZ);
+ }
+ return true;
+}
- /* now allow native_sched_clock() to use rdtsc */
- tsc_disabled = 0;
+static unsigned long __init get_loops_per_jiffy(void)
+{
+ u64 lpj = (u64)tsc_khz * KHZ;
- lpj = ((u64)tsc_khz * 1000);
do_div(lpj, HZ);
- lpj_fine = lpj;
+ return lpj;
+}
+static void __init tsc_enable_sched_clock(void)
+{
+ loops_per_jiffy = get_loops_per_jiffy();
use_tsc_delay();
- /* Check and install the TSC clocksource */
- dmi_check_system(bad_tsc_dmi_table);
- if (unsynchronized_tsc())
- mark_tsc_unstable("TSCs unsynchronized");
+ /* Sanitize TSC ADJUST before cyc2ns gets initialized */
+ tsc_store_and_check_tsc_adjust(true);
+ cyc2ns_init_boot_cpu();
+ static_branch_enable(&__use_tsc);
+}
+
+void __init tsc_early_init(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_TSC))
+ return;
+ /* Don't change UV TSC multi-chassis synchronization */
+ if (is_early_uv_system())
+ return;
+
+ snp_secure_tsc_init();
+
+ if (!determine_cpu_tsc_frequencies(true))
+ return;
+ tsc_enable_sched_clock();
+}
+
+void __init tsc_init(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_TSC)) {
+ setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
+ return;
+ }
+
+ /*
+ * native_calibrate_cpu_early can only calibrate using methods that are
+ * available early in boot.
+ */
+ if (x86_platform.calibrate_cpu == native_calibrate_cpu_early)
+ x86_platform.calibrate_cpu = native_calibrate_cpu;
+
+ if (!tsc_khz) {
+ /* We failed to determine frequencies earlier, try again */
+ if (!determine_cpu_tsc_frequencies(false)) {
+ mark_tsc_unstable("could not calculate TSC khz");
+ setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
+ return;
+ }
+ tsc_enable_sched_clock();
+ }
+
+ cyc2ns_init_secondary_cpus();
+
+ if (!no_sched_irq_time)
+ enable_sched_clock_irqtime();
+
+ lpj_fine = get_loops_per_jiffy();
check_system_tsc_reliable();
- init_tsc_clocksource();
+
+ if (unsynchronized_tsc()) {
+ mark_tsc_unstable("TSCs unsynchronized");
+ return;
+ }
+
+ if (tsc_clocksource_reliable || no_tsc_watchdog)
+ tsc_disable_clocksource_watchdog();
+
+ clocksource_register_khz(&clocksource_tsc_early, tsc_khz);
+ detect_art();
}
+#ifdef CONFIG_SMP
+/*
+ * Check whether existing calibration data can be reused.
+ */
+unsigned long calibrate_delay_is_known(void)
+{
+ int sibling, cpu = smp_processor_id();
+ int constant_tsc = cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC);
+ const struct cpumask *mask = topology_core_cpumask(cpu);
+
+ /*
+ * If TSC has constant frequency and TSC is synchronized across
+ * sockets then reuse CPU0 calibration.
+ */
+ if (constant_tsc && !tsc_unstable)
+ return cpu_data(0).loops_per_jiffy;
+
+ /*
+ * If TSC has constant frequency and TSC is not synchronized across
+ * sockets and this is not the first CPU in the socket, then reuse
+ * the calibration value of an already online CPU on that socket.
+ *
+ * This assumes that CONSTANT_TSC is consistent for all CPUs in a
+ * socket.
+ */
+ if (!constant_tsc || !mask)
+ return 0;
+
+ sibling = cpumask_any_but(mask, cpu);
+ if (sibling < nr_cpu_ids)
+ return cpu_data(sibling).loops_per_jiffy;
+ return 0;
+}
+#endif
diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c
new file mode 100644
index 000000000000..48e6cc1cb017
--- /dev/null
+++ b/arch/x86/kernel/tsc_msr.c
@@ -0,0 +1,236 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * TSC frequency enumeration via MSR
+ *
+ * Copyright (C) 2013, 2018 Intel Corporation
+ * Author: Bin Gao <bin.gao@intel.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/thread_info.h>
+
+#include <asm/apic.h>
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+#include <asm/msr.h>
+#include <asm/param.h>
+#include <asm/tsc.h>
+
+#define MAX_NUM_FREQS 16 /* 4 bits to select the frequency */
+
+/*
 + * The frequency numbers in the SDM are e.g. 83.3 MHz, which do not carry much
 + * accuracy and therefore lead to clock drift. As far as we know, Bay Trail SoCs
 + * use a 25 MHz crystal and Cherry Trail uses a 19.2 MHz crystal; the crystal
 + * is the source clock for a root PLL which outputs 1600 and 100 MHz. It is
 + * unclear if the root PLL outputs are used directly by the CPU clock PLL or
 + * if there is another PLL in between.
 + * This does not matter though: we can model the chain of PLLs as a single PLL
 + * with a quotient equal to the quotients of all PLLs in the chain multiplied.
 + * So we can create a simplified model of the CPU clock setup using a reference
 + * clock of 100 MHz plus a quotient which gets us as close to the frequency
 + * from the SDM as possible.
 + * For the 83.3 MHz example from above this would give us 100 MHz * 5 / 6 =
 + * 83 and 1/3 MHz, which matches exactly what has been measured on actual hw.
+ */
+#define TSC_REFERENCE_KHZ 100000
+
+struct muldiv {
+ u32 multiplier;
+ u32 divider;
+};
+
+/*
+ * If MSR_PERF_STAT[31] is set, the maximum resolved bus ratio can be
+ * read in MSR_PLATFORM_ID[12:8], otherwise in MSR_PERF_STAT[44:40].
 + * Unfortunately some Intel Atom SoCs aren't quite compliant with this,
 + * so we need to manually differentiate SoC families. This is what the
+ * field use_msr_plat does.
+ */
+struct freq_desc {
+ bool use_msr_plat;
+ struct muldiv muldiv[MAX_NUM_FREQS];
+ /*
+ * Some CPU frequencies in the SDM do not map to known PLL freqs, in
+ * that case the muldiv array is empty and the freqs array is used.
+ */
+ u32 freqs[MAX_NUM_FREQS];
+ u32 mask;
+};
+
+/*
 + * Penwell and Clovertrail use a spread spectrum clock,
 + * so the frequency is not exactly the same as the value
 + * the SDM documents for the MSR.
+ */
+static const struct freq_desc freq_desc_pnw = {
+ .use_msr_plat = false,
+ .freqs = { 0, 0, 0, 0, 0, 99840, 0, 83200 },
+ .mask = 0x07,
+};
+
+static const struct freq_desc freq_desc_clv = {
+ .use_msr_plat = false,
+ .freqs = { 0, 133200, 0, 0, 0, 99840, 0, 83200 },
+ .mask = 0x07,
+};
+
+/*
+ * Bay Trail SDM MSR_FSB_FREQ frequencies simplified PLL model:
+ * 000: 100 * 5 / 6 = 83.3333 MHz
+ * 001: 100 * 1 / 1 = 100.0000 MHz
+ * 010: 100 * 4 / 3 = 133.3333 MHz
+ * 011: 100 * 7 / 6 = 116.6667 MHz
+ * 100: 100 * 4 / 5 = 80.0000 MHz
+ */
+static const struct freq_desc freq_desc_byt = {
+ .use_msr_plat = true,
+ .muldiv = { { 5, 6 }, { 1, 1 }, { 4, 3 }, { 7, 6 },
+ { 4, 5 } },
+ .mask = 0x07,
+};
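/*
 * Editor's note (illustrative sketch, not part of this patch): the arithmetic
 * cpu_khz_from_msr() performs for the Bay Trail table above, with assumed
 * register values (MSR_FSB_FREQ index 0, i.e. 100 MHz * 5/6, and a bus ratio
 * of 16 from MSR_PLATFORM_INFO). The inputs are made up for illustration.
 */
#include <stdio.h>

int main(void)
{
	unsigned int tsc_reference_khz = 100000;	/* TSC_REFERENCE_KHZ */
	unsigned int mult = 5, div = 6;			/* muldiv[0] of freq_desc_byt */
	unsigned int ratio = 16;			/* assumed max bus ratio */

	/* DIV_ROUND_CLOSEST() open-coded as (x + d/2) / d */
	unsigned int fsb_khz = (tsc_reference_khz * mult + div / 2) / div;
	unsigned int cpu_khz = (tsc_reference_khz * mult * ratio + div / 2) / div;

	printf("FSB %u kHz, CPU %u kHz (~1.33 GHz)\n", fsb_khz, cpu_khz);
	return 0;
}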
+
+/*
+ * Cherry Trail SDM MSR_FSB_FREQ frequencies simplified PLL model:
+ * 0000: 100 * 5 / 6 = 83.3333 MHz
+ * 0001: 100 * 1 / 1 = 100.0000 MHz
+ * 0010: 100 * 4 / 3 = 133.3333 MHz
+ * 0011: 100 * 7 / 6 = 116.6667 MHz
+ * 0100: 100 * 4 / 5 = 80.0000 MHz
+ * 0101: 100 * 14 / 15 = 93.3333 MHz
+ * 0110: 100 * 9 / 10 = 90.0000 MHz
+ * 0111: 100 * 8 / 9 = 88.8889 MHz
+ * 1000: 100 * 7 / 8 = 87.5000 MHz
+ */
+static const struct freq_desc freq_desc_cht = {
+ .use_msr_plat = true,
+ .muldiv = { { 5, 6 }, { 1, 1 }, { 4, 3 }, { 7, 6 },
+ { 4, 5 }, { 14, 15 }, { 9, 10 }, { 8, 9 },
+ { 7, 8 } },
+ .mask = 0x0f,
+};
+
+/*
+ * Merriefield SDM MSR_FSB_FREQ frequencies simplified PLL model:
+ * 0001: 100 * 1 / 1 = 100.0000 MHz
+ * 0010: 100 * 4 / 3 = 133.3333 MHz
+ */
+static const struct freq_desc freq_desc_tng = {
+ .use_msr_plat = true,
+ .muldiv = { { 0, 0 }, { 1, 1 }, { 4, 3 } },
+ .mask = 0x07,
+};
+
+/*
+ * Moorefield SDM MSR_FSB_FREQ frequencies simplified PLL model:
+ * 0000: 100 * 5 / 6 = 83.3333 MHz
+ * 0001: 100 * 1 / 1 = 100.0000 MHz
+ * 0010: 100 * 4 / 3 = 133.3333 MHz
+ * 0011: 100 * 1 / 1 = 100.0000 MHz
+ */
+static const struct freq_desc freq_desc_ann = {
+ .use_msr_plat = true,
+ .muldiv = { { 5, 6 }, { 1, 1 }, { 4, 3 }, { 1, 1 } },
+ .mask = 0x0f,
+};
+
+/*
+ * 24 MHz crystal? : 24 * 13 / 4 = 78 MHz
+ * Frequency step for Lightning Mountain SoC is fixed to 78 MHz,
+ * so all the frequency entries are 78000.
+ */
+static const struct freq_desc freq_desc_lgm = {
+ .use_msr_plat = true,
+ .freqs = { 78000, 78000, 78000, 78000, 78000, 78000, 78000, 78000,
+ 78000, 78000, 78000, 78000, 78000, 78000, 78000, 78000 },
+ .mask = 0x0f,
+};
+
+static const struct x86_cpu_id tsc_msr_cpu_ids[] = {
+ X86_MATCH_VFM(INTEL_ATOM_SALTWELL_MID, &freq_desc_pnw),
+ X86_MATCH_VFM(INTEL_ATOM_SALTWELL_TABLET, &freq_desc_clv),
+ X86_MATCH_VFM(INTEL_ATOM_SILVERMONT, &freq_desc_byt),
+ X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID, &freq_desc_tng),
+ X86_MATCH_VFM(INTEL_ATOM_AIRMONT, &freq_desc_cht),
+ X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID2, &freq_desc_ann),
+ X86_MATCH_VFM(INTEL_ATOM_AIRMONT_NP, &freq_desc_lgm),
+ {}
+};
+
+/*
+ * MSR-based CPU/TSC frequency discovery for certain CPUs.
+ *
+ * Set global "lapic_timer_period" to bus_clock_cycles/jiffy
+ * Return processor base frequency in KHz, or 0 on failure.
+ */
+unsigned long cpu_khz_from_msr(void)
+{
+ u32 lo, hi, ratio, freq, tscref;
+ const struct freq_desc *freq_desc;
+ const struct x86_cpu_id *id;
+ const struct muldiv *md;
+ unsigned long res;
+ int index;
+
+ id = x86_match_cpu(tsc_msr_cpu_ids);
+ if (!id)
+ return 0;
+
+ freq_desc = (struct freq_desc *)id->driver_data;
+ if (freq_desc->use_msr_plat) {
+ rdmsr(MSR_PLATFORM_INFO, lo, hi);
+ ratio = (lo >> 8) & 0xff;
+ } else {
+ rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
+ ratio = (hi >> 8) & 0x1f;
+ }
+
+ /* Get FSB FREQ ID */
+ rdmsr(MSR_FSB_FREQ, lo, hi);
+ index = lo & freq_desc->mask;
+ md = &freq_desc->muldiv[index];
+
+ /*
+ * Note this also catches cases where the index points to an unpopulated
+	 * part of muldiv; in that case the else branch sets freq and res to 0.
+ */
+ if (md->divider) {
+ tscref = TSC_REFERENCE_KHZ * md->multiplier;
+ freq = DIV_ROUND_CLOSEST(tscref, md->divider);
+ /*
+ * Multiplying by ratio before the division has better
+ * accuracy than just calculating freq * ratio.
+ */
+ res = DIV_ROUND_CLOSEST(tscref * ratio, md->divider);
+ } else {
+ freq = freq_desc->freqs[index];
+ res = freq * ratio;
+ }
+
+ if (freq == 0)
+ pr_err("Error MSR_FSB_FREQ index %d is unknown\n", index);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ lapic_timer_period = (freq * 1000) / HZ;
+#endif
+
+ /*
+ * TSC frequency determined by MSR is always considered "known"
+ * because it is reported by HW.
+ * Another fact is that on MSR capable platforms, PIT/HPET is
+ * generally not available so calibration won't work at all.
+ */
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+
+ /*
+ * Unfortunately there is no way for hardware to tell whether the
+	 * TSC is reliable. We were told by the silicon design team that the
+	 * TSC on Atom SoCs is always "reliable". The TSC is also the only
+	 * reliable clocksource on these SoCs (HPET is either not present
+	 * or not functional), so mark the TSC reliable, which removes the
+ * requirement for a watchdog clocksource.
+ */
+ setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+
+ return res;
+}
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 027b5b498993..ec3aa340d351 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* check TSC synchronization.
*
@@ -14,47 +15,264 @@
* ( The serial nature of the boot logic and the CPU hotplug lock
* protects against more than 2 CPUs entering this code. )
*/
+#include <linux/workqueue.h>
+#include <linux/topology.h>
#include <linux/spinlock.h>
#include <linux/kernel.h>
-#include <linux/init.h>
#include <linux/smp.h>
#include <linux/nmi.h>
+#include <asm/msr.h>
#include <asm/tsc.h>
+struct tsc_adjust {
+ s64 bootval;
+ s64 adjusted;
+ unsigned long nextcheck;
+ bool warned;
+};
+
+static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust);
+static struct timer_list tsc_sync_check_timer;
+
+/*
 + * TSCs on different sockets may be reset asynchronously.
 + * This may cause the TSC ADJUST value on socket 0 to be non-zero.
+ */
+bool __read_mostly tsc_async_resets;
+
+void mark_tsc_async_resets(char *reason)
+{
+ if (tsc_async_resets)
+ return;
+ tsc_async_resets = true;
+ pr_info("tsc: Marking TSC async resets true due to %s\n", reason);
+}
+
+void tsc_verify_tsc_adjust(bool resume)
+{
+ struct tsc_adjust *adj = this_cpu_ptr(&tsc_adjust);
+ s64 curval;
+
+ if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
+ return;
+
+ /* Skip unnecessary error messages if TSC already unstable */
+ if (check_tsc_unstable())
+ return;
+
+ /* Rate limit the MSR check */
+ if (!resume && time_before(jiffies, adj->nextcheck))
+ return;
+
+ adj->nextcheck = jiffies + HZ;
+
+ rdmsrq(MSR_IA32_TSC_ADJUST, curval);
+ if (adj->adjusted == curval)
+ return;
+
+ /* Restore the original value */
+ wrmsrq(MSR_IA32_TSC_ADJUST, adj->adjusted);
+
+ if (!adj->warned || resume) {
+ pr_warn(FW_BUG "TSC ADJUST differs: CPU%u %lld --> %lld. Restoring\n",
+ smp_processor_id(), adj->adjusted, curval);
+ adj->warned = true;
+ }
+}
+
+/*
 + * Normally the tsc_sync will be checked every time the system enters an
 + * idle state, but there is still a caveat: a system may never enter idle,
 + * either because it's too busy or because it is purposely configured not
 + * to enter idle.
+ *
+ * So setup a periodic timer (every 10 minutes) to make sure the check
+ * is always on.
+ */
+
+#define SYNC_CHECK_INTERVAL (HZ * 600)
+
+static void tsc_sync_check_timer_fn(struct timer_list *unused)
+{
+ int next_cpu;
+
+ tsc_verify_tsc_adjust(false);
+
+ /* Run the check for all onlined CPUs in turn */
+ next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
+ if (next_cpu >= nr_cpu_ids)
+ next_cpu = cpumask_first(cpu_online_mask);
+
+ tsc_sync_check_timer.expires += SYNC_CHECK_INTERVAL;
+ add_timer_on(&tsc_sync_check_timer, next_cpu);
+}
+
+static int __init start_sync_check_timer(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_TSC_ADJUST) || tsc_clocksource_reliable)
+ return 0;
+
+ timer_setup(&tsc_sync_check_timer, tsc_sync_check_timer_fn, 0);
+ tsc_sync_check_timer.expires = jiffies + SYNC_CHECK_INTERVAL;
+ add_timer(&tsc_sync_check_timer);
+
+ return 0;
+}
+late_initcall(start_sync_check_timer);
+
+static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval,
+ unsigned int cpu, bool bootcpu)
+{
+ /*
+ * First online CPU in a package stores the boot value in the
+ * adjustment value. This value might change later via the sync
+	 * mechanism. If that fails we can still yell about boot values not
+	 * being consistent.
+	 *
+	 * On the boot CPU we just force-set the ADJUST value to 0 if it's
+	 * non-zero. We don't do that on non-boot CPUs because physical
+	 * hotplug should have set the ADJUST register to a value > 0, so
+	 * the TSC is in sync with the already running CPUs.
+	 *
+	 * Also don't force the ADJUST value to zero if that is a valid value
+	 * for socket 0 as determined by the system arch. This is required
+	 * when multiple sockets are reset asynchronously with each other
+	 * and socket 0 may not have a TSC ADJUST value of 0.
+ */
+ if (bootcpu && bootval != 0) {
+ if (likely(!tsc_async_resets)) {
+ pr_warn(FW_BUG "TSC ADJUST: CPU%u: %lld force to 0\n",
+ cpu, bootval);
+ wrmsrq(MSR_IA32_TSC_ADJUST, 0);
+ bootval = 0;
+ } else {
+ pr_info("TSC ADJUST: CPU%u: %lld NOT forced to 0\n",
+ cpu, bootval);
+ }
+ }
+ cur->adjusted = bootval;
+}
+
+#ifndef CONFIG_SMP
+bool __init tsc_store_and_check_tsc_adjust(bool bootcpu)
+{
+ struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust);
+ s64 bootval;
+
+ if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
+ return false;
+
+ /* Skip unnecessary error messages if TSC already unstable */
+ if (check_tsc_unstable())
+ return false;
+
+ rdmsrq(MSR_IA32_TSC_ADJUST, bootval);
+ cur->bootval = bootval;
+ cur->nextcheck = jiffies + HZ;
+ tsc_sanitize_first_cpu(cur, bootval, smp_processor_id(), bootcpu);
+ return false;
+}
+
+#else /* !CONFIG_SMP */
+
+/*
+ * Store and check the TSC ADJUST MSR if available
+ */
+bool tsc_store_and_check_tsc_adjust(bool bootcpu)
+{
+ struct tsc_adjust *ref, *cur = this_cpu_ptr(&tsc_adjust);
+ unsigned int refcpu, cpu = smp_processor_id();
+ struct cpumask *mask;
+ s64 bootval;
+
+ if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
+ return false;
+
+ rdmsrq(MSR_IA32_TSC_ADJUST, bootval);
+ cur->bootval = bootval;
+ cur->nextcheck = jiffies + HZ;
+ cur->warned = false;
+
+ /*
+ * The default adjust value cannot be assumed to be zero on any socket.
+ */
+ cur->adjusted = bootval;
+
+ /*
+ * Check whether this CPU is the first in a package to come up. In
+ * this case do not check the boot value against another package
+ * because the new package might have been physically hotplugged,
+ * where TSC_ADJUST is expected to be different. When called on the
+ * boot CPU topology_core_cpumask() might not be available yet.
+ */
+ mask = topology_core_cpumask(cpu);
+ refcpu = mask ? cpumask_any_but(mask, cpu) : nr_cpu_ids;
+
+ if (refcpu >= nr_cpu_ids) {
+ tsc_sanitize_first_cpu(cur, bootval, smp_processor_id(),
+ bootcpu);
+ return false;
+ }
+
+ ref = per_cpu_ptr(&tsc_adjust, refcpu);
+ /*
+ * Compare the boot value and complain if it differs in the
+ * package.
+ */
+ if (bootval != ref->bootval)
+ printk_once(FW_BUG "TSC ADJUST differs within socket(s), fixing all errors\n");
+
+ /*
+ * The TSC_ADJUST values in a package must be the same. If the boot
+ * value on this newly upcoming CPU differs from the adjustment
+ * value of the already online CPU in this package, set it to that
+ * adjusted value.
+ */
+ if (bootval != ref->adjusted) {
+ cur->adjusted = ref->adjusted;
+ wrmsrq(MSR_IA32_TSC_ADJUST, ref->adjusted);
+ }
+ /*
+ * We have the TSCs forced to be in sync on this package. Skip sync
+ * test:
+ */
+ return true;
+}
+
/*
* Entry/exit counters that make sure that both CPUs
* run the measurement code at once:
*/
-static __cpuinitdata atomic_t start_count;
-static __cpuinitdata atomic_t stop_count;
+static atomic_t start_count;
+static atomic_t stop_count;
+static atomic_t test_runs;
/*
* We use a raw spinlock in this exceptional case, because
* we want to have the fastest, inlined, non-debug version
* of a critical section, to be able to prove TSC time-warps:
*/
-static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED;
+static arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED;
-static __cpuinitdata cycles_t last_tsc;
-static __cpuinitdata cycles_t max_warp;
-static __cpuinitdata int nr_warps;
+static cycles_t last_tsc;
+static cycles_t max_warp;
+static int nr_warps;
+static int random_warps;
/*
- * TSC-warp measurement loop running on both CPUs:
+ * TSC-warp measurement loop running on both CPUs. This is not called
+ * if there is no TSC.
*/
-static __cpuinit void check_tsc_warp(void)
+static cycles_t check_tsc_warp(unsigned int timeout)
{
- cycles_t start, now, prev, end;
- int i;
+ cycles_t start, now, prev, end, cur_max_warp = 0;
+ int i, cur_warps = 0;
- rdtsc_barrier();
- start = get_cycles();
- rdtsc_barrier();
+ start = rdtsc_ordered();
/*
- * The measurement runs for 20 msecs:
+ * The measurement runs for 'timeout' msecs:
*/
- end = start + tsc_khz * 20ULL;
- now = start;
+ end = start + (cycles_t) tsc_khz * timeout;
for (i = 0; ; i++) {
/*
@@ -62,13 +280,11 @@ static __cpuinit void check_tsc_warp(void)
* previous TSC that was measured (possibly on
* another CPU) and update the previous TSC timestamp.
*/
- __raw_spin_lock(&sync_lock);
+ arch_spin_lock(&sync_lock);
prev = last_tsc;
- rdtsc_barrier();
- now = get_cycles();
- rdtsc_barrier();
+ now = rdtsc_ordered();
last_tsc = now;
- __raw_spin_unlock(&sync_lock);
+ arch_spin_unlock(&sync_lock);
/*
* Be nice every now and then (and also check whether
@@ -87,73 +303,113 @@ static __cpuinit void check_tsc_warp(void)
* we saw a time-warp of the TSC going backwards:
*/
if (unlikely(prev > now)) {
- __raw_spin_lock(&sync_lock);
+ arch_spin_lock(&sync_lock);
max_warp = max(max_warp, prev - now);
+ cur_max_warp = max_warp;
+ /*
+ * Check whether this bounces back and forth. Only
+ * one CPU should observe time going backwards.
+ */
+ if (cur_warps != nr_warps)
+ random_warps++;
nr_warps++;
- __raw_spin_unlock(&sync_lock);
+ cur_warps = nr_warps;
+ arch_spin_unlock(&sync_lock);
}
}
WARN(!(now-start),
"Warning: zero tsc calibration delta: %Ld [max: %Ld]\n",
now-start, end-start);
+ return cur_max_warp;
}
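[Illustrative aside, not part of the patch.] The warp check above is a ping-pong of timestamps between two CPUs under a lock: each side reads the TSC while holding the lock and complains if it ever sees a value smaller than the last one recorded. A rough user-space sketch of the same idea follows; it assumes an x86-64 toolchain with GCC/Clang's __rdtsc() intrinsic and pthreads, and unlike the kernel code it neither pins the threads to distinct CPUs nor uses a serialized TSC read, so it is only meant to show the shape of the algorithm. The names warp_check, last_tsc and max_warp here are illustrative, not kernel symbols.

/* Build with: gcc -O2 -pthread warp.c */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <x86intrin.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t last_tsc, max_warp;

static void *warp_check(void *arg)
{
	for (int i = 0; i < 1000000; i++) {
		pthread_mutex_lock(&lock);
		uint64_t prev = last_tsc;
		uint64_t now = __rdtsc();	/* unordered read; the kernel uses rdtsc_ordered() */
		last_tsc = now;
		if (prev > now && prev - now > max_warp)
			max_warp = prev - now;	/* time appeared to go backwards */
		pthread_mutex_unlock(&lock);
	}
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, warp_check, NULL);
	pthread_create(&b, NULL, warp_check, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	printf("max observed warp: %llu cycles\n", (unsigned long long)max_warp);
	return 0;
}

On a machine with synchronized TSCs the reported warp stays at zero, which is the "passed" case of the kernel test above.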
/*
- * Source CPU calls into this - it waits for the freshly booted
- * target CPU to arrive and then starts the measurement:
+ * If the target CPU coming online doesn't have any of its core-siblings
+ * online, a timeout of 20msec will be used for the TSC-warp measurement
+ * loop. Otherwise a smaller timeout of 2msec will be used, as we have some
+ * information about this socket already (and this information grows as we
+ * have more and more logical-siblings in that socket).
+ *
+ * Ideally we should be able to skip the TSC sync check on the other
+ * core-siblings, if the first logical CPU in a socket passed the sync test.
+ * But as the TSC is per-logical CPU and can potentially be modified wrongly
+ * by the BIOS, a TSC sync test of shorter duration should still be able
+ * to catch such errors. Also this will catch the condition where all the
+ * cores in the socket don't get reset at the same time.
*/
-void __cpuinit check_tsc_sync_source(int cpu)
+static inline unsigned int loop_timeout(int cpu)
{
- int cpus = 2;
-
- /*
- * No need to check if we already know that the TSC is not
- * synchronized:
- */
- if (unsynchronized_tsc())
- return;
+ return (cpumask_weight(topology_core_cpumask(cpu)) > 1) ? 2 : 20;
+}
- if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
- pr_info("Skipping synchronization checks as TSC is reliable.\n");
- return;
- }
+static void tsc_sync_mark_tsc_unstable(struct work_struct *work)
+{
+ mark_tsc_unstable("check_tsc_sync_source failed");
+}
- pr_info("checking TSC synchronization [CPU#%d -> CPU#%d]:",
- smp_processor_id(), cpu);
+static DECLARE_WORK(tsc_sync_work, tsc_sync_mark_tsc_unstable);
- /*
- * Reset it - in case this is a second bootup:
- */
- atomic_set(&stop_count, 0);
+/*
+ * The freshly booted CPU initiates this via an async SMP function call.
+ */
+static void check_tsc_sync_source(void *__cpu)
+{
+ unsigned int cpu = (unsigned long)__cpu;
+ int cpus = 2;
/*
- * Wait for the target to arrive:
+ * Set the maximum number of test runs to
+ * 1 if the CPU does not provide the TSC_ADJUST MSR
+ * 3 if the MSR is available, so the target can try to adjust
*/
- while (atomic_read(&start_count) != cpus-1)
+ if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
+ atomic_set(&test_runs, 1);
+ else
+ atomic_set(&test_runs, 3);
+retry:
+ /* Wait for the target to start. */
+ while (atomic_read(&start_count) != cpus - 1)
cpu_relax();
+
/*
* Trigger the target to continue into the measurement too:
*/
atomic_inc(&start_count);
- check_tsc_warp();
+ check_tsc_warp(loop_timeout(cpu));
while (atomic_read(&stop_count) != cpus-1)
cpu_relax();
- if (nr_warps) {
- printk("\n");
- pr_warning("Measured %Ld cycles TSC warp between CPUs, "
- "turning off TSC clock.\n", max_warp);
- mark_tsc_unstable("check_tsc_sync_source failed");
- } else {
- printk(" passed.\n");
+ /*
+ * If the test was successful, set the number of runs to zero and
+ * stop. If not, decrement the number of runs and check if we can
+ * retry. In case of random warps no retry is attempted.
+ */
+ if (!nr_warps) {
+ atomic_set(&test_runs, 0);
+
+ pr_debug("TSC synchronization [CPU#%d -> CPU#%u]: passed\n",
+ smp_processor_id(), cpu);
+
+ } else if (atomic_dec_and_test(&test_runs) || random_warps) {
+ /* Force it to 0 if random warps brought us here */
+ atomic_set(&test_runs, 0);
+
+ pr_warn("TSC synchronization [CPU#%d -> CPU#%u]:\n",
+ smp_processor_id(), cpu);
+ pr_warn("Measured %Ld cycles TSC warp between CPUs, "
+ "turning off TSC clock.\n", max_warp);
+ if (random_warps)
+ pr_warn("TSC warped randomly between CPUs\n");
+ schedule_work(&tsc_sync_work);
}
/*
* Reset it - just in case we boot another CPU later:
*/
atomic_set(&start_count, 0);
+ random_warps = 0;
nr_warps = 0;
max_warp = 0;
last_tsc = 0;
@@ -162,19 +418,45 @@ void __cpuinit check_tsc_sync_source(int cpu)
* Let the target continue with the bootup:
*/
atomic_inc(&stop_count);
+
+ /*
+ * Retry, if there is a chance to do so.
+ */
+ if (atomic_read(&test_runs) > 0)
+ goto retry;
}
/*
* Freshly booted CPUs call into this:
*/
-void __cpuinit check_tsc_sync_target(void)
+void check_tsc_sync_target(void)
{
+ struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust);
+ unsigned int cpu = smp_processor_id();
+ cycles_t cur_max_warp, gbl_max_warp;
int cpus = 2;
- if (unsynchronized_tsc() || boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
+ /* Also aborts if there is no TSC. */
+ if (unsynchronized_tsc())
return;
/*
+ * Store, verify and sanitize the TSC adjust register. If
+ * successful skip the test.
+ *
+ * The test is also skipped when the TSC is marked reliable. This
+ * is true for SoCs which have no fallback clocksource. On these
+ * SoCs the TSC is frequency synchronized, but still the TSC ADJUST
+ * register might have been wrecked by the BIOS.
+ */
+ if (tsc_store_and_check_tsc_adjust(false) || tsc_clocksource_reliable)
+ return;
+
+ /* Kick the control CPU into the TSC synchronization function */
+ smp_call_function_single(cpumask_first(cpu_online_mask), check_tsc_sync_source,
+ (unsigned long *)(unsigned long)cpu, 0);
+retry:
+ /*
* Register this CPU's participation and wait for the
* source CPU to start the measurement:
*/
@@ -182,7 +464,12 @@ void __cpuinit check_tsc_sync_target(void)
while (atomic_read(&start_count) != cpus)
cpu_relax();
- check_tsc_warp();
+ cur_max_warp = check_tsc_warp(loop_timeout(cpu));
+
+ /*
+ * Store the maximum observed warp value for a potential retry:
+ */
+ gbl_max_warp = max_warp;
/*
* Ok, we are done:
@@ -194,4 +481,47 @@ void __cpuinit check_tsc_sync_target(void)
*/
while (atomic_read(&stop_count) != cpus)
cpu_relax();
+
+ /*
+ * Reset it for the next sync test:
+ */
+ atomic_set(&stop_count, 0);
+
+ /*
+ * Check the number of remaining test runs. If not zero, the test
+ * failed and a retry with adjusted TSC is possible. If zero the
+ * test was either successful or failed terminally.
+ */
+ if (!atomic_read(&test_runs))
+ return;
+
+ /*
+ * If the warp value of this CPU is 0, then the other CPU
+ * observed time going backwards so this TSC was ahead and
+ * needs to move backwards.
+ */
+ if (!cur_max_warp)
+ cur_max_warp = -gbl_max_warp;
+
+ /*
+ * Add the result to the previous adjustment value.
+ *
+ * The adjustment value is slightly off by the overhead of the
+ * sync mechanism (observed values are ~200 TSC cycles), but this
+ * really depends on CPU, node distance and frequency. So
+ * compensating for this is hard to get right. Experiments show
+ * that the warp is no longer detectable when the observed warp
+ * value is used. In the worst case the adjustment needs to go
+ * through a 3rd run for fine tuning.
+ */
+ cur->adjusted += cur_max_warp;
+
+ pr_warn("TSC ADJUST compensate: CPU%u observed %lld warp. Adjust: %lld\n",
+ cpu, cur_max_warp, cur->adjusted);
+
+ wrmsrq(MSR_IA32_TSC_ADJUST, cur->adjusted);
+ goto retry;
+
}
+
+#endif /* CONFIG_SMP */
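[Illustrative aside, not part of the patch.] The TSC_ADJUST values that the code above stores, compares and rewrites can be inspected from user space through the msr driver, which exposes each CPU's MSRs as /dev/cpu/<n>/msr with the file offset selecting the MSR index (IA32_TSC_ADJUST is 0x3b). A minimal sketch, assuming the msr module is loaded and the program runs with sufficient privileges:

/* Build with: gcc -O2 read_tsc_adjust.c ; run as root, e.g. ./a.out 2 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define MSR_IA32_TSC_ADJUST	0x0000003b

int main(int argc, char **argv)
{
	int cpu = argc > 1 ? atoi(argv[1]) : 0;
	char path[64];
	uint64_t val;
	int fd;

	snprintf(path, sizeof(path), "/dev/cpu/%d/msr", cpu);
	fd = open(path, O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* The msr driver maps the file offset to the MSR index. */
	if (pread(fd, &val, sizeof(val), MSR_IA32_TSC_ADJUST) != sizeof(val)) {
		perror("pread");
		close(fd);
		return 1;
	}
	printf("CPU%d TSC_ADJUST: %lld\n", cpu, (long long)val);
	close(fd);
	return 0;
}

On a well-behaved system every CPU in a package reports the same value, which is exactly the invariant tsc_store_and_check_tsc_adjust() enforces.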
diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c
new file mode 100644
index 000000000000..d432f3824f0c
--- /dev/null
+++ b/arch/x86/kernel/umip.c
@@ -0,0 +1,422 @@
+/*
+ * umip.c Emulation for instructions protected by the User-Mode Instruction
+ * Prevention feature
+ *
+ * Copyright (c) 2017, Intel Corporation.
+ * Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+ */
+
+#include <linux/uaccess.h>
+#include <asm/umip.h>
+#include <asm/traps.h>
+#include <asm/insn.h>
+#include <asm/insn-eval.h>
+#include <linux/ratelimit.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt) "umip: " fmt
+
+/** DOC: Emulation for User-Mode Instruction Prevention (UMIP)
+ *
+ * User-Mode Instruction Prevention is a security feature present in recent
+ * x86 processors that, when enabled, prevents a group of instructions (SGDT,
+ * SIDT, SLDT, SMSW and STR) from being run in user mode by issuing a general
+ * protection fault if the instruction is executed with CPL > 0.
+ *
+ * Rather than relaying to user space the general protection fault caused by
+ * the UMIP-protected instructions (in the form of a SIGSEGV signal), the fault
+ * can be trapped and the result of such instructions emulated to provide dummy
+ * values. This both conserves the current kernel behavior and avoids revealing the
+ * system resources that UMIP intends to protect (i.e., the locations of the
+ * global descriptor and interrupt descriptor tables, the segment selectors of
+ * the local descriptor table, the value of the task state register and the
+ * contents of the CR0 register).
+ *
+ * This emulation is needed because certain applications (e.g., WineHQ and
+ * DOSEMU2) rely on this subset of instructions to function.
+ *
+ * The instructions protected by UMIP can be split in two groups. Those which
+ * return a kernel memory address (SGDT and SIDT) and those which return a
+ * value (SLDT, STR and SMSW).
+ *
+ * For the instructions that return a kernel memory address, applications
+ * such as WineHQ rely on the result being located in the kernel memory space,
+ * not the actual location of the table. The result is emulated as a hard-coded
+ * value that lies close to the top of the kernel memory. The limits for the GDT
+ * and the IDT are set to zero.
+ *
+ * The instruction SMSW is emulated to return the value that the register CR0
+ * has at boot time, as set in head_32.
+ * SLDT and STR are emulated to return the values that the kernel programmatically
+ * assigns:
+ * - SLDT returns (GDT_ENTRY_LDT * 8) if an LDT has been set, 0 if not.
+ * - STR returns (GDT_ENTRY_TSS * 8).
+ *
+ * Emulation is provided for both 32-bit and 64-bit processes.
+ *
+ * Care is taken to appropriately emulate the results when segmentation is
+ * used. That is, rather than relying on USER_DS and USER_CS, the function
+ * insn_get_addr_ref() inspects the segment descriptor pointed by the
+ * registers in pt_regs. This ensures that we correctly obtain the segment
+ * base address and the address and operand sizes even if the user space
+ * application uses a local descriptor table.
+ */
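[Illustrative aside, not part of the patch.] To see the dummy values described above, a user-space process on a UMIP-enabled CPU can still execute SGDT: the instruction raises #GP, the kernel emulation fills in the hard-coded descriptor, and execution continues. A minimal sketch (x86-64, GCC inline asm; table_desc is an illustrative name, not a kernel type):

/* Under UMIP emulation the printed base is expected to be
 * UMIP_DUMMY_GDT_BASE (0xfffffffffffe0000) with a limit of 0. */
#include <stdint.h>
#include <stdio.h>

struct __attribute__((packed)) table_desc {
	uint16_t limit;
	uint64_t base;
};

int main(void)
{
	struct table_desc gdt = { 0 };

	asm volatile("sgdt %0" : "=m" (gdt));
	printf("GDT base: 0x%016llx limit: %u\n",
	       (unsigned long long)gdt.base, gdt.limit);
	return 0;
}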
+
+#define UMIP_DUMMY_GDT_BASE 0xfffffffffffe0000ULL
+#define UMIP_DUMMY_IDT_BASE 0xffffffffffff0000ULL
+
+/*
+ * The SGDT and SIDT instructions store the contents of the global descriptor
+ * table and interrupt table registers, respectively. The destination is a
+ * memory operand of X+2 bytes. X bytes are used to store the base address of
+ * the table and 2 bytes are used to store the limit. In 32-bit processes X
+ * has a value of 4, in 64-bit processes X has a value of 8.
+ */
+#define UMIP_GDT_IDT_BASE_SIZE_64BIT 8
+#define UMIP_GDT_IDT_BASE_SIZE_32BIT 4
+#define UMIP_GDT_IDT_LIMIT_SIZE 2
+
+#define UMIP_INST_SGDT 0 /* 0F 01 /0 */
+#define UMIP_INST_SIDT 1 /* 0F 01 /1 */
+#define UMIP_INST_SMSW 2 /* 0F 01 /4 */
+#define UMIP_INST_SLDT 3 /* 0F 00 /0 */
+#define UMIP_INST_STR 4 /* 0F 00 /1 */
+
+static const char * const umip_insns[5] = {
+ [UMIP_INST_SGDT] = "SGDT",
+ [UMIP_INST_SIDT] = "SIDT",
+ [UMIP_INST_SMSW] = "SMSW",
+ [UMIP_INST_SLDT] = "SLDT",
+ [UMIP_INST_STR] = "STR",
+};
+
+#define umip_pr_err(regs, fmt, ...) \
+ umip_printk(regs, KERN_ERR, fmt, ##__VA_ARGS__)
+#define umip_pr_debug(regs, fmt, ...) \
+ umip_printk(regs, KERN_DEBUG, fmt, ##__VA_ARGS__)
+
+/**
+ * umip_printk() - Print a rate-limited message
+ * @regs: Register set with the context in which the warning is printed
+ * @log_level: Kernel log level to print the message
+ * @fmt: The text string to print
+ *
+ * Print the text contained in @fmt. The print rate is limited to bursts of 5
+ * messages every two minutes. The purpose of this customized version of
+ * printk() is to print messages when user space processes use any of the
+ * UMIP-protected instructions. Thus, the printed text is prepended with the
+ * task name and process ID number of the current task as well as the
+ * instruction and stack pointers in @regs as seen when entering kernel mode.
+ *
+ * Returns:
+ *
+ * None.
+ */
+static __printf(3, 4)
+void umip_printk(const struct pt_regs *regs, const char *log_level,
+ const char *fmt, ...)
+{
+ /* Bursts of 5 messages every two minutes */
+ static DEFINE_RATELIMIT_STATE(ratelimit, 2 * 60 * HZ, 5);
+ struct task_struct *tsk = current;
+ struct va_format vaf;
+ va_list args;
+
+ if (!__ratelimit(&ratelimit))
+ return;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ printk("%s" pr_fmt("%s[%d] ip:%lx sp:%lx: %pV"), log_level, tsk->comm,
+ task_pid_nr(tsk), regs->ip, regs->sp, &vaf);
+ va_end(args);
+}
+
+/**
+ * identify_insn() - Identify a UMIP-protected instruction
+ * @insn: Instruction structure with opcode and ModRM byte.
+ *
+ * From the opcode and ModRM.reg in @insn identify, if any, a UMIP-protected
+ * instruction that can be emulated.
+ *
+ * Returns:
+ *
+ * On success, a constant identifying a specific UMIP-protected instruction that
+ * can be emulated.
+ *
+ * -EINVAL on error or when it is not a UMIP-protected instruction that can be
+ * emulated.
+ */
+static int identify_insn(struct insn *insn)
+{
+ /* By getting modrm we also get the opcode. */
+ insn_get_modrm(insn);
+
+ if (!insn->modrm.nbytes)
+ return -EINVAL;
+
+ /* The instructions of interest have 2-byte opcodes: 0F 00 or 0F 01. */
+ if (insn->opcode.nbytes < 2 || insn->opcode.bytes[0] != 0xf)
+ return -EINVAL;
+
+ if (insn->opcode.bytes[1] == 0x1) {
+ switch (X86_MODRM_REG(insn->modrm.value)) {
+ case 0:
+ /* The reg form of 0F 01 /0 encodes VMX instructions. */
+ if (X86_MODRM_MOD(insn->modrm.value) == 3)
+ return -EINVAL;
+
+ return UMIP_INST_SGDT;
+ case 1:
+ /*
+ * The reg form of 0F 01 /1 encodes MONITOR/MWAIT,
+ * STAC/CLAC, and ENCLS.
+ */
+ if (X86_MODRM_MOD(insn->modrm.value) == 3)
+ return -EINVAL;
+
+ return UMIP_INST_SIDT;
+ case 4:
+ return UMIP_INST_SMSW;
+ default:
+ return -EINVAL;
+ }
+ } else if (insn->opcode.bytes[1] == 0x0) {
+ if (X86_MODRM_REG(insn->modrm.value) == 0)
+ return UMIP_INST_SLDT;
+ else if (X86_MODRM_REG(insn->modrm.value) == 1)
+ return UMIP_INST_STR;
+ else
+ return -EINVAL;
+ } else {
+ return -EINVAL;
+ }
+}
+
+/**
+ * emulate_umip_insn() - Emulate UMIP instructions and return dummy values
+ * @insn: Instruction structure with operands
+ * @umip_inst: A constant indicating the instruction to emulate
+ * @data: Buffer into which the dummy result is stored
+ * @data_size: Size of the emulated result
+ * @x86_64: true if process is 64-bit, false otherwise
+ *
+ * Emulate an instruction protected by UMIP and provide a dummy result. The
+ * result of the emulation is saved in @data. The size of the result depends
+ * on both the instruction and the type of operand (register vs memory address).
+ * The size of the result is updated in @data_size. The caller is responsible
+ * for providing a @data buffer of at least UMIP_GDT_IDT_BASE_SIZE_64BIT +
+ * UMIP_GDT_IDT_LIMIT_SIZE bytes.
+ *
+ * Returns:
+ *
+ * 0 on success, -EINVAL on error while emulating.
+ */
+static int emulate_umip_insn(struct insn *insn, int umip_inst,
+ unsigned char *data, int *data_size, bool x86_64)
+{
+ if (!data || !data_size || !insn)
+ return -EINVAL;
+ /*
+ * These two instructions return the base address and limit of the
+ * global and interrupt descriptor table, respectively. According to the
+ * Intel Software Developer's Manual, the base address can be 24-bit,
+ * 32-bit or 64-bit. The limit is always 16-bit. If the operand size is
+ * 16-bit, the returned value of the base address is supposed to be a
+ * zero-extended 24-bit number. However, it seems that a 32-bit number
+ * is always returned irrespective of the operand size.
+ */
+ if (umip_inst == UMIP_INST_SGDT || umip_inst == UMIP_INST_SIDT) {
+ u64 dummy_base_addr;
+ u16 dummy_limit = 0;
+
+ /* SGDT and SIDT do not use register operands. */
+ if (X86_MODRM_MOD(insn->modrm.value) == 3)
+ return -EINVAL;
+
+ if (umip_inst == UMIP_INST_SGDT)
+ dummy_base_addr = UMIP_DUMMY_GDT_BASE;
+ else
+ dummy_base_addr = UMIP_DUMMY_IDT_BASE;
+
+ /*
+ * 64-bit processes use the entire dummy base address.
+ * 32-bit processes use the lower 32 bits of the base address.
+ * dummy_base_addr is always 64 bits, but we memcpy the correct
+ * number of bytes from it to the destination.
+ */
+ if (x86_64)
+ *data_size = UMIP_GDT_IDT_BASE_SIZE_64BIT;
+ else
+ *data_size = UMIP_GDT_IDT_BASE_SIZE_32BIT;
+
+ memcpy(data + 2, &dummy_base_addr, *data_size);
+
+ *data_size += UMIP_GDT_IDT_LIMIT_SIZE;
+ memcpy(data, &dummy_limit, UMIP_GDT_IDT_LIMIT_SIZE);
+
+ } else if (umip_inst == UMIP_INST_SMSW || umip_inst == UMIP_INST_SLDT ||
+ umip_inst == UMIP_INST_STR) {
+ unsigned long dummy_value;
+
+ if (umip_inst == UMIP_INST_SMSW) {
+ dummy_value = CR0_STATE;
+ } else if (umip_inst == UMIP_INST_STR) {
+ dummy_value = GDT_ENTRY_TSS * 8;
+ } else if (umip_inst == UMIP_INST_SLDT) {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ down_read(&current->mm->context.ldt_usr_sem);
+ if (current->mm->context.ldt)
+ dummy_value = GDT_ENTRY_LDT * 8;
+ else
+ dummy_value = 0;
+ up_read(&current->mm->context.ldt_usr_sem);
+#else
+ dummy_value = 0;
+#endif
+ }
+
+ /*
+ * For these three instructions, the number
+ * of bytes to be copied into the result buffer is determined
+ * by whether the operand is a register or a memory location.
+ * If the operand is a register, return as many bytes as the
+ * operand size. If the operand is memory, return only the two
+ * least significant bytes.
+ */
+ if (X86_MODRM_MOD(insn->modrm.value) == 3)
+ *data_size = insn->opnd_bytes;
+ else
+ *data_size = 2;
+
+ memcpy(data, &dummy_value, *data_size);
+ } else {
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/**
+ * force_sig_info_umip_fault() - Force a SIGSEGV with SEGV_MAPERR
+ * @addr: Address that caused the signal
+ * @regs: Register set containing the instruction pointer
+ *
+ * Force a SIGSEGV signal with SEGV_MAPERR as the error code. This function is
+ * intended to be used to provide a segmentation fault when the result of the
+ * UMIP emulation could not be copied to the user space memory.
+ *
+ * Returns: none
+ */
+static void force_sig_info_umip_fault(void __user *addr, struct pt_regs *regs)
+{
+ struct task_struct *tsk = current;
+
+ tsk->thread.cr2 = (unsigned long)addr;
+ tsk->thread.error_code = X86_PF_USER | X86_PF_WRITE;
+ tsk->thread.trap_nr = X86_TRAP_PF;
+
+ force_sig_fault(SIGSEGV, SEGV_MAPERR, addr);
+
+ if (!(show_unhandled_signals && unhandled_signal(tsk, SIGSEGV)))
+ return;
+
+ umip_pr_err(regs, "segfault in emulation. error%x\n",
+ X86_PF_USER | X86_PF_WRITE);
+}
+
+/**
+ * fixup_umip_exception() - Fixup a general protection fault caused by UMIP
+ * @regs: Registers as saved when entering the #GP handler
+ *
+ * The instructions SGDT, SIDT, STR, SMSW and SLDT cause a general protection
+ * fault if executed with CPL > 0 (i.e., from user space). This function fixes
+ * the exception up and provides dummy results for SGDT, SIDT, SMSW, SLDT and
+ * STR.
+ *
+ * If operands are memory addresses, results are copied to user-space memory as
+ * indicated by the instruction pointed to by eIP, using the registers indicated in
+ * the instruction operands. If operands are registers, results are copied into
+ * the context that was saved when entering kernel mode.
+ *
+ * Returns:
+ *
+ * True if emulation was successful; false if not.
+ */
+bool fixup_umip_exception(struct pt_regs *regs)
+{
+ int nr_copied, reg_offset, dummy_data_size, umip_inst;
+ /* 10 bytes is the maximum size of the result of UMIP instructions */
+ unsigned char dummy_data[10] = { 0 };
+ unsigned char buf[MAX_INSN_SIZE];
+ unsigned long *reg_addr;
+ void __user *uaddr;
+ struct insn insn;
+
+ if (!regs)
+ return false;
+
+ /*
+ * Give up on emulation if fetching the instruction failed. Should a
+ * page fault or a #GP be issued?
+ */
+ nr_copied = insn_fetch_from_user(regs, buf);
+ if (nr_copied <= 0)
+ return false;
+
+ if (!insn_decode_from_regs(&insn, regs, buf, nr_copied))
+ return false;
+
+ umip_inst = identify_insn(&insn);
+ if (umip_inst < 0)
+ return false;
+
+ umip_pr_debug(regs, "%s instruction cannot be used by applications.\n",
+ umip_insns[umip_inst]);
+
+ umip_pr_debug(regs, "For now, expensive software emulation returns the result.\n");
+
+ if (emulate_umip_insn(&insn, umip_inst, dummy_data, &dummy_data_size,
+ user_64bit_mode(regs)))
+ return false;
+
+ /*
+ * If operand is a register, write result to the copy of the register
+ * value that was pushed to the stack when entering into kernel mode.
+ * Upon exit, the value we write will be restored to the actual hardware
+ * register.
+ */
+ if (X86_MODRM_MOD(insn.modrm.value) == 3) {
+ reg_offset = insn_get_modrm_rm_off(&insn, regs);
+
+ /*
+ * Negative values are usually errors. In memory addressing,
+ * the exception is -EDOM. Since we expect a register operand,
+ * all negative values are errors.
+ */
+ if (reg_offset < 0)
+ return false;
+
+ reg_addr = (unsigned long *)((unsigned long)regs + reg_offset);
+ memcpy(reg_addr, dummy_data, dummy_data_size);
+ } else {
+ uaddr = insn_get_addr_ref(&insn, regs);
+ if ((unsigned long)uaddr == -1L)
+ return false;
+
+ nr_copied = copy_to_user(uaddr, dummy_data, dummy_data_size);
+ if (nr_copied > 0) {
+ /*
+ * If copy fails, send a signal and tell caller that
+ * fault was fixed up.
+ */
+ force_sig_info_umip_fault(uaddr, regs);
+ return true;
+ }
+ }
+
+ /* increase IP to let the program keep going */
+ regs->ip += insn.length;
+ return true;
+}
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
new file mode 100644
index 000000000000..d8ba93778ae3
--- /dev/null
+++ b/arch/x86/kernel/unwind_frame.c
@@ -0,0 +1,419 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
+#include <linux/interrupt.h>
+#include <asm/sections.h>
+#include <asm/ptrace.h>
+#include <asm/bitops.h>
+#include <asm/stacktrace.h>
+#include <asm/unwind.h>
+
+#define FRAME_HEADER_SIZE (sizeof(long) * 2)
+
+unsigned long unwind_get_return_address(struct unwind_state *state)
+{
+ if (unwind_done(state))
+ return 0;
+
+ return __kernel_text_address(state->ip) ? state->ip : 0;
+}
+EXPORT_SYMBOL_GPL(unwind_get_return_address);
+
+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+{
+ if (unwind_done(state))
+ return NULL;
+
+ return state->regs ? &state->regs->ip : state->bp + 1;
+}
+
+static void unwind_dump(struct unwind_state *state)
+{
+ static bool dumped_before = false;
+ bool prev_zero, zero = false;
+ unsigned long word, *sp;
+ struct stack_info stack_info = {0};
+ unsigned long visit_mask = 0;
+
+ if (dumped_before)
+ return;
+
+ dumped_before = true;
+
+ printk_deferred("unwind stack type:%d next_sp:%p mask:0x%lx graph_idx:%d\n",
+ state->stack_info.type, state->stack_info.next_sp,
+ state->stack_mask, state->graph_idx);
+
+ for (sp = PTR_ALIGN(state->orig_sp, sizeof(long)); sp;
+ sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
+ if (get_stack_info(sp, state->task, &stack_info, &visit_mask))
+ break;
+
+ for (; sp < stack_info.end; sp++) {
+
+ word = READ_ONCE_NOCHECK(*sp);
+
+ prev_zero = zero;
+ zero = word == 0;
+
+ if (zero) {
+ if (!prev_zero)
+ printk_deferred("%p: %0*x ...\n",
+ sp, BITS_PER_LONG/4, 0);
+ continue;
+ }
+
+ printk_deferred("%p: %0*lx (%pB)\n",
+ sp, BITS_PER_LONG/4, word, (void *)word);
+ }
+ }
+}
+
+static bool in_entry_code(unsigned long ip)
+{
+ char *addr = (char *)ip;
+
+ return addr >= __entry_text_start && addr < __entry_text_end;
+}
+
+static inline unsigned long *last_frame(struct unwind_state *state)
+{
+ return (unsigned long *)task_pt_regs(state->task) - 2;
+}
+
+static bool is_last_frame(struct unwind_state *state)
+{
+ return state->bp == last_frame(state);
+}
+
+#ifdef CONFIG_X86_32
+#define GCC_REALIGN_WORDS 3
+#else
+#define GCC_REALIGN_WORDS 1
+#endif
+
+static inline unsigned long *last_aligned_frame(struct unwind_state *state)
+{
+ return last_frame(state) - GCC_REALIGN_WORDS;
+}
+
+static bool is_last_aligned_frame(struct unwind_state *state)
+{
+ unsigned long *last_bp = last_frame(state);
+ unsigned long *aligned_bp = last_aligned_frame(state);
+
+ /*
+ * GCC can occasionally decide to realign the stack pointer and change
+ * the offset of the stack frame in the prologue of a function called
+ * by head/entry code. Examples:
+ *
+ * <start_secondary>:
+ * push %edi
+ * lea 0x8(%esp),%edi
+ * and $0xfffffff8,%esp
+ * pushl -0x4(%edi)
+ * push %ebp
+ * mov %esp,%ebp
+ *
+ * <x86_64_start_kernel>:
+ * lea 0x8(%rsp),%r10
+ * and $0xfffffffffffffff0,%rsp
+ * pushq -0x8(%r10)
+ * push %rbp
+ * mov %rsp,%rbp
+ *
+ * After aligning the stack, it pushes a duplicate copy of the return
+ * address before pushing the frame pointer.
+ */
+ return (state->bp == aligned_bp && *(aligned_bp + 1) == *(last_bp + 1));
+}
+
+static bool is_last_ftrace_frame(struct unwind_state *state)
+{
+ unsigned long *last_bp = last_frame(state);
+ unsigned long *last_ftrace_bp = last_bp - 3;
+
+ /*
+ * When unwinding from an ftrace handler of a function called by entry
+ * code, the stack layout of the last frame is:
+ *
+ * bp
+ * parent ret addr
+ * bp
+ * function ret addr
+ * parent ret addr
+ * pt_regs
+ * -----------------
+ */
+ return (state->bp == last_ftrace_bp &&
+ *state->bp == *(state->bp + 2) &&
+ *(state->bp + 1) == *(state->bp + 4));
+}
+
+static bool is_last_task_frame(struct unwind_state *state)
+{
+ return is_last_frame(state) || is_last_aligned_frame(state) ||
+ is_last_ftrace_frame(state);
+}
+
+/*
+ * This determines if the frame pointer actually contains an encoded pointer to
+ * pt_regs on the stack. See ENCODE_FRAME_POINTER.
+ */
+#ifdef CONFIG_X86_64
+static struct pt_regs *decode_frame_pointer(unsigned long *bp)
+{
+ unsigned long regs = (unsigned long)bp;
+
+ if (!(regs & 0x1))
+ return NULL;
+
+ return (struct pt_regs *)(regs & ~0x1);
+}
+#else
+static struct pt_regs *decode_frame_pointer(unsigned long *bp)
+{
+ unsigned long regs = (unsigned long)bp;
+
+ if (regs & 0x80000000)
+ return NULL;
+
+ return (struct pt_regs *)(regs | 0x80000000);
+}
+#endif
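[Illustrative aside, not part of the patch.] The encoding that decode_frame_pointer() undoes is applied on the entry path by the ENCODE_FRAME_POINTER macro; on 64-bit it marks a frame pointer that really points at pt_regs by setting bit 0, which a genuine frame pointer never has because stack frames are word-aligned. A standalone round-trip sketch (encode_regs_pointer and decode_regs_pointer are illustrative names, not the kernel macros):

#include <stdint.h>
#include <stdio.h>

static uintptr_t encode_regs_pointer(void *regs)
{
	return (uintptr_t)regs | 0x1;		/* mark: "this is pt_regs" */
}

static void *decode_regs_pointer(uintptr_t bp)
{
	if (!(bp & 0x1))
		return NULL;			/* ordinary frame pointer */
	return (void *)(bp & ~(uintptr_t)0x1);	/* strip the marker bit */
}

int main(void)
{
	long fake_regs[8];
	uintptr_t bp = encode_regs_pointer(fake_regs);

	printf("encoded: %#lx decoded: %p matches: %d\n",
	       (unsigned long)bp, decode_regs_pointer(bp),
	       decode_regs_pointer(bp) == (void *)fake_regs);
	return 0;
}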
+
+/*
+ * While walking the stack, KMSAN may stomp on stale locals from other
+ * functions that were marked as uninitialized upon function exit, and
+ * now hold the call frame information for the current function (e.g. the frame
+ * pointer). Because KMSAN does not specifically mark call frames as
+ * initialized, false positive reports are possible. To prevent such reports,
+ * we mark the functions scanning the stack (here and below) with
+ * __no_kmsan_checks.
+ */
+__no_kmsan_checks
+static bool update_stack_state(struct unwind_state *state,
+ unsigned long *next_bp)
+{
+ struct stack_info *info = &state->stack_info;
+ enum stack_type prev_type = info->type;
+ struct pt_regs *regs;
+ unsigned long *frame, *prev_frame_end, *addr_p, addr;
+ size_t len;
+
+ if (state->regs)
+ prev_frame_end = (void *)state->regs + sizeof(*state->regs);
+ else
+ prev_frame_end = (void *)state->bp + FRAME_HEADER_SIZE;
+
+ /* Is the next frame pointer an encoded pointer to pt_regs? */
+ regs = decode_frame_pointer(next_bp);
+ if (regs) {
+ frame = (unsigned long *)regs;
+ len = sizeof(*regs);
+ state->got_irq = true;
+ } else {
+ frame = next_bp;
+ len = FRAME_HEADER_SIZE;
+ }
+
+ /*
+ * If the next bp isn't on the current stack, switch to the next one.
+ *
+ * We may have to traverse multiple stacks to deal with the possibility
+ * that info->next_sp could point to an empty stack and the next bp
+ * could be on a subsequent stack.
+ */
+ while (!on_stack(info, frame, len))
+ if (get_stack_info(info->next_sp, state->task, info,
+ &state->stack_mask))
+ return false;
+
+ /* Make sure it only unwinds up and doesn't overlap the prev frame: */
+ if (state->orig_sp && state->stack_info.type == prev_type &&
+ frame < prev_frame_end)
+ return false;
+
+ /* Move state to the next frame: */
+ if (regs) {
+ state->regs = regs;
+ state->bp = NULL;
+ } else {
+ state->bp = next_bp;
+ state->regs = NULL;
+ }
+
+ /* Save the return address: */
+ if (state->regs && user_mode(state->regs))
+ state->ip = 0;
+ else {
+ addr_p = unwind_get_return_address_ptr(state);
+ addr = READ_ONCE_TASK_STACK(state->task, *addr_p);
+ state->ip = unwind_recover_ret_addr(state, addr, addr_p);
+ }
+
+ /* Save the original stack pointer for unwind_dump(): */
+ if (!state->orig_sp)
+ state->orig_sp = frame;
+
+ return true;
+}
+
+__no_kmsan_checks
+bool unwind_next_frame(struct unwind_state *state)
+{
+ struct pt_regs *regs;
+ unsigned long *next_bp;
+
+ if (unwind_done(state))
+ return false;
+
+ /* Have we reached the end? */
+ if (state->regs && user_mode(state->regs))
+ goto the_end;
+
+ if (is_last_task_frame(state)) {
+ regs = task_pt_regs(state->task);
+
+ /*
+ * kthreads (other than the boot CPU's idle thread) have some
+ * partial regs at the end of their stack which were placed
+ * there by copy_thread(). But the regs don't have any
+ * useful information, so we can skip them.
+ *
+ * This user_mode() check is slightly broader than a PF_KTHREAD
+ * check because it also catches the awkward situation where a
+ * newly forked kthread transitions into a user task by calling
+ * kernel_execve(), which eventually clears PF_KTHREAD.
+ */
+ if (!user_mode(regs))
+ goto the_end;
+
+ /*
+ * We're almost at the end, but not quite: there's still the
+ * syscall regs frame. Entry code doesn't encode the regs
+ * pointer for syscalls, so we have to set it manually.
+ */
+ state->regs = regs;
+ state->bp = NULL;
+ state->ip = 0;
+ return true;
+ }
+
+ /* Get the next frame pointer: */
+ if (state->next_bp) {
+ next_bp = state->next_bp;
+ state->next_bp = NULL;
+ } else if (state->regs) {
+ next_bp = (unsigned long *)state->regs->bp;
+ } else {
+ next_bp = (unsigned long *)READ_ONCE_TASK_STACK(state->task, *state->bp);
+ }
+
+ /* Move to the next frame if it's safe: */
+ if (!update_stack_state(state, next_bp))
+ goto bad_address;
+
+ return true;
+
+bad_address:
+ state->error = true;
+
+ /*
+ * When unwinding a non-current task, the task might actually be
+ * running on another CPU, in which case it could be modifying its
+ * stack while we're reading it. This is generally not a problem and
+ * can be ignored as long as the caller understands that unwinding
+ * another task will not always succeed.
+ */
+ if (state->task != current)
+ goto the_end;
+
+ /*
+ * Don't warn if the unwinder got lost due to an interrupt in entry
+ * code or in the C handler before the first frame pointer got set up:
+ */
+ if (state->got_irq && in_entry_code(state->ip))
+ goto the_end;
+ if (state->regs &&
+ state->regs->sp >= (unsigned long)last_aligned_frame(state) &&
+ state->regs->sp < (unsigned long)task_pt_regs(state->task))
+ goto the_end;
+
+ /*
+ * There are some known frame pointer issues on 32-bit. Disable
+ * unwinder warnings on 32-bit until it gets objtool support.
+ */
+ if (IS_ENABLED(CONFIG_X86_32))
+ goto the_end;
+
+ if (state->task != current)
+ goto the_end;
+
+ if (state->regs) {
+ printk_deferred_once(KERN_WARNING
+ "WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n",
+ state->regs, state->task->comm,
+ state->task->pid, next_bp);
+ unwind_dump(state);
+ } else {
+ printk_deferred_once(KERN_WARNING
+ "WARNING: kernel stack frame pointer at %p in %s:%d has bad value %p\n",
+ state->bp, state->task->comm,
+ state->task->pid, next_bp);
+ unwind_dump(state);
+ }
+the_end:
+ state->stack_info.type = STACK_TYPE_UNKNOWN;
+ return false;
+}
+EXPORT_SYMBOL_GPL(unwind_next_frame);
+
+void __unwind_start(struct unwind_state *state, struct task_struct *task,
+ struct pt_regs *regs, unsigned long *first_frame)
+{
+ unsigned long *bp;
+
+ memset(state, 0, sizeof(*state));
+ state->task = task;
+ state->got_irq = (regs);
+
+ /* Don't even attempt to start from user mode regs: */
+ if (regs && user_mode(regs)) {
+ state->stack_info.type = STACK_TYPE_UNKNOWN;
+ return;
+ }
+
+ bp = get_frame_pointer(task, regs);
+
+ /*
+ * If we crash with IP==0, the last successfully executed instruction
+ * was probably an indirect function call with a NULL function pointer.
+ * That means that SP points into the middle of an incomplete frame:
+ * *SP is a return pointer, and *(SP-sizeof(unsigned long)) is where we
+ * would have written a frame pointer if we hadn't crashed.
+ * Pretend that the frame is complete and that BP points to it, but save
+ * the real BP so that we can use it when looking for the next frame.
+ */
+ if (regs && regs->ip == 0 && (unsigned long *)regs->sp >= first_frame) {
+ state->next_bp = bp;
+ bp = ((unsigned long *)regs->sp) - 1;
+ }
+
+ /* Initialize stack info and make sure the frame data is accessible: */
+ get_stack_info(bp, state->task, &state->stack_info,
+ &state->stack_mask);
+ update_stack_state(state, bp);
+
+ /*
+ * The caller can provide the address of the first frame directly
+ * (first_frame) or indirectly (regs->sp) to indicate which stack frame
+ * to start unwinding at. Skip ahead until we reach it.
+ */
+ while (!unwind_done(state) &&
+ (!on_stack(&state->stack_info, first_frame, sizeof(long)) ||
+ (state->next_bp == NULL && state->bp < first_frame)))
+ unwind_next_frame(state);
+}
+EXPORT_SYMBOL_GPL(__unwind_start);
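[Illustrative aside, not part of the patch.] The frame layout this unwinder follows is the usual x86 convention: after the prologue, *bp holds the caller's saved frame pointer and *(bp + 1) holds the return address, which is why FRAME_HEADER_SIZE above is two longs and the return-address pointer is state->bp + 1. A very small user-space walk of the same chain, assuming a build with frame pointers (e.g. gcc -O0 -fno-omit-frame-pointer); backtrace_fp, leaf and mid are illustrative names and the loop is bounded because runtime start-up frames may not follow the convention:

#include <stdint.h>
#include <stdio.h>

static void backtrace_fp(void)
{
	uintptr_t *bp = __builtin_frame_address(0);
	int depth = 0;

	while (bp && depth < 16) {
		uintptr_t ret = bp[1];		/* return address sits above saved bp */

		if (!ret)
			break;
		printf("#%d  %#lx\n", depth++, (unsigned long)ret);
		bp = (uintptr_t *)bp[0];	/* follow the saved frame pointer */
	}
}

static void __attribute__((noinline)) leaf(void) { backtrace_fp(); }
static void __attribute__((noinline)) mid(void)  { leaf(); }

int main(void)
{
	mid();
	return 0;
}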
diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c
new file mode 100644
index 000000000000..884d68a6e714
--- /dev/null
+++ b/arch/x86/kernel/unwind_guess.c
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/sched.h>
+#include <linux/ftrace.h>
+#include <asm/ptrace.h>
+#include <asm/bitops.h>
+#include <asm/stacktrace.h>
+#include <asm/unwind.h>
+
+unsigned long unwind_get_return_address(struct unwind_state *state)
+{
+ unsigned long addr;
+
+ if (unwind_done(state))
+ return 0;
+
+ addr = READ_ONCE_NOCHECK(*state->sp);
+
+ return unwind_recover_ret_addr(state, addr, state->sp);
+}
+EXPORT_SYMBOL_GPL(unwind_get_return_address);
+
+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+{
+ return NULL;
+}
+
+bool unwind_next_frame(struct unwind_state *state)
+{
+ struct stack_info *info = &state->stack_info;
+
+ if (unwind_done(state))
+ return false;
+
+ do {
+ for (state->sp++; state->sp < info->end; state->sp++) {
+ unsigned long addr = READ_ONCE_NOCHECK(*state->sp);
+
+ if (__kernel_text_address(addr))
+ return true;
+ }
+
+ state->sp = PTR_ALIGN(info->next_sp, sizeof(long));
+
+ } while (!get_stack_info(state->sp, state->task, info,
+ &state->stack_mask));
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(unwind_next_frame);
+
+void __unwind_start(struct unwind_state *state, struct task_struct *task,
+ struct pt_regs *regs, unsigned long *first_frame)
+{
+ memset(state, 0, sizeof(*state));
+
+ state->task = task;
+ state->sp = PTR_ALIGN(first_frame, sizeof(long));
+
+ get_stack_info(first_frame, state->task, &state->stack_info,
+ &state->stack_mask);
+
+ /*
+ * The caller can provide the address of the first frame directly
+ * (first_frame) or indirectly (regs->sp) to indicate which stack frame
+ * to start unwinding at. Skip ahead until we reach it.
+ */
+ if (!unwind_done(state) &&
+ (!on_stack(&state->stack_info, first_frame, sizeof(long)) ||
+ !__kernel_text_address(*first_frame)))
+ unwind_next_frame(state);
+}
+EXPORT_SYMBOL_GPL(__unwind_start);
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
new file mode 100644
index 000000000000..977ee75e047c
--- /dev/null
+++ b/arch/x86/kernel/unwind_orc.c
@@ -0,0 +1,767 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/objtool.h>
+#include <linux/module.h>
+#include <linux/sort.h>
+#include <asm/ptrace.h>
+#include <asm/stacktrace.h>
+#include <asm/unwind.h>
+#include <asm/orc_types.h>
+#include <asm/orc_lookup.h>
+#include <asm/orc_header.h>
+
+ORC_HEADER;
+
+#define orc_warn(fmt, ...) \
+ printk_deferred_once(KERN_WARNING "WARNING: " fmt, ##__VA_ARGS__)
+
+#define orc_warn_current(args...) \
+({ \
+ static bool dumped_before; \
+ if (state->task == current && !state->error) { \
+ orc_warn(args); \
+ if (unwind_debug && !dumped_before) { \
+ dumped_before = true; \
+ unwind_dump(state); \
+ } \
+ } \
+})
+
+extern int __start_orc_unwind_ip[];
+extern int __stop_orc_unwind_ip[];
+extern struct orc_entry __start_orc_unwind[];
+extern struct orc_entry __stop_orc_unwind[];
+
+static bool orc_init __ro_after_init;
+static bool unwind_debug __ro_after_init;
+static unsigned int lookup_num_blocks __ro_after_init;
+
+static int __init unwind_debug_cmdline(char *str)
+{
+ unwind_debug = true;
+
+ return 0;
+}
+early_param("unwind_debug", unwind_debug_cmdline);
+
+static void unwind_dump(struct unwind_state *state)
+{
+ static bool dumped_before;
+ unsigned long word, *sp;
+ struct stack_info stack_info = {0};
+ unsigned long visit_mask = 0;
+
+ if (dumped_before)
+ return;
+
+ dumped_before = true;
+
+ printk_deferred("unwind stack type:%d next_sp:%p mask:0x%lx graph_idx:%d\n",
+ state->stack_info.type, state->stack_info.next_sp,
+ state->stack_mask, state->graph_idx);
+
+ for (sp = __builtin_frame_address(0); sp;
+ sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
+ if (get_stack_info(sp, state->task, &stack_info, &visit_mask))
+ break;
+
+ for (; sp < stack_info.end; sp++) {
+
+ word = READ_ONCE_NOCHECK(*sp);
+
+ printk_deferred("%0*lx: %0*lx (%pB)\n", BITS_PER_LONG/4,
+ (unsigned long)sp, BITS_PER_LONG/4,
+ word, (void *)word);
+ }
+ }
+}
+
+static inline unsigned long orc_ip(const int *ip)
+{
+ return (unsigned long)ip + *ip;
+}
+
+static struct orc_entry *__orc_find(int *ip_table, struct orc_entry *u_table,
+ unsigned int num_entries, unsigned long ip)
+{
+ int *first = ip_table;
+ int *last = ip_table + num_entries - 1;
+ int *mid, *found = first;
+
+ if (!num_entries)
+ return NULL;
+
+ /*
+ * Do a binary range search to find the rightmost duplicate of a given
+ * starting address. Some entries are section terminators which are
+ * "weak" entries for ensuring there are no gaps. They should be
+ * ignored when they conflict with a real entry.
+ */
+ while (first <= last) {
+ mid = first + ((last - first) / 2);
+
+ if (orc_ip(mid) <= ip) {
+ found = mid;
+ first = mid + 1;
+ } else
+ last = mid - 1;
+ }
+
+ return u_table + (found - ip_table);
+}
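[Illustrative aside, not part of the patch.] The lookup above is a "rightmost entry whose key is <= ip" binary search, which is what lets a real entry win over a weak terminator that shares the same starting address. The same search shape on a plain sorted array (find_rightmost_le is an illustrative name; in __orc_find() the keys are the self-relative .orc_unwind_ip entries resolved by orc_ip()):

#include <stdio.h>

static int find_rightmost_le(const unsigned long *tbl, int n, unsigned long key)
{
	int lo = 0, hi = n - 1, found = 0;

	while (lo <= hi) {
		int mid = lo + (hi - lo) / 2;

		if (tbl[mid] <= key) {
			found = mid;	/* candidate, keep searching to the right */
			lo = mid + 1;
		} else {
			hi = mid - 1;
		}
	}
	return found;
}

int main(void)
{
	unsigned long tbl[] = { 0x100, 0x180, 0x180, 0x200 };

	/* 0x190 falls in the range starting at the second 0x180 entry. */
	printf("%d\n", find_rightmost_le(tbl, 4, 0x190));	/* prints 2 */
	return 0;
}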
+
+#ifdef CONFIG_MODULES
+static struct orc_entry *orc_module_find(unsigned long ip)
+{
+ struct module *mod;
+
+ mod = __module_address(ip);
+ if (!mod || !mod->arch.orc_unwind || !mod->arch.orc_unwind_ip)
+ return NULL;
+ return __orc_find(mod->arch.orc_unwind_ip, mod->arch.orc_unwind,
+ mod->arch.num_orcs, ip);
+}
+#else
+static struct orc_entry *orc_module_find(unsigned long ip)
+{
+ return NULL;
+}
+#endif
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+static struct orc_entry *orc_find(unsigned long ip);
+
+/*
+ * Ftrace dynamic trampolines do not have orc entries of their own.
+ * But they are copies of the ftrace entries that are static and
+ * defined in ftrace_*.S, which do have orc entries.
+ *
+ * If the unwinder comes across a ftrace trampoline, then find the
+ * ftrace function that was used to create it, and use that ftrace
+ * function's orc entry, as the placement of the return code in
+ * the stack will be identical.
+ */
+static struct orc_entry *orc_ftrace_find(unsigned long ip)
+{
+ struct ftrace_ops *ops;
+ unsigned long tramp_addr, offset;
+
+ ops = ftrace_ops_trampoline(ip);
+ if (!ops)
+ return NULL;
+
+ /* Set tramp_addr to the start of the code copied by the trampoline */
+ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS)
+ tramp_addr = (unsigned long)ftrace_regs_caller;
+ else
+ tramp_addr = (unsigned long)ftrace_caller;
+
+ /* Now move tramp_addr to the location within the trampoline that ip is at */
+ offset = ip - ops->trampoline;
+ tramp_addr += offset;
+
+ /* Prevent unlikely recursion */
+ if (ip == tramp_addr)
+ return NULL;
+
+ return orc_find(tramp_addr);
+}
+#else
+static struct orc_entry *orc_ftrace_find(unsigned long ip)
+{
+ return NULL;
+}
+#endif
+
+/*
+ * If we crash with IP==0, the last successfully executed instruction
+ * was probably an indirect function call with a NULL function pointer,
+ * and we don't have unwind information for NULL.
+ * This hardcoded ORC entry for IP==0 allows us to unwind from a NULL function
+ * pointer into its parent and then continue normally from there.
+ */
+static struct orc_entry null_orc_entry = {
+ .sp_offset = sizeof(long),
+ .sp_reg = ORC_REG_SP,
+ .bp_reg = ORC_REG_UNDEFINED,
+ .type = ORC_TYPE_CALL
+};
+
+/* Fake frame pointer entry -- used as a fallback for generated code */
+static struct orc_entry orc_fp_entry = {
+ .type = ORC_TYPE_CALL,
+ .sp_reg = ORC_REG_BP,
+ .sp_offset = 16,
+ .bp_reg = ORC_REG_PREV_SP,
+ .bp_offset = -16,
+};
+
+static struct orc_entry *orc_find(unsigned long ip)
+{
+ static struct orc_entry *orc;
+
+ if (ip == 0)
+ return &null_orc_entry;
+
+ /* For non-init vmlinux addresses, use the fast lookup table: */
+ if (ip >= LOOKUP_START_IP && ip < LOOKUP_STOP_IP) {
+ unsigned int idx, start, stop;
+
+ idx = (ip - LOOKUP_START_IP) / LOOKUP_BLOCK_SIZE;
+
+ if (unlikely((idx >= lookup_num_blocks-1))) {
+ orc_warn("WARNING: bad lookup idx: idx=%u num=%u ip=%pB\n",
+ idx, lookup_num_blocks, (void *)ip);
+ return NULL;
+ }
+
+ start = orc_lookup[idx];
+ stop = orc_lookup[idx + 1] + 1;
+
+ if (unlikely((__start_orc_unwind + start >= __stop_orc_unwind) ||
+ (__start_orc_unwind + stop > __stop_orc_unwind))) {
+ orc_warn("WARNING: bad lookup value: idx=%u num=%u start=%u stop=%u ip=%pB\n",
+ idx, lookup_num_blocks, start, stop, (void *)ip);
+ return NULL;
+ }
+
+ return __orc_find(__start_orc_unwind_ip + start,
+ __start_orc_unwind + start, stop - start, ip);
+ }
+
+ /* vmlinux .init slow lookup: */
+ if (is_kernel_inittext(ip))
+ return __orc_find(__start_orc_unwind_ip, __start_orc_unwind,
+ __stop_orc_unwind_ip - __start_orc_unwind_ip, ip);
+
+ /* Module lookup: */
+ orc = orc_module_find(ip);
+ if (orc)
+ return orc;
+
+ return orc_ftrace_find(ip);
+}
+
+#ifdef CONFIG_MODULES
+
+static DEFINE_MUTEX(sort_mutex);
+static int *cur_orc_ip_table = __start_orc_unwind_ip;
+static struct orc_entry *cur_orc_table = __start_orc_unwind;
+
+static void orc_sort_swap(void *_a, void *_b, int size)
+{
+ struct orc_entry *orc_a, *orc_b;
+ int *a = _a, *b = _b, tmp;
+ int delta = _b - _a;
+
+ /* Swap the .orc_unwind_ip entries: */
+ tmp = *a;
+ *a = *b + delta;
+ *b = tmp - delta;
+
+ /* Swap the corresponding .orc_unwind entries: */
+ orc_a = cur_orc_table + (a - cur_orc_ip_table);
+ orc_b = cur_orc_table + (b - cur_orc_ip_table);
+ swap(*orc_a, *orc_b);
+}
+
+static int orc_sort_cmp(const void *_a, const void *_b)
+{
+ struct orc_entry *orc_a;
+ const int *a = _a, *b = _b;
+ unsigned long a_val = orc_ip(a);
+ unsigned long b_val = orc_ip(b);
+
+ if (a_val > b_val)
+ return 1;
+ if (a_val < b_val)
+ return -1;
+
+ /*
+ * The "weak" section terminator entries need to always be first
+ * to ensure the lookup code skips them in favor of real entries.
+ * These terminator entries exist to handle any gaps created by
+ * whitelisted .o files which didn't get objtool generation.
+ */
+ orc_a = cur_orc_table + (a - cur_orc_ip_table);
+ return orc_a->type == ORC_TYPE_UNDEFINED ? -1 : 1;
+}
+
+void unwind_module_init(struct module *mod, void *_orc_ip, size_t orc_ip_size,
+ void *_orc, size_t orc_size)
+{
+ int *orc_ip = _orc_ip;
+ struct orc_entry *orc = _orc;
+ unsigned int num_entries = orc_ip_size / sizeof(int);
+
+ WARN_ON_ONCE(orc_ip_size % sizeof(int) != 0 ||
+ orc_size % sizeof(*orc) != 0 ||
+ num_entries != orc_size / sizeof(*orc));
+
+ /*
+ * The 'cur_orc_*' globals allow the orc_sort_swap() callback to
+ * associate an .orc_unwind_ip table entry with its corresponding
+ * .orc_unwind entry so they can both be swapped.
+ */
+ mutex_lock(&sort_mutex);
+ cur_orc_ip_table = orc_ip;
+ cur_orc_table = orc;
+ sort(orc_ip, num_entries, sizeof(int), orc_sort_cmp, orc_sort_swap);
+ mutex_unlock(&sort_mutex);
+
+ mod->arch.orc_unwind_ip = orc_ip;
+ mod->arch.orc_unwind = orc;
+ mod->arch.num_orcs = num_entries;
+}
+#endif
+
+void __init unwind_init(void)
+{
+ size_t orc_ip_size = (void *)__stop_orc_unwind_ip - (void *)__start_orc_unwind_ip;
+ size_t orc_size = (void *)__stop_orc_unwind - (void *)__start_orc_unwind;
+ size_t num_entries = orc_ip_size / sizeof(int);
+ struct orc_entry *orc;
+ int i;
+
+ if (!num_entries || orc_ip_size % sizeof(int) != 0 ||
+ orc_size % sizeof(struct orc_entry) != 0 ||
+ num_entries != orc_size / sizeof(struct orc_entry)) {
+ orc_warn("WARNING: Bad or missing .orc_unwind table. Disabling unwinder.\n");
+ return;
+ }
+
+ /*
+ * Note, the orc_unwind and orc_unwind_ip tables were already
+ * sorted at build time via the 'sorttable' tool.
+ * They are ready for binary search straight away, no need to sort them.
+ */
+
+ /* Initialize the fast lookup table: */
+ lookup_num_blocks = orc_lookup_end - orc_lookup;
+ for (i = 0; i < lookup_num_blocks-1; i++) {
+ orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind,
+ num_entries,
+ LOOKUP_START_IP + (LOOKUP_BLOCK_SIZE * i));
+ if (!orc) {
+ orc_warn("WARNING: Corrupt .orc_unwind table. Disabling unwinder.\n");
+ return;
+ }
+
+ orc_lookup[i] = orc - __start_orc_unwind;
+ }
+
+ /* Initialize the ending block: */
+ orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind, num_entries,
+ LOOKUP_STOP_IP);
+ if (!orc) {
+ orc_warn("WARNING: Corrupt .orc_unwind table. Disabling unwinder.\n");
+ return;
+ }
+ orc_lookup[lookup_num_blocks-1] = orc - __start_orc_unwind;
+
+ orc_init = true;
+}
+
+unsigned long unwind_get_return_address(struct unwind_state *state)
+{
+ if (unwind_done(state))
+ return 0;
+
+ return __kernel_text_address(state->ip) ? state->ip : 0;
+}
+EXPORT_SYMBOL_GPL(unwind_get_return_address);
+
+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+{
+ if (unwind_done(state))
+ return NULL;
+
+ if (state->regs)
+ return &state->regs->ip;
+
+ if (state->sp)
+ return (unsigned long *)state->sp - 1;
+
+ return NULL;
+}
+
+static bool stack_access_ok(struct unwind_state *state, unsigned long _addr,
+ size_t len)
+{
+ struct stack_info *info = &state->stack_info;
+ void *addr = (void *)_addr;
+
+ if (on_stack(info, addr, len))
+ return true;
+
+ return !get_stack_info(addr, state->task, info, &state->stack_mask) &&
+ on_stack(info, addr, len);
+}
+
+static bool deref_stack_reg(struct unwind_state *state, unsigned long addr,
+ unsigned long *val)
+{
+ if (!stack_access_ok(state, addr, sizeof(long)))
+ return false;
+
+ *val = READ_ONCE_NOCHECK(*(unsigned long *)addr);
+ return true;
+}
+
+static bool deref_stack_regs(struct unwind_state *state, unsigned long addr,
+ unsigned long *ip, unsigned long *sp)
+{
+ struct pt_regs *regs = (struct pt_regs *)addr;
+
+ /* x86-32 support will be more complicated due to the &regs->sp hack */
+ BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_32));
+
+ if (!stack_access_ok(state, addr, sizeof(struct pt_regs)))
+ return false;
+
+ *ip = READ_ONCE_NOCHECK(regs->ip);
+ *sp = READ_ONCE_NOCHECK(regs->sp);
+ return true;
+}
+
+static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr,
+ unsigned long *ip, unsigned long *sp)
+{
+ struct pt_regs *regs = (void *)addr - IRET_FRAME_OFFSET;
+
+ if (!stack_access_ok(state, addr, IRET_FRAME_SIZE))
+ return false;
+
+ *ip = READ_ONCE_NOCHECK(regs->ip);
+ *sp = READ_ONCE_NOCHECK(regs->sp);
+ return true;
+}
+
+/*
+ * If state->regs is non-NULL, and points to a full pt_regs, just get the reg
+ * value from state->regs.
+ *
+ * Otherwise, if state->regs just points to IRET regs, and the previous frame
+ * had full regs, it's safe to get the value from the previous regs. This can
+ * happen when early/late IRQ entry code gets interrupted by an NMI.
+ */
+static bool get_reg(struct unwind_state *state, unsigned int reg_off,
+ unsigned long *val)
+{
+ unsigned int reg = reg_off/8;
+
+ if (!state->regs)
+ return false;
+
+ if (state->full_regs) {
+ *val = READ_ONCE_NOCHECK(((unsigned long *)state->regs)[reg]);
+ return true;
+ }
+
+ if (state->prev_regs) {
+ *val = READ_ONCE_NOCHECK(((unsigned long *)state->prev_regs)[reg]);
+ return true;
+ }
+
+ return false;
+}
+
+bool unwind_next_frame(struct unwind_state *state)
+{
+ unsigned long ip_p, sp, tmp, orig_ip = state->ip, prev_sp = state->sp;
+ enum stack_type prev_type = state->stack_info.type;
+ struct orc_entry *orc;
+ bool indirect = false;
+
+ if (unwind_done(state))
+ return false;
+
+ /* Don't let modules unload while we're reading their ORC data. */
+ guard(rcu)();
+
+ /* End-of-stack check for user tasks: */
+ if (state->regs && user_mode(state->regs))
+ goto the_end;
+
+ /*
+ * Find the orc_entry associated with the text address.
+ *
+ * For a call frame (as opposed to a signal frame), state->ip points to
+ * the instruction after the call. That instruction's stack layout
+ * could be different from the call instruction's layout, for example
+ * if the call was to a noreturn function. So get the ORC data for the
+ * call instruction itself.
+ */
+ orc = orc_find(state->signal ? state->ip : state->ip - 1);
+ if (!orc) {
+ /*
+ * As a fallback, try to assume this code uses a frame pointer.
+ * This is useful for generated code, like BPF, which ORC
+ * doesn't know about. This is just a guess, so the rest of
+ * the unwind is no longer considered reliable.
+ */
+ orc = &orc_fp_entry;
+ state->error = true;
+ } else {
+ if (orc->type == ORC_TYPE_UNDEFINED)
+ goto err;
+
+ if (orc->type == ORC_TYPE_END_OF_STACK)
+ goto the_end;
+ }
+
+ state->signal = orc->signal;
+
+ /* Find the previous frame's stack: */
+ switch (orc->sp_reg) {
+ case ORC_REG_SP:
+ sp = state->sp + orc->sp_offset;
+ break;
+
+ case ORC_REG_BP:
+ sp = state->bp + orc->sp_offset;
+ break;
+
+ case ORC_REG_SP_INDIRECT:
+ sp = state->sp;
+ indirect = true;
+ break;
+
+ case ORC_REG_BP_INDIRECT:
+ sp = state->bp + orc->sp_offset;
+ indirect = true;
+ break;
+
+ case ORC_REG_R10:
+ if (!get_reg(state, offsetof(struct pt_regs, r10), &sp)) {
+ orc_warn_current("missing R10 value at %pB\n",
+ (void *)state->ip);
+ goto err;
+ }
+ break;
+
+ case ORC_REG_R13:
+ if (!get_reg(state, offsetof(struct pt_regs, r13), &sp)) {
+ orc_warn_current("missing R13 value at %pB\n",
+ (void *)state->ip);
+ goto err;
+ }
+ break;
+
+ case ORC_REG_DI:
+ if (!get_reg(state, offsetof(struct pt_regs, di), &sp)) {
+ orc_warn_current("missing RDI value at %pB\n",
+ (void *)state->ip);
+ goto err;
+ }
+ break;
+
+ case ORC_REG_DX:
+ if (!get_reg(state, offsetof(struct pt_regs, dx), &sp)) {
+ orc_warn_current("missing DX value at %pB\n",
+ (void *)state->ip);
+ goto err;
+ }
+ break;
+
+ default:
+ orc_warn("unknown SP base reg %d at %pB\n",
+ orc->sp_reg, (void *)state->ip);
+ goto err;
+ }
+
+ if (indirect) {
+ if (!deref_stack_reg(state, sp, &sp))
+ goto err;
+
+ if (orc->sp_reg == ORC_REG_SP_INDIRECT)
+ sp += orc->sp_offset;
+ }
+
+ /* Find IP, SP and possibly regs: */
+ switch (orc->type) {
+ case ORC_TYPE_CALL:
+ ip_p = sp - sizeof(long);
+
+ if (!deref_stack_reg(state, ip_p, &state->ip))
+ goto err;
+
+ state->ip = unwind_recover_ret_addr(state, state->ip,
+ (unsigned long *)ip_p);
+ state->sp = sp;
+ state->regs = NULL;
+ state->prev_regs = NULL;
+ break;
+
+ case ORC_TYPE_REGS:
+ if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) {
+ orc_warn_current("can't access registers at %pB\n",
+ (void *)orig_ip);
+ goto err;
+ }
+ /*
+ * There is a small chance of an interrupt at the entry of
+ * arch_rethook_trampoline() where the ORC info doesn't exist.
+ * That point is right after the RET to arch_rethook_trampoline(),
+ * which was the modified return address.
+ * At that point, the @addr_p of unwind_recover_rethook()
+ * (this has to point to the address of the stack entry storing
+ * the modified return address) must be "SP - (a stack entry)"
+ * because SP is incremented by the RET.
+ */
+ state->ip = unwind_recover_rethook(state, state->ip,
+ (unsigned long *)(state->sp - sizeof(long)));
+ state->regs = (struct pt_regs *)sp;
+ state->prev_regs = NULL;
+ state->full_regs = true;
+ break;
+
+ case ORC_TYPE_REGS_PARTIAL:
+ if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) {
+ orc_warn_current("can't access iret registers at %pB\n",
+ (void *)orig_ip);
+ goto err;
+ }
+ /* See ORC_TYPE_REGS case comment. */
+ state->ip = unwind_recover_rethook(state, state->ip,
+ (unsigned long *)(state->sp - sizeof(long)));
+
+ if (state->full_regs)
+ state->prev_regs = state->regs;
+ state->regs = (void *)sp - IRET_FRAME_OFFSET;
+ state->full_regs = false;
+ break;
+
+ default:
+ orc_warn("unknown .orc_unwind entry type %d at %pB\n",
+ orc->type, (void *)orig_ip);
+ goto err;
+ }
+
+ /* Find BP: */
+ switch (orc->bp_reg) {
+ case ORC_REG_UNDEFINED:
+ if (get_reg(state, offsetof(struct pt_regs, bp), &tmp))
+ state->bp = tmp;
+ break;
+
+ case ORC_REG_PREV_SP:
+ if (!deref_stack_reg(state, sp + orc->bp_offset, &state->bp))
+ goto err;
+ break;
+
+ case ORC_REG_BP:
+ if (!deref_stack_reg(state, state->bp + orc->bp_offset, &state->bp))
+ goto err;
+ break;
+
+ default:
+ orc_warn("unknown BP base reg %d for ip %pB\n",
+ orc->bp_reg, (void *)orig_ip);
+ goto err;
+ }
+
+ /* Prevent a recursive loop due to bad ORC data: */
+ if (state->stack_info.type == prev_type &&
+ on_stack(&state->stack_info, (void *)state->sp, sizeof(long)) &&
+ state->sp <= prev_sp) {
+ orc_warn_current("stack going in the wrong direction? at %pB\n",
+ (void *)orig_ip);
+ goto err;
+ }
+
+ return true;
+
+err:
+ state->error = true;
+
+the_end:
+ state->stack_info.type = STACK_TYPE_UNKNOWN;
+ return false;
+}
+EXPORT_SYMBOL_GPL(unwind_next_frame);
+
+void __unwind_start(struct unwind_state *state, struct task_struct *task,
+ struct pt_regs *regs, unsigned long *first_frame)
+{
+ memset(state, 0, sizeof(*state));
+ state->task = task;
+
+ if (!orc_init)
+ goto err;
+
+ /*
+ * Refuse to unwind the stack of a task while it's executing on another
+ * CPU. This check is racy, but that's ok: the unwinder has other
+ * checks to prevent it from going off the rails.
+ */
+ if (task_on_another_cpu(task))
+ goto err;
+
+ if (regs) {
+ if (user_mode(regs))
+ goto the_end;
+
+ state->ip = regs->ip;
+ state->sp = regs->sp;
+ state->bp = regs->bp;
+ state->regs = regs;
+ state->full_regs = true;
+ state->signal = true;
+
+ } else if (task == current) {
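+		/*
+		 * Unwinding the current task: read ip/sp/bp straight from the
+		 * CPU so the unwind starts right here.
+		 */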
+ asm volatile("lea (%%rip), %0\n\t"
+ "mov %%rsp, %1\n\t"
+ "mov %%rbp, %2\n\t"
+ : "=r" (state->ip), "=r" (state->sp),
+ "=r" (state->bp));
+
+ } else {
+ struct inactive_task_frame *frame = (void *)task->thread.sp;
+
+ state->sp = task->thread.sp + sizeof(*frame);
+ state->bp = READ_ONCE_NOCHECK(frame->bp);
+ state->ip = READ_ONCE_NOCHECK(frame->ret_addr);
+ state->signal = (void *)state->ip == ret_from_fork_asm;
+ }
+
+ if (get_stack_info((unsigned long *)state->sp, state->task,
+ &state->stack_info, &state->stack_mask)) {
+ /*
+ * We weren't on a valid stack. It's possible that
+ * we overflowed a valid stack into a guard page.
+ * See if the next page up is valid so that we can
+ * generate some kind of backtrace if this happens.
+ */
+ void *next_page = (void *)PAGE_ALIGN((unsigned long)state->sp);
+ state->error = true;
+ if (get_stack_info(next_page, state->task, &state->stack_info,
+ &state->stack_mask))
+ return;
+ }
+
+ /*
+ * The caller can provide the address of the first frame directly
+ * (first_frame) or indirectly (regs->sp) to indicate which stack frame
+ * to start unwinding at. Skip ahead until we reach it.
+ */
+
+ /* When starting from regs, skip the regs frame: */
+ if (regs) {
+ unwind_next_frame(state);
+ return;
+ }
+
+ /* Otherwise, skip ahead to the user-specified starting frame: */
+ while (!unwind_done(state) &&
+ (!on_stack(&state->stack_info, first_frame, sizeof(long)) ||
+ state->sp <= (unsigned long)first_frame))
+ unwind_next_frame(state);
+
+ return;
+
+err:
+ state->error = true;
+the_end:
+ state->stack_info.type = STACK_TYPE_UNKNOWN;
+}
+EXPORT_SYMBOL_GPL(__unwind_start);
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
new file mode 100644
index 000000000000..7be8e361ca55
--- /dev/null
+++ b/arch/x86/kernel/uprobes.c
@@ -0,0 +1,1825 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * User-space Probes (UProbes) for x86
+ *
+ * Copyright (C) IBM Corporation, 2008-2011
+ * Authors:
+ * Srikar Dronamraju
+ * Jim Keniston
+ */
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/ptrace.h>
+#include <linux/uprobes.h>
+#include <linux/uaccess.h>
+#include <linux/syscalls.h>
+
+#include <linux/kdebug.h>
+#include <asm/processor.h>
+#include <asm/insn.h>
+#include <asm/insn-eval.h>
+#include <asm/mmu_context.h>
+#include <asm/nops.h>
+
+/* Post-execution fixups. */
+
+/* Adjust IP back to vicinity of actual insn */
+#define UPROBE_FIX_IP 0x01
+
+/* Adjust the return address of a call insn */
+#define UPROBE_FIX_CALL 0x02
+
+/* Instruction will modify TF, don't change it */
+#define UPROBE_FIX_SETF 0x04
+
+#define UPROBE_FIX_RIP_SI 0x08
+#define UPROBE_FIX_RIP_DI 0x10
+#define UPROBE_FIX_RIP_BX 0x20
+#define UPROBE_FIX_RIP_MASK \
+ (UPROBE_FIX_RIP_SI | UPROBE_FIX_RIP_DI | UPROBE_FIX_RIP_BX)
+
+#define UPROBE_TRAP_NR UINT_MAX
+
+/* Adaptations for mhiramat x86 decoder v14. */
+#define OPCODE1(insn) ((insn)->opcode.bytes[0])
+#define OPCODE2(insn) ((insn)->opcode.bytes[1])
+#define OPCODE3(insn) ((insn)->opcode.bytes[2])
+#define MODRM_REG(insn) X86_MODRM_REG((insn)->modrm.value)
+
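+/*
+ * Each W() row packs 16 per-opcode "good instruction" bits; the
+ * "<< (row % 32)" shift places two consecutive rows into one 32-bit array
+ * element, which is why the rows below are joined alternately by '|' and ','.
+ */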
+#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
+ (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
+ (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
+ (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
+ (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
+ << (row % 32))
+
+/*
+ * Good-instruction tables for 32-bit apps. This is non-const and volatile
+ * to keep gcc from statically optimizing it out, as variable_test_bit makes
+ * some versions of gcc think that only *(unsigned long*) is used.
+ *
+ * Opcodes we'll probably never support:
+ * 6c-6f - ins,outs. SEGVs if used in userspace
+ * e4-e7 - in,out imm. SEGVs if used in userspace
+ * ec-ef - in,out acc. SEGVs if used in userspace
+ * cc - int3. SIGTRAP if used in userspace
+ * ce - into. Not used in userspace - no kernel support to make it useful. SEGVs
+ * (why do we support bound (62) then? it's similar, and similarly unused...)
+ * f1 - int1. SIGTRAP if used in userspace
+ * f4 - hlt. SEGVs if used in userspace
+ * fa - cli. SEGVs if used in userspace
+ * fb - sti. SEGVs if used in userspace
+ *
+ * Opcodes which need some work to be supported:
+ * 07,17,1f - pop es/ss/ds
+ * Normally not used in userspace, but would execute if used.
+ *	Can cause a GP or stack exception if it tries to load a wrong segment descriptor.
+ * We hesitate to run them under single step since kernel's handling
+ * of userspace single-stepping (TF flag) is fragile.
+ * We can easily refuse to support push es/cs/ss/ds (06/0e/16/1e)
+ * on the same grounds that they are never used.
+ * cd - int N.
+ * Used by userspace for "int 80" syscall entry. (Other "int N"
+ * cause GP -> SEGV since their IDT gates don't allow calls from CPL 3).
+ * Not supported since kernel's handling of userspace single-stepping
+ * (TF flag) is fragile.
+ * cf - iret. Normally not used in userspace. Doesn't SEGV unless arguments are bad
+ */
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+static volatile u32 good_insns_32[256 / 32] = {
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* ---------------------------------------------- */
+ W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 00 */
+ W(0x10, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 10 */
+ W(0x20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */
+ W(0x30, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 30 */
+ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
+ W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
+ W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
+ W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
+ W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
+ W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
+ W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
+ W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
+ W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
+ W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
+ W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */
+ W(0xf0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */
+ /* ---------------------------------------------- */
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+};
+#else
+#define good_insns_32 NULL
+#endif
+
+/* Good-instruction tables for 64-bit apps.
+ *
+ * Genuinely invalid opcodes:
+ * 06,07 - formerly push/pop es
+ * 0e - formerly push cs
+ * 16,17 - formerly push/pop ss
+ * 1e,1f - formerly push/pop ds
+ * 27,2f,37,3f - formerly daa/das/aaa/aas
+ * 60,61 - formerly pusha/popa
+ * 62 - formerly bound. EVEX prefix for AVX512 (not yet supported)
+ * 82 - formerly redundant encoding of Group1
+ * 9a - formerly call seg:ofs
+ * ce - formerly into
+ * d4,d5 - formerly aam/aad
+ * d6 - formerly undocumented salc
+ * ea - formerly jmp seg:ofs
+ *
+ * Opcodes we'll probably never support:
+ * 6c-6f - ins,outs. SEGVs if used in userspace
+ * e4-e7 - in,out imm. SEGVs if used in userspace
+ * ec-ef - in,out acc. SEGVs if used in userspace
+ * cc - int3. SIGTRAP if used in userspace
+ * f1 - int1. SIGTRAP if used in userspace
+ * f4 - hlt. SEGVs if used in userspace
+ * fa - cli. SEGVs if used in userspace
+ * fb - sti. SEGVs if used in userspace
+ *
+ * Opcodes which need some work to be supported:
+ * cd - int N.
+ * Used by userspace for "int 80" syscall entry. (Other "int N"
+ * cause GP -> SEGV since their IDT gates don't allow calls from CPL 3).
+ * Not supported since kernel's handling of userspace single-stepping
+ * (TF flag) is fragile.
+ * cf - iret. Normally not used in userspace. Doesn't SEGV unless arguments are bad
+ */
+#if defined(CONFIG_X86_64)
+static volatile u32 good_insns_64[256 / 32] = {
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* ---------------------------------------------- */
+ W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* 00 */
+ W(0x10, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 10 */
+ W(0x20, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 20 */
+ W(0x30, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 30 */
+ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
+ W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
+ W(0x60, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
+ W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
+ W(0x80, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
+ W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1) , /* 90 */
+ W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
+ W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
+ W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
+ W(0xd0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
+ W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0) | /* e0 */
+ W(0xf0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */
+ /* ---------------------------------------------- */
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+};
+#else
+#define good_insns_64 NULL
+#endif
+
+/* This table is used for both 64-bit and 32-bit apps.
+ * Opcodes we don't support:
+ * 0f 00 - SLDT/STR/LLDT/LTR/VERR/VERW/-/- group. System insns
+ * 0f 01 - SGDT/SIDT/LGDT/LIDT/SMSW/-/LMSW/INVLPG group.
+ * Also encodes tons of other system insns if mod=11.
+ * Some are in fact non-system: xend, xtest, rdtscp, maybe more
+ * 0f 05 - syscall
+ * 0f 06 - clts (CPL0 insn)
+ * 0f 07 - sysret
+ * 0f 08 - invd (CPL0 insn)
+ * 0f 09 - wbinvd (CPL0 insn)
+ * 0f 0b - ud2
+ * 0f 30 - wrmsr (CPL0 insn) (then why is rdmsr allowed? it's also a CPL0 insn)
+ * 0f 34 - sysenter
+ * 0f 35 - sysexit
+ * 0f 37 - getsec
+ * 0f 78 - vmread (Intel VMX. CPL0 insn)
+ * 0f 79 - vmwrite (Intel VMX. CPL0 insn)
+ * Note: with prefixes, these two opcodes are
+ * extrq/insertq/AVX512 convert vector ops.
+ * 0f ae - group15: [f]xsave,[f]xrstor,[v]{ld,st}mxcsr,clflush[opt],
+ * {rd,wr}{fs,gs}base,{s,l,m}fence.
+ * Why? They are all user-executable.
+ */
+static volatile u32 good_2byte_insns[256 / 32] = {
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* ---------------------------------------------- */
+ W(0x00, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1) | /* 00 */
+ W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 10 */
+ W(0x20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */
+ W(0x30, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* 30 */
+ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
+ W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
+ W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */
+ W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* 70 */
+ W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
+ W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
+ W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */
+ W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
+ W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
+ W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
+ W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */
+ W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) /* f0 */
+ /* ---------------------------------------------- */
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+};
+#undef W
+
+/*
+ * opcodes we may need to refine support for:
+ *
+ * 0f - 2-byte instructions: For many of these instructions, the validity
+ * depends on the prefix and/or the reg field. On such instructions, we
+ * just consider the opcode combination valid if it corresponds to any
+ * valid instruction.
+ *
+ * 8f - Group 1 - only reg = 0 is OK
+ * c6-c7 - Group 11 - only reg = 0 is OK
+ * d9-df - fpu insns with some illegal encodings
+ * f2, f3 - repnz, repz prefixes. These are also the first byte for
+ * certain floating-point instructions, such as addsd.
+ *
+ * fe - Group 4 - only reg = 0 or 1 is OK
+ * ff - Group 5 - only reg = 0-6 is OK
+ *
+ * others -- Do we need to support these?
+ *
+ * 0f - (floating-point?) prefetch instructions
+ * 07, 17, 1f - pop es, pop ss, pop ds
+ * 26, 2e, 36, 3e - es:, cs:, ss:, ds: segment prefixes --
+ * but 64 and 65 (fs: and gs:) seem to be used, so we support them
+ * 67 - addr16 prefix
+ * ce - into
+ * f0 - lock prefix
+ */
+
+/*
+ * TODO:
+ * - Where necessary, examine the modrm byte and allow only valid instructions
+ * in the different Groups and fpu instructions.
+ */
+
+static bool is_prefix_bad(struct insn *insn)
+{
+ insn_byte_t p;
+
+ for_each_insn_prefix(insn, p) {
+ insn_attr_t attr;
+
+ attr = inat_get_opcode_attribute(p);
+ switch (attr) {
+ case INAT_MAKE_PREFIX(INAT_PFX_ES):
+ case INAT_MAKE_PREFIX(INAT_PFX_CS):
+ case INAT_MAKE_PREFIX(INAT_PFX_DS):
+ case INAT_MAKE_PREFIX(INAT_PFX_SS):
+ case INAT_MAKE_PREFIX(INAT_PFX_LOCK):
+ return true;
+ }
+ }
+ return false;
+}
+
+static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool x86_64)
+{
+ enum insn_mode m = x86_64 ? INSN_MODE_64 : INSN_MODE_32;
+ u32 volatile *good_insns;
+ int ret;
+
+ ret = insn_decode(insn, auprobe->insn, sizeof(auprobe->insn), m);
+ if (ret < 0)
+ return -ENOEXEC;
+
+ if (is_prefix_bad(insn))
+ return -ENOTSUPP;
+
+ /* We should not singlestep on the exception masking instructions */
+ if (insn_masking_exception(insn))
+ return -ENOTSUPP;
+
+ if (x86_64)
+ good_insns = good_insns_64;
+ else
+ good_insns = good_insns_32;
+
+ if (test_bit(OPCODE1(insn), (unsigned long *)good_insns))
+ return 0;
+
+ if (insn->opcode.nbytes == 2) {
+ if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns))
+ return 0;
+ }
+
+ return -ENOTSUPP;
+}
+
+#ifdef CONFIG_X86_64
+
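+/*
+ * Layout of the values saved by the uretprobe trampoline below: %rax is
+ * pushed first, then %rcx, then %r11, so at syscall entry %rsp points at
+ * r11, followed by cx and ax.
+ */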
+struct uretprobe_syscall_args {
+ unsigned long r11;
+ unsigned long cx;
+ unsigned long ax;
+};
+
+asm (
+ ".pushsection .rodata\n"
+ ".global uretprobe_trampoline_entry\n"
+ "uretprobe_trampoline_entry:\n"
+ "push %rax\n"
+ "push %rcx\n"
+ "push %r11\n"
+ "mov $" __stringify(__NR_uretprobe) ", %rax\n"
+ "syscall\n"
+ ".global uretprobe_syscall_check\n"
+ "uretprobe_syscall_check:\n"
+ "pop %r11\n"
+ "pop %rcx\n"
+ /*
+	 * The uretprobe syscall replaces the stored %rax value with the
+	 * final return address, so we don't restore %rax here and just
+	 * execute ret.
+ */
+ "ret\n"
+ "int3\n"
+ ".global uretprobe_trampoline_end\n"
+ "uretprobe_trampoline_end:\n"
+ ".popsection\n"
+);
+
+extern u8 uretprobe_trampoline_entry[];
+extern u8 uretprobe_trampoline_end[];
+extern u8 uretprobe_syscall_check[];
+
+void *arch_uretprobe_trampoline(unsigned long *psize)
+{
+ static uprobe_opcode_t insn = UPROBE_SWBP_INSN;
+ struct pt_regs *regs = task_pt_regs(current);
+
+ /*
+	 * At the moment the uretprobe syscall trampoline is supported
+	 * only for native 64-bit processes; compat processes still use
+	 * the standard breakpoint.
+ */
+ if (user_64bit_mode(regs)) {
+ *psize = uretprobe_trampoline_end - uretprobe_trampoline_entry;
+ return uretprobe_trampoline_entry;
+ }
+
+ *psize = UPROBE_SWBP_INSN_SIZE;
+ return &insn;
+}
+
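+/*
+ * sys_uretprobe may only be invoked from the uretprobe_syscall_check label
+ * inside the trampoline; compute that label's address from the trampoline
+ * base returned by uprobe_get_trampoline_vaddr().
+ */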
+static unsigned long trampoline_check_ip(unsigned long tramp)
+{
+ return tramp + (uretprobe_syscall_check - uretprobe_trampoline_entry);
+}
+
+SYSCALL_DEFINE0(uretprobe)
+{
+ struct pt_regs *regs = task_pt_regs(current);
+ struct uretprobe_syscall_args args;
+ unsigned long err, ip, sp, tramp;
+
+	/* If there's no trampoline, we are called from the wrong place. */
+ tramp = uprobe_get_trampoline_vaddr();
+ if (unlikely(tramp == UPROBE_NO_TRAMPOLINE_VADDR))
+ goto sigill;
+
+ /* Make sure the ip matches the only allowed sys_uretprobe caller. */
+ if (unlikely(regs->ip != trampoline_check_ip(tramp)))
+ goto sigill;
+
+ err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args));
+ if (err)
+ goto sigill;
+
+ /* expose the "right" values of r11/cx/ax/sp to uprobe_consumer/s */
+ regs->r11 = args.r11;
+ regs->cx = args.cx;
+ regs->ax = args.ax;
+ regs->sp += sizeof(args);
+ regs->orig_ax = -1;
+
+ ip = regs->ip;
+ sp = regs->sp;
+
+ uprobe_handle_trampoline(regs);
+
+ /*
+	 * If one of the uprobe consumers changed sp, there is nothing we
+	 * can do; just return via iret.
+	 * The same applies when the shadow stack is enabled, in which case
+	 * we need to skip the return through the user space stack address.
+ */
+ if (regs->sp != sp || shstk_is_enabled())
+ return regs->ax;
+ regs->sp -= sizeof(args);
+
+	/* in case a uprobe_consumer changed r11/cx */
+ args.r11 = regs->r11;
+ args.cx = regs->cx;
+
+ /*
+	 * The ax register is passed through as the return value, so we can
+	 * reuse its slot on the stack for the ip value and jump to it through
+	 * the trampoline's ret instruction.
+ */
+ args.ax = regs->ip;
+ regs->ip = ip;
+
+ err = copy_to_user((void __user *)regs->sp, &args, sizeof(args));
+ if (err)
+ goto sigill;
+
+ /* ensure sysret, see do_syscall_64() */
+ regs->r11 = regs->flags;
+ regs->cx = regs->ip;
+
+ return regs->ax;
+
+sigill:
+ force_sig(SIGILL);
+ return -1;
+}
+
+/*
+ * If arch_uprobe->insn doesn't use rip-relative addressing, return
+ * immediately. Otherwise, rewrite the instruction so that it accesses
+ * its memory operand indirectly through a scratch register. Set
+ * defparam->fixups accordingly. (The contents of the scratch register
+ * will be saved before we single-step the modified instruction,
+ * and restored afterward).
+ *
+ * We do this because a rip-relative instruction can access only a
+ * relatively small area (+/- 2 GB from the instruction), and the XOL
+ * area typically lies beyond that area. At least for instructions
+ * that store to memory, we can't execute the original instruction
+ * and "fix things up" later, because the misdirected store could be
+ * disastrous.
+ *
+ * Some useful facts about rip-relative instructions:
+ *
+ * - There's always a modrm byte with bit layout "00 reg 101".
+ * - There's never a SIB byte.
+ * - The displacement is always 4 bytes.
+ * - REX.B=1 bit in REX prefix, which normally extends r/m field,
+ * has no effect on rip-relative mode. It doesn't make modrm byte
+ * with r/m=101 refer to register 1101 = R13.
+ */
+static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn)
+{
+ u8 *cursor;
+ u8 reg;
+ u8 reg2;
+
+ if (!insn_rip_relative(insn))
+ return;
+
+ /*
+ * insn_rip_relative() would have decoded rex_prefix, vex_prefix, modrm.
+ * Clear REX.b bit (extension of MODRM.rm field):
+ * we want to encode low numbered reg, not r8+.
+ */
+ if (insn->rex_prefix.nbytes) {
+ cursor = auprobe->insn + insn_offset_rex_prefix(insn);
+ /* REX byte has 0100wrxb layout, clearing REX.b bit */
+ *cursor &= 0xfe;
+ }
+ /*
+ * Similar treatment for VEX3/EVEX prefix.
+ * TODO: add XOP treatment when insn decoder supports them
+ */
+ if (insn->vex_prefix.nbytes >= 3) {
+ /*
+ * vex2: c5 rvvvvLpp (has no b bit)
+ * vex3/xop: c4/8f rxbmmmmm wvvvvLpp
+ * evex: 62 rxbR00mm wvvvv1pp zllBVaaa
+ * Setting VEX3.b (setting because it has inverted meaning).
+ * Setting EVEX.x since (in non-SIB encoding) EVEX.x
+ * is the 4th bit of MODRM.rm, and needs the same treatment.
+ * For VEX3-encoded insns, VEX3.x value has no effect in
+ * non-SIB encoding, the change is superfluous but harmless.
+ */
+ cursor = auprobe->insn + insn_offset_vex_prefix(insn) + 1;
+ *cursor |= 0x60;
+ }
+
+ /*
+ * Convert from rip-relative addressing to register-relative addressing
+ * via a scratch register.
+ *
+ * This is tricky since there are insns with modrm byte
+ * which also use registers not encoded in modrm byte:
+ * [i]div/[i]mul: implicitly use dx:ax
+ * shift ops: implicitly use cx
+ * cmpxchg: implicitly uses ax
+ * cmpxchg8/16b: implicitly uses dx:ax and bx:cx
+ * Encoding: 0f c7/1 modrm
+ * The code below thinks that reg=1 (cx), chooses si as scratch.
+ * mulx: implicitly uses dx: mulx r/m,r1,r2 does r1:r2 = dx * r/m.
+ * First appeared in Haswell (BMI2 insn). It is vex-encoded.
+ * Example where none of bx,cx,dx can be used as scratch reg:
+ * c4 e2 63 f6 0d disp32 mulx disp32(%rip),%ebx,%ecx
+ * [v]pcmpistri: implicitly uses cx, xmm0
+ * [v]pcmpistrm: implicitly uses xmm0
+ * [v]pcmpestri: implicitly uses ax, dx, cx, xmm0
+ * [v]pcmpestrm: implicitly uses ax, dx, xmm0
+ * Evil SSE4.2 string comparison ops from hell.
+ * maskmovq/[v]maskmovdqu: implicitly uses (ds:rdi) as destination.
+ * Encoding: 0f f7 modrm, 66 0f f7 modrm, vex-encoded: c5 f9 f7 modrm.
+ * Store op1, byte-masked by op2 msb's in each byte, to (ds:rdi).
+ * AMD says it has no 3-operand form (vex.vvvv must be 1111)
+ * and that it can have only register operands, not mem
+ * (its modrm byte must have mode=11).
+ * If these restrictions will ever be lifted,
+ * we'll need code to prevent selection of di as scratch reg!
+ *
+ * Summary: I don't know any insns with modrm byte which
+ * use SI register implicitly. DI register is used only
+ * by one insn (maskmovq) and BX register is used
+ * only by one too (cmpxchg8b).
+ * BP is stack-segment based (may be a problem?).
+ * AX, DX, CX are off-limits (many implicit users).
+ * SP is unusable (it's stack pointer - think about "pop mem";
+ * also, rsp+disp32 needs sib encoding -> insn length change).
+ */
+
+ reg = MODRM_REG(insn); /* Fetch modrm.reg */
+ reg2 = 0xff; /* Fetch vex.vvvv */
+ if (insn->vex_prefix.nbytes)
+ reg2 = insn->vex_prefix.bytes[2];
+ /*
+ * TODO: add XOP vvvv reading.
+ *
+ * vex.vvvv field is in bits 6-3, bits are inverted.
+ * But in 32-bit mode, high-order bit may be ignored.
+ * Therefore, let's consider only 3 low-order bits.
+ */
+ reg2 = ((reg2 >> 3) & 0x7) ^ 0x7;
+ /*
+ * Register numbering is ax,cx,dx,bx, sp,bp,si,di, r8..r15.
+ *
+ * Choose scratch reg. Order is important: must not select bx
+ * if we can use si (cmpxchg8b case!)
+ */
+ if (reg != 6 && reg2 != 6) {
+ reg2 = 6;
+ auprobe->defparam.fixups |= UPROBE_FIX_RIP_SI;
+ } else if (reg != 7 && reg2 != 7) {
+ reg2 = 7;
+ auprobe->defparam.fixups |= UPROBE_FIX_RIP_DI;
+ /* TODO (paranoia): force maskmovq to not use di */
+ } else {
+ reg2 = 3;
+ auprobe->defparam.fixups |= UPROBE_FIX_RIP_BX;
+ }
+ /*
+ * Point cursor at the modrm byte. The next 4 bytes are the
+ * displacement. Beyond the displacement, for some instructions,
+ * is the immediate operand.
+ */
+ cursor = auprobe->insn + insn_offset_modrm(insn);
+ /*
+ * Change modrm from "00 reg 101" to "10 reg reg2". Example:
+ * 89 05 disp32 mov %eax,disp32(%rip) becomes
+ * 89 86 disp32 mov %eax,disp32(%rsi)
+ */
+ *cursor = 0x80 | (reg << 3) | reg2;
+}
+
+static inline unsigned long *
+scratch_reg(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ if (auprobe->defparam.fixups & UPROBE_FIX_RIP_SI)
+ return &regs->si;
+ if (auprobe->defparam.fixups & UPROBE_FIX_RIP_DI)
+ return &regs->di;
+ return &regs->bx;
+}
+
+/*
+ * If we're emulating a rip-relative instruction, save the contents
+ * of the scratch register and store the target address in that register.
+ */
+static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ if (auprobe->defparam.fixups & UPROBE_FIX_RIP_MASK) {
+ struct uprobe_task *utask = current->utask;
+ unsigned long *sr = scratch_reg(auprobe, regs);
+
+ utask->autask.saved_scratch_register = *sr;
+ *sr = utask->vaddr + auprobe->defparam.ilen;
+ }
+}
+
+static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ if (auprobe->defparam.fixups & UPROBE_FIX_RIP_MASK) {
+ struct uprobe_task *utask = current->utask;
+ unsigned long *sr = scratch_reg(auprobe, regs);
+
+ *sr = utask->autask.saved_scratch_register;
+ }
+}
+
+static int tramp_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma)
+{
+ return -EPERM;
+}
+
+static struct page *tramp_mapping_pages[2] __ro_after_init;
+
+static struct vm_special_mapping tramp_mapping = {
+ .name = "[uprobes-trampoline]",
+ .mremap = tramp_mremap,
+ .pages = tramp_mapping_pages,
+};
+
+struct uprobe_trampoline {
+ struct hlist_node node;
+ unsigned long vaddr;
+};
+
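+/*
+ * A rel32 call reaches +/- 2GB measured from the end of the 5-byte call
+ * instruction, hence the "vaddr + 5" below.
+ */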
+static bool is_reachable_by_call(unsigned long vtramp, unsigned long vaddr)
+{
+ long delta = (long)(vaddr + 5 - vtramp);
+
+ return delta >= INT_MIN && delta <= INT_MAX;
+}
+
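+/*
+ * Find an unmapped page as close as possible to the probed address,
+ * searching both above and below the call site while staying within
+ * rel32 call range.
+ */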
+static unsigned long find_nearest_trampoline(unsigned long vaddr)
+{
+ struct vm_unmapped_area_info info = {
+ .length = PAGE_SIZE,
+ .align_mask = ~PAGE_MASK,
+ };
+ unsigned long low_limit, high_limit;
+ unsigned long low_tramp, high_tramp;
+ unsigned long call_end = vaddr + 5;
+
+ if (check_add_overflow(call_end, INT_MIN, &low_limit))
+ low_limit = PAGE_SIZE;
+
+ high_limit = call_end + INT_MAX;
+
+ /* Search up from the caller address. */
+ info.low_limit = call_end;
+ info.high_limit = min(high_limit, TASK_SIZE);
+ high_tramp = vm_unmapped_area(&info);
+
+ /* Search down from the caller address. */
+ info.low_limit = max(low_limit, PAGE_SIZE);
+ info.high_limit = call_end;
+ info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+ low_tramp = vm_unmapped_area(&info);
+
+ if (IS_ERR_VALUE(high_tramp) && IS_ERR_VALUE(low_tramp))
+ return -ENOMEM;
+ if (IS_ERR_VALUE(high_tramp))
+ return low_tramp;
+ if (IS_ERR_VALUE(low_tramp))
+ return high_tramp;
+
+ /* Return address that's closest to the caller address. */
+ if (call_end - low_tramp < high_tramp - call_end)
+ return low_tramp;
+ return high_tramp;
+}
+
+static struct uprobe_trampoline *create_uprobe_trampoline(unsigned long vaddr)
+{
+ struct pt_regs *regs = task_pt_regs(current);
+ struct mm_struct *mm = current->mm;
+ struct uprobe_trampoline *tramp;
+ struct vm_area_struct *vma;
+
+ if (!user_64bit_mode(regs))
+ return NULL;
+
+ vaddr = find_nearest_trampoline(vaddr);
+ if (IS_ERR_VALUE(vaddr))
+ return NULL;
+
+ tramp = kzalloc(sizeof(*tramp), GFP_KERNEL);
+ if (unlikely(!tramp))
+ return NULL;
+
+ tramp->vaddr = vaddr;
+ vma = _install_special_mapping(mm, tramp->vaddr, PAGE_SIZE,
+ VM_READ|VM_EXEC|VM_MAYEXEC|VM_MAYREAD|VM_DONTCOPY|VM_IO,
+ &tramp_mapping);
+ if (IS_ERR(vma)) {
+ kfree(tramp);
+ return NULL;
+ }
+ return tramp;
+}
+
+static struct uprobe_trampoline *get_uprobe_trampoline(unsigned long vaddr, bool *new)
+{
+ struct uprobes_state *state = &current->mm->uprobes_state;
+ struct uprobe_trampoline *tramp = NULL;
+
+ if (vaddr > TASK_SIZE || vaddr < PAGE_SIZE)
+ return NULL;
+
+ hlist_for_each_entry(tramp, &state->head_tramps, node) {
+ if (is_reachable_by_call(tramp->vaddr, vaddr)) {
+ *new = false;
+ return tramp;
+ }
+ }
+
+ tramp = create_uprobe_trampoline(vaddr);
+ if (!tramp)
+ return NULL;
+
+ *new = true;
+ hlist_add_head(&tramp->node, &state->head_tramps);
+ return tramp;
+}
+
+static void destroy_uprobe_trampoline(struct uprobe_trampoline *tramp)
+{
+ /*
+ * We do not unmap and release uprobe trampoline page itself,
+	 * We do not unmap and release the uprobe trampoline page itself,
+ * is still inside the trampoline.
+ */
+ hlist_del(&tramp->node);
+ kfree(tramp);
+}
+
+void arch_uprobe_init_state(struct mm_struct *mm)
+{
+ INIT_HLIST_HEAD(&mm->uprobes_state.head_tramps);
+}
+
+void arch_uprobe_clear_state(struct mm_struct *mm)
+{
+ struct uprobes_state *state = &mm->uprobes_state;
+ struct uprobe_trampoline *tramp;
+ struct hlist_node *n;
+
+ hlist_for_each_entry_safe(tramp, n, &state->head_tramps, node)
+ destroy_uprobe_trampoline(tramp);
+}
+
+static bool __in_uprobe_trampoline(unsigned long ip)
+{
+ struct vm_area_struct *vma = vma_lookup(current->mm, ip);
+
+ return vma && vma_is_special_mapping(vma, &tramp_mapping);
+}
+
+static bool in_uprobe_trampoline(unsigned long ip)
+{
+ struct mm_struct *mm = current->mm;
+ bool found, retry = true;
+ unsigned int seq;
+
+ rcu_read_lock();
+ if (mmap_lock_speculate_try_begin(mm, &seq)) {
+ found = __in_uprobe_trampoline(ip);
+ retry = mmap_lock_speculate_retry(mm, seq);
+ }
+ rcu_read_unlock();
+
+ if (retry) {
+ mmap_read_lock(mm);
+ found = __in_uprobe_trampoline(ip);
+ mmap_read_unlock(mm);
+ }
+ return found;
+}
+
+/*
+ * See uprobe syscall trampoline; the call to the trampoline will push
+ * the return address on the stack, the trampoline itself then pushes
+ * cx, r11 and ax.
+ */
+struct uprobe_syscall_args {
+ unsigned long ax;
+ unsigned long r11;
+ unsigned long cx;
+ unsigned long retaddr;
+};
+
+SYSCALL_DEFINE0(uprobe)
+{
+ struct pt_regs *regs = task_pt_regs(current);
+ struct uprobe_syscall_args args;
+ unsigned long ip, sp, sret;
+ int err;
+
+ /* Allow execution only from uprobe trampolines. */
+ if (!in_uprobe_trampoline(regs->ip))
+ return -ENXIO;
+
+ err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args));
+ if (err)
+ goto sigill;
+
+ ip = regs->ip;
+
+ /*
+	 * Expose the "right" values of ax/r11/cx/ip/sp to uprobe_consumer/s, plus:
+	 * - adjust ip to the probe address (the call saved the next instruction's
+	 *   address, so the probe sits 5 bytes earlier)
+	 * - adjust sp to the probe's stack frame (check trampoline code)
+ */
+ regs->ax = args.ax;
+ regs->r11 = args.r11;
+ regs->cx = args.cx;
+ regs->ip = args.retaddr - 5;
+ regs->sp += sizeof(args);
+ regs->orig_ax = -1;
+
+ sp = regs->sp;
+
+ err = shstk_pop((u64 *)&sret);
+ if (err == -EFAULT || (!err && sret != args.retaddr))
+ goto sigill;
+
+ handle_syscall_uprobe(regs, regs->ip);
+
+ /*
+	 * If one of the uprobe consumers changed sp, there is nothing we
+	 * can do; just return via iret.
+ */
+ if (regs->sp != sp) {
+ /* skip the trampoline call */
+ if (args.retaddr - 5 == regs->ip)
+ regs->ip += 5;
+ return regs->ax;
+ }
+
+ regs->sp -= sizeof(args);
+
+ /* for the case uprobe_consumer has changed ax/r11/cx */
+ args.ax = regs->ax;
+ args.r11 = regs->r11;
+ args.cx = regs->cx;
+
+ /* keep return address unless we are instructed otherwise */
+ if (args.retaddr - 5 != regs->ip)
+ args.retaddr = regs->ip;
+
+ if (shstk_push(args.retaddr) == -EFAULT)
+ goto sigill;
+
+ regs->ip = ip;
+
+ err = copy_to_user((void __user *)regs->sp, &args, sizeof(args));
+ if (err)
+ goto sigill;
+
+ /* ensure sysret, see do_syscall_64() */
+ regs->r11 = regs->flags;
+ regs->cx = regs->ip;
+ return 0;
+
+sigill:
+ force_sig(SIGILL);
+ return -1;
+}
+
+asm (
+ ".pushsection .rodata\n"
+ ".balign " __stringify(PAGE_SIZE) "\n"
+ "uprobe_trampoline_entry:\n"
+ "push %rcx\n"
+ "push %r11\n"
+ "push %rax\n"
+ "mov $" __stringify(__NR_uprobe) ", %rax\n"
+ "syscall\n"
+ "pop %rax\n"
+ "pop %r11\n"
+ "pop %rcx\n"
+ "ret\n"
+ "int3\n"
+ ".balign " __stringify(PAGE_SIZE) "\n"
+ ".popsection\n"
+);
+
+extern u8 uprobe_trampoline_entry[];
+
+static int __init arch_uprobes_init(void)
+{
+ tramp_mapping_pages[0] = virt_to_page(uprobe_trampoline_entry);
+ return 0;
+}
+
+late_initcall(arch_uprobes_init);
+
+enum {
+ EXPECT_SWBP,
+ EXPECT_CALL,
+};
+
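+/*
+ * Context passed through uprobe_write() to verify_insn(): @base is the probe
+ * address and @expect selects which original instruction we expect to find
+ * there.
+ */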
+struct write_opcode_ctx {
+ unsigned long base;
+ int expect;
+};
+
+static int is_call_insn(uprobe_opcode_t *insn)
+{
+ return *insn == CALL_INSN_OPCODE;
+}
+
+/*
+ * Verification callback used by int3_update uprobe_write calls to make sure
+ * the underlying instruction is as expected - either int3 or call.
+ */
+static int verify_insn(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode,
+ int nbytes, void *data)
+{
+ struct write_opcode_ctx *ctx = data;
+ uprobe_opcode_t old_opcode[5];
+
+ uprobe_copy_from_page(page, ctx->base, (uprobe_opcode_t *) &old_opcode, 5);
+
+ switch (ctx->expect) {
+ case EXPECT_SWBP:
+ if (is_swbp_insn(&old_opcode[0]))
+ return 1;
+ break;
+ case EXPECT_CALL:
+ if (is_call_insn(&old_opcode[0]))
+ return 1;
+ break;
+ }
+
+ return -1;
+}
+
+/*
+ * Modify multi-byte instructions by using INT3 breakpoints on SMP.
+ * We completely avoid using stop_machine() here, and achieve the
+ * synchronization using INT3 breakpoints and SMP cross-calls.
+ * (borrowed comment from smp_text_poke_batch_finish)
+ *
+ * The way it is done:
+ * - Add an INT3 trap to the address that will be patched
+ * - SMP sync all CPUs
+ * - Update all but the first byte of the patched range
+ * - SMP sync all CPUs
+ * - Replace the first byte (INT3) by the first byte of the replacing opcode
+ * - SMP sync all CPUs
+ */
+static int int3_update(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ unsigned long vaddr, char *insn, bool optimize)
+{
+ uprobe_opcode_t int3 = UPROBE_SWBP_INSN;
+ struct write_opcode_ctx ctx = {
+ .base = vaddr,
+ };
+ int err;
+
+ /*
+ * Write int3 trap.
+ *
+	 * The swbp_optimize path comes with the breakpoint already installed,
+ * so we can skip this step for optimize == true.
+ */
+ if (!optimize) {
+ ctx.expect = EXPECT_CALL;
+ err = uprobe_write(auprobe, vma, vaddr, &int3, 1, verify_insn,
+ true /* is_register */, false /* do_update_ref_ctr */,
+ &ctx);
+ if (err)
+ return err;
+ }
+
+ smp_text_poke_sync_each_cpu();
+
+ /* Write all but the first byte of the patched range. */
+ ctx.expect = EXPECT_SWBP;
+ err = uprobe_write(auprobe, vma, vaddr + 1, insn + 1, 4, verify_insn,
+ true /* is_register */, false /* do_update_ref_ctr */,
+ &ctx);
+ if (err)
+ return err;
+
+ smp_text_poke_sync_each_cpu();
+
+ /*
+ * Write first byte.
+ *
+	 * The swbp_unoptimize path needs to finish uprobe removal together
+	 * with the ref_ctr update, using uprobe_write() with the proper flags.
+ */
+ err = uprobe_write(auprobe, vma, vaddr, insn, 1, verify_insn,
+ optimize /* is_register */, !optimize /* do_update_ref_ctr */,
+ &ctx);
+ if (err)
+ return err;
+
+ smp_text_poke_sync_each_cpu();
+ return 0;
+}
+
+static int swbp_optimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ unsigned long vaddr, unsigned long tramp)
+{
+ u8 call[5];
+
+ __text_gen_insn(call, CALL_INSN_OPCODE, (const void *) vaddr,
+ (const void *) tramp, CALL_INSN_SIZE);
+ return int3_update(auprobe, vma, vaddr, call, true /* optimize */);
+}
+
+static int swbp_unoptimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ unsigned long vaddr)
+{
+ return int3_update(auprobe, vma, vaddr, auprobe->insn, false /* optimize */);
+}
+
+static int copy_from_vaddr(struct mm_struct *mm, unsigned long vaddr, void *dst, int len)
+{
+ unsigned int gup_flags = FOLL_FORCE|FOLL_SPLIT_PMD;
+ struct vm_area_struct *vma;
+ struct page *page;
+
+ page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+ uprobe_copy_from_page(page, vaddr, dst, len);
+ put_page(page);
+ return 0;
+}
+
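+/*
+ * The probe is considered optimized if the 5 bytes at @vaddr decode to a
+ * rel32 call whose target lies inside the uprobe trampoline mapping.
+ */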
+static bool __is_optimized(uprobe_opcode_t *insn, unsigned long vaddr)
+{
+ struct __packed __arch_relative_insn {
+ u8 op;
+ s32 raddr;
+ } *call = (struct __arch_relative_insn *) insn;
+
+ if (!is_call_insn(insn))
+ return false;
+ return __in_uprobe_trampoline(vaddr + 5 + call->raddr);
+}
+
+static int is_optimized(struct mm_struct *mm, unsigned long vaddr)
+{
+ uprobe_opcode_t insn[5];
+ int err;
+
+ err = copy_from_vaddr(mm, vaddr, &insn, 5);
+ if (err)
+ return err;
+ return __is_optimized((uprobe_opcode_t *)&insn, vaddr);
+}
+
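+/*
+ * Optimize only probes that were marked optimizable at analysis time and
+ * that have not already failed an optimization attempt.
+ */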
+static bool should_optimize(struct arch_uprobe *auprobe)
+{
+ return !test_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags) &&
+ test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags);
+}
+
+int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ unsigned long vaddr)
+{
+ if (should_optimize(auprobe)) {
+ /*
+ * We could race with another thread that already optimized the probe,
+ * so let's not overwrite it with int3 again in this case.
+ */
+ int ret = is_optimized(vma->vm_mm, vaddr);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ return 0;
+ }
+ return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN,
+ true /* is_register */);
+}
+
+int set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ unsigned long vaddr)
+{
+ if (test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags)) {
+ int ret = is_optimized(vma->vm_mm, vaddr);
+ if (ret < 0)
+ return ret;
+ if (ret) {
+ ret = swbp_unoptimize(auprobe, vma, vaddr);
+ WARN_ON_ONCE(ret);
+ return ret;
+ }
+ }
+ return uprobe_write_opcode(auprobe, vma, vaddr, *(uprobe_opcode_t *)&auprobe->insn,
+ false /* is_register */);
+}
+
+static int __arch_uprobe_optimize(struct arch_uprobe *auprobe, struct mm_struct *mm,
+ unsigned long vaddr)
+{
+ struct uprobe_trampoline *tramp;
+ struct vm_area_struct *vma;
+ bool new = false;
+ int err = 0;
+
+ vma = find_vma(mm, vaddr);
+ if (!vma)
+ return -EINVAL;
+ tramp = get_uprobe_trampoline(vaddr, &new);
+ if (!tramp)
+ return -EINVAL;
+ err = swbp_optimize(auprobe, vma, vaddr, tramp->vaddr);
+ if (WARN_ON_ONCE(err) && new)
+ destroy_uprobe_trampoline(tramp);
+ return err;
+}
+
+void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
+{
+ struct mm_struct *mm = current->mm;
+ uprobe_opcode_t insn[5];
+
+ if (!should_optimize(auprobe))
+ return;
+
+ mmap_write_lock(mm);
+
+ /*
+	 * Check if some other thread already optimized the uprobe for us;
+	 * if that's the case, just go away silently.
+ */
+ if (copy_from_vaddr(mm, vaddr, &insn, 5))
+ goto unlock;
+ if (!is_swbp_insn((uprobe_opcode_t*) &insn))
+ goto unlock;
+
+ /*
+	 * If we fail to optimize the uprobe, set the fail bit so that
+	 * should_optimize() above will fail from now on.
+ */
+ if (__arch_uprobe_optimize(auprobe, mm, vaddr))
+ set_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags);
+
+unlock:
+ mmap_write_unlock(mm);
+}
+
+static bool can_optimize(struct insn *insn, unsigned long vaddr)
+{
+ if (!insn->x86_64 || insn->length != 5)
+ return false;
+
+ if (!insn_is_nop(insn))
+ return false;
+
+ /* We can't do cross page atomic writes yet. */
+ return PAGE_SIZE - (vaddr & ~PAGE_MASK) >= 5;
+}
+#else /* 32-bit: */
+/*
+ * No RIP-relative addressing on 32-bit
+ */
+static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn)
+{
+}
+static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+}
+static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+}
+static bool can_optimize(struct insn *insn, unsigned long vaddr)
+{
+ return false;
+}
+#endif /* CONFIG_X86_64 */
+
+struct uprobe_xol_ops {
+ bool (*emulate)(struct arch_uprobe *, struct pt_regs *);
+ int (*pre_xol)(struct arch_uprobe *, struct pt_regs *);
+ int (*post_xol)(struct arch_uprobe *, struct pt_regs *);
+ void (*abort)(struct arch_uprobe *, struct pt_regs *);
+};
+
+static inline int sizeof_long(struct pt_regs *regs)
+{
+ /*
+ * Check registers for mode as in_xxx_syscall() does not apply here.
+ */
+ return user_64bit_mode(regs) ? 8 : 4;
+}
+
+static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ riprel_pre_xol(auprobe, regs);
+ return 0;
+}
+
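+/*
+ * Emulate pushing @val onto the user stack: used both for emulated
+ * "push reg" instructions and for writing the return address of an
+ * emulated call.
+ */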
+static int emulate_push_stack(struct pt_regs *regs, unsigned long val)
+{
+ unsigned long new_sp = regs->sp - sizeof_long(regs);
+
+ if (copy_to_user((void __user *)new_sp, &val, sizeof_long(regs)))
+ return -EFAULT;
+
+ regs->sp = new_sp;
+ return 0;
+}
+
+/*
+ * We have to fix things up as follows:
+ *
+ * Typically, the new ip is relative to the copied instruction. We need
+ * to make it relative to the original instruction (FIX_IP). Exceptions
+ * are return instructions and absolute or indirect jump or call instructions.
+ *
+ * If the single-stepped instruction was a call, the return address that
+ * is atop the stack is the address following the copied instruction. We
+ * need to make it the address following the original instruction (FIX_CALL).
+ *
+ * If the original instruction was a rip-relative instruction such as
+ * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent
+ * instruction using a scratch register -- e.g., "movl %edx,0xnnnn(%rsi)".
+ * We need to restore the contents of the scratch register
+ * (FIX_RIP_reg).
+ */
+static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ struct uprobe_task *utask = current->utask;
+
+ riprel_post_xol(auprobe, regs);
+ if (auprobe->defparam.fixups & UPROBE_FIX_IP) {
+ long correction = utask->vaddr - utask->xol_vaddr;
+ regs->ip += correction;
+ } else if (auprobe->defparam.fixups & UPROBE_FIX_CALL) {
+ regs->sp += sizeof_long(regs); /* Pop incorrect return address */
+ if (emulate_push_stack(regs, utask->vaddr + auprobe->defparam.ilen))
+ return -ERESTART;
+ }
+ /* popf; tell the caller to not touch TF */
+ if (auprobe->defparam.fixups & UPROBE_FIX_SETF)
+ utask->autask.saved_tf = true;
+
+ return 0;
+}
+
+static void default_abort_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ riprel_post_xol(auprobe, regs);
+}
+
+static const struct uprobe_xol_ops default_xol_ops = {
+ .pre_xol = default_pre_xol_op,
+ .post_xol = default_post_xol_op,
+ .abort = default_abort_op,
+};
+
+static bool branch_is_call(struct arch_uprobe *auprobe)
+{
+ return auprobe->branch.opc1 == 0xe8;
+}
+
+#define CASE_COND \
+ COND(70, 71, XF(OF)) \
+ COND(72, 73, XF(CF)) \
+ COND(74, 75, XF(ZF)) \
+ COND(78, 79, XF(SF)) \
+ COND(7a, 7b, XF(PF)) \
+ COND(76, 77, XF(CF) || XF(ZF)) \
+ COND(7c, 7d, XF(SF) != XF(OF)) \
+ COND(7e, 7f, XF(ZF) || XF(SF) != XF(OF))
+
+#define COND(op_y, op_n, expr) \
+ case 0x ## op_y: DO((expr) != 0) \
+ case 0x ## op_n: DO((expr) == 0)
+
+#define XF(xf) (!!(flags & X86_EFLAGS_ ## xf))
+
+static bool is_cond_jmp_opcode(u8 opcode)
+{
+ switch (opcode) {
+ #define DO(expr) \
+ return true;
+ CASE_COND
+ #undef DO
+
+ default:
+ return false;
+ }
+}
+
+static bool check_jmp_cond(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ unsigned long flags = regs->flags;
+
+ switch (auprobe->branch.opc1) {
+ #define DO(expr) \
+ return expr;
+ CASE_COND
+ #undef DO
+
+ default: /* not a conditional jmp */
+ return true;
+ }
+}
+
+#undef XF
+#undef COND
+#undef CASE_COND
+
+static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ unsigned long new_ip = regs->ip += auprobe->branch.ilen;
+ unsigned long offs = (long)auprobe->branch.offs;
+
+ if (branch_is_call(auprobe)) {
+ /*
+ * If it fails we execute this (mangled, see the comment in
+ * branch_clear_offset) insn out-of-line. In the likely case
+ * this should trigger the trap, and the probed application
+ * should die or restart the same insn after it handles the
+		 * signal; arch_uprobe_post_xol() won't even be called.
+		 *
+		 * But there is a corner case, see the comment in ->post_xol().
+ */
+ if (emulate_push_stack(regs, new_ip))
+ return false;
+ } else if (!check_jmp_cond(auprobe, regs)) {
+ offs = 0;
+ }
+
+ regs->ip = new_ip + offs;
+ return true;
+}
+
+static bool push_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ unsigned long *src_ptr = (void *)regs + auprobe->push.reg_offset;
+
+ if (emulate_push_stack(regs, *src_ptr))
+ return false;
+ regs->ip += auprobe->push.ilen;
+ return true;
+}
+
+static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ BUG_ON(!branch_is_call(auprobe));
+ /*
+ * We can only get here if branch_emulate_op() failed to push the ret
+ * address _and_ another thread expanded our stack before the (mangled)
+ * "call" insn was executed out-of-line. Just restore ->sp and restart.
+ * We could also restore ->ip and try to call branch_emulate_op() again.
+ */
+ regs->sp += sizeof_long(regs);
+ return -ERESTART;
+}
+
+static void branch_clear_offset(struct arch_uprobe *auprobe, struct insn *insn)
+{
+ /*
+ * Turn this insn into "call 1f; 1:", this is what we will execute
+ * out-of-line if ->emulate() fails. We only need this to generate
+ * a trap, so that the probed task receives the correct signal with
+ * the properly filled siginfo.
+ *
+ * But see the comment in ->post_xol(), in the unlikely case it can
+ * succeed. So we need to ensure that the new ->ip can not fall into
+ * the non-canonical area and trigger #GP.
+ *
+ * We could turn it into (say) "pushf", but then we would need to
+ * divorce ->insn[] and ->ixol[]. We need to preserve the 1st byte
+ * of ->insn[] for set_orig_insn().
+ */
+ memset(auprobe->insn + insn_offset_immediate(insn),
+ 0, insn->immediate.nbytes);
+}
+
+static const struct uprobe_xol_ops branch_xol_ops = {
+ .emulate = branch_emulate_op,
+ .post_xol = branch_post_xol_op,
+};
+
+static const struct uprobe_xol_ops push_xol_ops = {
+ .emulate = push_emulate_op,
+};
+
+/* Returns -ENOSYS if branch_xol_ops doesn't handle this insn */
+static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
+{
+ u8 opc1 = OPCODE1(insn);
+ insn_byte_t p;
+
+ if (insn_is_nop(insn))
+ goto setup;
+
+ switch (opc1) {
+ case 0xeb: /* jmp 8 */
+ case 0xe9: /* jmp 32 */
+ break;
+
+ case 0xe8: /* call relative */
+ branch_clear_offset(auprobe, insn);
+ break;
+
+ case 0x0f:
+ if (insn->opcode.nbytes != 2)
+ return -ENOSYS;
+ /*
+ * If it is a "near" conditional jmp, OPCODE2() - 0x10 matches
+ * OPCODE1() of the "short" jmp which checks the same condition.
+ */
+ opc1 = OPCODE2(insn) - 0x10;
+ fallthrough;
+ default:
+ if (!is_cond_jmp_opcode(opc1))
+ return -ENOSYS;
+ }
+
+ /*
+ * 16-bit overrides such as CALLW (66 e8 nn nn) are not supported.
+ * Intel and AMD behavior differ in 64-bit mode: Intel ignores 66 prefix.
+	 * No one uses these insns; reject any branch insn with such a prefix.
+ */
+ for_each_insn_prefix(insn, p) {
+ if (p == 0x66)
+ return -ENOTSUPP;
+ }
+
+setup:
+ auprobe->branch.opc1 = opc1;
+ auprobe->branch.ilen = insn->length;
+ auprobe->branch.offs = insn->immediate.value;
+
+ auprobe->ops = &branch_xol_ops;
+ return 0;
+}
+
+/* Returns -ENOSYS if push_xol_ops doesn't handle this insn */
+static int push_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
+{
+ u8 opc1 = OPCODE1(insn), reg_offset = 0;
+
+ if (opc1 < 0x50 || opc1 > 0x57)
+ return -ENOSYS;
+
+ if (insn->length > 2)
+ return -ENOSYS;
+ if (insn->length == 2) {
+ /* only support rex_prefix 0x41 (x64 only) */
+#ifdef CONFIG_X86_64
+ if (insn->rex_prefix.nbytes != 1 ||
+ insn->rex_prefix.bytes[0] != 0x41)
+ return -ENOSYS;
+
+ switch (opc1) {
+ case 0x50:
+ reg_offset = offsetof(struct pt_regs, r8);
+ break;
+ case 0x51:
+ reg_offset = offsetof(struct pt_regs, r9);
+ break;
+ case 0x52:
+ reg_offset = offsetof(struct pt_regs, r10);
+ break;
+ case 0x53:
+ reg_offset = offsetof(struct pt_regs, r11);
+ break;
+ case 0x54:
+ reg_offset = offsetof(struct pt_regs, r12);
+ break;
+ case 0x55:
+ reg_offset = offsetof(struct pt_regs, r13);
+ break;
+ case 0x56:
+ reg_offset = offsetof(struct pt_regs, r14);
+ break;
+ case 0x57:
+ reg_offset = offsetof(struct pt_regs, r15);
+ break;
+ }
+#else
+ return -ENOSYS;
+#endif
+ } else {
+ switch (opc1) {
+ case 0x50:
+ reg_offset = offsetof(struct pt_regs, ax);
+ break;
+ case 0x51:
+ reg_offset = offsetof(struct pt_regs, cx);
+ break;
+ case 0x52:
+ reg_offset = offsetof(struct pt_regs, dx);
+ break;
+ case 0x53:
+ reg_offset = offsetof(struct pt_regs, bx);
+ break;
+ case 0x54:
+ reg_offset = offsetof(struct pt_regs, sp);
+ break;
+ case 0x55:
+ reg_offset = offsetof(struct pt_regs, bp);
+ break;
+ case 0x56:
+ reg_offset = offsetof(struct pt_regs, si);
+ break;
+ case 0x57:
+ reg_offset = offsetof(struct pt_regs, di);
+ break;
+ }
+ }
+
+ auprobe->push.reg_offset = reg_offset;
+ auprobe->push.ilen = insn->length;
+ auprobe->ops = &push_xol_ops;
+ return 0;
+}
+
+/**
+ * arch_uprobe_analyze_insn - instruction analysis including validity and fixups.
+ * @auprobe: the probepoint information.
+ * @mm: the probed address space.
+ * @addr: virtual address at which to install the probepoint
+ * Return 0 on success or a -ve number on error.
+ */
+int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr)
+{
+ u8 fix_ip_or_call = UPROBE_FIX_IP;
+ struct insn insn;
+ int ret;
+
+ ret = uprobe_init_insn(auprobe, &insn, is_64bit_mm(mm));
+ if (ret)
+ return ret;
+
+ if (can_optimize(&insn, addr))
+ set_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags);
+
+ ret = branch_setup_xol_ops(auprobe, &insn);
+ if (ret != -ENOSYS)
+ return ret;
+
+ ret = push_setup_xol_ops(auprobe, &insn);
+ if (ret != -ENOSYS)
+ return ret;
+
+ /*
+ * Figure out which fixups default_post_xol_op() will need to perform,
+ * and annotate defparam->fixups accordingly.
+ */
+ switch (OPCODE1(&insn)) {
+ case 0x9d: /* popf */
+ auprobe->defparam.fixups |= UPROBE_FIX_SETF;
+ break;
+ case 0xc3: /* ret or lret -- ip is correct */
+ case 0xcb:
+ case 0xc2:
+ case 0xca:
+ case 0xea: /* jmp absolute -- ip is correct */
+ fix_ip_or_call = 0;
+ break;
+ case 0x9a: /* call absolute - Fix return addr, not ip */
+ fix_ip_or_call = UPROBE_FIX_CALL;
+ break;
+ case 0xff:
+ switch (MODRM_REG(&insn)) {
+ case 2: case 3: /* call or lcall, indirect */
+ fix_ip_or_call = UPROBE_FIX_CALL;
+ break;
+ case 4: case 5: /* jmp or ljmp, indirect */
+ fix_ip_or_call = 0;
+ break;
+ }
+ fallthrough;
+ default:
+ riprel_analyze(auprobe, &insn);
+ }
+
+ auprobe->defparam.ilen = insn.length;
+ auprobe->defparam.fixups |= fix_ip_or_call;
+
+ auprobe->ops = &default_xol_ops;
+ return 0;
+}
+
+/*
+ * arch_uprobe_pre_xol - prepare to execute out of line.
+ * @auprobe: the probepoint information.
+ * @regs: reflects the saved user state of current task.
+ */
+int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ struct uprobe_task *utask = current->utask;
+
+ if (auprobe->ops->pre_xol) {
+ int err = auprobe->ops->pre_xol(auprobe, regs);
+ if (err)
+ return err;
+ }
+
+ regs->ip = utask->xol_vaddr;
+ utask->autask.saved_trap_nr = current->thread.trap_nr;
+ current->thread.trap_nr = UPROBE_TRAP_NR;
+
+ utask->autask.saved_tf = !!(regs->flags & X86_EFLAGS_TF);
+ regs->flags |= X86_EFLAGS_TF;
+ if (test_tsk_thread_flag(current, TIF_BLOCKSTEP))
+ set_task_blockstep(current, false);
+
+ return 0;
+}
+
+/*
+ * If xol insn itself traps and generates a signal(Say,
+ * If the xol insn itself traps and generates a signal (say,
+ * SIGILL/SIGSEGV/etc), then detect the case where a single-stepped
+ * like do_page_fault/do_trap/etc sets thread.trap_nr != -1.
+ *
+ * arch_uprobe_pre_xol/arch_uprobe_post_xol save/restore thread.trap_nr,
+ * arch_uprobe_xol_was_trapped() simply checks that ->trap_nr is not equal to
+ * UPROBE_TRAP_NR == -1 set by arch_uprobe_pre_xol().
+ */
+bool arch_uprobe_xol_was_trapped(struct task_struct *t)
+{
+ if (t->thread.trap_nr != UPROBE_TRAP_NR)
+ return true;
+
+ return false;
+}
+
+/*
+ * Called after single-stepping. To avoid the SMP problems that can
+ * occur when we temporarily put back the original opcode to
+ * single-step, we single-stepped a copy of the instruction.
+ *
+ * This function prepares to resume execution after the single-step.
+ */
+int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ struct uprobe_task *utask = current->utask;
+ bool send_sigtrap = utask->autask.saved_tf;
+ int err = 0;
+
+ WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR);
+ current->thread.trap_nr = utask->autask.saved_trap_nr;
+
+ if (auprobe->ops->post_xol) {
+ err = auprobe->ops->post_xol(auprobe, regs);
+ if (err) {
+ /*
+ * Restore ->ip for restart or post mortem analysis.
+ * ->post_xol() must not return -ERESTART unless this
+ * is really possible.
+ */
+ regs->ip = utask->vaddr;
+ if (err == -ERESTART)
+ err = 0;
+ send_sigtrap = false;
+ }
+ }
+ /*
+ * arch_uprobe_pre_xol() doesn't save the state of TIF_BLOCKSTEP
+ * so we can get an extra SIGTRAP if we do not clear TF. We need
+ * to examine the opcode to make it right.
+ */
+ if (send_sigtrap)
+ send_sig(SIGTRAP, current, 0);
+
+ if (!utask->autask.saved_tf)
+ regs->flags &= ~X86_EFLAGS_TF;
+
+ return err;
+}
+
+/* callback routine for handling exceptions. */
+int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data)
+{
+ struct die_args *args = data;
+ struct pt_regs *regs = args->regs;
+ int ret = NOTIFY_DONE;
+
+ /* We are only interested in userspace traps */
+ if (regs && !user_mode(regs))
+ return NOTIFY_DONE;
+
+ switch (val) {
+ case DIE_INT3:
+ if (uprobe_pre_sstep_notifier(regs))
+ ret = NOTIFY_STOP;
+
+ break;
+
+ case DIE_DEBUG:
+ if (uprobe_post_sstep_notifier(regs))
+ ret = NOTIFY_STOP;
+
+ break;
+
+ default:
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * This function gets called when the XOL instruction either gets trapped or
+ * the thread has a fatal signal. Reset the instruction pointer to its
+ * probed address for the potential restart or for post mortem analysis.
+ */
+void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ struct uprobe_task *utask = current->utask;
+
+ if (auprobe->ops->abort)
+ auprobe->ops->abort(auprobe, regs);
+
+ current->thread.trap_nr = utask->autask.saved_trap_nr;
+ regs->ip = utask->vaddr;
+ /* clear TF if it was set by us in arch_uprobe_pre_xol() */
+ if (!utask->autask.saved_tf)
+ regs->flags &= ~X86_EFLAGS_TF;
+}
+
+static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ if (auprobe->ops->emulate)
+ return auprobe->ops->emulate(auprobe, regs);
+ return false;
+}
+
+bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ bool ret = __skip_sstep(auprobe, regs);
+ if (ret && (regs->flags & X86_EFLAGS_TF))
+ send_sig(SIGTRAP, current, 0);
+ return ret;
+}
+
+unsigned long
+arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs)
+{
+ int rasize = sizeof_long(regs), nleft;
+ unsigned long orig_ret_vaddr = 0; /* clear high bits for 32-bit apps */
+
+ if (copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize))
+ return -1;
+
+ /* check whether address has been already hijacked */
+ if (orig_ret_vaddr == trampoline_vaddr)
+ return orig_ret_vaddr;
+
+ nleft = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize);
+ if (likely(!nleft)) {
+ if (shstk_update_last_frame(trampoline_vaddr)) {
+ force_sig(SIGSEGV);
+ return -1;
+ }
+ return orig_ret_vaddr;
+ }
+
+ if (nleft != rasize) {
+ pr_err("return address clobbered: pid=%d, %%sp=%#lx, %%ip=%#lx\n",
+ current->pid, regs->sp, regs->ip);
+
+ force_sig(SIGSEGV);
+ }
+
+ return -1;
+}
+
+bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
+ struct pt_regs *regs)
+{
+ if (ctx == RP_CHECK_CALL) /* sp was just decremented by "call" insn */
+ return regs->sp < ret->stack;
+ else
+ return regs->sp <= ret->stack;
+}
+
+/*
+ * Heuristic-based check if uprobe is installed at the function entry.
+ *
+ * Under the assumption that user code is compiled with frame pointers,
+ * `push %rbp/%ebp` is a good indicator that we indeed are at a function entry.
+ *
+ * Similarly, `endbr64` (assuming 64-bit mode) is also a common pattern.
+ * If we get this wrong, the captured stack trace might have one extra bogus
+ * entry, but the rest of the stack trace will still be meaningful.
+ */
+bool is_uprobe_at_func_entry(struct pt_regs *regs)
+{
+ struct arch_uprobe *auprobe;
+
+ if (!current->utask)
+ return false;
+
+ auprobe = current->utask->auprobe;
+ if (!auprobe)
+ return false;
+
+ /* push %rbp/%ebp */
+ if (auprobe->insn[0] == 0x55)
+ return true;
+
+ /* endbr64 (64-bit only) */
+ if (user_64bit_mode(regs) && is_endbr((u32 *)auprobe->insn))
+ return true;
+
+ return false;
+}
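
The entry-detection heuristic above boils down to a byte check on the probed instruction. A minimal user-space sketch of that check, assuming a hypothetical helper name and the usual endbr64 encoding (f3 0f 1e fa):

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

/*
 * Illustrative stand-alone version of the heuristic: treat
 * "push %rbp/%ebp" (0x55) or, on 64-bit, the endbr64 marker as
 * evidence that the probe sits at a function entry.
 */
bool looks_like_func_entry(const uint8_t *insn, bool is_64bit)
{
	static const uint8_t endbr64[4] = { 0xf3, 0x0f, 0x1e, 0xfa };

	if (insn[0] == 0x55)			/* push %rbp / push %ebp */
		return true;

	if (is_64bit && memcmp(insn, endbr64, sizeof(endbr64)) == 0)
		return true;

	return false;
}
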
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c
deleted file mode 100644
index aeef529917e4..000000000000
--- a/arch/x86/kernel/uv_irq.c
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * This file is subject to the terms and conditions of the GNU General Public
- * License. See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * SGI UV IRQ functions
- *
- * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved.
- */
-
-#include <linux/module.h>
-#include <linux/irq.h>
-
-#include <asm/apic.h>
-#include <asm/uv/uv_irq.h>
-
-static void uv_noop(unsigned int irq)
-{
-}
-
-static unsigned int uv_noop_ret(unsigned int irq)
-{
- return 0;
-}
-
-static void uv_ack_apic(unsigned int irq)
-{
- ack_APIC_irq();
-}
-
-struct irq_chip uv_irq_chip = {
- .name = "UV-CORE",
- .startup = uv_noop_ret,
- .shutdown = uv_noop,
- .enable = uv_noop,
- .disable = uv_noop,
- .ack = uv_noop,
- .mask = uv_noop,
- .unmask = uv_noop,
- .eoi = uv_ack_apic,
- .end = uv_noop,
-};
-
-/*
- * Set up a mapping of an available irq and vector, and enable the specified
- * MMR that defines the MSI that is to be sent to the specified CPU when an
- * interrupt is raised.
- */
-int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
- unsigned long mmr_offset)
-{
- int irq;
- int ret;
-
- irq = create_irq();
- if (irq <= 0)
- return -EBUSY;
-
- ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset);
- if (ret != irq)
- destroy_irq(irq);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(uv_setup_irq);
-
-/*
- * Tear down a mapping of an irq and vector, and disable the specified MMR that
- * defined the MSI that was to be sent to the specified CPU when an interrupt
- * was raised.
- *
- * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq().
- */
-void uv_teardown_irq(unsigned int irq, int mmr_blade, unsigned long mmr_offset)
-{
- arch_disable_uv_irq(mmr_blade, mmr_offset);
- destroy_irq(irq);
-}
-EXPORT_SYMBOL_GPL(uv_teardown_irq);
diff --git a/arch/x86/kernel/uv_sysfs.c b/arch/x86/kernel/uv_sysfs.c
deleted file mode 100644
index 36afb98675a4..000000000000
--- a/arch/x86/kernel/uv_sysfs.c
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * This file supports the /sys/firmware/sgi_uv interfaces for SGI UV.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
- * Copyright (c) Russ Anderson
- */
-
-#include <linux/sysdev.h>
-#include <asm/uv/bios.h>
-#include <asm/uv/uv.h>
-
-struct kobject *sgi_uv_kobj;
-
-static ssize_t partition_id_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- return snprintf(buf, PAGE_SIZE, "%ld\n", sn_partition_id);
-}
-
-static ssize_t coherence_id_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- return snprintf(buf, PAGE_SIZE, "%ld\n", partition_coherence_id());
-}
-
-static struct kobj_attribute partition_id_attr =
- __ATTR(partition_id, S_IRUGO, partition_id_show, NULL);
-
-static struct kobj_attribute coherence_id_attr =
- __ATTR(coherence_id, S_IRUGO, coherence_id_show, NULL);
-
-
-static int __init sgi_uv_sysfs_init(void)
-{
- unsigned long ret;
-
- if (!is_uv_system())
- return -ENODEV;
-
- if (!sgi_uv_kobj)
- sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj);
- if (!sgi_uv_kobj) {
- printk(KERN_WARNING "kobject_create_and_add sgi_uv failed \n");
- return -EINVAL;
- }
-
- ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr);
- if (ret) {
- printk(KERN_WARNING "sysfs_create_file partition_id failed \n");
- return ret;
- }
-
- ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr);
- if (ret) {
- printk(KERN_WARNING "sysfs_create_file coherence_id failed \n");
- return ret;
- }
-
- return 0;
-}
-
-device_initcall(sgi_uv_sysfs_init);
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
new file mode 100644
index 000000000000..37ad43792452
--- /dev/null
+++ b/arch/x86/kernel/verify_cpu.S
@@ -0,0 +1,144 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ *
+ * verify_cpu.S - Code for cpu long mode and SSE verification. This
+ * code has been borrowed from boot/setup.S and was introduced by
+ * Andi Kleen.
+ *
+ * Copyright (c) 2007 Andi Kleen (ak@suse.de)
+ * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com)
+ * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com)
+ * Copyright (c) 2010 Kees Cook (kees.cook@canonical.com)
+ *
+ * This is common code for verifying whether the CPU supports
+ * long mode and SSE. It is not called directly; instead, this
+ * file is included at various places and compiled in that context.
+ * This file is expected to run as 32-bit code. Current users:
+ *
+ * arch/x86/boot/compressed/head_64.S: Boot cpu verification
+ * arch/x86/kernel/trampoline_64.S: secondary processor verification
+ * arch/x86/kernel/head_32.S: processor startup
+ *
+ * verify_cpu returns the status of long mode and SSE in register %eax:
+ * 0: Success    1: Failure
+ *
+ * On Intel, the XD_DISABLE flag will be cleared as a side-effect.
+ *
+ * The caller needs to check the error code and take appropriate
+ * action: either display a message or halt.
+ */
+
+#include <asm/cpufeatures.h>
+#include <asm/cpufeaturemasks.h>
+#include <asm/msr-index.h>
+
+#define SSE_MASK \
+ (REQUIRED_MASK0 & ((1<<(X86_FEATURE_XMM & 31)) | (1<<(X86_FEATURE_XMM2 & 31))))
+
+SYM_FUNC_START_LOCAL(verify_cpu)
+ pushf # Save caller passed flags
+ push $0 # Kill any dangerous flags
+ popf
+
+#ifndef __x86_64__
+ pushfl # standard way to check for cpuid
+ popl %eax
+ movl %eax,%ebx
+ xorl $0x200000,%eax
+ pushl %eax
+ popfl
+ pushfl
+ popl %eax
+ cmpl %eax,%ebx
+ jz .Lverify_cpu_no_longmode # cpu has no cpuid
+#endif
+
+ movl $0x0,%eax # See if cpuid 1 is implemented
+ cpuid
+ cmpl $0x1,%eax
+ jb .Lverify_cpu_no_longmode # no cpuid 1
+
+ xor %di,%di
+ cmpl $0x68747541,%ebx # AuthenticAMD
+ jnz .Lverify_cpu_noamd
+ cmpl $0x69746e65,%edx
+ jnz .Lverify_cpu_noamd
+ cmpl $0x444d4163,%ecx
+ jnz .Lverify_cpu_noamd
+ mov $1,%di # cpu is from AMD
+ jmp .Lverify_cpu_check
+
+.Lverify_cpu_noamd:
+ cmpl $0x756e6547,%ebx # GenuineIntel?
+ jnz .Lverify_cpu_check
+ cmpl $0x49656e69,%edx
+ jnz .Lverify_cpu_check
+ cmpl $0x6c65746e,%ecx
+ jnz .Lverify_cpu_check
+
+ # only call IA32_MISC_ENABLE when:
+ # family > 6 || (family == 6 && model >= 0xd)
+ movl $0x1, %eax # check CPU family and model
+ cpuid
+ movl %eax, %ecx
+
+ andl $0x0ff00f00, %eax # mask family and extended family
+ shrl $8, %eax
+ cmpl $6, %eax
+ ja .Lverify_cpu_clear_xd # family > 6, ok
+ jb .Lverify_cpu_check # family < 6, skip
+
+ andl $0x000f00f0, %ecx # mask model and extended model
+ shrl $4, %ecx
+ cmpl $0xd, %ecx
+ jb .Lverify_cpu_check # family == 6, model < 0xd, skip
+
+.Lverify_cpu_clear_xd:
+ movl $MSR_IA32_MISC_ENABLE, %ecx
+ rdmsr
+ btrl $2, %edx # clear MSR_IA32_MISC_ENABLE_XD_DISABLE
+ jnc .Lverify_cpu_check # only write MSR if bit was changed
+ wrmsr
+
+.Lverify_cpu_check:
+ movl $0x1,%eax # Does the cpu have what it takes
+ cpuid
+ andl $REQUIRED_MASK0,%edx
+ xorl $REQUIRED_MASK0,%edx
+ jnz .Lverify_cpu_no_longmode
+
+ movl $0x80000000,%eax # See if extended cpuid is implemented
+ cpuid
+ cmpl $0x80000001,%eax
+ jb .Lverify_cpu_no_longmode # no extended cpuid
+
+ movl $0x80000001,%eax # Does the cpu have what it takes
+ cpuid
+ andl $REQUIRED_MASK1,%edx
+ xorl $REQUIRED_MASK1,%edx
+ jnz .Lverify_cpu_no_longmode
+
+.Lverify_cpu_sse_test:
+ movl $1,%eax
+ cpuid
+ andl $SSE_MASK,%edx
+ cmpl $SSE_MASK,%edx
+ je .Lverify_cpu_sse_ok
+ test %di,%di
+ jz .Lverify_cpu_no_longmode # only try to force SSE on AMD
+ movl $MSR_K7_HWCR,%ecx
+ rdmsr
+ btr $15,%eax # enable SSE
+ wrmsr
+ xor %di,%di # don't loop
+ jmp .Lverify_cpu_sse_test # try again
+
+.Lverify_cpu_no_longmode:
+ popf # Restore caller passed flags
+ movl $1,%eax
+ RET
+.Lverify_cpu_sse_ok:
+ popf # Restore caller passed flags
+ xorl %eax, %eax
+ RET
+SYM_FUNC_END(verify_cpu)
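
The Intel-specific gate above only touches MSR_IA32_MISC_ENABLE for family > 6, or for family 6 with model >= 0xd. A C sketch of that family/model test, using the same masks as the assembly (the function name is illustrative):

/*
 * Decide whether to clear XD_DISABLE, mirroring the check in verify_cpu:
 * family > 6, or family == 6 and model >= 0xd. The masks combine the
 * base and extended family/model fields of CPUID leaf 1 EAX, exactly as
 * the assembly does.
 */
int should_clear_xd_disable(unsigned int cpuid_1_eax)
{
	unsigned int family = (cpuid_1_eax & 0x0ff00f00) >> 8; /* family + extended family */
	unsigned int model  = (cpuid_1_eax & 0x000f00f0) >> 4; /* model + extended model */

	if (family > 6)
		return 1;
	if (family < 6)
		return 0;
	return model >= 0xd;
}
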
diff --git a/arch/x86/kernel/verify_cpu_64.S b/arch/x86/kernel/verify_cpu_64.S
deleted file mode 100644
index 45b6f8a975a1..000000000000
--- a/arch/x86/kernel/verify_cpu_64.S
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- *
- * verify_cpu.S - Code for cpu long mode and SSE verification. This
- * code has been borrowed from boot/setup.S and was introduced by
- * Andi Kleen.
- *
- * Copyright (c) 2007 Andi Kleen (ak@suse.de)
- * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com)
- * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com)
- *
- * This source code is licensed under the GNU General Public License,
- * Version 2. See the file COPYING for more details.
- *
- * This is a common code for verification whether CPU supports
- * long mode and SSE or not. It is not called directly instead this
- * file is included at various places and compiled in that context.
- * Following are the current usage.
- *
- * This file is included by both 16bit and 32bit code.
- *
- * arch/x86_64/boot/setup.S : Boot cpu verification (16bit)
- * arch/x86_64/boot/compressed/head.S: Boot cpu verification (32bit)
- * arch/x86_64/kernel/trampoline.S: secondary processor verfication (16bit)
- * arch/x86_64/kernel/acpi/wakeup.S:Verfication at resume (16bit)
- *
- * verify_cpu, returns the status of cpu check in register %eax.
- * 0: Success 1: Failure
- *
- * The caller needs to check for the error code and take the action
- * appropriately. Either display a message or halt.
- */
-
-#include <asm/cpufeature.h>
-
-verify_cpu:
- pushfl # Save caller passed flags
- pushl $0 # Kill any dangerous flags
- popfl
-
- pushfl # standard way to check for cpuid
- popl %eax
- movl %eax,%ebx
- xorl $0x200000,%eax
- pushl %eax
- popfl
- pushfl
- popl %eax
- cmpl %eax,%ebx
- jz verify_cpu_no_longmode # cpu has no cpuid
-
- movl $0x0,%eax # See if cpuid 1 is implemented
- cpuid
- cmpl $0x1,%eax
- jb verify_cpu_no_longmode # no cpuid 1
-
- xor %di,%di
- cmpl $0x68747541,%ebx # AuthenticAMD
- jnz verify_cpu_noamd
- cmpl $0x69746e65,%edx
- jnz verify_cpu_noamd
- cmpl $0x444d4163,%ecx
- jnz verify_cpu_noamd
- mov $1,%di # cpu is from AMD
-
-verify_cpu_noamd:
- movl $0x1,%eax # Does the cpu have what it takes
- cpuid
- andl $REQUIRED_MASK0,%edx
- xorl $REQUIRED_MASK0,%edx
- jnz verify_cpu_no_longmode
-
- movl $0x80000000,%eax # See if extended cpuid is implemented
- cpuid
- cmpl $0x80000001,%eax
- jb verify_cpu_no_longmode # no extended cpuid
-
- movl $0x80000001,%eax # Does the cpu have what it takes
- cpuid
- andl $REQUIRED_MASK1,%edx
- xorl $REQUIRED_MASK1,%edx
- jnz verify_cpu_no_longmode
-
-verify_cpu_sse_test:
- movl $1,%eax
- cpuid
- andl $SSE_MASK,%edx
- cmpl $SSE_MASK,%edx
- je verify_cpu_sse_ok
- test %di,%di
- jz verify_cpu_no_longmode # only try to force SSE on AMD
- movl $0xc0010015,%ecx # HWCR
- rdmsr
- btr $15,%eax # enable SSE
- wrmsr
- xor %di,%di # don't loop
- jmp verify_cpu_sse_test # try again
-
-verify_cpu_no_longmode:
- popfl # Restore caller passed flags
- movl $1,%eax
- ret
-verify_cpu_sse_ok:
- popfl # Restore caller passed flags
- xorl %eax, %eax
- ret
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
deleted file mode 100644
index 31ffc24eec4d..000000000000
--- a/arch/x86/kernel/visws_quirks.c
+++ /dev/null
@@ -1,687 +0,0 @@
-/*
- * SGI Visual Workstation support and quirks, unmaintained.
- *
- * Split out from setup.c by davej@suse.de
- *
- * Copyright (C) 1999 Bent Hagemark, Ingo Molnar
- *
- * SGI Visual Workstation interrupt controller
- *
- * The Cobalt system ASIC in the Visual Workstation contains a "Cobalt" APIC
- * which serves as the main interrupt controller in the system. Non-legacy
- * hardware in the system uses this controller directly. Legacy devices
- * are connected to the PIIX4 which in turn has its 8259(s) connected to
- * a of the Cobalt APIC entry.
- *
- * 09/02/2000 - Updated for 2.4 by jbarnes@sgi.com
- *
- * 25/11/2002 - Updated for 2.5 by Andrey Panin <pazke@orbita1.ru>
- */
-#include <linux/interrupt.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-
-#include <asm/visws/cobalt.h>
-#include <asm/visws/piix4.h>
-#include <asm/io_apic.h>
-#include <asm/fixmap.h>
-#include <asm/reboot.h>
-#include <asm/setup.h>
-#include <asm/apic.h>
-#include <asm/e820.h>
-#include <asm/io.h>
-
-#include <linux/kernel_stat.h>
-
-#include <asm/i8259.h>
-#include <asm/irq_vectors.h>
-#include <asm/visws/lithium.h>
-
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/pci.h>
-#include <linux/pci_ids.h>
-
-extern int no_broadcast;
-
-char visws_board_type = -1;
-char visws_board_rev = -1;
-
-int is_visws_box(void)
-{
- return visws_board_type >= 0;
-}
-
-static int __init visws_time_init(void)
-{
- printk(KERN_INFO "Starting Cobalt Timer system clock\n");
-
- /* Set the countdown value */
- co_cpu_write(CO_CPU_TIMEVAL, CO_TIME_HZ/HZ);
-
- /* Start the timer */
- co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) | CO_CTRL_TIMERUN);
-
- /* Enable (unmask) the timer interrupt */
- co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK);
-
- /*
- * Zero return means the generic timer setup code will set up
- * the standard vector:
- */
- return 0;
-}
-
-static int __init visws_pre_intr_init(void)
-{
- init_VISWS_APIC_irqs();
-
- /*
- * We dont want ISA irqs to be set up by the generic code:
- */
- return 1;
-}
-
-/* Quirk for machine specific memory setup. */
-
-#define MB (1024 * 1024)
-
-unsigned long sgivwfb_mem_phys;
-unsigned long sgivwfb_mem_size;
-EXPORT_SYMBOL(sgivwfb_mem_phys);
-EXPORT_SYMBOL(sgivwfb_mem_size);
-
-long long mem_size __initdata = 0;
-
-static char * __init visws_memory_setup(void)
-{
- long long gfx_mem_size = 8 * MB;
-
- mem_size = boot_params.alt_mem_k;
-
- if (!mem_size) {
- printk(KERN_WARNING "Bootloader didn't set memory size, upgrade it !\n");
- mem_size = 128 * MB;
- }
-
- /*
- * this hardcodes the graphics memory to 8 MB
- * it really should be sized dynamically (or at least
- * set as a boot param)
- */
- if (!sgivwfb_mem_size) {
- printk(KERN_WARNING "Defaulting to 8 MB framebuffer size\n");
- sgivwfb_mem_size = 8 * MB;
- }
-
- /*
- * Trim to nearest MB
- */
- sgivwfb_mem_size &= ~((1 << 20) - 1);
- sgivwfb_mem_phys = mem_size - gfx_mem_size;
-
- e820_add_region(0, LOWMEMSIZE(), E820_RAM);
- e820_add_region(HIGH_MEMORY, mem_size - sgivwfb_mem_size - HIGH_MEMORY, E820_RAM);
- e820_add_region(sgivwfb_mem_phys, sgivwfb_mem_size, E820_RESERVED);
-
- return "PROM";
-}
-
-static void visws_machine_emergency_restart(void)
-{
- /*
- * Visual Workstations restart after this
- * register is poked on the PIIX4
- */
- outb(PIIX4_RESET_VAL, PIIX4_RESET_PORT);
-}
-
-static void visws_machine_power_off(void)
-{
- unsigned short pm_status;
-/* extern unsigned int pci_bus0; */
-
- while ((pm_status = inw(PMSTS_PORT)) & 0x100)
- outw(pm_status, PMSTS_PORT);
-
- outw(PM_SUSPEND_ENABLE, PMCNTRL_PORT);
-
- mdelay(10);
-
-#define PCI_CONF1_ADDRESS(bus, devfn, reg) \
- (0x80000000 | (bus << 16) | (devfn << 8) | (reg & ~3))
-
-/* outl(PCI_CONF1_ADDRESS(pci_bus0, SPECIAL_DEV, SPECIAL_REG), 0xCF8); */
- outl(PIIX_SPECIAL_STOP, 0xCFC);
-}
-
-static int __init visws_get_smp_config(unsigned int early)
-{
- /*
- * Prevent MP-table parsing by the generic code:
- */
- return 1;
-}
-
-/*
- * The Visual Workstation is Intel MP compliant in the hardware
- * sense, but it doesn't have a BIOS(-configuration table).
- * No problem for Linux.
- */
-
-static void __init MP_processor_info(struct mpc_cpu *m)
-{
- int ver, logical_apicid;
- physid_mask_t apic_cpus;
-
- if (!(m->cpuflag & CPU_ENABLED))
- return;
-
- logical_apicid = m->apicid;
- printk(KERN_INFO "%sCPU #%d %u:%u APIC version %d\n",
- m->cpuflag & CPU_BOOTPROCESSOR ? "Bootup " : "",
- m->apicid, (m->cpufeature & CPU_FAMILY_MASK) >> 8,
- (m->cpufeature & CPU_MODEL_MASK) >> 4, m->apicver);
-
- if (m->cpuflag & CPU_BOOTPROCESSOR)
- boot_cpu_physical_apicid = m->apicid;
-
- ver = m->apicver;
- if ((ver >= 0x14 && m->apicid >= 0xff) || m->apicid >= 0xf) {
- printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
- m->apicid, MAX_APICS);
- return;
- }
-
- apic_cpus = apic->apicid_to_cpu_present(m->apicid);
- physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus);
- /*
- * Validate version
- */
- if (ver == 0x0) {
- printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! "
- "fixing up to 0x10. (tell your hw vendor)\n",
- m->apicid);
- ver = 0x10;
- }
- apic_version[m->apicid] = ver;
-}
-
-static int __init visws_find_smp_config(unsigned int reserve)
-{
- struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS);
- unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
-
- if (ncpus > CO_CPU_MAX) {
- printk(KERN_WARNING "find_visws_smp: got cpu count of %d at %p\n",
- ncpus, mp);
-
- ncpus = CO_CPU_MAX;
- }
-
- if (ncpus > setup_max_cpus)
- ncpus = setup_max_cpus;
-
-#ifdef CONFIG_X86_LOCAL_APIC
- smp_found_config = 1;
-#endif
- while (ncpus--)
- MP_processor_info(mp++);
-
- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-
- return 1;
-}
-
-static int visws_trap_init(void);
-
-static struct x86_quirks visws_x86_quirks __initdata = {
- .arch_time_init = visws_time_init,
- .arch_pre_intr_init = visws_pre_intr_init,
- .arch_memory_setup = visws_memory_setup,
- .arch_intr_init = NULL,
- .arch_trap_init = visws_trap_init,
- .mach_get_smp_config = visws_get_smp_config,
- .mach_find_smp_config = visws_find_smp_config,
-};
-
-void __init visws_early_detect(void)
-{
- int raw;
-
- visws_board_type = (char)(inb_p(PIIX_GPI_BD_REG) & PIIX_GPI_BD_REG)
- >> PIIX_GPI_BD_SHIFT;
-
- if (visws_board_type < 0)
- return;
-
- /*
- * Install special quirks for timer, interrupt and memory setup:
- * Fall back to generic behavior for traps:
- * Override generic MP-table parsing:
- */
- x86_quirks = &visws_x86_quirks;
-
- /*
- * Install reboot quirks:
- */
- pm_power_off = visws_machine_power_off;
- machine_ops.emergency_restart = visws_machine_emergency_restart;
-
- /*
- * Do not use broadcast IPIs:
- */
- no_broadcast = 0;
-
-#ifdef CONFIG_X86_IO_APIC
- /*
- * Turn off IO-APIC detection and initialization:
- */
- skip_ioapic_setup = 1;
-#endif
-
- /*
- * Get Board rev.
- * First, we have to initialize the 307 part to allow us access
- * to the GPIO registers. Let's map them at 0x0fc0 which is right
- * after the PIIX4 PM section.
- */
- outb_p(SIO_DEV_SEL, SIO_INDEX);
- outb_p(SIO_GP_DEV, SIO_DATA); /* Talk to GPIO regs. */
-
- outb_p(SIO_DEV_MSB, SIO_INDEX);
- outb_p(SIO_GP_MSB, SIO_DATA); /* MSB of GPIO base address */
-
- outb_p(SIO_DEV_LSB, SIO_INDEX);
- outb_p(SIO_GP_LSB, SIO_DATA); /* LSB of GPIO base address */
-
- outb_p(SIO_DEV_ENB, SIO_INDEX);
- outb_p(1, SIO_DATA); /* Enable GPIO registers. */
-
- /*
- * Now, we have to map the power management section to write
- * a bit which enables access to the GPIO registers.
- * What lunatic came up with this shit?
- */
- outb_p(SIO_DEV_SEL, SIO_INDEX);
- outb_p(SIO_PM_DEV, SIO_DATA); /* Talk to GPIO regs. */
-
- outb_p(SIO_DEV_MSB, SIO_INDEX);
- outb_p(SIO_PM_MSB, SIO_DATA); /* MSB of PM base address */
-
- outb_p(SIO_DEV_LSB, SIO_INDEX);
- outb_p(SIO_PM_LSB, SIO_DATA); /* LSB of PM base address */
-
- outb_p(SIO_DEV_ENB, SIO_INDEX);
- outb_p(1, SIO_DATA); /* Enable PM registers. */
-
- /*
- * Now, write the PM register which enables the GPIO registers.
- */
- outb_p(SIO_PM_FER2, SIO_PM_INDEX);
- outb_p(SIO_PM_GP_EN, SIO_PM_DATA);
-
- /*
- * Now, initialize the GPIO registers.
- * We want them all to be inputs which is the
- * power on default, so let's leave them alone.
- * So, let's just read the board rev!
- */
- raw = inb_p(SIO_GP_DATA1);
- raw &= 0x7f; /* 7 bits of valid board revision ID. */
-
- if (visws_board_type == VISWS_320) {
- if (raw < 0x6) {
- visws_board_rev = 4;
- } else if (raw < 0xc) {
- visws_board_rev = 5;
- } else {
- visws_board_rev = 6;
- }
- } else if (visws_board_type == VISWS_540) {
- visws_board_rev = 2;
- } else {
- visws_board_rev = raw;
- }
-
- printk(KERN_INFO "Silicon Graphics Visual Workstation %s (rev %d) detected\n",
- (visws_board_type == VISWS_320 ? "320" :
- (visws_board_type == VISWS_540 ? "540" :
- "unknown")), visws_board_rev);
-}
-
-#define A01234 (LI_INTA_0 | LI_INTA_1 | LI_INTA_2 | LI_INTA_3 | LI_INTA_4)
-#define BCD (LI_INTB | LI_INTC | LI_INTD)
-#define ALLDEVS (A01234 | BCD)
-
-static __init void lithium_init(void)
-{
- set_fixmap(FIX_LI_PCIA, LI_PCI_A_PHYS);
- set_fixmap(FIX_LI_PCIB, LI_PCI_B_PHYS);
-
- if ((li_pcia_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) ||
- (li_pcia_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) {
- printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'A');
-/* panic("This machine is not SGI Visual Workstation 320/540"); */
- }
-
- if ((li_pcib_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) ||
- (li_pcib_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) {
- printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'B');
-/* panic("This machine is not SGI Visual Workstation 320/540"); */
- }
-
- li_pcia_write16(LI_PCI_INTEN, ALLDEVS);
- li_pcib_write16(LI_PCI_INTEN, ALLDEVS);
-}
-
-static __init void cobalt_init(void)
-{
- /*
- * On normal SMP PC this is used only with SMP, but we have to
- * use it and set it up here to start the Cobalt clock
- */
- set_fixmap(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE);
- setup_local_APIC();
- printk(KERN_INFO "Local APIC Version %#x, ID %#x\n",
- (unsigned int)apic_read(APIC_LVR),
- (unsigned int)apic_read(APIC_ID));
-
- set_fixmap(FIX_CO_CPU, CO_CPU_PHYS);
- set_fixmap(FIX_CO_APIC, CO_APIC_PHYS);
- printk(KERN_INFO "Cobalt Revision %#lx, APIC ID %#lx\n",
- co_cpu_read(CO_CPU_REV), co_apic_read(CO_APIC_ID));
-
- /* Enable Cobalt APIC being careful to NOT change the ID! */
- co_apic_write(CO_APIC_ID, co_apic_read(CO_APIC_ID) | CO_APIC_ENABLE);
-
- printk(KERN_INFO "Cobalt APIC enabled: ID reg %#lx\n",
- co_apic_read(CO_APIC_ID));
-}
-
-static int __init visws_trap_init(void)
-{
- lithium_init();
- cobalt_init();
-
- return 1;
-}
-
-/*
- * IRQ controller / APIC support:
- */
-
-static DEFINE_SPINLOCK(cobalt_lock);
-
-/*
- * Set the given Cobalt APIC Redirection Table entry to point
- * to the given IDT vector/index.
- */
-static inline void co_apic_set(int entry, int irq)
-{
- co_apic_write(CO_APIC_LO(entry), CO_APIC_LEVEL | (irq + FIRST_EXTERNAL_VECTOR));
- co_apic_write(CO_APIC_HI(entry), 0);
-}
-
-/*
- * Cobalt (IO)-APIC functions to handle PCI devices.
- */
-static inline int co_apic_ide0_hack(void)
-{
- extern char visws_board_type;
- extern char visws_board_rev;
-
- if (visws_board_type == VISWS_320 && visws_board_rev == 5)
- return 5;
- return CO_APIC_IDE0;
-}
-
-static int is_co_apic(unsigned int irq)
-{
- if (IS_CO_APIC(irq))
- return CO_APIC(irq);
-
- switch (irq) {
- case 0: return CO_APIC_CPU;
- case CO_IRQ_IDE0: return co_apic_ide0_hack();
- case CO_IRQ_IDE1: return CO_APIC_IDE1;
- default: return -1;
- }
-}
-
-
-/*
- * This is the SGI Cobalt (IO-)APIC:
- */
-
-static void enable_cobalt_irq(unsigned int irq)
-{
- co_apic_set(is_co_apic(irq), irq);
-}
-
-static void disable_cobalt_irq(unsigned int irq)
-{
- int entry = is_co_apic(irq);
-
- co_apic_write(CO_APIC_LO(entry), CO_APIC_MASK);
- co_apic_read(CO_APIC_LO(entry));
-}
-
-/*
- * "irq" really just serves to identify the device. Here is where we
- * map this to the Cobalt APIC entry where it's physically wired.
- * This is called via request_irq -> setup_irq -> irq_desc->startup()
- */
-static unsigned int startup_cobalt_irq(unsigned int irq)
-{
- unsigned long flags;
- struct irq_desc *desc = irq_to_desc(irq);
-
- spin_lock_irqsave(&cobalt_lock, flags);
- if ((desc->status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING)))
- desc->status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING);
- enable_cobalt_irq(irq);
- spin_unlock_irqrestore(&cobalt_lock, flags);
- return 0;
-}
-
-static void ack_cobalt_irq(unsigned int irq)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&cobalt_lock, flags);
- disable_cobalt_irq(irq);
- apic_write(APIC_EOI, APIC_EIO_ACK);
- spin_unlock_irqrestore(&cobalt_lock, flags);
-}
-
-static void end_cobalt_irq(unsigned int irq)
-{
- unsigned long flags;
- struct irq_desc *desc = irq_to_desc(irq);
-
- spin_lock_irqsave(&cobalt_lock, flags);
- if (!(desc->status & (IRQ_DISABLED | IRQ_INPROGRESS)))
- enable_cobalt_irq(irq);
- spin_unlock_irqrestore(&cobalt_lock, flags);
-}
-
-static struct irq_chip cobalt_irq_type = {
- .typename = "Cobalt-APIC",
- .startup = startup_cobalt_irq,
- .shutdown = disable_cobalt_irq,
- .enable = enable_cobalt_irq,
- .disable = disable_cobalt_irq,
- .ack = ack_cobalt_irq,
- .end = end_cobalt_irq,
-};
-
-
-/*
- * This is the PIIX4-based 8259 that is wired up indirectly to Cobalt
- * -- not the manner expected by the code in i8259.c.
- *
- * there is a 'master' physical interrupt source that gets sent to
- * the CPU. But in the chipset there are various 'virtual' interrupts
- * waiting to be handled. We represent this to Linux through a 'master'
- * interrupt controller type, and through a special virtual interrupt-
- * controller. Device drivers only see the virtual interrupt sources.
- */
-static unsigned int startup_piix4_master_irq(unsigned int irq)
-{
- init_8259A(0);
-
- return startup_cobalt_irq(irq);
-}
-
-static void end_piix4_master_irq(unsigned int irq)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&cobalt_lock, flags);
- enable_cobalt_irq(irq);
- spin_unlock_irqrestore(&cobalt_lock, flags);
-}
-
-static struct irq_chip piix4_master_irq_type = {
- .typename = "PIIX4-master",
- .startup = startup_piix4_master_irq,
- .ack = ack_cobalt_irq,
- .end = end_piix4_master_irq,
-};
-
-
-static struct irq_chip piix4_virtual_irq_type = {
- .typename = "PIIX4-virtual",
- .shutdown = disable_8259A_irq,
- .enable = enable_8259A_irq,
- .disable = disable_8259A_irq,
-};
-
-
-/*
- * PIIX4-8259 master/virtual functions to handle interrupt requests
- * from legacy devices: floppy, parallel, serial, rtc.
- *
- * None of these get Cobalt APIC entries, neither do they have IDT
- * entries. These interrupts are purely virtual and distributed from
- * the 'master' interrupt source: CO_IRQ_8259.
- *
- * When the 8259 interrupts its handler figures out which of these
- * devices is interrupting and dispatches to its handler.
- *
- * CAREFUL: devices see the 'virtual' interrupt only. Thus disable/
- * enable_irq gets the right irq. This 'master' irq is never directly
- * manipulated by any driver.
- */
-static irqreturn_t piix4_master_intr(int irq, void *dev_id)
-{
- int realirq;
- struct irq_desc *desc;
- unsigned long flags;
-
- spin_lock_irqsave(&i8259A_lock, flags);
-
- /* Find out what's interrupting in the PIIX4 master 8259 */
- outb(0x0c, 0x20); /* OCW3 Poll command */
- realirq = inb(0x20);
-
- /*
- * Bit 7 == 0 means invalid/spurious
- */
- if (unlikely(!(realirq & 0x80)))
- goto out_unlock;
-
- realirq &= 7;
-
- if (unlikely(realirq == 2)) {
- outb(0x0c, 0xa0);
- realirq = inb(0xa0);
-
- if (unlikely(!(realirq & 0x80)))
- goto out_unlock;
-
- realirq = (realirq & 7) + 8;
- }
-
- /* mask and ack interrupt */
- cached_irq_mask |= 1 << realirq;
- if (unlikely(realirq > 7)) {
- inb(0xa1);
- outb(cached_slave_mask, 0xa1);
- outb(0x60 + (realirq & 7), 0xa0);
- outb(0x60 + 2, 0x20);
- } else {
- inb(0x21);
- outb(cached_master_mask, 0x21);
- outb(0x60 + realirq, 0x20);
- }
-
- spin_unlock_irqrestore(&i8259A_lock, flags);
-
- desc = irq_to_desc(realirq);
-
- /*
- * handle this 'virtual interrupt' as a Cobalt one now.
- */
- kstat_incr_irqs_this_cpu(realirq, desc);
-
- if (likely(desc->action != NULL))
- handle_IRQ_event(realirq, desc->action);
-
- if (!(desc->status & IRQ_DISABLED))
- enable_8259A_irq(realirq);
-
- return IRQ_HANDLED;
-
-out_unlock:
- spin_unlock_irqrestore(&i8259A_lock, flags);
- return IRQ_NONE;
-}
-
-static struct irqaction master_action = {
- .handler = piix4_master_intr,
- .name = "PIIX4-8259",
-};
-
-static struct irqaction cascade_action = {
- .handler = no_action,
- .name = "cascade",
-};
-
-
-void init_VISWS_APIC_irqs(void)
-{
- int i;
-
- for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) {
- struct irq_desc *desc = irq_to_desc(i);
-
- desc->status = IRQ_DISABLED;
- desc->action = 0;
- desc->depth = 1;
-
- if (i == 0) {
- desc->chip = &cobalt_irq_type;
- }
- else if (i == CO_IRQ_IDE0) {
- desc->chip = &cobalt_irq_type;
- }
- else if (i == CO_IRQ_IDE1) {
- desc->chip = &cobalt_irq_type;
- }
- else if (i == CO_IRQ_8259) {
- desc->chip = &piix4_master_irq_type;
- }
- else if (i < CO_IRQ_APIC0) {
- desc->chip = &piix4_virtual_irq_type;
- }
- else if (IS_CO_APIC(i)) {
- desc->chip = &cobalt_irq_type;
- }
- }
-
- setup_irq(CO_IRQ_8259, &master_action);
- setup_irq(2, &cascade_action);
-}
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 9c4e62539058..e6cc84143f3e 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 1994 Linus Torvalds
*
@@ -28,10 +29,14 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
+#include <linux/syscalls.h>
#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/string.h>
@@ -41,12 +46,16 @@
#include <linux/ptrace.h>
#include <linux/audit.h>
#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/security.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/io.h>
#include <asm/tlbflush.h>
#include <asm/irq.h>
-#include <asm/syscalls.h>
+#include <asm/traps.h>
+#include <asm/vm86.h>
+#include <asm/switch_to.h>
/*
* Known problems:
@@ -64,10 +73,6 @@
*/
-#define KVM86 ((struct kernel_vm86_struct *)regs)
-#define VMPI KVM86->vm86plus
-
-
/*
* 8- and 16-bit register defines..
*/
@@ -79,8 +84,8 @@
/*
* virtual flags (16 and 32-bit versions)
*/
-#define VFLAGS (*(unsigned short *)&(current->thread.v86flags))
-#define VEFLAGS (current->thread.v86flags)
+#define VFLAGS (*(unsigned short *)&(current->thread.vm86->veflags))
+#define VEFLAGS (current->thread.vm86->veflags)
#define set_flags(X, new, mask) \
((X) = ((X) & ~(mask)) | ((new) & (mask)))
@@ -88,46 +93,11 @@
#define SAFE_MASK (0xDD5)
#define RETURN_MASK (0xDFF)
-/* convert kernel_vm86_regs to vm86_regs */
-static int copy_vm86_regs_to_user(struct vm86_regs __user *user,
- const struct kernel_vm86_regs *regs)
-{
- int ret = 0;
-
- /*
- * kernel_vm86_regs is missing gs, so copy everything up to
- * (but not including) orig_eax, and then rest including orig_eax.
- */
- ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_ax));
- ret += copy_to_user(&user->orig_eax, &regs->pt.orig_ax,
- sizeof(struct kernel_vm86_regs) -
- offsetof(struct kernel_vm86_regs, pt.orig_ax));
-
- return ret;
-}
-
-/* convert vm86_regs to kernel_vm86_regs */
-static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs,
- const struct vm86_regs __user *user,
- unsigned extra)
-{
- int ret = 0;
-
- /* copy ax-fs inclusive */
- ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_ax));
- /* copy orig_ax-__gsh+extra */
- ret += copy_from_user(&regs->pt.orig_ax, &user->orig_eax,
- sizeof(struct kernel_vm86_regs) -
- offsetof(struct kernel_vm86_regs, pt.orig_ax) +
- extra);
- return ret;
-}
-
-struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
+void save_v86_state(struct kernel_vm86_regs *regs, int retval)
{
- struct tss_struct *tss;
- struct pt_regs *ret;
- unsigned long tmp;
+ struct task_struct *tsk = current;
+ struct vm86plus_struct __user *user;
+ struct vm86 *vm86 = current->thread.vm86;
/*
* This gets called from entry.S with interrupts disabled, but
@@ -136,116 +106,81 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
*/
local_irq_enable();
- if (!current->thread.vm86_info) {
- printk("no vm86_info: BAD\n");
- do_exit(SIGSEGV);
- }
- set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | current->thread.v86mask);
- tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs, regs);
- tmp += put_user(current->thread.screen_bitmap, &current->thread.vm86_info->screen_bitmap);
- if (tmp) {
- printk("vm86: could not access userspace vm86_info\n");
- do_exit(SIGSEGV);
- }
-
- tss = &per_cpu(init_tss, get_cpu());
- current->thread.sp0 = current->thread.saved_sp0;
- current->thread.sysenter_cs = __KERNEL_CS;
- load_sp0(tss, &current->thread);
- current->thread.saved_sp0 = 0;
- put_cpu();
+ BUG_ON(!vm86);
+
+ set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->veflags_mask);
+ user = vm86->user_vm86;
+
+ if (!user_access_begin(user, vm86->vm86plus.is_vm86pus ?
+ sizeof(struct vm86plus_struct) :
+ sizeof(struct vm86_struct)))
+ goto Efault;
+
+ unsafe_put_user(regs->pt.bx, &user->regs.ebx, Efault_end);
+ unsafe_put_user(regs->pt.cx, &user->regs.ecx, Efault_end);
+ unsafe_put_user(regs->pt.dx, &user->regs.edx, Efault_end);
+ unsafe_put_user(regs->pt.si, &user->regs.esi, Efault_end);
+ unsafe_put_user(regs->pt.di, &user->regs.edi, Efault_end);
+ unsafe_put_user(regs->pt.bp, &user->regs.ebp, Efault_end);
+ unsafe_put_user(regs->pt.ax, &user->regs.eax, Efault_end);
+ unsafe_put_user(regs->pt.ip, &user->regs.eip, Efault_end);
+ unsafe_put_user(regs->pt.cs, &user->regs.cs, Efault_end);
+ unsafe_put_user(regs->pt.flags, &user->regs.eflags, Efault_end);
+ unsafe_put_user(regs->pt.sp, &user->regs.esp, Efault_end);
+ unsafe_put_user(regs->pt.ss, &user->regs.ss, Efault_end);
+ unsafe_put_user(regs->es, &user->regs.es, Efault_end);
+ unsafe_put_user(regs->ds, &user->regs.ds, Efault_end);
+ unsafe_put_user(regs->fs, &user->regs.fs, Efault_end);
+ unsafe_put_user(regs->gs, &user->regs.gs, Efault_end);
- ret = KVM86->regs32;
+ /*
+ * Don't write screen_bitmap in case some user had a value there
+ * and expected it to remain unchanged.
+ */
- ret->fs = current->thread.saved_fs;
- set_user_gs(ret, current->thread.saved_gs);
+ user_access_end();
- return ret;
-}
+exit_vm86:
+ preempt_disable();
+ tsk->thread.sp0 = vm86->saved_sp0;
+ tsk->thread.sysenter_cs = __KERNEL_CS;
+ update_task_stack(tsk);
+ refresh_sysenter_cs(&tsk->thread);
+ vm86->saved_sp0 = 0;
+ preempt_enable();
-static void mark_screen_rdonly(struct mm_struct *mm)
-{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
- spinlock_t *ptl;
- int i;
+ memcpy(&regs->pt, &vm86->regs32, sizeof(struct pt_regs));
- pgd = pgd_offset(mm, 0xA0000);
- if (pgd_none_or_clear_bad(pgd))
- goto out;
- pud = pud_offset(pgd, 0xA0000);
- if (pud_none_or_clear_bad(pud))
- goto out;
- pmd = pmd_offset(pud, 0xA0000);
- if (pmd_none_or_clear_bad(pmd))
- goto out;
- pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
- for (i = 0; i < 32; i++) {
- if (pte_present(*pte))
- set_pte(pte, pte_wrprotect(*pte));
- pte++;
- }
- pte_unmap_unlock(pte, ptl);
-out:
- flush_tlb();
-}
+ loadsegment(gs, vm86->regs32.gs);
+ regs->pt.ax = retval;
+ return;
+Efault_end:
+ user_access_end();
+Efault:
+ pr_alert("could not access userspace vm86 info\n");
+ force_exit_sig(SIGSEGV);
+ goto exit_vm86;
+}
static int do_vm86_irq_handling(int subfunction, int irqnumber);
-static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk);
+static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus);
-int sys_vm86old(struct pt_regs *regs)
+SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, user_vm86)
{
- struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs->bx;
- struct kernel_vm86_struct info; /* declare this _on top_,
- * this avoids wasting of stack space.
- * This remains on the stack until we
- * return to 32 bit user space.
- */
- struct task_struct *tsk;
- int tmp, ret = -EPERM;
-
- tsk = current;
- if (tsk->thread.saved_sp0)
- goto out;
- tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
- offsetof(struct kernel_vm86_struct, vm86plus) -
- sizeof(info.regs));
- ret = -EFAULT;
- if (tmp)
- goto out;
- memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus);
- info.regs32 = regs;
- tsk->thread.vm86_info = v86;
- do_sys_vm86(&info, tsk);
- ret = 0; /* we never return here */
-out:
- return ret;
+ return do_sys_vm86((struct vm86plus_struct __user *) user_vm86, false);
}
-int sys_vm86(struct pt_regs *regs)
+SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg)
{
- struct kernel_vm86_struct info; /* declare this _on top_,
- * this avoids wasting of stack space.
- * This remains on the stack until we
- * return to 32 bit user space.
- */
- struct task_struct *tsk;
- int tmp, ret;
- struct vm86plus_struct __user *v86;
-
- tsk = current;
- switch (regs->bx) {
+ switch (cmd) {
case VM86_REQUEST_IRQ:
case VM86_FREE_IRQ:
case VM86_GET_IRQ_BITS:
case VM86_GET_AND_RESET_IRQ:
- ret = do_vm86_irq_handling(regs->bx, (int)regs->cx);
- goto out;
+ return do_vm86_irq_handling(cmd, (int)arg);
case VM86_PLUS_INSTALL_CHECK:
/*
* NOTE: on old vm86 stuff this will return the error
@@ -253,121 +188,163 @@ int sys_vm86(struct pt_regs *regs)
* interpreted as (invalid) address to vm86_struct.
* So the installation check works.
*/
- ret = 0;
- goto out;
+ return 0;
}
/* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */
- ret = -EPERM;
- if (tsk->thread.saved_sp0)
- goto out;
- v86 = (struct vm86plus_struct __user *)regs->cx;
- tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
- offsetof(struct kernel_vm86_struct, regs32) -
- sizeof(info.regs));
- ret = -EFAULT;
- if (tmp)
- goto out;
- info.regs32 = regs;
- info.vm86plus.is_vm86pus = 1;
- tsk->thread.vm86_info = (struct vm86_struct __user *)v86;
- do_sys_vm86(&info, tsk);
- ret = 0; /* we never return here */
-out:
- return ret;
+ return do_sys_vm86((struct vm86plus_struct __user *) arg, true);
}
-static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk)
+static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
{
- struct tss_struct *tss;
-/*
- * make sure the vm86() system call doesn't try to do anything silly
- */
- info->regs.pt.ds = 0;
- info->regs.pt.es = 0;
- info->regs.pt.fs = 0;
-#ifndef CONFIG_X86_32_LAZY_GS
- info->regs.pt.gs = 0;
-#endif
+ struct task_struct *tsk = current;
+ struct vm86 *vm86 = tsk->thread.vm86;
+ struct kernel_vm86_regs vm86regs;
+ struct pt_regs *regs = current_pt_regs();
+ unsigned long err = 0;
+ struct vm86_struct v;
+
+ err = security_mmap_addr(0);
+ if (err) {
+ /*
+ * vm86 cannot virtualize the address space, so vm86 users
+ * need to manage the low 1MB themselves using mmap. Given
+ * that BIOS places important data in the first page, vm86
+ * is essentially useless if mmap_min_addr != 0. DOSEMU,
+ * for example, won't even bother trying to use vm86 if it
+ * can't map a page at virtual address 0.
+ *
+ * To reduce the available kernel attack surface, simply
+ * disallow vm86(old) for users who cannot mmap at va 0.
+ *
+ * The implementation of security_mmap_addr will allow
+ * suitably privileged users to map va 0 even if
+ * vm.mmap_min_addr is set above 0, and we want this
+ * behavior for vm86 as well, as it ensures that legacy
+ * tools like vbetool will not fail just because of
+ * vm.mmap_min_addr.
+ */
+ pr_info_once("Denied a call to vm86(old) from %s[%d] (uid: %d). Set the vm.mmap_min_addr sysctl to 0 and/or adjust LSM mmap_min_addr policy to enable vm86 if you are using a vm86-based DOS emulator.\n",
+ current->comm, task_pid_nr(current),
+ from_kuid_munged(&init_user_ns, current_uid()));
+ return -EPERM;
+ }
+
+ if (!vm86) {
+ if (!(vm86 = kzalloc(sizeof(*vm86), GFP_KERNEL)))
+ return -ENOMEM;
+ tsk->thread.vm86 = vm86;
+ }
+ if (vm86->saved_sp0)
+ return -EPERM;
+
+ if (copy_from_user(&v, user_vm86,
+ offsetof(struct vm86_struct, int_revectored)))
+ return -EFAULT;
+
+
+ /* VM86_SCREEN_BITMAP had numerous bugs and appears to have no users. */
+ if (v.flags & VM86_SCREEN_BITMAP) {
+ pr_info_once("vm86: '%s' uses VM86_SCREEN_BITMAP, which is no longer supported\n",
+ current->comm);
+ return -EINVAL;
+ }
+
+ memset(&vm86regs, 0, sizeof(vm86regs));
+
+ vm86regs.pt.bx = v.regs.ebx;
+ vm86regs.pt.cx = v.regs.ecx;
+ vm86regs.pt.dx = v.regs.edx;
+ vm86regs.pt.si = v.regs.esi;
+ vm86regs.pt.di = v.regs.edi;
+ vm86regs.pt.bp = v.regs.ebp;
+ vm86regs.pt.ax = v.regs.eax;
+ vm86regs.pt.ip = v.regs.eip;
+ vm86regs.pt.cs = v.regs.cs;
+ vm86regs.pt.flags = v.regs.eflags;
+ vm86regs.pt.sp = v.regs.esp;
+ vm86regs.pt.ss = v.regs.ss;
+ vm86regs.es = v.regs.es;
+ vm86regs.ds = v.regs.ds;
+ vm86regs.fs = v.regs.fs;
+ vm86regs.gs = v.regs.gs;
+
+ vm86->flags = v.flags;
+ vm86->cpu_type = v.cpu_type;
+
+ if (copy_from_user(&vm86->int_revectored,
+ &user_vm86->int_revectored,
+ sizeof(struct revectored_struct)))
+ return -EFAULT;
+ if (copy_from_user(&vm86->int21_revectored,
+ &user_vm86->int21_revectored,
+ sizeof(struct revectored_struct)))
+ return -EFAULT;
+ if (plus) {
+ if (copy_from_user(&vm86->vm86plus, &user_vm86->vm86plus,
+ sizeof(struct vm86plus_info_struct)))
+ return -EFAULT;
+ vm86->vm86plus.is_vm86pus = 1;
+ } else
+ memset(&vm86->vm86plus, 0,
+ sizeof(struct vm86plus_info_struct));
+
+ memcpy(&vm86->regs32, regs, sizeof(struct pt_regs));
+ vm86->user_vm86 = user_vm86;
/*
* The flags register is also special: we cannot trust that the user
* has set it up safely, so this makes sure interrupt etc flags are
* inherited from protected mode.
*/
- VEFLAGS = info->regs.pt.flags;
- info->regs.pt.flags &= SAFE_MASK;
- info->regs.pt.flags |= info->regs32->flags & ~SAFE_MASK;
- info->regs.pt.flags |= X86_VM_MASK;
+ VEFLAGS = vm86regs.pt.flags;
+ vm86regs.pt.flags &= SAFE_MASK;
+ vm86regs.pt.flags |= regs->flags & ~SAFE_MASK;
+ vm86regs.pt.flags |= X86_VM_MASK;
+
+ vm86regs.pt.orig_ax = regs->orig_ax;
- switch (info->cpu_type) {
+ switch (vm86->cpu_type) {
case CPU_286:
- tsk->thread.v86mask = 0;
+ vm86->veflags_mask = 0;
break;
case CPU_386:
- tsk->thread.v86mask = X86_EFLAGS_NT | X86_EFLAGS_IOPL;
+ vm86->veflags_mask = X86_EFLAGS_NT | X86_EFLAGS_IOPL;
break;
case CPU_486:
- tsk->thread.v86mask = X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
+ vm86->veflags_mask = X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
break;
default:
- tsk->thread.v86mask = X86_EFLAGS_ID | X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
+ vm86->veflags_mask = X86_EFLAGS_ID | X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
break;
}
/*
- * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL)
+ * Save old state
*/
- info->regs32->ax = VM86_SIGNAL;
- tsk->thread.saved_sp0 = tsk->thread.sp0;
- tsk->thread.saved_fs = info->regs32->fs;
- tsk->thread.saved_gs = get_user_gs(info->regs32);
-
- tss = &per_cpu(init_tss, get_cpu());
- tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0;
- if (cpu_has_sep)
+ vm86->saved_sp0 = tsk->thread.sp0;
+ savesegment(gs, vm86->regs32.gs);
+
+ /* make room for real-mode segments */
+ preempt_disable();
+ tsk->thread.sp0 += 16;
+
+ if (boot_cpu_has(X86_FEATURE_SEP)) {
tsk->thread.sysenter_cs = 0;
- load_sp0(tss, &tsk->thread);
- put_cpu();
-
- tsk->thread.screen_bitmap = info->screen_bitmap;
- if (info->flags & VM86_SCREEN_BITMAP)
- mark_screen_rdonly(tsk->mm);
-
- /*call audit_syscall_exit since we do not exit via the normal paths */
- if (unlikely(current->audit_context))
- audit_syscall_exit(AUDITSC_RESULT(0), 0);
-
- __asm__ __volatile__(
- "movl %0,%%esp\n\t"
- "movl %1,%%ebp\n\t"
-#ifdef CONFIG_X86_32_LAZY_GS
- "mov %2, %%gs\n\t"
-#endif
- "jmp resume_userspace"
- : /* no outputs */
- :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
- /* we never return here */
-}
+ refresh_sysenter_cs(&tsk->thread);
+ }
-static inline void return_to_32bit(struct kernel_vm86_regs *regs16, int retval)
-{
- struct pt_regs *regs32;
-
- regs32 = save_v86_state(regs16);
- regs32->ax = retval;
- __asm__ __volatile__("movl %0,%%esp\n\t"
- "movl %1,%%ebp\n\t"
- "jmp resume_userspace"
- : : "r" (regs32), "r" (current_thread_info()));
+ update_task_stack(tsk);
+ preempt_enable();
+
+ memcpy((struct kernel_vm86_regs *)regs, &vm86regs, sizeof(vm86regs));
+ return regs->ax;
}
static inline void set_IF(struct kernel_vm86_regs *regs)
{
VEFLAGS |= X86_EFLAGS_VIF;
- if (VEFLAGS & X86_EFLAGS_VIP)
- return_to_32bit(regs, VM86_STI);
}
static inline void clear_IF(struct kernel_vm86_regs *regs)
@@ -399,7 +376,7 @@ static inline void clear_AC(struct kernel_vm86_regs *regs)
static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs *regs)
{
- set_flags(VEFLAGS, flags, current->thread.v86mask);
+ set_flags(VEFLAGS, flags, current->thread.vm86->veflags_mask);
set_flags(regs->pt.flags, flags, SAFE_MASK);
if (flags & X86_EFLAGS_IF)
set_IF(regs);
@@ -409,7 +386,7 @@ static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs
static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs *regs)
{
- set_flags(VFLAGS, flags, current->thread.v86mask);
+ set_flags(VFLAGS, flags, current->thread.vm86->veflags_mask);
set_flags(regs->pt.flags, flags, SAFE_MASK);
if (flags & X86_EFLAGS_IF)
set_IF(regs);
@@ -424,15 +401,12 @@ static inline unsigned long get_vflags(struct kernel_vm86_regs *regs)
if (VEFLAGS & X86_EFLAGS_VIF)
flags |= X86_EFLAGS_IF;
flags |= X86_EFLAGS_IOPL;
- return flags | (VEFLAGS & current->thread.v86mask);
+ return flags | (VEFLAGS & current->thread.vm86->veflags_mask);
}
static inline int is_revectored(int nr, struct revectored_struct *bitmap)
{
- __asm__ __volatile__("btl %2,%1\n\tsbbl %0,%0"
- :"=r" (nr)
- :"m" (*bitmap), "r" (nr));
- return nr;
+ return test_bit(nr, bitmap->__map);
}
#define val_byte(val, n) (((__u8 *)&val)[n])
@@ -522,12 +496,13 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
{
unsigned long __user *intr_ptr;
unsigned long segoffs;
+ struct vm86 *vm86 = current->thread.vm86;
if (regs->pt.cs == BIOSSEG)
goto cannot_handle;
- if (is_revectored(i, &KVM86->int_revectored))
+ if (is_revectored(i, &vm86->int_revectored))
goto cannot_handle;
- if (i == 0x21 && is_revectored(AH(regs), &KVM86->int21_revectored))
+ if (i == 0x21 && is_revectored(AH(regs), &vm86->int21_revectored))
goto cannot_handle;
intr_ptr = (unsigned long __user *) (i << 2);
if (get_user(segoffs, intr_ptr))
@@ -546,22 +521,26 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
return;
cannot_handle:
- return_to_32bit(regs, VM86_INTx + (i << 8));
+ save_v86_state(regs, VM86_INTx + (i << 8));
}
int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno)
{
- if (VMPI.is_vm86pus) {
- if ((trapno == 3) || (trapno == 1))
- return_to_32bit(regs, VM86_TRAP + (trapno << 8));
+ struct vm86 *vm86 = current->thread.vm86;
+
+ if (vm86->vm86plus.is_vm86pus) {
+ if ((trapno == 3) || (trapno == 1)) {
+ save_v86_state(regs, VM86_TRAP + (trapno << 8));
+ return 0;
+ }
do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs));
return 0;
}
if (trapno != 1)
return 1; /* we let this handle by the calling routine */
- current->thread.trap_no = trapno;
+ current->thread.trap_nr = trapno;
current->thread.error_code = error_code;
- force_sig(SIGTRAP, current);
+ force_sig(SIGTRAP);
return 0;
}
@@ -572,16 +551,11 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
unsigned char __user *ssp;
unsigned short ip, sp, orig_flags;
int data32, pref_done;
+ struct vm86plus_info_struct *vmpi = &current->thread.vm86->vm86plus;
#define CHECK_IF_IN_TRAP \
- if (VMPI.vm86dbg_active && VMPI.vm86dbg_TFpendig) \
+ if (vmpi->vm86dbg_active && vmpi->vm86dbg_TFpendig) \
newflags |= X86_EFLAGS_TF
-#define VM86_FAULT_RETURN do { \
- if (VMPI.force_return_for_pic && (VEFLAGS & (X86_EFLAGS_IF | X86_EFLAGS_VIF))) \
- return_to_32bit(regs, VM86_PICRETURN); \
- if (orig_flags & X86_EFLAGS_TF) \
- handle_vm86_trap(regs, 0, 1); \
- return; } while (0)
orig_flags = *(unsigned short *)&regs->pt.flags;
@@ -620,7 +594,7 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
SP(regs) -= 2;
}
IP(regs) = ip;
- VM86_FAULT_RETURN;
+ goto vm86_fault_return;
/* popf */
case 0x9d:
@@ -640,16 +614,18 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
else
set_vflags_short(newflags, regs);
- VM86_FAULT_RETURN;
+ goto check_vip;
}
/* int xx */
case 0xcd: {
int intno = popb(csp, ip, simulate_sigsegv);
IP(regs) = ip;
- if (VMPI.vm86dbg_active) {
- if ((1 << (intno & 7)) & VMPI.vm86dbg_intxxtab[intno >> 3])
- return_to_32bit(regs, VM86_INTx + (intno << 8));
+ if (vmpi->vm86dbg_active) {
+ if ((1 << (intno & 7)) & vmpi->vm86dbg_intxxtab[intno >> 3]) {
+ save_v86_state(regs, VM86_INTx + (intno << 8));
+ return;
+ }
}
do_int(regs, intno, ssp, sp);
return;
@@ -680,14 +656,14 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
} else {
set_vflags_short(newflags, regs);
}
- VM86_FAULT_RETURN;
+ goto check_vip;
}
/* cli */
case 0xfa:
IP(regs) = ip;
clear_IF(regs);
- VM86_FAULT_RETURN;
+ goto vm86_fault_return;
/* sti */
/*
@@ -699,14 +675,30 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
case 0xfb:
IP(regs) = ip;
set_IF(regs);
- VM86_FAULT_RETURN;
+ goto check_vip;
default:
- return_to_32bit(regs, VM86_UNKNOWN);
+ save_v86_state(regs, VM86_UNKNOWN);
}
return;
+check_vip:
+ if ((VEFLAGS & (X86_EFLAGS_VIP | X86_EFLAGS_VIF)) ==
+ (X86_EFLAGS_VIP | X86_EFLAGS_VIF)) {
+ save_v86_state(regs, VM86_STI);
+ return;
+ }
+
+vm86_fault_return:
+ if (vmpi->force_return_for_pic && (VEFLAGS & (X86_EFLAGS_IF | X86_EFLAGS_VIF))) {
+ save_v86_state(regs, VM86_PICRETURN);
+ return;
+ }
+ if (orig_flags & X86_EFLAGS_TF)
+ handle_vm86_trap(regs, 0, X86_TRAP_DB);
+ return;
+
simulate_sigsegv:
/* FIXME: After a long discussion with Stas we finally
* agreed, that this is wrong. Here we should
@@ -718,7 +710,7 @@ simulate_sigsegv:
* should be a mixture of the two, but how do we
* get the information? [KD]
*/
- return_to_32bit(regs, VM86_UNKNOWN);
+ save_v86_state(regs, VM86_UNKNOWN);
}
/* ---------------- vm86 special IRQ passing stuff ----------------- */
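
The vm86 flag handling above relies on set_flags() to merge guest-controlled bits under a mask while preserving everything else. A small stand-alone sketch of that merge, with made-up flag values:

#include <stdio.h>

/* same macro as in vm86_32.c: take `new` bits only inside `mask` */
#define set_flags(X, new, mask) ((X) = ((X) & ~(mask)) | ((new) & (mask)))

int main(void)
{
	unsigned long veflags = 0x00000002;	/* current virtual flags */
	unsigned long guest   = 0x00040d03;	/* flags popped by the vm86 task */
	unsigned long mask    = 0x00040000 | 0x00004000 | 0x00003000; /* e.g. AC|NT|IOPL */

	set_flags(veflags, guest, mask);
	printf("merged veflags = %#lx\n", veflags);	/* 0x40002: AC taken, rest kept */
	return 0;
}
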
diff --git a/arch/x86/kernel/vmcore_info_32.c b/arch/x86/kernel/vmcore_info_32.c
new file mode 100644
index 000000000000..5995a749288a
--- /dev/null
+++ b/arch/x86/kernel/vmcore_info_32.c
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/vmcore_info.h>
+#include <linux/pgtable.h>
+
+#include <asm/setup.h>
+
+void arch_crash_save_vmcoreinfo(void)
+{
+#ifdef CONFIG_NUMA
+ VMCOREINFO_SYMBOL(node_data);
+ VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
+#endif
+#ifdef CONFIG_X86_PAE
+ VMCOREINFO_CONFIG(X86_PAE);
+#endif
+}
diff --git a/arch/x86/kernel/vmcore_info_64.c b/arch/x86/kernel/vmcore_info_64.c
new file mode 100644
index 000000000000..0dec7d868754
--- /dev/null
+++ b/arch/x86/kernel/vmcore_info_64.c
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/vmcore_info.h>
+#include <linux/pgtable.h>
+
+#include <asm/setup.h>
+
+void arch_crash_save_vmcoreinfo(void)
+{
+ u64 sme_mask = sme_me_mask;
+
+ VMCOREINFO_NUMBER(phys_base);
+ VMCOREINFO_SYMBOL(init_top_pgt);
+ vmcoreinfo_append_str("NUMBER(pgtable_l5_enabled)=%d\n",
+ pgtable_l5_enabled());
+
+#ifdef CONFIG_NUMA
+ VMCOREINFO_SYMBOL(node_data);
+ VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
+#endif
+ vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset());
+ VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
+ VMCOREINFO_NUMBER(sme_mask);
+}
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
deleted file mode 100644
index 95a7289e4b0c..000000000000
--- a/arch/x86/kernel/vmi_32.c
+++ /dev/null
@@ -1,913 +0,0 @@
-/*
- * VMI specific paravirt-ops implementation
- *
- * Copyright (C) 2005, VMware, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to zach@vmware.com
- *
- */
-
-#include <linux/module.h>
-#include <linux/cpu.h>
-#include <linux/bootmem.h>
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/sched.h>
-#include <asm/vmi.h>
-#include <asm/io.h>
-#include <asm/fixmap.h>
-#include <asm/apicdef.h>
-#include <asm/apic.h>
-#include <asm/processor.h>
-#include <asm/timer.h>
-#include <asm/vmi_time.h>
-#include <asm/kmap_types.h>
-#include <asm/setup.h>
-
-/* Convenient for calling VMI functions indirectly in the ROM */
-typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void);
-typedef u64 __attribute__((regparm(2))) (VROMLONGFUNC)(int);
-
-#define call_vrom_func(rom,func) \
- (((VROMFUNC *)(rom->func))())
-
-#define call_vrom_long_func(rom,func,arg) \
- (((VROMLONGFUNC *)(rom->func)) (arg))
-
-static struct vrom_header *vmi_rom;
-static int disable_pge;
-static int disable_pse;
-static int disable_sep;
-static int disable_tsc;
-static int disable_mtrr;
-static int disable_noidle;
-static int disable_vmi_timer;
-
-/* Cached VMI operations */
-static struct {
- void (*cpuid)(void /* non-c */);
- void (*_set_ldt)(u32 selector);
- void (*set_tr)(u32 selector);
- void (*write_idt_entry)(struct desc_struct *, int, u32, u32);
- void (*write_gdt_entry)(struct desc_struct *, int, u32, u32);
- void (*write_ldt_entry)(struct desc_struct *, int, u32, u32);
- void (*set_kernel_stack)(u32 selector, u32 sp0);
- void (*allocate_page)(u32, u32, u32, u32, u32);
- void (*release_page)(u32, u32);
- void (*set_pte)(pte_t, pte_t *, unsigned);
- void (*update_pte)(pte_t *, unsigned);
- void (*set_linear_mapping)(int, void *, u32, u32);
- void (*_flush_tlb)(int);
- void (*set_initial_ap_state)(int, int);
- void (*halt)(void);
- void (*set_lazy_mode)(int mode);
-} vmi_ops;
-
-/* Cached VMI operations */
-struct vmi_timer_ops vmi_timer_ops;
-
-/*
- * VMI patching routines.
- */
-#define MNEM_CALL 0xe8
-#define MNEM_JMP 0xe9
-#define MNEM_RET 0xc3
-
-#define IRQ_PATCH_INT_MASK 0
-#define IRQ_PATCH_DISABLE 5
-
-static inline void patch_offset(void *insnbuf,
- unsigned long ip, unsigned long dest)
-{
- *(unsigned long *)(insnbuf+1) = dest-ip-5;
-}
-
-static unsigned patch_internal(int call, unsigned len, void *insnbuf,
- unsigned long ip)
-{
- u64 reloc;
- struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc;
- reloc = call_vrom_long_func(vmi_rom, get_reloc, call);
- switch(rel->type) {
- case VMI_RELOCATION_CALL_REL:
- BUG_ON(len < 5);
- *(char *)insnbuf = MNEM_CALL;
- patch_offset(insnbuf, ip, (unsigned long)rel->eip);
- return 5;
-
- case VMI_RELOCATION_JUMP_REL:
- BUG_ON(len < 5);
- *(char *)insnbuf = MNEM_JMP;
- patch_offset(insnbuf, ip, (unsigned long)rel->eip);
- return 5;
-
- case VMI_RELOCATION_NOP:
- /* obliterate the whole thing */
- return 0;
-
- case VMI_RELOCATION_NONE:
- /* leave native code in place */
- break;
-
- default:
- BUG();
- }
- return len;
-}
-
-/*
- * Apply patch if appropriate, return length of new instruction
- * sequence. The callee does nop padding for us.
- */
-static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
- unsigned long ip, unsigned len)
-{
- switch (type) {
- case PARAVIRT_PATCH(pv_irq_ops.irq_disable):
- return patch_internal(VMI_CALL_DisableInterrupts, len,
- insns, ip);
- case PARAVIRT_PATCH(pv_irq_ops.irq_enable):
- return patch_internal(VMI_CALL_EnableInterrupts, len,
- insns, ip);
- case PARAVIRT_PATCH(pv_irq_ops.restore_fl):
- return patch_internal(VMI_CALL_SetInterruptMask, len,
- insns, ip);
- case PARAVIRT_PATCH(pv_irq_ops.save_fl):
- return patch_internal(VMI_CALL_GetInterruptMask, len,
- insns, ip);
- case PARAVIRT_PATCH(pv_cpu_ops.iret):
- return patch_internal(VMI_CALL_IRET, len, insns, ip);
- case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit):
- return patch_internal(VMI_CALL_SYSEXIT, len, insns, ip);
- default:
- break;
- }
- return len;
-}
-
-/* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */
-static void vmi_cpuid(unsigned int *ax, unsigned int *bx,
- unsigned int *cx, unsigned int *dx)
-{
- int override = 0;
- if (*ax == 1)
- override = 1;
- asm volatile ("call *%6"
- : "=a" (*ax),
- "=b" (*bx),
- "=c" (*cx),
- "=d" (*dx)
- : "0" (*ax), "2" (*cx), "r" (vmi_ops.cpuid));
- if (override) {
- if (disable_pse)
- *dx &= ~X86_FEATURE_PSE;
- if (disable_pge)
- *dx &= ~X86_FEATURE_PGE;
- if (disable_sep)
- *dx &= ~X86_FEATURE_SEP;
- if (disable_tsc)
- *dx &= ~X86_FEATURE_TSC;
- if (disable_mtrr)
- *dx &= ~X86_FEATURE_MTRR;
- }
-}
-
-static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new)
-{
- if (gdt[nr].a != new->a || gdt[nr].b != new->b)
- write_gdt_entry(gdt, nr, new, 0);
-}
-
-static void vmi_load_tls(struct thread_struct *t, unsigned int cpu)
-{
- struct desc_struct *gdt = get_cpu_gdt_table(cpu);
- vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 0, &t->tls_array[0]);
- vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 1, &t->tls_array[1]);
- vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 2, &t->tls_array[2]);
-}
-
-static void vmi_set_ldt(const void *addr, unsigned entries)
-{
- unsigned cpu = smp_processor_id();
- struct desc_struct desc;
-
- pack_descriptor(&desc, (unsigned long)addr,
- entries * sizeof(struct desc_struct) - 1,
- DESC_LDT, 0);
- write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, &desc, DESC_LDT);
- vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0);
-}
-
-static void vmi_set_tr(void)
-{
- vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct));
-}
-
-static void vmi_write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
-{
- u32 *idt_entry = (u32 *)g;
- vmi_ops.write_idt_entry(dt, entry, idt_entry[0], idt_entry[1]);
-}
-
-static void vmi_write_gdt_entry(struct desc_struct *dt, int entry,
- const void *desc, int type)
-{
- u32 *gdt_entry = (u32 *)desc;
- vmi_ops.write_gdt_entry(dt, entry, gdt_entry[0], gdt_entry[1]);
-}
-
-static void vmi_write_ldt_entry(struct desc_struct *dt, int entry,
- const void *desc)
-{
- u32 *ldt_entry = (u32 *)desc;
- vmi_ops.write_ldt_entry(dt, entry, ldt_entry[0], ldt_entry[1]);
-}
-
-static void vmi_load_sp0(struct tss_struct *tss,
- struct thread_struct *thread)
-{
- tss->x86_tss.sp0 = thread->sp0;
-
- /* This can only happen when SEP is enabled, no need to test "SEP"arately */
- if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
- tss->x86_tss.ss1 = thread->sysenter_cs;
- wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
- }
- vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.sp0);
-}
-
-static void vmi_flush_tlb_user(void)
-{
- vmi_ops._flush_tlb(VMI_FLUSH_TLB);
-}
-
-static void vmi_flush_tlb_kernel(void)
-{
- vmi_ops._flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL);
-}
-
-/* Stub to do nothing at all; used for delays and unimplemented calls */
-static void vmi_nop(void)
-{
-}
-
-#ifdef CONFIG_HIGHPTE
-static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
-{
- void *va = kmap_atomic(page, type);
-
- /*
- * Internally, the VMI ROM must map virtual addresses to physical
- * addresses for processing MMU updates. By the time MMU updates
- * are issued, this information is typically already lost.
- * Fortunately, the VMI provides a cache of mapping slots for active
- * page tables.
- *
- * We use slot zero for the linear mapping of physical memory, and
- * in HIGHPTE kernels, slot 1 and 2 for KM_PTE0 and KM_PTE1.
- *
- * args: SLOT VA COUNT PFN
- */
- BUG_ON(type != KM_PTE0 && type != KM_PTE1);
- vmi_ops.set_linear_mapping((type - KM_PTE0)+1, va, 1, page_to_pfn(page));
-
- return va;
-}
-#endif
-
-static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
-{
- vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
-}
-
-static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn)
-{
- /*
- * This call comes in very early, before mem_map is setup.
- * It is called only for swapper_pg_dir, which already has
- * data on it.
- */
- vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
-}
-
-static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count)
-{
- vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
-}
-
-static void vmi_release_pte(unsigned long pfn)
-{
- vmi_ops.release_page(pfn, VMI_PAGE_L1);
-}
-
-static void vmi_release_pmd(unsigned long pfn)
-{
- vmi_ops.release_page(pfn, VMI_PAGE_L2);
-}
-
-/*
- * We use the pgd_free hook for releasing the pgd page:
- */
-static void vmi_pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
- unsigned long pfn = __pa(pgd) >> PAGE_SHIFT;
-
- vmi_ops.release_page(pfn, VMI_PAGE_L2);
-}
-
-/*
- * Helper macros for MMU update flags. We can defer updates until a flush
- * or page invalidation only if the update is to the current address space
- * (otherwise, there is no flush). We must check against init_mm, since
- * this could be a kernel update, which usually passes init_mm, although
- * sometimes this check can be skipped if we know the particular function
- * is only called on user mode PTEs. We could change the kernel to pass
- * current->active_mm here, but in particular, I was unsure if changing
- * mm/highmem.c to do this would still be correct on other architectures.
- */
-#define is_current_as(mm, mustbeuser) ((mm) == current->active_mm || \
- (!mustbeuser && (mm) == &init_mm))
-#define vmi_flags_addr(mm, addr, level, user) \
- ((level) | (is_current_as(mm, user) ? \
- (VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
-#define vmi_flags_addr_defer(mm, addr, level, user) \
- ((level) | (is_current_as(mm, user) ? \
- (VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
-
-static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
- vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
-}
-
-static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
- vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0));
-}
-
-static void vmi_set_pte(pte_t *ptep, pte_t pte)
-{
- /* XXX because of set_pmd_pte, this can be called on PT or PD layers */
- vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT);
-}
-
-static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
-{
- vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
-}
-
-static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval)
-{
-#ifdef CONFIG_X86_PAE
- const pte_t pte = { .pte = pmdval.pmd };
-#else
- const pte_t pte = { pmdval.pud.pgd.pgd };
-#endif
- vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD);
-}
-
-#ifdef CONFIG_X86_PAE
-
-static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
-{
- /*
- * XXX This is called from set_pmd_pte, but at both PT
- * and PD layers so the VMI_PAGE_PT flag is wrong. But
- * it is only called for large page mapping changes,
- * the Xen backend doesn't support large pages, and the
- * ESX backend doesn't depend on the flag.
- */
- set_64bit((unsigned long long *)ptep,pte_val(pteval));
- vmi_ops.update_pte(ptep, VMI_PAGE_PT);
-}
-
-static void vmi_set_pud(pud_t *pudp, pud_t pudval)
-{
- /* Um, eww */
- const pte_t pte = { .pte = pudval.pgd.pgd };
- vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP);
-}
-
-static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
- const pte_t pte = { .pte = 0 };
- vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
-}
-
-static void vmi_pmd_clear(pmd_t *pmd)
-{
- const pte_t pte = { .pte = 0 };
- vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD);
-}
-#endif
-
-#ifdef CONFIG_SMP
-static void __devinit
-vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
- unsigned long start_esp)
-{
- struct vmi_ap_state ap;
-
- /* Default everything to zero. This is fine for most GPRs. */
- memset(&ap, 0, sizeof(struct vmi_ap_state));
-
- ap.gdtr_limit = GDT_SIZE - 1;
- ap.gdtr_base = (unsigned long) get_cpu_gdt_table(phys_apicid);
-
- ap.idtr_limit = IDT_ENTRIES * 8 - 1;
- ap.idtr_base = (unsigned long) idt_table;
-
- ap.ldtr = 0;
-
- ap.cs = __KERNEL_CS;
- ap.eip = (unsigned long) start_eip;
- ap.ss = __KERNEL_DS;
- ap.esp = (unsigned long) start_esp;
-
- ap.ds = __USER_DS;
- ap.es = __USER_DS;
- ap.fs = __KERNEL_PERCPU;
- ap.gs = __KERNEL_STACK_CANARY;
-
- ap.eflags = 0;
-
-#ifdef CONFIG_X86_PAE
- /* efer should match BSP efer. */
- if (cpu_has_nx) {
- unsigned l, h;
- rdmsr(MSR_EFER, l, h);
- ap.efer = (unsigned long long) h << 32 | l;
- }
-#endif
-
- ap.cr3 = __pa(swapper_pg_dir);
- /* Protected mode, paging, AM, WP, NE, MP. */
- ap.cr0 = 0x80050023;
- ap.cr4 = mmu_cr4_features;
- vmi_ops.set_initial_ap_state((u32)&ap, phys_apicid);
-}
-#endif
-
-static void vmi_start_context_switch(struct task_struct *prev)
-{
- paravirt_start_context_switch(prev);
- vmi_ops.set_lazy_mode(2);
-}
-
-static void vmi_end_context_switch(struct task_struct *next)
-{
- vmi_ops.set_lazy_mode(0);
- paravirt_end_context_switch(next);
-}
-
-static void vmi_enter_lazy_mmu(void)
-{
- paravirt_enter_lazy_mmu();
- vmi_ops.set_lazy_mode(1);
-}
-
-static void vmi_leave_lazy_mmu(void)
-{
- vmi_ops.set_lazy_mode(0);
- paravirt_leave_lazy_mmu();
-}
-
-static inline int __init check_vmi_rom(struct vrom_header *rom)
-{
- struct pci_header *pci;
- struct pnp_header *pnp;
- const char *manufacturer = "UNKNOWN";
- const char *product = "UNKNOWN";
- const char *license = "unspecified";
-
- if (rom->rom_signature != 0xaa55)
- return 0;
- if (rom->vrom_signature != VMI_SIGNATURE)
- return 0;
- if (rom->api_version_maj != VMI_API_REV_MAJOR ||
- rom->api_version_min+1 < VMI_API_REV_MINOR+1) {
- printk(KERN_WARNING "VMI: Found mismatched rom version %d.%d\n",
- rom->api_version_maj,
- rom->api_version_min);
- return 0;
- }
-
- /*
- * Relying on the VMI_SIGNATURE field is not 100% safe, so check
- * the PCI header and device type to make sure this is really a
- * VMI device.
- */
- if (!rom->pci_header_offs) {
- printk(KERN_WARNING "VMI: ROM does not contain PCI header.\n");
- return 0;
- }
-
- pci = (struct pci_header *)((char *)rom+rom->pci_header_offs);
- if (pci->vendorID != PCI_VENDOR_ID_VMWARE ||
- pci->deviceID != PCI_DEVICE_ID_VMWARE_VMI) {
- /* Allow it to run... anyways, but warn */
- printk(KERN_WARNING "VMI: ROM from unknown manufacturer\n");
- }
-
- if (rom->pnp_header_offs) {
- pnp = (struct pnp_header *)((char *)rom+rom->pnp_header_offs);
- if (pnp->manufacturer_offset)
- manufacturer = (const char *)rom+pnp->manufacturer_offset;
- if (pnp->product_offset)
- product = (const char *)rom+pnp->product_offset;
- }
-
- if (rom->license_offs)
- license = (char *)rom+rom->license_offs;
-
- printk(KERN_INFO "VMI: Found %s %s, API version %d.%d, ROM version %d.%d\n",
- manufacturer, product,
- rom->api_version_maj, rom->api_version_min,
- pci->rom_version_maj, pci->rom_version_min);
-
- /* Don't allow BSD/MIT here for now because we don't want to end up
- with any binary only shim layers */
- if (strcmp(license, "GPL") && strcmp(license, "GPL v2")) {
- printk(KERN_WARNING "VMI: Non GPL license `%s' found for ROM. Not used.\n",
- license);
- return 0;
- }
-
- return 1;
-}
-
-/*
- * Probe for the VMI option ROM
- */
-static inline int __init probe_vmi_rom(void)
-{
- unsigned long base;
-
- /* VMI ROM is in option ROM area, check signature */
- for (base = 0xC0000; base < 0xE0000; base += 2048) {
- struct vrom_header *romstart;
- romstart = (struct vrom_header *)isa_bus_to_virt(base);
- if (check_vmi_rom(romstart)) {
- vmi_rom = romstart;
- return 1;
- }
- }
- return 0;
-}
-
-/*
- * VMI setup common to all processors
- */
-void vmi_bringup(void)
-{
- /* We must establish the lowmem mapping for MMU ops to work */
- if (vmi_ops.set_linear_mapping)
- vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, MAXMEM_PFN, 0);
-}
-
-/*
- * Return a pointer to a VMI function or NULL if unimplemented
- */
-static void *vmi_get_function(int vmicall)
-{
- u64 reloc;
- const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
- reloc = call_vrom_long_func(vmi_rom, get_reloc, vmicall);
- BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL);
- if (rel->type == VMI_RELOCATION_CALL_REL)
- return (void *)rel->eip;
- else
- return NULL;
-}
-
-/*
- * Helper macro for making the VMI paravirt-ops fill code readable.
- * For unimplemented operations, fall back to default, unless nop
- * is returned by the ROM.
- */
-#define para_fill(opname, vmicall) \
-do { \
- reloc = call_vrom_long_func(vmi_rom, get_reloc, \
- VMI_CALL_##vmicall); \
- if (rel->type == VMI_RELOCATION_CALL_REL) \
- opname = (void *)rel->eip; \
- else if (rel->type == VMI_RELOCATION_NOP) \
- opname = (void *)vmi_nop; \
- else if (rel->type != VMI_RELOCATION_NONE) \
- printk(KERN_WARNING "VMI: Unknown relocation " \
- "type %d for " #vmicall"\n",\
- rel->type); \
-} while (0)
-
-/*
- * Helper macro for making the VMI paravirt-ops fill code readable.
- * For cached operations which do not match the VMI ROM ABI and must
- * go through a translation stub. Ignore NOPs, since it is not clear
- * a NOP VMI function corresponds to a NOP paravirt-op when the
- * functions are not in 1-1 correspondence.
- */
-#define para_wrap(opname, wrapper, cache, vmicall) \
-do { \
- reloc = call_vrom_long_func(vmi_rom, get_reloc, \
- VMI_CALL_##vmicall); \
- BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL); \
- if (rel->type == VMI_RELOCATION_CALL_REL) { \
- opname = wrapper; \
- vmi_ops.cache = (void *)rel->eip; \
- } \
-} while (0)
-
-/*
- * Activate the VMI interface and switch into paravirtualized mode
- */
-static inline int __init activate_vmi(void)
-{
- short kernel_cs;
- u64 reloc;
- const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
-
- if (call_vrom_func(vmi_rom, vmi_init) != 0) {
- printk(KERN_ERR "VMI ROM failed to initialize!");
- return 0;
- }
- savesegment(cs, kernel_cs);
-
- pv_info.paravirt_enabled = 1;
- pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
- pv_info.name = "vmi";
-
- pv_init_ops.patch = vmi_patch;
-
- /*
- * Many of these operations are ABI compatible with VMI.
- * This means we can fill in the paravirt-ops with direct
- * pointers into the VMI ROM. If the calling convention for
- * these operations changes, this code needs to be updated.
- *
- * Exceptions
- * CPUID paravirt-op uses pointers, not the native ISA
- * halt has no VMI equivalent; all VMI halts are "safe"
- * no MSR support yet - just trap and emulate. VMI uses the
- * same ABI as the native ISA, but Linux wants exceptions
- * from bogus MSR read / write handled
- * rdpmc is not yet used in Linux
- */
-
- /* CPUID is special, so very special it gets wrapped like a present */
- para_wrap(pv_cpu_ops.cpuid, vmi_cpuid, cpuid, CPUID);
-
- para_fill(pv_cpu_ops.clts, CLTS);
- para_fill(pv_cpu_ops.get_debugreg, GetDR);
- para_fill(pv_cpu_ops.set_debugreg, SetDR);
- para_fill(pv_cpu_ops.read_cr0, GetCR0);
- para_fill(pv_mmu_ops.read_cr2, GetCR2);
- para_fill(pv_mmu_ops.read_cr3, GetCR3);
- para_fill(pv_cpu_ops.read_cr4, GetCR4);
- para_fill(pv_cpu_ops.write_cr0, SetCR0);
- para_fill(pv_mmu_ops.write_cr2, SetCR2);
- para_fill(pv_mmu_ops.write_cr3, SetCR3);
- para_fill(pv_cpu_ops.write_cr4, SetCR4);
-
- para_fill(pv_irq_ops.save_fl.func, GetInterruptMask);
- para_fill(pv_irq_ops.restore_fl.func, SetInterruptMask);
- para_fill(pv_irq_ops.irq_disable.func, DisableInterrupts);
- para_fill(pv_irq_ops.irq_enable.func, EnableInterrupts);
-
- para_fill(pv_cpu_ops.wbinvd, WBINVD);
- para_fill(pv_cpu_ops.read_tsc, RDTSC);
-
- /* The following we emulate with trap and emulate for now */
- /* paravirt_ops.read_msr = vmi_rdmsr */
- /* paravirt_ops.write_msr = vmi_wrmsr */
- /* paravirt_ops.rdpmc = vmi_rdpmc */
-
- /* TR interface doesn't pass TR value, wrap */
- para_wrap(pv_cpu_ops.load_tr_desc, vmi_set_tr, set_tr, SetTR);
-
- /* LDT is special, too */
- para_wrap(pv_cpu_ops.set_ldt, vmi_set_ldt, _set_ldt, SetLDT);
-
- para_fill(pv_cpu_ops.load_gdt, SetGDT);
- para_fill(pv_cpu_ops.load_idt, SetIDT);
- para_fill(pv_cpu_ops.store_gdt, GetGDT);
- para_fill(pv_cpu_ops.store_idt, GetIDT);
- para_fill(pv_cpu_ops.store_tr, GetTR);
- pv_cpu_ops.load_tls = vmi_load_tls;
- para_wrap(pv_cpu_ops.write_ldt_entry, vmi_write_ldt_entry,
- write_ldt_entry, WriteLDTEntry);
- para_wrap(pv_cpu_ops.write_gdt_entry, vmi_write_gdt_entry,
- write_gdt_entry, WriteGDTEntry);
- para_wrap(pv_cpu_ops.write_idt_entry, vmi_write_idt_entry,
- write_idt_entry, WriteIDTEntry);
- para_wrap(pv_cpu_ops.load_sp0, vmi_load_sp0, set_kernel_stack, UpdateKernelStack);
- para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask);
- para_fill(pv_cpu_ops.io_delay, IODelay);
-
- para_wrap(pv_cpu_ops.start_context_switch, vmi_start_context_switch,
- set_lazy_mode, SetLazyMode);
- para_wrap(pv_cpu_ops.end_context_switch, vmi_end_context_switch,
- set_lazy_mode, SetLazyMode);
-
- para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu,
- set_lazy_mode, SetLazyMode);
- para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy_mmu,
- set_lazy_mode, SetLazyMode);
-
- /* user and kernel flush are just handled with different flags to FlushTLB */
- para_wrap(pv_mmu_ops.flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB);
- para_wrap(pv_mmu_ops.flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB);
- para_fill(pv_mmu_ops.flush_tlb_single, InvalPage);
-
- /*
- * Until a standard flag format can be agreed on, we need to
- * implement these as wrappers in Linux. Get the VMI ROM
- * function pointers for the two backend calls.
- */
-#ifdef CONFIG_X86_PAE
- vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxELong);
- vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxELong);
-#else
- vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxE);
- vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxE);
-#endif
-
- if (vmi_ops.set_pte) {
- pv_mmu_ops.set_pte = vmi_set_pte;
- pv_mmu_ops.set_pte_at = vmi_set_pte_at;
- pv_mmu_ops.set_pmd = vmi_set_pmd;
-#ifdef CONFIG_X86_PAE
- pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic;
- pv_mmu_ops.set_pud = vmi_set_pud;
- pv_mmu_ops.pte_clear = vmi_pte_clear;
- pv_mmu_ops.pmd_clear = vmi_pmd_clear;
-#endif
- }
-
- if (vmi_ops.update_pte) {
- pv_mmu_ops.pte_update = vmi_update_pte;
- pv_mmu_ops.pte_update_defer = vmi_update_pte_defer;
- }
-
- vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage);
- if (vmi_ops.allocate_page) {
- pv_mmu_ops.alloc_pte = vmi_allocate_pte;
- pv_mmu_ops.alloc_pmd = vmi_allocate_pmd;
- pv_mmu_ops.alloc_pmd_clone = vmi_allocate_pmd_clone;
- }
-
- vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage);
- if (vmi_ops.release_page) {
- pv_mmu_ops.release_pte = vmi_release_pte;
- pv_mmu_ops.release_pmd = vmi_release_pmd;
- pv_mmu_ops.pgd_free = vmi_pgd_free;
- }
-
- /* Set linear is needed in all cases */
- vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
-#ifdef CONFIG_HIGHPTE
- if (vmi_ops.set_linear_mapping)
- pv_mmu_ops.kmap_atomic_pte = vmi_kmap_atomic_pte;
-#endif
-
- /*
- * These MUST always be patched. Don't support indirect jumps
- * through these operations, as the VMI interface may use either
- * a jump or a call to get to these operations, depending on
- * the backend. They are performance critical anyway, so requiring
- * a patch is not a big problem.
- */
- pv_cpu_ops.irq_enable_sysexit = (void *)0xfeedbab0;
- pv_cpu_ops.iret = (void *)0xbadbab0;
-
-#ifdef CONFIG_SMP
- para_wrap(pv_apic_ops.startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState);
-#endif
-
-#ifdef CONFIG_X86_LOCAL_APIC
- para_fill(apic->read, APICRead);
- para_fill(apic->write, APICWrite);
-#endif
-
- /*
- * Check for VMI timer functionality by probing for a cycle frequency method
- */
- reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_GetCycleFrequency);
- if (!disable_vmi_timer && rel->type != VMI_RELOCATION_NONE) {
- vmi_timer_ops.get_cycle_frequency = (void *)rel->eip;
- vmi_timer_ops.get_cycle_counter =
- vmi_get_function(VMI_CALL_GetCycleCounter);
- vmi_timer_ops.get_wallclock =
- vmi_get_function(VMI_CALL_GetWallclockTime);
- vmi_timer_ops.wallclock_updated =
- vmi_get_function(VMI_CALL_WallclockUpdated);
- vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm);
- vmi_timer_ops.cancel_alarm =
- vmi_get_function(VMI_CALL_CancelAlarm);
- pv_time_ops.time_init = vmi_time_init;
- pv_time_ops.get_wallclock = vmi_get_wallclock;
- pv_time_ops.set_wallclock = vmi_set_wallclock;
-#ifdef CONFIG_X86_LOCAL_APIC
- pv_apic_ops.setup_boot_clock = vmi_time_bsp_init;
- pv_apic_ops.setup_secondary_clock = vmi_time_ap_init;
-#endif
- pv_time_ops.sched_clock = vmi_sched_clock;
- pv_time_ops.get_tsc_khz = vmi_tsc_khz;
-
- /* We have true wallclock functions; disable CMOS clock sync */
- no_sync_cmos_clock = 1;
- } else {
- disable_noidle = 1;
- disable_vmi_timer = 1;
- }
-
- para_fill(pv_irq_ops.safe_halt, Halt);
-
- /*
- * Alternative instruction rewriting doesn't happen soon enough
- * to convert VMI_IRET to a call instead of a jump; so we have
- * to do this before IRQs get reenabled. Fortunately, it is
- * idempotent.
- */
- apply_paravirt(__parainstructions, __parainstructions_end);
-
- vmi_bringup();
-
- return 1;
-}
-
-#undef para_fill
-
-void __init vmi_init(void)
-{
- if (!vmi_rom)
- probe_vmi_rom();
- else
- check_vmi_rom(vmi_rom);
-
- /* In case probing for or validating the ROM failed, bail */
- if (!vmi_rom)
- return;
-
- reserve_top_address(-vmi_rom->virtual_top);
-
-#ifdef CONFIG_X86_IO_APIC
- /* This is virtual hardware; timer routing is wired correctly */
- no_timer_check = 1;
-#endif
-}
-
-void __init vmi_activate(void)
-{
- unsigned long flags;
-
- if (!vmi_rom)
- return;
-
- local_irq_save(flags);
- activate_vmi();
- local_irq_restore(flags & X86_EFLAGS_IF);
-}
-
-static int __init parse_vmi(char *arg)
-{
- if (!arg)
- return -EINVAL;
-
- if (!strcmp(arg, "disable_pge")) {
- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
- disable_pge = 1;
- } else if (!strcmp(arg, "disable_pse")) {
- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PSE);
- disable_pse = 1;
- } else if (!strcmp(arg, "disable_sep")) {
- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP);
- disable_sep = 1;
- } else if (!strcmp(arg, "disable_tsc")) {
- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC);
- disable_tsc = 1;
- } else if (!strcmp(arg, "disable_mtrr")) {
- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_MTRR);
- disable_mtrr = 1;
- } else if (!strcmp(arg, "disable_timer")) {
- disable_vmi_timer = 1;
- disable_noidle = 1;
- } else if (!strcmp(arg, "disable_noidle"))
- disable_noidle = 1;
- return 0;
-}
-
-early_param("vmi", parse_vmi);
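
For readers unfamiliar with the para_fill()/para_wrap() pattern being deleted above: the ROM returns one relocation record per VMI call, and the kernel fills each paravirt-op slot with either the ROM's entry point, a nop stub, or the native default. Below is a minimal userspace sketch of that dispatch, assuming a made-up relocation layout, call numbers and fake_get_reloc() helper; it is not the real VMI ROM ABI.

/*
 * Illustrative userspace sketch (not kernel code) of the para_fill-style
 * decision: ROM-provided pointer, nop stub, or native fallback.
 */
#include <stdio.h>

enum reloc_type { RELOC_NONE, RELOC_CALL_REL, RELOC_JUMP_REL, RELOC_NOP };

struct reloc_info {
	void (*eip)(void);	/* target the ROM wants the kernel to call */
	enum reloc_type type;	/* how the call site should be handled */
};

static void rom_backend_op(void) { puts("ROM-provided implementation"); }
static void native_op(void)      { puts("native fallback"); }
static void nop_op(void)         { /* deliberately empty, like vmi_nop() */ }

/* Hypothetical stand-in for call_vrom_long_func(rom, get_reloc, call). */
static struct reloc_info fake_get_reloc(int call)
{
	if (call == 1)
		return (struct reloc_info){ rom_backend_op, RELOC_CALL_REL };
	if (call == 2)
		return (struct reloc_info){ NULL, RELOC_NOP };
	return (struct reloc_info){ NULL, RELOC_NONE };
}

/* Mirrors para_fill(): take the ROM pointer, a nop stub, or keep native. */
static void (*fill_op(int call, void (*native)(void)))(void)
{
	struct reloc_info rel = fake_get_reloc(call);

	switch (rel.type) {
	case RELOC_CALL_REL:
		return rel.eip;
	case RELOC_NOP:
		return nop_op;
	default:
		return native;	/* RELOC_NONE: leave the native code in place */
	}
}

int main(void)
{
	void (*ops[3])(void);

	for (int call = 1; call <= 3; call++)
		ops[call - 1] = fill_op(call, native_op);

	for (int i = 0; i < 3; i++)
		ops[i]();	/* prints ROM, (nothing), native */
	return 0;
}
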
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
deleted file mode 100644
index 2b3eb82efeeb..000000000000
--- a/arch/x86/kernel/vmiclock_32.c
+++ /dev/null
@@ -1,321 +0,0 @@
-/*
- * VMI paravirtual timer support routines.
- *
- * Copyright (C) 2007, VMware, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- */
-
-#include <linux/smp.h>
-#include <linux/interrupt.h>
-#include <linux/cpumask.h>
-#include <linux/clocksource.h>
-#include <linux/clockchips.h>
-
-#include <asm/vmi.h>
-#include <asm/vmi_time.h>
-#include <asm/apicdef.h>
-#include <asm/apic.h>
-#include <asm/timer.h>
-#include <asm/i8253.h>
-#include <asm/irq_vectors.h>
-
-#define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
-#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
-
-static DEFINE_PER_CPU(struct clock_event_device, local_events);
-
-static inline u32 vmi_counter(u32 flags)
-{
- /* Given VMI_ONESHOT or VMI_PERIODIC, return the corresponding
- * cycle counter. */
- return flags & VMI_ALARM_COUNTER_MASK;
-}
-
-/* paravirt_ops.get_wallclock = vmi_get_wallclock */
-unsigned long vmi_get_wallclock(void)
-{
- unsigned long long wallclock;
- wallclock = vmi_timer_ops.get_wallclock(); // nsec
- (void)do_div(wallclock, 1000000000); // sec
-
- return wallclock;
-}
-
-/* paravirt_ops.set_wallclock = vmi_set_wallclock */
-int vmi_set_wallclock(unsigned long now)
-{
- return 0;
-}
-
-/* paravirt_ops.sched_clock = vmi_sched_clock */
-unsigned long long vmi_sched_clock(void)
-{
- return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));
-}
-
-/* paravirt_ops.get_tsc_khz = vmi_tsc_khz */
-unsigned long vmi_tsc_khz(void)
-{
- unsigned long long khz;
- khz = vmi_timer_ops.get_cycle_frequency();
- (void)do_div(khz, 1000);
- return khz;
-}
-
-static inline unsigned int vmi_get_timer_vector(void)
-{
-#ifdef CONFIG_X86_IO_APIC
- return FIRST_DEVICE_VECTOR;
-#else
- return FIRST_EXTERNAL_VECTOR;
-#endif
-}
-
-/** vmi clockchip */
-#ifdef CONFIG_X86_LOCAL_APIC
-static unsigned int startup_timer_irq(unsigned int irq)
-{
- unsigned long val = apic_read(APIC_LVTT);
- apic_write(APIC_LVTT, vmi_get_timer_vector());
-
- return (val & APIC_SEND_PENDING);
-}
-
-static void mask_timer_irq(unsigned int irq)
-{
- unsigned long val = apic_read(APIC_LVTT);
- apic_write(APIC_LVTT, val | APIC_LVT_MASKED);
-}
-
-static void unmask_timer_irq(unsigned int irq)
-{
- unsigned long val = apic_read(APIC_LVTT);
- apic_write(APIC_LVTT, val & ~APIC_LVT_MASKED);
-}
-
-static void ack_timer_irq(unsigned int irq)
-{
- ack_APIC_irq();
-}
-
-static struct irq_chip vmi_chip __read_mostly = {
- .name = "VMI-LOCAL",
- .startup = startup_timer_irq,
- .mask = mask_timer_irq,
- .unmask = unmask_timer_irq,
- .ack = ack_timer_irq
-};
-#endif
-
-/** vmi clockevent */
-#define VMI_ALARM_WIRED_IRQ0 0x00000000
-#define VMI_ALARM_WIRED_LVTT 0x00010000
-static int vmi_wiring = VMI_ALARM_WIRED_IRQ0;
-
-static inline int vmi_get_alarm_wiring(void)
-{
- return vmi_wiring;
-}
-
-static void vmi_timer_set_mode(enum clock_event_mode mode,
- struct clock_event_device *evt)
-{
- cycle_t now, cycles_per_hz;
- BUG_ON(!irqs_disabled());
-
- switch (mode) {
- case CLOCK_EVT_MODE_ONESHOT:
- case CLOCK_EVT_MODE_RESUME:
- break;
- case CLOCK_EVT_MODE_PERIODIC:
- cycles_per_hz = vmi_timer_ops.get_cycle_frequency();
- (void)do_div(cycles_per_hz, HZ);
- now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_PERIODIC));
- vmi_timer_ops.set_alarm(VMI_PERIODIC, now, cycles_per_hz);
- break;
- case CLOCK_EVT_MODE_UNUSED:
- case CLOCK_EVT_MODE_SHUTDOWN:
- switch (evt->mode) {
- case CLOCK_EVT_MODE_ONESHOT:
- vmi_timer_ops.cancel_alarm(VMI_ONESHOT);
- break;
- case CLOCK_EVT_MODE_PERIODIC:
- vmi_timer_ops.cancel_alarm(VMI_PERIODIC);
- break;
- default:
- break;
- }
- break;
- default:
- break;
- }
-}
-
-static int vmi_timer_next_event(unsigned long delta,
- struct clock_event_device *evt)
-{
- /* Unfortunately, set_next_event interface only passes relative
- * expiry, but we want absolute expiry. It'd be better if we
- * were passed an absolute expiry, since a bunch of time may
- * have been stolen between the time the delta is computed and
- * when we set the alarm below. */
- cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT));
-
- BUG_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
- vmi_timer_ops.set_alarm(VMI_ONESHOT, now + delta, 0);
- return 0;
-}
-
-static struct clock_event_device vmi_clockevent = {
- .name = "vmi-timer",
- .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
- .shift = 22,
- .set_mode = vmi_timer_set_mode,
- .set_next_event = vmi_timer_next_event,
- .rating = 1000,
- .irq = 0,
-};
-
-static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id)
-{
- struct clock_event_device *evt = &__get_cpu_var(local_events);
- evt->event_handler(evt);
- return IRQ_HANDLED;
-}
-
-static struct irqaction vmi_clock_action = {
- .name = "vmi-timer",
- .handler = vmi_timer_interrupt,
- .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER,
-};
-
-static void __devinit vmi_time_init_clockevent(void)
-{
- cycle_t cycles_per_msec;
- struct clock_event_device *evt;
-
- int cpu = smp_processor_id();
- evt = &__get_cpu_var(local_events);
-
- /* Use cycles_per_msec since div_sc params are 32-bits. */
- cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
- (void)do_div(cycles_per_msec, 1000);
-
- memcpy(evt, &vmi_clockevent, sizeof(*evt));
- /* Must pick .shift such that .mult fits in 32-bits. Choosing
- * .shift to be 22 allows 2^(32-22) cycles per nanosecond
- * before overflow. */
- evt->mult = div_sc(cycles_per_msec, NSEC_PER_MSEC, evt->shift);
- /* Upper bound is clockevent's use of ulong for cycle deltas. */
- evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt);
- evt->min_delta_ns = clockevent_delta2ns(1, evt);
- evt->cpumask = cpumask_of(cpu);
-
- printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n",
- evt->name, evt->mult, evt->shift);
- clockevents_register_device(evt);
-}
-
-void __init vmi_time_init(void)
-{
- unsigned int cpu;
- /* Disable PIT: BIOSes start PIT CH0 with 18.2Hz periodic. */
- outb_pit(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
-
- vmi_time_init_clockevent();
- setup_irq(0, &vmi_clock_action);
- for_each_possible_cpu(cpu)
- per_cpu(vector_irq, cpu)[vmi_get_timer_vector()] = 0;
-}
-
-#ifdef CONFIG_X86_LOCAL_APIC
-void __devinit vmi_time_bsp_init(void)
-{
- /*
- * On APIC systems, we want local timers to fire on each cpu. We do
- * this by programming LVTT to deliver timer events to the IRQ handler
- * for IRQ-0, since we can't re-use the APIC local timer handler
- * without interfering with that code.
- */
- clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
- local_irq_disable();
-#ifdef CONFIG_SMP
- /*
- * XXX handle_percpu_irq only defined for SMP; we need to switch over
- * to using it, since this is a local interrupt, which each CPU must
- * handle individually without locking out or dropping simultaneous
- * local timers on other CPUs. We also don't want to trigger the
- * quirk workaround code for interrupts which gets invoked from
- * handle_percpu_irq via eoi, so we use our own IRQ chip.
- */
- set_irq_chip_and_handler_name(0, &vmi_chip, handle_percpu_irq, "lvtt");
-#else
- set_irq_chip_and_handler_name(0, &vmi_chip, handle_edge_irq, "lvtt");
-#endif
- vmi_wiring = VMI_ALARM_WIRED_LVTT;
- apic_write(APIC_LVTT, vmi_get_timer_vector());
- local_irq_enable();
- clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
-}
-
-void __devinit vmi_time_ap_init(void)
-{
- vmi_time_init_clockevent();
- apic_write(APIC_LVTT, vmi_get_timer_vector());
-}
-#endif
-
-/** vmi clocksource */
-static struct clocksource clocksource_vmi;
-
-static cycle_t read_real_cycles(struct clocksource *cs)
-{
- cycle_t ret = (cycle_t)vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
- return max(ret, clocksource_vmi.cycle_last);
-}
-
-static struct clocksource clocksource_vmi = {
- .name = "vmi-timer",
- .rating = 450,
- .read = read_real_cycles,
- .mask = CLOCKSOURCE_MASK(64),
- .mult = 0, /* to be set */
- .shift = 22,
- .flags = CLOCK_SOURCE_IS_CONTINUOUS,
-};
-
-static int __init init_vmi_clocksource(void)
-{
- cycle_t cycles_per_msec;
-
- if (!vmi_timer_ops.get_cycle_frequency)
- return 0;
- /* Use khz2mult rather than hz2mult since hz arg is only 32-bits. */
- cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
- (void)do_div(cycles_per_msec, 1000);
-
- /* Note that clocksource.{mult, shift} converts in the opposite direction
- * as clockevents. */
- clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec,
- clocksource_vmi.shift);
-
- printk(KERN_WARNING "vmi: registering clock source khz=%lld\n", cycles_per_msec);
- return clocksource_register(&clocksource_vmi);
-
-}
-module_init(init_vmi_clocksource);
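
The clockevent and clocksource registration deleted above both reduce the backend's cycle frequency to a fixed-point mult/shift pair: mult = (cycles_per_msec << shift) / NSEC_PER_MSEC, so a delta in nanoseconds converts to cycles as (ns * mult) >> shift. The standalone sketch below works through that arithmetic; the 1 GHz frequency and the example_div_sc() helper are illustrative assumptions, not values read from any real timer.

/*
 * Userspace sketch of the clockevent mult/shift arithmetic (simplified
 * div_sc()); frequency below is an assumed example value.
 */
#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_MSEC 1000000ULL

/* mult = (cycles_per_msec << shift) / NSEC_PER_MSEC, as div_sc() computes */
static uint32_t example_div_sc(uint64_t cycles_per_msec, uint64_t nsec, int shift)
{
	return (uint32_t)((cycles_per_msec << shift) / nsec);
}

int main(void)
{
	uint64_t cycle_freq_hz = 1000000000ULL;	/* assumed 1 GHz backend clock */
	uint64_t cycles_per_msec = cycle_freq_hz / 1000;
	int shift = 22;				/* same shift the driver picks */
	uint32_t mult = example_div_sc(cycles_per_msec, NSEC_PER_MSEC, shift);

	/* Convert a 5 ms event delta (in ns) into cycles: (ns * mult) >> shift */
	uint64_t delta_ns = 5000000ULL;
	uint64_t delta_cycles = (delta_ns * mult) >> shift;

	printf("mult=%u shift=%d -> %llu ns ~ %llu cycles\n",
	       mult, shift, (unsigned long long)delta_ns,
	       (unsigned long long)delta_cycles);
	return 0;
}
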
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 78d185d797de..d7af4a64c211 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* ld script for the x86 kernel
*
@@ -14,286 +15,285 @@
* put it inside the section definition.
*/
-#ifdef CONFIG_X86_32
-#define LOAD_OFFSET __PAGE_OFFSET
-#else
#define LOAD_OFFSET __START_KERNEL_map
-#endif
+
+#define RUNTIME_DISCARD_EXIT
+#define EMITS_PT_NOTE
+#define RO_EXCEPTION_TABLE_ALIGN 16
#include <asm-generic/vmlinux.lds.h>
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/page_types.h>
+#include <asm/orc_lookup.h>
#include <asm/cache.h>
#include <asm/boot.h>
+#include <asm/kexec.h>
#undef i386 /* in case the preprocessor is a 32bit one */
-OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
+OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT)
#ifdef CONFIG_X86_32
OUTPUT_ARCH(i386)
ENTRY(phys_startup_32)
-jiffies = jiffies_64;
#else
OUTPUT_ARCH(i386:x86-64)
ENTRY(phys_startup_64)
-jiffies_64 = jiffies;
#endif
-PHDRS {
- text PT_LOAD FLAGS(5); /* R_E */
- data PT_LOAD FLAGS(7); /* RWE */
-#ifdef CONFIG_X86_64
- user PT_LOAD FLAGS(7); /* RWE */
- data.init PT_LOAD FLAGS(7); /* RWE */
-#ifdef CONFIG_SMP
- percpu PT_LOAD FLAGS(7); /* RWE */
+jiffies = jiffies_64;
+const_current_task = current_task;
+const_cpu_current_top_of_stack = cpu_current_top_of_stack;
+
+#if defined(CONFIG_X86_64)
+/*
+ * On 64-bit, align RODATA to 2MB so we retain large page mappings for
+ * boundaries spanning kernel text, rodata and data sections.
+ *
+ * However, kernel identity mappings will have different RWX permissions
+ * to the pages mapping to text and to the pages padding (which are freed) the
+ * text section. Hence kernel identity mappings will be broken to smaller
+ * pages. For 64-bit, kernel text and kernel identity mappings are different,
+ * so we can enable protection checks as well as retain 2MB large page
+ * mappings for kernel text.
+ */
+#define X86_ALIGN_RODATA_BEGIN . = ALIGN(HPAGE_SIZE);
+
+#define X86_ALIGN_RODATA_END \
+ . = ALIGN(HPAGE_SIZE); \
+ __end_rodata_hpage_align = .; \
+ __end_rodata_aligned = .;
+
+#define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE);
+#define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE);
+
+/*
+ * This section contains data which will be mapped as decrypted. Memory
+ * encryption operates on a page basis. Make this section PMD-aligned
+ * to avoid splitting the pages while mapping the section early.
+ *
+ * Note: We use a separate section so that only this section gets
+ * decrypted to avoid exposing more than we wish.
+ */
+#define BSS_DECRYPTED \
+ . = ALIGN(PMD_SIZE); \
+ __start_bss_decrypted = .; \
+ __pi___start_bss_decrypted = .; \
+ *(.bss..decrypted); \
+ . = ALIGN(PAGE_SIZE); \
+ __start_bss_decrypted_unused = .; \
+ . = ALIGN(PMD_SIZE); \
+ __end_bss_decrypted = .; \
+ __pi___end_bss_decrypted = .; \
+
+#else
+
+#define X86_ALIGN_RODATA_BEGIN
+#define X86_ALIGN_RODATA_END \
+ . = ALIGN(PAGE_SIZE); \
+ __end_rodata_aligned = .;
+
+#define ALIGN_ENTRY_TEXT_BEGIN
+#define ALIGN_ENTRY_TEXT_END
+#define BSS_DECRYPTED
+
#endif
- data.init2 PT_LOAD FLAGS(7); /* RWE */
+#if defined(CONFIG_X86_64) && defined(CONFIG_KEXEC_CORE)
+#define KEXEC_RELOCATE_KERNEL \
+ . = ALIGN(0x100); \
+ __relocate_kernel_start = .; \
+ *(.text..relocate_kernel); \
+ *(.data..relocate_kernel); \
+ __relocate_kernel_end = .;
+
+ASSERT(__relocate_kernel_end - __relocate_kernel_start <= KEXEC_CONTROL_CODE_MAX_SIZE,
+ "relocate_kernel code too large!")
+#else
+#define KEXEC_RELOCATE_KERNEL
#endif
+PHDRS {
+ text PT_LOAD FLAGS(5); /* R_E */
+ data PT_LOAD FLAGS(6); /* RW_ */
note PT_NOTE FLAGS(0); /* ___ */
}
SECTIONS
{
+ . = __START_KERNEL;
#ifdef CONFIG_X86_32
- . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
- phys_startup_32 = startup_32 - LOAD_OFFSET;
+ phys_startup_32 = ABSOLUTE(startup_32 - LOAD_OFFSET);
#else
- . = __START_KERNEL;
- phys_startup_64 = startup_64 - LOAD_OFFSET;
+ phys_startup_64 = ABSOLUTE(startup_64 - LOAD_OFFSET);
#endif
/* Text and read-only data */
-
- /* bootstrapping code */
- .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
- _text = .;
- *(.text.head)
- } :text = 0x9090
-
- /* The rest of the text */
.text : AT(ADDR(.text) - LOAD_OFFSET) {
-#ifdef CONFIG_X86_32
- /* not really needed, already page aligned */
- . = ALIGN(PAGE_SIZE);
- *(.text.page_aligned)
-#endif
- . = ALIGN(8);
+ _text = .;
+ __pi__text = .;
_stext = .;
+ ALIGN_ENTRY_TEXT_BEGIN
+ *(.text..__x86.rethunk_untrain)
+ ENTRY_TEXT
+
+#ifdef CONFIG_MITIGATION_SRSO
+ /*
+ * See the comment above srso_alias_untrain_ret()'s
+ * definition.
+ */
+ . = srso_alias_untrain_ret | (1 << 2) | (1 << 8) | (1 << 14) | (1 << 20);
+ *(.text..__x86.rethunk_safe)
+#endif
+ ALIGN_ENTRY_TEXT_END
+
TEXT_TEXT
SCHED_TEXT
LOCK_TEXT
KPROBES_TEXT
- IRQENTRY_TEXT
- *(.fixup)
+ SOFTIRQENTRY_TEXT
+#ifdef CONFIG_MITIGATION_RETPOLINE
+ *(.text..__x86.indirect_thunk)
+ *(.text..__x86.return_thunk)
+#endif
+ STATIC_CALL_TEXT
*(.gnu.warning)
- /* End of text section */
- _etext = .;
- } :text = 0x9090
- NOTES :text :note
+ } :text = 0xcccccccc
- /* Exception table */
- . = ALIGN(16);
- __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
- __start___ex_table = .;
- *(__ex_table)
- __stop___ex_table = .;
- } :text = 0x9090
+ /* End of text section, which should occupy whole number of pages */
+ _etext = .;
+ . = ALIGN(PAGE_SIZE);
- RODATA
+ X86_ALIGN_RODATA_BEGIN
+ RO_DATA(PAGE_SIZE)
+ X86_ALIGN_RODATA_END
/* Data */
- . = ALIGN(PAGE_SIZE);
.data : AT(ADDR(.data) - LOAD_OFFSET) {
/* Start of data section */
_sdata = .;
- DATA_DATA
- CONSTRUCTORS
- } :data
-
-#ifdef CONFIG_X86_32
- /* 32 bit has nosave before _edata */
- . = ALIGN(PAGE_SIZE);
- .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
- __nosave_begin = .;
- *(.data.nosave)
- . = ALIGN(PAGE_SIZE);
- __nosave_end = .;
- }
-#endif
- . = ALIGN(PAGE_SIZE);
- .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
- *(.data.page_aligned)
- *(.data.idt)
- }
+ /* init_task */
+ INIT_TASK_DATA(THREAD_SIZE)
-#ifdef CONFIG_X86_32
- . = ALIGN(32);
-#else
- . = ALIGN(PAGE_SIZE);
- . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
-#endif
- .data.cacheline_aligned :
- AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
- *(.data.cacheline_aligned)
- }
+ /* equivalent to task_pt_regs(&init_task) */
+ __top_init_kernel_stack = __end_init_stack - TOP_OF_KERNEL_STACK_PADDING - PTREGS_SIZE;
- /* rarely changed data like cpu maps */
#ifdef CONFIG_X86_32
- . = ALIGN(32);
-#else
- . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
+ /* 32 bit has nosave before _edata */
+ NOSAVE_DATA
#endif
- .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
- *(.data.read_mostly)
-
- /* End of data section */
- _edata = .;
- }
-#ifdef CONFIG_X86_64
+ PAGE_ALIGNED_DATA(PAGE_SIZE)
-#define VSYSCALL_ADDR (-10*1024*1024)
-#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \
- SIZEOF(.data.read_mostly) + 4095) & ~(4095))
-#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \
- SIZEOF(.data.read_mostly) + 4095) & ~(4095))
+ CACHE_HOT_DATA(L1_CACHE_BYTES)
-#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
-#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
+ CACHELINE_ALIGNED_DATA(L1_CACHE_BYTES)
-#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
-#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
+ DATA_DATA
+ CONSTRUCTORS
+ KEXEC_RELOCATE_KERNEL
- . = VSYSCALL_ADDR;
- .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) {
- *(.vsyscall_0)
- } :user
+ /* rarely changed data like cpu maps */
+ READ_MOSTLY_DATA(INTERNODE_CACHE_BYTES)
- __vsyscall_0 = VSYSCALL_VIRT_ADDR;
+ /* End of data section */
+ _edata = .;
+ } :data
- . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
- .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
- *(.vsyscall_fn)
- }
+ BUG_TABLE
- . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
- .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
- *(.vsyscall_gtod_data)
- }
+ ORC_UNWIND_TABLE
- vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
- .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) {
- *(.vsyscall_clock)
+ /* Init code and data - will be freed after init */
+ . = ALIGN(PAGE_SIZE);
+ .init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) {
+ __init_begin = .; /* paired with __init_end */
}
- vsyscall_clock = VVIRT(.vsyscall_clock);
-
- .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) {
- *(.vsyscall_1)
- }
- .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) {
- *(.vsyscall_2)
- }
+ INIT_TEXT_SECTION(PAGE_SIZE)
- .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) {
- *(.vgetcpu_mode)
+ /*
+ * Section for code used exclusively before alternatives are run. All
+ * references to such code must be patched out by alternatives, normally
+ * by using X86_FEATURE_ALWAYS CPU feature bit.
+ *
+ * See static_cpu_has() for an example.
+ */
+ .altinstr_aux : AT(ADDR(.altinstr_aux) - LOAD_OFFSET) {
+ *(.altinstr_aux)
+ . = ALIGN(PAGE_SIZE);
+ __inittext_end = .;
}
- vgetcpu_mode = VVIRT(.vgetcpu_mode);
- . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
- .jiffies : AT(VLOAD(.jiffies)) {
- *(.jiffies)
- }
- jiffies = VVIRT(.jiffies);
+ INIT_DATA_SECTION(16)
- .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) {
- *(.vsyscall_3)
+ .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
+ __x86_cpu_dev_start = .;
+ *(.x86_cpu_dev.init)
+ __x86_cpu_dev_end = .;
}
- . = VSYSCALL_VIRT_ADDR + PAGE_SIZE;
-
-#undef VSYSCALL_ADDR
-#undef VSYSCALL_PHYS_ADDR
-#undef VSYSCALL_VIRT_ADDR
-#undef VLOAD_OFFSET
-#undef VLOAD
-#undef VVIRT_OFFSET
-#undef VVIRT
-
-#endif /* CONFIG_X86_64 */
-
- /* init_task */
- . = ALIGN(THREAD_SIZE);
- .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
- *(.data.init_task)
+#ifdef CONFIG_X86_INTEL_MID
+ .x86_intel_mid_dev.init : AT(ADDR(.x86_intel_mid_dev.init) - \
+ LOAD_OFFSET) {
+ __x86_intel_mid_dev_start = .;
+ *(.x86_intel_mid_dev.init)
+ __x86_intel_mid_dev_end = .;
}
-#ifdef CONFIG_X86_64
- :data.init
#endif
+#ifdef CONFIG_MITIGATION_RETPOLINE
/*
- * smp_locks might be freed after init
- * start/end must be page aligned
+ * List of instructions that call/jmp/jcc to retpoline thunks
+ * __x86_indirect_thunk_*(). These instructions can be patched along
+ * with alternatives, after which the section can be freed.
*/
- . = ALIGN(PAGE_SIZE);
- .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
- __smp_locks = .;
- *(.smp_locks)
- __smp_locks_end = .;
- . = ALIGN(PAGE_SIZE);
- }
-
- /* Init code and data - will be freed after init */
- . = ALIGN(PAGE_SIZE);
- .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
- __init_begin = .; /* paired with __init_end */
- _sinittext = .;
- INIT_TEXT
- _einittext = .;
- }
-
- .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
- INIT_DATA
+ . = ALIGN(8);
+ .retpoline_sites : AT(ADDR(.retpoline_sites) - LOAD_OFFSET) {
+ __retpoline_sites = .;
+ *(.retpoline_sites)
+ __retpoline_sites_end = .;
}
- . = ALIGN(16);
- .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
- __setup_start = .;
- *(.init.setup)
- __setup_end = .;
- }
- .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
- __initcall_start = .;
- INITCALLS
- __initcall_end = .;
+ . = ALIGN(8);
+ .return_sites : AT(ADDR(.return_sites) - LOAD_OFFSET) {
+ __return_sites = .;
+ *(.return_sites)
+ __return_sites_end = .;
}
- .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
- __con_initcall_start = .;
- *(.con_initcall.init)
- __con_initcall_end = .;
+ . = ALIGN(8);
+ .call_sites : AT(ADDR(.call_sites) - LOAD_OFFSET) {
+ __call_sites = .;
+ *(.call_sites)
+ __call_sites_end = .;
}
+#endif
- .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
- __x86_cpu_dev_start = .;
- *(.x86_cpu_dev.init)
- __x86_cpu_dev_end = .;
+#ifdef CONFIG_X86_KERNEL_IBT
+ . = ALIGN(8);
+ .ibt_endbr_seal : AT(ADDR(.ibt_endbr_seal) - LOAD_OFFSET) {
+ __ibt_endbr_seal = .;
+ *(.ibt_endbr_seal)
+ __ibt_endbr_seal_end = .;
}
+#endif
- SECURITY_INIT
-
+#ifdef CONFIG_FINEIBT
. = ALIGN(8);
- .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
- __parainstructions = .;
- *(.parainstructions)
- __parainstructions_end = .;
+ .cfi_sites : AT(ADDR(.cfi_sites) - LOAD_OFFSET) {
+ __cfi_sites = .;
+ *(.cfi_sites)
+ __cfi_sites_end = .;
}
+#endif
+ /*
+ * struct alt_inst entries. From the header (alternative.h):
+ * "Alternative instructions for different CPU types or capabilities"
+ * Think locking instructions on spinlocks.
+ */
. = ALIGN(8);
.altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
__alt_instructions = .;
@@ -301,13 +301,26 @@ SECTIONS
__alt_instructions_end = .;
}
+ /*
+ * And here are the replacement instructions. The linker sticks
+ * them as binary blobs. The .altinstructions has enough data to
+ * get the address and the length of them to patch the kernel safely.
+ */
.altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
*(.altinstr_replacement)
}
+ . = ALIGN(8);
+ .apicdrivers : AT(ADDR(.apicdrivers) - LOAD_OFFSET) {
+ __apicdrivers = .;
+ *(.apicdrivers);
+ __apicdrivers_end = .;
+ }
+
+ . = ALIGN(8);
/*
- * .exit.text is discard at runtime, not link time, to deal with
- * references from .altinstructions and .eh_frame
+ * .exit.text is discarded at runtime, not link time, to deal with
+ * references from .altinstructions
*/
.exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
EXIT_TEXT
@@ -317,28 +330,11 @@ SECTIONS
EXIT_DATA
}
-#ifdef CONFIG_BLK_DEV_INITRD
- . = ALIGN(PAGE_SIZE);
- .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
- __initramfs_start = .;
- *(.init.ramfs)
- __initramfs_end = .;
- }
-#endif
+ PERCPU_SECTION(L1_CACHE_BYTES)
+ ASSERT(__per_cpu_hot_end - __per_cpu_hot_start <= 64, "percpu cache hot data too large")
-#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
- /*
- * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
- * output PHDR, so the next output section - __data_nosave - should
- * start another section data.init2. Also, pda should be at the head of
- * percpu area. Preallocate it and define the percpu offset symbol
- * so that it can be accessed as a percpu variable.
- */
- . = ALIGN(PAGE_SIZE);
- PERCPU_VADDR(0, :percpu)
-#else
- PERCPU(PAGE_SIZE)
-#endif
+ RUNTIME_CONST_VARIABLES
+ RUNTIME_CONST(ptr, USER_PTR_MAX)
. = ALIGN(PAGE_SIZE);
@@ -347,80 +343,194 @@ SECTIONS
__init_end = .;
}
+ /*
+ * smp_locks might be freed after init
+ * start/end must be page aligned
+ */
+ . = ALIGN(PAGE_SIZE);
+ .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
+ __smp_locks = .;
+ *(.smp_locks)
+ . = ALIGN(PAGE_SIZE);
+ __smp_locks_end = .;
+ }
+
#ifdef CONFIG_X86_64
.data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
- . = ALIGN(PAGE_SIZE);
- __nosave_begin = .;
- *(.data.nosave)
- . = ALIGN(PAGE_SIZE);
- __nosave_end = .;
- } :data.init2
- /* use another section data.init2, see PERCPU_VADDR() above */
+ NOSAVE_DATA
+ }
#endif
/* BSS */
. = ALIGN(PAGE_SIZE);
.bss : AT(ADDR(.bss) - LOAD_OFFSET) {
__bss_start = .;
- *(.bss.page_aligned)
- *(.bss)
- . = ALIGN(4);
+ *(.bss..page_aligned)
+ . = ALIGN(PAGE_SIZE);
+ *(BSS_MAIN)
+ BSS_DECRYPTED
+ . = ALIGN(PAGE_SIZE);
__bss_stop = .;
}
+ /*
+ * The memory occupied from _text to here, __end_of_kernel_reserve, is
+ * automatically reserved in setup_arch(). Anything after here must be
+ * explicitly reserved using memblock_reserve() or it will be discarded
+ * and treated as available memory.
+ */
+ __end_of_kernel_reserve = .;
+
. = ALIGN(PAGE_SIZE);
.brk : AT(ADDR(.brk) - LOAD_OFFSET) {
__brk_base = .;
. += 64 * 1024; /* 64k alignment slop space */
- *(.brk_reservation) /* areas brk users have reserved */
+ *(.bss..brk) /* areas brk users have reserved */
__brk_limit = .;
}
- .end : AT(ADDR(.end) - LOAD_OFFSET) {
- _end = .;
- }
+ . = ALIGN(PAGE_SIZE); /* keep VO_INIT_SIZE page aligned */
+ _end = .;
+ __pi__end = .;
- /* Sections to be discarded */
- /DISCARD/ : {
- *(.exitcall.exit)
- *(.eh_frame)
- *(.discard)
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ /*
+ * Early scratch/workarea section: Lives outside of the kernel proper
+ * (_text - _end).
+ *
+ * Resides after _end because even though the .brk section is after
+ * __end_of_kernel_reserve, the .brk section is later reserved as a
+ * part of the kernel. Since it is located after __end_of_kernel_reserve
+ * it will be discarded and become part of the available memory. As
+ * such, it can only be used by very early boot code and must not be
+ * needed afterwards.
+ *
+ * Currently used by SME for performing in-place encryption of the
+ * kernel during boot. Resides on a 2MB boundary to simplify the
+ * pagetable setup used for SME in-place encryption.
+ */
+ . = ALIGN(HPAGE_SIZE);
+ .init.scratch : AT(ADDR(.init.scratch) - LOAD_OFFSET) {
+ __init_scratch_begin = .;
+ *(.init.scratch)
+ . = ALIGN(HPAGE_SIZE);
+ __init_scratch_end = .;
}
+#endif
- STABS_DEBUG
- DWARF_DEBUG
-}
+ STABS_DEBUG
+ DWARF_DEBUG
+#ifdef CONFIG_PROPELLER_CLANG
+ .llvm_bb_addr_map : { *(.llvm_bb_addr_map) }
+#endif
+ ELF_DETAILS
-#ifdef CONFIG_X86_32
-. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
- "kernel image bigger than KERNEL_IMAGE_SIZE");
+ DISCARDS
+
+ /*
+ * Make sure that the .got.plt is either completely empty or it
+ * contains only the lazy dispatch entries.
+ */
+ .got.plt (INFO) : { *(.got.plt) }
+ ASSERT(SIZEOF(.got.plt) == 0 ||
+#ifdef CONFIG_X86_64
+ SIZEOF(.got.plt) == 0x18,
#else
+ SIZEOF(.got.plt) == 0xc,
+#endif
+ "Unexpected GOT/PLT entries detected!")
+
+ /*
+ * Sections that should stay zero sized, which is safer to
+ * explicitly check instead of blindly discarding.
+ */
+ .got : {
+ *(.got) *(.igot.*)
+ }
+ ASSERT(SIZEOF(.got) == 0, "Unexpected GOT entries detected!")
+
+ .plt : {
+ *(.plt) *(.plt.*) *(.iplt)
+ }
+ ASSERT(SIZEOF(.plt) == 0, "Unexpected run-time procedure linkages detected!")
+
+ .rel.dyn : {
+ *(.rel.*) *(.rel_*)
+ }
+ ASSERT(SIZEOF(.rel.dyn) == 0, "Unexpected run-time relocations (.rel) detected!")
+
+ .rela.dyn : {
+ *(.rela.*) *(.rela_*)
+ }
+ ASSERT(SIZEOF(.rela.dyn) == 0, "Unexpected run-time relocations (.rela) detected!")
+}
+
/*
- * Per-cpu symbols which need to be offset from __per_cpu_load
- * for the boot processor.
+ * COMPILE_TEST kernels can be large - CONFIG_KASAN, for example, can cause
+ * this. Let's assume that nobody will be running a COMPILE_TEST kernel and
+ * let's assert that fuller build coverage is more valuable than being able to
+ * run a COMPILE_TEST kernel.
*/
-#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load
-INIT_PER_CPU(gdt_page);
-INIT_PER_CPU(irq_stack_union);
-
+#ifndef CONFIG_COMPILE_TEST
/*
- * Build-time check on the image size:
+ * The ASSERT() sync to . is intentional, for binutils 2.14 compatibility:
*/
-. = ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
+. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
"kernel image bigger than KERNEL_IMAGE_SIZE");
+#endif
+
+/* needed for Clang - see arch/x86/entry/entry.S */
+PROVIDE(__ref_stack_chk_guard = __stack_chk_guard);
+
+#ifdef CONFIG_X86_64
+
+#ifdef CONFIG_MITIGATION_UNRET_ENTRY
+. = ASSERT((retbleed_return_thunk & 0x3f) == 0, "retbleed_return_thunk not cacheline-aligned");
+#endif
-#ifdef CONFIG_SMP
-. = ASSERT((per_cpu__irq_stack_union == 0),
- "irq_stack_union is not at start of per-cpu area");
+#ifdef CONFIG_MITIGATION_SRSO
+. = ASSERT((srso_safe_ret & 0x3f) == 0, "srso_safe_ret not cacheline-aligned");
+/*
+ * GNU ld cannot do XOR until 2.41.
+ * https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=f6f78318fca803c4907fb8d7f6ded8295f1947b1
+ *
+ * LLVM lld cannot do XOR until lld-17.
+ * https://github.com/llvm/llvm-project/commit/fae96104d4378166cbe5c875ef8ed808a356f3fb
+ *
+ * Instead do: (A | B) - (A & B) in order to compute the XOR
+ * of the two function addresses:
+ */
+. = ASSERT(((ABSOLUTE(srso_alias_untrain_ret) | srso_alias_safe_ret) -
+ (ABSOLUTE(srso_alias_untrain_ret) & srso_alias_safe_ret)) == ((1 << 2) | (1 << 8) | (1 << 14) | (1 << 20)),
+ "SRSO function pair won't alias");
#endif
-#endif /* CONFIG_X86_32 */
+#if defined(CONFIG_MITIGATION_ITS) && !defined(CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B)
+. = ASSERT(__x86_indirect_its_thunk_rax & 0x20, "__x86_indirect_thunk_rax not in second half of cacheline");
+. = ASSERT(((__x86_indirect_its_thunk_rcx - __x86_indirect_its_thunk_rax) % 64) == 0, "Indirect thunks are not cacheline apart");
+. = ASSERT(__x86_indirect_its_thunk_array == __x86_indirect_its_thunk_rax, "Gap in ITS thunk array");
+#endif
-#ifdef CONFIG_KEXEC
-#include <asm/kexec.h>
+#if defined(CONFIG_MITIGATION_ITS) && !defined(CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B)
+. = ASSERT(its_return_thunk & 0x20, "its_return_thunk not in second half of cacheline");
+#endif
+
+#endif /* CONFIG_X86_64 */
-. = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
- "kexec control code size is too big");
+/*
+ * The symbols below are referenced using relative relocations in the
+ * respective ELF notes. This produces build time constants that the
+ * linker will never mark as relocatable. (Using just ABSOLUTE() is not
+ * sufficient for that).
+ */
+#ifdef CONFIG_XEN_PV
+xen_elfnote_entry_value =
+ ABSOLUTE(xen_elfnote_entry) + ABSOLUTE(startup_xen);
+#endif
+#ifdef CONFIG_PVH
+xen_elfnote_phys32_entry_value =
+ ABSOLUTE(xen_elfnote_phys32_entry) + ABSOLUTE(pvh_start_xen - LOAD_OFFSET);
#endif
+#include "../boot/startup/exports.h"
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index a1d804bcd483..73511332bb67 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* vSMPowered(tm) systems specific initialization
* Copyright (C) 2005 ScaleMP Inc.
*
- * Use of this code is subject to the terms and conditions of the
- * GNU general public license version 2. See "COPYING" or
- * http://www.gnu.org/licenses/gpl.html
- *
* Ravikiran Thirumalai <kiran@scalemp.com>,
* Shai Fultheim <shai@scalemp.com>
* Paravirt ops integration: Glauber de Oliveira Costa <gcosta@redhat.com>,
@@ -15,6 +12,8 @@
#include <linux/init.h>
#include <linux/pci_ids.h>
#include <linux/pci_regs.h>
+#include <linux/smp.h>
+#include <linux/irq.h>
#include <asm/apic.h>
#include <asm/pci-direct.h>
@@ -22,65 +21,10 @@
#include <asm/paravirt.h>
#include <asm/setup.h>
-#if defined CONFIG_PCI && defined CONFIG_PARAVIRT
-/*
- * Interrupt control on vSMPowered systems:
- * ~AC is a shadow of IF. If IF is 'on' AC should be 'off'
- * and vice versa.
- */
-
-static unsigned long vsmp_save_fl(void)
-{
- unsigned long flags = native_save_fl();
-
- if (!(flags & X86_EFLAGS_IF) || (flags & X86_EFLAGS_AC))
- flags &= ~X86_EFLAGS_IF;
- return flags;
-}
-PV_CALLEE_SAVE_REGS_THUNK(vsmp_save_fl);
+#define TOPOLOGY_REGISTER_OFFSET 0x10
-static void vsmp_restore_fl(unsigned long flags)
-{
- if (flags & X86_EFLAGS_IF)
- flags &= ~X86_EFLAGS_AC;
- else
- flags |= X86_EFLAGS_AC;
- native_restore_fl(flags);
-}
-PV_CALLEE_SAVE_REGS_THUNK(vsmp_restore_fl);
-
-static void vsmp_irq_disable(void)
-{
- unsigned long flags = native_save_fl();
-
- native_restore_fl((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC);
-}
-PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_disable);
-
-static void vsmp_irq_enable(void)
-{
- unsigned long flags = native_save_fl();
-
- native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
-}
-PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_enable);
-
-static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf,
- unsigned long addr, unsigned len)
-{
- switch (type) {
- case PARAVIRT_PATCH(pv_irq_ops.irq_enable):
- case PARAVIRT_PATCH(pv_irq_ops.irq_disable):
- case PARAVIRT_PATCH(pv_irq_ops.save_fl):
- case PARAVIRT_PATCH(pv_irq_ops.restore_fl):
- return paravirt_patch_default(type, clobbers, ibuf, addr, len);
- default:
- return native_patch(type, clobbers, ibuf, addr, len);
- }
-
-}
-
-static void __init set_vsmp_pv_ops(void)
+#ifdef CONFIG_PCI
+static void __init set_vsmp_ctl(void)
{
void __iomem *address;
unsigned int cap, ctl, cfg;
@@ -92,29 +36,25 @@ static void __init set_vsmp_pv_ops(void)
ctl = readl(address + 4);
printk(KERN_INFO "vSMP CTL: capabilities:0x%08x control:0x%08x\n",
cap, ctl);
- if (cap & ctl & (1 << 4)) {
- /* Setup irq ops and turn on vSMP IRQ fastpath handling */
- pv_irq_ops.irq_disable = PV_CALLEE_SAVE(vsmp_irq_disable);
- pv_irq_ops.irq_enable = PV_CALLEE_SAVE(vsmp_irq_enable);
- pv_irq_ops.save_fl = PV_CALLEE_SAVE(vsmp_save_fl);
- pv_irq_ops.restore_fl = PV_CALLEE_SAVE(vsmp_restore_fl);
- pv_init_ops.patch = vsmp_patch;
-
- ctl &= ~(1 << 4);
- writel(ctl, address + 4);
- ctl = readl(address + 4);
- printk(KERN_INFO "vSMP CTL: control set to:0x%08x\n", ctl);
+
+ /* If possible, let the vSMP foundation route the interrupt optimally */
+#ifdef CONFIG_SMP
+ if (cap & ctl & BIT(8)) {
+ ctl &= ~BIT(8);
+
+#ifdef CONFIG_PROC_FS
+ /* Don't let users change irq affinity via procfs */
+ no_irq_affinity = 1;
+#endif
}
+#endif
+
+ writel(ctl, address + 4);
+ ctl = readl(address + 4);
+ pr_info("vSMP CTL: control set to:0x%08x\n", ctl);
early_iounmap(address, 8);
}
-#else
-static void __init set_vsmp_pv_ops(void)
-{
-}
-#endif
-
-#ifdef CONFIG_PCI
static int is_vsmp = -1;
static void __init detect_vsmp_box(void)
@@ -130,7 +70,7 @@ static void __init detect_vsmp_box(void)
is_vsmp = 1;
}
-int is_vsmp_box(void)
+static int is_vsmp_box(void)
{
if (is_vsmp != -1)
return is_vsmp;
@@ -144,17 +84,57 @@ int is_vsmp_box(void)
static void __init detect_vsmp_box(void)
{
}
-int is_vsmp_box(void)
+static int is_vsmp_box(void)
{
return 0;
}
+static void __init set_vsmp_ctl(void)
+{
+}
#endif
+
+static void __init vsmp_cap_cpus(void)
+{
+#if !defined(CONFIG_X86_VSMP) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
+ void __iomem *address;
+ unsigned int cfg, topology, node_shift, maxcpus;
+
+ /*
+ * CONFIG_X86_VSMP is not configured, so limit the number of CPUs to the
+ * ones present in the first board, unless explicitly overridden by
+ * setup_max_cpus
+ */
+ if (setup_max_cpus != NR_CPUS)
+ return;
+
+ /* Read the vSMP Foundation topology register */
+ cfg = read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0);
+ address = early_ioremap(cfg + TOPOLOGY_REGISTER_OFFSET, 4);
+ if (WARN_ON(!address))
+ return;
+
+ topology = readl(address);
+ node_shift = (topology >> 16) & 0x7;
+ if (!node_shift)
+ /* The value 0 should be decoded as 8 */
+ node_shift = 8;
+ maxcpus = (topology & ((1 << node_shift) - 1)) + 1;
+
+ pr_info("vSMP CTL: Capping CPUs to %d (CONFIG_X86_VSMP is unset)\n",
+ maxcpus);
+ setup_max_cpus = maxcpus;
+ early_iounmap(address, 4);
+#endif
+}
+
void __init vsmp_init(void)
{
detect_vsmp_box();
if (!is_vsmp_box())
return;
- set_vsmp_pv_ops();
+ vsmp_cap_cpus();
+
+ set_vsmp_ctl();
return;
}
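
As an aside on the hunk above: vsmp_cap_cpus() derives the per-board CPU limit from the vSMP Foundation topology register. A minimal user-space sketch of the same decoding, using a hypothetical register value (illustrative only, not part of the patch):

/* Illustrative only: mirrors the decoding done by vsmp_cap_cpus(). */
#include <stdio.h>

static unsigned int decode_vsmp_maxcpus(unsigned int topology)
{
        unsigned int node_shift = (topology >> 16) & 0x7;

        if (!node_shift)
                node_shift = 8;         /* the value 0 is decoded as 8 */

        return (topology & ((1u << node_shift) - 1)) + 1;
}

int main(void)
{
        /* hypothetical register value: node_shift = 3, 7 + 1 = 8 CPUs per board */
        printf("%u\n", decode_vsmp_maxcpus(0x30007));
        return 0;
}
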
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
deleted file mode 100644
index 25ee06a80aad..000000000000
--- a/arch/x86/kernel/vsyscall_64.c
+++ /dev/null
@@ -1,314 +0,0 @@
-/*
- * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
- * Copyright 2003 Andi Kleen, SuSE Labs.
- *
- * Thanks to hpa@transmeta.com for some useful hint.
- * Special thanks to Ingo Molnar for his early experience with
- * a different vsyscall implementation for Linux/IA32 and for the name.
- *
- * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
- * at virtual address -10Mbyte+1024bytes etc... There are at max 4
- * vsyscalls. One vsyscall can reserve more than 1 slot to avoid
- * jumping out of line if necessary. We cannot add more with this
- * mechanism because older kernels won't return -ENOSYS.
- * If we want more than four we need a vDSO.
- *
- * Note: the concept clashes with user mode linux. If you use UML and
- * want per guest time just set the kernel.vsyscall64 sysctl to 0.
- */
-
-/* Disable profiling for userspace code: */
-#define DISABLE_BRANCH_PROFILING
-
-#include <linux/time.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/timer.h>
-#include <linux/seqlock.h>
-#include <linux/jiffies.h>
-#include <linux/sysctl.h>
-#include <linux/clocksource.h>
-#include <linux/getcpu.h>
-#include <linux/cpu.h>
-#include <linux/smp.h>
-#include <linux/notifier.h>
-
-#include <asm/vsyscall.h>
-#include <asm/pgtable.h>
-#include <asm/page.h>
-#include <asm/unistd.h>
-#include <asm/fixmap.h>
-#include <asm/errno.h>
-#include <asm/io.h>
-#include <asm/segment.h>
-#include <asm/desc.h>
-#include <asm/topology.h>
-#include <asm/vgtod.h>
-
-#define __vsyscall(nr) \
- __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
-#define __syscall_clobber "r11","cx","memory"
-
-/*
- * vsyscall_gtod_data contains data that is :
- * - readonly from vsyscalls
- * - written by timer interrupt or systcl (/proc/sys/kernel/vsyscall64)
- * Try to keep this structure as small as possible to avoid cache line ping pongs
- */
-int __vgetcpu_mode __section_vgetcpu_mode;
-
-struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
-{
- .lock = SEQLOCK_UNLOCKED,
- .sysctl_enabled = 1,
-};
-
-void update_vsyscall_tz(void)
-{
- unsigned long flags;
-
- write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
- /* sys_tz has changed */
- vsyscall_gtod_data.sys_tz = sys_tz;
- write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
-}
-
-void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
-{
- unsigned long flags;
-
- write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
- /* copy vsyscall data */
- vsyscall_gtod_data.clock.vread = clock->vread;
- vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
- vsyscall_gtod_data.clock.mask = clock->mask;
- vsyscall_gtod_data.clock.mult = clock->mult;
- vsyscall_gtod_data.clock.shift = clock->shift;
- vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
- vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
- vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
- write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
-}
-
-/* RED-PEN may want to readd seq locking, but then the variable should be
- * write-once.
- */
-static __always_inline void do_get_tz(struct timezone * tz)
-{
- *tz = __vsyscall_gtod_data.sys_tz;
-}
-
-static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
-{
- int ret;
- asm volatile("syscall"
- : "=a" (ret)
- : "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
- : __syscall_clobber );
- return ret;
-}
-
-static __always_inline long time_syscall(long *t)
-{
- long secs;
- asm volatile("syscall"
- : "=a" (secs)
- : "0" (__NR_time),"D" (t) : __syscall_clobber);
- return secs;
-}
-
-static __always_inline void do_vgettimeofday(struct timeval * tv)
-{
- cycle_t now, base, mask, cycle_delta;
- unsigned seq;
- unsigned long mult, shift, nsec;
- cycle_t (*vread)(void);
- do {
- seq = read_seqbegin(&__vsyscall_gtod_data.lock);
-
- vread = __vsyscall_gtod_data.clock.vread;
- if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) {
- gettimeofday(tv,NULL);
- return;
- }
-
- now = vread();
- base = __vsyscall_gtod_data.clock.cycle_last;
- mask = __vsyscall_gtod_data.clock.mask;
- mult = __vsyscall_gtod_data.clock.mult;
- shift = __vsyscall_gtod_data.clock.shift;
-
- tv->tv_sec = __vsyscall_gtod_data.wall_time_sec;
- nsec = __vsyscall_gtod_data.wall_time_nsec;
- } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
-
- /* calculate interval: */
- cycle_delta = (now - base) & mask;
- /* convert to nsecs: */
- nsec += (cycle_delta * mult) >> shift;
-
- while (nsec >= NSEC_PER_SEC) {
- tv->tv_sec += 1;
- nsec -= NSEC_PER_SEC;
- }
- tv->tv_usec = nsec / NSEC_PER_USEC;
-}
-
-int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
-{
- if (tv)
- do_vgettimeofday(tv);
- if (tz)
- do_get_tz(tz);
- return 0;
-}
-
-/* This will break when the xtime seconds get inaccurate, but that is
- * unlikely */
-time_t __vsyscall(1) vtime(time_t *t)
-{
- struct timeval tv;
- time_t result;
- if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
- return time_syscall(t);
-
- vgettimeofday(&tv, NULL);
- result = tv.tv_sec;
- if (t)
- *t = result;
- return result;
-}
-
-/* Fast way to get current CPU and node.
- This helps to do per node and per CPU caches in user space.
- The result is not guaranteed without CPU affinity, but usually
- works out because the scheduler tries to keep a thread on the same
- CPU.
-
- tcache must point to a two element sized long array.
- All arguments can be NULL. */
-long __vsyscall(2)
-vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
-{
- unsigned int p;
- unsigned long j = 0;
-
- /* Fast cache - only recompute value once per jiffies and avoid
- relatively costly rdtscp/cpuid otherwise.
- This works because the scheduler usually keeps the process
- on the same CPU and this syscall doesn't guarantee its
- results anyways.
- We do this here because otherwise user space would do it on
- its own in a likely inferior way (no access to jiffies).
- If you don't like it pass NULL. */
- if (tcache && tcache->blob[0] == (j = __jiffies)) {
- p = tcache->blob[1];
- } else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
- /* Load per CPU data from RDTSCP */
- native_read_tscp(&p);
- } else {
- /* Load per CPU data from GDT */
- asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
- }
- if (tcache) {
- tcache->blob[0] = j;
- tcache->blob[1] = p;
- }
- if (cpu)
- *cpu = p & 0xfff;
- if (node)
- *node = p >> 12;
- return 0;
-}
-
-static long __vsyscall(3) venosys_1(void)
-{
- return -ENOSYS;
-}
-
-#ifdef CONFIG_SYSCTL
-
-static int
-vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
-}
-
-static ctl_table kernel_table2[] = {
- { .procname = "vsyscall64",
- .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = vsyscall_sysctl_change },
- {}
-};
-
-static ctl_table kernel_root_table2[] = {
- { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555,
- .child = kernel_table2 },
- {}
-};
-#endif
-
-/* Assume __initcall executes before all user space. Hopefully kmod
- doesn't violate that. We'll find out if it does. */
-static void __cpuinit vsyscall_set_cpu(int cpu)
-{
- unsigned long d;
- unsigned long node = 0;
-#ifdef CONFIG_NUMA
- node = cpu_to_node(cpu);
-#endif
- if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
- write_rdtscp_aux((node << 12) | cpu);
-
- /* Store cpu number in limit so that it can be loaded quickly
- in user space in vgetcpu.
- 12 bits for the CPU and 8 bits for the node. */
- d = 0x0f40000000000ULL;
- d |= cpu;
- d |= (node & 0xf) << 12;
- d |= (node >> 4) << 48;
- write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
-}
-
-static void __cpuinit cpu_vsyscall_init(void *arg)
-{
- /* preemption should be already off */
- vsyscall_set_cpu(raw_smp_processor_id());
-}
-
-static int __cpuinit
-cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
-{
- long cpu = (long)arg;
- if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
- smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
- return NOTIFY_DONE;
-}
-
-void __init map_vsyscall(void)
-{
- extern char __vsyscall_0;
- unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
-
- /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
- __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
-}
-
-static int __init vsyscall_init(void)
-{
- BUG_ON(((unsigned long) &vgettimeofday !=
- VSYSCALL_ADDR(__NR_vgettimeofday)));
- BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
- BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
- BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
-#ifdef CONFIG_SYSCTL
- register_sysctl_table(kernel_root_table2);
-#endif
- on_each_cpu(cpu_vsyscall_init, NULL, 1);
- hotcpu_notifier(cpu_vsyscall_notifier, 0);
- return 0;
-}
-
-__initcall(vsyscall_init);
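
The removed vgetcpu() above packs the CPU number into the low 12 bits of a per-CPU value and the NUMA node into the bits above it. An illustrative decode of that packing (the input value is made up, not taken from the patch):

/* Illustrative only: decodes the cpu/node packing used by the old vgetcpu(). */
#include <stdio.h>

static void decode_vgetcpu(unsigned int p, unsigned int *cpu, unsigned int *node)
{
        *cpu  = p & 0xfff;      /* low 12 bits: CPU number */
        *node = p >> 12;        /* remaining bits: NUMA node */
}

int main(void)
{
        unsigned int cpu, node;

        decode_vgetcpu((1u << 12) | 3, &cpu, &node);    /* hypothetical value */
        printf("cpu=%u node=%u\n", cpu, node);          /* prints cpu=3 node=1 */
        return 0;
}
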
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
deleted file mode 100644
index 3909e3ba5ce3..000000000000
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ /dev/null
@@ -1,60 +0,0 @@
-/* Exports for assembly files.
- All C exports should go in the respective C files. */
-
-#include <linux/module.h>
-#include <linux/smp.h>
-
-#include <net/checksum.h>
-
-#include <asm/processor.h>
-#include <asm/pgtable.h>
-#include <asm/uaccess.h>
-#include <asm/desc.h>
-#include <asm/ftrace.h>
-
-#ifdef CONFIG_FUNCTION_TRACER
-/* mcount is defined in assembly */
-EXPORT_SYMBOL(mcount);
-#endif
-
-EXPORT_SYMBOL(kernel_thread);
-
-EXPORT_SYMBOL(__get_user_1);
-EXPORT_SYMBOL(__get_user_2);
-EXPORT_SYMBOL(__get_user_4);
-EXPORT_SYMBOL(__get_user_8);
-EXPORT_SYMBOL(__put_user_1);
-EXPORT_SYMBOL(__put_user_2);
-EXPORT_SYMBOL(__put_user_4);
-EXPORT_SYMBOL(__put_user_8);
-
-EXPORT_SYMBOL(copy_user_generic);
-EXPORT_SYMBOL(__copy_user_nocache);
-EXPORT_SYMBOL(copy_from_user);
-EXPORT_SYMBOL(copy_to_user);
-EXPORT_SYMBOL(__copy_from_user_inatomic);
-
-EXPORT_SYMBOL(copy_page);
-EXPORT_SYMBOL(clear_page);
-
-EXPORT_SYMBOL(csum_partial);
-
-/*
- * Export string functions. We normally rely on gcc builtin for most of these,
- * but gcc sometimes decides not to inline them.
- */
-#undef memcpy
-#undef memset
-#undef memmove
-
-extern void *memset(void *, int, __kernel_size_t);
-extern void *memcpy(void *, const void *, __kernel_size_t);
-extern void *__memcpy(void *, const void *, __kernel_size_t);
-
-EXPORT_SYMBOL(memset);
-EXPORT_SYMBOL(memcpy);
-EXPORT_SYMBOL(__memcpy);
-
-EXPORT_SYMBOL(empty_zero_page);
-EXPORT_SYMBOL(init_level4_pgt);
-EXPORT_SYMBOL(load_gs_index);
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
new file mode 100644
index 000000000000..0a2bbd674a6d
--- /dev/null
+++ b/arch/x86/kernel/x86_init.c
@@ -0,0 +1,177 @@
+/*
+ * Copyright (C) 2009 Thomas Gleixner <tglx@linutronix.de>
+ *
+ * For licencing details see kernel-base/COPYING
+ */
+#include <linux/dmi.h>
+#include <linux/init.h>
+#include <linux/ioport.h>
+#include <linux/export.h>
+#include <linux/pci.h>
+#include <linux/acpi.h>
+
+#include <asm/acpi.h>
+#include <asm/bios_ebda.h>
+#include <asm/paravirt.h>
+#include <asm/pci_x86.h>
+#include <asm/mpspec.h>
+#include <asm/setup.h>
+#include <asm/apic.h>
+#include <asm/e820/api.h>
+#include <asm/time.h>
+#include <asm/irq.h>
+#include <asm/io_apic.h>
+#include <asm/hpet.h>
+#include <asm/memtype.h>
+#include <asm/tsc.h>
+#include <asm/iommu.h>
+#include <asm/mach_traps.h>
+#include <asm/irqdomain.h>
+#include <asm/realmode.h>
+
+void x86_init_noop(void) { }
+void __init x86_init_uint_noop(unsigned int unused) { }
+static int __init iommu_init_noop(void) { return 0; }
+static void iommu_shutdown_noop(void) { }
+bool __init bool_x86_init_noop(void) { return false; }
+void x86_op_int_noop(int cpu) { }
+int set_rtc_noop(const struct timespec64 *now) { return -EINVAL; }
+void get_rtc_noop(struct timespec64 *now) { }
+
+static __initconst const struct of_device_id of_cmos_match[] = {
+ { .compatible = "motorola,mc146818" },
+ {}
+};
+
+/*
+ * Allow devicetree configured systems to disable the RTC by setting the
+ * corresponding DT node's status property to disabled. Code is optimized
+ * out for CONFIG_OF=n builds.
+ */
+static __init void x86_wallclock_init(void)
+{
+ struct device_node *node = of_find_matching_node(NULL, of_cmos_match);
+
+ if (node && !of_device_is_available(node)) {
+ x86_platform.get_wallclock = get_rtc_noop;
+ x86_platform.set_wallclock = set_rtc_noop;
+ }
+}
+
+/*
+ * The platform setup functions are preset with the default functions
+ * for standard PC hardware.
+ */
+struct x86_init_ops x86_init __initdata = {
+
+ .resources = {
+ .probe_roms = probe_roms,
+ .reserve_resources = reserve_standard_io_resources,
+ .memory_setup = e820__memory_setup_default,
+ .dmi_setup = dmi_setup,
+ },
+
+ .mpparse = {
+ .setup_ioapic_ids = x86_init_noop,
+ .find_mptable = mpparse_find_mptable,
+ .early_parse_smp_cfg = mpparse_parse_early_smp_config,
+ .parse_smp_cfg = mpparse_parse_smp_config,
+ },
+
+ .irqs = {
+ .pre_vector_init = init_ISA_irqs,
+ .intr_init = native_init_IRQ,
+ .intr_mode_select = apic_intr_mode_select,
+ .intr_mode_init = apic_intr_mode_init,
+ .create_pci_msi_domain = native_create_pci_msi_domain,
+ },
+
+ .oem = {
+ .arch_setup = x86_init_noop,
+ .banner = default_banner,
+ },
+
+ .paging = {
+ .pagetable_init = native_pagetable_init,
+ },
+
+ .timers = {
+ .setup_percpu_clockev = setup_boot_APIC_clock,
+ .timer_init = hpet_time_init,
+ .wallclock_init = x86_wallclock_init,
+ },
+
+ .iommu = {
+ .iommu_init = iommu_init_noop,
+ },
+
+ .pci = {
+ .init = x86_default_pci_init,
+ .init_irq = x86_default_pci_init_irq,
+ .fixup_irqs = x86_default_pci_fixup_irqs,
+ },
+
+ .hyper = {
+ .init_platform = x86_init_noop,
+ .guest_late_init = x86_init_noop,
+ .x2apic_available = bool_x86_init_noop,
+ .msi_ext_dest_id = bool_x86_init_noop,
+ .init_mem_mapping = x86_init_noop,
+ .init_after_bootmem = x86_init_noop,
+ },
+
+ .acpi = {
+ .set_root_pointer = x86_default_set_root_pointer,
+ .get_root_pointer = x86_default_get_root_pointer,
+ .reduced_hw_early_init = acpi_generic_reduced_hw_init,
+ },
+};
+
+struct x86_cpuinit_ops x86_cpuinit = {
+ .early_percpu_clock_init = x86_init_noop,
+ .setup_percpu_clockev = setup_secondary_APIC_clock,
+ .parallel_bringup = true,
+};
+
+static void default_nmi_init(void) { };
+
+static int enc_status_change_prepare_noop(unsigned long vaddr, int npages, bool enc) { return 0; }
+static int enc_status_change_finish_noop(unsigned long vaddr, int npages, bool enc) { return 0; }
+static bool enc_tlb_flush_required_noop(bool enc) { return false; }
+static bool enc_cache_flush_required_noop(void) { return false; }
+static void enc_kexec_begin_noop(void) {}
+static void enc_kexec_finish_noop(void) {}
+static bool is_private_mmio_noop(u64 addr) { return false; }
+
+struct x86_platform_ops x86_platform __ro_after_init = {
+ .calibrate_cpu = native_calibrate_cpu_early,
+ .calibrate_tsc = native_calibrate_tsc,
+ .get_wallclock = mach_get_cmos_time,
+ .set_wallclock = mach_set_cmos_time,
+ .iommu_shutdown = iommu_shutdown_noop,
+ .is_untracked_pat_range = is_ISA_range,
+ .nmi_init = default_nmi_init,
+ .get_nmi_reason = default_get_nmi_reason,
+ .save_sched_clock_state = tsc_save_sched_clock_state,
+ .restore_sched_clock_state = tsc_restore_sched_clock_state,
+ .realmode_reserve = reserve_real_mode,
+ .realmode_init = init_real_mode,
+ .hyper.pin_vcpu = x86_op_int_noop,
+ .hyper.is_private_mmio = is_private_mmio_noop,
+
+ .guest = {
+ .enc_status_change_prepare = enc_status_change_prepare_noop,
+ .enc_status_change_finish = enc_status_change_finish_noop,
+ .enc_tlb_flush_required = enc_tlb_flush_required_noop,
+ .enc_cache_flush_required = enc_cache_flush_required_noop,
+ .enc_kexec_begin = enc_kexec_begin_noop,
+ .enc_kexec_finish = enc_kexec_finish_noop,
+ },
+};
+
+EXPORT_SYMBOL_GPL(x86_platform);
+
+struct x86_apic_ops x86_apic_ops __ro_after_init = {
+ .io_apic_read = native_io_apic_read,
+ .restore = native_restore_boot_irq_mode,
+};
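
The new x86_init.c installs default (largely no-op) callbacks; platform or hypervisor guest code is expected to override individual hooks during early boot. A hedged sketch of that pattern, where the function name and the chosen overrides are hypothetical and not taken from this patch:

/* Illustrative only: early platform code overriding default x86 init hooks. */
#include <asm/x86_init.h>
#include <asm/setup.h>

static unsigned long hypothetical_calibrate_tsc(void)
{
        return 2000000;         /* pretend the TSC runs at 2 GHz (value in kHz) */
}

static void __init hypothetical_oem_setup(void)
{
        /* platform specific early setup would go here */
}

void __init init_hypothetical_platform(void)
{
        x86_init.oem.arch_setup    = hypothetical_oem_setup;
        x86_platform.calibrate_tsc = hypothetical_calibrate_tsc;
}
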
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
deleted file mode 100644
index c5ee17e8c6d9..000000000000
--- a/arch/x86/kernel/xsave.c
+++ /dev/null
@@ -1,347 +0,0 @@
-/*
- * xsave/xrstor support.
- *
- * Author: Suresh Siddha <suresh.b.siddha@intel.com>
- */
-#include <linux/bootmem.h>
-#include <linux/compat.h>
-#include <asm/i387.h>
-#ifdef CONFIG_IA32_EMULATION
-#include <asm/sigcontext32.h>
-#endif
-#include <asm/xcr.h>
-
-/*
- * Supported feature mask by the CPU and the kernel.
- */
-u64 pcntxt_mask;
-
-struct _fpx_sw_bytes fx_sw_reserved;
-#ifdef CONFIG_IA32_EMULATION
-struct _fpx_sw_bytes fx_sw_reserved_ia32;
-#endif
-
-/*
- * Check for the presence of extended state information in the
- * user fpstate pointer in the sigcontext.
- */
-int check_for_xstate(struct i387_fxsave_struct __user *buf,
- void __user *fpstate,
- struct _fpx_sw_bytes *fx_sw_user)
-{
- int min_xstate_size = sizeof(struct i387_fxsave_struct) +
- sizeof(struct xsave_hdr_struct);
- unsigned int magic2;
- int err;
-
- err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0],
- sizeof(struct _fpx_sw_bytes));
-
- if (err)
- return err;
-
- /*
- * First Magic check failed.
- */
- if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1)
- return -1;
-
- /*
- * Check for error scenarios.
- */
- if (fx_sw_user->xstate_size < min_xstate_size ||
- fx_sw_user->xstate_size > xstate_size ||
- fx_sw_user->xstate_size > fx_sw_user->extended_size)
- return -1;
-
- err = __get_user(magic2, (__u32 *) (((void *)fpstate) +
- fx_sw_user->extended_size -
- FP_XSTATE_MAGIC2_SIZE));
- /*
- * Check for the presence of second magic word at the end of memory
- * layout. This detects the case where the user just copied the legacy
- * fpstate layout with out copying the extended state information
- * in the memory layout.
- */
- if (err || magic2 != FP_XSTATE_MAGIC2)
- return -1;
-
- return 0;
-}
-
-#ifdef CONFIG_X86_64
-/*
- * Signal frame handlers.
- */
-
-int save_i387_xstate(void __user *buf)
-{
- struct task_struct *tsk = current;
- int err = 0;
-
- if (!access_ok(VERIFY_WRITE, buf, sig_xstate_size))
- return -EACCES;
-
- BUG_ON(sig_xstate_size < xstate_size);
-
- if ((unsigned long)buf % 64)
- printk("save_i387_xstate: bad fpstate %p\n", buf);
-
- if (!used_math())
- return 0;
-
- if (task_thread_info(tsk)->status & TS_USEDFPU) {
- /*
- * Start with clearing the user buffer. This will present a
- * clean context for the bytes not touched by the fxsave/xsave.
- */
- err = __clear_user(buf, sig_xstate_size);
- if (err)
- return err;
-
- if (task_thread_info(tsk)->status & TS_XSAVE)
- err = xsave_user(buf);
- else
- err = fxsave_user(buf);
-
- if (err)
- return err;
- task_thread_info(tsk)->status &= ~TS_USEDFPU;
- stts();
- } else {
- if (__copy_to_user(buf, &tsk->thread.xstate->fxsave,
- xstate_size))
- return -1;
- }
-
- clear_used_math(); /* trigger finit */
-
- if (task_thread_info(tsk)->status & TS_XSAVE) {
- struct _fpstate __user *fx = buf;
- struct _xstate __user *x = buf;
- u64 xstate_bv;
-
- err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved,
- sizeof(struct _fpx_sw_bytes));
-
- err |= __put_user(FP_XSTATE_MAGIC2,
- (__u32 __user *) (buf + sig_xstate_size
- - FP_XSTATE_MAGIC2_SIZE));
-
- /*
- * Read the xstate_bv which we copied (directly from the cpu or
- * from the state in task struct) to the user buffers and
- * set the FP/SSE bits.
- */
- err |= __get_user(xstate_bv, &x->xstate_hdr.xstate_bv);
-
- /*
- * For legacy compatible, we always set FP/SSE bits in the bit
- * vector while saving the state to the user context. This will
- * enable us capturing any changes(during sigreturn) to
- * the FP/SSE bits by the legacy applications which don't touch
- * xstate_bv in the xsave header.
- *
- * xsave aware apps can change the xstate_bv in the xsave
- * header as well as change any contents in the memory layout.
- * xrestore as part of sigreturn will capture all the changes.
- */
- xstate_bv |= XSTATE_FPSSE;
-
- err |= __put_user(xstate_bv, &x->xstate_hdr.xstate_bv);
-
- if (err)
- return err;
- }
-
- return 1;
-}
-
-/*
- * Restore the extended state if present. Otherwise, restore the FP/SSE
- * state.
- */
-static int restore_user_xstate(void __user *buf)
-{
- struct _fpx_sw_bytes fx_sw_user;
- u64 mask;
- int err;
-
- if (((unsigned long)buf % 64) ||
- check_for_xstate(buf, buf, &fx_sw_user))
- goto fx_only;
-
- mask = fx_sw_user.xstate_bv;
-
- /*
- * restore the state passed by the user.
- */
- err = xrestore_user(buf, mask);
- if (err)
- return err;
-
- /*
- * init the state skipped by the user.
- */
- mask = pcntxt_mask & ~mask;
-
- xrstor_state(init_xstate_buf, mask);
-
- return 0;
-
-fx_only:
- /*
- * couldn't find the extended state information in the
- * memory layout. Restore just the FP/SSE and init all
- * the other extended state.
- */
- xrstor_state(init_xstate_buf, pcntxt_mask & ~XSTATE_FPSSE);
- return fxrstor_checking((__force struct i387_fxsave_struct *)buf);
-}
-
-/*
- * This restores directly out of user space. Exceptions are handled.
- */
-int restore_i387_xstate(void __user *buf)
-{
- struct task_struct *tsk = current;
- int err = 0;
-
- if (!buf) {
- if (used_math())
- goto clear;
- return 0;
- } else
- if (!access_ok(VERIFY_READ, buf, sig_xstate_size))
- return -EACCES;
-
- if (!used_math()) {
- err = init_fpu(tsk);
- if (err)
- return err;
- }
-
- if (!(task_thread_info(current)->status & TS_USEDFPU)) {
- clts();
- task_thread_info(current)->status |= TS_USEDFPU;
- }
- if (task_thread_info(tsk)->status & TS_XSAVE)
- err = restore_user_xstate(buf);
- else
- err = fxrstor_checking((__force struct i387_fxsave_struct *)
- buf);
- if (unlikely(err)) {
- /*
- * Encountered an error while doing the restore from the
- * user buffer, clear the fpu state.
- */
-clear:
- clear_fpu(tsk);
- clear_used_math();
- }
- return err;
-}
-#endif
-
-/*
- * Prepare the SW reserved portion of the fxsave memory layout, indicating
- * the presence of the extended state information in the memory layout
- * pointed by the fpstate pointer in the sigcontext.
- * This will be saved when ever the FP and extended state context is
- * saved on the user stack during the signal handler delivery to the user.
- */
-static void prepare_fx_sw_frame(void)
-{
- int size_extended = (xstate_size - sizeof(struct i387_fxsave_struct)) +
- FP_XSTATE_MAGIC2_SIZE;
-
- sig_xstate_size = sizeof(struct _fpstate) + size_extended;
-
-#ifdef CONFIG_IA32_EMULATION
- sig_xstate_ia32_size = sizeof(struct _fpstate_ia32) + size_extended;
-#endif
-
- memset(&fx_sw_reserved, 0, sizeof(fx_sw_reserved));
-
- fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1;
- fx_sw_reserved.extended_size = sig_xstate_size;
- fx_sw_reserved.xstate_bv = pcntxt_mask;
- fx_sw_reserved.xstate_size = xstate_size;
-#ifdef CONFIG_IA32_EMULATION
- memcpy(&fx_sw_reserved_ia32, &fx_sw_reserved,
- sizeof(struct _fpx_sw_bytes));
- fx_sw_reserved_ia32.extended_size = sig_xstate_ia32_size;
-#endif
-}
-
-/*
- * Represents init state for the supported extended state.
- */
-struct xsave_struct *init_xstate_buf;
-
-#ifdef CONFIG_X86_64
-unsigned int sig_xstate_size = sizeof(struct _fpstate);
-#endif
-
-/*
- * Enable the extended processor state save/restore feature
- */
-void __cpuinit xsave_init(void)
-{
- if (!cpu_has_xsave)
- return;
-
- set_in_cr4(X86_CR4_OSXSAVE);
-
- /*
- * Enable all the features that the HW is capable of
- * and the Linux kernel is aware of.
- */
- xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
-}
-
-/*
- * setup the xstate image representing the init state
- */
-static void __init setup_xstate_init(void)
-{
- init_xstate_buf = alloc_bootmem(xstate_size);
- init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT;
-}
-
-/*
- * Enable and initialize the xsave feature.
- */
-void __ref xsave_cntxt_init(void)
-{
- unsigned int eax, ebx, ecx, edx;
-
- cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
- pcntxt_mask = eax + ((u64)edx << 32);
-
- if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) {
- printk(KERN_ERR "FP/SSE not shown under xsave features 0x%llx\n",
- pcntxt_mask);
- BUG();
- }
-
- /*
- * Support only the state known to OS.
- */
- pcntxt_mask = pcntxt_mask & XCNTXT_MASK;
- xsave_init();
-
- /*
- * Recompute the context size for enabled features
- */
- cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
- xstate_size = ebx;
-
- prepare_fx_sw_frame();
-
- setup_xstate_init();
-
- printk(KERN_INFO "xsave/xrstor: enabled xstate_bv 0x%llx, "
- "cntxt size 0x%x\n",
- pcntxt_mask, xstate_size);
-}
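
Both the deleted xsave_cntxt_init() above and the new kvm_init_xstate_sizes()/xstate_required_size() added later in this diff (arch/x86/kvm/cpuid.c) enumerate XSAVE state through CPUID leaf 0xD. An illustrative user-space walk of that leaf (not part of the patch):

/* Illustrative only: enumerate XSAVE features and sizes via CPUID leaf 0xD. */
#include <stdio.h>
#include <stdint.h>
#include <cpuid.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;
        uint64_t mask;
        int i;

        __cpuid_count(0xd, 0, eax, ebx, ecx, edx);
        mask = eax | ((uint64_t)edx << 32);
        printf("supported xstate mask: 0x%llx\n", (unsigned long long)mask);

        for (i = 2; i < 64; i++) {              /* sub-leaves 0 and 1 are special */
                if (!(mask & (1ull << i)))
                        continue;
                __cpuid_count(0xd, i, eax, ebx, ecx, edx);
                printf("feature %2d: size %u, offset %u\n", i, eax, ebx);
        }
        return 0;
}
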
diff --git a/arch/x86/kvm/.gitignore b/arch/x86/kvm/.gitignore
new file mode 100644
index 000000000000..615d6ff35c00
--- /dev/null
+++ b/arch/x86/kvm/.gitignore
@@ -0,0 +1,2 @@
+/kvm-asm-offsets.s
+/kvm-asm-offsets.h
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 8600a09e0c6c..278f08194ec8 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -1,18 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0
#
# KVM configuration
#
-config HAVE_KVM
- bool
-config HAVE_KVM_IRQCHIP
- bool
- default y
+source "virt/kvm/Kconfig"
menuconfig VIRTUALIZATION
bool "Virtualization"
- depends on HAVE_KVM || X86
default y
- ---help---
+ help
Say Y here to get to see options for using your Linux host to run other
operating systems inside virtual machines (guests).
This option alone does not add any kernel code.
@@ -21,15 +17,42 @@ menuconfig VIRTUALIZATION
if VIRTUALIZATION
+config KVM_X86
+ def_tristate KVM if (KVM_INTEL != n || KVM_AMD != n)
+ select KVM_COMMON
+ select KVM_GENERIC_MMU_NOTIFIER
+ select KVM_ELIDE_TLB_FLUSH_IF_YOUNG
+ select KVM_MMU_LOCKLESS_AGING
+ select HAVE_KVM_IRQCHIP
+ select HAVE_KVM_PFNCACHE
+ select HAVE_KVM_DIRTY_RING_TSO
+ select HAVE_KVM_DIRTY_RING_ACQ_REL
+ select HAVE_KVM_IRQ_BYPASS
+ select HAVE_KVM_IRQ_ROUTING
+ select HAVE_KVM_READONLY_MEM
+ select VHOST_TASK
+ select KVM_ASYNC_PF
+ select USER_RETURN_NOTIFIER
+ select KVM_MMIO
+ select SCHED_INFO
+ select PERF_EVENTS
+ select GUEST_PERF_EVENTS
+ select HAVE_KVM_MSI
+ select HAVE_KVM_CPU_RELAX_INTERCEPT
+ select HAVE_KVM_NO_POLL
+ select VIRT_XFER_TO_GUEST_WORK
+ select KVM_GENERIC_DIRTYLOG_READ_PROTECT
+ select KVM_VFIO
+ select HAVE_KVM_PM_NOTIFIER if PM
+ select KVM_GENERIC_HARDWARE_ENABLING
+ select KVM_GENERIC_PRE_FAULT_MEMORY
+ select KVM_WERROR if WERROR
+ select KVM_GUEST_MEMFD if X86_64
+
config KVM
tristate "Kernel-based Virtual Machine (KVM) support"
- depends on HAVE_KVM
- # for device assignment:
- depends on PCI
- select PREEMPT_NOTIFIERS
- select MMU_NOTIFIER
- select ANON_INODES
- ---help---
+ depends on X86_LOCAL_APIC
+ help
Support hosting fully virtualized guest machines using hardware
virtualization extensions. You will need a fairly recent
processor equipped with virtualization extensions. You will also
@@ -43,41 +66,178 @@ config KVM
If unsure, say N.
+config KVM_WERROR
+ bool "Compile KVM with -Werror"
+ # Disallow KVM's -Werror if KASAN is enabled, e.g. to guard against
+ # randomized configs from selecting KVM_WERROR=y, which doesn't play
+ # nice with KASAN. KASAN builds generate warnings for the default
+ # FRAME_WARN, i.e. KVM_WERROR=y with KASAN=y requires special tuning.
+ # Building KVM with -Werror and KASAN is still doable via enabling
+ # the kernel-wide WERROR=y.
+ depends on KVM_X86 && ((EXPERT && !KASAN) || WERROR)
+ help
+ Add -Werror to the build flags for KVM.
+
+ If in doubt, say "N".
+
+config KVM_SW_PROTECTED_VM
+ bool "Enable support for KVM software-protected VMs"
+ depends on EXPERT
+ depends on KVM_X86 && X86_64
+ select KVM_GENERIC_MEMORY_ATTRIBUTES
+ help
+ Enable support for KVM software-protected VMs. Currently, software-
+ protected VMs are purely a development and testing vehicle for
+ KVM_CREATE_GUEST_MEMFD. Attempting to run a "real" VM workload as a
+ software-protected VM will fail miserably.
+
+ If unsure, say "N".
+
config KVM_INTEL
- tristate "KVM for Intel processors support"
- depends on KVM
- ---help---
- Provides support for KVM on Intel processors equipped with the VT
- extensions.
+ tristate "KVM for Intel (and compatible) processors support"
+ depends on KVM && IA32_FEAT_CTL
+ select X86_FRED if X86_64
+ help
+ Provides support for KVM on processors equipped with Intel's VT
+ extensions, a.k.a. Virtual Machine Extensions (VMX).
To compile this as a module, choose M here: the module
will be called kvm-intel.
+config KVM_INTEL_PROVE_VE
+ bool "Check that guests do not receive #VE exceptions"
+ depends on KVM_INTEL && EXPERT
+ help
+ Checks that KVM's page table management code will not incorrectly
+ let guests receive a virtualization exception. Virtualization
+ exceptions will be trapped by the hypervisor rather than injected
+ in the guest.
+
+ Note: some CPUs appear to generate spurious EPT Violations #VEs
+ that trigger KVM's WARN, in particular with eptad=0 and/or nested
+ virtualization.
+
+ If unsure, say N.
+
+config X86_SGX_KVM
+ bool "Software Guard eXtensions (SGX) Virtualization"
+ depends on X86_SGX && KVM_INTEL
+ help
+
+ Enables KVM guests to create SGX enclaves.
+
+ This includes support to expose "raw" unreclaimable enclave memory to
+ guests via a device node, e.g. /dev/sgx_vepc.
+
+ If unsure, say N.
+
+config KVM_INTEL_TDX
+ bool "Intel Trust Domain Extensions (TDX) support"
+ default y
+ depends on INTEL_TDX_HOST
+ select KVM_GENERIC_MEMORY_ATTRIBUTES
+ select HAVE_KVM_ARCH_GMEM_POPULATE
+ help
+ Provides support for launching Intel Trust Domain Extensions (TDX)
+ confidential VMs on Intel processors.
+
+ If unsure, say N.
+
config KVM_AMD
tristate "KVM for AMD processors support"
- depends on KVM
- ---help---
+ depends on KVM && (CPU_SUP_AMD || CPU_SUP_HYGON)
+ help
Provides support for KVM on AMD processors equipped with the AMD-V
(SVM) extensions.
To compile this as a module, choose M here: the module
will be called kvm-amd.
-config KVM_TRACE
- bool "KVM trace support"
- depends on KVM && SYSFS
- select MARKERS
- select RELAY
- select DEBUG_FS
- default n
- ---help---
- This option allows reading a trace of kvm-related events through
- relayfs. Note the ABI is not considered stable and will be
- modified in future updates.
-
-# OK, it's a little counter-intuitive to do this, but it puts it neatly under
-# the virtualization menu.
-source drivers/lguest/Kconfig
-source drivers/virtio/Kconfig
+config KVM_AMD_SEV
+ bool "AMD Secure Encrypted Virtualization (SEV) support"
+ default y
+ depends on KVM_AMD && X86_64
+ depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m)
+ select ARCH_HAS_CC_PLATFORM
+ select KVM_GENERIC_MEMORY_ATTRIBUTES
+ select HAVE_KVM_ARCH_GMEM_PREPARE
+ select HAVE_KVM_ARCH_GMEM_INVALIDATE
+ select HAVE_KVM_ARCH_GMEM_POPULATE
+ help
+ Provides support for launching encrypted VMs which use Secure
+ Encrypted Virtualization (SEV), Secure Encrypted Virtualization with
+ Encrypted State (SEV-ES), and Secure Encrypted Virtualization with
+ Secure Nested Paging (SEV-SNP) technologies on AMD processors.
+
+config KVM_IOAPIC
+ bool "I/O APIC, PIC, and PIT emulation"
+ default y
+ depends on KVM_X86
+ help
+ Provides support for KVM to emulate an I/O APIC, PIC, and PIT, i.e.
+ for full in-kernel APIC emulation.
+
+ If unsure, say Y.
+
+config KVM_SMM
+ bool "System Management Mode emulation"
+ default y
+ depends on KVM_X86
+ help
+ Provides support for KVM to emulate System Management Mode (SMM)
+ in virtual machines. This can be used by the virtual machine
+ firmware to implement UEFI secure boot.
+
+ If unsure, say Y.
+
+config KVM_HYPERV
+ bool "Support for Microsoft Hyper-V emulation"
+ depends on KVM_X86
+ default y
+ help
+ Provides KVM support for emulating Microsoft Hyper-V. This allows KVM
+ to expose a subset of the paravirtualized interfaces defined in the
+ Hyper-V Hypervisor Top-Level Functional Specification (TLFS):
+ https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/reference/tlfs
+ These interfaces are required for the correct and performant functioning
+ of Windows and Hyper-V guests on KVM.
+
+ If unsure, say "Y".
+
+config KVM_XEN
+ bool "Support for Xen hypercall interface"
+ depends on KVM_X86
+ help
+ Provides KVM support for hosting Xen HVM guests and
+ passing Xen hypercalls to userspace.
+
+ If in doubt, say "N".
+
+config KVM_PROVE_MMU
+ bool "Prove KVM MMU correctness"
+ depends on DEBUG_KERNEL
+ depends on KVM_X86
+ depends on EXPERT
+ help
+ Enables runtime assertions in KVM's MMU that are too costly to enable
+ in anything remotely resembling a production environment, e.g. this
+ gates code that verifies a to-be-freed page table doesn't have any
+ present SPTEs.
+
+ If in doubt, say "N".
+
+config KVM_EXTERNAL_WRITE_TRACKING
+ bool
+
+config KVM_MAX_NR_VCPUS
+ int "Maximum number of vCPUs per KVM guest"
+ depends on KVM_X86
+ range 1024 4096
+ default 4096 if MAXSMP
+ default 1024
+ help
+ Set the maximum number of vCPUs per KVM guest. Larger values will increase
+ the memory footprint of each KVM guest, regardless of how many vCPUs are
+ created for a given VM.
endif # VIRTUALIZATION
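
KVM_MAX_NR_VCPUS above is an integer Kconfig symbol, so C code can consume CONFIG_KVM_MAX_NR_VCPUS directly. A minimal, hedged illustration of that pattern (the helper and its bounds check are made up):

/* Illustrative only: consuming the integer Kconfig symbol from C code. */
#include <linux/types.h>

#define EXAMPLE_MAX_VCPUS       CONFIG_KVM_MAX_NR_VCPUS

static inline bool example_vcpu_id_ok(unsigned int id)
{
        return id < EXAMPLE_MAX_VCPUS;  /* 1024 by default, 4096 with MAXSMP */
}
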
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index b43c4efafe80..c4b8950c7abe 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -1,22 +1,49 @@
-#
-# Makefile for Kernel-based Virtual Machine module
-#
-
-common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
- coalesced_mmio.o irq_comm.o)
-ifeq ($(CONFIG_KVM_TRACE),y)
-common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o)
-endif
-ifeq ($(CONFIG_IOMMU_API),y)
-common-objs += $(addprefix ../../../virt/kvm/, iommu.o)
+# SPDX-License-Identifier: GPL-2.0
+
+ccflags-y += -I $(srctree)/arch/x86/kvm
+ccflags-$(CONFIG_KVM_WERROR) += -Werror
+
+include $(srctree)/virt/kvm/Makefile.kvm
+
+kvm-y += x86.o emulate.o irq.o lapic.o cpuid.o pmu.o mtrr.o \
+ debugfs.o mmu/mmu.o mmu/page_track.o mmu/spte.o
+
+kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o
+kvm-$(CONFIG_KVM_IOAPIC) += i8259.o i8254.o ioapic.o
+kvm-$(CONFIG_KVM_HYPERV) += hyperv.o
+kvm-$(CONFIG_KVM_XEN) += xen.o
+kvm-$(CONFIG_KVM_SMM) += smm.o
+
+kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
+ vmx/nested.o vmx/posted_intr.o vmx/main.o
+
+kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o
+kvm-intel-$(CONFIG_KVM_HYPERV) += vmx/hyperv.o vmx/hyperv_evmcs.o
+kvm-intel-$(CONFIG_KVM_INTEL_TDX) += vmx/tdx.o
+
+kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o
+
+kvm-amd-$(CONFIG_KVM_AMD_SEV) += svm/sev.o
+kvm-amd-$(CONFIG_KVM_HYPERV) += svm/hyperv.o
+
+ifdef CONFIG_HYPERV
+kvm-y += kvm_onhyperv.o
+kvm-intel-y += vmx/vmx_onhyperv.o vmx/hyperv_evmcs.o
+kvm-amd-y += svm/svm_onhyperv.o
endif
-EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
+obj-$(CONFIG_KVM_X86) += kvm.o
+obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
+obj-$(CONFIG_KVM_AMD) += kvm-amd.o
+
+AFLAGS_svm/vmenter.o := -iquote $(obj)
+$(obj)/svm/vmenter.o: $(obj)/kvm-asm-offsets.h
+
+AFLAGS_vmx/vmenter.o := -iquote $(obj)
+$(obj)/vmx/vmenter.o: $(obj)/kvm-asm-offsets.h
+
+$(obj)/kvm-asm-offsets.h: $(obj)/kvm-asm-offsets.s FORCE
+ $(call filechk,offsets,__KVM_ASM_OFFSETS_H__)
-kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \
- i8254.o timer.o
-obj-$(CONFIG_KVM) += kvm.o
-kvm-intel-objs = vmx.o
-obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
-kvm-amd-objs = svm.o
-obj-$(CONFIG_KVM_AMD) += kvm-amd.o
+targets += kvm-asm-offsets.s
+clean-files += kvm-asm-offsets.h
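
The kvm-asm-offsets rules above follow the kernel's standard asm-offsets pattern: a C source file emits OFFSET()/DEFINE() markers, kbuild compiles it to kvm-asm-offsets.s, and filechk,offsets turns the markers into a header that svm/vmenter.S and vmx/vmenter.S include via -iquote. A hedged sketch of such an offsets source file (the structure and field names here are hypothetical):

/* Illustrative only: the asm-offsets source pattern consumed by the rules above. */
#define COMPILE_OFFSETS
#include <linux/kbuild.h>

struct example_vcpu {
        unsigned long regs[16];
        unsigned long spec_ctrl;
};

static void __used common(void)
{
        OFFSET(EXAMPLE_vcpu_regs, example_vcpu, regs);
        OFFSET(EXAMPLE_vcpu_spec_ctrl, example_vcpu, spec_ctrl);
        DEFINE(EXAMPLE_vcpu_size, sizeof(struct example_vcpu));
}
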
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
new file mode 100644
index 000000000000..d563a948318b
--- /dev/null
+++ b/arch/x86/kvm/cpuid.c
@@ -0,0 +1,2107 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ * cpuid support routines
+ *
+ * derived from arch/x86/kvm/x86.c
+ *
+ * Copyright 2011 Red Hat, Inc. and/or its affiliates.
+ * Copyright IBM Corporation, 2008
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_host.h>
+#include "linux/lockdep.h"
+#include <linux/export.h>
+#include <linux/vmalloc.h>
+#include <linux/uaccess.h>
+#include <linux/sched/stat.h>
+
+#include <asm/processor.h>
+#include <asm/user.h>
+#include <asm/fpu/xstate.h>
+#include <asm/sgx.h>
+#include <asm/cpuid/api.h>
+#include "cpuid.h"
+#include "lapic.h"
+#include "mmu.h"
+#include "trace.h"
+#include "pmu.h"
+#include "xen.h"
+
+/*
+ * Unlike "struct cpuinfo_x86.x86_capability", kvm_cpu_caps doesn't need to be
+ * aligned to sizeof(unsigned long) because it's not accessed via bitops.
+ */
+u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_cpu_caps);
+
+struct cpuid_xstate_sizes {
+ u32 eax;
+ u32 ebx;
+ u32 ecx;
+};
+
+static struct cpuid_xstate_sizes xstate_sizes[XFEATURE_MAX] __ro_after_init;
+
+void __init kvm_init_xstate_sizes(void)
+{
+ u32 ign;
+ int i;
+
+ for (i = XFEATURE_YMM; i < ARRAY_SIZE(xstate_sizes); i++) {
+ struct cpuid_xstate_sizes *xs = &xstate_sizes[i];
+
+ cpuid_count(0xD, i, &xs->eax, &xs->ebx, &xs->ecx, &ign);
+ }
+}
+
+u32 xstate_required_size(u64 xstate_bv, bool compacted)
+{
+ u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
+ int i;
+
+ xstate_bv &= XFEATURE_MASK_EXTEND;
+ for (i = XFEATURE_YMM; i < ARRAY_SIZE(xstate_sizes) && xstate_bv; i++) {
+ struct cpuid_xstate_sizes *xs = &xstate_sizes[i];
+ u32 offset;
+
+ if (!(xstate_bv & BIT_ULL(i)))
+ continue;
+
+ /* ECX[1]: 64B alignment in compacted form */
+ if (compacted)
+ offset = (xs->ecx & 0x2) ? ALIGN(ret, 64) : ret;
+ else
+ offset = xs->ebx;
+ ret = max(ret, offset + xs->eax);
+ xstate_bv &= ~BIT_ULL(i);
+ }
+
+ return ret;
+}
+
+struct kvm_cpuid_entry2 *kvm_find_cpuid_entry2(
+ struct kvm_cpuid_entry2 *entries, int nent, u32 function, u64 index)
+{
+ struct kvm_cpuid_entry2 *e;
+ int i;
+
+ /*
+ * KVM has a semi-arbitrary rule that querying the guest's CPUID model
+ * with IRQs disabled is disallowed. The CPUID model can legitimately
+ * have over one hundred entries, i.e. the lookup is slow, and IRQs are
+ * typically disabled in KVM only when KVM is in a performance critical
+ * path, e.g. the core VM-Enter/VM-Exit run loop. Nothing will break
+ * if this rule is violated, this assertion is purely to flag potential
+ * performance issues. If this fires, consider moving the lookup out
+ * of the hotpath, e.g. by caching information during CPUID updates.
+ */
+ lockdep_assert_irqs_enabled();
+
+ for (i = 0; i < nent; i++) {
+ e = &entries[i];
+
+ if (e->function != function)
+ continue;
+
+ /*
+ * If the index isn't significant, use the first entry with a
+ * matching function. It's userspace's responsibility to not
+ * provide "duplicate" entries in all cases.
+ */
+ if (!(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) || e->index == index)
+ return e;
+
+
+ /*
+ * Similarly, use the first matching entry if KVM is doing a
+ * lookup (as opposed to emulating CPUID) for a function that's
+ * architecturally defined as not having a significant index.
+ */
+ if (index == KVM_CPUID_INDEX_NOT_SIGNIFICANT) {
+ /*
+ * Direct lookups from KVM should not diverge from what
+ * KVM defines internally (the architectural behavior).
+ */
+ WARN_ON_ONCE(cpuid_function_is_indexed(function));
+ return e;
+ }
+ }
+
+ return NULL;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_find_cpuid_entry2);
+
+static int kvm_check_cpuid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+ u64 xfeatures;
+
+ /*
+ * The existing code assumes virtual address is 48-bit or 57-bit in the
+ * canonical address checks; exit if it is ever changed.
+ */
+ best = kvm_find_cpuid_entry(vcpu, 0x80000008);
+ if (best) {
+ int vaddr_bits = (best->eax & 0xff00) >> 8;
+
+ if (vaddr_bits != 48 && vaddr_bits != 57 && vaddr_bits != 0)
+ return -EINVAL;
+ }
+
+ /*
+ * Exposing dynamic xfeatures to the guest requires additional
+ * enabling in the FPU, e.g. to expand the guest XSAVE state size.
+ */
+ best = kvm_find_cpuid_entry_index(vcpu, 0xd, 0);
+ if (!best)
+ return 0;
+
+ xfeatures = best->eax | ((u64)best->edx << 32);
+ xfeatures &= XFEATURE_MASK_USER_DYNAMIC;
+ if (!xfeatures)
+ return 0;
+
+ return fpu_enable_guest_xfd_features(&vcpu->arch.guest_fpu, xfeatures);
+}
+
+static u32 kvm_apply_cpuid_pv_features_quirk(struct kvm_vcpu *vcpu);
+static void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu);
+
+/* Check whether the supplied CPUID data is equal to what is already set for the vCPU. */
+static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
+ int nent)
+{
+ struct kvm_cpuid_entry2 *orig;
+ int i;
+
+ /*
+ * Apply runtime CPUID updates to the incoming CPUID entries to avoid
+ * false positives due to mismatches on KVM-owned feature flags.
+ *
+ * Note! @e2 and @nent track the _old_ CPUID entries!
+ */
+ kvm_update_cpuid_runtime(vcpu);
+ kvm_apply_cpuid_pv_features_quirk(vcpu);
+
+ if (nent != vcpu->arch.cpuid_nent)
+ return -EINVAL;
+
+ for (i = 0; i < nent; i++) {
+ orig = &vcpu->arch.cpuid_entries[i];
+ if (e2[i].function != orig->function ||
+ e2[i].index != orig->index ||
+ e2[i].flags != orig->flags ||
+ e2[i].eax != orig->eax || e2[i].ebx != orig->ebx ||
+ e2[i].ecx != orig->ecx || e2[i].edx != orig->edx)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static struct kvm_hypervisor_cpuid kvm_get_hypervisor_cpuid(struct kvm_vcpu *vcpu,
+ const char *sig)
+{
+ struct kvm_hypervisor_cpuid cpuid = {};
+ struct kvm_cpuid_entry2 *entry;
+ u32 base;
+
+ for_each_possible_cpuid_base_hypervisor(base) {
+ entry = kvm_find_cpuid_entry(vcpu, base);
+
+ if (entry) {
+ u32 signature[3];
+
+ signature[0] = entry->ebx;
+ signature[1] = entry->ecx;
+ signature[2] = entry->edx;
+
+ if (!memcmp(signature, sig, sizeof(signature))) {
+ cpuid.base = base;
+ cpuid.limit = entry->eax;
+ break;
+ }
+ }
+ }
+
+ return cpuid;
+}
+
+static u32 kvm_apply_cpuid_pv_features_quirk(struct kvm_vcpu *vcpu)
+{
+ struct kvm_hypervisor_cpuid kvm_cpuid;
+ struct kvm_cpuid_entry2 *best;
+
+ kvm_cpuid = kvm_get_hypervisor_cpuid(vcpu, KVM_SIGNATURE);
+ if (!kvm_cpuid.base)
+ return 0;
+
+ best = kvm_find_cpuid_entry(vcpu, kvm_cpuid.base | KVM_CPUID_FEATURES);
+ if (!best)
+ return 0;
+
+ if (kvm_hlt_in_guest(vcpu->kvm))
+ best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT);
+
+ return best->eax;
+}
+
+/*
+ * Calculate guest's supported XCR0 taking into account guest CPUID data and
+ * KVM's supported XCR0 (comprised of host's XCR0 and KVM_SUPPORTED_XCR0).
+ */
+static u64 cpuid_get_supported_xcr0(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry_index(vcpu, 0xd, 0);
+ if (!best)
+ return 0;
+
+ return (best->eax | ((u64)best->edx << 32)) & kvm_caps.supported_xcr0;
+}
+
+static u64 cpuid_get_supported_xss(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry_index(vcpu, 0xd, 1);
+ if (!best)
+ return 0;
+
+ return (best->ecx | ((u64)best->edx << 32)) & kvm_caps.supported_xss;
+}
+
+static __always_inline void kvm_update_feature_runtime(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid_entry2 *entry,
+ unsigned int x86_feature,
+ bool has_feature)
+{
+ cpuid_entry_change(entry, x86_feature, has_feature);
+ guest_cpu_cap_change(vcpu, x86_feature, has_feature);
+}
+
+static void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ vcpu->arch.cpuid_dynamic_bits_dirty = false;
+
+ best = kvm_find_cpuid_entry(vcpu, 1);
+ if (best) {
+ kvm_update_feature_runtime(vcpu, best, X86_FEATURE_OSXSAVE,
+ kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE));
+
+ kvm_update_feature_runtime(vcpu, best, X86_FEATURE_APIC,
+ vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE);
+
+ if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT))
+ kvm_update_feature_runtime(vcpu, best, X86_FEATURE_MWAIT,
+ vcpu->arch.ia32_misc_enable_msr &
+ MSR_IA32_MISC_ENABLE_MWAIT);
+ }
+
+ best = kvm_find_cpuid_entry_index(vcpu, 7, 0);
+ if (best)
+ kvm_update_feature_runtime(vcpu, best, X86_FEATURE_OSPKE,
+ kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE));
+
+
+ best = kvm_find_cpuid_entry_index(vcpu, 0xD, 0);
+ if (best)
+ best->ebx = xstate_required_size(vcpu->arch.xcr0, false);
+
+ best = kvm_find_cpuid_entry_index(vcpu, 0xD, 1);
+ if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) ||
+ cpuid_entry_has(best, X86_FEATURE_XSAVEC)))
+ best->ebx = xstate_required_size(vcpu->arch.xcr0 |
+ vcpu->arch.ia32_xss, true);
+}
+
+static bool kvm_cpuid_has_hyperv(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_KVM_HYPERV
+ struct kvm_cpuid_entry2 *entry;
+
+ entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE);
+ return entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX;
+#else
+ return false;
+#endif
+}
+
+static bool guest_cpuid_is_amd_or_hygon(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *entry;
+
+ entry = kvm_find_cpuid_entry(vcpu, 0);
+ if (!entry)
+ return false;
+
+ return is_guest_vendor_amd(entry->ebx, entry->ecx, entry->edx) ||
+ is_guest_vendor_hygon(entry->ebx, entry->ecx, entry->edx);
+}
+
+/*
+ * This isn't truly "unsafe", but except for the cpu_caps initialization code,
+ * all register lookups should use __cpuid_entry_get_reg(), which provides
+ * compile-time validation of the input.
+ */
+static u32 cpuid_get_reg_unsafe(struct kvm_cpuid_entry2 *entry, u32 reg)
+{
+ switch (reg) {
+ case CPUID_EAX:
+ return entry->eax;
+ case CPUID_EBX:
+ return entry->ebx;
+ case CPUID_ECX:
+ return entry->ecx;
+ case CPUID_EDX:
+ return entry->edx;
+ default:
+ WARN_ON_ONCE(1);
+ return 0;
+ }
+}
+
+static int cpuid_func_emulated(struct kvm_cpuid_entry2 *entry, u32 func,
+ bool include_partially_emulated);
+
+void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ struct kvm_cpuid_entry2 *best;
+ struct kvm_cpuid_entry2 *entry;
+ bool allow_gbpages;
+ int i;
+
+ memset(vcpu->arch.cpu_caps, 0, sizeof(vcpu->arch.cpu_caps));
+ BUILD_BUG_ON(ARRAY_SIZE(reverse_cpuid) != NR_KVM_CPU_CAPS);
+
+ /*
+ * Reset guest capabilities to userspace's guest CPUID definition, i.e.
+ * honor userspace's definition for features that don't require KVM or
+ * hardware management/support (or that KVM simply doesn't care about).
+ */
+ for (i = 0; i < NR_KVM_CPU_CAPS; i++) {
+ const struct cpuid_reg cpuid = reverse_cpuid[i];
+ struct kvm_cpuid_entry2 emulated;
+
+ if (!cpuid.function)
+ continue;
+
+ entry = kvm_find_cpuid_entry_index(vcpu, cpuid.function, cpuid.index);
+ if (!entry)
+ continue;
+
+ cpuid_func_emulated(&emulated, cpuid.function, true);
+
+ /*
+ * A vCPU has a feature if it's supported by KVM and is enabled
+ * in guest CPUID. Note, this includes features that are
+ * supported by KVM but aren't advertised to userspace!
+ */
+ vcpu->arch.cpu_caps[i] = kvm_cpu_caps[i] |
+ cpuid_get_reg_unsafe(&emulated, cpuid.reg);
+ vcpu->arch.cpu_caps[i] &= cpuid_get_reg_unsafe(entry, cpuid.reg);
+ }
+
+ kvm_update_cpuid_runtime(vcpu);
+
+ /*
+ * If TDP is enabled, let the guest use GBPAGES if they're supported in
+ * hardware. The hardware page walker doesn't let KVM disable GBPAGES,
+ * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA
+ * walk for performance and complexity reasons. Not to mention KVM
+ * _can't_ solve the problem because GVA->GPA walks aren't visible to
+ * KVM once a TDP translation is installed. Mimic hardware behavior so
+ * that KVM's is at least consistent, i.e. doesn't randomly inject #PF.
+ * If TDP is disabled, honor *only* guest CPUID as KVM has full control
+ * and can install smaller shadow pages if the host lacks 1GiB support.
+ */
+ allow_gbpages = tdp_enabled ? boot_cpu_has(X86_FEATURE_GBPAGES) :
+ guest_cpu_cap_has(vcpu, X86_FEATURE_GBPAGES);
+ guest_cpu_cap_change(vcpu, X86_FEATURE_GBPAGES, allow_gbpages);
+
+ best = kvm_find_cpuid_entry(vcpu, 1);
+ if (best && apic) {
+ if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER))
+ apic->lapic_timer.timer_mode_mask = 3 << 17;
+ else
+ apic->lapic_timer.timer_mode_mask = 1 << 17;
+
+ kvm_apic_set_version(vcpu);
+ }
+
+ vcpu->arch.guest_supported_xcr0 = cpuid_get_supported_xcr0(vcpu);
+ vcpu->arch.guest_supported_xss = cpuid_get_supported_xss(vcpu);
+
+ vcpu->arch.pv_cpuid.features = kvm_apply_cpuid_pv_features_quirk(vcpu);
+
+ vcpu->arch.is_amd_compatible = guest_cpuid_is_amd_or_hygon(vcpu);
+ vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
+ vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
+
+ kvm_pmu_refresh(vcpu);
+
+#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
+ vcpu->arch.cr4_guest_rsvd_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_) |
+ __cr4_reserved_bits(guest_cpu_cap_has, vcpu);
+#undef __kvm_cpu_cap_has
+
+ kvm_hv_set_cpuid(vcpu, kvm_cpuid_has_hyperv(vcpu));
+
+ /* Invoke the vendor callback only after the above state is updated. */
+ kvm_x86_call(vcpu_after_set_cpuid)(vcpu);
+
+ /*
+ * Except for the MMU, which needs to do its thing after any vendor specific
+ * adjustments to the reserved GPA bits.
+ */
+ kvm_mmu_after_set_cpuid(vcpu);
+
+ kvm_make_request(KVM_REQ_RECALC_INTERCEPTS, vcpu);
+}
+
+int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x80000000);
+ if (!best || best->eax < 0x80000008)
+ goto not_found;
+ best = kvm_find_cpuid_entry(vcpu, 0x80000008);
+ if (best)
+ return best->eax & 0xff;
+not_found:
+ return 36;
+}
+
+int cpuid_query_maxguestphyaddr(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x80000000);
+ if (!best || best->eax < 0x80000008)
+ goto not_found;
+ best = kvm_find_cpuid_entry(vcpu, 0x80000008);
+ if (best)
+ return (best->eax >> 16) & 0xff;
+not_found:
+ return 0;
+}
+
+/*
+ * This "raw" version returns the reserved GPA bits without any adjustments for
+ * encryption technologies that usurp bits. The raw mask should be used if and
+ * only if hardware does _not_ strip the usurped bits, e.g. in virtual MTRRs.
+ */
+u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu)
+{
+ return rsvd_bits(cpuid_maxphyaddr(vcpu), 63);
+}
+
+static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
+ int nent)
+{
+ u32 vcpu_caps[NR_KVM_CPU_CAPS];
+ int r;
+
+ /*
+ * Swap the existing (old) entries with the incoming (new) entries in
+ * order to massage the new entries, e.g. to account for dynamic bits
+ * that KVM controls, without clobbering the current guest CPUID, which
+ * KVM needs to preserve in order to unwind on failure.
+ *
+ * Similarly, save the vCPU's current cpu_caps so that the capabilities
+ * can be updated alongside the CPUID entries when performing runtime
+ * updates. Full initialization is done if and only if the vCPU hasn't
+ * run, i.e. only if userspace is potentially changing CPUID features.
+ */
+ swap(vcpu->arch.cpuid_entries, e2);
+ swap(vcpu->arch.cpuid_nent, nent);
+
+ memcpy(vcpu_caps, vcpu->arch.cpu_caps, sizeof(vcpu_caps));
+ BUILD_BUG_ON(sizeof(vcpu_caps) != sizeof(vcpu->arch.cpu_caps));
+
+ /*
+ * KVM does not correctly handle changing guest CPUID after KVM_RUN, as
+ * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
+ * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
+ * faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with
+ * the core vCPU model on the fly. It would've been better to forbid any
+ * KVM_SET_CPUID{,2} calls after KVM_RUN altogether but unfortunately
+ * some VMMs (e.g. QEMU) reuse vCPU fds for CPU hotplug/unplug and do
+ * KVM_SET_CPUID{,2} again. To support this legacy behavior, check
+ * whether the supplied CPUID data is equal to what's already set.
+ */
+ if (kvm_vcpu_has_run(vcpu)) {
+ r = kvm_cpuid_check_equal(vcpu, e2, nent);
+ if (r)
+ goto err;
+ goto success;
+ }
+
+#ifdef CONFIG_KVM_HYPERV
+ if (kvm_cpuid_has_hyperv(vcpu)) {
+ r = kvm_hv_vcpu_init(vcpu);
+ if (r)
+ goto err;
+ }
+#endif
+
+ r = kvm_check_cpuid(vcpu);
+ if (r)
+ goto err;
+
+#ifdef CONFIG_KVM_XEN
+ vcpu->arch.xen.cpuid = kvm_get_hypervisor_cpuid(vcpu, XEN_SIGNATURE);
+#endif
+ kvm_vcpu_after_set_cpuid(vcpu);
+
+success:
+ kvfree(e2);
+ return 0;
+
+err:
+ memcpy(vcpu->arch.cpu_caps, vcpu_caps, sizeof(vcpu_caps));
+ swap(vcpu->arch.cpuid_entries, e2);
+ swap(vcpu->arch.cpuid_nent, nent);
+ return r;
+}
+
+/* when an old userspace process fills a new kernel module */
+int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid *cpuid,
+ struct kvm_cpuid_entry __user *entries)
+{
+ int r, i;
+ struct kvm_cpuid_entry *e = NULL;
+ struct kvm_cpuid_entry2 *e2 = NULL;
+
+ if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+ return -E2BIG;
+
+ if (cpuid->nent) {
+ e = vmemdup_array_user(entries, cpuid->nent, sizeof(*e));
+ if (IS_ERR(e))
+ return PTR_ERR(e);
+
+ e2 = kvmalloc_array(cpuid->nent, sizeof(*e2), GFP_KERNEL_ACCOUNT);
+ if (!e2) {
+ r = -ENOMEM;
+ goto out_free_cpuid;
+ }
+ }
+ for (i = 0; i < cpuid->nent; i++) {
+ e2[i].function = e[i].function;
+ e2[i].eax = e[i].eax;
+ e2[i].ebx = e[i].ebx;
+ e2[i].ecx = e[i].ecx;
+ e2[i].edx = e[i].edx;
+ e2[i].index = 0;
+ e2[i].flags = 0;
+ e2[i].padding[0] = 0;
+ e2[i].padding[1] = 0;
+ e2[i].padding[2] = 0;
+ }
+
+ r = kvm_set_cpuid(vcpu, e2, cpuid->nent);
+ if (r)
+ kvfree(e2);
+
+out_free_cpuid:
+ kvfree(e);
+
+ return r;
+}
+
+int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries)
+{
+ struct kvm_cpuid_entry2 *e2 = NULL;
+ int r;
+
+ if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+ return -E2BIG;
+
+ if (cpuid->nent) {
+ e2 = vmemdup_array_user(entries, cpuid->nent, sizeof(*e2));
+ if (IS_ERR(e2))
+ return PTR_ERR(e2);
+ }
+
+ r = kvm_set_cpuid(vcpu, e2, cpuid->nent);
+ if (r)
+ kvfree(e2);
+
+ return r;
+}
+
+int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries)
+{
+ if (cpuid->nent < vcpu->arch.cpuid_nent)
+ return -E2BIG;
+
+ if (vcpu->arch.cpuid_dynamic_bits_dirty)
+ kvm_update_cpuid_runtime(vcpu);
+
+ if (copy_to_user(entries, vcpu->arch.cpuid_entries,
+ vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
+ return -EFAULT;
+
+ cpuid->nent = vcpu->arch.cpuid_nent;
+ return 0;
+}
+
+static __always_inline u32 raw_cpuid_get(struct cpuid_reg cpuid)
+{
+ struct kvm_cpuid_entry2 entry;
+ u32 base;
+
+ /*
+ * KVM only supports features defined by Intel (0x0), AMD (0x80000000),
+ * and Centaur (0xc0000000). WARN if a feature for new vendor base is
+ * defined, as this and other code would need to be updated.
+ */
+ base = cpuid.function & 0xffff0000;
+ if (WARN_ON_ONCE(base && base != 0x80000000 && base != 0xc0000000))
+ return 0;
+
+ if (cpuid_eax(base) < cpuid.function)
+ return 0;
+
+ cpuid_count(cpuid.function, cpuid.index,
+ &entry.eax, &entry.ebx, &entry.ecx, &entry.edx);
+
+ return *__cpuid_entry_get_reg(&entry, cpuid.reg);
+}
+
+/*
+ * For kernel-defined leafs, mask KVM's supported feature set with the kernel's
+ * capabilities as well as raw CPUID. For KVM-defined leafs, consult only raw
+ * CPUID, as KVM is the one and only authority (in the kernel).
+ */
+#define kvm_cpu_cap_init(leaf, feature_initializers...) \
+do { \
+ const struct cpuid_reg cpuid = x86_feature_cpuid(leaf * 32); \
+ const u32 __maybe_unused kvm_cpu_cap_init_in_progress = leaf; \
+ const u32 *kernel_cpu_caps = boot_cpu_data.x86_capability; \
+ u32 kvm_cpu_cap_passthrough = 0; \
+ u32 kvm_cpu_cap_synthesized = 0; \
+ u32 kvm_cpu_cap_emulated = 0; \
+ u32 kvm_cpu_cap_features = 0; \
+ \
+ feature_initializers \
+ \
+ kvm_cpu_caps[leaf] = kvm_cpu_cap_features; \
+ \
+ if (leaf < NCAPINTS) \
+ kvm_cpu_caps[leaf] &= kernel_cpu_caps[leaf]; \
+ \
+ kvm_cpu_caps[leaf] |= kvm_cpu_cap_passthrough; \
+ kvm_cpu_caps[leaf] &= (raw_cpuid_get(cpuid) | \
+ kvm_cpu_cap_synthesized); \
+ kvm_cpu_caps[leaf] |= kvm_cpu_cap_emulated; \
+} while (0)
+
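+/*
+ * Illustration of the ordering above (purely descriptive, no new behavior):
+ * bits declared via F() are first masked by the kernel's capabilities (for
+ * kernel-defined leafs), PASSTHROUGH_F() bits are then force-set, the result
+ * is masked by raw CPUID ORed with SYNTHESIZED_F() bits, and EMULATED_F()
+ * bits are ORed in last so that, e.g., X2APIC is advertised even when
+ * hardware doesn't enumerate it.
+ */
+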
+/*
+ * Assert that the feature bit being declared, e.g. via F(), is in the CPUID
+ * word that's being initialized. Exempt 0x8000_0001.EDX usage of 0x1.EDX
+ * features, as AMD duplicated many 0x1.EDX features into 0x8000_0001.EDX.
+ */
+#define KVM_VALIDATE_CPU_CAP_USAGE(name) \
+do { \
+ u32 __leaf = __feature_leaf(X86_FEATURE_##name); \
+ \
+ BUILD_BUG_ON(__leaf != kvm_cpu_cap_init_in_progress); \
+} while (0)
+
+#define F(name) \
+({ \
+ KVM_VALIDATE_CPU_CAP_USAGE(name); \
+ kvm_cpu_cap_features |= feature_bit(name); \
+})
+
+/* Scattered Flag - For features that are scattered by cpufeatures.h. */
+#define SCATTERED_F(name) \
+({ \
+ BUILD_BUG_ON(X86_FEATURE_##name >= MAX_CPU_FEATURES); \
+ KVM_VALIDATE_CPU_CAP_USAGE(name); \
+ if (boot_cpu_has(X86_FEATURE_##name)) \
+ F(name); \
+})
+
+/* Features that KVM supports only on 64-bit kernels. */
+#define X86_64_F(name) \
+({ \
+ KVM_VALIDATE_CPU_CAP_USAGE(name); \
+ if (IS_ENABLED(CONFIG_X86_64)) \
+ F(name); \
+})
+
+/*
+ * Emulated Feature - For features that KVM emulates in software irrespective
+ * of host CPU/kernel support.
+ */
+#define EMULATED_F(name) \
+({ \
+ kvm_cpu_cap_emulated |= feature_bit(name); \
+ F(name); \
+})
+
+/*
+ * Synthesized Feature - For features that are synthesized into boot_cpu_data,
+ * i.e. may not be present in the raw CPUID, but can still be advertised to
+ * userspace. Primarily used for mitigation related feature flags.
+ */
+#define SYNTHESIZED_F(name) \
+({ \
+ kvm_cpu_cap_synthesized |= feature_bit(name); \
+ F(name); \
+})
+
+/*
+ * Passthrough Feature - For features that KVM supports based purely on raw
+ * hardware CPUID, i.e. that KVM virtualizes even if the host kernel doesn't
+ * use the feature. Simply force set the feature in KVM's capabilities; raw
+ * CPUID support will be factored in by kvm_cpu_cap_init().
+ */
+#define PASSTHROUGH_F(name) \
+({ \
+ kvm_cpu_cap_passthrough |= feature_bit(name); \
+ F(name); \
+})
+
+/*
+ * Aliased Features - For features in 0x8000_0001.EDX that are duplicates of
+ * identical 0x1.EDX features, and thus are aliased from 0x1 to 0x8000_0001.
+ */
+#define ALIASED_1_EDX_F(name) \
+({ \
+ BUILD_BUG_ON(__feature_leaf(X86_FEATURE_##name) != CPUID_1_EDX); \
+ BUILD_BUG_ON(kvm_cpu_cap_init_in_progress != CPUID_8000_0001_EDX); \
+ kvm_cpu_cap_features |= feature_bit(name); \
+})
+
+/*
+ * Vendor Features - For features that KVM supports, but are added in later
+ * because they require additional vendor enabling.
+ */
+#define VENDOR_F(name) \
+({ \
+ KVM_VALIDATE_CPU_CAP_USAGE(name); \
+})
+
+/*
+ * Runtime Features - For features that KVM dynamically sets/clears at runtime,
+ * e.g. when CR4 changes, but which are never advertised to userspace.
+ */
+#define RUNTIME_F(name) \
+({ \
+ KVM_VALIDATE_CPU_CAP_USAGE(name); \
+})
+
+/*
+ * Undefine the MSR bit macro to avoid token concatenation issues when
+ * processing X86_FEATURE_SPEC_CTRL_SSBD.
+ */
+#undef SPEC_CTRL_SSBD
+
+/* DS is defined by ptrace-abi.h on 32-bit builds. */
+#undef DS
+
+void kvm_set_cpu_caps(void)
+{
+ memset(kvm_cpu_caps, 0, sizeof(kvm_cpu_caps));
+
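+	/*
+	 * Assert that the kernel-defined portion of kvm_cpu_caps, i.e.
+	 * everything except the NKVMCAPINTS KVM-only leafs at the end, fits
+	 * within boot_cpu_data.x86_capability, as those words are masked
+	 * against the kernel's capabilities in kvm_cpu_cap_init().
+	 */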
+ BUILD_BUG_ON(sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)) >
+ sizeof(boot_cpu_data.x86_capability));
+
+ kvm_cpu_cap_init(CPUID_1_ECX,
+ F(XMM3),
+ F(PCLMULQDQ),
+ VENDOR_F(DTES64),
+ /*
+ * NOTE: MONITOR (and MWAIT) are emulated as NOP, but *not*
+ * advertised to guests via CPUID! MWAIT is also technically a
+ * runtime flag thanks to IA32_MISC_ENABLES; mark it as such so
+ * that KVM is aware that it's a known, unadvertised flag.
+ */
+ RUNTIME_F(MWAIT),
+ /* DS-CPL */
+ VENDOR_F(VMX),
+ /* SMX, EST */
+ /* TM2 */
+ F(SSSE3),
+ /* CNXT-ID */
+ /* Reserved */
+ F(FMA),
+ F(CX16),
+ /* xTPR Update */
+ F(PDCM),
+ F(PCID),
+ /* Reserved, DCA */
+ F(XMM4_1),
+ F(XMM4_2),
+ EMULATED_F(X2APIC),
+ F(MOVBE),
+ F(POPCNT),
+ EMULATED_F(TSC_DEADLINE_TIMER),
+ F(AES),
+ F(XSAVE),
+ RUNTIME_F(OSXSAVE),
+ F(AVX),
+ F(F16C),
+ F(RDRAND),
+ EMULATED_F(HYPERVISOR),
+ );
+
+ kvm_cpu_cap_init(CPUID_1_EDX,
+ F(FPU),
+ F(VME),
+ F(DE),
+ F(PSE),
+ F(TSC),
+ F(MSR),
+ F(PAE),
+ F(MCE),
+ F(CX8),
+ F(APIC),
+ /* Reserved */
+ F(SEP),
+ F(MTRR),
+ F(PGE),
+ F(MCA),
+ F(CMOV),
+ F(PAT),
+ F(PSE36),
+ /* PSN */
+ F(CLFLUSH),
+ /* Reserved */
+ VENDOR_F(DS),
+ /* ACPI */
+ F(MMX),
+ F(FXSR),
+ F(XMM),
+ F(XMM2),
+ F(SELFSNOOP),
+ /* HTT, TM, Reserved, PBE */
+ );
+
+ kvm_cpu_cap_init(CPUID_7_0_EBX,
+ F(FSGSBASE),
+ EMULATED_F(TSC_ADJUST),
+ F(SGX),
+ F(BMI1),
+ F(HLE),
+ F(AVX2),
+ F(FDP_EXCPTN_ONLY),
+ F(SMEP),
+ F(BMI2),
+ F(ERMS),
+ F(INVPCID),
+ F(RTM),
+ F(ZERO_FCS_FDS),
+ VENDOR_F(MPX),
+ F(AVX512F),
+ F(AVX512DQ),
+ F(RDSEED),
+ F(ADX),
+ F(SMAP),
+ F(AVX512IFMA),
+ F(CLFLUSHOPT),
+ F(CLWB),
+ VENDOR_F(INTEL_PT),
+ F(AVX512PF),
+ F(AVX512ER),
+ F(AVX512CD),
+ F(SHA_NI),
+ F(AVX512BW),
+ F(AVX512VL),
+ );
+
+ kvm_cpu_cap_init(CPUID_7_ECX,
+ F(AVX512VBMI),
+ PASSTHROUGH_F(LA57),
+ F(PKU),
+ RUNTIME_F(OSPKE),
+ F(RDPID),
+ F(AVX512_VPOPCNTDQ),
+ F(UMIP),
+ F(AVX512_VBMI2),
+ F(GFNI),
+ F(VAES),
+ F(VPCLMULQDQ),
+ F(AVX512_VNNI),
+ F(AVX512_BITALG),
+ F(CLDEMOTE),
+ F(MOVDIRI),
+ F(MOVDIR64B),
+ VENDOR_F(WAITPKG),
+ F(SGX_LC),
+ F(BUS_LOCK_DETECT),
+ X86_64_F(SHSTK),
+ );
+
+ /*
+	 * PKU is not yet implemented for shadow paging and requires OSPKE
+	 * to be set on the host.  Clear it if that is not the case.
+ */
+ if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
+ kvm_cpu_cap_clear(X86_FEATURE_PKU);
+
+ /*
+ * Shadow Stacks aren't implemented in the Shadow MMU. Shadow Stack
+ * accesses require "magic" Writable=0,Dirty=1 protection, which KVM
+ * doesn't know how to emulate or map.
+ */
+ if (!tdp_enabled)
+ kvm_cpu_cap_clear(X86_FEATURE_SHSTK);
+
+ kvm_cpu_cap_init(CPUID_7_EDX,
+ F(AVX512_4VNNIW),
+ F(AVX512_4FMAPS),
+ F(SPEC_CTRL),
+ F(SPEC_CTRL_SSBD),
+ EMULATED_F(ARCH_CAPABILITIES),
+ F(INTEL_STIBP),
+ F(MD_CLEAR),
+ F(AVX512_VP2INTERSECT),
+ F(FSRM),
+ F(SERIALIZE),
+ F(TSXLDTRK),
+ F(AVX512_FP16),
+ F(AMX_TILE),
+ F(AMX_INT8),
+ F(AMX_BF16),
+ F(FLUSH_L1D),
+ F(IBT),
+ );
+
+ /*
+ * Disable support for IBT and SHSTK if KVM is configured to emulate
+ * accesses to reserved GPAs, as KVM's emulator doesn't support IBT or
+ * SHSTK, nor does KVM handle Shadow Stack #PFs (see above).
+ */
+ if (allow_smaller_maxphyaddr) {
+ kvm_cpu_cap_clear(X86_FEATURE_SHSTK);
+ kvm_cpu_cap_clear(X86_FEATURE_IBT);
+ }
+
+ if (boot_cpu_has(X86_FEATURE_AMD_IBPB_RET) &&
+ boot_cpu_has(X86_FEATURE_AMD_IBPB) &&
+ boot_cpu_has(X86_FEATURE_AMD_IBRS))
+ kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL);
+ if (boot_cpu_has(X86_FEATURE_STIBP))
+ kvm_cpu_cap_set(X86_FEATURE_INTEL_STIBP);
+ if (boot_cpu_has(X86_FEATURE_AMD_SSBD))
+ kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD);
+
+ kvm_cpu_cap_init(CPUID_7_1_EAX,
+ F(SHA512),
+ F(SM3),
+ F(SM4),
+ F(AVX_VNNI),
+ F(AVX512_BF16),
+ F(CMPCCXADD),
+ F(FZRM),
+ F(FSRS),
+ F(FSRC),
+ F(WRMSRNS),
+ X86_64_F(LKGS),
+ F(AMX_FP16),
+ F(AVX_IFMA),
+ F(LAM),
+ );
+
+ kvm_cpu_cap_init(CPUID_7_1_ECX,
+ SCATTERED_F(MSR_IMM),
+ );
+
+ kvm_cpu_cap_init(CPUID_7_1_EDX,
+ F(AVX_VNNI_INT8),
+ F(AVX_NE_CONVERT),
+ F(AMX_COMPLEX),
+ F(AVX_VNNI_INT16),
+ F(PREFETCHITI),
+ F(AVX10),
+ );
+
+ kvm_cpu_cap_init(CPUID_7_2_EDX,
+ F(INTEL_PSFD),
+ F(IPRED_CTRL),
+ F(RRSBA_CTRL),
+ F(DDPD_U),
+ F(BHI_CTRL),
+ F(MCDT_NO),
+ );
+
+ kvm_cpu_cap_init(CPUID_D_1_EAX,
+ F(XSAVEOPT),
+ F(XSAVEC),
+ F(XGETBV1),
+ F(XSAVES),
+ X86_64_F(XFD),
+ );
+
+ kvm_cpu_cap_init(CPUID_12_EAX,
+ SCATTERED_F(SGX1),
+ SCATTERED_F(SGX2),
+ SCATTERED_F(SGX_EDECCSSA),
+ );
+
+ kvm_cpu_cap_init(CPUID_24_0_EBX,
+ F(AVX10_128),
+ F(AVX10_256),
+ F(AVX10_512),
+ );
+
+ kvm_cpu_cap_init(CPUID_8000_0001_ECX,
+ F(LAHF_LM),
+ F(CMP_LEGACY),
+ VENDOR_F(SVM),
+ /* ExtApicSpace */
+ F(CR8_LEGACY),
+ F(ABM),
+ F(SSE4A),
+ F(MISALIGNSSE),
+ F(3DNOWPREFETCH),
+ F(OSVW),
+ /* IBS */
+ F(XOP),
+ /* SKINIT, WDT, LWP */
+ F(FMA4),
+ F(TBM),
+ F(TOPOEXT),
+ VENDOR_F(PERFCTR_CORE),
+ );
+
+ kvm_cpu_cap_init(CPUID_8000_0001_EDX,
+ ALIASED_1_EDX_F(FPU),
+ ALIASED_1_EDX_F(VME),
+ ALIASED_1_EDX_F(DE),
+ ALIASED_1_EDX_F(PSE),
+ ALIASED_1_EDX_F(TSC),
+ ALIASED_1_EDX_F(MSR),
+ ALIASED_1_EDX_F(PAE),
+ ALIASED_1_EDX_F(MCE),
+ ALIASED_1_EDX_F(CX8),
+ ALIASED_1_EDX_F(APIC),
+ /* Reserved */
+ F(SYSCALL),
+ ALIASED_1_EDX_F(MTRR),
+ ALIASED_1_EDX_F(PGE),
+ ALIASED_1_EDX_F(MCA),
+ ALIASED_1_EDX_F(CMOV),
+ ALIASED_1_EDX_F(PAT),
+ ALIASED_1_EDX_F(PSE36),
+ /* Reserved */
+ F(NX),
+ /* Reserved */
+ F(MMXEXT),
+ ALIASED_1_EDX_F(MMX),
+ ALIASED_1_EDX_F(FXSR),
+ F(FXSR_OPT),
+ X86_64_F(GBPAGES),
+ F(RDTSCP),
+ /* Reserved */
+ X86_64_F(LM),
+ F(3DNOWEXT),
+ F(3DNOW),
+ );
+
+ if (!tdp_enabled && IS_ENABLED(CONFIG_X86_64))
+ kvm_cpu_cap_set(X86_FEATURE_GBPAGES);
+
+ kvm_cpu_cap_init(CPUID_8000_0007_EDX,
+ SCATTERED_F(CONSTANT_TSC),
+ );
+
+ kvm_cpu_cap_init(CPUID_8000_0008_EBX,
+ F(CLZERO),
+ F(XSAVEERPTR),
+ F(WBNOINVD),
+ F(AMD_IBPB),
+ F(AMD_IBRS),
+ F(AMD_SSBD),
+ F(VIRT_SSBD),
+ F(AMD_SSB_NO),
+ F(AMD_STIBP),
+ F(AMD_STIBP_ALWAYS_ON),
+ F(AMD_IBRS_SAME_MODE),
+ PASSTHROUGH_F(EFER_LMSLE_MBZ),
+ F(AMD_PSFD),
+ F(AMD_IBPB_RET),
+ );
+
+ /*
+ * AMD has separate bits for each SPEC_CTRL bit.
+ * arch/x86/kernel/cpu/bugs.c is kind enough to
+ * record that in cpufeatures so use them.
+ */
+ if (boot_cpu_has(X86_FEATURE_IBPB)) {
+ kvm_cpu_cap_set(X86_FEATURE_AMD_IBPB);
+ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL) &&
+ !boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB))
+ kvm_cpu_cap_set(X86_FEATURE_AMD_IBPB_RET);
+ }
+ if (boot_cpu_has(X86_FEATURE_IBRS))
+ kvm_cpu_cap_set(X86_FEATURE_AMD_IBRS);
+ if (boot_cpu_has(X86_FEATURE_STIBP))
+ kvm_cpu_cap_set(X86_FEATURE_AMD_STIBP);
+ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD))
+ kvm_cpu_cap_set(X86_FEATURE_AMD_SSBD);
+ if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
+ kvm_cpu_cap_set(X86_FEATURE_AMD_SSB_NO);
+ /*
+ * The preference is to use SPEC CTRL MSR instead of the
+ * VIRT_SPEC MSR.
+ */
+ if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) &&
+ !boot_cpu_has(X86_FEATURE_AMD_SSBD))
+ kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
+
+	/* All SVM features require additional vendor module enabling. */
+ kvm_cpu_cap_init(CPUID_8000_000A_EDX,
+ VENDOR_F(NPT),
+ VENDOR_F(VMCBCLEAN),
+ VENDOR_F(FLUSHBYASID),
+ VENDOR_F(NRIPS),
+ VENDOR_F(TSCRATEMSR),
+ VENDOR_F(V_VMSAVE_VMLOAD),
+ VENDOR_F(LBRV),
+ VENDOR_F(PAUSEFILTER),
+ VENDOR_F(PFTHRESHOLD),
+ VENDOR_F(VGIF),
+ VENDOR_F(VNMI),
+ VENDOR_F(SVME_ADDR_CHK),
+ );
+
+ kvm_cpu_cap_init(CPUID_8000_001F_EAX,
+ VENDOR_F(SME),
+ VENDOR_F(SEV),
+ /* VM_PAGE_FLUSH */
+ VENDOR_F(SEV_ES),
+ F(SME_COHERENT),
+ );
+
+ kvm_cpu_cap_init(CPUID_8000_0021_EAX,
+ F(NO_NESTED_DATA_BP),
+ F(WRMSR_XX_BASE_NS),
+ /*
+ * Synthesize "LFENCE is serializing" into the AMD-defined entry
+ * in KVM's supported CPUID, i.e. if the feature is reported as
+ * supported by the kernel. LFENCE_RDTSC was a Linux-defined
+ * synthetic feature long before AMD joined the bandwagon, e.g.
+ * LFENCE is serializing on most CPUs that support SSE2. On
+ * CPUs that don't support AMD's leaf, ANDing with the raw host
+ * CPUID will drop the flags, and reporting support in AMD's
+ * leaf can make it easier for userspace to detect the feature.
+ */
+ SYNTHESIZED_F(LFENCE_RDTSC),
+ /* SmmPgCfgLock */
+ /* 4: Resv */
+ SYNTHESIZED_F(VERW_CLEAR),
+ F(NULL_SEL_CLR_BASE),
+ /* UpperAddressIgnore */
+ F(AUTOIBRS),
+ F(PREFETCHI),
+ EMULATED_F(NO_SMM_CTL_MSR),
+ /* PrefetchCtlMsr */
+ /* GpOnUserCpuid */
+ /* EPSF */
+ SYNTHESIZED_F(SBPB),
+ SYNTHESIZED_F(IBPB_BRTYPE),
+ SYNTHESIZED_F(SRSO_NO),
+ F(SRSO_USER_KERNEL_NO),
+ );
+
+ kvm_cpu_cap_init(CPUID_8000_0021_ECX,
+ SYNTHESIZED_F(TSA_SQ_NO),
+ SYNTHESIZED_F(TSA_L1_NO),
+ );
+
+ kvm_cpu_cap_init(CPUID_8000_0022_EAX,
+ F(PERFMON_V2),
+ );
+
+ if (!static_cpu_has_bug(X86_BUG_NULL_SEG))
+ kvm_cpu_cap_set(X86_FEATURE_NULL_SEL_CLR_BASE);
+
+ kvm_cpu_cap_init(CPUID_C000_0001_EDX,
+ F(XSTORE),
+ F(XSTORE_EN),
+ F(XCRYPT),
+ F(XCRYPT_EN),
+ F(ACE2),
+ F(ACE2_EN),
+ F(PHE),
+ F(PHE_EN),
+ F(PMM),
+ F(PMM_EN),
+ );
+
+ /*
+ * Hide RDTSCP and RDPID if either feature is reported as supported but
+ * probing MSR_TSC_AUX failed. This is purely a sanity check and
+ * should never happen, but the guest will likely crash if RDTSCP or
+ * RDPID is misreported, and KVM has botched MSR_TSC_AUX emulation in
+ * the past. For example, the sanity check may fire if this instance of
+ * KVM is running as L1 on top of an older, broken KVM.
+ */
+ if (WARN_ON((kvm_cpu_cap_has(X86_FEATURE_RDTSCP) ||
+ kvm_cpu_cap_has(X86_FEATURE_RDPID)) &&
+ !kvm_is_supported_user_return_msr(MSR_TSC_AUX))) {
+ kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
+ kvm_cpu_cap_clear(X86_FEATURE_RDPID);
+ }
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cpu_caps);
+
+#undef F
+#undef SCATTERED_F
+#undef X86_64_F
+#undef EMULATED_F
+#undef SYNTHESIZED_F
+#undef PASSTHROUGH_F
+#undef ALIASED_1_EDX_F
+#undef VENDOR_F
+#undef RUNTIME_F
+
+struct kvm_cpuid_array {
+ struct kvm_cpuid_entry2 *entries;
+ int maxnent;
+ int nent;
+};
+
+static struct kvm_cpuid_entry2 *get_next_cpuid(struct kvm_cpuid_array *array)
+{
+ if (array->nent >= array->maxnent)
+ return NULL;
+
+ return &array->entries[array->nent++];
+}
+
+static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array,
+ u32 function, u32 index)
+{
+ struct kvm_cpuid_entry2 *entry = get_next_cpuid(array);
+
+ if (!entry)
+ return NULL;
+
+ memset(entry, 0, sizeof(*entry));
+ entry->function = function;
+ entry->index = index;
+ switch (function & 0xC0000000) {
+ case 0x40000000:
+ /* Hypervisor leaves are always synthesized by __do_cpuid_func. */
+ return entry;
+
+ case 0x80000000:
+ /*
+ * 0x80000021 is sometimes synthesized by __do_cpuid_func, which
+ * would result in out-of-bounds calls to do_host_cpuid.
+ */
+ {
+ static int max_cpuid_80000000;
+ if (!READ_ONCE(max_cpuid_80000000))
+ WRITE_ONCE(max_cpuid_80000000, cpuid_eax(0x80000000));
+ if (function > READ_ONCE(max_cpuid_80000000))
+ return entry;
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ cpuid_count(entry->function, entry->index,
+ &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
+
+ if (cpuid_function_is_indexed(function))
+ entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+
+ return entry;
+}
+
+static int cpuid_func_emulated(struct kvm_cpuid_entry2 *entry, u32 func,
+ bool include_partially_emulated)
+{
+ memset(entry, 0, sizeof(*entry));
+
+ entry->function = func;
+ entry->index = 0;
+ entry->flags = 0;
+
+ switch (func) {
+ case 0:
+ entry->eax = 7;
+ return 1;
+ case 1:
+ entry->ecx = feature_bit(MOVBE);
+ /*
+ * KVM allows userspace to enumerate MONITOR+MWAIT support to
+ * the guest, but the MWAIT feature flag is never advertised
+ * to userspace because MONITOR+MWAIT aren't virtualized by
+ * hardware, can't be faithfully emulated in software (KVM
+ * emulates them as NOPs), and allowing the guest to execute
+ * them natively requires enabling a per-VM capability.
+ */
+ if (include_partially_emulated)
+ entry->ecx |= feature_bit(MWAIT);
+ return 1;
+ case 7:
+ entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ entry->eax = 0;
+ if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP))
+ entry->ecx = feature_bit(RDPID);
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+static int __do_cpuid_func_emulated(struct kvm_cpuid_array *array, u32 func)
+{
+ if (array->nent >= array->maxnent)
+ return -E2BIG;
+
+ array->nent += cpuid_func_emulated(&array->entries[array->nent], func, false);
+ return 0;
+}
+
+static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
+{
+ struct kvm_cpuid_entry2 *entry;
+ int r, i, max_idx;
+
+ /* all calls to cpuid_count() should be made on the same cpu */
+ get_cpu();
+
+ r = -E2BIG;
+
+ entry = do_host_cpuid(array, function, 0);
+ if (!entry)
+ goto out;
+
+ switch (function) {
+ case 0:
+ /* Limited to the highest leaf implemented in KVM. */
+ entry->eax = min(entry->eax, 0x24U);
+ break;
+ case 1:
+ cpuid_entry_override(entry, CPUID_1_EDX);
+ cpuid_entry_override(entry, CPUID_1_ECX);
+ break;
+ case 2:
+ /*
+ * On ancient CPUs, function 2 entries are STATEFUL. That is,
+ * CPUID(function=2, index=0) may return different results each
+ * time, with the least-significant byte in EAX enumerating the
+ * number of times software should do CPUID(2, 0).
+ *
+		 * Modern CPUs, i.e. every CPU KVM has *ever* run on, are less
+ * idiotic. Intel's SDM states that EAX & 0xff "will always
+ * return 01H. Software should ignore this value and not
+ * interpret it as an informational descriptor", while AMD's
+ * APM states that CPUID(2) is reserved.
+ *
+ * WARN if a frankenstein CPU that supports virtualization and
+ * a stateful CPUID.0x2 is encountered.
+ */
+ WARN_ON_ONCE((entry->eax & 0xff) > 1);
+ break;
+	/* Functions 4 and 0x8000001d have an additional index. */
+ case 4:
+ case 0x8000001d:
+ /*
+ * Read entries until the cache type in the previous entry is
+ * zero, i.e. indicates an invalid entry.
+ */
+ for (i = 1; entry->eax & 0x1f; ++i) {
+ entry = do_host_cpuid(array, function, i);
+ if (!entry)
+ goto out;
+ }
+ break;
+ case 6: /* Thermal management */
+ entry->eax = 0x4; /* allow ARAT */
+ entry->ebx = 0;
+ entry->ecx = 0;
+ entry->edx = 0;
+ break;
+	/* Function 7 has an additional index. */
+ case 7:
+ max_idx = entry->eax = min(entry->eax, 2u);
+ cpuid_entry_override(entry, CPUID_7_0_EBX);
+ cpuid_entry_override(entry, CPUID_7_ECX);
+ cpuid_entry_override(entry, CPUID_7_EDX);
+
+ /* KVM only supports up to 0x7.2, capped above via min(). */
+ if (max_idx >= 1) {
+ entry = do_host_cpuid(array, function, 1);
+ if (!entry)
+ goto out;
+
+ cpuid_entry_override(entry, CPUID_7_1_EAX);
+ cpuid_entry_override(entry, CPUID_7_1_ECX);
+ cpuid_entry_override(entry, CPUID_7_1_EDX);
+ entry->ebx = 0;
+ }
+ if (max_idx >= 2) {
+ entry = do_host_cpuid(array, function, 2);
+ if (!entry)
+ goto out;
+
+ cpuid_entry_override(entry, CPUID_7_2_EDX);
+ entry->ecx = 0;
+ entry->ebx = 0;
+ entry->eax = 0;
+ }
+ break;
+ case 0xa: { /* Architectural Performance Monitoring */
+ union cpuid10_eax eax = { };
+ union cpuid10_edx edx = { };
+
+ if (!enable_pmu || !static_cpu_has(X86_FEATURE_ARCH_PERFMON)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+
+ eax.split.version_id = kvm_pmu_cap.version;
+ eax.split.num_counters = kvm_pmu_cap.num_counters_gp;
+ eax.split.bit_width = kvm_pmu_cap.bit_width_gp;
+ eax.split.mask_length = kvm_pmu_cap.events_mask_len;
+ edx.split.num_counters_fixed = kvm_pmu_cap.num_counters_fixed;
+ edx.split.bit_width_fixed = kvm_pmu_cap.bit_width_fixed;
+
+ if (kvm_pmu_cap.version)
+ edx.split.anythread_deprecated = 1;
+
+ entry->eax = eax.full;
+ entry->ebx = kvm_pmu_cap.events_mask;
+ entry->ecx = 0;
+ entry->edx = edx.full;
+ break;
+ }
+ case 0x1f:
+ case 0xb:
+ /*
+ * No topology; a valid topology is indicated by the presence
+ * of subleaf 1.
+ */
+ entry->eax = entry->ebx = entry->ecx = 0;
+ break;
+ case 0xd: {
+ u64 permitted_xcr0 = kvm_get_filtered_xcr0();
+ u64 permitted_xss = kvm_caps.supported_xss;
+
+ entry->eax &= permitted_xcr0;
+ entry->ebx = xstate_required_size(permitted_xcr0, false);
+ entry->ecx = entry->ebx;
+ entry->edx &= permitted_xcr0 >> 32;
+ if (!permitted_xcr0)
+ break;
+
+ entry = do_host_cpuid(array, function, 1);
+ if (!entry)
+ goto out;
+
+ cpuid_entry_override(entry, CPUID_D_1_EAX);
+ if (entry->eax & (feature_bit(XSAVES) | feature_bit(XSAVEC)))
+ entry->ebx = xstate_required_size(permitted_xcr0 | permitted_xss,
+ true);
+ else {
+ WARN_ON_ONCE(permitted_xss != 0);
+ entry->ebx = 0;
+ }
+ entry->ecx &= permitted_xss;
+ entry->edx &= permitted_xss >> 32;
+
+ for (i = 2; i < 64; ++i) {
+ bool s_state;
+ if (permitted_xcr0 & BIT_ULL(i))
+ s_state = false;
+ else if (permitted_xss & BIT_ULL(i))
+ s_state = true;
+ else
+ continue;
+
+ entry = do_host_cpuid(array, function, i);
+ if (!entry)
+ goto out;
+
+ /*
+ * The supported check above should have filtered out
+ * invalid sub-leafs. Only valid sub-leafs should
+ * reach this point, and they should have a non-zero
+ * save state size. Furthermore, check whether the
+ * processor agrees with permitted_xcr0/permitted_xss
+ * on whether this is an XCR0- or IA32_XSS-managed area.
+ */
+ if (WARN_ON_ONCE(!entry->eax || (entry->ecx & 0x1) != s_state)) {
+ --array->nent;
+ continue;
+ }
+
+ if (!kvm_cpu_cap_has(X86_FEATURE_XFD))
+ entry->ecx &= ~BIT_ULL(2);
+ entry->edx = 0;
+ }
+ break;
+ }
+ case 0x12:
+ /* Intel SGX */
+ if (!kvm_cpu_cap_has(X86_FEATURE_SGX)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+
+ /*
+ * Index 0: Sub-features, MISCSELECT (a.k.a extended features)
+ * and max enclave sizes. The SGX sub-features and MISCSELECT
+ * are restricted by kernel and KVM capabilities (like most
+ * feature flags), while enclave size is unrestricted.
+ */
+ cpuid_entry_override(entry, CPUID_12_EAX);
+ entry->ebx &= SGX_MISC_EXINFO;
+
+ entry = do_host_cpuid(array, function, 1);
+ if (!entry)
+ goto out;
+
+ /*
+ * Index 1: SECS.ATTRIBUTES. ATTRIBUTES are restricted a la
+ * feature flags. Advertise all supported flags, including
+ * privileged attributes that require explicit opt-in from
+ * userspace. ATTRIBUTES.XFRM is not adjusted as userspace is
+ * expected to derive it from supported XCR0.
+ */
+ entry->eax &= SGX_ATTR_PRIV_MASK | SGX_ATTR_UNPRIV_MASK;
+ entry->ebx &= 0;
+ break;
+ /* Intel PT */
+ case 0x14:
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+
+ for (i = 1, max_idx = entry->eax; i <= max_idx; ++i) {
+ if (!do_host_cpuid(array, function, i))
+ goto out;
+ }
+ break;
+ /* Intel AMX TILE */
+ case 0x1d:
+ if (!kvm_cpu_cap_has(X86_FEATURE_AMX_TILE)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+
+ for (i = 1, max_idx = entry->eax; i <= max_idx; ++i) {
+ if (!do_host_cpuid(array, function, i))
+ goto out;
+ }
+ break;
+ case 0x1e: /* TMUL information */
+ if (!kvm_cpu_cap_has(X86_FEATURE_AMX_TILE)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+ break;
+ case 0x24: {
+ u8 avx10_version;
+
+ if (!kvm_cpu_cap_has(X86_FEATURE_AVX10)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+
+ /*
+ * The AVX10 version is encoded in EBX[7:0]. Note, the version
+ * is guaranteed to be >=1 if AVX10 is supported. Note #2, the
+ * version needs to be captured before overriding EBX features!
+ */
+ avx10_version = min_t(u8, entry->ebx & 0xff, 1);
+ cpuid_entry_override(entry, CPUID_24_0_EBX);
+ entry->ebx |= avx10_version;
+
+ entry->eax = 0;
+ entry->ecx = 0;
+ entry->edx = 0;
+ break;
+ }
+ case KVM_CPUID_SIGNATURE: {
+ const u32 *sigptr = (const u32 *)KVM_SIGNATURE;
+ entry->eax = KVM_CPUID_FEATURES;
+ entry->ebx = sigptr[0];
+ entry->ecx = sigptr[1];
+ entry->edx = sigptr[2];
+ break;
+ }
+ case KVM_CPUID_FEATURES:
+ entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
+ (1 << KVM_FEATURE_NOP_IO_DELAY) |
+ (1 << KVM_FEATURE_CLOCKSOURCE2) |
+ (1 << KVM_FEATURE_ASYNC_PF) |
+ (1 << KVM_FEATURE_PV_EOI) |
+ (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
+ (1 << KVM_FEATURE_PV_UNHALT) |
+ (1 << KVM_FEATURE_PV_TLB_FLUSH) |
+ (1 << KVM_FEATURE_ASYNC_PF_VMEXIT) |
+ (1 << KVM_FEATURE_PV_SEND_IPI) |
+ (1 << KVM_FEATURE_POLL_CONTROL) |
+ (1 << KVM_FEATURE_PV_SCHED_YIELD) |
+ (1 << KVM_FEATURE_ASYNC_PF_INT);
+
+ if (sched_info_on())
+ entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
+
+ entry->ebx = 0;
+ entry->ecx = 0;
+ entry->edx = 0;
+ break;
+ case 0x80000000:
+ entry->eax = min(entry->eax, 0x80000022);
+ /*
+ * Serializing LFENCE is reported in a multitude of ways, and
+ * NullSegClearsBase is not reported in CPUID on Zen2; help
+ * userspace by providing the CPUID leaf ourselves.
+ *
+ * However, only do it if the host has CPUID leaf 0x8000001d.
+ * QEMU thinks that it can query the host blindly for that
+ * CPUID leaf if KVM reports that it supports 0x8000001d or
+ * above. The processor merrily returns values from the
+ * highest Intel leaf which QEMU tries to use as the guest's
+ * 0x8000001d. Even worse, this can result in an infinite
+ * loop if said highest leaf has no subleaves indexed by ECX.
+ */
+ if (entry->eax >= 0x8000001d &&
+ (static_cpu_has(X86_FEATURE_LFENCE_RDTSC)
+ || !static_cpu_has_bug(X86_BUG_NULL_SEG)))
+ entry->eax = max(entry->eax, 0x80000021);
+ break;
+ case 0x80000001:
+ entry->ebx &= ~GENMASK(27, 16);
+ cpuid_entry_override(entry, CPUID_8000_0001_EDX);
+ cpuid_entry_override(entry, CPUID_8000_0001_ECX);
+ break;
+ case 0x80000005:
+ /* Pass host L1 cache and TLB info. */
+ break;
+ case 0x80000006:
+ /* Drop reserved bits, pass host L2 cache and TLB info. */
+ entry->edx &= ~GENMASK(17, 16);
+ break;
+ case 0x80000007: /* Advanced power management */
+ cpuid_entry_override(entry, CPUID_8000_0007_EDX);
+
+ /* mask against host */
+ entry->edx &= boot_cpu_data.x86_power;
+ entry->eax = entry->ebx = entry->ecx = 0;
+ break;
+ case 0x80000008: {
+ /*
+ * GuestPhysAddrSize (EAX[23:16]) is intended for software
+ * use.
+ *
+ * KVM's ABI is to report the effective MAXPHYADDR for the
+ * guest in PhysAddrSize (phys_as), and the maximum
+ * *addressable* GPA in GuestPhysAddrSize (g_phys_as).
+ *
+ * GuestPhysAddrSize is valid if and only if TDP is enabled,
+ * in which case the max GPA that can be addressed by KVM may
+ * be less than the max GPA that can be legally generated by
+ * the guest, e.g. if MAXPHYADDR>48 but the CPU doesn't
+ * support 5-level TDP.
+ */
+ unsigned int virt_as = max((entry->eax >> 8) & 0xff, 48U);
+ unsigned int phys_as, g_phys_as;
+
+ /*
+ * If TDP (NPT) is disabled use the adjusted host MAXPHYADDR as
+ * the guest operates in the same PA space as the host, i.e.
+ * reductions in MAXPHYADDR for memory encryption affect shadow
+ * paging, too.
+ *
+ * If TDP is enabled, use the raw bare metal MAXPHYADDR as
+ * reductions to the HPAs do not affect GPAs. The max
+ * addressable GPA is the same as the max effective GPA, except
+ * that it's capped at 48 bits if 5-level TDP isn't supported
+ * (hardware processes bits 51:48 only when walking the fifth
+ * level page table).
+ */
+ if (!tdp_enabled) {
+ phys_as = boot_cpu_data.x86_phys_bits;
+ g_phys_as = 0;
+ } else {
+ phys_as = entry->eax & 0xff;
+ g_phys_as = phys_as;
+ if (kvm_mmu_get_max_tdp_level() < 5)
+ g_phys_as = min(g_phys_as, 48U);
+ }
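+		/*
+		 * E.g. a host with MAXPHYADDR=52 but no 5-level TDP yields
+		 * phys_as=52 and g_phys_as=48 (an illustrative sketch of the
+		 * encoding below, not new behavior).
+		 */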
+
+ entry->eax = phys_as | (virt_as << 8) | (g_phys_as << 16);
+ entry->ecx &= ~(GENMASK(31, 16) | GENMASK(11, 8));
+ entry->edx = 0;
+ cpuid_entry_override(entry, CPUID_8000_0008_EBX);
+ break;
+ }
+ case 0x8000000A:
+ if (!kvm_cpu_cap_has(X86_FEATURE_SVM)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+ entry->eax = 1; /* SVM revision 1 */
+		entry->ebx = 8; /* Let's support 8 ASIDs in case we add proper
+				   ASID emulation to nested SVM. */
+ entry->ecx = 0; /* Reserved */
+ cpuid_entry_override(entry, CPUID_8000_000A_EDX);
+ break;
+ case 0x80000019:
+ entry->ecx = entry->edx = 0;
+ break;
+ case 0x8000001a:
+ entry->eax &= GENMASK(2, 0);
+ entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ case 0x8000001e:
+ /* Do not return host topology information. */
+ entry->eax = entry->ebx = entry->ecx = 0;
+ entry->edx = 0; /* reserved */
+ break;
+ case 0x8000001F:
+ if (!kvm_cpu_cap_has(X86_FEATURE_SEV)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ } else {
+ cpuid_entry_override(entry, CPUID_8000_001F_EAX);
+ /* Clear NumVMPL since KVM does not support VMPL. */
+ entry->ebx &= ~GENMASK(31, 12);
+ /*
+ * Enumerate '0' for "PA bits reduction", the adjusted
+ * MAXPHYADDR is enumerated directly (see 0x80000008).
+ */
+ entry->ebx &= ~GENMASK(11, 6);
+ }
+ break;
+ case 0x80000020:
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ case 0x80000021:
+ entry->ebx = entry->edx = 0;
+ cpuid_entry_override(entry, CPUID_8000_0021_EAX);
+ cpuid_entry_override(entry, CPUID_8000_0021_ECX);
+ break;
+ /* AMD Extended Performance Monitoring and Debug */
+ case 0x80000022: {
+ union cpuid_0x80000022_ebx ebx = { };
+
+ entry->ecx = entry->edx = 0;
+ if (!enable_pmu || !kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) {
+ entry->eax = entry->ebx = 0;
+ break;
+ }
+
+ cpuid_entry_override(entry, CPUID_8000_0022_EAX);
+
+ ebx.split.num_core_pmc = kvm_pmu_cap.num_counters_gp;
+ entry->ebx = ebx.full;
+ break;
+ }
+	/* Add support for Centaur's CPUID instruction. */
+	case 0xC0000000:
+		/* Just support up to 0xC0000004 for now. */
+ entry->eax = min(entry->eax, 0xC0000004);
+ break;
+ case 0xC0000001:
+ cpuid_entry_override(entry, CPUID_C000_0001_EDX);
+ break;
+ case 3: /* Processor serial number */
+ case 5: /* MONITOR/MWAIT */
+ case 0xC0000002:
+ case 0xC0000003:
+ case 0xC0000004:
+ default:
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+
+ r = 0;
+
+out:
+ put_cpu();
+
+ return r;
+}
+
+static int do_cpuid_func(struct kvm_cpuid_array *array, u32 func,
+ unsigned int type)
+{
+ if (type == KVM_GET_EMULATED_CPUID)
+ return __do_cpuid_func_emulated(array, func);
+
+ return __do_cpuid_func(array, func);
+}
+
+#define CENTAUR_CPUID_SIGNATURE 0xC0000000
+
+static int get_cpuid_func(struct kvm_cpuid_array *array, u32 func,
+ unsigned int type)
+{
+ u32 limit;
+ int r;
+
+ if (func == CENTAUR_CPUID_SIGNATURE &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_CENTAUR &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN)
+ return 0;
+
+ r = do_cpuid_func(array, func, type);
+ if (r)
+ return r;
+
+ limit = array->entries[array->nent - 1].eax;
+ for (func = func + 1; func <= limit; ++func) {
+ r = do_cpuid_func(array, func, type);
+ if (r)
+ break;
+ }
+
+ return r;
+}
+
+static bool sanity_check_entries(struct kvm_cpuid_entry2 __user *entries,
+ __u32 num_entries, unsigned int ioctl_type)
+{
+ int i;
+ __u32 pad[3];
+
+ if (ioctl_type != KVM_GET_EMULATED_CPUID)
+ return false;
+
+ /*
+ * We want to make sure that ->padding is being passed clean from
+ * userspace in case we want to use it for something in the future.
+ *
+	 * Sadly, this wasn't enforced for KVM_GET_SUPPORTED_CPUID, so we have
+	 * to content ourselves with checking only the emulated side. /me
+ * sheds a tear.
+ */
+ for (i = 0; i < num_entries; i++) {
+ if (copy_from_user(pad, entries[i].padding, sizeof(pad)))
+ return true;
+
+ if (pad[0] || pad[1] || pad[2])
+ return true;
+ }
+ return false;
+}
+
+int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries,
+ unsigned int type)
+{
+ static const u32 funcs[] = {
+ 0, 0x80000000, CENTAUR_CPUID_SIGNATURE, KVM_CPUID_SIGNATURE,
+ };
+
+ struct kvm_cpuid_array array = {
+ .nent = 0,
+ };
+ int r, i;
+
+ if (cpuid->nent < 1)
+ return -E2BIG;
+ if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+ cpuid->nent = KVM_MAX_CPUID_ENTRIES;
+
+ if (sanity_check_entries(entries, cpuid->nent, type))
+ return -EINVAL;
+
+ array.entries = kvcalloc(cpuid->nent, sizeof(struct kvm_cpuid_entry2), GFP_KERNEL);
+ if (!array.entries)
+ return -ENOMEM;
+
+ array.maxnent = cpuid->nent;
+
+ for (i = 0; i < ARRAY_SIZE(funcs); i++) {
+ r = get_cpuid_func(&array, funcs[i], type);
+ if (r)
+ goto out_free;
+ }
+ cpuid->nent = array.nent;
+
+ if (copy_to_user(entries, array.entries,
+ array.nent * sizeof(struct kvm_cpuid_entry2)))
+ r = -EFAULT;
+
+out_free:
+ kvfree(array.entries);
+ return r;
+}
+
+/*
+ * Intel CPUID semantics treats any query for an out-of-range leaf as if the
+ * highest basic leaf (i.e. CPUID.0H:EAX) were requested. AMD CPUID semantics
+ * returns all zeroes for any undefined leaf, whether or not the leaf is in
+ * range. Centaur/VIA follows Intel semantics.
+ *
+ * A leaf is considered out-of-range if its function is higher than the maximum
+ * supported leaf of its associated class or if its associated class does not
+ * exist.
+ *
+ * There are four primary classes to be considered, with their respective
+ * ranges described as "<base> - <top>[,<base2> - <top2>]" inclusive. A primary
+ * class exists if a guest CPUID entry for its <base> leaf exists. For a given
+ * class, CPUID.<base>.EAX contains the max supported leaf for the class.
+ *
+ * - Basic: 0x00000000 - 0x3fffffff, 0x50000000 - 0x7fffffff
+ * - Hypervisor: 0x40000000 - 0x4fffffff
+ * - Extended: 0x80000000 - 0xbfffffff
+ * - Centaur: 0xc0000000 - 0xcfffffff
+ *
+ * The Hypervisor class is further subdivided into sub-classes that each act as
+ * their own independent class associated with a 0x100 byte range. E.g. if QEMU
+ * is advertising support for both HyperV and KVM, the resulting Hypervisor
+ * CPUID sub-classes are:
+ *
+ * - HyperV: 0x40000000 - 0x400000ff
+ * - KVM: 0x40000100 - 0x400001ff
+ */
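+/*
+ * For example (illustrative only): with Intel semantics, if the guest's max
+ * basic leaf (CPUID.0.EAX) is 0xd and the guest queries leaf 0x16, for which
+ * no entry exists, the lookup is redirected to leaf 0xd while preserving the
+ * originally requested ECX index.
+ */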
+static struct kvm_cpuid_entry2 *
+get_out_of_range_cpuid_entry(struct kvm_vcpu *vcpu, u32 *fn_ptr, u32 index)
+{
+ struct kvm_cpuid_entry2 *basic, *class;
+ u32 function = *fn_ptr;
+
+ basic = kvm_find_cpuid_entry(vcpu, 0);
+ if (!basic)
+ return NULL;
+
+ if (is_guest_vendor_amd(basic->ebx, basic->ecx, basic->edx) ||
+ is_guest_vendor_hygon(basic->ebx, basic->ecx, basic->edx))
+ return NULL;
+
+ if (function >= 0x40000000 && function <= 0x4fffffff)
+ class = kvm_find_cpuid_entry(vcpu, function & 0xffffff00);
+ else if (function >= 0xc0000000)
+ class = kvm_find_cpuid_entry(vcpu, 0xc0000000);
+ else
+ class = kvm_find_cpuid_entry(vcpu, function & 0x80000000);
+
+ if (class && function <= class->eax)
+ return NULL;
+
+ /*
+ * Leaf specific adjustments are also applied when redirecting to the
+ * max basic entry, e.g. if the max basic leaf is 0xb but there is no
+ * entry for CPUID.0xb.index (see below), then the output value for EDX
+ * needs to be pulled from CPUID.0xb.1.
+ */
+ *fn_ptr = basic->eax;
+
+ /*
+ * The class does not exist or the requested function is out of range;
+ * the effective CPUID entry is the max basic leaf. Note, the index of
+ * the original requested leaf is observed!
+ */
+ return kvm_find_cpuid_entry_index(vcpu, basic->eax, index);
+}
+
+bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
+ u32 *ecx, u32 *edx, bool exact_only)
+{
+ u32 orig_function = *eax, function = *eax, index = *ecx;
+ struct kvm_cpuid_entry2 *entry;
+ bool exact, used_max_basic = false;
+
+ if (vcpu->arch.cpuid_dynamic_bits_dirty)
+ kvm_update_cpuid_runtime(vcpu);
+
+ entry = kvm_find_cpuid_entry_index(vcpu, function, index);
+ exact = !!entry;
+
+ if (!entry && !exact_only) {
+ entry = get_out_of_range_cpuid_entry(vcpu, &function, index);
+ used_max_basic = !!entry;
+ }
+
+ if (entry) {
+ *eax = entry->eax;
+ *ebx = entry->ebx;
+ *ecx = entry->ecx;
+ *edx = entry->edx;
+ if (function == 7 && index == 0) {
+ u64 data;
+ if ((*ebx & (feature_bit(RTM) | feature_bit(HLE))) &&
+ !kvm_msr_read(vcpu, MSR_IA32_TSX_CTRL, &data) &&
+ (data & TSX_CTRL_CPUID_CLEAR))
+ *ebx &= ~(feature_bit(RTM) | feature_bit(HLE));
+ } else if (function == 0x80000007) {
+ if (kvm_hv_invtsc_suppressed(vcpu))
+ *edx &= ~feature_bit(CONSTANT_TSC);
+ } else if (IS_ENABLED(CONFIG_KVM_XEN) &&
+ kvm_xen_is_tsc_leaf(vcpu, function)) {
+ /*
+ * Update guest TSC frequency information if necessary.
+ * Ignore failures, there is no sane value that can be
+ * provided if KVM can't get the TSC frequency.
+ */
+ if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu))
+ kvm_guest_time_update(vcpu);
+
+ if (index == 1) {
+ *ecx = vcpu->arch.pvclock_tsc_mul;
+ *edx = vcpu->arch.pvclock_tsc_shift;
+ } else if (index == 2) {
+ *eax = vcpu->arch.hw_tsc_khz;
+ }
+ }
+ } else {
+ *eax = *ebx = *ecx = *edx = 0;
+ /*
+ * When leaf 0BH or 1FH is defined, CL is pass-through
+ * and EDX is always the x2APIC ID, even for undefined
+ * subleaves. Index 1 will exist iff the leaf is
+ * implemented, so we pass through CL iff leaf 1
+ * exists. EDX can be copied from any existing index.
+ */
+ if (function == 0xb || function == 0x1f) {
+ entry = kvm_find_cpuid_entry_index(vcpu, function, 1);
+ if (entry) {
+ *ecx = index & 0xff;
+ *edx = entry->edx;
+ }
+ }
+ }
+ trace_kvm_cpuid(orig_function, index, *eax, *ebx, *ecx, *edx, exact,
+ used_max_basic);
+ return exact;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_cpuid);
+
+int kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
+{
+ u32 eax, ebx, ecx, edx;
+
+ if (cpuid_fault_enabled(vcpu) && !kvm_require_cpl(vcpu, 0))
+ return 1;
+
+ eax = kvm_rax_read(vcpu);
+ ecx = kvm_rcx_read(vcpu);
+ kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false);
+ kvm_rax_write(vcpu, eax);
+ kvm_rbx_write(vcpu, ebx);
+ kvm_rcx_write(vcpu, ecx);
+ kvm_rdx_write(vcpu, edx);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_cpuid);
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
new file mode 100644
index 000000000000..d3f5ae15a7ca
--- /dev/null
+++ b/arch/x86/kvm/cpuid.h
@@ -0,0 +1,293 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ARCH_X86_KVM_CPUID_H
+#define ARCH_X86_KVM_CPUID_H
+
+#include "reverse_cpuid.h"
+#include <asm/cpu.h>
+#include <asm/processor.h>
+#include <uapi/asm/kvm_para.h>
+
+extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
+void kvm_set_cpu_caps(void);
+
+void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu);
+struct kvm_cpuid_entry2 *kvm_find_cpuid_entry2(struct kvm_cpuid_entry2 *entries,
+ int nent, u32 function, u64 index);
+/*
+ * Magic value used by KVM when querying userspace-provided CPUID entries
+ * without caring about the CPUID index, as the index of the function in
+ * question is not significant. Note, this magic value must have at least one
+ * bit set in bits[63:32] and must be consumed as a u64 by kvm_find_cpuid_entry2()
+ * to avoid false positives when processing guest CPUID input.
+ *
+ * KVM_CPUID_INDEX_NOT_SIGNIFICANT should never be used directly outside of
+ * kvm_find_cpuid_entry2() and kvm_find_cpuid_entry().
+ */
+#define KVM_CPUID_INDEX_NOT_SIGNIFICANT -1ull
+
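+/*
+ * E.g. kvm_find_cpuid_entry_index() must be used for index-significant leafs
+ * such as 0x7 or 0xd, whereas kvm_find_cpuid_entry() suffices for leafs like
+ * 0x1 where the index is ignored.
+ */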
+static inline struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu,
+ u32 function, u32 index)
+{
+ return kvm_find_cpuid_entry2(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent,
+ function, index);
+}
+
+static inline struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
+ u32 function)
+{
+ return kvm_find_cpuid_entry2(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent,
+ function, KVM_CPUID_INDEX_NOT_SIGNIFICANT);
+}
+
+int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries,
+ unsigned int type);
+int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid *cpuid,
+ struct kvm_cpuid_entry __user *entries);
+int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries);
+int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries);
+bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
+ u32 *ecx, u32 *edx, bool exact_only);
+
+void __init kvm_init_xstate_sizes(void);
+u32 xstate_required_size(u64 xstate_bv, bool compacted);
+
+int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu);
+int cpuid_query_maxguestphyaddr(struct kvm_vcpu *vcpu);
+u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu);
+
+static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.maxphyaddr;
+}
+
+static inline bool kvm_vcpu_is_legal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+ return !(gpa & vcpu->arch.reserved_gpa_bits);
+}
+
+static inline bool kvm_vcpu_is_legal_aligned_gpa(struct kvm_vcpu *vcpu,
+ gpa_t gpa, gpa_t alignment)
+{
+ return IS_ALIGNED(gpa, alignment) && kvm_vcpu_is_legal_gpa(vcpu, gpa);
+}
+
+static inline bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+ return kvm_vcpu_is_legal_aligned_gpa(vcpu, gpa, PAGE_SIZE);
+}
+
+static __always_inline void cpuid_entry_override(struct kvm_cpuid_entry2 *entry,
+ unsigned int leaf)
+{
+ u32 *reg = cpuid_entry_get_reg(entry, leaf * 32);
+
+ BUILD_BUG_ON(leaf >= ARRAY_SIZE(kvm_cpu_caps));
+ *reg = kvm_cpu_caps[leaf];
+}
+
+static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu,
+ unsigned int x86_feature)
+{
+ const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature);
+ struct kvm_cpuid_entry2 *entry;
+ u32 *reg;
+
+ /*
+ * XSAVES is a special snowflake. Due to lack of a dedicated intercept
+ * on SVM, KVM must assume that XSAVES (and thus XRSTORS) is usable by
+ * the guest if the host supports XSAVES and *XSAVE* is exposed to the
+ * guest. Because the guest can execute XSAVES and XRSTORS, i.e. can
+ * indirectly consume XSS, KVM must ensure XSS is zeroed when running
+ * the guest, i.e. must set XSAVES in vCPU capabilities. But to reject
+ * direct XSS reads and writes (to minimize the virtualization hole and
+ * honor userspace's CPUID), KVM needs to check the raw guest CPUID,
+ * not KVM's view of guest capabilities.
+ *
+ * For all other features, guest capabilities are accurate. Expand
+ * this allowlist with extreme vigilance.
+ */
+ BUILD_BUG_ON(x86_feature != X86_FEATURE_XSAVES);
+
+ entry = kvm_find_cpuid_entry_index(vcpu, cpuid.function, cpuid.index);
+ if (!entry)
+		return false;
+
+ reg = __cpuid_entry_get_reg(entry, cpuid.reg);
+ if (!reg)
+ return false;
+
+ return *reg & __feature_bit(x86_feature);
+}
+
+static inline bool guest_cpuid_is_amd_compatible(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.is_amd_compatible;
+}
+
+static inline bool guest_cpuid_is_intel_compatible(struct kvm_vcpu *vcpu)
+{
+ return !guest_cpuid_is_amd_compatible(vcpu);
+}
+
+static inline int guest_cpuid_family(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x1);
+ if (!best)
+ return -1;
+
+ return x86_family(best->eax);
+}
+
+static inline int guest_cpuid_model(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x1);
+ if (!best)
+ return -1;
+
+ return x86_model(best->eax);
+}
+
+static inline bool cpuid_model_is_consistent(struct kvm_vcpu *vcpu)
+{
+ return boot_cpu_data.x86_model == guest_cpuid_model(vcpu);
+}
+
+static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x1);
+ if (!best)
+ return -1;
+
+ return x86_stepping(best->eax);
+}
+
+static inline bool supports_cpuid_fault(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.msr_platform_info & MSR_PLATFORM_INFO_CPUID_FAULT;
+}
+
+static inline bool cpuid_fault_enabled(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.msr_misc_features_enables &
+ MSR_MISC_FEATURES_ENABLES_CPUID_FAULT;
+}
+
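+/*
+ * For features in kernel-defined words, __feature_leaf() is simply
+ * X86_FEATURE_* / 32 and __feature_bit() is the bit within that word, e.g.
+ * X86_FEATURE_XSAVES (word 10, bit 3) lands in kvm_cpu_caps[CPUID_D_1_EAX]
+ * as BIT(3); scattered KVM-only features are first translated to their
+ * KVM-defined leaf (see reverse_cpuid.h).
+ */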
+static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature)
+{
+ unsigned int x86_leaf = __feature_leaf(x86_feature);
+
+ kvm_cpu_caps[x86_leaf] &= ~__feature_bit(x86_feature);
+}
+
+static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature)
+{
+ unsigned int x86_leaf = __feature_leaf(x86_feature);
+
+ kvm_cpu_caps[x86_leaf] |= __feature_bit(x86_feature);
+}
+
+static __always_inline u32 kvm_cpu_cap_get(unsigned int x86_feature)
+{
+ unsigned int x86_leaf = __feature_leaf(x86_feature);
+
+ return kvm_cpu_caps[x86_leaf] & __feature_bit(x86_feature);
+}
+
+static __always_inline bool kvm_cpu_cap_has(unsigned int x86_feature)
+{
+ return !!kvm_cpu_cap_get(x86_feature);
+}
+
+static __always_inline void kvm_cpu_cap_check_and_set(unsigned int x86_feature)
+{
+ if (boot_cpu_has(x86_feature))
+ kvm_cpu_cap_set(x86_feature);
+}
+
+static __always_inline bool guest_pv_has(struct kvm_vcpu *vcpu,
+ unsigned int kvm_feature)
+{
+ if (!vcpu->arch.pv_cpuid.enforce)
+ return true;
+
+ return vcpu->arch.pv_cpuid.features & (1u << kvm_feature);
+}
+
+static __always_inline void guest_cpu_cap_set(struct kvm_vcpu *vcpu,
+ unsigned int x86_feature)
+{
+ unsigned int x86_leaf = __feature_leaf(x86_feature);
+
+ vcpu->arch.cpu_caps[x86_leaf] |= __feature_bit(x86_feature);
+}
+
+static __always_inline void guest_cpu_cap_clear(struct kvm_vcpu *vcpu,
+ unsigned int x86_feature)
+{
+ unsigned int x86_leaf = __feature_leaf(x86_feature);
+
+ vcpu->arch.cpu_caps[x86_leaf] &= ~__feature_bit(x86_feature);
+}
+
+static __always_inline void guest_cpu_cap_change(struct kvm_vcpu *vcpu,
+ unsigned int x86_feature,
+ bool guest_has_cap)
+{
+ if (guest_has_cap)
+ guest_cpu_cap_set(vcpu, x86_feature);
+ else
+ guest_cpu_cap_clear(vcpu, x86_feature);
+}
+
+static __always_inline bool guest_cpu_cap_has(struct kvm_vcpu *vcpu,
+ unsigned int x86_feature)
+{
+ unsigned int x86_leaf = __feature_leaf(x86_feature);
+
+ /*
+ * Except for MWAIT, querying dynamic feature bits is disallowed, so
+ * that KVM can defer runtime updates until the next CPUID emulation.
+ */
+ BUILD_BUG_ON(x86_feature == X86_FEATURE_APIC ||
+ x86_feature == X86_FEATURE_OSXSAVE ||
+ x86_feature == X86_FEATURE_OSPKE);
+
+ return vcpu->arch.cpu_caps[x86_leaf] & __feature_bit(x86_feature);
+}
+
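+/*
+ * CR3 bits 61 (LAM_U57) and 62 (LAM_U48) are not part of the physical
+ * address, so strip them before the reserved-GPA check when the guest
+ * supports LAM.
+ */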
+static inline bool kvm_vcpu_is_legal_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+{
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_LAM))
+ cr3 &= ~(X86_CR3_LAM_U48 | X86_CR3_LAM_U57);
+
+ return kvm_vcpu_is_legal_gpa(vcpu, cr3);
+}
+
+static inline bool guest_has_spec_ctrl_msr(struct kvm_vcpu *vcpu)
+{
+ return (guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL) ||
+ guest_cpu_cap_has(vcpu, X86_FEATURE_AMD_STIBP) ||
+ guest_cpu_cap_has(vcpu, X86_FEATURE_AMD_IBRS) ||
+ guest_cpu_cap_has(vcpu, X86_FEATURE_AMD_SSBD));
+}
+
+static inline bool guest_has_pred_cmd_msr(struct kvm_vcpu *vcpu)
+{
+ return (guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL) ||
+ guest_cpu_cap_has(vcpu, X86_FEATURE_AMD_IBPB) ||
+ guest_cpu_cap_has(vcpu, X86_FEATURE_SBPB));
+}
+
+#endif
diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
new file mode 100644
index 000000000000..999227fc7c66
--- /dev/null
+++ b/arch/x86/kvm/debugfs.c
@@ -0,0 +1,196 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * Copyright 2016 Red Hat, Inc. and/or its affiliates.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_host.h>
+#include <linux/debugfs.h>
+#include "lapic.h"
+#include "mmu.h"
+#include "mmu/mmu_internal.h"
+
+static int vcpu_get_timer_advance_ns(void *data, u64 *val)
+{
+ struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
+ *val = vcpu->arch.apic->lapic_timer.timer_advance_ns;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_timer_advance_ns_fops, vcpu_get_timer_advance_ns, NULL, "%llu\n");
+
+static int vcpu_get_guest_mode(void *data, u64 *val)
+{
+ struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
+ *val = vcpu->stat.guest_mode;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_guest_mode_fops, vcpu_get_guest_mode, NULL, "%lld\n");
+
+static int vcpu_get_tsc_offset(void *data, u64 *val)
+{
+ struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
+ *val = vcpu->arch.tsc_offset;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_offset_fops, vcpu_get_tsc_offset, NULL, "%lld\n");
+
+static int vcpu_get_tsc_scaling_ratio(void *data, u64 *val)
+{
+ struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
+ *val = vcpu->arch.tsc_scaling_ratio;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_fops, vcpu_get_tsc_scaling_ratio, NULL, "%llu\n");
+
+static int vcpu_get_tsc_scaling_frac_bits(void *data, u64 *val)
+{
+ *val = kvm_caps.tsc_scaling_ratio_frac_bits;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_frac_fops, vcpu_get_tsc_scaling_frac_bits, NULL, "%llu\n");
+
+void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_dentry)
+{
+ debugfs_create_file("guest_mode", 0444, debugfs_dentry, vcpu,
+ &vcpu_guest_mode_fops);
+ debugfs_create_file("tsc-offset", 0444, debugfs_dentry, vcpu,
+ &vcpu_tsc_offset_fops);
+
+ if (lapic_in_kernel(vcpu))
+ debugfs_create_file("lapic_timer_advance_ns", 0444,
+ debugfs_dentry, vcpu,
+ &vcpu_timer_advance_ns_fops);
+
+ if (kvm_caps.has_tsc_control) {
+ debugfs_create_file("tsc-scaling-ratio", 0444,
+ debugfs_dentry, vcpu,
+ &vcpu_tsc_scaling_fops);
+ debugfs_create_file("tsc-scaling-ratio-frac-bits", 0444,
+ debugfs_dentry, vcpu,
+ &vcpu_tsc_scaling_frac_fops);
+ }
+}
+
+/*
+ * This covers statistics <1024 (11 = log2(1024) + 1), which should be enough
+ * to
+ * cover RMAP_RECYCLE_THRESHOLD.
+ */
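+/* I.e. the ffs()-based buckets below are: 0, 1, 2-3, 4-7, ..., 512-1023. */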
+#define RMAP_LOG_SIZE 11
+
+static const char *kvm_lpage_str[KVM_NR_PAGE_SIZES] = { "4K", "2M", "1G" };
+
+static int kvm_mmu_rmaps_stat_show(struct seq_file *m, void *v)
+{
+ struct kvm_rmap_head *rmap;
+ struct kvm *kvm = m->private;
+ struct kvm_memory_slot *slot;
+ struct kvm_memslots *slots;
+ unsigned int lpage_size, index;
+ /* Still small enough to be on the stack */
+ unsigned int *log[KVM_NR_PAGE_SIZES], *cur;
+ int i, j, k, l, ret;
+
+ if (!kvm_memslots_have_rmaps(kvm))
+ return 0;
+
+ ret = -ENOMEM;
+ memset(log, 0, sizeof(log));
+ for (i = 0; i < KVM_NR_PAGE_SIZES; i++) {
+ log[i] = kcalloc(RMAP_LOG_SIZE, sizeof(unsigned int), GFP_KERNEL);
+ if (!log[i])
+ goto out;
+ }
+
+ mutex_lock(&kvm->slots_lock);
+ write_lock(&kvm->mmu_lock);
+
+ for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
+ int bkt;
+
+ slots = __kvm_memslots(kvm, i);
+ kvm_for_each_memslot(slot, bkt, slots)
+ for (k = 0; k < KVM_NR_PAGE_SIZES; k++) {
+ rmap = slot->arch.rmap[k];
+ lpage_size = kvm_mmu_slot_lpages(slot, k + 1);
+ cur = log[k];
+ for (l = 0; l < lpage_size; l++) {
+ index = ffs(pte_list_count(&rmap[l]));
+ if (WARN_ON_ONCE(index >= RMAP_LOG_SIZE))
+ index = RMAP_LOG_SIZE - 1;
+ cur[index]++;
+ }
+ }
+ }
+
+ write_unlock(&kvm->mmu_lock);
+ mutex_unlock(&kvm->slots_lock);
+
+ /* index=0 counts no rmap; index=1 counts 1 rmap */
+ seq_printf(m, "Rmap_Count:\t0\t1\t");
+ for (i = 2; i < RMAP_LOG_SIZE; i++) {
+ j = 1 << (i - 1);
+ k = (1 << i) - 1;
+ seq_printf(m, "%d-%d\t", j, k);
+ }
+ seq_printf(m, "\n");
+
+ for (i = 0; i < KVM_NR_PAGE_SIZES; i++) {
+ seq_printf(m, "Level=%s:\t", kvm_lpage_str[i]);
+ cur = log[i];
+ for (j = 0; j < RMAP_LOG_SIZE; j++)
+ seq_printf(m, "%d\t", cur[j]);
+ seq_printf(m, "\n");
+ }
+
+ ret = 0;
+out:
+ for (i = 0; i < KVM_NR_PAGE_SIZES; i++)
+ kfree(log[i]);
+
+ return ret;
+}
+
+static int kvm_mmu_rmaps_stat_open(struct inode *inode, struct file *file)
+{
+ struct kvm *kvm = inode->i_private;
+ int r;
+
+ if (!kvm_get_kvm_safe(kvm))
+ return -ENOENT;
+
+ r = single_open(file, kvm_mmu_rmaps_stat_show, kvm);
+ if (r < 0)
+ kvm_put_kvm(kvm);
+
+ return r;
+}
+
+static int kvm_mmu_rmaps_stat_release(struct inode *inode, struct file *file)
+{
+ struct kvm *kvm = inode->i_private;
+
+ kvm_put_kvm(kvm);
+
+ return single_release(inode, file);
+}
+
+static const struct file_operations mmu_rmaps_stat_fops = {
+ .owner = THIS_MODULE,
+ .open = kvm_mmu_rmaps_stat_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = kvm_mmu_rmaps_stat_release,
+};
+
+void kvm_arch_create_vm_debugfs(struct kvm *kvm)
+{
+ debugfs_create_file("mmu_rmaps_stat", 0644, kvm->debugfs_dentry, kvm,
+ &mmu_rmaps_stat_fops);
+}
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
new file mode 100644
index 000000000000..c8e292e9a24d
--- /dev/null
+++ b/arch/x86/kvm/emulate.c
@@ -0,0 +1,5645 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/******************************************************************************
+ * emulate.c
+ *
+ * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
+ *
+ * Copyright (c) 2005 Keir Fraser
+ *
+ * Linux coding style, mod r/m decoder, segment base fixes, real-mode
+ * privileged instructions:
+ *
+ * Copyright (C) 2006 Qumranet
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Avi Kivity <avi@qumranet.com>
+ * Yaniv Kamay <yaniv@qumranet.com>
+ *
+ * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_host.h>
+#include "kvm_cache_regs.h"
+#include "kvm_emulate.h"
+#include <linux/stringify.h>
+#include <asm/debugreg.h>
+#include <asm/nospec-branch.h>
+#include <asm/ibt.h>
+#include <asm/text-patching.h>
+
+#include "x86.h"
+#include "tss.h"
+#include "mmu.h"
+#include "pmu.h"
+
+/*
+ * Operand types
+ */
+#define OpNone 0ull
+#define OpImplicit 1ull /* No generic decode */
+#define OpReg 2ull /* Register */
+#define OpMem 3ull /* Memory */
+#define OpAcc 4ull /* Accumulator: AL/AX/EAX/RAX */
+#define OpDI 5ull /* ES:DI/EDI/RDI */
+#define OpMem64 6ull /* Memory, 64-bit */
+#define OpImmUByte 7ull /* Zero-extended 8-bit immediate */
+#define OpDX 8ull /* DX register */
+#define OpCL 9ull /* CL register (for shifts) */
+#define OpImmByte 10ull /* 8-bit sign extended immediate */
+#define OpOne 11ull /* Implied 1 */
+#define OpImm 12ull /* Sign extended up to 32-bit immediate */
+#define OpMem16 13ull /* Memory operand (16-bit). */
+#define OpMem32 14ull /* Memory operand (32-bit). */
+#define OpImmU 15ull /* Immediate operand, zero extended */
+#define OpSI 16ull /* SI/ESI/RSI */
+#define OpImmFAddr 17ull /* Immediate far address */
+#define OpMemFAddr 18ull /* Far address in memory */
+#define OpImmU16 19ull /* Immediate operand, 16 bits, zero extended */
+#define OpES 20ull /* ES */
+#define OpCS 21ull /* CS */
+#define OpSS 22ull /* SS */
+#define OpDS 23ull /* DS */
+#define OpFS 24ull /* FS */
+#define OpGS 25ull /* GS */
+#define OpMem8 26ull /* 8-bit zero extended memory operand */
+#define OpImm64 27ull /* Sign extended 16/32/64-bit immediate */
+#define OpXLat 28ull /* memory at BX/EBX/RBX + zero-extended AL */
+#define OpAccLo 29ull /* Low part of extended acc (AX/AX/EAX/RAX) */
+#define OpAccHi 30ull /* High part of extended acc (-/DX/EDX/RDX) */
+
+#define OpBits 5 /* Width of operand field */
+#define OpMask ((1ull << OpBits) - 1)
+
+/*
+ * Opcode effective-address decode tables.
+ * Note that we only emulate instructions that have at least one memory
+ * operand (excluding implicit stack references). We assume that stack
+ * references and instruction fetches will never occur in special memory
+ * areas that require emulation. So, for example, 'mov <imm>,<reg>' need
+ * not be handled.
+ */
+
+/* Operand sizes: 8-bit operands or specified/overridden size. */
+#define ByteOp (1<<0) /* 8-bit operands. */
+#define DstShift 1 /* Destination operand type at bits 1-5 */
+#define ImplicitOps (OpImplicit << DstShift)
+#define DstReg (OpReg << DstShift)
+#define DstMem (OpMem << DstShift)
+#define DstAcc (OpAcc << DstShift)
+#define DstDI (OpDI << DstShift)
+#define DstMem64 (OpMem64 << DstShift)
+#define DstMem16 (OpMem16 << DstShift)
+#define DstImmUByte (OpImmUByte << DstShift)
+#define DstDX (OpDX << DstShift)
+#define DstAccLo (OpAccLo << DstShift)
+#define DstMask (OpMask << DstShift)
+#define SrcShift 6 /* Source operand type at bits 6-10 */
+#define SrcNone (OpNone << SrcShift)
+#define SrcReg (OpReg << SrcShift)
+#define SrcMem (OpMem << SrcShift)
+#define SrcMem16 (OpMem16 << SrcShift)
+#define SrcMem32 (OpMem32 << SrcShift)
+#define SrcImm (OpImm << SrcShift)
+#define SrcImmByte (OpImmByte << SrcShift)
+#define SrcOne (OpOne << SrcShift)
+#define SrcImmUByte (OpImmUByte << SrcShift)
+#define SrcImmU (OpImmU << SrcShift)
+#define SrcSI (OpSI << SrcShift)
+#define SrcXLat (OpXLat << SrcShift)
+#define SrcImmFAddr (OpImmFAddr << SrcShift)
+#define SrcMemFAddr (OpMemFAddr << SrcShift)
+#define SrcAcc (OpAcc << SrcShift)
+#define SrcImmU16 (OpImmU16 << SrcShift)
+#define SrcImm64 (OpImm64 << SrcShift)
+#define SrcDX (OpDX << SrcShift)
+#define SrcMem8 (OpMem8 << SrcShift)
+#define SrcAccHi (OpAccHi << SrcShift)
+#define SrcMask (OpMask << SrcShift)
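+/*
+ * E.g. a flags value of (DstReg | SrcMem | ModRM) packs OpReg into the
+ * destination field (bits 1-5) and OpMem into the source field (bits 6-10),
+ * the typical "reg, r/m" form (illustrative decoding of the fields above).
+ */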
+#define BitOp (1<<11)
+#define MemAbs (1<<12) /* Memory operand is absolute displacement */
+#define String (1<<13) /* String instruction (rep capable) */
+#define Stack (1<<14) /* Stack instruction (push/pop) */
+#define GroupMask (7<<15) /* Group mechanisms, at bits 15-17 */
+#define Group (1<<15) /* Bits 3:5 of modrm byte extend opcode */
+#define GroupDual (2<<15) /* Alternate decoding of mod == 3 */
+#define Prefix (3<<15) /* Instruction varies with 66/f2/f3 prefix */
+#define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */
+#define Escape (5<<15) /* Escape to coprocessor instruction */
+#define InstrDual (6<<15) /* Alternate instruction decoding of mod == 3 */
+#define ModeDual (7<<15) /* Different instruction for 32/64 bit */
+#define Sse (1<<18) /* SSE Vector instruction */
+#define ModRM (1<<19) /* Generic ModRM decode. */
+#define Mov (1<<20) /* Destination is only written; never read. */
+#define Prot (1<<21) /* instruction generates #UD if not in prot-mode */
+#define EmulateOnUD (1<<22) /* Emulate if unsupported by the host */
+#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */
+#define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */
+#define Undefined (1<<25) /* No Such Instruction */
+#define Lock (1<<26) /* lock prefix is allowed for the instruction */
+#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */
+#define No64 (1<<28) /* Instruction generates #UD in 64-bit mode */
+#define PageTable (1 << 29) /* instruction used to write page table */
+#define NotImpl (1 << 30) /* instruction is not implemented */
+#define Avx ((u64)1 << 31) /* Instruction uses VEX prefix */
+#define Src2Shift (32) /* Source 2 operand type at bits 32-36 */
+#define Src2None (OpNone << Src2Shift)
+#define Src2Mem (OpMem << Src2Shift)
+#define Src2CL (OpCL << Src2Shift)
+#define Src2ImmByte (OpImmByte << Src2Shift)
+#define Src2One (OpOne << Src2Shift)
+#define Src2Imm (OpImm << Src2Shift)
+#define Src2ES (OpES << Src2Shift)
+#define Src2CS (OpCS << Src2Shift)
+#define Src2SS (OpSS << Src2Shift)
+#define Src2DS (OpDS << Src2Shift)
+#define Src2FS (OpFS << Src2Shift)
+#define Src2GS (OpGS << Src2Shift)
+#define Src2Mask (OpMask << Src2Shift)
+/* free: 37-39 */
+#define Mmx ((u64)1 << 40) /* MMX Vector instruction */
+#define AlignMask ((u64)3 << 41) /* Memory alignment requirement at bits 41-42 */
+#define Aligned ((u64)1 << 41) /* Explicitly aligned (e.g. MOVDQA) */
+#define Unaligned ((u64)2 << 41) /* Explicitly unaligned (e.g. MOVDQU) */
+#define Aligned16 ((u64)3 << 41) /* Aligned to 16 byte boundary (e.g. FXSAVE) */
+/* free: 43-44 */
+#define NoWrite ((u64)1 << 45) /* No writeback */
+#define SrcWrite ((u64)1 << 46) /* Write back src operand */
+#define NoMod ((u64)1 << 47) /* Mod field is ignored */
+#define Intercept ((u64)1 << 48) /* Has valid intercept field */
+#define CheckPerm ((u64)1 << 49) /* Has valid check_perm field */
+#define PrivUD ((u64)1 << 51) /* #UD instead of #GP on CPL > 0 */
+#define NearBranch ((u64)1 << 52) /* Near branches */
+#define No16 ((u64)1 << 53) /* No 16 bit operand */
+#define IncSP ((u64)1 << 54) /* SP is incremented before ModRM calc */
+#define TwoMemOp ((u64)1 << 55) /* Instruction has two memory operands */
+#define IsBranch ((u64)1 << 56) /* Instruction is considered a branch. */
+#define ShadowStack ((u64)1 << 57) /* Instruction affects Shadow Stacks. */
+
+#define DstXacc (DstAccLo | SrcAccHi | SrcWrite)
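+
+/*
+ * Illustrative composition (not an entry copied from the tables below):
+ * DstReg | SrcMem | ModRM packs OpReg into bits 1-5 and OpMem into bits
+ * 6-10 and sets the generic ModRM-decode bit; the decoder later recovers
+ * each operand type as (ctxt->d >> DstShift) & OpMask and so on.
+ */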
+
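+/*
+ * Repetition helpers for the opcode tables: X2(x) expands to two copies of
+ * its argument, X16(x) to sixteen, so long runs of identical table entries
+ * can be written once.
+ */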
+#define X2(x...) x, x
+#define X3(x...) X2(x), x
+#define X4(x...) X2(x), X2(x)
+#define X5(x...) X4(x), x
+#define X6(x...) X4(x), X2(x)
+#define X7(x...) X4(x), X3(x)
+#define X8(x...) X4(x), X4(x)
+#define X16(x...) X8(x), X8(x)
+
+struct opcode {
+ u64 flags;
+ u8 intercept;
+ u8 pad[7];
+ union {
+ int (*execute)(struct x86_emulate_ctxt *ctxt);
+ const struct opcode *group;
+ const struct group_dual *gdual;
+ const struct gprefix *gprefix;
+ const struct escape *esc;
+ const struct instr_dual *idual;
+ const struct mode_dual *mdual;
+ } u;
+ int (*check_perm)(struct x86_emulate_ctxt *ctxt);
+};
+
+struct group_dual {
+ struct opcode mod012[8];
+ struct opcode mod3[8];
+};
+
+struct gprefix {
+ struct opcode pfx_no;
+ struct opcode pfx_66;
+ struct opcode pfx_f2;
+ struct opcode pfx_f3;
+};
+
+struct escape {
+ struct opcode op[8];
+ struct opcode high[64];
+};
+
+struct instr_dual {
+ struct opcode mod012;
+ struct opcode mod3;
+};
+
+struct mode_dual {
+ struct opcode mode32;
+ struct opcode mode64;
+};
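+
+/*
+ * Each GroupMask dispatch value above selects one of these secondary tables
+ * through the matching union member of struct opcode: Group uses u.group,
+ * GroupDual u.gdual, Prefix u.gprefix, Escape u.esc, InstrDual u.idual and
+ * ModeDual u.mdual. (Sketch of the dispatch; the decoder has the details.)
+ */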
+
+#define EFLG_RESERVED_ZEROS_MASK 0xffc0802a
+
+enum x86_transfer_type {
+ X86_TRANSFER_NONE,
+ X86_TRANSFER_CALL_JMP,
+ X86_TRANSFER_RET,
+ X86_TRANSFER_TASK_SWITCH,
+};
+
+enum rex_bits {
+ REX_B = 1,
+ REX_X = 2,
+ REX_R = 4,
+ REX_W = 8,
+};
+
+static void writeback_registers(struct x86_emulate_ctxt *ctxt)
+{
+ unsigned long dirty = ctxt->regs_dirty;
+ unsigned reg;
+
+ for_each_set_bit(reg, &dirty, NR_EMULATOR_GPRS)
+ ctxt->ops->write_gpr(ctxt, reg, ctxt->_regs[reg]);
+}
+
+static void invalidate_registers(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->regs_dirty = 0;
+ ctxt->regs_valid = 0;
+}
+
+/*
+ * These EFLAGS bits are restored from saved value during emulation, and
+ * any changes are written back to the saved value after emulation.
+ */
+#define EFLAGS_MASK (X86_EFLAGS_OF|X86_EFLAGS_SF|X86_EFLAGS_ZF|X86_EFLAGS_AF|\
+ X86_EFLAGS_PF|X86_EFLAGS_CF)
+
+#ifdef CONFIG_X86_64
+#define ON64(x...) x
+#else
+#define ON64(x...)
+#endif
+
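+/*
+ * The EM_ASM_* macros below generate em_<op>() handlers that run the
+ * matching host instruction on the guest operands: the guest's arithmetic
+ * flags (EFLAGS_MASK, with IF kept set so host interrupts stay enabled) are
+ * loaded, the b/w/l/q variant selected by the operand size is executed on
+ * ctxt->dst/src/src2, and the resulting flags are folded back into
+ * ctxt->eflags. The _EX variants zero "ok" through the exception table so
+ * a faulting div/idiv is reported as #DE.
+ */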
+#define EM_ASM_START(op) \
+static int em_##op(struct x86_emulate_ctxt *ctxt) \
+{ \
+ unsigned long flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF; \
+ int bytes = 1, ok = 1; \
+ if (!(ctxt->d & ByteOp)) \
+ bytes = ctxt->dst.bytes; \
+ switch (bytes) {
+
+#define __EM_ASM(str) \
+ asm("push %[flags]; popf \n\t" \
+ "10: " str \
+ "pushf; pop %[flags] \n\t" \
+ "11: \n\t" \
+ : "+a" (ctxt->dst.val), \
+ "+d" (ctxt->src.val), \
+ [flags] "+D" (flags), \
+ "+S" (ok) \
+ : "c" (ctxt->src2.val))
+
+#define __EM_ASM_1(op, dst) \
+ __EM_ASM(#op " %%" #dst " \n\t")
+
+#define __EM_ASM_1_EX(op, dst) \
+ __EM_ASM(#op " %%" #dst " \n\t" \
+ _ASM_EXTABLE_TYPE_REG(10b, 11f, EX_TYPE_ZERO_REG, %%esi))
+
+#define __EM_ASM_2(op, dst, src) \
+ __EM_ASM(#op " %%" #src ", %%" #dst " \n\t")
+
+#define __EM_ASM_3(op, dst, src, src2) \
+ __EM_ASM(#op " %%" #src2 ", %%" #src ", %%" #dst " \n\t")
+
+#define EM_ASM_END \
+ } \
+ ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK); \
+ return !ok ? emulate_de(ctxt) : X86EMUL_CONTINUE; \
+}
+
+/* 1-operand, using "a" (dst) */
+#define EM_ASM_1(op) \
+ EM_ASM_START(op) \
+ case 1: __EM_ASM_1(op##b, al); break; \
+ case 2: __EM_ASM_1(op##w, ax); break; \
+ case 4: __EM_ASM_1(op##l, eax); break; \
+ ON64(case 8: __EM_ASM_1(op##q, rax); break;) \
+ EM_ASM_END
+
+/* 1-operand, using "c" (src2) */
+#define EM_ASM_1SRC2(op, name) \
+ EM_ASM_START(name) \
+ case 1: __EM_ASM_1(op##b, cl); break; \
+ case 2: __EM_ASM_1(op##w, cx); break; \
+ case 4: __EM_ASM_1(op##l, ecx); break; \
+ ON64(case 8: __EM_ASM_1(op##q, rcx); break;) \
+ EM_ASM_END
+
+/* 1-operand, using "c" (src2) with exception */
+#define EM_ASM_1SRC2EX(op, name) \
+ EM_ASM_START(name) \
+ case 1: __EM_ASM_1_EX(op##b, cl); break; \
+ case 2: __EM_ASM_1_EX(op##w, cx); break; \
+ case 4: __EM_ASM_1_EX(op##l, ecx); break; \
+ ON64(case 8: __EM_ASM_1_EX(op##q, rcx); break;) \
+ EM_ASM_END
+
+/* 2-operand, using "a" (dst), "d" (src) */
+#define EM_ASM_2(op) \
+ EM_ASM_START(op) \
+ case 1: __EM_ASM_2(op##b, al, dl); break; \
+ case 2: __EM_ASM_2(op##w, ax, dx); break; \
+ case 4: __EM_ASM_2(op##l, eax, edx); break; \
+ ON64(case 8: __EM_ASM_2(op##q, rax, rdx); break;) \
+ EM_ASM_END
+
+/* 2-operand, reversed */
+#define EM_ASM_2R(op, name) \
+ EM_ASM_START(name) \
+ case 1: __EM_ASM_2(op##b, dl, al); break; \
+ case 2: __EM_ASM_2(op##w, dx, ax); break; \
+ case 4: __EM_ASM_2(op##l, edx, eax); break; \
+ ON64(case 8: __EM_ASM_2(op##q, rdx, rax); break;) \
+ EM_ASM_END
+
+/* 2-operand, word only (no byte op) */
+#define EM_ASM_2W(op) \
+ EM_ASM_START(op) \
+ case 1: break; \
+ case 2: __EM_ASM_2(op##w, ax, dx); break; \
+ case 4: __EM_ASM_2(op##l, eax, edx); break; \
+ ON64(case 8: __EM_ASM_2(op##q, rax, rdx); break;) \
+ EM_ASM_END
+
+/* 2-operand, using "a" (dst) and CL (src2) */
+#define EM_ASM_2CL(op) \
+ EM_ASM_START(op) \
+ case 1: __EM_ASM_2(op##b, al, cl); break; \
+ case 2: __EM_ASM_2(op##w, ax, cl); break; \
+ case 4: __EM_ASM_2(op##l, eax, cl); break; \
+ ON64(case 8: __EM_ASM_2(op##q, rax, cl); break;) \
+ EM_ASM_END
+
+/* 3-operand, using "a" (dst), "d" (src) and CL (src2) */
+#define EM_ASM_3WCL(op) \
+ EM_ASM_START(op) \
+ case 1: break; \
+ case 2: __EM_ASM_3(op##w, ax, dx, cl); break; \
+ case 4: __EM_ASM_3(op##l, eax, edx, cl); break; \
+ ON64(case 8: __EM_ASM_3(op##q, rax, rdx, cl); break;) \
+ EM_ASM_END
+
+static int em_salc(struct x86_emulate_ctxt *ctxt)
+{
+ /*
+ * Set AL to 0xFF if CF is set, or to 0x00 when it is clear.
+ */
+ ctxt->dst.val = 0xFF * !!(ctxt->eflags & X86_EFLAGS_CF);
+ return X86EMUL_CONTINUE;
+}
+
+/*
+ * XXX: inoutclob user must know where the argument is being expanded.
+ * Using asm goto would allow us to remove _fault.
+ */
+#define asm_safe(insn, inoutclob...) \
+({ \
+ int _fault = 0; \
+ \
+ asm volatile("1:" insn "\n" \
+ "2:\n" \
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_ONE_REG, %[_fault]) \
+ : [_fault] "+r"(_fault) inoutclob ); \
+ \
+ _fault ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE; \
+})
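+
+/*
+ * Usage sketch (hypothetical operands): rc = asm_safe("fxrstor %[fx]",
+ * : [fx] "m"(fx_state)) runs the instruction once and, should it fault,
+ * evaluates to X86EMUL_UNHANDLEABLE instead of leaving an unhandled host
+ * exception.
+ */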
+
+static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
+ enum x86_intercept intercept,
+ enum x86_intercept_stage stage)
+{
+ struct x86_instruction_info info = {
+ .intercept = intercept,
+ .rep_prefix = ctxt->rep_prefix,
+ .modrm_mod = ctxt->modrm_mod,
+ .modrm_reg = ctxt->modrm_reg,
+ .modrm_rm = ctxt->modrm_rm,
+ .src_val = ctxt->src.val64,
+ .dst_val = ctxt->dst.val64,
+ .src_bytes = ctxt->src.bytes,
+ .dst_bytes = ctxt->dst.bytes,
+ .src_type = ctxt->src.type,
+ .dst_type = ctxt->dst.type,
+ .ad_bytes = ctxt->ad_bytes,
+ .rip = ctxt->eip,
+ .next_rip = ctxt->_eip,
+ };
+
+ return ctxt->ops->intercept(ctxt, &info, stage);
+}
+
+static void assign_masked(ulong *dest, ulong src, ulong mask)
+{
+ *dest = (*dest & ~mask) | (src & mask);
+}
+
+static void assign_register(unsigned long *reg, u64 val, int bytes)
+{
+ /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
+ switch (bytes) {
+ case 1:
+ *(u8 *)reg = (u8)val;
+ break;
+ case 2:
+ *(u16 *)reg = (u16)val;
+ break;
+ case 4:
+ *reg = (u32)val;
+ break; /* 64b: zero-extend */
+ case 8:
+ *reg = val;
+ break;
+ }
+}
+
+static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt)
+{
+ return (1UL << (ctxt->ad_bytes << 3)) - 1;
+}
+
+static ulong stack_mask(struct x86_emulate_ctxt *ctxt)
+{
+ u16 sel;
+ struct desc_struct ss;
+
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ return ~0UL;
+ ctxt->ops->get_segment(ctxt, &sel, &ss, NULL, VCPU_SREG_SS);
+ return ~0U >> ((ss.d ^ 1) * 16); /* d=0: 0xffff; d=1: 0xffffffff */
+}
+
+static int stack_size(struct x86_emulate_ctxt *ctxt)
+{
+ return (__fls(stack_mask(ctxt)) + 1) >> 3;
+}
+
+/* Access/update address held in a register, based on addressing mode. */
+static inline unsigned long
+address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg)
+{
+ if (ctxt->ad_bytes == sizeof(unsigned long))
+ return reg;
+ else
+ return reg & ad_mask(ctxt);
+}
+
+static inline unsigned long
+register_address(struct x86_emulate_ctxt *ctxt, int reg)
+{
+ return address_mask(ctxt, reg_read(ctxt, reg));
+}
+
+static void masked_increment(ulong *reg, ulong mask, int inc)
+{
+ assign_masked(reg, *reg + inc, mask);
+}
+
+static inline void
+register_address_increment(struct x86_emulate_ctxt *ctxt, int reg, int inc)
+{
+ ulong *preg = reg_rmw(ctxt, reg);
+
+ assign_register(preg, *preg + inc, ctxt->ad_bytes);
+}
+
+static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc)
+{
+ masked_increment(reg_rmw(ctxt, VCPU_REGS_RSP), stack_mask(ctxt), inc);
+}
+
+static u32 desc_limit_scaled(struct desc_struct *desc)
+{
+ u32 limit = get_desc_limit(desc);
+
+ return desc->g ? (limit << 12) | 0xfff : limit;
+}
+
+static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
+{
+ if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
+ return 0;
+
+ return ctxt->ops->get_cached_segment_base(ctxt, seg);
+}
+
+static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
+ u32 error, bool valid)
+{
+ if (KVM_EMULATOR_BUG_ON(vec > 0x1f, ctxt))
+ return X86EMUL_UNHANDLEABLE;
+
+ ctxt->exception.vector = vec;
+ ctxt->exception.error_code = error;
+ ctxt->exception.error_code_valid = valid;
+ return X86EMUL_PROPAGATE_FAULT;
+}
+
+static int emulate_db(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_exception(ctxt, DB_VECTOR, 0, false);
+}
+
+static int emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
+{
+ return emulate_exception(ctxt, GP_VECTOR, err, true);
+}
+
+static int emulate_ss(struct x86_emulate_ctxt *ctxt, int err)
+{
+ return emulate_exception(ctxt, SS_VECTOR, err, true);
+}
+
+static int emulate_ud(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_exception(ctxt, UD_VECTOR, 0, false);
+}
+
+static int emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
+{
+ return emulate_exception(ctxt, TS_VECTOR, err, true);
+}
+
+static int emulate_de(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_exception(ctxt, DE_VECTOR, 0, false);
+}
+
+static int emulate_nm(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_exception(ctxt, NM_VECTOR, 0, false);
+}
+
+static u16 get_segment_selector(struct x86_emulate_ctxt *ctxt, unsigned seg)
+{
+ u16 selector;
+ struct desc_struct desc;
+
+ ctxt->ops->get_segment(ctxt, &selector, &desc, NULL, seg);
+ return selector;
+}
+
+static void set_segment_selector(struct x86_emulate_ctxt *ctxt, u16 selector,
+ unsigned seg)
+{
+ u16 dummy;
+ u32 base3;
+ struct desc_struct desc;
+
+ ctxt->ops->get_segment(ctxt, &dummy, &desc, &base3, seg);
+ ctxt->ops->set_segment(ctxt, selector, &desc, base3, seg);
+}
+
+static inline u8 ctxt_virt_addr_bits(struct x86_emulate_ctxt *ctxt)
+{
+ return (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_LA57) ? 57 : 48;
+}
+
+static inline bool emul_is_noncanonical_address(u64 la,
+ struct x86_emulate_ctxt *ctxt,
+ unsigned int flags)
+{
+ return !ctxt->ops->is_canonical_addr(ctxt, la, flags);
+}
+
+/*
+ * x86 defines three classes of vector instructions: explicitly
+ * aligned, explicitly unaligned, and the rest, which change behaviour
+ * depending on whether they're AVX encoded or not.
+ *
+ * Also included is CMPXCHG16B which is not a vector instruction, yet it is
+ * subject to the same check. FXSAVE and FXRSTOR are checked here too as their
+ * 512 bytes of data must be aligned to a 16 byte boundary.
+ */
+static unsigned insn_alignment(struct x86_emulate_ctxt *ctxt, unsigned size)
+{
+ u64 alignment = ctxt->d & AlignMask;
+
+ if (likely(size < 16))
+ return 1;
+
+ switch (alignment) {
+ case Unaligned:
+ return 1;
+ case Aligned16:
+ return 16;
+ case Aligned:
+ default:
+ return size;
+ }
+}
+
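+/*
+ * Translate a segment:offset pair into a linear address, enforcing the
+ * canonical-address check in 64-bit mode and segment type/limit checks
+ * otherwise. *max_size reports how many bytes past the address remain
+ * valid so the instruction fetcher can batch its reads.
+ */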
+static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ unsigned *max_size, unsigned size,
+ enum x86emul_mode mode, ulong *linear,
+ unsigned int flags)
+{
+ struct desc_struct desc;
+ bool usable;
+ ulong la;
+ u32 lim;
+ u16 sel;
+ u8 va_bits;
+
+ la = seg_base(ctxt, addr.seg) + addr.ea;
+ *max_size = 0;
+ switch (mode) {
+ case X86EMUL_MODE_PROT64:
+ *linear = la = ctxt->ops->get_untagged_addr(ctxt, la, flags);
+ va_bits = ctxt_virt_addr_bits(ctxt);
+ if (!__is_canonical_address(la, va_bits))
+ goto bad;
+
+ *max_size = min_t(u64, ~0u, (1ull << va_bits) - la);
+ if (size > *max_size)
+ goto bad;
+ break;
+ default:
+ *linear = la = (u32)la;
+ usable = ctxt->ops->get_segment(ctxt, &sel, &desc, NULL,
+ addr.seg);
+ if (!usable)
+ goto bad;
+ /* code segment in protected mode or read-only data segment */
+ if ((((ctxt->mode != X86EMUL_MODE_REAL) && (desc.type & 8)) || !(desc.type & 2)) &&
+ (flags & X86EMUL_F_WRITE))
+ goto bad;
+ /* unreadable code segment */
+ if (!(flags & X86EMUL_F_FETCH) && (desc.type & 8) && !(desc.type & 2))
+ goto bad;
+ lim = desc_limit_scaled(&desc);
+ if (!(desc.type & 8) && (desc.type & 4)) {
+ /* expand-down segment */
+ if (addr.ea <= lim)
+ goto bad;
+ lim = desc.d ? 0xffffffff : 0xffff;
+ }
+ if (addr.ea > lim)
+ goto bad;
+ if (lim == 0xffffffff)
+ *max_size = ~0u;
+ else {
+ *max_size = (u64)lim + 1 - addr.ea;
+ if (size > *max_size)
+ goto bad;
+ }
+ break;
+ }
+ if (la & (insn_alignment(ctxt, size) - 1))
+ return emulate_gp(ctxt, 0);
+ return X86EMUL_CONTINUE;
+bad:
+ if (addr.seg == VCPU_SREG_SS)
+ return emulate_ss(ctxt, 0);
+ else
+ return emulate_gp(ctxt, 0);
+}
+
+static int linearize(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ unsigned size, bool write,
+ ulong *linear)
+{
+ unsigned max_size;
+ return __linearize(ctxt, addr, &max_size, size, ctxt->mode, linear,
+ write ? X86EMUL_F_WRITE : 0);
+}
+
+static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst)
+{
+ ulong linear;
+ int rc;
+ unsigned max_size;
+ struct segmented_address addr = { .seg = VCPU_SREG_CS,
+ .ea = dst };
+
+ if (ctxt->op_bytes != sizeof(unsigned long))
+ addr.ea = dst & ((1UL << (ctxt->op_bytes << 3)) - 1);
+ rc = __linearize(ctxt, addr, &max_size, 1, ctxt->mode, &linear,
+ X86EMUL_F_FETCH);
+ if (rc == X86EMUL_CONTINUE)
+ ctxt->_eip = addr.ea;
+ return rc;
+}
+
+static inline int emulator_recalc_and_set_mode(struct x86_emulate_ctxt *ctxt)
+{
+ u64 efer;
+ struct desc_struct cs;
+ u16 selector;
+ u32 base3;
+
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+
+ if (!(ctxt->ops->get_cr(ctxt, 0) & X86_CR0_PE)) {
+ /* Real mode. CPU must not have long mode active. */
+ if (efer & EFER_LMA)
+ return X86EMUL_UNHANDLEABLE;
+ ctxt->mode = X86EMUL_MODE_REAL;
+ return X86EMUL_CONTINUE;
+ }
+
+ if (ctxt->eflags & X86_EFLAGS_VM) {
+ /* Protected/VM86 mode. CPU must not have long mode active. */
+ if (efer & EFER_LMA)
+ return X86EMUL_UNHANDLEABLE;
+ ctxt->mode = X86EMUL_MODE_VM86;
+ return X86EMUL_CONTINUE;
+ }
+
+ if (!ctxt->ops->get_segment(ctxt, &selector, &cs, &base3, VCPU_SREG_CS))
+ return X86EMUL_UNHANDLEABLE;
+
+ if (efer & EFER_LMA) {
+ if (cs.l) {
+ /* Proper long mode */
+ ctxt->mode = X86EMUL_MODE_PROT64;
+ } else if (cs.d) {
+ /* 32-bit compatibility mode */
+ ctxt->mode = X86EMUL_MODE_PROT32;
+ } else {
+ ctxt->mode = X86EMUL_MODE_PROT16;
+ }
+ } else {
+ /* Legacy 32 bit / 16 bit mode */
+ ctxt->mode = cs.d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
+ }
+
+ return X86EMUL_CONTINUE;
+}
+
+static inline int assign_eip_near(struct x86_emulate_ctxt *ctxt, ulong dst)
+{
+ return assign_eip(ctxt, dst);
+}
+
+static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst)
+{
+ int rc = emulator_recalc_and_set_mode(ctxt);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ return assign_eip(ctxt, dst);
+}
+
+static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
+{
+ return assign_eip_near(ctxt, ctxt->_eip + rel);
+}
+
+static int linear_read_system(struct x86_emulate_ctxt *ctxt, ulong linear,
+ void *data, unsigned size)
+{
+ return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception, true);
+}
+
+static int linear_write_system(struct x86_emulate_ctxt *ctxt,
+ ulong linear, void *data,
+ unsigned int size)
+{
+ return ctxt->ops->write_std(ctxt, linear, data, size, &ctxt->exception, true);
+}
+
+static int segmented_read_std(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ void *data,
+ unsigned size)
+{
+ int rc;
+ ulong linear;
+
+ rc = linearize(ctxt, addr, size, false, &linear);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception, false);
+}
+
+static int segmented_write_std(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ void *data,
+ unsigned int size)
+{
+ int rc;
+ ulong linear;
+
+ rc = linearize(ctxt, addr, size, true, &linear);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ return ctxt->ops->write_std(ctxt, linear, data, size, &ctxt->exception, false);
+}
+
+/*
+ * Prefetch the remaining bytes of the instruction without crossing page
+ * boundary if they are not in fetch_cache yet.
+ */
+static int __do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size)
+{
+ int rc;
+ unsigned size, max_size;
+ unsigned long linear;
+ int cur_size = ctxt->fetch.end - ctxt->fetch.data;
+ struct segmented_address addr = { .seg = VCPU_SREG_CS,
+ .ea = ctxt->eip + cur_size };
+
+ /*
+ * We do not know exactly how many bytes will be needed, and
+ * __linearize is expensive, so fetch as much as possible. We
+ * just have to avoid going beyond the 15 byte limit, the end
+ * of the segment, or the end of the page.
+ *
+ * __linearize is called with size 0 so that it does not do any
+ * boundary check itself. Instead, we use max_size to check
+ * against op_size.
+ */
+ rc = __linearize(ctxt, addr, &max_size, 0, ctxt->mode, &linear,
+ X86EMUL_F_FETCH);
+ if (unlikely(rc != X86EMUL_CONTINUE))
+ return rc;
+
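+ /* cur_size <= 15, so 15UL ^ cur_size equals 15 - cur_size. */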
+ size = min_t(unsigned, 15UL ^ cur_size, max_size);
+ size = min_t(unsigned, size, PAGE_SIZE - offset_in_page(linear));
+
+ /*
+ * One instruction can only straddle two pages,
+ * and one page has been loaded at the beginning of
+ * x86_decode_insn. So, if there still are not enough
+ * bytes, we must have hit the 15-byte length limit.
+ */
+ if (unlikely(size < op_size))
+ return emulate_gp(ctxt, 0);
+
+ rc = ctxt->ops->fetch(ctxt, linear, ctxt->fetch.end,
+ size, &ctxt->exception);
+ if (unlikely(rc != X86EMUL_CONTINUE))
+ return rc;
+ ctxt->fetch.end += size;
+ return X86EMUL_CONTINUE;
+}
+
+static __always_inline int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt,
+ unsigned size)
+{
+ unsigned done_size = ctxt->fetch.end - ctxt->fetch.ptr;
+
+ if (unlikely(done_size < size))
+ return __do_insn_fetch_bytes(ctxt, size - done_size);
+ else
+ return X86EMUL_CONTINUE;
+}
+
+/* Fetch next part of the instruction being emulated. */
+#define insn_fetch(_type, _ctxt) \
+({ _type _x; \
+ \
+ rc = do_insn_fetch_bytes(_ctxt, sizeof(_type)); \
+ if (rc != X86EMUL_CONTINUE) \
+ goto done; \
+ ctxt->_eip += sizeof(_type); \
+ memcpy(&_x, ctxt->fetch.ptr, sizeof(_type)); \
+ ctxt->fetch.ptr += sizeof(_type); \
+ _x; \
+})
+
+#define insn_fetch_arr(_arr, _size, _ctxt) \
+({ \
+ rc = do_insn_fetch_bytes(_ctxt, _size); \
+ if (rc != X86EMUL_CONTINUE) \
+ goto done; \
+ ctxt->_eip += (_size); \
+ memcpy(_arr, ctxt->fetch.ptr, _size); \
+ ctxt->fetch.ptr += (_size); \
+})
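+
+/*
+ * Both helpers rely on an "rc" variable and a "done" label in the caller:
+ * e.g. "sib = insn_fetch(u8, ctxt);" pulls one byte from the fetch cache,
+ * advances ctxt->_eip, and jumps to done if the fetch fails.
+ */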
+
+/*
+ * Given the 'reg' portion of a ModRM byte, and a register block, return a
+ * pointer into the block that addresses the relevant register.
+ * When @byteop is set and there is no REX prefix, encodings 4-7 select
+ * AH, CH, DH and BH.
+ */
+static void *decode_register(struct x86_emulate_ctxt *ctxt, u8 modrm_reg,
+ int byteop)
+{
+ void *p;
+ int highbyte_regs = (ctxt->rex_prefix == REX_NONE) && byteop;
+
+ if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
+ p = (unsigned char *)reg_rmw(ctxt, modrm_reg & 3) + 1;
+ else
+ p = reg_rmw(ctxt, modrm_reg);
+ return p;
+}
+
+static int read_descriptor(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ u16 *size, unsigned long *address, int op_bytes)
+{
+ int rc;
+
+ if (op_bytes == 2)
+ op_bytes = 3;
+ *address = 0;
+ rc = segmented_read_std(ctxt, addr, size, 2);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ addr.ea += 2;
+ rc = segmented_read_std(ctxt, addr, address, op_bytes);
+ return rc;
+}
+
+EM_ASM_2(add);
+EM_ASM_2(or);
+EM_ASM_2(adc);
+EM_ASM_2(sbb);
+EM_ASM_2(and);
+EM_ASM_2(sub);
+EM_ASM_2(xor);
+EM_ASM_2(cmp);
+EM_ASM_2(test);
+EM_ASM_2(xadd);
+
+EM_ASM_1SRC2(mul, mul_ex);
+EM_ASM_1SRC2(imul, imul_ex);
+EM_ASM_1SRC2EX(div, div_ex);
+EM_ASM_1SRC2EX(idiv, idiv_ex);
+
+EM_ASM_3WCL(shld);
+EM_ASM_3WCL(shrd);
+
+EM_ASM_2W(imul);
+
+EM_ASM_1(not);
+EM_ASM_1(neg);
+EM_ASM_1(inc);
+EM_ASM_1(dec);
+
+EM_ASM_2CL(rol);
+EM_ASM_2CL(ror);
+EM_ASM_2CL(rcl);
+EM_ASM_2CL(rcr);
+EM_ASM_2CL(shl);
+EM_ASM_2CL(shr);
+EM_ASM_2CL(sar);
+
+EM_ASM_2W(bsf);
+EM_ASM_2W(bsr);
+EM_ASM_2W(bt);
+EM_ASM_2W(bts);
+EM_ASM_2W(btr);
+EM_ASM_2W(btc);
+
+EM_ASM_2R(cmp, cmp_r);
+
+static int em_bsf_c(struct x86_emulate_ctxt *ctxt)
+{
+ /* If src is zero, do not writeback, but update flags */
+ if (ctxt->src.val == 0)
+ ctxt->dst.type = OP_NONE;
+ return em_bsf(ctxt);
+}
+
+static int em_bsr_c(struct x86_emulate_ctxt *ctxt)
+{
+ /* If src is zero, do not writeback, but update flags */
+ if (ctxt->src.val == 0)
+ ctxt->dst.type = OP_NONE;
+ return em_bsr(ctxt);
+}
+
+static __always_inline u8 test_cc(unsigned int condition, unsigned long flags)
+{
+ return __emulate_cc(flags, condition & 0xf);
+}
+
+static void fetch_register_operand(struct operand *op)
+{
+ switch (op->bytes) {
+ case 1:
+ op->val = *(u8 *)op->addr.reg;
+ break;
+ case 2:
+ op->val = *(u16 *)op->addr.reg;
+ break;
+ case 4:
+ op->val = *(u32 *)op->addr.reg;
+ break;
+ case 8:
+ op->val = *(u64 *)op->addr.reg;
+ break;
+ }
+ op->orig_val = op->val;
+}
+
+static int em_fninit(struct x86_emulate_ctxt *ctxt)
+{
+ if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
+ return emulate_nm(ctxt);
+
+ kvm_fpu_get();
+ asm volatile("fninit");
+ kvm_fpu_put();
+ return X86EMUL_CONTINUE;
+}
+
+static int em_fnstcw(struct x86_emulate_ctxt *ctxt)
+{
+ u16 fcw;
+
+ if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
+ return emulate_nm(ctxt);
+
+ kvm_fpu_get();
+ asm volatile("fnstcw %0": "+m"(fcw));
+ kvm_fpu_put();
+
+ ctxt->dst.val = fcw;
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_fnstsw(struct x86_emulate_ctxt *ctxt)
+{
+ u16 fsw;
+
+ if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
+ return emulate_nm(ctxt);
+
+ kvm_fpu_get();
+ asm volatile("fnstsw %0": "+m"(fsw));
+ kvm_fpu_put();
+
+ ctxt->dst.val = fsw;
+
+ return X86EMUL_CONTINUE;
+}
+
+static void __decode_register_operand(struct x86_emulate_ctxt *ctxt,
+ struct operand *op, int reg)
+{
+ if ((ctxt->d & Avx) && ctxt->op_bytes == 32) {
+ op->type = OP_YMM;
+ op->bytes = 32;
+ op->addr.xmm = reg;
+ kvm_read_avx_reg(reg, &op->vec_val2);
+ return;
+ }
+ if (ctxt->d & (Avx|Sse)) {
+ op->type = OP_XMM;
+ op->bytes = 16;
+ op->addr.xmm = reg;
+ kvm_read_sse_reg(reg, &op->vec_val);
+ return;
+ }
+ if (ctxt->d & Mmx) {
+ reg &= 7;
+ op->type = OP_MM;
+ op->bytes = 8;
+ op->addr.mm = reg;
+ return;
+ }
+
+ op->type = OP_REG;
+ op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ op->addr.reg = decode_register(ctxt, reg, ctxt->d & ByteOp);
+ fetch_register_operand(op);
+}
+
+static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
+ struct operand *op)
+{
+ unsigned int reg;
+
+ if (ctxt->d & ModRM)
+ reg = ctxt->modrm_reg;
+ else
+ reg = (ctxt->b & 7) | (ctxt->rex_bits & REX_B ? 8 : 0);
+
+ __decode_register_operand(ctxt, op, reg);
+}
+
+static void adjust_modrm_seg(struct x86_emulate_ctxt *ctxt, int base_reg)
+{
+ if (base_reg == VCPU_REGS_RSP || base_reg == VCPU_REGS_RBP)
+ ctxt->modrm_seg = VCPU_SREG_SS;
+}
+
+static int decode_modrm(struct x86_emulate_ctxt *ctxt,
+ struct operand *op)
+{
+ u8 sib;
+ int index_reg, base_reg, scale;
+ int rc = X86EMUL_CONTINUE;
+ ulong modrm_ea = 0;
+
+ ctxt->modrm_reg = (ctxt->rex_bits & REX_R ? 8 : 0);
+ index_reg = (ctxt->rex_bits & REX_X ? 8 : 0);
+ base_reg = (ctxt->rex_bits & REX_B ? 8 : 0);
+
+ ctxt->modrm_mod = (ctxt->modrm & 0xc0) >> 6;
+ ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3;
+ ctxt->modrm_rm = base_reg | (ctxt->modrm & 0x07);
+ ctxt->modrm_seg = VCPU_SREG_DS;
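+ /*
+ * Worked example: modrm byte 0x44 gives mod=1, reg=0, rm=4; with 32-bit
+ * addressing that means a SIB byte and an 8-bit displacement follow.
+ */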
+
+ if (ctxt->modrm_mod == 3 || (ctxt->d & NoMod)) {
+ __decode_register_operand(ctxt, op, ctxt->modrm_rm);
+ return rc;
+ }
+
+ op->type = OP_MEM;
+
+ if (ctxt->ad_bytes == 2) {
+ unsigned bx = reg_read(ctxt, VCPU_REGS_RBX);
+ unsigned bp = reg_read(ctxt, VCPU_REGS_RBP);
+ unsigned si = reg_read(ctxt, VCPU_REGS_RSI);
+ unsigned di = reg_read(ctxt, VCPU_REGS_RDI);
+
+ /* 16-bit ModR/M decode. */
+ switch (ctxt->modrm_mod) {
+ case 0:
+ if (ctxt->modrm_rm == 6)
+ modrm_ea += insn_fetch(u16, ctxt);
+ break;
+ case 1:
+ modrm_ea += insn_fetch(s8, ctxt);
+ break;
+ case 2:
+ modrm_ea += insn_fetch(u16, ctxt);
+ break;
+ }
+ switch (ctxt->modrm_rm) {
+ case 0:
+ modrm_ea += bx + si;
+ break;
+ case 1:
+ modrm_ea += bx + di;
+ break;
+ case 2:
+ modrm_ea += bp + si;
+ break;
+ case 3:
+ modrm_ea += bp + di;
+ break;
+ case 4:
+ modrm_ea += si;
+ break;
+ case 5:
+ modrm_ea += di;
+ break;
+ case 6:
+ if (ctxt->modrm_mod != 0)
+ modrm_ea += bp;
+ break;
+ case 7:
+ modrm_ea += bx;
+ break;
+ }
+ if (ctxt->modrm_rm == 2 || ctxt->modrm_rm == 3 ||
+ (ctxt->modrm_rm == 6 && ctxt->modrm_mod != 0))
+ ctxt->modrm_seg = VCPU_SREG_SS;
+ modrm_ea = (u16)modrm_ea;
+ } else {
+ /* 32/64-bit ModR/M decode. */
+ if ((ctxt->modrm_rm & 7) == 4) {
+ sib = insn_fetch(u8, ctxt);
+ index_reg |= (sib >> 3) & 7;
+ base_reg |= sib & 7;
+ scale = sib >> 6;
+
+ if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0)
+ modrm_ea += insn_fetch(s32, ctxt);
+ else {
+ modrm_ea += reg_read(ctxt, base_reg);
+ adjust_modrm_seg(ctxt, base_reg);
+ /* Increment ESP on POP [ESP] */
+ if ((ctxt->d & IncSP) &&
+ base_reg == VCPU_REGS_RSP)
+ modrm_ea += ctxt->op_bytes;
+ }
+ if (index_reg != 4)
+ modrm_ea += reg_read(ctxt, index_reg) << scale;
+ } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) {
+ modrm_ea += insn_fetch(s32, ctxt);
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ ctxt->rip_relative = 1;
+ } else {
+ base_reg = ctxt->modrm_rm;
+ modrm_ea += reg_read(ctxt, base_reg);
+ adjust_modrm_seg(ctxt, base_reg);
+ }
+ switch (ctxt->modrm_mod) {
+ case 1:
+ modrm_ea += insn_fetch(s8, ctxt);
+ break;
+ case 2:
+ modrm_ea += insn_fetch(s32, ctxt);
+ break;
+ }
+ }
+ op->addr.mem.ea = modrm_ea;
+ if (ctxt->ad_bytes != 8)
+ ctxt->memop.addr.mem.ea = (u32)ctxt->memop.addr.mem.ea;
+
+done:
+ return rc;
+}
+
+static int decode_abs(struct x86_emulate_ctxt *ctxt,
+ struct operand *op)
+{
+ int rc = X86EMUL_CONTINUE;
+
+ op->type = OP_MEM;
+ switch (ctxt->ad_bytes) {
+ case 2:
+ op->addr.mem.ea = insn_fetch(u16, ctxt);
+ break;
+ case 4:
+ op->addr.mem.ea = insn_fetch(u32, ctxt);
+ break;
+ case 8:
+ op->addr.mem.ea = insn_fetch(u64, ctxt);
+ break;
+ }
+done:
+ return rc;
+}
+
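+/*
+ * For BT/BTS/BTR/BTC with a memory destination and a register bit offset,
+ * fold the aligned part of the offset into the effective address and keep
+ * only the in-word remainder: e.g. a 32-bit operand with bit offset 100
+ * moves the address forward by (100 & ~31) / 8 = 12 bytes and leaves bit 4.
+ */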
+static void fetch_bit_operand(struct x86_emulate_ctxt *ctxt)
+{
+ long sv = 0, mask;
+
+ if (ctxt->dst.type == OP_MEM && ctxt->src.type == OP_REG) {
+ mask = ~((long)ctxt->dst.bytes * 8 - 1);
+
+ if (ctxt->src.bytes == 2)
+ sv = (s16)ctxt->src.val & (s16)mask;
+ else if (ctxt->src.bytes == 4)
+ sv = (s32)ctxt->src.val & (s32)mask;
+ else
+ sv = (s64)ctxt->src.val & (s64)mask;
+
+ ctxt->dst.addr.mem.ea = address_mask(ctxt,
+ ctxt->dst.addr.mem.ea + (sv >> 3));
+ }
+
+ /* only subword offset */
+ ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
+}
+
+static int read_emulated(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr, void *dest, unsigned size)
+{
+ int rc;
+ struct read_cache *mc = &ctxt->mem_read;
+
+ if (mc->pos < mc->end)
+ goto read_cached;
+
+ if (KVM_EMULATOR_BUG_ON((mc->end + size) >= sizeof(mc->data), ctxt))
+ return X86EMUL_UNHANDLEABLE;
+
+ rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, size,
+ &ctxt->exception);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ mc->end += size;
+
+read_cached:
+ memcpy(dest, mc->data + mc->pos, size);
+ mc->pos += size;
+ return X86EMUL_CONTINUE;
+}
+
+static int segmented_read(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ void *data,
+ unsigned size)
+{
+ int rc;
+ ulong linear;
+
+ rc = linearize(ctxt, addr, size, false, &linear);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ return read_emulated(ctxt, linear, data, size);
+}
+
+static int segmented_write(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ const void *data,
+ unsigned size)
+{
+ int rc;
+ ulong linear;
+
+ rc = linearize(ctxt, addr, size, true, &linear);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ return ctxt->ops->write_emulated(ctxt, linear, data, size,
+ &ctxt->exception);
+}
+
+static int segmented_cmpxchg(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ const void *orig_data, const void *data,
+ unsigned size)
+{
+ int rc;
+ ulong linear;
+
+ rc = linearize(ctxt, addr, size, true, &linear);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ return ctxt->ops->cmpxchg_emulated(ctxt, linear, orig_data, data,
+ size, &ctxt->exception);
+}
+
+static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
+ unsigned int size, unsigned short port,
+ void *dest)
+{
+ struct read_cache *rc = &ctxt->io_read;
+
+ if (rc->pos == rc->end) { /* refill pio read ahead */
+ unsigned int in_page, n;
+ unsigned int count = ctxt->rep_prefix ?
+ address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) : 1;
+ in_page = (ctxt->eflags & X86_EFLAGS_DF) ?
+ offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)) :
+ PAGE_SIZE - offset_in_page(reg_read(ctxt, VCPU_REGS_RDI));
+ n = min3(in_page, (unsigned int)sizeof(rc->data) / size, count);
+ if (n == 0)
+ n = 1;
+ rc->pos = rc->end = 0;
+ if (!ctxt->ops->pio_in_emulated(ctxt, size, port, rc->data, n))
+ return 0;
+ rc->end = n * size;
+ }
+
+ if (ctxt->rep_prefix && (ctxt->d & String) &&
+ !(ctxt->eflags & X86_EFLAGS_DF)) {
+ ctxt->dst.data = rc->data + rc->pos;
+ ctxt->dst.type = OP_MEM_STR;
+ ctxt->dst.count = (rc->end - rc->pos) / size;
+ rc->pos = rc->end;
+ } else {
+ memcpy(dest, rc->data + rc->pos, size);
+ rc->pos += size;
+ }
+ return 1;
+}
+
+static int read_interrupt_descriptor(struct x86_emulate_ctxt *ctxt,
+ u16 index, struct desc_struct *desc)
+{
+ struct desc_ptr dt;
+ ulong addr;
+
+ ctxt->ops->get_idt(ctxt, &dt);
+
+ if (dt.size < index * 8 + 7)
+ return emulate_gp(ctxt, index << 3 | 0x2);
+
+ addr = dt.address + index * 8;
+ return linear_read_system(ctxt, addr, desc, sizeof(*desc));
+}
+
+static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
+ u16 selector, struct desc_ptr *dt)
+{
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ u32 base3 = 0;
+
+ if (selector & 1 << 2) {
+ struct desc_struct desc;
+ u16 sel;
+
+ memset(dt, 0, sizeof(*dt));
+ if (!ops->get_segment(ctxt, &sel, &desc, &base3,
+ VCPU_SREG_LDTR))
+ return;
+
+ dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */
+ dt->address = get_desc_base(&desc) | ((u64)base3 << 32);
+ } else
+ ops->get_gdt(ctxt, dt);
+}
+
+static int get_descriptor_ptr(struct x86_emulate_ctxt *ctxt,
+ u16 selector, ulong *desc_addr_p)
+{
+ struct desc_ptr dt;
+ u16 index = selector >> 3;
+ ulong addr;
+
+ get_descriptor_table_ptr(ctxt, selector, &dt);
+
+ if (dt.size < index * 8 + 7)
+ return emulate_gp(ctxt, selector & 0xfffc);
+
+ addr = dt.address + index * 8;
+
+#ifdef CONFIG_X86_64
+ if (addr >> 32 != 0) {
+ u64 efer = 0;
+
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+ if (!(efer & EFER_LMA))
+ addr &= (u32)-1;
+ }
+#endif
+
+ *desc_addr_p = addr;
+ return X86EMUL_CONTINUE;
+}
+
+/* Only valid for 8-byte segment descriptors. */
+static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+ u16 selector, struct desc_struct *desc,
+ ulong *desc_addr_p)
+{
+ int rc;
+
+ rc = get_descriptor_ptr(ctxt, selector, desc_addr_p);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ return linear_read_system(ctxt, *desc_addr_p, desc, sizeof(*desc));
+}
+
+/* Only valid for 8-byte segment descriptors. */
+static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+ u16 selector, struct desc_struct *desc)
+{
+ int rc;
+ ulong addr;
+
+ rc = get_descriptor_ptr(ctxt, selector, &addr);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ return linear_write_system(ctxt, addr, desc, sizeof(*desc));
+}
+
+static bool emulator_is_ssp_invalid(struct x86_emulate_ctxt *ctxt, u8 cpl)
+{
+ const u32 MSR_IA32_X_CET = cpl == 3 ? MSR_IA32_U_CET : MSR_IA32_S_CET;
+ u64 efer = 0, cet = 0, ssp = 0;
+
+ if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_CET))
+ return false;
+
+ if (ctxt->ops->get_msr(ctxt, MSR_EFER, &efer))
+ return true;
+
+ /* SSP is guaranteed to be valid if the vCPU was already in 32-bit mode. */
+ if (!(efer & EFER_LMA))
+ return false;
+
+ if (ctxt->ops->get_msr(ctxt, MSR_IA32_X_CET, &cet))
+ return true;
+
+ if (!(cet & CET_SHSTK_EN))
+ return false;
+
+ if (ctxt->ops->get_msr(ctxt, MSR_KVM_INTERNAL_GUEST_SSP, &ssp))
+ return true;
+
+ /*
+ * On transfer from 64-bit mode to compatibility mode, SSP[63:32] must
+ * be 0, i.e. SSP must be a 32-bit value outside of 64-bit mode.
+ */
+ return ssp >> 32;
+}
+
+static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+ u16 selector, int seg, u8 cpl,
+ enum x86_transfer_type transfer,
+ struct desc_struct *desc)
+{
+ struct desc_struct seg_desc, old_desc;
+ u8 dpl, rpl;
+ unsigned err_vec = GP_VECTOR;
+ u32 err_code = 0;
+ bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
+ ulong desc_addr;
+ int ret;
+ u16 dummy;
+ u32 base3 = 0;
+
+ memset(&seg_desc, 0, sizeof(seg_desc));
+
+ if (ctxt->mode == X86EMUL_MODE_REAL) {
+ /* set real mode segment descriptor (keep limit etc. for
+ * unreal mode) */
+ ctxt->ops->get_segment(ctxt, &dummy, &seg_desc, NULL, seg);
+ set_desc_base(&seg_desc, selector << 4);
+ goto load;
+ } else if (seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86) {
+ /* VM86 needs a clean new segment descriptor */
+ set_desc_base(&seg_desc, selector << 4);
+ set_desc_limit(&seg_desc, 0xffff);
+ seg_desc.type = 3;
+ seg_desc.p = 1;
+ seg_desc.s = 1;
+ seg_desc.dpl = 3;
+ goto load;
+ }
+
+ rpl = selector & 3;
+
+ /* TR should be in GDT only */
+ if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
+ goto exception;
+
+ /* NULL selector is not valid for TR, CS and (except for long mode) SS */
+ if (null_selector) {
+ if (seg == VCPU_SREG_CS || seg == VCPU_SREG_TR)
+ goto exception;
+
+ if (seg == VCPU_SREG_SS) {
+ if (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl)
+ goto exception;
+
+ /*
+ * ctxt->ops->set_segment expects the CPL to be in
+ * SS.DPL, so fake an expand-up 32-bit data segment.
+ */
+ seg_desc.type = 3;
+ seg_desc.p = 1;
+ seg_desc.s = 1;
+ seg_desc.dpl = cpl;
+ seg_desc.d = 1;
+ seg_desc.g = 1;
+ }
+
+ /* Skip all following checks */
+ goto load;
+ }
+
+ ret = read_segment_descriptor(ctxt, selector, &seg_desc, &desc_addr);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ err_code = selector & 0xfffc;
+ err_vec = (transfer == X86_TRANSFER_TASK_SWITCH) ? TS_VECTOR :
+ GP_VECTOR;
+
+ /* can't load system descriptor into segment selector */
+ if (seg <= VCPU_SREG_GS && !seg_desc.s) {
+ if (transfer == X86_TRANSFER_CALL_JMP)
+ return X86EMUL_UNHANDLEABLE;
+ goto exception;
+ }
+
+ dpl = seg_desc.dpl;
+
+ switch (seg) {
+ case VCPU_SREG_SS:
+ /*
+ * segment is not a writable data segment or segment
+ * selector's RPL != CPL or DPL != CPL
+ */
+ if (rpl != cpl || (seg_desc.type & 0xa) != 0x2 || dpl != cpl)
+ goto exception;
+ break;
+ case VCPU_SREG_CS:
+ /*
+ * KVM uses "none" when loading CS as part of emulating Real
+ * Mode exceptions and IRET (handled above). In all other
+ * cases, loading CS without a control transfer is a KVM bug.
+ */
+ if (WARN_ON_ONCE(transfer == X86_TRANSFER_NONE))
+ goto exception;
+
+ if (!(seg_desc.type & 8))
+ goto exception;
+
+ if (transfer == X86_TRANSFER_RET) {
+ /* RET can never return to an inner privilege level. */
+ if (rpl < cpl)
+ goto exception;
+ /* Outer-privilege level return is not implemented */
+ if (rpl > cpl)
+ return X86EMUL_UNHANDLEABLE;
+ }
+ if (transfer == X86_TRANSFER_RET || transfer == X86_TRANSFER_TASK_SWITCH) {
+ if (seg_desc.type & 4) {
+ /* conforming */
+ if (dpl > rpl)
+ goto exception;
+ } else {
+ /* nonconforming */
+ if (dpl != rpl)
+ goto exception;
+ }
+ } else { /* X86_TRANSFER_CALL_JMP */
+ if (seg_desc.type & 4) {
+ /* conforming */
+ if (dpl > cpl)
+ goto exception;
+ } else {
+ /* nonconforming */
+ if (rpl > cpl || dpl != cpl)
+ goto exception;
+ }
+ }
+ /* in long-mode d/b must be clear if l is set */
+ if (seg_desc.d && seg_desc.l) {
+ u64 efer = 0;
+
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+ if (efer & EFER_LMA)
+ goto exception;
+ }
+ if (!seg_desc.l && emulator_is_ssp_invalid(ctxt, cpl)) {
+ err_code = 0;
+ goto exception;
+ }
+
+ /* CS(RPL) <- CPL */
+ selector = (selector & 0xfffc) | cpl;
+ break;
+ case VCPU_SREG_TR:
+ if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9))
+ goto exception;
+ break;
+ case VCPU_SREG_LDTR:
+ if (seg_desc.s || seg_desc.type != 2)
+ goto exception;
+ break;
+ default: /* DS, ES, FS, or GS */
+ /*
+ * segment is not a data or readable code segment or
+ * ((segment is a data or nonconforming code segment)
+ * and ((RPL > DPL) or (CPL > DPL)))
+ */
+ if ((seg_desc.type & 0xa) == 0x8 ||
+ (((seg_desc.type & 0xc) != 0xc) &&
+ (rpl > dpl || cpl > dpl)))
+ goto exception;
+ break;
+ }
+
+ if (!seg_desc.p) {
+ err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
+ goto exception;
+ }
+
+ if (seg_desc.s) {
+ /* mark segment as accessed */
+ if (!(seg_desc.type & 1)) {
+ seg_desc.type |= 1;
+ ret = write_segment_descriptor(ctxt, selector,
+ &seg_desc);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ }
+ } else if (ctxt->mode == X86EMUL_MODE_PROT64) {
+ ret = linear_read_system(ctxt, desc_addr+8, &base3, sizeof(base3));
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ if (emul_is_noncanonical_address(get_desc_base(&seg_desc) |
+ ((u64)base3 << 32), ctxt,
+ X86EMUL_F_DT_LOAD))
+ return emulate_gp(ctxt, err_code);
+ }
+
+ if (seg == VCPU_SREG_TR) {
+ old_desc = seg_desc;
+ seg_desc.type |= 2; /* busy */
+ ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc,
+ sizeof(seg_desc), &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ }
+load:
+ ctxt->ops->set_segment(ctxt, selector, &seg_desc, base3, seg);
+ if (desc)
+ *desc = seg_desc;
+ return X86EMUL_CONTINUE;
+exception:
+ return emulate_exception(ctxt, err_vec, err_code, true);
+}
+
+static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+ u16 selector, int seg)
+{
+ u8 cpl = ctxt->ops->cpl(ctxt);
+
+ /*
+ * None of MOV, POP and LSS can load a NULL selector in CPL=3, but
+ * they can load it at CPL<3 (Intel's manual says only LSS can,
+ * but it's wrong).
+ *
+ * However, the Intel manual says that putting IST=1/DPL=3 in
+ * an interrupt gate will result in SS=3 (the AMD manual instead
+ * says it doesn't), so allow SS=3 in __load_segment_descriptor
+ * and only forbid it here.
+ */
+ if (seg == VCPU_SREG_SS && selector == 3 &&
+ ctxt->mode == X86EMUL_MODE_PROT64)
+ return emulate_exception(ctxt, GP_VECTOR, 0, true);
+
+ return __load_segment_descriptor(ctxt, selector, seg, cpl,
+ X86_TRANSFER_NONE, NULL);
+}
+
+static void write_register_operand(struct operand *op)
+{
+ return assign_register(op->addr.reg, op->val, op->bytes);
+}
+
+static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op)
+{
+ switch (op->type) {
+ case OP_REG:
+ write_register_operand(op);
+ break;
+ case OP_MEM:
+ if (ctxt->lock_prefix)
+ return segmented_cmpxchg(ctxt,
+ op->addr.mem,
+ &op->orig_val,
+ &op->val,
+ op->bytes);
+ else
+ return segmented_write(ctxt,
+ op->addr.mem,
+ &op->val,
+ op->bytes);
+ case OP_MEM_STR:
+ return segmented_write(ctxt,
+ op->addr.mem,
+ op->data,
+ op->bytes * op->count);
+ case OP_XMM:
+ if (!(ctxt->d & Avx)) {
+ kvm_write_sse_reg(op->addr.xmm, &op->vec_val);
+ break;
+ }
+ /* full YMM write but with high bytes cleared */
+ memset(op->valptr + 16, 0, 16);
+ fallthrough;
+ case OP_YMM:
+ kvm_write_avx_reg(op->addr.xmm, &op->vec_val2);
+ break;
+ case OP_MM:
+ kvm_write_mmx_reg(op->addr.mm, &op->mm_val);
+ break;
+ case OP_NONE:
+ /* no writeback */
+ break;
+ default:
+ break;
+ }
+ return X86EMUL_CONTINUE;
+}
+
+static int emulate_push(struct x86_emulate_ctxt *ctxt, const void *data, int len)
+{
+ struct segmented_address addr;
+
+ rsp_increment(ctxt, -len);
+ addr.ea = reg_read(ctxt, VCPU_REGS_RSP) & stack_mask(ctxt);
+ addr.seg = VCPU_SREG_SS;
+
+ return segmented_write(ctxt, addr, data, len);
+}
+
+static int em_push(struct x86_emulate_ctxt *ctxt)
+{
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return emulate_push(ctxt, &ctxt->src.val, ctxt->op_bytes);
+}
+
+static int emulate_pop(struct x86_emulate_ctxt *ctxt,
+ void *dest, int len)
+{
+ int rc;
+ struct segmented_address addr;
+
+ addr.ea = reg_read(ctxt, VCPU_REGS_RSP) & stack_mask(ctxt);
+ addr.seg = VCPU_SREG_SS;
+ rc = segmented_read(ctxt, addr, dest, len);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rsp_increment(ctxt, len);
+ return rc;
+}
+
+static int em_pop(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes);
+}
+
+static int emulate_popf(struct x86_emulate_ctxt *ctxt,
+ void *dest, int len)
+{
+ int rc;
+ unsigned long val = 0;
+ unsigned long change_mask;
+ int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> X86_EFLAGS_IOPL_BIT;
+ int cpl = ctxt->ops->cpl(ctxt);
+
+ rc = emulate_pop(ctxt, &val, len);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ change_mask = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+ X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF |
+ X86_EFLAGS_TF | X86_EFLAGS_DF | X86_EFLAGS_NT |
+ X86_EFLAGS_AC | X86_EFLAGS_ID;
+
+ switch(ctxt->mode) {
+ case X86EMUL_MODE_PROT64:
+ case X86EMUL_MODE_PROT32:
+ case X86EMUL_MODE_PROT16:
+ if (cpl == 0)
+ change_mask |= X86_EFLAGS_IOPL;
+ if (cpl <= iopl)
+ change_mask |= X86_EFLAGS_IF;
+ break;
+ case X86EMUL_MODE_VM86:
+ if (iopl < 3)
+ return emulate_gp(ctxt, 0);
+ change_mask |= X86_EFLAGS_IF;
+ break;
+ default: /* real mode */
+ change_mask |= (X86_EFLAGS_IOPL | X86_EFLAGS_IF);
+ break;
+ }
+
+ *(unsigned long *)dest =
+ (ctxt->eflags & ~change_mask) | (val & change_mask);
+
+ return rc;
+}
+
+static int em_popf(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->dst.type = OP_REG;
+ ctxt->dst.addr.reg = &ctxt->eflags;
+ ctxt->dst.bytes = ctxt->op_bytes;
+ return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes);
+}
+
+static int em_enter(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ unsigned frame_size = ctxt->src.val;
+ unsigned nesting_level = ctxt->src2.val & 31;
+ ulong rbp;
+
+ if (nesting_level)
+ return X86EMUL_UNHANDLEABLE;
+
+ rbp = reg_read(ctxt, VCPU_REGS_RBP);
+ rc = emulate_push(ctxt, &rbp, stack_size(ctxt));
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ assign_masked(reg_rmw(ctxt, VCPU_REGS_RBP), reg_read(ctxt, VCPU_REGS_RSP),
+ stack_mask(ctxt));
+ assign_masked(reg_rmw(ctxt, VCPU_REGS_RSP),
+ reg_read(ctxt, VCPU_REGS_RSP) - frame_size,
+ stack_mask(ctxt));
+ return X86EMUL_CONTINUE;
+}
+
+static int em_leave(struct x86_emulate_ctxt *ctxt)
+{
+ assign_masked(reg_rmw(ctxt, VCPU_REGS_RSP), reg_read(ctxt, VCPU_REGS_RBP),
+ stack_mask(ctxt));
+ return emulate_pop(ctxt, reg_rmw(ctxt, VCPU_REGS_RBP), ctxt->op_bytes);
+}
+
+static int em_push_sreg(struct x86_emulate_ctxt *ctxt)
+{
+ int seg = ctxt->src2.val;
+
+ ctxt->src.val = get_segment_selector(ctxt, seg);
+ if (ctxt->op_bytes == 4) {
+ rsp_increment(ctxt, -2);
+ ctxt->op_bytes = 2;
+ }
+
+ return em_push(ctxt);
+}
+
+static int em_pop_sreg(struct x86_emulate_ctxt *ctxt)
+{
+ int seg = ctxt->src2.val;
+ unsigned long selector = 0;
+ int rc;
+
+ rc = emulate_pop(ctxt, &selector, 2);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ if (seg == VCPU_SREG_SS)
+ ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
+ if (ctxt->op_bytes > 2)
+ rsp_increment(ctxt, ctxt->op_bytes - 2);
+
+ rc = load_segment_descriptor(ctxt, (u16)selector, seg);
+ return rc;
+}
+
+static int em_pusha(struct x86_emulate_ctxt *ctxt)
+{
+ unsigned long old_esp = reg_read(ctxt, VCPU_REGS_RSP);
+ int rc = X86EMUL_CONTINUE;
+ int reg = VCPU_REGS_RAX;
+
+ while (reg <= VCPU_REGS_RDI) {
+ ctxt->src.val = (reg == VCPU_REGS_RSP) ? old_esp : reg_read(ctxt, reg);
+
+ rc = em_push(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ ++reg;
+ }
+
+ return rc;
+}
+
+static int em_pushf(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->src.val = (unsigned long)ctxt->eflags & ~X86_EFLAGS_VM;
+ return em_push(ctxt);
+}
+
+static int em_popa(struct x86_emulate_ctxt *ctxt)
+{
+ int rc = X86EMUL_CONTINUE;
+ int reg = VCPU_REGS_RDI;
+ u32 val = 0;
+
+ while (reg >= VCPU_REGS_RAX) {
+ if (reg == VCPU_REGS_RSP) {
+ rsp_increment(ctxt, ctxt->op_bytes);
+ --reg;
+ }
+
+ rc = emulate_pop(ctxt, &val, ctxt->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ break;
+ assign_register(reg_rmw(ctxt, reg), val, ctxt->op_bytes);
+ --reg;
+ }
+ return rc;
+}
+
+static int __emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
+{
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ int rc;
+ struct desc_ptr dt;
+ gva_t cs_addr;
+ gva_t eip_addr;
+ u16 cs, eip;
+
+ /* TODO: Add limit checks */
+ ctxt->src.val = ctxt->eflags;
+ rc = em_push(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ ctxt->eflags &= ~(X86_EFLAGS_IF | X86_EFLAGS_TF | X86_EFLAGS_AC);
+
+ ctxt->src.val = get_segment_selector(ctxt, VCPU_SREG_CS);
+ rc = em_push(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ ctxt->src.val = ctxt->_eip;
+ rc = em_push(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ ops->get_idt(ctxt, &dt);
+
+ eip_addr = dt.address + (irq << 2);
+ cs_addr = dt.address + (irq << 2) + 2;
+
+ rc = linear_read_system(ctxt, cs_addr, &cs, 2);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = linear_read_system(ctxt, eip_addr, &eip, 2);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = load_segment_descriptor(ctxt, cs, VCPU_SREG_CS);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ ctxt->_eip = eip;
+
+ return rc;
+}
+
+int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
+{
+ int rc;
+
+ invalidate_registers(ctxt);
+ rc = __emulate_int_real(ctxt, irq);
+ if (rc == X86EMUL_CONTINUE)
+ writeback_registers(ctxt);
+ return rc;
+}
+
+static int emulate_int(struct x86_emulate_ctxt *ctxt, int irq)
+{
+ switch(ctxt->mode) {
+ case X86EMUL_MODE_REAL:
+ return __emulate_int_real(ctxt, irq);
+ case X86EMUL_MODE_VM86:
+ case X86EMUL_MODE_PROT16:
+ case X86EMUL_MODE_PROT32:
+ case X86EMUL_MODE_PROT64:
+ default:
+ /* Protected mode interrupts are not implemented yet */
+ return X86EMUL_UNHANDLEABLE;
+ }
+}
+
+static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
+{
+ int rc = X86EMUL_CONTINUE;
+ unsigned long temp_eip = 0;
+ unsigned long temp_eflags = 0;
+ unsigned long cs = 0;
+ unsigned long mask = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+ X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_TF |
+ X86_EFLAGS_IF | X86_EFLAGS_DF | X86_EFLAGS_OF |
+ X86_EFLAGS_IOPL | X86_EFLAGS_NT | X86_EFLAGS_RF |
+ X86_EFLAGS_AC | X86_EFLAGS_ID |
+ X86_EFLAGS_FIXED;
+ unsigned long vm86_mask = X86_EFLAGS_VM | X86_EFLAGS_VIF |
+ X86_EFLAGS_VIP;
+
+ /* TODO: Add stack limit check */
+
+ rc = emulate_pop(ctxt, &temp_eip, ctxt->op_bytes);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ if (temp_eip & ~0xffff)
+ return emulate_gp(ctxt, 0);
+
+ rc = emulate_pop(ctxt, &cs, ctxt->op_bytes);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = emulate_pop(ctxt, &temp_eflags, ctxt->op_bytes);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ ctxt->_eip = temp_eip;
+
+ if (ctxt->op_bytes == 4)
+ ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask));
+ else if (ctxt->op_bytes == 2) {
+ ctxt->eflags &= ~0xffff;
+ ctxt->eflags |= temp_eflags;
+ }
+
+ ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */
+ ctxt->eflags |= X86_EFLAGS_FIXED;
+ ctxt->ops->set_nmi_mask(ctxt, false);
+
+ return rc;
+}
+
+static int em_iret(struct x86_emulate_ctxt *ctxt)
+{
+ switch(ctxt->mode) {
+ case X86EMUL_MODE_REAL:
+ return emulate_iret_real(ctxt);
+ case X86EMUL_MODE_VM86:
+ case X86EMUL_MODE_PROT16:
+ case X86EMUL_MODE_PROT32:
+ case X86EMUL_MODE_PROT64:
+ default:
+ /* IRET from protected mode is not implemented yet */
+ return X86EMUL_UNHANDLEABLE;
+ }
+}
+
+static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ unsigned short sel;
+ struct desc_struct new_desc;
+ u8 cpl = ctxt->ops->cpl(ctxt);
+
+ memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
+
+ rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl,
+ X86_TRANSFER_CALL_JMP,
+ &new_desc);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = assign_eip_far(ctxt, ctxt->src.val);
+ /* Error handling is not implemented. */
+ if (rc != X86EMUL_CONTINUE)
+ return X86EMUL_UNHANDLEABLE;
+
+ return rc;
+}
+
+static int em_jmp_abs(struct x86_emulate_ctxt *ctxt)
+{
+ return assign_eip_near(ctxt, ctxt->src.val);
+}
+
+static int em_call_near_abs(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ long int old_eip;
+
+ old_eip = ctxt->_eip;
+ rc = assign_eip_near(ctxt, ctxt->src.val);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ ctxt->src.val = old_eip;
+ rc = em_push(ctxt);
+ return rc;
+}
+
+static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
+{
+ u64 old = ctxt->dst.orig_val64;
+
+ if (ctxt->dst.bytes == 16)
+ return X86EMUL_UNHANDLEABLE;
+
+ if (((u32) (old >> 0) != (u32) reg_read(ctxt, VCPU_REGS_RAX)) ||
+ ((u32) (old >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) {
+ *reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old >> 0);
+ *reg_write(ctxt, VCPU_REGS_RDX) = (u32) (old >> 32);
+ ctxt->eflags &= ~X86_EFLAGS_ZF;
+ } else {
+ ctxt->dst.val64 = ((u64)reg_read(ctxt, VCPU_REGS_RCX) << 32) |
+ (u32) reg_read(ctxt, VCPU_REGS_RBX);
+
+ ctxt->eflags |= X86_EFLAGS_ZF;
+ }
+ return X86EMUL_CONTINUE;
+}
+
+static int em_ret(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ unsigned long eip = 0;
+
+ rc = emulate_pop(ctxt, &eip, ctxt->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ return assign_eip_near(ctxt, eip);
+}
+
+static int em_ret_far(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ unsigned long eip = 0;
+ unsigned long cs = 0;
+ int cpl = ctxt->ops->cpl(ctxt);
+ struct desc_struct new_desc;
+
+ rc = emulate_pop(ctxt, &eip, ctxt->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ rc = emulate_pop(ctxt, &cs, ctxt->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl,
+ X86_TRANSFER_RET,
+ &new_desc);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ rc = assign_eip_far(ctxt, eip);
+ /* Error handling is not implemented. */
+ if (rc != X86EMUL_CONTINUE)
+ return X86EMUL_UNHANDLEABLE;
+
+ return rc;
+}
+
+static int em_ret_far_imm(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+
+ rc = em_ret_far(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ rsp_increment(ctxt, ctxt->src.val);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
+{
+ /* Save real source value, then compare EAX against destination. */
+ ctxt->dst.orig_val = ctxt->dst.val;
+ ctxt->dst.val = reg_read(ctxt, VCPU_REGS_RAX);
+ ctxt->src.orig_val = ctxt->src.val;
+ ctxt->src.val = ctxt->dst.orig_val;
+ em_cmp(ctxt);
+
+ if (ctxt->eflags & X86_EFLAGS_ZF) {
+ /* Success: write back to memory; no update of EAX */
+ ctxt->src.type = OP_NONE;
+ ctxt->dst.val = ctxt->src.orig_val;
+ } else {
+ /* Failure: write the value we saw to EAX. */
+ ctxt->src.type = OP_REG;
+ ctxt->src.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
+ ctxt->src.val = ctxt->dst.orig_val;
+ /* Create write-cycle to dest by writing the same value */
+ ctxt->dst.val = ctxt->dst.orig_val;
+ }
+ return X86EMUL_CONTINUE;
+}
+
+static int em_lseg(struct x86_emulate_ctxt *ctxt)
+{
+ int seg = ctxt->src2.val;
+ unsigned short sel;
+ int rc;
+
+ memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
+
+ rc = load_segment_descriptor(ctxt, sel, seg);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ ctxt->dst.val = ctxt->src.val;
+ return rc;
+}
+
+static int em_rsm(struct x86_emulate_ctxt *ctxt)
+{
+ if (!ctxt->ops->is_smm(ctxt))
+ return emulate_ud(ctxt);
+
+ if (ctxt->ops->leave_smm(ctxt))
+ ctxt->ops->triple_fault(ctxt);
+
+ return emulator_recalc_and_set_mode(ctxt);
+}
+
+static void
+setup_syscalls_segments(struct desc_struct *cs, struct desc_struct *ss)
+{
+ cs->l = 0; /* will be adjusted later */
+ set_desc_base(cs, 0); /* flat segment */
+ cs->g = 1; /* 4kb granularity */
+ set_desc_limit(cs, 0xfffff); /* 4GB limit */
+ cs->type = 0x0b; /* Read, Execute, Accessed */
+ cs->s = 1;
+ cs->dpl = 0; /* will be adjusted later */
+ cs->p = 1;
+ cs->d = 1;
+ cs->avl = 0;
+
+ set_desc_base(ss, 0); /* flat segment */
+ set_desc_limit(ss, 0xfffff); /* 4GB limit */
+ ss->g = 1; /* 4kb granularity */
+ ss->s = 1;
+ ss->type = 0x03; /* Read/Write, Accessed */
+ ss->d = 1; /* 32bit stack segment */
+ ss->dpl = 0;
+ ss->p = 1;
+ ss->l = 0;
+ ss->avl = 0;
+}
+
+static int em_syscall(struct x86_emulate_ctxt *ctxt)
+{
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ struct desc_struct cs, ss;
+ u64 msr_data;
+ u16 cs_sel, ss_sel;
+ u64 efer = 0;
+
+ /* syscall is not available in real mode */
+ if (ctxt->mode == X86EMUL_MODE_REAL ||
+ ctxt->mode == X86EMUL_MODE_VM86)
+ return emulate_ud(ctxt);
+
+ /*
+ * Intel compatible CPUs only support SYSCALL in 64-bit mode, whereas
+ * AMD allows SYSCALL in any flavor of protected mode. Note, it's
+ * infeasible to emulate Intel behavior when running on AMD hardware,
+ * as SYSCALL won't fault in the "wrong" mode, i.e. there is no #UD
+ * for KVM to trap-and-emulate, unlike emulating AMD on Intel.
+ */
+ if (ctxt->mode != X86EMUL_MODE_PROT64 &&
+ ctxt->ops->guest_cpuid_is_intel_compatible(ctxt))
+ return emulate_ud(ctxt);
+
+ ops->get_msr(ctxt, MSR_EFER, &efer);
+ if (!(efer & EFER_SCE))
+ return emulate_ud(ctxt);
+
+ setup_syscalls_segments(&cs, &ss);
+ ops->get_msr(ctxt, MSR_STAR, &msr_data);
+ msr_data >>= 32;
+ cs_sel = (u16)(msr_data & 0xfffc);
+ ss_sel = (u16)(msr_data + 8);
+
+ if (efer & EFER_LMA) {
+ cs.d = 0;
+ cs.l = 1;
+ }
+ ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
+ ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
+
+ *reg_write(ctxt, VCPU_REGS_RCX) = ctxt->_eip;
+ if (efer & EFER_LMA) {
+#ifdef CONFIG_X86_64
+ *reg_write(ctxt, VCPU_REGS_R11) = ctxt->eflags;
+
+ ops->get_msr(ctxt,
+ ctxt->mode == X86EMUL_MODE_PROT64 ?
+ MSR_LSTAR : MSR_CSTAR, &msr_data);
+ ctxt->_eip = msr_data;
+
+ ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data);
+ ctxt->eflags &= ~msr_data;
+ ctxt->eflags |= X86_EFLAGS_FIXED;
+#endif
+ } else {
+ /* legacy mode */
+ ops->get_msr(ctxt, MSR_STAR, &msr_data);
+ ctxt->_eip = (u32)msr_data;
+
+ ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF);
+ }
+
+ ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_sysenter(struct x86_emulate_ctxt *ctxt)
+{
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ struct desc_struct cs, ss;
+ u64 msr_data;
+ u16 cs_sel, ss_sel;
+ u64 efer = 0;
+
+ ops->get_msr(ctxt, MSR_EFER, &efer);
+ /* inject #GP if in real mode */
+ if (ctxt->mode == X86EMUL_MODE_REAL)
+ return emulate_gp(ctxt, 0);
+
+ /*
+ * Intel's architecture allows SYSENTER in compatibility mode, but AMD
+ * does not. Note, AMD does allow SYSENTER in legacy protected mode.
+ */
+ if ((ctxt->mode != X86EMUL_MODE_PROT64) && (efer & EFER_LMA) &&
+ !ctxt->ops->guest_cpuid_is_intel_compatible(ctxt))
+ return emulate_ud(ctxt);
+
+ /* sysenter/sysexit have not been tested in 64bit mode. */
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ return X86EMUL_UNHANDLEABLE;
+
+ ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data);
+ if ((msr_data & 0xfffc) == 0x0)
+ return emulate_gp(ctxt, 0);
+
+ setup_syscalls_segments(&cs, &ss);
+ ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF);
+ cs_sel = (u16)msr_data & ~SEGMENT_RPL_MASK;
+ ss_sel = cs_sel + 8;
+ if (efer & EFER_LMA) {
+ cs.d = 0;
+ cs.l = 1;
+ }
+
+ ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
+ ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
+
+ ops->get_msr(ctxt, MSR_IA32_SYSENTER_EIP, &msr_data);
+ ctxt->_eip = (efer & EFER_LMA) ? msr_data : (u32)msr_data;
+
+ ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data);
+ *reg_write(ctxt, VCPU_REGS_RSP) = (efer & EFER_LMA) ? msr_data :
+ (u32)msr_data;
+ if (efer & EFER_LMA)
+ ctxt->mode = X86EMUL_MODE_PROT64;
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_sysexit(struct x86_emulate_ctxt *ctxt)
+{
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ struct desc_struct cs, ss;
+ u64 msr_data, rcx, rdx;
+ int usermode;
+ u16 cs_sel = 0, ss_sel = 0;
+
+ /* inject #GP if in real mode or Virtual 8086 mode */
+ if (ctxt->mode == X86EMUL_MODE_REAL ||
+ ctxt->mode == X86EMUL_MODE_VM86)
+ return emulate_gp(ctxt, 0);
+
+ setup_syscalls_segments(&cs, &ss);
+
+ if (ctxt->rex_bits & REX_W)
+ usermode = X86EMUL_MODE_PROT64;
+ else
+ usermode = X86EMUL_MODE_PROT32;
+
+ rcx = reg_read(ctxt, VCPU_REGS_RCX);
+ rdx = reg_read(ctxt, VCPU_REGS_RDX);
+
+ cs.dpl = 3;
+ ss.dpl = 3;
+ ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data);
+ switch (usermode) {
+ case X86EMUL_MODE_PROT32:
+ cs_sel = (u16)(msr_data + 16);
+ if ((msr_data & 0xfffc) == 0x0)
+ return emulate_gp(ctxt, 0);
+ ss_sel = (u16)(msr_data + 24);
+ rcx = (u32)rcx;
+ rdx = (u32)rdx;
+ break;
+ case X86EMUL_MODE_PROT64:
+ cs_sel = (u16)(msr_data + 32);
+ if (msr_data == 0x0)
+ return emulate_gp(ctxt, 0);
+ ss_sel = cs_sel + 8;
+ cs.d = 0;
+ cs.l = 1;
+ if (emul_is_noncanonical_address(rcx, ctxt, 0) ||
+ emul_is_noncanonical_address(rdx, ctxt, 0))
+ return emulate_gp(ctxt, 0);
+ break;
+ }
+ cs_sel |= SEGMENT_RPL_MASK;
+ ss_sel |= SEGMENT_RPL_MASK;
+
+ ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
+ ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
+
+ ctxt->_eip = rdx;
+ ctxt->mode = usermode;
+ *reg_write(ctxt, VCPU_REGS_RSP) = rcx;
+
+ return X86EMUL_CONTINUE;
+}
+
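+/*
+ * Return true if the current context may not access I/O ports freely: never
+ * in real mode, always in VM86 mode, and otherwise whenever CPL > IOPL.
+ */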
+static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
+{
+ int iopl;
+ if (ctxt->mode == X86EMUL_MODE_REAL)
+ return false;
+ if (ctxt->mode == X86EMUL_MODE_VM86)
+ return true;
+ iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> X86_EFLAGS_IOPL_BIT;
+ return ctxt->ops->cpl(ctxt) > iopl;
+}
+
+#define VMWARE_PORT_VMPORT (0x5658)
+#define VMWARE_PORT_VMRPC (0x5659)
+
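+/*
+ * Consult the I/O permission bitmap in the TSS: the 16-bit bitmap offset is
+ * read from byte 102 of the TSS, and each I/O port maps to one bit.  The
+ * access is allowed only if every bit covering [port, port + len) is clear.
+ */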
+static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
+ u16 port, u16 len)
+{
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ struct desc_struct tr_seg;
+ u32 base3;
+ int r;
+ u16 tr, io_bitmap_ptr, perm, bit_idx = port & 0x7;
+ unsigned mask = (1 << len) - 1;
+ unsigned long base;
+
+ /*
+	 * VMware allows access to these ports even if they are denied by the
+	 * TSS I/O permission bitmap.  Mimic that behavior.
+ */
+ if (enable_vmware_backdoor &&
+ ((port == VMWARE_PORT_VMPORT) || (port == VMWARE_PORT_VMRPC)))
+ return true;
+
+ ops->get_segment(ctxt, &tr, &tr_seg, &base3, VCPU_SREG_TR);
+ if (!tr_seg.p)
+ return false;
+ if (desc_limit_scaled(&tr_seg) < 103)
+ return false;
+ base = get_desc_base(&tr_seg);
+#ifdef CONFIG_X86_64
+ base |= ((u64)base3) << 32;
+#endif
+ r = ops->read_std(ctxt, base + 102, &io_bitmap_ptr, 2, NULL, true);
+ if (r != X86EMUL_CONTINUE)
+ return false;
+ if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg))
+ return false;
+ r = ops->read_std(ctxt, base + io_bitmap_ptr + port/8, &perm, 2, NULL, true);
+ if (r != X86EMUL_CONTINUE)
+ return false;
+ if ((perm >> bit_idx) & mask)
+ return false;
+ return true;
+}
+
+static bool emulator_io_permitted(struct x86_emulate_ctxt *ctxt,
+ u16 port, u16 len)
+{
+ if (ctxt->perm_ok)
+ return true;
+
+ if (emulator_bad_iopl(ctxt))
+ if (!emulator_io_port_access_allowed(ctxt, port, len))
+ return false;
+
+ ctxt->perm_ok = true;
+
+ return true;
+}
+
+static void string_registers_quirk(struct x86_emulate_ctxt *ctxt)
+{
+ /*
+	 * Intel CPUs mask the counter and pointers in a rather strange
+	 * manner when ECX is zero, due to REP-string optimizations.
+ */
+#ifdef CONFIG_X86_64
+ u32 eax, ebx, ecx, edx;
+
+ if (ctxt->ad_bytes != 4)
+ return;
+
+ eax = ecx = 0;
+ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, true);
+ if (!is_guest_vendor_intel(ebx, ecx, edx))
+ return;
+
+ *reg_write(ctxt, VCPU_REGS_RCX) = 0;
+
+ switch (ctxt->b) {
+ case 0xa4: /* movsb */
+ case 0xa5: /* movsd/w */
+ *reg_rmw(ctxt, VCPU_REGS_RSI) &= (u32)-1;
+ fallthrough;
+ case 0xaa: /* stosb */
+ case 0xab: /* stosd/w */
+ *reg_rmw(ctxt, VCPU_REGS_RDI) &= (u32)-1;
+ }
+#endif
+}
+
+static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
+ struct tss_segment_16 *tss)
+{
+ tss->ip = ctxt->_eip;
+ tss->flag = ctxt->eflags;
+ tss->ax = reg_read(ctxt, VCPU_REGS_RAX);
+ tss->cx = reg_read(ctxt, VCPU_REGS_RCX);
+ tss->dx = reg_read(ctxt, VCPU_REGS_RDX);
+ tss->bx = reg_read(ctxt, VCPU_REGS_RBX);
+ tss->sp = reg_read(ctxt, VCPU_REGS_RSP);
+ tss->bp = reg_read(ctxt, VCPU_REGS_RBP);
+ tss->si = reg_read(ctxt, VCPU_REGS_RSI);
+ tss->di = reg_read(ctxt, VCPU_REGS_RDI);
+
+ tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
+ tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
+ tss->ss = get_segment_selector(ctxt, VCPU_SREG_SS);
+ tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS);
+ tss->ldt = get_segment_selector(ctxt, VCPU_SREG_LDTR);
+}
+
+static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
+ struct tss_segment_16 *tss)
+{
+ int ret;
+ u8 cpl;
+
+ ctxt->_eip = tss->ip;
+ ctxt->eflags = tss->flag | 2;
+ *reg_write(ctxt, VCPU_REGS_RAX) = tss->ax;
+ *reg_write(ctxt, VCPU_REGS_RCX) = tss->cx;
+ *reg_write(ctxt, VCPU_REGS_RDX) = tss->dx;
+ *reg_write(ctxt, VCPU_REGS_RBX) = tss->bx;
+ *reg_write(ctxt, VCPU_REGS_RSP) = tss->sp;
+ *reg_write(ctxt, VCPU_REGS_RBP) = tss->bp;
+ *reg_write(ctxt, VCPU_REGS_RSI) = tss->si;
+ *reg_write(ctxt, VCPU_REGS_RDI) = tss->di;
+
+ /*
+ * SDM says that segment selectors are loaded before segment
+ * descriptors
+ */
+ set_segment_selector(ctxt, tss->ldt, VCPU_SREG_LDTR);
+ set_segment_selector(ctxt, tss->es, VCPU_SREG_ES);
+ set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS);
+ set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS);
+ set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS);
+
+ cpl = tss->cs & 3;
+
+ /*
+	 * Now load the segment descriptors. If a fault happens at this stage,
+	 * it is handled in the context of the new task.
+ */
+ ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ return X86EMUL_CONTINUE;
+}
+
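+/*
+ * Perform a task switch via the legacy 16-bit TSS format: save the outgoing
+ * state into the old TSS, read the new TSS, optionally write the back link
+ * to the old task into it, and finally load the incoming state.
+ */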
+static int task_switch_16(struct x86_emulate_ctxt *ctxt, u16 old_tss_sel,
+ ulong old_tss_base, struct desc_struct *new_desc)
+{
+ struct tss_segment_16 tss_seg;
+ int ret;
+ u32 new_tss_base = get_desc_base(new_desc);
+
+ ret = linear_read_system(ctxt, old_tss_base, &tss_seg, sizeof(tss_seg));
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ save_state_to_tss16(ctxt, &tss_seg);
+
+ ret = linear_write_system(ctxt, old_tss_base, &tss_seg, sizeof(tss_seg));
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ ret = linear_read_system(ctxt, new_tss_base, &tss_seg, sizeof(tss_seg));
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ if (old_tss_sel != 0xffff) {
+ tss_seg.prev_task_link = old_tss_sel;
+
+ ret = linear_write_system(ctxt, new_tss_base,
+ &tss_seg.prev_task_link,
+ sizeof(tss_seg.prev_task_link));
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ }
+
+ return load_state_from_tss16(ctxt, &tss_seg);
+}
+
+static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
+ struct tss_segment_32 *tss)
+{
+	/* CR3 and the LDT selector are intentionally not saved */
+ tss->eip = ctxt->_eip;
+ tss->eflags = ctxt->eflags;
+ tss->eax = reg_read(ctxt, VCPU_REGS_RAX);
+ tss->ecx = reg_read(ctxt, VCPU_REGS_RCX);
+ tss->edx = reg_read(ctxt, VCPU_REGS_RDX);
+ tss->ebx = reg_read(ctxt, VCPU_REGS_RBX);
+ tss->esp = reg_read(ctxt, VCPU_REGS_RSP);
+ tss->ebp = reg_read(ctxt, VCPU_REGS_RBP);
+ tss->esi = reg_read(ctxt, VCPU_REGS_RSI);
+ tss->edi = reg_read(ctxt, VCPU_REGS_RDI);
+
+ tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
+ tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
+ tss->ss = get_segment_selector(ctxt, VCPU_SREG_SS);
+ tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS);
+ tss->fs = get_segment_selector(ctxt, VCPU_SREG_FS);
+ tss->gs = get_segment_selector(ctxt, VCPU_SREG_GS);
+}
+
+static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
+ struct tss_segment_32 *tss)
+{
+ int ret;
+ u8 cpl;
+
+ if (ctxt->ops->set_cr(ctxt, 3, tss->cr3))
+ return emulate_gp(ctxt, 0);
+ ctxt->_eip = tss->eip;
+ ctxt->eflags = tss->eflags | 2;
+
+ /* General purpose registers */
+ *reg_write(ctxt, VCPU_REGS_RAX) = tss->eax;
+ *reg_write(ctxt, VCPU_REGS_RCX) = tss->ecx;
+ *reg_write(ctxt, VCPU_REGS_RDX) = tss->edx;
+ *reg_write(ctxt, VCPU_REGS_RBX) = tss->ebx;
+ *reg_write(ctxt, VCPU_REGS_RSP) = tss->esp;
+ *reg_write(ctxt, VCPU_REGS_RBP) = tss->ebp;
+ *reg_write(ctxt, VCPU_REGS_RSI) = tss->esi;
+ *reg_write(ctxt, VCPU_REGS_RDI) = tss->edi;
+
+ /*
+ * SDM says that segment selectors are loaded before segment
+ * descriptors. This is important because CPL checks will
+ * use CS.RPL.
+ */
+ set_segment_selector(ctxt, tss->ldt_selector, VCPU_SREG_LDTR);
+ set_segment_selector(ctxt, tss->es, VCPU_SREG_ES);
+ set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS);
+ set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS);
+ set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS);
+ set_segment_selector(ctxt, tss->fs, VCPU_SREG_FS);
+ set_segment_selector(ctxt, tss->gs, VCPU_SREG_GS);
+
+ /*
+ * If we're switching between Protected Mode and VM86, we need to make
+ * sure to update the mode before loading the segment descriptors so
+ * that the selectors are interpreted correctly.
+ */
+ if (ctxt->eflags & X86_EFLAGS_VM) {
+ ctxt->mode = X86EMUL_MODE_VM86;
+ cpl = 3;
+ } else {
+ ctxt->mode = X86EMUL_MODE_PROT32;
+ cpl = tss->cs & 3;
+ }
+
+ /*
+	 * Now load the segment descriptors. If a fault happens at this stage,
+	 * it is handled in the context of the new task.
+ */
+ ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR,
+ cpl, X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+
+ return ret;
+}
+
+static int task_switch_32(struct x86_emulate_ctxt *ctxt, u16 old_tss_sel,
+ ulong old_tss_base, struct desc_struct *new_desc)
+{
+ struct tss_segment_32 tss_seg;
+ int ret;
+ u32 new_tss_base = get_desc_base(new_desc);
+ u32 eip_offset = offsetof(struct tss_segment_32, eip);
+ u32 ldt_sel_offset = offsetof(struct tss_segment_32, ldt_selector);
+
+ ret = linear_read_system(ctxt, old_tss_base, &tss_seg, sizeof(tss_seg));
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ save_state_to_tss32(ctxt, &tss_seg);
+
+ /* Only GP registers and segment selectors are saved */
+ ret = linear_write_system(ctxt, old_tss_base + eip_offset, &tss_seg.eip,
+ ldt_sel_offset - eip_offset);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ ret = linear_read_system(ctxt, new_tss_base, &tss_seg, sizeof(tss_seg));
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ if (old_tss_sel != 0xffff) {
+ tss_seg.prev_task_link = old_tss_sel;
+
+ ret = linear_write_system(ctxt, new_tss_base,
+ &tss_seg.prev_task_link,
+ sizeof(tss_seg.prev_task_link));
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ }
+
+ return load_state_from_tss32(ctxt, &tss_seg);
+}
+
+static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
+ u16 tss_selector, int idt_index, int reason,
+ bool has_error_code, u32 error_code)
+{
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ struct desc_struct curr_tss_desc, next_tss_desc;
+ int ret;
+ u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR);
+ ulong old_tss_base =
+ ops->get_cached_segment_base(ctxt, VCPU_SREG_TR);
+ u32 desc_limit;
+ ulong desc_addr, dr7;
+
+ /* FIXME: old_tss_base == ~0 ? */
+
+ ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc, &desc_addr);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc, &desc_addr);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ /* FIXME: check that next_tss_desc is tss */
+
+ /*
+ * Check privileges. The three cases are task switch caused by...
+ *
+ * 1. jmp/call/int to task gate: Check against DPL of the task gate
+ * 2. Exception/IRQ/iret: No check is performed
+ * 3. jmp/call to TSS/task-gate: No check is performed since the
+ * hardware checks it before exiting.
+ */
+ if (reason == TASK_SWITCH_GATE) {
+ if (idt_index != -1) {
+ /* Software interrupts */
+ struct desc_struct task_gate_desc;
+ int dpl;
+
+ ret = read_interrupt_descriptor(ctxt, idt_index,
+ &task_gate_desc);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ dpl = task_gate_desc.dpl;
+ if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl)
+ return emulate_gp(ctxt, (idt_index << 3) | 0x2);
+ }
+ }
+
+ desc_limit = desc_limit_scaled(&next_tss_desc);
+ if (!next_tss_desc.p ||
+ ((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
+ desc_limit < 0x2b)) {
+ return emulate_ts(ctxt, tss_selector & 0xfffc);
+ }
+
+ if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
+ curr_tss_desc.type &= ~(1 << 1); /* clear busy flag */
+ write_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc);
+ }
+
+ if (reason == TASK_SWITCH_IRET)
+ ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT;
+
+	/*
+	 * Set the back link to the previous task only if the NT bit is set in
+	 * EFLAGS.  Note that old_tss_sel is not used after this point.
+	 */
+ if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
+ old_tss_sel = 0xffff;
+
+ if (next_tss_desc.type & 8)
+ ret = task_switch_32(ctxt, old_tss_sel, old_tss_base, &next_tss_desc);
+ else
+ ret = task_switch_16(ctxt, old_tss_sel,
+ old_tss_base, &next_tss_desc);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE)
+ ctxt->eflags = ctxt->eflags | X86_EFLAGS_NT;
+
+ if (reason != TASK_SWITCH_IRET) {
+ next_tss_desc.type |= (1 << 1); /* set busy flag */
+ write_segment_descriptor(ctxt, tss_selector, &next_tss_desc);
+ }
+
+ ops->set_cr(ctxt, 0, ops->get_cr(ctxt, 0) | X86_CR0_TS);
+ ops->set_segment(ctxt, tss_selector, &next_tss_desc, 0, VCPU_SREG_TR);
+
+ if (has_error_code) {
+ ctxt->op_bytes = ctxt->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
+ ctxt->lock_prefix = 0;
+ ctxt->src.val = (unsigned long) error_code;
+ ret = em_push(ctxt);
+ }
+
+ dr7 = ops->get_dr(ctxt, 7);
+ ops->set_dr(ctxt, 7, dr7 & ~(DR_LOCAL_ENABLE_MASK | DR_LOCAL_SLOWDOWN));
+
+ return ret;
+}
+
+int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
+ u16 tss_selector, int idt_index, int reason,
+ bool has_error_code, u32 error_code)
+{
+ int rc;
+
+ invalidate_registers(ctxt);
+ ctxt->_eip = ctxt->eip;
+ ctxt->dst.type = OP_NONE;
+
+ rc = emulator_do_task_switch(ctxt, tss_selector, idt_index, reason,
+ has_error_code, error_code);
+
+ if (rc == X86EMUL_CONTINUE) {
+ ctxt->eip = ctxt->_eip;
+ writeback_registers(ctxt);
+ }
+
+ return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
+}
+
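+/*
+ * Advance the SI/DI register used by a string instruction by the number of
+ * elements processed, moving backwards if EFLAGS.DF is set, and refresh the
+ * operand's effective address.
+ */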
+static void string_addr_inc(struct x86_emulate_ctxt *ctxt, int reg,
+ struct operand *op)
+{
+ int df = (ctxt->eflags & X86_EFLAGS_DF) ? -op->count : op->count;
+
+ register_address_increment(ctxt, reg, df * op->bytes);
+ op->addr.mem.ea = register_address(ctxt, reg);
+}
+
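+/*
+ * DAS: decimal-adjust AL after subtraction.  Subtract 6 if the low nibble is
+ * above 9 or AF is set, and 0x60 if the original AL was above 0x99 or CF was
+ * set, then recompute AF, CF and the PF/ZF/SF flags.
+ */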
+static int em_das(struct x86_emulate_ctxt *ctxt)
+{
+ u8 al, old_al;
+ bool af, cf, old_cf;
+
+ cf = ctxt->eflags & X86_EFLAGS_CF;
+ al = ctxt->dst.val;
+
+ old_al = al;
+ old_cf = cf;
+ cf = false;
+ af = ctxt->eflags & X86_EFLAGS_AF;
+ if ((al & 0x0f) > 9 || af) {
+ al -= 6;
+ cf = old_cf | (al >= 250);
+ af = true;
+ } else {
+ af = false;
+ }
+ if (old_al > 0x99 || old_cf) {
+ al -= 0x60;
+ cf = true;
+ }
+
+ ctxt->dst.val = al;
+ /* Set PF, ZF, SF */
+ ctxt->src.type = OP_IMM;
+ ctxt->src.val = 0;
+ ctxt->src.bytes = 1;
+ em_or(ctxt);
+ ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF);
+ if (cf)
+ ctxt->eflags |= X86_EFLAGS_CF;
+ if (af)
+ ctxt->eflags |= X86_EFLAGS_AF;
+ return X86EMUL_CONTINUE;
+}
+
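+/*
+ * AAM: divide AL by the immediate base (#DE if it is zero), storing the
+ * quotient in AH and the remainder in AL, and update PF/ZF/SF.
+ */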
+static int em_aam(struct x86_emulate_ctxt *ctxt)
+{
+ u8 al, ah;
+
+ if (ctxt->src.val == 0)
+ return emulate_de(ctxt);
+
+ al = ctxt->dst.val & 0xff;
+ ah = al / ctxt->src.val;
+ al %= ctxt->src.val;
+
+ ctxt->dst.val = (ctxt->dst.val & 0xffff0000) | al | (ah << 8);
+
+ /* Set PF, ZF, SF */
+ ctxt->src.type = OP_IMM;
+ ctxt->src.val = 0;
+ ctxt->src.bytes = 1;
+ em_or(ctxt);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_aad(struct x86_emulate_ctxt *ctxt)
+{
+ u8 al = ctxt->dst.val & 0xff;
+ u8 ah = (ctxt->dst.val >> 8) & 0xff;
+
+ al = (al + (ah * ctxt->src.val)) & 0xff;
+
+ ctxt->dst.val = (ctxt->dst.val & 0xffff0000) | al;
+
+ /* Set PF, ZF, SF */
+ ctxt->src.type = OP_IMM;
+ ctxt->src.val = 0;
+ ctxt->src.bytes = 1;
+ em_or(ctxt);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_call(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ long rel = ctxt->src.val;
+
+ ctxt->src.val = (unsigned long)ctxt->_eip;
+ rc = jmp_rel(ctxt, rel);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ return em_push(ctxt);
+}
+
+static int em_call_far(struct x86_emulate_ctxt *ctxt)
+{
+ u16 sel, old_cs;
+ ulong old_eip;
+ int rc;
+ struct desc_struct old_desc, new_desc;
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ int cpl = ctxt->ops->cpl(ctxt);
+ enum x86emul_mode prev_mode = ctxt->mode;
+
+ old_eip = ctxt->_eip;
+ ops->get_segment(ctxt, &old_cs, &old_desc, NULL, VCPU_SREG_CS);
+
+ memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
+ rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl,
+ X86_TRANSFER_CALL_JMP, &new_desc);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = assign_eip_far(ctxt, ctxt->src.val);
+ if (rc != X86EMUL_CONTINUE)
+ goto fail;
+
+ ctxt->src.val = old_cs;
+ rc = em_push(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto fail;
+
+ ctxt->src.val = old_eip;
+ rc = em_push(ctxt);
+	/*
+	 * If we failed, we tainted the memory, but at the very least we should
+	 * restore CS.
+	 */
+ if (rc != X86EMUL_CONTINUE) {
+ pr_warn_once("faulting far call emulation tainted memory\n");
+ goto fail;
+ }
+ return rc;
+fail:
+ ops->set_segment(ctxt, old_cs, &old_desc, 0, VCPU_SREG_CS);
+ ctxt->mode = prev_mode;
+ return rc;
+
+}
+
+static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ unsigned long eip = 0;
+
+ rc = emulate_pop(ctxt, &eip, ctxt->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ rc = assign_eip_near(ctxt, eip);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ rsp_increment(ctxt, ctxt->src.val);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_xchg(struct x86_emulate_ctxt *ctxt)
+{
+ /* Write back the register source. */
+ ctxt->src.val = ctxt->dst.val;
+ write_register_operand(&ctxt->src);
+
+ /* Write back the memory destination with implicit LOCK prefix. */
+ ctxt->dst.val = ctxt->src.orig_val;
+ ctxt->lock_prefix = 1;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_imul_3op(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->dst.val = ctxt->src2.val;
+ return em_imul(ctxt);
+}
+
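+/*
+ * CWD/CDQ/CQO: replicate the sign bit of the source accumulator into
+ * DX/EDX/RDX (all ones if negative, zero otherwise).
+ */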
+static int em_cwd(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->dst.type = OP_REG;
+ ctxt->dst.bytes = ctxt->src.bytes;
+ ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX);
+ ctxt->dst.val = ~((ctxt->src.val >> (ctxt->src.bytes * 8 - 1)) - 1);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_rdpid(struct x86_emulate_ctxt *ctxt)
+{
+ u64 tsc_aux = 0;
+
+ if (!ctxt->ops->guest_has_rdpid(ctxt))
+ return emulate_ud(ctxt);
+
+ ctxt->ops->get_msr(ctxt, MSR_TSC_AUX, &tsc_aux);
+ ctxt->dst.val = tsc_aux;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
+{
+ u64 tsc = 0;
+
+ ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc);
+ *reg_write(ctxt, VCPU_REGS_RAX) = (u32)tsc;
+ *reg_write(ctxt, VCPU_REGS_RDX) = tsc >> 32;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_rdpmc(struct x86_emulate_ctxt *ctxt)
+{
+ u64 pmc;
+
+ if (ctxt->ops->read_pmc(ctxt, reg_read(ctxt, VCPU_REGS_RCX), &pmc))
+ return emulate_gp(ctxt, 0);
+ *reg_write(ctxt, VCPU_REGS_RAX) = (u32)pmc;
+ *reg_write(ctxt, VCPU_REGS_RDX) = pmc >> 32;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_mov(struct x86_emulate_ctxt *ctxt)
+{
+ memcpy(ctxt->dst.valptr, ctxt->src.valptr, sizeof(ctxt->src.valptr));
+ return X86EMUL_CONTINUE;
+}
+
+static int em_movbe(struct x86_emulate_ctxt *ctxt)
+{
+ u16 tmp;
+
+ if (!ctxt->ops->guest_has_movbe(ctxt))
+ return emulate_ud(ctxt);
+
+ switch (ctxt->op_bytes) {
+ case 2:
+ /*
+ * From MOVBE definition: "...When the operand size is 16 bits,
+ * the upper word of the destination register remains unchanged
+ * ..."
+ *
+		 * Casting either ->valptr or ->val to u16 breaks strict
+		 * aliasing rules, so the operation has to be done almost by hand.
+ */
+ tmp = (u16)ctxt->src.val;
+ ctxt->dst.val &= ~0xffffUL;
+ ctxt->dst.val |= (unsigned long)swab16(tmp);
+ break;
+ case 4:
+ ctxt->dst.val = swab32((u32)ctxt->src.val);
+ break;
+ case 8:
+ ctxt->dst.val = swab64(ctxt->src.val);
+ break;
+ default:
+ BUG();
+ }
+ return X86EMUL_CONTINUE;
+}
+
+static int em_cr_write(struct x86_emulate_ctxt *ctxt)
+{
+ int cr_num = ctxt->modrm_reg;
+ int r;
+
+ if (ctxt->ops->set_cr(ctxt, cr_num, ctxt->src.val))
+ return emulate_gp(ctxt, 0);
+
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+
+ if (cr_num == 0) {
+ /*
+ * CR0 write might have updated CR0.PE and/or CR0.PG
+ * which can affect the cpu's execution mode.
+ */
+ r = emulator_recalc_and_set_mode(ctxt);
+ if (r != X86EMUL_CONTINUE)
+ return r;
+ }
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_dr_write(struct x86_emulate_ctxt *ctxt)
+{
+ unsigned long val;
+
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ val = ctxt->src.val & ~0ULL;
+ else
+ val = ctxt->src.val & ~0U;
+
+ /* #UD condition is already handled. */
+ if (ctxt->ops->set_dr(ctxt, ctxt->modrm_reg, val) < 0)
+ return emulate_gp(ctxt, 0);
+
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_wrmsr(struct x86_emulate_ctxt *ctxt)
+{
+ u64 msr_index = reg_read(ctxt, VCPU_REGS_RCX);
+ u64 msr_data;
+ int r;
+
+ msr_data = (u32)reg_read(ctxt, VCPU_REGS_RAX)
+ | ((u64)reg_read(ctxt, VCPU_REGS_RDX) << 32);
+ r = ctxt->ops->set_msr_with_filter(ctxt, msr_index, msr_data);
+
+ if (r == X86EMUL_PROPAGATE_FAULT)
+ return emulate_gp(ctxt, 0);
+
+ return r;
+}
+
+static int em_rdmsr(struct x86_emulate_ctxt *ctxt)
+{
+ u64 msr_index = reg_read(ctxt, VCPU_REGS_RCX);
+ u64 msr_data;
+ int r;
+
+ r = ctxt->ops->get_msr_with_filter(ctxt, msr_index, &msr_data);
+
+ if (r == X86EMUL_PROPAGATE_FAULT)
+ return emulate_gp(ctxt, 0);
+
+ if (r == X86EMUL_CONTINUE) {
+ *reg_write(ctxt, VCPU_REGS_RAX) = (u32)msr_data;
+ *reg_write(ctxt, VCPU_REGS_RDX) = msr_data >> 32;
+ }
+ return r;
+}
+
+static int em_store_sreg(struct x86_emulate_ctxt *ctxt, int segment)
+{
+ if (segment > VCPU_SREG_GS &&
+ (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_UMIP) &&
+ ctxt->ops->cpl(ctxt) > 0)
+ return emulate_gp(ctxt, 0);
+
+ ctxt->dst.val = get_segment_selector(ctxt, segment);
+ if (ctxt->dst.bytes == 4 && ctxt->dst.type == OP_MEM)
+ ctxt->dst.bytes = 2;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt)
+{
+ if (ctxt->modrm_reg > VCPU_SREG_GS)
+ return emulate_ud(ctxt);
+
+ return em_store_sreg(ctxt, ctxt->modrm_reg);
+}
+
+static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt)
+{
+ u16 sel = ctxt->src.val;
+
+ if (ctxt->modrm_reg == VCPU_SREG_CS || ctxt->modrm_reg > VCPU_SREG_GS)
+ return emulate_ud(ctxt);
+
+ if (ctxt->modrm_reg == VCPU_SREG_SS)
+ ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
+
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg);
+}
+
+static int em_sldt(struct x86_emulate_ctxt *ctxt)
+{
+ return em_store_sreg(ctxt, VCPU_SREG_LDTR);
+}
+
+static int em_lldt(struct x86_emulate_ctxt *ctxt)
+{
+ u16 sel = ctxt->src.val;
+
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return load_segment_descriptor(ctxt, sel, VCPU_SREG_LDTR);
+}
+
+static int em_str(struct x86_emulate_ctxt *ctxt)
+{
+ return em_store_sreg(ctxt, VCPU_SREG_TR);
+}
+
+static int em_ltr(struct x86_emulate_ctxt *ctxt)
+{
+ u16 sel = ctxt->src.val;
+
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return load_segment_descriptor(ctxt, sel, VCPU_SREG_TR);
+}
+
+static int em_invlpg(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ ulong linear;
+ unsigned int max_size;
+
+ rc = __linearize(ctxt, ctxt->src.addr.mem, &max_size, 1, ctxt->mode,
+ &linear, X86EMUL_F_INVLPG);
+ if (rc == X86EMUL_CONTINUE)
+ ctxt->ops->invlpg(ctxt, linear);
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_clts(struct x86_emulate_ctxt *ctxt)
+{
+ ulong cr0;
+
+ cr0 = ctxt->ops->get_cr(ctxt, 0);
+ cr0 &= ~X86_CR0_TS;
+ ctxt->ops->set_cr(ctxt, 0, cr0);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_hypercall(struct x86_emulate_ctxt *ctxt)
+{
+ int rc = ctxt->ops->fix_hypercall(ctxt);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ /* Let the processor re-execute the fixed hypercall */
+ ctxt->_eip = ctxt->eip;
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int emulate_store_desc_ptr(struct x86_emulate_ctxt *ctxt,
+ void (*get)(struct x86_emulate_ctxt *ctxt,
+ struct desc_ptr *ptr))
+{
+ struct desc_ptr desc_ptr;
+
+ if ((ctxt->ops->get_cr(ctxt, 4) & X86_CR4_UMIP) &&
+ ctxt->ops->cpl(ctxt) > 0)
+ return emulate_gp(ctxt, 0);
+
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ ctxt->op_bytes = 8;
+ get(ctxt, &desc_ptr);
+ if (ctxt->op_bytes == 2) {
+ ctxt->op_bytes = 4;
+ desc_ptr.address &= 0x00ffffff;
+ }
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return segmented_write_std(ctxt, ctxt->dst.addr.mem,
+ &desc_ptr, 2 + ctxt->op_bytes);
+}
+
+static int em_sgdt(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_store_desc_ptr(ctxt, ctxt->ops->get_gdt);
+}
+
+static int em_sidt(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_store_desc_ptr(ctxt, ctxt->ops->get_idt);
+}
+
+static int em_lgdt_lidt(struct x86_emulate_ctxt *ctxt, bool lgdt)
+{
+ struct desc_ptr desc_ptr;
+ int rc;
+
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ ctxt->op_bytes = 8;
+ rc = read_descriptor(ctxt, ctxt->src.addr.mem,
+ &desc_ptr.size, &desc_ptr.address,
+ ctxt->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ if (ctxt->mode == X86EMUL_MODE_PROT64 &&
+ emul_is_noncanonical_address(desc_ptr.address, ctxt,
+ X86EMUL_F_DT_LOAD))
+ return emulate_gp(ctxt, 0);
+ if (lgdt)
+ ctxt->ops->set_gdt(ctxt, &desc_ptr);
+ else
+ ctxt->ops->set_idt(ctxt, &desc_ptr);
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_lgdt(struct x86_emulate_ctxt *ctxt)
+{
+ return em_lgdt_lidt(ctxt, true);
+}
+
+static int em_lidt(struct x86_emulate_ctxt *ctxt)
+{
+ return em_lgdt_lidt(ctxt, false);
+}
+
+static int em_smsw(struct x86_emulate_ctxt *ctxt)
+{
+ if ((ctxt->ops->get_cr(ctxt, 4) & X86_CR4_UMIP) &&
+ ctxt->ops->cpl(ctxt) > 0)
+ return emulate_gp(ctxt, 0);
+
+ if (ctxt->dst.type == OP_MEM)
+ ctxt->dst.bytes = 2;
+ ctxt->dst.val = ctxt->ops->get_cr(ctxt, 0);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_lmsw(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->ops->set_cr(ctxt, 0, (ctxt->ops->get_cr(ctxt, 0) & ~0x0eul)
+ | (ctxt->src.val & 0x0f));
+ ctxt->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
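+/*
+ * LOOP/LOOPE/LOOPNE: decrement (R|E)CX and branch while it is non-zero.  For
+ * LOOPE/LOOPNE, test_cc(ctxt->b ^ 0x5, ...) maps opcodes 0xe0/0xe1 onto the
+ * JNE/JE condition codes so the ZF requirement is checked as well.
+ */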
+static int em_loop(struct x86_emulate_ctxt *ctxt)
+{
+ int rc = X86EMUL_CONTINUE;
+
+ register_address_increment(ctxt, VCPU_REGS_RCX, -1);
+ if ((address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) != 0) &&
+ (ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags)))
+ rc = jmp_rel(ctxt, ctxt->src.val);
+
+ return rc;
+}
+
+static int em_jcxz(struct x86_emulate_ctxt *ctxt)
+{
+ int rc = X86EMUL_CONTINUE;
+
+ if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0)
+ rc = jmp_rel(ctxt, ctxt->src.val);
+
+ return rc;
+}
+
+static int em_in(struct x86_emulate_ctxt *ctxt)
+{
+ if (!pio_in_emulated(ctxt, ctxt->dst.bytes, ctxt->src.val,
+ &ctxt->dst.val))
+ return X86EMUL_IO_NEEDED;
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_out(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->ops->pio_out_emulated(ctxt, ctxt->src.bytes, ctxt->dst.val,
+ &ctxt->src.val, 1);
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_cli(struct x86_emulate_ctxt *ctxt)
+{
+ if (emulator_bad_iopl(ctxt))
+ return emulate_gp(ctxt, 0);
+
+ ctxt->eflags &= ~X86_EFLAGS_IF;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_sti(struct x86_emulate_ctxt *ctxt)
+{
+ if (emulator_bad_iopl(ctxt))
+ return emulate_gp(ctxt, 0);
+
+ ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
+ ctxt->eflags |= X86_EFLAGS_IF;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_cpuid(struct x86_emulate_ctxt *ctxt)
+{
+ u32 eax, ebx, ecx, edx;
+ u64 msr = 0;
+
+ ctxt->ops->get_msr(ctxt, MSR_MISC_FEATURES_ENABLES, &msr);
+ if (msr & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT &&
+ ctxt->ops->cpl(ctxt)) {
+ return emulate_gp(ctxt, 0);
+ }
+
+ eax = reg_read(ctxt, VCPU_REGS_RAX);
+ ecx = reg_read(ctxt, VCPU_REGS_RCX);
+ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
+ *reg_write(ctxt, VCPU_REGS_RAX) = eax;
+ *reg_write(ctxt, VCPU_REGS_RBX) = ebx;
+ *reg_write(ctxt, VCPU_REGS_RCX) = ecx;
+ *reg_write(ctxt, VCPU_REGS_RDX) = edx;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_sahf(struct x86_emulate_ctxt *ctxt)
+{
+ u32 flags;
+
+ flags = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
+ X86_EFLAGS_SF;
+ flags &= *reg_rmw(ctxt, VCPU_REGS_RAX) >> 8;
+
+ ctxt->eflags &= ~0xffUL;
+ ctxt->eflags |= flags | X86_EFLAGS_FIXED;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_lahf(struct x86_emulate_ctxt *ctxt)
+{
+ *reg_rmw(ctxt, VCPU_REGS_RAX) &= ~0xff00UL;
+ *reg_rmw(ctxt, VCPU_REGS_RAX) |= (ctxt->eflags & 0xff) << 8;
+ return X86EMUL_CONTINUE;
+}
+
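+/*
+ * BSWAP.  The 16-bit form is architecturally undefined, so any operand size
+ * other than 8 bytes is handled as a 32-bit swap.
+ */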
+static int em_bswap(struct x86_emulate_ctxt *ctxt)
+{
+ switch (ctxt->op_bytes) {
+#ifdef CONFIG_X86_64
+ case 8:
+ asm("bswap %0" : "+r"(ctxt->dst.val));
+ break;
+#endif
+ default:
+ asm("bswap %0" : "+r"(*(u32 *)&ctxt->dst.val));
+ break;
+ }
+ return X86EMUL_CONTINUE;
+}
+
+static int em_clflush(struct x86_emulate_ctxt *ctxt)
+{
+ /* emulating clflush regardless of cpuid */
+ return X86EMUL_CONTINUE;
+}
+
+static int em_clflushopt(struct x86_emulate_ctxt *ctxt)
+{
+ /* emulating clflushopt regardless of cpuid */
+ return X86EMUL_CONTINUE;
+}
+
+static int em_movsxd(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->dst.val = (s32) ctxt->src.val;
+ return X86EMUL_CONTINUE;
+}
+
+static int check_fxsr(struct x86_emulate_ctxt *ctxt)
+{
+ if (!ctxt->ops->guest_has_fxsr(ctxt))
+ return emulate_ud(ctxt);
+
+ if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
+ return emulate_nm(ctxt);
+
+ /*
+ * Don't emulate a case that should never be hit, instead of working
+ * around a lack of fxsave64/fxrstor64 on old compilers.
+ */
+ if (ctxt->mode >= X86EMUL_MODE_PROT64)
+ return X86EMUL_UNHANDLEABLE;
+
+ return X86EMUL_CONTINUE;
+}
+
+/*
+ * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but does save
+ * and restore MXCSR.
+ */
+static size_t __fxstate_size(int nregs)
+{
+ return offsetof(struct fxregs_state, xmm_space[0]) + nregs * 16;
+}
+
+static inline size_t fxstate_size(struct x86_emulate_ctxt *ctxt)
+{
+ bool cr4_osfxsr;
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ return __fxstate_size(16);
+
+ cr4_osfxsr = ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR;
+ return __fxstate_size(cr4_osfxsr ? 8 : 0);
+}
+
+/*
+ * FXSAVE and FXRSTOR have 4 different formats depending on execution mode,
+ * 1) 16 bit mode
+ * 2) 32 bit mode
+ *    - like (1), but FIP and FDP are only 16 bit.  At least Intel CPUs
+ *      preserve the whole 32 bit values, though, so (1) and (2) are the same
+ *      with respect to save and restore
+ * 3) 64-bit mode without REX.W prefix
+ *    - like (2), but XMM 8-15 are also saved and restored
+ * 4) 64-bit mode with REX.W prefix
+ *    - like (3), but FIP and FDP are 64 bit
+ *
+ * Emulation uses (3) for (1) and (2) and preserves XMM 8-15 to reach the
+ * desired result. (4) is not emulated.
+ *
+ * Note: Guest and host CPUID.(EAX=07H,ECX=0H):EBX[bit 13] (deprecate FPU CS
+ * and FPU DS) should match.
+ */
+static int em_fxsave(struct x86_emulate_ctxt *ctxt)
+{
+ struct fxregs_state fx_state;
+ int rc;
+
+ rc = check_fxsr(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ kvm_fpu_get();
+
+ rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state));
+
+ kvm_fpu_put();
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ return segmented_write_std(ctxt, ctxt->memop.addr.mem, &fx_state,
+ fxstate_size(ctxt));
+}
+
+/*
+ * FXRSTOR might restore XMM registers not provided by the guest. Fill
+ * in the host registers (via FXSAVE) instead, so they won't be modified.
+ * (preemption has to stay disabled until FXRSTOR).
+ *
+ * Use noinline to keep the large on-stack fxregs_state buffer out of the
+ * callers' stack frames.
+ */
+static noinline int fxregs_fixup(struct fxregs_state *fx_state,
+ const size_t used_size)
+{
+ struct fxregs_state fx_tmp;
+ int rc;
+
+ rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_tmp));
+ memcpy((void *)fx_state + used_size, (void *)&fx_tmp + used_size,
+ __fxstate_size(16) - used_size);
+
+ return rc;
+}
+
+static int em_fxrstor(struct x86_emulate_ctxt *ctxt)
+{
+ struct fxregs_state fx_state;
+ int rc;
+ size_t size;
+
+ rc = check_fxsr(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ size = fxstate_size(ctxt);
+ rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, size);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ kvm_fpu_get();
+
+ if (size < __fxstate_size(16)) {
+ rc = fxregs_fixup(&fx_state, size);
+ if (rc != X86EMUL_CONTINUE)
+ goto out;
+ }
+
+ if (fx_state.mxcsr >> 16) {
+ rc = emulate_gp(ctxt, 0);
+ goto out;
+ }
+
+ if (rc == X86EMUL_CONTINUE)
+ rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state));
+
+out:
+ kvm_fpu_put();
+
+ return rc;
+}
+
+static int em_xsetbv(struct x86_emulate_ctxt *ctxt)
+{
+ u32 eax, ecx, edx;
+
+ if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSXSAVE))
+ return emulate_ud(ctxt);
+
+ eax = reg_read(ctxt, VCPU_REGS_RAX);
+ edx = reg_read(ctxt, VCPU_REGS_RDX);
+ ecx = reg_read(ctxt, VCPU_REGS_RCX);
+
+ if (ctxt->ops->set_xcr(ctxt, ecx, ((u64)edx << 32) | eax))
+ return emulate_gp(ctxt, 0);
+
+ return X86EMUL_CONTINUE;
+}
+
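+/* Only CR0, CR2-CR4 and CR8 exist; accessing any other CR number is #UD. */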
+static bool valid_cr(int nr)
+{
+ switch (nr) {
+ case 0:
+ case 2 ... 4:
+ case 8:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static int check_cr_access(struct x86_emulate_ctxt *ctxt)
+{
+ if (!valid_cr(ctxt->modrm_reg))
+ return emulate_ud(ctxt);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_dr_read(struct x86_emulate_ctxt *ctxt)
+{
+ int dr = ctxt->modrm_reg;
+ u64 cr4;
+
+ if (dr > 7)
+ return emulate_ud(ctxt);
+
+ cr4 = ctxt->ops->get_cr(ctxt, 4);
+ if ((cr4 & X86_CR4_DE) && (dr == 4 || dr == 5))
+ return emulate_ud(ctxt);
+
+ if (ctxt->ops->get_dr(ctxt, 7) & DR7_GD) {
+ ulong dr6;
+
+ dr6 = ctxt->ops->get_dr(ctxt, 6);
+ dr6 &= ~DR_TRAP_BITS;
+ dr6 |= DR6_BD | DR6_ACTIVE_LOW;
+ ctxt->ops->set_dr(ctxt, 6, dr6);
+ return emulate_db(ctxt);
+ }
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_dr_write(struct x86_emulate_ctxt *ctxt)
+{
+ u64 new_val = ctxt->src.val64;
+ int dr = ctxt->modrm_reg;
+
+ if ((dr == 6 || dr == 7) && (new_val & 0xffffffff00000000ULL))
+ return emulate_gp(ctxt, 0);
+
+ return check_dr_read(ctxt);
+}
+
+static int check_svme(struct x86_emulate_ctxt *ctxt)
+{
+ u64 efer = 0;
+
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+
+ if (!(efer & EFER_SVME))
+ return emulate_ud(ctxt);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_svme_pa(struct x86_emulate_ctxt *ctxt)
+{
+ u64 rax = reg_read(ctxt, VCPU_REGS_RAX);
+
+ /* Valid physical address? */
+ if (rax & 0xffff000000000000ULL)
+ return emulate_gp(ctxt, 0);
+
+ return check_svme(ctxt);
+}
+
+static int check_rdtsc(struct x86_emulate_ctxt *ctxt)
+{
+ u64 cr4 = ctxt->ops->get_cr(ctxt, 4);
+
+ if (cr4 & X86_CR4_TSD && ctxt->ops->cpl(ctxt))
+ return emulate_gp(ctxt, 0);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
+{
+ u64 cr4 = ctxt->ops->get_cr(ctxt, 4);
+ u64 rcx = reg_read(ctxt, VCPU_REGS_RCX);
+
+ /*
+ * VMware allows access to these pseudo-PMCs even when read via RDPMC
+ * in Ring3 when CR4.PCE=0.
+ */
+ if (enable_vmware_backdoor && is_vmware_backdoor_pmc(rcx))
+ return X86EMUL_CONTINUE;
+
+ /*
+ * If CR4.PCE is set, the SDM requires CPL=0 or CR0.PE=0. The CR0.PE
+ * check however is unnecessary because CPL is always 0 outside
+ * protected mode.
+ */
+ if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) ||
+ ctxt->ops->check_rdpmc_early(ctxt, rcx))
+ return emulate_gp(ctxt, 0);
+
+ return X86EMUL_CONTINUE;
+}
+
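+/*
+ * I/O permission checks for IN/INS and OUT/OUTS: clamp the access width to
+ * four bytes and consult IOPL and the TSS I/O permission bitmap.
+ */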
+static int check_perm_in(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->dst.bytes = min(ctxt->dst.bytes, 4u);
+ if (!emulator_io_permitted(ctxt, ctxt->src.val, ctxt->dst.bytes))
+ return emulate_gp(ctxt, 0);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_perm_out(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->src.bytes = min(ctxt->src.bytes, 4u);
+ if (!emulator_io_permitted(ctxt, ctxt->dst.val, ctxt->src.bytes))
+ return emulate_gp(ctxt, 0);
+
+ return X86EMUL_CONTINUE;
+}
+
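+/*
+ * Shorthands for the decode tables below: D() declares an opcode by decode
+ * flags only, I() adds an emulation callback, DI()/II() additionally name an
+ * intercept, and the *IP variants attach a ->check_perm hook.  G/GD/ID/MD/E/GP
+ * redirect decoding to group, dual, escape or prefix sub-tables, and the
+ * D2bv/I2bv/I6ALU helpers expand into byte and full operand-size variants.
+ */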
+#define D(_y) { .flags = (_y) }
+#define DI(_y, _i) { .flags = (_y)|Intercept, .intercept = x86_intercept_##_i }
+#define DIP(_y, _i, _p) { .flags = (_y)|Intercept|CheckPerm, \
+ .intercept = x86_intercept_##_i, .check_perm = (_p) }
+#define N D(NotImpl)
+#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) }
+#define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) }
+#define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) }
+#define ID(_f, _i) { .flags = ((_f) | InstrDual | ModRM), .u.idual = (_i) }
+#define MD(_f, _m) { .flags = ((_f) | ModeDual), .u.mdual = (_m) }
+#define E(_f, _e) { .flags = ((_f) | Escape | ModRM), .u.esc = (_e) }
+#define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
+#define II(_f, _e, _i) \
+ { .flags = (_f)|Intercept, .u.execute = (_e), .intercept = x86_intercept_##_i }
+#define IIP(_f, _e, _i, _p) \
+ { .flags = (_f)|Intercept|CheckPerm, .u.execute = (_e), \
+ .intercept = x86_intercept_##_i, .check_perm = (_p) }
+#define GP(_f, _g) { .flags = ((_f) | Prefix), .u.gprefix = (_g) }
+
+#define D2bv(_f) D((_f) | ByteOp), D(_f)
+#define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p)
+#define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e)
+#define F2bv(_f, _e) F((_f) | ByteOp, _e), F(_f, _e)
+#define I2bvIP(_f, _e, _i, _p) \
+ IIP((_f) | ByteOp, _e, _i, _p), IIP(_f, _e, _i, _p)
+
+#define I6ALU(_f, _e) I2bv((_f) | DstMem | SrcReg | ModRM, _e), \
+ I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \
+ I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e)
+
+static const struct opcode ud = I(SrcNone, emulate_ud);
+
+static const struct opcode group7_rm0[] = {
+ N,
+ I(SrcNone | Priv | EmulateOnUD, em_hypercall),
+ N, N, N, N, N, N,
+};
+
+static const struct opcode group7_rm1[] = {
+ DI(SrcNone | Priv, monitor),
+ DI(SrcNone | Priv, mwait),
+ N, N, N, N, N, N,
+};
+
+static const struct opcode group7_rm2[] = {
+ N,
+ II(ImplicitOps | Priv, em_xsetbv, xsetbv),
+ N, N, N, N, N, N,
+};
+
+static const struct opcode group7_rm3[] = {
+ DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa),
+ II(SrcNone | Prot | EmulateOnUD, em_hypercall, vmmcall),
+ DIP(SrcNone | Prot | Priv, vmload, check_svme_pa),
+ DIP(SrcNone | Prot | Priv, vmsave, check_svme_pa),
+ DIP(SrcNone | Prot | Priv, stgi, check_svme),
+ DIP(SrcNone | Prot | Priv, clgi, check_svme),
+ DIP(SrcNone | Prot | Priv, skinit, check_svme),
+ DIP(SrcNone | Prot | Priv, invlpga, check_svme),
+};
+
+static const struct opcode group7_rm7[] = {
+ N,
+ DIP(SrcNone, rdtscp, check_rdtsc),
+ N, N, N, N, N, N,
+};
+
+static const struct opcode group1[] = {
+ I(Lock, em_add),
+ I(Lock | PageTable, em_or),
+ I(Lock, em_adc),
+ I(Lock, em_sbb),
+ I(Lock | PageTable, em_and),
+ I(Lock, em_sub),
+ I(Lock, em_xor),
+ I(NoWrite, em_cmp),
+};
+
+static const struct opcode group1A[] = {
+ I(DstMem | SrcNone | Mov | Stack | IncSP | TwoMemOp, em_pop), N, N, N, N, N, N, N,
+};
+
+static const struct opcode group2[] = {
+ I(DstMem | ModRM, em_rol),
+ I(DstMem | ModRM, em_ror),
+ I(DstMem | ModRM, em_rcl),
+ I(DstMem | ModRM, em_rcr),
+ I(DstMem | ModRM, em_shl),
+ I(DstMem | ModRM, em_shr),
+ I(DstMem | ModRM, em_shl),
+ I(DstMem | ModRM, em_sar),
+};
+
+static const struct opcode group3[] = {
+ I(DstMem | SrcImm | NoWrite, em_test),
+ I(DstMem | SrcImm | NoWrite, em_test),
+ I(DstMem | SrcNone | Lock, em_not),
+ I(DstMem | SrcNone | Lock, em_neg),
+ I(DstXacc | Src2Mem, em_mul_ex),
+ I(DstXacc | Src2Mem, em_imul_ex),
+ I(DstXacc | Src2Mem, em_div_ex),
+ I(DstXacc | Src2Mem, em_idiv_ex),
+};
+
+static const struct opcode group4[] = {
+ I(ByteOp | DstMem | SrcNone | Lock, em_inc),
+ I(ByteOp | DstMem | SrcNone | Lock, em_dec),
+ N, N, N, N, N, N,
+};
+
+static const struct opcode group5[] = {
+ I(DstMem | SrcNone | Lock, em_inc),
+ I(DstMem | SrcNone | Lock, em_dec),
+ I(SrcMem | NearBranch | IsBranch | ShadowStack, em_call_near_abs),
+ I(SrcMemFAddr | ImplicitOps | IsBranch | ShadowStack, em_call_far),
+ I(SrcMem | NearBranch | IsBranch, em_jmp_abs),
+ I(SrcMemFAddr | ImplicitOps | IsBranch, em_jmp_far),
+ I(SrcMem | Stack | TwoMemOp, em_push), D(Undefined),
+};
+
+static const struct opcode group6[] = {
+ II(Prot | DstMem, em_sldt, sldt),
+ II(Prot | DstMem, em_str, str),
+ II(Prot | Priv | SrcMem16, em_lldt, lldt),
+ II(Prot | Priv | SrcMem16, em_ltr, ltr),
+ N, N, N, N,
+};
+
+static const struct group_dual group7 = { {
+ II(Mov | DstMem, em_sgdt, sgdt),
+ II(Mov | DstMem, em_sidt, sidt),
+ II(SrcMem | Priv, em_lgdt, lgdt),
+ II(SrcMem | Priv, em_lidt, lidt),
+ II(SrcNone | DstMem | Mov, em_smsw, smsw), N,
+ II(SrcMem16 | Mov | Priv, em_lmsw, lmsw),
+ II(SrcMem | ByteOp | Priv | NoAccess, em_invlpg, invlpg),
+}, {
+ EXT(0, group7_rm0),
+ EXT(0, group7_rm1),
+ EXT(0, group7_rm2),
+ EXT(0, group7_rm3),
+ II(SrcNone | DstMem | Mov, em_smsw, smsw), N,
+ II(SrcMem16 | Mov | Priv, em_lmsw, lmsw),
+ EXT(0, group7_rm7),
+} };
+
+static const struct opcode group8[] = {
+ N, N, N, N,
+ I(DstMem | SrcImmByte | NoWrite, em_bt),
+ I(DstMem | SrcImmByte | Lock | PageTable, em_bts),
+ I(DstMem | SrcImmByte | Lock, em_btr),
+ I(DstMem | SrcImmByte | Lock | PageTable, em_btc),
+};
+
+/*
+ * The "memory" destination is actually always a register, since we come
+ * from the register case of group9.
+ */
+static const struct gprefix pfx_0f_c7_7 = {
+ N, N, N, II(DstMem | ModRM | Op3264 | EmulateOnUD, em_rdpid, rdpid),
+};
+
+
+static const struct group_dual group9 = { {
+ N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N,
+}, {
+ N, N, N, N, N, N, N,
+ GP(0, &pfx_0f_c7_7),
+} };
+
+static const struct opcode group11[] = {
+ I(DstMem | SrcImm | Mov | PageTable, em_mov),
+ X7(D(Undefined)),
+};
+
+static const struct gprefix pfx_0f_ae_7 = {
+ I(SrcMem | ByteOp, em_clflush), I(SrcMem | ByteOp, em_clflushopt), N, N,
+};
+
+static const struct group_dual group15 = { {
+ I(ModRM | Aligned16, em_fxsave),
+ I(ModRM | Aligned16, em_fxrstor),
+ N, N, N, N, N, GP(0, &pfx_0f_ae_7),
+}, {
+ N, N, N, N, N, N, N, N,
+} };
+
+static const struct gprefix pfx_0f_6f_0f_7f = {
+ I(Mmx, em_mov), I(Sse | Avx | Aligned, em_mov), N, I(Sse | Avx | Unaligned, em_mov),
+};
+
+static const struct instr_dual instr_dual_0f_2b = {
+ I(0, em_mov), N
+};
+
+static const struct gprefix pfx_0f_2b = {
+ ID(0, &instr_dual_0f_2b), ID(0, &instr_dual_0f_2b), N, N,
+};
+
+static const struct gprefix pfx_0f_10_0f_11 = {
+ I(Unaligned, em_mov), I(Unaligned, em_mov), N, N,
+};
+
+static const struct gprefix pfx_0f_28_0f_29 = {
+ I(Aligned, em_mov), I(Aligned, em_mov), N, N,
+};
+
+static const struct gprefix pfx_0f_e7_0f_38_2a = {
+ N, I(Sse | Avx, em_mov), N, N,
+};
+
+static const struct escape escape_d9 = { {
+ N, N, N, N, N, N, N, I(DstMem16 | Mov, em_fnstcw),
+}, {
+ /* 0xC0 - 0xC7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xC8 - 0xCF */
+ N, N, N, N, N, N, N, N,
+	/* 0xD0 - 0xD7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xD8 - 0xDF */
+ N, N, N, N, N, N, N, N,
+ /* 0xE0 - 0xE7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xE8 - 0xEF */
+ N, N, N, N, N, N, N, N,
+ /* 0xF0 - 0xF7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xF8 - 0xFF */
+ N, N, N, N, N, N, N, N,
+} };
+
+static const struct escape escape_db = { {
+ N, N, N, N, N, N, N, N,
+}, {
+ /* 0xC0 - 0xC7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xC8 - 0xCF */
+ N, N, N, N, N, N, N, N,
+	/* 0xD0 - 0xD7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xD8 - 0xDF */
+ N, N, N, N, N, N, N, N,
+ /* 0xE0 - 0xE7 */
+ N, N, N, I(ImplicitOps, em_fninit), N, N, N, N,
+ /* 0xE8 - 0xEF */
+ N, N, N, N, N, N, N, N,
+ /* 0xF0 - 0xF7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xF8 - 0xFF */
+ N, N, N, N, N, N, N, N,
+} };
+
+static const struct escape escape_dd = { {
+ N, N, N, N, N, N, N, I(DstMem16 | Mov, em_fnstsw),
+}, {
+ /* 0xC0 - 0xC7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xC8 - 0xCF */
+ N, N, N, N, N, N, N, N,
+	/* 0xD0 - 0xD7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xD8 - 0xDF */
+ N, N, N, N, N, N, N, N,
+ /* 0xE0 - 0xE7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xE8 - 0xEF */
+ N, N, N, N, N, N, N, N,
+ /* 0xF0 - 0xF7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xF8 - 0xFF */
+ N, N, N, N, N, N, N, N,
+} };
+
+static const struct instr_dual instr_dual_0f_c3 = {
+ I(DstMem | SrcReg | ModRM | No16 | Mov, em_mov), N
+};
+
+static const struct mode_dual mode_dual_63 = {
+ N, I(DstReg | SrcMem32 | ModRM | Mov, em_movsxd)
+};
+
+static const struct instr_dual instr_dual_8d = {
+ D(DstReg | SrcMem | ModRM | NoAccess), N
+};
+
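+/* Main decode table for one-byte opcodes, indexed directly by the opcode. */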
+static const struct opcode opcode_table[256] = {
+ /* 0x00 - 0x07 */
+ I6ALU(Lock, em_add),
+ I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg),
+ I(ImplicitOps | Stack | No64 | Src2ES, em_pop_sreg),
+ /* 0x08 - 0x0F */
+ I6ALU(Lock | PageTable, em_or),
+ I(ImplicitOps | Stack | No64 | Src2CS, em_push_sreg),
+ N,
+ /* 0x10 - 0x17 */
+ I6ALU(Lock, em_adc),
+ I(ImplicitOps | Stack | No64 | Src2SS, em_push_sreg),
+ I(ImplicitOps | Stack | No64 | Src2SS, em_pop_sreg),
+ /* 0x18 - 0x1F */
+ I6ALU(Lock, em_sbb),
+ I(ImplicitOps | Stack | No64 | Src2DS, em_push_sreg),
+ I(ImplicitOps | Stack | No64 | Src2DS, em_pop_sreg),
+ /* 0x20 - 0x27 */
+ I6ALU(Lock | PageTable, em_and), N, N,
+ /* 0x28 - 0x2F */
+ I6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das),
+ /* 0x30 - 0x37 */
+ I6ALU(Lock, em_xor), N, N,
+ /* 0x38 - 0x3F */
+ I6ALU(NoWrite, em_cmp), N, N,
+ /* 0x40 - 0x4F */
+ X8(I(DstReg, em_inc)), X8(I(DstReg, em_dec)),
+ /* 0x50 - 0x57 */
+ X8(I(SrcReg | Stack, em_push)),
+ /* 0x58 - 0x5F */
+ X8(I(DstReg | Stack, em_pop)),
+ /* 0x60 - 0x67 */
+ I(ImplicitOps | Stack | No64, em_pusha),
+ I(ImplicitOps | Stack | No64, em_popa),
+ N, MD(ModRM, &mode_dual_63),
+ N, N, N, N,
+ /* 0x68 - 0x6F */
+ I(SrcImm | Mov | Stack, em_push),
+ I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op),
+ I(SrcImmByte | Mov | Stack, em_push),
+ I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op),
+ I2bvIP(DstDI | SrcDX | Mov | String | Unaligned, em_in, ins, check_perm_in), /* insb, insw/insd */
+ I2bvIP(SrcSI | DstDX | String, em_out, outs, check_perm_out), /* outsb, outsw/outsd */
+ /* 0x70 - 0x7F */
+ X16(D(SrcImmByte | NearBranch | IsBranch)),
+ /* 0x80 - 0x87 */
+ G(ByteOp | DstMem | SrcImm, group1),
+ G(DstMem | SrcImm, group1),
+ G(ByteOp | DstMem | SrcImm | No64, group1),
+ G(DstMem | SrcImmByte, group1),
+ I2bv(DstMem | SrcReg | ModRM | NoWrite, em_test),
+ I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_xchg),
+ /* 0x88 - 0x8F */
+ I2bv(DstMem | SrcReg | ModRM | Mov | PageTable, em_mov),
+ I2bv(DstReg | SrcMem | ModRM | Mov, em_mov),
+ I(DstMem | SrcNone | ModRM | Mov | PageTable, em_mov_rm_sreg),
+ ID(0, &instr_dual_8d),
+ I(ImplicitOps | SrcMem16 | ModRM, em_mov_sreg_rm),
+ G(0, group1A),
+ /* 0x90 - 0x97 */
+ DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)),
+ /* 0x98 - 0x9F */
+ D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd),
+ I(SrcImmFAddr | No64 | IsBranch | ShadowStack, em_call_far), N,
+ II(ImplicitOps | Stack, em_pushf, pushf),
+ II(ImplicitOps | Stack, em_popf, popf),
+ I(ImplicitOps, em_sahf), I(ImplicitOps, em_lahf),
+ /* 0xA0 - 0xA7 */
+ I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
+ I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov),
+ I2bv(SrcSI | DstDI | Mov | String | TwoMemOp, em_mov),
+ I2bv(SrcSI | DstDI | String | NoWrite | TwoMemOp, em_cmp_r),
+ /* 0xA8 - 0xAF */
+ I2bv(DstAcc | SrcImm | NoWrite, em_test),
+ I2bv(SrcAcc | DstDI | Mov | String, em_mov),
+ I2bv(SrcSI | DstAcc | Mov | String, em_mov),
+ I2bv(SrcAcc | DstDI | String | NoWrite, em_cmp_r),
+ /* 0xB0 - 0xB7 */
+ X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)),
+ /* 0xB8 - 0xBF */
+ X8(I(DstReg | SrcImm64 | Mov, em_mov)),
+ /* 0xC0 - 0xC7 */
+ G(ByteOp | Src2ImmByte, group2), G(Src2ImmByte, group2),
+ I(ImplicitOps | NearBranch | SrcImmU16 | IsBranch | ShadowStack, em_ret_near_imm),
+ I(ImplicitOps | NearBranch | IsBranch | ShadowStack, em_ret),
+ I(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES, em_lseg),
+ I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg),
+ G(ByteOp, group11), G(0, group11),
+ /* 0xC8 - 0xCF */
+ I(Stack | SrcImmU16 | Src2ImmByte, em_enter),
+ I(Stack, em_leave),
+ I(ImplicitOps | SrcImmU16 | IsBranch | ShadowStack, em_ret_far_imm),
+ I(ImplicitOps | IsBranch | ShadowStack, em_ret_far),
+ D(ImplicitOps | IsBranch), DI(SrcImmByte | IsBranch | ShadowStack, intn),
+ D(ImplicitOps | No64 | IsBranch),
+ II(ImplicitOps | IsBranch | ShadowStack, em_iret, iret),
+ /* 0xD0 - 0xD7 */
+ G(Src2One | ByteOp, group2), G(Src2One, group2),
+ G(Src2CL | ByteOp, group2), G(Src2CL, group2),
+ I(DstAcc | SrcImmUByte | No64, em_aam),
+ I(DstAcc | SrcImmUByte | No64, em_aad),
+ I(DstAcc | ByteOp | No64, em_salc),
+ I(DstAcc | SrcXLat | ByteOp, em_mov),
+ /* 0xD8 - 0xDF */
+ N, E(0, &escape_d9), N, E(0, &escape_db), N, E(0, &escape_dd), N, N,
+ /* 0xE0 - 0xE7 */
+ X3(I(SrcImmByte | NearBranch | IsBranch, em_loop)),
+ I(SrcImmByte | NearBranch | IsBranch, em_jcxz),
+ I2bvIP(SrcImmUByte | DstAcc, em_in, in, check_perm_in),
+ I2bvIP(SrcAcc | DstImmUByte, em_out, out, check_perm_out),
+ /* 0xE8 - 0xEF */
+ I(SrcImm | NearBranch | IsBranch | ShadowStack, em_call),
+ D(SrcImm | ImplicitOps | NearBranch | IsBranch),
+ I(SrcImmFAddr | No64 | IsBranch, em_jmp_far),
+ D(SrcImmByte | ImplicitOps | NearBranch | IsBranch),
+ I2bvIP(SrcDX | DstAcc, em_in, in, check_perm_in),
+ I2bvIP(SrcAcc | DstDX, em_out, out, check_perm_out),
+ /* 0xF0 - 0xF7 */
+ N, DI(ImplicitOps, icebp), N, N,
+ DI(ImplicitOps | Priv, hlt), D(ImplicitOps),
+ G(ByteOp, group3), G(0, group3),
+ /* 0xF8 - 0xFF */
+ D(ImplicitOps), D(ImplicitOps),
+ I(ImplicitOps, em_cli), I(ImplicitOps, em_sti),
+ D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5),
+};
+
+static const struct opcode twobyte_table[256] = {
+ /* 0x00 - 0x0F */
+ G(0, group6), GD(0, &group7), N, N,
+ N, I(ImplicitOps | EmulateOnUD | IsBranch | ShadowStack, em_syscall),
+ II(ImplicitOps | Priv, em_clts, clts), N,
+ DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N,
+ N, D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N,
+ /* 0x10 - 0x1F */
+ GP(ModRM | DstReg | SrcMem | Mov | Sse | Avx, &pfx_0f_10_0f_11),
+ GP(ModRM | DstMem | SrcReg | Mov | Sse | Avx, &pfx_0f_10_0f_11),
+ N, N, N, N, N, N,
+ D(ImplicitOps | ModRM | SrcMem | NoAccess), /* 4 * prefetch + 4 * reserved NOP */
+ D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N,
+ D(ImplicitOps | ModRM | SrcMem | NoAccess), /* 8 * reserved NOP */
+ D(ImplicitOps | ModRM | SrcMem | NoAccess), /* 8 * reserved NOP */
+ D(ImplicitOps | ModRM | SrcMem | NoAccess), /* 8 * reserved NOP */
+ D(ImplicitOps | ModRM | SrcMem | NoAccess), /* NOP + 7 * reserved NOP */
+ /* 0x20 - 0x2F */
+ DIP(ModRM | DstMem | Priv | Op3264 | NoMod, cr_read, check_cr_access),
+ DIP(ModRM | DstMem | Priv | Op3264 | NoMod, dr_read, check_dr_read),
+ IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_cr_write, cr_write,
+ check_cr_access),
+ IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_dr_write, dr_write,
+ check_dr_write),
+ N, N, N, N,
+ GP(ModRM | DstReg | SrcMem | Mov | Sse | Avx, &pfx_0f_28_0f_29),
+ GP(ModRM | DstMem | SrcReg | Mov | Sse | Avx, &pfx_0f_28_0f_29),
+ N, GP(ModRM | DstMem | SrcReg | Mov | Sse | Avx, &pfx_0f_2b),
+ N, N, N, N,
+ /* 0x30 - 0x3F */
+ II(ImplicitOps | Priv, em_wrmsr, wrmsr),
+ IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
+ II(ImplicitOps | Priv, em_rdmsr, rdmsr),
+ IIP(ImplicitOps, em_rdpmc, rdpmc, check_rdpmc),
+ I(ImplicitOps | EmulateOnUD | IsBranch | ShadowStack, em_sysenter),
+ I(ImplicitOps | Priv | EmulateOnUD | IsBranch | ShadowStack, em_sysexit),
+ N, N,
+ N, N, N, N, N, N, N, N,
+ /* 0x40 - 0x4F */
+ X16(D(DstReg | SrcMem | ModRM)),
+ /* 0x50 - 0x5F */
+ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
+ /* 0x60 - 0x6F */
+ N, N, N, N,
+ N, N, N, N,
+ N, N, N, N,
+ N, N, N, GP(SrcMem | DstReg | ModRM | Mov, &pfx_0f_6f_0f_7f),
+ /* 0x70 - 0x7F */
+ N, N, N, N,
+ N, N, N, N,
+ N, N, N, N,
+ N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_6f_0f_7f),
+ /* 0x80 - 0x8F */
+ X16(D(SrcImm | NearBranch | IsBranch)),
+ /* 0x90 - 0x9F */
+ X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
+ /* 0xA0 - 0xA7 */
+ I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg),
+ II(ImplicitOps, em_cpuid, cpuid),
+ I(DstMem | SrcReg | ModRM | BitOp | NoWrite, em_bt),
+ I(DstMem | SrcReg | Src2ImmByte | ModRM, em_shld),
+ I(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N,
+ /* 0xA8 - 0xAF */
+ I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg),
+ II(EmulateOnUD | ImplicitOps, em_rsm, rsm),
+ I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts),
+ I(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd),
+ I(DstMem | SrcReg | Src2CL | ModRM, em_shrd),
+ GD(0, &group15), I(DstReg | SrcMem | ModRM, em_imul),
+ /* 0xB0 - 0xB7 */
+ I2bv(DstMem | SrcReg | ModRM | Lock | PageTable | SrcWrite, em_cmpxchg),
+ I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg),
+ I(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr),
+ I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg),
+ I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg),
+ D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
+ /* 0xB8 - 0xBF */
+ N, N,
+ G(BitOp, group8),
+ I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),
+ I(DstReg | SrcMem | ModRM, em_bsf_c),
+ I(DstReg | SrcMem | ModRM, em_bsr_c),
+ D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
+ /* 0xC0 - 0xC7 */
+ I2bv(DstMem | SrcReg | ModRM | SrcWrite | Lock, em_xadd),
+ N, ID(0, &instr_dual_0f_c3),
+ N, N, N, GD(0, &group9),
+ /* 0xC8 - 0xCF */
+ X8(I(DstReg, em_bswap)),
+ /* 0xD0 - 0xDF */
+ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
+ /* 0xE0 - 0xEF */
+ N, N, N, N, N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_e7_0f_38_2a),
+ N, N, N, N, N, N, N, N,
+ /* 0xF0 - 0xFF */
+ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N
+};
+
+static const struct instr_dual instr_dual_0f_38_f0 = {
+ I(DstReg | SrcMem | Mov, em_movbe), N
+};
+
+static const struct instr_dual instr_dual_0f_38_f1 = {
+ I(DstMem | SrcReg | Mov, em_movbe), N
+};
+
+static const struct gprefix three_byte_0f_38_f0 = {
+ ID(0, &instr_dual_0f_38_f0), ID(0, &instr_dual_0f_38_f0), N, N
+};
+
+static const struct gprefix three_byte_0f_38_f1 = {
+ ID(0, &instr_dual_0f_38_f1), ID(0, &instr_dual_0f_38_f1), N, N
+};
+
+/*
+ * Insns below are selected by the prefix, which is indexed by the third
+ * opcode byte.
+ */
+static const struct opcode opcode_map_0f_38[256] = {
+ /* 0x00 - 0x1f */
+ X16(N), X16(N),
+ /* 0x20 - 0x2f */
+ X8(N),
+ X2(N), GP(SrcReg | DstMem | ModRM | Mov | Aligned, &pfx_0f_e7_0f_38_2a), N, N, N, N, N,
+ /* 0x30 - 0x7f */
+ X16(N), X16(N), X16(N), X16(N), X16(N),
+ /* 0x80 - 0xef */
+ X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N),
+ /* 0xf0 - 0xf1 */
+ GP(EmulateOnUD | ModRM, &three_byte_0f_38_f0),
+ GP(EmulateOnUD | ModRM, &three_byte_0f_38_f1),
+ /* 0xf2 - 0xff */
+ N, N, X4(N), X8(N)
+};
+
+#undef D
+#undef N
+#undef G
+#undef GD
+#undef I
+#undef GP
+#undef EXT
+#undef MD
+#undef ID
+
+#undef D2bv
+#undef D2bvIP
+#undef I2bv
+#undef I2bvIP
+#undef I6ALU
+
+static bool is_shstk_instruction(struct x86_emulate_ctxt *ctxt)
+{
+ return ctxt->d & ShadowStack;
+}
+
+static bool is_ibt_instruction(struct x86_emulate_ctxt *ctxt)
+{
+ u64 flags = ctxt->d;
+
+ if (!(flags & IsBranch))
+ return false;
+
+ /*
+ * All far JMPs and CALLs (including SYSCALL, SYSENTER, and INTn) are
+ * indirect and thus affect IBT state. All far RETs (including SYSEXIT
+ * and IRET) are protected via Shadow Stacks and thus don't affect IBT
+ * state. IRET #GPs when returning to virtual-8086 and IBT or SHSTK is
+ * enabled, but that should be handled by IRET emulation (in the very
+ * unlikely scenario that KVM adds support for fully emulating IRET).
+ */
+ if (!(flags & NearBranch))
+ return ctxt->execute != em_iret &&
+ ctxt->execute != em_ret_far &&
+ ctxt->execute != em_ret_far_imm &&
+ ctxt->execute != em_sysexit;
+
+ switch (flags & SrcMask) {
+ case SrcReg:
+ case SrcMem:
+ case SrcMem16:
+ case SrcMem32:
+ return true;
+ case SrcMemFAddr:
+ case SrcImmFAddr:
+ /* Far branches should be handled above. */
+ WARN_ON_ONCE(1);
+ return true;
+ case SrcNone:
+ case SrcImm:
+ case SrcImmByte:
+ /*
+ * Note, ImmU16 is used only for the stack adjustment operand on ENTER
+ * and RET instructions. ENTER isn't a branch and RET FAR is handled
+ * by the NearBranch check above. RET itself isn't an indirect branch.
+ */
+ case SrcImmU16:
+ return false;
+ default:
+ WARN_ONCE(1, "Unexpected Src operand '%llx' on branch",
+ flags & SrcMask);
+ return false;
+ }
+}
+
+static unsigned imm_size(struct x86_emulate_ctxt *ctxt)
+{
+ unsigned size;
+
+ size = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
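+ /* 64-bit ops still take a 4-byte (sign-extended) immediate; full 8-byte immediates go through OpImm64. */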
+ if (size == 8)
+ size = 4;
+ return size;
+}
+
+static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
+ unsigned size, bool sign_extension)
+{
+ int rc = X86EMUL_CONTINUE;
+
+ op->type = OP_IMM;
+ op->bytes = size;
+ op->addr.mem.ea = ctxt->_eip;
+ /* NB. Immediates are sign-extended as necessary. */
+ switch (op->bytes) {
+ case 1:
+ op->val = insn_fetch(s8, ctxt);
+ break;
+ case 2:
+ op->val = insn_fetch(s16, ctxt);
+ break;
+ case 4:
+ op->val = insn_fetch(s32, ctxt);
+ break;
+ case 8:
+ op->val = insn_fetch(s64, ctxt);
+ break;
+ }
+ if (!sign_extension) {
+ switch (op->bytes) {
+ case 1:
+ op->val &= 0xff;
+ break;
+ case 2:
+ op->val &= 0xffff;
+ break;
+ case 4:
+ op->val &= 0xffffffff;
+ break;
+ }
+ }
+done:
+ return rc;
+}
+
+static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
+ unsigned d)
+{
+ int rc = X86EMUL_CONTINUE;
+
+ switch (d) {
+ case OpReg:
+ decode_register_operand(ctxt, op);
+ break;
+ case OpImmUByte:
+ rc = decode_imm(ctxt, op, 1, false);
+ break;
+ case OpMem:
+ ctxt->memop.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ mem_common:
+ *op = ctxt->memop;
+ ctxt->memopp = op;
+ if (ctxt->d & BitOp)
+ fetch_bit_operand(ctxt);
+ op->orig_val = op->val;
+ break;
+ case OpMem64:
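+ /* 8-byte memory operand, or 16 bytes with a 64-bit operand size (CMPXCHG8B/16B). */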
+ ctxt->memop.bytes = (ctxt->op_bytes == 8) ? 16 : 8;
+ goto mem_common;
+ case OpAcc:
+ op->type = OP_REG;
+ op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
+ fetch_register_operand(op);
+ break;
+ case OpAccLo:
+ op->type = OP_REG;
+ op->bytes = (ctxt->d & ByteOp) ? 2 : ctxt->op_bytes;
+ op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
+ fetch_register_operand(op);
+ break;
+ case OpAccHi:
+ if (ctxt->d & ByteOp) {
+ op->type = OP_NONE;
+ break;
+ }
+ op->type = OP_REG;
+ op->bytes = ctxt->op_bytes;
+ op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX);
+ fetch_register_operand(op);
+ break;
+ case OpDI:
+ op->type = OP_MEM;
+ op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ op->addr.mem.ea =
+ register_address(ctxt, VCPU_REGS_RDI);
+ op->addr.mem.seg = VCPU_SREG_ES;
+ op->val = 0;
+ op->count = 1;
+ break;
+ case OpDX:
+ op->type = OP_REG;
+ op->bytes = 2;
+ op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX);
+ fetch_register_operand(op);
+ break;
+ case OpCL:
+ op->type = OP_IMM;
+ op->bytes = 1;
+ op->val = reg_read(ctxt, VCPU_REGS_RCX) & 0xff;
+ break;
+ case OpImmByte:
+ rc = decode_imm(ctxt, op, 1, true);
+ break;
+ case OpOne:
+ op->type = OP_IMM;
+ op->bytes = 1;
+ op->val = 1;
+ break;
+ case OpImm:
+ rc = decode_imm(ctxt, op, imm_size(ctxt), true);
+ break;
+ case OpImm64:
+ rc = decode_imm(ctxt, op, ctxt->op_bytes, true);
+ break;
+ case OpMem8:
+ ctxt->memop.bytes = 1;
+ if (ctxt->memop.type == OP_REG) {
+ ctxt->memop.addr.reg = decode_register(ctxt,
+ ctxt->modrm_rm, true);
+ fetch_register_operand(&ctxt->memop);
+ }
+ goto mem_common;
+ case OpMem16:
+ ctxt->memop.bytes = 2;
+ goto mem_common;
+ case OpMem32:
+ ctxt->memop.bytes = 4;
+ goto mem_common;
+ case OpImmU16:
+ rc = decode_imm(ctxt, op, 2, false);
+ break;
+ case OpImmU:
+ rc = decode_imm(ctxt, op, imm_size(ctxt), false);
+ break;
+ case OpSI:
+ op->type = OP_MEM;
+ op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ op->addr.mem.ea =
+ register_address(ctxt, VCPU_REGS_RSI);
+ op->addr.mem.seg = ctxt->seg_override;
+ op->val = 0;
+ op->count = 1;
+ break;
+ case OpXLat:
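+ /* XLAT: the source byte is at seg:[rBX + AL]. */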
+ op->type = OP_MEM;
+ op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ op->addr.mem.ea =
+ address_mask(ctxt,
+ reg_read(ctxt, VCPU_REGS_RBX) +
+ (reg_read(ctxt, VCPU_REGS_RAX) & 0xff));
+ op->addr.mem.seg = ctxt->seg_override;
+ op->val = 0;
+ break;
+ case OpImmFAddr:
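+ /* Far pointer immediate: the offset plus a 2-byte segment selector. */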
+ op->type = OP_IMM;
+ op->addr.mem.ea = ctxt->_eip;
+ op->bytes = ctxt->op_bytes + 2;
+ insn_fetch_arr(op->valptr, op->bytes, ctxt);
+ break;
+ case OpMemFAddr:
+ ctxt->memop.bytes = ctxt->op_bytes + 2;
+ goto mem_common;
+ case OpES:
+ op->type = OP_IMM;
+ op->val = VCPU_SREG_ES;
+ break;
+ case OpCS:
+ op->type = OP_IMM;
+ op->val = VCPU_SREG_CS;
+ break;
+ case OpSS:
+ op->type = OP_IMM;
+ op->val = VCPU_SREG_SS;
+ break;
+ case OpDS:
+ op->type = OP_IMM;
+ op->val = VCPU_SREG_DS;
+ break;
+ case OpFS:
+ op->type = OP_IMM;
+ op->val = VCPU_SREG_FS;
+ break;
+ case OpGS:
+ op->type = OP_IMM;
+ op->val = VCPU_SREG_GS;
+ break;
+ case OpImplicit:
+ /* Special instructions do their own operand decoding. */
+ default:
+ op->type = OP_NONE; /* Disable writeback. */
+ break;
+ }
+
+done:
+ return rc;
+}
+
+static int x86_decode_avx(struct x86_emulate_ctxt *ctxt,
+ u8 vex_1st, u8 vex_2nd, struct opcode *opcode)
+{
+ u8 vex_3rd, map, pp, l, v;
+ int rc = X86EMUL_CONTINUE;
+
+ if (ctxt->rep_prefix || ctxt->op_prefix || ctxt->rex_prefix)
+ goto ud;
+
+ if (vex_1st == 0xc5) {
+ /* Expand RVVVVlpp to VEX3 format */
+ vex_3rd = vex_2nd & ~0x80; /* VVVVlpp from VEX2, w=0 */
+ vex_2nd = (vex_2nd & 0x80) | 0x61; /* R from VEX2, X=1 B=1 mmmmm=00001 */
+ } else {
+ vex_3rd = insn_fetch(u8, ctxt);
+ }
+
+ /* vex_2nd = RXBmmmmm, vex_3rd = wVVVVlpp. Fix polarity */
+ vex_2nd ^= 0xE0; /* binary 11100000 */
+ vex_3rd ^= 0x78; /* binary 01111000 */
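+ /*
+ * Illustrative example (values chosen here, not taken from a guest): the
+ * two-byte prefix c5 f9 expands to vex_2nd = 0xe1, vex_3rd = 0x79; after
+ * the polarity fix vex_2nd = 0x01 (R=X=B=0, map 1, i.e. 0f) and
+ * vex_3rd = 0x01 (w=0, VVVV unused, L=0, pp=1, i.e. 66).
+ */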
+
+ ctxt->rex_prefix = REX_PREFIX;
+ ctxt->rex_bits = (vex_2nd & 0xE0) >> 5; /* RXB */
+ ctxt->rex_bits |= (vex_3rd & 0x80) >> 4; /* w */
+ if (ctxt->rex_bits && ctxt->mode != X86EMUL_MODE_PROT64)
+ goto ud;
+
+ map = vex_2nd & 0x1f;
+ v = (vex_3rd >> 3) & 0xf;
+ l = vex_3rd & 0x4;
+ pp = vex_3rd & 0x3;
+
+ ctxt->b = insn_fetch(u8, ctxt);
+ switch (map) {
+ case 1:
+ ctxt->opcode_len = 2;
+ *opcode = twobyte_table[ctxt->b];
+ break;
+ case 2:
+ ctxt->opcode_len = 3;
+ *opcode = opcode_map_0f_38[ctxt->b];
+ break;
+ case 3:
+ /* no 0f 3a instructions are supported yet */
+ return X86EMUL_UNHANDLEABLE;
+ default:
+ goto ud;
+ }
+
+ /*
+ * No three-operand instructions are supported yet; those that
+ * *are* marked with the Avx flag do not use the VVVV field, which
+ * therefore must not encode a register.
+ */
+ if (v)
+ goto ud;
+
+ if (l)
+ ctxt->op_bytes = 32;
+ else
+ ctxt->op_bytes = 16;
+
+ switch (pp) {
+ case 0: break;
+ case 1: ctxt->op_prefix = true; break;
+ case 2: ctxt->rep_prefix = 0xf3; break;
+ case 3: ctxt->rep_prefix = 0xf2; break;
+ }
+
+done:
+ return rc;
+ud:
+ *opcode = ud;
+ return rc;
+}
+
+int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int emulation_type)
+{
+ int rc = X86EMUL_CONTINUE;
+ int mode = ctxt->mode;
+ int def_op_bytes, def_ad_bytes, goffset, simd_prefix;
+ bool vex_prefix = false;
+ bool has_seg_override = false;
+ struct opcode opcode;
+ u16 dummy;
+ struct desc_struct desc;
+
+ ctxt->memop.type = OP_NONE;
+ ctxt->memopp = NULL;
+ ctxt->_eip = ctxt->eip;
+ ctxt->fetch.ptr = ctxt->fetch.data;
+ ctxt->fetch.end = ctxt->fetch.data + insn_len;
+ ctxt->opcode_len = 1;
+ ctxt->intercept = x86_intercept_none;
+ if (insn_len > 0)
+ memcpy(ctxt->fetch.data, insn, insn_len);
+ else {
+ rc = __do_insn_fetch_bytes(ctxt, 1);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ switch (mode) {
+ case X86EMUL_MODE_REAL:
+ case X86EMUL_MODE_VM86:
+ def_op_bytes = def_ad_bytes = 2;
+ ctxt->ops->get_segment(ctxt, &dummy, &desc, NULL, VCPU_SREG_CS);
+ if (desc.d)
+ def_op_bytes = def_ad_bytes = 4;
+ break;
+ case X86EMUL_MODE_PROT16:
+ def_op_bytes = def_ad_bytes = 2;
+ break;
+ case X86EMUL_MODE_PROT32:
+ def_op_bytes = def_ad_bytes = 4;
+ break;
+#ifdef CONFIG_X86_64
+ case X86EMUL_MODE_PROT64:
+ def_op_bytes = 4;
+ def_ad_bytes = 8;
+ break;
+#endif
+ default:
+ return EMULATION_FAILED;
+ }
+
+ ctxt->op_bytes = def_op_bytes;
+ ctxt->ad_bytes = def_ad_bytes;
+
+ /* Legacy prefixes. */
+ for (;;) {
+ switch (ctxt->b = insn_fetch(u8, ctxt)) {
+ case 0x66: /* operand-size override */
+ ctxt->op_prefix = true;
+ /* switch between 2/4 bytes */
+ ctxt->op_bytes = def_op_bytes ^ 6;
+ break;
+ case 0x67: /* address-size override */
+ if (mode == X86EMUL_MODE_PROT64)
+ /* switch between 4/8 bytes */
+ ctxt->ad_bytes = def_ad_bytes ^ 12;
+ else
+ /* switch between 2/4 bytes */
+ ctxt->ad_bytes = def_ad_bytes ^ 6;
+ break;
+ case 0x26: /* ES override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_ES;
+ break;
+ case 0x2e: /* CS override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_CS;
+ break;
+ case 0x36: /* SS override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_SS;
+ break;
+ case 0x3e: /* DS override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_DS;
+ break;
+ case 0x64: /* FS override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_FS;
+ break;
+ case 0x65: /* GS override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_GS;
+ break;
+ case 0x40 ... 0x4f: /* REX */
+ if (mode != X86EMUL_MODE_PROT64)
+ goto done_prefixes;
+ ctxt->rex_prefix = REX_PREFIX;
+ ctxt->rex_bits = ctxt->b & 0xf;
+ continue;
+ case 0xf0: /* LOCK */
+ ctxt->lock_prefix = 1;
+ break;
+ case 0xf2: /* REPNE/REPNZ */
+ case 0xf3: /* REP/REPE/REPZ */
+ ctxt->rep_prefix = ctxt->b;
+ break;
+ default:
+ goto done_prefixes;
+ }
+
+ /* Any legacy prefix after a REX prefix nullifies its effect. */
+ ctxt->rex_prefix = REX_NONE;
+ ctxt->rex_bits = 0;
+ }
+
+done_prefixes:
+
+ /* REX prefix. */
+ if (ctxt->rex_bits & REX_W)
+ ctxt->op_bytes = 8;
+
+ /* Opcode byte(s). */
+ if (ctxt->b == 0xc4 || ctxt->b == 0xc5) {
+ /* VEX or LDS/LES */
+ u8 vex_2nd = insn_fetch(u8, ctxt);
+ if (mode != X86EMUL_MODE_PROT64 && (vex_2nd & 0xc0) != 0xc0) {
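+ /*
+ * Outside 64-bit mode, c4/c5 with ModRM.mod != 3 is LDS/LES
+ * rather than a VEX prefix.
+ */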
+ opcode = opcode_table[ctxt->b];
+ ctxt->modrm = vex_2nd;
+ /* the Mod/RM byte has been fetched already! */
+ goto done_modrm;
+ }
+
+ vex_prefix = true;
+ rc = x86_decode_avx(ctxt, ctxt->b, vex_2nd, &opcode);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ } else if (ctxt->b == 0x0f) {
+ /* Two- or three-byte opcode */
+ ctxt->opcode_len = 2;
+ ctxt->b = insn_fetch(u8, ctxt);
+ opcode = twobyte_table[ctxt->b];
+
+ /* 0F_38 opcode map */
+ if (ctxt->b == 0x38) {
+ ctxt->opcode_len = 3;
+ ctxt->b = insn_fetch(u8, ctxt);
+ opcode = opcode_map_0f_38[ctxt->b];
+ }
+ } else {
+ /* Opcode byte(s). */
+ opcode = opcode_table[ctxt->b];
+ }
+
+ if (opcode.flags & ModRM)
+ ctxt->modrm = insn_fetch(u8, ctxt);
+
+done_modrm:
+ ctxt->d = opcode.flags;
+ while (ctxt->d & GroupMask) {
+ switch (ctxt->d & GroupMask) {
+ case Group:
+ goffset = (ctxt->modrm >> 3) & 7;
+ opcode = opcode.u.group[goffset];
+ break;
+ case GroupDual:
+ goffset = (ctxt->modrm >> 3) & 7;
+ if ((ctxt->modrm >> 6) == 3)
+ opcode = opcode.u.gdual->mod3[goffset];
+ else
+ opcode = opcode.u.gdual->mod012[goffset];
+ break;
+ case RMExt:
+ goffset = ctxt->modrm & 7;
+ opcode = opcode.u.group[goffset];
+ break;
+ case Prefix:
+ if (ctxt->rep_prefix && ctxt->op_prefix)
+ return EMULATION_FAILED;
+ simd_prefix = ctxt->op_prefix ? 0x66 : ctxt->rep_prefix;
+ switch (simd_prefix) {
+ case 0x00: opcode = opcode.u.gprefix->pfx_no; break;
+ case 0x66: opcode = opcode.u.gprefix->pfx_66; break;
+ case 0xf2: opcode = opcode.u.gprefix->pfx_f2; break;
+ case 0xf3: opcode = opcode.u.gprefix->pfx_f3; break;
+ }
+ break;
+ case Escape:
+ if (ctxt->modrm > 0xbf) {
+ size_t size = ARRAY_SIZE(opcode.u.esc->high);
+ u32 index = array_index_nospec(
+ ctxt->modrm - 0xc0, size);
+
+ opcode = opcode.u.esc->high[index];
+ } else {
+ opcode = opcode.u.esc->op[(ctxt->modrm >> 3) & 7];
+ }
+ break;
+ case InstrDual:
+ if ((ctxt->modrm >> 6) == 3)
+ opcode = opcode.u.idual->mod3;
+ else
+ opcode = opcode.u.idual->mod012;
+ break;
+ case ModeDual:
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ opcode = opcode.u.mdual->mode64;
+ else
+ opcode = opcode.u.mdual->mode32;
+ break;
+ default:
+ return EMULATION_FAILED;
+ }
+
+ ctxt->d &= ~(u64)GroupMask;
+ ctxt->d |= opcode.flags;
+ }
+
+ ctxt->is_branch = opcode.flags & IsBranch;
+
+ /* Unrecognised? */
+ if (ctxt->d == 0)
+ return EMULATION_FAILED;
+
+ if (unlikely(vex_prefix)) {
+ /*
+ * Only specifically marked instructions support VEX. Since many
+ * instructions support it but are not annotated, return not implemented
+ * rather than #UD.
+ */
+ if (!(ctxt->d & Avx))
+ return EMULATION_FAILED;
+
+ if (!(ctxt->d & AlignMask))
+ ctxt->d |= Unaligned;
+ }
+
+ ctxt->execute = opcode.u.execute;
+
+ /*
+ * Reject emulation if KVM might need to emulate shadow stack updates
+ * and/or indirect branch tracking enforcement, which the emulator
+ * doesn't support.
+ */
+ if ((is_ibt_instruction(ctxt) || is_shstk_instruction(ctxt)) &&
+ ctxt->ops->get_cr(ctxt, 4) & X86_CR4_CET) {
+ u64 u_cet = 0, s_cet = 0;
+
+ /*
+ * Check both User and Supervisor on far transfers as inter-
+ * privilege level transfers are impacted by CET at the target
+ * privilege level, and that is not known at this time. The
+ * expectation is that the guest will not require emulation of
+ * any CET-affected instructions at any privilege level.
+ */
+ if (!(ctxt->d & NearBranch))
+ u_cet = s_cet = CET_SHSTK_EN | CET_ENDBR_EN;
+ else if (ctxt->ops->cpl(ctxt) == 3)
+ u_cet = CET_SHSTK_EN | CET_ENDBR_EN;
+ else
+ s_cet = CET_SHSTK_EN | CET_ENDBR_EN;
+
+ if ((u_cet && ctxt->ops->get_msr(ctxt, MSR_IA32_U_CET, &u_cet)) ||
+ (s_cet && ctxt->ops->get_msr(ctxt, MSR_IA32_S_CET, &s_cet)))
+ return EMULATION_FAILED;
+
+ if ((u_cet | s_cet) & CET_SHSTK_EN && is_shstk_instruction(ctxt))
+ return EMULATION_FAILED;
+
+ if ((u_cet | s_cet) & CET_ENDBR_EN && is_ibt_instruction(ctxt))
+ return EMULATION_FAILED;
+ }
+
+ if (unlikely(emulation_type & EMULTYPE_TRAP_UD) &&
+ likely(!(ctxt->d & EmulateOnUD)))
+ return EMULATION_FAILED;
+
+ if (unlikely(ctxt->d &
+ (NotImpl|Stack|Op3264|Sse|Mmx|Intercept|CheckPerm|NearBranch|
+ No16))) {
+ /*
+ * These are copied unconditionally here, and checked unconditionally
+ * in x86_emulate_insn.
+ */
+ ctxt->check_perm = opcode.check_perm;
+ ctxt->intercept = opcode.intercept;
+
+ if (ctxt->d & NotImpl)
+ return EMULATION_FAILED;
+
+ if (mode == X86EMUL_MODE_PROT64) {
+ if (ctxt->op_bytes == 4 && (ctxt->d & Stack))
+ ctxt->op_bytes = 8;
+ else if (ctxt->d & NearBranch)
+ ctxt->op_bytes = 8;
+ }
+
+ if (ctxt->d & Op3264) {
+ if (mode == X86EMUL_MODE_PROT64)
+ ctxt->op_bytes = 8;
+ else
+ ctxt->op_bytes = 4;
+ }
+
+ if ((ctxt->d & No16) && ctxt->op_bytes == 2)
+ ctxt->op_bytes = 4;
+
+ if (vex_prefix)
+ ;
+ else if (ctxt->d & Sse)
+ ctxt->op_bytes = 16, ctxt->d &= ~Avx;
+ else if (ctxt->d & Mmx)
+ ctxt->op_bytes = 8;
+ }
+
+ /* ModRM and SIB bytes. */
+ if (ctxt->d & ModRM) {
+ rc = decode_modrm(ctxt, &ctxt->memop);
+ if (!has_seg_override) {
+ has_seg_override = true;
+ ctxt->seg_override = ctxt->modrm_seg;
+ }
+ } else if (ctxt->d & MemAbs)
+ rc = decode_abs(ctxt, &ctxt->memop);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+
+ if (!has_seg_override)
+ ctxt->seg_override = VCPU_SREG_DS;
+
+ ctxt->memop.addr.mem.seg = ctxt->seg_override;
+
+ /*
+ * Decode and fetch the source operand: register, memory
+ * or immediate.
+ */
+ rc = decode_operand(ctxt, &ctxt->src, (ctxt->d >> SrcShift) & OpMask);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+
+ /*
+ * Decode and fetch the second source operand: register, memory
+ * or immediate.
+ */
+ rc = decode_operand(ctxt, &ctxt->src2, (ctxt->d >> Src2Shift) & OpMask);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+
+ /* Decode and fetch the destination operand: register or memory. */
+ rc = decode_operand(ctxt, &ctxt->dst, (ctxt->d >> DstShift) & OpMask);
+
+ if (ctxt->rip_relative && likely(ctxt->memopp))
+ ctxt->memopp->addr.mem.ea = address_mask(ctxt,
+ ctxt->memopp->addr.mem.ea + ctxt->_eip);
+
+done:
+ if (rc == X86EMUL_PROPAGATE_FAULT)
+ ctxt->have_exception = true;
+ return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK;
+}
+
+bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt)
+{
+ return ctxt->d & PageTable;
+}
+
+static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
+{
+ /* The second termination condition applies only to REPE and REPNE.
+ * If the repeat string operation prefix is REPE/REPZ or REPNE/REPNZ,
+ * check the corresponding termination condition:
+ * - if REPE/REPZ and ZF = 0 then done
+ * - if REPNE/REPNZ and ZF = 1 then done
+ */
+ if (((ctxt->b == 0xa6) || (ctxt->b == 0xa7) ||
+ (ctxt->b == 0xae) || (ctxt->b == 0xaf))
+ && (((ctxt->rep_prefix == REPE_PREFIX) &&
+ ((ctxt->eflags & X86_EFLAGS_ZF) == 0))
+ || ((ctxt->rep_prefix == REPNE_PREFIX) &&
+ ((ctxt->eflags & X86_EFLAGS_ZF) == X86_EFLAGS_ZF))))
+ return true;
+
+ return false;
+}
+
+static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+
+ kvm_fpu_get();
+ rc = asm_safe("fwait");
+ kvm_fpu_put();
+
+ if (unlikely(rc != X86EMUL_CONTINUE))
+ return emulate_exception(ctxt, MF_VECTOR, 0, false);
+
+ return X86EMUL_CONTINUE;
+}
+
+static void fetch_possible_mmx_operand(struct operand *op)
+{
+ if (op->type == OP_MM)
+ kvm_read_mmx_reg(op->addr.mm, &op->mm_val);
+}
+
+void init_decode_cache(struct x86_emulate_ctxt *ctxt)
+{
+ /* Clear fields that are set conditionally but read without a guard. */
+ ctxt->rip_relative = false;
+ ctxt->rex_prefix = REX_NONE;
+ ctxt->rex_bits = 0;
+ ctxt->lock_prefix = 0;
+ ctxt->op_prefix = false;
+ ctxt->rep_prefix = 0;
+ ctxt->regs_valid = 0;
+ ctxt->regs_dirty = 0;
+
+ ctxt->io_read.pos = 0;
+ ctxt->io_read.end = 0;
+ ctxt->mem_read.end = 0;
+}
+
+int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, bool check_intercepts)
+{
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ int rc = X86EMUL_CONTINUE;
+ int saved_dst_type = ctxt->dst.type;
+
+ ctxt->mem_read.pos = 0;
+
+ /* LOCK prefix is allowed only with some instructions */
+ if (ctxt->lock_prefix && (!(ctxt->d & Lock) || ctxt->dst.type != OP_MEM)) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+
+ if ((ctxt->d & SrcMask) == SrcMemFAddr && ctxt->src.type != OP_MEM) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+
+ if (unlikely(ctxt->d &
+ (No64|Undefined|Avx|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) {
+ if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) ||
+ (ctxt->d & Undefined)) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+
+ if ((ctxt->d & (Avx|Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM))) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+
+ if (ctxt->d & Avx) {
+ u64 xcr = 0;
+ if (!(ops->get_cr(ctxt, 4) & X86_CR4_OSXSAVE)
+ || ops->get_xcr(ctxt, 0, &xcr)
+ || !(xcr & XFEATURE_MASK_YMM)) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+ } else if (ctxt->d & Sse) {
+ if (!(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR)) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+ }
+
+ if ((ctxt->d & (Avx|Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
+ rc = emulate_nm(ctxt);
+ goto done;
+ }
+
+ if (ctxt->d & Mmx) {
+ rc = flush_pending_x87_faults(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ /*
+ * Now that we know the fpu is exception safe, we can fetch
+ * operands from it.
+ */
+ fetch_possible_mmx_operand(&ctxt->src);
+ fetch_possible_mmx_operand(&ctxt->src2);
+ if (!(ctxt->d & Mov))
+ fetch_possible_mmx_operand(&ctxt->dst);
+ }
+
+ if (unlikely(check_intercepts) && ctxt->intercept) {
+ rc = emulator_check_intercept(ctxt, ctxt->intercept,
+ X86_ICPT_PRE_EXCEPT);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ /* Instruction can only be executed in protected mode */
+ if ((ctxt->d & Prot) && ctxt->mode < X86EMUL_MODE_PROT16) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+
+ /* Privileged instruction can be executed only in CPL=0 */
+ if ((ctxt->d & Priv) && ops->cpl(ctxt)) {
+ if (ctxt->d & PrivUD)
+ rc = emulate_ud(ctxt);
+ else
+ rc = emulate_gp(ctxt, 0);
+ goto done;
+ }
+
+ /* Do instruction specific permission checks */
+ if (ctxt->d & CheckPerm) {
+ rc = ctxt->check_perm(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ if (unlikely(check_intercepts) && (ctxt->d & Intercept)) {
+ rc = emulator_check_intercept(ctxt, ctxt->intercept,
+ X86_ICPT_POST_EXCEPT);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ if (ctxt->rep_prefix && (ctxt->d & String)) {
+ /* All REP prefixes have the same first termination condition */
+ if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) {
+ string_registers_quirk(ctxt);
+ ctxt->eip = ctxt->_eip;
+ ctxt->eflags &= ~X86_EFLAGS_RF;
+ goto done;
+ }
+ }
+ }
+
+ if ((ctxt->src.type == OP_MEM) && !(ctxt->d & NoAccess)) {
+ rc = segmented_read(ctxt, ctxt->src.addr.mem,
+ ctxt->src.valptr, ctxt->src.bytes);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ ctxt->src.orig_val64 = ctxt->src.val64;
+ }
+
+ if (ctxt->src2.type == OP_MEM) {
+ rc = segmented_read(ctxt, ctxt->src2.addr.mem,
+ &ctxt->src2.val, ctxt->src2.bytes);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ if ((ctxt->d & DstMask) == ImplicitOps)
+ goto special_insn;
+
+
+ if ((ctxt->dst.type == OP_MEM) && !(ctxt->d & Mov)) {
+ /* optimisation - avoid slow emulated read if Mov */
+ rc = segmented_read(ctxt, ctxt->dst.addr.mem,
+ &ctxt->dst.val, ctxt->dst.bytes);
+ if (rc != X86EMUL_CONTINUE) {
+ if (!(ctxt->d & NoWrite) &&
+ rc == X86EMUL_PROPAGATE_FAULT &&
+ ctxt->exception.vector == PF_VECTOR)
+ ctxt->exception.error_code |= PFERR_WRITE_MASK;
+ goto done;
+ }
+ }
+ /* Copy full 64-bit value for CMPXCHG8B. */
+ ctxt->dst.orig_val64 = ctxt->dst.val64;
+
+special_insn:
+
+ if (unlikely(check_intercepts) && (ctxt->d & Intercept)) {
+ rc = emulator_check_intercept(ctxt, ctxt->intercept,
+ X86_ICPT_POST_MEMACCESS);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ if (ctxt->rep_prefix && (ctxt->d & String))
+ ctxt->eflags |= X86_EFLAGS_RF;
+ else
+ ctxt->eflags &= ~X86_EFLAGS_RF;
+
+ if (ctxt->execute) {
+ rc = ctxt->execute(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ goto writeback;
+ }
+
+ if (ctxt->opcode_len == 2)
+ goto twobyte_insn;
+ else if (ctxt->opcode_len == 3)
+ goto threebyte_insn;
+
+ switch (ctxt->b) {
+ case 0x70 ... 0x7f: /* jcc (short) */
+ if (test_cc(ctxt->b, ctxt->eflags))
+ rc = jmp_rel(ctxt, ctxt->src.val);
+ break;
+ case 0x8d: /* lea r16/r32, m */
+ ctxt->dst.val = ctxt->src.addr.mem.ea;
+ break;
+ case 0x90 ... 0x97: /* nop / xchg reg, rax */
+ if (ctxt->dst.addr.reg == reg_rmw(ctxt, VCPU_REGS_RAX))
+ ctxt->dst.type = OP_NONE;
+ else
+ rc = em_xchg(ctxt);
+ break;
+ case 0x98: /* cbw/cwde/cdqe */
+ switch (ctxt->op_bytes) {
+ case 2: ctxt->dst.val = (s8)ctxt->dst.val; break;
+ case 4: ctxt->dst.val = (s16)ctxt->dst.val; break;
+ case 8: ctxt->dst.val = (s32)ctxt->dst.val; break;
+ }
+ break;
+ case 0xcc: /* int3 */
+ rc = emulate_int(ctxt, 3);
+ break;
+ case 0xcd: /* int n */
+ rc = emulate_int(ctxt, ctxt->src.val);
+ break;
+ case 0xce: /* into */
+ if (ctxt->eflags & X86_EFLAGS_OF)
+ rc = emulate_int(ctxt, 4);
+ break;
+ case 0xe9: /* jmp rel */
+ case 0xeb: /* jmp rel short */
+ rc = jmp_rel(ctxt, ctxt->src.val);
+ ctxt->dst.type = OP_NONE; /* Disable writeback. */
+ break;
+ case 0xf4: /* hlt */
+ ctxt->ops->halt(ctxt);
+ break;
+ case 0xf5: /* cmc */
+ /* complement carry flag from eflags reg */
+ ctxt->eflags ^= X86_EFLAGS_CF;
+ break;
+ case 0xf8: /* clc */
+ ctxt->eflags &= ~X86_EFLAGS_CF;
+ break;
+ case 0xf9: /* stc */
+ ctxt->eflags |= X86_EFLAGS_CF;
+ break;
+ case 0xfc: /* cld */
+ ctxt->eflags &= ~X86_EFLAGS_DF;
+ break;
+ case 0xfd: /* std */
+ ctxt->eflags |= X86_EFLAGS_DF;
+ break;
+ default:
+ goto cannot_emulate;
+ }
+
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+
+writeback:
+ if (ctxt->d & SrcWrite) {
+ BUG_ON(ctxt->src.type == OP_MEM || ctxt->src.type == OP_MEM_STR);
+ rc = writeback(ctxt, &ctxt->src);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+ if (!(ctxt->d & NoWrite)) {
+ rc = writeback(ctxt, &ctxt->dst);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ /*
+ * Restore dst type in case the decoding is reused
+ * (happens for string instructions).
+ */
+ ctxt->dst.type = saved_dst_type;
+
+ if ((ctxt->d & SrcMask) == SrcSI)
+ string_addr_inc(ctxt, VCPU_REGS_RSI, &ctxt->src);
+
+ if ((ctxt->d & DstMask) == DstDI)
+ string_addr_inc(ctxt, VCPU_REGS_RDI, &ctxt->dst);
+
+ if (ctxt->rep_prefix && (ctxt->d & String)) {
+ unsigned int count;
+ struct read_cache *r = &ctxt->io_read;
+ if ((ctxt->d & SrcMask) == SrcSI)
+ count = ctxt->src.count;
+ else
+ count = ctxt->dst.count;
+ register_address_increment(ctxt, VCPU_REGS_RCX, -count);
+
+ if (!string_insn_completed(ctxt)) {
+ /*
+ * Re-enter the guest when the pio read-ahead buffer is empty
+ * or, if it is not used, after every 1024 iterations.
+ */
+ if ((r->end != 0 || reg_read(ctxt, VCPU_REGS_RCX) & 0x3ff) &&
+ (r->end == 0 || r->end != r->pos)) {
+ /*
+ * Reset the read cache. Usually this happens before
+ * decode, but since the instruction is restarted
+ * we have to do it here.
+ */
+ ctxt->mem_read.end = 0;
+ writeback_registers(ctxt);
+ return EMULATION_RESTART;
+ }
+ goto done; /* skip rip writeback */
+ }
+ ctxt->eflags &= ~X86_EFLAGS_RF;
+ }
+
+ ctxt->eip = ctxt->_eip;
+ if (ctxt->mode != X86EMUL_MODE_PROT64)
+ ctxt->eip = (u32)ctxt->_eip;
+
+done:
+ if (rc == X86EMUL_PROPAGATE_FAULT) {
+ if (KVM_EMULATOR_BUG_ON(ctxt->exception.vector > 0x1f, ctxt))
+ return EMULATION_FAILED;
+ ctxt->have_exception = true;
+ }
+ if (rc == X86EMUL_INTERCEPTED)
+ return EMULATION_INTERCEPTED;
+
+ if (rc == X86EMUL_CONTINUE)
+ writeback_registers(ctxt);
+
+ return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
+
+twobyte_insn:
+ switch (ctxt->b) {
+ case 0x09: /* wbinvd */
+ (ctxt->ops->wbinvd)(ctxt);
+ break;
+ case 0x08: /* invd */
+ case 0x0d: /* GrpP (prefetch) */
+ case 0x18: /* Grp16 (prefetch/nop) */
+ case 0x1f: /* nop */
+ break;
+ case 0x20: /* mov cr, reg */
+ ctxt->dst.val = ops->get_cr(ctxt, ctxt->modrm_reg);
+ break;
+ case 0x21: /* mov from dr to reg */
+ ctxt->dst.val = ops->get_dr(ctxt, ctxt->modrm_reg);
+ break;
+ case 0x40 ... 0x4f: /* cmov */
+ if (test_cc(ctxt->b, ctxt->eflags))
+ ctxt->dst.val = ctxt->src.val;
+ else if (ctxt->op_bytes != 4)
+ ctxt->dst.type = OP_NONE; /* no writeback */
+ break;
+ case 0x80 ... 0x8f: /* jnz rel, etc*/
+ if (test_cc(ctxt->b, ctxt->eflags))
+ rc = jmp_rel(ctxt, ctxt->src.val);
+ break;
+ case 0x90 ... 0x9f: /* setcc r/m8 */
+ ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags);
+ break;
+ case 0xb6 ... 0xb7: /* movzx */
+ ctxt->dst.bytes = ctxt->op_bytes;
+ ctxt->dst.val = (ctxt->src.bytes == 1) ? (u8) ctxt->src.val
+ : (u16) ctxt->src.val;
+ break;
+ case 0xbe ... 0xbf: /* movsx */
+ ctxt->dst.bytes = ctxt->op_bytes;
+ ctxt->dst.val = (ctxt->src.bytes == 1) ? (s8) ctxt->src.val :
+ (s16) ctxt->src.val;
+ break;
+ default:
+ goto cannot_emulate;
+ }
+
+threebyte_insn:
+
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+
+ goto writeback;
+
+cannot_emulate:
+ return EMULATION_FAILED;
+}
+
+void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt)
+{
+ invalidate_registers(ctxt);
+}
+
+void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt)
+{
+ writeback_registers(ctxt);
+}
+
+bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt)
+{
+ if (ctxt->rep_prefix && (ctxt->d & String))
+ return false;
+
+ if (ctxt->d & TwoMemOp)
+ return false;
+
+ return true;
+}
diff --git a/arch/x86/kvm/fpu.h b/arch/x86/kvm/fpu.h
new file mode 100644
index 000000000000..f898781b6a06
--- /dev/null
+++ b/arch/x86/kvm/fpu.h
@@ -0,0 +1,206 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __KVM_FPU_H_
+#define __KVM_FPU_H_
+
+#include <asm/fpu/api.h>
+
+typedef u32 __attribute__((vector_size(16))) sse128_t;
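+/* The helpers below view an sse128_t as two u64 or four u32 lanes through a union. */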
+#define __sse128_u union { sse128_t vec; u64 as_u64[2]; u32 as_u32[4]; }
+#define sse128_lo(x) ({ __sse128_u t; t.vec = x; t.as_u64[0]; })
+#define sse128_hi(x) ({ __sse128_u t; t.vec = x; t.as_u64[1]; })
+#define sse128_l0(x) ({ __sse128_u t; t.vec = x; t.as_u32[0]; })
+#define sse128_l1(x) ({ __sse128_u t; t.vec = x; t.as_u32[1]; })
+#define sse128_l2(x) ({ __sse128_u t; t.vec = x; t.as_u32[2]; })
+#define sse128_l3(x) ({ __sse128_u t; t.vec = x; t.as_u32[3]; })
+#define sse128(lo, hi) ({ __sse128_u t; t.as_u64[0] = lo; t.as_u64[1] = hi; t.vec; })
+
+typedef u32 __attribute__((vector_size(32))) avx256_t;
+
+static inline void _kvm_read_avx_reg(int reg, avx256_t *data)
+{
+ switch (reg) {
+ case 0: asm("vmovdqa %%ymm0, %0" : "=m"(*data)); break;
+ case 1: asm("vmovdqa %%ymm1, %0" : "=m"(*data)); break;
+ case 2: asm("vmovdqa %%ymm2, %0" : "=m"(*data)); break;
+ case 3: asm("vmovdqa %%ymm3, %0" : "=m"(*data)); break;
+ case 4: asm("vmovdqa %%ymm4, %0" : "=m"(*data)); break;
+ case 5: asm("vmovdqa %%ymm5, %0" : "=m"(*data)); break;
+ case 6: asm("vmovdqa %%ymm6, %0" : "=m"(*data)); break;
+ case 7: asm("vmovdqa %%ymm7, %0" : "=m"(*data)); break;
+#ifdef CONFIG_X86_64
+ case 8: asm("vmovdqa %%ymm8, %0" : "=m"(*data)); break;
+ case 9: asm("vmovdqa %%ymm9, %0" : "=m"(*data)); break;
+ case 10: asm("vmovdqa %%ymm10, %0" : "=m"(*data)); break;
+ case 11: asm("vmovdqa %%ymm11, %0" : "=m"(*data)); break;
+ case 12: asm("vmovdqa %%ymm12, %0" : "=m"(*data)); break;
+ case 13: asm("vmovdqa %%ymm13, %0" : "=m"(*data)); break;
+ case 14: asm("vmovdqa %%ymm14, %0" : "=m"(*data)); break;
+ case 15: asm("vmovdqa %%ymm15, %0" : "=m"(*data)); break;
+#endif
+ default: BUG();
+ }
+}
+
+static inline void _kvm_write_avx_reg(int reg, const avx256_t *data)
+{
+ switch (reg) {
+ case 0: asm("vmovdqa %0, %%ymm0" : : "m"(*data)); break;
+ case 1: asm("vmovdqa %0, %%ymm1" : : "m"(*data)); break;
+ case 2: asm("vmovdqa %0, %%ymm2" : : "m"(*data)); break;
+ case 3: asm("vmovdqa %0, %%ymm3" : : "m"(*data)); break;
+ case 4: asm("vmovdqa %0, %%ymm4" : : "m"(*data)); break;
+ case 5: asm("vmovdqa %0, %%ymm5" : : "m"(*data)); break;
+ case 6: asm("vmovdqa %0, %%ymm6" : : "m"(*data)); break;
+ case 7: asm("vmovdqa %0, %%ymm7" : : "m"(*data)); break;
+#ifdef CONFIG_X86_64
+ case 8: asm("vmovdqa %0, %%ymm8" : : "m"(*data)); break;
+ case 9: asm("vmovdqa %0, %%ymm9" : : "m"(*data)); break;
+ case 10: asm("vmovdqa %0, %%ymm10" : : "m"(*data)); break;
+ case 11: asm("vmovdqa %0, %%ymm11" : : "m"(*data)); break;
+ case 12: asm("vmovdqa %0, %%ymm12" : : "m"(*data)); break;
+ case 13: asm("vmovdqa %0, %%ymm13" : : "m"(*data)); break;
+ case 14: asm("vmovdqa %0, %%ymm14" : : "m"(*data)); break;
+ case 15: asm("vmovdqa %0, %%ymm15" : : "m"(*data)); break;
+#endif
+ default: BUG();
+ }
+}
+
+static inline void _kvm_read_sse_reg(int reg, sse128_t *data)
+{
+ switch (reg) {
+ case 0: asm("movdqa %%xmm0, %0" : "=m"(*data)); break;
+ case 1: asm("movdqa %%xmm1, %0" : "=m"(*data)); break;
+ case 2: asm("movdqa %%xmm2, %0" : "=m"(*data)); break;
+ case 3: asm("movdqa %%xmm3, %0" : "=m"(*data)); break;
+ case 4: asm("movdqa %%xmm4, %0" : "=m"(*data)); break;
+ case 5: asm("movdqa %%xmm5, %0" : "=m"(*data)); break;
+ case 6: asm("movdqa %%xmm6, %0" : "=m"(*data)); break;
+ case 7: asm("movdqa %%xmm7, %0" : "=m"(*data)); break;
+#ifdef CONFIG_X86_64
+ case 8: asm("movdqa %%xmm8, %0" : "=m"(*data)); break;
+ case 9: asm("movdqa %%xmm9, %0" : "=m"(*data)); break;
+ case 10: asm("movdqa %%xmm10, %0" : "=m"(*data)); break;
+ case 11: asm("movdqa %%xmm11, %0" : "=m"(*data)); break;
+ case 12: asm("movdqa %%xmm12, %0" : "=m"(*data)); break;
+ case 13: asm("movdqa %%xmm13, %0" : "=m"(*data)); break;
+ case 14: asm("movdqa %%xmm14, %0" : "=m"(*data)); break;
+ case 15: asm("movdqa %%xmm15, %0" : "=m"(*data)); break;
+#endif
+ default: BUG();
+ }
+}
+
+static inline void _kvm_write_sse_reg(int reg, const sse128_t *data)
+{
+ switch (reg) {
+ case 0: asm("movdqa %0, %%xmm0" : : "m"(*data)); break;
+ case 1: asm("movdqa %0, %%xmm1" : : "m"(*data)); break;
+ case 2: asm("movdqa %0, %%xmm2" : : "m"(*data)); break;
+ case 3: asm("movdqa %0, %%xmm3" : : "m"(*data)); break;
+ case 4: asm("movdqa %0, %%xmm4" : : "m"(*data)); break;
+ case 5: asm("movdqa %0, %%xmm5" : : "m"(*data)); break;
+ case 6: asm("movdqa %0, %%xmm6" : : "m"(*data)); break;
+ case 7: asm("movdqa %0, %%xmm7" : : "m"(*data)); break;
+#ifdef CONFIG_X86_64
+ case 8: asm("movdqa %0, %%xmm8" : : "m"(*data)); break;
+ case 9: asm("movdqa %0, %%xmm9" : : "m"(*data)); break;
+ case 10: asm("movdqa %0, %%xmm10" : : "m"(*data)); break;
+ case 11: asm("movdqa %0, %%xmm11" : : "m"(*data)); break;
+ case 12: asm("movdqa %0, %%xmm12" : : "m"(*data)); break;
+ case 13: asm("movdqa %0, %%xmm13" : : "m"(*data)); break;
+ case 14: asm("movdqa %0, %%xmm14" : : "m"(*data)); break;
+ case 15: asm("movdqa %0, %%xmm15" : : "m"(*data)); break;
+#endif
+ default: BUG();
+ }
+}
+
+static inline void _kvm_read_mmx_reg(int reg, u64 *data)
+{
+ switch (reg) {
+ case 0: asm("movq %%mm0, %0" : "=m"(*data)); break;
+ case 1: asm("movq %%mm1, %0" : "=m"(*data)); break;
+ case 2: asm("movq %%mm2, %0" : "=m"(*data)); break;
+ case 3: asm("movq %%mm3, %0" : "=m"(*data)); break;
+ case 4: asm("movq %%mm4, %0" : "=m"(*data)); break;
+ case 5: asm("movq %%mm5, %0" : "=m"(*data)); break;
+ case 6: asm("movq %%mm6, %0" : "=m"(*data)); break;
+ case 7: asm("movq %%mm7, %0" : "=m"(*data)); break;
+ default: BUG();
+ }
+}
+
+static inline void _kvm_write_mmx_reg(int reg, const u64 *data)
+{
+ switch (reg) {
+ case 0: asm("movq %0, %%mm0" : : "m"(*data)); break;
+ case 1: asm("movq %0, %%mm1" : : "m"(*data)); break;
+ case 2: asm("movq %0, %%mm2" : : "m"(*data)); break;
+ case 3: asm("movq %0, %%mm3" : : "m"(*data)); break;
+ case 4: asm("movq %0, %%mm4" : : "m"(*data)); break;
+ case 5: asm("movq %0, %%mm5" : : "m"(*data)); break;
+ case 6: asm("movq %0, %%mm6" : : "m"(*data)); break;
+ case 7: asm("movq %0, %%mm7" : : "m"(*data)); break;
+ default: BUG();
+ }
+}
+
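+/*
+ * Disable preemption and reload the current (guest) FPU state into the
+ * hardware registers if the kernel had deferred it, so the raw register
+ * accessors above touch the right state; pair every call with kvm_fpu_put().
+ */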
+static inline void kvm_fpu_get(void)
+{
+ fpregs_lock();
+
+ fpregs_assert_state_consistent();
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+ switch_fpu_return();
+}
+
+static inline void kvm_fpu_put(void)
+{
+ fpregs_unlock();
+}
+
+static inline void kvm_read_avx_reg(int reg, avx256_t *data)
+{
+ kvm_fpu_get();
+ _kvm_read_avx_reg(reg, data);
+ kvm_fpu_put();
+}
+
+static inline void kvm_write_avx_reg(int reg, const avx256_t *data)
+{
+ kvm_fpu_get();
+ _kvm_write_avx_reg(reg, data);
+ kvm_fpu_put();
+}
+
+static inline void kvm_read_sse_reg(int reg, sse128_t *data)
+{
+ kvm_fpu_get();
+ _kvm_read_sse_reg(reg, data);
+ kvm_fpu_put();
+}
+
+static inline void kvm_write_sse_reg(int reg, const sse128_t *data)
+{
+ kvm_fpu_get();
+ _kvm_write_sse_reg(reg, data);
+ kvm_fpu_put();
+}
+
+static inline void kvm_read_mmx_reg(int reg, u64 *data)
+{
+ kvm_fpu_get();
+ _kvm_read_mmx_reg(reg, data);
+ kvm_fpu_put();
+}
+
+static inline void kvm_write_mmx_reg(int reg, const u64 *data)
+{
+ kvm_fpu_get();
+ _kvm_write_mmx_reg(reg, data);
+ kvm_fpu_put();
+}
+
+#endif
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
new file mode 100644
index 000000000000..de92292eb1f5
--- /dev/null
+++ b/arch/x86/kvm/hyperv.c
@@ -0,0 +1,2925 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KVM Microsoft Hyper-V emulation
+ *
+ * derived from arch/x86/kvm/x86.c
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright (C) 2008 Qumranet, Inc.
+ * Copyright IBM Corporation, 2008
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ * Copyright (C) 2015 Andrey Smetanin <asmetanin@virtuozzo.com>
+ *
+ * Authors:
+ * Avi Kivity <avi@qumranet.com>
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Amit Shah <amit.shah@qumranet.com>
+ * Ben-Ami Yassour <benami@il.ibm.com>
+ * Andrey Smetanin <asmetanin@virtuozzo.com>
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include "x86.h"
+#include "lapic.h"
+#include "ioapic.h"
+#include "cpuid.h"
+#include "hyperv.h"
+#include "mmu.h"
+#include "xen.h"
+
+#include <linux/cpu.h>
+#include <linux/kvm_host.h>
+#include <linux/highmem.h>
+#include <linux/sched/cputime.h>
+#include <linux/spinlock.h>
+#include <linux/eventfd.h>
+
+#include <asm/apicdef.h>
+#include <asm/mshyperv.h>
+#include <trace/events/kvm.h>
+
+#include "trace.h"
+#include "irq.h"
+#include "fpu.h"
+
+#define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, HV_VCPUS_PER_SPARSE_BANK)
+
+/*
+ * As per Hyper-V TLFS, extended hypercalls start from 0x8001
+ * (HvExtCallQueryCapabilities). The response to this hypercall is a 64-bit
+ * value where each bit indicates which extended hypercalls are available
+ * besides HvExtCallQueryCapabilities.
+ *
+ * 0x8001 - First extended hypercall, HvExtCallQueryCapabilities, no bit
+ * assigned.
+ *
+ * 0x8002 - Bit 0
+ * 0x8003 - Bit 1
+ * ..
+ * 0x8041 - Bit 63
+ *
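+ * For example, hypercall code 0x8005 corresponds to bit 3.
+ *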
+ * Therefore, HV_EXT_CALL_MAX = 0x8001 + 64
+ */
+#define HV_EXT_CALL_MAX (HV_EXT_CALL_QUERY_CAPABILITIES + 64)
+
+static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
+ bool vcpu_kick);
+
+static inline u64 synic_read_sint(struct kvm_vcpu_hv_synic *synic, int sint)
+{
+ return atomic64_read(&synic->sint[sint]);
+}
+
+static inline int synic_get_sint_vector(u64 sint_value)
+{
+ if (sint_value & HV_SYNIC_SINT_MASKED)
+ return -1;
+ return sint_value & HV_SYNIC_SINT_VECTOR_MASK;
+}
+
+static bool synic_has_vector_connected(struct kvm_vcpu_hv_synic *synic,
+ int vector)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(synic->sint); i++) {
+ if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector)
+ return true;
+ }
+ return false;
+}
+
+static bool synic_has_vector_auto_eoi(struct kvm_vcpu_hv_synic *synic,
+ int vector)
+{
+ int i;
+ u64 sint_value;
+
+ for (i = 0; i < ARRAY_SIZE(synic->sint); i++) {
+ sint_value = synic_read_sint(synic, i);
+ if (synic_get_sint_vector(sint_value) == vector &&
+ sint_value & HV_SYNIC_SINT_AUTO_EOI)
+ return true;
+ }
+ return false;
+}
+
+static void synic_update_vector(struct kvm_vcpu_hv_synic *synic,
+ int vector)
+{
+ struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
+ struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
+ bool auto_eoi_old, auto_eoi_new;
+
+ if (vector < HV_SYNIC_FIRST_VALID_VECTOR)
+ return;
+
+ if (synic_has_vector_connected(synic, vector))
+ __set_bit(vector, synic->vec_bitmap);
+ else
+ __clear_bit(vector, synic->vec_bitmap);
+
+ auto_eoi_old = !bitmap_empty(synic->auto_eoi_bitmap, 256);
+
+ if (synic_has_vector_auto_eoi(synic, vector))
+ __set_bit(vector, synic->auto_eoi_bitmap);
+ else
+ __clear_bit(vector, synic->auto_eoi_bitmap);
+
+ auto_eoi_new = !bitmap_empty(synic->auto_eoi_bitmap, 256);
+
+ if (auto_eoi_old == auto_eoi_new)
+ return;
+
+ if (!enable_apicv)
+ return;
+
+ down_write(&vcpu->kvm->arch.apicv_update_lock);
+
+ if (auto_eoi_new)
+ hv->synic_auto_eoi_used++;
+ else
+ hv->synic_auto_eoi_used--;
+
+ /*
+ * Inhibit APICv if any vCPU is using SynIC's AutoEOI, which relies on
+ * the hypervisor to manually inject IRQs.
+ */
+ __kvm_set_or_clear_apicv_inhibit(vcpu->kvm,
+ APICV_INHIBIT_REASON_HYPERV,
+ !!hv->synic_auto_eoi_used);
+
+ up_write(&vcpu->kvm->arch.apicv_update_lock);
+}
+
+static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint,
+ u64 data, bool host)
+{
+ int vector, old_vector;
+ bool masked;
+
+ vector = data & HV_SYNIC_SINT_VECTOR_MASK;
+ masked = data & HV_SYNIC_SINT_MASKED;
+
+ /*
+ * Valid vectors are 16-255; however, nested Hyper-V attempts to write
+ * the default '0x10000' value on boot and this should not #GP. We also
+ * need to allow zero-initializing the register from the host.
+ */
+ if (vector < HV_SYNIC_FIRST_VALID_VECTOR && !host && !masked)
+ return 1;
+ /*
+ * Guest may configure multiple SINTs to use the same vector, so
+ * we maintain a bitmap of vectors handled by synic, and a
+ * bitmap of vectors with auto-eoi behavior. The bitmaps are
+ * updated here, and atomically queried on fast paths.
+ */
+ old_vector = synic_read_sint(synic, sint) & HV_SYNIC_SINT_VECTOR_MASK;
+
+ atomic64_set(&synic->sint[sint], data);
+
+ synic_update_vector(synic, old_vector);
+
+ synic_update_vector(synic, vector);
+
+ /* Load SynIC vectors into EOI exit bitmap */
+ kvm_make_request(KVM_REQ_SCAN_IOAPIC, hv_synic_to_vcpu(synic));
+ return 0;
+}
+
+static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx)
+{
+ struct kvm_vcpu *vcpu = NULL;
+ unsigned long i;
+
+ if (vpidx >= KVM_MAX_VCPUS)
+ return NULL;
+
+ vcpu = kvm_get_vcpu(kvm, vpidx);
+ if (vcpu && kvm_hv_get_vpindex(vcpu) == vpidx)
+ return vcpu;
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ if (kvm_hv_get_vpindex(vcpu) == vpidx)
+ return vcpu;
+ return NULL;
+}
+
+static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vpidx)
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_vcpu_hv_synic *synic;
+
+ vcpu = get_vcpu_by_vpidx(kvm, vpidx);
+ if (!vcpu || !to_hv_vcpu(vcpu))
+ return NULL;
+ synic = to_hv_synic(vcpu);
+ return (synic->active) ? synic : NULL;
+}
+
+static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu);
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ struct kvm_vcpu_hv_stimer *stimer;
+ int gsi, idx;
+
+ trace_kvm_hv_notify_acked_sint(vcpu->vcpu_id, sint);
+
+ /* Try to deliver pending Hyper-V SynIC timers messages */
+ for (idx = 0; idx < ARRAY_SIZE(hv_vcpu->stimer); idx++) {
+ stimer = &hv_vcpu->stimer[idx];
+ if (stimer->msg_pending && stimer->config.enable &&
+ !stimer->config.direct_mode &&
+ stimer->config.sintx == sint)
+ stimer_mark_pending(stimer, false);
+ }
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ gsi = atomic_read(&synic->sint_to_gsi[sint]);
+ if (gsi != -1)
+ kvm_notify_acked_gsi(kvm, gsi);
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+}
+
+static void synic_exit(struct kvm_vcpu_hv_synic *synic, u32 msr)
+{
+ struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNIC;
+ hv_vcpu->exit.u.synic.msr = msr;
+ hv_vcpu->exit.u.synic.control = synic->control;
+ hv_vcpu->exit.u.synic.evt_page = synic->evt_page;
+ hv_vcpu->exit.u.synic.msg_page = synic->msg_page;
+
+ kvm_make_request(KVM_REQ_HV_EXIT, vcpu);
+}
+
+static int synic_set_msr(struct kvm_vcpu_hv_synic *synic,
+ u32 msr, u64 data, bool host)
+{
+ struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
+ int ret;
+
+ if (!synic->active && (!host || data))
+ return 1;
+
+ trace_kvm_hv_synic_set_msr(vcpu->vcpu_id, msr, data, host);
+
+ ret = 0;
+ switch (msr) {
+ case HV_X64_MSR_SCONTROL:
+ synic->control = data;
+ if (!host)
+ synic_exit(synic, msr);
+ break;
+ case HV_X64_MSR_SVERSION:
+ if (!host) {
+ ret = 1;
+ break;
+ }
+ synic->version = data;
+ break;
+ case HV_X64_MSR_SIEFP:
+ if ((data & HV_SYNIC_SIEFP_ENABLE) && !host &&
+ !synic->dont_zero_synic_pages)
+ if (kvm_clear_guest(vcpu->kvm,
+ data & PAGE_MASK, PAGE_SIZE)) {
+ ret = 1;
+ break;
+ }
+ synic->evt_page = data;
+ if (!host)
+ synic_exit(synic, msr);
+ break;
+ case HV_X64_MSR_SIMP:
+ if ((data & HV_SYNIC_SIMP_ENABLE) && !host &&
+ !synic->dont_zero_synic_pages)
+ if (kvm_clear_guest(vcpu->kvm,
+ data & PAGE_MASK, PAGE_SIZE)) {
+ ret = 1;
+ break;
+ }
+ synic->msg_page = data;
+ if (!host)
+ synic_exit(synic, msr);
+ break;
+ case HV_X64_MSR_EOM: {
+ int i;
+
+ if (!synic->active)
+ break;
+
+ for (i = 0; i < ARRAY_SIZE(synic->sint); i++)
+ kvm_hv_notify_acked_sint(vcpu, i);
+ break;
+ }
+ case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
+ ret = synic_set_sint(synic, msr - HV_X64_MSR_SINT0, data, host);
+ break;
+ default:
+ ret = 1;
+ break;
+ }
+ return ret;
+}
+
+static bool kvm_hv_is_syndbg_enabled(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ return hv_vcpu->cpuid_cache.syndbg_cap_eax &
+ HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING;
+}
+
+static int kvm_hv_syndbg_complete_userspace(struct kvm_vcpu *vcpu)
+{
+ struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
+
+ if (vcpu->run->hyperv.u.syndbg.msr == HV_X64_MSR_SYNDBG_CONTROL)
+ hv->hv_syndbg.control.status =
+ vcpu->run->hyperv.u.syndbg.status;
+ return 1;
+}
+
+static void syndbg_exit(struct kvm_vcpu *vcpu, u32 msr)
+{
+ struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu);
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNDBG;
+ hv_vcpu->exit.u.syndbg.msr = msr;
+ hv_vcpu->exit.u.syndbg.control = syndbg->control.control;
+ hv_vcpu->exit.u.syndbg.send_page = syndbg->control.send_page;
+ hv_vcpu->exit.u.syndbg.recv_page = syndbg->control.recv_page;
+ hv_vcpu->exit.u.syndbg.pending_page = syndbg->control.pending_page;
+ vcpu->arch.complete_userspace_io =
+ kvm_hv_syndbg_complete_userspace;
+
+ kvm_make_request(KVM_REQ_HV_EXIT, vcpu);
+}
+
+static int syndbg_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
+{
+ struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu);
+
+ if (!kvm_hv_is_syndbg_enabled(vcpu) && !host)
+ return 1;
+
+ trace_kvm_hv_syndbg_set_msr(vcpu->vcpu_id,
+ to_hv_vcpu(vcpu)->vp_index, msr, data);
+ switch (msr) {
+ case HV_X64_MSR_SYNDBG_CONTROL:
+ syndbg->control.control = data;
+ if (!host)
+ syndbg_exit(vcpu, msr);
+ break;
+ case HV_X64_MSR_SYNDBG_STATUS:
+ syndbg->control.status = data;
+ break;
+ case HV_X64_MSR_SYNDBG_SEND_BUFFER:
+ syndbg->control.send_page = data;
+ break;
+ case HV_X64_MSR_SYNDBG_RECV_BUFFER:
+ syndbg->control.recv_page = data;
+ break;
+ case HV_X64_MSR_SYNDBG_PENDING_BUFFER:
+ syndbg->control.pending_page = data;
+ if (!host)
+ syndbg_exit(vcpu, msr);
+ break;
+ case HV_X64_MSR_SYNDBG_OPTIONS:
+ syndbg->options = data;
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+static int syndbg_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
+{
+ struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu);
+
+ if (!kvm_hv_is_syndbg_enabled(vcpu) && !host)
+ return 1;
+
+ switch (msr) {
+ case HV_X64_MSR_SYNDBG_CONTROL:
+ *pdata = syndbg->control.control;
+ break;
+ case HV_X64_MSR_SYNDBG_STATUS:
+ *pdata = syndbg->control.status;
+ break;
+ case HV_X64_MSR_SYNDBG_SEND_BUFFER:
+ *pdata = syndbg->control.send_page;
+ break;
+ case HV_X64_MSR_SYNDBG_RECV_BUFFER:
+ *pdata = syndbg->control.recv_page;
+ break;
+ case HV_X64_MSR_SYNDBG_PENDING_BUFFER:
+ *pdata = syndbg->control.pending_page;
+ break;
+ case HV_X64_MSR_SYNDBG_OPTIONS:
+ *pdata = syndbg->options;
+ break;
+ default:
+ break;
+ }
+
+ trace_kvm_hv_syndbg_get_msr(vcpu->vcpu_id, kvm_hv_get_vpindex(vcpu), msr, *pdata);
+
+ return 0;
+}
+
+static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata,
+ bool host)
+{
+ int ret;
+
+ if (!synic->active && !host)
+ return 1;
+
+ ret = 0;
+ switch (msr) {
+ case HV_X64_MSR_SCONTROL:
+ *pdata = synic->control;
+ break;
+ case HV_X64_MSR_SVERSION:
+ *pdata = synic->version;
+ break;
+ case HV_X64_MSR_SIEFP:
+ *pdata = synic->evt_page;
+ break;
+ case HV_X64_MSR_SIMP:
+ *pdata = synic->msg_page;
+ break;
+ case HV_X64_MSR_EOM:
+ *pdata = 0;
+ break;
+ case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
+ *pdata = atomic64_read(&synic->sint[msr - HV_X64_MSR_SINT0]);
+ break;
+ default:
+ ret = 1;
+ break;
+ }
+ return ret;
+}
+
+static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint)
+{
+ struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
+ struct kvm_lapic_irq irq;
+ int ret, vector;
+
+ if (KVM_BUG_ON(!lapic_in_kernel(vcpu), vcpu->kvm))
+ return -EINVAL;
+
+ if (sint >= ARRAY_SIZE(synic->sint))
+ return -EINVAL;
+
+ vector = synic_get_sint_vector(synic_read_sint(synic, sint));
+ if (vector < 0)
+ return -ENOENT;
+
+ memset(&irq, 0, sizeof(irq));
+ irq.shorthand = APIC_DEST_SELF;
+ irq.dest_mode = APIC_DEST_PHYSICAL;
+ irq.delivery_mode = APIC_DM_FIXED;
+ irq.vector = vector;
+ irq.level = 1;
+
+ ret = kvm_irq_delivery_to_apic(vcpu->kvm, vcpu->arch.apic, &irq, NULL);
+ trace_kvm_hv_synic_set_irq(vcpu->vcpu_id, sint, irq.vector, ret);
+ return ret;
+}
+
+int kvm_hv_synic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status)
+{
+ struct kvm_vcpu_hv_synic *synic;
+
+ if (!level)
+ return -1;
+
+ synic = synic_get(kvm, e->hv_sint.vcpu);
+ if (!synic)
+ return -EINVAL;
+
+ return synic_set_irq(synic, e->hv_sint.sint);
+}
+
+void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector)
+{
+ struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu);
+ int i;
+
+ trace_kvm_hv_synic_send_eoi(vcpu->vcpu_id, vector);
+
+ for (i = 0; i < ARRAY_SIZE(synic->sint); i++)
+ if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector)
+ kvm_hv_notify_acked_sint(vcpu, i);
+}
+
+static int kvm_hv_set_sint_gsi(struct kvm *kvm, u32 vpidx, u32 sint, int gsi)
+{
+ struct kvm_vcpu_hv_synic *synic;
+
+ synic = synic_get(kvm, vpidx);
+ if (!synic)
+ return -EINVAL;
+
+ if (sint >= ARRAY_SIZE(synic->sint_to_gsi))
+ return -EINVAL;
+
+ atomic_set(&synic->sint_to_gsi[sint], gsi);
+ return 0;
+}
+
+void kvm_hv_irq_routing_update(struct kvm *kvm)
+{
+ struct kvm_irq_routing_table *irq_rt;
+ struct kvm_kernel_irq_routing_entry *e;
+ u32 gsi;
+
+ irq_rt = srcu_dereference_check(kvm->irq_routing, &kvm->irq_srcu,
+ lockdep_is_held(&kvm->irq_lock));
+
+ for (gsi = 0; gsi < irq_rt->nr_rt_entries; gsi++) {
+ hlist_for_each_entry(e, &irq_rt->map[gsi], link) {
+ if (e->type == KVM_IRQ_ROUTING_HV_SINT)
+ kvm_hv_set_sint_gsi(kvm, e->hv_sint.vcpu,
+ e->hv_sint.sint, gsi);
+ }
+ }
+}
+
+static void synic_init(struct kvm_vcpu_hv_synic *synic)
+{
+ int i;
+
+ memset(synic, 0, sizeof(*synic));
+ synic->version = HV_SYNIC_VERSION_1;
+ for (i = 0; i < ARRAY_SIZE(synic->sint); i++) {
+ atomic64_set(&synic->sint[i], HV_SYNIC_SINT_MASKED);
+ atomic_set(&synic->sint_to_gsi[i], -1);
+ }
+}
+
+static u64 get_time_ref_counter(struct kvm *kvm)
+{
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+ struct kvm_vcpu *vcpu;
+ u64 tsc;
+
+ /*
+ * Fall back to get_kvmclock_ns() when TSC page hasn't been set up,
+ * is broken, disabled or being updated.
+ */
+ if (hv->hv_tsc_page_status != HV_TSC_PAGE_SET)
+ return div_u64(get_kvmclock_ns(kvm), 100);
+
+ vcpu = kvm_get_vcpu(kvm, 0);
+ tsc = kvm_read_l1_tsc(vcpu, rdtsc());
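+ /*
+ * TLFS reference-TSC formula: time = ((tsc * tsc_scale) >> 64) + tsc_offset,
+ * in 100ns units.
+ */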
+ return mul_u64_u64_shr(tsc, hv->tsc_ref.tsc_scale, 64)
+ + hv->tsc_ref.tsc_offset;
+}
+
+static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
+ bool vcpu_kick)
+{
+ struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
+
+ set_bit(stimer->index,
+ to_hv_vcpu(vcpu)->stimer_pending_bitmap);
+ kvm_make_request(KVM_REQ_HV_STIMER, vcpu);
+ if (vcpu_kick)
+ kvm_vcpu_kick(vcpu);
+}
+
+static void stimer_cleanup(struct kvm_vcpu_hv_stimer *stimer)
+{
+ struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
+
+ trace_kvm_hv_stimer_cleanup(hv_stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index);
+
+ hrtimer_cancel(&stimer->timer);
+ clear_bit(stimer->index,
+ to_hv_vcpu(vcpu)->stimer_pending_bitmap);
+ stimer->msg_pending = false;
+ stimer->exp_time = 0;
+}
+
+static enum hrtimer_restart stimer_timer_callback(struct hrtimer *timer)
+{
+ struct kvm_vcpu_hv_stimer *stimer;
+
+ stimer = container_of(timer, struct kvm_vcpu_hv_stimer, timer);
+ trace_kvm_hv_stimer_callback(hv_stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index);
+ stimer_mark_pending(stimer, true);
+
+ return HRTIMER_NORESTART;
+}
+
+/*
+ * stimer_start() assumptions:
+ * a) stimer->count is not equal to 0
+ * b) stimer->config has HV_STIMER_ENABLE flag
+ */
+static int stimer_start(struct kvm_vcpu_hv_stimer *stimer)
+{
+ u64 time_now;
+ ktime_t ktime_now;
+
+ time_now = get_time_ref_counter(hv_stimer_to_vcpu(stimer)->kvm);
+ ktime_now = ktime_get();
+
+ if (stimer->config.periodic) {
+ if (stimer->exp_time) {
+ if (time_now >= stimer->exp_time) {
+ u64 remainder;
+
+ div64_u64_rem(time_now - stimer->exp_time,
+ stimer->count, &remainder);
+ stimer->exp_time =
+ time_now + (stimer->count - remainder);
+ }
+ } else
+ stimer->exp_time = time_now + stimer->count;
+
+ trace_kvm_hv_stimer_start_periodic(
+ hv_stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index,
+ time_now, stimer->exp_time);
+
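+ /* Hyper-V timer counts are in 100ns units; convert to ns for the hrtimer. */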
+ hrtimer_start(&stimer->timer,
+ ktime_add_ns(ktime_now,
+ 100 * (stimer->exp_time - time_now)),
+ HRTIMER_MODE_ABS);
+ return 0;
+ }
+ stimer->exp_time = stimer->count;
+ if (time_now >= stimer->count) {
+ /*
+ * Expire timer according to Hypervisor Top-Level Functional
+ * specification v4 (15.3.1):
+ * "If a one shot is enabled and the specified count is in
+ * the past, it will expire immediately."
+ */
+ stimer_mark_pending(stimer, false);
+ return 0;
+ }
+
+ trace_kvm_hv_stimer_start_one_shot(hv_stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index,
+ time_now, stimer->count);
+
+ hrtimer_start(&stimer->timer,
+ ktime_add_ns(ktime_now, 100 * (stimer->count - time_now)),
+ HRTIMER_MODE_ABS);
+ return 0;
+}
+
+static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config,
+ bool host)
+{
+ union hv_stimer_config new_config = {.as_uint64 = config},
+ old_config = {.as_uint64 = stimer->config.as_uint64};
+ struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu);
+
+ if (!synic->active && (!host || config))
+ return 1;
+
+ if (unlikely(!host && hv_vcpu->enforce_cpuid && new_config.direct_mode &&
+ !(hv_vcpu->cpuid_cache.features_edx &
+ HV_STIMER_DIRECT_MODE_AVAILABLE)))
+ return 1;
+
+ trace_kvm_hv_stimer_set_config(hv_stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index, config, host);
+
+ stimer_cleanup(stimer);
+ if (old_config.enable &&
+ !new_config.direct_mode && new_config.sintx == 0)
+ new_config.enable = 0;
+ stimer->config.as_uint64 = new_config.as_uint64;
+
+ if (stimer->config.enable)
+ stimer_mark_pending(stimer, false);
+
+ return 0;
+}
+
+static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count,
+ bool host)
+{
+ struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
+ struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu);
+
+ if (!synic->active && (!host || count))
+ return 1;
+
+ trace_kvm_hv_stimer_set_count(hv_stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index, count, host);
+
+ stimer_cleanup(stimer);
+ stimer->count = count;
+ if (!host) {
+ if (stimer->count == 0)
+ stimer->config.enable = 0;
+ else if (stimer->config.auto_enable)
+ stimer->config.enable = 1;
+ }
+
+ if (stimer->config.enable)
+ stimer_mark_pending(stimer, false);
+
+ return 0;
+}
+
+static int stimer_get_config(struct kvm_vcpu_hv_stimer *stimer, u64 *pconfig)
+{
+ *pconfig = stimer->config.as_uint64;
+ return 0;
+}
+
+static int stimer_get_count(struct kvm_vcpu_hv_stimer *stimer, u64 *pcount)
+{
+ *pcount = stimer->count;
+ return 0;
+}
+
+static int synic_deliver_msg(struct kvm_vcpu_hv_synic *synic, u32 sint,
+ struct hv_message *src_msg, bool no_retry)
+{
+ struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
+ int msg_off = offsetof(struct hv_message_page, sint_message[sint]);
+ gfn_t msg_page_gfn;
+ struct hv_message_header hv_hdr;
+ int r;
+
+ if (!(synic->msg_page & HV_SYNIC_SIMP_ENABLE))
+ return -ENOENT;
+
+ msg_page_gfn = synic->msg_page >> PAGE_SHIFT;
+
+ /*
+ * Strictly following the spec-mandated ordering would require setting
+ * .msg_pending before checking .message_type. However, this function
+ * is only called in vcpu context so the entire update is atomic from
+ * guest POV and thus the exact order here doesn't matter.
+ */
+ r = kvm_vcpu_read_guest_page(vcpu, msg_page_gfn, &hv_hdr.message_type,
+ msg_off + offsetof(struct hv_message,
+ header.message_type),
+ sizeof(hv_hdr.message_type));
+ if (r < 0)
+ return r;
+
+ if (hv_hdr.message_type != HVMSG_NONE) {
+ if (no_retry)
+ return 0;
+
+ hv_hdr.message_flags.msg_pending = 1;
+ r = kvm_vcpu_write_guest_page(vcpu, msg_page_gfn,
+ &hv_hdr.message_flags,
+ msg_off +
+ offsetof(struct hv_message,
+ header.message_flags),
+ sizeof(hv_hdr.message_flags));
+ if (r < 0)
+ return r;
+ return -EAGAIN;
+ }
+
+ r = kvm_vcpu_write_guest_page(vcpu, msg_page_gfn, src_msg, msg_off,
+ sizeof(src_msg->header) +
+ src_msg->header.payload_size);
+ if (r < 0)
+ return r;
+
+ r = synic_set_irq(synic, sint);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return -EFAULT;
+ return 0;
+}
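
For context on the -EAGAIN path above: the hypervisor leaves msg_pending set in the guest's message header and retries delivery later; per the SynIC protocol, the guest is expected to mark the slot free and write the EOM MSR so the retry can succeed. A rough sketch of that guest-side consumer (purely hypothetical guest code under the assumption of a well-behaved guest, not KVM code) could be:

static void guest_consume_sint_message(struct hv_message *slot)
{
	bool pending = slot->header.message_flags.msg_pending;

	/* ... handle slot->u.payload ... */

	WRITE_ONCE(slot->header.message_type, HVMSG_NONE);
	mb();	/* free the slot before telling the hypervisor to retry */
	if (pending)
		wrmsrl(HV_X64_MSR_EOM, 0);
}
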
+
+static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer)
+{
+ struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
+ struct hv_message *msg = &stimer->msg;
+ struct hv_timer_message_payload *payload =
+ (struct hv_timer_message_payload *)&msg->u.payload;
+
+ /*
+ * To avoid piling up periodic ticks, don't retry message
+ * delivery for them (within "lazy" lost ticks policy).
+ */
+ bool no_retry = stimer->config.periodic;
+
+ payload->expiration_time = stimer->exp_time;
+ payload->delivery_time = get_time_ref_counter(vcpu->kvm);
+ return synic_deliver_msg(to_hv_synic(vcpu),
+ stimer->config.sintx, msg,
+ no_retry);
+}
+
+static int stimer_notify_direct(struct kvm_vcpu_hv_stimer *stimer)
+{
+ struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
+ struct kvm_lapic_irq irq = {
+ .delivery_mode = APIC_DM_FIXED,
+ .vector = stimer->config.apic_vector
+ };
+
+ if (lapic_in_kernel(vcpu))
+ return !kvm_apic_set_irq(vcpu, &irq, NULL);
+ return 0;
+}
+
+static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer)
+{
+ int r, direct = stimer->config.direct_mode;
+
+ stimer->msg_pending = true;
+ if (!direct)
+ r = stimer_send_msg(stimer);
+ else
+ r = stimer_notify_direct(stimer);
+ trace_kvm_hv_stimer_expiration(hv_stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index, direct, r);
+ if (!r) {
+ stimer->msg_pending = false;
+ if (!(stimer->config.periodic))
+ stimer->config.enable = 0;
+ }
+}
+
+void kvm_hv_process_stimers(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ struct kvm_vcpu_hv_stimer *stimer;
+ u64 time_now, exp_time;
+ int i;
+
+ if (!hv_vcpu)
+ return;
+
+ for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
+ if (test_and_clear_bit(i, hv_vcpu->stimer_pending_bitmap)) {
+ stimer = &hv_vcpu->stimer[i];
+ if (stimer->config.enable) {
+ exp_time = stimer->exp_time;
+
+ if (exp_time) {
+ time_now =
+ get_time_ref_counter(vcpu->kvm);
+ if (time_now >= exp_time)
+ stimer_expiration(stimer);
+ }
+
+ if ((stimer->config.enable) &&
+ stimer->count) {
+ if (!stimer->msg_pending)
+ stimer_start(stimer);
+ } else
+ stimer_cleanup(stimer);
+ }
+ }
+}
+
+void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ int i;
+
+ if (!hv_vcpu)
+ return;
+
+ for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
+ stimer_cleanup(&hv_vcpu->stimer[i]);
+
+ kfree(hv_vcpu);
+ vcpu->arch.hyperv = NULL;
+}
+
+bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ if (!hv_vcpu)
+ return false;
+
+ if (!(hv_vcpu->hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE))
+ return false;
+ return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_hv_assist_page_enabled);
+
+int kvm_hv_get_assist_page(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ if (!hv_vcpu || !kvm_hv_assist_page_enabled(vcpu))
+ return -EFAULT;
+
+ return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data,
+ &hv_vcpu->vp_assist_page, sizeof(struct hv_vp_assist_page));
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_hv_get_assist_page);
+
+static void stimer_prepare_msg(struct kvm_vcpu_hv_stimer *stimer)
+{
+ struct hv_message *msg = &stimer->msg;
+ struct hv_timer_message_payload *payload =
+ (struct hv_timer_message_payload *)&msg->u.payload;
+
+ memset(&msg->header, 0, sizeof(msg->header));
+ msg->header.message_type = HVMSG_TIMER_EXPIRED;
+ msg->header.payload_size = sizeof(*payload);
+
+ payload->timer_index = stimer->index;
+ payload->expiration_time = 0;
+ payload->delivery_time = 0;
+}
+
+static void stimer_init(struct kvm_vcpu_hv_stimer *stimer, int timer_index)
+{
+ memset(stimer, 0, sizeof(*stimer));
+ stimer->index = timer_index;
+ hrtimer_setup(&stimer->timer, stimer_timer_callback, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ stimer_prepare_msg(stimer);
+}
+
+int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ int i;
+
+ if (hv_vcpu)
+ return 0;
+
+ hv_vcpu = kzalloc(sizeof(struct kvm_vcpu_hv), GFP_KERNEL_ACCOUNT);
+ if (!hv_vcpu)
+ return -ENOMEM;
+
+ vcpu->arch.hyperv = hv_vcpu;
+ hv_vcpu->vcpu = vcpu;
+
+ synic_init(&hv_vcpu->synic);
+
+ bitmap_zero(hv_vcpu->stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT);
+ for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
+ stimer_init(&hv_vcpu->stimer[i], i);
+
+ hv_vcpu->vp_index = vcpu->vcpu_idx;
+
+ for (i = 0; i < HV_NR_TLB_FLUSH_FIFOS; i++) {
+ INIT_KFIFO(hv_vcpu->tlb_flush_fifo[i].entries);
+ spin_lock_init(&hv_vcpu->tlb_flush_fifo[i].write_lock);
+ }
+
+ return 0;
+}
+
+int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages)
+{
+ struct kvm_vcpu_hv_synic *synic;
+ int r;
+
+ r = kvm_hv_vcpu_init(vcpu);
+ if (r)
+ return r;
+
+ synic = to_hv_synic(vcpu);
+
+ synic->active = true;
+ synic->dont_zero_synic_pages = dont_zero_synic_pages;
+ synic->control = HV_SYNIC_CONTROL_ENABLE;
+ return 0;
+}
+
+static bool kvm_hv_msr_partition_wide(u32 msr)
+{
+ bool r = false;
+
+ switch (msr) {
+ case HV_X64_MSR_GUEST_OS_ID:
+ case HV_X64_MSR_HYPERCALL:
+ case HV_X64_MSR_REFERENCE_TSC:
+ case HV_X64_MSR_TIME_REF_COUNT:
+ case HV_X64_MSR_CRASH_CTL:
+ case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+ case HV_X64_MSR_RESET:
+ case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_STATUS:
+ case HV_X64_MSR_TSC_INVARIANT_CONTROL:
+ case HV_X64_MSR_SYNDBG_OPTIONS:
+ case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
+ r = true;
+ break;
+ }
+
+ return r;
+}
+
+static int kvm_hv_msr_get_crash_data(struct kvm *kvm, u32 index, u64 *pdata)
+{
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+ size_t size = ARRAY_SIZE(hv->hv_crash_param);
+
+ if (WARN_ON_ONCE(index >= size))
+ return -EINVAL;
+
+ *pdata = hv->hv_crash_param[array_index_nospec(index, size)];
+ return 0;
+}
+
+static int kvm_hv_msr_get_crash_ctl(struct kvm *kvm, u64 *pdata)
+{
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+
+ *pdata = hv->hv_crash_ctl;
+ return 0;
+}
+
+static int kvm_hv_msr_set_crash_ctl(struct kvm *kvm, u64 data)
+{
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+
+ hv->hv_crash_ctl = data & HV_CRASH_CTL_CRASH_NOTIFY;
+
+ return 0;
+}
+
+static int kvm_hv_msr_set_crash_data(struct kvm *kvm, u32 index, u64 data)
+{
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+ size_t size = ARRAY_SIZE(hv->hv_crash_param);
+
+ if (WARN_ON_ONCE(index >= size))
+ return -EINVAL;
+
+ hv->hv_crash_param[array_index_nospec(index, size)] = data;
+ return 0;
+}
+
+/*
+ * The kvmclock and Hyper-V TSC page use similar formulas, and converting
+ * between them is possible:
+ *
+ * kvmclock formula:
+ * nsec = (ticks - tsc_timestamp) * tsc_to_system_mul * 2^(tsc_shift-32)
+ * + system_time
+ *
+ * Hyper-V formula:
+ * nsec/100 = ticks * scale / 2^64 + offset
+ *
+ * When tsc_timestamp = system_time = 0, offset is zero in the Hyper-V formula.
+ * By dividing the kvmclock formula by 100 and equating what's left we get:
+ * ticks * scale / 2^64 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ * scale / 2^64 = tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ * scale = tsc_to_system_mul * 2^(32+tsc_shift) / 100
+ *
+ * Now expand the kvmclock formula and divide by 100:
+ * nsec = ticks * tsc_to_system_mul * 2^(tsc_shift-32)
+ * - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32)
+ * + system_time
+ * nsec/100 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ * - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ * + system_time / 100
+ *
+ * Replace tsc_to_system_mul * 2^(tsc_shift-32) / 100 by scale / 2^64:
+ * nsec/100 = ticks * scale / 2^64
+ * - tsc_timestamp * scale / 2^64
+ * + system_time / 100
+ *
+ * Equate with the Hyper-V formula so that ticks * scale / 2^64 cancels out:
+ * offset = system_time / 100 - tsc_timestamp * scale / 2^64
+ *
+ * These two equivalencies are implemented in this function.
+ */
+static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock,
+ struct ms_hyperv_tsc_page *tsc_ref)
+{
+ u64 max_mul;
+
+ if (!(hv_clock->flags & PVCLOCK_TSC_STABLE_BIT))
+ return false;
+
+ /*
+ * check if scale would overflow, if so we use the time ref counter
+ * tsc_to_system_mul * 2^(tsc_shift+32) / 100 >= 2^64
+ * tsc_to_system_mul / 100 >= 2^(32-tsc_shift)
+ * tsc_to_system_mul >= 100 * 2^(32-tsc_shift)
+ */
+ max_mul = 100ull << (32 - hv_clock->tsc_shift);
+ if (hv_clock->tsc_to_system_mul >= max_mul)
+ return false;
+
+ /*
+ * Otherwise compute the scale and offset according to the formulas
+ * derived above.
+ */
+ tsc_ref->tsc_scale =
+ mul_u64_u32_div(1ULL << (32 + hv_clock->tsc_shift),
+ hv_clock->tsc_to_system_mul,
+ 100);
+
+ tsc_ref->tsc_offset = hv_clock->system_time;
+ do_div(tsc_ref->tsc_offset, 100);
+ tsc_ref->tsc_offset -=
+ mul_u64_u64_shr(hv_clock->tsc_timestamp, tsc_ref->tsc_scale, 64);
+ return true;
+}
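
A quick numeric sanity check of the scale formula above, using a hypothetical 100 MHz guest TSC (10 ns per tick, e.g. tsc_shift = 4 and tsc_to_system_mul = 10 * 2^28): one TSC tick should advance the reference time by 0.1 of a 100 ns unit, i.e. scale should be 2^64 / 10. A throwaway userspace check (not kernel code) confirms it:

#include <stdio.h>

int main(void)
{
	unsigned __int128 mul = 2684354560u;	/* 10 * 2^28: ~100 MHz guest TSC */
	int tsc_shift = 4;
	unsigned __int128 scale = (mul << (32 + tsc_shift)) / 100;

	/* Prints 1844674407370955161, i.e. 2^64 / 10 rounded down. */
	printf("scale = %llu\n", (unsigned long long)scale);
	return 0;
}
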
+
+/*
+ * Don't touch TSC page values if the guest has opted for TSC emulation after
+ * migration. KVM doesn't fully support reenlightenment notifications and TSC
+ * access emulation and Hyper-V is known to expect the values in TSC page to
+ * stay constant before TSC access emulation is disabled from guest side
+ * (HV_X64_MSR_TSC_EMULATION_STATUS). KVM userspace is expected to preserve TSC
+ * frequency and guest visible TSC value across migration (and prevent it when
+ * TSC scaling is unsupported).
+ */
+static inline bool tsc_page_update_unsafe(struct kvm_hv *hv)
+{
+ return (hv->hv_tsc_page_status != HV_TSC_PAGE_GUEST_CHANGED) &&
+ hv->hv_tsc_emulation_control;
+}
+
+void kvm_hv_setup_tsc_page(struct kvm *kvm,
+ struct pvclock_vcpu_time_info *hv_clock)
+{
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+ u32 tsc_seq;
+ u64 gfn;
+
+ BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence));
+ BUILD_BUG_ON(offsetof(struct ms_hyperv_tsc_page, tsc_sequence) != 0);
+
+ guard(mutex)(&hv->hv_lock);
+
+ if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN ||
+ hv->hv_tsc_page_status == HV_TSC_PAGE_SET ||
+ hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET)
+ return;
+
+ if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
+ return;
+
+ gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
+ /*
+ * Because the TSC parameters only vary when there is a
+ * change in the master clock, do not bother with caching.
+ */
+ if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn),
+ &tsc_seq, sizeof(tsc_seq))))
+ goto out_err;
+
+ if (tsc_seq && tsc_page_update_unsafe(hv)) {
+ if (kvm_read_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref)))
+ goto out_err;
+
+ hv->hv_tsc_page_status = HV_TSC_PAGE_SET;
+ return;
+ }
+
+ /*
+ * While we're computing and writing the parameters, force the
+ * guest to use the time reference count MSR.
+ */
+ hv->tsc_ref.tsc_sequence = 0;
+ if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
+ &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)))
+ goto out_err;
+
+ if (!compute_tsc_page_parameters(hv_clock, &hv->tsc_ref))
+ goto out_err;
+
+ /* Ensure sequence is zero before writing the rest of the struct. */
+ smp_wmb();
+ if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref)))
+ goto out_err;
+
+ /*
+ * Now switch to the TSC page mechanism by writing the sequence.
+ */
+ tsc_seq++;
+ if (tsc_seq == 0xFFFFFFFF || tsc_seq == 0)
+ tsc_seq = 1;
+
+ /* Write the struct entirely before the non-zero sequence. */
+ smp_wmb();
+
+ hv->tsc_ref.tsc_sequence = tsc_seq;
+ if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
+ &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)))
+ goto out_err;
+
+ hv->hv_tsc_page_status = HV_TSC_PAGE_SET;
+ return;
+
+out_err:
+ hv->hv_tsc_page_status = HV_TSC_PAGE_BROKEN;
+}
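
The zero/non-zero tsc_sequence dance above is one half of a seqcount-like protocol; the other half lives in the guest. A simplified sketch of the expected guest-side reader (hypothetical helper, shown only to illustrate why the two smp_wmb() barriers and the sequence ordering matter) might be:

static bool read_tsc_page_time(const struct ms_hyperv_tsc_page *tsc_pg,
			       u64 tsc, u64 *ref_time)
{
	u32 seq;

	do {
		seq = READ_ONCE(tsc_pg->tsc_sequence);
		if (!seq)
			return false;	/* fall back to HV_X64_MSR_TIME_REF_COUNT */
		smp_rmb();
		*ref_time = mul_u64_u64_shr(tsc, tsc_pg->tsc_scale, 64) +
			    tsc_pg->tsc_offset;
		smp_rmb();
	} while (READ_ONCE(tsc_pg->tsc_sequence) != seq);

	return true;
}
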
+
+void kvm_hv_request_tsc_page_update(struct kvm *kvm)
+{
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+
+ mutex_lock(&hv->hv_lock);
+
+ if (hv->hv_tsc_page_status == HV_TSC_PAGE_SET &&
+ !tsc_page_update_unsafe(hv))
+ hv->hv_tsc_page_status = HV_TSC_PAGE_HOST_CHANGED;
+
+ mutex_unlock(&hv->hv_lock);
+}
+
+static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr)
+{
+ if (!hv_vcpu->enforce_cpuid)
+ return true;
+
+ switch (msr) {
+ case HV_X64_MSR_GUEST_OS_ID:
+ case HV_X64_MSR_HYPERCALL:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_HYPERCALL_AVAILABLE;
+ case HV_X64_MSR_VP_RUNTIME:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_VP_RUNTIME_AVAILABLE;
+ case HV_X64_MSR_TIME_REF_COUNT:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_TIME_REF_COUNT_AVAILABLE;
+ case HV_X64_MSR_VP_INDEX:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_VP_INDEX_AVAILABLE;
+ case HV_X64_MSR_RESET:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_RESET_AVAILABLE;
+ case HV_X64_MSR_REFERENCE_TSC:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_REFERENCE_TSC_AVAILABLE;
+ case HV_X64_MSR_SCONTROL:
+ case HV_X64_MSR_SVERSION:
+ case HV_X64_MSR_SIEFP:
+ case HV_X64_MSR_SIMP:
+ case HV_X64_MSR_EOM:
+ case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_SYNIC_AVAILABLE;
+ case HV_X64_MSR_STIMER0_CONFIG:
+ case HV_X64_MSR_STIMER1_CONFIG:
+ case HV_X64_MSR_STIMER2_CONFIG:
+ case HV_X64_MSR_STIMER3_CONFIG:
+ case HV_X64_MSR_STIMER0_COUNT:
+ case HV_X64_MSR_STIMER1_COUNT:
+ case HV_X64_MSR_STIMER2_COUNT:
+ case HV_X64_MSR_STIMER3_COUNT:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_SYNTIMER_AVAILABLE;
+ case HV_X64_MSR_EOI:
+ case HV_X64_MSR_ICR:
+ case HV_X64_MSR_TPR:
+ case HV_X64_MSR_VP_ASSIST_PAGE:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_APIC_ACCESS_AVAILABLE;
+ case HV_X64_MSR_TSC_FREQUENCY:
+ case HV_X64_MSR_APIC_FREQUENCY:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_ACCESS_FREQUENCY_MSRS;
+ case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_STATUS:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_ACCESS_REENLIGHTENMENT;
+ case HV_X64_MSR_TSC_INVARIANT_CONTROL:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_ACCESS_TSC_INVARIANT;
+ case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+ case HV_X64_MSR_CRASH_CTL:
+ return hv_vcpu->cpuid_cache.features_edx &
+ HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
+ case HV_X64_MSR_SYNDBG_OPTIONS:
+ case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
+ return hv_vcpu->cpuid_cache.features_edx &
+ HV_FEATURE_DEBUG_MSRS_AVAILABLE;
+ default:
+ break;
+ }
+
+ return false;
+}
+
+#define KVM_HV_WIN2016_GUEST_ID 0x1040a00003839
+#define KVM_HV_WIN2016_GUEST_ID_MASK (~GENMASK_ULL(23, 16)) /* mask out the service version */
+
+/*
+ * Hyper-V enabled Windows Server 2016 SMP VMs fail to boot in !XSAVES && XSAVEC
+ * configuration.
+ * Such a configuration can result, for example, from the AMD Erratum 1386 workaround.
+ *
+ * Print a notice so users aren't left wondering what's suddenly gone wrong.
+ */
+static void __kvm_hv_xsaves_xsavec_maybe_warn(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+
+ /* Check again under the hv_lock. */
+ if (hv->xsaves_xsavec_checked)
+ return;
+
+ if ((hv->hv_guest_os_id & KVM_HV_WIN2016_GUEST_ID_MASK) !=
+ KVM_HV_WIN2016_GUEST_ID)
+ return;
+
+ hv->xsaves_xsavec_checked = true;
+
+ /* UP configurations aren't affected */
+ if (atomic_read(&kvm->online_vcpus) < 2)
+ return;
+
+ if (guest_cpuid_has(vcpu, X86_FEATURE_XSAVES) ||
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVEC))
+ return;
+
+ pr_notice_ratelimited("Booting SMP Windows KVM VM with !XSAVES && XSAVEC. "
+ "If it fails to boot try disabling XSAVEC in the VM config.\n");
+}
+
+void kvm_hv_xsaves_xsavec_maybe_warn(struct kvm_vcpu *vcpu)
+{
+ struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
+
+ if (!vcpu->arch.hyperv_enabled ||
+ hv->xsaves_xsavec_checked)
+ return;
+
+ mutex_lock(&hv->hv_lock);
+ __kvm_hv_xsaves_xsavec_maybe_warn(vcpu);
+ mutex_unlock(&hv->hv_lock);
+}
+
+static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
+ bool host)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+
+ if (unlikely(!host && !hv_check_msr_access(to_hv_vcpu(vcpu), msr)))
+ return 1;
+
+ switch (msr) {
+ case HV_X64_MSR_GUEST_OS_ID:
+ hv->hv_guest_os_id = data;
+ /* setting guest os id to zero disables hypercall page */
+ if (!hv->hv_guest_os_id)
+ hv->hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE;
+ break;
+ case HV_X64_MSR_HYPERCALL: {
+ u8 instructions[9];
+ int i = 0;
+ u64 addr;
+
+ /* if guest os id is not set hypercall should remain disabled */
+ if (!hv->hv_guest_os_id)
+ break;
+ if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) {
+ hv->hv_hypercall = data;
+ break;
+ }
+
+ /*
+ * If Xen and Hyper-V hypercalls are both enabled, disambiguate
+ * the same way Xen itself does, by setting bit 31 of EAX,
+ * which is RsvdZ in the 32-bit Hyper-V hypercall ABI and just
+ * going to be clobbered on 64-bit.
+ */
+ if (kvm_xen_hypercall_enabled(kvm)) {
+ /* orl $0x80000000, %eax */
+ instructions[i++] = 0x0d;
+ instructions[i++] = 0x00;
+ instructions[i++] = 0x00;
+ instructions[i++] = 0x00;
+ instructions[i++] = 0x80;
+ }
+
+ /* vmcall/vmmcall */
+ kvm_x86_call(patch_hypercall)(vcpu, instructions + i);
+ i += 3;
+
+ /* ret */
+ ((unsigned char *)instructions)[i++] = 0xc3;
+
+ addr = data & HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK;
+ if (kvm_vcpu_write_guest(vcpu, addr, instructions, i))
+ return 1;
+ hv->hv_hypercall = data;
+ break;
+ }
+ case HV_X64_MSR_REFERENCE_TSC:
+ hv->hv_tsc_page = data;
+ if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) {
+ if (!host)
+ hv->hv_tsc_page_status = HV_TSC_PAGE_GUEST_CHANGED;
+ else
+ hv->hv_tsc_page_status = HV_TSC_PAGE_HOST_CHANGED;
+ kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+ } else {
+ hv->hv_tsc_page_status = HV_TSC_PAGE_UNSET;
+ }
+ break;
+ case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+ return kvm_hv_msr_set_crash_data(kvm,
+ msr - HV_X64_MSR_CRASH_P0,
+ data);
+ case HV_X64_MSR_CRASH_CTL:
+ if (host)
+ return kvm_hv_msr_set_crash_ctl(kvm, data);
+
+ if (data & HV_CRASH_CTL_CRASH_NOTIFY) {
+ vcpu_debug(vcpu, "hv crash (0x%llx 0x%llx 0x%llx 0x%llx 0x%llx)\n",
+ hv->hv_crash_param[0],
+ hv->hv_crash_param[1],
+ hv->hv_crash_param[2],
+ hv->hv_crash_param[3],
+ hv->hv_crash_param[4]);
+
+ /* Send notification about crash to user space */
+ kvm_make_request(KVM_REQ_HV_CRASH, vcpu);
+ }
+ break;
+ case HV_X64_MSR_RESET:
+ if (data == 1) {
+ vcpu_debug(vcpu, "hyper-v reset requested\n");
+ kvm_make_request(KVM_REQ_HV_RESET, vcpu);
+ }
+ break;
+ case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
+ hv->hv_reenlightenment_control = data;
+ break;
+ case HV_X64_MSR_TSC_EMULATION_CONTROL:
+ hv->hv_tsc_emulation_control = data;
+ break;
+ case HV_X64_MSR_TSC_EMULATION_STATUS:
+ if (data && !host)
+ return 1;
+
+ hv->hv_tsc_emulation_status = data;
+ break;
+ case HV_X64_MSR_TIME_REF_COUNT:
+ /* read-only, but still ignore it if host-initiated */
+ if (!host)
+ return 1;
+ break;
+ case HV_X64_MSR_TSC_INVARIANT_CONTROL:
+ /* Only bit 0 is supported */
+ if (data & ~HV_EXPOSE_INVARIANT_TSC)
+ return 1;
+
+ /* The feature can't be disabled from the guest */
+ if (!host && hv->hv_invtsc_control && !data)
+ return 1;
+
+ hv->hv_invtsc_control = data;
+ break;
+ case HV_X64_MSR_SYNDBG_OPTIONS:
+ case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
+ return syndbg_set_msr(vcpu, msr, data, host);
+ default:
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
+ return 1;
+ }
+ return 0;
+}
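
For reference on the HV_X64_MSR_HYPERCALL case above, the patched hypercall page boils down to at most nine bytes. An illustrative dump of the sequence for a VMX guest with Xen hypercalls also enabled (on SVM, vmmcall 0f 01 d9 replaces vmcall; without Xen the first five bytes are omitted); the array name is made up for illustration:

static const u8 example_hcall_page_bytes[] = {
	0x0d, 0x00, 0x00, 0x00, 0x80,	/* orl  $0x80000000, %eax */
	0x0f, 0x01, 0xc1,		/* vmcall */
	0xc3,				/* ret */
};
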
+
+/* Calculate cpu time spent by current task in 100ns units */
+static u64 current_task_runtime_100ns(void)
+{
+ u64 utime, stime;
+
+ task_cputime_adjusted(current, &utime, &stime);
+
+ return div_u64(utime + stime, 100);
+}
+
+static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ if (unlikely(!host && !hv_check_msr_access(hv_vcpu, msr)))
+ return 1;
+
+ switch (msr) {
+ case HV_X64_MSR_VP_INDEX: {
+ struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
+ u32 new_vp_index = (u32)data;
+
+ if (!host || new_vp_index >= KVM_MAX_VCPUS)
+ return 1;
+
+ if (new_vp_index == hv_vcpu->vp_index)
+ return 0;
+
+ /*
+ * The VP index is initialized to vcpu_index by
+ * kvm_hv_vcpu_postcreate so they initially match. Now the
+ * VP index is changing, adjust num_mismatched_vp_indexes if
+ * it now matches or no longer matches vcpu_idx.
+ */
+ if (hv_vcpu->vp_index == vcpu->vcpu_idx)
+ atomic_inc(&hv->num_mismatched_vp_indexes);
+ else if (new_vp_index == vcpu->vcpu_idx)
+ atomic_dec(&hv->num_mismatched_vp_indexes);
+
+ hv_vcpu->vp_index = new_vp_index;
+ break;
+ }
+ case HV_X64_MSR_VP_ASSIST_PAGE: {
+ u64 gfn;
+ unsigned long addr;
+
+ if (!(data & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) {
+ hv_vcpu->hv_vapic = data;
+ if (kvm_lapic_set_pv_eoi(vcpu, 0, 0))
+ return 1;
+ break;
+ }
+ gfn = data >> HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT;
+ addr = kvm_vcpu_gfn_to_hva(vcpu, gfn);
+ if (kvm_is_error_hva(addr))
+ return 1;
+
+ /*
+ * Clear apic_assist portion of struct hv_vp_assist_page
+ * only; there can be valuable data in the rest, which needs
+ * to be preserved e.g. on migration.
+ */
+ if (put_user(0, (u32 __user *)addr))
+ return 1;
+ hv_vcpu->hv_vapic = data;
+ kvm_vcpu_mark_page_dirty(vcpu, gfn);
+ if (kvm_lapic_set_pv_eoi(vcpu,
+ gfn_to_gpa(gfn) | KVM_MSR_ENABLED,
+ sizeof(struct hv_vp_assist_page)))
+ return 1;
+ break;
+ }
+ case HV_X64_MSR_EOI:
+ return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data);
+ case HV_X64_MSR_ICR:
+ return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
+ case HV_X64_MSR_TPR:
+ return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
+ case HV_X64_MSR_VP_RUNTIME:
+ if (!host)
+ return 1;
+ hv_vcpu->runtime_offset = data - current_task_runtime_100ns();
+ break;
+ case HV_X64_MSR_SCONTROL:
+ case HV_X64_MSR_SVERSION:
+ case HV_X64_MSR_SIEFP:
+ case HV_X64_MSR_SIMP:
+ case HV_X64_MSR_EOM:
+ case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
+ return synic_set_msr(to_hv_synic(vcpu), msr, data, host);
+ case HV_X64_MSR_STIMER0_CONFIG:
+ case HV_X64_MSR_STIMER1_CONFIG:
+ case HV_X64_MSR_STIMER2_CONFIG:
+ case HV_X64_MSR_STIMER3_CONFIG: {
+ int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2;
+
+ return stimer_set_config(to_hv_stimer(vcpu, timer_index),
+ data, host);
+ }
+ case HV_X64_MSR_STIMER0_COUNT:
+ case HV_X64_MSR_STIMER1_COUNT:
+ case HV_X64_MSR_STIMER2_COUNT:
+ case HV_X64_MSR_STIMER3_COUNT: {
+ int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2;
+
+ return stimer_set_count(to_hv_stimer(vcpu, timer_index),
+ data, host);
+ }
+ case HV_X64_MSR_TSC_FREQUENCY:
+ case HV_X64_MSR_APIC_FREQUENCY:
+ /* read-only, but still ignore it if host-initiated */
+ if (!host)
+ return 1;
+ break;
+ default:
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
+ bool host)
+{
+ u64 data = 0;
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+
+ if (unlikely(!host && !hv_check_msr_access(to_hv_vcpu(vcpu), msr)))
+ return 1;
+
+ switch (msr) {
+ case HV_X64_MSR_GUEST_OS_ID:
+ data = hv->hv_guest_os_id;
+ break;
+ case HV_X64_MSR_HYPERCALL:
+ data = hv->hv_hypercall;
+ break;
+ case HV_X64_MSR_TIME_REF_COUNT:
+ data = get_time_ref_counter(kvm);
+ break;
+ case HV_X64_MSR_REFERENCE_TSC:
+ data = hv->hv_tsc_page;
+ break;
+ case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+ return kvm_hv_msr_get_crash_data(kvm,
+ msr - HV_X64_MSR_CRASH_P0,
+ pdata);
+ case HV_X64_MSR_CRASH_CTL:
+ return kvm_hv_msr_get_crash_ctl(kvm, pdata);
+ case HV_X64_MSR_RESET:
+ data = 0;
+ break;
+ case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
+ data = hv->hv_reenlightenment_control;
+ break;
+ case HV_X64_MSR_TSC_EMULATION_CONTROL:
+ data = hv->hv_tsc_emulation_control;
+ break;
+ case HV_X64_MSR_TSC_EMULATION_STATUS:
+ data = hv->hv_tsc_emulation_status;
+ break;
+ case HV_X64_MSR_TSC_INVARIANT_CONTROL:
+ data = hv->hv_invtsc_control;
+ break;
+ case HV_X64_MSR_SYNDBG_OPTIONS:
+ case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
+ return syndbg_get_msr(vcpu, msr, pdata, host);
+ default:
+ kvm_pr_unimpl_rdmsr(vcpu, msr);
+ return 1;
+ }
+
+ *pdata = data;
+ return 0;
+}
+
+static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
+ bool host)
+{
+ u64 data = 0;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ if (unlikely(!host && !hv_check_msr_access(hv_vcpu, msr)))
+ return 1;
+
+ switch (msr) {
+ case HV_X64_MSR_VP_INDEX:
+ data = hv_vcpu->vp_index;
+ break;
+ case HV_X64_MSR_EOI:
+ return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata);
+ case HV_X64_MSR_ICR:
+ return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata);
+ case HV_X64_MSR_TPR:
+ return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
+ case HV_X64_MSR_VP_ASSIST_PAGE:
+ data = hv_vcpu->hv_vapic;
+ break;
+ case HV_X64_MSR_VP_RUNTIME:
+ data = current_task_runtime_100ns() + hv_vcpu->runtime_offset;
+ break;
+ case HV_X64_MSR_SCONTROL:
+ case HV_X64_MSR_SVERSION:
+ case HV_X64_MSR_SIEFP:
+ case HV_X64_MSR_SIMP:
+ case HV_X64_MSR_EOM:
+ case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
+ return synic_get_msr(to_hv_synic(vcpu), msr, pdata, host);
+ case HV_X64_MSR_STIMER0_CONFIG:
+ case HV_X64_MSR_STIMER1_CONFIG:
+ case HV_X64_MSR_STIMER2_CONFIG:
+ case HV_X64_MSR_STIMER3_CONFIG: {
+ int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2;
+
+ return stimer_get_config(to_hv_stimer(vcpu, timer_index),
+ pdata);
+ }
+ case HV_X64_MSR_STIMER0_COUNT:
+ case HV_X64_MSR_STIMER1_COUNT:
+ case HV_X64_MSR_STIMER2_COUNT:
+ case HV_X64_MSR_STIMER3_COUNT: {
+ int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2;
+
+ return stimer_get_count(to_hv_stimer(vcpu, timer_index),
+ pdata);
+ }
+ case HV_X64_MSR_TSC_FREQUENCY:
+ data = (u64)vcpu->arch.virtual_tsc_khz * 1000;
+ break;
+ case HV_X64_MSR_APIC_FREQUENCY:
+ data = div64_u64(1000000000ULL,
+ vcpu->kvm->arch.apic_bus_cycle_ns);
+ break;
+ default:
+ kvm_pr_unimpl_rdmsr(vcpu, msr);
+ return 1;
+ }
+ *pdata = data;
+ return 0;
+}
+
+int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
+{
+ struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
+
+ if (!host && !vcpu->arch.hyperv_enabled)
+ return 1;
+
+ if (kvm_hv_vcpu_init(vcpu))
+ return 1;
+
+ if (kvm_hv_msr_partition_wide(msr)) {
+ int r;
+
+ mutex_lock(&hv->hv_lock);
+ r = kvm_hv_set_msr_pw(vcpu, msr, data, host);
+ mutex_unlock(&hv->hv_lock);
+ return r;
+ } else
+ return kvm_hv_set_msr(vcpu, msr, data, host);
+}
+
+int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
+{
+ struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
+
+ if (!host && !vcpu->arch.hyperv_enabled)
+ return 1;
+
+ if (kvm_hv_vcpu_init(vcpu))
+ return 1;
+
+ if (kvm_hv_msr_partition_wide(msr)) {
+ int r;
+
+ mutex_lock(&hv->hv_lock);
+ r = kvm_hv_get_msr_pw(vcpu, msr, pdata, host);
+ mutex_unlock(&hv->hv_lock);
+ return r;
+ } else
+ return kvm_hv_get_msr(vcpu, msr, pdata, host);
+}
+
+static void sparse_set_to_vcpu_mask(struct kvm *kvm, u64 *sparse_banks,
+ u64 valid_bank_mask, unsigned long *vcpu_mask)
+{
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+ bool has_mismatch = atomic_read(&hv->num_mismatched_vp_indexes);
+ u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
+ struct kvm_vcpu *vcpu;
+ int bank, sbank = 0;
+ unsigned long i;
+ u64 *bitmap;
+
+ BUILD_BUG_ON(sizeof(vp_bitmap) >
+ sizeof(*vcpu_mask) * BITS_TO_LONGS(KVM_MAX_VCPUS));
+
+ /*
+ * If vp_index == vcpu_idx for all vCPUs, fill vcpu_mask directly, else
+ * fill a temporary buffer and manually test each vCPU's VP index.
+ */
+ if (likely(!has_mismatch))
+ bitmap = (u64 *)vcpu_mask;
+ else
+ bitmap = vp_bitmap;
+
+ /*
+ * Each set of 64 VPs is packed into sparse_banks, with valid_bank_mask
+ * having a '1' for each bank that exists in sparse_banks. Sets must
+ * be in ascending order, i.e. bank0..bankN.
+ */
+ memset(bitmap, 0, sizeof(vp_bitmap));
+ for_each_set_bit(bank, (unsigned long *)&valid_bank_mask,
+ KVM_HV_MAX_SPARSE_VCPU_SET_BITS)
+ bitmap[bank] = sparse_banks[sbank++];
+
+ if (likely(!has_mismatch))
+ return;
+
+ bitmap_zero(vcpu_mask, KVM_MAX_VCPUS);
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (test_bit(kvm_hv_get_vpindex(vcpu), (unsigned long *)vp_bitmap))
+ __set_bit(i, vcpu_mask);
+ }
+}
+
+static bool hv_is_vp_in_sparse_set(u32 vp_id, u64 valid_bank_mask, u64 sparse_banks[])
+{
+ int valid_bit_nr = vp_id / HV_VCPUS_PER_SPARSE_BANK;
+ unsigned long sbank;
+
+ if (!test_bit(valid_bit_nr, (unsigned long *)&valid_bank_mask))
+ return false;
+
+ /*
+ * The index into the sparse bank is the number of preceding bits in
+ * the valid mask. Optimize for VMs with <64 vCPUs by skipping the
+ * fancy math if there can't possibly be preceding bits.
+ */
+ if (valid_bit_nr)
+ sbank = hweight64(valid_bank_mask & GENMASK_ULL(valid_bit_nr - 1, 0));
+ else
+ sbank = 0;
+
+ return test_bit(vp_id % HV_VCPUS_PER_SPARSE_BANK,
+ (unsigned long *)&sparse_banks[sbank]);
+}
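
A worked example of the sparse-set layout used by sparse_set_to_vcpu_mask() and hv_is_vp_in_sparse_set() above (purely illustrative values): a set containing VP 1 and VP 130 occupies banks 0 and 2 (130 / 64 == 2), so the guest passes a two-bit valid_bank_mask and just two 64-bit bank words:

static const u64 example_valid_bank_mask = BIT_ULL(0) | BIT_ULL(2);
static const u64 example_sparse_banks[] = {
	BIT_ULL(1),		/* bank 0 covers VPs 0..63:   VP 1   */
	BIT_ULL(130 % 64),	/* bank 2 covers VPs 128..191: VP 130 */
};

hv_is_vp_in_sparse_set(130, ...) then computes valid_bit_nr = 2, counts one preceding bit in the mask (sbank = 1), and tests bit 2 of the second bank word.
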
+
+struct kvm_hv_hcall {
+ /* Hypercall input data */
+ u64 param;
+ u64 ingpa;
+ u64 outgpa;
+ u16 code;
+ u16 var_cnt;
+ u16 rep_cnt;
+ u16 rep_idx;
+ bool fast;
+ bool rep;
+ sse128_t xmm[HV_HYPERCALL_MAX_XMM_REGISTERS];
+
+ /*
+ * Current read offset when KVM reads hypercall input data gradually,
+ * either offset in bytes from 'ingpa' for regular hypercalls or the
+ * number of already consumed 'XMM halves' for 'fast' hypercalls.
+ */
+ union {
+ gpa_t data_offset;
+ int consumed_xmm_halves;
+ };
+};
+
+
+static int kvm_hv_get_hc_data(struct kvm *kvm, struct kvm_hv_hcall *hc,
+ u16 orig_cnt, u16 cnt_cap, u64 *data)
+{
+ /*
+ * Preserve the original count when ignoring entries via a "cap"; KVM
+ * still needs to validate the guest input (though the non-XMM path
+ * punts on the checks).
+ */
+ u16 cnt = min(orig_cnt, cnt_cap);
+ int i, j;
+
+ if (hc->fast) {
+ /*
+ * Each XMM holds two sparse banks, but do not count halves that
+ * have already been consumed for hypercall parameters.
+ */
+ if (orig_cnt > 2 * HV_HYPERCALL_MAX_XMM_REGISTERS - hc->consumed_xmm_halves)
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+ for (i = 0; i < cnt; i++) {
+ j = i + hc->consumed_xmm_halves;
+ if (j % 2)
+ data[i] = sse128_hi(hc->xmm[j / 2]);
+ else
+ data[i] = sse128_lo(hc->xmm[j / 2]);
+ }
+ return 0;
+ }
+
+ return kvm_read_guest(kvm, hc->ingpa + hc->data_offset, data,
+ cnt * sizeof(*data));
+}
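
To make the XMM-half indexing above concrete: each of the HV_HYPERCALL_MAX_XMM_REGISTERS hypercall XMM registers carries two 64-bit values, and consumed_xmm_halves records how many of those were already eaten by the fixed part of the input (1 for hv_tlb_flush, 2 for hv_tlb_flush_ex, as set in kvm_hv_flush_tlb() below). A hypothetical standalone helper mirroring the selection logic:

static u64 fast_hcall_data_item(const sse128_t *xmm, int consumed_xmm_halves, int i)
{
	int j = i + consumed_xmm_halves;

	return (j % 2) ? sse128_hi(xmm[j / 2]) : sse128_lo(xmm[j / 2]);
}
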
+
+static u64 kvm_get_sparse_vp_set(struct kvm *kvm, struct kvm_hv_hcall *hc,
+ u64 *sparse_banks)
+{
+ if (hc->var_cnt > HV_MAX_SPARSE_VCPU_BANKS)
+ return -EINVAL;
+
+ /* Cap var_cnt to ignore banks that cannot contain a legal VP index. */
+ return kvm_hv_get_hc_data(kvm, hc, hc->var_cnt, KVM_HV_MAX_SPARSE_VCPU_SET_BITS,
+ sparse_banks);
+}
+
+static int kvm_hv_get_tlb_flush_entries(struct kvm *kvm, struct kvm_hv_hcall *hc, u64 entries[])
+{
+ return kvm_hv_get_hc_data(kvm, hc, hc->rep_cnt, hc->rep_cnt, entries);
+}
+
+static void hv_tlb_flush_enqueue(struct kvm_vcpu *vcpu,
+ struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo,
+ u64 *entries, int count)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ u64 flush_all_entry = KVM_HV_TLB_FLUSHALL_ENTRY;
+
+ if (!hv_vcpu)
+ return;
+
+ spin_lock(&tlb_flush_fifo->write_lock);
+
+ /*
+ * All entries should fit in the fifo, leaving one slot free for the
+ * 'flush all' entry in case another request comes in. If there is not
+ * enough space, just put the 'flush all' entry there.
+ */
+ if (count && entries && count < kfifo_avail(&tlb_flush_fifo->entries)) {
+ WARN_ON(kfifo_in(&tlb_flush_fifo->entries, entries, count) != count);
+ goto out_unlock;
+ }
+
+ /*
+ * Note: a full fifo always contains the 'flush all' entry, so there is
+ * no need to check the return value.
+ */
+ kfifo_in(&tlb_flush_fifo->entries, &flush_all_entry, 1);
+
+out_unlock:
+ spin_unlock(&tlb_flush_fifo->write_lock);
+}
+
+int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ u64 entries[KVM_HV_TLB_FLUSH_FIFO_SIZE];
+ int i, j, count;
+ gva_t gva;
+
+ if (!tdp_enabled || !hv_vcpu)
+ return -EINVAL;
+
+ tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(vcpu, is_guest_mode(vcpu));
+
+ count = kfifo_out(&tlb_flush_fifo->entries, entries, KVM_HV_TLB_FLUSH_FIFO_SIZE);
+
+ for (i = 0; i < count; i++) {
+ if (entries[i] == KVM_HV_TLB_FLUSHALL_ENTRY)
+ goto out_flush_all;
+
+ if (is_noncanonical_invlpg_address(entries[i], vcpu))
+ continue;
+
+ /*
+ * Lower 12 bits of 'address' encode the number of additional
+ * pages to flush.
+ */
+ gva = entries[i] & PAGE_MASK;
+ for (j = 0; j < (entries[i] & ~PAGE_MASK) + 1; j++)
+ kvm_x86_call(flush_tlb_gva)(vcpu, gva + j * PAGE_SIZE);
+
+ ++vcpu->stat.tlb_flush;
+ }
+ return 0;
+
+out_flush_all:
+ kfifo_reset_out(&tlb_flush_fifo->entries);
+
+ /* Fall back to full flush. */
+ return -ENOSPC;
+}
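
The fifo entries decoded above pack a page-aligned GVA together with an "additional pages" count in the low 12 bits, so count - 1 extra pages follow the first one. An illustrative encoder for one entry (hypothetical helper, mirroring the decode loop):

static u64 example_tlb_flush_entry(u64 gva, unsigned int nr_pages)
{
	/* Assumes gva is page-aligned and 1 <= nr_pages <= 4096. */
	return (gva & PAGE_MASK) | (nr_pages - 1);
}
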
+
+static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ unsigned long *vcpu_mask = hv_vcpu->vcpu_mask;
+ u64 *sparse_banks = hv_vcpu->sparse_banks;
+ struct kvm *kvm = vcpu->kvm;
+ struct hv_tlb_flush_ex flush_ex;
+ struct hv_tlb_flush flush;
+ struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo;
+ /*
+ * Normally, there can be no more than 'KVM_HV_TLB_FLUSH_FIFO_SIZE'
+ * entries on the TLB flush fifo. The last slot, however, always needs
+ * to be left free for the 'flush all' entry, which gets placed there
+ * when there is not enough space for all the requested entries.
+ */
+ u64 __tlb_flush_entries[KVM_HV_TLB_FLUSH_FIFO_SIZE - 1];
+ u64 *tlb_flush_entries;
+ u64 valid_bank_mask;
+ struct kvm_vcpu *v;
+ unsigned long i;
+ bool all_cpus;
+
+ /*
+ * The Hyper-V TLFS doesn't allow more than HV_MAX_SPARSE_VCPU_BANKS
+ * sparse banks. Fail the build if KVM's max allowed number of
+ * vCPUs (>4096) exceeds this limit.
+ */
+ BUILD_BUG_ON(KVM_HV_MAX_SPARSE_VCPU_SET_BITS > HV_MAX_SPARSE_VCPU_BANKS);
+
+ /*
+ * 'Slow' hypercall's first parameter is the address in guest's memory
+ * where hypercall parameters are placed. This is either a GPA or a
+ * nested GPA when KVM is handling the call from L2 ('direct' TLB
+ * flush). Translate the address here so the memory can be uniformly
+ * read with kvm_read_guest().
+ */
+ if (!hc->fast && is_guest_mode(vcpu)) {
+ hc->ingpa = translate_nested_gpa(vcpu, hc->ingpa, 0, NULL);
+ if (unlikely(hc->ingpa == INVALID_GPA))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ }
+
+ if (hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST ||
+ hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE) {
+ if (hc->fast) {
+ flush.address_space = hc->ingpa;
+ flush.flags = hc->outgpa;
+ flush.processor_mask = sse128_lo(hc->xmm[0]);
+ hc->consumed_xmm_halves = 1;
+ } else {
+ if (unlikely(kvm_read_guest(kvm, hc->ingpa,
+ &flush, sizeof(flush))))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ hc->data_offset = sizeof(flush);
+ }
+
+ trace_kvm_hv_flush_tlb(flush.processor_mask,
+ flush.address_space, flush.flags,
+ is_guest_mode(vcpu));
+
+ valid_bank_mask = BIT_ULL(0);
+ sparse_banks[0] = flush.processor_mask;
+
+ /*
+ * Work around possible WS2012 bug: it sends hypercalls
+ * with processor_mask = 0x0 and HV_FLUSH_ALL_PROCESSORS clear,
+ * while also expecting us to flush something and crashing if
+ * we don't. Let's treat processor_mask == 0 the same as
+ * HV_FLUSH_ALL_PROCESSORS.
+ */
+ all_cpus = (flush.flags & HV_FLUSH_ALL_PROCESSORS) ||
+ flush.processor_mask == 0;
+ } else {
+ if (hc->fast) {
+ flush_ex.address_space = hc->ingpa;
+ flush_ex.flags = hc->outgpa;
+ memcpy(&flush_ex.hv_vp_set,
+ &hc->xmm[0], sizeof(hc->xmm[0]));
+ hc->consumed_xmm_halves = 2;
+ } else {
+ if (unlikely(kvm_read_guest(kvm, hc->ingpa, &flush_ex,
+ sizeof(flush_ex))))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ hc->data_offset = sizeof(flush_ex);
+ }
+
+ trace_kvm_hv_flush_tlb_ex(flush_ex.hv_vp_set.valid_bank_mask,
+ flush_ex.hv_vp_set.format,
+ flush_ex.address_space,
+ flush_ex.flags, is_guest_mode(vcpu));
+
+ valid_bank_mask = flush_ex.hv_vp_set.valid_bank_mask;
+ all_cpus = flush_ex.hv_vp_set.format !=
+ HV_GENERIC_SET_SPARSE_4K;
+
+ if (hc->var_cnt != hweight64(valid_bank_mask))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+ if (!all_cpus) {
+ if (!hc->var_cnt)
+ goto ret_success;
+
+ if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ }
+
+ /*
+ * Hyper-V TLFS doesn't explicitly forbid non-empty sparse vCPU
+ * banks (and, thus, non-zero 'var_cnt') for the 'all vCPUs'
+ * case (HV_GENERIC_SET_ALL). Always adjust data_offset and
+ * consumed_xmm_halves to make sure TLB flush entries are read
+ * from the correct offset.
+ */
+ if (hc->fast)
+ hc->consumed_xmm_halves += hc->var_cnt;
+ else
+ hc->data_offset += hc->var_cnt * sizeof(sparse_banks[0]);
+ }
+
+ if (hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE ||
+ hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX ||
+ hc->rep_cnt > ARRAY_SIZE(__tlb_flush_entries)) {
+ tlb_flush_entries = NULL;
+ } else {
+ if (kvm_hv_get_tlb_flush_entries(kvm, hc, __tlb_flush_entries))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ tlb_flush_entries = __tlb_flush_entries;
+ }
+
+ /*
+ * vcpu->arch.cr3 may not be up-to-date for running vCPUs, so we can't
+ * analyze it here; flush the TLB regardless of the specified address space.
+ */
+ if (all_cpus && !is_guest_mode(vcpu)) {
+ kvm_for_each_vcpu(i, v, kvm) {
+ tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, false);
+ hv_tlb_flush_enqueue(v, tlb_flush_fifo,
+ tlb_flush_entries, hc->rep_cnt);
+ }
+
+ kvm_make_all_cpus_request(kvm, KVM_REQ_HV_TLB_FLUSH);
+ } else if (!is_guest_mode(vcpu)) {
+ sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask);
+
+ for_each_set_bit(i, vcpu_mask, KVM_MAX_VCPUS) {
+ v = kvm_get_vcpu(kvm, i);
+ if (!v)
+ continue;
+ tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, false);
+ hv_tlb_flush_enqueue(v, tlb_flush_fifo,
+ tlb_flush_entries, hc->rep_cnt);
+ }
+
+ kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask);
+ } else {
+ struct kvm_vcpu_hv *hv_v;
+
+ bitmap_zero(vcpu_mask, KVM_MAX_VCPUS);
+
+ kvm_for_each_vcpu(i, v, kvm) {
+ hv_v = to_hv_vcpu(v);
+
+ /*
+ * The following check races with nested vCPUs entering/exiting
+ * and/or migrating between L1's vCPUs, however the only case when
+ * KVM *must* flush the TLB is when the target L2 vCPU keeps
+ * running on the same L1 vCPU from the moment of the request until
+ * kvm_hv_flush_tlb() returns. TLB is fully flushed in all other
+ * cases, e.g. when the target L2 vCPU migrates to a different L1
+ * vCPU or when the corresponding L1 vCPU temporarily switches to a
+ * different L2 vCPU while the request is being processed.
+ */
+ if (!hv_v || hv_v->nested.vm_id != hv_vcpu->nested.vm_id)
+ continue;
+
+ if (!all_cpus &&
+ !hv_is_vp_in_sparse_set(hv_v->nested.vp_id, valid_bank_mask,
+ sparse_banks))
+ continue;
+
+ __set_bit(i, vcpu_mask);
+ tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, true);
+ hv_tlb_flush_enqueue(v, tlb_flush_fifo,
+ tlb_flush_entries, hc->rep_cnt);
+ }
+
+ kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask);
+ }
+
+ret_success:
+ /* We always do full TLB flush, set 'Reps completed' = 'Rep Count' */
+ return (u64)HV_STATUS_SUCCESS |
+ ((u64)hc->rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET);
+}
+
+static void kvm_hv_send_ipi_to_many(struct kvm *kvm, u32 vector,
+ u64 *sparse_banks, u64 valid_bank_mask)
+{
+ struct kvm_lapic_irq irq = {
+ .delivery_mode = APIC_DM_FIXED,
+ .vector = vector
+ };
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (sparse_banks &&
+ !hv_is_vp_in_sparse_set(kvm_hv_get_vpindex(vcpu),
+ valid_bank_mask, sparse_banks))
+ continue;
+
+ /* We fail only when APIC is disabled */
+ kvm_apic_set_irq(vcpu, &irq, NULL);
+ }
+}
+
+static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ u64 *sparse_banks = hv_vcpu->sparse_banks;
+ struct kvm *kvm = vcpu->kvm;
+ struct hv_send_ipi_ex send_ipi_ex;
+ struct hv_send_ipi send_ipi;
+ u64 valid_bank_mask;
+ u32 vector;
+ bool all_cpus;
+
+ if (!lapic_in_kernel(vcpu))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+ if (hc->code == HVCALL_SEND_IPI) {
+ if (!hc->fast) {
+ if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi,
+ sizeof(send_ipi))))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ sparse_banks[0] = send_ipi.cpu_mask;
+ vector = send_ipi.vector;
+ } else {
+ /* 'reserved' part of hv_send_ipi should be 0 */
+ if (unlikely(hc->ingpa >> 32 != 0))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ sparse_banks[0] = hc->outgpa;
+ vector = (u32)hc->ingpa;
+ }
+ all_cpus = false;
+ valid_bank_mask = BIT_ULL(0);
+
+ trace_kvm_hv_send_ipi(vector, sparse_banks[0]);
+ } else {
+ if (!hc->fast) {
+ if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi_ex,
+ sizeof(send_ipi_ex))))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ } else {
+ send_ipi_ex.vector = (u32)hc->ingpa;
+ send_ipi_ex.vp_set.format = hc->outgpa;
+ send_ipi_ex.vp_set.valid_bank_mask = sse128_lo(hc->xmm[0]);
+ }
+
+ trace_kvm_hv_send_ipi_ex(send_ipi_ex.vector,
+ send_ipi_ex.vp_set.format,
+ send_ipi_ex.vp_set.valid_bank_mask);
+
+ vector = send_ipi_ex.vector;
+ valid_bank_mask = send_ipi_ex.vp_set.valid_bank_mask;
+ all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL;
+
+ if (hc->var_cnt != hweight64(valid_bank_mask))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+ if (all_cpus)
+ goto check_and_send_ipi;
+
+ if (!hc->var_cnt)
+ goto ret_success;
+
+ if (!hc->fast)
+ hc->data_offset = offsetof(struct hv_send_ipi_ex,
+ vp_set.bank_contents);
+ else
+ hc->consumed_xmm_halves = 1;
+
+ if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ }
+
+check_and_send_ipi:
+ if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+ if (all_cpus)
+ kvm_hv_send_ipi_to_many(kvm, vector, NULL, 0);
+ else
+ kvm_hv_send_ipi_to_many(kvm, vector, sparse_banks, valid_bank_mask);
+
+ret_success:
+ return HV_STATUS_SUCCESS;
+}
+
+void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu, bool hyperv_enabled)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ struct kvm_cpuid_entry2 *entry;
+
+ vcpu->arch.hyperv_enabled = hyperv_enabled;
+
+ if (!hv_vcpu) {
+ /*
+ * KVM should have already allocated kvm_vcpu_hv if Hyper-V is
+ * enabled in CPUID.
+ */
+ WARN_ON_ONCE(vcpu->arch.hyperv_enabled);
+ return;
+ }
+
+ memset(&hv_vcpu->cpuid_cache, 0, sizeof(hv_vcpu->cpuid_cache));
+
+ if (!vcpu->arch.hyperv_enabled)
+ return;
+
+ entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES);
+ if (entry) {
+ hv_vcpu->cpuid_cache.features_eax = entry->eax;
+ hv_vcpu->cpuid_cache.features_ebx = entry->ebx;
+ hv_vcpu->cpuid_cache.features_edx = entry->edx;
+ }
+
+ entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_ENLIGHTMENT_INFO);
+ if (entry) {
+ hv_vcpu->cpuid_cache.enlightenments_eax = entry->eax;
+ hv_vcpu->cpuid_cache.enlightenments_ebx = entry->ebx;
+ }
+
+ entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES);
+ if (entry)
+ hv_vcpu->cpuid_cache.syndbg_cap_eax = entry->eax;
+
+ entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_NESTED_FEATURES);
+ if (entry) {
+ hv_vcpu->cpuid_cache.nested_eax = entry->eax;
+ hv_vcpu->cpuid_cache.nested_ebx = entry->ebx;
+ }
+}
+
+int kvm_hv_set_enforce_cpuid(struct kvm_vcpu *vcpu, bool enforce)
+{
+ struct kvm_vcpu_hv *hv_vcpu;
+ int ret = 0;
+
+ if (!to_hv_vcpu(vcpu)) {
+ if (enforce) {
+ ret = kvm_hv_vcpu_init(vcpu);
+ if (ret)
+ return ret;
+ } else {
+ return 0;
+ }
+ }
+
+ hv_vcpu = to_hv_vcpu(vcpu);
+ hv_vcpu->enforce_cpuid = enforce;
+
+ return ret;
+}
+
+static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
+{
+ bool longmode;
+
+ longmode = is_64_bit_hypercall(vcpu);
+ if (longmode)
+ kvm_rax_write(vcpu, result);
+ else {
+ kvm_rdx_write(vcpu, result >> 32);
+ kvm_rax_write(vcpu, result & 0xffffffff);
+ }
+}
+
+static int kvm_hv_hypercall_complete(struct kvm_vcpu *vcpu, u64 result)
+{
+ u32 tlb_lock_count = 0;
+ int ret;
+
+ if (hv_result_success(result) && is_guest_mode(vcpu) &&
+ kvm_hv_is_tlb_flush_hcall(vcpu) &&
+ kvm_read_guest(vcpu->kvm, to_hv_vcpu(vcpu)->nested.pa_page_gpa,
+ &tlb_lock_count, sizeof(tlb_lock_count)))
+ result = HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+ trace_kvm_hv_hypercall_done(result);
+ kvm_hv_hypercall_set_result(vcpu, result);
+ ++vcpu->stat.hypercalls;
+
+ ret = kvm_skip_emulated_instruction(vcpu);
+
+ if (tlb_lock_count)
+ kvm_x86_ops.nested_ops->hv_inject_synthetic_vmexit_post_tlb_flush(vcpu);
+
+ return ret;
+}
+
+static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
+{
+ return kvm_hv_hypercall_complete(vcpu, vcpu->run->hyperv.u.hcall.result);
+}
+
+static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
+{
+ struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
+ struct eventfd_ctx *eventfd;
+
+ if (unlikely(!hc->fast)) {
+ int ret;
+ gpa_t gpa = hc->ingpa;
+
+ if ((gpa & (__alignof__(hc->ingpa) - 1)) ||
+ offset_in_page(gpa) + sizeof(hc->ingpa) > PAGE_SIZE)
+ return HV_STATUS_INVALID_ALIGNMENT;
+
+ ret = kvm_vcpu_read_guest(vcpu, gpa,
+ &hc->ingpa, sizeof(hc->ingpa));
+ if (ret < 0)
+ return HV_STATUS_INVALID_ALIGNMENT;
+ }
+
+ /*
+ * Per spec, bits 32-47 contain the extra "flag number". However, we
+ * have no use for it, and in all known use cases it is zero, so just
+ * report lookup failure if it isn't.
+ */
+ if (hc->ingpa & 0xffff00000000ULL)
+ return HV_STATUS_INVALID_PORT_ID;
+ /* remaining bits are reserved-zero */
+ if (hc->ingpa & ~KVM_HYPERV_CONN_ID_MASK)
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+ /* the eventfd is protected by vcpu->kvm->srcu, but conn_to_evt isn't */
+ rcu_read_lock();
+ eventfd = idr_find(&hv->conn_to_evt, hc->ingpa);
+ rcu_read_unlock();
+ if (!eventfd)
+ return HV_STATUS_INVALID_PORT_ID;
+
+ eventfd_signal(eventfd);
+ return HV_STATUS_SUCCESS;
+}
+
+static bool is_xmm_fast_hypercall(struct kvm_hv_hcall *hc)
+{
+ switch (hc->code) {
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST:
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE:
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX:
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX:
+ case HVCALL_SEND_IPI_EX:
+ return true;
+ }
+
+ return false;
+}
+
+static void kvm_hv_hypercall_read_xmm(struct kvm_hv_hcall *hc)
+{
+ int reg;
+
+ kvm_fpu_get();
+ for (reg = 0; reg < HV_HYPERCALL_MAX_XMM_REGISTERS; reg++)
+ _kvm_read_sse_reg(reg, &hc->xmm[reg]);
+ kvm_fpu_put();
+}
+
+static bool hv_check_hypercall_access(struct kvm_vcpu_hv *hv_vcpu, u16 code)
+{
+ if (!hv_vcpu->enforce_cpuid)
+ return true;
+
+ switch (code) {
+ case HVCALL_NOTIFY_LONG_SPIN_WAIT:
+ return hv_vcpu->cpuid_cache.enlightenments_ebx &&
+ hv_vcpu->cpuid_cache.enlightenments_ebx != U32_MAX;
+ case HVCALL_POST_MESSAGE:
+ return hv_vcpu->cpuid_cache.features_ebx & HV_POST_MESSAGES;
+ case HVCALL_SIGNAL_EVENT:
+ return hv_vcpu->cpuid_cache.features_ebx & HV_SIGNAL_EVENTS;
+ case HVCALL_POST_DEBUG_DATA:
+ case HVCALL_RETRIEVE_DEBUG_DATA:
+ case HVCALL_RESET_DEBUG_SESSION:
+ /*
+ * Return 'true' when SynDBG is disabled so the resulting code
+ * will be HV_STATUS_INVALID_HYPERCALL_CODE.
+ */
+ return !kvm_hv_is_syndbg_enabled(hv_vcpu->vcpu) ||
+ hv_vcpu->cpuid_cache.features_ebx & HV_DEBUGGING;
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX:
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX:
+ if (!(hv_vcpu->cpuid_cache.enlightenments_eax &
+ HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
+ return false;
+ fallthrough;
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST:
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE:
+ return hv_vcpu->cpuid_cache.enlightenments_eax &
+ HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED;
+ case HVCALL_SEND_IPI_EX:
+ if (!(hv_vcpu->cpuid_cache.enlightenments_eax &
+ HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
+ return false;
+ fallthrough;
+ case HVCALL_SEND_IPI:
+ return hv_vcpu->cpuid_cache.enlightenments_eax &
+ HV_X64_CLUSTER_IPI_RECOMMENDED;
+ case HV_EXT_CALL_QUERY_CAPABILITIES ... HV_EXT_CALL_MAX:
+ return hv_vcpu->cpuid_cache.features_ebx &
+ HV_ENABLE_EXTENDED_HYPERCALLS;
+ default:
+ break;
+ }
+
+ return true;
+}
+
+int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ struct kvm_hv_hcall hc;
+ u64 ret = HV_STATUS_SUCCESS;
+
+ /*
+ * Per the Hyper-V spec, a hypercall generates #UD when executed at
+ * non-zero CPL or in real mode.
+ */
+ if (kvm_x86_call(get_cpl)(vcpu) != 0 || !is_protmode(vcpu)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+#ifdef CONFIG_X86_64
+ if (is_64_bit_hypercall(vcpu)) {
+ hc.param = kvm_rcx_read(vcpu);
+ hc.ingpa = kvm_rdx_read(vcpu);
+ hc.outgpa = kvm_r8_read(vcpu);
+ } else
+#endif
+ {
+ hc.param = ((u64)kvm_rdx_read(vcpu) << 32) |
+ (kvm_rax_read(vcpu) & 0xffffffff);
+ hc.ingpa = ((u64)kvm_rbx_read(vcpu) << 32) |
+ (kvm_rcx_read(vcpu) & 0xffffffff);
+ hc.outgpa = ((u64)kvm_rdi_read(vcpu) << 32) |
+ (kvm_rsi_read(vcpu) & 0xffffffff);
+ }
+
+ hc.code = hc.param & 0xffff;
+ hc.var_cnt = (hc.param & HV_HYPERCALL_VARHEAD_MASK) >> HV_HYPERCALL_VARHEAD_OFFSET;
+ hc.fast = !!(hc.param & HV_HYPERCALL_FAST_BIT);
+ hc.rep_cnt = (hc.param >> HV_HYPERCALL_REP_COMP_OFFSET) & 0xfff;
+ hc.rep_idx = (hc.param >> HV_HYPERCALL_REP_START_OFFSET) & 0xfff;
+ hc.rep = !!(hc.rep_cnt || hc.rep_idx);
+
+ trace_kvm_hv_hypercall(hc.code, hc.fast, hc.var_cnt, hc.rep_cnt,
+ hc.rep_idx, hc.ingpa, hc.outgpa);
+
+ if (unlikely(!hv_check_hypercall_access(hv_vcpu, hc.code))) {
+ ret = HV_STATUS_ACCESS_DENIED;
+ goto hypercall_complete;
+ }
+
+ if (unlikely(hc.param & HV_HYPERCALL_RSVD_MASK)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ goto hypercall_complete;
+ }
+
+ if (hc.fast && is_xmm_fast_hypercall(&hc)) {
+ if (unlikely(hv_vcpu->enforce_cpuid &&
+ !(hv_vcpu->cpuid_cache.features_edx &
+ HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE))) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ kvm_hv_hypercall_read_xmm(&hc);
+ }
+
+ switch (hc.code) {
+ case HVCALL_NOTIFY_LONG_SPIN_WAIT:
+ if (unlikely(hc.rep || hc.var_cnt)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ kvm_vcpu_on_spin(vcpu, true);
+ break;
+ case HVCALL_SIGNAL_EVENT:
+ if (unlikely(hc.rep || hc.var_cnt)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ ret = kvm_hvcall_signal_event(vcpu, &hc);
+ if (ret != HV_STATUS_INVALID_PORT_ID)
+ break;
+ fallthrough; /* maybe userspace knows this conn_id */
+ case HVCALL_POST_MESSAGE:
+ /* don't bother userspace if it has no way to handle it */
+ if (unlikely(hc.rep || hc.var_cnt || !to_hv_synic(vcpu)->active)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ goto hypercall_userspace_exit;
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST:
+ if (unlikely(hc.var_cnt)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ fallthrough;
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX:
+ if (unlikely(!hc.rep_cnt || hc.rep_idx)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ ret = kvm_hv_flush_tlb(vcpu, &hc);
+ break;
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE:
+ if (unlikely(hc.var_cnt)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ fallthrough;
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX:
+ if (unlikely(hc.rep)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ ret = kvm_hv_flush_tlb(vcpu, &hc);
+ break;
+ case HVCALL_SEND_IPI:
+ if (unlikely(hc.var_cnt)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ fallthrough;
+ case HVCALL_SEND_IPI_EX:
+ if (unlikely(hc.rep)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ ret = kvm_hv_send_ipi(vcpu, &hc);
+ break;
+ case HVCALL_POST_DEBUG_DATA:
+ case HVCALL_RETRIEVE_DEBUG_DATA:
+ if (unlikely(hc.fast)) {
+ ret = HV_STATUS_INVALID_PARAMETER;
+ break;
+ }
+ fallthrough;
+ case HVCALL_RESET_DEBUG_SESSION: {
+ struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu);
+
+ if (!kvm_hv_is_syndbg_enabled(vcpu)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_CODE;
+ break;
+ }
+
+ if (!(syndbg->options & HV_X64_SYNDBG_OPTION_USE_HCALLS)) {
+ ret = HV_STATUS_OPERATION_DENIED;
+ break;
+ }
+ goto hypercall_userspace_exit;
+ }
+ case HV_EXT_CALL_QUERY_CAPABILITIES ... HV_EXT_CALL_MAX:
+ if (unlikely(hc.fast)) {
+ ret = HV_STATUS_INVALID_PARAMETER;
+ break;
+ }
+ goto hypercall_userspace_exit;
+ default:
+ ret = HV_STATUS_INVALID_HYPERCALL_CODE;
+ break;
+ }
+
+hypercall_complete:
+ return kvm_hv_hypercall_complete(vcpu, ret);
+
+hypercall_userspace_exit:
+ vcpu->run->exit_reason = KVM_EXIT_HYPERV;
+ vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL;
+ vcpu->run->hyperv.u.hcall.input = hc.param;
+ vcpu->run->hyperv.u.hcall.params[0] = hc.ingpa;
+ vcpu->run->hyperv.u.hcall.params[1] = hc.outgpa;
+ vcpu->arch.complete_userspace_io = kvm_hv_hypercall_complete_userspace;
+ return 0;
+}
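
Tying the bitfield parsing above together, here is an illustrative encoding of a hypercall input value that the code would decode as a non-fast HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST with four reps, rep index 0 and no variable header (hypothetical helper, field positions per the TLFS layout already used above):

static u64 example_hcall_input(void)
{
	return (u64)HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST |		/* code, bits 15:0       */
	       ((u64)4 << HV_HYPERCALL_REP_COMP_OFFSET);	/* rep count, bits 43:32 */
}
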
+
+void kvm_hv_init_vm(struct kvm *kvm)
+{
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+
+ mutex_init(&hv->hv_lock);
+ idr_init(&hv->conn_to_evt);
+}
+
+void kvm_hv_destroy_vm(struct kvm *kvm)
+{
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+ struct eventfd_ctx *eventfd;
+ int i;
+
+ idr_for_each_entry(&hv->conn_to_evt, eventfd, i)
+ eventfd_ctx_put(eventfd);
+ idr_destroy(&hv->conn_to_evt);
+}
+
+static int kvm_hv_eventfd_assign(struct kvm *kvm, u32 conn_id, int fd)
+{
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+ struct eventfd_ctx *eventfd;
+ int ret;
+
+ eventfd = eventfd_ctx_fdget(fd);
+ if (IS_ERR(eventfd))
+ return PTR_ERR(eventfd);
+
+ mutex_lock(&hv->hv_lock);
+ ret = idr_alloc(&hv->conn_to_evt, eventfd, conn_id, conn_id + 1,
+ GFP_KERNEL_ACCOUNT);
+ mutex_unlock(&hv->hv_lock);
+
+ if (ret >= 0)
+ return 0;
+
+ if (ret == -ENOSPC)
+ ret = -EEXIST;
+ eventfd_ctx_put(eventfd);
+ return ret;
+}
+
+static int kvm_hv_eventfd_deassign(struct kvm *kvm, u32 conn_id)
+{
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+ struct eventfd_ctx *eventfd;
+
+ mutex_lock(&hv->hv_lock);
+ eventfd = idr_remove(&hv->conn_to_evt, conn_id);
+ mutex_unlock(&hv->hv_lock);
+
+ if (!eventfd)
+ return -ENOENT;
+
+ synchronize_srcu(&kvm->srcu);
+ eventfd_ctx_put(eventfd);
+ return 0;
+}
+
+int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args)
+{
+ if ((args->flags & ~KVM_HYPERV_EVENTFD_DEASSIGN) ||
+ (args->conn_id & ~KVM_HYPERV_CONN_ID_MASK))
+ return -EINVAL;
+
+ if (args->flags == KVM_HYPERV_EVENTFD_DEASSIGN)
+ return kvm_hv_eventfd_deassign(kvm, args->conn_id);
+ return kvm_hv_eventfd_assign(kvm, args->conn_id, args->fd);
+}
+
+int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries)
+{
+ uint16_t evmcs_ver = 0;
+ struct kvm_cpuid_entry2 cpuid_entries[] = {
+ { .function = HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS },
+ { .function = HYPERV_CPUID_INTERFACE },
+ { .function = HYPERV_CPUID_VERSION },
+ { .function = HYPERV_CPUID_FEATURES },
+ { .function = HYPERV_CPUID_ENLIGHTMENT_INFO },
+ { .function = HYPERV_CPUID_IMPLEMENT_LIMITS },
+ { .function = HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS },
+ { .function = HYPERV_CPUID_SYNDBG_INTERFACE },
+ { .function = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES },
+ { .function = HYPERV_CPUID_NESTED_FEATURES },
+ };
+ int i, nent = ARRAY_SIZE(cpuid_entries);
+
+ if (kvm_x86_ops.nested_ops->get_evmcs_version)
+ evmcs_ver = kvm_x86_ops.nested_ops->get_evmcs_version(vcpu);
+
+ if (cpuid->nent < nent)
+ return -E2BIG;
+
+ if (cpuid->nent > nent)
+ cpuid->nent = nent;
+
+ for (i = 0; i < nent; i++) {
+ struct kvm_cpuid_entry2 *ent = &cpuid_entries[i];
+ u32 signature[3];
+
+ switch (ent->function) {
+ case HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS:
+ memcpy(signature, "Linux KVM Hv", 12);
+
+ ent->eax = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES;
+ ent->ebx = signature[0];
+ ent->ecx = signature[1];
+ ent->edx = signature[2];
+ break;
+
+ case HYPERV_CPUID_INTERFACE:
+ ent->eax = HYPERV_CPUID_SIGNATURE_EAX;
+ break;
+
+ case HYPERV_CPUID_VERSION:
+ /*
+ * We implement some Hyper-V 2016 functions so let's use
+ * this version.
+ */
+ ent->eax = 0x00003839;
+ ent->ebx = 0x000A0000;
+ break;
+
+ case HYPERV_CPUID_FEATURES:
+ ent->eax |= HV_MSR_VP_RUNTIME_AVAILABLE;
+ ent->eax |= HV_MSR_TIME_REF_COUNT_AVAILABLE;
+ ent->eax |= HV_MSR_SYNIC_AVAILABLE;
+ ent->eax |= HV_MSR_SYNTIMER_AVAILABLE;
+ ent->eax |= HV_MSR_APIC_ACCESS_AVAILABLE;
+ ent->eax |= HV_MSR_HYPERCALL_AVAILABLE;
+ ent->eax |= HV_MSR_VP_INDEX_AVAILABLE;
+ ent->eax |= HV_MSR_RESET_AVAILABLE;
+ ent->eax |= HV_MSR_REFERENCE_TSC_AVAILABLE;
+ ent->eax |= HV_ACCESS_FREQUENCY_MSRS;
+ ent->eax |= HV_ACCESS_REENLIGHTENMENT;
+ ent->eax |= HV_ACCESS_TSC_INVARIANT;
+
+ ent->ebx |= HV_POST_MESSAGES;
+ ent->ebx |= HV_SIGNAL_EVENTS;
+ ent->ebx |= HV_ENABLE_EXTENDED_HYPERCALLS;
+
+ ent->edx |= HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE;
+ ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE;
+ ent->edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
+
+ ent->ebx |= HV_DEBUGGING;
+ ent->edx |= HV_X64_GUEST_DEBUGGING_AVAILABLE;
+ ent->edx |= HV_FEATURE_DEBUG_MSRS_AVAILABLE;
+ ent->edx |= HV_FEATURE_EXT_GVA_RANGES_FLUSH;
+
+ /*
+ * Direct Synthetic timers only make sense with in-kernel
+ * LAPIC
+ */
+ if (!vcpu || lapic_in_kernel(vcpu))
+ ent->edx |= HV_STIMER_DIRECT_MODE_AVAILABLE;
+
+ break;
+
+ case HYPERV_CPUID_ENLIGHTMENT_INFO:
+ ent->eax |= HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED;
+ ent->eax |= HV_X64_APIC_ACCESS_RECOMMENDED;
+ ent->eax |= HV_X64_RELAXED_TIMING_RECOMMENDED;
+ if (!vcpu || lapic_in_kernel(vcpu))
+ ent->eax |= HV_X64_CLUSTER_IPI_RECOMMENDED;
+ ent->eax |= HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED;
+ if (evmcs_ver)
+ ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED;
+ if (!cpu_smt_possible())
+ ent->eax |= HV_X64_NO_NONARCH_CORESHARING;
+
+ ent->eax |= HV_DEPRECATING_AEOI_RECOMMENDED;
+ /*
+ * Default number of spinlock retry attempts, matches
+ * HyperV 2016.
+ */
+ ent->ebx = 0x00000FFF;
+
+ break;
+
+ case HYPERV_CPUID_IMPLEMENT_LIMITS:
+ /* Maximum number of virtual processors */
+ ent->eax = KVM_MAX_VCPUS;
+ /*
+ * Maximum number of logical processors, matches
+ * HyperV 2016.
+ */
+ ent->ebx = 64;
+
+ break;
+
+ case HYPERV_CPUID_NESTED_FEATURES:
+ ent->eax = evmcs_ver;
+ ent->eax |= HV_X64_NESTED_DIRECT_FLUSH;
+ ent->eax |= HV_X64_NESTED_MSR_BITMAP;
+ ent->ebx |= HV_X64_NESTED_EVMCS1_PERF_GLOBAL_CTRL;
+ break;
+
+ case HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS:
+ memcpy(signature, "Linux KVM Hv", 12);
+
+ ent->eax = 0;
+ ent->ebx = signature[0];
+ ent->ecx = signature[1];
+ ent->edx = signature[2];
+ break;
+
+ case HYPERV_CPUID_SYNDBG_INTERFACE:
+ memcpy(signature, "VS#1\0\0\0\0\0\0\0\0", 12);
+ ent->eax = signature[0];
+ break;
+
+ case HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES:
+ ent->eax |= HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING;
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ if (copy_to_user(entries, cpuid_entries,
+ nent * sizeof(struct kvm_cpuid_entry2)))
+ return -EFAULT;
+
+ return 0;
+}
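The leaves assembled by kvm_get_hv_cpuid() above are what a VMM retrieves with the KVM_GET_SUPPORTED_HV_CPUID ioctl before building the guest's Hyper-V CPUID. A minimal sketch of the call, not from this patch, assuming vcpu_fd is an open vCPU descriptor:

#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch: fetch the Hyper-V CPUID leaves KVM would expose to this vCPU. */
static struct kvm_cpuid2 *get_hv_cpuid(int vcpu_fd)
{
	const int nent = 16;	/* generous upper bound for the fixed leaf set */
	struct kvm_cpuid2 *cpuid;

	cpuid = calloc(1, sizeof(*cpuid) + nent * sizeof(struct kvm_cpuid_entry2));
	if (!cpuid)
		return NULL;
	cpuid->nent = nent;

	if (ioctl(vcpu_fd, KVM_GET_SUPPORTED_HV_CPUID, cpuid) < 0) {
		free(cpuid);
		return NULL;
	}
	return cpuid;		/* cpuid->nent now holds the real entry count */
}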
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
new file mode 100644
index 000000000000..6ce160ffa678
--- /dev/null
+++ b/arch/x86/kvm/hyperv.h
@@ -0,0 +1,327 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * KVM Microsoft Hyper-V emulation
+ *
+ * derived from arch/x86/kvm/x86.c
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright (C) 2008 Qumranet, Inc.
+ * Copyright IBM Corporation, 2008
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ * Copyright (C) 2015 Andrey Smetanin <asmetanin@virtuozzo.com>
+ *
+ * Authors:
+ * Avi Kivity <avi@qumranet.com>
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Amit Shah <amit.shah@qumranet.com>
+ * Ben-Ami Yassour <benami@il.ibm.com>
+ * Andrey Smetanin <asmetanin@virtuozzo.com>
+ */
+
+#ifndef __ARCH_X86_KVM_HYPERV_H__
+#define __ARCH_X86_KVM_HYPERV_H__
+
+#include <linux/kvm_host.h>
+#include "x86.h"
+
+#ifdef CONFIG_KVM_HYPERV
+
+/* "Hv#1" signature */
+#define HYPERV_CPUID_SIGNATURE_EAX 0x31237648
+
+/*
+ * The #defines related to the synthetic debugger are required by KDNet, but
+ * they are not documented in the Hyper-V TLFS because the synthetic debugger
+ * functionality has been deprecated and is subject to removal in future
+ * versions of Windows.
+ */
+#define HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS 0x40000080
+#define HYPERV_CPUID_SYNDBG_INTERFACE 0x40000081
+#define HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES 0x40000082
+
+/*
+ * Hyper-V synthetic debugger platform capabilities
+ * These are HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX bits.
+ */
+#define HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING BIT(1)
+
+/* Hyper-V Synthetic debug options MSR */
+#define HV_X64_MSR_SYNDBG_CONTROL 0x400000F1
+#define HV_X64_MSR_SYNDBG_STATUS 0x400000F2
+#define HV_X64_MSR_SYNDBG_SEND_BUFFER 0x400000F3
+#define HV_X64_MSR_SYNDBG_RECV_BUFFER 0x400000F4
+#define HV_X64_MSR_SYNDBG_PENDING_BUFFER 0x400000F5
+#define HV_X64_MSR_SYNDBG_OPTIONS 0x400000FF
+
+/* Hyper-V HV_X64_MSR_SYNDBG_OPTIONS bits */
+#define HV_X64_SYNDBG_OPTION_USE_HCALLS BIT(2)
+
+static inline struct kvm_hv *to_kvm_hv(struct kvm *kvm)
+{
+ return &kvm->arch.hyperv;
+}
+
+static inline struct kvm_vcpu_hv *to_hv_vcpu(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hyperv;
+}
+
+static inline struct kvm_vcpu_hv_synic *to_hv_synic(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ return &hv_vcpu->synic;
+}
+
+static inline struct kvm_vcpu *hv_synic_to_vcpu(struct kvm_vcpu_hv_synic *synic)
+{
+ struct kvm_vcpu_hv *hv_vcpu = container_of(synic, struct kvm_vcpu_hv, synic);
+
+ return hv_vcpu->vcpu;
+}
+
+static inline struct kvm_hv_syndbg *to_hv_syndbg(struct kvm_vcpu *vcpu)
+{
+ return &vcpu->kvm->arch.hyperv.hv_syndbg;
+}
+
+static inline u32 kvm_hv_get_vpindex(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ return hv_vcpu ? hv_vcpu->vp_index : vcpu->vcpu_idx;
+}
+
+int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host);
+int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host);
+
+static inline bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hyperv_enabled && to_kvm_hv(vcpu->kvm)->hv_guest_os_id;
+}
+
+int kvm_hv_hypercall(struct kvm_vcpu *vcpu);
+
+void kvm_hv_irq_routing_update(struct kvm *kvm);
+int kvm_hv_synic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status);
+void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector);
+int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages);
+
+static inline bool kvm_hv_synic_has_vector(struct kvm_vcpu *vcpu, int vector)
+{
+ return to_hv_vcpu(vcpu) && test_bit(vector, to_hv_synic(vcpu)->vec_bitmap);
+}
+
+static inline bool kvm_hv_synic_auto_eoi_set(struct kvm_vcpu *vcpu, int vector)
+{
+ return to_hv_vcpu(vcpu) &&
+ test_bit(vector, to_hv_synic(vcpu)->auto_eoi_bitmap);
+}
+
+void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu);
+
+bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu);
+int kvm_hv_get_assist_page(struct kvm_vcpu *vcpu);
+
+static inline struct kvm_vcpu_hv_stimer *to_hv_stimer(struct kvm_vcpu *vcpu,
+ int timer_index)
+{
+ return &to_hv_vcpu(vcpu)->stimer[timer_index];
+}
+
+static inline struct kvm_vcpu *hv_stimer_to_vcpu(struct kvm_vcpu_hv_stimer *stimer)
+{
+ struct kvm_vcpu_hv *hv_vcpu;
+
+ hv_vcpu = container_of(stimer - stimer->index, struct kvm_vcpu_hv,
+ stimer[0]);
+ return hv_vcpu->vcpu;
+}
+
+static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ if (!hv_vcpu)
+ return false;
+
+ return !bitmap_empty(hv_vcpu->stimer_pending_bitmap,
+ HV_SYNIC_STIMER_COUNT);
+}
+
+/*
+ * With HV_ACCESS_TSC_INVARIANT feature, invariant TSC (CPUID.80000007H:EDX[8])
+ * is only observed after HV_X64_MSR_TSC_INVARIANT_CONTROL was written to.
+ */
+static inline bool kvm_hv_invtsc_suppressed(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ /*
+	 * If Hyper-V's invariant TSC control is not exposed to the guest, the
+	 * invariant TSC CPUID flag is not suppressed; Windows guests were
+	 * observed to handle it correctly. Going forward, VMMs are encouraged
+	 * to enable Hyper-V's invariant TSC control when the invariant TSC
+	 * CPUID flag is set, to make KVM's behavior match genuine Hyper-V.
+ */
+ if (!hv_vcpu ||
+ !(hv_vcpu->cpuid_cache.features_eax & HV_ACCESS_TSC_INVARIANT))
+ return false;
+
+ /*
+ * If Hyper-V's invariant TSC control is exposed to the guest, KVM is
+ * responsible for suppressing the invariant TSC CPUID flag if the
+ * Hyper-V control is not enabled.
+ */
+ return !(to_kvm_hv(vcpu->kvm)->hv_invtsc_control & HV_EXPOSE_INVARIANT_TSC);
+}
+
+void kvm_hv_process_stimers(struct kvm_vcpu *vcpu);
+
+void kvm_hv_setup_tsc_page(struct kvm *kvm,
+ struct pvclock_vcpu_time_info *hv_clock);
+void kvm_hv_request_tsc_page_update(struct kvm *kvm);
+
+void kvm_hv_xsaves_xsavec_maybe_warn(struct kvm_vcpu *vcpu);
+
+void kvm_hv_init_vm(struct kvm *kvm);
+void kvm_hv_destroy_vm(struct kvm *kvm);
+int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu);
+void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu, bool hyperv_enabled);
+int kvm_hv_set_enforce_cpuid(struct kvm_vcpu *vcpu, bool enforce);
+int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args);
+int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries);
+
+static inline struct kvm_vcpu_hv_tlb_flush_fifo *kvm_hv_get_tlb_flush_fifo(struct kvm_vcpu *vcpu,
+ bool is_guest_mode)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ int i = is_guest_mode ? HV_L2_TLB_FLUSH_FIFO :
+ HV_L1_TLB_FLUSH_FIFO;
+
+ return &hv_vcpu->tlb_flush_fifo[i];
+}
+
+static inline void kvm_hv_vcpu_purge_flush_tlb(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo;
+
+ if (!to_hv_vcpu(vcpu) || !kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu))
+ return;
+
+ tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(vcpu, is_guest_mode(vcpu));
+
+ kfifo_reset_out(&tlb_flush_fifo->entries);
+}
+
+static inline bool guest_hv_cpuid_has_l2_tlb_flush(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ return hv_vcpu &&
+ (hv_vcpu->cpuid_cache.nested_eax & HV_X64_NESTED_DIRECT_FLUSH);
+}
+
+static inline bool kvm_hv_is_tlb_flush_hcall(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ u16 code;
+
+ if (!hv_vcpu)
+ return false;
+
+ code = is_64_bit_hypercall(vcpu) ? kvm_rcx_read(vcpu) :
+ kvm_rax_read(vcpu);
+
+ return (code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE ||
+ code == HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST ||
+ code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX ||
+ code == HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX);
+}
+
+static inline int kvm_hv_verify_vp_assist(struct kvm_vcpu *vcpu)
+{
+ if (!to_hv_vcpu(vcpu))
+ return 0;
+
+ if (!kvm_hv_assist_page_enabled(vcpu))
+ return 0;
+
+ return kvm_hv_get_assist_page(vcpu);
+}
+
+static inline void kvm_hv_nested_transtion_tlb_flush(struct kvm_vcpu *vcpu,
+ bool tdp_enabled)
+{
+ /*
+ * KVM_REQ_HV_TLB_FLUSH flushes entries from either L1's VP_ID or
+	 * L2's VP_ID upon request from the guest. Make sure we check for
+	 * pending entries in the right FIFO upon L1/L2 transition, as these
+	 * requests are queued by other vCPUs asynchronously.
+ */
+ if (to_hv_vcpu(vcpu) && tdp_enabled)
+ kvm_make_request(KVM_REQ_HV_TLB_FLUSH, vcpu);
+}
+
+int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu);
+#else /* CONFIG_KVM_HYPERV */
+static inline void kvm_hv_setup_tsc_page(struct kvm *kvm,
+ struct pvclock_vcpu_time_info *hv_clock) {}
+static inline void kvm_hv_request_tsc_page_update(struct kvm *kvm) {}
+static inline void kvm_hv_xsaves_xsavec_maybe_warn(struct kvm_vcpu *vcpu) {}
+static inline void kvm_hv_init_vm(struct kvm *kvm) {}
+static inline void kvm_hv_destroy_vm(struct kvm *kvm) {}
+static inline int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+static inline void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu) {}
+static inline bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
+static inline int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
+{
+ return HV_STATUS_ACCESS_DENIED;
+}
+static inline void kvm_hv_vcpu_purge_flush_tlb(struct kvm_vcpu *vcpu) {}
+static inline bool kvm_hv_synic_has_vector(struct kvm_vcpu *vcpu, int vector)
+{
+ return false;
+}
+static inline bool kvm_hv_synic_auto_eoi_set(struct kvm_vcpu *vcpu, int vector)
+{
+ return false;
+}
+static inline void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector) {}
+static inline bool kvm_hv_invtsc_suppressed(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
+static inline void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu, bool hyperv_enabled) {}
+static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
+static inline bool kvm_hv_is_tlb_flush_hcall(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
+static inline bool guest_hv_cpuid_has_l2_tlb_flush(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
+static inline int kvm_hv_verify_vp_assist(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+static inline u32 kvm_hv_get_vpindex(struct kvm_vcpu *vcpu)
+{
+ return vcpu->vcpu_idx;
+}
+static inline void kvm_hv_nested_transtion_tlb_flush(struct kvm_vcpu *vcpu, bool tdp_enabled) {}
+#endif /* CONFIG_KVM_HYPERV */
+
+#endif /* __ARCH_X86_KVM_HYPERV_H__ */
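Worth noting about the header above: every exported interface gets a trivial stub in the !CONFIG_KVM_HYPERV branch, so callers elsewhere in KVM compile unchanged whether or not Hyper-V emulation is built in. A small illustrative sketch of the resulting caller pattern (not taken from the patch):

/* Illustrative only: with CONFIG_KVM_HYPERV=n the stub returns false and the
 * compiler discards the branch; with CONFIG_KVM_HYPERV=y the real emulation
 * in hyperv.c runs. No #ifdef is needed at the call site. */
static int example_handle_vmcall(struct kvm_vcpu *vcpu)
{
	if (kvm_hv_hypercall_enabled(vcpu))
		return kvm_hv_hypercall(vcpu);

	return 0;	/* placeholder for the non-Hyper-V vmcall path */
}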
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 21f68e00524f..850972deac8e 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -5,6 +5,7 @@
* Copyright (c) 2006 Intel Corporation
* Copyright (c) 2007 Keir Fraser, XenSource Inc
* Copyright (c) 2008 Intel Corporation
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
@@ -29,10 +30,15 @@
* Based on QEMU and Xen.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kvm_host.h>
+#include <linux/slab.h>
+#include "ioapic.h"
#include "irq.h"
#include "i8254.h"
+#include "x86.h"
#ifndef CONFIG_X86_64
#define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
@@ -45,32 +51,9 @@
#define RW_STATE_WORD0 3
#define RW_STATE_WORD1 4
-/* Compute with 96 bit intermediate result: (a*b)/c */
-static u64 muldiv64(u64 a, u32 b, u32 c)
-{
- union {
- u64 ll;
- struct {
- u32 low, high;
- } l;
- } u, res;
- u64 rl, rh;
-
- u.ll = a;
- rl = (u64)u.l.low * (u64)b;
- rh = (u64)u.l.high * (u64)b;
- rh += (rl >> 32);
- res.l.high = div64_u64(rh, c);
- res.l.low = div64_u64(((mod_64(rh, c) << 32) + (rl & 0xffffffff)), c);
- return res.ll;
-}
-
-static void pit_set_gate(struct kvm *kvm, int channel, u32 val)
+static void pit_set_gate(struct kvm_pit *pit, int channel, u32 val)
{
- struct kvm_kpit_channel_state *c =
- &kvm->arch.vpit->pit_state.channels[channel];
-
- WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+ struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
switch (c->mode) {
default:
@@ -91,20 +74,18 @@ static void pit_set_gate(struct kvm *kvm, int channel, u32 val)
c->gate = val;
}
-static int pit_get_gate(struct kvm *kvm, int channel)
+static int pit_get_gate(struct kvm_pit *pit, int channel)
{
- WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
-
- return kvm->arch.vpit->pit_state.channels[channel].gate;
+ return pit->pit_state.channels[channel].gate;
}
-static s64 __kpit_elapsed(struct kvm *kvm)
+static s64 __kpit_elapsed(struct kvm_pit *pit)
{
s64 elapsed;
ktime_t remaining;
- struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
+ struct kvm_kpit_state *ps = &pit->pit_state;
- if (!ps->pit_timer.period)
+ if (!ps->period)
return 0;
/*
@@ -116,33 +97,29 @@ static s64 __kpit_elapsed(struct kvm *kvm)
* itself with the initial count and continues counting
* from there.
*/
- remaining = hrtimer_expires_remaining(&ps->pit_timer.timer);
- elapsed = ps->pit_timer.period - ktime_to_ns(remaining);
- elapsed = mod_64(elapsed, ps->pit_timer.period);
+ remaining = hrtimer_get_remaining(&ps->timer);
+ elapsed = ps->period - ktime_to_ns(remaining);
return elapsed;
}
-static s64 kpit_elapsed(struct kvm *kvm, struct kvm_kpit_channel_state *c,
+static s64 kpit_elapsed(struct kvm_pit *pit, struct kvm_kpit_channel_state *c,
int channel)
{
if (channel == 0)
- return __kpit_elapsed(kvm);
+ return __kpit_elapsed(pit);
return ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time));
}
-static int pit_get_count(struct kvm *kvm, int channel)
+static int pit_get_count(struct kvm_pit *pit, int channel)
{
- struct kvm_kpit_channel_state *c =
- &kvm->arch.vpit->pit_state.channels[channel];
+ struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
s64 d, t;
int counter;
- WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
-
- t = kpit_elapsed(kvm, c, channel);
- d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
+ t = kpit_elapsed(pit, c, channel);
+ d = mul_u64_u32_div(t, KVM_PIT_FREQ, NSEC_PER_SEC);
switch (c->mode) {
case 0:
@@ -162,17 +139,14 @@ static int pit_get_count(struct kvm *kvm, int channel)
return counter;
}
-static int pit_get_out(struct kvm *kvm, int channel)
+static int pit_get_out(struct kvm_pit *pit, int channel)
{
- struct kvm_kpit_channel_state *c =
- &kvm->arch.vpit->pit_state.channels[channel];
+ struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
s64 d, t;
int out;
- WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
-
- t = kpit_elapsed(kvm, c, channel);
- d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
+ t = kpit_elapsed(pit, c, channel);
+ d = mul_u64_u32_div(t, KVM_PIT_FREQ, NSEC_PER_SEC);
switch (c->mode) {
default:
@@ -197,29 +171,23 @@ static int pit_get_out(struct kvm *kvm, int channel)
return out;
}
-static void pit_latch_count(struct kvm *kvm, int channel)
+static void pit_latch_count(struct kvm_pit *pit, int channel)
{
- struct kvm_kpit_channel_state *c =
- &kvm->arch.vpit->pit_state.channels[channel];
-
- WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+ struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
if (!c->count_latched) {
- c->latched_count = pit_get_count(kvm, channel);
+ c->latched_count = pit_get_count(pit, channel);
c->count_latched = c->rw_mode;
}
}
-static void pit_latch_status(struct kvm *kvm, int channel)
+static void pit_latch_status(struct kvm_pit *pit, int channel)
{
- struct kvm_kpit_channel_state *c =
- &kvm->arch.vpit->pit_state.channels[channel];
-
- WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+ struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
if (!c->status_latched) {
/* TODO: Return NULL COUNT (bit 6). */
- c->status = ((pit_get_out(kvm, channel) << 7) |
+ c->status = ((pit_get_out(pit, channel) << 7) |
(c->rw_mode << 4) |
(c->mode << 1) |
c->bcd);
@@ -227,24 +195,24 @@ static void pit_latch_status(struct kvm *kvm, int channel)
}
}
-int pit_has_pending_timer(struct kvm_vcpu *vcpu)
+static inline struct kvm_pit *pit_state_to_pit(struct kvm_kpit_state *ps)
{
- struct kvm_pit *pit = vcpu->kvm->arch.vpit;
-
- if (pit && vcpu->vcpu_id == 0 && pit->pit_state.irq_ack)
- return atomic_read(&pit->pit_state.pit_timer.pending);
- return 0;
+ return container_of(ps, struct kvm_pit, pit_state);
}
static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
{
struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
irq_ack_notifier);
- spin_lock(&ps->inject_lock);
- if (atomic_dec_return(&ps->pit_timer.pending) < 0)
- atomic_inc(&ps->pit_timer.pending);
- ps->irq_ack = 1;
- spin_unlock(&ps->inject_lock);
+ struct kvm_pit *pit = pit_state_to_pit(ps);
+
+ atomic_set(&ps->irq_ack, 1);
+ /* irq_ack should be set before pending is read. Order accesses with
+ * inc(pending) in pit_timer_fn and xchg(irq_ack, 0) in pit_do_work.
+ */
+ smp_mb();
+ if (atomic_dec_if_positive(&ps->pending) > 0)
+ kthread_queue_work(pit->worker, &pit->expired);
}
void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
@@ -252,64 +220,153 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
struct kvm_pit *pit = vcpu->kvm->arch.vpit;
struct hrtimer *timer;
- if (vcpu->vcpu_id != 0 || !pit)
+ /* Somewhat arbitrarily make vcpu0 the owner of the PIT. */
+ if (vcpu->vcpu_id || !pit)
return;
- timer = &pit->pit_state.pit_timer.timer;
+ timer = &pit->pit_state.timer;
+ mutex_lock(&pit->pit_state.lock);
if (hrtimer_cancel(timer))
hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
+ mutex_unlock(&pit->pit_state.lock);
}
-static void destroy_pit_timer(struct kvm_timer *pt)
+static void destroy_pit_timer(struct kvm_pit *pit)
{
- pr_debug("pit: execute del timer!\n");
- hrtimer_cancel(&pt->timer);
+ hrtimer_cancel(&pit->pit_state.timer);
+ kthread_flush_work(&pit->expired);
}
-static bool kpit_is_periodic(struct kvm_timer *ktimer)
+static void pit_do_work(struct kthread_work *work)
{
- struct kvm_kpit_state *ps = container_of(ktimer, struct kvm_kpit_state,
- pit_timer);
- return ps->is_periodic;
+ struct kvm_pit *pit = container_of(work, struct kvm_pit, expired);
+ struct kvm *kvm = pit->kvm;
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+ struct kvm_kpit_state *ps = &pit->pit_state;
+
+ if (atomic_read(&ps->reinject) && !atomic_xchg(&ps->irq_ack, 0))
+ return;
+
+ kvm_set_irq(kvm, KVM_PIT_IRQ_SOURCE_ID, 0, 1, false);
+ kvm_set_irq(kvm, KVM_PIT_IRQ_SOURCE_ID, 0, 0, false);
+
+ /*
+ * Provides NMI watchdog support via Virtual Wire mode.
+ * The route is: PIT -> LVT0 in NMI mode.
+ *
+ * Note: Our Virtual Wire implementation does not follow
+ * the MP specification. We propagate a PIT interrupt to all
+ * VCPUs and only when LVT0 is in NMI mode. The interrupt can
+ * also be simultaneously delivered through PIC and IOAPIC.
+ */
+ if (atomic_read(&kvm->arch.vapics_in_nmi_mode) > 0)
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_apic_nmi_wd_deliver(vcpu);
}
-static struct kvm_timer_ops kpit_ops = {
- .is_periodic = kpit_is_periodic,
-};
+static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
+{
+ struct kvm_kpit_state *ps = container_of(data, struct kvm_kpit_state, timer);
+ struct kvm_pit *pt = pit_state_to_pit(ps);
+
+ if (atomic_read(&ps->reinject))
+ atomic_inc(&ps->pending);
+
+ kthread_queue_work(pt->worker, &pt->expired);
+
+ if (ps->is_periodic) {
+ hrtimer_add_expires_ns(&ps->timer, ps->period);
+ return HRTIMER_RESTART;
+ } else
+ return HRTIMER_NORESTART;
+}
+
+static inline void kvm_pit_reset_reinject(struct kvm_pit *pit)
+{
+ atomic_set(&pit->pit_state.pending, 0);
+ atomic_set(&pit->pit_state.irq_ack, 1);
+}
+
+static void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject)
+{
+ struct kvm_kpit_state *ps = &pit->pit_state;
+ struct kvm *kvm = pit->kvm;
+
+ if (atomic_read(&ps->reinject) == reinject)
+ return;
+
+ /*
+ * AMD SVM AVIC accelerates EOI write and does not trap.
+	 * This causes in-kernel PIT re-inject mode to fail, since that mode
+	 * checks ps->irq_ack before kvm_set_irq() and relies on the ack
+	 * notifier to queue the pt->worker work item in time and reinject
+	 * the missed tick.
+	 * So, deactivate APICv when PIT is in reinject mode.
+ */
+ if (reinject) {
+ kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PIT_REINJ);
+ /* The initial state is preserved while ps->reinject == 0. */
+ kvm_pit_reset_reinject(pit);
+ kvm_register_irq_ack_notifier(kvm, &ps->irq_ack_notifier);
+ kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
+ } else {
+ kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PIT_REINJ);
+ kvm_unregister_irq_ack_notifier(kvm, &ps->irq_ack_notifier);
+ kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
+ }
+
+ atomic_set(&ps->reinject, reinject);
+}
-static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
+static void create_pit_timer(struct kvm_pit *pit, u32 val, int is_period)
{
- struct kvm_timer *pt = &ps->pit_timer;
+ struct kvm_kpit_state *ps = &pit->pit_state;
+ struct kvm *kvm = pit->kvm;
s64 interval;
- interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
+ if (!ioapic_in_kernel(kvm) ||
+ ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)
+ return;
+
+ interval = mul_u64_u32_div(val, NSEC_PER_SEC, KVM_PIT_FREQ);
- pr_debug("pit: create pit timer, interval is %llu nsec\n", interval);
+ pr_debug("create pit timer, interval is %llu nsec\n", interval);
/* TODO: the new value only takes effect after the counter is retriggered */
- hrtimer_cancel(&pt->timer);
- pt->period = interval;
+ hrtimer_cancel(&ps->timer);
+ kthread_flush_work(&pit->expired);
+ ps->period = interval;
ps->is_periodic = is_period;
- pt->timer.function = kvm_timer_fn;
- pt->t_ops = &kpit_ops;
- pt->kvm = ps->pit->kvm;
- pt->vcpu_id = 0;
+ kvm_pit_reset_reinject(pit);
- atomic_set(&pt->pending, 0);
- ps->irq_ack = 1;
+ /*
+ * Do not allow the guest to program periodic timers with small
+ * interval, since the hrtimers are not throttled by the host
+ * scheduler.
+ */
+ if (ps->is_periodic) {
+ s64 min_period = min_timer_period_us * 1000LL;
+
+ if (ps->period < min_period) {
+ pr_info_ratelimited(
+ "requested %lld ns "
+ "i8254 timer period limited to %lld ns\n",
+ ps->period, min_period);
+ ps->period = min_period;
+ }
+ }
- hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval),
+ hrtimer_start(&ps->timer, ktime_add_ns(ktime_get(), interval),
HRTIMER_MODE_ABS);
}
-static void pit_load_count(struct kvm *kvm, int channel, u32 val)
+static void pit_load_count(struct kvm_pit *pit, int channel, u32 val)
{
- struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
-
- WARN_ON(!mutex_is_locked(&ps->lock));
+ struct kvm_kpit_state *ps = &pit->pit_state;
- pr_debug("pit: load_count val is %d, channel is %d\n", val, channel);
+ pr_debug("load_count val is %u, channel is %d\n", val, channel);
/*
* The largest possible initial count is 0; this is equivalent
@@ -332,33 +389,63 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
case 1:
/* FIXME: enhance mode 4 precision */
case 4:
- create_pit_timer(ps, val, 0);
+ create_pit_timer(pit, val, 0);
break;
case 2:
case 3:
- create_pit_timer(ps, val, 1);
+ create_pit_timer(pit, val, 1);
break;
default:
- destroy_pit_timer(&ps->pit_timer);
+ destroy_pit_timer(pit);
}
}
-void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val)
+static void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val,
+ int hpet_legacy_start)
{
- mutex_lock(&kvm->arch.vpit->pit_state.lock);
- pit_load_count(kvm, channel, val);
- mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+ u8 saved_mode;
+
+ WARN_ON_ONCE(!mutex_is_locked(&pit->pit_state.lock));
+
+ if (hpet_legacy_start) {
+ /* save existing mode for later reenablement */
+ WARN_ON(channel != 0);
+ saved_mode = pit->pit_state.channels[0].mode;
+ pit->pit_state.channels[0].mode = 0xff; /* disable timer */
+ pit_load_count(pit, channel, val);
+ pit->pit_state.channels[0].mode = saved_mode;
+ } else {
+ pit_load_count(pit, channel, val);
+ }
}
-static void pit_ioport_write(struct kvm_io_device *this,
- gpa_t addr, int len, const void *data)
+static inline struct kvm_pit *dev_to_pit(struct kvm_io_device *dev)
{
- struct kvm_pit *pit = (struct kvm_pit *)this->private;
+ return container_of(dev, struct kvm_pit, dev);
+}
+
+static inline struct kvm_pit *speaker_to_pit(struct kvm_io_device *dev)
+{
+ return container_of(dev, struct kvm_pit, speaker_dev);
+}
+
+static inline int pit_in_range(gpa_t addr)
+{
+ return ((addr >= KVM_PIT_BASE_ADDRESS) &&
+ (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH));
+}
+
+static int pit_ioport_write(struct kvm_vcpu *vcpu,
+ struct kvm_io_device *this,
+ gpa_t addr, int len, const void *data)
+{
+ struct kvm_pit *pit = dev_to_pit(this);
struct kvm_kpit_state *pit_state = &pit->pit_state;
- struct kvm *kvm = pit->kvm;
int channel, access;
struct kvm_kpit_channel_state *s;
u32 val = *(u32 *) data;
+ if (!pit_in_range(addr))
+ return -EOPNOTSUPP;
val &= 0xff;
addr &= KVM_PIT_CHANNEL_MASK;
@@ -366,20 +453,19 @@ static void pit_ioport_write(struct kvm_io_device *this,
mutex_lock(&pit_state->lock);
if (val != 0)
- pr_debug("pit: write addr is 0x%x, len is %d, val is 0x%x\n",
- (unsigned int)addr, len, val);
+ pr_debug("write addr is 0x%x, len is %d, val is 0x%x\n",
+ (unsigned int)addr, len, val);
if (addr == 3) {
channel = val >> 6;
if (channel == 3) {
/* Read-Back Command. */
for (channel = 0; channel < 3; channel++) {
- s = &pit_state->channels[channel];
if (val & (2 << channel)) {
if (!(val & 0x20))
- pit_latch_count(kvm, channel);
+ pit_latch_count(pit, channel);
if (!(val & 0x10))
- pit_latch_status(kvm, channel);
+ pit_latch_status(pit, channel);
}
}
} else {
@@ -387,7 +473,7 @@ static void pit_ioport_write(struct kvm_io_device *this,
s = &pit_state->channels[channel];
access = (val >> 4) & KVM_PIT_CHANNEL_MASK;
if (access == 0) {
- pit_latch_count(kvm, channel);
+ pit_latch_count(pit, channel);
} else {
s->rw_mode = access;
s->read_state = access;
@@ -404,35 +490,41 @@ static void pit_ioport_write(struct kvm_io_device *this,
switch (s->write_state) {
default:
case RW_STATE_LSB:
- pit_load_count(kvm, addr, val);
+ pit_load_count(pit, addr, val);
break;
case RW_STATE_MSB:
- pit_load_count(kvm, addr, val << 8);
+ pit_load_count(pit, addr, val << 8);
break;
case RW_STATE_WORD0:
s->write_latch = val;
s->write_state = RW_STATE_WORD1;
break;
case RW_STATE_WORD1:
- pit_load_count(kvm, addr, s->write_latch | (val << 8));
+ pit_load_count(pit, addr, s->write_latch | (val << 8));
s->write_state = RW_STATE_WORD0;
break;
}
}
mutex_unlock(&pit_state->lock);
+ return 0;
}
-static void pit_ioport_read(struct kvm_io_device *this,
- gpa_t addr, int len, void *data)
+static int pit_ioport_read(struct kvm_vcpu *vcpu,
+ struct kvm_io_device *this,
+ gpa_t addr, int len, void *data)
{
- struct kvm_pit *pit = (struct kvm_pit *)this->private;
+ struct kvm_pit *pit = dev_to_pit(this);
struct kvm_kpit_state *pit_state = &pit->pit_state;
- struct kvm *kvm = pit->kvm;
int ret, count;
struct kvm_kpit_channel_state *s;
+ if (!pit_in_range(addr))
+ return -EOPNOTSUPP;
addr &= KVM_PIT_CHANNEL_MASK;
+ if (addr == 3)
+ return 0;
+
s = &pit_state->channels[addr];
mutex_lock(&pit_state->lock);
@@ -460,20 +552,20 @@ static void pit_ioport_read(struct kvm_io_device *this,
switch (s->read_state) {
default:
case RW_STATE_LSB:
- count = pit_get_count(kvm, addr);
+ count = pit_get_count(pit, addr);
ret = count & 0xff;
break;
case RW_STATE_MSB:
- count = pit_get_count(kvm, addr);
+ count = pit_get_count(pit, addr);
ret = (count >> 8) & 0xff;
break;
case RW_STATE_WORD0:
- count = pit_get_count(kvm, addr);
+ count = pit_get_count(pit, addr);
ret = count & 0xff;
s->read_state = RW_STATE_WORD1;
break;
case RW_STATE_WORD1:
- count = pit_get_count(kvm, addr);
+ count = pit_get_count(pit, addr);
ret = (count >> 8) & 0xff;
s->read_state = RW_STATE_WORD0;
break;
@@ -485,200 +577,240 @@ static void pit_ioport_read(struct kvm_io_device *this,
memcpy(data, (char *)&ret, len);
mutex_unlock(&pit_state->lock);
+ return 0;
}
-static int pit_in_range(struct kvm_io_device *this, gpa_t addr,
- int len, int is_write)
-{
- return ((addr >= KVM_PIT_BASE_ADDRESS) &&
- (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH));
-}
-
-static void speaker_ioport_write(struct kvm_io_device *this,
- gpa_t addr, int len, const void *data)
+static int speaker_ioport_write(struct kvm_vcpu *vcpu,
+ struct kvm_io_device *this,
+ gpa_t addr, int len, const void *data)
{
- struct kvm_pit *pit = (struct kvm_pit *)this->private;
+ struct kvm_pit *pit = speaker_to_pit(this);
struct kvm_kpit_state *pit_state = &pit->pit_state;
- struct kvm *kvm = pit->kvm;
u32 val = *(u32 *) data;
+ if (addr != KVM_SPEAKER_BASE_ADDRESS)
+ return -EOPNOTSUPP;
mutex_lock(&pit_state->lock);
- pit_state->speaker_data_on = (val >> 1) & 1;
- pit_set_gate(kvm, 2, val & 1);
+ if (val & (1 << 1))
+ pit_state->flags |= KVM_PIT_FLAGS_SPEAKER_DATA_ON;
+ else
+ pit_state->flags &= ~KVM_PIT_FLAGS_SPEAKER_DATA_ON;
+ pit_set_gate(pit, 2, val & 1);
mutex_unlock(&pit_state->lock);
+ return 0;
}
-static void speaker_ioport_read(struct kvm_io_device *this,
- gpa_t addr, int len, void *data)
+static int speaker_ioport_read(struct kvm_vcpu *vcpu,
+ struct kvm_io_device *this,
+ gpa_t addr, int len, void *data)
{
- struct kvm_pit *pit = (struct kvm_pit *)this->private;
+ struct kvm_pit *pit = speaker_to_pit(this);
struct kvm_kpit_state *pit_state = &pit->pit_state;
- struct kvm *kvm = pit->kvm;
unsigned int refresh_clock;
int ret;
+ if (addr != KVM_SPEAKER_BASE_ADDRESS)
+ return -EOPNOTSUPP;
/* Refresh clock toggles at about 15us. We approximate as 2^14ns. */
refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1;
mutex_lock(&pit_state->lock);
- ret = ((pit_state->speaker_data_on << 1) | pit_get_gate(kvm, 2) |
- (pit_get_out(kvm, 2) << 5) | (refresh_clock << 4));
+ ret = (!!(pit_state->flags & KVM_PIT_FLAGS_SPEAKER_DATA_ON) << 1) |
+ pit_get_gate(pit, 2) | (pit_get_out(pit, 2) << 5) |
+ (refresh_clock << 4);
if (len > sizeof(ret))
len = sizeof(ret);
memcpy(data, (char *)&ret, len);
mutex_unlock(&pit_state->lock);
+ return 0;
}
-static int speaker_in_range(struct kvm_io_device *this, gpa_t addr,
- int len, int is_write)
-{
- return (addr == KVM_SPEAKER_BASE_ADDRESS);
-}
-
-void kvm_pit_reset(struct kvm_pit *pit)
+static void kvm_pit_reset(struct kvm_pit *pit)
{
int i;
struct kvm_kpit_channel_state *c;
- mutex_lock(&pit->pit_state.lock);
+ pit->pit_state.flags = 0;
for (i = 0; i < 3; i++) {
c = &pit->pit_state.channels[i];
c->mode = 0xff;
c->gate = (i != 2);
- pit_load_count(pit->kvm, i, 0);
+ pit_load_count(pit, i, 0);
}
- mutex_unlock(&pit->pit_state.lock);
- atomic_set(&pit->pit_state.pit_timer.pending, 0);
- pit->pit_state.irq_ack = 1;
+ kvm_pit_reset_reinject(pit);
}
-static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask)
+static void pit_mask_notifier(struct kvm_irq_mask_notifier *kimn, bool mask)
{
struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier);
- if (!mask) {
- atomic_set(&pit->pit_state.pit_timer.pending, 0);
- pit->pit_state.irq_ack = 1;
- }
+ if (!mask)
+ kvm_pit_reset_reinject(pit);
+}
+
+int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
+{
+ struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
+
+ BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
+
+ mutex_lock(&kps->lock);
+ memcpy(ps, &kps->channels, sizeof(*ps));
+ mutex_unlock(&kps->lock);
+ return 0;
}
-struct kvm_pit *kvm_create_pit(struct kvm *kvm)
+int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
+{
+ int i;
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ mutex_lock(&pit->pit_state.lock);
+ memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
+ for (i = 0; i < 3; i++)
+ kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
+ mutex_unlock(&pit->pit_state.lock);
+ return 0;
+}
+
+int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
+{
+ mutex_lock(&kvm->arch.vpit->pit_state.lock);
+ memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
+ sizeof(ps->channels));
+ ps->flags = kvm->arch.vpit->pit_state.flags;
+ mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+ memset(&ps->reserved, 0, sizeof(ps->reserved));
+ return 0;
+}
+
+int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
+{
+ int start = 0;
+ int i;
+ u32 prev_legacy, cur_legacy;
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ mutex_lock(&pit->pit_state.lock);
+ prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
+ cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
+ if (!prev_legacy && cur_legacy)
+ start = 1;
+ memcpy(&pit->pit_state.channels, &ps->channels,
+ sizeof(pit->pit_state.channels));
+ pit->pit_state.flags = ps->flags;
+ for (i = 0; i < 3; i++)
+ kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
+ start && i == 0);
+ mutex_unlock(&pit->pit_state.lock);
+ return 0;
+}
+
+int kvm_vm_ioctl_reinject(struct kvm *kvm, struct kvm_reinject_control *control)
+{
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ /* pit->pit_state.lock was overloaded to prevent userspace from getting
+ * an inconsistent state after running multiple KVM_REINJECT_CONTROL
+ * ioctls in parallel. Use a separate lock if that ioctl isn't rare.
+ */
+ mutex_lock(&pit->pit_state.lock);
+ kvm_pit_set_reinject(pit, control->pit_reinject);
+ mutex_unlock(&pit->pit_state.lock);
+
+ return 0;
+}
+
+static const struct kvm_io_device_ops pit_dev_ops = {
+ .read = pit_ioport_read,
+ .write = pit_ioport_write,
+};
+
+static const struct kvm_io_device_ops speaker_dev_ops = {
+ .read = speaker_ioport_read,
+ .write = speaker_ioport_write,
+};
+
+struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
{
struct kvm_pit *pit;
struct kvm_kpit_state *pit_state;
+ struct pid *pid;
+ pid_t pid_nr;
+ int ret;
- pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL);
+ pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL_ACCOUNT);
if (!pit)
return NULL;
- pit->irq_source_id = kvm_request_irq_source_id(kvm);
- if (pit->irq_source_id < 0) {
- kfree(pit);
- return NULL;
- }
-
mutex_init(&pit->pit_state.lock);
- mutex_lock(&pit->pit_state.lock);
- spin_lock_init(&pit->pit_state.inject_lock);
-
- /* Initialize PIO device */
- pit->dev.read = pit_ioport_read;
- pit->dev.write = pit_ioport_write;
- pit->dev.in_range = pit_in_range;
- pit->dev.private = pit;
- kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev);
-
- pit->speaker_dev.read = speaker_ioport_read;
- pit->speaker_dev.write = speaker_ioport_write;
- pit->speaker_dev.in_range = speaker_in_range;
- pit->speaker_dev.private = pit;
- kvm_io_bus_register_dev(&kvm->pio_bus, &pit->speaker_dev);
-
- kvm->arch.vpit = pit;
+
+ pid = get_pid(task_tgid(current));
+ pid_nr = pid_vnr(pid);
+ put_pid(pid);
+
+ pit->worker = kthread_run_worker(0, "kvm-pit/%d", pid_nr);
+ if (IS_ERR(pit->worker))
+ goto fail_kthread;
+
+ kthread_init_work(&pit->expired, pit_do_work);
+
pit->kvm = kvm;
pit_state = &pit->pit_state;
- pit_state->pit = pit;
- hrtimer_init(&pit_state->pit_timer.timer,
- CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_setup(&pit_state->timer, pit_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+
pit_state->irq_ack_notifier.gsi = 0;
pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
- kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
- pit_state->pit_timer.reinject = true;
- mutex_unlock(&pit->pit_state.lock);
+ pit->mask_notifier.func = pit_mask_notifier;
kvm_pit_reset(pit);
- pit->mask_notifier.func = pit_mask_notifer;
- kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
-
- return pit;
-}
-
-void kvm_free_pit(struct kvm *kvm)
-{
- struct hrtimer *timer;
-
- if (kvm->arch.vpit) {
- kvm_unregister_irq_mask_notifier(kvm, 0,
- &kvm->arch.vpit->mask_notifier);
- mutex_lock(&kvm->arch.vpit->pit_state.lock);
- timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
- hrtimer_cancel(timer);
- kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id);
- mutex_unlock(&kvm->arch.vpit->pit_state.lock);
- kfree(kvm->arch.vpit);
+ kvm_pit_set_reinject(pit, true);
+
+ mutex_lock(&kvm->slots_lock);
+ kvm_iodevice_init(&pit->dev, &pit_dev_ops);
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, KVM_PIT_BASE_ADDRESS,
+ KVM_PIT_MEM_LENGTH, &pit->dev);
+ if (ret < 0)
+ goto fail_register_pit;
+
+ if (flags & KVM_PIT_SPEAKER_DUMMY) {
+ kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops);
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS,
+ KVM_SPEAKER_BASE_ADDRESS, 4,
+ &pit->speaker_dev);
+ if (ret < 0)
+ goto fail_register_speaker;
}
-}
-
-static void __inject_pit_timer_intr(struct kvm *kvm)
-{
- struct kvm_vcpu *vcpu;
- int i;
+ mutex_unlock(&kvm->slots_lock);
- mutex_lock(&kvm->lock);
- kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
- kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
- mutex_unlock(&kvm->lock);
+ return pit;
- /*
- * Provides NMI watchdog support via Virtual Wire mode.
- * The route is: PIT -> PIC -> LVT0 in NMI mode.
- *
- * Note: Our Virtual Wire implementation is simplified, only
- * propagating PIT interrupts to all VCPUs when they have set
- * LVT0 to NMI delivery. Other PIC interrupts are just sent to
- * VCPU0, and only if its LVT0 is in EXTINT mode.
- */
- if (kvm->arch.vapics_in_nmi_mode > 0)
- for (i = 0; i < KVM_MAX_VCPUS; ++i) {
- vcpu = kvm->vcpus[i];
- if (vcpu)
- kvm_apic_nmi_wd_deliver(vcpu);
- }
+fail_register_speaker:
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev);
+fail_register_pit:
+ mutex_unlock(&kvm->slots_lock);
+ kvm_pit_set_reinject(pit, false);
+ kthread_destroy_worker(pit->worker);
+fail_kthread:
+ kfree(pit);
+ return NULL;
}
-void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
+void kvm_free_pit(struct kvm *kvm)
{
- struct kvm_pit *pit = vcpu->kvm->arch.vpit;
- struct kvm *kvm = vcpu->kvm;
- struct kvm_kpit_state *ps;
-
- if (vcpu && pit) {
- int inject = 0;
- ps = &pit->pit_state;
-
- /* Try to inject pending interrupts when
- * last one has been acked.
- */
- spin_lock(&ps->inject_lock);
- if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) {
- ps->irq_ack = 0;
- inject = 1;
- }
- spin_unlock(&ps->inject_lock);
- if (inject)
- __inject_pit_timer_intr(kvm);
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ if (pit) {
+ mutex_lock(&kvm->slots_lock);
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev);
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->speaker_dev);
+ mutex_unlock(&kvm->slots_lock);
+ kvm_pit_set_reinject(pit, false);
+ hrtimer_cancel(&pit->pit_state.timer);
+ kthread_destroy_worker(pit->worker);
+ kfree(pit);
}
}
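The reworked kvm_create_pit() above is reached from userspace through the KVM_CREATE_PIT2 ioctl, which also decides whether the dummy speaker device is registered. A minimal sketch, not from this patch, assuming vm_fd refers to a VM that already has an in-kernel irqchip:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch: create the in-kernel PIT and ask for the dummy speaker port. */
static int create_in_kernel_pit(int vm_fd)
{
	struct kvm_pit_config cfg;

	memset(&cfg, 0, sizeof(cfg));
	cfg.flags = KVM_PIT_SPEAKER_DUMMY;	/* also registers the speaker port device */

	return ioctl(vm_fd, KVM_CREATE_PIT2, &cfg);
}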
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index bbd863ff60b7..60fa499d2f8a 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -1,8 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __I8254_H
#define __I8254_H
-#include "iodev.h"
+#include <linux/kthread.h>
+#include <kvm/iodev.h>
+
+#include <uapi/asm/kvm.h>
+
+#include "ioapic.h"
+
+#ifdef CONFIG_KVM_IOAPIC
struct kvm_kpit_channel_state {
u32 count; /* can be 65536 */
u16 latched_count;
@@ -20,25 +28,28 @@ struct kvm_kpit_channel_state {
};
struct kvm_kpit_state {
+ /* All members before "struct mutex lock" are protected by the lock. */
struct kvm_kpit_channel_state channels[3];
- struct kvm_timer pit_timer;
+ u32 flags;
bool is_periodic;
- u32 speaker_data_on;
+ s64 period; /* unit: ns */
+ struct hrtimer timer;
+
struct mutex lock;
- struct kvm_pit *pit;
- spinlock_t inject_lock;
- unsigned long irq_ack;
+ atomic_t reinject;
+ atomic_t pending; /* accumulated triggered timers */
+ atomic_t irq_ack;
struct kvm_irq_ack_notifier irq_ack_notifier;
};
struct kvm_pit {
- unsigned long base_addresss;
struct kvm_io_device dev;
struct kvm_io_device speaker_dev;
struct kvm *kvm;
struct kvm_kpit_state pit_state;
- int irq_source_id;
struct kvm_irq_mask_notifier mask_notifier;
+ struct kthread_worker *worker;
+ struct kthread_work expired;
};
#define KVM_PIT_BASE_ADDRESS 0x40
@@ -48,10 +59,14 @@ struct kvm_pit {
#define KVM_MAX_PIT_INTR_INTERVAL HZ / 100
#define KVM_PIT_CHANNEL_MASK 0x3
-void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu);
-void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val);
-struct kvm_pit *kvm_create_pit(struct kvm *kvm);
+int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps);
+int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps);
+int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps);
+int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps);
+int kvm_vm_ioctl_reinject(struct kvm *kvm, struct kvm_reinject_control *control);
+
+struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags);
void kvm_free_pit(struct kvm *kvm);
-void kvm_pit_reset(struct kvm_pit *pit);
+#endif /* CONFIG_KVM_IOAPIC */
#endif
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 1ccb50c74f18..2ac7f1678c46 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -3,6 +3,7 @@
*
* Copyright (c) 2003-2004 Fabrice Bellard
* Copyright (c) 2007 Intel Corporation
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
@@ -25,11 +26,22 @@
* Yaozu (Eddie) Dong <Eddie.dong@intel.com>
* Port from Qemu.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/mm.h>
+#include <linux/slab.h>
#include <linux/bitops.h>
+
+#include "ioapic.h"
#include "irq.h"
#include <linux/kvm_host.h>
+#include "trace.h"
+
+#define pr_pic_unimpl(fmt, ...) \
+ pr_err_ratelimited("pic: " fmt, ## __VA_ARGS__)
+
+static void pic_irq_request(struct kvm *kvm, int level);
static void pic_lock(struct kvm_pic *s)
__acquires(&s->lock)
@@ -40,40 +52,39 @@ static void pic_lock(struct kvm_pic *s)
static void pic_unlock(struct kvm_pic *s)
__releases(&s->lock)
{
- struct kvm *kvm = s->kvm;
- unsigned acks = s->pending_acks;
bool wakeup = s->wakeup_needed;
struct kvm_vcpu *vcpu;
+ unsigned long i;
- s->pending_acks = 0;
s->wakeup_needed = false;
spin_unlock(&s->lock);
- while (acks) {
- kvm_notify_acked_irq(kvm, SELECT_PIC(__ffs(acks)),
- __ffs(acks));
- acks &= acks - 1;
- }
-
if (wakeup) {
- vcpu = s->kvm->vcpus[0];
- if (vcpu)
- kvm_vcpu_kick(vcpu);
+ kvm_for_each_vcpu(i, vcpu, s->kvm) {
+ if (kvm_apic_accept_pic_intr(vcpu)) {
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_vcpu_kick(vcpu);
+ return;
+ }
+ }
}
}
static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
{
s->isr &= ~(1 << irq);
- s->isr_ack |= (1 << irq);
-}
-
-void kvm_pic_clear_isr_ack(struct kvm *kvm)
-{
- struct kvm_pic *s = pic_irqchip(kvm);
- s->pics[0].isr_ack = 0xff;
- s->pics[1].isr_ack = 0xff;
+ if (s != &s->pics_state->pics[0])
+ irq += 8;
+ /*
+	 * We drop the lock while calling ack notifiers because ack-notifier
+	 * callbacks for assigned devices call back into the PIC recursively.
+	 * Other interrupts may be delivered to the PIC while the lock is
+	 * dropped, but that is safe since PIC state has already been updated
+	 * at this stage.
+ */
+ pic_unlock(s->pics_state);
+ kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq);
+ pic_lock(s->pics_state);
}
/*
@@ -166,10 +177,7 @@ static void pic_update_irq(struct kvm_pic *s)
pic_set_irq1(&s->pics[0], 2, 0);
}
irq = pic_get_irq(&s->pics[0]);
- if (irq >= 0)
- s->irq_request(s->irq_request_opaque, 1);
- else
- s->irq_request(s->irq_request_opaque, 0);
+ pic_irq_request(s->kvm, irq >= 0);
}
void kvm_pic_update_irq(struct kvm_pic *s)
@@ -179,16 +187,22 @@ void kvm_pic_update_irq(struct kvm_pic *s)
pic_unlock(s);
}
-int kvm_pic_set_irq(void *opaque, int irq, int level)
+int kvm_pic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status)
{
- struct kvm_pic *s = opaque;
- int ret = -1;
+ struct kvm_pic *s = kvm->arch.vpic;
+ int irq = e->irqchip.pin;
+ int ret, irq_level;
+
+ BUG_ON(irq < 0 || irq >= PIC_NUM_PINS);
pic_lock(s);
- if (irq >= 0 && irq < PIC_NUM_PINS) {
- ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
- pic_update_irq(s);
- }
+ irq_level = __kvm_irq_line_state(&s->irq_states[irq],
+ irq_source_id, level);
+ ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level);
+ pic_update_irq(s);
+ trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
+ s->pics[irq >> 3].imr, ret == 0);
pic_unlock(s);
return ret;
@@ -200,22 +214,26 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
static inline void pic_intack(struct kvm_kpic_state *s, int irq)
{
s->isr |= 1 << irq;
- if (s->auto_eoi) {
- if (s->rotate_on_auto_eoi)
- s->priority_add = (irq + 1) & 7;
- pic_clear_isr(s, irq);
- }
/*
* We don't clear a level sensitive interrupt here
*/
if (!(s->elcr & (1 << irq)))
s->irr &= ~(1 << irq);
+
+ if (s->auto_eoi) {
+ if (s->rotate_on_auto_eoi)
+ s->priority_add = (irq + 1) & 7;
+ pic_clear_isr(s, irq);
+ }
+
}
int kvm_pic_read_irq(struct kvm *kvm)
{
int irq, irq2, intno;
- struct kvm_pic *s = pic_irqchip(kvm);
+ struct kvm_pic *s = kvm->arch.vpic;
+
+ s->output = 0;
pic_lock(s);
irq = pic_get_irq(&s->pics[0]);
@@ -231,7 +249,6 @@ int kvm_pic_read_irq(struct kvm *kvm)
*/
irq2 = 7;
intno = s->pics[1].irq_base + irq2;
- irq = irq2 + 8;
} else
intno = s->pics[0].irq_base + irq;
} else {
@@ -243,44 +260,43 @@ int kvm_pic_read_irq(struct kvm *kvm)
}
pic_update_irq(s);
pic_unlock(s);
- kvm_notify_acked_irq(kvm, SELECT_PIC(irq), irq);
return intno;
}
-void kvm_pic_reset(struct kvm_kpic_state *s)
+static void kvm_pic_reset(struct kvm_kpic_state *s)
{
- int irq, irqbase, n;
- struct kvm *kvm = s->pics_state->irq_request_opaque;
- struct kvm_vcpu *vcpu0 = kvm->vcpus[0];
-
- if (s == &s->pics_state->pics[0])
- irqbase = 0;
- else
- irqbase = 8;
+ int irq;
+ unsigned long i;
+ struct kvm_vcpu *vcpu;
+ u8 edge_irr = s->irr & ~s->elcr;
+ bool found = false;
- for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
- if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
- if (s->irr & (1 << irq) || s->isr & (1 << irq)) {
- n = irq + irqbase;
- s->pics_state->pending_acks |= 1 << n;
- }
- }
s->last_irr = 0;
- s->irr = 0;
+ s->irr &= s->elcr;
s->imr = 0;
- s->isr = 0;
- s->isr_ack = 0xff;
s->priority_add = 0;
- s->irq_base = 0;
- s->read_reg_select = 0;
- s->poll = 0;
s->special_mask = 0;
- s->init_state = 0;
- s->auto_eoi = 0;
- s->rotate_on_auto_eoi = 0;
- s->special_fully_nested_mode = 0;
- s->init4 = 0;
+ s->read_reg_select = 0;
+ if (!s->init4) {
+ s->special_fully_nested_mode = 0;
+ s->auto_eoi = 0;
+ }
+ s->init_state = 1;
+
+ kvm_for_each_vcpu(i, vcpu, s->pics_state->kvm)
+ if (kvm_apic_accept_pic_intr(vcpu)) {
+ found = true;
+ break;
+ }
+
+
+ if (!found)
+ return;
+
+ for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
+ if (edge_irr & (1 << irq))
+ pic_clear_isr(s, irq);
}
static void pic_ioport_write(void *opaque, u32 addr, u32 val)
@@ -291,19 +307,13 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
addr &= 1;
if (addr == 0) {
if (val & 0x10) {
- kvm_pic_reset(s); /* init */
- /*
- * deassert a pending interrupt
- */
- s->pics_state->irq_request(s->pics_state->
- irq_request_opaque, 0);
- s->init_state = 1;
s->init4 = val & 1;
if (val & 0x02)
- printk(KERN_ERR "single mode not supported");
+ pr_pic_unimpl("single mode not supported");
if (val & 0x08)
- printk(KERN_ERR
- "level sensitive irq not supported");
+ pr_pic_unimpl(
+ "level sensitive irq not supported");
+ kvm_pic_reset(s);
} else if (val & 0x08) {
if (val & 0x04)
s->poll = 1;
@@ -323,9 +333,9 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
priority = get_priority(s, s->isr);
if (priority != 8) {
irq = (priority + s->priority_add) & 7;
- pic_clear_isr(s, irq);
if (cmd == 5)
s->priority_add = (irq + 1) & 7;
+ pic_clear_isr(s, irq);
pic_update_irq(s->pics_state);
}
break;
@@ -350,10 +360,20 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
}
} else
switch (s->init_state) {
- case 0: /* normal mode */
+ case 0: { /* normal mode */
+ u8 imr_diff = s->imr ^ val,
+ off = (s == &s->pics_state->pics[0]) ? 0 : 8;
s->imr = val;
+ for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
+ if (imr_diff & (1 << irq))
+ kvm_fire_mask_notifiers(
+ s->pics_state->kvm,
+ SELECT_PIC(irq + off),
+ irq + off,
+ !!(s->imr & (1 << irq)));
pic_update_irq(s->pics_state);
break;
+ }
case 1:
s->irq_base = val & 0xf8;
s->init_state = 2;
@@ -386,7 +406,10 @@ static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1)
pic_clear_isr(s, ret);
if (addr1 >> 7 || ret != 2)
pic_update_irq(s->pics_state);
+		/* Bit 7 is 1, meaning an interrupt is pending */
+ ret |= 0x80;
} else {
+		/* Bit 7 is 0, meaning no interrupt is pending */
ret = 0x07;
pic_update_irq(s->pics_state);
}
@@ -394,19 +417,16 @@ static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1)
return ret;
}
-static u32 pic_ioport_read(void *opaque, u32 addr1)
+static u32 pic_ioport_read(void *opaque, u32 addr)
{
struct kvm_kpic_state *s = opaque;
- unsigned int addr;
int ret;
- addr = addr1;
- addr &= 1;
if (s->poll) {
- ret = pic_poll_read(s, addr1);
+ ret = pic_poll_read(s, addr);
s->poll = 0;
} else
- if (addr == 0)
+ if ((addr & 1) == 0)
if (s->read_reg_select)
ret = s->isr;
else
@@ -416,128 +436,220 @@ static u32 pic_ioport_read(void *opaque, u32 addr1)
return ret;
}
-static void elcr_ioport_write(void *opaque, u32 addr, u32 val)
+static void elcr_ioport_write(void *opaque, u32 val)
{
struct kvm_kpic_state *s = opaque;
s->elcr = val & s->elcr_mask;
}
-static u32 elcr_ioport_read(void *opaque, u32 addr1)
+static u32 elcr_ioport_read(void *opaque)
{
struct kvm_kpic_state *s = opaque;
return s->elcr;
}
-static int picdev_in_range(struct kvm_io_device *this, gpa_t addr,
- int len, int is_write)
-{
- switch (addr) {
- case 0x20:
- case 0x21:
- case 0xa0:
- case 0xa1:
- case 0x4d0:
- case 0x4d1:
- return 1;
- default:
- return 0;
- }
-}
-
-static void picdev_write(struct kvm_io_device *this,
+static int picdev_write(struct kvm_pic *s,
gpa_t addr, int len, const void *val)
{
- struct kvm_pic *s = this->private;
unsigned char data = *(unsigned char *)val;
if (len != 1) {
- if (printk_ratelimit())
- printk(KERN_ERR "PIC: non byte write\n");
- return;
+ pr_pic_unimpl("non byte write\n");
+ return 0;
}
- pic_lock(s);
switch (addr) {
case 0x20:
case 0x21:
+ pic_lock(s);
+ pic_ioport_write(&s->pics[0], addr, data);
+ pic_unlock(s);
+ break;
case 0xa0:
case 0xa1:
- pic_ioport_write(&s->pics[addr >> 7], addr, data);
+ pic_lock(s);
+ pic_ioport_write(&s->pics[1], addr, data);
+ pic_unlock(s);
break;
case 0x4d0:
case 0x4d1:
- elcr_ioport_write(&s->pics[addr & 1], addr, data);
+ pic_lock(s);
+ elcr_ioport_write(&s->pics[addr & 1], data);
+ pic_unlock(s);
break;
+ default:
+ return -EOPNOTSUPP;
}
- pic_unlock(s);
+ return 0;
}
-static void picdev_read(struct kvm_io_device *this,
- gpa_t addr, int len, void *val)
+static int picdev_read(struct kvm_pic *s,
+ gpa_t addr, int len, void *val)
{
- struct kvm_pic *s = this->private;
- unsigned char data = 0;
+ unsigned char *data = (unsigned char *)val;
if (len != 1) {
- if (printk_ratelimit())
- printk(KERN_ERR "PIC: non byte read\n");
- return;
+ memset(val, 0, len);
+ pr_pic_unimpl("non byte read\n");
+ return 0;
}
- pic_lock(s);
switch (addr) {
case 0x20:
case 0x21:
case 0xa0:
case 0xa1:
- data = pic_ioport_read(&s->pics[addr >> 7], addr);
+ pic_lock(s);
+ *data = pic_ioport_read(&s->pics[addr >> 7], addr);
+ pic_unlock(s);
break;
case 0x4d0:
case 0x4d1:
- data = elcr_ioport_read(&s->pics[addr & 1], addr);
+ pic_lock(s);
+ *data = elcr_ioport_read(&s->pics[addr & 1]);
+ pic_unlock(s);
break;
+ default:
+ return -EOPNOTSUPP;
}
- *(unsigned char *)val = data;
- pic_unlock(s);
+ return 0;
+}
+
+static int picdev_master_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, int len, const void *val)
+{
+ return picdev_write(container_of(dev, struct kvm_pic, dev_master),
+ addr, len, val);
+}
+
+static int picdev_master_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, int len, void *val)
+{
+ return picdev_read(container_of(dev, struct kvm_pic, dev_master),
+ addr, len, val);
+}
+
+static int picdev_slave_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, int len, const void *val)
+{
+ return picdev_write(container_of(dev, struct kvm_pic, dev_slave),
+ addr, len, val);
+}
+
+static int picdev_slave_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, int len, void *val)
+{
+ return picdev_read(container_of(dev, struct kvm_pic, dev_slave),
+ addr, len, val);
+}
+
+static int picdev_elcr_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, int len, const void *val)
+{
+ return picdev_write(container_of(dev, struct kvm_pic, dev_elcr),
+ addr, len, val);
+}
+
+static int picdev_elcr_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, int len, void *val)
+{
+ return picdev_read(container_of(dev, struct kvm_pic, dev_elcr),
+ addr, len, val);
}
/*
* callback when PIC0 irq status changed
*/
-static void pic_irq_request(void *opaque, int level)
+static void pic_irq_request(struct kvm *kvm, int level)
{
- struct kvm *kvm = opaque;
- struct kvm_vcpu *vcpu = kvm->vcpus[0];
- struct kvm_pic *s = pic_irqchip(kvm);
- int irq = pic_get_irq(&s->pics[0]);
+ struct kvm_pic *s = kvm->arch.vpic;
- s->output = level;
- if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
- s->pics[0].isr_ack &= ~(1 << irq);
+ if (!s->output && level)
s->wakeup_needed = true;
- }
+ s->output = level;
}
-struct kvm_pic *kvm_create_pic(struct kvm *kvm)
+static const struct kvm_io_device_ops picdev_master_ops = {
+ .read = picdev_master_read,
+ .write = picdev_master_write,
+};
+
+static const struct kvm_io_device_ops picdev_slave_ops = {
+ .read = picdev_slave_read,
+ .write = picdev_slave_write,
+};
+
+static const struct kvm_io_device_ops picdev_elcr_ops = {
+ .read = picdev_elcr_read,
+ .write = picdev_elcr_write,
+};
+
+int kvm_pic_init(struct kvm *kvm)
{
struct kvm_pic *s;
- s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
+ int ret;
+
+ s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL_ACCOUNT);
if (!s)
- return NULL;
+ return -ENOMEM;
spin_lock_init(&s->lock);
s->kvm = kvm;
s->pics[0].elcr_mask = 0xf8;
s->pics[1].elcr_mask = 0xde;
- s->irq_request = pic_irq_request;
- s->irq_request_opaque = kvm;
s->pics[0].pics_state = s;
s->pics[1].pics_state = s;
/*
* Initialize PIO device
*/
- s->dev.read = picdev_read;
- s->dev.write = picdev_write;
- s->dev.in_range = picdev_in_range;
- s->dev.private = s;
- kvm_io_bus_register_dev(&kvm->pio_bus, &s->dev);
- return s;
+ kvm_iodevice_init(&s->dev_master, &picdev_master_ops);
+ kvm_iodevice_init(&s->dev_slave, &picdev_slave_ops);
+ kvm_iodevice_init(&s->dev_elcr, &picdev_elcr_ops);
+ mutex_lock(&kvm->slots_lock);
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x20, 2,
+ &s->dev_master);
+ if (ret < 0)
+ goto fail_unlock;
+
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0xa0, 2, &s->dev_slave);
+ if (ret < 0)
+ goto fail_unreg_2;
+
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x4d0, 2, &s->dev_elcr);
+ if (ret < 0)
+ goto fail_unreg_1;
+
+ mutex_unlock(&kvm->slots_lock);
+
+ kvm->arch.vpic = s;
+
+ return 0;
+
+fail_unreg_1:
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_slave);
+
+fail_unreg_2:
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_master);
+
+fail_unlock:
+ mutex_unlock(&kvm->slots_lock);
+
+ kfree(s);
+
+ return ret;
+}
+
+void kvm_pic_destroy(struct kvm *kvm)
+{
+ struct kvm_pic *vpic = kvm->arch.vpic;
+
+ if (!vpic)
+ return;
+
+ mutex_lock(&kvm->slots_lock);
+ kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_master);
+ kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_slave);
+ kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_elcr);
+ mutex_unlock(&kvm->slots_lock);
+
+ kvm->arch.vpic = NULL;
+ kfree(vpic);
}
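Editor's note on the hunk above: instead of one catch-all PIO device with an in_range() callback, the reworked PIC registers three fixed-length devices on the PIO bus (0x20-0x21 for the master, 0xa0-0xa1 for the slave, 0x4d0-0x4d1 for the ELCR) and lets the bus do the range matching. A minimal user-space sketch of that dispatch model; the struct and names here are illustrative, not the kernel's:

#include <stdio.h>
#include <stdint.h>

/* Illustrative stand-in for a registered PIO device: base port + length. */
struct pio_dev {
	const char *name;
	uint16_t base;
	uint16_t len;
};

/* The three ranges registered by kvm_pic_init() in the patch above. */
static const struct pio_dev pic_devs[] = {
	{ "master", 0x020, 2 },
	{ "slave",  0x0a0, 2 },
	{ "elcr",   0x4d0, 2 },
};

/* Bus-side lookup: the device itself no longer implements in_range(). */
static const struct pio_dev *pio_bus_find(uint16_t port)
{
	for (size_t i = 0; i < sizeof(pic_devs) / sizeof(pic_devs[0]); i++)
		if (port >= pic_devs[i].base &&
		    port < pic_devs[i].base + pic_devs[i].len)
			return &pic_devs[i];
	return NULL;	/* no device claims the port */
}

int main(void)
{
	uint16_t ports[] = { 0x21, 0xa1, 0x4d0, 0x80 };

	for (size_t i = 0; i < 4; i++) {
		const struct pio_dev *d = pio_bus_find(ports[i]);
		printf("port 0x%03x -> %s\n", ports[i], d ? d->name : "no device");
	}
	return 0;
}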
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
new file mode 100644
index 000000000000..2c2783296aed
--- /dev/null
+++ b/arch/x86/kvm/ioapic.c
@@ -0,0 +1,789 @@
+// SPDX-License-Identifier: LGPL-2.1-or-later
+/*
+ * Copyright (C) 2001 MandrakeSoft S.A.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * MandrakeSoft S.A.
+ * 43, rue d'Aboukir
+ * 75002 Paris - France
+ * http://www.linux-mandrake.com/
+ * http://www.mandrakesoft.com/
+ *
+ * Yunhong Jiang <yunhong.jiang@intel.com>
+ * Yaozu (Eddie) Dong <eddie.dong@intel.com>
+ * Based on Xen 3.1 code.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/smp.h>
+#include <linux/hrtimer.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/nospec.h>
+#include <asm/processor.h>
+#include <asm/page.h>
+#include <asm/current.h>
+
+#include "ioapic.h"
+#include "lapic.h"
+#include "irq.h"
+#include "trace.h"
+
+static int ioapic_service(struct kvm_ioapic *vioapic, int irq,
+ bool line_status);
+
+static void kvm_ioapic_update_eoi_one(struct kvm_vcpu *vcpu,
+ struct kvm_ioapic *ioapic,
+ int trigger_mode,
+ int pin);
+
+static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic)
+{
+ unsigned long result = 0;
+
+ switch (ioapic->ioregsel) {
+ case IOAPIC_REG_VERSION:
+ result = ((((IOAPIC_NUM_PINS - 1) & 0xff) << 16)
+ | (IOAPIC_VERSION_ID & 0xff));
+ break;
+
+ case IOAPIC_REG_APIC_ID:
+ case IOAPIC_REG_ARB_ID:
+ result = ((ioapic->id & 0xf) << 24);
+ break;
+
+ default:
+ {
+ u32 redir_index = (ioapic->ioregsel - 0x10) >> 1;
+ u64 redir_content = ~0ULL;
+
+ if (redir_index < IOAPIC_NUM_PINS) {
+ u32 index = array_index_nospec(
+ redir_index, IOAPIC_NUM_PINS);
+
+ redir_content = ioapic->redirtbl[index].bits;
+ }
+
+ result = (ioapic->ioregsel & 0x1) ?
+ (redir_content >> 32) & 0xffffffff :
+ redir_content & 0xffffffff;
+ break;
+ }
+ }
+
+ return result;
+}
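+
For readers unfamiliar with the indirection used by ioapic_read_indirect(): software writes a register index to IOREGSEL (offset 0x00) and then accesses that register through IOWIN (offset 0x10); redirection-table entry N occupies indices 0x10 + 2*N (low 32 bits) and 0x10 + 2*N + 1 (high 32 bits). A small stand-alone sketch of the same index math, purely for illustration:

#include <stdio.h>
#include <stdint.h>

/* Decode an IOREGSEL value the way ioapic_read_indirect() does above. */
static void decode_ioregsel(uint32_t ioregsel)
{
	if (ioregsel < 0x10) {
		printf("ioregsel 0x%02x: ID/version/arbitration register\n",
		       ioregsel);
		return;
	}

	uint32_t pin  = (ioregsel - 0x10) >> 1;	/* redirection entry index */
	int      high = ioregsel & 0x1;		/* 0 = low dword, 1 = high dword */

	printf("ioregsel 0x%02x: redirtbl[%u], %s dword\n",
	       ioregsel, pin, high ? "high" : "low");
}

int main(void)
{
	decode_ioregsel(0x01);	/* IOAPIC_REG_VERSION */
	decode_ioregsel(0x10);	/* pin 0, low dword   */
	decode_ioregsel(0x11);	/* pin 0, high dword  */
	decode_ioregsel(0x3e);	/* pin 23, low dword  */
	return 0;
}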
+
+static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic)
+{
+ ioapic->rtc_status.pending_eoi = 0;
+ bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_IDS);
+}
+
+static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic);
+
+static void rtc_status_pending_eoi_check_valid(struct kvm_ioapic *ioapic)
+{
+ if (WARN_ON(ioapic->rtc_status.pending_eoi < 0))
+ kvm_rtc_eoi_tracking_restore_all(ioapic);
+}
+
+static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu)
+{
+ bool new_val, old_val;
+ struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
+ struct dest_map *dest_map = &ioapic->rtc_status.dest_map;
+ union kvm_ioapic_redirect_entry *e;
+
+ e = &ioapic->redirtbl[RTC_GSI];
+ if (!kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT,
+ e->fields.dest_id,
+ kvm_lapic_irq_dest_mode(!!e->fields.dest_mode)))
+ return;
+
+ new_val = kvm_apic_pending_eoi(vcpu, e->fields.vector);
+ old_val = test_bit(vcpu->vcpu_id, dest_map->map);
+
+ if (new_val == old_val)
+ return;
+
+ if (new_val) {
+ __set_bit(vcpu->vcpu_id, dest_map->map);
+ dest_map->vectors[vcpu->vcpu_id] = e->fields.vector;
+ ioapic->rtc_status.pending_eoi++;
+ } else {
+ __clear_bit(vcpu->vcpu_id, dest_map->map);
+ ioapic->rtc_status.pending_eoi--;
+ rtc_status_pending_eoi_check_valid(ioapic);
+ }
+}
+
+void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu)
+{
+ struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
+
+ spin_lock(&ioapic->lock);
+ __rtc_irq_eoi_tracking_restore_one(vcpu);
+ spin_unlock(&ioapic->lock);
+}
+
+static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic)
+{
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+
+ if (RTC_GSI >= IOAPIC_NUM_PINS)
+ return;
+
+ rtc_irq_eoi_tracking_reset(ioapic);
+ kvm_for_each_vcpu(i, vcpu, ioapic->kvm)
+ __rtc_irq_eoi_tracking_restore_one(vcpu);
+}
+
+static void rtc_irq_eoi(struct kvm_ioapic *ioapic, struct kvm_vcpu *vcpu,
+ int vector)
+{
+ struct dest_map *dest_map = &ioapic->rtc_status.dest_map;
+
+ /* RTC special handling */
+ if (test_bit(vcpu->vcpu_id, dest_map->map) &&
+ (vector == dest_map->vectors[vcpu->vcpu_id]) &&
+ (test_and_clear_bit(vcpu->vcpu_id,
+ ioapic->rtc_status.dest_map.map))) {
+ --ioapic->rtc_status.pending_eoi;
+ rtc_status_pending_eoi_check_valid(ioapic);
+ }
+}
+
+static bool rtc_irq_check_coalesced(struct kvm_ioapic *ioapic)
+{
+ if (ioapic->rtc_status.pending_eoi > 0)
+ return true; /* coalesced */
+
+ return false;
+}
+
+static void ioapic_lazy_update_eoi(struct kvm_ioapic *ioapic, int irq)
+{
+ unsigned long i;
+ struct kvm_vcpu *vcpu;
+ union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq];
+
+ kvm_for_each_vcpu(i, vcpu, ioapic->kvm) {
+ if (!kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT,
+ entry->fields.dest_id,
+ entry->fields.dest_mode) ||
+ kvm_apic_pending_eoi(vcpu, entry->fields.vector))
+ continue;
+
+ /*
+ * If no longer has pending EOI in LAPICs, update
+ * EOI for this vector.
+ */
+ rtc_irq_eoi(ioapic, vcpu, entry->fields.vector);
+ break;
+ }
+}
+
+static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq,
+ int irq_level, bool line_status)
+{
+ union kvm_ioapic_redirect_entry entry;
+ u32 mask = 1 << irq;
+ u32 old_irr;
+ int edge, ret;
+
+ entry = ioapic->redirtbl[irq];
+ edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
+
+ if (!irq_level) {
+ ioapic->irr &= ~mask;
+ ret = 1;
+ goto out;
+ }
+
+ /*
+	 * AMD SVM AVIC accelerates the EOI write if and only if the interrupt
+	 * is edge-triggered, in which case the in-kernel IOAPIC will not be
+	 * able to receive the EOI. In this case, do a lazy update of the
+	 * pending EOI when trying to set the IOAPIC irq.
+ */
+ if (edge && kvm_apicv_activated(ioapic->kvm))
+ ioapic_lazy_update_eoi(ioapic, irq);
+
+ /*
+ * Return 0 for coalesced interrupts; for edge-triggered interrupts,
+ * this only happens if a previous edge has not been delivered due
+ * to masking. For level interrupts, the remote_irr field tells
+ * us if the interrupt is waiting for an EOI.
+ *
+ * RTC is special: it is edge-triggered, but userspace likes to know
+ * if it has been already ack-ed via EOI because coalesced RTC
+ * interrupts lead to time drift in Windows guests. So we track
+ * EOI manually for the RTC interrupt.
+ */
+ if (irq == RTC_GSI && line_status &&
+ rtc_irq_check_coalesced(ioapic)) {
+ ret = 0;
+ goto out;
+ }
+
+ old_irr = ioapic->irr;
+ ioapic->irr |= mask;
+ if (edge) {
+ ioapic->irr_delivered &= ~mask;
+ if (old_irr == ioapic->irr) {
+ ret = 0;
+ goto out;
+ }
+ }
+
+ ret = ioapic_service(ioapic, irq, line_status);
+
+out:
+ trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
+ return ret;
+}
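+
As the comment block above explains, an assertion is not re-delivered when the edge is already latched in irr or when a level-triggered pin is still waiting for an EOI (remote_irr set); userspace sees such assertions as coalesced. Below is a reduced stand-alone model of that decision; the struct and field names mirror the code but are illustrative only:

#include <stdio.h>
#include <stdbool.h>

struct pin_model {
	bool edge;		/* edge- vs level-triggered */
	bool remote_irr;	/* level IRQ still waiting for EOI */
	bool irr;		/* request already latched */
};

/* Returns true if a new assertion of this pin would be treated as coalesced. */
static bool would_coalesce(const struct pin_model *p)
{
	if (p->edge)
		return p->irr;		/* previous edge not yet delivered */
	return p->remote_irr;		/* level IRQ not yet EOI'd */
}

int main(void)
{
	struct pin_model edge_idle  = { .edge = true,  .irr = false };
	struct pin_model edge_stuck = { .edge = true,  .irr = true  };
	struct pin_model level_eoi  = { .edge = false, .remote_irr = true };

	printf("idle edge pin:     %s\n", would_coalesce(&edge_idle)  ? "coalesced" : "delivered");
	printf("latched edge pin:  %s\n", would_coalesce(&edge_stuck) ? "coalesced" : "delivered");
	printf("pending level pin: %s\n", would_coalesce(&level_eoi)  ? "coalesced" : "delivered");
	return 0;
}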
+
+static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr)
+{
+ u32 idx;
+
+ rtc_irq_eoi_tracking_reset(ioapic);
+ for_each_set_bit(idx, &irr, IOAPIC_NUM_PINS)
+ ioapic_set_irq(ioapic, idx, 1, true);
+
+ kvm_rtc_eoi_tracking_restore_all(ioapic);
+}
+
+
+void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors)
+{
+ struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
+ struct dest_map *dest_map = &ioapic->rtc_status.dest_map;
+ union kvm_ioapic_redirect_entry *e;
+ int index;
+
+ spin_lock(&ioapic->lock);
+
+ /* Make sure we see any missing RTC EOI */
+ if (test_bit(vcpu->vcpu_id, dest_map->map))
+ __set_bit(dest_map->vectors[vcpu->vcpu_id],
+ ioapic_handled_vectors);
+
+ for (index = 0; index < IOAPIC_NUM_PINS; index++) {
+ e = &ioapic->redirtbl[index];
+ if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG ||
+ kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index) ||
+ index == RTC_GSI) {
+ u16 dm = kvm_lapic_irq_dest_mode(!!e->fields.dest_mode);
+
+ kvm_scan_ioapic_irq(vcpu, e->fields.dest_id, dm,
+ e->fields.vector, ioapic_handled_vectors);
+ }
+ }
+ spin_unlock(&ioapic->lock);
+}
+
+void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm)
+{
+ if (!ioapic_in_kernel(kvm))
+ return;
+ kvm_make_scan_ioapic_request(kvm);
+}
+
+void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+
+ mutex_lock(&kvm->irq_lock);
+ kimn->irq = irq;
+ hlist_add_head_rcu(&kimn->link, &ioapic->mask_notifier_list);
+ mutex_unlock(&kvm->irq_lock);
+}
+
+void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn)
+{
+ mutex_lock(&kvm->irq_lock);
+ hlist_del_rcu(&kimn->link);
+ mutex_unlock(&kvm->irq_lock);
+ synchronize_srcu(&kvm->irq_srcu);
+}
+
+void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
+ bool mask)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+ struct kvm_irq_mask_notifier *kimn;
+ int idx, gsi;
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
+ if (gsi != -1)
+ hlist_for_each_entry_rcu(kimn, &ioapic->mask_notifier_list, link)
+ if (kimn->irq == gsi)
+ kimn->func(kimn, mask);
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+}
+
+static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
+{
+ unsigned index;
+ bool mask_before, mask_after;
+ union kvm_ioapic_redirect_entry *e;
+ int old_remote_irr, old_delivery_status, old_dest_id, old_dest_mode;
+ DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS);
+
+ switch (ioapic->ioregsel) {
+ case IOAPIC_REG_VERSION:
+ /* Writes are ignored. */
+ break;
+
+ case IOAPIC_REG_APIC_ID:
+ ioapic->id = (val >> 24) & 0xf;
+ break;
+
+ case IOAPIC_REG_ARB_ID:
+ break;
+
+ default:
+ index = (ioapic->ioregsel - 0x10) >> 1;
+
+ if (index >= IOAPIC_NUM_PINS)
+ return;
+ index = array_index_nospec(index, IOAPIC_NUM_PINS);
+ e = &ioapic->redirtbl[index];
+ mask_before = e->fields.mask;
+ /* Preserve read-only fields */
+ old_remote_irr = e->fields.remote_irr;
+ old_delivery_status = e->fields.delivery_status;
+ old_dest_id = e->fields.dest_id;
+ old_dest_mode = e->fields.dest_mode;
+ if (ioapic->ioregsel & 1) {
+ e->bits &= 0xffffffff;
+ e->bits |= (u64) val << 32;
+ } else {
+ e->bits &= ~0xffffffffULL;
+ e->bits |= (u32) val;
+ }
+ e->fields.remote_irr = old_remote_irr;
+ e->fields.delivery_status = old_delivery_status;
+
+ /*
+		 * Some OSes (Linux, Xen) assume that the Remote IRR bit will
+		 * be cleared by the IOAPIC hardware when the entry is configured
+ * as edge-triggered. This behavior is used to simulate an
+ * explicit EOI on IOAPICs that don't have the EOI register.
+ */
+ if (e->fields.trig_mode == IOAPIC_EDGE_TRIG)
+ e->fields.remote_irr = 0;
+
+ mask_after = e->fields.mask;
+ if (mask_before != mask_after)
+ kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after);
+ if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG &&
+ ioapic->irr & (1 << index) && !e->fields.mask && !e->fields.remote_irr) {
+ /*
+ * Pending status in irr may be outdated: the IRQ line may have
+ * already been deasserted by a device while the IRQ was masked.
+ * This occurs, for instance, if the interrupt is handled in a
+ * Linux guest as a oneshot interrupt (IRQF_ONESHOT). In this
+ * case the guest acknowledges the interrupt to the device in
+ * its threaded irq handler, i.e. after the EOI but before
+ * unmasking, so at the time of unmasking the IRQ line is
+ * already down but our pending irr bit is still set. In such
+ * cases, injecting this pending interrupt to the guest is
+ * buggy: the guest will receive an extra unwanted interrupt.
+ *
+ * So we need to check here if the IRQ is actually still pending.
+ * As we are generally not able to probe the IRQ line status
+ * directly, we do it through irqfd resampler. Namely, we clear
+ * the pending status and notify the resampler that this interrupt
+ * is done, without actually injecting it into the guest. If the
+ * IRQ line is actually already deasserted, we are done. If it is
+ * still asserted, a new interrupt will be shortly triggered
+ * through irqfd and injected into the guest.
+ *
+ * If, however, it's not possible to resample (no irqfd resampler
+ * registered for this irq), then unconditionally inject this
+ * pending interrupt into the guest, so the guest will not miss
+ * an interrupt, although may get an extra unwanted interrupt.
+ */
+ if (kvm_notify_irqfd_resampler(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index))
+ ioapic->irr &= ~(1 << index);
+ else
+ ioapic_service(ioapic, index, false);
+ }
+ if (e->fields.delivery_mode == APIC_DM_FIXED) {
+ struct kvm_lapic_irq irq;
+
+ irq.vector = e->fields.vector;
+ irq.delivery_mode = e->fields.delivery_mode << 8;
+ irq.dest_mode =
+ kvm_lapic_irq_dest_mode(!!e->fields.dest_mode);
+ irq.level = false;
+ irq.trig_mode = e->fields.trig_mode;
+ irq.shorthand = APIC_DEST_NOSHORT;
+ irq.dest_id = e->fields.dest_id;
+ irq.msi_redir_hint = false;
+ bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS);
+ kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq,
+ vcpu_bitmap);
+ if (old_dest_mode != e->fields.dest_mode ||
+ old_dest_id != e->fields.dest_id) {
+ /*
+ * Update vcpu_bitmap with vcpus specified in
+ * the previous request as well. This is done to
+ * keep ioapic_handled_vectors synchronized.
+ */
+ irq.dest_id = old_dest_id;
+ irq.dest_mode =
+ kvm_lapic_irq_dest_mode(
+ !!e->fields.dest_mode);
+ kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq,
+ vcpu_bitmap);
+ }
+ kvm_make_scan_ioapic_request_mask(ioapic->kvm,
+ vcpu_bitmap);
+ } else {
+ kvm_make_scan_ioapic_request(ioapic->kvm);
+ }
+ break;
+ }
+}
+
+static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
+{
+ union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq];
+ struct kvm_lapic_irq irqe;
+ int ret;
+
+ if (entry->fields.mask ||
+ (entry->fields.trig_mode == IOAPIC_LEVEL_TRIG &&
+ entry->fields.remote_irr))
+ return -1;
+
+ irqe.dest_id = entry->fields.dest_id;
+ irqe.vector = entry->fields.vector;
+ irqe.dest_mode = kvm_lapic_irq_dest_mode(!!entry->fields.dest_mode);
+ irqe.trig_mode = entry->fields.trig_mode;
+ irqe.delivery_mode = entry->fields.delivery_mode << 8;
+ irqe.level = 1;
+ irqe.shorthand = APIC_DEST_NOSHORT;
+ irqe.msi_redir_hint = false;
+
+ if (irqe.trig_mode == IOAPIC_EDGE_TRIG)
+ ioapic->irr_delivered |= 1 << irq;
+
+ if (irq == RTC_GSI && line_status) {
+ /*
+ * pending_eoi cannot ever become negative (see
+ * rtc_status_pending_eoi_check_valid) and the caller
+		 * ensures that it is only called if it is >= zero, namely
+		 * if rtc_irq_check_coalesced returns false.
+ */
+ BUG_ON(ioapic->rtc_status.pending_eoi != 0);
+ ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe,
+ &ioapic->rtc_status.dest_map);
+ ioapic->rtc_status.pending_eoi = (ret < 0 ? 0 : ret);
+ } else
+ ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, NULL);
+
+ if (ret && irqe.trig_mode == IOAPIC_LEVEL_TRIG)
+ entry->fields.remote_irr = 1;
+
+ return ret;
+}
+
+int kvm_ioapic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+ int irq = e->irqchip.pin;
+ int ret, irq_level;
+
+ BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS);
+
+ spin_lock(&ioapic->lock);
+ irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq],
+ irq_source_id, level);
+ ret = ioapic_set_irq(ioapic, irq, irq_level, line_status);
+
+ spin_unlock(&ioapic->lock);
+
+ return ret;
+}
+
+static void kvm_ioapic_eoi_inject_work(struct work_struct *work)
+{
+ int i;
+ struct kvm_ioapic *ioapic = container_of(work, struct kvm_ioapic,
+ eoi_inject.work);
+ spin_lock(&ioapic->lock);
+ for (i = 0; i < IOAPIC_NUM_PINS; i++) {
+ union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
+
+ if (ent->fields.trig_mode != IOAPIC_LEVEL_TRIG)
+ continue;
+
+ if (ioapic->irr & (1 << i) && !ent->fields.remote_irr)
+ ioapic_service(ioapic, i, false);
+ }
+ spin_unlock(&ioapic->lock);
+}
+
+#define IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT 10000
+static void kvm_ioapic_update_eoi_one(struct kvm_vcpu *vcpu,
+ struct kvm_ioapic *ioapic,
+ int trigger_mode,
+ int pin)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[pin];
+
+ /*
+	 * We drop the lock while calling the ack notifiers because ack
+	 * notifier callbacks for assigned devices call back into the IOAPIC
+	 * recursively. Since remote_irr is cleared only after the notifiers
+	 * are called, if the same vector is delivered while the lock is
+	 * dropped, it will be put into irr and delivered after the ack
+	 * notifier returns.
+ */
+ spin_unlock(&ioapic->lock);
+ kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, pin);
+ spin_lock(&ioapic->lock);
+
+ if (trigger_mode != IOAPIC_LEVEL_TRIG ||
+ kvm_lapic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)
+ return;
+
+ ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
+ ent->fields.remote_irr = 0;
+ if (!ent->fields.mask && (ioapic->irr & (1 << pin))) {
+ ++ioapic->irq_eoi[pin];
+ if (ioapic->irq_eoi[pin] == IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT) {
+ /*
+ * Real hardware does not deliver the interrupt
+ * immediately during eoi broadcast, and this
+ * lets a buggy guest make slow progress
+ * even if it does not correctly handle a
+ * level-triggered interrupt. Emulate this
+ * behavior if we detect an interrupt storm.
+ */
+ schedule_delayed_work(&ioapic->eoi_inject, HZ / 100);
+ ioapic->irq_eoi[pin] = 0;
+ trace_kvm_ioapic_delayed_eoi_inj(ent->bits);
+ } else {
+ ioapic_service(ioapic, pin, false);
+ }
+ } else {
+ ioapic->irq_eoi[pin] = 0;
+ }
+}
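+
The counter logic above throttles a guest that keeps EOI-ing a level-triggered interrupt without deasserting the line: after IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT back-to-back re-injections, the next injection is deferred by roughly 10 ms (HZ / 100) of delayed work. A tiny stand-alone model of the throttle decision, illustrative only:

#include <stdio.h>

#define SUCCESSIVE_IRQ_MAX_COUNT 10000

/* Per-pin counter of immediate re-injections after EOI, as in irq_eoi[] above. */
static unsigned int irq_eoi;

/* Returns 1 if the re-injection should be deferred, 0 if it can happen now. */
static int eoi_reinject_should_defer(void)
{
	if (++irq_eoi == SUCCESSIVE_IRQ_MAX_COUNT) {
		irq_eoi = 0;		/* reset and back off for ~10 ms */
		return 1;
	}
	return 0;
}

int main(void)
{
	unsigned long deferred = 0, immediate = 0;

	/* Simulate a storm of 25000 EOIs with the line still asserted. */
	for (int i = 0; i < 25000; i++)
		eoi_reinject_should_defer() ? deferred++ : immediate++;

	printf("immediate re-injections: %lu, deferred: %lu\n", immediate, deferred);
	return 0;
}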
+
+void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, int trigger_mode)
+{
+ int i;
+ struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
+
+ spin_lock(&ioapic->lock);
+ rtc_irq_eoi(ioapic, vcpu, vector);
+ for (i = 0; i < IOAPIC_NUM_PINS; i++) {
+ union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
+
+ if (ent->fields.vector != vector)
+ continue;
+ kvm_ioapic_update_eoi_one(vcpu, ioapic, trigger_mode, i);
+ }
+ spin_unlock(&ioapic->lock);
+}
+
+static inline struct kvm_ioapic *to_ioapic(struct kvm_io_device *dev)
+{
+ return container_of(dev, struct kvm_ioapic, dev);
+}
+
+static inline int ioapic_in_range(struct kvm_ioapic *ioapic, gpa_t addr)
+{
+ return ((addr >= ioapic->base_address &&
+ (addr < ioapic->base_address + IOAPIC_MEM_LENGTH)));
+}
+
+static int ioapic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
+ gpa_t addr, int len, void *val)
+{
+ struct kvm_ioapic *ioapic = to_ioapic(this);
+ u32 result;
+ if (!ioapic_in_range(ioapic, addr))
+ return -EOPNOTSUPP;
+
+ ASSERT(!(addr & 0xf)); /* check alignment */
+
+ addr &= 0xff;
+ spin_lock(&ioapic->lock);
+ switch (addr) {
+ case IOAPIC_REG_SELECT:
+ result = ioapic->ioregsel;
+ break;
+
+ case IOAPIC_REG_WINDOW:
+ result = ioapic_read_indirect(ioapic);
+ break;
+
+ default:
+ result = 0;
+ break;
+ }
+ spin_unlock(&ioapic->lock);
+
+ switch (len) {
+ case 8:
+ *(u64 *) val = result;
+ break;
+ case 1:
+ case 2:
+ case 4:
+ memcpy(val, (char *)&result, len);
+ break;
+ default:
+ printk(KERN_WARNING "ioapic: wrong length %d\n", len);
+ }
+ return 0;
+}
+
+static int ioapic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
+ gpa_t addr, int len, const void *val)
+{
+ struct kvm_ioapic *ioapic = to_ioapic(this);
+ u32 data;
+ if (!ioapic_in_range(ioapic, addr))
+ return -EOPNOTSUPP;
+
+ ASSERT(!(addr & 0xf)); /* check alignment */
+
+ switch (len) {
+ case 8:
+ case 4:
+ data = *(u32 *) val;
+ break;
+ case 2:
+ data = *(u16 *) val;
+ break;
+ case 1:
+ data = *(u8 *) val;
+ break;
+ default:
+ printk(KERN_WARNING "ioapic: Unsupported size %d\n", len);
+ return 0;
+ }
+
+ addr &= 0xff;
+ spin_lock(&ioapic->lock);
+ switch (addr) {
+ case IOAPIC_REG_SELECT:
+ ioapic->ioregsel = data & 0xFF; /* 8-bit register */
+ break;
+
+ case IOAPIC_REG_WINDOW:
+ ioapic_write_indirect(ioapic, data);
+ break;
+
+ default:
+ break;
+ }
+ spin_unlock(&ioapic->lock);
+ return 0;
+}
+
+static void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
+{
+ int i;
+
+ cancel_delayed_work_sync(&ioapic->eoi_inject);
+ for (i = 0; i < IOAPIC_NUM_PINS; i++)
+ ioapic->redirtbl[i].fields.mask = 1;
+ ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
+ ioapic->ioregsel = 0;
+ ioapic->irr = 0;
+ ioapic->irr_delivered = 0;
+ ioapic->id = 0;
+ memset(ioapic->irq_eoi, 0x00, sizeof(ioapic->irq_eoi));
+ rtc_irq_eoi_tracking_reset(ioapic);
+}
+
+static const struct kvm_io_device_ops ioapic_mmio_ops = {
+ .read = ioapic_mmio_read,
+ .write = ioapic_mmio_write,
+};
+
+int kvm_ioapic_init(struct kvm *kvm)
+{
+ struct kvm_ioapic *ioapic;
+ int ret;
+
+ ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL_ACCOUNT);
+ if (!ioapic)
+ return -ENOMEM;
+ spin_lock_init(&ioapic->lock);
+ INIT_DELAYED_WORK(&ioapic->eoi_inject, kvm_ioapic_eoi_inject_work);
+ INIT_HLIST_HEAD(&ioapic->mask_notifier_list);
+ kvm->arch.vioapic = ioapic;
+ kvm_ioapic_reset(ioapic);
+ kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);
+ ioapic->kvm = kvm;
+ mutex_lock(&kvm->slots_lock);
+ ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, ioapic->base_address,
+ IOAPIC_MEM_LENGTH, &ioapic->dev);
+ mutex_unlock(&kvm->slots_lock);
+ if (ret < 0) {
+ kvm->arch.vioapic = NULL;
+ kfree(ioapic);
+ }
+
+ return ret;
+}
+
+void kvm_ioapic_destroy(struct kvm *kvm)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+
+ if (!ioapic)
+ return;
+
+ cancel_delayed_work_sync(&ioapic->eoi_inject);
+ mutex_lock(&kvm->slots_lock);
+ kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
+ mutex_unlock(&kvm->slots_lock);
+ kvm->arch.vioapic = NULL;
+ kfree(ioapic);
+}
+
+void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+
+ spin_lock(&ioapic->lock);
+ memcpy(state, ioapic, sizeof(struct kvm_ioapic_state));
+ state->irr &= ~ioapic->irr_delivered;
+ spin_unlock(&ioapic->lock);
+}
+
+void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+
+ spin_lock(&ioapic->lock);
+ memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
+ ioapic->irr = 0;
+ ioapic->irr_delivered = 0;
+ kvm_make_scan_ioapic_request(kvm);
+ kvm_ioapic_inject_all(ioapic, state->irr);
+ spin_unlock(&ioapic->lock);
+}
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
new file mode 100644
index 000000000000..bf28dbc11ff6
--- /dev/null
+++ b/arch/x86/kvm/ioapic.h
@@ -0,0 +1,141 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_IO_APIC_H
+#define __KVM_IO_APIC_H
+
+#include <linux/kvm_host.h>
+#include <kvm/iodev.h>
+#include "irq.h"
+
+struct kvm;
+struct kvm_vcpu;
+
+#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS
+#define MAX_NR_RESERVED_IOAPIC_PINS KVM_MAX_IRQ_ROUTES
+#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */
+#define IOAPIC_EDGE_TRIG 0
+#define IOAPIC_LEVEL_TRIG 1
+
+#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000
+#define IOAPIC_MEM_LENGTH 0x100
+
+/* Direct registers. */
+#define IOAPIC_REG_SELECT 0x00
+#define IOAPIC_REG_WINDOW 0x10
+
+/* Indirect registers. */
+#define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */
+#define IOAPIC_REG_VERSION 0x01
+#define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */
+
+/* ioapic delivery mode */
+#define IOAPIC_FIXED 0x0
+#define IOAPIC_LOWEST_PRIORITY 0x1
+#define IOAPIC_PMI 0x2
+#define IOAPIC_NMI 0x4
+#define IOAPIC_INIT 0x5
+#define IOAPIC_EXTINT 0x7
+
+#define RTC_GSI 8
+
+struct dest_map {
+ /* vcpu bitmap where IRQ has been sent */
+ DECLARE_BITMAP(map, KVM_MAX_VCPU_IDS);
+
+ /*
+ * Vector sent to a given vcpu, only valid when
+ * the vcpu's bit in map is set
+ */
+ u8 vectors[KVM_MAX_VCPU_IDS];
+};
+
+
+struct rtc_status {
+ int pending_eoi;
+ struct dest_map dest_map;
+};
+
+union kvm_ioapic_redirect_entry {
+ u64 bits;
+ struct {
+ u8 vector;
+ u8 delivery_mode:3;
+ u8 dest_mode:1;
+ u8 delivery_status:1;
+ u8 polarity:1;
+ u8 remote_irr:1;
+ u8 trig_mode:1;
+ u8 mask:1;
+ u8 reserve:7;
+ u8 reserved[4];
+ u8 dest_id;
+ } fields;
+};
+
+struct kvm_ioapic {
+ u64 base_address;
+ u32 ioregsel;
+ u32 id;
+ u32 irr;
+ u32 pad;
+ union kvm_ioapic_redirect_entry redirtbl[IOAPIC_NUM_PINS];
+ unsigned long irq_states[IOAPIC_NUM_PINS];
+ struct kvm_io_device dev;
+ struct kvm *kvm;
+ spinlock_t lock;
+ struct rtc_status rtc_status;
+ struct delayed_work eoi_inject;
+ u32 irq_eoi[IOAPIC_NUM_PINS];
+ u32 irr_delivered;
+
+ /* reads protected by irq_srcu, writes by irq_lock */
+ struct hlist_head mask_notifier_list;
+};
+
+struct kvm_irq_mask_notifier {
+ void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked);
+ int irq;
+ struct hlist_node link;
+};
+
+void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn);
+void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn);
+void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
+ bool mask);
+
+#ifdef DEBUG
+#define ASSERT(x) \
+do { \
+ if (!(x)) { \
+ printk(KERN_EMERG "assertion failed %s: %d: %s\n", \
+ __FILE__, __LINE__, #x); \
+ BUG(); \
+ } \
+} while (0)
+#else
+#define ASSERT(x) do { } while (0)
+#endif
+
+static inline int ioapic_in_kernel(struct kvm *kvm)
+{
+ return irqchip_full(kvm);
+}
+
+void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
+void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector,
+ int trigger_mode);
+int kvm_ioapic_init(struct kvm *kvm);
+void kvm_ioapic_destroy(struct kvm *kvm);
+int kvm_ioapic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status);
+
+void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
+void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
+void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
+ ulong *ioapic_handled_vectors);
+void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
+ ulong *ioapic_handled_vectors);
+void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode,
+ u8 vector, unsigned long *ioapic_handled_vectors);
+#endif
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 96dfbb6ad2a9..7cc8950005b6 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -1,30 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* irq.c: API for in kernel interrupt controller
* Copyright (c) 2007, Intel Corporation.
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
* Authors:
* Yaozu (Eddie) Dong <Eddie.dong@intel.com>
- *
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/kvm_host.h>
+#include <linux/kvm_irqfd.h>
+#include "hyperv.h"
+#include "ioapic.h"
#include "irq.h"
-#include "i8254.h"
+#include "trace.h"
#include "x86.h"
+#include "xen.h"
/*
* check if there are pending timer events
@@ -32,14 +26,84 @@
*/
int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
{
- int ret;
+ int r = 0;
- ret = pit_has_pending_timer(vcpu);
- ret |= apic_has_pending_timer(vcpu);
+ if (lapic_in_kernel(vcpu))
+ r = apic_has_pending_timer(vcpu);
+ if (kvm_xen_timer_enabled(vcpu))
+ r += kvm_xen_has_pending_timer(vcpu);
- return ret;
+ return r;
}
-EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
+
+/*
+ * check if there is a pending userspace external interrupt
+ */
+static int pending_userspace_extint(struct kvm_vcpu *v)
+{
+ return v->arch.pending_external_vector != -1;
+}
+
+static int get_userspace_extint(struct kvm_vcpu *vcpu)
+{
+ int vector = vcpu->arch.pending_external_vector;
+
+ vcpu->arch.pending_external_vector = -1;
+ return vector;
+}
+
+/*
+ * check if there is a pending interrupt from a
+ * non-APIC source, without intack.
+ */
+int kvm_cpu_has_extint(struct kvm_vcpu *v)
+{
+ /*
+ * FIXME: interrupt.injected represents an interrupt whose
+ * side-effects have already been applied (e.g. bit from IRR
+ * already moved to ISR). Therefore, it is incorrect to rely
+ * on interrupt.injected to know if there is a pending
+ * interrupt in the user-mode LAPIC.
+	 * This leads to nVMX/nSVM not being able to distinguish
+	 * whether it should exit from L2 to L1 on EXTERNAL_INTERRUPT
+	 * because of a pending interrupt or whether it should re-inject
+	 * an already injected interrupt.
+ */
+ if (!lapic_in_kernel(v))
+ return v->arch.interrupt.injected;
+
+ if (kvm_xen_has_interrupt(v))
+ return 1;
+
+ if (!kvm_apic_accept_pic_intr(v))
+ return 0;
+
+#ifdef CONFIG_KVM_IOAPIC
+ if (pic_in_kernel(v->kvm))
+ return v->kvm->arch.vpic->output;
+#endif
+
+ WARN_ON_ONCE(!irqchip_split(v->kvm));
+ return pending_userspace_extint(v);
+}
+
+/*
+ * check if there is an injectable interrupt:
+ * when virtual interrupt delivery is enabled,
+ * interrupts from the APIC are handled by hardware,
+ * so we don't need to check them here.
+ */
+int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
+{
+ if (kvm_cpu_has_extint(v))
+ return 1;
+
+ if (!is_guest_mode(v) && kvm_vcpu_apicv_active(v))
+ return 0;
+
+ return kvm_apic_has_interrupt(v) != -1; /* LAPIC */
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_cpu_has_injectable_intr);
/*
* check if there is pending interrupt without
@@ -47,55 +111,523 @@ EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
*/
int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
{
- struct kvm_pic *s;
+ if (kvm_cpu_has_extint(v))
+ return 1;
- if (!irqchip_in_kernel(v->kvm))
- return v->arch.interrupt.pending;
+ if (lapic_in_kernel(v) && v->arch.apic->guest_apic_protected)
+ return kvm_x86_call(protected_apic_has_interrupt)(v);
- if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */
- if (kvm_apic_accept_pic_intr(v)) {
- s = pic_irqchip(v->kvm); /* PIC */
- return s->output;
- } else
- return 0;
+ return kvm_apic_has_interrupt(v) != -1; /* LAPIC */
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_cpu_has_interrupt);
+
+/*
+ * Read the pending interrupt vector (from a non-APIC source)
+ * and intack.
+ */
+int kvm_cpu_get_extint(struct kvm_vcpu *v)
+{
+ if (!kvm_cpu_has_extint(v)) {
+ WARN_ON(!lapic_in_kernel(v));
+ return -1;
}
- return 1;
+
+ if (!lapic_in_kernel(v))
+ return v->arch.interrupt.nr;
+
+#ifdef CONFIG_KVM_XEN
+ if (kvm_xen_has_interrupt(v))
+ return v->kvm->arch.xen.upcall_vector;
+#endif
+
+#ifdef CONFIG_KVM_IOAPIC
+ if (pic_in_kernel(v->kvm))
+ return kvm_pic_read_irq(v->kvm); /* PIC */
+#endif
+
+ WARN_ON_ONCE(!irqchip_split(v->kvm));
+ return get_userspace_extint(v);
}
-EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_cpu_get_extint);
/*
* Read pending interrupt vector and intack.
*/
int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
{
- struct kvm_pic *s;
- int vector;
+ int vector = kvm_cpu_get_extint(v);
+ if (vector != -1)
+ return vector; /* PIC */
- if (!irqchip_in_kernel(v->kvm))
- return v->arch.interrupt.nr;
+ vector = kvm_apic_has_interrupt(v); /* APIC */
+ if (vector != -1)
+ kvm_apic_ack_interrupt(v, vector);
- vector = kvm_get_apic_interrupt(v); /* APIC */
- if (vector == -1) {
- if (kvm_apic_accept_pic_intr(v)) {
- s = pic_irqchip(v->kvm);
- s->output = 0; /* PIC */
- vector = kvm_pic_read_irq(v->kvm);
- }
- }
return vector;
}
-EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
{
- kvm_inject_apic_timer_irqs(vcpu);
- kvm_inject_pit_timer_irqs(vcpu);
- /* TODO: PIT, RTC etc. */
+ if (lapic_in_kernel(vcpu))
+ kvm_inject_apic_timer_irqs(vcpu);
+ if (kvm_xen_timer_enabled(vcpu))
+ kvm_xen_inject_timer_irqs(vcpu);
}
-EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
void __kvm_migrate_timers(struct kvm_vcpu *vcpu)
{
__kvm_migrate_apic_timer(vcpu);
+#ifdef CONFIG_KVM_IOAPIC
__kvm_migrate_pit_timer(vcpu);
+#endif
+ kvm_x86_call(migrate_timers)(vcpu);
+}
+
+bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args)
+{
+ bool resample = args->flags & KVM_IRQFD_FLAG_RESAMPLE;
+
+ return resample ? irqchip_full(kvm) : irqchip_in_kernel(kvm);
+}
+
+bool kvm_arch_irqchip_in_kernel(struct kvm *kvm)
+{
+ return irqchip_in_kernel(kvm);
+}
+
+static void kvm_msi_to_lapic_irq(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e,
+ struct kvm_lapic_irq *irq)
+{
+ struct msi_msg msg = { .address_lo = e->msi.address_lo,
+ .address_hi = e->msi.address_hi,
+ .data = e->msi.data };
+
+ trace_kvm_msi_set_irq(msg.address_lo | (kvm->arch.x2apic_format ?
+ (u64)msg.address_hi << 32 : 0), msg.data);
+
+ irq->dest_id = x86_msi_msg_get_destid(&msg, kvm->arch.x2apic_format);
+ irq->vector = msg.arch_data.vector;
+ irq->dest_mode = kvm_lapic_irq_dest_mode(msg.arch_addr_lo.dest_mode_logical);
+ irq->trig_mode = msg.arch_data.is_level;
+ irq->delivery_mode = msg.arch_data.delivery_mode << 8;
+ irq->msi_redir_hint = msg.arch_addr_lo.redirect_hint;
+ irq->level = 1;
+ irq->shorthand = APIC_DEST_NOSHORT;
+}
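+
kvm_msi_to_lapic_irq() above relies on the kernel's struct msi_msg bitfield views (arch_addr_lo, arch_data) to pull the destination and vector out of the MSI address/data pair. For reference, here is a stand-alone decoder for the legacy xAPIC MSI layout (destination ID in address bits 19:12, destination mode in bit 2, redirect hint in bit 3; vector in data bits 7:0, delivery mode in bits 10:8, trigger mode in bit 15). It ignores the x2APIC extension selected by kvm->arch.x2apic_format, and the struct name is illustrative:

#include <stdio.h>
#include <stdint.h>

struct decoded_msi {
	uint8_t  dest_id;
	uint8_t  vector;
	uint8_t  delivery_mode;
	int      dest_mode_logical;
	int      is_level;
	int      redirect_hint;
};

/* Decode a legacy (xAPIC) MSI address/data pair. */
static struct decoded_msi msi_decode(uint32_t address_lo, uint32_t data)
{
	return (struct decoded_msi) {
		.dest_id           = (address_lo >> 12) & 0xff,
		.dest_mode_logical = (address_lo >> 2) & 0x1,
		.redirect_hint     = (address_lo >> 3) & 0x1,
		.vector            = data & 0xff,
		.delivery_mode     = (data >> 8) & 0x7,
		.is_level          = (data >> 15) & 0x1,
	};
}

int main(void)
{
	/* 0xfee01000: fixed MSI base, destination APIC ID 0x01, physical mode. */
	struct decoded_msi m = msi_decode(0xfee01000, 0x00000031);

	printf("dest_id=%u vector=0x%02x delivery_mode=%u level=%d\n",
	       m.dest_id, m.vector, m.delivery_mode, m.is_level);
	return 0;
}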
+
+static inline bool kvm_msi_route_invalid(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e)
+{
+ return kvm->arch.x2apic_format && (e->msi.address_hi & 0xff);
+}
+
+int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level, bool line_status)
+{
+ struct kvm_lapic_irq irq;
+
+ if (kvm_msi_route_invalid(kvm, e))
+ return -EINVAL;
+
+ if (!level)
+ return -1;
+
+ kvm_msi_to_lapic_irq(kvm, e, &irq);
+
+ return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL);
+}
+
+int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level,
+ bool line_status)
+{
+ struct kvm_lapic_irq irq;
+ int r;
+
+ switch (e->type) {
+#ifdef CONFIG_KVM_HYPERV
+ case KVM_IRQ_ROUTING_HV_SINT:
+ return kvm_hv_synic_set_irq(e, kvm, irq_source_id, level,
+ line_status);
+#endif
+
+ case KVM_IRQ_ROUTING_MSI:
+ if (kvm_msi_route_invalid(kvm, e))
+ return -EINVAL;
+
+ kvm_msi_to_lapic_irq(kvm, e, &irq);
+
+ if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL))
+ return r;
+ break;
+
+#ifdef CONFIG_KVM_XEN
+ case KVM_IRQ_ROUTING_XEN_EVTCHN:
+ if (!level)
+ return -1;
+
+ return kvm_xen_set_evtchn_fast(&e->xen_evtchn, kvm);
+#endif
+ default:
+ break;
+ }
+
+ return -EWOULDBLOCK;
+}
+
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
+ bool line_status)
+{
+ if (!irqchip_in_kernel(kvm))
+ return -ENXIO;
+
+ irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
+ irq_event->irq, irq_event->level,
+ line_status);
+ return 0;
+}
+
+bool kvm_arch_can_set_irq_routing(struct kvm *kvm)
+{
+ return irqchip_in_kernel(kvm);
+}
+
+int kvm_set_routing_entry(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e,
+ const struct kvm_irq_routing_entry *ue)
+{
+ /* We can't check irqchip_in_kernel() here as some callers are
+ * currently initializing the irqchip. Other callers should therefore
+ * check kvm_arch_can_set_irq_routing() before calling this function.
+ */
+ switch (ue->type) {
+#ifdef CONFIG_KVM_IOAPIC
+ case KVM_IRQ_ROUTING_IRQCHIP:
+ if (irqchip_split(kvm))
+ return -EINVAL;
+ e->irqchip.pin = ue->u.irqchip.pin;
+ switch (ue->u.irqchip.irqchip) {
+ case KVM_IRQCHIP_PIC_SLAVE:
+ e->irqchip.pin += PIC_NUM_PINS / 2;
+ fallthrough;
+ case KVM_IRQCHIP_PIC_MASTER:
+ if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2)
+ return -EINVAL;
+ e->set = kvm_pic_set_irq;
+ break;
+ case KVM_IRQCHIP_IOAPIC:
+ if (ue->u.irqchip.pin >= KVM_IOAPIC_NUM_PINS)
+ return -EINVAL;
+ e->set = kvm_ioapic_set_irq;
+ break;
+ default:
+ return -EINVAL;
+ }
+ e->irqchip.irqchip = ue->u.irqchip.irqchip;
+ break;
+#endif
+ case KVM_IRQ_ROUTING_MSI:
+ e->set = kvm_set_msi;
+ e->msi.address_lo = ue->u.msi.address_lo;
+ e->msi.address_hi = ue->u.msi.address_hi;
+ e->msi.data = ue->u.msi.data;
+
+ if (kvm_msi_route_invalid(kvm, e))
+ return -EINVAL;
+ break;
+#ifdef CONFIG_KVM_HYPERV
+ case KVM_IRQ_ROUTING_HV_SINT:
+ e->set = kvm_hv_synic_set_irq;
+ e->hv_sint.vcpu = ue->u.hv_sint.vcpu;
+ e->hv_sint.sint = ue->u.hv_sint.sint;
+ break;
+#endif
+#ifdef CONFIG_KVM_XEN
+ case KVM_IRQ_ROUTING_XEN_EVTCHN:
+ return kvm_xen_setup_evtchn(kvm, e, ue);
+#endif
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode,
+ u8 vector, unsigned long *ioapic_handled_vectors)
+{
+ /*
+ * Intercept EOI if the vCPU is the target of the new IRQ routing, or
+ * the vCPU has a pending IRQ from the old routing, i.e. if the vCPU
+	 * may receive a level-triggered IRQ in the future, or has already received
+	 * a level-triggered IRQ. The EOI needs to be intercepted and forwarded
+ * to I/O APIC emulation so that the IRQ can be de-asserted.
+ */
+ if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, dest_id, dest_mode)) {
+ __set_bit(vector, ioapic_handled_vectors);
+ } else if (kvm_apic_pending_eoi(vcpu, vector)) {
+ __set_bit(vector, ioapic_handled_vectors);
+
+ /*
+ * Track the highest pending EOI for which the vCPU is NOT the
+ * target in the new routing. Only the EOI for the IRQ that is
+ * in-flight (for the old routing) needs to be intercepted, any
+ * future IRQs that arrive on this vCPU will be coincidental to
+ * the level-triggered routing and don't need to be intercepted.
+ */
+ if ((int)vector > vcpu->arch.highest_stale_pending_ioapic_eoi)
+ vcpu->arch.highest_stale_pending_ioapic_eoi = vector;
+ }
+}
+
+void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
+ ulong *ioapic_handled_vectors)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_kernel_irq_routing_entry *entry;
+ struct kvm_irq_routing_table *table;
+ u32 i, nr_ioapic_pins;
+ int idx;
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+ nr_ioapic_pins = min_t(u32, table->nr_rt_entries,
+ kvm->arch.nr_reserved_ioapic_pins);
+ for (i = 0; i < nr_ioapic_pins; ++i) {
+ hlist_for_each_entry(entry, &table->map[i], link) {
+ struct kvm_lapic_irq irq;
+
+ if (entry->type != KVM_IRQ_ROUTING_MSI)
+ continue;
+
+ kvm_msi_to_lapic_irq(vcpu->kvm, entry, &irq);
+
+ if (!irq.trig_mode)
+ continue;
+
+ kvm_scan_ioapic_irq(vcpu, irq.dest_id, irq.dest_mode,
+ irq.vector, ioapic_handled_vectors);
+ }
+ }
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+}
+
+void kvm_arch_irq_routing_update(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_HYPERV
+ kvm_hv_irq_routing_update(kvm);
+#endif
+
+ if (irqchip_split(kvm))
+ kvm_make_scan_ioapic_request(kvm);
+}
+
+static int kvm_pi_update_irte(struct kvm_kernel_irqfd *irqfd,
+ struct kvm_kernel_irq_routing_entry *entry)
+{
+ unsigned int host_irq = irqfd->producer->irq;
+ struct kvm *kvm = irqfd->kvm;
+ struct kvm_vcpu *vcpu = NULL;
+ struct kvm_lapic_irq irq;
+ int r;
+
+ if (WARN_ON_ONCE(!irqchip_in_kernel(kvm) || !kvm_arch_has_irq_bypass()))
+ return -EINVAL;
+
+ if (entry && entry->type == KVM_IRQ_ROUTING_MSI) {
+ kvm_msi_to_lapic_irq(kvm, entry, &irq);
+
+ /*
+ * Force remapped mode if hardware doesn't support posting the
+ * virtual interrupt to a vCPU. Only IRQs are postable (NMIs,
+ * SMIs, etc. are not), and neither AMD nor Intel IOMMUs support
+ * posting multicast/broadcast IRQs. If the interrupt can't be
+ * posted, the device MSI needs to be routed to the host so that
+ * the guest's desired interrupt can be synthesized by KVM.
+ *
+ * This means that KVM can only post lowest-priority interrupts
+ * if they have a single CPU as the destination, e.g. only if
+ * the guest has affined the interrupt to a single vCPU.
+ */
+ if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
+ !kvm_irq_is_postable(&irq))
+ vcpu = NULL;
+ }
+
+ if (!irqfd->irq_bypass_vcpu && !vcpu)
+ return 0;
+
+ r = kvm_x86_call(pi_update_irte)(irqfd, irqfd->kvm, host_irq, irqfd->gsi,
+ vcpu, irq.vector);
+ if (r) {
+ WARN_ON_ONCE(irqfd->irq_bypass_vcpu && !vcpu);
+ irqfd->irq_bypass_vcpu = NULL;
+ return r;
+ }
+
+ irqfd->irq_bypass_vcpu = vcpu;
+
+ trace_kvm_pi_irte_update(host_irq, vcpu, irqfd->gsi, irq.vector, !!vcpu);
+ return 0;
+}
+
+int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
+ struct irq_bypass_producer *prod)
+{
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
+ struct kvm *kvm = irqfd->kvm;
+ int ret = 0;
+
+ spin_lock_irq(&kvm->irqfds.lock);
+ irqfd->producer = prod;
+
+ if (!kvm->arch.nr_possible_bypass_irqs++)
+ kvm_x86_call(pi_start_bypass)(kvm);
+
+ if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) {
+ ret = kvm_pi_update_irte(irqfd, &irqfd->irq_entry);
+ if (ret)
+ kvm->arch.nr_possible_bypass_irqs--;
+ }
+ spin_unlock_irq(&kvm->irqfds.lock);
+
+ return ret;
+}
+
+void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
+ struct irq_bypass_producer *prod)
+{
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
+ struct kvm *kvm = irqfd->kvm;
+ int ret;
+
+ WARN_ON(irqfd->producer != prod);
+
+ /*
+ * If the producer of an IRQ that is currently being posted to a vCPU
+ * is unregistered, change the associated IRTE back to remapped mode as
+ * the IRQ has been released (or repurposed) by the device driver, i.e.
+ * KVM must relinquish control of the IRTE.
+ */
+ spin_lock_irq(&kvm->irqfds.lock);
+
+ if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) {
+ ret = kvm_pi_update_irte(irqfd, NULL);
+ if (ret)
+ pr_info("irq bypass consumer (eventfd %p) unregistration fails: %d\n",
+ irqfd->consumer.eventfd, ret);
+ }
+ irqfd->producer = NULL;
+
+ kvm->arch.nr_possible_bypass_irqs--;
+
+ spin_unlock_irq(&kvm->irqfds.lock);
+}
+
+void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
+ struct kvm_kernel_irq_routing_entry *old,
+ struct kvm_kernel_irq_routing_entry *new)
+{
+ if (new->type != KVM_IRQ_ROUTING_MSI &&
+ old->type != KVM_IRQ_ROUTING_MSI)
+ return;
+
+ if (old->type == KVM_IRQ_ROUTING_MSI &&
+ new->type == KVM_IRQ_ROUTING_MSI &&
+ !memcmp(&old->msi, &new->msi, sizeof(new->msi)))
+ return;
+
+ kvm_pi_update_irte(irqfd, new);
+}
+
+#ifdef CONFIG_KVM_IOAPIC
+#define IOAPIC_ROUTING_ENTRY(irq) \
+ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
+ .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } }
+#define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq)
+
+#define PIC_ROUTING_ENTRY(irq) \
+ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
+ .u.irqchip = { .irqchip = SELECT_PIC(irq), .pin = (irq) % 8 } }
+#define ROUTING_ENTRY2(irq) \
+ IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq)
+
+static const struct kvm_irq_routing_entry default_routing[] = {
+ ROUTING_ENTRY2(0), ROUTING_ENTRY2(1),
+ ROUTING_ENTRY2(2), ROUTING_ENTRY2(3),
+ ROUTING_ENTRY2(4), ROUTING_ENTRY2(5),
+ ROUTING_ENTRY2(6), ROUTING_ENTRY2(7),
+ ROUTING_ENTRY2(8), ROUTING_ENTRY2(9),
+ ROUTING_ENTRY2(10), ROUTING_ENTRY2(11),
+ ROUTING_ENTRY2(12), ROUTING_ENTRY2(13),
+ ROUTING_ENTRY2(14), ROUTING_ENTRY2(15),
+ ROUTING_ENTRY1(16), ROUTING_ENTRY1(17),
+ ROUTING_ENTRY1(18), ROUTING_ENTRY1(19),
+ ROUTING_ENTRY1(20), ROUTING_ENTRY1(21),
+ ROUTING_ENTRY1(22), ROUTING_ENTRY1(23),
+};
+
+int kvm_setup_default_ioapic_and_pic_routing(struct kvm *kvm)
+{
+ return kvm_set_irq_routing(kvm, default_routing,
+ ARRAY_SIZE(default_routing), 0);
+}
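+
The default routing table above encodes the conventional x86 split: GSIs 0-15 are wired to both the IOAPIC and the 8259 pair (SELECT_PIC picks master or slave, the pin is irq % 8), while GSIs 16-23 exist only on the IOAPIC. A short stand-alone check of that mapping, assuming the same SELECT_PIC convention as in irq.h below:

#include <stdio.h>

/* Mirror of SELECT_PIC() in irq.h: GSIs 0-7 go to the master, 8-15 to the slave. */
static const char *select_pic(int gsi)
{
	return gsi < 8 ? "PIC master" : "PIC slave";
}

int main(void)
{
	for (int gsi = 0; gsi < 24; gsi++) {
		if (gsi < 16)
			printf("GSI %2d -> IOAPIC pin %2d + %s pin %d\n",
			       gsi, gsi, select_pic(gsi), gsi % 8);
		else
			printf("GSI %2d -> IOAPIC pin %2d only\n", gsi, gsi);
	}
	return 0;
}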
+
+int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
+{
+ struct kvm_pic *pic = kvm->arch.vpic;
+ int r;
+
+ r = 0;
+ switch (chip->chip_id) {
+ case KVM_IRQCHIP_PIC_MASTER:
+ memcpy(&chip->chip.pic, &pic->pics[0],
+ sizeof(struct kvm_pic_state));
+ break;
+ case KVM_IRQCHIP_PIC_SLAVE:
+ memcpy(&chip->chip.pic, &pic->pics[1],
+ sizeof(struct kvm_pic_state));
+ break;
+ case KVM_IRQCHIP_IOAPIC:
+ kvm_get_ioapic(kvm, &chip->chip.ioapic);
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+ return r;
+}
+
+int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
+{
+ struct kvm_pic *pic = kvm->arch.vpic;
+ int r;
+
+ r = 0;
+ switch (chip->chip_id) {
+ case KVM_IRQCHIP_PIC_MASTER:
+ spin_lock(&pic->lock);
+ memcpy(&pic->pics[0], &chip->chip.pic,
+ sizeof(struct kvm_pic_state));
+ spin_unlock(&pic->lock);
+ break;
+ case KVM_IRQCHIP_PIC_SLAVE:
+ spin_lock(&pic->lock);
+ memcpy(&pic->pics[1], &chip->chip.pic,
+ sizeof(struct kvm_pic_state));
+ spin_unlock(&pic->lock);
+ break;
+ case KVM_IRQCHIP_IOAPIC:
+ kvm_set_ioapic(kvm, &chip->chip.ioapic);
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+ kvm_pic_update_irq(pic);
+ return r;
}
+#endif
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 9f593188129e..34f4a78a7a01 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -1,22 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* irq.h: in kernel interrupt controller related definitions
* Copyright (c) 2007, Intel Corporation.
*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
* Authors:
* Yaozu (Eddie) Dong <Eddie.dong@intel.com>
- *
*/
#ifndef __IRQ_H
@@ -27,10 +15,11 @@
#include <linux/kvm_host.h>
#include <linux/spinlock.h>
-#include "iodev.h"
-#include "ioapic.h"
+#include <kvm/iodev.h>
#include "lapic.h"
+#ifdef CONFIG_KVM_IOAPIC
+
#define PIC_NUM_PINS 16
#define SELECT_PIC(irq) \
((irq) < 8 ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE)
@@ -38,14 +27,11 @@
struct kvm;
struct kvm_vcpu;
-typedef void irq_request_func(void *opaque, int level);
-
struct kvm_kpic_state {
u8 last_irr; /* edge detection */
u8 irr; /* interrupt request register */
u8 imr; /* interrupt mask register */
u8 isr; /* interrupt service register */
- u8 isr_ack; /* interrupt ack detection */
u8 priority_add; /* highest irq priority */
u8 irq_base;
u8 read_reg_select;
@@ -58,6 +44,7 @@ struct kvm_kpic_state {
u8 init4; /* true if 4 byte init */
u8 elcr; /* PIIX edge/trigger selection */
u8 elcr_mask;
+ u8 isr_ack; /* interrupt ack detection */
struct kvm_pic *pics_state;
};
@@ -67,29 +54,63 @@ struct kvm_pic {
unsigned pending_acks;
struct kvm *kvm;
struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
- irq_request_func *irq_request;
- void *irq_request_opaque;
int output; /* intr from master PIC */
- struct kvm_io_device dev;
- void (*ack_notifier)(void *opaque, int irq);
+ struct kvm_io_device dev_master;
+ struct kvm_io_device dev_slave;
+ struct kvm_io_device dev_elcr;
+ unsigned long irq_states[PIC_NUM_PINS];
};
-struct kvm_pic *kvm_create_pic(struct kvm *kvm);
+int kvm_pic_init(struct kvm *kvm);
+void kvm_pic_destroy(struct kvm *kvm);
int kvm_pic_read_irq(struct kvm *kvm);
void kvm_pic_update_irq(struct kvm_pic *s);
-void kvm_pic_clear_isr_ack(struct kvm *kvm);
+int kvm_pic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status);
+
+int kvm_setup_default_ioapic_and_pic_routing(struct kvm *kvm);
+
+int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip);
+int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip);
-static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
+static inline int irqchip_full(struct kvm *kvm)
+{
+ int mode = kvm->arch.irqchip_mode;
+
+ /* Matches smp_wmb() when setting irqchip_mode */
+ smp_rmb();
+ return mode == KVM_IRQCHIP_KERNEL;
+}
+#else /* CONFIG_KVM_IOAPIC */
+static __always_inline int irqchip_full(struct kvm *kvm)
{
- return kvm->arch.vpic;
+ return false;
}
+#endif
-static inline int irqchip_in_kernel(struct kvm *kvm)
+static inline int pic_in_kernel(struct kvm *kvm)
{
- return pic_irqchip(kvm) != NULL;
+ return irqchip_full(kvm);
}
-void kvm_pic_reset(struct kvm_kpic_state *s);
+
+static inline int irqchip_split(struct kvm *kvm)
+{
+ int mode = kvm->arch.irqchip_mode;
+
+ /* Matches smp_wmb() when setting irqchip_mode */
+ smp_rmb();
+ return mode == KVM_IRQCHIP_SPLIT;
+}
+
+static inline int irqchip_in_kernel(struct kvm *kvm)
+{
+ int mode = kvm->arch.irqchip_mode;
+
+ /* Matches smp_wmb() when setting irqchip_mode */
+ smp_rmb();
+ return mode != KVM_IRQCHIP_NONE;
+}
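+
The smp_rmb() in these helpers pairs with an smp_wmb() executed by whoever sets kvm->arch.irqchip_mode; the comments above state the pairing, and the writer itself lives outside this hunk. The intent is the usual publish/subscribe ordering: initialize the irqchip state first, then make the mode visible. A user-space analogue of that ordering using C11 release/acquire in place of the kernel barriers (names are hypothetical):

#include <stdatomic.h>
#include <stdio.h>

enum irqchip_mode { IRQCHIP_NONE, IRQCHIP_KERNEL, IRQCHIP_SPLIT };

static int vpic_ready;				/* stand-in for kvm->arch.vpic setup */
static _Atomic enum irqchip_mode irqchip_mode;	/* stand-in for kvm->arch.irqchip_mode */

/* Writer: publish the devices before the mode readers key off of. */
static void publish_full_irqchip(void)
{
	vpic_ready = 1;					/* initialize devices first */
	atomic_store_explicit(&irqchip_mode, IRQCHIP_KERNEL,
			      memory_order_release);	/* pairs with the acquire below */
}

/* Reader: once the mode is observed, the device state is guaranteed visible. */
static int irqchip_full(void)
{
	return atomic_load_explicit(&irqchip_mode, memory_order_acquire)
	       == IRQCHIP_KERNEL;
}

int main(void)
{
	publish_full_irqchip();
	printf("irqchip_full() = %d, vpic_ready = %d\n", irqchip_full(), vpic_ready);
	return 0;
}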
void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
@@ -98,7 +119,6 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu);
void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
-int pit_has_pending_timer(struct kvm_vcpu *vcpu);
int apic_has_pending_timer(struct kvm_vcpu *vcpu);
#endif
diff --git a/arch/x86/kvm/kvm-asm-offsets.c b/arch/x86/kvm/kvm-asm-offsets.c
new file mode 100644
index 000000000000..24a710d37323
--- /dev/null
+++ b/arch/x86/kvm/kvm-asm-offsets.c
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Generate definitions needed by assembly language modules.
+ * This code generates raw asm output which is post-processed to extract
+ * and format the required data.
+ */
+#define COMPILE_OFFSETS
+
+#include <linux/kbuild.h>
+#include "vmx/vmx.h"
+#include "svm/svm.h"
+
+static void __used common(void)
+{
+ if (IS_ENABLED(CONFIG_KVM_AMD)) {
+ BLANK();
+ OFFSET(SVM_vcpu_arch_regs, vcpu_svm, vcpu.arch.regs);
+ OFFSET(SVM_current_vmcb, vcpu_svm, current_vmcb);
+ OFFSET(SVM_spec_ctrl, vcpu_svm, spec_ctrl);
+ OFFSET(SVM_vmcb01, vcpu_svm, vmcb01);
+ OFFSET(KVM_VMCB_pa, kvm_vmcb_info, pa);
+ OFFSET(SD_save_area_pa, svm_cpu_data, save_area_pa);
+ }
+
+ if (IS_ENABLED(CONFIG_KVM_INTEL)) {
+ BLANK();
+ OFFSET(VMX_spec_ctrl, vcpu_vmx, spec_ctrl);
+ }
+}
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 1ff819dce7d3..8ddb01191d6f 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -1,32 +1,249 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef ASM_KVM_CACHE_REGS_H
#define ASM_KVM_CACHE_REGS_H
-static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
- enum kvm_reg reg)
+#include <linux/kvm_host.h>
+
+#define KVM_POSSIBLE_CR0_GUEST_BITS (X86_CR0_TS | X86_CR0_WP)
+#define KVM_POSSIBLE_CR4_GUEST_BITS \
+ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
+ | X86_CR4_OSXMMEXCPT | X86_CR4_PGE | X86_CR4_TSD | X86_CR4_FSGSBASE \
+ | X86_CR4_CET)
+
+#define X86_CR0_PDPTR_BITS (X86_CR0_CD | X86_CR0_NW | X86_CR0_PG)
+#define X86_CR4_TLBFLUSH_BITS (X86_CR4_PGE | X86_CR4_PCIDE | X86_CR4_PAE | X86_CR4_SMEP)
+#define X86_CR4_PDPTR_BITS (X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_SMEP)
+
+static_assert(!(KVM_POSSIBLE_CR0_GUEST_BITS & X86_CR0_PDPTR_BITS));
+
+#define BUILD_KVM_GPR_ACCESSORS(lname, uname) \
+static __always_inline unsigned long kvm_##lname##_read(struct kvm_vcpu *vcpu)\
+{ \
+ return vcpu->arch.regs[VCPU_REGS_##uname]; \
+} \
+static __always_inline void kvm_##lname##_write(struct kvm_vcpu *vcpu, \
+ unsigned long val) \
+{ \
+ vcpu->arch.regs[VCPU_REGS_##uname] = val; \
+}
+BUILD_KVM_GPR_ACCESSORS(rax, RAX)
+BUILD_KVM_GPR_ACCESSORS(rbx, RBX)
+BUILD_KVM_GPR_ACCESSORS(rcx, RCX)
+BUILD_KVM_GPR_ACCESSORS(rdx, RDX)
+BUILD_KVM_GPR_ACCESSORS(rbp, RBP)
+BUILD_KVM_GPR_ACCESSORS(rsi, RSI)
+BUILD_KVM_GPR_ACCESSORS(rdi, RDI)
+#ifdef CONFIG_X86_64
+BUILD_KVM_GPR_ACCESSORS(r8, R8)
+BUILD_KVM_GPR_ACCESSORS(r9, R9)
+BUILD_KVM_GPR_ACCESSORS(r10, R10)
+BUILD_KVM_GPR_ACCESSORS(r11, R11)
+BUILD_KVM_GPR_ACCESSORS(r12, R12)
+BUILD_KVM_GPR_ACCESSORS(r13, R13)
+BUILD_KVM_GPR_ACCESSORS(r14, R14)
+BUILD_KVM_GPR_ACCESSORS(r15, R15)
+#endif
+
+/*
+ * Using the register cache from interrupt context is generally not allowed, as
+ * caching a register and marking it available/dirty can't be done atomically,
+ * i.e. accesses from interrupt context may clobber state or read stale data if
+ * the vCPU task is in the process of updating the cache. The exception is if
+ * KVM is handling a PMI IRQ/NMI VM-Exit, as that bound code sequence doesn't
+ * touch the cache, it runs after the cache is reset (post VM-Exit), and PMIs
+ * need to access several registers that are cacheable.
+ */
+#define kvm_assert_register_caching_allowed(vcpu) \
+ lockdep_assert_once(in_task() || kvm_arch_pmi_in_guest(vcpu))
+
+/*
+ * avail dirty
+ * 0 0 register in VMCS/VMCB
+ * 0 1 *INVALID*
+ * 1 0 register in vcpu->arch
+ * 1 1 register in vcpu->arch, needs to be stored back
+ */
+static inline bool kvm_register_is_available(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
+{
+ kvm_assert_register_caching_allowed(vcpu);
+ return test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+}
+
+static inline bool kvm_register_is_dirty(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
{
- if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail))
- kvm_x86_ops->cache_reg(vcpu, reg);
+ kvm_assert_register_caching_allowed(vcpu);
+ return test_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
+}
+
+static inline void kvm_register_mark_available(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
+{
+ kvm_assert_register_caching_allowed(vcpu);
+ __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+}
+
+static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
+{
+ kvm_assert_register_caching_allowed(vcpu);
+ __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+ __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
+}
+
+/*
+ * kvm_register_test_and_mark_available() is a special snowflake that uses an
+ * arch bitop directly to avoid the explicit instrumentation that comes with
+ * the generic bitops. This allows code that cannot be instrumented (noinstr
+ * functions), e.g. the low level VM-Enter/VM-Exit paths, to cache registers.
+ */
+static __always_inline bool kvm_register_test_and_mark_available(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
+{
+ kvm_assert_register_caching_allowed(vcpu);
+ return arch___test_and_set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+}
+
+/*
+ * The "raw" register helpers are only for cases where the full 64 bits of a
+ * register are read/written irrespective of current vCPU mode. In other words,
+ * odds are good you shouldn't be using the raw variants.
+ */
+static inline unsigned long kvm_register_read_raw(struct kvm_vcpu *vcpu, int reg)
+{
+ if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_REGS))
+ return 0;
+
+ if (!kvm_register_is_available(vcpu, reg))
+ kvm_x86_call(cache_reg)(vcpu, reg);
return vcpu->arch.regs[reg];
}
-static inline void kvm_register_write(struct kvm_vcpu *vcpu,
- enum kvm_reg reg,
- unsigned long val)
+static inline void kvm_register_write_raw(struct kvm_vcpu *vcpu, int reg,
+ unsigned long val)
{
+ if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_REGS))
+ return;
+
vcpu->arch.regs[reg] = val;
- __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
- __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+ kvm_register_mark_dirty(vcpu, reg);
}
static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu)
{
- return kvm_register_read(vcpu, VCPU_REGS_RIP);
+ return kvm_register_read_raw(vcpu, VCPU_REGS_RIP);
}
static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
{
- kvm_register_write(vcpu, VCPU_REGS_RIP, val);
+ kvm_register_write_raw(vcpu, VCPU_REGS_RIP, val);
+}
+
+static inline unsigned long kvm_rsp_read(struct kvm_vcpu *vcpu)
+{
+ return kvm_register_read_raw(vcpu, VCPU_REGS_RSP);
+}
+
+static inline void kvm_rsp_write(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ kvm_register_write_raw(vcpu, VCPU_REGS_RSP, val);
+}
+
+static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
+{
+ might_sleep(); /* on svm */
+
+ if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR))
+ kvm_x86_call(cache_reg)(vcpu, VCPU_EXREG_PDPTR);
+
+ return vcpu->arch.walk_mmu->pdptrs[index];
+}
+
+static inline void kvm_pdptr_write(struct kvm_vcpu *vcpu, int index, u64 value)
+{
+ vcpu->arch.walk_mmu->pdptrs[index] = value;
+}
+
+static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
+{
+ ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS;
+ if ((tmask & vcpu->arch.cr0_guest_owned_bits) &&
+ !kvm_register_is_available(vcpu, VCPU_EXREG_CR0))
+ kvm_x86_call(cache_reg)(vcpu, VCPU_EXREG_CR0);
+ return vcpu->arch.cr0 & mask;
+}
+
+static __always_inline bool kvm_is_cr0_bit_set(struct kvm_vcpu *vcpu,
+ unsigned long cr0_bit)
+{
+ BUILD_BUG_ON(!is_power_of_2(cr0_bit));
+
+ return !!kvm_read_cr0_bits(vcpu, cr0_bit);
+}
+
+static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr0_bits(vcpu, ~0UL);
+}
+
+static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
+{
+ ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS;
+ if ((tmask & vcpu->arch.cr4_guest_owned_bits) &&
+ !kvm_register_is_available(vcpu, VCPU_EXREG_CR4))
+ kvm_x86_call(cache_reg)(vcpu, VCPU_EXREG_CR4);
+ return vcpu->arch.cr4 & mask;
+}
+
+static __always_inline bool kvm_is_cr4_bit_set(struct kvm_vcpu *vcpu,
+ unsigned long cr4_bit)
+{
+ BUILD_BUG_ON(!is_power_of_2(cr4_bit));
+
+ return !!kvm_read_cr4_bits(vcpu, cr4_bit);
+}
+
+static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu)
+{
+ if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
+ kvm_x86_call(cache_reg)(vcpu, VCPU_EXREG_CR3);
+ return vcpu->arch.cr3;
+}
+
+static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr4_bits(vcpu, ~0UL);
+}
+
+static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
+{
+ return (kvm_rax_read(vcpu) & -1u)
+ | ((u64)(kvm_rdx_read(vcpu) & -1u) << 32);
+}
+
+static inline void enter_guest_mode(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.hflags |= HF_GUEST_MASK;
+ vcpu->stat.guest_mode = 1;
+}
+
+static inline void leave_guest_mode(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.hflags &= ~HF_GUEST_MASK;
+
+ if (vcpu->arch.load_eoi_exitmap_pending) {
+ vcpu->arch.load_eoi_exitmap_pending = false;
+ kvm_make_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu);
+ }
+
+ vcpu->stat.guest_mode = 0;
+}
+
+static inline bool is_guest_mode(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hflags & HF_GUEST_MASK;
}
#endif
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
new file mode 100644
index 000000000000..fb3dab4b5a53
--- /dev/null
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -0,0 +1,575 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/******************************************************************************
+ * x86_emulate.h
+ *
+ * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
+ *
+ * Copyright (c) 2005 Keir Fraser
+ *
+ * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
+ */
+
+#ifndef _ASM_X86_KVM_X86_EMULATE_H
+#define _ASM_X86_KVM_X86_EMULATE_H
+
+#include <asm/desc_defs.h>
+#include "fpu.h"
+
+struct x86_emulate_ctxt;
+enum x86_intercept;
+enum x86_intercept_stage;
+
+struct x86_exception {
+ u8 vector;
+ bool error_code_valid;
+ u16 error_code;
+ bool nested_page_fault;
+ u64 address; /* cr2 or nested page fault gpa */
+ u8 async_page_fault;
+ unsigned long exit_qualification;
+};
+
+/*
+ * This struct is used to carry enough information from the instruction
+ * decoder to main KVM so that a decision can be made whether the
+ * instruction needs to be intercepted or not.
+ */
+struct x86_instruction_info {
+ u8 intercept; /* which intercept */
+ u8 rep_prefix; /* rep prefix? */
+ u8 modrm_mod; /* mod part of modrm */
+ u8 modrm_reg; /* index of register used */
+ u8 modrm_rm; /* rm part of modrm */
+ u64 src_val; /* value of source operand */
+ u64 dst_val; /* value of destination operand */
+ u8 src_bytes; /* size of source operand */
+ u8 dst_bytes; /* size of destination operand */
+ u8 src_type; /* type of source operand */
+ u8 dst_type; /* type of destination operand */
+ u8 ad_bytes; /* size of src/dst address */
+ u64 rip; /* rip of the instruction */
+ u64 next_rip; /* rip following the instruction */
+};
+
+/*
+ * x86_emulate_ops:
+ *
+ * These operations represent the instruction emulator's interface to memory.
+ * There are two categories of operation: those that act on ordinary memory
+ * regions (*_std), and those that act on memory regions known to require
+ * special treatment or emulation (*_emulated).
+ *
+ * The emulator assumes that an instruction accesses only one 'emulated memory'
+ * location, that this location is the given linear faulting address (cr2), and
+ * that this is one of the instruction's data operands. Instruction fetches and
+ * stack operations are assumed never to access emulated memory. The emulator
+ * automatically deduces which operand of a string-move operation is accessing
+ * emulated memory, and assumes that the other operand accesses normal memory.
+ *
+ * NOTES:
+ * 1. The emulator isn't very smart about emulated vs. standard memory.
+ * 'Emulated memory' access addresses should be checked for sanity.
+ * 'Normal memory' accesses may fault, and the caller must arrange to
+ * detect and handle reentrancy into the emulator via recursive faults.
+ * Accesses may be unaligned and may cross page boundaries.
+ * 2. If the access fails (cannot emulate, or a standard access faults) then
+ * it is up to the memop to propagate the fault to the guest VM via
+ * some out-of-band mechanism, unknown to the emulator. The memop signals
+ * failure by returning X86EMUL_PROPAGATE_FAULT to the emulator, which will
+ * then immediately bail.
+ * 3. Valid access sizes are 1, 2, 4 and 8 bytes. On x86/32 systems only
+ *    cmpxchg8b_emulated needs to support 8-byte accesses.
+ * 4. The emulator cannot handle 64-bit mode emulation on an x86/32 system.
+ */
+/* Access completed successfully: continue emulation as normal. */
+#define X86EMUL_CONTINUE 0
+/* Access is unhandleable: bail from emulation and return error to caller. */
+#define X86EMUL_UNHANDLEABLE 1
+/* Terminate emulation but return success to the caller. */
+#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */
+#define X86EMUL_RETRY_INSTR 3 /* retry the instruction for some reason */
+#define X86EMUL_CMPXCHG_FAILED 4 /* cmpxchg did not see expected value */
+#define X86EMUL_IO_NEEDED 5 /* IO is needed to complete emulation */
+#define X86EMUL_INTERCEPTED 6 /* Intercepted by nested VMCB/VMCS */
+/* Emulation during event vectoring is unhandleable. */
+#define X86EMUL_UNHANDLEABLE_VECTORING 7
+
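
To make the return-code contract concrete, here is a hedged sketch of a ->read_emulated()-style memop (demo_gva_to_gpa() and demo_mmio_read() are hypothetical helpers, not KVM APIs): a guest-visible fault is described in *fault and propagated, pending I/O asks the caller to exit, and success lets emulation continue.

static int demo_read_emulated(struct x86_emulate_ctxt *ctxt,
			      unsigned long addr, void *val,
			      unsigned int bytes,
			      struct x86_exception *fault)
{
	u64 gpa;

	/* Translate the linear address; on e.g. #PF, fill @fault and bail. */
	if (demo_gva_to_gpa(ctxt, addr, &gpa, fault))	/* hypothetical */
		return X86EMUL_PROPAGATE_FAULT;

	/* MMIO needs a round trip to the device model before emulation can finish. */
	if (demo_mmio_read(ctxt, gpa, val, bytes))	/* hypothetical */
		return X86EMUL_IO_NEEDED;

	return X86EMUL_CONTINUE;
}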
+/* x86-specific emulation flags */
+#define X86EMUL_F_WRITE BIT(0)
+#define X86EMUL_F_FETCH BIT(1)
+#define X86EMUL_F_IMPLICIT BIT(2)
+#define X86EMUL_F_INVLPG BIT(3)
+#define X86EMUL_F_MSR BIT(4)
+#define X86EMUL_F_DT_LOAD BIT(5)
+
+struct x86_emulate_ops {
+ void (*vm_bugged)(struct x86_emulate_ctxt *ctxt);
+ /*
+ * read_gpr: read a general purpose register (rax - r15)
+ *
+ * @reg: gpr number.
+ */
+ ulong (*read_gpr)(struct x86_emulate_ctxt *ctxt, unsigned reg);
+ /*
+ * write_gpr: write a general purpose register (rax - r15)
+ *
+ * @reg: gpr number.
+ * @val: value to write.
+ */
+ void (*write_gpr)(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val);
+ /*
+ * read_std: Read bytes of standard (non-emulated/special) memory.
+ * Used for descriptor reading.
+ * @addr: [IN ] Linear address from which to read.
+ * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
+ * @bytes: [IN ] Number of bytes to read from memory.
+ * @system:[IN ] Whether the access is forced to be at CPL0.
+ */
+ int (*read_std)(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr, void *val,
+ unsigned int bytes,
+ struct x86_exception *fault, bool system);
+
+ /*
+ * write_std: Write bytes of standard (non-emulated/special) memory.
+ * Used for descriptor writing.
+ * @addr: [IN ] Linear address to which to write.
+	 *  @val:   [IN ] Value to write to memory.
+ * @bytes: [IN ] Number of bytes to write to memory.
+ * @system:[IN ] Whether the access is forced to be at CPL0.
+ */
+ int (*write_std)(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr, void *val, unsigned int bytes,
+ struct x86_exception *fault, bool system);
+ /*
+ * fetch: Read bytes of standard (non-emulated/special) memory.
+ * Used for instruction fetch.
+ * @addr: [IN ] Linear address from which to read.
+ * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
+ * @bytes: [IN ] Number of bytes to read from memory.
+ */
+ int (*fetch)(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr, void *val, unsigned int bytes,
+ struct x86_exception *fault);
+
+ /*
+ * read_emulated: Read bytes from emulated/special memory area.
+ * @addr: [IN ] Linear address from which to read.
+ * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
+ * @bytes: [IN ] Number of bytes to read from memory.
+ */
+ int (*read_emulated)(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr, void *val, unsigned int bytes,
+ struct x86_exception *fault);
+
+ /*
+ * write_emulated: Write bytes to emulated/special memory area.
+ * @addr: [IN ] Linear address to which to write.
+ * @val: [IN ] Value to write to memory (low-order bytes used as
+ * required).
+ * @bytes: [IN ] Number of bytes to write to memory.
+ */
+ int (*write_emulated)(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr, const void *val,
+ unsigned int bytes,
+ struct x86_exception *fault);
+
+ /*
+ * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an
+ * emulated/special memory area.
+ * @addr: [IN ] Linear address to access.
+ * @old: [IN ] Value expected to be current at @addr.
+ * @new: [IN ] Value to write to @addr.
+ * @bytes: [IN ] Number of bytes to access using CMPXCHG.
+ */
+ int (*cmpxchg_emulated)(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr,
+ const void *old,
+ const void *new,
+ unsigned int bytes,
+ struct x86_exception *fault);
+ void (*invlpg)(struct x86_emulate_ctxt *ctxt, ulong addr);
+
+ int (*pio_in_emulated)(struct x86_emulate_ctxt *ctxt,
+ int size, unsigned short port, void *val,
+ unsigned int count);
+
+ int (*pio_out_emulated)(struct x86_emulate_ctxt *ctxt,
+ int size, unsigned short port, const void *val,
+ unsigned int count);
+
+ bool (*get_segment)(struct x86_emulate_ctxt *ctxt, u16 *selector,
+ struct desc_struct *desc, u32 *base3, int seg);
+ void (*set_segment)(struct x86_emulate_ctxt *ctxt, u16 selector,
+ struct desc_struct *desc, u32 base3, int seg);
+ unsigned long (*get_cached_segment_base)(struct x86_emulate_ctxt *ctxt,
+ int seg);
+ void (*get_gdt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
+ void (*get_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
+ void (*set_gdt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
+ void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
+ ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr);
+ int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val);
+ int (*cpl)(struct x86_emulate_ctxt *ctxt);
+ ulong (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr);
+ int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
+ int (*set_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
+ int (*get_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
+ int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
+ int (*check_rdpmc_early)(struct x86_emulate_ctxt *ctxt, u32 pmc);
+ int (*read_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata);
+ void (*halt)(struct x86_emulate_ctxt *ctxt);
+ void (*wbinvd)(struct x86_emulate_ctxt *ctxt);
+ int (*fix_hypercall)(struct x86_emulate_ctxt *ctxt);
+ int (*intercept)(struct x86_emulate_ctxt *ctxt,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage);
+
+ bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx,
+ u32 *ecx, u32 *edx, bool exact_only);
+ bool (*guest_has_movbe)(struct x86_emulate_ctxt *ctxt);
+ bool (*guest_has_fxsr)(struct x86_emulate_ctxt *ctxt);
+ bool (*guest_has_rdpid)(struct x86_emulate_ctxt *ctxt);
+ bool (*guest_cpuid_is_intel_compatible)(struct x86_emulate_ctxt *ctxt);
+
+ void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked);
+
+ bool (*is_smm)(struct x86_emulate_ctxt *ctxt);
+ int (*leave_smm)(struct x86_emulate_ctxt *ctxt);
+ void (*triple_fault)(struct x86_emulate_ctxt *ctxt);
+ int (*get_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 *xcr);
+ int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr);
+
+ gva_t (*get_untagged_addr)(struct x86_emulate_ctxt *ctxt, gva_t addr,
+ unsigned int flags);
+
+ bool (*is_canonical_addr)(struct x86_emulate_ctxt *ctxt, gva_t addr,
+ unsigned int flags);
+};
+
+/* Type, address-of, and value of an instruction's operand. */
+struct operand {
+ enum { OP_REG, OP_MEM, OP_MEM_STR, OP_IMM, OP_XMM, OP_YMM, OP_MM, OP_NONE } type;
+ unsigned int bytes;
+ unsigned int count;
+ union {
+ unsigned long orig_val;
+ u64 orig_val64;
+ };
+ union {
+ unsigned long *reg;
+ struct segmented_address {
+ ulong ea;
+ unsigned seg;
+ } mem;
+ unsigned xmm;
+ unsigned mm;
+ } addr;
+ union {
+ unsigned long val;
+ u64 val64;
+ char valptr[sizeof(avx256_t)];
+ sse128_t vec_val;
+ avx256_t vec_val2;
+ u64 mm_val;
+ void *data;
+ } __aligned(32);
+};
+
+#define X86_MAX_INSTRUCTION_LENGTH 15
+
+struct fetch_cache {
+ u8 data[X86_MAX_INSTRUCTION_LENGTH];
+ u8 *ptr;
+ u8 *end;
+};
+
+struct read_cache {
+ u8 data[1024];
+ unsigned long pos;
+ unsigned long end;
+};
+
+/* Execution mode, passed to the emulator. */
+enum x86emul_mode {
+ X86EMUL_MODE_REAL, /* Real mode. */
+ X86EMUL_MODE_VM86, /* Virtual 8086 mode. */
+ X86EMUL_MODE_PROT16, /* 16-bit protected mode. */
+ X86EMUL_MODE_PROT32, /* 32-bit protected mode. */
+ X86EMUL_MODE_PROT64, /* 64-bit (long) mode. */
+};
+
+/*
+ * fastop functions are declared as taking a never-defined fastop parameter,
+ * so they can't be called from C directly.
+ */
+struct fastop;
+
+typedef void (*fastop_t)(struct fastop *);
+
+/*
+ * The emulator's _regs array tracks only the GPRs, i.e. excludes RIP. RIP is
+ * tracked/accessed via _eip, and except for RIP relative addressing, which
+ * also uses _eip, RIP cannot be a register operand nor can it be an operand in
+ * a ModRM or SIB byte.
+ */
+#ifdef CONFIG_X86_64
+#define NR_EMULATOR_GPRS 16
+#else
+#define NR_EMULATOR_GPRS 8
+#endif
+
+/*
+ * Distinguish between no prefix, a REX prefix, and (in the future) a REX2 prefix.
+ */
+enum rex_type {
+ REX_NONE,
+ REX_PREFIX,
+};
+
+struct x86_emulate_ctxt {
+ void *vcpu;
+ const struct x86_emulate_ops *ops;
+
+ /* Register state before/after emulation. */
+ unsigned long eflags;
+ unsigned long eip; /* eip before instruction emulation */
+ /* Emulated execution mode, represented by an X86EMUL_MODE value. */
+ enum x86emul_mode mode;
+
+ /* interruptibility state, as a result of execution of STI or MOV SS */
+ int interruptibility;
+
+ bool perm_ok; /* do not check permissions if true */
+ bool tf; /* TF value before instruction (after for syscall/sysret) */
+
+ bool have_exception;
+ struct x86_exception exception;
+
+ /* GPA available */
+ bool gpa_available;
+ gpa_t gpa_val;
+
+ /*
+ * decode cache
+ */
+
+ /* current opcode length in bytes */
+ u8 opcode_len;
+ u8 b;
+ u8 intercept;
+ bool op_prefix;
+ u8 op_bytes;
+ u8 ad_bytes;
+ union {
+ int (*execute)(struct x86_emulate_ctxt *ctxt);
+ fastop_t fop;
+ };
+ int (*check_perm)(struct x86_emulate_ctxt *ctxt);
+
+ bool rip_relative;
+ enum rex_type rex_prefix;
+ u8 rex_bits;
+ u8 lock_prefix;
+ u8 rep_prefix;
+ /* bitmaps of registers in _regs[] that can be read */
+ u16 regs_valid;
+ /* bitmaps of registers in _regs[] that have been written */
+ u16 regs_dirty;
+ /* modrm */
+ u8 modrm;
+ u8 modrm_mod;
+ u8 modrm_reg;
+ u8 modrm_rm;
+ u8 modrm_seg;
+ u8 seg_override;
+ u64 d;
+ unsigned long _eip;
+
+ /* Here begins the usercopy section. */
+ struct operand src;
+ struct operand src2;
+ struct operand dst;
+ struct operand memop;
+ unsigned long _regs[NR_EMULATOR_GPRS];
+ struct operand *memopp;
+ struct fetch_cache fetch;
+ struct read_cache io_read;
+ struct read_cache mem_read;
+ bool is_branch;
+};
+
+#define KVM_EMULATOR_BUG_ON(cond, ctxt) \
+({ \
+ int __ret = (cond); \
+ \
+ if (WARN_ON_ONCE(__ret)) \
+ ctxt->ops->vm_bugged(ctxt); \
+ unlikely(__ret); \
+})
+
+/* Repeat String Operation Prefix */
+#define REPE_PREFIX 0xf3
+#define REPNE_PREFIX 0xf2
+
+/* CPUID vendors */
+#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx 0x68747541
+#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx 0x444d4163
+#define X86EMUL_CPUID_VENDOR_AuthenticAMD_edx 0x69746e65
+
+#define X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx 0x69444d41
+#define X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx 0x21726574
+#define X86EMUL_CPUID_VENDOR_AMDisbetterI_edx 0x74656273
+
+#define X86EMUL_CPUID_VENDOR_HygonGenuine_ebx 0x6f677948
+#define X86EMUL_CPUID_VENDOR_HygonGenuine_ecx 0x656e6975
+#define X86EMUL_CPUID_VENDOR_HygonGenuine_edx 0x6e65476e
+
+#define X86EMUL_CPUID_VENDOR_GenuineIntel_ebx 0x756e6547
+#define X86EMUL_CPUID_VENDOR_GenuineIntel_ecx 0x6c65746e
+#define X86EMUL_CPUID_VENDOR_GenuineIntel_edx 0x49656e69
+
+#define X86EMUL_CPUID_VENDOR_CentaurHauls_ebx 0x746e6543
+#define X86EMUL_CPUID_VENDOR_CentaurHauls_ecx 0x736c7561
+#define X86EMUL_CPUID_VENDOR_CentaurHauls_edx 0x48727561
+
+static inline bool is_guest_vendor_intel(u32 ebx, u32 ecx, u32 edx)
+{
+ return ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx &&
+ ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx &&
+ edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx;
+}
+
+static inline bool is_guest_vendor_amd(u32 ebx, u32 ecx, u32 edx)
+{
+ return (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx &&
+ ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx &&
+ edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx) ||
+ (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx &&
+ ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx &&
+ edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx);
+}
+
+static inline bool is_guest_vendor_hygon(u32 ebx, u32 ecx, u32 edx)
+{
+ return ebx == X86EMUL_CPUID_VENDOR_HygonGenuine_ebx &&
+ ecx == X86EMUL_CPUID_VENDOR_HygonGenuine_ecx &&
+ edx == X86EMUL_CPUID_VENDOR_HygonGenuine_edx;
+}
+
+enum x86_intercept_stage {
+ X86_ICTP_NONE = 0, /* Allow zero-init to not match anything */
+ X86_ICPT_PRE_EXCEPT,
+ X86_ICPT_POST_EXCEPT,
+ X86_ICPT_POST_MEMACCESS,
+};
+
+enum x86_intercept {
+ x86_intercept_none,
+ x86_intercept_cr_read,
+ x86_intercept_cr_write,
+ x86_intercept_clts,
+ x86_intercept_lmsw,
+ x86_intercept_smsw,
+ x86_intercept_dr_read,
+ x86_intercept_dr_write,
+ x86_intercept_lidt,
+ x86_intercept_sidt,
+ x86_intercept_lgdt,
+ x86_intercept_sgdt,
+ x86_intercept_lldt,
+ x86_intercept_sldt,
+ x86_intercept_ltr,
+ x86_intercept_str,
+ x86_intercept_rdtsc,
+ x86_intercept_rdpmc,
+ x86_intercept_pushf,
+ x86_intercept_popf,
+ x86_intercept_cpuid,
+ x86_intercept_rsm,
+ x86_intercept_iret,
+ x86_intercept_intn,
+ x86_intercept_invd,
+ x86_intercept_pause,
+ x86_intercept_hlt,
+ x86_intercept_invlpg,
+ x86_intercept_invlpga,
+ x86_intercept_vmrun,
+ x86_intercept_vmload,
+ x86_intercept_vmsave,
+ x86_intercept_vmmcall,
+ x86_intercept_stgi,
+ x86_intercept_clgi,
+ x86_intercept_skinit,
+ x86_intercept_rdtscp,
+ x86_intercept_rdpid,
+ x86_intercept_icebp,
+ x86_intercept_wbinvd,
+ x86_intercept_monitor,
+ x86_intercept_mwait,
+ x86_intercept_rdmsr,
+ x86_intercept_wrmsr,
+ x86_intercept_in,
+ x86_intercept_ins,
+ x86_intercept_out,
+ x86_intercept_outs,
+ x86_intercept_xsetbv,
+
+ nr_x86_intercepts
+};
+
+/* Host execution mode. */
+#if defined(CONFIG_X86_32)
+#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32
+#elif defined(CONFIG_X86_64)
+#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
+#endif
+
+int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int emulation_type);
+bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt);
+#define EMULATION_FAILED -1
+#define EMULATION_OK 0
+#define EMULATION_RESTART 1
+#define EMULATION_INTERCEPTED 2
+void init_decode_cache(struct x86_emulate_ctxt *ctxt);
+int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, bool check_intercepts);
+int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
+ u16 tss_selector, int idt_index, int reason,
+ bool has_error_code, u32 error_code);
+int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq);
+void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt);
+void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt);
+bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt);
+
+static inline ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr)
+{
+ if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt))
+ nr &= NR_EMULATOR_GPRS - 1;
+
+ if (!(ctxt->regs_valid & (1 << nr))) {
+ ctxt->regs_valid |= 1 << nr;
+ ctxt->_regs[nr] = ctxt->ops->read_gpr(ctxt, nr);
+ }
+ return ctxt->_regs[nr];
+}
+
+static inline ulong *reg_write(struct x86_emulate_ctxt *ctxt, unsigned nr)
+{
+ if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt))
+ nr &= NR_EMULATOR_GPRS - 1;
+
+ BUILD_BUG_ON(sizeof(ctxt->regs_dirty) * BITS_PER_BYTE < NR_EMULATOR_GPRS);
+ BUILD_BUG_ON(sizeof(ctxt->regs_valid) * BITS_PER_BYTE < NR_EMULATOR_GPRS);
+
+ ctxt->regs_valid |= 1 << nr;
+ ctxt->regs_dirty |= 1 << nr;
+ return &ctxt->_regs[nr];
+}
+
+static inline ulong *reg_rmw(struct x86_emulate_ctxt *ctxt, unsigned nr)
+{
+ reg_read(ctxt, nr);
+ return reg_write(ctxt, nr);
+}
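
The regs_valid/regs_dirty bitmaps above make the emulator's GPR accesses lazy; a sketch of the flush side (the real writeback lives in the emulator proper, this version is purely illustrative) shows how only locally modified registers are pushed back through the ops vtable:

static void demo_writeback_registers(struct x86_emulate_ctxt *ctxt)
{
	unsigned long dirty = ctxt->regs_dirty;
	unsigned int reg;

	/* Only registers written via reg_write()/reg_rmw() are flushed. */
	for_each_set_bit(reg, &dirty, NR_EMULATOR_GPRS)
		ctxt->ops->write_gpr(ctxt, reg, ctxt->_regs[reg]);
}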
+
+#endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/kvm/kvm_onhyperv.c b/arch/x86/kvm/kvm_onhyperv.c
new file mode 100644
index 000000000000..ee53e75a60cb
--- /dev/null
+++ b/arch/x86/kvm/kvm_onhyperv.c
@@ -0,0 +1,124 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KVM L1 hypervisor optimizations on Hyper-V.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_host.h>
+#include <asm/mshyperv.h>
+
+#include "hyperv.h"
+#include "kvm_onhyperv.h"
+
+struct kvm_hv_tlb_range {
+ u64 start_gfn;
+ u64 pages;
+};
+
+static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush,
+ void *data)
+{
+ struct kvm_hv_tlb_range *range = data;
+
+ return hyperv_fill_flush_guest_mapping_list(flush, range->start_gfn,
+ range->pages);
+}
+
+static inline int hv_remote_flush_root_tdp(hpa_t root_tdp,
+ struct kvm_hv_tlb_range *range)
+{
+ if (range)
+ return hyperv_flush_guest_mapping_range(root_tdp,
+ kvm_fill_hv_flush_list_func, (void *)range);
+ else
+ return hyperv_flush_guest_mapping(root_tdp);
+}
+
+static int __hv_flush_remote_tlbs_range(struct kvm *kvm,
+ struct kvm_hv_tlb_range *range)
+{
+ struct kvm_arch *kvm_arch = &kvm->arch;
+ struct kvm_vcpu *vcpu;
+ int ret = 0, nr_unique_valid_roots;
+ unsigned long i;
+ hpa_t root;
+
+ spin_lock(&kvm_arch->hv_root_tdp_lock);
+
+ if (!VALID_PAGE(kvm_arch->hv_root_tdp)) {
+ nr_unique_valid_roots = 0;
+
+ /*
+ * Flush all valid roots, and see if all vCPUs have converged
+ * on a common root, in which case future flushes can skip the
+ * loop and flush the common root.
+ */
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ root = vcpu->arch.hv_root_tdp;
+ if (!VALID_PAGE(root) || root == kvm_arch->hv_root_tdp)
+ continue;
+
+ /*
+ * Set the tracked root to the first valid root. Keep
+ * this root for the entirety of the loop even if more
+ * roots are encountered as a low effort optimization
+ * to avoid flushing the same (first) root again.
+ */
+ if (++nr_unique_valid_roots == 1)
+ kvm_arch->hv_root_tdp = root;
+
+ if (!ret)
+ ret = hv_remote_flush_root_tdp(root, range);
+
+ /*
+ * Stop processing roots if a failure occurred and
+ * multiple valid roots have already been detected.
+ */
+ if (ret && nr_unique_valid_roots > 1)
+ break;
+ }
+
+ /*
+ * The optimized flush of a single root can't be used if there
+ * are multiple valid roots (obviously).
+ */
+ if (nr_unique_valid_roots > 1)
+ kvm_arch->hv_root_tdp = INVALID_PAGE;
+ } else {
+ ret = hv_remote_flush_root_tdp(kvm_arch->hv_root_tdp, range);
+ }
+
+ spin_unlock(&kvm_arch->hv_root_tdp_lock);
+ return ret;
+}
+
+int hv_flush_remote_tlbs_range(struct kvm *kvm, gfn_t start_gfn, gfn_t nr_pages)
+{
+ struct kvm_hv_tlb_range range = {
+ .start_gfn = start_gfn,
+ .pages = nr_pages,
+ };
+
+ return __hv_flush_remote_tlbs_range(kvm, &range);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(hv_flush_remote_tlbs_range);
+
+int hv_flush_remote_tlbs(struct kvm *kvm)
+{
+ return __hv_flush_remote_tlbs_range(kvm, NULL);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(hv_flush_remote_tlbs);
+
+void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp)
+{
+ struct kvm_arch *kvm_arch = &vcpu->kvm->arch;
+
+ if (kvm_x86_ops.flush_remote_tlbs == hv_flush_remote_tlbs) {
+ spin_lock(&kvm_arch->hv_root_tdp_lock);
+ vcpu->arch.hv_root_tdp = root_tdp;
+ if (root_tdp != kvm_arch->hv_root_tdp)
+ kvm_arch->hv_root_tdp = INVALID_PAGE;
+ spin_unlock(&kvm_arch->hv_root_tdp_lock);
+ }
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(hv_track_root_tdp);
diff --git a/arch/x86/kvm/kvm_onhyperv.h b/arch/x86/kvm/kvm_onhyperv.h
new file mode 100644
index 000000000000..eefab3dc8498
--- /dev/null
+++ b/arch/x86/kvm/kvm_onhyperv.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * KVM L1 hypervisor optimizations on Hyper-V.
+ */
+
+#ifndef __ARCH_X86_KVM_KVM_ONHYPERV_H__
+#define __ARCH_X86_KVM_KVM_ONHYPERV_H__
+
+#if IS_ENABLED(CONFIG_HYPERV)
+int hv_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, gfn_t nr_pages);
+int hv_flush_remote_tlbs(struct kvm *kvm);
+void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp);
+static inline hpa_t hv_get_partition_assist_page(struct kvm_vcpu *vcpu)
+{
+ /*
+	 * The partition assist page is something that Hyper-V running in L0
+	 * requires from KVM running in L1 before direct TLB flush for L2
+	 * guests can be enabled. KVM doesn't currently use the page, but to
+	 * comply with the TLFS it still needs to be allocated. For now, this
+	 * is a single page shared among all vCPUs.
+ */
+ struct hv_partition_assist_pg **p_hv_pa_pg =
+ &vcpu->kvm->arch.hv_pa_pg;
+
+ if (!*p_hv_pa_pg)
+ *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT);
+
+ if (!*p_hv_pa_pg)
+ return INVALID_PAGE;
+
+ return __pa(*p_hv_pa_pg);
+}
+#else /* !CONFIG_HYPERV */
+static inline int hv_flush_remote_tlbs(struct kvm *kvm)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp)
+{
+}
+#endif /* !CONFIG_HYPERV */
+
+#endif
diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h
deleted file mode 100644
index ed66e4c078dc..000000000000
--- a/arch/x86/kvm/kvm_svm.h
+++ /dev/null
@@ -1,51 +0,0 @@
-#ifndef __KVM_SVM_H
-#define __KVM_SVM_H
-
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/list.h>
-#include <linux/kvm_host.h>
-#include <asm/msr.h>
-
-#include <asm/svm.h>
-
-static const u32 host_save_user_msrs[] = {
-#ifdef CONFIG_X86_64
- MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
- MSR_FS_BASE,
-#endif
- MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
-};
-
-#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
-
-struct kvm_vcpu;
-
-struct vcpu_svm {
- struct kvm_vcpu vcpu;
- struct vmcb *vmcb;
- unsigned long vmcb_pa;
- struct svm_cpu_data *svm_data;
- uint64_t asid_generation;
-
- u64 next_rip;
-
- u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
- u64 host_gs_base;
- unsigned long host_cr2;
-
- u32 *msrpm;
- struct vmcb *hsave;
- u64 hsave_msr;
-
- u64 nested_vmcb;
-
- /* These are the merged vectors */
- u32 *nested_msrpm;
-
- /* gpa pointers to the real vectors */
- u64 nested_vmcb_msrpm;
-};
-
-#endif
-
diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h
deleted file mode 100644
index 26bd6ba74e1c..000000000000
--- a/arch/x86/kvm/kvm_timer.h
+++ /dev/null
@@ -1,18 +0,0 @@
-
-struct kvm_timer {
- struct hrtimer timer;
- s64 period; /* unit: ns */
- atomic_t pending; /* accumulated triggered timers */
- bool reinject;
- struct kvm_timer_ops *t_ops;
- struct kvm *kvm;
- int vcpu_id;
-};
-
-struct kvm_timer_ops {
- bool (*is_periodic)(struct kvm_timer *);
-};
-
-
-enum hrtimer_restart kvm_timer_fn(struct hrtimer *data);
-
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index ae99d83f81a3..1597dd0b0cc6 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Local APIC virtualization
@@ -5,6 +6,7 @@
* Copyright (C) 2006 Qumranet, Inc.
* Copyright (C) 2007 Novell
* Copyright (C) 2007 Intel
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
*
* Authors:
* Dor Laor <dor.laor@qumranet.com>
@@ -12,10 +14,8 @@
* Yaozu (Eddie) Dong <eddie.dong@intel.com>
*
* Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation.
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include <linux/kvm.h>
@@ -24,16 +24,28 @@
#include <linux/smp.h>
#include <linux/hrtimer.h>
#include <linux/io.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/math64.h>
+#include <linux/slab.h>
+#include <asm/apic.h>
#include <asm/processor.h>
+#include <asm/mce.h>
#include <asm/msr.h>
#include <asm/page.h>
#include <asm/current.h>
#include <asm/apicdef.h>
-#include <asm/atomic.h>
+#include <asm/delay.h>
+#include <linux/atomic.h>
+#include <linux/jump_label.h>
#include "kvm_cache_regs.h"
#include "irq.h"
+#include "ioapic.h"
+#include "trace.h"
+#include "x86.h"
+#include "xen.h"
+#include "cpuid.h"
+#include "hyperv.h"
+#include "smm.h"
#ifndef CONFIG_X86_64
#define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
@@ -41,99 +53,480 @@
#define mod_64(x, y) ((x) % (y))
#endif
-#define PRId64 "d"
-#define PRIx64 "llx"
-#define PRIu64 "u"
-#define PRIo64 "o"
+/* 0x14 is the local APIC version for Xeon and Pentium (SDM 8.4.8). */
+#define APIC_VERSION 0x14UL
+#define LAPIC_MMIO_LENGTH (1 << 12)
-#define APIC_BUS_CYCLE_NS 1
+/*
+ * Enable local APIC timer advancement (tscdeadline mode only) with adaptive
+ * tuning. When enabled, KVM programs the host timer event to fire early, i.e.
+ * before the deadline expires, to account for the delay between taking the
+ * VM-Exit (to inject the guest event) and the subsequent VM-Enter to resume
+ * the guest, i.e. so that the interrupt arrives in the guest with minimal
+ * latency relative to the deadline programmed by the guest.
+ */
+static bool lapic_timer_advance __read_mostly = true;
+module_param(lapic_timer_advance, bool, 0444);
-/* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */
-#define apic_debug(fmt, arg...)
+#define LAPIC_TIMER_ADVANCE_ADJUST_MIN 100 /* clock cycles */
+#define LAPIC_TIMER_ADVANCE_ADJUST_MAX 10000 /* clock cycles */
+#define LAPIC_TIMER_ADVANCE_NS_INIT 1000
+#define LAPIC_TIMER_ADVANCE_NS_MAX 5000
+/* step-by-step approximation to mitigate fluctuation */
+#define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8
-#define APIC_LVT_NUM 6
-/* 14 is the version for Xeon and Pentium 8.4.8*/
-#define APIC_VERSION (0x14UL | ((APIC_LVT_NUM - 1) << 16))
-#define LAPIC_MMIO_LENGTH (1 << 12)
-/* followed define is not in apicdef.h */
-#define APIC_SHORT_MASK 0xc0000
-#define APIC_DEST_NOSHORT 0x0
-#define APIC_DEST_MASK 0x800
-#define MAX_APIC_VECTOR 256
+static bool __read_mostly vector_hashing_enabled = true;
+module_param_named(vector_hashing, vector_hashing_enabled, bool, 0444);
-#define VEC_POS(v) ((v) & (32 - 1))
-#define REG_POS(v) (((v) >> 5) << 4)
+static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data);
+static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data);
-static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off)
+static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
{
- return *((u32 *) (apic->regs + reg_off));
+ apic_set_reg(apic->regs, reg_off, val);
}
-static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
+static __always_inline u64 kvm_lapic_get_reg64(struct kvm_lapic *apic, int reg)
{
- *((u32 *) (apic->regs + reg_off)) = val;
+ return apic_get_reg64(apic->regs, reg);
}
-static inline int apic_test_and_set_vector(int vec, void *bitmap)
+static __always_inline void kvm_lapic_set_reg64(struct kvm_lapic *apic,
+ int reg, u64 val)
{
- return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+ apic_set_reg64(apic->regs, reg, val);
}
-static inline int apic_test_and_clear_vector(int vec, void *bitmap)
+bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector)
{
- return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ return apic_test_vector(vector, apic->regs + APIC_ISR) ||
+ apic_test_vector(vector, apic->regs + APIC_IRR);
}
-static inline void apic_set_vector(int vec, void *bitmap)
+__read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_has_noapic_vcpu);
+
+__read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_hw_disabled, HZ);
+__read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_sw_disabled, HZ);
+
+static inline int apic_enabled(struct kvm_lapic *apic)
{
- set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+ return kvm_apic_sw_enabled(apic) && kvm_apic_hw_enabled(apic);
}
-static inline void apic_clear_vector(int vec, void *bitmap)
+#define LVT_MASK \
+ (APIC_LVT_MASKED | APIC_SEND_PENDING | APIC_VECTOR_MASK)
+
+#define LINT_MASK \
+ (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
+ APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
+
+static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
{
- clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+ return apic->vcpu->vcpu_id;
}
-static inline int apic_hw_enabled(struct kvm_lapic *apic)
+static bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu)
{
- return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
+ return pi_inject_timer && kvm_vcpu_apicv_active(vcpu) &&
+ (kvm_mwait_in_guest(vcpu->kvm) || kvm_hlt_in_guest(vcpu->kvm));
}
-static inline int apic_sw_enabled(struct kvm_lapic *apic)
+static bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu)
{
- return apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED;
+ return kvm_x86_ops.set_hv_timer
+ && !(kvm_mwait_in_guest(vcpu->kvm) ||
+ kvm_can_post_timer_interrupt(vcpu));
}
-static inline int apic_enabled(struct kvm_lapic *apic)
+static bool kvm_use_posted_timer_interrupt(struct kvm_vcpu *vcpu)
{
- return apic_sw_enabled(apic) && apic_hw_enabled(apic);
+ return kvm_can_post_timer_interrupt(vcpu) && vcpu->mode == IN_GUEST_MODE;
}
-#define LVT_MASK \
- (APIC_LVT_MASKED | APIC_SEND_PENDING | APIC_VECTOR_MASK)
+static inline u32 kvm_apic_calc_x2apic_ldr(u32 id)
+{
+ return ((id >> 4) << 16) | (1 << (id & 0xf));
+}
-#define LINT_MASK \
- (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
- APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
+static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map,
+ u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) {
+ switch (map->logical_mode) {
+ case KVM_APIC_MODE_SW_DISABLED:
+ /* Arbitrarily use the flat map so that @cluster isn't NULL. */
+ *cluster = map->xapic_flat_map;
+ *mask = 0;
+ return true;
+ case KVM_APIC_MODE_X2APIC: {
+ u32 offset = (dest_id >> 16) * 16;
+ u32 max_apic_id = map->max_apic_id;
+
+ if (offset <= max_apic_id) {
+ u8 cluster_size = min(max_apic_id - offset + 1, 16U);
+
+ offset = array_index_nospec(offset, map->max_apic_id + 1);
+ *cluster = &map->phys_map[offset];
+ *mask = dest_id & (0xffff >> (16 - cluster_size));
+ } else {
+ *mask = 0;
+ }
+
+ return true;
+ }
+ case KVM_APIC_MODE_XAPIC_FLAT:
+ *cluster = map->xapic_flat_map;
+ *mask = dest_id & 0xff;
+ return true;
+ case KVM_APIC_MODE_XAPIC_CLUSTER:
+ *cluster = map->xapic_cluster_map[(dest_id >> 4) & 0xf];
+ *mask = dest_id & 0xf;
+ return true;
+ case KVM_APIC_MODE_MAP_DISABLED:
+ return false;
+ default:
+ WARN_ON_ONCE(1);
+ return false;
+ }
+}
+
+static int kvm_recalculate_phys_map(struct kvm_apic_map *new,
+ struct kvm_vcpu *vcpu,
+ bool *xapic_id_mismatch)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 x2apic_id = kvm_x2apic_id(apic);
+ u32 xapic_id = kvm_xapic_id(apic);
+ u32 physical_id;
+
+ /*
+ * For simplicity, KVM always allocates enough space for all possible
+ * xAPIC IDs. Yell, but don't kill the VM, as KVM can continue on
+ * without the optimized map.
+ */
+ if (WARN_ON_ONCE(xapic_id > new->max_apic_id))
+ return -EINVAL;
+
+ /*
+ * Bail if a vCPU was added and/or enabled its APIC between allocating
+ * the map and doing the actual calculations for the map. Note, KVM
+ * hardcodes the x2APIC ID to vcpu_id, i.e. there's no TOCTOU bug if
+ * the compiler decides to reload x2apic_id after this check.
+ */
+ if (x2apic_id > new->max_apic_id)
+ return -E2BIG;
+
+ /*
+ * Deliberately truncate the vCPU ID when detecting a mismatched APIC
+ * ID to avoid false positives if the vCPU ID, i.e. x2APIC ID, is a
+	 * 32-bit value. Any unwanted aliasing that results from the truncation
+	 * will be detected below.
+ */
+ if (!apic_x2apic_mode(apic) && xapic_id != (u8)vcpu->vcpu_id)
+ *xapic_id_mismatch = true;
+
+ /*
+	 * Apply KVM's hotplug hack if userspace has enabled 32-bit APIC IDs.
+ * Allow sending events to vCPUs by their x2APIC ID even if the target
+ * vCPU is in legacy xAPIC mode, and silently ignore aliased xAPIC IDs
+ * (the x2APIC ID is truncated to 8 bits, causing IDs > 0xff to wrap
+ * and collide).
+ *
+ * Honor the architectural (and KVM's non-optimized) behavior if
+ * userspace has not enabled 32-bit x2APIC IDs. Each APIC is supposed
+ * to process messages independently. If multiple vCPUs have the same
+ * effective APIC ID, e.g. due to the x2APIC wrap or because the guest
+ * manually modified its xAPIC IDs, events targeting that ID are
+ * supposed to be recognized by all vCPUs with said ID.
+ */
+ if (vcpu->kvm->arch.x2apic_format) {
+ /* See also kvm_apic_match_physical_addr(). */
+ if (apic_x2apic_mode(apic) || x2apic_id > 0xff)
+ new->phys_map[x2apic_id] = apic;
+
+ if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id])
+ new->phys_map[xapic_id] = apic;
+ } else {
+ /*
+ * Disable the optimized map if the physical APIC ID is already
+ * mapped, i.e. is aliased to multiple vCPUs. The optimized
+ * map requires a strict 1:1 mapping between IDs and vCPUs.
+ */
+ if (apic_x2apic_mode(apic))
+ physical_id = x2apic_id;
+ else
+ physical_id = xapic_id;
+
+ if (new->phys_map[physical_id])
+ return -EINVAL;
+
+ new->phys_map[physical_id] = apic;
+ }
+
+ return 0;
+}
+
+static void kvm_recalculate_logical_map(struct kvm_apic_map *new,
+ struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ enum kvm_apic_logical_mode logical_mode;
+ struct kvm_lapic **cluster;
+ u16 mask;
+ u32 ldr;
+
+ if (new->logical_mode == KVM_APIC_MODE_MAP_DISABLED)
+ return;
+
+ if (!kvm_apic_sw_enabled(apic))
+ return;
+
+ ldr = kvm_lapic_get_reg(apic, APIC_LDR);
+ if (!ldr)
+ return;
+
+ if (apic_x2apic_mode(apic)) {
+ logical_mode = KVM_APIC_MODE_X2APIC;
+ } else {
+ ldr = GET_APIC_LOGICAL_ID(ldr);
+ if (kvm_lapic_get_reg(apic, APIC_DFR) == APIC_DFR_FLAT)
+ logical_mode = KVM_APIC_MODE_XAPIC_FLAT;
+ else
+ logical_mode = KVM_APIC_MODE_XAPIC_CLUSTER;
+ }
-static inline int kvm_apic_id(struct kvm_lapic *apic)
+ /*
+ * To optimize logical mode delivery, all software-enabled APICs must
+ * be configured for the same mode.
+ */
+ if (new->logical_mode == KVM_APIC_MODE_SW_DISABLED) {
+ new->logical_mode = logical_mode;
+ } else if (new->logical_mode != logical_mode) {
+ new->logical_mode = KVM_APIC_MODE_MAP_DISABLED;
+ return;
+ }
+
+ /*
+ * In x2APIC mode, the LDR is read-only and derived directly from the
+ * x2APIC ID, thus is guaranteed to be addressable. KVM reuses
+ * kvm_apic_map.phys_map to optimize logical mode x2APIC interrupts by
+	 * reversing the LDR calculation to get the cluster of APICs, i.e. no
+ * additional work is required.
+ */
+ if (apic_x2apic_mode(apic))
+ return;
+
+ if (WARN_ON_ONCE(!kvm_apic_map_get_logical_dest(new, ldr,
+ &cluster, &mask))) {
+ new->logical_mode = KVM_APIC_MODE_MAP_DISABLED;
+ return;
+ }
+
+ if (!mask)
+ return;
+
+ ldr = ffs(mask) - 1;
+ if (!is_power_of_2(mask) || cluster[ldr])
+ new->logical_mode = KVM_APIC_MODE_MAP_DISABLED;
+ else
+ cluster[ldr] = apic;
+}
+
+/*
+ * CLEAN -> DIRTY and UPDATE_IN_PROGRESS -> DIRTY changes happen without a lock.
+ *
+ * DIRTY -> UPDATE_IN_PROGRESS and UPDATE_IN_PROGRESS -> CLEAN happen with
+ * apic_map_lock_held.
+ */
+enum {
+ CLEAN,
+ UPDATE_IN_PROGRESS,
+ DIRTY
+};
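
A compact restatement of those ordering rules (illustrative only; the real transitions are in kvm_recalculate_apic_map() and the kvm_apic_set_*() helpers below):

static void demo_mark_map_dirty(struct kvm *kvm)
{
	/* CLEAN/UPDATE_IN_PROGRESS -> DIRTY: lock-free, release-ordered so the
	 * APIC register updates are visible before the flag. */
	atomic_set_release(&kvm->arch.apic_map_dirty, DIRTY);
}

static bool demo_start_recalc(struct kvm *kvm)
{
	lockdep_assert_held(&kvm->arch.apic_map_lock);

	/* DIRTY -> UPDATE_IN_PROGRESS, pairing with the release above. */
	return atomic_cmpxchg_acquire(&kvm->arch.apic_map_dirty,
				      DIRTY, UPDATE_IN_PROGRESS) != CLEAN;
}

static void demo_finish_recalc(struct kvm *kvm)
{
	/* UPDATE_IN_PROGRESS -> CLEAN; leave DIRTY if a new update raced in. */
	atomic_cmpxchg_release(&kvm->arch.apic_map_dirty,
			       UPDATE_IN_PROGRESS, CLEAN);
}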
+
+static void kvm_recalculate_apic_map(struct kvm *kvm)
+{
+ struct kvm_apic_map *new, *old = NULL;
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+ u32 max_id = 255; /* enough space for any xAPIC ID */
+ bool xapic_id_mismatch;
+ int r;
+
+ /* Read kvm->arch.apic_map_dirty before kvm->arch.apic_map. */
+ if (atomic_read_acquire(&kvm->arch.apic_map_dirty) == CLEAN)
+ return;
+
+ WARN_ONCE(!irqchip_in_kernel(kvm),
+ "Dirty APIC map without an in-kernel local APIC");
+
+ mutex_lock(&kvm->arch.apic_map_lock);
+
+retry:
+ /*
+ * Read kvm->arch.apic_map_dirty before kvm->arch.apic_map (if clean)
+ * or the APIC registers (if dirty). Note, on retry the map may have
+ * not yet been marked dirty by whatever task changed a vCPU's x2APIC
+ * ID, i.e. the map may still show up as in-progress. In that case
+ * this task still needs to retry and complete its calculation.
+ */
+ if (atomic_cmpxchg_acquire(&kvm->arch.apic_map_dirty,
+ DIRTY, UPDATE_IN_PROGRESS) == CLEAN) {
+ /* Someone else has updated the map. */
+ mutex_unlock(&kvm->arch.apic_map_lock);
+ return;
+ }
+
+ /*
+ * Reset the mismatch flag between attempts so that KVM does the right
+ * thing if a vCPU changes its xAPIC ID, but do NOT reset max_id, i.e.
+ * keep max_id strictly increasing. Disallowing max_id from shrinking
+ * ensures KVM won't get stuck in an infinite loop, e.g. if the vCPU
+ * with the highest x2APIC ID is toggling its APIC on and off.
+ */
+ xapic_id_mismatch = false;
+
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ if (kvm_apic_present(vcpu))
+ max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic));
+
+ new = kvzalloc(sizeof(struct kvm_apic_map) +
+ sizeof(struct kvm_lapic *) * ((u64)max_id + 1),
+ GFP_KERNEL_ACCOUNT);
+
+ if (!new)
+ goto out;
+
+ new->max_apic_id = max_id;
+ new->logical_mode = KVM_APIC_MODE_SW_DISABLED;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!kvm_apic_present(vcpu))
+ continue;
+
+ r = kvm_recalculate_phys_map(new, vcpu, &xapic_id_mismatch);
+ if (r) {
+ kvfree(new);
+ new = NULL;
+ if (r == -E2BIG) {
+ cond_resched();
+ goto retry;
+ }
+
+ goto out;
+ }
+
+ kvm_recalculate_logical_map(new, vcpu);
+ }
+out:
+ /*
+ * The optimized map is effectively KVM's internal version of APICv,
+ * and all unwanted aliasing that results in disabling the optimized
+ * map also applies to APICv.
+ */
+ if (!new)
+ kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED);
+ else
+ kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED);
+
+ if (!new || new->logical_mode == KVM_APIC_MODE_MAP_DISABLED)
+ kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED);
+ else
+ kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED);
+
+ if (xapic_id_mismatch)
+ kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED);
+ else
+ kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED);
+
+ old = rcu_dereference_protected(kvm->arch.apic_map,
+ lockdep_is_held(&kvm->arch.apic_map_lock));
+ rcu_assign_pointer(kvm->arch.apic_map, new);
+ /*
+	 * Write kvm->arch.apic_map before clearing kvm->arch.apic_map_dirty.
+ * If another update has come in, leave it DIRTY.
+ */
+ atomic_cmpxchg_release(&kvm->arch.apic_map_dirty,
+ UPDATE_IN_PROGRESS, CLEAN);
+ mutex_unlock(&kvm->arch.apic_map_lock);
+
+ if (old)
+ kvfree_rcu(old, rcu);
+
+ kvm_make_scan_ioapic_request(kvm);
+}
+
+static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
{
- return (apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
+ bool enabled = val & APIC_SPIV_APIC_ENABLED;
+
+ kvm_lapic_set_reg(apic, APIC_SPIV, val);
+
+ if (enabled != apic->sw_enabled) {
+ apic->sw_enabled = enabled;
+ if (enabled)
+ static_branch_slow_dec_deferred(&apic_sw_disabled);
+ else
+ static_branch_inc(&apic_sw_disabled.key);
+
+ atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
+ }
+
+ /* Check if there are APF page ready requests pending */
+ if (enabled) {
+ kvm_make_request(KVM_REQ_APF_READY, apic->vcpu);
+ kvm_xen_sw_enable_lapic(apic->vcpu);
+ }
+}
+
+static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id)
+{
+ kvm_lapic_set_reg(apic, APIC_ID, id << 24);
+ atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
+}
+
+static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
+{
+ kvm_lapic_set_reg(apic, APIC_LDR, id);
+ atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
+}
+
+static inline void kvm_apic_set_dfr(struct kvm_lapic *apic, u32 val)
+{
+ kvm_lapic_set_reg(apic, APIC_DFR, val);
+ atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
+}
+
+static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id)
+{
+ u32 ldr = kvm_apic_calc_x2apic_ldr(id);
+
+ WARN_ON_ONCE(id != apic->vcpu->vcpu_id);
+
+ kvm_lapic_set_reg(apic, APIC_ID, id);
+ kvm_lapic_set_reg(apic, APIC_LDR, ldr);
+ atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
}
static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
{
- return !(apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
+ return !(kvm_lapic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
}
-static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type)
+static inline int apic_lvtt_oneshot(struct kvm_lapic *apic)
{
- return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK;
+ return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_ONESHOT;
}
static inline int apic_lvtt_period(struct kvm_lapic *apic)
{
- return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC;
+ return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_PERIODIC;
+}
+
+static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic)
+{
+ return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_TSCDEADLINE;
}
static inline int apic_lvt_nmi_mode(u32 lvt_val)
@@ -141,88 +534,398 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val)
return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI;
}
-static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
- LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */
- LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
- LVT_MASK | APIC_MODE_MASK, /* LVTPC */
- LINT_MASK, LINT_MASK, /* LVT0-1 */
- LVT_MASK /* LVTERR */
+static inline bool kvm_lapic_lvt_supported(struct kvm_lapic *apic, int lvt_index)
+{
+ return apic->nr_lvt_entries > lvt_index;
+}
+
+static inline int kvm_apic_calc_nr_lvt_entries(struct kvm_vcpu *vcpu)
+{
+ return KVM_APIC_MAX_NR_LVT_ENTRIES - !(vcpu->arch.mcg_cap & MCG_CMCI_P);
+}
+
+void kvm_apic_set_version(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 v = 0;
+
+ if (!lapic_in_kernel(vcpu))
+ return;
+
+ v = APIC_VERSION | ((apic->nr_lvt_entries - 1) << 16);
+
+ /*
+	 * KVM emulates the 82093AA datasheet (with the in-kernel IOAPIC
+	 * implementation), which doesn't have an EOI register. Some buggy OSes
+	 * (e.g. Windows with the Hyper-V role) disable EOI broadcast in the
+	 * LAPIC without first checking the IOAPIC version, so level-triggered
+	 * interrupts never get EOIed in the IOAPIC.
+ */
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_X2APIC) &&
+ !ioapic_in_kernel(vcpu->kvm))
+ v |= APIC_LVR_DIRECTED_EOI;
+ kvm_lapic_set_reg(apic, APIC_LVR, v);
+}
+
+void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu)
+{
+ int nr_lvt_entries = kvm_apic_calc_nr_lvt_entries(vcpu);
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ int i;
+
+ if (!lapic_in_kernel(vcpu) || nr_lvt_entries == apic->nr_lvt_entries)
+ return;
+
+ /* Initialize/mask any "new" LVT entries. */
+ for (i = apic->nr_lvt_entries; i < nr_lvt_entries; i++)
+ kvm_lapic_set_reg(apic, APIC_LVTx(i), APIC_LVT_MASKED);
+
+ apic->nr_lvt_entries = nr_lvt_entries;
+
+ /* The number of LVT entries is reflected in the version register. */
+ kvm_apic_set_version(vcpu);
+}
+
+static const unsigned int apic_lvt_mask[KVM_APIC_MAX_NR_LVT_ENTRIES] = {
+ [LVT_TIMER] = LVT_MASK, /* timer mode mask added at runtime */
+ [LVT_THERMAL_MONITOR] = LVT_MASK | APIC_MODE_MASK,
+ [LVT_PERFORMANCE_COUNTER] = LVT_MASK | APIC_MODE_MASK,
+ [LVT_LINT0] = LINT_MASK,
+ [LVT_LINT1] = LINT_MASK,
+ [LVT_ERROR] = LVT_MASK,
+ [LVT_CMCI] = LVT_MASK | APIC_MODE_MASK
};
-static int find_highest_vector(void *bitmap)
+static u8 count_vectors(void *bitmap)
{
- u32 *word = bitmap;
- int word_offset = MAX_APIC_VECTOR >> 5;
+ int vec;
+ u32 *reg;
+ u8 count = 0;
- while ((word_offset != 0) && (word[(--word_offset) << 2] == 0))
- continue;
+ for (vec = 0; vec < MAX_APIC_VECTOR; vec += APIC_VECTORS_PER_REG) {
+ reg = bitmap + APIC_VECTOR_TO_REG_OFFSET(vec);
+ count += hweight32(*reg);
+ }
- if (likely(!word_offset && !word[0]))
- return -1;
- else
- return fls(word[word_offset << 2]) - 1 + (word_offset << 5);
+ return count;
}
-static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)
+bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr)
{
- return apic_test_and_set_vector(vec, apic->regs + APIC_IRR);
+ unsigned long pir_vals[NR_PIR_WORDS];
+ u32 *__pir = (void *)pir_vals;
+ u32 i, vec;
+ u32 irr_val, prev_irr_val;
+ int max_updated_irr;
+
+ max_updated_irr = -1;
+ *max_irr = -1;
+
+ if (!pi_harvest_pir(pir, pir_vals))
+ return false;
+
+ for (i = vec = 0; i <= 7; i++, vec += 32) {
+ u32 *p_irr = (u32 *)(regs + APIC_IRR + i * 0x10);
+
+ irr_val = READ_ONCE(*p_irr);
+
+ if (__pir[i]) {
+ prev_irr_val = irr_val;
+ do {
+ irr_val = prev_irr_val | __pir[i];
+ } while (prev_irr_val != irr_val &&
+ !try_cmpxchg(p_irr, &prev_irr_val, irr_val));
+
+ if (prev_irr_val != irr_val)
+ max_updated_irr = __fls(irr_val ^ prev_irr_val) + vec;
+ }
+ if (irr_val)
+ *max_irr = __fls(irr_val) + vec;
+ }
+
+ return ((max_updated_irr != -1) &&
+ (max_updated_irr == *max_irr));
}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_apic_update_irr);
-static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
+bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr)
{
- apic_clear_vector(vec, apic->regs + APIC_IRR);
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ bool irr_updated = __kvm_apic_update_irr(pir, apic->regs, max_irr);
+
+ if (unlikely(!apic->apicv_active && irr_updated))
+ apic->irr_pending = true;
+ return irr_updated;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_update_irr);
+
+static inline int apic_search_irr(struct kvm_lapic *apic)
+{
+ return apic_find_highest_vector(apic->regs + APIC_IRR);
}
static inline int apic_find_highest_irr(struct kvm_lapic *apic)
{
int result;
- result = find_highest_vector(apic->regs + APIC_IRR);
+ /*
+	 * Note that irr_pending is just a hint. It will always be
+	 * true with virtual interrupt delivery enabled.
+ */
+ if (!apic->irr_pending)
+ return -1;
+
+ result = apic_search_irr(apic);
ASSERT(result == -1 || result >= 16);
return result;
}
-int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
+static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
+{
+ if (unlikely(apic->apicv_active)) {
+ apic_clear_vector(vec, apic->regs + APIC_IRR);
+ } else {
+ apic->irr_pending = false;
+ apic_clear_vector(vec, apic->regs + APIC_IRR);
+ if (apic_search_irr(apic) != -1)
+ apic->irr_pending = true;
+ }
+}
+
+void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec)
+{
+ apic_clear_irr(vec, vcpu->arch.apic);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_clear_irr);
+
+static void *apic_vector_to_isr(int vec, struct kvm_lapic *apic)
+{
+ return apic->regs + APIC_ISR + APIC_VECTOR_TO_REG_OFFSET(vec);
+}
+
+static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
+{
+ if (__test_and_set_bit(APIC_VECTOR_TO_BIT_NUMBER(vec),
+ apic_vector_to_isr(vec, apic)))
+ return;
+
+ /*
+ * With APIC virtualization enabled, all caching is disabled
+ * because the processor can modify ISR under the hood. Instead
+ * just set SVI.
+ */
+ if (unlikely(apic->apicv_active))
+ kvm_x86_call(hwapic_isr_update)(apic->vcpu, vec);
+ else {
+ ++apic->isr_count;
+ BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
+ /*
+ * ISR (in service register) bit is set when injecting an interrupt.
+ * The highest vector is injected. Thus the latest bit set matches
+ * the highest bit in ISR.
+ */
+ apic->highest_isr_cache = vec;
+ }
+}
+
+static inline int apic_find_highest_isr(struct kvm_lapic *apic)
+{
+ int result;
+
+ /*
+ * Note that isr_count is always 1, and highest_isr_cache
+ * is always -1, with APIC virtualization enabled.
+ */
+ if (!apic->isr_count)
+ return -1;
+ if (likely(apic->highest_isr_cache != -1))
+ return apic->highest_isr_cache;
+
+ result = apic_find_highest_vector(apic->regs + APIC_ISR);
+ ASSERT(result == -1 || result >= 16);
+
+ return result;
+}
+
+static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
+{
+ if (!__test_and_clear_bit(APIC_VECTOR_TO_BIT_NUMBER(vec),
+ apic_vector_to_isr(vec, apic)))
+ return;
+
+ /*
+ * We do get here for APIC virtualization enabled if the guest
+ * uses the Hyper-V APIC enlightenment. In this case we may need
+ * to trigger a new interrupt delivery by writing the SVI field;
+ * on the other hand isr_count and highest_isr_cache are unused
+ * and must be left alone.
+ */
+ if (unlikely(apic->apicv_active))
+ kvm_x86_call(hwapic_isr_update)(apic->vcpu, apic_find_highest_isr(apic));
+ else {
+ --apic->isr_count;
+ BUG_ON(apic->isr_count < 0);
+ apic->highest_isr_cache = -1;
+ }
+}
+
+void kvm_apic_update_hwapic_isr(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- int highest_irr;
- if (!apic)
- return 0;
- highest_irr = apic_find_highest_irr(apic);
+ if (WARN_ON_ONCE(!lapic_in_kernel(vcpu)) || !apic->apicv_active)
+ return;
- return highest_irr;
+ kvm_x86_call(hwapic_isr_update)(vcpu, apic_find_highest_isr(apic));
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_update_hwapic_isr);
+
+int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
+{
+ /* This may race with setting of irr in __apic_accept_irq() and
+	/*
+	 * This may race with the setting of irr in __apic_accept_irq(), and the
+	 * value returned may be wrong, but kvm_vcpu_kick() in __apic_accept_irq()
+	 * will cause a vmexit immediately and the value will be recalculated
+	 * on the next vmentry.
+ return apic_find_highest_irr(vcpu->arch.apic);
}
-EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lapic_find_highest_irr);
static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
- int vector, int level, int trig_mode);
+ int vector, int level, int trig_mode,
+ struct dest_map *dest_map);
-int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq)
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
+ struct dest_map *dest_map)
{
struct kvm_lapic *apic = vcpu->arch.apic;
return __apic_accept_irq(apic, irq->delivery_mode, irq->vector,
- irq->level, irq->trig_mode);
+ irq->level, irq->trig_mode, dest_map);
}
-static inline int apic_find_highest_isr(struct kvm_lapic *apic)
+static int __pv_send_ipi(unsigned long *ipi_bitmap, struct kvm_apic_map *map,
+ struct kvm_lapic_irq *irq, u32 min)
{
- int result;
+ int i, count = 0;
+ struct kvm_vcpu *vcpu;
- result = find_highest_vector(apic->regs + APIC_ISR);
- ASSERT(result == -1 || result >= 16);
+ if (min > map->max_apic_id)
+ return 0;
- return result;
+ min = array_index_nospec(min, map->max_apic_id + 1);
+
+ for_each_set_bit(i, ipi_bitmap,
+ min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) {
+ if (map->phys_map[min + i]) {
+ vcpu = map->phys_map[min + i]->vcpu;
+ count += kvm_apic_set_irq(vcpu, irq, NULL);
+ }
+ }
+
+ return count;
}
-static void apic_update_ppr(struct kvm_lapic *apic)
+int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
+ unsigned long ipi_bitmap_high, u32 min,
+ unsigned long icr, int op_64_bit)
{
- u32 tpr, isrv, ppr;
+ struct kvm_apic_map *map;
+ struct kvm_lapic_irq irq = {0};
+ int cluster_size = op_64_bit ? 64 : 32;
+ int count;
+
+ if (icr & (APIC_DEST_MASK | APIC_SHORT_MASK))
+ return -KVM_EINVAL;
+
+ irq.vector = icr & APIC_VECTOR_MASK;
+ irq.delivery_mode = icr & APIC_MODE_MASK;
+ irq.level = (icr & APIC_INT_ASSERT) != 0;
+ irq.trig_mode = icr & APIC_INT_LEVELTRIG;
+
+ rcu_read_lock();
+ map = rcu_dereference(kvm->arch.apic_map);
+
+ count = -EOPNOTSUPP;
+ if (likely(map)) {
+ count = __pv_send_ipi(&ipi_bitmap_low, map, &irq, min);
+ min += cluster_size;
+ count += __pv_send_ipi(&ipi_bitmap_high, map, &irq, min);
+ }
+
+ rcu_read_unlock();
+ return count;
+}
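+
+/*
+ * Illustration of the bitmap encoding handled above: bit i of ipi_bitmap_low
+ * targets APIC ID (min + i) and bit j of ipi_bitmap_high targets APIC ID
+ * (min + cluster_size + j), with cluster_size being 64 for a 64-bit
+ * hypercall and 32 otherwise.  E.g. with op_64_bit and min == 8, bit 3 of
+ * ipi_bitmap_low reaches APIC ID 11 and bit 0 of ipi_bitmap_high reaches
+ * APIC ID 72.
+ */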
+
+static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val)
+{
+
+ return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val,
+ sizeof(val));
+}
+
+static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val)
+{
+
+ return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val,
+ sizeof(*val));
+}
+
+static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED;
+}
+
+static void pv_eoi_set_pending(struct kvm_vcpu *vcpu)
+{
+ if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0)
+ return;
+
+ __set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
+}
+
+static bool pv_eoi_test_and_clr_pending(struct kvm_vcpu *vcpu)
+{
+ u8 val;
+
+ if (pv_eoi_get_user(vcpu, &val) < 0)
+ return false;
+
+ val &= KVM_PV_EOI_ENABLED;
+
+ if (val && pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0)
+ return false;
+
+ /*
+ * Clear pending bit in any case: it will be set again on vmentry.
+ * While this might not be ideal from performance point of view,
+ * this makes sure pv eoi is only enabled when we know it's safe.
+ */
+ __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
+
+ return val;
+}
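+
+/*
+ * Rough shape of the handshake implemented by the helpers above: before
+ * entering the guest, KVM can write KVM_PV_EOI_ENABLED into the guest page
+ * cached in vcpu->arch.pv_eoi and note it in apic_attention; a guest using
+ * PV EOI clears that byte instead of writing the EOI register, and on the
+ * next exit pv_eoi_test_and_clr_pending() reports whether the byte was
+ * still set while disarming both the guest byte and the attention bit.
+ */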
+
+static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr)
+{
+ int highest_irr;
+ if (kvm_x86_ops.sync_pir_to_irr)
+ highest_irr = kvm_x86_call(sync_pir_to_irr)(apic->vcpu);
+ else
+ highest_irr = apic_find_highest_irr(apic);
+ if (highest_irr == -1 || (highest_irr & 0xF0) <= ppr)
+ return -1;
+ return highest_irr;
+}
+
+static bool __apic_update_ppr(struct kvm_lapic *apic, u32 *new_ppr)
+{
+ u32 tpr, isrv, ppr, old_ppr;
int isr;
- tpr = apic_get_reg(apic, APIC_TASKPRI);
+ old_ppr = kvm_lapic_get_reg(apic, APIC_PROCPRI);
+ tpr = kvm_lapic_get_reg(apic, APIC_TASKPRI);
isr = apic_find_highest_isr(apic);
isrv = (isr != -1) ? isr : 0;
@@ -231,85 +934,421 @@ static void apic_update_ppr(struct kvm_lapic *apic)
else
ppr = isrv & 0xf0;
- apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x",
- apic, ppr, isr, isrv);
+ *new_ppr = ppr;
+ if (old_ppr != ppr)
+ kvm_lapic_set_reg(apic, APIC_PROCPRI, ppr);
- apic_set_reg(apic, APIC_PROCPRI, ppr);
+ return ppr < old_ppr;
}
+static void apic_update_ppr(struct kvm_lapic *apic)
+{
+ u32 ppr;
+
+ if (__apic_update_ppr(apic, &ppr) &&
+ apic_has_interrupt_for_ppr(apic, ppr) != -1)
+ kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
+}
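+
+/*
+ * Worked example of the PPR rule above: with TPR == 0x30 and the highest
+ * in-service vector == 0x51, the in-service class (0x50) wins, so PPR
+ * becomes 0x50 and apic_has_interrupt_for_ppr() will ignore any pending
+ * vector whose class (vector & 0xF0) is <= 0x50.
+ */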
+
+void kvm_apic_update_ppr(struct kvm_vcpu *vcpu)
+{
+ apic_update_ppr(vcpu->arch.apic);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_update_ppr);
+
static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
{
- apic_set_reg(apic, APIC_TASKPRI, tpr);
+ kvm_lapic_set_reg(apic, APIC_TASKPRI, tpr);
apic_update_ppr(apic);
}
-int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest)
+static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 mda)
{
- return dest == 0xff || kvm_apic_id(apic) == dest;
+ return mda == (apic_x2apic_mode(apic) ?
+ X2APIC_BROADCAST : APIC_BROADCAST);
}
-int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
+static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda)
{
- int result = 0;
- u8 logical_id;
+ if (kvm_apic_broadcast(apic, mda))
+ return true;
+
+ /*
+ * Hotplug hack: Accept interrupts for vCPUs in xAPIC mode as if they
+ * were in x2APIC mode if the target APIC ID can't be encoded as an
+ * xAPIC ID. This allows unique addressing of hotplugged vCPUs (which
+ * start in xAPIC mode) with an APIC ID that is unaddressable in xAPIC
+ * mode. Match the x2APIC ID if and only if the target APIC ID can't
+ * be encoded in xAPIC to avoid spurious matches against a vCPU that
+ * changed its (addressable) xAPIC ID (which is writable).
+ */
+ if (apic_x2apic_mode(apic) || mda > 0xff)
+ return mda == kvm_x2apic_id(apic);
+
+ return mda == kvm_xapic_id(apic);
+}
+
+static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
+{
+ u32 logical_id;
- logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR));
+ if (kvm_apic_broadcast(apic, mda))
+ return true;
- switch (apic_get_reg(apic, APIC_DFR)) {
+ logical_id = kvm_lapic_get_reg(apic, APIC_LDR);
+
+ if (apic_x2apic_mode(apic))
+ return ((logical_id >> 16) == (mda >> 16))
+ && (logical_id & mda & 0xffff) != 0;
+
+ logical_id = GET_APIC_LOGICAL_ID(logical_id);
+
+ switch (kvm_lapic_get_reg(apic, APIC_DFR)) {
case APIC_DFR_FLAT:
- if (logical_id & mda)
- result = 1;
- break;
+ return (logical_id & mda) != 0;
case APIC_DFR_CLUSTER:
- if (((logical_id >> 4) == (mda >> 0x4))
- && (logical_id & mda & 0xf))
- result = 1;
- break;
+ return ((logical_id >> 4) == (mda >> 4))
+ && (logical_id & mda & 0xf) != 0;
default:
- printk(KERN_WARNING "Bad DFR vcpu %d: %08x\n",
- apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR));
- break;
+ return false;
}
+}
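+
+/*
+ * Example of the matching above: in x2APIC mode, LDR 0x00020008 (cluster 2,
+ * CPU bit 3) matches MDA 0x0002000c because the clusters are equal and
+ * 0x8 & 0xc is non-zero.  In xAPIC cluster mode, logical ID 0x21 matches
+ * MDA 0x23 (same cluster, intersecting low nibble) but not MDA 0x12.
+ */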
- return result;
+/* The KVM local APIC implementation has two quirks:
+ *
+ * - Real hardware delivers interrupts destined to x2APIC ID > 0xff to LAPICs
+ * in xAPIC mode if the "destination & 0xff" matches its xAPIC ID.
+ * KVM doesn't do that aliasing.
+ *
+ * - in-kernel IOAPIC messages have to be delivered directly to
+ * x2APIC, because the kernel does not support interrupt remapping.
+ * In order to support broadcast without interrupt remapping, x2APIC
+ * rewrites the destination of non-IPI messages from APIC_BROADCAST
+ * to X2APIC_BROADCAST.
+ *
+ * The broadcast quirk can be disabled with KVM_CAP_X2APIC_API. This is
+ * important when userspace wants to use x2APIC-format MSIs, because
+ * APIC_BROADCAST (0xff) is a legal route for "cluster 0, CPUs 0-7".
+ */
+static u32 kvm_apic_mda(struct kvm_vcpu *vcpu, unsigned int dest_id,
+ struct kvm_lapic *source, struct kvm_lapic *target)
+{
+ bool ipi = source != NULL;
+
+ if (!vcpu->kvm->arch.x2apic_broadcast_quirk_disabled &&
+ !ipi && dest_id == APIC_BROADCAST && apic_x2apic_mode(target))
+ return X2APIC_BROADCAST;
+
+ return dest_id;
}
-int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
- int short_hand, int dest, int dest_mode)
+bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+ int shorthand, unsigned int dest, int dest_mode)
{
- int result = 0;
struct kvm_lapic *target = vcpu->arch.apic;
+ u32 mda = kvm_apic_mda(vcpu, dest, source, target);
- apic_debug("target %p, source %p, dest 0x%x, "
- "dest_mode 0x%x, short_hand 0x%x\n",
- target, source, dest, dest_mode, short_hand);
-
- ASSERT(!target);
- switch (short_hand) {
+ ASSERT(target);
+ switch (shorthand) {
case APIC_DEST_NOSHORT:
- if (dest_mode == 0)
- /* Physical mode. */
- result = kvm_apic_match_physical_addr(target, dest);
+ if (dest_mode == APIC_DEST_PHYSICAL)
+ return kvm_apic_match_physical_addr(target, mda);
else
- /* Logical mode. */
- result = kvm_apic_match_logical_addr(target, dest);
- break;
+ return kvm_apic_match_logical_addr(target, mda);
case APIC_DEST_SELF:
- result = (target == source);
- break;
+ return target == source;
case APIC_DEST_ALLINC:
- result = 1;
- break;
+ return true;
case APIC_DEST_ALLBUT:
- result = (target != source);
- break;
+ return target != source;
default:
- printk(KERN_WARNING "Bad dest shorthand value %x\n",
- short_hand);
- break;
+ return false;
}
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_match_dest);
- return result;
+static int kvm_vector_to_index(u32 vector, u32 dest_vcpus,
+ const unsigned long *bitmap, u32 bitmap_size)
+{
+ int idx = find_nth_bit(bitmap, bitmap_size, vector % dest_vcpus);
+
+ BUG_ON(idx >= bitmap_size);
+ return idx;
+}
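+
+/*
+ * With vector hashing, the destination is picked deterministically from the
+ * vector: e.g. for vector 0x61 (97) and four candidate vCPUs, 97 % 4 == 1,
+ * so find_nth_bit() above selects the second set bit of the destination
+ * bitmap.
+ */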
+
+static void kvm_apic_disabled_lapic_found(struct kvm *kvm)
+{
+ if (!kvm->arch.disabled_lapic_found) {
+ kvm->arch.disabled_lapic_found = true;
+ pr_info("Disabled LAPIC found during irq injection\n");
+ }
+}
+
+static bool kvm_apic_is_broadcast_dest(struct kvm *kvm, struct kvm_lapic **src,
+ struct kvm_lapic_irq *irq, struct kvm_apic_map *map)
+{
+ if (kvm->arch.x2apic_broadcast_quirk_disabled) {
+ if ((irq->dest_id == APIC_BROADCAST &&
+ map->logical_mode != KVM_APIC_MODE_X2APIC))
+ return true;
+ if (irq->dest_id == X2APIC_BROADCAST)
+ return true;
+ } else {
+ bool x2apic_ipi = src && *src && apic_x2apic_mode(*src);
+ if (irq->dest_id == (x2apic_ipi ?
+ X2APIC_BROADCAST : APIC_BROADCAST))
+ return true;
+ }
+
+ return false;
+}
+
+static bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq)
+{
+ return (irq->delivery_mode == APIC_DM_LOWEST || irq->msi_redir_hint);
+}
+
+static int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
+{
+ return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
+}
+
+/* Return true if the interrupt can be handled by using *bitmap as an index
+ * mask for valid destinations in the *dst array.
+ * Return false if kvm_apic_map_get_dest_lapic() did nothing useful.
+ * Note: we may have zero kvm_lapic destinations when we return true, which
+ * means that the interrupt should be dropped. In this case, *bitmap would be
+ * zero and *dst undefined.
+ */
+static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm,
+ struct kvm_lapic **src, struct kvm_lapic_irq *irq,
+ struct kvm_apic_map *map, struct kvm_lapic ***dst,
+ unsigned long *bitmap)
+{
+ int i, lowest;
+
+ if (irq->shorthand == APIC_DEST_SELF && src) {
+ *dst = src;
+ *bitmap = 1;
+ return true;
+ } else if (irq->shorthand)
+ return false;
+
+ if (!map || kvm_apic_is_broadcast_dest(kvm, src, irq, map))
+ return false;
+
+ if (irq->dest_mode == APIC_DEST_PHYSICAL) {
+ if (irq->dest_id > map->max_apic_id) {
+ *bitmap = 0;
+ } else {
+ u32 dest_id = array_index_nospec(irq->dest_id, map->max_apic_id + 1);
+ *dst = &map->phys_map[dest_id];
+ *bitmap = 1;
+ }
+ return true;
+ }
+
+ *bitmap = 0;
+ if (!kvm_apic_map_get_logical_dest(map, irq->dest_id, dst,
+ (u16 *)bitmap))
+ return false;
+
+ if (!kvm_lowest_prio_delivery(irq))
+ return true;
+
+ if (!vector_hashing_enabled) {
+ lowest = -1;
+ for_each_set_bit(i, bitmap, 16) {
+ if (!(*dst)[i])
+ continue;
+ if (lowest < 0)
+ lowest = i;
+ else if (kvm_apic_compare_prio((*dst)[i]->vcpu,
+ (*dst)[lowest]->vcpu) < 0)
+ lowest = i;
+ }
+ } else {
+ if (!*bitmap)
+ return true;
+
+ lowest = kvm_vector_to_index(irq->vector, hweight16(*bitmap),
+ bitmap, 16);
+
+ if (!(*dst)[lowest]) {
+ kvm_apic_disabled_lapic_found(kvm);
+ *bitmap = 0;
+ return true;
+ }
+ }
+
+ *bitmap = (lowest >= 0) ? 1 << lowest : 0;
+
+ return true;
+}
+
+bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
+ struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map)
+{
+ struct kvm_apic_map *map;
+ unsigned long bitmap;
+ struct kvm_lapic **dst = NULL;
+ int i;
+ bool ret;
+
+ *r = -1;
+
+ if (irq->shorthand == APIC_DEST_SELF) {
+ if (KVM_BUG_ON(!src, kvm)) {
+ *r = 0;
+ return true;
+ }
+ *r = kvm_apic_set_irq(src->vcpu, irq, dest_map);
+ return true;
+ }
+
+ rcu_read_lock();
+ map = rcu_dereference(kvm->arch.apic_map);
+
+ ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dst, &bitmap);
+ if (ret) {
+ *r = 0;
+ for_each_set_bit(i, &bitmap, 16) {
+ if (!dst[i])
+ continue;
+ *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map);
+ }
+ }
+
+ rcu_read_unlock();
+ return ret;
+}
+
+/*
+ * This routine tries to handle interrupts in posted mode, here is how
+ * it deals with different cases:
+ * - For single-destination interrupts, handle it in posted mode
+ * - Else if vector hashing is enabled and it is a lowest-priority
+ * interrupt, handle it in posted mode and use the following mechanism
+ * to find the destination vCPU.
+ * 1. For lowest-priority interrupts, store all the possible
+ * destination vCPUs in an array.
+ * 2. Use "guest vector % max number of destination vCPUs" to find
+ * the right destination vCPU in the array for the lowest-priority
+ * interrupt.
+ * - Otherwise, use remapped mode to inject the interrupt.
+ */
+static bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm,
+ struct kvm_lapic_irq *irq,
+ struct kvm_vcpu **dest_vcpu)
+{
+ struct kvm_apic_map *map;
+ unsigned long bitmap;
+ struct kvm_lapic **dst = NULL;
+ bool ret = false;
+
+ if (irq->shorthand)
+ return false;
+
+ rcu_read_lock();
+ map = rcu_dereference(kvm->arch.apic_map);
+
+ if (kvm_apic_map_get_dest_lapic(kvm, NULL, irq, map, &dst, &bitmap) &&
+ hweight16(bitmap) == 1) {
+ unsigned long i = find_first_bit(&bitmap, 16);
+
+ if (dst[i]) {
+ *dest_vcpu = dst[i]->vcpu;
+ ret = true;
+ }
+ }
+
+ rcu_read_unlock();
+ return ret;
+}
+
+bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ struct kvm_vcpu **dest_vcpu)
+{
+ int r = 0;
+ unsigned long i;
+ struct kvm_vcpu *vcpu;
+
+ if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu))
+ return true;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!kvm_apic_present(vcpu))
+ continue;
+
+ if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand,
+ irq->dest_id, irq->dest_mode))
+ continue;
+
+ if (++r == 2)
+ return false;
+
+ *dest_vcpu = vcpu;
+ }
+
+ return r == 1;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_intr_is_single_vcpu);
+
+int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
+ struct kvm_lapic_irq *irq, struct dest_map *dest_map)
+{
+ int r = -1;
+ struct kvm_vcpu *vcpu, *lowest = NULL;
+ unsigned long i, dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)];
+ unsigned int dest_vcpus = 0;
+
+ if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map))
+ return r;
+
+ if (irq->dest_mode == APIC_DEST_PHYSICAL &&
+ irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) {
+ pr_info("apic: phys broadcast and lowest prio\n");
+ irq->delivery_mode = APIC_DM_FIXED;
+ }
+
+ memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap));
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!kvm_apic_present(vcpu))
+ continue;
+
+ if (!kvm_apic_match_dest(vcpu, src, irq->shorthand,
+ irq->dest_id, irq->dest_mode))
+ continue;
+
+ if (!kvm_lowest_prio_delivery(irq)) {
+ if (r < 0)
+ r = 0;
+ r += kvm_apic_set_irq(vcpu, irq, dest_map);
+ } else if (kvm_apic_sw_enabled(vcpu->arch.apic)) {
+ if (!vector_hashing_enabled) {
+ if (!lowest)
+ lowest = vcpu;
+ else if (kvm_apic_compare_prio(vcpu, lowest) < 0)
+ lowest = vcpu;
+ } else {
+ __set_bit(i, dest_vcpu_bitmap);
+ dest_vcpus++;
+ }
+ }
+ }
+
+ if (dest_vcpus != 0) {
+ int idx = kvm_vector_to_index(irq->vector, dest_vcpus,
+ dest_vcpu_bitmap, KVM_MAX_VCPUS);
+
+ lowest = kvm_get_vcpu(kvm, idx);
+ }
+
+ if (lowest)
+ r = kvm_apic_set_irq(lowest, irq, dest_map);
+
+ return r;
}
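+
+/*
+ * The return value follows __apic_accept_irq() below: -1 if no destination
+ * matched at all, otherwise the number of LAPICs that accepted the interrupt
+ * (each successful kvm_apic_set_irq() contributes 1, a discarded delivery
+ * contributes 0).
+ */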
/*
@@ -317,41 +1356,56 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
* Return 1 if successfully added and 0 if discarded.
*/
static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
- int vector, int level, int trig_mode)
+ int vector, int level, int trig_mode,
+ struct dest_map *dest_map)
{
int result = 0;
struct kvm_vcpu *vcpu = apic->vcpu;
+ trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
+ trig_mode, vector);
switch (delivery_mode) {
case APIC_DM_LOWEST:
vcpu->arch.apic_arb_prio++;
+ fallthrough;
case APIC_DM_FIXED:
+ if (unlikely(trig_mode && !level))
+ break;
+
/* FIXME add logic for vcpu on reset */
if (unlikely(!apic_enabled(apic)))
break;
- result = !apic_test_and_set_irr(vector, apic);
- if (!result) {
+ result = 1;
+
+ if (dest_map) {
+ __set_bit(vcpu->vcpu_id, dest_map->map);
+ dest_map->vectors[vcpu->vcpu_id] = vector;
+ }
+
+ if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) {
if (trig_mode)
- apic_debug("level trig mode repeatedly for "
- "vector %d", vector);
- break;
+ apic_set_vector(vector, apic->regs + APIC_TMR);
+ else
+ apic_clear_vector(vector, apic->regs + APIC_TMR);
}
- if (trig_mode) {
- apic_debug("level trig mode for vector %d", vector);
- apic_set_vector(vector, apic->regs + APIC_TMR);
- } else
- apic_clear_vector(vector, apic->regs + APIC_TMR);
- kvm_vcpu_kick(vcpu);
+ kvm_x86_call(deliver_interrupt)(apic, delivery_mode,
+ trig_mode, vector);
break;
case APIC_DM_REMRD:
- printk(KERN_DEBUG "Ignoring delivery mode 3\n");
+ result = 1;
+ vcpu->arch.pv.pv_unhalted = 1;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_vcpu_kick(vcpu);
break;
case APIC_DM_SMI:
- printk(KERN_DEBUG "Ignoring guest SMI\n");
+ if (!kvm_inject_smi(vcpu)) {
+ kvm_vcpu_kick(vcpu);
+ result = 1;
+ }
break;
case APIC_DM_NMI:
@@ -361,29 +1415,23 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
break;
case APIC_DM_INIT:
- if (level) {
+ if (!trig_mode || level) {
result = 1;
- if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
- printk(KERN_DEBUG
- "INIT on a runnable vcpu %d\n",
- vcpu->vcpu_id);
- vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
+ /* assumes that there are only KVM_APIC_INIT/SIPI */
+ apic->pending_events = (1UL << KVM_APIC_INIT);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu);
- } else {
- apic_debug("Ignoring de-assert INIT to vcpu %d\n",
- vcpu->vcpu_id);
}
break;
case APIC_DM_STARTUP:
- apic_debug("SIPI to vcpu %d vector 0x%02x\n",
- vcpu->vcpu_id, vector);
- if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
- result = 1;
- vcpu->arch.sipi_vector = vector;
- vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
- kvm_vcpu_kick(vcpu);
- }
+ result = 1;
+ apic->sipi_vector = vector;
+ /* make sure sipi_vector is visible for the receiver */
+ smp_wmb();
+ set_bit(KVM_APIC_SIPI, &apic->pending_events);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_vcpu_kick(vcpu);
break;
case APIC_DM_EXTINT:
@@ -402,77 +1450,178 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
return result;
}
-int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
+/*
+ * This routine identifies the mask of destination vCPUs meant to receive an
+ * IOAPIC interrupt.  It either uses kvm_apic_map_get_dest_lapic() to obtain
+ * the destination vCPU array and set the bitmap, or it walks each available
+ * vCPU and matches it against the destination.
+ */
+void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ unsigned long *vcpu_bitmap)
{
- return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
+ struct kvm_lapic **dest_vcpu = NULL;
+ struct kvm_lapic *src = NULL;
+ struct kvm_apic_map *map;
+ struct kvm_vcpu *vcpu;
+ unsigned long bitmap, i;
+ int vcpu_idx;
+ bool ret;
+
+ rcu_read_lock();
+ map = rcu_dereference(kvm->arch.apic_map);
+
+ ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dest_vcpu,
+ &bitmap);
+ if (ret) {
+ for_each_set_bit(i, &bitmap, 16) {
+ if (!dest_vcpu[i])
+ continue;
+ vcpu_idx = dest_vcpu[i]->vcpu->vcpu_idx;
+ __set_bit(vcpu_idx, vcpu_bitmap);
+ }
+ } else {
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!kvm_apic_present(vcpu))
+ continue;
+ if (!kvm_apic_match_dest(vcpu, NULL,
+ irq->shorthand,
+ irq->dest_id,
+ irq->dest_mode))
+ continue;
+ __set_bit(i, vcpu_bitmap);
+ }
+ }
+ rcu_read_unlock();
+}
+
+static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector)
+{
+ return test_bit(vector, apic->vcpu->arch.ioapic_handled_vectors);
}
-static void apic_set_eoi(struct kvm_lapic *apic)
+static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
+{
+ int __maybe_unused trigger_mode;
+
+ /* Eoi the ioapic only if the ioapic doesn't own the vector. */
+ if (!kvm_ioapic_handles_vector(apic, vector))
+ return;
+
+ /*
+ * If the intercepted EOI is for an IRQ that was pending from previous
+ * routing, then re-scan the I/O APIC routes as EOIs for the IRQ likely
+ * no longer need to be intercepted.
+ */
+ if (apic->vcpu->arch.highest_stale_pending_ioapic_eoi == vector)
+ kvm_make_request(KVM_REQ_SCAN_IOAPIC, apic->vcpu);
+
+ /* Request a KVM exit to inform the userspace IOAPIC. */
+ if (irqchip_split(apic->vcpu->kvm)) {
+ apic->vcpu->arch.pending_ioapic_eoi = vector;
+ kvm_make_request(KVM_REQ_IOAPIC_EOI_EXIT, apic->vcpu);
+ return;
+ }
+
+#ifdef CONFIG_KVM_IOAPIC
+ if (apic_test_vector(vector, apic->regs + APIC_TMR))
+ trigger_mode = IOAPIC_LEVEL_TRIG;
+ else
+ trigger_mode = IOAPIC_EDGE_TRIG;
+
+ kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode);
+#endif
+}
+
+static int apic_set_eoi(struct kvm_lapic *apic)
{
int vector = apic_find_highest_isr(apic);
- int trigger_mode;
+
+ trace_kvm_eoi(apic, vector);
+
/*
* Not every EOI write has a corresponding ISR bit set;
* one example is when the kernel checks the timer in setup_IO_APIC
*/
if (vector == -1)
- return;
+ return vector;
- apic_clear_vector(vector, apic->regs + APIC_ISR);
+ apic_clear_isr(vector, apic);
apic_update_ppr(apic);
- if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
- trigger_mode = IOAPIC_LEVEL_TRIG;
+ if (kvm_hv_synic_has_vector(apic->vcpu, vector))
+ kvm_hv_synic_send_eoi(apic->vcpu, vector);
+
+ kvm_ioapic_send_eoi(apic, vector);
+ kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
+ return vector;
+}
+
+/*
+ * This interface assumes a trap-like exit, which has already finished
+ * the desired side effects, including the vISR and vPPR updates.
+ */
+void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ trace_kvm_eoi(apic, vector);
+
+ kvm_ioapic_send_eoi(apic, vector);
+ kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_set_eoi_accelerated);
+
+static void kvm_icr_to_lapic_irq(struct kvm_lapic *apic, u32 icr_low,
+ u32 icr_high, struct kvm_lapic_irq *irq)
+{
+ /* KVM has no delay and should always clear the BUSY/PENDING flag. */
+ WARN_ON_ONCE(icr_low & APIC_ICR_BUSY);
+
+ irq->vector = icr_low & APIC_VECTOR_MASK;
+ irq->delivery_mode = icr_low & APIC_MODE_MASK;
+ irq->dest_mode = icr_low & APIC_DEST_MASK;
+ irq->level = (icr_low & APIC_INT_ASSERT) != 0;
+ irq->trig_mode = icr_low & APIC_INT_LEVELTRIG;
+ irq->shorthand = icr_low & APIC_SHORT_MASK;
+ irq->msi_redir_hint = false;
+ if (apic_x2apic_mode(apic))
+ irq->dest_id = icr_high;
else
- trigger_mode = IOAPIC_EDGE_TRIG;
- kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
+ irq->dest_id = GET_XAPIC_DEST_FIELD(icr_high);
}
-static void apic_send_ipi(struct kvm_lapic *apic)
+void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high)
{
- u32 icr_low = apic_get_reg(apic, APIC_ICR);
- u32 icr_high = apic_get_reg(apic, APIC_ICR2);
struct kvm_lapic_irq irq;
- irq.vector = icr_low & APIC_VECTOR_MASK;
- irq.delivery_mode = icr_low & APIC_MODE_MASK;
- irq.dest_mode = icr_low & APIC_DEST_MASK;
- irq.level = icr_low & APIC_INT_ASSERT;
- irq.trig_mode = icr_low & APIC_INT_LEVELTRIG;
- irq.shorthand = icr_low & APIC_SHORT_MASK;
- irq.dest_id = GET_APIC_DEST_FIELD(icr_high);
+ kvm_icr_to_lapic_irq(apic, icr_low, icr_high, &irq);
- apic_debug("icr_high 0x%x, icr_low 0x%x, "
- "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
- "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n",
- icr_high, icr_low, irq.shorthand, irq.dest_id,
- irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
- irq.vector);
+ trace_kvm_apic_ipi(icr_low, irq.dest_id);
- kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq);
+ kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL);
}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_send_ipi);
static u32 apic_get_tmcct(struct kvm_lapic *apic)
{
- ktime_t remaining;
+ ktime_t remaining, now;
s64 ns;
- u32 tmcct;
ASSERT(apic != NULL);
/* if initial count is 0, current count should also be 0 */
- if (apic_get_reg(apic, APIC_TMICT) == 0)
+ if (kvm_lapic_get_reg(apic, APIC_TMICT) == 0 ||
+ apic->lapic_timer.period == 0)
return 0;
- remaining = hrtimer_expires_remaining(&apic->lapic_timer.timer);
+ now = ktime_get();
+ remaining = ktime_sub(apic->lapic_timer.target_expiration, now);
if (ktime_to_ns(remaining) < 0)
- remaining = ktime_set(0, 0);
+ remaining = 0;
ns = mod_64(ktime_to_ns(remaining), apic->lapic_timer.period);
- tmcct = div64_u64(ns,
- (APIC_BUS_CYCLE_NS * apic->divide_count));
-
- return tmcct;
+ return div64_u64(ns, (apic->vcpu->kvm->arch.apic_bus_cycle_ns *
+ apic->divide_count));
}
static void __report_tpr_access(struct kvm_lapic *apic, bool write)
@@ -480,7 +1629,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write)
struct kvm_vcpu *vcpu = apic->vcpu;
struct kvm_run *run = vcpu->run;
- set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests);
+ kvm_make_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu);
run->tpr_access.rip = kvm_rip_read(vcpu);
run->tpr_access.is_write = write;
}
@@ -495,48 +1644,104 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
{
u32 val = 0;
- KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler);
-
if (offset >= LAPIC_MMIO_LENGTH)
return 0;
switch (offset) {
case APIC_ARBPRI:
- printk(KERN_WARNING "Access APIC ARBPRI register "
- "which is for P6\n");
break;
case APIC_TMCCT: /* Timer CCR */
+ if (apic_lvtt_tscdeadline(apic))
+ return 0;
+
val = apic_get_tmcct(apic);
break;
-
+ case APIC_PROCPRI:
+ apic_update_ppr(apic);
+ val = kvm_lapic_get_reg(apic, offset);
+ break;
case APIC_TASKPRI:
report_tpr_access(apic, false);
- /* fall thru */
+ fallthrough;
default:
- apic_update_ppr(apic);
- val = apic_get_reg(apic, offset);
+ val = kvm_lapic_get_reg(apic, offset);
break;
}
return val;
}
-static void apic_mmio_read(struct kvm_io_device *this,
- gpa_t address, int len, void *data)
+static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev)
+{
+ return container_of(dev, struct kvm_lapic, dev);
+}
+
+#define APIC_REG_MASK(reg) (1ull << ((reg) >> 4))
+#define APIC_REGS_MASK(first, count) \
+ (APIC_REG_MASK(first) * ((1ull << (count)) - 1))
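+
+/*
+ * Each APIC register occupies 16 bytes of MMIO space, so (reg >> 4) is the
+ * register's bit index: e.g. APIC_TASKPRI (0x80) maps to bit 8, and
+ * APIC_REGS_MASK(APIC_ISR, APIC_ISR_NR) covers the eight ISR words at
+ * offsets 0x100-0x170 (bits 16-23).
+ */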
+
+u64 kvm_lapic_readable_reg_mask(struct kvm_lapic *apic)
+{
+ /* Leave bits '0' for reserved and write-only registers. */
+ u64 valid_reg_mask =
+ APIC_REG_MASK(APIC_ID) |
+ APIC_REG_MASK(APIC_LVR) |
+ APIC_REG_MASK(APIC_TASKPRI) |
+ APIC_REG_MASK(APIC_PROCPRI) |
+ APIC_REG_MASK(APIC_LDR) |
+ APIC_REG_MASK(APIC_SPIV) |
+ APIC_REGS_MASK(APIC_ISR, APIC_ISR_NR) |
+ APIC_REGS_MASK(APIC_TMR, APIC_ISR_NR) |
+ APIC_REGS_MASK(APIC_IRR, APIC_ISR_NR) |
+ APIC_REG_MASK(APIC_ESR) |
+ APIC_REG_MASK(APIC_ICR) |
+ APIC_REG_MASK(APIC_LVTT) |
+ APIC_REG_MASK(APIC_LVTTHMR) |
+ APIC_REG_MASK(APIC_LVTPC) |
+ APIC_REG_MASK(APIC_LVT0) |
+ APIC_REG_MASK(APIC_LVT1) |
+ APIC_REG_MASK(APIC_LVTERR) |
+ APIC_REG_MASK(APIC_TMICT) |
+ APIC_REG_MASK(APIC_TMCCT) |
+ APIC_REG_MASK(APIC_TDCR);
+
+ if (kvm_lapic_lvt_supported(apic, LVT_CMCI))
+ valid_reg_mask |= APIC_REG_MASK(APIC_LVTCMCI);
+
+ /* ARBPRI, DFR, and ICR2 are not valid in x2APIC mode. */
+ if (!apic_x2apic_mode(apic))
+ valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI) |
+ APIC_REG_MASK(APIC_DFR) |
+ APIC_REG_MASK(APIC_ICR2);
+
+ return valid_reg_mask;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lapic_readable_reg_mask);
+
+static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
+ void *data)
{
- struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
- unsigned int offset = address - apic->base_address;
unsigned char alignment = offset & 0xf;
u32 result;
- if ((alignment + len) > 4) {
- printk(KERN_ERR "KVM_APIC_READ: alignment error %lx %d",
- (unsigned long)address, len);
- return;
- }
+ /*
+ * WARN if KVM reads ICR in x2APIC mode, as it's an 8-byte register in
+ * x2APIC and needs to be manually handled by the caller.
+ */
+ WARN_ON_ONCE(apic_x2apic_mode(apic) && offset == APIC_ICR);
+
+ if (alignment + len > 4)
+ return 1;
+
+ if (offset > 0x3f0 ||
+ !(kvm_lapic_readable_reg_mask(apic) & APIC_REG_MASK(offset)))
+ return 1;
+
result = __apic_read(apic, offset & ~0xf);
+ trace_kvm_apic_read(offset, result);
+
switch (len) {
case 1:
case 2:
@@ -548,95 +1753,621 @@ static void apic_mmio_read(struct kvm_io_device *this,
"should be 1,2, or 4 instead\n", len);
break;
}
+ return 0;
+}
+
+static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr)
+{
+ return addr >= apic->base_address &&
+ addr < apic->base_address + LAPIC_MMIO_LENGTH;
+}
+
+static int apic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
+ gpa_t address, int len, void *data)
+{
+ struct kvm_lapic *apic = to_lapic(this);
+ u32 offset = address - apic->base_address;
+
+ if (!apic_mmio_in_range(apic, address))
+ return -EOPNOTSUPP;
+
+ if (!kvm_apic_hw_enabled(apic) || apic_x2apic_mode(apic)) {
+ if (!kvm_check_has_quirk(vcpu->kvm,
+ KVM_X86_QUIRK_LAPIC_MMIO_HOLE))
+ return -EOPNOTSUPP;
+
+ memset(data, 0xff, len);
+ return 0;
+ }
+
+ kvm_lapic_reg_read(apic, offset, len, data);
+
+ return 0;
}
static void update_divide_count(struct kvm_lapic *apic)
{
u32 tmp1, tmp2, tdcr;
- tdcr = apic_get_reg(apic, APIC_TDCR);
+ tdcr = kvm_lapic_get_reg(apic, APIC_TDCR);
tmp1 = tdcr & 0xf;
tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;
apic->divide_count = 0x1 << (tmp2 & 0x7);
-
- apic_debug("timer divide count is 0x%x\n",
- apic->divide_count);
}
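+
+/*
+ * The decode above follows the architectural TDCR encoding (bits 0, 1 and 3
+ * of the low nibble): e.g. tdcr == 0x0 yields tmp2 == 1 and a divide count
+ * of 2, while tdcr == 0xb yields tmp2 == 8 and a divide count of 1
+ * (divide by 1).
+ */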
-static void start_apic_timer(struct kvm_lapic *apic)
+static void limit_periodic_timer_frequency(struct kvm_lapic *apic)
{
- ktime_t now = apic->lapic_timer.timer.base->get_time();
+ /*
+ * Do not allow the guest to program periodic timers with small
+ * interval, since the hrtimers are not throttled by the host
+ * scheduler.
+ */
+ if (apic_lvtt_period(apic) && apic->lapic_timer.period) {
+ s64 min_period = min_timer_period_us * 1000LL;
+
+ if (apic->lapic_timer.period < min_period) {
+ pr_info_once(
+ "vcpu %i: requested %lld ns "
+ "lapic timer period limited to %lld ns\n",
+ apic->vcpu->vcpu_id,
+ apic->lapic_timer.period, min_period);
+ apic->lapic_timer.period = min_period;
+ }
+ }
+}
+
+static void cancel_hv_timer(struct kvm_lapic *apic);
- apic->lapic_timer.period = apic_get_reg(apic, APIC_TMICT) *
- APIC_BUS_CYCLE_NS * apic->divide_count;
+static void cancel_apic_timer(struct kvm_lapic *apic)
+{
+ hrtimer_cancel(&apic->lapic_timer.timer);
+ preempt_disable();
+ if (apic->lapic_timer.hv_timer_in_use)
+ cancel_hv_timer(apic);
+ preempt_enable();
atomic_set(&apic->lapic_timer.pending, 0);
+}
- if (!apic->lapic_timer.period)
+static void apic_update_lvtt(struct kvm_lapic *apic)
+{
+ u32 timer_mode = kvm_lapic_get_reg(apic, APIC_LVTT) &
+ apic->lapic_timer.timer_mode_mask;
+
+ if (apic->lapic_timer.timer_mode != timer_mode) {
+ if (apic_lvtt_tscdeadline(apic) != (timer_mode ==
+ APIC_LVT_TIMER_TSCDEADLINE)) {
+ cancel_apic_timer(apic);
+ kvm_lapic_set_reg(apic, APIC_TMICT, 0);
+ apic->lapic_timer.period = 0;
+ apic->lapic_timer.tscdeadline = 0;
+ }
+ apic->lapic_timer.timer_mode = timer_mode;
+ limit_periodic_timer_frequency(apic);
+ }
+}
+
+/*
+ * On APICv, this test will cause a busy wait
+ * during a higher-priority task.
+ */
+
+static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 reg;
+
+ /*
+ * Assume a timer IRQ was "injected" if the APIC is protected. KVM's
+ * copy of the vIRR is bogus; it's the responsibility of the caller to
+ * precisely check whether or not a timer IRQ is pending.
+ */
+ if (apic->guest_apic_protected)
+ return true;
+
+ reg = kvm_lapic_get_reg(apic, APIC_LVTT);
+ if (kvm_apic_hw_enabled(apic)) {
+ int vec = reg & APIC_VECTOR_MASK;
+ void *bitmap = apic->regs + APIC_ISR;
+
+ if (apic->apicv_active)
+ bitmap = apic->regs + APIC_IRR;
+
+ if (apic_test_vector(vec, bitmap))
+ return true;
+ }
+ return false;
+}
+
+static inline void __wait_lapic_expire(struct kvm_vcpu *vcpu, u64 guest_cycles)
+{
+ u64 timer_advance_ns = vcpu->arch.apic->lapic_timer.timer_advance_ns;
+
+ /*
+ * If the guest TSC is running at a different ratio than the host, then
+ * convert the delay to nanoseconds to achieve an accurate delay. Note
+ * that __delay() uses delay_tsc whenever the hardware has TSC, thus
+ * always for VMX enabled hardware.
+ */
+ if (vcpu->arch.tsc_scaling_ratio == kvm_caps.default_tsc_scaling_ratio) {
+ __delay(min(guest_cycles,
+ nsec_to_cycles(vcpu, timer_advance_ns)));
+ } else {
+ u64 delay_ns = guest_cycles * 1000000ULL;
+ do_div(delay_ns, vcpu->arch.virtual_tsc_khz);
+ ndelay(min_t(u32, delay_ns, timer_advance_ns));
+ }
+}
+
+static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu,
+ s64 advance_expire_delta)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns;
+ u64 ns;
+
+ /* Do not adjust for tiny fluctuations or large random spikes. */
+ if (abs(advance_expire_delta) > LAPIC_TIMER_ADVANCE_ADJUST_MAX ||
+ abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_MIN)
return;
- hrtimer_start(&apic->lapic_timer.timer,
- ktime_add_ns(now, apic->lapic_timer.period),
- HRTIMER_MODE_ABS);
+ /* too early */
+ if (advance_expire_delta < 0) {
+ ns = -advance_expire_delta * 1000000ULL;
+ do_div(ns, vcpu->arch.virtual_tsc_khz);
+ timer_advance_ns -= ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP;
+ } else {
+ /* too late */
+ ns = advance_expire_delta * 1000000ULL;
+ do_div(ns, vcpu->arch.virtual_tsc_khz);
+ timer_advance_ns += ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP;
+ }
- apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
- PRIx64 ", "
- "timer initial count 0x%x, period %lldns, "
- "expire @ 0x%016" PRIx64 ".\n", __func__,
- APIC_BUS_CYCLE_NS, ktime_to_ns(now),
- apic_get_reg(apic, APIC_TMICT),
- apic->lapic_timer.period,
- ktime_to_ns(ktime_add_ns(now,
- apic->lapic_timer.period)));
+ if (unlikely(timer_advance_ns > LAPIC_TIMER_ADVANCE_NS_MAX))
+ timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT;
+ apic->lapic_timer.timer_advance_ns = timer_advance_ns;
}
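+
+/*
+ * Example of the adjustment above: if the timer interrupt arrived 4000
+ * guest-TSC cycles early on a 2 GHz guest TSC (virtual_tsc_khz == 2000000),
+ * that is 2000 ns, so timer_advance_ns shrinks by
+ * 2000 / LAPIC_TIMER_ADVANCE_ADJUST_STEP on this exit (assuming the delta
+ * falls inside the clamp window checked at the top of the function).
+ */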
-static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
+static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u64 guest_tsc, tsc_deadline;
+
+ tsc_deadline = apic->lapic_timer.expired_tscdeadline;
+ apic->lapic_timer.expired_tscdeadline = 0;
+ guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+ trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
+
+ adjust_lapic_timer_advance(vcpu, guest_tsc - tsc_deadline);
+
+ /*
+ * If the timer fired early, reread the TSC to account for the overhead
+ * of the above adjustment to avoid waiting longer than is necessary.
+ */
+ if (guest_tsc < tsc_deadline)
+ guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+
+ if (guest_tsc < tsc_deadline)
+ __wait_lapic_expire(vcpu, tsc_deadline - guest_tsc);
+}
+
+void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
{
- int nmi_wd_enabled = apic_lvt_nmi_mode(apic_get_reg(apic, APIC_LVT0));
+ if (lapic_in_kernel(vcpu) &&
+ vcpu->arch.apic->lapic_timer.expired_tscdeadline &&
+ vcpu->arch.apic->lapic_timer.timer_advance_ns &&
+ lapic_timer_int_injected(vcpu))
+ __kvm_wait_lapic_expire(vcpu);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_wait_lapic_expire);
+
+static void kvm_apic_inject_pending_timer_irqs(struct kvm_lapic *apic)
+{
+ struct kvm_timer *ktimer = &apic->lapic_timer;
+
+ kvm_apic_local_deliver(apic, APIC_LVTT);
+ if (apic_lvtt_tscdeadline(apic)) {
+ ktimer->tscdeadline = 0;
+ } else if (apic_lvtt_oneshot(apic)) {
+ ktimer->tscdeadline = 0;
+ ktimer->target_expiration = 0;
+ }
+}
+
+static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn)
+{
+ struct kvm_vcpu *vcpu = apic->vcpu;
+ struct kvm_timer *ktimer = &apic->lapic_timer;
- if (apic_lvt_nmi_mode(lvt0_val)) {
- if (!nmi_wd_enabled) {
- apic_debug("Receive NMI setting on APIC_LVT0 "
- "for cpu %d\n", apic->vcpu->vcpu_id);
- apic->vcpu->kvm->arch.vapics_in_nmi_mode++;
+ if (atomic_read(&apic->lapic_timer.pending))
+ return;
+
+ if (apic_lvtt_tscdeadline(apic) || ktimer->hv_timer_in_use)
+ ktimer->expired_tscdeadline = ktimer->tscdeadline;
+
+ if (!from_timer_fn && apic->apicv_active) {
+ WARN_ON(kvm_get_running_vcpu() != vcpu);
+ kvm_apic_inject_pending_timer_irqs(apic);
+ return;
+ }
+
+ if (kvm_use_posted_timer_interrupt(apic->vcpu)) {
+ /*
+ * Ensure the guest's timer has truly expired before posting an
+ * interrupt. Open code the relevant checks to avoid querying
+ * lapic_timer_int_injected(), which will be false since the
+ * interrupt isn't yet injected. Waiting until after injecting
+ * is not an option since that won't help a posted interrupt.
+ */
+ if (vcpu->arch.apic->lapic_timer.expired_tscdeadline &&
+ vcpu->arch.apic->lapic_timer.timer_advance_ns)
+ __kvm_wait_lapic_expire(vcpu);
+ kvm_apic_inject_pending_timer_irqs(apic);
+ return;
+ }
+
+ atomic_inc(&apic->lapic_timer.pending);
+ kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
+ if (from_timer_fn)
+ kvm_vcpu_kick(vcpu);
+}
+
+static void start_sw_tscdeadline(struct kvm_lapic *apic)
+{
+ struct kvm_timer *ktimer = &apic->lapic_timer;
+ u64 guest_tsc, tscdeadline = ktimer->tscdeadline;
+ u64 ns = 0;
+ ktime_t expire;
+ struct kvm_vcpu *vcpu = apic->vcpu;
+ u32 this_tsc_khz = vcpu->arch.virtual_tsc_khz;
+ unsigned long flags;
+ ktime_t now;
+
+ if (unlikely(!tscdeadline || !this_tsc_khz))
+ return;
+
+ local_irq_save(flags);
+
+ now = ktime_get();
+ guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+
+ ns = (tscdeadline - guest_tsc) * 1000000ULL;
+ do_div(ns, this_tsc_khz);
+
+ if (likely(tscdeadline > guest_tsc) &&
+ likely(ns > apic->lapic_timer.timer_advance_ns)) {
+ expire = ktime_add_ns(now, ns);
+ expire = ktime_sub_ns(expire, ktimer->timer_advance_ns);
+ hrtimer_start(&ktimer->timer, expire, HRTIMER_MODE_ABS_HARD);
+ } else
+ apic_timer_expired(apic, false);
+
+ local_irq_restore(flags);
+}
+
+static inline u64 tmict_to_ns(struct kvm_lapic *apic, u32 tmict)
+{
+ return (u64)tmict * apic->vcpu->kvm->arch.apic_bus_cycle_ns *
+ (u64)apic->divide_count;
+}
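+
+/*
+ * E.g. assuming the traditional 1 ns APIC bus cycle (APIC_BUS_CYCLE_NS),
+ * TMICT == 100000 with divide_count == 4 yields a 400 us period.
+ */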
+
+static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_divisor)
+{
+ ktime_t now, remaining;
+ u64 ns_remaining_old, ns_remaining_new;
+
+ apic->lapic_timer.period =
+ tmict_to_ns(apic, kvm_lapic_get_reg(apic, APIC_TMICT));
+ limit_periodic_timer_frequency(apic);
+
+ now = ktime_get();
+ remaining = ktime_sub(apic->lapic_timer.target_expiration, now);
+ if (ktime_to_ns(remaining) < 0)
+ remaining = 0;
+
+ ns_remaining_old = ktime_to_ns(remaining);
+ ns_remaining_new = mul_u64_u32_div(ns_remaining_old,
+ apic->divide_count, old_divisor);
+
+ apic->lapic_timer.tscdeadline +=
+ nsec_to_cycles(apic->vcpu, ns_remaining_new) -
+ nsec_to_cycles(apic->vcpu, ns_remaining_old);
+ apic->lapic_timer.target_expiration = ktime_add_ns(now, ns_remaining_new);
+}
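+
+/*
+ * I.e. the time left is rescaled by new/old divisor: 300 us remaining with
+ * an old divisor of 2 becomes 1200 us when the guest switches to a divisor
+ * of 8, and both the hrtimer target and the TSC deadline move by the same
+ * amount.
+ */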
+
+static bool set_target_expiration(struct kvm_lapic *apic, u32 count_reg)
+{
+ ktime_t now;
+ u64 tscl = rdtsc();
+ s64 deadline;
+
+ now = ktime_get();
+ apic->lapic_timer.period =
+ tmict_to_ns(apic, kvm_lapic_get_reg(apic, APIC_TMICT));
+
+ if (!apic->lapic_timer.period) {
+ apic->lapic_timer.tscdeadline = 0;
+ return false;
+ }
+
+ limit_periodic_timer_frequency(apic);
+ deadline = apic->lapic_timer.period;
+
+ if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
+ if (unlikely(count_reg != APIC_TMICT)) {
+ deadline = tmict_to_ns(apic,
+ kvm_lapic_get_reg(apic, count_reg));
+ if (unlikely(deadline <= 0)) {
+ if (apic_lvtt_period(apic))
+ deadline = apic->lapic_timer.period;
+ else
+ deadline = 0;
+ }
+ else if (unlikely(deadline > apic->lapic_timer.period)) {
+ pr_info_ratelimited(
+ "vcpu %i: requested lapic timer restore with "
+ "starting count register %#x=%u (%lld ns) > initial count (%lld ns). "
+ "Using initial count to start timer.\n",
+ apic->vcpu->vcpu_id,
+ count_reg,
+ kvm_lapic_get_reg(apic, count_reg),
+ deadline, apic->lapic_timer.period);
+ kvm_lapic_set_reg(apic, count_reg, 0);
+ deadline = apic->lapic_timer.period;
+ }
}
- } else if (nmi_wd_enabled)
- apic->vcpu->kvm->arch.vapics_in_nmi_mode--;
+ }
+
+ apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) +
+ nsec_to_cycles(apic->vcpu, deadline);
+ apic->lapic_timer.target_expiration = ktime_add_ns(now, deadline);
+
+ return true;
}
-static void apic_mmio_write(struct kvm_io_device *this,
- gpa_t address, int len, const void *data)
+static void advance_periodic_target_expiration(struct kvm_lapic *apic)
{
- struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
- unsigned int offset = address - apic->base_address;
- unsigned char alignment = offset & 0xf;
- u32 val;
+ struct kvm_timer *ktimer = &apic->lapic_timer;
+ ktime_t now = ktime_get();
+ u64 tscl = rdtsc();
+ ktime_t delta;
/*
- * APIC register must be aligned on 128-bits boundary.
- * 32/64/128 bits registers must be accessed thru 32 bits.
- * Refer SDM 8.4.1
+ * Use kernel time as the time source for both the hrtimer deadline and
+ * TSC-based deadline so that they stay synchronized. Computing each
+ * deadline independently will cause the two deadlines to drift apart
+ * over time as differences in the periods accumulate, e.g. due to
+ * differences in the underlying clocks or numerical approximation errors.
*/
- if (len != 4 || alignment) {
- /* Don't shout loud, $infamous_os would cause only noise. */
- apic_debug("apic write: bad size=%d %lx\n",
- len, (long)address);
+ ktimer->target_expiration = ktime_add_ns(ktimer->target_expiration,
+ ktimer->period);
+
+ /*
+ * If the new expiration is in the past, e.g. because userspace stopped
+ * running the VM for an extended duration, then force the expiration
+ * to "now" and don't try to play catch-up with the missed events. KVM
+ * will only deliver a single interrupt regardless of how many events
+ * are pending, i.e. restarting the timer with an expiration in the
+ * past will do nothing more than waste host cycles, and can even lead
+ * to a hard lockup in extreme cases.
+ */
+ if (ktime_before(ktimer->target_expiration, now))
+ ktimer->target_expiration = now;
+
+ /*
+ * Note, ensuring the expiration isn't in the past also prevents delta
+ * from going negative, which could cause the TSC deadline to become
+ * excessively large due to it being an unsigned value.
+ */
+ delta = ktime_sub(ktimer->target_expiration, now);
+ ktimer->tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) +
+ nsec_to_cycles(apic->vcpu, delta);
+}
+
+static void start_sw_period(struct kvm_lapic *apic)
+{
+ if (!apic->lapic_timer.period)
return;
+
+ if (ktime_after(ktime_get(),
+ apic->lapic_timer.target_expiration)) {
+ apic_timer_expired(apic, false);
+
+ if (apic_lvtt_oneshot(apic))
+ return;
+
+ advance_periodic_target_expiration(apic);
}
- val = *(u32 *) data;
+ hrtimer_start(&apic->lapic_timer.timer,
+ apic->lapic_timer.target_expiration,
+ HRTIMER_MODE_ABS_HARD);
+}
- /* too common printing */
- if (offset != APIC_EOI)
- apic_debug("%s: offset 0x%x with length 0x%x, and value is "
- "0x%x\n", __func__, offset, len, val);
+bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu)
+{
+ if (!lapic_in_kernel(vcpu))
+ return false;
- offset &= 0xff0;
+ return vcpu->arch.apic->lapic_timer.hv_timer_in_use;
+}
- KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler);
+static void cancel_hv_timer(struct kvm_lapic *apic)
+{
+ WARN_ON(preemptible());
+ WARN_ON(!apic->lapic_timer.hv_timer_in_use);
+ kvm_x86_call(cancel_hv_timer)(apic->vcpu);
+ apic->lapic_timer.hv_timer_in_use = false;
+}
- switch (offset) {
+static bool start_hv_timer(struct kvm_lapic *apic)
+{
+ struct kvm_timer *ktimer = &apic->lapic_timer;
+ struct kvm_vcpu *vcpu = apic->vcpu;
+ bool expired;
+
+ WARN_ON(preemptible());
+ if (!kvm_can_use_hv_timer(vcpu))
+ return false;
+
+ if (!ktimer->tscdeadline)
+ return false;
+
+ if (kvm_x86_call(set_hv_timer)(vcpu, ktimer->tscdeadline, &expired))
+ return false;
+
+ ktimer->hv_timer_in_use = true;
+ hrtimer_cancel(&ktimer->timer);
+
+ /*
+ * To simplify handling the periodic timer, leave the hv timer running
+ * even if the deadline timer has expired, i.e. rely on the resulting
+ * VM-Exit to recompute the periodic timer's target expiration.
+ */
+ if (!apic_lvtt_period(apic)) {
+ /*
+ * Cancel the hv timer if the sw timer fired while the hv timer
+ * was being programmed, or if the hv timer itself expired.
+ */
+ if (atomic_read(&ktimer->pending)) {
+ cancel_hv_timer(apic);
+ } else if (expired) {
+ apic_timer_expired(apic, false);
+ cancel_hv_timer(apic);
+ }
+ }
+
+ trace_kvm_hv_timer_state(vcpu->vcpu_id, ktimer->hv_timer_in_use);
+
+ return true;
+}
+
+static void start_sw_timer(struct kvm_lapic *apic)
+{
+ struct kvm_timer *ktimer = &apic->lapic_timer;
+
+ WARN_ON(preemptible());
+ if (apic->lapic_timer.hv_timer_in_use)
+ cancel_hv_timer(apic);
+ if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending))
+ return;
+
+ if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic))
+ start_sw_period(apic);
+ else if (apic_lvtt_tscdeadline(apic))
+ start_sw_tscdeadline(apic);
+ trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, false);
+}
+
+static void restart_apic_timer(struct kvm_lapic *apic)
+{
+ preempt_disable();
+
+ if (!apic_lvtt_period(apic) && atomic_read(&apic->lapic_timer.pending))
+ goto out;
+
+ if (!start_hv_timer(apic))
+ start_sw_timer(apic);
+out:
+ preempt_enable();
+}
+
+void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ preempt_disable();
+ /* If the preempt notifier has already run, it also called apic_timer_expired */
+ if (!apic->lapic_timer.hv_timer_in_use)
+ goto out;
+ WARN_ON(kvm_vcpu_is_blocking(vcpu));
+ apic_timer_expired(apic, false);
+ cancel_hv_timer(apic);
+
+ if (apic_lvtt_period(apic) && apic->lapic_timer.period) {
+ advance_periodic_target_expiration(apic);
+ restart_apic_timer(apic);
+ }
+out:
+ preempt_enable();
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lapic_expired_hv_timer);
+
+void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
+{
+ restart_apic_timer(vcpu->arch.apic);
+}
+
+void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ preempt_disable();
+ /* Possibly the TSC deadline timer is not enabled yet */
+ if (apic->lapic_timer.hv_timer_in_use)
+ start_sw_timer(apic);
+ preempt_enable();
+}
+
+void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ WARN_ON(!apic->lapic_timer.hv_timer_in_use);
+ restart_apic_timer(apic);
+}
+
+static void __start_apic_timer(struct kvm_lapic *apic, u32 count_reg)
+{
+ atomic_set(&apic->lapic_timer.pending, 0);
+
+ if ((apic_lvtt_period(apic) || apic_lvtt_oneshot(apic))
+ && !set_target_expiration(apic, count_reg))
+ return;
+
+ restart_apic_timer(apic);
+}
+
+static void start_apic_timer(struct kvm_lapic *apic)
+{
+ __start_apic_timer(apic, APIC_TMICT);
+}
+
+static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
+{
+ bool lvt0_in_nmi_mode = apic_lvt_nmi_mode(lvt0_val);
+
+ if (apic->lvt0_in_nmi_mode != lvt0_in_nmi_mode) {
+ apic->lvt0_in_nmi_mode = lvt0_in_nmi_mode;
+ if (lvt0_in_nmi_mode) {
+ atomic_inc(&apic->vcpu->kvm->arch.vapics_in_nmi_mode);
+ } else
+ atomic_dec(&apic->vcpu->kvm->arch.vapics_in_nmi_mode);
+ }
+}
+
+static int get_lvt_index(u32 reg)
+{
+ if (reg == APIC_LVTCMCI)
+ return LVT_CMCI;
+ if (reg < APIC_LVTT || reg > APIC_LVTERR)
+ return -1;
+ return array_index_nospec(
+ (reg - APIC_LVTT) >> 4, KVM_APIC_MAX_NR_LVT_ENTRIES);
+}
+
+static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
+{
+ int ret = 0;
+
+ trace_kvm_apic_write(reg, val);
+
+ switch (reg) {
case APIC_ID: /* Local APIC ID */
- apic_set_reg(apic, APIC_ID, val);
+ if (!apic_x2apic_mode(apic)) {
+ kvm_apic_set_xapic_id(apic, val >> 24);
+ } else {
+ ret = 1;
+ }
break;
case APIC_TASKPRI:
@@ -649,103 +2380,275 @@ static void apic_mmio_write(struct kvm_io_device *this,
break;
case APIC_LDR:
- apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK);
+ if (!apic_x2apic_mode(apic))
+ kvm_apic_set_ldr(apic, val & APIC_LDR_MASK);
+ else
+ ret = 1;
break;
case APIC_DFR:
- apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
+ if (!apic_x2apic_mode(apic))
+ kvm_apic_set_dfr(apic, val | 0x0FFFFFFF);
+ else
+ ret = 1;
break;
- case APIC_SPIV:
- apic_set_reg(apic, APIC_SPIV, val & 0x3ff);
+ case APIC_SPIV: {
+ u32 mask = 0x3ff;
+ if (kvm_lapic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI)
+ mask |= APIC_SPIV_DIRECTED_EOI;
+ apic_set_spiv(apic, val & mask);
if (!(val & APIC_SPIV_APIC_ENABLED)) {
int i;
- u32 lvt_val;
- for (i = 0; i < APIC_LVT_NUM; i++) {
- lvt_val = apic_get_reg(apic,
- APIC_LVTT + 0x10 * i);
- apic_set_reg(apic, APIC_LVTT + 0x10 * i,
- lvt_val | APIC_LVT_MASKED);
+ for (i = 0; i < apic->nr_lvt_entries; i++) {
+ kvm_lapic_set_reg(apic, APIC_LVTx(i),
+ kvm_lapic_get_reg(apic, APIC_LVTx(i)) | APIC_LVT_MASKED);
}
+ apic_update_lvtt(apic);
atomic_set(&apic->lapic_timer.pending, 0);
}
break;
-
+ }
case APIC_ICR:
+ WARN_ON_ONCE(apic_x2apic_mode(apic));
+
/* No delay here, so we always clear the pending bit */
- apic_set_reg(apic, APIC_ICR, val & ~(1 << 12));
- apic_send_ipi(apic);
+ val &= ~APIC_ICR_BUSY;
+ kvm_apic_send_ipi(apic, val, kvm_lapic_get_reg(apic, APIC_ICR2));
+ kvm_lapic_set_reg(apic, APIC_ICR, val);
break;
-
case APIC_ICR2:
- apic_set_reg(apic, APIC_ICR2, val & 0xff000000);
+ if (apic_x2apic_mode(apic))
+ ret = 1;
+ else
+ kvm_lapic_set_reg(apic, APIC_ICR2, val & 0xff000000);
break;
case APIC_LVT0:
apic_manage_nmi_watchdog(apic, val);
- case APIC_LVTT:
+ fallthrough;
case APIC_LVTTHMR:
case APIC_LVTPC:
case APIC_LVT1:
case APIC_LVTERR:
- /* TODO: Check vector */
- if (!apic_sw_enabled(apic))
+ case APIC_LVTCMCI: {
+ u32 index = get_lvt_index(reg);
+ if (!kvm_lapic_lvt_supported(apic, index)) {
+ ret = 1;
+ break;
+ }
+ if (!kvm_apic_sw_enabled(apic))
val |= APIC_LVT_MASKED;
+ val &= apic_lvt_mask[index];
+ kvm_lapic_set_reg(apic, reg, val);
+ break;
+ }
- val &= apic_lvt_mask[(offset - APIC_LVTT) >> 4];
- apic_set_reg(apic, offset, val);
-
+ case APIC_LVTT:
+ if (!kvm_apic_sw_enabled(apic))
+ val |= APIC_LVT_MASKED;
+ val &= (apic_lvt_mask[LVT_TIMER] | apic->lapic_timer.timer_mode_mask);
+ kvm_lapic_set_reg(apic, APIC_LVTT, val);
+ apic_update_lvtt(apic);
break;
case APIC_TMICT:
- hrtimer_cancel(&apic->lapic_timer.timer);
- apic_set_reg(apic, APIC_TMICT, val);
+ if (apic_lvtt_tscdeadline(apic))
+ break;
+
+ cancel_apic_timer(apic);
+ kvm_lapic_set_reg(apic, APIC_TMICT, val);
start_apic_timer(apic);
- return;
+ break;
+
+ case APIC_TDCR: {
+ uint32_t old_divisor = apic->divide_count;
- case APIC_TDCR:
- if (val & 4)
- printk(KERN_ERR "KVM_WRITE:TDCR %x\n", val);
- apic_set_reg(apic, APIC_TDCR, val);
+ kvm_lapic_set_reg(apic, APIC_TDCR, val & 0xb);
update_divide_count(apic);
+ if (apic->divide_count != old_divisor &&
+ apic->lapic_timer.period) {
+ hrtimer_cancel(&apic->lapic_timer.timer);
+ update_target_expiration(apic, old_divisor);
+ restart_apic_timer(apic);
+ }
+ break;
+ }
+ case APIC_ESR:
+ if (apic_x2apic_mode(apic) && val != 0)
+ ret = 1;
break;
+ case APIC_SELF_IPI:
+ /*
+ * Self-IPI exists only when x2APIC is enabled. Bits 7:0 hold
+ * the vector, everything else is reserved.
+ */
+ if (!apic_x2apic_mode(apic) || (val & ~APIC_VECTOR_MASK))
+ ret = 1;
+ else
+ kvm_apic_send_ipi(apic, APIC_DEST_SELF | val, 0);
+ break;
default:
- apic_debug("Local APIC Write to read-only register %x\n",
- offset);
+ ret = 1;
break;
}
+ /*
+ * Recalculate APIC maps if necessary, e.g. if the software enable bit
+ * was toggled, the APIC ID changed, etc... The maps are marked dirty
+ * on relevant changes, i.e. this is a nop for most writes.
+ */
+ kvm_recalculate_apic_map(apic->vcpu->kvm);
+
+ return ret;
}
-static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr,
- int len, int size)
+static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
+ gpa_t address, int len, const void *data)
{
- struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
- int ret = 0;
+ struct kvm_lapic *apic = to_lapic(this);
+ unsigned int offset = address - apic->base_address;
+ u32 val;
+ if (!apic_mmio_in_range(apic, address))
+ return -EOPNOTSUPP;
- if (apic_hw_enabled(apic) &&
- (addr >= apic->base_address) &&
- (addr < (apic->base_address + LAPIC_MMIO_LENGTH)))
- ret = 1;
+ if (!kvm_apic_hw_enabled(apic) || apic_x2apic_mode(apic)) {
+ if (!kvm_check_has_quirk(vcpu->kvm,
+ KVM_X86_QUIRK_LAPIC_MMIO_HOLE))
+ return -EOPNOTSUPP;
- return ret;
+ return 0;
+ }
+
+ /*
+ * APIC registers must be aligned on a 128-bit boundary.
+ * 32/64/128-bit registers must be accessed through 32-bit operations.
+ * Refer to SDM 8.4.1.
+ */
+ if (len != 4 || (offset & 0xf))
+ return 0;
+
+ val = *(u32*)data;
+
+ kvm_lapic_reg_write(apic, offset & 0xff0, val);
+
+ return 0;
+}
+
+void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu)
+{
+ kvm_lapic_reg_write(vcpu->arch.apic, APIC_EOI, 0);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lapic_set_eoi);
+
+#define X2APIC_ICR_RESERVED_BITS (GENMASK_ULL(31, 20) | GENMASK_ULL(17, 16) | BIT(13))
+
+static int __kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data, bool fast)
+{
+ if (data & X2APIC_ICR_RESERVED_BITS)
+ return 1;
+
+ /*
+ * The BUSY bit is reserved on both Intel and AMD in x2APIC mode, but
+ * only AMD requires it to be zero, Intel essentially just ignores the
+ * bit. And if IPI virtualization (Intel) or x2AVIC (AMD) is enabled,
+ * the CPU performs the reserved bits checks, i.e. the underlying CPU
+ * behavior will "win". Arbitrarily clear the BUSY bit, as there is no
+ * sane way to provide consistent behavior with respect to hardware.
+ */
+ data &= ~APIC_ICR_BUSY;
+
+ if (fast) {
+ struct kvm_lapic_irq irq;
+ int ignored;
+
+ kvm_icr_to_lapic_irq(apic, (u32)data, (u32)(data >> 32), &irq);
+
+ if (!kvm_irq_delivery_to_apic_fast(apic->vcpu->kvm, apic, &irq,
+ &ignored, NULL))
+ return -EWOULDBLOCK;
+
+ trace_kvm_apic_ipi((u32)data, irq.dest_id);
+ } else {
+ kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32));
+ }
+ if (kvm_x86_ops.x2apic_icr_is_split) {
+ kvm_lapic_set_reg(apic, APIC_ICR, data);
+ kvm_lapic_set_reg(apic, APIC_ICR2, data >> 32);
+ } else {
+ kvm_lapic_set_reg64(apic, APIC_ICR, data);
+ }
+ trace_kvm_apic_write(APIC_ICR, data);
+ return 0;
+}
+
+static int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data)
+{
+ return __kvm_x2apic_icr_write(apic, data, false);
+}
+
+int kvm_x2apic_icr_write_fast(struct kvm_lapic *apic, u64 data)
+{
+ return __kvm_x2apic_icr_write(apic, data, true);
+}
+
+static u64 kvm_x2apic_icr_read(struct kvm_lapic *apic)
+{
+ if (kvm_x86_ops.x2apic_icr_is_split)
+ return (u64)kvm_lapic_get_reg(apic, APIC_ICR) |
+ (u64)kvm_lapic_get_reg(apic, APIC_ICR2) << 32;
+
+ return kvm_lapic_get_reg64(apic, APIC_ICR);
+}
+
+/* emulate APIC access in a trap manner */
+void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ /*
+ * ICR is a single 64-bit register when x2APIC is enabled, all others
+ * registers hold 32-bit values. For legacy xAPIC, ICR writes need to
+ * go down the common path to get the upper half from ICR2.
+ *
+ * Note, using the write helpers may incur an unnecessary write to the
+ * virtual APIC state, but KVM needs to conditionally modify the value
+ * in certain cases, e.g. to clear the ICR busy bit. The cost of extra
+ * conditional branches is likely a wash relative to the cost of the
+ * maybe-unnecessary write, and both are in the noise anyway.
+ */
+ if (apic_x2apic_mode(apic) && offset == APIC_ICR)
+ WARN_ON_ONCE(kvm_x2apic_icr_write(apic, kvm_x2apic_icr_read(apic)));
+ else
+ kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset));
}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_write_nodecode);
void kvm_free_lapic(struct kvm_vcpu *vcpu)
{
- if (!vcpu->arch.apic)
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (!vcpu->arch.apic) {
+ static_branch_dec(&kvm_has_noapic_vcpu);
return;
+ }
+
+ hrtimer_cancel(&apic->lapic_timer.timer);
- hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer);
+ if (!(vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE))
+ static_branch_slow_dec_deferred(&apic_hw_disabled);
- if (vcpu->arch.apic->regs_page)
- __free_page(vcpu->arch.apic->regs_page);
+ if (!apic->sw_enabled)
+ static_branch_slow_dec_deferred(&apic_sw_disabled);
- kfree(vcpu->arch.apic);
+ if (apic->regs)
+ free_page((unsigned long)apic->regs);
+
+ kfree(apic);
}
/*
@@ -753,120 +2656,276 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
* LAPIC interface
*----------------------------------------------------------------------
*/
+u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
-void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
+ if (!kvm_apic_present(vcpu) || !apic_lvtt_tscdeadline(apic))
+ return 0;
+
+ return apic->lapic_timer.tscdeadline;
+}
+
+void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!apic)
+ if (!kvm_apic_present(vcpu) || !apic_lvtt_tscdeadline(apic))
return;
- apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
- | (apic_get_reg(apic, APIC_TASKPRI) & 4));
+
+ hrtimer_cancel(&apic->lapic_timer.timer);
+ apic->lapic_timer.tscdeadline = data;
+ start_apic_timer(apic);
+}
+
+void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
+{
+ apic_set_tpr(vcpu->arch.apic, (cr8 & 0x0f) << 4);
}
-EXPORT_SYMBOL_GPL(kvm_lapic_set_tpr);
u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
{
- struct kvm_lapic *apic = vcpu->arch.apic;
u64 tpr;
- if (!apic)
- return 0;
- tpr = (u64) apic_get_reg(apic, APIC_TASKPRI);
+ tpr = (u64) kvm_lapic_get_reg(vcpu->arch.apic, APIC_TASKPRI);
return (tpr & 0xf0) >> 4;
}
-EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8);
-void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
+static void __kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value)
{
+ u64 old_value = vcpu->arch.apic_base;
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!apic) {
- value |= MSR_IA32_APICBASE_BSP;
- vcpu->arch.apic_base = value;
+ vcpu->arch.apic_base = value;
+
+ if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE)
+ vcpu->arch.cpuid_dynamic_bits_dirty = true;
+
+ if (!apic)
return;
+
+ /* update jump label if enable bit changes */
+ if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) {
+ if (value & MSR_IA32_APICBASE_ENABLE) {
+ kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
+ static_branch_slow_dec_deferred(&apic_hw_disabled);
+ /* Check if there are APF page ready requests pending */
+ kvm_make_request(KVM_REQ_APF_READY, vcpu);
+ } else {
+ static_branch_inc(&apic_hw_disabled.key);
+ atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
+ }
+ }
+
+ if ((old_value ^ value) & X2APIC_ENABLE) {
+ if (value & X2APIC_ENABLE)
+ kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
+ else if (value & MSR_IA32_APICBASE_ENABLE)
+ kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
+ }
+
+ if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) {
+ kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+ kvm_x86_call(set_virtual_apic_mode)(vcpu);
}
- if (apic->vcpu->vcpu_id)
- value &= ~MSR_IA32_APICBASE_BSP;
- vcpu->arch.apic_base = value;
apic->base_address = apic->vcpu->arch.apic_base &
MSR_IA32_APICBASE_BASE;
- /* with FSB delivery interrupt, we can restart APIC functionality */
- apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is "
- "0x%lx.\n", apic->vcpu->arch.apic_base, apic->base_address);
+ if ((value & MSR_IA32_APICBASE_ENABLE) &&
+ apic->base_address != APIC_DEFAULT_PHYS_BASE) {
+ kvm_set_apicv_inhibit(apic->vcpu->kvm,
+ APICV_INHIBIT_REASON_APIC_BASE_MODIFIED);
+ }
+}
+
+int kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value, bool host_initiated)
+{
+ enum lapic_mode old_mode = kvm_get_apic_mode(vcpu);
+ enum lapic_mode new_mode = kvm_apic_mode(value);
+
+ if (vcpu->arch.apic_base == value)
+ return 0;
+
+ u64 reserved_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu) | 0x2ff |
+ (guest_cpu_cap_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE);
+ if ((value & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID)
+ return 1;
+ if (!host_initiated) {
+ if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC)
+ return 1;
+ if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC)
+ return 1;
+ }
+
+ __kvm_apic_set_base(vcpu, value);
+ kvm_recalculate_apic_map(vcpu->kvm);
+ return 0;
}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_set_base);
-u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu)
+void kvm_apic_update_apicv(struct kvm_vcpu *vcpu)
{
- return vcpu->arch.apic_base;
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ /*
+ * When APICv is enabled, KVM must always search the IRR for a pending
+ * IRQ, as other vCPUs and devices can set IRR bits even if the vCPU
+ * isn't running. If APICv is disabled, KVM _should_ search the IRR
+ * for a pending IRQ. But KVM currently doesn't ensure *all* hardware,
+ * e.g. CPUs and IOMMUs, has seen the change in state, i.e. searching
+ * the IRR at this time could race with IRQ delivery from hardware that
+ * still sees APICv as being enabled.
+ *
+ * FIXME: Ensure other vCPUs and devices observe the change in APICv
+ * state prior to updating KVM's metadata caches, so that KVM
+ * can safely search the IRR and set irr_pending accordingly.
+ */
+ apic->irr_pending = true;
+
+ if (apic->apicv_active)
+ apic->isr_count = 1;
+ else
+ apic->isr_count = count_vectors(apic->regs + APIC_ISR);
+
+ apic->highest_isr_cache = -1;
}
-EXPORT_SYMBOL_GPL(kvm_lapic_get_base);
-void kvm_lapic_reset(struct kvm_vcpu *vcpu)
+int kvm_alloc_apic_access_page(struct kvm *kvm)
{
- struct kvm_lapic *apic;
+ void __user *hva;
+
+ guard(mutex)(&kvm->slots_lock);
+
+ if (kvm->arch.apic_access_memslot_enabled ||
+ kvm->arch.apic_access_memslot_inhibited)
+ return 0;
+
+ hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
+ APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
+ if (IS_ERR(hva))
+ return PTR_ERR(hva);
+
+ kvm->arch.apic_access_memslot_enabled = true;
+
+ return 0;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_alloc_apic_access_page);
+
+void kvm_inhibit_apic_access_page(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ if (!kvm->arch.apic_access_memslot_enabled)
+ return;
+
+ kvm_vcpu_srcu_read_unlock(vcpu);
+
+ mutex_lock(&kvm->slots_lock);
+
+ if (kvm->arch.apic_access_memslot_enabled) {
+ __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 0, 0);
+ /*
+ * Clear "enabled" after the memslot is deleted so that a
+ * different vCPU doesn't get a false negative when checking
+ * the flag out of slots_lock. No additional memory barrier is
+		 * needed as modifying memslots requires waiting for other vCPUs to
+ * drop SRCU (see above), and false positives are ok as the
+ * flag is rechecked after acquiring slots_lock.
+ */
+ kvm->arch.apic_access_memslot_enabled = false;
+
+ /*
+ * Mark the memslot as inhibited to prevent reallocating the
+ * memslot during vCPU creation, e.g. if a vCPU is hotplugged.
+ */
+ kvm->arch.apic_access_memslot_inhibited = true;
+ }
+
+ mutex_unlock(&kvm->slots_lock);
+
+ kvm_vcpu_srcu_read_lock(vcpu);
+}
+
+void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u64 msr_val;
int i;
- apic_debug("%s\n", __func__);
+ kvm_x86_call(apicv_pre_state_restore)(vcpu);
- ASSERT(vcpu);
- apic = vcpu->arch.apic;
- ASSERT(apic != NULL);
+ if (!init_event) {
+ msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE;
+ if (kvm_vcpu_is_reset_bsp(vcpu))
+ msr_val |= MSR_IA32_APICBASE_BSP;
+
+ /*
+	 * Use the inner helper to avoid an extra recalculation of the
+ * optimized APIC map if some other task has dirtied the map.
+ * The recalculation needed for this vCPU will be done after
+ * all APIC state has been initialized (see below).
+ */
+ __kvm_apic_set_base(vcpu, msr_val);
+ }
+
+ if (!apic)
+ return;
/* Stop the timer in case it's a reset to an active apic */
hrtimer_cancel(&apic->lapic_timer.timer);
- apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24);
- apic_set_reg(apic, APIC_LVR, APIC_VERSION);
-
- for (i = 0; i < APIC_LVT_NUM; i++)
- apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
- apic_set_reg(apic, APIC_LVT0,
- SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
-
- apic_set_reg(apic, APIC_DFR, 0xffffffffU);
- apic_set_reg(apic, APIC_SPIV, 0xff);
- apic_set_reg(apic, APIC_TASKPRI, 0);
- apic_set_reg(apic, APIC_LDR, 0);
- apic_set_reg(apic, APIC_ESR, 0);
- apic_set_reg(apic, APIC_ICR, 0);
- apic_set_reg(apic, APIC_ICR2, 0);
- apic_set_reg(apic, APIC_TDCR, 0);
- apic_set_reg(apic, APIC_TMICT, 0);
+ /* The xAPIC ID is set at RESET even if the APIC was already enabled. */
+ if (!init_event)
+ kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
+ kvm_apic_set_version(apic->vcpu);
+
+ for (i = 0; i < apic->nr_lvt_entries; i++)
+ kvm_lapic_set_reg(apic, APIC_LVTx(i), APIC_LVT_MASKED);
+ apic_update_lvtt(apic);
+ if (kvm_vcpu_is_reset_bsp(vcpu) &&
+ kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED))
+ kvm_lapic_set_reg(apic, APIC_LVT0,
+ SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
+ apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
+
+ kvm_apic_set_dfr(apic, 0xffffffffU);
+ apic_set_spiv(apic, 0xff);
+ kvm_lapic_set_reg(apic, APIC_TASKPRI, 0);
+ if (!apic_x2apic_mode(apic))
+ kvm_apic_set_ldr(apic, 0);
+ kvm_lapic_set_reg(apic, APIC_ESR, 0);
+ if (!apic_x2apic_mode(apic)) {
+ kvm_lapic_set_reg(apic, APIC_ICR, 0);
+ kvm_lapic_set_reg(apic, APIC_ICR2, 0);
+ } else {
+ kvm_lapic_set_reg64(apic, APIC_ICR, 0);
+ }
+ kvm_lapic_set_reg(apic, APIC_TDCR, 0);
+ kvm_lapic_set_reg(apic, APIC_TMICT, 0);
for (i = 0; i < 8; i++) {
- apic_set_reg(apic, APIC_IRR + 0x10 * i, 0);
- apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
- apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
+ kvm_lapic_set_reg(apic, APIC_IRR + 0x10 * i, 0);
+ kvm_lapic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
+ kvm_lapic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
}
+ kvm_apic_update_apicv(vcpu);
update_divide_count(apic);
atomic_set(&apic->lapic_timer.pending, 0);
- if (vcpu->vcpu_id == 0)
- vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
+
+ vcpu->arch.pv_eoi.msr_val = 0;
apic_update_ppr(apic);
+ if (apic->apicv_active) {
+ kvm_x86_call(apicv_post_state_restore)(vcpu);
+ kvm_x86_call(hwapic_isr_update)(vcpu, -1);
+ }
vcpu->arch.apic_arb_prio = 0;
+ vcpu->arch.apic_attention = 0;
- apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
- "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__,
- vcpu, kvm_apic_id(apic),
- vcpu->arch.apic_base, apic->base_address);
+ kvm_recalculate_apic_map(vcpu->kvm);
}
-EXPORT_SYMBOL_GPL(kvm_lapic_reset);
-
-bool kvm_apic_present(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.apic && apic_hw_enabled(vcpu->arch.apic);
-}
-
-int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
-{
- return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic);
-}
-EXPORT_SYMBOL_GPL(kvm_lapic_enabled);
/*
*----------------------------------------------------------------------
@@ -874,33 +2933,37 @@ EXPORT_SYMBOL_GPL(kvm_lapic_enabled);
*----------------------------------------------------------------------
*/
-static bool lapic_is_periodic(struct kvm_timer *ktimer)
+static bool lapic_is_periodic(struct kvm_lapic *apic)
{
- struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic,
- lapic_timer);
return apic_lvtt_period(apic);
}
int apic_has_pending_timer(struct kvm_vcpu *vcpu)
{
- struct kvm_lapic *lapic = vcpu->arch.apic;
+ struct kvm_lapic *apic = vcpu->arch.apic;
- if (lapic && apic_enabled(lapic) && apic_lvt_enabled(lapic, APIC_LVTT))
- return atomic_read(&lapic->lapic_timer.pending);
+ if (apic_enabled(apic) && apic_lvt_enabled(apic, APIC_LVTT))
+ return atomic_read(&apic->lapic_timer.pending);
return 0;
}
-static int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
+int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
{
- u32 reg = apic_get_reg(apic, lvt_type);
+ u32 reg = kvm_lapic_get_reg(apic, lvt_type);
int vector, mode, trig_mode;
+ int r;
- if (apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) {
+ if (kvm_apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) {
vector = reg & APIC_VECTOR_MASK;
mode = reg & APIC_MODE_MASK;
trig_mode = reg & APIC_LVT_LEVEL_TRIGGER;
- return __apic_accept_irq(apic, mode, vector, 1, trig_mode);
+
+ r = __apic_accept_irq(apic, mode, vector, 1, trig_mode, NULL);
+ if (r && lvt_type == APIC_LVTPC &&
+ guest_cpuid_is_intel_compatible(apic->vcpu))
+ kvm_lapic_set_reg(apic, APIC_LVTPC, reg | APIC_LVT_MASKED);
+ return r;
}
return 0;
}
@@ -913,165 +2976,376 @@ void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
kvm_apic_local_deliver(apic, APIC_LVT0);
}
-static struct kvm_timer_ops lapic_timer_ops = {
- .is_periodic = lapic_is_periodic,
+static const struct kvm_io_device_ops apic_mmio_ops = {
+ .read = apic_mmio_read,
+ .write = apic_mmio_write,
};
+static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
+{
+ struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
+ struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, lapic_timer);
+
+ apic_timer_expired(apic, true);
+
+ if (lapic_is_periodic(apic) && !WARN_ON_ONCE(!apic->lapic_timer.period)) {
+ advance_periodic_target_expiration(apic);
+ hrtimer_set_expires(&ktimer->timer, ktimer->target_expiration);
+ return HRTIMER_RESTART;
+ } else
+ return HRTIMER_NORESTART;
+}
+
int kvm_create_lapic(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic;
ASSERT(vcpu != NULL);
- apic_debug("apic_init %d\n", vcpu->vcpu_id);
- apic = kzalloc(sizeof(*apic), GFP_KERNEL);
+ if (!irqchip_in_kernel(vcpu->kvm)) {
+ static_branch_inc(&kvm_has_noapic_vcpu);
+ return 0;
+ }
+
+ apic = kzalloc(sizeof(*apic), GFP_KERNEL_ACCOUNT);
if (!apic)
goto nomem;
vcpu->arch.apic = apic;
- apic->regs_page = alloc_page(GFP_KERNEL);
- if (apic->regs_page == NULL) {
+ if (kvm_x86_ops.alloc_apic_backing_page)
+ apic->regs = kvm_x86_call(alloc_apic_backing_page)(vcpu);
+ else
+ apic->regs = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (!apic->regs) {
printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
vcpu->vcpu_id);
goto nomem_free_apic;
}
- apic->regs = page_address(apic->regs_page);
- memset(apic->regs, 0, PAGE_SIZE);
apic->vcpu = vcpu;
- hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
- HRTIMER_MODE_ABS);
- apic->lapic_timer.timer.function = kvm_timer_fn;
- apic->lapic_timer.t_ops = &lapic_timer_ops;
- apic->lapic_timer.kvm = vcpu->kvm;
- apic->lapic_timer.vcpu_id = vcpu->vcpu_id;
+ apic->nr_lvt_entries = kvm_apic_calc_nr_lvt_entries(vcpu);
- apic->base_address = APIC_DEFAULT_PHYS_BASE;
- vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
+ hrtimer_setup(&apic->lapic_timer.timer, apic_timer_fn, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS_HARD);
+ if (lapic_timer_advance)
+ apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT;
- kvm_lapic_reset(vcpu);
- apic->dev.read = apic_mmio_read;
- apic->dev.write = apic_mmio_write;
- apic->dev.in_range = apic_mmio_range;
- apic->dev.private = apic;
+ /*
+ * Stuff the APIC ENABLE bit in lieu of temporarily incrementing
+ * apic_hw_disabled; the full RESET value is set by kvm_lapic_reset().
+ */
+ vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE;
+ static_branch_inc(&apic_sw_disabled.key); /* sw disabled at reset */
+ kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
+
+ /*
+ * Defer evaluating inhibits until the vCPU is first run, as this vCPU
+ * will not get notified of any changes until this vCPU is visible to
+ * other vCPUs (marked online and added to the set of vCPUs).
+ *
+	 * Opportunistically mark APICv active as VMX in particular is highly
+	 * unlikely to have inhibits. Ignore the current per-VM APICv state so
+	 * that vCPU creation is guaranteed to run with a deterministic value;
+ * the request will ensure the vCPU gets the correct state before VM-Entry.
+ */
+ if (enable_apicv) {
+ apic->apicv_active = true;
+ kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+ }
return 0;
nomem_free_apic:
kfree(apic);
+ vcpu->arch.apic = NULL;
nomem:
return -ENOMEM;
}
-EXPORT_SYMBOL_GPL(kvm_create_lapic);
int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- int highest_irr;
+ u32 ppr;
- if (!apic || !apic_enabled(apic))
+ if (!kvm_apic_present(vcpu))
return -1;
- apic_update_ppr(apic);
- highest_irr = apic_find_highest_irr(apic);
- if ((highest_irr == -1) ||
- ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI)))
+ if (apic->guest_apic_protected)
return -1;
- return highest_irr;
+
+ __apic_update_ppr(apic, &ppr);
+ return apic_has_interrupt_for_ppr(apic, ppr);
}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_has_interrupt);
int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
{
- u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
- int r = 0;
+ u32 lvt0 = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVT0);
- if (vcpu->vcpu_id == 0) {
- if (!apic_hw_enabled(vcpu->arch.apic))
- r = 1;
- if ((lvt0 & APIC_LVT_MASKED) == 0 &&
- GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
- r = 1;
- }
- return r;
+ if (!kvm_apic_hw_enabled(vcpu->arch.apic))
+ return 1;
+ if ((lvt0 & APIC_LVT_MASKED) == 0 &&
+ GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
+ return 1;
+ return 0;
}
void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (apic && atomic_read(&apic->lapic_timer.pending) > 0) {
- if (kvm_apic_local_deliver(apic, APIC_LVTT))
- atomic_dec(&apic->lapic_timer.pending);
+ if (atomic_read(&apic->lapic_timer.pending) > 0) {
+ kvm_apic_inject_pending_timer_irqs(apic);
+ atomic_set(&apic->lapic_timer.pending, 0);
}
}
-int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
+void kvm_apic_ack_interrupt(struct kvm_vcpu *vcpu, int vector)
{
- int vector = kvm_apic_has_interrupt(vcpu);
struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 ppr;
- if (vector == -1)
- return -1;
+ if (WARN_ON_ONCE(vector < 0 || !apic))
+ return;
+
+ /*
+ * We get here even with APIC virtualization enabled, if doing
+ * nested virtualization and L1 runs with the "acknowledge interrupt
+ * on exit" mode. Then we cannot inject the interrupt via RVI,
+ * because the process would deliver it through the IDT.
+ */
- apic_set_vector(vector, apic->regs + APIC_ISR);
- apic_update_ppr(apic);
apic_clear_irr(vector, apic);
- return vector;
+ if (kvm_hv_synic_auto_eoi_set(vcpu, vector)) {
+ /*
+ * For auto-EOI interrupts, there might be another pending
+ * interrupt above PPR, so check whether to raise another
+ * KVM_REQ_EVENT.
+ */
+ apic_update_ppr(apic);
+ } else {
+ /*
+ * For normal interrupts, PPR has been raised and there cannot
+ * be a higher-priority pending interrupt---except if there was
+ * a concurrent interrupt injection, but that would have
+ * triggered KVM_REQ_EVENT already.
+ */
+ apic_set_isr(vector, apic);
+ __apic_update_ppr(apic, &ppr);
+ }
+
}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_ack_interrupt);
-void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
+static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
+ struct kvm_lapic_state *s, bool set)
+{
+ if (apic_x2apic_mode(vcpu->arch.apic)) {
+ u32 x2apic_id = kvm_x2apic_id(vcpu->arch.apic);
+ u32 *id = (u32 *)(s->regs + APIC_ID);
+ u32 *ldr = (u32 *)(s->regs + APIC_LDR);
+ u64 icr;
+
+ if (vcpu->kvm->arch.x2apic_format) {
+ if (*id != x2apic_id)
+ return -EINVAL;
+ } else {
+ /*
+ * Ignore the userspace value when setting APIC state.
+ * KVM's model is that the x2APIC ID is readonly, e.g.
+ * KVM only supports delivering interrupts to KVM's
+ * version of the x2APIC ID. However, for backwards
+ * compatibility, don't reject attempts to set a
+ * mismatched ID for userspace that hasn't opted into
+ * x2apic_format.
+ */
+ if (set)
+ *id = x2apic_id;
+ else
+ *id = x2apic_id << 24;
+ }
+
+ /*
+ * In x2APIC mode, the LDR is fixed and based on the id. And
+ * if the ICR is _not_ split, ICR is internally a single 64-bit
+ * register, but needs to be split to ICR+ICR2 in userspace for
+ * backwards compatibility.
+ */
+ if (set)
+ *ldr = kvm_apic_calc_x2apic_ldr(x2apic_id);
+
+ if (!kvm_x86_ops.x2apic_icr_is_split) {
+ if (set) {
+ icr = apic_get_reg(s->regs, APIC_ICR) |
+ (u64)apic_get_reg(s->regs, APIC_ICR2) << 32;
+ apic_set_reg64(s->regs, APIC_ICR, icr);
+ } else {
+ icr = apic_get_reg64(s->regs, APIC_ICR);
+ apic_set_reg(s->regs, APIC_ICR2, icr >> 32);
+ }
+ }
+ }
+
+ return 0;
+}
+
+int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
+{
+ memcpy(s->regs, vcpu->arch.apic->regs, sizeof(*s));
+
+ /*
+	 * Get the calculated current count for any remaining timer period and
+	 * store it in the returned register set.
+ */
+ apic_set_reg(s->regs, APIC_TMCCT, __apic_read(vcpu->arch.apic, APIC_TMCCT));
+
+ return kvm_apic_state_fixup(vcpu, s, false);
+}
+
+int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
{
struct kvm_lapic *apic = vcpu->arch.apic;
+ int r;
+
+ kvm_x86_call(apicv_pre_state_restore)(vcpu);
+
+ /* set SPIV separately to get count of SW disabled APICs right */
+ apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV)));
+
+ r = kvm_apic_state_fixup(vcpu, s, true);
+ if (r) {
+ kvm_recalculate_apic_map(vcpu->kvm);
+ return r;
+ }
+ memcpy(vcpu->arch.apic->regs, s->regs, sizeof(*s));
+
+ atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
+ kvm_recalculate_apic_map(vcpu->kvm);
+ kvm_apic_set_version(vcpu);
- apic->base_address = vcpu->arch.apic_base &
- MSR_IA32_APICBASE_BASE;
- apic_set_reg(apic, APIC_LVR, APIC_VERSION);
apic_update_ppr(apic);
- hrtimer_cancel(&apic->lapic_timer.timer);
+ cancel_apic_timer(apic);
+ apic->lapic_timer.expired_tscdeadline = 0;
+ apic_update_lvtt(apic);
+ apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
update_divide_count(apic);
- start_apic_timer(apic);
+ __start_apic_timer(apic, APIC_TMCCT);
+ kvm_lapic_set_reg(apic, APIC_TMCCT, 0);
+ kvm_apic_update_apicv(vcpu);
+ if (apic->apicv_active) {
+ kvm_x86_call(apicv_post_state_restore)(vcpu);
+ kvm_x86_call(hwapic_isr_update)(vcpu, apic_find_highest_isr(apic));
+ }
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+#ifdef CONFIG_KVM_IOAPIC
+ if (ioapic_in_kernel(vcpu->kvm))
+ kvm_rtc_eoi_tracking_restore_one(vcpu);
+#endif
+
+ vcpu->arch.apic_arb_prio = 0;
+
+ return 0;
}
void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
{
- struct kvm_lapic *apic = vcpu->arch.apic;
struct hrtimer *timer;
- if (!apic)
+ if (!lapic_in_kernel(vcpu) ||
+ kvm_can_post_timer_interrupt(vcpu))
return;
- timer = &apic->lapic_timer.timer;
+ timer = &vcpu->arch.apic->lapic_timer.timer;
if (hrtimer_cancel(timer))
- hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_HARD);
+}
+
+/*
+ * apic_sync_pv_eoi_from_guest - called on vmexit or cancel interrupt
+ *
+ * Detect whether guest triggered PV EOI since the
+ * last entry. If yes, set EOI on guest's behalf.
+ * Clear PV EOI in guest memory in any case.
+ */
+static void apic_sync_pv_eoi_from_guest(struct kvm_vcpu *vcpu,
+ struct kvm_lapic *apic)
+{
+ int vector;
+ /*
+ * PV EOI state is derived from KVM_APIC_PV_EOI_PENDING in host
+ * and KVM_PV_EOI_ENABLED in guest memory as follows:
+ *
+ * KVM_APIC_PV_EOI_PENDING is unset:
+ * -> host disabled PV EOI.
+ * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is set:
+ * -> host enabled PV EOI, guest did not execute EOI yet.
+ * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is unset:
+ * -> host enabled PV EOI, guest executed EOI.
+ */
+ BUG_ON(!pv_eoi_enabled(vcpu));
+
+ if (pv_eoi_test_and_clr_pending(vcpu))
+ return;
+ vector = apic_set_eoi(apic);
+ trace_kvm_pv_eoi(apic, vector);
}
void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
{
u32 data;
- void *vapic;
- if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
+ if (test_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention))
+ apic_sync_pv_eoi_from_guest(vcpu, vcpu->arch.apic);
+
+ if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
return;
- vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0);
- data = *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr));
- kunmap_atomic(vapic, KM_USER0);
+ if (kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data,
+ sizeof(u32)))
+ return;
apic_set_tpr(vcpu->arch.apic, data & 0xff);
}
+/*
+ * apic_sync_pv_eoi_to_guest - called before vmentry
+ *
+ * Detect whether it's safe to enable PV EOI and
+ * if yes do so.
+ */
+static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu,
+ struct kvm_lapic *apic)
+{
+ if (!pv_eoi_enabled(vcpu) ||
+ /* IRR set or many bits in ISR: could be nested. */
+ apic->irr_pending ||
+ /* Cache not set: could be safe but we don't bother. */
+ apic->highest_isr_cache == -1 ||
+ /* Need EOI to update ioapic. */
+ kvm_ioapic_handles_vector(apic, apic->highest_isr_cache)) {
+ /*
+ * PV EOI was disabled by apic_sync_pv_eoi_from_guest
+ * so we need not do anything here.
+ */
+ return;
+ }
+
+ pv_eoi_set_pending(apic->vcpu);
+}
+
void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
{
u32 data, tpr;
int max_irr, max_isr;
- struct kvm_lapic *apic;
- void *vapic;
+ struct kvm_lapic *apic = vcpu->arch.apic;
- if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
+ apic_sync_pv_eoi_to_guest(vcpu, apic);
+
+ if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
return;
- apic = vcpu->arch.apic;
- tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff;
+ tpr = kvm_lapic_get_reg(apic, APIC_TASKPRI) & 0xff;
max_irr = apic_find_highest_irr(apic);
if (max_irr < 0)
max_irr = 0;
@@ -1080,15 +3354,179 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
max_isr = 0;
data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24);
- vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0);
- *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)) = data;
- kunmap_atomic(vapic, KM_USER0);
+ kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data,
+ sizeof(u32));
}
-void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
+int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
{
- if (!irqchip_in_kernel(vcpu->kvm))
- return;
+ if (vapic_addr) {
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
+ &vcpu->arch.apic->vapic_cache,
+ vapic_addr, sizeof(u32)))
+ return -EINVAL;
+ __set_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention);
+ } else {
+ __clear_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention);
+ }
vcpu->arch.apic->vapic_addr = vapic_addr;
+ return 0;
+}
+
+static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data)
+{
+ u32 low;
+
+ if (reg == APIC_ICR) {
+ *data = kvm_x2apic_icr_read(apic);
+ return 0;
+ }
+
+ if (kvm_lapic_reg_read(apic, reg, 4, &low))
+ return 1;
+
+ *data = low;
+
+ return 0;
+}
+
+static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data)
+{
+ /*
+ * ICR is a 64-bit register in x2APIC mode (and Hyper-V PV vAPIC) and
+	 * can be written as such; all other registers remain accessible only
+ * through 32-bit reads/writes.
+ */
+ if (reg == APIC_ICR)
+ return kvm_x2apic_icr_write(apic, data);
+
+ /* Bits 63:32 are reserved in all other registers. */
+ if (data >> 32)
+ return 1;
+
+ return kvm_lapic_reg_write(apic, reg, (u32)data);
+}
+
+int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 reg = (msr - APIC_BASE_MSR) << 4;
+
+ if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
+ return 1;
+
+ return kvm_lapic_msr_write(apic, reg, data);
+}
+
+int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 reg = (msr - APIC_BASE_MSR) << 4;
+
+ if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
+ return 1;
+
+ return kvm_lapic_msr_read(apic, reg, data);
+}
+
+int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data)
+{
+ if (!lapic_in_kernel(vcpu))
+ return 1;
+
+ return kvm_lapic_msr_write(vcpu->arch.apic, reg, data);
+}
+
+int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
+{
+ if (!lapic_in_kernel(vcpu))
+ return 1;
+
+ return kvm_lapic_msr_read(vcpu->arch.apic, reg, data);
+}
+
+int kvm_lapic_set_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len)
+{
+ u64 addr = data & ~KVM_MSR_ENABLED;
+ struct gfn_to_hva_cache *ghc = &vcpu->arch.pv_eoi.data;
+ unsigned long new_len;
+ int ret;
+
+ if (!IS_ALIGNED(addr, 4))
+ return 1;
+
+ if (data & KVM_MSR_ENABLED) {
+ if (addr == ghc->gpa && len <= ghc->len)
+ new_len = ghc->len;
+ else
+ new_len = len;
+
+ ret = kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, addr, new_len);
+ if (ret)
+ return ret;
+ }
+
+ vcpu->arch.pv_eoi.msr_val = data;
+
+ return 0;
+}
+
+int kvm_apic_accept_events(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u8 sipi_vector;
+ int r;
+
+ if (!kvm_apic_has_pending_init_or_sipi(vcpu))
+ return 0;
+
+ if (is_guest_mode(vcpu)) {
+ r = kvm_check_nested_events(vcpu);
+ if (r < 0)
+ return r == -EBUSY ? 0 : r;
+ /*
+ * Continue processing INIT/SIPI even if a nested VM-Exit
+ * occurred, e.g. pending SIPIs should be dropped if INIT+SIPI
+ * are blocked as a result of transitioning to VMX root mode.
+ */
+ }
+
+ /*
+ * INITs are blocked while CPU is in specific states (SMM, VMX root
+ * mode, SVM with GIF=0), while SIPIs are dropped if the CPU isn't in
+ * wait-for-SIPI (WFS).
+ */
+ if (!kvm_apic_init_sipi_allowed(vcpu)) {
+ WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED);
+ clear_bit(KVM_APIC_SIPI, &apic->pending_events);
+ return 0;
+ }
+
+ if (test_and_clear_bit(KVM_APIC_INIT, &apic->pending_events)) {
+ kvm_vcpu_reset(vcpu, true);
+ if (kvm_vcpu_is_bsp(apic->vcpu))
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
+ else
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED);
+ }
+ if (test_and_clear_bit(KVM_APIC_SIPI, &apic->pending_events)) {
+ if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
+ /* evaluate pending_events before reading the vector */
+ smp_rmb();
+ sipi_vector = apic->sipi_vector;
+ kvm_x86_call(vcpu_deliver_sipi_vector)(vcpu,
+ sipi_vector);
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
+ }
+ }
+ return 0;
+}
+
+void kvm_lapic_exit(void)
+{
+ static_key_deferred_flush(&apic_hw_disabled);
+ WARN_ON(static_branch_unlikely(&apic_hw_disabled.key));
+ static_key_deferred_flush(&apic_sw_disabled);
+ WARN_ON(static_branch_unlikely(&apic_sw_disabled.key));
}
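
The kvm_x2apic_msr_write()/kvm_x2apic_msr_read() helpers above turn an x2APIC MSR index into a 16-byte-aligned APIC register offset with reg = (msr - APIC_BASE_MSR) << 4, and the X2APIC_MSR() macro added to lapic.h below performs the inverse. A minimal standalone sketch of that round trip; the APIC_BASE_MSR and APIC_TASKPRI values are assumed here (the real definitions live in asm/apicdef.h):

#include <stdio.h>

#define APIC_BASE_MSR	0x800	/* assumed: first MSR of the x2APIC range */
#define APIC_TASKPRI	0x80	/* assumed: TPR offset in the register page */
#define X2APIC_MSR(r)	(APIC_BASE_MSR + ((r) >> 4))

int main(void)
{
	unsigned int msr = X2APIC_MSR(APIC_TASKPRI);	/* 0x808, the x2APIC TPR MSR */
	unsigned int reg = (msr - APIC_BASE_MSR) << 4;	/* back to offset 0x80 */

	printf("msr=%#x reg=%#x\n", msr, reg);
	return 0;
}
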
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index a587f8349c46..282b9b7da98c 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -1,47 +1,262 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __KVM_X86_LAPIC_H
#define __KVM_X86_LAPIC_H
-#include "iodev.h"
-#include "kvm_timer.h"
+#include <kvm/iodev.h>
+
+#include <asm/apic.h>
#include <linux/kvm_host.h>
+#include "hyperv.h"
+#include "smm.h"
+
+#define KVM_APIC_INIT 0
+#define KVM_APIC_SIPI 1
+
+#define APIC_SHORT_MASK 0xc0000
+#define APIC_DEST_NOSHORT 0x0
+#define APIC_DEST_MASK 0x800
+
+#define APIC_BUS_CYCLE_NS_DEFAULT 1
+
+#define APIC_BROADCAST 0xFF
+#define X2APIC_BROADCAST 0xFFFFFFFFul
+
+#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
+
+enum lapic_mode {
+ LAPIC_MODE_DISABLED = 0,
+ LAPIC_MODE_INVALID = X2APIC_ENABLE,
+ LAPIC_MODE_XAPIC = MSR_IA32_APICBASE_ENABLE,
+ LAPIC_MODE_X2APIC = MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE,
+};
+
+enum lapic_lvt_entry {
+ LVT_TIMER,
+ LVT_THERMAL_MONITOR,
+ LVT_PERFORMANCE_COUNTER,
+ LVT_LINT0,
+ LVT_LINT1,
+ LVT_ERROR,
+ LVT_CMCI,
+
+ KVM_APIC_MAX_NR_LVT_ENTRIES,
+};
+
+#define APIC_LVTx(x) ((x) == LVT_CMCI ? APIC_LVTCMCI : APIC_LVTT + 0x10 * (x))
+
+struct kvm_timer {
+ struct hrtimer timer;
+ s64 period; /* unit: ns */
+ ktime_t target_expiration;
+ u32 timer_mode;
+ u32 timer_mode_mask;
+ u64 tscdeadline;
+ u64 expired_tscdeadline;
+ u32 timer_advance_ns;
+ atomic_t pending; /* accumulated triggered timers */
+ bool hv_timer_in_use;
+};
+
struct kvm_lapic {
unsigned long base_address;
struct kvm_io_device dev;
struct kvm_timer lapic_timer;
u32 divide_count;
struct kvm_vcpu *vcpu;
- struct page *regs_page;
+ bool apicv_active;
+ bool sw_enabled;
+ bool irr_pending;
+ bool lvt0_in_nmi_mode;
+ /* Select registers in the vAPIC cannot be read/written. */
+ bool guest_apic_protected;
+ /* Number of bits set in ISR. */
+ s16 isr_count;
+ /* The highest vector set in ISR; if -1 - invalid, must scan ISR. */
+ int highest_isr_cache;
+ /**
+ * APIC register page. The layout matches the register layout seen by
+ * the guest 1:1, because it is accessed by the vmx microcode.
+ * Note: Only one register, the TPR, is used by the microcode.
+ */
void *regs;
gpa_t vapic_addr;
- struct page *vapic_page;
+ struct gfn_to_hva_cache vapic_cache;
+ unsigned long pending_events;
+ unsigned int sipi_vector;
+ int nr_lvt_entries;
};
+
+struct dest_map;
+
int kvm_create_lapic(struct kvm_vcpu *vcpu);
void kvm_free_lapic(struct kvm_vcpu *vcpu);
int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
+void kvm_apic_ack_interrupt(struct kvm_vcpu *vcpu, int vector);
int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
-int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
-void kvm_lapic_reset(struct kvm_vcpu *vcpu);
+int kvm_apic_accept_events(struct kvm_vcpu *vcpu);
+void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event);
u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
-void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
-u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
-
-int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
-int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
-int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
-
-u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
-void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
-void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
-int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
-bool kvm_apic_present(struct kvm_vcpu *vcpu);
+void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu);
+void kvm_apic_set_version(struct kvm_vcpu *vcpu);
+void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu);
+bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+ int shorthand, unsigned int dest, int dest_mode);
+void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec);
+bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr);
+bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr);
+void kvm_apic_update_ppr(struct kvm_vcpu *vcpu);
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
+ struct dest_map *dest_map);
+int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
+void kvm_apic_update_apicv(struct kvm_vcpu *vcpu);
+int kvm_alloc_apic_access_page(struct kvm *kvm);
+void kvm_inhibit_apic_access_page(struct kvm_vcpu *vcpu);
+
+bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
+ struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map);
+int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
+ struct kvm_lapic_irq *irq,
+ struct dest_map *dest_map);
+void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high);
+
+int kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value, bool host_initiated);
+int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
+int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
+void kvm_apic_update_hwapic_isr(struct kvm_vcpu *vcpu);
int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
-void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
+u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
+void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data);
+
+void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset);
+void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector);
+
+int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
+int kvm_x2apic_icr_write_fast(struct kvm_lapic *apic, u64 data);
+int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
+
+int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
+
+int kvm_lapic_set_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len);
+void kvm_lapic_exit(void);
+
+u64 kvm_lapic_readable_reg_mask(struct kvm_lapic *apic);
+
+static inline void kvm_lapic_set_irr(int vec, struct kvm_lapic *apic)
+{
+ apic_set_vector(vec, apic->regs + APIC_IRR);
+ /*
+ * irr_pending must be true if any interrupt is pending; set it after
+ * APIC_IRR to avoid race with apic_clear_irr
+ */
+ apic->irr_pending = true;
+}
+
+static inline u32 kvm_lapic_get_reg(struct kvm_lapic *apic, int reg_off)
+{
+ return apic_get_reg(apic->regs, reg_off);
+}
+
+DECLARE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
+
+static inline bool lapic_in_kernel(struct kvm_vcpu *vcpu)
+{
+ if (static_branch_unlikely(&kvm_has_noapic_vcpu))
+ return vcpu->arch.apic;
+ return true;
+}
+
+extern struct static_key_false_deferred apic_hw_disabled;
+
+static inline bool kvm_apic_hw_enabled(struct kvm_lapic *apic)
+{
+ if (static_branch_unlikely(&apic_hw_disabled.key))
+ return apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
+ return true;
+}
+
+extern struct static_key_false_deferred apic_sw_disabled;
+
+static inline bool kvm_apic_sw_enabled(struct kvm_lapic *apic)
+{
+ if (static_branch_unlikely(&apic_sw_disabled.key))
+ return apic->sw_enabled;
+ return true;
+}
+
+static inline bool kvm_apic_present(struct kvm_vcpu *vcpu)
+{
+ return lapic_in_kernel(vcpu) && kvm_apic_hw_enabled(vcpu->arch.apic);
+}
+
+static inline int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
+{
+ return kvm_apic_present(vcpu) && kvm_apic_sw_enabled(vcpu->arch.apic);
+}
+
+static inline int apic_x2apic_mode(struct kvm_lapic *apic)
+{
+ return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
+}
+
+static inline bool kvm_vcpu_apicv_active(struct kvm_vcpu *vcpu)
+{
+ return lapic_in_kernel(vcpu) && vcpu->arch.apic->apicv_active;
+}
+
+static inline bool kvm_apic_has_pending_init_or_sipi(struct kvm_vcpu *vcpu)
+{
+ return lapic_in_kernel(vcpu) && vcpu->arch.apic->pending_events;
+}
+
+static inline bool kvm_apic_init_sipi_allowed(struct kvm_vcpu *vcpu)
+{
+ return !is_smm(vcpu) &&
+ !kvm_x86_call(apic_init_signal_blocked)(vcpu);
+}
+
+static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu)
+{
+ return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+}
+
+bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
+
+void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu);
+
+void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ unsigned long *vcpu_bitmap);
+
+bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ struct kvm_vcpu **dest_vcpu);
+void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu);
+void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu);
+void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu);
+bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu);
+void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu);
+
+static inline enum lapic_mode kvm_apic_mode(u64 apic_base)
+{
+ return apic_base & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE);
+}
+
+static inline enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu)
+{
+ return kvm_apic_mode(vcpu->arch.apic_base);
+}
+
+static inline u8 kvm_xapic_id(struct kvm_lapic *apic)
+{
+ return kvm_lapic_get_reg(apic, APIC_ID) >> 24;
+}
+
#endif
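
The lapic_mode values above are chosen so that kvm_apic_mode() is a plain mask of the two enable bits, and kvm_apic_set_base() in lapic.c rejects the one encoding that maps to LAPIC_MODE_INVALID (x2APIC enable without the global enable bit). A self-contained sketch of that derivation, with the bit positions assumed from msr-index.h and asm/apic.h:

#include <stdio.h>

#define MSR_IA32_APICBASE_ENABLE	(1UL << 11)	/* assumed: global APIC enable */
#define X2APIC_ENABLE			(1UL << 10)	/* assumed: x2APIC (EXTD) enable */

enum lapic_mode {
	LAPIC_MODE_DISABLED = 0,
	LAPIC_MODE_INVALID  = X2APIC_ENABLE,
	LAPIC_MODE_XAPIC    = MSR_IA32_APICBASE_ENABLE,
	LAPIC_MODE_X2APIC   = MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE,
};

/* Mirror of kvm_apic_mode(): the mode is just the two enable bits. */
static enum lapic_mode apic_mode(unsigned long apic_base)
{
	return apic_base & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE);
}

int main(void)
{
	printf("x2apic valid:   %d\n",
	       apic_mode(MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE) == LAPIC_MODE_X2APIC);
	printf("x2apic w/o EN:  %d\n",
	       apic_mode(X2APIC_ENABLE) == LAPIC_MODE_INVALID);
	return 0;
}
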
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
deleted file mode 100644
index 0ef5bb2b4043..000000000000
--- a/arch/x86/kvm/mmu.c
+++ /dev/null
@@ -1,3205 +0,0 @@
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * This module enables machines with Intel VT-x extensions to run virtual
- * machines without emulation or binary translation.
- *
- * MMU support
- *
- * Copyright (C) 2006 Qumranet, Inc.
- *
- * Authors:
- * Yaniv Kamay <yaniv@qumranet.com>
- * Avi Kivity <avi@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
- */
-
-#include "mmu.h"
-
-#include <linux/kvm_host.h>
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/module.h>
-#include <linux/swap.h>
-#include <linux/hugetlb.h>
-#include <linux/compiler.h>
-
-#include <asm/page.h>
-#include <asm/cmpxchg.h>
-#include <asm/io.h>
-#include <asm/vmx.h>
-
-/*
- * When setting this variable to true it enables Two-Dimensional-Paging
- * where the hardware walks 2 page tables:
- * 1. the guest-virtual to guest-physical
- * 2. while doing 1. it walks guest-physical to host-physical
- * If the hardware supports that we don't need to do shadow paging.
- */
-bool tdp_enabled = false;
-
-#undef MMU_DEBUG
-
-#undef AUDIT
-
-#ifdef AUDIT
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
-#else
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
-#endif
-
-#ifdef MMU_DEBUG
-
-#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
-#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
-
-#else
-
-#define pgprintk(x...) do { } while (0)
-#define rmap_printk(x...) do { } while (0)
-
-#endif
-
-#if defined(MMU_DEBUG) || defined(AUDIT)
-static int dbg = 0;
-module_param(dbg, bool, 0644);
-#endif
-
-static int oos_shadow = 1;
-module_param(oos_shadow, bool, 0644);
-
-#ifndef MMU_DEBUG
-#define ASSERT(x) do { } while (0)
-#else
-#define ASSERT(x) \
- if (!(x)) { \
- printk(KERN_WARNING "assertion failed %s:%d: %s\n", \
- __FILE__, __LINE__, #x); \
- }
-#endif
-
-#define PT_FIRST_AVAIL_BITS_SHIFT 9
-#define PT64_SECOND_AVAIL_BITS_SHIFT 52
-
-#define VALID_PAGE(x) ((x) != INVALID_PAGE)
-
-#define PT64_LEVEL_BITS 9
-
-#define PT64_LEVEL_SHIFT(level) \
- (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
-
-#define PT64_LEVEL_MASK(level) \
- (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
-
-#define PT64_INDEX(address, level)\
- (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
-
-
-#define PT32_LEVEL_BITS 10
-
-#define PT32_LEVEL_SHIFT(level) \
- (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
-
-#define PT32_LEVEL_MASK(level) \
- (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
-
-#define PT32_INDEX(address, level)\
- (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
-
-
-#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
-#define PT64_DIR_BASE_ADDR_MASK \
- (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
-
-#define PT32_BASE_ADDR_MASK PAGE_MASK
-#define PT32_DIR_BASE_ADDR_MASK \
- (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
-
-#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
- | PT64_NX_MASK)
-
-#define PFERR_PRESENT_MASK (1U << 0)
-#define PFERR_WRITE_MASK (1U << 1)
-#define PFERR_USER_MASK (1U << 2)
-#define PFERR_RSVD_MASK (1U << 3)
-#define PFERR_FETCH_MASK (1U << 4)
-
-#define PT_DIRECTORY_LEVEL 2
-#define PT_PAGE_TABLE_LEVEL 1
-
-#define RMAP_EXT 4
-
-#define ACC_EXEC_MASK 1
-#define ACC_WRITE_MASK PT_WRITABLE_MASK
-#define ACC_USER_MASK PT_USER_MASK
-#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
-
-#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
-
-struct kvm_rmap_desc {
- u64 *shadow_ptes[RMAP_EXT];
- struct kvm_rmap_desc *more;
-};
-
-struct kvm_shadow_walk_iterator {
- u64 addr;
- hpa_t shadow_addr;
- int level;
- u64 *sptep;
- unsigned index;
-};
-
-#define for_each_shadow_entry(_vcpu, _addr, _walker) \
- for (shadow_walk_init(&(_walker), _vcpu, _addr); \
- shadow_walk_okay(&(_walker)); \
- shadow_walk_next(&(_walker)))
-
-
-struct kvm_unsync_walk {
- int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk);
-};
-
-typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
-
-static struct kmem_cache *pte_chain_cache;
-static struct kmem_cache *rmap_desc_cache;
-static struct kmem_cache *mmu_page_header_cache;
-
-static u64 __read_mostly shadow_trap_nonpresent_pte;
-static u64 __read_mostly shadow_notrap_nonpresent_pte;
-static u64 __read_mostly shadow_base_present_pte;
-static u64 __read_mostly shadow_nx_mask;
-static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
-static u64 __read_mostly shadow_user_mask;
-static u64 __read_mostly shadow_accessed_mask;
-static u64 __read_mostly shadow_dirty_mask;
-
-static inline u64 rsvd_bits(int s, int e)
-{
- return ((1ULL << (e - s + 1)) - 1) << s;
-}
-
-void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
-{
- shadow_trap_nonpresent_pte = trap_pte;
- shadow_notrap_nonpresent_pte = notrap_pte;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
-
-void kvm_mmu_set_base_ptes(u64 base_pte)
-{
- shadow_base_present_pte = base_pte;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
-
-void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
- u64 dirty_mask, u64 nx_mask, u64 x_mask)
-{
- shadow_user_mask = user_mask;
- shadow_accessed_mask = accessed_mask;
- shadow_dirty_mask = dirty_mask;
- shadow_nx_mask = nx_mask;
- shadow_x_mask = x_mask;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
-
-static int is_write_protection(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.cr0 & X86_CR0_WP;
-}
-
-static int is_cpuid_PSE36(void)
-{
- return 1;
-}
-
-static int is_nx(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.shadow_efer & EFER_NX;
-}
-
-static int is_shadow_present_pte(u64 pte)
-{
- return pte != shadow_trap_nonpresent_pte
- && pte != shadow_notrap_nonpresent_pte;
-}
-
-static int is_large_pte(u64 pte)
-{
- return pte & PT_PAGE_SIZE_MASK;
-}
-
-static int is_writeble_pte(unsigned long pte)
-{
- return pte & PT_WRITABLE_MASK;
-}
-
-static int is_dirty_pte(unsigned long pte)
-{
- return pte & shadow_dirty_mask;
-}
-
-static int is_rmap_pte(u64 pte)
-{
- return is_shadow_present_pte(pte);
-}
-
-static pfn_t spte_to_pfn(u64 pte)
-{
- return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
-}
-
-static gfn_t pse36_gfn_delta(u32 gpte)
-{
- int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
-
- return (gpte & PT32_DIR_PSE36_MASK) << shift;
-}
-
-static void set_shadow_pte(u64 *sptep, u64 spte)
-{
-#ifdef CONFIG_X86_64
- set_64bit((unsigned long *)sptep, spte);
-#else
- set_64bit((unsigned long long *)sptep, spte);
-#endif
-}
-
-static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
- struct kmem_cache *base_cache, int min)
-{
- void *obj;
-
- if (cache->nobjs >= min)
- return 0;
- while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
- obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
- if (!obj)
- return -ENOMEM;
- cache->objects[cache->nobjs++] = obj;
- }
- return 0;
-}
-
-static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
-{
- while (mc->nobjs)
- kfree(mc->objects[--mc->nobjs]);
-}
-
-static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
- int min)
-{
- struct page *page;
-
- if (cache->nobjs >= min)
- return 0;
- while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
- page = alloc_page(GFP_KERNEL);
- if (!page)
- return -ENOMEM;
- set_page_private(page, 0);
- cache->objects[cache->nobjs++] = page_address(page);
- }
- return 0;
-}
-
-static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
-{
- while (mc->nobjs)
- free_page((unsigned long)mc->objects[--mc->nobjs]);
-}
-
-static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
-{
- int r;
-
- r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
- pte_chain_cache, 4);
- if (r)
- goto out;
- r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
- rmap_desc_cache, 4);
- if (r)
- goto out;
- r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
- if (r)
- goto out;
- r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
- mmu_page_header_cache, 4);
-out:
- return r;
-}
-
-static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
-{
- mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache);
- mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache);
- mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
- mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
-}
-
-static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
- size_t size)
-{
- void *p;
-
- BUG_ON(!mc->nobjs);
- p = mc->objects[--mc->nobjs];
- return p;
-}
-
-static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
-{
- return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
- sizeof(struct kvm_pte_chain));
-}
-
-static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
-{
- kfree(pc);
-}
-
-static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
-{
- return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
- sizeof(struct kvm_rmap_desc));
-}
-
-static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
-{
- kfree(rd);
-}
-
-/*
- * Return the pointer to the largepage write count for a given
- * gfn, handling slots that are not large page aligned.
- */
-static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot)
-{
- unsigned long idx;
-
- idx = (gfn / KVM_PAGES_PER_HPAGE) -
- (slot->base_gfn / KVM_PAGES_PER_HPAGE);
- return &slot->lpage_info[idx].write_count;
-}
-
-static void account_shadowed(struct kvm *kvm, gfn_t gfn)
-{
- int *write_count;
-
- gfn = unalias_gfn(kvm, gfn);
- write_count = slot_largepage_idx(gfn,
- gfn_to_memslot_unaliased(kvm, gfn));
- *write_count += 1;
-}
-
-static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
-{
- int *write_count;
-
- gfn = unalias_gfn(kvm, gfn);
- write_count = slot_largepage_idx(gfn,
- gfn_to_memslot_unaliased(kvm, gfn));
- *write_count -= 1;
- WARN_ON(*write_count < 0);
-}
-
-static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn)
-{
- struct kvm_memory_slot *slot;
- int *largepage_idx;
-
- gfn = unalias_gfn(kvm, gfn);
- slot = gfn_to_memslot_unaliased(kvm, gfn);
- if (slot) {
- largepage_idx = slot_largepage_idx(gfn, slot);
- return *largepage_idx;
- }
-
- return 1;
-}
-
-static int host_largepage_backed(struct kvm *kvm, gfn_t gfn)
-{
- struct vm_area_struct *vma;
- unsigned long addr;
- int ret = 0;
-
- addr = gfn_to_hva(kvm, gfn);
- if (kvm_is_error_hva(addr))
- return ret;
-
- down_read(&current->mm->mmap_sem);
- vma = find_vma(current->mm, addr);
- if (vma && is_vm_hugetlb_page(vma))
- ret = 1;
- up_read(&current->mm->mmap_sem);
-
- return ret;
-}
-
-static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
-{
- struct kvm_memory_slot *slot;
-
- if (has_wrprotected_page(vcpu->kvm, large_gfn))
- return 0;
-
- if (!host_largepage_backed(vcpu->kvm, large_gfn))
- return 0;
-
- slot = gfn_to_memslot(vcpu->kvm, large_gfn);
- if (slot && slot->dirty_bitmap)
- return 0;
-
- return 1;
-}
-
-/*
- * Take gfn and return the reverse mapping to it.
- * Note: gfn must be unaliased before this function get called
- */
-
-static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage)
-{
- struct kvm_memory_slot *slot;
- unsigned long idx;
-
- slot = gfn_to_memslot(kvm, gfn);
- if (!lpage)
- return &slot->rmap[gfn - slot->base_gfn];
-
- idx = (gfn / KVM_PAGES_PER_HPAGE) -
- (slot->base_gfn / KVM_PAGES_PER_HPAGE);
-
- return &slot->lpage_info[idx].rmap_pde;
-}
-
-/*
- * Reverse mapping data structures:
- *
- * If rmapp bit zero is zero, then rmapp point to the shadw page table entry
- * that points to page_address(page).
- *
- * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc
- * containing more mappings.
- *
- * Returns the number of rmap entries before the spte was added or zero if
- * the spte was not added.
- *
- */
-static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage)
-{
- struct kvm_mmu_page *sp;
- struct kvm_rmap_desc *desc;
- unsigned long *rmapp;
- int i, count = 0;
-
- if (!is_rmap_pte(*spte))
- return count;
- gfn = unalias_gfn(vcpu->kvm, gfn);
- sp = page_header(__pa(spte));
- sp->gfns[spte - sp->spt] = gfn;
- rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage);
- if (!*rmapp) {
- rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
- *rmapp = (unsigned long)spte;
- } else if (!(*rmapp & 1)) {
- rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
- desc = mmu_alloc_rmap_desc(vcpu);
- desc->shadow_ptes[0] = (u64 *)*rmapp;
- desc->shadow_ptes[1] = spte;
- *rmapp = (unsigned long)desc | 1;
- } else {
- rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
- desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
- while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) {
- desc = desc->more;
- count += RMAP_EXT;
- }
- if (desc->shadow_ptes[RMAP_EXT-1]) {
- desc->more = mmu_alloc_rmap_desc(vcpu);
- desc = desc->more;
- }
- for (i = 0; desc->shadow_ptes[i]; ++i)
- ;
- desc->shadow_ptes[i] = spte;
- }
- return count;
-}
-
-static void rmap_desc_remove_entry(unsigned long *rmapp,
- struct kvm_rmap_desc *desc,
- int i,
- struct kvm_rmap_desc *prev_desc)
-{
- int j;
-
- for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
- ;
- desc->shadow_ptes[i] = desc->shadow_ptes[j];
- desc->shadow_ptes[j] = NULL;
- if (j != 0)
- return;
- if (!prev_desc && !desc->more)
- *rmapp = (unsigned long)desc->shadow_ptes[0];
- else
- if (prev_desc)
- prev_desc->more = desc->more;
- else
- *rmapp = (unsigned long)desc->more | 1;
- mmu_free_rmap_desc(desc);
-}
-
-static void rmap_remove(struct kvm *kvm, u64 *spte)
-{
- struct kvm_rmap_desc *desc;
- struct kvm_rmap_desc *prev_desc;
- struct kvm_mmu_page *sp;
- pfn_t pfn;
- unsigned long *rmapp;
- int i;
-
- if (!is_rmap_pte(*spte))
- return;
- sp = page_header(__pa(spte));
- pfn = spte_to_pfn(*spte);
- if (*spte & shadow_accessed_mask)
- kvm_set_pfn_accessed(pfn);
- if (is_writeble_pte(*spte))
- kvm_release_pfn_dirty(pfn);
- else
- kvm_release_pfn_clean(pfn);
- rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], is_large_pte(*spte));
- if (!*rmapp) {
- printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
- BUG();
- } else if (!(*rmapp & 1)) {
- rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte);
- if ((u64 *)*rmapp != spte) {
- printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n",
- spte, *spte);
- BUG();
- }
- *rmapp = 0;
- } else {
- rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte);
- desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
- prev_desc = NULL;
- while (desc) {
- for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
- if (desc->shadow_ptes[i] == spte) {
- rmap_desc_remove_entry(rmapp,
- desc, i,
- prev_desc);
- return;
- }
- prev_desc = desc;
- desc = desc->more;
- }
- BUG();
- }
-}
-
-static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
-{
- struct kvm_rmap_desc *desc;
- struct kvm_rmap_desc *prev_desc;
- u64 *prev_spte;
- int i;
-
- if (!*rmapp)
- return NULL;
- else if (!(*rmapp & 1)) {
- if (!spte)
- return (u64 *)*rmapp;
- return NULL;
- }
- desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
- prev_desc = NULL;
- prev_spte = NULL;
- while (desc) {
- for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) {
- if (prev_spte == spte)
- return desc->shadow_ptes[i];
- prev_spte = desc->shadow_ptes[i];
- }
- desc = desc->more;
- }
- return NULL;
-}
-
-static int rmap_write_protect(struct kvm *kvm, u64 gfn)
-{
- unsigned long *rmapp;
- u64 *spte;
- int write_protected = 0;
-
- gfn = unalias_gfn(kvm, gfn);
- rmapp = gfn_to_rmap(kvm, gfn, 0);
-
- spte = rmap_next(kvm, rmapp, NULL);
- while (spte) {
- BUG_ON(!spte);
- BUG_ON(!(*spte & PT_PRESENT_MASK));
- rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
- if (is_writeble_pte(*spte)) {
- set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
- write_protected = 1;
- }
- spte = rmap_next(kvm, rmapp, spte);
- }
- if (write_protected) {
- pfn_t pfn;
-
- spte = rmap_next(kvm, rmapp, NULL);
- pfn = spte_to_pfn(*spte);
- kvm_set_pfn_dirty(pfn);
- }
-
- /* check for huge page mappings */
- rmapp = gfn_to_rmap(kvm, gfn, 1);
- spte = rmap_next(kvm, rmapp, NULL);
- while (spte) {
- BUG_ON(!spte);
- BUG_ON(!(*spte & PT_PRESENT_MASK));
- BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
- pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
- if (is_writeble_pte(*spte)) {
- rmap_remove(kvm, spte);
- --kvm->stat.lpages;
- set_shadow_pte(spte, shadow_trap_nonpresent_pte);
- spte = NULL;
- write_protected = 1;
- }
- spte = rmap_next(kvm, rmapp, spte);
- }
-
- return write_protected;
-}
-
-static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
-{
- u64 *spte;
- int need_tlb_flush = 0;
-
- while ((spte = rmap_next(kvm, rmapp, NULL))) {
- BUG_ON(!(*spte & PT_PRESENT_MASK));
- rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
- rmap_remove(kvm, spte);
- set_shadow_pte(spte, shadow_trap_nonpresent_pte);
- need_tlb_flush = 1;
- }
- return need_tlb_flush;
-}
-
-static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
- int (*handler)(struct kvm *kvm, unsigned long *rmapp))
-{
- int i;
- int retval = 0;
-
- /*
- * If mmap_sem isn't taken, we can look the memslots with only
- * the mmu_lock by skipping over the slots with userspace_addr == 0.
- */
- for (i = 0; i < kvm->nmemslots; i++) {
- struct kvm_memory_slot *memslot = &kvm->memslots[i];
- unsigned long start = memslot->userspace_addr;
- unsigned long end;
-
- /* mmu_lock protects userspace_addr */
- if (!start)
- continue;
-
- end = start + (memslot->npages << PAGE_SHIFT);
- if (hva >= start && hva < end) {
- gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
- retval |= handler(kvm, &memslot->rmap[gfn_offset]);
- retval |= handler(kvm,
- &memslot->lpage_info[
- gfn_offset /
- KVM_PAGES_PER_HPAGE].rmap_pde);
- }
- }
-
- return retval;
-}
-
-int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
-{
- return kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
-}
-
-static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
-{
- u64 *spte;
- int young = 0;
-
- /* always return old for EPT */
- if (!shadow_accessed_mask)
- return 0;
-
- spte = rmap_next(kvm, rmapp, NULL);
- while (spte) {
- int _young;
- u64 _spte = *spte;
- BUG_ON(!(_spte & PT_PRESENT_MASK));
- _young = _spte & PT_ACCESSED_MASK;
- if (_young) {
- young = 1;
- clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
- }
- spte = rmap_next(kvm, rmapp, spte);
- }
- return young;
-}
-
-#define RMAP_RECYCLE_THRESHOLD 1000
-
-static void rmap_recycle(struct kvm_vcpu *vcpu, gfn_t gfn, int lpage)
-{
- unsigned long *rmapp;
-
- gfn = unalias_gfn(vcpu->kvm, gfn);
- rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage);
-
- kvm_unmap_rmapp(vcpu->kvm, rmapp);
- kvm_flush_remote_tlbs(vcpu->kvm);
-}
-
-int kvm_age_hva(struct kvm *kvm, unsigned long hva)
-{
- return kvm_handle_hva(kvm, hva, kvm_age_rmapp);
-}
-
-#ifdef MMU_DEBUG
-static int is_empty_shadow_page(u64 *spt)
-{
- u64 *pos;
- u64 *end;
-
- for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
- if (is_shadow_present_pte(*pos)) {
- printk(KERN_ERR "%s: %p %llx\n", __func__,
- pos, *pos);
- return 0;
- }
- return 1;
-}
-#endif
-
-static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- ASSERT(is_empty_shadow_page(sp->spt));
- list_del(&sp->link);
- __free_page(virt_to_page(sp->spt));
- __free_page(virt_to_page(sp->gfns));
- kfree(sp);
- ++kvm->arch.n_free_mmu_pages;
-}
-
-static unsigned kvm_page_table_hashfn(gfn_t gfn)
-{
- return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
-}
-
-static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
- u64 *parent_pte)
-{
- struct kvm_mmu_page *sp;
-
- sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
- sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
- sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
- set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
- list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
- INIT_LIST_HEAD(&sp->oos_link);
- bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
- sp->multimapped = 0;
- sp->parent_pte = parent_pte;
- --vcpu->kvm->arch.n_free_mmu_pages;
- return sp;
-}
-
-static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp, u64 *parent_pte)
-{
- struct kvm_pte_chain *pte_chain;
- struct hlist_node *node;
- int i;
-
- if (!parent_pte)
- return;
- if (!sp->multimapped) {
- u64 *old = sp->parent_pte;
-
- if (!old) {
- sp->parent_pte = parent_pte;
- return;
- }
- sp->multimapped = 1;
- pte_chain = mmu_alloc_pte_chain(vcpu);
- INIT_HLIST_HEAD(&sp->parent_ptes);
- hlist_add_head(&pte_chain->link, &sp->parent_ptes);
- pte_chain->parent_ptes[0] = old;
- }
- hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
- if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
- continue;
- for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
- if (!pte_chain->parent_ptes[i]) {
- pte_chain->parent_ptes[i] = parent_pte;
- return;
- }
- }
- pte_chain = mmu_alloc_pte_chain(vcpu);
- BUG_ON(!pte_chain);
- hlist_add_head(&pte_chain->link, &sp->parent_ptes);
- pte_chain->parent_ptes[0] = parent_pte;
-}
-
-static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
- u64 *parent_pte)
-{
- struct kvm_pte_chain *pte_chain;
- struct hlist_node *node;
- int i;
-
- if (!sp->multimapped) {
- BUG_ON(sp->parent_pte != parent_pte);
- sp->parent_pte = NULL;
- return;
- }
- hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
- for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
- if (!pte_chain->parent_ptes[i])
- break;
- if (pte_chain->parent_ptes[i] != parent_pte)
- continue;
- while (i + 1 < NR_PTE_CHAIN_ENTRIES
- && pte_chain->parent_ptes[i + 1]) {
- pte_chain->parent_ptes[i]
- = pte_chain->parent_ptes[i + 1];
- ++i;
- }
- pte_chain->parent_ptes[i] = NULL;
- if (i == 0) {
- hlist_del(&pte_chain->link);
- mmu_free_pte_chain(pte_chain);
- if (hlist_empty(&sp->parent_ptes)) {
- sp->multimapped = 0;
- sp->parent_pte = NULL;
- }
- }
- return;
- }
- BUG();
-}
-
-
-static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- mmu_parent_walk_fn fn)
-{
- struct kvm_pte_chain *pte_chain;
- struct hlist_node *node;
- struct kvm_mmu_page *parent_sp;
- int i;
-
- if (!sp->multimapped && sp->parent_pte) {
- parent_sp = page_header(__pa(sp->parent_pte));
- fn(vcpu, parent_sp);
- mmu_parent_walk(vcpu, parent_sp, fn);
- return;
- }
- hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
- for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
- if (!pte_chain->parent_ptes[i])
- break;
- parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
- fn(vcpu, parent_sp);
- mmu_parent_walk(vcpu, parent_sp, fn);
- }
-}
-
-static void kvm_mmu_update_unsync_bitmap(u64 *spte)
-{
- unsigned int index;
- struct kvm_mmu_page *sp = page_header(__pa(spte));
-
- index = spte - sp->spt;
- if (!__test_and_set_bit(index, sp->unsync_child_bitmap))
- sp->unsync_children++;
- WARN_ON(!sp->unsync_children);
-}
-
-static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
-{
- struct kvm_pte_chain *pte_chain;
- struct hlist_node *node;
- int i;
-
- if (!sp->parent_pte)
- return;
-
- if (!sp->multimapped) {
- kvm_mmu_update_unsync_bitmap(sp->parent_pte);
- return;
- }
-
- hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
- for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
- if (!pte_chain->parent_ptes[i])
- break;
- kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]);
- }
-}
-
-static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
-{
- kvm_mmu_update_parents_unsync(sp);
- return 1;
-}
-
-static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp)
-{
- mmu_parent_walk(vcpu, sp, unsync_walk_fn);
- kvm_mmu_update_parents_unsync(sp);
-}
-
-static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp)
-{
- int i;
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
- sp->spt[i] = shadow_trap_nonpresent_pte;
-}
-
-static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp)
-{
- return 1;
-}
-
-static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
-{
-}
-
-#define KVM_PAGE_ARRAY_NR 16
-
-struct kvm_mmu_pages {
- struct mmu_page_and_offset {
- struct kvm_mmu_page *sp;
- unsigned int idx;
- } page[KVM_PAGE_ARRAY_NR];
- unsigned int nr;
-};
-
-#define for_each_unsync_children(bitmap, idx) \
- for (idx = find_first_bit(bitmap, 512); \
- idx < 512; \
- idx = find_next_bit(bitmap, 512, idx+1))
-
-static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
- int idx)
-{
- int i;
-
- if (sp->unsync)
- for (i=0; i < pvec->nr; i++)
- if (pvec->page[i].sp == sp)
- return 0;
-
- pvec->page[pvec->nr].sp = sp;
- pvec->page[pvec->nr].idx = idx;
- pvec->nr++;
- return (pvec->nr == KVM_PAGE_ARRAY_NR);
-}
-
-static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
- struct kvm_mmu_pages *pvec)
-{
- int i, ret, nr_unsync_leaf = 0;
-
- for_each_unsync_children(sp->unsync_child_bitmap, i) {
- u64 ent = sp->spt[i];
-
- if (is_shadow_present_pte(ent) && !is_large_pte(ent)) {
- struct kvm_mmu_page *child;
- child = page_header(ent & PT64_BASE_ADDR_MASK);
-
- if (child->unsync_children) {
- if (mmu_pages_add(pvec, child, i))
- return -ENOSPC;
-
- ret = __mmu_unsync_walk(child, pvec);
- if (!ret)
- __clear_bit(i, sp->unsync_child_bitmap);
- else if (ret > 0)
- nr_unsync_leaf += ret;
- else
- return ret;
- }
-
- if (child->unsync) {
- nr_unsync_leaf++;
- if (mmu_pages_add(pvec, child, i))
- return -ENOSPC;
- }
- }
- }
-
- if (find_first_bit(sp->unsync_child_bitmap, 512) == 512)
- sp->unsync_children = 0;
-
- return nr_unsync_leaf;
-}
-
-static int mmu_unsync_walk(struct kvm_mmu_page *sp,
- struct kvm_mmu_pages *pvec)
-{
- if (!sp->unsync_children)
- return 0;
-
- mmu_pages_add(pvec, sp, 0);
- return __mmu_unsync_walk(sp, pvec);
-}
-
-static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
-{
- unsigned index;
- struct hlist_head *bucket;
- struct kvm_mmu_page *sp;
- struct hlist_node *node;
-
- pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
- index = kvm_page_table_hashfn(gfn);
- bucket = &kvm->arch.mmu_page_hash[index];
- hlist_for_each_entry(sp, node, bucket, hash_link)
- if (sp->gfn == gfn && !sp->role.direct
- && !sp->role.invalid) {
- pgprintk("%s: found role %x\n",
- __func__, sp->role.word);
- return sp;
- }
- return NULL;
-}
-
-static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- WARN_ON(!sp->unsync);
- sp->unsync = 0;
- --kvm->stat.mmu_unsync;
-}
-
-static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
-
-static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
-{
- if (sp->role.glevels != vcpu->arch.mmu.root_level) {
- kvm_mmu_zap_page(vcpu->kvm, sp);
- return 1;
- }
-
- if (rmap_write_protect(vcpu->kvm, sp->gfn))
- kvm_flush_remote_tlbs(vcpu->kvm);
- kvm_unlink_unsync_page(vcpu->kvm, sp);
- if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
- kvm_mmu_zap_page(vcpu->kvm, sp);
- return 1;
- }
-
- kvm_mmu_flush_tlb(vcpu);
- return 0;
-}
-
-struct mmu_page_path {
- struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
- unsigned int idx[PT64_ROOT_LEVEL-1];
-};
-
-#define for_each_sp(pvec, sp, parents, i) \
- for (i = mmu_pages_next(&pvec, &parents, -1), \
- sp = pvec.page[i].sp; \
- i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \
- i = mmu_pages_next(&pvec, &parents, i))
-
-static int mmu_pages_next(struct kvm_mmu_pages *pvec,
- struct mmu_page_path *parents,
- int i)
-{
- int n;
-
- for (n = i+1; n < pvec->nr; n++) {
- struct kvm_mmu_page *sp = pvec->page[n].sp;
-
- if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
- parents->idx[0] = pvec->page[n].idx;
- return n;
- }
-
- parents->parent[sp->role.level-2] = sp;
- parents->idx[sp->role.level-1] = pvec->page[n].idx;
- }
-
- return n;
-}
-
-static void mmu_pages_clear_parents(struct mmu_page_path *parents)
-{
- struct kvm_mmu_page *sp;
- unsigned int level = 0;
-
- do {
- unsigned int idx = parents->idx[level];
-
- sp = parents->parent[level];
- if (!sp)
- return;
-
- --sp->unsync_children;
- WARN_ON((int)sp->unsync_children < 0);
- __clear_bit(idx, sp->unsync_child_bitmap);
- level++;
- } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
-}
-
-static void kvm_mmu_pages_init(struct kvm_mmu_page *parent,
- struct mmu_page_path *parents,
- struct kvm_mmu_pages *pvec)
-{
- parents->parent[parent->role.level-1] = NULL;
- pvec->nr = 0;
-}
-
-static void mmu_sync_children(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *parent)
-{
- int i;
- struct kvm_mmu_page *sp;
- struct mmu_page_path parents;
- struct kvm_mmu_pages pages;
-
- kvm_mmu_pages_init(parent, &parents, &pages);
- while (mmu_unsync_walk(parent, &pages)) {
- int protected = 0;
-
- for_each_sp(pages, sp, parents, i)
- protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
-
- if (protected)
- kvm_flush_remote_tlbs(vcpu->kvm);
-
- for_each_sp(pages, sp, parents, i) {
- kvm_sync_page(vcpu, sp);
- mmu_pages_clear_parents(&parents);
- }
- cond_resched_lock(&vcpu->kvm->mmu_lock);
- kvm_mmu_pages_init(parent, &parents, &pages);
- }
-}
-
-static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
- gfn_t gfn,
- gva_t gaddr,
- unsigned level,
- int direct,
- unsigned access,
- u64 *parent_pte)
-{
- union kvm_mmu_page_role role;
- unsigned index;
- unsigned quadrant;
- struct hlist_head *bucket;
- struct kvm_mmu_page *sp;
- struct hlist_node *node, *tmp;
-
- role = vcpu->arch.mmu.base_role;
- role.level = level;
- role.direct = direct;
- role.access = access;
- if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
- quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
- quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
- role.quadrant = quadrant;
- }
- pgprintk("%s: looking gfn %lx role %x\n", __func__,
- gfn, role.word);
- index = kvm_page_table_hashfn(gfn);
- bucket = &vcpu->kvm->arch.mmu_page_hash[index];
- hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
- if (sp->gfn == gfn) {
- if (sp->unsync)
- if (kvm_sync_page(vcpu, sp))
- continue;
-
- if (sp->role.word != role.word)
- continue;
-
- mmu_page_add_parent_pte(vcpu, sp, parent_pte);
- if (sp->unsync_children) {
- set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
- kvm_mmu_mark_parents_unsync(vcpu, sp);
- }
- pgprintk("%s: found\n", __func__);
- return sp;
- }
- ++vcpu->kvm->stat.mmu_cache_miss;
- sp = kvm_mmu_alloc_page(vcpu, parent_pte);
- if (!sp)
- return sp;
- pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word);
- sp->gfn = gfn;
- sp->role = role;
- hlist_add_head(&sp->hash_link, bucket);
- if (!direct) {
- if (rmap_write_protect(vcpu->kvm, gfn))
- kvm_flush_remote_tlbs(vcpu->kvm);
- account_shadowed(vcpu->kvm, gfn);
- }
- if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
- vcpu->arch.mmu.prefetch_page(vcpu, sp);
- else
- nonpaging_prefetch_page(vcpu, sp);
- return sp;
-}
-
-static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
- struct kvm_vcpu *vcpu, u64 addr)
-{
- iterator->addr = addr;
- iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
- iterator->level = vcpu->arch.mmu.shadow_root_level;
- if (iterator->level == PT32E_ROOT_LEVEL) {
- iterator->shadow_addr
- = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
- iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
- --iterator->level;
- if (!iterator->shadow_addr)
- iterator->level = 0;
- }
-}
-
-static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
-{
- if (iterator->level < PT_PAGE_TABLE_LEVEL)
- return false;
- iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
- iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
- return true;
-}
-
-static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
-{
- iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK;
- --iterator->level;
-}
-
-static void kvm_mmu_page_unlink_children(struct kvm *kvm,
- struct kvm_mmu_page *sp)
-{
- unsigned i;
- u64 *pt;
- u64 ent;
-
- pt = sp->spt;
-
- if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- if (is_shadow_present_pte(pt[i]))
- rmap_remove(kvm, &pt[i]);
- pt[i] = shadow_trap_nonpresent_pte;
- }
- return;
- }
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- ent = pt[i];
-
- if (is_shadow_present_pte(ent)) {
- if (!is_large_pte(ent)) {
- ent &= PT64_BASE_ADDR_MASK;
- mmu_page_remove_parent_pte(page_header(ent),
- &pt[i]);
- } else {
- --kvm->stat.lpages;
- rmap_remove(kvm, &pt[i]);
- }
- }
- pt[i] = shadow_trap_nonpresent_pte;
- }
-}
-
-static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
-{
- mmu_page_remove_parent_pte(sp, parent_pte);
-}
-
-static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
-{
- int i;
-
- for (i = 0; i < KVM_MAX_VCPUS; ++i)
- if (kvm->vcpus[i])
- kvm->vcpus[i]->arch.last_pte_updated = NULL;
-}
-
-static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- u64 *parent_pte;
-
- while (sp->multimapped || sp->parent_pte) {
- if (!sp->multimapped)
- parent_pte = sp->parent_pte;
- else {
- struct kvm_pte_chain *chain;
-
- chain = container_of(sp->parent_ptes.first,
- struct kvm_pte_chain, link);
- parent_pte = chain->parent_ptes[0];
- }
- BUG_ON(!parent_pte);
- kvm_mmu_put_page(sp, parent_pte);
- set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
- }
-}
-
-static int mmu_zap_unsync_children(struct kvm *kvm,
- struct kvm_mmu_page *parent)
-{
- int i, zapped = 0;
- struct mmu_page_path parents;
- struct kvm_mmu_pages pages;
-
- if (parent->role.level == PT_PAGE_TABLE_LEVEL)
- return 0;
-
- kvm_mmu_pages_init(parent, &parents, &pages);
- while (mmu_unsync_walk(parent, &pages)) {
- struct kvm_mmu_page *sp;
-
- for_each_sp(pages, sp, parents, i) {
- kvm_mmu_zap_page(kvm, sp);
- mmu_pages_clear_parents(&parents);
- }
- zapped += pages.nr;
- kvm_mmu_pages_init(parent, &parents, &pages);
- }
-
- return zapped;
-}
-
-static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- int ret;
- ++kvm->stat.mmu_shadow_zapped;
- ret = mmu_zap_unsync_children(kvm, sp);
- kvm_mmu_page_unlink_children(kvm, sp);
- kvm_mmu_unlink_parents(kvm, sp);
- kvm_flush_remote_tlbs(kvm);
- if (!sp->role.invalid && !sp->role.direct)
- unaccount_shadowed(kvm, sp->gfn);
- if (sp->unsync)
- kvm_unlink_unsync_page(kvm, sp);
- if (!sp->root_count) {
- hlist_del(&sp->hash_link);
- kvm_mmu_free_page(kvm, sp);
- } else {
- sp->role.invalid = 1;
- list_move(&sp->link, &kvm->arch.active_mmu_pages);
- kvm_reload_remote_mmus(kvm);
- }
- kvm_mmu_reset_last_pte_updated(kvm);
- return ret;
-}
-
-/*
- * Changing the number of mmu pages allocated to the vm.
- * Note: if kvm_nr_mmu_pages is too small, you will get a deadlock.
- */
-void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
-{
- int used_pages;
-
- used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;
- used_pages = max(0, used_pages);
-
-	/*
-	 * If we set the number of mmu pages to be smaller than the
-	 * number of active pages, we must free some mmu pages before we
-	 * change the value.
-	 */
-
- if (used_pages > kvm_nr_mmu_pages) {
- while (used_pages > kvm_nr_mmu_pages) {
- struct kvm_mmu_page *page;
-
- page = container_of(kvm->arch.active_mmu_pages.prev,
- struct kvm_mmu_page, link);
- kvm_mmu_zap_page(kvm, page);
- used_pages--;
- }
- kvm->arch.n_free_mmu_pages = 0;
- }
- else
- kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
- - kvm->arch.n_alloc_mmu_pages;
-
- kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages;
-}
-
-static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
-{
- unsigned index;
- struct hlist_head *bucket;
- struct kvm_mmu_page *sp;
- struct hlist_node *node, *n;
- int r;
-
- pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
- r = 0;
- index = kvm_page_table_hashfn(gfn);
- bucket = &kvm->arch.mmu_page_hash[index];
- hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
- if (sp->gfn == gfn && !sp->role.direct) {
- pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
- sp->role.word);
- r = 1;
- if (kvm_mmu_zap_page(kvm, sp))
- n = bucket->first;
- }
- return r;
-}
-
-static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
-{
- unsigned index;
- struct hlist_head *bucket;
- struct kvm_mmu_page *sp;
- struct hlist_node *node, *nn;
-
- index = kvm_page_table_hashfn(gfn);
- bucket = &kvm->arch.mmu_page_hash[index];
- hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) {
- if (sp->gfn == gfn && !sp->role.direct
- && !sp->role.invalid) {
- pgprintk("%s: zap %lx %x\n",
- __func__, gfn, sp->role.word);
- kvm_mmu_zap_page(kvm, sp);
- }
- }
-}
-
-static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
-{
- int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn));
- struct kvm_mmu_page *sp = page_header(__pa(pte));
-
- __set_bit(slot, sp->slot_bitmap);
-}
-
-static void mmu_convert_notrap(struct kvm_mmu_page *sp)
-{
- int i;
- u64 *pt = sp->spt;
-
- if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
- return;
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- if (pt[i] == shadow_notrap_nonpresent_pte)
- set_shadow_pte(&pt[i], shadow_trap_nonpresent_pte);
- }
-}
-
-struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
-{
- struct page *page;
-
- gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
-
- if (gpa == UNMAPPED_GVA)
- return NULL;
-
- page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
-
- return page;
-}
-
-/*
- * The function is based on mtrr_type_lookup() in
- * arch/x86/kernel/cpu/mtrr/generic.c
- */
-static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
- u64 start, u64 end)
-{
- int i;
- u64 base, mask;
- u8 prev_match, curr_match;
- int num_var_ranges = KVM_NR_VAR_MTRR;
-
- if (!mtrr_state->enabled)
- return 0xFF;
-
-	/* Make end inclusive, instead of exclusive */
- end--;
-
- /* Look in fixed ranges. Just return the type as per start */
- if (mtrr_state->have_fixed && (start < 0x100000)) {
- int idx;
-
- if (start < 0x80000) {
- idx = 0;
- idx += (start >> 16);
- return mtrr_state->fixed_ranges[idx];
- } else if (start < 0xC0000) {
- idx = 1 * 8;
- idx += ((start - 0x80000) >> 14);
- return mtrr_state->fixed_ranges[idx];
- } else if (start < 0x1000000) {
- idx = 3 * 8;
- idx += ((start - 0xC0000) >> 12);
- return mtrr_state->fixed_ranges[idx];
- }
- }
-
-	/*
-	 * Look in variable ranges:
-	 * look for multiple ranges matching this address and pick the type
-	 * as per MTRR precedence.
-	 */
- if (!(mtrr_state->enabled & 2))
- return mtrr_state->def_type;
-
- prev_match = 0xFF;
- for (i = 0; i < num_var_ranges; ++i) {
- unsigned short start_state, end_state;
-
- if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11)))
- continue;
-
- base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) +
- (mtrr_state->var_ranges[i].base_lo & PAGE_MASK);
- mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) +
- (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK);
-
- start_state = ((start & mask) == (base & mask));
- end_state = ((end & mask) == (base & mask));
- if (start_state != end_state)
- return 0xFE;
-
- if ((start & mask) != (base & mask))
- continue;
-
- curr_match = mtrr_state->var_ranges[i].base_lo & 0xff;
- if (prev_match == 0xFF) {
- prev_match = curr_match;
- continue;
- }
-
- if (prev_match == MTRR_TYPE_UNCACHABLE ||
- curr_match == MTRR_TYPE_UNCACHABLE)
- return MTRR_TYPE_UNCACHABLE;
-
- if ((prev_match == MTRR_TYPE_WRBACK &&
- curr_match == MTRR_TYPE_WRTHROUGH) ||
- (prev_match == MTRR_TYPE_WRTHROUGH &&
- curr_match == MTRR_TYPE_WRBACK)) {
- prev_match = MTRR_TYPE_WRTHROUGH;
- curr_match = MTRR_TYPE_WRTHROUGH;
- }
-
- if (prev_match != curr_match)
- return MTRR_TYPE_UNCACHABLE;
- }
-
- if (prev_match != 0xFF)
- return prev_match;
-
- return mtrr_state->def_type;
-}
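
In the variable-range loop of get_mtrr_type() above, coverage reduces to one masked comparison per range, (addr & mask) == (base & mask), and a lookup whose start and end fall on different sides of a range boundary is rejected with 0xFE. A small self-contained example of that base/mask test, using made-up values rather than real MSR contents:

#include <stdint.h>
#include <stdio.h>

/* One variable MTRR, already assembled from its base/mask MSR pair. */
struct var_mtrr {
	uint64_t base;	/* physical base of the range */
	uint64_t mask;	/* valid-address mask */
	uint8_t type;	/* memory type, e.g. 6 for write-back */
};

/* Same per-range test get_mtrr_type() applies: masked address equality. */
static int mtrr_covers(const struct var_mtrr *m, uint64_t addr)
{
	return (addr & m->mask) == (m->base & m->mask);
}

int main(void)
{
	/* Hypothetical 256MB write-back range at 0: mask keeps bits >= 2^28. */
	struct var_mtrr wb = { .base = 0x0, .mask = ~((1ULL << 28) - 1), .type = 6 };

	printf("0x08000000 covered: %d\n", mtrr_covers(&wb, 0x08000000ULL));	/* 1 */
	printf("0x10000000 covered: %d\n", mtrr_covers(&wb, 0x10000000ULL));	/* 0 */
	return 0;
}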
-
-u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
- u8 mtrr;
-
- mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT,
- (gfn << PAGE_SHIFT) + PAGE_SIZE);
- if (mtrr == 0xfe || mtrr == 0xff)
- mtrr = MTRR_TYPE_WRBACK;
- return mtrr;
-}
-EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
-
-static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
-{
- unsigned index;
- struct hlist_head *bucket;
- struct kvm_mmu_page *s;
- struct hlist_node *node, *n;
-
- index = kvm_page_table_hashfn(sp->gfn);
- bucket = &vcpu->kvm->arch.mmu_page_hash[index];
- /* don't unsync if pagetable is shadowed with multiple roles */
- hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
- if (s->gfn != sp->gfn || s->role.direct)
- continue;
- if (s->role.word != sp->role.word)
- return 1;
- }
- ++vcpu->kvm->stat.mmu_unsync;
- sp->unsync = 1;
-
- kvm_mmu_mark_parents_unsync(vcpu, sp);
-
- mmu_convert_notrap(sp);
- return 0;
-}
-
-static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
- bool can_unsync)
-{
- struct kvm_mmu_page *shadow;
-
- shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
- if (shadow) {
- if (shadow->role.level != PT_PAGE_TABLE_LEVEL)
- return 1;
- if (shadow->unsync)
- return 0;
- if (can_unsync && oos_shadow)
- return kvm_unsync_page(vcpu, shadow);
- return 1;
- }
- return 0;
-}
-
-static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
- unsigned pte_access, int user_fault,
- int write_fault, int dirty, int largepage,
- gfn_t gfn, pfn_t pfn, bool speculative,
- bool can_unsync)
-{
- u64 spte;
- int ret = 0;
-
- /*
- * We don't set the accessed bit, since we sometimes want to see
- * whether the guest actually used the pte (in order to detect
- * demand paging).
- */
- spte = shadow_base_present_pte | shadow_dirty_mask;
- if (!speculative)
- spte |= shadow_accessed_mask;
- if (!dirty)
- pte_access &= ~ACC_WRITE_MASK;
- if (pte_access & ACC_EXEC_MASK)
- spte |= shadow_x_mask;
- else
- spte |= shadow_nx_mask;
- if (pte_access & ACC_USER_MASK)
- spte |= shadow_user_mask;
- if (largepage)
- spte |= PT_PAGE_SIZE_MASK;
- if (tdp_enabled)
- spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
- kvm_is_mmio_pfn(pfn));
-
- spte |= (u64)pfn << PAGE_SHIFT;
-
- if ((pte_access & ACC_WRITE_MASK)
- || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
-
- if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) {
- ret = 1;
- spte = shadow_trap_nonpresent_pte;
- goto set_pte;
- }
-
- spte |= PT_WRITABLE_MASK;
-
-		/*
-		 * Optimization: for pte sync, if the spte was writable the hash
-		 * lookup is unnecessary (and expensive). Write protection
-		 * is the responsibility of mmu_get_page / kvm_sync_page.
-		 * The same reasoning can be applied to dirty page accounting.
-		 */
- if (!can_unsync && is_writeble_pte(*shadow_pte))
- goto set_pte;
-
- if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
- pgprintk("%s: found shadow page for %lx, marking ro\n",
- __func__, gfn);
- ret = 1;
- pte_access &= ~ACC_WRITE_MASK;
- if (is_writeble_pte(spte))
- spte &= ~PT_WRITABLE_MASK;
- }
- }
-
- if (pte_access & ACC_WRITE_MASK)
- mark_page_dirty(vcpu->kvm, gfn);
-
-set_pte:
- set_shadow_pte(shadow_pte, spte);
- return ret;
-}
-
-static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
- unsigned pt_access, unsigned pte_access,
- int user_fault, int write_fault, int dirty,
- int *ptwrite, int largepage, gfn_t gfn,
- pfn_t pfn, bool speculative)
-{
- int was_rmapped = 0;
- int was_writeble = is_writeble_pte(*shadow_pte);
- int rmap_count;
-
- pgprintk("%s: spte %llx access %x write_fault %d"
- " user_fault %d gfn %lx\n",
- __func__, *shadow_pte, pt_access,
- write_fault, user_fault, gfn);
-
- if (is_rmap_pte(*shadow_pte)) {
- /*
- * If we overwrite a PTE page pointer with a 2MB PMD, unlink
- * the parent of the now unreachable PTE.
- */
- if (largepage && !is_large_pte(*shadow_pte)) {
- struct kvm_mmu_page *child;
- u64 pte = *shadow_pte;
-
- child = page_header(pte & PT64_BASE_ADDR_MASK);
- mmu_page_remove_parent_pte(child, shadow_pte);
- } else if (pfn != spte_to_pfn(*shadow_pte)) {
- pgprintk("hfn old %lx new %lx\n",
- spte_to_pfn(*shadow_pte), pfn);
- rmap_remove(vcpu->kvm, shadow_pte);
- } else
- was_rmapped = 1;
- }
- if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
- dirty, largepage, gfn, pfn, speculative, true)) {
- if (write_fault)
- *ptwrite = 1;
- kvm_x86_ops->tlb_flush(vcpu);
- }
-
- pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte);
- pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
- is_large_pte(*shadow_pte)? "2MB" : "4kB",
- is_present_pte(*shadow_pte)?"RW":"R", gfn,
- *shadow_pte, shadow_pte);
- if (!was_rmapped && is_large_pte(*shadow_pte))
- ++vcpu->kvm->stat.lpages;
-
- page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
- if (!was_rmapped) {
- rmap_count = rmap_add(vcpu, shadow_pte, gfn, largepage);
- if (!is_rmap_pte(*shadow_pte))
- kvm_release_pfn_clean(pfn);
- if (rmap_count > RMAP_RECYCLE_THRESHOLD)
- rmap_recycle(vcpu, gfn, largepage);
- } else {
- if (was_writeble)
- kvm_release_pfn_dirty(pfn);
- else
- kvm_release_pfn_clean(pfn);
- }
- if (speculative) {
- vcpu->arch.last_pte_updated = shadow_pte;
- vcpu->arch.last_pte_gfn = gfn;
- }
-}
-
-static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
-{
-}
-
-static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
- int largepage, gfn_t gfn, pfn_t pfn)
-{
- struct kvm_shadow_walk_iterator iterator;
- struct kvm_mmu_page *sp;
- int pt_write = 0;
- gfn_t pseudo_gfn;
-
- for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
- if (iterator.level == PT_PAGE_TABLE_LEVEL
- || (largepage && iterator.level == PT_DIRECTORY_LEVEL)) {
- mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
- 0, write, 1, &pt_write,
- largepage, gfn, pfn, false);
- ++vcpu->stat.pf_fixed;
- break;
- }
-
- if (*iterator.sptep == shadow_trap_nonpresent_pte) {
- pseudo_gfn = (iterator.addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
- sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
- iterator.level - 1,
- 1, ACC_ALL, iterator.sptep);
- if (!sp) {
- pgprintk("nonpaging_map: ENOMEM\n");
- kvm_release_pfn_clean(pfn);
- return -ENOMEM;
- }
-
- set_shadow_pte(iterator.sptep,
- __pa(sp->spt)
- | PT_PRESENT_MASK | PT_WRITABLE_MASK
- | shadow_user_mask | shadow_x_mask);
- }
- }
- return pt_write;
-}
-
-static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
-{
- int r;
- int largepage = 0;
- pfn_t pfn;
- unsigned long mmu_seq;
-
- if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
- gfn &= ~(KVM_PAGES_PER_HPAGE-1);
- largepage = 1;
- }
-
- mmu_seq = vcpu->kvm->mmu_notifier_seq;
- smp_rmb();
- pfn = gfn_to_pfn(vcpu->kvm, gfn);
-
- /* mmio */
- if (is_error_pfn(pfn)) {
- kvm_release_pfn_clean(pfn);
- return 1;
- }
-
- spin_lock(&vcpu->kvm->mmu_lock);
- if (mmu_notifier_retry(vcpu, mmu_seq))
- goto out_unlock;
- kvm_mmu_free_some_pages(vcpu);
- r = __direct_map(vcpu, v, write, largepage, gfn, pfn);
- spin_unlock(&vcpu->kvm->mmu_lock);
-
-
- return r;
-
-out_unlock:
- spin_unlock(&vcpu->kvm->mmu_lock);
- kvm_release_pfn_clean(pfn);
- return 0;
-}
-
-
-static void mmu_free_roots(struct kvm_vcpu *vcpu)
-{
- int i;
- struct kvm_mmu_page *sp;
-
- if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
- return;
- spin_lock(&vcpu->kvm->mmu_lock);
- if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
- hpa_t root = vcpu->arch.mmu.root_hpa;
-
- sp = page_header(root);
- --sp->root_count;
- if (!sp->root_count && sp->role.invalid)
- kvm_mmu_zap_page(vcpu->kvm, sp);
- vcpu->arch.mmu.root_hpa = INVALID_PAGE;
- spin_unlock(&vcpu->kvm->mmu_lock);
- return;
- }
- for (i = 0; i < 4; ++i) {
- hpa_t root = vcpu->arch.mmu.pae_root[i];
-
- if (root) {
- root &= PT64_BASE_ADDR_MASK;
- sp = page_header(root);
- --sp->root_count;
- if (!sp->root_count && sp->role.invalid)
- kvm_mmu_zap_page(vcpu->kvm, sp);
- }
- vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
- }
- spin_unlock(&vcpu->kvm->mmu_lock);
- vcpu->arch.mmu.root_hpa = INVALID_PAGE;
-}
-
-static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
-{
- int ret = 0;
-
- if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
- set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
- ret = 1;
- }
-
- return ret;
-}
-
-static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
-{
- int i;
- gfn_t root_gfn;
- struct kvm_mmu_page *sp;
- int direct = 0;
-
- root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
-
- if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
- hpa_t root = vcpu->arch.mmu.root_hpa;
-
- ASSERT(!VALID_PAGE(root));
- if (tdp_enabled)
- direct = 1;
- if (mmu_check_root(vcpu, root_gfn))
- return 1;
- sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
- PT64_ROOT_LEVEL, direct,
- ACC_ALL, NULL);
- root = __pa(sp->spt);
- ++sp->root_count;
- vcpu->arch.mmu.root_hpa = root;
- return 0;
- }
- direct = !is_paging(vcpu);
- if (tdp_enabled)
- direct = 1;
- for (i = 0; i < 4; ++i) {
- hpa_t root = vcpu->arch.mmu.pae_root[i];
-
- ASSERT(!VALID_PAGE(root));
- if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
- if (!is_present_pte(vcpu->arch.pdptrs[i])) {
- vcpu->arch.mmu.pae_root[i] = 0;
- continue;
- }
- root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT;
- } else if (vcpu->arch.mmu.root_level == 0)
- root_gfn = 0;
- if (mmu_check_root(vcpu, root_gfn))
- return 1;
- sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
- PT32_ROOT_LEVEL, direct,
- ACC_ALL, NULL);
- root = __pa(sp->spt);
- ++sp->root_count;
- vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
- }
- vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
- return 0;
-}
-
-static void mmu_sync_roots(struct kvm_vcpu *vcpu)
-{
- int i;
- struct kvm_mmu_page *sp;
-
- if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
- return;
- if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
- hpa_t root = vcpu->arch.mmu.root_hpa;
- sp = page_header(root);
- mmu_sync_children(vcpu, sp);
- return;
- }
- for (i = 0; i < 4; ++i) {
- hpa_t root = vcpu->arch.mmu.pae_root[i];
-
- if (root && VALID_PAGE(root)) {
- root &= PT64_BASE_ADDR_MASK;
- sp = page_header(root);
- mmu_sync_children(vcpu, sp);
- }
- }
-}
-
-void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
-{
- spin_lock(&vcpu->kvm->mmu_lock);
- mmu_sync_roots(vcpu);
- spin_unlock(&vcpu->kvm->mmu_lock);
-}
-
-static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
-{
- return vaddr;
-}
-
-static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
- u32 error_code)
-{
- gfn_t gfn;
- int r;
-
- pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
- r = mmu_topup_memory_caches(vcpu);
- if (r)
- return r;
-
- ASSERT(vcpu);
- ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
-
- gfn = gva >> PAGE_SHIFT;
-
- return nonpaging_map(vcpu, gva & PAGE_MASK,
- error_code & PFERR_WRITE_MASK, gfn);
-}
-
-static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
- u32 error_code)
-{
- pfn_t pfn;
- int r;
- int largepage = 0;
- gfn_t gfn = gpa >> PAGE_SHIFT;
- unsigned long mmu_seq;
-
- ASSERT(vcpu);
- ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
-
- r = mmu_topup_memory_caches(vcpu);
- if (r)
- return r;
-
- if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
- gfn &= ~(KVM_PAGES_PER_HPAGE-1);
- largepage = 1;
- }
- mmu_seq = vcpu->kvm->mmu_notifier_seq;
- smp_rmb();
- pfn = gfn_to_pfn(vcpu->kvm, gfn);
- if (is_error_pfn(pfn)) {
- kvm_release_pfn_clean(pfn);
- return 1;
- }
- spin_lock(&vcpu->kvm->mmu_lock);
- if (mmu_notifier_retry(vcpu, mmu_seq))
- goto out_unlock;
- kvm_mmu_free_some_pages(vcpu);
- r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
- largepage, gfn, pfn);
- spin_unlock(&vcpu->kvm->mmu_lock);
-
- return r;
-
-out_unlock:
- spin_unlock(&vcpu->kvm->mmu_lock);
- kvm_release_pfn_clean(pfn);
- return 0;
-}
-
-static void nonpaging_free(struct kvm_vcpu *vcpu)
-{
- mmu_free_roots(vcpu);
-}
-
-static int nonpaging_init_context(struct kvm_vcpu *vcpu)
-{
- struct kvm_mmu *context = &vcpu->arch.mmu;
-
- context->new_cr3 = nonpaging_new_cr3;
- context->page_fault = nonpaging_page_fault;
- context->gva_to_gpa = nonpaging_gva_to_gpa;
- context->free = nonpaging_free;
- context->prefetch_page = nonpaging_prefetch_page;
- context->sync_page = nonpaging_sync_page;
- context->invlpg = nonpaging_invlpg;
- context->root_level = 0;
- context->shadow_root_level = PT32E_ROOT_LEVEL;
- context->root_hpa = INVALID_PAGE;
- return 0;
-}
-
-void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
-{
- ++vcpu->stat.tlb_flush;
- kvm_x86_ops->tlb_flush(vcpu);
-}
-
-static void paging_new_cr3(struct kvm_vcpu *vcpu)
-{
- pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3);
- mmu_free_roots(vcpu);
-}
-
-static void inject_page_fault(struct kvm_vcpu *vcpu,
- u64 addr,
- u32 err_code)
-{
- kvm_inject_page_fault(vcpu, addr, err_code);
-}
-
-static void paging_free(struct kvm_vcpu *vcpu)
-{
- nonpaging_free(vcpu);
-}
-
-static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level)
-{
- int bit7;
-
- bit7 = (gpte >> 7) & 1;
- return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0;
-}
-
-#define PTTYPE 64
-#include "paging_tmpl.h"
-#undef PTTYPE
-
-#define PTTYPE 32
-#include "paging_tmpl.h"
-#undef PTTYPE
-
-static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
-{
- struct kvm_mmu *context = &vcpu->arch.mmu;
- int maxphyaddr = cpuid_maxphyaddr(vcpu);
- u64 exb_bit_rsvd = 0;
-
- if (!is_nx(vcpu))
- exb_bit_rsvd = rsvd_bits(63, 63);
- switch (level) {
- case PT32_ROOT_LEVEL:
- /* no rsvd bits for 2 level 4K page table entries */
- context->rsvd_bits_mask[0][1] = 0;
- context->rsvd_bits_mask[0][0] = 0;
- if (is_cpuid_PSE36())
- /* 36bits PSE 4MB page */
- context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
- else
- /* 32 bits PSE 4MB page */
- context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
- context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
- break;
- case PT32E_ROOT_LEVEL:
- context->rsvd_bits_mask[0][2] =
- rsvd_bits(maxphyaddr, 63) |
- rsvd_bits(7, 8) | rsvd_bits(1, 2); /* PDPTE */
- context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 62); /* PDE */
- context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 62); /* PTE */
- context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 62) |
- rsvd_bits(13, 20); /* large page */
- context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
- break;
- case PT64_ROOT_LEVEL:
- context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
- context->rsvd_bits_mask[0][2] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
- context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 51);
- context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 51);
- context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
- context->rsvd_bits_mask[1][2] = context->rsvd_bits_mask[0][2];
- context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 51) |
- rsvd_bits(13, 20); /* large page */
- context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
- break;
- }
-}
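
reset_rsvds_bits_mask() composes its masks from rsvd_bits(lo, hi), whose definition lies outside this hunk; the semantics the code above depends on is simply "all bits from lo through hi inclusive". A hedged stand-in illustrating that behaviour (the helper name and exact implementation here are assumptions, not the kernel's code):

#include <stdint.h>
#include <stdio.h>

/* All bits from position s to position e, inclusive (assumed semantics). */
static uint64_t rsvd_bits_example(int s, int e)
{
	return (((uint64_t)1 << (e - s + 1)) - 1) << s;
}

int main(void)
{
	/* rsvd_bits(13, 20): the large-page reserved bits used above -> 0x1fe000 */
	printf("0x%llx\n", (unsigned long long)rsvd_bits_example(13, 20));
	/* rsvd_bits(63, 63): the NX bit reserved when the guest lacks NX */
	printf("0x%llx\n", (unsigned long long)rsvd_bits_example(63, 63));
	return 0;
}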
-
-static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
-{
- struct kvm_mmu *context = &vcpu->arch.mmu;
-
- ASSERT(is_pae(vcpu));
- context->new_cr3 = paging_new_cr3;
- context->page_fault = paging64_page_fault;
- context->gva_to_gpa = paging64_gva_to_gpa;
- context->prefetch_page = paging64_prefetch_page;
- context->sync_page = paging64_sync_page;
- context->invlpg = paging64_invlpg;
- context->free = paging_free;
- context->root_level = level;
- context->shadow_root_level = level;
- context->root_hpa = INVALID_PAGE;
- return 0;
-}
-
-static int paging64_init_context(struct kvm_vcpu *vcpu)
-{
- reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
- return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
-}
-
-static int paging32_init_context(struct kvm_vcpu *vcpu)
-{
- struct kvm_mmu *context = &vcpu->arch.mmu;
-
- reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
- context->new_cr3 = paging_new_cr3;
- context->page_fault = paging32_page_fault;
- context->gva_to_gpa = paging32_gva_to_gpa;
- context->free = paging_free;
- context->prefetch_page = paging32_prefetch_page;
- context->sync_page = paging32_sync_page;
- context->invlpg = paging32_invlpg;
- context->root_level = PT32_ROOT_LEVEL;
- context->shadow_root_level = PT32E_ROOT_LEVEL;
- context->root_hpa = INVALID_PAGE;
- return 0;
-}
-
-static int paging32E_init_context(struct kvm_vcpu *vcpu)
-{
- reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
- return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
-}
-
-static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
-{
- struct kvm_mmu *context = &vcpu->arch.mmu;
-
- context->new_cr3 = nonpaging_new_cr3;
- context->page_fault = tdp_page_fault;
- context->free = nonpaging_free;
- context->prefetch_page = nonpaging_prefetch_page;
- context->sync_page = nonpaging_sync_page;
- context->invlpg = nonpaging_invlpg;
- context->shadow_root_level = kvm_x86_ops->get_tdp_level();
- context->root_hpa = INVALID_PAGE;
-
- if (!is_paging(vcpu)) {
- context->gva_to_gpa = nonpaging_gva_to_gpa;
- context->root_level = 0;
- } else if (is_long_mode(vcpu)) {
- reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
- context->gva_to_gpa = paging64_gva_to_gpa;
- context->root_level = PT64_ROOT_LEVEL;
- } else if (is_pae(vcpu)) {
- reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
- context->gva_to_gpa = paging64_gva_to_gpa;
- context->root_level = PT32E_ROOT_LEVEL;
- } else {
- reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
- context->gva_to_gpa = paging32_gva_to_gpa;
- context->root_level = PT32_ROOT_LEVEL;
- }
-
- return 0;
-}
-
-static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
-{
- int r;
-
- ASSERT(vcpu);
- ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
-
- if (!is_paging(vcpu))
- r = nonpaging_init_context(vcpu);
- else if (is_long_mode(vcpu))
- r = paging64_init_context(vcpu);
- else if (is_pae(vcpu))
- r = paging32E_init_context(vcpu);
- else
- r = paging32_init_context(vcpu);
-
- vcpu->arch.mmu.base_role.glevels = vcpu->arch.mmu.root_level;
-
- return r;
-}
-
-static int init_kvm_mmu(struct kvm_vcpu *vcpu)
-{
- vcpu->arch.update_pte.pfn = bad_pfn;
-
- if (tdp_enabled)
- return init_kvm_tdp_mmu(vcpu);
- else
- return init_kvm_softmmu(vcpu);
-}
-
-static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
-{
- ASSERT(vcpu);
- if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
- vcpu->arch.mmu.free(vcpu);
- vcpu->arch.mmu.root_hpa = INVALID_PAGE;
- }
-}
-
-int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
-{
- destroy_kvm_mmu(vcpu);
- return init_kvm_mmu(vcpu);
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
-
-int kvm_mmu_load(struct kvm_vcpu *vcpu)
-{
- int r;
-
- r = mmu_topup_memory_caches(vcpu);
- if (r)
- goto out;
- spin_lock(&vcpu->kvm->mmu_lock);
- kvm_mmu_free_some_pages(vcpu);
- r = mmu_alloc_roots(vcpu);
- mmu_sync_roots(vcpu);
- spin_unlock(&vcpu->kvm->mmu_lock);
- if (r)
- goto out;
- kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
- kvm_mmu_flush_tlb(vcpu);
-out:
- return r;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_load);
-
-void kvm_mmu_unload(struct kvm_vcpu *vcpu)
-{
- mmu_free_roots(vcpu);
-}
-
-static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp,
- u64 *spte)
-{
- u64 pte;
- struct kvm_mmu_page *child;
-
- pte = *spte;
- if (is_shadow_present_pte(pte)) {
- if (sp->role.level == PT_PAGE_TABLE_LEVEL ||
- is_large_pte(pte))
- rmap_remove(vcpu->kvm, spte);
- else {
- child = page_header(pte & PT64_BASE_ADDR_MASK);
- mmu_page_remove_parent_pte(child, spte);
- }
- }
- set_shadow_pte(spte, shadow_trap_nonpresent_pte);
- if (is_large_pte(pte))
- --vcpu->kvm->stat.lpages;
-}
-
-static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp,
- u64 *spte,
- const void *new)
-{
- if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
- if (!vcpu->arch.update_pte.largepage ||
- sp->role.glevels == PT32_ROOT_LEVEL) {
- ++vcpu->kvm->stat.mmu_pde_zapped;
- return;
- }
- }
-
- ++vcpu->kvm->stat.mmu_pte_updated;
- if (sp->role.glevels == PT32_ROOT_LEVEL)
- paging32_update_pte(vcpu, sp, spte, new);
- else
- paging64_update_pte(vcpu, sp, spte, new);
-}
-
-static bool need_remote_flush(u64 old, u64 new)
-{
- if (!is_shadow_present_pte(old))
- return false;
- if (!is_shadow_present_pte(new))
- return true;
- if ((old ^ new) & PT64_BASE_ADDR_MASK)
- return true;
- old ^= PT64_NX_MASK;
- new ^= PT64_NX_MASK;
- return (old & ~new & PT64_PERM_MASK) != 0;
-}
-
-static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new)
-{
- if (need_remote_flush(old, new))
- kvm_flush_remote_tlbs(vcpu->kvm);
- else
- kvm_mmu_flush_tlb(vcpu);
-}
-
-static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
-{
- u64 *spte = vcpu->arch.last_pte_updated;
-
- return !!(spte && (*spte & shadow_accessed_mask));
-}
-
-static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
- const u8 *new, int bytes)
-{
- gfn_t gfn;
- int r;
- u64 gpte = 0;
- pfn_t pfn;
-
- vcpu->arch.update_pte.largepage = 0;
-
- if (bytes != 4 && bytes != 8)
- return;
-
-	/*
-	 * Assume that the pte write is on a page table of the same type
-	 * as the current vcpu paging mode.  This is nearly always true
-	 * (it might be false while changing modes).  Note it is verified later
-	 * by update_pte().
-	 */
- if (is_pae(vcpu)) {
- /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
- if ((bytes == 4) && (gpa % 4 == 0)) {
- r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8);
- if (r)
- return;
- memcpy((void *)&gpte + (gpa % 8), new, 4);
- } else if ((bytes == 8) && (gpa % 8 == 0)) {
- memcpy((void *)&gpte, new, 8);
- }
- } else {
- if ((bytes == 4) && (gpa % 4 == 0))
- memcpy((void *)&gpte, new, 4);
- }
- if (!is_present_pte(gpte))
- return;
- gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
-
- if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
- gfn &= ~(KVM_PAGES_PER_HPAGE-1);
- vcpu->arch.update_pte.largepage = 1;
- }
- vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
- smp_rmb();
- pfn = gfn_to_pfn(vcpu->kvm, gfn);
-
- if (is_error_pfn(pfn)) {
- kvm_release_pfn_clean(pfn);
- return;
- }
- vcpu->arch.update_pte.gfn = gfn;
- vcpu->arch.update_pte.pfn = pfn;
-}
-
-static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
- u64 *spte = vcpu->arch.last_pte_updated;
-
- if (spte
- && vcpu->arch.last_pte_gfn == gfn
- && shadow_accessed_mask
- && !(*spte & shadow_accessed_mask)
- && is_shadow_present_pte(*spte))
- set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
-}
-
-void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
- const u8 *new, int bytes,
- bool guest_initiated)
-{
- gfn_t gfn = gpa >> PAGE_SHIFT;
- struct kvm_mmu_page *sp;
- struct hlist_node *node, *n;
- struct hlist_head *bucket;
- unsigned index;
- u64 entry, gentry;
- u64 *spte;
- unsigned offset = offset_in_page(gpa);
- unsigned pte_size;
- unsigned page_offset;
- unsigned misaligned;
- unsigned quadrant;
- int level;
- int flooded = 0;
- int npte;
- int r;
-
- pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
- mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
- spin_lock(&vcpu->kvm->mmu_lock);
- kvm_mmu_access_page(vcpu, gfn);
- kvm_mmu_free_some_pages(vcpu);
- ++vcpu->kvm->stat.mmu_pte_write;
- kvm_mmu_audit(vcpu, "pre pte write");
- if (guest_initiated) {
- if (gfn == vcpu->arch.last_pt_write_gfn
- && !last_updated_pte_accessed(vcpu)) {
- ++vcpu->arch.last_pt_write_count;
- if (vcpu->arch.last_pt_write_count >= 3)
- flooded = 1;
- } else {
- vcpu->arch.last_pt_write_gfn = gfn;
- vcpu->arch.last_pt_write_count = 1;
- vcpu->arch.last_pte_updated = NULL;
- }
- }
- index = kvm_page_table_hashfn(gfn);
- bucket = &vcpu->kvm->arch.mmu_page_hash[index];
- hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
- if (sp->gfn != gfn || sp->role.direct || sp->role.invalid)
- continue;
- pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
- misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
- misaligned |= bytes < 4;
- if (misaligned || flooded) {
- /*
- * Misaligned accesses are too much trouble to fix
- * up; also, they usually indicate a page is not used
- * as a page table.
- *
- * If we're seeing too many writes to a page,
- * it may no longer be a page table, or we may be
- * forking, in which case it is better to unmap the
- * page.
- */
- pgprintk("misaligned: gpa %llx bytes %d role %x\n",
- gpa, bytes, sp->role.word);
- if (kvm_mmu_zap_page(vcpu->kvm, sp))
- n = bucket->first;
- ++vcpu->kvm->stat.mmu_flooded;
- continue;
- }
- page_offset = offset;
- level = sp->role.level;
- npte = 1;
- if (sp->role.glevels == PT32_ROOT_LEVEL) {
- page_offset <<= 1; /* 32->64 */
- /*
- * A 32-bit pde maps 4MB while the shadow pdes map
- * only 2MB. So we need to double the offset again
- * and zap two pdes instead of one.
- */
- if (level == PT32_ROOT_LEVEL) {
- page_offset &= ~7; /* kill rounding error */
- page_offset <<= 1;
- npte = 2;
- }
- quadrant = page_offset >> PAGE_SHIFT;
- page_offset &= ~PAGE_MASK;
- if (quadrant != sp->role.quadrant)
- continue;
- }
- spte = &sp->spt[page_offset / sizeof(*spte)];
- if ((gpa & (pte_size - 1)) || (bytes < pte_size)) {
- gentry = 0;
- r = kvm_read_guest_atomic(vcpu->kvm,
- gpa & ~(u64)(pte_size - 1),
- &gentry, pte_size);
- new = (const void *)&gentry;
- if (r < 0)
- new = NULL;
- }
- while (npte--) {
- entry = *spte;
- mmu_pte_write_zap_pte(vcpu, sp, spte);
- if (new)
- mmu_pte_write_new_pte(vcpu, sp, spte, new);
- mmu_pte_write_flush_tlb(vcpu, entry, *spte);
- ++spte;
- }
- }
- kvm_mmu_audit(vcpu, "post pte write");
- spin_unlock(&vcpu->kvm->mmu_lock);
- if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
- kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
- vcpu->arch.update_pte.pfn = bad_pfn;
- }
-}
-
-int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
-{
- gpa_t gpa;
- int r;
-
- gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
-
- spin_lock(&vcpu->kvm->mmu_lock);
- r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
- spin_unlock(&vcpu->kvm->mmu_lock);
- return r;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
-
-void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
-{
- while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) {
- struct kvm_mmu_page *sp;
-
- sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
- struct kvm_mmu_page, link);
- kvm_mmu_zap_page(vcpu->kvm, sp);
- ++vcpu->kvm->stat.mmu_recycled;
- }
-}
-
-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
-{
- int r;
- enum emulation_result er;
-
- r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
- if (r < 0)
- goto out;
-
- if (!r) {
- r = 1;
- goto out;
- }
-
- r = mmu_topup_memory_caches(vcpu);
- if (r)
- goto out;
-
- er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0);
-
- switch (er) {
- case EMULATE_DONE:
- return 1;
- case EMULATE_DO_MMIO:
- ++vcpu->stat.mmio_exits;
- return 0;
- case EMULATE_FAIL:
- kvm_report_emulation_failure(vcpu, "pagetable");
- return 1;
- default:
- BUG();
- }
-out:
- return r;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
-
-void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
-{
- vcpu->arch.mmu.invlpg(vcpu, gva);
- kvm_mmu_flush_tlb(vcpu);
- ++vcpu->stat.invlpg;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
-
-void kvm_enable_tdp(void)
-{
- tdp_enabled = true;
-}
-EXPORT_SYMBOL_GPL(kvm_enable_tdp);
-
-void kvm_disable_tdp(void)
-{
- tdp_enabled = false;
-}
-EXPORT_SYMBOL_GPL(kvm_disable_tdp);
-
-static void free_mmu_pages(struct kvm_vcpu *vcpu)
-{
- free_page((unsigned long)vcpu->arch.mmu.pae_root);
-}
-
-static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
-{
- struct page *page;
- int i;
-
- ASSERT(vcpu);
-
- if (vcpu->kvm->arch.n_requested_mmu_pages)
- vcpu->kvm->arch.n_free_mmu_pages =
- vcpu->kvm->arch.n_requested_mmu_pages;
- else
- vcpu->kvm->arch.n_free_mmu_pages =
- vcpu->kvm->arch.n_alloc_mmu_pages;
- /*
- * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
- * Therefore we need to allocate shadow page tables in the first
- * 4GB of memory, which happens to fit the DMA32 zone.
- */
- page = alloc_page(GFP_KERNEL | __GFP_DMA32);
- if (!page)
- goto error_1;
- vcpu->arch.mmu.pae_root = page_address(page);
- for (i = 0; i < 4; ++i)
- vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
-
- return 0;
-
-error_1:
- free_mmu_pages(vcpu);
- return -ENOMEM;
-}
-
-int kvm_mmu_create(struct kvm_vcpu *vcpu)
-{
- ASSERT(vcpu);
- ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
-
- return alloc_mmu_pages(vcpu);
-}
-
-int kvm_mmu_setup(struct kvm_vcpu *vcpu)
-{
- ASSERT(vcpu);
- ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
-
- return init_kvm_mmu(vcpu);
-}
-
-void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
-{
- ASSERT(vcpu);
-
- destroy_kvm_mmu(vcpu);
- free_mmu_pages(vcpu);
- mmu_free_memory_caches(vcpu);
-}
-
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
-{
- struct kvm_mmu_page *sp;
-
- list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
- int i;
- u64 *pt;
-
- if (!test_bit(slot, sp->slot_bitmap))
- continue;
-
- pt = sp->spt;
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
- /* avoid RMW */
- if (pt[i] & PT_WRITABLE_MASK)
- pt[i] &= ~PT_WRITABLE_MASK;
- }
- kvm_flush_remote_tlbs(kvm);
-}
-
-void kvm_mmu_zap_all(struct kvm *kvm)
-{
- struct kvm_mmu_page *sp, *node;
-
- spin_lock(&kvm->mmu_lock);
- list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
- if (kvm_mmu_zap_page(kvm, sp))
- node = container_of(kvm->arch.active_mmu_pages.next,
- struct kvm_mmu_page, link);
- spin_unlock(&kvm->mmu_lock);
-
- kvm_flush_remote_tlbs(kvm);
-}
-
-static void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm)
-{
- struct kvm_mmu_page *page;
-
- page = container_of(kvm->arch.active_mmu_pages.prev,
- struct kvm_mmu_page, link);
- kvm_mmu_zap_page(kvm, page);
-}
-
-static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
-{
- struct kvm *kvm;
- struct kvm *kvm_freed = NULL;
- int cache_count = 0;
-
- spin_lock(&kvm_lock);
-
- list_for_each_entry(kvm, &vm_list, vm_list) {
- int npages;
-
- if (!down_read_trylock(&kvm->slots_lock))
- continue;
- spin_lock(&kvm->mmu_lock);
- npages = kvm->arch.n_alloc_mmu_pages -
- kvm->arch.n_free_mmu_pages;
- cache_count += npages;
- if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
- kvm_mmu_remove_one_alloc_mmu_page(kvm);
- cache_count--;
- kvm_freed = kvm;
- }
- nr_to_scan--;
-
- spin_unlock(&kvm->mmu_lock);
- up_read(&kvm->slots_lock);
- }
- if (kvm_freed)
- list_move_tail(&kvm_freed->vm_list, &vm_list);
-
- spin_unlock(&kvm_lock);
-
- return cache_count;
-}
-
-static struct shrinker mmu_shrinker = {
- .shrink = mmu_shrink,
- .seeks = DEFAULT_SEEKS * 10,
-};
-
-static void mmu_destroy_caches(void)
-{
- if (pte_chain_cache)
- kmem_cache_destroy(pte_chain_cache);
- if (rmap_desc_cache)
- kmem_cache_destroy(rmap_desc_cache);
- if (mmu_page_header_cache)
- kmem_cache_destroy(mmu_page_header_cache);
-}
-
-void kvm_mmu_module_exit(void)
-{
- mmu_destroy_caches();
- unregister_shrinker(&mmu_shrinker);
-}
-
-int kvm_mmu_module_init(void)
-{
- pte_chain_cache = kmem_cache_create("kvm_pte_chain",
- sizeof(struct kvm_pte_chain),
- 0, 0, NULL);
- if (!pte_chain_cache)
- goto nomem;
- rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
- sizeof(struct kvm_rmap_desc),
- 0, 0, NULL);
- if (!rmap_desc_cache)
- goto nomem;
-
- mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
- sizeof(struct kvm_mmu_page),
- 0, 0, NULL);
- if (!mmu_page_header_cache)
- goto nomem;
-
- register_shrinker(&mmu_shrinker);
-
- return 0;
-
-nomem:
- mmu_destroy_caches();
- return -ENOMEM;
-}
-
-/*
- * Calculate mmu pages needed for kvm.
- */
-unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
-{
- int i;
- unsigned int nr_mmu_pages;
- unsigned int nr_pages = 0;
-
- for (i = 0; i < kvm->nmemslots; i++)
- nr_pages += kvm->memslots[i].npages;
-
- nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
- nr_mmu_pages = max(nr_mmu_pages,
- (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
-
- return nr_mmu_pages;
-}
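
kvm_mmu_calculate_mmu_pages() sizes the shadow-page pool as a per-mille fraction of guest memory, floored at KVM_MIN_ALLOC_MMU_PAGES. A quick standalone illustration of the arithmetic; the two constants below are example values for the sketch, not the real kernel definitions:

#include <stdio.h>

/* Example values only; the real constants live in the KVM headers. */
#define EXAMPLE_PERMILLE_MMU_PAGES	20
#define EXAMPLE_MIN_ALLOC_MMU_PAGES	64

static unsigned int calc_mmu_pages(unsigned long guest_pages)
{
	unsigned long nr = guest_pages * EXAMPLE_PERMILLE_MMU_PAGES / 1000;

	if (nr < EXAMPLE_MIN_ALLOC_MMU_PAGES)
		nr = EXAMPLE_MIN_ALLOC_MMU_PAGES;
	return (unsigned int)nr;
}

int main(void)
{
	/* 1GB guest with 4KB pages: 262144 pages -> 5242 shadow pages. */
	printf("%u\n", calc_mmu_pages(262144));
	/* Tiny guest: clamped to the example minimum of 64. */
	printf("%u\n", calc_mmu_pages(1024));
	return 0;
}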
-
-static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer,
- unsigned len)
-{
- if (len > buffer->len)
- return NULL;
- return buffer->ptr;
-}
-
-static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer,
- unsigned len)
-{
- void *ret;
-
- ret = pv_mmu_peek_buffer(buffer, len);
- if (!ret)
- return ret;
- buffer->ptr += len;
- buffer->len -= len;
- buffer->processed += len;
- return ret;
-}
-
-static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
- gpa_t addr, gpa_t value)
-{
- int bytes = 8;
- int r;
-
- if (!is_long_mode(vcpu) && !is_pae(vcpu))
- bytes = 4;
-
- r = mmu_topup_memory_caches(vcpu);
- if (r)
- return r;
-
- if (!emulator_write_phys(vcpu, addr, &value, bytes))
- return -EFAULT;
-
- return 1;
-}
-
-static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
-{
- kvm_set_cr3(vcpu, vcpu->arch.cr3);
- return 1;
-}
-
-static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr)
-{
- spin_lock(&vcpu->kvm->mmu_lock);
- mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT);
- spin_unlock(&vcpu->kvm->mmu_lock);
- return 1;
-}
-
-static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu,
- struct kvm_pv_mmu_op_buffer *buffer)
-{
- struct kvm_mmu_op_header *header;
-
- header = pv_mmu_peek_buffer(buffer, sizeof *header);
- if (!header)
- return 0;
- switch (header->op) {
- case KVM_MMU_OP_WRITE_PTE: {
- struct kvm_mmu_op_write_pte *wpte;
-
- wpte = pv_mmu_read_buffer(buffer, sizeof *wpte);
- if (!wpte)
- return 0;
- return kvm_pv_mmu_write(vcpu, wpte->pte_phys,
- wpte->pte_val);
- }
- case KVM_MMU_OP_FLUSH_TLB: {
- struct kvm_mmu_op_flush_tlb *ftlb;
-
- ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb);
- if (!ftlb)
- return 0;
- return kvm_pv_mmu_flush_tlb(vcpu);
- }
- case KVM_MMU_OP_RELEASE_PT: {
- struct kvm_mmu_op_release_pt *rpt;
-
- rpt = pv_mmu_read_buffer(buffer, sizeof *rpt);
- if (!rpt)
- return 0;
- return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys);
- }
- default: return 0;
- }
-}
-
-int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
- gpa_t addr, unsigned long *ret)
-{
- int r;
- struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer;
-
- buffer->ptr = buffer->buf;
- buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf);
- buffer->processed = 0;
-
- r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len);
- if (r)
- goto out;
-
- while (buffer->len) {
- r = kvm_pv_mmu_op_one(vcpu, buffer);
- if (r < 0)
- goto out;
- if (r == 0)
- break;
- }
-
- r = 1;
-out:
- *ret = buffer->processed;
- return r;
-}
-
-#ifdef AUDIT
-
-static const char *audit_msg;
-
-static gva_t canonicalize(gva_t gva)
-{
-#ifdef CONFIG_X86_64
- gva = (long long)(gva << 16) >> 16;
-#endif
- return gva;
-}
-
-static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
- gva_t va, int level)
-{
- u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
- int i;
- gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
- u64 ent = pt[i];
-
- if (ent == shadow_trap_nonpresent_pte)
- continue;
-
- va = canonicalize(va);
- if (level > 1) {
- if (ent == shadow_notrap_nonpresent_pte)
- printk(KERN_ERR "audit: (%s) nontrapping pte"
- " in nonleaf level: levels %d gva %lx"
- " level %d pte %llx\n", audit_msg,
- vcpu->arch.mmu.root_level, va, level, ent);
- else
- audit_mappings_page(vcpu, ent, va, level - 1);
- } else {
- gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
- gfn_t gfn = gpa >> PAGE_SHIFT;
- pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
- hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
-
- if (is_shadow_present_pte(ent)
- && (ent & PT64_BASE_ADDR_MASK) != hpa)
- printk(KERN_ERR "xx audit error: (%s) levels %d"
- " gva %lx gpa %llx hpa %llx ent %llx %d\n",
- audit_msg, vcpu->arch.mmu.root_level,
- va, gpa, hpa, ent,
- is_shadow_present_pte(ent));
- else if (ent == shadow_notrap_nonpresent_pte
- && !is_error_hpa(hpa))
- printk(KERN_ERR "audit: (%s) notrap shadow,"
- " valid guest gva %lx\n", audit_msg, va);
- kvm_release_pfn_clean(pfn);
-
- }
- }
-}
-
-static void audit_mappings(struct kvm_vcpu *vcpu)
-{
- unsigned i;
-
- if (vcpu->arch.mmu.root_level == 4)
- audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
- else
- for (i = 0; i < 4; ++i)
- if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
- audit_mappings_page(vcpu,
- vcpu->arch.mmu.pae_root[i],
- i << 30,
- 2);
-}
-
-static int count_rmaps(struct kvm_vcpu *vcpu)
-{
- int nmaps = 0;
- int i, j, k;
-
- for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
- struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
- struct kvm_rmap_desc *d;
-
- for (j = 0; j < m->npages; ++j) {
- unsigned long *rmapp = &m->rmap[j];
-
- if (!*rmapp)
- continue;
- if (!(*rmapp & 1)) {
- ++nmaps;
- continue;
- }
- d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
- while (d) {
- for (k = 0; k < RMAP_EXT; ++k)
- if (d->shadow_ptes[k])
- ++nmaps;
- else
- break;
- d = d->more;
- }
- }
- }
- return nmaps;
-}
-
-static int count_writable_mappings(struct kvm_vcpu *vcpu)
-{
- int nmaps = 0;
- struct kvm_mmu_page *sp;
- int i;
-
- list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
- u64 *pt = sp->spt;
-
- if (sp->role.level != PT_PAGE_TABLE_LEVEL)
- continue;
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- u64 ent = pt[i];
-
- if (!(ent & PT_PRESENT_MASK))
- continue;
- if (!(ent & PT_WRITABLE_MASK))
- continue;
- ++nmaps;
- }
- }
- return nmaps;
-}
-
-static void audit_rmap(struct kvm_vcpu *vcpu)
-{
- int n_rmap = count_rmaps(vcpu);
- int n_actual = count_writable_mappings(vcpu);
-
- if (n_rmap != n_actual)
- printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
- __func__, audit_msg, n_rmap, n_actual);
-}
-
-static void audit_write_protection(struct kvm_vcpu *vcpu)
-{
- struct kvm_mmu_page *sp;
- struct kvm_memory_slot *slot;
- unsigned long *rmapp;
- gfn_t gfn;
-
- list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
- if (sp->role.direct)
- continue;
-
- gfn = unalias_gfn(vcpu->kvm, sp->gfn);
- slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn);
- rmapp = &slot->rmap[gfn - slot->base_gfn];
- if (*rmapp)
- printk(KERN_ERR "%s: (%s) shadow page has writable"
- " mappings: gfn %lx role %x\n",
- __func__, audit_msg, sp->gfn,
- sp->role.word);
- }
-}
-
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
-{
- int olddbg = dbg;
-
- dbg = 0;
- audit_msg = msg;
- audit_rmap(vcpu);
- audit_write_protection(vcpu);
- audit_mappings(vcpu);
- dbg = olddbg;
-}
-
-#endif
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 3494a2fb136e..830f46145692 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -1,24 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __KVM_X86_MMU_H
#define __KVM_X86_MMU_H
#include <linux/kvm_host.h>
+#include "kvm_cache_regs.h"
+#include "x86.h"
+#include "cpuid.h"
-#define PT64_PT_BITS 9
-#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
-#define PT32_PT_BITS 10
-#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
+extern bool __read_mostly enable_mmio_caching;
#define PT_WRITABLE_SHIFT 1
+#define PT_USER_SHIFT 2
#define PT_PRESENT_MASK (1ULL << 0)
#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
-#define PT_USER_MASK (1ULL << 2)
+#define PT_USER_MASK (1ULL << PT_USER_SHIFT)
#define PT_PWT_MASK (1ULL << 3)
#define PT_PCD_MASK (1ULL << 4)
#define PT_ACCESSED_SHIFT 5
#define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT)
-#define PT_DIRTY_MASK (1ULL << 6)
-#define PT_PAGE_SIZE_MASK (1ULL << 7)
+#define PT_DIRTY_SHIFT 6
+#define PT_DIRTY_MASK (1ULL << PT_DIRTY_SHIFT)
+#define PT_PAGE_SIZE_SHIFT 7
+#define PT_PAGE_SIZE_MASK (1ULL << PT_PAGE_SIZE_SHIFT)
#define PT_PAT_MASK (1ULL << 7)
#define PT_GLOBAL_MASK (1ULL << 8)
#define PT64_NX_SHIFT 63
@@ -28,56 +32,294 @@
#define PT_DIR_PAT_SHIFT 12
#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
-#define PT32_DIR_PSE36_SIZE 4
-#define PT32_DIR_PSE36_SHIFT 13
-#define PT32_DIR_PSE36_MASK \
- (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
-
-#define PT64_ROOT_LEVEL 4
+#define PT64_ROOT_5LEVEL 5
+#define PT64_ROOT_4LEVEL 4
#define PT32_ROOT_LEVEL 2
#define PT32E_ROOT_LEVEL 3
-static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
+#define KVM_MMU_CR4_ROLE_BITS (X86_CR4_PSE | X86_CR4_PAE | X86_CR4_LA57 | \
+ X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE)
+
+#define KVM_MMU_CR0_ROLE_BITS (X86_CR0_PG | X86_CR0_WP)
+#define KVM_MMU_EFER_ROLE_BITS (EFER_LME | EFER_NX)
+
+static __always_inline u64 rsvd_bits(int s, int e)
+{
+ BUILD_BUG_ON(__builtin_constant_p(e) && __builtin_constant_p(s) && e < s);
+
+ if (__builtin_constant_p(e))
+ BUILD_BUG_ON(e > 63);
+ else
+ e &= 63;
+
+ if (e < s)
+ return 0;
+
+ return ((2ULL << (e - s)) - 1) << s;
+}
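+
+/*
+ * Illustrative example: rsvd_bits(52, 62) evaluates to
+ * ((2ULL << (62 - 52)) - 1) << 52, i.e. a mask covering bits 52..62
+ * inclusive.
+ */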
+
+static inline gfn_t kvm_mmu_max_gfn(void)
{
- if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
- __kvm_mmu_free_some_pages(vcpu);
+ /*
+ * Note that this uses the host MAXPHYADDR, not the guest's.
+ * EPT/NPT cannot support GPAs that would exceed host.MAXPHYADDR;
+ * assuming KVM is running on bare metal, guest accesses beyond
+ * host.MAXPHYADDR will hit a #PF(RSVD) and never cause a vmexit
+ * (either EPT Violation/Misconfig or #NPF), and so KVM will never
+ * install a SPTE for such addresses. If KVM is running as a VM
+ * itself, on the other hand, it might see a MAXPHYADDR that is less
+ * than hardware's real MAXPHYADDR. Using the host MAXPHYADDR
+ * disallows such SPTEs entirely and simplifies the TDP MMU.
+ */
+ int max_gpa_bits = likely(tdp_enabled) ? kvm_host.maxphyaddr : 52;
+
+ return (1ULL << (max_gpa_bits - PAGE_SHIFT)) - 1;
}
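+
+/*
+ * Illustrative example: with tdp_enabled and a host MAXPHYADDR of, say, 46,
+ * kvm_mmu_max_gfn() returns (1ULL << (46 - 12)) - 1, i.e. the last GFN below
+ * the 64 TiB guest-physical boundary.
+ */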
+u8 kvm_mmu_get_max_tdp_level(void);
+
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
+void kvm_mmu_set_mmio_spte_value(struct kvm *kvm, u64 mmio_value);
+void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask);
+void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only);
+
+void kvm_init_mmu(struct kvm_vcpu *vcpu);
+void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
+ unsigned long cr4, u64 efer, gpa_t nested_cr3);
+void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
+ int huge_page_level, bool accessed_dirty,
+ gpa_t new_eptp);
+bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
+int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
+ u64 fault_address, char *insn, int insn_len);
+void __kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu);
+
+int kvm_mmu_load(struct kvm_vcpu *vcpu);
+void kvm_mmu_unload(struct kvm_vcpu *vcpu);
+void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu);
+void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
+void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu);
+void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
+ int bytes);
+
static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
{
- if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE))
+ if (kvm_check_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
+ kvm_mmu_free_obsolete_roots(vcpu);
+
+ /*
+ * Checking root.hpa is sufficient even when KVM has a mirror root.
+ * We can have either:
+ * (1) mirror_root_hpa = INVALID_PAGE, root.hpa = INVALID_PAGE
+ * (2) mirror_root_hpa = root, root.hpa = INVALID_PAGE
+ * (3) mirror_root_hpa = root1, root.hpa = root2
+ * We don't ever have:
+ * mirror_root_hpa = INVALID_PAGE, root.hpa = root
+ */
+ if (likely(vcpu->arch.mmu->root.hpa != INVALID_PAGE))
return 0;
return kvm_mmu_load(vcpu);
}
-static inline int is_long_mode(struct kvm_vcpu *vcpu)
+static inline unsigned long kvm_get_pcid(struct kvm_vcpu *vcpu, gpa_t cr3)
+{
+ BUILD_BUG_ON((X86_CR3_PCID_MASK & PAGE_MASK) != 0);
+
+ return kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)
+ ? cr3 & X86_CR3_PCID_MASK
+ : 0;
+}
+
+static inline unsigned long kvm_get_active_pcid(struct kvm_vcpu *vcpu)
+{
+ return kvm_get_pcid(vcpu, kvm_read_cr3(vcpu));
+}
+
+static inline unsigned long kvm_get_active_cr3_lam_bits(struct kvm_vcpu *vcpu)
+{
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LAM))
+ return 0;
+
+ return kvm_read_cr3(vcpu) & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57);
+}
+
+static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu)
+{
+ u64 root_hpa = vcpu->arch.mmu->root.hpa;
+
+ if (!VALID_PAGE(root_hpa))
+ return;
+
+ kvm_x86_call(load_mmu_pgd)(vcpu, root_hpa,
+ vcpu->arch.mmu->root_role.level);
+}
+
+static inline void kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu)
+{
+ /*
+ * When EPT is enabled, KVM may passthrough CR0.WP to the guest, i.e.
+ * @mmu's snapshot of CR0.WP and thus all related paging metadata may
+ * be stale. Refresh CR0.WP and the metadata on-demand when checking
+ * for permission faults. Exempt nested MMUs, i.e. MMUs for shadowing
+ * nEPT and nNPT, as CR0.WP is ignored in both cases. Note, KVM does
+ * need to refresh nested_mmu, a.k.a. the walker used to translate L2
+ * GVAs to GPAs, as that "MMU" needs to honor L2's CR0.WP.
+ */
+ if (!tdp_enabled || mmu == &vcpu->arch.guest_mmu)
+ return;
+
+ __kvm_mmu_refresh_passthrough_bits(vcpu, mmu);
+}
+
+/*
+ * Check if a given access (described through the I/D, W/R and U/S bits of a
+ * page fault error code pfec) causes a permission fault with the given PTE
+ * access rights (in ACC_* format).
+ *
+ * Return zero if the access does not fault; return the page fault error code
+ * if the access faults.
+ */
+static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ unsigned pte_access, unsigned pte_pkey,
+ u64 access)
+{
+ /* strip nested paging fault error codes */
+ unsigned int pfec = access;
+ unsigned long rflags = kvm_x86_call(get_rflags)(vcpu);
+
+ /*
+ * For explicit supervisor accesses, SMAP is disabled if EFLAGS.AC = 1.
+ * For implicit supervisor accesses, SMAP cannot be overridden.
+ *
+ * SMAP applies to supervisor accesses only; for user accesses,
+ * not_smap may be set or clear, and either way it has no bearing
+ *
+ * We put the SMAP checking bit in place of the PFERR_RSVD_MASK bit;
+ * this bit will always be zero in pfec, but it will be one in index
+ * if SMAP checks are being disabled.
+ */
+ u64 implicit_access = access & PFERR_IMPLICIT_ACCESS;
+ bool not_smap = ((rflags & X86_EFLAGS_AC) | implicit_access) == X86_EFLAGS_AC;
+ int index = (pfec | (not_smap ? PFERR_RSVD_MASK : 0)) >> 1;
+ u32 errcode = PFERR_PRESENT_MASK;
+ bool fault;
+
+ kvm_mmu_refresh_passthrough_bits(vcpu, mmu);
+
+ fault = (mmu->permissions[index] >> pte_access) & 1;
+
+ WARN_ON_ONCE(pfec & (PFERR_PK_MASK | PFERR_SS_MASK | PFERR_RSVD_MASK));
+ if (unlikely(mmu->pkru_mask)) {
+ u32 pkru_bits, offset;
+
+ /*
+ * PKRU defines 32 bits: there are 16 domains and 2
+ * attribute bits per domain in pkru. pte_pkey is the
+ * index of the protection domain, so pte_pkey * 2 is
+ * the index of the first bit for the domain.
+ */
+ pkru_bits = (vcpu->arch.pkru >> (pte_pkey * 2)) & 3;
+
+ /* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */
+ offset = (pfec & ~1) | ((pte_access & PT_USER_MASK) ? PFERR_RSVD_MASK : 0);
+
+ pkru_bits &= mmu->pkru_mask >> offset;
+ errcode |= -pkru_bits & PFERR_PK_MASK;
+ fault |= (pkru_bits != 0);
+ }
+
+ return -(u32)fault & errcode;
+}
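+
+/*
+ * Illustrative example for the PKRU check above: for pte_pkey = 3, pkru_bits
+ * extracts PKRU bits 6 and 7, i.e. the AD (access-disable) and WD
+ * (write-disable) bits for protection key 3.
+ */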
+
+int kvm_mmu_post_init_vm(struct kvm *kvm);
+void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
+
+static inline bool kvm_shadow_root_allocated(struct kvm *kvm)
{
+ /*
+ * Read shadow_root_allocated before related pointers. Hence, threads
+ * reading shadow_root_allocated in any lock context are guaranteed to
+ * see the pointers. Pairs with smp_store_release in
+ * mmu_first_shadow_root_alloc.
+ */
+ return smp_load_acquire(&kvm->arch.shadow_root_allocated);
+}
+
#ifdef CONFIG_X86_64
- return vcpu->arch.shadow_efer & EFER_LMA;
+extern bool tdp_mmu_enabled;
#else
- return 0;
+#define tdp_mmu_enabled false
#endif
+
+int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn);
+
+static inline bool kvm_memslots_have_rmaps(struct kvm *kvm)
+{
+ return !tdp_mmu_enabled || kvm_shadow_root_allocated(kvm);
}
-static inline int is_pae(struct kvm_vcpu *vcpu)
+static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
{
- return vcpu->arch.cr4 & X86_CR4_PAE;
+ /* KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K) must be 0. */
+ return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
+ (base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
}
-static inline int is_pse(struct kvm_vcpu *vcpu)
+static inline unsigned long
+__kvm_mmu_slot_lpages(struct kvm_memory_slot *slot, unsigned long npages,
+ int level)
{
- return vcpu->arch.cr4 & X86_CR4_PSE;
+ return gfn_to_index(slot->base_gfn + npages - 1,
+ slot->base_gfn, level) + 1;
}
-static inline int is_paging(struct kvm_vcpu *vcpu)
+static inline unsigned long
+kvm_mmu_slot_lpages(struct kvm_memory_slot *slot, int level)
{
- return vcpu->arch.cr0 & X86_CR0_PG;
+ return __kvm_mmu_slot_lpages(slot, slot->npages, level);
}
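+
+/*
+ * Worked example, assuming KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M) == 9: for a slot
+ * with base_gfn = 0x100 and npages = 0x400, kvm_mmu_slot_lpages(slot,
+ * PG_LEVEL_2M) is (0x4ff >> 9) - (0x100 >> 9) + 1 = 3, because the unaligned
+ * GFN range [0x100, 0x500) touches three 2MiB-sized regions.
+ */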
-static inline int is_present_pte(unsigned long pte)
+static inline void kvm_update_page_stats(struct kvm *kvm, int level, int count)
+{
+ atomic64_add(count, &kvm->stat.pages[level - 1]);
+}
+
+gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u64 access,
+ struct x86_exception *exception);
+
+static inline gpa_t kvm_translate_gpa(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu,
+ gpa_t gpa, u64 access,
+ struct x86_exception *exception)
{
- return pte & PT_PRESENT_MASK;
+ if (mmu != &vcpu->arch.nested_mmu)
+ return gpa;
+ return translate_nested_gpa(vcpu, gpa, access, exception);
}
+static inline bool kvm_has_mirrored_tdp(const struct kvm *kvm)
+{
+ return kvm->arch.vm_type == KVM_X86_TDX_VM;
+}
+
+static inline gfn_t kvm_gfn_direct_bits(const struct kvm *kvm)
+{
+ return kvm->arch.gfn_direct_bits;
+}
+
+static inline bool kvm_is_addr_direct(struct kvm *kvm, gpa_t gpa)
+{
+ gpa_t gpa_direct_bits = gfn_to_gpa(kvm_gfn_direct_bits(kvm));
+
+ return !gpa_direct_bits || (gpa & gpa_direct_bits);
+}
+
+static inline bool kvm_is_gfn_alias(struct kvm *kvm, gfn_t gfn)
+{
+ return gfn & kvm_gfn_direct_bits(kvm);
+}
#endif
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
new file mode 100644
index 000000000000..02c450686b4a
--- /dev/null
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -0,0 +1,8095 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * MMU support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include "irq.h"
+#include "ioapic.h"
+#include "mmu.h"
+#include "mmu_internal.h"
+#include "tdp_mmu.h"
+#include "x86.h"
+#include "kvm_cache_regs.h"
+#include "smm.h"
+#include "kvm_emulate.h"
+#include "page_track.h"
+#include "cpuid.h"
+#include "spte.h"
+
+#include <linux/kvm_host.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/moduleparam.h>
+#include <linux/export.h>
+#include <linux/swap.h>
+#include <linux/hugetlb.h>
+#include <linux/compiler.h>
+#include <linux/srcu.h>
+#include <linux/slab.h>
+#include <linux/sched/signal.h>
+#include <linux/uaccess.h>
+#include <linux/hash.h>
+#include <linux/kern_levels.h>
+#include <linux/kstrtox.h>
+#include <linux/kthread.h>
+#include <linux/wordpart.h>
+
+#include <asm/page.h>
+#include <asm/memtype.h>
+#include <asm/cmpxchg.h>
+#include <asm/io.h>
+#include <asm/set_memory.h>
+#include <asm/spec-ctrl.h>
+#include <asm/vmx.h>
+
+#include "trace.h"
+
+static bool nx_hugepage_mitigation_hard_disabled;
+
+int __read_mostly nx_huge_pages = -1;
+static uint __read_mostly nx_huge_pages_recovery_period_ms;
+#ifdef CONFIG_PREEMPT_RT
+/* Recovery can cause latency spikes, disable it for PREEMPT_RT. */
+static uint __read_mostly nx_huge_pages_recovery_ratio = 0;
+#else
+static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
+#endif
+
+static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp);
+static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
+static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp);
+
+static const struct kernel_param_ops nx_huge_pages_ops = {
+ .set = set_nx_huge_pages,
+ .get = get_nx_huge_pages,
+};
+
+static const struct kernel_param_ops nx_huge_pages_recovery_param_ops = {
+ .set = set_nx_huge_pages_recovery_param,
+ .get = param_get_uint,
+};
+
+module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
+__MODULE_PARM_TYPE(nx_huge_pages, "bool");
+module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_param_ops,
+ &nx_huge_pages_recovery_ratio, 0644);
+__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
+module_param_cb(nx_huge_pages_recovery_period_ms, &nx_huge_pages_recovery_param_ops,
+ &nx_huge_pages_recovery_period_ms, 0644);
+__MODULE_PARM_TYPE(nx_huge_pages_recovery_period_ms, "uint");
+
+static bool __read_mostly force_flush_and_sync_on_reuse;
+module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
+
+/*
+ * Setting this variable to true enables Two-Dimensional Paging (TDP),
+ * where the hardware walks 2 page tables:
+ * 1. the guest-virtual to guest-physical translation
+ * 2. while doing 1., the guest-physical to host-physical translation
+ * If the hardware supports TDP, we don't need to do shadow paging.
+ */
+bool tdp_enabled = false;
+
+static bool __ro_after_init tdp_mmu_allowed;
+
+#ifdef CONFIG_X86_64
+bool __read_mostly tdp_mmu_enabled = true;
+module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0444);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(tdp_mmu_enabled);
+#endif
+
+static int max_huge_page_level __read_mostly;
+static int tdp_root_level __read_mostly;
+static int max_tdp_level __read_mostly;
+
+#define PTE_PREFETCH_NUM 8
+
+#include <trace/events/kvm.h>
+
+/* make pte_list_desc fit well in cache lines */
+#define PTE_LIST_EXT 14
+
+/*
+ * struct pte_list_desc is the core data structure used to implement a custom
+ * list for tracking a set of related SPTEs, e.g. all the SPTEs that map a
+ * given GFN when used in the context of rmaps. Using a custom list allows KVM
+ * to optimize for the common case where many GFNs will have at most a handful
+ * of SPTEs pointing at them, i.e. allows packing multiple SPTEs into a small
+ * memory footprint, which in turn improves runtime performance by exploiting
+ * cache locality.
+ *
+ * A list is comprised of one or more pte_list_desc objects (descriptors).
+ * Each individual descriptor stores up to PTE_LIST_EXT SPTEs. If a descriptor
+ * is full and a new SPTE needs to be added, a new descriptor is allocated and
+ * becomes the head of the list. This means that, by definition, all tail
+ * descriptors are full.
+ *
+ * Note, the metadata fields are deliberately placed at the start of the
+ * structure to optimize the cacheline layout; accessing the descriptor will
+ * touch only a single cacheline so long as @spte_count<=6 (or if only the
+ * descriptor's metadata is accessed).
+ */
+struct pte_list_desc {
+ struct pte_list_desc *more;
+ /* The number of PTEs stored in _this_ descriptor. */
+ u32 spte_count;
+ /* The number of PTEs stored in all tails of this descriptor. */
+ u32 tail_count;
+ u64 *sptes[PTE_LIST_EXT];
+};
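+
+/*
+ * Sizing note, assuming 64-byte cachelines and 8-byte pointers: the metadata
+ * fields above occupy 16 bytes, so sptes[0..5] fit in the descriptor's first
+ * cacheline, which is what the "@spte_count<=6" remark refers to.
+ */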
+
+struct kvm_shadow_walk_iterator {
+ u64 addr;
+ hpa_t shadow_addr;
+ u64 *sptep;
+ int level;
+ unsigned index;
+};
+
+#define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker) \
+ for (shadow_walk_init_using_root(&(_walker), (_vcpu), \
+ (_root), (_addr)); \
+ shadow_walk_okay(&(_walker)); \
+ shadow_walk_next(&(_walker)))
+
+#define for_each_shadow_entry(_vcpu, _addr, _walker) \
+ for (shadow_walk_init(&(_walker), _vcpu, _addr); \
+ shadow_walk_okay(&(_walker)); \
+ shadow_walk_next(&(_walker)))
+
+#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte) \
+ for (shadow_walk_init(&(_walker), _vcpu, _addr); \
+ shadow_walk_okay(&(_walker)) && \
+ ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; }); \
+ __shadow_walk_next(&(_walker), spte))
+
+static struct kmem_cache *pte_list_desc_cache;
+struct kmem_cache *mmu_page_header_cache;
+
+static void mmu_spte_set(u64 *sptep, u64 spte);
+
+struct kvm_mmu_role_regs {
+ const unsigned long cr0;
+ const unsigned long cr4;
+ const u64 efer;
+};
+
+#define CREATE_TRACE_POINTS
+#include "mmutrace.h"
+
+/*
+ * Yes, lots of underscores. They're a hint that you probably shouldn't be
+ * reading from the role_regs. Once the root_role is constructed, it becomes
+ * the single source of truth for the MMU's state.
+ */
+#define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag) \
+static inline bool __maybe_unused \
+____is_##reg##_##name(const struct kvm_mmu_role_regs *regs) \
+{ \
+ return !!(regs->reg & flag); \
+}
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, pg, X86_CR0_PG);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, wp, X86_CR0_WP);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pse, X86_CR4_PSE);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pae, X86_CR4_PAE);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smep, X86_CR4_SMEP);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smap, X86_CR4_SMAP);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pke, X86_CR4_PKE);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, la57, X86_CR4_LA57);
+BUILD_MMU_ROLE_REGS_ACCESSOR(efer, nx, EFER_NX);
+BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA);
+
+/*
+ * The MMU itself (with a valid role) is the single source of truth for the
+ * MMU. Do not use the regs used to build the MMU/role, nor the vCPU. The
+ * regs don't account for dependencies, e.g. clearing CR4 bits if CR0.PG=1,
+ * and the vCPU may be incorrect/irrelevant.
+ */
+#define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name) \
+static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu) \
+{ \
+ return !!(mmu->cpu_role. base_or_ext . reg##_##name); \
+}
+BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp);
+BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pse);
+BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smep);
+BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smap);
+BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pke);
+BUILD_MMU_ROLE_ACCESSOR(ext, cr4, la57);
+BUILD_MMU_ROLE_ACCESSOR(base, efer, nx);
+BUILD_MMU_ROLE_ACCESSOR(ext, efer, lma);
+
+static inline bool is_cr0_pg(struct kvm_mmu *mmu)
+{
+ return mmu->cpu_role.base.level > 0;
+}
+
+static inline bool is_cr4_pae(struct kvm_mmu *mmu)
+{
+ return !mmu->cpu_role.base.has_4_byte_gpte;
+}
+
+static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_role_regs regs = {
+ .cr0 = kvm_read_cr0_bits(vcpu, KVM_MMU_CR0_ROLE_BITS),
+ .cr4 = kvm_read_cr4_bits(vcpu, KVM_MMU_CR4_ROLE_BITS),
+ .efer = vcpu->arch.efer,
+ };
+
+ return regs;
+}
+
+static unsigned long get_guest_cr3(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr3(vcpu);
+}
+
+static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu)
+{
+ if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && mmu->get_guest_pgd == get_guest_cr3)
+ return kvm_read_cr3(vcpu);
+
+ return mmu->get_guest_pgd(vcpu);
+}
+
+static inline bool kvm_available_flush_remote_tlbs_range(void)
+{
+#if IS_ENABLED(CONFIG_HYPERV)
+ return kvm_x86_ops.flush_remote_tlbs_range;
+#else
+ return false;
+#endif
+}
+
+static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index);
+
+/* Flush the range of guest memory mapped by the given SPTE. */
+static void kvm_flush_remote_tlbs_sptep(struct kvm *kvm, u64 *sptep)
+{
+ struct kvm_mmu_page *sp = sptep_to_sp(sptep);
+ gfn_t gfn = kvm_mmu_page_get_gfn(sp, spte_index(sptep));
+
+ kvm_flush_remote_tlbs_gfn(kvm, gfn, sp->role.level);
+}
+
+static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
+ unsigned int access)
+{
+ u64 spte = make_mmio_spte(vcpu, gfn, access);
+
+ trace_mark_mmio_spte(sptep, gfn, spte);
+ mmu_spte_set(sptep, spte);
+}
+
+static gfn_t get_mmio_spte_gfn(u64 spte)
+{
+ u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;
+
+ gpa |= (spte >> SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)
+ & shadow_nonpresent_or_rsvd_mask;
+
+ return gpa >> PAGE_SHIFT;
+}
+
+static unsigned get_mmio_spte_access(u64 spte)
+{
+ return spte & shadow_mmio_access_mask;
+}
+
+static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
+{
+ u64 kvm_gen, spte_gen, gen;
+
+ gen = kvm_vcpu_memslots(vcpu)->generation;
+ if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
+ return false;
+
+ kvm_gen = gen & MMIO_SPTE_GEN_MASK;
+ spte_gen = get_mmio_spte_generation(spte);
+
+ trace_check_mmio_spte(spte, kvm_gen, spte_gen);
+ return likely(kvm_gen == spte_gen);
+}
+
+static int is_cpuid_PSE36(void)
+{
+ return 1;
+}
+
+#ifdef CONFIG_X86_64
+static void __set_spte(u64 *sptep, u64 spte)
+{
+ KVM_MMU_WARN_ON(is_ept_ve_possible(spte));
+ WRITE_ONCE(*sptep, spte);
+}
+
+static void __update_clear_spte_fast(u64 *sptep, u64 spte)
+{
+ KVM_MMU_WARN_ON(is_ept_ve_possible(spte));
+ WRITE_ONCE(*sptep, spte);
+}
+
+static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
+{
+ KVM_MMU_WARN_ON(is_ept_ve_possible(spte));
+ return xchg(sptep, spte);
+}
+
+static u64 __get_spte_lockless(u64 *sptep)
+{
+ return READ_ONCE(*sptep);
+}
+#else
+union split_spte {
+ struct {
+ u32 spte_low;
+ u32 spte_high;
+ };
+ u64 spte;
+};
+
+static void count_spte_clear(u64 *sptep, u64 spte)
+{
+ struct kvm_mmu_page *sp = sptep_to_sp(sptep);
+
+ if (is_shadow_present_pte(spte))
+ return;
+
+ /* Ensure the spte is completely set before we increase the count */
+ smp_wmb();
+ sp->clear_spte_count++;
+}
+
+static void __set_spte(u64 *sptep, u64 spte)
+{
+ union split_spte *ssptep, sspte;
+
+ ssptep = (union split_spte *)sptep;
+ sspte = (union split_spte)spte;
+
+ ssptep->spte_high = sspte.spte_high;
+
+ /*
+ * If we map the spte from nonpresent to present, we should store
+ * the high bits first, then set the present bit, so the CPU cannot
+ * fetch this spte while we are setting it.
+ */
+ smp_wmb();
+
+ WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
+}
+
+static void __update_clear_spte_fast(u64 *sptep, u64 spte)
+{
+ union split_spte *ssptep, sspte;
+
+ ssptep = (union split_spte *)sptep;
+ sspte = (union split_spte)spte;
+
+ WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
+
+ /*
+ * If we map the spte from present to nonpresent, we should clear
+ * the present bit first to avoid the vCPU fetching the old high bits.
+ */
+ smp_wmb();
+
+ ssptep->spte_high = sspte.spte_high;
+ count_spte_clear(sptep, spte);
+}
+
+static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
+{
+ union split_spte *ssptep, sspte, orig;
+
+ ssptep = (union split_spte *)sptep;
+ sspte = (union split_spte)spte;
+
+ /* xchg acts as a barrier before the setting of the high bits */
+ orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
+ orig.spte_high = ssptep->spte_high;
+ ssptep->spte_high = sspte.spte_high;
+ count_spte_clear(sptep, spte);
+
+ return orig.spte;
+}
+
+/*
+ * The idea of using this lightweight way to get the spte on x86_32 guests
+ * comes from gup_get_pte (mm/gup.c).
+ *
+ * An spte tlb flush may be pending, because flushes are coalesced and
+ * we are running outside of the MMU lock. Therefore
+ * we need to protect against in-progress updates of the spte.
+ *
+ * Reading the spte while an update is in progress may get the old value
+ * for the high part of the spte. The race is fine for a present->non-present
+ * change (because the high part of the spte is ignored for non-present spte),
+ * but for a present->present change we must reread the spte.
+ *
+ * All such changes are done in two steps (present->non-present and
+ * non-present->present), hence it is enough to count the number of
+ * present->non-present updates: if it changed while reading the spte,
+ * we might have hit the race. This is done using clear_spte_count.
+ */
+static u64 __get_spte_lockless(u64 *sptep)
+{
+ struct kvm_mmu_page *sp = sptep_to_sp(sptep);
+ union split_spte spte, *orig = (union split_spte *)sptep;
+ int count;
+
+retry:
+ count = sp->clear_spte_count;
+ smp_rmb();
+
+ spte.spte_low = orig->spte_low;
+ smp_rmb();
+
+ spte.spte_high = orig->spte_high;
+ smp_rmb();
+
+ if (unlikely(spte.spte_low != orig->spte_low ||
+ count != sp->clear_spte_count))
+ goto retry;
+
+ return spte.spte;
+}
+#endif
+
+/* Rules for using mmu_spte_set:
+ * Set the sptep from nonpresent to present.
+ * Note: the sptep being assigned *must* be either not present
+ * or in a state where the hardware will not attempt to update
+ * the spte.
+ */
+static void mmu_spte_set(u64 *sptep, u64 new_spte)
+{
+ WARN_ON_ONCE(is_shadow_present_pte(*sptep));
+ __set_spte(sptep, new_spte);
+}
+
+/* Rules for using mmu_spte_update:
+ * Update the state bits; the mapped pfn is not changed.
+ *
+ * Returns true if the TLB needs to be flushed
+ */
+static bool mmu_spte_update(u64 *sptep, u64 new_spte)
+{
+ u64 old_spte = *sptep;
+
+ WARN_ON_ONCE(!is_shadow_present_pte(new_spte));
+ check_spte_writable_invariants(new_spte);
+
+ if (!is_shadow_present_pte(old_spte)) {
+ mmu_spte_set(sptep, new_spte);
+ return false;
+ }
+
+ if (!spte_needs_atomic_update(old_spte))
+ __update_clear_spte_fast(sptep, new_spte);
+ else
+ old_spte = __update_clear_spte_slow(sptep, new_spte);
+
+ WARN_ON_ONCE(!is_shadow_present_pte(old_spte) ||
+ spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
+
+ return leaf_spte_change_needs_tlb_flush(old_spte, new_spte);
+}
+
+/*
+ * Rules for using mmu_spte_clear_track_bits:
+ * It sets the sptep from present to nonpresent and tracks the
+ * state bits; it is used to clear the last-level sptep.
+ * Returns the old PTE.
+ */
+static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
+{
+ u64 old_spte = *sptep;
+ int level = sptep_to_sp(sptep)->role.level;
+
+ if (!is_shadow_present_pte(old_spte) ||
+ !spte_needs_atomic_update(old_spte))
+ __update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE);
+ else
+ old_spte = __update_clear_spte_slow(sptep, SHADOW_NONPRESENT_VALUE);
+
+ if (!is_shadow_present_pte(old_spte))
+ return old_spte;
+
+ kvm_update_page_stats(kvm, level, -1);
+ return old_spte;
+}
+
+/*
+ * Rules for using mmu_spte_clear_no_track:
+ * Directly clear the spte without caring about the state bits of the sptep;
+ * it is used to set the upper-level spte.
+ */
+static void mmu_spte_clear_no_track(u64 *sptep)
+{
+ __update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE);
+}
+
+static u64 mmu_spte_get_lockless(u64 *sptep)
+{
+ return __get_spte_lockless(sptep);
+}
+
+static inline bool is_tdp_mmu_active(struct kvm_vcpu *vcpu)
+{
+ return tdp_mmu_enabled && vcpu->arch.mmu->root_role.direct;
+}
+
+static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
+{
+ if (is_tdp_mmu_active(vcpu)) {
+ kvm_tdp_mmu_walk_lockless_begin();
+ } else {
+ /*
+ * Prevent page table teardown by making any free-er wait during
+ * kvm_flush_remote_tlbs() IPI to all active vcpus.
+ */
+ local_irq_disable();
+
+ /*
+ * Make sure a following spte read is not reordered ahead of the write
+ * to vcpu->mode.
+ */
+ smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
+ }
+}
+
+static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
+{
+ if (is_tdp_mmu_active(vcpu)) {
+ kvm_tdp_mmu_walk_lockless_end();
+ } else {
+ /*
+ * Make sure the write to vcpu->mode is not reordered in front of
+ * reads to sptes. If it is reordered, kvm_mmu_commit_zap_page() can see us
+ * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
+ */
+ smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
+ local_irq_enable();
+ }
+}
+
+static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
+{
+ int r;
+
+ /* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. */
+ r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
+ 1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
+ if (r)
+ return r;
+ if (kvm_has_mirrored_tdp(vcpu->kvm)) {
+ r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_external_spt_cache,
+ PT64_ROOT_MAX_LEVEL);
+ if (r)
+ return r;
+ }
+ r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
+ PT64_ROOT_MAX_LEVEL);
+ if (r)
+ return r;
+ if (maybe_indirect) {
+ r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadowed_info_cache,
+ PT64_ROOT_MAX_LEVEL);
+ if (r)
+ return r;
+ }
+ return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
+ PT64_ROOT_MAX_LEVEL);
+}
+
+static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
+{
+ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
+ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
+ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache);
+ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_external_spt_cache);
+ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
+}
+
+static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
+{
+ kmem_cache_free(pte_list_desc_cache, pte_list_desc);
+}
+
+static bool sp_has_gptes(struct kvm_mmu_page *sp);
+
+static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
+{
+ if (sp->role.passthrough)
+ return sp->gfn;
+
+ if (sp->shadowed_translation)
+ return sp->shadowed_translation[index] >> PAGE_SHIFT;
+
+ return sp->gfn + (index << ((sp->role.level - 1) * SPTE_LEVEL_BITS));
+}
+
+/*
+ * For leaf SPTEs, fetch the *guest* access permissions being shadowed. Note
+ * that the SPTE itself may have more constrained access permissions than
+ * what the guest enforces. For example, a guest may create an executable
+ * huge PTE but KVM may disallow execution to mitigate iTLB multihit.
+ */
+static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index)
+{
+ if (sp->shadowed_translation)
+ return sp->shadowed_translation[index] & ACC_ALL;
+
+ /*
+ * For direct MMUs (e.g. TDP or non-paging guests) or passthrough SPs,
+ * KVM is not shadowing any guest page tables, so the "guest access
+ * permissions" are just ACC_ALL.
+ *
+ * For direct SPs in indirect MMUs (shadow paging), i.e. when KVM
+ * is shadowing a guest huge page with small pages, the guest access
+ * permissions being shadowed are the access permissions of the huge
+ * page.
+ *
+ * In both cases, sp->role.access contains the correct access bits.
+ */
+ return sp->role.access;
+}
+
+static void kvm_mmu_page_set_translation(struct kvm_mmu_page *sp, int index,
+ gfn_t gfn, unsigned int access)
+{
+ if (sp->shadowed_translation) {
+ sp->shadowed_translation[index] = (gfn << PAGE_SHIFT) | access;
+ return;
+ }
+
+ WARN_ONCE(access != kvm_mmu_page_get_access(sp, index),
+ "access mismatch under %s page %llx (expected %u, got %u)\n",
+ sp->role.passthrough ? "passthrough" : "direct",
+ sp->gfn, kvm_mmu_page_get_access(sp, index), access);
+
+ WARN_ONCE(gfn != kvm_mmu_page_get_gfn(sp, index),
+ "gfn mismatch under %s page %llx (expected %llx, got %llx)\n",
+ sp->role.passthrough ? "passthrough" : "direct",
+ sp->gfn, kvm_mmu_page_get_gfn(sp, index), gfn);
+}
+
+static void kvm_mmu_page_set_access(struct kvm_mmu_page *sp, int index,
+ unsigned int access)
+{
+ gfn_t gfn = kvm_mmu_page_get_gfn(sp, index);
+
+ kvm_mmu_page_set_translation(sp, index, gfn, access);
+}
+
+/*
+ * Return the pointer to the large page information for a given gfn,
+ * handling slots that are not large page aligned.
+ */
+static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
+ const struct kvm_memory_slot *slot, int level)
+{
+ unsigned long idx;
+
+ idx = gfn_to_index(gfn, slot->base_gfn, level);
+ return &slot->arch.lpage_info[level - 2][idx];
+}
+
+/*
+ * The most significant bit in disallow_lpage tracks whether or not memory
+ * attributes are mixed, i.e. not identical for all gfns at the current level.
+ * The lower order bits are used to refcount other cases where a hugepage is
+ * disallowed, e.g. if KVM is shadowing a page table at the gfn.
+ */
+#define KVM_LPAGE_MIXED_FLAG BIT(31)
+
+static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot,
+ gfn_t gfn, int count)
+{
+ struct kvm_lpage_info *linfo;
+ int old, i;
+
+ for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
+ linfo = lpage_info_slot(gfn, slot, i);
+
+ old = linfo->disallow_lpage;
+ linfo->disallow_lpage += count;
+ WARN_ON_ONCE((old ^ linfo->disallow_lpage) & KVM_LPAGE_MIXED_FLAG);
+ }
+}
+
+void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ update_gfn_disallow_lpage_count(slot, gfn, 1);
+}
+
+void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ update_gfn_disallow_lpage_count(slot, gfn, -1);
+}
+
+static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
+ gfn_t gfn;
+
+ kvm->arch.indirect_shadow_pages++;
+ /*
+ * Ensure indirect_shadow_pages is elevated prior to re-reading guest
+ * child PTEs in FNAME(gpte_changed), i.e. guarantee either in-flight
+ * emulated writes are visible before re-reading guest PTEs, or that
+ * an emulated write will see the elevated count and acquire mmu_lock
+ * to update SPTEs. Pairs with the smp_mb() in kvm_mmu_track_write().
+ */
+ smp_mb();
+
+ gfn = sp->gfn;
+ slots = kvm_memslots_for_spte_role(kvm, sp->role);
+ slot = __gfn_to_memslot(slots, gfn);
+
+ /* Non-leaf shadow pages are kept read-only. */
+ if (sp->role.level > PG_LEVEL_4K)
+ return __kvm_write_track_add_gfn(kvm, slot, gfn);
+
+ kvm_mmu_gfn_disallow_lpage(slot, gfn);
+
+ if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K))
+ kvm_flush_remote_tlbs_gfn(kvm, gfn, PG_LEVEL_4K);
+}
+
+void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ enum kvm_mmu_type mmu_type)
+{
+ /*
+ * If it's possible to replace the shadow page with an NX huge page,
+ * i.e. if the shadow page is the only thing currently preventing KVM
+ * from using a huge page, add the shadow page to the list of "to be
+ * zapped for NX recovery" pages. Note, the shadow page can already be
+ * on the list if KVM is reusing an existing shadow page, i.e. if KVM
+ * links a shadow page at multiple points.
+ */
+ if (!list_empty(&sp->possible_nx_huge_page_link))
+ return;
+
+ ++kvm->stat.nx_lpage_splits;
+ ++kvm->arch.possible_nx_huge_pages[mmu_type].nr_pages;
+ list_add_tail(&sp->possible_nx_huge_page_link,
+ &kvm->arch.possible_nx_huge_pages[mmu_type].pages);
+}
+
+static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ bool nx_huge_page_possible)
+{
+ sp->nx_huge_page_disallowed = true;
+
+ if (nx_huge_page_possible)
+ track_possible_nx_huge_page(kvm, sp, KVM_SHADOW_MMU);
+}
+
+static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
+ gfn_t gfn;
+
+ kvm->arch.indirect_shadow_pages--;
+ gfn = sp->gfn;
+ slots = kvm_memslots_for_spte_role(kvm, sp->role);
+ slot = __gfn_to_memslot(slots, gfn);
+ if (sp->role.level > PG_LEVEL_4K)
+ return __kvm_write_track_remove_gfn(kvm, slot, gfn);
+
+ kvm_mmu_gfn_allow_lpage(slot, gfn);
+}
+
+void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ enum kvm_mmu_type mmu_type)
+{
+ if (list_empty(&sp->possible_nx_huge_page_link))
+ return;
+
+ --kvm->stat.nx_lpage_splits;
+ --kvm->arch.possible_nx_huge_pages[mmu_type].nr_pages;
+ list_del_init(&sp->possible_nx_huge_page_link);
+}
+
+static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ sp->nx_huge_page_disallowed = false;
+
+ untrack_possible_nx_huge_page(kvm, sp, KVM_SHADOW_MMU);
+}
+
+static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu,
+ gfn_t gfn,
+ bool no_dirty_log)
+{
+ struct kvm_memory_slot *slot;
+
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
+ return NULL;
+ if (no_dirty_log && kvm_slot_dirty_track_enabled(slot))
+ return NULL;
+
+ return slot;
+}
+
+/*
+ * About rmap_head encoding:
+ *
+ * If the bit zero of rmap_head->val is clear, then it points to the only spte
+ * in this rmap chain. Otherwise, (rmap_head->val & ~3) points to a struct
+ * pte_list_desc containing more mappings.
+ */
+#define KVM_RMAP_MANY BIT(0)
+
+/*
+ * rmaps and PTE lists are mostly protected by mmu_lock (the shadow MMU always
+ * operates with mmu_lock held for write), but rmaps can be walked without
+ * holding mmu_lock so long as the caller can tolerate SPTEs in the rmap chain
+ * being zapped/dropped _while the rmap is locked_.
+ *
+ * Other than the KVM_RMAP_LOCKED flag, modifications to rmap entries must be
+ * done while holding mmu_lock for write. This allows a task walking rmaps
+ * without holding mmu_lock to concurrently walk the same entries as a task
+ * that is holding mmu_lock but _not_ the rmap lock. Neither task will modify
+ * the rmaps, thus the walks are stable.
+ *
+ * As alluded to above, SPTEs in rmaps are _not_ protected by KVM_RMAP_LOCKED,
+ * only the rmap chains themselves are protected. E.g. holding an rmap's lock
+ * ensures all "struct pte_list_desc" fields are stable.
+ */
+#define KVM_RMAP_LOCKED BIT(1)
+
+static unsigned long __kvm_rmap_lock(struct kvm_rmap_head *rmap_head)
+{
+ unsigned long old_val, new_val;
+
+ lockdep_assert_preemption_disabled();
+
+ /*
+ * Elide the lock if the rmap is empty, as lockless walkers (read-only
+ * mode) don't need to (and can't) walk an empty rmap, nor can they add
+ * entries to the rmap. I.e. the only paths that process empty rmaps
+ * do so while holding mmu_lock for write, and are mutually exclusive.
+ */
+ old_val = atomic_long_read(&rmap_head->val);
+ if (!old_val)
+ return 0;
+
+ do {
+ /*
+ * If the rmap is locked, wait for it to be unlocked before
+ * trying to acquire the lock, e.g. to avoid bouncing the cache
+ * line.
+ */
+ while (old_val & KVM_RMAP_LOCKED) {
+ cpu_relax();
+ old_val = atomic_long_read(&rmap_head->val);
+ }
+
+ /*
+ * Recheck for an empty rmap, it may have been purged by the
+ * task that held the lock.
+ */
+ if (!old_val)
+ return 0;
+
+ new_val = old_val | KVM_RMAP_LOCKED;
+ /*
+ * Use try_cmpxchg_acquire() to prevent reads and writes to the rmap
+ * from being reordered outside of the critical section created by
+ * __kvm_rmap_lock().
+ *
+ * Pairs with the atomic_long_set_release() in kvm_rmap_unlock().
+ *
+ * For the !old_val case, no ordering is needed, as there is no rmap
+ * to walk.
+ */
+ } while (!atomic_long_try_cmpxchg_acquire(&rmap_head->val, &old_val, new_val));
+
+ /*
+ * Return the old value, i.e. _without_ the LOCKED bit set. It's
+ * impossible for the return value to be 0 (see above), i.e. the read-
+ * only unlock flow can't get a false positive and fail to unlock.
+ */
+ return old_val;
+}
+
+static unsigned long kvm_rmap_lock(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head)
+{
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ return __kvm_rmap_lock(rmap_head);
+}
+
+static void __kvm_rmap_unlock(struct kvm_rmap_head *rmap_head,
+ unsigned long val)
+{
+ KVM_MMU_WARN_ON(val & KVM_RMAP_LOCKED);
+ /*
+ * Ensure that all accesses to the rmap have completed before unlocking
+ * the rmap.
+ *
+ * Pairs with the atomic_long_try_cmpxchg_acquire() in __kvm_rmap_lock().
+ */
+ atomic_long_set_release(&rmap_head->val, val);
+}
+
+static void kvm_rmap_unlock(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
+ unsigned long new_val)
+{
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ __kvm_rmap_unlock(rmap_head, new_val);
+}
+
+static unsigned long kvm_rmap_get(struct kvm_rmap_head *rmap_head)
+{
+ return atomic_long_read(&rmap_head->val) & ~KVM_RMAP_LOCKED;
+}
+
+/*
+ * If mmu_lock isn't held, rmaps can only be locked in read-only mode. The
+ * actual locking is the same, but the caller is disallowed from modifying the
+ * rmap, and so the unlock flow is a nop if the rmap is/was empty.
+ */
+static unsigned long kvm_rmap_lock_readonly(struct kvm_rmap_head *rmap_head)
+{
+ unsigned long rmap_val;
+
+ preempt_disable();
+ rmap_val = __kvm_rmap_lock(rmap_head);
+
+ if (!rmap_val)
+ preempt_enable();
+
+ return rmap_val;
+}
+
+static void kvm_rmap_unlock_readonly(struct kvm_rmap_head *rmap_head,
+ unsigned long old_val)
+{
+ if (!old_val)
+ return;
+
+ KVM_MMU_WARN_ON(old_val != kvm_rmap_get(rmap_head));
+
+ __kvm_rmap_unlock(rmap_head, old_val);
+ preempt_enable();
+}
+
+/*
+ * Returns the number of pointers in the rmap chain, not counting the new one.
+ */
+static int pte_list_add(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
+ u64 *spte, struct kvm_rmap_head *rmap_head)
+{
+ unsigned long old_val, new_val;
+ struct pte_list_desc *desc;
+ int count = 0;
+
+ old_val = kvm_rmap_lock(kvm, rmap_head);
+
+ if (!old_val) {
+ new_val = (unsigned long)spte;
+ } else if (!(old_val & KVM_RMAP_MANY)) {
+ desc = kvm_mmu_memory_cache_alloc(cache);
+ desc->sptes[0] = (u64 *)old_val;
+ desc->sptes[1] = spte;
+ desc->spte_count = 2;
+ desc->tail_count = 0;
+ new_val = (unsigned long)desc | KVM_RMAP_MANY;
+ ++count;
+ } else {
+ desc = (struct pte_list_desc *)(old_val & ~KVM_RMAP_MANY);
+ count = desc->tail_count + desc->spte_count;
+
+ /*
+ * If the previous head is full, allocate a new head descriptor
+ * as tail descriptors are always kept full.
+ */
+ if (desc->spte_count == PTE_LIST_EXT) {
+ desc = kvm_mmu_memory_cache_alloc(cache);
+ desc->more = (struct pte_list_desc *)(old_val & ~KVM_RMAP_MANY);
+ desc->spte_count = 0;
+ desc->tail_count = count;
+ new_val = (unsigned long)desc | KVM_RMAP_MANY;
+ } else {
+ new_val = old_val;
+ }
+ desc->sptes[desc->spte_count++] = spte;
+ }
+
+ kvm_rmap_unlock(kvm, rmap_head, new_val);
+
+ return count;
+}
+
+static void pte_list_desc_remove_entry(struct kvm *kvm, unsigned long *rmap_val,
+ struct pte_list_desc *desc, int i)
+{
+ struct pte_list_desc *head_desc = (struct pte_list_desc *)(*rmap_val & ~KVM_RMAP_MANY);
+ int j = head_desc->spte_count - 1;
+
+ /*
+ * The head descriptor should never be empty. A new head is added only
+ * when adding an entry and the previous head is full, and heads are
+ * removed (this flow) when they become empty.
+ */
+ KVM_BUG_ON_DATA_CORRUPTION(j < 0, kvm);
+
+ /*
+ * Replace the to-be-freed SPTE with the last valid entry from the head
+ * descriptor to ensure that tail descriptors are full at all times.
+ * Note, this also means that tail_count is stable for each descriptor.
+ */
+ desc->sptes[i] = head_desc->sptes[j];
+ head_desc->sptes[j] = NULL;
+ head_desc->spte_count--;
+ if (head_desc->spte_count)
+ return;
+
+ /*
+ * The head descriptor is empty. If there are no tail descriptors,
+ * nullify the rmap head to mark the list as empty, else point the rmap
+ * head at the next descriptor, i.e. the new head.
+ */
+ if (!head_desc->more)
+ *rmap_val = 0;
+ else
+ *rmap_val = (unsigned long)head_desc->more | KVM_RMAP_MANY;
+ mmu_free_pte_list_desc(head_desc);
+}
+
+static void pte_list_remove(struct kvm *kvm, u64 *spte,
+ struct kvm_rmap_head *rmap_head)
+{
+ struct pte_list_desc *desc;
+ unsigned long rmap_val;
+ int i;
+
+ rmap_val = kvm_rmap_lock(kvm, rmap_head);
+ if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_val, kvm))
+ goto out;
+
+ if (!(rmap_val & KVM_RMAP_MANY)) {
+ if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_val != spte, kvm))
+ goto out;
+
+ rmap_val = 0;
+ } else {
+ desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY);
+ while (desc) {
+ for (i = 0; i < desc->spte_count; ++i) {
+ if (desc->sptes[i] == spte) {
+ pte_list_desc_remove_entry(kvm, &rmap_val,
+ desc, i);
+ goto out;
+ }
+ }
+ desc = desc->more;
+ }
+
+ KVM_BUG_ON_DATA_CORRUPTION(true, kvm);
+ }
+
+out:
+ kvm_rmap_unlock(kvm, rmap_head, rmap_val);
+}
+
+static void kvm_zap_one_rmap_spte(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head, u64 *sptep)
+{
+ mmu_spte_clear_track_bits(kvm, sptep);
+ pte_list_remove(kvm, sptep, rmap_head);
+}
+
+/* Return true if at least one SPTE was zapped, false otherwise */
+static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head)
+{
+ struct pte_list_desc *desc, *next;
+ unsigned long rmap_val;
+ int i;
+
+ rmap_val = kvm_rmap_lock(kvm, rmap_head);
+ if (!rmap_val)
+ return false;
+
+ if (!(rmap_val & KVM_RMAP_MANY)) {
+ mmu_spte_clear_track_bits(kvm, (u64 *)rmap_val);
+ goto out;
+ }
+
+ desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY);
+
+ for (; desc; desc = next) {
+ for (i = 0; i < desc->spte_count; i++)
+ mmu_spte_clear_track_bits(kvm, desc->sptes[i]);
+ next = desc->more;
+ mmu_free_pte_list_desc(desc);
+ }
+out:
+ /* rmap_head is meaningless now, remember to reset it */
+ kvm_rmap_unlock(kvm, rmap_head, 0);
+ return true;
+}
+
+unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
+{
+ unsigned long rmap_val = kvm_rmap_get(rmap_head);
+ struct pte_list_desc *desc;
+
+ if (!rmap_val)
+ return 0;
+ else if (!(rmap_val & KVM_RMAP_MANY))
+ return 1;
+
+ desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY);
+ return desc->tail_count + desc->spte_count;
+}
+
+static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level,
+ const struct kvm_memory_slot *slot)
+{
+ unsigned long idx;
+
+ idx = gfn_to_index(gfn, slot->base_gfn, level);
+ return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
+}
+
+static void rmap_remove(struct kvm *kvm, u64 *spte)
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
+ struct kvm_mmu_page *sp;
+ gfn_t gfn;
+ struct kvm_rmap_head *rmap_head;
+
+ sp = sptep_to_sp(spte);
+ gfn = kvm_mmu_page_get_gfn(sp, spte_index(spte));
+
+ /*
+ * Unlike rmap_add, rmap_remove does not run in the context of a vCPU
+ * so we have to determine which memslots to use based on context
+ * information in sp->role.
+ */
+ slots = kvm_memslots_for_spte_role(kvm, sp->role);
+
+ slot = __gfn_to_memslot(slots, gfn);
+ rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
+
+ pte_list_remove(kvm, spte, rmap_head);
+}
+
+/*
+ * Used by the following functions to iterate through the sptes linked by a
+ * rmap. All fields are private and should not be used outside.
+ */
+struct rmap_iterator {
+ /* private fields */
+ struct rmap_head *head;
+ struct pte_list_desc *desc; /* holds the sptep if not NULL */
+ int pos; /* index of the sptep */
+};
+
+/*
+ * Iteration must be started by this function. This should also be used after
+ * removing/dropping sptes from the rmap link because in such cases the
+ * information in the iterator may not be valid.
+ *
+ * Returns sptep if found, NULL otherwise.
+ */
+static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
+ struct rmap_iterator *iter)
+{
+ unsigned long rmap_val = kvm_rmap_get(rmap_head);
+
+ if (!rmap_val)
+ return NULL;
+
+ if (!(rmap_val & KVM_RMAP_MANY)) {
+ iter->desc = NULL;
+ return (u64 *)rmap_val;
+ }
+
+ iter->desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY);
+ iter->pos = 0;
+ return iter->desc->sptes[iter->pos];
+}
+
+/*
+ * Must be used with a valid iterator: e.g. after rmap_get_first().
+ *
+ * Returns sptep if found, NULL otherwise.
+ */
+static u64 *rmap_get_next(struct rmap_iterator *iter)
+{
+ if (iter->desc) {
+ if (iter->pos < PTE_LIST_EXT - 1) {
+ ++iter->pos;
+ if (iter->desc->sptes[iter->pos])
+ return iter->desc->sptes[iter->pos];
+ }
+
+ iter->desc = iter->desc->more;
+
+ if (iter->desc) {
+ iter->pos = 0;
+ /* desc->sptes[0] cannot be NULL */
+ return iter->desc->sptes[iter->pos];
+ }
+ }
+
+ return NULL;
+}
+
+#define __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \
+ for (_sptep_ = rmap_get_first(_rmap_head_, _iter_); \
+ _sptep_; _sptep_ = rmap_get_next(_iter_))
+
+#define for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \
+ __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \
+ if (!WARN_ON_ONCE(!is_shadow_present_pte(*(_sptep_)))) \
+
+#define for_each_rmap_spte_lockless(_rmap_head_, _iter_, _sptep_, _spte_) \
+ __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \
+ if (is_shadow_present_pte(_spte_ = mmu_spte_get_lockless(sptep)))
+
+static void drop_spte(struct kvm *kvm, u64 *sptep)
+{
+ u64 old_spte = mmu_spte_clear_track_bits(kvm, sptep);
+
+ if (is_shadow_present_pte(old_spte))
+ rmap_remove(kvm, sptep);
+}
+
+static void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush)
+{
+ struct kvm_mmu_page *sp;
+
+ sp = sptep_to_sp(sptep);
+ WARN_ON_ONCE(sp->role.level == PG_LEVEL_4K);
+
+ drop_spte(kvm, sptep);
+
+ if (flush)
+ kvm_flush_remote_tlbs_sptep(kvm, sptep);
+}
+
+/*
+ * Write-protect the specified @sptep; @pt_protect indicates whether the
+ * spte write-protection is caused by protecting a shadow page table.
+ *
+ * Note: write protection differs between dirty logging and spte
+ * protection:
+ * - for dirty logging, the spte can be made writable at any time if
+ *   its dirty bitmap is properly set.
+ * - for spte protection, the spte can be made writable only after
+ *   unsync-ing the shadow page.
+ *
+ * Returns true if the TLB needs to be flushed.
+ */
+static bool spte_write_protect(u64 *sptep, bool pt_protect)
+{
+ u64 spte = *sptep;
+
+ if (!is_writable_pte(spte) &&
+ !(pt_protect && is_mmu_writable_spte(spte)))
+ return false;
+
+ if (pt_protect)
+ spte &= ~shadow_mmu_writable_mask;
+ spte = spte & ~PT_WRITABLE_MASK;
+
+ return mmu_spte_update(sptep, spte);
+}
+
+static bool rmap_write_protect(struct kvm_rmap_head *rmap_head,
+ bool pt_protect)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+ bool flush = false;
+
+ for_each_rmap_spte(rmap_head, &iter, sptep)
+ flush |= spte_write_protect(sptep, pt_protect);
+
+ return flush;
+}
+
+static bool spte_clear_dirty(u64 *sptep)
+{
+ u64 spte = *sptep;
+
+ KVM_MMU_WARN_ON(!spte_ad_enabled(spte));
+ spte &= ~shadow_dirty_mask;
+ return mmu_spte_update(sptep, spte);
+}
+
+/*
+ * Gets the GFN ready for another round of dirty logging by clearing the
+ * - D bit on ad-enabled SPTEs, and
+ * - W bit on ad-disabled SPTEs.
+ * Returns true iff any D or W bits were cleared.
+ */
+static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ const struct kvm_memory_slot *slot)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+ bool flush = false;
+
+ for_each_rmap_spte(rmap_head, &iter, sptep) {
+ if (spte_ad_need_write_protect(*sptep))
+ flush |= test_and_clear_bit(PT_WRITABLE_SHIFT,
+ (unsigned long *)sptep);
+ else
+ flush |= spte_clear_dirty(sptep);
+ }
+
+ return flush;
+}
+
+static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn_offset, unsigned long mask)
+{
+ struct kvm_rmap_head *rmap_head;
+
+ if (tdp_mmu_enabled)
+ kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
+ slot->base_gfn + gfn_offset, mask, true);
+
+ if (!kvm_memslots_have_rmaps(kvm))
+ return;
+
+ while (mask) {
+ rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
+ PG_LEVEL_4K, slot);
+ rmap_write_protect(rmap_head, false);
+
+ /* clear the first set bit */
+ mask &= mask - 1;
+ }
+}
+
+static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn_offset, unsigned long mask)
+{
+ struct kvm_rmap_head *rmap_head;
+
+ if (tdp_mmu_enabled)
+ kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
+ slot->base_gfn + gfn_offset, mask, false);
+
+ if (!kvm_memslots_have_rmaps(kvm))
+ return;
+
+ while (mask) {
+ rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
+ PG_LEVEL_4K, slot);
+ __rmap_clear_dirty(kvm, rmap_head, slot);
+
+ /* clear the first set bit */
+ mask &= mask - 1;
+ }
+}
+
+void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn_offset, unsigned long mask)
+{
+ /*
+ * If the slot was assumed to be "initially all dirty", write-protect
+ * huge pages to ensure they are split to 4KiB on the first write (KVM
+ * dirty logs at 4KiB granularity). If eager page splitting is enabled,
+ * immediately try to split huge pages, e.g. so that vCPUs don't get
+ * saddled with the cost of splitting.
+ *
+ * The gfn_offset is guaranteed to be aligned to 64, but the base_gfn
+ * of the memslot has no such restriction, so the range can cross two large
+ * pages.
+ */
+ if (kvm_dirty_log_manual_protect_and_init_set(kvm)) {
+ gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask);
+ gfn_t end = slot->base_gfn + gfn_offset + __fls(mask);
+
+ if (READ_ONCE(eager_page_split))
+ kvm_mmu_try_split_huge_pages(kvm, slot, start, end + 1, PG_LEVEL_4K);
+
+ kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M);
+
+ /* Cross two large pages? */
+ if (ALIGN(start << PAGE_SHIFT, PMD_SIZE) !=
+ ALIGN(end << PAGE_SHIFT, PMD_SIZE))
+ kvm_mmu_slot_gfn_write_protect(kvm, slot, end,
+ PG_LEVEL_2M);
+ }
+
+ /*
+ * (Re)Enable dirty logging for all 4KiB SPTEs that map the GFNs in
+ * mask. If PML is enabled and the GFN doesn't need to be write-
+ * protected for other reasons, e.g. shadow paging, clear the Dirty bit.
+ * Otherwise clear the Writable bit.
+ *
+ * Note that kvm_mmu_clear_dirty_pt_masked() is called whenever PML is
+ * enabled, but it chooses between clearing the Dirty bit and the Writable
+ * bit based on the context.
+ */
+ if (kvm->arch.cpu_dirty_log_size)
+ kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask);
+ else
+ kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
+}
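
The "cross two large pages" check above boils down to: does the lowest dirty gfn land in the same 2MiB-aligned region as the highest dirty gfn? A hedged user-space sketch of that test; the PAGE_SHIFT/PMD_SIZE values are the usual x86 ones, the function name is invented, and where the kernel compares ALIGN()-ed (rounded-up) addresses this sketch rounds down, which detects the same region change:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define EX_PAGE_SHIFT		12
#define EX_PMD_SIZE		(1ULL << 21)	/* 2MiB */
#define EX_PMD_ALIGN_DOWN(x)	((x) & ~(EX_PMD_SIZE - 1))

static bool crosses_two_large_pages(uint64_t base_gfn, uint64_t gfn_offset,
				    uint64_t mask)
{
	uint64_t start = base_gfn + gfn_offset + __builtin_ctzll(mask);	/* ~ __ffs */
	uint64_t end   = base_gfn + gfn_offset + (63 - __builtin_clzll(mask));	/* ~ __fls */

	return EX_PMD_ALIGN_DOWN(start << EX_PAGE_SHIFT) !=
	       EX_PMD_ALIGN_DOWN(end << EX_PAGE_SHIFT);
}

int main(void)
{
	/* base_gfn 300 is not 512-aligned, so bits 0 and 63 straddle a 2MiB line. */
	printf("%d\n", crosses_two_large_pages(300, 192, 1ULL | (1ULL << 63)));
	return 0;
}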
+
+int kvm_cpu_dirty_log_size(struct kvm *kvm)
+{
+ return kvm->arch.cpu_dirty_log_size;
+}
+
+bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
+ struct kvm_memory_slot *slot, u64 gfn,
+ int min_level)
+{
+ struct kvm_rmap_head *rmap_head;
+ int i;
+ bool write_protected = false;
+
+ if (kvm_memslots_have_rmaps(kvm)) {
+ for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
+ rmap_head = gfn_to_rmap(gfn, i, slot);
+ write_protected |= rmap_write_protect(rmap_head, true);
+ }
+ }
+
+ if (tdp_mmu_enabled)
+ write_protected |=
+ kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level);
+
+ return write_protected;
+}
+
+static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn)
+{
+ struct kvm_memory_slot *slot;
+
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K);
+}
+
+static bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ const struct kvm_memory_slot *slot)
+{
+ return kvm_zap_all_rmap_sptes(kvm, rmap_head);
+}
+
+struct slot_rmap_walk_iterator {
+ /* input fields. */
+ const struct kvm_memory_slot *slot;
+ gfn_t start_gfn;
+ gfn_t end_gfn;
+ int start_level;
+ int end_level;
+
+ /* output fields. */
+ gfn_t gfn;
+ struct kvm_rmap_head *rmap;
+ int level;
+
+ /* private field. */
+ struct kvm_rmap_head *end_rmap;
+};
+
+static void rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator,
+ int level)
+{
+ iterator->level = level;
+ iterator->gfn = iterator->start_gfn;
+ iterator->rmap = gfn_to_rmap(iterator->gfn, level, iterator->slot);
+ iterator->end_rmap = gfn_to_rmap(iterator->end_gfn, level, iterator->slot);
+}
+
+static void slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
+ const struct kvm_memory_slot *slot,
+ int start_level, int end_level,
+ gfn_t start_gfn, gfn_t end_gfn)
+{
+ iterator->slot = slot;
+ iterator->start_level = start_level;
+ iterator->end_level = end_level;
+ iterator->start_gfn = start_gfn;
+ iterator->end_gfn = end_gfn;
+
+ rmap_walk_init_level(iterator, iterator->start_level);
+}
+
+static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
+{
+ return !!iterator->rmap;
+}
+
+static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
+{
+ while (++iterator->rmap <= iterator->end_rmap) {
+ iterator->gfn += KVM_PAGES_PER_HPAGE(iterator->level);
+
+ if (atomic_long_read(&iterator->rmap->val))
+ return;
+ }
+
+ if (++iterator->level > iterator->end_level) {
+ iterator->rmap = NULL;
+ return;
+ }
+
+ rmap_walk_init_level(iterator, iterator->level);
+}
+
+#define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_, \
+ _start_gfn, _end_gfn, _iter_) \
+ for (slot_rmap_walk_init(_iter_, _slot_, _start_level_, \
+ _end_level_, _start_gfn, _end_gfn); \
+ slot_rmap_walk_okay(_iter_); \
+ slot_rmap_walk_next(_iter_))
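
The iterator above follows the usual kernel pattern of an init/okay/next trio wrapped in a for() macro, so a caller can walk every populated rmap bucket of every level in a single loop. A minimal, self-contained illustration of that pattern (the range_iter names are invented for the example and are unrelated to KVM's types):

#include <stdio.h>

struct range_iter {
	int start, end;	/* input fields  */
	int cur;	/* output field  */
};

static void range_iter_init(struct range_iter *it, int start, int end)
{
	it->start = start;
	it->end = end;
	it->cur = start;
}

static int range_iter_okay(const struct range_iter *it)
{
	return it->cur < it->end;
}

static void range_iter_next(struct range_iter *it)
{
	it->cur++;
}

#define for_each_in_range(_it_, _start_, _end_)		\
	for (range_iter_init(_it_, _start_, _end_);	\
	     range_iter_okay(_it_);			\
	     range_iter_next(_it_))

int main(void)
{
	struct range_iter it;

	for_each_in_range(&it, 3, 6)
		printf("%d\n", it.cur);	/* prints 3, 4, 5 */
	return 0;
}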
+
+/* The return value indicates if tlb flush on all vcpus is needed. */
+typedef bool (*slot_rmaps_handler) (struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
+ const struct kvm_memory_slot *slot);
+
+static __always_inline bool __walk_slot_rmaps(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ slot_rmaps_handler fn,
+ int start_level, int end_level,
+ gfn_t start_gfn, gfn_t end_gfn,
+ bool can_yield, bool flush_on_yield,
+ bool flush)
+{
+ struct slot_rmap_walk_iterator iterator;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ for_each_slot_rmap_range(slot, start_level, end_level, start_gfn,
+ end_gfn, &iterator) {
+ if (iterator.rmap)
+ flush |= fn(kvm, iterator.rmap, slot);
+
+ if (!can_yield)
+ continue;
+
+ if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
+ if (flush && flush_on_yield) {
+ kvm_flush_remote_tlbs_range(kvm, start_gfn,
+ iterator.gfn - start_gfn + 1);
+ flush = false;
+ }
+ cond_resched_rwlock_write(&kvm->mmu_lock);
+ }
+ }
+
+ return flush;
+}
+
+static __always_inline bool walk_slot_rmaps(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ slot_rmaps_handler fn,
+ int start_level, int end_level,
+ bool flush_on_yield)
+{
+ return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level,
+ slot->base_gfn, slot->base_gfn + slot->npages - 1,
+ true, flush_on_yield, false);
+}
+
+static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ slot_rmaps_handler fn,
+ bool flush_on_yield)
+{
+ return walk_slot_rmaps(kvm, slot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield);
+}
+
+static bool __kvm_rmap_zap_gfn_range(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ gfn_t start, gfn_t end, bool can_yield,
+ bool flush)
+{
+ return __walk_slot_rmaps(kvm, slot, kvm_zap_rmap,
+ PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
+ start, end - 1, can_yield, true, flush);
+}
+
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+ bool flush = false;
+
+ /*
+ * To prevent races with vCPUs faulting in a gfn using stale data,
+ * zapping a gfn range must be protected by mmu_invalidate_in_progress
+ * (and mmu_invalidate_seq). The only exception is memslot deletion;
+ * in that case, SRCU synchronization ensures that SPTEs are zapped
+ * after all vCPUs have unlocked SRCU, guaranteeing that vCPUs see the
+ * invalid slot.
+ */
+ lockdep_assert_once(kvm->mmu_invalidate_in_progress ||
+ lockdep_is_held(&kvm->slots_lock));
+
+ if (kvm_memslots_have_rmaps(kvm))
+ flush = __kvm_rmap_zap_gfn_range(kvm, range->slot,
+ range->start, range->end,
+ range->may_block, flush);
+
+ if (tdp_mmu_enabled)
+ flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
+
+ if (kvm_x86_ops.set_apic_access_page_addr &&
+ range->slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT)
+ kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
+
+ return flush;
+}
+
+#define RMAP_RECYCLE_THRESHOLD 1000
+
+static void __rmap_add(struct kvm *kvm,
+ struct kvm_mmu_memory_cache *cache,
+ const struct kvm_memory_slot *slot,
+ u64 *spte, gfn_t gfn, unsigned int access)
+{
+ struct kvm_mmu_page *sp;
+ struct kvm_rmap_head *rmap_head;
+ int rmap_count;
+
+ sp = sptep_to_sp(spte);
+ kvm_mmu_page_set_translation(sp, spte_index(spte), gfn, access);
+ kvm_update_page_stats(kvm, sp->role.level, 1);
+
+ rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
+ rmap_count = pte_list_add(kvm, cache, spte, rmap_head);
+
+ if (rmap_count > kvm->stat.max_mmu_rmap_size)
+ kvm->stat.max_mmu_rmap_size = rmap_count;
+ if (rmap_count > RMAP_RECYCLE_THRESHOLD) {
+ kvm_zap_all_rmap_sptes(kvm, rmap_head);
+ kvm_flush_remote_tlbs_gfn(kvm, gfn, sp->role.level);
+ }
+}
+
+static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot,
+ u64 *spte, gfn_t gfn, unsigned int access)
+{
+ struct kvm_mmu_memory_cache *cache = &vcpu->arch.mmu_pte_list_desc_cache;
+
+ __rmap_add(vcpu->kvm, cache, slot, spte, gfn, access);
+}
+
+static bool kvm_rmap_age_gfn_range(struct kvm *kvm,
+ struct kvm_gfn_range *range,
+ bool test_only)
+{
+ struct kvm_rmap_head *rmap_head;
+ struct rmap_iterator iter;
+ unsigned long rmap_val;
+ bool young = false;
+ u64 *sptep;
+ gfn_t gfn;
+ int level;
+ u64 spte;
+
+ for (level = PG_LEVEL_4K; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
+ for (gfn = range->start; gfn < range->end;
+ gfn += KVM_PAGES_PER_HPAGE(level)) {
+ rmap_head = gfn_to_rmap(gfn, level, range->slot);
+ rmap_val = kvm_rmap_lock_readonly(rmap_head);
+
+ for_each_rmap_spte_lockless(rmap_head, &iter, sptep, spte) {
+ if (!is_accessed_spte(spte))
+ continue;
+
+ if (test_only) {
+ kvm_rmap_unlock_readonly(rmap_head, rmap_val);
+ return true;
+ }
+
+ if (spte_ad_enabled(spte))
+ clear_bit((ffs(shadow_accessed_mask) - 1),
+ (unsigned long *)sptep);
+ else
+ /*
+ * If the following cmpxchg fails, the
+ * spte is being concurrently modified
+ * and should most likely stay young.
+ */
+ cmpxchg64(sptep, spte,
+ mark_spte_for_access_track(spte));
+ young = true;
+ }
+
+ kvm_rmap_unlock_readonly(rmap_head, rmap_val);
+ }
+ }
+ return young;
+}
+
+static bool kvm_may_have_shadow_mmu_sptes(struct kvm *kvm)
+{
+ return !tdp_mmu_enabled || READ_ONCE(kvm->arch.indirect_shadow_pages);
+}
+
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+ bool young = false;
+
+ if (tdp_mmu_enabled)
+ young = kvm_tdp_mmu_age_gfn_range(kvm, range);
+
+ if (kvm_may_have_shadow_mmu_sptes(kvm))
+ young |= kvm_rmap_age_gfn_range(kvm, range, false);
+
+ return young;
+}
+
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+ bool young = false;
+
+ if (tdp_mmu_enabled)
+ young = kvm_tdp_mmu_test_age_gfn(kvm, range);
+
+ if (young)
+ return young;
+
+ if (kvm_may_have_shadow_mmu_sptes(kvm))
+ young |= kvm_rmap_age_gfn_range(kvm, range, true);
+
+ return young;
+}
+
+static void kvm_mmu_check_sptes_at_free(struct kvm_mmu_page *sp)
+{
+#ifdef CONFIG_KVM_PROVE_MMU
+ int i;
+
+ for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
+ if (KVM_MMU_WARN_ON(is_shadow_present_pte(sp->spt[i])))
+ pr_err_ratelimited("SPTE %llx (@ %p) for gfn %llx shadow-present at free",
+ sp->spt[i], &sp->spt[i],
+ kvm_mmu_page_get_gfn(sp, i));
+ }
+#endif
+}
+
+static void kvm_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ kvm->arch.n_used_mmu_pages++;
+ kvm_account_pgtable_pages((void *)sp->spt, +1);
+}
+
+static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ kvm->arch.n_used_mmu_pages--;
+ kvm_account_pgtable_pages((void *)sp->spt, -1);
+}
+
+static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp)
+{
+ kvm_mmu_check_sptes_at_free(sp);
+
+ hlist_del(&sp->hash_link);
+ list_del(&sp->link);
+ free_page((unsigned long)sp->spt);
+ free_page((unsigned long)sp->shadowed_translation);
+ kmem_cache_free(mmu_page_header_cache, sp);
+}
+
+static unsigned kvm_page_table_hashfn(gfn_t gfn)
+{
+ return hash_64(gfn, KVM_MMU_HASH_SHIFT);
+}
+
+static void mmu_page_add_parent_pte(struct kvm *kvm,
+ struct kvm_mmu_memory_cache *cache,
+ struct kvm_mmu_page *sp, u64 *parent_pte)
+{
+ if (!parent_pte)
+ return;
+
+ pte_list_add(kvm, cache, parent_pte, &sp->parent_ptes);
+}
+
+static void mmu_page_remove_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
+ u64 *parent_pte)
+{
+ pte_list_remove(kvm, parent_pte, &sp->parent_ptes);
+}
+
+static void drop_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
+ u64 *parent_pte)
+{
+ mmu_page_remove_parent_pte(kvm, sp, parent_pte);
+ mmu_spte_clear_no_track(parent_pte);
+}
+
+static void mark_unsync(u64 *spte);
+static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+
+ for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
+ mark_unsync(sptep);
+ }
+}
+
+static void mark_unsync(u64 *spte)
+{
+ struct kvm_mmu_page *sp;
+
+ sp = sptep_to_sp(spte);
+ if (__test_and_set_bit(spte_index(spte), sp->unsync_child_bitmap))
+ return;
+ if (sp->unsync_children++)
+ return;
+ kvm_mmu_mark_parents_unsync(sp);
+}
+
+#define KVM_PAGE_ARRAY_NR 16
+
+struct kvm_mmu_pages {
+ struct mmu_page_and_offset {
+ struct kvm_mmu_page *sp;
+ unsigned int idx;
+ } page[KVM_PAGE_ARRAY_NR];
+ unsigned int nr;
+};
+
+static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
+ int idx)
+{
+ int i;
+
+ if (sp->unsync)
+ for (i=0; i < pvec->nr; i++)
+ if (pvec->page[i].sp == sp)
+ return 0;
+
+ pvec->page[pvec->nr].sp = sp;
+ pvec->page[pvec->nr].idx = idx;
+ pvec->nr++;
+ return (pvec->nr == KVM_PAGE_ARRAY_NR);
+}
+
+static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
+{
+ --sp->unsync_children;
+ WARN_ON_ONCE((int)sp->unsync_children < 0);
+ __clear_bit(idx, sp->unsync_child_bitmap);
+}
+
+static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
+ struct kvm_mmu_pages *pvec)
+{
+ int i, ret, nr_unsync_leaf = 0;
+
+ for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
+ struct kvm_mmu_page *child;
+ u64 ent = sp->spt[i];
+
+ if (!is_shadow_present_pte(ent) || is_large_pte(ent)) {
+ clear_unsync_child_bit(sp, i);
+ continue;
+ }
+
+ child = spte_to_child_sp(ent);
+
+ if (child->unsync_children) {
+ if (mmu_pages_add(pvec, child, i))
+ return -ENOSPC;
+
+ ret = __mmu_unsync_walk(child, pvec);
+ if (!ret) {
+ clear_unsync_child_bit(sp, i);
+ continue;
+ } else if (ret > 0) {
+ nr_unsync_leaf += ret;
+ } else
+ return ret;
+ } else if (child->unsync) {
+ nr_unsync_leaf++;
+ if (mmu_pages_add(pvec, child, i))
+ return -ENOSPC;
+ } else
+ clear_unsync_child_bit(sp, i);
+ }
+
+ return nr_unsync_leaf;
+}
+
+#define INVALID_INDEX (-1)
+
+static int mmu_unsync_walk(struct kvm_mmu_page *sp,
+ struct kvm_mmu_pages *pvec)
+{
+ pvec->nr = 0;
+ if (!sp->unsync_children)
+ return 0;
+
+ mmu_pages_add(pvec, sp, INVALID_INDEX);
+ return __mmu_unsync_walk(sp, pvec);
+}
+
+static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ WARN_ON_ONCE(!sp->unsync);
+ trace_kvm_mmu_sync_page(sp);
+ sp->unsync = 0;
+ --kvm->stat.mmu_unsync;
+}
+
+static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list);
+static void kvm_mmu_commit_zap_page(struct kvm *kvm,
+ struct list_head *invalid_list);
+
+static bool sp_has_gptes(struct kvm_mmu_page *sp)
+{
+ if (sp->role.direct)
+ return false;
+
+ if (sp->role.passthrough)
+ return false;
+
+ return true;
+}
+
+static __ro_after_init HLIST_HEAD(empty_page_hash);
+
+static struct hlist_head *kvm_get_mmu_page_hash(struct kvm *kvm, gfn_t gfn)
+{
+ /*
+ * Ensure the load of the hash table pointer itself is ordered before
+ * loads to walk the table. The pointer is set at runtime outside of
+ * mmu_lock when the TDP MMU is enabled, i.e. when the hash table of
+ * shadow pages becomes necessary only when KVM needs to shadow L1's
+ * TDP for an L2 guest. Pairs with the smp_store_release() in
+ * kvm_mmu_alloc_page_hash().
+ */
+ struct hlist_head *page_hash = smp_load_acquire(&kvm->arch.mmu_page_hash);
+
+ lockdep_assert_held(&kvm->mmu_lock);
+
+ if (!page_hash)
+ return &empty_page_hash;
+
+ return &page_hash[kvm_page_table_hashfn(gfn)];
+}
+
+#define for_each_valid_sp(_kvm, _sp, _list) \
+ hlist_for_each_entry(_sp, _list, hash_link) \
+ if (is_obsolete_sp((_kvm), (_sp))) { \
+ } else
+
+#define for_each_gfn_valid_sp_with_gptes(_kvm, _sp, _gfn) \
+ for_each_valid_sp(_kvm, _sp, kvm_get_mmu_page_hash(_kvm, _gfn)) \
+ if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else
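
kvm_page_table_hashfn() picks one of 2^KVM_MMU_HASH_SHIFT buckets with hash_64(), and the macros above then filter the bucket's collision chain by obsolescence, gfn and role. A toy sketch of the same bucket-then-filter idea; the multiplier is the kernel's GOLDEN_RATIO_64 constant, the shift width is an arbitrary example value, and the function name is invented:

#include <stdint.h>
#include <stdio.h>

#define EX_HASH_SHIFT	12	/* 4096 buckets, for illustration only */

/* Multiplicative (Fibonacci) hashing, the scheme used by the kernel's hash_64(). */
static unsigned int gfn_hash(uint64_t gfn)
{
	return (unsigned int)((gfn * 0x61C8864680B583EBULL) >> (64 - EX_HASH_SHIFT));
}

int main(void)
{
	/* Nearby gfns scatter across buckets; matching entries are then  */
	/* found by walking the short chain inside one bucket.            */
	for (uint64_t gfn = 0; gfn < 4; gfn++)
		printf("gfn %llu -> bucket %u\n",
		       (unsigned long long)gfn, gfn_hash(gfn));
	return 0;
}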
+
+static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ union kvm_mmu_page_role root_role = vcpu->arch.mmu->root_role;
+
+ /*
+ * Ignore various flags when verifying that it's safe to sync a shadow
+ * page using the current MMU context.
+ *
+ * - level: not part of the overall MMU role and will never match as the MMU's
+ * level tracks the root level
+ * - access: updated based on the new guest PTE
+ * - quadrant: not part of the overall MMU role (similar to level)
+ */
+ const union kvm_mmu_page_role sync_role_ign = {
+ .level = 0xf,
+ .access = 0x7,
+ .quadrant = 0x3,
+ .passthrough = 0x1,
+ };
+
+ /*
+ * Direct pages can never be unsync, and KVM should never attempt to
+ * sync a shadow page for a different MMU context, e.g. if the role
+ * differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the
+ * reserved bits checks will be wrong, etc...
+ */
+ if (WARN_ON_ONCE(sp->role.direct || !vcpu->arch.mmu->sync_spte ||
+ (sp->role.word ^ root_role.word) & ~sync_role_ign.word))
+ return false;
+
+ return true;
+}
+
+static int kvm_sync_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i)
+{
+ /* sp->spt[i] has initial value of shadow page table allocation */
+ if (sp->spt[i] == SHADOW_NONPRESENT_VALUE)
+ return 0;
+
+ return vcpu->arch.mmu->sync_spte(vcpu, sp, i);
+}
+
+static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ int flush = 0;
+ int i;
+
+ if (!kvm_sync_page_check(vcpu, sp))
+ return -1;
+
+ for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
+ int ret = kvm_sync_spte(vcpu, sp, i);
+
+ if (ret < -1)
+ return -1;
+ flush |= ret;
+ }
+
+ /*
+ * Note, any flush is purely for KVM's correctness, e.g. when dropping
+ * an existing SPTE or clearing W/A/D bits to ensure an mmu_notifier
+ * unmap or dirty logging event doesn't fail to flush. The guest is
+ * responsible for flushing the TLB to ensure any changes in protection
+ * bits are recognized, i.e. until the guest flushes or page faults on
+ * a relevant address, KVM is architecturally allowed to let vCPUs use
+ * cached translations with the old protection bits.
+ */
+ return flush;
+}
+
+static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list)
+{
+ int ret = __kvm_sync_page(vcpu, sp);
+
+ if (ret < 0)
+ kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
+ return ret;
+}
+
+static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
+ struct list_head *invalid_list,
+ bool remote_flush)
+{
+ if (!remote_flush && list_empty(invalid_list))
+ return false;
+
+ if (!list_empty(invalid_list))
+ kvm_mmu_commit_zap_page(kvm, invalid_list);
+ else
+ kvm_flush_remote_tlbs(kvm);
+ return true;
+}
+
+static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ if (sp->role.invalid)
+ return true;
+
+ /* TDP MMU pages do not use the MMU generation. */
+ return !is_tdp_mmu_page(sp) &&
+ unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
+}
+
+struct mmu_page_path {
+ struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
+ unsigned int idx[PT64_ROOT_MAX_LEVEL];
+};
+
+#define for_each_sp(pvec, sp, parents, i) \
+ for (i = mmu_pages_first(&pvec, &parents); \
+ i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \
+ i = mmu_pages_next(&pvec, &parents, i))
+
+static int mmu_pages_next(struct kvm_mmu_pages *pvec,
+ struct mmu_page_path *parents,
+ int i)
+{
+ int n;
+
+ for (n = i+1; n < pvec->nr; n++) {
+ struct kvm_mmu_page *sp = pvec->page[n].sp;
+ unsigned idx = pvec->page[n].idx;
+ int level = sp->role.level;
+
+ parents->idx[level-1] = idx;
+ if (level == PG_LEVEL_4K)
+ break;
+
+ parents->parent[level-2] = sp;
+ }
+
+ return n;
+}
+
+static int mmu_pages_first(struct kvm_mmu_pages *pvec,
+ struct mmu_page_path *parents)
+{
+ struct kvm_mmu_page *sp;
+ int level;
+
+ if (pvec->nr == 0)
+ return 0;
+
+ WARN_ON_ONCE(pvec->page[0].idx != INVALID_INDEX);
+
+ sp = pvec->page[0].sp;
+ level = sp->role.level;
+ WARN_ON_ONCE(level == PG_LEVEL_4K);
+
+ parents->parent[level-2] = sp;
+
+ /* Also set up a sentinel. Further entries in pvec are all
+ * children of sp, so this element is never overwritten.
+ */
+ parents->parent[level-1] = NULL;
+ return mmu_pages_next(pvec, parents, 0);
+}
+
+static void mmu_pages_clear_parents(struct mmu_page_path *parents)
+{
+ struct kvm_mmu_page *sp;
+ unsigned int level = 0;
+
+ do {
+ unsigned int idx = parents->idx[level];
+ sp = parents->parent[level];
+ if (!sp)
+ return;
+
+ WARN_ON_ONCE(idx == INVALID_INDEX);
+ clear_unsync_child_bit(sp, idx);
+ level++;
+ } while (!sp->unsync_children);
+}
+
+static int mmu_sync_children(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *parent, bool can_yield)
+{
+ int i;
+ struct kvm_mmu_page *sp;
+ struct mmu_page_path parents;
+ struct kvm_mmu_pages pages;
+ LIST_HEAD(invalid_list);
+ bool flush = false;
+
+ while (mmu_unsync_walk(parent, &pages)) {
+ bool protected = false;
+
+ for_each_sp(pages, sp, parents, i)
+ protected |= kvm_vcpu_write_protect_gfn(vcpu, sp->gfn);
+
+ if (protected) {
+ kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, true);
+ flush = false;
+ }
+
+ for_each_sp(pages, sp, parents, i) {
+ kvm_unlink_unsync_page(vcpu->kvm, sp);
+ flush |= kvm_sync_page(vcpu, sp, &invalid_list) > 0;
+ mmu_pages_clear_parents(&parents);
+ }
+ if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) {
+ kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
+ if (!can_yield) {
+ kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+ return -EINTR;
+ }
+
+ cond_resched_rwlock_write(&vcpu->kvm->mmu_lock);
+ flush = false;
+ }
+ }
+
+ kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
+ return 0;
+}
+
+static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
+{
+ atomic_set(&sp->write_flooding_count, 0);
+}
+
+static void clear_sp_write_flooding_count(u64 *spte)
+{
+ __clear_sp_write_flooding_count(sptep_to_sp(spte));
+}
+
+/*
+ * The vCPU is required when finding indirect shadow pages; the shadow
+ * page may already exist and syncing it needs the vCPU pointer in
+ * order to read guest page tables. Direct shadow pages are never
+ * unsync, thus @vcpu can be NULL if @role.direct is true.
+ */
+static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm *kvm,
+ struct kvm_vcpu *vcpu,
+ gfn_t gfn,
+ struct hlist_head *sp_list,
+ union kvm_mmu_page_role role)
+{
+ struct kvm_mmu_page *sp;
+ int ret;
+ int collisions = 0;
+ LIST_HEAD(invalid_list);
+
+ for_each_valid_sp(kvm, sp, sp_list) {
+ if (sp->gfn != gfn) {
+ collisions++;
+ continue;
+ }
+
+ if (sp->role.word != role.word) {
+ /*
+ * If the guest is creating an upper-level page, zap
+ * unsync pages for the same gfn. While it's possible
+ * the guest is using recursive page tables, in all
+ * likelihood the guest has stopped using the unsync
+ * page and is installing a completely unrelated page.
+ * Unsync pages must not be left as is, because the new
+ * upper-level page will be write-protected.
+ */
+ if (role.level > PG_LEVEL_4K && sp->unsync)
+ kvm_mmu_prepare_zap_page(kvm, sp,
+ &invalid_list);
+ continue;
+ }
+
+ /* unsync and write-flooding only apply to indirect SPs. */
+ if (sp->role.direct)
+ goto out;
+
+ if (sp->unsync) {
+ if (KVM_BUG_ON(!vcpu, kvm))
+ break;
+
+ /*
+ * The page is good, but is stale. kvm_sync_page does
+ * get the latest guest state, but (unlike mmu_unsync_children)
+ * it doesn't write-protect the page or mark it synchronized!
+ * This way the validity of the mapping is ensured, but the
+ * overhead of write protection is not incurred until the
+ * guest invalidates the TLB mapping. This allows multiple
+ * SPs for a single gfn to be unsync.
+ *
+ * If the sync fails, the page is zapped. If so, break
+ * in order to rebuild it.
+ */
+ ret = kvm_sync_page(vcpu, sp, &invalid_list);
+ if (ret < 0)
+ break;
+
+ WARN_ON_ONCE(!list_empty(&invalid_list));
+ if (ret > 0)
+ kvm_flush_remote_tlbs(kvm);
+ }
+
+ __clear_sp_write_flooding_count(sp);
+
+ goto out;
+ }
+
+ sp = NULL;
+ ++kvm->stat.mmu_cache_miss;
+
+out:
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+
+ if (collisions > kvm->stat.max_mmu_page_hash_collisions)
+ kvm->stat.max_mmu_page_hash_collisions = collisions;
+ return sp;
+}
+
+/* Caches used when allocating a new shadow page. */
+struct shadow_page_caches {
+ struct kvm_mmu_memory_cache *page_header_cache;
+ struct kvm_mmu_memory_cache *shadow_page_cache;
+ struct kvm_mmu_memory_cache *shadowed_info_cache;
+};
+
+static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
+ struct shadow_page_caches *caches,
+ gfn_t gfn,
+ struct hlist_head *sp_list,
+ union kvm_mmu_page_role role)
+{
+ struct kvm_mmu_page *sp;
+
+ sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache);
+ sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache);
+ if (!role.direct && role.level <= KVM_MAX_HUGEPAGE_LEVEL)
+ sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache);
+
+ set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+
+ INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
+
+ /*
+ * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
+ * depends on valid pages being added to the head of the list. See
+ * comments in kvm_zap_obsolete_pages().
+ */
+ sp->mmu_valid_gen = kvm->arch.mmu_valid_gen;
+ list_add(&sp->link, &kvm->arch.active_mmu_pages);
+ kvm_account_mmu_page(kvm, sp);
+
+ sp->gfn = gfn;
+ sp->role = role;
+ hlist_add_head(&sp->hash_link, sp_list);
+ if (sp_has_gptes(sp))
+ account_shadowed(kvm, sp);
+
+ return sp;
+}
+
+/* Note, @vcpu may be NULL if @role.direct is true; see kvm_mmu_find_shadow_page. */
+static struct kvm_mmu_page *__kvm_mmu_get_shadow_page(struct kvm *kvm,
+ struct kvm_vcpu *vcpu,
+ struct shadow_page_caches *caches,
+ gfn_t gfn,
+ union kvm_mmu_page_role role)
+{
+ struct hlist_head *sp_list;
+ struct kvm_mmu_page *sp;
+ bool created = false;
+
+ /*
+ * No need for memory barriers, unlike in kvm_get_mmu_page_hash(), as
+ * mmu_page_hash must be set prior to creating the first shadow root,
+ * i.e. reaching this point is fully serialized by slots_arch_lock.
+ */
+ BUG_ON(!kvm->arch.mmu_page_hash);
+ sp_list = &kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
+
+ sp = kvm_mmu_find_shadow_page(kvm, vcpu, gfn, sp_list, role);
+ if (!sp) {
+ created = true;
+ sp = kvm_mmu_alloc_shadow_page(kvm, caches, gfn, sp_list, role);
+ }
+
+ trace_kvm_mmu_get_page(sp, created);
+ return sp;
+}
+
+static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu,
+ gfn_t gfn,
+ union kvm_mmu_page_role role)
+{
+ struct shadow_page_caches caches = {
+ .page_header_cache = &vcpu->arch.mmu_page_header_cache,
+ .shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache,
+ .shadowed_info_cache = &vcpu->arch.mmu_shadowed_info_cache,
+ };
+
+ return __kvm_mmu_get_shadow_page(vcpu->kvm, vcpu, &caches, gfn, role);
+}
+
+static union kvm_mmu_page_role kvm_mmu_child_role(u64 *sptep, bool direct,
+ unsigned int access)
+{
+ struct kvm_mmu_page *parent_sp = sptep_to_sp(sptep);
+ union kvm_mmu_page_role role;
+
+ role = parent_sp->role;
+ role.level--;
+ role.access = access;
+ role.direct = direct;
+ role.passthrough = 0;
+
+ /*
+ * If the guest has 4-byte PTEs then that means it's using 32-bit,
+ * 2-level, non-PAE paging. KVM shadows such guests with PAE paging
+ * (i.e. 8-byte PTEs). The difference in PTE size means that KVM must
+ * shadow each guest page table with multiple shadow page tables, which
+ * requires extra bookkeeping in the role.
+ *
+ * Specifically, to shadow the guest's page directory (which covers a
+ * 4GiB address space), KVM uses 4 PAE page directories, each mapping
+ * 1GiB of the address space. @role.quadrant encodes which quarter of
+ * the address space each maps.
+ *
+ * To shadow the guest's page tables (which each map a 4MiB region), KVM
+ * uses 2 PAE page tables, each mapping a 2MiB region. For these,
+ * @role.quadrant encodes which half of the region they map.
+ *
+ * Concretely, a 4-byte PDE consumes bits 31:22, while an 8-byte PDE
+	 * consumes bits 29:21. To consume bits 31:30, KVM uses 4 shadow
+ * PDPTEs; those 4 PAE page directories are pre-allocated and their
+ * quadrant is assigned in mmu_alloc_root(). A 4-byte PTE consumes
+ * bits 21:12, while an 8-byte PTE consumes bits 20:12. To consume
+ * bit 21 in the PTE (the child here), KVM propagates that bit to the
+ * quadrant, i.e. sets quadrant to '0' or '1'. The parent 8-byte PDE
+ * covers bit 21 (see above), thus the quadrant is calculated from the
+ * _least_ significant bit of the PDE index.
+ */
+ if (role.has_4_byte_gpte) {
+ WARN_ON_ONCE(role.level != PG_LEVEL_4K);
+ role.quadrant = spte_index(sptep) & 1;
+ }
+
+ return role;
+}
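
A small worked example of the quadrant math described above, assuming the layout in the comment: the guest's 4-byte PTE index covers GVA bits 21:12 (10 bits) while the shadow 8-byte index covers bits 20:12 (9 bits), so GVA bit 21 has to be recovered from the low bit of the shadow PDE index:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t gva = 0x00A3F000;	/* arbitrary example address (bit 21 set) */

	uint32_t guest_pte_idx  = (gva >> 12) & 0x3FF;	/* bits 21:12, 10 bits */
	uint32_t shadow_pte_idx = (gva >> 12) & 0x1FF;	/* bits 20:12,  9 bits */
	uint32_t shadow_pde_idx = (gva >> 21) & 0x1FF;	/* bits 29:21          */
	uint32_t quadrant = shadow_pde_idx & 1;		/* recovers GVA bit 21 */

	printf("guest idx %u, shadow idx %u, quadrant %u, gva bit 21 = %u\n",
	       guest_pte_idx, shadow_pte_idx, quadrant, (gva >> 21) & 1);
	return 0;
}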
+
+static struct kvm_mmu_page *kvm_mmu_get_child_sp(struct kvm_vcpu *vcpu,
+ u64 *sptep, gfn_t gfn,
+ bool direct, unsigned int access)
+{
+ union kvm_mmu_page_role role;
+
+ if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
+ return ERR_PTR(-EEXIST);
+
+ role = kvm_mmu_child_role(sptep, direct, access);
+ return kvm_mmu_get_shadow_page(vcpu, gfn, role);
+}
+
+static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
+ struct kvm_vcpu *vcpu, hpa_t root,
+ u64 addr)
+{
+ iterator->addr = addr;
+ iterator->shadow_addr = root;
+ iterator->level = vcpu->arch.mmu->root_role.level;
+
+ if (iterator->level >= PT64_ROOT_4LEVEL &&
+ vcpu->arch.mmu->cpu_role.base.level < PT64_ROOT_4LEVEL &&
+ !vcpu->arch.mmu->root_role.direct)
+ iterator->level = PT32E_ROOT_LEVEL;
+
+ if (iterator->level == PT32E_ROOT_LEVEL) {
+ /*
+ * prev_root is currently only used for 64-bit hosts. So only
+ * the active root_hpa is valid here.
+ */
+ BUG_ON(root != vcpu->arch.mmu->root.hpa);
+
+ iterator->shadow_addr
+ = vcpu->arch.mmu->pae_root[(addr >> 30) & 3];
+ iterator->shadow_addr &= SPTE_BASE_ADDR_MASK;
+ --iterator->level;
+ if (!iterator->shadow_addr)
+ iterator->level = 0;
+ }
+}
+
+static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
+ struct kvm_vcpu *vcpu, u64 addr)
+{
+ shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root.hpa,
+ addr);
+}
+
+static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
+{
+ if (iterator->level < PG_LEVEL_4K)
+ return false;
+
+ iterator->index = SPTE_INDEX(iterator->addr, iterator->level);
+ iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
+ return true;
+}
+
+static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
+ u64 spte)
+{
+ if (!is_shadow_present_pte(spte) || is_last_spte(spte, iterator->level)) {
+ iterator->level = 0;
+ return;
+ }
+
+ iterator->shadow_addr = spte & SPTE_BASE_ADDR_MASK;
+ --iterator->level;
+}
+
+static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
+{
+ __shadow_walk_next(iterator, *iterator->sptep);
+}
+
+static void __link_shadow_page(struct kvm *kvm,
+ struct kvm_mmu_memory_cache *cache, u64 *sptep,
+ struct kvm_mmu_page *sp, bool flush)
+{
+ u64 spte;
+
+ BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
+
+ /*
+ * If an SPTE is present already, it must be a leaf and therefore
+ * a large one. Drop it, and flush the TLB if needed, before
+ * installing sp.
+ */
+ if (is_shadow_present_pte(*sptep))
+ drop_large_spte(kvm, sptep, flush);
+
+ spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp));
+
+ mmu_spte_set(sptep, spte);
+
+ mmu_page_add_parent_pte(kvm, cache, sp, sptep);
+
+ /*
+ * The non-direct sub-pagetable must be updated before linking. For
+ * L1 sp, the pagetable is updated via kvm_sync_page() in
+ * kvm_mmu_find_shadow_page() without write-protecting the gfn,
+ * so sp->unsync can be true or false. For higher level non-direct
+ * sp, the pagetable is updated/synced via mmu_sync_children() in
+ * FNAME(fetch)(), so sp->unsync_children can only be false.
+ * WARN_ON_ONCE() if anything happens unexpectedly.
+ */
+ if (WARN_ON_ONCE(sp->unsync_children) || sp->unsync)
+ mark_unsync(sptep);
+}
+
+static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
+ struct kvm_mmu_page *sp)
+{
+ __link_shadow_page(vcpu->kvm, &vcpu->arch.mmu_pte_list_desc_cache, sptep, sp, true);
+}
+
+static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
+ unsigned direct_access)
+{
+ if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
+ struct kvm_mmu_page *child;
+
+ /*
+		 * For a direct sp, if the guest pte's dirty bit changed from
+		 * clean to dirty, it would corrupt the sp's access, i.e. allow
+		 * writes through a read-only sp. Update the spte here to get
+		 * a new sp with the correct access.
+ */
+ child = spte_to_child_sp(*sptep);
+ if (child->role.access == direct_access)
+ return;
+
+ drop_parent_pte(vcpu->kvm, child, sptep);
+ kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep);
+ }
+}
+
+/* Returns the number of zapped non-leaf child shadow pages. */
+static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
+ u64 *spte, struct list_head *invalid_list)
+{
+ u64 pte;
+ struct kvm_mmu_page *child;
+
+ pte = *spte;
+ if (is_shadow_present_pte(pte)) {
+ if (is_last_spte(pte, sp->role.level)) {
+ drop_spte(kvm, spte);
+ } else {
+ child = spte_to_child_sp(pte);
+ drop_parent_pte(kvm, child, spte);
+
+ /*
+ * Recursively zap nested TDP SPs, parentless SPs are
+ * unlikely to be used again in the near future. This
+ * avoids retaining a large number of stale nested SPs.
+ */
+ if (tdp_enabled && invalid_list &&
+ child->role.guest_mode &&
+ !atomic_long_read(&child->parent_ptes.val))
+ return kvm_mmu_prepare_zap_page(kvm, child,
+ invalid_list);
+ }
+ } else if (is_mmio_spte(kvm, pte)) {
+ mmu_spte_clear_no_track(spte);
+ }
+ return 0;
+}
+
+static int kvm_mmu_page_unlink_children(struct kvm *kvm,
+ struct kvm_mmu_page *sp,
+ struct list_head *invalid_list)
+{
+ int zapped = 0;
+ unsigned i;
+
+ for (i = 0; i < SPTE_ENT_PER_PAGE; ++i)
+ zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list);
+
+ return zapped;
+}
+
+static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+
+ while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
+ drop_parent_pte(kvm, sp, sptep);
+}
+
+static int mmu_zap_unsync_children(struct kvm *kvm,
+ struct kvm_mmu_page *parent,
+ struct list_head *invalid_list)
+{
+ int i, zapped = 0;
+ struct mmu_page_path parents;
+ struct kvm_mmu_pages pages;
+
+ if (parent->role.level == PG_LEVEL_4K)
+ return 0;
+
+ while (mmu_unsync_walk(parent, &pages)) {
+ struct kvm_mmu_page *sp;
+
+ for_each_sp(pages, sp, parents, i) {
+ kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+ mmu_pages_clear_parents(&parents);
+ zapped++;
+ }
+ }
+
+ return zapped;
+}
+
+static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
+ struct kvm_mmu_page *sp,
+ struct list_head *invalid_list,
+ int *nr_zapped)
+{
+ bool list_unstable, zapped_root = false;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+ trace_kvm_mmu_prepare_zap_page(sp);
+ ++kvm->stat.mmu_shadow_zapped;
+ *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
+ *nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list);
+ kvm_mmu_unlink_parents(kvm, sp);
+
+ /* Zapping children means active_mmu_pages has become unstable. */
+ list_unstable = *nr_zapped;
+
+ if (!sp->role.invalid && sp_has_gptes(sp))
+ unaccount_shadowed(kvm, sp);
+
+ if (sp->unsync)
+ kvm_unlink_unsync_page(kvm, sp);
+ if (!sp->root_count) {
+ /* Count self */
+ (*nr_zapped)++;
+
+ /*
+ * Already invalid pages (previously active roots) are not on
+ * the active page list. See list_del() in the "else" case of
+ * !sp->root_count.
+ */
+ if (sp->role.invalid)
+ list_add(&sp->link, invalid_list);
+ else
+ list_move(&sp->link, invalid_list);
+ kvm_unaccount_mmu_page(kvm, sp);
+ } else {
+ /*
+ * Remove the active root from the active page list, the root
+ * will be explicitly freed when the root_count hits zero.
+ */
+ list_del(&sp->link);
+
+ /*
+ * Obsolete pages cannot be used on any vCPUs, see the comment
+ * in kvm_mmu_zap_all_fast(). Note, is_obsolete_sp() also
+ * treats invalid shadow pages as being obsolete.
+ */
+ zapped_root = !is_obsolete_sp(kvm, sp);
+ }
+
+ if (sp->nx_huge_page_disallowed)
+ unaccount_nx_huge_page(kvm, sp);
+
+ sp->role.invalid = 1;
+
+ /*
+ * Make the request to free obsolete roots after marking the root
+ * invalid, otherwise other vCPUs may not see it as invalid.
+ */
+ if (zapped_root)
+ kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS);
+ return list_unstable;
+}
+
+static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list)
+{
+ int nr_zapped;
+
+ __kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped);
+ return nr_zapped;
+}
+
+static void kvm_mmu_commit_zap_page(struct kvm *kvm,
+ struct list_head *invalid_list)
+{
+ struct kvm_mmu_page *sp, *nsp;
+
+ if (list_empty(invalid_list))
+ return;
+
+ /*
+ * We need to make sure everyone sees our modifications to
+	 * the page tables and sees changes to vcpu->mode here. The barrier
+ * in the kvm_flush_remote_tlbs() achieves this. This pairs
+ * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end.
+ *
+ * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit
+ * guest mode and/or lockless shadow page table walks.
+ */
+ kvm_flush_remote_tlbs(kvm);
+
+ list_for_each_entry_safe(sp, nsp, invalid_list, link) {
+ WARN_ON_ONCE(!sp->role.invalid || sp->root_count);
+ kvm_mmu_free_shadow_page(sp);
+ }
+}
+
+static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm,
+ unsigned long nr_to_zap)
+{
+ unsigned long total_zapped = 0;
+ struct kvm_mmu_page *sp, *tmp;
+ LIST_HEAD(invalid_list);
+ bool unstable;
+ int nr_zapped;
+
+ if (list_empty(&kvm->arch.active_mmu_pages))
+ return 0;
+
+restart:
+ list_for_each_entry_safe_reverse(sp, tmp, &kvm->arch.active_mmu_pages, link) {
+ /*
+ * Don't zap active root pages, the page itself can't be freed
+ * and zapping it will just force vCPUs to realloc and reload.
+ */
+ if (sp->root_count)
+ continue;
+
+ unstable = __kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list,
+ &nr_zapped);
+ total_zapped += nr_zapped;
+ if (total_zapped >= nr_to_zap)
+ break;
+
+ if (unstable)
+ goto restart;
+ }
+
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+
+ kvm->stat.mmu_recycled += total_zapped;
+ return total_zapped;
+}
+
+static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm)
+{
+ if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages)
+ return kvm->arch.n_max_mmu_pages -
+ kvm->arch.n_used_mmu_pages;
+
+ return 0;
+}
+
+static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
+{
+ unsigned long avail = kvm_mmu_available_pages(vcpu->kvm);
+
+ if (likely(avail >= KVM_MIN_FREE_MMU_PAGES))
+ return 0;
+
+ kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail);
+
+ /*
+ * Note, this check is intentionally soft, it only guarantees that one
+ * page is available, while the caller may end up allocating as many as
+ * four pages, e.g. for PAE roots or for 5-level paging. Temporarily
+ * exceeding the (arbitrary by default) limit will not harm the host,
+ * being too aggressive may unnecessarily kill the guest, and getting an
+ * exact count is far more trouble than it's worth, especially in the
+ * page fault paths.
+ */
+ if (!kvm_mmu_available_pages(vcpu->kvm))
+ return -ENOSPC;
+ return 0;
+}
+
+/*
+ * Change the number of MMU pages allocated to the VM.
+ * Note: if goal_nr_mmu_pages is too small, the VM may deadlock.
+ */
+void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
+{
+ write_lock(&kvm->mmu_lock);
+
+ if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
+ kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages -
+ goal_nr_mmu_pages);
+
+ goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
+ }
+
+ kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
+
+ write_unlock(&kvm->mmu_lock);
+}
+
+bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ bool always_retry)
+{
+ struct kvm *kvm = vcpu->kvm;
+ LIST_HEAD(invalid_list);
+ struct kvm_mmu_page *sp;
+ gpa_t gpa = cr2_or_gpa;
+ bool r = false;
+
+ /*
+ * Bail early if there aren't any write-protected shadow pages to avoid
+	 * unnecessarily taking mmu_lock, e.g. if the gfn is write-tracked
+ * by a third party. Reading indirect_shadow_pages without holding
+ * mmu_lock is safe, as this is purely an optimization, i.e. a false
+ * positive is benign, and a false negative will simply result in KVM
+ * skipping the unprotect+retry path, which is also an optimization.
+ */
+ if (!READ_ONCE(kvm->arch.indirect_shadow_pages))
+ goto out;
+
+ if (!vcpu->arch.mmu->root_role.direct) {
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
+ if (gpa == INVALID_GPA)
+ goto out;
+ }
+
+ write_lock(&kvm->mmu_lock);
+ for_each_gfn_valid_sp_with_gptes(kvm, sp, gpa_to_gfn(gpa))
+ kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+
+ /*
+ * Snapshot the result before zapping, as zapping will remove all list
+ * entries, i.e. checking the list later would yield a false negative.
+ */
+ r = !list_empty(&invalid_list);
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ write_unlock(&kvm->mmu_lock);
+
+out:
+ if (r || always_retry) {
+ vcpu->arch.last_retry_eip = kvm_rip_read(vcpu);
+ vcpu->arch.last_retry_addr = cr2_or_gpa;
+ }
+ return r;
+}
+
+static void kvm_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ trace_kvm_mmu_unsync_page(sp);
+ ++kvm->stat.mmu_unsync;
+ sp->unsync = 1;
+
+ kvm_mmu_mark_parents_unsync(sp);
+}
+
+/*
+ * Attempt to unsync any shadow pages that can be reached by the specified gfn,
+ * Attempt to unsync any shadow pages that can be reached by the specified gfn,
+ * for which KVM is creating a writable mapping. Returns 0 if all pages
+ * be write-protected.
+ */
+int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
+ gfn_t gfn, bool synchronizing, bool prefetch)
+{
+ struct kvm_mmu_page *sp;
+ bool locked = false;
+
+ /*
+ * Force write-protection if the page is being tracked. Note, the page
+ * track machinery is used to write-protect upper-level shadow pages,
+ * i.e. this guards the role.level == 4K assertion below!
+ */
+ if (kvm_gfn_is_write_tracked(kvm, slot, gfn))
+ return -EPERM;
+
+ /*
+ * The page is not write-tracked, mark existing shadow pages unsync
+ * unless KVM is synchronizing an unsync SP. In that case, KVM must
+ * complete emulation of the guest TLB flush before allowing shadow
+ * pages to become unsync (writable by the guest).
+ */
+ for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) {
+ if (synchronizing)
+ return -EPERM;
+
+ if (sp->unsync)
+ continue;
+
+ if (prefetch)
+ return -EEXIST;
+
+ /*
+ * TDP MMU page faults require an additional spinlock as they
+ * run with mmu_lock held for read, not write, and the unsync
+		 * logic is not thread safe. Take the spinlock regardless of
+ * the MMU type to avoid extra conditionals/parameters, there's
+ * no meaningful penalty if mmu_lock is held for write.
+ */
+ if (!locked) {
+ locked = true;
+ spin_lock(&kvm->arch.mmu_unsync_pages_lock);
+
+ /*
+ * Recheck after taking the spinlock, a different vCPU
+ * may have since marked the page unsync. A false
+ * negative on the unprotected check above is not
+ * possible as clearing sp->unsync _must_ hold mmu_lock
+ * for write, i.e. unsync cannot transition from 1->0
+ * while this CPU holds mmu_lock for read (or write).
+ */
+ if (READ_ONCE(sp->unsync))
+ continue;
+ }
+
+ WARN_ON_ONCE(sp->role.level != PG_LEVEL_4K);
+ kvm_unsync_page(kvm, sp);
+ }
+ if (locked)
+ spin_unlock(&kvm->arch.mmu_unsync_pages_lock);
+
+ /*
+ * We need to ensure that the marking of unsync pages is visible
+ * before the SPTE is updated to allow writes because
+ * kvm_mmu_sync_roots() checks the unsync flags without holding
+ * the MMU lock and so can race with this. If the SPTE was updated
+ * before the page had been marked as unsync-ed, something like the
+ * following could happen:
+ *
+ * CPU 1 CPU 2
+ * ---------------------------------------------------------------------
+ * 1.2 Host updates SPTE
+ * to be writable
+ * 2.1 Guest writes a GPTE for GVA X.
+ * (GPTE being in the guest page table shadowed
+ * by the SP from CPU 1.)
+ * This reads SPTE during the page table walk.
+ * Since SPTE.W is read as 1, there is no
+ * fault.
+ *
+ * 2.2 Guest issues TLB flush.
+ * That causes a VM Exit.
+ *
+ * 2.3 Walking of unsync pages sees sp->unsync is
+ * false and skips the page.
+ *
+ * 2.4 Guest accesses GVA X.
+ * Since the mapping in the SP was not updated,
+ * so the old mapping for GVA X incorrectly
+ * gets used.
+ * 1.1 Host marks SP
+ * as unsync
+ * (sp->unsync = true)
+ *
+ * The write barrier below ensures that 1.1 happens before 1.2 and thus
+ * the situation in 2.4 does not arise. It pairs with the read barrier
+ * in is_unsync_root(), placed between 2.1's load of SPTE.W and 2.3.
+ */
+ smp_wmb();
+
+ return 0;
+}
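
The ordering argument in the comment above is the classic store/store vs. load/load pairing. A minimal user-space rendering of it, with C11 fences standing in for smp_wmb()/smp_rmb() (the variable and function names are illustrative only, not KVM's):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static _Atomic bool sp_unsync;		/* stands in for sp->unsync          */
static _Atomic bool spte_writable;	/* stands in for the writable SPTE.W */

/* Writer side (mmu_try_to_unsync_pages): mark unsync before granting write. */
static void writer(void)
{
	atomic_store_explicit(&sp_unsync, true, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);	/* ~ smp_wmb() */
	atomic_store_explicit(&spte_writable, true, memory_order_relaxed);
}

/* Reader side (root sync path): if the writable SPTE is visible, so is unsync. */
static bool unsync_visible_if_writable(void)
{
	bool writable = atomic_load_explicit(&spte_writable, memory_order_relaxed);

	atomic_thread_fence(memory_order_acquire);	/* ~ smp_rmb() */
	return !writable || atomic_load_explicit(&sp_unsync, memory_order_relaxed);
}

int main(void)
{
	writer();
	printf("%d\n", unsync_visible_if_writable());	/* always 1 */
	return 0;
}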
+
+static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
+ u64 *sptep, unsigned int pte_access, gfn_t gfn,
+ kvm_pfn_t pfn, struct kvm_page_fault *fault)
+{
+ struct kvm_mmu_page *sp = sptep_to_sp(sptep);
+ int level = sp->role.level;
+ int was_rmapped = 0;
+ int ret = RET_PF_FIXED;
+ bool flush = false;
+ bool wrprot;
+ u64 spte;
+
+ /* Prefetching always gets a writable pfn. */
+ bool host_writable = !fault || fault->map_writable;
+ bool prefetch = !fault || fault->prefetch;
+ bool write_fault = fault && fault->write;
+
+ if (unlikely(is_noslot_pfn(pfn))) {
+ vcpu->stat.pf_mmio_spte_created++;
+ mark_mmio_spte(vcpu, sptep, gfn, pte_access);
+ return RET_PF_EMULATE;
+ }
+
+ if (is_shadow_present_pte(*sptep)) {
+ if (prefetch && is_last_spte(*sptep, level) &&
+ pfn == spte_to_pfn(*sptep))
+ return RET_PF_SPURIOUS;
+
+ /*
+ * If we overwrite a PTE page pointer with a 2MB PMD, unlink
+ * the parent of the now unreachable PTE.
+ */
+ if (level > PG_LEVEL_4K && !is_large_pte(*sptep)) {
+ struct kvm_mmu_page *child;
+ u64 pte = *sptep;
+
+ child = spte_to_child_sp(pte);
+ drop_parent_pte(vcpu->kvm, child, sptep);
+ flush = true;
+ } else if (WARN_ON_ONCE(pfn != spte_to_pfn(*sptep))) {
+ drop_spte(vcpu->kvm, sptep);
+ flush = true;
+ } else
+ was_rmapped = 1;
+ }
+
+ wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch,
+ false, host_writable, &spte);
+
+ if (*sptep == spte) {
+ ret = RET_PF_SPURIOUS;
+ } else {
+ flush |= mmu_spte_update(sptep, spte);
+ trace_kvm_mmu_set_spte(level, gfn, sptep);
+ }
+
+ if (wrprot && write_fault)
+ ret = RET_PF_WRITE_PROTECTED;
+
+ if (flush)
+ kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level);
+
+ if (!was_rmapped) {
+ WARN_ON_ONCE(ret == RET_PF_SPURIOUS);
+ rmap_add(vcpu, slot, sptep, gfn, pte_access);
+ } else {
+ /* Already rmapped but the pte_access bits may have changed. */
+ kvm_mmu_page_set_access(sp, spte_index(sptep), pte_access);
+ }
+
+ return ret;
+}
+
+static bool kvm_mmu_prefetch_sptes(struct kvm_vcpu *vcpu, gfn_t gfn, u64 *sptep,
+ int nr_pages, unsigned int access)
+{
+ struct page *pages[PTE_PREFETCH_NUM];
+ struct kvm_memory_slot *slot;
+ int i;
+
+ if (WARN_ON_ONCE(nr_pages > PTE_PREFETCH_NUM))
+ return false;
+
+ slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
+ if (!slot)
+ return false;
+
+ nr_pages = kvm_prefetch_pages(slot, gfn, pages, nr_pages);
+ if (nr_pages <= 0)
+ return false;
+
+ for (i = 0; i < nr_pages; i++, gfn++, sptep++) {
+ mmu_set_spte(vcpu, slot, sptep, access, gfn,
+ page_to_pfn(pages[i]), NULL);
+
+ /*
+ * KVM always prefetches writable pages from the primary MMU,
+ * and KVM can make its SPTE writable in the fast page handler,
+ * without notifying the primary MMU. Mark pages/folios dirty
+ * now to ensure file data is written back if it ends up being
+ * written by the guest. Because KVM's prefetching GUPs
+ * writable PTEs, the probability of unnecessary writeback is
+ * extremely low.
+ */
+ kvm_release_page_dirty(pages[i]);
+ }
+
+ return true;
+}
+
+static bool direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp,
+ u64 *start, u64 *end)
+{
+ gfn_t gfn = kvm_mmu_page_get_gfn(sp, spte_index(start));
+ unsigned int access = sp->role.access;
+
+ return kvm_mmu_prefetch_sptes(vcpu, gfn, start, end - start, access);
+}
+
+static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *sptep)
+{
+ u64 *spte, *start = NULL;
+ int i;
+
+ WARN_ON_ONCE(!sp->role.direct);
+
+ i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1);
+ spte = sp->spt + i;
+
+ for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
+ if (is_shadow_present_pte(*spte) || spte == sptep) {
+ if (!start)
+ continue;
+ if (!direct_pte_prefetch_many(vcpu, sp, start, spte))
+ return;
+
+ start = NULL;
+ } else if (!start)
+ start = spte;
+ }
+ if (start)
+ direct_pte_prefetch_many(vcpu, sp, start, spte);
+}
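
For reference, PTE_PREFETCH_NUM is 8 in KVM at the time of writing, so the index math above simply rounds the faulting SPTE index down to an 8-entry window and then prefetches contiguous runs of empty entries inside it. The window arithmetic in isolation (names and values invented for the example):

#include <stdio.h>

#define EX_PREFETCH_NUM	8	/* must be a power of two */

int main(void)
{
	int fault_idx = 203;
	int start = fault_idx & ~(EX_PREFETCH_NUM - 1);	/* 200 */

	printf("prefetch window: [%d, %d)\n", start, start + EX_PREFETCH_NUM);
	return 0;
}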
+
+static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
+{
+ struct kvm_mmu_page *sp;
+
+ sp = sptep_to_sp(sptep);
+
+ /*
+ * Without accessed bits, there's no way to distinguish between
+ * actually accessed translations and prefetched, so disable pte
+ * prefetch if accessed bits aren't available.
+ */
+ if (sp_ad_disabled(sp))
+ return;
+
+ if (sp->role.level > PG_LEVEL_4K)
+ return;
+
+ /*
+ * If addresses are being invalidated, skip prefetching to avoid
+ * accidentally prefetching those addresses.
+ */
+ if (unlikely(vcpu->kvm->mmu_invalidate_in_progress))
+ return;
+
+ __direct_pte_prefetch(vcpu, sp, sptep);
+}
+
+/*
+ * Lookup the mapping level for @gfn in the current mm.
+ *
+ * WARNING! Use of host_pfn_mapping_level() requires the caller and the end
+ * consumer to be tied into KVM's handlers for MMU notifier events!
+ *
+ * There are several ways to safely use this helper:
+ *
+ * - Check mmu_invalidate_retry_gfn() after grabbing the mapping level, before
+ * consuming it. In this case, mmu_lock doesn't need to be held during the
+ * lookup, but it does need to be held while checking the MMU notifier.
+ *
+ * - Hold mmu_lock AND ensure there is no in-progress MMU notifier invalidation
+ * event for the hva. This can be done by explicit checking the MMU notifier
+ * or by ensuring that KVM already has a valid mapping that covers the hva.
+ *
+ * - Do not use the result to install new mappings, e.g. use the host mapping
+ * level only to decide whether or not to zap an entry. In this case, it's
+ * not required to hold mmu_lock (though it's highly likely the caller will
+ * want to hold mmu_lock anyways, e.g. to modify SPTEs).
+ *
+ * Note! The lookup can still race with modifications to host page tables, but
+ * the above "rules" ensure KVM will not _consume_ the result of the walk if a
+ * race with the primary MMU occurs.
+ */
+static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
+ const struct kvm_memory_slot *slot)
+{
+ int level = PG_LEVEL_4K;
+ unsigned long hva;
+ unsigned long flags;
+ pgd_t pgd;
+ p4d_t p4d;
+ pud_t pud;
+ pmd_t pmd;
+
+ /*
+ * Note, using the already-retrieved memslot and __gfn_to_hva_memslot()
+ * is not solely for performance, it's also necessary to avoid the
+ * "writable" check in __gfn_to_hva_many(), which will always fail on
+ * read-only memslots due to gfn_to_hva() assuming writes. Earlier
+ * page fault steps have already verified the guest isn't writing a
+ * read-only memslot.
+ */
+ hva = __gfn_to_hva_memslot(slot, gfn);
+
+ /*
+ * Disable IRQs to prevent concurrent tear down of host page tables,
+ * e.g. if the primary MMU promotes a P*D to a huge page and then frees
+ * the original page table.
+ */
+ local_irq_save(flags);
+
+ /*
+ * Read each entry once. As above, a non-leaf entry can be promoted to
+ * a huge page _during_ this walk. Re-reading the entry could send the
+	 * walk into the weeds, e.g. p*d_leaf() returns false (sees the old
+ * value) and then p*d_offset() walks into the target huge page instead
+ * of the old page table (sees the new value).
+ */
+ pgd = READ_ONCE(*pgd_offset(kvm->mm, hva));
+ if (pgd_none(pgd))
+ goto out;
+
+ p4d = READ_ONCE(*p4d_offset(&pgd, hva));
+ if (p4d_none(p4d) || !p4d_present(p4d))
+ goto out;
+
+ pud = READ_ONCE(*pud_offset(&p4d, hva));
+ if (pud_none(pud) || !pud_present(pud))
+ goto out;
+
+ if (pud_leaf(pud)) {
+ level = PG_LEVEL_1G;
+ goto out;
+ }
+
+ pmd = READ_ONCE(*pmd_offset(&pud, hva));
+ if (pmd_none(pmd) || !pmd_present(pmd))
+ goto out;
+
+ if (pmd_leaf(pmd))
+ level = PG_LEVEL_2M;
+
+out:
+ local_irq_restore(flags);
+ return level;
+}
+
+static u8 kvm_max_level_for_order(int order)
+{
+ BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G);
+
+ KVM_MMU_WARN_ON(order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G) &&
+ order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M) &&
+ order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K));
+
+ if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G))
+ return PG_LEVEL_1G;
+
+ if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M))
+ return PG_LEVEL_2M;
+
+ return PG_LEVEL_4K;
+}
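
On x86 with 4KiB base pages, KVM_HPAGE_GFN_SHIFT() works out to 9 for 2MiB and 18 for 1GiB, so the function above is a simple threshold check on the allocation order. A standalone sketch with those numbers spelled out (the constants are the conventional x86 values, stated here as an assumption; the enum and function names are invented):

#include <stdio.h>

enum { EX_LEVEL_4K = 1, EX_LEVEL_2M = 2, EX_LEVEL_1G = 3 };

static int max_level_for_order(int order)
{
	if (order >= 18)	/* at least 2^18 pages: a 1GiB-sized allocation */
		return EX_LEVEL_1G;
	if (order >= 9)		/* at least 2^9 pages: a 2MiB-sized allocation  */
		return EX_LEVEL_2M;
	return EX_LEVEL_4K;
}

int main(void)
{
	printf("%d %d %d\n", max_level_for_order(0),
	       max_level_for_order(9), max_level_for_order(18));	/* 1 2 3 */
	return 0;
}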
+
+static u8 kvm_gmem_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault,
+ const struct kvm_memory_slot *slot, gfn_t gfn,
+ bool is_private)
+{
+ u8 max_level, coco_level;
+ kvm_pfn_t pfn;
+
+ /* For faults, use the gmem information that was resolved earlier. */
+ if (fault) {
+ pfn = fault->pfn;
+ max_level = fault->max_level;
+ } else {
+ /* TODO: Call into guest_memfd once hugepages are supported. */
+ WARN_ONCE(1, "Get pfn+order from guest_memfd");
+ pfn = KVM_PFN_ERR_FAULT;
+ max_level = PG_LEVEL_4K;
+ }
+
+ if (max_level == PG_LEVEL_4K)
+ return max_level;
+
+ /*
+ * CoCo may influence the max mapping level, e.g. due to RMP or S-EPT
+ * restrictions. A return of '0' means "no additional restrictions", to
+ * allow for using an optional "ret0" static call.
+ */
+ coco_level = kvm_x86_call(gmem_max_mapping_level)(kvm, pfn, is_private);
+ if (coco_level)
+ max_level = min(max_level, coco_level);
+
+ return max_level;
+}
+
+int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault,
+ const struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ struct kvm_lpage_info *linfo;
+ int host_level, max_level;
+ bool is_private;
+
+ lockdep_assert_held(&kvm->mmu_lock);
+
+ if (fault) {
+ max_level = fault->max_level;
+ is_private = fault->is_private;
+ } else {
+ max_level = PG_LEVEL_NUM;
+ is_private = kvm_mem_is_private(kvm, gfn);
+ }
+
+ max_level = min(max_level, max_huge_page_level);
+ for ( ; max_level > PG_LEVEL_4K; max_level--) {
+ linfo = lpage_info_slot(gfn, slot, max_level);
+ if (!linfo->disallow_lpage)
+ break;
+ }
+
+ if (max_level == PG_LEVEL_4K)
+ return PG_LEVEL_4K;
+
+ if (is_private || kvm_memslot_is_gmem_only(slot))
+ host_level = kvm_gmem_max_mapping_level(kvm, fault, slot, gfn,
+ is_private);
+ else
+ host_level = host_pfn_mapping_level(kvm, gfn, slot);
+ return min(host_level, max_level);
+}
+
+void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+{
+ struct kvm_memory_slot *slot = fault->slot;
+ kvm_pfn_t mask;
+
+ fault->huge_page_disallowed = fault->exec && fault->nx_huge_page_workaround_enabled;
+
+ if (unlikely(fault->max_level == PG_LEVEL_4K))
+ return;
+
+ if (is_error_noslot_pfn(fault->pfn))
+ return;
+
+ if (kvm_slot_dirty_track_enabled(slot))
+ return;
+
+ /*
+ * Enforce the iTLB multihit workaround after capturing the requested
+ * level, which will be used to do precise, accurate accounting.
+ */
+ fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, fault,
+ fault->slot, fault->gfn);
+ if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
+ return;
+
+ /*
+ * mmu_invalidate_retry() was successful and mmu_lock is held, so
+ * the pmd can't be split from under us.
+ */
+ fault->goal_level = fault->req_level;
+ mask = KVM_PAGES_PER_HPAGE(fault->goal_level) - 1;
+ VM_BUG_ON((fault->gfn & mask) != (fault->pfn & mask));
+ fault->pfn &= ~mask;
+}
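
The final adjustment above rounds the pfn down to the chosen huge-page boundary after asserting that the gfn and pfn agree in their low bits (they must, or they could not be mapped by a single huge page). A worked example for a 2MiB goal level, i.e. 512 base pages per huge page:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t pages_per_hpage = 512;		/* 2MiB / 4KiB */
	uint64_t mask = pages_per_hpage - 1;

	uint64_t gfn = 0x1234;			/* low 9 bits: 0x34 */
	uint64_t pfn = 0x8a234;			/* same low 9 bits  */

	assert((gfn & mask) == (pfn & mask));	/* mirrors the VM_BUG_ON() */
	pfn &= ~mask;				/* head pfn of the 2MiB mapping */

	printf("huge-page head pfn: 0x%llx\n", (unsigned long long)pfn);	/* 0x8a200 */
	return 0;
}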
+
+void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level)
+{
+ if (cur_level > PG_LEVEL_4K &&
+ cur_level == fault->goal_level &&
+ is_shadow_present_pte(spte) &&
+ !is_large_pte(spte) &&
+ spte_to_child_sp(spte)->nx_huge_page_disallowed) {
+ /*
+ * A small SPTE exists for this pfn, but FNAME(fetch),
+ * direct_map(), or kvm_tdp_mmu_map() would like to create a
+ * large PTE instead: just force them to go down another level,
+		 * patching the next 9 bits of the address back into the pfn
+		 * for them.
+ */
+ u64 page_mask = KVM_PAGES_PER_HPAGE(cur_level) -
+ KVM_PAGES_PER_HPAGE(cur_level - 1);
+ fault->pfn |= fault->gfn & page_mask;
+ fault->goal_level--;
+ }
+}
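
Concretely, stepping down from a 2MiB goal to 4KiB gives page_mask = 512 - 1 = 511, so the OR restores exactly the low 9 gfn bits that the earlier huge-page rounding cleared; stepping from 1GiB to 2MiB would patch bits 17:9 instead. A short demonstration continuing the worked example above:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t gfn = 0x1234;
	uint64_t pfn = 0x8a200;		/* rounded down to a 2MiB boundary */

	/* Drop from a 2MiB goal (512 pages) to 4KiB (1 page). */
	uint64_t page_mask = 512 - 1;

	pfn |= gfn & page_mask;		/* put the next 9 address bits back */
	printf("4KiB pfn: 0x%llx\n", (unsigned long long)pfn);	/* 0x8a234 */
	return 0;
}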
+
+static int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+{
+ struct kvm_shadow_walk_iterator it;
+ struct kvm_mmu_page *sp;
+ int ret;
+ gfn_t base_gfn = fault->gfn;
+
+ kvm_mmu_hugepage_adjust(vcpu, fault);
+
+ trace_kvm_mmu_spte_requested(fault);
+ for_each_shadow_entry(vcpu, fault->addr, it) {
+ /*
+ * We cannot overwrite existing page tables with an NX
+ * large page, as the leaf could be executable.
+ */
+ if (fault->nx_huge_page_workaround_enabled)
+ disallowed_hugepage_adjust(fault, *it.sptep, it.level);
+
+ base_gfn = gfn_round_for_level(fault->gfn, it.level);
+ if (it.level == fault->goal_level)
+ break;
+
+ sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true, ACC_ALL);
+ if (sp == ERR_PTR(-EEXIST))
+ continue;
+
+ link_shadow_page(vcpu, it.sptep, sp);
+ if (fault->huge_page_disallowed)
+ account_nx_huge_page(vcpu->kvm, sp,
+ fault->req_level >= it.level);
+ }
+
+ if (WARN_ON_ONCE(it.level != fault->goal_level))
+ return -EFAULT;
+
+ ret = mmu_set_spte(vcpu, fault->slot, it.sptep, ACC_ALL,
+ base_gfn, fault->pfn, fault);
+ if (ret == RET_PF_SPURIOUS)
+ return ret;
+
+ direct_pte_prefetch(vcpu, it.sptep);
+ return ret;
+}
+
+static void kvm_send_hwpoison_signal(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ unsigned long hva = gfn_to_hva_memslot(slot, gfn);
+
+ send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva, PAGE_SHIFT, current);
+}
+
+static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+{
+ if (is_sigpending_pfn(fault->pfn)) {
+ kvm_handle_signal_exit(vcpu);
+ return -EINTR;
+ }
+
+ /*
+ * Do not cache the mmio info caused by writing the readonly gfn
+ * into the spte otherwise read access on readonly gfn also can
+ * caused mmio page fault and treat it as mmio access.
+ */
+ if (fault->pfn == KVM_PFN_ERR_RO_FAULT)
+ return RET_PF_EMULATE;
+
+ if (fault->pfn == KVM_PFN_ERR_HWPOISON) {
+ kvm_send_hwpoison_signal(fault->slot, fault->gfn);
+ return RET_PF_RETRY;
+ }
+
+ return -EFAULT;
+}
+
+static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault,
+ unsigned int access)
+{
+ gva_t gva = fault->is_tdp ? 0 : fault->addr;
+
+ if (fault->is_private) {
+ kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
+ return -EFAULT;
+ }
+
+ vcpu_cache_mmio_info(vcpu, gva, fault->gfn,
+ access & shadow_mmio_access_mask);
+
+ fault->slot = NULL;
+ fault->pfn = KVM_PFN_NOSLOT;
+ fault->map_writable = false;
+
+ /*
+ * If MMIO caching is disabled, emulate immediately without
+ * touching the shadow page tables as attempting to install an
+ * MMIO SPTE will just be an expensive nop.
+ */
+ if (unlikely(!enable_mmio_caching))
+ return RET_PF_EMULATE;
+
+ /*
+ * Do not create an MMIO SPTE for a gfn greater than host.MAXPHYADDR,
+ * any guest that generates such gfns is running nested and is being
+ * tricked by L0 userspace (you can observe gfn > L1.MAXPHYADDR if and
+ * only if L1's MAXPHYADDR is inaccurate with respect to the
+ * hardware's).
+ */
+ if (unlikely(fault->gfn > kvm_mmu_max_gfn()))
+ return RET_PF_EMULATE;
+
+ return RET_PF_CONTINUE;
+}
+
+static bool page_fault_can_be_fast(struct kvm *kvm, struct kvm_page_fault *fault)
+{
+ /*
+ * Page faults with reserved bits set, i.e. faults on MMIO SPTEs, only
+ * reach the common page fault handler if the SPTE has an invalid MMIO
+ * generation number. Refreshing the MMIO generation needs to go down
+ * the slow path. Note, EPT Misconfigs do NOT set the PRESENT flag!
+ */
+ if (fault->rsvd)
+ return false;
+
+ /*
+ * For hardware-protected VMs, certain conditions like attempting to
+ * perform a write to a page which is not in the state that the guest
+ * expects it to be in can result in a nested/extended #PF. In this
+ * case, the below code might misconstrue this situation as being the
+ * result of a write-protected access, and treat it as a spurious case
+ * rather than taking any action to satisfy the real source of the #PF
+ * such as generating a KVM_EXIT_MEMORY_FAULT. This can lead to the
+ * guest spinning on a #PF indefinitely, so don't attempt the fast path
+ * in this case.
+ *
+ * Note that the kvm_mem_is_private() check might race with an
+ * attribute update, but this will either result in the guest spinning
+ * on RET_PF_SPURIOUS until the update completes, or an actual spurious
+ * case might go down the slow path. Either case will resolve itself.
+ */
+ if (kvm->arch.has_private_mem &&
+ fault->is_private != kvm_mem_is_private(kvm, fault->gfn))
+ return false;
+
+ /*
+ * #PF can be fast if:
+ *
+ * 1. The shadow page table entry is not present and A/D bits are
+ * disabled _by KVM_, which could mean that the fault is potentially
+ * caused by access tracking (if enabled). If A/D bits are enabled
+ * by KVM, but disabled by L1 for L2, KVM is forced to disable A/D
+ * bits for L2 and employ access tracking, but the fast page fault
+ * mechanism only supports direct MMUs.
+ * 2. The shadow page table entry is present, the access is a write,
+ * and no reserved bits are set (MMIO SPTEs cannot be "fixed"), i.e.
+ * the fault was caused by a write-protection violation. If the
+ * SPTE is MMU-writable (determined later), the fault can be fixed
+ * by setting the Writable bit, which can be done out of mmu_lock.
+ */
+ if (!fault->present)
+ return !kvm_ad_enabled;
+
+ /*
+ * Note, instruction fetches and writes are mutually exclusive, ignore
+ * the "exec" flag.
+ */
+ return fault->write;
+}
+
+/*
+ * Returns true if the SPTE was fixed successfully. Otherwise,
+ * someone else modified the SPTE from its original value.
+ */
+static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault,
+ u64 *sptep, u64 old_spte, u64 new_spte)
+{
+ /*
+ * Theoretically we could also set dirty bit (and flush TLB) here in
+ * order to eliminate unnecessary PML logging. See comments in
+ * set_spte. But fast_page_fault is very unlikely to happen with PML
+ * enabled, so we do not do this. This might result in the same GPA
+ * to be logged in PML buffer again when the write really happens, and
+ * eventually to be called by mark_page_dirty twice. But it's also no
+ * harm. This also avoids the TLB flush needed after setting dirty bit
+ * so non-PML cases won't be impacted.
+ *
+ * Compare with make_spte() where instead shadow_dirty_mask is set.
+ */
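+	/* If the cmpxchg fails, another task changed the SPTE and the caller retries. */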
+ if (!try_cmpxchg64(sptep, &old_spte, new_spte))
+ return false;
+
+ if (is_writable_pte(new_spte) && !is_writable_pte(old_spte))
+ mark_page_dirty_in_slot(vcpu->kvm, fault->slot, fault->gfn);
+
+ return true;
+}
+
+/*
+ * Returns the last level spte pointer of the shadow page walk for the given
+ * gpa, and sets *spte to the spte value. This spte may be non-present. If no
+ * walk could be performed, returns NULL and *spte does not contain valid data.
+ *
+ * Contract:
+ * - Must be called between walk_shadow_page_lockless_{begin,end}.
+ * - The returned sptep must not be used after walk_shadow_page_lockless_end.
+ */
+static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte)
+{
+ struct kvm_shadow_walk_iterator iterator;
+ u64 old_spte;
+ u64 *sptep = NULL;
+
+ for_each_shadow_entry_lockless(vcpu, gpa, iterator, old_spte) {
+ sptep = iterator.sptep;
+ *spte = old_spte;
+ }
+
+ return sptep;
+}
+
+/*
+ * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS.
+ */
+static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+{
+ struct kvm_mmu_page *sp;
+ int ret = RET_PF_INVALID;
+ u64 spte;
+ u64 *sptep;
+ uint retry_count = 0;
+
+ if (!page_fault_can_be_fast(vcpu->kvm, fault))
+ return ret;
+
+ walk_shadow_page_lockless_begin(vcpu);
+
+ do {
+ u64 new_spte;
+
+ if (tdp_mmu_enabled)
+ sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->gfn, &spte);
+ else
+ sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
+
+ /*
+ * It's entirely possible for the mapping to have been zapped
+ * by a different task, but the root page should always be
+ * available as the vCPU holds a reference to its root(s).
+ */
+ if (WARN_ON_ONCE(!sptep))
+ spte = FROZEN_SPTE;
+
+ if (!is_shadow_present_pte(spte))
+ break;
+
+ sp = sptep_to_sp(sptep);
+ if (!is_last_spte(spte, sp->role.level))
+ break;
+
+ /*
+ * Check whether the memory access that caused the fault would
+ * still cause it if it were to be performed right now. If not,
+ * then this is a spurious fault caused by TLB lazily flushed,
+ * or some other CPU has already fixed the PTE after the
+ * current CPU took the fault.
+ *
+ * Need not check the access of upper level table entries since
+ * they are always ACC_ALL.
+ */
+ if (is_access_allowed(fault, spte)) {
+ ret = RET_PF_SPURIOUS;
+ break;
+ }
+
+ new_spte = spte;
+
+ /*
+ * KVM only supports fixing page faults outside of MMU lock for
+ * direct MMUs, nested MMUs are always indirect, and KVM always
+ * uses A/D bits for non-nested MMUs. Thus, if A/D bits are
+ * enabled, the SPTE can't be an access-tracked SPTE.
+ */
+ if (unlikely(!kvm_ad_enabled) && is_access_track_spte(spte))
+ new_spte = restore_acc_track_spte(new_spte) |
+ shadow_accessed_mask;
+
+ /*
+ * To keep things simple, only SPTEs that are MMU-writable can
+ * be made fully writable outside of mmu_lock, e.g. only SPTEs
+ * that were write-protected for dirty-logging or access
+ * tracking are handled here. Don't bother checking if the
+ * SPTE is writable to prioritize running with A/D bits enabled.
+ * The is_access_allowed() check above handles the common case
+ * of the fault being spurious, and the SPTE is known to be
+ * shadow-present, i.e. except for access tracking restoration
+ * making the new SPTE writable, the check is wasteful.
+ */
+ if (fault->write && is_mmu_writable_spte(spte)) {
+ new_spte |= PT_WRITABLE_MASK;
+
+ /*
+ * Do not fix write-permission on the large spte when
+			 * dirty logging is enabled. Since fast_pf_fix_direct_spte()
+			 * only marks the first page in the dirty bitmap, the other
+			 * pages covered by the large spte would be missed.
+ *
+ * Instead, we let the slow page fault path create a
+ * normal spte to fix the access.
+ */
+ if (sp->role.level > PG_LEVEL_4K &&
+ kvm_slot_dirty_track_enabled(fault->slot))
+ break;
+ }
+
+ /* Verify that the fault can be handled in the fast path */
+ if (new_spte == spte ||
+ !is_access_allowed(fault, new_spte))
+ break;
+
+ /*
+ * Currently, fast page fault only works for direct mapping
+ * since the gfn is not stable for indirect shadow page. See
+ * Documentation/virt/kvm/locking.rst to get more detail.
+ */
+ if (fast_pf_fix_direct_spte(vcpu, fault, sptep, spte, new_spte)) {
+ ret = RET_PF_FIXED;
+ break;
+ }
+
+ if (++retry_count > 4) {
+ pr_warn_once("Fast #PF retrying more than 4 times.\n");
+ break;
+ }
+
+ } while (true);
+
+ trace_fast_page_fault(vcpu, fault, sptep, spte, ret);
+ walk_shadow_page_lockless_end(vcpu);
+
+ if (ret != RET_PF_INVALID)
+ vcpu->stat.pf_fast++;
+
+ return ret;
+}
+
+static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
+ struct list_head *invalid_list)
+{
+ struct kvm_mmu_page *sp;
+
+ if (!VALID_PAGE(*root_hpa))
+ return;
+
+ sp = root_to_sp(*root_hpa);
+ if (WARN_ON_ONCE(!sp))
+ return;
+
+ if (is_tdp_mmu_page(sp)) {
+ lockdep_assert_held_read(&kvm->mmu_lock);
+ kvm_tdp_mmu_put_root(kvm, sp);
+ } else {
+ lockdep_assert_held_write(&kvm->mmu_lock);
+ if (!--sp->root_count && sp->role.invalid)
+ kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+ }
+
+ *root_hpa = INVALID_PAGE;
+}
+
+/* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
+void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
+ ulong roots_to_free)
+{
+ bool is_tdp_mmu = tdp_mmu_enabled && mmu->root_role.direct;
+ int i;
+ LIST_HEAD(invalid_list);
+ bool free_active_root;
+
+ WARN_ON_ONCE(roots_to_free & ~KVM_MMU_ROOTS_ALL);
+
+ BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
+
+ /* Before acquiring the MMU lock, see if we need to do any real work. */
+ free_active_root = (roots_to_free & KVM_MMU_ROOT_CURRENT)
+ && VALID_PAGE(mmu->root.hpa);
+
+ if (!free_active_root) {
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
+ VALID_PAGE(mmu->prev_roots[i].hpa))
+ break;
+
+ if (i == KVM_MMU_NUM_PREV_ROOTS)
+ return;
+ }
+
+ if (is_tdp_mmu)
+ read_lock(&kvm->mmu_lock);
+ else
+ write_lock(&kvm->mmu_lock);
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
+ mmu_free_root_page(kvm, &mmu->prev_roots[i].hpa,
+ &invalid_list);
+
+ if (free_active_root) {
+ if (kvm_mmu_is_dummy_root(mmu->root.hpa)) {
+ /* Nothing to cleanup for dummy roots. */
+ } else if (root_to_sp(mmu->root.hpa)) {
+ mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list);
+ } else if (mmu->pae_root) {
+ for (i = 0; i < 4; ++i) {
+ if (!IS_VALID_PAE_ROOT(mmu->pae_root[i]))
+ continue;
+
+ mmu_free_root_page(kvm, &mmu->pae_root[i],
+ &invalid_list);
+ mmu->pae_root[i] = INVALID_PAE_ROOT;
+ }
+ }
+ mmu->root.hpa = INVALID_PAGE;
+ mmu->root.pgd = 0;
+ }
+
+ if (is_tdp_mmu) {
+ read_unlock(&kvm->mmu_lock);
+ WARN_ON_ONCE(!list_empty(&invalid_list));
+ } else {
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ write_unlock(&kvm->mmu_lock);
+ }
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_free_roots);
+
+void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu)
+{
+ unsigned long roots_to_free = 0;
+ struct kvm_mmu_page *sp;
+ hpa_t root_hpa;
+ int i;
+
+ /*
+ * This should not be called while L2 is active, L2 can't invalidate
+ * _only_ its own roots, e.g. INVVPID unconditionally exits.
+ */
+ WARN_ON_ONCE(mmu->root_role.guest_mode);
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+ root_hpa = mmu->prev_roots[i].hpa;
+ if (!VALID_PAGE(root_hpa))
+ continue;
+
+ sp = root_to_sp(root_hpa);
+ if (!sp || sp->role.guest_mode)
+ roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+ }
+
+ kvm_mmu_free_roots(kvm, mmu, roots_to_free);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_free_guest_mode_roots);
+
+static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant,
+ u8 level)
+{
+ union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
+ struct kvm_mmu_page *sp;
+
+ role.level = level;
+ role.quadrant = quadrant;
+
+ WARN_ON_ONCE(quadrant && !role.has_4_byte_gpte);
+ WARN_ON_ONCE(role.direct && role.has_4_byte_gpte);
+
+ sp = kvm_mmu_get_shadow_page(vcpu, gfn, role);
+ ++sp->root_count;
+
+ return __pa(sp->spt);
+}
+
+static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ u8 shadow_root_level = mmu->root_role.level;
+ hpa_t root;
+ unsigned i;
+ int r;
+
+ if (tdp_mmu_enabled) {
+ if (kvm_has_mirrored_tdp(vcpu->kvm) &&
+ !VALID_PAGE(mmu->mirror_root_hpa))
+ kvm_tdp_mmu_alloc_root(vcpu, true);
+ kvm_tdp_mmu_alloc_root(vcpu, false);
+ return 0;
+ }
+
+ write_lock(&vcpu->kvm->mmu_lock);
+ r = make_mmu_pages_available(vcpu);
+ if (r < 0)
+ goto out_unlock;
+
+ if (shadow_root_level >= PT64_ROOT_4LEVEL) {
+ root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level);
+ mmu->root.hpa = root;
+ } else if (shadow_root_level == PT32E_ROOT_LEVEL) {
+ if (WARN_ON_ONCE(!mmu->pae_root)) {
+ r = -EIO;
+ goto out_unlock;
+ }
+
+ for (i = 0; i < 4; ++i) {
+ WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
+
+ root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), 0,
+ PT32_ROOT_LEVEL);
+ mmu->pae_root[i] = root | PT_PRESENT_MASK |
+ shadow_me_value;
+ }
+ mmu->root.hpa = __pa(mmu->pae_root);
+ } else {
+ WARN_ONCE(1, "Bad TDP root level = %d\n", shadow_root_level);
+ r = -EIO;
+ goto out_unlock;
+ }
+
+ /* root.pgd is ignored for direct MMUs. */
+ mmu->root.pgd = 0;
+out_unlock:
+ write_unlock(&vcpu->kvm->mmu_lock);
+ return r;
+}
+
+static int kvm_mmu_alloc_page_hash(struct kvm *kvm)
+{
+ struct hlist_head *h;
+
+ if (kvm->arch.mmu_page_hash)
+ return 0;
+
+ h = kvcalloc(KVM_NUM_MMU_PAGES, sizeof(*h), GFP_KERNEL_ACCOUNT);
+ if (!h)
+ return -ENOMEM;
+
+ /*
+ * Ensure the hash table pointer is set only after all stores to zero
+ * the memory are retired. Pairs with the smp_load_acquire() in
+ * kvm_get_mmu_page_hash(). Note, mmu_lock must be held for write to
+ * add (or remove) shadow pages, and so readers are guaranteed to see
+ * an empty list for their current mmu_lock critical section.
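+	 *
+	 * The reader side pairs with this roughly as follows (sketch; see the
+	 * reference to kvm_get_mmu_page_hash() above):
+	 *
+	 *     h = smp_load_acquire(&kvm->arch.mmu_page_hash);
+	 *
+	 * so a reader observes either NULL or a fully zero-initialized table.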
+ */
+ smp_store_release(&kvm->arch.mmu_page_hash, h);
+ return 0;
+}
+
+static int mmu_first_shadow_root_alloc(struct kvm *kvm)
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
+ int r = 0, i, bkt;
+
+ /*
+ * Check if this is the first shadow root being allocated before
+ * taking the lock.
+ */
+ if (kvm_shadow_root_allocated(kvm))
+ return 0;
+
+ mutex_lock(&kvm->slots_arch_lock);
+
+ /* Recheck, under the lock, whether this is the first shadow root. */
+ if (kvm_shadow_root_allocated(kvm))
+ goto out_unlock;
+
+ r = kvm_mmu_alloc_page_hash(kvm);
+ if (r)
+ goto out_unlock;
+
+ /*
+ * Check if memslot metadata actually needs to be allocated, e.g. all
+ * metadata will be allocated upfront if TDP is disabled.
+ */
+ if (kvm_memslots_have_rmaps(kvm) &&
+ kvm_page_track_write_tracking_enabled(kvm))
+ goto out_success;
+
+ for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
+ slots = __kvm_memslots(kvm, i);
+ kvm_for_each_memslot(slot, bkt, slots) {
+ /*
+ * Both of these functions are no-ops if the target is
+ * already allocated, so unconditionally calling both
+ * is safe. Intentionally do NOT free allocations on
+ * failure to avoid having to track which allocations
+ * were made now versus when the memslot was created.
+ * The metadata is guaranteed to be freed when the slot
+ * is freed, and will be kept/used if userspace retries
+ * KVM_RUN instead of killing the VM.
+ */
+ r = memslot_rmap_alloc(slot, slot->npages);
+ if (r)
+ goto out_unlock;
+ r = kvm_page_track_write_tracking_alloc(slot);
+ if (r)
+ goto out_unlock;
+ }
+ }
+
+ /*
+ * Ensure that shadow_root_allocated becomes true strictly after
+ * all the related pointers are set.
+ */
+out_success:
+ smp_store_release(&kvm->arch.shadow_root_allocated, true);
+
+out_unlock:
+ mutex_unlock(&kvm->slots_arch_lock);
+ return r;
+}
+
+static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ u64 pdptrs[4], pm_mask;
+ gfn_t root_gfn, root_pgd;
+ int quadrant, i, r;
+ hpa_t root;
+
+ root_pgd = kvm_mmu_get_guest_pgd(vcpu, mmu);
+ root_gfn = (root_pgd & __PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
+
+ if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) {
+ mmu->root.hpa = kvm_mmu_get_dummy_root();
+ return 0;
+ }
+
+ /*
+ * On SVM, reading PDPTRs might access guest memory, which might fault
+ * and thus might sleep. Grab the PDPTRs before acquiring mmu_lock.
+ */
+ if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) {
+ for (i = 0; i < 4; ++i) {
+ pdptrs[i] = mmu->get_pdptr(vcpu, i);
+ if (!(pdptrs[i] & PT_PRESENT_MASK))
+ continue;
+
+ if (!kvm_vcpu_is_visible_gfn(vcpu, pdptrs[i] >> PAGE_SHIFT))
+ pdptrs[i] = 0;
+ }
+ }
+
+ r = mmu_first_shadow_root_alloc(vcpu->kvm);
+ if (r)
+ return r;
+
+ write_lock(&vcpu->kvm->mmu_lock);
+ r = make_mmu_pages_available(vcpu);
+ if (r < 0)
+ goto out_unlock;
+
+ /*
+ * Do we shadow a long mode page table? If so we need to
+	 * write-protect the guest's page table root.
+ */
+ if (mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
+ root = mmu_alloc_root(vcpu, root_gfn, 0,
+ mmu->root_role.level);
+ mmu->root.hpa = root;
+ goto set_root_pgd;
+ }
+
+ if (WARN_ON_ONCE(!mmu->pae_root)) {
+ r = -EIO;
+ goto out_unlock;
+ }
+
+ /*
+ * We shadow a 32 bit page table. This may be a legacy 2-level
+ * or a PAE 3-level page table. In either case we need to be aware that
+ * the shadow page table may be a PAE or a long mode page table.
+ */
+ pm_mask = PT_PRESENT_MASK | shadow_me_value;
+ if (mmu->root_role.level >= PT64_ROOT_4LEVEL) {
+ pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
+
+ if (WARN_ON_ONCE(!mmu->pml4_root)) {
+ r = -EIO;
+ goto out_unlock;
+ }
+ mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask;
+
+ if (mmu->root_role.level == PT64_ROOT_5LEVEL) {
+ if (WARN_ON_ONCE(!mmu->pml5_root)) {
+ r = -EIO;
+ goto out_unlock;
+ }
+ mmu->pml5_root[0] = __pa(mmu->pml4_root) | pm_mask;
+ }
+ }
+
+ for (i = 0; i < 4; ++i) {
+ WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
+
+ if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) {
+ if (!(pdptrs[i] & PT_PRESENT_MASK)) {
+ mmu->pae_root[i] = INVALID_PAE_ROOT;
+ continue;
+ }
+ root_gfn = pdptrs[i] >> PAGE_SHIFT;
+ }
+
+ /*
+ * If shadowing 32-bit non-PAE page tables, each PAE page
+ * directory maps one quarter of the guest's non-PAE page
+		 * directory. Otherwise each PAE page directory shadows one
+		 * guest PAE page directory, so the quadrant should be 0.
+ */
+ quadrant = (mmu->cpu_role.base.level == PT32_ROOT_LEVEL) ? i : 0;
+
+ root = mmu_alloc_root(vcpu, root_gfn, quadrant, PT32_ROOT_LEVEL);
+ mmu->pae_root[i] = root | pm_mask;
+ }
+
+ if (mmu->root_role.level == PT64_ROOT_5LEVEL)
+ mmu->root.hpa = __pa(mmu->pml5_root);
+ else if (mmu->root_role.level == PT64_ROOT_4LEVEL)
+ mmu->root.hpa = __pa(mmu->pml4_root);
+ else
+ mmu->root.hpa = __pa(mmu->pae_root);
+
+set_root_pgd:
+ mmu->root.pgd = root_pgd;
+out_unlock:
+ write_unlock(&vcpu->kvm->mmu_lock);
+
+ return r;
+}
+
+static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ bool need_pml5 = mmu->root_role.level > PT64_ROOT_4LEVEL;
+ u64 *pml5_root = NULL;
+ u64 *pml4_root = NULL;
+ u64 *pae_root;
+
+ /*
+ * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP
+ * tables are allocated and initialized at root creation as there is no
+ * equivalent level in the guest's NPT to shadow. Allocate the tables
+ * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare.
+ */
+ if (mmu->root_role.direct ||
+ mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL ||
+ mmu->root_role.level < PT64_ROOT_4LEVEL)
+ return 0;
+
+ /*
+ * NPT, the only paging mode that uses this horror, uses a fixed number
+ * of levels for the shadow page tables, e.g. all MMUs are 4-level or
+	 * all MMUs are 5-level. Thus, this can safely require that pml5_root
+ * is allocated if the other roots are valid and pml5 is needed, as any
+ * prior MMU would also have required pml5.
+ */
+ if (mmu->pae_root && mmu->pml4_root && (!need_pml5 || mmu->pml5_root))
+ return 0;
+
+ /*
+ * The special roots should always be allocated in concert. Yell and
+ * bail if KVM ends up in a state where only one of the roots is valid.
+ */
+ if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root ||
+ (need_pml5 && mmu->pml5_root)))
+ return -EIO;
+
+ /*
+ * Unlike 32-bit NPT, the PDP table doesn't need to be in low mem, and
+ * doesn't need to be decrypted.
+ */
+ pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (!pae_root)
+ return -ENOMEM;
+
+#ifdef CONFIG_X86_64
+ pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (!pml4_root)
+ goto err_pml4;
+
+ if (need_pml5) {
+ pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (!pml5_root)
+ goto err_pml5;
+ }
+#endif
+
+ mmu->pae_root = pae_root;
+ mmu->pml4_root = pml4_root;
+ mmu->pml5_root = pml5_root;
+
+ return 0;
+
+#ifdef CONFIG_X86_64
+err_pml5:
+ free_page((unsigned long)pml4_root);
+err_pml4:
+ free_page((unsigned long)pae_root);
+ return -ENOMEM;
+#endif
+}
+
+static bool is_unsync_root(hpa_t root)
+{
+ struct kvm_mmu_page *sp;
+
+ if (!VALID_PAGE(root) || kvm_mmu_is_dummy_root(root))
+ return false;
+
+ /*
+ * The read barrier orders the CPU's read of SPTE.W during the page table
+ * walk before the reads of sp->unsync/sp->unsync_children here.
+ *
+ * Even if another CPU was marking the SP as unsync-ed simultaneously,
+ * any guest page table changes are not guaranteed to be visible anyway
+ * until this VCPU issues a TLB flush strictly after those changes are
+ * made. We only need to ensure that the other CPU sets these flags
+ * before any actual changes to the page tables are made. The comments
+ * in mmu_try_to_unsync_pages() describe what could go wrong if this
+ * requirement isn't satisfied.
+ */
+ smp_rmb();
+ sp = root_to_sp(root);
+
+ /*
+	 * PAE roots (somewhat arbitrarily) aren't backed by shadow pages; the
+ * PDPTEs for a given PAE root need to be synchronized individually.
+ */
+ if (WARN_ON_ONCE(!sp))
+ return false;
+
+ if (sp->unsync || sp->unsync_children)
+ return true;
+
+ return false;
+}
+
+void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
+{
+ int i;
+ struct kvm_mmu_page *sp;
+
+ if (vcpu->arch.mmu->root_role.direct)
+ return;
+
+ if (!VALID_PAGE(vcpu->arch.mmu->root.hpa))
+ return;
+
+ vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
+
+ if (vcpu->arch.mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
+ hpa_t root = vcpu->arch.mmu->root.hpa;
+
+ if (!is_unsync_root(root))
+ return;
+
+ sp = root_to_sp(root);
+
+ write_lock(&vcpu->kvm->mmu_lock);
+ mmu_sync_children(vcpu, sp, true);
+ write_unlock(&vcpu->kvm->mmu_lock);
+ return;
+ }
+
+ write_lock(&vcpu->kvm->mmu_lock);
+
+ for (i = 0; i < 4; ++i) {
+ hpa_t root = vcpu->arch.mmu->pae_root[i];
+
+ if (IS_VALID_PAE_ROOT(root)) {
+ sp = spte_to_child_sp(root);
+ mmu_sync_children(vcpu, sp, true);
+ }
+ }
+
+ write_unlock(&vcpu->kvm->mmu_lock);
+}
+
+void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu)
+{
+ unsigned long roots_to_free = 0;
+ int i;
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if (is_unsync_root(vcpu->arch.mmu->prev_roots[i].hpa))
+ roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+
+ /* sync prev_roots by simply freeing them */
+ kvm_mmu_free_roots(vcpu->kvm, vcpu->arch.mmu, roots_to_free);
+}
+
+static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ gpa_t vaddr, u64 access,
+ struct x86_exception *exception)
+{
+ if (exception)
+ exception->error_code = 0;
+ return kvm_translate_gpa(vcpu, mmu, vaddr, access, exception);
+}
+
+static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+{
+ /*
+ * A nested guest cannot use the MMIO cache if it is using nested
+ * page tables, because cr2 is a nGPA while the cache stores GPAs.
+ */
+ if (mmu_is_nested(vcpu))
+ return false;
+
+ if (direct)
+ return vcpu_match_mmio_gpa(vcpu, addr);
+
+ return vcpu_match_mmio_gva(vcpu, addr);
+}
+
+/*
+ * Return the level of the lowest level SPTE added to sptes.
+ * That SPTE may be non-present.
+ *
+ * Must be called between walk_shadow_page_lockless_{begin,end}.
+ */
+static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level)
+{
+ struct kvm_shadow_walk_iterator iterator;
+ int leaf = -1;
+ u64 spte;
+
+ for (shadow_walk_init(&iterator, vcpu, addr),
+ *root_level = iterator.level;
+ shadow_walk_okay(&iterator);
+ __shadow_walk_next(&iterator, spte)) {
+ leaf = iterator.level;
+ spte = mmu_spte_get_lockless(iterator.sptep);
+
+ sptes[leaf] = spte;
+ }
+
+ return leaf;
+}
+
+static int get_sptes_lockless(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
+ int *root_level)
+{
+ int leaf;
+
+ walk_shadow_page_lockless_begin(vcpu);
+
+ if (is_tdp_mmu_active(vcpu))
+ leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, root_level);
+ else
+ leaf = get_walk(vcpu, addr, sptes, root_level);
+
+ walk_shadow_page_lockless_end(vcpu);
+ return leaf;
+}
+
+/* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */
+static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
+{
+ u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
+ struct rsvd_bits_validate *rsvd_check;
+ int root, leaf, level;
+ bool reserved = false;
+
+ leaf = get_sptes_lockless(vcpu, addr, sptes, &root);
+ if (unlikely(leaf < 0)) {
+ *sptep = 0ull;
+ return reserved;
+ }
+
+ *sptep = sptes[leaf];
+
+ /*
+ * Skip reserved bits checks on the terminal leaf if it's not a valid
+ * SPTE. Note, this also (intentionally) skips MMIO SPTEs, which, by
+ * design, always have reserved bits set. The purpose of the checks is
+	 * to detect reserved bits on non-MMIO SPTEs, i.e. buggy SPTEs.
+ */
+ if (!is_shadow_present_pte(sptes[leaf]))
+ leaf++;
+
+ rsvd_check = &vcpu->arch.mmu->shadow_zero_check;
+
+ for (level = root; level >= leaf; level--)
+ reserved |= is_rsvd_spte(rsvd_check, sptes[level], level);
+
+ if (reserved) {
+ pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n",
+ __func__, addr);
+ for (level = root; level >= leaf; level--)
+ pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx",
+ sptes[level], level,
+ get_rsvd_bits(rsvd_check, sptes[level], level));
+ }
+
+ return reserved;
+}
+
+static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+{
+ u64 spte;
+ bool reserved;
+
+ if (mmio_info_in_cache(vcpu, addr, direct))
+ return RET_PF_EMULATE;
+
+ reserved = get_mmio_spte(vcpu, addr, &spte);
+ if (WARN_ON_ONCE(reserved))
+ return -EINVAL;
+
+ if (is_mmio_spte(vcpu->kvm, spte)) {
+ gfn_t gfn = get_mmio_spte_gfn(spte);
+ unsigned int access = get_mmio_spte_access(spte);
+
+ if (!check_mmio_spte(vcpu, spte))
+ return RET_PF_INVALID;
+
+ if (direct)
+ addr = 0;
+
+ trace_handle_mmio_page_fault(addr, gfn, access);
+ vcpu_cache_mmio_info(vcpu, addr, gfn, access);
+ return RET_PF_EMULATE;
+ }
+
+ /*
+	 * If the page table has been zapped by another CPU, let the CPU fault
+	 * again on the address.
+ */
+ return RET_PF_RETRY;
+}
+
+static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+ if (unlikely(fault->rsvd))
+ return false;
+
+ if (!fault->present || !fault->write)
+ return false;
+
+ /*
+	 * The guest is writing to a page that is write-tracked, which cannot
+	 * be fixed by the page fault handler.
+ */
+ if (kvm_gfn_is_write_tracked(vcpu->kvm, fault->slot, fault->gfn))
+ return true;
+
+ return false;
+}
+
+static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
+{
+ struct kvm_shadow_walk_iterator iterator;
+ u64 spte;
+
+ walk_shadow_page_lockless_begin(vcpu);
+ for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
+ clear_sp_write_flooding_count(iterator.sptep);
+ walk_shadow_page_lockless_end(vcpu);
+}
+
+static u32 alloc_apf_token(struct kvm_vcpu *vcpu)
+{
+ /* make sure the token value is not 0 */
+ u32 id = vcpu->arch.apf.id;
+
+ if (id << 12 == 0)
+ vcpu->arch.apf.id = 1;
+
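+	/*
+	 * E.g. (hypothetical values) with apf.id == 5 and vcpu_id == 3, the
+	 * token is (5 << 12) | 3 == 0x5003, and apf.id is bumped to 6 for the
+	 * next async #PF.
+	 */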
+ return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
+}
+
+static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+ struct kvm_arch_async_pf arch;
+
+ arch.token = alloc_apf_token(vcpu);
+ arch.gfn = fault->gfn;
+ arch.error_code = fault->error_code;
+ arch.direct_map = vcpu->arch.mmu->root_role.direct;
+ arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu);
+
+ return kvm_setup_async_pf(vcpu, fault->addr,
+ kvm_vcpu_gfn_to_hva(vcpu, fault->gfn), &arch);
+}
+
+void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
+{
+ int r;
+
+ if (WARN_ON_ONCE(work->arch.error_code & PFERR_PRIVATE_ACCESS))
+ return;
+
+ if ((vcpu->arch.mmu->root_role.direct != work->arch.direct_map) ||
+ work->wakeup_all)
+ return;
+
+ r = kvm_mmu_reload(vcpu);
+ if (unlikely(r))
+ return;
+
+ if (!vcpu->arch.mmu->root_role.direct &&
+ work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu))
+ return;
+
+ r = kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code,
+ true, NULL, NULL);
+
+ /*
+ * Account fixed page faults, otherwise they'll never be counted, but
+	 * Account fixed page faults, otherwise they'll never be counted, but
+	 * ignore stats for all other return values. Page-ready "faults" aren't
+	 * truly spurious and never trigger emulation.
+ if (r == RET_PF_FIXED)
+ vcpu->stat.pf_fixed++;
+}
+
+static void kvm_mmu_finish_page_fault(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault, int r)
+{
+ kvm_release_faultin_page(vcpu->kvm, fault->refcounted_page,
+ r == RET_PF_RETRY, fault->map_writable);
+}
+
+static int kvm_mmu_faultin_pfn_gmem(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+ int max_order, r;
+
+ if (!kvm_slot_has_gmem(fault->slot)) {
+ kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
+ return -EFAULT;
+ }
+
+ r = kvm_gmem_get_pfn(vcpu->kvm, fault->slot, fault->gfn, &fault->pfn,
+ &fault->refcounted_page, &max_order);
+ if (r) {
+ kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
+ return r;
+ }
+
+ fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY);
+ fault->max_level = kvm_max_level_for_order(max_order);
+
+ return RET_PF_CONTINUE;
+}
+
+static int __kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+ unsigned int foll = fault->write ? FOLL_WRITE : 0;
+
+ if (fault->is_private || kvm_memslot_is_gmem_only(fault->slot))
+ return kvm_mmu_faultin_pfn_gmem(vcpu, fault);
+
+ foll |= FOLL_NOWAIT;
+ fault->pfn = __kvm_faultin_pfn(fault->slot, fault->gfn, foll,
+ &fault->map_writable, &fault->refcounted_page);
+
+ /*
+ * If resolving the page failed because I/O is needed to fault-in the
+ * page, then either set up an asynchronous #PF to do the I/O, or if
+ * doing an async #PF isn't possible, retry with I/O allowed. All
+ * other failures are terminal, i.e. retrying won't help.
+ */
+ if (fault->pfn != KVM_PFN_ERR_NEEDS_IO)
+ return RET_PF_CONTINUE;
+
+ if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) {
+ trace_kvm_try_async_get_page(fault->addr, fault->gfn);
+ if (kvm_find_async_pf_gfn(vcpu, fault->gfn)) {
+ trace_kvm_async_pf_repeated_fault(fault->addr, fault->gfn);
+ kvm_make_request(KVM_REQ_APF_HALT, vcpu);
+ return RET_PF_RETRY;
+ } else if (kvm_arch_setup_async_pf(vcpu, fault)) {
+ return RET_PF_RETRY;
+ }
+ }
+
+ /*
+ * Allow gup to bail on pending non-fatal signals when it's also allowed
+ * to wait for IO. Note, gup always bails if it is unable to quickly
+ * get a page and a fatal signal, i.e. SIGKILL, is pending.
+ */
+ foll |= FOLL_INTERRUPTIBLE;
+ foll &= ~FOLL_NOWAIT;
+ fault->pfn = __kvm_faultin_pfn(fault->slot, fault->gfn, foll,
+ &fault->map_writable, &fault->refcounted_page);
+
+ return RET_PF_CONTINUE;
+}
+
+static int kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault, unsigned int access)
+{
+ struct kvm_memory_slot *slot = fault->slot;
+ struct kvm *kvm = vcpu->kvm;
+ int ret;
+
+ if (KVM_BUG_ON(kvm_is_gfn_alias(kvm, fault->gfn), kvm))
+ return -EFAULT;
+
+ /*
+ * Note that the mmu_invalidate_seq also serves to detect a concurrent
+ * change in attributes. is_page_fault_stale() will detect an
+	 * invalidation related to fault->gfn and resume the guest without
+ * installing a mapping in the page tables.
+ */
+ fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq;
+ smp_rmb();
+
+ /*
+ * Now that we have a snapshot of mmu_invalidate_seq we can check for a
+ * private vs. shared mismatch.
+ */
+ if (fault->is_private != kvm_mem_is_private(kvm, fault->gfn)) {
+ kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
+ return -EFAULT;
+ }
+
+ if (unlikely(!slot))
+ return kvm_handle_noslot_fault(vcpu, fault, access);
+
+ /*
+ * Retry the page fault if the gfn hit a memslot that is being deleted
+ * or moved. This ensures any existing SPTEs for the old memslot will
+ * be zapped before KVM inserts a new MMIO SPTE for the gfn. Punt the
+ * error to userspace if this is a prefault, as KVM's prefaulting ABI
+ * doesn't provide the same forward progress guarantees as KVM_RUN.
+ */
+ if (slot->flags & KVM_MEMSLOT_INVALID) {
+ if (fault->prefetch)
+ return -EAGAIN;
+
+ return RET_PF_RETRY;
+ }
+
+ if (slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT) {
+ /*
+ * Don't map L1's APIC access page into L2, KVM doesn't support
+ * using APICv/AVIC to accelerate L2 accesses to L1's APIC,
+ * i.e. the access needs to be emulated. Emulating access to
+ * L1's APIC is also correct if L1 is accelerating L2's own
+ * virtual APIC, but for some reason L1 also maps _L1's_ APIC
+ * into L2. Note, vcpu_is_mmio_gpa() always treats access to
+ * the APIC as MMIO. Allow an MMIO SPTE to be created, as KVM
+ * uses different roots for L1 vs. L2, i.e. there is no danger
+ * of breaking APICv/AVIC for L1.
+ */
+ if (is_guest_mode(vcpu))
+ return kvm_handle_noslot_fault(vcpu, fault, access);
+
+ /*
+ * If the APIC access page exists but is disabled, go directly
+ * to emulation without caching the MMIO access or creating a
+ * MMIO SPTE. That way the cache doesn't need to be purged
+ * when the AVIC is re-enabled.
+ */
+ if (!kvm_apicv_activated(vcpu->kvm))
+ return RET_PF_EMULATE;
+ }
+
+ /*
+ * Check for a relevant mmu_notifier invalidation event before getting
+ * the pfn from the primary MMU, and before acquiring mmu_lock.
+ *
+ * For mmu_lock, if there is an in-progress invalidation and the kernel
+ * allows preemption, the invalidation task may drop mmu_lock and yield
+ * in response to mmu_lock being contended, which is *very* counter-
+ * productive as this vCPU can't actually make forward progress until
+ * the invalidation completes.
+ *
+	 * Retrying now can also avoid unnecessary lock contention in the primary
+ * MMU, as the primary MMU doesn't necessarily hold a single lock for
+ * the duration of the invalidation, i.e. faulting in a conflicting pfn
+ * can cause the invalidation to take longer by holding locks that are
+ * needed to complete the invalidation.
+ *
+	 * Do the pre-check even for non-preemptible kernels, i.e. even if KVM
+ * will never yield mmu_lock in response to contention, as this vCPU is
+ * *guaranteed* to need to retry, i.e. waiting until mmu_lock is held
+ * to detect retry guarantees the worst case latency for the vCPU.
+ */
+ if (mmu_invalidate_retry_gfn_unsafe(kvm, fault->mmu_seq, fault->gfn))
+ return RET_PF_RETRY;
+
+ ret = __kvm_mmu_faultin_pfn(vcpu, fault);
+ if (ret != RET_PF_CONTINUE)
+ return ret;
+
+ if (unlikely(is_error_pfn(fault->pfn)))
+ return kvm_handle_error_pfn(vcpu, fault);
+
+ if (WARN_ON_ONCE(!fault->slot || is_noslot_pfn(fault->pfn)))
+ return kvm_handle_noslot_fault(vcpu, fault, access);
+
+ /*
+ * Check again for a relevant mmu_notifier invalidation event purely to
+ * avoid contending mmu_lock. Most invalidations will be detected by
+ * the previous check, but checking is extremely cheap relative to the
+ * overall cost of failing to detect the invalidation until after
+ * mmu_lock is acquired.
+ */
+ if (mmu_invalidate_retry_gfn_unsafe(kvm, fault->mmu_seq, fault->gfn)) {
+ kvm_mmu_finish_page_fault(vcpu, fault, RET_PF_RETRY);
+ return RET_PF_RETRY;
+ }
+
+ return RET_PF_CONTINUE;
+}
+
+/*
+ * Returns true if the page fault is stale and needs to be retried, i.e. if the
+ * root was invalidated by a memslot update or a relevant mmu_notifier fired.
+ */
+static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+ struct kvm_mmu_page *sp = root_to_sp(vcpu->arch.mmu->root.hpa);
+
+ /* Special roots, e.g. pae_root, are not backed by shadow pages. */
+ if (sp && is_obsolete_sp(vcpu->kvm, sp))
+ return true;
+
+ /*
+ * Roots without an associated shadow page are considered invalid if
+ * there is a pending request to free obsolete roots. The request is
+ * only a hint that the current root _may_ be obsolete and needs to be
+ * reloaded, e.g. if the guest frees a PGD that KVM is tracking as a
+ * previous root, then __kvm_mmu_prepare_zap_page() signals all vCPUs
+ * to reload even if no vCPU is actively using the root.
+ */
+ if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
+ return true;
+
+ /*
+ * Check for a relevant mmu_notifier invalidation event one last time
+ * now that mmu_lock is held, as the "unsafe" checks performed without
+ * holding mmu_lock can get false negatives.
+ */
+ return fault->slot &&
+ mmu_invalidate_retry_gfn(vcpu->kvm, fault->mmu_seq, fault->gfn);
+}
+
+static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+{
+ int r;
+
+ /* Dummy roots are used only for shadowing bad guest roots. */
+ if (WARN_ON_ONCE(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa)))
+ return RET_PF_RETRY;
+
+ if (page_fault_handle_page_track(vcpu, fault))
+ return RET_PF_WRITE_PROTECTED;
+
+ r = fast_page_fault(vcpu, fault);
+ if (r != RET_PF_INVALID)
+ return r;
+
+ r = mmu_topup_memory_caches(vcpu, false);
+ if (r)
+ return r;
+
+ r = kvm_mmu_faultin_pfn(vcpu, fault, ACC_ALL);
+ if (r != RET_PF_CONTINUE)
+ return r;
+
+ r = RET_PF_RETRY;
+ write_lock(&vcpu->kvm->mmu_lock);
+
+ if (is_page_fault_stale(vcpu, fault))
+ goto out_unlock;
+
+ r = make_mmu_pages_available(vcpu);
+ if (r)
+ goto out_unlock;
+
+ r = direct_map(vcpu, fault);
+
+out_unlock:
+ kvm_mmu_finish_page_fault(vcpu, fault, r);
+ write_unlock(&vcpu->kvm->mmu_lock);
+ return r;
+}
+
+static int nonpaging_page_fault(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+	/* This path builds a PAE page table, so we can map 2MB pages at most. */
+ fault->max_level = PG_LEVEL_2M;
+ return direct_page_fault(vcpu, fault);
+}
+
+int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
+ u64 fault_address, char *insn, int insn_len)
+{
+ int r = 1;
+ u32 flags = vcpu->arch.apf.host_apf_flags;
+
+#ifndef CONFIG_X86_64
+ /* A 64-bit CR2 should be impossible on 32-bit KVM. */
+ if (WARN_ON_ONCE(fault_address >> 32))
+ return -EFAULT;
+#endif
+ /*
+	 * Legacy #PF exceptions only have a 32-bit error code. Simply drop the
+ * upper bits as KVM doesn't use them for #PF (because they are never
+ * set), and to ensure there are no collisions with KVM-defined bits.
+ */
+ if (WARN_ON_ONCE(error_code >> 32))
+ error_code = lower_32_bits(error_code);
+
+ /*
+ * Restrict KVM-defined flags to bits 63:32 so that it's impossible for
+ * them to conflict with #PF error codes, which are limited to 32 bits.
+ */
+ BUILD_BUG_ON(lower_32_bits(PFERR_SYNTHETIC_MASK));
+
+ kvm_request_l1tf_flush_l1d();
+ if (!flags) {
+ trace_kvm_page_fault(vcpu, fault_address, error_code);
+
+ r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
+ insn_len);
+ } else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
+ vcpu->arch.apf.host_apf_flags = 0;
+ local_irq_disable();
+ kvm_async_pf_task_wait_schedule(fault_address);
+ local_irq_enable();
+ } else {
+ WARN_ONCE(1, "Unexpected host async PF flags: %x\n", flags);
+ }
+
+ return r;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_handle_page_fault);
+
+#ifdef CONFIG_X86_64
+static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+ int r;
+
+ if (page_fault_handle_page_track(vcpu, fault))
+ return RET_PF_WRITE_PROTECTED;
+
+ r = fast_page_fault(vcpu, fault);
+ if (r != RET_PF_INVALID)
+ return r;
+
+ r = mmu_topup_memory_caches(vcpu, false);
+ if (r)
+ return r;
+
+ r = kvm_mmu_faultin_pfn(vcpu, fault, ACC_ALL);
+ if (r != RET_PF_CONTINUE)
+ return r;
+
+ r = RET_PF_RETRY;
+ read_lock(&vcpu->kvm->mmu_lock);
+
+ if (is_page_fault_stale(vcpu, fault))
+ goto out_unlock;
+
+ r = kvm_tdp_mmu_map(vcpu, fault);
+
+out_unlock:
+ kvm_mmu_finish_page_fault(vcpu, fault, r);
+ read_unlock(&vcpu->kvm->mmu_lock);
+ return r;
+}
+#endif
+
+int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+{
+#ifdef CONFIG_X86_64
+ if (tdp_mmu_enabled)
+ return kvm_tdp_mmu_page_fault(vcpu, fault);
+#endif
+
+ return direct_page_fault(vcpu, fault);
+}
+
+static int kvm_tdp_page_prefault(struct kvm_vcpu *vcpu, gpa_t gpa,
+ u64 error_code, u8 *level)
+{
+ int r;
+
+ /*
+ * Restrict to TDP page fault, since that's the only case where the MMU
+ * is indexed by GPA.
+ */
+ if (vcpu->arch.mmu->page_fault != kvm_tdp_page_fault)
+ return -EOPNOTSUPP;
+
+ do {
+ if (signal_pending(current))
+ return -EINTR;
+
+ if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu))
+ return -EIO;
+
+ cond_resched();
+ r = kvm_mmu_do_page_fault(vcpu, gpa, error_code, true, NULL, level);
+ } while (r == RET_PF_RETRY);
+
+ if (r < 0)
+ return r;
+
+ switch (r) {
+ case RET_PF_FIXED:
+ case RET_PF_SPURIOUS:
+ case RET_PF_WRITE_PROTECTED:
+ return 0;
+
+ case RET_PF_EMULATE:
+ return -ENOENT;
+
+ case RET_PF_RETRY:
+ case RET_PF_CONTINUE:
+ case RET_PF_INVALID:
+ default:
+ WARN_ONCE(1, "could not fix page fault during prefault");
+ return -EIO;
+ }
+}
+
+long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
+ struct kvm_pre_fault_memory *range)
+{
+ u64 error_code = PFERR_GUEST_FINAL_MASK;
+ u8 level = PG_LEVEL_4K;
+ u64 direct_bits;
+ u64 end;
+ int r;
+
+ if (!vcpu->kvm->arch.pre_fault_allowed)
+ return -EOPNOTSUPP;
+
+ if (kvm_is_gfn_alias(vcpu->kvm, gpa_to_gfn(range->gpa)))
+ return -EINVAL;
+
+ /*
+ * reload is efficient when called repeatedly, so we can do it on
+ * every iteration.
+ */
+ r = kvm_mmu_reload(vcpu);
+ if (r)
+ return r;
+
+ direct_bits = 0;
+ if (kvm_arch_has_private_mem(vcpu->kvm) &&
+ kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(range->gpa)))
+ error_code |= PFERR_PRIVATE_ACCESS;
+ else
+ direct_bits = gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
+
+ /*
+ * Shadow paging uses GVA for kvm page fault, so restrict to
+ * two-dimensional paging.
+ */
+ r = kvm_tdp_page_prefault(vcpu, range->gpa | direct_bits, error_code, &level);
+ if (r < 0)
+ return r;
+
+ /*
+ * If the mapping that covers range->gpa can use a huge page, it
+ * may start below it or end after range->gpa + range->size.
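+	 * For example (hypothetical values), with level == PG_LEVEL_2M and
+	 * range->gpa == 0x5ff000, end == 0x600000, i.e. at most 0x1000 bytes
+	 * are reported as handled even if range->size is larger.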
+ */
+ end = (range->gpa & KVM_HPAGE_MASK(level)) + KVM_HPAGE_SIZE(level);
+ return min(range->size, end - range->gpa);
+}
+
+#ifdef CONFIG_KVM_GUEST_MEMFD
+static void kvm_assert_gmem_invalidate_lock_held(struct kvm_memory_slot *slot)
+{
+#ifdef CONFIG_PROVE_LOCKING
+ if (WARN_ON_ONCE(!kvm_slot_has_gmem(slot)) ||
+ WARN_ON_ONCE(!slot->gmem.file) ||
+ WARN_ON_ONCE(!file_count(slot->gmem.file)))
+ return;
+
+ lockdep_assert_held(&file_inode(slot->gmem.file)->i_mapping->invalidate_lock);
+#endif
+}
+
+int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
+{
+ struct kvm_page_fault fault = {
+ .addr = gfn_to_gpa(gfn),
+ .error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS,
+ .prefetch = true,
+ .is_tdp = true,
+ .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(vcpu->kvm),
+
+ .max_level = PG_LEVEL_4K,
+ .req_level = PG_LEVEL_4K,
+ .goal_level = PG_LEVEL_4K,
+ .is_private = true,
+
+ .gfn = gfn,
+ .slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn),
+ .pfn = pfn,
+ .map_writable = true,
+ };
+ struct kvm *kvm = vcpu->kvm;
+ int r;
+
+ lockdep_assert_held(&kvm->slots_lock);
+
+ /*
+ * Mapping a pre-determined private pfn is intended only for use when
+ * populating a guest_memfd instance. Assert that the slot is backed
+ * by guest_memfd and that the gmem instance's invalidate_lock is held.
+ */
+ kvm_assert_gmem_invalidate_lock_held(fault.slot);
+
+ if (KVM_BUG_ON(!tdp_mmu_enabled, kvm))
+ return -EIO;
+
+ if (kvm_gfn_is_write_tracked(kvm, fault.slot, fault.gfn))
+ return -EPERM;
+
+ r = kvm_mmu_reload(vcpu);
+ if (r)
+ return r;
+
+ r = mmu_topup_memory_caches(vcpu, false);
+ if (r)
+ return r;
+
+ do {
+ if (signal_pending(current))
+ return -EINTR;
+
+ if (kvm_test_request(KVM_REQ_VM_DEAD, vcpu))
+ return -EIO;
+
+ cond_resched();
+
+ guard(read_lock)(&kvm->mmu_lock);
+
+ r = kvm_tdp_mmu_map(vcpu, &fault);
+ } while (r == RET_PF_RETRY);
+
+ if (r != RET_PF_FIXED)
+ return -EIO;
+
+ return 0;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_tdp_mmu_map_private_pfn);
+#endif
+
+static void nonpaging_init_context(struct kvm_mmu *context)
+{
+ context->page_fault = nonpaging_page_fault;
+ context->gva_to_gpa = nonpaging_gva_to_gpa;
+ context->sync_spte = NULL;
+}
+
+static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
+ union kvm_mmu_page_role role)
+{
+ struct kvm_mmu_page *sp;
+
+ if (!VALID_PAGE(root->hpa))
+ return false;
+
+ if (!role.direct && pgd != root->pgd)
+ return false;
+
+ sp = root_to_sp(root->hpa);
+ if (WARN_ON_ONCE(!sp))
+ return false;
+
+ return role.word == sp->role.word;
+}
+
+/*
+ * Find out if a previously cached root matching the new pgd/role is available,
+ * and insert the current root as the MRU in the cache.
+ * If a matching root is found, it is assigned to kvm_mmu->root and
+ * true is returned.
+ * If no match is found, kvm_mmu->root is left invalid, the LRU root is
+ * evicted to make room for the current root, and false is returned.
+ */
+static bool cached_root_find_and_keep_current(struct kvm *kvm, struct kvm_mmu *mmu,
+ gpa_t new_pgd,
+ union kvm_mmu_page_role new_role)
+{
+ uint i;
+
+ if (is_root_usable(&mmu->root, new_pgd, new_role))
+ return true;
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+ /*
+ * The swaps end up rotating the cache like this:
+ * C 0 1 2 3 (on entry to the function)
+ * 0 C 1 2 3
+ * 1 C 0 2 3
+ * 2 C 0 1 3
+ * 3 C 0 1 2 (on exit from the loop)
+ */
+ swap(mmu->root, mmu->prev_roots[i]);
+ if (is_root_usable(&mmu->root, new_pgd, new_role))
+ return true;
+ }
+
+ kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
+ return false;
+}
+
+/*
+ * Find out if a previously cached root matching the new pgd/role is available.
+ * On entry, mmu->root is invalid.
+ * If a matching root is found, it is assigned to kvm_mmu->root, the LRU entry
+ * of the cache becomes invalid, and true is returned.
+ * If no match is found, kvm_mmu->root is left invalid and false is returned.
+ */
+static bool cached_root_find_without_current(struct kvm *kvm, struct kvm_mmu *mmu,
+ gpa_t new_pgd,
+ union kvm_mmu_page_role new_role)
+{
+ uint i;
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if (is_root_usable(&mmu->prev_roots[i], new_pgd, new_role))
+ goto hit;
+
+ return false;
+
+hit:
+ swap(mmu->root, mmu->prev_roots[i]);
+ /* Bubble up the remaining roots. */
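+	/*
+	 * E.g. with cached roots [A, B, C, ...] and a hit on C (i == 2), C
+	 * becomes mmu->root, the entries after it shift up one slot, and the
+	 * last slot is invalidated.
+	 */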
+ for (; i < KVM_MMU_NUM_PREV_ROOTS - 1; i++)
+ mmu->prev_roots[i] = mmu->prev_roots[i + 1];
+ mmu->prev_roots[i].hpa = INVALID_PAGE;
+ return true;
+}
+
+static bool fast_pgd_switch(struct kvm *kvm, struct kvm_mmu *mmu,
+ gpa_t new_pgd, union kvm_mmu_page_role new_role)
+{
+ /*
+ * Limit reuse to 64-bit hosts+VMs without "special" roots in order to
+ * avoid having to deal with PDPTEs and other complexities.
+ */
+ if (VALID_PAGE(mmu->root.hpa) && !root_to_sp(mmu->root.hpa))
+ kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
+
+ if (VALID_PAGE(mmu->root.hpa))
+ return cached_root_find_and_keep_current(kvm, mmu, new_pgd, new_role);
+ else
+ return cached_root_find_without_current(kvm, mmu, new_pgd, new_role);
+}
+
+void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
+{
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ union kvm_mmu_page_role new_role = mmu->root_role;
+
+ /*
+	 * Return immediately if no usable root was found; kvm_mmu_reload()
+ * will establish a valid root prior to the next VM-Enter.
+ */
+ if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role))
+ return;
+
+ /*
+ * It's possible that the cached previous root page is obsolete because
+ * of a change in the MMU generation number. However, changing the
+ * generation number is accompanied by KVM_REQ_MMU_FREE_OBSOLETE_ROOTS,
+ * which will free the root set here and allocate a new one.
+ */
+ kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
+
+ if (force_flush_and_sync_on_reuse) {
+ kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+ }
+
+ /*
+ * The last MMIO access's GVA and GPA are cached in the VCPU. When
+ * switching to a new CR3, that GVA->GPA mapping may no longer be
+ * valid. So clear any cached MMIO info even when we don't need to sync
+ * the shadow page tables.
+ */
+ vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
+
+ /*
+ * If this is a direct root page, it doesn't have a write flooding
+ * count. Otherwise, clear the write flooding count.
+ */
+ if (!new_role.direct) {
+ struct kvm_mmu_page *sp = root_to_sp(vcpu->arch.mmu->root.hpa);
+
+ if (!WARN_ON_ONCE(!sp))
+ __clear_sp_write_flooding_count(sp);
+ }
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_new_pgd);
+
+static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
+ unsigned int access)
+{
+ if (unlikely(is_mmio_spte(vcpu->kvm, *sptep))) {
+ if (gfn != get_mmio_spte_gfn(*sptep)) {
+ mmu_spte_clear_no_track(sptep);
+ return true;
+ }
+
+ mark_mmio_spte(vcpu, sptep, gfn, access);
+ return true;
+ }
+
+ return false;
+}
+
+#define PTTYPE_EPT 18 /* arbitrary */
+#define PTTYPE PTTYPE_EPT
+#include "paging_tmpl.h"
+#undef PTTYPE
+
+#define PTTYPE 64
+#include "paging_tmpl.h"
+#undef PTTYPE
+
+#define PTTYPE 32
+#include "paging_tmpl.h"
+#undef PTTYPE
+
+static void __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check,
+ u64 pa_bits_rsvd, int level, bool nx,
+ bool gbpages, bool pse, bool amd)
+{
+ u64 gbpages_bit_rsvd = 0;
+ u64 nonleaf_bit8_rsvd = 0;
+ u64 high_bits_rsvd;
+
+ rsvd_check->bad_mt_xwr = 0;
+
+ if (!gbpages)
+ gbpages_bit_rsvd = rsvd_bits(7, 7);
+
+ if (level == PT32E_ROOT_LEVEL)
+ high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 62);
+ else
+ high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
+
+ /* Note, NX doesn't exist in PDPTEs, this is handled below. */
+ if (!nx)
+ high_bits_rsvd |= rsvd_bits(63, 63);
+
+ /*
+ * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for
+ * leaf entries) on AMD CPUs only.
+ */
+ if (amd)
+ nonleaf_bit8_rsvd = rsvd_bits(8, 8);
+
+ switch (level) {
+ case PT32_ROOT_LEVEL:
+ /* no rsvd bits for 2 level 4K page table entries */
+ rsvd_check->rsvd_bits_mask[0][1] = 0;
+ rsvd_check->rsvd_bits_mask[0][0] = 0;
+ rsvd_check->rsvd_bits_mask[1][0] =
+ rsvd_check->rsvd_bits_mask[0][0];
+
+ if (!pse) {
+ rsvd_check->rsvd_bits_mask[1][1] = 0;
+ break;
+ }
+
+ if (is_cpuid_PSE36())
+ /* 36bits PSE 4MB page */
+ rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
+ else
+ /* 32 bits PSE 4MB page */
+ rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
+ break;
+ case PT32E_ROOT_LEVEL:
+ rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(63, 63) |
+ high_bits_rsvd |
+ rsvd_bits(5, 8) |
+ rsvd_bits(1, 2); /* PDPTE */
+ rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd; /* PDE */
+ rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; /* PTE */
+ rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
+ rsvd_bits(13, 20); /* large page */
+ rsvd_check->rsvd_bits_mask[1][0] =
+ rsvd_check->rsvd_bits_mask[0][0];
+ break;
+ case PT64_ROOT_5LEVEL:
+ rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd |
+ nonleaf_bit8_rsvd |
+ rsvd_bits(7, 7);
+ rsvd_check->rsvd_bits_mask[1][4] =
+ rsvd_check->rsvd_bits_mask[0][4];
+ fallthrough;
+ case PT64_ROOT_4LEVEL:
+ rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd |
+ nonleaf_bit8_rsvd |
+ rsvd_bits(7, 7);
+ rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd |
+ gbpages_bit_rsvd;
+ rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd;
+ rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
+ rsvd_check->rsvd_bits_mask[1][3] =
+ rsvd_check->rsvd_bits_mask[0][3];
+ rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd |
+ gbpages_bit_rsvd |
+ rsvd_bits(13, 29);
+ rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
+ rsvd_bits(13, 20); /* large page */
+ rsvd_check->rsvd_bits_mask[1][0] =
+ rsvd_check->rsvd_bits_mask[0][0];
+ break;
+ }
+}
+
+static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
+{
+ __reset_rsvds_bits_mask(&context->guest_rsvd_check,
+ vcpu->arch.reserved_gpa_bits,
+ context->cpu_role.base.level, is_efer_nx(context),
+ guest_cpu_cap_has(vcpu, X86_FEATURE_GBPAGES),
+ is_cr4_pse(context),
+ guest_cpuid_is_amd_compatible(vcpu));
+}
+
+static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
+ u64 pa_bits_rsvd, bool execonly,
+ int huge_page_level)
+{
+ u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
+ u64 large_1g_rsvd = 0, large_2m_rsvd = 0;
+ u64 bad_mt_xwr;
+
+ if (huge_page_level < PG_LEVEL_1G)
+ large_1g_rsvd = rsvd_bits(7, 7);
+ if (huge_page_level < PG_LEVEL_2M)
+ large_2m_rsvd = rsvd_bits(7, 7);
+
+ rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7);
+ rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7);
+ rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6) | large_1g_rsvd;
+ rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6) | large_2m_rsvd;
+ rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
+
+ /* large page */
+ rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
+ rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
+ rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29) | large_1g_rsvd;
+ rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20) | large_2m_rsvd;
+ rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
+
+ bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */
+ bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */
+ bad_mt_xwr |= 0xFFull << (7 * 8); /* bits 3..5 must not be 7 */
+ bad_mt_xwr |= REPEAT_BYTE(1ull << 2); /* bits 0..2 must not be 010 */
+ bad_mt_xwr |= REPEAT_BYTE(1ull << 6); /* bits 0..2 must not be 110 */
+ if (!execonly) {
+ /* bits 0..2 must not be 100 unless VMX capabilities allow it */
+ bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
+ }
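+	/*
+	 * Each byte of bad_mt_xwr covers one memory type (EPT entry bits 5:3)
+	 * and each bit within the byte one XWR combination (bits 2:0), e.g.
+	 * REPEAT_BYTE(1ull << 2) sets bit (mt * 8) + 2 for every memtype,
+	 * flagging the write-only combination regardless of memory type.
+	 */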
+ rsvd_check->bad_mt_xwr = bad_mt_xwr;
+}
+
+static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context, bool execonly, int huge_page_level)
+{
+ __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
+ vcpu->arch.reserved_gpa_bits, execonly,
+ huge_page_level);
+}
+
+static inline u64 reserved_hpa_bits(void)
+{
+ return rsvd_bits(kvm_host.maxphyaddr, 63);
+}
+
+/*
+ * The page table on the host is the shadow page table for the page
+ * table in the guest or an AMD nested guest; its MMU features
+ * completely follow the features in the guest.
+ */
+static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
+{
+	/* @amd adds a check on bit 8 of SPTEs, which KVM shouldn't use anyway. */
+ bool is_amd = true;
+ /* KVM doesn't use 2-level page tables for the shadow MMU. */
+ bool is_pse = false;
+ struct rsvd_bits_validate *shadow_zero_check;
+ int i;
+
+ WARN_ON_ONCE(context->root_role.level < PT32E_ROOT_LEVEL);
+
+ shadow_zero_check = &context->shadow_zero_check;
+ __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
+ context->root_role.level,
+ context->root_role.efer_nx,
+ guest_cpu_cap_has(vcpu, X86_FEATURE_GBPAGES),
+ is_pse, is_amd);
+
+ if (!shadow_me_mask)
+ return;
+
+ for (i = context->root_role.level; --i >= 0;) {
+ /*
+ * So far shadow_me_value is a constant during KVM's life
+ * time. Bits in shadow_me_value are allowed to be set.
+ * Bits in shadow_me_mask but not in shadow_me_value are
+ * not allowed to be set.
+ */
+ shadow_zero_check->rsvd_bits_mask[0][i] |= shadow_me_mask;
+ shadow_zero_check->rsvd_bits_mask[1][i] |= shadow_me_mask;
+ shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_value;
+ shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_value;
+ }
+
+}
+
+static inline bool boot_cpu_is_amd(void)
+{
+ WARN_ON_ONCE(!tdp_enabled);
+ return shadow_x_mask == 0;
+}
+
+/*
+ * The direct page table on the host uses as many MMU features as
+ * possible; however, KVM currently does not do execution-protection.
+ */
+static void reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
+{
+ struct rsvd_bits_validate *shadow_zero_check;
+ int i;
+
+ shadow_zero_check = &context->shadow_zero_check;
+
+ if (boot_cpu_is_amd())
+ __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
+ context->root_role.level, true,
+ boot_cpu_has(X86_FEATURE_GBPAGES),
+ false, true);
+ else
+ __reset_rsvds_bits_mask_ept(shadow_zero_check,
+ reserved_hpa_bits(), false,
+ max_huge_page_level);
+
+ if (!shadow_me_mask)
+ return;
+
+ for (i = context->root_role.level; --i >= 0;) {
+ shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
+ shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
+ }
+}
+
+/*
+ * Same as the comments in reset_shadow_zero_bits_mask(), except this
+ * is the shadow page table for an Intel nested guest.
+ */
+static void
+reset_ept_shadow_zero_bits_mask(struct kvm_mmu *context, bool execonly)
+{
+ __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
+ reserved_hpa_bits(), execonly,
+ max_huge_page_level);
+}
+
+#define BYTE_MASK(access) \
+ ((1 & (access) ? 2 : 0) | \
+ (2 & (access) ? 4 : 0) | \
+ (3 & (access) ? 8 : 0) | \
+ (4 & (access) ? 16 : 0) | \
+ (5 & (access) ? 32 : 0) | \
+ (6 & (access) ? 64 : 0) | \
+ (7 & (access) ? 128 : 0))
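+
+/*
+ * Worked example (a sketch, assuming ACC_EXEC_MASK == 1, ACC_WRITE_MASK == 2
+ * and ACC_USER_MASK == 4): bit N of BYTE_MASK(access) is set iff the UWX
+ * combination N shares a bit with @access, so BYTE_MASK(ACC_EXEC_MASK)
+ * evaluates to 0xaa, BYTE_MASK(ACC_WRITE_MASK) to 0xcc and
+ * BYTE_MASK(ACC_USER_MASK) to 0xf0.
+ */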
+
+
+static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept)
+{
+ unsigned byte;
+
+ const u8 x = BYTE_MASK(ACC_EXEC_MASK);
+ const u8 w = BYTE_MASK(ACC_WRITE_MASK);
+ const u8 u = BYTE_MASK(ACC_USER_MASK);
+
+ bool cr4_smep = is_cr4_smep(mmu);
+ bool cr4_smap = is_cr4_smap(mmu);
+ bool cr0_wp = is_cr0_wp(mmu);
+ bool efer_nx = is_efer_nx(mmu);
+
+ for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
+ unsigned pfec = byte << 1;
+
+ /*
+ * Each "*f" variable has a 1 bit for each UWX value
+ * that causes a fault with the given PFEC.
+ */
+
+ /* Faults from writes to non-writable pages */
+ u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0;
+ /* Faults from user mode accesses to supervisor pages */
+ u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0;
+ /* Faults from fetches of non-executable pages */
+ u8 ff = (pfec & PFERR_FETCH_MASK) ? (u8)~x : 0;
+ /* Faults from kernel mode fetches of user pages */
+ u8 smepf = 0;
+ /* Faults from kernel mode accesses of user pages */
+ u8 smapf = 0;
+
+ if (!ept) {
+ /* Faults from kernel mode accesses to user pages */
+ u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
+
+ /* Not really needed: !nx will cause pte.nx to fault */
+ if (!efer_nx)
+ ff = 0;
+
+ /* Allow supervisor writes if !cr0.wp */
+ if (!cr0_wp)
+ wf = (pfec & PFERR_USER_MASK) ? wf : 0;
+
+ /* Disallow supervisor fetches of user code if cr4.smep */
+ if (cr4_smep)
+ smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0;
+
+ /*
+ * SMAP: kernel-mode data accesses from user-mode
+ * mappings should fault. A fault is considered
+ * as a SMAP violation if all of the following
+ * conditions are true:
+ * - X86_CR4_SMAP is set in CR4
+ * - A user page is accessed
+ * - The access is not a fetch
+ * - The access is supervisor mode
+ * - If implicit supervisor access or X86_EFLAGS_AC is clear
+ *
+ * Here, we cover the first four conditions.
+ * The fifth is computed dynamically in permission_fault();
+ * PFERR_RSVD_MASK bit will be set in PFEC if the access is
+ * *not* subject to SMAP restrictions.
+ */
+ if (cr4_smap)
+ smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf;
+ }
+
+ mmu->permissions[byte] = ff | uf | wf | smepf | smapf;
+ }
+}
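+
+/*
+ * Sketch of how the table built above is consumed (simplified; the real
+ * lookup in permission_fault() also folds in SMAP and implicit-access
+ * state): the error code, shifted right by one, picks a byte, and the
+ * page's UWX access combination picks a bit within that byte, roughly:
+ *
+ *	fault = (mmu->permissions[pfec >> 1] >> pte_access) & 1;
+ */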
+
+/*
+* PKU is an additional mechanism by which paging controls access to
+* user-mode addresses based on the value in the PKRU register. Protection
+* key violations are reported through a bit in the page fault error code.
+* Unlike other bits of the error code, the PK bit is not known at the
+* call site of e.g. gva_to_gpa; it must be computed directly in
+* permission_fault based on two bits of PKRU, on some machine state (CR4,
+* CR0, EFER, CPL), and on other bits of the error code and the page tables.
+*
+* In particular the following conditions come from the error code, the
+* page tables and the machine state:
+* - PK is always zero unless CR4.PKE=1 and EFER.LMA=1
+* - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch)
+* - PK is always zero if U=0 in the page tables
+* - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access.
+*
+* The PKRU bitmask caches the result of these four conditions. The error
+* code (minus the P bit) and the page table's U bit form an index into the
+* PKRU bitmask. Two bits of the PKRU bitmask are then extracted and ANDed
+* with the two bits of the PKRU register corresponding to the protection key.
+* For the first three conditions above the bits will be 00, thus masking
+* away both AD and WD. For all reads, or if the last condition holds, only
+* WD will be masked away.
+*/
+static void update_pkru_bitmask(struct kvm_mmu *mmu)
+{
+ unsigned bit;
+ bool wp;
+
+ mmu->pkru_mask = 0;
+
+ if (!is_cr4_pke(mmu))
+ return;
+
+ wp = is_cr0_wp(mmu);
+
+ for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
+ unsigned pfec, pkey_bits;
+ bool check_pkey, check_write, ff, uf, wf, pte_user;
+
+ pfec = bit << 1;
+ ff = pfec & PFERR_FETCH_MASK;
+ uf = pfec & PFERR_USER_MASK;
+ wf = pfec & PFERR_WRITE_MASK;
+
+ /* PFEC.RSVD is replaced by ACC_USER_MASK. */
+ pte_user = pfec & PFERR_RSVD_MASK;
+
+ /*
+ * Only need to check the access which is not an
+ * instruction fetch and is to a user page.
+ */
+ check_pkey = (!ff && pte_user);
+ /*
+ * write access is controlled by PKRU if it is a
+ * user access or CR0.WP = 1.
+ */
+ check_write = check_pkey && wf && (uf || wp);
+
+ /* PKRU.AD stops both read and write access. */
+ pkey_bits = !!check_pkey;
+ /* PKRU.WD stops write access. */
+ pkey_bits |= (!!check_write) << 1;
+
+ mmu->pkru_mask |= (pkey_bits & 3) << pfec;
+ }
+}
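+
+/*
+ * Worked example for the loop above: a supervisor write (PFEC.W=1, PFEC.U=0)
+ * to a user page (ACC_USER_MASK folded into the PFEC.RSVD position) with
+ * CR0.WP=1 gives ff=0 and pte_user=1, hence check_pkey=1 and check_write=1,
+ * i.e. pkey_bits=3 and both PKRU.AD and PKRU.WD are honored for that index.
+ */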
+
+static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu)
+{
+ if (!is_cr0_pg(mmu))
+ return;
+
+ reset_guest_rsvds_bits_mask(vcpu, mmu);
+ update_permission_bitmask(mmu, false);
+ update_pkru_bitmask(mmu);
+}
+
+static void paging64_init_context(struct kvm_mmu *context)
+{
+ context->page_fault = paging64_page_fault;
+ context->gva_to_gpa = paging64_gva_to_gpa;
+ context->sync_spte = paging64_sync_spte;
+}
+
+static void paging32_init_context(struct kvm_mmu *context)
+{
+ context->page_fault = paging32_page_fault;
+ context->gva_to_gpa = paging32_gva_to_gpa;
+ context->sync_spte = paging32_sync_spte;
+}
+
+static union kvm_cpu_role kvm_calc_cpu_role(struct kvm_vcpu *vcpu,
+ const struct kvm_mmu_role_regs *regs)
+{
+ union kvm_cpu_role role = {0};
+
+ role.base.access = ACC_ALL;
+ role.base.smm = is_smm(vcpu);
+ role.base.guest_mode = is_guest_mode(vcpu);
+ role.ext.valid = 1;
+
+ if (!____is_cr0_pg(regs)) {
+ role.base.direct = 1;
+ return role;
+ }
+
+ role.base.efer_nx = ____is_efer_nx(regs);
+ role.base.cr0_wp = ____is_cr0_wp(regs);
+ role.base.smep_andnot_wp = ____is_cr4_smep(regs) && !____is_cr0_wp(regs);
+ role.base.smap_andnot_wp = ____is_cr4_smap(regs) && !____is_cr0_wp(regs);
+ role.base.has_4_byte_gpte = !____is_cr4_pae(regs);
+
+ if (____is_efer_lma(regs))
+ role.base.level = ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL
+ : PT64_ROOT_4LEVEL;
+ else if (____is_cr4_pae(regs))
+ role.base.level = PT32E_ROOT_LEVEL;
+ else
+ role.base.level = PT32_ROOT_LEVEL;
+
+ role.ext.cr4_smep = ____is_cr4_smep(regs);
+ role.ext.cr4_smap = ____is_cr4_smap(regs);
+ role.ext.cr4_pse = ____is_cr4_pse(regs);
+
+ /* PKEY and LA57 are active iff long mode is active. */
+ role.ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs);
+ role.ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs);
+ role.ext.efer_lma = ____is_efer_lma(regs);
+ return role;
+}
+
+void __kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu)
+{
+ const bool cr0_wp = kvm_is_cr0_bit_set(vcpu, X86_CR0_WP);
+
+ BUILD_BUG_ON((KVM_MMU_CR0_ROLE_BITS & KVM_POSSIBLE_CR0_GUEST_BITS) != X86_CR0_WP);
+ BUILD_BUG_ON((KVM_MMU_CR4_ROLE_BITS & KVM_POSSIBLE_CR4_GUEST_BITS));
+
+ if (is_cr0_wp(mmu) == cr0_wp)
+ return;
+
+ mmu->cpu_role.base.cr0_wp = cr0_wp;
+ reset_guest_paging_metadata(vcpu, mmu);
+}
+
+static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
+{
+ int maxpa;
+
+ if (vcpu->kvm->arch.vm_type == KVM_X86_TDX_VM)
+ maxpa = cpuid_query_maxguestphyaddr(vcpu);
+ else
+ maxpa = cpuid_maxphyaddr(vcpu);
+
+ /* tdp_root_level is the architecture-forced level; use it if nonzero. */
+ if (tdp_root_level)
+ return tdp_root_level;
+
+ /* Use 5-level TDP if and only if it's useful/necessary. */
+ if (max_tdp_level == 5 && maxpa <= 48)
+ return 4;
+
+ return max_tdp_level;
+}
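+
+/*
+ * For example, with max_tdp_level == 5 and tdp_root_level == 0, a vCPU
+ * whose MAXPHYADDR is 46 gets a 4-level TDP tree, while a vCPU with a
+ * MAXPHYADDR of 52 gets the full 5-level tree.
+ */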
+
+u8 kvm_mmu_get_max_tdp_level(void)
+{
+ return tdp_root_level ? tdp_root_level : max_tdp_level;
+}
+
+static union kvm_mmu_page_role
+kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
+ union kvm_cpu_role cpu_role)
+{
+ union kvm_mmu_page_role role = {0};
+
+ role.access = ACC_ALL;
+ role.cr0_wp = true;
+ role.efer_nx = true;
+ role.smm = cpu_role.base.smm;
+ role.guest_mode = cpu_role.base.guest_mode;
+ role.ad_disabled = !kvm_ad_enabled;
+ role.level = kvm_mmu_get_tdp_level(vcpu);
+ role.direct = true;
+ role.has_4_byte_gpte = false;
+
+ return role;
+}
+
+static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
+ union kvm_cpu_role cpu_role)
+{
+ struct kvm_mmu *context = &vcpu->arch.root_mmu;
+ union kvm_mmu_page_role root_role = kvm_calc_tdp_mmu_root_page_role(vcpu, cpu_role);
+
+ if (cpu_role.as_u64 == context->cpu_role.as_u64 &&
+ root_role.word == context->root_role.word)
+ return;
+
+ context->cpu_role.as_u64 = cpu_role.as_u64;
+ context->root_role.word = root_role.word;
+ context->page_fault = kvm_tdp_page_fault;
+ context->sync_spte = NULL;
+ context->get_guest_pgd = get_guest_cr3;
+ context->get_pdptr = kvm_pdptr_read;
+ context->inject_page_fault = kvm_inject_page_fault;
+
+ if (!is_cr0_pg(context))
+ context->gva_to_gpa = nonpaging_gva_to_gpa;
+ else if (is_cr4_pae(context))
+ context->gva_to_gpa = paging64_gva_to_gpa;
+ else
+ context->gva_to_gpa = paging32_gva_to_gpa;
+
+ reset_guest_paging_metadata(vcpu, context);
+ reset_tdp_shadow_zero_bits_mask(context);
+}
+
+static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
+ union kvm_cpu_role cpu_role,
+ union kvm_mmu_page_role root_role)
+{
+ if (cpu_role.as_u64 == context->cpu_role.as_u64 &&
+ root_role.word == context->root_role.word)
+ return;
+
+ context->cpu_role.as_u64 = cpu_role.as_u64;
+ context->root_role.word = root_role.word;
+
+ if (!is_cr0_pg(context))
+ nonpaging_init_context(context);
+ else if (is_cr4_pae(context))
+ paging64_init_context(context);
+ else
+ paging32_init_context(context);
+
+ reset_guest_paging_metadata(vcpu, context);
+ reset_shadow_zero_bits_mask(vcpu, context);
+}
+
+static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
+ union kvm_cpu_role cpu_role)
+{
+ struct kvm_mmu *context = &vcpu->arch.root_mmu;
+ union kvm_mmu_page_role root_role;
+
+ root_role = cpu_role.base;
+
+ /* KVM uses PAE paging whenever the guest isn't using 64-bit paging. */
+ root_role.level = max_t(u32, root_role.level, PT32E_ROOT_LEVEL);
+
+ /*
+ * KVM forces EFER.NX=1 when TDP is disabled, reflect it in the MMU role.
+ * KVM uses NX when TDP is disabled to handle a variety of scenarios,
+ * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and
+ * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0.
+ * The iTLB multi-hit workaround can be toggled at any time, so assume
+ * NX can be used by any non-nested shadow MMU to avoid having to reset
+ * MMU contexts.
+ */
+ root_role.efer_nx = true;
+
+ shadow_mmu_init_context(vcpu, context, cpu_role, root_role);
+}
+
+void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
+ unsigned long cr4, u64 efer, gpa_t nested_cr3)
+{
+ struct kvm_mmu *context = &vcpu->arch.guest_mmu;
+ struct kvm_mmu_role_regs regs = {
+ .cr0 = cr0,
+ .cr4 = cr4 & ~X86_CR4_PKE,
+ .efer = efer,
+ };
+ union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs);
+ union kvm_mmu_page_role root_role;
+
+ /* NPT requires CR0.PG=1. */
+ WARN_ON_ONCE(cpu_role.base.direct || !cpu_role.base.guest_mode);
+
+ root_role = cpu_role.base;
+ root_role.level = kvm_mmu_get_tdp_level(vcpu);
+ if (root_role.level == PT64_ROOT_5LEVEL &&
+ cpu_role.base.level == PT64_ROOT_4LEVEL)
+ root_role.passthrough = 1;
+
+ shadow_mmu_init_context(vcpu, context, cpu_role, root_role);
+ kvm_mmu_new_pgd(vcpu, nested_cr3);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_init_shadow_npt_mmu);
+
+static union kvm_cpu_role
+kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
+ bool execonly, u8 level)
+{
+ union kvm_cpu_role role = {0};
+
+ /*
+ * KVM does not support SMM transfer monitors, and consequently does not
+ * support the "entry to SMM" control either. role.base.smm is always 0.
+ */
+ WARN_ON_ONCE(is_smm(vcpu));
+ role.base.level = level;
+ role.base.has_4_byte_gpte = false;
+ role.base.direct = false;
+ role.base.ad_disabled = !accessed_dirty;
+ role.base.guest_mode = true;
+ role.base.access = ACC_ALL;
+
+ role.ext.word = 0;
+ role.ext.execonly = execonly;
+ role.ext.valid = 1;
+
+ return role;
+}
+
+void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
+ int huge_page_level, bool accessed_dirty,
+ gpa_t new_eptp)
+{
+ struct kvm_mmu *context = &vcpu->arch.guest_mmu;
+ u8 level = vmx_eptp_page_walk_level(new_eptp);
+ union kvm_cpu_role new_mode =
+ kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
+ execonly, level);
+
+ if (new_mode.as_u64 != context->cpu_role.as_u64) {
+ /* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */
+ context->cpu_role.as_u64 = new_mode.as_u64;
+ context->root_role.word = new_mode.base.word;
+
+ context->page_fault = ept_page_fault;
+ context->gva_to_gpa = ept_gva_to_gpa;
+ context->sync_spte = ept_sync_spte;
+
+ update_permission_bitmask(context, true);
+ context->pkru_mask = 0;
+ reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level);
+ reset_ept_shadow_zero_bits_mask(context, execonly);
+ }
+
+ kvm_mmu_new_pgd(vcpu, new_eptp);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_init_shadow_ept_mmu);
+
+static void init_kvm_softmmu(struct kvm_vcpu *vcpu,
+ union kvm_cpu_role cpu_role)
+{
+ struct kvm_mmu *context = &vcpu->arch.root_mmu;
+
+ kvm_init_shadow_mmu(vcpu, cpu_role);
+
+ context->get_guest_pgd = get_guest_cr3;
+ context->get_pdptr = kvm_pdptr_read;
+ context->inject_page_fault = kvm_inject_page_fault;
+}
+
+static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu,
+ union kvm_cpu_role new_mode)
+{
+ struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
+
+ if (new_mode.as_u64 == g_context->cpu_role.as_u64)
+ return;
+
+ g_context->cpu_role.as_u64 = new_mode.as_u64;
+ g_context->get_guest_pgd = get_guest_cr3;
+ g_context->get_pdptr = kvm_pdptr_read;
+ g_context->inject_page_fault = kvm_inject_page_fault;
+
+ /*
+ * L2 page tables are never shadowed, so there is no need to sync
+ * SPTEs.
+ */
+ g_context->sync_spte = NULL;
+
+ /*
+ * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
+ * L1's nested page tables (e.g. EPT12). The nested translation
+ * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
+ * L2's page tables as the first level of translation and L1's
+ * nested page tables as the second level of translation. Basically
+ * the gva_to_gpa functions between mmu and nested_mmu are swapped.
+ */
+ if (!is_paging(vcpu))
+ g_context->gva_to_gpa = nonpaging_gva_to_gpa;
+ else if (is_long_mode(vcpu))
+ g_context->gva_to_gpa = paging64_gva_to_gpa;
+ else if (is_pae(vcpu))
+ g_context->gva_to_gpa = paging64_gva_to_gpa;
+ else
+ g_context->gva_to_gpa = paging32_gva_to_gpa;
+
+ reset_guest_paging_metadata(vcpu, g_context);
+}
+
+void kvm_init_mmu(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
+ union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs);
+
+ if (mmu_is_nested(vcpu))
+ init_kvm_nested_mmu(vcpu, cpu_role);
+ else if (tdp_enabled)
+ init_kvm_tdp_mmu(vcpu, cpu_role);
+ else
+ init_kvm_softmmu(vcpu, cpu_role);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_init_mmu);
+
+void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Invalidate all MMU roles to force them to reinitialize as CPUID
+ * information is factored into reserved bit calculations.
+ *
+ * Correctly handling multiple vCPU models (with respect to paging and
+ * physical address properties) in a single VM would require tracking
+ * all relevant CPUID information in kvm_mmu_page_role. That is very
+ * undesirable as it would increase the memory requirements for
+ * gfn_write_track (see struct kvm_mmu_page_role comments). For now
+ * that problem is swept under the rug; KVM's CPUID API is horrific and
+ * it's all but impossible to solve it without introducing a new API.
+ */
+ vcpu->arch.root_mmu.root_role.invalid = 1;
+ vcpu->arch.guest_mmu.root_role.invalid = 1;
+ vcpu->arch.nested_mmu.root_role.invalid = 1;
+ vcpu->arch.root_mmu.cpu_role.ext.valid = 0;
+ vcpu->arch.guest_mmu.cpu_role.ext.valid = 0;
+ vcpu->arch.nested_mmu.cpu_role.ext.valid = 0;
+ kvm_mmu_reset_context(vcpu);
+
+ /*
+ * Changing guest CPUID after KVM_RUN is forbidden, see the comment in
+ * kvm_arch_vcpu_ioctl().
+ */
+ KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm);
+}
+
+void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
+{
+ kvm_mmu_unload(vcpu);
+ kvm_init_mmu(vcpu);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_reset_context);
+
+int kvm_mmu_load(struct kvm_vcpu *vcpu)
+{
+ int r;
+
+ r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->root_role.direct);
+ if (r)
+ goto out;
+ r = mmu_alloc_special_roots(vcpu);
+ if (r)
+ goto out;
+ if (vcpu->arch.mmu->root_role.direct)
+ r = mmu_alloc_direct_roots(vcpu);
+ else
+ r = mmu_alloc_shadow_roots(vcpu);
+ if (r)
+ goto out;
+
+ kvm_mmu_sync_roots(vcpu);
+
+ kvm_mmu_load_pgd(vcpu);
+
+ /*
+ * Flush any TLB entries for the new root, as the provenance of the root
+ * is unknown. Even if KVM ensures there are no stale TLB entries
+ * for a freed root, in theory another hypervisor could have left
+ * stale entries. Flushing on alloc also allows KVM to skip the TLB
+ * flush when freeing a root (see kvm_tdp_mmu_put_root()).
+ */
+ kvm_x86_call(flush_tlb_current)(vcpu);
+out:
+ return r;
+}
+
+void kvm_mmu_unload(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ kvm_mmu_free_roots(kvm, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
+ WARN_ON_ONCE(VALID_PAGE(vcpu->arch.root_mmu.root.hpa));
+ kvm_mmu_free_roots(kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
+ WARN_ON_ONCE(VALID_PAGE(vcpu->arch.guest_mmu.root.hpa));
+ vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
+}
+
+static bool is_obsolete_root(struct kvm *kvm, hpa_t root_hpa)
+{
+ struct kvm_mmu_page *sp;
+
+ if (!VALID_PAGE(root_hpa))
+ return false;
+
+ /*
+ * When freeing obsolete roots, treat roots as obsolete if they don't
+ * have an associated shadow page, as it's impossible to determine if
+ * such roots are fresh or stale. This does mean KVM will get false
+ * positives and free roots that don't strictly need to be freed, but
+ * such false positives are relatively rare:
+ *
+ * (a) only PAE paging and nested NPT have roots without shadow pages
+ * (or any shadow paging flavor with a dummy root, see note below)
+ * (b) remote reloads due to a memslot update obsoletes _all_ roots
+ * (c) KVM doesn't track previous roots for PAE paging, and the guest
+ * is unlikely to zap an in-use PGD.
+ *
+ * Note! Dummy roots are unique in that they are obsoleted by memslot
+ * _creation_! See also FNAME(fetch).
+ */
+ sp = root_to_sp(root_hpa);
+ return !sp || is_obsolete_sp(kvm, sp);
+}
+
+static void __kvm_mmu_free_obsolete_roots(struct kvm *kvm, struct kvm_mmu *mmu)
+{
+ unsigned long roots_to_free = 0;
+ int i;
+
+ if (is_obsolete_root(kvm, mmu->root.hpa))
+ roots_to_free |= KVM_MMU_ROOT_CURRENT;
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+ if (is_obsolete_root(kvm, mmu->prev_roots[i].hpa))
+ roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+ }
+
+ if (roots_to_free)
+ kvm_mmu_free_roots(kvm, mmu, roots_to_free);
+}
+
+void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu)
+{
+ __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.root_mmu);
+ __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_free_obsolete_roots);
+
+static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
+ int *bytes)
+{
+ u64 gentry = 0;
+ int r;
+
+ /*
+ * Assume that the pte write is on a page table of the same type
+ * as the current vCPU paging mode, since we update the SPTEs only
+ * when they have the same mode.
+ */
+ if (is_pae(vcpu) && *bytes == 4) {
+ /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
+ *gpa &= ~(gpa_t)7;
+ *bytes = 8;
+ }
+
+ if (*bytes == 4 || *bytes == 8) {
+ r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes);
+ if (r)
+ gentry = 0;
+ }
+
+ return gentry;
+}
+
+/*
+ * If we're seeing too many writes to a page, it may no longer be a page table,
+ * or we may be forking, in which case it is better to unmap the page.
+ */
+static bool detect_write_flooding(struct kvm_mmu_page *sp)
+{
+ /*
+ * Skip write-flooding detection for SPs whose level is 1, because
+ * they can become unsync, in which case the guest page is not
+ * write-protected.
+ */
+ if (sp->role.level == PG_LEVEL_4K)
+ return false;
+
+ atomic_inc(&sp->write_flooding_count);
+ return atomic_read(&sp->write_flooding_count) >= 3;
+}
+
+/*
+ * Misaligned accesses are too much trouble to fix up; also, they usually
+ * indicate a page is not used as a page table.
+ */
+static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
+ int bytes)
+{
+ unsigned offset, pte_size, misaligned;
+
+ offset = offset_in_page(gpa);
+ pte_size = sp->role.has_4_byte_gpte ? 4 : 8;
+
+ /*
+ * Sometimes the OS only writes the last byte to update status
+ * bits; for example, in Linux, the andb instruction is used in clear_bit().
+ */
+ if (!(offset & (pte_size - 1)) && bytes == 1)
+ return false;
+
+ misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
+ misaligned |= bytes < 4;
+
+ return misaligned;
+}
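+
+/*
+ * Example of the check above: with 8-byte gptes, a 4-byte write at page
+ * offset 6 straddles two gptes (6 ^ 9 has bits set above bit 2) and is
+ * treated as misaligned, whereas a 4-byte write at offset 4 stays within a
+ * single gpte and is not.
+ */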
+
+static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
+{
+ unsigned page_offset, quadrant;
+ u64 *spte;
+ int level;
+
+ page_offset = offset_in_page(gpa);
+ level = sp->role.level;
+ *nspte = 1;
+ if (sp->role.has_4_byte_gpte) {
+ page_offset <<= 1; /* 32->64 */
+ /*
+ * A 32-bit pde maps 4MB while the shadow pdes map
+ * only 2MB. So we need to double the offset again
+ * and zap two pdes instead of one.
+ */
+ if (level == PT32_ROOT_LEVEL) {
+ page_offset &= ~7; /* kill rounding error */
+ page_offset <<= 1;
+ *nspte = 2;
+ }
+ quadrant = page_offset >> PAGE_SHIFT;
+ page_offset &= ~PAGE_MASK;
+ if (quadrant != sp->role.quadrant)
+ return NULL;
+ }
+
+ spte = &sp->spt[page_offset / sizeof(*spte)];
+ return spte;
+}
+
+void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
+ int bytes)
+{
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ struct kvm_mmu_page *sp;
+ LIST_HEAD(invalid_list);
+ u64 entry, gentry, *spte;
+ int npte;
+ bool flush = false;
+
+ /*
+ * When emulating guest writes, ensure the written value is visible to
+ * any task that is handling page faults before checking whether or not
+ * KVM is shadowing a guest PTE. This ensures either KVM will create
+ * the correct SPTE in the page fault handler, or this task will see
+ * a non-zero indirect_shadow_pages. Pairs with the smp_mb() in
+ * account_shadowed().
+ */
+ smp_mb();
+ if (!vcpu->kvm->arch.indirect_shadow_pages)
+ return;
+
+ write_lock(&vcpu->kvm->mmu_lock);
+
+ gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
+
+ ++vcpu->kvm->stat.mmu_pte_write;
+
+ for_each_gfn_valid_sp_with_gptes(vcpu->kvm, sp, gfn) {
+ if (detect_write_misaligned(sp, gpa, bytes) ||
+ detect_write_flooding(sp)) {
+ kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
+ ++vcpu->kvm->stat.mmu_flooded;
+ continue;
+ }
+
+ spte = get_written_sptes(sp, gpa, &npte);
+ if (!spte)
+ continue;
+
+ while (npte--) {
+ entry = *spte;
+ mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL);
+ if (gentry && sp->role.level != PG_LEVEL_4K)
+ ++vcpu->kvm->stat.mmu_pde_zapped;
+ if (is_shadow_present_pte(entry))
+ flush = true;
+ ++spte;
+ }
+ }
+ kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
+ write_unlock(&vcpu->kvm->mmu_lock);
+}
+
+static bool is_write_to_guest_page_table(u64 error_code)
+{
+ const u64 mask = PFERR_GUEST_PAGE_MASK | PFERR_WRITE_MASK | PFERR_PRESENT_MASK;
+
+ return (error_code & mask) == mask;
+}
+
+static int kvm_mmu_write_protect_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ u64 error_code, int *emulation_type)
+{
+ bool direct = vcpu->arch.mmu->root_role.direct;
+
+ /*
+ * Do not try to unprotect and retry if the vCPU re-faulted on the same
+ * RIP with the same address that was previously unprotected, as doing
+ * so will likely put the vCPU into an infinite loop. E.g. if the vCPU uses
+ * a non-page-table modifying instruction on the PDE that points to the
+ * instruction, then unprotecting the gfn will unmap the instruction's
+ * code, i.e. make it impossible for the instruction to ever complete.
+ */
+ if (vcpu->arch.last_retry_eip == kvm_rip_read(vcpu) &&
+ vcpu->arch.last_retry_addr == cr2_or_gpa)
+ return RET_PF_EMULATE;
+
+ /*
+ * Reset the unprotect+retry values that guard against infinite loops.
+ * The values will be refreshed if KVM explicitly unprotects a gfn and
+ * retries, in all other cases it's safe to retry in the future even if
+ * the next page fault happens on the same RIP+address.
+ */
+ vcpu->arch.last_retry_eip = 0;
+ vcpu->arch.last_retry_addr = 0;
+
+ /*
+ * It should be impossible to reach this point with an MMIO cache hit,
+ * as RET_PF_WRITE_PROTECTED is returned if and only if there's a valid,
+ * writable memslot, and creating a memslot should invalidate the MMIO
+ * cache by way of changing the memslot generation. WARN and disallow
+ * retry if MMIO is detected, as retrying MMIO emulation is pointless
+ * and could put the vCPU into an infinite loop because the processor
+ * will keep faulting on the non-existent MMIO address.
+ */
+ if (WARN_ON_ONCE(mmio_info_in_cache(vcpu, cr2_or_gpa, direct)))
+ return RET_PF_EMULATE;
+
+ /*
+ * Before emulating the instruction, check to see if the access was due
+ * to a read-only violation while the CPU was walking non-nested NPT
+ * page tables, i.e. for a direct MMU, for _guest_ page tables in L1.
+ * If L1 is sharing (a subset of) its page tables with L2, e.g. by
+ * having nCR3 share lower level page tables with hCR3, then when KVM
+ * (L0) write-protects the nested NPTs, i.e. npt12 entries, KVM is also
+ * unknowingly write-protecting L1's guest page tables, which KVM isn't
+ * shadowing.
+ *
+ * Because the CPU (by default) walks NPT page tables using a write
+ * access (to ensure the CPU can do A/D updates), page walks in L1 can
+ * trigger write faults for the above case even when L1 isn't modifying
+ * PTEs. As a result, KVM will unnecessarily emulate (or at least, try
+ * to emulate) an excessive number of L1 instructions; because L1's MMU
+ * isn't shadowed by KVM, there is no need to write-protect L1's gPTEs
+ * and thus no need to emulate in order to guarantee forward progress.
+ *
+ * Try to unprotect the gfn, i.e. zap any shadow pages, so that L1 can
+ * proceed without triggering emulation. If one or more shadow pages
+ * was zapped, skip emulation and resume L1 to let it natively execute
+ * the instruction. If no shadow pages were zapped, then the write-
+ * fault is due to something else entirely, i.e. KVM needs to emulate,
+ * as resuming the guest will put it into an infinite loop.
+ *
+ * Note, this code also applies to Intel CPUs, even though it is *very*
+ * unlikely that an L1 will share its page tables (IA32/PAE/paging64
+ * format) with L2's page tables (EPT format).
+ *
+ * For indirect MMUs, i.e. if KVM is shadowing the current MMU, try to
+ * unprotect the gfn and retry if an event is awaiting reinjection. If
+ * KVM emulates multiple instructions before completing event injection,
+ * the event could be delayed beyond what is architecturally allowed,
+ * e.g. KVM could inject an IRQ after the TPR has been raised.
+ */
+ if (((direct && is_write_to_guest_page_table(error_code)) ||
+ (!direct && kvm_event_needs_reinjection(vcpu))) &&
+ kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa))
+ return RET_PF_RETRY;
+
+ /*
+ * The gfn is write-protected, but if KVM detects it's emulating an
+ * instruction that is unlikely to be used to modify page tables, or if
+ * emulation fails, KVM can try to unprotect the gfn and let the CPU
+ * re-execute the instruction that caused the page fault. Do not allow
+ * retrying an instruction from a nested guest as KVM is only explicitly
+ * shadowing L1's page tables, i.e. unprotecting something for L1 isn't
+ * going to magically fix whatever issue caused L2 to fail.
+ */
+ if (!is_guest_mode(vcpu))
+ *emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
+
+ return RET_PF_EMULATE;
+}
+
+int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
+ void *insn, int insn_len)
+{
+ int r, emulation_type = EMULTYPE_PF;
+ bool direct = vcpu->arch.mmu->root_role.direct;
+
+ if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
+ return RET_PF_RETRY;
+
+ /*
+ * Except for reserved faults (emulated MMIO is shared-only), set the
+ * PFERR_PRIVATE_ACCESS flag for software-protected VMs based on the gfn's
+ * current attributes, which are the source of truth for such VMs. Note,
+ * this is wrong for nested MMUs as the GPA is an L2 GPA, but KVM doesn't
+ * currently support nested virtualization (among many other things)
+ * for software-protected VMs.
+ */
+ if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) &&
+ !(error_code & PFERR_RSVD_MASK) &&
+ vcpu->kvm->arch.vm_type == KVM_X86_SW_PROTECTED_VM &&
+ kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(cr2_or_gpa)))
+ error_code |= PFERR_PRIVATE_ACCESS;
+
+ r = RET_PF_INVALID;
+ if (unlikely(error_code & PFERR_RSVD_MASK)) {
+ if (WARN_ON_ONCE(error_code & PFERR_PRIVATE_ACCESS))
+ return -EFAULT;
+
+ r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
+ if (r == RET_PF_EMULATE)
+ goto emulate;
+ }
+
+ if (r == RET_PF_INVALID) {
+ vcpu->stat.pf_taken++;
+
+ r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false,
+ &emulation_type, NULL);
+ if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
+ return -EIO;
+ }
+
+ if (r < 0)
+ return r;
+
+ if (r == RET_PF_WRITE_PROTECTED)
+ r = kvm_mmu_write_protect_fault(vcpu, cr2_or_gpa, error_code,
+ &emulation_type);
+
+ if (r == RET_PF_FIXED)
+ vcpu->stat.pf_fixed++;
+ else if (r == RET_PF_EMULATE)
+ vcpu->stat.pf_emulate++;
+ else if (r == RET_PF_SPURIOUS)
+ vcpu->stat.pf_spurious++;
+
+ /*
+ * None of handle_mmio_page_fault(), kvm_mmu_do_page_fault(), or
+ * kvm_mmu_write_protect_fault() return RET_PF_CONTINUE.
+ * kvm_mmu_do_page_fault() only uses RET_PF_CONTINUE internally to
+ * indicate continuing the page fault handling until the final
+ * page table mapping phase.
+ */
+ WARN_ON_ONCE(r == RET_PF_CONTINUE);
+ if (r != RET_PF_EMULATE)
+ return r;
+
+emulate:
+ return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
+ insn_len);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_page_fault);
+
+void kvm_mmu_print_sptes(struct kvm_vcpu *vcpu, gpa_t gpa, const char *msg)
+{
+ u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
+ int root_level, leaf, level;
+
+ leaf = get_sptes_lockless(vcpu, gpa, sptes, &root_level);
+ if (unlikely(leaf < 0))
+ return;
+
+ pr_err("%s %llx", msg, gpa);
+ for (level = root_level; level >= leaf; level--)
+ pr_cont(", spte[%d] = 0x%llx", level, sptes[level]);
+ pr_cont("\n");
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_print_sptes);
+
+static void __kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ u64 addr, hpa_t root_hpa)
+{
+ struct kvm_shadow_walk_iterator iterator;
+
+ vcpu_clear_mmio_info(vcpu, addr);
+
+ /*
+ * Walking and synchronizing SPTEs both assume they are operating in
+ * the context of the current MMU, and would need to be reworked if
+ * this is ever used to sync the guest_mmu, e.g. to emulate INVEPT.
+ */
+ if (WARN_ON_ONCE(mmu != vcpu->arch.mmu))
+ return;
+
+ if (!VALID_PAGE(root_hpa))
+ return;
+
+ write_lock(&vcpu->kvm->mmu_lock);
+ for_each_shadow_entry_using_root(vcpu, root_hpa, addr, iterator) {
+ struct kvm_mmu_page *sp = sptep_to_sp(iterator.sptep);
+
+ if (sp->unsync) {
+ int ret = kvm_sync_spte(vcpu, sp, iterator.index);
+
+ if (ret < 0)
+ mmu_page_zap_pte(vcpu->kvm, sp, iterator.sptep, NULL);
+ if (ret)
+ kvm_flush_remote_tlbs_sptep(vcpu->kvm, iterator.sptep);
+ }
+
+ if (!sp->unsync_children)
+ break;
+ }
+ write_unlock(&vcpu->kvm->mmu_lock);
+}
+
+void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ u64 addr, unsigned long roots)
+{
+ int i;
+
+ WARN_ON_ONCE(roots & ~KVM_MMU_ROOTS_ALL);
+
+ /* It's actually a GPA for vcpu->arch.guest_mmu. */
+ if (mmu != &vcpu->arch.guest_mmu) {
+ /* INVLPG on a non-canonical address is a NOP according to the SDM. */
+ if (is_noncanonical_invlpg_address(addr, vcpu))
+ return;
+
+ kvm_x86_call(flush_tlb_gva)(vcpu, addr);
+ }
+
+ if (!mmu->sync_spte)
+ return;
+
+ if (roots & KVM_MMU_ROOT_CURRENT)
+ __kvm_mmu_invalidate_addr(vcpu, mmu, addr, mmu->root.hpa);
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+ if (roots & KVM_MMU_ROOT_PREVIOUS(i))
+ __kvm_mmu_invalidate_addr(vcpu, mmu, addr, mmu->prev_roots[i].hpa);
+ }
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_invalidate_addr);
+
+void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ /*
+ * INVLPG is required to invalidate any global mappings for the VA,
+ * irrespective of PCID. Blindly sync all roots as it would take
+ * roughly the same amount of work/time to determine whether any of the
+ * previous roots have a global mapping.
+ *
+ * Mappings not reachable via the current or previous cached roots will
+ * be synced when switching to that new cr3, so nothing needs to be
+ * done here for them.
+ */
+ kvm_mmu_invalidate_addr(vcpu, vcpu->arch.walk_mmu, gva, KVM_MMU_ROOTS_ALL);
+ ++vcpu->stat.invlpg;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_invlpg);
+
+
+void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
+{
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ unsigned long roots = 0;
+ uint i;
+
+ if (pcid == kvm_get_active_pcid(vcpu))
+ roots |= KVM_MMU_ROOT_CURRENT;
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+ if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
+ pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd))
+ roots |= KVM_MMU_ROOT_PREVIOUS(i);
+ }
+
+ if (roots)
+ kvm_mmu_invalidate_addr(vcpu, mmu, gva, roots);
+ ++vcpu->stat.invlpg;
+
+ /*
+ * Mappings not reachable via the current cr3 or the prev_roots will be
+ * synced when switching to that cr3, so nothing needs to be done here
+ * for them.
+ */
+}
+
+void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
+ int tdp_max_root_level, int tdp_huge_page_level)
+{
+ tdp_enabled = enable_tdp;
+ tdp_root_level = tdp_forced_root_level;
+ max_tdp_level = tdp_max_root_level;
+
+#ifdef CONFIG_X86_64
+ tdp_mmu_enabled = tdp_mmu_allowed && tdp_enabled;
+#endif
+ /*
+ * max_huge_page_level reflects KVM's MMU capabilities irrespective
+ * of kernel support, e.g. KVM may be capable of using 1GB pages when
+ * the kernel is not. But, KVM never creates a page size greater than
+ * what is used by the kernel for any given HVA, i.e. the kernel's
+ * capabilities are ultimately consulted by kvm_mmu_hugepage_adjust().
+ */
+ if (tdp_enabled)
+ max_huge_page_level = tdp_huge_page_level;
+ else if (boot_cpu_has(X86_FEATURE_GBPAGES))
+ max_huge_page_level = PG_LEVEL_1G;
+ else
+ max_huge_page_level = PG_LEVEL_2M;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_configure_mmu);
+
+static void free_mmu_pages(struct kvm_mmu *mmu)
+{
+ if (!tdp_enabled && mmu->pae_root)
+ set_memory_encrypted((unsigned long)mmu->pae_root, 1);
+ free_page((unsigned long)mmu->pae_root);
+ free_page((unsigned long)mmu->pml4_root);
+ free_page((unsigned long)mmu->pml5_root);
+}
+
+static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+{
+ struct page *page;
+ int i;
+
+ mmu->root.hpa = INVALID_PAGE;
+ mmu->root.pgd = 0;
+ mmu->mirror_root_hpa = INVALID_PAGE;
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
+
+ /* vcpu->arch.guest_mmu isn't used when !tdp_enabled. */
+ if (!tdp_enabled && mmu == &vcpu->arch.guest_mmu)
+ return 0;
+
+ /*
+ * When using PAE paging, the four PDPTEs are treated as 'root' pages,
+ * while the PDP table is a per-vCPU construct that's allocated at MMU
+ * creation. When emulating 32-bit mode, cr3 is only 32 bits even on
+ * x86_64. Therefore we need to allocate the PDP table in the first
+ * 4GB of memory, which happens to fit the DMA32 zone. TDP paging
+ * generally doesn't use PAE paging and can skip allocating the PDP
+ * table. The main exception, handled here, is SVM's 32-bit NPT. The
+ * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit
+ * KVM; that horror is handled on-demand by mmu_alloc_special_roots().
+ */
+ if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
+ return 0;
+
+ page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
+ if (!page)
+ return -ENOMEM;
+
+ mmu->pae_root = page_address(page);
+
+ /*
+ * CR3 is only 32 bits when PAE paging is used, thus it's impossible to
+ * get the CPU to treat the PDPTEs as encrypted. Decrypt the page so
+ * that KVM's writes and the CPU's reads get along. Note, this is
+ * only necessary when using shadow paging, as 64-bit NPT can get at
+ * the C-bit even when shadowing 32-bit NPT, and SME isn't supported
+ * by 32-bit kernels (when KVM itself uses 32-bit NPT).
+ */
+ if (!tdp_enabled)
+ set_memory_decrypted((unsigned long)mmu->pae_root, 1);
+ else
+ WARN_ON_ONCE(shadow_me_value);
+
+ for (i = 0; i < 4; ++i)
+ mmu->pae_root[i] = INVALID_PAE_ROOT;
+
+ return 0;
+}
+
+int kvm_mmu_create(struct kvm_vcpu *vcpu)
+{
+ int ret;
+
+ vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache;
+ vcpu->arch.mmu_pte_list_desc_cache.gfp_zero = __GFP_ZERO;
+
+ vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache;
+ vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
+
+ vcpu->arch.mmu_shadow_page_cache.init_value =
+ SHADOW_NONPRESENT_VALUE;
+ if (!vcpu->arch.mmu_shadow_page_cache.init_value)
+ vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
+
+ vcpu->arch.mmu = &vcpu->arch.root_mmu;
+ vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
+
+ ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu);
+ if (ret)
+ return ret;
+
+ ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu);
+ if (ret)
+ goto fail_allocate_root;
+
+ return ret;
+ fail_allocate_root:
+ free_mmu_pages(&vcpu->arch.guest_mmu);
+ return ret;
+}
+
+#define BATCH_ZAP_PAGES 10
+static void kvm_zap_obsolete_pages(struct kvm *kvm)
+{
+ struct kvm_mmu_page *sp, *node;
+ int nr_zapped, batch = 0;
+ LIST_HEAD(invalid_list);
+ bool unstable;
+
+ lockdep_assert_held(&kvm->slots_lock);
+
+restart:
+ list_for_each_entry_safe_reverse(sp, node,
+ &kvm->arch.active_mmu_pages, link) {
+ /*
+ * No obsolete valid page exists before a newly created page
+ * since active_mmu_pages is a FIFO list.
+ */
+ if (!is_obsolete_sp(kvm, sp))
+ break;
+
+ /*
+ * Invalid pages should never land back on the list of active
+ * pages. Skip the bogus page, otherwise we'll get stuck in an
+ * infinite loop if the page gets put back on the list (again).
+ */
+ if (WARN_ON_ONCE(sp->role.invalid))
+ continue;
+
+ /*
+ * No need to flush the TLB since we're only zapping shadow
+ * pages with an obsolete generation number and all vCPUS have
+ * loaded a new root, i.e. the shadow pages being zapped cannot
+ * be in active use by the guest.
+ */
+ if (batch >= BATCH_ZAP_PAGES &&
+ cond_resched_rwlock_write(&kvm->mmu_lock)) {
+ batch = 0;
+ goto restart;
+ }
+
+ unstable = __kvm_mmu_prepare_zap_page(kvm, sp,
+ &invalid_list, &nr_zapped);
+ batch += nr_zapped;
+
+ if (unstable)
+ goto restart;
+ }
+
+ /*
+ * Kick all vCPUs (via remote TLB flush) before freeing the page tables
+ * to ensure KVM is not in the middle of a lockless shadow page table
+ * walk, which may reference the pages. The remote TLB flush itself is
+ * not required and is simply a convenient way to kick vCPUs as needed.
+ * KVM performs a local TLB flush when allocating a new root (see
+ * kvm_mmu_load()), and the reload in the caller ensures no vCPUs are
+ * running with an obsolete MMU.
+ */
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+}
+
+/*
+ * Fast-invalidate all shadow pages and use a lock-break technique
+ * to zap obsolete pages.
+ *
+ * This is required when a memslot is being deleted or the VM is being
+ * destroyed; in these cases, we must ensure that the KVM MMU does
+ * not use any resource of the being-deleted slot, or of any slot,
+ * after this function returns.
+ */
+static void kvm_mmu_zap_all_fast(struct kvm *kvm)
+{
+ lockdep_assert_held(&kvm->slots_lock);
+
+ write_lock(&kvm->mmu_lock);
+ trace_kvm_mmu_zap_all_fast(kvm);
+
+ /*
+ * Toggle mmu_valid_gen between '0' and '1'. Because slots_lock is
+ * held for the entire duration of zapping obsolete pages, it's
+ * impossible for there to be multiple invalid generations associated
+ * with *valid* shadow pages at any given time, i.e. there is exactly
+ * one valid generation and (at most) one invalid generation.
+ */
+ kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
+
+ /*
+ * In order to ensure all vCPUs drop their soon-to-be invalid roots,
+ * invalidating TDP MMU roots must be done while holding mmu_lock for
+ * write and in the same critical section as making the reload request,
+ * e.g. before kvm_zap_obsolete_pages() could drop mmu_lock and yield.
+ */
+ if (tdp_mmu_enabled) {
+ /*
+ * External page tables don't support fast zapping, therefore
+ * their mirrors must be invalidated separately by the caller.
+ */
+ kvm_tdp_mmu_invalidate_roots(kvm, KVM_DIRECT_ROOTS);
+ }
+
+ /*
+ * Notify all vCPUs to reload their shadow page tables and flush the
+ * TLB.  All vCPUs will then switch to the new shadow page tables with
+ * the new mmu_valid_gen.
+ *
+ * Note: this must be done under the protection of mmu_lock; otherwise,
+ * a vCPU could purge a shadow page but miss the TLB flush.
+ */
+ kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS);
+
+ kvm_zap_obsolete_pages(kvm);
+
+ write_unlock(&kvm->mmu_lock);
+
+ /*
+ * Zap the invalidated TDP MMU roots; all SPTEs must be dropped before
+ * returning to the caller, e.g. if the zap is in response to a memslot
+ * deletion, mmu_notifier callbacks will be unable to reach the SPTEs
+ * associated with the deleted memslot once the update completes.
+ * Deferring the zap until the final reference to the root is put would
+ * lead to use-after-free.
+ */
+ if (tdp_mmu_enabled)
+ kvm_tdp_mmu_zap_invalidated_roots(kvm, true);
+}
+
+int kvm_mmu_init_vm(struct kvm *kvm)
+{
+ int r, i;
+
+ kvm->arch.shadow_mmio_value = shadow_mmio_value;
+ INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+ for (i = 0; i < KVM_NR_MMU_TYPES; ++i)
+ INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages[i].pages);
+ spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
+
+ if (tdp_mmu_enabled) {
+ kvm_mmu_init_tdp_mmu(kvm);
+ } else {
+ r = kvm_mmu_alloc_page_hash(kvm);
+ if (r)
+ return r;
+ }
+
+ kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache;
+ kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO;
+
+ kvm->arch.split_shadow_page_cache.gfp_zero = __GFP_ZERO;
+
+ kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache;
+ kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO;
+ return 0;
+}
+
+static void mmu_free_vm_memory_caches(struct kvm *kvm)
+{
+ kvm_mmu_free_memory_cache(&kvm->arch.split_desc_cache);
+ kvm_mmu_free_memory_cache(&kvm->arch.split_page_header_cache);
+ kvm_mmu_free_memory_cache(&kvm->arch.split_shadow_page_cache);
+}
+
+void kvm_mmu_uninit_vm(struct kvm *kvm)
+{
+ kvfree(kvm->arch.mmu_page_hash);
+
+ if (tdp_mmu_enabled)
+ kvm_mmu_uninit_tdp_mmu(kvm);
+
+ mmu_free_vm_memory_caches(kvm);
+}
+
+static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
+{
+ const struct kvm_memory_slot *memslot;
+ struct kvm_memslots *slots;
+ struct kvm_memslot_iter iter;
+ bool flush = false;
+ gfn_t start, end;
+ int i;
+
+ if (!kvm_memslots_have_rmaps(kvm))
+ return flush;
+
+ for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
+ slots = __kvm_memslots(kvm, i);
+
+ kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) {
+ memslot = iter.slot;
+ start = max(gfn_start, memslot->base_gfn);
+ end = min(gfn_end, memslot->base_gfn + memslot->npages);
+ if (WARN_ON_ONCE(start >= end))
+ continue;
+
+ flush = __kvm_rmap_zap_gfn_range(kvm, memslot, start,
+ end, true, flush);
+ }
+ }
+
+ return flush;
+}
+
+/*
+ * Invalidate (zap) SPTEs that cover GFNs from gfn_start and up to gfn_end
+ * (not including it)
+ */
+void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
+{
+ bool flush;
+
+ if (WARN_ON_ONCE(gfn_end <= gfn_start))
+ return;
+
+ write_lock(&kvm->mmu_lock);
+
+ kvm_mmu_invalidate_begin(kvm);
+
+ kvm_mmu_invalidate_range_add(kvm, gfn_start, gfn_end);
+
+ flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end);
+
+ if (tdp_mmu_enabled)
+ flush = kvm_tdp_mmu_zap_leafs(kvm, gfn_start, gfn_end, flush);
+
+ if (flush)
+ kvm_flush_remote_tlbs_range(kvm, gfn_start, gfn_end - gfn_start);
+
+ kvm_mmu_invalidate_end(kvm);
+
+ write_unlock(&kvm->mmu_lock);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_zap_gfn_range);
+
+static bool slot_rmap_write_protect(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
+ const struct kvm_memory_slot *slot)
+{
+ return rmap_write_protect(rmap_head, false);
+}
+
+void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot,
+ int start_level)
+{
+ if (kvm_memslots_have_rmaps(kvm)) {
+ write_lock(&kvm->mmu_lock);
+ walk_slot_rmaps(kvm, memslot, slot_rmap_write_protect,
+ start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
+ write_unlock(&kvm->mmu_lock);
+ }
+
+ if (tdp_mmu_enabled) {
+ read_lock(&kvm->mmu_lock);
+ kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level);
+ read_unlock(&kvm->mmu_lock);
+ }
+}
+
+static inline bool need_topup(struct kvm_mmu_memory_cache *cache, int min)
+{
+ return kvm_mmu_memory_cache_nr_free_objects(cache) < min;
+}
+
+static bool need_topup_split_caches_or_resched(struct kvm *kvm)
+{
+ if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
+ return true;
+
+ /*
+ * In the worst case, SPLIT_DESC_CACHE_MIN_NR_OBJECTS descriptors are needed
+ * to split a single huge page. Calculating how many are actually needed
+ * is possible but not worth the complexity.
+ */
+ return need_topup(&kvm->arch.split_desc_cache, SPLIT_DESC_CACHE_MIN_NR_OBJECTS) ||
+ need_topup(&kvm->arch.split_page_header_cache, 1) ||
+ need_topup(&kvm->arch.split_shadow_page_cache, 1);
+}
+
+static int topup_split_caches(struct kvm *kvm)
+{
+ /*
+ * Allocating rmap list entries when splitting huge pages for nested
+ * MMUs is uncommon as KVM needs to use a list if and only if there is
+ * more than one rmap entry for a gfn, i.e. requires an L1 gfn to be
+ * aliased by multiple L2 gfns and/or from multiple nested roots with
+ * different roles. Aliasing gfns when using TDP is atypical for VMMs;
+ * a few gfns are often aliased during boot, e.g. when remapping BIOS,
+ * but aliasing rarely occurs post-boot or for many gfns. If there is
+ * only one rmap entry, rmap->val points directly at that one entry and
+ * doesn't need to allocate a list. Buffer the cache by the default
+ * capacity so that KVM doesn't have to drop mmu_lock to topup if KVM
+ * encounters an aliased gfn or two.
+ */
+ const int capacity = SPLIT_DESC_CACHE_MIN_NR_OBJECTS +
+ KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE;
+ int r;
+
+ lockdep_assert_held(&kvm->slots_lock);
+
+ r = __kvm_mmu_topup_memory_cache(&kvm->arch.split_desc_cache, capacity,
+ SPLIT_DESC_CACHE_MIN_NR_OBJECTS);
+ if (r)
+ return r;
+
+ r = kvm_mmu_topup_memory_cache(&kvm->arch.split_page_header_cache, 1);
+ if (r)
+ return r;
+
+ return kvm_mmu_topup_memory_cache(&kvm->arch.split_shadow_page_cache, 1);
+}
+
+static struct kvm_mmu_page *shadow_mmu_get_sp_for_split(struct kvm *kvm, u64 *huge_sptep)
+{
+ struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep);
+ struct shadow_page_caches caches = {};
+ union kvm_mmu_page_role role;
+ unsigned int access;
+ gfn_t gfn;
+
+ gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep));
+ access = kvm_mmu_page_get_access(huge_sp, spte_index(huge_sptep));
+
+ /*
+ * Note, huge page splitting always uses direct shadow pages, regardless
+ * of whether the huge page itself is mapped by a direct or indirect
+ * shadow page, since the huge page region itself is being directly
+ * mapped with smaller pages.
+ */
+ role = kvm_mmu_child_role(huge_sptep, /*direct=*/true, access);
+
+ /* Direct SPs do not require a shadowed_info_cache. */
+ caches.page_header_cache = &kvm->arch.split_page_header_cache;
+ caches.shadow_page_cache = &kvm->arch.split_shadow_page_cache;
+
+ /* Safe to pass NULL for vCPU since requesting a direct SP. */
+ return __kvm_mmu_get_shadow_page(kvm, NULL, &caches, gfn, role);
+}
+
+static void shadow_mmu_split_huge_page(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ u64 *huge_sptep)
+{
+ struct kvm_mmu_memory_cache *cache = &kvm->arch.split_desc_cache;
+ u64 huge_spte = READ_ONCE(*huge_sptep);
+ struct kvm_mmu_page *sp;
+ bool flush = false;
+ u64 *sptep, spte;
+ gfn_t gfn;
+ int index;
+
+ sp = shadow_mmu_get_sp_for_split(kvm, huge_sptep);
+
+ for (index = 0; index < SPTE_ENT_PER_PAGE; index++) {
+ sptep = &sp->spt[index];
+ gfn = kvm_mmu_page_get_gfn(sp, index);
+
+ /*
+ * The SP may already have populated SPTEs, e.g. if this huge
+ * page is aliased by multiple sptes with the same access
+ * permissions. These entries are guaranteed to map the same
+ * gfn-to-pfn translation since the SP is direct, so no need to
+ * modify them.
+ *
+ * However, if a given SPTE points to a lower level page table,
+ * that lower level page table may only be partially populated.
+ * Installing such SPTEs would effectively unmap a portion of the
+ * huge page. Unmapping guest memory always requires a TLB flush
+ * since a subsequent operation on the unmapped regions would
+ * fail to detect the need to flush.
+ */
+ if (is_shadow_present_pte(*sptep)) {
+ flush |= !is_last_spte(*sptep, sp->role.level);
+ continue;
+ }
+
+ spte = make_small_spte(kvm, huge_spte, sp->role, index);
+ mmu_spte_set(sptep, spte);
+ __rmap_add(kvm, cache, slot, sptep, gfn, sp->role.access);
+ }
+
+ __link_shadow_page(kvm, cache, huge_sptep, sp, flush);
+}
+
+static int shadow_mmu_try_split_huge_page(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ u64 *huge_sptep)
+{
+ struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep);
+ int level, r = 0;
+ gfn_t gfn;
+ u64 spte;
+
+ /* Grab information for the tracepoint before dropping the MMU lock. */
+ gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep));
+ level = huge_sp->role.level;
+ spte = *huge_sptep;
+
+ if (kvm_mmu_available_pages(kvm) <= KVM_MIN_FREE_MMU_PAGES) {
+ r = -ENOSPC;
+ goto out;
+ }
+
+ if (need_topup_split_caches_or_resched(kvm)) {
+ write_unlock(&kvm->mmu_lock);
+ cond_resched();
+ /*
+ * If the topup succeeds, return -EAGAIN to indicate that the
+ * rmap iterator should be restarted because the MMU lock was
+ * dropped.
+ */
+ r = topup_split_caches(kvm) ?: -EAGAIN;
+ write_lock(&kvm->mmu_lock);
+ goto out;
+ }
+
+ shadow_mmu_split_huge_page(kvm, slot, huge_sptep);
+
+out:
+ trace_kvm_mmu_split_huge_page(gfn, spte, level, r);
+ return r;
+}
+
+static bool shadow_mmu_try_split_huge_pages(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
+ const struct kvm_memory_slot *slot)
+{
+ struct rmap_iterator iter;
+ struct kvm_mmu_page *sp;
+ u64 *huge_sptep;
+ int r;
+
+restart:
+ for_each_rmap_spte(rmap_head, &iter, huge_sptep) {
+ sp = sptep_to_sp(huge_sptep);
+
+ /* TDP MMU is enabled, so rmap only contains nested MMU SPs. */
+ if (WARN_ON_ONCE(!sp->role.guest_mode))
+ continue;
+
+ /* The rmaps should never contain non-leaf SPTEs. */
+ if (WARN_ON_ONCE(!is_large_pte(*huge_sptep)))
+ continue;
+
+ /* SPs with level > PG_LEVEL_4K should never be unsync. */
+ if (WARN_ON_ONCE(sp->unsync))
+ continue;
+
+ /* Don't bother splitting huge pages on invalid SPs. */
+ if (sp->role.invalid)
+ continue;
+
+ r = shadow_mmu_try_split_huge_page(kvm, slot, huge_sptep);
+
+ /*
+ * The split succeeded or needs to be retried because the MMU
+ * lock was dropped. Either way, restart the iterator to get it
+ * back into a consistent state.
+ */
+ if (!r || r == -EAGAIN)
+ goto restart;
+
+ /* The split failed and shouldn't be retried (e.g. -ENOMEM). */
+ break;
+ }
+
+ return false;
+}
+
+static void kvm_shadow_mmu_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ gfn_t start, gfn_t end,
+ int target_level)
+{
+ int level;
+
+ /*
+ * Split huge pages starting with KVM_MAX_HUGEPAGE_LEVEL and working
+ * down to the target level. This ensures pages are recursively split
+ * all the way to the target level. There's no need to split pages
+ * already at the target level.
+ */
+ for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--)
+ __walk_slot_rmaps(kvm, slot, shadow_mmu_try_split_huge_pages,
+ level, level, start, end - 1, true, true, false);
+}
+
+/* Must be called with the mmu_lock held in write-mode. */
+void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot,
+ u64 start, u64 end,
+ int target_level)
+{
+ if (!tdp_mmu_enabled)
+ return;
+
+ if (kvm_memslots_have_rmaps(kvm))
+ kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level);
+
+ kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, false);
+
+ /*
+ * A TLB flush is unnecessary at this point for the same reasons as in
+ * kvm_mmu_slot_try_split_huge_pages().
+ */
+}
+
+void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot,
+ int target_level)
+{
+ u64 start = memslot->base_gfn;
+ u64 end = start + memslot->npages;
+
+ if (!tdp_mmu_enabled)
+ return;
+
+ if (kvm_memslots_have_rmaps(kvm)) {
+ write_lock(&kvm->mmu_lock);
+ kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level);
+ write_unlock(&kvm->mmu_lock);
+ }
+
+ read_lock(&kvm->mmu_lock);
+ kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true);
+ read_unlock(&kvm->mmu_lock);
+
+ /*
+ * No TLB flush is necessary here. KVM will flush TLBs after
+ * write-protecting and/or clearing dirty on the newly split SPTEs to
+ * ensure that guest writes are reflected in the dirty log before the
+ * ioctl to enable dirty logging on this memslot completes. Since the
+ * split SPTEs retain the write and dirty bits of the huge SPTE, it is
+ * safe for KVM to decide if a TLB flush is necessary based on the split
+ * SPTEs.
+ */
+}
+
+static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
+ const struct kvm_memory_slot *slot)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+ int need_tlb_flush = 0;
+ struct kvm_mmu_page *sp;
+
+restart:
+ for_each_rmap_spte(rmap_head, &iter, sptep) {
+ sp = sptep_to_sp(sptep);
+
+ /*
+ * We cannot do huge page mapping for indirect shadow pages,
+ * which are found on the last rmap (level = 1) when not using
+ * tdp; such shadow pages are synced with the page table in
+ * the guest, and the guest page table uses 4K pages if the
+ * indirect sp has level = 1.
+ */
+ if (sp->role.direct &&
+ sp->role.level < kvm_mmu_max_mapping_level(kvm, NULL, slot, sp->gfn)) {
+ kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
+
+ if (kvm_available_flush_remote_tlbs_range())
+ kvm_flush_remote_tlbs_sptep(kvm, sptep);
+ else
+ need_tlb_flush = 1;
+
+ goto restart;
+ }
+ }
+
+ return need_tlb_flush;
+}
+
+static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm,
+ const struct kvm_memory_slot *slot)
+{
+ /*
+ * Note, use KVM_MAX_HUGEPAGE_LEVEL - 1 since there's no need to zap
+ * pages that are already mapped at the maximum hugepage level.
+ */
+ if (walk_slot_rmaps(kvm, slot, kvm_mmu_zap_collapsible_spte,
+ PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - 1, true))
+ kvm_flush_remote_tlbs_memslot(kvm, slot);
+}
+
+void kvm_mmu_recover_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *slot)
+{
+ if (kvm_memslots_have_rmaps(kvm)) {
+ write_lock(&kvm->mmu_lock);
+ kvm_rmap_zap_collapsible_sptes(kvm, slot);
+ write_unlock(&kvm->mmu_lock);
+ }
+
+ if (tdp_mmu_enabled) {
+ read_lock(&kvm->mmu_lock);
+ kvm_tdp_mmu_recover_huge_pages(kvm, slot);
+ read_unlock(&kvm->mmu_lock);
+ }
+}
+
+void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot)
+{
+ if (kvm_memslots_have_rmaps(kvm)) {
+ write_lock(&kvm->mmu_lock);
+ /*
+ * Clear dirty bits only on 4k SPTEs since the legacy MMU only
+ * supports dirty logging at a 4K granularity.
+ */
+ walk_slot_rmaps_4k(kvm, memslot, __rmap_clear_dirty, false);
+ write_unlock(&kvm->mmu_lock);
+ }
+
+ if (tdp_mmu_enabled) {
+ read_lock(&kvm->mmu_lock);
+ kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
+ read_unlock(&kvm->mmu_lock);
+ }
+
+ /*
+ * The caller will flush the TLBs after this function returns.
+ *
+ * It's also safe to flush TLBs out of mmu lock here as currently this
+ * function is only used for dirty logging, in which case flushing TLB
+ * out of mmu lock also guarantees no dirty pages will be lost in
+ * dirty_bitmap.
+ */
+}
+
+static void kvm_mmu_zap_all(struct kvm *kvm)
+{
+ struct kvm_mmu_page *sp, *node;
+ LIST_HEAD(invalid_list);
+ int ign;
+
+ write_lock(&kvm->mmu_lock);
+restart:
+ list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
+ if (WARN_ON_ONCE(sp->role.invalid))
+ continue;
+ if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
+ goto restart;
+ if (cond_resched_rwlock_write(&kvm->mmu_lock))
+ goto restart;
+ }
+
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+
+ if (tdp_mmu_enabled)
+ kvm_tdp_mmu_zap_all(kvm);
+
+ write_unlock(&kvm->mmu_lock);
+}
+
+void kvm_arch_flush_shadow_all(struct kvm *kvm)
+{
+ kvm_mmu_zap_all(kvm);
+}
+
+static void kvm_mmu_zap_memslot_pages_and_flush(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ bool flush)
+{
+ LIST_HEAD(invalid_list);
+ unsigned long i;
+
+ if (list_empty(&kvm->arch.active_mmu_pages))
+ goto out_flush;
+
+ /*
+ * Since accounting information is stored in struct kvm_arch_memory_slot,
+ * all MMU pages that are shadowing guest PTEs must be zapped before the
+ * memslot is deleted, as freeing such pages after the memslot is freed
+ * will result in use-after-free, e.g. in unaccount_shadowed().
+ */
+ for (i = 0; i < slot->npages; i++) {
+ struct kvm_mmu_page *sp;
+ gfn_t gfn = slot->base_gfn + i;
+
+ for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn)
+ kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+
+ if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
+ kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
+ flush = false;
+ cond_resched_rwlock_write(&kvm->mmu_lock);
+ }
+ }
+
+out_flush:
+ kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
+}
+
+static void kvm_mmu_zap_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
+{
+ struct kvm_gfn_range range = {
+ .slot = slot,
+ .start = slot->base_gfn,
+ .end = slot->base_gfn + slot->npages,
+ .may_block = true,
+ .attr_filter = KVM_FILTER_PRIVATE | KVM_FILTER_SHARED,
+ };
+ bool flush;
+
+ write_lock(&kvm->mmu_lock);
+ flush = kvm_unmap_gfn_range(kvm, &range);
+ kvm_mmu_zap_memslot_pages_and_flush(kvm, slot, flush);
+ write_unlock(&kvm->mmu_lock);
+}
+
+static inline bool kvm_memslot_flush_zap_all(struct kvm *kvm)
+{
+ return kvm->arch.vm_type == KVM_X86_DEFAULT_VM &&
+ kvm_check_has_quirk(kvm, KVM_X86_QUIRK_SLOT_ZAP_ALL);
+}
+
+void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
+{
+ if (kvm_memslot_flush_zap_all(kvm))
+ kvm_mmu_zap_all_fast(kvm);
+ else
+ kvm_mmu_zap_memslot(kvm, slot);
+}
+
+void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
+{
+ WARN_ON_ONCE(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
+
+ if (!enable_mmio_caching)
+ return;
+
+ gen &= MMIO_SPTE_GEN_MASK;
+
+ /*
+ * Generation numbers are incremented in multiples of the number of
+ * address spaces in order to provide unique generations across all
+ * address spaces. Strip what is effectively the address space
+ * modifier prior to checking for a wrap of the MMIO generation so
+ * that a wrap in any address space is detected.
+ */
+ gen &= ~((u64)kvm_arch_nr_memslot_as_ids(kvm) - 1);
+
+ /*
+ * The very rare case: if the MMIO generation number has wrapped,
+ * zap all shadow pages.
+ */
+ if (unlikely(gen == 0)) {
+ kvm_debug_ratelimited("zapping shadow pages for mmio generation wraparound\n");
+ kvm_mmu_zap_all_fast(kvm);
+ }
+}
+
+static void mmu_destroy_caches(void)
+{
+ kmem_cache_destroy(pte_list_desc_cache);
+ kmem_cache_destroy(mmu_page_header_cache);
+}
+
+static void kvm_wake_nx_recovery_thread(struct kvm *kvm)
+{
+ /*
+ * The NX recovery thread is spawned on-demand at the first KVM_RUN and
+ * may not be valid even though the VM is globally visible. Do nothing,
+ * as such a VM can't have any possible NX huge pages.
+ */
+ struct vhost_task *nx_thread = READ_ONCE(kvm->arch.nx_huge_page_recovery_thread);
+
+ if (nx_thread)
+ vhost_task_wake(nx_thread);
+}
+
+static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp)
+{
+ if (nx_hugepage_mitigation_hard_disabled)
+ return sysfs_emit(buffer, "never\n");
+
+ return param_get_bool(buffer, kp);
+}
+
+static bool get_nx_auto_mode(void)
+{
+ /* Return true when CPU has the bug, and mitigations are ON */
+ return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
+}
+
+static void __set_nx_huge_pages(bool val)
+{
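+	/*
+	 * Keep the module param in sync with the iTLB multihit mitigation
+	 * state that is reported via the CPU vulnerabilities interface.
+	 */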
+ nx_huge_pages = itlb_multihit_kvm_mitigation = val;
+}
+
+static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
+{
+ bool old_val = nx_huge_pages;
+ bool new_val;
+
+ if (nx_hugepage_mitigation_hard_disabled)
+ return -EPERM;
+
+ /* In "auto" mode deploy workaround only if CPU has the bug. */
+ if (sysfs_streq(val, "off")) {
+ new_val = 0;
+ } else if (sysfs_streq(val, "force")) {
+ new_val = 1;
+ } else if (sysfs_streq(val, "auto")) {
+ new_val = get_nx_auto_mode();
+ } else if (sysfs_streq(val, "never")) {
+ new_val = 0;
+
+ mutex_lock(&kvm_lock);
+ if (!list_empty(&vm_list)) {
+ mutex_unlock(&kvm_lock);
+ return -EBUSY;
+ }
+ nx_hugepage_mitigation_hard_disabled = true;
+ mutex_unlock(&kvm_lock);
+ } else if (kstrtobool(val, &new_val) < 0) {
+ return -EINVAL;
+ }
+
+ __set_nx_huge_pages(new_val);
+
+ if (new_val != old_val) {
+ struct kvm *kvm;
+
+ mutex_lock(&kvm_lock);
+
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ mutex_lock(&kvm->slots_lock);
+ kvm_mmu_zap_all_fast(kvm);
+ mutex_unlock(&kvm->slots_lock);
+
+ kvm_wake_nx_recovery_thread(kvm);
+ }
+ mutex_unlock(&kvm_lock);
+ }
+
+ return 0;
+}
+
+/*
+ * nx_huge_pages needs to be resolved to true/false when kvm.ko is loaded, as
+ * its default value of -1 is technically undefined behavior for a boolean.
+ * Forward the module init call to SPTE code so that it too can handle module
+ * params that need to be resolved/snapshot.
+ */
+void __init kvm_mmu_x86_module_init(void)
+{
+ if (nx_huge_pages == -1)
+ __set_nx_huge_pages(get_nx_auto_mode());
+
+ /*
+ * Snapshot userspace's desire to enable the TDP MMU. Whether or not the
+ * TDP MMU is actually enabled is determined in kvm_configure_mmu()
+ * when the vendor module is loaded.
+ */
+ tdp_mmu_allowed = tdp_mmu_enabled;
+
+ kvm_mmu_spte_module_init();
+}
+
+/*
+ * The bulk of the MMU initialization is deferred until the vendor module is
+ * loaded as many of the masks/values may be modified by VMX or SVM, i.e. need
+ * to be reset when a potentially different vendor module is loaded.
+ */
+int kvm_mmu_vendor_module_init(void)
+{
+ int ret = -ENOMEM;
+
+ /*
+	 * MMU roles use union aliasing, which is, generally speaking, undefined
+	 * behavior. However, we know how the compilers of interest behave and
+	 * the status quo is unlikely to change. The build-time assertions below
+	 * will let us know if that assumption ever becomes false.
+ */
+ BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
+ BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
+ BUILD_BUG_ON(sizeof(union kvm_cpu_role) != sizeof(u64));
+
+ kvm_mmu_reset_all_pte_masks();
+
+ pte_list_desc_cache = KMEM_CACHE(pte_list_desc, SLAB_ACCOUNT);
+ if (!pte_list_desc_cache)
+ goto out;
+
+ mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
+ sizeof(struct kvm_mmu_page),
+ 0, SLAB_ACCOUNT, NULL);
+ if (!mmu_page_header_cache)
+ goto out;
+
+ return 0;
+
+out:
+ mmu_destroy_caches();
+ return ret;
+}
+
+void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
+{
+ kvm_mmu_unload(vcpu);
+ if (tdp_mmu_enabled) {
+ read_lock(&vcpu->kvm->mmu_lock);
+ mmu_free_root_page(vcpu->kvm, &vcpu->arch.mmu->mirror_root_hpa,
+ NULL);
+ read_unlock(&vcpu->kvm->mmu_lock);
+ }
+ free_mmu_pages(&vcpu->arch.root_mmu);
+ free_mmu_pages(&vcpu->arch.guest_mmu);
+ mmu_free_memory_caches(vcpu);
+}
+
+void kvm_mmu_vendor_module_exit(void)
+{
+ mmu_destroy_caches();
+}
+
+/*
+ * Calculate the effective recovery period, accounting for '0' meaning "let KVM
+ * select a halving time of 1 hour". Returns true if recovery is enabled.
+ */
+static bool calc_nx_huge_pages_recovery_period(uint *period)
+{
+ /*
+ * Use READ_ONCE to get the params, this may be called outside of the
+ * param setters, e.g. by the kthread to compute its next timeout.
+ */
+ bool enabled = READ_ONCE(nx_huge_pages);
+ uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
+
+ if (!enabled || !ratio)
+ return false;
+
+ *period = READ_ONCE(nx_huge_pages_recovery_period_ms);
+ if (!*period) {
+ /* Make sure the period is not less than one second. */
+ ratio = min(ratio, 3600u);
+ *period = 60 * 60 * 1000 / ratio;
+ }
+ return true;
+}
+
+static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp)
+{
+ bool was_recovery_enabled, is_recovery_enabled;
+ uint old_period, new_period;
+ int err;
+
+ if (nx_hugepage_mitigation_hard_disabled)
+ return -EPERM;
+
+ was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period);
+
+ err = param_set_uint(val, kp);
+ if (err)
+ return err;
+
+ is_recovery_enabled = calc_nx_huge_pages_recovery_period(&new_period);
+
+ if (is_recovery_enabled &&
+ (!was_recovery_enabled || old_period > new_period)) {
+ struct kvm *kvm;
+
+ mutex_lock(&kvm_lock);
+
+ list_for_each_entry(kvm, &vm_list, vm_list)
+ kvm_wake_nx_recovery_thread(kvm);
+
+ mutex_unlock(&kvm_lock);
+ }
+
+ return err;
+}
+
+static unsigned long nx_huge_pages_to_zap(struct kvm *kvm,
+ enum kvm_mmu_type mmu_type)
+{
+ unsigned long pages = READ_ONCE(kvm->arch.possible_nx_huge_pages[mmu_type].nr_pages);
+ unsigned int ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
+
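+	/* Zap 1/nx_huge_pages_recovery_ratio of the possible NX huge pages per pass. */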
+ return ratio ? DIV_ROUND_UP(pages, ratio) : 0;
+}
+
+static bool kvm_mmu_sp_dirty_logging_enabled(struct kvm *kvm,
+ struct kvm_mmu_page *sp)
+{
+ struct kvm_memory_slot *slot;
+
+ /*
+ * Skip the memslot lookup if dirty tracking can't possibly be enabled,
+ * as memslot lookups are relatively expensive.
+ *
+ * If a memslot update is in progress, reading an incorrect value of
+ * kvm->nr_memslots_dirty_logging is not a problem: if it is becoming
+ * zero, KVM will do an unnecessary memslot lookup; if it is becoming
+ * nonzero, the page will be zapped unnecessarily. Either way, this
+ * only affects efficiency in racy situations, and not correctness.
+ */
+ if (!atomic_read(&kvm->nr_memslots_dirty_logging))
+ return false;
+
+ slot = __gfn_to_memslot(kvm_memslots_for_spte_role(kvm, sp->role), sp->gfn);
+ if (WARN_ON_ONCE(!slot))
+ return false;
+
+ return kvm_slot_dirty_track_enabled(slot);
+}
+
+static void kvm_recover_nx_huge_pages(struct kvm *kvm,
+ const enum kvm_mmu_type mmu_type)
+{
+#ifdef CONFIG_X86_64
+ const bool is_tdp_mmu = mmu_type == KVM_TDP_MMU;
+ spinlock_t *tdp_mmu_pages_lock = &kvm->arch.tdp_mmu_pages_lock;
+#else
+ const bool is_tdp_mmu = false;
+ spinlock_t *tdp_mmu_pages_lock = NULL;
+#endif
+ unsigned long to_zap = nx_huge_pages_to_zap(kvm, mmu_type);
+ struct list_head *nx_huge_pages;
+ struct kvm_mmu_page *sp;
+ LIST_HEAD(invalid_list);
+ bool flush = false;
+ int rcu_idx;
+
+ nx_huge_pages = &kvm->arch.possible_nx_huge_pages[mmu_type].pages;
+
+ rcu_idx = srcu_read_lock(&kvm->srcu);
+ if (is_tdp_mmu)
+ read_lock(&kvm->mmu_lock);
+ else
+ write_lock(&kvm->mmu_lock);
+
+ /*
+ * Zapping TDP MMU shadow pages, including the remote TLB flush, must
+ * be done under RCU protection, because the pages are freed via RCU
+ * callback.
+ */
+ rcu_read_lock();
+
+ for ( ; to_zap; --to_zap) {
+ if (is_tdp_mmu)
+ spin_lock(tdp_mmu_pages_lock);
+
+ if (list_empty(nx_huge_pages)) {
+ if (is_tdp_mmu)
+ spin_unlock(tdp_mmu_pages_lock);
+ break;
+ }
+
+ /*
+ * We use a separate list instead of just using active_mmu_pages
+		 * because the number of shadow pages that can be replaced with
+		 * an NX huge page is expected to be relatively small compared
+		 * to the total number of shadow pages, and because the TDP MMU
+		 * doesn't use active_mmu_pages.
+ */
+ sp = list_first_entry(nx_huge_pages,
+ struct kvm_mmu_page,
+ possible_nx_huge_page_link);
+ WARN_ON_ONCE(!sp->nx_huge_page_disallowed);
+ WARN_ON_ONCE(!sp->role.direct);
+
+ unaccount_nx_huge_page(kvm, sp);
+
+ if (is_tdp_mmu)
+ spin_unlock(tdp_mmu_pages_lock);
+
+ /*
+ * Do not attempt to recover any NX Huge Pages that are being
+ * dirty tracked, as they would just be faulted back in as 4KiB
+ * pages. The NX Huge Pages in this slot will be recovered,
+ * along with all the other huge pages in the slot, when dirty
+ * logging is disabled.
+ */
+ if (!kvm_mmu_sp_dirty_logging_enabled(kvm, sp)) {
+ if (is_tdp_mmu)
+ flush |= kvm_tdp_mmu_zap_possible_nx_huge_page(kvm, sp);
+ else
+ kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+ }
+
+ WARN_ON_ONCE(sp->nx_huge_page_disallowed);
+
+ if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
+ kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
+ rcu_read_unlock();
+
+ if (is_tdp_mmu)
+ cond_resched_rwlock_read(&kvm->mmu_lock);
+ else
+ cond_resched_rwlock_write(&kvm->mmu_lock);
+
+ flush = false;
+ rcu_read_lock();
+ }
+ }
+ kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
+
+ rcu_read_unlock();
+
+ if (is_tdp_mmu)
+ read_unlock(&kvm->mmu_lock);
+ else
+ write_unlock(&kvm->mmu_lock);
+ srcu_read_unlock(&kvm->srcu, rcu_idx);
+}
+
+static void kvm_nx_huge_page_recovery_worker_kill(void *data)
+{
+}
+
+static bool kvm_nx_huge_page_recovery_worker(void *data)
+{
+ struct kvm *kvm = data;
+ long remaining_time;
+ bool enabled;
+ uint period;
+ int i;
+
+ enabled = calc_nx_huge_pages_recovery_period(&period);
+ if (!enabled)
+ return false;
+
+ remaining_time = kvm->arch.nx_huge_page_last + msecs_to_jiffies(period)
+ - get_jiffies_64();
+ if (remaining_time > 0) {
+ schedule_timeout(remaining_time);
+ /* check for signals and come back */
+ return true;
+ }
+
+ __set_current_state(TASK_RUNNING);
+ for (i = 0; i < KVM_NR_MMU_TYPES; ++i)
+ kvm_recover_nx_huge_pages(kvm, i);
+ kvm->arch.nx_huge_page_last = get_jiffies_64();
+ return true;
+}
+
+static int kvm_mmu_start_lpage_recovery(struct once *once)
+{
+ struct kvm_arch *ka = container_of(once, struct kvm_arch, nx_once);
+ struct kvm *kvm = container_of(ka, struct kvm, arch);
+ struct vhost_task *nx_thread;
+
+ kvm->arch.nx_huge_page_last = get_jiffies_64();
+ nx_thread = vhost_task_create(kvm_nx_huge_page_recovery_worker,
+ kvm_nx_huge_page_recovery_worker_kill,
+ kvm, "kvm-nx-lpage-recovery");
+
+ if (IS_ERR(nx_thread))
+ return PTR_ERR(nx_thread);
+
+ vhost_task_start(nx_thread);
+
+ /* Make the task visible only once it is fully started. */
+ WRITE_ONCE(kvm->arch.nx_huge_page_recovery_thread, nx_thread);
+ return 0;
+}
+
+int kvm_mmu_post_init_vm(struct kvm *kvm)
+{
+ if (nx_hugepage_mitigation_hard_disabled)
+ return 0;
+
+ return call_once(&kvm->arch.nx_once, kvm_mmu_start_lpage_recovery);
+}
+
+void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
+{
+ if (kvm->arch.nx_huge_page_recovery_thread)
+ vhost_task_stop(kvm->arch.nx_huge_page_recovery_thread);
+}
+
+#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
+ int level)
+{
+ return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG;
+}
+
+static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
+ int level)
+{
+ lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG;
+}
+
+static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
+ int level)
+{
+ lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG;
+}
+
+bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
+ struct kvm_gfn_range *range)
+{
+ struct kvm_memory_slot *slot = range->slot;
+ int level;
+
+ /*
+ * Zap SPTEs even if the slot can't be mapped PRIVATE. KVM x86 only
+ * supports KVM_MEMORY_ATTRIBUTE_PRIVATE, and so it *seems* like KVM
+ * can simply ignore such slots. But if userspace is making memory
+ * PRIVATE, then KVM must prevent the guest from accessing the memory
+ * as shared. And if userspace is making memory SHARED and this point
+ * is reached, then at least one page within the range was previously
+ * PRIVATE, i.e. the slot's possible hugepage ranges are changing.
+ * Zapping SPTEs in this case ensures KVM will reassess whether or not
+ * a hugepage can be used for affected ranges.
+ */
+ if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm)))
+ return false;
+
+ if (WARN_ON_ONCE(range->end <= range->start))
+ return false;
+
+ /*
+ * If the head and tail pages of the range currently allow a hugepage,
+ * i.e. reside fully in the slot and don't have mixed attributes, then
+ * add each corresponding hugepage range to the ongoing invalidation,
+ * e.g. to prevent KVM from creating a hugepage in response to a fault
+ * for a gfn whose attributes aren't changing. Note, only the range
+ * of gfns whose attributes are being modified needs to be explicitly
+ * unmapped, as that will unmap any existing hugepages.
+ */
+ for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
+ gfn_t start = gfn_round_for_level(range->start, level);
+ gfn_t end = gfn_round_for_level(range->end - 1, level);
+ gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level);
+
+ if ((start != range->start || start + nr_pages > range->end) &&
+ start >= slot->base_gfn &&
+ start + nr_pages <= slot->base_gfn + slot->npages &&
+ !hugepage_test_mixed(slot, start, level))
+ kvm_mmu_invalidate_range_add(kvm, start, start + nr_pages);
+
+ if (end == start)
+ continue;
+
+ if ((end + nr_pages) > range->end &&
+ (end + nr_pages) <= (slot->base_gfn + slot->npages) &&
+ !hugepage_test_mixed(slot, end, level))
+ kvm_mmu_invalidate_range_add(kvm, end, end + nr_pages);
+ }
+
+ /* Unmap the old attribute page. */
+ if (range->arg.attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE)
+ range->attr_filter = KVM_FILTER_SHARED;
+ else
+ range->attr_filter = KVM_FILTER_PRIVATE;
+
+ return kvm_unmap_gfn_range(kvm, range);
+}
+
+static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot,
+ gfn_t gfn, int level, unsigned long attrs)
+{
+ const unsigned long start = gfn;
+ const unsigned long end = start + KVM_PAGES_PER_HPAGE(level);
+
+ if (level == PG_LEVEL_2M)
+ return kvm_range_has_memory_attributes(kvm, start, end, ~0, attrs);
+
+ for (gfn = start; gfn < end; gfn += KVM_PAGES_PER_HPAGE(level - 1)) {
+ if (hugepage_test_mixed(slot, gfn, level - 1) ||
+ attrs != kvm_get_memory_attributes(kvm, gfn))
+ return false;
+ }
+ return true;
+}
+
+bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
+ struct kvm_gfn_range *range)
+{
+ unsigned long attrs = range->arg.attributes;
+ struct kvm_memory_slot *slot = range->slot;
+ int level;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+ lockdep_assert_held(&kvm->slots_lock);
+
+ /*
+ * Calculate which ranges can be mapped with hugepages even if the slot
+ * can't map memory PRIVATE. KVM mustn't create a SHARED hugepage over
+ * a range that has PRIVATE GFNs, and conversely converting a range to
+ * SHARED may now allow hugepages.
+ */
+ if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm)))
+ return false;
+
+ /*
+ * The sequence matters here: upper levels consume the result of lower
+ * level's scanning.
+ */
+ for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
+ gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level);
+ gfn_t gfn = gfn_round_for_level(range->start, level);
+
+ /* Process the head page if it straddles the range. */
+ if (gfn != range->start || gfn + nr_pages > range->end) {
+ /*
+ * Skip mixed tracking if the aligned gfn isn't covered
+			 * by the memslot; KVM can't use a hugepage due to the
+ * misaligned address regardless of memory attributes.
+ */
+ if (gfn >= slot->base_gfn &&
+ gfn + nr_pages <= slot->base_gfn + slot->npages) {
+ if (hugepage_has_attrs(kvm, slot, gfn, level, attrs))
+ hugepage_clear_mixed(slot, gfn, level);
+ else
+ hugepage_set_mixed(slot, gfn, level);
+ }
+ gfn += nr_pages;
+ }
+
+ /*
+ * Pages entirely covered by the range are guaranteed to have
+ * only the attributes which were just set.
+ */
+ for ( ; gfn + nr_pages <= range->end; gfn += nr_pages)
+ hugepage_clear_mixed(slot, gfn, level);
+
+ /*
+ * Process the last tail page if it straddles the range and is
+ * contained by the memslot. Like the head page, KVM can't
+ * create a hugepage if the slot size is misaligned.
+ */
+ if (gfn < range->end &&
+ (gfn + nr_pages) <= (slot->base_gfn + slot->npages)) {
+ if (hugepage_has_attrs(kvm, slot, gfn, level, attrs))
+ hugepage_clear_mixed(slot, gfn, level);
+ else
+ hugepage_set_mixed(slot, gfn, level);
+ }
+ }
+ return false;
+}
+
+void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
+{
+ int level;
+
+ if (!kvm_arch_has_private_mem(kvm))
+ return;
+
+ for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
+ /*
+ * Don't bother tracking mixed attributes for pages that can't
+ * be huge due to alignment, i.e. process only pages that are
+ * entirely contained by the memslot.
+ */
+ gfn_t end = gfn_round_for_level(slot->base_gfn + slot->npages, level);
+ gfn_t start = gfn_round_for_level(slot->base_gfn, level);
+ gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level);
+ gfn_t gfn;
+
+ if (start < slot->base_gfn)
+ start += nr_pages;
+
+ /*
+ * Unlike setting attributes, every potential hugepage needs to
+ * be manually checked as the attributes may already be mixed.
+ */
+ for (gfn = start; gfn < end; gfn += nr_pages) {
+ unsigned long attrs = kvm_get_memory_attributes(kvm, gfn);
+
+ if (hugepage_has_attrs(kvm, slot, gfn, level, attrs))
+ hugepage_clear_mixed(slot, gfn, level);
+ else
+ hugepage_set_mixed(slot, gfn, level);
+ }
+ }
+}
+#endif
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
new file mode 100644
index 000000000000..73cdcbccc89e
--- /dev/null
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -0,0 +1,414 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_MMU_INTERNAL_H
+#define __KVM_X86_MMU_INTERNAL_H
+
+#include <linux/types.h>
+#include <linux/kvm_host.h>
+#include <asm/kvm_host.h>
+
+#include "mmu.h"
+
+#ifdef CONFIG_KVM_PROVE_MMU
+#define KVM_MMU_WARN_ON(x) WARN_ON_ONCE(x)
+#else
+#define KVM_MMU_WARN_ON(x) BUILD_BUG_ON_INVALID(x)
+#endif
+
+/* Page table builder macros common to shadow (host) PTEs and guest PTEs. */
+#define __PT_BASE_ADDR_MASK GENMASK_ULL(51, 12)
+#define __PT_LEVEL_SHIFT(level, bits_per_level) \
+ (PAGE_SHIFT + ((level) - 1) * (bits_per_level))
+#define __PT_INDEX(address, level, bits_per_level) \
+ (((address) >> __PT_LEVEL_SHIFT(level, bits_per_level)) & ((1 << (bits_per_level)) - 1))
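+/*
+ * For example, with 9 bits per level, __PT_INDEX(addr, 1, 9) extracts address
+ * bits 20:12 and __PT_INDEX(addr, 2, 9) extracts bits 29:21.
+ */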
+
+#define __PT_LVL_ADDR_MASK(base_addr_mask, level, bits_per_level) \
+ ((base_addr_mask) & ~((1ULL << (PAGE_SHIFT + (((level) - 1) * (bits_per_level)))) - 1))
+
+#define __PT_LVL_OFFSET_MASK(base_addr_mask, level, bits_per_level) \
+ ((base_addr_mask) & ((1ULL << (PAGE_SHIFT + (((level) - 1) * (bits_per_level)))) - 1))
+
+#define __PT_ENT_PER_PAGE(bits_per_level) (1 << (bits_per_level))
+
+/*
+ * Unlike regular MMU roots, PAE "roots", a.k.a. PDPTEs/PDPTRs, have a PRESENT
+ * bit, and thus are guaranteed to be non-zero when valid. And, when a guest
+ * PDPTR is !PRESENT, its corresponding PAE root cannot be set to INVALID_PAGE,
+ * as the CPU would treat that as PRESENT PDPTR with reserved bits set. Use
+ * '0' instead of INVALID_PAGE to indicate an invalid PAE root.
+ */
+#define INVALID_PAE_ROOT 0
+#define IS_VALID_PAE_ROOT(x) (!!(x))
+
+typedef u64 __rcu *tdp_ptep_t;
+
+struct kvm_mmu_page {
+ /*
+ * Note, "link" through "spt" fit in a single 64 byte cache line on
+ * 64-bit kernels, keep it that way unless there's a reason not to.
+ */
+ struct list_head link;
+ struct hlist_node hash_link;
+
+ bool tdp_mmu_page;
+ bool unsync;
+ union {
+ u8 mmu_valid_gen;
+
+ /* Only accessed under slots_lock. */
+ bool tdp_mmu_scheduled_root_to_zap;
+ };
+
+ /*
+ * The shadow page can't be replaced by an equivalent huge page
+ * because it is being used to map an executable page in the guest
+ * and the NX huge page mitigation is enabled.
+ */
+ bool nx_huge_page_disallowed;
+
+ /*
+ * The following two entries are used to key the shadow page in the
+ * hash table.
+ */
+ union kvm_mmu_page_role role;
+ gfn_t gfn;
+
+ u64 *spt;
+
+ /*
+ * Stores the result of the guest translation being shadowed by each
+ * SPTE. KVM shadows two types of guest translations: nGPA -> GPA
+ * (shadow EPT/NPT) and GVA -> GPA (traditional shadow paging). In both
+ * cases the result of the translation is a GPA and a set of access
+ * constraints.
+ *
+ * The GFN is stored in the upper bits (PAGE_SHIFT) and the shadowed
+ * access permissions are stored in the lower bits. Note, for
+ * convenience and uniformity across guests, the access permissions are
+ * stored in KVM format (e.g. ACC_EXEC_MASK) not the raw guest format.
+ */
+ u64 *shadowed_translation;
+
+ /* Currently serving as active root */
+ union {
+ int root_count;
+ refcount_t tdp_mmu_root_count;
+ };
+
+ bool has_mapped_host_mmio;
+
+ union {
+ /* These two members aren't used for TDP MMU */
+ struct {
+ unsigned int unsync_children;
+ /*
+ * Number of writes since the last time traversal
+ * visited this page.
+ */
+ atomic_t write_flooding_count;
+ };
+ /*
+ * Page table page of external PT.
+ * Passed to TDX module, not accessed by KVM.
+ */
+ void *external_spt;
+ };
+ union {
+ struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
+ tdp_ptep_t ptep;
+ };
+ DECLARE_BITMAP(unsync_child_bitmap, 512);
+
+ /*
+ * Tracks shadow pages that, if zapped, would allow KVM to create an NX
+ * huge page. A shadow page will have nx_huge_page_disallowed set but
+ * not be on the list if a huge page is disallowed for other reasons,
+ * e.g. because KVM is shadowing a PTE at the same gfn, the memslot
+ * isn't properly aligned, etc...
+ */
+ struct list_head possible_nx_huge_page_link;
+#ifdef CONFIG_X86_32
+ /*
+ * Used out of the mmu-lock to avoid reading spte values while an
+ * update is in progress; see the comments in __get_spte_lockless().
+ */
+ int clear_spte_count;
+#endif
+
+#ifdef CONFIG_X86_64
+ /* Used for freeing the page asynchronously if it is a TDP MMU page. */
+ struct rcu_head rcu_head;
+#endif
+};
+
+extern struct kmem_cache *mmu_page_header_cache;
+
+static inline int kvm_mmu_role_as_id(union kvm_mmu_page_role role)
+{
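+	/* SMM shadow pages live in memslot address space 1, all others in 0. */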
+ return role.smm ? 1 : 0;
+}
+
+static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
+{
+ return kvm_mmu_role_as_id(sp->role);
+}
+
+static inline bool is_mirror_sp(const struct kvm_mmu_page *sp)
+{
+ return sp->role.is_mirror;
+}
+
+static inline void kvm_mmu_alloc_external_spt(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ /*
+	 * external_spt is allocated for the TDX module to hold private EPT
+	 * mappings; the TDX module initializes the page itself. Therefore,
+	 * KVM does not need to initialize or access external_spt, and only
+	 * interacts with sp->spt for private EPT operations.
+ */
+ sp->external_spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_external_spt_cache);
+}
+
+static inline gfn_t kvm_gfn_root_bits(const struct kvm *kvm, const struct kvm_mmu_page *root)
+{
+ /*
+ * Since mirror SPs are used only for TDX, which maps private memory
+ * at its "natural" GFN, no mask needs to be applied to them - and, dually,
+ * we expect that the bits is only used for the shared PT.
+ */
+ if (is_mirror_sp(root))
+ return 0;
+ return kvm_gfn_direct_bits(kvm);
+}
+
+static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm *kvm,
+ struct kvm_mmu_page *sp)
+{
+ /*
+ * When using the EPT page-modification log, the GPAs in the CPU dirty
+ * log would come from L2 rather than L1. Therefore, we need to rely
+ * on write protection to record dirty pages, which bypasses PML, since
+ * writes now result in a vmexit. Note, the check on CPU dirty logging
+ * being enabled is mandatory as the bits used to denote WP-only SPTEs
+ * are reserved for PAE paging (32-bit KVM).
+ */
+ return kvm->arch.cpu_dirty_log_size && sp->role.guest_mode;
+}
+
+static inline gfn_t gfn_round_for_level(gfn_t gfn, int level)
+{
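+	/* Align the gfn down to the start of the huge page at @level. */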
+ return gfn & -KVM_PAGES_PER_HPAGE(level);
+}
+
+int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
+ gfn_t gfn, bool synchronizing, bool prefetch);
+
+void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn);
+void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn);
+bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
+ struct kvm_memory_slot *slot, u64 gfn,
+ int min_level);
+
+/* Flush the given page (huge or not) of guest memory. */
+static inline void kvm_flush_remote_tlbs_gfn(struct kvm *kvm, gfn_t gfn, int level)
+{
+ kvm_flush_remote_tlbs_range(kvm, gfn_round_for_level(gfn, level),
+ KVM_PAGES_PER_HPAGE(level));
+}
+
+unsigned int pte_list_count(struct kvm_rmap_head *rmap_head);
+
+extern int nx_huge_pages;
+static inline bool is_nx_huge_page_enabled(struct kvm *kvm)
+{
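+	/* A per-VM opt-out of NX huge pages overrides the module param. */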
+ return READ_ONCE(nx_huge_pages) && !kvm->arch.disable_nx_huge_pages;
+}
+
+struct kvm_page_fault {
+ /* arguments to kvm_mmu_do_page_fault. */
+ const gpa_t addr;
+ const u64 error_code;
+ const bool prefetch;
+
+ /* Derived from error_code. */
+ const bool exec;
+ const bool write;
+ const bool present;
+ const bool rsvd;
+ const bool user;
+
+ /* Derived from mmu and global state. */
+ const bool is_tdp;
+ const bool is_private;
+ const bool nx_huge_page_workaround_enabled;
+
+ /*
+ * Whether a >4KB mapping can be created or is forbidden due to NX
+ * hugepages.
+ */
+ bool huge_page_disallowed;
+
+ /*
+ * Maximum page size that can be created for this fault; input to
+ * FNAME(fetch), direct_map() and kvm_tdp_mmu_map().
+ */
+ u8 max_level;
+
+ /*
+ * Page size that can be created based on the max_level and the
+ * page size used by the host mapping.
+ */
+ u8 req_level;
+
+ /*
+ * Page size that will be created based on the req_level and
+ * huge_page_disallowed.
+ */
+ u8 goal_level;
+
+ /*
+ * Shifted addr, or result of guest page table walk if addr is a gva. In
+	 * the case of VMs whose memslots can be mapped at multiple GPA aliases
+ * (i.e. TDX), the gfn field does not contain the bit that selects between
+ * the aliases (i.e. the shared bit for TDX).
+ */
+ gfn_t gfn;
+
+ /* The memslot containing gfn. May be NULL. */
+ struct kvm_memory_slot *slot;
+
+ /* Outputs of kvm_mmu_faultin_pfn(). */
+ unsigned long mmu_seq;
+ kvm_pfn_t pfn;
+ struct page *refcounted_page;
+ bool map_writable;
+
+ /*
+ * Indicates the guest is trying to write a gfn that contains one or
+ * more of the PTEs used to translate the write itself, i.e. the access
+ * is changing its own translation in the guest page tables.
+ */
+ bool write_fault_to_shadow_pgtable;
+};
+
+int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
+
+/*
+ * Return values of handle_mmio_page_fault(), mmu.page_fault(), fast_page_fault(),
+ * and of course kvm_mmu_do_page_fault().
+ *
+ * RET_PF_CONTINUE: So far, so good, keep handling the page fault.
+ * RET_PF_RETRY: let CPU fault again on the address.
+ * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
+ * RET_PF_WRITE_PROTECTED: the gfn is write-protected, either unprotect the
+ * gfn and retry, or emulate the instruction directly.
+ * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
+ * RET_PF_FIXED: The faulting entry has been fixed.
+ * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU.
+ *
+ * Any names added to this enum should be exported to userspace for use in
+ * tracepoints via TRACE_DEFINE_ENUM() in mmutrace.h
+ *
+ * Note, all values must be greater than or equal to zero so as not to encroach
+ * on -errno return values.
+ */
+enum {
+ RET_PF_CONTINUE = 0,
+ RET_PF_RETRY,
+ RET_PF_EMULATE,
+ RET_PF_WRITE_PROTECTED,
+ RET_PF_INVALID,
+ RET_PF_FIXED,
+ RET_PF_SPURIOUS,
+};
+
+/*
+ * Define RET_PF_CONTINUE as 0 to allow for
+ * - efficient machine code when checking for CONTINUE, e.g.
+ * "TEST %rax, %rax, JNZ", as all "stop!" values are non-zero,
+ * - kvm_mmu_do_page_fault() to return other RET_PF_* as a positive value.
+ */
+static_assert(RET_PF_CONTINUE == 0);
+
+static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+ kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT,
+ PAGE_SIZE, fault->write, fault->exec,
+ fault->is_private);
+}
+
+static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ u64 err, bool prefetch,
+ int *emulation_type, u8 *level)
+{
+ struct kvm_page_fault fault = {
+ .addr = cr2_or_gpa,
+ .error_code = err,
+ .exec = err & PFERR_FETCH_MASK,
+ .write = err & PFERR_WRITE_MASK,
+ .present = err & PFERR_PRESENT_MASK,
+ .rsvd = err & PFERR_RSVD_MASK,
+ .user = err & PFERR_USER_MASK,
+ .prefetch = prefetch,
+ .is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault),
+ .nx_huge_page_workaround_enabled =
+ is_nx_huge_page_enabled(vcpu->kvm),
+
+ .max_level = KVM_MAX_HUGEPAGE_LEVEL,
+ .req_level = PG_LEVEL_4K,
+ .goal_level = PG_LEVEL_4K,
+ .is_private = err & PFERR_PRIVATE_ACCESS,
+
+ .pfn = KVM_PFN_ERR_FAULT,
+ };
+ int r;
+
+ if (vcpu->arch.mmu->root_role.direct) {
+ /*
+ * Things like memslots don't understand the concept of a shared
+ * bit. Strip it so that the GFN can be used like normal, and the
+ * fault.addr can be used when the shared bit is needed.
+ */
+ fault.gfn = gpa_to_gfn(fault.addr) & ~kvm_gfn_direct_bits(vcpu->kvm);
+ fault.slot = kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn);
+ }
+
+ /*
+	 * With retpoline being active, an indirect call is rather expensive,
+ * so do a direct call in the most common case.
+ */
+ if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && fault.is_tdp)
+ r = kvm_tdp_page_fault(vcpu, &fault);
+ else
+ r = vcpu->arch.mmu->page_fault(vcpu, &fault);
+
+ /*
+ * Not sure what's happening, but punt to userspace and hope that
+ * they can fix it by changing memory to shared, or they can
+ * provide a better error.
+ */
+ if (r == RET_PF_EMULATE && fault.is_private) {
+ pr_warn_ratelimited("kvm: unexpected emulation request on private memory\n");
+ kvm_mmu_prepare_memory_fault_exit(vcpu, &fault);
+ return -EFAULT;
+ }
+
+ if (fault.write_fault_to_shadow_pgtable && emulation_type)
+ *emulation_type |= EMULTYPE_WRITE_PF_TO_SP;
+ if (level)
+ *level = fault.goal_level;
+
+ return r;
+}
+
+int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault,
+ const struct kvm_memory_slot *slot, gfn_t gfn);
+void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
+void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level);
+
+void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ enum kvm_mmu_type mmu_type);
+void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ enum kvm_mmu_type mmu_type);
+
+#endif /* __KVM_X86_MMU_INTERNAL_H */
diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
new file mode 100644
index 000000000000..764e3015d021
--- /dev/null
+++ b/arch/x86/kvm/mmu/mmutrace.h
@@ -0,0 +1,455 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if !defined(_TRACE_KVMMMU_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KVMMMU_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_events.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvmmmu
+
+#define KVM_MMU_PAGE_FIELDS \
+ __field(__u8, mmu_valid_gen) \
+ __field(__u64, gfn) \
+ __field(__u32, role) \
+ __field(__u32, root_count) \
+ __field(bool, unsync)
+
+#define KVM_MMU_PAGE_ASSIGN(sp) \
+ __entry->mmu_valid_gen = sp->mmu_valid_gen; \
+ __entry->gfn = sp->gfn; \
+ __entry->role = sp->role.word; \
+ __entry->root_count = sp->root_count; \
+ __entry->unsync = sp->unsync;
+
+#define KVM_MMU_PAGE_PRINTK() ({ \
+ const char *saved_ptr = trace_seq_buffer_ptr(p); \
+ static const char *access_str[] = { \
+ "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux" \
+ }; \
+ union kvm_mmu_page_role role; \
+ \
+ role.word = __entry->role; \
+ \
+ trace_seq_printf(p, "sp gen %u gfn %llx l%u %u-byte q%u%s %s%s" \
+ " %snxe %sad root %u %s%c", \
+ __entry->mmu_valid_gen, \
+ __entry->gfn, role.level, \
+ role.has_4_byte_gpte ? 4 : 8, \
+ role.quadrant, \
+ role.direct ? " direct" : "", \
+ access_str[role.access], \
+ role.invalid ? " invalid" : "", \
+ role.efer_nx ? "" : "!", \
+ role.ad_disabled ? "!" : "", \
+ __entry->root_count, \
+ __entry->unsync ? "unsync" : "sync", 0); \
+ saved_ptr; \
+ })
+
+#define kvm_mmu_trace_pferr_flags \
+ { PFERR_PRESENT_MASK, "P" }, \
+ { PFERR_WRITE_MASK, "W" }, \
+ { PFERR_USER_MASK, "U" }, \
+ { PFERR_PK_MASK, "PK" }, \
+ { PFERR_SS_MASK, "SS" }, \
+ { PFERR_SGX_MASK, "SGX" }, \
+ { PFERR_RSVD_MASK, "RSVD" }, \
+ { PFERR_FETCH_MASK, "F" }
+
+TRACE_DEFINE_ENUM(RET_PF_CONTINUE);
+TRACE_DEFINE_ENUM(RET_PF_RETRY);
+TRACE_DEFINE_ENUM(RET_PF_EMULATE);
+TRACE_DEFINE_ENUM(RET_PF_WRITE_PROTECTED);
+TRACE_DEFINE_ENUM(RET_PF_INVALID);
+TRACE_DEFINE_ENUM(RET_PF_FIXED);
+TRACE_DEFINE_ENUM(RET_PF_SPURIOUS);
+
+/*
+ * A pagetable walk has started
+ */
+TRACE_EVENT(
+ kvm_mmu_pagetable_walk,
+ TP_PROTO(u64 addr, u32 pferr),
+ TP_ARGS(addr, pferr),
+
+ TP_STRUCT__entry(
+ __field(__u64, addr)
+ __field(__u32, pferr)
+ ),
+
+ TP_fast_assign(
+ __entry->addr = addr;
+ __entry->pferr = pferr;
+ ),
+
+ TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr,
+ __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags))
+);
+
+
+/* We just walked a paging element */
+TRACE_EVENT(
+ kvm_mmu_paging_element,
+ TP_PROTO(u64 pte, int level),
+ TP_ARGS(pte, level),
+
+ TP_STRUCT__entry(
+ __field(__u64, pte)
+ __field(__u32, level)
+ ),
+
+ TP_fast_assign(
+ __entry->pte = pte;
+ __entry->level = level;
+ ),
+
+ TP_printk("pte %llx level %u", __entry->pte, __entry->level)
+);
+
+DECLARE_EVENT_CLASS(kvm_mmu_set_bit_class,
+
+ TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
+
+ TP_ARGS(table_gfn, index, size),
+
+ TP_STRUCT__entry(
+ __field(__u64, gpa)
+ ),
+
+ TP_fast_assign(
+ __entry->gpa = ((u64)table_gfn << PAGE_SHIFT)
+ + index * size;
+ ),
+
+ TP_printk("gpa %llx", __entry->gpa)
+);
+
+/* We set a pte accessed bit */
+DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_accessed_bit,
+
+ TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
+
+ TP_ARGS(table_gfn, index, size)
+);
+
+/* We set a pte dirty bit */
+DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_dirty_bit,
+
+ TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
+
+ TP_ARGS(table_gfn, index, size)
+);
+
+TRACE_EVENT(
+ kvm_mmu_walker_error,
+ TP_PROTO(u32 pferr),
+ TP_ARGS(pferr),
+
+ TP_STRUCT__entry(
+ __field(__u32, pferr)
+ ),
+
+ TP_fast_assign(
+ __entry->pferr = pferr;
+ ),
+
+ TP_printk("pferr %x %s", __entry->pferr,
+ __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags))
+);
+
+TRACE_EVENT(
+ kvm_mmu_get_page,
+ TP_PROTO(struct kvm_mmu_page *sp, bool created),
+ TP_ARGS(sp, created),
+
+ TP_STRUCT__entry(
+ KVM_MMU_PAGE_FIELDS
+ __field(bool, created)
+ ),
+
+ TP_fast_assign(
+ KVM_MMU_PAGE_ASSIGN(sp)
+ __entry->created = created;
+ ),
+
+ TP_printk("%s %s", KVM_MMU_PAGE_PRINTK(),
+ __entry->created ? "new" : "existing")
+);
+
+DECLARE_EVENT_CLASS(kvm_mmu_page_class,
+
+ TP_PROTO(struct kvm_mmu_page *sp),
+ TP_ARGS(sp),
+
+ TP_STRUCT__entry(
+ KVM_MMU_PAGE_FIELDS
+ ),
+
+ TP_fast_assign(
+ KVM_MMU_PAGE_ASSIGN(sp)
+ ),
+
+ TP_printk("%s", KVM_MMU_PAGE_PRINTK())
+);
+
+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_sync_page,
+ TP_PROTO(struct kvm_mmu_page *sp),
+
+ TP_ARGS(sp)
+);
+
+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_unsync_page,
+ TP_PROTO(struct kvm_mmu_page *sp),
+
+ TP_ARGS(sp)
+);
+
+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
+ TP_PROTO(struct kvm_mmu_page *sp),
+
+ TP_ARGS(sp)
+);
+
+TRACE_EVENT(
+ mark_mmio_spte,
+ TP_PROTO(u64 *sptep, gfn_t gfn, u64 spte),
+ TP_ARGS(sptep, gfn, spte),
+
+ TP_STRUCT__entry(
+ __field(void *, sptep)
+ __field(gfn_t, gfn)
+ __field(unsigned, access)
+ __field(unsigned int, gen)
+ ),
+
+ TP_fast_assign(
+ __entry->sptep = sptep;
+ __entry->gfn = gfn;
+ __entry->access = spte & ACC_ALL;
+ __entry->gen = get_mmio_spte_generation(spte);
+ ),
+
+ TP_printk("sptep:%p gfn %llx access %x gen %x", __entry->sptep,
+ __entry->gfn, __entry->access, __entry->gen)
+);
+
+TRACE_EVENT(
+ handle_mmio_page_fault,
+ TP_PROTO(u64 addr, gfn_t gfn, unsigned access),
+ TP_ARGS(addr, gfn, access),
+
+ TP_STRUCT__entry(
+ __field(u64, addr)
+ __field(gfn_t, gfn)
+ __field(unsigned, access)
+ ),
+
+ TP_fast_assign(
+ __entry->addr = addr;
+ __entry->gfn = gfn;
+ __entry->access = access;
+ ),
+
+ TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn,
+ __entry->access)
+);
+
+TRACE_EVENT(
+ fast_page_fault,
+ TP_PROTO(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
+ u64 *sptep, u64 old_spte, int ret),
+ TP_ARGS(vcpu, fault, sptep, old_spte, ret),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(gpa_t, cr2_or_gpa)
+ __field(u64, error_code)
+ __field(u64 *, sptep)
+ __field(u64, old_spte)
+ __field(u64, new_spte)
+ __field(int, ret)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu->vcpu_id;
+ __entry->cr2_or_gpa = fault->addr;
+ __entry->error_code = fault->error_code;
+ __entry->sptep = sptep;
+ __entry->old_spte = old_spte;
+ __entry->new_spte = *sptep;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("vcpu %d gva %llx error_code %s sptep %p old %#llx"
+ " new %llx spurious %d fixed %d", __entry->vcpu_id,
+ __entry->cr2_or_gpa, __print_flags(__entry->error_code, "|",
+ kvm_mmu_trace_pferr_flags), __entry->sptep,
+ __entry->old_spte, __entry->new_spte,
+ __entry->ret == RET_PF_SPURIOUS, __entry->ret == RET_PF_FIXED
+ )
+);
+
+TRACE_EVENT(
+ kvm_mmu_zap_all_fast,
+ TP_PROTO(struct kvm *kvm),
+ TP_ARGS(kvm),
+
+ TP_STRUCT__entry(
+ __field(__u8, mmu_valid_gen)
+ __field(unsigned int, mmu_used_pages)
+ ),
+
+ TP_fast_assign(
+ __entry->mmu_valid_gen = kvm->arch.mmu_valid_gen;
+ __entry->mmu_used_pages = kvm->arch.n_used_mmu_pages;
+ ),
+
+ TP_printk("kvm-mmu-valid-gen %u used_pages %x",
+ __entry->mmu_valid_gen, __entry->mmu_used_pages
+ )
+);
+
+
+TRACE_EVENT(
+ check_mmio_spte,
+ TP_PROTO(u64 spte, unsigned int kvm_gen, unsigned int spte_gen),
+ TP_ARGS(spte, kvm_gen, spte_gen),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, kvm_gen)
+ __field(unsigned int, spte_gen)
+ __field(u64, spte)
+ ),
+
+ TP_fast_assign(
+ __entry->kvm_gen = kvm_gen;
+ __entry->spte_gen = spte_gen;
+ __entry->spte = spte;
+ ),
+
+ TP_printk("spte %llx kvm_gen %x spte-gen %x valid %d", __entry->spte,
+ __entry->kvm_gen, __entry->spte_gen,
+ __entry->kvm_gen == __entry->spte_gen
+ )
+);
+
+TRACE_EVENT(
+ kvm_mmu_set_spte,
+ TP_PROTO(int level, gfn_t gfn, u64 *sptep),
+ TP_ARGS(level, gfn, sptep),
+
+ TP_STRUCT__entry(
+ __field(u64, gfn)
+ __field(u64, spte)
+ __field(u64, sptep)
+ __field(u8, level)
+ /* These depend on page entry type, so compute them now. */
+ __field(bool, r)
+ __field(bool, x)
+ __field(signed char, u)
+ ),
+
+ TP_fast_assign(
+ __entry->gfn = gfn;
+ __entry->spte = *sptep;
+ __entry->sptep = virt_to_phys(sptep);
+ __entry->level = level;
+ __entry->r = shadow_present_mask || (__entry->spte & PT_PRESENT_MASK);
+ __entry->x = is_executable_pte(__entry->spte);
+ __entry->u = shadow_user_mask ? !!(__entry->spte & shadow_user_mask) : -1;
+ ),
+
+ TP_printk("gfn %llx spte %llx (%s%s%s%s) level %d at %llx",
+ __entry->gfn, __entry->spte,
+ __entry->r ? "r" : "-",
+ __entry->spte & PT_WRITABLE_MASK ? "w" : "-",
+ __entry->x ? "x" : "-",
+ __entry->u == -1 ? "" : (__entry->u ? "u" : "-"),
+ __entry->level, __entry->sptep
+ )
+);
+
+TRACE_EVENT(
+ kvm_mmu_spte_requested,
+ TP_PROTO(struct kvm_page_fault *fault),
+ TP_ARGS(fault),
+
+ TP_STRUCT__entry(
+ __field(u64, gfn)
+ __field(u64, pfn)
+ __field(u8, level)
+ ),
+
+ TP_fast_assign(
+ __entry->gfn = fault->gfn;
+ __entry->pfn = fault->pfn | (fault->gfn & (KVM_PAGES_PER_HPAGE(fault->goal_level) - 1));
+ __entry->level = fault->goal_level;
+ ),
+
+ TP_printk("gfn %llx pfn %llx level %d",
+ __entry->gfn, __entry->pfn, __entry->level
+ )
+);
+
+TRACE_EVENT(
+ kvm_tdp_mmu_spte_changed,
+ TP_PROTO(int as_id, gfn_t gfn, int level, u64 old_spte, u64 new_spte),
+ TP_ARGS(as_id, gfn, level, old_spte, new_spte),
+
+ TP_STRUCT__entry(
+ __field(u64, gfn)
+ __field(u64, old_spte)
+ __field(u64, new_spte)
+ /* Level cannot be larger than 5 on x86, so it fits in a u8. */
+ __field(u8, level)
+		/* as_id can only be 0 or 1 on x86, so it fits in a u8. */
+ __field(u8, as_id)
+ ),
+
+ TP_fast_assign(
+ __entry->gfn = gfn;
+ __entry->old_spte = old_spte;
+ __entry->new_spte = new_spte;
+ __entry->level = level;
+ __entry->as_id = as_id;
+ ),
+
+ TP_printk("as id %d gfn %llx level %d old_spte %llx new_spte %llx",
+ __entry->as_id, __entry->gfn, __entry->level,
+ __entry->old_spte, __entry->new_spte
+ )
+);
+
+TRACE_EVENT(
+ kvm_mmu_split_huge_page,
+ TP_PROTO(u64 gfn, u64 spte, int level, int errno),
+ TP_ARGS(gfn, spte, level, errno),
+
+ TP_STRUCT__entry(
+ __field(u64, gfn)
+ __field(u64, spte)
+ __field(int, level)
+ __field(int, errno)
+ ),
+
+ TP_fast_assign(
+ __entry->gfn = gfn;
+ __entry->spte = spte;
+ __entry->level = level;
+ __entry->errno = errno;
+ ),
+
+ TP_printk("gfn %llx spte %llx level %d errno %d",
+ __entry->gfn, __entry->spte, __entry->level, __entry->errno)
+);
+
+#endif /* _TRACE_KVMMMU_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH mmu
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE mmutrace
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
new file mode 100644
index 000000000000..1b17b12393a8
--- /dev/null
+++ b/arch/x86/kvm/mmu/page_track.c
@@ -0,0 +1,374 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Support KVM guest page tracking
+ *
+ * This feature allows us to track page accesses in the guest. Currently, only
+ * write access is tracked.
+ *
+ * Copyright(C) 2015 Intel Corporation.
+ *
+ * Author:
+ * Xiao Guangrong <guangrong.xiao@linux.intel.com>
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/lockdep.h>
+#include <linux/kvm_host.h>
+#include <linux/rculist.h>
+
+#include "mmu.h"
+#include "mmu_internal.h"
+#include "page_track.h"
+
+static bool kvm_external_write_tracking_enabled(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
+ /*
+ * Read external_write_tracking_enabled before related pointers. Pairs
+ * with the smp_store_release in kvm_page_track_write_tracking_enable().
+ */
+ return smp_load_acquire(&kvm->arch.external_write_tracking_enabled);
+#else
+ return false;
+#endif
+}
+
+bool kvm_page_track_write_tracking_enabled(struct kvm *kvm)
+{
+ return kvm_external_write_tracking_enabled(kvm) ||
+ kvm_shadow_root_allocated(kvm) || !tdp_enabled;
+}
+
+void kvm_page_track_free_memslot(struct kvm_memory_slot *slot)
+{
+ vfree(slot->arch.gfn_write_track);
+ slot->arch.gfn_write_track = NULL;
+}
+
+static int __kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot,
+ unsigned long npages)
+{
+ const size_t size = sizeof(*slot->arch.gfn_write_track);
+
+ if (!slot->arch.gfn_write_track)
+ slot->arch.gfn_write_track = __vcalloc(npages, size,
+ GFP_KERNEL_ACCOUNT);
+
+ return slot->arch.gfn_write_track ? 0 : -ENOMEM;
+}
+
+int kvm_page_track_create_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ unsigned long npages)
+{
+ if (!kvm_page_track_write_tracking_enabled(kvm))
+ return 0;
+
+ return __kvm_page_track_write_tracking_alloc(slot, npages);
+}
+
+int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot)
+{
+ return __kvm_page_track_write_tracking_alloc(slot, slot->npages);
+}
+
+static void update_gfn_write_track(struct kvm_memory_slot *slot, gfn_t gfn,
+ short count)
+{
+ int index, val;
+
+ index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K);
+
+ val = slot->arch.gfn_write_track[index];
+
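+	/* Guard against underflow/overflow of the 16-bit per-gfn write-track count. */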
+ if (WARN_ON_ONCE(val + count < 0 || val + count > USHRT_MAX))
+ return;
+
+ slot->arch.gfn_write_track[index] += count;
+}
+
+void __kvm_write_track_add_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
+ gfn_t gfn)
+{
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ lockdep_assert_once(lockdep_is_held(&kvm->slots_lock) ||
+ srcu_read_lock_held(&kvm->srcu));
+
+ if (KVM_BUG_ON(!kvm_page_track_write_tracking_enabled(kvm), kvm))
+ return;
+
+ update_gfn_write_track(slot, gfn, 1);
+
+ /*
+	 * A new write-track entry prevents large page mapping for the
+	 * tracked page.
+ */
+ kvm_mmu_gfn_disallow_lpage(slot, gfn);
+
+ if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K))
+ kvm_flush_remote_tlbs(kvm);
+}
+
+void __kvm_write_track_remove_gfn(struct kvm *kvm,
+ struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ lockdep_assert_once(lockdep_is_held(&kvm->slots_lock) ||
+ srcu_read_lock_held(&kvm->srcu));
+
+ if (KVM_BUG_ON(!kvm_page_track_write_tracking_enabled(kvm), kvm))
+ return;
+
+ update_gfn_write_track(slot, gfn, -1);
+
+ /*
+	 * Allow large page mapping for the tracked page again
+	 * after the last tracker is gone.
+ */
+ kvm_mmu_gfn_allow_lpage(slot, gfn);
+}
+
+/*
+ * Check whether write access to the specified guest page is tracked.
+ */
+bool kvm_gfn_is_write_tracked(struct kvm *kvm,
+ const struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ int index;
+
+ if (!slot)
+ return false;
+
+ if (!kvm_page_track_write_tracking_enabled(kvm))
+ return false;
+
+ index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K);
+ return !!READ_ONCE(slot->arch.gfn_write_track[index]);
+}
+
+#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
+void kvm_page_track_cleanup(struct kvm *kvm)
+{
+ struct kvm_page_track_notifier_head *head;
+
+ head = &kvm->arch.track_notifier_head;
+ cleanup_srcu_struct(&head->track_srcu);
+}
+
+int kvm_page_track_init(struct kvm *kvm)
+{
+ struct kvm_page_track_notifier_head *head;
+
+ head = &kvm->arch.track_notifier_head;
+ INIT_HLIST_HEAD(&head->track_notifier_list);
+ return init_srcu_struct(&head->track_srcu);
+}
+
+static int kvm_enable_external_write_tracking(struct kvm *kvm)
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
+ int r = 0, i, bkt;
+
+ if (kvm->arch.vm_type == KVM_X86_TDX_VM)
+ return -EOPNOTSUPP;
+
+ mutex_lock(&kvm->slots_arch_lock);
+
+ /*
+ * Check for *any* write tracking user (not just external users) under
+ * lock. This avoids unnecessary work, e.g. if KVM itself is using
+ * write tracking, or if two external users raced when registering.
+ */
+ if (kvm_page_track_write_tracking_enabled(kvm))
+ goto out_success;
+
+ for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
+ slots = __kvm_memslots(kvm, i);
+ kvm_for_each_memslot(slot, bkt, slots) {
+ /*
+ * Intentionally do NOT free allocations on failure to
+ * avoid having to track which allocations were made
+ * now versus when the memslot was created. The
+ * metadata is guaranteed to be freed when the slot is
+ * freed, and will be kept/used if userspace retries
+ * the failed ioctl() instead of killing the VM.
+ */
+ r = kvm_page_track_write_tracking_alloc(slot);
+ if (r)
+ goto out_unlock;
+ }
+ }
+
+out_success:
+ /*
+ * Ensure that external_write_tracking_enabled becomes true strictly
+ * after all the related pointers are set.
+ */
+ smp_store_release(&kvm->arch.external_write_tracking_enabled, true);
+out_unlock:
+ mutex_unlock(&kvm->slots_arch_lock);
+ return r;
+}
+
+/*
+ * Register the notifier so that events for intercepted accesses to the
+ * tracked guest pages can be received.
+ */
+int kvm_page_track_register_notifier(struct kvm *kvm,
+ struct kvm_page_track_notifier_node *n)
+{
+ struct kvm_page_track_notifier_head *head;
+ int r;
+
+ if (!kvm || kvm->mm != current->mm)
+ return -ESRCH;
+
+ if (!kvm_external_write_tracking_enabled(kvm)) {
+ r = kvm_enable_external_write_tracking(kvm);
+ if (r)
+ return r;
+ }
+
+ kvm_get_kvm(kvm);
+
+ head = &kvm->arch.track_notifier_head;
+
+ write_lock(&kvm->mmu_lock);
+ hlist_add_head_rcu(&n->node, &head->track_notifier_list);
+ write_unlock(&kvm->mmu_lock);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_page_track_register_notifier);
+
+/*
+ * Stop receiving interception events. This is the inverse operation of
+ * kvm_page_track_register_notifier().
+ */
+void kvm_page_track_unregister_notifier(struct kvm *kvm,
+ struct kvm_page_track_notifier_node *n)
+{
+ struct kvm_page_track_notifier_head *head;
+
+ head = &kvm->arch.track_notifier_head;
+
+ write_lock(&kvm->mmu_lock);
+ hlist_del_rcu(&n->node);
+ write_unlock(&kvm->mmu_lock);
+ synchronize_srcu(&head->track_srcu);
+
+ kvm_put_kvm(kvm);
+}
+EXPORT_SYMBOL_GPL(kvm_page_track_unregister_notifier);
+
+/*
+ * Notify the nodes that a write access was intercepted and write emulation
+ * has finished.
+ *
+ * Each node must determine for itself whether the written page is one it is
+ * interested in.
+ */
+void __kvm_page_track_write(struct kvm *kvm, gpa_t gpa, const u8 *new, int bytes)
+{
+ struct kvm_page_track_notifier_head *head;
+ struct kvm_page_track_notifier_node *n;
+ int idx;
+
+ head = &kvm->arch.track_notifier_head;
+
+ if (hlist_empty(&head->track_notifier_list))
+ return;
+
+ idx = srcu_read_lock(&head->track_srcu);
+ hlist_for_each_entry_srcu(n, &head->track_notifier_list, node,
+ srcu_read_lock_held(&head->track_srcu))
+ if (n->track_write)
+ n->track_write(gpa, new, bytes, n);
+ srcu_read_unlock(&head->track_srcu, idx);
+}
+
+/*
+ * Notify external page track nodes that a memory region is being removed from
+ * the VM, e.g. so that users can free any associated metadata.
+ */
+void kvm_page_track_delete_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+ struct kvm_page_track_notifier_head *head;
+ struct kvm_page_track_notifier_node *n;
+ int idx;
+
+ head = &kvm->arch.track_notifier_head;
+
+ if (hlist_empty(&head->track_notifier_list))
+ return;
+
+ idx = srcu_read_lock(&head->track_srcu);
+ hlist_for_each_entry_srcu(n, &head->track_notifier_list, node,
+ srcu_read_lock_held(&head->track_srcu))
+ if (n->track_remove_region)
+ n->track_remove_region(slot->base_gfn, slot->npages, n);
+ srcu_read_unlock(&head->track_srcu, idx);
+}
+
+/*
+ * Add a guest page to the tracking pool so that writes to that page will be
+ * intercepted.
+ *
+ * @kvm: the guest instance we are interested in.
+ * @gfn: the guest page.
+ */
+int kvm_write_track_add_gfn(struct kvm *kvm, gfn_t gfn)
+{
+ struct kvm_memory_slot *slot;
+ int idx;
+
+ idx = srcu_read_lock(&kvm->srcu);
+
+ slot = gfn_to_memslot(kvm, gfn);
+ if (!slot) {
+ srcu_read_unlock(&kvm->srcu, idx);
+ return -EINVAL;
+ }
+
+ write_lock(&kvm->mmu_lock);
+ __kvm_write_track_add_gfn(kvm, slot, gfn);
+ write_unlock(&kvm->mmu_lock);
+
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_write_track_add_gfn);
+
+/*
+ * Remove the guest page from the tracking pool, which stops the interception
+ * of writes to that page.
+ *
+ * @kvm: the guest instance we are interested in.
+ * @gfn: the guest page.
+ */
+int kvm_write_track_remove_gfn(struct kvm *kvm, gfn_t gfn)
+{
+ struct kvm_memory_slot *slot;
+ int idx;
+
+ idx = srcu_read_lock(&kvm->srcu);
+
+ slot = gfn_to_memslot(kvm, gfn);
+ if (!slot) {
+ srcu_read_unlock(&kvm->srcu, idx);
+ return -EINVAL;
+ }
+
+ write_lock(&kvm->mmu_lock);
+ __kvm_write_track_remove_gfn(kvm, slot, gfn);
+ write_unlock(&kvm->mmu_lock);
+
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_write_track_remove_gfn);
+#endif
diff --git a/arch/x86/kvm/mmu/page_track.h b/arch/x86/kvm/mmu/page_track.h
new file mode 100644
index 000000000000..d4d72ed999b1
--- /dev/null
+++ b/arch/x86/kvm/mmu/page_track.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_PAGE_TRACK_H
+#define __KVM_X86_PAGE_TRACK_H
+
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_page_track.h>
+
+
+bool kvm_page_track_write_tracking_enabled(struct kvm *kvm);
+int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot);
+
+void kvm_page_track_free_memslot(struct kvm_memory_slot *slot);
+int kvm_page_track_create_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ unsigned long npages);
+
+void __kvm_write_track_add_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
+ gfn_t gfn);
+void __kvm_write_track_remove_gfn(struct kvm *kvm,
+ struct kvm_memory_slot *slot, gfn_t gfn);
+
+bool kvm_gfn_is_write_tracked(struct kvm *kvm,
+ const struct kvm_memory_slot *slot, gfn_t gfn);
+
+#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
+int kvm_page_track_init(struct kvm *kvm);
+void kvm_page_track_cleanup(struct kvm *kvm);
+
+void __kvm_page_track_write(struct kvm *kvm, gpa_t gpa, const u8 *new, int bytes);
+void kvm_page_track_delete_slot(struct kvm *kvm, struct kvm_memory_slot *slot);
+
+static inline bool kvm_page_track_has_external_user(struct kvm *kvm)
+{
+ return !hlist_empty(&kvm->arch.track_notifier_head.track_notifier_list);
+}
+#else
+static inline int kvm_page_track_init(struct kvm *kvm) { return 0; }
+static inline void kvm_page_track_cleanup(struct kvm *kvm) { }
+
+static inline void __kvm_page_track_write(struct kvm *kvm, gpa_t gpa,
+ const u8 *new, int bytes) { }
+static inline void kvm_page_track_delete_slot(struct kvm *kvm,
+ struct kvm_memory_slot *slot) { }
+
+static inline bool kvm_page_track_has_external_user(struct kvm *kvm) { return false; }
+
+#endif /* CONFIG_KVM_EXTERNAL_WRITE_TRACKING */
+
+static inline void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+ const u8 *new, int bytes)
+{
+ __kvm_page_track_write(vcpu->kvm, gpa, new, bytes);
+
+ kvm_mmu_track_write(vcpu, gpa, new, bytes);
+}
+
+#endif /* __KVM_X86_PAGE_TRACK_H */
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
new file mode 100644
index 000000000000..901cd2bd40b8
--- /dev/null
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -0,0 +1,983 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * MMU support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ */
+
+/*
+ * The MMU needs to be able to access/walk 32-bit and 64-bit guest page tables,
+ * as well as guest EPT tables, so the code in this file is compiled thrice,
+ * once per guest PTE type. The per-type defines are #undef'd at the end.
+ */
+
+#if PTTYPE == 64
+ #define pt_element_t u64
+ #define guest_walker guest_walker64
+ #define FNAME(name) paging##64_##name
+ #define PT_LEVEL_BITS 9
+ #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
+ #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
+ #define PT_HAVE_ACCESSED_DIRTY(mmu) true
+ #ifdef CONFIG_X86_64
+ #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
+ #else
+ #define PT_MAX_FULL_LEVELS 2
+ #endif
+#elif PTTYPE == 32
+ #define pt_element_t u32
+ #define guest_walker guest_walker32
+ #define FNAME(name) paging##32_##name
+ #define PT_LEVEL_BITS 10
+ #define PT_MAX_FULL_LEVELS 2
+ #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
+ #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
+ #define PT_HAVE_ACCESSED_DIRTY(mmu) true
+
+ #define PT32_DIR_PSE36_SIZE 4
+ #define PT32_DIR_PSE36_SHIFT 13
+ #define PT32_DIR_PSE36_MASK \
+ (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
+#elif PTTYPE == PTTYPE_EPT
+ #define pt_element_t u64
+ #define guest_walker guest_walkerEPT
+ #define FNAME(name) ept_##name
+ #define PT_LEVEL_BITS 9
+ #define PT_GUEST_DIRTY_SHIFT 9
+ #define PT_GUEST_ACCESSED_SHIFT 8
+ #define PT_HAVE_ACCESSED_DIRTY(mmu) (!(mmu)->cpu_role.base.ad_disabled)
+ #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
+#else
+ #error Invalid PTTYPE value
+#endif
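+
+/*
+ * Illustrative include pattern (mirrors how mmu.c instantiates this
+ * template): each inclusion compiles one walker flavor, e.g.:
+ *
+ *	#define PTTYPE_EPT 18
+ *	#define PTTYPE PTTYPE_EPT
+ *	#include "paging_tmpl.h"
+ *	#undef PTTYPE
+ *
+ *	#define PTTYPE 64
+ *	#include "paging_tmpl.h"
+ *	#undef PTTYPE
+ */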
+
+/* Common logic, but per-type values. These also need to be undefined. */
+#define PT_BASE_ADDR_MASK ((pt_element_t)__PT_BASE_ADDR_MASK)
+#define PT_LVL_ADDR_MASK(lvl) __PT_LVL_ADDR_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS)
+#define PT_LVL_OFFSET_MASK(lvl) __PT_LVL_OFFSET_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS)
+#define PT_INDEX(addr, lvl) __PT_INDEX(addr, lvl, PT_LEVEL_BITS)
+
+#define PT_GUEST_DIRTY_MASK (1 << PT_GUEST_DIRTY_SHIFT)
+#define PT_GUEST_ACCESSED_MASK (1 << PT_GUEST_ACCESSED_SHIFT)
+
+#define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl)
+#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PG_LEVEL_4K)
+
+/*
+ * The guest_walker structure emulates the behavior of the hardware page
+ * table walker.
+ */
+struct guest_walker {
+ int level;
+ unsigned max_level;
+ gfn_t table_gfn[PT_MAX_FULL_LEVELS];
+ pt_element_t ptes[PT_MAX_FULL_LEVELS];
+ pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
+ gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
+ pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS];
+ bool pte_writable[PT_MAX_FULL_LEVELS];
+ unsigned int pt_access[PT_MAX_FULL_LEVELS];
+ unsigned int pte_access;
+ gfn_t gfn;
+ struct x86_exception fault;
+};
+
+#if PTTYPE == 32
+static inline gfn_t pse36_gfn_delta(u32 gpte)
+{
+ int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
+
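+	/*
+	 * E.g. PDE bits 16:13 hold PA bits 35:32 of a 4MiB page; shift == 7
+	 * moves them to GFN bits 23:20, i.e. back to PA bits 35:32.
+	 */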
+ return (gpte & PT32_DIR_PSE36_MASK) << shift;
+}
+#endif
+
+static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
+{
+ return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
+}
+
+static inline void FNAME(protect_clean_gpte)(struct kvm_mmu *mmu, unsigned *access,
+ unsigned gpte)
+{
+ unsigned mask;
+
+ /* dirty bit is not supported, so no need to track it */
+ if (!PT_HAVE_ACCESSED_DIRTY(mmu))
+ return;
+
+ BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
+
+ mask = (unsigned)~ACC_WRITE_MASK;
+ /* Allow write access to dirty gptes */
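+	/*
+	 * E.g. with 64-bit gptes the Dirty bit (bit 6) is shifted down to the
+	 * Writable bit position (bit 1), so write permission is retained only
+	 * if the gpte is already dirty.
+	 */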
+ mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) &
+ PT_WRITABLE_MASK;
+ *access &= mask;
+}
+
+static inline int FNAME(is_present_gpte)(unsigned long pte)
+{
+#if PTTYPE != PTTYPE_EPT
+ return pte & PT_PRESENT_MASK;
+#else
+ return pte & 7;
+#endif
+}
+
+static bool FNAME(is_bad_mt_xwr)(struct rsvd_bits_validate *rsvd_check, u64 gpte)
+{
+#if PTTYPE != PTTYPE_EPT
+ return false;
+#else
+ return __is_bad_mt_xwr(rsvd_check, gpte);
+#endif
+}
+
+static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
+{
+ return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level) ||
+ FNAME(is_bad_mt_xwr)(&mmu->guest_rsvd_check, gpte);
+}
+
+static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *spte,
+ u64 gpte)
+{
+ if (!FNAME(is_present_gpte)(gpte))
+ goto no_present;
+
+ /* Prefetch only accessed entries (unless A/D bits are disabled). */
+ if (PT_HAVE_ACCESSED_DIRTY(vcpu->arch.mmu) &&
+ !(gpte & PT_GUEST_ACCESSED_MASK))
+ goto no_present;
+
+ if (FNAME(is_rsvd_bits_set)(vcpu->arch.mmu, gpte, PG_LEVEL_4K))
+ goto no_present;
+
+ return false;
+
+no_present:
+ drop_spte(vcpu->kvm, spte);
+ return true;
+}
+
+/*
+ * For PTTYPE_EPT, a page table can be executable but not readable
+ * on supported processors. Therefore, set_spte does not automatically
+ * set bit 0 if execute only is supported. Here, we repurpose ACC_USER_MASK
+ * to signify readability since it isn't used in the EPT case.
+ */
+static inline unsigned FNAME(gpte_access)(u64 gpte)
+{
+ unsigned access;
+#if PTTYPE == PTTYPE_EPT
+ access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
+ ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
+ ((gpte & VMX_EPT_READABLE_MASK) ? ACC_USER_MASK : 0);
+#else
+ BUILD_BUG_ON(ACC_EXEC_MASK != PT_PRESENT_MASK);
+ BUILD_BUG_ON(ACC_EXEC_MASK != 1);
+ access = gpte & (PT_WRITABLE_MASK | PT_USER_MASK | PT_PRESENT_MASK);
+ /* Combine NX with P (which is set here) to get ACC_EXEC_MASK. */
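+	/*
+	 * E.g. a present gpte with NX set yields bit 0 == 1 ^ 1 == 0 (no
+	 * execute), while a present gpte with NX clear yields bit 0 == 1 ^ 0,
+	 * i.e. ACC_EXEC_MASK.
+	 */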
+ access ^= (gpte >> PT64_NX_SHIFT);
+#endif
+
+ return access;
+}
+
+static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu,
+ struct guest_walker *walker,
+ gpa_t addr, int write_fault)
+{
+ unsigned level, index;
+ pt_element_t pte, orig_pte;
+ pt_element_t __user *ptep_user;
+ gfn_t table_gfn;
+ int ret;
+
+ /* dirty/accessed bits are not supported, so no need to update them */
+ if (!PT_HAVE_ACCESSED_DIRTY(mmu))
+ return 0;
+
+ for (level = walker->max_level; level >= walker->level; --level) {
+ pte = orig_pte = walker->ptes[level - 1];
+ table_gfn = walker->table_gfn[level - 1];
+ ptep_user = walker->ptep_user[level - 1];
+ index = offset_in_page(ptep_user) / sizeof(pt_element_t);
+ if (!(pte & PT_GUEST_ACCESSED_MASK)) {
+ trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte));
+ pte |= PT_GUEST_ACCESSED_MASK;
+ }
+ if (level == walker->level && write_fault &&
+ !(pte & PT_GUEST_DIRTY_MASK)) {
+ trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
+#if PTTYPE == PTTYPE_EPT
+ if (kvm_x86_ops.nested_ops->write_log_dirty(vcpu, addr))
+ return -EINVAL;
+#endif
+ pte |= PT_GUEST_DIRTY_MASK;
+ }
+ if (pte == orig_pte)
+ continue;
+
+ /*
+ * If the slot is read-only, simply do not process the accessed
+ * and dirty bits. This is the correct thing to do if the slot
+ * is ROM, and page tables in read-as-ROM/write-as-MMIO slots
+ * are only supported if the accessed and dirty bits are already
+ * set in the ROM (so that MMIO writes are never needed).
+ *
+ * Note that NPT does not allow this at all and faults, since
+ * it always wants nested page table entries for the guest
+ * page tables to be writable. And EPT works but will simply
+ * overwrite the read-only memory to set the accessed and dirty
+ * bits.
+ */
+ if (unlikely(!walker->pte_writable[level - 1]))
+ continue;
+
+ ret = __try_cmpxchg_user(ptep_user, &orig_pte, pte, fault);
+ if (ret)
+ return ret;
+
+ kvm_vcpu_mark_page_dirty(vcpu, table_gfn);
+ walker->ptes[level - 1] = pte;
+ }
+ return 0;
+}
+
+static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte)
+{
+ unsigned pkeys = 0;
+#if PTTYPE == 64
+ pte_t pte = {.pte = gpte};
+
+ pkeys = pte_flags_pkey(pte_flags(pte));
+#endif
+ return pkeys;
+}
+
+static inline bool FNAME(is_last_gpte)(struct kvm_mmu *mmu,
+ unsigned int level, unsigned int gpte)
+{
+ /*
+ * For EPT and PAE paging (both variants), bit 7 is either reserved at
+	 * all levels or indicates a huge page (ignoring CR3/EPTP). In either
+ * case, bit 7 being set terminates the walk.
+ */
+#if PTTYPE == 32
+ /*
+ * 32-bit paging requires special handling because bit 7 is ignored if
+ * CR4.PSE=0, not reserved. Clear bit 7 in the gpte if the level is
+ * greater than the last level for which bit 7 is the PAGE_SIZE bit.
+ *
+ * The RHS has bit 7 set iff level < (2 + PSE). If it is clear, bit 7
+ * is not reserved and does not indicate a large page at this level,
+ * so clear PT_PAGE_SIZE_MASK in gpte if that is the case.
+ */
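+	/*
+	 * E.g. for a PDE (level == 2): with CR4.PSE=1 the RHS is 2 - 3, i.e.
+	 * all ones, so a set PS bit survives the AND; with CR4.PSE=0 the RHS
+	 * is 0 and bit 7 is cleared so it can't spuriously end the walk.
+	 */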
+ gpte &= level - (PT32_ROOT_LEVEL + mmu->cpu_role.ext.cr4_pse);
+#endif
+ /*
+ * PG_LEVEL_4K always terminates. The RHS has bit 7 set
+ * iff level <= PG_LEVEL_4K, which for our purpose means
+ * level == PG_LEVEL_4K; set PT_PAGE_SIZE_MASK in gpte then.
+ */
+ gpte |= level - PG_LEVEL_4K - 1;
+
+ return gpte & PT_PAGE_SIZE_MASK;
+}
+/*
+ * Fetch a guest pte for a guest virtual address, or for an L2's GPA.
+ */
+static int FNAME(walk_addr_generic)(struct guest_walker *walker,
+ struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ gpa_t addr, u64 access)
+{
+ int ret;
+ pt_element_t pte;
+ pt_element_t __user *ptep_user;
+ gfn_t table_gfn;
+ u64 pt_access, pte_access;
+ unsigned index, accessed_dirty, pte_pkey;
+ u64 nested_access;
+ gpa_t pte_gpa;
+ bool have_ad;
+ int offset;
+ u64 walk_nx_mask = 0;
+ const int write_fault = access & PFERR_WRITE_MASK;
+ const int user_fault = access & PFERR_USER_MASK;
+ const int fetch_fault = access & PFERR_FETCH_MASK;
+ u16 errcode = 0;
+ gpa_t real_gpa;
+ gfn_t gfn;
+
+ trace_kvm_mmu_pagetable_walk(addr, access);
+retry_walk:
+ walker->level = mmu->cpu_role.base.level;
+ pte = kvm_mmu_get_guest_pgd(vcpu, mmu);
+ have_ad = PT_HAVE_ACCESSED_DIRTY(mmu);
+
+#if PTTYPE == 64
+ walk_nx_mask = 1ULL << PT64_NX_SHIFT;
+ if (walker->level == PT32E_ROOT_LEVEL) {
+ pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
+ trace_kvm_mmu_paging_element(pte, walker->level);
+ if (!FNAME(is_present_gpte)(pte))
+ goto error;
+ --walker->level;
+ }
+#endif
+ walker->max_level = walker->level;
+
+ /*
+ * FIXME: on Intel processors, loads of the PDPTE registers for PAE paging
+ * by the MOV to CR instruction are treated as reads and do not cause the
+ * processor to set the dirty flag in any EPT paging-structure entry.
+ */
+ nested_access = (have_ad ? PFERR_WRITE_MASK : 0) | PFERR_USER_MASK;
+
+ pte_access = ~0;
+
+ /*
+ * Queue a page fault for injection if this assertion fails, as callers
+ * assume that walker.fault contains sane info on a walk failure. I.e.
+ * avoid making the situation worse by inducing even worse badness
+ * between when the assertion fails and when KVM kicks the vCPU out to
+ * userspace (because the VM is bugged).
+ */
+ if (KVM_BUG_ON(is_long_mode(vcpu) && !is_pae(vcpu), vcpu->kvm))
+ goto error;
+
+ ++walker->level;
+
+ do {
+ struct kvm_memory_slot *slot;
+ unsigned long host_addr;
+
+ pt_access = pte_access;
+ --walker->level;
+
+ index = PT_INDEX(addr, walker->level);
+ table_gfn = gpte_to_gfn(pte);
+ offset = index * sizeof(pt_element_t);
+ pte_gpa = gfn_to_gpa(table_gfn) + offset;
+
+ BUG_ON(walker->level < 1);
+ walker->table_gfn[walker->level - 1] = table_gfn;
+ walker->pte_gpa[walker->level - 1] = pte_gpa;
+
+ real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(table_gfn),
+ nested_access, &walker->fault);
+
+ /*
+	 * FIXME: This can happen if emulation (e.g. of an INS/OUTS
+ * instruction) triggers a nested page fault. The exit
+ * qualification / exit info field will incorrectly have
+ * "guest page access" as the nested page fault's cause,
+ * instead of "guest page structure access". To fix this,
+ * the x86_exception struct should be augmented with enough
+ * information to fix the exit_qualification or exit_info_1
+ * fields.
+ */
+ if (unlikely(real_gpa == INVALID_GPA))
+ return 0;
+
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gpa_to_gfn(real_gpa));
+ if (!kvm_is_visible_memslot(slot))
+ goto error;
+
+ host_addr = gfn_to_hva_memslot_prot(slot, gpa_to_gfn(real_gpa),
+ &walker->pte_writable[walker->level - 1]);
+ if (unlikely(kvm_is_error_hva(host_addr)))
+ goto error;
+
+ ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
+ if (unlikely(get_user(pte, ptep_user)))
+ goto error;
+ walker->ptep_user[walker->level - 1] = ptep_user;
+
+ trace_kvm_mmu_paging_element(pte, walker->level);
+
+ /*
+		 * Inverting NX lets us AND it like the other
+ * permission bits.
+ */
+ pte_access = pt_access & (pte ^ walk_nx_mask);
+
+ if (unlikely(!FNAME(is_present_gpte)(pte)))
+ goto error;
+
+ if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte, walker->level))) {
+ errcode = PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
+ goto error;
+ }
+
+ walker->ptes[walker->level - 1] = pte;
+
+ /* Convert to ACC_*_MASK flags for struct guest_walker. */
+ walker->pt_access[walker->level - 1] = FNAME(gpte_access)(pt_access ^ walk_nx_mask);
+ } while (!FNAME(is_last_gpte)(mmu, walker->level, pte));
+
+ pte_pkey = FNAME(gpte_pkeys)(vcpu, pte);
+ accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0;
+
+ /* Convert to ACC_*_MASK flags for struct guest_walker. */
+ walker->pte_access = FNAME(gpte_access)(pte_access ^ walk_nx_mask);
+ errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access);
+ if (unlikely(errcode))
+ goto error;
+
+ gfn = gpte_to_gfn_lvl(pte, walker->level);
+ gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;
+
+#if PTTYPE == 32
+ if (walker->level > PG_LEVEL_4K && is_cpuid_PSE36())
+ gfn += pse36_gfn_delta(pte);
+#endif
+
+ real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(gfn), access, &walker->fault);
+ if (real_gpa == INVALID_GPA)
+ return 0;
+
+ walker->gfn = real_gpa >> PAGE_SHIFT;
+
+ if (!write_fault)
+ FNAME(protect_clean_gpte)(mmu, &walker->pte_access, pte);
+ else
+ /*
+ * On a write fault, fold the dirty bit into accessed_dirty.
+ * For modes without A/D bits support accessed_dirty will be
+ * always clear.
+ */
+ accessed_dirty &= pte >>
+ (PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);
+
+ if (unlikely(!accessed_dirty)) {
+ ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker,
+ addr, write_fault);
+ if (unlikely(ret < 0))
+ goto error;
+ else if (ret)
+ goto retry_walk;
+ }
+
+ return 1;
+
+error:
+ errcode |= write_fault | user_fault;
+ if (fetch_fault && (is_efer_nx(mmu) || is_cr4_smep(mmu)))
+ errcode |= PFERR_FETCH_MASK;
+
+ walker->fault.vector = PF_VECTOR;
+ walker->fault.error_code_valid = true;
+ walker->fault.error_code = errcode;
+
+#if PTTYPE == PTTYPE_EPT
+ /*
+ * Use PFERR_RSVD_MASK in error_code to tell if EPT
+ * misconfiguration requires to be injected. The detection is
+ * done by is_rsvd_bits_set() above.
+ *
+ * We set up the value of exit_qualification to inject:
+	 * [2:0] - Derived from the access bits. The exit_qualification might be
+ * out of date if it is serving an EPT misconfiguration.
+ * [5:3] - Calculated by the page walk of the guest EPT page tables
+ * [7:8] - Derived from [7:8] of real exit_qualification
+ *
+ * The other bits are set to 0.
+ */
+ if (!(errcode & PFERR_RSVD_MASK)) {
+ walker->fault.exit_qualification = 0;
+
+ if (write_fault)
+ walker->fault.exit_qualification |= EPT_VIOLATION_ACC_WRITE;
+ if (user_fault)
+ walker->fault.exit_qualification |= EPT_VIOLATION_ACC_READ;
+ if (fetch_fault)
+ walker->fault.exit_qualification |= EPT_VIOLATION_ACC_INSTR;
+
+ /*
+ * Note, pte_access holds the raw RWX bits from the EPTE, not
+ * ACC_*_MASK flags!
+ */
+ walker->fault.exit_qualification |= EPT_VIOLATION_RWX_TO_PROT(pte_access);
+ }
+#endif
+ walker->fault.address = addr;
+ walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
+ walker->fault.async_page_fault = false;
+
+ trace_kvm_mmu_walker_error(walker->fault.error_code);
+ return 0;
+}
+
+static int FNAME(walk_addr)(struct guest_walker *walker,
+ struct kvm_vcpu *vcpu, gpa_t addr, u64 access)
+{
+ return FNAME(walk_addr_generic)(walker, vcpu, vcpu->arch.mmu, addr,
+ access);
+}
+
+static bool
+FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ u64 *spte, pt_element_t gpte)
+{
+ unsigned pte_access;
+ gfn_t gfn;
+
+ if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
+ return false;
+
+ gfn = gpte_to_gfn(gpte);
+ pte_access = sp->role.access & FNAME(gpte_access)(gpte);
+ FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
+
+ return kvm_mmu_prefetch_sptes(vcpu, gfn, spte, 1, pte_access);
+}
+
+static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
+ struct guest_walker *gw, int level)
+{
+ pt_element_t curr_pte;
+ gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1];
+ u64 mask;
+ int r, index;
+
+ if (level == PG_LEVEL_4K) {
+ mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;
+ base_gpa = pte_gpa & ~mask;
+ index = (pte_gpa - base_gpa) / sizeof(pt_element_t);
+
+ r = kvm_vcpu_read_guest_atomic(vcpu, base_gpa,
+ gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
+ curr_pte = gw->prefetch_ptes[index];
+ } else
+ r = kvm_vcpu_read_guest_atomic(vcpu, pte_gpa,
+ &curr_pte, sizeof(curr_pte));
+
+ return r || curr_pte != gw->ptes[level - 1];
+}
+
+static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
+ u64 *sptep)
+{
+ struct kvm_mmu_page *sp;
+ pt_element_t *gptep = gw->prefetch_ptes;
+ u64 *spte;
+ int i;
+
+ sp = sptep_to_sp(sptep);
+
+ if (sp->role.level > PG_LEVEL_4K)
+ return;
+
+ /*
+ * If addresses are being invalidated, skip prefetching to avoid
+ * accidentally prefetching those addresses.
+ */
+ if (unlikely(vcpu->kvm->mmu_invalidate_in_progress))
+ return;
+
+ if (sp->role.direct)
+ return __direct_pte_prefetch(vcpu, sp, sptep);
+
+ i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1);
+ spte = sp->spt + i;
+
+ for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
+ if (spte == sptep)
+ continue;
+
+ if (is_shadow_present_pte(*spte))
+ continue;
+
+ if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i]))
+ break;
+ }
+}
+
+/*
+ * Fetch a shadow pte for a specific level in the paging hierarchy.
+ * If the guest tries to write a write-protected page, we need to
+ * emulate this operation, return 1 to indicate this case.
+ */
+static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
+ struct guest_walker *gw)
+{
+ struct kvm_mmu_page *sp = NULL;
+ struct kvm_shadow_walk_iterator it;
+ unsigned int direct_access, access;
+ int top_level, ret;
+ gfn_t base_gfn = fault->gfn;
+
+ WARN_ON_ONCE(gw->gfn != base_gfn);
+ direct_access = gw->pte_access;
+
+ top_level = vcpu->arch.mmu->cpu_role.base.level;
+ if (top_level == PT32E_ROOT_LEVEL)
+ top_level = PT32_ROOT_LEVEL;
+ /*
+ * Verify that the top-level gpte is still there. Since the page
+ * is a root page, it is either write protected (and cannot be
+ * changed from now on) or it is invalid (in which case, we don't
+ * really care if it changes underneath us after this point).
+ */
+ if (FNAME(gpte_changed)(vcpu, gw, top_level))
+ return RET_PF_RETRY;
+
+ if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
+ return RET_PF_RETRY;
+
+ /*
+ * Load a new root and retry the faulting instruction in the extremely
+ * unlikely scenario that the guest root gfn became visible between
+ * loading a dummy root and handling the resulting page fault, e.g. if
+	 * userspace creates a memslot in the interim.
+ */
+ if (unlikely(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa))) {
+ kvm_make_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu);
+ return RET_PF_RETRY;
+ }
+
+ for_each_shadow_entry(vcpu, fault->addr, it) {
+ gfn_t table_gfn;
+
+ clear_sp_write_flooding_count(it.sptep);
+ if (it.level == gw->level)
+ break;
+
+ table_gfn = gw->table_gfn[it.level - 2];
+ access = gw->pt_access[it.level - 2];
+ sp = kvm_mmu_get_child_sp(vcpu, it.sptep, table_gfn,
+ false, access);
+
+ /*
+ * Synchronize the new page before linking it, as the CPU (KVM)
+ * is architecturally disallowed from inserting non-present
+ * entries into the TLB, i.e. the guest isn't required to flush
+ * the TLB when changing the gPTE from non-present to present.
+ *
+ * For PG_LEVEL_4K, kvm_mmu_find_shadow_page() has already
+ * synchronized the page via kvm_sync_page().
+ *
+ * For higher level pages, which cannot be unsync themselves
+ * but can have unsync children, synchronize via the slower
+ * mmu_sync_children(). If KVM needs to drop mmu_lock due to
+ * contention or to reschedule, instruct the caller to retry
+ * the #PF (mmu_sync_children() ensures forward progress will
+ * be made).
+ */
+ if (sp != ERR_PTR(-EEXIST) && sp->unsync_children &&
+ mmu_sync_children(vcpu, sp, false))
+ return RET_PF_RETRY;
+
+ /*
+ * Verify that the gpte in the page, which is now either
+ * write-protected or unsync, wasn't modified between the fault
+ * and acquiring mmu_lock. This needs to be done even when
+ * reusing an existing shadow page to ensure the information
+ * gathered by the walker matches the information stored in the
+ * shadow page (which could have been modified by a different
+ * vCPU even if the page was already linked). Holding mmu_lock
+ * prevents the shadow page from changing after this point.
+ */
+ if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
+ return RET_PF_RETRY;
+
+ if (sp != ERR_PTR(-EEXIST))
+ link_shadow_page(vcpu, it.sptep, sp);
+
+ if (fault->write && table_gfn == fault->gfn)
+ fault->write_fault_to_shadow_pgtable = true;
+ }
+
+ /*
+ * Adjust the hugepage size _after_ resolving indirect shadow pages.
+ * KVM doesn't support mapping hugepages into the guest for gfns that
+ * are being shadowed by KVM, i.e. allocating a new shadow page may
+ * affect the allowed hugepage size.
+ */
+ kvm_mmu_hugepage_adjust(vcpu, fault);
+
+ trace_kvm_mmu_spte_requested(fault);
+
+ for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
+ /*
+ * We cannot overwrite existing page tables with an NX
+ * large page, as the leaf could be executable.
+ */
+ if (fault->nx_huge_page_workaround_enabled)
+ disallowed_hugepage_adjust(fault, *it.sptep, it.level);
+
+ base_gfn = gfn_round_for_level(fault->gfn, it.level);
+ if (it.level == fault->goal_level)
+ break;
+
+ validate_direct_spte(vcpu, it.sptep, direct_access);
+
+ sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn,
+ true, direct_access);
+ if (sp == ERR_PTR(-EEXIST))
+ continue;
+
+ link_shadow_page(vcpu, it.sptep, sp);
+ if (fault->huge_page_disallowed)
+ account_nx_huge_page(vcpu->kvm, sp,
+ fault->req_level >= it.level);
+ }
+
+ if (WARN_ON_ONCE(it.level != fault->goal_level))
+ return -EFAULT;
+
+ ret = mmu_set_spte(vcpu, fault->slot, it.sptep, gw->pte_access,
+ base_gfn, fault->pfn, fault);
+ if (ret == RET_PF_SPURIOUS)
+ return ret;
+
+ FNAME(pte_prefetch)(vcpu, gw, it.sptep);
+ return ret;
+}
+
+/*
+ * Page fault handler. There are several causes for a page fault:
+ * - there is no shadow pte for the guest pte
+ * - write access through a shadow pte marked read only so that we can set
+ * the dirty bit
+ * - write access to a shadow pte marked read only so we can update the page
+ * dirty bitmap, when userspace requests it
+ * - mmio access; in this case we will never install a present shadow pte
+ * - normal guest page fault due to the guest pte marked not present, not
+ * writable, or not executable
+ *
+ * Returns: 1 if we need to emulate the instruction, 0 otherwise, or
+ * a negative value on error.
+ */
+static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+{
+ struct guest_walker walker;
+ int r;
+
+ WARN_ON_ONCE(fault->is_tdp);
+
+ /*
+ * Look up the guest pte for the faulting address.
+ * If PFEC.RSVD is set, this is a shadow page fault.
+ * The bit needs to be cleared before walking guest page tables.
+ */
+ r = FNAME(walk_addr)(&walker, vcpu, fault->addr,
+ fault->error_code & ~PFERR_RSVD_MASK);
+
+ /*
+ * The page is not mapped by the guest. Let the guest handle it.
+ */
+ if (!r) {
+ if (!fault->prefetch)
+ kvm_inject_emulated_page_fault(vcpu, &walker.fault);
+
+ return RET_PF_RETRY;
+ }
+
+ fault->gfn = walker.gfn;
+ fault->max_level = walker.level;
+ fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn);
+
+ if (page_fault_handle_page_track(vcpu, fault)) {
+ shadow_page_table_clear_flood(vcpu, fault->addr);
+ return RET_PF_WRITE_PROTECTED;
+ }
+
+ r = mmu_topup_memory_caches(vcpu, true);
+ if (r)
+ return r;
+
+ r = kvm_mmu_faultin_pfn(vcpu, fault, walker.pte_access);
+ if (r != RET_PF_CONTINUE)
+ return r;
+
+#if PTTYPE != PTTYPE_EPT
+ /*
+ * Treat the guest PTE protections as writable, supervisor-only if this
+ * is a supervisor write fault and CR0.WP=0 (supervisor accesses ignore
+ * PTE.W if CR0.WP=0). Don't change the access type for emulated MMIO,
+ * otherwise KVM will cache incorrect access information in the SPTE.
+ */
+ if (fault->write && !(walker.pte_access & ACC_WRITE_MASK) &&
+ !is_cr0_wp(vcpu->arch.mmu) && !fault->user && fault->slot) {
+ walker.pte_access |= ACC_WRITE_MASK;
+ walker.pte_access &= ~ACC_USER_MASK;
+
+ /*
+ * If we converted a user page to a kernel page,
+ * so that the kernel can write to it when cr0.wp=0,
+ * then we should prevent the kernel from executing it
+ * if SMEP is enabled.
+ */
+ if (is_cr4_smep(vcpu->arch.mmu))
+ walker.pte_access &= ~ACC_EXEC_MASK;
+ }
+#endif
+
+ r = RET_PF_RETRY;
+ write_lock(&vcpu->kvm->mmu_lock);
+
+ if (is_page_fault_stale(vcpu, fault))
+ goto out_unlock;
+
+ r = make_mmu_pages_available(vcpu);
+ if (r)
+ goto out_unlock;
+ r = FNAME(fetch)(vcpu, fault, &walker);
+
+out_unlock:
+ kvm_mmu_finish_page_fault(vcpu, fault, r);
+ write_unlock(&vcpu->kvm->mmu_lock);
+ return r;
+}
+
+static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
+{
+ int offset = 0;
+
+ WARN_ON_ONCE(sp->role.level != PG_LEVEL_4K);
+
+ if (PTTYPE == 32)
+ offset = sp->role.quadrant << SPTE_LEVEL_BITS;
+
+ return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
+}
+
+/* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */
+static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ gpa_t addr, u64 access,
+ struct x86_exception *exception)
+{
+ struct guest_walker walker;
+ gpa_t gpa = INVALID_GPA;
+ int r;
+
+#ifndef CONFIG_X86_64
+ /* A 64-bit GVA should be impossible on 32-bit KVM. */
+ WARN_ON_ONCE((addr >> 32) && mmu == vcpu->arch.walk_mmu);
+#endif
+
+ r = FNAME(walk_addr_generic)(&walker, vcpu, mmu, addr, access);
+
+ if (r) {
+ gpa = gfn_to_gpa(walker.gfn);
+ gpa |= addr & ~PAGE_MASK;
+ } else if (exception)
+ *exception = walker.fault;
+
+ return gpa;
+}
+
+/*
+ * Using the information in sp->shadowed_translation (kvm_mmu_page_get_gfn()) is
+ * safe because SPTEs are protected by mmu_notifiers and memslot generations, so
+ * the pfn for a given gfn can't change unless all SPTEs pointing to the gfn are
+ * nuked first.
+ *
+ * Returns
+ * < 0: failed to sync spte
+ * 0: the spte is synced and no tlb flushing is required
+ * > 0: the spte is synced and tlb flushing is required
+ */
+static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i)
+{
+ bool host_writable;
+ gpa_t first_pte_gpa;
+ u64 *sptep, spte;
+ struct kvm_memory_slot *slot;
+ unsigned pte_access;
+ pt_element_t gpte;
+ gpa_t pte_gpa;
+ gfn_t gfn;
+
+ if (WARN_ON_ONCE(sp->spt[i] == SHADOW_NONPRESENT_VALUE ||
+ !sp->shadowed_translation))
+ return 0;
+
+ first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
+ pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
+
+ if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
+ sizeof(pt_element_t)))
+ return -1;
+
+ if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte))
+ return 1;
+
+ gfn = gpte_to_gfn(gpte);
+ pte_access = sp->role.access;
+ pte_access &= FNAME(gpte_access)(gpte);
+ FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
+
+ if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access))
+ return 0;
+
+ /*
+ * Drop the SPTE if the new protections result in no effective
+ * "present" bit or if the gfn is changing. The former case
+ * only affects EPT with execute-only support with pte_access==0;
+ * all other paging modes will create a read-only SPTE if
+ * pte_access is zero.
+ */
+ if ((pte_access | shadow_present_mask) == SHADOW_NONPRESENT_VALUE ||
+ gfn != kvm_mmu_page_get_gfn(sp, i)) {
+ drop_spte(vcpu->kvm, &sp->spt[i]);
+ return 1;
+ }
+ /*
+ * Do nothing if the permissions are unchanged. The existing SPTE is
+	 * still valid, and prefetch_invalid_gpte() has verified that the A/D bits
+ * are set in the "new" gPTE, i.e. there is no danger of missing an A/D
+ * update due to A/D bits being set in the SPTE but not the gPTE.
+ */
+ if (kvm_mmu_page_get_access(sp, i) == pte_access)
+ return 0;
+
+ /* Update the shadowed access bits in case they changed. */
+ kvm_mmu_page_set_access(sp, i, pte_access);
+
+ sptep = &sp->spt[i];
+ spte = *sptep;
+ host_writable = spte & shadow_host_writable_mask;
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ make_spte(vcpu, sp, slot, pte_access, gfn,
+ spte_to_pfn(spte), spte, true, true,
+ host_writable, &spte);
+
+ /*
+ * There is no need to mark the pfn dirty, as the new protections must
+ * be a subset of the old protections, i.e. synchronizing a SPTE cannot
+ * change the SPTE from read-only to writable.
+ */
+ return mmu_spte_update(sptep, spte);
+}
+
+#undef pt_element_t
+#undef guest_walker
+#undef FNAME
+#undef PT_BASE_ADDR_MASK
+#undef PT_INDEX
+#undef PT_LVL_ADDR_MASK
+#undef PT_LVL_OFFSET_MASK
+#undef PT_LEVEL_BITS
+#undef PT_MAX_FULL_LEVELS
+#undef gpte_to_gfn
+#undef gpte_to_gfn_lvl
+#undef PT_GUEST_ACCESSED_MASK
+#undef PT_GUEST_DIRTY_MASK
+#undef PT_GUEST_DIRTY_SHIFT
+#undef PT_GUEST_ACCESSED_SHIFT
+#undef PT_HAVE_ACCESSED_DIRTY
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
new file mode 100644
index 000000000000..85a0473809b0
--- /dev/null
+++ b/arch/x86/kvm/mmu/spte.c
@@ -0,0 +1,576 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * Macros and functions to access KVM PTEs (also known as SPTEs)
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2020 Red Hat, Inc. and/or its affiliates.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_host.h>
+#include "mmu.h"
+#include "mmu_internal.h"
+#include "x86.h"
+#include "spte.h"
+
+#include <asm/e820/api.h>
+#include <asm/memtype.h>
+#include <asm/vmx.h>
+
+bool __read_mostly enable_mmio_caching = true;
+static bool __ro_after_init allow_mmio_caching;
+module_param_named(mmio_caching, enable_mmio_caching, bool, 0444);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_mmio_caching);
+
+bool __read_mostly kvm_ad_enabled;
+
+u64 __read_mostly shadow_host_writable_mask;
+u64 __read_mostly shadow_mmu_writable_mask;
+u64 __read_mostly shadow_nx_mask;
+u64 __read_mostly shadow_x_mask; /* mutually exclusive with nx_mask */
+u64 __read_mostly shadow_user_mask;
+u64 __read_mostly shadow_accessed_mask;
+u64 __read_mostly shadow_dirty_mask;
+u64 __read_mostly shadow_mmio_value;
+u64 __read_mostly shadow_mmio_mask;
+u64 __read_mostly shadow_mmio_access_mask;
+u64 __read_mostly shadow_present_mask;
+u64 __read_mostly shadow_me_value;
+u64 __read_mostly shadow_me_mask;
+u64 __read_mostly shadow_acc_track_mask;
+
+u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
+u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
+
+static u8 __init kvm_get_host_maxphyaddr(void)
+{
+ /*
+ * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected
+ * in CPU detection code, but the processor treats those reduced bits as
+ * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at
+ * the physical address bits reported by CPUID, i.e. the raw MAXPHYADDR,
+ * when reasoning about CPU behavior with respect to MAXPHYADDR.
+ */
+ if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008))
+ return cpuid_eax(0x80000008) & 0xff;
+
+ /*
+ * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with
+ * custom CPUID. Proceed with whatever the kernel found since these features
+ * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008).
+ */
+ return boot_cpu_data.x86_phys_bits;
+}
+
+void __init kvm_mmu_spte_module_init(void)
+{
+ /*
+ * Snapshot userspace's desire to allow MMIO caching. Whether or not
+ * KVM can actually enable MMIO caching depends on vendor-specific
+ * hardware capabilities and other module params that can't be resolved
+ * until the vendor module is loaded, i.e. enable_mmio_caching can and
+ * will change when the vendor module is (re)loaded.
+ */
+ allow_mmio_caching = enable_mmio_caching;
+
+ kvm_host.maxphyaddr = kvm_get_host_maxphyaddr();
+}
+
+static u64 generation_mmio_spte_mask(u64 gen)
+{
+ u64 mask;
+
+ WARN_ON_ONCE(gen & ~MMIO_SPTE_GEN_MASK);
+
+ mask = (gen << MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_SPTE_GEN_LOW_MASK;
+ mask |= (gen << MMIO_SPTE_GEN_HIGH_SHIFT) & MMIO_SPTE_GEN_HIGH_MASK;
+ return mask;
+}
+
+u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access)
+{
+ u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK;
+ u64 spte = generation_mmio_spte_mask(gen);
+ u64 gpa = gfn << PAGE_SHIFT;
+
+ access &= shadow_mmio_access_mask;
+ spte |= vcpu->kvm->arch.shadow_mmio_value | access;
+ spte |= gpa | shadow_nonpresent_or_rsvd_mask;
+ spte |= (gpa & shadow_nonpresent_or_rsvd_mask)
+ << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN;
+
+ return spte;
+}
+
+static bool __kvm_is_mmio_pfn(kvm_pfn_t pfn)
+{
+ if (pfn_valid(pfn))
+ return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) &&
+ /*
+ * Some reserved pages, such as those from NVDIMM
+ * DAX devices, are not for MMIO, and can be mapped
+ * with cached memory type for better performance.
+ * However, the above check misconceives those pages
+ * as MMIO, and results in KVM mapping them with UC
+ * memory type, which would hurt the performance.
+ * Therefore, we check the host memory type in addition
+ * and only treat UC/UC-/WC pages as MMIO.
+ */
+ (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn));
+
+ return !e820__mapped_raw_any(pfn_to_hpa(pfn),
+ pfn_to_hpa(pfn + 1) - 1,
+ E820_TYPE_RAM);
+}
+
+static bool kvm_is_mmio_pfn(kvm_pfn_t pfn, int *is_host_mmio)
+{
+ /*
+	 * Determining if a PFN is host MMIO is relatively expensive. Cache the
+ * result locally (in the sole caller) to avoid doing the full query
+ * multiple times when creating a single SPTE.
+ */
+ if (*is_host_mmio < 0)
+ *is_host_mmio = __kvm_is_mmio_pfn(pfn);
+
+ return *is_host_mmio;
+}
+
+static void kvm_track_host_mmio_mapping(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa);
+
+ if (root)
+ WRITE_ONCE(root->has_mapped_host_mmio, true);
+ else
+ WRITE_ONCE(vcpu->kvm->arch.has_mapped_host_mmio, true);
+
+ /*
+ * Force vCPUs to exit and flush CPU buffers if the vCPU is using the
+ * affected root(s).
+ */
+ kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
+}
+
+/*
+ * Returns true if the SPTE needs to be updated atomically due to having bits
+ * that may be changed without holding mmu_lock, and for which KVM must not
+ * lose information. E.g. KVM must not drop Dirty bit information. The caller
+ * is responsible for checking if the SPTE is shadow-present, and for
+ * determining whether or not the caller cares about non-leaf SPTEs.
+ */
+bool spte_needs_atomic_update(u64 spte)
+{
+	/* SPTEs can be made Writable by KVM's fast page fault handler. */
+ if (!is_writable_pte(spte) && is_mmu_writable_spte(spte))
+ return true;
+
+ /*
+ * A/D-disabled SPTEs can be access-tracked by aging, and access-tracked
+ * SPTEs can be restored by KVM's fast page fault handler.
+ */
+ if (!spte_ad_enabled(spte))
+ return true;
+
+ /*
+ * Dirty and Accessed bits can be set by the CPU. Ignore the Accessed
+ * bit, as KVM tolerates false negatives/positives, e.g. KVM doesn't
+ * invalidate TLBs when aging SPTEs, and so it's safe to clobber the
+ * Accessed bit (and rare in practice).
+ */
+ return is_writable_pte(spte) && !(spte & shadow_dirty_mask);
+}
+
+bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ const struct kvm_memory_slot *slot,
+ unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
+ u64 old_spte, bool prefetch, bool synchronizing,
+ bool host_writable, u64 *new_spte)
+{
+ int level = sp->role.level;
+ u64 spte = SPTE_MMU_PRESENT_MASK;
+ int is_host_mmio = -1;
+ bool wrprot = false;
+
+ /*
+ * For the EPT case, shadow_present_mask has no RWX bits set if
+ * exec-only page table entries are supported. In that case,
+ * ACC_USER_MASK and shadow_user_mask are used to represent
+ * read access. See FNAME(gpte_access) in paging_tmpl.h.
+ */
+ WARN_ON_ONCE((pte_access | shadow_present_mask) == SHADOW_NONPRESENT_VALUE);
+
+ if (sp->role.ad_disabled)
+ spte |= SPTE_TDP_AD_DISABLED;
+ else if (kvm_mmu_page_ad_need_write_protect(vcpu->kvm, sp))
+ spte |= SPTE_TDP_AD_WRPROT_ONLY;
+
+ spte |= shadow_present_mask;
+ if (!prefetch || synchronizing)
+ spte |= shadow_accessed_mask;
+
+ /*
+ * For simplicity, enforce the NX huge page mitigation even if not
+ * strictly necessary. KVM could ignore the mitigation if paging is
+ * disabled in the guest, as the guest doesn't have any page tables to
+ * abuse. But to safely ignore the mitigation, KVM would have to
+ * ensure a new MMU is loaded (or all shadow pages zapped) when CR0.PG
+ * is toggled on, and that's a net negative for performance when TDP is
+ * enabled. When TDP is disabled, KVM will always switch to a new MMU
+ * when CR0.PG is toggled, but leveraging that to ignore the mitigation
+ * would tie make_spte() further to vCPU/MMU state, and add complexity
+ * just to optimize a mode that is anything but performance critical.
+ */
+ if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) &&
+ is_nx_huge_page_enabled(vcpu->kvm)) {
+ pte_access &= ~ACC_EXEC_MASK;
+ }
+
+ if (pte_access & ACC_EXEC_MASK)
+ spte |= shadow_x_mask;
+ else
+ spte |= shadow_nx_mask;
+
+ if (pte_access & ACC_USER_MASK)
+ spte |= shadow_user_mask;
+
+ if (level > PG_LEVEL_4K)
+ spte |= PT_PAGE_SIZE_MASK;
+
+ if (kvm_x86_ops.get_mt_mask)
+ spte |= kvm_x86_call(get_mt_mask)(vcpu, gfn,
+ kvm_is_mmio_pfn(pfn, &is_host_mmio));
+ if (host_writable)
+ spte |= shadow_host_writable_mask;
+ else
+ pte_access &= ~ACC_WRITE_MASK;
+
+ if (shadow_me_value && !kvm_is_mmio_pfn(pfn, &is_host_mmio))
+ spte |= shadow_me_value;
+
+ spte |= (u64)pfn << PAGE_SHIFT;
+
+ if (pte_access & ACC_WRITE_MASK) {
+ /*
+ * Unsync shadow pages that are reachable by the new, writable
+ * SPTE. Write-protect the SPTE if the page can't be unsync'd,
+ * e.g. it's write-tracked (upper-level SPs) or has one or more
+ * shadow pages and unsync'ing pages is not allowed.
+ *
+ * When overwriting an existing leaf SPTE, and the old SPTE was
+ * writable, skip trying to unsync shadow pages as any relevant
+ * shadow pages must already be unsync, i.e. the hash lookup is
+ * unnecessary (and expensive). Note, this relies on KVM not
+ * changing PFNs without first zapping the old SPTE, which is
+ * guaranteed by both the shadow MMU and the TDP MMU.
+ */
+ if ((!is_last_spte(old_spte, level) || !is_writable_pte(old_spte)) &&
+ mmu_try_to_unsync_pages(vcpu->kvm, slot, gfn, synchronizing, prefetch))
+ wrprot = true;
+ else
+ spte |= PT_WRITABLE_MASK | shadow_mmu_writable_mask |
+ shadow_dirty_mask;
+ }
+
+ if (prefetch && !synchronizing)
+ spte = mark_spte_for_access_track(spte);
+
+ WARN_ONCE(is_rsvd_spte(&vcpu->arch.mmu->shadow_zero_check, spte, level),
+ "spte = 0x%llx, level = %d, rsvd bits = 0x%llx", spte, level,
+ get_rsvd_bits(&vcpu->arch.mmu->shadow_zero_check, spte, level));
+
+ /*
+ * Mark the memslot dirty *after* modifying it for access tracking.
+ * Unlike folios, memslots can be safely marked dirty out of mmu_lock,
+ * i.e. in the fast page fault handler.
+ */
+ if ((spte & PT_WRITABLE_MASK) && kvm_slot_dirty_track_enabled(slot)) {
+ /* Enforced by kvm_mmu_hugepage_adjust. */
+ WARN_ON_ONCE(level > PG_LEVEL_4K);
+ mark_page_dirty_in_slot(vcpu->kvm, slot, gfn);
+ }
+
+ if (cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO) &&
+ !kvm_vcpu_can_access_host_mmio(vcpu) &&
+ kvm_is_mmio_pfn(pfn, &is_host_mmio))
+ kvm_track_host_mmio_mapping(vcpu);
+
+ *new_spte = spte;
+ return wrprot;
+}
+
+static u64 modify_spte_protections(u64 spte, u64 set, u64 clear)
+{
+ bool is_access_track = is_access_track_spte(spte);
+
+ if (is_access_track)
+ spte = restore_acc_track_spte(spte);
+
+ KVM_MMU_WARN_ON(set & clear);
+ spte = (spte | set) & ~clear;
+
+ if (is_access_track)
+ spte = mark_spte_for_access_track(spte);
+
+ return spte;
+}
+
+static u64 make_spte_executable(u64 spte)
+{
+ return modify_spte_protections(spte, shadow_x_mask, shadow_nx_mask);
+}
+
+static u64 make_spte_nonexecutable(u64 spte)
+{
+ return modify_spte_protections(spte, shadow_nx_mask, shadow_x_mask);
+}
+
+/*
+ * Construct an SPTE that maps a sub-page of the given huge page SPTE where
+ * `index` identifies which sub-page.
+ *
+ * This is used during huge page splitting to build the SPTEs that make up the
+ * new page table.
+ */
+u64 make_small_spte(struct kvm *kvm, u64 huge_spte,
+ union kvm_mmu_page_role role, int index)
+{
+ u64 child_spte = huge_spte;
+
+ KVM_BUG_ON(!is_shadow_present_pte(huge_spte) || !is_large_pte(huge_spte), kvm);
+
+ /*
+ * The child_spte already has the base address of the huge page being
+ * split. So we just have to OR in the offset to the page at the next
+ * lower level for the given index.
+ */
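+	/*
+	 * E.g. when splitting a 1GiB SPTE into a table of 2MiB SPTEs
+	 * (role.level == PG_LEVEL_2M), entry 'index' ORs in
+	 * index * 512 * PAGE_SIZE, i.e. an offset of index * 2MiB.
+	 */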
+ child_spte |= (index * KVM_PAGES_PER_HPAGE(role.level)) << PAGE_SHIFT;
+
+ if (role.level == PG_LEVEL_4K) {
+ child_spte &= ~PT_PAGE_SIZE_MASK;
+
+ /*
+ * When splitting to a 4K page where execution is allowed, mark
+ * the page executable as the NX hugepage mitigation no longer
+ * applies.
+ */
+ if ((role.access & ACC_EXEC_MASK) && is_nx_huge_page_enabled(kvm))
+ child_spte = make_spte_executable(child_spte);
+ }
+
+ return child_spte;
+}
+
+u64 make_huge_spte(struct kvm *kvm, u64 small_spte, int level)
+{
+ u64 huge_spte;
+
+ KVM_BUG_ON(!is_shadow_present_pte(small_spte) || level == PG_LEVEL_4K, kvm);
+
+ huge_spte = small_spte | PT_PAGE_SIZE_MASK;
+
+ /*
+ * huge_spte already has the address of the sub-page being collapsed
+ * from small_spte, so just clear the lower address bits to create the
+ * huge page address.
+ */
+ huge_spte &= KVM_HPAGE_MASK(level) | ~PAGE_MASK;
+
+ if (is_nx_huge_page_enabled(kvm))
+ huge_spte = make_spte_nonexecutable(huge_spte);
+
+ return huge_spte;
+}
+
+u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
+{
+ u64 spte = SPTE_MMU_PRESENT_MASK;
+
+ spte |= __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK |
+ shadow_user_mask | shadow_x_mask | shadow_me_value;
+
+ if (ad_disabled)
+ spte |= SPTE_TDP_AD_DISABLED;
+ else
+ spte |= shadow_accessed_mask;
+
+ return spte;
+}
+
+u64 mark_spte_for_access_track(u64 spte)
+{
+ if (spte_ad_enabled(spte))
+ return spte & ~shadow_accessed_mask;
+
+ if (is_access_track_spte(spte))
+ return spte;
+
+ check_spte_writable_invariants(spte);
+
+ WARN_ONCE(spte & (SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
+ SHADOW_ACC_TRACK_SAVED_BITS_SHIFT),
+ "Access Tracking saved bit locations are not zero\n");
+
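+	/*
+	 * E.g. for an A/D-disabled EPT SPTE with R and X set (bits 0 and 2),
+	 * the two bits are copied up to bits 54 and 56, after which the RWX
+	 * and Accessed bits are cleared so the next access faults and the
+	 * fast page fault handler can restore them.
+	 */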
+ spte |= (spte & SHADOW_ACC_TRACK_SAVED_BITS_MASK) <<
+ SHADOW_ACC_TRACK_SAVED_BITS_SHIFT;
+ spte &= ~(shadow_acc_track_mask | shadow_accessed_mask);
+
+ return spte;
+}
+
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask)
+{
+ BUG_ON((u64)(unsigned)access_mask != access_mask);
+ WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask);
+
+ /*
+ * Reset to the original module param value to honor userspace's desire
+ * to (dis)allow MMIO caching. Update the param itself so that
+ * userspace can see whether or not KVM is actually using MMIO caching.
+ */
+ enable_mmio_caching = allow_mmio_caching;
+ if (!enable_mmio_caching)
+ mmio_value = 0;
+
+ /*
+ * The mask must contain only bits that are carved out specifically for
+ * the MMIO SPTE mask, e.g. to ensure there's no overlap with the MMIO
+ * generation.
+ */
+ if (WARN_ON(mmio_mask & ~SPTE_MMIO_ALLOWED_MASK))
+ mmio_value = 0;
+
+ /*
+ * Disable MMIO caching if the MMIO value collides with the bits that
+ * are used to hold the relocated GFN when the L1TF mitigation is
+ * enabled. This should never fire as there is no known hardware that
+ * can trigger this condition, e.g. SME/SEV CPUs that require a custom
+ * MMIO value are not susceptible to L1TF.
+ */
+ if (WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask <<
+ SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)))
+ mmio_value = 0;
+
+ /*
+ * The masked MMIO value must obviously match itself and a frozen SPTE
+ * must not get a false positive. Frozen SPTEs and MMIO SPTEs should
+ * never collide as MMIO must set some RWX bits, and frozen SPTEs must
+ * not set any RWX bits.
+ */
+ if (WARN_ON((mmio_value & mmio_mask) != mmio_value) ||
+ WARN_ON(mmio_value && (FROZEN_SPTE & mmio_mask) == mmio_value))
+ mmio_value = 0;
+
+ if (!mmio_value)
+ enable_mmio_caching = false;
+
+ shadow_mmio_value = mmio_value;
+ shadow_mmio_mask = mmio_mask;
+ shadow_mmio_access_mask = access_mask;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_set_mmio_spte_mask);
+
+void kvm_mmu_set_mmio_spte_value(struct kvm *kvm, u64 mmio_value)
+{
+ kvm->arch.shadow_mmio_value = mmio_value;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_set_mmio_spte_value);
+
+void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask)
+{
+ /* shadow_me_value must be a subset of shadow_me_mask */
+ if (WARN_ON(me_value & ~me_mask))
+ me_value = me_mask = 0;
+
+ shadow_me_value = me_value;
+ shadow_me_mask = me_mask;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_set_me_spte_mask);
+
+void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
+{
+ kvm_ad_enabled = has_ad_bits;
+
+ shadow_user_mask = VMX_EPT_READABLE_MASK;
+ shadow_accessed_mask = VMX_EPT_ACCESS_BIT;
+ shadow_dirty_mask = VMX_EPT_DIRTY_BIT;
+ shadow_nx_mask = 0ull;
+ shadow_x_mask = VMX_EPT_EXECUTABLE_MASK;
+ /* VMX_EPT_SUPPRESS_VE_BIT is needed for W or X violation. */
+ shadow_present_mask =
+ (has_exec_only ? 0ull : VMX_EPT_READABLE_MASK) | VMX_EPT_SUPPRESS_VE_BIT;
+
+ shadow_acc_track_mask = VMX_EPT_RWX_MASK;
+ shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE;
+ shadow_mmu_writable_mask = EPT_SPTE_MMU_WRITABLE;
+
+ /*
+ * EPT Misconfigurations are generated if the value of bits 2:0
+ * of an EPT paging-structure entry is 110b (write/execute).
+ */
+ kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE,
+ VMX_EPT_RWX_MASK | VMX_EPT_SUPPRESS_VE_BIT, 0);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_set_ept_masks);
+
+void kvm_mmu_reset_all_pte_masks(void)
+{
+ u8 low_phys_bits;
+ u64 mask;
+
+ kvm_ad_enabled = true;
+
+ /*
+ * If the CPU has 46 or less physical address bits, then set an
+ * appropriate mask to guard against L1TF attacks. Otherwise, it is
+ * assumed that the CPU is not vulnerable to L1TF.
+ *
+ * Some Intel CPUs address the L1 cache using more PA bits than are
+ * reported by CPUID. Use the PA width of the L1 cache when possible
+ * to achieve more effective mitigation, e.g. if system RAM overlaps
+ * the most significant bits of legal physical address space.
+ */
+ shadow_nonpresent_or_rsvd_mask = 0;
+ low_phys_bits = boot_cpu_data.x86_phys_bits;
+ if (boot_cpu_has_bug(X86_BUG_L1TF) &&
+ !WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >=
+ 52 - SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)) {
+ low_phys_bits = boot_cpu_data.x86_cache_bits
+ - SHADOW_NONPRESENT_OR_RSVD_MASK_LEN;
+ shadow_nonpresent_or_rsvd_mask =
+ rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1);
+ }
+
+ shadow_nonpresent_or_rsvd_lower_gfn_mask =
+ GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
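+	/*
+	 * E.g. with 46 physical/cache bits, bits 45:41 form the L1TF mask and
+	 * GFN bits 40:12 stay in place; GFN bits overlapping 45:41 are shifted
+	 * up into the reserved bits by make_mmio_spte().
+	 */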
+
+ shadow_user_mask = PT_USER_MASK;
+ shadow_accessed_mask = PT_ACCESSED_MASK;
+ shadow_dirty_mask = PT_DIRTY_MASK;
+ shadow_nx_mask = PT64_NX_MASK;
+ shadow_x_mask = 0;
+ shadow_present_mask = PT_PRESENT_MASK;
+
+ shadow_acc_track_mask = 0;
+ shadow_me_mask = 0;
+ shadow_me_value = 0;
+
+ shadow_host_writable_mask = DEFAULT_SPTE_HOST_WRITABLE;
+ shadow_mmu_writable_mask = DEFAULT_SPTE_MMU_WRITABLE;
+
+ /*
+ * Set a reserved PA bit in MMIO SPTEs to generate page faults with
+ * PFEC.RSVD=1 on MMIO accesses. 64-bit PTEs (PAE, x86-64, and EPT
+ * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports
+ * 52-bit physical addresses then there are no reserved PA bits in the
+ * PTEs and so the reserved PA approach must be disabled.
+ */
+ if (kvm_host.maxphyaddr < 52)
+ mask = BIT_ULL(51) | PT_PRESENT_MASK;
+ else
+ mask = 0;
+
+ kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK);
+}
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
new file mode 100644
index 000000000000..91ce29fd6f1b
--- /dev/null
+++ b/arch/x86/kvm/mmu/spte.h
@@ -0,0 +1,572 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#ifndef KVM_X86_MMU_SPTE_H
+#define KVM_X86_MMU_SPTE_H
+
+#include <asm/vmx.h>
+
+#include "mmu.h"
+#include "mmu_internal.h"
+
+/*
+ * An MMU-present SPTE is backed by actual memory and may or may not be present
+ * in hardware. E.g. MMIO SPTEs are not considered present. Use bit 11, as it
+ * is ignored by all flavors of SPTEs and checking a low bit often generates
+ * better code than for a high bit, e.g. 56+. MMU present checks are pervasive
+ * enough that the improved code generation is noticeable in KVM's footprint.
+ */
+#define SPTE_MMU_PRESENT_MASK BIT_ULL(11)
+
+/*
+ * TDP SPTES (more specifically, EPT SPTEs) may not have A/D bits, and may also
+ * be restricted to using write-protection (for L2 when CPU dirty logging, i.e.
+ * PML, is enabled). Use bits 52 and 53 to hold the type of A/D tracking that
+ * must be employed for a given TDP SPTE.
+ *
+ * Note, the "enabled" mask must be '0', as bits 62:52 are _reserved_ for PAE
+ * paging, including NPT PAE. This scheme works because legacy shadow paging
+ * is guaranteed to have A/D bits and write-protection is forced only for
+ * TDP with CPU dirty logging (PML). If NPT ever gains PML-like support, it
+ * must be restricted to 64-bit KVM.
+ */
+#define SPTE_TDP_AD_SHIFT 52
+#define SPTE_TDP_AD_MASK (3ULL << SPTE_TDP_AD_SHIFT)
+#define SPTE_TDP_AD_ENABLED (0ULL << SPTE_TDP_AD_SHIFT)
+#define SPTE_TDP_AD_DISABLED (1ULL << SPTE_TDP_AD_SHIFT)
+#define SPTE_TDP_AD_WRPROT_ONLY (2ULL << SPTE_TDP_AD_SHIFT)
+static_assert(SPTE_TDP_AD_ENABLED == 0);
+
+#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
+#define SPTE_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
+#else
+#define SPTE_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
+#endif
+
+#define SPTE_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
+ | shadow_x_mask | shadow_nx_mask | shadow_me_mask)
+
+#define ACC_EXEC_MASK 1
+#define ACC_WRITE_MASK PT_WRITABLE_MASK
+#define ACC_USER_MASK PT_USER_MASK
+#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
+
+/* The mask for the R/X bits in EPT PTEs */
+#define SPTE_EPT_READABLE_MASK 0x1ull
+#define SPTE_EPT_EXECUTABLE_MASK 0x4ull
+
+#define SPTE_LEVEL_BITS 9
+#define SPTE_LEVEL_SHIFT(level) __PT_LEVEL_SHIFT(level, SPTE_LEVEL_BITS)
+#define SPTE_INDEX(address, level) __PT_INDEX(address, level, SPTE_LEVEL_BITS)
+#define SPTE_ENT_PER_PAGE __PT_ENT_PER_PAGE(SPTE_LEVEL_BITS)
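+
+/*
+ * E.g. SPTE_LEVEL_SHIFT(1) == 12 and SPTE_LEVEL_SHIFT(2) == 21, so each level
+ * of the walk consumes 9 address bits and every page table holds
+ * SPTE_ENT_PER_PAGE == 512 entries.
+ */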
+
+/*
+ * The mask/shift to use for saving the original R/X bits when marking the PTE
+ * as not-present for access tracking purposes. We do not save the W bit as the
+ * PTEs being access tracked also need to be dirty tracked, so the W bit will be
+ * restored only when a write is attempted to the page. This mask obviously
+ * must not overlap the A/D type mask.
+ */
+#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (SPTE_EPT_READABLE_MASK | \
+ SPTE_EPT_EXECUTABLE_MASK)
+#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT 54
+#define SHADOW_ACC_TRACK_SAVED_MASK (SHADOW_ACC_TRACK_SAVED_BITS_MASK << \
+ SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
+static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK));
+
+/*
+ * {DEFAULT,EPT}_SPTE_{HOST,MMU}_WRITABLE are used to keep track of why a given
+ * SPTE is write-protected. See is_writable_pte() for details.
+ */
+
+/* Bits 9 and 10 are ignored by all non-EPT PTEs. */
+#define DEFAULT_SPTE_HOST_WRITABLE BIT_ULL(9)
+#define DEFAULT_SPTE_MMU_WRITABLE BIT_ULL(10)
+
+/*
+ * Low ignored bits are at a premium for EPT, use high ignored bits, taking care
+ * to not overlap the A/D type mask or the saved access bits of access-tracked
+ * SPTEs when A/D bits are disabled.
+ */
+#define EPT_SPTE_HOST_WRITABLE BIT_ULL(57)
+#define EPT_SPTE_MMU_WRITABLE BIT_ULL(58)
+
+static_assert(!(EPT_SPTE_HOST_WRITABLE & SPTE_TDP_AD_MASK));
+static_assert(!(EPT_SPTE_MMU_WRITABLE & SPTE_TDP_AD_MASK));
+static_assert(!(EPT_SPTE_HOST_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK));
+static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK));
+
+/* Defined only to keep the above static asserts readable. */
+#undef SHADOW_ACC_TRACK_SAVED_MASK
+
+/*
+ * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of
+ * the memslots generation and is derived as follows:
+ *
+ * Bits 0-7 of the MMIO generation are propagated to spte bits 3-10
+ * Bits 8-18 of the MMIO generation are propagated to spte bits 52-62
+ *
+ * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in
+ * the MMIO generation number, as doing so would require stealing a bit from
+ * the "real" generation number and thus effectively halve the maximum number
+ * of MMIO generations that can be handled before encountering a wrap (which
+ * requires a full MMU zap). The flag is instead explicitly queried when
+ * checking for MMIO spte cache hits.
+ */
+
+#define MMIO_SPTE_GEN_LOW_START 3
+#define MMIO_SPTE_GEN_LOW_END 10
+
+#define MMIO_SPTE_GEN_HIGH_START 52
+#define MMIO_SPTE_GEN_HIGH_END 62
+
+#define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
+ MMIO_SPTE_GEN_LOW_START)
+#define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
+ MMIO_SPTE_GEN_HIGH_START)
+static_assert(!(SPTE_MMU_PRESENT_MASK &
+ (MMIO_SPTE_GEN_LOW_MASK | MMIO_SPTE_GEN_HIGH_MASK)));
+
+/*
+ * The SPTE MMIO mask must NOT overlap the MMIO generation bits or the
+ * MMU-present bit. The generation obviously co-exists with the magic MMIO
+ * mask/value, and MMIO SPTEs are considered !MMU-present.
+ *
+ * The SPTE MMIO mask is allowed to use hardware "present" bits (i.e. all EPT
+ * RWX bits), all physical address bits (legal PA bits are used for "fast" MMIO
+ * and so they're off-limits for generation; additional checks ensure the mask
+ * doesn't overlap legal PA bits), and bit 63 (carved out for future usage).
+ */
+#define SPTE_MMIO_ALLOWED_MASK (BIT_ULL(63) | GENMASK_ULL(51, 12) | GENMASK_ULL(2, 0))
+static_assert(!(SPTE_MMIO_ALLOWED_MASK &
+ (SPTE_MMU_PRESENT_MASK | MMIO_SPTE_GEN_LOW_MASK | MMIO_SPTE_GEN_HIGH_MASK)));
+
+#define MMIO_SPTE_GEN_LOW_BITS (MMIO_SPTE_GEN_LOW_END - MMIO_SPTE_GEN_LOW_START + 1)
+#define MMIO_SPTE_GEN_HIGH_BITS (MMIO_SPTE_GEN_HIGH_END - MMIO_SPTE_GEN_HIGH_START + 1)
+
+/* remember to adjust the comment above as well if you change these */
+static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 11);
+
+#define MMIO_SPTE_GEN_LOW_SHIFT (MMIO_SPTE_GEN_LOW_START - 0)
+#define MMIO_SPTE_GEN_HIGH_SHIFT (MMIO_SPTE_GEN_HIGH_START - MMIO_SPTE_GEN_LOW_BITS)
+
+#define MMIO_SPTE_GEN_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_BITS + MMIO_SPTE_GEN_HIGH_BITS - 1, 0)
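+
+/*
+ * E.g. for gen == 0x5a5, bits 0-7 (0xa5) land in SPTE bits 3-10 and bits 8-18
+ * (here 0x5) land in SPTE bits 52-62, i.e. the SPTE carries
+ * (0xa5ULL << 3) | (0x5ULL << 52).
+ */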
+
+/*
+ * Non-present SPTE value needs to set bit 63 for TDX, in order to suppress
+ * #VE and get EPT violations on non-present PTEs. We can use the
+ * same value also without TDX for both VMX and SVM:
+ *
+ * For SVM NPT, for non-present spte (bit 0 = 0), other bits are ignored.
+ * For VMX EPT, bit 63 is ignored if #VE is disabled. (EPT_VIOLATION_VE=0)
+ * bit 63 is #VE suppress if #VE is enabled. (EPT_VIOLATION_VE=1)
+ */
+#ifdef CONFIG_X86_64
+#define SHADOW_NONPRESENT_VALUE BIT_ULL(63)
+static_assert(!(SHADOW_NONPRESENT_VALUE & SPTE_MMU_PRESENT_MASK));
+#else
+#define SHADOW_NONPRESENT_VALUE 0ULL
+#endif
+
+
+/*
+ * True if A/D bits are supported in hardware and are enabled by KVM. When
+ * enabled, KVM uses A/D bits for all non-nested MMUs. Because L1 can disable
+ * A/D bits in EPTP12, SP and SPTE variants are needed to handle the scenario
+ * where KVM is using A/D bits for L1, but not L2.
+ */
+extern bool __read_mostly kvm_ad_enabled;
+
+extern u64 __read_mostly shadow_host_writable_mask;
+extern u64 __read_mostly shadow_mmu_writable_mask;
+extern u64 __read_mostly shadow_nx_mask;
+extern u64 __read_mostly shadow_x_mask; /* mutually exclusive with nx_mask */
+extern u64 __read_mostly shadow_user_mask;
+extern u64 __read_mostly shadow_accessed_mask;
+extern u64 __read_mostly shadow_dirty_mask;
+extern u64 __read_mostly shadow_mmio_value;
+extern u64 __read_mostly shadow_mmio_mask;
+extern u64 __read_mostly shadow_mmio_access_mask;
+extern u64 __read_mostly shadow_present_mask;
+extern u64 __read_mostly shadow_me_value;
+extern u64 __read_mostly shadow_me_mask;
+
+/*
+ * SPTEs in MMUs without A/D bits are marked with SPTE_TDP_AD_DISABLED;
+ * shadow_acc_track_mask is the set of bits to be cleared in non-accessed
+ * pages.
+ */
+extern u64 __read_mostly shadow_acc_track_mask;
+
+/*
+ * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order
+ * to guard against L1TF attacks.
+ */
+extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
+
+/*
+ * The number of high-order 1 bits to use in the mask above.
+ */
+#define SHADOW_NONPRESENT_OR_RSVD_MASK_LEN 5
+
+/*
+ * If a thread running without exclusive control of the MMU lock must perform a
+ * multi-part operation on an SPTE, it can set the SPTE to FROZEN_SPTE as a
+ * non-present intermediate value. Other threads which encounter this value
+ * should not modify the SPTE.
+ *
+ * Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on
+ * both AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create an L1TF
+ * vulnerability.
+ *
+ * Only used by the TDP MMU.
+ */
+#define FROZEN_SPTE (SHADOW_NONPRESENT_VALUE | 0x5a0ULL)
+
+/* Frozen SPTEs must not be misconstrued as shadow present PTEs. */
+static_assert(!(FROZEN_SPTE & SPTE_MMU_PRESENT_MASK));
+
+static inline bool is_frozen_spte(u64 spte)
+{
+ return spte == FROZEN_SPTE;
+}
+
+/* Get an SPTE's index into its parent's page table (and the spt array). */
+static inline int spte_index(u64 *sptep)
+{
+ return ((unsigned long)sptep / sizeof(*sptep)) & (SPTE_ENT_PER_PAGE - 1);
+}
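+
+/*
+ * Example: sp->spt is page-aligned, so spte_index(sp->spt + 37) == 37; the
+ * pointer value divided by sizeof(u64) and masked by (SPTE_ENT_PER_PAGE - 1)
+ * yields the entry's index within its 4KiB table.
+ */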
+
+/*
+ * In some cases, we need to preserve the GFN of a non-present or reserved
+ * SPTE when we usurp the upper five bits of the physical address space to
+ * defend against L1TF, e.g. for MMIO SPTEs. To preserve the GFN, we'll
+ * shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask
+ * left into the reserved bits, i.e. the GFN in the SPTE will be split into
+ * high and low parts. This mask covers the lower bits of the GFN.
+ */
+extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
+
+static inline hpa_t kvm_mmu_get_dummy_root(void)
+{
+ return my_zero_pfn(0) << PAGE_SHIFT;
+}
+
+static inline bool kvm_mmu_is_dummy_root(hpa_t shadow_page)
+{
+ return is_zero_pfn(shadow_page >> PAGE_SHIFT);
+}
+
+static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page)
+{
+ struct page *page = pfn_to_page((shadow_page) >> PAGE_SHIFT);
+
+ return (struct kvm_mmu_page *)page_private(page);
+}
+
+static inline struct kvm_mmu_page *spte_to_child_sp(u64 spte)
+{
+ return to_shadow_page(spte & SPTE_BASE_ADDR_MASK);
+}
+
+static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep)
+{
+ return to_shadow_page(__pa(sptep));
+}
+
+static inline struct kvm_mmu_page *root_to_sp(hpa_t root)
+{
+ if (kvm_mmu_is_dummy_root(root))
+ return NULL;
+
+ /*
+ * The "root" may be a special root, e.g. a PAE entry, treat it as a
+ * SPTE to ensure any non-PA bits are dropped.
+ */
+ return spte_to_child_sp(root);
+}
+
+static inline bool is_mirror_sptep(tdp_ptep_t sptep)
+{
+ return is_mirror_sp(sptep_to_sp(rcu_dereference(sptep)));
+}
+
+static inline bool kvm_vcpu_can_access_host_mmio(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa);
+
+ if (root)
+ return READ_ONCE(root->has_mapped_host_mmio);
+
+ return READ_ONCE(vcpu->kvm->arch.has_mapped_host_mmio);
+}
+
+static inline bool is_mmio_spte(struct kvm *kvm, u64 spte)
+{
+ return (spte & shadow_mmio_mask) == kvm->arch.shadow_mmio_value &&
+ likely(enable_mmio_caching);
+}
+
+static inline bool is_shadow_present_pte(u64 pte)
+{
+ return !!(pte & SPTE_MMU_PRESENT_MASK);
+}
+
+static inline bool is_ept_ve_possible(u64 spte)
+{
+ return (shadow_present_mask & VMX_EPT_SUPPRESS_VE_BIT) &&
+ !(spte & VMX_EPT_SUPPRESS_VE_BIT) &&
+ (spte & VMX_EPT_RWX_MASK) != VMX_EPT_MISCONFIG_WX_VALUE;
+}
+
+static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
+{
+ return sp->role.ad_disabled;
+}
+
+static inline bool spte_ad_enabled(u64 spte)
+{
+ KVM_MMU_WARN_ON(!is_shadow_present_pte(spte));
+ return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED;
+}
+
+static inline bool spte_ad_need_write_protect(u64 spte)
+{
+ KVM_MMU_WARN_ON(!is_shadow_present_pte(spte));
+ /*
+ * This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED is '0',
+ * and non-TDP SPTEs will never set these bits. Optimize for 64-bit
+ * TDP and do the A/D type check unconditionally.
+ */
+ return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED;
+}
+
+static inline bool is_access_track_spte(u64 spte)
+{
+ return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
+}
+
+static inline bool is_large_pte(u64 pte)
+{
+ return pte & PT_PAGE_SIZE_MASK;
+}
+
+static inline bool is_last_spte(u64 pte, int level)
+{
+ return (level == PG_LEVEL_4K) || is_large_pte(pte);
+}
+
+static inline bool is_executable_pte(u64 spte)
+{
+ return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
+}
+
+static inline kvm_pfn_t spte_to_pfn(u64 pte)
+{
+ return (pte & SPTE_BASE_ADDR_MASK) >> PAGE_SHIFT;
+}
+
+static inline bool is_accessed_spte(u64 spte)
+{
+ return spte & shadow_accessed_mask;
+}
+
+static inline u64 get_rsvd_bits(struct rsvd_bits_validate *rsvd_check, u64 pte,
+ int level)
+{
+ int bit7 = (pte >> 7) & 1;
+
+ return rsvd_check->rsvd_bits_mask[bit7][level-1];
+}
+
+static inline bool __is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check,
+ u64 pte, int level)
+{
+ return pte & get_rsvd_bits(rsvd_check, pte, level);
+}
+
+static inline bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check,
+ u64 pte)
+{
+ return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f);
+}
+
+static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,
+ u64 spte, int level)
+{
+ return __is_bad_mt_xwr(rsvd_check, spte) ||
+ __is_rsvd_bits_set(rsvd_check, spte, level);
+}
+
+/*
+ * A shadow-present leaf SPTE may be non-writable for 4 possible reasons:
+ *
+ * 1. To intercept writes for dirty logging. KVM write-protects huge pages
+ * so that they can be split down into the dirty logging
+ * granularity (4KiB) whenever the guest writes to them. KVM also
+ * write-protects 4KiB pages so that writes can be recorded in the dirty log
+ * (e.g. if not using PML). SPTEs are write-protected for dirty logging
+ *    during the VM-ioctls that enable dirty logging.
+ *
+ * 2. To intercept writes to guest page tables that KVM is shadowing. When a
+ * guest writes to its page table the corresponding shadow page table will
+ * be marked "unsync". That way KVM knows which shadow page tables need to
+ * be updated on the next TLB flush, INVLPG, etc. and which do not.
+ *
+ * 3. To prevent guest writes to read-only memory, such as for memory in a
+ * read-only memslot or guest memory backed by a read-only VMA. Writes to
+ * such pages are disallowed entirely.
+ *
+ * 4. To emulate the Accessed bit for SPTEs without A/D bits. Note, in this
+ * case, the SPTE is access-protected, not just write-protected!
+ *
+ * For cases #1 and #4, KVM can safely make such SPTEs writable without taking
+ * mmu_lock as capturing the Accessed/Dirty state doesn't require taking it.
+ * To differentiate #1 and #4 from #2 and #3, KVM uses two software-only bits
+ * in the SPTE:
+ *
+ * shadow_mmu_writable_mask, aka MMU-writable -
+ * Cleared on SPTEs that KVM is currently write-protecting for shadow paging
+ * purposes (case 2 above).
+ *
+ * shadow_host_writable_mask, aka Host-writable -
+ * Cleared on SPTEs that are not host-writable (case 3 above)
+ *
+ * Note, not all possible combinations of PT_WRITABLE_MASK,
+ * shadow_mmu_writable_mask, and shadow_host_writable_mask are valid. A given
+ * SPTE can be in only one of the following states, which map to the
+ * aforementioned 3 cases:
+ *
+ * shadow_host_writable_mask | shadow_mmu_writable_mask | PT_WRITABLE_MASK
+ * ------------------------- | ------------------------ | ----------------
+ * 1 | 1 | 1 (writable)
+ * 1 | 1 | 0 (case 1)
+ * 1 | 0 | 0 (case 2)
+ * 0 | 0 | 0 (case 3)
+ *
+ * The valid combinations of these bits are checked by
+ * check_spte_writable_invariants() whenever an SPTE is modified.
+ *
+ * Clearing the MMU-writable bit is always done under the MMU lock and always
+ * accompanied by a TLB flush before dropping the lock to avoid corrupting the
+ * shadow page tables between vCPUs. Write-protecting an SPTE for dirty logging
+ * (which does not clear the MMU-writable bit), does not flush TLBs before
+ * dropping the lock, as it only needs to synchronize guest writes with the
+ * dirty bitmap. Similarly, making the SPTE inaccessible (and non-writable) for
+ * access-tracking via the clear_young() MMU notifier also does not flush TLBs.
+ *
+ * So, there is the problem: clearing the MMU-writable bit can encounter a
+ * write-protected SPTE while CPUs still have writable mappings for that SPTE
+ * cached in their TLB. To address this, KVM always flushes TLBs when
+ * write-protecting SPTEs if the MMU-writable bit is set on the old SPTE.
+ *
+ * The Host-writable bit is not modified on present SPTEs, it is only set or
+ * cleared when an SPTE is first faulted in from non-present and then remains
+ * immutable.
+ */
+static inline bool is_writable_pte(unsigned long pte)
+{
+ return pte & PT_WRITABLE_MASK;
+}
+
+/* Note: spte must be a shadow-present leaf SPTE. */
+static inline void check_spte_writable_invariants(u64 spte)
+{
+ if (spte & shadow_mmu_writable_mask)
+ WARN_ONCE(!(spte & shadow_host_writable_mask),
+ KBUILD_MODNAME ": MMU-writable SPTE is not Host-writable: %llx",
+ spte);
+ else
+ WARN_ONCE(is_writable_pte(spte),
+ KBUILD_MODNAME ": Writable SPTE is not MMU-writable: %llx", spte);
+}
+
+static inline bool is_mmu_writable_spte(u64 spte)
+{
+ return spte & shadow_mmu_writable_mask;
+}
+
+/*
+ * Returns true if the access indicated by @fault is allowed by the existing
+ * SPTE protections. Note, the caller is responsible for checking that the
+ * SPTE is a shadow-present, leaf SPTE (either before or after).
+ */
+static inline bool is_access_allowed(struct kvm_page_fault *fault, u64 spte)
+{
+ if (fault->exec)
+ return is_executable_pte(spte);
+
+ if (fault->write)
+ return is_writable_pte(spte);
+
+ /* Fault was on Read access */
+ return spte & PT_PRESENT_MASK;
+}
+
+/*
+ * If the MMU-writable flag is cleared, i.e. the SPTE is write-protected for
+ * write-tracking, remote TLBs must be flushed, even if the SPTE was read-only,
+ * as KVM allows stale Writable TLB entries to exist. When dirty logging, KVM
+ * flushes TLBs based on whether or not dirty bitmap/ring entries were reaped,
+ * not whether or not SPTEs were modified, i.e. only the write-tracking case
+ * needs to flush at the time the SPTE is modified, before dropping mmu_lock.
+ *
+ * Don't flush if the Accessed bit is cleared, as access tracking tolerates
+ * false negatives, e.g. KVM x86 omits TLB flushes even when aging SPTEs for a
+ * mmu_notifier.clear_flush_young() event.
+ *
+ * Lastly, don't flush if the Dirty bit is cleared, as KVM unconditionally
+ * flushes when enabling dirty logging (see kvm_mmu_slot_apply_flags()), and
+ * when clearing dirty logs, KVM flushes based on whether or not dirty entries
+ * were reaped from the bitmap/ring, not whether or not dirty SPTEs were found.
+ *
+ * Note, this logic only applies to shadow-present leaf SPTEs. The caller is
+ * responsible for checking that the old SPTE is shadow-present, and is also
+ * responsible for determining whether or not a TLB flush is required when
+ * modifying a shadow-present non-leaf SPTE.
+ */
+static inline bool leaf_spte_change_needs_tlb_flush(u64 old_spte, u64 new_spte)
+{
+ return is_mmu_writable_spte(old_spte) && !is_mmu_writable_spte(new_spte);
+}
+
+static inline u64 get_mmio_spte_generation(u64 spte)
+{
+ u64 gen;
+
+ gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_SHIFT;
+ gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_SHIFT;
+ return gen;
+}
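+
+/*
+ * The packing side (e.g. in make_mmio_spte()) is the mirror image of the
+ * unpacking above; a minimal sketch:
+ *
+ *	spte |= (gen << MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_SPTE_GEN_LOW_MASK;
+ *	spte |= (gen << MMIO_SPTE_GEN_HIGH_SHIFT) & MMIO_SPTE_GEN_HIGH_MASK;
+ *
+ * i.e. generation bits 0-7 land in SPTE bits 3-10 and generation bits 8-18
+ * land in SPTE bits 52-62, matching the layout documented above.
+ */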
+
+bool spte_needs_atomic_update(u64 spte);
+
+bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ const struct kvm_memory_slot *slot,
+ unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
+ u64 old_spte, bool prefetch, bool synchronizing,
+ bool host_writable, u64 *new_spte);
+u64 make_small_spte(struct kvm *kvm, u64 huge_spte,
+ union kvm_mmu_page_role role, int index);
+u64 make_huge_spte(struct kvm *kvm, u64 small_spte, int level);
+u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled);
+u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access);
+u64 mark_spte_for_access_track(u64 spte);
+
+/* Restore an acc-track PTE back to a regular PTE */
+static inline u64 restore_acc_track_spte(u64 spte)
+{
+ u64 saved_bits = (spte >> SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
+ & SHADOW_ACC_TRACK_SAVED_BITS_MASK;
+
+ spte &= ~shadow_acc_track_mask;
+ spte &= ~(SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
+ SHADOW_ACC_TRACK_SAVED_BITS_SHIFT);
+ spte |= saved_bits;
+
+ return spte;
+}
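+
+/*
+ * mark_spte_for_access_track() (declared above) conceptually performs the
+ * inverse transformation; a minimal sketch of its core:
+ *
+ *	spte |= (spte & SHADOW_ACC_TRACK_SAVED_BITS_MASK) <<
+ *		SHADOW_ACC_TRACK_SAVED_BITS_SHIFT;
+ *	spte &= ~shadow_acc_track_mask;
+ *
+ * i.e. the access-granting bits are stashed in the saved-bits area so that
+ * restore_acc_track_spte() can later put them back.
+ */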
+
+void __init kvm_mmu_spte_module_init(void);
+void kvm_mmu_reset_all_pte_masks(void);
+
+#endif
diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
new file mode 100644
index 000000000000..9e17bfa80901
--- /dev/null
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -0,0 +1,179 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include "mmu_internal.h"
+#include "tdp_iter.h"
+#include "spte.h"
+
+/*
+ * Recalculates the pointer to the SPTE for the current GFN and level and
+ * rereads the SPTE.
+ */
+static void tdp_iter_refresh_sptep(struct tdp_iter *iter)
+{
+ iter->sptep = iter->pt_path[iter->level - 1] +
+ SPTE_INDEX((iter->gfn | iter->gfn_bits) << PAGE_SHIFT, iter->level);
+ iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep);
+}
+
+/*
+ * Return the TDP iterator to the root PT and allow it to continue its
+ * traversal over the paging structure from there.
+ */
+void tdp_iter_restart(struct tdp_iter *iter)
+{
+ iter->yielded = false;
+ iter->yielded_gfn = iter->next_last_level_gfn;
+ iter->level = iter->root_level;
+
+ iter->gfn = gfn_round_for_level(iter->next_last_level_gfn, iter->level);
+ tdp_iter_refresh_sptep(iter);
+
+ iter->valid = true;
+}
+
+/*
+ * Sets a TDP iterator to walk a pre-order traversal of the paging structure
+ * rooted at root_pt, starting with the walk to translate next_last_level_gfn.
+ */
+void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
+ int min_level, gfn_t next_last_level_gfn, gfn_t gfn_bits)
+{
+ if (WARN_ON_ONCE(!root || (root->role.level < 1) ||
+ (root->role.level > PT64_ROOT_MAX_LEVEL) ||
+ (gfn_bits && next_last_level_gfn >= gfn_bits))) {
+ iter->valid = false;
+ return;
+ }
+
+ iter->next_last_level_gfn = next_last_level_gfn;
+ iter->gfn_bits = gfn_bits;
+ iter->root_level = root->role.level;
+ iter->min_level = min_level;
+ iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root->spt;
+ iter->as_id = kvm_mmu_page_as_id(root);
+
+ tdp_iter_restart(iter);
+}
+
+/*
+ * Given an SPTE and its level, returns a pointer containing the host virtual
+ * address of the child page table referenced by the SPTE. Returns null if
+ * there is no such entry.
+ */
+tdp_ptep_t spte_to_child_pt(u64 spte, int level)
+{
+ /*
+ * There's no child entry if this entry isn't present or is a
+ * last-level entry.
+ */
+ if (!is_shadow_present_pte(spte) || is_last_spte(spte, level))
+ return NULL;
+
+ return (tdp_ptep_t)__va(spte_to_pfn(spte) << PAGE_SHIFT);
+}
+
+/*
+ * Steps down one level in the paging structure towards the goal GFN. Returns
+ * true if the iterator was able to step down a level, false otherwise.
+ */
+static bool try_step_down(struct tdp_iter *iter)
+{
+ tdp_ptep_t child_pt;
+
+ if (iter->level == iter->min_level)
+ return false;
+
+ /*
+ * Reread the SPTE before stepping down to avoid traversing into page
+ * tables that are no longer linked from this entry.
+ */
+ iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep);
+
+ child_pt = spte_to_child_pt(iter->old_spte, iter->level);
+ if (!child_pt)
+ return false;
+
+ iter->level--;
+ iter->pt_path[iter->level - 1] = child_pt;
+ iter->gfn = gfn_round_for_level(iter->next_last_level_gfn, iter->level);
+ tdp_iter_refresh_sptep(iter);
+
+ return true;
+}
+
+/*
+ * Steps to the next entry in the current page table, at the current page table
+ * level. The next entry could point to a page backing guest memory or another
+ * page table, or it could be non-present. Returns true if the iterator was
+ * able to step to the next entry in the page table, false if the iterator was
+ * already at the end of the current page table.
+ */
+static bool try_step_side(struct tdp_iter *iter)
+{
+ /*
+ * Check if the iterator is already at the end of the current page
+ * table.
+ */
+ if (SPTE_INDEX((iter->gfn | iter->gfn_bits) << PAGE_SHIFT, iter->level) ==
+ (SPTE_ENT_PER_PAGE - 1))
+ return false;
+
+ iter->gfn += KVM_PAGES_PER_HPAGE(iter->level);
+ iter->next_last_level_gfn = iter->gfn;
+ iter->sptep++;
+ iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep);
+
+ return true;
+}
+
+/*
+ * Tries to traverse back up a level in the paging structure so that the walk
+ * can continue from the next entry in the parent page table. Returns true on a
+ * successful step up, false if already in the root page.
+ */
+static bool try_step_up(struct tdp_iter *iter)
+{
+ if (iter->level == iter->root_level)
+ return false;
+
+ iter->level++;
+ iter->gfn = gfn_round_for_level(iter->gfn, iter->level);
+ tdp_iter_refresh_sptep(iter);
+
+ return true;
+}
+
+/*
+ * Step to the next SPTE in a pre-order traversal of the paging structure.
+ * To get to the next SPTE, the iterator either steps down towards the goal
+ * GFN, if at a present, non-last-level SPTE, or over to a SPTE mapping a
+ * higher GFN.
+ *
+ * The basic algorithm is as follows:
+ * 1. If the current SPTE is a non-last-level SPTE, step down into the page
+ * table it points to.
+ * 2. If the iterator cannot step down, it will try to step to the next SPTE
+ * in the current page of the paging structure.
+ * 3. If the iterator cannot step to the next entry in the current page, it will
+ * try to step up to the parent paging structure page. In this case, that
+ * SPTE will have already been visited, and so the iterator must also step
+ * to the side again.
+ */
+void tdp_iter_next(struct tdp_iter *iter)
+{
+ if (iter->yielded) {
+ tdp_iter_restart(iter);
+ return;
+ }
+
+ if (try_step_down(iter))
+ return;
+
+ do {
+ if (try_step_side(iter))
+ return;
+ } while (try_step_up(iter));
+ iter->valid = false;
+}
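+
+/*
+ * Worked example (sketch): walking gfns [0, N) over a fully-populated 4-level
+ * structure with min_level == PG_LEVEL_4K, the iterator visits the level-4,
+ * level-3 and level-2 entries covering gfn 0, then the 512 level-1 entries
+ * for gfns 0-0x1ff, then steps up to level 2, sideways to the entry covering
+ * gfn 0x200, back down to the level-1 entries for gfns 0x200-0x3ff, and so on.
+ */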
+
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
new file mode 100644
index 000000000000..364c5da6c499
--- /dev/null
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -0,0 +1,143 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#ifndef __KVM_X86_MMU_TDP_ITER_H
+#define __KVM_X86_MMU_TDP_ITER_H
+
+#include <linux/kvm_host.h>
+
+#include "mmu.h"
+#include "spte.h"
+
+/*
+ * TDP MMU SPTEs are RCU protected to allow paging structures (non-leaf SPTEs)
+ * to be zapped while holding mmu_lock for read, and to allow TLB flushes to be
+ * batched without having to collect the list of zapped SPs. Flows that can
+ * remove SPs must service pending TLB flushes prior to dropping RCU protection.
+ */
+static inline u64 kvm_tdp_mmu_read_spte(tdp_ptep_t sptep)
+{
+ return READ_ONCE(*rcu_dereference(sptep));
+}
+
+static inline u64 kvm_tdp_mmu_write_spte_atomic(tdp_ptep_t sptep, u64 new_spte)
+{
+ KVM_MMU_WARN_ON(is_ept_ve_possible(new_spte));
+ return xchg(rcu_dereference(sptep), new_spte);
+}
+
+static inline u64 tdp_mmu_clear_spte_bits_atomic(tdp_ptep_t sptep, u64 mask)
+{
+ atomic64_t *sptep_atomic = (atomic64_t *)rcu_dereference(sptep);
+
+ return (u64)atomic64_fetch_and(~mask, sptep_atomic);
+}
+
+static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
+{
+ KVM_MMU_WARN_ON(is_ept_ve_possible(new_spte));
+ WRITE_ONCE(*rcu_dereference(sptep), new_spte);
+}
+
+/*
+ * SPTEs must be modified atomically if they are shadow-present, leaf SPTEs,
+ * and have volatile bits (bits that can be set outside of mmu_lock) that
+ * must not be clobbered.
+ */
+static inline bool kvm_tdp_mmu_spte_need_atomic_update(u64 old_spte, int level)
+{
+ return is_shadow_present_pte(old_spte) &&
+ is_last_spte(old_spte, level) &&
+ spte_needs_atomic_update(old_spte);
+}
+
+static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
+ u64 new_spte, int level)
+{
+ if (kvm_tdp_mmu_spte_need_atomic_update(old_spte, level))
+ return kvm_tdp_mmu_write_spte_atomic(sptep, new_spte);
+
+ __kvm_tdp_mmu_write_spte(sptep, new_spte);
+ return old_spte;
+}
+
+static inline u64 tdp_mmu_clear_spte_bits(tdp_ptep_t sptep, u64 old_spte,
+ u64 mask, int level)
+{
+ if (kvm_tdp_mmu_spte_need_atomic_update(old_spte, level))
+ return tdp_mmu_clear_spte_bits_atomic(sptep, mask);
+
+ __kvm_tdp_mmu_write_spte(sptep, old_spte & ~mask);
+ return old_spte;
+}
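+
+/*
+ * Example (sketch): clearing only the Accessed bit of a leaf SPTE, e.g. when
+ * aging, goes through the atomic path iff the SPTE has volatile bits that
+ * must not be clobbered:
+ *
+ *	old_spte = tdp_mmu_clear_spte_bits(sptep, old_spte,
+ *					   shadow_accessed_mask, level);
+ */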
+
+/*
+ * A TDP iterator performs a pre-order walk over a TDP paging structure.
+ */
+struct tdp_iter {
+ /*
+ * The iterator will traverse the paging structure towards the mapping
+ * for this GFN.
+ */
+ gfn_t next_last_level_gfn;
+ /*
+ * The next_last_level_gfn at the time when the thread last
+ * yielded. Only yielding when the next_last_level_gfn !=
+ * yielded_gfn helps ensure forward progress.
+ */
+ gfn_t yielded_gfn;
+ /* Pointers to the page tables traversed to reach the current SPTE */
+ tdp_ptep_t pt_path[PT64_ROOT_MAX_LEVEL];
+ /* A pointer to the current SPTE */
+ tdp_ptep_t sptep;
+ /* The lowest GFN (mask bits excluded) mapped by the current SPTE */
+ gfn_t gfn;
+ /* Mask applied to convert the GFN to the mapping GPA */
+ gfn_t gfn_bits;
+ /* The level of the root page given to the iterator */
+ int root_level;
+ /* The lowest level the iterator should traverse to */
+ int min_level;
+ /* The iterator's current level within the paging structure */
+ int level;
+ /* The address space ID, i.e. SMM vs. regular. */
+ int as_id;
+ /* A snapshot of the value at sptep */
+ u64 old_spte;
+ /*
+ * Whether the iterator has a valid state. This will be false if the
+ * iterator walks off the end of the paging structure.
+ */
+ bool valid;
+ /*
+ * True if KVM dropped mmu_lock and yielded in the middle of a walk, in
+ * which case tdp_iter_next() needs to restart the walk at the root
+ * level instead of advancing to the next entry.
+ */
+ bool yielded;
+};
+
+/*
+ * Iterates over every SPTE mapping the GFN range [start, end) in a
+ * preorder traversal.
+ */
+#define for_each_tdp_pte_min_level(iter, kvm, root, min_level, start, end) \
+ for (tdp_iter_start(&iter, root, min_level, start, kvm_gfn_root_bits(kvm, root)); \
+ iter.valid && iter.gfn < end; \
+ tdp_iter_next(&iter))
+
+#define for_each_tdp_pte_min_level_all(iter, root, min_level) \
+ for (tdp_iter_start(&iter, root, min_level, 0, 0); \
+ iter.valid && iter.gfn < tdp_mmu_max_gfn_exclusive(); \
+ tdp_iter_next(&iter))
+
+#define for_each_tdp_pte(iter, kvm, root, start, end) \
+ for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_4K, start, end)
+
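+/*
+ * Minimal usage sketch, assuming the caller holds mmu_lock and RCU as
+ * required for the root being walked:
+ *
+ *	struct tdp_iter iter;
+ *
+ *	for_each_tdp_pte(iter, kvm, root, start, end) {
+ *		if (!is_shadow_present_pte(iter.old_spte))
+ *			continue;
+ *		...	(inspect iter.old_spte, or update it via the
+ *			 kvm_tdp_mmu_write_spte() family of helpers)
+ *	}
+ */
+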
+tdp_ptep_t spte_to_child_pt(u64 pte, int level);
+
+void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
+ int min_level, gfn_t next_last_level_gfn, gfn_t gfn_bits);
+void tdp_iter_next(struct tdp_iter *iter);
+void tdp_iter_restart(struct tdp_iter *iter);
+
+#endif /* __KVM_X86_MMU_TDP_ITER_H */
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
new file mode 100644
index 000000000000..9c26038f6b77
--- /dev/null
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -0,0 +1,1992 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include "mmu.h"
+#include "mmu_internal.h"
+#include "mmutrace.h"
+#include "tdp_iter.h"
+#include "tdp_mmu.h"
+#include "spte.h"
+
+#include <asm/cmpxchg.h>
+#include <trace/events/kvm.h>
+
+/* Initializes the TDP MMU for the VM, if enabled. */
+void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
+{
+ INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
+ spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
+}
+
+/* Arbitrarily returns true so that this may be used in if statements. */
+static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
+ bool shared)
+{
+ if (shared)
+ lockdep_assert_held_read(&kvm->mmu_lock);
+ else
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ return true;
+}
+
+void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
+{
+ /*
+ * Invalidate all roots, which besides the obvious, schedules all roots
+ * for zapping and thus puts the TDP MMU's reference to each root, i.e.
+ * ultimately frees all roots.
+ */
+ kvm_tdp_mmu_invalidate_roots(kvm, KVM_VALID_ROOTS);
+ kvm_tdp_mmu_zap_invalidated_roots(kvm, false);
+
+#ifdef CONFIG_KVM_PROVE_MMU
+ KVM_MMU_WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
+#endif
+ WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
+
+ /*
+ * Ensure that all the outstanding RCU callbacks to free shadow pages
+ * can run before the VM is torn down. Putting the last reference to
+ * zapped roots will create new callbacks.
+ */
+ rcu_barrier();
+}
+
+static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
+{
+ free_page((unsigned long)sp->external_spt);
+ free_page((unsigned long)sp->spt);
+ kmem_cache_free(mmu_page_header_cache, sp);
+}
+
+/*
+ * This is called through call_rcu in order to free TDP page table memory
+ * safely with respect to other kernel threads that may be operating on
+ * the memory.
+ * By only accessing TDP MMU page table memory in an RCU read critical
+ * section, and freeing it after a grace period, lockless access to that
+ * memory won't use it after it is freed.
+ */
+static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
+{
+ struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
+ rcu_head);
+
+ tdp_mmu_free_sp(sp);
+}
+
+void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
+{
+ if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
+ return;
+
+ /*
+ * The TDP MMU itself holds a reference to each root until the root is
+	 * explicitly invalidated, i.e. the final reference should never be
+ * put for a valid root.
+ */
+ KVM_BUG_ON(!is_tdp_mmu_page(root) || !root->role.invalid, kvm);
+
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+ list_del_rcu(&root->link);
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+ call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
+}
+
+static bool tdp_mmu_root_match(struct kvm_mmu_page *root,
+ enum kvm_tdp_mmu_root_types types)
+{
+ if (WARN_ON_ONCE(!(types & KVM_VALID_ROOTS)))
+ return false;
+
+ if (root->role.invalid && !(types & KVM_INVALID_ROOTS))
+ return false;
+
+ if (likely(!is_mirror_sp(root)))
+ return types & KVM_DIRECT_ROOTS;
+ return types & KVM_MIRROR_ROOTS;
+}
+
+/*
+ * Returns the next root after @prev_root (or the first root if @prev_root is
+ * NULL) that matches with @types. A reference to the returned root is
+ * acquired, and the reference to @prev_root is released (the caller obviously
+ * must hold a reference to @prev_root if it's non-NULL).
+ *
+ * Roots that don't match @types are skipped.
+ *
+ * Returns NULL if the end of tdp_mmu_roots was reached.
+ */
+static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
+ struct kvm_mmu_page *prev_root,
+ enum kvm_tdp_mmu_root_types types)
+{
+ struct kvm_mmu_page *next_root;
+
+ /*
+ * While the roots themselves are RCU-protected, fields such as
+ * role.invalid are protected by mmu_lock.
+ */
+ lockdep_assert_held(&kvm->mmu_lock);
+
+ rcu_read_lock();
+
+ if (prev_root)
+ next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+ &prev_root->link,
+ typeof(*prev_root), link);
+ else
+ next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+ typeof(*next_root), link);
+
+ while (next_root) {
+ if (tdp_mmu_root_match(next_root, types) &&
+ kvm_tdp_mmu_get_root(next_root))
+ break;
+
+ next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+ &next_root->link, typeof(*next_root), link);
+ }
+
+ rcu_read_unlock();
+
+ if (prev_root)
+ kvm_tdp_mmu_put_root(kvm, prev_root);
+
+ return next_root;
+}
+
+/*
+ * Note: this iterator gets and puts references to the roots it iterates over.
+ * This makes it safe to release the MMU lock and yield within the loop, but
+ * if exiting the loop early, the caller must drop the reference to the most
+ * recent root. (Unless keeping a live reference is desirable.)
+ *
+ * If shared is set, this function is operating under the MMU lock in read
+ * mode.
+ */
+#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _types) \
+ for (_root = tdp_mmu_next_root(_kvm, NULL, _types); \
+ ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \
+ _root = tdp_mmu_next_root(_kvm, _root, _types)) \
+ if (_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) { \
+ } else
+
+#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \
+ __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, KVM_VALID_ROOTS)
+
+#define for_each_tdp_mmu_root_yield_safe(_kvm, _root) \
+ for (_root = tdp_mmu_next_root(_kvm, NULL, KVM_ALL_ROOTS); \
+ ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \
+ _root = tdp_mmu_next_root(_kvm, _root, KVM_ALL_ROOTS))
+
+/*
+ * Iterate over all TDP MMU roots. Requires that mmu_lock be held for write,
+ * the implication being that any flow that holds mmu_lock for read is
+ * inherently yield-friendly and should use the yield-safe variant above.
+ * Holding mmu_lock for write obviates the need for RCU protection as the list
+ * is guaranteed to be stable.
+ */
+#define __for_each_tdp_mmu_root(_kvm, _root, _as_id, _types) \
+ list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \
+ if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \
+ ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) || \
+ !tdp_mmu_root_match((_root), (_types)))) { \
+ } else
+
+/*
+ * Iterate over all TDP MMU roots in an RCU read-side critical section.
+ * It is safe to iterate over the SPTEs under the root, but their values will
+ * be unstable, so all writes must be atomic. As this routine is meant to be
+ * used without holding the mmu_lock at all, any bits that are flipped must
+ * be reflected in kvm_tdp_mmu_spte_need_atomic_update().
+ */
+#define for_each_tdp_mmu_root_rcu(_kvm, _root, _as_id, _types) \
+ list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link) \
+ if ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) || \
+ !tdp_mmu_root_match((_root), (_types))) { \
+ } else
+
+#define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id) \
+ __for_each_tdp_mmu_root(_kvm, _root, _as_id, KVM_VALID_ROOTS)
+
+static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_page *sp;
+
+ sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
+ sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
+
+ return sp;
+}
+
+static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
+ gfn_t gfn, union kvm_mmu_page_role role)
+{
+ INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
+
+ set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+
+ sp->role = role;
+ sp->gfn = gfn;
+ sp->ptep = sptep;
+ sp->tdp_mmu_page = true;
+
+ trace_kvm_mmu_get_page(sp, true);
+}
+
+static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
+ struct tdp_iter *iter)
+{
+ struct kvm_mmu_page *parent_sp;
+ union kvm_mmu_page_role role;
+
+ parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
+
+ role = parent_sp->role;
+ role.level--;
+
+ tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
+}
+
+void kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu, bool mirror)
+{
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ union kvm_mmu_page_role role = mmu->root_role;
+ int as_id = kvm_mmu_role_as_id(role);
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_mmu_page *root;
+
+ if (mirror)
+ role.is_mirror = true;
+
+ /*
+ * Check for an existing root before acquiring the pages lock to avoid
+ * unnecessary serialization if multiple vCPUs are loading a new root.
+ * E.g. when bringing up secondary vCPUs, KVM will already have created
+ * a valid root on behalf of the primary vCPU.
+ */
+ read_lock(&kvm->mmu_lock);
+
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, as_id) {
+ if (root->role.word == role.word)
+ goto out_read_unlock;
+ }
+
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+
+ /*
+ * Recheck for an existing root after acquiring the pages lock, another
+ * vCPU may have raced ahead and created a new usable root. Manually
+ * walk the list of roots as the standard macros assume that the pages
+ * lock is *not* held. WARN if grabbing a reference to a usable root
+ * fails, as the last reference to a root can only be put *after* the
+ * root has been invalidated, which requires holding mmu_lock for write.
+ */
+ list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
+ if (root->role.word == role.word &&
+ !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root)))
+ goto out_spin_unlock;
+ }
+
+ root = tdp_mmu_alloc_sp(vcpu);
+ tdp_mmu_init_sp(root, NULL, 0, role);
+
+ /*
+ * TDP MMU roots are kept until they are explicitly invalidated, either
+ * by a memslot update or by the destruction of the VM. Initialize the
+ * refcount to two; one reference for the vCPU, and one reference for
+ * the TDP MMU itself, which is held until the root is invalidated and
+ * is ultimately put by kvm_tdp_mmu_zap_invalidated_roots().
+ */
+ refcount_set(&root->tdp_mmu_root_count, 2);
+ list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
+
+out_spin_unlock:
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+out_read_unlock:
+ read_unlock(&kvm->mmu_lock);
+ /*
+ * Note, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS will prevent entering the guest
+ * and actually consuming the root if it's invalidated after dropping
+ * mmu_lock, and the root can't be freed as this vCPU holds a reference.
+ */
+ if (mirror) {
+ mmu->mirror_root_hpa = __pa(root->spt);
+ } else {
+ mmu->root.hpa = __pa(root->spt);
+ mmu->root.pgd = 0;
+ }
+}
+
+static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
+ u64 old_spte, u64 new_spte, int level,
+ bool shared);
+
+static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ kvm_account_pgtable_pages((void *)sp->spt, +1);
+#ifdef CONFIG_KVM_PROVE_MMU
+ atomic64_inc(&kvm->arch.tdp_mmu_pages);
+#endif
+}
+
+static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ kvm_account_pgtable_pages((void *)sp->spt, -1);
+#ifdef CONFIG_KVM_PROVE_MMU
+ atomic64_dec(&kvm->arch.tdp_mmu_pages);
+#endif
+}
+
+/**
+ * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
+ *
+ * @kvm: kvm instance
+ * @sp: the page to be removed
+ */
+static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ tdp_unaccount_mmu_page(kvm, sp);
+
+ if (!sp->nx_huge_page_disallowed)
+ return;
+
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+ sp->nx_huge_page_disallowed = false;
+ untrack_possible_nx_huge_page(kvm, sp, KVM_TDP_MMU);
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+}
+
+static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
+ int level)
+{
+ /*
+ * External (TDX) SPTEs are limited to PG_LEVEL_4K, and external
+ * PTs are removed in a special order, involving free_external_spt().
+ * But remove_external_spte() will be called on non-leaf PTEs via
+ * __tdp_mmu_zap_root(), so avoid the error the former would return
+ * in this case.
+ */
+ if (!is_last_spte(old_spte, level))
+ return;
+
+ /* Zapping leaf spte is allowed only when write lock is held. */
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ kvm_x86_call(remove_external_spte)(kvm, gfn, level, old_spte);
+}
+
+/**
+ * handle_removed_pt() - handle a page table removed from the TDP structure
+ *
+ * @kvm: kvm instance
+ * @pt: the page removed from the paging structure
+ * @shared: This operation may not be running under the exclusive use
+ * of the MMU lock and the operation must synchronize with other
+ * threads that might be modifying SPTEs.
+ *
+ * Given a page table that has been removed from the TDP paging structure,
+ * iterates through the page table to clear SPTEs and free child page tables.
+ *
+ * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
+ * protection. Since this thread removed it from the paging structure,
+ * this thread will be responsible for ensuring the page is freed. Hence the
+ * early rcu_dereferences in the function.
+ */
+static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
+{
+ struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
+ int level = sp->role.level;
+ gfn_t base_gfn = sp->gfn;
+ int i;
+
+ trace_kvm_mmu_prepare_zap_page(sp);
+
+ tdp_mmu_unlink_sp(kvm, sp);
+
+ for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
+ tdp_ptep_t sptep = pt + i;
+ gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
+ u64 old_spte;
+
+ if (shared) {
+ /*
+ * Set the SPTE to a nonpresent value that other
+ * threads will not overwrite. If the SPTE was
+ * already marked as frozen then another thread
+ * handling a page fault could overwrite it, so
+			 * keep writing FROZEN_SPTE until the value being
+			 * replaced is something other than FROZEN_SPTE.
+ */
+ for (;;) {
+ old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, FROZEN_SPTE);
+ if (!is_frozen_spte(old_spte))
+ break;
+ cpu_relax();
+ }
+ } else {
+ /*
+ * If the SPTE is not MMU-present, there is no backing
+ * page associated with the SPTE and so no side effects
+ * that need to be recorded, and exclusive ownership of
+ * mmu_lock ensures the SPTE can't be made present.
+ * Note, zapping MMIO SPTEs is also unnecessary as they
+ * are guarded by the memslots generation, not by being
+ * unreachable.
+ */
+ old_spte = kvm_tdp_mmu_read_spte(sptep);
+ if (!is_shadow_present_pte(old_spte))
+ continue;
+
+ /*
+ * Use the common helper instead of a raw WRITE_ONCE as
+ * the SPTE needs to be updated atomically if it can be
+ * modified by a different vCPU outside of mmu_lock.
+ * Even though the parent SPTE is !PRESENT, the TLB
+ * hasn't yet been flushed, and both Intel and AMD
+ * document that A/D assists can use upper-level PxE
+ * entries that are cached in the TLB, i.e. the CPU can
+ * still access the page and mark it dirty.
+ *
+ * No retry is needed in the atomic update path as the
+ * sole concern is dropping a Dirty bit, i.e. no other
+ * task can zap/remove the SPTE as mmu_lock is held for
+ * write. Marking the SPTE as a frozen SPTE is not
+ * strictly necessary for the same reason, but using
+ * the frozen SPTE value keeps the shared/exclusive
+ * paths consistent and allows the handle_changed_spte()
+ * call below to hardcode the new value to FROZEN_SPTE.
+ *
+ * Note, even though dropping a Dirty bit is the only
+ * scenario where a non-atomic update could result in a
+ * functional bug, simply checking the Dirty bit isn't
+ * sufficient as a fast page fault could read the upper
+ * level SPTE before it is zapped, and then make this
+ * target SPTE writable, resume the guest, and set the
+ * Dirty bit between reading the SPTE above and writing
+ * it here.
+ */
+ old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
+ FROZEN_SPTE, level);
+ }
+ handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
+ old_spte, FROZEN_SPTE, level, shared);
+
+ if (is_mirror_sp(sp)) {
+ KVM_BUG_ON(shared, kvm);
+ remove_external_spte(kvm, gfn, old_spte, level);
+ }
+ }
+
+ if (is_mirror_sp(sp) &&
+ WARN_ON(kvm_x86_call(free_external_spt)(kvm, base_gfn, sp->role.level,
+ sp->external_spt))) {
+ /*
+ * Failed to free page table page in mirror page table and
+ * there is nothing to do further.
+ * Intentionally leak the page to prevent the kernel from
+ * accessing the encrypted page.
+ */
+ sp->external_spt = NULL;
+ }
+
+ call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
+}
+
+static void *get_external_spt(gfn_t gfn, u64 new_spte, int level)
+{
+ if (is_shadow_present_pte(new_spte) && !is_last_spte(new_spte, level)) {
+ struct kvm_mmu_page *sp = spte_to_child_sp(new_spte);
+
+ WARN_ON_ONCE(sp->role.level + 1 != level);
+ WARN_ON_ONCE(sp->gfn != gfn);
+ return sp->external_spt;
+ }
+
+ return NULL;
+}
+
+static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sptep,
+ gfn_t gfn, u64 old_spte,
+ u64 new_spte, int level)
+{
+ bool was_present = is_shadow_present_pte(old_spte);
+ bool is_present = is_shadow_present_pte(new_spte);
+ bool is_leaf = is_present && is_last_spte(new_spte, level);
+ int ret = 0;
+
+ KVM_BUG_ON(was_present, kvm);
+
+ lockdep_assert_held(&kvm->mmu_lock);
+ /*
+ * We need to lock out other updates to the SPTE until the external
+ * page table has been modified. Use FROZEN_SPTE similar to
+ * the zapping case.
+ */
+ if (!try_cmpxchg64(rcu_dereference(sptep), &old_spte, FROZEN_SPTE))
+ return -EBUSY;
+
+ /*
+	 * Use a different callback to set up either a middle-level
+	 * external page table or a leaf entry.
+ */
+ if (is_leaf) {
+ ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_spte);
+ } else {
+ void *external_spt = get_external_spt(gfn, new_spte, level);
+
+ KVM_BUG_ON(!external_spt, kvm);
+ ret = kvm_x86_call(link_external_spt)(kvm, gfn, level, external_spt);
+ }
+ if (ret)
+ __kvm_tdp_mmu_write_spte(sptep, old_spte);
+ else
+ __kvm_tdp_mmu_write_spte(sptep, new_spte);
+ return ret;
+}
+
+/**
+ * handle_changed_spte - handle bookkeeping associated with an SPTE change
+ * @kvm: kvm instance
+ * @as_id: the address space of the paging structure the SPTE was a part of
+ * @gfn: the base GFN that was mapped by the SPTE
+ * @old_spte: The value of the SPTE before the change
+ * @new_spte: The value of the SPTE after the change
+ * @level: the level of the PT the SPTE is part of in the paging structure
+ * @shared: This operation may not be running under the exclusive use of
+ * the MMU lock and the operation must synchronize with other
+ * threads that might be modifying SPTEs.
+ *
+ * Handle bookkeeping that might result from the modification of a SPTE. Note,
+ * dirty logging updates are handled in common code, not here (see make_spte()
+ * and fast_pf_fix_direct_spte()).
+ */
+static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
+ u64 old_spte, u64 new_spte, int level,
+ bool shared)
+{
+ bool was_present = is_shadow_present_pte(old_spte);
+ bool is_present = is_shadow_present_pte(new_spte);
+ bool was_leaf = was_present && is_last_spte(old_spte, level);
+ bool is_leaf = is_present && is_last_spte(new_spte, level);
+ bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
+
+ WARN_ON_ONCE(level > PT64_ROOT_MAX_LEVEL);
+ WARN_ON_ONCE(level < PG_LEVEL_4K);
+ WARN_ON_ONCE(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
+
+ /*
+ * If this warning were to trigger it would indicate that there was a
+ * missing MMU notifier or a race with some notifier handler.
+ * A present, leaf SPTE should never be directly replaced with another
+ * present leaf SPTE pointing to a different PFN. A notifier handler
+ * should be zapping the SPTE before the main MM's page table is
+ * changed, or the SPTE should be zeroed, and the TLBs flushed by the
+ * thread before replacement.
+ */
+ if (was_leaf && is_leaf && pfn_changed) {
+ pr_err("Invalid SPTE change: cannot replace a present leaf\n"
+ "SPTE with another present leaf SPTE mapping a\n"
+ "different PFN!\n"
+ "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
+ as_id, gfn, old_spte, new_spte, level);
+
+ /*
+ * Crash the host to prevent error propagation and guest data
+ * corruption.
+ */
+ BUG();
+ }
+
+ if (old_spte == new_spte)
+ return;
+
+ trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
+
+ if (is_leaf)
+ check_spte_writable_invariants(new_spte);
+
+ /*
+ * The only times a SPTE should be changed from a non-present to
+ * non-present state is when an MMIO entry is installed/modified/
+ * removed. In that case, there is nothing to do here.
+ */
+ if (!was_present && !is_present) {
+ /*
+ * If this change does not involve a MMIO SPTE or frozen SPTE,
+ * it is unexpected. Log the change, though it should not
+ * impact the guest since both the former and current SPTEs
+ * are nonpresent.
+ */
+ if (WARN_ON_ONCE(!is_mmio_spte(kvm, old_spte) &&
+ !is_mmio_spte(kvm, new_spte) &&
+ !is_frozen_spte(new_spte)))
+ pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
+ "should not be replaced with another,\n"
+ "different nonpresent SPTE, unless one or both\n"
+ "are MMIO SPTEs, or the new SPTE is\n"
+ "a temporary frozen SPTE.\n"
+ "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
+ as_id, gfn, old_spte, new_spte, level);
+ return;
+ }
+
+ if (is_leaf != was_leaf)
+ kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
+
+ /*
+ * Recursively handle child PTs if the change removed a subtree from
+ * the paging structure. Note the WARN on the PFN changing without the
+ * SPTE being converted to a hugepage (leaf) or being zapped. Shadow
+ * pages are kernel allocations and should never be migrated.
+ */
+ if (was_present && !was_leaf &&
+ (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
+ handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
+}
+
+static inline int __must_check __tdp_mmu_set_spte_atomic(struct kvm *kvm,
+ struct tdp_iter *iter,
+ u64 new_spte)
+{
+ /*
+ * The caller is responsible for ensuring the old SPTE is not a FROZEN
+ * SPTE. KVM should never attempt to zap or manipulate a FROZEN SPTE,
+ * and pre-checking before inserting a new SPTE is advantageous as it
+ * avoids unnecessary work.
+ */
+ WARN_ON_ONCE(iter->yielded || is_frozen_spte(iter->old_spte));
+
+ if (is_mirror_sptep(iter->sptep) && !is_frozen_spte(new_spte)) {
+ int ret;
+
+ /*
+ * Users of atomic zapping don't operate on mirror roots,
+ * so don't handle it and bug the VM if it's seen.
+ */
+ if (KVM_BUG_ON(!is_shadow_present_pte(new_spte), kvm))
+ return -EBUSY;
+
+ ret = set_external_spte_present(kvm, iter->sptep, iter->gfn,
+ iter->old_spte, new_spte, iter->level);
+ if (ret)
+ return ret;
+ } else {
+ u64 *sptep = rcu_dereference(iter->sptep);
+
+ /*
+ * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs
+ * and does not hold the mmu_lock. On failure, i.e. if a
+ * different logical CPU modified the SPTE, try_cmpxchg64()
+ * updates iter->old_spte with the current value, so the caller
+ * operates on fresh data, e.g. if it retries
+		 * tdp_mmu_set_spte_atomic().
+ */
+ if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
+ return -EBUSY;
+ }
+
+ return 0;
+}
+
+/*
+ * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
+ * and handle the associated bookkeeping. Do not mark the page dirty
+ * in KVM's dirty bitmaps.
+ *
+ * If setting the SPTE fails because it has changed, iter->old_spte will be
+ * refreshed to the current value of the spte.
+ *
+ * @kvm: kvm instance
+ * @iter: a tdp_iter instance currently on the SPTE that should be set
+ * @new_spte: The value the SPTE should be set to
+ * Return:
+ * * 0 - If the SPTE was set.
+ * * -EBUSY - If the SPTE cannot be set. In this case this function will have
+ * no side-effects other than setting iter->old_spte to the last
+ * known value of the spte.
+ */
+static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm,
+ struct tdp_iter *iter,
+ u64 new_spte)
+{
+ int ret;
+
+ lockdep_assert_held_read(&kvm->mmu_lock);
+
+ ret = __tdp_mmu_set_spte_atomic(kvm, iter, new_spte);
+ if (ret)
+ return ret;
+
+ handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
+ new_spte, iter->level, true);
+
+ return 0;
+}
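+
+/*
+ * Typical caller pattern (sketch): a failure refreshes iter->old_spte to the
+ * current value, so callers simply retry against the fresh data, e.g.:
+ *
+ *	if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
+ *		goto retry;
+ */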
+
+/*
+ * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
+ * @kvm: KVM instance
+ * @as_id: Address space ID, i.e. regular vs. SMM
+ * @sptep: Pointer to the SPTE
+ * @old_spte: The current value of the SPTE
+ * @new_spte: The new value that will be set for the SPTE
+ * @gfn: The base GFN that was (or will be) mapped by the SPTE
+ * @level: The level _containing_ the SPTE (its parent PT's level)
+ *
+ * Returns the old SPTE value, which _may_ be different than @old_spte if the
+ * SPTE had volatile bits.
+ */
+static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
+ u64 old_spte, u64 new_spte, gfn_t gfn, int level)
+{
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ /*
+ * No thread should be using this function to set SPTEs to or from the
+ * temporary frozen SPTE value.
+ * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
+ * should be used. If operating under the MMU lock in write mode, the
+ * use of the frozen SPTE should not be necessary.
+ */
+ WARN_ON_ONCE(is_frozen_spte(old_spte) || is_frozen_spte(new_spte));
+
+ old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
+
+ handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
+
+ /*
+ * Users that do non-atomic setting of PTEs don't operate on mirror
+ * roots, so don't handle it and bug the VM if it's seen.
+ */
+ if (is_mirror_sptep(sptep)) {
+ KVM_BUG_ON(is_shadow_present_pte(new_spte), kvm);
+ remove_external_spte(kvm, gfn, old_spte, level);
+ }
+
+ return old_spte;
+}
+
+static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
+ u64 new_spte)
+{
+ WARN_ON_ONCE(iter->yielded);
+ iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
+ iter->old_spte, new_spte,
+ iter->gfn, iter->level);
+}
+
+#define tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end) \
+ for_each_tdp_pte(_iter, _kvm, _root, _start, _end)
+
+#define tdp_root_for_each_leaf_pte(_iter, _kvm, _root, _start, _end) \
+ tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end) \
+ if (!is_shadow_present_pte(_iter.old_spte) || \
+ !is_last_spte(_iter.old_spte, _iter.level)) \
+ continue; \
+ else
+
+static inline bool __must_check tdp_mmu_iter_need_resched(struct kvm *kvm,
+ struct tdp_iter *iter)
+{
+ if (!need_resched() && !rwlock_needbreak(&kvm->mmu_lock))
+ return false;
+
+ /* Ensure forward progress has been made before yielding. */
+ return iter->next_last_level_gfn != iter->yielded_gfn;
+}
+
+/*
+ * Yield if the MMU lock is contended or this thread needs to return control
+ * to the scheduler.
+ *
+ * If this function should yield and flush is set, it will perform a remote
+ * TLB flush before yielding.
+ *
+ * If this function yields, iter->yielded is set and the caller must skip to
+ * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
+ * over the paging structures to allow the iterator to continue its traversal
+ * from the paging structure root.
+ *
+ * Returns true if this function yielded.
+ */
+static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
+ struct tdp_iter *iter,
+ bool flush, bool shared)
+{
+ KVM_MMU_WARN_ON(iter->yielded);
+
+ if (!tdp_mmu_iter_need_resched(kvm, iter))
+ return false;
+
+ if (flush)
+ kvm_flush_remote_tlbs(kvm);
+
+ rcu_read_unlock();
+
+ if (shared)
+ cond_resched_rwlock_read(&kvm->mmu_lock);
+ else
+ cond_resched_rwlock_write(&kvm->mmu_lock);
+
+ rcu_read_lock();
+
+ WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn);
+
+ iter->yielded = true;
+ return true;
+}
+
+static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
+{
+ /*
+ * Bound TDP MMU walks at host.MAXPHYADDR. KVM disallows memslots with
+ * a gpa range that would exceed the max gfn, and KVM does not create
+ * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
+ * the slow emulation path every time.
+ */
+ return kvm_mmu_max_gfn() + 1;
+}
+
+static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
+ bool shared, int zap_level)
+{
+ struct tdp_iter iter;
+
+ for_each_tdp_pte_min_level_all(iter, root, zap_level) {
+retry:
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
+ continue;
+
+ if (!is_shadow_present_pte(iter.old_spte))
+ continue;
+
+ if (iter.level > zap_level)
+ continue;
+
+ if (!shared)
+ tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
+ else if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE))
+ goto retry;
+ }
+}
+
+static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
+ bool shared)
+{
+
+ /*
+ * The root must have an elevated refcount so that it's reachable via
+ * mmu_notifier callbacks, which allows this path to yield and drop
+ * mmu_lock. When handling an unmap/release mmu_notifier command, KVM
+ * must drop all references to relevant pages prior to completing the
+ * callback. Dropping mmu_lock with an unreachable root would result
+ * in zapping SPTEs after a relevant mmu_notifier callback completes
+ * and lead to use-after-free as zapping a SPTE triggers "writeback" of
+ * dirty accessed bits to the SPTE's associated struct page.
+ */
+ WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));
+
+ kvm_lockdep_assert_mmu_lock_held(kvm, shared);
+
+ rcu_read_lock();
+
+ /*
+ * Zap roots in multiple passes of decreasing granularity, i.e. zap at
+ * 4KiB=>2MiB=>1GiB=>root, in order to better honor need_resched() (all
+ * preempt models) or mmu_lock contention (full or real-time models).
+ * Zapping at finer granularity marginally increases the total time of
+ * the zap, but in most cases the zap itself isn't latency sensitive.
+ *
+ * If KVM is configured to prove the MMU, skip the 4KiB and 2MiB zaps
+ * in order to mimic the page fault path, which can replace a 1GiB page
+ * table with an equivalent 1GiB hugepage, i.e. can get saddled with
+ * zapping a 1GiB region that's fully populated with 4KiB SPTEs. This
+ * allows verifying that KVM can safely zap 1GiB regions, e.g. without
+ * inducing RCU stalls, without relying on a relatively rare event
+ * (zapping roots is orders of magnitude more common). Note, because
+ * zapping a SP recurses on its children, stepping down to PG_LEVEL_4K
+ * in the iterator itself is unnecessary.
+ */
+ if (!IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
+ __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_4K);
+ __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_2M);
+ }
+ __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
+ __tdp_mmu_zap_root(kvm, root, shared, root->role.level);
+
+ rcu_read_unlock();
+}
+
+bool kvm_tdp_mmu_zap_possible_nx_huge_page(struct kvm *kvm,
+ struct kvm_mmu_page *sp)
+{
+ struct tdp_iter iter = {
+ .old_spte = sp->ptep ? kvm_tdp_mmu_read_spte(sp->ptep) : 0,
+ .sptep = sp->ptep,
+ .level = sp->role.level + 1,
+ .gfn = sp->gfn,
+ .as_id = kvm_mmu_page_as_id(sp),
+ };
+
+ lockdep_assert_held_read(&kvm->mmu_lock);
+
+ if (WARN_ON_ONCE(!is_tdp_mmu_page(sp)))
+ return false;
+
+ /*
+ * Root shadow pages don't have a parent page table and thus no
+ * associated entry, but they can never be possible NX huge pages.
+ */
+ if (WARN_ON_ONCE(!sp->ptep))
+ return false;
+
+ /*
+ * Since mmu_lock is held in read mode, it's possible another task has
+ * already modified the SPTE. Zap the SPTE if and only if the SPTE
+ * points at the SP's page table, as checking shadow-present isn't
+ * sufficient, e.g. the SPTE could be replaced by a leaf SPTE, or even
+ * another SP. Note, spte_to_child_pt() also checks that the SPTE is
+ * shadow-present, i.e. guards against zapping a frozen SPTE.
+ */
+ if ((tdp_ptep_t)sp->spt != spte_to_child_pt(iter.old_spte, iter.level))
+ return false;
+
+ /*
+ * If a different task modified the SPTE, then it should be impossible
+ * for the SPTE to still be used for the to-be-zapped SP. Non-leaf
+ * SPTEs don't have Dirty bits, KVM always sets the Accessed bit when
+ * creating non-leaf SPTEs, and all other bits are immutable for non-
+ * leaf SPTEs, i.e. the only legal operations for non-leaf SPTEs are
+ * zapping and replacement.
+ */
+ if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE)) {
+ WARN_ON_ONCE((tdp_ptep_t)sp->spt == spte_to_child_pt(iter.old_spte, iter.level));
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * If can_yield is true, this function will release the MMU lock and
+ * reschedule if the scheduler needs the CPU or there is contention on the
+ * MMU lock. If this function cannot yield, it will not release the MMU
+ * lock or reschedule and the caller must ensure it does not supply too
+ * large a GFN range, or the operation can cause a soft lockup.
+ */
+static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
+ gfn_t start, gfn_t end, bool can_yield, bool flush)
+{
+ struct tdp_iter iter;
+
+ end = min(end, tdp_mmu_max_gfn_exclusive());
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ rcu_read_lock();
+
+ for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_4K, start, end) {
+ if (can_yield &&
+ tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
+ flush = false;
+ continue;
+ }
+
+ if (!is_shadow_present_pte(iter.old_spte) ||
+ !is_last_spte(iter.old_spte, iter.level))
+ continue;
+
+ tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
+
+ /*
+ * Zapping SPTEs in invalid roots doesn't require a TLB flush,
+ * see kvm_tdp_mmu_zap_invalidated_roots() for details.
+ */
+ if (!root->role.invalid)
+ flush = true;
+ }
+
+ rcu_read_unlock();
+
+ /*
+ * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
+ * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
+ */
+ return flush;
+}
+
+/*
+ * Zap leaf SPTEs for the range of gfns, [start, end), for all *valid* roots.
+ * Returns true if a TLB flush is needed before releasing the MMU lock, i.e. if
+ * one or more SPTEs were zapped since the MMU lock was last acquired.
+ */
+bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
+{
+ struct kvm_mmu_page *root;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, -1)
+ flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
+
+ return flush;
+}
+
+void kvm_tdp_mmu_zap_all(struct kvm *kvm)
+{
+ struct kvm_mmu_page *root;
+
+ /*
+ * Zap all direct roots, including invalid direct roots, as all direct
+ * SPTEs must be dropped before returning to the caller. For TDX, mirror
+ * roots don't need handling in response to the mmu notifier (the caller).
+ *
+ * Zap directly even if the root is also being zapped by a concurrent
+ * "fast zap". Walking zapped top-level SPTEs isn't all that expensive
+ * and mmu_lock is already held, which means the other thread has yielded.
+ *
+ * A TLB flush is unnecessary: KVM zaps everything if and only if the VM
+ * is being destroyed or the userspace VMM has exited. In both cases,
+ * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
+ */
+ lockdep_assert_held_write(&kvm->mmu_lock);
+ __for_each_tdp_mmu_root_yield_safe(kvm, root, -1,
+ KVM_DIRECT_ROOTS | KVM_INVALID_ROOTS)
+ tdp_mmu_zap_root(kvm, root, false);
+}
+
+/*
+ * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
+ * zap" completes.
+ */
+void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm, bool shared)
+{
+ struct kvm_mmu_page *root;
+
+ if (shared)
+ read_lock(&kvm->mmu_lock);
+ else
+ write_lock(&kvm->mmu_lock);
+
+ for_each_tdp_mmu_root_yield_safe(kvm, root) {
+ if (!root->tdp_mmu_scheduled_root_to_zap)
+ continue;
+
+ root->tdp_mmu_scheduled_root_to_zap = false;
+ KVM_BUG_ON(!root->role.invalid, kvm);
+
+ /*
+ * A TLB flush is not necessary as KVM performs a local TLB
+ * flush when allocating a new root (see kvm_mmu_load()), and
+ * when migrating a vCPU to a different pCPU. Note, the local
+ * TLB flush on reuse also invalidates paging-structure-cache
+ * entries, i.e. TLB entries for intermediate paging structures,
+ * that may be zapped, as such entries are associated with the
+ * ASID on both VMX and SVM.
+ */
+ tdp_mmu_zap_root(kvm, root, shared);
+
+ /*
+ * The reference needs to be put *after* zapping the root, as
+ * the root must be reachable by mmu_notifiers while it's being
+ * zapped.
+ */
+ kvm_tdp_mmu_put_root(kvm, root);
+ }
+
+ if (shared)
+ read_unlock(&kvm->mmu_lock);
+ else
+ write_unlock(&kvm->mmu_lock);
+}
+
+/*
+ * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
+ * is about to be zapped, e.g. in response to a memslots update. The actual
+ * zapping is done separately so that it happens with mmu_lock held for read,
+ * whereas invalidating roots must be done with mmu_lock held for write (unless
+ * the VM is being destroyed).
+ *
+ * Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
+ * See kvm_tdp_mmu_alloc_root().
+ */
+void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm,
+ enum kvm_tdp_mmu_root_types root_types)
+{
+ struct kvm_mmu_page *root;
+
+ /*
+ * Invalidating invalid roots doesn't make sense; prevent developers from
+ * having to think about it.
+ */
+ if (WARN_ON_ONCE(root_types & KVM_INVALID_ROOTS))
+ root_types &= ~KVM_INVALID_ROOTS;
+
+ /*
+ * mmu_lock must be held for write to ensure that a root doesn't become
+ * invalid while there are active readers (invalidating a root while
+ * there are active readers may or may not be problematic in practice,
+ * but it's uncharted territory and not supported).
+ *
+ * Waive the assertion if there are no users of @kvm, i.e. the VM is
+ * being destroyed after all references have been put, or if no vCPUs
+ * have been created (which means there are no roots), i.e. the VM is
+ * being destroyed in an error path of KVM_CREATE_VM.
+ */
+ if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
+ refcount_read(&kvm->users_count) && kvm->created_vcpus)
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ /*
+ * As above, mmu_lock isn't held when destroying the VM! There can't
+ * be other references to @kvm, i.e. nothing else can invalidate roots
+ * or get/put references to roots.
+ */
+ list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
+ if (!tdp_mmu_root_match(root, root_types))
+ continue;
+
+ /*
+ * Note, invalid roots can outlive a memslot update! Invalid
+ * roots must be *zapped* before the memslot update completes,
+ * but a different task can acquire a reference and keep the
+ * root alive after it's been zapped.
+ */
+ if (!root->role.invalid) {
+ root->tdp_mmu_scheduled_root_to_zap = true;
+ root->role.invalid = true;
+ }
+ }
+}
+
+/*
+ * Installs a last-level SPTE to handle a TDP page fault.
+ * (NPT/EPT violation/misconfiguration)
+ */
+static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault,
+ struct tdp_iter *iter)
+{
+ struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
+ u64 new_spte;
+ int ret = RET_PF_FIXED;
+ bool wrprot = false;
+
+ if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
+ return RET_PF_RETRY;
+
+ if (is_shadow_present_pte(iter->old_spte) &&
+ (fault->prefetch || is_access_allowed(fault, iter->old_spte)) &&
+ is_last_spte(iter->old_spte, iter->level)) {
+ WARN_ON_ONCE(fault->pfn != spte_to_pfn(iter->old_spte));
+ return RET_PF_SPURIOUS;
+ }
+
+ if (unlikely(!fault->slot))
+ new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
+ else
+ wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
+ fault->pfn, iter->old_spte, fault->prefetch,
+ false, fault->map_writable, &new_spte);
+
+ if (new_spte == iter->old_spte)
+ ret = RET_PF_SPURIOUS;
+ else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
+ return RET_PF_RETRY;
+ else if (is_shadow_present_pte(iter->old_spte) &&
+ (!is_last_spte(iter->old_spte, iter->level) ||
+ WARN_ON_ONCE(leaf_spte_change_needs_tlb_flush(iter->old_spte, new_spte))))
+ kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level);
+
+ /*
+ * If the page fault was caused by a write but the page is write
+ * protected, emulation is needed. If the emulation was skipped,
+ * the vCPU would have the same fault again.
+ */
+ if (wrprot && fault->write)
+ ret = RET_PF_WRITE_PROTECTED;
+
+ /* If an MMIO SPTE is installed, the MMIO will need to be emulated. */
+ if (unlikely(is_mmio_spte(vcpu->kvm, new_spte))) {
+ vcpu->stat.pf_mmio_spte_created++;
+ trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
+ new_spte);
+ ret = RET_PF_EMULATE;
+ } else {
+ trace_kvm_mmu_set_spte(iter->level, iter->gfn,
+ rcu_dereference(iter->sptep));
+ }
+
+ return ret;
+}
+
+/*
+ * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
+ * provided page table.
+ *
+ * @kvm: kvm instance
+ * @iter: a tdp_iter instance currently on the SPTE that should be set
+ * @sp: The new TDP page table to install.
+ * @shared: This operation is running under the MMU lock in read mode.
+ *
+ * Returns: 0 if the new page table was installed. Non-0 if the page table
+ * could not be installed (e.g. the atomic compare-exchange failed).
+ */
+static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_mmu_page *sp, bool shared)
+{
+ u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled);
+ int ret = 0;
+
+ if (shared) {
+ ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
+ if (ret)
+ return ret;
+ } else {
+ tdp_mmu_iter_set_spte(kvm, iter, spte);
+ }
+
+ tdp_account_mmu_page(kvm, sp);
+
+ return 0;
+}
+
+static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_mmu_page *sp, bool shared);
+
+/*
+ * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
+ * page tables and SPTEs to translate the faulting guest physical address.
+ */
+int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+{
+ struct kvm_mmu_page *root = tdp_mmu_get_root_for_fault(vcpu, fault);
+ struct kvm *kvm = vcpu->kvm;
+ struct tdp_iter iter;
+ struct kvm_mmu_page *sp;
+ int ret = RET_PF_RETRY;
+
+ KVM_MMU_WARN_ON(!root || root->role.invalid);
+
+ kvm_mmu_hugepage_adjust(vcpu, fault);
+
+ trace_kvm_mmu_spte_requested(fault);
+
+ rcu_read_lock();
+
+ for_each_tdp_pte(iter, kvm, root, fault->gfn, fault->gfn + 1) {
+ int r;
+
+ if (fault->nx_huge_page_workaround_enabled)
+ disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
+
+ /*
+ * If SPTE has been frozen by another thread, just give up and
+ * retry, avoiding unnecessary page table allocation and free.
+ */
+ if (is_frozen_spte(iter.old_spte))
+ goto retry;
+
+ if (iter.level == fault->goal_level)
+ goto map_target_level;
+
+ /* Step down into the lower level page table if it exists. */
+ if (is_shadow_present_pte(iter.old_spte) &&
+ !is_large_pte(iter.old_spte))
+ continue;
+
+ /*
+ * The SPTE is either non-present or points to a huge page that
+ * needs to be split.
+ */
+ sp = tdp_mmu_alloc_sp(vcpu);
+ tdp_mmu_init_child_sp(sp, &iter);
+ if (is_mirror_sp(sp))
+ kvm_mmu_alloc_external_spt(vcpu, sp);
+
+ sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
+
+ if (is_shadow_present_pte(iter.old_spte)) {
+ /* Don't support large pages for mirrored roots (TDX) */
+ KVM_BUG_ON(is_mirror_sptep(iter.sptep), vcpu->kvm);
+ r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
+ } else {
+ r = tdp_mmu_link_sp(kvm, &iter, sp, true);
+ }
+
+ /*
+ * Force the guest to retry if installing an upper level SPTE
+ * failed, e.g. because a different task modified the SPTE.
+ */
+ if (r) {
+ tdp_mmu_free_sp(sp);
+ goto retry;
+ }
+
+ if (fault->huge_page_disallowed &&
+ fault->req_level >= iter.level) {
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+ if (sp->nx_huge_page_disallowed)
+ track_possible_nx_huge_page(kvm, sp, KVM_TDP_MMU);
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+ }
+ }
+
+ /*
+ * The walk aborted before reaching the target level, e.g. because the
+ * iterator detected an upper level SPTE was frozen during traversal.
+ */
+ WARN_ON_ONCE(iter.level == fault->goal_level);
+ goto retry;
+
+map_target_level:
+ ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
+
+retry:
+ rcu_read_unlock();
+ return ret;
+}
+
+/* Used by mmu notifier via kvm_unmap_gfn_range() */
+bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
+ bool flush)
+{
+ enum kvm_tdp_mmu_root_types types;
+ struct kvm_mmu_page *root;
+
+ types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter) | KVM_INVALID_ROOTS;
+
+ __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, types)
+ flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end,
+ range->may_block, flush);
+
+ return flush;
+}
+
+/*
+ * Mark the SPTEs in the range of GFNs [start, end) unaccessed and return
+ * non-zero if any of the GFNs in the range have been accessed.
+ *
+ * No need to mark the corresponding PFN as accessed as this call is coming
+ * from the clear_young() or clear_flush_young() notifier, which uses the
+ * return value to determine if the page has been accessed.
+ */
+static void kvm_tdp_mmu_age_spte(struct kvm *kvm, struct tdp_iter *iter)
+{
+ u64 new_spte;
+
+ if (spte_ad_enabled(iter->old_spte)) {
+ iter->old_spte = tdp_mmu_clear_spte_bits_atomic(iter->sptep,
+ shadow_accessed_mask);
+ new_spte = iter->old_spte & ~shadow_accessed_mask;
+ } else {
+ new_spte = mark_spte_for_access_track(iter->old_spte);
+ /*
+ * It is safe for the following cmpxchg to fail. Leave the
+ * Accessed bit set, as the spte is most likely young anyway.
+ */
+ if (__tdp_mmu_set_spte_atomic(kvm, iter, new_spte))
+ return;
+ }
+
+ trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
+ iter->old_spte, new_spte);
+}
+
+static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm,
+ struct kvm_gfn_range *range,
+ bool test_only)
+{
+ enum kvm_tdp_mmu_root_types types;
+ struct kvm_mmu_page *root;
+ struct tdp_iter iter;
+ bool ret = false;
+
+ types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter);
+
+ /*
+ * Don't support rescheduling, none of the MMU notifiers that funnel
+ * into this helper allow blocking; it'd be dead, wasteful code. Note,
+ * this helper must NOT be used to unmap GFNs, as it processes only
+ * valid roots!
+ */
+ WARN_ON(types & ~KVM_VALID_ROOTS);
+
+ guard(rcu)();
+ for_each_tdp_mmu_root_rcu(kvm, root, range->slot->as_id, types) {
+ tdp_root_for_each_leaf_pte(iter, kvm, root, range->start, range->end) {
+ if (!is_accessed_spte(iter.old_spte))
+ continue;
+
+ if (test_only)
+ return true;
+
+ ret = true;
+ kvm_tdp_mmu_age_spte(kvm, &iter);
+ }
+ }
+
+ return ret;
+}
+
+bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+ return __kvm_tdp_mmu_age_gfn_range(kvm, range, false);
+}
+
+bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+ return __kvm_tdp_mmu_age_gfn_range(kvm, range, true);
+}
+
+/*
+ * Remove write access from all SPTEs at or above min_level that map GFNs
+ * [start, end). Returns true if an SPTE has been changed and the TLBs need to
+ * be flushed.
+ */
+static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
+ gfn_t start, gfn_t end, int min_level)
+{
+ struct tdp_iter iter;
+ u64 new_spte;
+ bool spte_set = false;
+
+ rcu_read_lock();
+
+ BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
+
+ for_each_tdp_pte_min_level(iter, kvm, root, min_level, start, end) {
+retry:
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
+ continue;
+
+ if (!is_shadow_present_pte(iter.old_spte) ||
+ !is_last_spte(iter.old_spte, iter.level) ||
+ !(iter.old_spte & PT_WRITABLE_MASK))
+ continue;
+
+ new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
+
+ if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
+ goto retry;
+
+ spte_set = true;
+ }
+
+ rcu_read_unlock();
+ return spte_set;
+}
+
+/*
+ * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
+ * only affect leaf SPTEs down to min_level.
+ * Returns true if an SPTE has been changed and the TLBs need to be flushed.
+ */
+bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
+ const struct kvm_memory_slot *slot, int min_level)
+{
+ struct kvm_mmu_page *root;
+ bool spte_set = false;
+
+ lockdep_assert_held_read(&kvm->mmu_lock);
+
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
+ spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
+ slot->base_gfn + slot->npages, min_level);
+
+ return spte_set;
+}
+
+static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(void)
+{
+ struct kvm_mmu_page *sp;
+
+ sp = kmem_cache_zalloc(mmu_page_header_cache, GFP_KERNEL_ACCOUNT);
+ if (!sp)
+ return NULL;
+
+ sp->spt = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (!sp->spt) {
+ kmem_cache_free(mmu_page_header_cache, sp);
+ return NULL;
+ }
+
+ return sp;
+}
+
+/* Note, the caller is responsible for initializing @sp. */
+static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_mmu_page *sp, bool shared)
+{
+ const u64 huge_spte = iter->old_spte;
+ const int level = iter->level;
+ int ret, i;
+
+ /*
+ * No need for atomics when writing to sp->spt since the page table has
+ * not been linked in yet and thus is not reachable from any other CPU.
+ */
+ for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
+ sp->spt[i] = make_small_spte(kvm, huge_spte, sp->role, i);
+
+ /*
+ * Replace the huge spte with a pointer to the populated lower level
+ * page table. Since we are making this change without a TLB flush, vCPUs
+ * will see a mix of the split mappings and the original huge mapping,
+ * depending on what's currently in their TLB. This is fine from a
+ * correctness standpoint since the translation will be the same either
+ * way.
+ */
+ ret = tdp_mmu_link_sp(kvm, iter, sp, shared);
+ if (ret)
+ goto out;
+
+ /*
+ * tdp_mmu_link_sp() will handle subtracting the huge page we
+ * are overwriting from the page stats. But we have to manually update
+ * the page stats with the new present child pages.
+ */
+ kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);
+
+out:
+ trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
+ return ret;
+}
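+
+/*
+ * What "splitting" means at the address level, as a standalone sketch
+ * (hypothetical helper, not KVM code): each of the 512 child entries
+ * maps one 1/512th slice of the region the huge entry covered, so the
+ * guest-visible translation is unchanged, exactly as the comment above
+ * argues.
+ */
+static void split_huge_range(unsigned long huge_base_pfn,
+                             unsigned long huge_nr_pages,
+                             unsigned long child_pfn[512])
+{
+	unsigned long pages_per_child = huge_nr_pages / 512;
+	unsigned int i;
+
+	for (i = 0; i < 512; i++)
+		child_pfn[i] = huge_base_pfn + i * pages_per_child;
+}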
+
+static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
+ struct kvm_mmu_page *root,
+ gfn_t start, gfn_t end,
+ int target_level, bool shared)
+{
+ struct kvm_mmu_page *sp = NULL;
+ struct tdp_iter iter;
+
+ rcu_read_lock();
+
+ /*
+ * Traverse the page table splitting all huge pages above the target
+ * level into one lower level. For example, if we encounter a 1GB page
+ * we split it into 512 2MB pages.
+ *
+ * Since the TDP iterator uses a pre-order traversal, we are guaranteed
+ * to visit an SPTE before ever visiting its children, which means we
+ * will correctly recursively split huge pages that are more than one
+ * level above the target level (e.g. splitting a 1GB page into 512 2MB
+ * pages, and then splitting each of those into 512 4KB pages).
+ */
+ for_each_tdp_pte_min_level(iter, kvm, root, target_level + 1, start, end) {
+retry:
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
+ continue;
+
+ if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
+ continue;
+
+ if (!sp) {
+ rcu_read_unlock();
+
+ if (shared)
+ read_unlock(&kvm->mmu_lock);
+ else
+ write_unlock(&kvm->mmu_lock);
+
+ sp = tdp_mmu_alloc_sp_for_split();
+
+ if (shared)
+ read_lock(&kvm->mmu_lock);
+ else
+ write_lock(&kvm->mmu_lock);
+
+ if (!sp) {
+ trace_kvm_mmu_split_huge_page(iter.gfn,
+ iter.old_spte,
+ iter.level, -ENOMEM);
+ return -ENOMEM;
+ }
+
+ rcu_read_lock();
+
+ iter.yielded = true;
+ continue;
+ }
+
+ tdp_mmu_init_child_sp(sp, &iter);
+
+ if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
+ goto retry;
+
+ sp = NULL;
+ }
+
+ rcu_read_unlock();
+
+ /*
+ * It's possible to exit the loop having never used the last sp if, for
+ * example, a vCPU doing HugePage NX splitting wins the race and
+ * installs its own sp in place of the last sp we tried to split.
+ */
+ if (sp)
+ tdp_mmu_free_sp(sp);
+
+ return 0;
+}
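+
+/*
+ * The "drop the lock to allocate, then restart the walk" shape used
+ * above, written against plain pthreads so it stands alone (hypothetical
+ * names, not KVM code). The key point is that anything observed under
+ * the lock is stale after the reacquire, which is why the caller must
+ * revalidate or restart its traversal.
+ */
+#include <pthread.h>
+#include <stdlib.h>
+
+static void *alloc_outside_lock(pthread_mutex_t *lock, size_t size)
+{
+	void *obj;
+
+	pthread_mutex_unlock(lock);
+	obj = malloc(size);	/* may block for a long time */
+	pthread_mutex_lock(lock);
+
+	/* the world may have changed: the caller restarts its walk */
+	return obj;
+}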
+
+
+/*
+ * Try to split all huge pages mapped by the TDP MMU down to the target level.
+ */
+void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ gfn_t start, gfn_t end,
+ int target_level, bool shared)
+{
+ struct kvm_mmu_page *root;
+ int r = 0;
+
+ kvm_lockdep_assert_mmu_lock_held(kvm, shared);
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) {
+ r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
+ if (r) {
+ kvm_tdp_mmu_put_root(kvm, root);
+ break;
+ }
+ }
+}
+
+static bool tdp_mmu_need_write_protect(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ /*
+ * All TDP MMU shadow pages share the same role as their root, aside
+ * from level, so it is valid to key off any shadow page to determine if
+ * write protection is needed for an entire tree.
+ */
+ return kvm_mmu_page_ad_need_write_protect(kvm, sp) || !kvm_ad_enabled;
+}
+
+static void clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
+ gfn_t start, gfn_t end)
+{
+ const u64 dbit = tdp_mmu_need_write_protect(kvm, root) ?
+ PT_WRITABLE_MASK : shadow_dirty_mask;
+ struct tdp_iter iter;
+
+ rcu_read_lock();
+
+ tdp_root_for_each_pte(iter, kvm, root, start, end) {
+retry:
+ if (!is_shadow_present_pte(iter.old_spte) ||
+ !is_last_spte(iter.old_spte, iter.level))
+ continue;
+
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
+ continue;
+
+ KVM_MMU_WARN_ON(dbit == shadow_dirty_mask &&
+ spte_ad_need_write_protect(iter.old_spte));
+
+ if (!(iter.old_spte & dbit))
+ continue;
+
+ if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit))
+ goto retry;
+ }
+
+ rcu_read_unlock();
+}
+
+/*
+ * Clear the dirty status (D-bit or W-bit) of all the SPTEs mapping GFNs in the
+ * memslot.
+ */
+void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
+ const struct kvm_memory_slot *slot)
+{
+ struct kvm_mmu_page *root;
+
+ lockdep_assert_held_read(&kvm->mmu_lock);
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
+ clear_dirty_gfn_range(kvm, root, slot->base_gfn,
+ slot->base_gfn + slot->npages);
+}
+
+static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
+ gfn_t gfn, unsigned long mask, bool wrprot)
+{
+ const u64 dbit = (wrprot || tdp_mmu_need_write_protect(kvm, root)) ?
+ PT_WRITABLE_MASK : shadow_dirty_mask;
+ struct tdp_iter iter;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ rcu_read_lock();
+
+ tdp_root_for_each_leaf_pte(iter, kvm, root, gfn + __ffs(mask),
+ gfn + BITS_PER_LONG) {
+ if (!mask)
+ break;
+
+ KVM_MMU_WARN_ON(dbit == shadow_dirty_mask &&
+ spte_ad_need_write_protect(iter.old_spte));
+
+ if (iter.level > PG_LEVEL_4K ||
+ !(mask & (1UL << (iter.gfn - gfn))))
+ continue;
+
+ mask &= ~(1UL << (iter.gfn - gfn));
+
+ if (!(iter.old_spte & dbit))
+ continue;
+
+ iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep,
+ iter.old_spte, dbit,
+ iter.level);
+
+ trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level,
+ iter.old_spte,
+ iter.old_spte & ~dbit);
+ }
+
+ rcu_read_unlock();
+}
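+
+/*
+ * The mask handling above, reduced to a standalone sketch (hypothetical
+ * names, not KVM code): visit only the GFNs whose bit is set, clearing
+ * each bit as it is consumed so the walk stops as soon as the mask is
+ * empty. __builtin_ctzl() is the GCC/Clang count-trailing-zeros builtin.
+ */
+static void for_each_set_gfn(unsigned long base_gfn, unsigned long mask,
+                             void (*clear_dirty)(unsigned long gfn))
+{
+	while (mask) {
+		unsigned long bit = __builtin_ctzl(mask);
+
+		clear_dirty(base_gfn + bit);
+		mask &= mask - 1;	/* clear the lowest set bit */
+	}
+}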
+
+/*
+ * Clear the dirty status (D-bit or W-bit) of all the 4k SPTEs mapping GFNs for
+ * which a bit is set in mask, starting at gfn. The given memslot is expected to
+ * contain all the GFNs represented by set bits in the mask.
+ */
+void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn, unsigned long mask,
+ bool wrprot)
+{
+ struct kvm_mmu_page *root;
+
+ for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
+ clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
+}
+
+static int tdp_mmu_make_huge_spte(struct kvm *kvm,
+ struct tdp_iter *parent,
+ u64 *huge_spte)
+{
+ struct kvm_mmu_page *root = spte_to_child_sp(parent->old_spte);
+ gfn_t start = parent->gfn;
+ gfn_t end = start + KVM_PAGES_PER_HPAGE(parent->level);
+ struct tdp_iter iter;
+
+ tdp_root_for_each_leaf_pte(iter, kvm, root, start, end) {
+ /*
+ * Use the parent iterator when checking for forward progress so
+ * that KVM doesn't get stuck continuously trying to yield (i.e.
+ * returning -EAGAIN here and then failing the forward progress
+ * check in the caller ad nauseam).
+ */
+ if (tdp_mmu_iter_need_resched(kvm, parent))
+ return -EAGAIN;
+
+ *huge_spte = make_huge_spte(kvm, iter.old_spte, parent->level);
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+static void recover_huge_pages_range(struct kvm *kvm,
+ struct kvm_mmu_page *root,
+ const struct kvm_memory_slot *slot)
+{
+ gfn_t start = slot->base_gfn;
+ gfn_t end = start + slot->npages;
+ struct tdp_iter iter;
+ int max_mapping_level;
+ bool flush = false;
+ u64 huge_spte;
+ int r;
+
+ if (WARN_ON_ONCE(kvm_slot_dirty_track_enabled(slot)))
+ return;
+
+ rcu_read_lock();
+
+ for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_2M, start, end) {
+retry:
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
+ flush = false;
+ continue;
+ }
+
+ if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
+ !is_shadow_present_pte(iter.old_spte))
+ continue;
+
+ /*
+ * Don't zap leaf SPTEs; if a leaf SPTE could be replaced with
+ * a large page size, then its parent would have been zapped
+ * instead of stepping down.
+ */
+ if (is_last_spte(iter.old_spte, iter.level))
+ continue;
+
+ /*
+ * If iter.gfn resides outside of the slot, i.e. the page for
+ * the current level overlaps but is not contained by the slot,
+ * then the SPTE can't be made huge. More importantly, trying
+ * to query that info from slot->arch.lpage_info will cause an
+ * out-of-bounds access.
+ */
+ if (iter.gfn < start || iter.gfn >= end)
+ continue;
+
+ max_mapping_level = kvm_mmu_max_mapping_level(kvm, NULL, slot, iter.gfn);
+ if (max_mapping_level < iter.level)
+ continue;
+
+ r = tdp_mmu_make_huge_spte(kvm, &iter, &huge_spte);
+ if (r == -EAGAIN)
+ goto retry;
+ else if (r)
+ continue;
+
+ if (tdp_mmu_set_spte_atomic(kvm, &iter, huge_spte))
+ goto retry;
+
+ flush = true;
+ }
+
+ if (flush)
+ kvm_flush_remote_tlbs_memslot(kvm, slot);
+
+ rcu_read_unlock();
+}
+
+/*
+ * Recover huge page mappings within the slot by replacing non-leaf SPTEs with
+ * huge SPTEs where possible.
+ */
+void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *slot)
+{
+ struct kvm_mmu_page *root;
+
+ lockdep_assert_held_read(&kvm->mmu_lock);
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
+ recover_huge_pages_range(kvm, root, slot);
+}
+
+/*
+ * Removes write access on the last level SPTE mapping this GFN and unsets the
+ * MMU-writable bit to ensure future writes continue to be intercepted.
+ * Returns true if an SPTE was set and a TLB flush is needed.
+ */
+static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
+ gfn_t gfn, int min_level)
+{
+ struct tdp_iter iter;
+ u64 new_spte;
+ bool spte_set = false;
+
+ BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
+
+ rcu_read_lock();
+
+ for_each_tdp_pte_min_level(iter, kvm, root, min_level, gfn, gfn + 1) {
+ if (!is_shadow_present_pte(iter.old_spte) ||
+ !is_last_spte(iter.old_spte, iter.level))
+ continue;
+
+ new_spte = iter.old_spte &
+ ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
+
+ if (new_spte == iter.old_spte)
+ break;
+
+ tdp_mmu_iter_set_spte(kvm, &iter, new_spte);
+ spte_set = true;
+ }
+
+ rcu_read_unlock();
+
+ return spte_set;
+}
+
+/*
+ * Removes write access on the last level SPTE mapping this GFN and unsets the
+ * MMU-writable bit to ensure future writes continue to be intercepted.
+ * Returns true if an SPTE was set and a TLB flush is needed.
+ */
+bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ int min_level)
+{
+ struct kvm_mmu_page *root;
+ bool spte_set = false;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+ for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
+ spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
+
+ return spte_set;
+}
+
+/*
+ * Return the level of the lowest level SPTE added to sptes.
+ * That SPTE may be non-present.
+ *
+ * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
+ */
+int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
+ int *root_level)
+{
+ struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa);
+ struct tdp_iter iter;
+ gfn_t gfn = addr >> PAGE_SHIFT;
+ int leaf = -1;
+
+ *root_level = vcpu->arch.mmu->root_role.level;
+
+ for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
+ leaf = iter.level;
+ sptes[leaf] = iter.old_spte;
+ }
+
+ return leaf;
+}
+
+/*
+ * Returns the last level spte pointer of the shadow page walk for the given
+ * gpa, and sets *spte to the spte value. This spte may be non-present. If no
+ * walk could be performed, returns NULL and *spte does not contain valid data.
+ *
+ * Contract:
+ * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
+ * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
+ *
+ * WARNING: This function is only intended to be called during fast_page_fault.
+ */
+u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn,
+ u64 *spte)
+{
+ /* Fast pf is not supported for mirrored roots */
+ struct kvm_mmu_page *root = tdp_mmu_get_root(vcpu, KVM_DIRECT_ROOTS);
+ struct tdp_iter iter;
+ tdp_ptep_t sptep = NULL;
+
+ for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
+ *spte = iter.old_spte;
+ sptep = iter.sptep;
+ }
+
+ /*
+ * Perform the rcu_dereference to get the raw spte pointer value since
+ * we are passing it up to fast_page_fault, which is shared with the
+ * legacy MMU and thus does not retain the TDP MMU-specific __rcu
+ * annotation.
+ *
+ * This is safe since fast_page_fault obeys the contracts of this
+ * function as well as all TDP MMU contracts around modifying SPTEs
+ * outside of mmu_lock.
+ */
+ return rcu_dereference(sptep);
+}
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
new file mode 100644
index 000000000000..bd62977c9199
--- /dev/null
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -0,0 +1,122 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#ifndef __KVM_X86_MMU_TDP_MMU_H
+#define __KVM_X86_MMU_TDP_MMU_H
+
+#include <linux/kvm_host.h>
+
+#include "spte.h"
+
+void kvm_mmu_init_tdp_mmu(struct kvm *kvm);
+void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
+
+void kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu, bool private);
+
+__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
+{
+ return refcount_inc_not_zero(&root->tdp_mmu_root_count);
+}
+
+void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root);
+
+enum kvm_tdp_mmu_root_types {
+ KVM_INVALID_ROOTS = BIT(0),
+ KVM_DIRECT_ROOTS = BIT(1),
+ KVM_MIRROR_ROOTS = BIT(2),
+ KVM_VALID_ROOTS = KVM_DIRECT_ROOTS | KVM_MIRROR_ROOTS,
+ KVM_ALL_ROOTS = KVM_VALID_ROOTS | KVM_INVALID_ROOTS,
+};
+
+static inline enum kvm_tdp_mmu_root_types kvm_gfn_range_filter_to_root_types(struct kvm *kvm,
+ enum kvm_gfn_range_filter process)
+{
+ enum kvm_tdp_mmu_root_types ret = 0;
+
+ if (!kvm_has_mirrored_tdp(kvm))
+ return KVM_DIRECT_ROOTS;
+
+ if (process & KVM_FILTER_PRIVATE)
+ ret |= KVM_MIRROR_ROOTS;
+ if (process & KVM_FILTER_SHARED)
+ ret |= KVM_DIRECT_ROOTS;
+
+ WARN_ON_ONCE(!ret);
+
+ return ret;
+}
+
+static inline struct kvm_mmu_page *tdp_mmu_get_root_for_fault(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+ if (unlikely(!kvm_is_addr_direct(vcpu->kvm, fault->addr)))
+ return root_to_sp(vcpu->arch.mmu->mirror_root_hpa);
+
+ return root_to_sp(vcpu->arch.mmu->root.hpa);
+}
+
+static inline struct kvm_mmu_page *tdp_mmu_get_root(struct kvm_vcpu *vcpu,
+ enum kvm_tdp_mmu_root_types type)
+{
+ if (unlikely(type == KVM_MIRROR_ROOTS))
+ return root_to_sp(vcpu->arch.mmu->mirror_root_hpa);
+
+ return root_to_sp(vcpu->arch.mmu->root.hpa);
+}
+
+bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush);
+bool kvm_tdp_mmu_zap_possible_nx_huge_page(struct kvm *kvm,
+ struct kvm_mmu_page *sp);
+void kvm_tdp_mmu_zap_all(struct kvm *kvm);
+void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm,
+ enum kvm_tdp_mmu_root_types root_types);
+void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm, bool shared);
+
+int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
+
+bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
+ bool flush);
+bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
+bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
+
+bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
+ const struct kvm_memory_slot *slot, int min_level);
+void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
+ const struct kvm_memory_slot *slot);
+void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn, unsigned long mask,
+ bool wrprot);
+void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *slot);
+
+bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ int min_level);
+
+void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ gfn_t start, gfn_t end,
+ int target_level, bool shared);
+
+static inline void kvm_tdp_mmu_walk_lockless_begin(void)
+{
+ rcu_read_lock();
+}
+
+static inline void kvm_tdp_mmu_walk_lockless_end(void)
+{
+ rcu_read_unlock();
+}
+
+int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
+ int *root_level);
+u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn,
+ u64 *spte);
+
+#ifdef CONFIG_X86_64
+static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; }
+#else
+static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; }
+#endif
+
+#endif /* __KVM_X86_MMU_TDP_MMU_H */
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
new file mode 100644
index 000000000000..6f74e2b27c1e
--- /dev/null
+++ b/arch/x86/kvm/mtrr.c
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * vMTRR implementation
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ * Copyright(C) 2015 Intel Corporation.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ * Marcelo Tosatti <mtosatti@redhat.com>
+ * Paolo Bonzini <pbonzini@redhat.com>
+ * Xiao Guangrong <guangrong.xiao@linux.intel.com>
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_host.h>
+#include <asm/mtrr.h>
+
+#include "cpuid.h"
+#include "x86.h"
+
+static u64 *find_mtrr(struct kvm_vcpu *vcpu, unsigned int msr)
+{
+ int index;
+
+ switch (msr) {
+ case MTRRphysBase_MSR(0) ... MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1):
+ index = msr - MTRRphysBase_MSR(0);
+ return &vcpu->arch.mtrr_state.var[index];
+ case MSR_MTRRfix64K_00000:
+ return &vcpu->arch.mtrr_state.fixed_64k;
+ case MSR_MTRRfix16K_80000:
+ case MSR_MTRRfix16K_A0000:
+ index = msr - MSR_MTRRfix16K_80000;
+ return &vcpu->arch.mtrr_state.fixed_16k[index];
+ case MSR_MTRRfix4K_C0000:
+ case MSR_MTRRfix4K_C8000:
+ case MSR_MTRRfix4K_D0000:
+ case MSR_MTRRfix4K_D8000:
+ case MSR_MTRRfix4K_E0000:
+ case MSR_MTRRfix4K_E8000:
+ case MSR_MTRRfix4K_F0000:
+ case MSR_MTRRfix4K_F8000:
+ index = msr - MSR_MTRRfix4K_C0000;
+ return &vcpu->arch.mtrr_state.fixed_4k[index];
+ case MSR_MTRRdefType:
+ return &vcpu->arch.mtrr_state.deftype;
+ default:
+ break;
+ }
+ return NULL;
+}
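+
+/*
+ * As a worked example of the variable-range case above: MTRRphysBase_MSR(n)
+ * and MTRRphysMask_MSR(n) are interleaved (0x200, 0x201, 0x202, 0x203, ...),
+ * so "msr - MTRRphysBase_MSR(0)" indexes a flat array laid out as base0,
+ * mask0, base1, mask1, ... E.g. MSR 0x203 yields index 3, the mask register
+ * of variable range 1.
+ */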
+
+static bool valid_mtrr_type(unsigned t)
+{
+ return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
+}
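+
+/*
+ * As a worked example of the check above: 0x73 is 0111_0011b, so exactly
+ * the architecturally defined memory types UC (0), WC (1), WT (4), WP (5)
+ * and WB (6) are accepted, while the reserved encodings 2, 3 and anything
+ * above 6 are rejected. The same test, spelled out with a hypothetical
+ * helper:
+ */
+static unsigned int mtrr_type_is_defined(unsigned int type)
+{
+	const unsigned int valid = (1u << 0) | (1u << 1) | (1u << 4) |
+	                           (1u << 5) | (1u << 6);	/* == 0x73 */
+
+	return type < 8 && (valid & (1u << type));
+}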
+
+static bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ int i;
+ u64 mask;
+
+ if (msr == MSR_MTRRdefType) {
+ if (data & ~0xcff)
+ return false;
+ return valid_mtrr_type(data & 0xff);
+ } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
+ for (i = 0; i < 8 ; i++)
+ if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
+ return false;
+ return true;
+ }
+
+ /* variable MTRRs */
+ if (WARN_ON_ONCE(!(msr >= MTRRphysBase_MSR(0) &&
+ msr <= MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1))))
+ return false;
+
+ mask = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
+ if ((msr & 1) == 0) {
+ /* MTRR base */
+ if (!valid_mtrr_type(data & 0xff))
+ return false;
+ mask |= 0xf00;
+ } else {
+ /* MTRR mask */
+ mask |= 0x7ff;
+ }
+
+ return (data & mask) == 0;
+}
+
+int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ u64 *mtrr;
+
+ mtrr = find_mtrr(vcpu, msr);
+ if (!mtrr)
+ return 1;
+
+ if (!kvm_mtrr_valid(vcpu, msr, data))
+ return 1;
+
+ *mtrr = data;
+ return 0;
+}
+
+int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+ u64 *mtrr;
+
+ /* MSR_MTRRcap is a readonly MSR. */
+ if (msr == MSR_MTRRcap) {
+ /*
+ * SMRR = 0
+ * WC = 1
+ * FIX = 1
+ * VCNT = KVM_NR_VAR_MTRR
+ */
+ *pdata = 0x500 | KVM_NR_VAR_MTRR;
+ return 0;
+ }
+
+ mtrr = find_mtrr(vcpu, msr);
+ if (!mtrr)
+ return 1;
+
+ *pdata = *mtrr;
+ return 0;
+}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
deleted file mode 100644
index 67785f635399..000000000000
--- a/arch/x86/kvm/paging_tmpl.h
+++ /dev/null
@@ -1,611 +0,0 @@
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * This module enables machines with Intel VT-x extensions to run virtual
- * machines without emulation or binary translation.
- *
- * MMU support
- *
- * Copyright (C) 2006 Qumranet, Inc.
- *
- * Authors:
- * Yaniv Kamay <yaniv@qumranet.com>
- * Avi Kivity <avi@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
- */
-
-/*
- * We need the mmu code to access both 32-bit and 64-bit guest ptes,
- * so the code in this file is compiled twice, once per pte size.
- */
-
-#if PTTYPE == 64
- #define pt_element_t u64
- #define guest_walker guest_walker64
- #define FNAME(name) paging##64_##name
- #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
- #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
- #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
- #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
- #define PT_LEVEL_BITS PT64_LEVEL_BITS
- #ifdef CONFIG_X86_64
- #define PT_MAX_FULL_LEVELS 4
- #define CMPXCHG cmpxchg
- #else
- #define CMPXCHG cmpxchg64
- #define PT_MAX_FULL_LEVELS 2
- #endif
-#elif PTTYPE == 32
- #define pt_element_t u32
- #define guest_walker guest_walker32
- #define FNAME(name) paging##32_##name
- #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
- #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
- #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
- #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
- #define PT_LEVEL_BITS PT32_LEVEL_BITS
- #define PT_MAX_FULL_LEVELS 2
- #define CMPXCHG cmpxchg
-#else
- #error Invalid PTTYPE value
-#endif
-
-#define gpte_to_gfn FNAME(gpte_to_gfn)
-#define gpte_to_gfn_pde FNAME(gpte_to_gfn_pde)
-
-/*
- * The guest_walker structure emulates the behavior of the hardware page
- * table walker.
- */
-struct guest_walker {
- int level;
- gfn_t table_gfn[PT_MAX_FULL_LEVELS];
- pt_element_t ptes[PT_MAX_FULL_LEVELS];
- gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
- unsigned pt_access;
- unsigned pte_access;
- gfn_t gfn;
- u32 error_code;
-};
-
-static gfn_t gpte_to_gfn(pt_element_t gpte)
-{
- return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
-}
-
-static gfn_t gpte_to_gfn_pde(pt_element_t gpte)
-{
- return (gpte & PT_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
-}
-
-static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
- gfn_t table_gfn, unsigned index,
- pt_element_t orig_pte, pt_element_t new_pte)
-{
- pt_element_t ret;
- pt_element_t *table;
- struct page *page;
-
- page = gfn_to_page(kvm, table_gfn);
-
- table = kmap_atomic(page, KM_USER0);
- ret = CMPXCHG(&table[index], orig_pte, new_pte);
- kunmap_atomic(table, KM_USER0);
-
- kvm_release_page_dirty(page);
-
- return (ret != orig_pte);
-}
-
-static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
-{
- unsigned access;
-
- access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
-#if PTTYPE == 64
- if (is_nx(vcpu))
- access &= ~(gpte >> PT64_NX_SHIFT);
-#endif
- return access;
-}
-
-/*
- * Fetch a guest pte for a guest virtual address
- */
-static int FNAME(walk_addr)(struct guest_walker *walker,
- struct kvm_vcpu *vcpu, gva_t addr,
- int write_fault, int user_fault, int fetch_fault)
-{
- pt_element_t pte;
- gfn_t table_gfn;
- unsigned index, pt_access, pte_access;
- gpa_t pte_gpa;
- int rsvd_fault = 0;
-
- pgprintk("%s: addr %lx\n", __func__, addr);
-walk:
- walker->level = vcpu->arch.mmu.root_level;
- pte = vcpu->arch.cr3;
-#if PTTYPE == 64
- if (!is_long_mode(vcpu)) {
- pte = vcpu->arch.pdptrs[(addr >> 30) & 3];
- if (!is_present_pte(pte))
- goto not_present;
- --walker->level;
- }
-#endif
- ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
- (vcpu->arch.cr3 & CR3_NONPAE_RESERVED_BITS) == 0);
-
- pt_access = ACC_ALL;
-
- for (;;) {
- index = PT_INDEX(addr, walker->level);
-
- table_gfn = gpte_to_gfn(pte);
- pte_gpa = gfn_to_gpa(table_gfn);
- pte_gpa += index * sizeof(pt_element_t);
- walker->table_gfn[walker->level - 1] = table_gfn;
- walker->pte_gpa[walker->level - 1] = pte_gpa;
- pgprintk("%s: table_gfn[%d] %lx\n", __func__,
- walker->level - 1, table_gfn);
-
- kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
-
- if (!is_present_pte(pte))
- goto not_present;
-
- rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level);
- if (rsvd_fault)
- goto access_error;
-
- if (write_fault && !is_writeble_pte(pte))
- if (user_fault || is_write_protection(vcpu))
- goto access_error;
-
- if (user_fault && !(pte & PT_USER_MASK))
- goto access_error;
-
-#if PTTYPE == 64
- if (fetch_fault && is_nx(vcpu) && (pte & PT64_NX_MASK))
- goto access_error;
-#endif
-
- if (!(pte & PT_ACCESSED_MASK)) {
- mark_page_dirty(vcpu->kvm, table_gfn);
- if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
- index, pte, pte|PT_ACCESSED_MASK))
- goto walk;
- pte |= PT_ACCESSED_MASK;
- }
-
- pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
-
- walker->ptes[walker->level - 1] = pte;
-
- if (walker->level == PT_PAGE_TABLE_LEVEL) {
- walker->gfn = gpte_to_gfn(pte);
- break;
- }
-
- if (walker->level == PT_DIRECTORY_LEVEL
- && (pte & PT_PAGE_SIZE_MASK)
- && (PTTYPE == 64 || is_pse(vcpu))) {
- walker->gfn = gpte_to_gfn_pde(pte);
- walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
- if (PTTYPE == 32 && is_cpuid_PSE36())
- walker->gfn += pse36_gfn_delta(pte);
- break;
- }
-
- pt_access = pte_access;
- --walker->level;
- }
-
- if (write_fault && !is_dirty_pte(pte)) {
- bool ret;
-
- mark_page_dirty(vcpu->kvm, table_gfn);
- ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
- pte|PT_DIRTY_MASK);
- if (ret)
- goto walk;
- pte |= PT_DIRTY_MASK;
- walker->ptes[walker->level - 1] = pte;
- }
-
- walker->pt_access = pt_access;
- walker->pte_access = pte_access;
- pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
- __func__, (u64)pte, pt_access, pte_access);
- return 1;
-
-not_present:
- walker->error_code = 0;
- goto err;
-
-access_error:
- walker->error_code = PFERR_PRESENT_MASK;
-
-err:
- if (write_fault)
- walker->error_code |= PFERR_WRITE_MASK;
- if (user_fault)
- walker->error_code |= PFERR_USER_MASK;
- if (fetch_fault)
- walker->error_code |= PFERR_FETCH_MASK;
- if (rsvd_fault)
- walker->error_code |= PFERR_RSVD_MASK;
- return 0;
-}
-
-static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
- u64 *spte, const void *pte)
-{
- pt_element_t gpte;
- unsigned pte_access;
- pfn_t pfn;
- int largepage = vcpu->arch.update_pte.largepage;
-
- gpte = *(const pt_element_t *)pte;
- if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
- if (!is_present_pte(gpte))
- set_shadow_pte(spte, shadow_notrap_nonpresent_pte);
- return;
- }
- pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
- pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte);
- if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
- return;
- pfn = vcpu->arch.update_pte.pfn;
- if (is_error_pfn(pfn))
- return;
- if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq))
- return;
- kvm_get_pfn(pfn);
- mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
- gpte & PT_DIRTY_MASK, NULL, largepage,
- gpte_to_gfn(gpte), pfn, true);
-}
-
-/*
- * Fetch a shadow pte for a specific level in the paging hierarchy.
- */
-static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
- struct guest_walker *gw,
- int user_fault, int write_fault, int largepage,
- int *ptwrite, pfn_t pfn)
-{
- unsigned access = gw->pt_access;
- struct kvm_mmu_page *shadow_page;
- u64 spte, *sptep = NULL;
- int direct;
- gfn_t table_gfn;
- int r;
- int level;
- pt_element_t curr_pte;
- struct kvm_shadow_walk_iterator iterator;
-
- if (!is_present_pte(gw->ptes[gw->level - 1]))
- return NULL;
-
- for_each_shadow_entry(vcpu, addr, iterator) {
- level = iterator.level;
- sptep = iterator.sptep;
- if (level == PT_PAGE_TABLE_LEVEL
- || (largepage && level == PT_DIRECTORY_LEVEL)) {
- mmu_set_spte(vcpu, sptep, access,
- gw->pte_access & access,
- user_fault, write_fault,
- gw->ptes[gw->level-1] & PT_DIRTY_MASK,
- ptwrite, largepage,
- gw->gfn, pfn, false);
- break;
- }
-
- if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
- continue;
-
- if (is_large_pte(*sptep)) {
- rmap_remove(vcpu->kvm, sptep);
- set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
- kvm_flush_remote_tlbs(vcpu->kvm);
- }
-
- if (level == PT_DIRECTORY_LEVEL
- && gw->level == PT_DIRECTORY_LEVEL) {
- direct = 1;
- if (!is_dirty_pte(gw->ptes[level - 1]))
- access &= ~ACC_WRITE_MASK;
- table_gfn = gpte_to_gfn(gw->ptes[level - 1]);
- } else {
- direct = 0;
- table_gfn = gw->table_gfn[level - 2];
- }
- shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
- direct, access, sptep);
- if (!direct) {
- r = kvm_read_guest_atomic(vcpu->kvm,
- gw->pte_gpa[level - 2],
- &curr_pte, sizeof(curr_pte));
- if (r || curr_pte != gw->ptes[level - 2]) {
- kvm_mmu_put_page(shadow_page, sptep);
- kvm_release_pfn_clean(pfn);
- sptep = NULL;
- break;
- }
- }
-
- spte = __pa(shadow_page->spt)
- | PT_PRESENT_MASK | PT_ACCESSED_MASK
- | PT_WRITABLE_MASK | PT_USER_MASK;
- *sptep = spte;
- }
-
- return sptep;
-}
-
-/*
- * Page fault handler. There are several causes for a page fault:
- * - there is no shadow pte for the guest pte
- * - write access through a shadow pte marked read only so that we can set
- * the dirty bit
- * - write access to a shadow pte marked read only so we can update the page
- * dirty bitmap, when userspace requests it
- * - mmio access; in this case we will never install a present shadow pte
- * - normal guest page fault due to the guest pte marked not present, not
- * writable, or not executable
- *
- * Returns: 1 if we need to emulate the instruction, 0 otherwise, or
- * a negative value on error.
- */
-static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
- u32 error_code)
-{
- int write_fault = error_code & PFERR_WRITE_MASK;
- int user_fault = error_code & PFERR_USER_MASK;
- int fetch_fault = error_code & PFERR_FETCH_MASK;
- struct guest_walker walker;
- u64 *shadow_pte;
- int write_pt = 0;
- int r;
- pfn_t pfn;
- int largepage = 0;
- unsigned long mmu_seq;
-
- pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
- kvm_mmu_audit(vcpu, "pre page fault");
-
- r = mmu_topup_memory_caches(vcpu);
- if (r)
- return r;
-
- /*
- * Look up the guest pte for the faulting address.
- */
- r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
- fetch_fault);
-
- /*
- * The page is not mapped by the guest. Let the guest handle it.
- */
- if (!r) {
- pgprintk("%s: guest page fault\n", __func__);
- inject_page_fault(vcpu, addr, walker.error_code);
- vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
- return 0;
- }
-
- if (walker.level == PT_DIRECTORY_LEVEL) {
- gfn_t large_gfn;
- large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1);
- if (is_largepage_backed(vcpu, large_gfn)) {
- walker.gfn = large_gfn;
- largepage = 1;
- }
- }
- mmu_seq = vcpu->kvm->mmu_notifier_seq;
- smp_rmb();
- pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
-
- /* mmio */
- if (is_error_pfn(pfn)) {
- pgprintk("gfn %lx is mmio\n", walker.gfn);
- kvm_release_pfn_clean(pfn);
- return 1;
- }
-
- spin_lock(&vcpu->kvm->mmu_lock);
- if (mmu_notifier_retry(vcpu, mmu_seq))
- goto out_unlock;
- kvm_mmu_free_some_pages(vcpu);
- shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
- largepage, &write_pt, pfn);
-
- pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
- shadow_pte, *shadow_pte, write_pt);
-
- if (!write_pt)
- vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
-
- ++vcpu->stat.pf_fixed;
- kvm_mmu_audit(vcpu, "post page fault (fixed)");
- spin_unlock(&vcpu->kvm->mmu_lock);
-
- return write_pt;
-
-out_unlock:
- spin_unlock(&vcpu->kvm->mmu_lock);
- kvm_release_pfn_clean(pfn);
- return 0;
-}
-
-static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
-{
- struct kvm_shadow_walk_iterator iterator;
- pt_element_t gpte;
- gpa_t pte_gpa = -1;
- int level;
- u64 *sptep;
- int need_flush = 0;
-
- spin_lock(&vcpu->kvm->mmu_lock);
-
- for_each_shadow_entry(vcpu, gva, iterator) {
- level = iterator.level;
- sptep = iterator.sptep;
-
- /* FIXME: properly handle invlpg on large guest pages */
- if (level == PT_PAGE_TABLE_LEVEL ||
- ((level == PT_DIRECTORY_LEVEL) && is_large_pte(*sptep))) {
- struct kvm_mmu_page *sp = page_header(__pa(sptep));
-
- pte_gpa = (sp->gfn << PAGE_SHIFT);
- pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
-
- if (is_shadow_present_pte(*sptep)) {
- rmap_remove(vcpu->kvm, sptep);
- if (is_large_pte(*sptep))
- --vcpu->kvm->stat.lpages;
- need_flush = 1;
- }
- set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
- break;
- }
-
- if (!is_shadow_present_pte(*sptep))
- break;
- }
-
- if (need_flush)
- kvm_flush_remote_tlbs(vcpu->kvm);
- spin_unlock(&vcpu->kvm->mmu_lock);
-
- if (pte_gpa == -1)
- return;
- if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
- sizeof(pt_element_t)))
- return;
- if (is_present_pte(gpte) && (gpte & PT_ACCESSED_MASK)) {
- if (mmu_topup_memory_caches(vcpu))
- return;
- kvm_mmu_pte_write(vcpu, pte_gpa, (const u8 *)&gpte,
- sizeof(pt_element_t), 0);
- }
-}
-
-static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
-{
- struct guest_walker walker;
- gpa_t gpa = UNMAPPED_GVA;
- int r;
-
- r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0);
-
- if (r) {
- gpa = gfn_to_gpa(walker.gfn);
- gpa |= vaddr & ~PAGE_MASK;
- }
-
- return gpa;
-}
-
-static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp)
-{
- int i, j, offset, r;
- pt_element_t pt[256 / sizeof(pt_element_t)];
- gpa_t pte_gpa;
-
- if (sp->role.direct
- || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
- nonpaging_prefetch_page(vcpu, sp);
- return;
- }
-
- pte_gpa = gfn_to_gpa(sp->gfn);
- if (PTTYPE == 32) {
- offset = sp->role.quadrant << PT64_LEVEL_BITS;
- pte_gpa += offset * sizeof(pt_element_t);
- }
-
- for (i = 0; i < PT64_ENT_PER_PAGE; i += ARRAY_SIZE(pt)) {
- r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt);
- pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t);
- for (j = 0; j < ARRAY_SIZE(pt); ++j)
- if (r || is_present_pte(pt[j]))
- sp->spt[i+j] = shadow_trap_nonpresent_pte;
- else
- sp->spt[i+j] = shadow_notrap_nonpresent_pte;
- }
-}
-
-/*
- * Using the cached information from sp->gfns is safe because:
- * - The spte has a reference to the struct page, so the pfn for a given gfn
- * can't change unless all sptes pointing to it are nuked first.
- * - Alias changes zap the entire shadow cache.
- */
-static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
-{
- int i, offset, nr_present;
-
- offset = nr_present = 0;
-
- if (PTTYPE == 32)
- offset = sp->role.quadrant << PT64_LEVEL_BITS;
-
- for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
- unsigned pte_access;
- pt_element_t gpte;
- gpa_t pte_gpa;
- gfn_t gfn = sp->gfns[i];
-
- if (!is_shadow_present_pte(sp->spt[i]))
- continue;
-
- pte_gpa = gfn_to_gpa(sp->gfn);
- pte_gpa += (i+offset) * sizeof(pt_element_t);
-
- if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
- sizeof(pt_element_t)))
- return -EINVAL;
-
- if (gpte_to_gfn(gpte) != gfn || !is_present_pte(gpte) ||
- !(gpte & PT_ACCESSED_MASK)) {
- u64 nonpresent;
-
- rmap_remove(vcpu->kvm, &sp->spt[i]);
- if (is_present_pte(gpte))
- nonpresent = shadow_trap_nonpresent_pte;
- else
- nonpresent = shadow_notrap_nonpresent_pte;
- set_shadow_pte(&sp->spt[i], nonpresent);
- continue;
- }
-
- nr_present++;
- pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
- set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
- is_dirty_pte(gpte), 0, gfn,
- spte_to_pfn(sp->spt[i]), true, false);
- }
-
- return !nr_present;
-}
-
-#undef pt_element_t
-#undef guest_walker
-#undef FNAME
-#undef PT_BASE_ADDR_MASK
-#undef PT_INDEX
-#undef PT_LEVEL_MASK
-#undef PT_DIR_BASE_ADDR_MASK
-#undef PT_LEVEL_BITS
-#undef PT_MAX_FULL_LEVELS
-#undef gpte_to_gfn
-#undef gpte_to_gfn_pde
-#undef CMPXCHG
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
new file mode 100644
index 000000000000..487ad19a236e
--- /dev/null
+++ b/arch/x86/kvm/pmu.c
@@ -0,0 +1,1150 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine -- Performance Monitoring Unit support
+ *
+ * Copyright 2015 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Avi Kivity <avi@redhat.com>
+ * Gleb Natapov <gleb@redhat.com>
+ * Wei Huang <wei@redhat.com>
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/kvm_host.h>
+#include <linux/perf_event.h>
+#include <linux/bsearch.h>
+#include <linux/sort.h>
+#include <asm/perf_event.h>
+#include <asm/cpu_device_id.h>
+#include "x86.h"
+#include "cpuid.h"
+#include "lapic.h"
+#include "pmu.h"
+
+/* This is enough to filter the vast majority of currently defined events. */
+#define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300
+
+/* Unadulterated PMU capabilities of the host, i.e. of hardware. */
+static struct x86_pmu_capability __read_mostly kvm_host_pmu;
+
+/* KVM's PMU capabilities, i.e. the intersection of KVM and hardware support. */
+struct x86_pmu_capability __read_mostly kvm_pmu_cap;
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_pmu_cap);
+
+struct kvm_pmu_emulated_event_selectors {
+ u64 INSTRUCTIONS_RETIRED;
+ u64 BRANCH_INSTRUCTIONS_RETIRED;
+};
+static struct kvm_pmu_emulated_event_selectors __read_mostly kvm_pmu_eventsel;
+
+/* Precise Distribution of Instructions Retired (PDIR) */
+static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = {
+ X86_MATCH_VFM(INTEL_ICELAKE_D, NULL),
+ X86_MATCH_VFM(INTEL_ICELAKE_X, NULL),
+ /* Instruction-Accurate PDIR (PDIR++) */
+ X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, NULL),
+ {}
+};
+
+/* Precise Distribution (PDist) */
+static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = {
+ X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, NULL),
+ {}
+};
+
+/* NOTE:
+ * - Each perf counter is defined as "struct kvm_pmc";
+ * - There are two types of perf counters: general purpose (gp) and fixed.
+ * gp counters are stored in gp_counters[] and fixed counters are stored
+ * in fixed_counters[] respectively. Both of them are part of "struct
+ * kvm_pmu";
+ * - pmu.c understands the difference between gp counters and fixed counters.
+ * However, AMD doesn't support fixed counters;
+ * - There are three types of index to access perf counters (PMC):
+ * 1. MSR (named msr): For example Intel has MSR_IA32_PERFCTRn and AMD
+ * has MSR_K7_PERFCTRn and, for families 15H and later,
+ * MSR_F15H_PERF_CTRn, where MSR_F15H_PERF_CTR[0-3] are
+ * aliased to MSR_K7_PERFCTRn.
+ * 2. MSR Index (named idx): This normally is used by RDPMC instruction.
+ * For instance AMD RDPMC instruction uses 0000_0003h in ECX to access
+ * C001_0007h (MSR_K7_PERFCTR3). Intel has a similar mechanism, except
+ * that it also supports fixed counters. idx can be used as an index
+ * into gp and fixed counters.
+ * 3. Global PMC Index (named pmc): pmc is an index specific to PMU
+ * code. Each pmc, stored in kvm_pmc.idx field, is unique across
+ * all perf counters (both gp and fixed). The mapping relationship
+ * between pmc and perf counters is as the following:
+ * * Intel: [0 .. KVM_MAX_NR_INTEL_GP_COUNTERS-1] <=> gp counters
+ * [KVM_FIXED_PMC_BASE_IDX .. KVM_FIXED_PMC_BASE_IDX + 2] <=> fixed
+ * * AMD: [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H
+ * and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters
+ */
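+
+/*
+ * A sketch of how the "global PMC index" described above folds back into
+ * an array slot, assuming the Intel layout where fixed counters start at
+ * global index 32 (KVM_FIXED_PMC_BASE_IDX); the helper itself is
+ * hypothetical and not part of this file's API.
+ */
+static unsigned int pmc_global_idx_to_slot(unsigned int idx, int *is_fixed)
+{
+	const unsigned int fixed_base = 32;	/* KVM_FIXED_PMC_BASE_IDX */
+
+	*is_fixed = idx >= fixed_base;
+	return *is_fixed ? idx - fixed_base : idx;
+}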
+
+static struct kvm_pmu_ops kvm_pmu_ops __read_mostly;
+
+#define KVM_X86_PMU_OP(func) \
+ DEFINE_STATIC_CALL_NULL(kvm_x86_pmu_##func, \
+ *(((struct kvm_pmu_ops *)0)->func));
+#define KVM_X86_PMU_OP_OPTIONAL KVM_X86_PMU_OP
+#include <asm/kvm-x86-pmu-ops.h>
+
+void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops)
+{
+ memcpy(&kvm_pmu_ops, pmu_ops, sizeof(kvm_pmu_ops));
+
+#define __KVM_X86_PMU_OP(func) \
+ static_call_update(kvm_x86_pmu_##func, kvm_pmu_ops.func);
+#define KVM_X86_PMU_OP(func) \
+ WARN_ON(!kvm_pmu_ops.func); __KVM_X86_PMU_OP(func)
+#define KVM_X86_PMU_OP_OPTIONAL __KVM_X86_PMU_OP
+#include <asm/kvm-x86-pmu-ops.h>
+#undef __KVM_X86_PMU_OP
+}
+
+void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops)
+{
+ bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL;
+ int min_nr_gp_ctrs = pmu_ops->MIN_NR_GP_COUNTERS;
+
+ /*
+ * Hybrid PMUs don't play nice with virtualization without careful
+ * configuration by userspace, and KVM's APIs for reporting supported
+ * vPMU features do not account for hybrid PMUs. Disable vPMU support
+ * for hybrid PMUs until KVM gains a way to let userspace opt-in.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) {
+ enable_pmu = false;
+ memset(&kvm_host_pmu, 0, sizeof(kvm_host_pmu));
+ } else {
+ perf_get_x86_pmu_capability(&kvm_host_pmu);
+ }
+
+ if (enable_pmu) {
+ /*
+ * WARN if perf did NOT disable the hardware PMU despite the
+ * architecturally required number of GP counters not being present,
+ * i.e. if there is a non-zero number of counters, but fewer than
+ * what is architecturally required.
+ */
+ if (!kvm_host_pmu.num_counters_gp ||
+ WARN_ON_ONCE(kvm_host_pmu.num_counters_gp < min_nr_gp_ctrs))
+ enable_pmu = false;
+ else if (is_intel && !kvm_host_pmu.version)
+ enable_pmu = false;
+ }
+
+ if (!enable_pmu) {
+ memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap));
+ return;
+ }
+
+ memcpy(&kvm_pmu_cap, &kvm_host_pmu, sizeof(kvm_host_pmu));
+ kvm_pmu_cap.version = min(kvm_pmu_cap.version, 2);
+ kvm_pmu_cap.num_counters_gp = min(kvm_pmu_cap.num_counters_gp,
+ pmu_ops->MAX_NR_GP_COUNTERS);
+ kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed,
+ KVM_MAX_NR_FIXED_COUNTERS);
+
+ kvm_pmu_eventsel.INSTRUCTIONS_RETIRED =
+ perf_get_hw_event_config(PERF_COUNT_HW_INSTRUCTIONS);
+ kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED =
+ perf_get_hw_event_config(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
+}
+
+static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
+{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+ bool skip_pmi = false;
+
+ if (pmc->perf_event && pmc->perf_event->attr.precise_ip) {
+ if (!in_pmi) {
+ /*
+ * TODO: KVM is currently _choosing_ not to generate records
+ * for emulated instructions, thereby avoiding a BUFFER_OVF PMI
+ * when there are no records. Strictly speaking, such records
+ * should also be generated in the right context to improve
+ * sampling accuracy.
+ */
+ skip_pmi = true;
+ } else {
+ /* Indicate PEBS overflow PMI to guest. */
+ skip_pmi = __test_and_set_bit(GLOBAL_STATUS_BUFFER_OVF_BIT,
+ (unsigned long *)&pmu->global_status);
+ }
+ } else {
+ __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
+ }
+
+ if (pmc->intr && !skip_pmi)
+ kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
+}
+
+static void kvm_perf_overflow(struct perf_event *perf_event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ struct kvm_pmc *pmc = perf_event->overflow_handler_context;
+
+ /*
+ * Ignore asynchronous overflow events for counters that are scheduled
+ * to be reprogrammed, e.g. if a PMI for the previous event races with
+ * KVM's handling of a related guest WRMSR.
+ */
+ if (test_and_set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi))
+ return;
+
+ __kvm_perf_overflow(pmc, true);
+
+ kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
+}
+
+static u64 pmc_get_pebs_precise_level(struct kvm_pmc *pmc)
+{
+ /*
+ * For some model-specific PEBS counters with special capabilities
+ * (PDIR, PDIR++, PDIST), KVM needs to raise the event's precise level
+ * to the maximum value (currently 3, which is backwards compatible) so
+ * that the perf subsystem assigns a hardware counter with that
+ * capability to the vPMC.
+ */
+ if ((pmc->idx == 0 && x86_match_cpu(vmx_pebs_pdist_cpu)) ||
+ (pmc->idx == 32 && x86_match_cpu(vmx_pebs_pdir_cpu)))
+ return 3;
+
+ /*
+ * A non-zero precision level on the guest event turns an ordinary
+ * guest event into a guest PEBS event and triggers the host PEBS PMI
+ * handler to determine whether the PEBS overflow PMI comes from the
+ * host counters or the guest.
+ */
+ return 1;
+}
+
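+/*
+ * Perf counts up and interrupts on overflow, so the sample period is the
+ * number of events remaining until the counter wraps. E.g. for a 48-bit
+ * counter (pmc_bitmask() == 0xffffffffffff) programmed to 0xffffffffff00 the
+ * period is 0x100, and a counter value of 0 yields the maximum period of 2^48.
+ */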
+static u64 get_sample_period(struct kvm_pmc *pmc, u64 counter_value)
+{
+ u64 sample_period = (-counter_value) & pmc_bitmask(pmc);
+
+ if (!sample_period)
+ sample_period = pmc_bitmask(pmc) + 1;
+ return sample_period;
+}
+
+static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
+ bool exclude_user, bool exclude_kernel,
+ bool intr)
+{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+ struct perf_event *event;
+ struct perf_event_attr attr = {
+ .type = type,
+ .size = sizeof(attr),
+ .pinned = true,
+ .exclude_idle = true,
+ .exclude_host = 1,
+ .exclude_user = exclude_user,
+ .exclude_kernel = exclude_kernel,
+ .config = config,
+ };
+ bool pebs = test_bit(pmc->idx, (unsigned long *)&pmu->pebs_enable);
+
+ attr.sample_period = get_sample_period(pmc, pmc->counter);
+
+ if ((attr.config & HSW_IN_TX_CHECKPOINTED) &&
+ (boot_cpu_has(X86_FEATURE_RTM) || boot_cpu_has(X86_FEATURE_HLE))) {
+ /*
+ * HSW_IN_TX_CHECKPOINTED is not supported with nonzero
+ * period. Just clear the sample period so at least
+ * allocating the counter doesn't fail.
+ */
+ attr.sample_period = 0;
+ }
+ if (pebs) {
+ /*
+ * For most PEBS hardware events, the difference in the software
+ * precision levels of guest and host PEBS events will not affect
+ * the accuracy of the PEBS profiling result, because the "event IP"
+ * in the PEBS record is calibrated on the guest side.
+ */
+ attr.precise_ip = pmc_get_pebs_precise_level(pmc);
+ }
+
+ event = perf_event_create_kernel_counter(&attr, -1, current,
+ kvm_perf_overflow, pmc);
+ if (IS_ERR(event)) {
+ pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
+ PTR_ERR(event), pmc->idx);
+ return PTR_ERR(event);
+ }
+
+ pmc->perf_event = event;
+ pmc_to_pmu(pmc)->event_count++;
+ pmc->is_paused = false;
+ pmc->intr = intr || pebs;
+ return 0;
+}
+
+static bool pmc_pause_counter(struct kvm_pmc *pmc)
+{
+ u64 counter = pmc->counter;
+ u64 prev_counter;
+
+ /* update counter, reset event value to avoid redundant accumulation */
+ if (pmc->perf_event && !pmc->is_paused)
+ counter += perf_event_pause(pmc->perf_event, true);
+
+ /*
+ * Snapshot the previous counter *after* accumulating state from perf.
+ * If overflow already happened, hardware (via perf) is responsible for
+ * generating a PMI. KVM just needs to detect overflow on emulated
+ * counter events that haven't yet been processed.
+ */
+ prev_counter = counter & pmc_bitmask(pmc);
+
+ counter += pmc->emulated_counter;
+ pmc->counter = counter & pmc_bitmask(pmc);
+
+ pmc->emulated_counter = 0;
+ pmc->is_paused = true;
+
+ return pmc->counter < prev_counter;
+}
+
+static bool pmc_resume_counter(struct kvm_pmc *pmc)
+{
+ if (!pmc->perf_event)
+ return false;
+
+ /* recalibrate sample period and check if it's accepted by perf core */
+ if (is_sampling_event(pmc->perf_event) &&
+ perf_event_period(pmc->perf_event,
+ get_sample_period(pmc, pmc->counter)))
+ return false;
+
+ if (test_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->pebs_enable) !=
+ (!!pmc->perf_event->attr.precise_ip))
+ return false;
+
+ /* Reuse the perf_event to serve the vPMC, as pmc_reprogram_counter() would. */
+ perf_event_enable(pmc->perf_event);
+ pmc->is_paused = false;
+
+ return true;
+}
+
+static void pmc_release_perf_event(struct kvm_pmc *pmc)
+{
+ if (pmc->perf_event) {
+ perf_event_release_kernel(pmc->perf_event);
+ pmc->perf_event = NULL;
+ pmc->current_config = 0;
+ pmc_to_pmu(pmc)->event_count--;
+ }
+}
+
+static void pmc_stop_counter(struct kvm_pmc *pmc)
+{
+ if (pmc->perf_event) {
+ pmc->counter = pmc_read_counter(pmc);
+ pmc_release_perf_event(pmc);
+ }
+}
+
+static void pmc_update_sample_period(struct kvm_pmc *pmc)
+{
+ if (!pmc->perf_event || pmc->is_paused ||
+ !is_sampling_event(pmc->perf_event))
+ return;
+
+ perf_event_period(pmc->perf_event,
+ get_sample_period(pmc, pmc->counter));
+}
+
+void pmc_write_counter(struct kvm_pmc *pmc, u64 val)
+{
+ /*
+ * Drop any unconsumed accumulated counts, the WRMSR is a write, not a
+ * read-modify-write. Adjust the counter value so that its value is
+ * relative to the current count, as reading the current count from
+ * perf is faster than pausing and reprogramming the event in order to
+ * reset it to '0'. Note, this very sneakily offsets the accumulated
+ * emulated count too, by using pmc_read_counter()!
+ */
+ pmc->emulated_counter = 0;
+ pmc->counter += val - pmc_read_counter(pmc);
+ pmc->counter &= pmc_bitmask(pmc);
+ pmc_update_sample_period(pmc);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(pmc_write_counter);
+
+static int filter_cmp(const void *pa, const void *pb, u64 mask)
+{
+ u64 a = *(u64 *)pa & mask;
+ u64 b = *(u64 *)pb & mask;
+
+ return (a > b) - (a < b);
+}
+
+
+static int filter_sort_cmp(const void *pa, const void *pb)
+{
+ return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT |
+ KVM_PMU_MASKED_ENTRY_EXCLUDE));
+}
+
+/*
+ * For the event filter, searching is done on the 'includes' list and
+ * 'excludes' list separately rather than on the 'events' list (which
+ * has both). As a result the exclude bit can be ignored.
+ */
+static int filter_event_cmp(const void *pa, const void *pb)
+{
+ return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT));
+}
+
+static int find_filter_index(u64 *events, u64 nevents, u64 key)
+{
+ u64 *fe = bsearch(&key, events, nevents, sizeof(events[0]),
+ filter_event_cmp);
+
+ if (!fe)
+ return -1;
+
+ return fe - events;
+}
+
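+/*
+ * A masked filter entry matches when (guest_umask & umask_mask) == umask_match.
+ * E.g. legacy (unmasked) entries are converted with umask_mask == 0xff, and so
+ * match only a guest unit mask that exactly equals the entry's umask_match.
+ */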
+static bool is_filter_entry_match(u64 filter_event, u64 umask)
+{
+ u64 mask = filter_event >> (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8);
+ u64 match = filter_event & KVM_PMU_MASKED_ENTRY_UMASK_MATCH;
+
+ BUILD_BUG_ON((KVM_PMU_ENCODE_MASKED_ENTRY(0, 0xff, 0, false) >>
+ (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8)) !=
+ ARCH_PERFMON_EVENTSEL_UMASK);
+
+ return (umask & mask) == match;
+}
+
+static bool filter_contains_match(u64 *events, u64 nevents, u64 eventsel)
+{
+ u64 event_select = eventsel & kvm_pmu_ops.EVENTSEL_EVENT;
+ u64 umask = eventsel & ARCH_PERFMON_EVENTSEL_UMASK;
+ int i, index;
+
+ index = find_filter_index(events, nevents, event_select);
+ if (index < 0)
+ return false;
+
+ /*
+ * Entries are sorted by the event select. Walk the list in both
+ * directions to process all entries with the targeted event select.
+ */
+ for (i = index; i < nevents; i++) {
+ if (filter_event_cmp(&events[i], &event_select))
+ break;
+
+ if (is_filter_entry_match(events[i], umask))
+ return true;
+ }
+
+ for (i = index - 1; i >= 0; i--) {
+ if (filter_event_cmp(&events[i], &event_select))
+ break;
+
+ if (is_filter_entry_match(events[i], umask))
+ return true;
+ }
+
+ return false;
+}
+
+static bool is_gp_event_allowed(struct kvm_x86_pmu_event_filter *f,
+ u64 eventsel)
+{
+ if (filter_contains_match(f->includes, f->nr_includes, eventsel) &&
+ !filter_contains_match(f->excludes, f->nr_excludes, eventsel))
+ return f->action == KVM_PMU_EVENT_ALLOW;
+
+ return f->action == KVM_PMU_EVENT_DENY;
+}
+
+static bool is_fixed_event_allowed(struct kvm_x86_pmu_event_filter *filter,
+ int idx)
+{
+ int fixed_idx = idx - KVM_FIXED_PMC_BASE_IDX;
+
+ if (filter->action == KVM_PMU_EVENT_DENY &&
+ test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
+ return false;
+ if (filter->action == KVM_PMU_EVENT_ALLOW &&
+ !test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
+ return false;
+
+ return true;
+}
+
+static bool pmc_is_event_allowed(struct kvm_pmc *pmc)
+{
+ struct kvm_x86_pmu_event_filter *filter;
+ struct kvm *kvm = pmc->vcpu->kvm;
+
+ filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
+ if (!filter)
+ return true;
+
+ if (pmc_is_gp(pmc))
+ return is_gp_event_allowed(filter, pmc->eventsel);
+
+ return is_fixed_event_allowed(filter, pmc->idx);
+}
+
+static int reprogram_counter(struct kvm_pmc *pmc)
+{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+ u64 eventsel = pmc->eventsel;
+ u64 new_config = eventsel;
+ bool emulate_overflow;
+ u8 fixed_ctr_ctrl;
+
+ emulate_overflow = pmc_pause_counter(pmc);
+
+ if (!pmc_is_globally_enabled(pmc) || !pmc_is_locally_enabled(pmc) ||
+ !pmc_is_event_allowed(pmc))
+ return 0;
+
+ if (emulate_overflow)
+ __kvm_perf_overflow(pmc, false);
+
+ if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
+ printk_once("kvm pmu: pin control bit is ignored\n");
+
+ if (pmc_is_fixed(pmc)) {
+ fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl,
+ pmc->idx - KVM_FIXED_PMC_BASE_IDX);
+ if (fixed_ctr_ctrl & INTEL_FIXED_0_KERNEL)
+ eventsel |= ARCH_PERFMON_EVENTSEL_OS;
+ if (fixed_ctr_ctrl & INTEL_FIXED_0_USER)
+ eventsel |= ARCH_PERFMON_EVENTSEL_USR;
+ if (fixed_ctr_ctrl & INTEL_FIXED_0_ENABLE_PMI)
+ eventsel |= ARCH_PERFMON_EVENTSEL_INT;
+ new_config = (u64)fixed_ctr_ctrl;
+ }
+
+ if (pmc->current_config == new_config && pmc_resume_counter(pmc))
+ return 0;
+
+ pmc_release_perf_event(pmc);
+
+ pmc->current_config = new_config;
+
+ return pmc_reprogram_counter(pmc, PERF_TYPE_RAW,
+ (eventsel & pmu->raw_event_mask),
+ !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
+ !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
+ eventsel & ARCH_PERFMON_EVENTSEL_INT);
+}
+
+static bool pmc_is_event_match(struct kvm_pmc *pmc, u64 eventsel)
+{
+ /*
+ * Ignore checks for edge detect (all events currently emulated by KVM
+ * are always rising edges), pin control (unsupported by modern CPUs),
+ * and counter mask and its invert flag (KVM doesn't emulate multiple
+ * events in a single clock cycle).
+ *
+ * Note, the uppermost nibble of AMD's mask overlaps Intel's IN_TX (bit
+ * 32) and IN_TXCP (bit 33), as well as two reserved bits (bits 35:34).
+ * Checking the "in HLE/RTM transaction" flags is correct as the vCPU
+ * can't be in a transaction if KVM is emulating an instruction.
+ *
+ * Checking the reserved bits might be wrong if they are defined in the
+ * future, but so could ignoring them, so do the simple thing for now.
+ */
+ return !((pmc->eventsel ^ eventsel) & AMD64_RAW_EVENT_MASK_NB);
+}
+
+void kvm_pmu_recalc_pmc_emulation(struct kvm_pmu *pmu, struct kvm_pmc *pmc)
+{
+ bitmap_clear(pmu->pmc_counting_instructions, pmc->idx, 1);
+ bitmap_clear(pmu->pmc_counting_branches, pmc->idx, 1);
+
+ /*
+ * Do NOT consult the PMU event filters, as the filters must be checked
+ * at the time of emulation to ensure KVM uses fresh information, e.g.
+ * omitting a PMC from a bitmap could result in a missed event if the
+ * filter is changed to allow counting the event.
+ */
+ if (!pmc_is_locally_enabled(pmc))
+ return;
+
+ if (pmc_is_event_match(pmc, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED))
+ bitmap_set(pmu->pmc_counting_instructions, pmc->idx, 1);
+
+ if (pmc_is_event_match(pmc, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED))
+ bitmap_set(pmu->pmc_counting_branches, pmc->idx, 1);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_pmu_recalc_pmc_emulation);
+
+void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
+{
+ DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX);
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+ int bit;
+
+ bitmap_copy(bitmap, pmu->reprogram_pmi, X86_PMC_IDX_MAX);
+
+ /*
+ * The reprogramming bitmap can be written asynchronously by something
+ * other than the task that holds vcpu->mutex; take care to clear only
+ * the bits that will actually be processed.
+ */
+ BUILD_BUG_ON(sizeof(bitmap) != sizeof(atomic64_t));
+ atomic64_andnot(*(s64 *)bitmap, &pmu->__reprogram_pmi);
+
+ kvm_for_each_pmc(pmu, pmc, bit, bitmap) {
+ /*
+ * If reprogramming fails, e.g. due to contention, re-set the
+ * reprogram bit, i.e. opportunistically try again on the
+ * next PMU refresh. Don't make a new request as doing so can
+ * stall the guest if reprogramming repeatedly fails.
+ */
+ if (reprogram_counter(pmc))
+ set_bit(pmc->idx, pmu->reprogram_pmi);
+ }
+
+ /*
+ * Release unused perf_events if the corresponding guest MSRs weren't
+ * accessed during the last vCPU time slice (need_cleanup is set when
+ * the vCPU is scheduled back in).
+ */
+ if (unlikely(pmu->need_cleanup))
+ kvm_pmu_cleanup(vcpu);
+
+ kvm_for_each_pmc(pmu, pmc, bit, bitmap)
+ kvm_pmu_recalc_pmc_emulation(pmu, pmc);
+}
+
+int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx)
+{
+ /*
+ * On Intel, VMX interception has priority over RDPMC exceptions that
+ * aren't already handled by the emulator, i.e. there are no additional
+ * checks needed for Intel PMUs.
+ *
+ * On AMD, _all_ exceptions on RDPMC have priority over SVM intercepts,
+ * i.e. an invalid PMC results in a #GP, not #VMEXIT.
+ */
+ if (!kvm_pmu_ops.check_rdpmc_early)
+ return 0;
+
+ return kvm_pmu_call(check_rdpmc_early)(vcpu, idx);
+}
+
+bool is_vmware_backdoor_pmc(u32 pmc_idx)
+{
+ switch (pmc_idx) {
+ case VMWARE_BACKDOOR_PMC_HOST_TSC:
+ case VMWARE_BACKDOOR_PMC_REAL_TIME:
+ case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
+ return true;
+ }
+ return false;
+}
+
+static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
+{
+ u64 ctr_val;
+
+ switch (idx) {
+ case VMWARE_BACKDOOR_PMC_HOST_TSC:
+ ctr_val = rdtsc();
+ break;
+ case VMWARE_BACKDOOR_PMC_REAL_TIME:
+ ctr_val = ktime_get_boottime_ns();
+ break;
+ case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
+ ctr_val = ktime_get_boottime_ns() +
+ vcpu->kvm->arch.kvmclock_offset;
+ break;
+ default:
+ return 1;
+ }
+
+ *data = ctr_val;
+ return 0;
+}
+
+int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+ u64 mask = ~0ull;
+
+ if (!pmu->version)
+ return 1;
+
+ if (is_vmware_backdoor_pmc(idx))
+ return kvm_pmu_rdpmc_vmware(vcpu, idx, data);
+
+ pmc = kvm_pmu_call(rdpmc_ecx_to_pmc)(vcpu, idx, &mask);
+ if (!pmc)
+ return 1;
+
+ if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_PCE) &&
+ (kvm_x86_call(get_cpl)(vcpu) != 0) &&
+ kvm_is_cr0_bit_set(vcpu, X86_CR0_PE))
+ return 1;
+
+ *data = pmc_read_counter(pmc) & mask;
+ return 0;
+}
+
+void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
+{
+ if (lapic_in_kernel(vcpu)) {
+ kvm_pmu_call(deliver_pmi)(vcpu);
+ kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
+ }
+}
+
+bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
+{
+ switch (msr) {
+ case MSR_CORE_PERF_GLOBAL_STATUS:
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+ return kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu));
+ default:
+ break;
+ }
+ return kvm_pmu_call(msr_idx_to_pmc)(vcpu, msr) ||
+ kvm_pmu_call(is_valid_msr)(vcpu, msr);
+}
+
+static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc = kvm_pmu_call(msr_idx_to_pmc)(vcpu, msr);
+
+ if (pmc)
+ __set_bit(pmc->idx, pmu->pmc_in_use);
+}
+
+int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ u32 msr = msr_info->index;
+
+ switch (msr) {
+ case MSR_CORE_PERF_GLOBAL_STATUS:
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
+ msr_info->data = pmu->global_status;
+ break;
+ case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ msr_info->data = pmu->global_ctrl;
+ break;
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET:
+ case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+ msr_info->data = 0;
+ break;
+ default:
+ return kvm_pmu_call(get_msr)(vcpu, msr_info);
+ }
+
+ return 0;
+}
+
+int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ u32 msr = msr_info->index;
+ u64 data = msr_info->data;
+ u64 diff;
+
+ /*
+ * Note, AMD ignores writes to reserved bits and read-only PMU MSRs,
+ * whereas Intel generates #GP on attempts to write reserved/RO MSRs.
+ */
+ switch (msr) {
+ case MSR_CORE_PERF_GLOBAL_STATUS:
+ if (!msr_info->host_initiated)
+ return 1; /* RO MSR */
+ fallthrough;
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
+ /* Per PPR, Read-only MSR. Writes are ignored. */
+ if (!msr_info->host_initiated)
+ break;
+
+ if (data & pmu->global_status_rsvd)
+ return 1;
+
+ pmu->global_status = data;
+ break;
+ case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
+ data &= ~pmu->global_ctrl_rsvd;
+ fallthrough;
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ if (!kvm_valid_perf_global_ctrl(pmu, data))
+ return 1;
+
+ if (pmu->global_ctrl != data) {
+ diff = pmu->global_ctrl ^ data;
+ pmu->global_ctrl = data;
+ reprogram_counters(pmu, diff);
+ }
+ break;
+ case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+ /*
+ * GLOBAL_OVF_CTRL, a.k.a. GLOBAL_STATUS_RESET, clears bits in
+ * GLOBAL_STATUS, and so the set of reserved bits is the same.
+ */
+ if (data & pmu->global_status_rsvd)
+ return 1;
+ fallthrough;
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
+ if (!msr_info->host_initiated)
+ pmu->global_status &= ~data;
+ break;
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET:
+ if (!msr_info->host_initiated)
+ pmu->global_status |= data & ~pmu->global_status_rsvd;
+ break;
+ default:
+ kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
+ return kvm_pmu_call(set_msr)(vcpu, msr_info);
+ }
+
+ return 0;
+}
+
+static void kvm_pmu_reset(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+ int i;
+
+ pmu->need_cleanup = false;
+
+ bitmap_zero(pmu->reprogram_pmi, X86_PMC_IDX_MAX);
+
+ kvm_for_each_pmc(pmu, pmc, i, pmu->all_valid_pmc_idx) {
+ pmc_stop_counter(pmc);
+ pmc->counter = 0;
+ pmc->emulated_counter = 0;
+
+ if (pmc_is_gp(pmc))
+ pmc->eventsel = 0;
+ }
+
+ pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0;
+
+ kvm_pmu_call(reset)(vcpu);
+}
+
+
+/*
+ * Refresh the PMU configuration for the vCPU, e.g. if userspace changes CPUID
+ * and/or PERF_CAPABILITIES.
+ */
+void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+ if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm))
+ return;
+
+ /*
+ * Stop/release all existing counters/events before realizing the new
+ * vPMU model.
+ */
+ kvm_pmu_reset(vcpu);
+
+ pmu->version = 0;
+ pmu->nr_arch_gp_counters = 0;
+ pmu->nr_arch_fixed_counters = 0;
+ pmu->counter_bitmask[KVM_PMC_GP] = 0;
+ pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
+ pmu->reserved_bits = 0xffffffff00200000ull;
+ pmu->raw_event_mask = X86_RAW_EVENT_MASK;
+ pmu->global_ctrl_rsvd = ~0ull;
+ pmu->global_status_rsvd = ~0ull;
+ pmu->fixed_ctr_ctrl_rsvd = ~0ull;
+ pmu->pebs_enable_rsvd = ~0ull;
+ pmu->pebs_data_cfg_rsvd = ~0ull;
+ bitmap_zero(pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX);
+
+ if (!vcpu->kvm->arch.enable_pmu)
+ return;
+
+ kvm_pmu_call(refresh)(vcpu);
+
+ /*
+ * At RESET, both Intel and AMD CPUs set all enable bits for general
+ * purpose counters in IA32_PERF_GLOBAL_CTRL (so that software that
+ * was written for v1 PMUs doesn't unknowingly leave GP counters disabled
+ * in the global controls). Emulate that behavior when refreshing the
+ * PMU so that userspace doesn't need to manually set PERF_GLOBAL_CTRL.
+ */
+ if (kvm_pmu_has_perf_global_ctrl(pmu) && pmu->nr_arch_gp_counters)
+ pmu->global_ctrl = GENMASK_ULL(pmu->nr_arch_gp_counters - 1, 0);
+
+ bitmap_set(pmu->all_valid_pmc_idx, 0, pmu->nr_arch_gp_counters);
+ bitmap_set(pmu->all_valid_pmc_idx, KVM_FIXED_PMC_BASE_IDX,
+ pmu->nr_arch_fixed_counters);
+}
+
+void kvm_pmu_init(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+ memset(pmu, 0, sizeof(*pmu));
+ kvm_pmu_call(init)(vcpu);
+}
+
+/* Release perf_events for vPMCs that have been unused for a full time slice. */
+void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc = NULL;
+ DECLARE_BITMAP(bitmask, X86_PMC_IDX_MAX);
+ int i;
+
+ pmu->need_cleanup = false;
+
+ bitmap_andnot(bitmask, pmu->all_valid_pmc_idx,
+ pmu->pmc_in_use, X86_PMC_IDX_MAX);
+
+ kvm_for_each_pmc(pmu, pmc, i, bitmask) {
+ if (pmc->perf_event && !pmc_is_locally_enabled(pmc))
+ pmc_stop_counter(pmc);
+ }
+
+ kvm_pmu_call(cleanup)(vcpu);
+
+ bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX);
+}
+
+void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
+{
+ kvm_pmu_reset(vcpu);
+}
+
+static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
+{
+ pmc->emulated_counter++;
+ kvm_pmu_request_counter_reprogram(pmc);
+}
+
+static inline bool cpl_is_matched(struct kvm_pmc *pmc)
+{
+ bool select_os, select_user;
+ u64 config;
+
+ if (pmc_is_gp(pmc)) {
+ config = pmc->eventsel;
+ select_os = config & ARCH_PERFMON_EVENTSEL_OS;
+ select_user = config & ARCH_PERFMON_EVENTSEL_USR;
+ } else {
+ config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl,
+ pmc->idx - KVM_FIXED_PMC_BASE_IDX);
+ select_os = config & INTEL_FIXED_0_KERNEL;
+ select_user = config & INTEL_FIXED_0_USER;
+ }
+
+ /*
+ * Skip the CPL lookup, which isn't free on Intel, if the result will
+ * be the same regardless of the CPL.
+ */
+ if (select_os == select_user)
+ return select_os;
+
+ return (kvm_x86_call(get_cpl)(pmc->vcpu) == 0) ? select_os :
+ select_user;
+}
+
+static void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu,
+ const unsigned long *event_pmcs)
+{
+ DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX);
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+ int i, idx;
+
+ BUILD_BUG_ON(sizeof(pmu->global_ctrl) * BITS_PER_BYTE != X86_PMC_IDX_MAX);
+
+ if (bitmap_empty(event_pmcs, X86_PMC_IDX_MAX))
+ return;
+
+ if (!kvm_pmu_has_perf_global_ctrl(pmu))
+ bitmap_copy(bitmap, event_pmcs, X86_PMC_IDX_MAX);
+ else if (!bitmap_and(bitmap, event_pmcs,
+ (unsigned long *)&pmu->global_ctrl, X86_PMC_IDX_MAX))
+ return;
+
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ kvm_for_each_pmc(pmu, pmc, i, bitmap) {
+ if (!pmc_is_event_allowed(pmc) || !cpl_is_matched(pmc))
+ continue;
+
+ kvm_pmu_incr_counter(pmc);
+ }
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+}
+
+void kvm_pmu_instruction_retired(struct kvm_vcpu *vcpu)
+{
+ kvm_pmu_trigger_event(vcpu, vcpu_to_pmu(vcpu)->pmc_counting_instructions);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_pmu_instruction_retired);
+
+void kvm_pmu_branch_retired(struct kvm_vcpu *vcpu)
+{
+ kvm_pmu_trigger_event(vcpu, vcpu_to_pmu(vcpu)->pmc_counting_branches);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_pmu_branch_retired);
+
+static bool is_masked_filter_valid(const struct kvm_x86_pmu_event_filter *filter)
+{
+ u64 mask = kvm_pmu_ops.EVENTSEL_EVENT |
+ KVM_PMU_MASKED_ENTRY_UMASK_MASK |
+ KVM_PMU_MASKED_ENTRY_UMASK_MATCH |
+ KVM_PMU_MASKED_ENTRY_EXCLUDE;
+ int i;
+
+ for (i = 0; i < filter->nevents; i++) {
+ if (filter->events[i] & ~mask)
+ return false;
+ }
+
+ return true;
+}
+
+static void convert_to_masked_filter(struct kvm_x86_pmu_event_filter *filter)
+{
+ int i, j;
+
+ for (i = 0, j = 0; i < filter->nevents; i++) {
+ /*
+ * Skip events that are impossible to match against a guest
+ * event. When filtering, only the event select + unit mask
+ * of the guest event is used. To maintain backwards
+ * compatibility, impossible filters can't be rejected :-(
+ */
+ if (filter->events[i] & ~(kvm_pmu_ops.EVENTSEL_EVENT |
+ ARCH_PERFMON_EVENTSEL_UMASK))
+ continue;
+ /*
+ * Convert userspace events to a common in-kernel event so
+ * only one code path is needed to support both events. For
+ * the in-kernel events use masked events because they are
+ * flexible enough to handle both cases. To convert to masked
+ * events all that's needed is to add an "all ones" umask_mask
+ * (unmasked filter events don't support EXCLUDE).
+ */
+ filter->events[j++] = filter->events[i] |
+ (0xFFULL << KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT);
+ }
+
+ filter->nevents = j;
+}
+
+static int prepare_filter_lists(struct kvm_x86_pmu_event_filter *filter)
+{
+ int i;
+
+ if (!(filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS))
+ convert_to_masked_filter(filter);
+ else if (!is_masked_filter_valid(filter))
+ return -EINVAL;
+
+ /*
+ * Sort entries by event select and includes vs. excludes so that all
+ * entries for a given event select can be processed efficiently during
+ * filtering. The EXCLUDE flag uses a more significant bit than the
+ * event select, and so the sorted list is also effectively split into
+ * includes and excludes sub-lists.
+ */
+ sort(&filter->events, filter->nevents, sizeof(filter->events[0]),
+ filter_sort_cmp, NULL);
+
+ i = filter->nevents;
+ /* Find the first EXCLUDE event (only supported for masked events). */
+ if (filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS) {
+ for (i = 0; i < filter->nevents; i++) {
+ if (filter->events[i] & KVM_PMU_MASKED_ENTRY_EXCLUDE)
+ break;
+ }
+ }
+
+ filter->nr_includes = i;
+ filter->nr_excludes = filter->nevents - filter->nr_includes;
+ filter->includes = filter->events;
+ filter->excludes = filter->events + filter->nr_includes;
+
+ return 0;
+}
+
+int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
+{
+ struct kvm_pmu_event_filter __user *user_filter = argp;
+ struct kvm_x86_pmu_event_filter *filter;
+ struct kvm_pmu_event_filter tmp;
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+ size_t size;
+ int r;
+
+ if (copy_from_user(&tmp, user_filter, sizeof(tmp)))
+ return -EFAULT;
+
+ if (tmp.action != KVM_PMU_EVENT_ALLOW &&
+ tmp.action != KVM_PMU_EVENT_DENY)
+ return -EINVAL;
+
+ if (tmp.flags & ~KVM_PMU_EVENT_FLAGS_VALID_MASK)
+ return -EINVAL;
+
+ if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS)
+ return -E2BIG;
+
+ size = struct_size(filter, events, tmp.nevents);
+ filter = kzalloc(size, GFP_KERNEL_ACCOUNT);
+ if (!filter)
+ return -ENOMEM;
+
+ filter->action = tmp.action;
+ filter->nevents = tmp.nevents;
+ filter->fixed_counter_bitmap = tmp.fixed_counter_bitmap;
+ filter->flags = tmp.flags;
+
+ r = -EFAULT;
+ if (copy_from_user(filter->events, user_filter->events,
+ sizeof(filter->events[0]) * filter->nevents))
+ goto cleanup;
+
+ r = prepare_filter_lists(filter);
+ if (r)
+ goto cleanup;
+
+ mutex_lock(&kvm->lock);
+ filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
+ mutex_is_locked(&kvm->lock));
+ mutex_unlock(&kvm->lock);
+ synchronize_srcu_expedited(&kvm->srcu);
+
+ BUILD_BUG_ON(sizeof(((struct kvm_pmu *)0)->reprogram_pmi) >
+ sizeof(((struct kvm_pmu *)0)->__reprogram_pmi));
+
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ atomic64_set(&vcpu_to_pmu(vcpu)->__reprogram_pmi, -1ull);
+
+ kvm_make_all_cpus_request(kvm, KVM_REQ_PMU);
+
+ r = 0;
+cleanup:
+ kfree(filter);
+ return r;
+}
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
new file mode 100644
index 000000000000..5c3939e91f1d
--- /dev/null
+++ b/arch/x86/kvm/pmu.h
@@ -0,0 +1,235 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_PMU_H
+#define __KVM_X86_PMU_H
+
+#include <linux/nospec.h>
+
+#include <asm/kvm_host.h>
+
+#define vcpu_to_pmu(vcpu) (&(vcpu)->arch.pmu)
+#define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu))
+#define pmc_to_pmu(pmc) (&(pmc)->vcpu->arch.pmu)
+
+#define MSR_IA32_MISC_ENABLE_PMU_RO_MASK (MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | \
+ MSR_IA32_MISC_ENABLE_BTS_UNAVAIL)
+
+/* Retrieve the control bits for a fixed counter out of IA32_FIXED_CTR_CTRL. */
+#define fixed_ctrl_field(ctrl_reg, idx) \
+ (((ctrl_reg) >> ((idx) * INTEL_FIXED_BITS_STRIDE)) & INTEL_FIXED_BITS_MASK)
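+/* E.g. with INTEL_FIXED_BITS_STRIDE == 4, fixed counter 1's control field occupies bits 7:4. */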
+
+#define VMWARE_BACKDOOR_PMC_HOST_TSC 0x10000
+#define VMWARE_BACKDOOR_PMC_REAL_TIME 0x10001
+#define VMWARE_BACKDOOR_PMC_APPARENT_TIME 0x10002
+
+#define KVM_FIXED_PMC_BASE_IDX INTEL_PMC_IDX_FIXED
+
+struct kvm_pmu_ops {
+ struct kvm_pmc *(*rdpmc_ecx_to_pmc)(struct kvm_vcpu *vcpu,
+ unsigned int idx, u64 *mask);
+ struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, u32 msr);
+ int (*check_rdpmc_early)(struct kvm_vcpu *vcpu, unsigned int idx);
+ bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr);
+ int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+ int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+ void (*refresh)(struct kvm_vcpu *vcpu);
+ void (*init)(struct kvm_vcpu *vcpu);
+ void (*reset)(struct kvm_vcpu *vcpu);
+ void (*deliver_pmi)(struct kvm_vcpu *vcpu);
+ void (*cleanup)(struct kvm_vcpu *vcpu);
+
+ const u64 EVENTSEL_EVENT;
+ const int MAX_NR_GP_COUNTERS;
+ const int MIN_NR_GP_COUNTERS;
+};
+
+void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops);
+
+static inline bool kvm_pmu_has_perf_global_ctrl(struct kvm_pmu *pmu)
+{
+ /*
+ * Architecturally, Intel's SDM states that IA32_PERF_GLOBAL_CTRL is
+ * supported if "CPUID.0AH: EAX[7:0] > 0", i.e. if the PMU version is
+ * greater than zero. However, KVM only exposes and emulates the MSR
+ * to/for the guest if the guest PMU supports at least "Architectural
+ * Performance Monitoring Version 2".
+ *
+ * AMD's version of PERF_GLOBAL_CTRL conveniently shows up with v2.
+ */
+ return pmu->version > 1;
+}
+
+/*
+ * KVM tracks all counters in 64-bit bitmaps, with general purpose counters
+ * mapped to bits 31:0 and fixed counters mapped to 63:32, e.g. fixed counter 0
+ * is tracked internally via index 32. On Intel (AMD doesn't support fixed
+ * counters), this mirrors how fixed counters are mapped to PERF_GLOBAL_CTRL
+ * and similar MSRs, i.e. tracking fixed counters at base index 32 reduces the
+ * amount of boilerplate needed to iterate over PMCs *and* simplifies common
+ * enable/disable/reset operations.
+ *
+ * WARNING! This helper is only for lookups that are initiated by KVM; it is
+ * NOT safe for guest lookups, e.g. will do the wrong thing if passed a raw
+ * ECX value from RDPMC (fixed counters are accessed by setting bit 30 in ECX
+ * for RDPMC, not by adding 32 to the fixed counter index).
+ */
+static inline struct kvm_pmc *kvm_pmc_idx_to_pmc(struct kvm_pmu *pmu, int idx)
+{
+ if (idx < pmu->nr_arch_gp_counters)
+ return &pmu->gp_counters[idx];
+
+ idx -= KVM_FIXED_PMC_BASE_IDX;
+ if (idx >= 0 && idx < pmu->nr_arch_fixed_counters)
+ return &pmu->fixed_counters[idx];
+
+ return NULL;
+}
+
+#define kvm_for_each_pmc(pmu, pmc, i, bitmap) \
+ for_each_set_bit(i, bitmap, X86_PMC_IDX_MAX) \
+ if (!(pmc = kvm_pmc_idx_to_pmc(pmu, i))) \
+ continue; \
+ else \
+
+static inline u64 pmc_bitmask(struct kvm_pmc *pmc)
+{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+
+ return pmu->counter_bitmask[pmc->type];
+}
+
+static inline u64 pmc_read_counter(struct kvm_pmc *pmc)
+{
+ u64 counter, enabled, running;
+
+ counter = pmc->counter + pmc->emulated_counter;
+
+ if (pmc->perf_event && !pmc->is_paused)
+ counter += perf_event_read_value(pmc->perf_event,
+ &enabled, &running);
+ /* FIXME: Scaling needed? */
+ return counter & pmc_bitmask(pmc);
+}
+
+void pmc_write_counter(struct kvm_pmc *pmc, u64 val);
+
+static inline bool pmc_is_gp(struct kvm_pmc *pmc)
+{
+ return pmc->type == KVM_PMC_GP;
+}
+
+static inline bool pmc_is_fixed(struct kvm_pmc *pmc)
+{
+ return pmc->type == KVM_PMC_FIXED;
+}
+
+static inline bool kvm_valid_perf_global_ctrl(struct kvm_pmu *pmu,
+ u64 data)
+{
+ return !(pmu->global_ctrl_rsvd & data);
+}
+
+/* Returns the general purpose PMC associated with the specified MSR. Note that
+ * it can be used for both PERFCTRn and EVNTSELn; that is why it accepts the MSR
+ * base as a parameter to tell them apart.
+ */
+static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr,
+ u32 base)
+{
+ if (msr >= base && msr < base + pmu->nr_arch_gp_counters) {
+ u32 index = array_index_nospec(msr - base,
+ pmu->nr_arch_gp_counters);
+
+ return &pmu->gp_counters[index];
+ }
+
+ return NULL;
+}
+
+/* Returns the fixed PMC associated with the specified MSR. */
+static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr)
+{
+ int base = MSR_CORE_PERF_FIXED_CTR0;
+
+ if (msr >= base && msr < base + pmu->nr_arch_fixed_counters) {
+ u32 index = array_index_nospec(msr - base,
+ pmu->nr_arch_fixed_counters);
+
+ return &pmu->fixed_counters[index];
+ }
+
+ return NULL;
+}
+
+static inline bool pmc_is_locally_enabled(struct kvm_pmc *pmc)
+{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+
+ if (pmc_is_fixed(pmc))
+ return fixed_ctrl_field(pmu->fixed_ctr_ctrl,
+ pmc->idx - KVM_FIXED_PMC_BASE_IDX) &
+ (INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER);
+
+ return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE;
+}
+
+extern struct x86_pmu_capability kvm_pmu_cap;
+
+void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops);
+
+void kvm_pmu_recalc_pmc_emulation(struct kvm_pmu *pmu, struct kvm_pmc *pmc);
+
+static inline void kvm_pmu_request_counter_reprogram(struct kvm_pmc *pmc)
+{
+ kvm_pmu_recalc_pmc_emulation(pmc_to_pmu(pmc), pmc);
+
+ set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi);
+ kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
+}
+
+static inline void reprogram_counters(struct kvm_pmu *pmu, u64 diff)
+{
+ int bit;
+
+ if (!diff)
+ return;
+
+ for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX)
+ set_bit(bit, pmu->reprogram_pmi);
+ kvm_make_request(KVM_REQ_PMU, pmu_to_vcpu(pmu));
+}
+
+/*
+ * Check if a PMC is enabled by comparing it against global_ctrl bits.
+ *
+ * If the vPMU doesn't have global_ctrl MSR, all vPMCs are enabled.
+ */
+static inline bool pmc_is_globally_enabled(struct kvm_pmc *pmc)
+{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+
+ if (!kvm_pmu_has_perf_global_ctrl(pmu))
+ return true;
+
+ return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl);
+}
+
+void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu);
+void kvm_pmu_handle_event(struct kvm_vcpu *vcpu);
+int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
+int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx);
+bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr);
+int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+void kvm_pmu_refresh(struct kvm_vcpu *vcpu);
+void kvm_pmu_init(struct kvm_vcpu *vcpu);
+void kvm_pmu_cleanup(struct kvm_vcpu *vcpu);
+void kvm_pmu_destroy(struct kvm_vcpu *vcpu);
+int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp);
+void kvm_pmu_instruction_retired(struct kvm_vcpu *vcpu);
+void kvm_pmu_branch_retired(struct kvm_vcpu *vcpu);
+
+bool is_vmware_backdoor_pmc(u32 pmc_idx);
+
+extern struct kvm_pmu_ops intel_pmu_ops;
+extern struct kvm_pmu_ops amd_pmu_ops;
+#endif /* __KVM_X86_PMU_H */
diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h
new file mode 100644
index 000000000000..81b4a7acf72e
--- /dev/null
+++ b/arch/x86/kvm/reverse_cpuid.h
@@ -0,0 +1,243 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ARCH_X86_KVM_REVERSE_CPUID_H
+#define ARCH_X86_KVM_REVERSE_CPUID_H
+
+#include <uapi/asm/kvm.h>
+#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
+
+/*
+ * Define a KVM-only feature flag.
+ *
+ * For features that are scattered by cpufeatures.h, __feature_translate() also
+ * needs to be updated to translate the kernel-defined feature into the
+ * KVM-defined feature.
+ *
+ * For features that are 100% KVM-only, i.e. not defined by cpufeatures.h,
+ * forego the intermediate KVM_X86_FEATURE and directly define X86_FEATURE_* so
+ * that X86_FEATURE_* can be used in KVM. No __feature_translate() handling is
+ * needed in this case.
+ */
+#define KVM_X86_FEATURE(w, f) ((w)*32 + (f))
+
+/* Intel-defined SGX sub-features, CPUID level 0x12 (EAX). */
+#define KVM_X86_FEATURE_SGX1 KVM_X86_FEATURE(CPUID_12_EAX, 0)
+#define KVM_X86_FEATURE_SGX2 KVM_X86_FEATURE(CPUID_12_EAX, 1)
+#define KVM_X86_FEATURE_SGX_EDECCSSA KVM_X86_FEATURE(CPUID_12_EAX, 11)
+
+/* Intel-defined sub-features, CPUID level 0x00000007:1 (ECX) */
+#define KVM_X86_FEATURE_MSR_IMM KVM_X86_FEATURE(CPUID_7_1_ECX, 5)
+
+/* Intel-defined sub-features, CPUID level 0x00000007:1 (EDX) */
+#define X86_FEATURE_AVX_VNNI_INT8 KVM_X86_FEATURE(CPUID_7_1_EDX, 4)
+#define X86_FEATURE_AVX_NE_CONVERT KVM_X86_FEATURE(CPUID_7_1_EDX, 5)
+#define X86_FEATURE_AMX_COMPLEX KVM_X86_FEATURE(CPUID_7_1_EDX, 8)
+#define X86_FEATURE_AVX_VNNI_INT16 KVM_X86_FEATURE(CPUID_7_1_EDX, 10)
+#define X86_FEATURE_PREFETCHITI KVM_X86_FEATURE(CPUID_7_1_EDX, 14)
+#define X86_FEATURE_AVX10 KVM_X86_FEATURE(CPUID_7_1_EDX, 19)
+
+/* Intel-defined sub-features, CPUID level 0x00000007:2 (EDX) */
+#define X86_FEATURE_INTEL_PSFD KVM_X86_FEATURE(CPUID_7_2_EDX, 0)
+#define X86_FEATURE_IPRED_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 1)
+#define KVM_X86_FEATURE_RRSBA_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 2)
+#define X86_FEATURE_DDPD_U KVM_X86_FEATURE(CPUID_7_2_EDX, 3)
+#define KVM_X86_FEATURE_BHI_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 4)
+#define X86_FEATURE_MCDT_NO KVM_X86_FEATURE(CPUID_7_2_EDX, 5)
+
+/* Intel-defined sub-features, CPUID level 0x00000024:0 (EBX) */
+#define X86_FEATURE_AVX10_128 KVM_X86_FEATURE(CPUID_24_0_EBX, 16)
+#define X86_FEATURE_AVX10_256 KVM_X86_FEATURE(CPUID_24_0_EBX, 17)
+#define X86_FEATURE_AVX10_512 KVM_X86_FEATURE(CPUID_24_0_EBX, 18)
+
+/* CPUID level 0x80000007 (EDX). */
+#define KVM_X86_FEATURE_CONSTANT_TSC KVM_X86_FEATURE(CPUID_8000_0007_EDX, 8)
+
+/* CPUID level 0x80000022 (EAX) */
+#define KVM_X86_FEATURE_PERFMON_V2 KVM_X86_FEATURE(CPUID_8000_0022_EAX, 0)
+
+/* CPUID level 0x80000021 (ECX) */
+#define KVM_X86_FEATURE_TSA_SQ_NO KVM_X86_FEATURE(CPUID_8000_0021_ECX, 1)
+#define KVM_X86_FEATURE_TSA_L1_NO KVM_X86_FEATURE(CPUID_8000_0021_ECX, 2)
+
+struct cpuid_reg {
+ u32 function;
+ u32 index;
+ int reg;
+};
+
+static const struct cpuid_reg reverse_cpuid[] = {
+ [CPUID_1_EDX] = { 1, 0, CPUID_EDX},
+ [CPUID_8000_0001_EDX] = {0x80000001, 0, CPUID_EDX},
+ [CPUID_8086_0001_EDX] = {0x80860001, 0, CPUID_EDX},
+ [CPUID_1_ECX] = { 1, 0, CPUID_ECX},
+ [CPUID_C000_0001_EDX] = {0xc0000001, 0, CPUID_EDX},
+ [CPUID_8000_0001_ECX] = {0x80000001, 0, CPUID_ECX},
+ [CPUID_7_0_EBX] = { 7, 0, CPUID_EBX},
+ [CPUID_D_1_EAX] = { 0xd, 1, CPUID_EAX},
+ [CPUID_8000_0008_EBX] = {0x80000008, 0, CPUID_EBX},
+ [CPUID_6_EAX] = { 6, 0, CPUID_EAX},
+ [CPUID_8000_000A_EDX] = {0x8000000a, 0, CPUID_EDX},
+ [CPUID_7_ECX] = { 7, 0, CPUID_ECX},
+ [CPUID_7_EDX] = { 7, 0, CPUID_EDX},
+ [CPUID_7_1_EAX] = { 7, 1, CPUID_EAX},
+ [CPUID_12_EAX] = {0x00000012, 0, CPUID_EAX},
+ [CPUID_8000_001F_EAX] = {0x8000001f, 0, CPUID_EAX},
+ [CPUID_7_1_EDX] = { 7, 1, CPUID_EDX},
+ [CPUID_8000_0007_EDX] = {0x80000007, 0, CPUID_EDX},
+ [CPUID_8000_0021_EAX] = {0x80000021, 0, CPUID_EAX},
+ [CPUID_8000_0022_EAX] = {0x80000022, 0, CPUID_EAX},
+ [CPUID_7_2_EDX] = { 7, 2, CPUID_EDX},
+ [CPUID_24_0_EBX] = { 0x24, 0, CPUID_EBX},
+ [CPUID_8000_0021_ECX] = {0x80000021, 0, CPUID_ECX},
+ [CPUID_7_1_ECX] = { 7, 1, CPUID_ECX},
+};
+
+/*
+ * Reverse CPUID and its derivatives can only be used for hardware-defined
+ * feature words, i.e. words whose bits directly correspond to a CPUID leaf.
+ * Retrieving a feature bit or masking guest CPUID from a Linux-defined word
+ * is nonsensical as the bit number/mask is an arbitrary software-defined value
+ * and can't be used by KVM to query/control guest capabilities. And obviously
+ * the leaf being queried must have an entry in the lookup table.
+ */
+static __always_inline void reverse_cpuid_check(unsigned int x86_leaf)
+{
+ BUILD_BUG_ON(NR_CPUID_WORDS != NCAPINTS);
+ BUILD_BUG_ON(x86_leaf == CPUID_LNX_1);
+ BUILD_BUG_ON(x86_leaf == CPUID_LNX_2);
+ BUILD_BUG_ON(x86_leaf == CPUID_LNX_3);
+ BUILD_BUG_ON(x86_leaf == CPUID_LNX_4);
+ BUILD_BUG_ON(x86_leaf == CPUID_LNX_5);
+ BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid));
+ BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0);
+}
+
+/*
+ * Translate feature bits that are scattered in the kernel's cpufeatures word
+ * into KVM feature words that align with hardware's definitions.
+ */
+static __always_inline u32 __feature_translate(int x86_feature)
+{
+#define KVM_X86_TRANSLATE_FEATURE(f) \
+ case X86_FEATURE_##f: return KVM_X86_FEATURE_##f
+
+ switch (x86_feature) {
+ KVM_X86_TRANSLATE_FEATURE(SGX1);
+ KVM_X86_TRANSLATE_FEATURE(SGX2);
+ KVM_X86_TRANSLATE_FEATURE(SGX_EDECCSSA);
+ KVM_X86_TRANSLATE_FEATURE(CONSTANT_TSC);
+ KVM_X86_TRANSLATE_FEATURE(PERFMON_V2);
+ KVM_X86_TRANSLATE_FEATURE(RRSBA_CTRL);
+ KVM_X86_TRANSLATE_FEATURE(BHI_CTRL);
+ KVM_X86_TRANSLATE_FEATURE(TSA_SQ_NO);
+ KVM_X86_TRANSLATE_FEATURE(TSA_L1_NO);
+ KVM_X86_TRANSLATE_FEATURE(MSR_IMM);
+ default:
+ return x86_feature;
+ }
+}
+
+static __always_inline u32 __feature_leaf(int x86_feature)
+{
+ u32 x86_leaf = __feature_translate(x86_feature) / 32;
+
+ reverse_cpuid_check(x86_leaf);
+ return x86_leaf;
+}
+
+/*
+ * Retrieve the bit mask from an X86_FEATURE_* definition. Features contain
+ * the hardware defined bit number (stored in bits 4:0) and a software defined
+ * "word" (stored in bits 31:5). The word is used to index into arrays of
+ * bit masks that hold the per-cpu feature capabilities, e.g. this_cpu_has().
+ */
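+/*
+ * E.g. X86_FEATURE_XSAVE is word 4 (CPUID_1_ECX), bit 26, so __feature_leaf()
+ * returns CPUID_1_ECX and __feature_bit() returns BIT(26), i.e. CPUID.01H:ECX[26].
+ */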
+static __always_inline u32 __feature_bit(int x86_feature)
+{
+ x86_feature = __feature_translate(x86_feature);
+
+ reverse_cpuid_check(x86_feature / 32);
+ return 1 << (x86_feature & 31);
+}
+
+#define feature_bit(name) __feature_bit(X86_FEATURE_##name)
+
+static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned int x86_feature)
+{
+ unsigned int x86_leaf = __feature_leaf(x86_feature);
+
+ return reverse_cpuid[x86_leaf];
+}
+
+static __always_inline u32 *__cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry,
+ u32 reg)
+{
+ switch (reg) {
+ case CPUID_EAX:
+ return &entry->eax;
+ case CPUID_EBX:
+ return &entry->ebx;
+ case CPUID_ECX:
+ return &entry->ecx;
+ case CPUID_EDX:
+ return &entry->edx;
+ default:
+ BUILD_BUG();
+ return NULL;
+ }
+}
+
+static __always_inline u32 *cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry,
+ unsigned int x86_feature)
+{
+ const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature);
+
+ return __cpuid_entry_get_reg(entry, cpuid.reg);
+}
+
+static __always_inline u32 cpuid_entry_get(struct kvm_cpuid_entry2 *entry,
+ unsigned int x86_feature)
+{
+ u32 *reg = cpuid_entry_get_reg(entry, x86_feature);
+
+ return *reg & __feature_bit(x86_feature);
+}
+
+static __always_inline bool cpuid_entry_has(struct kvm_cpuid_entry2 *entry,
+ unsigned int x86_feature)
+{
+ return cpuid_entry_get(entry, x86_feature);
+}
+
+static __always_inline void cpuid_entry_clear(struct kvm_cpuid_entry2 *entry,
+ unsigned int x86_feature)
+{
+ u32 *reg = cpuid_entry_get_reg(entry, x86_feature);
+
+ *reg &= ~__feature_bit(x86_feature);
+}
+
+static __always_inline void cpuid_entry_set(struct kvm_cpuid_entry2 *entry,
+ unsigned int x86_feature)
+{
+ u32 *reg = cpuid_entry_get_reg(entry, x86_feature);
+
+ *reg |= __feature_bit(x86_feature);
+}
+
+static __always_inline void cpuid_entry_change(struct kvm_cpuid_entry2 *entry,
+ unsigned int x86_feature,
+ bool set)
+{
+ u32 *reg = cpuid_entry_get_reg(entry, x86_feature);
+
+ /*
+ * Open coded instead of using cpuid_entry_{clear,set}() to coerce the
+ * compiler into using CMOV instead of Jcc when possible.
+ */
+ if (set)
+ *reg |= __feature_bit(x86_feature);
+ else
+ *reg &= ~__feature_bit(x86_feature);
+}
+
+#endif /* ARCH_X86_KVM_REVERSE_CPUID_H */
diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c
new file mode 100644
index 000000000000..f623c5986119
--- /dev/null
+++ b/arch/x86/kvm/smm.c
@@ -0,0 +1,663 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_host.h>
+#include "x86.h"
+#include "kvm_cache_regs.h"
+#include "kvm_emulate.h"
+#include "smm.h"
+#include "cpuid.h"
+#include "trace.h"
+
+#define CHECK_SMRAM32_OFFSET(field, offset) \
+ ASSERT_STRUCT_OFFSET(struct kvm_smram_state_32, field, offset - 0xFE00)
+
+#define CHECK_SMRAM64_OFFSET(field, offset) \
+ ASSERT_STRUCT_OFFSET(struct kvm_smram_state_64, field, offset - 0xFE00)
+
+static void check_smram_offsets(void)
+{
+ /* 32 bit SMRAM image */
+ CHECK_SMRAM32_OFFSET(reserved1, 0xFE00);
+ CHECK_SMRAM32_OFFSET(smbase, 0xFEF8);
+ CHECK_SMRAM32_OFFSET(smm_revision, 0xFEFC);
+ CHECK_SMRAM32_OFFSET(io_inst_restart, 0xFF00);
+ CHECK_SMRAM32_OFFSET(auto_hlt_restart, 0xFF02);
+ CHECK_SMRAM32_OFFSET(io_restart_rdi, 0xFF04);
+ CHECK_SMRAM32_OFFSET(io_restart_rcx, 0xFF08);
+ CHECK_SMRAM32_OFFSET(io_restart_rsi, 0xFF0C);
+ CHECK_SMRAM32_OFFSET(io_restart_rip, 0xFF10);
+ CHECK_SMRAM32_OFFSET(cr4, 0xFF14);
+ CHECK_SMRAM32_OFFSET(reserved2, 0xFF18);
+ CHECK_SMRAM32_OFFSET(int_shadow, 0xFF1A);
+ CHECK_SMRAM32_OFFSET(reserved3, 0xFF1B);
+ CHECK_SMRAM32_OFFSET(ds, 0xFF2C);
+ CHECK_SMRAM32_OFFSET(fs, 0xFF38);
+ CHECK_SMRAM32_OFFSET(gs, 0xFF44);
+ CHECK_SMRAM32_OFFSET(idtr, 0xFF50);
+ CHECK_SMRAM32_OFFSET(tr, 0xFF5C);
+ CHECK_SMRAM32_OFFSET(gdtr, 0xFF6C);
+ CHECK_SMRAM32_OFFSET(ldtr, 0xFF78);
+ CHECK_SMRAM32_OFFSET(es, 0xFF84);
+ CHECK_SMRAM32_OFFSET(cs, 0xFF90);
+ CHECK_SMRAM32_OFFSET(ss, 0xFF9C);
+ CHECK_SMRAM32_OFFSET(es_sel, 0xFFA8);
+ CHECK_SMRAM32_OFFSET(cs_sel, 0xFFAC);
+ CHECK_SMRAM32_OFFSET(ss_sel, 0xFFB0);
+ CHECK_SMRAM32_OFFSET(ds_sel, 0xFFB4);
+ CHECK_SMRAM32_OFFSET(fs_sel, 0xFFB8);
+ CHECK_SMRAM32_OFFSET(gs_sel, 0xFFBC);
+ CHECK_SMRAM32_OFFSET(ldtr_sel, 0xFFC0);
+ CHECK_SMRAM32_OFFSET(tr_sel, 0xFFC4);
+ CHECK_SMRAM32_OFFSET(dr7, 0xFFC8);
+ CHECK_SMRAM32_OFFSET(dr6, 0xFFCC);
+ CHECK_SMRAM32_OFFSET(gprs, 0xFFD0);
+ CHECK_SMRAM32_OFFSET(eip, 0xFFF0);
+ CHECK_SMRAM32_OFFSET(eflags, 0xFFF4);
+ CHECK_SMRAM32_OFFSET(cr3, 0xFFF8);
+ CHECK_SMRAM32_OFFSET(cr0, 0xFFFC);
+
+ /* 64 bit SMRAM image */
+ CHECK_SMRAM64_OFFSET(es, 0xFE00);
+ CHECK_SMRAM64_OFFSET(cs, 0xFE10);
+ CHECK_SMRAM64_OFFSET(ss, 0xFE20);
+ CHECK_SMRAM64_OFFSET(ds, 0xFE30);
+ CHECK_SMRAM64_OFFSET(fs, 0xFE40);
+ CHECK_SMRAM64_OFFSET(gs, 0xFE50);
+ CHECK_SMRAM64_OFFSET(gdtr, 0xFE60);
+ CHECK_SMRAM64_OFFSET(ldtr, 0xFE70);
+ CHECK_SMRAM64_OFFSET(idtr, 0xFE80);
+ CHECK_SMRAM64_OFFSET(tr, 0xFE90);
+ CHECK_SMRAM64_OFFSET(io_restart_rip, 0xFEA0);
+ CHECK_SMRAM64_OFFSET(io_restart_rcx, 0xFEA8);
+ CHECK_SMRAM64_OFFSET(io_restart_rsi, 0xFEB0);
+ CHECK_SMRAM64_OFFSET(io_restart_rdi, 0xFEB8);
+ CHECK_SMRAM64_OFFSET(io_restart_dword, 0xFEC0);
+ CHECK_SMRAM64_OFFSET(reserved1, 0xFEC4);
+ CHECK_SMRAM64_OFFSET(io_inst_restart, 0xFEC8);
+ CHECK_SMRAM64_OFFSET(auto_hlt_restart, 0xFEC9);
+ CHECK_SMRAM64_OFFSET(amd_nmi_mask, 0xFECA);
+ CHECK_SMRAM64_OFFSET(int_shadow, 0xFECB);
+ CHECK_SMRAM64_OFFSET(reserved2, 0xFECC);
+ CHECK_SMRAM64_OFFSET(efer, 0xFED0);
+ CHECK_SMRAM64_OFFSET(svm_guest_flag, 0xFED8);
+ CHECK_SMRAM64_OFFSET(svm_guest_vmcb_gpa, 0xFEE0);
+ CHECK_SMRAM64_OFFSET(svm_guest_virtual_int, 0xFEE8);
+ CHECK_SMRAM64_OFFSET(reserved3, 0xFEF0);
+ CHECK_SMRAM64_OFFSET(smm_revison, 0xFEFC);
+ CHECK_SMRAM64_OFFSET(smbase, 0xFF00);
+ CHECK_SMRAM64_OFFSET(reserved4, 0xFF04);
+ CHECK_SMRAM64_OFFSET(ssp, 0xFF18);
+ CHECK_SMRAM64_OFFSET(svm_guest_pat, 0xFF20);
+ CHECK_SMRAM64_OFFSET(svm_host_efer, 0xFF28);
+ CHECK_SMRAM64_OFFSET(svm_host_cr4, 0xFF30);
+ CHECK_SMRAM64_OFFSET(svm_host_cr3, 0xFF38);
+ CHECK_SMRAM64_OFFSET(svm_host_cr0, 0xFF40);
+ CHECK_SMRAM64_OFFSET(cr4, 0xFF48);
+ CHECK_SMRAM64_OFFSET(cr3, 0xFF50);
+ CHECK_SMRAM64_OFFSET(cr0, 0xFF58);
+ CHECK_SMRAM64_OFFSET(dr7, 0xFF60);
+ CHECK_SMRAM64_OFFSET(dr6, 0xFF68);
+ CHECK_SMRAM64_OFFSET(rflags, 0xFF70);
+ CHECK_SMRAM64_OFFSET(rip, 0xFF78);
+ CHECK_SMRAM64_OFFSET(gprs, 0xFF80);
+
+ BUILD_BUG_ON(sizeof(union kvm_smram) != 512);
+}
+
+#undef CHECK_SMRAM64_OFFSET
+#undef CHECK_SMRAM32_OFFSET
+
+
+void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm)
+{
+ trace_kvm_smm_transition(vcpu->vcpu_id, vcpu->arch.smbase, entering_smm);
+
+ if (entering_smm) {
+ vcpu->arch.hflags |= HF_SMM_MASK;
+ } else {
+ vcpu->arch.hflags &= ~(HF_SMM_MASK | HF_SMM_INSIDE_NMI_MASK);
+
+ /* Process a latched INIT or SMI, if any. */
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ /*
+ * Even if KVM_SET_SREGS2 loaded PDPTRs out of band,
+ * on SMM exit we still need to reload them from
+ * guest memory
+ */
+ vcpu->arch.pdptrs_from_userspace = false;
+ }
+
+ kvm_mmu_reset_context(vcpu);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_smm_changed);
+
+void process_smi(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.smi_pending = true;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+}
+
+static u32 enter_smm_get_segment_flags(struct kvm_segment *seg)
+{
+ u32 flags = 0;
+ flags |= seg->g << 23;
+ flags |= seg->db << 22;
+ flags |= seg->l << 21;
+ flags |= seg->avl << 20;
+ flags |= seg->present << 15;
+ flags |= seg->dpl << 13;
+ flags |= seg->s << 12;
+ flags |= seg->type << 8;
+ return flags;
+}
+
+static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu,
+ struct kvm_smm_seg_state_32 *state,
+ u32 *selector, int n)
+{
+ struct kvm_segment seg;
+
+ kvm_get_segment(vcpu, &seg, n);
+ *selector = seg.selector;
+ state->base = seg.base;
+ state->limit = seg.limit;
+ state->flags = enter_smm_get_segment_flags(&seg);
+}
+
+#ifdef CONFIG_X86_64
+static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu,
+ struct kvm_smm_seg_state_64 *state,
+ int n)
+{
+ struct kvm_segment seg;
+
+ kvm_get_segment(vcpu, &seg, n);
+ state->selector = seg.selector;
+ state->attributes = enter_smm_get_segment_flags(&seg) >> 8;
+ state->limit = seg.limit;
+ state->base = seg.base;
+}
+#endif
+
+static void enter_smm_save_state_32(struct kvm_vcpu *vcpu,
+ struct kvm_smram_state_32 *smram)
+{
+ struct desc_ptr dt;
+ int i;
+
+ smram->cr0 = kvm_read_cr0(vcpu);
+ smram->cr3 = kvm_read_cr3(vcpu);
+ smram->eflags = kvm_get_rflags(vcpu);
+ smram->eip = kvm_rip_read(vcpu);
+
+ for (i = 0; i < 8; i++)
+ smram->gprs[i] = kvm_register_read_raw(vcpu, i);
+
+ smram->dr6 = (u32)vcpu->arch.dr6;
+ smram->dr7 = (u32)vcpu->arch.dr7;
+
+ enter_smm_save_seg_32(vcpu, &smram->tr, &smram->tr_sel, VCPU_SREG_TR);
+ enter_smm_save_seg_32(vcpu, &smram->ldtr, &smram->ldtr_sel, VCPU_SREG_LDTR);
+
+ kvm_x86_call(get_gdt)(vcpu, &dt);
+ smram->gdtr.base = dt.address;
+ smram->gdtr.limit = dt.size;
+
+ kvm_x86_call(get_idt)(vcpu, &dt);
+ smram->idtr.base = dt.address;
+ smram->idtr.limit = dt.size;
+
+ enter_smm_save_seg_32(vcpu, &smram->es, &smram->es_sel, VCPU_SREG_ES);
+ enter_smm_save_seg_32(vcpu, &smram->cs, &smram->cs_sel, VCPU_SREG_CS);
+ enter_smm_save_seg_32(vcpu, &smram->ss, &smram->ss_sel, VCPU_SREG_SS);
+
+ enter_smm_save_seg_32(vcpu, &smram->ds, &smram->ds_sel, VCPU_SREG_DS);
+ enter_smm_save_seg_32(vcpu, &smram->fs, &smram->fs_sel, VCPU_SREG_FS);
+ enter_smm_save_seg_32(vcpu, &smram->gs, &smram->gs_sel, VCPU_SREG_GS);
+
+ smram->cr4 = kvm_read_cr4(vcpu);
+ smram->smm_revision = 0x00020000;
+ smram->smbase = vcpu->arch.smbase;
+
+ smram->int_shadow = kvm_x86_call(get_interrupt_shadow)(vcpu);
+}
+
+#ifdef CONFIG_X86_64
+static void enter_smm_save_state_64(struct kvm_vcpu *vcpu,
+ struct kvm_smram_state_64 *smram)
+{
+ struct desc_ptr dt;
+ int i;
+
+ for (i = 0; i < 16; i++)
+ smram->gprs[15 - i] = kvm_register_read_raw(vcpu, i);
+
+ smram->rip = kvm_rip_read(vcpu);
+ smram->rflags = kvm_get_rflags(vcpu);
+
+ smram->dr6 = vcpu->arch.dr6;
+ smram->dr7 = vcpu->arch.dr7;
+
+ smram->cr0 = kvm_read_cr0(vcpu);
+ smram->cr3 = kvm_read_cr3(vcpu);
+ smram->cr4 = kvm_read_cr4(vcpu);
+
+ smram->smbase = vcpu->arch.smbase;
+ smram->smm_revison = 0x00020064;
+
+ smram->efer = vcpu->arch.efer;
+
+ enter_smm_save_seg_64(vcpu, &smram->tr, VCPU_SREG_TR);
+
+ kvm_x86_call(get_idt)(vcpu, &dt);
+ smram->idtr.limit = dt.size;
+ smram->idtr.base = dt.address;
+
+ enter_smm_save_seg_64(vcpu, &smram->ldtr, VCPU_SREG_LDTR);
+
+ kvm_x86_call(get_gdt)(vcpu, &dt);
+ smram->gdtr.limit = dt.size;
+ smram->gdtr.base = dt.address;
+
+ enter_smm_save_seg_64(vcpu, &smram->es, VCPU_SREG_ES);
+ enter_smm_save_seg_64(vcpu, &smram->cs, VCPU_SREG_CS);
+ enter_smm_save_seg_64(vcpu, &smram->ss, VCPU_SREG_SS);
+ enter_smm_save_seg_64(vcpu, &smram->ds, VCPU_SREG_DS);
+ enter_smm_save_seg_64(vcpu, &smram->fs, VCPU_SREG_FS);
+ enter_smm_save_seg_64(vcpu, &smram->gs, VCPU_SREG_GS);
+
+ smram->int_shadow = kvm_x86_call(get_interrupt_shadow)(vcpu);
+
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) &&
+ kvm_msr_read(vcpu, MSR_KVM_INTERNAL_GUEST_SSP, &smram->ssp))
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+}
+#endif
+
+void enter_smm(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment cs, ds;
+ struct desc_ptr dt;
+ unsigned long cr0;
+ union kvm_smram smram;
+
+ check_smram_offsets();
+
+ memset(smram.bytes, 0, sizeof(smram.bytes));
+
+#ifdef CONFIG_X86_64
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
+ enter_smm_save_state_64(vcpu, &smram.smram64);
+ else
+#endif
+ enter_smm_save_state_32(vcpu, &smram.smram32);
+
+ /*
+ * Give the vendor's enter_smm() callback a chance to make ISA-specific
+ * changes to the vCPU state (e.g. leave guest mode) after we've saved
+ * the state into the SMM state-save area.
+ *
+ * Kill the VM in the unlikely case of failure, because the VM can be
+ * left in an undefined state.
+ */
+ if (kvm_x86_call(enter_smm)(vcpu, &smram))
+ goto error;
+
+ kvm_smm_changed(vcpu, true);
+
+ if (kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, &smram, sizeof(smram)))
+ goto error;
+
+ if (kvm_x86_call(get_nmi_mask)(vcpu))
+ vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
+ else
+ kvm_x86_call(set_nmi_mask)(vcpu, true);
+
+ kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
+ kvm_rip_write(vcpu, 0x8000);
+
+ kvm_x86_call(set_interrupt_shadow)(vcpu, 0);
+
+ cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
+ kvm_x86_call(set_cr0)(vcpu, cr0);
+
+ kvm_x86_call(set_cr4)(vcpu, 0);
+
+ /* Undocumented: IDT limit is set to zero on entry to SMM. */
+ dt.address = dt.size = 0;
+ kvm_x86_call(set_idt)(vcpu, &dt);
+
+ if (WARN_ON_ONCE(kvm_set_dr(vcpu, 7, DR7_FIXED_1)))
+ goto error;
+
+ cs.selector = (vcpu->arch.smbase >> 4) & 0xffff;
+ cs.base = vcpu->arch.smbase;
+
+ ds.selector = 0;
+ ds.base = 0;
+
+ cs.limit = ds.limit = 0xffffffff;
+ cs.type = ds.type = 0x3;
+ cs.dpl = ds.dpl = 0;
+ cs.db = ds.db = 0;
+ cs.s = ds.s = 1;
+ cs.l = ds.l = 0;
+ cs.g = ds.g = 1;
+ cs.avl = ds.avl = 0;
+ cs.present = ds.present = 1;
+ cs.unusable = ds.unusable = 0;
+ cs.padding = ds.padding = 0;
+
+ kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_DS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_ES);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_FS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_GS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_SS);
+
+#ifdef CONFIG_X86_64
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
+ if (kvm_x86_call(set_efer)(vcpu, 0))
+ goto error;
+#endif
+
+ vcpu->arch.cpuid_dynamic_bits_dirty = true;
+ kvm_mmu_reset_context(vcpu);
+ return;
+error:
+ kvm_vm_dead(vcpu->kvm);
+}
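
For orientation, both guest-physical addresses used by enter_smm() above are SMBASE-relative: the SMI handler starts executing at SMBASE + 0x8000 (CS.base = smbase, RIP = 0x8000), and the 512-byte state-save image written by kvm_vcpu_write_guest() sits at SMBASE + 0xFE00. A minimal sketch; smm_entry_point() and smm_save_area() are illustrative helpers, not part of this patch:

/* Illustrative only: SMBASE-relative guest-physical addresses used on SMM entry. */
static inline u64 smm_entry_point(u32 smbase)
{
	return (u64)smbase + 0x8000;	/* where execution resumes: CS.base = smbase, RIP = 0x8000 */
}

static inline u64 smm_save_area(u32 smbase)
{
	return (u64)smbase + 0xfe00;	/* 512-byte state-save image (union kvm_smram) */
}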
+
+static void rsm_set_desc_flags(struct kvm_segment *desc, u32 flags)
+{
+ desc->g = (flags >> 23) & 1;
+ desc->db = (flags >> 22) & 1;
+ desc->l = (flags >> 21) & 1;
+ desc->avl = (flags >> 20) & 1;
+ desc->present = (flags >> 15) & 1;
+ desc->dpl = (flags >> 13) & 3;
+ desc->s = (flags >> 12) & 1;
+ desc->type = (flags >> 8) & 15;
+
+ desc->unusable = !desc->present;
+ desc->padding = 0;
+}
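
rsm_set_desc_flags() above unpacks the descriptor attributes from the high word of a 32-bit flags value (type in bits 8-11, S in bit 12, DPL in bits 13-14, P in bit 15, AVL/L/DB/G in bits 20-23). As a hedged sketch, the save-side counterpart would pack a struct kvm_segment back into the same layout; pack_desc_flags() below is hypothetical and only illustrates the bit positions:

/* Hypothetical inverse of rsm_set_desc_flags(): pack segment attributes into flags. */
static u32 pack_desc_flags(const struct kvm_segment *desc)
{
	u32 flags = 0;

	flags |= (u32)(desc->type & 15) << 8;
	flags |= (u32)(desc->s & 1) << 12;
	flags |= (u32)(desc->dpl & 3) << 13;
	flags |= (u32)(desc->present & 1) << 15;
	flags |= (u32)(desc->avl & 1) << 20;
	flags |= (u32)(desc->l & 1) << 21;
	flags |= (u32)(desc->db & 1) << 22;
	flags |= (u32)(desc->g & 1) << 23;

	return flags;
}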
+
+static int rsm_load_seg_32(struct kvm_vcpu *vcpu,
+ const struct kvm_smm_seg_state_32 *state,
+ u16 selector, int n)
+{
+ struct kvm_segment desc;
+
+ desc.selector = selector;
+ desc.base = state->base;
+ desc.limit = state->limit;
+ rsm_set_desc_flags(&desc, state->flags);
+ kvm_set_segment(vcpu, &desc, n);
+ return X86EMUL_CONTINUE;
+}
+
+#ifdef CONFIG_X86_64
+
+static int rsm_load_seg_64(struct kvm_vcpu *vcpu,
+ const struct kvm_smm_seg_state_64 *state,
+ int n)
+{
+ struct kvm_segment desc;
+
+ desc.selector = state->selector;
+ rsm_set_desc_flags(&desc, state->attributes << 8);
+ desc.limit = state->limit;
+ desc.base = state->base;
+ kvm_set_segment(vcpu, &desc, n);
+ return X86EMUL_CONTINUE;
+}
+#endif
+
+static int rsm_enter_protected_mode(struct kvm_vcpu *vcpu,
+ u64 cr0, u64 cr3, u64 cr4)
+{
+ int bad;
+ u64 pcid;
+
+ /* In order to later set CR4.PCIDE, CR3[11:0] must be zero. */
+ pcid = 0;
+ if (cr4 & X86_CR4_PCIDE) {
+ pcid = cr3 & 0xfff;
+ cr3 &= ~0xfff;
+ }
+
+ bad = kvm_set_cr3(vcpu, cr3);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+
+ /*
+ * First enable PAE, long mode needs it before CR0.PG = 1 is set.
+ * Then enable protected mode. However, PCID cannot be enabled
+ * if EFER.LMA=0, so set it separately.
+ */
+ bad = kvm_set_cr4(vcpu, cr4 & ~X86_CR4_PCIDE);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+
+ bad = kvm_set_cr0(vcpu, cr0);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+
+ if (cr4 & X86_CR4_PCIDE) {
+ bad = kvm_set_cr4(vcpu, cr4);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+ if (pcid) {
+ bad = kvm_set_cr3(vcpu, cr3 | pcid);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+ }
+
+ }
+
+ return X86EMUL_CONTINUE;
+}
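
The ordering in rsm_enter_protected_mode() matters because CR4.PCIDE may only be set once long mode is active and CR3[11:0] is zero, so the PCID is stripped first and written back last. A worked trace under an assumed saved CR3 of 0x12345007 with CR4.PCIDE set; split_cr3() is an illustrative helper, not part of this patch:

/* Illustrative helper: split a saved CR3 the way the code above does. */
static void split_cr3(u64 cr3, u64 *base, u64 *pcid)
{
	*pcid = cr3 & 0xfff;		/* PCID lives in CR3[11:0] */
	*base = cr3 & ~0xfffULL;	/* page-table base; low 12 bits must be zero */
}

/*
 * Assumed saved state: cr3 = 0x12345007 (base 0x12345000, PCID 0x007),
 * cr4 has X86_CR4_PCIDE set. Load order performed above:
 *   1. kvm_set_cr3(vcpu, 0x12345000)             CR3 without the PCID
 *   2. kvm_set_cr4(vcpu, cr4 & ~X86_CR4_PCIDE)   PAE etc., PCIDE still clear
 *   3. kvm_set_cr0(vcpu, cr0)                    enable paging/protection
 *   4. kvm_set_cr4(vcpu, cr4)                    PCIDE may be set now
 *   5. kvm_set_cr3(vcpu, 0x12345007)             finally restore the PCID
 */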
+
+static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt,
+ const struct kvm_smram_state_32 *smstate)
+{
+ struct kvm_vcpu *vcpu = ctxt->vcpu;
+ struct desc_ptr dt;
+ int i, r;
+
+ ctxt->eflags = smstate->eflags | X86_EFLAGS_FIXED;
+ ctxt->_eip = smstate->eip;
+
+ for (i = 0; i < 8; i++)
+ *reg_write(ctxt, i) = smstate->gprs[i];
+
+ if (kvm_set_dr(vcpu, 6, smstate->dr6))
+ return X86EMUL_UNHANDLEABLE;
+ if (kvm_set_dr(vcpu, 7, smstate->dr7))
+ return X86EMUL_UNHANDLEABLE;
+
+ rsm_load_seg_32(vcpu, &smstate->tr, smstate->tr_sel, VCPU_SREG_TR);
+ rsm_load_seg_32(vcpu, &smstate->ldtr, smstate->ldtr_sel, VCPU_SREG_LDTR);
+
+ dt.address = smstate->gdtr.base;
+ dt.size = smstate->gdtr.limit;
+ kvm_x86_call(set_gdt)(vcpu, &dt);
+
+ dt.address = smstate->idtr.base;
+ dt.size = smstate->idtr.limit;
+ kvm_x86_call(set_idt)(vcpu, &dt);
+
+ rsm_load_seg_32(vcpu, &smstate->es, smstate->es_sel, VCPU_SREG_ES);
+ rsm_load_seg_32(vcpu, &smstate->cs, smstate->cs_sel, VCPU_SREG_CS);
+ rsm_load_seg_32(vcpu, &smstate->ss, smstate->ss_sel, VCPU_SREG_SS);
+
+ rsm_load_seg_32(vcpu, &smstate->ds, smstate->ds_sel, VCPU_SREG_DS);
+ rsm_load_seg_32(vcpu, &smstate->fs, smstate->fs_sel, VCPU_SREG_FS);
+ rsm_load_seg_32(vcpu, &smstate->gs, smstate->gs_sel, VCPU_SREG_GS);
+
+ vcpu->arch.smbase = smstate->smbase;
+
+ r = rsm_enter_protected_mode(vcpu, smstate->cr0,
+ smstate->cr3, smstate->cr4);
+
+ if (r != X86EMUL_CONTINUE)
+ return r;
+
+ kvm_x86_call(set_interrupt_shadow)(vcpu, 0);
+ ctxt->interruptibility = (u8)smstate->int_shadow;
+
+ return r;
+}
+
+#ifdef CONFIG_X86_64
+static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt,
+ const struct kvm_smram_state_64 *smstate)
+{
+ struct kvm_vcpu *vcpu = ctxt->vcpu;
+ struct desc_ptr dt;
+ int i, r;
+
+ for (i = 0; i < 16; i++)
+ *reg_write(ctxt, i) = smstate->gprs[15 - i];
+
+ ctxt->_eip = smstate->rip;
+ ctxt->eflags = smstate->rflags | X86_EFLAGS_FIXED;
+
+ if (kvm_set_dr(vcpu, 6, smstate->dr6))
+ return X86EMUL_UNHANDLEABLE;
+ if (kvm_set_dr(vcpu, 7, smstate->dr7))
+ return X86EMUL_UNHANDLEABLE;
+
+ vcpu->arch.smbase = smstate->smbase;
+
+ if (__kvm_emulate_msr_write(vcpu, MSR_EFER, smstate->efer & ~EFER_LMA))
+ return X86EMUL_UNHANDLEABLE;
+
+ rsm_load_seg_64(vcpu, &smstate->tr, VCPU_SREG_TR);
+
+ dt.size = smstate->idtr.limit;
+ dt.address = smstate->idtr.base;
+ kvm_x86_call(set_idt)(vcpu, &dt);
+
+ rsm_load_seg_64(vcpu, &smstate->ldtr, VCPU_SREG_LDTR);
+
+ dt.size = smstate->gdtr.limit;
+ dt.address = smstate->gdtr.base;
+ kvm_x86_call(set_gdt)(vcpu, &dt);
+
+ r = rsm_enter_protected_mode(vcpu, smstate->cr0, smstate->cr3, smstate->cr4);
+ if (r != X86EMUL_CONTINUE)
+ return r;
+
+ rsm_load_seg_64(vcpu, &smstate->es, VCPU_SREG_ES);
+ rsm_load_seg_64(vcpu, &smstate->cs, VCPU_SREG_CS);
+ rsm_load_seg_64(vcpu, &smstate->ss, VCPU_SREG_SS);
+ rsm_load_seg_64(vcpu, &smstate->ds, VCPU_SREG_DS);
+ rsm_load_seg_64(vcpu, &smstate->fs, VCPU_SREG_FS);
+ rsm_load_seg_64(vcpu, &smstate->gs, VCPU_SREG_GS);
+
+ kvm_x86_call(set_interrupt_shadow)(vcpu, 0);
+ ctxt->interruptibility = (u8)smstate->int_shadow;
+
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) &&
+ kvm_msr_write(vcpu, MSR_KVM_INTERNAL_GUEST_SSP, smstate->ssp))
+ return X86EMUL_UNHANDLEABLE;
+
+ return X86EMUL_CONTINUE;
+}
+#endif
+
+int emulator_leave_smm(struct x86_emulate_ctxt *ctxt)
+{
+ struct kvm_vcpu *vcpu = ctxt->vcpu;
+ unsigned long cr0;
+ union kvm_smram smram;
+ u64 smbase;
+ int ret;
+
+ smbase = vcpu->arch.smbase;
+
+ ret = kvm_vcpu_read_guest(vcpu, smbase + 0xfe00, smram.bytes, sizeof(smram));
+ if (ret < 0)
+ return X86EMUL_UNHANDLEABLE;
+
+ if ((vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK) == 0)
+ kvm_x86_call(set_nmi_mask)(vcpu, false);
+
+ kvm_smm_changed(vcpu, false);
+
+ /*
+ * Get back to real mode, to prepare a safe state in which to load
+ * CR0/CR3/CR4/EFER. It's all a bit more complicated if the vCPU
+ * supports long mode.
+ */
+#ifdef CONFIG_X86_64
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) {
+ struct kvm_segment cs_desc;
+ unsigned long cr4;
+
+ /* Zero CR4.PCIDE before CR0.PG. */
+ cr4 = kvm_read_cr4(vcpu);
+ if (cr4 & X86_CR4_PCIDE)
+ kvm_set_cr4(vcpu, cr4 & ~X86_CR4_PCIDE);
+
+ /* A 32-bit code segment is required to clear EFER.LMA. */
+ memset(&cs_desc, 0, sizeof(cs_desc));
+ cs_desc.type = 0xb;
+ cs_desc.s = cs_desc.g = cs_desc.present = 1;
+ kvm_set_segment(vcpu, &cs_desc, VCPU_SREG_CS);
+ }
+#endif
+
+ /* For the 64-bit case, this will clear EFER.LMA. */
+ cr0 = kvm_read_cr0(vcpu);
+ if (cr0 & X86_CR0_PE)
+ kvm_set_cr0(vcpu, cr0 & ~(X86_CR0_PG | X86_CR0_PE));
+
+#ifdef CONFIG_X86_64
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) {
+ unsigned long cr4, efer;
+
+ /* Clear CR4.PAE before clearing EFER.LME. */
+ cr4 = kvm_read_cr4(vcpu);
+ if (cr4 & X86_CR4_PAE)
+ kvm_set_cr4(vcpu, cr4 & ~X86_CR4_PAE);
+
+ /* And finally go back to 32-bit mode. */
+ efer = 0;
+ __kvm_emulate_msr_write(vcpu, MSR_EFER, efer);
+ }
+#endif
+
+ /*
+ * FIXME: When resuming L2 (a.k.a. guest mode), the transition to guest
+ * mode should happen _after_ loading state from SMRAM. However, KVM
+ * piggybacks the nested VM-Enter flows (which is wrong for many other
+ * reasons), and so nSVM/nVMX would clobber state that is loaded from
+ * SMRAM and from the VMCS/VMCB.
+ */
+ if (kvm_x86_call(leave_smm)(vcpu, &smram))
+ return X86EMUL_UNHANDLEABLE;
+
+#ifdef CONFIG_X86_64
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
+ ret = rsm_load_state_64(ctxt, &smram.smram64);
+ else
+#endif
+ ret = rsm_load_state_32(ctxt, &smram.smram32);
+
+ /*
+ * If RSM fails and triggers shutdown, architecturally the shutdown
+ * occurs *before* the transition to guest mode. But due to KVM's
+ * flawed handling of RSM to L2 (see above), the vCPU may already be
+ * in_guest_mode(). Force the vCPU out of guest mode before delivering
+ * the shutdown, so that L1 enters shutdown instead of seeing a VM-Exit
+ * that architecturally shouldn't be possible.
+ */
+ if (ret != X86EMUL_CONTINUE && is_guest_mode(vcpu))
+ kvm_leave_nested(vcpu);
+ return ret;
+}
diff --git a/arch/x86/kvm/smm.h b/arch/x86/kvm/smm.h
new file mode 100644
index 000000000000..db3c88f16138
--- /dev/null
+++ b/arch/x86/kvm/smm.h
@@ -0,0 +1,171 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ASM_KVM_SMM_H
+#define ASM_KVM_SMM_H
+
+#include <linux/build_bug.h>
+
+#ifdef CONFIG_KVM_SMM
+
+
+/*
+ * KVM's emulated 32-bit SMM layout. Based on the Intel P6 layout
+ * (https://www.sandpile.org/x86/smm.htm).
+ */
+
+struct kvm_smm_seg_state_32 {
+ u32 flags;
+ u32 limit;
+ u32 base;
+} __packed;
+
+struct kvm_smram_state_32 {
+ u32 reserved1[62];
+ u32 smbase;
+ u32 smm_revision;
+ u16 io_inst_restart;
+ u16 auto_hlt_restart;
+ u32 io_restart_rdi;
+ u32 io_restart_rcx;
+ u32 io_restart_rsi;
+ u32 io_restart_rip;
+ u32 cr4;
+
+ /* A20M#, CPL, shutdown and other reserved/undocumented fields */
+ u16 reserved2;
+ u8 int_shadow; /* KVM extension */
+ u8 reserved3[17];
+
+ struct kvm_smm_seg_state_32 ds;
+ struct kvm_smm_seg_state_32 fs;
+ struct kvm_smm_seg_state_32 gs;
+ struct kvm_smm_seg_state_32 idtr; /* IDTR has only base and limit */
+ struct kvm_smm_seg_state_32 tr;
+ u32 reserved;
+ struct kvm_smm_seg_state_32 gdtr; /* GDTR has only base and limit */
+ struct kvm_smm_seg_state_32 ldtr;
+ struct kvm_smm_seg_state_32 es;
+ struct kvm_smm_seg_state_32 cs;
+ struct kvm_smm_seg_state_32 ss;
+
+ u32 es_sel;
+ u32 cs_sel;
+ u32 ss_sel;
+ u32 ds_sel;
+ u32 fs_sel;
+ u32 gs_sel;
+ u32 ldtr_sel;
+ u32 tr_sel;
+
+ u32 dr7;
+ u32 dr6;
+ u32 gprs[8]; /* GPRS in the "natural" X86 order (EAX/ECX/EDX.../EDI) */
+ u32 eip;
+ u32 eflags;
+ u32 cr3;
+ u32 cr0;
+} __packed;
+
+
+/* KVM's emulated 64-bit SMM layout. Based on the AMD64 layout. */
+
+struct kvm_smm_seg_state_64 {
+ u16 selector;
+ u16 attributes;
+ u32 limit;
+ u64 base;
+};
+
+struct kvm_smram_state_64 {
+
+ struct kvm_smm_seg_state_64 es;
+ struct kvm_smm_seg_state_64 cs;
+ struct kvm_smm_seg_state_64 ss;
+ struct kvm_smm_seg_state_64 ds;
+ struct kvm_smm_seg_state_64 fs;
+ struct kvm_smm_seg_state_64 gs;
+ struct kvm_smm_seg_state_64 gdtr; /* GDTR has only base and limit */
+ struct kvm_smm_seg_state_64 ldtr;
+ struct kvm_smm_seg_state_64 idtr; /* IDTR has only base and limit */
+ struct kvm_smm_seg_state_64 tr;
+
+ /* I/O restart and auto halt restart are not implemented by KVM */
+ u64 io_restart_rip;
+ u64 io_restart_rcx;
+ u64 io_restart_rsi;
+ u64 io_restart_rdi;
+ u32 io_restart_dword;
+ u32 reserved1;
+ u8 io_inst_restart;
+ u8 auto_hlt_restart;
+ u8 amd_nmi_mask; /* Documented in AMD BKDG as NMI mask, not used by KVM */
+ u8 int_shadow;
+ u32 reserved2;
+
+ u64 efer;
+
+ /*
+ * The two fields below are implemented on AMD only; they store the
+ * SVM guest VMCB address if the #SMI was received while in guest mode.
+ */
+ u64 svm_guest_flag;
+ u64 svm_guest_vmcb_gpa;
+ u64 svm_guest_virtual_int; /* unknown purpose, not implemented */
+
+ u32 reserved3[3];
+ u32 smm_revison;
+ u32 smbase;
+ u32 reserved4[5];
+
+ u64 ssp;
+ /* svm_* fields below are not implemented by KVM */
+ u64 svm_guest_pat;
+ u64 svm_host_efer;
+ u64 svm_host_cr4;
+ u64 svm_host_cr3;
+ u64 svm_host_cr0;
+
+ u64 cr4;
+ u64 cr3;
+ u64 cr0;
+ u64 dr7;
+ u64 dr6;
+ u64 rflags;
+ u64 rip;
+ u64 gprs[16]; /* GPRS in a reversed "natural" X86 order (R15/R14/.../RCX/RAX) */
+};
+
+union kvm_smram {
+ struct kvm_smram_state_64 smram64;
+ struct kvm_smram_state_32 smram32;
+ u8 bytes[512];
+};
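
check_smram_offsets(), called from enter_smm() and defined elsewhere in smm.c (not shown in this hunk), is expected to validate this layout at build time. A minimal sketch of the kind of assertions involved, assuming the state-save image is exactly 512 bytes; example_smram_layout_checks() is illustrative only:

/* Illustrative layout checks; the real check_smram_offsets() lives in smm.c. */
static inline void example_smram_layout_checks(void)
{
	BUILD_BUG_ON(sizeof(union kvm_smram) != 512);
	BUILD_BUG_ON(sizeof(struct kvm_smram_state_32) > 512);
	BUILD_BUG_ON(sizeof(struct kvm_smram_state_64) > 512);
}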
+
+static inline int kvm_inject_smi(struct kvm_vcpu *vcpu)
+{
+ if (!kvm_x86_call(has_emulated_msr)(vcpu->kvm, MSR_IA32_SMBASE))
+ return -ENOTTY;
+
+ kvm_make_request(KVM_REQ_SMI, vcpu);
+ return 0;
+}
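
As a usage sketch only: a caller that wants to raise an SMI on every vCPU of a VM could loop with kvm_for_each_vcpu(); example_broadcast_smi() below is hypothetical and not part of this patch:

/* Hypothetical caller: request an SMI on every vCPU of a VM. */
static int example_broadcast_smi(struct kvm *kvm)
{
	struct kvm_vcpu *vcpu;
	unsigned long i;
	int ret = 0;

	kvm_for_each_vcpu(i, vcpu, kvm) {
		ret = kvm_inject_smi(vcpu);	/* -ENOTTY if SMM is not emulated */
		if (ret)
			break;
	}
	return ret;
}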
+
+static inline bool is_smm(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hflags & HF_SMM_MASK;
+}
+
+void kvm_smm_changed(struct kvm_vcpu *vcpu, bool in_smm);
+void enter_smm(struct kvm_vcpu *vcpu);
+int emulator_leave_smm(struct x86_emulate_ctxt *ctxt);
+void process_smi(struct kvm_vcpu *vcpu);
+#else
+static inline int kvm_inject_smi(struct kvm_vcpu *vcpu) { return -ENOTTY; }
+static inline bool is_smm(struct kvm_vcpu *vcpu) { return false; }
+
+/*
+ * emulator_leave_smm is used as a function pointer, so the
+ * stub is defined in x86.c.
+ */
+#endif
+
+#endif
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
deleted file mode 100644
index b1f658ad2f06..000000000000
--- a/arch/x86/kvm/svm.c
+++ /dev/null
@@ -1,2750 +0,0 @@
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * AMD SVM support
- *
- * Copyright (C) 2006 Qumranet, Inc.
- *
- * Authors:
- * Yaniv Kamay <yaniv@qumranet.com>
- * Avi Kivity <avi@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
- */
-#include <linux/kvm_host.h>
-
-#include "kvm_svm.h"
-#include "irq.h"
-#include "mmu.h"
-#include "kvm_cache_regs.h"
-#include "x86.h"
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/vmalloc.h>
-#include <linux/highmem.h>
-#include <linux/sched.h>
-
-#include <asm/desc.h>
-
-#include <asm/virtext.h>
-
-#define __ex(x) __kvm_handle_fault_on_reboot(x)
-
-MODULE_AUTHOR("Qumranet");
-MODULE_LICENSE("GPL");
-
-#define IOPM_ALLOC_ORDER 2
-#define MSRPM_ALLOC_ORDER 1
-
-#define SEG_TYPE_LDT 2
-#define SEG_TYPE_BUSY_TSS16 3
-
-#define SVM_FEATURE_NPT (1 << 0)
-#define SVM_FEATURE_LBRV (1 << 1)
-#define SVM_FEATURE_SVML (1 << 2)
-
-#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
-
-/* Turn on to get debugging output*/
-/* #define NESTED_DEBUG */
-
-#ifdef NESTED_DEBUG
-#define nsvm_printk(fmt, args...) printk(KERN_INFO fmt, ## args)
-#else
-#define nsvm_printk(fmt, args...) do {} while(0)
-#endif
-
-/* enable NPT for AMD64 and X86 with PAE */
-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
-static bool npt_enabled = true;
-#else
-static bool npt_enabled = false;
-#endif
-static int npt = 1;
-
-module_param(npt, int, S_IRUGO);
-
-static int nested = 0;
-module_param(nested, int, S_IRUGO);
-
-static void svm_flush_tlb(struct kvm_vcpu *vcpu);
-
-static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override);
-static int nested_svm_vmexit(struct vcpu_svm *svm);
-static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb,
- void *arg2, void *opaque);
-static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
- bool has_error_code, u32 error_code);
-
-static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
-{
- return container_of(vcpu, struct vcpu_svm, vcpu);
-}
-
-static inline bool is_nested(struct vcpu_svm *svm)
-{
- return svm->nested_vmcb;
-}
-
-static unsigned long iopm_base;
-
-struct kvm_ldttss_desc {
- u16 limit0;
- u16 base0;
- unsigned base1 : 8, type : 5, dpl : 2, p : 1;
- unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
- u32 base3;
- u32 zero1;
-} __attribute__((packed));
-
-struct svm_cpu_data {
- int cpu;
-
- u64 asid_generation;
- u32 max_asid;
- u32 next_asid;
- struct kvm_ldttss_desc *tss_desc;
-
- struct page *save_area;
-};
-
-static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
-static uint32_t svm_features;
-
-struct svm_init_data {
- int cpu;
- int r;
-};
-
-static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
-
-#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
-#define MSRS_RANGE_SIZE 2048
-#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
-
-#define MAX_INST_SIZE 15
-
-static inline u32 svm_has(u32 feat)
-{
- return svm_features & feat;
-}
-
-static inline void clgi(void)
-{
- asm volatile (__ex(SVM_CLGI));
-}
-
-static inline void stgi(void)
-{
- asm volatile (__ex(SVM_STGI));
-}
-
-static inline void invlpga(unsigned long addr, u32 asid)
-{
- asm volatile (__ex(SVM_INVLPGA) :: "a"(addr), "c"(asid));
-}
-
-static inline unsigned long kvm_read_cr2(void)
-{
- unsigned long cr2;
-
- asm volatile ("mov %%cr2, %0" : "=r" (cr2));
- return cr2;
-}
-
-static inline void kvm_write_cr2(unsigned long val)
-{
- asm volatile ("mov %0, %%cr2" :: "r" (val));
-}
-
-static inline void force_new_asid(struct kvm_vcpu *vcpu)
-{
- to_svm(vcpu)->asid_generation--;
-}
-
-static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
-{
- force_new_asid(vcpu);
-}
-
-static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
-{
- if (!npt_enabled && !(efer & EFER_LMA))
- efer &= ~EFER_LME;
-
- to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
- vcpu->arch.shadow_efer = efer;
-}
-
-static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
- bool has_error_code, u32 error_code)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- /* If we are within a nested VM we'd better #VMEXIT and let the
- guest handle the exception */
- if (nested_svm_check_exception(svm, nr, has_error_code, error_code))
- return;
-
- svm->vmcb->control.event_inj = nr
- | SVM_EVTINJ_VALID
- | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
- | SVM_EVTINJ_TYPE_EXEPT;
- svm->vmcb->control.event_inj_err = error_code;
-}
-
-static int is_external_interrupt(u32 info)
-{
- info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
- return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
-}
-
-static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- u32 ret = 0;
-
- if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
- ret |= X86_SHADOW_INT_STI | X86_SHADOW_INT_MOV_SS;
- return ret & mask;
-}
-
-static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- if (mask == 0)
- svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
- else
- svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
-
-}
-
-static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- if (!svm->next_rip) {
- if (emulate_instruction(vcpu, vcpu->run, 0, 0, EMULTYPE_SKIP) !=
- EMULATE_DONE)
- printk(KERN_DEBUG "%s: NOP\n", __func__);
- return;
- }
- if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
- printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n",
- __func__, kvm_rip_read(vcpu), svm->next_rip);
-
- kvm_rip_write(vcpu, svm->next_rip);
- svm_set_interrupt_shadow(vcpu, 0);
-}
-
-static int has_svm(void)
-{
- const char *msg;
-
- if (!cpu_has_svm(&msg)) {
- printk(KERN_INFO "has_svm: %s\n", msg);
- return 0;
- }
-
- return 1;
-}
-
-static void svm_hardware_disable(void *garbage)
-{
- cpu_svm_disable();
-}
-
-static void svm_hardware_enable(void *garbage)
-{
-
- struct svm_cpu_data *svm_data;
- uint64_t efer;
- struct desc_ptr gdt_descr;
- struct desc_struct *gdt;
- int me = raw_smp_processor_id();
-
- if (!has_svm()) {
- printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me);
- return;
- }
- svm_data = per_cpu(svm_data, me);
-
- if (!svm_data) {
- printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n",
- me);
- return;
- }
-
- svm_data->asid_generation = 1;
- svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
- svm_data->next_asid = svm_data->max_asid + 1;
-
- asm volatile ("sgdt %0" : "=m"(gdt_descr));
- gdt = (struct desc_struct *)gdt_descr.address;
- svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
-
- rdmsrl(MSR_EFER, efer);
- wrmsrl(MSR_EFER, efer | EFER_SVME);
-
- wrmsrl(MSR_VM_HSAVE_PA,
- page_to_pfn(svm_data->save_area) << PAGE_SHIFT);
-}
-
-static void svm_cpu_uninit(int cpu)
-{
- struct svm_cpu_data *svm_data
- = per_cpu(svm_data, raw_smp_processor_id());
-
- if (!svm_data)
- return;
-
- per_cpu(svm_data, raw_smp_processor_id()) = NULL;
- __free_page(svm_data->save_area);
- kfree(svm_data);
-}
-
-static int svm_cpu_init(int cpu)
-{
- struct svm_cpu_data *svm_data;
- int r;
-
- svm_data = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
- if (!svm_data)
- return -ENOMEM;
- svm_data->cpu = cpu;
- svm_data->save_area = alloc_page(GFP_KERNEL);
- r = -ENOMEM;
- if (!svm_data->save_area)
- goto err_1;
-
- per_cpu(svm_data, cpu) = svm_data;
-
- return 0;
-
-err_1:
- kfree(svm_data);
- return r;
-
-}
-
-static void set_msr_interception(u32 *msrpm, unsigned msr,
- int read, int write)
-{
- int i;
-
- for (i = 0; i < NUM_MSR_MAPS; i++) {
- if (msr >= msrpm_ranges[i] &&
- msr < msrpm_ranges[i] + MSRS_IN_RANGE) {
- u32 msr_offset = (i * MSRS_IN_RANGE + msr -
- msrpm_ranges[i]) * 2;
-
- u32 *base = msrpm + (msr_offset / 32);
- u32 msr_shift = msr_offset % 32;
- u32 mask = ((write) ? 0 : 2) | ((read) ? 0 : 1);
- *base = (*base & ~(0x3 << msr_shift)) |
- (mask << msr_shift);
- return;
- }
- }
- BUG();
-}
-
-static void svm_vcpu_init_msrpm(u32 *msrpm)
-{
- memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
-
-#ifdef CONFIG_X86_64
- set_msr_interception(msrpm, MSR_GS_BASE, 1, 1);
- set_msr_interception(msrpm, MSR_FS_BASE, 1, 1);
- set_msr_interception(msrpm, MSR_KERNEL_GS_BASE, 1, 1);
- set_msr_interception(msrpm, MSR_LSTAR, 1, 1);
- set_msr_interception(msrpm, MSR_CSTAR, 1, 1);
- set_msr_interception(msrpm, MSR_SYSCALL_MASK, 1, 1);
-#endif
- set_msr_interception(msrpm, MSR_K6_STAR, 1, 1);
- set_msr_interception(msrpm, MSR_IA32_SYSENTER_CS, 1, 1);
- set_msr_interception(msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
- set_msr_interception(msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
-}
-
-static void svm_enable_lbrv(struct vcpu_svm *svm)
-{
- u32 *msrpm = svm->msrpm;
-
- svm->vmcb->control.lbr_ctl = 1;
- set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
- set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
- set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
- set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
-}
-
-static void svm_disable_lbrv(struct vcpu_svm *svm)
-{
- u32 *msrpm = svm->msrpm;
-
- svm->vmcb->control.lbr_ctl = 0;
- set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
- set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
- set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
- set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
-}
-
-static __init int svm_hardware_setup(void)
-{
- int cpu;
- struct page *iopm_pages;
- void *iopm_va;
- int r;
-
- iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
-
- if (!iopm_pages)
- return -ENOMEM;
-
- iopm_va = page_address(iopm_pages);
- memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
- iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
-
- if (boot_cpu_has(X86_FEATURE_NX))
- kvm_enable_efer_bits(EFER_NX);
-
- if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
- kvm_enable_efer_bits(EFER_FFXSR);
-
- if (nested) {
- printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
- kvm_enable_efer_bits(EFER_SVME);
- }
-
- for_each_online_cpu(cpu) {
- r = svm_cpu_init(cpu);
- if (r)
- goto err;
- }
-
- svm_features = cpuid_edx(SVM_CPUID_FUNC);
-
- if (!svm_has(SVM_FEATURE_NPT))
- npt_enabled = false;
-
- if (npt_enabled && !npt) {
- printk(KERN_INFO "kvm: Nested Paging disabled\n");
- npt_enabled = false;
- }
-
- if (npt_enabled) {
- printk(KERN_INFO "kvm: Nested Paging enabled\n");
- kvm_enable_tdp();
- } else
- kvm_disable_tdp();
-
- return 0;
-
-err:
- __free_pages(iopm_pages, IOPM_ALLOC_ORDER);
- iopm_base = 0;
- return r;
-}
-
-static __exit void svm_hardware_unsetup(void)
-{
- int cpu;
-
- for_each_online_cpu(cpu)
- svm_cpu_uninit(cpu);
-
- __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
- iopm_base = 0;
-}
-
-static void init_seg(struct vmcb_seg *seg)
-{
- seg->selector = 0;
- seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
- SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
- seg->limit = 0xffff;
- seg->base = 0;
-}
-
-static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
-{
- seg->selector = 0;
- seg->attrib = SVM_SELECTOR_P_MASK | type;
- seg->limit = 0xffff;
- seg->base = 0;
-}
-
-static void init_vmcb(struct vcpu_svm *svm)
-{
- struct vmcb_control_area *control = &svm->vmcb->control;
- struct vmcb_save_area *save = &svm->vmcb->save;
-
- control->intercept_cr_read = INTERCEPT_CR0_MASK |
- INTERCEPT_CR3_MASK |
- INTERCEPT_CR4_MASK;
-
- control->intercept_cr_write = INTERCEPT_CR0_MASK |
- INTERCEPT_CR3_MASK |
- INTERCEPT_CR4_MASK |
- INTERCEPT_CR8_MASK;
-
- control->intercept_dr_read = INTERCEPT_DR0_MASK |
- INTERCEPT_DR1_MASK |
- INTERCEPT_DR2_MASK |
- INTERCEPT_DR3_MASK;
-
- control->intercept_dr_write = INTERCEPT_DR0_MASK |
- INTERCEPT_DR1_MASK |
- INTERCEPT_DR2_MASK |
- INTERCEPT_DR3_MASK |
- INTERCEPT_DR5_MASK |
- INTERCEPT_DR7_MASK;
-
- control->intercept_exceptions = (1 << PF_VECTOR) |
- (1 << UD_VECTOR) |
- (1 << MC_VECTOR);
-
-
- control->intercept = (1ULL << INTERCEPT_INTR) |
- (1ULL << INTERCEPT_NMI) |
- (1ULL << INTERCEPT_SMI) |
- (1ULL << INTERCEPT_CPUID) |
- (1ULL << INTERCEPT_INVD) |
- (1ULL << INTERCEPT_HLT) |
- (1ULL << INTERCEPT_INVLPG) |
- (1ULL << INTERCEPT_INVLPGA) |
- (1ULL << INTERCEPT_IOIO_PROT) |
- (1ULL << INTERCEPT_MSR_PROT) |
- (1ULL << INTERCEPT_TASK_SWITCH) |
- (1ULL << INTERCEPT_SHUTDOWN) |
- (1ULL << INTERCEPT_VMRUN) |
- (1ULL << INTERCEPT_VMMCALL) |
- (1ULL << INTERCEPT_VMLOAD) |
- (1ULL << INTERCEPT_VMSAVE) |
- (1ULL << INTERCEPT_STGI) |
- (1ULL << INTERCEPT_CLGI) |
- (1ULL << INTERCEPT_SKINIT) |
- (1ULL << INTERCEPT_WBINVD) |
- (1ULL << INTERCEPT_MONITOR) |
- (1ULL << INTERCEPT_MWAIT);
-
- control->iopm_base_pa = iopm_base;
- control->msrpm_base_pa = __pa(svm->msrpm);
- control->tsc_offset = 0;
- control->int_ctl = V_INTR_MASKING_MASK;
-
- init_seg(&save->es);
- init_seg(&save->ss);
- init_seg(&save->ds);
- init_seg(&save->fs);
- init_seg(&save->gs);
-
- save->cs.selector = 0xf000;
- /* Executable/Readable Code Segment */
- save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
- SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
- save->cs.limit = 0xffff;
- /*
- * cs.base should really be 0xffff0000, but vmx can't handle that, so
- * be consistent with it.
- *
- * Replace when we have real mode working for vmx.
- */
- save->cs.base = 0xf0000;
-
- save->gdtr.limit = 0xffff;
- save->idtr.limit = 0xffff;
-
- init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
- init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
-
- save->efer = EFER_SVME;
- save->dr6 = 0xffff0ff0;
- save->dr7 = 0x400;
- save->rflags = 2;
- save->rip = 0x0000fff0;
- svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
-
- /*
- * cr0 val on cpu init should be 0x60000010, we enable cpu
- * cache by default. the orderly way is to enable cache in bios.
- */
- save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP;
- save->cr4 = X86_CR4_PAE;
- /* rdx = ?? */
-
- if (npt_enabled) {
- /* Setup VMCB for Nested Paging */
- control->nested_ctl = 1;
- control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) |
- (1ULL << INTERCEPT_INVLPG));
- control->intercept_exceptions &= ~(1 << PF_VECTOR);
- control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK|
- INTERCEPT_CR3_MASK);
- control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK|
- INTERCEPT_CR3_MASK);
- save->g_pat = 0x0007040600070406ULL;
- /* enable caching because the QEMU Bios doesn't enable it */
- save->cr0 = X86_CR0_ET;
- save->cr3 = 0;
- save->cr4 = 0;
- }
- force_new_asid(&svm->vcpu);
-
- svm->nested_vmcb = 0;
- svm->vcpu.arch.hflags = HF_GIF_MASK;
-}
-
-static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- init_vmcb(svm);
-
- if (vcpu->vcpu_id != 0) {
- kvm_rip_write(vcpu, 0);
- svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
- svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
- }
- vcpu->arch.regs_avail = ~0;
- vcpu->arch.regs_dirty = ~0;
-
- return 0;
-}
-
-static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
-{
- struct vcpu_svm *svm;
- struct page *page;
- struct page *msrpm_pages;
- struct page *hsave_page;
- struct page *nested_msrpm_pages;
- int err;
-
- svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
- if (!svm) {
- err = -ENOMEM;
- goto out;
- }
-
- err = kvm_vcpu_init(&svm->vcpu, kvm, id);
- if (err)
- goto free_svm;
-
- page = alloc_page(GFP_KERNEL);
- if (!page) {
- err = -ENOMEM;
- goto uninit;
- }
-
- err = -ENOMEM;
- msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
- if (!msrpm_pages)
- goto uninit;
-
- nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
- if (!nested_msrpm_pages)
- goto uninit;
-
- svm->msrpm = page_address(msrpm_pages);
- svm_vcpu_init_msrpm(svm->msrpm);
-
- hsave_page = alloc_page(GFP_KERNEL);
- if (!hsave_page)
- goto uninit;
- svm->hsave = page_address(hsave_page);
-
- svm->nested_msrpm = page_address(nested_msrpm_pages);
-
- svm->vmcb = page_address(page);
- clear_page(svm->vmcb);
- svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
- svm->asid_generation = 0;
- init_vmcb(svm);
-
- fx_init(&svm->vcpu);
- svm->vcpu.fpu_active = 1;
- svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
- if (svm->vcpu.vcpu_id == 0)
- svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
-
- return &svm->vcpu;
-
-uninit:
- kvm_vcpu_uninit(&svm->vcpu);
-free_svm:
- kmem_cache_free(kvm_vcpu_cache, svm);
-out:
- return ERR_PTR(err);
-}
-
-static void svm_free_vcpu(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
- __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
- __free_page(virt_to_page(svm->hsave));
- __free_pages(virt_to_page(svm->nested_msrpm), MSRPM_ALLOC_ORDER);
- kvm_vcpu_uninit(vcpu);
- kmem_cache_free(kvm_vcpu_cache, svm);
-}
-
-static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- int i;
-
- if (unlikely(cpu != vcpu->cpu)) {
- u64 tsc_this, delta;
-
- /*
- * Make sure that the guest sees a monotonically
- * increasing TSC.
- */
- rdtscll(tsc_this);
- delta = vcpu->arch.host_tsc - tsc_this;
- svm->vmcb->control.tsc_offset += delta;
- vcpu->cpu = cpu;
- kvm_migrate_timers(vcpu);
- svm->asid_generation = 0;
- }
-
- for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
- rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
-}
-
-static void svm_vcpu_put(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- int i;
-
- ++vcpu->stat.host_state_reload;
- for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
- wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
-
- rdtscll(vcpu->arch.host_tsc);
-}
-
-static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
-{
- return to_svm(vcpu)->vmcb->save.rflags;
-}
-
-static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
-{
- to_svm(vcpu)->vmcb->save.rflags = rflags;
-}
-
-static void svm_set_vintr(struct vcpu_svm *svm)
-{
- svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR;
-}
-
-static void svm_clear_vintr(struct vcpu_svm *svm)
-{
- svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR);
-}
-
-static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
-{
- struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
-
- switch (seg) {
- case VCPU_SREG_CS: return &save->cs;
- case VCPU_SREG_DS: return &save->ds;
- case VCPU_SREG_ES: return &save->es;
- case VCPU_SREG_FS: return &save->fs;
- case VCPU_SREG_GS: return &save->gs;
- case VCPU_SREG_SS: return &save->ss;
- case VCPU_SREG_TR: return &save->tr;
- case VCPU_SREG_LDTR: return &save->ldtr;
- }
- BUG();
- return NULL;
-}
-
-static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
-{
- struct vmcb_seg *s = svm_seg(vcpu, seg);
-
- return s->base;
-}
-
-static void svm_get_segment(struct kvm_vcpu *vcpu,
- struct kvm_segment *var, int seg)
-{
- struct vmcb_seg *s = svm_seg(vcpu, seg);
-
- var->base = s->base;
- var->limit = s->limit;
- var->selector = s->selector;
- var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
- var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
- var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
- var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
- var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
- var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
- var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
- var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
-
- /* AMD's VMCB does not have an explicit unusable field, so emulate it
- * for cross vendor migration purposes by "not present"
- */
- var->unusable = !var->present || (var->type == 0);
-
- switch (seg) {
- case VCPU_SREG_CS:
- /*
- * SVM always stores 0 for the 'G' bit in the CS selector in
- * the VMCB on a VMEXIT. This hurts cross-vendor migration:
- * Intel's VMENTRY has a check on the 'G' bit.
- */
- var->g = s->limit > 0xfffff;
- break;
- case VCPU_SREG_TR:
- /*
- * Work around a bug where the busy flag in the tr selector
- * isn't exposed
- */
- var->type |= 0x2;
- break;
- case VCPU_SREG_DS:
- case VCPU_SREG_ES:
- case VCPU_SREG_FS:
- case VCPU_SREG_GS:
- /*
- * The accessed bit must always be set in the segment
- * descriptor cache, although it can be cleared in the
- * descriptor, the cached bit always remains at 1. Since
- * Intel has a check on this, set it here to support
- * cross-vendor migration.
- */
- if (!var->unusable)
- var->type |= 0x1;
- break;
- case VCPU_SREG_SS:
- /* On AMD CPUs sometimes the DB bit in the segment
- * descriptor is left as 1, although the whole segment has
- * been made unusable. Clear it here to pass an Intel VMX
- * entry check when cross vendor migrating.
- */
- if (var->unusable)
- var->db = 0;
- break;
- }
-}
-
-static int svm_get_cpl(struct kvm_vcpu *vcpu)
-{
- struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
-
- return save->cpl;
-}
-
-static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- dt->limit = svm->vmcb->save.idtr.limit;
- dt->base = svm->vmcb->save.idtr.base;
-}
-
-static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- svm->vmcb->save.idtr.limit = dt->limit;
- svm->vmcb->save.idtr.base = dt->base ;
-}
-
-static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- dt->limit = svm->vmcb->save.gdtr.limit;
- dt->base = svm->vmcb->save.gdtr.base;
-}
-
-static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- svm->vmcb->save.gdtr.limit = dt->limit;
- svm->vmcb->save.gdtr.base = dt->base ;
-}
-
-static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
-{
-}
-
-static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
-#ifdef CONFIG_X86_64
- if (vcpu->arch.shadow_efer & EFER_LME) {
- if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
- vcpu->arch.shadow_efer |= EFER_LMA;
- svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
- }
-
- if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
- vcpu->arch.shadow_efer &= ~EFER_LMA;
- svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
- }
- }
-#endif
- if (npt_enabled)
- goto set;
-
- if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) {
- svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
- vcpu->fpu_active = 1;
- }
-
- vcpu->arch.cr0 = cr0;
- cr0 |= X86_CR0_PG | X86_CR0_WP;
- if (!vcpu->fpu_active) {
- svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
- cr0 |= X86_CR0_TS;
- }
-set:
- /*
- * re-enable caching here because the QEMU bios
- * does not do it - this results in some delay at
- * reboot
- */
- cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
- svm->vmcb->save.cr0 = cr0;
-}
-
-static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
-{
- unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE;
- unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
-
- if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
- force_new_asid(vcpu);
-
- vcpu->arch.cr4 = cr4;
- if (!npt_enabled)
- cr4 |= X86_CR4_PAE;
- cr4 |= host_cr4_mce;
- to_svm(vcpu)->vmcb->save.cr4 = cr4;
-}
-
-static void svm_set_segment(struct kvm_vcpu *vcpu,
- struct kvm_segment *var, int seg)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb_seg *s = svm_seg(vcpu, seg);
-
- s->base = var->base;
- s->limit = var->limit;
- s->selector = var->selector;
- if (var->unusable)
- s->attrib = 0;
- else {
- s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
- s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
- s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
- s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT;
- s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
- s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
- s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
- s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
- }
- if (seg == VCPU_SREG_CS)
- svm->vmcb->save.cpl
- = (svm->vmcb->save.cs.attrib
- >> SVM_SELECTOR_DPL_SHIFT) & 3;
-
-}
-
-static void update_db_intercept(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- svm->vmcb->control.intercept_exceptions &=
- ~((1 << DB_VECTOR) | (1 << BP_VECTOR));
-
- if (vcpu->arch.singlestep)
- svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR);
-
- if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
- if (vcpu->guest_debug &
- (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
- svm->vmcb->control.intercept_exceptions |=
- 1 << DB_VECTOR;
- if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
- svm->vmcb->control.intercept_exceptions |=
- 1 << BP_VECTOR;
- } else
- vcpu->guest_debug = 0;
-}
-
-static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
-{
- int old_debug = vcpu->guest_debug;
- struct vcpu_svm *svm = to_svm(vcpu);
-
- vcpu->guest_debug = dbg->control;
-
- update_db_intercept(vcpu);
-
- if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
- svm->vmcb->save.dr7 = dbg->arch.debugreg[7];
- else
- svm->vmcb->save.dr7 = vcpu->arch.dr7;
-
- if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
- svm->vmcb->save.rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
- else if (old_debug & KVM_GUESTDBG_SINGLESTEP)
- svm->vmcb->save.rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
-
- return 0;
-}
-
-static void load_host_msrs(struct kvm_vcpu *vcpu)
-{
-#ifdef CONFIG_X86_64
- wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
-#endif
-}
-
-static void save_host_msrs(struct kvm_vcpu *vcpu)
-{
-#ifdef CONFIG_X86_64
- rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
-#endif
-}
-
-static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data)
-{
- if (svm_data->next_asid > svm_data->max_asid) {
- ++svm_data->asid_generation;
- svm_data->next_asid = 1;
- svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
- }
-
- svm->asid_generation = svm_data->asid_generation;
- svm->vmcb->control.asid = svm_data->next_asid++;
-}
-
-static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- unsigned long val;
-
- switch (dr) {
- case 0 ... 3:
- val = vcpu->arch.db[dr];
- break;
- case 6:
- if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
- val = vcpu->arch.dr6;
- else
- val = svm->vmcb->save.dr6;
- break;
- case 7:
- if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
- val = vcpu->arch.dr7;
- else
- val = svm->vmcb->save.dr7;
- break;
- default:
- val = 0;
- }
-
- KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
- return val;
-}
-
-static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
- int *exception)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)value, handler);
-
- *exception = 0;
-
- switch (dr) {
- case 0 ... 3:
- vcpu->arch.db[dr] = value;
- if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
- vcpu->arch.eff_db[dr] = value;
- return;
- case 4 ... 5:
- if (vcpu->arch.cr4 & X86_CR4_DE)
- *exception = UD_VECTOR;
- return;
- case 6:
- if (value & 0xffffffff00000000ULL) {
- *exception = GP_VECTOR;
- return;
- }
- vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1;
- return;
- case 7:
- if (value & 0xffffffff00000000ULL) {
- *exception = GP_VECTOR;
- return;
- }
- vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1;
- if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
- svm->vmcb->save.dr7 = vcpu->arch.dr7;
- vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK);
- }
- return;
- default:
- /* FIXME: Possible case? */
- printk(KERN_DEBUG "%s: unexpected dr %u\n",
- __func__, dr);
- *exception = UD_VECTOR;
- return;
- }
-}
-
-static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
- u64 fault_address;
- u32 error_code;
-
- fault_address = svm->vmcb->control.exit_info_2;
- error_code = svm->vmcb->control.exit_info_1;
-
- if (!npt_enabled)
- KVMTRACE_3D(PAGE_FAULT, &svm->vcpu, error_code,
- (u32)fault_address, (u32)(fault_address >> 32),
- handler);
- else
- KVMTRACE_3D(TDP_FAULT, &svm->vcpu, error_code,
- (u32)fault_address, (u32)(fault_address >> 32),
- handler);
- /*
- * FIXME: Tis shouldn't be necessary here, but there is a flush
- * missing in the MMU code. Until we find this bug, flush the
- * complete TLB here on an NPF
- */
- if (npt_enabled)
- svm_flush_tlb(&svm->vcpu);
- else {
- if (kvm_event_needs_reinjection(&svm->vcpu))
- kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
- }
- return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
-}
-
-static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
- if (!(svm->vcpu.guest_debug &
- (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
- !svm->vcpu.arch.singlestep) {
- kvm_queue_exception(&svm->vcpu, DB_VECTOR);
- return 1;
- }
-
- if (svm->vcpu.arch.singlestep) {
- svm->vcpu.arch.singlestep = false;
- if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
- svm->vmcb->save.rflags &=
- ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
- update_db_intercept(&svm->vcpu);
- }
-
- if (svm->vcpu.guest_debug &
- (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)){
- kvm_run->exit_reason = KVM_EXIT_DEBUG;
- kvm_run->debug.arch.pc =
- svm->vmcb->save.cs.base + svm->vmcb->save.rip;
- kvm_run->debug.arch.exception = DB_VECTOR;
- return 0;
- }
-
- return 1;
-}
-
-static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
- kvm_run->exit_reason = KVM_EXIT_DEBUG;
- kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
- kvm_run->debug.arch.exception = BP_VECTOR;
- return 0;
-}
-
-static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
- int er;
-
- er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD);
- if (er != EMULATE_DONE)
- kvm_queue_exception(&svm->vcpu, UD_VECTOR);
- return 1;
-}
-
-static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
- svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
- if (!(svm->vcpu.arch.cr0 & X86_CR0_TS))
- svm->vmcb->save.cr0 &= ~X86_CR0_TS;
- svm->vcpu.fpu_active = 1;
-
- return 1;
-}
-
-static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
- /*
- * On an #MC intercept the MCE handler is not called automatically in
- * the host. So do it by hand here.
- */
- asm volatile (
- "int $0x12\n");
- /* not sure if we ever come back to this point */
-
- return 1;
-}
-
-static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
- /*
- * VMCB is undefined after a SHUTDOWN intercept
- * so reinitialize it.
- */
- clear_page(svm->vmcb);
- init_vmcb(svm);
-
- kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
- return 0;
-}
-
-static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
- u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
- int size, in, string;
- unsigned port;
-
- ++svm->vcpu.stat.io_exits;
-
- svm->next_rip = svm->vmcb->control.exit_info_2;
-
- string = (io_info & SVM_IOIO_STR_MASK) != 0;
-
- if (string) {
- if (emulate_instruction(&svm->vcpu,
- kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
- return 0;
- return 1;
- }
-
- in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
- port = io_info >> 16;
- size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
-
- skip_emulated_instruction(&svm->vcpu);
- return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port);
-}
-
-static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
- KVMTRACE_0D(NMI, &svm->vcpu, handler);
- return 1;
-}
-
-static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
- ++svm->vcpu.stat.irq_exits;
- KVMTRACE_0D(INTR, &svm->vcpu, handler);
- return 1;
-}
-
-static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
- return 1;
-}
-
-static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
- skip_emulated_instruction(&svm->vcpu);
- return kvm_emulate_halt(&svm->vcpu);
-}
-
-static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
- skip_emulated_instruction(&svm->vcpu);
- kvm_emulate_hypercall(&svm->vcpu);
- return 1;
-}
-
-static int nested_svm_check_permissions(struct vcpu_svm *svm)
-{
- if (!(svm->vcpu.arch.shadow_efer & EFER_SVME)
- || !is_paging(&svm->vcpu)) {
- kvm_queue_exception(&svm->vcpu, UD_VECTOR);
- return 1;
- }
-
- if (svm->vmcb->save.cpl) {
- kvm_inject_gp(&svm->vcpu, 0);
- return 1;
- }
-
- return 0;
-}
-
-static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
- bool has_error_code, u32 error_code)
-{
- if (is_nested(svm)) {
- svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
- svm->vmcb->control.exit_code_hi = 0;
- svm->vmcb->control.exit_info_1 = error_code;
- svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
- if (nested_svm_exit_handled(svm, false)) {
- nsvm_printk("VMexit -> EXCP 0x%x\n", nr);
-
- nested_svm_vmexit(svm);
- return 1;
- }
- }
-
- return 0;
-}
-
-static inline int nested_svm_intr(struct vcpu_svm *svm)
-{
- if (is_nested(svm)) {
- if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
- return 0;
-
- if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
- return 0;
-
- svm->vmcb->control.exit_code = SVM_EXIT_INTR;
-
- if (nested_svm_exit_handled(svm, false)) {
- nsvm_printk("VMexit -> INTR\n");
- nested_svm_vmexit(svm);
- return 1;
- }
- }
-
- return 0;
-}
-
-static struct page *nested_svm_get_page(struct vcpu_svm *svm, u64 gpa)
-{
- struct page *page;
-
- down_read(&current->mm->mmap_sem);
- page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
- up_read(&current->mm->mmap_sem);
-
- if (is_error_page(page)) {
- printk(KERN_INFO "%s: could not find page at 0x%llx\n",
- __func__, gpa);
- kvm_release_page_clean(page);
- kvm_inject_gp(&svm->vcpu, 0);
- return NULL;
- }
- return page;
-}
-
-static int nested_svm_do(struct vcpu_svm *svm,
- u64 arg1_gpa, u64 arg2_gpa, void *opaque,
- int (*handler)(struct vcpu_svm *svm,
- void *arg1,
- void *arg2,
- void *opaque))
-{
- struct page *arg1_page;
- struct page *arg2_page = NULL;
- void *arg1;
- void *arg2 = NULL;
- int retval;
-
- arg1_page = nested_svm_get_page(svm, arg1_gpa);
- if(arg1_page == NULL)
- return 1;
-
- if (arg2_gpa) {
- arg2_page = nested_svm_get_page(svm, arg2_gpa);
- if(arg2_page == NULL) {
- kvm_release_page_clean(arg1_page);
- return 1;
- }
- }
-
- arg1 = kmap_atomic(arg1_page, KM_USER0);
- if (arg2_gpa)
- arg2 = kmap_atomic(arg2_page, KM_USER1);
-
- retval = handler(svm, arg1, arg2, opaque);
-
- kunmap_atomic(arg1, KM_USER0);
- if (arg2_gpa)
- kunmap_atomic(arg2, KM_USER1);
-
- kvm_release_page_dirty(arg1_page);
- if (arg2_gpa)
- kvm_release_page_dirty(arg2_page);
-
- return retval;
-}
-
-static int nested_svm_exit_handled_real(struct vcpu_svm *svm,
- void *arg1,
- void *arg2,
- void *opaque)
-{
- struct vmcb *nested_vmcb = (struct vmcb *)arg1;
- bool kvm_overrides = *(bool *)opaque;
- u32 exit_code = svm->vmcb->control.exit_code;
-
- if (kvm_overrides) {
- switch (exit_code) {
- case SVM_EXIT_INTR:
- case SVM_EXIT_NMI:
- return 0;
- /* For now we are always handling NPFs when using them */
- case SVM_EXIT_NPF:
- if (npt_enabled)
- return 0;
- break;
- /* When we're shadowing, trap PFs */
- case SVM_EXIT_EXCP_BASE + PF_VECTOR:
- if (!npt_enabled)
- return 0;
- break;
- default:
- break;
- }
- }
-
- switch (exit_code) {
- case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: {
- u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0);
- if (nested_vmcb->control.intercept_cr_read & cr_bits)
- return 1;
- break;
- }
- case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: {
- u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0);
- if (nested_vmcb->control.intercept_cr_write & cr_bits)
- return 1;
- break;
- }
- case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: {
- u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0);
- if (nested_vmcb->control.intercept_dr_read & dr_bits)
- return 1;
- break;
- }
- case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: {
- u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0);
- if (nested_vmcb->control.intercept_dr_write & dr_bits)
- return 1;
- break;
- }
- case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
- u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
- if (nested_vmcb->control.intercept_exceptions & excp_bits)
- return 1;
- break;
- }
- default: {
- u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
- nsvm_printk("exit code: 0x%x\n", exit_code);
- if (nested_vmcb->control.intercept & exit_bits)
- return 1;
- }
- }
-
- return 0;
-}
-
-static int nested_svm_exit_handled_msr(struct vcpu_svm *svm,
- void *arg1, void *arg2,
- void *opaque)
-{
- struct vmcb *nested_vmcb = (struct vmcb *)arg1;
- u8 *msrpm = (u8 *)arg2;
- u32 t0, t1;
- u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
- u32 param = svm->vmcb->control.exit_info_1 & 1;
-
- if (!(nested_vmcb->control.intercept & (1ULL << INTERCEPT_MSR_PROT)))
- return 0;
-
- switch(msr) {
- case 0 ... 0x1fff:
- t0 = (msr * 2) % 8;
- t1 = msr / 8;
- break;
- case 0xc0000000 ... 0xc0001fff:
- t0 = (8192 + msr - 0xc0000000) * 2;
- t1 = (t0 / 8);
- t0 %= 8;
- break;
- case 0xc0010000 ... 0xc0011fff:
- t0 = (16384 + msr - 0xc0010000) * 2;
- t1 = (t0 / 8);
- t0 %= 8;
- break;
- default:
- return 1;
- break;
- }
- if (msrpm[t1] & ((1 << param) << t0))
- return 1;
-
- return 0;
-}
-
-static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override)
-{
- bool k = kvm_override;
-
- switch (svm->vmcb->control.exit_code) {
- case SVM_EXIT_MSR:
- return nested_svm_do(svm, svm->nested_vmcb,
- svm->nested_vmcb_msrpm, NULL,
- nested_svm_exit_handled_msr);
- default: break;
- }
-
- return nested_svm_do(svm, svm->nested_vmcb, 0, &k,
- nested_svm_exit_handled_real);
-}
-
-static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1,
- void *arg2, void *opaque)
-{
- struct vmcb *nested_vmcb = (struct vmcb *)arg1;
- struct vmcb *hsave = svm->hsave;
- u64 nested_save[] = { nested_vmcb->save.cr0,
- nested_vmcb->save.cr3,
- nested_vmcb->save.cr4,
- nested_vmcb->save.efer,
- nested_vmcb->control.intercept_cr_read,
- nested_vmcb->control.intercept_cr_write,
- nested_vmcb->control.intercept_dr_read,
- nested_vmcb->control.intercept_dr_write,
- nested_vmcb->control.intercept_exceptions,
- nested_vmcb->control.intercept,
- nested_vmcb->control.msrpm_base_pa,
- nested_vmcb->control.iopm_base_pa,
- nested_vmcb->control.tsc_offset };
-
- /* Give the current vmcb to the guest */
- memcpy(nested_vmcb, svm->vmcb, sizeof(struct vmcb));
- nested_vmcb->save.cr0 = nested_save[0];
- if (!npt_enabled)
- nested_vmcb->save.cr3 = nested_save[1];
- nested_vmcb->save.cr4 = nested_save[2];
- nested_vmcb->save.efer = nested_save[3];
- nested_vmcb->control.intercept_cr_read = nested_save[4];
- nested_vmcb->control.intercept_cr_write = nested_save[5];
- nested_vmcb->control.intercept_dr_read = nested_save[6];
- nested_vmcb->control.intercept_dr_write = nested_save[7];
- nested_vmcb->control.intercept_exceptions = nested_save[8];
- nested_vmcb->control.intercept = nested_save[9];
- nested_vmcb->control.msrpm_base_pa = nested_save[10];
- nested_vmcb->control.iopm_base_pa = nested_save[11];
- nested_vmcb->control.tsc_offset = nested_save[12];
-
- /* We always set V_INTR_MASKING and remember the old value in hflags */
- if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
- nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK;
-
- if ((nested_vmcb->control.int_ctl & V_IRQ_MASK) &&
- (nested_vmcb->control.int_vector)) {
- nsvm_printk("WARNING: IRQ 0x%x still enabled on #VMEXIT\n",
- nested_vmcb->control.int_vector);
- }
-
- /* Restore the original control entries */
- svm->vmcb->control = hsave->control;
-
- /* Kill any pending exceptions */
- if (svm->vcpu.arch.exception.pending == true)
- nsvm_printk("WARNING: Pending Exception\n");
- svm->vcpu.arch.exception.pending = false;
-
- /* Restore selected save entries */
- svm->vmcb->save.es = hsave->save.es;
- svm->vmcb->save.cs = hsave->save.cs;
- svm->vmcb->save.ss = hsave->save.ss;
- svm->vmcb->save.ds = hsave->save.ds;
- svm->vmcb->save.gdtr = hsave->save.gdtr;
- svm->vmcb->save.idtr = hsave->save.idtr;
- svm->vmcb->save.rflags = hsave->save.rflags;
- svm_set_efer(&svm->vcpu, hsave->save.efer);
- svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
- svm_set_cr4(&svm->vcpu, hsave->save.cr4);
- if (npt_enabled) {
- svm->vmcb->save.cr3 = hsave->save.cr3;
- svm->vcpu.arch.cr3 = hsave->save.cr3;
- } else {
- kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
- }
- kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax);
- kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp);
- kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, hsave->save.rip);
- svm->vmcb->save.dr7 = 0;
- svm->vmcb->save.cpl = 0;
- svm->vmcb->control.exit_int_info = 0;
-
- svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
- /* Exit nested SVM mode */
- svm->nested_vmcb = 0;
-
- return 0;
-}
-
-static int nested_svm_vmexit(struct vcpu_svm *svm)
-{
- nsvm_printk("VMexit\n");
- if (nested_svm_do(svm, svm->nested_vmcb, 0,
- NULL, nested_svm_vmexit_real))
- return 1;
-
- kvm_mmu_reset_context(&svm->vcpu);
- kvm_mmu_load(&svm->vcpu);
-
- return 0;
-}
-
-static int nested_svm_vmrun_msrpm(struct vcpu_svm *svm, void *arg1,
- void *arg2, void *opaque)
-{
- int i;
- u32 *nested_msrpm = (u32*)arg1;
- for (i=0; i< PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++)
- svm->nested_msrpm[i] = svm->msrpm[i] | nested_msrpm[i];
- svm->vmcb->control.msrpm_base_pa = __pa(svm->nested_msrpm);
-
- return 0;
-}
-
-static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1,
- void *arg2, void *opaque)
-{
- struct vmcb *nested_vmcb = (struct vmcb *)arg1;
- struct vmcb *hsave = svm->hsave;
-
- /* nested_vmcb is our indicator if nested SVM is activated */
- svm->nested_vmcb = svm->vmcb->save.rax;
-
- /* Clear internal status */
- svm->vcpu.arch.exception.pending = false;
-
- /* Save the old vmcb, so we don't need to pick what we save, but
- can restore everything when a VMEXIT occurs */
- memcpy(hsave, svm->vmcb, sizeof(struct vmcb));
- /* We need to remember the original CR3 in the SPT case */
- if (!npt_enabled)
- hsave->save.cr3 = svm->vcpu.arch.cr3;
- hsave->save.cr4 = svm->vcpu.arch.cr4;
- hsave->save.rip = svm->next_rip;
-
- if (svm->vmcb->save.rflags & X86_EFLAGS_IF)
- svm->vcpu.arch.hflags |= HF_HIF_MASK;
- else
- svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
-
- /* Load the nested guest state */
- svm->vmcb->save.es = nested_vmcb->save.es;
- svm->vmcb->save.cs = nested_vmcb->save.cs;
- svm->vmcb->save.ss = nested_vmcb->save.ss;
- svm->vmcb->save.ds = nested_vmcb->save.ds;
- svm->vmcb->save.gdtr = nested_vmcb->save.gdtr;
- svm->vmcb->save.idtr = nested_vmcb->save.idtr;
- svm->vmcb->save.rflags = nested_vmcb->save.rflags;
- svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);
- svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);
- svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
- if (npt_enabled) {
- svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
- svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
- } else {
- kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
- kvm_mmu_reset_context(&svm->vcpu);
- }
- svm->vmcb->save.cr2 = nested_vmcb->save.cr2;
- kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
- kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
- kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip);
- /* In case we don't even reach vcpu_run, the fields are not updated */
- svm->vmcb->save.rax = nested_vmcb->save.rax;
- svm->vmcb->save.rsp = nested_vmcb->save.rsp;
- svm->vmcb->save.rip = nested_vmcb->save.rip;
- svm->vmcb->save.dr7 = nested_vmcb->save.dr7;
- svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
- svm->vmcb->save.cpl = nested_vmcb->save.cpl;
-
- /* We don't want a nested guest to be more powerful than the guest,
- so all intercepts are ORed */
- svm->vmcb->control.intercept_cr_read |=
- nested_vmcb->control.intercept_cr_read;
- svm->vmcb->control.intercept_cr_write |=
- nested_vmcb->control.intercept_cr_write;
- svm->vmcb->control.intercept_dr_read |=
- nested_vmcb->control.intercept_dr_read;
- svm->vmcb->control.intercept_dr_write |=
- nested_vmcb->control.intercept_dr_write;
- svm->vmcb->control.intercept_exceptions |=
- nested_vmcb->control.intercept_exceptions;
-
- svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
-
- svm->nested_vmcb_msrpm = nested_vmcb->control.msrpm_base_pa;
-
- force_new_asid(&svm->vcpu);
- svm->vmcb->control.exit_int_info = nested_vmcb->control.exit_int_info;
- svm->vmcb->control.exit_int_info_err = nested_vmcb->control.exit_int_info_err;
- svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
- if (nested_vmcb->control.int_ctl & V_IRQ_MASK) {
- nsvm_printk("nSVM Injecting Interrupt: 0x%x\n",
- nested_vmcb->control.int_ctl);
- }
- if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
- svm->vcpu.arch.hflags |= HF_VINTR_MASK;
- else
- svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
-
- nsvm_printk("nSVM exit_int_info: 0x%x | int_state: 0x%x\n",
- nested_vmcb->control.exit_int_info,
- nested_vmcb->control.int_state);
-
- svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
- svm->vmcb->control.int_state = nested_vmcb->control.int_state;
- svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
- if (nested_vmcb->control.event_inj & SVM_EVTINJ_VALID)
- nsvm_printk("Injecting Event: 0x%x\n",
- nested_vmcb->control.event_inj);
- svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
- svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
-
- svm->vcpu.arch.hflags |= HF_GIF_MASK;
-
- return 0;
-}
-
-static int nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
-{
- to_vmcb->save.fs = from_vmcb->save.fs;
- to_vmcb->save.gs = from_vmcb->save.gs;
- to_vmcb->save.tr = from_vmcb->save.tr;
- to_vmcb->save.ldtr = from_vmcb->save.ldtr;
- to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base;
- to_vmcb->save.star = from_vmcb->save.star;
- to_vmcb->save.lstar = from_vmcb->save.lstar;
- to_vmcb->save.cstar = from_vmcb->save.cstar;
- to_vmcb->save.sfmask = from_vmcb->save.sfmask;
- to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
- to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
- to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
-
- return 1;
-}
-
-static int nested_svm_vmload(struct vcpu_svm *svm, void *nested_vmcb,
- void *arg2, void *opaque)
-{
- return nested_svm_vmloadsave((struct vmcb *)nested_vmcb, svm->vmcb);
-}
-
-static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb,
- void *arg2, void *opaque)
-{
- return nested_svm_vmloadsave(svm->vmcb, (struct vmcb *)nested_vmcb);
-}
-
-static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
- if (nested_svm_check_permissions(svm))
- return 1;
-
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
- skip_emulated_instruction(&svm->vcpu);
-
- nested_svm_do(svm, svm->vmcb->save.rax, 0, NULL, nested_svm_vmload);
-
- return 1;
-}
-
-static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
- if (nested_svm_check_permissions(svm))
- return 1;
-
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
- skip_emulated_instruction(&svm->vcpu);
-
- nested_svm_do(svm, svm->vmcb->save.rax, 0, NULL, nested_svm_vmsave);
-
- return 1;
-}
-
-static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
- nsvm_printk("VMrun\n");
- if (nested_svm_check_permissions(svm))
- return 1;
-
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
- skip_emulated_instruction(&svm->vcpu);
-
- if (nested_svm_do(svm, svm->vmcb->save.rax, 0,
- NULL, nested_svm_vmrun))
- return 1;
-
- if (nested_svm_do(svm, svm->nested_vmcb_msrpm, 0,
- NULL, nested_svm_vmrun_msrpm))
- return 1;
-
- return 1;
-}
-
-static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
- if (nested_svm_check_permissions(svm))
- return 1;
-
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
- skip_emulated_instruction(&svm->vcpu);
-
- svm->vcpu.arch.hflags |= HF_GIF_MASK;
-
- return 1;
-}
-
-static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
- if (nested_svm_check_permissions(svm))
- return 1;
-
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
- skip_emulated_instruction(&svm->vcpu);
-
- svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
-
- /* After a CLGI no interrupts should come */
- svm_clear_vintr(svm);
- svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
-
- return 1;
-}
-
-static int invalid_op_interception(struct vcpu_svm *svm,
- struct kvm_run *kvm_run)
-{
- kvm_queue_exception(&svm->vcpu, UD_VECTOR);
- return 1;
-}
-
-static int task_switch_interception(struct vcpu_svm *svm,
- struct kvm_run *kvm_run)
-{
- u16 tss_selector;
- int reason;
- int int_type = svm->vmcb->control.exit_int_info &
- SVM_EXITINTINFO_TYPE_MASK;
- int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
- uint32_t type =
- svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
- uint32_t idt_v =
- svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
-
- tss_selector = (u16)svm->vmcb->control.exit_info_1;
-
- if (svm->vmcb->control.exit_info_2 &
- (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
- reason = TASK_SWITCH_IRET;
- else if (svm->vmcb->control.exit_info_2 &
- (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
- reason = TASK_SWITCH_JMP;
- else if (idt_v)
- reason = TASK_SWITCH_GATE;
- else
- reason = TASK_SWITCH_CALL;
-
- if (reason == TASK_SWITCH_GATE) {
- switch (type) {
- case SVM_EXITINTINFO_TYPE_NMI:
- svm->vcpu.arch.nmi_injected = false;
- break;
- case SVM_EXITINTINFO_TYPE_EXEPT:
- kvm_clear_exception_queue(&svm->vcpu);
- break;
- case SVM_EXITINTINFO_TYPE_INTR:
- kvm_clear_interrupt_queue(&svm->vcpu);
- break;
- default:
- break;
- }
- }
-
- if (reason != TASK_SWITCH_GATE ||
- int_type == SVM_EXITINTINFO_TYPE_SOFT ||
- (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
- (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
- skip_emulated_instruction(&svm->vcpu);
-
- return kvm_task_switch(&svm->vcpu, tss_selector, reason);
-}
-
-static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
- kvm_emulate_cpuid(&svm->vcpu);
- return 1;
-}
-
-static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
- ++svm->vcpu.stat.nmi_window_exits;
- svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET);
- svm->vcpu.arch.hflags |= HF_IRET_MASK;
- return 1;
-}
-
-static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
- if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE)
- pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
- return 1;
-}
-
-static int emulate_on_interception(struct vcpu_svm *svm,
- struct kvm_run *kvm_run)
-{
- if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE)
- pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
- return 1;
-}
-
-static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
- u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
- /* instruction emulation calls kvm_set_cr8() */
- emulate_instruction(&svm->vcpu, NULL, 0, 0, 0);
- if (irqchip_in_kernel(svm->vcpu.kvm)) {
- svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
- return 1;
- }
- if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
- return 1;
- kvm_run->exit_reason = KVM_EXIT_SET_TPR;
- return 0;
-}
-
-static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- switch (ecx) {
- case MSR_IA32_TIME_STAMP_COUNTER: {
- u64 tsc;
-
- rdtscll(tsc);
- *data = svm->vmcb->control.tsc_offset + tsc;
- break;
- }
- case MSR_K6_STAR:
- *data = svm->vmcb->save.star;
- break;
-#ifdef CONFIG_X86_64
- case MSR_LSTAR:
- *data = svm->vmcb->save.lstar;
- break;
- case MSR_CSTAR:
- *data = svm->vmcb->save.cstar;
- break;
- case MSR_KERNEL_GS_BASE:
- *data = svm->vmcb->save.kernel_gs_base;
- break;
- case MSR_SYSCALL_MASK:
- *data = svm->vmcb->save.sfmask;
- break;
-#endif
- case MSR_IA32_SYSENTER_CS:
- *data = svm->vmcb->save.sysenter_cs;
- break;
- case MSR_IA32_SYSENTER_EIP:
- *data = svm->vmcb->save.sysenter_eip;
- break;
- case MSR_IA32_SYSENTER_ESP:
- *data = svm->vmcb->save.sysenter_esp;
- break;
- /* Nobody will change the following 5 values in the VMCB so
- we can safely return them on rdmsr. They will always be 0
- until LBRV is implemented. */
- case MSR_IA32_DEBUGCTLMSR:
- *data = svm->vmcb->save.dbgctl;
- break;
- case MSR_IA32_LASTBRANCHFROMIP:
- *data = svm->vmcb->save.br_from;
- break;
- case MSR_IA32_LASTBRANCHTOIP:
- *data = svm->vmcb->save.br_to;
- break;
- case MSR_IA32_LASTINTFROMIP:
- *data = svm->vmcb->save.last_excp_from;
- break;
- case MSR_IA32_LASTINTTOIP:
- *data = svm->vmcb->save.last_excp_to;
- break;
- case MSR_VM_HSAVE_PA:
- *data = svm->hsave_msr;
- break;
- case MSR_VM_CR:
- *data = 0;
- break;
- case MSR_IA32_UCODE_REV:
- *data = 0x01000065;
- break;
- default:
- return kvm_get_msr_common(vcpu, ecx, data);
- }
- return 0;
-}
-
-static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
- u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
- u64 data;
-
- if (svm_get_msr(&svm->vcpu, ecx, &data))
- kvm_inject_gp(&svm->vcpu, 0);
- else {
- KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data,
- (u32)(data >> 32), handler);
-
- svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff;
- svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
- skip_emulated_instruction(&svm->vcpu);
- }
- return 1;
-}
-
-static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- switch (ecx) {
- case MSR_IA32_TIME_STAMP_COUNTER: {
- u64 tsc;
-
- rdtscll(tsc);
- svm->vmcb->control.tsc_offset = data - tsc;
- break;
- }
- case MSR_K6_STAR:
- svm->vmcb->save.star = data;
- break;
-#ifdef CONFIG_X86_64
- case MSR_LSTAR:
- svm->vmcb->save.lstar = data;
- break;
- case MSR_CSTAR:
- svm->vmcb->save.cstar = data;
- break;
- case MSR_KERNEL_GS_BASE:
- svm->vmcb->save.kernel_gs_base = data;
- break;
- case MSR_SYSCALL_MASK:
- svm->vmcb->save.sfmask = data;
- break;
-#endif
- case MSR_IA32_SYSENTER_CS:
- svm->vmcb->save.sysenter_cs = data;
- break;
- case MSR_IA32_SYSENTER_EIP:
- svm->vmcb->save.sysenter_eip = data;
- break;
- case MSR_IA32_SYSENTER_ESP:
- svm->vmcb->save.sysenter_esp = data;
- break;
- case MSR_IA32_DEBUGCTLMSR:
- if (!svm_has(SVM_FEATURE_LBRV)) {
- pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
- __func__, data);
- break;
- }
- if (data & DEBUGCTL_RESERVED_BITS)
- return 1;
-
- svm->vmcb->save.dbgctl = data;
- if (data & (1ULL<<0))
- svm_enable_lbrv(svm);
- else
- svm_disable_lbrv(svm);
- break;
- case MSR_K7_EVNTSEL0:
- case MSR_K7_EVNTSEL1:
- case MSR_K7_EVNTSEL2:
- case MSR_K7_EVNTSEL3:
- case MSR_K7_PERFCTR0:
- case MSR_K7_PERFCTR1:
- case MSR_K7_PERFCTR2:
- case MSR_K7_PERFCTR3:
- /*
- * Just discard all writes to the performance counters; this
- * should keep both older Linux and Windows 64-bit guests
- * happy
- */
- pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", ecx, data);
-
- break;
- case MSR_VM_HSAVE_PA:
- svm->hsave_msr = data;
- break;
- default:
- return kvm_set_msr_common(vcpu, ecx, data);
- }
- return 0;
-}
-
-static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
- u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
- u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
- | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
-
- KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32),
- handler);
-
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
- if (svm_set_msr(&svm->vcpu, ecx, data))
- kvm_inject_gp(&svm->vcpu, 0);
- else
- skip_emulated_instruction(&svm->vcpu);
- return 1;
-}
-
-static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
-{
- if (svm->vmcb->control.exit_info_1)
- return wrmsr_interception(svm, kvm_run);
- else
- return rdmsr_interception(svm, kvm_run);
-}
-
-static int interrupt_window_interception(struct vcpu_svm *svm,
- struct kvm_run *kvm_run)
-{
- KVMTRACE_0D(PEND_INTR, &svm->vcpu, handler);
-
- svm_clear_vintr(svm);
- svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
- /*
- * If user space is waiting to inject interrupts, exit as soon as
- * possible
- */
- if (!irqchip_in_kernel(svm->vcpu.kvm) &&
- kvm_run->request_interrupt_window &&
- !kvm_cpu_has_interrupt(&svm->vcpu)) {
- ++svm->vcpu.stat.irq_window_exits;
- kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
- return 0;
- }
-
- return 1;
-}
-
-static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
- struct kvm_run *kvm_run) = {
- [SVM_EXIT_READ_CR0] = emulate_on_interception,
- [SVM_EXIT_READ_CR3] = emulate_on_interception,
- [SVM_EXIT_READ_CR4] = emulate_on_interception,
- [SVM_EXIT_READ_CR8] = emulate_on_interception,
- /* for now: */
- [SVM_EXIT_WRITE_CR0] = emulate_on_interception,
- [SVM_EXIT_WRITE_CR3] = emulate_on_interception,
- [SVM_EXIT_WRITE_CR4] = emulate_on_interception,
- [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
- [SVM_EXIT_READ_DR0] = emulate_on_interception,
- [SVM_EXIT_READ_DR1] = emulate_on_interception,
- [SVM_EXIT_READ_DR2] = emulate_on_interception,
- [SVM_EXIT_READ_DR3] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR0] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR1] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR2] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR3] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR5] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR7] = emulate_on_interception,
- [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
- [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
- [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
- [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
- [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
- [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
- [SVM_EXIT_INTR] = intr_interception,
- [SVM_EXIT_NMI] = nmi_interception,
- [SVM_EXIT_SMI] = nop_on_interception,
- [SVM_EXIT_INIT] = nop_on_interception,
- [SVM_EXIT_VINTR] = interrupt_window_interception,
- /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */
- [SVM_EXIT_CPUID] = cpuid_interception,
- [SVM_EXIT_IRET] = iret_interception,
- [SVM_EXIT_INVD] = emulate_on_interception,
- [SVM_EXIT_HLT] = halt_interception,
- [SVM_EXIT_INVLPG] = invlpg_interception,
- [SVM_EXIT_INVLPGA] = invalid_op_interception,
- [SVM_EXIT_IOIO] = io_interception,
- [SVM_EXIT_MSR] = msr_interception,
- [SVM_EXIT_TASK_SWITCH] = task_switch_interception,
- [SVM_EXIT_SHUTDOWN] = shutdown_interception,
- [SVM_EXIT_VMRUN] = vmrun_interception,
- [SVM_EXIT_VMMCALL] = vmmcall_interception,
- [SVM_EXIT_VMLOAD] = vmload_interception,
- [SVM_EXIT_VMSAVE] = vmsave_interception,
- [SVM_EXIT_STGI] = stgi_interception,
- [SVM_EXIT_CLGI] = clgi_interception,
- [SVM_EXIT_SKINIT] = invalid_op_interception,
- [SVM_EXIT_WBINVD] = emulate_on_interception,
- [SVM_EXIT_MONITOR] = invalid_op_interception,
- [SVM_EXIT_MWAIT] = invalid_op_interception,
- [SVM_EXIT_NPF] = pf_interception,
-};
-
-static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- u32 exit_code = svm->vmcb->control.exit_code;
-
- KVMTRACE_3D(VMEXIT, vcpu, exit_code, (u32)svm->vmcb->save.rip,
- (u32)((u64)svm->vmcb->save.rip >> 32), entryexit);
-
- if (is_nested(svm)) {
- nsvm_printk("nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n",
- exit_code, svm->vmcb->control.exit_info_1,
- svm->vmcb->control.exit_info_2, svm->vmcb->save.rip);
- if (nested_svm_exit_handled(svm, true)) {
- nested_svm_vmexit(svm);
- nsvm_printk("-> #VMEXIT\n");
- return 1;
- }
- }
-
- if (npt_enabled) {
- int mmu_reload = 0;
- if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) {
- svm_set_cr0(vcpu, svm->vmcb->save.cr0);
- mmu_reload = 1;
- }
- vcpu->arch.cr0 = svm->vmcb->save.cr0;
- vcpu->arch.cr3 = svm->vmcb->save.cr3;
- if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
- if (!load_pdptrs(vcpu, vcpu->arch.cr3)) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
- }
- if (mmu_reload) {
- kvm_mmu_reset_context(vcpu);
- kvm_mmu_load(vcpu);
- }
- }
-
-
- if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
- kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
- kvm_run->fail_entry.hardware_entry_failure_reason
- = svm->vmcb->control.exit_code;
- return 0;
- }
-
- if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
- exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
- exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH)
- printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
- "exit_code 0x%x\n",
- __func__, svm->vmcb->control.exit_int_info,
- exit_code);
-
- if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
- || !svm_exit_handlers[exit_code]) {
- kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
- kvm_run->hw.hardware_exit_reason = exit_code;
- return 0;
- }
-
- return svm_exit_handlers[exit_code](svm, kvm_run);
-}
-
-static void reload_tss(struct kvm_vcpu *vcpu)
-{
- int cpu = raw_smp_processor_id();
-
- struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
- svm_data->tss_desc->type = 9; /* available 32/64-bit TSS */
- load_TR_desc();
-}
-
-static void pre_svm_run(struct vcpu_svm *svm)
-{
- int cpu = raw_smp_processor_id();
-
- struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
-
- svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
- /* FIXME: handle wraparound of asid_generation */
- if (svm->asid_generation != svm_data->asid_generation)
- new_asid(svm, svm_data);
-}
-
-static void svm_inject_nmi(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
- vcpu->arch.hflags |= HF_NMI_MASK;
- svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET);
- ++vcpu->stat.nmi_injections;
-}
-
-static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
-{
- struct vmcb_control_area *control;
-
- KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler);
-
- ++svm->vcpu.stat.irq_injections;
- control = &svm->vmcb->control;
- control->int_vector = irq;
- control->int_ctl &= ~V_INTR_PRIO_MASK;
- control->int_ctl |= V_IRQ_MASK |
- ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
-}
-
-static void svm_queue_irq(struct kvm_vcpu *vcpu, unsigned nr)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- svm->vmcb->control.event_inj = nr |
- SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
-}
-
-static void svm_set_irq(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- nested_svm_intr(svm);
-
- svm_queue_irq(vcpu, vcpu->arch.interrupt.nr);
-}
-
-static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- if (irr == -1)
- return;
-
- if (tpr >= irr)
- svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK;
-}
-
-static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb *vmcb = svm->vmcb;
- return !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
- !(svm->vcpu.arch.hflags & HF_NMI_MASK);
-}
-
-static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb *vmcb = svm->vmcb;
- return (vmcb->save.rflags & X86_EFLAGS_IF) &&
- !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
- (svm->vcpu.arch.hflags & HF_GIF_MASK);
-}
-
-static void enable_irq_window(struct kvm_vcpu *vcpu)
-{
- svm_set_vintr(to_svm(vcpu));
- svm_inject_irq(to_svm(vcpu), 0x0);
-}
-
-static void enable_nmi_window(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
- == HF_NMI_MASK)
- return; /* IRET will cause a vm exit */
-
- /* Something prevents NMI from being injected. Single step over the
- possible problem (IRET or exception injection or interrupt
- shadow) */
- vcpu->arch.singlestep = true;
- svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
- update_db_intercept(vcpu);
-}
-
-static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
-{
- return 0;
-}
-
-static void svm_flush_tlb(struct kvm_vcpu *vcpu)
-{
- force_new_asid(vcpu);
-}
-
-static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
-{
-}
-
-static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) {
- int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
- kvm_set_cr8(vcpu, cr8);
- }
-}
-
-static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- u64 cr8;
-
- cr8 = kvm_get_cr8(vcpu);
- svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
- svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
-}
-
-static void svm_complete_interrupts(struct vcpu_svm *svm)
-{
- u8 vector;
- int type;
- u32 exitintinfo = svm->vmcb->control.exit_int_info;
-
- if (svm->vcpu.arch.hflags & HF_IRET_MASK)
- svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
-
- svm->vcpu.arch.nmi_injected = false;
- kvm_clear_exception_queue(&svm->vcpu);
- kvm_clear_interrupt_queue(&svm->vcpu);
-
- if (!(exitintinfo & SVM_EXITINTINFO_VALID))
- return;
-
- vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
- type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
-
- switch (type) {
- case SVM_EXITINTINFO_TYPE_NMI:
- svm->vcpu.arch.nmi_injected = true;
- break;
- case SVM_EXITINTINFO_TYPE_EXEPT:
- /* In case of a software exception, do not reinject an exception
- vector, but re-execute the instruction instead */
- if (kvm_exception_is_soft(vector))
- break;
- if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
- u32 err = svm->vmcb->control.exit_int_info_err;
- kvm_queue_exception_e(&svm->vcpu, vector, err);
-
- } else
- kvm_queue_exception(&svm->vcpu, vector);
- break;
- case SVM_EXITINTINFO_TYPE_INTR:
- kvm_queue_interrupt(&svm->vcpu, vector, false);
- break;
- default:
- break;
- }
-}
-
-#ifdef CONFIG_X86_64
-#define R "r"
-#else
-#define R "e"
-#endif
-
-static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- u16 fs_selector;
- u16 gs_selector;
- u16 ldt_selector;
-
- svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
- svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
- svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
-
- pre_svm_run(svm);
-
- sync_lapic_to_cr8(vcpu);
-
- save_host_msrs(vcpu);
- fs_selector = kvm_read_fs();
- gs_selector = kvm_read_gs();
- ldt_selector = kvm_read_ldt();
- svm->host_cr2 = kvm_read_cr2();
- if (!is_nested(svm))
- svm->vmcb->save.cr2 = vcpu->arch.cr2;
- /* required for live migration with NPT */
- if (npt_enabled)
- svm->vmcb->save.cr3 = vcpu->arch.cr3;
-
- clgi();
-
- local_irq_enable();
-
- asm volatile (
- "push %%"R"bp; \n\t"
- "mov %c[rbx](%[svm]), %%"R"bx \n\t"
- "mov %c[rcx](%[svm]), %%"R"cx \n\t"
- "mov %c[rdx](%[svm]), %%"R"dx \n\t"
- "mov %c[rsi](%[svm]), %%"R"si \n\t"
- "mov %c[rdi](%[svm]), %%"R"di \n\t"
- "mov %c[rbp](%[svm]), %%"R"bp \n\t"
-#ifdef CONFIG_X86_64
- "mov %c[r8](%[svm]), %%r8 \n\t"
- "mov %c[r9](%[svm]), %%r9 \n\t"
- "mov %c[r10](%[svm]), %%r10 \n\t"
- "mov %c[r11](%[svm]), %%r11 \n\t"
- "mov %c[r12](%[svm]), %%r12 \n\t"
- "mov %c[r13](%[svm]), %%r13 \n\t"
- "mov %c[r14](%[svm]), %%r14 \n\t"
- "mov %c[r15](%[svm]), %%r15 \n\t"
-#endif
-
- /* Enter guest mode */
- "push %%"R"ax \n\t"
- "mov %c[vmcb](%[svm]), %%"R"ax \n\t"
- __ex(SVM_VMLOAD) "\n\t"
- __ex(SVM_VMRUN) "\n\t"
- __ex(SVM_VMSAVE) "\n\t"
- "pop %%"R"ax \n\t"
-
- /* Save guest registers, load host registers */
- "mov %%"R"bx, %c[rbx](%[svm]) \n\t"
- "mov %%"R"cx, %c[rcx](%[svm]) \n\t"
- "mov %%"R"dx, %c[rdx](%[svm]) \n\t"
- "mov %%"R"si, %c[rsi](%[svm]) \n\t"
- "mov %%"R"di, %c[rdi](%[svm]) \n\t"
- "mov %%"R"bp, %c[rbp](%[svm]) \n\t"
-#ifdef CONFIG_X86_64
- "mov %%r8, %c[r8](%[svm]) \n\t"
- "mov %%r9, %c[r9](%[svm]) \n\t"
- "mov %%r10, %c[r10](%[svm]) \n\t"
- "mov %%r11, %c[r11](%[svm]) \n\t"
- "mov %%r12, %c[r12](%[svm]) \n\t"
- "mov %%r13, %c[r13](%[svm]) \n\t"
- "mov %%r14, %c[r14](%[svm]) \n\t"
- "mov %%r15, %c[r15](%[svm]) \n\t"
-#endif
- "pop %%"R"bp"
- :
- : [svm]"a"(svm),
- [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
- [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])),
- [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])),
- [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])),
- [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])),
- [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])),
- [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP]))
-#ifdef CONFIG_X86_64
- , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])),
- [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])),
- [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])),
- [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])),
- [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])),
- [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])),
- [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])),
- [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
-#endif
- : "cc", "memory"
- , R"bx", R"cx", R"dx", R"si", R"di"
-#ifdef CONFIG_X86_64
- , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
-#endif
- );
-
- vcpu->arch.cr2 = svm->vmcb->save.cr2;
- vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
- vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
- vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
-
- kvm_write_cr2(svm->host_cr2);
-
- kvm_load_fs(fs_selector);
- kvm_load_gs(gs_selector);
- kvm_load_ldt(ldt_selector);
- load_host_msrs(vcpu);
-
- reload_tss(vcpu);
-
- local_irq_disable();
-
- stgi();
-
- sync_cr8_to_lapic(vcpu);
-
- svm->next_rip = 0;
-
- svm_complete_interrupts(svm);
-}
-
-#undef R
-
-static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- if (npt_enabled) {
- svm->vmcb->control.nested_cr3 = root;
- force_new_asid(vcpu);
- return;
- }
-
- svm->vmcb->save.cr3 = root;
- force_new_asid(vcpu);
-
- if (vcpu->fpu_active) {
- svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
- svm->vmcb->save.cr0 |= X86_CR0_TS;
- vcpu->fpu_active = 0;
- }
-}
-
-static int is_disabled(void)
-{
- u64 vm_cr;
-
- rdmsrl(MSR_VM_CR, vm_cr);
- if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
- return 1;
-
- return 0;
-}
-
-static void
-svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
-{
- /*
- * Patch in the VMMCALL instruction:
- */
- hypercall[0] = 0x0f;
- hypercall[1] = 0x01;
- hypercall[2] = 0xd9;
-}
-
-static void svm_check_processor_compat(void *rtn)
-{
- *(int *)rtn = 0;
-}
-
-static bool svm_cpu_has_accelerated_tpr(void)
-{
- return false;
-}
-
-static int get_npt_level(void)
-{
-#ifdef CONFIG_X86_64
- return PT64_ROOT_LEVEL;
-#else
- return PT32E_ROOT_LEVEL;
-#endif
-}
-
-static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
-{
- return 0;
-}
-
-static struct kvm_x86_ops svm_x86_ops = {
- .cpu_has_kvm_support = has_svm,
- .disabled_by_bios = is_disabled,
- .hardware_setup = svm_hardware_setup,
- .hardware_unsetup = svm_hardware_unsetup,
- .check_processor_compatibility = svm_check_processor_compat,
- .hardware_enable = svm_hardware_enable,
- .hardware_disable = svm_hardware_disable,
- .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
-
- .vcpu_create = svm_create_vcpu,
- .vcpu_free = svm_free_vcpu,
- .vcpu_reset = svm_vcpu_reset,
-
- .prepare_guest_switch = svm_prepare_guest_switch,
- .vcpu_load = svm_vcpu_load,
- .vcpu_put = svm_vcpu_put,
-
- .set_guest_debug = svm_guest_debug,
- .get_msr = svm_get_msr,
- .set_msr = svm_set_msr,
- .get_segment_base = svm_get_segment_base,
- .get_segment = svm_get_segment,
- .set_segment = svm_set_segment,
- .get_cpl = svm_get_cpl,
- .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
- .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
- .set_cr0 = svm_set_cr0,
- .set_cr3 = svm_set_cr3,
- .set_cr4 = svm_set_cr4,
- .set_efer = svm_set_efer,
- .get_idt = svm_get_idt,
- .set_idt = svm_set_idt,
- .get_gdt = svm_get_gdt,
- .set_gdt = svm_set_gdt,
- .get_dr = svm_get_dr,
- .set_dr = svm_set_dr,
- .get_rflags = svm_get_rflags,
- .set_rflags = svm_set_rflags,
-
- .tlb_flush = svm_flush_tlb,
-
- .run = svm_vcpu_run,
- .handle_exit = handle_exit,
- .skip_emulated_instruction = skip_emulated_instruction,
- .set_interrupt_shadow = svm_set_interrupt_shadow,
- .get_interrupt_shadow = svm_get_interrupt_shadow,
- .patch_hypercall = svm_patch_hypercall,
- .set_irq = svm_set_irq,
- .set_nmi = svm_inject_nmi,
- .queue_exception = svm_queue_exception,
- .interrupt_allowed = svm_interrupt_allowed,
- .nmi_allowed = svm_nmi_allowed,
- .enable_nmi_window = enable_nmi_window,
- .enable_irq_window = enable_irq_window,
- .update_cr8_intercept = update_cr8_intercept,
-
- .set_tss_addr = svm_set_tss_addr,
- .get_tdp_level = get_npt_level,
- .get_mt_mask = svm_get_mt_mask,
-};
-
-static int __init svm_init(void)
-{
- return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
- THIS_MODULE);
-}
-
-static void __exit svm_exit(void)
-{
- kvm_exit();
-}
-
-module_init(svm_init)
-module_exit(svm_exit)
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
new file mode 100644
index 000000000000..6b77b2033208
--- /dev/null
+++ b/arch/x86/kvm/svm/avic.c
@@ -0,0 +1,1305 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * AMD SVM support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_types.h>
+#include <linux/hashtable.h>
+#include <linux/amd-iommu.h>
+#include <linux/kvm_host.h>
+#include <linux/kvm_irqfd.h>
+
+#include <asm/irq_remapping.h>
+#include <asm/msr.h>
+
+#include "trace.h"
+#include "lapic.h"
+#include "x86.h"
+#include "irq.h"
+#include "svm.h"
+
+/*
+ * Encode the arbitrary VM ID and the vCPU's _index_ into the GATag so that
+ * KVM can retrieve the correct vCPU from a GALog entry if an interrupt can't
+ * be delivered, e.g. because the vCPU isn't running. Use the vCPU's index
+ * instead of its ID (a.k.a. its default APIC ID), as KVM is guaranteed a fast
+ * lookup on the index, whereas vCPUs whose index doesn't match their ID need
+ * to walk the entire xarray of vCPUs in the worst case scenario.
+ *
+ * For the vCPU index, use however many bits are currently allowed for the max
+ * guest physical APIC ID (limited by the size of the physical ID table), and
+ * use whatever bits remain to assign arbitrary AVIC IDs to VMs. Note, the
+ * size of the GATag is defined by hardware (32 bits), but is an opaque value
+ * as far as hardware is concerned.
+ */
+#define AVIC_VCPU_IDX_MASK AVIC_PHYSICAL_MAX_INDEX_MASK
+
+#define AVIC_VM_ID_SHIFT HWEIGHT32(AVIC_PHYSICAL_MAX_INDEX_MASK)
+#define AVIC_VM_ID_MASK (GENMASK(31, AVIC_VM_ID_SHIFT) >> AVIC_VM_ID_SHIFT)
+
+#define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VM_ID_SHIFT) & AVIC_VM_ID_MASK)
+#define AVIC_GATAG_TO_VCPUIDX(x) (x & AVIC_VCPU_IDX_MASK)
+
+#define __AVIC_GATAG(vm_id, vcpu_idx) ((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \
+ ((vcpu_idx) & AVIC_VCPU_IDX_MASK))
+#define AVIC_GATAG(vm_id, vcpu_idx) \
+({ \
+ u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_idx); \
+ \
+ WARN_ON_ONCE(AVIC_GATAG_TO_VCPUIDX(ga_tag) != (vcpu_idx)); \
+ WARN_ON_ONCE(AVIC_GATAG_TO_VMID(ga_tag) != (vm_id)); \
+ ga_tag; \
+})
+
+static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_IDX_MASK) == -1u);
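A minimal sketch of how the packing above composes (an editorial illustration, not part of the patch; it only reuses the macros defined here):

    /* Round-trip a (vm_id, vcpu_idx) pair through the GATag encoding. */
    u32 ga_tag = AVIC_GATAG(vm_id, vcpu_idx);    /* vm_id in the high bits, vcpu_idx in the low bits */
    u32 vm     = AVIC_GATAG_TO_VMID(ga_tag);     /* recovers vm_id, provided it fits in AVIC_VM_ID_MASK */
    u32 idx    = AVIC_GATAG_TO_VCPUIDX(ga_tag);  /* recovers vcpu_idx, provided it fits in AVIC_VCPU_IDX_MASK */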
+
+#define AVIC_AUTO_MODE -1
+
+static int avic_param_set(const char *val, const struct kernel_param *kp)
+{
+ if (val && sysfs_streq(val, "auto")) {
+ *(int *)kp->arg = AVIC_AUTO_MODE;
+ return 0;
+ }
+
+ return param_set_bint(val, kp);
+}
+
+static const struct kernel_param_ops avic_ops = {
+ .flags = KERNEL_PARAM_OPS_FL_NOARG,
+ .set = avic_param_set,
+ .get = param_get_bool,
+};
+
+/*
+ * Enable / disable AVIC. In "auto" mode (default behavior), AVIC is enabled
+ * for Zen4+ CPUs with x2AVIC, provided all other criteria for enablement are met.
+ */
+static int avic = AVIC_AUTO_MODE;
+module_param_cb(avic, &avic_ops, &avic, 0444);
+__MODULE_PARM_TYPE(avic, "bool");
+
+module_param(enable_ipiv, bool, 0444);
+
+static bool force_avic;
+module_param_unsafe(force_avic, bool, 0444);
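Since all three parameters are registered with mode 0444, they can only be chosen at module load time; with the usual kvm-amd module name (an assumption here, the patch itself never names the module) that would look like `modprobe kvm-amd avic=auto`, or `kvm-amd.avic=1` on the kernel command line.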
+
+/* Note:
+ * This hash table is used to map a VM_ID to a struct kvm_svm
+ * when handling AMD IOMMU GALog notifications to schedule in
+ * a particular vCPU.
+ */
+#define SVM_VM_DATA_HASH_BITS 8
+static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
+static u32 next_vm_id = 0;
+static bool next_vm_id_wrapped = 0;
+static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
+static bool x2avic_enabled;
+static u32 x2avic_max_physical_id;
+
+static void avic_set_x2apic_msr_interception(struct vcpu_svm *svm,
+ bool intercept)
+{
+ static const u32 x2avic_passthrough_msrs[] = {
+ X2APIC_MSR(APIC_ID),
+ X2APIC_MSR(APIC_LVR),
+ X2APIC_MSR(APIC_TASKPRI),
+ X2APIC_MSR(APIC_ARBPRI),
+ X2APIC_MSR(APIC_PROCPRI),
+ X2APIC_MSR(APIC_EOI),
+ X2APIC_MSR(APIC_RRR),
+ X2APIC_MSR(APIC_LDR),
+ X2APIC_MSR(APIC_DFR),
+ X2APIC_MSR(APIC_SPIV),
+ X2APIC_MSR(APIC_ISR),
+ X2APIC_MSR(APIC_TMR),
+ X2APIC_MSR(APIC_IRR),
+ X2APIC_MSR(APIC_ESR),
+ X2APIC_MSR(APIC_ICR),
+ X2APIC_MSR(APIC_ICR2),
+
+ /*
+ * Note! Always intercept LVTT, as TSC-deadline timer mode
+ * isn't virtualized by hardware, and the CPU will generate a
+ * #GP instead of a #VMEXIT.
+ */
+ X2APIC_MSR(APIC_LVTTHMR),
+ X2APIC_MSR(APIC_LVTPC),
+ X2APIC_MSR(APIC_LVT0),
+ X2APIC_MSR(APIC_LVT1),
+ X2APIC_MSR(APIC_LVTERR),
+ X2APIC_MSR(APIC_TMICT),
+ X2APIC_MSR(APIC_TMCCT),
+ X2APIC_MSR(APIC_TDCR),
+ };
+ int i;
+
+ if (intercept == svm->x2avic_msrs_intercepted)
+ return;
+
+ if (!x2avic_enabled)
+ return;
+
+ for (i = 0; i < ARRAY_SIZE(x2avic_passthrough_msrs); i++)
+ svm_set_intercept_for_msr(&svm->vcpu, x2avic_passthrough_msrs[i],
+ MSR_TYPE_RW, intercept);
+
+ svm->x2avic_msrs_intercepted = intercept;
+}
+
+static u32 __avic_get_max_physical_id(struct kvm *kvm, struct kvm_vcpu *vcpu)
+{
+ u32 arch_max;
+
+ /*
+ * Return the largest size (x2APIC) when querying without a vCPU, e.g.
+ * to allocate the per-VM table..
+ * to allocate the per-VM table.
+ if (x2avic_enabled && (!vcpu || apic_x2apic_mode(vcpu->arch.apic)))
+ arch_max = x2avic_max_physical_id;
+ else
+ arch_max = AVIC_MAX_PHYSICAL_ID;
+
+ /*
+ * Despite its name, KVM_CAP_MAX_VCPU_ID represents the maximum APIC ID
+ * plus one, so the max possible APIC ID is one less than that.
+ */
+ return min(kvm->arch.max_vcpu_ids - 1, arch_max);
+}
+
+static u32 avic_get_max_physical_id(struct kvm_vcpu *vcpu)
+{
+ return __avic_get_max_physical_id(vcpu->kvm, vcpu);
+}
+
+static void avic_activate_vmcb(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+
+ vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
+
+ vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;
+ vmcb->control.avic_physical_id |= avic_get_max_physical_id(vcpu);
+
+ vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
+
+ /*
+ * Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR
+ * accesses, while interrupt injection to a running vCPU can be
+ * achieved using AVIC doorbell. KVM disables the APIC access page
+ * (deletes the memslot) if any vCPU has x2APIC enabled, thus enabling
+ * AVIC in hybrid mode activates only the doorbell mechanism.
+ */
+ if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) {
+ vmcb->control.int_ctl |= X2APIC_MODE_MASK;
+
+ /* Disabling MSR intercept for x2APIC registers */
+ avic_set_x2apic_msr_interception(svm, false);
+ } else {
+ /*
+ * Flush the TLB, the guest may have inserted a non-APIC
+ * mapping into the TLB while AVIC was disabled.
+ */
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu);
+
+ /* Enabling MSR intercept for x2APIC registers */
+ avic_set_x2apic_msr_interception(svm, true);
+ }
+}
+
+static void avic_deactivate_vmcb(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+
+ vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
+ vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;
+
+ /*
+ * If running nested and the guest uses its own MSR bitmap, there
+ * is no need to update L0's msr bitmap
+ */
+ if (is_guest_mode(&svm->vcpu) &&
+ vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))
+ return;
+
+ /* Enabling MSR intercept for x2APIC registers */
+ avic_set_x2apic_msr_interception(svm, true);
+}
+
+/* Note:
+ * This function is called from the IOMMU driver to notify
+ * SVM to schedule in a particular vCPU of a particular VM.
+ */
+static int avic_ga_log_notifier(u32 ga_tag)
+{
+ unsigned long flags;
+ struct kvm_svm *kvm_svm;
+ struct kvm_vcpu *vcpu = NULL;
+ u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
+ u32 vcpu_idx = AVIC_GATAG_TO_VCPUIDX(ga_tag);
+
+ pr_debug("SVM: %s: vm_id=%#x, vcpu_idx=%#x\n", __func__, vm_id, vcpu_idx);
+ trace_kvm_avic_ga_log(vm_id, vcpu_idx);
+
+ spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+ hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
+ if (kvm_svm->avic_vm_id != vm_id)
+ continue;
+ vcpu = kvm_get_vcpu(&kvm_svm->kvm, vcpu_idx);
+ break;
+ }
+ spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+
+ /* Note:
+ * At this point, the IOMMU should have already set the pending
+ * bit in the vAPIC backing page. So, we just need to schedule
+ * in the vcpu.
+ */
+ if (vcpu)
+ kvm_vcpu_wake_up(vcpu);
+
+ return 0;
+}
+
+static int avic_get_physical_id_table_order(struct kvm *kvm)
+{
+ /* Provision for the maximum physical ID supported in x2avic mode */
+ return get_order((__avic_get_max_physical_id(kvm, NULL) + 1) * sizeof(u64));
+}
+
+int avic_alloc_physical_id_table(struct kvm *kvm)
+{
+ struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+
+ if (!irqchip_in_kernel(kvm) || !enable_apicv)
+ return 0;
+
+ if (kvm_svm->avic_physical_id_table)
+ return 0;
+
+ kvm_svm->avic_physical_id_table = (void *)__get_free_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO,
+ avic_get_physical_id_table_order(kvm));
+ if (!kvm_svm->avic_physical_id_table)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void avic_vm_destroy(struct kvm *kvm)
+{
+ unsigned long flags;
+ struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+
+ if (!enable_apicv)
+ return;
+
+ free_page((unsigned long)kvm_svm->avic_logical_id_table);
+ free_pages((unsigned long)kvm_svm->avic_physical_id_table,
+ avic_get_physical_id_table_order(kvm));
+
+ spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+ hash_del(&kvm_svm->hnode);
+ spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+}
+
+int avic_vm_init(struct kvm *kvm)
+{
+ unsigned long flags;
+ int err = -ENOMEM;
+ struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+ struct kvm_svm *k2;
+ u32 vm_id;
+
+ if (!enable_apicv)
+ return 0;
+
+ kvm_svm->avic_logical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (!kvm_svm->avic_logical_id_table)
+ goto free_avic;
+
+ spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+ again:
+ vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
+ if (vm_id == 0) { /* id is 1-based, zero is not okay */
+ next_vm_id_wrapped = 1;
+ goto again;
+ }
+ /* Is it still in use? Only possible if wrapped at least once */
+ if (next_vm_id_wrapped) {
+ hash_for_each_possible(svm_vm_data_hash, k2, hnode, vm_id) {
+ if (k2->avic_vm_id == vm_id)
+ goto again;
+ }
+ }
+ kvm_svm->avic_vm_id = vm_id;
+ hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id);
+ spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+
+ return 0;
+
+free_avic:
+ avic_vm_destroy(kvm);
+ return err;
+}
+
+static phys_addr_t avic_get_backing_page_address(struct vcpu_svm *svm)
+{
+ return __sme_set(__pa(svm->vcpu.arch.apic->regs));
+}
+
+void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
+{
+ struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
+
+ vmcb->control.avic_backing_page = avic_get_backing_page_address(svm);
+ vmcb->control.avic_logical_id = __sme_set(__pa(kvm_svm->avic_logical_id_table));
+ vmcb->control.avic_physical_id = __sme_set(__pa(kvm_svm->avic_physical_id_table));
+ vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE;
+
+ if (kvm_apicv_activated(svm->vcpu.kvm))
+ avic_activate_vmcb(svm);
+ else
+ avic_deactivate_vmcb(svm);
+}
+
+static int avic_init_backing_page(struct kvm_vcpu *vcpu)
+{
+ struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 id = vcpu->vcpu_id;
+ u64 new_entry;
+
+ /*
+ * Inhibit AVIC if the vCPU ID is bigger than what is supported by AVIC
+ * hardware. Immediately clear apicv_active, i.e. don't wait until the
+ * KVM_REQ_APICV_UPDATE request is processed on the first KVM_RUN, as
+ * avic_vcpu_load() expects to be called if and only if the vCPU has
+ * fully initialized AVIC.
+ */
+ if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) ||
+ (id > x2avic_max_physical_id)) {
+ kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG);
+ vcpu->arch.apic->apicv_active = false;
+ return 0;
+ }
+
+ BUILD_BUG_ON((AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE ||
+ (X2AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE);
+
+ if (WARN_ON_ONCE(!vcpu->arch.apic->regs))
+ return -EINVAL;
+
+ if (kvm_apicv_activated(vcpu->kvm)) {
+ int ret;
+
+ /*
+ * Note, AVIC hardware walks the nested page table to check
+ * permissions, but does not use the SPA address specified in
+ * the leaf SPTE since it uses address in the AVIC_BACKING_PAGE
+ * pointer field of the VMCB.
+ */
+ ret = kvm_alloc_apic_access_page(vcpu->kvm);
+ if (ret)
+ return ret;
+ }
+
+ /* Note, fls64() returns the bit position, +1. */
+ BUILD_BUG_ON(__PHYSICAL_MASK_SHIFT >
+ fls64(AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK));
+
+ /* Set the AVIC backing page address in the physical APIC ID table */
+ new_entry = avic_get_backing_page_address(svm) |
+ AVIC_PHYSICAL_ID_ENTRY_VALID_MASK;
+ svm->avic_physical_id_entry = new_entry;
+
+ /*
+ * Initialize the real table, as vCPUs must have a valid entry in order
+ * for broadcast IPIs to function correctly (broadcast IPIs ignore
+ * invalid entries, i.e. aren't guaranteed to generate a VM-Exit).
+ */
+ WRITE_ONCE(kvm_svm->avic_physical_id_table[id], new_entry);
+
+ return 0;
+}
+
+void avic_ring_doorbell(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Note, the vCPU could get migrated to a different pCPU at any point,
+ * which could result in signalling the wrong/previous pCPU. But if
+ * that happens the vCPU is guaranteed to do a VMRUN (after being
+ * migrated) and thus will process pending interrupts, i.e. a doorbell
+ * is not needed (and the spurious one is harmless).
+ */
+ int cpu = READ_ONCE(vcpu->cpu);
+
+ if (cpu != get_cpu()) {
+ wrmsrq(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu));
+ trace_kvm_avic_doorbell(vcpu->vcpu_id, kvm_cpu_get_apicid(cpu));
+ }
+ put_cpu();
+}
+
+
+static void avic_kick_vcpu(struct kvm_vcpu *vcpu, u32 icrl)
+{
+ vcpu->arch.apic->irr_pending = true;
+ svm_complete_interrupt_delivery(vcpu,
+ icrl & APIC_MODE_MASK,
+ icrl & APIC_INT_LEVELTRIG,
+ icrl & APIC_VECTOR_MASK);
+}
+
+static void avic_kick_vcpu_by_physical_id(struct kvm *kvm, u32 physical_id,
+ u32 icrl)
+{
+ /*
+ * KVM inhibits AVIC if any vCPU ID diverges from the vCPU's APIC ID,
+ * i.e. APIC ID == vCPU ID.
+ */
+ struct kvm_vcpu *target_vcpu = kvm_get_vcpu_by_id(kvm, physical_id);
+
+ /* Once again, nothing to do if the target vCPU doesn't exist. */
+ if (unlikely(!target_vcpu))
+ return;
+
+ avic_kick_vcpu(target_vcpu, icrl);
+}
+
+static void avic_kick_vcpu_by_logical_id(struct kvm *kvm, u32 *avic_logical_id_table,
+ u32 logid_index, u32 icrl)
+{
+ u32 physical_id;
+
+ if (avic_logical_id_table) {
+ u32 logid_entry = avic_logical_id_table[logid_index];
+
+ /* Nothing to do if the logical destination is invalid. */
+ if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK)))
+ return;
+
+ physical_id = logid_entry &
+ AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
+ } else {
+ /*
+ * For x2APIC, the logical APIC ID is a read-only value that is
+ * derived from the x2APIC ID, thus the x2APIC ID can be found
+ * by reversing the calculation (stored in logid_index). Note,
+ * bits 31:20 of the x2APIC ID aren't propagated to the logical
+ * ID, but KVM limits the x2APIC ID to KVM_MAX_VCPU_IDS.
+ */
+ physical_id = logid_index;
+ }
+
+ avic_kick_vcpu_by_physical_id(kvm, physical_id, icrl);
+}
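As a sketch of the x2APIC relationship the comment above relies on (illustrative only, not taken from the patch): the read-only x2APIC logical ID is ((id >> 4) << 16) | (1 << (id & 0xf)), so the cluster number together with the bit position inside the cluster is enough to recover the x2APIC ID:

    /* Hypothetical helper: rebuild an x2APIC ID from its logical-ID pieces. */
    static inline u32 x2apic_id_from_logical(u32 cluster, u32 bit_pos)
    {
        return (cluster << 4) | bit_pos;  /* matches logid_index in the caller, which pre-shifts the cluster by 4 */
    }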
+
+/*
+ * A fast-path version of avic_kick_target_vcpus(), which attempts to match
+ * destination APIC ID to vCPU without looping through all vCPUs.
+ */
+static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source,
+ u32 icrl, u32 icrh, u32 index)
+{
+ int dest_mode = icrl & APIC_DEST_MASK;
+ int shorthand = icrl & APIC_SHORT_MASK;
+ struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+ u32 dest;
+
+ if (shorthand != APIC_DEST_NOSHORT)
+ return -EINVAL;
+
+ if (apic_x2apic_mode(source))
+ dest = icrh;
+ else
+ dest = GET_XAPIC_DEST_FIELD(icrh);
+
+ if (dest_mode == APIC_DEST_PHYSICAL) {
+ /* broadcast destination, use slow path */
+ if (apic_x2apic_mode(source) && dest == X2APIC_BROADCAST)
+ return -EINVAL;
+ if (!apic_x2apic_mode(source) && dest == APIC_BROADCAST)
+ return -EINVAL;
+
+ if (WARN_ON_ONCE(dest != index))
+ return -EINVAL;
+
+ avic_kick_vcpu_by_physical_id(kvm, dest, icrl);
+ } else {
+ u32 *avic_logical_id_table;
+ unsigned long bitmap, i;
+ u32 cluster;
+
+ if (apic_x2apic_mode(source)) {
+ /* 16 bit dest mask, 16 bit cluster id */
+ bitmap = dest & 0xFFFF;
+ cluster = (dest >> 16) << 4;
+ } else if (kvm_lapic_get_reg(source, APIC_DFR) == APIC_DFR_FLAT) {
+ /* 8 bit dest mask */
+ bitmap = dest;
+ cluster = 0;
+ } else {
+ /* 4 bit dest mask, 4 bit cluster id */
+ bitmap = dest & 0xF;
+ cluster = (dest >> 4) << 2;
+ }
+
+ /* Nothing to do if there are no destinations in the cluster. */
+ if (unlikely(!bitmap))
+ return 0;
+
+ if (apic_x2apic_mode(source))
+ avic_logical_id_table = NULL;
+ else
+ avic_logical_id_table = kvm_svm->avic_logical_id_table;
+
+ /*
+ * AVIC is inhibited if vCPUs aren't mapped 1:1 with logical
+ * IDs, thus each bit in the destination is guaranteed to map
+ * to at most one vCPU.
+ */
+ for_each_set_bit(i, &bitmap, 16)
+ avic_kick_vcpu_by_logical_id(kvm, avic_logical_id_table,
+ cluster + i, icrl);
+ }
+
+ return 0;
+}
+
+static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
+ u32 icrl, u32 icrh, u32 index)
+{
+ u32 dest = apic_x2apic_mode(source) ? icrh : GET_XAPIC_DEST_FIELD(icrh);
+ unsigned long i;
+ struct kvm_vcpu *vcpu;
+
+ if (!avic_kick_target_vcpus_fast(kvm, source, icrl, icrh, index))
+ return;
+
+ trace_kvm_avic_kick_vcpu_slowpath(icrh, icrl, index);
+
+ /*
+ * Wake any target vCPUs that are blocking, i.e. waiting for a wake
+ * event. There's no need to signal doorbells, as hardware has handled
+ * vCPUs that were in guest at the time of the IPI, and vCPUs that have
+ * since entered the guest will have processed pending IRQs at VMRUN.
+ */
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK,
+ dest, icrl & APIC_DEST_MASK))
+ avic_kick_vcpu(vcpu, icrl);
+ }
+}
+
+int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
+ u32 icrl = svm->vmcb->control.exit_info_1;
+ u32 id = svm->vmcb->control.exit_info_2 >> 32;
+ u32 index = svm->vmcb->control.exit_info_2 & AVIC_PHYSICAL_MAX_INDEX_MASK;
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index);
+
+ switch (id) {
+ case AVIC_IPI_FAILURE_INVALID_TARGET:
+ case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
+ /*
+ * Emulate IPIs that are not handled by AVIC hardware, which
+ * only virtualizes Fixed, Edge-Triggered INTRs, and falls over
+ * if _any_ targets are invalid, e.g. if the logical mode mask
+ * is a superset of running vCPUs.
+ *
+ * The exit is a trap, i.e. ICR holds the correct value and RIP
+ * has been advanced; KVM is responsible only for emulating the
+ * IPI. Sadly, hardware may sometimes leave the BUSY flag set,
+ * in which case KVM needs to emulate the ICR write as well in
+ * order to clear the BUSY flag.
+ */
+ if (icrl & APIC_ICR_BUSY)
+ kvm_apic_write_nodecode(vcpu, APIC_ICR);
+ else
+ kvm_apic_send_ipi(apic, icrl, icrh);
+ break;
+ case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING:
+ /*
+ * At this point, we expect that the AVIC HW has already
+ * set the appropriate IRR bits on the valid target
+ * vcpus. So, we just need to kick the appropriate vcpu.
+ */
+ avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh, index);
+ break;
+ case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
+ WARN_ONCE(1, "Invalid backing page\n");
+ break;
+ case AVIC_IPI_FAILURE_INVALID_IPI_VECTOR:
+ /* Invalid IPI with vector < 16 */
+ break;
+ default:
+ vcpu_unimpl(vcpu, "Unknown avic incomplete IPI interception\n");
+ }
+
+ return 1;
+}
+
+unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu)
+{
+ if (is_guest_mode(vcpu))
+ return APICV_INHIBIT_REASON_NESTED;
+ return 0;
+}
+
+static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
+{
+ struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
+ u32 cluster, index;
+
+ ldr = GET_APIC_LOGICAL_ID(ldr);
+
+ if (flat) {
+ cluster = 0;
+ } else {
+ cluster = (ldr >> 4);
+ if (cluster >= 0xf)
+ return NULL;
+ ldr &= 0xf;
+ }
+ if (!ldr || !is_power_of_2(ldr))
+ return NULL;
+
+ index = __ffs(ldr);
+ if (WARN_ON_ONCE(index > 7))
+ return NULL;
+ index += (cluster << 2);
+
+ return &kvm_svm->avic_logical_id_table[index];
+}
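A small worked example of the index computed above (editorial, purely illustrative): for a cluster-mode LDR whose GET_APIC_LOGICAL_ID() value is 0x24, the cluster is 0x2 and the in-cluster bit is bit 2, so the entry used is avic_logical_id_table[(2 << 2) + 2], i.e. index 10.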
+
+static void avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
+{
+ bool flat;
+ u32 *entry, new_entry;
+
+ flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT;
+ entry = avic_get_logical_id_entry(vcpu, ldr, flat);
+ if (!entry)
+ return;
+
+ new_entry = READ_ONCE(*entry);
+ new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
+ new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
+ new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
+ WRITE_ONCE(*entry, new_entry);
+}
+
+static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ bool flat = svm->dfr_reg == APIC_DFR_FLAT;
+ u32 *entry;
+
+ /* Note: x2AVIC does not use logical APIC ID table */
+ if (apic_x2apic_mode(vcpu->arch.apic))
+ return;
+
+ entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat);
+ if (entry)
+ clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry);
+}
+
+static void avic_handle_ldr_update(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
+ u32 id = kvm_xapic_id(vcpu->arch.apic);
+
+ /* AVIC does not support LDR update for x2APIC */
+ if (apic_x2apic_mode(vcpu->arch.apic))
+ return;
+
+ if (ldr == svm->ldr_reg)
+ return;
+
+ avic_invalidate_logical_id_entry(vcpu);
+
+ svm->ldr_reg = ldr;
+ avic_ldr_write(vcpu, id, ldr);
+}
+
+static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);
+
+ if (svm->dfr_reg == dfr)
+ return;
+
+ avic_invalidate_logical_id_entry(vcpu);
+ svm->dfr_reg = dfr;
+}
+
+static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu)
+{
+ u32 offset = to_svm(vcpu)->vmcb->control.exit_info_1 &
+ AVIC_UNACCEL_ACCESS_OFFSET_MASK;
+
+ switch (offset) {
+ case APIC_LDR:
+ avic_handle_ldr_update(vcpu);
+ break;
+ case APIC_DFR:
+ avic_handle_dfr_update(vcpu);
+ break;
+ case APIC_RRR:
+ /* Ignore writes to Read Remote Data, it's read-only. */
+ return 1;
+ default:
+ break;
+ }
+
+ kvm_apic_write_nodecode(vcpu, offset);
+ return 1;
+}
+
+static bool is_avic_unaccelerated_access_trap(u32 offset)
+{
+ bool ret = false;
+
+ switch (offset) {
+ case APIC_ID:
+ case APIC_EOI:
+ case APIC_RRR:
+ case APIC_LDR:
+ case APIC_DFR:
+ case APIC_SPIV:
+ case APIC_ESR:
+ case APIC_ICR:
+ case APIC_LVTT:
+ case APIC_LVTTHMR:
+ case APIC_LVTPC:
+ case APIC_LVT0:
+ case APIC_LVT1:
+ case APIC_LVTERR:
+ case APIC_TMICT:
+ case APIC_TDCR:
+ ret = true;
+ break;
+ default:
+ break;
+ }
+ return ret;
+}
+
+int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int ret = 0;
+ u32 offset = svm->vmcb->control.exit_info_1 &
+ AVIC_UNACCEL_ACCESS_OFFSET_MASK;
+ u32 vector = svm->vmcb->control.exit_info_2 &
+ AVIC_UNACCEL_ACCESS_VECTOR_MASK;
+ bool write = (svm->vmcb->control.exit_info_1 >> 32) &
+ AVIC_UNACCEL_ACCESS_WRITE_MASK;
+ bool trap = is_avic_unaccelerated_access_trap(offset);
+
+ trace_kvm_avic_unaccelerated_access(vcpu->vcpu_id, offset,
+ trap, write, vector);
+ if (trap) {
+ /* Handling Trap */
+ WARN_ONCE(!write, "svm: Handling trap read.\n");
+ ret = avic_unaccel_trap_write(vcpu);
+ } else {
+ /* Handling Fault */
+ ret = kvm_emulate_instruction(vcpu, 0);
+ }
+
+ return ret;
+}
+
+int avic_init_vcpu(struct vcpu_svm *svm)
+{
+ int ret;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+
+ INIT_LIST_HEAD(&svm->ir_list);
+ raw_spin_lock_init(&svm->ir_list_lock);
+
+ if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm))
+ return 0;
+
+ ret = avic_init_backing_page(vcpu);
+ if (ret)
+ return ret;
+
+ svm->dfr_reg = APIC_DFR_FLAT;
+
+ return ret;
+}
+
+void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
+{
+ avic_handle_dfr_update(vcpu);
+ avic_handle_ldr_update(vcpu);
+}
+
+static void svm_ir_list_del(struct kvm_kernel_irqfd *irqfd)
+{
+ struct kvm_vcpu *vcpu = irqfd->irq_bypass_vcpu;
+ unsigned long flags;
+
+ if (!vcpu)
+ return;
+
+ raw_spin_lock_irqsave(&to_svm(vcpu)->ir_list_lock, flags);
+ list_del(&irqfd->vcpu_list);
+ raw_spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags);
+}
+
+int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+ unsigned int host_irq, uint32_t guest_irq,
+ struct kvm_vcpu *vcpu, u32 vector)
+{
+ /*
+ * If the IRQ was affined to a different vCPU, remove the IRTE metadata
+ * from the *previous* vCPU's list.
+ */
+ svm_ir_list_del(irqfd);
+
+ if (vcpu) {
+ /*
+ * Try to enable guest_mode in IRTE, unless AVIC is inhibited,
+ * in which case configure the IRTE for legacy mode, but track
+ * the IRTE metadata so that it can be converted to guest mode
+ * if AVIC is enabled/uninhibited in the future.
+ */
+ struct amd_iommu_pi_data pi_data = {
+ .ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
+ vcpu->vcpu_idx),
+ .is_guest_mode = kvm_vcpu_apicv_active(vcpu),
+ .vapic_addr = avic_get_backing_page_address(to_svm(vcpu)),
+ .vector = vector,
+ };
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 entry;
+ int ret;
+
+ /*
+ * Prevent the vCPU from being scheduled out or migrated until
+ * the IRTE is updated and its metadata has been added to the
+ * list of IRQs being posted to the vCPU, to ensure the IRTE
+ * isn't programmed with stale pCPU/IsRunning information.
+ */
+ guard(raw_spinlock_irqsave)(&svm->ir_list_lock);
+
+ /*
+ * Update the target pCPU for IOMMU doorbells if the vCPU is
+ * running. If the vCPU is NOT running, i.e. is blocking or
+ * scheduled out, KVM will update the pCPU info when the vCPU
+ * is awakened and/or scheduled in. See also avic_vcpu_load().
+ */
+ entry = svm->avic_physical_id_entry;
+ if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) {
+ pi_data.cpu = entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
+ } else {
+ pi_data.cpu = -1;
+ pi_data.ga_log_intr = entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR;
+ }
+
+ ret = irq_set_vcpu_affinity(host_irq, &pi_data);
+ if (ret)
+ return ret;
+
+ /*
+ * Revert to legacy mode if the IOMMU didn't provide metadata
+ * for the IRTE, which KVM needs to keep the IRTE up-to-date,
+ * e.g. if the vCPU is migrated or AVIC is disabled.
+ */
+ if (WARN_ON_ONCE(!pi_data.ir_data)) {
+ irq_set_vcpu_affinity(host_irq, NULL);
+ return -EIO;
+ }
+
+ irqfd->irq_bypass_data = pi_data.ir_data;
+ list_add(&irqfd->vcpu_list, &svm->ir_list);
+ return 0;
+ }
+ return irq_set_vcpu_affinity(host_irq, NULL);
+}
+
+enum avic_vcpu_action {
+ /*
+ * There is no need to differentiate between activate and deactivate,
+ * as KVM only refreshes AVIC state when the vCPU is scheduled in and
+ * isn't blocking, i.e. the pCPU must always be (in)valid when AVIC is
+ * being (de)activated.
+ */
+ AVIC_TOGGLE_ON_OFF = BIT(0),
+ AVIC_ACTIVATE = AVIC_TOGGLE_ON_OFF,
+ AVIC_DEACTIVATE = AVIC_TOGGLE_ON_OFF,
+
+ /*
+ * No unique action is required to deal with a vCPU that stops/starts
+ * running. A vCPU that starts running by definition stops blocking as
+ * well, and a vCPU that stops running can't have been blocking, i.e.
+ * doesn't need to toggle GALogIntr.
+ */
+ AVIC_START_RUNNING = 0,
+ AVIC_STOP_RUNNING = 0,
+
+ /*
+ * When a vCPU starts blocking, KVM needs to set the GALogIntr flag
+ * in all associated IRTEs so that KVM can wake the vCPU if an IRQ is
+ * sent to the vCPU.
+ */
+ AVIC_START_BLOCKING = BIT(1),
+};
+
+static void avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu,
+ enum avic_vcpu_action action)
+{
+ bool ga_log_intr = (action & AVIC_START_BLOCKING);
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_kernel_irqfd *irqfd;
+
+ lockdep_assert_held(&svm->ir_list_lock);
+
+ /*
+ * Here, we go through the per-vCPU ir_list to update all existing
+ * interrupt remapping table entries targeting this vCPU.
+ */
+ if (list_empty(&svm->ir_list))
+ return;
+
+ list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) {
+ void *data = irqfd->irq_bypass_data;
+
+ if (!(action & AVIC_TOGGLE_ON_OFF))
+ WARN_ON_ONCE(amd_iommu_update_ga(data, cpu, ga_log_intr));
+ else if (cpu >= 0)
+ WARN_ON_ONCE(amd_iommu_activate_guest_mode(data, cpu, ga_log_intr));
+ else
+ WARN_ON_ONCE(amd_iommu_deactivate_guest_mode(data));
+ }
+}
+
+static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu,
+ enum avic_vcpu_action action)
+{
+ struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
+ int h_physical_id = kvm_cpu_get_apicid(cpu);
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long flags;
+ u64 entry;
+
+ lockdep_assert_preemption_disabled();
+
+ if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
+ return;
+
+ if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >=
+ PAGE_SIZE << avic_get_physical_id_table_order(vcpu->kvm)))
+ return;
+
+ /*
+ * Grab the per-vCPU interrupt remapping lock even if the VM doesn't
+ * _currently_ have assigned devices, as that can change. Holding
+ * ir_list_lock ensures that either svm_ir_list_add() will consume
+ * up-to-date entry information, or that this task will wait until
+ * svm_ir_list_add() completes to set the new target pCPU.
+ */
+ raw_spin_lock_irqsave(&svm->ir_list_lock, flags);
+
+ entry = svm->avic_physical_id_entry;
+ WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
+
+ entry &= ~(AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK |
+ AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR);
+ entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
+ entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+
+ svm->avic_physical_id_entry = entry;
+
+ /*
+ * If IPI virtualization is disabled, clear IsRunning when updating the
+ * actual Physical ID table, so that the CPU never sees IsRunning=1.
+ * Keep the APIC ID up-to-date in the entry to minimize the chances of
+ * things going sideways if hardware peeks at the ID.
+ */
+ if (!enable_ipiv)
+ entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+
+ WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry);
+
+ avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, action);
+
+ raw_spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+}
+
+void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ /*
+ * No need to update anything if the vCPU is blocking, i.e. if the vCPU
+ * is being scheduled in after being preempted. The CPU entries in the
+ * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'.
+ * If the vCPU was migrated, its new CPU value will be stuffed when the
+ * vCPU unblocks.
+ */
+ if (kvm_vcpu_is_blocking(vcpu))
+ return;
+
+ __avic_vcpu_load(vcpu, cpu, AVIC_START_RUNNING);
+}
+
+static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action)
+{
+ struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long flags;
+ u64 entry = svm->avic_physical_id_entry;
+
+ lockdep_assert_preemption_disabled();
+
+ if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >=
+ PAGE_SIZE << avic_get_physical_id_table_order(vcpu->kvm)))
+ return;
+
+ /*
+ * Take and hold the per-vCPU interrupt remapping lock while updating
+ * the Physical ID entry even though the lock doesn't protect against
+ * multiple writers (see above). Holding ir_list_lock ensures that
+ * either svm_ir_list_add() will consume up-to-date entry information,
+ * or that this task will wait until svm_ir_list_add() completes to
+ * mark the vCPU as not running.
+ */
+ raw_spin_lock_irqsave(&svm->ir_list_lock, flags);
+
+ avic_update_iommu_vcpu_affinity(vcpu, -1, action);
+
+ WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR);
+
+ /*
+ * Keep the previous APIC ID in the entry so that a rogue doorbell from
+ * hardware is at least restricted to a CPU associated with the vCPU.
+ */
+ entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+
+ if (enable_ipiv)
+ WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry);
+
+ /*
+ * Note! Don't set AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR in the table as
+ * it's a synthetic flag that usurps an unused should-be-zero bit.
+ */
+ if (action & AVIC_START_BLOCKING)
+ entry |= AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR;
+
+ svm->avic_physical_id_entry = entry;
+
+ raw_spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+}
+
+void avic_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Note, reading the Physical ID entry outside of ir_list_lock is safe
+ * as only the pCPU that has loaded (or is loading) the vCPU is allowed
+ * to modify the entry, and preemption is disabled. I.e. the vCPU
+ * can't be scheduled out and thus avic_vcpu_{put,load}() can't run
+ * recursively.
+ */
+ u64 entry = to_svm(vcpu)->avic_physical_id_entry;
+
+ /*
+ * Nothing to do if IsRunning == '0' due to vCPU blocking, i.e. if the
+	 * vCPU is preempted while it's in the process of blocking.  WARN if the
+	 * vCPU wasn't running and isn't blocking, as KVM shouldn't attempt to
+	 * put the AVIC if it wasn't previously loaded.
+ */
+ if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) {
+ if (WARN_ON_ONCE(!kvm_vcpu_is_blocking(vcpu)))
+ return;
+
+ /*
+ * The vCPU was preempted while blocking, ensure its IRTEs are
+ * configured to generate GA Log Interrupts.
+ */
+ if (!(WARN_ON_ONCE(!(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR))))
+ return;
+ }
+
+ __avic_vcpu_put(vcpu, kvm_vcpu_is_blocking(vcpu) ? AVIC_START_BLOCKING :
+ AVIC_STOP_RUNNING);
+}
+
+void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+
+ if (!lapic_in_kernel(vcpu) || !enable_apicv)
+ return;
+
+ if (kvm_vcpu_apicv_active(vcpu)) {
+ /**
+ * During AVIC temporary deactivation, guest could update
+ * APIC ID, DFR and LDR registers, which would not be trapped
+ * by avic_unaccelerated_access_interception(). In this case,
+ * we need to check and update the AVIC logical APIC ID table
+ * accordingly before re-activating.
+ */
+ avic_apicv_post_state_restore(vcpu);
+ avic_activate_vmcb(svm);
+ } else {
+ avic_deactivate_vmcb(svm);
+ }
+ vmcb_mark_dirty(vmcb, VMCB_AVIC);
+}
+
+void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
+{
+ if (!enable_apicv)
+ return;
+
+ /* APICv should only be toggled on/off while the vCPU is running. */
+ WARN_ON_ONCE(kvm_vcpu_is_blocking(vcpu));
+
+ avic_refresh_virtual_apic_mode(vcpu);
+
+ if (kvm_vcpu_apicv_active(vcpu))
+ __avic_vcpu_load(vcpu, vcpu->cpu, AVIC_ACTIVATE);
+ else
+ __avic_vcpu_put(vcpu, AVIC_DEACTIVATE);
+}
+
+void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
+{
+ if (!kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ /*
+ * Unload the AVIC when the vCPU is about to block, _before_ the vCPU
+ * actually blocks.
+ *
+ * Note, any IRQs that arrive before IsRunning=0 will not cause an
+ * incomplete IPI vmexit on the source; kvm_vcpu_check_block() handles
+ * this by checking vIRR one last time before blocking. The memory
+ * barrier implicit in set_current_state orders writing IsRunning=0
+ * before reading the vIRR. The processor needs a matching memory
+ * barrier on interrupt delivery between writing IRR and reading
+	 * IsRunning; the lack of this barrier might be the cause of erratum 1235.
+ *
+	 * Clear IsRunning even if guest IRQs are disabled, i.e. even if KVM
+ * doesn't need to detect events for scheduling purposes. The doorbell
+ * used to signal running vCPUs cannot be blocked, i.e. will perturb the
+ * CPU and cause noisy neighbor problems if the VM is sending interrupts
+ * to the vCPU while it's scheduled out.
+ */
+ __avic_vcpu_put(vcpu, AVIC_START_BLOCKING);
+}
+
+void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
+{
+ if (!kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ avic_vcpu_load(vcpu, vcpu->cpu);
+}
+
+static bool __init avic_want_avic_enabled(void)
+{
+ /*
+ * In "auto" mode, enable AVIC by default for Zen4+ if x2AVIC is
+ * supported (to avoid enabling partial support by default, and because
+ * x2AVIC should be supported by all Zen4+ CPUs). Explicitly check for
+	 * family 0x1A and later (Zen5+), as the kernel's synthetic ZenX flags
+ * aren't inclusive of previous generations, i.e. the kernel will set
+ * at most one ZenX feature flag.
+ */
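+	/*
+	 * E.g. a hypothetical Zen4 part (family 0x19, X86_FEATURE_ZEN4 set)
+	 * and a family 0x1A (Zen5) part both resolve "auto" to enabled when
+	 * x2AVIC is present, while older parts resolve it to disabled.
+	 */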
+ if (avic == AVIC_AUTO_MODE)
+ avic = boot_cpu_has(X86_FEATURE_X2AVIC) &&
+ (boot_cpu_data.x86 > 0x19 || cpu_feature_enabled(X86_FEATURE_ZEN4));
+
+ if (!avic || !npt_enabled)
+ return false;
+
+ /* AVIC is a prerequisite for x2AVIC. */
+ if (!boot_cpu_has(X86_FEATURE_AVIC) && !force_avic) {
+ if (boot_cpu_has(X86_FEATURE_X2AVIC))
+ pr_warn(FW_BUG "Cannot enable x2AVIC, AVIC is unsupported\n");
+ return false;
+ }
+
+ if (cc_platform_has(CC_ATTR_HOST_SEV_SNP) &&
+ !boot_cpu_has(X86_FEATURE_HV_INUSE_WR_ALLOWED)) {
+ pr_warn("AVIC disabled: missing HvInUseWrAllowed on SNP-enabled system\n");
+ return false;
+ }
+
+ /*
+ * Print a scary message if AVIC is force enabled to make it abundantly
+	 * clear that ignoring CPUID could have repercussions.  See the Revision
+	 * Guide for the specific AMD processor for more details.
+ */
+ if (!boot_cpu_has(X86_FEATURE_AVIC))
+ pr_warn("AVIC unsupported in CPUID but force enabled, your system might crash and burn\n");
+
+ return true;
+}
+
+/*
+ * Note:
+ * - The module param avic enables both xAPIC and x2APIC modes.
+ * - The hypervisor can support both xAVIC and x2AVIC in the same guest.
+ * - The mode can be switched at run-time.
+ */
+bool __init avic_hardware_setup(void)
+{
+ avic = avic_want_avic_enabled();
+ if (!avic)
+ return false;
+
+ pr_info("AVIC enabled\n");
+
+ /* AVIC is a prerequisite for x2AVIC. */
+ x2avic_enabled = boot_cpu_has(X86_FEATURE_X2AVIC);
+ if (x2avic_enabled) {
+ if (cpu_feature_enabled(X86_FEATURE_X2AVIC_EXT))
+ x2avic_max_physical_id = X2AVIC_4K_MAX_PHYSICAL_ID;
+ else
+ x2avic_max_physical_id = X2AVIC_MAX_PHYSICAL_ID;
+ pr_info("x2AVIC enabled (max %u vCPUs)\n", x2avic_max_physical_id + 1);
+ } else {
+ svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true;
+ }
+
+ /*
+ * Disable IPI virtualization for AMD Family 17h CPUs (Zen1 and Zen2)
+ * due to erratum 1235, which results in missed VM-Exits on the sender
+ * and thus missed wake events for blocking vCPUs due to the CPU
+ * failing to see a software update to clear IsRunning.
+ */
+ enable_ipiv = enable_ipiv && boot_cpu_data.x86 != 0x17;
+
+ amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
+
+ return true;
+}
+
+void avic_hardware_unsetup(void)
+{
+ if (avic)
+ amd_iommu_register_ga_log_notifier(NULL);
+}
diff --git a/arch/x86/kvm/svm/hyperv.c b/arch/x86/kvm/svm/hyperv.c
new file mode 100644
index 000000000000..088f6429b24c
--- /dev/null
+++ b/arch/x86/kvm/svm/hyperv.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AMD SVM specific code for Hyper-V on KVM.
+ *
+ * Copyright 2022 Red Hat, Inc. and/or its affiliates.
+ */
+#include "hyperv.h"
+
+void svm_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->control.exit_code = HV_SVM_EXITCODE_ENL;
+ svm->vmcb->control.exit_code_hi = 0;
+ svm->vmcb->control.exit_info_1 = HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH;
+ svm->vmcb->control.exit_info_2 = 0;
+ nested_svm_vmexit(svm);
+}
diff --git a/arch/x86/kvm/svm/hyperv.h b/arch/x86/kvm/svm/hyperv.h
new file mode 100644
index 000000000000..d3f8bfc05832
--- /dev/null
+++ b/arch/x86/kvm/svm/hyperv.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Common Hyper-V on KVM and KVM on Hyper-V definitions (SVM).
+ */
+
+#ifndef __ARCH_X86_KVM_SVM_HYPERV_H__
+#define __ARCH_X86_KVM_SVM_HYPERV_H__
+
+#include <asm/mshyperv.h>
+
+#include "../hyperv.h"
+#include "svm.h"
+
+#ifdef CONFIG_KVM_HYPERV
+static inline void nested_svm_hv_update_vm_vp_ids(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ if (!hv_vcpu)
+ return;
+
+ hv_vcpu->nested.pa_page_gpa = hve->partition_assist_page;
+ hv_vcpu->nested.vm_id = hve->hv_vm_id;
+ hv_vcpu->nested.vp_id = hve->hv_vp_id;
+}
+
+static inline bool nested_svm_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ if (!hv_vcpu)
+ return false;
+
+ if (!hve->hv_enlightenments_control.nested_flush_hypercall)
+ return false;
+
+ return hv_vcpu->vp_assist_page.nested_control.features.directhypercall;
+}
+
+void svm_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu);
+#else /* CONFIG_KVM_HYPERV */
+static inline void nested_svm_hv_update_vm_vp_ids(struct kvm_vcpu *vcpu) {}
+static inline bool nested_svm_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
+static inline void svm_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu) {}
+#endif /* CONFIG_KVM_HYPERV */
+
+#endif /* __ARCH_X86_KVM_SVM_HYPERV_H__ */
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
new file mode 100644
index 000000000000..c81005b24522
--- /dev/null
+++ b/arch/x86/kvm/svm/nested.c
@@ -0,0 +1,1928 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * AMD SVM support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_types.h>
+#include <linux/kvm_host.h>
+#include <linux/kernel.h>
+
+#include <asm/msr-index.h>
+#include <asm/debugreg.h>
+
+#include "kvm_emulate.h"
+#include "trace.h"
+#include "mmu.h"
+#include "x86.h"
+#include "smm.h"
+#include "cpuid.h"
+#include "lapic.h"
+#include "svm.h"
+#include "hyperv.h"
+
+#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
+
+static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb;
+
+ if (vmcb->control.exit_code != SVM_EXIT_NPF) {
+ /*
+ * TODO: track the cause of the nested page fault, and
+ * correctly fill in the high bits of exit_info_1.
+ */
+ vmcb->control.exit_code = SVM_EXIT_NPF;
+ vmcb->control.exit_code_hi = 0;
+ vmcb->control.exit_info_1 = (1ULL << 32);
+ vmcb->control.exit_info_2 = fault->address;
+ }
+
+ vmcb->control.exit_info_1 &= ~0xffffffffULL;
+ vmcb->control.exit_info_1 |= fault->error_code;
+
+ nested_svm_vmexit(svm);
+}
+
+static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 cr3 = svm->nested.ctl.nested_cr3;
+ u64 pdpte;
+ int ret;
+
+ /*
+ * Note, nCR3 is "assumed" to be 32-byte aligned, i.e. the CPU ignores
+ * nCR3[4:0] when loading PDPTEs from memory.
+ */
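+	/*
+	 * E.g. for a hypothetical nCR3 of 0x12345ec3, bits 11:5 select offset
+	 * 0xec0 within the page, and index 0..3 reads one of the four 8-byte
+	 * PDPTEs at offsets 0xec0, 0xec8, 0xed0, and 0xed8.
+	 */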
+ ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte,
+ (cr3 & GENMASK(11, 5)) + index * 8, 8);
+ if (ret)
+ return 0;
+ return pdpte;
+}
+
+static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ return svm->nested.ctl.nested_cr3;
+}
+
+static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ WARN_ON(mmu_is_nested(vcpu));
+
+ vcpu->arch.mmu = &vcpu->arch.guest_mmu;
+
+ /*
+	 * The NPT format depends on L1's CR4 and EFER, which are in vmcb01.  Note,
+ * when called via KVM_SET_NESTED_STATE, that state may _not_ match current
+ * vCPU state. CR0.WP is explicitly ignored, while CR0.PG is required.
+ */
+ kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, svm->vmcb01.ptr->save.cr4,
+ svm->vmcb01.ptr->save.efer,
+ svm->nested.ctl.nested_cr3);
+ vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3;
+ vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr;
+ vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit;
+ vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
+}
+
+static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.mmu = &vcpu->arch.root_mmu;
+ vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
+}
+
+static bool nested_vmcb_needs_vls_intercept(struct vcpu_svm *svm)
+{
+ if (!guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_V_VMSAVE_VMLOAD))
+ return true;
+
+ if (!nested_npt_enabled(svm))
+ return true;
+
+ if (!(svm->nested.ctl.virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK))
+ return true;
+
+ return false;
+}
+
+void recalc_intercepts(struct vcpu_svm *svm)
+{
+ struct vmcb_control_area *c, *h;
+ struct vmcb_ctrl_area_cached *g;
+ unsigned int i;
+
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+
+ if (!is_guest_mode(&svm->vcpu))
+ return;
+
+ c = &svm->vmcb->control;
+ h = &svm->vmcb01.ptr->control;
+ g = &svm->nested.ctl;
+
+ for (i = 0; i < MAX_INTERCEPT; i++)
+ c->intercepts[i] = h->intercepts[i];
+
+ if (g->int_ctl & V_INTR_MASKING_MASK) {
+ /*
+ * If L2 is active and V_INTR_MASKING is enabled in vmcb12,
+ * disable intercept of CR8 writes as L2's CR8 does not affect
+ * any interrupt KVM may want to inject.
+ *
+ * Similarly, disable intercept of virtual interrupts (used to
+ * detect interrupt windows) if the saved RFLAGS.IF is '0', as
+ * the effective RFLAGS.IF for L1 interrupts will never be set
+ * while L2 is running (L2's RFLAGS.IF doesn't affect L1 IRQs).
+ */
+ vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE);
+ if (!(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF))
+ vmcb_clr_intercept(c, INTERCEPT_VINTR);
+ }
+
+ /*
+ * We want to see VMMCALLs from a nested guest only when Hyper-V L2 TLB
+ * flush feature is enabled.
+ */
+ if (!nested_svm_l2_tlb_flush_enabled(&svm->vcpu))
+ vmcb_clr_intercept(c, INTERCEPT_VMMCALL);
+
+ for (i = 0; i < MAX_INTERCEPT; i++)
+ c->intercepts[i] |= g->intercepts[i];
+
+ /* If SMI is not intercepted, ignore guest SMI intercept as well */
+ if (!intercept_smi)
+ vmcb_clr_intercept(c, INTERCEPT_SMI);
+
+ if (nested_vmcb_needs_vls_intercept(svm)) {
+ /*
+ * If the virtual VMLOAD/VMSAVE is not enabled for the L2,
+ * we must intercept these instructions to correctly
+ * emulate them in case L1 doesn't intercept them.
+ */
+ vmcb_set_intercept(c, INTERCEPT_VMLOAD);
+ vmcb_set_intercept(c, INTERCEPT_VMSAVE);
+ } else {
+ WARN_ON(!(c->virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK));
+ }
+}
+
+/*
+ * This array (and its actual size) holds the set of offsets (indexing by chunk
+ * size) to process when merging vmcb12's MSRPM with vmcb01's MSRPM. Note, the
+ * set of MSRs for which interception is disabled in vmcb01 is per-vCPU, e.g.
+ * based on CPUID features. This array only tracks MSRs that *might* be passed
+ * through to the guest.
+ *
+ * Hardcode the capacity of the array based on the maximum number of _offsets_.
+ * MSRs are batched together, so there are fewer offsets than MSRs.
+ */
+static int nested_svm_msrpm_merge_offsets[7] __ro_after_init;
+static int nested_svm_nr_msrpm_merge_offsets __ro_after_init;
+typedef unsigned long nsvm_msrpm_merge_t;
+
+int __init nested_svm_init_msrpm_merge_offsets(void)
+{
+ static const u32 merge_msrs[] __initconst = {
+ MSR_STAR,
+ MSR_IA32_SYSENTER_CS,
+ MSR_IA32_SYSENTER_EIP,
+ MSR_IA32_SYSENTER_ESP,
+ #ifdef CONFIG_X86_64
+ MSR_GS_BASE,
+ MSR_FS_BASE,
+ MSR_KERNEL_GS_BASE,
+ MSR_LSTAR,
+ MSR_CSTAR,
+ MSR_SYSCALL_MASK,
+ #endif
+ MSR_IA32_SPEC_CTRL,
+ MSR_IA32_PRED_CMD,
+ MSR_IA32_FLUSH_CMD,
+ MSR_IA32_APERF,
+ MSR_IA32_MPERF,
+ MSR_IA32_LASTBRANCHFROMIP,
+ MSR_IA32_LASTBRANCHTOIP,
+ MSR_IA32_LASTINTFROMIP,
+ MSR_IA32_LASTINTTOIP,
+ };
+ int i, j;
+
+ for (i = 0; i < ARRAY_SIZE(merge_msrs); i++) {
+ int bit_nr = svm_msrpm_bit_nr(merge_msrs[i]);
+ u32 offset;
+
+ if (WARN_ON(bit_nr < 0))
+ return -EIO;
+
+ /*
+ * Merging is done in chunks to reduce the number of accesses
+ * to L1's bitmap.
+ */
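+		/*
+		 * E.g. with 64-bit chunks (sizeof(unsigned long) == 8), a
+		 * hypothetical bit_nr of 200 maps to offset 200 / 8 / 8 = 3,
+		 * and all MSRs whose permission bits land in that same chunk
+		 * share one offset, i.e. one guest memory read during the merge.
+		 */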
+ offset = bit_nr / BITS_PER_BYTE / sizeof(nsvm_msrpm_merge_t);
+
+ for (j = 0; j < nested_svm_nr_msrpm_merge_offsets; j++) {
+ if (nested_svm_msrpm_merge_offsets[j] == offset)
+ break;
+ }
+
+ if (j < nested_svm_nr_msrpm_merge_offsets)
+ continue;
+
+ if (WARN_ON(j >= ARRAY_SIZE(nested_svm_msrpm_merge_offsets)))
+ return -EIO;
+
+ nested_svm_msrpm_merge_offsets[j] = offset;
+ nested_svm_nr_msrpm_merge_offsets++;
+ }
+
+ return 0;
+}
+
+/*
+ * Merge L0's (KVM) and L1's (Nested VMCB) MSR permission bitmaps. The function
+ * is optimized in that it only merges the parts where KVM MSR permission bitmap
+ * may contain zero bits.
+ */
+static bool nested_svm_merge_msrpm(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ nsvm_msrpm_merge_t *msrpm02 = svm->nested.msrpm;
+ nsvm_msrpm_merge_t *msrpm01 = svm->msrpm;
+ int i;
+
+ /*
+ * MSR bitmap update can be skipped when:
+ * - MSR bitmap for L1 hasn't changed.
+ * - Nested hypervisor (L1) is attempting to launch the same L2 as
+ * before.
+ * - Nested hypervisor (L1) is using Hyper-V emulation interface and
+ * tells KVM (L0) there were no changes in MSR bitmap for L2.
+ */
+#ifdef CONFIG_KVM_HYPERV
+ if (!svm->nested.force_msr_bitmap_recalc) {
+ struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments;
+
+ if (kvm_hv_hypercall_enabled(vcpu) &&
+ hve->hv_enlightenments_control.msr_bitmap &&
+ (svm->nested.ctl.clean & BIT(HV_VMCB_NESTED_ENLIGHTENMENTS)))
+ goto set_msrpm_base_pa;
+ }
+#endif
+
+ if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
+ return true;
+
+ for (i = 0; i < nested_svm_nr_msrpm_merge_offsets; i++) {
+ const int p = nested_svm_msrpm_merge_offsets[i];
+ nsvm_msrpm_merge_t l1_val;
+ gpa_t gpa;
+
+ gpa = svm->nested.ctl.msrpm_base_pa + (p * sizeof(l1_val));
+
+ if (kvm_vcpu_read_guest(vcpu, gpa, &l1_val, sizeof(l1_val)))
+ return false;
+
+ msrpm02[p] = msrpm01[p] | l1_val;
+ }
+
+ svm->nested.force_msr_bitmap_recalc = false;
+
+#ifdef CONFIG_KVM_HYPERV
+set_msrpm_base_pa:
+#endif
+ svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm));
+
+ return true;
+}
+
+/*
 * Bits 11:0 of the bitmap address are ignored by hardware.
+ */
+static bool nested_svm_check_bitmap_pa(struct kvm_vcpu *vcpu, u64 pa, u32 size)
+{
+ u64 addr = PAGE_ALIGN(pa);
+
+ return kvm_vcpu_is_legal_gpa(vcpu, addr) &&
+ kvm_vcpu_is_legal_gpa(vcpu, addr + size - 1);
+}
+
+static bool __nested_vmcb_check_controls(struct kvm_vcpu *vcpu,
+ struct vmcb_ctrl_area_cached *control)
+{
+ if (CC(!vmcb12_is_intercept(control, INTERCEPT_VMRUN)))
+ return false;
+
+ if (CC(control->asid == 0))
+ return false;
+
+ if (CC((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) && !npt_enabled))
+ return false;
+
+ if (CC(!nested_svm_check_bitmap_pa(vcpu, control->msrpm_base_pa,
+ MSRPM_SIZE)))
+ return false;
+ if (CC(!nested_svm_check_bitmap_pa(vcpu, control->iopm_base_pa,
+ IOPM_SIZE)))
+ return false;
+
+ if (CC((control->int_ctl & V_NMI_ENABLE_MASK) &&
+ !vmcb12_is_intercept(control, INTERCEPT_NMI))) {
+ return false;
+ }
+
+ return true;
+}
+
+/* Common checks that apply to both L1 and L2 state. */
+static bool __nested_vmcb_check_save(struct kvm_vcpu *vcpu,
+ struct vmcb_save_area_cached *save)
+{
+ if (CC(!(save->efer & EFER_SVME)))
+ return false;
+
+ if (CC((save->cr0 & X86_CR0_CD) == 0 && (save->cr0 & X86_CR0_NW)) ||
+ CC(save->cr0 & ~0xffffffffULL))
+ return false;
+
+ if (CC(!kvm_dr6_valid(save->dr6)) || CC(!kvm_dr7_valid(save->dr7)))
+ return false;
+
+ /*
+ * These checks are also performed by KVM_SET_SREGS,
+ * except that EFER.LMA is not checked by SVM against
+ * CR0.PG && EFER.LME.
+ */
+ if ((save->efer & EFER_LME) && (save->cr0 & X86_CR0_PG)) {
+ if (CC(!(save->cr4 & X86_CR4_PAE)) ||
+ CC(!(save->cr0 & X86_CR0_PE)) ||
+ CC(!kvm_vcpu_is_legal_cr3(vcpu, save->cr3)))
+ return false;
+ }
+
+ /* Note, SVM doesn't have any additional restrictions on CR4. */
+ if (CC(!__kvm_is_valid_cr4(vcpu, save->cr4)))
+ return false;
+
+ if (CC(!kvm_valid_efer(vcpu, save->efer)))
+ return false;
+
+ return true;
+}
+
+static bool nested_vmcb_check_save(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_save_area_cached *save = &svm->nested.save;
+
+ return __nested_vmcb_check_save(vcpu, save);
+}
+
+static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_ctrl_area_cached *ctl = &svm->nested.ctl;
+
+ return __nested_vmcb_check_controls(vcpu, ctl);
+}
+
+static
+void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu,
+ struct vmcb_ctrl_area_cached *to,
+ struct vmcb_control_area *from)
+{
+ unsigned int i;
+
+ for (i = 0; i < MAX_INTERCEPT; i++)
+ to->intercepts[i] = from->intercepts[i];
+
+ to->iopm_base_pa = from->iopm_base_pa;
+ to->msrpm_base_pa = from->msrpm_base_pa;
+ to->tsc_offset = from->tsc_offset;
+ to->tlb_ctl = from->tlb_ctl;
+ to->int_ctl = from->int_ctl;
+ to->int_vector = from->int_vector;
+ to->int_state = from->int_state;
+ to->exit_code = from->exit_code;
+ to->exit_code_hi = from->exit_code_hi;
+ to->exit_info_1 = from->exit_info_1;
+ to->exit_info_2 = from->exit_info_2;
+ to->exit_int_info = from->exit_int_info;
+ to->exit_int_info_err = from->exit_int_info_err;
+ to->nested_ctl = from->nested_ctl;
+ to->event_inj = from->event_inj;
+ to->event_inj_err = from->event_inj_err;
+ to->next_rip = from->next_rip;
+ to->nested_cr3 = from->nested_cr3;
+ to->virt_ext = from->virt_ext;
+ to->pause_filter_count = from->pause_filter_count;
+ to->pause_filter_thresh = from->pause_filter_thresh;
+
+ /* Copy asid here because nested_vmcb_check_controls will check it. */
+ to->asid = from->asid;
+ to->msrpm_base_pa &= ~0x0fffULL;
+ to->iopm_base_pa &= ~0x0fffULL;
+
+#ifdef CONFIG_KVM_HYPERV
+ /* Hyper-V extensions (Enlightened VMCB) */
+ if (kvm_hv_hypercall_enabled(vcpu)) {
+ to->clean = from->clean;
+ memcpy(&to->hv_enlightenments, &from->hv_enlightenments,
+ sizeof(to->hv_enlightenments));
+ }
+#endif
+}
+
+void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm,
+ struct vmcb_control_area *control)
+{
+ __nested_copy_vmcb_control_to_cache(&svm->vcpu, &svm->nested.ctl, control);
+}
+
+static void __nested_copy_vmcb_save_to_cache(struct vmcb_save_area_cached *to,
+ struct vmcb_save_area *from)
+{
+ /*
+ * Copy only fields that are validated, as we need them
+	 * to avoid TOCTOU (time-of-check/time-of-use) races.
+ */
+ to->efer = from->efer;
+ to->cr0 = from->cr0;
+ to->cr3 = from->cr3;
+ to->cr4 = from->cr4;
+
+ to->dr6 = from->dr6;
+ to->dr7 = from->dr7;
+}
+
+void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm,
+ struct vmcb_save_area *save)
+{
+ __nested_copy_vmcb_save_to_cache(&svm->nested.save, save);
+}
+
+/*
+ * Synchronize fields that are written by the processor, so that
+ * they can be copied back into the vmcb12.
+ */
+void nested_sync_control_from_vmcb02(struct vcpu_svm *svm)
+{
+ u32 mask;
+ svm->nested.ctl.event_inj = svm->vmcb->control.event_inj;
+ svm->nested.ctl.event_inj_err = svm->vmcb->control.event_inj_err;
+
+ /* Only a few fields of int_ctl are written by the processor. */
+ mask = V_IRQ_MASK | V_TPR_MASK;
+ /*
+ * Don't sync vmcb02 V_IRQ back to vmcb12 if KVM (L0) is intercepting
+ * virtual interrupts in order to request an interrupt window, as KVM
+ * has usurped vmcb02's int_ctl. If an interrupt window opens before
+ * the next VM-Exit, svm_clear_vintr() will restore vmcb12's int_ctl.
+ * If no window opens, V_IRQ will be correctly preserved in vmcb12's
+ * int_ctl (because it was never recognized while L2 was running).
+ */
+ if (svm_is_intercept(svm, INTERCEPT_VINTR) &&
+ !test_bit(INTERCEPT_VINTR, (unsigned long *)svm->nested.ctl.intercepts))
+ mask &= ~V_IRQ_MASK;
+
+ if (nested_vgif_enabled(svm))
+ mask |= V_GIF_MASK;
+
+ if (nested_vnmi_enabled(svm))
+ mask |= V_NMI_BLOCKING_MASK | V_NMI_PENDING_MASK;
+
+ svm->nested.ctl.int_ctl &= ~mask;
+ svm->nested.ctl.int_ctl |= svm->vmcb->control.int_ctl & mask;
+}
+
+/*
+ * Transfer any event that L0 or L1 wanted to inject into L2 to
+ * EXIT_INT_INFO.
+ */
+static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm,
+ struct vmcb *vmcb12)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ u32 exit_int_info = 0;
+ unsigned int nr;
+
+ if (vcpu->arch.exception.injected) {
+ nr = vcpu->arch.exception.vector;
+ exit_int_info = nr | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT;
+
+ if (vcpu->arch.exception.has_error_code) {
+ exit_int_info |= SVM_EVTINJ_VALID_ERR;
+ vmcb12->control.exit_int_info_err =
+ vcpu->arch.exception.error_code;
+ }
+
+ } else if (vcpu->arch.nmi_injected) {
+ exit_int_info = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
+
+ } else if (vcpu->arch.interrupt.injected) {
+ nr = vcpu->arch.interrupt.nr;
+ exit_int_info = nr | SVM_EVTINJ_VALID;
+
+ if (vcpu->arch.interrupt.soft)
+ exit_int_info |= SVM_EVTINJ_TYPE_SOFT;
+ else
+ exit_int_info |= SVM_EVTINJ_TYPE_INTR;
+ }
+
+ vmcb12->control.exit_int_info = exit_int_info;
+}
+
+static void nested_svm_transition_tlb_flush(struct kvm_vcpu *vcpu)
+{
+ /* Handle pending Hyper-V TLB flush requests */
+ kvm_hv_nested_transtion_tlb_flush(vcpu, npt_enabled);
+
+ /*
+ * TODO: optimize unconditional TLB flush/MMU sync. A partial list of
+ * things to fix before this can be conditional:
+ *
+ * - Flush TLBs for both L1 and L2 remote TLB flush
+ * - Honor L1's request to flush an ASID on nested VMRUN
+ * - Sync nested NPT MMU on VMRUN that flushes L2's ASID[*]
+ * - Don't crush a pending TLB flush in vmcb02 on nested VMRUN
+ * - Flush L1's ASID on KVM_REQ_TLB_FLUSH_GUEST
+ *
+ * [*] Unlike nested EPT, SVM's ASID management can invalidate nested
+ * NPT guest-physical mappings on VMRUN.
+ */
+ kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+}
+
+/*
+ * Load guest's/host's cr3 on nested vmentry or vmexit. @nested_npt is true
+ * if we are emulating VM-Entry into a guest with NPT enabled.
+ */
+static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
+ bool nested_npt, bool reload_pdptrs)
+{
+ if (CC(!kvm_vcpu_is_legal_cr3(vcpu, cr3)))
+ return -EINVAL;
+
+ if (reload_pdptrs && !nested_npt && is_pae_paging(vcpu) &&
+ CC(!load_pdptrs(vcpu, cr3)))
+ return -EINVAL;
+
+ vcpu->arch.cr3 = cr3;
+
+ /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
+ kvm_init_mmu(vcpu);
+
+ if (!nested_npt)
+ kvm_mmu_new_pgd(vcpu, cr3);
+
+ return 0;
+}
+
+void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm)
+{
+ if (!svm->nested.vmcb02.ptr)
+ return;
+
+ /* FIXME: merge g_pat from vmcb01 and vmcb12. */
+ svm->nested.vmcb02.ptr->save.g_pat = svm->vmcb01.ptr->save.g_pat;
+}
+
+static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
+{
+ bool new_vmcb12 = false;
+ struct vmcb *vmcb01 = svm->vmcb01.ptr;
+ struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+
+ nested_vmcb02_compute_g_pat(svm);
+ vmcb_mark_dirty(vmcb02, VMCB_NPT);
+
+ /* Load the nested guest state */
+ if (svm->nested.vmcb12_gpa != svm->nested.last_vmcb12_gpa) {
+ new_vmcb12 = true;
+ svm->nested.last_vmcb12_gpa = svm->nested.vmcb12_gpa;
+ svm->nested.force_msr_bitmap_recalc = true;
+ }
+
+ if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_SEG))) {
+ vmcb02->save.es = vmcb12->save.es;
+ vmcb02->save.cs = vmcb12->save.cs;
+ vmcb02->save.ss = vmcb12->save.ss;
+ vmcb02->save.ds = vmcb12->save.ds;
+ vmcb02->save.cpl = vmcb12->save.cpl;
+ vmcb_mark_dirty(vmcb02, VMCB_SEG);
+ }
+
+ if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DT))) {
+ vmcb02->save.gdtr = vmcb12->save.gdtr;
+ vmcb02->save.idtr = vmcb12->save.idtr;
+ vmcb_mark_dirty(vmcb02, VMCB_DT);
+ }
+
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) &&
+ (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_CET)))) {
+ vmcb02->save.s_cet = vmcb12->save.s_cet;
+ vmcb02->save.isst_addr = vmcb12->save.isst_addr;
+ vmcb02->save.ssp = vmcb12->save.ssp;
+ vmcb_mark_dirty(vmcb02, VMCB_CET);
+ }
+
+ kvm_set_rflags(vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED);
+
+ svm_set_efer(vcpu, svm->nested.save.efer);
+
+ svm_set_cr0(vcpu, svm->nested.save.cr0);
+ svm_set_cr4(vcpu, svm->nested.save.cr4);
+
+ svm->vcpu.arch.cr2 = vmcb12->save.cr2;
+
+ kvm_rax_write(vcpu, vmcb12->save.rax);
+ kvm_rsp_write(vcpu, vmcb12->save.rsp);
+ kvm_rip_write(vcpu, vmcb12->save.rip);
+
+ /* In case we don't even reach vcpu_run, the fields are not updated */
+ vmcb02->save.rax = vmcb12->save.rax;
+ vmcb02->save.rsp = vmcb12->save.rsp;
+ vmcb02->save.rip = vmcb12->save.rip;
+
+	/* These bits will be set properly on the first execution when new_vmcb12 is true */
+ if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) {
+ vmcb02->save.dr7 = svm->nested.save.dr7 | DR7_FIXED_1;
+ svm->vcpu.arch.dr6 = svm->nested.save.dr6 | DR6_ACTIVE_LOW;
+ vmcb_mark_dirty(vmcb02, VMCB_DR);
+ }
+
+ if (unlikely(guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) &&
+ (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
+ /*
+ * Reserved bits of DEBUGCTL are ignored. Be consistent with
+ * svm_set_msr's definition of reserved bits.
+ */
+ svm_copy_lbrs(vmcb02, vmcb12);
+ vmcb02->save.dbgctl &= ~DEBUGCTL_RESERVED_BITS;
+ } else {
+ svm_copy_lbrs(vmcb02, vmcb01);
+ }
+ svm_update_lbrv(&svm->vcpu);
+}
+
+static inline bool is_evtinj_soft(u32 evtinj)
+{
+ u32 type = evtinj & SVM_EVTINJ_TYPE_MASK;
+ u8 vector = evtinj & SVM_EVTINJ_VEC_MASK;
+
+ if (!(evtinj & SVM_EVTINJ_VALID))
+ return false;
+
+ if (type == SVM_EVTINJ_TYPE_SOFT)
+ return true;
+
+ return type == SVM_EVTINJ_TYPE_EXEPT && kvm_exception_is_soft(vector);
+}
+
+static bool is_evtinj_nmi(u32 evtinj)
+{
+ u32 type = evtinj & SVM_EVTINJ_TYPE_MASK;
+
+ if (!(evtinj & SVM_EVTINJ_VALID))
+ return false;
+
+ return type == SVM_EVTINJ_TYPE_NMI;
+}
+
+static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
+ unsigned long vmcb12_rip,
+ unsigned long vmcb12_csbase)
+{
+ u32 int_ctl_vmcb01_bits = V_INTR_MASKING_MASK;
+ u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK;
+
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct vmcb *vmcb01 = svm->vmcb01.ptr;
+ struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
+ u32 pause_count12;
+ u32 pause_thresh12;
+
+ nested_svm_transition_tlb_flush(vcpu);
+
+ /* Enter Guest-Mode */
+ enter_guest_mode(vcpu);
+
+ /*
+ * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2,
+ * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes.
+ */
+
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_VGIF) &&
+ (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK))
+ int_ctl_vmcb12_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
+ else
+ int_ctl_vmcb01_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
+
+ if (vnmi) {
+ if (vmcb01->control.int_ctl & V_NMI_PENDING_MASK) {
+ svm->vcpu.arch.nmi_pending++;
+ kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+ }
+ if (nested_vnmi_enabled(svm))
+ int_ctl_vmcb12_bits |= (V_NMI_PENDING_MASK |
+ V_NMI_ENABLE_MASK |
+ V_NMI_BLOCKING_MASK);
+ }
+
+ /* Copied from vmcb01. msrpm_base can be overwritten later. */
+ vmcb02->control.nested_ctl = vmcb01->control.nested_ctl;
+ vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa;
+ vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa;
+ vmcb_mark_dirty(vmcb02, VMCB_PERM_MAP);
+
+ /*
+ * Stash vmcb02's counter if the guest hasn't moved past the guilty
+ * instruction; otherwise, reset the counter to '0'.
+ *
+ * In order to detect if L2 has made forward progress or not, track the
+	 * RIP at which a bus lock has occurred on a per-vmcb12 basis.  If RIP
+	 * has changed, the guest has clearly made forward progress even though
+	 * bus_lock_counter remained '1', so reset bus_lock_counter to '0'.
+	 * E.g. in the scenario where a bus lock happened in L1 before VMRUN,
+	 * the bus lock firmly happened on an instruction in the past.  Even if
+	 * vmcb01's counter is still '1' (because the guilty instruction got
+	 * patched), the vCPU has clearly made forward progress and so KVM
+	 * should reset vmcb02's counter to '0'.
+	 *
+	 * If the RIP hasn't changed, stash the bus lock counter at nested VMRUN
+	 * to prevent the same guilty instruction from triggering a VM-Exit.  E.g.
+	 * if userspace rate-limits the vCPU, then it's entirely possible that
+	 * L1's tick interrupt is pending by the time userspace re-runs the
+	 * vCPU.  If KVM unconditionally clears the counter on VMRUN, then when
+	 * L1 re-enters L2, the same instruction will trigger a VM-Exit and the
+	 * entire cycle starts over.
+ */
+ if (vmcb02->save.rip && (svm->nested.ctl.bus_lock_rip == vmcb02->save.rip))
+ vmcb02->control.bus_lock_counter = 1;
+ else
+ vmcb02->control.bus_lock_counter = 0;
+
+ /* Done at vmrun: asid. */
+
+ /* Also overwritten later if necessary. */
+ vmcb02->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
+
+ /* nested_cr3. */
+ if (nested_npt_enabled(svm))
+ nested_svm_init_mmu_context(vcpu);
+
+ vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
+ vcpu->arch.l1_tsc_offset,
+ svm->nested.ctl.tsc_offset,
+ svm->tsc_ratio_msr);
+
+ vmcb02->control.tsc_offset = vcpu->arch.tsc_offset;
+
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR) &&
+ svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio)
+ nested_svm_update_tsc_ratio_msr(vcpu);
+
+ vmcb02->control.int_ctl =
+ (svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) |
+ (vmcb01->control.int_ctl & int_ctl_vmcb01_bits);
+
+ vmcb02->control.int_vector = svm->nested.ctl.int_vector;
+ vmcb02->control.int_state = svm->nested.ctl.int_state;
+ vmcb02->control.event_inj = svm->nested.ctl.event_inj;
+ vmcb02->control.event_inj_err = svm->nested.ctl.event_inj_err;
+
+ /*
+ * next_rip is consumed on VMRUN as the return address pushed on the
+ * stack for injected soft exceptions/interrupts. If nrips is exposed
+ * to L1, take it verbatim from vmcb12. If nrips is supported in
+ * hardware but not exposed to L1, stuff the actual L2 RIP to emulate
+ * what a nrips=0 CPU would do (L1 is responsible for advancing RIP
+ * prior to injecting the event).
+ */
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
+ vmcb02->control.next_rip = svm->nested.ctl.next_rip;
+ else if (boot_cpu_has(X86_FEATURE_NRIPS))
+ vmcb02->control.next_rip = vmcb12_rip;
+
+ svm->nmi_l1_to_l2 = is_evtinj_nmi(vmcb02->control.event_inj);
+ if (is_evtinj_soft(vmcb02->control.event_inj)) {
+ svm->soft_int_injected = true;
+ svm->soft_int_csbase = vmcb12_csbase;
+ svm->soft_int_old_rip = vmcb12_rip;
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
+ svm->soft_int_next_rip = svm->nested.ctl.next_rip;
+ else
+ svm->soft_int_next_rip = vmcb12_rip;
+ }
+
+ /* LBR_CTL_ENABLE_MASK is controlled by svm_update_lbrv() */
+
+ if (!nested_vmcb_needs_vls_intercept(svm))
+ vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_PAUSEFILTER))
+ pause_count12 = svm->nested.ctl.pause_filter_count;
+ else
+ pause_count12 = 0;
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_PFTHRESHOLD))
+ pause_thresh12 = svm->nested.ctl.pause_filter_thresh;
+ else
+ pause_thresh12 = 0;
+ if (kvm_pause_in_guest(svm->vcpu.kvm)) {
+ /* use guest values since host doesn't intercept PAUSE */
+ vmcb02->control.pause_filter_count = pause_count12;
+ vmcb02->control.pause_filter_thresh = pause_thresh12;
+
+ } else {
+ /* start from host values otherwise */
+ vmcb02->control.pause_filter_count = vmcb01->control.pause_filter_count;
+ vmcb02->control.pause_filter_thresh = vmcb01->control.pause_filter_thresh;
+
+ /* ... but ensure filtering is disabled if so requested. */
+ if (vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_PAUSE)) {
+ if (!pause_count12)
+ vmcb02->control.pause_filter_count = 0;
+ if (!pause_thresh12)
+ vmcb02->control.pause_filter_thresh = 0;
+ }
+ }
+
+ /*
+ * Merge guest and host intercepts - must be called with vcpu in
+ * guest-mode to take effect.
+ */
+ recalc_intercepts(svm);
+}
+
+static void nested_svm_copy_common_state(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
+{
+ /*
+ * Some VMCB state is shared between L1 and L2 and thus has to be
+ * moved at the time of nested vmrun and vmexit.
+ *
+ * VMLOAD/VMSAVE state would also belong in this category, but KVM
+ * always performs VMLOAD and VMSAVE from the VMCB01.
+ */
+ to_vmcb->save.spec_ctrl = from_vmcb->save.spec_ctrl;
+}
+
+int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa,
+ struct vmcb *vmcb12, bool from_vmrun)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int ret;
+
+ trace_kvm_nested_vmenter(svm->vmcb->save.rip,
+ vmcb12_gpa,
+ vmcb12->save.rip,
+ vmcb12->control.int_ctl,
+ vmcb12->control.event_inj,
+ vmcb12->control.nested_ctl,
+ vmcb12->control.nested_cr3,
+ vmcb12->save.cr3,
+ KVM_ISA_SVM);
+
+ trace_kvm_nested_intercepts(vmcb12->control.intercepts[INTERCEPT_CR] & 0xffff,
+ vmcb12->control.intercepts[INTERCEPT_CR] >> 16,
+ vmcb12->control.intercepts[INTERCEPT_EXCEPTION],
+ vmcb12->control.intercepts[INTERCEPT_WORD3],
+ vmcb12->control.intercepts[INTERCEPT_WORD4],
+ vmcb12->control.intercepts[INTERCEPT_WORD5]);
+
+
+ svm->nested.vmcb12_gpa = vmcb12_gpa;
+
+ WARN_ON(svm->vmcb == svm->nested.vmcb02.ptr);
+
+ nested_svm_copy_common_state(svm->vmcb01.ptr, svm->nested.vmcb02.ptr);
+
+ svm_switch_vmcb(svm, &svm->nested.vmcb02);
+ nested_vmcb02_prepare_control(svm, vmcb12->save.rip, vmcb12->save.cs.base);
+ nested_vmcb02_prepare_save(svm, vmcb12);
+
+ ret = nested_svm_load_cr3(&svm->vcpu, svm->nested.save.cr3,
+ nested_npt_enabled(svm), from_vmrun);
+ if (ret)
+ return ret;
+
+ if (!from_vmrun)
+ kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
+
+ svm_set_gif(svm, true);
+
+ if (kvm_vcpu_apicv_active(vcpu))
+ kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+
+ nested_svm_hv_update_vm_vp_ids(vcpu);
+
+ return 0;
+}
+
+int nested_svm_vmrun(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int ret;
+ struct vmcb *vmcb12;
+ struct kvm_host_map map;
+ u64 vmcb12_gpa;
+ struct vmcb *vmcb01 = svm->vmcb01.ptr;
+
+ if (!svm->nested.hsave_msr) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ if (is_smm(vcpu)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ /* This fails when VP assist page is enabled but the supplied GPA is bogus */
+ ret = kvm_hv_verify_vp_assist(vcpu);
+ if (ret) {
+ kvm_inject_gp(vcpu, 0);
+ return ret;
+ }
+
+ vmcb12_gpa = svm->vmcb->save.rax;
+ ret = kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map);
+ if (ret == -EINVAL) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ } else if (ret) {
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ ret = kvm_skip_emulated_instruction(vcpu);
+
+ vmcb12 = map.hva;
+
+ if (WARN_ON_ONCE(!svm->nested.initialized))
+ return -EINVAL;
+
+ nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
+ nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
+
+ if (!nested_vmcb_check_save(vcpu) ||
+ !nested_vmcb_check_controls(vcpu)) {
+ vmcb12->control.exit_code = SVM_EXIT_ERR;
+ vmcb12->control.exit_code_hi = 0;
+ vmcb12->control.exit_info_1 = 0;
+ vmcb12->control.exit_info_2 = 0;
+ goto out;
+ }
+
+ /*
+ * Since vmcb01 is not in use, we can use it to store some of the L1
+ * state.
+ */
+ vmcb01->save.efer = vcpu->arch.efer;
+ vmcb01->save.cr0 = kvm_read_cr0(vcpu);
+ vmcb01->save.cr4 = vcpu->arch.cr4;
+ vmcb01->save.rflags = kvm_get_rflags(vcpu);
+ vmcb01->save.rip = kvm_rip_read(vcpu);
+
+ if (!npt_enabled)
+ vmcb01->save.cr3 = kvm_read_cr3(vcpu);
+
+ svm->nested.nested_run_pending = 1;
+
+ if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, true))
+ goto out_exit_err;
+
+ if (nested_svm_merge_msrpm(vcpu))
+ goto out;
+
+out_exit_err:
+ svm->nested.nested_run_pending = 0;
+ svm->nmi_l1_to_l2 = false;
+ svm->soft_int_injected = false;
+
+ svm->vmcb->control.exit_code = SVM_EXIT_ERR;
+ svm->vmcb->control.exit_code_hi = 0;
+ svm->vmcb->control.exit_info_1 = 0;
+ svm->vmcb->control.exit_info_2 = 0;
+
+ nested_svm_vmexit(svm);
+
+out:
+ kvm_vcpu_unmap(vcpu, &map);
+
+ return ret;
+}
+
+/* Copy state save area fields which are handled by VMRUN */
+void svm_copy_vmrun_state(struct vmcb_save_area *to_save,
+ struct vmcb_save_area *from_save)
+{
+ to_save->es = from_save->es;
+ to_save->cs = from_save->cs;
+ to_save->ss = from_save->ss;
+ to_save->ds = from_save->ds;
+ to_save->gdtr = from_save->gdtr;
+ to_save->idtr = from_save->idtr;
+ to_save->rflags = from_save->rflags | X86_EFLAGS_FIXED;
+ to_save->efer = from_save->efer;
+ to_save->cr0 = from_save->cr0;
+ to_save->cr3 = from_save->cr3;
+ to_save->cr4 = from_save->cr4;
+ to_save->rax = from_save->rax;
+ to_save->rsp = from_save->rsp;
+ to_save->rip = from_save->rip;
+ to_save->cpl = 0;
+
+ if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) {
+ to_save->s_cet = from_save->s_cet;
+ to_save->isst_addr = from_save->isst_addr;
+ to_save->ssp = from_save->ssp;
+ }
+}
+
+void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
+{
+ to_vmcb->save.fs = from_vmcb->save.fs;
+ to_vmcb->save.gs = from_vmcb->save.gs;
+ to_vmcb->save.tr = from_vmcb->save.tr;
+ to_vmcb->save.ldtr = from_vmcb->save.ldtr;
+ to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base;
+ to_vmcb->save.star = from_vmcb->save.star;
+ to_vmcb->save.lstar = from_vmcb->save.lstar;
+ to_vmcb->save.cstar = from_vmcb->save.cstar;
+ to_vmcb->save.sfmask = from_vmcb->save.sfmask;
+ to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
+ to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
+ to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
+}
+
+int nested_svm_vmexit(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct vmcb *vmcb01 = svm->vmcb01.ptr;
+ struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
+ struct vmcb *vmcb12;
+ struct kvm_host_map map;
+ int rc;
+
+ rc = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
+ if (rc) {
+ if (rc == -EINVAL)
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ vmcb12 = map.hva;
+
+ /* Exit Guest-Mode */
+ leave_guest_mode(vcpu);
+ svm->nested.vmcb12_gpa = 0;
+ WARN_ON_ONCE(svm->nested.nested_run_pending);
+
+ kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
+
+ /* in case we halted in L2 */
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
+
+ /* Give the current vmcb to the guest */
+
+ vmcb12->save.es = vmcb02->save.es;
+ vmcb12->save.cs = vmcb02->save.cs;
+ vmcb12->save.ss = vmcb02->save.ss;
+ vmcb12->save.ds = vmcb02->save.ds;
+ vmcb12->save.gdtr = vmcb02->save.gdtr;
+ vmcb12->save.idtr = vmcb02->save.idtr;
+ vmcb12->save.efer = svm->vcpu.arch.efer;
+ vmcb12->save.cr0 = kvm_read_cr0(vcpu);
+ vmcb12->save.cr3 = kvm_read_cr3(vcpu);
+ vmcb12->save.cr2 = vmcb02->save.cr2;
+ vmcb12->save.cr4 = svm->vcpu.arch.cr4;
+ vmcb12->save.rflags = kvm_get_rflags(vcpu);
+ vmcb12->save.rip = kvm_rip_read(vcpu);
+ vmcb12->save.rsp = kvm_rsp_read(vcpu);
+ vmcb12->save.rax = kvm_rax_read(vcpu);
+ vmcb12->save.dr7 = vmcb02->save.dr7;
+ vmcb12->save.dr6 = svm->vcpu.arch.dr6;
+ vmcb12->save.cpl = vmcb02->save.cpl;
+
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) {
+ vmcb12->save.s_cet = vmcb02->save.s_cet;
+ vmcb12->save.isst_addr = vmcb02->save.isst_addr;
+ vmcb12->save.ssp = vmcb02->save.ssp;
+ }
+
+ vmcb12->control.int_state = vmcb02->control.int_state;
+ vmcb12->control.exit_code = vmcb02->control.exit_code;
+ vmcb12->control.exit_code_hi = vmcb02->control.exit_code_hi;
+ vmcb12->control.exit_info_1 = vmcb02->control.exit_info_1;
+ vmcb12->control.exit_info_2 = vmcb02->control.exit_info_2;
+
+ if (vmcb12->control.exit_code != SVM_EXIT_ERR)
+ nested_save_pending_event_to_vmcb12(svm, vmcb12);
+
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
+ vmcb12->control.next_rip = vmcb02->control.next_rip;
+
+ vmcb12->control.int_ctl = svm->nested.ctl.int_ctl;
+ vmcb12->control.event_inj = svm->nested.ctl.event_inj;
+ vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err;
+
+ if (!kvm_pause_in_guest(vcpu->kvm)) {
+ vmcb01->control.pause_filter_count = vmcb02->control.pause_filter_count;
+ vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS);
+
+ }
+
+ /*
+ * Invalidate bus_lock_rip unless KVM is still waiting for the guest
+ * to make forward progress before re-enabling bus lock detection.
+ */
+ if (!vmcb02->control.bus_lock_counter)
+ svm->nested.ctl.bus_lock_rip = INVALID_GPA;
+
+ nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);
+
+ kvm_nested_vmexit_handle_ibrs(vcpu);
+
+ svm_switch_vmcb(svm, &svm->vmcb01);
+
+ /*
+ * Rules for synchronizing int_ctl bits from vmcb02 to vmcb01:
+ *
+ * V_IRQ, V_IRQ_VECTOR, V_INTR_PRIO_MASK, V_IGN_TPR: If L1 doesn't
+ * intercept interrupts, then KVM will use vmcb02's V_IRQ (and related
+ * flags) to detect interrupt windows for L1 IRQs (even if L1 uses
+ * virtual interrupt masking). Raise KVM_REQ_EVENT to ensure that
+ * KVM re-requests an interrupt window if necessary, which implicitly
+	 * copies these bits from vmcb02 to vmcb01.
+ *
+ * V_TPR: If L1 doesn't use virtual interrupt masking, then L1's vTPR
+ * is stored in vmcb02, but its value doesn't need to be copied from/to
+ * vmcb01 because it is copied from/to the virtual APIC's TPR register
+ * on each VM entry/exit.
+ *
+ * V_GIF: If nested vGIF is not used, KVM uses vmcb02's V_GIF for L1's
+	 * V_GIF.  However, GIF is architecturally cleared on each VM exit, thus
+ * there is no need to copy V_GIF from vmcb02 to vmcb01.
+ */
+ if (!nested_exit_on_intr(svm))
+ kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+
+ if (unlikely(guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) &&
+ (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK)))
+ svm_copy_lbrs(vmcb12, vmcb02);
+ else
+ svm_copy_lbrs(vmcb01, vmcb02);
+
+ svm_update_lbrv(vcpu);
+
+ if (vnmi) {
+ if (vmcb02->control.int_ctl & V_NMI_BLOCKING_MASK)
+ vmcb01->control.int_ctl |= V_NMI_BLOCKING_MASK;
+ else
+ vmcb01->control.int_ctl &= ~V_NMI_BLOCKING_MASK;
+
+ if (vcpu->arch.nmi_pending) {
+ vcpu->arch.nmi_pending--;
+ vmcb01->control.int_ctl |= V_NMI_PENDING_MASK;
+ } else {
+ vmcb01->control.int_ctl &= ~V_NMI_PENDING_MASK;
+ }
+ }
+
+ /*
+ * On vmexit the GIF is set to false and
+ * no event can be injected in L1.
+ */
+ svm_set_gif(svm, false);
+ vmcb01->control.exit_int_info = 0;
+
+ svm->vcpu.arch.tsc_offset = svm->vcpu.arch.l1_tsc_offset;
+ if (vmcb01->control.tsc_offset != svm->vcpu.arch.tsc_offset) {
+ vmcb01->control.tsc_offset = svm->vcpu.arch.tsc_offset;
+ vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS);
+ }
+
+ if (kvm_caps.has_tsc_control &&
+ vcpu->arch.tsc_scaling_ratio != vcpu->arch.l1_tsc_scaling_ratio) {
+ vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio;
+ svm_write_tsc_multiplier(vcpu);
+ }
+
+ svm->nested.ctl.nested_cr3 = 0;
+
+ /*
+ * Restore processor state that had been saved in vmcb01
+ */
+ kvm_set_rflags(vcpu, vmcb01->save.rflags);
+ svm_set_efer(vcpu, vmcb01->save.efer);
+ svm_set_cr0(vcpu, vmcb01->save.cr0 | X86_CR0_PE);
+ svm_set_cr4(vcpu, vmcb01->save.cr4);
+ kvm_rax_write(vcpu, vmcb01->save.rax);
+ kvm_rsp_write(vcpu, vmcb01->save.rsp);
+ kvm_rip_write(vcpu, vmcb01->save.rip);
+
+ svm->vcpu.arch.dr7 = DR7_FIXED_1;
+ kvm_update_dr7(&svm->vcpu);
+
+ trace_kvm_nested_vmexit_inject(vmcb12->control.exit_code,
+ vmcb12->control.exit_info_1,
+ vmcb12->control.exit_info_2,
+ vmcb12->control.exit_int_info,
+ vmcb12->control.exit_int_info_err,
+ KVM_ISA_SVM);
+
+ kvm_vcpu_unmap(vcpu, &map);
+
+ nested_svm_transition_tlb_flush(vcpu);
+
+ nested_svm_uninit_mmu_context(vcpu);
+
+ rc = nested_svm_load_cr3(vcpu, vmcb01->save.cr3, false, true);
+ if (rc)
+ return 1;
+
+ /*
+ * Drop what we picked up for L2 via svm_complete_interrupts() so it
+ * doesn't end up in L1.
+ */
+ svm->vcpu.arch.nmi_injected = false;
+ kvm_clear_exception_queue(vcpu);
+ kvm_clear_interrupt_queue(vcpu);
+
+ /*
+ * If we are here following the completion of a VMRUN that
+ * is being single-stepped, queue the pending #DB intercept
+	 * right now so that it can be accounted for before we execute
+ * L1's next instruction.
+ */
+ if (unlikely(vmcb01->save.rflags & X86_EFLAGS_TF))
+ kvm_queue_exception(&(svm->vcpu), DB_VECTOR);
+
+ /*
+ * Un-inhibit the AVIC right away, so that other vCPUs can start
+ * to benefit from it right away.
+ */
+ if (kvm_apicv_activated(vcpu->kvm))
+ __kvm_vcpu_update_apicv(vcpu);
+
+ return 0;
+}
+
+static void nested_svm_triple_fault(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SHUTDOWN))
+ return;
+
+ kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ nested_svm_simple_vmexit(to_svm(vcpu), SVM_EXIT_SHUTDOWN);
+}
+
+int svm_allocate_nested(struct vcpu_svm *svm)
+{
+ struct page *vmcb02_page;
+
+ if (svm->nested.initialized)
+ return 0;
+
+ vmcb02_page = snp_safe_alloc_page();
+ if (!vmcb02_page)
+ return -ENOMEM;
+ svm->nested.vmcb02.ptr = page_address(vmcb02_page);
+ svm->nested.vmcb02.pa = __sme_set(page_to_pfn(vmcb02_page) << PAGE_SHIFT);
+
+ svm->nested.msrpm = svm_vcpu_alloc_msrpm();
+ if (!svm->nested.msrpm)
+ goto err_free_vmcb02;
+
+ svm->nested.initialized = true;
+ return 0;
+
+err_free_vmcb02:
+ __free_page(vmcb02_page);
+ return -ENOMEM;
+}
+
+void svm_free_nested(struct vcpu_svm *svm)
+{
+ if (!svm->nested.initialized)
+ return;
+
+ if (WARN_ON_ONCE(svm->vmcb != svm->vmcb01.ptr))
+ svm_switch_vmcb(svm, &svm->vmcb01);
+
+ svm_vcpu_free_msrpm(svm->nested.msrpm);
+ svm->nested.msrpm = NULL;
+
+ __free_page(virt_to_page(svm->nested.vmcb02.ptr));
+ svm->nested.vmcb02.ptr = NULL;
+
+ /*
+ * When last_vmcb12_gpa matches the current vmcb12 gpa,
+ * some vmcb12 fields are not loaded if they are marked clean
+ * in the vmcb12, since in this case they are up to date already.
+ *
+ * When the vmcb02 is freed, this optimization becomes invalid.
+ */
+ svm->nested.last_vmcb12_gpa = INVALID_GPA;
+
+ svm->nested.initialized = false;
+}
+
+void svm_leave_nested(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (is_guest_mode(vcpu)) {
+ svm->nested.nested_run_pending = 0;
+ svm->nested.vmcb12_gpa = INVALID_GPA;
+
+ leave_guest_mode(vcpu);
+
+ svm_switch_vmcb(svm, &svm->vmcb01);
+
+ nested_svm_uninit_mmu_context(vcpu);
+ vmcb_mark_all_dirty(svm->vmcb);
+
+ if (kvm_apicv_activated(vcpu->kvm))
+ kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+ }
+
+ kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
+}
+
+static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
+{
+ gpa_t base = svm->nested.ctl.msrpm_base_pa;
+ int write, bit_nr;
+ u8 value, mask;
+ u32 msr;
+
+ if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
+ return NESTED_EXIT_HOST;
+
+ msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
+ bit_nr = svm_msrpm_bit_nr(msr);
+ write = svm->vmcb->control.exit_info_1 & 1;
+
+ if (bit_nr < 0)
+ return NESTED_EXIT_DONE;
+
+ if (kvm_vcpu_read_guest(&svm->vcpu, base + bit_nr / BITS_PER_BYTE,
+ &value, sizeof(value)))
+ return NESTED_EXIT_DONE;
+
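+	/*
+	 * Each MSR is represented by a pair of adjacent bits in the MSRPM: the
+	 * lower bit intercepts reads and the next bit intercepts writes, so
+	 * shift the probe by one when exit_info_1 indicates a write access.
+	 */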
+ mask = BIT(write) << (bit_nr & (BITS_PER_BYTE - 1));
+ return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
+}
+
+static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
+{
+ unsigned port, size, iopm_len;
+ u16 val, mask;
+ u8 start_bit;
+ u64 gpa;
+
+ if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_IOIO_PROT)))
+ return NESTED_EXIT_HOST;
+
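+	/*
+	 * Locate this port's bits in L1's I/O permission bitmap (one bit per
+	 * port).  E.g. a hypothetical 1-byte access to port 0x3fd lands at
+	 * byte 0x7f with start_bit 5 and mask 0x20; an access whose bits
+	 * straddle a byte boundary is checked by reading two bytes.
+	 */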
+ port = svm->vmcb->control.exit_info_1 >> 16;
+ size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >>
+ SVM_IOIO_SIZE_SHIFT;
+ gpa = svm->nested.ctl.iopm_base_pa + (port / 8);
+ start_bit = port % 8;
+ iopm_len = (start_bit + size > 8) ? 2 : 1;
+ mask = (0xf >> (4 - size)) << start_bit;
+ val = 0;
+
+ if (kvm_vcpu_read_guest(&svm->vcpu, gpa, &val, iopm_len))
+ return NESTED_EXIT_DONE;
+
+ return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
+}
+
+static int nested_svm_intercept(struct vcpu_svm *svm)
+{
+ u32 exit_code = svm->vmcb->control.exit_code;
+ int vmexit = NESTED_EXIT_HOST;
+
+ switch (exit_code) {
+ case SVM_EXIT_MSR:
+ vmexit = nested_svm_exit_handled_msr(svm);
+ break;
+ case SVM_EXIT_IOIO:
+ vmexit = nested_svm_intercept_ioio(svm);
+ break;
+ case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
+ /*
+ * Host-intercepted exceptions have been checked already in
+		 * nested_svm_exit_special.  There is nothing to do here;
+		 * the vmexit is injected by svm_check_nested_events.
+ */
+ vmexit = NESTED_EXIT_DONE;
+ break;
+ }
+ case SVM_EXIT_ERR: {
+ vmexit = NESTED_EXIT_DONE;
+ break;
+ }
+ default: {
+ if (vmcb12_is_intercept(&svm->nested.ctl, exit_code))
+ vmexit = NESTED_EXIT_DONE;
+ }
+ }
+
+ return vmexit;
+}
+
+int nested_svm_exit_handled(struct vcpu_svm *svm)
+{
+ int vmexit;
+
+ vmexit = nested_svm_intercept(svm);
+
+ if (vmexit == NESTED_EXIT_DONE)
+ nested_svm_vmexit(svm);
+
+ return vmexit;
+}
+
+int nested_svm_check_permissions(struct kvm_vcpu *vcpu)
+{
+ if (!(vcpu->arch.efer & EFER_SVME) || !is_paging(vcpu)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ if (to_svm(vcpu)->vmcb->save.cpl) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ return 0;
+}
+
+static bool nested_svm_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector,
+ u32 error_code)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ return (svm->nested.ctl.intercepts[INTERCEPT_EXCEPTION] & BIT(vector));
+}
+
+static void nested_svm_inject_exception_vmexit(struct kvm_vcpu *vcpu)
+{
+ struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb;
+
+ vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + ex->vector;
+ vmcb->control.exit_code_hi = 0;
+
+ if (ex->has_error_code)
+ vmcb->control.exit_info_1 = ex->error_code;
+
+ /*
+ * EXITINFO2 is undefined for all exception intercepts other
+ * than #PF.
+ */
+ if (ex->vector == PF_VECTOR) {
+ if (ex->has_payload)
+ vmcb->control.exit_info_2 = ex->payload;
+ else
+ vmcb->control.exit_info_2 = vcpu->arch.cr2;
+ } else if (ex->vector == DB_VECTOR) {
+ /* See kvm_check_and_inject_events(). */
+ kvm_deliver_exception_payload(vcpu, ex);
+
+ if (vcpu->arch.dr7 & DR7_GD) {
+ vcpu->arch.dr7 &= ~DR7_GD;
+ kvm_update_dr7(vcpu);
+ }
+ } else {
+ WARN_ON(ex->has_payload);
+ }
+
+ nested_svm_vmexit(svm);
+}
+
+static inline bool nested_exit_on_init(struct vcpu_svm *svm)
+{
+ return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_INIT);
+}
+
+static int svm_check_nested_events(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ /*
+ * Only a pending nested run blocks a pending exception. If there is a
+ * previously injected event, the pending exception occurred while said
+ * event was being delivered and thus needs to be handled.
+ */
+ bool block_nested_exceptions = svm->nested.nested_run_pending;
+ /*
+ * New events (not exceptions) are only recognized at instruction
+ * boundaries. If an event needs reinjection, then KVM is handling a
+ * VM-Exit that occurred _during_ instruction execution; new events are
+ * blocked until the instruction completes.
+ */
+ bool block_nested_events = block_nested_exceptions ||
+ kvm_event_needs_reinjection(vcpu);
+
+ if (lapic_in_kernel(vcpu) &&
+ test_bit(KVM_APIC_INIT, &apic->pending_events)) {
+ if (block_nested_events)
+ return -EBUSY;
+ if (!nested_exit_on_init(svm))
+ return 0;
+ nested_svm_simple_vmexit(svm, SVM_EXIT_INIT);
+ return 0;
+ }
+
+ if (vcpu->arch.exception_vmexit.pending) {
+ if (block_nested_exceptions)
+ return -EBUSY;
+ nested_svm_inject_exception_vmexit(vcpu);
+ return 0;
+ }
+
+ if (vcpu->arch.exception.pending) {
+ if (block_nested_exceptions)
+ return -EBUSY;
+ return 0;
+ }
+
+#ifdef CONFIG_KVM_SMM
+ if (vcpu->arch.smi_pending && !svm_smi_blocked(vcpu)) {
+ if (block_nested_events)
+ return -EBUSY;
+ if (!nested_exit_on_smi(svm))
+ return 0;
+ nested_svm_simple_vmexit(svm, SVM_EXIT_SMI);
+ return 0;
+ }
+#endif
+
+ if (vcpu->arch.nmi_pending && !svm_nmi_blocked(vcpu)) {
+ if (block_nested_events)
+ return -EBUSY;
+ if (!nested_exit_on_nmi(svm))
+ return 0;
+ nested_svm_simple_vmexit(svm, SVM_EXIT_NMI);
+ return 0;
+ }
+
+ if (kvm_cpu_has_interrupt(vcpu) && !svm_interrupt_blocked(vcpu)) {
+ if (block_nested_events)
+ return -EBUSY;
+ if (!nested_exit_on_intr(svm))
+ return 0;
+ trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
+ nested_svm_simple_vmexit(svm, SVM_EXIT_INTR);
+ return 0;
+ }
+
+ return 0;
+}
+
+int nested_svm_exit_special(struct vcpu_svm *svm)
+{
+ u32 exit_code = svm->vmcb->control.exit_code;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+
+ switch (exit_code) {
+ case SVM_EXIT_INTR:
+ case SVM_EXIT_NMI:
+ case SVM_EXIT_NPF:
+ return NESTED_EXIT_HOST;
+ case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
+ u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
+
+ if (svm->vmcb01.ptr->control.intercepts[INTERCEPT_EXCEPTION] &
+ excp_bits)
+ return NESTED_EXIT_HOST;
+ else if (exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR &&
+ svm->vcpu.arch.apf.host_apf_flags)
+ /* Trap async PF even if not shadowing */
+ return NESTED_EXIT_HOST;
+ break;
+ }
+ case SVM_EXIT_VMMCALL:
+ /* Hyper-V L2 TLB flush hypercall is handled by L0 */
+ if (guest_hv_cpuid_has_l2_tlb_flush(vcpu) &&
+ nested_svm_l2_tlb_flush_enabled(vcpu) &&
+ kvm_hv_is_tlb_flush_hcall(vcpu))
+ return NESTED_EXIT_HOST;
+ break;
+ default:
+ break;
+ }
+
+ return NESTED_EXIT_CONTINUE;
+}
+
+void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ vcpu->arch.tsc_scaling_ratio =
+ kvm_calc_nested_tsc_multiplier(vcpu->arch.l1_tsc_scaling_ratio,
+ svm->tsc_ratio_msr);
+ svm_write_tsc_multiplier(vcpu);
+}
+
+/* Inverse operation of nested_copy_vmcb_control_to_cache(). asid is copied too. */
+static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst,
+ struct vmcb_ctrl_area_cached *from)
+{
+ unsigned int i;
+
+ memset(dst, 0, sizeof(struct vmcb_control_area));
+
+ for (i = 0; i < MAX_INTERCEPT; i++)
+ dst->intercepts[i] = from->intercepts[i];
+
+ dst->iopm_base_pa = from->iopm_base_pa;
+ dst->msrpm_base_pa = from->msrpm_base_pa;
+ dst->tsc_offset = from->tsc_offset;
+ dst->asid = from->asid;
+ dst->tlb_ctl = from->tlb_ctl;
+ dst->int_ctl = from->int_ctl;
+ dst->int_vector = from->int_vector;
+ dst->int_state = from->int_state;
+ dst->exit_code = from->exit_code;
+ dst->exit_code_hi = from->exit_code_hi;
+ dst->exit_info_1 = from->exit_info_1;
+ dst->exit_info_2 = from->exit_info_2;
+ dst->exit_int_info = from->exit_int_info;
+ dst->exit_int_info_err = from->exit_int_info_err;
+ dst->nested_ctl = from->nested_ctl;
+ dst->event_inj = from->event_inj;
+ dst->event_inj_err = from->event_inj_err;
+ dst->next_rip = from->next_rip;
+ dst->nested_cr3 = from->nested_cr3;
+ dst->virt_ext = from->virt_ext;
+ dst->pause_filter_count = from->pause_filter_count;
+ dst->pause_filter_thresh = from->pause_filter_thresh;
+ /* 'clean' and 'hv_enlightenments' are not changed by KVM */
+}
+
+static int svm_get_nested_state(struct kvm_vcpu *vcpu,
+ struct kvm_nested_state __user *user_kvm_nested_state,
+ u32 user_data_size)
+{
+ struct vcpu_svm *svm;
+ struct vmcb_control_area *ctl;
+ unsigned long r;
+ struct kvm_nested_state kvm_state = {
+ .flags = 0,
+ .format = KVM_STATE_NESTED_FORMAT_SVM,
+ .size = sizeof(kvm_state),
+ };
+ struct vmcb __user *user_vmcb = (struct vmcb __user *)
+ &user_kvm_nested_state->data.svm[0];
+
+ if (!vcpu)
+ return kvm_state.size + KVM_STATE_NESTED_SVM_VMCB_SIZE;
+
+ svm = to_svm(vcpu);
+
+ if (user_data_size < kvm_state.size)
+ goto out;
+
+ /* First fill in the header and copy it out. */
+ if (is_guest_mode(vcpu)) {
+ kvm_state.hdr.svm.vmcb_pa = svm->nested.vmcb12_gpa;
+ kvm_state.size += KVM_STATE_NESTED_SVM_VMCB_SIZE;
+ kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
+
+ if (svm->nested.nested_run_pending)
+ kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
+ }
+
+ if (gif_set(svm))
+ kvm_state.flags |= KVM_STATE_NESTED_GIF_SET;
+
+ if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
+ return -EFAULT;
+
+ if (!is_guest_mode(vcpu))
+ goto out;
+
+ /*
+ * Copy over the full size of the VMCB rather than just the size
+ * of the structs.
+ */
+ if (clear_user(user_vmcb, KVM_STATE_NESTED_SVM_VMCB_SIZE))
+ return -EFAULT;
+
+ ctl = kzalloc(sizeof(*ctl), GFP_KERNEL);
+ if (!ctl)
+ return -ENOMEM;
+
+ nested_copy_vmcb_cache_to_control(ctl, &svm->nested.ctl);
+ r = copy_to_user(&user_vmcb->control, ctl,
+ sizeof(user_vmcb->control));
+ kfree(ctl);
+ if (r)
+ return -EFAULT;
+
+ if (copy_to_user(&user_vmcb->save, &svm->vmcb01.ptr->save,
+ sizeof(user_vmcb->save)))
+ return -EFAULT;
+out:
+ return kvm_state.size;
+}
+
+static int svm_set_nested_state(struct kvm_vcpu *vcpu,
+ struct kvm_nested_state __user *user_kvm_nested_state,
+ struct kvm_nested_state *kvm_state)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb __user *user_vmcb = (struct vmcb __user *)
+ &user_kvm_nested_state->data.svm[0];
+ struct vmcb_control_area *ctl;
+ struct vmcb_save_area *save;
+ struct vmcb_save_area_cached save_cached;
+ struct vmcb_ctrl_area_cached ctl_cached;
+ unsigned long cr0;
+ int ret;
+
+ BUILD_BUG_ON(sizeof(struct vmcb_control_area) + sizeof(struct vmcb_save_area) >
+ KVM_STATE_NESTED_SVM_VMCB_SIZE);
+
+ if (kvm_state->format != KVM_STATE_NESTED_FORMAT_SVM)
+ return -EINVAL;
+
+ if (kvm_state->flags & ~(KVM_STATE_NESTED_GUEST_MODE |
+ KVM_STATE_NESTED_RUN_PENDING |
+ KVM_STATE_NESTED_GIF_SET))
+ return -EINVAL;
+
+ /*
+ * If in guest mode, vcpu->arch.efer actually refers to the L2 guest's
+ * EFER.SVME, but EFER.SVME still has to be 1 for VMRUN to succeed.
+ */
+ if (!(vcpu->arch.efer & EFER_SVME)) {
+ /* GIF=1 and no guest mode are required if SVME=0. */
+ if (kvm_state->flags != KVM_STATE_NESTED_GIF_SET)
+ return -EINVAL;
+ }
+
+ /* SMM temporarily disables SVM, so we cannot be in guest mode. */
+ if (is_smm(vcpu) && (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
+ return -EINVAL;
+
+ if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) {
+ svm_leave_nested(vcpu);
+ svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
+ return 0;
+ }
+
+ if (!page_address_valid(vcpu, kvm_state->hdr.svm.vmcb_pa))
+ return -EINVAL;
+ if (kvm_state->size < sizeof(*kvm_state) + KVM_STATE_NESTED_SVM_VMCB_SIZE)
+ return -EINVAL;
+
+ ctl = memdup_user(&user_vmcb->control, sizeof(*ctl));
+ if (IS_ERR(ctl))
+ return PTR_ERR(ctl);
+
+ save = memdup_user(&user_vmcb->save, sizeof(*save));
+ if (IS_ERR(save)) {
+ kfree(ctl);
+ return PTR_ERR(save);
+ }
+
+ ret = -EINVAL;
+ __nested_copy_vmcb_control_to_cache(vcpu, &ctl_cached, ctl);
+ if (!__nested_vmcb_check_controls(vcpu, &ctl_cached))
+ goto out_free;
+
+ /*
+ * Processor state contains L2 state. Check that it is
+ * valid for guest mode (see nested_vmcb_check_save).
+ */
+ cr0 = kvm_read_cr0(vcpu);
+ if (((cr0 & X86_CR0_CD) == 0) && (cr0 & X86_CR0_NW))
+ goto out_free;
+
+ /*
+ * Validate host state saved from before VMRUN (see
+ * nested_svm_check_permissions).
+ */
+ __nested_copy_vmcb_save_to_cache(&save_cached, save);
+ if (!(save->cr0 & X86_CR0_PG) ||
+ !(save->cr0 & X86_CR0_PE) ||
+ (save->rflags & X86_EFLAGS_VM) ||
+ !__nested_vmcb_check_save(vcpu, &save_cached))
+ goto out_free;
+
+
+ /*
+ * All checks done, we can enter guest mode. Userspace provides
+ * vmcb12.control, which will be combined with L1 and stored into
+ * vmcb02, and the L1 save state which we store in vmcb01.
+	 * If needed, L2 registers are moved from the current VMCB to vmcb02.
+ */
+
+ if (is_guest_mode(vcpu))
+ svm_leave_nested(vcpu);
+ else
+ svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save;
+
+ svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
+
+ svm->nested.nested_run_pending =
+ !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
+
+ svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa;
+
+ svm_copy_vmrun_state(&svm->vmcb01.ptr->save, save);
+ nested_copy_vmcb_control_to_cache(svm, ctl);
+
+ svm_switch_vmcb(svm, &svm->nested.vmcb02);
+ nested_vmcb02_prepare_control(svm, svm->vmcb->save.rip, svm->vmcb->save.cs.base);
+
+ /*
+ * While the nested guest CR3 is already checked and set by
+	 * KVM_SET_SREGS, it was set before the nested state was loaded, so
+	 * the MMU might not be initialized correctly.
+ * Set it again to fix this.
+ */
+
+ ret = nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3,
+ nested_npt_enabled(svm), false);
+ if (WARN_ON_ONCE(ret))
+ goto out_free;
+
+ svm->nested.force_msr_bitmap_recalc = true;
+
+ kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
+ ret = 0;
+out_free:
+ kfree(save);
+ kfree(ctl);
+
+ return ret;
+}
+
+static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
+{
+ if (WARN_ON(!is_guest_mode(vcpu)))
+ return true;
+
+ if (!vcpu->arch.pdptrs_from_userspace &&
+ !nested_npt_enabled(to_svm(vcpu)) && is_pae_paging(vcpu))
+ /*
+ * Reload the guest's PDPTRs since after a migration
+ * the guest CR3 might be restored prior to setting the nested
+	 * state, which can lead to loading the wrong PDPTRs.
+ */
+ if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3)))
+ return false;
+
+ if (!nested_svm_merge_msrpm(vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror =
+ KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ return false;
+ }
+
+ if (kvm_hv_verify_vp_assist(vcpu))
+ return false;
+
+ return true;
+}
+
+struct kvm_x86_nested_ops svm_nested_ops = {
+ .leave_nested = svm_leave_nested,
+ .is_exception_vmexit = nested_svm_is_exception_vmexit,
+ .check_events = svm_check_nested_events,
+ .triple_fault = nested_svm_triple_fault,
+ .get_nested_state_pages = svm_get_nested_state_pages,
+ .get_state = svm_get_nested_state,
+ .set_state = svm_set_nested_state,
+ .hv_inject_synthetic_vmexit_post_tlb_flush = svm_hv_inject_synthetic_vmexit_post_tlb_flush,
+};
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
new file mode 100644
index 000000000000..bc062285fbf5
--- /dev/null
+++ b/arch/x86/kvm/svm/pmu.c
@@ -0,0 +1,242 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KVM PMU support for AMD
+ *
+ * Copyright 2015, Red Hat, Inc. and/or its affiliates.
+ *
+ * Author:
+ * Wei Huang <wei@redhat.com>
+ *
+ * Implementation is based on pmu_intel.c file
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/kvm_host.h>
+#include <linux/perf_event.h>
+#include "x86.h"
+#include "cpuid.h"
+#include "lapic.h"
+#include "pmu.h"
+#include "svm.h"
+
+enum pmu_type {
+ PMU_TYPE_COUNTER = 0,
+ PMU_TYPE_EVNTSEL,
+};
+
+static struct kvm_pmc *amd_pmu_get_pmc(struct kvm_pmu *pmu, int pmc_idx)
+{
+ unsigned int num_counters = pmu->nr_arch_gp_counters;
+
+ if (pmc_idx >= num_counters)
+ return NULL;
+
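+	/* Clamp the index under speculation to avoid a speculative out-of-bounds read. */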
+ return &pmu->gp_counters[array_index_nospec(pmc_idx, num_counters)];
+}
+
+static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr,
+ enum pmu_type type)
+{
+ struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
+ unsigned int idx;
+
+ if (!pmu->version)
+ return NULL;
+
+ switch (msr) {
+ case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_PERFCTR_CORE))
+ return NULL;
+ /*
+ * Each PMU counter has a pair of CTL and CTR MSRs. CTLn
+ * MSRs (accessed via EVNTSEL) are even, CTRn MSRs are odd.
+ */
+ idx = (unsigned int)((msr - MSR_F15H_PERF_CTL0) / 2);
+ if (!(msr & 0x1) != (type == PMU_TYPE_EVNTSEL))
+ return NULL;
+ break;
+ case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
+ if (type != PMU_TYPE_EVNTSEL)
+ return NULL;
+ idx = msr - MSR_K7_EVNTSEL0;
+ break;
+ case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
+ if (type != PMU_TYPE_COUNTER)
+ return NULL;
+ idx = msr - MSR_K7_PERFCTR0;
+ break;
+ default:
+ return NULL;
+ }
+
+ return amd_pmu_get_pmc(pmu, idx);
+}
+
+static int amd_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+ if (idx >= pmu->nr_arch_gp_counters)
+ return -EINVAL;
+
+ return 0;
+}
+
+/* idx is the ECX register of RDPMC instruction */
+static struct kvm_pmc *amd_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu,
+ unsigned int idx, u64 *mask)
+{
+ return amd_pmu_get_pmc(vcpu_to_pmu(vcpu), idx);
+}
+
+static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+
+ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
+ pmc = pmc ? pmc : get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
+
+ return pmc;
+}
+
+static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+ switch (msr) {
+ case MSR_K7_EVNTSEL0 ... MSR_K7_PERFCTR3:
+ return pmu->version > 0;
+ case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
+ return guest_cpu_cap_has(vcpu, X86_FEATURE_PERFCTR_CORE);
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
+ case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET:
+ return pmu->version > 1;
+ default:
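+		/*
+		 * Presumably covers the extra CTL/CTR MSR pairs that PerfMonV2
+		 * exposes beyond the six legacy MSR_F15H_PERF_* pairs.
+		 */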
+ if (msr > MSR_F15H_PERF_CTR5 &&
+ msr < MSR_F15H_PERF_CTL0 + 2 * pmu->nr_arch_gp_counters)
+ return pmu->version > 1;
+ break;
+ }
+
+ return amd_msr_idx_to_pmc(vcpu, msr);
+}
+
+static int amd_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+ u32 msr = msr_info->index;
+
+ /* MSR_PERFCTRn */
+ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
+ if (pmc) {
+ msr_info->data = pmc_read_counter(pmc);
+ return 0;
+ }
+ /* MSR_EVNTSELn */
+ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
+ if (pmc) {
+ msr_info->data = pmc->eventsel;
+ return 0;
+ }
+
+ return 1;
+}
+
+static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+ u32 msr = msr_info->index;
+ u64 data = msr_info->data;
+
+ /* MSR_PERFCTRn */
+ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
+ if (pmc) {
+ pmc_write_counter(pmc, data);
+ return 0;
+ }
+ /* MSR_EVNTSELn */
+ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
+ if (pmc) {
+ data &= ~pmu->reserved_bits;
+ if (data != pmc->eventsel) {
+ pmc->eventsel = data;
+ kvm_pmu_request_counter_reprogram(pmc);
+ }
+ return 0;
+ }
+
+ return 1;
+}
+
+static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ union cpuid_0x80000022_ebx ebx;
+
+ pmu->version = 1;
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_PERFMON_V2)) {
+ pmu->version = 2;
+ /*
+ * Note, PERFMON_V2 is also in 0x80000022.0x0, i.e. the guest
+ * CPUID entry is guaranteed to be non-NULL.
+ */
+ BUILD_BUG_ON(x86_feature_cpuid(X86_FEATURE_PERFMON_V2).function != 0x80000022 ||
+ x86_feature_cpuid(X86_FEATURE_PERFMON_V2).index);
+ ebx.full = kvm_find_cpuid_entry_index(vcpu, 0x80000022, 0)->ebx;
+ pmu->nr_arch_gp_counters = ebx.split.num_core_pmc;
+ } else if (guest_cpu_cap_has(vcpu, X86_FEATURE_PERFCTR_CORE)) {
+ pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS_CORE;
+ } else {
+ pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS;
+ }
+
+ pmu->nr_arch_gp_counters = min_t(unsigned int, pmu->nr_arch_gp_counters,
+ kvm_pmu_cap.num_counters_gp);
+
+ if (pmu->version > 1) {
+ pmu->global_ctrl_rsvd = ~(BIT_ULL(pmu->nr_arch_gp_counters) - 1);
+ pmu->global_status_rsvd = pmu->global_ctrl_rsvd;
+ }
+
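+	/* AMD general-purpose performance counters are 48 bits wide. */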
+ pmu->counter_bitmask[KVM_PMC_GP] = BIT_ULL(48) - 1;
+ pmu->reserved_bits = 0xfffffff000280000ull;
+ pmu->raw_event_mask = AMD64_RAW_EVENT_MASK;
+	/* Not applicable to AMD; clear them to prevent any fallout. */
+ pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
+ pmu->nr_arch_fixed_counters = 0;
+}
+
+static void amd_pmu_init(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ int i;
+
+ BUILD_BUG_ON(KVM_MAX_NR_AMD_GP_COUNTERS > AMD64_NUM_COUNTERS_CORE);
+
+ for (i = 0; i < KVM_MAX_NR_AMD_GP_COUNTERS; i++) {
+ pmu->gp_counters[i].type = KVM_PMC_GP;
+ pmu->gp_counters[i].vcpu = vcpu;
+ pmu->gp_counters[i].idx = i;
+ pmu->gp_counters[i].current_config = 0;
+ }
+}
+
+struct kvm_pmu_ops amd_pmu_ops __initdata = {
+ .rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc,
+ .msr_idx_to_pmc = amd_msr_idx_to_pmc,
+ .check_rdpmc_early = amd_check_rdpmc_early,
+ .is_valid_msr = amd_is_valid_msr,
+ .get_msr = amd_pmu_get_msr,
+ .set_msr = amd_pmu_set_msr,
+ .refresh = amd_pmu_refresh,
+ .init = amd_pmu_init,
+ .EVENTSEL_EVENT = AMD64_EVENTSEL_EVENT,
+ .MAX_NR_GP_COUNTERS = KVM_MAX_NR_AMD_GP_COUNTERS,
+ .MIN_NR_GP_COUNTERS = AMD64_NUM_COUNTERS,
+};
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
new file mode 100644
index 000000000000..f59c65abe3cf
--- /dev/null
+++ b/arch/x86/kvm/svm/sev.c
@@ -0,0 +1,5169 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * AMD SVM-SEV support
+ *
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_types.h>
+#include <linux/kvm_host.h>
+#include <linux/kernel.h>
+#include <linux/highmem.h>
+#include <linux/psp.h>
+#include <linux/psp-sev.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/misc_cgroup.h>
+#include <linux/processor.h>
+#include <linux/trace_events.h>
+#include <uapi/linux/sev-guest.h>
+
+#include <asm/pkru.h>
+#include <asm/trapnr.h>
+#include <asm/fpu/xcr.h>
+#include <asm/fpu/xstate.h>
+#include <asm/debugreg.h>
+#include <asm/msr.h>
+#include <asm/sev.h>
+
+#include "mmu.h"
+#include "x86.h"
+#include "svm.h"
+#include "svm_ops.h"
+#include "cpuid.h"
+#include "trace.h"
+
+#define GHCB_VERSION_MAX 2ULL
+#define GHCB_VERSION_MIN 1ULL
+
+#define GHCB_HV_FT_SUPPORTED (GHCB_HV_FT_SNP | GHCB_HV_FT_SNP_AP_CREATION)
+
+/* enable/disable SEV support */
+static bool sev_enabled = true;
+module_param_named(sev, sev_enabled, bool, 0444);
+
+/* enable/disable SEV-ES support */
+static bool sev_es_enabled = true;
+module_param_named(sev_es, sev_es_enabled, bool, 0444);
+
+/* enable/disable SEV-SNP support */
+static bool sev_snp_enabled = true;
+module_param_named(sev_snp, sev_snp_enabled, bool, 0444);
+
+/* enable/disable SEV-ES DebugSwap support */
+static bool sev_es_debug_swap_enabled = true;
+module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444);
+static u64 sev_supported_vmsa_features;
+
+static unsigned int nr_ciphertext_hiding_asids;
+module_param_named(ciphertext_hiding_asids, nr_ciphertext_hiding_asids, uint, 0444);
+
+#define AP_RESET_HOLD_NONE 0
+#define AP_RESET_HOLD_NAE_EVENT 1
+#define AP_RESET_HOLD_MSR_PROTO 2
+
+/*
+ * SEV-SNP policy bits that can be supported by KVM. These include policy bits
+ * that have implementation support within KVM or policy bits that do not
+ * require implementation support within KVM to enforce the policy.
+ */
+#define KVM_SNP_POLICY_MASK_VALID (SNP_POLICY_MASK_API_MINOR | \
+ SNP_POLICY_MASK_API_MAJOR | \
+ SNP_POLICY_MASK_SMT | \
+ SNP_POLICY_MASK_RSVD_MBO | \
+ SNP_POLICY_MASK_DEBUG | \
+ SNP_POLICY_MASK_SINGLE_SOCKET | \
+ SNP_POLICY_MASK_CXL_ALLOW | \
+ SNP_POLICY_MASK_MEM_AES_256_XTS | \
+ SNP_POLICY_MASK_RAPL_DIS | \
+ SNP_POLICY_MASK_CIPHERTEXT_HIDING_DRAM | \
+ SNP_POLICY_MASK_PAGE_SWAP_DISABLE)
+
+static u64 snp_supported_policy_bits __ro_after_init;
+
+#define INITIAL_VMSA_GPA 0xFFFFFFFFF000
+
+static u8 sev_enc_bit;
+static DECLARE_RWSEM(sev_deactivate_lock);
+static DEFINE_MUTEX(sev_bitmap_lock);
+unsigned int max_sev_asid;
+static unsigned int min_sev_asid;
+static unsigned int max_sev_es_asid;
+static unsigned int min_sev_es_asid;
+static unsigned int max_snp_asid;
+static unsigned int min_snp_asid;
+static unsigned long sev_me_mask;
+static unsigned int nr_asids;
+static unsigned long *sev_asid_bitmap;
+static unsigned long *sev_reclaim_asid_bitmap;
+
+static int snp_decommission_context(struct kvm *kvm);
+
+struct enc_region {
+ struct list_head list;
+ unsigned long npages;
+ struct page **pages;
+ unsigned long uaddr;
+ unsigned long size;
+};
+
+/* Called with the sev_bitmap_lock held, or on shutdown */
+static int sev_flush_asids(unsigned int min_asid, unsigned int max_asid)
+{
+ int ret, error = 0;
+ unsigned int asid;
+
+ /* Check if there are any ASIDs to reclaim before performing a flush */
+ asid = find_next_bit(sev_reclaim_asid_bitmap, nr_asids, min_asid);
+ if (asid > max_asid)
+ return -EBUSY;
+
+ /*
+ * DEACTIVATE will clear the WBINVD indicator causing DF_FLUSH to fail,
+ * so it must be guarded.
+ */
+ down_write(&sev_deactivate_lock);
+
+ /* SNP firmware requires use of WBINVD for ASID recycling. */
+ wbinvd_on_all_cpus();
+
+ if (sev_snp_enabled)
+ ret = sev_do_cmd(SEV_CMD_SNP_DF_FLUSH, NULL, &error);
+ else
+ ret = sev_guest_df_flush(&error);
+
+ up_write(&sev_deactivate_lock);
+
+ if (ret)
+ pr_err("SEV%s: DF_FLUSH failed, ret=%d, error=%#x\n",
+ sev_snp_enabled ? "-SNP" : "", ret, error);
+
+ return ret;
+}
+
+static inline bool is_mirroring_enc_context(struct kvm *kvm)
+{
+ return !!to_kvm_sev_info(kvm)->enc_context_owner;
+}
+
+static bool sev_vcpu_has_debug_swap(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm);
+
+ return sev->vmsa_features & SVM_SEV_FEAT_DEBUG_SWAP;
+}
+
+static bool snp_is_secure_tsc_enabled(struct kvm *kvm)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+
+ return (sev->vmsa_features & SVM_SEV_FEAT_SECURE_TSC) &&
+ !WARN_ON_ONCE(!sev_snp_guest(kvm));
+}
+
+/* Must be called with the sev_bitmap_lock held */
+static bool __sev_recycle_asids(unsigned int min_asid, unsigned int max_asid)
+{
+ if (sev_flush_asids(min_asid, max_asid))
+ return false;
+
+ /* The flush process will flush all reclaimable SEV and SEV-ES ASIDs */
+ bitmap_xor(sev_asid_bitmap, sev_asid_bitmap, sev_reclaim_asid_bitmap,
+ nr_asids);
+ bitmap_zero(sev_reclaim_asid_bitmap, nr_asids);
+
+ return true;
+}
+
+static int sev_misc_cg_try_charge(struct kvm_sev_info *sev)
+{
+ enum misc_res_type type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
+ return misc_cg_try_charge(type, sev->misc_cg, 1);
+}
+
+static void sev_misc_cg_uncharge(struct kvm_sev_info *sev)
+{
+ enum misc_res_type type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
+ misc_cg_uncharge(type, sev->misc_cg, 1);
+}
+
+static int sev_asid_new(struct kvm_sev_info *sev, unsigned long vm_type)
+{
+ /*
+	 * SEV-enabled guests must use ASIDs from min_sev_asid to max_sev_asid.
+	 * SEV-ES-enabled guests can use ASIDs from 1 to min_sev_asid - 1.
+ */
+ unsigned int min_asid, max_asid, asid;
+ bool retry = true;
+ int ret;
+
+ if (vm_type == KVM_X86_SNP_VM) {
+ min_asid = min_snp_asid;
+ max_asid = max_snp_asid;
+ } else if (sev->es_active) {
+ min_asid = min_sev_es_asid;
+ max_asid = max_sev_es_asid;
+ } else {
+ min_asid = min_sev_asid;
+ max_asid = max_sev_asid;
+ }
+
+ /*
+ * The min ASID can end up larger than the max if basic SEV support is
+ * effectively disabled by disallowing use of ASIDs for SEV guests.
+ * Similarly for SEV-ES guests the min ASID can end up larger than the
+ * max when ciphertext hiding is enabled, effectively disabling SEV-ES
+ * support.
+ */
+ if (min_asid > max_asid)
+ return -ENOTTY;
+
+ WARN_ON(sev->misc_cg);
+ sev->misc_cg = get_current_misc_cg();
+ ret = sev_misc_cg_try_charge(sev);
+ if (ret) {
+ put_misc_cg(sev->misc_cg);
+ sev->misc_cg = NULL;
+ return ret;
+ }
+
+ mutex_lock(&sev_bitmap_lock);
+
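+	/*
+	 * Search for a free ASID; if none is found, try once to recycle ASIDs
+	 * that are pending reclaim and retry the search.
+	 */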
+again:
+ asid = find_next_zero_bit(sev_asid_bitmap, max_asid + 1, min_asid);
+ if (asid > max_asid) {
+ if (retry && __sev_recycle_asids(min_asid, max_asid)) {
+ retry = false;
+ goto again;
+ }
+ mutex_unlock(&sev_bitmap_lock);
+ ret = -EBUSY;
+ goto e_uncharge;
+ }
+
+ __set_bit(asid, sev_asid_bitmap);
+
+ mutex_unlock(&sev_bitmap_lock);
+
+ sev->asid = asid;
+ return 0;
+e_uncharge:
+ sev_misc_cg_uncharge(sev);
+ put_misc_cg(sev->misc_cg);
+ sev->misc_cg = NULL;
+ return ret;
+}
+
+static unsigned int sev_get_asid(struct kvm *kvm)
+{
+ return to_kvm_sev_info(kvm)->asid;
+}
+
+static void sev_asid_free(struct kvm_sev_info *sev)
+{
+ struct svm_cpu_data *sd;
+ int cpu;
+
+ mutex_lock(&sev_bitmap_lock);
+
+ __set_bit(sev->asid, sev_reclaim_asid_bitmap);
+
+ for_each_possible_cpu(cpu) {
+ sd = per_cpu_ptr(&svm_data, cpu);
+ sd->sev_vmcbs[sev->asid] = NULL;
+ }
+
+ mutex_unlock(&sev_bitmap_lock);
+
+ sev_misc_cg_uncharge(sev);
+ put_misc_cg(sev->misc_cg);
+ sev->misc_cg = NULL;
+}
+
+static void sev_decommission(unsigned int handle)
+{
+ struct sev_data_decommission decommission;
+
+ if (!handle)
+ return;
+
+ decommission.handle = handle;
+ sev_guest_decommission(&decommission, NULL);
+}
+
+/*
+ * Transition a page to hypervisor-owned/shared state in the RMP table. This
+ * should not fail under normal conditions, but if it does, leak the page,
+ * since it will no longer be usable by the host due to RMP protections.
+ */
+static int kvm_rmp_make_shared(struct kvm *kvm, u64 pfn, enum pg_level level)
+{
+ if (KVM_BUG_ON(rmp_make_shared(pfn, level), kvm)) {
+ snp_leak_pages(pfn, page_level_size(level) >> PAGE_SHIFT);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+/*
+ * Certain page-states, such as Pre-Guest and Firmware pages (as documented
+ * in Chapter 5 of the SEV-SNP Firmware ABI under "Page States") cannot be
+ * directly transitioned back to normal/hypervisor-owned state via RMPUPDATE
+ * unless they are reclaimed first.
+ *
+ * Until they are reclaimed and subsequently transitioned via RMPUPDATE, they
+ * might not be usable by the host due to being set as immutable or still
+ * being associated with a guest ASID.
+ *
+ * Bug the VM and leak the page if reclaim fails, or if the RMP entry can't be
+ * converted back to shared, as the page is no longer usable due to RMP
+ * protections, and it's infeasible for the guest to continue on.
+ */
+static int snp_page_reclaim(struct kvm *kvm, u64 pfn)
+{
+ struct sev_data_snp_page_reclaim data = {0};
+ int fw_err, rc;
+
+ data.paddr = __sme_set(pfn << PAGE_SHIFT);
+ rc = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, &data, &fw_err);
+ if (KVM_BUG(rc, kvm, "Failed to reclaim PFN %llx, rc %d fw_err %d", pfn, rc, fw_err)) {
+ snp_leak_pages(pfn, 1);
+ return -EIO;
+ }
+
+ if (kvm_rmp_make_shared(kvm, pfn, PG_LEVEL_4K))
+ return -EIO;
+
+ return rc;
+}
+
+static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
+{
+ struct sev_data_deactivate deactivate;
+
+ if (!handle)
+ return;
+
+ deactivate.handle = handle;
+
+ /* Guard DEACTIVATE against WBINVD/DF_FLUSH used in ASID recycling */
+ down_read(&sev_deactivate_lock);
+ sev_guest_deactivate(&deactivate, NULL);
+ up_read(&sev_deactivate_lock);
+
+ sev_decommission(handle);
+}
+
+/*
+ * This sets up bounce buffers/firmware pages to handle SNP Guest Request
+ * messages (e.g. attestation requests). See "SNP Guest Request" in the GHCB
+ * 2.0 specification for more details.
+ *
+ * Technically, when an SNP Guest Request is issued, the guest will provide its
+ * own request/response pages, which could in theory be passed along directly
+ * to firmware rather than using bounce pages. However, these pages would need
+ * special care:
+ *
+ * - Both pages are from shared guest memory, so they need to be protected
+ * from migration/etc. occurring while firmware reads/writes to them. At a
+ * minimum, this requires elevating the ref counts and potentially needing
+ * an explicit pinning of the memory. This places additional restrictions
+ * on what type of memory backends userspace can use for shared guest
+ * memory since there is some reliance on using refcounted pages.
+ *
+ * - The response page needs to be switched to Firmware-owned[1] state
+ * before the firmware can write to it, which can lead to potential
+ * host RMP #PFs if the guest is misbehaved and hands the host a
+ * guest page that KVM might write to for other reasons (e.g. virtio
+ * buffers/etc.).
+ *
+ * Both of these issues can be avoided completely by using separately-allocated
+ * bounce pages for both the request/response pages and passing those to
+ * firmware instead. So that's what is being set up here.
+ *
+ * Guest requests rely on message sequence numbers to ensure requests are
+ * issued to firmware in the order the guest issues them, so concurrent guest
+ * requests generally shouldn't happen. But a misbehaved guest could issue
+ * concurrent guest requests in theory, so a mutex is used to serialize
+ * access to the bounce buffers.
+ *
+ * [1] See the "Page States" section of the SEV-SNP Firmware ABI for more
+ * details on Firmware-owned pages, along with "RMP and VMPL Access Checks"
+ * in the APM for details on the related RMP restrictions.
+ */
+static int snp_guest_req_init(struct kvm *kvm)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ struct page *req_page;
+
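+	/*
+	 * Only the response buffer needs to be a firmware page (it must be
+	 * transitioned to Firmware-owned state before the PSP writes to it);
+	 * the request bounce buffer can stay a normal hypervisor-owned page.
+	 */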
+ req_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!req_page)
+ return -ENOMEM;
+
+ sev->guest_resp_buf = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!sev->guest_resp_buf) {
+ __free_page(req_page);
+ return -EIO;
+ }
+
+ sev->guest_req_buf = page_address(req_page);
+ mutex_init(&sev->guest_req_mutex);
+
+ return 0;
+}
+
+static void snp_guest_req_cleanup(struct kvm *kvm)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+
+ if (sev->guest_resp_buf)
+ snp_free_firmware_page(sev->guest_resp_buf);
+
+ if (sev->guest_req_buf)
+ __free_page(virt_to_page(sev->guest_req_buf));
+
+ sev->guest_req_buf = NULL;
+ sev->guest_resp_buf = NULL;
+}
+
+static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp,
+ struct kvm_sev_init *data,
+ unsigned long vm_type)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ struct sev_platform_init_args init_args = {0};
+ bool es_active = vm_type != KVM_X86_SEV_VM;
+ bool snp_active = vm_type == KVM_X86_SNP_VM;
+ u64 valid_vmsa_features = es_active ? sev_supported_vmsa_features : 0;
+ int ret;
+
+ if (kvm->created_vcpus)
+ return -EINVAL;
+
+ if (data->flags)
+ return -EINVAL;
+
+ if (!snp_active)
+ valid_vmsa_features &= ~SVM_SEV_FEAT_SECURE_TSC;
+
+ if (data->vmsa_features & ~valid_vmsa_features)
+ return -EINVAL;
+
+ if (data->ghcb_version > GHCB_VERSION_MAX || (!es_active && data->ghcb_version))
+ return -EINVAL;
+
+ /*
+ * KVM supports the full range of mandatory features defined by version
+ * 2 of the GHCB protocol, so default to that for SEV-ES guests created
+ * via KVM_SEV_INIT2 (KVM_SEV_INIT forces version 1).
+ */
+ if (es_active && !data->ghcb_version)
+ data->ghcb_version = 2;
+
+ if (snp_active && data->ghcb_version < 2)
+ return -EINVAL;
+
+ if (unlikely(sev->active))
+ return -EINVAL;
+
+ sev->active = true;
+ sev->es_active = es_active;
+ sev->vmsa_features = data->vmsa_features;
+ sev->ghcb_version = data->ghcb_version;
+
+ if (snp_active)
+ sev->vmsa_features |= SVM_SEV_FEAT_SNP_ACTIVE;
+
+ ret = sev_asid_new(sev, vm_type);
+ if (ret)
+ goto e_no_asid;
+
+ init_args.probe = false;
+ ret = sev_platform_init(&init_args);
+ if (ret)
+ goto e_free_asid;
+
+ if (!zalloc_cpumask_var(&sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
+ ret = -ENOMEM;
+ goto e_free_asid;
+ }
+
+ /* This needs to happen after SEV/SNP firmware initialization. */
+ if (snp_active) {
+ ret = snp_guest_req_init(kvm);
+ if (ret)
+ goto e_free;
+ }
+
+ INIT_LIST_HEAD(&sev->regions_list);
+ INIT_LIST_HEAD(&sev->mirror_vms);
+ sev->need_init = false;
+
+ kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_SEV);
+
+ return 0;
+
+e_free:
+ free_cpumask_var(sev->have_run_cpus);
+e_free_asid:
+ argp->error = init_args.error;
+ sev_asid_free(sev);
+ sev->asid = 0;
+e_no_asid:
+ sev->vmsa_features = 0;
+ sev->es_active = false;
+ sev->active = false;
+ return ret;
+}
+
+static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_init data = {
+ .vmsa_features = 0,
+ .ghcb_version = 0,
+ };
+ unsigned long vm_type;
+
+ if (kvm->arch.vm_type != KVM_X86_DEFAULT_VM)
+ return -EINVAL;
+
+ vm_type = (argp->id == KVM_SEV_INIT ? KVM_X86_SEV_VM : KVM_X86_SEV_ES_VM);
+
+ /*
+ * KVM_SEV_ES_INIT has been deprecated by KVM_SEV_INIT2, so it will
+ * continue to only ever support the minimal GHCB protocol version.
+ */
+ if (vm_type == KVM_X86_SEV_ES_VM)
+ data.ghcb_version = GHCB_VERSION_MIN;
+
+ return __sev_guest_init(kvm, argp, &data, vm_type);
+}
+
+static int sev_guest_init2(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_init data;
+
+ if (!to_kvm_sev_info(kvm)->need_init)
+ return -EINVAL;
+
+ if (kvm->arch.vm_type != KVM_X86_SEV_VM &&
+ kvm->arch.vm_type != KVM_X86_SEV_ES_VM &&
+ kvm->arch.vm_type != KVM_X86_SNP_VM)
+ return -EINVAL;
+
+ if (copy_from_user(&data, u64_to_user_ptr(argp->data), sizeof(data)))
+ return -EFAULT;
+
+ return __sev_guest_init(kvm, argp, &data, kvm->arch.vm_type);
+}
+
+static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
+{
+ unsigned int asid = sev_get_asid(kvm);
+ struct sev_data_activate activate;
+ int ret;
+
+ /* activate ASID on the given handle */
+ activate.handle = handle;
+ activate.asid = asid;
+ ret = sev_guest_activate(&activate, error);
+
+ return ret;
+}
+
+static int __sev_issue_cmd(int fd, int id, void *data, int *error)
+{
+ CLASS(fd, f)(fd);
+
+ if (fd_empty(f))
+ return -EBADF;
+
+ return sev_issue_cmd_external_user(fd_file(f), id, data, error);
+}
+
+static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+
+ return __sev_issue_cmd(sev->fd, id, data, error);
+}
+
+static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ struct sev_data_launch_start start;
+ struct kvm_sev_launch_start params;
+ void *dh_blob, *session_blob;
+ int *error = &argp->error;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
+ return -EFAULT;
+
+ memset(&start, 0, sizeof(start));
+
+ dh_blob = NULL;
+ if (params.dh_uaddr) {
+ dh_blob = psp_copy_user_blob(params.dh_uaddr, params.dh_len);
+ if (IS_ERR(dh_blob))
+ return PTR_ERR(dh_blob);
+
+ start.dh_cert_address = __sme_set(__pa(dh_blob));
+ start.dh_cert_len = params.dh_len;
+ }
+
+ session_blob = NULL;
+ if (params.session_uaddr) {
+ session_blob = psp_copy_user_blob(params.session_uaddr, params.session_len);
+ if (IS_ERR(session_blob)) {
+ ret = PTR_ERR(session_blob);
+ goto e_free_dh;
+ }
+
+ start.session_address = __sme_set(__pa(session_blob));
+ start.session_len = params.session_len;
+ }
+
+ start.handle = params.handle;
+ start.policy = params.policy;
+
+ /* create memory encryption context */
+ ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, &start, error);
+ if (ret)
+ goto e_free_session;
+
+ /* Bind ASID to this guest */
+ ret = sev_bind_asid(kvm, start.handle, error);
+ if (ret) {
+ sev_decommission(start.handle);
+ goto e_free_session;
+ }
+
+ /* return handle to userspace */
+ params.handle = start.handle;
+ if (copy_to_user(u64_to_user_ptr(argp->data), &params, sizeof(params))) {
+ sev_unbind_asid(kvm, start.handle);
+ ret = -EFAULT;
+ goto e_free_session;
+ }
+
+ sev->policy = params.policy;
+ sev->handle = start.handle;
+ sev->fd = argp->sev_fd;
+
+e_free_session:
+ kfree(session_blob);
+e_free_dh:
+ kfree(dh_blob);
+ return ret;
+}
+
+static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
+ unsigned long ulen, unsigned long *n,
+ unsigned int flags)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ unsigned long npages, size;
+ int npinned;
+ unsigned long locked, lock_limit;
+ struct page **pages;
+ unsigned long first, last;
+ int ret;
+
+ lockdep_assert_held(&kvm->lock);
+
+ if (ulen == 0 || uaddr + ulen < uaddr)
+ return ERR_PTR(-EINVAL);
+
+ /* Calculate number of pages. */
+ first = (uaddr & PAGE_MASK) >> PAGE_SHIFT;
+ last = ((uaddr + ulen - 1) & PAGE_MASK) >> PAGE_SHIFT;
+ npages = (last - first + 1);
+
+ locked = sev->pages_locked + npages;
+ lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+ pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n", locked, lock_limit);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ if (WARN_ON_ONCE(npages > INT_MAX))
+ return ERR_PTR(-EINVAL);
+
+ /* Avoid using vmalloc for smaller buffers. */
+ size = npages * sizeof(struct page *);
+ if (size > PAGE_SIZE)
+ pages = __vmalloc(size, GFP_KERNEL_ACCOUNT);
+ else
+ pages = kmalloc(size, GFP_KERNEL_ACCOUNT);
+
+ if (!pages)
+ return ERR_PTR(-ENOMEM);
+
+ /* Pin the user virtual address. */
+ npinned = pin_user_pages_fast(uaddr, npages, flags, pages);
+ if (npinned != npages) {
+ pr_err("SEV: Failure locking %lu pages.\n", npages);
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ *n = npages;
+ sev->pages_locked = locked;
+
+ return pages;
+
+err:
+ if (npinned > 0)
+ unpin_user_pages(pages, npinned);
+
+ kvfree(pages);
+ return ERR_PTR(ret);
+}
+
+static void sev_unpin_memory(struct kvm *kvm, struct page **pages,
+ unsigned long npages)
+{
+ unpin_user_pages(pages, npages);
+ kvfree(pages);
+ to_kvm_sev_info(kvm)->pages_locked -= npages;
+}
+
+static void sev_clflush_pages(struct page *pages[], unsigned long npages)
+{
+ uint8_t *page_virtual;
+ unsigned long i;
+
+ if (this_cpu_has(X86_FEATURE_SME_COHERENT) || npages == 0 ||
+ pages == NULL)
+ return;
+
+ for (i = 0; i < npages; i++) {
+ page_virtual = kmap_local_page(pages[i]);
+ clflush_cache_range(page_virtual, PAGE_SIZE);
+ kunmap_local(page_virtual);
+ cond_resched();
+ }
+}
+
+static void sev_writeback_caches(struct kvm *kvm)
+{
+ /*
+ * Ensure that all dirty guest tagged cache entries are written back
+ * before releasing the pages back to the system for use. CLFLUSH will
+ * not do this without SME_COHERENT, and flushing many cache lines
+ * individually is slower than blasting WBINVD for large VMs, so issue
+ * WBNOINVD (or WBINVD if the "no invalidate" variant is unsupported)
+ * on CPUs that have done VMRUN, i.e. may have dirtied data using the
+ * VM's ASID.
+ *
+ * For simplicity, never remove CPUs from the bitmap. Ideally, KVM
+ * would clear the mask when flushing caches, but doing so requires
+	 * serializing multiple calls and having the CPUs that respond to the
+	 * IPI mark themselves as still running if they are running (or about to
+ * run) a vCPU for the VM.
+ *
+ * Note, the caller is responsible for ensuring correctness if the mask
+ * can be modified, e.g. if a CPU could be doing VMRUN.
+ */
+ wbnoinvd_on_cpus_mask(to_kvm_sev_info(kvm)->have_run_cpus);
+}
+
+static unsigned long get_num_contig_pages(unsigned long idx,
+ struct page **inpages, unsigned long npages)
+{
+ unsigned long paddr, next_paddr;
+ unsigned long i = idx + 1, pages = 1;
+
+ /* find the number of contiguous pages starting from idx */
+ paddr = __sme_page_pa(inpages[idx]);
+ while (i < npages) {
+ next_paddr = __sme_page_pa(inpages[i++]);
+ if ((paddr + PAGE_SIZE) == next_paddr) {
+ pages++;
+ paddr = next_paddr;
+ continue;
+ }
+ break;
+ }
+
+ return pages;
+}
+
+static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i;
+ struct kvm_sev_launch_update_data params;
+ struct sev_data_launch_update_data data;
+ struct page **inpages;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
+ return -EFAULT;
+
+ vaddr = params.uaddr;
+ size = params.len;
+ vaddr_end = vaddr + size;
+
+ /* Lock the user memory. */
+ inpages = sev_pin_memory(kvm, vaddr, size, &npages, FOLL_WRITE);
+ if (IS_ERR(inpages))
+ return PTR_ERR(inpages);
+
+ /*
+ * Flush (on non-coherent CPUs) before LAUNCH_UPDATE encrypts pages in
+ * place; the cache may contain the data that was written unencrypted.
+ */
+ sev_clflush_pages(inpages, npages);
+
+ data.reserved = 0;
+ data.handle = to_kvm_sev_info(kvm)->handle;
+
+ for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) {
+ int offset, len;
+
+ /*
+ * If the user buffer is not page-aligned, calculate the offset
+ * within the page.
+ */
+ offset = vaddr & (PAGE_SIZE - 1);
+
+ /* Calculate the number of pages that can be encrypted in one go. */
+ pages = get_num_contig_pages(i, inpages, npages);
+
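+		/* Clamp to the remaining length so the final chunk doesn't run past the buffer. */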
+ len = min_t(size_t, ((pages * PAGE_SIZE) - offset), size);
+
+ data.len = len;
+ data.address = __sme_page_pa(inpages[i]) + offset;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, &data, &argp->error);
+ if (ret)
+ goto e_unpin;
+
+ size -= len;
+ next_vaddr = vaddr + len;
+ }
+
+e_unpin:
+ /* content of memory is updated, mark pages dirty */
+ for (i = 0; i < npages; i++) {
+ set_page_dirty_lock(inpages[i]);
+ mark_page_accessed(inpages[i]);
+ }
+ /* unlock the user pages */
+ sev_unpin_memory(kvm, inpages, npages);
+ return ret;
+}
+
+static int sev_es_sync_vmsa(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm);
+ struct sev_es_save_area *save = svm->sev_es.vmsa;
+ struct xregs_state *xsave;
+ const u8 *s;
+ u8 *d;
+ int i;
+
+ /* Check some debug related fields before encrypting the VMSA */
+ if (svm->vcpu.guest_debug || (svm->vmcb->save.dr7 & ~DR7_FIXED_1))
+ return -EINVAL;
+
+ /*
+ * SEV-ES will use a VMSA that is pointed to by the VMCB, not
+ * the traditional VMSA that is part of the VMCB. Copy the
+ * traditional VMSA as it has been built so far (in prep
+ * for LAUNCH_UPDATE_VMSA) to be the initial SEV-ES state.
+ */
+ memcpy(save, &svm->vmcb->save, sizeof(svm->vmcb->save));
+
+	/* Sync registers */
+ save->rax = svm->vcpu.arch.regs[VCPU_REGS_RAX];
+ save->rbx = svm->vcpu.arch.regs[VCPU_REGS_RBX];
+ save->rcx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
+ save->rdx = svm->vcpu.arch.regs[VCPU_REGS_RDX];
+ save->rsp = svm->vcpu.arch.regs[VCPU_REGS_RSP];
+ save->rbp = svm->vcpu.arch.regs[VCPU_REGS_RBP];
+ save->rsi = svm->vcpu.arch.regs[VCPU_REGS_RSI];
+ save->rdi = svm->vcpu.arch.regs[VCPU_REGS_RDI];
+#ifdef CONFIG_X86_64
+ save->r8 = svm->vcpu.arch.regs[VCPU_REGS_R8];
+ save->r9 = svm->vcpu.arch.regs[VCPU_REGS_R9];
+ save->r10 = svm->vcpu.arch.regs[VCPU_REGS_R10];
+ save->r11 = svm->vcpu.arch.regs[VCPU_REGS_R11];
+ save->r12 = svm->vcpu.arch.regs[VCPU_REGS_R12];
+ save->r13 = svm->vcpu.arch.regs[VCPU_REGS_R13];
+ save->r14 = svm->vcpu.arch.regs[VCPU_REGS_R14];
+ save->r15 = svm->vcpu.arch.regs[VCPU_REGS_R15];
+#endif
+ save->rip = svm->vcpu.arch.regs[VCPU_REGS_RIP];
+
+ /* Sync some non-GPR registers before encrypting */
+ save->xcr0 = svm->vcpu.arch.xcr0;
+ save->pkru = svm->vcpu.arch.pkru;
+ save->xss = svm->vcpu.arch.ia32_xss;
+ save->dr6 = svm->vcpu.arch.dr6;
+
+ save->sev_features = sev->vmsa_features;
+
+ /*
+ * Skip FPU and AVX setup with KVM_SEV_ES_INIT to avoid
+ * breaking older measurements.
+ */
+ if (vcpu->kvm->arch.vm_type != KVM_X86_DEFAULT_VM) {
+ xsave = &vcpu->arch.guest_fpu.fpstate->regs.xsave;
+ save->x87_dp = xsave->i387.rdp;
+ save->mxcsr = xsave->i387.mxcsr;
+ save->x87_ftw = xsave->i387.twd;
+ save->x87_fsw = xsave->i387.swd;
+ save->x87_fcw = xsave->i387.cwd;
+ save->x87_fop = xsave->i387.fop;
+ save->x87_ds = 0;
+ save->x87_cs = 0;
+ save->x87_rip = xsave->i387.rip;
+
+ for (i = 0; i < 8; i++) {
+ /*
+ * The format of the x87 save area is undocumented and
+			 * definitely not what you would expect. It consists of an
+			 * 8*8-byte area holding bytes 0-7 of each register, followed
+			 * by an 8*2-byte area holding bytes 8-9 of each register.
+ */
+ d = save->fpreg_x87 + i * 8;
+ s = ((u8 *)xsave->i387.st_space) + i * 16;
+ memcpy(d, s, 8);
+ save->fpreg_x87[64 + i * 2] = s[8];
+ save->fpreg_x87[64 + i * 2 + 1] = s[9];
+ }
+ memcpy(save->fpreg_xmm, xsave->i387.xmm_space, 256);
+
+ s = get_xsave_addr(xsave, XFEATURE_YMM);
+ if (s)
+ memcpy(save->fpreg_ymm, s, 256);
+ else
+ memset(save->fpreg_ymm, 0, 256);
+ }
+
+ pr_debug("Virtual Machine Save Area (VMSA):\n");
+ print_hex_dump_debug("", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false);
+
+ return 0;
+}
+
+static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu,
+ int *error)
+{
+ struct sev_data_launch_update_vmsa vmsa;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int ret;
+
+ if (vcpu->guest_debug) {
+ pr_warn_once("KVM_SET_GUEST_DEBUG for SEV-ES guest is not supported");
+ return -EINVAL;
+ }
+
+ /* Perform some pre-encryption checks against the VMSA */
+ ret = sev_es_sync_vmsa(svm);
+ if (ret)
+ return ret;
+
+ /*
+ * The LAUNCH_UPDATE_VMSA command will perform in-place encryption of
+	 * the VMSA memory content (i.e. it will write the same memory region
+ * with the guest's key), so invalidate it first.
+ */
+ clflush_cache_range(svm->sev_es.vmsa, PAGE_SIZE);
+
+ vmsa.reserved = 0;
+ vmsa.handle = to_kvm_sev_info(kvm)->handle;
+ vmsa.address = __sme_pa(svm->sev_es.vmsa);
+ vmsa.len = PAGE_SIZE;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, error);
+ if (ret)
+ return ret;
+
+ /*
+ * SEV-ES guests maintain an encrypted version of their FPU
+ * state which is restored and saved on VMRUN and VMEXIT.
+	 * Mark vcpu->arch.guest_fpu->fpstate as scratch so the kernel won't
+	 * do xsave/xrstor on it.
+ */
+ fpstate_set_confidential(&vcpu->arch.guest_fpu);
+ vcpu->arch.guest_state_protected = true;
+
+ /*
+	 * SEV-ES guests mandate LBR Virtualization to be _always_ ON. Enable it
+	 * only after setting guest_state_protected because KVM_SET_MSRS allows
+	 * dynamic toggling of LBRV (for performance reasons) on write access to
+ * MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set.
+ */
+ svm_enable_lbrv(vcpu);
+ return 0;
+}
+
+static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+ int ret;
+
+ if (!sev_es_guest(kvm))
+ return -ENOTTY;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ ret = mutex_lock_killable(&vcpu->mutex);
+ if (ret)
+ return ret;
+
+ ret = __sev_launch_update_vmsa(kvm, vcpu, &argp->error);
+
+ mutex_unlock(&vcpu->mutex);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ void __user *measure = u64_to_user_ptr(argp->data);
+ struct sev_data_launch_measure data;
+ struct kvm_sev_launch_measure params;
+ void __user *p = NULL;
+ void *blob = NULL;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, measure, sizeof(params)))
+ return -EFAULT;
+
+ memset(&data, 0, sizeof(data));
+
+ /* User wants to query the blob length */
+ if (!params.len)
+ goto cmd;
+
+ p = u64_to_user_ptr(params.uaddr);
+ if (p) {
+ if (params.len > SEV_FW_BLOB_MAX_SIZE)
+ return -EINVAL;
+
+ blob = kzalloc(params.len, GFP_KERNEL_ACCOUNT);
+ if (!blob)
+ return -ENOMEM;
+
+ data.address = __psp_pa(blob);
+ data.len = params.len;
+ }
+
+cmd:
+ data.handle = to_kvm_sev_info(kvm)->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, &data, &argp->error);
+
+ /*
+	 * If userspace only queried the blob length, the firmware has already
+	 * filled in the required length; report it regardless of the return code.
+ */
+ if (!params.len)
+ goto done;
+
+ if (ret)
+ goto e_free_blob;
+
+ if (blob) {
+ if (copy_to_user(p, blob, params.len))
+ ret = -EFAULT;
+ }
+
+done:
+ params.len = data.len;
+ if (copy_to_user(measure, &params, sizeof(params)))
+ ret = -EFAULT;
+e_free_blob:
+ kfree(blob);
+ return ret;
+}
+
+static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct sev_data_launch_finish data;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ data.handle = to_kvm_sev_info(kvm)->handle;
+ return sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, &data, &argp->error);
+}
+
+static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_guest_status params;
+ struct sev_data_guest_status data;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ memset(&data, 0, sizeof(data));
+
+ data.handle = to_kvm_sev_info(kvm)->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, &data, &argp->error);
+ if (ret)
+ return ret;
+
+ params.policy = data.policy;
+ params.state = data.state;
+ params.handle = data.handle;
+
+ if (copy_to_user(u64_to_user_ptr(argp->data), &params, sizeof(params)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src,
+ unsigned long dst, int size,
+ int *error, bool enc)
+{
+ struct sev_data_dbg data;
+
+ data.reserved = 0;
+ data.handle = to_kvm_sev_info(kvm)->handle;
+ data.dst_addr = dst;
+ data.src_addr = src;
+ data.len = size;
+
+ return sev_issue_cmd(kvm,
+ enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT,
+ &data, error);
+}
+
+static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr,
+ unsigned long dst_paddr, int sz, int *err)
+{
+ int offset;
+
+ /*
+	 * It's safe to read more than we are asked; the caller should ensure
+	 * that the destination has enough space.
+ */
+ offset = src_paddr & 15;
+ src_paddr = round_down(src_paddr, 16);
+ sz = round_up(sz + offset, 16);
+
+ return __sev_issue_dbg_cmd(kvm, src_paddr, dst_paddr, sz, err, false);
+}
+
+static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr,
+ void __user *dst_uaddr,
+ unsigned long dst_paddr,
+ int size, int *err)
+{
+ struct page *tpage = NULL;
+ int ret, offset;
+
+	/* If the inputs are not 16-byte aligned, use an intermediate buffer. */
+ if (!IS_ALIGNED(dst_paddr, 16) ||
+ !IS_ALIGNED(paddr, 16) ||
+ !IS_ALIGNED(size, 16)) {
+ tpage = (void *)alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!tpage)
+ return -ENOMEM;
+
+ dst_paddr = __sme_page_pa(tpage);
+ }
+
+ ret = __sev_dbg_decrypt(kvm, paddr, dst_paddr, size, err);
+ if (ret)
+ goto e_free;
+
+ if (tpage) {
+ offset = paddr & 15;
+ if (copy_to_user(dst_uaddr, page_address(tpage) + offset, size))
+ ret = -EFAULT;
+ }
+
+e_free:
+ if (tpage)
+ __free_page(tpage);
+
+ return ret;
+}
+
+static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
+ void __user *vaddr,
+ unsigned long dst_paddr,
+ void __user *dst_vaddr,
+ int size, int *error)
+{
+ struct page *src_tpage = NULL;
+ struct page *dst_tpage = NULL;
+ int ret, len = size;
+
+ /* If source buffer is not aligned then use an intermediate buffer */
+ if (!IS_ALIGNED((unsigned long)vaddr, 16)) {
+ src_tpage = alloc_page(GFP_KERNEL_ACCOUNT);
+ if (!src_tpage)
+ return -ENOMEM;
+
+ if (copy_from_user(page_address(src_tpage), vaddr, size)) {
+ __free_page(src_tpage);
+ return -EFAULT;
+ }
+
+ paddr = __sme_page_pa(src_tpage);
+ }
+
+ /*
+	 * If the destination buffer or length is not aligned, do a read-modify-write:
+ * - decrypt destination in an intermediate buffer
+ * - copy the source buffer in an intermediate buffer
+ * - use the intermediate buffer as source buffer
+ */
+ if (!IS_ALIGNED((unsigned long)dst_vaddr, 16) || !IS_ALIGNED(size, 16)) {
+ int dst_offset;
+
+ dst_tpage = alloc_page(GFP_KERNEL_ACCOUNT);
+ if (!dst_tpage) {
+ ret = -ENOMEM;
+ goto e_free;
+ }
+
+ ret = __sev_dbg_decrypt(kvm, dst_paddr,
+ __sme_page_pa(dst_tpage), size, error);
+ if (ret)
+ goto e_free;
+
+ /*
+		 * If the source is a kernel buffer, use memcpy(); otherwise use
+		 * copy_from_user().
+ */
+ dst_offset = dst_paddr & 15;
+
+ if (src_tpage)
+ memcpy(page_address(dst_tpage) + dst_offset,
+ page_address(src_tpage), size);
+ else {
+ if (copy_from_user(page_address(dst_tpage) + dst_offset,
+ vaddr, size)) {
+ ret = -EFAULT;
+ goto e_free;
+ }
+ }
+
+ paddr = __sme_page_pa(dst_tpage);
+ dst_paddr = round_down(dst_paddr, 16);
+ len = round_up(size, 16);
+ }
+
+ ret = __sev_issue_dbg_cmd(kvm, paddr, dst_paddr, len, error, true);
+
+e_free:
+ if (src_tpage)
+ __free_page(src_tpage);
+ if (dst_tpage)
+ __free_page(dst_tpage);
+ return ret;
+}
+
+static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
+{
+ unsigned long vaddr, vaddr_end, next_vaddr;
+ unsigned long dst_vaddr;
+ struct page **src_p, **dst_p;
+ struct kvm_sev_dbg debug;
+ unsigned long n;
+ unsigned int size;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&debug, u64_to_user_ptr(argp->data), sizeof(debug)))
+ return -EFAULT;
+
+ if (!debug.len || debug.src_uaddr + debug.len < debug.src_uaddr)
+ return -EINVAL;
+ if (!debug.dst_uaddr)
+ return -EINVAL;
+
+ vaddr = debug.src_uaddr;
+ size = debug.len;
+ vaddr_end = vaddr + size;
+ dst_vaddr = debug.dst_uaddr;
+
+ for (; vaddr < vaddr_end; vaddr = next_vaddr) {
+ int len, s_off, d_off;
+
+ /* lock userspace source and destination page */
+ src_p = sev_pin_memory(kvm, vaddr & PAGE_MASK, PAGE_SIZE, &n, 0);
+ if (IS_ERR(src_p))
+ return PTR_ERR(src_p);
+
+ dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, FOLL_WRITE);
+ if (IS_ERR(dst_p)) {
+ sev_unpin_memory(kvm, src_p, n);
+ return PTR_ERR(dst_p);
+ }
+
+ /*
+ * Flush (on non-coherent CPUs) before DBG_{DE,EN}CRYPT read or modify
+ * the pages; flush the destination too so that future accesses do not
+ * see stale data.
+ */
+ sev_clflush_pages(src_p, 1);
+ sev_clflush_pages(dst_p, 1);
+
+ /*
+		 * Since the user buffers may not be page-aligned, calculate the
+		 * offsets within the page.
+ */
+ s_off = vaddr & ~PAGE_MASK;
+ d_off = dst_vaddr & ~PAGE_MASK;
+ len = min_t(size_t, (PAGE_SIZE - s_off), size);
+
+ if (dec)
+ ret = __sev_dbg_decrypt_user(kvm,
+ __sme_page_pa(src_p[0]) + s_off,
+ (void __user *)dst_vaddr,
+ __sme_page_pa(dst_p[0]) + d_off,
+ len, &argp->error);
+ else
+ ret = __sev_dbg_encrypt_user(kvm,
+ __sme_page_pa(src_p[0]) + s_off,
+ (void __user *)vaddr,
+ __sme_page_pa(dst_p[0]) + d_off,
+ (void __user *)dst_vaddr,
+ len, &argp->error);
+
+ sev_unpin_memory(kvm, src_p, n);
+ sev_unpin_memory(kvm, dst_p, n);
+
+ if (ret)
+ goto err;
+
+ next_vaddr = vaddr + len;
+ dst_vaddr = dst_vaddr + len;
+ size -= len;
+ }
+err:
+ return ret;
+}
+
+static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct sev_data_launch_secret data;
+ struct kvm_sev_launch_secret params;
+ struct page **pages;
+ void *blob, *hdr;
+ unsigned long n, i;
+ int ret, offset;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
+ return -EFAULT;
+
+ pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, FOLL_WRITE);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+
+ /*
+ * Flush (on non-coherent CPUs) before LAUNCH_SECRET encrypts pages in
+ * place; the cache may contain the data that was written unencrypted.
+ */
+ sev_clflush_pages(pages, n);
+
+ /*
+	 * The secret must be copied into a contiguous memory region, so verify
+	 * that the userspace memory pages are contiguous before issuing the command.
+ */
+ if (get_num_contig_pages(0, pages, n) != n) {
+ ret = -EINVAL;
+ goto e_unpin_memory;
+ }
+
+ memset(&data, 0, sizeof(data));
+
+ offset = params.guest_uaddr & (PAGE_SIZE - 1);
+ data.guest_address = __sme_page_pa(pages[0]) + offset;
+ data.guest_len = params.guest_len;
+
+ blob = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
+ if (IS_ERR(blob)) {
+ ret = PTR_ERR(blob);
+ goto e_unpin_memory;
+ }
+
+ data.trans_address = __psp_pa(blob);
+ data.trans_len = params.trans_len;
+
+ hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
+ if (IS_ERR(hdr)) {
+ ret = PTR_ERR(hdr);
+ goto e_free_blob;
+ }
+ data.hdr_address = __psp_pa(hdr);
+ data.hdr_len = params.hdr_len;
+
+ data.handle = to_kvm_sev_info(kvm)->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, &data, &argp->error);
+
+ kfree(hdr);
+
+e_free_blob:
+ kfree(blob);
+e_unpin_memory:
+ /* The memory content was updated, so mark the pages dirty. */
+ for (i = 0; i < n; i++) {
+ set_page_dirty_lock(pages[i]);
+ mark_page_accessed(pages[i]);
+ }
+ sev_unpin_memory(kvm, pages, n);
+ return ret;
+}
+
+static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ void __user *report = u64_to_user_ptr(argp->data);
+ struct sev_data_attestation_report data;
+ struct kvm_sev_attestation_report params;
+ void __user *p;
+ void *blob = NULL;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
+ return -EFAULT;
+
+ memset(&data, 0, sizeof(data));
+
+ /* User wants to query the blob length */
+ if (!params.len)
+ goto cmd;
+
+ p = u64_to_user_ptr(params.uaddr);
+ if (p) {
+ if (params.len > SEV_FW_BLOB_MAX_SIZE)
+ return -EINVAL;
+
+ blob = kzalloc(params.len, GFP_KERNEL_ACCOUNT);
+ if (!blob)
+ return -ENOMEM;
+
+ data.address = __psp_pa(blob);
+ data.len = params.len;
+ memcpy(data.mnonce, params.mnonce, sizeof(params.mnonce));
+ }
+cmd:
+ data.handle = to_kvm_sev_info(kvm)->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, &data, &argp->error);
+ /*
+ * If userspace only queried the blob length, FW responded with the expected data.
+ */
+ if (!params.len)
+ goto done;
+
+ if (ret)
+ goto e_free_blob;
+
+ if (blob) {
+ if (copy_to_user(p, blob, params.len))
+ ret = -EFAULT;
+ }
+
+done:
+ params.len = data.len;
+ if (copy_to_user(report, &params, sizeof(params)))
+ ret = -EFAULT;
+e_free_blob:
+ kfree(blob);
+ return ret;
+}
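+
+/*
+ * Illustrative userspace sketch (not part of this file): the attestation
+ * report is typically fetched in two KVM_MEMORY_ENCRYPT_OP calls, first
+ * with len == 0 to query the blob size, then again with a buffer of that
+ * size. Names other than the uapi structs/commands are assumptions.
+ *
+ *	struct kvm_sev_attestation_report rep = { .len = 0 };
+ *	struct kvm_sev_cmd cmd = {
+ *		.id     = KVM_SEV_GET_ATTESTATION_REPORT,
+ *		.data   = (__u64)(unsigned long)&rep,
+ *		.sev_fd = sev_fd,
+ *	};
+ *
+ *	ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);	// rep.len now holds the size
+ *	rep.uaddr = (__u64)(unsigned long)malloc(rep.len);
+ *	memcpy(rep.mnonce, nonce, sizeof(rep.mnonce));
+ *	ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);	// report copied to rep.uaddr
+ */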
+
+/* Userspace wants to query session length. */
+static int
+__sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp,
+ struct kvm_sev_send_start *params)
+{
+ struct sev_data_send_start data;
+ int ret;
+
+ memset(&data, 0, sizeof(data));
+ data.handle = to_kvm_sev_info(kvm)->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);
+
+ params->session_len = data.session_len;
+ if (copy_to_user(u64_to_user_ptr(argp->data), params,
+ sizeof(struct kvm_sev_send_start)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct sev_data_send_start data;
+ struct kvm_sev_send_start params;
+ void *amd_certs, *session_data;
+ void *pdh_cert, *plat_certs;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data),
+ sizeof(struct kvm_sev_send_start)))
+ return -EFAULT;
+
+ /* if session_len is zero, userspace wants to query the session length */
+ if (!params.session_len)
+ return __sev_send_start_query_session_length(kvm, argp,
+ &params);
+
+ /* some sanity checks */
+ if (!params.pdh_cert_uaddr || !params.pdh_cert_len ||
+ !params.session_uaddr || params.session_len > SEV_FW_BLOB_MAX_SIZE)
+ return -EINVAL;
+
+ /* allocate the memory to hold the session data blob */
+ session_data = kzalloc(params.session_len, GFP_KERNEL_ACCOUNT);
+ if (!session_data)
+ return -ENOMEM;
+
+ /* copy the certificate blobs from userspace */
+ pdh_cert = psp_copy_user_blob(params.pdh_cert_uaddr,
+ params.pdh_cert_len);
+ if (IS_ERR(pdh_cert)) {
+ ret = PTR_ERR(pdh_cert);
+ goto e_free_session;
+ }
+
+ plat_certs = psp_copy_user_blob(params.plat_certs_uaddr,
+ params.plat_certs_len);
+ if (IS_ERR(plat_certs)) {
+ ret = PTR_ERR(plat_certs);
+ goto e_free_pdh;
+ }
+
+ amd_certs = psp_copy_user_blob(params.amd_certs_uaddr,
+ params.amd_certs_len);
+ if (IS_ERR(amd_certs)) {
+ ret = PTR_ERR(amd_certs);
+ goto e_free_plat_cert;
+ }
+
+ /* populate the FW SEND_START fields with the system physical addresses */
+ memset(&data, 0, sizeof(data));
+ data.pdh_cert_address = __psp_pa(pdh_cert);
+ data.pdh_cert_len = params.pdh_cert_len;
+ data.plat_certs_address = __psp_pa(plat_certs);
+ data.plat_certs_len = params.plat_certs_len;
+ data.amd_certs_address = __psp_pa(amd_certs);
+ data.amd_certs_len = params.amd_certs_len;
+ data.session_address = __psp_pa(session_data);
+ data.session_len = params.session_len;
+ data.handle = to_kvm_sev_info(kvm)->handle;
+
+ ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);
+
+ if (!ret && copy_to_user(u64_to_user_ptr(params.session_uaddr),
+ session_data, params.session_len)) {
+ ret = -EFAULT;
+ goto e_free_amd_cert;
+ }
+
+ params.policy = data.policy;
+ params.session_len = data.session_len;
+ if (copy_to_user(u64_to_user_ptr(argp->data), &params,
+ sizeof(struct kvm_sev_send_start)))
+ ret = -EFAULT;
+
+e_free_amd_cert:
+ kfree(amd_certs);
+e_free_plat_cert:
+ kfree(plat_certs);
+e_free_pdh:
+ kfree(pdh_cert);
+e_free_session:
+ kfree(session_data);
+ return ret;
+}
+
+/* Userspace wants to query either header or trans length. */
+static int
+__sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp,
+ struct kvm_sev_send_update_data *params)
+{
+ struct sev_data_send_update_data data;
+ int ret;
+
+ memset(&data, 0, sizeof(data));
+ data.handle = to_kvm_sev_info(kvm)->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error);
+
+ params->hdr_len = data.hdr_len;
+ params->trans_len = data.trans_len;
+
+ if (copy_to_user(u64_to_user_ptr(argp->data), params,
+ sizeof(struct kvm_sev_send_update_data)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct sev_data_send_update_data data;
+ struct kvm_sev_send_update_data params;
+ void *hdr, *trans_data;
+ struct page **guest_page;
+ unsigned long n;
+ int ret, offset;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data),
+ sizeof(struct kvm_sev_send_update_data)))
+ return -EFAULT;
+
+ /* userspace wants to query either header or trans length */
+ if (!params.trans_len || !params.hdr_len)
+ return __sev_send_update_data_query_lengths(kvm, argp, &params);
+
+ if (!params.trans_uaddr || !params.guest_uaddr ||
+ !params.guest_len || !params.hdr_uaddr)
+ return -EINVAL;
+
+ /* Check if we are crossing the page boundary */
+ offset = params.guest_uaddr & (PAGE_SIZE - 1);
+ if (params.guest_len > PAGE_SIZE || (params.guest_len + offset) > PAGE_SIZE)
+ return -EINVAL;
+
+ /* Pin guest memory */
+ guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
+ PAGE_SIZE, &n, 0);
+ if (IS_ERR(guest_page))
+ return PTR_ERR(guest_page);
+
+ /* allocate memory for header and transport buffer */
+ ret = -ENOMEM;
+ hdr = kzalloc(params.hdr_len, GFP_KERNEL);
+ if (!hdr)
+ goto e_unpin;
+
+ trans_data = kzalloc(params.trans_len, GFP_KERNEL);
+ if (!trans_data)
+ goto e_free_hdr;
+
+ memset(&data, 0, sizeof(data));
+ data.hdr_address = __psp_pa(hdr);
+ data.hdr_len = params.hdr_len;
+ data.trans_address = __psp_pa(trans_data);
+ data.trans_len = params.trans_len;
+
+ /* The SEND_UPDATE_DATA command requires the C-bit to always be set. */
+ data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset;
+ data.guest_address |= sev_me_mask;
+ data.guest_len = params.guest_len;
+ data.handle = to_kvm_sev_info(kvm)->handle;
+
+ ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error);
+
+ if (ret)
+ goto e_free_trans_data;
+
+ /* copy transport buffer to user space */
+ if (copy_to_user(u64_to_user_ptr(params.trans_uaddr),
+ trans_data, params.trans_len)) {
+ ret = -EFAULT;
+ goto e_free_trans_data;
+ }
+
+ /* Copy packet header to userspace. */
+ if (copy_to_user(u64_to_user_ptr(params.hdr_uaddr), hdr,
+ params.hdr_len))
+ ret = -EFAULT;
+
+e_free_trans_data:
+ kfree(trans_data);
+e_free_hdr:
+ kfree(hdr);
+e_unpin:
+ sev_unpin_memory(kvm, guest_page, n);
+
+ return ret;
+}
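+
+/*
+ * Illustrative userspace sketch (not part of this file): migration tooling
+ * is expected to first query the header/transport buffer sizes (hdr_len and
+ * trans_len left at 0), allocate buffers, and then issue the real transfer
+ * one guest page at a time. Everything except the uapi struct/command names
+ * is an assumption of the example.
+ *
+ *	struct kvm_sev_send_update_data d = {};
+ *	struct kvm_sev_cmd cmd = {
+ *		.id     = KVM_SEV_SEND_UPDATE_DATA,
+ *		.data   = (__u64)(unsigned long)&d,
+ *		.sev_fd = sev_fd,
+ *	};
+ *
+ *	ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);	// query d.hdr_len/d.trans_len
+ *	d.hdr_uaddr   = (__u64)(unsigned long)malloc(d.hdr_len);
+ *	d.trans_uaddr = (__u64)(unsigned long)malloc(d.trans_len);
+ *	d.guest_uaddr = (__u64)(unsigned long)guest_page;	// page-sized, page-aligned
+ *	d.guest_len   = 4096;
+ *	ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);	// produce hdr + trans packet
+ */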
+
+static int sev_send_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct sev_data_send_finish data;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ data.handle = to_kvm_sev_info(kvm)->handle;
+ return sev_issue_cmd(kvm, SEV_CMD_SEND_FINISH, &data, &argp->error);
+}
+
+static int sev_send_cancel(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct sev_data_send_cancel data;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ data.handle = to_kvm_sev_info(kvm)->handle;
+ return sev_issue_cmd(kvm, SEV_CMD_SEND_CANCEL, &data, &argp->error);
+}
+
+static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ struct sev_data_receive_start start;
+ struct kvm_sev_receive_start params;
+ int *error = &argp->error;
+ void *session_data;
+ void *pdh_data;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ /* Get the parameters from userspace */
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data),
+ sizeof(struct kvm_sev_receive_start)))
+ return -EFAULT;
+
+ /* some sanity checks */
+ if (!params.pdh_uaddr || !params.pdh_len ||
+ !params.session_uaddr || !params.session_len)
+ return -EINVAL;
+
+ pdh_data = psp_copy_user_blob(params.pdh_uaddr, params.pdh_len);
+ if (IS_ERR(pdh_data))
+ return PTR_ERR(pdh_data);
+
+ session_data = psp_copy_user_blob(params.session_uaddr,
+ params.session_len);
+ if (IS_ERR(session_data)) {
+ ret = PTR_ERR(session_data);
+ goto e_free_pdh;
+ }
+
+ memset(&start, 0, sizeof(start));
+ start.handle = params.handle;
+ start.policy = params.policy;
+ start.pdh_cert_address = __psp_pa(pdh_data);
+ start.pdh_cert_len = params.pdh_len;
+ start.session_address = __psp_pa(session_data);
+ start.session_len = params.session_len;
+
+ /* create memory encryption context */
+ ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_RECEIVE_START, &start,
+ error);
+ if (ret)
+ goto e_free_session;
+
+ /* Bind ASID to this guest */
+ ret = sev_bind_asid(kvm, start.handle, error);
+ if (ret) {
+ sev_decommission(start.handle);
+ goto e_free_session;
+ }
+
+ params.handle = start.handle;
+ if (copy_to_user(u64_to_user_ptr(argp->data),
+ &params, sizeof(struct kvm_sev_receive_start))) {
+ ret = -EFAULT;
+ sev_unbind_asid(kvm, start.handle);
+ goto e_free_session;
+ }
+
+ sev->handle = start.handle;
+ sev->fd = argp->sev_fd;
+
+e_free_session:
+ kfree(session_data);
+e_free_pdh:
+ kfree(pdh_data);
+
+ return ret;
+}
+
+static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_receive_update_data params;
+ struct sev_data_receive_update_data data;
+ void *hdr = NULL, *trans = NULL;
+ struct page **guest_page;
+ unsigned long n;
+ int ret, offset;
+
+ if (!sev_guest(kvm))
+ return -EINVAL;
+
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data),
+ sizeof(struct kvm_sev_receive_update_data)))
+ return -EFAULT;
+
+ if (!params.hdr_uaddr || !params.hdr_len ||
+ !params.guest_uaddr || !params.guest_len ||
+ !params.trans_uaddr || !params.trans_len)
+ return -EINVAL;
+
+ /* Check if we are crossing the page boundary */
+ offset = params.guest_uaddr & (PAGE_SIZE - 1);
+ if (params.guest_len > PAGE_SIZE || (params.guest_len + offset) > PAGE_SIZE)
+ return -EINVAL;
+
+ hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
+ if (IS_ERR(hdr))
+ return PTR_ERR(hdr);
+
+ trans = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto e_free_hdr;
+ }
+
+ memset(&data, 0, sizeof(data));
+ data.hdr_address = __psp_pa(hdr);
+ data.hdr_len = params.hdr_len;
+ data.trans_address = __psp_pa(trans);
+ data.trans_len = params.trans_len;
+
+ /* Pin guest memory */
+ guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
+ PAGE_SIZE, &n, FOLL_WRITE);
+ if (IS_ERR(guest_page)) {
+ ret = PTR_ERR(guest_page);
+ goto e_free_trans;
+ }
+
+ /*
+ * Flush (on non-coherent CPUs) before RECEIVE_UPDATE_DATA, as the PSP
+ * encrypts the written data with the guest's key, and the cache may
+ * contain dirty, unencrypted data.
+ */
+ sev_clflush_pages(guest_page, n);
+
+ /* The RECEIVE_UPDATE_DATA command requires the C-bit to always be set. */
+ data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset;
+ data.guest_address |= sev_me_mask;
+ data.guest_len = params.guest_len;
+ data.handle = to_kvm_sev_info(kvm)->handle;
+
+ ret = sev_issue_cmd(kvm, SEV_CMD_RECEIVE_UPDATE_DATA, &data,
+ &argp->error);
+
+ sev_unpin_memory(kvm, guest_page, n);
+
+e_free_trans:
+ kfree(trans);
+e_free_hdr:
+ kfree(hdr);
+
+ return ret;
+}
+
+static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct sev_data_receive_finish data;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ data.handle = to_kvm_sev_info(kvm)->handle;
+ return sev_issue_cmd(kvm, SEV_CMD_RECEIVE_FINISH, &data, &argp->error);
+}
+
+static bool is_cmd_allowed_from_mirror(u32 cmd_id)
+{
+ /*
+ * Allow mirror VMs to call KVM_SEV_LAUNCH_UPDATE_VMSA so that SEV-ES can
+ * be enabled on active mirror VMs. Also allow the debugging and status commands.
+ */
+ if (cmd_id == KVM_SEV_LAUNCH_UPDATE_VMSA ||
+ cmd_id == KVM_SEV_GUEST_STATUS || cmd_id == KVM_SEV_DBG_DECRYPT ||
+ cmd_id == KVM_SEV_DBG_ENCRYPT)
+ return true;
+
+ return false;
+}
+
+static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
+{
+ struct kvm_sev_info *dst_sev = to_kvm_sev_info(dst_kvm);
+ struct kvm_sev_info *src_sev = to_kvm_sev_info(src_kvm);
+ int r = -EBUSY;
+
+ if (dst_kvm == src_kvm)
+ return -EINVAL;
+
+ /*
+ * Bail if these VMs are already involved in a migration to avoid
+ * deadlock between two VMs trying to migrate to/from each other.
+ */
+ if (atomic_cmpxchg_acquire(&dst_sev->migration_in_progress, 0, 1))
+ return -EBUSY;
+
+ if (atomic_cmpxchg_acquire(&src_sev->migration_in_progress, 0, 1))
+ goto release_dst;
+
+ r = -EINTR;
+ if (mutex_lock_killable(&dst_kvm->lock))
+ goto release_src;
+ if (mutex_lock_killable_nested(&src_kvm->lock, SINGLE_DEPTH_NESTING))
+ goto unlock_dst;
+ return 0;
+
+unlock_dst:
+ mutex_unlock(&dst_kvm->lock);
+release_src:
+ atomic_set_release(&src_sev->migration_in_progress, 0);
+release_dst:
+ atomic_set_release(&dst_sev->migration_in_progress, 0);
+ return r;
+}
+
+static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
+{
+ struct kvm_sev_info *dst_sev = to_kvm_sev_info(dst_kvm);
+ struct kvm_sev_info *src_sev = to_kvm_sev_info(src_kvm);
+
+ mutex_unlock(&dst_kvm->lock);
+ mutex_unlock(&src_kvm->lock);
+ atomic_set_release(&dst_sev->migration_in_progress, 0);
+ atomic_set_release(&src_sev->migration_in_progress, 0);
+}
+
+static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm)
+{
+ struct kvm_sev_info *dst = to_kvm_sev_info(dst_kvm);
+ struct kvm_sev_info *src = to_kvm_sev_info(src_kvm);
+ struct kvm_vcpu *dst_vcpu, *src_vcpu;
+ struct vcpu_svm *dst_svm, *src_svm;
+ struct kvm_sev_info *mirror;
+ unsigned long i;
+
+ dst->active = true;
+ dst->asid = src->asid;
+ dst->handle = src->handle;
+ dst->pages_locked = src->pages_locked;
+ dst->enc_context_owner = src->enc_context_owner;
+ dst->es_active = src->es_active;
+ dst->vmsa_features = src->vmsa_features;
+
+ src->asid = 0;
+ src->active = false;
+ src->handle = 0;
+ src->pages_locked = 0;
+ src->enc_context_owner = NULL;
+ src->es_active = false;
+
+ list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list);
+
+ /*
+ * If this VM has mirrors, "transfer" each mirror's refcount of the
+ * source to the destination (this KVM). The caller holds a reference
+ * to the source, so there's no danger of use-after-free.
+ */
+ list_cut_before(&dst->mirror_vms, &src->mirror_vms, &src->mirror_vms);
+ list_for_each_entry(mirror, &dst->mirror_vms, mirror_entry) {
+ kvm_get_kvm(dst_kvm);
+ kvm_put_kvm(src_kvm);
+ mirror->enc_context_owner = dst_kvm;
+ }
+
+ /*
+ * If this VM is a mirror, remove the old mirror from the owners list
+ * and add the new mirror to the list.
+ */
+ if (is_mirroring_enc_context(dst_kvm)) {
+ struct kvm_sev_info *owner_sev_info = to_kvm_sev_info(dst->enc_context_owner);
+
+ list_del(&src->mirror_entry);
+ list_add_tail(&dst->mirror_entry, &owner_sev_info->mirror_vms);
+ }
+
+ kvm_for_each_vcpu(i, dst_vcpu, dst_kvm) {
+ dst_svm = to_svm(dst_vcpu);
+
+ sev_init_vmcb(dst_svm, false);
+
+ if (!dst->es_active)
+ continue;
+
+ /*
+ * Note, the source is not required to have the same number of
+ * vCPUs as the destination when migrating a vanilla SEV VM.
+ */
+ src_vcpu = kvm_get_vcpu(src_kvm, i);
+ src_svm = to_svm(src_vcpu);
+
+ /*
+ * Transfer VMSA and GHCB state to the destination. Nullify and
+ * clear source fields as appropriate; the state now belongs to
+ * the destination.
+ */
+ memcpy(&dst_svm->sev_es, &src_svm->sev_es, sizeof(src_svm->sev_es));
+ dst_svm->vmcb->control.ghcb_gpa = src_svm->vmcb->control.ghcb_gpa;
+ dst_svm->vmcb->control.vmsa_pa = src_svm->vmcb->control.vmsa_pa;
+ dst_vcpu->arch.guest_state_protected = true;
+
+ memset(&src_svm->sev_es, 0, sizeof(src_svm->sev_es));
+ src_svm->vmcb->control.ghcb_gpa = INVALID_PAGE;
+ src_svm->vmcb->control.vmsa_pa = INVALID_PAGE;
+ src_vcpu->arch.guest_state_protected = false;
+ }
+}
+
+static int sev_check_source_vcpus(struct kvm *dst, struct kvm *src)
+{
+ struct kvm_vcpu *src_vcpu;
+ unsigned long i;
+
+ if (src->created_vcpus != atomic_read(&src->online_vcpus) ||
+ dst->created_vcpus != atomic_read(&dst->online_vcpus))
+ return -EBUSY;
+
+ if (!sev_es_guest(src))
+ return 0;
+
+ if (atomic_read(&src->online_vcpus) != atomic_read(&dst->online_vcpus))
+ return -EINVAL;
+
+ kvm_for_each_vcpu(i, src_vcpu, src) {
+ if (!src_vcpu->arch.guest_state_protected)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
+{
+ struct kvm_sev_info *dst_sev = to_kvm_sev_info(kvm);
+ struct kvm_sev_info *src_sev, *cg_cleanup_sev;
+ CLASS(fd, f)(source_fd);
+ struct kvm *source_kvm;
+ bool charged = false;
+ int ret;
+
+ if (fd_empty(f))
+ return -EBADF;
+
+ if (!file_is_kvm(fd_file(f)))
+ return -EBADF;
+
+ source_kvm = fd_file(f)->private_data;
+ ret = sev_lock_two_vms(kvm, source_kvm);
+ if (ret)
+ return ret;
+
+ if (kvm->arch.vm_type != source_kvm->arch.vm_type ||
+ sev_guest(kvm) || !sev_guest(source_kvm)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ src_sev = to_kvm_sev_info(source_kvm);
+
+ dst_sev->misc_cg = get_current_misc_cg();
+ cg_cleanup_sev = dst_sev;
+ if (dst_sev->misc_cg != src_sev->misc_cg) {
+ ret = sev_misc_cg_try_charge(dst_sev);
+ if (ret)
+ goto out_dst_cgroup;
+ charged = true;
+ }
+
+ ret = kvm_lock_all_vcpus(kvm);
+ if (ret)
+ goto out_dst_cgroup;
+ ret = kvm_lock_all_vcpus(source_kvm);
+ if (ret)
+ goto out_dst_vcpu;
+
+ ret = sev_check_source_vcpus(kvm, source_kvm);
+ if (ret)
+ goto out_source_vcpu;
+
+ /*
+ * Allocate a new have_run_cpus for the destination, i.e. don't copy
+ * the set of CPUs from the source. If a CPU was used to run a vCPU in
+ * the source VM but is never used for the destination VM, then the CPU
+ * can only have cached memory that was accessible to the source VM.
+ */
+ if (!zalloc_cpumask_var(&dst_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
+ ret = -ENOMEM;
+ goto out_source_vcpu;
+ }
+
+ sev_migrate_from(kvm, source_kvm);
+ kvm_vm_dead(source_kvm);
+ cg_cleanup_sev = src_sev;
+ ret = 0;
+
+out_source_vcpu:
+ kvm_unlock_all_vcpus(source_kvm);
+out_dst_vcpu:
+ kvm_unlock_all_vcpus(kvm);
+out_dst_cgroup:
+ /* Operates on the source on success, on the destination on failure. */
+ if (charged)
+ sev_misc_cg_uncharge(cg_cleanup_sev);
+ put_misc_cg(cg_cleanup_sev->misc_cg);
+ cg_cleanup_sev->misc_cg = NULL;
+out_unlock:
+ sev_unlock_two_vms(kvm, source_kvm);
+ return ret;
+}
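+
+/*
+ * Illustrative userspace sketch (not part of this file): intra-host
+ * migration is triggered by enabling KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM on
+ * the destination VM, passing the source VM's fd in args[0]. The fd
+ * variables are assumptions for the example.
+ *
+ *	struct kvm_enable_cap cap = {
+ *		.cap     = KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM,
+ *		.args[0] = src_vm_fd,
+ *	};
+ *
+ *	if (ioctl(dst_vm_fd, KVM_ENABLE_CAP, &cap))
+ *		perror("move enc context");
+ */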
+
+int sev_dev_get_attr(u32 group, u64 attr, u64 *val)
+{
+ if (group != KVM_X86_GRP_SEV)
+ return -ENXIO;
+
+ switch (attr) {
+ case KVM_X86_SEV_VMSA_FEATURES:
+ *val = sev_supported_vmsa_features;
+ return 0;
+
+ case KVM_X86_SNP_POLICY_BITS:
+ *val = snp_supported_policy_bits;
+ return 0;
+
+ default:
+ return -ENXIO;
+ }
+}
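+
+/*
+ * Illustrative userspace sketch (not part of this file): these attributes
+ * are read through KVM_GET_DEVICE_ATTR on the VM fd, e.g. to discover the
+ * VMSA features that can be passed to KVM_SEV_INIT2. The vm_fd variable is
+ * an assumption for the example.
+ *
+ *	__u64 vmsa_features;
+ *	struct kvm_device_attr attr = {
+ *		.group = KVM_X86_GRP_SEV,
+ *		.attr  = KVM_X86_SEV_VMSA_FEATURES,
+ *		.addr  = (__u64)(unsigned long)&vmsa_features,
+ *	};
+ *
+ *	ioctl(vm_fd, KVM_GET_DEVICE_ATTR, &attr);
+ */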
+
+/*
+ * The guest context contains all the information, keys and metadata
+ * associated with the guest that the firmware tracks to implement SEV
+ * and SNP features. The firmware stores the guest context in a
+ * hypervisor-provided page via the SNP_GCTX_CREATE command.
+ */
+static void *snp_context_create(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct sev_data_snp_addr data = {};
+ void *context;
+ int rc;
+
+ /* Allocate memory for context page */
+ context = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT);
+ if (!context)
+ return NULL;
+
+ data.address = __psp_pa(context);
+ rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_GCTX_CREATE, &data, &argp->error);
+ if (rc) {
+ pr_warn("Failed to create SEV-SNP context, rc %d fw_error %d",
+ rc, argp->error);
+ snp_free_firmware_page(context);
+ return NULL;
+ }
+
+ return context;
+}
+
+static int snp_bind_asid(struct kvm *kvm, int *error)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ struct sev_data_snp_activate data = {0};
+
+ data.gctx_paddr = __psp_pa(sev->snp_context);
+ data.asid = sev_get_asid(kvm);
+ return sev_issue_cmd(kvm, SEV_CMD_SNP_ACTIVATE, &data, error);
+}
+
+static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ struct sev_data_snp_launch_start start = {0};
+ struct kvm_sev_snp_launch_start params;
+ int rc;
+
+ if (!sev_snp_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
+ return -EFAULT;
+
+ /* Don't allow userspace to allocate memory for more than 1 SNP context. */
+ if (sev->snp_context)
+ return -EINVAL;
+
+ if (params.flags)
+ return -EINVAL;
+
+ if (params.policy & ~snp_supported_policy_bits)
+ return -EINVAL;
+
+ /* Check for policy bits that must be set */
+ if (!(params.policy & SNP_POLICY_MASK_RSVD_MBO))
+ return -EINVAL;
+
+ if (snp_is_secure_tsc_enabled(kvm)) {
+ if (WARN_ON_ONCE(!kvm->arch.default_tsc_khz))
+ return -EINVAL;
+
+ start.desired_tsc_khz = kvm->arch.default_tsc_khz;
+ }
+
+ sev->snp_context = snp_context_create(kvm, argp);
+ if (!sev->snp_context)
+ return -ENOTTY;
+
+ start.gctx_paddr = __psp_pa(sev->snp_context);
+ start.policy = params.policy;
+
+ memcpy(start.gosvw, params.gosvw, sizeof(params.gosvw));
+ rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_START, &start, &argp->error);
+ if (rc) {
+ pr_debug("%s: SEV_CMD_SNP_LAUNCH_START firmware command failed, rc %d\n",
+ __func__, rc);
+ goto e_free_context;
+ }
+
+ sev->policy = params.policy;
+ sev->fd = argp->sev_fd;
+ rc = snp_bind_asid(kvm, &argp->error);
+ if (rc) {
+ pr_debug("%s: Failed to bind ASID to SEV-SNP context, rc %d\n",
+ __func__, rc);
+ goto e_free_context;
+ }
+
+ return 0;
+
+e_free_context:
+ snp_decommission_context(kvm);
+
+ return rc;
+}
+
+struct sev_gmem_populate_args {
+ __u8 type;
+ int sev_fd;
+ int fw_error;
+};
+
+static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pfn,
+ void __user *src, int order, void *opaque)
+{
+ struct sev_gmem_populate_args *sev_populate_args = opaque;
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ int n_private = 0, ret, i;
+ int npages = (1 << order);
+ gfn_t gfn;
+
+ if (WARN_ON_ONCE(sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src))
+ return -EINVAL;
+
+ for (gfn = gfn_start, i = 0; gfn < gfn_start + npages; gfn++, i++) {
+ struct sev_data_snp_launch_update fw_args = {0};
+ bool assigned = false;
+ int level;
+
+ ret = snp_lookup_rmpentry((u64)pfn + i, &assigned, &level);
+ if (ret || assigned) {
+ pr_debug("%s: Failed to ensure GFN 0x%llx RMP entry is initial shared state, ret: %d assigned: %d\n",
+ __func__, gfn, ret, assigned);
+ ret = ret ? -EINVAL : -EEXIST;
+ goto err;
+ }
+
+ if (src) {
+ void *vaddr = kmap_local_pfn(pfn + i);
+
+ if (copy_from_user(vaddr, src + i * PAGE_SIZE, PAGE_SIZE)) {
+ ret = -EFAULT;
+ goto err;
+ }
+ kunmap_local(vaddr);
+ }
+
+ ret = rmp_make_private(pfn + i, gfn << PAGE_SHIFT, PG_LEVEL_4K,
+ sev_get_asid(kvm), true);
+ if (ret)
+ goto err;
+
+ n_private++;
+
+ fw_args.gctx_paddr = __psp_pa(sev->snp_context);
+ fw_args.address = __sme_set(pfn_to_hpa(pfn + i));
+ fw_args.page_size = PG_LEVEL_TO_RMP(PG_LEVEL_4K);
+ fw_args.page_type = sev_populate_args->type;
+
+ ret = __sev_issue_cmd(sev_populate_args->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE,
+ &fw_args, &sev_populate_args->fw_error);
+ if (ret)
+ goto fw_err;
+ }
+
+ return 0;
+
+fw_err:
+ /*
+ * If the firmware command failed, handle the reclaim and cleanup of that
+ * PFN specially vs. prior pages, which can be cleaned up below without
+ * needing to reclaim in advance.
+ *
+ * Additionally, when invalid CPUID function entries are detected,
+ * firmware writes the expected values into the page and leaves it
+ * unencrypted so it can be used for debugging and error-reporting.
+ *
+ * Copy this page back into the source buffer so userspace can determine
+ * which CPUID leaves/fields failed CPUID validation.
+ */
+ if (!snp_page_reclaim(kvm, pfn + i) &&
+ sev_populate_args->type == KVM_SEV_SNP_PAGE_TYPE_CPUID &&
+ sev_populate_args->fw_error == SEV_RET_INVALID_PARAM) {
+ void *vaddr = kmap_local_pfn(pfn + i);
+
+ if (copy_to_user(src + i * PAGE_SIZE, vaddr, PAGE_SIZE))
+ pr_debug("Failed to write CPUID page back to userspace\n");
+
+ kunmap_local(vaddr);
+ }
+
+ /* pfn + i is hypervisor-owned now, so skip below cleanup for it. */
+ n_private--;
+
+err:
+ pr_debug("%s: exiting with error ret %d (fw_error %d), restoring %d gmem PFNs to shared.\n",
+ __func__, ret, sev_populate_args->fw_error, n_private);
+ for (i = 0; i < n_private; i++)
+ kvm_rmp_make_shared(kvm, pfn + i, PG_LEVEL_4K);
+
+ return ret;
+}
+
+static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ struct sev_gmem_populate_args sev_populate_args = {0};
+ struct kvm_sev_snp_launch_update params;
+ struct kvm_memory_slot *memslot;
+ long npages, count;
+ void __user *src;
+ int ret = 0;
+
+ if (!sev_snp_guest(kvm) || !sev->snp_context)
+ return -EINVAL;
+
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
+ return -EFAULT;
+
+ pr_debug("%s: GFN start 0x%llx length 0x%llx type %d flags %d\n", __func__,
+ params.gfn_start, params.len, params.type, params.flags);
+
+ if (!params.len || !PAGE_ALIGNED(params.len) || params.flags ||
+ (params.type != KVM_SEV_SNP_PAGE_TYPE_NORMAL &&
+ params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO &&
+ params.type != KVM_SEV_SNP_PAGE_TYPE_UNMEASURED &&
+ params.type != KVM_SEV_SNP_PAGE_TYPE_SECRETS &&
+ params.type != KVM_SEV_SNP_PAGE_TYPE_CPUID))
+ return -EINVAL;
+
+ npages = params.len / PAGE_SIZE;
+
+ /*
+ * For each GFN that's being prepared as part of the initial guest
+ * state, the following pre-conditions are verified:
+ *
+ * 1) The backing memslot is a valid private memslot.
+ * 2) The GFN has been set to private via KVM_SET_MEMORY_ATTRIBUTES
+ * beforehand.
+ * 3) The PFN of the guest_memfd has not already been set to private
+ * in the RMP table.
+ *
+ * The KVM MMU relies on kvm->mmu_invalidate_seq to retry nested page
+ * faults if there's a race between a fault and an attribute update via
+ * KVM_SET_MEMORY_ATTRIBUTES, and a similar approach could be utilized
+ * here. However, kvm->slots_lock guards against both this as well as
+ * concurrent memslot updates occurring while these checks are being
+ * performed, so use that here to make it easier to reason about the
+ * initial expected state and better guard against unexpected
+ * situations.
+ */
+ mutex_lock(&kvm->slots_lock);
+
+ memslot = gfn_to_memslot(kvm, params.gfn_start);
+ if (!kvm_slot_has_gmem(memslot)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ sev_populate_args.sev_fd = argp->sev_fd;
+ sev_populate_args.type = params.type;
+ src = params.type == KVM_SEV_SNP_PAGE_TYPE_ZERO ? NULL : u64_to_user_ptr(params.uaddr);
+
+ count = kvm_gmem_populate(kvm, params.gfn_start, src, npages,
+ sev_gmem_post_populate, &sev_populate_args);
+ if (count < 0) {
+ argp->error = sev_populate_args.fw_error;
+ pr_debug("%s: kvm_gmem_populate failed, ret %ld (fw_error %d)\n",
+ __func__, count, argp->error);
+ ret = -EIO;
+ } else {
+ params.gfn_start += count;
+ params.len -= count * PAGE_SIZE;
+ if (params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO)
+ params.uaddr += count * PAGE_SIZE;
+
+ ret = 0;
+ if (copy_to_user(u64_to_user_ptr(argp->data), &params, sizeof(params)))
+ ret = -EFAULT;
+ }
+
+out:
+ mutex_unlock(&kvm->slots_lock);
+
+ return ret;
+}
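+
+/*
+ * Illustrative userspace sketch (not part of this file): SNP launch data is
+ * supplied in page-sized chunks; on partial progress the updated gfn_start,
+ * uaddr and len are copied back so the caller can retry the remainder. The
+ * gfn/buffer values below are assumptions for the example.
+ *
+ *	struct kvm_sev_snp_launch_update upd = {
+ *		.gfn_start = gfn,
+ *		.uaddr     = (__u64)(unsigned long)payload,
+ *		.len       = payload_len,		// page-aligned
+ *		.type      = KVM_SEV_SNP_PAGE_TYPE_NORMAL,
+ *	};
+ *	struct kvm_sev_cmd cmd = {
+ *		.id     = KVM_SEV_SNP_LAUNCH_UPDATE,
+ *		.data   = (__u64)(unsigned long)&upd,
+ *		.sev_fd = sev_fd,
+ *	};
+ *
+ *	ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
+ *	// on error, upd.gfn_start/uaddr/len reflect progress and can be retried
+ */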
+
+static int snp_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ struct sev_data_snp_launch_update data = {};
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+ int ret;
+
+ data.gctx_paddr = __psp_pa(sev->snp_context);
+ data.page_type = SNP_PAGE_TYPE_VMSA;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT;
+
+ ret = sev_es_sync_vmsa(svm);
+ if (ret)
+ return ret;
+
+ /* Transition the VMSA page to a firmware state. */
+ ret = rmp_make_private(pfn, INITIAL_VMSA_GPA, PG_LEVEL_4K, sev->asid, true);
+ if (ret)
+ return ret;
+
+ /* Issue the SNP command to encrypt the VMSA */
+ data.address = __sme_pa(svm->sev_es.vmsa);
+ ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE,
+ &data, &argp->error);
+ if (ret) {
+ snp_page_reclaim(kvm, pfn);
+
+ return ret;
+ }
+
+ svm->vcpu.arch.guest_state_protected = true;
+ /*
+ * SEV-ES (and thus SNP) guests mandate that LBR Virtualization be
+ * _always_ ON. Enable it only after setting
+ * guest_state_protected because KVM_SET_MSRS allows dynamic
+ * toggling of LBRV (for performance reasons) on write access to
+ * MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set.
+ */
+ svm_enable_lbrv(vcpu);
+ }
+
+ return 0;
+}
+
+static int snp_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ struct kvm_sev_snp_launch_finish params;
+ struct sev_data_snp_launch_finish *data;
+ void *id_block = NULL, *id_auth = NULL;
+ int ret;
+
+ if (!sev_snp_guest(kvm))
+ return -ENOTTY;
+
+ if (!sev->snp_context)
+ return -EINVAL;
+
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
+ return -EFAULT;
+
+ if (params.flags)
+ return -EINVAL;
+
+ /* Measure all vCPUs using LAUNCH_UPDATE before finalizing the launch flow. */
+ ret = snp_launch_update_vmsa(kvm, argp);
+ if (ret)
+ return ret;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
+ if (!data)
+ return -ENOMEM;
+
+ if (params.id_block_en) {
+ id_block = psp_copy_user_blob(params.id_block_uaddr, KVM_SEV_SNP_ID_BLOCK_SIZE);
+ if (IS_ERR(id_block)) {
+ ret = PTR_ERR(id_block);
+ goto e_free;
+ }
+
+ data->id_block_en = 1;
+ data->id_block_paddr = __sme_pa(id_block);
+
+ id_auth = psp_copy_user_blob(params.id_auth_uaddr, KVM_SEV_SNP_ID_AUTH_SIZE);
+ if (IS_ERR(id_auth)) {
+ ret = PTR_ERR(id_auth);
+ goto e_free_id_block;
+ }
+
+ data->id_auth_paddr = __sme_pa(id_auth);
+
+ if (params.auth_key_en)
+ data->auth_key_en = 1;
+ }
+
+ data->vcek_disabled = params.vcek_disabled;
+
+ memcpy(data->host_data, params.host_data, KVM_SEV_SNP_FINISH_DATA_SIZE);
+ data->gctx_paddr = __psp_pa(sev->snp_context);
+ ret = sev_issue_cmd(kvm, SEV_CMD_SNP_LAUNCH_FINISH, data, &argp->error);
+
+ /*
+ * Now that there will be no more SNP_LAUNCH_UPDATE ioctls, private pages
+ * can be given to the guest simply by marking the RMP entry as private.
+ * This can happen on first access and also with KVM_PRE_FAULT_MEMORY.
+ */
+ if (!ret)
+ kvm->arch.pre_fault_allowed = true;
+
+ kfree(id_auth);
+
+e_free_id_block:
+ kfree(id_block);
+
+e_free:
+ kfree(data);
+
+ return ret;
+}
+
+int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
+{
+ struct kvm_sev_cmd sev_cmd;
+ int r;
+
+ if (!sev_enabled)
+ return -ENOTTY;
+
+ if (!argp)
+ return 0;
+
+ if (copy_from_user(&sev_cmd, argp, sizeof(struct kvm_sev_cmd)))
+ return -EFAULT;
+
+ mutex_lock(&kvm->lock);
+
+ /*
+ * Some memory-encryption operations can only be issued by the
+ * enc_context_owner.
+ */
+ if (is_mirroring_enc_context(kvm) &&
+ !is_cmd_allowed_from_mirror(sev_cmd.id)) {
+ r = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * Once KVM_SEV_INIT2 initializes a KVM instance as an SNP guest, only
+ * allow the use of SNP-specific commands.
+ */
+ if (sev_snp_guest(kvm) && sev_cmd.id < KVM_SEV_SNP_LAUNCH_START) {
+ r = -EPERM;
+ goto out;
+ }
+
+ switch (sev_cmd.id) {
+ case KVM_SEV_ES_INIT:
+ if (!sev_es_enabled) {
+ r = -ENOTTY;
+ goto out;
+ }
+ fallthrough;
+ case KVM_SEV_INIT:
+ r = sev_guest_init(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_INIT2:
+ r = sev_guest_init2(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_LAUNCH_START:
+ r = sev_launch_start(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_LAUNCH_UPDATE_DATA:
+ r = sev_launch_update_data(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_LAUNCH_UPDATE_VMSA:
+ r = sev_launch_update_vmsa(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_LAUNCH_MEASURE:
+ r = sev_launch_measure(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_LAUNCH_FINISH:
+ r = sev_launch_finish(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_GUEST_STATUS:
+ r = sev_guest_status(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_DBG_DECRYPT:
+ r = sev_dbg_crypt(kvm, &sev_cmd, true);
+ break;
+ case KVM_SEV_DBG_ENCRYPT:
+ r = sev_dbg_crypt(kvm, &sev_cmd, false);
+ break;
+ case KVM_SEV_LAUNCH_SECRET:
+ r = sev_launch_secret(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_GET_ATTESTATION_REPORT:
+ r = sev_get_attestation_report(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SEND_START:
+ r = sev_send_start(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SEND_UPDATE_DATA:
+ r = sev_send_update_data(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SEND_FINISH:
+ r = sev_send_finish(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SEND_CANCEL:
+ r = sev_send_cancel(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_RECEIVE_START:
+ r = sev_receive_start(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_RECEIVE_UPDATE_DATA:
+ r = sev_receive_update_data(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_RECEIVE_FINISH:
+ r = sev_receive_finish(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SNP_LAUNCH_START:
+ r = snp_launch_start(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SNP_LAUNCH_UPDATE:
+ r = snp_launch_update(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SNP_LAUNCH_FINISH:
+ r = snp_launch_finish(kvm, &sev_cmd);
+ break;
+ default:
+ r = -EINVAL;
+ goto out;
+ }
+
+ if (copy_to_user(argp, &sev_cmd, sizeof(struct kvm_sev_cmd)))
+ r = -EFAULT;
+
+out:
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
+int sev_mem_enc_register_region(struct kvm *kvm,
+ struct kvm_enc_region *range)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ struct enc_region *region;
+ int ret = 0;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ /* If kvm is mirroring an encryption context, it isn't responsible for it */
+ if (is_mirroring_enc_context(kvm))
+ return -EINVAL;
+
+ if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
+ return -EINVAL;
+
+ region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT);
+ if (!region)
+ return -ENOMEM;
+
+ mutex_lock(&kvm->lock);
+ region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages,
+ FOLL_WRITE | FOLL_LONGTERM);
+ if (IS_ERR(region->pages)) {
+ ret = PTR_ERR(region->pages);
+ mutex_unlock(&kvm->lock);
+ goto e_free;
+ }
+
+ /*
+ * The guest may change the memory encryption attribute from C=0 -> C=1
+ * or vice versa for this memory range. Make sure caches are flushed so
+ * that guest data gets written into memory with the correct C-bit.
+ * Note, this must be done before dropping kvm->lock, as the region and
+ * its array of pages can be freed by a different task
+ * once kvm->lock is released.
+ */
+ sev_clflush_pages(region->pages, region->npages);
+
+ region->uaddr = range->addr;
+ region->size = range->size;
+
+ list_add_tail(&region->list, &sev->regions_list);
+ mutex_unlock(&kvm->lock);
+
+ return ret;
+
+e_free:
+ kfree(region);
+ return ret;
+}
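+
+/*
+ * Illustrative userspace sketch (not part of this file): for classic SEV,
+ * guest RAM is registered (and thereby pinned) up front with the
+ * KVM_MEMORY_ENCRYPT_REG_REGION ioctl. The hva/size values are assumptions
+ * for the example.
+ *
+ *	struct kvm_enc_region region = {
+ *		.addr = (__u64)(unsigned long)guest_ram_hva,
+ *		.size = guest_ram_size,
+ *	};
+ *
+ *	if (ioctl(vm_fd, KVM_MEMORY_ENCRYPT_REG_REGION, &region))
+ *		perror("KVM_MEMORY_ENCRYPT_REG_REGION");
+ */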
+
+static struct enc_region *
+find_enc_region(struct kvm *kvm, struct kvm_enc_region *range)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ struct list_head *head = &sev->regions_list;
+ struct enc_region *i;
+
+ list_for_each_entry(i, head, list) {
+ if (i->uaddr == range->addr &&
+ i->size == range->size)
+ return i;
+ }
+
+ return NULL;
+}
+
+static void __unregister_enc_region_locked(struct kvm *kvm,
+ struct enc_region *region)
+{
+ sev_unpin_memory(kvm, region->pages, region->npages);
+ list_del(&region->list);
+ kfree(region);
+}
+
+int sev_mem_enc_unregister_region(struct kvm *kvm,
+ struct kvm_enc_region *range)
+{
+ struct enc_region *region;
+ int ret;
+
+ /* If kvm is mirroring an encryption context, it isn't responsible for it */
+ if (is_mirroring_enc_context(kvm))
+ return -EINVAL;
+
+ mutex_lock(&kvm->lock);
+
+ if (!sev_guest(kvm)) {
+ ret = -ENOTTY;
+ goto failed;
+ }
+
+ region = find_enc_region(kvm, range);
+ if (!region) {
+ ret = -EINVAL;
+ goto failed;
+ }
+
+ sev_writeback_caches(kvm);
+
+ __unregister_enc_region_locked(kvm, region);
+
+ mutex_unlock(&kvm->lock);
+ return 0;
+
+failed:
+ mutex_unlock(&kvm->lock);
+ return ret;
+}
+
+int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
+{
+ CLASS(fd, f)(source_fd);
+ struct kvm *source_kvm;
+ struct kvm_sev_info *source_sev, *mirror_sev;
+ int ret;
+
+ if (fd_empty(f))
+ return -EBADF;
+
+ if (!file_is_kvm(fd_file(f)))
+ return -EBADF;
+
+ source_kvm = fd_file(f)->private_data;
+ ret = sev_lock_two_vms(kvm, source_kvm);
+ if (ret)
+ return ret;
+
+ /*
+ * Mirrors of mirrors should work, but let's not get silly. Also
+ * disallow out-of-band SEV/SEV-ES init if the target is already an
+ * SEV guest, or if vCPUs have been created. KVM relies on vCPUs being
+ * created after SEV/SEV-ES initialization, e.g. to init intercepts.
+ */
+ if (sev_guest(kvm) || !sev_guest(source_kvm) ||
+ is_mirroring_enc_context(source_kvm) || kvm->created_vcpus) {
+ ret = -EINVAL;
+ goto e_unlock;
+ }
+
+ mirror_sev = to_kvm_sev_info(kvm);
+ if (!zalloc_cpumask_var(&mirror_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
+ ret = -ENOMEM;
+ goto e_unlock;
+ }
+
+ /*
+ * The mirror kvm holds an enc_context_owner ref so its asid can't
+ * disappear until we're done with it
+ */
+ source_sev = to_kvm_sev_info(source_kvm);
+ kvm_get_kvm(source_kvm);
+ list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms);
+
+ /* Set enc_context_owner and copy its encryption context over */
+ mirror_sev->enc_context_owner = source_kvm;
+ mirror_sev->active = true;
+ mirror_sev->asid = source_sev->asid;
+ mirror_sev->fd = source_sev->fd;
+ mirror_sev->es_active = source_sev->es_active;
+ mirror_sev->need_init = false;
+ mirror_sev->handle = source_sev->handle;
+ INIT_LIST_HEAD(&mirror_sev->regions_list);
+ INIT_LIST_HEAD(&mirror_sev->mirror_vms);
+ ret = 0;
+
+ /*
+ * Do not copy ap_jump_table: the mirror does not share the same KVM
+ * context as the original, and the two may have different memory views.
+ */
+
+e_unlock:
+ sev_unlock_two_vms(kvm, source_kvm);
+ return ret;
+}
+
+static int snp_decommission_context(struct kvm *kvm)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ struct sev_data_snp_addr data = {};
+ int ret;
+
+ /* If context is not created then do nothing */
+ if (!sev->snp_context)
+ return 0;
+
+ /* Do the decommission, which will unbind the ASID from the SNP context */
+ data.address = __sme_pa(sev->snp_context);
+ down_write(&sev_deactivate_lock);
+ ret = sev_do_cmd(SEV_CMD_SNP_DECOMMISSION, &data, NULL);
+ up_write(&sev_deactivate_lock);
+
+ if (WARN_ONCE(ret, "Failed to release guest context, ret %d", ret))
+ return ret;
+
+ snp_free_firmware_page(sev->snp_context);
+ sev->snp_context = NULL;
+
+ return 0;
+}
+
+void sev_vm_destroy(struct kvm *kvm)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ struct list_head *head = &sev->regions_list;
+ struct list_head *pos, *q;
+
+ if (!sev_guest(kvm))
+ return;
+
+ WARN_ON(!list_empty(&sev->mirror_vms));
+
+ free_cpumask_var(sev->have_run_cpus);
+
+ /*
+ * If this is a mirror VM, remove it from the owner's list of mirrors
+ * and skip ASID cleanup (the ASID is tied to the lifetime of the owner).
+ * Note, mirror VMs don't support registering encrypted regions.
+ */
+ if (is_mirroring_enc_context(kvm)) {
+ struct kvm *owner_kvm = sev->enc_context_owner;
+
+ mutex_lock(&owner_kvm->lock);
+ list_del(&sev->mirror_entry);
+ mutex_unlock(&owner_kvm->lock);
+ kvm_put_kvm(owner_kvm);
+ return;
+ }
+
+
+ /*
+ * If userspace was terminated before unregistering the memory regions,
+ * unpin all the registered memory.
+ */
+ if (!list_empty(head)) {
+ list_for_each_safe(pos, q, head) {
+ __unregister_enc_region_locked(kvm,
+ list_entry(pos, struct enc_region, list));
+ cond_resched();
+ }
+ }
+
+ if (sev_snp_guest(kvm)) {
+ snp_guest_req_cleanup(kvm);
+
+ /*
+ * Decommission handles unbinding of the ASID. If it fails for
+ * some unexpected reason, just leak the ASID.
+ */
+ if (snp_decommission_context(kvm))
+ return;
+ } else {
+ sev_unbind_asid(kvm, sev->handle);
+ }
+
+ sev_asid_free(sev);
+}
+
+void __init sev_set_cpu_caps(void)
+{
+ if (sev_enabled) {
+ kvm_cpu_cap_set(X86_FEATURE_SEV);
+ kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_VM);
+ }
+ if (sev_es_enabled) {
+ kvm_cpu_cap_set(X86_FEATURE_SEV_ES);
+ kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_ES_VM);
+ }
+ if (sev_snp_enabled) {
+ kvm_cpu_cap_set(X86_FEATURE_SEV_SNP);
+ kvm_caps.supported_vm_types |= BIT(KVM_X86_SNP_VM);
+ }
+}
+
+static bool is_sev_snp_initialized(void)
+{
+ struct sev_user_data_snp_status *status;
+ struct sev_data_snp_addr buf;
+ bool initialized = false;
+ int ret, error = 0;
+
+ status = snp_alloc_firmware_page(GFP_KERNEL | __GFP_ZERO);
+ if (!status)
+ return false;
+
+ buf.address = __psp_pa(status);
+ ret = sev_do_cmd(SEV_CMD_SNP_PLATFORM_STATUS, &buf, &error);
+ if (ret) {
+ pr_err("SEV: SNP_PLATFORM_STATUS failed ret=%d, fw_error=%d (%#x)\n",
+ ret, error, error);
+ goto out;
+ }
+
+ initialized = !!status->state;
+
+out:
+ snp_free_firmware_page(status);
+
+ return initialized;
+}
+
+void __init sev_hardware_setup(void)
+{
+ unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count;
+ struct sev_platform_init_args init_args = {0};
+ bool sev_snp_supported = false;
+ bool sev_es_supported = false;
+ bool sev_supported = false;
+
+ if (!sev_enabled || !npt_enabled || !nrips)
+ goto out;
+
+ /*
+ * SEV must obviously be supported in hardware. Sanity check that the
+ * CPU supports decode assists, which is mandatory for SEV guests to
+ * support instruction emulation. Ditto for flushing by ASID, as SEV
+ * guests are bound to a single ASID, i.e. KVM can't rotate to a new
+ * ASID to effect a TLB flush.
+ */
+ if (!boot_cpu_has(X86_FEATURE_SEV) ||
+ WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_DECODEASSISTS)) ||
+ WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_FLUSHBYASID)))
+ goto out;
+
+ /*
+ * The kernel's initcall infrastructure lacks the ability to express
+ * dependencies between initcalls, whereas the modules infrastructure
+ * automatically handles dependencies via symbol loading. Ensure the
+ * PSP SEV driver is initialized before proceeding if KVM is built-in,
+ * as the dependency isn't handled by the initcall infrastructure.
+ */
+ if (IS_BUILTIN(CONFIG_KVM_AMD) && sev_module_init())
+ goto out;
+
+ /* Retrieve SEV CPUID information */
+ cpuid(0x8000001f, &eax, &ebx, &ecx, &edx);
+
+ /* Set encryption bit location for SEV-ES guests */
+ sev_enc_bit = ebx & 0x3f;
+
+ /* Maximum number of encrypted guests supported simultaneously */
+ max_sev_asid = ecx;
+ if (!max_sev_asid)
+ goto out;
+
+ /* Minimum ASID value that should be used for SEV guest */
+ min_sev_asid = edx;
+ sev_me_mask = 1UL << (ebx & 0x3f);
+
+ /*
+ * Initialize SEV ASID bitmaps. Allocate space for ASID 0 in the bitmap,
+ * even though it's never used, so that the bitmap is indexed by the
+ * actual ASID.
+ */
+ nr_asids = max_sev_asid + 1;
+ sev_asid_bitmap = bitmap_zalloc(nr_asids, GFP_KERNEL);
+ if (!sev_asid_bitmap)
+ goto out;
+
+ sev_reclaim_asid_bitmap = bitmap_zalloc(nr_asids, GFP_KERNEL);
+ if (!sev_reclaim_asid_bitmap) {
+ bitmap_free(sev_asid_bitmap);
+ sev_asid_bitmap = NULL;
+ goto out;
+ }
+
+ if (min_sev_asid <= max_sev_asid) {
+ sev_asid_count = max_sev_asid - min_sev_asid + 1;
+ WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count));
+ }
+ sev_supported = true;
+
+ /* SEV-ES support requested? */
+ if (!sev_es_enabled)
+ goto out;
+
+ /*
+ * SEV-ES requires MMIO caching as KVM doesn't have access to the guest
+ * instruction stream, i.e. can't emulate in response to a #NPF and
+ * instead relies on #NPF(RSVD) being reflected into the guest as #VC
+ * (the guest can then do a #VMGEXIT to request MMIO emulation).
+ */
+ if (!enable_mmio_caching)
+ goto out;
+
+ /* Does the CPU support SEV-ES? */
+ if (!boot_cpu_has(X86_FEATURE_SEV_ES))
+ goto out;
+
+ if (!lbrv) {
+ WARN_ONCE(!boot_cpu_has(X86_FEATURE_LBRV),
+ "LBRV must be present for SEV-ES support");
+ goto out;
+ }
+
+ /* Has the system been allocated ASIDs for SEV-ES? */
+ if (min_sev_asid == 1)
+ goto out;
+
+ min_sev_es_asid = min_snp_asid = 1;
+ max_sev_es_asid = max_snp_asid = min_sev_asid - 1;
+
+ sev_es_asid_count = min_sev_asid - 1;
+ WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV_ES, sev_es_asid_count));
+ sev_es_supported = true;
+ sev_snp_supported = sev_snp_enabled && cc_platform_has(CC_ATTR_HOST_SEV_SNP);
+
+out:
+ if (sev_enabled) {
+ init_args.probe = true;
+
+ if (sev_is_snp_ciphertext_hiding_supported())
+ init_args.max_snp_asid = min(nr_ciphertext_hiding_asids,
+ min_sev_asid - 1);
+
+ if (sev_platform_init(&init_args))
+ sev_supported = sev_es_supported = sev_snp_supported = false;
+ else if (sev_snp_supported)
+ sev_snp_supported = is_sev_snp_initialized();
+
+ if (sev_snp_supported) {
+ snp_supported_policy_bits = sev_get_snp_policy_bits() &
+ KVM_SNP_POLICY_MASK_VALID;
+ nr_ciphertext_hiding_asids = init_args.max_snp_asid;
+ }
+
+ /*
+ * If ciphertext hiding is enabled, the joint SEV-ES/SEV-SNP
+ * ASID range is partitioned into separate SEV-ES and SEV-SNP
+ * ASID ranges, with the SEV-SNP range being [1..max_snp_asid]
+ * and the SEV-ES range being (max_snp_asid..max_sev_es_asid].
+ * Note, SEV-ES may effectively be disabled if all ASIDs from
+ * the joint range are assigned to SEV-SNP.
+ */
+ if (nr_ciphertext_hiding_asids) {
+ max_snp_asid = nr_ciphertext_hiding_asids;
+ min_sev_es_asid = max_snp_asid + 1;
+ pr_info("SEV-SNP ciphertext hiding enabled\n");
+ }
+ }
+
+ if (boot_cpu_has(X86_FEATURE_SEV))
+ pr_info("SEV %s (ASIDs %u - %u)\n",
+ sev_supported ? min_sev_asid <= max_sev_asid ? "enabled" :
+ "unusable" :
+ "disabled",
+ min_sev_asid, max_sev_asid);
+ if (boot_cpu_has(X86_FEATURE_SEV_ES))
+ pr_info("SEV-ES %s (ASIDs %u - %u)\n",
+ sev_es_supported ? min_sev_es_asid <= max_sev_es_asid ? "enabled" :
+ "unusable" :
+ "disabled",
+ min_sev_es_asid, max_sev_es_asid);
+ if (boot_cpu_has(X86_FEATURE_SEV_SNP))
+ pr_info("SEV-SNP %s (ASIDs %u - %u)\n",
+ str_enabled_disabled(sev_snp_supported),
+ min_snp_asid, max_snp_asid);
+
+ sev_enabled = sev_supported;
+ sev_es_enabled = sev_es_supported;
+ sev_snp_enabled = sev_snp_supported;
+
+ if (!sev_es_enabled || !cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) ||
+ !cpu_feature_enabled(X86_FEATURE_NO_NESTED_DATA_BP))
+ sev_es_debug_swap_enabled = false;
+
+ sev_supported_vmsa_features = 0;
+ if (sev_es_debug_swap_enabled)
+ sev_supported_vmsa_features |= SVM_SEV_FEAT_DEBUG_SWAP;
+
+ if (sev_snp_enabled && tsc_khz && cpu_feature_enabled(X86_FEATURE_SNP_SECURE_TSC))
+ sev_supported_vmsa_features |= SVM_SEV_FEAT_SECURE_TSC;
+}
+
+void sev_hardware_unsetup(void)
+{
+ if (!sev_enabled)
+ return;
+
+ /* No need to take sev_bitmap_lock, all VMs have been destroyed. */
+ sev_flush_asids(1, max_sev_asid);
+
+ bitmap_free(sev_asid_bitmap);
+ bitmap_free(sev_reclaim_asid_bitmap);
+
+ misc_cg_set_capacity(MISC_CG_RES_SEV, 0);
+ misc_cg_set_capacity(MISC_CG_RES_SEV_ES, 0);
+
+ sev_platform_shutdown();
+}
+
+int sev_cpu_init(struct svm_cpu_data *sd)
+{
+ if (!sev_enabled)
+ return 0;
+
+ sd->sev_vmcbs = kcalloc(nr_asids, sizeof(void *), GFP_KERNEL);
+ if (!sd->sev_vmcbs)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/*
+ * Pages used by hardware to hold guest encrypted state must be flushed before
+ * returning them to the system.
+ */
+static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va)
+{
+ unsigned int asid = sev_get_asid(vcpu->kvm);
+
+ /*
+ * Note! The address must be a kernel address, as regular page walk
+ * checks are performed by VM_PAGE_FLUSH, i.e. operating on a user
+ * address is non-deterministic and unsafe. This function deliberately
+ * takes a pointer to deter passing in a user address.
+ */
+ unsigned long addr = (unsigned long)va;
+
+ /*
+ * If the CPU enforces cache coherency for encrypted mappings of the
+ * same physical page, use CLFLUSHOPT instead. NOTE: a cache flush is
+ * still needed in order to work properly with DMA devices.
+ */
+ if (boot_cpu_has(X86_FEATURE_SME_COHERENT)) {
+ clflush_cache_range(va, PAGE_SIZE);
+ return;
+ }
+
+ /*
+ * VM Page Flush takes a host virtual address and a guest ASID. Fall
+ * back to full writeback of caches if this faults so as not to make
+ * any problems worse by leaving stale encrypted data in the cache.
+ */
+ if (WARN_ON_ONCE(wrmsrq_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid)))
+ goto do_sev_writeback_caches;
+
+ return;
+
+do_sev_writeback_caches:
+ sev_writeback_caches(vcpu->kvm);
+}
+
+void sev_guest_memory_reclaimed(struct kvm *kvm)
+{
+ /*
+ * With SNP+gmem, private/encrypted memory is unreachable via the
+ * hva-based mmu notifiers, i.e. these events are explicitly scoped to
+ * shared pages, where there's no need to flush caches.
+ */
+ if (!sev_guest(kvm) || sev_snp_guest(kvm))
+ return;
+
+ sev_writeback_caches(kvm);
+}
+
+void sev_free_vcpu(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm;
+
+ if (!sev_es_guest(vcpu->kvm))
+ return;
+
+ svm = to_svm(vcpu);
+
+ /*
+ * If it's an SNP guest, then the VMSA was marked in the RMP table as
+ * a guest-owned page. Transition the page to hypervisor state before
+ * releasing it back to the system.
+ */
+ if (sev_snp_guest(vcpu->kvm)) {
+ u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT;
+
+ if (kvm_rmp_make_shared(vcpu->kvm, pfn, PG_LEVEL_4K))
+ goto skip_vmsa_free;
+ }
+
+ if (vcpu->arch.guest_state_protected)
+ sev_flush_encrypted_page(vcpu, svm->sev_es.vmsa);
+
+ __free_page(virt_to_page(svm->sev_es.vmsa));
+
+skip_vmsa_free:
+ if (svm->sev_es.ghcb_sa_free)
+ kvfree(svm->sev_es.ghcb_sa);
+}
+
+static u64 kvm_get_cached_sw_exit_code(struct vmcb_control_area *control)
+{
+ return (((u64)control->exit_code_hi) << 32) | control->exit_code;
+}
+
+static void dump_ghcb(struct vcpu_svm *svm)
+{
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ unsigned int nbits;
+
+ /* Re-use the dump_invalid_vmcb module parameter */
+ if (!dump_invalid_vmcb) {
+ pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
+ return;
+ }
+
+ nbits = sizeof(svm->sev_es.valid_bitmap) * 8;
+
+ /*
+ * Print KVM's snapshot of the GHCB values that were (unsuccessfully)
+ * used to handle the exit. If the guest has since modified the GHCB
+ * itself, dumping the raw GHCB won't help debug why KVM was unable to
+ * handle the VMGEXIT that KVM observed.
+ */
+ pr_err("GHCB (GPA=%016llx) snapshot:\n", svm->vmcb->control.ghcb_gpa);
+ pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code",
+ kvm_get_cached_sw_exit_code(control), kvm_ghcb_sw_exit_code_is_valid(svm));
+ pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1",
+ control->exit_info_1, kvm_ghcb_sw_exit_info_1_is_valid(svm));
+ pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2",
+ control->exit_info_2, kvm_ghcb_sw_exit_info_2_is_valid(svm));
+ pr_err("%-20s%016llx is_valid: %u\n", "sw_scratch",
+ svm->sev_es.sw_scratch, kvm_ghcb_sw_scratch_is_valid(svm));
+ pr_err("%-20s%*pb\n", "valid_bitmap", nbits, svm->sev_es.valid_bitmap);
+}
+
+static void sev_es_sync_to_ghcb(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct ghcb *ghcb = svm->sev_es.ghcb;
+
+ /*
+ * The GHCB protocol so far allows for the following data
+ * to be returned:
+ * GPRs RAX, RBX, RCX, RDX
+ *
+ * Copy their values, even if they may not have been written during the
+ * VM-Exit. It's the guest's responsibility to not consume random data.
+ */
+ ghcb_set_rax(ghcb, vcpu->arch.regs[VCPU_REGS_RAX]);
+ ghcb_set_rbx(ghcb, vcpu->arch.regs[VCPU_REGS_RBX]);
+ ghcb_set_rcx(ghcb, vcpu->arch.regs[VCPU_REGS_RCX]);
+ ghcb_set_rdx(ghcb, vcpu->arch.regs[VCPU_REGS_RDX]);
+}
+
+static void sev_es_sync_from_ghcb(struct vcpu_svm *svm)
+{
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct ghcb *ghcb = svm->sev_es.ghcb;
+ u64 exit_code;
+
+ /*
+ * The GHCB protocol so far allows for the following data
+ * to be supplied:
+ * GPRs RAX, RBX, RCX, RDX
+ * XCR0
+ * CPL
+ *
+ * VMMCALL allows the guest to provide extra registers. KVM also
+ * expects RSI for hypercalls, so include that, too.
+ *
+ * Copy their values to the appropriate location if supplied.
+ */
+ memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
+
+ BUILD_BUG_ON(sizeof(svm->sev_es.valid_bitmap) != sizeof(ghcb->save.valid_bitmap));
+ memcpy(&svm->sev_es.valid_bitmap, &ghcb->save.valid_bitmap, sizeof(ghcb->save.valid_bitmap));
+
+ vcpu->arch.regs[VCPU_REGS_RAX] = kvm_ghcb_get_rax_if_valid(svm);
+ vcpu->arch.regs[VCPU_REGS_RBX] = kvm_ghcb_get_rbx_if_valid(svm);
+ vcpu->arch.regs[VCPU_REGS_RCX] = kvm_ghcb_get_rcx_if_valid(svm);
+ vcpu->arch.regs[VCPU_REGS_RDX] = kvm_ghcb_get_rdx_if_valid(svm);
+ vcpu->arch.regs[VCPU_REGS_RSI] = kvm_ghcb_get_rsi_if_valid(svm);
+
+ svm->vmcb->save.cpl = kvm_ghcb_get_cpl_if_valid(svm);
+
+ if (kvm_ghcb_xcr0_is_valid(svm))
+ __kvm_set_xcr(vcpu, 0, kvm_ghcb_get_xcr0(svm));
+
+ if (kvm_ghcb_xss_is_valid(svm))
+ __kvm_emulate_msr_write(vcpu, MSR_IA32_XSS, kvm_ghcb_get_xss(svm));
+
+ /* Copy the GHCB exit information into the VMCB fields */
+ exit_code = kvm_ghcb_get_sw_exit_code(svm);
+ control->exit_code = lower_32_bits(exit_code);
+ control->exit_code_hi = upper_32_bits(exit_code);
+ control->exit_info_1 = kvm_ghcb_get_sw_exit_info_1(svm);
+ control->exit_info_2 = kvm_ghcb_get_sw_exit_info_2(svm);
+ svm->sev_es.sw_scratch = kvm_ghcb_get_sw_scratch_if_valid(svm);
+
+ /* Clear the valid entries fields */
+ memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
+}
+
+static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
+{
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ u64 exit_code;
+ u64 reason;
+
+ /*
+ * Retrieve the exit code now even though it may not be marked valid
+ * as it could help with debugging.
+ */
+ exit_code = kvm_get_cached_sw_exit_code(control);
+
+ /* Only GHCB Usage code 0 is supported */
+ if (svm->sev_es.ghcb->ghcb_usage) {
+ reason = GHCB_ERR_INVALID_USAGE;
+ goto vmgexit_err;
+ }
+
+ reason = GHCB_ERR_MISSING_INPUT;
+
+ if (!kvm_ghcb_sw_exit_code_is_valid(svm) ||
+ !kvm_ghcb_sw_exit_info_1_is_valid(svm) ||
+ !kvm_ghcb_sw_exit_info_2_is_valid(svm))
+ goto vmgexit_err;
+
+ switch (exit_code) {
+ case SVM_EXIT_READ_DR7:
+ break;
+ case SVM_EXIT_WRITE_DR7:
+ if (!kvm_ghcb_rax_is_valid(svm))
+ goto vmgexit_err;
+ break;
+ case SVM_EXIT_RDTSC:
+ break;
+ case SVM_EXIT_RDPMC:
+ if (!kvm_ghcb_rcx_is_valid(svm))
+ goto vmgexit_err;
+ break;
+ case SVM_EXIT_CPUID:
+ if (!kvm_ghcb_rax_is_valid(svm) ||
+ !kvm_ghcb_rcx_is_valid(svm))
+ goto vmgexit_err;
+ if (vcpu->arch.regs[VCPU_REGS_RAX] == 0xd)
+ if (!kvm_ghcb_xcr0_is_valid(svm))
+ goto vmgexit_err;
+ break;
+ case SVM_EXIT_INVD:
+ break;
+ case SVM_EXIT_IOIO:
+ if (control->exit_info_1 & SVM_IOIO_STR_MASK) {
+ if (!kvm_ghcb_sw_scratch_is_valid(svm))
+ goto vmgexit_err;
+ } else {
+ if (!(control->exit_info_1 & SVM_IOIO_TYPE_MASK))
+ if (!kvm_ghcb_rax_is_valid(svm))
+ goto vmgexit_err;
+ }
+ break;
+ case SVM_EXIT_MSR:
+ if (!kvm_ghcb_rcx_is_valid(svm))
+ goto vmgexit_err;
+ if (control->exit_info_1) {
+ if (!kvm_ghcb_rax_is_valid(svm) ||
+ !kvm_ghcb_rdx_is_valid(svm))
+ goto vmgexit_err;
+ }
+ break;
+ case SVM_EXIT_VMMCALL:
+ if (!kvm_ghcb_rax_is_valid(svm) ||
+ !kvm_ghcb_cpl_is_valid(svm))
+ goto vmgexit_err;
+ break;
+ case SVM_EXIT_RDTSCP:
+ break;
+ case SVM_EXIT_WBINVD:
+ break;
+ case SVM_EXIT_MONITOR:
+ if (!kvm_ghcb_rax_is_valid(svm) ||
+ !kvm_ghcb_rcx_is_valid(svm) ||
+ !kvm_ghcb_rdx_is_valid(svm))
+ goto vmgexit_err;
+ break;
+ case SVM_EXIT_MWAIT:
+ if (!kvm_ghcb_rax_is_valid(svm) ||
+ !kvm_ghcb_rcx_is_valid(svm))
+ goto vmgexit_err;
+ break;
+ case SVM_VMGEXIT_MMIO_READ:
+ case SVM_VMGEXIT_MMIO_WRITE:
+ if (!kvm_ghcb_sw_scratch_is_valid(svm))
+ goto vmgexit_err;
+ break;
+ case SVM_VMGEXIT_AP_CREATION:
+ if (!sev_snp_guest(vcpu->kvm))
+ goto vmgexit_err;
+ if (lower_32_bits(control->exit_info_1) != SVM_VMGEXIT_AP_DESTROY)
+ if (!kvm_ghcb_rax_is_valid(svm))
+ goto vmgexit_err;
+ break;
+ case SVM_VMGEXIT_NMI_COMPLETE:
+ case SVM_VMGEXIT_AP_HLT_LOOP:
+ case SVM_VMGEXIT_AP_JUMP_TABLE:
+ case SVM_VMGEXIT_UNSUPPORTED_EVENT:
+ case SVM_VMGEXIT_HV_FEATURES:
+ case SVM_VMGEXIT_TERM_REQUEST:
+ break;
+ case SVM_VMGEXIT_PSC:
+ if (!sev_snp_guest(vcpu->kvm) || !kvm_ghcb_sw_scratch_is_valid(svm))
+ goto vmgexit_err;
+ break;
+ case SVM_VMGEXIT_GUEST_REQUEST:
+ case SVM_VMGEXIT_EXT_GUEST_REQUEST:
+ if (!sev_snp_guest(vcpu->kvm) ||
+ !PAGE_ALIGNED(control->exit_info_1) ||
+ !PAGE_ALIGNED(control->exit_info_2) ||
+ control->exit_info_1 == control->exit_info_2)
+ goto vmgexit_err;
+ break;
+ default:
+ reason = GHCB_ERR_INVALID_EVENT;
+ goto vmgexit_err;
+ }
+
+ return 0;
+
+vmgexit_err:
+ if (reason == GHCB_ERR_INVALID_USAGE) {
+ vcpu_unimpl(vcpu, "vmgexit: ghcb usage %#x is not valid\n",
+ svm->sev_es.ghcb->ghcb_usage);
+ } else if (reason == GHCB_ERR_INVALID_EVENT) {
+ vcpu_unimpl(vcpu, "vmgexit: exit code %#llx is not valid\n",
+ exit_code);
+ } else {
+ vcpu_unimpl(vcpu, "vmgexit: exit code %#llx input is not valid\n",
+ exit_code);
+ dump_ghcb(svm);
+ }
+
+ svm_vmgexit_bad_input(svm, reason);
+
+ /* Resume the guest to "return" the error code. */
+ return 1;
+}
+
+void sev_es_unmap_ghcb(struct vcpu_svm *svm)
+{
+ /* Clear any indication that the vCPU is in a type of AP Reset Hold */
+ svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NONE;
+
+ if (!svm->sev_es.ghcb)
+ return;
+
+ if (svm->sev_es.ghcb_sa_free) {
+ /*
+ * The scratch area lives outside the GHCB, so there is a
+ * buffer that, depending on the operation performed, may
+ * need to be synced, then freed.
+ */
+ if (svm->sev_es.ghcb_sa_sync) {
+ kvm_write_guest(svm->vcpu.kvm,
+ svm->sev_es.sw_scratch,
+ svm->sev_es.ghcb_sa,
+ svm->sev_es.ghcb_sa_len);
+ svm->sev_es.ghcb_sa_sync = false;
+ }
+
+ kvfree(svm->sev_es.ghcb_sa);
+ svm->sev_es.ghcb_sa = NULL;
+ svm->sev_es.ghcb_sa_free = false;
+ }
+
+ trace_kvm_vmgexit_exit(svm->vcpu.vcpu_id, svm->sev_es.ghcb);
+
+ sev_es_sync_to_ghcb(svm);
+
+ kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map);
+ svm->sev_es.ghcb = NULL;
+}
+
+int pre_sev_run(struct vcpu_svm *svm, int cpu)
+{
+ struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
+ struct kvm *kvm = svm->vcpu.kvm;
+ unsigned int asid = sev_get_asid(kvm);
+
+ /*
+ * Reject KVM_RUN if userspace attempts to run the vCPU with an invalid
+ * VMSA, e.g. if userspace forces the vCPU to be RUNNABLE after an SNP
+ * AP Destroy event.
+ */
+ if (sev_es_guest(kvm) && !VALID_PAGE(svm->vmcb->control.vmsa_pa))
+ return -EINVAL;
+
+ /*
+ * To optimize cache flushes when memory is reclaimed from an SEV VM,
+ * track physical CPUs that enter the guest for SEV VMs and thus can
+ * have encrypted, dirty data in the cache, and flush caches only for
+ * CPUs that have entered the guest.
+ */
+ if (!cpumask_test_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus))
+ cpumask_set_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus);
+
+	/* Assign the ASID allocated to this SEV guest */
+ svm->asid = asid;
+
+ /*
+ * Flush guest TLB:
+ *
+	 * 1) when a different VMCB for the same ASID is to be run on the same host CPU, or
+	 * 2) this VMCB was executed on a different host CPU in previous VMRUNs.
+ */
+ if (sd->sev_vmcbs[asid] == svm->vmcb &&
+ svm->vcpu.arch.last_vmentry_cpu == cpu)
+ return 0;
+
+ sd->sev_vmcbs[asid] = svm->vmcb;
+ svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
+ vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
+ return 0;
+}
+
+#define GHCB_SCRATCH_AREA_LIMIT (16ULL * PAGE_SIZE)
+static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
+{
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ u64 ghcb_scratch_beg, ghcb_scratch_end;
+ u64 scratch_gpa_beg, scratch_gpa_end;
+ void *scratch_va;
+
+ scratch_gpa_beg = svm->sev_es.sw_scratch;
+ if (!scratch_gpa_beg) {
+ pr_err("vmgexit: scratch gpa not provided\n");
+ goto e_scratch;
+ }
+
+ scratch_gpa_end = scratch_gpa_beg + len;
+ if (scratch_gpa_end < scratch_gpa_beg) {
+ pr_err("vmgexit: scratch length (%#llx) not valid for scratch address (%#llx)\n",
+ len, scratch_gpa_beg);
+ goto e_scratch;
+ }
+
+ if ((scratch_gpa_beg & PAGE_MASK) == control->ghcb_gpa) {
+ /* Scratch area begins within GHCB */
+ ghcb_scratch_beg = control->ghcb_gpa +
+ offsetof(struct ghcb, shared_buffer);
+ ghcb_scratch_end = control->ghcb_gpa +
+ offsetof(struct ghcb, reserved_0xff0);
+
+ /*
+ * If the scratch area begins within the GHCB, it must be
+ * completely contained in the GHCB shared buffer area.
+ */
+ if (scratch_gpa_beg < ghcb_scratch_beg ||
+ scratch_gpa_end > ghcb_scratch_end) {
+ pr_err("vmgexit: scratch area is outside of GHCB shared buffer area (%#llx - %#llx)\n",
+ scratch_gpa_beg, scratch_gpa_end);
+ goto e_scratch;
+ }
+
+ scratch_va = (void *)svm->sev_es.ghcb;
+ scratch_va += (scratch_gpa_beg - control->ghcb_gpa);
+ } else {
+ /*
+		 * The guest memory must be read into a kernel buffer, so
+		 * limit the size.
+ */
+ if (len > GHCB_SCRATCH_AREA_LIMIT) {
+ pr_err("vmgexit: scratch area exceeds KVM limits (%#llx requested, %#llx limit)\n",
+ len, GHCB_SCRATCH_AREA_LIMIT);
+ goto e_scratch;
+ }
+ scratch_va = kvzalloc(len, GFP_KERNEL_ACCOUNT);
+ if (!scratch_va)
+ return -ENOMEM;
+
+ if (kvm_read_guest(svm->vcpu.kvm, scratch_gpa_beg, scratch_va, len)) {
+ /* Unable to copy scratch area from guest */
+ pr_err("vmgexit: kvm_read_guest for scratch area failed\n");
+
+ kvfree(scratch_va);
+ return -EFAULT;
+ }
+
+ /*
+ * The scratch area is outside the GHCB. The operation will
+ * dictate whether the buffer needs to be synced before running
+ * the vCPU next time (i.e. a read was requested so the data
+ * must be written back to the guest memory).
+ */
+ svm->sev_es.ghcb_sa_sync = sync;
+ svm->sev_es.ghcb_sa_free = true;
+ }
+
+ svm->sev_es.ghcb_sa = scratch_va;
+ svm->sev_es.ghcb_sa_len = len;
+
+ return 0;
+
+e_scratch:
+ svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_SCRATCH_AREA);
+
+ return 1;
+}
+
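+/*
+ * Update a field of the GHCB MSR protocol value: clear the bits selected by
+ * @mask at @pos, then install @value into that field.
+ */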
+static void set_ghcb_msr_bits(struct vcpu_svm *svm, u64 value, u64 mask,
+ unsigned int pos)
+{
+ svm->vmcb->control.ghcb_gpa &= ~(mask << pos);
+ svm->vmcb->control.ghcb_gpa |= (value & mask) << pos;
+}
+
+static u64 get_ghcb_msr_bits(struct vcpu_svm *svm, u64 mask, unsigned int pos)
+{
+ return (svm->vmcb->control.ghcb_gpa >> pos) & mask;
+}
+
+static void set_ghcb_msr(struct vcpu_svm *svm, u64 value)
+{
+ svm->vmcb->control.ghcb_gpa = value;
+}
+
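+/* Split the 2M RMP entry covering @pfn into individual 4K RMP entries. */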
+static int snp_rmptable_psmash(kvm_pfn_t pfn)
+{
+ int ret;
+
+ pfn = pfn & ~(KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) - 1);
+
+ /*
+ * PSMASH_FAIL_INUSE indicates another processor is modifying the
+ * entry, so retry until that's no longer the case.
+ */
+ do {
+ ret = psmash(pfn);
+ } while (ret == PSMASH_FAIL_INUSE);
+
+ return ret;
+}
+
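+/*
+ * Completion callback for an MSR-protocol Page State Change: report success
+ * or failure to the guest via the GHCB MSR and resume the guest.
+ */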
+static int snp_complete_psc_msr(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (vcpu->run->hypercall.ret)
+ set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR);
+ else
+ set_ghcb_msr(svm, GHCB_MSR_PSC_RESP);
+
+ return 1; /* resume guest */
+}
+
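+/*
+ * Handle a Page State Change request issued via the GHCB MSR protocol by
+ * forwarding it to userspace as a KVM_HC_MAP_GPA_RANGE hypercall.
+ */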
+static int snp_begin_psc_msr(struct vcpu_svm *svm, u64 ghcb_msr)
+{
+ u64 gpa = gfn_to_gpa(GHCB_MSR_PSC_REQ_TO_GFN(ghcb_msr));
+ u8 op = GHCB_MSR_PSC_REQ_TO_OP(ghcb_msr);
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+
+ if (op != SNP_PAGE_STATE_PRIVATE && op != SNP_PAGE_STATE_SHARED) {
+ set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR);
+ return 1; /* resume guest */
+ }
+
+ if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) {
+ set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR);
+ return 1; /* resume guest */
+ }
+
+ vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
+ vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
+ /*
+ * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2)
+ * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that
+	 * it was always zero on KVM_EXIT_HYPERCALL. Since KVM now overwrites
+	 * vcpu->run->hypercall.ret, ensure that it is zero so as not to break QEMU.
+ */
+ vcpu->run->hypercall.ret = 0;
+ vcpu->run->hypercall.args[0] = gpa;
+ vcpu->run->hypercall.args[1] = 1;
+ vcpu->run->hypercall.args[2] = (op == SNP_PAGE_STATE_PRIVATE)
+ ? KVM_MAP_GPA_RANGE_ENCRYPTED
+ : KVM_MAP_GPA_RANGE_DECRYPTED;
+ vcpu->run->hypercall.args[2] |= KVM_MAP_GPA_RANGE_PAGE_SZ_4K;
+
+ vcpu->arch.complete_userspace_io = snp_complete_psc_msr;
+
+ return 0; /* forward request to userspace */
+}
+
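+/*
+ * Guest-provided Page State Change buffer: a header followed by an array of
+ * PSC entries.
+ */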
+struct psc_buffer {
+ struct psc_hdr hdr;
+ struct psc_entry entries[];
+} __packed;
+
+static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc);
+
+static void snp_complete_psc(struct vcpu_svm *svm, u64 psc_ret)
+{
+ svm->sev_es.psc_inflight = 0;
+ svm->sev_es.psc_idx = 0;
+ svm->sev_es.psc_2m = false;
+
+ /*
+ * PSC requests always get a "no action" response in SW_EXITINFO1, with
+ * a PSC-specific return code in SW_EXITINFO2 that provides the "real"
+ * return code. E.g. if the PSC request was interrupted, the need to
+ * retry is communicated via SW_EXITINFO2, not SW_EXITINFO1.
+ */
+ svm_vmgexit_no_action(svm, psc_ret);
+}
+
+static void __snp_complete_one_psc(struct vcpu_svm *svm)
+{
+ struct psc_buffer *psc = svm->sev_es.ghcb_sa;
+ struct psc_entry *entries = psc->entries;
+ struct psc_hdr *hdr = &psc->hdr;
+ __u16 idx;
+
+ /*
+ * Everything in-flight has been processed successfully. Update the
+ * corresponding entries in the guest's PSC buffer and zero out the
+ * count of in-flight PSC entries.
+ */
+ for (idx = svm->sev_es.psc_idx; svm->sev_es.psc_inflight;
+ svm->sev_es.psc_inflight--, idx++) {
+ struct psc_entry *entry = &entries[idx];
+
+ entry->cur_page = entry->pagesize ? 512 : 1;
+ }
+
+ hdr->cur_entry = idx;
+}
+
+static int snp_complete_one_psc(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct psc_buffer *psc = svm->sev_es.ghcb_sa;
+
+ if (vcpu->run->hypercall.ret) {
+ snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC);
+ return 1; /* resume guest */
+ }
+
+ __snp_complete_one_psc(svm);
+
+ /* Handle the next range (if any). */
+ return snp_begin_psc(svm, psc);
+}
+
+static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc)
+{
+ struct psc_entry *entries = psc->entries;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct psc_hdr *hdr = &psc->hdr;
+ struct psc_entry entry_start;
+ u16 idx, idx_start, idx_end;
+ int npages;
+ bool huge;
+ u64 gfn;
+
+ if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) {
+ snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC);
+ return 1;
+ }
+
+next_range:
+ /* There should be no other PSCs in-flight at this point. */
+ if (WARN_ON_ONCE(svm->sev_es.psc_inflight)) {
+ snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC);
+ return 1;
+ }
+
+ /*
+	 * The PSC descriptor buffer can be modified by a misbehaving guest after
+ * validation, so take care to only use validated copies of values used
+ * for things like array indexing.
+ */
+ idx_start = hdr->cur_entry;
+ idx_end = hdr->end_entry;
+
+ if (idx_end >= VMGEXIT_PSC_MAX_COUNT) {
+ snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_HDR);
+ return 1;
+ }
+
+ /* Find the start of the next range which needs processing. */
+ for (idx = idx_start; idx <= idx_end; idx++, hdr->cur_entry++) {
+ entry_start = entries[idx];
+
+ gfn = entry_start.gfn;
+ huge = entry_start.pagesize;
+ npages = huge ? 512 : 1;
+
+ if (entry_start.cur_page > npages || !IS_ALIGNED(gfn, npages)) {
+ snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_ENTRY);
+ return 1;
+ }
+
+ if (entry_start.cur_page) {
+ /*
+ * If this is a partially-completed 2M range, force 4K handling
+ * for the remaining pages since they're effectively split at
+ * this point. Subsequent code should ensure this doesn't get
+ * combined with adjacent PSC entries where 2M handling is still
+ * possible.
+ */
+ npages -= entry_start.cur_page;
+ gfn += entry_start.cur_page;
+ huge = false;
+ }
+
+ if (npages)
+ break;
+ }
+
+ if (idx > idx_end) {
+ /* Nothing more to process. */
+ snp_complete_psc(svm, 0);
+ return 1;
+ }
+
+ svm->sev_es.psc_2m = huge;
+ svm->sev_es.psc_idx = idx;
+ svm->sev_es.psc_inflight = 1;
+
+ /*
+ * Find all subsequent PSC entries that contain adjacent GPA
+ * ranges/operations and can be combined into a single
+ * KVM_HC_MAP_GPA_RANGE exit.
+ */
+ while (++idx <= idx_end) {
+ struct psc_entry entry = entries[idx];
+
+ if (entry.operation != entry_start.operation ||
+ entry.gfn != entry_start.gfn + npages ||
+ entry.cur_page || !!entry.pagesize != huge)
+ break;
+
+ svm->sev_es.psc_inflight++;
+ npages += huge ? 512 : 1;
+ }
+
+ switch (entry_start.operation) {
+ case VMGEXIT_PSC_OP_PRIVATE:
+ case VMGEXIT_PSC_OP_SHARED:
+ vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
+ vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
+ /*
+ * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2)
+ * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that
+	 * it was always zero on KVM_EXIT_HYPERCALL. Since KVM now overwrites
+	 * vcpu->run->hypercall.ret, ensure that it is zero so as not to break QEMU.
+ */
+ vcpu->run->hypercall.ret = 0;
+ vcpu->run->hypercall.args[0] = gfn_to_gpa(gfn);
+ vcpu->run->hypercall.args[1] = npages;
+ vcpu->run->hypercall.args[2] = entry_start.operation == VMGEXIT_PSC_OP_PRIVATE
+ ? KVM_MAP_GPA_RANGE_ENCRYPTED
+ : KVM_MAP_GPA_RANGE_DECRYPTED;
+ vcpu->run->hypercall.args[2] |= entry_start.pagesize
+ ? KVM_MAP_GPA_RANGE_PAGE_SZ_2M
+ : KVM_MAP_GPA_RANGE_PAGE_SZ_4K;
+ vcpu->arch.complete_userspace_io = snp_complete_one_psc;
+ return 0; /* forward request to userspace */
+ default:
+ /*
+ * Only shared/private PSC operations are currently supported, so if the
+ * entire range consists of unsupported operations (e.g. SMASH/UNSMASH),
+ * then consider the entire range completed and avoid exiting to
+ * userspace. In theory snp_complete_psc() can always be called directly
+ * at this point to complete the current range and start the next one,
+ * but that could lead to unexpected levels of recursion.
+ */
+ __snp_complete_one_psc(svm);
+ goto next_range;
+ }
+
+ BUG();
+}
+
+/*
+ * Invoked as part of svm_vcpu_reset() processing of an init event.
+ */
+static void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_memory_slot *slot;
+ struct page *page;
+ kvm_pfn_t pfn;
+ gfn_t gfn;
+
+ guard(mutex)(&svm->sev_es.snp_vmsa_mutex);
+
+ if (!svm->sev_es.snp_ap_waiting_for_reset)
+ return;
+
+ svm->sev_es.snp_ap_waiting_for_reset = false;
+
+ /* Mark the vCPU as offline and not runnable */
+ vcpu->arch.pv.pv_unhalted = false;
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_HALTED);
+
+ /* Clear use of the VMSA */
+ svm->vmcb->control.vmsa_pa = INVALID_PAGE;
+
+ /*
+ * When replacing the VMSA during SEV-SNP AP creation,
+ * mark the VMCB dirty so that full state is always reloaded.
+ */
+ vmcb_mark_all_dirty(svm->vmcb);
+
+ if (!VALID_PAGE(svm->sev_es.snp_vmsa_gpa))
+ return;
+
+ gfn = gpa_to_gfn(svm->sev_es.snp_vmsa_gpa);
+ svm->sev_es.snp_vmsa_gpa = INVALID_PAGE;
+
+ slot = gfn_to_memslot(vcpu->kvm, gfn);
+ if (!slot)
+ return;
+
+	 * The new VMSA will be private guest memory, so retrieve the PFN from
+	 * the gmem backend.
+ * PFN from the gmem backend.
+ */
+ if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, &page, NULL))
+ return;
+
+ /*
+ * From this point forward, the VMSA will always be a guest-mapped page
+ * rather than the initial one allocated by KVM in svm->sev_es.vmsa. In
+	 * theory, svm->sev_es.vmsa could be freed and cleaned up here, but
+ * that involves cleanups like flushing caches, which would ideally be
+ * handled during teardown rather than guest boot. Deferring that also
+ * allows the existing logic for SEV-ES VMSAs to be re-used with
+ * minimal SNP-specific changes.
+ */
+ svm->sev_es.snp_has_guest_vmsa = true;
+
+ /* Use the new VMSA */
+ svm->vmcb->control.vmsa_pa = pfn_to_hpa(pfn);
+
+ /* Mark the vCPU as runnable */
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
+
+ /*
+ * gmem pages aren't currently migratable, but if this ever changes
+ * then care should be taken to ensure svm->sev_es.vmsa is pinned
+ * through some other means.
+ */
+ kvm_release_page_clean(page);
+}
+
+static int sev_snp_ap_creation(struct vcpu_svm *svm)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm);
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct kvm_vcpu *target_vcpu;
+ struct vcpu_svm *target_svm;
+ unsigned int request;
+ unsigned int apic_id;
+
+ request = lower_32_bits(svm->vmcb->control.exit_info_1);
+ apic_id = upper_32_bits(svm->vmcb->control.exit_info_1);
+
+ /* Validate the APIC ID */
+ target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, apic_id);
+ if (!target_vcpu) {
+ vcpu_unimpl(vcpu, "vmgexit: invalid AP APIC ID [%#x] from guest\n",
+ apic_id);
+ return -EINVAL;
+ }
+
+ target_svm = to_svm(target_vcpu);
+
+ guard(mutex)(&target_svm->sev_es.snp_vmsa_mutex);
+
+ switch (request) {
+ case SVM_VMGEXIT_AP_CREATE_ON_INIT:
+ case SVM_VMGEXIT_AP_CREATE:
+ if (vcpu->arch.regs[VCPU_REGS_RAX] != sev->vmsa_features) {
+ vcpu_unimpl(vcpu, "vmgexit: mismatched AP sev_features [%#lx] != [%#llx] from guest\n",
+ vcpu->arch.regs[VCPU_REGS_RAX], sev->vmsa_features);
+ return -EINVAL;
+ }
+
+ if (!page_address_valid(vcpu, svm->vmcb->control.exit_info_2)) {
+ vcpu_unimpl(vcpu, "vmgexit: invalid AP VMSA address [%#llx] from guest\n",
+ svm->vmcb->control.exit_info_2);
+ return -EINVAL;
+ }
+
+ /*
+		 * A malicious guest can RMPADJUST a large page into a VMSA, which
+		 * hits the SNP erratum where the CPU incorrectly signals an RMP
+		 * violation #PF if a hugepage collides with the RMP entry of the
+		 * VMSA page. Reject the AP CREATE request if the VMSA address
+		 * provided by the guest is 2M-aligned.
+ */
+ if (IS_ALIGNED(svm->vmcb->control.exit_info_2, PMD_SIZE)) {
+ vcpu_unimpl(vcpu,
+ "vmgexit: AP VMSA address [%llx] from guest is unsafe as it is 2M aligned\n",
+ svm->vmcb->control.exit_info_2);
+ return -EINVAL;
+ }
+
+ target_svm->sev_es.snp_vmsa_gpa = svm->vmcb->control.exit_info_2;
+ break;
+ case SVM_VMGEXIT_AP_DESTROY:
+ target_svm->sev_es.snp_vmsa_gpa = INVALID_PAGE;
+ break;
+ default:
+ vcpu_unimpl(vcpu, "vmgexit: invalid AP creation request [%#x] from guest\n",
+ request);
+ return -EINVAL;
+ }
+
+ target_svm->sev_es.snp_ap_waiting_for_reset = true;
+
+ /*
+ * Unless Creation is deferred until INIT, signal the vCPU to update
+ * its state.
+ */
+ if (request != SVM_VMGEXIT_AP_CREATE_ON_INIT)
+ kvm_make_request_and_kick(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu);
+
+ return 0;
+}
+
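+/*
+ * Handle an SNP Guest Request: copy the request from guest memory, hand it to
+ * the PSP firmware, and copy the response back to the guest, propagating any
+ * firmware error code to the guest via the GHCB.
+ */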
+static int snp_handle_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa)
+{
+ struct sev_data_snp_guest_request data = {0};
+ struct kvm *kvm = svm->vcpu.kvm;
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ sev_ret_code fw_err = 0;
+ int ret;
+
+ if (!sev_snp_guest(kvm))
+ return -EINVAL;
+
+ mutex_lock(&sev->guest_req_mutex);
+
+ if (kvm_read_guest(kvm, req_gpa, sev->guest_req_buf, PAGE_SIZE)) {
+ ret = -EIO;
+ goto out_unlock;
+ }
+
+ data.gctx_paddr = __psp_pa(sev->snp_context);
+ data.req_paddr = __psp_pa(sev->guest_req_buf);
+ data.res_paddr = __psp_pa(sev->guest_resp_buf);
+
+ /*
+	 * Firmware failures are propagated on to the guest, but any other failure
+ * condition along the way should be reported to userspace. E.g. if
+ * the PSP is dead and commands are timing out.
+ */
+ ret = sev_issue_cmd(kvm, SEV_CMD_SNP_GUEST_REQUEST, &data, &fw_err);
+ if (ret && !fw_err)
+ goto out_unlock;
+
+ if (kvm_write_guest(kvm, resp_gpa, sev->guest_resp_buf, PAGE_SIZE)) {
+ ret = -EIO;
+ goto out_unlock;
+ }
+
+ /* No action is requested *from KVM* if there was a firmware error. */
+ svm_vmgexit_no_action(svm, SNP_GUEST_ERR(0, fw_err));
+
+ ret = 1; /* resume guest */
+
+out_unlock:
+ mutex_unlock(&sev->guest_req_mutex);
+ return ret;
+}
+
+static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa)
+{
+ struct kvm *kvm = svm->vcpu.kvm;
+ u8 msg_type;
+
+ if (!sev_snp_guest(kvm))
+ return -EINVAL;
+
+ if (kvm_read_guest(kvm, req_gpa + offsetof(struct snp_guest_msg_hdr, msg_type),
+ &msg_type, 1))
+ return -EIO;
+
+ /*
+ * As per GHCB spec, requests of type MSG_REPORT_REQ also allow for
+ * additional certificate data to be provided alongside the attestation
+ * report via the guest-provided data pages indicated by RAX/RBX. The
+ * certificate data is optional and requires additional KVM enablement
+ * to provide an interface for userspace to provide it, but KVM still
+ * needs to be able to handle extended guest requests either way. So
+ * provide a stub implementation that will always return an empty
+ * certificate table in the guest-provided data pages.
+ */
+ if (msg_type == SNP_MSG_REPORT_REQ) {
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ u64 data_npages;
+ gpa_t data_gpa;
+
+ if (!kvm_ghcb_rax_is_valid(svm) || !kvm_ghcb_rbx_is_valid(svm))
+ goto request_invalid;
+
+ data_gpa = vcpu->arch.regs[VCPU_REGS_RAX];
+ data_npages = vcpu->arch.regs[VCPU_REGS_RBX];
+
+ if (!PAGE_ALIGNED(data_gpa))
+ goto request_invalid;
+
+ /*
+ * As per GHCB spec (see "SNP Extended Guest Request"), the
+ * certificate table is terminated by 24-bytes of zeroes.
+ */
+ if (data_npages && kvm_clear_guest(kvm, data_gpa, 24))
+ return -EIO;
+ }
+
+ return snp_handle_guest_req(svm, req_gpa, resp_gpa);
+
+request_invalid:
+ svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT);
+ return 1; /* resume guest */
+}
+
+static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
+{
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm);
+ u64 ghcb_info;
+ int ret = 1;
+
+ ghcb_info = control->ghcb_gpa & GHCB_MSR_INFO_MASK;
+
+ trace_kvm_vmgexit_msr_protocol_enter(svm->vcpu.vcpu_id,
+ control->ghcb_gpa);
+
+ switch (ghcb_info) {
+ case GHCB_MSR_SEV_INFO_REQ:
+ set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version,
+ GHCB_VERSION_MIN,
+ sev_enc_bit));
+ break;
+ case GHCB_MSR_CPUID_REQ: {
+ u64 cpuid_fn, cpuid_reg, cpuid_value;
+
+ cpuid_fn = get_ghcb_msr_bits(svm,
+ GHCB_MSR_CPUID_FUNC_MASK,
+ GHCB_MSR_CPUID_FUNC_POS);
+
+ /* Initialize the registers needed by the CPUID intercept */
+ vcpu->arch.regs[VCPU_REGS_RAX] = cpuid_fn;
+ vcpu->arch.regs[VCPU_REGS_RCX] = 0;
+
+ ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_CPUID);
+ if (!ret) {
+ /* Error, keep GHCB MSR value as-is */
+ break;
+ }
+
+ cpuid_reg = get_ghcb_msr_bits(svm,
+ GHCB_MSR_CPUID_REG_MASK,
+ GHCB_MSR_CPUID_REG_POS);
+ if (cpuid_reg == 0)
+ cpuid_value = vcpu->arch.regs[VCPU_REGS_RAX];
+ else if (cpuid_reg == 1)
+ cpuid_value = vcpu->arch.regs[VCPU_REGS_RBX];
+ else if (cpuid_reg == 2)
+ cpuid_value = vcpu->arch.regs[VCPU_REGS_RCX];
+ else
+ cpuid_value = vcpu->arch.regs[VCPU_REGS_RDX];
+
+ set_ghcb_msr_bits(svm, cpuid_value,
+ GHCB_MSR_CPUID_VALUE_MASK,
+ GHCB_MSR_CPUID_VALUE_POS);
+
+ set_ghcb_msr_bits(svm, GHCB_MSR_CPUID_RESP,
+ GHCB_MSR_INFO_MASK,
+ GHCB_MSR_INFO_POS);
+ break;
+ }
+ case GHCB_MSR_AP_RESET_HOLD_REQ:
+ svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_MSR_PROTO;
+ ret = kvm_emulate_ap_reset_hold(&svm->vcpu);
+
+ /*
+ * Preset the result to a non-SIPI return and then only set
+ * the result to non-zero when delivering a SIPI.
+ */
+ set_ghcb_msr_bits(svm, 0,
+ GHCB_MSR_AP_RESET_HOLD_RESULT_MASK,
+ GHCB_MSR_AP_RESET_HOLD_RESULT_POS);
+
+ set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP,
+ GHCB_MSR_INFO_MASK,
+ GHCB_MSR_INFO_POS);
+ break;
+ case GHCB_MSR_HV_FT_REQ:
+ set_ghcb_msr_bits(svm, GHCB_HV_FT_SUPPORTED,
+ GHCB_MSR_HV_FT_MASK, GHCB_MSR_HV_FT_POS);
+ set_ghcb_msr_bits(svm, GHCB_MSR_HV_FT_RESP,
+ GHCB_MSR_INFO_MASK, GHCB_MSR_INFO_POS);
+ break;
+ case GHCB_MSR_PREF_GPA_REQ:
+ if (!sev_snp_guest(vcpu->kvm))
+ goto out_terminate;
+
+ set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_NONE, GHCB_MSR_GPA_VALUE_MASK,
+ GHCB_MSR_GPA_VALUE_POS);
+ set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_RESP, GHCB_MSR_INFO_MASK,
+ GHCB_MSR_INFO_POS);
+ break;
+ case GHCB_MSR_REG_GPA_REQ: {
+ u64 gfn;
+
+ if (!sev_snp_guest(vcpu->kvm))
+ goto out_terminate;
+
+ gfn = get_ghcb_msr_bits(svm, GHCB_MSR_GPA_VALUE_MASK,
+ GHCB_MSR_GPA_VALUE_POS);
+
+ svm->sev_es.ghcb_registered_gpa = gfn_to_gpa(gfn);
+
+ set_ghcb_msr_bits(svm, gfn, GHCB_MSR_GPA_VALUE_MASK,
+ GHCB_MSR_GPA_VALUE_POS);
+ set_ghcb_msr_bits(svm, GHCB_MSR_REG_GPA_RESP, GHCB_MSR_INFO_MASK,
+ GHCB_MSR_INFO_POS);
+ break;
+ }
+ case GHCB_MSR_PSC_REQ:
+ if (!sev_snp_guest(vcpu->kvm))
+ goto out_terminate;
+
+ ret = snp_begin_psc_msr(svm, control->ghcb_gpa);
+ break;
+ case GHCB_MSR_TERM_REQ: {
+ u64 reason_set, reason_code;
+
+ reason_set = get_ghcb_msr_bits(svm,
+ GHCB_MSR_TERM_REASON_SET_MASK,
+ GHCB_MSR_TERM_REASON_SET_POS);
+ reason_code = get_ghcb_msr_bits(svm,
+ GHCB_MSR_TERM_REASON_MASK,
+ GHCB_MSR_TERM_REASON_POS);
+ pr_info("SEV-ES guest requested termination: %#llx:%#llx\n",
+ reason_set, reason_code);
+
+ goto out_terminate;
+ }
+ default:
+ /* Error, keep GHCB MSR value as-is */
+ break;
+ }
+
+ trace_kvm_vmgexit_msr_protocol_exit(svm->vcpu.vcpu_id,
+ control->ghcb_gpa, ret);
+
+ return ret;
+
+out_terminate:
+ vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+ vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM;
+ vcpu->run->system_event.ndata = 1;
+ vcpu->run->system_event.data[0] = control->ghcb_gpa;
+
+ return 0;
+}
+
+int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ u64 ghcb_gpa, exit_code;
+ int ret;
+
+ /* Validate the GHCB */
+ ghcb_gpa = control->ghcb_gpa;
+ if (ghcb_gpa & GHCB_MSR_INFO_MASK)
+ return sev_handle_vmgexit_msr_protocol(svm);
+
+ if (!ghcb_gpa) {
+ vcpu_unimpl(vcpu, "vmgexit: GHCB gpa is not set\n");
+
+ /* Without a GHCB, just return right back to the guest */
+ return 1;
+ }
+
+ if (kvm_vcpu_map(vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->sev_es.ghcb_map)) {
+ /* Unable to map GHCB from guest */
+ vcpu_unimpl(vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n",
+ ghcb_gpa);
+
+ /* Without a GHCB, just return right back to the guest */
+ return 1;
+ }
+
+ svm->sev_es.ghcb = svm->sev_es.ghcb_map.hva;
+
+ trace_kvm_vmgexit_enter(vcpu->vcpu_id, svm->sev_es.ghcb);
+
+ sev_es_sync_from_ghcb(svm);
+
+	/* SEV-SNP guests require the GHCB GPA to be registered */
+ if (sev_snp_guest(svm->vcpu.kvm) && !ghcb_gpa_is_registered(svm, ghcb_gpa)) {
+ vcpu_unimpl(&svm->vcpu, "vmgexit: GHCB GPA [%#llx] is not registered.\n", ghcb_gpa);
+ return -EINVAL;
+ }
+
+ ret = sev_es_validate_vmgexit(svm);
+ if (ret)
+ return ret;
+
+ svm_vmgexit_success(svm, 0);
+
+ exit_code = kvm_get_cached_sw_exit_code(control);
+ switch (exit_code) {
+ case SVM_VMGEXIT_MMIO_READ:
+ ret = setup_vmgexit_scratch(svm, true, control->exit_info_2);
+ if (ret)
+ break;
+
+ ret = kvm_sev_es_mmio_read(vcpu,
+ control->exit_info_1,
+ control->exit_info_2,
+ svm->sev_es.ghcb_sa);
+ break;
+ case SVM_VMGEXIT_MMIO_WRITE:
+ ret = setup_vmgexit_scratch(svm, false, control->exit_info_2);
+ if (ret)
+ break;
+
+ ret = kvm_sev_es_mmio_write(vcpu,
+ control->exit_info_1,
+ control->exit_info_2,
+ svm->sev_es.ghcb_sa);
+ break;
+ case SVM_VMGEXIT_NMI_COMPLETE:
+ ++vcpu->stat.nmi_window_exits;
+ svm->nmi_masked = false;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ ret = 1;
+ break;
+ case SVM_VMGEXIT_AP_HLT_LOOP:
+ svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NAE_EVENT;
+ ret = kvm_emulate_ap_reset_hold(vcpu);
+ break;
+ case SVM_VMGEXIT_AP_JUMP_TABLE: {
+ struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm);
+
+ switch (control->exit_info_1) {
+ case 0:
+ /* Set AP jump table address */
+ sev->ap_jump_table = control->exit_info_2;
+ break;
+ case 1:
+ /* Get AP jump table address */
+ svm_vmgexit_success(svm, sev->ap_jump_table);
+ break;
+ default:
+ pr_err("svm: vmgexit: unsupported AP jump table request - exit_info_1=%#llx\n",
+ control->exit_info_1);
+ svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT);
+ }
+
+ ret = 1;
+ break;
+ }
+ case SVM_VMGEXIT_HV_FEATURES:
+ svm_vmgexit_success(svm, GHCB_HV_FT_SUPPORTED);
+ ret = 1;
+ break;
+ case SVM_VMGEXIT_TERM_REQUEST:
+ pr_info("SEV-ES guest requested termination: reason %#llx info %#llx\n",
+ control->exit_info_1, control->exit_info_2);
+ vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+ vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM;
+ vcpu->run->system_event.ndata = 1;
+ vcpu->run->system_event.data[0] = control->ghcb_gpa;
+ break;
+ case SVM_VMGEXIT_PSC:
+ ret = setup_vmgexit_scratch(svm, true, control->exit_info_2);
+ if (ret)
+ break;
+
+ ret = snp_begin_psc(svm, svm->sev_es.ghcb_sa);
+ break;
+ case SVM_VMGEXIT_AP_CREATION:
+ ret = sev_snp_ap_creation(svm);
+		if (ret)
+			svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT);
+
+ ret = 1;
+ break;
+ case SVM_VMGEXIT_GUEST_REQUEST:
+ ret = snp_handle_guest_req(svm, control->exit_info_1, control->exit_info_2);
+ break;
+ case SVM_VMGEXIT_EXT_GUEST_REQUEST:
+ ret = snp_handle_ext_guest_req(svm, control->exit_info_1, control->exit_info_2);
+ break;
+ case SVM_VMGEXIT_UNSUPPORTED_EVENT:
+ vcpu_unimpl(vcpu,
+ "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n",
+ control->exit_info_1, control->exit_info_2);
+ ret = -EINVAL;
+ break;
+ default:
+ ret = svm_invoke_exit_handler(vcpu, exit_code);
+ }
+
+ return ret;
+}
+
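+/*
+ * Emulate string I/O for an SEV-ES guest using the GHCB scratch area as the
+ * data buffer; the repeat count is taken from exit_info_2.
+ */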
+int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in)
+{
+ int count;
+ int bytes;
+ int r;
+
+ if (svm->vmcb->control.exit_info_2 > INT_MAX)
+ return -EINVAL;
+
+ count = svm->vmcb->control.exit_info_2;
+ if (unlikely(check_mul_overflow(count, size, &bytes)))
+ return -EINVAL;
+
+ r = setup_vmgexit_scratch(svm, in, bytes);
+ if (r)
+ return r;
+
+ return kvm_sev_es_string_io(&svm->vcpu, size, port, svm->sev_es.ghcb_sa,
+ count, in);
+}
+
+void sev_es_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
+{
+ /* Clear intercepts on MSRs that are context switched by hardware. */
+ svm_disable_intercept_for_msr(vcpu, MSR_AMD64_SEV_ES_GHCB, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_EFER, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_IA32_CR_PAT, MSR_TYPE_RW);
+
+ if (boot_cpu_has(X86_FEATURE_V_TSC_AUX))
+ svm_set_intercept_for_msr(vcpu, MSR_TSC_AUX, MSR_TYPE_RW,
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID));
+
+ svm_set_intercept_for_msr(vcpu, MSR_AMD64_GUEST_TSC_FREQ, MSR_TYPE_R,
+ !snp_is_secure_tsc_enabled(vcpu->kvm));
+
+ /*
+ * For SEV-ES, accesses to MSR_IA32_XSS should not be intercepted if
+ * the host/guest supports its use.
+ *
+ * KVM treats the guest as being capable of using XSAVES even if XSAVES
+ * isn't enabled in guest CPUID as there is no intercept for XSAVES,
+ * i.e. the guest can use XSAVES/XRSTOR to read/write XSS if XSAVE is
+ * exposed to the guest and XSAVES is supported in hardware. Condition
+ * full XSS passthrough on the guest being able to use XSAVES *and*
+ * XSAVES being exposed to the guest so that KVM can at least honor
+ * guest CPUID for RDMSR and WRMSR.
+ */
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_XSS, MSR_TYPE_RW,
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) ||
+ !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES));
+}
+
+void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct kvm_cpuid_entry2 *best;
+
+ /* For sev guests, the memory encryption bit is not reserved in CR3. */
+ best = kvm_find_cpuid_entry(vcpu, 0x8000001F);
+ if (best)
+ vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
+}
+
+static void sev_es_init_vmcb(struct vcpu_svm *svm, bool init_event)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm);
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+
+ svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ES_ENABLE;
+
+ /*
+	 * An SEV-ES guest requires a VMSA area that is separate from the
+ * VMCB page. Do not include the encryption mask on the VMSA physical
+ * address since hardware will access it using the guest key. Note,
+ * the VMSA will be NULL if this vCPU is the destination for intrahost
+ * migration, and will be copied later.
+ */
+ if (!svm->sev_es.snp_has_guest_vmsa) {
+ if (svm->sev_es.vmsa)
+ svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa);
+ else
+ svm->vmcb->control.vmsa_pa = INVALID_PAGE;
+ }
+
+ if (cpu_feature_enabled(X86_FEATURE_ALLOWED_SEV_FEATURES))
+ svm->vmcb->control.allowed_sev_features = sev->vmsa_features |
+ VMCB_ALLOWED_SEV_FEATURES_VALID;
+
+ /* Can't intercept CR register access, HV can't modify CR registers */
+ svm_clr_intercept(svm, INTERCEPT_CR0_READ);
+ svm_clr_intercept(svm, INTERCEPT_CR4_READ);
+ svm_clr_intercept(svm, INTERCEPT_CR8_READ);
+ svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
+ svm_clr_intercept(svm, INTERCEPT_CR4_WRITE);
+ svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
+
+ svm_clr_intercept(svm, INTERCEPT_SELECTIVE_CR0);
+
+ /* Track EFER/CR register changes */
+ svm_set_intercept(svm, TRAP_EFER_WRITE);
+ svm_set_intercept(svm, TRAP_CR0_WRITE);
+ svm_set_intercept(svm, TRAP_CR4_WRITE);
+ svm_set_intercept(svm, TRAP_CR8_WRITE);
+
+ vmcb->control.intercepts[INTERCEPT_DR] = 0;
+ if (!sev_vcpu_has_debug_swap(svm)) {
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
+ recalc_intercepts(svm);
+ } else {
+ /*
+ * Disable #DB intercept iff DebugSwap is enabled. KVM doesn't
+ * allow debugging SEV-ES guests, and enables DebugSwap iff
+ * NO_NESTED_DATA_BP is supported, so there's no reason to
+ * intercept #DB when DebugSwap is enabled. For simplicity
+ * with respect to guest debug, intercept #DB for other VMs
+ * even if NO_NESTED_DATA_BP is supported, i.e. even if the
+ * guest can't DoS the CPU with infinite #DB vectoring.
+ */
+ clr_exception_intercept(svm, DB_VECTOR);
+ }
+
+ /* Can't intercept XSETBV, HV can't modify XCR0 directly */
+ svm_clr_intercept(svm, INTERCEPT_XSETBV);
+
+ /*
+ * Set the GHCB MSR value as per the GHCB specification when emulating
+ * vCPU RESET for an SEV-ES guest.
+ */
+ if (!init_event)
+ set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version,
+ GHCB_VERSION_MIN,
+ sev_enc_bit));
+}
+
+void sev_init_vmcb(struct vcpu_svm *svm, bool init_event)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+
+ svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
+ clr_exception_intercept(svm, UD_VECTOR);
+
+ /*
+ * Don't intercept #GP for SEV guests, e.g. for the VMware backdoor, as
+ * KVM can't decrypt guest memory to decode the faulting instruction.
+ */
+ clr_exception_intercept(svm, GP_VECTOR);
+
+ if (init_event && sev_snp_guest(vcpu->kvm))
+ sev_snp_init_protected_guest_state(vcpu);
+
+ if (sev_es_guest(vcpu->kvm))
+ sev_es_init_vmcb(svm, init_event);
+}
+
+int sev_vcpu_create(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct page *vmsa_page;
+
+ mutex_init(&svm->sev_es.snp_vmsa_mutex);
+
+ if (!sev_es_guest(vcpu->kvm))
+ return 0;
+
+ /*
+ * SEV-ES guests require a separate (from the VMCB) VMSA page used to
+ * contain the encrypted register state of the guest.
+ */
+ vmsa_page = snp_safe_alloc_page();
+ if (!vmsa_page)
+ return -ENOMEM;
+
+ svm->sev_es.vmsa = page_address(vmsa_page);
+
+ vcpu->arch.guest_tsc_protected = snp_is_secure_tsc_enabled(vcpu->kvm);
+
+ return 0;
+}
+
+void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa)
+{
+ struct kvm *kvm = svm->vcpu.kvm;
+
+ /*
+ * All host state for SEV-ES guests is categorized into three swap types
+ * based on how it is handled by hardware during a world switch:
+ *
+ * A: VMRUN: Host state saved in host save area
+ * VMEXIT: Host state loaded from host save area
+ *
+ * B: VMRUN: Host state _NOT_ saved in host save area
+ * VMEXIT: Host state loaded from host save area
+ *
+ * C: VMRUN: Host state _NOT_ saved in host save area
+ * VMEXIT: Host state initialized to default(reset) values
+ *
+ * Manually save type-B state, i.e. state that is loaded by VMEXIT but
+ * isn't saved by VMRUN, that isn't already saved by VMSAVE (performed
+ * by common SVM code).
+ */
+ hostsa->xcr0 = kvm_host.xcr0;
+ hostsa->pkru = read_pkru();
+ hostsa->xss = kvm_host.xss;
+
+ /*
+ * If DebugSwap is enabled, debug registers are loaded but NOT saved by
+ * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU does
+ * not save or load debug registers. Sadly, KVM can't prevent SNP
+ * guests from lying about DebugSwap on secondary vCPUs, i.e. the
+ * SEV_FEATURES provided at "AP Create" isn't guaranteed to match what
+ * the guest has actually enabled (or not!) in the VMSA.
+ *
+ * If DebugSwap is *possible*, save the masks so that they're restored
+ * if the guest enables DebugSwap. But for the DRs themselves, do NOT
+ * rely on the CPU to restore the host values; KVM will restore them as
+ * needed in common code, via hw_breakpoint_restore(). Note, KVM does
+ * NOT support virtualizing Breakpoint Extensions, i.e. the mask MSRs
+ * don't need to be restored per se, KVM just needs to ensure they are
+ * loaded with the correct values *if* the CPU writes the MSRs.
+ */
+ if (sev_vcpu_has_debug_swap(svm) ||
+ (sev_snp_guest(kvm) && cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP))) {
+ hostsa->dr0_addr_mask = amd_get_dr_addr_mask(0);
+ hostsa->dr1_addr_mask = amd_get_dr_addr_mask(1);
+ hostsa->dr2_addr_mask = amd_get_dr_addr_mask(2);
+ hostsa->dr3_addr_mask = amd_get_dr_addr_mask(3);
+ }
+
+ /*
+ * TSC_AUX is always virtualized for SEV-ES guests when the feature is
+ * available, i.e. TSC_AUX is loaded on #VMEXIT from the host save area.
+ * Set the save area to the current hardware value, i.e. the current
+ * user return value, so that the correct value is restored on #VMEXIT.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_V_TSC_AUX) &&
+ !WARN_ON_ONCE(tsc_aux_uret_slot < 0))
+ hostsa->tsc_aux = kvm_get_user_return_msr(tsc_aux_uret_slot);
+}
+
+void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /* First SIPI: Use the values as initially set by the VMM */
+ if (!svm->sev_es.received_first_sipi) {
+ svm->sev_es.received_first_sipi = true;
+ return;
+ }
+
+ /* Subsequent SIPI */
+ switch (svm->sev_es.ap_reset_hold_type) {
+ case AP_RESET_HOLD_NAE_EVENT:
+ /*
+ * Return from an AP Reset Hold VMGEXIT, where the guest will
+ * set the CS and RIP. Set SW_EXIT_INFO_2 to a non-zero value.
+ */
+ svm_vmgexit_success(svm, 1);
+ break;
+ case AP_RESET_HOLD_MSR_PROTO:
+ /*
+ * Return from an AP Reset Hold VMGEXIT, where the guest will
+ * set the CS and RIP. Set GHCB data field to a non-zero value.
+ */
+ set_ghcb_msr_bits(svm, 1,
+ GHCB_MSR_AP_RESET_HOLD_RESULT_MASK,
+ GHCB_MSR_AP_RESET_HOLD_RESULT_POS);
+
+ set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP,
+ GHCB_MSR_INFO_MASK,
+ GHCB_MSR_INFO_POS);
+ break;
+ default:
+ break;
+ }
+}
+
+struct page *snp_safe_alloc_page_node(int node, gfp_t gfp)
+{
+ unsigned long pfn;
+ struct page *p;
+
+ if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
+ return alloc_pages_node(node, gfp | __GFP_ZERO, 0);
+
+ /*
+	 * Allocate an SNP-safe page to work around the SNP erratum where
+ * the CPU will incorrectly signal an RMP violation #PF if a
+ * hugepage (2MB or 1GB) collides with the RMP entry of a
+ * 2MB-aligned VMCB, VMSA, or AVIC backing page.
+ *
+ * Allocate one extra page, choose a page which is not
+ * 2MB-aligned, and free the other.
+ */
+ p = alloc_pages_node(node, gfp | __GFP_ZERO, 1);
+ if (!p)
+ return NULL;
+
+ split_page(p, 1);
+
+ pfn = page_to_pfn(p);
+ if (IS_ALIGNED(pfn, PTRS_PER_PMD))
+ __free_page(p++);
+ else
+ __free_page(p + 1);
+
+ return p;
+}
+
+void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code)
+{
+ struct kvm_memory_slot *slot;
+ struct kvm *kvm = vcpu->kvm;
+ int order, rmp_level, ret;
+ struct page *page;
+ bool assigned;
+ kvm_pfn_t pfn;
+ gfn_t gfn;
+
+ gfn = gpa >> PAGE_SHIFT;
+
+ /*
+ * The only time RMP faults occur for shared pages is when the guest is
+ * triggering an RMP fault for an implicit page-state change from
+ * shared->private. Implicit page-state changes are forwarded to
+ * userspace via KVM_EXIT_MEMORY_FAULT events, however, so RMP faults
+ * for shared pages should not end up here.
+ */
+ if (!kvm_mem_is_private(kvm, gfn)) {
+ pr_warn_ratelimited("SEV: Unexpected RMP fault for non-private GPA 0x%llx\n",
+ gpa);
+ return;
+ }
+
+ slot = gfn_to_memslot(kvm, gfn);
+ if (!kvm_slot_has_gmem(slot)) {
+ pr_warn_ratelimited("SEV: Unexpected RMP fault, non-private slot for GPA 0x%llx\n",
+ gpa);
+ return;
+ }
+
+ ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, &page, &order);
+ if (ret) {
+ pr_warn_ratelimited("SEV: Unexpected RMP fault, no backing page for private GPA 0x%llx\n",
+ gpa);
+ return;
+ }
+
+ ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level);
+ if (ret || !assigned) {
+ pr_warn_ratelimited("SEV: Unexpected RMP fault, no assigned RMP entry found for GPA 0x%llx PFN 0x%llx error %d\n",
+ gpa, pfn, ret);
+ goto out_no_trace;
+ }
+
+ /*
+ * There are 2 cases where a PSMASH may be needed to resolve an #NPF
+ * with PFERR_GUEST_RMP_BIT set:
+ *
+ * 1) RMPADJUST/PVALIDATE can trigger an #NPF with PFERR_GUEST_SIZEM
+ * bit set if the guest issues them with a smaller granularity than
+ * what is indicated by the page-size bit in the 2MB RMP entry for
+ * the PFN that backs the GPA.
+ *
+ * 2) Guest access via NPT can trigger an #NPF if the NPT mapping is
+ * smaller than what is indicated by the 2MB RMP entry for the PFN
+ * that backs the GPA.
+ *
+ * In both these cases, the corresponding 2M RMP entry needs to
+ * be PSMASH'd to 512 4K RMP entries. If the RMP entry is already
+ * split into 4K RMP entries, then this is likely a spurious case which
+ * can occur when there are concurrent accesses by the guest to a 2MB
+	 * GPA range that is backed by a 2MB-aligned PFN whose RMP entry is in
+	 * the process of being PSMASH'd into 4K entries. These cases should
+ * resolve automatically on subsequent accesses, so just ignore them
+ * here.
+ */
+ if (rmp_level == PG_LEVEL_4K)
+ goto out;
+
+ ret = snp_rmptable_psmash(pfn);
+ if (ret) {
+ /*
+ * Look it up again. If it's 4K now then the PSMASH may have
+ * raced with another process and the issue has already resolved
+ * itself.
+ */
+ if (!snp_lookup_rmpentry(pfn, &assigned, &rmp_level) &&
+ assigned && rmp_level == PG_LEVEL_4K)
+ goto out;
+
+ pr_warn_ratelimited("SEV: Unable to split RMP entry for GPA 0x%llx PFN 0x%llx ret %d\n",
+ gpa, pfn, ret);
+ }
+
+ kvm_zap_gfn_range(kvm, gfn, gfn + PTRS_PER_PMD);
+out:
+ trace_kvm_rmp_fault(vcpu, gpa, pfn, error_code, rmp_level, ret);
+out_no_trace:
+ kvm_release_page_unused(page);
+}
+
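+/*
+ * Return true if no PFN in [start, end) has an assigned RMP entry, i.e. the
+ * entire range is shared.
+ */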
+static bool is_pfn_range_shared(kvm_pfn_t start, kvm_pfn_t end)
+{
+ kvm_pfn_t pfn = start;
+
+ while (pfn < end) {
+ int ret, rmp_level;
+ bool assigned;
+
+ ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level);
+ if (ret) {
+ pr_warn_ratelimited("SEV: Failed to retrieve RMP entry: PFN 0x%llx GFN start 0x%llx GFN end 0x%llx RMP level %d error %d\n",
+ pfn, start, end, rmp_level, ret);
+ return false;
+ }
+
+ if (assigned) {
+ pr_debug("%s: overlap detected, PFN 0x%llx start 0x%llx end 0x%llx RMP level %d\n",
+ __func__, pfn, start, end, rmp_level);
+ return false;
+ }
+
+ pfn++;
+ }
+
+ return true;
+}
+
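+/* Map a guest_memfd allocation order to the maximum page level it can back. */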
+static u8 max_level_for_order(int order)
+{
+ if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M))
+ return PG_LEVEL_2M;
+
+ return PG_LEVEL_4K;
+}
+
+static bool is_large_rmp_possible(struct kvm *kvm, kvm_pfn_t pfn, int order)
+{
+ kvm_pfn_t pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD);
+
+ /*
+ * If this is a large folio, and the entire 2M range containing the
+ * PFN is currently shared, then the entire 2M-aligned range can be
+ * set to private via a single 2M RMP entry.
+ */
+ if (max_level_for_order(order) > PG_LEVEL_4K &&
+ is_pfn_range_shared(pfn_aligned, pfn_aligned + PTRS_PER_PMD))
+ return true;
+
+ return false;
+}
+
+int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ kvm_pfn_t pfn_aligned;
+ gfn_t gfn_aligned;
+ int level, rc;
+ bool assigned;
+
+ if (!sev_snp_guest(kvm))
+ return 0;
+
+ rc = snp_lookup_rmpentry(pfn, &assigned, &level);
+ if (rc) {
+ pr_err_ratelimited("SEV: Failed to look up RMP entry: GFN %llx PFN %llx error %d\n",
+ gfn, pfn, rc);
+ return -ENOENT;
+ }
+
+ if (assigned) {
+ pr_debug("%s: already assigned: gfn %llx pfn %llx max_order %d level %d\n",
+ __func__, gfn, pfn, max_order, level);
+ return 0;
+ }
+
+ if (is_large_rmp_possible(kvm, pfn, max_order)) {
+ level = PG_LEVEL_2M;
+ pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD);
+ gfn_aligned = ALIGN_DOWN(gfn, PTRS_PER_PMD);
+ } else {
+ level = PG_LEVEL_4K;
+ pfn_aligned = pfn;
+ gfn_aligned = gfn;
+ }
+
+ rc = rmp_make_private(pfn_aligned, gfn_to_gpa(gfn_aligned), level, sev->asid, false);
+ if (rc) {
+ pr_err_ratelimited("SEV: Failed to update RMP entry: GFN %llx PFN %llx level %d error %d\n",
+ gfn, pfn, level, rc);
+ return -EINVAL;
+ }
+
+ pr_debug("%s: updated: gfn %llx pfn %llx pfn_aligned %llx max_order %d level %d\n",
+ __func__, gfn, pfn, pfn_aligned, max_order, level);
+
+ return 0;
+}
+
+void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end)
+{
+ kvm_pfn_t pfn;
+
+ if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
+ return;
+
+ pr_debug("%s: PFN start 0x%llx PFN end 0x%llx\n", __func__, start, end);
+
+ for (pfn = start; pfn < end;) {
+ bool use_2m_update = false;
+ int rc, rmp_level;
+ bool assigned;
+
+ rc = snp_lookup_rmpentry(pfn, &assigned, &rmp_level);
+ if (rc || !assigned)
+ goto next_pfn;
+
+ use_2m_update = IS_ALIGNED(pfn, PTRS_PER_PMD) &&
+ end >= (pfn + PTRS_PER_PMD) &&
+ rmp_level > PG_LEVEL_4K;
+
+ /*
+ * If an unaligned PFN corresponds to a 2M region assigned as a
+ * large page in the RMP table, PSMASH the region into individual
+ * 4K RMP entries before attempting to convert a 4K sub-page.
+ */
+ if (!use_2m_update && rmp_level > PG_LEVEL_4K) {
+ /*
+ * This shouldn't fail, but if it does, report it, but
+ * still try to update RMP entry to shared and pray this
+ * was a spurious error that can be addressed later.
+ */
+ rc = snp_rmptable_psmash(pfn);
+ WARN_ONCE(rc, "SEV: Failed to PSMASH RMP entry for PFN 0x%llx error %d\n",
+ pfn, rc);
+ }
+
+ rc = rmp_make_shared(pfn, use_2m_update ? PG_LEVEL_2M : PG_LEVEL_4K);
+ if (WARN_ONCE(rc, "SEV: Failed to update RMP entry for PFN 0x%llx error %d\n",
+ pfn, rc))
+ goto next_pfn;
+
+ /*
+ * SEV-ES avoids host/guest cache coherency issues through
+ * WBNOINVD hooks issued via MMU notifiers during run-time, and
+ * KVM's VM destroy path at shutdown. Those MMU notifier events
+ * don't cover gmem since there is no requirement to map pages
+ * to a HVA in order to use them for a running guest. While the
+ * shutdown path would still likely cover things for SNP guests,
+ * userspace may also free gmem pages during run-time via
+ * hole-punching operations on the guest_memfd, so flush the
+		 * cache entries for these pages before freeing them back to
+ * the host.
+ */
+ clflush_cache_range(__va(pfn_to_hpa(pfn)),
+ use_2m_update ? PMD_SIZE : PAGE_SIZE);
+next_pfn:
+ pfn += use_2m_update ? PTRS_PER_PMD : 1;
+ cond_resched();
+ }
+}
+
+int sev_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private)
+{
+ int level, rc;
+ bool assigned;
+
+ if (!sev_snp_guest(kvm))
+ return 0;
+
+ rc = snp_lookup_rmpentry(pfn, &assigned, &level);
+ if (rc || !assigned)
+ return PG_LEVEL_4K;
+
+ return level;
+}
+
+struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_save_area *vmsa;
+ struct kvm_sev_info *sev;
+ int error = 0;
+ int ret;
+
+ if (!sev_es_guest(vcpu->kvm))
+ return NULL;
+
+ /*
+ * If the VMSA has not yet been encrypted, return a pointer to the
+ * current un-encrypted VMSA.
+ */
+ if (!vcpu->arch.guest_state_protected)
+ return (struct vmcb_save_area *)svm->sev_es.vmsa;
+
+ sev = to_kvm_sev_info(vcpu->kvm);
+
+ /* Check if the SEV policy allows debugging */
+ if (sev_snp_guest(vcpu->kvm)) {
+ if (!(sev->policy & SNP_POLICY_MASK_DEBUG))
+ return NULL;
+ } else {
+ if (sev->policy & SEV_POLICY_MASK_NODBG)
+ return NULL;
+ }
+
+ if (sev_snp_guest(vcpu->kvm)) {
+ struct sev_data_snp_dbg dbg = {0};
+
+ vmsa = snp_alloc_firmware_page(__GFP_ZERO);
+ if (!vmsa)
+ return NULL;
+
+ dbg.gctx_paddr = __psp_pa(sev->snp_context);
+ dbg.src_addr = svm->vmcb->control.vmsa_pa;
+ dbg.dst_addr = __psp_pa(vmsa);
+
+ ret = sev_do_cmd(SEV_CMD_SNP_DBG_DECRYPT, &dbg, &error);
+
+ /*
+ * Return the target page to a hypervisor page no matter what.
+		 * If this fails, the page can't be safely used, so leak it
+		 * rather than risk reusing it.
+ */
+ if (snp_page_reclaim(vcpu->kvm, PHYS_PFN(__pa(vmsa))))
+ return NULL;
+
+ if (ret) {
+ pr_err("SEV: SNP_DBG_DECRYPT failed ret=%d, fw_error=%d (%#x)\n",
+ ret, error, error);
+ free_page((unsigned long)vmsa);
+
+ return NULL;
+ }
+ } else {
+ struct sev_data_dbg dbg = {0};
+ struct page *vmsa_page;
+
+ vmsa_page = alloc_page(GFP_KERNEL);
+ if (!vmsa_page)
+ return NULL;
+
+ vmsa = page_address(vmsa_page);
+
+ dbg.handle = sev->handle;
+ dbg.src_addr = svm->vmcb->control.vmsa_pa;
+ dbg.dst_addr = __psp_pa(vmsa);
+ dbg.len = PAGE_SIZE;
+
+ ret = sev_do_cmd(SEV_CMD_DBG_DECRYPT, &dbg, &error);
+ if (ret) {
+ pr_err("SEV: SEV_CMD_DBG_DECRYPT failed ret=%d, fw_error=%d (0x%x)\n",
+ ret, error, error);
+ __free_page(vmsa_page);
+
+ return NULL;
+ }
+ }
+
+ return vmsa;
+}
+
+void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa)
+{
+ /* If the VMSA has not yet been encrypted, nothing was allocated */
+ if (!vcpu->arch.guest_state_protected || !vmsa)
+ return;
+
+ free_page((unsigned long)vmsa);
+}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
new file mode 100644
index 000000000000..f56c2d895011
--- /dev/null
+++ b/arch/x86/kvm/svm/svm.c
@@ -0,0 +1,5510 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_host.h>
+
+#include "irq.h"
+#include "mmu.h"
+#include "kvm_cache_regs.h"
+#include "x86.h"
+#include "smm.h"
+#include "cpuid.h"
+#include "pmu.h"
+
+#include <linux/module.h>
+#include <linux/mod_devicetable.h>
+#include <linux/kernel.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+#include <linux/amd-iommu.h>
+#include <linux/sched.h>
+#include <linux/trace_events.h>
+#include <linux/slab.h>
+#include <linux/hashtable.h>
+#include <linux/objtool.h>
+#include <linux/psp-sev.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/rwsem.h>
+#include <linux/cc_platform.h>
+#include <linux/smp.h>
+#include <linux/string_choices.h>
+#include <linux/mutex.h>
+
+#include <asm/apic.h>
+#include <asm/msr.h>
+#include <asm/perf_event.h>
+#include <asm/tlbflush.h>
+#include <asm/desc.h>
+#include <asm/debugreg.h>
+#include <asm/kvm_para.h>
+#include <asm/irq_remapping.h>
+#include <asm/spec-ctrl.h>
+#include <asm/cpu_device_id.h>
+#include <asm/traps.h>
+#include <asm/reboot.h>
+#include <asm/fpu/api.h>
+
+#include <trace/events/ipi.h>
+
+#include "trace.h"
+
+#include "svm.h"
+#include "svm_ops.h"
+
+#include "kvm_onhyperv.h"
+#include "svm_onhyperv.h"
+
+MODULE_AUTHOR("Qumranet");
+MODULE_DESCRIPTION("KVM support for SVM (AMD-V) extensions");
+MODULE_LICENSE("GPL");
+
+#ifdef MODULE
+static const struct x86_cpu_id svm_cpu_id[] = {
+ X86_MATCH_FEATURE(X86_FEATURE_SVM, NULL),
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
+#endif
+
+#define SEG_TYPE_LDT 2
+#define SEG_TYPE_BUSY_TSS16 3
+
+static bool erratum_383_found __read_mostly;
+
+/*
+ * Set osvw_len to a higher value when updated Revision Guides
+ * are published and we know what the new status bits are.
+ */
+static uint64_t osvw_len = 4, osvw_status;
+
+static DEFINE_PER_CPU(u64, current_tsc_ratio);
+
+/*
+ * These 2 parameters are used to config the controls for Pause-Loop Exiting:
+ * pause_filter_count: On processors that support Pause filtering (indicated
+ * by CPUID Fn8000_000A_EDX), the VMCB provides a 16 bit pause filter
+ * count value. On VMRUN this value is loaded into an internal counter.
+ * Each time a pause instruction is executed, this counter is decremented
+ * until it reaches zero at which time a #VMEXIT is generated if pause
+ * intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4 Pause
+ * Intercept Filtering for more details.
+ * This also indicates whether PLE logic is enabled.
+ *
+ * pause_filter_thresh: In addition, some processor families support advanced
+ * pause filtering (indicated by CPUID Fn8000_000A_EDX), an upper bound on
+ * the amount of time a guest is allowed to execute in a pause loop.
+ * In this mode, a 16-bit pause filter threshold field is added in the
+ * VMCB. The threshold value is a cycle count that is used to reset the
+ * pause counter. As with simple pause filtering, VMRUN loads the pause
+ * count value from VMCB into an internal counter. Then, on each pause
+ * instruction the hardware checks the elapsed number of cycles since
+ * the most recent pause instruction against the pause filter threshold.
+ * If the elapsed cycle count is greater than the pause filter threshold,
+ * then the internal pause count is reloaded from the VMCB and execution
+ * continues. If the elapsed cycle count is less than the pause filter
+ * threshold, then the internal pause count is decremented. If the count
+ * value is less than zero and PAUSE intercept is enabled, a #VMEXIT is
+ * triggered. If advanced pause filtering is supported and pause filter
+ * threshold field is set to zero, the filter will operate in the simpler,
+ * count only mode.
+ */
+
+static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
+module_param(pause_filter_thresh, ushort, 0444);
+
+static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
+module_param(pause_filter_count, ushort, 0444);
+
+/* Default doubles per-vcpu window every exit. */
+static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
+module_param(pause_filter_count_grow, ushort, 0444);
+
+/* Default resets per-vcpu window every exit to pause_filter_count. */
+static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
+module_param(pause_filter_count_shrink, ushort, 0444);
+
+/* Default is to compute the maximum so we can never overflow. */
+static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
+module_param(pause_filter_count_max, ushort, 0444);
+
+/*
+ * Use nested page tables by default. Note, NPT may get forced off by
+ * svm_hardware_setup() if it's unsupported by hardware or the host kernel.
+ */
+bool npt_enabled = true;
+module_param_named(npt, npt_enabled, bool, 0444);
+
+/* allow nested virtualization in KVM/SVM */
+static int nested = true;
+module_param(nested, int, 0444);
+
+/* enable/disable Next RIP Save */
+int nrips = true;
+module_param(nrips, int, 0444);
+
+/* enable/disable Virtual VMLOAD VMSAVE */
+static int vls = true;
+module_param(vls, int, 0444);
+
+/* enable/disable Virtual GIF */
+int vgif = true;
+module_param(vgif, int, 0444);
+
+/* enable/disable LBR virtualization */
+int lbrv = true;
+module_param(lbrv, int, 0444);
+
+static int tsc_scaling = true;
+module_param(tsc_scaling, int, 0444);
+
+module_param(enable_device_posted_irqs, bool, 0444);
+
+bool __read_mostly dump_invalid_vmcb;
+module_param(dump_invalid_vmcb, bool, 0644);
+
+bool intercept_smi = true;
+module_param(intercept_smi, bool, 0444);
+
+bool vnmi = true;
+module_param(vnmi, bool, 0444);
+
+static bool svm_gp_erratum_intercept = true;
+
+static u8 rsm_ins_bytes[] = "\x0f\xaa";
+
+static unsigned long iopm_base;
+
+DEFINE_PER_CPU(struct svm_cpu_data, svm_data);
+
+static DEFINE_MUTEX(vmcb_dump_mutex);
+
+/*
+ * Only MSR_TSC_AUX is switched via the user return hook. EFER is switched via
+ * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE.
+ *
+ * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
+ * defer the restoration of TSC_AUX until the CPU returns to userspace.
+ */
+int tsc_aux_uret_slot __ro_after_init = -1;
+
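+/* Number of page table levels to use for NPT, matching the host paging mode. */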
+static int get_npt_level(void)
+{
+#ifdef CONFIG_X86_64
+ return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
+#else
+ return PT32E_ROOT_LEVEL;
+#endif
+}
+
+int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 old_efer = vcpu->arch.efer;
+ vcpu->arch.efer = efer;
+
+ if (!npt_enabled) {
+ /* Shadow paging assumes NX to be available. */
+ efer |= EFER_NX;
+
+ if (!(efer & EFER_LMA))
+ efer &= ~EFER_LME;
+ }
+
+ if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
+ if (!(efer & EFER_SVME)) {
+ svm_leave_nested(vcpu);
+ svm_set_gif(svm, true);
+ /* #GP intercept is still needed for vmware backdoor */
+ if (!enable_vmware_backdoor)
+ clr_exception_intercept(svm, GP_VECTOR);
+
+ /*
+ * Free the nested guest state, unless we are in SMM.
+ * In this case we will return to the nested guest
+ * as soon as we leave SMM.
+ */
+ if (!is_smm(vcpu))
+ svm_free_nested(svm);
+
+ } else {
+ int ret = svm_allocate_nested(svm);
+
+ if (ret) {
+ vcpu->arch.efer = old_efer;
+ return ret;
+ }
+
+ /*
+ * Never intercept #GP for SEV guests, KVM can't
+ * decrypt guest memory to workaround the erratum.
+ */
+ if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm))
+ set_exception_intercept(svm, GP_VECTOR);
+ }
+ }
+
+ svm->vmcb->save.efer = efer | EFER_SVME;
+ vmcb_mark_dirty(svm->vmcb, VMCB_CR);
+ return 0;
+}
+
+static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 ret = 0;
+
+ if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
+ ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
+ return ret;
+}
+
+static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (mask == 0)
+ svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
+ else
+ svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
+
+}
+
+static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
+ int emul_type,
+ bool commit_side_effects)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long old_rflags;
+
+ /*
+ * SEV-ES does not expose the next RIP. The RIP update is controlled by
+ * the type of exit and the #VC handler in the guest.
+ */
+ if (sev_es_guest(vcpu->kvm))
+ goto done;
+
+ if (nrips && svm->vmcb->control.next_rip != 0) {
+ WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
+ svm->next_rip = svm->vmcb->control.next_rip;
+ }
+
+ if (!svm->next_rip) {
+ if (unlikely(!commit_side_effects))
+ old_rflags = svm->vmcb->save.rflags;
+
+ if (!kvm_emulate_instruction(vcpu, emul_type))
+ return 0;
+
+ if (unlikely(!commit_side_effects))
+ svm->vmcb->save.rflags = old_rflags;
+ } else {
+ kvm_rip_write(vcpu, svm->next_rip);
+ }
+
+done:
+ if (likely(commit_side_effects))
+ svm_set_interrupt_shadow(vcpu, 0);
+
+ return 1;
+}
+
+static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
+{
+ return __svm_skip_emulated_instruction(vcpu, EMULTYPE_SKIP, true);
+}
+
+static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu, u8 vector)
+{
+ const int emul_type = EMULTYPE_SKIP | EMULTYPE_SKIP_SOFT_INT |
+ EMULTYPE_SET_SOFT_INT_VECTOR(vector);
+ unsigned long rip, old_rip = kvm_rip_read(vcpu);
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * Due to architectural shortcomings, the CPU doesn't always provide
+ * NextRIP, e.g. if KVM intercepted an exception that occurred while
+ * the CPU was vectoring an INTO/INT3 in the guest. Temporarily skip
+ * the instruction even if NextRIP is supported to acquire the next
+ * RIP so that it can be shoved into the NextRIP field, otherwise
+ * hardware will fail to advance guest RIP during event injection.
+ * Drop the exception/interrupt if emulation fails and effectively
+ * retry the instruction, it's the least awful option. If NRIPS is
+ * in use, the skip must not commit any side effects such as clearing
+ * the interrupt shadow or RFLAGS.RF.
+ */
+ if (!__svm_skip_emulated_instruction(vcpu, emul_type, !nrips))
+ return -EIO;
+
+ rip = kvm_rip_read(vcpu);
+
+ /*
+ * Save the injection information, even when using next_rip, as the
+ * VMCB's next_rip will be lost (cleared on VM-Exit) if the injection
+ * doesn't complete due to a VM-Exit occurring while the CPU is
+ * vectoring the event. Decoding the instruction isn't guaranteed to
+ * work as there may be no backing instruction, e.g. if the event is
+ * being injected by L1 for L2, or if the guest is patching INT3 into
+ * a different instruction.
+ */
+ svm->soft_int_injected = true;
+ svm->soft_int_csbase = svm->vmcb->save.cs.base;
+ svm->soft_int_old_rip = old_rip;
+ svm->soft_int_next_rip = rip;
+
+ if (nrips)
+ kvm_rip_write(vcpu, old_rip);
+
+ if (static_cpu_has(X86_FEATURE_NRIPS))
+ svm->vmcb->control.next_rip = rip;
+
+ return 0;
+}
+
+static void svm_inject_exception(struct kvm_vcpu *vcpu)
+{
+ struct kvm_queued_exception *ex = &vcpu->arch.exception;
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ kvm_deliver_exception_payload(vcpu, ex);
+
+ if (kvm_exception_is_soft(ex->vector) &&
+ svm_update_soft_interrupt_rip(vcpu, ex->vector))
+ return;
+
+ svm->vmcb->control.event_inj = ex->vector
+ | SVM_EVTINJ_VALID
+ | (ex->has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
+ | SVM_EVTINJ_TYPE_EXEPT;
+ svm->vmcb->control.event_inj_err = ex->error_code;
+}
+
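+/*
+ * On hosts affected by AMD erratum 383, apply the recommended workaround bit
+ * in MSR_AMD64_DC_CFG and remember that the erratum is present so that #MC
+ * intercepts can later be checked against it (see is_erratum_383()).
+ */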
+static void svm_init_erratum_383(void)
+{
+ u64 val;
+
+ if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
+ return;
+
+ /* Use _safe variants to not break nested virtualization */
+ if (native_read_msr_safe(MSR_AMD64_DC_CFG, &val))
+ return;
+
+ val |= (1ULL << 47);
+
+ native_write_msr_safe(MSR_AMD64_DC_CFG, val);
+
+ erratum_383_found = true;
+}
+
+static void svm_init_osvw(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Guests should see errata 400 and 415 as fixed (assuming that
+ * HLT and IO instructions are intercepted).
+ */
+ vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
+ vcpu->arch.osvw.status = osvw_status & ~(6ULL);
+
+ /*
+ * By increasing VCPU's osvw.length to 3 we are telling the guest that
+ * all osvw.status bits inside that length, including bit 0 (which is
+ * reserved for erratum 298), are valid. However, if host processor's
+ * osvw_len is 0 then osvw_status[0] carries no information. We need to
+ * be conservative here and therefore we tell the guest that erratum 298
+ * is present (because we really don't know).
+ */
+ if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
+ vcpu->arch.osvw.status |= 1;
+}
+
+static bool __kvm_is_svm_supported(void)
+{
+ int cpu = smp_processor_id();
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ if (c->x86_vendor != X86_VENDOR_AMD &&
+ c->x86_vendor != X86_VENDOR_HYGON) {
+ pr_err("CPU %d isn't AMD or Hygon\n", cpu);
+ return false;
+ }
+
+ if (!cpu_has(c, X86_FEATURE_SVM)) {
+ pr_err("SVM not supported by CPU %d\n", cpu);
+ return false;
+ }
+
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
+ pr_info("KVM is unsupported when running as an SEV guest\n");
+ return false;
+ }
+
+ return true;
+}
+
+static bool kvm_is_svm_supported(void)
+{
+ bool supported;
+
+ migrate_disable();
+ supported = __kvm_is_svm_supported();
+ migrate_enable();
+
+ return supported;
+}
+
+static int svm_check_processor_compat(void)
+{
+ if (!__kvm_is_svm_supported())
+ return -EIO;
+
+ return 0;
+}
+
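+/*
+ * Program the hardware TSC ratio MSR, skipping the WRMSR when the requested
+ * multiplier already matches the value cached for this CPU.
+ */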
+static void __svm_write_tsc_multiplier(u64 multiplier)
+{
+ if (multiplier == __this_cpu_read(current_tsc_ratio))
+ return;
+
+ wrmsrq(MSR_AMD64_TSC_RATIO, multiplier);
+ __this_cpu_write(current_tsc_ratio, multiplier);
+}
+
+static __always_inline struct sev_es_save_area *sev_es_host_save_area(struct svm_cpu_data *sd)
+{
+ return &sd->save_area->host_sev_es_save;
+}
+
+static inline void kvm_cpu_svm_disable(void)
+{
+ uint64_t efer;
+
+ wrmsrq(MSR_VM_HSAVE_PA, 0);
+ rdmsrq(MSR_EFER, efer);
+ if (efer & EFER_SVME) {
+ /*
+ * Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and
+ * NMI aren't blocked.
+ */
+ stgi();
+ wrmsrq(MSR_EFER, efer & ~EFER_SVME);
+ }
+}
+
+static void svm_emergency_disable_virtualization_cpu(void)
+{
+ kvm_rebooting = true;
+
+ kvm_cpu_svm_disable();
+}
+
+static void svm_disable_virtualization_cpu(void)
+{
+ /* Make sure we clean up behind us */
+ if (tsc_scaling)
+ __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
+
+ kvm_cpu_svm_disable();
+
+ amd_pmu_disable_virt();
+}
+
+static int svm_enable_virtualization_cpu(void)
+{
+ struct svm_cpu_data *sd;
+ uint64_t efer;
+ int me = raw_smp_processor_id();
+
+ rdmsrq(MSR_EFER, efer);
+ if (efer & EFER_SVME)
+ return -EBUSY;
+
+ sd = per_cpu_ptr(&svm_data, me);
+ sd->asid_generation = 1;
+ sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
+ sd->next_asid = sd->max_asid + 1;
+ sd->min_asid = max_sev_asid + 1;
+
+ wrmsrq(MSR_EFER, efer | EFER_SVME);
+
+ wrmsrq(MSR_VM_HSAVE_PA, sd->save_area_pa);
+
+ if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+ /*
+		 * Set the default value, even if we don't use TSC scaling,
+		 * to avoid leaving a stale value in the MSR.
+ */
+ __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
+ }
+
+ /*
+ * Get OSVW bits.
+ *
+ * Note that it is possible to have a system with mixed processor
+ * revisions and therefore different OSVW bits. If bits are not the same
+ * on different processors then choose the worst case (i.e. if erratum
+ * is present on one processor and not on another then assume that the
+ * erratum is present everywhere).
+ */
+ if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
+ u64 len, status = 0;
+ int err;
+
+ err = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &len);
+ if (!err)
+ err = native_read_msr_safe(MSR_AMD64_OSVW_STATUS, &status);
+
+ if (err)
+ osvw_status = osvw_len = 0;
+ else {
+ if (len < osvw_len)
+ osvw_len = len;
+ osvw_status |= status;
+ osvw_status &= (1ULL << osvw_len) - 1;
+ }
+ } else
+ osvw_status = osvw_len = 0;
+
+ svm_init_erratum_383();
+
+ amd_pmu_enable_virt();
+
+ return 0;
+}
+
+static void svm_cpu_uninit(int cpu)
+{
+ struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
+
+ if (!sd->save_area)
+ return;
+
+ kfree(sd->sev_vmcbs);
+ __free_page(__sme_pa_to_page(sd->save_area_pa));
+ sd->save_area_pa = 0;
+ sd->save_area = NULL;
+}
+
+static int svm_cpu_init(int cpu)
+{
+ struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
+ struct page *save_area_page;
+ int ret = -ENOMEM;
+
+ memset(sd, 0, sizeof(struct svm_cpu_data));
+ save_area_page = snp_safe_alloc_page_node(cpu_to_node(cpu), GFP_KERNEL);
+ if (!save_area_page)
+ return ret;
+
+ ret = sev_cpu_init(sd);
+ if (ret)
+ goto free_save_area;
+
+ sd->save_area = page_address(save_area_page);
+ sd->save_area_pa = __sme_page_pa(save_area_page);
+ return 0;
+
+free_save_area:
+ __free_page(save_area_page);
+ return ret;
+}
+
+static void set_dr_intercepts(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
+
+ recalc_intercepts(svm);
+}
+
+static void clr_dr_intercepts(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+
+ vmcb->control.intercepts[INTERCEPT_DR] = 0;
+
+ recalc_intercepts(svm);
+}
+
+static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
+{
+ /*
+	 * For the non-nested case:
+ * If the L01 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ *
+	 * For the nested case:
+ * If the L02 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ */
+ void *msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm :
+ to_svm(vcpu)->msrpm;
+
+ return svm_test_msr_bitmap_write(msrpm, msr);
+}
+
+void svm_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ void *msrpm = svm->msrpm;
+
+ /* Don't disable interception for MSRs userspace wants to handle. */
+ if (type & MSR_TYPE_R) {
+ if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
+ svm_clear_msr_bitmap_read(msrpm, msr);
+ else
+ svm_set_msr_bitmap_read(msrpm, msr);
+ }
+
+ if (type & MSR_TYPE_W) {
+ if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
+ svm_clear_msr_bitmap_write(msrpm, msr);
+ else
+ svm_set_msr_bitmap_write(msrpm, msr);
+ }
+
+ svm_hv_vmcb_dirty_nested_enlightenments(vcpu);
+ svm->nested.force_msr_bitmap_recalc = true;
+}
+
+void *svm_alloc_permissions_map(unsigned long size, gfp_t gfp_mask)
+{
+ unsigned int order = get_order(size);
+ struct page *pages = alloc_pages(gfp_mask, order);
+ void *pm;
+
+ if (!pages)
+ return NULL;
+
+ /*
+ * Set all bits in the permissions map so that all MSR and I/O accesses
+ * are intercepted by default.
+ */
+ pm = page_address(pages);
+ memset(pm, 0xff, PAGE_SIZE * (1 << order));
+
+ return pm;
+}
+
+static void svm_recalc_lbr_msr_intercepts(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ bool intercept = !(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK);
+
+ if (intercept == svm->lbr_msrs_intercepted)
+ return;
+
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHFROMIP, MSR_TYPE_RW, intercept);
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHTOIP, MSR_TYPE_RW, intercept);
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTINTFROMIP, MSR_TYPE_RW, intercept);
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTINTTOIP, MSR_TYPE_RW, intercept);
+
+ if (sev_es_guest(vcpu->kvm))
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_DEBUGCTLMSR, MSR_TYPE_RW, intercept);
+
+ svm->lbr_msrs_intercepted = intercept;
+}
+
+void svm_vcpu_free_msrpm(void *msrpm)
+{
+ __free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE));
+}
+
+static void svm_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm_disable_intercept_for_msr(vcpu, MSR_STAR, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
+
+#ifdef CONFIG_X86_64
+ svm_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_LSTAR, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_CSTAR, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_SYSCALL_MASK, MSR_TYPE_RW);
+#endif
+
+ if (lbrv)
+ svm_recalc_lbr_msr_intercepts(vcpu);
+
+ if (cpu_feature_enabled(X86_FEATURE_IBPB))
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
+ !guest_has_pred_cmd_msr(vcpu));
+
+ if (cpu_feature_enabled(X86_FEATURE_FLUSH_L1D))
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));
+
+ /*
+ * Disable interception of SPEC_CTRL if KVM doesn't need to manually
+ * context switch the MSR (SPEC_CTRL is virtualized by the CPU), or if
+ * the guest has a non-zero SPEC_CTRL value, i.e. is likely actively
+ * using SPEC_CTRL.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_V_SPEC_CTRL))
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW,
+ !guest_has_spec_ctrl_msr(vcpu));
+ else
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW,
+ !svm->spec_ctrl);
+
+ /*
+ * Intercept SYSENTER_EIP and SYSENTER_ESP when emulating an Intel CPU,
+	 * as AMD hardware only stores 32 bits, whereas Intel CPUs track 64 bits.
+ */
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW,
+ guest_cpuid_is_intel_compatible(vcpu));
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW,
+ guest_cpuid_is_intel_compatible(vcpu));
+
+ if (kvm_aperfmperf_in_guest(vcpu->kvm)) {
+ svm_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R);
+ svm_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R);
+ }
+
+ if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) {
+ bool shstk_enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK);
+
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_U_CET, MSR_TYPE_RW, !shstk_enabled);
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_S_CET, MSR_TYPE_RW, !shstk_enabled);
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_PL0_SSP, MSR_TYPE_RW, !shstk_enabled);
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_PL1_SSP, MSR_TYPE_RW, !shstk_enabled);
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_PL2_SSP, MSR_TYPE_RW, !shstk_enabled);
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_PL3_SSP, MSR_TYPE_RW, !shstk_enabled);
+ }
+
+ if (sev_es_guest(vcpu->kvm))
+ sev_es_recalc_msr_intercepts(vcpu);
+
+ /*
+ * x2APIC intercepts are modified on-demand and cannot be filtered by
+ * userspace.
+ */
+}
+
+void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
+{
+ to_vmcb->save.dbgctl = from_vmcb->save.dbgctl;
+ to_vmcb->save.br_from = from_vmcb->save.br_from;
+ to_vmcb->save.br_to = from_vmcb->save.br_to;
+ to_vmcb->save.last_excp_from = from_vmcb->save.last_excp_from;
+ to_vmcb->save.last_excp_to = from_vmcb->save.last_excp_to;
+
+ vmcb_mark_dirty(to_vmcb, VMCB_LBR);
+}
+
+static void __svm_enable_lbrv(struct kvm_vcpu *vcpu)
+{
+ to_svm(vcpu)->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
+}
+
+void svm_enable_lbrv(struct kvm_vcpu *vcpu)
+{
+ __svm_enable_lbrv(vcpu);
+ svm_recalc_lbr_msr_intercepts(vcpu);
+}
+
+static void __svm_disable_lbrv(struct kvm_vcpu *vcpu)
+{
+ KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm);
+ to_svm(vcpu)->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
+}
+
+void svm_update_lbrv(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK;
+ bool enable_lbrv = (svm->vmcb->save.dbgctl & DEBUGCTLMSR_LBR) ||
+ (is_guest_mode(vcpu) && guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) &&
+ (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK));
+
+ if (enable_lbrv && !current_enable_lbrv)
+ __svm_enable_lbrv(vcpu);
+ else if (!enable_lbrv && current_enable_lbrv)
+ __svm_disable_lbrv(vcpu);
+
+ /*
+ * During nested transitions, it is possible that the current VMCB has
+ * LBR_CTL set, but the previous LBR_CTL had it cleared (or vice versa).
+ * In this case, even though LBR_CTL does not need an update, intercepts
+ * do, so always recalculate the intercepts here.
+ */
+ svm_recalc_lbr_msr_intercepts(vcpu);
+}
+
+void disable_nmi_singlestep(struct vcpu_svm *svm)
+{
+ svm->nmi_singlestep = false;
+
+ if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
+ /* Clear our flags if they were not set by the guest */
+ if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
+ svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
+ if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
+ svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
+ }
+}
+
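+/*
+ * Dynamic PLE window: grow the per-vCPU PAUSE filter count when a PAUSE
+ * intercept fires, so a vCPU that keeps spinning is allowed to spin longer
+ * before exiting again, capped at pause_filter_count_max.
+ */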
+static void grow_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ int old = control->pause_filter_count;
+
+ if (kvm_pause_in_guest(vcpu->kvm))
+ return;
+
+ control->pause_filter_count = __grow_ple_window(old,
+ pause_filter_count,
+ pause_filter_count_grow,
+ pause_filter_count_max);
+
+ if (control->pause_filter_count != old) {
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+ trace_kvm_ple_window_update(vcpu->vcpu_id,
+ control->pause_filter_count, old);
+ }
+}
+
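+/*
+ * Shrink the window again when the vCPU is scheduled back in (see
+ * svm_vcpu_load()) so the filter does not stay inflated once contention has
+ * passed.
+ */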
+static void shrink_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ int old = control->pause_filter_count;
+
+ if (kvm_pause_in_guest(vcpu->kvm))
+ return;
+
+ control->pause_filter_count =
+ __shrink_ple_window(old,
+ pause_filter_count,
+ pause_filter_count_shrink,
+ pause_filter_count);
+ if (control->pause_filter_count != old) {
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+ trace_kvm_ple_window_update(vcpu->vcpu_id,
+ control->pause_filter_count, old);
+ }
+}
+
+static void svm_hardware_unsetup(void)
+{
+ int cpu;
+
+ avic_hardware_unsetup();
+
+ sev_hardware_unsetup();
+
+ for_each_possible_cpu(cpu)
+ svm_cpu_uninit(cpu);
+
+ __free_pages(__sme_pa_to_page(iopm_base), get_order(IOPM_SIZE));
+ iopm_base = 0;
+}
+
+static void init_seg(struct vmcb_seg *seg)
+{
+ seg->selector = 0;
+ seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
+ SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
+ seg->limit = 0xffff;
+ seg->base = 0;
+}
+
+static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
+{
+ seg->selector = 0;
+ seg->attrib = SVM_SELECTOR_P_MASK | type;
+ seg->limit = 0xffff;
+ seg->base = 0;
+}
+
+static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ return svm->nested.ctl.tsc_offset;
+}
+
+static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ return svm->tsc_ratio_msr;
+}
+
+static void svm_write_tsc_offset(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
+ svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset;
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+}
+
+void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu)
+{
+ preempt_disable();
+ if (to_svm(vcpu)->guest_state_loaded)
+ __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
+ preempt_enable();
+}
+
+/* Evaluate instruction intercepts that depend on guest CPUID features. */
+static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * Intercept INVPCID if shadow paging is enabled to sync/free shadow
+ * roots, or if INVPCID is disabled in the guest to inject #UD.
+ */
+ if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) {
+ if (!npt_enabled ||
+ !guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_INVPCID))
+ svm_set_intercept(svm, INTERCEPT_INVPCID);
+ else
+ svm_clr_intercept(svm, INTERCEPT_INVPCID);
+ }
+
+ if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) {
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP))
+ svm_clr_intercept(svm, INTERCEPT_RDTSCP);
+ else
+ svm_set_intercept(svm, INTERCEPT_RDTSCP);
+ }
+
+ if (guest_cpuid_is_intel_compatible(vcpu)) {
+ svm_set_intercept(svm, INTERCEPT_VMLOAD);
+ svm_set_intercept(svm, INTERCEPT_VMSAVE);
+ svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+ } else {
+ /*
+ * If hardware supports Virtual VMLOAD VMSAVE then enable it
+ * in VMCB and clear intercepts to avoid #VMEXIT.
+ */
+ if (vls) {
+ svm_clr_intercept(svm, INTERCEPT_VMLOAD);
+ svm_clr_intercept(svm, INTERCEPT_VMSAVE);
+ svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+ }
+ }
+}
+
+static void svm_recalc_intercepts(struct kvm_vcpu *vcpu)
+{
+ svm_recalc_instruction_intercepts(vcpu);
+ svm_recalc_msr_intercepts(vcpu);
+}
+
+static void init_vmcb(struct kvm_vcpu *vcpu, bool init_event)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+ struct vmcb_control_area *control = &vmcb->control;
+ struct vmcb_save_area *save = &vmcb->save;
+
+ svm_set_intercept(svm, INTERCEPT_CR0_READ);
+ svm_set_intercept(svm, INTERCEPT_CR3_READ);
+ svm_set_intercept(svm, INTERCEPT_CR4_READ);
+ svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
+ svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
+ svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
+ if (!kvm_vcpu_apicv_active(vcpu))
+ svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
+
+ set_dr_intercepts(svm);
+
+ set_exception_intercept(svm, PF_VECTOR);
+ set_exception_intercept(svm, UD_VECTOR);
+ set_exception_intercept(svm, MC_VECTOR);
+ set_exception_intercept(svm, AC_VECTOR);
+ set_exception_intercept(svm, DB_VECTOR);
+ /*
+ * Guest access to VMware backdoor ports could legitimately
+	 * trigger #GP because of the TSS I/O permission bitmap.
+ * We intercept those #GP and allow access to them anyway
+ * as VMware does.
+ */
+ if (enable_vmware_backdoor)
+ set_exception_intercept(svm, GP_VECTOR);
+
+ svm_set_intercept(svm, INTERCEPT_INTR);
+ svm_set_intercept(svm, INTERCEPT_NMI);
+
+ if (intercept_smi)
+ svm_set_intercept(svm, INTERCEPT_SMI);
+
+ svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
+ svm_set_intercept(svm, INTERCEPT_RDPMC);
+ svm_set_intercept(svm, INTERCEPT_CPUID);
+ svm_set_intercept(svm, INTERCEPT_INVD);
+ svm_set_intercept(svm, INTERCEPT_INVLPG);
+ svm_set_intercept(svm, INTERCEPT_INVLPGA);
+ svm_set_intercept(svm, INTERCEPT_IOIO_PROT);
+ svm_set_intercept(svm, INTERCEPT_MSR_PROT);
+ svm_set_intercept(svm, INTERCEPT_TASK_SWITCH);
+ svm_set_intercept(svm, INTERCEPT_SHUTDOWN);
+ svm_set_intercept(svm, INTERCEPT_VMRUN);
+ svm_set_intercept(svm, INTERCEPT_VMMCALL);
+ svm_set_intercept(svm, INTERCEPT_VMLOAD);
+ svm_set_intercept(svm, INTERCEPT_VMSAVE);
+ svm_set_intercept(svm, INTERCEPT_STGI);
+ svm_set_intercept(svm, INTERCEPT_CLGI);
+ svm_set_intercept(svm, INTERCEPT_SKINIT);
+ svm_set_intercept(svm, INTERCEPT_WBINVD);
+ svm_set_intercept(svm, INTERCEPT_XSETBV);
+ svm_set_intercept(svm, INTERCEPT_RDPRU);
+ svm_set_intercept(svm, INTERCEPT_RSM);
+
+ if (!kvm_mwait_in_guest(vcpu->kvm)) {
+ svm_set_intercept(svm, INTERCEPT_MONITOR);
+ svm_set_intercept(svm, INTERCEPT_MWAIT);
+ }
+
+ if (!kvm_hlt_in_guest(vcpu->kvm)) {
+ if (cpu_feature_enabled(X86_FEATURE_IDLE_HLT))
+ svm_set_intercept(svm, INTERCEPT_IDLE_HLT);
+ else
+ svm_set_intercept(svm, INTERCEPT_HLT);
+ }
+
+ control->iopm_base_pa = iopm_base;
+ control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
+ control->int_ctl = V_INTR_MASKING_MASK;
+
+ init_seg(&save->es);
+ init_seg(&save->ss);
+ init_seg(&save->ds);
+ init_seg(&save->fs);
+ init_seg(&save->gs);
+
+ save->cs.selector = 0xf000;
+ save->cs.base = 0xffff0000;
+ /* Executable/Readable Code Segment */
+ save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
+ SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
+ save->cs.limit = 0xffff;
+
+ save->gdtr.base = 0;
+ save->gdtr.limit = 0xffff;
+ save->idtr.base = 0;
+ save->idtr.limit = 0xffff;
+
+ init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
+ init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
+
+ if (npt_enabled) {
+ /* Setup VMCB for Nested Paging */
+ control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
+ svm_clr_intercept(svm, INTERCEPT_INVLPG);
+ clr_exception_intercept(svm, PF_VECTOR);
+ svm_clr_intercept(svm, INTERCEPT_CR3_READ);
+ svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
+ save->g_pat = vcpu->arch.pat;
+ save->cr3 = 0;
+ }
+ svm->current_vmcb->asid_generation = 0;
+ svm->asid = 0;
+
+ svm->nested.vmcb12_gpa = INVALID_GPA;
+ svm->nested.last_vmcb12_gpa = INVALID_GPA;
+
+ if (!kvm_pause_in_guest(vcpu->kvm)) {
+ control->pause_filter_count = pause_filter_count;
+ if (pause_filter_thresh)
+ control->pause_filter_thresh = pause_filter_thresh;
+ svm_set_intercept(svm, INTERCEPT_PAUSE);
+ } else {
+ svm_clr_intercept(svm, INTERCEPT_PAUSE);
+ }
+
+ if (kvm_vcpu_apicv_active(vcpu))
+ avic_init_vmcb(svm, vmcb);
+
+ if (vnmi)
+ svm->vmcb->control.int_ctl |= V_NMI_ENABLE_MASK;
+
+ if (vgif) {
+ svm_clr_intercept(svm, INTERCEPT_STGI);
+ svm_clr_intercept(svm, INTERCEPT_CLGI);
+ svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
+ }
+
+ if (vcpu->kvm->arch.bus_lock_detection_enabled)
+ svm_set_intercept(svm, INTERCEPT_BUSLOCK);
+
+ if (sev_guest(vcpu->kvm))
+ sev_init_vmcb(svm, init_event);
+
+ svm_hv_init_vmcb(vmcb);
+
+ kvm_make_request(KVM_REQ_RECALC_INTERCEPTS, vcpu);
+
+ vmcb_mark_all_dirty(vmcb);
+
+ enable_gif(svm);
+}
+
+static void __svm_vcpu_reset(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm_init_osvw(vcpu);
+
+ if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS))
+ vcpu->arch.microcode_version = 0x01000065;
+ svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio;
+
+ svm->nmi_masked = false;
+ svm->awaiting_iret_completion = false;
+}
+
+static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->spec_ctrl = 0;
+ svm->virt_spec_ctrl = 0;
+
+ init_vmcb(vcpu, init_event);
+
+ if (!init_event)
+ __svm_vcpu_reset(vcpu);
+}
+
+void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
+{
+ svm->current_vmcb = target_vmcb;
+ svm->vmcb = target_vmcb->ptr;
+}
+
+static int svm_vcpu_precreate(struct kvm *kvm)
+{
+ return avic_alloc_physical_id_table(kvm);
+}
+
+static int svm_vcpu_create(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm;
+ struct page *vmcb01_page;
+ int err;
+
+ BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
+ svm = to_svm(vcpu);
+
+ err = -ENOMEM;
+ vmcb01_page = snp_safe_alloc_page();
+ if (!vmcb01_page)
+ goto out;
+
+ err = sev_vcpu_create(vcpu);
+ if (err)
+ goto error_free_vmcb_page;
+
+ err = avic_init_vcpu(svm);
+ if (err)
+ goto error_free_sev;
+
+ svm->msrpm = svm_vcpu_alloc_msrpm();
+ if (!svm->msrpm) {
+ err = -ENOMEM;
+ goto error_free_sev;
+ }
+
+ svm->x2avic_msrs_intercepted = true;
+ svm->lbr_msrs_intercepted = true;
+
+ svm->vmcb01.ptr = page_address(vmcb01_page);
+ svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
+ svm_switch_vmcb(svm, &svm->vmcb01);
+
+ svm->guest_state_loaded = false;
+
+ return 0;
+
+error_free_sev:
+ sev_free_vcpu(vcpu);
+error_free_vmcb_page:
+ __free_page(vmcb01_page);
+out:
+ return err;
+}
+
+static void svm_vcpu_free(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ WARN_ON_ONCE(!list_empty(&svm->ir_list));
+
+ svm_leave_nested(vcpu);
+ svm_free_nested(svm);
+
+ sev_free_vcpu(vcpu);
+
+ __free_page(__sme_pa_to_page(svm->vmcb01.pa));
+ svm_vcpu_free_msrpm(svm->msrpm);
+}
+
+#ifdef CONFIG_CPU_MITIGATIONS
+static DEFINE_SPINLOCK(srso_lock);
+static atomic_t srso_nr_vms;
+
+static void svm_srso_clear_bp_spec_reduce(void *ign)
+{
+ struct svm_cpu_data *sd = this_cpu_ptr(&svm_data);
+
+ if (!sd->bp_spec_reduce_set)
+ return;
+
+ msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
+ sd->bp_spec_reduce_set = false;
+}
+
+static void svm_srso_vm_destroy(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
+ return;
+
+ if (atomic_dec_return(&srso_nr_vms))
+ return;
+
+ guard(spinlock)(&srso_lock);
+
+ /*
+	 * Verify that a new VM didn't come along, acquire the lock, and increment
+ * the count before this task acquired the lock.
+ */
+ if (atomic_read(&srso_nr_vms))
+ return;
+
+ on_each_cpu(svm_srso_clear_bp_spec_reduce, NULL, 1);
+}
+
+static void svm_srso_vm_init(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
+ return;
+
+ /*
+ * Acquire the lock on 0 => 1 transitions to ensure a potential 1 => 0
+ * transition, i.e. destroying the last VM, is fully complete, e.g. so
+ * that a delayed IPI doesn't clear BP_SPEC_REDUCE after a vCPU runs.
+ */
+ if (atomic_inc_not_zero(&srso_nr_vms))
+ return;
+
+ guard(spinlock)(&srso_lock);
+
+ atomic_inc(&srso_nr_vms);
+}
+#else
+static void svm_srso_vm_init(void) { }
+static void svm_srso_vm_destroy(void) { }
+#endif
+
+static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
+
+ if (sev_es_guest(vcpu->kvm))
+ sev_es_unmap_ghcb(svm);
+
+ if (svm->guest_state_loaded)
+ return;
+
+ /*
+ * Save additional host state that will be restored on VMEXIT (sev-es)
+	 * or a subsequent VMLOAD of the host save area.
+ */
+ vmsave(sd->save_area_pa);
+ if (sev_es_guest(vcpu->kvm))
+ sev_es_prepare_switch_to_guest(svm, sev_es_host_save_area(sd));
+
+ if (tsc_scaling)
+ __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
+
+ /*
+ * TSC_AUX is always virtualized (context switched by hardware) for
+ * SEV-ES guests when the feature is available. For non-SEV-ES guests,
+ * context switch TSC_AUX via the user_return MSR infrastructure (not
+ * all CPUs support TSC_AUX virtualization).
+ */
+ if (likely(tsc_aux_uret_slot >= 0) &&
+ (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm)))
+ kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);
+
+ if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE) &&
+ !sd->bp_spec_reduce_set) {
+ sd->bp_spec_reduce_set = true;
+ msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
+ }
+ svm->guest_state_loaded = true;
+}
+
+static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
+{
+ to_svm(vcpu)->guest_state_loaded = false;
+}
+
+static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
+ shrink_ple_window(vcpu);
+
+ if (kvm_vcpu_apicv_active(vcpu))
+ avic_vcpu_load(vcpu, cpu);
+}
+
+static void svm_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ if (kvm_vcpu_apicv_active(vcpu))
+ avic_vcpu_put(vcpu);
+
+ svm_prepare_host_switch(vcpu);
+
+ ++vcpu->stat.host_state_reload;
+}
+
+static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long rflags = svm->vmcb->save.rflags;
+
+ if (svm->nmi_singlestep) {
+ /* Hide our flags if they were not set by the guest */
+ if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
+ rflags &= ~X86_EFLAGS_TF;
+ if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
+ rflags &= ~X86_EFLAGS_RF;
+ }
+ return rflags;
+}
+
+static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+ if (to_svm(vcpu)->nmi_singlestep)
+ rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
+
+ /*
+ * Any change of EFLAGS.VM is accompanied by a reload of SS
+ * (caused by either a task switch or an inter-privilege IRET),
+ * so we do not need to update the CPL here.
+ */
+ to_svm(vcpu)->vmcb->save.rflags = rflags;
+}
+
+static bool svm_get_if_flag(struct kvm_vcpu *vcpu)
+{
+ struct vmcb *vmcb = to_svm(vcpu)->vmcb;
+
+ return sev_es_guest(vcpu->kvm)
+ ? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK
+ : kvm_get_rflags(vcpu) & X86_EFLAGS_IF;
+}
+
+static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
+{
+ kvm_register_mark_available(vcpu, reg);
+
+ switch (reg) {
+ case VCPU_EXREG_PDPTR:
+ /*
+ * When !npt_enabled, mmu->pdptrs[] is already available since
+ * it is always updated per SDM when moving to CRs.
+ */
+ if (npt_enabled)
+ load_pdptrs(vcpu, kvm_read_cr3(vcpu));
+ break;
+ default:
+ KVM_BUG_ON(1, vcpu->kvm);
+ }
+}
+
+static void svm_set_vintr(struct vcpu_svm *svm)
+{
+ struct vmcb_control_area *control;
+
+ /*
+ * The following fields are ignored when AVIC is enabled
+ */
+ WARN_ON(kvm_vcpu_apicv_activated(&svm->vcpu));
+
+ svm_set_intercept(svm, INTERCEPT_VINTR);
+
+ /*
+ * Recalculating intercepts may have cleared the VINTR intercept. If
+ * V_INTR_MASKING is enabled in vmcb12, then the effective RFLAGS.IF
+ * for L1 physical interrupts is L1's RFLAGS.IF at the time of VMRUN.
+ * Requesting an interrupt window if save.RFLAGS.IF=0 is pointless as
+ * interrupts will never be unblocked while L2 is running.
+ */
+ if (!svm_is_intercept(svm, INTERCEPT_VINTR))
+ return;
+
+ /*
+ * This is just a dummy VINTR to actually cause a vmexit to happen.
+ * Actual injection of virtual interrupts happens through EVENTINJ.
+ */
+ control = &svm->vmcb->control;
+ control->int_vector = 0x0;
+ control->int_ctl &= ~V_INTR_PRIO_MASK;
+ control->int_ctl |= V_IRQ_MASK |
+ ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
+}
+
+static void svm_clear_vintr(struct vcpu_svm *svm)
+{
+ svm_clr_intercept(svm, INTERCEPT_VINTR);
+
+ /* Drop int_ctl fields related to VINTR injection. */
+ svm->vmcb->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
+ if (is_guest_mode(&svm->vcpu)) {
+ svm->vmcb01.ptr->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
+
+ WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
+ (svm->nested.ctl.int_ctl & V_TPR_MASK));
+
+ svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl &
+ V_IRQ_INJECTION_BITS_MASK;
+
+ svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
+ }
+
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
+}
+
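+/*
+ * FS, GS, TR and LDTR are among the registers context switched by
+ * VMLOAD/VMSAVE and are tracked in vmcb01, so read them from there even
+ * while vmcb02 is the current VMCB.
+ */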
+static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
+{
+ struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
+ struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save;
+
+ switch (seg) {
+ case VCPU_SREG_CS: return &save->cs;
+ case VCPU_SREG_DS: return &save->ds;
+ case VCPU_SREG_ES: return &save->es;
+ case VCPU_SREG_FS: return &save01->fs;
+ case VCPU_SREG_GS: return &save01->gs;
+ case VCPU_SREG_SS: return &save->ss;
+ case VCPU_SREG_TR: return &save01->tr;
+ case VCPU_SREG_LDTR: return &save01->ldtr;
+ }
+ BUG();
+ return NULL;
+}
+
+static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
+{
+ struct vmcb_seg *s = svm_seg(vcpu, seg);
+
+ return s->base;
+}
+
+static void svm_get_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ struct vmcb_seg *s = svm_seg(vcpu, seg);
+
+ var->base = s->base;
+ var->limit = s->limit;
+ var->selector = s->selector;
+ var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
+ var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
+ var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
+ var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
+ var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
+ var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
+ var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
+
+ /*
+ * AMD CPUs circa 2014 track the G bit for all segments except CS.
+ * However, the SVM spec states that the G bit is not observed by the
+ * CPU, and some VMware virtual CPUs drop the G bit for all segments.
+	 * So let's synthesize a legal G bit for all segments, which helps
+	 * when running KVM nested. It also helps cross-vendor migration, because
+ * Intel's vmentry has a check on the 'G' bit.
+ */
+ var->g = s->limit > 0xfffff;
+
+ /*
+ * AMD's VMCB does not have an explicit unusable field, so emulate it
+	 * for cross-vendor migration purposes by treating "not present" as unusable.
+ */
+ var->unusable = !var->present;
+
+ switch (seg) {
+ case VCPU_SREG_TR:
+ /*
+ * Work around a bug where the busy flag in the tr selector
+ * isn't exposed
+ */
+ var->type |= 0x2;
+ break;
+ case VCPU_SREG_DS:
+ case VCPU_SREG_ES:
+ case VCPU_SREG_FS:
+ case VCPU_SREG_GS:
+ /*
+		 * descriptor cache: although it can be cleared in the
+		 * descriptor itself, the cached bit always remains set. Since
+ * descriptor, the cached bit always remains at 1. Since
+ * Intel has a check on this, set it here to support
+ * cross-vendor migration.
+ */
+ if (!var->unusable)
+ var->type |= 0x1;
+ break;
+ case VCPU_SREG_SS:
+ /*
+ * On AMD CPUs sometimes the DB bit in the segment
+ * descriptor is left as 1, although the whole segment has
+ * been made unusable. Clear it here to pass an Intel VMX
+ * entry check when cross vendor migrating.
+ */
+ if (var->unusable)
+ var->db = 0;
+ /* This is symmetric with svm_set_segment() */
+ var->dpl = to_svm(vcpu)->vmcb->save.cpl;
+ break;
+ }
+}
+
+static int svm_get_cpl(struct kvm_vcpu *vcpu)
+{
+ struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
+
+ return save->cpl;
+}
+
+static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
+{
+ struct kvm_segment cs;
+
+ svm_get_segment(vcpu, &cs, VCPU_SREG_CS);
+ *db = cs.db;
+ *l = cs.l;
+}
+
+static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ dt->size = svm->vmcb->save.idtr.limit;
+ dt->address = svm->vmcb->save.idtr.base;
+}
+
+static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->save.idtr.limit = dt->size;
+	svm->vmcb->save.idtr.base = dt->address;
+ vmcb_mark_dirty(svm->vmcb, VMCB_DT);
+}
+
+static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ dt->size = svm->vmcb->save.gdtr.limit;
+ dt->address = svm->vmcb->save.gdtr.base;
+}
+
+static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->save.gdtr.limit = dt->size;
+	svm->vmcb->save.gdtr.base = dt->address;
+ vmcb_mark_dirty(svm->vmcb, VMCB_DT);
+}
+
+static void sev_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * For guests that don't set guest_state_protected, the cr3 update is
+ * handled via kvm_mmu_load() while entering the guest. For guests
+ * that do (SEV-ES/SEV-SNP), the cr3 update needs to be written to
+ * VMCB save area now, since the save area will become the initial
+ * contents of the VMSA, and future VMCB save area updates won't be
+ * seen.
+ */
+ if (sev_es_guest(vcpu->kvm)) {
+ svm->vmcb->save.cr3 = cr3;
+ vmcb_mark_dirty(svm->vmcb, VMCB_CR);
+ }
+}
+
+static bool svm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+ return true;
+}
+
+void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 hcr0 = cr0;
+ bool old_paging = is_paging(vcpu);
+
+#ifdef CONFIG_X86_64
+ if (vcpu->arch.efer & EFER_LME) {
+ if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
+ vcpu->arch.efer |= EFER_LMA;
+ if (!vcpu->arch.guest_state_protected)
+ svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
+ }
+
+ if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
+ vcpu->arch.efer &= ~EFER_LMA;
+ if (!vcpu->arch.guest_state_protected)
+ svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
+ }
+ }
+#endif
+ vcpu->arch.cr0 = cr0;
+
+ if (!npt_enabled) {
+ hcr0 |= X86_CR0_PG | X86_CR0_WP;
+ if (old_paging != is_paging(vcpu))
+ svm_set_cr4(vcpu, kvm_read_cr4(vcpu));
+ }
+
+ /*
+	 * Re-enable caching here because the QEMU BIOS does not do it;
+	 * otherwise there is some delay at reboot.
+ */
+ if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
+ hcr0 &= ~(X86_CR0_CD | X86_CR0_NW);
+
+ svm->vmcb->save.cr0 = hcr0;
+ vmcb_mark_dirty(svm->vmcb, VMCB_CR);
+
+ /*
+ * SEV-ES guests must always keep the CR intercepts cleared. CR
+ * tracking is done using the CR write traps.
+ */
+ if (sev_es_guest(vcpu->kvm))
+ return;
+
+ if (hcr0 == cr0) {
+ /* Selective CR0 write remains on. */
+ svm_clr_intercept(svm, INTERCEPT_CR0_READ);
+ svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
+ } else {
+ svm_set_intercept(svm, INTERCEPT_CR0_READ);
+ svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
+ }
+}
+
+static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ return true;
+}
+
+void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
+ unsigned long old_cr4 = vcpu->arch.cr4;
+
+ vcpu->arch.cr4 = cr4;
+ if (!npt_enabled) {
+ cr4 |= X86_CR4_PAE;
+
+ if (!is_paging(vcpu))
+ cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
+ }
+ cr4 |= host_cr4_mce;
+ to_svm(vcpu)->vmcb->save.cr4 = cr4;
+ vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
+
+ if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
+ vcpu->arch.cpuid_dynamic_bits_dirty = true;
+}
+
+static void svm_set_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_seg *s = svm_seg(vcpu, seg);
+
+ s->base = var->base;
+ s->limit = var->limit;
+ s->selector = var->selector;
+ s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
+ s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
+ s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
+ s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
+ s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
+ s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
+ s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
+ s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
+
+ /*
+ * This is always accurate, except if SYSRET returned to a segment
+ * with SS.DPL != 3. Intel does not have this quirk, and always
+ * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
+ * would entail passing the CPL to userspace and back.
+ */
+ if (seg == VCPU_SREG_SS)
+ /* This is symmetric with svm_get_segment() */
+ svm->vmcb->save.cpl = (var->dpl & 3);
+
+ vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
+}
+
+static void svm_update_exception_bitmap(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ clr_exception_intercept(svm, BP_VECTOR);
+
+ if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
+ set_exception_intercept(svm, BP_VECTOR);
+ }
+}
+
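+/*
+ * Hand out the next ASID from the per-CPU pool. When the pool is exhausted,
+ * bump the generation, restart from min_asid and request a full TLB flush so
+ * that stale translations from the previous generation cannot be reused.
+ */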
+static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
+{
+ if (sd->next_asid > sd->max_asid) {
+ ++sd->asid_generation;
+ sd->next_asid = sd->min_asid;
+ svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
+ vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
+ }
+
+ svm->current_vmcb->asid_generation = sd->asid_generation;
+ svm->asid = sd->next_asid++;
+}
+
+static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value)
+{
+ struct vmcb *vmcb = to_svm(vcpu)->vmcb;
+
+ if (vcpu->arch.guest_state_protected)
+ return;
+
+ if (unlikely(value != vmcb->save.dr6)) {
+ vmcb->save.dr6 = value;
+ vmcb_mark_dirty(vmcb, VMCB_DR);
+ }
+}
+
+static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (WARN_ON_ONCE(sev_es_guest(vcpu->kvm)))
+ return;
+
+ get_debugreg(vcpu->arch.db[0], 0);
+ get_debugreg(vcpu->arch.db[1], 1);
+ get_debugreg(vcpu->arch.db[2], 2);
+ get_debugreg(vcpu->arch.db[3], 3);
+ /*
+ * We cannot reset svm->vmcb->save.dr6 to DR6_ACTIVE_LOW here,
+ * because db_interception might need it. We can do it before vmentry.
+ */
+ vcpu->arch.dr6 = svm->vmcb->save.dr6;
+ vcpu->arch.dr7 = svm->vmcb->save.dr7;
+ vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
+ set_dr_intercepts(svm);
+}
+
+static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (vcpu->arch.guest_state_protected)
+ return;
+
+ svm->vmcb->save.dr7 = value;
+ vmcb_mark_dirty(svm->vmcb, VMCB_DR);
+}
+
+static int pf_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ u64 fault_address = svm->vmcb->control.exit_info_2;
+ u64 error_code = svm->vmcb->control.exit_info_1;
+
+ return kvm_handle_page_fault(vcpu, error_code, fault_address,
+ static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
+ svm->vmcb->control.insn_bytes : NULL,
+ svm->vmcb->control.insn_len);
+}
+
+static int npf_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int rc;
+
+ u64 fault_address = svm->vmcb->control.exit_info_2;
+ u64 error_code = svm->vmcb->control.exit_info_1;
+
+ /*
+ * WARN if hardware generates a fault with an error code that collides
+	 * with KVM-defined synthetic flags. Clear the flags and continue on,
+ * i.e. don't terminate the VM, as KVM can't possibly be relying on a
+ * flag that KVM doesn't know about.
+ */
+ if (WARN_ON_ONCE(error_code & PFERR_SYNTHETIC_MASK))
+ error_code &= ~PFERR_SYNTHETIC_MASK;
+
+ if (sev_snp_guest(vcpu->kvm) && (error_code & PFERR_GUEST_ENC_MASK))
+ error_code |= PFERR_PRIVATE_ACCESS;
+
+ trace_kvm_page_fault(vcpu, fault_address, error_code);
+ rc = kvm_mmu_page_fault(vcpu, fault_address, error_code,
+ static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
+ svm->vmcb->control.insn_bytes : NULL,
+ svm->vmcb->control.insn_len);
+
+ if (rc > 0 && error_code & PFERR_GUEST_RMP_MASK)
+ sev_handle_rmp_fault(vcpu, fault_address, error_code);
+
+ return rc;
+}
+
+static int db_interception(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *kvm_run = vcpu->run;
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!(vcpu->guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
+ !svm->nmi_singlestep) {
+ u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW;
+ kvm_queue_exception_p(vcpu, DB_VECTOR, payload);
+ return 1;
+ }
+
+ if (svm->nmi_singlestep) {
+ disable_nmi_singlestep(svm);
+ /* Make sure we check for pending NMIs upon entry */
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ }
+
+ if (vcpu->guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6;
+ kvm_run->debug.arch.dr7 = svm->vmcb->save.dr7;
+ kvm_run->debug.arch.pc =
+ svm->vmcb->save.cs.base + svm->vmcb->save.rip;
+ kvm_run->debug.arch.exception = DB_VECTOR;
+ return 0;
+ }
+
+ return 1;
+}
+
+static int bp_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_run *kvm_run = vcpu->run;
+
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
+ kvm_run->debug.arch.exception = BP_VECTOR;
+ return 0;
+}
+
+static int ud_interception(struct kvm_vcpu *vcpu)
+{
+ return handle_ud(vcpu);
+}
+
+static int ac_interception(struct kvm_vcpu *vcpu)
+{
+ kvm_queue_exception_e(vcpu, AC_VECTOR, 0);
+ return 1;
+}
+
+static bool is_erratum_383(void)
+{
+ int i;
+ u64 value;
+
+ if (!erratum_383_found)
+ return false;
+
+ if (native_read_msr_safe(MSR_IA32_MC0_STATUS, &value))
+ return false;
+
+ /* Bit 62 may or may not be set for this mce */
+ value &= ~(1ULL << 62);
+
+ if (value != 0xb600000000010015ULL)
+ return false;
+
+ /* Clear MCi_STATUS registers */
+ for (i = 0; i < 6; ++i)
+ native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0);
+
+ if (!native_read_msr_safe(MSR_IA32_MCG_STATUS, &value)) {
+ value &= ~(1ULL << 2);
+ native_write_msr_safe(MSR_IA32_MCG_STATUS, value);
+ }
+
+ /* Flush tlb to evict multi-match entries */
+ __flush_tlb_all();
+
+ return true;
+}
+
+static void svm_handle_mce(struct kvm_vcpu *vcpu)
+{
+ if (is_erratum_383()) {
+ /*
+ * Erratum 383 triggered. Guest state is corrupt so kill the
+ * guest.
+ */
+ pr_err("Guest triggered AMD Erratum 383\n");
+
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+
+ return;
+ }
+
+ /*
+ * On an #MC intercept the MCE handler is not called automatically in
+ * the host. So do it by hand here.
+ */
+ kvm_machine_check();
+}
+
+static int mc_interception(struct kvm_vcpu *vcpu)
+{
+ return 1;
+}
+
+static int shutdown_interception(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *kvm_run = vcpu->run;
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * VMCB is undefined after a SHUTDOWN intercept. INIT the vCPU to put
+	 * the VMCB in a known good state. Unfortunately, KVM doesn't have
+ * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking
+	 * userspace. From a platform perspective, INIT is acceptable behavior as
+ * there exist bare metal platforms that automatically INIT the CPU
+ * in response to shutdown.
+ *
+ * The VM save area for SEV-ES guests has already been encrypted so it
+ * cannot be reinitialized, i.e. synthesizing INIT is futile.
+ */
+ if (!sev_es_guest(vcpu->kvm)) {
+ clear_page(svm->vmcb);
+#ifdef CONFIG_KVM_SMM
+ if (is_smm(vcpu))
+ kvm_smm_changed(vcpu, false);
+#endif
+ kvm_vcpu_reset(vcpu, true);
+ }
+
+ kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
+ return 0;
+}
+
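+/*
+ * IOIO intercept: EXITINFO1 encodes the port number (bits 31:16) along with
+ * the size, direction and string-ness of the access, and EXITINFO2 holds the
+ * RIP of the instruction following the IN/OUT.
+ */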
+static int io_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
+ int size, in, string;
+ unsigned port;
+
+ ++vcpu->stat.io_exits;
+ string = (io_info & SVM_IOIO_STR_MASK) != 0;
+ in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
+ port = io_info >> 16;
+ size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
+
+ if (string) {
+ if (sev_es_guest(vcpu->kvm))
+ return sev_es_string_io(svm, size, port, in);
+ else
+ return kvm_emulate_instruction(vcpu, 0);
+ }
+
+ svm->next_rip = svm->vmcb->control.exit_info_2;
+
+ return kvm_fast_pio(vcpu, size, port, in);
+}
+
+static int nmi_interception(struct kvm_vcpu *vcpu)
+{
+ return 1;
+}
+
+static int smi_interception(struct kvm_vcpu *vcpu)
+{
+ return 1;
+}
+
+static int intr_interception(struct kvm_vcpu *vcpu)
+{
+ ++vcpu->stat.irq_exits;
+ return 1;
+}
+
+static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb12;
+ struct kvm_host_map map;
+ int ret;
+
+ if (nested_svm_check_permissions(vcpu))
+ return 1;
+
+ ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
+ if (ret) {
+ if (ret == -EINVAL)
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ vmcb12 = map.hva;
+
+ ret = kvm_skip_emulated_instruction(vcpu);
+
+ if (vmload) {
+ svm_copy_vmloadsave_state(svm->vmcb, vmcb12);
+ svm->sysenter_eip_hi = 0;
+ svm->sysenter_esp_hi = 0;
+ } else {
+ svm_copy_vmloadsave_state(vmcb12, svm->vmcb);
+ }
+
+ kvm_vcpu_unmap(vcpu, &map);
+
+ return ret;
+}
+
+static int vmload_interception(struct kvm_vcpu *vcpu)
+{
+ return vmload_vmsave_interception(vcpu, true);
+}
+
+static int vmsave_interception(struct kvm_vcpu *vcpu)
+{
+ return vmload_vmsave_interception(vcpu, false);
+}
+
+static int vmrun_interception(struct kvm_vcpu *vcpu)
+{
+ if (nested_svm_check_permissions(vcpu))
+ return 1;
+
+ return nested_svm_vmrun(vcpu);
+}
+
+enum {
+ NONE_SVM_INSTR,
+ SVM_INSTR_VMRUN,
+ SVM_INSTR_VMLOAD,
+ SVM_INSTR_VMSAVE,
+};
+
+/* Return NONE_SVM_INSTR if not SVM instrs, otherwise return decode result */
+static int svm_instr_opcode(struct kvm_vcpu *vcpu)
+{
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+
+ if (ctxt->b != 0x1 || ctxt->opcode_len != 2)
+ return NONE_SVM_INSTR;
+
+ switch (ctxt->modrm) {
+ case 0xd8: /* VMRUN */
+ return SVM_INSTR_VMRUN;
+ case 0xda: /* VMLOAD */
+ return SVM_INSTR_VMLOAD;
+ case 0xdb: /* VMSAVE */
+ return SVM_INSTR_VMSAVE;
+ default:
+ break;
+ }
+
+ return NONE_SVM_INSTR;
+}
+
+static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
+{
+ const int guest_mode_exit_codes[] = {
+ [SVM_INSTR_VMRUN] = SVM_EXIT_VMRUN,
+ [SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD,
+ [SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE,
+ };
+ int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = {
+ [SVM_INSTR_VMRUN] = vmrun_interception,
+ [SVM_INSTR_VMLOAD] = vmload_interception,
+ [SVM_INSTR_VMSAVE] = vmsave_interception,
+ };
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int ret;
+
+ if (is_guest_mode(vcpu)) {
+ /* Returns '1' or -errno on failure, '0' on success. */
+ ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]);
+ if (ret)
+ return ret;
+ return 1;
+ }
+ return svm_instr_handlers[opcode](vcpu);
+}
+
+/*
+ * #GP handling code. Note that #GP can be triggered under the following two
+ * cases:
+ * 1) SVM VM-related instructions (VMRUN/VMSAVE/VMLOAD) that trigger #GP on
+ *    some AMD CPUs when the EAX of these instructions is in the reserved memory
+ * regions (e.g. SMM memory on host).
+ * 2) VMware backdoor
+ */
+static int gp_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 error_code = svm->vmcb->control.exit_info_1;
+ int opcode;
+
+ /* Both #GP cases have zero error_code */
+ if (error_code)
+ goto reinject;
+
+ /* Decode the instruction for usage later */
+ if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK)
+ goto reinject;
+
+ opcode = svm_instr_opcode(vcpu);
+
+ if (opcode == NONE_SVM_INSTR) {
+ if (!enable_vmware_backdoor)
+ goto reinject;
+
+ /*
+ * VMware backdoor emulation on #GP interception only handles
+ * IN{S}, OUT{S}, and RDPMC.
+ */
+ if (!is_guest_mode(vcpu))
+ return kvm_emulate_instruction(vcpu,
+ EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);
+ } else {
+ /* All SVM instructions expect page aligned RAX */
+ if (svm->vmcb->save.rax & ~PAGE_MASK)
+ goto reinject;
+
+ return emulate_svm_instr(vcpu, opcode);
+ }
+
+reinject:
+ kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
+ return 1;
+}
+
+void svm_set_gif(struct vcpu_svm *svm, bool value)
+{
+ if (value) {
+ /*
+ * If VGIF is enabled, the STGI intercept is only added to
+ * detect the opening of the SMI/NMI window; remove it now.
+ * Likewise, clear the VINTR intercept, we will set it
+ * again while processing KVM_REQ_EVENT if needed.
+ */
+ if (vgif)
+ svm_clr_intercept(svm, INTERCEPT_STGI);
+ if (svm_is_intercept(svm, INTERCEPT_VINTR))
+ svm_clear_vintr(svm);
+
+ enable_gif(svm);
+ if (svm->vcpu.arch.smi_pending ||
+ svm->vcpu.arch.nmi_pending ||
+ kvm_cpu_has_injectable_intr(&svm->vcpu) ||
+ kvm_apic_has_pending_init_or_sipi(&svm->vcpu))
+ kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+ } else {
+ disable_gif(svm);
+
+ /*
+ * After a CLGI no interrupts should come. But if vGIF is
+ * in use, we still rely on the VINTR intercept (rather than
+ * STGI) to detect an open interrupt window.
+ */
+ if (!vgif)
+ svm_clear_vintr(svm);
+ }
+}
+
+static int stgi_interception(struct kvm_vcpu *vcpu)
+{
+ int ret;
+
+ if (nested_svm_check_permissions(vcpu))
+ return 1;
+
+ ret = kvm_skip_emulated_instruction(vcpu);
+ svm_set_gif(to_svm(vcpu), true);
+ return ret;
+}
+
+static int clgi_interception(struct kvm_vcpu *vcpu)
+{
+ int ret;
+
+ if (nested_svm_check_permissions(vcpu))
+ return 1;
+
+ ret = kvm_skip_emulated_instruction(vcpu);
+ svm_set_gif(to_svm(vcpu), false);
+ return ret;
+}
+
+static int invlpga_interception(struct kvm_vcpu *vcpu)
+{
+ gva_t gva = kvm_rax_read(vcpu);
+ u32 asid = kvm_rcx_read(vcpu);
+
+ /* FIXME: Handle an address size prefix. */
+ if (!is_long_mode(vcpu))
+ gva = (u32)gva;
+
+ trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, asid, gva);
+
+ /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
+ kvm_mmu_invlpg(vcpu, gva);
+
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int skinit_interception(struct kvm_vcpu *vcpu)
+{
+ trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu));
+
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+}
+
+static int task_switch_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u16 tss_selector;
+ int reason;
+ int int_type = svm->vmcb->control.exit_int_info &
+ SVM_EXITINTINFO_TYPE_MASK;
+ int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
+ uint32_t type =
+ svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
+ uint32_t idt_v =
+ svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
+ bool has_error_code = false;
+ u32 error_code = 0;
+
+ tss_selector = (u16)svm->vmcb->control.exit_info_1;
+
+ if (svm->vmcb->control.exit_info_2 &
+ (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
+ reason = TASK_SWITCH_IRET;
+ else if (svm->vmcb->control.exit_info_2 &
+ (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
+ reason = TASK_SWITCH_JMP;
+ else if (idt_v)
+ reason = TASK_SWITCH_GATE;
+ else
+ reason = TASK_SWITCH_CALL;
+
+ if (reason == TASK_SWITCH_GATE) {
+ switch (type) {
+ case SVM_EXITINTINFO_TYPE_NMI:
+ vcpu->arch.nmi_injected = false;
+ break;
+ case SVM_EXITINTINFO_TYPE_EXEPT:
+ if (svm->vmcb->control.exit_info_2 &
+ (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
+ has_error_code = true;
+ error_code =
+ (u32)svm->vmcb->control.exit_info_2;
+ }
+ kvm_clear_exception_queue(vcpu);
+ break;
+ case SVM_EXITINTINFO_TYPE_INTR:
+ case SVM_EXITINTINFO_TYPE_SOFT:
+ kvm_clear_interrupt_queue(vcpu);
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (reason != TASK_SWITCH_GATE ||
+ int_type == SVM_EXITINTINFO_TYPE_SOFT ||
+ (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
+ (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
+ if (!svm_skip_emulated_instruction(vcpu))
+ return 0;
+ }
+
+ if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
+ int_vec = -1;
+
+ return kvm_task_switch(vcpu, tss_selector, int_vec, reason,
+ has_error_code, error_code);
+}
+
+static void svm_clr_iret_intercept(struct vcpu_svm *svm)
+{
+ if (!sev_es_guest(svm->vcpu.kvm))
+ svm_clr_intercept(svm, INTERCEPT_IRET);
+}
+
+static void svm_set_iret_intercept(struct vcpu_svm *svm)
+{
+ if (!sev_es_guest(svm->vcpu.kvm))
+ svm_set_intercept(svm, INTERCEPT_IRET);
+}
+
+static int iret_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ WARN_ON_ONCE(sev_es_guest(vcpu->kvm));
+
+ ++vcpu->stat.nmi_window_exits;
+ svm->awaiting_iret_completion = true;
+
+ svm_clr_iret_intercept(svm);
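+ /*
+ * Snapshot RIP so that svm_complete_interrupts() can detect forward
+ * progress, i.e. that the guest actually completed the IRET.
+ */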
+ svm->nmi_iret_rip = kvm_rip_read(vcpu);
+
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ return 1;
+}
+
+static int invlpg_interception(struct kvm_vcpu *vcpu)
+{
+ if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
+ return kvm_emulate_instruction(vcpu, 0);
+
+ kvm_mmu_invlpg(vcpu, to_svm(vcpu)->vmcb->control.exit_info_1);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int emulate_on_interception(struct kvm_vcpu *vcpu)
+{
+ return kvm_emulate_instruction(vcpu, 0);
+}
+
+static int rsm_interception(struct kvm_vcpu *vcpu)
+{
+ return kvm_emulate_instruction_from_buffer(vcpu, rsm_ins_bytes, 2);
+}
+
+static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu,
+ unsigned long val)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long cr0 = vcpu->arch.cr0;
+ bool ret = false;
+
+ if (!is_guest_mode(vcpu) ||
+ (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
+ return false;
+
+ cr0 &= ~SVM_CR0_SELECTIVE_MASK;
+ val &= ~SVM_CR0_SELECTIVE_MASK;
+
+ if (cr0 ^ val) {
+ svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
+ ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
+ }
+
+ return ret;
+}
+
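+ /*
+ * With decode assists, EXITINFO1 bit 63 is set when the intercepted
+ * MOV CR used a GPR operand; bits 3:0 then identify the GPR.
+ */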
+#define CR_VALID (1ULL << 63)
+
+static int cr_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int reg, cr;
+ unsigned long val;
+ int err;
+
+ if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
+ return emulate_on_interception(vcpu);
+
+ if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
+ return emulate_on_interception(vcpu);
+
+ reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
+ if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
+ cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
+ else
+ cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
+
+ err = 0;
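+ /* Write (MOV-to-CR) exit codes start 16 above the read exit codes. */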
+ if (cr >= 16) { /* mov to cr */
+ cr -= 16;
+ val = kvm_register_read(vcpu, reg);
+ trace_kvm_cr_write(cr, val);
+ switch (cr) {
+ case 0:
+ if (!check_selective_cr0_intercepted(vcpu, val))
+ err = kvm_set_cr0(vcpu, val);
+ else
+ return 1;
+
+ break;
+ case 3:
+ err = kvm_set_cr3(vcpu, val);
+ break;
+ case 4:
+ err = kvm_set_cr4(vcpu, val);
+ break;
+ case 8:
+ err = kvm_set_cr8(vcpu, val);
+ break;
+ default:
+ WARN(1, "unhandled write to CR%d", cr);
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+ } else { /* mov from cr */
+ switch (cr) {
+ case 0:
+ val = kvm_read_cr0(vcpu);
+ break;
+ case 2:
+ val = vcpu->arch.cr2;
+ break;
+ case 3:
+ val = kvm_read_cr3(vcpu);
+ break;
+ case 4:
+ val = kvm_read_cr4(vcpu);
+ break;
+ case 8:
+ val = kvm_get_cr8(vcpu);
+ break;
+ default:
+ WARN(1, "unhandled read from CR%d", cr);
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+ kvm_register_write(vcpu, reg, val);
+ trace_kvm_cr_read(cr, val);
+ }
+ return kvm_complete_insn_gp(vcpu, err);
+}
+
+static int cr_trap(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long old_value, new_value;
+ unsigned int cr;
+ int ret = 0;
+
+ new_value = (unsigned long)svm->vmcb->control.exit_info_1;
+
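+ /* The CRn write-trap exit codes are contiguous, starting at CR0. */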
+ cr = svm->vmcb->control.exit_code - SVM_EXIT_CR0_WRITE_TRAP;
+ switch (cr) {
+ case 0:
+ old_value = kvm_read_cr0(vcpu);
+ svm_set_cr0(vcpu, new_value);
+
+ kvm_post_set_cr0(vcpu, old_value, new_value);
+ break;
+ case 4:
+ old_value = kvm_read_cr4(vcpu);
+ svm_set_cr4(vcpu, new_value);
+
+ kvm_post_set_cr4(vcpu, old_value, new_value);
+ break;
+ case 8:
+ ret = kvm_set_cr8(vcpu, new_value);
+ break;
+ default:
+ WARN(1, "unhandled CR%d write trap", cr);
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ return kvm_complete_insn_gp(vcpu, ret);
+}
+
+static int dr_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int reg, dr;
+ int err = 0;
+
+ /*
+ * SEV-ES intercepts DR7 only to disable guest debugging, and the guest
+ * issues a VMGEXIT for DR7 writes only. KVM cannot change DR7 (it is
+ * always swapped as a type 'A' register), so return early.
+ */
+ if (sev_es_guest(vcpu->kvm))
+ return 1;
+
+ if (vcpu->guest_debug == 0) {
+ /*
+ * No more DR vmexits; force a reload of the debug registers
+ * and reenter on this instruction. The next vmexit will
+ * retrieve the full state of the debug registers.
+ */
+ clr_dr_intercepts(svm);
+ vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
+ return 1;
+ }
+
+ if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
+ return emulate_on_interception(vcpu);
+
+ reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
+ dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
+ if (dr >= 16) { /* mov to DRn */
+ dr -= 16;
+ err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg));
+ } else {
+ kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr));
+ }
+
+ return kvm_complete_insn_gp(vcpu, err);
+}
+
+static int cr8_write_interception(struct kvm_vcpu *vcpu)
+{
+ int r;
+
+ u8 cr8_prev = kvm_get_cr8(vcpu);
+ /* instruction emulation calls kvm_set_cr8() */
+ r = cr_interception(vcpu);
+ if (lapic_in_kernel(vcpu))
+ return r;
+ if (cr8_prev <= kvm_get_cr8(vcpu))
+ return r;
+ vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
+ return 0;
+}
+
+static int efer_trap(struct kvm_vcpu *vcpu)
+{
+ struct msr_data msr_info;
+ int ret;
+
+ /*
+ * Clear the EFER_SVME bit from EFER. The SVM code always sets this
+ * bit in svm_set_efer(), but __kvm_valid_efer() checks it against
+ * whether the guest has X86_FEATURE_SVM - this avoids a failure if
+ * the guest doesn't have X86_FEATURE_SVM.
+ */
+ msr_info.host_initiated = false;
+ msr_info.index = MSR_EFER;
+ msr_info.data = to_svm(vcpu)->vmcb->control.exit_info_1 & ~EFER_SVME;
+ ret = kvm_set_msr_common(vcpu, &msr_info);
+
+ return kvm_complete_insn_gp(vcpu, ret);
+}
+
+static int svm_get_feature_msr(u32 msr, u64 *data)
+{
+ *data = 0;
+
+ switch (msr) {
+ case MSR_AMD64_DE_CFG:
+ if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC))
+ *data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE;
+ break;
+ default:
+ return KVM_MSR_RET_UNSUPPORTED;
+ }
+
+ return 0;
+}
+
+static bool sev_es_prevent_msr_access(struct kvm_vcpu *vcpu,
+ struct msr_data *msr_info)
+{
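+ /*
+ * Once guest state is protected, MSRs that are not intercepted live in
+ * the encrypted VMSA, so KVM cannot read or write them on the guest's
+ * behalf.
+ */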
+ return sev_es_guest(vcpu->kvm) && vcpu->arch.guest_state_protected &&
+ msr_info->index != MSR_IA32_XSS &&
+ !msr_write_intercepted(vcpu, msr_info->index);
+}
+
+static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (sev_es_prevent_msr_access(vcpu, msr_info)) {
+ msr_info->data = 0;
+ return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
+ }
+
+ switch (msr_info->index) {
+ case MSR_AMD64_TSC_RATIO:
+ if (!msr_info->host_initiated &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR))
+ return 1;
+ msr_info->data = svm->tsc_ratio_msr;
+ break;
+ case MSR_STAR:
+ msr_info->data = svm->vmcb01.ptr->save.star;
+ break;
+#ifdef CONFIG_X86_64
+ case MSR_LSTAR:
+ msr_info->data = svm->vmcb01.ptr->save.lstar;
+ break;
+ case MSR_CSTAR:
+ msr_info->data = svm->vmcb01.ptr->save.cstar;
+ break;
+ case MSR_GS_BASE:
+ msr_info->data = svm->vmcb01.ptr->save.gs.base;
+ break;
+ case MSR_FS_BASE:
+ msr_info->data = svm->vmcb01.ptr->save.fs.base;
+ break;
+ case MSR_KERNEL_GS_BASE:
+ msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base;
+ break;
+ case MSR_SYSCALL_MASK:
+ msr_info->data = svm->vmcb01.ptr->save.sfmask;
+ break;
+#endif
+ case MSR_IA32_SYSENTER_CS:
+ msr_info->data = svm->vmcb01.ptr->save.sysenter_cs;
+ break;
+ case MSR_IA32_SYSENTER_EIP:
+ msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip;
+ if (guest_cpuid_is_intel_compatible(vcpu))
+ msr_info->data |= (u64)svm->sysenter_eip_hi << 32;
+ break;
+ case MSR_IA32_SYSENTER_ESP:
+ msr_info->data = svm->vmcb01.ptr->save.sysenter_esp;
+ if (guest_cpuid_is_intel_compatible(vcpu))
+ msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
+ break;
+ case MSR_IA32_S_CET:
+ msr_info->data = svm->vmcb->save.s_cet;
+ break;
+ case MSR_IA32_INT_SSP_TAB:
+ msr_info->data = svm->vmcb->save.isst_addr;
+ break;
+ case MSR_KVM_INTERNAL_GUEST_SSP:
+ msr_info->data = svm->vmcb->save.ssp;
+ break;
+ case MSR_TSC_AUX:
+ msr_info->data = svm->tsc_aux;
+ break;
+ case MSR_IA32_DEBUGCTLMSR:
+ msr_info->data = svm->vmcb->save.dbgctl;
+ break;
+ case MSR_IA32_LASTBRANCHFROMIP:
+ msr_info->data = svm->vmcb->save.br_from;
+ break;
+ case MSR_IA32_LASTBRANCHTOIP:
+ msr_info->data = svm->vmcb->save.br_to;
+ break;
+ case MSR_IA32_LASTINTFROMIP:
+ msr_info->data = svm->vmcb->save.last_excp_from;
+ break;
+ case MSR_IA32_LASTINTTOIP:
+ msr_info->data = svm->vmcb->save.last_excp_to;
+ break;
+ case MSR_VM_HSAVE_PA:
+ msr_info->data = svm->nested.hsave_msr;
+ break;
+ case MSR_VM_CR:
+ msr_info->data = svm->nested.vm_cr_msr;
+ break;
+ case MSR_IA32_SPEC_CTRL:
+ if (!msr_info->host_initiated &&
+ !guest_has_spec_ctrl_msr(vcpu))
+ return 1;
+
+ if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+ msr_info->data = svm->vmcb->save.spec_ctrl;
+ else
+ msr_info->data = svm->spec_ctrl;
+ break;
+ case MSR_AMD64_VIRT_SPEC_CTRL:
+ if (!msr_info->host_initiated &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_VIRT_SSBD))
+ return 1;
+
+ msr_info->data = svm->virt_spec_ctrl;
+ break;
+ case MSR_F15H_IC_CFG: {
+
+ int family, model;
+
+ family = guest_cpuid_family(vcpu);
+ model = guest_cpuid_model(vcpu);
+
+ if (family < 0 || model < 0)
+ return kvm_get_msr_common(vcpu, msr_info);
+
+ msr_info->data = 0;
+
+ if (family == 0x15 &&
+ (model >= 0x2 && model < 0x20))
+ msr_info->data = 0x1E;
+ }
+ break;
+ case MSR_AMD64_DE_CFG:
+ msr_info->data = svm->msr_decfg;
+ break;
+ default:
+ return kvm_get_msr_common(vcpu, msr_info);
+ }
+ return 0;
+}
+
+static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb))
+ return kvm_complete_insn_gp(vcpu, err);
+
+ svm_vmgexit_inject_exception(svm, X86_TRAP_GP);
+ return 1;
+}
+
+static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int svm_dis, chg_mask;
+
+ if (data & ~SVM_VM_CR_VALID_MASK)
+ return 1;
+
+ chg_mask = SVM_VM_CR_VALID_MASK;
+
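+ /* Once SVMDIS is set, the SVM lock and disable bits can no longer be changed. */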
+ if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
+ chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
+
+ svm->nested.vm_cr_msr &= ~chg_mask;
+ svm->nested.vm_cr_msr |= (data & chg_mask);
+
+ svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
+
+ /* check for svm_disable while efer.svme is set */
+ if (svm_dis && (vcpu->arch.efer & EFER_SVME))
+ return 1;
+
+ return 0;
+}
+
+static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int ret = 0;
+
+ u32 ecx = msr->index;
+ u64 data = msr->data;
+
+ if (sev_es_prevent_msr_access(vcpu, msr))
+ return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
+
+ switch (ecx) {
+ case MSR_AMD64_TSC_RATIO:
+
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR)) {
+
+ if (!msr->host_initiated)
+ return 1;
+ /*
+ * In case TSC scaling is not enabled, always
+ * leave this MSR at the default value.
+ *
+ * Due to a bug in QEMU 6.2.0, it tries to set this MSR
+ * to 0 if TSC scaling is not enabled. Ignore that value
+ * as well.
+ */
+ if (data != 0 && data != svm->tsc_ratio_msr)
+ return 1;
+ break;
+ }
+
+ if (data & SVM_TSC_RATIO_RSVD)
+ return 1;
+
+ svm->tsc_ratio_msr = data;
+
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR) &&
+ is_guest_mode(vcpu))
+ nested_svm_update_tsc_ratio_msr(vcpu);
+
+ break;
+ case MSR_IA32_CR_PAT:
+ ret = kvm_set_msr_common(vcpu, msr);
+ if (ret)
+ break;
+
+ svm->vmcb01.ptr->save.g_pat = data;
+ if (is_guest_mode(vcpu))
+ nested_vmcb02_compute_g_pat(svm);
+ vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
+ break;
+ case MSR_IA32_SPEC_CTRL:
+ if (!msr->host_initiated &&
+ !guest_has_spec_ctrl_msr(vcpu))
+ return 1;
+
+ if (kvm_spec_ctrl_test_value(data))
+ return 1;
+
+ if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+ svm->vmcb->save.spec_ctrl = data;
+ else
+ svm->spec_ctrl = data;
+ if (!data)
+ break;
+
+ /*
+ * For non-nested:
+ * When it's written (to non-zero) for the first time, pass
+ * it through.
+ *
+ * For nested:
+ * The handling of the MSR bitmap for L2 guests is done in
+ * nested_svm_merge_msrpm().
+ * We update the L1 MSR bit as well since it will end up
+ * touching the MSR anyway now.
+ */
+ svm_disable_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);
+ break;
+ case MSR_AMD64_VIRT_SPEC_CTRL:
+ if (!msr->host_initiated &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_VIRT_SSBD))
+ return 1;
+
+ if (data & ~SPEC_CTRL_SSBD)
+ return 1;
+
+ svm->virt_spec_ctrl = data;
+ break;
+ case MSR_STAR:
+ svm->vmcb01.ptr->save.star = data;
+ break;
+#ifdef CONFIG_X86_64
+ case MSR_LSTAR:
+ svm->vmcb01.ptr->save.lstar = data;
+ break;
+ case MSR_CSTAR:
+ svm->vmcb01.ptr->save.cstar = data;
+ break;
+ case MSR_GS_BASE:
+ svm->vmcb01.ptr->save.gs.base = data;
+ break;
+ case MSR_FS_BASE:
+ svm->vmcb01.ptr->save.fs.base = data;
+ break;
+ case MSR_KERNEL_GS_BASE:
+ svm->vmcb01.ptr->save.kernel_gs_base = data;
+ break;
+ case MSR_SYSCALL_MASK:
+ svm->vmcb01.ptr->save.sfmask = data;
+ break;
+#endif
+ case MSR_IA32_SYSENTER_CS:
+ svm->vmcb01.ptr->save.sysenter_cs = data;
+ break;
+ case MSR_IA32_SYSENTER_EIP:
+ svm->vmcb01.ptr->save.sysenter_eip = (u32)data;
+ /*
+ * We only intercept the MSR_IA32_SYSENTER_{EIP|ESP} MSRs
+ * when we spoof an Intel vendor ID (for cross-vendor migration).
+ * In that case we use this intercept to track the high
+ * 32-bit halves of these MSRs in order to support Intel's
+ * implementation of SYSENTER/SYSEXIT.
+ */
+ svm->sysenter_eip_hi = guest_cpuid_is_intel_compatible(vcpu) ? (data >> 32) : 0;
+ break;
+ case MSR_IA32_SYSENTER_ESP:
+ svm->vmcb01.ptr->save.sysenter_esp = (u32)data;
+ svm->sysenter_esp_hi = guest_cpuid_is_intel_compatible(vcpu) ? (data >> 32) : 0;
+ break;
+ case MSR_IA32_S_CET:
+ svm->vmcb->save.s_cet = data;
+ vmcb_mark_dirty(svm->vmcb01.ptr, VMCB_CET);
+ break;
+ case MSR_IA32_INT_SSP_TAB:
+ svm->vmcb->save.isst_addr = data;
+ vmcb_mark_dirty(svm->vmcb01.ptr, VMCB_CET);
+ break;
+ case MSR_KVM_INTERNAL_GUEST_SSP:
+ svm->vmcb->save.ssp = data;
+ vmcb_mark_dirty(svm->vmcb01.ptr, VMCB_CET);
+ break;
+ case MSR_TSC_AUX:
+ /*
+ * TSC_AUX is always virtualized for SEV-ES guests when the
+ * feature is available. The user return MSR support is not
+ * required in this case because TSC_AUX is restored on #VMEXIT
+ * from the host save area.
+ */
+ if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && sev_es_guest(vcpu->kvm))
+ break;
+
+ /*
+ * TSC_AUX is usually changed only during boot and never read
+ * directly. Intercept TSC_AUX and switch it via user return.
+ */
+ preempt_disable();
+ ret = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
+ preempt_enable();
+ if (ret)
+ break;
+
+ svm->tsc_aux = data;
+ break;
+ case MSR_IA32_DEBUGCTLMSR:
+ if (!lbrv) {
+ kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
+ break;
+ }
+
+ /*
+ * Suppress BTF as KVM doesn't virtualize BTF, but there's no
+ * way to communicate lack of support to the guest.
+ */
+ if (data & DEBUGCTLMSR_BTF) {
+ kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data);
+ data &= ~DEBUGCTLMSR_BTF;
+ }
+
+ if (data & DEBUGCTL_RESERVED_BITS)
+ return 1;
+
+ if (svm->vmcb->save.dbgctl == data)
+ break;
+
+ svm->vmcb->save.dbgctl = data;
+ vmcb_mark_dirty(svm->vmcb, VMCB_LBR);
+ svm_update_lbrv(vcpu);
+ break;
+ case MSR_VM_HSAVE_PA:
+ /*
+ * Old kernels did not validate the value written to
+ * MSR_VM_HSAVE_PA. Allow KVM_SET_MSR to set an invalid
+ * value to allow live migrating buggy or malicious guests
+ * originating from those kernels.
+ */
+ if (!msr->host_initiated && !page_address_valid(vcpu, data))
+ return 1;
+
+ svm->nested.hsave_msr = data & PAGE_MASK;
+ break;
+ case MSR_VM_CR:
+ return svm_set_vm_cr(vcpu, data);
+ case MSR_VM_IGNNE:
+ kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
+ break;
+ case MSR_AMD64_DE_CFG: {
+ u64 supported_de_cfg;
+
+ if (svm_get_feature_msr(ecx, &supported_de_cfg))
+ return 1;
+
+ if (data & ~supported_de_cfg)
+ return 1;
+
+ svm->msr_decfg = data;
+ break;
+ }
+ default:
+ return kvm_set_msr_common(vcpu, msr);
+ }
+ return ret;
+}
+
+static int msr_interception(struct kvm_vcpu *vcpu)
+{
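+ /* For MSR intercepts, EXITINFO1 is 1 for WRMSR and 0 for RDMSR. */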
+ if (to_svm(vcpu)->vmcb->control.exit_info_1)
+ return kvm_emulate_wrmsr(vcpu);
+ else
+ return kvm_emulate_rdmsr(vcpu);
+}
+
+static int interrupt_window_interception(struct kvm_vcpu *vcpu)
+{
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ svm_clear_vintr(to_svm(vcpu));
+
+ /*
+ * If not running nested, then for AVIC the only reason to end up here
+ * is ExtINTs. In this case AVIC was temporarily disabled in order to
+ * request the IRQ window, and it has to be re-enabled.
+ *
+ * If running nested, still remove the VM-wide AVIC inhibit to support
+ * the case in which the interrupt window was requested while the vCPU
+ * was not running nested.
+ *
+ * vCPUs that are still running nested will keep AVIC inhibited via the
+ * per-vCPU AVIC inhibition.
+ */
+ kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
+
+ ++vcpu->stat.irq_window_exits;
+ return 1;
+}
+
+static int pause_interception(struct kvm_vcpu *vcpu)
+{
+ bool in_kernel;
+ /*
+ * CPL is not made available for an SEV-ES guest, therefore
+ * vcpu->arch.preempted_in_kernel can never be true. Just
+ * set in_kernel to false as well.
+ */
+ in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0;
+
+ grow_ple_window(vcpu);
+
+ kvm_vcpu_on_spin(vcpu, in_kernel);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int invpcid_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long type;
+ gva_t gva;
+
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ /*
+ * For an INVPCID intercept:
+ * EXITINFO1 provides the linear address of the memory operand.
+ * EXITINFO2 provides the contents of the register operand.
+ */
+ type = svm->vmcb->control.exit_info_2;
+ gva = svm->vmcb->control.exit_info_1;
+
+ /*
+ * FIXME: Perform segment checks for 32-bit mode, and inject #SS if the
+ * stack segment is used. The intercept takes priority over all
+ * #GP checks except CPL>0, but somehow still generates a linear
+ * address? The APM is sorely lacking.
+ */
+ if (is_noncanonical_address(gva, vcpu, 0)) {
+ kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+ return 1;
+ }
+
+ return kvm_handle_invpcid(vcpu, type, gva);
+}
+
+static inline int complete_userspace_buslock(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * If userspace has NOT changed RIP, then KVM's ABI is to let the guest
+ * execute the bus-locking instruction. Set the bus lock counter to '1'
+ * to effectively step past the bus lock.
+ */
+ if (kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip))
+ svm->vmcb->control.bus_lock_counter = 1;
+
+ return 1;
+}
+
+static int bus_lock_exit(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
+ vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
+
+ vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu);
+ vcpu->arch.complete_userspace_io = complete_userspace_buslock;
+
+ if (is_guest_mode(vcpu))
+ svm->nested.ctl.bus_lock_rip = vcpu->arch.cui_linear_rip;
+
+ return 0;
+}
+
+static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
+ [SVM_EXIT_READ_CR0] = cr_interception,
+ [SVM_EXIT_READ_CR3] = cr_interception,
+ [SVM_EXIT_READ_CR4] = cr_interception,
+ [SVM_EXIT_READ_CR8] = cr_interception,
+ [SVM_EXIT_CR0_SEL_WRITE] = cr_interception,
+ [SVM_EXIT_WRITE_CR0] = cr_interception,
+ [SVM_EXIT_WRITE_CR3] = cr_interception,
+ [SVM_EXIT_WRITE_CR4] = cr_interception,
+ [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
+ [SVM_EXIT_READ_DR0] = dr_interception,
+ [SVM_EXIT_READ_DR1] = dr_interception,
+ [SVM_EXIT_READ_DR2] = dr_interception,
+ [SVM_EXIT_READ_DR3] = dr_interception,
+ [SVM_EXIT_READ_DR4] = dr_interception,
+ [SVM_EXIT_READ_DR5] = dr_interception,
+ [SVM_EXIT_READ_DR6] = dr_interception,
+ [SVM_EXIT_READ_DR7] = dr_interception,
+ [SVM_EXIT_WRITE_DR0] = dr_interception,
+ [SVM_EXIT_WRITE_DR1] = dr_interception,
+ [SVM_EXIT_WRITE_DR2] = dr_interception,
+ [SVM_EXIT_WRITE_DR3] = dr_interception,
+ [SVM_EXIT_WRITE_DR4] = dr_interception,
+ [SVM_EXIT_WRITE_DR5] = dr_interception,
+ [SVM_EXIT_WRITE_DR6] = dr_interception,
+ [SVM_EXIT_WRITE_DR7] = dr_interception,
+ [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
+ [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
+ [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
+ [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
+ [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
+ [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception,
+ [SVM_EXIT_EXCP_BASE + GP_VECTOR] = gp_interception,
+ [SVM_EXIT_INTR] = intr_interception,
+ [SVM_EXIT_NMI] = nmi_interception,
+ [SVM_EXIT_SMI] = smi_interception,
+ [SVM_EXIT_VINTR] = interrupt_window_interception,
+ [SVM_EXIT_RDPMC] = kvm_emulate_rdpmc,
+ [SVM_EXIT_CPUID] = kvm_emulate_cpuid,
+ [SVM_EXIT_IRET] = iret_interception,
+ [SVM_EXIT_INVD] = kvm_emulate_invd,
+ [SVM_EXIT_PAUSE] = pause_interception,
+ [SVM_EXIT_HLT] = kvm_emulate_halt,
+ [SVM_EXIT_INVLPG] = invlpg_interception,
+ [SVM_EXIT_INVLPGA] = invlpga_interception,
+ [SVM_EXIT_IOIO] = io_interception,
+ [SVM_EXIT_MSR] = msr_interception,
+ [SVM_EXIT_TASK_SWITCH] = task_switch_interception,
+ [SVM_EXIT_SHUTDOWN] = shutdown_interception,
+ [SVM_EXIT_VMRUN] = vmrun_interception,
+ [SVM_EXIT_VMMCALL] = kvm_emulate_hypercall,
+ [SVM_EXIT_VMLOAD] = vmload_interception,
+ [SVM_EXIT_VMSAVE] = vmsave_interception,
+ [SVM_EXIT_STGI] = stgi_interception,
+ [SVM_EXIT_CLGI] = clgi_interception,
+ [SVM_EXIT_SKINIT] = skinit_interception,
+ [SVM_EXIT_RDTSCP] = kvm_handle_invalid_op,
+ [SVM_EXIT_WBINVD] = kvm_emulate_wbinvd,
+ [SVM_EXIT_MONITOR] = kvm_emulate_monitor,
+ [SVM_EXIT_MWAIT] = kvm_emulate_mwait,
+ [SVM_EXIT_XSETBV] = kvm_emulate_xsetbv,
+ [SVM_EXIT_RDPRU] = kvm_handle_invalid_op,
+ [SVM_EXIT_EFER_WRITE_TRAP] = efer_trap,
+ [SVM_EXIT_CR0_WRITE_TRAP] = cr_trap,
+ [SVM_EXIT_CR4_WRITE_TRAP] = cr_trap,
+ [SVM_EXIT_CR8_WRITE_TRAP] = cr_trap,
+ [SVM_EXIT_INVPCID] = invpcid_interception,
+ [SVM_EXIT_IDLE_HLT] = kvm_emulate_halt,
+ [SVM_EXIT_NPF] = npf_interception,
+ [SVM_EXIT_BUS_LOCK] = bus_lock_exit,
+ [SVM_EXIT_RSM] = rsm_interception,
+ [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception,
+ [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception,
+#ifdef CONFIG_KVM_AMD_SEV
+ [SVM_EXIT_VMGEXIT] = sev_handle_vmgexit,
+#endif
+};
+
+static void dump_vmcb(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ struct vmcb_save_area *save = &svm->vmcb->save;
+ struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save;
+ char *vm_type;
+
+ if (!dump_invalid_vmcb) {
+ pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
+ return;
+ }
+
+ guard(mutex)(&vmcb_dump_mutex);
+
+ vm_type = sev_snp_guest(vcpu->kvm) ? "SEV-SNP" :
+ sev_es_guest(vcpu->kvm) ? "SEV-ES" :
+ sev_guest(vcpu->kvm) ? "SEV" : "SVM";
+
+ pr_err("%s vCPU%u VMCB %p, last attempted VMRUN on CPU %d\n",
+ vm_type, vcpu->vcpu_id, svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu);
+ pr_err("VMCB Control Area:\n");
+ pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff);
+ pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16);
+ pr_err("%-20s%04x\n", "dr_read:", control->intercepts[INTERCEPT_DR] & 0xffff);
+ pr_err("%-20s%04x\n", "dr_write:", control->intercepts[INTERCEPT_DR] >> 16);
+ pr_err("%-20s%08x\n", "exceptions:", control->intercepts[INTERCEPT_EXCEPTION]);
+ pr_err("%-20s%08x %08x\n", "intercepts:",
+ control->intercepts[INTERCEPT_WORD3],
+ control->intercepts[INTERCEPT_WORD4]);
+ pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
+ pr_err("%-20s%d\n", "pause filter threshold:",
+ control->pause_filter_thresh);
+ pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
+ pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
+ pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
+ pr_err("%-20s%d\n", "asid:", control->asid);
+ pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
+ pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
+ pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
+ pr_err("%-20s%08x\n", "int_state:", control->int_state);
+ pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
+ pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
+ pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
+ pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
+ pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
+ pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
+ pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
+ pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
+ pr_err("%-20s%016llx\n", "ghcb:", control->ghcb_gpa);
+ pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
+ pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
+ pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext);
+ pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
+ pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
+ pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
+ pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
+ pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa);
+ pr_err("%-20s%016llx\n", "allowed_sev_features:", control->allowed_sev_features);
+ pr_err("%-20s%016llx\n", "guest_sev_features:", control->guest_sev_features);
+
+ if (sev_es_guest(vcpu->kvm)) {
+ save = sev_decrypt_vmsa(vcpu);
+ if (!save)
+ goto no_vmsa;
+
+ save01 = save;
+ }
+
+ pr_err("VMCB State Save Area:\n");
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "es:",
+ save->es.selector, save->es.attrib,
+ save->es.limit, save->es.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "cs:",
+ save->cs.selector, save->cs.attrib,
+ save->cs.limit, save->cs.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "ss:",
+ save->ss.selector, save->ss.attrib,
+ save->ss.limit, save->ss.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "ds:",
+ save->ds.selector, save->ds.attrib,
+ save->ds.limit, save->ds.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "fs:",
+ save01->fs.selector, save01->fs.attrib,
+ save01->fs.limit, save01->fs.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "gs:",
+ save01->gs.selector, save01->gs.attrib,
+ save01->gs.limit, save01->gs.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "gdtr:",
+ save->gdtr.selector, save->gdtr.attrib,
+ save->gdtr.limit, save->gdtr.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "ldtr:",
+ save01->ldtr.selector, save01->ldtr.attrib,
+ save01->ldtr.limit, save01->ldtr.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "idtr:",
+ save->idtr.selector, save->idtr.attrib,
+ save->idtr.limit, save->idtr.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "tr:",
+ save01->tr.selector, save01->tr.attrib,
+ save01->tr.limit, save01->tr.base);
+ pr_err("vmpl: %d cpl: %d efer: %016llx\n",
+ save->vmpl, save->cpl, save->efer);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "cr0:", save->cr0, "cr2:", save->cr2);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "cr3:", save->cr3, "cr4:", save->cr4);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "dr6:", save->dr6, "dr7:", save->dr7);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "rip:", save->rip, "rflags:", save->rflags);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "rsp:", save->rsp, "rax:", save->rax);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "s_cet:", save->s_cet, "ssp:", save->ssp);
+ pr_err("%-15s %016llx\n",
+ "isst_addr:", save->isst_addr);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "star:", save01->star, "lstar:", save01->lstar);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "cstar:", save01->cstar, "sfmask:", save01->sfmask);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "kernel_gs_base:", save01->kernel_gs_base,
+ "sysenter_cs:", save01->sysenter_cs);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "sysenter_esp:", save01->sysenter_esp,
+ "sysenter_eip:", save01->sysenter_eip);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "br_from:", save->br_from, "br_to:", save->br_to);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "excp_from:", save->last_excp_from,
+ "excp_to:", save->last_excp_to);
+
+ if (sev_es_guest(vcpu->kvm)) {
+ struct sev_es_save_area *vmsa = (struct sev_es_save_area *)save;
+
+ pr_err("%-15s %016llx\n",
+ "sev_features", vmsa->sev_features);
+
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "pl0_ssp:", vmsa->pl0_ssp, "pl1_ssp:", vmsa->pl1_ssp);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "pl2_ssp:", vmsa->pl2_ssp, "pl3_ssp:", vmsa->pl3_ssp);
+ pr_err("%-15s %016llx\n",
+ "u_cet:", vmsa->u_cet);
+
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "rax:", vmsa->rax, "rbx:", vmsa->rbx);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "rcx:", vmsa->rcx, "rdx:", vmsa->rdx);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "rsi:", vmsa->rsi, "rdi:", vmsa->rdi);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "rbp:", vmsa->rbp, "rsp:", vmsa->rsp);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "r8:", vmsa->r8, "r9:", vmsa->r9);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "r10:", vmsa->r10, "r11:", vmsa->r11);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "r12:", vmsa->r12, "r13:", vmsa->r13);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "r14:", vmsa->r14, "r15:", vmsa->r15);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "xcr0:", vmsa->xcr0, "xss:", vmsa->xss);
+ } else {
+ pr_err("%-15s %016llx %-13s %016lx\n",
+ "rax:", save->rax, "rbx:",
+ vcpu->arch.regs[VCPU_REGS_RBX]);
+ pr_err("%-15s %016lx %-13s %016lx\n",
+ "rcx:", vcpu->arch.regs[VCPU_REGS_RCX],
+ "rdx:", vcpu->arch.regs[VCPU_REGS_RDX]);
+ pr_err("%-15s %016lx %-13s %016lx\n",
+ "rsi:", vcpu->arch.regs[VCPU_REGS_RSI],
+ "rdi:", vcpu->arch.regs[VCPU_REGS_RDI]);
+ pr_err("%-15s %016lx %-13s %016llx\n",
+ "rbp:", vcpu->arch.regs[VCPU_REGS_RBP],
+ "rsp:", save->rsp);
+#ifdef CONFIG_X86_64
+ pr_err("%-15s %016lx %-13s %016lx\n",
+ "r8:", vcpu->arch.regs[VCPU_REGS_R8],
+ "r9:", vcpu->arch.regs[VCPU_REGS_R9]);
+ pr_err("%-15s %016lx %-13s %016lx\n",
+ "r10:", vcpu->arch.regs[VCPU_REGS_R10],
+ "r11:", vcpu->arch.regs[VCPU_REGS_R11]);
+ pr_err("%-15s %016lx %-13s %016lx\n",
+ "r12:", vcpu->arch.regs[VCPU_REGS_R12],
+ "r13:", vcpu->arch.regs[VCPU_REGS_R13]);
+ pr_err("%-15s %016lx %-13s %016lx\n",
+ "r14:", vcpu->arch.regs[VCPU_REGS_R14],
+ "r15:", vcpu->arch.regs[VCPU_REGS_R15]);
+#endif
+ }
+
+no_vmsa:
+ if (sev_es_guest(vcpu->kvm))
+ sev_free_decrypted_vmsa(vcpu, save);
+}
+
+static bool svm_check_exit_valid(u64 exit_code)
+{
+ return (exit_code < ARRAY_SIZE(svm_exit_handlers) &&
+ svm_exit_handlers[exit_code]);
+}
+
+static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
+{
+ dump_vmcb(vcpu);
+ kvm_prepare_unexpected_reason_exit(vcpu, exit_code);
+ return 0;
+}
+
+int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
+{
+ if (!svm_check_exit_valid(exit_code))
+ return svm_handle_invalid_exit(vcpu, exit_code);
+
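+ /*
+ * With retpolines, direct calls to the most common exit handlers are
+ * cheaper than an indirect call through svm_exit_handlers[].
+ */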
+#ifdef CONFIG_MITIGATION_RETPOLINE
+ if (exit_code == SVM_EXIT_MSR)
+ return msr_interception(vcpu);
+ else if (exit_code == SVM_EXIT_VINTR)
+ return interrupt_window_interception(vcpu);
+ else if (exit_code == SVM_EXIT_INTR)
+ return intr_interception(vcpu);
+ else if (exit_code == SVM_EXIT_HLT || exit_code == SVM_EXIT_IDLE_HLT)
+ return kvm_emulate_halt(vcpu);
+ else if (exit_code == SVM_EXIT_NPF)
+ return npf_interception(vcpu);
+#ifdef CONFIG_KVM_AMD_SEV
+ else if (exit_code == SVM_EXIT_VMGEXIT)
+ return sev_handle_vmgexit(vcpu);
+#endif
+#endif
+ return svm_exit_handlers[exit_code](vcpu);
+}
+
+static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
+ u64 *info1, u64 *info2,
+ u32 *intr_info, u32 *error_code)
+{
+ struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
+
+ *reason = control->exit_code;
+ *info1 = control->exit_info_1;
+ *info2 = control->exit_info_2;
+ *intr_info = control->exit_int_info;
+ if ((*intr_info & SVM_EXITINTINFO_VALID) &&
+ (*intr_info & SVM_EXITINTINFO_VALID_ERR))
+ *error_code = control->exit_int_info_err;
+ else
+ *error_code = 0;
+}
+
+static void svm_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info,
+ u32 *error_code)
+{
+ struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
+
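+ /* EVENTINJ uses the same layout as EXITINTINFO, so the same masks apply. */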
+ *intr_info = control->event_inj;
+
+ if ((*intr_info & SVM_EXITINTINFO_VALID) &&
+ (*intr_info & SVM_EXITINTINFO_VALID_ERR))
+ *error_code = control->event_inj_err;
+ else
+ *error_code = 0;
+
+}
+
+static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_run *kvm_run = vcpu->run;
+ u32 exit_code = svm->vmcb->control.exit_code;
+
+ /* SEV-ES guests must use the CR write traps to track CR registers. */
+ if (!sev_es_guest(vcpu->kvm)) {
+ if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE))
+ vcpu->arch.cr0 = svm->vmcb->save.cr0;
+ if (npt_enabled)
+ vcpu->arch.cr3 = svm->vmcb->save.cr3;
+ }
+
+ if (is_guest_mode(vcpu)) {
+ int vmexit;
+
+ trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM);
+
+ vmexit = nested_svm_exit_special(svm);
+
+ if (vmexit == NESTED_EXIT_CONTINUE)
+ vmexit = nested_svm_exit_handled(svm);
+
+ if (vmexit == NESTED_EXIT_DONE)
+ return 1;
+ }
+
+ if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
+ kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+ kvm_run->fail_entry.hardware_entry_failure_reason
+ = svm->vmcb->control.exit_code;
+ kvm_run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
+ dump_vmcb(vcpu);
+ return 0;
+ }
+
+ if (exit_fastpath != EXIT_FASTPATH_NONE)
+ return 1;
+
+ return svm_invoke_exit_handler(vcpu, exit_code);
+}
+
+static int pre_svm_run(struct kvm_vcpu *vcpu)
+{
+ struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * If the previous vmrun of the vmcb occurred on a different physical
+ * cpu, then mark the vmcb dirty and assign a new asid. Hardware's
+ * vmcb clean bits are per logical CPU, as are KVM's asid assignments.
+ */
+ if (unlikely(svm->current_vmcb->cpu != vcpu->cpu)) {
+ svm->current_vmcb->asid_generation = 0;
+ vmcb_mark_all_dirty(svm->vmcb);
+ svm->current_vmcb->cpu = vcpu->cpu;
+ }
+
+ if (sev_guest(vcpu->kvm))
+ return pre_sev_run(svm, vcpu->cpu);
+
+ /* FIXME: handle wraparound of asid_generation */
+ if (svm->current_vmcb->asid_generation != sd->asid_generation)
+ new_asid(svm, sd);
+
+ return 0;
+}
+
+static void svm_inject_nmi(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
+
+ if (svm->nmi_l1_to_l2)
+ return;
+
+ /*
+ * No need to manually track NMI masking when vNMI is enabled, hardware
+ * automatically sets V_NMI_BLOCKING_MASK as appropriate, including the
+ * case where software directly injects an NMI.
+ */
+ if (!is_vnmi_enabled(svm)) {
+ svm->nmi_masked = true;
+ svm_set_iret_intercept(svm);
+ }
+ ++vcpu->stat.nmi_injections;
+}
+
+static bool svm_is_vnmi_pending(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!is_vnmi_enabled(svm))
+ return false;
+
+ return !!(svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK);
+}
+
+static bool svm_set_vnmi_pending(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!is_vnmi_enabled(svm))
+ return false;
+
+ if (svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK)
+ return false;
+
+ svm->vmcb->control.int_ctl |= V_NMI_PENDING_MASK;
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
+
+ /*
+ * Because the pending NMI is serviced by hardware, KVM can't know when
+ * the NMI is "injected", but for all intents and purposes, passing the
+ * NMI off to hardware counts as injection.
+ */
+ ++vcpu->stat.nmi_injections;
+
+ return true;
+}
+
+static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
+{
+ struct kvm_queued_interrupt *intr = &vcpu->arch.interrupt;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 type;
+
+ if (intr->soft) {
+ if (svm_update_soft_interrupt_rip(vcpu, intr->nr))
+ return;
+
+ type = SVM_EVTINJ_TYPE_SOFT;
+ } else {
+ type = SVM_EVTINJ_TYPE_INTR;
+ }
+
+ trace_kvm_inj_virq(intr->nr, intr->soft, reinjected);
+ ++vcpu->stat.irq_injections;
+
+ svm->vmcb->control.event_inj = intr->nr | SVM_EVTINJ_VALID | type;
+}
+
+void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
+ int trig_mode, int vector)
+{
+ /*
+ * apic->apicv_active must be read after vcpu->mode.
+ * Pairs with smp_store_release in vcpu_enter_guest.
+ */
+ bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE);
+
+ /* Note, this is called iff the local APIC is in-kernel. */
+ if (!READ_ONCE(vcpu->arch.apic->apicv_active)) {
+ /* Process the interrupt via kvm_check_and_inject_events(). */
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_vcpu_kick(vcpu);
+ return;
+ }
+
+ trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
+ if (in_guest_mode) {
+ /*
+ * Signal the doorbell to tell hardware to inject the IRQ. If
+ * the vCPU exits the guest before the doorbell chimes, hardware
+ * will automatically process AVIC interrupts at the next VMRUN.
+ */
+ avic_ring_doorbell(vcpu);
+ } else {
+ /*
+ * Wake the vCPU if it was blocking. KVM will then detect the
+ * pending IRQ when checking if the vCPU has a wake event.
+ */
+ kvm_vcpu_wake_up(vcpu);
+ }
+}
+
+static void svm_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
+ int trig_mode, int vector)
+{
+ kvm_lapic_set_irr(vector, apic);
+
+ /*
+ * Pairs with the smp_mb_*() after setting vcpu->guest_mode in
+ * vcpu_enter_guest() to ensure the write to the vIRR is ordered before
+ * the read of guest_mode. This guarantees that either VMRUN will see
+ * and process the new vIRR entry, or that svm_complete_interrupt_delivery
+ * will signal the doorbell if the CPU has already entered the guest.
+ */
+ smp_mb__after_atomic();
+ svm_complete_interrupt_delivery(apic->vcpu, delivery_mode, trig_mode, vector);
+}
+
+static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * SEV-ES guests must always keep the CR intercepts cleared. CR
+ * tracking is done using the CR write traps.
+ */
+ if (sev_es_guest(vcpu->kvm))
+ return;
+
+ if (nested_svm_virtualize_tpr(vcpu))
+ return;
+
+ svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
+
+ if (irr == -1)
+ return;
+
+ if (tpr >= irr)
+ svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
+}
+
+static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (is_vnmi_enabled(svm))
+ return svm->vmcb->control.int_ctl & V_NMI_BLOCKING_MASK;
+ else
+ return svm->nmi_masked;
+}
+
+static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (is_vnmi_enabled(svm)) {
+ if (masked)
+ svm->vmcb->control.int_ctl |= V_NMI_BLOCKING_MASK;
+ else
+ svm->vmcb->control.int_ctl &= ~V_NMI_BLOCKING_MASK;
+
+ } else {
+ svm->nmi_masked = masked;
+ if (masked)
+ svm_set_iret_intercept(svm);
+ else
+ svm_clr_iret_intercept(svm);
+ }
+}
+
+bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb;
+
+ if (!gif_set(svm))
+ return true;
+
+ if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
+ return false;
+
+ if (svm_get_nmi_mask(vcpu))
+ return true;
+
+ return vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK;
+}
+
+static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ if (svm->nested.nested_run_pending)
+ return -EBUSY;
+
+ if (svm_nmi_blocked(vcpu))
+ return 0;
+
+ /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */
+ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
+ return -EBUSY;
+ return 1;
+}
+
+bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb;
+
+ if (!gif_set(svm))
+ return true;
+
+ if (is_guest_mode(vcpu)) {
+ /* As long as interrupts are being delivered... */
+ if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
+ ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF)
+ : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
+ return true;
+
+ /* ... vmexits aren't blocked by the interrupt shadow */
+ if (nested_exit_on_intr(svm))
+ return false;
+ } else {
+ if (!svm_get_if_flag(vcpu))
+ return true;
+ }
+
+ return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK);
+}
+
+static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (svm->nested.nested_run_pending)
+ return -EBUSY;
+
+ if (svm_interrupt_blocked(vcpu))
+ return 0;
+
+ /*
+ * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
+ * e.g. if the IRQ arrived asynchronously after checking nested events.
+ */
+ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm))
+ return -EBUSY;
+
+ return 1;
+}
+
+static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
+ * 1, because that's a separate STGI/VMRUN intercept. The next time we
+ * get that intercept, this function will be called again though and
+ * we'll get the vintr intercept. However, if the vGIF feature is
+ * enabled, the STGI interception will not occur. Enable the irq
+ * window under the assumption that the hardware will set the GIF.
+ */
+ if (vgif || gif_set(svm)) {
+ /*
+ * IRQ window is not needed when AVIC is enabled,
+ * unless we have pending ExtINT since it cannot be injected
+ * via AVIC. In such case, KVM needs to temporarily disable AVIC,
+ * and fallback to injecting IRQ via V_IRQ.
+ *
+ * If running nested, AVIC is already locally inhibited
+ * on this vCPU, therefore there is no need to request
+ * the VM wide AVIC inhibition.
+ */
+ if (!is_guest_mode(vcpu))
+ kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
+
+ svm_set_vintr(svm);
+ }
+}
+
+static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * If NMIs are outright masked, i.e. the vCPU is already handling an
+ * NMI, and KVM has not yet intercepted an IRET, then there is nothing
+ * more to do at this time as KVM has already enabled IRET intercepts.
+ * If KVM has already intercepted IRET, then single-step over the IRET,
+ * as NMIs aren't architecturally unmasked until the IRET completes.
+ *
+ * If vNMI is enabled, KVM should never request an NMI window if NMIs
+ * are masked, as KVM allows at most one to-be-injected NMI and one
+ * pending NMI. If two NMIs arrive simultaneously, KVM will inject one
+ * NMI and set V_NMI_PENDING for the other, but if and only if NMIs are
+ * unmasked. KVM _will_ request an NMI window in some situations, e.g.
+ * if the vCPU is in an STI shadow or if GIF=0, KVM can't immediately
+ * inject the NMI. In those situations, KVM needs to single-step over
+ * the STI shadow or intercept STGI.
+ */
+ if (svm_get_nmi_mask(vcpu)) {
+ WARN_ON_ONCE(is_vnmi_enabled(svm));
+
+ if (!svm->awaiting_iret_completion)
+ return; /* IRET will cause a vm exit */
+ }
+
+ /*
+ * SEV-ES guests are responsible for signaling when a vCPU is ready to
+ * receive a new NMI, as SEV-ES guests can't be single-stepped, i.e.
+ * KVM can't intercept and single-step IRET to detect when NMIs are
+ * unblocked (architecturally speaking). See SVM_VMGEXIT_NMI_COMPLETE.
+ *
+ * Note, GIF is guaranteed to be '1' for SEV-ES guests as hardware
+ * ignores SEV-ES guest writes to EFER.SVME *and* CLGI/STGI are not
+ * supported NAEs in the GHCB protocol.
+ */
+ if (sev_es_guest(vcpu->kvm))
+ return;
+
+ if (!gif_set(svm)) {
+ if (vgif)
+ svm_set_intercept(svm, INTERCEPT_STGI);
+ return; /* STGI will cause a vm exit */
+ }
+
+ /*
+ * Something prevents the NMI from being injected. Single-step over the
+ * blocking condition (IRET, exception injection, or interrupt shadow).
+ */
+ svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
+ svm->nmi_singlestep = true;
+ svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
+}
+
+static void svm_flush_tlb_asid(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * Unlike VMX, SVM doesn't provide a way to flush only NPT TLB entries.
+ * A TLB flush for the current ASID flushes both "host" and "guest" TLB
+ * entries, and thus is a superset of Hyper-V's fine grained flushing.
+ */
+ kvm_hv_vcpu_purge_flush_tlb(vcpu);
+
+ /*
+ * Flush only the current ASID even if the TLB flush was invoked via
+ * kvm_flush_remote_tlbs(). Although flushing remote TLBs requires all
+ * ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and
+ * unconditionally does a TLB flush on both nested VM-Enter and nested
+ * VM-Exit (via kvm_mmu_reset_context()).
+ */
+ if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
+ svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
+ else
+ svm->current_vmcb->asid_generation--;
+}
+
+static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
+{
+ hpa_t root_tdp = vcpu->arch.mmu->root.hpa;
+
+ /*
+ * When running on Hyper-V with EnlightenedNptTlb enabled, explicitly
+ * flush the NPT mappings via hypercall as flushing the ASID only
+ * affects virtual to physical mappings, it does not invalidate guest
+ * physical to host physical mappings.
+ */
+ if (svm_hv_is_enlightened_tlb_enabled(vcpu) && VALID_PAGE(root_tdp))
+ hyperv_flush_guest_mapping(root_tdp);
+
+ svm_flush_tlb_asid(vcpu);
+}
+
+static void svm_flush_tlb_all(struct kvm_vcpu *vcpu)
+{
+ /*
+ * When running on Hyper-V with EnlightenedNptTlb enabled, remote TLB
+ * flushes should be routed to hv_flush_remote_tlbs() without requesting
+ * a "regular" remote flush. Reaching this point means either there's
+ * a KVM bug or a prior hv_flush_remote_tlbs() call failed, both of
+ * which might be fatal to the guest. Yell, but try to recover.
+ */
+ if (WARN_ON_ONCE(svm_hv_is_enlightened_tlb_enabled(vcpu)))
+ hv_flush_remote_tlbs(vcpu->kvm);
+
+ svm_flush_tlb_asid(vcpu);
+}
+
+static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ invlpga(gva, svm->vmcb->control.asid);
+}
+
+static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (nested_svm_virtualize_tpr(vcpu))
+ return;
+
+ if (!svm_is_intercept(svm, INTERCEPT_CR8_WRITE)) {
+ int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
+ kvm_set_cr8(vcpu, cr8);
+ }
+}
+
+static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 cr8;
+
+ if (nested_svm_virtualize_tpr(vcpu))
+ return;
+
+ cr8 = kvm_get_cr8(vcpu);
+ svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
+ svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
+}
+
+static void svm_complete_soft_interrupt(struct kvm_vcpu *vcpu, u8 vector,
+ int type)
+{
+ bool is_exception = (type == SVM_EXITINTINFO_TYPE_EXEPT);
+ bool is_soft = (type == SVM_EXITINTINFO_TYPE_SOFT);
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * If NRIPS is enabled, KVM must snapshot the pre-VMRUN next_rip that's
+ * associated with the original soft exception/interrupt. next_rip is
+ * cleared on all exits that can occur while vectoring an event, so KVM
+ * needs to manually set next_rip for re-injection. Unlike the !nrips
+ * case below, this needs to be done if and only if KVM is re-injecting
+ * the same event, i.e. if the event is a soft exception/interrupt,
+ * otherwise next_rip is unused on VMRUN.
+ */
+ if (nrips && (is_soft || (is_exception && kvm_exception_is_soft(vector))) &&
+ kvm_is_linear_rip(vcpu, svm->soft_int_old_rip + svm->soft_int_csbase))
+ svm->vmcb->control.next_rip = svm->soft_int_next_rip;
+ /*
+ * If NRIPS isn't enabled, KVM must manually advance RIP prior to
+ * injecting the soft exception/interrupt. That advancement needs to
+ * be unwound if vectoring didn't complete. Note, the new event may
+ * not be the injected event, e.g. if KVM injected an INTn, the INTn
+ * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will
+ * be the reported vectored event, but RIP still needs to be unwound.
+ */
+ else if (!nrips && (is_soft || is_exception) &&
+ kvm_is_linear_rip(vcpu, svm->soft_int_next_rip + svm->soft_int_csbase))
+ kvm_rip_write(vcpu, svm->soft_int_old_rip);
+}
+
+static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u8 vector;
+ int type;
+ u32 exitintinfo = svm->vmcb->control.exit_int_info;
+ bool nmi_l1_to_l2 = svm->nmi_l1_to_l2;
+ bool soft_int_injected = svm->soft_int_injected;
+
+ svm->nmi_l1_to_l2 = false;
+ svm->soft_int_injected = false;
+
+ /*
+ * If we've made progress since setting awaiting_iret_completion, we've
+ * executed an IRET and can allow NMI injection.
+ */
+ if (svm->awaiting_iret_completion &&
+ kvm_rip_read(vcpu) != svm->nmi_iret_rip) {
+ svm->awaiting_iret_completion = false;
+ svm->nmi_masked = false;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ }
+
+ vcpu->arch.nmi_injected = false;
+ kvm_clear_exception_queue(vcpu);
+ kvm_clear_interrupt_queue(vcpu);
+
+ if (!(exitintinfo & SVM_EXITINTINFO_VALID))
+ return;
+
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
+ type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
+
+ if (soft_int_injected)
+ svm_complete_soft_interrupt(vcpu, vector, type);
+
+ switch (type) {
+ case SVM_EXITINTINFO_TYPE_NMI:
+ vcpu->arch.nmi_injected = true;
+ svm->nmi_l1_to_l2 = nmi_l1_to_l2;
+ break;
+ case SVM_EXITINTINFO_TYPE_EXEPT: {
+ u32 error_code = 0;
+
+ /*
+ * Never re-inject a #VC exception.
+ */
+ if (vector == X86_TRAP_VC)
+ break;
+
+ if (exitintinfo & SVM_EXITINTINFO_VALID_ERR)
+ error_code = svm->vmcb->control.exit_int_info_err;
+
+ kvm_requeue_exception(vcpu, vector,
+ exitintinfo & SVM_EXITINTINFO_VALID_ERR,
+ error_code);
+ break;
+ }
+ case SVM_EXITINTINFO_TYPE_INTR:
+ kvm_queue_interrupt(vcpu, vector, false);
+ break;
+ case SVM_EXITINTINFO_TYPE_SOFT:
+ kvm_queue_interrupt(vcpu, vector, true);
+ break;
+ default:
+ break;
+ }
+
+}
+
+static void svm_cancel_injection(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+
+ control->exit_int_info = control->event_inj;
+ control->exit_int_info_err = control->event_inj_err;
+ control->event_inj = 0;
+ svm_complete_interrupts(vcpu);
+}
+
+static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
+{
+ if (to_kvm_sev_info(vcpu->kvm)->need_init)
+ return -EINVAL;
+
+ return 1;
+}
+
+static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+
+ /*
+ * Next RIP must be provided as IRQs are disabled, and accessing guest
+ * memory to decode the instruction might fault, i.e. might sleep.
+ */
+ if (!nrips || !control->next_rip)
+ return EXIT_FASTPATH_NONE;
+
+ if (is_guest_mode(vcpu))
+ return EXIT_FASTPATH_NONE;
+
+ switch (control->exit_code) {
+ case SVM_EXIT_MSR:
+ if (!control->exit_info_1)
+ break;
+ return handle_fastpath_wrmsr(vcpu);
+ case SVM_EXIT_HLT:
+ return handle_fastpath_hlt(vcpu);
+ case SVM_EXIT_INVD:
+ return handle_fastpath_invd(vcpu);
+ default:
+ break;
+ }
+
+ return EXIT_FASTPATH_NONE;
+}
+
+static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted)
+{
+ struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ guest_state_enter_irqoff();
+
+ /*
+ * Set RFLAGS.IF prior to VMRUN, as the host's RFLAGS.IF at the time of
+ * VMRUN controls whether or not physical IRQs are masked (KVM always
+ * runs with V_INTR_MASKING_MASK). Toggle RFLAGS.IF here to avoid the
+ * temptation to do STI+VMRUN+CLI, as AMD CPUs bleed the STI shadow
+ * into guest state if delivery of an event during VMRUN triggers a
+ * #VMEXIT, and the guest_state transitions already tell lockdep that
+ * IRQs are being enabled/disabled. Note! GIF=0 for the entirety of
+ * this path, so IRQs aren't actually unmasked while running host code.
+ */
+ raw_local_irq_enable();
+
+ amd_clear_divider();
+
+ if (sev_es_guest(vcpu->kvm))
+ __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted,
+ sev_es_host_save_area(sd));
+ else
+ __svm_vcpu_run(svm, spec_ctrl_intercepted);
+
+ raw_local_irq_disable();
+
+ guest_state_exit_irqoff();
+}
+
+static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
+{
+ bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);
+
+ trace_kvm_entry(vcpu, force_immediate_exit);
+
+ svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
+ svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+ svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
+
+ /*
+ * Disable singlestep if we're injecting an interrupt/exception.
+ * We don't want our modified rflags to be pushed on the stack where
+ * we might not be able to easily reset them if we disabled NMI
+ * singlestep later.
+ */
+ if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
+ /*
+ * Event injection happens before external interrupts cause a
+ * vmexit and interrupts are disabled here, so smp_send_reschedule
+ * is enough to force an immediate vmexit.
+ */
+ disable_nmi_singlestep(svm);
+ force_immediate_exit = true;
+ }
+
+ if (force_immediate_exit)
+ smp_send_reschedule(vcpu->cpu);
+
+ if (pre_svm_run(vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+ vcpu->run->fail_entry.hardware_entry_failure_reason = SVM_EXIT_ERR;
+ vcpu->run->fail_entry.cpu = vcpu->cpu;
+ return EXIT_FASTPATH_EXIT_USERSPACE;
+ }
+
+ sync_lapic_to_cr8(vcpu);
+
+ if (unlikely(svm->asid != svm->vmcb->control.asid)) {
+ svm->vmcb->control.asid = svm->asid;
+ vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
+ }
+ svm->vmcb->save.cr2 = vcpu->arch.cr2;
+
+ svm_hv_update_vp_id(svm->vmcb, vcpu);
+
+ /*
+ * Run with all-zero DR6 unless the guest can write DR6 freely, so that
+ * KVM can get the exact cause of a #DB. Note, loading guest DR6 from
+ * KVM's snapshot is only necessary when DR accesses won't exit.
+ */
+ if (unlikely(run_flags & KVM_RUN_LOAD_GUEST_DR6))
+ svm_set_dr6(vcpu, vcpu->arch.dr6);
+ else if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)))
+ svm_set_dr6(vcpu, DR6_ACTIVE_LOW);
+
+ clgi();
+
+ /*
+ * Hardware only context switches DEBUGCTL if LBR virtualization is
+ * enabled. Manually load DEBUGCTL if necessary (and restore it after
+ * VM-Exit), as running with the host's DEBUGCTL can negatively affect
+ * guest state and can even be fatal, e.g. due to Bus Lock Detect.
+ */
+ if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) &&
+ vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl)
+ update_debugctlmsr(svm->vmcb->save.dbgctl);
+
+ kvm_wait_lapic_expire(vcpu);
+
+ /*
+ * If this vCPU has touched SPEC_CTRL, restore the guest's value if
+ * it's non-zero. Since vmentry is serialising on affected CPUs, there
+ * is no need to worry about the conditional branch over the wrmsr
+ * being speculatively taken.
+ */
+ if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+ x86_spec_ctrl_set_guest(svm->virt_spec_ctrl);
+
+ svm_vcpu_enter_exit(vcpu, spec_ctrl_intercepted);
+
+ if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+ x86_spec_ctrl_restore_host(svm->virt_spec_ctrl);
+
+ if (!sev_es_guest(vcpu->kvm)) {
+ vcpu->arch.cr2 = svm->vmcb->save.cr2;
+ vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
+ vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
+ vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
+ }
+ vcpu->arch.regs_dirty = 0;
+
+ if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
+ kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
+
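+ /* Restore the host's DEBUGCTL if it was manually loaded before VMRUN. */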
+ if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) &&
+ vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl)
+ update_debugctlmsr(vcpu->arch.host_debugctl);
+
+ stgi();
+
+ /* Any pending NMI will happen here */
+
+ if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
+ kvm_after_interrupt(vcpu);
+
+ sync_cr8_to_lapic(vcpu);
+
+ svm->next_rip = 0;
+ if (is_guest_mode(vcpu)) {
+ nested_sync_control_from_vmcb02(svm);
+
+ /* Track VMRUNs that have made it past consistency checking */
+ if (svm->nested.nested_run_pending &&
+ svm->vmcb->control.exit_code != SVM_EXIT_ERR)
+ ++vcpu->stat.nested_run;
+
+ svm->nested.nested_run_pending = 0;
+ }
+
+ svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
+ vmcb_mark_all_clean(svm->vmcb);
+
+ /* if exit due to PF check for async PF */
+ if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
+ vcpu->arch.apf.host_apf_flags =
+ kvm_read_and_reset_apf_flags();
+
+ vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET;
+
+ trace_kvm_exit(vcpu, KVM_ISA_SVM);
+
+ svm_complete_interrupts(vcpu);
+
+ return svm_exit_handlers_fastpath(vcpu);
+}
+
+static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
+ int root_level)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long cr3;
+
+ if (npt_enabled) {
+ svm->vmcb->control.nested_cr3 = __sme_set(root_hpa);
+ vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
+
+ hv_track_root_tdp(vcpu, root_hpa);
+
+ cr3 = vcpu->arch.cr3;
+ } else if (root_level >= PT64_ROOT_4LEVEL) {
+ cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu);
+ } else {
+ /* PCID in the guest should be impossible with a 32-bit MMU. */
+ WARN_ON_ONCE(kvm_get_active_pcid(vcpu));
+ cr3 = root_hpa;
+ }
+
+ svm->vmcb->save.cr3 = cr3;
+ vmcb_mark_dirty(svm->vmcb, VMCB_CR);
+}
+
+static void
+svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
+{
+ /*
+ * Patch in the VMMCALL instruction:
+ */
+ hypercall[0] = 0x0f;
+ hypercall[1] = 0x01;
+ hypercall[2] = 0xd9;
+}
+
+/*
+ * The kvm parameter can be NULL (module initialization, or invocation before
+ * VM creation). Be sure to check the kvm parameter before using it.
+ */
+static bool svm_has_emulated_msr(struct kvm *kvm, u32 index)
+{
+ switch (index) {
+ case MSR_IA32_MCG_EXT_CTL:
+ case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
+ return false;
+ case MSR_IA32_SMBASE:
+ if (!IS_ENABLED(CONFIG_KVM_SMM))
+ return false;
+ /* SEV-ES guests do not support SMM, so report false */
+ if (kvm && sev_es_guest(kvm))
+ return false;
+ break;
+ default:
+ break;
+ }
+
+ return true;
+}
+
+static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * SVM doesn't provide a way to disable just XSAVES in the guest, KVM
+ * can only disable all variants of XSAVE by disallowing CR4.OSXSAVE from
+ * being set. As a result, if the host has XSAVE and XSAVES, and the
+ * guest has XSAVE enabled, the guest can execute XSAVES without
+ * faulting. Treat XSAVES as enabled in this case regardless of
+ * whether it's advertised to the guest so that KVM context switches
+ * XSS on VM-Enter/VM-Exit. Failure to do so would effectively give
+ * the guest read/write access to the host's XSS.
+ */
+ guest_cpu_cap_change(vcpu, X86_FEATURE_XSAVES,
+ boot_cpu_has(X86_FEATURE_XSAVES) &&
+ guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE));
+
+ /*
+ * Intercept VMLOAD if the vCPU model is Intel in order to emulate that
+ * VMLOAD drops bits 63:32 of SYSENTER (ignoring the fact that exposing
+ * SVM on Intel is bonkers and extremely unlikely to work).
+ */
+ if (guest_cpuid_is_intel_compatible(vcpu))
+ guest_cpu_cap_clear(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
+
+ if (sev_guest(vcpu->kvm))
+ sev_vcpu_after_set_cpuid(svm);
+}
+
+static bool svm_has_wbinvd_exit(void)
+{
+ return true;
+}
+
+#define PRE_EX(exit) { .exit_code = (exit), \
+ .stage = X86_ICPT_PRE_EXCEPT, }
+#define POST_EX(exit) { .exit_code = (exit), \
+ .stage = X86_ICPT_POST_EXCEPT, }
+#define POST_MEM(exit) { .exit_code = (exit), \
+ .stage = X86_ICPT_POST_MEMACCESS, }
+
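+/*
+ * Map emulator intercept IDs to the SVM exit code generated by the
+ * corresponding instruction, along with the emulation stage at which the
+ * intercept check is performed.
+ */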
+static const struct __x86_intercept {
+ u32 exit_code;
+ enum x86_intercept_stage stage;
+} x86_intercept_map[] = {
+ [x86_intercept_cr_read] = POST_EX(SVM_EXIT_READ_CR0),
+ [x86_intercept_cr_write] = POST_EX(SVM_EXIT_WRITE_CR0),
+ [x86_intercept_clts] = POST_EX(SVM_EXIT_WRITE_CR0),
+ [x86_intercept_lmsw] = POST_EX(SVM_EXIT_WRITE_CR0),
+ [x86_intercept_smsw] = POST_EX(SVM_EXIT_READ_CR0),
+ [x86_intercept_dr_read] = POST_EX(SVM_EXIT_READ_DR0),
+ [x86_intercept_dr_write] = POST_EX(SVM_EXIT_WRITE_DR0),
+ [x86_intercept_sldt] = POST_EX(SVM_EXIT_LDTR_READ),
+ [x86_intercept_str] = POST_EX(SVM_EXIT_TR_READ),
+ [x86_intercept_lldt] = POST_EX(SVM_EXIT_LDTR_WRITE),
+ [x86_intercept_ltr] = POST_EX(SVM_EXIT_TR_WRITE),
+ [x86_intercept_sgdt] = POST_EX(SVM_EXIT_GDTR_READ),
+ [x86_intercept_sidt] = POST_EX(SVM_EXIT_IDTR_READ),
+ [x86_intercept_lgdt] = POST_EX(SVM_EXIT_GDTR_WRITE),
+ [x86_intercept_lidt] = POST_EX(SVM_EXIT_IDTR_WRITE),
+ [x86_intercept_vmrun] = POST_EX(SVM_EXIT_VMRUN),
+ [x86_intercept_vmmcall] = POST_EX(SVM_EXIT_VMMCALL),
+ [x86_intercept_vmload] = POST_EX(SVM_EXIT_VMLOAD),
+ [x86_intercept_vmsave] = POST_EX(SVM_EXIT_VMSAVE),
+ [x86_intercept_stgi] = POST_EX(SVM_EXIT_STGI),
+ [x86_intercept_clgi] = POST_EX(SVM_EXIT_CLGI),
+ [x86_intercept_skinit] = POST_EX(SVM_EXIT_SKINIT),
+ [x86_intercept_invlpga] = POST_EX(SVM_EXIT_INVLPGA),
+ [x86_intercept_rdtscp] = POST_EX(SVM_EXIT_RDTSCP),
+ [x86_intercept_monitor] = POST_MEM(SVM_EXIT_MONITOR),
+ [x86_intercept_mwait] = POST_EX(SVM_EXIT_MWAIT),
+ [x86_intercept_invlpg] = POST_EX(SVM_EXIT_INVLPG),
+ [x86_intercept_invd] = POST_EX(SVM_EXIT_INVD),
+ [x86_intercept_wbinvd] = POST_EX(SVM_EXIT_WBINVD),
+ [x86_intercept_wrmsr] = POST_EX(SVM_EXIT_MSR),
+ [x86_intercept_rdtsc] = POST_EX(SVM_EXIT_RDTSC),
+ [x86_intercept_rdmsr] = POST_EX(SVM_EXIT_MSR),
+ [x86_intercept_rdpmc] = POST_EX(SVM_EXIT_RDPMC),
+ [x86_intercept_cpuid] = PRE_EX(SVM_EXIT_CPUID),
+ [x86_intercept_rsm] = PRE_EX(SVM_EXIT_RSM),
+ [x86_intercept_pause] = PRE_EX(SVM_EXIT_PAUSE),
+ [x86_intercept_pushf] = PRE_EX(SVM_EXIT_PUSHF),
+ [x86_intercept_popf] = PRE_EX(SVM_EXIT_POPF),
+ [x86_intercept_intn] = PRE_EX(SVM_EXIT_SWINT),
+ [x86_intercept_iret] = PRE_EX(SVM_EXIT_IRET),
+ [x86_intercept_icebp] = PRE_EX(SVM_EXIT_ICEBP),
+ [x86_intercept_hlt] = POST_EX(SVM_EXIT_HLT),
+ [x86_intercept_in] = POST_EX(SVM_EXIT_IOIO),
+ [x86_intercept_ins] = POST_EX(SVM_EXIT_IOIO),
+ [x86_intercept_out] = POST_EX(SVM_EXIT_IOIO),
+ [x86_intercept_outs] = POST_EX(SVM_EXIT_IOIO),
+ [x86_intercept_xsetbv] = PRE_EX(SVM_EXIT_XSETBV),
+};
+
+#undef PRE_EX
+#undef POST_EX
+#undef POST_MEM
+
+static int svm_check_intercept(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage,
+ struct x86_exception *exception)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int vmexit, ret = X86EMUL_CONTINUE;
+ struct __x86_intercept icpt_info;
+ struct vmcb *vmcb = svm->vmcb;
+
+ if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
+ goto out;
+
+ icpt_info = x86_intercept_map[info->intercept];
+
+ if (stage != icpt_info.stage)
+ goto out;
+
+ switch (icpt_info.exit_code) {
+ case SVM_EXIT_READ_CR0:
+ if (info->intercept == x86_intercept_cr_read)
+ icpt_info.exit_code += info->modrm_reg;
+ break;
+ case SVM_EXIT_WRITE_CR0: {
+ unsigned long cr0, val;
+
+ /*
+ * Adjust the exit code accordingly if a CR other than CR0 is
+ * being written, and skip straight to the common handling as
+ * only CR0 has an additional selective intercept.
+ */
+ if (info->intercept == x86_intercept_cr_write && info->modrm_reg) {
+ icpt_info.exit_code += info->modrm_reg;
+ break;
+ }
+
+ /*
+ * Convert the exit_code to SVM_EXIT_CR0_SEL_WRITE if a
+ * selective CR0 intercept is triggered (the common logic will
+ * treat the selective intercept as being enabled). Note, the
+ * unconditional intercept has higher priority, i.e. this is
+ * only relevant if *only* the selective intercept is enabled.
+ */
+ if (vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_CR0_WRITE) ||
+ !(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0)))
+ break;
+
+ /* CLTS never triggers INTERCEPT_SELECTIVE_CR0 */
+ if (info->intercept == x86_intercept_clts)
+ break;
+
+ /* LMSW always triggers INTERCEPT_SELECTIVE_CR0 */
+ if (info->intercept == x86_intercept_lmsw) {
+ icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
+ break;
+ }
+
+ /*
+ * MOV-to-CR0 only triggers INTERCEPT_SELECTIVE_CR0 if any bit
+ * other than SVM_CR0_SELECTIVE_MASK is changed.
+ */
+ cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
+ val = info->src_val & ~SVM_CR0_SELECTIVE_MASK;
+ if (cr0 ^ val)
+ icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
+ break;
+ }
+ case SVM_EXIT_READ_DR0:
+ case SVM_EXIT_WRITE_DR0:
+ icpt_info.exit_code += info->modrm_reg;
+ break;
+ case SVM_EXIT_MSR:
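+ /* For MSR exits, exit_info_1 is 1 for WRMSR and 0 for RDMSR. */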
+ if (info->intercept == x86_intercept_wrmsr)
+ vmcb->control.exit_info_1 = 1;
+ else
+ vmcb->control.exit_info_1 = 0;
+ break;
+ case SVM_EXIT_PAUSE:
+ /*
+ * The emulator reports this intercept for the NOP opcode, but PAUSE
+ * is REP NOP, so only treat it as PAUSE if the REP prefix is present.
+ */
+ if (info->rep_prefix != REPE_PREFIX)
+ goto out;
+ break;
+ case SVM_EXIT_IOIO: {
+ u64 exit_info;
+ u32 bytes;
+
+ if (info->intercept == x86_intercept_in ||
+ info->intercept == x86_intercept_ins) {
+ exit_info = ((info->src_val & 0xffff) << 16) |
+ SVM_IOIO_TYPE_MASK;
+ bytes = info->dst_bytes;
+ } else {
+ exit_info = (info->dst_val & 0xffff) << 16;
+ bytes = info->src_bytes;
+ }
+
+ if (info->intercept == x86_intercept_outs ||
+ info->intercept == x86_intercept_ins)
+ exit_info |= SVM_IOIO_STR_MASK;
+
+ if (info->rep_prefix)
+ exit_info |= SVM_IOIO_REP_MASK;
+
+ bytes = min(bytes, 4u);
+
+ exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;
+
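+ /*
+ * ad_bytes is 2, 4, or 8; shifting it left by (ASIZE_SHIFT - 1) sets
+ * the corresponding 16/32/64-bit address-size flag in the exit info.
+ */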
+ exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
+
+ vmcb->control.exit_info_1 = exit_info;
+ vmcb->control.exit_info_2 = info->next_rip;
+
+ break;
+ }
+ default:
+ break;
+ }
+
+ /* TODO: Advertise NRIPS to guest hypervisor unconditionally */
+ if (static_cpu_has(X86_FEATURE_NRIPS))
+ vmcb->control.next_rip = info->next_rip;
+ vmcb->control.exit_code = icpt_info.exit_code;
+ vmexit = nested_svm_exit_handled(svm);
+
+ ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
+ : X86EMUL_CONTINUE;
+
+out:
+ return ret;
+}
+
+static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
+{
+ switch (to_svm(vcpu)->vmcb->control.exit_code) {
+ case SVM_EXIT_EXCP_BASE + MC_VECTOR:
+ svm_handle_mce(vcpu);
+ break;
+ case SVM_EXIT_INTR:
+ vcpu->arch.at_instruction_boundary = true;
+ break;
+ default:
+ break;
+ }
+}
+
+static void svm_setup_mce(struct kvm_vcpu *vcpu)
+{
+ /* [63:9] are reserved. */
+ vcpu->arch.mcg_cap &= 0x1ff;
+}
+
+#ifdef CONFIG_KVM_SMM
+bool svm_smi_blocked(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /* Per APM Vol.2 15.22.2 "Response to SMI" */
+ if (!gif_set(svm))
+ return true;
+
+ return is_smm(vcpu);
+}
+
+static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ if (svm->nested.nested_run_pending)
+ return -EBUSY;
+
+ if (svm_smi_blocked(vcpu))
+ return 0;
+
+ /* An SMI must not be injected into L2 if it's supposed to VM-Exit. */
+ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm))
+ return -EBUSY;
+
+ return 1;
+}
+
+static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_host_map map_save;
+ int ret;
+
+ if (!is_guest_mode(vcpu))
+ return 0;
+
+ /*
+ * 32-bit SMRAM format doesn't preserve EFER and SVM state. Userspace is
+ * responsible for ensuring nested SVM and SMIs are mutually exclusive.
+ */
+
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
+ return 1;
+
+ smram->smram64.svm_guest_flag = 1;
+ smram->smram64.svm_guest_vmcb_gpa = svm->nested.vmcb12_gpa;
+
+ svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
+ svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+ svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
+
+ ret = nested_svm_simple_vmexit(svm, SVM_EXIT_SW);
+ if (ret)
+ return ret;
+
+ /*
+ * KVM uses VMCB01 to store L1 host state while L2 runs but
+ * VMCB01 is going to be used during SMM and thus the state will
+ * be lost. Temporarily save non-VMLOAD/VMSAVE state to the host save
+ * area pointed to by MSR_VM_HSAVE_PA. APM guarantees that the
+ * format of the area is identical to the guest save area, offset
+ * by 0x400 (matches the offset of 'struct vmcb_save_area'
+ * within 'struct vmcb'). Note: HSAVE area may also be used by
+ * L1 hypervisor to save additional host context (e.g. KVM does
+ * that, see svm_prepare_switch_to_guest()) which must be
+ * preserved.
+ */
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save))
+ return 1;
+
+ BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);
+
+ svm_copy_vmrun_state(map_save.hva + 0x400,
+ &svm->vmcb01.ptr->save);
+
+ kvm_vcpu_unmap(vcpu, &map_save);
+ return 0;
+}
+
+static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_host_map map, map_save;
+ struct vmcb *vmcb12;
+ int ret;
+
+ const struct kvm_smram_state_64 *smram64 = &smram->smram64;
+
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
+ return 0;
+
+ /* Non-zero if SMI arrived while vCPU was in guest mode. */
+ if (!smram64->svm_guest_flag)
+ return 0;
+
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SVM))
+ return 1;
+
+ if (!(smram64->efer & EFER_SVME))
+ return 1;
+
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(smram64->svm_guest_vmcb_gpa), &map))
+ return 1;
+
+ ret = 1;
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save))
+ goto unmap_map;
+
+ if (svm_allocate_nested(svm))
+ goto unmap_save;
+
+ /*
+ * Restore L1 host state from L1 HSAVE area as VMCB01 was
+ * used during SMM (see svm_enter_smm())
+ */
+
+ svm_copy_vmrun_state(&svm->vmcb01.ptr->save, map_save.hva + 0x400);
+
+ /*
+ * Enter the nested guest now
+ */
+
+ vmcb_mark_all_dirty(svm->vmcb01.ptr);
+
+ vmcb12 = map.hva;
+ nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
+ nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
+ ret = enter_svm_guest_mode(vcpu, smram64->svm_guest_vmcb_gpa, vmcb12, false);
+
+ if (ret)
+ goto unmap_save;
+
+ svm->nested.nested_run_pending = 1;
+
+unmap_save:
+ kvm_vcpu_unmap(vcpu, &map_save);
+unmap_map:
+ kvm_vcpu_unmap(vcpu, &map);
+ return ret;
+}
+
+static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!gif_set(svm)) {
+ if (vgif)
+ svm_set_intercept(svm, INTERCEPT_STGI);
+ /* STGI will cause a vm exit */
+ } else {
+ /* We must be in SMM; RSM will cause a vmexit anyway. */
+ }
+}
+#endif
+
+static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ bool smep, smap, is_user;
+ u64 error_code;
+
+ /* Check that emulation is possible during event vectoring */
+ if ((svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK) &&
+ !kvm_can_emulate_event_vectoring(emul_type))
+ return X86EMUL_UNHANDLEABLE_VECTORING;
+
+ /* Emulation is always possible when KVM has access to all guest state. */
+ if (!sev_guest(vcpu->kvm))
+ return X86EMUL_CONTINUE;
+
+ /* #UD and #GP should never be intercepted for SEV guests. */
+ WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD |
+ EMULTYPE_TRAP_UD_FORCED |
+ EMULTYPE_VMWARE_GP));
+
+ /*
+ * Emulation is impossible for SEV-ES guests as KVM doesn't have access
+ * to guest register state.
+ */
+ if (sev_es_guest(vcpu->kvm))
+ return X86EMUL_RETRY_INSTR;
+
+ /*
+ * Emulation is possible if the instruction is already decoded, e.g.
+ * when completing I/O after returning from userspace.
+ */
+ if (emul_type & EMULTYPE_NO_DECODE)
+ return X86EMUL_CONTINUE;
+
+ /*
+ * Emulation is possible for SEV guests if and only if a prefilled
+ * buffer containing the bytes of the intercepted instruction is
+ * available. SEV guest memory is encrypted with a guest specific key
+ * and cannot be decrypted by KVM, i.e. KVM would read ciphertext and
+ * decode garbage.
+ *
+ * If KVM is NOT trying to simply skip an instruction, inject #UD if
+ * KVM reached this point without an instruction buffer. In practice,
+ * this path should never be hit by a well-behaved guest, e.g. KVM
+ * doesn't intercept #UD or #GP for SEV guests, but this path is still
+ * theoretically reachable, e.g. via unaccelerated fault-like AVIC
+ * access, and needs to be handled by KVM to avoid putting the guest
+ * into an infinite loop. Injecting #UD is somewhat arbitrary, but
+ * it's the least awful option given the lack of insight into the guest.
+ *
+ * If KVM is trying to skip an instruction, simply resume the guest.
+ * If a #NPF occurs while the guest is vectoring an INT3/INTO, then KVM
+ * will attempt to re-inject the INT3/INTO and skip the instruction.
+ * In that scenario, retrying the INT3/INTO and hoping the guest will
+ * make forward progress is the only option that has a chance of
+ * success (and in practice it will work the vast majority of the time).
+ */
+ if (unlikely(!insn)) {
+ if (emul_type & EMULTYPE_SKIP)
+ return X86EMUL_UNHANDLEABLE;
+
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+
+ /*
+ * Emulate for SEV guests if the insn buffer is not empty. The buffer
+ * will be empty if the DecodeAssist microcode cannot fetch bytes for
+ * the faulting instruction because the code fetch itself faulted, e.g.
+ * the guest attempted to fetch from emulated MMIO or a guest page
+ * table used to translate CS:RIP resides in emulated MMIO.
+ */
+ if (likely(insn_len))
+ return X86EMUL_CONTINUE;
+
+ /*
+ * Detect and workaround Errata 1096 Fam_17h_00_0Fh.
+ *
+ * Errata:
+ * When CPU raises #NPF on guest data access and vCPU CR4.SMAP=1, it is
+ * possible that CPU microcode implementing DecodeAssist will fail to
+ * read guest memory at CS:RIP and vmcb.GuestIntrBytes will incorrectly
+ * be '0'. This happens because microcode reads CS:RIP using a _data_
+ * load uop with CPL=0 privileges. If the load hits an SMAP #PF, ucode
+ * gives up and does not fill the instruction bytes buffer.
+ *
+ * As above, KVM reaches this point iff the VM is an SEV guest, the CPU
+ * supports DecodeAssist, a #NPF was raised, KVM's page fault handler
+ * triggered emulation (e.g. for MMIO), and the CPU returned 0 in the
+ * GuestIntrBytes field of the VMCB.
+ *
+ * This does _not_ mean that the erratum has been encountered, as the
+ * DecodeAssist will also fail if the load for CS:RIP hits a legitimate
+ * #PF, e.g. if the guest attempts to execute from emulated MMIO and
+ * encountered a reserved/not-present #PF.
+ *
+ * To hit the erratum, the following conditions must be true:
+ * 1. CR4.SMAP=1 (obviously).
+ * 2. CR4.SMEP=0 || CPL=3. If SMEP=1 and CPL<3, the erratum cannot
+ * have been hit as the guest would have encountered an SMEP
+ * violation #PF, not a #NPF.
+ * 3. The #NPF is not due to a code fetch, in which case failure to
+ * retrieve the instruction bytes is legitimate (see above).
+ *
+ * In addition, don't apply the erratum workaround if the #NPF occurred
+ * while translating guest page tables (see below).
+ */
+ error_code = svm->vmcb->control.exit_info_1;
+ if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK))
+ goto resume_guest;
+
+ smep = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMEP);
+ smap = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMAP);
+ is_user = svm_get_cpl(vcpu) == 3;
+ if (smap && (!smep || is_user)) {
+ pr_err_ratelimited("SEV Guest triggered AMD Erratum 1096\n");
+
+ /*
+ * If the fault occurred in userspace, arbitrarily inject #GP
+ * to avoid killing the guest and to hopefully avoid confusing
+ * the guest kernel too much, e.g. injecting #PF would not be
+ * coherent with respect to the guest's page tables. Request
+ * triple fault if the fault occurred in the kernel as there's
+ * no fault that KVM can inject without confusing the guest.
+ * In practice, the triple fault is moot as no sane SEV kernel
+ * will execute from user memory while also running with SMAP=1.
+ */
+ if (is_user)
+ kvm_inject_gp(vcpu, 0);
+ else
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+
+resume_guest:
+ /*
+ * If the erratum was not hit, simply resume the guest and let it fault
+ * again. While awful, e.g. the vCPU may get stuck in an infinite loop
+ * if the fault is at CPL=0, it's the lesser of all evils. Exiting to
+ * userspace will kill the guest, and letting the emulator read garbage
+ * will yield random behavior and potentially corrupt the guest.
+ *
+ * Simply resuming the guest is technically not a violation of the SEV
+ * architecture. AMD's APM states that all code fetches and page table
+ * accesses for SEV guests are encrypted, regardless of the C-Bit. The
+ * APM also states that encrypted accesses to MMIO are "ignored", but
+ * doesn't explicitly define "ignored", i.e. doing nothing and letting
+ * the guest spin is technically "ignoring" the access.
+ */
+ return X86EMUL_RETRY_INSTR;
+}
+
+static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ return !gif_set(svm);
+}
+
+static void svm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
+{
+ if (!sev_es_guest(vcpu->kvm))
+ return kvm_vcpu_deliver_sipi_vector(vcpu, vector);
+
+ sev_vcpu_deliver_sipi_vector(vcpu, vector);
+}
+
+static void svm_vm_destroy(struct kvm *kvm)
+{
+ avic_vm_destroy(kvm);
+ sev_vm_destroy(kvm);
+
+ svm_srso_vm_destroy();
+}
+
+static int svm_vm_init(struct kvm *kvm)
+{
+ int type = kvm->arch.vm_type;
+
+ if (type != KVM_X86_DEFAULT_VM &&
+ type != KVM_X86_SW_PROTECTED_VM) {
+ kvm->arch.has_protected_state =
+ (type == KVM_X86_SEV_ES_VM || type == KVM_X86_SNP_VM);
+ to_kvm_sev_info(kvm)->need_init = true;
+
+ kvm->arch.has_private_mem = (type == KVM_X86_SNP_VM);
+ kvm->arch.pre_fault_allowed = !kvm->arch.has_private_mem;
+ }
+
+ if (!pause_filter_count || !pause_filter_thresh)
+ kvm_disable_exits(kvm, KVM_X86_DISABLE_EXITS_PAUSE);
+
+ if (enable_apicv) {
+ int ret = avic_vm_init(kvm);
+ if (ret)
+ return ret;
+ }
+
+ svm_srso_vm_init();
+ return 0;
+}
+
+static void *svm_alloc_apic_backing_page(struct kvm_vcpu *vcpu)
+{
+ struct page *page = snp_safe_alloc_page();
+
+ if (!page)
+ return NULL;
+
+ return page_address(page);
+}
+
+struct kvm_x86_ops svm_x86_ops __initdata = {
+ .name = KBUILD_MODNAME,
+
+ .check_processor_compatibility = svm_check_processor_compat,
+
+ .hardware_unsetup = svm_hardware_unsetup,
+ .enable_virtualization_cpu = svm_enable_virtualization_cpu,
+ .disable_virtualization_cpu = svm_disable_virtualization_cpu,
+ .emergency_disable_virtualization_cpu = svm_emergency_disable_virtualization_cpu,
+ .has_emulated_msr = svm_has_emulated_msr,
+
+ .vcpu_precreate = svm_vcpu_precreate,
+ .vcpu_create = svm_vcpu_create,
+ .vcpu_free = svm_vcpu_free,
+ .vcpu_reset = svm_vcpu_reset,
+
+ .vm_size = sizeof(struct kvm_svm),
+ .vm_init = svm_vm_init,
+ .vm_destroy = svm_vm_destroy,
+
+ .prepare_switch_to_guest = svm_prepare_switch_to_guest,
+ .vcpu_load = svm_vcpu_load,
+ .vcpu_put = svm_vcpu_put,
+ .vcpu_blocking = avic_vcpu_blocking,
+ .vcpu_unblocking = avic_vcpu_unblocking,
+
+ .update_exception_bitmap = svm_update_exception_bitmap,
+ .get_feature_msr = svm_get_feature_msr,
+ .get_msr = svm_get_msr,
+ .set_msr = svm_set_msr,
+ .get_segment_base = svm_get_segment_base,
+ .get_segment = svm_get_segment,
+ .set_segment = svm_set_segment,
+ .get_cpl = svm_get_cpl,
+ .get_cpl_no_cache = svm_get_cpl,
+ .get_cs_db_l_bits = svm_get_cs_db_l_bits,
+ .is_valid_cr0 = svm_is_valid_cr0,
+ .set_cr0 = svm_set_cr0,
+ .post_set_cr3 = sev_post_set_cr3,
+ .is_valid_cr4 = svm_is_valid_cr4,
+ .set_cr4 = svm_set_cr4,
+ .set_efer = svm_set_efer,
+ .get_idt = svm_get_idt,
+ .set_idt = svm_set_idt,
+ .get_gdt = svm_get_gdt,
+ .set_gdt = svm_set_gdt,
+ .set_dr7 = svm_set_dr7,
+ .sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
+ .cache_reg = svm_cache_reg,
+ .get_rflags = svm_get_rflags,
+ .set_rflags = svm_set_rflags,
+ .get_if_flag = svm_get_if_flag,
+
+ .flush_tlb_all = svm_flush_tlb_all,
+ .flush_tlb_current = svm_flush_tlb_current,
+ .flush_tlb_gva = svm_flush_tlb_gva,
+ .flush_tlb_guest = svm_flush_tlb_asid,
+
+ .vcpu_pre_run = svm_vcpu_pre_run,
+ .vcpu_run = svm_vcpu_run,
+ .handle_exit = svm_handle_exit,
+ .skip_emulated_instruction = svm_skip_emulated_instruction,
+ .update_emulated_instruction = NULL,
+ .set_interrupt_shadow = svm_set_interrupt_shadow,
+ .get_interrupt_shadow = svm_get_interrupt_shadow,
+ .patch_hypercall = svm_patch_hypercall,
+ .inject_irq = svm_inject_irq,
+ .inject_nmi = svm_inject_nmi,
+ .is_vnmi_pending = svm_is_vnmi_pending,
+ .set_vnmi_pending = svm_set_vnmi_pending,
+ .inject_exception = svm_inject_exception,
+ .cancel_injection = svm_cancel_injection,
+ .interrupt_allowed = svm_interrupt_allowed,
+ .nmi_allowed = svm_nmi_allowed,
+ .get_nmi_mask = svm_get_nmi_mask,
+ .set_nmi_mask = svm_set_nmi_mask,
+ .enable_nmi_window = svm_enable_nmi_window,
+ .enable_irq_window = svm_enable_irq_window,
+ .update_cr8_intercept = svm_update_cr8_intercept,
+
+ .x2apic_icr_is_split = true,
+ .set_virtual_apic_mode = avic_refresh_virtual_apic_mode,
+ .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
+ .apicv_post_state_restore = avic_apicv_post_state_restore,
+ .required_apicv_inhibits = AVIC_REQUIRED_APICV_INHIBITS,
+
+ .get_exit_info = svm_get_exit_info,
+ .get_entry_info = svm_get_entry_info,
+
+ .vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid,
+
+ .has_wbinvd_exit = svm_has_wbinvd_exit,
+
+ .get_l2_tsc_offset = svm_get_l2_tsc_offset,
+ .get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier,
+ .write_tsc_offset = svm_write_tsc_offset,
+ .write_tsc_multiplier = svm_write_tsc_multiplier,
+
+ .load_mmu_pgd = svm_load_mmu_pgd,
+
+ .check_intercept = svm_check_intercept,
+ .handle_exit_irqoff = svm_handle_exit_irqoff,
+
+ .nested_ops = &svm_nested_ops,
+
+ .deliver_interrupt = svm_deliver_interrupt,
+ .pi_update_irte = avic_pi_update_irte,
+ .setup_mce = svm_setup_mce,
+
+#ifdef CONFIG_KVM_SMM
+ .smi_allowed = svm_smi_allowed,
+ .enter_smm = svm_enter_smm,
+ .leave_smm = svm_leave_smm,
+ .enable_smi_window = svm_enable_smi_window,
+#endif
+
+#ifdef CONFIG_KVM_AMD_SEV
+ .dev_get_attr = sev_dev_get_attr,
+ .mem_enc_ioctl = sev_mem_enc_ioctl,
+ .mem_enc_register_region = sev_mem_enc_register_region,
+ .mem_enc_unregister_region = sev_mem_enc_unregister_region,
+ .guest_memory_reclaimed = sev_guest_memory_reclaimed,
+
+ .vm_copy_enc_context_from = sev_vm_copy_enc_context_from,
+ .vm_move_enc_context_from = sev_vm_move_enc_context_from,
+#endif
+ .check_emulate_instruction = svm_check_emulate_instruction,
+
+ .apic_init_signal_blocked = svm_apic_init_signal_blocked,
+
+ .recalc_intercepts = svm_recalc_intercepts,
+ .complete_emulated_msr = svm_complete_emulated_msr,
+
+ .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
+ .vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons,
+ .alloc_apic_backing_page = svm_alloc_apic_backing_page,
+
+ .gmem_prepare = sev_gmem_prepare,
+ .gmem_invalidate = sev_gmem_invalidate,
+ .gmem_max_mapping_level = sev_gmem_max_mapping_level,
+};
+
+/*
+ * The default MMIO mask is a single bit (excluding the present bit),
+ * which could conflict with the memory encryption bit. Check for
+ * memory encryption support and override the default MMIO mask if
+ * memory encryption is enabled.
+ */
+static __init void svm_adjust_mmio_mask(void)
+{
+ unsigned int enc_bit, mask_bit;
+ u64 msr, mask;
+
+ /* If there is no memory encryption support, use existing mask */
+ if (cpuid_eax(0x80000000) < 0x8000001f)
+ return;
+
+ /* If memory encryption is not enabled, use existing mask */
+ rdmsrq(MSR_AMD64_SYSCFG, msr);
+ if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
+ return;
+
+ enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
+ mask_bit = boot_cpu_data.x86_phys_bits;
+
+ /* Increment the mask bit if it is the same as the encryption bit */
+ if (enc_bit == mask_bit)
+ mask_bit++;
+
+ /*
+ * If the mask bit location is below 52, then some bits above the
+ * physical addressing limit will always be reserved, so use the
+ * rsvd_bits() function to generate the mask. This mask, along with
+ * the present bit, will be used to generate a page fault with
+ * PFERR.RSVD = 1.
+ *
+ * If the mask bit location is 52 (or above), then clear the mask.
+ */
+ mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
+
+ kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
+}
+
+static __init void svm_set_cpu_caps(void)
+{
+ kvm_set_cpu_caps();
+
+ kvm_caps.supported_perf_cap = 0;
+
+ kvm_cpu_cap_clear(X86_FEATURE_IBT);
+
+ /* CPUID 0x80000001 and 0x8000000A (SVM features) */
+ if (nested) {
+ kvm_cpu_cap_set(X86_FEATURE_SVM);
+ kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN);
+
+ /*
+ * KVM currently flushes TLBs on *every* nested SVM transition,
+ * and so for all intents and purposes KVM supports flushing by
+ * ASID, i.e. KVM is guaranteed to honor every L1 ASID flush.
+ */
+ kvm_cpu_cap_set(X86_FEATURE_FLUSHBYASID);
+
+ if (nrips)
+ kvm_cpu_cap_set(X86_FEATURE_NRIPS);
+
+ if (npt_enabled)
+ kvm_cpu_cap_set(X86_FEATURE_NPT);
+
+ if (tsc_scaling)
+ kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
+
+ if (vls)
+ kvm_cpu_cap_set(X86_FEATURE_V_VMSAVE_VMLOAD);
+ if (lbrv)
+ kvm_cpu_cap_set(X86_FEATURE_LBRV);
+
+ if (boot_cpu_has(X86_FEATURE_PAUSEFILTER))
+ kvm_cpu_cap_set(X86_FEATURE_PAUSEFILTER);
+
+ if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD))
+ kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD);
+
+ if (vgif)
+ kvm_cpu_cap_set(X86_FEATURE_VGIF);
+
+ if (vnmi)
+ kvm_cpu_cap_set(X86_FEATURE_VNMI);
+
+ /* Nested VM can receive #VMEXIT instead of triggering #GP */
+ kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
+ }
+
+ if (cpu_feature_enabled(X86_FEATURE_BUS_LOCK_THRESHOLD))
+ kvm_caps.has_bus_lock_exit = true;
+
+ /* CPUID 0x80000008 */
+ if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
+ boot_cpu_has(X86_FEATURE_AMD_SSBD))
+ kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
+
+ if (enable_pmu) {
+ /*
+ * Enumerate support for PERFCTR_CORE if and only if KVM has
+ * access to enough counters to virtualize "core" support,
+ * otherwise limit vPMU support to the legacy number of counters.
+ */
+ if (kvm_pmu_cap.num_counters_gp < AMD64_NUM_COUNTERS_CORE)
+ kvm_pmu_cap.num_counters_gp = min(AMD64_NUM_COUNTERS,
+ kvm_pmu_cap.num_counters_gp);
+ else
+ kvm_cpu_cap_check_and_set(X86_FEATURE_PERFCTR_CORE);
+
+ if (kvm_pmu_cap.version != 2 ||
+ !kvm_cpu_cap_has(X86_FEATURE_PERFCTR_CORE))
+ kvm_cpu_cap_clear(X86_FEATURE_PERFMON_V2);
+ }
+
+ /* CPUID 0x8000001F (SME/SEV features) */
+ sev_set_cpu_caps();
+
+ /*
+ * Clear capabilities that are automatically configured by common code,
+ * but that require explicit SVM support (that isn't yet implemented).
+ */
+ kvm_cpu_cap_clear(X86_FEATURE_BUS_LOCK_DETECT);
+ kvm_cpu_cap_clear(X86_FEATURE_MSR_IMM);
+}
+
+static __init int svm_hardware_setup(void)
+{
+ void *iopm_va;
+ int cpu, r;
+
+ /*
+ * NX is required for shadow paging and for NPT if the NX huge pages
+ * mitigation is enabled.
+ */
+ if (!boot_cpu_has(X86_FEATURE_NX)) {
+ pr_err_ratelimited("NX (Execute Disable) not supported\n");
+ return -EOPNOTSUPP;
+ }
+ kvm_enable_efer_bits(EFER_NX);
+
+ kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
+ XFEATURE_MASK_BNDCSR);
+
+ if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
+ kvm_enable_efer_bits(EFER_FFXSR);
+
+ if (tsc_scaling) {
+ if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+ tsc_scaling = false;
+ } else {
+ pr_info("TSC scaling supported\n");
+ kvm_caps.has_tsc_control = true;
+ }
+ }
+ kvm_caps.max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX;
+ kvm_caps.tsc_scaling_ratio_frac_bits = 32;
+
+ tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
+
+ if (boot_cpu_has(X86_FEATURE_AUTOIBRS))
+ kvm_enable_efer_bits(EFER_AUTOIBRS);
+
+ /* Check for pause filtering support */
+ if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
+ pause_filter_count = 0;
+ pause_filter_thresh = 0;
+ } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
+ pause_filter_thresh = 0;
+ }
+
+ if (nested) {
+ pr_info("Nested Virtualization enabled\n");
+ kvm_enable_efer_bits(EFER_SVME);
+ if (!boot_cpu_has(X86_FEATURE_EFER_LMSLE_MBZ))
+ kvm_enable_efer_bits(EFER_LMSLE);
+
+ r = nested_svm_init_msrpm_merge_offsets();
+ if (r)
+ return r;
+ }
+
+ /*
+ * KVM's MMU doesn't support using 2-level paging for itself, and thus
+ * NPT isn't supported if the host is using 2-level paging since host
+ * CR4 is unchanged on VMRUN.
+ */
+ if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
+ npt_enabled = false;
+
+ if (!boot_cpu_has(X86_FEATURE_NPT))
+ npt_enabled = false;
+
+ /* Force VM NPT level equal to the host's paging level */
+ kvm_configure_mmu(npt_enabled, get_npt_level(),
+ get_npt_level(), PG_LEVEL_1G);
+ pr_info("Nested Paging %s\n", str_enabled_disabled(npt_enabled));
+
+ /*
+ * On AMD processors, the PTE's accessed bit is set by the CPU
+ * hardware before the NPF vmexit. This is not the expected
+ * behaviour and our tests fail because of it. Work around it by
+ * disabling support for GUEST_MAXPHYADDR < HOST_MAXPHYADDR when
+ * NPT is enabled. Userspace can query the KVM_CAP_SMALLER_MAXPHYADDR
+ * extension to learn whether the support is available and decide
+ * how to handle it. If future AMD CPU models change the behaviour
+ * described above, this variable can be changed accordingly.
+ */
+ allow_smaller_maxphyaddr = !npt_enabled;
+
+ /* Setup shadow_me_value and shadow_me_mask */
+ kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask);
+
+ svm_adjust_mmio_mask();
+
+ nrips = nrips && boot_cpu_has(X86_FEATURE_NRIPS);
+
+ if (lbrv) {
+ if (!boot_cpu_has(X86_FEATURE_LBRV))
+ lbrv = false;
+ else
+ pr_info("LBR virtualization supported\n");
+ }
+
+ iopm_va = svm_alloc_permissions_map(IOPM_SIZE, GFP_KERNEL);
+ if (!iopm_va)
+ return -ENOMEM;
+
+ iopm_base = __sme_set(__pa(iopm_va));
+
+ /*
+ * Note, SEV setup consumes npt_enabled and enable_mmio_caching (which
+ * may be modified by svm_adjust_mmio_mask()), as well as nrips.
+ */
+ sev_hardware_setup();
+
+ svm_hv_hardware_setup();
+
+ enable_apicv = avic_hardware_setup();
+ if (!enable_apicv) {
+ enable_ipiv = false;
+ svm_x86_ops.vcpu_blocking = NULL;
+ svm_x86_ops.vcpu_unblocking = NULL;
+ svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;
+ }
+
+ if (vls) {
+ if (!npt_enabled ||
+ !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
+ !IS_ENABLED(CONFIG_X86_64)) {
+ vls = false;
+ } else {
+ pr_info("Virtual VMLOAD VMSAVE supported\n");
+ }
+ }
+
+ if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
+ svm_gp_erratum_intercept = false;
+
+ if (vgif) {
+ if (!boot_cpu_has(X86_FEATURE_VGIF))
+ vgif = false;
+ else
+ pr_info("Virtual GIF supported\n");
+ }
+
+ vnmi = vgif && vnmi && boot_cpu_has(X86_FEATURE_VNMI);
+ if (vnmi)
+ pr_info("Virtual NMI enabled\n");
+
+ if (!vnmi) {
+ svm_x86_ops.is_vnmi_pending = NULL;
+ svm_x86_ops.set_vnmi_pending = NULL;
+ }
+
+ if (!enable_pmu)
+ pr_info("PMU virtualization is disabled\n");
+
+ svm_set_cpu_caps();
+
+ kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_CD_NW_CLEARED;
+
+ for_each_possible_cpu(cpu) {
+ r = svm_cpu_init(cpu);
+ if (r)
+ goto err;
+ }
+
+ return 0;
+
+err:
+ svm_hardware_unsetup();
+ return r;
+}
+
+
+static struct kvm_x86_init_ops svm_init_ops __initdata = {
+ .hardware_setup = svm_hardware_setup,
+
+ .runtime_ops = &svm_x86_ops,
+ .pmu_ops = &amd_pmu_ops,
+};
+
+static void __svm_exit(void)
+{
+ kvm_x86_vendor_exit();
+}
+
+static int __init svm_init(void)
+{
+ int r;
+
+ KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_svm);
+
+ __unused_size_checks();
+
+ if (!kvm_is_svm_supported())
+ return -EOPNOTSUPP;
+
+ r = kvm_x86_vendor_init(&svm_init_ops);
+ if (r)
+ return r;
+
+ /*
+ * Common KVM initialization _must_ come last, after this, /dev/kvm is
+ * exposed to userspace!
+ */
+ r = kvm_init(sizeof(struct vcpu_svm), __alignof__(struct vcpu_svm),
+ THIS_MODULE);
+ if (r)
+ goto err_kvm_init;
+
+ return 0;
+
+err_kvm_init:
+ __svm_exit();
+ return r;
+}
+
+static void __exit svm_exit(void)
+{
+ kvm_exit();
+ __svm_exit();
+}
+
+module_init(svm_init)
+module_exit(svm_exit)
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
new file mode 100644
index 000000000000..9e151dbdef25
--- /dev/null
+++ b/arch/x86/kvm/svm/svm.h
@@ -0,0 +1,947 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * AMD SVM support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ */
+
+#ifndef __SVM_SVM_H
+#define __SVM_SVM_H
+
+#include <linux/kvm_types.h>
+#include <linux/kvm_host.h>
+#include <linux/bits.h>
+
+#include <asm/svm.h>
+#include <asm/sev-common.h>
+
+#include "cpuid.h"
+#include "kvm_cache_regs.h"
+
+/*
+ * Helpers to convert to/from physical addresses for pages whose address is
+ * consumed directly by hardware. Even though it's a physical address, SVM
+ * often restricts the address to the natural width, hence 'unsigned long'
+ * instead of 'hpa_t'.
+ */
+static inline unsigned long __sme_page_pa(struct page *page)
+{
+ return __sme_set(page_to_pfn(page) << PAGE_SHIFT);
+}
+
+static inline struct page *__sme_pa_to_page(unsigned long pa)
+{
+ return pfn_to_page(__sme_clr(pa) >> PAGE_SHIFT);
+}
+
+#define IOPM_SIZE (PAGE_SIZE * 3)
+#define MSRPM_SIZE (PAGE_SIZE * 2)
+
+extern bool npt_enabled;
+extern int nrips;
+extern int vgif;
+extern bool intercept_smi;
+extern bool vnmi;
+extern int lbrv;
+
+extern int tsc_aux_uret_slot __ro_after_init;
+
+extern struct kvm_x86_ops svm_x86_ops __initdata;
+
+/*
+ * Clean bits in VMCB.
+ * VMCB_ALL_CLEAN_MASK might also need to
+ * be updated if this enum is modified.
+ */
+enum {
+ VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
+ pause filter count */
+ VMCB_PERM_MAP, /* IOPM Base and MSRPM Base */
+ VMCB_ASID, /* ASID */
+ VMCB_INTR, /* int_ctl, int_vector */
+ VMCB_NPT, /* npt_en, nCR3, gPAT */
+ VMCB_CR, /* CR0, CR3, CR4, EFER */
+ VMCB_DR, /* DR6, DR7 */
+ VMCB_DT, /* GDT, IDT */
+ VMCB_SEG, /* CS, DS, SS, ES, CPL */
+ VMCB_CR2, /* CR2 only */
+ VMCB_LBR, /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
+ VMCB_AVIC, /* AVIC APIC_BAR, AVIC APIC_BACKING_PAGE,
+ * AVIC PHYSICAL_TABLE pointer,
+ * AVIC LOGICAL_TABLE pointer
+ */
+ VMCB_CET, /* S_CET, SSP, ISST_ADDR */
+ VMCB_SW = 31, /* Reserved for hypervisor/software use */
+};
+
+#define VMCB_ALL_CLEAN_MASK ( \
+ (1U << VMCB_INTERCEPTS) | (1U << VMCB_PERM_MAP) | \
+ (1U << VMCB_ASID) | (1U << VMCB_INTR) | \
+ (1U << VMCB_NPT) | (1U << VMCB_CR) | (1U << VMCB_DR) | \
+ (1U << VMCB_DT) | (1U << VMCB_SEG) | (1U << VMCB_CR2) | \
+ (1U << VMCB_LBR) | (1U << VMCB_AVIC) | (1U << VMCB_CET) | \
+ (1U << VMCB_SW))
+
+/* TPR and CR2 are always written before VMRUN */
+#define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2))
+
+struct kvm_sev_info {
+ bool active; /* SEV enabled guest */
+ bool es_active; /* SEV-ES enabled guest */
+ bool need_init; /* waiting for SEV_INIT2 */
+ unsigned int asid; /* ASID used for this guest */
+ unsigned int handle; /* SEV firmware handle */
+ int fd; /* SEV device fd */
+ unsigned long policy;
+ unsigned long pages_locked; /* Number of pages locked */
+ struct list_head regions_list; /* List of registered regions */
+ u64 ap_jump_table; /* SEV-ES AP Jump Table address */
+ u64 vmsa_features;
+ u16 ghcb_version; /* Highest guest GHCB protocol version allowed */
+ struct kvm *enc_context_owner; /* Owner of copied encryption context */
+ struct list_head mirror_vms; /* List of VMs mirroring */
+ struct list_head mirror_entry; /* Use as a list entry of mirrors */
+ struct misc_cg *misc_cg; /* For misc cgroup accounting */
+ atomic_t migration_in_progress;
+ void *snp_context; /* SNP guest context page */
+ void *guest_req_buf; /* Bounce buffer for SNP Guest Request input */
+ void *guest_resp_buf; /* Bounce buffer for SNP Guest Request output */
+ struct mutex guest_req_mutex; /* Must acquire before using bounce buffers */
+ cpumask_var_t have_run_cpus; /* CPUs that have done VMRUN for this VM. */
+};
+
+struct kvm_svm {
+ struct kvm kvm;
+
+ /* Struct members for AVIC */
+ u32 avic_vm_id;
+ u32 *avic_logical_id_table;
+ u64 *avic_physical_id_table;
+ struct hlist_node hnode;
+
+ struct kvm_sev_info sev_info;
+};
+
+struct kvm_vcpu;
+
+struct kvm_vmcb_info {
+ struct vmcb *ptr;
+ unsigned long pa;
+ int cpu;
+ uint64_t asid_generation;
+};
+
+struct vmcb_save_area_cached {
+ u64 efer;
+ u64 cr4;
+ u64 cr3;
+ u64 cr0;
+ u64 dr7;
+ u64 dr6;
+};
+
+struct vmcb_ctrl_area_cached {
+ u32 intercepts[MAX_INTERCEPT];
+ u16 pause_filter_thresh;
+ u16 pause_filter_count;
+ u64 iopm_base_pa;
+ u64 msrpm_base_pa;
+ u64 tsc_offset;
+ u32 asid;
+ u8 tlb_ctl;
+ u32 int_ctl;
+ u32 int_vector;
+ u32 int_state;
+ u32 exit_code;
+ u32 exit_code_hi;
+ u64 exit_info_1;
+ u64 exit_info_2;
+ u32 exit_int_info;
+ u32 exit_int_info_err;
+ u64 nested_ctl;
+ u32 event_inj;
+ u32 event_inj_err;
+ u64 next_rip;
+ u64 nested_cr3;
+ u64 virt_ext;
+ u32 clean;
+ u64 bus_lock_rip;
+ union {
+#if IS_ENABLED(CONFIG_HYPERV) || IS_ENABLED(CONFIG_KVM_HYPERV)
+ struct hv_vmcb_enlightenments hv_enlightenments;
+#endif
+ u8 reserved_sw[32];
+ };
+};
+
+struct svm_nested_state {
+ struct kvm_vmcb_info vmcb02;
+ u64 hsave_msr;
+ u64 vm_cr_msr;
+ u64 vmcb12_gpa;
+ u64 last_vmcb12_gpa;
+
+ /*
+ * The MSR permissions map used for vmcb02, which is the merge result
+ * of vmcb01 and vmcb12
+ */
+ void *msrpm;
+
+ /* A VMRUN has started but has not yet been performed, so
+ * we cannot inject a nested vmexit yet. */
+ bool nested_run_pending;
+
+ /* cache for control fields of the guest */
+ struct vmcb_ctrl_area_cached ctl;
+
+ /*
+ * Note: this struct is not kept up-to-date while L2 runs; it is only
+ * valid within nested_svm_vmrun.
+ */
+ struct vmcb_save_area_cached save;
+
+ bool initialized;
+
+ /*
+ * Indicates whether the MSR bitmap for L2 needs to be rebuilt due to
+ * changes in the MSR bitmap for L1 or switching to a different L2. Note,
+ * this flag can only be used reliably in conjunction with a paravirt L1
+ * which informs L0 whether any changes to the MSR bitmap for L2 were
+ * done on its side.
+ */
+ bool force_msr_bitmap_recalc;
+};
+
+struct vcpu_sev_es_state {
+ /* SEV-ES support */
+ struct sev_es_save_area *vmsa;
+ struct ghcb *ghcb;
+ u8 valid_bitmap[16];
+ struct kvm_host_map ghcb_map;
+ bool received_first_sipi;
+ unsigned int ap_reset_hold_type;
+
+ /* SEV-ES scratch area support */
+ u64 sw_scratch;
+ void *ghcb_sa;
+ u32 ghcb_sa_len;
+ bool ghcb_sa_sync;
+ bool ghcb_sa_free;
+
+ /* SNP Page-State-Change buffer entries currently being processed */
+ u16 psc_idx;
+ u16 psc_inflight;
+ bool psc_2m;
+
+ u64 ghcb_registered_gpa;
+
+ struct mutex snp_vmsa_mutex; /* Used to handle concurrent updates of VMSA. */
+ gpa_t snp_vmsa_gpa;
+ bool snp_ap_waiting_for_reset;
+ bool snp_has_guest_vmsa;
+};
+
+struct vcpu_svm {
+ struct kvm_vcpu vcpu;
+ /* vmcb always points at current_vmcb->ptr, it's purely a shorthand. */
+ struct vmcb *vmcb;
+ struct kvm_vmcb_info vmcb01;
+ struct kvm_vmcb_info *current_vmcb;
+ u32 asid;
+ u32 sysenter_esp_hi;
+ u32 sysenter_eip_hi;
+ uint64_t tsc_aux;
+
+ u64 msr_decfg;
+
+ u64 next_rip;
+
+ u64 spec_ctrl;
+
+ u64 tsc_ratio_msr;
+ /*
+ * Contains guest-controlled bits of VIRT_SPEC_CTRL, which will be
+ * translated into the appropriate L2_CFG bits on the host to
+ * perform speculative control.
+ */
+ u64 virt_spec_ctrl;
+
+ void *msrpm;
+
+ ulong nmi_iret_rip;
+
+ struct svm_nested_state nested;
+
+ /* NMI mask value, used when vNMI is not enabled */
+ bool nmi_masked;
+
+ /*
+ * True when NMIs are still masked but guest IRET was just intercepted
+ * and KVM is waiting for RIP to change, which will signal that the
+ * intercepted IRET was retired and thus NMI can be unmasked.
+ */
+ bool awaiting_iret_completion;
+
+ /*
+ * Set when KVM is awaiting IRET completion and needs to inject NMIs as
+ * soon as the IRET completes (e.g. NMI is pending injection). KVM
+ * temporarily steals RFLAGS.TF to single-step the guest in this case
+ * in order to regain control as soon as the NMI-blocking condition
+ * goes away.
+ */
+ bool nmi_singlestep;
+ u64 nmi_singlestep_guest_rflags;
+
+ bool nmi_l1_to_l2;
+
+ unsigned long soft_int_csbase;
+ unsigned long soft_int_old_rip;
+ unsigned long soft_int_next_rip;
+ bool soft_int_injected;
+
+ u32 ldr_reg;
+ u32 dfr_reg;
+
+ /* This is essentially a shadow of the vCPU's actual entry in the
+ * Physical ID table that is programmed into the VMCB, i.e. that is
+ * seen by the CPU. If IPI virtualization is disabled, IsRunning is
+ * only ever set in the shadow, i.e. is never propagated to the "real"
+ * table, so that hardware never sees IsRunning=1.
+ */
+ u64 avic_physical_id_entry;
+
+ /*
+ * Per-vCPU list of irqfds that are eligible to post IRQs directly to
+ * the vCPU (a.k.a. device posted IRQs, a.k.a. IRQ bypass). The list
+ * is used to reconfigure IRTEs when the vCPU is loaded/put (to set the
+ * target pCPU), when AVIC is toggled on/off (to (de)activate bypass),
+ * and if the irqfd becomes ineligible for posting (to put the IRTE
+ * back into remapped mode).
+ */
+ struct list_head ir_list;
+ raw_spinlock_t ir_list_lock;
+
+ struct vcpu_sev_es_state sev_es;
+
+ bool guest_state_loaded;
+
+ bool x2avic_msrs_intercepted;
+ bool lbr_msrs_intercepted;
+
+ /* Guest GIF value, used when vGIF is not enabled */
+ bool guest_gif;
+};
+
+struct svm_cpu_data {
+ u64 asid_generation;
+ u32 max_asid;
+ u32 next_asid;
+ u32 min_asid;
+
+ bool bp_spec_reduce_set;
+
+ struct vmcb *save_area;
+ unsigned long save_area_pa;
+
+ /* index = sev_asid, value = vmcb pointer */
+ struct vmcb **sev_vmcbs;
+};
+
+DECLARE_PER_CPU(struct svm_cpu_data, svm_data);
+
+void recalc_intercepts(struct vcpu_svm *svm);
+
+static __always_inline struct kvm_svm *to_kvm_svm(struct kvm *kvm)
+{
+ return container_of(kvm, struct kvm_svm, kvm);
+}
+
+static __always_inline struct kvm_sev_info *to_kvm_sev_info(struct kvm *kvm)
+{
+ return &to_kvm_svm(kvm)->sev_info;
+}
+
+#ifdef CONFIG_KVM_AMD_SEV
+static __always_inline bool sev_guest(struct kvm *kvm)
+{
+ return to_kvm_sev_info(kvm)->active;
+}
+static __always_inline bool sev_es_guest(struct kvm *kvm)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+
+ return sev->es_active && !WARN_ON_ONCE(!sev->active);
+}
+
+static __always_inline bool sev_snp_guest(struct kvm *kvm)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+
+ return (sev->vmsa_features & SVM_SEV_FEAT_SNP_ACTIVE) &&
+ !WARN_ON_ONCE(!sev_es_guest(kvm));
+}
+#else
+#define sev_guest(kvm) false
+#define sev_es_guest(kvm) false
+#define sev_snp_guest(kvm) false
+#endif
+
+static inline bool ghcb_gpa_is_registered(struct vcpu_svm *svm, u64 val)
+{
+ return svm->sev_es.ghcb_registered_gpa == val;
+}
+
+static inline void vmcb_mark_all_dirty(struct vmcb *vmcb)
+{
+ vmcb->control.clean = 0;
+}
+
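+/*
+ * Mark everything clean except the bits in VMCB_ALWAYS_DIRTY_MASK; TPR
+ * (int_ctl) and CR2 are written before every VMRUN and so are always
+ * left dirty.
+ */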
+static inline void vmcb_mark_all_clean(struct vmcb *vmcb)
+{
+ vmcb->control.clean = VMCB_ALL_CLEAN_MASK
+ & ~VMCB_ALWAYS_DIRTY_MASK;
+}
+
+static inline void vmcb_mark_dirty(struct vmcb *vmcb, int bit)
+{
+ vmcb->control.clean &= ~(1 << bit);
+}
+
+static inline bool vmcb_is_dirty(struct vmcb *vmcb, int bit)
+{
+ return !test_bit(bit, (unsigned long *)&vmcb->control.clean);
+}
+
+static __always_inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
+{
+ return container_of(vcpu, struct vcpu_svm, vcpu);
+}
+
+/*
+ * Only the PDPTRs are loaded on demand into the shadow MMU. All other
+ * fields are synchronized on VM-Exit, because accessing the VMCB is cheap.
+ *
+ * CR3 might be out of date in the VMCB but it is not marked dirty; instead,
+ * KVM_REQ_LOAD_MMU_PGD is always requested when the cached vcpu->arch.cr3
+ * is changed. svm_load_mmu_pgd() then syncs the new CR3 value into the VMCB.
+ */
+#define SVM_REGS_LAZY_LOAD_SET (1 << VCPU_EXREG_PDPTR)
+
+static inline void vmcb_set_intercept(struct vmcb_control_area *control, u32 bit)
+{
+ WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
+ __set_bit(bit, (unsigned long *)&control->intercepts);
+}
+
+static inline void vmcb_clr_intercept(struct vmcb_control_area *control, u32 bit)
+{
+ WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
+ __clear_bit(bit, (unsigned long *)&control->intercepts);
+}
+
+static inline bool vmcb_is_intercept(struct vmcb_control_area *control, u32 bit)
+{
+ WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
+ return test_bit(bit, (unsigned long *)&control->intercepts);
+}
+
+static inline bool vmcb12_is_intercept(struct vmcb_ctrl_area_cached *control, u32 bit)
+{
+ WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
+ return test_bit(bit, (unsigned long *)&control->intercepts);
+}
+
+static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit)
+{
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+
+ WARN_ON_ONCE(bit >= 32);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit)
+{
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+
+ WARN_ON_ONCE(bit >= 32);
+ vmcb_clr_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void svm_set_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+
+ vmcb_set_intercept(&vmcb->control, bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void svm_clr_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+
+ vmcb_clr_intercept(&vmcb->control, bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit)
+{
+ return vmcb_is_intercept(&svm->vmcb->control, bit);
+}
+
+static inline bool nested_vgif_enabled(struct vcpu_svm *svm)
+{
+ return guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_VGIF) &&
+ (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK);
+}
+
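+/*
+ * Return the VMCB whose int_ctl holds the vGIF state for the current context:
+ * vmcb02 when L2 is running without nested vGIF, otherwise vmcb01. Returns
+ * NULL when vGIF is not in use, in which case guest_gif tracks GIF instead.
+ */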
+static inline struct vmcb *get_vgif_vmcb(struct vcpu_svm *svm)
+{
+ if (!vgif)
+ return NULL;
+
+ if (is_guest_mode(&svm->vcpu) && !nested_vgif_enabled(svm))
+ return svm->nested.vmcb02.ptr;
+ else
+ return svm->vmcb01.ptr;
+}
+
+static inline void enable_gif(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = get_vgif_vmcb(svm);
+
+ if (vmcb)
+ vmcb->control.int_ctl |= V_GIF_MASK;
+ else
+ svm->guest_gif = true;
+}
+
+static inline void disable_gif(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = get_vgif_vmcb(svm);
+
+ if (vmcb)
+ vmcb->control.int_ctl &= ~V_GIF_MASK;
+ else
+ svm->guest_gif = false;
+}
+
+static inline bool gif_set(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = get_vgif_vmcb(svm);
+
+ if (vmcb)
+ return !!(vmcb->control.int_ctl & V_GIF_MASK);
+ else
+ return svm->guest_gif;
+}
+
+static inline bool nested_npt_enabled(struct vcpu_svm *svm)
+{
+ return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE;
+}
+
+static inline bool nested_vnmi_enabled(struct vcpu_svm *svm)
+{
+ return guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_VNMI) &&
+ (svm->nested.ctl.int_ctl & V_NMI_ENABLE_MASK);
+}
+
+static inline bool is_x2apic_msrpm_offset(u32 offset)
+{
+ /* Each u8 covers 4 MSRs, and each u32 holds 4 u8s, i.e. 16 MSRs per offset. */
+ u32 msr = offset * 16;
+
+ return (msr >= APIC_BASE_MSR) &&
+ (msr < (APIC_BASE_MSR + 0x100));
+}
+
+static inline struct vmcb *get_vnmi_vmcb_l1(struct vcpu_svm *svm)
+{
+ if (!vnmi)
+ return NULL;
+
+ if (is_guest_mode(&svm->vcpu))
+ return NULL;
+ else
+ return svm->vmcb01.ptr;
+}
+
+static inline bool is_vnmi_enabled(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = get_vnmi_vmcb_l1(svm);
+
+ if (vmcb)
+ return !!(vmcb->control.int_ctl & V_NMI_ENABLE_MASK);
+ else
+ return false;
+}
+
+static inline void svm_vmgexit_set_return_code(struct vcpu_svm *svm,
+ u64 response, u64 data)
+{
+ ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, response);
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, data);
+}
+
+static inline void svm_vmgexit_inject_exception(struct vcpu_svm *svm, u8 vector)
+{
+ u64 data = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT | vector;
+
+ svm_vmgexit_set_return_code(svm, GHCB_HV_RESP_ISSUE_EXCEPTION, data);
+}
+
+static inline void svm_vmgexit_bad_input(struct vcpu_svm *svm, u64 suberror)
+{
+ svm_vmgexit_set_return_code(svm, GHCB_HV_RESP_MALFORMED_INPUT, suberror);
+}
+
+static inline void svm_vmgexit_success(struct vcpu_svm *svm, u64 data)
+{
+ svm_vmgexit_set_return_code(svm, GHCB_HV_RESP_NO_ACTION, data);
+}
+
+static inline void svm_vmgexit_no_action(struct vcpu_svm *svm, u64 data)
+{
+ svm_vmgexit_set_return_code(svm, GHCB_HV_RESP_NO_ACTION, data);
+}
+
+/*
+ * The MSRPM is 8KiB in size, divided into four 2KiB ranges (the fourth range
+ * is reserved). Each MSR within a range is covered by two bits, one each for
+ * read (bit 0) and write (bit 1), where a bit value of '1' means intercepted.
+ */
+#define SVM_MSRPM_BYTES_PER_RANGE 2048
+#define SVM_BITS_PER_MSR 2
+#define SVM_MSRS_PER_BYTE (BITS_PER_BYTE / SVM_BITS_PER_MSR)
+#define SVM_MSRS_PER_RANGE (SVM_MSRPM_BYTES_PER_RANGE * SVM_MSRS_PER_BYTE)
+static_assert(SVM_MSRS_PER_RANGE == 8192);
+#define SVM_MSRPM_OFFSET_MASK (SVM_MSRS_PER_RANGE - 1)
+
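+/*
+ * Translate an MSR index into its bit offset within the MSRPM. Range 0
+ * covers MSRs 0x0 - 0x1fff, range 1 covers 0xc0000000 - 0xc0001fff, and
+ * range 2 covers 0xc0010000 - 0xc0011fff; any other MSR is unrepresentable.
+ */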
+static __always_inline int svm_msrpm_bit_nr(u32 msr)
+{
+ int range_nr;
+
+ switch (msr & ~SVM_MSRPM_OFFSET_MASK) {
+ case 0:
+ range_nr = 0;
+ break;
+ case 0xc0000000:
+ range_nr = 1;
+ break;
+ case 0xc0010000:
+ range_nr = 2;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return range_nr * SVM_MSRPM_BYTES_PER_RANGE * BITS_PER_BYTE +
+ (msr & SVM_MSRPM_OFFSET_MASK) * SVM_BITS_PER_MSR;
+}
+
+#define __BUILD_SVM_MSR_BITMAP_HELPER(rtype, action, bitop, access, bit_rw) \
+static inline rtype svm_##action##_msr_bitmap_##access(unsigned long *bitmap, \
+ u32 msr) \
+{ \
+ int bit_nr; \
+ \
+ bit_nr = svm_msrpm_bit_nr(msr); \
+ if (bit_nr < 0) \
+ return (rtype)true; \
+ \
+ return bitop##_bit(bit_nr + bit_rw, bitmap); \
+}
+
+#define BUILD_SVM_MSR_BITMAP_HELPERS(ret_type, action, bitop) \
+ __BUILD_SVM_MSR_BITMAP_HELPER(ret_type, action, bitop, read, 0) \
+ __BUILD_SVM_MSR_BITMAP_HELPER(ret_type, action, bitop, write, 1)
+
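+/*
+ * Instantiate svm_{test,clear,set}_msr_bitmap_{read,write}(). For MSRs
+ * outside the architected ranges, the test helpers report "intercepted" and
+ * the clear/set helpers do nothing.
+ */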
+BUILD_SVM_MSR_BITMAP_HELPERS(bool, test, test)
+BUILD_SVM_MSR_BITMAP_HELPERS(void, clear, __clear)
+BUILD_SVM_MSR_BITMAP_HELPERS(void, set, __set)
+
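+/* From KVM's perspective, all DEBUGCTL bits other than the LBR enable bit are reserved. */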
+#define DEBUGCTL_RESERVED_BITS (~DEBUGCTLMSR_LBR)
+
+/* svm.c */
+extern bool dump_invalid_vmcb;
+
+void *svm_alloc_permissions_map(unsigned long size, gfp_t gfp_mask);
+
+static inline void *svm_vcpu_alloc_msrpm(void)
+{
+ return svm_alloc_permissions_map(MSRPM_SIZE, GFP_KERNEL_ACCOUNT);
+}
+
+void svm_vcpu_free_msrpm(void *msrpm);
+void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb);
+void svm_enable_lbrv(struct kvm_vcpu *vcpu);
+void svm_update_lbrv(struct kvm_vcpu *vcpu);
+
+int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer);
+void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
+void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+void disable_nmi_singlestep(struct vcpu_svm *svm);
+bool svm_smi_blocked(struct kvm_vcpu *vcpu);
+bool svm_nmi_blocked(struct kvm_vcpu *vcpu);
+bool svm_interrupt_blocked(struct kvm_vcpu *vcpu);
+void svm_set_gif(struct vcpu_svm *svm, bool value);
+int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code);
+void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
+ int read, int write);
+void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
+ int trig_mode, int vec);
+
+void svm_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set);
+
+static inline void svm_disable_intercept_for_msr(struct kvm_vcpu *vcpu,
+ u32 msr, int type)
+{
+ svm_set_intercept_for_msr(vcpu, msr, type, false);
+}
+
+static inline void svm_enable_intercept_for_msr(struct kvm_vcpu *vcpu,
+ u32 msr, int type)
+{
+ svm_set_intercept_for_msr(vcpu, msr, type, true);
+}
+
+/* nested.c */
+
+#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
+#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */
+#define NESTED_EXIT_CONTINUE 2 /* Further checks needed */
+
+static inline bool nested_svm_virtualize_tpr(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ return is_guest_mode(vcpu) && (svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK);
+}
+
+static inline bool nested_exit_on_smi(struct vcpu_svm *svm)
+{
+ return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SMI);
+}
+
+static inline bool nested_exit_on_intr(struct vcpu_svm *svm)
+{
+ return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_INTR);
+}
+
+static inline bool nested_exit_on_nmi(struct vcpu_svm *svm)
+{
+ return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_NMI);
+}
+
+int __init nested_svm_init_msrpm_merge_offsets(void);
+
+int enter_svm_guest_mode(struct kvm_vcpu *vcpu,
+ u64 vmcb_gpa, struct vmcb *vmcb12, bool from_vmrun);
+void svm_leave_nested(struct kvm_vcpu *vcpu);
+void svm_free_nested(struct vcpu_svm *svm);
+int svm_allocate_nested(struct vcpu_svm *svm);
+int nested_svm_vmrun(struct kvm_vcpu *vcpu);
+void svm_copy_vmrun_state(struct vmcb_save_area *to_save,
+ struct vmcb_save_area *from_save);
+void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb);
+int nested_svm_vmexit(struct vcpu_svm *svm);
+
+static inline int nested_svm_simple_vmexit(struct vcpu_svm *svm, u32 exit_code)
+{
+ svm->vmcb->control.exit_code = exit_code;
+ svm->vmcb->control.exit_info_1 = 0;
+ svm->vmcb->control.exit_info_2 = 0;
+ return nested_svm_vmexit(svm);
+}
+
+int nested_svm_exit_handled(struct vcpu_svm *svm);
+int nested_svm_check_permissions(struct kvm_vcpu *vcpu);
+int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
+ bool has_error_code, u32 error_code);
+int nested_svm_exit_special(struct vcpu_svm *svm);
+void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu);
+void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu);
+void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm,
+ struct vmcb_control_area *control);
+void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm,
+ struct vmcb_save_area *save);
+void nested_sync_control_from_vmcb02(struct vcpu_svm *svm);
+void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm);
+void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb);
+
+extern struct kvm_x86_nested_ops svm_nested_ops;
+
+/* avic.c */
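+/*
+ * AVIC (APICv) is kept disabled for as long as any of the inhibit reasons
+ * below is active on the VM or vCPU.
+ */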
+#define AVIC_REQUIRED_APICV_INHIBITS \
+( \
+ BIT(APICV_INHIBIT_REASON_DISABLED) | \
+ BIT(APICV_INHIBIT_REASON_ABSENT) | \
+ BIT(APICV_INHIBIT_REASON_HYPERV) | \
+ BIT(APICV_INHIBIT_REASON_NESTED) | \
+ BIT(APICV_INHIBIT_REASON_IRQWIN) | \
+ BIT(APICV_INHIBIT_REASON_PIT_REINJ) | \
+ BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \
+ BIT(APICV_INHIBIT_REASON_SEV) | \
+ BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \
+ BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \
+ BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) | \
+ BIT(APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED) | \
+ BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG) \
+)
+
+bool __init avic_hardware_setup(void);
+void avic_hardware_unsetup(void);
+int avic_alloc_physical_id_table(struct kvm *kvm);
+void avic_vm_destroy(struct kvm *kvm);
+int avic_vm_init(struct kvm *kvm);
+void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb);
+int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu);
+int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu);
+int avic_init_vcpu(struct vcpu_svm *svm);
+void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+void avic_vcpu_put(struct kvm_vcpu *vcpu);
+void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu);
+void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
+int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+ unsigned int host_irq, uint32_t guest_irq,
+ struct kvm_vcpu *vcpu, u32 vector);
+void avic_vcpu_blocking(struct kvm_vcpu *vcpu);
+void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
+void avic_ring_doorbell(struct kvm_vcpu *vcpu);
+unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu);
+void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu);
+
+
+/* sev.c */
+
+int pre_sev_run(struct vcpu_svm *svm, int cpu);
+void sev_init_vmcb(struct vcpu_svm *svm, bool init_event);
+void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm);
+int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in);
+void sev_es_recalc_msr_intercepts(struct kvm_vcpu *vcpu);
+void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
+void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa);
+void sev_es_unmap_ghcb(struct vcpu_svm *svm);
+
+#ifdef CONFIG_KVM_AMD_SEV
+int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp);
+int sev_mem_enc_register_region(struct kvm *kvm,
+ struct kvm_enc_region *range);
+int sev_mem_enc_unregister_region(struct kvm *kvm,
+ struct kvm_enc_region *range);
+int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd);
+int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd);
+void sev_guest_memory_reclaimed(struct kvm *kvm);
+int sev_handle_vmgexit(struct kvm_vcpu *vcpu);
+
+/* These symbols are used in common code and are stubbed below. */
+
+struct page *snp_safe_alloc_page_node(int node, gfp_t gfp);
+static inline struct page *snp_safe_alloc_page(void)
+{
+ return snp_safe_alloc_page_node(numa_node_id(), GFP_KERNEL_ACCOUNT);
+}
+
+int sev_vcpu_create(struct kvm_vcpu *vcpu);
+void sev_free_vcpu(struct kvm_vcpu *vcpu);
+void sev_vm_destroy(struct kvm *kvm);
+void __init sev_set_cpu_caps(void);
+void __init sev_hardware_setup(void);
+void sev_hardware_unsetup(void);
+int sev_cpu_init(struct svm_cpu_data *sd);
+int sev_dev_get_attr(u32 group, u64 attr, u64 *val);
+extern unsigned int max_sev_asid;
+void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code);
+int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order);
+void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end);
+int sev_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private);
+struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu);
+void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa);
+#else
+static inline struct page *snp_safe_alloc_page_node(int node, gfp_t gfp)
+{
+ return alloc_pages_node(node, gfp | __GFP_ZERO, 0);
+}
+
+static inline struct page *snp_safe_alloc_page(void)
+{
+ return snp_safe_alloc_page_node(numa_node_id(), GFP_KERNEL_ACCOUNT);
+}
+
+static inline int sev_vcpu_create(struct kvm_vcpu *vcpu) { return 0; }
+static inline void sev_free_vcpu(struct kvm_vcpu *vcpu) {}
+static inline void sev_vm_destroy(struct kvm *kvm) {}
+static inline void __init sev_set_cpu_caps(void) {}
+static inline void __init sev_hardware_setup(void) {}
+static inline void sev_hardware_unsetup(void) {}
+static inline int sev_cpu_init(struct svm_cpu_data *sd) { return 0; }
+static inline int sev_dev_get_attr(u32 group, u64 attr, u64 *val) { return -ENXIO; }
+#define max_sev_asid 0
+static inline void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) {}
+static inline int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order)
+{
+ return 0;
+}
+static inline void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) {}
+static inline int sev_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private)
+{
+ return 0;
+}
+
+static inline struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu)
+{
+ return NULL;
+}
+static inline void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa) {}
+#endif
+
+/* vmenter.S */
+
+void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted,
+ struct sev_es_save_area *hostsa);
+void __svm_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted);
+
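+/*
+ * Per-field GHCB accessors: kvm_ghcb_get_<field>() reads the field from the
+ * guest's GHCB, kvm_ghcb_<field>_is_valid() consults the captured valid
+ * bitmap, and kvm_ghcb_get_<field>_if_valid() returns 0 if the guest didn't
+ * mark the field as valid.
+ */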
+#define DEFINE_KVM_GHCB_ACCESSORS(field) \
+static __always_inline u64 kvm_ghcb_get_##field(struct vcpu_svm *svm) \
+{ \
+ return READ_ONCE(svm->sev_es.ghcb->save.field); \
+} \
+ \
+static __always_inline bool kvm_ghcb_##field##_is_valid(const struct vcpu_svm *svm) \
+{ \
+ return test_bit(GHCB_BITMAP_IDX(field), \
+ (unsigned long *)&svm->sev_es.valid_bitmap); \
+} \
+ \
+static __always_inline u64 kvm_ghcb_get_##field##_if_valid(struct vcpu_svm *svm) \
+{ \
+ return kvm_ghcb_##field##_is_valid(svm) ? kvm_ghcb_get_##field(svm) : 0; \
+}
+
+DEFINE_KVM_GHCB_ACCESSORS(cpl)
+DEFINE_KVM_GHCB_ACCESSORS(rax)
+DEFINE_KVM_GHCB_ACCESSORS(rcx)
+DEFINE_KVM_GHCB_ACCESSORS(rdx)
+DEFINE_KVM_GHCB_ACCESSORS(rbx)
+DEFINE_KVM_GHCB_ACCESSORS(rsi)
+DEFINE_KVM_GHCB_ACCESSORS(sw_exit_code)
+DEFINE_KVM_GHCB_ACCESSORS(sw_exit_info_1)
+DEFINE_KVM_GHCB_ACCESSORS(sw_exit_info_2)
+DEFINE_KVM_GHCB_ACCESSORS(sw_scratch)
+DEFINE_KVM_GHCB_ACCESSORS(xcr0)
+DEFINE_KVM_GHCB_ACCESSORS(xss)
+
+#endif
diff --git a/arch/x86/kvm/svm/svm_onhyperv.c b/arch/x86/kvm/svm/svm_onhyperv.c
new file mode 100644
index 000000000000..a8e78c0e5956
--- /dev/null
+++ b/arch/x86/kvm/svm/svm_onhyperv.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KVM L1 hypervisor optimizations on Hyper-V for SVM.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_host.h>
+
+#include <asm/mshyperv.h>
+
+#include "svm.h"
+#include "svm_ops.h"
+
+#include "hyperv.h"
+#include "kvm_onhyperv.h"
+#include "svm_onhyperv.h"
+
+static int svm_hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu)
+{
+ struct hv_vmcb_enlightenments *hve;
+ hpa_t partition_assist_page = hv_get_partition_assist_page(vcpu);
+
+ if (partition_assist_page == INVALID_PAGE)
+ return -ENOMEM;
+
+ hve = &to_svm(vcpu)->vmcb->control.hv_enlightenments;
+
+ hve->partition_assist_page = partition_assist_page;
+ hve->hv_vm_id = (unsigned long)vcpu->kvm;
+ if (!hve->hv_enlightenments_control.nested_flush_hypercall) {
+ hve->hv_enlightenments_control.nested_flush_hypercall = 1;
+ vmcb_mark_dirty(to_svm(vcpu)->vmcb, HV_VMCB_NESTED_ENLIGHTENMENTS);
+ }
+
+ return 0;
+}
+
+__init void svm_hv_hardware_setup(void)
+{
+ if (npt_enabled &&
+ ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB) {
+ pr_info(KBUILD_MODNAME ": Hyper-V enlightened NPT TLB flush enabled\n");
+ svm_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs;
+ svm_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range;
+ }
+
+ if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) {
+ int cpu;
+
+ pr_info(KBUILD_MODNAME ": Hyper-V Direct TLB Flush enabled\n");
+ for_each_online_cpu(cpu) {
+ struct hv_vp_assist_page *vp_ap =
+ hv_get_vp_assist_page(cpu);
+
+ if (!vp_ap)
+ continue;
+
+ vp_ap->nested_control.features.directhypercall = 1;
+ }
+ svm_x86_ops.enable_l2_tlb_flush =
+ svm_hv_enable_l2_tlb_flush;
+ }
+}
diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h
new file mode 100644
index 000000000000..08f14e6f195c
--- /dev/null
+++ b/arch/x86/kvm/svm/svm_onhyperv.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * KVM L1 hypervisor optimizations on Hyper-V for SVM.
+ */
+
+#ifndef __ARCH_X86_KVM_SVM_ONHYPERV_H__
+#define __ARCH_X86_KVM_SVM_ONHYPERV_H__
+
+#include <asm/mshyperv.h>
+
+#if IS_ENABLED(CONFIG_HYPERV)
+
+#include "kvm_onhyperv.h"
+#include "svm/hyperv.h"
+
+__init void svm_hv_hardware_setup(void);
+
+static inline bool svm_hv_is_enlightened_tlb_enabled(struct kvm_vcpu *vcpu)
+{
+ struct hv_vmcb_enlightenments *hve = &to_svm(vcpu)->vmcb->control.hv_enlightenments;
+
+ return ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB &&
+ !!hve->hv_enlightenments_control.enlightened_npt_tlb;
+}
+
+static inline void svm_hv_init_vmcb(struct vmcb *vmcb)
+{
+ struct hv_vmcb_enlightenments *hve = &vmcb->control.hv_enlightenments;
+
+ BUILD_BUG_ON(sizeof(vmcb->control.hv_enlightenments) !=
+ sizeof(vmcb->control.reserved_sw));
+
+ if (npt_enabled &&
+ ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB)
+ hve->hv_enlightenments_control.enlightened_npt_tlb = 1;
+
+ if (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)
+ hve->hv_enlightenments_control.msr_bitmap = 1;
+}
+
+static inline void svm_hv_vmcb_dirty_nested_enlightenments(
+ struct kvm_vcpu *vcpu)
+{
+ struct vmcb *vmcb = to_svm(vcpu)->vmcb;
+ struct hv_vmcb_enlightenments *hve = &vmcb->control.hv_enlightenments;
+
+ if (hve->hv_enlightenments_control.msr_bitmap)
+ vmcb_mark_dirty(vmcb, HV_VMCB_NESTED_ENLIGHTENMENTS);
+}
+
+static inline void svm_hv_update_vp_id(struct vmcb *vmcb, struct kvm_vcpu *vcpu)
+{
+ struct hv_vmcb_enlightenments *hve = &vmcb->control.hv_enlightenments;
+ u32 vp_index = kvm_hv_get_vpindex(vcpu);
+
+ if (hve->hv_vp_id != vp_index) {
+ hve->hv_vp_id = vp_index;
+ vmcb_mark_dirty(vmcb, HV_VMCB_NESTED_ENLIGHTENMENTS);
+ }
+}
+#else
+
+static inline bool svm_hv_is_enlightened_tlb_enabled(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
+
+static inline void svm_hv_init_vmcb(struct vmcb *vmcb)
+{
+}
+
+static inline __init void svm_hv_hardware_setup(void)
+{
+}
+
+static inline void svm_hv_vmcb_dirty_nested_enlightenments(
+ struct kvm_vcpu *vcpu)
+{
+}
+
+static inline void svm_hv_update_vp_id(struct vmcb *vmcb,
+ struct kvm_vcpu *vcpu)
+{
+}
+#endif /* CONFIG_HYPERV */
+
+#endif /* __ARCH_X86_KVM_SVM_ONHYPERV_H__ */
diff --git a/arch/x86/kvm/svm/svm_ops.h b/arch/x86/kvm/svm/svm_ops.h
new file mode 100644
index 000000000000..4e725854c63a
--- /dev/null
+++ b/arch/x86/kvm/svm/svm_ops.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_SVM_OPS_H
+#define __KVM_X86_SVM_OPS_H
+
+#include <linux/compiler_types.h>
+
+#include "x86.h"
+
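+/*
+ * Wrappers for SVM instructions (CLGI, STGI, INVLPGA, VMSAVE).  Each wrapper
+ * attaches an exception table entry to the instruction so that an unexpected
+ * fault is redirected to the 'fault' label and reported via
+ * kvm_spurious_fault() instead of crashing the host.
+ */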
+#define svm_asm(insn, clobber...) \
+do { \
+ asm goto("1: " __stringify(insn) "\n\t" \
+ _ASM_EXTABLE(1b, %l[fault]) \
+ ::: clobber : fault); \
+ return; \
+fault: \
+ kvm_spurious_fault(); \
+} while (0)
+
+#define svm_asm1(insn, op1, clobber...) \
+do { \
+ asm goto("1: " __stringify(insn) " %0\n\t" \
+ _ASM_EXTABLE(1b, %l[fault]) \
+ :: op1 : clobber : fault); \
+ return; \
+fault: \
+ kvm_spurious_fault(); \
+} while (0)
+
+#define svm_asm2(insn, op1, op2, clobber...) \
+do { \
+ asm goto("1: " __stringify(insn) " %1, %0\n\t" \
+ _ASM_EXTABLE(1b, %l[fault]) \
+ :: op1, op2 : clobber : fault); \
+ return; \
+fault: \
+ kvm_spurious_fault(); \
+} while (0)
+
+static inline void clgi(void)
+{
+ svm_asm(clgi);
+}
+
+static inline void stgi(void)
+{
+ svm_asm(stgi);
+}
+
+static inline void invlpga(unsigned long addr, u32 asid)
+{
+ svm_asm2(invlpga, "c"(asid), "a"(addr));
+}
+
+/*
+ * Despite being a physical address, the portion of rAX that is consumed by
+ * VMSAVE, VMLOAD, etc... is still controlled by the effective address size,
+ * hence 'unsigned long' instead of 'hpa_t'.
+ */
+static __always_inline void vmsave(unsigned long pa)
+{
+ svm_asm1(vmsave, "a" (pa), "memory");
+}
+
+#endif /* __KVM_X86_SVM_OPS_H */
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
new file mode 100644
index 000000000000..3392bcadfb89
--- /dev/null
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -0,0 +1,404 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <asm/asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/bitsperlong.h>
+#include <asm/frame.h>
+#include <asm/kvm_vcpu_regs.h>
+#include <asm/nospec-branch.h>
+#include "kvm-asm-offsets.h"
+
+#define WORD_SIZE (BITS_PER_LONG / 8)
+
+/* Intentionally omit RAX as it's context switched by hardware */
+#define VCPU_RCX (SVM_vcpu_arch_regs + __VCPU_REGS_RCX * WORD_SIZE)
+#define VCPU_RDX (SVM_vcpu_arch_regs + __VCPU_REGS_RDX * WORD_SIZE)
+#define VCPU_RBX (SVM_vcpu_arch_regs + __VCPU_REGS_RBX * WORD_SIZE)
+/* Intentionally omit RSP as it's context switched by hardware */
+#define VCPU_RBP (SVM_vcpu_arch_regs + __VCPU_REGS_RBP * WORD_SIZE)
+#define VCPU_RSI (SVM_vcpu_arch_regs + __VCPU_REGS_RSI * WORD_SIZE)
+#define VCPU_RDI (SVM_vcpu_arch_regs + __VCPU_REGS_RDI * WORD_SIZE)
+
+#ifdef CONFIG_X86_64
+#define VCPU_R8 (SVM_vcpu_arch_regs + __VCPU_REGS_R8 * WORD_SIZE)
+#define VCPU_R9 (SVM_vcpu_arch_regs + __VCPU_REGS_R9 * WORD_SIZE)
+#define VCPU_R10 (SVM_vcpu_arch_regs + __VCPU_REGS_R10 * WORD_SIZE)
+#define VCPU_R11 (SVM_vcpu_arch_regs + __VCPU_REGS_R11 * WORD_SIZE)
+#define VCPU_R12 (SVM_vcpu_arch_regs + __VCPU_REGS_R12 * WORD_SIZE)
+#define VCPU_R13 (SVM_vcpu_arch_regs + __VCPU_REGS_R13 * WORD_SIZE)
+#define VCPU_R14 (SVM_vcpu_arch_regs + __VCPU_REGS_R14 * WORD_SIZE)
+#define VCPU_R15 (SVM_vcpu_arch_regs + __VCPU_REGS_R15 * WORD_SIZE)
+#endif
+
+#define SVM_vmcb01_pa (SVM_vmcb01 + KVM_VMCB_pa)
+
+.section .noinstr.text, "ax"
+
+.macro RESTORE_GUEST_SPEC_CTRL
+ /* No need to do anything if SPEC_CTRL is unset or V_SPEC_CTRL is set */
+ ALTERNATIVE_2 "", \
+ "jmp 800f", X86_FEATURE_MSR_SPEC_CTRL, \
+ "", X86_FEATURE_V_SPEC_CTRL
+801:
+.endm
+.macro RESTORE_GUEST_SPEC_CTRL_BODY
+800:
+ /*
+ * SPEC_CTRL handling: if the guest's SPEC_CTRL value differs from the
+ * host's, write the MSR. This is kept out-of-line so that the common
+ * case does not have to jump.
+ *
+ * IMPORTANT: To avoid RSB underflow attacks and any other nastiness,
+ * there must not be any returns or indirect branches between this code
+ * and vmentry.
+ */
+#ifdef CONFIG_X86_64
+ mov SVM_spec_ctrl(%rdi), %rdx
+ cmp PER_CPU_VAR(x86_spec_ctrl_current), %rdx
+ je 801b
+ movl %edx, %eax
+ shr $32, %rdx
+#else
+ mov SVM_spec_ctrl(%edi), %eax
+ mov PER_CPU_VAR(x86_spec_ctrl_current), %ecx
+ xor %eax, %ecx
+ mov SVM_spec_ctrl + 4(%edi), %edx
+ mov PER_CPU_VAR(x86_spec_ctrl_current + 4), %esi
+ xor %edx, %esi
+ or %esi, %ecx
+ je 801b
+#endif
+ mov $MSR_IA32_SPEC_CTRL, %ecx
+ wrmsr
+ jmp 801b
+.endm
+
+.macro RESTORE_HOST_SPEC_CTRL
+ /* No need to do anything if SPEC_CTRL is unset or V_SPEC_CTRL is set */
+ ALTERNATIVE_2 "", \
+ "jmp 900f", X86_FEATURE_MSR_SPEC_CTRL, \
+ "", X86_FEATURE_V_SPEC_CTRL
+901:
+.endm
+.macro RESTORE_HOST_SPEC_CTRL_BODY spec_ctrl_intercepted:req
+900:
+ /* Same for after vmexit. */
+ mov $MSR_IA32_SPEC_CTRL, %ecx
+
+ /*
+ * Load the value that the guest had written into MSR_IA32_SPEC_CTRL,
+ * if it was not intercepted during guest execution.
+ */
+ cmpb $0, \spec_ctrl_intercepted
+ jnz 998f
+ rdmsr
+ movl %eax, SVM_spec_ctrl(%_ASM_DI)
+ movl %edx, SVM_spec_ctrl + 4(%_ASM_DI)
+998:
+ /* Now restore the host value of the MSR if different from the guest's. */
+#ifdef CONFIG_X86_64
+ mov PER_CPU_VAR(x86_spec_ctrl_current), %rdx
+ cmp SVM_spec_ctrl(%rdi), %rdx
+ je 901b
+ movl %edx, %eax
+ shr $32, %rdx
+#else
+ mov PER_CPU_VAR(x86_spec_ctrl_current), %eax
+ mov SVM_spec_ctrl(%edi), %esi
+ xor %eax, %esi
+ mov PER_CPU_VAR(x86_spec_ctrl_current + 4), %edx
+ mov SVM_spec_ctrl + 4(%edi), %edi
+ xor %edx, %edi
+ or %edi, %esi
+ je 901b
+#endif
+ wrmsr
+ jmp 901b
+.endm
+
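+/* Clear CPU buffers (VERW) before VM-Enter if the CPU requires the mitigation. */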
+#define SVM_CLEAR_CPU_BUFFERS \
+ ALTERNATIVE "", __CLEAR_CPU_BUFFERS, X86_FEATURE_CLEAR_CPU_BUF_VM
+
+/**
+ * __svm_vcpu_run - Run a vCPU via a transition to SVM guest mode
+ * @svm: struct vcpu_svm *
+ * @spec_ctrl_intercepted: bool
+ */
+SYM_FUNC_START(__svm_vcpu_run)
+ push %_ASM_BP
+ mov %_ASM_SP, %_ASM_BP
+#ifdef CONFIG_X86_64
+ push %r15
+ push %r14
+ push %r13
+ push %r12
+#else
+ push %edi
+ push %esi
+#endif
+ push %_ASM_BX
+
+ /*
+ * Save variables needed after vmexit on the stack, in inverse
+ * order compared to when they are needed.
+ */
+
+ /* Accessed directly from the stack in RESTORE_HOST_SPEC_CTRL. */
+ push %_ASM_ARG2
+
+ /* Needed to restore access to percpu variables. */
+ __ASM_SIZE(push) PER_CPU_VAR(svm_data + SD_save_area_pa)
+
+ /* Finally save @svm. */
+ push %_ASM_ARG1
+
+.ifnc _ASM_ARG1, _ASM_DI
+ /*
+ * Stash @svm in RDI early. On 32-bit, arguments are in RAX, RCX
+ * and RDX which are clobbered by RESTORE_GUEST_SPEC_CTRL.
+ */
+ mov %_ASM_ARG1, %_ASM_DI
+.endif
+
+ /* Clobbers RAX, RCX, RDX (and ESI on 32-bit), consumes RDI (@svm). */
+ RESTORE_GUEST_SPEC_CTRL
+
+ /*
+ * Use a single vmcb (vmcb01 because it's always valid) for
+ * context switching guest state via VMLOAD/VMSAVE, that way
+ * the state doesn't need to be copied between vmcb01 and
+ * vmcb02 when switching vmcbs for nested virtualization.
+ */
+ mov SVM_vmcb01_pa(%_ASM_DI), %_ASM_AX
+1: vmload %_ASM_AX
+2:
+
+ /* Get svm->current_vmcb->pa into RAX. */
+ mov SVM_current_vmcb(%_ASM_DI), %_ASM_AX
+ mov KVM_VMCB_pa(%_ASM_AX), %_ASM_AX
+
+ /* Load guest registers. */
+ mov VCPU_RCX(%_ASM_DI), %_ASM_CX
+ mov VCPU_RDX(%_ASM_DI), %_ASM_DX
+ mov VCPU_RBX(%_ASM_DI), %_ASM_BX
+ mov VCPU_RBP(%_ASM_DI), %_ASM_BP
+ mov VCPU_RSI(%_ASM_DI), %_ASM_SI
+#ifdef CONFIG_X86_64
+ mov VCPU_R8 (%_ASM_DI), %r8
+ mov VCPU_R9 (%_ASM_DI), %r9
+ mov VCPU_R10(%_ASM_DI), %r10
+ mov VCPU_R11(%_ASM_DI), %r11
+ mov VCPU_R12(%_ASM_DI), %r12
+ mov VCPU_R13(%_ASM_DI), %r13
+ mov VCPU_R14(%_ASM_DI), %r14
+ mov VCPU_R15(%_ASM_DI), %r15
+#endif
+ mov VCPU_RDI(%_ASM_DI), %_ASM_DI
+
+ /* Clobbers EFLAGS.ZF */
+ SVM_CLEAR_CPU_BUFFERS
+
+ /* Enter guest mode */
+3: vmrun %_ASM_AX
+4:
+ /* Pop @svm to RAX while it's the only available register. */
+ pop %_ASM_AX
+
+ /* Save all guest registers. */
+ mov %_ASM_CX, VCPU_RCX(%_ASM_AX)
+ mov %_ASM_DX, VCPU_RDX(%_ASM_AX)
+ mov %_ASM_BX, VCPU_RBX(%_ASM_AX)
+ mov %_ASM_BP, VCPU_RBP(%_ASM_AX)
+ mov %_ASM_SI, VCPU_RSI(%_ASM_AX)
+ mov %_ASM_DI, VCPU_RDI(%_ASM_AX)
+#ifdef CONFIG_X86_64
+ mov %r8, VCPU_R8 (%_ASM_AX)
+ mov %r9, VCPU_R9 (%_ASM_AX)
+ mov %r10, VCPU_R10(%_ASM_AX)
+ mov %r11, VCPU_R11(%_ASM_AX)
+ mov %r12, VCPU_R12(%_ASM_AX)
+ mov %r13, VCPU_R13(%_ASM_AX)
+ mov %r14, VCPU_R14(%_ASM_AX)
+ mov %r15, VCPU_R15(%_ASM_AX)
+#endif
+
+ /* @svm can stay in RDI from now on. */
+ mov %_ASM_AX, %_ASM_DI
+
+ mov SVM_vmcb01_pa(%_ASM_DI), %_ASM_AX
+5: vmsave %_ASM_AX
+6:
+
+ /* Restores GSBASE among other things, allowing access to percpu data. */
+ pop %_ASM_AX
+7: vmload %_ASM_AX
+8:
+
+ /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
+ FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT
+
+ /*
+ * Clobbers RAX, RCX, RDX (and ESI, EDI on 32-bit), consumes RDI (@svm)
+ * and RSP (pointer to @spec_ctrl_intercepted).
+ */
+ RESTORE_HOST_SPEC_CTRL
+
+ /*
+ * Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be
+ * untrained as soon as we exit the VM and are back to the
+ * kernel. This should be done before re-enabling interrupts
+ * because interrupt handlers won't sanitize 'ret' if the return is
+ * from the kernel.
+ */
+ UNTRAIN_RET_VM
+
+ /*
+ * Clear all general purpose registers except RSP and RAX to prevent
+ * speculative use of the guest's values, even those that are reloaded
+ * via the stack. In theory, an L1 cache miss when restoring registers
+ * could lead to speculative execution with the guest's values.
+ * Zeroing XORs are dirt cheap, i.e. the extra paranoia is essentially
+ * free. RSP and RAX are exempt as they are restored by hardware
+ * during VM-Exit.
+ */
+ xor %ecx, %ecx
+ xor %edx, %edx
+ xor %ebx, %ebx
+ xor %ebp, %ebp
+ xor %esi, %esi
+ xor %edi, %edi
+#ifdef CONFIG_X86_64
+ xor %r8d, %r8d
+ xor %r9d, %r9d
+ xor %r10d, %r10d
+ xor %r11d, %r11d
+ xor %r12d, %r12d
+ xor %r13d, %r13d
+ xor %r14d, %r14d
+ xor %r15d, %r15d
+#endif
+
+ /* "Pop" @spec_ctrl_intercepted. */
+ pop %_ASM_BX
+
+ pop %_ASM_BX
+
+#ifdef CONFIG_X86_64
+ pop %r12
+ pop %r13
+ pop %r14
+ pop %r15
+#else
+ pop %esi
+ pop %edi
+#endif
+ pop %_ASM_BP
+ RET
+
+ RESTORE_GUEST_SPEC_CTRL_BODY
+ RESTORE_HOST_SPEC_CTRL_BODY (%_ASM_SP)
+
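+ /*
+ * Fault handlers for VMLOAD (1), VMRUN (3), VMSAVE (5) and VMLOAD (7)
+ * above: if KVM is being torn down (kvm_rebooting), skip the faulting
+ * instruction and continue, otherwise the fault is fatal (ud2).
+ */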
+10: cmpb $0, _ASM_RIP(kvm_rebooting)
+ jne 2b
+ ud2
+30: cmpb $0, _ASM_RIP(kvm_rebooting)
+ jne 4b
+ ud2
+50: cmpb $0, _ASM_RIP(kvm_rebooting)
+ jne 6b
+ ud2
+70: cmpb $0, _ASM_RIP(kvm_rebooting)
+ jne 8b
+ ud2
+
+ _ASM_EXTABLE(1b, 10b)
+ _ASM_EXTABLE(3b, 30b)
+ _ASM_EXTABLE(5b, 50b)
+ _ASM_EXTABLE(7b, 70b)
+
+SYM_FUNC_END(__svm_vcpu_run)
+
+#ifdef CONFIG_KVM_AMD_SEV
+
+
+#ifdef CONFIG_X86_64
+#define SEV_ES_GPRS_BASE 0x300
+#define SEV_ES_RBX (SEV_ES_GPRS_BASE + __VCPU_REGS_RBX * WORD_SIZE)
+#define SEV_ES_RBP (SEV_ES_GPRS_BASE + __VCPU_REGS_RBP * WORD_SIZE)
+#define SEV_ES_RSI (SEV_ES_GPRS_BASE + __VCPU_REGS_RSI * WORD_SIZE)
+#define SEV_ES_RDI (SEV_ES_GPRS_BASE + __VCPU_REGS_RDI * WORD_SIZE)
+#define SEV_ES_R12 (SEV_ES_GPRS_BASE + __VCPU_REGS_R12 * WORD_SIZE)
+#define SEV_ES_R13 (SEV_ES_GPRS_BASE + __VCPU_REGS_R13 * WORD_SIZE)
+#define SEV_ES_R14 (SEV_ES_GPRS_BASE + __VCPU_REGS_R14 * WORD_SIZE)
+#define SEV_ES_R15 (SEV_ES_GPRS_BASE + __VCPU_REGS_R15 * WORD_SIZE)
+#endif
+
+/**
+ * __svm_sev_es_vcpu_run - Run a SEV-ES vCPU via a transition to SVM guest mode
+ * @svm: struct vcpu_svm *
+ * @spec_ctrl_intercepted: bool
+ */
+SYM_FUNC_START(__svm_sev_es_vcpu_run)
+ FRAME_BEGIN
+
+ /*
+ * Save non-volatile (callee-saved) registers to the host save area.
+ * Except for RAX and RSP, all GPRs are restored on #VMEXIT, but not
+ * saved on VMRUN.
+ */
+ mov %rbp, SEV_ES_RBP (%rdx)
+ mov %r15, SEV_ES_R15 (%rdx)
+ mov %r14, SEV_ES_R14 (%rdx)
+ mov %r13, SEV_ES_R13 (%rdx)
+ mov %r12, SEV_ES_R12 (%rdx)
+ mov %rbx, SEV_ES_RBX (%rdx)
+
+ /*
+ * Save volatile registers that hold arguments that are needed after
+ * #VMEXIT (RDI=@svm and RSI=@spec_ctrl_intercepted).
+ */
+ mov %rdi, SEV_ES_RDI (%rdx)
+ mov %rsi, SEV_ES_RSI (%rdx)
+
+ /* Clobbers RAX, RCX, and RDX (@hostsa), consumes RDI (@svm). */
+ RESTORE_GUEST_SPEC_CTRL
+
+ /* Get svm->current_vmcb->pa into RAX. */
+ mov SVM_current_vmcb(%rdi), %rax
+ mov KVM_VMCB_pa(%rax), %rax
+
+ /* Clobbers EFLAGS.ZF */
+ SVM_CLEAR_CPU_BUFFERS
+
+ /* Enter guest mode */
+1: vmrun %rax
+2:
+ /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
+ FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT
+
+ /* Clobbers RAX, RCX, RDX, consumes RDI (@svm) and RSI (@spec_ctrl_intercepted). */
+ RESTORE_HOST_SPEC_CTRL
+
+ /*
+ * Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be
+ * untrained as soon as we exit the VM and are back to the
+ * kernel. This should be done before re-enabling interrupts
+ * because interrupt handlers won't sanitize RET if the return is
+ * from the kernel.
+ */
+ UNTRAIN_RET_VM
+
+ FRAME_END
+ RET
+
+ RESTORE_GUEST_SPEC_CTRL_BODY
+ RESTORE_HOST_SPEC_CTRL_BODY %sil
+
+3: cmpb $0, kvm_rebooting(%rip)
+ jne 2b
+ ud2
+
+ _ASM_EXTABLE(1b, 3b)
+
+SYM_FUNC_END(__svm_sev_es_vcpu_run)
+#endif /* CONFIG_KVM_AMD_SEV */
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
deleted file mode 100644
index 86dbac072d0c..000000000000
--- a/arch/x86/kvm/timer.c
+++ /dev/null
@@ -1,46 +0,0 @@
-#include <linux/kvm_host.h>
-#include <linux/kvm.h>
-#include <linux/hrtimer.h>
-#include <asm/atomic.h>
-#include "kvm_timer.h"
-
-static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
-{
- int restart_timer = 0;
- wait_queue_head_t *q = &vcpu->wq;
-
- /* FIXME: this code should not know anything about vcpus */
- if (!atomic_inc_and_test(&ktimer->pending))
- set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
-
- if (!ktimer->reinject)
- atomic_set(&ktimer->pending, 1);
-
- if (waitqueue_active(q))
- wake_up_interruptible(q);
-
- if (ktimer->t_ops->is_periodic(ktimer)) {
- hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
- restart_timer = 1;
- }
-
- return restart_timer;
-}
-
-enum hrtimer_restart kvm_timer_fn(struct hrtimer *data)
-{
- int restart_timer;
- struct kvm_vcpu *vcpu;
- struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
-
- vcpu = ktimer->kvm->vcpus[ktimer->vcpu_id];
- if (!vcpu)
- return HRTIMER_NORESTART;
-
- restart_timer = __kvm_timer_fn(vcpu, ktimer);
- if (restart_timer)
- return HRTIMER_RESTART;
- else
- return HRTIMER_NORESTART;
-}
-
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
new file mode 100644
index 000000000000..e79bc9cb7162
--- /dev/null
+++ b/arch/x86/kvm/trace.h
@@ -0,0 +1,1976 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KVM_H
+
+#include <linux/tracepoint.h>
+#include <asm/vmx.h>
+#include <asm/svm.h>
+#include <asm/clocksource.h>
+#include <asm/pvclock-abi.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvm
+
+#ifdef CREATE_TRACE_POINTS
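+/* RIP can't be read for guests with protected state (e.g. SEV-ES); report 0. */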
+#define tracing_kvm_rip_read(vcpu) ({ \
+ typeof(vcpu) __vcpu = vcpu; \
+ __vcpu->arch.guest_state_protected ? 0 : kvm_rip_read(__vcpu); \
+ })
+#endif
+
+/*
+ * Tracepoint for guest mode entry.
+ */
+TRACE_EVENT(kvm_entry,
+ TP_PROTO(struct kvm_vcpu *vcpu, bool force_immediate_exit),
+ TP_ARGS(vcpu, force_immediate_exit),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( unsigned long, rip )
+ __field( bool, immediate_exit )
+ __field( u32, intr_info )
+ __field( u32, error_code )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu->vcpu_id;
+ __entry->rip = tracing_kvm_rip_read(vcpu);
+ __entry->immediate_exit = force_immediate_exit;
+
+ kvm_x86_call(get_entry_info)(vcpu, &__entry->intr_info,
+ &__entry->error_code);
+ ),
+
+ TP_printk("vcpu %u, rip 0x%lx intr_info 0x%08x error_code 0x%08x%s",
+ __entry->vcpu_id, __entry->rip,
+ __entry->intr_info, __entry->error_code,
+ __entry->immediate_exit ? " [immediate exit]" : "")
+);
+
+/*
+ * Tracepoint for hypercall.
+ */
+TRACE_EVENT(kvm_hypercall,
+ TP_PROTO(unsigned long nr, unsigned long a0, unsigned long a1,
+ unsigned long a2, unsigned long a3),
+ TP_ARGS(nr, a0, a1, a2, a3),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, nr )
+ __field( unsigned long, a0 )
+ __field( unsigned long, a1 )
+ __field( unsigned long, a2 )
+ __field( unsigned long, a3 )
+ ),
+
+ TP_fast_assign(
+ __entry->nr = nr;
+ __entry->a0 = a0;
+ __entry->a1 = a1;
+ __entry->a2 = a2;
+ __entry->a3 = a3;
+ ),
+
+ TP_printk("nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx",
+ __entry->nr, __entry->a0, __entry->a1, __entry->a2,
+ __entry->a3)
+);
+
+/*
+ * Tracepoint for Hyper-V hypercall.
+ */
+TRACE_EVENT(kvm_hv_hypercall,
+ TP_PROTO(__u16 code, bool fast, __u16 var_cnt, __u16 rep_cnt,
+ __u16 rep_idx, __u64 ingpa, __u64 outgpa),
+ TP_ARGS(code, fast, var_cnt, rep_cnt, rep_idx, ingpa, outgpa),
+
+ TP_STRUCT__entry(
+ __field( __u16, rep_cnt )
+ __field( __u16, rep_idx )
+ __field( __u64, ingpa )
+ __field( __u64, outgpa )
+ __field( __u16, code )
+ __field( __u16, var_cnt )
+ __field( bool, fast )
+ ),
+
+ TP_fast_assign(
+ __entry->rep_cnt = rep_cnt;
+ __entry->rep_idx = rep_idx;
+ __entry->ingpa = ingpa;
+ __entry->outgpa = outgpa;
+ __entry->code = code;
+ __entry->var_cnt = var_cnt;
+ __entry->fast = fast;
+ ),
+
+ TP_printk("code 0x%x %s var_cnt 0x%x rep_cnt 0x%x idx 0x%x in 0x%llx out 0x%llx",
+ __entry->code, __entry->fast ? "fast" : "slow",
+ __entry->var_cnt, __entry->rep_cnt, __entry->rep_idx,
+ __entry->ingpa, __entry->outgpa)
+);
+
+TRACE_EVENT(kvm_hv_hypercall_done,
+ TP_PROTO(u64 result),
+ TP_ARGS(result),
+
+ TP_STRUCT__entry(
+ __field(__u64, result)
+ ),
+
+ TP_fast_assign(
+ __entry->result = result;
+ ),
+
+ TP_printk("result 0x%llx", __entry->result)
+);
+
+/*
+ * Tracepoint for Xen hypercall.
+ */
+TRACE_EVENT(kvm_xen_hypercall,
+ TP_PROTO(u8 cpl, unsigned long nr,
+ unsigned long a0, unsigned long a1, unsigned long a2,
+ unsigned long a3, unsigned long a4, unsigned long a5),
+ TP_ARGS(cpl, nr, a0, a1, a2, a3, a4, a5),
+
+ TP_STRUCT__entry(
+ __field(u8, cpl)
+ __field(unsigned long, nr)
+ __field(unsigned long, a0)
+ __field(unsigned long, a1)
+ __field(unsigned long, a2)
+ __field(unsigned long, a3)
+ __field(unsigned long, a4)
+ __field(unsigned long, a5)
+ ),
+
+ TP_fast_assign(
+ __entry->cpl = cpl;
+ __entry->nr = nr;
+ __entry->a0 = a0;
+ __entry->a1 = a1;
+ __entry->a2 = a2;
+ __entry->a3 = a3;
+ __entry->a4 = a4;
+ __entry->a5 = a5;
+ ),
+
+ TP_printk("cpl %d nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx a4 0x%lx a5 %lx",
+ __entry->cpl, __entry->nr,
+ __entry->a0, __entry->a1, __entry->a2,
+ __entry->a3, __entry->a4, __entry->a5)
+);
+
+
+
+/*
+ * Tracepoint for PIO.
+ */
+
+#define KVM_PIO_IN 0
+#define KVM_PIO_OUT 1
+
+TRACE_EVENT(kvm_pio,
+ TP_PROTO(unsigned int rw, unsigned int port, unsigned int size,
+ unsigned int count, const void *data),
+ TP_ARGS(rw, port, size, count, data),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, rw )
+ __field( unsigned int, port )
+ __field( unsigned int, size )
+ __field( unsigned int, count )
+ __field( unsigned int, val )
+ ),
+
+ TP_fast_assign(
+ __entry->rw = rw;
+ __entry->port = port;
+ __entry->size = size;
+ __entry->count = count;
+ if (size == 1)
+ __entry->val = *(unsigned char *)data;
+ else if (size == 2)
+ __entry->val = *(unsigned short *)data;
+ else
+ __entry->val = *(unsigned int *)data;
+ ),
+
+ TP_printk("pio_%s at 0x%x size %d count %d val 0x%x %s",
+ __entry->rw ? "write" : "read",
+ __entry->port, __entry->size, __entry->count, __entry->val,
+ __entry->count > 1 ? "(...)" : "")
+);
+
+/*
+ * Tracepoint for fast mmio.
+ */
+TRACE_EVENT(kvm_fast_mmio,
+ TP_PROTO(u64 gpa),
+ TP_ARGS(gpa),
+
+ TP_STRUCT__entry(
+ __field(u64, gpa)
+ ),
+
+ TP_fast_assign(
+ __entry->gpa = gpa;
+ ),
+
+ TP_printk("fast mmio at gpa 0x%llx", __entry->gpa)
+);
+
+/*
+ * Tracepoint for cpuid.
+ */
+TRACE_EVENT(kvm_cpuid,
+ TP_PROTO(unsigned int function, unsigned int index, unsigned long rax,
+ unsigned long rbx, unsigned long rcx, unsigned long rdx,
+ bool found, bool used_max_basic),
+ TP_ARGS(function, index, rax, rbx, rcx, rdx, found, used_max_basic),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, function )
+ __field( unsigned int, index )
+ __field( unsigned long, rax )
+ __field( unsigned long, rbx )
+ __field( unsigned long, rcx )
+ __field( unsigned long, rdx )
+ __field( bool, found )
+ __field( bool, used_max_basic )
+ ),
+
+ TP_fast_assign(
+ __entry->function = function;
+ __entry->index = index;
+ __entry->rax = rax;
+ __entry->rbx = rbx;
+ __entry->rcx = rcx;
+ __entry->rdx = rdx;
+ __entry->found = found;
+ __entry->used_max_basic = used_max_basic;
+ ),
+
+ TP_printk("func %x idx %x rax %lx rbx %lx rcx %lx rdx %lx, cpuid entry %s%s",
+ __entry->function, __entry->index, __entry->rax,
+ __entry->rbx, __entry->rcx, __entry->rdx,
+ __entry->found ? "found" : "not found",
+ __entry->used_max_basic ? ", used max basic" : "")
+);
+
+#define kvm_deliver_mode \
+ {0x0, "Fixed"}, \
+ {0x1, "LowPrio"}, \
+ {0x2, "SMI"}, \
+ {0x3, "Res3"}, \
+ {0x4, "NMI"}, \
+ {0x5, "INIT"}, \
+ {0x6, "SIPI"}, \
+ {0x7, "ExtINT"}
+
+#ifdef CONFIG_KVM_IOAPIC
+TRACE_EVENT(kvm_ioapic_set_irq,
+ TP_PROTO(__u64 e, int pin, bool coalesced),
+ TP_ARGS(e, pin, coalesced),
+
+ TP_STRUCT__entry(
+ __field( __u64, e )
+ __field( int, pin )
+ __field( bool, coalesced )
+ ),
+
+ TP_fast_assign(
+ __entry->e = e;
+ __entry->pin = pin;
+ __entry->coalesced = coalesced;
+ ),
+
+ TP_printk("pin %u dst %x vec %u (%s|%s|%s%s)%s",
+ __entry->pin, (u8)(__entry->e >> 56), (u8)__entry->e,
+ __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
+ (__entry->e & (1<<11)) ? "logical" : "physical",
+ (__entry->e & (1<<15)) ? "level" : "edge",
+ (__entry->e & (1<<16)) ? "|masked" : "",
+ __entry->coalesced ? " (coalesced)" : "")
+);
+
+TRACE_EVENT(kvm_ioapic_delayed_eoi_inj,
+ TP_PROTO(__u64 e),
+ TP_ARGS(e),
+
+ TP_STRUCT__entry(
+ __field( __u64, e )
+ ),
+
+ TP_fast_assign(
+ __entry->e = e;
+ ),
+
+ TP_printk("dst %x vec %u (%s|%s|%s%s)",
+ (u8)(__entry->e >> 56), (u8)__entry->e,
+ __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
+ (__entry->e & (1<<11)) ? "logical" : "physical",
+ (__entry->e & (1<<15)) ? "level" : "edge",
+ (__entry->e & (1<<16)) ? "|masked" : "")
+);
+#endif
+
+TRACE_EVENT(kvm_msi_set_irq,
+ TP_PROTO(__u64 address, __u64 data),
+ TP_ARGS(address, data),
+
+ TP_STRUCT__entry(
+ __field( __u64, address )
+ __field( __u64, data )
+ ),
+
+ TP_fast_assign(
+ __entry->address = address;
+ __entry->data = data;
+ ),
+
+ TP_printk("dst %llx vec %u (%s|%s|%s%s)",
+ (u8)(__entry->address >> 12) | ((__entry->address >> 32) & 0xffffff00),
+ (u8)__entry->data,
+ __print_symbolic((__entry->data >> 8 & 0x7), kvm_deliver_mode),
+ (__entry->address & (1<<2)) ? "logical" : "physical",
+ (__entry->data & (1<<15)) ? "level" : "edge",
+ (__entry->address & (1<<3)) ? "|rh" : "")
+);
+
+#define AREG(x) { APIC_##x, "APIC_" #x }
+
+#define kvm_trace_symbol_apic \
+ AREG(ID), AREG(LVR), AREG(TASKPRI), AREG(ARBPRI), AREG(PROCPRI), \
+ AREG(EOI), AREG(RRR), AREG(LDR), AREG(DFR), AREG(SPIV), AREG(ISR), \
+ AREG(TMR), AREG(IRR), AREG(ESR), AREG(ICR), AREG(ICR2), AREG(LVTT), \
+ AREG(LVTTHMR), AREG(LVTPC), AREG(LVT0), AREG(LVT1), AREG(LVTERR), \
+ AREG(TMICT), AREG(TMCCT), AREG(TDCR), AREG(SELF_IPI), AREG(EFEAT), \
+ AREG(ECTRL)
+/*
+ * Tracepoint for apic access.
+ */
+TRACE_EVENT(kvm_apic,
+ TP_PROTO(unsigned int rw, unsigned int reg, u64 val),
+ TP_ARGS(rw, reg, val),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, rw )
+ __field( unsigned int, reg )
+ __field( u64, val )
+ ),
+
+ TP_fast_assign(
+ __entry->rw = rw;
+ __entry->reg = reg;
+ __entry->val = val;
+ ),
+
+ TP_printk("apic_%s %s = 0x%llx",
+ __entry->rw ? "write" : "read",
+ __print_symbolic(__entry->reg, kvm_trace_symbol_apic),
+ __entry->val)
+);
+
+#define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val)
+#define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val)
+
+#define KVM_ISA_VMX 1
+#define KVM_ISA_SVM 2
+
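+/*
+ * Decode an exit reason for either ISA.  VMX keeps the basic exit reason in
+ * the low 16 bits and flag bits in the upper 16 bits, which are printed
+ * separately; SVM exit codes are printed as-is.
+ */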
+#define kvm_print_exit_reason(exit_reason, isa) \
+ (isa == KVM_ISA_VMX) ? \
+ __print_symbolic(exit_reason & 0xffff, VMX_EXIT_REASONS) : \
+ __print_symbolic(exit_reason, SVM_EXIT_REASONS), \
+ (isa == KVM_ISA_VMX && exit_reason & ~0xffff) ? " " : "", \
+ (isa == KVM_ISA_VMX) ? \
+ __print_flags(exit_reason & ~0xffff, " ", VMX_EXIT_REASON_FLAGS) : ""
+
+#define TRACE_EVENT_KVM_EXIT(name) \
+TRACE_EVENT(name, \
+ TP_PROTO(struct kvm_vcpu *vcpu, u32 isa), \
+ TP_ARGS(vcpu, isa), \
+ \
+ TP_STRUCT__entry( \
+ __field( unsigned int, exit_reason ) \
+ __field( unsigned long, guest_rip ) \
+ __field( u32, isa ) \
+ __field( u64, info1 ) \
+ __field( u64, info2 ) \
+ __field( u32, intr_info ) \
+ __field( u32, error_code ) \
+ __field( unsigned int, vcpu_id ) \
+ __field( u64, requests ) \
+ ), \
+ \
+ TP_fast_assign( \
+ __entry->guest_rip = tracing_kvm_rip_read(vcpu); \
+ __entry->isa = isa; \
+ __entry->vcpu_id = vcpu->vcpu_id; \
+ __entry->requests = READ_ONCE(vcpu->requests); \
+ kvm_x86_call(get_exit_info)(vcpu, \
+ &__entry->exit_reason, \
+ &__entry->info1, \
+ &__entry->info2, \
+ &__entry->intr_info, \
+ &__entry->error_code); \
+ ), \
+ \
+ TP_printk("vcpu %u reason %s%s%s rip 0x%lx info1 0x%016llx " \
+ "info2 0x%016llx intr_info 0x%08x error_code 0x%08x " \
+ "requests 0x%016llx", \
+ __entry->vcpu_id, \
+ kvm_print_exit_reason(__entry->exit_reason, __entry->isa), \
+ __entry->guest_rip, __entry->info1, __entry->info2, \
+ __entry->intr_info, __entry->error_code, \
+ __entry->requests) \
+)
+
+/*
+ * Tracepoint for kvm guest exit:
+ */
+TRACE_EVENT_KVM_EXIT(kvm_exit);
+
+/*
+ * Tracepoint for kvm interrupt injection:
+ */
+TRACE_EVENT(kvm_inj_virq,
+ TP_PROTO(unsigned int vector, bool soft, bool reinjected),
+ TP_ARGS(vector, soft, reinjected),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vector )
+ __field( bool, soft )
+ __field( bool, reinjected )
+ ),
+
+ TP_fast_assign(
+ __entry->vector = vector;
+ __entry->soft = soft;
+ __entry->reinjected = reinjected;
+ ),
+
+ TP_printk("%s 0x%x%s",
+ __entry->soft ? "Soft/INTn" : "IRQ", __entry->vector,
+ __entry->reinjected ? " [reinjected]" : "")
+);
+
+#define EXS(x) { x##_VECTOR, "#" #x }
+
+#define kvm_trace_sym_exc \
+ EXS(DE), EXS(DB), EXS(BP), EXS(OF), EXS(BR), EXS(UD), EXS(NM), \
+ EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF), EXS(MF), \
+ EXS(AC), EXS(MC), EXS(XM), EXS(VE), EXS(CP), \
+ EXS(HV), EXS(VC), EXS(SX)
+
+/*
+ * Tracepoint for kvm exception injection:
+ */
+TRACE_EVENT(kvm_inj_exception,
+ TP_PROTO(unsigned exception, bool has_error, unsigned error_code,
+ bool reinjected),
+ TP_ARGS(exception, has_error, error_code, reinjected),
+
+ TP_STRUCT__entry(
+ __field( u8, exception )
+ __field( u8, has_error )
+ __field( u32, error_code )
+ __field( bool, reinjected )
+ ),
+
+ TP_fast_assign(
+ __entry->exception = exception;
+ __entry->has_error = has_error;
+ __entry->error_code = error_code;
+ __entry->reinjected = reinjected;
+ ),
+
+ TP_printk("%s%s%s%s%s",
+ __print_symbolic(__entry->exception, kvm_trace_sym_exc),
+ !__entry->has_error ? "" : " (",
+ !__entry->has_error ? "" : __print_symbolic(__entry->error_code, { }),
+ !__entry->has_error ? "" : ")",
+ __entry->reinjected ? " [reinjected]" : "")
+);
+
+/*
+ * Tracepoint for page fault.
+ */
+TRACE_EVENT(kvm_page_fault,
+ TP_PROTO(struct kvm_vcpu *vcpu, u64 fault_address, u64 error_code),
+ TP_ARGS(vcpu, fault_address, error_code),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( unsigned long, guest_rip )
+ __field( u64, fault_address )
+ __field( u64, error_code )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu->vcpu_id;
+ __entry->guest_rip = tracing_kvm_rip_read(vcpu);
+ __entry->fault_address = fault_address;
+ __entry->error_code = error_code;
+ ),
+
+ TP_printk("vcpu %u rip 0x%lx address 0x%016llx error_code 0x%llx",
+ __entry->vcpu_id, __entry->guest_rip,
+ __entry->fault_address, __entry->error_code)
+);
+
+/*
+ * Tracepoint for guest MSR access.
+ */
+TRACE_EVENT(kvm_msr,
+ TP_PROTO(unsigned write, u32 ecx, u64 data, bool exception),
+ TP_ARGS(write, ecx, data, exception),
+
+ TP_STRUCT__entry(
+ __field( unsigned, write )
+ __field( u32, ecx )
+ __field( u64, data )
+ __field( u8, exception )
+ ),
+
+ TP_fast_assign(
+ __entry->write = write;
+ __entry->ecx = ecx;
+ __entry->data = data;
+ __entry->exception = exception;
+ ),
+
+ TP_printk("msr_%s %x = 0x%llx%s",
+ __entry->write ? "write" : "read",
+ __entry->ecx, __entry->data,
+ __entry->exception ? " (#GP)" : "")
+);
+
+#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data, false)
+#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data, false)
+#define trace_kvm_msr_read_ex(ecx) trace_kvm_msr(0, ecx, 0, true)
+#define trace_kvm_msr_write_ex(ecx, data) trace_kvm_msr(1, ecx, data, true)
+
+/*
+ * Tracepoint for guest CR access.
+ */
+TRACE_EVENT(kvm_cr,
+ TP_PROTO(unsigned int rw, unsigned int cr, unsigned long val),
+ TP_ARGS(rw, cr, val),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, rw )
+ __field( unsigned int, cr )
+ __field( unsigned long, val )
+ ),
+
+ TP_fast_assign(
+ __entry->rw = rw;
+ __entry->cr = cr;
+ __entry->val = val;
+ ),
+
+ TP_printk("cr_%s %x = 0x%lx",
+ __entry->rw ? "write" : "read",
+ __entry->cr, __entry->val)
+);
+
+#define trace_kvm_cr_read(cr, val) trace_kvm_cr(0, cr, val)
+#define trace_kvm_cr_write(cr, val) trace_kvm_cr(1, cr, val)
+
+TRACE_EVENT(kvm_pic_set_irq,
+ TP_PROTO(__u8 chip, __u8 pin, __u8 elcr, __u8 imr, bool coalesced),
+ TP_ARGS(chip, pin, elcr, imr, coalesced),
+
+ TP_STRUCT__entry(
+ __field( __u8, chip )
+ __field( __u8, pin )
+ __field( __u8, elcr )
+ __field( __u8, imr )
+ __field( bool, coalesced )
+ ),
+
+ TP_fast_assign(
+ __entry->chip = chip;
+ __entry->pin = pin;
+ __entry->elcr = elcr;
+ __entry->imr = imr;
+ __entry->coalesced = coalesced;
+ ),
+
+ TP_printk("chip %u pin %u (%s%s)%s",
+ __entry->chip, __entry->pin,
+ (__entry->elcr & (1 << __entry->pin)) ? "level":"edge",
+ (__entry->imr & (1 << __entry->pin)) ? "|masked":"",
+ __entry->coalesced ? " (coalesced)" : "")
+);
+
+#define kvm_apic_dst_shorthand \
+ {0x0, "dst"}, \
+ {0x1, "self"}, \
+ {0x2, "all"}, \
+ {0x3, "all-but-self"}
+
+TRACE_EVENT(kvm_apic_ipi,
+ TP_PROTO(__u32 icr_low, __u32 dest_id),
+ TP_ARGS(icr_low, dest_id),
+
+ TP_STRUCT__entry(
+ __field( __u32, icr_low )
+ __field( __u32, dest_id )
+ ),
+
+ TP_fast_assign(
+ __entry->icr_low = icr_low;
+ __entry->dest_id = dest_id;
+ ),
+
+ TP_printk("dst %x vec %u (%s|%s|%s|%s|%s)",
+ __entry->dest_id, (u8)__entry->icr_low,
+ __print_symbolic((__entry->icr_low >> 8 & 0x7),
+ kvm_deliver_mode),
+ (__entry->icr_low & (1<<11)) ? "logical" : "physical",
+ (__entry->icr_low & (1<<14)) ? "assert" : "de-assert",
+ (__entry->icr_low & (1<<15)) ? "level" : "edge",
+ __print_symbolic((__entry->icr_low >> 18 & 0x3),
+ kvm_apic_dst_shorthand))
+);
+
+TRACE_EVENT(kvm_apic_accept_irq,
+ TP_PROTO(__u32 apicid, __u16 dm, __u16 tm, __u8 vec),
+ TP_ARGS(apicid, dm, tm, vec),
+
+ TP_STRUCT__entry(
+ __field( __u32, apicid )
+ __field( __u16, dm )
+ __field( __u16, tm )
+ __field( __u8, vec )
+ ),
+
+ TP_fast_assign(
+ __entry->apicid = apicid;
+ __entry->dm = dm;
+ __entry->tm = tm;
+ __entry->vec = vec;
+ ),
+
+ TP_printk("apicid %x vec %u (%s|%s)",
+ __entry->apicid, __entry->vec,
+ __print_symbolic((__entry->dm >> 8 & 0x7), kvm_deliver_mode),
+ __entry->tm ? "level" : "edge")
+);
+
+TRACE_EVENT(kvm_eoi,
+ TP_PROTO(struct kvm_lapic *apic, int vector),
+ TP_ARGS(apic, vector),
+
+ TP_STRUCT__entry(
+ __field( __u32, apicid )
+ __field( int, vector )
+ ),
+
+ TP_fast_assign(
+ __entry->apicid = apic->vcpu->vcpu_id;
+ __entry->vector = vector;
+ ),
+
+ TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector)
+);
+
+TRACE_EVENT(kvm_pv_eoi,
+ TP_PROTO(struct kvm_lapic *apic, int vector),
+ TP_ARGS(apic, vector),
+
+ TP_STRUCT__entry(
+ __field( __u32, apicid )
+ __field( int, vector )
+ ),
+
+ TP_fast_assign(
+ __entry->apicid = apic->vcpu->vcpu_id;
+ __entry->vector = vector;
+ ),
+
+ TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector)
+);
+
+/*
+ * Tracepoint for nested VMRUN
+ */
+TRACE_EVENT(kvm_nested_vmenter,
+ TP_PROTO(__u64 rip, __u64 vmcb, __u64 nested_rip, __u32 int_ctl,
+ __u32 event_inj, bool tdp_enabled, __u64 guest_tdp_pgd,
+ __u64 guest_cr3, __u32 isa),
+ TP_ARGS(rip, vmcb, nested_rip, int_ctl, event_inj, tdp_enabled,
+ guest_tdp_pgd, guest_cr3, isa),
+
+ TP_STRUCT__entry(
+ __field( __u64, rip )
+ __field( __u64, vmcb )
+ __field( __u64, nested_rip )
+ __field( __u32, int_ctl )
+ __field( __u32, event_inj )
+ __field( bool, tdp_enabled )
+ __field( __u64, guest_pgd )
+ __field( __u32, isa )
+ ),
+
+ TP_fast_assign(
+ __entry->rip = rip;
+ __entry->vmcb = vmcb;
+ __entry->nested_rip = nested_rip;
+ __entry->int_ctl = int_ctl;
+ __entry->event_inj = event_inj;
+ __entry->tdp_enabled = tdp_enabled;
+ __entry->guest_pgd = tdp_enabled ? guest_tdp_pgd : guest_cr3;
+ __entry->isa = isa;
+ ),
+
+ TP_printk("rip: 0x%016llx %s: 0x%016llx nested_rip: 0x%016llx "
+ "int_ctl: 0x%08x event_inj: 0x%08x nested_%s=%s %s: 0x%016llx",
+ __entry->rip,
+ __entry->isa == KVM_ISA_VMX ? "vmcs" : "vmcb",
+ __entry->vmcb,
+ __entry->nested_rip,
+ __entry->int_ctl,
+ __entry->event_inj,
+ __entry->isa == KVM_ISA_VMX ? "ept" : "npt",
+ __entry->tdp_enabled ? "y" : "n",
+ !__entry->tdp_enabled ? "guest_cr3" :
+ __entry->isa == KVM_ISA_VMX ? "nested_eptp" : "nested_cr3",
+ __entry->guest_pgd)
+);
+
+TRACE_EVENT(kvm_nested_intercepts,
+ TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions,
+ __u32 intercept1, __u32 intercept2, __u32 intercept3),
+ TP_ARGS(cr_read, cr_write, exceptions, intercept1,
+ intercept2, intercept3),
+
+ TP_STRUCT__entry(
+ __field( __u16, cr_read )
+ __field( __u16, cr_write )
+ __field( __u32, exceptions )
+ __field( __u32, intercept1 )
+ __field( __u32, intercept2 )
+ __field( __u32, intercept3 )
+ ),
+
+ TP_fast_assign(
+ __entry->cr_read = cr_read;
+ __entry->cr_write = cr_write;
+ __entry->exceptions = exceptions;
+ __entry->intercept1 = intercept1;
+ __entry->intercept2 = intercept2;
+ __entry->intercept3 = intercept3;
+ ),
+
+ TP_printk("cr_read: %04x cr_write: %04x excp: %08x "
+ "intercepts: %08x %08x %08x",
+ __entry->cr_read, __entry->cr_write, __entry->exceptions,
+ __entry->intercept1, __entry->intercept2, __entry->intercept3)
+);
+/*
+ * Tracepoint for #VMEXIT while nested
+ */
+TRACE_EVENT_KVM_EXIT(kvm_nested_vmexit);
+
+/*
+ * Tracepoint for #VMEXIT reinjected to the guest
+ */
+TRACE_EVENT(kvm_nested_vmexit_inject,
+ TP_PROTO(__u32 exit_code,
+ __u64 exit_info1, __u64 exit_info2,
+ __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa),
+ TP_ARGS(exit_code, exit_info1, exit_info2,
+ exit_int_info, exit_int_info_err, isa),
+
+ TP_STRUCT__entry(
+ __field( __u32, exit_code )
+ __field( __u64, exit_info1 )
+ __field( __u64, exit_info2 )
+ __field( __u32, exit_int_info )
+ __field( __u32, exit_int_info_err )
+ __field( __u32, isa )
+ ),
+
+ TP_fast_assign(
+ __entry->exit_code = exit_code;
+ __entry->exit_info1 = exit_info1;
+ __entry->exit_info2 = exit_info2;
+ __entry->exit_int_info = exit_int_info;
+ __entry->exit_int_info_err = exit_int_info_err;
+ __entry->isa = isa;
+ ),
+
+ TP_printk("reason: %s%s%s ext_inf1: 0x%016llx "
+ "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
+ kvm_print_exit_reason(__entry->exit_code, __entry->isa),
+ __entry->exit_info1, __entry->exit_info2,
+ __entry->exit_int_info, __entry->exit_int_info_err)
+);
+
+/*
+ * Tracepoint for nested #vmexit because of interrupt pending
+ */
+TRACE_EVENT(kvm_nested_intr_vmexit,
+ TP_PROTO(__u64 rip),
+ TP_ARGS(rip),
+
+ TP_STRUCT__entry(
+ __field( __u64, rip )
+ ),
+
+ TP_fast_assign(
+ __entry->rip = rip;
+ ),
+
+ TP_printk("rip: 0x%016llx", __entry->rip)
+);
+
+/*
+ * Tracepoint for the INVLPGA instruction
+ */
+TRACE_EVENT(kvm_invlpga,
+ TP_PROTO(__u64 rip, unsigned int asid, u64 address),
+ TP_ARGS(rip, asid, address),
+
+ TP_STRUCT__entry(
+ __field( __u64, rip )
+ __field( unsigned int, asid )
+ __field( __u64, address )
+ ),
+
+ TP_fast_assign(
+ __entry->rip = rip;
+ __entry->asid = asid;
+ __entry->address = address;
+ ),
+
+ TP_printk("rip: 0x%016llx asid: %u address: 0x%016llx",
+ __entry->rip, __entry->asid, __entry->address)
+);
+
+/*
+ * Tracepoint for the SKINIT instruction
+ */
+TRACE_EVENT(kvm_skinit,
+ TP_PROTO(__u64 rip, __u32 slb),
+ TP_ARGS(rip, slb),
+
+ TP_STRUCT__entry(
+ __field( __u64, rip )
+ __field( __u32, slb )
+ ),
+
+ TP_fast_assign(
+ __entry->rip = rip;
+ __entry->slb = slb;
+ ),
+
+ TP_printk("rip: 0x%016llx slb: 0x%08x",
+ __entry->rip, __entry->slb)
+);
+
+#define KVM_EMUL_INSN_F_CR0_PE (1 << 0)
+#define KVM_EMUL_INSN_F_EFL_VM (1 << 1)
+#define KVM_EMUL_INSN_F_CS_D (1 << 2)
+#define KVM_EMUL_INSN_F_CS_L (1 << 3)
+
+#define kvm_trace_symbol_emul_flags \
+ { 0, "real" }, \
+ { KVM_EMUL_INSN_F_CR0_PE \
+ | KVM_EMUL_INSN_F_EFL_VM, "vm16" }, \
+ { KVM_EMUL_INSN_F_CR0_PE, "prot16" }, \
+ { KVM_EMUL_INSN_F_CR0_PE \
+ | KVM_EMUL_INSN_F_CS_D, "prot32" }, \
+ { KVM_EMUL_INSN_F_CR0_PE \
+ | KVM_EMUL_INSN_F_CS_L, "prot64" }
+
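+/* Map the emulator's mode onto the flags decoded by kvm_trace_symbol_emul_flags. */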
+#define kei_decode_mode(mode) ({ \
+ u8 flags = 0xff; \
+ switch (mode) { \
+ case X86EMUL_MODE_REAL: \
+ flags = 0; \
+ break; \
+ case X86EMUL_MODE_VM86: \
+ flags = KVM_EMUL_INSN_F_EFL_VM; \
+ break; \
+ case X86EMUL_MODE_PROT16: \
+ flags = KVM_EMUL_INSN_F_CR0_PE; \
+ break; \
+ case X86EMUL_MODE_PROT32: \
+ flags = KVM_EMUL_INSN_F_CR0_PE \
+ | KVM_EMUL_INSN_F_CS_D; \
+ break; \
+ case X86EMUL_MODE_PROT64: \
+ flags = KVM_EMUL_INSN_F_CR0_PE \
+ | KVM_EMUL_INSN_F_CS_L; \
+ break; \
+ } \
+ flags; \
+ })
+
+TRACE_EVENT(kvm_emulate_insn,
+ TP_PROTO(struct kvm_vcpu *vcpu, __u8 failed),
+ TP_ARGS(vcpu, failed),
+
+ TP_STRUCT__entry(
+ __field( __u64, rip )
+ __field( __u32, csbase )
+ __field( __u8, len )
+ __array( __u8, insn, X86_MAX_INSTRUCTION_LENGTH )
+ __field( __u8, flags )
+ __field( __u8, failed )
+ ),
+
+ TP_fast_assign(
+ __entry->csbase = kvm_x86_call(get_segment_base)(vcpu,
+ VCPU_SREG_CS);
+ __entry->len = vcpu->arch.emulate_ctxt->fetch.ptr
+ - vcpu->arch.emulate_ctxt->fetch.data;
+ __entry->rip = vcpu->arch.emulate_ctxt->_eip - __entry->len;
+ memcpy(__entry->insn,
+ vcpu->arch.emulate_ctxt->fetch.data,
+ X86_MAX_INSTRUCTION_LENGTH);
+ __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt->mode);
+ __entry->failed = failed;
+ ),
+
+ TP_printk("%x:%llx:%s (%s)%s",
+ __entry->csbase, __entry->rip,
+ __print_hex(__entry->insn, __entry->len),
+ __print_symbolic(__entry->flags,
+ kvm_trace_symbol_emul_flags),
+ __entry->failed ? " failed" : ""
+ )
+ );
+
+#define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0)
+#define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1)
+
+TRACE_EVENT(
+ vcpu_match_mmio,
+ TP_PROTO(gva_t gva, gpa_t gpa, bool write, bool gpa_match),
+ TP_ARGS(gva, gpa, write, gpa_match),
+
+ TP_STRUCT__entry(
+ __field(gva_t, gva)
+ __field(gpa_t, gpa)
+ __field(bool, write)
+ __field(bool, gpa_match)
+ ),
+
+ TP_fast_assign(
+ __entry->gva = gva;
+ __entry->gpa = gpa;
+ __entry->write = write;
+ __entry->gpa_match = gpa_match;
+ ),
+
+ TP_printk("gva %#lx gpa %#llx %s %s", __entry->gva, __entry->gpa,
+ __entry->write ? "Write" : "Read",
+ __entry->gpa_match ? "GPA" : "GVA")
+);
+
+TRACE_EVENT(kvm_write_tsc_offset,
+ TP_PROTO(unsigned int vcpu_id, __u64 previous_tsc_offset,
+ __u64 next_tsc_offset),
+ TP_ARGS(vcpu_id, previous_tsc_offset, next_tsc_offset),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( __u64, previous_tsc_offset )
+ __field( __u64, next_tsc_offset )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->previous_tsc_offset = previous_tsc_offset;
+ __entry->next_tsc_offset = next_tsc_offset;
+ ),
+
+ TP_printk("vcpu=%u prev=%llu next=%llu", __entry->vcpu_id,
+ __entry->previous_tsc_offset, __entry->next_tsc_offset)
+);
+
+#ifdef CONFIG_X86_64
+
+#define host_clocks \
+ {VDSO_CLOCKMODE_NONE, "none"}, \
+ {VDSO_CLOCKMODE_TSC, "tsc"} \
+
+TRACE_EVENT(kvm_update_master_clock,
+ TP_PROTO(bool use_master_clock, unsigned int host_clock, bool offset_matched),
+ TP_ARGS(use_master_clock, host_clock, offset_matched),
+
+ TP_STRUCT__entry(
+ __field( bool, use_master_clock )
+ __field( unsigned int, host_clock )
+ __field( bool, offset_matched )
+ ),
+
+ TP_fast_assign(
+ __entry->use_master_clock = use_master_clock;
+ __entry->host_clock = host_clock;
+ __entry->offset_matched = offset_matched;
+ ),
+
+ TP_printk("masterclock %d hostclock %s offsetmatched %u",
+ __entry->use_master_clock,
+ __print_symbolic(__entry->host_clock, host_clocks),
+ __entry->offset_matched)
+);
+
+TRACE_EVENT(kvm_track_tsc,
+ TP_PROTO(unsigned int vcpu_id, unsigned int nr_matched,
+ unsigned int online_vcpus, bool use_master_clock,
+ unsigned int host_clock),
+ TP_ARGS(vcpu_id, nr_matched, online_vcpus, use_master_clock,
+ host_clock),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( unsigned int, nr_vcpus_matched_tsc )
+ __field( unsigned int, online_vcpus )
+ __field( bool, use_master_clock )
+ __field( unsigned int, host_clock )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->nr_vcpus_matched_tsc = nr_matched;
+ __entry->online_vcpus = online_vcpus;
+ __entry->use_master_clock = use_master_clock;
+ __entry->host_clock = host_clock;
+ ),
+
+ TP_printk("vcpu_id %u masterclock %u offsetmatched %u nr_online %u"
+ " hostclock %s",
+ __entry->vcpu_id, __entry->use_master_clock,
+ __entry->nr_vcpus_matched_tsc, __entry->online_vcpus,
+ __print_symbolic(__entry->host_clock, host_clocks))
+);
+
+#endif /* CONFIG_X86_64 */
+
+/*
+ * Tracepoint for PML full VMEXIT.
+ */
+TRACE_EVENT(kvm_pml_full,
+ TP_PROTO(unsigned int vcpu_id),
+ TP_ARGS(vcpu_id),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ ),
+
+ TP_printk("vcpu %d: PML full", __entry->vcpu_id)
+);
+
+TRACE_EVENT(kvm_ple_window_update,
+ TP_PROTO(unsigned int vcpu_id, unsigned int new, unsigned int old),
+ TP_ARGS(vcpu_id, new, old),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( unsigned int, new )
+ __field( unsigned int, old )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->new = new;
+ __entry->old = old;
+ ),
+
+ TP_printk("vcpu %u old %u new %u (%s)",
+ __entry->vcpu_id, __entry->old, __entry->new,
+ __entry->old < __entry->new ? "grew" : "shrunk")
+);
+
+TRACE_EVENT(kvm_pvclock_update,
+ TP_PROTO(unsigned int vcpu_id, struct pvclock_vcpu_time_info *pvclock),
+ TP_ARGS(vcpu_id, pvclock),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( __u32, version )
+ __field( __u64, tsc_timestamp )
+ __field( __u64, system_time )
+ __field( __u32, tsc_to_system_mul )
+ __field( __s8, tsc_shift )
+ __field( __u8, flags )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->version = pvclock->version;
+ __entry->tsc_timestamp = pvclock->tsc_timestamp;
+ __entry->system_time = pvclock->system_time;
+ __entry->tsc_to_system_mul = pvclock->tsc_to_system_mul;
+ __entry->tsc_shift = pvclock->tsc_shift;
+ __entry->flags = pvclock->flags;
+ ),
+
+ TP_printk("vcpu_id %u, pvclock { version %u, tsc_timestamp 0x%llx, "
+ "system_time 0x%llx, tsc_to_system_mul 0x%x, tsc_shift %d, "
+ "flags 0x%x }",
+ __entry->vcpu_id,
+ __entry->version,
+ __entry->tsc_timestamp,
+ __entry->system_time,
+ __entry->tsc_to_system_mul,
+ __entry->tsc_shift,
+ __entry->flags)
+);
+
+TRACE_EVENT(kvm_wait_lapic_expire,
+ TP_PROTO(unsigned int vcpu_id, s64 delta),
+ TP_ARGS(vcpu_id, delta),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( s64, delta )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->delta = delta;
+ ),
+
+ TP_printk("vcpu %u: delta %lld (%s)",
+ __entry->vcpu_id,
+ __entry->delta,
+ __entry->delta < 0 ? "early" : "late")
+);
+
+TRACE_EVENT(kvm_smm_transition,
+ TP_PROTO(unsigned int vcpu_id, u64 smbase, bool entering),
+ TP_ARGS(vcpu_id, smbase, entering),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( u64, smbase )
+ __field( bool, entering )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->smbase = smbase;
+ __entry->entering = entering;
+ ),
+
+ TP_printk("vcpu %u: %s SMM, smbase 0x%llx",
+ __entry->vcpu_id,
+ __entry->entering ? "entering" : "leaving",
+ __entry->smbase)
+);
+
+/*
+ * Tracepoint for VT-d posted-interrupts and AMD-Vi Guest Virtual APIC.
+ */
+TRACE_EVENT(kvm_pi_irte_update,
+ TP_PROTO(unsigned int host_irq, struct kvm_vcpu *vcpu,
+ unsigned int gsi, unsigned int gvec, bool set),
+ TP_ARGS(host_irq, vcpu, gsi, gvec, set),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, host_irq )
+ __field( int, vcpu_id )
+ __field( unsigned int, gsi )
+ __field( unsigned int, gvec )
+ __field( bool, set )
+ ),
+
+ TP_fast_assign(
+ __entry->host_irq = host_irq;
+ __entry->vcpu_id = vcpu ? vcpu->vcpu_id : -1;
+ __entry->gsi = gsi;
+ __entry->gvec = gvec;
+ __entry->set = set;
+ ),
+
+ TP_printk("PI is %s for irq %u, vcpu %d, gsi: 0x%x, gvec: 0x%x",
+ __entry->set ? "enabled and being updated" : "disabled",
+ __entry->host_irq,
+ __entry->vcpu_id,
+ __entry->gsi,
+ __entry->gvec)
+);
+
+/*
+ * Tracepoint for kvm_hv_notify_acked_sint.
+ */
+TRACE_EVENT(kvm_hv_notify_acked_sint,
+ TP_PROTO(int vcpu_id, u32 sint),
+ TP_ARGS(vcpu_id, sint),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(u32, sint)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->sint = sint;
+ ),
+
+ TP_printk("vcpu_id %d sint %u", __entry->vcpu_id, __entry->sint)
+);
+
+/*
+ * Tracepoint for synic_set_irq.
+ */
+TRACE_EVENT(kvm_hv_synic_set_irq,
+ TP_PROTO(int vcpu_id, u32 sint, int vector, int ret),
+ TP_ARGS(vcpu_id, sint, vector, ret),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(u32, sint)
+ __field(int, vector)
+ __field(int, ret)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->sint = sint;
+ __entry->vector = vector;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("vcpu_id %d sint %u vector %d ret %d",
+ __entry->vcpu_id, __entry->sint, __entry->vector,
+ __entry->ret)
+);
+
+/*
+ * Tracepoint for kvm_hv_synic_send_eoi.
+ */
+TRACE_EVENT(kvm_hv_synic_send_eoi,
+ TP_PROTO(int vcpu_id, int vector),
+ TP_ARGS(vcpu_id, vector),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, vector)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->vector = vector;
+ ),
+
+ TP_printk("vcpu_id %d vector %d", __entry->vcpu_id, __entry->vector)
+);
+
+/*
+ * Tracepoint for synic_set_msr.
+ */
+TRACE_EVENT(kvm_hv_synic_set_msr,
+ TP_PROTO(int vcpu_id, u32 msr, u64 data, bool host),
+ TP_ARGS(vcpu_id, msr, data, host),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(u32, msr)
+ __field(u64, data)
+ __field(bool, host)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->msr = msr;
+ __entry->data = data;
+ __entry->host = host;
+ ),
+
+ TP_printk("vcpu_id %d msr 0x%x data 0x%llx host %d",
+ __entry->vcpu_id, __entry->msr, __entry->data, __entry->host)
+);
+
+/*
+ * Tracepoint for stimer_set_config.
+ */
+TRACE_EVENT(kvm_hv_stimer_set_config,
+ TP_PROTO(int vcpu_id, int timer_index, u64 config, bool host),
+ TP_ARGS(vcpu_id, timer_index, config, host),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ __field(u64, config)
+ __field(bool, host)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ __entry->config = config;
+ __entry->host = host;
+ ),
+
+ TP_printk("vcpu_id %d timer %d config 0x%llx host %d",
+ __entry->vcpu_id, __entry->timer_index, __entry->config,
+ __entry->host)
+);
+
+/*
+ * Tracepoint for stimer_set_count.
+ */
+TRACE_EVENT(kvm_hv_stimer_set_count,
+ TP_PROTO(int vcpu_id, int timer_index, u64 count, bool host),
+ TP_ARGS(vcpu_id, timer_index, count, host),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ __field(u64, count)
+ __field(bool, host)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ __entry->count = count;
+ __entry->host = host;
+ ),
+
+ TP_printk("vcpu_id %d timer %d count %llu host %d",
+ __entry->vcpu_id, __entry->timer_index, __entry->count,
+ __entry->host)
+);
+
+/*
+ * Tracepoint for stimer_start(periodic timer case).
+ */
+TRACE_EVENT(kvm_hv_stimer_start_periodic,
+ TP_PROTO(int vcpu_id, int timer_index, u64 time_now, u64 exp_time),
+ TP_ARGS(vcpu_id, timer_index, time_now, exp_time),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ __field(u64, time_now)
+ __field(u64, exp_time)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ __entry->time_now = time_now;
+ __entry->exp_time = exp_time;
+ ),
+
+ TP_printk("vcpu_id %d timer %d time_now %llu exp_time %llu",
+ __entry->vcpu_id, __entry->timer_index, __entry->time_now,
+ __entry->exp_time)
+);
+
+/*
+ * Tracepoint for stimer_start(one-shot timer case).
+ */
+TRACE_EVENT(kvm_hv_stimer_start_one_shot,
+ TP_PROTO(int vcpu_id, int timer_index, u64 time_now, u64 count),
+ TP_ARGS(vcpu_id, timer_index, time_now, count),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ __field(u64, time_now)
+ __field(u64, count)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ __entry->time_now = time_now;
+ __entry->count = count;
+ ),
+
+ TP_printk("vcpu_id %d timer %d time_now %llu count %llu",
+ __entry->vcpu_id, __entry->timer_index, __entry->time_now,
+ __entry->count)
+);
+
+/*
+ * Tracepoint for stimer_timer_callback.
+ */
+TRACE_EVENT(kvm_hv_stimer_callback,
+ TP_PROTO(int vcpu_id, int timer_index),
+ TP_ARGS(vcpu_id, timer_index),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ ),
+
+ TP_printk("vcpu_id %d timer %d",
+ __entry->vcpu_id, __entry->timer_index)
+);
+
+/*
+ * Tracepoint for stimer_expiration.
+ */
+TRACE_EVENT(kvm_hv_stimer_expiration,
+ TP_PROTO(int vcpu_id, int timer_index, int direct, int msg_send_result),
+ TP_ARGS(vcpu_id, timer_index, direct, msg_send_result),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ __field(int, direct)
+ __field(int, msg_send_result)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ __entry->direct = direct;
+ __entry->msg_send_result = msg_send_result;
+ ),
+
+ TP_printk("vcpu_id %d timer %d direct %d send result %d",
+ __entry->vcpu_id, __entry->timer_index,
+ __entry->direct, __entry->msg_send_result)
+);
+
+/*
+ * Tracepoint for stimer_cleanup.
+ */
+TRACE_EVENT(kvm_hv_stimer_cleanup,
+ TP_PROTO(int vcpu_id, int timer_index),
+ TP_ARGS(vcpu_id, timer_index),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ ),
+
+ TP_printk("vcpu_id %d timer %d",
+ __entry->vcpu_id, __entry->timer_index)
+);
+
+#define kvm_print_apicv_inhibit_reasons(inhibits) \
+ (inhibits), (inhibits) ? " " : "", \
+ (inhibits) ? __print_flags(inhibits, "|", APICV_INHIBIT_REASONS) : ""
+
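+/*
+ * Tracepoint for changes to the APICv inhibit reasons.
+ */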
+TRACE_EVENT(kvm_apicv_inhibit_changed,
+ TP_PROTO(int reason, bool set, unsigned long inhibits),
+ TP_ARGS(reason, set, inhibits),
+
+ TP_STRUCT__entry(
+ __field(int, reason)
+ __field(bool, set)
+ __field(unsigned long, inhibits)
+ ),
+
+ TP_fast_assign(
+ __entry->reason = reason;
+ __entry->set = set;
+ __entry->inhibits = inhibits;
+ ),
+
+ TP_printk("%s reason=%u, inhibits=0x%lx%s%s",
+ __entry->set ? "set" : "cleared",
+ __entry->reason,
+ kvm_print_apicv_inhibit_reasons(__entry->inhibits))
+);
+
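+/*
+ * Tracepoint for interrupt acceptance via APICv.
+ */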
+TRACE_EVENT(kvm_apicv_accept_irq,
+ TP_PROTO(__u32 apicid, __u16 dm, __u16 tm, __u8 vec),
+ TP_ARGS(apicid, dm, tm, vec),
+
+ TP_STRUCT__entry(
+ __field( __u32, apicid )
+ __field( __u16, dm )
+ __field( __u16, tm )
+ __field( __u8, vec )
+ ),
+
+ TP_fast_assign(
+ __entry->apicid = apicid;
+ __entry->dm = dm;
+ __entry->tm = tm;
+ __entry->vec = vec;
+ ),
+
+ TP_printk("apicid %x vec %u (%s|%s)",
+ __entry->apicid, __entry->vec,
+ __print_symbolic((__entry->dm >> 8 & 0x7), kvm_deliver_mode),
+ __entry->tm ? "level" : "edge")
+);
+
+/*
+ * Tracepoint for AMD AVIC
+ */
+TRACE_EVENT(kvm_avic_incomplete_ipi,
+ TP_PROTO(u32 vcpu, u32 icrh, u32 icrl, u32 id, u32 index),
+ TP_ARGS(vcpu, icrh, icrl, id, index),
+
+ TP_STRUCT__entry(
+ __field(u32, vcpu)
+ __field(u32, icrh)
+ __field(u32, icrl)
+ __field(u32, id)
+ __field(u32, index)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu = vcpu;
+ __entry->icrh = icrh;
+ __entry->icrl = icrl;
+ __entry->id = id;
+ __entry->index = index;
+ ),
+
+ TP_printk("vcpu=%u, icrh:icrl=%#010x:%08x, id=%u, index=%u",
+ __entry->vcpu, __entry->icrh, __entry->icrl,
+ __entry->id, __entry->index)
+);
+
+TRACE_EVENT(kvm_avic_unaccelerated_access,
+ TP_PROTO(u32 vcpu, u32 offset, bool ft, bool rw, u32 vec),
+ TP_ARGS(vcpu, offset, ft, rw, vec),
+
+ TP_STRUCT__entry(
+ __field(u32, vcpu)
+ __field(u32, offset)
+ __field(bool, ft)
+ __field(bool, rw)
+ __field(u32, vec)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu = vcpu;
+ __entry->offset = offset;
+ __entry->ft = ft;
+ __entry->rw = rw;
+ __entry->vec = vec;
+ ),
+
+ TP_printk("vcpu=%u, offset=%#x(%s), %s, %s, vec=%#x",
+ __entry->vcpu,
+ __entry->offset,
+ __print_symbolic(__entry->offset, kvm_trace_symbol_apic),
+ __entry->ft ? "trap" : "fault",
+ __entry->rw ? "write" : "read",
+ __entry->vec)
+);
+
+TRACE_EVENT(kvm_avic_ga_log,
+ TP_PROTO(u32 vmid, u32 vcpuid),
+ TP_ARGS(vmid, vcpuid),
+
+ TP_STRUCT__entry(
+ __field(u32, vmid)
+ __field(u32, vcpuid)
+ ),
+
+ TP_fast_assign(
+ __entry->vmid = vmid;
+ __entry->vcpuid = vcpuid;
+ ),
+
+ TP_printk("vmid=%u, vcpuid=%u",
+ __entry->vmid, __entry->vcpuid)
+);
+
+TRACE_EVENT(kvm_avic_kick_vcpu_slowpath,
+ TP_PROTO(u32 icrh, u32 icrl, u32 index),
+ TP_ARGS(icrh, icrl, index),
+
+ TP_STRUCT__entry(
+ __field(u32, icrh)
+ __field(u32, icrl)
+ __field(u32, index)
+ ),
+
+ TP_fast_assign(
+ __entry->icrh = icrh;
+ __entry->icrl = icrl;
+ __entry->index = index;
+ ),
+
+ TP_printk("icrh:icrl=%#08x:%08x, index=%u",
+ __entry->icrh, __entry->icrl, __entry->index)
+);
+
+TRACE_EVENT(kvm_avic_doorbell,
+ TP_PROTO(u32 vcpuid, u32 apicid),
+ TP_ARGS(vcpuid, apicid),
+
+ TP_STRUCT__entry(
+ __field(u32, vcpuid)
+ __field(u32, apicid)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpuid = vcpuid;
+ __entry->apicid = apicid;
+ ),
+
+ TP_printk("vcpuid=%u, apicid=%u",
+ __entry->vcpuid, __entry->apicid)
+);
+
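+/*
+ * Tracepoint for the APIC timer's hv_timer in-use state.
+ */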
+TRACE_EVENT(kvm_hv_timer_state,
+ TP_PROTO(unsigned int vcpu_id, unsigned int hv_timer_in_use),
+ TP_ARGS(vcpu_id, hv_timer_in_use),
+ TP_STRUCT__entry(
+ __field(unsigned int, vcpu_id)
+ __field(unsigned int, hv_timer_in_use)
+ ),
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->hv_timer_in_use = hv_timer_in_use;
+ ),
+ TP_printk("vcpu_id %x hv_timer %x",
+ __entry->vcpu_id,
+ __entry->hv_timer_in_use)
+);
+
+/*
+ * Tracepoint for kvm_hv_flush_tlb.
+ */
+TRACE_EVENT(kvm_hv_flush_tlb,
+ TP_PROTO(u64 processor_mask, u64 address_space, u64 flags, bool guest_mode),
+ TP_ARGS(processor_mask, address_space, flags, guest_mode),
+
+ TP_STRUCT__entry(
+ __field(u64, processor_mask)
+ __field(u64, address_space)
+ __field(u64, flags)
+ __field(bool, guest_mode)
+ ),
+
+ TP_fast_assign(
+ __entry->processor_mask = processor_mask;
+ __entry->address_space = address_space;
+ __entry->flags = flags;
+ __entry->guest_mode = guest_mode;
+ ),
+
+ TP_printk("processor_mask 0x%llx address_space 0x%llx flags 0x%llx %s",
+ __entry->processor_mask, __entry->address_space,
+ __entry->flags, __entry->guest_mode ? "(L2)" : "")
+);
+
+/*
+ * Tracepoint for kvm_hv_flush_tlb_ex.
+ */
+TRACE_EVENT(kvm_hv_flush_tlb_ex,
+ TP_PROTO(u64 valid_bank_mask, u64 format, u64 address_space, u64 flags, bool guest_mode),
+ TP_ARGS(valid_bank_mask, format, address_space, flags, guest_mode),
+
+ TP_STRUCT__entry(
+ __field(u64, valid_bank_mask)
+ __field(u64, format)
+ __field(u64, address_space)
+ __field(u64, flags)
+ __field(bool, guest_mode)
+ ),
+
+ TP_fast_assign(
+ __entry->valid_bank_mask = valid_bank_mask;
+ __entry->format = format;
+ __entry->address_space = address_space;
+ __entry->flags = flags;
+ __entry->guest_mode = guest_mode;
+ ),
+
+ TP_printk("valid_bank_mask 0x%llx format 0x%llx "
+ "address_space 0x%llx flags 0x%llx %s",
+ __entry->valid_bank_mask, __entry->format,
+ __entry->address_space, __entry->flags,
+ __entry->guest_mode ? "(L2)" : "")
+);
+
+/*
+ * Tracepoints for kvm_hv_send_ipi.
+ */
+TRACE_EVENT(kvm_hv_send_ipi,
+ TP_PROTO(u32 vector, u64 processor_mask),
+ TP_ARGS(vector, processor_mask),
+
+ TP_STRUCT__entry(
+ __field(u32, vector)
+ __field(u64, processor_mask)
+ ),
+
+ TP_fast_assign(
+ __entry->vector = vector;
+ __entry->processor_mask = processor_mask;
+ ),
+
+ TP_printk("vector %x processor_mask 0x%llx",
+ __entry->vector, __entry->processor_mask)
+);
+
+TRACE_EVENT(kvm_hv_send_ipi_ex,
+ TP_PROTO(u32 vector, u64 format, u64 valid_bank_mask),
+ TP_ARGS(vector, format, valid_bank_mask),
+
+ TP_STRUCT__entry(
+ __field(u32, vector)
+ __field(u64, format)
+ __field(u64, valid_bank_mask)
+ ),
+
+ TP_fast_assign(
+ __entry->vector = vector;
+ __entry->format = format;
+ __entry->valid_bank_mask = valid_bank_mask;
+ ),
+
+ TP_printk("vector %x format %llx valid_bank_mask 0x%llx",
+ __entry->vector, __entry->format,
+ __entry->valid_bank_mask)
+);
+
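+/*
+ * Tracepoint for paravirtual TLB flush requests.
+ */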
+TRACE_EVENT(kvm_pv_tlb_flush,
+ TP_PROTO(unsigned int vcpu_id, bool need_flush_tlb),
+ TP_ARGS(vcpu_id, need_flush_tlb),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( bool, need_flush_tlb )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->need_flush_tlb = need_flush_tlb;
+ ),
+
+ TP_printk("vcpu %u need_flush_tlb %s", __entry->vcpu_id,
+ __entry->need_flush_tlb ? "true" : "false")
+);
+
+/*
+ * Tracepoint for failed nested VMX VM-Enter.
+ */
+TRACE_EVENT(kvm_nested_vmenter_failed,
+ TP_PROTO(const char *msg, u32 err),
+ TP_ARGS(msg, err),
+
+ TP_STRUCT__entry(
+ __string(msg, msg)
+ __field(u32, err)
+ ),
+
+ TP_fast_assign(
+ __assign_str(msg);
+ __entry->err = err;
+ ),
+
+ TP_printk("%s%s", __get_str(msg), !__entry->err ? "" :
+ __print_symbolic(__entry->err, VMX_VMENTER_INSTRUCTION_ERRORS))
+);
+
+/*
+ * Tracepoint for syndbg_set_msr.
+ */
+TRACE_EVENT(kvm_hv_syndbg_set_msr,
+ TP_PROTO(int vcpu_id, u32 vp_index, u32 msr, u64 data),
+ TP_ARGS(vcpu_id, vp_index, msr, data),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(u32, vp_index)
+ __field(u32, msr)
+ __field(u64, data)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->vp_index = vp_index;
+ __entry->msr = msr;
+ __entry->data = data;
+ ),
+
+ TP_printk("vcpu_id %d vp_index %u msr 0x%x data 0x%llx",
+ __entry->vcpu_id, __entry->vp_index, __entry->msr,
+ __entry->data)
+);
+
+/*
+ * Tracepoint for syndbg_get_msr.
+ */
+TRACE_EVENT(kvm_hv_syndbg_get_msr,
+ TP_PROTO(int vcpu_id, u32 vp_index, u32 msr, u64 data),
+ TP_ARGS(vcpu_id, vp_index, msr, data),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(u32, vp_index)
+ __field(u32, msr)
+ __field(u64, data)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->vp_index = vp_index;
+ __entry->msr = msr;
+ __entry->data = data;
+ ),
+
+ TP_printk("vcpu_id %d vp_index %u msr 0x%x data 0x%llx",
+ __entry->vcpu_id, __entry->vp_index, __entry->msr,
+ __entry->data)
+);
+
+/*
+ * Tracepoint for the start of VMGEXIT processing
+ */
+TRACE_EVENT(kvm_vmgexit_enter,
+ TP_PROTO(unsigned int vcpu_id, struct ghcb *ghcb),
+ TP_ARGS(vcpu_id, ghcb),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, vcpu_id)
+ __field(u64, exit_reason)
+ __field(u64, info1)
+ __field(u64, info2)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->exit_reason = ghcb->save.sw_exit_code;
+ __entry->info1 = ghcb->save.sw_exit_info_1;
+ __entry->info2 = ghcb->save.sw_exit_info_2;
+ ),
+
+ TP_printk("vcpu %u, exit_reason %llx, exit_info1 %llx, exit_info2 %llx",
+ __entry->vcpu_id, __entry->exit_reason,
+ __entry->info1, __entry->info2)
+);
+
+/*
+ * Tracepoint for the end of VMGEXIT processing
+ */
+TRACE_EVENT(kvm_vmgexit_exit,
+ TP_PROTO(unsigned int vcpu_id, struct ghcb *ghcb),
+ TP_ARGS(vcpu_id, ghcb),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, vcpu_id)
+ __field(u64, exit_reason)
+ __field(u64, info1)
+ __field(u64, info2)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->exit_reason = ghcb->save.sw_exit_code;
+ __entry->info1 = ghcb->save.sw_exit_info_1;
+ __entry->info2 = ghcb->save.sw_exit_info_2;
+ ),
+
+ TP_printk("vcpu %u, exit_reason %llx, exit_info1 %llx, exit_info2 %llx",
+ __entry->vcpu_id, __entry->exit_reason,
+ __entry->info1, __entry->info2)
+);
+
+/*
+ * Tracepoint for the start of VMGEXIT MSR protocol processing
+ */
+TRACE_EVENT(kvm_vmgexit_msr_protocol_enter,
+ TP_PROTO(unsigned int vcpu_id, u64 ghcb_gpa),
+ TP_ARGS(vcpu_id, ghcb_gpa),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, vcpu_id)
+ __field(u64, ghcb_gpa)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->ghcb_gpa = ghcb_gpa;
+ ),
+
+ TP_printk("vcpu %u, ghcb_gpa %016llx",
+ __entry->vcpu_id, __entry->ghcb_gpa)
+);
+
+/*
+ * Tracepoint for the end of VMGEXIT MSR protocol processing
+ */
+TRACE_EVENT(kvm_vmgexit_msr_protocol_exit,
+ TP_PROTO(unsigned int vcpu_id, u64 ghcb_gpa, int result),
+ TP_ARGS(vcpu_id, ghcb_gpa, result),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, vcpu_id)
+ __field(u64, ghcb_gpa)
+ __field(int, result)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->ghcb_gpa = ghcb_gpa;
+ __entry->result = result;
+ ),
+
+ TP_printk("vcpu %u, ghcb_gpa %016llx, result %d",
+ __entry->vcpu_id, __entry->ghcb_gpa, __entry->result)
+);
+
+/*
+ * Tracepoint for #NPFs due to RMP faults.
+ */
+TRACE_EVENT(kvm_rmp_fault,
+ TP_PROTO(struct kvm_vcpu *vcpu, u64 gpa, u64 pfn, u64 error_code,
+ int rmp_level, int psmash_ret),
+ TP_ARGS(vcpu, gpa, pfn, error_code, rmp_level, psmash_ret),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, vcpu_id)
+ __field(u64, gpa)
+ __field(u64, pfn)
+ __field(u64, error_code)
+ __field(int, rmp_level)
+ __field(int, psmash_ret)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu->vcpu_id;
+ __entry->gpa = gpa;
+ __entry->pfn = pfn;
+ __entry->error_code = error_code;
+ __entry->rmp_level = rmp_level;
+ __entry->psmash_ret = psmash_ret;
+ ),
+
+ TP_printk("vcpu %u gpa %016llx pfn 0x%llx error_code 0x%llx rmp_level %d psmash_ret %d",
+ __entry->vcpu_id, __entry->gpa, __entry->pfn,
+ __entry->error_code, __entry->rmp_level, __entry->psmash_ret)
+);
+
+#endif /* _TRACE_KVM_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH ../../arch/x86/kvm
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
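(Usage sketch, not part of this patch: the events above are consumed through the
standard Linux tracepoint machinery. Exactly one KVM translation unit defines
CREATE_TRACE_POINTS before including this header, and callers then invoke the
generated trace_<name>() helpers; variable names below are illustrative.

	/* in one .c file only, e.g. arch/x86/kvm/x86.c */
	#define CREATE_TRACE_POINTS
	#include "trace.h"

	/* elsewhere, at the relevant events */
	trace_kvm_pml_full(vcpu->vcpu_id);
	trace_kvm_ple_window_update(vcpu->vcpu_id, new_window, old_window);

The TRACE_INCLUDE_PATH/TRACE_INCLUDE_FILE defines above tell define_trace.h where
to re-read this header when generating those helpers.)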
diff --git a/arch/x86/kvm/tss.h b/arch/x86/kvm/tss.h
index 622aa10f692f..3f9150125e70 100644
--- a/arch/x86/kvm/tss.h
+++ b/arch/x86/kvm/tss.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __TSS_SEGMENT_H
#define __TSS_SEGMENT_H
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
deleted file mode 100644
index 29f912927a58..000000000000
--- a/arch/x86/kvm/vmx.c
+++ /dev/null
@@ -1,3853 +0,0 @@
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * This module enables machines with Intel VT-x extensions to run virtual
- * machines without emulation or binary translation.
- *
- * Copyright (C) 2006 Qumranet, Inc.
- *
- * Authors:
- * Avi Kivity <avi@qumranet.com>
- * Yaniv Kamay <yaniv@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
- */
-
-#include "irq.h"
-#include "mmu.h"
-
-#include <linux/kvm_host.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/sched.h>
-#include <linux/moduleparam.h>
-#include "kvm_cache_regs.h"
-#include "x86.h"
-
-#include <asm/io.h>
-#include <asm/desc.h>
-#include <asm/vmx.h>
-#include <asm/virtext.h>
-#include <asm/mce.h>
-
-#define __ex(x) __kvm_handle_fault_on_reboot(x)
-
-MODULE_AUTHOR("Qumranet");
-MODULE_LICENSE("GPL");
-
-static int __read_mostly bypass_guest_pf = 1;
-module_param(bypass_guest_pf, bool, S_IRUGO);
-
-static int __read_mostly enable_vpid = 1;
-module_param_named(vpid, enable_vpid, bool, 0444);
-
-static int __read_mostly flexpriority_enabled = 1;
-module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
-
-static int __read_mostly enable_ept = 1;
-module_param_named(ept, enable_ept, bool, S_IRUGO);
-
-static int __read_mostly emulate_invalid_guest_state = 0;
-module_param(emulate_invalid_guest_state, bool, S_IRUGO);
-
-struct vmcs {
- u32 revision_id;
- u32 abort;
- char data[0];
-};
-
-struct vcpu_vmx {
- struct kvm_vcpu vcpu;
- struct list_head local_vcpus_link;
- unsigned long host_rsp;
- int launched;
- u8 fail;
- u32 idt_vectoring_info;
- struct kvm_msr_entry *guest_msrs;
- struct kvm_msr_entry *host_msrs;
- int nmsrs;
- int save_nmsrs;
- int msr_offset_efer;
-#ifdef CONFIG_X86_64
- int msr_offset_kernel_gs_base;
-#endif
- struct vmcs *vmcs;
- struct {
- int loaded;
- u16 fs_sel, gs_sel, ldt_sel;
- int gs_ldt_reload_needed;
- int fs_reload_needed;
- int guest_efer_loaded;
- } host_state;
- struct {
- struct {
- bool pending;
- u8 vector;
- unsigned rip;
- } irq;
- } rmode;
- int vpid;
- bool emulation_required;
- enum emulation_result invalid_state_emulation_result;
-
- /* Support for vnmi-less CPUs */
- int soft_vnmi_blocked;
- ktime_t entry_time;
- s64 vnmi_blocked_time;
- u32 exit_reason;
-};
-
-static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
-{
- return container_of(vcpu, struct vcpu_vmx, vcpu);
-}
-
-static int init_rmode(struct kvm *kvm);
-static u64 construct_eptp(unsigned long root_hpa);
-
-static DEFINE_PER_CPU(struct vmcs *, vmxarea);
-static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
-static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
-
-static unsigned long *vmx_io_bitmap_a;
-static unsigned long *vmx_io_bitmap_b;
-static unsigned long *vmx_msr_bitmap_legacy;
-static unsigned long *vmx_msr_bitmap_longmode;
-
-static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
-static DEFINE_SPINLOCK(vmx_vpid_lock);
-
-static struct vmcs_config {
- int size;
- int order;
- u32 revision_id;
- u32 pin_based_exec_ctrl;
- u32 cpu_based_exec_ctrl;
- u32 cpu_based_2nd_exec_ctrl;
- u32 vmexit_ctrl;
- u32 vmentry_ctrl;
-} vmcs_config;
-
-static struct vmx_capability {
- u32 ept;
- u32 vpid;
-} vmx_capability;
-
-#define VMX_SEGMENT_FIELD(seg) \
- [VCPU_SREG_##seg] = { \
- .selector = GUEST_##seg##_SELECTOR, \
- .base = GUEST_##seg##_BASE, \
- .limit = GUEST_##seg##_LIMIT, \
- .ar_bytes = GUEST_##seg##_AR_BYTES, \
- }
-
-static struct kvm_vmx_segment_field {
- unsigned selector;
- unsigned base;
- unsigned limit;
- unsigned ar_bytes;
-} kvm_vmx_segment_fields[] = {
- VMX_SEGMENT_FIELD(CS),
- VMX_SEGMENT_FIELD(DS),
- VMX_SEGMENT_FIELD(ES),
- VMX_SEGMENT_FIELD(FS),
- VMX_SEGMENT_FIELD(GS),
- VMX_SEGMENT_FIELD(SS),
- VMX_SEGMENT_FIELD(TR),
- VMX_SEGMENT_FIELD(LDTR),
-};
-
-/*
- * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
- * away by decrementing the array size.
- */
-static const u32 vmx_msr_index[] = {
-#ifdef CONFIG_X86_64
- MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
-#endif
- MSR_EFER, MSR_K6_STAR,
-};
-#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
-
-static void load_msrs(struct kvm_msr_entry *e, int n)
-{
- int i;
-
- for (i = 0; i < n; ++i)
- wrmsrl(e[i].index, e[i].data);
-}
-
-static void save_msrs(struct kvm_msr_entry *e, int n)
-{
- int i;
-
- for (i = 0; i < n; ++i)
- rdmsrl(e[i].index, e[i].data);
-}
-
-static inline int is_page_fault(u32 intr_info)
-{
- return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
- INTR_INFO_VALID_MASK)) ==
- (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
-}
-
-static inline int is_no_device(u32 intr_info)
-{
- return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
- INTR_INFO_VALID_MASK)) ==
- (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
-}
-
-static inline int is_invalid_opcode(u32 intr_info)
-{
- return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
- INTR_INFO_VALID_MASK)) ==
- (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
-}
-
-static inline int is_external_interrupt(u32 intr_info)
-{
- return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
- == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
-}
-
-static inline int is_machine_check(u32 intr_info)
-{
- return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
- INTR_INFO_VALID_MASK)) ==
- (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
-}
-
-static inline int cpu_has_vmx_msr_bitmap(void)
-{
- return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
-}
-
-static inline int cpu_has_vmx_tpr_shadow(void)
-{
- return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
-}
-
-static inline int vm_need_tpr_shadow(struct kvm *kvm)
-{
- return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
-}
-
-static inline int cpu_has_secondary_exec_ctrls(void)
-{
- return vmcs_config.cpu_based_exec_ctrl &
- CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
-}
-
-static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
-{
- return vmcs_config.cpu_based_2nd_exec_ctrl &
- SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
-}
-
-static inline bool cpu_has_vmx_flexpriority(void)
-{
- return cpu_has_vmx_tpr_shadow() &&
- cpu_has_vmx_virtualize_apic_accesses();
-}
-
-static inline int cpu_has_vmx_invept_individual_addr(void)
-{
- return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT);
-}
-
-static inline int cpu_has_vmx_invept_context(void)
-{
- return !!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT);
-}
-
-static inline int cpu_has_vmx_invept_global(void)
-{
- return !!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT);
-}
-
-static inline int cpu_has_vmx_ept(void)
-{
- return vmcs_config.cpu_based_2nd_exec_ctrl &
- SECONDARY_EXEC_ENABLE_EPT;
-}
-
-static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
-{
- return flexpriority_enabled &&
- (cpu_has_vmx_virtualize_apic_accesses()) &&
- (irqchip_in_kernel(kvm));
-}
-
-static inline int cpu_has_vmx_vpid(void)
-{
- return vmcs_config.cpu_based_2nd_exec_ctrl &
- SECONDARY_EXEC_ENABLE_VPID;
-}
-
-static inline int cpu_has_virtual_nmis(void)
-{
- return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
-}
-
-static inline bool report_flexpriority(void)
-{
- return flexpriority_enabled;
-}
-
-static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
-{
- int i;
-
- for (i = 0; i < vmx->nmsrs; ++i)
- if (vmx->guest_msrs[i].index == msr)
- return i;
- return -1;
-}
-
-static inline void __invvpid(int ext, u16 vpid, gva_t gva)
-{
- struct {
- u64 vpid : 16;
- u64 rsvd : 48;
- u64 gva;
- } operand = { vpid, 0, gva };
-
- asm volatile (__ex(ASM_VMX_INVVPID)
- /* CF==1 or ZF==1 --> rc = -1 */
- "; ja 1f ; ud2 ; 1:"
- : : "a"(&operand), "c"(ext) : "cc", "memory");
-}
-
-static inline void __invept(int ext, u64 eptp, gpa_t gpa)
-{
- struct {
- u64 eptp, gpa;
- } operand = {eptp, gpa};
-
- asm volatile (__ex(ASM_VMX_INVEPT)
- /* CF==1 or ZF==1 --> rc = -1 */
- "; ja 1f ; ud2 ; 1:\n"
- : : "a" (&operand), "c" (ext) : "cc", "memory");
-}
-
-static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
-{
- int i;
-
- i = __find_msr_index(vmx, msr);
- if (i >= 0)
- return &vmx->guest_msrs[i];
- return NULL;
-}
-
-static void vmcs_clear(struct vmcs *vmcs)
-{
- u64 phys_addr = __pa(vmcs);
- u8 error;
-
- asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
- : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
- : "cc", "memory");
- if (error)
- printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
- vmcs, phys_addr);
-}
-
-static void __vcpu_clear(void *arg)
-{
- struct vcpu_vmx *vmx = arg;
- int cpu = raw_smp_processor_id();
-
- if (vmx->vcpu.cpu == cpu)
- vmcs_clear(vmx->vmcs);
- if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
- per_cpu(current_vmcs, cpu) = NULL;
- rdtscll(vmx->vcpu.arch.host_tsc);
- list_del(&vmx->local_vcpus_link);
- vmx->vcpu.cpu = -1;
- vmx->launched = 0;
-}
-
-static void vcpu_clear(struct vcpu_vmx *vmx)
-{
- if (vmx->vcpu.cpu == -1)
- return;
- smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1);
-}
-
-static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx)
-{
- if (vmx->vpid == 0)
- return;
-
- __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
-}
-
-static inline void ept_sync_global(void)
-{
- if (cpu_has_vmx_invept_global())
- __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
-}
-
-static inline void ept_sync_context(u64 eptp)
-{
- if (enable_ept) {
- if (cpu_has_vmx_invept_context())
- __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
- else
- ept_sync_global();
- }
-}
-
-static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
-{
- if (enable_ept) {
- if (cpu_has_vmx_invept_individual_addr())
- __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR,
- eptp, gpa);
- else
- ept_sync_context(eptp);
- }
-}
-
-static unsigned long vmcs_readl(unsigned long field)
-{
- unsigned long value;
-
- asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX)
- : "=a"(value) : "d"(field) : "cc");
- return value;
-}
-
-static u16 vmcs_read16(unsigned long field)
-{
- return vmcs_readl(field);
-}
-
-static u32 vmcs_read32(unsigned long field)
-{
- return vmcs_readl(field);
-}
-
-static u64 vmcs_read64(unsigned long field)
-{
-#ifdef CONFIG_X86_64
- return vmcs_readl(field);
-#else
- return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32);
-#endif
-}
-
-static noinline void vmwrite_error(unsigned long field, unsigned long value)
-{
- printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
- field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
- dump_stack();
-}
-
-static void vmcs_writel(unsigned long field, unsigned long value)
-{
- u8 error;
-
- asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0"
- : "=q"(error) : "a"(value), "d"(field) : "cc");
- if (unlikely(error))
- vmwrite_error(field, value);
-}
-
-static void vmcs_write16(unsigned long field, u16 value)
-{
- vmcs_writel(field, value);
-}
-
-static void vmcs_write32(unsigned long field, u32 value)
-{
- vmcs_writel(field, value);
-}
-
-static void vmcs_write64(unsigned long field, u64 value)
-{
- vmcs_writel(field, value);
-#ifndef CONFIG_X86_64
- asm volatile ("");
- vmcs_writel(field+1, value >> 32);
-#endif
-}
-
-static void vmcs_clear_bits(unsigned long field, u32 mask)
-{
- vmcs_writel(field, vmcs_readl(field) & ~mask);
-}
-
-static void vmcs_set_bits(unsigned long field, u32 mask)
-{
- vmcs_writel(field, vmcs_readl(field) | mask);
-}
-
-static void update_exception_bitmap(struct kvm_vcpu *vcpu)
-{
- u32 eb;
-
- eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR);
- if (!vcpu->fpu_active)
- eb |= 1u << NM_VECTOR;
- if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
- if (vcpu->guest_debug &
- (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
- eb |= 1u << DB_VECTOR;
- if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
- eb |= 1u << BP_VECTOR;
- }
- if (vcpu->arch.rmode.vm86_active)
- eb = ~0;
- if (enable_ept)
- eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
- vmcs_write32(EXCEPTION_BITMAP, eb);
-}
-
-static void reload_tss(void)
-{
- /*
- * VT restores TR but not its size. Useless.
- */
- struct descriptor_table gdt;
- struct desc_struct *descs;
-
- kvm_get_gdt(&gdt);
- descs = (void *)gdt.base;
- descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
- load_TR_desc();
-}
-
-static void load_transition_efer(struct vcpu_vmx *vmx)
-{
- int efer_offset = vmx->msr_offset_efer;
- u64 host_efer = vmx->host_msrs[efer_offset].data;
- u64 guest_efer = vmx->guest_msrs[efer_offset].data;
- u64 ignore_bits;
-
- if (efer_offset < 0)
- return;
- /*
- * NX is emulated; LMA and LME handled by hardware; SCE meaninless
- * outside long mode
- */
- ignore_bits = EFER_NX | EFER_SCE;
-#ifdef CONFIG_X86_64
- ignore_bits |= EFER_LMA | EFER_LME;
- /* SCE is meaningful only in long mode on Intel */
- if (guest_efer & EFER_LMA)
- ignore_bits &= ~(u64)EFER_SCE;
-#endif
- if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits))
- return;
-
- vmx->host_state.guest_efer_loaded = 1;
- guest_efer &= ~ignore_bits;
- guest_efer |= host_efer & ignore_bits;
- wrmsrl(MSR_EFER, guest_efer);
- vmx->vcpu.stat.efer_reload++;
-}
-
-static void reload_host_efer(struct vcpu_vmx *vmx)
-{
- if (vmx->host_state.guest_efer_loaded) {
- vmx->host_state.guest_efer_loaded = 0;
- load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
- }
-}
-
-static void vmx_save_host_state(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- if (vmx->host_state.loaded)
- return;
-
- vmx->host_state.loaded = 1;
- /*
- * Set host fs and gs selectors. Unfortunately, 22.2.3 does not
- * allow segment selectors with cpl > 0 or ti == 1.
- */
- vmx->host_state.ldt_sel = kvm_read_ldt();
- vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
- vmx->host_state.fs_sel = kvm_read_fs();
- if (!(vmx->host_state.fs_sel & 7)) {
- vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
- vmx->host_state.fs_reload_needed = 0;
- } else {
- vmcs_write16(HOST_FS_SELECTOR, 0);
- vmx->host_state.fs_reload_needed = 1;
- }
- vmx->host_state.gs_sel = kvm_read_gs();
- if (!(vmx->host_state.gs_sel & 7))
- vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
- else {
- vmcs_write16(HOST_GS_SELECTOR, 0);
- vmx->host_state.gs_ldt_reload_needed = 1;
- }
-
-#ifdef CONFIG_X86_64
- vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
- vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
-#else
- vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
- vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
-#endif
-
-#ifdef CONFIG_X86_64
- if (is_long_mode(&vmx->vcpu))
- save_msrs(vmx->host_msrs +
- vmx->msr_offset_kernel_gs_base, 1);
-
-#endif
- load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
- load_transition_efer(vmx);
-}
-
-static void __vmx_load_host_state(struct vcpu_vmx *vmx)
-{
- unsigned long flags;
-
- if (!vmx->host_state.loaded)
- return;
-
- ++vmx->vcpu.stat.host_state_reload;
- vmx->host_state.loaded = 0;
- if (vmx->host_state.fs_reload_needed)
- kvm_load_fs(vmx->host_state.fs_sel);
- if (vmx->host_state.gs_ldt_reload_needed) {
- kvm_load_ldt(vmx->host_state.ldt_sel);
- /*
- * If we have to reload gs, we must take care to
- * preserve our gs base.
- */
- local_irq_save(flags);
- kvm_load_gs(vmx->host_state.gs_sel);
-#ifdef CONFIG_X86_64
- wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
-#endif
- local_irq_restore(flags);
- }
- reload_tss();
- save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
- load_msrs(vmx->host_msrs, vmx->save_nmsrs);
- reload_host_efer(vmx);
-}
-
-static void vmx_load_host_state(struct vcpu_vmx *vmx)
-{
- preempt_disable();
- __vmx_load_host_state(vmx);
- preempt_enable();
-}
-
-/*
- * Switches to specified vcpu, until a matching vcpu_put(), but assumes
- * vcpu mutex is already taken.
- */
-static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- u64 phys_addr = __pa(vmx->vmcs);
- u64 tsc_this, delta, new_offset;
-
- if (vcpu->cpu != cpu) {
- vcpu_clear(vmx);
- kvm_migrate_timers(vcpu);
- vpid_sync_vcpu_all(vmx);
- local_irq_disable();
- list_add(&vmx->local_vcpus_link,
- &per_cpu(vcpus_on_cpu, cpu));
- local_irq_enable();
- }
-
- if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
- u8 error;
-
- per_cpu(current_vmcs, cpu) = vmx->vmcs;
- asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
- : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
- : "cc");
- if (error)
- printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
- vmx->vmcs, phys_addr);
- }
-
- if (vcpu->cpu != cpu) {
- struct descriptor_table dt;
- unsigned long sysenter_esp;
-
- vcpu->cpu = cpu;
- /*
- * Linux uses per-cpu TSS and GDT, so set these when switching
- * processors.
- */
- vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
- kvm_get_gdt(&dt);
- vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */
-
- rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
- vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
-
- /*
- * Make sure the time stamp counter is monotonous.
- */
- rdtscll(tsc_this);
- if (tsc_this < vcpu->arch.host_tsc) {
- delta = vcpu->arch.host_tsc - tsc_this;
- new_offset = vmcs_read64(TSC_OFFSET) + delta;
- vmcs_write64(TSC_OFFSET, new_offset);
- }
- }
-}
-
-static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
-{
- __vmx_load_host_state(to_vmx(vcpu));
-}
-
-static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
-{
- if (vcpu->fpu_active)
- return;
- vcpu->fpu_active = 1;
- vmcs_clear_bits(GUEST_CR0, X86_CR0_TS);
- if (vcpu->arch.cr0 & X86_CR0_TS)
- vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
- update_exception_bitmap(vcpu);
-}
-
-static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
-{
- if (!vcpu->fpu_active)
- return;
- vcpu->fpu_active = 0;
- vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
- update_exception_bitmap(vcpu);
-}
-
-static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
-{
- return vmcs_readl(GUEST_RFLAGS);
-}
-
-static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
-{
- if (vcpu->arch.rmode.vm86_active)
- rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
- vmcs_writel(GUEST_RFLAGS, rflags);
-}
-
-static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
-{
- u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
- int ret = 0;
-
- if (interruptibility & GUEST_INTR_STATE_STI)
- ret |= X86_SHADOW_INT_STI;
- if (interruptibility & GUEST_INTR_STATE_MOV_SS)
- ret |= X86_SHADOW_INT_MOV_SS;
-
- return ret & mask;
-}
-
-static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
-{
- u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
- u32 interruptibility = interruptibility_old;
-
- interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
-
- if (mask & X86_SHADOW_INT_MOV_SS)
- interruptibility |= GUEST_INTR_STATE_MOV_SS;
- if (mask & X86_SHADOW_INT_STI)
- interruptibility |= GUEST_INTR_STATE_STI;
-
- if ((interruptibility != interruptibility_old))
- vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
-}
-
-static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
-{
- unsigned long rip;
-
- rip = kvm_rip_read(vcpu);
- rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
- kvm_rip_write(vcpu, rip);
-
- /* skipping an emulated instruction also counts */
- vmx_set_interrupt_shadow(vcpu, 0);
-}
-
-static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
- bool has_error_code, u32 error_code)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- u32 intr_info = nr | INTR_INFO_VALID_MASK;
-
- if (has_error_code) {
- vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
- intr_info |= INTR_INFO_DELIVER_CODE_MASK;
- }
-
- if (vcpu->arch.rmode.vm86_active) {
- vmx->rmode.irq.pending = true;
- vmx->rmode.irq.vector = nr;
- vmx->rmode.irq.rip = kvm_rip_read(vcpu);
- if (nr == BP_VECTOR || nr == OF_VECTOR)
- vmx->rmode.irq.rip++;
- intr_info |= INTR_TYPE_SOFT_INTR;
- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
- vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
- kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
- return;
- }
-
- if (kvm_exception_is_soft(nr)) {
- vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
- vmx->vcpu.arch.event_exit_inst_len);
- intr_info |= INTR_TYPE_SOFT_EXCEPTION;
- } else
- intr_info |= INTR_TYPE_HARD_EXCEPTION;
-
- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
-}
-
-/*
- * Swap MSR entry in host/guest MSR entry array.
- */
-#ifdef CONFIG_X86_64
-static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
-{
- struct kvm_msr_entry tmp;
-
- tmp = vmx->guest_msrs[to];
- vmx->guest_msrs[to] = vmx->guest_msrs[from];
- vmx->guest_msrs[from] = tmp;
- tmp = vmx->host_msrs[to];
- vmx->host_msrs[to] = vmx->host_msrs[from];
- vmx->host_msrs[from] = tmp;
-}
-#endif
-
-/*
- * Set up the vmcs to automatically save and restore system
- * msrs. Don't touch the 64-bit msrs if the guest is in legacy
- * mode, as fiddling with msrs is very expensive.
- */
-static void setup_msrs(struct vcpu_vmx *vmx)
-{
- int save_nmsrs;
- unsigned long *msr_bitmap;
-
- vmx_load_host_state(vmx);
- save_nmsrs = 0;
-#ifdef CONFIG_X86_64
- if (is_long_mode(&vmx->vcpu)) {
- int index;
-
- index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
- if (index >= 0)
- move_msr_up(vmx, index, save_nmsrs++);
- index = __find_msr_index(vmx, MSR_LSTAR);
- if (index >= 0)
- move_msr_up(vmx, index, save_nmsrs++);
- index = __find_msr_index(vmx, MSR_CSTAR);
- if (index >= 0)
- move_msr_up(vmx, index, save_nmsrs++);
- index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
- if (index >= 0)
- move_msr_up(vmx, index, save_nmsrs++);
- /*
- * MSR_K6_STAR is only needed on long mode guests, and only
- * if efer.sce is enabled.
- */
- index = __find_msr_index(vmx, MSR_K6_STAR);
- if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE))
- move_msr_up(vmx, index, save_nmsrs++);
- }
-#endif
- vmx->save_nmsrs = save_nmsrs;
-
-#ifdef CONFIG_X86_64
- vmx->msr_offset_kernel_gs_base =
- __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
-#endif
- vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
-
- if (cpu_has_vmx_msr_bitmap()) {
- if (is_long_mode(&vmx->vcpu))
- msr_bitmap = vmx_msr_bitmap_longmode;
- else
- msr_bitmap = vmx_msr_bitmap_legacy;
-
- vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
- }
-}
-
-/*
- * reads and returns guest's timestamp counter "register"
- * guest_tsc = host_tsc + tsc_offset -- 21.3
- */
-static u64 guest_read_tsc(void)
-{
- u64 host_tsc, tsc_offset;
-
- rdtscll(host_tsc);
- tsc_offset = vmcs_read64(TSC_OFFSET);
- return host_tsc + tsc_offset;
-}
-
-/*
- * writes 'guest_tsc' into guest's timestamp counter "register"
- * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc
- */
-static void guest_write_tsc(u64 guest_tsc, u64 host_tsc)
-{
- vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc);
-}
-
-/*
- * Reads an msr value (of 'msr_index') into 'pdata'.
- * Returns 0 on success, non-0 otherwise.
- * Assumes vcpu_load() was already called.
- */
-static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
-{
- u64 data;
- struct kvm_msr_entry *msr;
-
- if (!pdata) {
- printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
- return -EINVAL;
- }
-
- switch (msr_index) {
-#ifdef CONFIG_X86_64
- case MSR_FS_BASE:
- data = vmcs_readl(GUEST_FS_BASE);
- break;
- case MSR_GS_BASE:
- data = vmcs_readl(GUEST_GS_BASE);
- break;
- case MSR_EFER:
- return kvm_get_msr_common(vcpu, msr_index, pdata);
-#endif
- case MSR_IA32_TIME_STAMP_COUNTER:
- data = guest_read_tsc();
- break;
- case MSR_IA32_SYSENTER_CS:
- data = vmcs_read32(GUEST_SYSENTER_CS);
- break;
- case MSR_IA32_SYSENTER_EIP:
- data = vmcs_readl(GUEST_SYSENTER_EIP);
- break;
- case MSR_IA32_SYSENTER_ESP:
- data = vmcs_readl(GUEST_SYSENTER_ESP);
- break;
- default:
- vmx_load_host_state(to_vmx(vcpu));
- msr = find_msr_entry(to_vmx(vcpu), msr_index);
- if (msr) {
- data = msr->data;
- break;
- }
- return kvm_get_msr_common(vcpu, msr_index, pdata);
- }
-
- *pdata = data;
- return 0;
-}
-
-/*
- * Writes msr value into into the appropriate "register".
- * Returns 0 on success, non-0 otherwise.
- * Assumes vcpu_load() was already called.
- */
-static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct kvm_msr_entry *msr;
- u64 host_tsc;
- int ret = 0;
-
- switch (msr_index) {
- case MSR_EFER:
- vmx_load_host_state(vmx);
- ret = kvm_set_msr_common(vcpu, msr_index, data);
- break;
-#ifdef CONFIG_X86_64
- case MSR_FS_BASE:
- vmcs_writel(GUEST_FS_BASE, data);
- break;
- case MSR_GS_BASE:
- vmcs_writel(GUEST_GS_BASE, data);
- break;
-#endif
- case MSR_IA32_SYSENTER_CS:
- vmcs_write32(GUEST_SYSENTER_CS, data);
- break;
- case MSR_IA32_SYSENTER_EIP:
- vmcs_writel(GUEST_SYSENTER_EIP, data);
- break;
- case MSR_IA32_SYSENTER_ESP:
- vmcs_writel(GUEST_SYSENTER_ESP, data);
- break;
- case MSR_IA32_TIME_STAMP_COUNTER:
- rdtscll(host_tsc);
- guest_write_tsc(data, host_tsc);
- break;
- case MSR_P6_PERFCTR0:
- case MSR_P6_PERFCTR1:
- case MSR_P6_EVNTSEL0:
- case MSR_P6_EVNTSEL1:
- /*
- * Just discard all writes to the performance counters; this
- * should keep both older linux and windows 64-bit guests
- * happy
- */
- pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data);
-
- break;
- case MSR_IA32_CR_PAT:
- if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
- vmcs_write64(GUEST_IA32_PAT, data);
- vcpu->arch.pat = data;
- break;
- }
- /* Otherwise falls through to kvm_set_msr_common */
- default:
- vmx_load_host_state(vmx);
- msr = find_msr_entry(vmx, msr_index);
- if (msr) {
- msr->data = data;
- break;
- }
- ret = kvm_set_msr_common(vcpu, msr_index, data);
- }
-
- return ret;
-}
-
-static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
-{
- __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
- switch (reg) {
- case VCPU_REGS_RSP:
- vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
- break;
- case VCPU_REGS_RIP:
- vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
- break;
- default:
- break;
- }
-}
-
-static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
-{
- int old_debug = vcpu->guest_debug;
- unsigned long flags;
-
- vcpu->guest_debug = dbg->control;
- if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
- vcpu->guest_debug = 0;
-
- if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
- vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]);
- else
- vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
-
- flags = vmcs_readl(GUEST_RFLAGS);
- if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
- flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
- else if (old_debug & KVM_GUESTDBG_SINGLESTEP)
- flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
- vmcs_writel(GUEST_RFLAGS, flags);
-
- update_exception_bitmap(vcpu);
-
- return 0;
-}
-
-static __init int cpu_has_kvm_support(void)
-{
- return cpu_has_vmx();
-}
-
-static __init int vmx_disabled_by_bios(void)
-{
- u64 msr;
-
- rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
- return (msr & (FEATURE_CONTROL_LOCKED |
- FEATURE_CONTROL_VMXON_ENABLED))
- == FEATURE_CONTROL_LOCKED;
- /* locked but not enabled */
-}
-
-static void hardware_enable(void *garbage)
-{
- int cpu = raw_smp_processor_id();
- u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
- u64 old;
-
- INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
- rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
- if ((old & (FEATURE_CONTROL_LOCKED |
- FEATURE_CONTROL_VMXON_ENABLED))
- != (FEATURE_CONTROL_LOCKED |
- FEATURE_CONTROL_VMXON_ENABLED))
- /* enable and lock */
- wrmsrl(MSR_IA32_FEATURE_CONTROL, old |
- FEATURE_CONTROL_LOCKED |
- FEATURE_CONTROL_VMXON_ENABLED);
- write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
- asm volatile (ASM_VMX_VMXON_RAX
- : : "a"(&phys_addr), "m"(phys_addr)
- : "memory", "cc");
-}
-
-static void vmclear_local_vcpus(void)
-{
- int cpu = raw_smp_processor_id();
- struct vcpu_vmx *vmx, *n;
-
- list_for_each_entry_safe(vmx, n, &per_cpu(vcpus_on_cpu, cpu),
- local_vcpus_link)
- __vcpu_clear(vmx);
-}
-
-
-/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
- * tricks.
- */
-static void kvm_cpu_vmxoff(void)
-{
- asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
- write_cr4(read_cr4() & ~X86_CR4_VMXE);
-}
-
-static void hardware_disable(void *garbage)
-{
- vmclear_local_vcpus();
- kvm_cpu_vmxoff();
-}
-
-static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
- u32 msr, u32 *result)
-{
- u32 vmx_msr_low, vmx_msr_high;
- u32 ctl = ctl_min | ctl_opt;
-
- rdmsr(msr, vmx_msr_low, vmx_msr_high);
-
- ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
- ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */
-
- /* Ensure minimum (required) set of control bits are supported. */
- if (ctl_min & ~ctl)
- return -EIO;
-
- *result = ctl;
- return 0;
-}
-
-static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
-{
- u32 vmx_msr_low, vmx_msr_high;
- u32 min, opt, min2, opt2;
- u32 _pin_based_exec_control = 0;
- u32 _cpu_based_exec_control = 0;
- u32 _cpu_based_2nd_exec_control = 0;
- u32 _vmexit_control = 0;
- u32 _vmentry_control = 0;
-
- min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
- opt = PIN_BASED_VIRTUAL_NMIS;
- if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
- &_pin_based_exec_control) < 0)
- return -EIO;
-
- min = CPU_BASED_HLT_EXITING |
-#ifdef CONFIG_X86_64
- CPU_BASED_CR8_LOAD_EXITING |
- CPU_BASED_CR8_STORE_EXITING |
-#endif
- CPU_BASED_CR3_LOAD_EXITING |
- CPU_BASED_CR3_STORE_EXITING |
- CPU_BASED_USE_IO_BITMAPS |
- CPU_BASED_MOV_DR_EXITING |
- CPU_BASED_USE_TSC_OFFSETING |
- CPU_BASED_INVLPG_EXITING;
- opt = CPU_BASED_TPR_SHADOW |
- CPU_BASED_USE_MSR_BITMAPS |
- CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
- if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
- &_cpu_based_exec_control) < 0)
- return -EIO;
-#ifdef CONFIG_X86_64
- if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
- _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
- ~CPU_BASED_CR8_STORE_EXITING;
-#endif
- if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
- min2 = 0;
- opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
- SECONDARY_EXEC_WBINVD_EXITING |
- SECONDARY_EXEC_ENABLE_VPID |
- SECONDARY_EXEC_ENABLE_EPT;
- if (adjust_vmx_controls(min2, opt2,
- MSR_IA32_VMX_PROCBASED_CTLS2,
- &_cpu_based_2nd_exec_control) < 0)
- return -EIO;
- }
-#ifndef CONFIG_X86_64
- if (!(_cpu_based_2nd_exec_control &
- SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
- _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
-#endif
- if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
- /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
- enabled */
- min &= ~(CPU_BASED_CR3_LOAD_EXITING |
- CPU_BASED_CR3_STORE_EXITING |
- CPU_BASED_INVLPG_EXITING);
- if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
- &_cpu_based_exec_control) < 0)
- return -EIO;
- rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
- vmx_capability.ept, vmx_capability.vpid);
- }
-
- min = 0;
-#ifdef CONFIG_X86_64
- min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
-#endif
- opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT;
- if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
- &_vmexit_control) < 0)
- return -EIO;
-
- min = 0;
- opt = VM_ENTRY_LOAD_IA32_PAT;
- if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
- &_vmentry_control) < 0)
- return -EIO;
-
- rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
-
- /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
- if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
- return -EIO;
-
-#ifdef CONFIG_X86_64
- /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
- if (vmx_msr_high & (1u<<16))
- return -EIO;
-#endif
-
- /* Require Write-Back (WB) memory type for VMCS accesses. */
- if (((vmx_msr_high >> 18) & 15) != 6)
- return -EIO;
-
- vmcs_conf->size = vmx_msr_high & 0x1fff;
- vmcs_conf->order = get_order(vmcs_config.size);
- vmcs_conf->revision_id = vmx_msr_low;
-
- vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
- vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
- vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
- vmcs_conf->vmexit_ctrl = _vmexit_control;
- vmcs_conf->vmentry_ctrl = _vmentry_control;
-
- return 0;
-}
-
-static struct vmcs *alloc_vmcs_cpu(int cpu)
-{
- int node = cpu_to_node(cpu);
- struct page *pages;
- struct vmcs *vmcs;
-
- pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order);
- if (!pages)
- return NULL;
- vmcs = page_address(pages);
- memset(vmcs, 0, vmcs_config.size);
- vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */
- return vmcs;
-}
-
-static struct vmcs *alloc_vmcs(void)
-{
- return alloc_vmcs_cpu(raw_smp_processor_id());
-}
-
-static void free_vmcs(struct vmcs *vmcs)
-{
- free_pages((unsigned long)vmcs, vmcs_config.order);
-}
-
-static void free_kvm_area(void)
-{
- int cpu;
-
- for_each_online_cpu(cpu)
- free_vmcs(per_cpu(vmxarea, cpu));
-}
-
-static __init int alloc_kvm_area(void)
-{
- int cpu;
-
- for_each_online_cpu(cpu) {
- struct vmcs *vmcs;
-
- vmcs = alloc_vmcs_cpu(cpu);
- if (!vmcs) {
- free_kvm_area();
- return -ENOMEM;
- }
-
- per_cpu(vmxarea, cpu) = vmcs;
- }
- return 0;
-}
-
-static __init int hardware_setup(void)
-{
- if (setup_vmcs_config(&vmcs_config) < 0)
- return -EIO;
-
- if (boot_cpu_has(X86_FEATURE_NX))
- kvm_enable_efer_bits(EFER_NX);
-
- if (!cpu_has_vmx_vpid())
- enable_vpid = 0;
-
- if (!cpu_has_vmx_ept())
- enable_ept = 0;
-
- if (!cpu_has_vmx_flexpriority())
- flexpriority_enabled = 0;
-
- if (!cpu_has_vmx_tpr_shadow())
- kvm_x86_ops->update_cr8_intercept = NULL;
-
- return alloc_kvm_area();
-}
-
-static __exit void hardware_unsetup(void)
-{
- free_kvm_area();
-}
-
-static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
-{
- struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
-
- if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) {
- vmcs_write16(sf->selector, save->selector);
- vmcs_writel(sf->base, save->base);
- vmcs_write32(sf->limit, save->limit);
- vmcs_write32(sf->ar_bytes, save->ar);
- } else {
- u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK)
- << AR_DPL_SHIFT;
- vmcs_write32(sf->ar_bytes, 0x93 | dpl);
- }
-}
-
-static void enter_pmode(struct kvm_vcpu *vcpu)
-{
- unsigned long flags;
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- vmx->emulation_required = 1;
- vcpu->arch.rmode.vm86_active = 0;
-
- vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
- vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit);
- vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar);
-
- flags = vmcs_readl(GUEST_RFLAGS);
- flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
- flags |= (vcpu->arch.rmode.save_iopl << IOPL_SHIFT);
- vmcs_writel(GUEST_RFLAGS, flags);
-
- vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
- (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
-
- update_exception_bitmap(vcpu);
-
- if (emulate_invalid_guest_state)
- return;
-
- fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
- fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
- fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
- fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
-
- vmcs_write16(GUEST_SS_SELECTOR, 0);
- vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
-
- vmcs_write16(GUEST_CS_SELECTOR,
- vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
- vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
-}
-
-static gva_t rmode_tss_base(struct kvm *kvm)
-{
- if (!kvm->arch.tss_addr) {
- gfn_t base_gfn = kvm->memslots[0].base_gfn +
- kvm->memslots[0].npages - 3;
- return base_gfn << PAGE_SHIFT;
- }
- return kvm->arch.tss_addr;
-}
-
-static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
-{
- struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
-
- save->selector = vmcs_read16(sf->selector);
- save->base = vmcs_readl(sf->base);
- save->limit = vmcs_read32(sf->limit);
- save->ar = vmcs_read32(sf->ar_bytes);
- vmcs_write16(sf->selector, save->base >> 4);
- vmcs_write32(sf->base, save->base & 0xfffff);
- vmcs_write32(sf->limit, 0xffff);
- vmcs_write32(sf->ar_bytes, 0xf3);
-}
-
-static void enter_rmode(struct kvm_vcpu *vcpu)
-{
- unsigned long flags;
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- vmx->emulation_required = 1;
- vcpu->arch.rmode.vm86_active = 1;
-
- vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
- vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
-
- vcpu->arch.rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
- vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
-
- vcpu->arch.rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
- vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
-
- flags = vmcs_readl(GUEST_RFLAGS);
- vcpu->arch.rmode.save_iopl
- = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
-
- flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
-
- vmcs_writel(GUEST_RFLAGS, flags);
- vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
- update_exception_bitmap(vcpu);
-
- if (emulate_invalid_guest_state)
- goto continue_rmode;
-
- vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
- vmcs_write32(GUEST_SS_LIMIT, 0xffff);
- vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
-
- vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
- vmcs_write32(GUEST_CS_LIMIT, 0xffff);
- if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
- vmcs_writel(GUEST_CS_BASE, 0xf0000);
- vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
-
- fix_rmode_seg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
- fix_rmode_seg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
- fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
- fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
-
-continue_rmode:
- kvm_mmu_reset_context(vcpu);
- init_rmode(vcpu->kvm);
-}
-
-static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
-
- vcpu->arch.shadow_efer = efer;
- if (!msr)
- return;
- if (efer & EFER_LMA) {
- vmcs_write32(VM_ENTRY_CONTROLS,
- vmcs_read32(VM_ENTRY_CONTROLS) |
- VM_ENTRY_IA32E_MODE);
- msr->data = efer;
- } else {
- vmcs_write32(VM_ENTRY_CONTROLS,
- vmcs_read32(VM_ENTRY_CONTROLS) &
- ~VM_ENTRY_IA32E_MODE);
-
- msr->data = efer & ~EFER_LME;
- }
- setup_msrs(vmx);
-}
-
-#ifdef CONFIG_X86_64
-
-static void enter_lmode(struct kvm_vcpu *vcpu)
-{
- u32 guest_tr_ar;
-
- guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
- if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
- printk(KERN_DEBUG "%s: tss fixup for long mode. \n",
- __func__);
- vmcs_write32(GUEST_TR_AR_BYTES,
- (guest_tr_ar & ~AR_TYPE_MASK)
- | AR_TYPE_BUSY_64_TSS);
- }
- vcpu->arch.shadow_efer |= EFER_LMA;
- vmx_set_efer(vcpu, vcpu->arch.shadow_efer);
-}
-
-static void exit_lmode(struct kvm_vcpu *vcpu)
-{
- vcpu->arch.shadow_efer &= ~EFER_LMA;
-
- vmcs_write32(VM_ENTRY_CONTROLS,
- vmcs_read32(VM_ENTRY_CONTROLS)
- & ~VM_ENTRY_IA32E_MODE);
-}
-
-#endif
-
-static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
-{
- vpid_sync_vcpu_all(to_vmx(vcpu));
- if (enable_ept)
- ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
-}
-
-static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
-{
- vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK;
- vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
-}
-
-static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
-{
- if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
- if (!load_pdptrs(vcpu, vcpu->arch.cr3)) {
- printk(KERN_ERR "EPT: Fail to load pdptrs!\n");
- return;
- }
- vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]);
- vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]);
- vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]);
- vmcs_write64(GUEST_PDPTR3, vcpu->arch.pdptrs[3]);
- }
-}
-
-static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
-
-static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
- unsigned long cr0,
- struct kvm_vcpu *vcpu)
-{
- if (!(cr0 & X86_CR0_PG)) {
- /* From paging/starting to nonpaging */
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
- vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) |
- (CPU_BASED_CR3_LOAD_EXITING |
- CPU_BASED_CR3_STORE_EXITING));
- vcpu->arch.cr0 = cr0;
- vmx_set_cr4(vcpu, vcpu->arch.cr4);
- *hw_cr0 |= X86_CR0_PE | X86_CR0_PG;
- *hw_cr0 &= ~X86_CR0_WP;
- } else if (!is_paging(vcpu)) {
- /* From nonpaging to paging */
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
- vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
- ~(CPU_BASED_CR3_LOAD_EXITING |
- CPU_BASED_CR3_STORE_EXITING));
- vcpu->arch.cr0 = cr0;
- vmx_set_cr4(vcpu, vcpu->arch.cr4);
- if (!(vcpu->arch.cr0 & X86_CR0_WP))
- *hw_cr0 &= ~X86_CR0_WP;
- }
-}
-
-static void ept_update_paging_mode_cr4(unsigned long *hw_cr4,
- struct kvm_vcpu *vcpu)
-{
- if (!is_paging(vcpu)) {
- *hw_cr4 &= ~X86_CR4_PAE;
- *hw_cr4 |= X86_CR4_PSE;
- } else if (!(vcpu->arch.cr4 & X86_CR4_PAE))
- *hw_cr4 &= ~X86_CR4_PAE;
-}
-
-static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
-{
- unsigned long hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) |
- KVM_VM_CR0_ALWAYS_ON;
-
- vmx_fpu_deactivate(vcpu);
-
- if (vcpu->arch.rmode.vm86_active && (cr0 & X86_CR0_PE))
- enter_pmode(vcpu);
-
- if (!vcpu->arch.rmode.vm86_active && !(cr0 & X86_CR0_PE))
- enter_rmode(vcpu);
-
-#ifdef CONFIG_X86_64
- if (vcpu->arch.shadow_efer & EFER_LME) {
- if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
- enter_lmode(vcpu);
- if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
- exit_lmode(vcpu);
- }
-#endif
-
- if (enable_ept)
- ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
-
- vmcs_writel(CR0_READ_SHADOW, cr0);
- vmcs_writel(GUEST_CR0, hw_cr0);
- vcpu->arch.cr0 = cr0;
-
- if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE))
- vmx_fpu_activate(vcpu);
-}
-
-static u64 construct_eptp(unsigned long root_hpa)
-{
- u64 eptp;
-
- /* TODO: write the value read from the MSR */
- eptp = VMX_EPT_DEFAULT_MT |
- VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
- eptp |= (root_hpa & PAGE_MASK);
-
- return eptp;
-}
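The EPTP value built above packs three fields: the memory type in bits 2:0, the guest address width in bits 5:3, and the page-aligned root of the EPT structures in the upper bits. A minimal worked example, assuming the usual constants of this code base (VMX_EPT_DEFAULT_MT = 6 for write-back, VMX_EPT_DEFAULT_GAW = 3 for a four-level walk, VMX_EPT_GAW_EPTP_SHIFT = 3); the root HPA is a made-up value:

	eptp = VMX_EPT_DEFAULT_MT                              /* 0x06     */
	     | (VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT) /* 0x18     */
	     | (0x12345000UL & PAGE_MASK);                     /* root_hpa */
	/* eptp == 0x1234501e */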
-
-static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
-{
- unsigned long guest_cr3;
- u64 eptp;
-
- guest_cr3 = cr3;
- if (enable_ept) {
- eptp = construct_eptp(cr3);
- vmcs_write64(EPT_POINTER, eptp);
- ept_sync_context(eptp);
- ept_load_pdptrs(vcpu);
- guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 :
- VMX_EPT_IDENTITY_PAGETABLE_ADDR;
- }
-
- vmx_flush_tlb(vcpu);
- vmcs_writel(GUEST_CR3, guest_cr3);
- if (vcpu->arch.cr0 & X86_CR0_PE)
- vmx_fpu_deactivate(vcpu);
-}
-
-static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
-{
- unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.vm86_active ?
- KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
-
- vcpu->arch.cr4 = cr4;
- if (enable_ept)
- ept_update_paging_mode_cr4(&hw_cr4, vcpu);
-
- vmcs_writel(CR4_READ_SHADOW, cr4);
- vmcs_writel(GUEST_CR4, hw_cr4);
-}
-
-static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
-{
- struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
-
- return vmcs_readl(sf->base);
-}
-
-static void vmx_get_segment(struct kvm_vcpu *vcpu,
- struct kvm_segment *var, int seg)
-{
- struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
- u32 ar;
-
- var->base = vmcs_readl(sf->base);
- var->limit = vmcs_read32(sf->limit);
- var->selector = vmcs_read16(sf->selector);
- ar = vmcs_read32(sf->ar_bytes);
- if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state)
- ar = 0;
- var->type = ar & 15;
- var->s = (ar >> 4) & 1;
- var->dpl = (ar >> 5) & 3;
- var->present = (ar >> 7) & 1;
- var->avl = (ar >> 12) & 1;
- var->l = (ar >> 13) & 1;
- var->db = (ar >> 14) & 1;
- var->g = (ar >> 15) & 1;
- var->unusable = (ar >> 16) & 1;
-}
-
-static int vmx_get_cpl(struct kvm_vcpu *vcpu)
-{
- struct kvm_segment kvm_seg;
-
- if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */
- return 0;
-
- if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */
- return 3;
-
- vmx_get_segment(vcpu, &kvm_seg, VCPU_SREG_CS);
- return kvm_seg.selector & 3;
-}
-
-static u32 vmx_segment_access_rights(struct kvm_segment *var)
-{
- u32 ar;
-
- if (var->unusable)
- ar = 1 << 16;
- else {
- ar = var->type & 15;
- ar |= (var->s & 1) << 4;
- ar |= (var->dpl & 3) << 5;
- ar |= (var->present & 1) << 7;
- ar |= (var->avl & 1) << 12;
- ar |= (var->l & 1) << 13;
- ar |= (var->db & 1) << 14;
- ar |= (var->g & 1) << 15;
- }
- if (ar == 0) /* a 0 value means unusable */
- ar = AR_UNUSABLE_MASK;
-
- return ar;
-}
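For reference, the access-rights value packed here and unpacked in vmx_get_segment() mirrors the segment descriptor attribute bits, plus a VMX-specific "unusable" flag; a sketch of the layout as the two functions use it:

	/* bits  3:0  type            bit 12  AVL
	 * bit     4  S               bit 13  L (64-bit code)
	 * bits  6:5  DPL             bit 14  D/B
	 * bit     7  P (present)     bit 15  G (granularity)
	 *                            bit 16  unusable (VMX-only)
	 */

So the 0xf3 used for real-mode data segments decodes as type 3 (read/write, accessed), S=1, DPL=3, present, and the 0x9b used for the protected-mode CS decodes as type 0xb (execute/read, accessed), S=1, DPL=0, present.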
-
-static void vmx_set_segment(struct kvm_vcpu *vcpu,
- struct kvm_segment *var, int seg)
-{
- struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
- u32 ar;
-
- if (vcpu->arch.rmode.vm86_active && seg == VCPU_SREG_TR) {
- vcpu->arch.rmode.tr.selector = var->selector;
- vcpu->arch.rmode.tr.base = var->base;
- vcpu->arch.rmode.tr.limit = var->limit;
- vcpu->arch.rmode.tr.ar = vmx_segment_access_rights(var);
- return;
- }
- vmcs_writel(sf->base, var->base);
- vmcs_write32(sf->limit, var->limit);
- vmcs_write16(sf->selector, var->selector);
- if (vcpu->arch.rmode.vm86_active && var->s) {
- /*
- * Hack real-mode segments into vm86 compatibility.
- */
- if (var->base == 0xffff0000 && var->selector == 0xf000)
- vmcs_writel(sf->base, 0xf0000);
- ar = 0xf3;
- } else
- ar = vmx_segment_access_rights(var);
- vmcs_write32(sf->ar_bytes, ar);
-}
-
-static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
-{
- u32 ar = vmcs_read32(GUEST_CS_AR_BYTES);
-
- *db = (ar >> 14) & 1;
- *l = (ar >> 13) & 1;
-}
-
-static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
-{
- dt->limit = vmcs_read32(GUEST_IDTR_LIMIT);
- dt->base = vmcs_readl(GUEST_IDTR_BASE);
-}
-
-static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
-{
- vmcs_write32(GUEST_IDTR_LIMIT, dt->limit);
- vmcs_writel(GUEST_IDTR_BASE, dt->base);
-}
-
-static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
-{
- dt->limit = vmcs_read32(GUEST_GDTR_LIMIT);
- dt->base = vmcs_readl(GUEST_GDTR_BASE);
-}
-
-static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
-{
- vmcs_write32(GUEST_GDTR_LIMIT, dt->limit);
- vmcs_writel(GUEST_GDTR_BASE, dt->base);
-}
-
-static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
-{
- struct kvm_segment var;
- u32 ar;
-
- vmx_get_segment(vcpu, &var, seg);
- ar = vmx_segment_access_rights(&var);
-
- if (var.base != (var.selector << 4))
- return false;
- if (var.limit != 0xffff)
- return false;
- if (ar != 0xf3)
- return false;
-
- return true;
-}
-
-static bool code_segment_valid(struct kvm_vcpu *vcpu)
-{
- struct kvm_segment cs;
- unsigned int cs_rpl;
-
- vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
- cs_rpl = cs.selector & SELECTOR_RPL_MASK;
-
- if (cs.unusable)
- return false;
- if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK))
- return false;
- if (!cs.s)
- return false;
- if (cs.type & AR_TYPE_WRITEABLE_MASK) {
- if (cs.dpl > cs_rpl)
- return false;
- } else {
- if (cs.dpl != cs_rpl)
- return false;
- }
- if (!cs.present)
- return false;
-
- /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
- return true;
-}
-
-static bool stack_segment_valid(struct kvm_vcpu *vcpu)
-{
- struct kvm_segment ss;
- unsigned int ss_rpl;
-
- vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
- ss_rpl = ss.selector & SELECTOR_RPL_MASK;
-
- if (ss.unusable)
- return true;
- if (ss.type != 3 && ss.type != 7)
- return false;
- if (!ss.s)
- return false;
- if (ss.dpl != ss_rpl) /* DPL != RPL */
- return false;
- if (!ss.present)
- return false;
-
- return true;
-}
-
-static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
-{
- struct kvm_segment var;
- unsigned int rpl;
-
- vmx_get_segment(vcpu, &var, seg);
- rpl = var.selector & SELECTOR_RPL_MASK;
-
- if (var.unusable)
- return true;
- if (!var.s)
- return false;
- if (!var.present)
- return false;
- if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) {
- if (var.dpl < rpl) /* DPL < RPL */
- return false;
- }
-
- /* TODO: Add other members to kvm_segment_field to allow checking for other access
- * rights flags
- */
- return true;
-}
-
-static bool tr_valid(struct kvm_vcpu *vcpu)
-{
- struct kvm_segment tr;
-
- vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
-
- if (tr.unusable)
- return false;
- if (tr.selector & SELECTOR_TI_MASK) /* TI = 1 */
- return false;
- if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
- return false;
- if (!tr.present)
- return false;
-
- return true;
-}
-
-static bool ldtr_valid(struct kvm_vcpu *vcpu)
-{
- struct kvm_segment ldtr;
-
- vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
-
- if (ldtr.unusable)
- return true;
- if (ldtr.selector & SELECTOR_TI_MASK) /* TI = 1 */
- return false;
- if (ldtr.type != 2)
- return false;
- if (!ldtr.present)
- return false;
-
- return true;
-}
-
-static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
-{
- struct kvm_segment cs, ss;
-
- vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
- vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
-
- return ((cs.selector & SELECTOR_RPL_MASK) ==
- (ss.selector & SELECTOR_RPL_MASK));
-}
-
-/*
- * Check if guest state is valid. Returns true if valid, false if
- * not.
- * We assume that registers are always usable
- */
-static bool guest_state_valid(struct kvm_vcpu *vcpu)
-{
- /* real mode guest state checks */
- if (!(vcpu->arch.cr0 & X86_CR0_PE)) {
- if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
- return false;
- if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
- return false;
- if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
- return false;
- if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
- return false;
- if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
- return false;
- if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
- return false;
- } else {
- /* protected mode guest state checks */
- if (!cs_ss_rpl_check(vcpu))
- return false;
- if (!code_segment_valid(vcpu))
- return false;
- if (!stack_segment_valid(vcpu))
- return false;
- if (!data_segment_valid(vcpu, VCPU_SREG_DS))
- return false;
- if (!data_segment_valid(vcpu, VCPU_SREG_ES))
- return false;
- if (!data_segment_valid(vcpu, VCPU_SREG_FS))
- return false;
- if (!data_segment_valid(vcpu, VCPU_SREG_GS))
- return false;
- if (!tr_valid(vcpu))
- return false;
- if (!ldtr_valid(vcpu))
- return false;
- }
- /* TODO:
- * - Add checks on RIP
- * - Add checks on RFLAGS
- */
-
- return true;
-}
-
-static int init_rmode_tss(struct kvm *kvm)
-{
- gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
- u16 data = 0;
- int ret = 0;
- int r;
-
- r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
- if (r < 0)
- goto out;
- data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
- r = kvm_write_guest_page(kvm, fn++, &data,
- TSS_IOPB_BASE_OFFSET, sizeof(u16));
- if (r < 0)
- goto out;
- r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
- if (r < 0)
- goto out;
- r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
- if (r < 0)
- goto out;
- data = ~0;
- r = kvm_write_guest_page(kvm, fn, &data,
- RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
- sizeof(u8));
- if (r < 0)
- goto out;
-
- ret = 1;
-out:
- return ret;
-}
-
-static int init_rmode_identity_map(struct kvm *kvm)
-{
- int i, r, ret;
- pfn_t identity_map_pfn;
- u32 tmp;
-
- if (!enable_ept)
- return 1;
- if (unlikely(!kvm->arch.ept_identity_pagetable)) {
- printk(KERN_ERR "EPT: identity-mapping pagetable "
- "haven't been allocated!\n");
- return 0;
- }
- if (likely(kvm->arch.ept_identity_pagetable_done))
- return 1;
- ret = 0;
- identity_map_pfn = VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT;
- r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
- if (r < 0)
- goto out;
- /* Set up identity-mapping pagetable for EPT in real mode */
- for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
- tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
- _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
- r = kvm_write_guest_page(kvm, identity_map_pfn,
- &tmp, i * sizeof(tmp), sizeof(tmp));
- if (r < 0)
- goto out;
- }
- kvm->arch.ept_identity_pagetable_done = true;
- ret = 1;
-out:
- return ret;
-}
-
-static void seg_setup(int seg)
-{
- struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
-
- vmcs_write16(sf->selector, 0);
- vmcs_writel(sf->base, 0);
- vmcs_write32(sf->limit, 0xffff);
- vmcs_write32(sf->ar_bytes, 0xf3);
-}
-
-static int alloc_apic_access_page(struct kvm *kvm)
-{
- struct kvm_userspace_memory_region kvm_userspace_mem;
- int r = 0;
-
- down_write(&kvm->slots_lock);
- if (kvm->arch.apic_access_page)
- goto out;
- kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
- kvm_userspace_mem.flags = 0;
- kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL;
- kvm_userspace_mem.memory_size = PAGE_SIZE;
- r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
- if (r)
- goto out;
-
- kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
-out:
- up_write(&kvm->slots_lock);
- return r;
-}
-
-static int alloc_identity_pagetable(struct kvm *kvm)
-{
- struct kvm_userspace_memory_region kvm_userspace_mem;
- int r = 0;
-
- down_write(&kvm->slots_lock);
- if (kvm->arch.ept_identity_pagetable)
- goto out;
- kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
- kvm_userspace_mem.flags = 0;
- kvm_userspace_mem.guest_phys_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
- kvm_userspace_mem.memory_size = PAGE_SIZE;
- r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
- if (r)
- goto out;
-
- kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
- VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT);
-out:
- up_write(&kvm->slots_lock);
- return r;
-}
-
-static void allocate_vpid(struct vcpu_vmx *vmx)
-{
- int vpid;
-
- vmx->vpid = 0;
- if (!enable_vpid)
- return;
- spin_lock(&vmx_vpid_lock);
- vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
- if (vpid < VMX_NR_VPIDS) {
- vmx->vpid = vpid;
- __set_bit(vpid, vmx_vpid_bitmap);
- }
- spin_unlock(&vmx_vpid_lock);
-}
-
-static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
-{
- int f = sizeof(unsigned long);
-
- if (!cpu_has_vmx_msr_bitmap())
- return;
-
- /*
- * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
- * have the write-low and read-high bitmap offsets the wrong way round.
- * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
- */
- if (msr <= 0x1fff) {
- __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
- __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
- } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
- msr &= 0x1fff;
- __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
- __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
- }
-}
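As a concrete illustration of the quadrant arithmetic above, here is a hypothetical helper (not part of this file; MSR_EFER = 0xc0000080 serves purely as an example) that locates the read-intercept bit for an MSR in the 4 KiB bitmap:

	static bool msr_read_intercept_pos(u32 msr, unsigned int *byte,
					   unsigned int *bit)
	{
		if (msr <= 0x1fff) {				/* read-low quadrant */
			*byte = 0x000 + msr / 8;
			*bit  = msr % 8;
			return true;
		}
		if (msr >= 0xc0000000 && msr <= 0xc0001fff) {	/* read-high quadrant */
			*byte = 0x400 + (msr & 0x1fff) / 8;
			*bit  = (msr & 0x1fff) % 8;
			return true;
		}
		return false;	/* MSR not covered by the bitmap: always intercepted */
	}
	/* e.g. MSR_EFER (0xc0000080) maps to byte 0x410, bit 0 */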
-
-static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
-{
- if (!longmode_only)
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr);
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr);
-}
-
-/*
- * Sets up the vmcs for emulated real mode.
- */
-static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
-{
- u32 host_sysenter_cs, msr_low, msr_high;
- u32 junk;
- u64 host_pat, tsc_this, tsc_base;
- unsigned long a;
- struct descriptor_table dt;
- int i;
- unsigned long kvm_vmx_return;
- u32 exec_control;
-
- /* I/O */
- vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
- vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b));
-
- if (cpu_has_vmx_msr_bitmap())
- vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
-
- vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
-
- /* Control */
- vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
- vmcs_config.pin_based_exec_ctrl);
-
- exec_control = vmcs_config.cpu_based_exec_ctrl;
- if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
- exec_control &= ~CPU_BASED_TPR_SHADOW;
-#ifdef CONFIG_X86_64
- exec_control |= CPU_BASED_CR8_STORE_EXITING |
- CPU_BASED_CR8_LOAD_EXITING;
-#endif
- }
- if (!enable_ept)
- exec_control |= CPU_BASED_CR3_STORE_EXITING |
- CPU_BASED_CR3_LOAD_EXITING |
- CPU_BASED_INVLPG_EXITING;
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
-
- if (cpu_has_secondary_exec_ctrls()) {
- exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
- if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
- exec_control &=
- ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
- if (vmx->vpid == 0)
- exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
- if (!enable_ept)
- exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
- }
-
- vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
- vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
- vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
-
- vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */
- vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
- vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
-
- vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
- vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
- vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
- vmcs_write16(HOST_FS_SELECTOR, kvm_read_fs()); /* 22.2.4 */
- vmcs_write16(HOST_GS_SELECTOR, kvm_read_gs()); /* 22.2.4 */
- vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
-#ifdef CONFIG_X86_64
- rdmsrl(MSR_FS_BASE, a);
- vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
- rdmsrl(MSR_GS_BASE, a);
- vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
-#else
- vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
- vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
-#endif
-
- vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
-
- kvm_get_idt(&dt);
- vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */
-
- asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
- vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
- vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
- vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
- vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
-
- rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
- vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
- rdmsrl(MSR_IA32_SYSENTER_ESP, a);
- vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */
- rdmsrl(MSR_IA32_SYSENTER_EIP, a);
- vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */
-
- if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
- rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
- host_pat = msr_low | ((u64) msr_high << 32);
- vmcs_write64(HOST_IA32_PAT, host_pat);
- }
- if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
- rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
- host_pat = msr_low | ((u64) msr_high << 32);
- /* Write the default value following the host PAT */
- vmcs_write64(GUEST_IA32_PAT, host_pat);
- /* Keep arch.pat in sync with GUEST_IA32_PAT */
- vmx->vcpu.arch.pat = host_pat;
- }
-
- for (i = 0; i < NR_VMX_MSR; ++i) {
- u32 index = vmx_msr_index[i];
- u32 data_low, data_high;
- u64 data;
- int j = vmx->nmsrs;
-
- if (rdmsr_safe(index, &data_low, &data_high) < 0)
- continue;
- if (wrmsr_safe(index, data_low, data_high) < 0)
- continue;
- data = data_low | ((u64)data_high << 32);
- vmx->host_msrs[j].index = index;
- vmx->host_msrs[j].reserved = 0;
- vmx->host_msrs[j].data = data;
- vmx->guest_msrs[j] = vmx->host_msrs[j];
- ++vmx->nmsrs;
- }
-
- vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
-
- /* 22.2.1, 20.8.1 */
- vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
-
- vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
- vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);
-
- tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc;
- rdtscll(tsc_this);
- if (tsc_this < vmx->vcpu.kvm->arch.vm_init_tsc)
- tsc_base = tsc_this;
-
- guest_write_tsc(0, tsc_base);
-
- return 0;
-}
-
-static int init_rmode(struct kvm *kvm)
-{
- if (!init_rmode_tss(kvm))
- return 0;
- if (!init_rmode_identity_map(kvm))
- return 0;
- return 1;
-}
-
-static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- u64 msr;
- int ret;
-
- vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
- down_read(&vcpu->kvm->slots_lock);
- if (!init_rmode(vmx->vcpu.kvm)) {
- ret = -ENOMEM;
- goto out;
- }
-
- vmx->vcpu.arch.rmode.vm86_active = 0;
-
- vmx->soft_vnmi_blocked = 0;
-
- vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
- kvm_set_cr8(&vmx->vcpu, 0);
- msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
- if (vmx->vcpu.vcpu_id == 0)
- msr |= MSR_IA32_APICBASE_BSP;
- kvm_set_apic_base(&vmx->vcpu, msr);
-
- fx_init(&vmx->vcpu);
-
- seg_setup(VCPU_SREG_CS);
- /*
- * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
- * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
- */
- if (vmx->vcpu.vcpu_id == 0) {
- vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
- vmcs_writel(GUEST_CS_BASE, 0x000f0000);
- } else {
- vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
- vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
- }
-
- seg_setup(VCPU_SREG_DS);
- seg_setup(VCPU_SREG_ES);
- seg_setup(VCPU_SREG_FS);
- seg_setup(VCPU_SREG_GS);
- seg_setup(VCPU_SREG_SS);
-
- vmcs_write16(GUEST_TR_SELECTOR, 0);
- vmcs_writel(GUEST_TR_BASE, 0);
- vmcs_write32(GUEST_TR_LIMIT, 0xffff);
- vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
-
- vmcs_write16(GUEST_LDTR_SELECTOR, 0);
- vmcs_writel(GUEST_LDTR_BASE, 0);
- vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
- vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
-
- vmcs_write32(GUEST_SYSENTER_CS, 0);
- vmcs_writel(GUEST_SYSENTER_ESP, 0);
- vmcs_writel(GUEST_SYSENTER_EIP, 0);
-
- vmcs_writel(GUEST_RFLAGS, 0x02);
- if (vmx->vcpu.vcpu_id == 0)
- kvm_rip_write(vcpu, 0xfff0);
- else
- kvm_rip_write(vcpu, 0);
- kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
-
- vmcs_writel(GUEST_DR7, 0x400);
-
- vmcs_writel(GUEST_GDTR_BASE, 0);
- vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
-
- vmcs_writel(GUEST_IDTR_BASE, 0);
- vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
-
- vmcs_write32(GUEST_ACTIVITY_STATE, 0);
- vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
- vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
-
- /* Special registers */
- vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
-
- setup_msrs(vmx);
-
- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
-
- if (cpu_has_vmx_tpr_shadow()) {
- vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
- if (vm_need_tpr_shadow(vmx->vcpu.kvm))
- vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
- page_to_phys(vmx->vcpu.arch.apic->regs_page));
- vmcs_write32(TPR_THRESHOLD, 0);
- }
-
- if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
- vmcs_write64(APIC_ACCESS_ADDR,
- page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));
-
- if (vmx->vpid != 0)
- vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
-
- vmx->vcpu.arch.cr0 = 0x60000010;
- vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */
- vmx_set_cr4(&vmx->vcpu, 0);
- vmx_set_efer(&vmx->vcpu, 0);
- vmx_fpu_activate(&vmx->vcpu);
- update_exception_bitmap(&vmx->vcpu);
-
- vpid_sync_vcpu_all(vmx);
-
- ret = 0;
-
- /* HACK: Don't enable emulation on guest boot/reset */
- vmx->emulation_required = 0;
-
-out:
- up_read(&vcpu->kvm->slots_lock);
- return ret;
-}
-
-static void enable_irq_window(struct kvm_vcpu *vcpu)
-{
- u32 cpu_based_vm_exec_control;
-
- cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
- cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
-}
-
-static void enable_nmi_window(struct kvm_vcpu *vcpu)
-{
- u32 cpu_based_vm_exec_control;
-
- if (!cpu_has_virtual_nmis()) {
- enable_irq_window(vcpu);
- return;
- }
-
- cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
- cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
-}
-
-static void vmx_inject_irq(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- uint32_t intr;
- int irq = vcpu->arch.interrupt.nr;
-
- KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler);
-
- ++vcpu->stat.irq_injections;
- if (vcpu->arch.rmode.vm86_active) {
- vmx->rmode.irq.pending = true;
- vmx->rmode.irq.vector = irq;
- vmx->rmode.irq.rip = kvm_rip_read(vcpu);
- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
- irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
- vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
- kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
- return;
- }
- intr = irq | INTR_INFO_VALID_MASK;
- if (vcpu->arch.interrupt.soft) {
- intr |= INTR_TYPE_SOFT_INTR;
- vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
- vmx->vcpu.arch.event_exit_inst_len);
- } else
- intr |= INTR_TYPE_EXT_INTR;
- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
-}
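For reference, the value written to VM_ENTRY_INTR_INFO_FIELD above encodes the vector in bits 7:0, the event type in bits 10:8 and a valid flag in bit 31. Two worked values, assuming the usual constants INTR_TYPE_EXT_INTR = 0 << 8, INTR_TYPE_NMI_INTR = 2 << 8 and INTR_INFO_VALID_MASK = 1u << 31 (vector 0x20 is only an example):

	/* external interrupt, vector 0x20:
	 *   0x20 | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK == 0x80000020
	 * NMI (NMI_VECTOR == 2), as injected by vmx_inject_nmi() below:
	 *   2 | INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK == 0x80000202
	 */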
-
-static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- if (!cpu_has_virtual_nmis()) {
- /*
- * Tracking the NMI-blocked state in software is built upon
- * finding the next open IRQ window. This, in turn, depends on
- * well-behaving guests: They have to keep IRQs disabled at
- * least as long as the NMI handler runs. Otherwise we may
- * cause NMI nesting, maybe breaking the guest. But as this is
- * highly unlikely, we can live with the residual risk.
- */
- vmx->soft_vnmi_blocked = 1;
- vmx->vnmi_blocked_time = 0;
- }
-
- ++vcpu->stat.nmi_injections;
- if (vcpu->arch.rmode.vm86_active) {
- vmx->rmode.irq.pending = true;
- vmx->rmode.irq.vector = NMI_VECTOR;
- vmx->rmode.irq.rip = kvm_rip_read(vcpu);
- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
- NMI_VECTOR | INTR_TYPE_SOFT_INTR |
- INTR_INFO_VALID_MASK);
- vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
- kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
- return;
- }
- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
- INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
-}
-
-static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
-{
- if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
- return 0;
-
- return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
- (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS |
- GUEST_INTR_STATE_NMI));
-}
-
-static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
-{
- return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
- !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
- (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
-}
-
-static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
-{
- int ret;
- struct kvm_userspace_memory_region tss_mem = {
- .slot = TSS_PRIVATE_MEMSLOT,
- .guest_phys_addr = addr,
- .memory_size = PAGE_SIZE * 3,
- .flags = 0,
- };
-
- ret = kvm_set_memory_region(kvm, &tss_mem, 0);
- if (ret)
- return ret;
- kvm->arch.tss_addr = addr;
- return 0;
-}
-
-static int handle_rmode_exception(struct kvm_vcpu *vcpu,
- int vec, u32 err_code)
-{
- /*
- * Instructions with the address-size override prefix (opcode 0x67)
- * cause an #SS fault with error code 0 in VM86 mode.
- */
- if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
- if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE)
- return 1;
- /*
- * Forward all other exceptions that are valid in real mode.
- * FIXME: Breaks guest debugging in real mode, needs to be fixed with
- * the required debugging infrastructure rework.
- */
- switch (vec) {
- case DB_VECTOR:
- if (vcpu->guest_debug &
- (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
- return 0;
- kvm_queue_exception(vcpu, vec);
- return 1;
- case BP_VECTOR:
- if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
- return 0;
- /* fall through */
- case DE_VECTOR:
- case OF_VECTOR:
- case BR_VECTOR:
- case UD_VECTOR:
- case DF_VECTOR:
- case SS_VECTOR:
- case GP_VECTOR:
- case MF_VECTOR:
- kvm_queue_exception(vcpu, vec);
- return 1;
- }
- return 0;
-}
-
-/*
- * Trigger machine check on the host. We assume all the MSRs are already set up
- * by the CPU and that we still run on the same CPU as the MCE occurred on.
- * We pass a fake environment to the machine check handler because we want
- * the guest to be always treated like user space, no matter what context
- * it used internally.
- */
-static void kvm_machine_check(void)
-{
-#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64)
- struct pt_regs regs = {
- .cs = 3, /* Fake ring 3 no matter what the guest ran on */
- .flags = X86_EFLAGS_IF,
- };
-
- do_machine_check(&regs, 0);
-#endif
-}
-
-static int handle_machine_check(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
- /* already handled by vcpu_run */
- return 1;
-}
-
-static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- u32 intr_info, ex_no, error_code;
- unsigned long cr2, rip, dr6;
- u32 vect_info;
- enum emulation_result er;
-
- vect_info = vmx->idt_vectoring_info;
- intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-
- if (is_machine_check(intr_info))
- return handle_machine_check(vcpu, kvm_run);
-
- if ((vect_info & VECTORING_INFO_VALID_MASK) &&
- !is_page_fault(intr_info))
- printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
- "intr info 0x%x\n", __func__, vect_info, intr_info);
-
- if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
- return 1; /* already handled by vmx_vcpu_run() */
-
- if (is_no_device(intr_info)) {
- vmx_fpu_activate(vcpu);
- return 1;
- }
-
- if (is_invalid_opcode(intr_info)) {
- er = emulate_instruction(vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD);
- if (er != EMULATE_DONE)
- kvm_queue_exception(vcpu, UD_VECTOR);
- return 1;
- }
-
- error_code = 0;
- rip = kvm_rip_read(vcpu);
- if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
- error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
- if (is_page_fault(intr_info)) {
- /* EPT won't cause page fault directly */
- if (enable_ept)
- BUG();
- cr2 = vmcs_readl(EXIT_QUALIFICATION);
- KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
- (u32)((u64)cr2 >> 32), handler);
- if (kvm_event_needs_reinjection(vcpu))
- kvm_mmu_unprotect_page_virt(vcpu, cr2);
- return kvm_mmu_page_fault(vcpu, cr2, error_code);
- }
-
- if (vcpu->arch.rmode.vm86_active &&
- handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
- error_code)) {
- if (vcpu->arch.halt_request) {
- vcpu->arch.halt_request = 0;
- return kvm_emulate_halt(vcpu);
- }
- return 1;
- }
-
- ex_no = intr_info & INTR_INFO_VECTOR_MASK;
- switch (ex_no) {
- case DB_VECTOR:
- dr6 = vmcs_readl(EXIT_QUALIFICATION);
- if (!(vcpu->guest_debug &
- (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
- vcpu->arch.dr6 = dr6 | DR6_FIXED_1;
- kvm_queue_exception(vcpu, DB_VECTOR);
- return 1;
- }
- kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
- kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
- /* fall through */
- case BP_VECTOR:
- kvm_run->exit_reason = KVM_EXIT_DEBUG;
- kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
- kvm_run->debug.arch.exception = ex_no;
- break;
- default:
- kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
- kvm_run->ex.exception = ex_no;
- kvm_run->ex.error_code = error_code;
- break;
- }
- return 0;
-}
-
-static int handle_external_interrupt(struct kvm_vcpu *vcpu,
- struct kvm_run *kvm_run)
-{
- ++vcpu->stat.irq_exits;
- KVMTRACE_1D(INTR, vcpu, vmcs_read32(VM_EXIT_INTR_INFO), handler);
- return 1;
-}
-
-static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
- kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
- return 0;
-}
-
-static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
- unsigned long exit_qualification;
- int size, in, string;
- unsigned port;
-
- ++vcpu->stat.io_exits;
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
- string = (exit_qualification & 16) != 0;
-
- if (string) {
- if (emulate_instruction(vcpu,
- kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
- return 0;
- return 1;
- }
-
- size = (exit_qualification & 7) + 1;
- in = (exit_qualification & 8) != 0;
- port = exit_qualification >> 16;
-
- skip_emulated_instruction(vcpu);
- return kvm_emulate_pio(vcpu, kvm_run, in, size, port);
-}
-
-static void
-vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
-{
- /*
- * Patch in the VMCALL instruction:
- */
- hypercall[0] = 0x0f;
- hypercall[1] = 0x01;
- hypercall[2] = 0xc1;
-}
-
-static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
- unsigned long exit_qualification;
- int cr;
- int reg;
-
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
- cr = exit_qualification & 15;
- reg = (exit_qualification >> 8) & 15;
- switch ((exit_qualification >> 4) & 3) {
- case 0: /* mov to cr */
- KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr,
- (u32)kvm_register_read(vcpu, reg),
- (u32)((u64)kvm_register_read(vcpu, reg) >> 32),
- handler);
- switch (cr) {
- case 0:
- kvm_set_cr0(vcpu, kvm_register_read(vcpu, reg));
- skip_emulated_instruction(vcpu);
- return 1;
- case 3:
- kvm_set_cr3(vcpu, kvm_register_read(vcpu, reg));
- skip_emulated_instruction(vcpu);
- return 1;
- case 4:
- kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg));
- skip_emulated_instruction(vcpu);
- return 1;
- case 8: {
- u8 cr8_prev = kvm_get_cr8(vcpu);
- u8 cr8 = kvm_register_read(vcpu, reg);
- kvm_set_cr8(vcpu, cr8);
- skip_emulated_instruction(vcpu);
- if (irqchip_in_kernel(vcpu->kvm))
- return 1;
- if (cr8_prev <= cr8)
- return 1;
- kvm_run->exit_reason = KVM_EXIT_SET_TPR;
- return 0;
- }
- };
- break;
- case 2: /* clts */
- vmx_fpu_deactivate(vcpu);
- vcpu->arch.cr0 &= ~X86_CR0_TS;
- vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
- vmx_fpu_activate(vcpu);
- KVMTRACE_0D(CLTS, vcpu, handler);
- skip_emulated_instruction(vcpu);
- return 1;
- case 1: /* mov from cr */
- switch (cr) {
- case 3:
- kvm_register_write(vcpu, reg, vcpu->arch.cr3);
- KVMTRACE_3D(CR_READ, vcpu, (u32)cr,
- (u32)kvm_register_read(vcpu, reg),
- (u32)((u64)kvm_register_read(vcpu, reg) >> 32),
- handler);
- skip_emulated_instruction(vcpu);
- return 1;
- case 8:
- kvm_register_write(vcpu, reg, kvm_get_cr8(vcpu));
- KVMTRACE_2D(CR_READ, vcpu, (u32)cr,
- (u32)kvm_register_read(vcpu, reg), handler);
- skip_emulated_instruction(vcpu);
- return 1;
- }
- break;
- case 3: /* lmsw */
- kvm_lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
-
- skip_emulated_instruction(vcpu);
- return 1;
- default:
- break;
- }
- kvm_run->exit_reason = 0;
- pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
- (int)(exit_qualification >> 4) & 3, cr);
- return 0;
-}
-
-static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
- unsigned long exit_qualification;
- unsigned long val;
- int dr, reg;
-
- dr = vmcs_readl(GUEST_DR7);
- if (dr & DR7_GD) {
- /*
- * As the vm-exit takes precedence over the debug trap, we
- * need to emulate the latter, either for the host or the
- * guest debugging itself.
- */
- if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
- kvm_run->debug.arch.dr6 = vcpu->arch.dr6;
- kvm_run->debug.arch.dr7 = dr;
- kvm_run->debug.arch.pc =
- vmcs_readl(GUEST_CS_BASE) +
- vmcs_readl(GUEST_RIP);
- kvm_run->debug.arch.exception = DB_VECTOR;
- kvm_run->exit_reason = KVM_EXIT_DEBUG;
- return 0;
- } else {
- vcpu->arch.dr7 &= ~DR7_GD;
- vcpu->arch.dr6 |= DR6_BD;
- vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
- kvm_queue_exception(vcpu, DB_VECTOR);
- return 1;
- }
- }
-
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
- dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
- reg = DEBUG_REG_ACCESS_REG(exit_qualification);
- if (exit_qualification & TYPE_MOV_FROM_DR) {
- switch (dr) {
- case 0 ... 3:
- val = vcpu->arch.db[dr];
- break;
- case 6:
- val = vcpu->arch.dr6;
- break;
- case 7:
- val = vcpu->arch.dr7;
- break;
- default:
- val = 0;
- }
- kvm_register_write(vcpu, reg, val);
- KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
- } else {
- val = vcpu->arch.regs[reg];
- switch (dr) {
- case 0 ... 3:
- vcpu->arch.db[dr] = val;
- if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
- vcpu->arch.eff_db[dr] = val;
- break;
- case 4 ... 5:
- if (vcpu->arch.cr4 & X86_CR4_DE)
- kvm_queue_exception(vcpu, UD_VECTOR);
- break;
- case 6:
- if (val & 0xffffffff00000000ULL) {
- kvm_queue_exception(vcpu, GP_VECTOR);
- break;
- }
- vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
- break;
- case 7:
- if (val & 0xffffffff00000000ULL) {
- kvm_queue_exception(vcpu, GP_VECTOR);
- break;
- }
- vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
- if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
- vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
- vcpu->arch.switch_db_regs =
- (val & DR7_BP_EN_MASK);
- }
- break;
- }
- KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)val, handler);
- }
- skip_emulated_instruction(vcpu);
- return 1;
-}
-
-static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
- kvm_emulate_cpuid(vcpu);
- return 1;
-}
-
-static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
- u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
- u64 data;
-
- if (vmx_get_msr(vcpu, ecx, &data)) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
-
- KVMTRACE_3D(MSR_READ, vcpu, ecx, (u32)data, (u32)(data >> 32),
- handler);
-
- /* FIXME: handling of bits 32:63 of rax, rdx */
- vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
- vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
- skip_emulated_instruction(vcpu);
- return 1;
-}
-
-static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
- u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
- u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
- | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
-
- KVMTRACE_3D(MSR_WRITE, vcpu, ecx, (u32)data, (u32)(data >> 32),
- handler);
-
- if (vmx_set_msr(vcpu, ecx, data) != 0) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
-
- skip_emulated_instruction(vcpu);
- return 1;
-}
-
-static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu,
- struct kvm_run *kvm_run)
-{
- return 1;
-}
-
-static int handle_interrupt_window(struct kvm_vcpu *vcpu,
- struct kvm_run *kvm_run)
-{
- u32 cpu_based_vm_exec_control;
-
- /* clear pending irq */
- cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
- cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
-
- KVMTRACE_0D(PEND_INTR, vcpu, handler);
- ++vcpu->stat.irq_window_exits;
-
- /*
- * If user space is waiting to inject interrupts, exit as soon as
- * possible.
- */
- if (!irqchip_in_kernel(vcpu->kvm) &&
- kvm_run->request_interrupt_window &&
- !kvm_cpu_has_interrupt(vcpu)) {
- kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
- return 0;
- }
- return 1;
-}
-
-static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
- skip_emulated_instruction(vcpu);
- return kvm_emulate_halt(vcpu);
-}
-
-static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
- skip_emulated_instruction(vcpu);
- kvm_emulate_hypercall(vcpu);
- return 1;
-}
-
-static int handle_vmx_insn(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
- kvm_queue_exception(vcpu, UD_VECTOR);
- return 1;
-}
-
-static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
- unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
-
- kvm_mmu_invlpg(vcpu, exit_qualification);
- skip_emulated_instruction(vcpu);
- return 1;
-}
-
-static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
- skip_emulated_instruction(vcpu);
- /* TODO: Add support for VT-d/pass-through device */
- return 1;
-}
-
-static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
- unsigned long exit_qualification;
- enum emulation_result er;
- unsigned long offset;
-
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
- offset = exit_qualification & 0xffful;
-
- er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
-
- if (er != EMULATE_DONE) {
- printk(KERN_ERR
- "Fail to handle apic access vmexit! Offset is 0x%lx\n",
- offset);
- return -ENOTSUPP;
- }
- return 1;
-}
-
-static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long exit_qualification;
- u16 tss_selector;
- int reason, type, idt_v;
-
- idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
- type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
-
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
-
- reason = (u32)exit_qualification >> 30;
- if (reason == TASK_SWITCH_GATE && idt_v) {
- switch (type) {
- case INTR_TYPE_NMI_INTR:
- vcpu->arch.nmi_injected = false;
- if (cpu_has_virtual_nmis())
- vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
- GUEST_INTR_STATE_NMI);
- break;
- case INTR_TYPE_EXT_INTR:
- case INTR_TYPE_SOFT_INTR:
- kvm_clear_interrupt_queue(vcpu);
- break;
- case INTR_TYPE_HARD_EXCEPTION:
- case INTR_TYPE_SOFT_EXCEPTION:
- kvm_clear_exception_queue(vcpu);
- break;
- default:
- break;
- }
- }
- tss_selector = exit_qualification;
-
- if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
- type != INTR_TYPE_EXT_INTR &&
- type != INTR_TYPE_NMI_INTR))
- skip_emulated_instruction(vcpu);
-
- if (!kvm_task_switch(vcpu, tss_selector, reason))
- return 0;
-
- /* clear all local breakpoint enable flags */
- vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55);
-
- /*
- * TODO: What about debug traps on tss switch?
- * Are we supposed to inject them and update dr6?
- */
-
- return 1;
-}
-
-static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
- unsigned long exit_qualification;
- gpa_t gpa;
- int gla_validity;
-
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
-
- if (exit_qualification & (1 << 6)) {
- printk(KERN_ERR "EPT: GPA exceeds GAW!\n");
- return -ENOTSUPP;
- }
-
- gla_validity = (exit_qualification >> 7) & 0x3;
- if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) {
- printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
- printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
- (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
- vmcs_readl(GUEST_LINEAR_ADDRESS));
- printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
- (long unsigned int)exit_qualification);
- kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
- kvm_run->hw.hardware_exit_reason = 0;
- return -ENOTSUPP;
- }
-
- gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
- return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0);
-}
-
-static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
- u32 cpu_based_vm_exec_control;
-
- /* clear pending NMI */
- cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
- cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
- ++vcpu->stat.nmi_window_exits;
-
- return 1;
-}
-
-static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
- struct kvm_run *kvm_run)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- enum emulation_result err = EMULATE_DONE;
-
- local_irq_enable();
- preempt_enable();
-
- while (!guest_state_valid(vcpu)) {
- err = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
-
- if (err == EMULATE_DO_MMIO)
- break;
-
- if (err != EMULATE_DONE) {
- kvm_report_emulation_failure(vcpu, "emulation failure");
- break;
- }
-
- if (signal_pending(current))
- break;
- if (need_resched())
- schedule();
- }
-
- preempt_disable();
- local_irq_disable();
-
- vmx->invalid_state_emulation_result = err;
-}
-
-/*
- * The exit handlers return 1 if the exit was handled fully and guest execution
- * may resume. Otherwise they set the kvm_run parameter to indicate what needs
- * to be done to userspace and return 0.
- */
-static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
- struct kvm_run *kvm_run) = {
- [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
- [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
- [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
- [EXIT_REASON_NMI_WINDOW] = handle_nmi_window,
- [EXIT_REASON_IO_INSTRUCTION] = handle_io,
- [EXIT_REASON_CR_ACCESS] = handle_cr,
- [EXIT_REASON_DR_ACCESS] = handle_dr,
- [EXIT_REASON_CPUID] = handle_cpuid,
- [EXIT_REASON_MSR_READ] = handle_rdmsr,
- [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
- [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
- [EXIT_REASON_HLT] = handle_halt,
- [EXIT_REASON_INVLPG] = handle_invlpg,
- [EXIT_REASON_VMCALL] = handle_vmcall,
- [EXIT_REASON_VMCLEAR] = handle_vmx_insn,
- [EXIT_REASON_VMLAUNCH] = handle_vmx_insn,
- [EXIT_REASON_VMPTRLD] = handle_vmx_insn,
- [EXIT_REASON_VMPTRST] = handle_vmx_insn,
- [EXIT_REASON_VMREAD] = handle_vmx_insn,
- [EXIT_REASON_VMRESUME] = handle_vmx_insn,
- [EXIT_REASON_VMWRITE] = handle_vmx_insn,
- [EXIT_REASON_VMOFF] = handle_vmx_insn,
- [EXIT_REASON_VMON] = handle_vmx_insn,
- [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
- [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
- [EXIT_REASON_WBINVD] = handle_wbinvd,
- [EXIT_REASON_TASK_SWITCH] = handle_task_switch,
- [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
- [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
-};
-
-static const int kvm_vmx_max_exit_handlers =
- ARRAY_SIZE(kvm_vmx_exit_handlers);
-
-/*
- * The guest has exited. See if we can fix it or if we need userspace
- * assistance.
- */
-static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- u32 exit_reason = vmx->exit_reason;
- u32 vectoring_info = vmx->idt_vectoring_info;
-
- KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu),
- (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit);
-
- /* If we need to emulate an MMIO access from handle_invalid_guest_state(),
- * we just return 0. */
- if (vmx->emulation_required && emulate_invalid_guest_state) {
- if (guest_state_valid(vcpu))
- vmx->emulation_required = 0;
- return vmx->invalid_state_emulation_result != EMULATE_DO_MMIO;
- }
-
- /* Accesses to CR3 don't cause a VM exit in paging mode, so we need
- * to sync with the guest's real CR3. */
- if (enable_ept && is_paging(vcpu)) {
- vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
- ept_load_pdptrs(vcpu);
- }
-
- if (unlikely(vmx->fail)) {
- kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
- kvm_run->fail_entry.hardware_entry_failure_reason
- = vmcs_read32(VM_INSTRUCTION_ERROR);
- return 0;
- }
-
- if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
- (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
- exit_reason != EXIT_REASON_EPT_VIOLATION &&
- exit_reason != EXIT_REASON_TASK_SWITCH))
- printk(KERN_WARNING "%s: unexpected, valid vectoring info "
- "(0x%x) and exit reason is 0x%x\n",
- __func__, vectoring_info, exit_reason);
-
- if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
- if (vmx_interrupt_allowed(vcpu)) {
- vmx->soft_vnmi_blocked = 0;
- } else if (vmx->vnmi_blocked_time > 1000000000LL &&
- vcpu->arch.nmi_pending) {
- /*
- * This CPU doesn't support us in finding the end of an
- * NMI-blocked window if the guest runs with IRQs
- * disabled. So we pull the trigger after 1 s of
- * futile waiting, but inform the user about this.
- */
- printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
- "state on VCPU %d after 1 s timeout\n",
- __func__, vcpu->vcpu_id);
- vmx->soft_vnmi_blocked = 0;
- }
- }
-
- if (exit_reason < kvm_vmx_max_exit_handlers
- && kvm_vmx_exit_handlers[exit_reason])
- return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
- else {
- kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
- kvm_run->hw.hardware_exit_reason = exit_reason;
- }
- return 0;
-}
-
-static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
-{
- if (irr == -1 || tpr < irr) {
- vmcs_write32(TPR_THRESHOLD, 0);
- return;
- }
-
- vmcs_write32(TPR_THRESHOLD, irr);
-}
-
-static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
-{
- u32 exit_intr_info;
- u32 idt_vectoring_info = vmx->idt_vectoring_info;
- bool unblock_nmi;
- u8 vector;
- int type;
- bool idtv_info_valid;
-
- exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-
- vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
-
- /* Handle machine checks before interrupts are enabled */
- if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
- || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI
- && is_machine_check(exit_intr_info)))
- kvm_machine_check();
-
- /* We need to handle NMIs before interrupts are enabled */
- if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
- (exit_intr_info & INTR_INFO_VALID_MASK)) {
- KVMTRACE_0D(NMI, &vmx->vcpu, handler);
- asm("int $2");
- }
-
- idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
-
- if (cpu_has_virtual_nmis()) {
- unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
- vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
- /*
- * SDM 3: 27.7.1.2 (September 2008)
- * Re-set bit "block by NMI" before VM entry if vmexit caused by
- * a guest IRET fault.
- * SDM 3: 23.2.2 (September 2008)
- * Bit 12 is undefined in any of the following cases:
- * If the VM exit sets the valid bit in the IDT-vectoring
- * information field.
- * If the VM exit is due to a double fault.
- */
- if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
- vector != DF_VECTOR && !idtv_info_valid)
- vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
- GUEST_INTR_STATE_NMI);
- } else if (unlikely(vmx->soft_vnmi_blocked))
- vmx->vnmi_blocked_time +=
- ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
-
- vmx->vcpu.arch.nmi_injected = false;
- kvm_clear_exception_queue(&vmx->vcpu);
- kvm_clear_interrupt_queue(&vmx->vcpu);
-
- if (!idtv_info_valid)
- return;
-
- vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
- type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
-
- switch (type) {
- case INTR_TYPE_NMI_INTR:
- vmx->vcpu.arch.nmi_injected = true;
- /*
- * SDM 3: 27.7.1.2 (September 2008)
- * Clear bit "block by NMI" before VM entry if a NMI
- * delivery faulted.
- */
- vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
- GUEST_INTR_STATE_NMI);
- break;
- case INTR_TYPE_SOFT_EXCEPTION:
- vmx->vcpu.arch.event_exit_inst_len =
- vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
- /* fall through */
- case INTR_TYPE_HARD_EXCEPTION:
- if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
- u32 err = vmcs_read32(IDT_VECTORING_ERROR_CODE);
- kvm_queue_exception_e(&vmx->vcpu, vector, err);
- } else
- kvm_queue_exception(&vmx->vcpu, vector);
- break;
- case INTR_TYPE_SOFT_INTR:
- vmx->vcpu.arch.event_exit_inst_len =
- vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
- /* fall through */
- case INTR_TYPE_EXT_INTR:
- kvm_queue_interrupt(&vmx->vcpu, vector,
- type == INTR_TYPE_SOFT_INTR);
- break;
- default:
- break;
- }
-}
-
-/*
- * Failure to inject an interrupt should give us the information
- * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs
- * when fetching the interrupt redirection bitmap in the real-mode
- * tss, this doesn't happen. So we do it ourselves.
- */
-static void fixup_rmode_irq(struct vcpu_vmx *vmx)
-{
- vmx->rmode.irq.pending = 0;
- if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip)
- return;
- kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip);
- if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
- vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK;
- vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR;
- return;
- }
- vmx->idt_vectoring_info =
- VECTORING_INFO_VALID_MASK
- | INTR_TYPE_EXT_INTR
- | vmx->rmode.irq.vector;
-}
-
-#ifdef CONFIG_X86_64
-#define R "r"
-#define Q "q"
-#else
-#define R "e"
-#define Q "l"
-#endif
-
-static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- /* Record the guest's net vcpu time for enforced NMI injections. */
- if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
- vmx->entry_time = ktime_get();
-
- /* Handle invalid guest state instead of entering VMX */
- if (vmx->emulation_required && emulate_invalid_guest_state) {
- handle_invalid_guest_state(vcpu, kvm_run);
- return;
- }
-
- if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
- vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
- if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
- vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
-
- /*
- * Loading guest fpu may have cleared host cr0.ts
- */
- vmcs_writel(HOST_CR0, read_cr0());
-
- set_debugreg(vcpu->arch.dr6, 6);
-
- asm(
- /* Store host registers */
- "push %%"R"dx; push %%"R"bp;"
- "push %%"R"cx \n\t"
- "cmp %%"R"sp, %c[host_rsp](%0) \n\t"
- "je 1f \n\t"
- "mov %%"R"sp, %c[host_rsp](%0) \n\t"
- __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
- "1: \n\t"
- /* Check if vmlaunch or vmresume is needed */
- "cmpl $0, %c[launched](%0) \n\t"
- /* Load guest registers. Don't clobber flags. */
- "mov %c[cr2](%0), %%"R"ax \n\t"
- "mov %%"R"ax, %%cr2 \n\t"
- "mov %c[rax](%0), %%"R"ax \n\t"
- "mov %c[rbx](%0), %%"R"bx \n\t"
- "mov %c[rdx](%0), %%"R"dx \n\t"
- "mov %c[rsi](%0), %%"R"si \n\t"
- "mov %c[rdi](%0), %%"R"di \n\t"
- "mov %c[rbp](%0), %%"R"bp \n\t"
-#ifdef CONFIG_X86_64
- "mov %c[r8](%0), %%r8 \n\t"
- "mov %c[r9](%0), %%r9 \n\t"
- "mov %c[r10](%0), %%r10 \n\t"
- "mov %c[r11](%0), %%r11 \n\t"
- "mov %c[r12](%0), %%r12 \n\t"
- "mov %c[r13](%0), %%r13 \n\t"
- "mov %c[r14](%0), %%r14 \n\t"
- "mov %c[r15](%0), %%r15 \n\t"
-#endif
- "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */
-
- /* Enter guest mode */
- "jne .Llaunched \n\t"
- __ex(ASM_VMX_VMLAUNCH) "\n\t"
- "jmp .Lkvm_vmx_return \n\t"
- ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
- ".Lkvm_vmx_return: "
- /* Save guest registers, load host registers, keep flags */
- "xchg %0, (%%"R"sp) \n\t"
- "mov %%"R"ax, %c[rax](%0) \n\t"
- "mov %%"R"bx, %c[rbx](%0) \n\t"
- "push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t"
- "mov %%"R"dx, %c[rdx](%0) \n\t"
- "mov %%"R"si, %c[rsi](%0) \n\t"
- "mov %%"R"di, %c[rdi](%0) \n\t"
- "mov %%"R"bp, %c[rbp](%0) \n\t"
-#ifdef CONFIG_X86_64
- "mov %%r8, %c[r8](%0) \n\t"
- "mov %%r9, %c[r9](%0) \n\t"
- "mov %%r10, %c[r10](%0) \n\t"
- "mov %%r11, %c[r11](%0) \n\t"
- "mov %%r12, %c[r12](%0) \n\t"
- "mov %%r13, %c[r13](%0) \n\t"
- "mov %%r14, %c[r14](%0) \n\t"
- "mov %%r15, %c[r15](%0) \n\t"
-#endif
- "mov %%cr2, %%"R"ax \n\t"
- "mov %%"R"ax, %c[cr2](%0) \n\t"
-
- "pop %%"R"bp; pop %%"R"bp; pop %%"R"dx \n\t"
- "setbe %c[fail](%0) \n\t"
- : : "c"(vmx), "d"((unsigned long)HOST_RSP),
- [launched]"i"(offsetof(struct vcpu_vmx, launched)),
- [fail]"i"(offsetof(struct vcpu_vmx, fail)),
- [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
- [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
- [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
- [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
- [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
- [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
- [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
- [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
-#ifdef CONFIG_X86_64
- [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
- [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
- [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
- [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
- [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
- [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
- [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
- [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
-#endif
- [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
- : "cc", "memory"
- , R"bx", R"di", R"si"
-#ifdef CONFIG_X86_64
- , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
-#endif
- );
-
- vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
- vcpu->arch.regs_dirty = 0;
-
- get_debugreg(vcpu->arch.dr6, 6);
-
- vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
- if (vmx->rmode.irq.pending)
- fixup_rmode_irq(vmx);
-
- asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
- vmx->launched = 1;
-
- vmx_complete_interrupts(vmx);
-}
-
-#undef R
-#undef Q
-
-static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- if (vmx->vmcs) {
- vcpu_clear(vmx);
- free_vmcs(vmx->vmcs);
- vmx->vmcs = NULL;
- }
-}
-
-static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- spin_lock(&vmx_vpid_lock);
- if (vmx->vpid != 0)
- __clear_bit(vmx->vpid, vmx_vpid_bitmap);
- spin_unlock(&vmx_vpid_lock);
- vmx_free_vmcs(vcpu);
- kfree(vmx->host_msrs);
- kfree(vmx->guest_msrs);
- kvm_vcpu_uninit(vcpu);
- kmem_cache_free(kvm_vcpu_cache, vmx);
-}
-
-static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
-{
- int err;
- struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
- int cpu;
-
- if (!vmx)
- return ERR_PTR(-ENOMEM);
-
- allocate_vpid(vmx);
-
- err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
- if (err)
- goto free_vcpu;
-
- vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
- if (!vmx->guest_msrs) {
- err = -ENOMEM;
- goto uninit_vcpu;
- }
-
- vmx->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
- if (!vmx->host_msrs)
- goto free_guest_msrs;
-
- vmx->vmcs = alloc_vmcs();
- if (!vmx->vmcs)
- goto free_msrs;
-
- vmcs_clear(vmx->vmcs);
-
- cpu = get_cpu();
- vmx_vcpu_load(&vmx->vcpu, cpu);
- err = vmx_vcpu_setup(vmx);
- vmx_vcpu_put(&vmx->vcpu);
- put_cpu();
- if (err)
- goto free_vmcs;
- if (vm_need_virtualize_apic_accesses(kvm))
- if (alloc_apic_access_page(kvm) != 0)
- goto free_vmcs;
-
- if (enable_ept)
- if (alloc_identity_pagetable(kvm) != 0)
- goto free_vmcs;
-
- return &vmx->vcpu;
-
-free_vmcs:
- free_vmcs(vmx->vmcs);
-free_msrs:
- kfree(vmx->host_msrs);
-free_guest_msrs:
- kfree(vmx->guest_msrs);
-uninit_vcpu:
- kvm_vcpu_uninit(&vmx->vcpu);
-free_vcpu:
- kmem_cache_free(kvm_vcpu_cache, vmx);
- return ERR_PTR(err);
-}
-
-static void __init vmx_check_processor_compat(void *rtn)
-{
- struct vmcs_config vmcs_conf;
-
- *(int *)rtn = 0;
- if (setup_vmcs_config(&vmcs_conf) < 0)
- *(int *)rtn = -EIO;
- if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
- printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
- smp_processor_id());
- *(int *)rtn = -EIO;
- }
-}
-
-static int get_ept_level(void)
-{
- return VMX_EPT_DEFAULT_GAW + 1;
-}
-
-static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
-{
- u64 ret;
-
- /* For VT-d and EPT combination
- * 1. MMIO: always map as UC
- * 2. EPT with VT-d:
- * a. VT-d without snooping control feature: can't guarantee the
- * result, try to trust guest.
- * b. VT-d with snooping control feature: snooping control feature of
- * VT-d engine can guarantee the cache correctness. Just set it
- * to WB to keep consistent with host. So the same as item 3.
- * 3. EPT without VT-d: always map as WB and set IGMT=1 to keep
- * consistent with host MTRR
- */
- if (is_mmio)
- ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
- else if (vcpu->kvm->arch.iommu_domain &&
- !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY))
- ret = kvm_get_guest_memory_type(vcpu, gfn) <<
- VMX_EPT_MT_EPTE_SHIFT;
- else
- ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT)
- | VMX_EPT_IGMT_BIT;
-
- return ret;
-}
-
-static struct kvm_x86_ops vmx_x86_ops = {
- .cpu_has_kvm_support = cpu_has_kvm_support,
- .disabled_by_bios = vmx_disabled_by_bios,
- .hardware_setup = hardware_setup,
- .hardware_unsetup = hardware_unsetup,
- .check_processor_compatibility = vmx_check_processor_compat,
- .hardware_enable = hardware_enable,
- .hardware_disable = hardware_disable,
- .cpu_has_accelerated_tpr = report_flexpriority,
-
- .vcpu_create = vmx_create_vcpu,
- .vcpu_free = vmx_free_vcpu,
- .vcpu_reset = vmx_vcpu_reset,
-
- .prepare_guest_switch = vmx_save_host_state,
- .vcpu_load = vmx_vcpu_load,
- .vcpu_put = vmx_vcpu_put,
-
- .set_guest_debug = set_guest_debug,
- .get_msr = vmx_get_msr,
- .set_msr = vmx_set_msr,
- .get_segment_base = vmx_get_segment_base,
- .get_segment = vmx_get_segment,
- .set_segment = vmx_set_segment,
- .get_cpl = vmx_get_cpl,
- .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
- .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
- .set_cr0 = vmx_set_cr0,
- .set_cr3 = vmx_set_cr3,
- .set_cr4 = vmx_set_cr4,
- .set_efer = vmx_set_efer,
- .get_idt = vmx_get_idt,
- .set_idt = vmx_set_idt,
- .get_gdt = vmx_get_gdt,
- .set_gdt = vmx_set_gdt,
- .cache_reg = vmx_cache_reg,
- .get_rflags = vmx_get_rflags,
- .set_rflags = vmx_set_rflags,
-
- .tlb_flush = vmx_flush_tlb,
-
- .run = vmx_vcpu_run,
- .handle_exit = vmx_handle_exit,
- .skip_emulated_instruction = skip_emulated_instruction,
- .set_interrupt_shadow = vmx_set_interrupt_shadow,
- .get_interrupt_shadow = vmx_get_interrupt_shadow,
- .patch_hypercall = vmx_patch_hypercall,
- .set_irq = vmx_inject_irq,
- .set_nmi = vmx_inject_nmi,
- .queue_exception = vmx_queue_exception,
- .interrupt_allowed = vmx_interrupt_allowed,
- .nmi_allowed = vmx_nmi_allowed,
- .enable_nmi_window = enable_nmi_window,
- .enable_irq_window = enable_irq_window,
- .update_cr8_intercept = update_cr8_intercept,
-
- .set_tss_addr = vmx_set_tss_addr,
- .get_tdp_level = get_ept_level,
- .get_mt_mask = vmx_get_mt_mask,
-};
-
-static int __init vmx_init(void)
-{
- int r;
-
- vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_io_bitmap_a)
- return -ENOMEM;
-
- vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_io_bitmap_b) {
- r = -ENOMEM;
- goto out;
- }
-
- vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_msr_bitmap_legacy) {
- r = -ENOMEM;
- goto out1;
- }
-
- vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_msr_bitmap_longmode) {
- r = -ENOMEM;
- goto out2;
- }
-
- /*
- * Allow direct access to the PC debug port (it is often used for I/O
- * delays, but the vmexits simply slow things down).
- */
- memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE);
- clear_bit(0x80, vmx_io_bitmap_a);
-
- memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
-
- memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
- memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
-
- set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
-
- r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
- if (r)
- goto out3;
-
- vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
- vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
- vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
-
- if (enable_ept) {
- bypass_guest_pf = 0;
- kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
- VMX_EPT_WRITABLE_MASK);
- kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
- VMX_EPT_EXECUTABLE_MASK);
- kvm_enable_tdp();
- } else
- kvm_disable_tdp();
-
- if (bypass_guest_pf)
- kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
-
- ept_sync_global();
-
- return 0;
-
-out3:
- free_page((unsigned long)vmx_msr_bitmap_longmode);
-out2:
- free_page((unsigned long)vmx_msr_bitmap_legacy);
-out1:
- free_page((unsigned long)vmx_io_bitmap_b);
-out:
- free_page((unsigned long)vmx_io_bitmap_a);
- return r;
-}
-
-static void __exit vmx_exit(void)
-{
- free_page((unsigned long)vmx_msr_bitmap_legacy);
- free_page((unsigned long)vmx_msr_bitmap_longmode);
- free_page((unsigned long)vmx_io_bitmap_b);
- free_page((unsigned long)vmx_io_bitmap_a);
-
- kvm_exit();
-}
-
-module_init(vmx_init)
-module_exit(vmx_exit)
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
new file mode 100644
index 000000000000..02aadb9d730e
--- /dev/null
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -0,0 +1,407 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMX_CAPS_H
+#define __KVM_X86_VMX_CAPS_H
+
+#include <asm/vmx.h>
+
+#include "../lapic.h"
+#include "../x86.h"
+#include "../pmu.h"
+#include "../cpuid.h"
+
+extern bool __read_mostly enable_vpid;
+extern bool __read_mostly flexpriority_enabled;
+extern bool __read_mostly enable_ept;
+extern bool __read_mostly enable_unrestricted_guest;
+extern bool __read_mostly enable_ept_ad_bits;
+extern bool __read_mostly enable_pml;
+extern int __read_mostly pt_mode;
+
+#define PT_MODE_SYSTEM 0
+#define PT_MODE_HOST_GUEST 1
+
+struct nested_vmx_msrs {
+ /*
+ * We only store the "true" versions of the VMX capability MSRs. We
+ * generate the "non-true" versions by setting the must-be-1 bits
+ * according to the SDM.
+ */
+ u32 procbased_ctls_low;
+ u32 procbased_ctls_high;
+ u32 secondary_ctls_low;
+ u32 secondary_ctls_high;
+ u32 pinbased_ctls_low;
+ u32 pinbased_ctls_high;
+ u32 exit_ctls_low;
+ u32 exit_ctls_high;
+ u32 entry_ctls_low;
+ u32 entry_ctls_high;
+ u32 misc_low;
+ u32 misc_high;
+ u32 ept_caps;
+ u32 vpid_caps;
+ u64 basic;
+ u64 cr0_fixed0;
+ u64 cr0_fixed1;
+ u64 cr4_fixed0;
+ u64 cr4_fixed1;
+ u64 vmcs_enum;
+ u64 vmfunc_controls;
+};
+
+struct vmcs_config {
+ u64 basic;
+ u32 pin_based_exec_ctrl;
+ u32 cpu_based_exec_ctrl;
+ u32 cpu_based_2nd_exec_ctrl;
+ u64 cpu_based_3rd_exec_ctrl;
+ u32 vmexit_ctrl;
+ u32 vmentry_ctrl;
+ u64 misc;
+ struct nested_vmx_msrs nested;
+};
+extern struct vmcs_config vmcs_config __ro_after_init;
+
+struct vmx_capability {
+ u32 ept;
+ u32 vpid;
+};
+extern struct vmx_capability vmx_capability __ro_after_init;
+
+static inline bool cpu_has_vmx_basic_inout(void)
+{
+ return vmcs_config.basic & VMX_BASIC_INOUT;
+}
+
+static inline bool cpu_has_vmx_basic_no_hw_errcode_cc(void)
+{
+ return vmcs_config.basic & VMX_BASIC_NO_HW_ERROR_CODE_CC;
+}
+
+static inline bool cpu_has_virtual_nmis(void)
+{
+ return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS &&
+ vmcs_config.cpu_based_exec_ctrl & CPU_BASED_NMI_WINDOW_EXITING;
+}
+
+static inline bool cpu_has_vmx_preemption_timer(void)
+{
+ return vmcs_config.pin_based_exec_ctrl &
+ PIN_BASED_VMX_PREEMPTION_TIMER;
+}
+
+static inline bool cpu_has_vmx_posted_intr(void)
+{
+ return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
+}
+
+static inline bool cpu_has_load_ia32_efer(void)
+{
+ return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_EFER;
+}
+
+static inline bool cpu_has_load_perf_global_ctrl(void)
+{
+ return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+}
+
+static inline bool cpu_has_load_cet_ctrl(void)
+{
+ return (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_CET_STATE);
+}
+static inline bool cpu_has_vmx_mpx(void)
+{
+ return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS;
+}
+
+static inline bool cpu_has_vmx_tpr_shadow(void)
+{
+ return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
+}
+
+static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu)
+{
+ return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu);
+}
+
+static inline bool cpu_has_vmx_msr_bitmap(void)
+{
+ return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
+}
+
+static inline bool cpu_has_secondary_exec_ctrls(void)
+{
+ return vmcs_config.cpu_based_exec_ctrl &
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+}
+
+static inline bool cpu_has_tertiary_exec_ctrls(void)
+{
+ return vmcs_config.cpu_based_exec_ctrl &
+ CPU_BASED_ACTIVATE_TERTIARY_CONTROLS;
+}
+
+static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+}
+
+static inline bool cpu_has_vmx_ept(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENABLE_EPT;
+}
+
+static inline bool vmx_umip_emulated(void)
+{
+ return !boot_cpu_has(X86_FEATURE_UMIP) &&
+ (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_DESC);
+}
+
+static inline bool cpu_has_vmx_rdtscp(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENABLE_RDTSCP;
+}
+
+static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+}
+
+static inline bool cpu_has_vmx_vpid(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENABLE_VPID;
+}
+
+static inline bool cpu_has_vmx_wbinvd_exit(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_WBINVD_EXITING;
+}
+
+static inline bool cpu_has_vmx_unrestricted_guest(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_UNRESTRICTED_GUEST;
+}
+
+static inline bool cpu_has_vmx_apic_register_virt(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_APIC_REGISTER_VIRT;
+}
+
+static inline bool cpu_has_vmx_virtual_intr_delivery(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
+}
+
+static inline bool cpu_has_vmx_ple(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_PAUSE_LOOP_EXITING;
+}
+
+static inline bool cpu_has_vmx_rdrand(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_RDRAND_EXITING;
+}
+
+static inline bool cpu_has_vmx_invpcid(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENABLE_INVPCID;
+}
+
+static inline bool cpu_has_vmx_vmfunc(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENABLE_VMFUNC;
+}
+
+static inline bool cpu_has_vmx_shadow_vmcs(void)
+{
+ /* check if the cpu supports writing r/o exit information fields */
+ if (!(vmcs_config.misc & VMX_MISC_VMWRITE_SHADOW_RO_FIELDS))
+ return false;
+
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_SHADOW_VMCS;
+}
+
+static inline bool cpu_has_vmx_encls_vmexit(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENCLS_EXITING;
+}
+
+static inline bool cpu_has_vmx_rdseed(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_RDSEED_EXITING;
+}
+
+static inline bool cpu_has_vmx_pml(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML;
+}
+
+static inline bool cpu_has_vmx_xsaves(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENABLE_XSAVES;
+}
+
+static inline bool cpu_has_vmx_waitpkg(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
+}
+
+static inline bool cpu_has_vmx_tsc_scaling(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_TSC_SCALING;
+}
+
+static inline bool cpu_has_vmx_bus_lock_detection(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_BUS_LOCK_DETECTION;
+}
+
+static inline bool cpu_has_vmx_apicv(void)
+{
+ return cpu_has_vmx_apic_register_virt() &&
+ cpu_has_vmx_virtual_intr_delivery() &&
+ cpu_has_vmx_posted_intr();
+}
+
+static inline bool cpu_has_vmx_ipiv(void)
+{
+ return vmcs_config.cpu_based_3rd_exec_ctrl & TERTIARY_EXEC_IPI_VIRT;
+}
+
+static inline bool cpu_has_vmx_flexpriority(void)
+{
+ return cpu_has_vmx_tpr_shadow() &&
+ cpu_has_vmx_virtualize_apic_accesses();
+}
+
+static inline bool cpu_has_vmx_ept_execute_only(void)
+{
+ return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
+}
+
+static inline bool cpu_has_vmx_ept_4levels(void)
+{
+ return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
+}
+
+static inline bool cpu_has_vmx_ept_5levels(void)
+{
+ return vmx_capability.ept & VMX_EPT_PAGE_WALK_5_BIT;
+}
+
+static inline bool cpu_has_vmx_ept_mt_wb(void)
+{
+ return vmx_capability.ept & VMX_EPTP_WB_BIT;
+}
+
+static inline bool cpu_has_vmx_ept_2m_page(void)
+{
+ return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
+}
+
+static inline bool cpu_has_vmx_ept_1g_page(void)
+{
+ return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
+}
+
+static inline int ept_caps_to_lpage_level(u32 ept_caps)
+{
+ if (ept_caps & VMX_EPT_1GB_PAGE_BIT)
+ return PG_LEVEL_1G;
+ if (ept_caps & VMX_EPT_2MB_PAGE_BIT)
+ return PG_LEVEL_2M;
+ return PG_LEVEL_4K;
+}
+
+static inline bool cpu_has_vmx_ept_ad_bits(void)
+{
+ return vmx_capability.ept & VMX_EPT_AD_BIT;
+}
+
+static inline bool cpu_has_vmx_invept_context(void)
+{
+ return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
+}
+
+static inline bool cpu_has_vmx_invept_global(void)
+{
+ return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
+}
+
+static inline bool cpu_has_vmx_invvpid(void)
+{
+ return vmx_capability.vpid & VMX_VPID_INVVPID_BIT;
+}
+
+static inline bool cpu_has_vmx_invvpid_individual_addr(void)
+{
+ return vmx_capability.vpid & VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT;
+}
+
+static inline bool cpu_has_vmx_invvpid_single(void)
+{
+ return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
+}
+
+static inline bool cpu_has_vmx_invvpid_global(void)
+{
+ return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
+}
+
+static inline bool cpu_has_vmx_intel_pt(void)
+{
+ return (vmcs_config.misc & VMX_MISC_INTEL_PT) &&
+ (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_PT_USE_GPA) &&
+ (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_RTIT_CTL);
+}
+
+/*
+ * Processor Trace can operate in one of three modes:
+ * a. system-wide: trace both host/guest and output to host buffer
+ * b. host-only: only trace host and output to host buffer
+ * c. host-guest: trace host and guest simultaneously and output to their
+ * respective buffer
+ *
+ * KVM currently only supports (a) and (c).
+ */
+static inline bool vmx_pt_mode_is_system(void)
+{
+ return pt_mode == PT_MODE_SYSTEM;
+}
+static inline bool vmx_pt_mode_is_host_guest(void)
+{
+ return pt_mode == PT_MODE_HOST_GUEST;
+}
+
+static inline bool vmx_pebs_supported(void)
+{
+ return boot_cpu_has(X86_FEATURE_PEBS) && kvm_pmu_cap.pebs_ept;
+}
+
+static inline bool cpu_has_notify_vmexit(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_NOTIFY_VM_EXITING;
+}
+
+#endif /* __KVM_X86_VMX_CAPS_H */
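A note for readers on the pattern above: setup_vmcs_config() fills the vmcs_config cache once at hardware setup, and every cpu_has_vmx_*() helper is then just a bit test against that cache. The stand-alone sketch below illustrates the same shape; it is not part of the patch, and the DEMO_* names and bit values are invented for the example.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-ins for two SECONDARY_EXEC_* control bits (values invented). */
#define DEMO_EXEC_ENABLE_EPT   (1u << 1)
#define DEMO_EXEC_ENABLE_VPID  (1u << 5)

/* Stand-in for the vmcs_config cache filled once at hardware setup. */
static struct {
        uint32_t cpu_based_2nd_exec_ctrl;
} demo_vmcs_config;

/* Same shape as cpu_has_vmx_ept(): a bit test against the cached word. */
static inline bool demo_cpu_has_ept(void)
{
        return demo_vmcs_config.cpu_based_2nd_exec_ctrl & DEMO_EXEC_ENABLE_EPT;
}

int main(void)
{
        /* Pretend setup found EPT support but not VPID. */
        demo_vmcs_config.cpu_based_2nd_exec_ctrl = DEMO_EXEC_ENABLE_EPT;

        printf("ept:  %d\n", demo_cpu_has_ept());
        printf("vpid: %d\n", !!(demo_vmcs_config.cpu_based_2nd_exec_ctrl &
                                DEMO_EXEC_ENABLE_VPID));
        return 0;
}

The real helpers differ only in which cached control word they read and which architecturally defined bit they test.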
diff --git a/arch/x86/kvm/vmx/common.h b/arch/x86/kvm/vmx/common.h
new file mode 100644
index 000000000000..412d0829d7a2
--- /dev/null
+++ b/arch/x86/kvm/vmx/common.h
@@ -0,0 +1,180 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __KVM_X86_VMX_COMMON_H
+#define __KVM_X86_VMX_COMMON_H
+
+#include <linux/kvm_host.h>
+#include <asm/posted_intr.h>
+
+#include "mmu.h"
+
+union vmx_exit_reason {
+ struct {
+ u32 basic : 16;
+ u32 reserved16 : 1;
+ u32 reserved17 : 1;
+ u32 reserved18 : 1;
+ u32 reserved19 : 1;
+ u32 reserved20 : 1;
+ u32 reserved21 : 1;
+ u32 reserved22 : 1;
+ u32 reserved23 : 1;
+ u32 reserved24 : 1;
+ u32 reserved25 : 1;
+ u32 bus_lock_detected : 1;
+ u32 enclave_mode : 1;
+ u32 smi_pending_mtf : 1;
+ u32 smi_from_vmx_root : 1;
+ u32 reserved30 : 1;
+ u32 failed_vmentry : 1;
+ };
+ u32 full;
+};
+
+struct vcpu_vt {
+ /* Posted interrupt descriptor */
+ struct pi_desc pi_desc;
+
+ /* Used if this vCPU is waiting for PI notification wakeup. */
+ struct list_head pi_wakeup_list;
+
+ union vmx_exit_reason exit_reason;
+
+ unsigned long exit_qualification;
+ u32 exit_intr_info;
+
+ /*
+ * If true, guest state has been loaded into hardware, and host state
+ * saved into vcpu_{vt,vmx,tdx}. If false, host state is loaded into
+ * hardware.
+ */
+ bool guest_state_loaded;
+ bool emulation_required;
+
+#ifdef CONFIG_X86_64
+ u64 msr_host_kernel_gs_base;
+#endif
+};
+
+#ifdef CONFIG_KVM_INTEL_TDX
+
+static __always_inline bool is_td(struct kvm *kvm)
+{
+ return kvm->arch.vm_type == KVM_X86_TDX_VM;
+}
+
+static __always_inline bool is_td_vcpu(struct kvm_vcpu *vcpu)
+{
+ return is_td(vcpu->kvm);
+}
+
+#else
+
+static __always_inline bool is_td(struct kvm *kvm) { return false; }
+static __always_inline bool is_td_vcpu(struct kvm_vcpu *vcpu) { return false; }
+
+#endif
+
+static inline bool vt_is_tdx_private_gpa(struct kvm *kvm, gpa_t gpa)
+{
+ /* For TDX the direct mask is the shared mask. */
+ return !kvm_is_addr_direct(kvm, gpa);
+}
+
+static inline int __vmx_handle_ept_violation(struct kvm_vcpu *vcpu, gpa_t gpa,
+ unsigned long exit_qualification)
+{
+ u64 error_code;
+
+ /* Is it a read fault? */
+ error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
+ ? PFERR_USER_MASK : 0;
+ /* Is it a write fault? */
+ error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
+ ? PFERR_WRITE_MASK : 0;
+ /* Is it a fetch fault? */
+ error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
+ ? PFERR_FETCH_MASK : 0;
+ /* ept page table entry is present? */
+ error_code |= (exit_qualification & EPT_VIOLATION_PROT_MASK)
+ ? PFERR_PRESENT_MASK : 0;
+
+ if (exit_qualification & EPT_VIOLATION_GVA_IS_VALID)
+ error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ?
+ PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
+
+ if (vt_is_tdx_private_gpa(vcpu->kvm, gpa))
+ error_code |= PFERR_PRIVATE_ACCESS;
+
+ return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
+}
+
+static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
+ int pi_vec)
+{
+#ifdef CONFIG_SMP
+ if (vcpu->mode == IN_GUEST_MODE) {
+ /*
+ * The vector of the virtual interrupt has already been set in the PIR.
+ * Send a notification event to deliver the virtual interrupt
+ * unless the vCPU is the currently running vCPU, i.e. the
+ * event is being sent from a fastpath VM-Exit handler, in
+ * which case the PIR will be synced to the vIRR before
+ * re-entering the guest.
+ *
+ * When the target is not the running vCPU, the following
+ * possibilities emerge:
+ *
+ * Case 1: vCPU stays in non-root mode. Sending a notification
+ * event posts the interrupt to the vCPU.
+ *
+ * Case 2: vCPU exits to root mode and is still runnable. The
+ * PIR will be synced to the vIRR before re-entering the guest.
+ * Sending a notification event is ok as the host IRQ handler
+ * will ignore the spurious event.
+ *
+ * Case 3: vCPU exits to root mode and is blocked. vcpu_block()
+ * has already synced PIR to vIRR and never blocks the vCPU if
+ * the vIRR is not empty. Therefore, a blocked vCPU here does
+ * not wait for any requested interrupts in PIR, and sending a
+ * notification event also results in a benign, spurious event.
+ */
+
+ if (vcpu != kvm_get_running_vcpu())
+ __apic_send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
+ return;
+ }
+#endif
+ /*
+ * The vCPU isn't in the guest; wake the vCPU in case it is blocking,
+ * otherwise do nothing as KVM will grab the highest priority pending
+ * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest().
+ */
+ kvm_vcpu_wake_up(vcpu);
+}
+
+/*
+ * Post an interrupt to a vCPU's PIR and trigger the vCPU to process the
+ * interrupt if necessary.
+ */
+static inline void __vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu,
+ struct pi_desc *pi_desc, int vector)
+{
+ if (pi_test_and_set_pir(vector, pi_desc))
+ return;
+
+ /* If a previous notification has sent the IPI, nothing to do. */
+ if (pi_test_and_set_on(pi_desc))
+ return;
+
+ /*
+ * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*()
+ * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is
+ * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a
+ * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE.
+ */
+ kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR);
+}
+
+noinstr void vmx_handle_nmi(struct kvm_vcpu *vcpu);
+
+#endif /* __KVM_X86_VMX_COMMON_H */
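For context on __vmx_handle_ept_violation() above: all it does before calling into the MMU is translate EPT-violation exit-qualification bits into page-fault error-code bits. The sketch below mirrors that translation in isolation; the DEMO_* constants are invented stand-ins, not the kernel's EPT_VIOLATION_*/PFERR_* definitions.

#include <stdint.h>
#include <stdio.h>

/* Invented stand-ins for EPT_VIOLATION_* exit-qualification bits. */
#define DEMO_EPT_ACC_READ   (1ull << 0)
#define DEMO_EPT_ACC_WRITE  (1ull << 1)
#define DEMO_EPT_ACC_INSTR  (1ull << 2)

/* Invented stand-ins for PFERR_* page-fault error-code bits. */
#define DEMO_PFERR_WRITE    (1ull << 1)
#define DEMO_PFERR_USER     (1ull << 2)
#define DEMO_PFERR_FETCH    (1ull << 4)

/* Mirrors the shape of the translation in __vmx_handle_ept_violation(). */
static uint64_t demo_qual_to_error_code(uint64_t qual)
{
        uint64_t error_code;

        error_code  = (qual & DEMO_EPT_ACC_READ)  ? DEMO_PFERR_USER  : 0;
        error_code |= (qual & DEMO_EPT_ACC_WRITE) ? DEMO_PFERR_WRITE : 0;
        error_code |= (qual & DEMO_EPT_ACC_INSTR) ? DEMO_PFERR_FETCH : 0;
        return error_code;
}

int main(void)
{
        /* A guest write that faulted in the EPT tables. */
        uint64_t qual = DEMO_EPT_ACC_WRITE;

        printf("error_code = %#llx\n",
               (unsigned long long)demo_qual_to_error_code(qual));
        return 0;
}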
diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c
new file mode 100644
index 000000000000..fa41d036acd4
--- /dev/null
+++ b/arch/x86/kvm/vmx/hyperv.c
@@ -0,0 +1,230 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/errno.h>
+#include <linux/smp.h>
+
+#include "x86.h"
+#include "../cpuid.h"
+#include "hyperv.h"
+#include "nested.h"
+#include "vmcs.h"
+#include "vmx.h"
+#include "trace.h"
+
+#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
+
+u64 nested_get_evmptr(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ if (unlikely(kvm_hv_get_assist_page(vcpu)))
+ return EVMPTR_INVALID;
+
+ if (unlikely(!hv_vcpu->vp_assist_page.enlighten_vmentry))
+ return EVMPTR_INVALID;
+
+ return hv_vcpu->vp_assist_page.current_nested_vmcs;
+}
+
+uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu)
+{
+ /*
+ * vmcs_version represents the range of supported Enlightened VMCS
+ * versions: the lower 8 bits hold the minimal version, the higher 8 bits
+ * hold the maximum supported version. KVM supports versions from 1 to
+ * KVM_EVMCS_VERSION.
+ *
+ * Note, do not check whether Hyper-V is fully enabled in guest CPUID;
+ * this helper is used to _get_ the vCPU's supported CPUID.
+ */
+ if (kvm_cpu_cap_get(X86_FEATURE_VMX) &&
+ (!vcpu || to_vmx(vcpu)->nested.enlightened_vmcs_enabled))
+ return (KVM_EVMCS_VERSION << 8) | 1;
+
+ return 0;
+}
+
+enum evmcs_revision {
+ EVMCSv1_LEGACY,
+ NR_EVMCS_REVISIONS,
+};
+
+enum evmcs_ctrl_type {
+ EVMCS_EXIT_CTRLS,
+ EVMCS_ENTRY_CTRLS,
+ EVMCS_EXEC_CTRL,
+ EVMCS_2NDEXEC,
+ EVMCS_3RDEXEC,
+ EVMCS_PINCTRL,
+ EVMCS_VMFUNC,
+ NR_EVMCS_CTRLS,
+};
+
+static const u32 evmcs_supported_ctrls[NR_EVMCS_CTRLS][NR_EVMCS_REVISIONS] = {
+ [EVMCS_EXIT_CTRLS] = {
+ [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_VMEXIT_CTRL,
+ },
+ [EVMCS_ENTRY_CTRLS] = {
+ [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_VMENTRY_CTRL,
+ },
+ [EVMCS_EXEC_CTRL] = {
+ [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_EXEC_CTRL,
+ },
+ [EVMCS_2NDEXEC] = {
+ [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_2NDEXEC & ~SECONDARY_EXEC_TSC_SCALING,
+ },
+ [EVMCS_3RDEXEC] = {
+ [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_3RDEXEC,
+ },
+ [EVMCS_PINCTRL] = {
+ [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_PINCTRL,
+ },
+ [EVMCS_VMFUNC] = {
+ [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_VMFUNC,
+ },
+};
+
+static u32 evmcs_get_supported_ctls(enum evmcs_ctrl_type ctrl_type)
+{
+ enum evmcs_revision evmcs_rev = EVMCSv1_LEGACY;
+
+ return evmcs_supported_ctrls[ctrl_type][evmcs_rev];
+}
+
+static bool evmcs_has_perf_global_ctrl(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ /*
+ * PERF_GLOBAL_CTRL has a quirk where some Windows guests may fail to
+ * boot if a PV CPUID feature flag is not also set. Treat the fields
+ * as unsupported if the flag is not set in guest CPUID. This should
+ * be called only for guest accesses, and all guest accesses should be
+ * gated on Hyper-V being enabled and initialized.
+ */
+ if (WARN_ON_ONCE(!hv_vcpu))
+ return false;
+
+ return hv_vcpu->cpuid_cache.nested_ebx & HV_X64_NESTED_EVMCS1_PERF_GLOBAL_CTRL;
+}
+
+void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
+{
+ u32 ctl_low = (u32)*pdata;
+ u32 ctl_high = (u32)(*pdata >> 32);
+ u32 supported_ctrls;
+
+ /*
+ * Hyper-V 2016 and 2019 try using these features even when eVMCS
+ * is enabled but the corresponding eVMCS fields do not exist.
+ */
+ switch (msr_index) {
+ case MSR_IA32_VMX_EXIT_CTLS:
+ case MSR_IA32_VMX_TRUE_EXIT_CTLS:
+ supported_ctrls = evmcs_get_supported_ctls(EVMCS_EXIT_CTRLS);
+ if (!evmcs_has_perf_global_ctrl(vcpu))
+ supported_ctrls &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
+ ctl_high &= supported_ctrls;
+ break;
+ case MSR_IA32_VMX_ENTRY_CTLS:
+ case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
+ supported_ctrls = evmcs_get_supported_ctls(EVMCS_ENTRY_CTRLS);
+ if (!evmcs_has_perf_global_ctrl(vcpu))
+ supported_ctrls &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+ ctl_high &= supported_ctrls;
+ break;
+ case MSR_IA32_VMX_PROCBASED_CTLS:
+ case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
+ ctl_high &= evmcs_get_supported_ctls(EVMCS_EXEC_CTRL);
+ break;
+ case MSR_IA32_VMX_PROCBASED_CTLS2:
+ ctl_high &= evmcs_get_supported_ctls(EVMCS_2NDEXEC);
+ break;
+ case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
+ case MSR_IA32_VMX_PINBASED_CTLS:
+ ctl_high &= evmcs_get_supported_ctls(EVMCS_PINCTRL);
+ break;
+ case MSR_IA32_VMX_VMFUNC:
+ ctl_low &= evmcs_get_supported_ctls(EVMCS_VMFUNC);
+ break;
+ }
+
+ *pdata = ctl_low | ((u64)ctl_high << 32);
+}
+
+static bool nested_evmcs_is_valid_controls(enum evmcs_ctrl_type ctrl_type,
+ u32 val)
+{
+ return !(val & ~evmcs_get_supported_ctls(ctrl_type));
+}
+
+int nested_evmcs_check_controls(struct vmcs12 *vmcs12)
+{
+ if (CC(!nested_evmcs_is_valid_controls(EVMCS_PINCTRL,
+ vmcs12->pin_based_vm_exec_control)))
+ return -EINVAL;
+
+ if (CC(!nested_evmcs_is_valid_controls(EVMCS_EXEC_CTRL,
+ vmcs12->cpu_based_vm_exec_control)))
+ return -EINVAL;
+
+ if (CC(!nested_evmcs_is_valid_controls(EVMCS_2NDEXEC,
+ vmcs12->secondary_vm_exec_control)))
+ return -EINVAL;
+
+ if (CC(!nested_evmcs_is_valid_controls(EVMCS_EXIT_CTRLS,
+ vmcs12->vm_exit_controls)))
+ return -EINVAL;
+
+ if (CC(!nested_evmcs_is_valid_controls(EVMCS_ENTRY_CTRLS,
+ vmcs12->vm_entry_controls)))
+ return -EINVAL;
+
+ /*
+ * VM-Func controls are 64-bit, but KVM currently doesn't support any
+ * controls in bits 63:32, i.e. dropping those bits on the consistency
+ * check is intentional.
+ */
+ if (WARN_ON_ONCE(vmcs12->vm_function_control >> 32))
+ return -EINVAL;
+
+ if (CC(!nested_evmcs_is_valid_controls(EVMCS_VMFUNC,
+ vmcs12->vm_function_control)))
+ return -EINVAL;
+
+ return 0;
+}
+
+int nested_enable_evmcs(struct kvm_vcpu *vcpu,
+ uint16_t *vmcs_version)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ vmx->nested.enlightened_vmcs_enabled = true;
+
+ if (vmcs_version)
+ *vmcs_version = nested_get_evmcs_version(vcpu);
+
+ return 0;
+}
+
+bool nested_evmcs_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
+
+ if (!hv_vcpu || !evmcs)
+ return false;
+
+ if (!evmcs->hv_enlightenments_control.nested_flush_hypercall)
+ return false;
+
+ return hv_vcpu->vp_assist_page.nested_control.features.directhypercall;
+}
+
+void vmx_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu)
+{
+ nested_vmx_vmexit(vcpu, HV_VMX_SYNTHETIC_EXIT_REASON_TRAP_AFTER_FLUSH, 0, 0);
+}
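For context on nested_evmcs_filter_control_msr() above: it splits the 64-bit control MSR value into 32-bit halves, masks one half against the controls eVMCSv1 supports, and recombines the result. A minimal stand-alone sketch of that split/mask/recombine step follows; the values and the helper name are invented for the example.

#include <stdint.h>
#include <stdio.h>

/* Split a 64-bit control MSR value, mask one half, and recombine. */
static uint64_t demo_filter_ctl_msr(uint64_t msr_val, uint32_t supported_high)
{
        uint32_t ctl_low  = (uint32_t)msr_val;
        uint32_t ctl_high = (uint32_t)(msr_val >> 32);

        ctl_high &= supported_high;     /* drop unsupported controls */
        return (uint64_t)ctl_low | ((uint64_t)ctl_high << 32);
}

int main(void)
{
        /* Hardware reports bits 0-3 in the high half; only 0-1 are supported. */
        uint64_t raw      = (uint64_t)0xf << 32;
        uint64_t filtered = demo_filter_ctl_msr(raw, 0x3);

        printf("raw      = %#018llx\n", (unsigned long long)raw);
        printf("filtered = %#018llx\n", (unsigned long long)filtered);
        return 0;
}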
diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h
new file mode 100644
index 000000000000..11a339009781
--- /dev/null
+++ b/arch/x86/kvm/vmx/hyperv.h
@@ -0,0 +1,90 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMX_HYPERV_H
+#define __KVM_X86_VMX_HYPERV_H
+
+#include <linux/kvm_host.h>
+#include "vmcs12.h"
+#include "vmx.h"
+
+#define EVMPTR_INVALID (-1ULL)
+#define EVMPTR_MAP_PENDING (-2ULL)
+
+enum nested_evmptrld_status {
+ EVMPTRLD_DISABLED,
+ EVMPTRLD_SUCCEEDED,
+ EVMPTRLD_VMFAIL,
+ EVMPTRLD_ERROR,
+};
+
+#ifdef CONFIG_KVM_HYPERV
+static inline bool evmptr_is_valid(u64 evmptr)
+{
+ return evmptr != EVMPTR_INVALID && evmptr != EVMPTR_MAP_PENDING;
+}
+
+static inline bool nested_vmx_is_evmptr12_valid(struct vcpu_vmx *vmx)
+{
+ return evmptr_is_valid(vmx->nested.hv_evmcs_vmptr);
+}
+
+static inline bool evmptr_is_set(u64 evmptr)
+{
+ return evmptr != EVMPTR_INVALID;
+}
+
+static inline bool nested_vmx_is_evmptr12_set(struct vcpu_vmx *vmx)
+{
+ return evmptr_is_set(vmx->nested.hv_evmcs_vmptr);
+}
+
+static inline struct hv_enlightened_vmcs *nested_vmx_evmcs(struct vcpu_vmx *vmx)
+{
+ return vmx->nested.hv_evmcs;
+}
+
+static inline bool guest_cpu_cap_has_evmcs(struct kvm_vcpu *vcpu)
+{
+ /*
+ * eVMCS is exposed to the guest if Hyper-V is enabled in CPUID and
+ * eVMCS has been explicitly enabled by userspace.
+ */
+ return vcpu->arch.hyperv_enabled &&
+ to_vmx(vcpu)->nested.enlightened_vmcs_enabled;
+}
+
+u64 nested_get_evmptr(struct kvm_vcpu *vcpu);
+uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu);
+int nested_enable_evmcs(struct kvm_vcpu *vcpu,
+ uint16_t *vmcs_version);
+void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
+int nested_evmcs_check_controls(struct vmcs12 *vmcs12);
+bool nested_evmcs_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu);
+void vmx_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu);
+#else
+static inline bool evmptr_is_valid(u64 evmptr)
+{
+ return false;
+}
+
+static inline bool nested_vmx_is_evmptr12_valid(struct vcpu_vmx *vmx)
+{
+ return false;
+}
+
+static inline bool evmptr_is_set(u64 evmptr)
+{
+ return false;
+}
+
+static inline bool nested_vmx_is_evmptr12_set(struct vcpu_vmx *vmx)
+{
+ return false;
+}
+
+static inline struct hv_enlightened_vmcs *nested_vmx_evmcs(struct vcpu_vmx *vmx)
+{
+ return NULL;
+}
+#endif
+
+#endif /* __KVM_X86_VMX_HYPERV_H */
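One remark on the #else branch above: with CONFIG_KVM_HYPERV disabled, the helpers collapse into constant-false inline stubs, so call sites compile unchanged and the optimizer removes the dead paths. The toy example below shows the same idiom; DEMO_CONFIG_FEATURE and the helper are invented for illustration.

#include <stdbool.h>
#include <stdio.h>

/* Stands in for CONFIG_KVM_HYPERV; flip to 0 to compile the stub instead. */
#define DEMO_CONFIG_FEATURE 1

#if DEMO_CONFIG_FEATURE
static inline bool demo_feature_active(int flag)
{
        return flag != 0;               /* the "real" implementation */
}
#else
static inline bool demo_feature_active(int flag)
{
        (void)flag;
        return false;                   /* compiled-out stub; callers unchanged */
}
#endif

int main(void)
{
        printf("active: %d\n", demo_feature_active(1));
        return 0;
}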
diff --git a/arch/x86/kvm/vmx/hyperv_evmcs.c b/arch/x86/kvm/vmx/hyperv_evmcs.c
new file mode 100644
index 000000000000..904bfcd1519b
--- /dev/null
+++ b/arch/x86/kvm/vmx/hyperv_evmcs.c
@@ -0,0 +1,315 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file contains common code for working with Enlightened VMCS which is
+ * used both by Hyper-V on KVM and KVM on Hyper-V.
+ */
+
+#include "hyperv_evmcs.h"
+
+#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x)
+#define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \
+ {EVMCS1_OFFSET(name), clean_field}
+
+const struct evmcs_field vmcs_field_to_evmcs_1[] = {
+ /* 64 bit rw */
+ EVMCS1_FIELD(GUEST_RIP, guest_rip,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(GUEST_RSP, guest_rsp,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC),
+ EVMCS1_FIELD(GUEST_RFLAGS, guest_rflags,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC),
+ EVMCS1_FIELD(HOST_IA32_PAT, host_ia32_pat,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_IA32_EFER, host_ia32_efer,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_CR0, host_cr0,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_CR3, host_cr3,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_CR4, host_cr4,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_RIP, host_rip,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(IO_BITMAP_A, io_bitmap_a,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP),
+ EVMCS1_FIELD(IO_BITMAP_B, io_bitmap_b,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP),
+ EVMCS1_FIELD(MSR_BITMAP, msr_bitmap,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP),
+ EVMCS1_FIELD(GUEST_ES_BASE, guest_es_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_CS_BASE, guest_cs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_SS_BASE, guest_ss_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_DS_BASE, guest_ds_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_FS_BASE, guest_fs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GS_BASE, guest_gs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_LDTR_BASE, guest_ldtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_TR_BASE, guest_tr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GDTR_BASE, guest_gdtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_IDTR_BASE, guest_idtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(TSC_OFFSET, tsc_offset,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2),
+ EVMCS1_FIELD(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2),
+ EVMCS1_FIELD(VMCS_LINK_POINTER, vmcs_link_pointer,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_IA32_PAT, guest_ia32_pat,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_IA32_EFER, guest_ia32_efer,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_PDPTR0, guest_pdptr0,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_PDPTR1, guest_pdptr1,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_PDPTR2, guest_pdptr2,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_PDPTR3, guest_pdptr3,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(CR0_READ_SHADOW, cr0_read_shadow,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(CR4_READ_SHADOW, cr4_read_shadow,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(GUEST_CR0, guest_cr0,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(GUEST_CR3, guest_cr3,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(GUEST_CR4, guest_cr4,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(GUEST_DR7, guest_dr7,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(HOST_FS_BASE, host_fs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(HOST_GS_BASE, host_gs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(HOST_TR_BASE, host_tr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(HOST_GDTR_BASE, host_gdtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(HOST_IDTR_BASE, host_idtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(HOST_RSP, host_rsp,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(EPT_POINTER, ept_pointer,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT),
+ EVMCS1_FIELD(GUEST_BNDCFGS, guest_bndcfgs,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(XSS_EXIT_BITMAP, xss_exit_bitmap,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2),
+ EVMCS1_FIELD(ENCLS_EXITING_BITMAP, encls_exiting_bitmap,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2),
+ EVMCS1_FIELD(TSC_MULTIPLIER, tsc_multiplier,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2),
+ /*
+ * Not used by KVM:
+ *
+ * EVMCS1_FIELD(0x00006828, guest_ia32_s_cet,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ * EVMCS1_FIELD(0x0000682A, guest_ssp,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC),
+ * EVMCS1_FIELD(0x0000682C, guest_ia32_int_ssp_table_addr,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ * EVMCS1_FIELD(0x00002816, guest_ia32_lbr_ctl,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ * EVMCS1_FIELD(0x00006C18, host_ia32_s_cet,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ * EVMCS1_FIELD(0x00006C1A, host_ssp,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ * EVMCS1_FIELD(0x00006C1C, host_ia32_int_ssp_table_addr,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ */
+
+ /* 64 bit read only */
+ EVMCS1_FIELD(GUEST_PHYSICAL_ADDRESS, guest_physical_address,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(EXIT_QUALIFICATION, exit_qualification,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ /*
+ * Not defined in KVM:
+ *
+ * EVMCS1_FIELD(0x00006402, exit_io_instruction_ecx,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
+ * EVMCS1_FIELD(0x00006404, exit_io_instruction_esi,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
+ * EVMCS1_FIELD(0x00006406, exit_io_instruction_esi,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
+ * EVMCS1_FIELD(0x00006408, exit_io_instruction_eip,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
+ */
+ EVMCS1_FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+
+ /*
+ * No mask defined in the spec as Hyper-V doesn't currently support
+ * these. Future proof by resetting the whole clean field mask on
+ * access.
+ */
+ EVMCS1_FIELD(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+
+ /* 32 bit rw */
+ EVMCS1_FIELD(TPR_THRESHOLD, tpr_threshold,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC),
+ EVMCS1_FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC),
+ EVMCS1_FIELD(EXCEPTION_BITMAP, exception_bitmap,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN),
+ EVMCS1_FIELD(VM_ENTRY_CONTROLS, vm_entry_controls,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY),
+ EVMCS1_FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT),
+ EVMCS1_FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE,
+ vm_entry_exception_error_code,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT),
+ EVMCS1_FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT),
+ EVMCS1_FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1),
+ EVMCS1_FIELD(VM_EXIT_CONTROLS, vm_exit_controls,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1),
+ EVMCS1_FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1),
+ EVMCS1_FIELD(GUEST_ES_LIMIT, guest_es_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_CS_LIMIT, guest_cs_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_SS_LIMIT, guest_ss_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_DS_LIMIT, guest_ds_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_FS_LIMIT, guest_fs_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GS_LIMIT, guest_gs_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_TR_LIMIT, guest_tr_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_ACTIVITY_STATE, guest_activity_state,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+
+ /* 32 bit read only */
+ EVMCS1_FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(VM_EXIT_REASON, vm_exit_reason,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+
+ /* No mask defined in the spec (not used) */
+ EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(CR3_TARGET_COUNT, cr3_target_count,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+
+ /* 16 bit rw */
+ EVMCS1_FIELD(HOST_ES_SELECTOR, host_es_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_CS_SELECTOR, host_cs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_SS_SELECTOR, host_ss_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_DS_SELECTOR, host_ds_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_FS_SELECTOR, host_fs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_GS_SELECTOR, host_gs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_TR_SELECTOR, host_tr_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(GUEST_ES_SELECTOR, guest_es_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_CS_SELECTOR, guest_cs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_SS_SELECTOR, guest_ss_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_DS_SELECTOR, guest_ds_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_FS_SELECTOR, guest_fs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GS_SELECTOR, guest_gs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_TR_SELECTOR, guest_tr_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT),
+};
+const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1);
diff --git a/arch/x86/kvm/vmx/hyperv_evmcs.h b/arch/x86/kvm/vmx/hyperv_evmcs.h
new file mode 100644
index 000000000000..6536290f4274
--- /dev/null
+++ b/arch/x86/kvm/vmx/hyperv_evmcs.h
@@ -0,0 +1,166 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This file contains common definitions for working with Enlightened VMCS which
+ * are used both by Hyper-V on KVM and KVM on Hyper-V.
+ */
+#ifndef __KVM_X86_VMX_HYPERV_EVMCS_H
+#define __KVM_X86_VMX_HYPERV_EVMCS_H
+
+#include <hyperv/hvhdk.h>
+
+#include "capabilities.h"
+#include "vmcs12.h"
+
+#define KVM_EVMCS_VERSION 1
+
+/*
+ * Enlightened VMCSv1 doesn't support these:
+ *
+ * POSTED_INTR_NV = 0x00000002,
+ * GUEST_INTR_STATUS = 0x00000810,
+ * APIC_ACCESS_ADDR = 0x00002014,
+ * POSTED_INTR_DESC_ADDR = 0x00002016,
+ * EOI_EXIT_BITMAP0 = 0x0000201c,
+ * EOI_EXIT_BITMAP1 = 0x0000201e,
+ * EOI_EXIT_BITMAP2 = 0x00002020,
+ * EOI_EXIT_BITMAP3 = 0x00002022,
+ * GUEST_PML_INDEX = 0x00000812,
+ * PML_ADDRESS = 0x0000200e,
+ * VM_FUNCTION_CONTROL = 0x00002018,
+ * EPTP_LIST_ADDRESS = 0x00002024,
+ * VMREAD_BITMAP = 0x00002026,
+ * VMWRITE_BITMAP = 0x00002028,
+ *
+ * TSC_MULTIPLIER = 0x00002032,
+ * PLE_GAP = 0x00004020,
+ * PLE_WINDOW = 0x00004022,
+ * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E,
+ *
+ * Currently unsupported in KVM:
+ * GUEST_IA32_RTIT_CTL = 0x00002814,
+ */
+#define EVMCS1_SUPPORTED_PINCTRL \
+ (PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \
+ PIN_BASED_EXT_INTR_MASK | \
+ PIN_BASED_NMI_EXITING | \
+ PIN_BASED_VIRTUAL_NMIS)
+
+#define EVMCS1_SUPPORTED_EXEC_CTRL \
+ (CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \
+ CPU_BASED_HLT_EXITING | \
+ CPU_BASED_CR3_LOAD_EXITING | \
+ CPU_BASED_CR3_STORE_EXITING | \
+ CPU_BASED_UNCOND_IO_EXITING | \
+ CPU_BASED_MOV_DR_EXITING | \
+ CPU_BASED_USE_TSC_OFFSETTING | \
+ CPU_BASED_MWAIT_EXITING | \
+ CPU_BASED_MONITOR_EXITING | \
+ CPU_BASED_INVLPG_EXITING | \
+ CPU_BASED_RDPMC_EXITING | \
+ CPU_BASED_INTR_WINDOW_EXITING | \
+ CPU_BASED_CR8_LOAD_EXITING | \
+ CPU_BASED_CR8_STORE_EXITING | \
+ CPU_BASED_RDTSC_EXITING | \
+ CPU_BASED_TPR_SHADOW | \
+ CPU_BASED_USE_IO_BITMAPS | \
+ CPU_BASED_MONITOR_TRAP_FLAG | \
+ CPU_BASED_USE_MSR_BITMAPS | \
+ CPU_BASED_NMI_WINDOW_EXITING | \
+ CPU_BASED_PAUSE_EXITING | \
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
+
+#define EVMCS1_SUPPORTED_2NDEXEC \
+ (SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | \
+ SECONDARY_EXEC_WBINVD_EXITING | \
+ SECONDARY_EXEC_ENABLE_VPID | \
+ SECONDARY_EXEC_ENABLE_EPT | \
+ SECONDARY_EXEC_UNRESTRICTED_GUEST | \
+ SECONDARY_EXEC_DESC | \
+ SECONDARY_EXEC_ENABLE_RDTSCP | \
+ SECONDARY_EXEC_ENABLE_INVPCID | \
+ SECONDARY_EXEC_ENABLE_XSAVES | \
+ SECONDARY_EXEC_RDSEED_EXITING | \
+ SECONDARY_EXEC_RDRAND_EXITING | \
+ SECONDARY_EXEC_TSC_SCALING | \
+ SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | \
+ SECONDARY_EXEC_PT_USE_GPA | \
+ SECONDARY_EXEC_PT_CONCEAL_VMX | \
+ SECONDARY_EXEC_BUS_LOCK_DETECTION | \
+ SECONDARY_EXEC_NOTIFY_VM_EXITING | \
+ SECONDARY_EXEC_ENCLS_EXITING)
+
+#define EVMCS1_SUPPORTED_3RDEXEC (0ULL)
+
+#define EVMCS1_SUPPORTED_VMEXIT_CTRL \
+ (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | \
+ VM_EXIT_SAVE_DEBUG_CONTROLS | \
+ VM_EXIT_ACK_INTR_ON_EXIT | \
+ VM_EXIT_HOST_ADDR_SPACE_SIZE | \
+ VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \
+ VM_EXIT_SAVE_IA32_PAT | \
+ VM_EXIT_LOAD_IA32_PAT | \
+ VM_EXIT_SAVE_IA32_EFER | \
+ VM_EXIT_LOAD_IA32_EFER | \
+ VM_EXIT_CLEAR_BNDCFGS | \
+ VM_EXIT_PT_CONCEAL_PIP | \
+ VM_EXIT_CLEAR_IA32_RTIT_CTL)
+
+#define EVMCS1_SUPPORTED_VMENTRY_CTRL \
+ (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | \
+ VM_ENTRY_LOAD_DEBUG_CONTROLS | \
+ VM_ENTRY_IA32E_MODE | \
+ VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | \
+ VM_ENTRY_LOAD_IA32_PAT | \
+ VM_ENTRY_LOAD_IA32_EFER | \
+ VM_ENTRY_LOAD_BNDCFGS | \
+ VM_ENTRY_PT_CONCEAL_PIP | \
+ VM_ENTRY_LOAD_IA32_RTIT_CTL)
+
+#define EVMCS1_SUPPORTED_VMFUNC (0)
+
+struct evmcs_field {
+ u16 offset;
+ u16 clean_field;
+};
+
+extern const struct evmcs_field vmcs_field_to_evmcs_1[];
+extern const unsigned int nr_evmcs_1_fields;
+
+static __always_inline int evmcs_field_offset(unsigned long field,
+ u16 *clean_field)
+{
+ const struct evmcs_field *evmcs_field;
+ unsigned int index = ROL16(field, 6);
+
+ if (unlikely(index >= nr_evmcs_1_fields))
+ return -ENOENT;
+
+ evmcs_field = &vmcs_field_to_evmcs_1[index];
+
+ /*
+ * Use offset=0 to detect holes in eVMCS. This offset belongs to
+ * 'revision_id' but this field has no encoding and is supposed to
+ * be accessed directly.
+ */
+ if (unlikely(!evmcs_field->offset))
+ return -ENOENT;
+
+ if (clean_field)
+ *clean_field = evmcs_field->clean_field;
+
+ return evmcs_field->offset;
+}
+
+static inline u64 evmcs_read_any(struct hv_enlightened_vmcs *evmcs,
+ unsigned long field, u16 offset)
+{
+ /*
+ * vmcs12_read_any() doesn't care whether the supplied structure
+ * is 'struct vmcs12' or 'struct hv_enlightened_vmcs' as it takes
+ * the exact offset of the required field; use it here for
+ * convenience.
+ */
+ return vmcs12_read_any((void *)evmcs, field, offset);
+}
+
+#endif /* __KVM_X86_VMX_HYPERV_EVMCS_H */
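A closing note on evmcs_field_offset() above: the lookup table is indexed by ROL16(field, 6), the same rotation EVMCS1_FIELD() uses as its designated-initializer index, so a field lookup is a 16-bit rotate plus an array read. The stand-alone sketch below shows only the rotation; 0x681e is the architectural VMCS encoding of GUEST_RIP, and the helper name is invented.

#include <stdint.h>
#include <stdio.h>

/* The same 16-bit rotate the EVMCS1_FIELD() initializers use (n must be 1..15). */
static inline uint16_t demo_rol16(uint16_t val, unsigned int n)
{
        return (uint16_t)((val << n) | (val >> (16 - n)));
}

int main(void)
{
        uint16_t guest_rip = 0x681e;    /* VMCS field encoding of GUEST_RIP */

        printf("GUEST_RIP -> table index %#x\n",
               (unsigned int)demo_rol16(guest_rip, 6));
        return 0;
}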
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
new file mode 100644
index 000000000000..a46ccd670785
--- /dev/null
+++ b/arch/x86/kvm/vmx/main.c
@@ -0,0 +1,1082 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/moduleparam.h>
+
+#include "x86_ops.h"
+#include "vmx.h"
+#include "mmu.h"
+#include "nested.h"
+#include "pmu.h"
+#include "posted_intr.h"
+#include "tdx.h"
+#include "tdx_arch.h"
+
+#ifdef CONFIG_KVM_INTEL_TDX
+static_assert(offsetof(struct vcpu_vmx, vt) == offsetof(struct vcpu_tdx, vt));
+
+static void vt_disable_virtualization_cpu(void)
+{
+ /* Note, TDX *and* VMX need to be disabled if TDX is enabled. */
+ if (enable_tdx)
+ tdx_disable_virtualization_cpu();
+ vmx_disable_virtualization_cpu();
+}
+
+static __init int vt_hardware_setup(void)
+{
+ int ret;
+
+ ret = vmx_hardware_setup();
+ if (ret)
+ return ret;
+
+ if (enable_tdx)
+ tdx_hardware_setup();
+
+ return 0;
+}
+
+static int vt_vm_init(struct kvm *kvm)
+{
+ if (is_td(kvm))
+ return tdx_vm_init(kvm);
+
+ return vmx_vm_init(kvm);
+}
+
+static void vt_vm_pre_destroy(struct kvm *kvm)
+{
+ if (is_td(kvm))
+ return tdx_mmu_release_hkid(kvm);
+}
+
+static void vt_vm_destroy(struct kvm *kvm)
+{
+ if (is_td(kvm))
+ return tdx_vm_destroy(kvm);
+
+ vmx_vm_destroy(kvm);
+}
+
+static int vt_vcpu_precreate(struct kvm *kvm)
+{
+ if (is_td(kvm))
+ return 0;
+
+ return vmx_vcpu_precreate(kvm);
+}
+
+static int vt_vcpu_create(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return tdx_vcpu_create(vcpu);
+
+ return vmx_vcpu_create(vcpu);
+}
+
+static void vt_vcpu_free(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu)) {
+ tdx_vcpu_free(vcpu);
+ return;
+ }
+
+ vmx_vcpu_free(vcpu);
+}
+
+static void vt_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
+{
+ if (is_td_vcpu(vcpu)) {
+ tdx_vcpu_reset(vcpu, init_event);
+ return;
+ }
+
+ vmx_vcpu_reset(vcpu, init_event);
+}
+
+static void vt_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ if (is_td_vcpu(vcpu)) {
+ tdx_vcpu_load(vcpu, cpu);
+ return;
+ }
+
+ vmx_vcpu_load(vcpu, cpu);
+}
+
+static void vt_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Basic TDX does not support the PML feature. KVM does not enable PML
+ * in a TD's VMCS, nor does it allocate or flush a PML buffer for TDX.
+ */
+ if (WARN_ON_ONCE(is_td_vcpu(vcpu)))
+ return;
+
+ vmx_update_cpu_dirty_logging(vcpu);
+}
+
+static void vt_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu)) {
+ tdx_prepare_switch_to_guest(vcpu);
+ return;
+ }
+
+ vmx_prepare_switch_to_guest(vcpu);
+}
+
+static void vt_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu)) {
+ tdx_vcpu_put(vcpu);
+ return;
+ }
+
+ vmx_vcpu_put(vcpu);
+}
+
+static int vt_vcpu_pre_run(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return tdx_vcpu_pre_run(vcpu);
+
+ return vmx_vcpu_pre_run(vcpu);
+}
+
+static fastpath_t vt_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
+{
+ if (is_td_vcpu(vcpu))
+ return tdx_vcpu_run(vcpu, run_flags);
+
+ return vmx_vcpu_run(vcpu, run_flags);
+}
+
+static int vt_handle_exit(struct kvm_vcpu *vcpu,
+ enum exit_fastpath_completion fastpath)
+{
+ if (is_td_vcpu(vcpu))
+ return tdx_handle_exit(vcpu, fastpath);
+
+ return vmx_handle_exit(vcpu, fastpath);
+}
+
+static int vt_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ if (unlikely(is_td_vcpu(vcpu)))
+ return tdx_set_msr(vcpu, msr_info);
+
+ return vmx_set_msr(vcpu, msr_info);
+}
+
+/*
+ * The kvm parameter can be NULL (module initialization, or invocation before
+ * VM creation). Be sure to check the kvm parameter before using it.
+ */
+static bool vt_has_emulated_msr(struct kvm *kvm, u32 index)
+{
+ if (kvm && is_td(kvm))
+ return tdx_has_emulated_msr(index);
+
+ return vmx_has_emulated_msr(kvm, index);
+}
+
+static int vt_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ if (unlikely(is_td_vcpu(vcpu)))
+ return tdx_get_msr(vcpu, msr_info);
+
+ return vmx_get_msr(vcpu, msr_info);
+}
+
+static void vt_recalc_intercepts(struct kvm_vcpu *vcpu)
+{
+ /*
+ * TDX doesn't allow the VMM to configure interception of instructions
+ * or MSR accesses. A TDX guest requests MSR accesses by calling TDVMCALL;
+ * any MSR filters userspace has set are applied when handling the
+ * TDVMCALL for RDMSR/WRMSR.
+ */
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_recalc_intercepts(vcpu);
+}
+
+static int vt_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
+{
+ if (is_td_vcpu(vcpu))
+ return tdx_complete_emulated_msr(vcpu, err);
+
+ return vmx_complete_emulated_msr(vcpu, err);
+}
+
+#ifdef CONFIG_KVM_SMM
+static int vt_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+{
+ if (KVM_BUG_ON(is_td_vcpu(vcpu), vcpu->kvm))
+ return 0;
+
+ return vmx_smi_allowed(vcpu, for_injection);
+}
+
+static int vt_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
+{
+ if (KVM_BUG_ON(is_td_vcpu(vcpu), vcpu->kvm))
+ return 0;
+
+ return vmx_enter_smm(vcpu, smram);
+}
+
+static int vt_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
+{
+ if (KVM_BUG_ON(is_td_vcpu(vcpu), vcpu->kvm))
+ return 0;
+
+ return vmx_leave_smm(vcpu, smram);
+}
+
+static void vt_enable_smi_window(struct kvm_vcpu *vcpu)
+{
+ if (KVM_BUG_ON(is_td_vcpu(vcpu), vcpu->kvm))
+ return;
+
+ /* RSM will cause a vmexit anyway. */
+ vmx_enable_smi_window(vcpu);
+}
+#endif
+
+static int vt_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len)
+{
+ /*
+ * For TDX, this can only be triggered for MMIO emulation. Let the
+ * guest retry after installing the SPTE with the suppress #VE bit cleared,
+ * so that the guest will receive a #VE when it retries. The guest is expected
+ * to call TDG.VP.VMCALL<MMIO> to request the VMM to do MMIO emulation on
+ * #VE.
+ */
+ if (is_td_vcpu(vcpu))
+ return X86EMUL_RETRY_INSTR;
+
+ return vmx_check_emulate_instruction(vcpu, emul_type, insn, insn_len);
+}
+
+static bool vt_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
+{
+ /*
+ * INIT and SIPI are always blocked for TDX, i.e., INIT handling and
+ * the OP vcpu_deliver_sipi_vector() won't be called.
+ */
+ if (is_td_vcpu(vcpu))
+ return true;
+
+ return vmx_apic_init_signal_blocked(vcpu);
+}
+
+static void vt_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
+{
+ /* Only x2APIC mode is supported for TD. */
+ if (is_td_vcpu(vcpu))
+ return;
+
+ return vmx_set_virtual_apic_mode(vcpu);
+}
+
+static void vt_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ return vmx_hwapic_isr_update(vcpu, max_isr);
+}
+
+static int vt_sync_pir_to_irr(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return -1;
+
+ return vmx_sync_pir_to_irr(vcpu);
+}
+
+static void vt_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
+ int trig_mode, int vector)
+{
+ if (is_td_vcpu(apic->vcpu)) {
+ tdx_deliver_interrupt(apic, delivery_mode, trig_mode,
+ vector);
+ return;
+ }
+
+ vmx_deliver_interrupt(apic, delivery_mode, trig_mode, vector);
+}
+
+static void vt_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_vcpu_after_set_cpuid(vcpu);
+}
+
+static void vt_update_exception_bitmap(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_update_exception_bitmap(vcpu);
+}
+
+static u64 vt_get_segment_base(struct kvm_vcpu *vcpu, int seg)
+{
+ if (is_td_vcpu(vcpu))
+ return 0;
+
+ return vmx_get_segment_base(vcpu, seg);
+}
+
+static void vt_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var,
+ int seg)
+{
+ if (is_td_vcpu(vcpu)) {
+ memset(var, 0, sizeof(*var));
+ return;
+ }
+
+ vmx_get_segment(vcpu, var, seg);
+}
+
+static void vt_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var,
+ int seg)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_set_segment(vcpu, var, seg);
+}
+
+static int vt_get_cpl(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return 0;
+
+ return vmx_get_cpl(vcpu);
+}
+
+static int vt_get_cpl_no_cache(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return 0;
+
+ return vmx_get_cpl_no_cache(vcpu);
+}
+
+static void vt_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
+{
+ if (is_td_vcpu(vcpu)) {
+ *db = 0;
+ *l = 0;
+ return;
+ }
+
+ vmx_get_cs_db_l_bits(vcpu, db, l);
+}
+
+static bool vt_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+ if (is_td_vcpu(vcpu))
+ return true;
+
+ return vmx_is_valid_cr0(vcpu, cr0);
+}
+
+static void vt_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_set_cr0(vcpu, cr0);
+}
+
+static bool vt_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ if (is_td_vcpu(vcpu))
+ return true;
+
+ return vmx_is_valid_cr4(vcpu, cr4);
+}
+
+static void vt_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_set_cr4(vcpu, cr4);
+}
+
+static int vt_set_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+ if (is_td_vcpu(vcpu))
+ return 0;
+
+ return vmx_set_efer(vcpu, efer);
+}
+
+static void vt_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ if (is_td_vcpu(vcpu)) {
+ memset(dt, 0, sizeof(*dt));
+ return;
+ }
+
+ vmx_get_idt(vcpu, dt);
+}
+
+static void vt_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_set_idt(vcpu, dt);
+}
+
+static void vt_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ if (is_td_vcpu(vcpu)) {
+ memset(dt, 0, sizeof(*dt));
+ return;
+ }
+
+ vmx_get_gdt(vcpu, dt);
+}
+
+static void vt_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_set_gdt(vcpu, dt);
+}
+
+static void vt_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_set_dr7(vcpu, val);
+}
+
+static void vt_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
+{
+ /*
+ * MOV-DR exiting is always cleared for TD guest, even in debug mode.
+ * Thus KVM_DEBUGREG_WONT_EXIT can never be set and it should never
+ * reach here for TD vcpu.
+ */
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_sync_dirty_debug_regs(vcpu);
+}
+
+static void vt_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
+{
+ if (WARN_ON_ONCE(is_td_vcpu(vcpu)))
+ return;
+
+ vmx_cache_reg(vcpu, reg);
+}
+
+static unsigned long vt_get_rflags(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return 0;
+
+ return vmx_get_rflags(vcpu);
+}
+
+static void vt_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_set_rflags(vcpu, rflags);
+}
+
+static bool vt_get_if_flag(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return false;
+
+ return vmx_get_if_flag(vcpu);
+}
+
+static void vt_flush_tlb_all(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu)) {
+ tdx_flush_tlb_all(vcpu);
+ return;
+ }
+
+ vmx_flush_tlb_all(vcpu);
+}
+
+static void vt_flush_tlb_current(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu)) {
+ tdx_flush_tlb_current(vcpu);
+ return;
+ }
+
+ vmx_flush_tlb_current(vcpu);
+}
+
+static void vt_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_flush_tlb_gva(vcpu, addr);
+}
+
+static void vt_flush_tlb_guest(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_flush_tlb_guest(vcpu);
+}
+
+static void vt_inject_nmi(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu)) {
+ tdx_inject_nmi(vcpu);
+ return;
+ }
+
+ vmx_inject_nmi(vcpu);
+}
+
+static int vt_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+{
+ /*
+ * The TDX module manages NMI windows and NMI reinjection, and hides NMI
+ * blocking; all KVM can do is throw an NMI over the wall.
+ */
+ if (is_td_vcpu(vcpu))
+ return true;
+
+ return vmx_nmi_allowed(vcpu, for_injection);
+}
+
+static bool vt_get_nmi_mask(struct kvm_vcpu *vcpu)
+{
+ /*
+ * KVM can't get the NMI blocking status for a TDX guest, so assume NMIs are
+ * always unmasked.
+ */
+ if (is_td_vcpu(vcpu))
+ return false;
+
+ return vmx_get_nmi_mask(vcpu);
+}
+
+static void vt_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_set_nmi_mask(vcpu, masked);
+}
+
+static void vt_enable_nmi_window(struct kvm_vcpu *vcpu)
+{
+ /* Refer to the comments in tdx_inject_nmi(). */
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_enable_nmi_window(vcpu);
+}
+
+static void vt_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
+ int pgd_level)
+{
+ if (is_td_vcpu(vcpu)) {
+ tdx_load_mmu_pgd(vcpu, root_hpa, pgd_level);
+ return;
+ }
+
+ vmx_load_mmu_pgd(vcpu, root_hpa, pgd_level);
+}
+
+static void vt_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_set_interrupt_shadow(vcpu, mask);
+}
+
+static u32 vt_get_interrupt_shadow(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return 0;
+
+ return vmx_get_interrupt_shadow(vcpu);
+}
+
+static void vt_patch_hypercall(struct kvm_vcpu *vcpu,
+ unsigned char *hypercall)
+{
+ /*
+ * Because guest memory is protected, the guest can't be patched. The TD
+ * kernel is modified to use TDG.VP.VMCALL for hypercalls.
+ */
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_patch_hypercall(vcpu, hypercall);
+}
+
+static void vt_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_inject_irq(vcpu, reinjected);
+}
+
+static void vt_inject_exception(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_inject_exception(vcpu);
+}
+
+static void vt_cancel_injection(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_cancel_injection(vcpu);
+}
+
+static int vt_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+{
+ if (is_td_vcpu(vcpu))
+ return tdx_interrupt_allowed(vcpu);
+
+ return vmx_interrupt_allowed(vcpu, for_injection);
+}
+
+static void vt_enable_irq_window(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_enable_irq_window(vcpu);
+}
+
+static void vt_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code)
+{
+ *intr_info = 0;
+ *error_code = 0;
+
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_get_entry_info(vcpu, intr_info, error_code);
+}
+
+static void vt_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
+ u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code)
+{
+ if (is_td_vcpu(vcpu)) {
+ tdx_get_exit_info(vcpu, reason, info1, info2, intr_info,
+ error_code);
+ return;
+ }
+
+ vmx_get_exit_info(vcpu, reason, info1, info2, intr_info, error_code);
+}
+
+static void vt_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_update_cr8_intercept(vcpu, tpr, irr);
+}
+
+static void vt_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_set_apic_access_page_addr(vcpu);
+}
+
+static void vt_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu)) {
+ KVM_BUG_ON(!kvm_vcpu_apicv_active(vcpu), vcpu->kvm);
+ return;
+ }
+
+ vmx_refresh_apicv_exec_ctrl(vcpu);
+}
+
+static void vt_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_load_eoi_exitmap(vcpu, eoi_exit_bitmap);
+}
+
+static int vt_set_tss_addr(struct kvm *kvm, unsigned int addr)
+{
+ if (is_td(kvm))
+ return 0;
+
+ return vmx_set_tss_addr(kvm, addr);
+}
+
+static int vt_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
+{
+ if (is_td(kvm))
+ return 0;
+
+ return vmx_set_identity_map_addr(kvm, ident_addr);
+}
+
+static u64 vt_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
+{
+ /* TDX doesn't support L2 guest at the moment. */
+ if (is_td_vcpu(vcpu))
+ return 0;
+
+ return vmx_get_l2_tsc_offset(vcpu);
+}
+
+static u64 vt_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
+{
+ /* TDX doesn't support L2 guest at the moment. */
+ if (is_td_vcpu(vcpu))
+ return 0;
+
+ return vmx_get_l2_tsc_multiplier(vcpu);
+}
+
+static void vt_write_tsc_offset(struct kvm_vcpu *vcpu)
+{
+ /* In TDX, the TSC offset can't be changed. */
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_write_tsc_offset(vcpu);
+}
+
+static void vt_write_tsc_multiplier(struct kvm_vcpu *vcpu)
+{
+ /* In TDX, the TSC multiplier can't be changed. */
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_write_tsc_multiplier(vcpu);
+}
+
+#ifdef CONFIG_X86_64
+static int vt_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
+ bool *expired)
+{
+ /* VMX-preemption timer isn't available for TDX. */
+ if (is_td_vcpu(vcpu))
+ return -EINVAL;
+
+ return vmx_set_hv_timer(vcpu, guest_deadline_tsc, expired);
+}
+
+static void vt_cancel_hv_timer(struct kvm_vcpu *vcpu)
+{
+ /* VMX-preemption timer can't be set. See vt_set_hv_timer(). */
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_cancel_hv_timer(vcpu);
+}
+#endif
+
+static void vt_setup_mce(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_setup_mce(vcpu);
+}
+
+static int vt_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
+{
+ if (!is_td(kvm))
+ return -ENOTTY;
+
+ return tdx_vm_ioctl(kvm, argp);
+}
+
+static int vt_vcpu_mem_enc_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
+{
+ if (!is_td_vcpu(vcpu))
+ return -EINVAL;
+
+ return tdx_vcpu_ioctl(vcpu, argp);
+}
+
+static int vt_vcpu_mem_enc_unlocked_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
+{
+ if (!is_td_vcpu(vcpu))
+ return -EINVAL;
+
+ return tdx_vcpu_unlocked_ioctl(vcpu, argp);
+}
+
+static int vt_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn,
+ bool is_private)
+{
+ if (is_td(kvm))
+ return tdx_gmem_max_mapping_level(kvm, pfn, is_private);
+
+ return 0;
+}
+
+#define vt_op(name) vt_##name
+#define vt_op_tdx_only(name) vt_##name
+#else /* CONFIG_KVM_INTEL_TDX */
+#define vt_op(name) vmx_##name
+#define vt_op_tdx_only(name) NULL
+#endif /* CONFIG_KVM_INTEL_TDX */
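+
+/*
+ * Illustrative note: the ops table below is built with the helpers above so
+ * that each entry resolves to the vt_*() wrapper when TDX support is compiled
+ * in, and falls back to the bare vmx_*() implementation (or NULL for the
+ * TDX-only hooks) otherwise. For example, with CONFIG_KVM_INTEL_TDX=y:
+ *
+ *	.vcpu_reset    = vt_op(vcpu_reset)              -> vt_vcpu_reset
+ *	.mem_enc_ioctl = vt_op_tdx_only(mem_enc_ioctl)  -> vt_mem_enc_ioctl
+ *
+ * and without CONFIG_KVM_INTEL_TDX:
+ *
+ *	.vcpu_reset    = vt_op(vcpu_reset)              -> vmx_vcpu_reset
+ *	.mem_enc_ioctl = vt_op_tdx_only(mem_enc_ioctl)  -> NULL
+ */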
+
+#define VMX_REQUIRED_APICV_INHIBITS \
+ (BIT(APICV_INHIBIT_REASON_DISABLED) | \
+ BIT(APICV_INHIBIT_REASON_ABSENT) | \
+ BIT(APICV_INHIBIT_REASON_HYPERV) | \
+ BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \
+ BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \
+ BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \
+ BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED))
+
+struct kvm_x86_ops vt_x86_ops __initdata = {
+ .name = KBUILD_MODNAME,
+
+ .check_processor_compatibility = vmx_check_processor_compat,
+
+ .hardware_unsetup = vmx_hardware_unsetup,
+
+ .enable_virtualization_cpu = vmx_enable_virtualization_cpu,
+ .disable_virtualization_cpu = vt_op(disable_virtualization_cpu),
+ .emergency_disable_virtualization_cpu = vmx_emergency_disable_virtualization_cpu,
+
+ .has_emulated_msr = vt_op(has_emulated_msr),
+
+ .vm_size = sizeof(struct kvm_vmx),
+
+ .vm_init = vt_op(vm_init),
+ .vm_destroy = vt_op(vm_destroy),
+ .vm_pre_destroy = vt_op_tdx_only(vm_pre_destroy),
+
+ .vcpu_precreate = vt_op(vcpu_precreate),
+ .vcpu_create = vt_op(vcpu_create),
+ .vcpu_free = vt_op(vcpu_free),
+ .vcpu_reset = vt_op(vcpu_reset),
+
+ .prepare_switch_to_guest = vt_op(prepare_switch_to_guest),
+ .vcpu_load = vt_op(vcpu_load),
+ .vcpu_put = vt_op(vcpu_put),
+
+ .HOST_OWNED_DEBUGCTL = VMX_HOST_OWNED_DEBUGCTL_BITS,
+
+ .update_exception_bitmap = vt_op(update_exception_bitmap),
+ .get_feature_msr = vmx_get_feature_msr,
+ .get_msr = vt_op(get_msr),
+ .set_msr = vt_op(set_msr),
+
+ .get_segment_base = vt_op(get_segment_base),
+ .get_segment = vt_op(get_segment),
+ .set_segment = vt_op(set_segment),
+ .get_cpl = vt_op(get_cpl),
+ .get_cpl_no_cache = vt_op(get_cpl_no_cache),
+ .get_cs_db_l_bits = vt_op(get_cs_db_l_bits),
+ .is_valid_cr0 = vt_op(is_valid_cr0),
+ .set_cr0 = vt_op(set_cr0),
+ .is_valid_cr4 = vt_op(is_valid_cr4),
+ .set_cr4 = vt_op(set_cr4),
+ .set_efer = vt_op(set_efer),
+ .get_idt = vt_op(get_idt),
+ .set_idt = vt_op(set_idt),
+ .get_gdt = vt_op(get_gdt),
+ .set_gdt = vt_op(set_gdt),
+ .set_dr7 = vt_op(set_dr7),
+ .sync_dirty_debug_regs = vt_op(sync_dirty_debug_regs),
+ .cache_reg = vt_op(cache_reg),
+ .get_rflags = vt_op(get_rflags),
+ .set_rflags = vt_op(set_rflags),
+ .get_if_flag = vt_op(get_if_flag),
+
+ .flush_tlb_all = vt_op(flush_tlb_all),
+ .flush_tlb_current = vt_op(flush_tlb_current),
+ .flush_tlb_gva = vt_op(flush_tlb_gva),
+ .flush_tlb_guest = vt_op(flush_tlb_guest),
+
+ .vcpu_pre_run = vt_op(vcpu_pre_run),
+ .vcpu_run = vt_op(vcpu_run),
+ .handle_exit = vt_op(handle_exit),
+ .skip_emulated_instruction = vmx_skip_emulated_instruction,
+ .update_emulated_instruction = vmx_update_emulated_instruction,
+ .set_interrupt_shadow = vt_op(set_interrupt_shadow),
+ .get_interrupt_shadow = vt_op(get_interrupt_shadow),
+ .patch_hypercall = vt_op(patch_hypercall),
+ .inject_irq = vt_op(inject_irq),
+ .inject_nmi = vt_op(inject_nmi),
+ .inject_exception = vt_op(inject_exception),
+ .cancel_injection = vt_op(cancel_injection),
+ .interrupt_allowed = vt_op(interrupt_allowed),
+ .nmi_allowed = vt_op(nmi_allowed),
+ .get_nmi_mask = vt_op(get_nmi_mask),
+ .set_nmi_mask = vt_op(set_nmi_mask),
+ .enable_nmi_window = vt_op(enable_nmi_window),
+ .enable_irq_window = vt_op(enable_irq_window),
+ .update_cr8_intercept = vt_op(update_cr8_intercept),
+
+ .x2apic_icr_is_split = false,
+ .set_virtual_apic_mode = vt_op(set_virtual_apic_mode),
+ .set_apic_access_page_addr = vt_op(set_apic_access_page_addr),
+ .refresh_apicv_exec_ctrl = vt_op(refresh_apicv_exec_ctrl),
+ .load_eoi_exitmap = vt_op(load_eoi_exitmap),
+ .apicv_pre_state_restore = pi_apicv_pre_state_restore,
+ .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS,
+ .hwapic_isr_update = vt_op(hwapic_isr_update),
+ .sync_pir_to_irr = vt_op(sync_pir_to_irr),
+ .deliver_interrupt = vt_op(deliver_interrupt),
+ .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
+
+ .set_tss_addr = vt_op(set_tss_addr),
+ .set_identity_map_addr = vt_op(set_identity_map_addr),
+ .get_mt_mask = vmx_get_mt_mask,
+
+ .get_exit_info = vt_op(get_exit_info),
+ .get_entry_info = vt_op(get_entry_info),
+
+ .vcpu_after_set_cpuid = vt_op(vcpu_after_set_cpuid),
+
+ .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
+
+ .get_l2_tsc_offset = vt_op(get_l2_tsc_offset),
+ .get_l2_tsc_multiplier = vt_op(get_l2_tsc_multiplier),
+ .write_tsc_offset = vt_op(write_tsc_offset),
+ .write_tsc_multiplier = vt_op(write_tsc_multiplier),
+
+ .load_mmu_pgd = vt_op(load_mmu_pgd),
+
+ .check_intercept = vmx_check_intercept,
+ .handle_exit_irqoff = vmx_handle_exit_irqoff,
+
+ .update_cpu_dirty_logging = vt_op(update_cpu_dirty_logging),
+
+ .nested_ops = &vmx_nested_ops,
+
+ .pi_update_irte = vmx_pi_update_irte,
+ .pi_start_bypass = vmx_pi_start_bypass,
+
+#ifdef CONFIG_X86_64
+ .set_hv_timer = vt_op(set_hv_timer),
+ .cancel_hv_timer = vt_op(cancel_hv_timer),
+#endif
+
+ .setup_mce = vt_op(setup_mce),
+
+#ifdef CONFIG_KVM_SMM
+ .smi_allowed = vt_op(smi_allowed),
+ .enter_smm = vt_op(enter_smm),
+ .leave_smm = vt_op(leave_smm),
+ .enable_smi_window = vt_op(enable_smi_window),
+#endif
+
+ .check_emulate_instruction = vt_op(check_emulate_instruction),
+ .apic_init_signal_blocked = vt_op(apic_init_signal_blocked),
+ .migrate_timers = vmx_migrate_timers,
+
+ .recalc_intercepts = vt_op(recalc_intercepts),
+ .complete_emulated_msr = vt_op(complete_emulated_msr),
+
+ .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
+
+ .get_untagged_addr = vmx_get_untagged_addr,
+
+ .mem_enc_ioctl = vt_op_tdx_only(mem_enc_ioctl),
+ .vcpu_mem_enc_ioctl = vt_op_tdx_only(vcpu_mem_enc_ioctl),
+ .vcpu_mem_enc_unlocked_ioctl = vt_op_tdx_only(vcpu_mem_enc_unlocked_ioctl),
+
+ .gmem_max_mapping_level = vt_op_tdx_only(gmem_max_mapping_level)
+};
+
+struct kvm_x86_init_ops vt_init_ops __initdata = {
+ .hardware_setup = vt_op(hardware_setup),
+ .handle_intel_pt_intr = NULL,
+
+ .runtime_ops = &vt_x86_ops,
+ .pmu_ops = &intel_pmu_ops,
+};
+
+static void __exit vt_exit(void)
+{
+ kvm_exit();
+ tdx_cleanup();
+ vmx_exit();
+}
+module_exit(vt_exit);
+
+static int __init vt_init(void)
+{
+ unsigned vcpu_size, vcpu_align;
+ int r;
+
+ r = vmx_init();
+ if (r)
+ return r;
+
+ /* tdx_init() has been taken */
+ r = tdx_bringup();
+ if (r)
+ goto err_tdx_bringup;
+
+ /*
+ * TDX and VMX have different vCPU structures. Calculate the
+ * maximum size/align so that kvm_init() can use the larger
+ * values to create the kmem_vcpu_cache.
+ */
+ vcpu_size = sizeof(struct vcpu_vmx);
+ vcpu_align = __alignof__(struct vcpu_vmx);
+ if (enable_tdx) {
+ vcpu_size = max_t(unsigned, vcpu_size,
+ sizeof(struct vcpu_tdx));
+ vcpu_align = max_t(unsigned, vcpu_align,
+ __alignof__(struct vcpu_tdx));
+ kvm_caps.supported_vm_types |= BIT(KVM_X86_TDX_VM);
+ }
+
+ /*
+ * Common KVM initialization _must_ come last; after this, /dev/kvm is
+ * exposed to userspace!
+ */
+ r = kvm_init(vcpu_size, vcpu_align, THIS_MODULE);
+ if (r)
+ goto err_kvm_init;
+
+ return 0;
+
+err_kvm_init:
+ tdx_cleanup();
+err_tdx_bringup:
+ vmx_exit();
+ return r;
+}
+module_init(vt_init);
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
new file mode 100644
index 000000000000..40777278eabb
--- /dev/null
+++ b/arch/x86/kvm/vmx/nested.c
@@ -0,0 +1,7459 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/objtool.h>
+#include <linux/percpu.h>
+
+#include <asm/debugreg.h>
+#include <asm/mmu_context.h>
+#include <asm/msr.h>
+
+#include "x86.h"
+#include "cpuid.h"
+#include "hyperv.h"
+#include "mmu.h"
+#include "nested.h"
+#include "pmu.h"
+#include "posted_intr.h"
+#include "sgx.h"
+#include "trace.h"
+#include "vmx.h"
+#include "smm.h"
+
+static bool __read_mostly enable_shadow_vmcs = 1;
+module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
+
+static bool __ro_after_init warn_on_missed_cc;
+module_param(warn_on_missed_cc, bool, 0444);
+
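+/*
+ * CC() marks a VM-Enter consistency check: a true condition means the check
+ * failed and the caller bails out with an error. The underlying macro is
+ * expected to also report which check tripped (see
+ * KVM_NESTED_VMENTER_CONSISTENCY_CHECK).
+ */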
+#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
+
+/*
+ * Hyper-V requires all of these, so mark them as supported even though
+ * they are just treated the same as all-context.
+ */
+#define VMX_VPID_EXTENT_SUPPORTED_MASK \
+ (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \
+ VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \
+ VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \
+ VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)
+
+#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
+
+enum {
+ VMX_VMREAD_BITMAP,
+ VMX_VMWRITE_BITMAP,
+ VMX_BITMAP_NR
+};
+static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
+
+#define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP])
+#define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP])
+
+struct shadow_vmcs_field {
+ u16 encoding;
+ u16 offset;
+};
+static struct shadow_vmcs_field shadow_read_only_fields[] = {
+#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
+#include "vmcs_shadow_fields.h"
+};
+static int max_shadow_read_only_fields =
+ ARRAY_SIZE(shadow_read_only_fields);
+
+static struct shadow_vmcs_field shadow_read_write_fields[] = {
+#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
+#include "vmcs_shadow_fields.h"
+};
+static int max_shadow_read_write_fields =
+ ARRAY_SIZE(shadow_read_write_fields);
+
+static void init_vmcs_shadow_fields(void)
+{
+ int i, j;
+
+ memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
+ memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
+
+ for (i = j = 0; i < max_shadow_read_only_fields; i++) {
+ struct shadow_vmcs_field entry = shadow_read_only_fields[i];
+ u16 field = entry.encoding;
+
+ if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
+ (i + 1 == max_shadow_read_only_fields ||
+ shadow_read_only_fields[i + 1].encoding != field + 1))
+ pr_err("Missing field from shadow_read_only_field %x\n",
+ field + 1);
+
+ clear_bit(field, vmx_vmread_bitmap);
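+ /*
+ * Encodings with bit 0 set are the "high" halves of 64-bit fields.
+ * On 64-bit hosts the full-width access already covers them, so the
+ * entry is dropped; on 32-bit hosts it is kept and redirected at the
+ * upper dword of the vmcs12 field.
+ */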
+ if (field & 1)
+#ifdef CONFIG_X86_64
+ continue;
+#else
+ entry.offset += sizeof(u32);
+#endif
+ shadow_read_only_fields[j++] = entry;
+ }
+ max_shadow_read_only_fields = j;
+
+ for (i = j = 0; i < max_shadow_read_write_fields; i++) {
+ struct shadow_vmcs_field entry = shadow_read_write_fields[i];
+ u16 field = entry.encoding;
+
+ if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
+ (i + 1 == max_shadow_read_write_fields ||
+ shadow_read_write_fields[i + 1].encoding != field + 1))
+ pr_err("Missing field from shadow_read_write_field %x\n",
+ field + 1);
+
+ WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
+ field <= GUEST_TR_AR_BYTES,
+ "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");
+
+ /*
+ * PML and the preemption timer can be emulated, but the
+ * processor cannot vmwrite to fields that don't exist
+ * on bare metal.
+ */
+ switch (field) {
+ case GUEST_PML_INDEX:
+ if (!cpu_has_vmx_pml())
+ continue;
+ break;
+ case VMX_PREEMPTION_TIMER_VALUE:
+ if (!cpu_has_vmx_preemption_timer())
+ continue;
+ break;
+ case GUEST_INTR_STATUS:
+ if (!cpu_has_vmx_apicv())
+ continue;
+ break;
+ default:
+ break;
+ }
+
+ clear_bit(field, vmx_vmwrite_bitmap);
+ clear_bit(field, vmx_vmread_bitmap);
+ if (field & 1)
+#ifdef CONFIG_X86_64
+ continue;
+#else
+ entry.offset += sizeof(u32);
+#endif
+ shadow_read_write_fields[j++] = entry;
+ }
+ max_shadow_read_write_fields = j;
+}
+
+/*
+ * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
+ * set the success or error code of an emulated VMX instruction (as specified
+ * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
+ * instruction.
+ */
+static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
+{
+ vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
+ & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+ X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
+{
+ vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
+ & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
+ X86_EFLAGS_SF | X86_EFLAGS_OF))
+ | X86_EFLAGS_CF);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
+ u32 vm_instruction_error)
+{
+ vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
+ & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+ X86_EFLAGS_SF | X86_EFLAGS_OF))
+ | X86_EFLAGS_ZF);
+ get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
+ /*
+ * We don't need to force sync to shadow VMCS because
+ * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all
+ * fields and thus must be synced.
+ */
+ if (nested_vmx_is_evmptr12_set(to_vmx(vcpu)))
+ to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true;
+
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * failValid writes the error number to the current VMCS, which
+ * can't be done if there isn't a current VMCS.
+ */
+ if (vmx->nested.current_vmptr == INVALID_GPA &&
+ !nested_vmx_is_evmptr12_valid(vmx))
+ return nested_vmx_failInvalid(vcpu);
+
+ return nested_vmx_failValid(vcpu, vm_instruction_error);
+}
+
+static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
+{
+ /* TODO: don't simply reset the guest here. */
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ pr_debug_ratelimited("nested vmx abort, indicator %d\n", indicator);
+}
+
+static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
+{
+ return fixed_bits_valid(control, low, high);
+}
+
+static inline u64 vmx_control_msr(u32 low, u32 high)
+{
+ return low | ((u64)high << 32);
+}
+
+static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
+{
+ secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
+ vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
+ vmx->nested.need_vmcs12_to_shadow_sync = false;
+}
+
+static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_KVM_HYPERV
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map);
+ vmx->nested.hv_evmcs = NULL;
+ vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
+
+ if (hv_vcpu) {
+ hv_vcpu->nested.pa_page_gpa = INVALID_GPA;
+ hv_vcpu->nested.vm_id = 0;
+ hv_vcpu->nested.vp_id = 0;
+ }
+#endif
+}
+
+static bool nested_evmcs_handle_vmclear(struct kvm_vcpu *vcpu, gpa_t vmptr)
+{
+#ifdef CONFIG_KVM_HYPERV
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ /*
+ * When Enlightened VMEntry is enabled on the calling CPU we treat the
+ * memory area pointed to by vmptr as an Enlightened VMCS (as there's no good
+ * way to distinguish it from VMCS12) and we must not corrupt it by
+ * writing to the non-existent 'launch_state' field. The area doesn't
+ * have to be the currently active EVMCS on the calling CPU and there's
+ * nothing KVM has to do to transition it from 'active' to 'non-active'
+ * state. It is possible that the area will stay mapped as
+ * vmx->nested.hv_evmcs but this shouldn't be a problem.
+ */
+ if (!guest_cpu_cap_has_evmcs(vcpu) ||
+ !evmptr_is_valid(nested_get_evmptr(vcpu)))
+ return false;
+
+ if (nested_vmx_evmcs(vmx) && vmptr == vmx->nested.hv_evmcs_vmptr)
+ nested_release_evmcs(vcpu);
+
+ return true;
+#else
+ return false;
+#endif
+}
+
+static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
+ struct loaded_vmcs *prev)
+{
+ struct vmcs_host_state *dest, *src;
+
+ if (unlikely(!vmx->vt.guest_state_loaded))
+ return;
+
+ src = &prev->host_state;
+ dest = &vmx->loaded_vmcs->host_state;
+
+ vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
+ dest->ldt_sel = src->ldt_sel;
+#ifdef CONFIG_X86_64
+ dest->ds_sel = src->ds_sel;
+ dest->es_sel = src->es_sel;
+#endif
+}
+
+static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct loaded_vmcs *prev;
+ int cpu;
+
+ if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs))
+ return;
+
+ cpu = get_cpu();
+ prev = vmx->loaded_vmcs;
+ vmx->loaded_vmcs = vmcs;
+ vmx_vcpu_load_vmcs(vcpu, cpu);
+ vmx_sync_vmcs_host_state(vmx, prev);
+ put_cpu();
+
+ vcpu->arch.regs_avail = ~VMX_REGS_LAZY_LOAD_SET;
+
+ /*
+ * All lazily updated registers will be reloaded from VMCS12 on both
+ * vmentry and vmexit.
+ */
+ vcpu->arch.regs_dirty = 0;
+}
+
+static void nested_put_vmcs12_pages(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map);
+ kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map);
+ kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map);
+ vmx->nested.pi_desc = NULL;
+}
+
+/*
+ * Free whatever needs to be freed from vmx->nested when L1 goes down, or
+ * just stops using VMX.
+ */
+static void free_nested(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01))
+ vmx_switch_vmcs(vcpu, &vmx->vmcs01);
+
+ if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
+ return;
+
+ kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
+
+ vmx->nested.vmxon = false;
+ vmx->nested.smm.vmxon = false;
+ vmx->nested.vmxon_ptr = INVALID_GPA;
+ free_vpid(vmx->nested.vpid02);
+ vmx->nested.posted_intr_nv = -1;
+ vmx->nested.current_vmptr = INVALID_GPA;
+ if (enable_shadow_vmcs) {
+ vmx_disable_shadow_vmcs(vmx);
+ vmcs_clear(vmx->vmcs01.shadow_vmcs);
+ free_vmcs(vmx->vmcs01.shadow_vmcs);
+ vmx->vmcs01.shadow_vmcs = NULL;
+ }
+ kfree(vmx->nested.cached_vmcs12);
+ vmx->nested.cached_vmcs12 = NULL;
+ kfree(vmx->nested.cached_shadow_vmcs12);
+ vmx->nested.cached_shadow_vmcs12 = NULL;
+
+ nested_put_vmcs12_pages(vcpu);
+
+ kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
+
+ nested_release_evmcs(vcpu);
+
+ free_loaded_vmcs(&vmx->nested.vmcs02);
+}
+
+/*
+ * Ensure that the current vmcs of the logical processor is the
+ * vmcs01 of the vcpu before calling free_nested().
+ */
+void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
+{
+ vcpu_load(vcpu);
+ vmx_leave_nested(vcpu);
+ vcpu_put(vcpu);
+}
+
+#define EPTP_PA_MASK GENMASK_ULL(51, 12)
+
+static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
+{
+ return VALID_PAGE(root_hpa) &&
+ ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK));
+}
+
+static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp,
+ gpa_t addr)
+{
+ unsigned long roots = 0;
+ uint i;
+ struct kvm_mmu_root_info *cached_root;
+
+ WARN_ON_ONCE(!mmu_is_nested(vcpu));
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+ cached_root = &vcpu->arch.mmu->prev_roots[i];
+
+ if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd,
+ eptp))
+ roots |= KVM_MMU_ROOT_PREVIOUS(i);
+ }
+ if (roots)
+ kvm_mmu_invalidate_addr(vcpu, vcpu->arch.mmu, addr, roots);
+}
+
+static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long exit_qualification;
+ u32 vm_exit_reason;
+
+ if (vmx->nested.pml_full) {
+ vm_exit_reason = EXIT_REASON_PML_FULL;
+ vmx->nested.pml_full = false;
+
+ /*
+ * It should be impossible to trigger a nested PML Full VM-Exit
+ * for anything other than an EPT Violation from L2. KVM *can*
+ * trigger nEPT page fault injection in response to an EPT
+ * Misconfig, e.g. if the MMIO SPTE was stale and L1's EPT
+ * tables also changed, but KVM should not treat EPT Misconfig
+ * VM-Exits as writes.
+ */
+ WARN_ON_ONCE(vmx->vt.exit_reason.basic != EXIT_REASON_EPT_VIOLATION);
+
+ /*
+ * PML Full and EPT Violation VM-Exits both use bit 12 to report
+ * "NMI unblocking due to IRET", i.e. the bit can be propagated
+ * as-is from the original EXIT_QUALIFICATION.
+ */
+ exit_qualification = vmx_get_exit_qual(vcpu) & INTR_INFO_UNBLOCK_NMI;
+ } else {
+ if (fault->error_code & PFERR_RSVD_MASK) {
+ vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
+ exit_qualification = 0;
+ } else {
+ exit_qualification = fault->exit_qualification;
+ exit_qualification |= vmx_get_exit_qual(vcpu) &
+ (EPT_VIOLATION_GVA_IS_VALID |
+ EPT_VIOLATION_GVA_TRANSLATED);
+ vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
+ }
+
+ /*
+ * Although the caller (kvm_inject_emulated_page_fault) would
+ * have already synced the faulting address in the shadow EPT
+ * tables for the current EPTP12, we also need to sync it for
+ * any other cached EPTP02s based on the same EP4TA, since the
+ * TLB associates mappings to the EP4TA rather than the full EPTP.
+ */
+ nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer,
+ fault->address);
+ }
+
+ nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification);
+ vmcs12->guest_physical_address = fault->address;
+}
+
+static void nested_ept_new_eptp(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ bool execonly = vmx->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT;
+ int ept_lpage_level = ept_caps_to_lpage_level(vmx->nested.msrs.ept_caps);
+
+ kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level,
+ nested_ept_ad_enabled(vcpu),
+ nested_ept_get_eptp(vcpu));
+}
+
+static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
+{
+ WARN_ON(mmu_is_nested(vcpu));
+
+ vcpu->arch.mmu = &vcpu->arch.guest_mmu;
+ nested_ept_new_eptp(vcpu);
+ vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp;
+ vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
+ vcpu->arch.mmu->get_pdptr = kvm_pdptr_read;
+
+ vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
+}
+
+static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.mmu = &vcpu->arch.root_mmu;
+ vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
+}
+
+static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
+ u16 error_code)
+{
+ bool inequality, bit;
+
+ bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
+ inequality =
+ (error_code & vmcs12->page_fault_error_code_mask) !=
+ vmcs12->page_fault_error_code_match;
+ return inequality ^ bit;
+}
+
+static bool nested_vmx_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector,
+ u32 error_code)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ /*
+ * Drop bits 31:16 of the error code when performing the #PF mask+match
+ * check. All VMCS fields involved are 32 bits, but Intel CPUs never
+ * set bits 31:16 and VMX disallows setting bits 31:16 in the injected
+ * error code. Including the to-be-dropped bits in the check might
+ * result in an "impossible" or missed exit from L1's perspective.
+ */
+ if (vector == PF_VECTOR)
+ return nested_vmx_is_page_fault_vmexit(vmcs12, (u16)error_code);
+
+ return (vmcs12->exception_bitmap & (1u << vector));
+}
+
+static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
+ return 0;
+
+ if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) ||
+ CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b)))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
+ return 0;
+
+ if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap)))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
+ return 0;
+
+ if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
+ return -EINVAL;
+
+ if (CC(!nested_cpu_has_vid(vmcs12) && vmcs12->tpr_threshold >> 4))
+ return -EINVAL;
+
+ return 0;
+}
+
+/*
+ * For x2APIC MSRs, ignore the vmcs01 bitmap. L1 can enable x2APIC without
+ * itself utilizing x2APIC. All MSRs were previously set to be intercepted,
+ * so only the "disable intercept" case needs to be handled.
+ */
+static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1,
+ unsigned long *msr_bitmap_l0,
+ u32 msr, int type)
+{
+ if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr))
+ vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr);
+
+ if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr))
+ vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr);
+}
+
+static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
+{
+ int msr;
+
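+ /*
+ * In the 4K MSR bitmap, the read bitmap for low MSRs starts at byte
+ * offset 0 and the corresponding write bitmap at byte offset 0x800,
+ * hence the second word written 0x800 bytes further in.
+ */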
+ for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
+ unsigned word = msr / BITS_PER_LONG;
+
+ msr_bitmap[word] = ~0;
+ msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
+ }
+}
+
+#define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw) \
+static inline \
+void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx, \
+ unsigned long *msr_bitmap_l1, \
+ unsigned long *msr_bitmap_l0, u32 msr) \
+{ \
+ if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) || \
+ vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr)) \
+ vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr); \
+ else \
+ vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr); \
+}
+BUILD_NVMX_MSR_INTERCEPT_HELPER(read)
+BUILD_NVMX_MSR_INTERCEPT_HELPER(write)
+
+static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx,
+ unsigned long *msr_bitmap_l1,
+ unsigned long *msr_bitmap_l0,
+ u32 msr, int types)
+{
+ if (types & MSR_TYPE_R)
+ nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1,
+ msr_bitmap_l0, msr);
+ if (types & MSR_TYPE_W)
+ nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1,
+ msr_bitmap_l0, msr);
+}
+
+/*
+ * Merge L0's and L1's MSR bitmaps; return false to indicate that
+ * the hardware MSR bitmap should not be used.
+ */
+static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int msr;
+ unsigned long *msr_bitmap_l1;
+ unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap;
+ struct kvm_host_map map;
+
+ /* Nothing to do if the MSR bitmap is not in use. */
+ if (!cpu_has_vmx_msr_bitmap() ||
+ !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
+ return false;
+
+ /*
+ * MSR bitmap update can be skipped when:
+ * - MSR bitmap for L1 hasn't changed.
+ * - Nested hypervisor (L1) is attempting to launch the same L2 as
+ * before.
+ * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature
+ * and tells KVM (L0) there were no changes in MSR bitmap for L2.
+ */
+ if (!vmx->nested.force_msr_bitmap_recalc) {
+ struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
+
+ if (evmcs && evmcs->hv_enlightenments_control.msr_bitmap &&
+ evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP)
+ return true;
+ }
+
+ if (kvm_vcpu_map_readonly(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), &map))
+ return false;
+
+ msr_bitmap_l1 = (unsigned long *)map.hva;
+
+ /*
+ * To keep the control flow simple, pay eight 8-byte writes (sixteen
+ * 4-byte writes on 32-bit systems) up front to enable intercepts for
+ * the x2APIC MSR range and selectively toggle those relevant to L2.
+ */
+ enable_x2apic_msr_intercepts(msr_bitmap_l0);
+
+ if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
+ if (nested_cpu_has_apic_reg_virt(vmcs12)) {
+ /*
+ * L0 need not intercept reads for MSRs between 0x800
+ * and 0x8ff, it just lets the processor take the value
+ * from the virtual-APIC page; take those 256 bits
+ * directly from the L1 bitmap.
+ */
+ for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
+ unsigned word = msr / BITS_PER_LONG;
+
+ msr_bitmap_l0[word] = msr_bitmap_l1[word];
+ }
+ }
+
+ nested_vmx_disable_intercept_for_x2apic_msr(
+ msr_bitmap_l1, msr_bitmap_l0,
+ X2APIC_MSR(APIC_TASKPRI),
+ MSR_TYPE_R | MSR_TYPE_W);
+
+ if (nested_cpu_has_vid(vmcs12)) {
+ nested_vmx_disable_intercept_for_x2apic_msr(
+ msr_bitmap_l1, msr_bitmap_l0,
+ X2APIC_MSR(APIC_EOI),
+ MSR_TYPE_W);
+ nested_vmx_disable_intercept_for_x2apic_msr(
+ msr_bitmap_l1, msr_bitmap_l0,
+ X2APIC_MSR(APIC_SELF_IPI),
+ MSR_TYPE_W);
+ }
+ }
+
+ /*
+ * Always check vmcs01's bitmap to honor userspace MSR filters and any
+ * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through.
+ */
+#ifdef CONFIG_X86_64
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_FS_BASE, MSR_TYPE_RW);
+
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_GS_BASE, MSR_TYPE_RW);
+
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+#endif
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);
+
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_PRED_CMD, MSR_TYPE_W);
+
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_FLUSH_CMD, MSR_TYPE_W);
+
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_APERF, MSR_TYPE_R);
+
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_MPERF, MSR_TYPE_R);
+
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_U_CET, MSR_TYPE_RW);
+
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_S_CET, MSR_TYPE_RW);
+
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_PL0_SSP, MSR_TYPE_RW);
+
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_PL1_SSP, MSR_TYPE_RW);
+
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_PL2_SSP, MSR_TYPE_RW);
+
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_PL3_SSP, MSR_TYPE_RW);
+
+ kvm_vcpu_unmap(vcpu, &map);
+
+ vmx->nested.force_msr_bitmap_recalc = false;
+
+ return true;
+}
+
+static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;
+
+ if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
+ vmcs12->vmcs_link_pointer == INVALID_GPA)
+ return;
+
+ if (ghc->gpa != vmcs12->vmcs_link_pointer &&
+ kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
+ vmcs12->vmcs_link_pointer, VMCS12_SIZE))
+ return;
+
+ kvm_read_guest_cached(vcpu->kvm, ghc, get_shadow_vmcs12(vcpu),
+ VMCS12_SIZE);
+}
+
+static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;
+
+ if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
+ vmcs12->vmcs_link_pointer == INVALID_GPA)
+ return;
+
+ if (ghc->gpa != vmcs12->vmcs_link_pointer &&
+ kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
+ vmcs12->vmcs_link_pointer, VMCS12_SIZE))
+ return;
+
+ kvm_write_guest_cached(vcpu->kvm, ghc, get_shadow_vmcs12(vcpu),
+ VMCS12_SIZE);
+}
+
+/*
+ * In nested virtualization, check if L1 has set
+ * VM_EXIT_ACK_INTR_ON_EXIT
+ */
+static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
+{
+ return get_vmcs12(vcpu)->vm_exit_controls &
+ VM_EXIT_ACK_INTR_ON_EXIT;
+}
+
+static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
+ CC(!page_address_valid(vcpu, vmcs12->apic_access_addr)))
+ return -EINVAL;
+ else
+ return 0;
+}
+
+static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
+ !nested_cpu_has_apic_reg_virt(vmcs12) &&
+ !nested_cpu_has_vid(vmcs12) &&
+ !nested_cpu_has_posted_intr(vmcs12))
+ return 0;
+
+ /*
+ * If virtualize x2apic mode is enabled,
+ * virtualize apic access must be disabled.
+ */
+ if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) &&
+ nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)))
+ return -EINVAL;
+
+ /*
+ * If virtual interrupt delivery is enabled,
+ * we must exit on external interrupts.
+ */
+ if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu)))
+ return -EINVAL;
+
+ /*
+ * bits 15:8 should be zero in posted_intr_nv,
+ * the descriptor address has already been checked
+ * in nested_get_vmcs12_pages.
+ *
+ * bits 5:0 of posted_intr_desc_addr should be zero.
+ */
+ if (nested_cpu_has_posted_intr(vmcs12) &&
+ (CC(!nested_cpu_has_vid(vmcs12)) ||
+ CC(!nested_exit_intr_ack_set(vcpu)) ||
+ CC((vmcs12->posted_intr_nv & 0xff00)) ||
+ CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64))))
+ return -EINVAL;
+
+ /* tpr shadow is needed by all apicv features. */
+ if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)))
+ return -EINVAL;
+
+ return 0;
+}
+
+static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
+ vmx->nested.msrs.misc_high);
+
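+ /*
+ * Per the SDM, bits 27:25 of IA32_VMX_MISC report N, where the
+ * recommended maximum number of MSRs in a load/store list is
+ * 512 * (N + 1), i.e. VMX_MISC_MSR_LIST_MULTIPLIER * (N + 1).
+ */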
+ return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
+}
+
+static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
+ u32 count, u64 addr)
+{
+ if (count == 0)
+ return 0;
+
+ /*
+ * Exceeding the limit results in architecturally _undefined_ behavior,
+ * i.e. KVM is allowed to do literally anything in response to a bad
+ * limit. Immediately generate a consistency check so that code that
+ * consumes the count doesn't need to worry about extreme edge cases.
+ */
+ if (count > nested_vmx_max_atomic_switch_msrs(vcpu))
+ return -EINVAL;
+
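+ /*
+ * Architecturally, the MSR load/store area must be 16-byte aligned and
+ * the entire list must lie within the guest's legal physical address
+ * width.
+ */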
+ if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) ||
+ !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1)))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (CC(nested_vmx_check_msr_switch(vcpu,
+ vmcs12->vm_exit_msr_load_count,
+ vmcs12->vm_exit_msr_load_addr)) ||
+ CC(nested_vmx_check_msr_switch(vcpu,
+ vmcs12->vm_exit_msr_store_count,
+ vmcs12->vm_exit_msr_store_addr)))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (CC(nested_vmx_check_msr_switch(vcpu,
+ vmcs12->vm_entry_msr_load_count,
+ vmcs12->vm_entry_msr_load_addr)))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (!nested_cpu_has_pml(vmcs12))
+ return 0;
+
+ if (CC(!nested_cpu_has_ept(vmcs12)) ||
+ CC(!page_address_valid(vcpu, vmcs12->pml_address)))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
+ !nested_cpu_has_ept(vmcs12)))
+ return -EINVAL;
+ return 0;
+}
+
+static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
+ !nested_cpu_has_ept(vmcs12)))
+ return -EINVAL;
+ return 0;
+}
+
+static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (!nested_cpu_has_shadow_vmcs(vmcs12))
+ return 0;
+
+ if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) ||
+ CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap)))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
+ struct vmx_msr_entry *e)
+{
+ /* x2APIC MSR accesses are not allowed */
+ if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8))
+ return -EINVAL;
+ if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */
+ CC(e->index == MSR_IA32_UCODE_REV))
+ return -EINVAL;
+ if (CC(e->reserved != 0))
+ return -EINVAL;
+ return 0;
+}
+
+static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
+ struct vmx_msr_entry *e)
+{
+ if (CC(e->index == MSR_FS_BASE) ||
+ CC(e->index == MSR_GS_BASE) ||
+ CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */
+ nested_vmx_msr_check_common(vcpu, e))
+ return -EINVAL;
+ return 0;
+}
+
+static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
+ struct vmx_msr_entry *e)
+{
+ if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */
+ nested_vmx_msr_check_common(vcpu, e))
+ return -EINVAL;
+ return 0;
+}
+
+/*
+ * Load guest's/host's MSRs at nested entry/exit.
+ * Return 0 for success, the (1-based) index of the failing entry on failure.
+ *
+ * One of the failure modes for MSR load/store is when a list exceeds the
+ * virtual hardware's capacity. To maintain compatibility with hardware as much
+ * as possible, process all valid entries before failing rather than precheck
+ * for a capacity violation.
+ */
+static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
+{
+ u32 i;
+ struct vmx_msr_entry e;
+ u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
+
+ for (i = 0; i < count; i++) {
+ if (WARN_ON_ONCE(i >= max_msr_list_size))
+ goto fail;
+
+ if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
+ &e, sizeof(e))) {
+ pr_debug_ratelimited(
+ "%s cannot read MSR entry (%u, 0x%08llx)\n",
+ __func__, i, gpa + i * sizeof(e));
+ goto fail;
+ }
+ if (nested_vmx_load_msr_check(vcpu, &e)) {
+ pr_debug_ratelimited(
+ "%s check failed (%u, 0x%x, 0x%x)\n",
+ __func__, i, e.index, e.reserved);
+ goto fail;
+ }
+ if (kvm_emulate_msr_write(vcpu, e.index, e.value)) {
+ pr_debug_ratelimited(
+ "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
+ __func__, i, e.index, e.value);
+ goto fail;
+ }
+ }
+ return 0;
+fail:
+ /* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */
+ return i + 1;
+}
+
+static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
+ u32 msr_index,
+ u64 *data)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * If the L0 hypervisor stored a more accurate value for the TSC that
+ * does not include the time taken for emulation of the L2->L1
+ * VM-exit in L0, use the more accurate value.
+ */
+ if (msr_index == MSR_IA32_TSC) {
+ int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest,
+ MSR_IA32_TSC);
+
+ if (i >= 0) {
+ u64 val = vmx->msr_autostore.guest.val[i].value;
+
+ *data = kvm_read_l1_tsc(vcpu, val);
+ return true;
+ }
+ }
+
+ if (kvm_emulate_msr_read(vcpu, msr_index, data)) {
+ pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
+ msr_index);
+ return false;
+ }
+ return true;
+}
+
+static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i,
+ struct vmx_msr_entry *e)
+{
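+ /*
+ * Only the index and reserved fields are needed here; the 64-bit
+ * value member of the entry is deliberately not read.
+ */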
+ if (kvm_vcpu_read_guest(vcpu,
+ gpa + i * sizeof(*e),
+ e, 2 * sizeof(u32))) {
+ pr_debug_ratelimited(
+ "%s cannot read MSR entry (%u, 0x%08llx)\n",
+ __func__, i, gpa + i * sizeof(*e));
+ return false;
+ }
+ if (nested_vmx_store_msr_check(vcpu, e)) {
+ pr_debug_ratelimited(
+ "%s check failed (%u, 0x%x, 0x%x)\n",
+ __func__, i, e->index, e->reserved);
+ return false;
+ }
+ return true;
+}
+
+static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
+{
+ u64 data;
+ u32 i;
+ struct vmx_msr_entry e;
+ u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
+
+ for (i = 0; i < count; i++) {
+ if (WARN_ON_ONCE(i >= max_msr_list_size))
+ return -EINVAL;
+
+ if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
+ return -EINVAL;
+
+ if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data))
+ return -EINVAL;
+
+ if (kvm_vcpu_write_guest(vcpu,
+ gpa + i * sizeof(e) +
+ offsetof(struct vmx_msr_entry, value),
+ &data, sizeof(data))) {
+ pr_debug_ratelimited(
+ "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
+ __func__, i, e.index, data);
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ u32 count = vmcs12->vm_exit_msr_store_count;
+ u64 gpa = vmcs12->vm_exit_msr_store_addr;
+ struct vmx_msr_entry e;
+ u32 i;
+
+ for (i = 0; i < count; i++) {
+ if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
+ return false;
+
+ if (e.index == msr_index)
+ return true;
+ }
+ return false;
+}
+
+static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
+ u32 msr_index)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
+ bool in_vmcs12_store_list;
+ int msr_autostore_slot;
+ bool in_autostore_list;
+ int last;
+
+ msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index);
+ in_autostore_list = msr_autostore_slot >= 0;
+ in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index);
+
+ if (in_vmcs12_store_list && !in_autostore_list) {
+ if (autostore->nr == MAX_NR_LOADSTORE_MSRS) {
+ /*
+ * Emulated VMEntry does not fail here. Instead a less
+ * accurate value will be returned by
+ * nested_vmx_get_vmexit_msr_value() by reading KVM's
+ * internal MSR state instead of reading the value from
+ * the vmcs02 VMExit MSR-store area.
+ */
+ pr_warn_ratelimited(
+ "Not enough msr entries in msr_autostore. Can't add msr %x\n",
+ msr_index);
+ return;
+ }
+ last = autostore->nr++;
+ autostore->val[last].index = msr_index;
+ } else if (!in_vmcs12_store_list && in_autostore_list) {
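+ /*
+ * Remove the MSR by overwriting its slot with the last entry;
+ * ordering of the autostore list is irrelevant.
+ */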
+ last = --autostore->nr;
+ autostore->val[msr_autostore_slot] = autostore->val[last];
+ }
+}
+
+/*
+ * Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are
+ * emulating VM-Entry into a guest with EPT enabled. On failure, the expected
+ * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to
+ * @entry_failure_code.
+ */
+static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
+ bool nested_ept, bool reload_pdptrs,
+ enum vm_entry_failure_code *entry_failure_code)
+{
+ if (CC(!kvm_vcpu_is_legal_cr3(vcpu, cr3))) {
+ *entry_failure_code = ENTRY_FAIL_DEFAULT;
+ return -EINVAL;
+ }
+
+ /*
+ * If PAE paging and EPT are both on, CR3 is not used by the CPU and
+ * must not be dereferenced.
+ */
+ if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) &&
+ CC(!load_pdptrs(vcpu, cr3))) {
+ *entry_failure_code = ENTRY_FAIL_PDPTE;
+ return -EINVAL;
+ }
+
+ vcpu->arch.cr3 = cr3;
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
+
+ /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
+ kvm_init_mmu(vcpu);
+
+ if (!nested_ept)
+ kvm_mmu_new_pgd(vcpu, cr3);
+
+ return 0;
+}
+
+/*
+ * Returns whether KVM is able to configure the CPU to tag TLB entries
+ * populated by L2 differently than TLB entries populated
+ * by L1.
+ *
+ * If L0 uses EPT, L1 and L2 run with different EPTP because
+ * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries
+ * are tagged with different EPTP.
+ *
+ * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
+ * with different VPID (L1 entries are tagged with vmx->vpid
+ * while L2 entries are tagged with vmx->nested.vpid02).
+ */
+static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ return enable_ept ||
+ (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
+}
+
+static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12,
+ bool is_vmenter)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /* Handle pending Hyper-V TLB flush requests */
+ kvm_hv_nested_transtion_tlb_flush(vcpu, enable_ept);
+
+ /*
+ * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the
+ * same VPID as the host, and so architecturally, linear and combined
+ * mappings for VPID=0 must be flushed at VM-Enter and VM-Exit. KVM
+ * emulates L2 sharing L1's VPID=0 by using vpid01 while running L2,
+ * and so KVM must also emulate TLB flush of VPID=0, i.e. vpid01. This
+ * is required if VPID is disabled in KVM, as a TLB flush (there are no
+ * VPIDs) still occurs from L1's perspective, and KVM may need to
+ * synchronize the MMU in response to the guest TLB flush.
+ *
+ * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use.
+ * EPT is a special snowflake, as guest-physical mappings aren't
+ * flushed on VPID invalidations, including VM-Enter or VM-Exit with
+ * VPID disabled. As a result, KVM _never_ needs to sync nEPT
+ * entries on VM-Enter because L1 can't rely on VM-Enter to flush
+ * those mappings.
+ */
+ if (!nested_cpu_has_vpid(vmcs12)) {
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+ return;
+ }
+
+ /* L2 should never have a VPID if VPID is disabled. */
+ WARN_ON(!enable_vpid);
+
+ /*
+ * VPID is enabled and in use by vmcs12. If vpid12 is changing, then
+ * emulate a guest TLB flush as KVM does not track vpid12 history nor
+ * is the VPID incorporated into the MMU context. I.e. KVM must assume
+ * that the new vpid12 has never been used and thus represents a new
+ * guest ASID that cannot have entries in the TLB.
+ */
+ if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
+ vmx->nested.last_vpid = vmcs12->virtual_processor_id;
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+ return;
+ }
+
+ /*
+ * If VPID is enabled, used by vmc12, and vpid12 is not changing but
+ * does not have a unique TLB tag (ASID), i.e. EPT is disabled and
+ * KVM was unable to allocate a VPID for L2, flush the current context
+ * as the effective ASID is common to both L1 and L2.
+ */
+ if (!nested_has_guest_tlb_tag(vcpu))
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+}
+
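+/*
+ * Returns true if every bit set in @subset (within @mask) is also set in
+ * @superset, i.e. @subset is a bitwise subset of @superset under @mask.
+ */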
+static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
+{
+ superset &= mask;
+ subset &= mask;
+
+ return (superset | subset) == superset;
+}
+
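+/*
+ * Validate a userspace-provided MSR_IA32_VMX_BASIC value against what KVM
+ * supports (vmcs_config.nested.basic) before accepting it as the vCPU's
+ * advertised capability.
+ */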
+static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
+{
+ const u64 feature_bits = VMX_BASIC_DUAL_MONITOR_TREATMENT |
+ VMX_BASIC_INOUT |
+ VMX_BASIC_TRUE_CTLS |
+ VMX_BASIC_NO_HW_ERROR_CODE_CC;
+
+ const u64 reserved_bits = GENMASK_ULL(63, 57) |
+ GENMASK_ULL(47, 45) |
+ BIT_ULL(31);
+
+ u64 vmx_basic = vmcs_config.nested.basic;
+
+ BUILD_BUG_ON(feature_bits & reserved_bits);
+
+ /*
+ * Except for 32BIT_PHYS_ADDR_ONLY, which is an anti-feature bit (has
+ * inverted polarity), the incoming value must not set feature bits or
+ * reserved bits that aren't allowed/supported by KVM. Fields, i.e.
+ * multi-bit values, are explicitly checked below.
+ */
+ if (!is_bitwise_subset(vmx_basic, data, feature_bits | reserved_bits))
+ return -EINVAL;
+
+ /*
+ * KVM does not emulate a version of VMX that constrains physical
+ * addresses of VMX structures (e.g. VMCS) to 32-bits.
+ */
+ if (data & VMX_BASIC_32BIT_PHYS_ADDR_ONLY)
+ return -EINVAL;
+
+ if (vmx_basic_vmcs_revision_id(vmx_basic) !=
+ vmx_basic_vmcs_revision_id(data))
+ return -EINVAL;
+
+ if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
+ return -EINVAL;
+
+ vmx->nested.msrs.basic = data;
+ return 0;
+}
+
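+/*
+ * Map a VMX capability MSR index to the low/high halves of the corresponding
+ * control-MSR pair in @msrs; BUG()s on indices that aren't control MSRs.
+ */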
+static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index,
+ u32 **low, u32 **high)
+{
+ switch (msr_index) {
+ case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
+ *low = &msrs->pinbased_ctls_low;
+ *high = &msrs->pinbased_ctls_high;
+ break;
+ case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
+ *low = &msrs->procbased_ctls_low;
+ *high = &msrs->procbased_ctls_high;
+ break;
+ case MSR_IA32_VMX_TRUE_EXIT_CTLS:
+ *low = &msrs->exit_ctls_low;
+ *high = &msrs->exit_ctls_high;
+ break;
+ case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
+ *low = &msrs->entry_ctls_low;
+ *high = &msrs->entry_ctls_high;
+ break;
+ case MSR_IA32_VMX_PROCBASED_CTLS2:
+ *low = &msrs->secondary_ctls_low;
+ *high = &msrs->secondary_ctls_high;
+ break;
+ default:
+ BUG();
+ }
+}
+
+static int
+vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
+{
+ u32 *lowp, *highp;
+ u64 supported;
+
+ vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp);
+
+ supported = vmx_control_msr(*lowp, *highp);
+
+ /* Check must-be-1 bits are still 1. */
+ if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
+ return -EINVAL;
+
+ /* Check must-be-0 bits are still 0. */
+ if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
+ return -EINVAL;
+
+ vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp);
+ *lowp = data;
+ *highp = data >> 32;
+ return 0;
+}
+
+static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
+{
+ const u64 feature_bits = VMX_MISC_SAVE_EFER_LMA |
+ VMX_MISC_ACTIVITY_HLT |
+ VMX_MISC_ACTIVITY_SHUTDOWN |
+ VMX_MISC_ACTIVITY_WAIT_SIPI |
+ VMX_MISC_INTEL_PT |
+ VMX_MISC_RDMSR_IN_SMM |
+ VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
+ VMX_MISC_VMXOFF_BLOCK_SMI |
+ VMX_MISC_ZERO_LEN_INS;
+
+ const u64 reserved_bits = BIT_ULL(31) | GENMASK_ULL(13, 9);
+
+ u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low,
+ vmcs_config.nested.misc_high);
+
+ BUILD_BUG_ON(feature_bits & reserved_bits);
+
+ /*
+ * The incoming value must not set feature bits or reserved bits that
+ * aren't allowed/supported by KVM. Fields, i.e. multi-bit values, are
+ * explicitly checked below.
+ */
+ if (!is_bitwise_subset(vmx_misc, data, feature_bits | reserved_bits))
+ return -EINVAL;
+
+ if ((vmx->nested.msrs.pinbased_ctls_high &
+ PIN_BASED_VMX_PREEMPTION_TIMER) &&
+ vmx_misc_preemption_timer_rate(data) !=
+ vmx_misc_preemption_timer_rate(vmx_misc))
+ return -EINVAL;
+
+ if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
+ return -EINVAL;
+
+ if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
+ return -EINVAL;
+
+ if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
+ return -EINVAL;
+
+ vmx->nested.msrs.misc_low = data;
+ vmx->nested.msrs.misc_high = data >> 32;
+
+ return 0;
+}
+
+static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
+{
+ u64 vmx_ept_vpid_cap = vmx_control_msr(vmcs_config.nested.ept_caps,
+ vmcs_config.nested.vpid_caps);
+
+ /* Every bit is either reserved or a feature bit. */
+ if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
+ return -EINVAL;
+
+ vmx->nested.msrs.ept_caps = data;
+ vmx->nested.msrs.vpid_caps = data >> 32;
+ return 0;
+}
+
+static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index)
+{
+ switch (msr_index) {
+ case MSR_IA32_VMX_CR0_FIXED0:
+ return &msrs->cr0_fixed0;
+ case MSR_IA32_VMX_CR4_FIXED0:
+ return &msrs->cr4_fixed0;
+ default:
+ BUG();
+ }
+}
+
+static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
+{
+ const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index);
+
+ /*
+	 * Bits that are 1 in the fixed0 MSR (i.e. bits that "must be 1"
+	 * during VMX operation) must also be 1 in the restored value.
+ */
+ if (!is_bitwise_subset(data, *msr, -1ULL))
+ return -EINVAL;
+
+ *vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data;
+ return 0;
+}
+
+/*
+ * Called when userspace is restoring VMX MSRs.
+ *
+ * Returns 0 on success, non-0 otherwise.
+ */
+int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * Don't allow changes to the VMX capability MSRs while the vCPU
+ * is in VMX operation.
+ */
+ if (vmx->nested.vmxon)
+ return -EBUSY;
+
+ switch (msr_index) {
+ case MSR_IA32_VMX_BASIC:
+ return vmx_restore_vmx_basic(vmx, data);
+ case MSR_IA32_VMX_PINBASED_CTLS:
+ case MSR_IA32_VMX_PROCBASED_CTLS:
+ case MSR_IA32_VMX_EXIT_CTLS:
+ case MSR_IA32_VMX_ENTRY_CTLS:
+ /*
+ * The "non-true" VMX capability MSRs are generated from the
+ * "true" MSRs, so we do not support restoring them directly.
+ *
+ * If userspace wants to emulate VMX_BASIC[55]=0, userspace
+ * should restore the "true" MSRs with the must-be-1 bits
+ * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
+ * DEFAULT SETTINGS".
+ */
+ return -EINVAL;
+ case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
+ case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
+ case MSR_IA32_VMX_TRUE_EXIT_CTLS:
+ case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
+ case MSR_IA32_VMX_PROCBASED_CTLS2:
+ return vmx_restore_control_msr(vmx, msr_index, data);
+ case MSR_IA32_VMX_MISC:
+ return vmx_restore_vmx_misc(vmx, data);
+ case MSR_IA32_VMX_CR0_FIXED0:
+ case MSR_IA32_VMX_CR4_FIXED0:
+ return vmx_restore_fixed0_msr(vmx, msr_index, data);
+ case MSR_IA32_VMX_CR0_FIXED1:
+ case MSR_IA32_VMX_CR4_FIXED1:
+ /*
+ * These MSRs are generated based on the vCPU's CPUID, so we
+ * do not support restoring them directly.
+ */
+ return -EINVAL;
+ case MSR_IA32_VMX_EPT_VPID_CAP:
+ return vmx_restore_vmx_ept_vpid_cap(vmx, data);
+ case MSR_IA32_VMX_VMCS_ENUM:
+ vmx->nested.msrs.vmcs_enum = data;
+ return 0;
+ case MSR_IA32_VMX_VMFUNC:
+ if (data & ~vmcs_config.nested.vmfunc_controls)
+ return -EINVAL;
+ vmx->nested.msrs.vmfunc_controls = data;
+ return 0;
+ default:
+ /*
+ * The rest of the VMX capability MSRs do not support restore.
+ */
+ return -EINVAL;
+ }
+}
+
+/* Returns 0 on success, non-0 otherwise. */
+int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
+{
+ switch (msr_index) {
+ case MSR_IA32_VMX_BASIC:
+ *pdata = msrs->basic;
+ break;
+ case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
+ case MSR_IA32_VMX_PINBASED_CTLS:
+ *pdata = vmx_control_msr(
+ msrs->pinbased_ctls_low,
+ msrs->pinbased_ctls_high);
+ if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
+ *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+ break;
+ case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
+ case MSR_IA32_VMX_PROCBASED_CTLS:
+ *pdata = vmx_control_msr(
+ msrs->procbased_ctls_low,
+ msrs->procbased_ctls_high);
+ if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
+ *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+ break;
+ case MSR_IA32_VMX_TRUE_EXIT_CTLS:
+ case MSR_IA32_VMX_EXIT_CTLS:
+ *pdata = vmx_control_msr(
+ msrs->exit_ctls_low,
+ msrs->exit_ctls_high);
+ if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
+ *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
+ break;
+ case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
+ case MSR_IA32_VMX_ENTRY_CTLS:
+ *pdata = vmx_control_msr(
+ msrs->entry_ctls_low,
+ msrs->entry_ctls_high);
+ if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
+ *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
+ break;
+ case MSR_IA32_VMX_MISC:
+ *pdata = vmx_control_msr(
+ msrs->misc_low,
+ msrs->misc_high);
+ break;
+ case MSR_IA32_VMX_CR0_FIXED0:
+ *pdata = msrs->cr0_fixed0;
+ break;
+ case MSR_IA32_VMX_CR0_FIXED1:
+ *pdata = msrs->cr0_fixed1;
+ break;
+ case MSR_IA32_VMX_CR4_FIXED0:
+ *pdata = msrs->cr4_fixed0;
+ break;
+ case MSR_IA32_VMX_CR4_FIXED1:
+ *pdata = msrs->cr4_fixed1;
+ break;
+ case MSR_IA32_VMX_VMCS_ENUM:
+ *pdata = msrs->vmcs_enum;
+ break;
+ case MSR_IA32_VMX_PROCBASED_CTLS2:
+ *pdata = vmx_control_msr(
+ msrs->secondary_ctls_low,
+ msrs->secondary_ctls_high);
+ break;
+ case MSR_IA32_VMX_EPT_VPID_CAP:
+ *pdata = msrs->ept_caps |
+ ((u64)msrs->vpid_caps << 32);
+ break;
+ case MSR_IA32_VMX_VMFUNC:
+ *pdata = msrs->vmfunc_controls;
+ break;
+ default:
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Copy the writable VMCS shadow fields back to the VMCS12, in case they have
+ * been modified by the L1 guest. Note, "writable" in this context means
+ * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
+ * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
+ * VM-exit information fields (which are actually writable if the vCPU is
+ * configured to support "VMWRITE to any supported field in the VMCS").
+ */
+static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
+{
+ struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
+ struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
+ struct shadow_vmcs_field field;
+ unsigned long val;
+ int i;
+
+ if (WARN_ON(!shadow_vmcs))
+ return;
+
+ preempt_disable();
+
+ vmcs_load(shadow_vmcs);
+
+ for (i = 0; i < max_shadow_read_write_fields; i++) {
+ field = shadow_read_write_fields[i];
+ val = __vmcs_readl(field.encoding);
+ vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
+ }
+
+ vmcs_clear(shadow_vmcs);
+ vmcs_load(vmx->loaded_vmcs->vmcs);
+
+ preempt_enable();
+}
+
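+/*
+ * Copy all shadowed fields (both read/write and read-only) from vmcs12 into
+ * the shadow VMCS so that L1's VMREADs can be satisfied by hardware without
+ * a VM-Exit.
+ */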
+static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
+{
+ const struct shadow_vmcs_field *fields[] = {
+ shadow_read_write_fields,
+ shadow_read_only_fields
+ };
+ const int max_fields[] = {
+ max_shadow_read_write_fields,
+ max_shadow_read_only_fields
+ };
+ struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
+ struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
+ struct shadow_vmcs_field field;
+ unsigned long val;
+ int i, q;
+
+ if (WARN_ON(!shadow_vmcs))
+ return;
+
+ vmcs_load(shadow_vmcs);
+
+ for (q = 0; q < ARRAY_SIZE(fields); q++) {
+ for (i = 0; i < max_fields[q]; i++) {
+ field = fields[q][i];
+ val = vmcs12_read_any(vmcs12, field.encoding,
+ field.offset);
+ __vmcs_writel(field.encoding, val);
+ }
+ }
+
+ vmcs_clear(shadow_vmcs);
+ vmcs_load(vmx->loaded_vmcs->vmcs);
+}
+
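+/*
+ * Copy the enlightened VMCS (eVMCS) contents into the cached vmcs12, skipping
+ * field groups that the guest has marked clean via hv_clean_fields.
+ */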
+static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields)
+{
+#ifdef CONFIG_KVM_HYPERV
+ struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
+ struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(&vmx->vcpu);
+
+ /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
+ vmcs12->tpr_threshold = evmcs->tpr_threshold;
+ vmcs12->guest_rip = evmcs->guest_rip;
+
+ if (unlikely(!(hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL))) {
+ hv_vcpu->nested.pa_page_gpa = evmcs->partition_assist_page;
+ hv_vcpu->nested.vm_id = evmcs->hv_vm_id;
+ hv_vcpu->nested.vp_id = evmcs->hv_vp_id;
+ }
+
+ if (unlikely(!(hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
+ vmcs12->guest_rsp = evmcs->guest_rsp;
+ vmcs12->guest_rflags = evmcs->guest_rflags;
+ vmcs12->guest_interruptibility_info =
+ evmcs->guest_interruptibility_info;
+ /*
+ * Not present in struct vmcs12:
+ * vmcs12->guest_ssp = evmcs->guest_ssp;
+ */
+ }
+
+ if (unlikely(!(hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
+ vmcs12->cpu_based_vm_exec_control =
+ evmcs->cpu_based_vm_exec_control;
+ }
+
+ if (unlikely(!(hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
+ vmcs12->exception_bitmap = evmcs->exception_bitmap;
+ }
+
+ if (unlikely(!(hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
+ vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
+ }
+
+ if (unlikely(!(hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
+ vmcs12->vm_entry_intr_info_field =
+ evmcs->vm_entry_intr_info_field;
+ vmcs12->vm_entry_exception_error_code =
+ evmcs->vm_entry_exception_error_code;
+ vmcs12->vm_entry_instruction_len =
+ evmcs->vm_entry_instruction_len;
+ }
+
+ if (unlikely(!(hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
+ vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
+ vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
+ vmcs12->host_cr0 = evmcs->host_cr0;
+ vmcs12->host_cr3 = evmcs->host_cr3;
+ vmcs12->host_cr4 = evmcs->host_cr4;
+ vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
+ vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
+ vmcs12->host_rip = evmcs->host_rip;
+ vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
+ vmcs12->host_es_selector = evmcs->host_es_selector;
+ vmcs12->host_cs_selector = evmcs->host_cs_selector;
+ vmcs12->host_ss_selector = evmcs->host_ss_selector;
+ vmcs12->host_ds_selector = evmcs->host_ds_selector;
+ vmcs12->host_fs_selector = evmcs->host_fs_selector;
+ vmcs12->host_gs_selector = evmcs->host_gs_selector;
+ vmcs12->host_tr_selector = evmcs->host_tr_selector;
+ vmcs12->host_ia32_perf_global_ctrl = evmcs->host_ia32_perf_global_ctrl;
+ /*
+ * Not present in struct vmcs12:
+ * vmcs12->host_ia32_s_cet = evmcs->host_ia32_s_cet;
+ * vmcs12->host_ssp = evmcs->host_ssp;
+ * vmcs12->host_ia32_int_ssp_table_addr = evmcs->host_ia32_int_ssp_table_addr;
+ */
+ }
+
+ if (unlikely(!(hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
+ vmcs12->pin_based_vm_exec_control =
+ evmcs->pin_based_vm_exec_control;
+ vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
+ vmcs12->secondary_vm_exec_control =
+ evmcs->secondary_vm_exec_control;
+ }
+
+ if (unlikely(!(hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
+ vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
+ vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
+ }
+
+ if (unlikely(!(hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
+ vmcs12->msr_bitmap = evmcs->msr_bitmap;
+ }
+
+ if (unlikely(!(hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
+ vmcs12->guest_es_base = evmcs->guest_es_base;
+ vmcs12->guest_cs_base = evmcs->guest_cs_base;
+ vmcs12->guest_ss_base = evmcs->guest_ss_base;
+ vmcs12->guest_ds_base = evmcs->guest_ds_base;
+ vmcs12->guest_fs_base = evmcs->guest_fs_base;
+ vmcs12->guest_gs_base = evmcs->guest_gs_base;
+ vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
+ vmcs12->guest_tr_base = evmcs->guest_tr_base;
+ vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
+ vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
+ vmcs12->guest_es_limit = evmcs->guest_es_limit;
+ vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
+ vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
+ vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
+ vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
+ vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
+ vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
+ vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
+ vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
+ vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
+ vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
+ vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
+ vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
+ vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
+ vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
+ vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
+ vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
+ vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
+ vmcs12->guest_es_selector = evmcs->guest_es_selector;
+ vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
+ vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
+ vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
+ vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
+ vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
+ vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
+ vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
+ }
+
+ if (unlikely(!(hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
+ vmcs12->tsc_offset = evmcs->tsc_offset;
+ vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
+ vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
+ vmcs12->encls_exiting_bitmap = evmcs->encls_exiting_bitmap;
+ vmcs12->tsc_multiplier = evmcs->tsc_multiplier;
+ }
+
+ if (unlikely(!(hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
+ vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
+ vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
+ vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
+ vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
+ vmcs12->guest_cr0 = evmcs->guest_cr0;
+ vmcs12->guest_cr3 = evmcs->guest_cr3;
+ vmcs12->guest_cr4 = evmcs->guest_cr4;
+ vmcs12->guest_dr7 = evmcs->guest_dr7;
+ }
+
+ if (unlikely(!(hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
+ vmcs12->host_fs_base = evmcs->host_fs_base;
+ vmcs12->host_gs_base = evmcs->host_gs_base;
+ vmcs12->host_tr_base = evmcs->host_tr_base;
+ vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
+ vmcs12->host_idtr_base = evmcs->host_idtr_base;
+ vmcs12->host_rsp = evmcs->host_rsp;
+ }
+
+ if (unlikely(!(hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
+ vmcs12->ept_pointer = evmcs->ept_pointer;
+ vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
+ }
+
+ if (unlikely(!(hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
+ vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
+ vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
+ vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
+ vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
+ vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
+ vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
+ vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
+ vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
+ vmcs12->guest_pending_dbg_exceptions =
+ evmcs->guest_pending_dbg_exceptions;
+ vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
+ vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
+ vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
+ vmcs12->guest_activity_state = evmcs->guest_activity_state;
+ vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
+ vmcs12->guest_ia32_perf_global_ctrl = evmcs->guest_ia32_perf_global_ctrl;
+ /*
+ * Not present in struct vmcs12:
+ * vmcs12->guest_ia32_s_cet = evmcs->guest_ia32_s_cet;
+ * vmcs12->guest_ia32_lbr_ctl = evmcs->guest_ia32_lbr_ctl;
+ * vmcs12->guest_ia32_int_ssp_table_addr = evmcs->guest_ia32_int_ssp_table_addr;
+ */
+ }
+
+ /*
+ * Not used?
+ * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
+ * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
+ * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
+ * vmcs12->page_fault_error_code_mask =
+ * evmcs->page_fault_error_code_mask;
+ * vmcs12->page_fault_error_code_match =
+ * evmcs->page_fault_error_code_match;
+ * vmcs12->cr3_target_count = evmcs->cr3_target_count;
+ * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
+ * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
+ * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
+ */
+
+ /*
+ * Read only fields:
+ * vmcs12->guest_physical_address = evmcs->guest_physical_address;
+ * vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
+ * vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
+ * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
+ * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
+ * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
+ * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
+ * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
+ * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
+ * vmcs12->exit_qualification = evmcs->exit_qualification;
+ * vmcs12->guest_linear_address = evmcs->guest_linear_address;
+ *
+ * Not present in struct vmcs12:
+ * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
+ * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
+ * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
+ * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
+ */
+
+ return;
+#else /* CONFIG_KVM_HYPERV */
+ KVM_BUG_ON(1, vmx->vcpu.kvm);
+#endif /* CONFIG_KVM_HYPERV */
+}
+
+static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
+{
+#ifdef CONFIG_KVM_HYPERV
+ struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
+ struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
+
+ /*
+ * Should not be changed by KVM:
+ *
+ * evmcs->host_es_selector = vmcs12->host_es_selector;
+ * evmcs->host_cs_selector = vmcs12->host_cs_selector;
+ * evmcs->host_ss_selector = vmcs12->host_ss_selector;
+ * evmcs->host_ds_selector = vmcs12->host_ds_selector;
+ * evmcs->host_fs_selector = vmcs12->host_fs_selector;
+ * evmcs->host_gs_selector = vmcs12->host_gs_selector;
+ * evmcs->host_tr_selector = vmcs12->host_tr_selector;
+ * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
+ * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
+ * evmcs->host_cr0 = vmcs12->host_cr0;
+ * evmcs->host_cr3 = vmcs12->host_cr3;
+ * evmcs->host_cr4 = vmcs12->host_cr4;
+ * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
+ * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
+ * evmcs->host_rip = vmcs12->host_rip;
+ * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
+ * evmcs->host_fs_base = vmcs12->host_fs_base;
+ * evmcs->host_gs_base = vmcs12->host_gs_base;
+ * evmcs->host_tr_base = vmcs12->host_tr_base;
+ * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
+ * evmcs->host_idtr_base = vmcs12->host_idtr_base;
+ * evmcs->host_rsp = vmcs12->host_rsp;
+ * sync_vmcs02_to_vmcs12() doesn't read these:
+ * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
+ * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
+ * evmcs->msr_bitmap = vmcs12->msr_bitmap;
+ * evmcs->ept_pointer = vmcs12->ept_pointer;
+ * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
+ * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
+ * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
+ * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
+ * evmcs->tpr_threshold = vmcs12->tpr_threshold;
+ * evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
+ * evmcs->exception_bitmap = vmcs12->exception_bitmap;
+ * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer;
+ * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control;
+ * evmcs->vm_exit_controls = vmcs12->vm_exit_controls;
+ * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control;
+ * evmcs->page_fault_error_code_mask =
+ * vmcs12->page_fault_error_code_mask;
+ * evmcs->page_fault_error_code_match =
+ * vmcs12->page_fault_error_code_match;
+ * evmcs->cr3_target_count = vmcs12->cr3_target_count;
+ * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr;
+ * evmcs->tsc_offset = vmcs12->tsc_offset;
+ * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl;
+ * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask;
+ * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask;
+ * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow;
+ * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow;
+ * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
+ * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
+ * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
+ * evmcs->guest_ia32_perf_global_ctrl = vmcs12->guest_ia32_perf_global_ctrl;
+ * evmcs->host_ia32_perf_global_ctrl = vmcs12->host_ia32_perf_global_ctrl;
+ * evmcs->encls_exiting_bitmap = vmcs12->encls_exiting_bitmap;
+ * evmcs->tsc_multiplier = vmcs12->tsc_multiplier;
+ *
+ * Not present in struct vmcs12:
+ * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
+ * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
+ * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
+ * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
+ * evmcs->host_ia32_s_cet = vmcs12->host_ia32_s_cet;
+ * evmcs->host_ssp = vmcs12->host_ssp;
+ * evmcs->host_ia32_int_ssp_table_addr = vmcs12->host_ia32_int_ssp_table_addr;
+ * evmcs->guest_ia32_s_cet = vmcs12->guest_ia32_s_cet;
+ * evmcs->guest_ia32_lbr_ctl = vmcs12->guest_ia32_lbr_ctl;
+ * evmcs->guest_ia32_int_ssp_table_addr = vmcs12->guest_ia32_int_ssp_table_addr;
+ * evmcs->guest_ssp = vmcs12->guest_ssp;
+ */
+
+ evmcs->guest_es_selector = vmcs12->guest_es_selector;
+ evmcs->guest_cs_selector = vmcs12->guest_cs_selector;
+ evmcs->guest_ss_selector = vmcs12->guest_ss_selector;
+ evmcs->guest_ds_selector = vmcs12->guest_ds_selector;
+ evmcs->guest_fs_selector = vmcs12->guest_fs_selector;
+ evmcs->guest_gs_selector = vmcs12->guest_gs_selector;
+ evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector;
+ evmcs->guest_tr_selector = vmcs12->guest_tr_selector;
+
+ evmcs->guest_es_limit = vmcs12->guest_es_limit;
+ evmcs->guest_cs_limit = vmcs12->guest_cs_limit;
+ evmcs->guest_ss_limit = vmcs12->guest_ss_limit;
+ evmcs->guest_ds_limit = vmcs12->guest_ds_limit;
+ evmcs->guest_fs_limit = vmcs12->guest_fs_limit;
+ evmcs->guest_gs_limit = vmcs12->guest_gs_limit;
+ evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit;
+ evmcs->guest_tr_limit = vmcs12->guest_tr_limit;
+ evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit;
+ evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit;
+
+ evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes;
+ evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes;
+ evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes;
+ evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes;
+ evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes;
+ evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes;
+ evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes;
+ evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes;
+
+ evmcs->guest_es_base = vmcs12->guest_es_base;
+ evmcs->guest_cs_base = vmcs12->guest_cs_base;
+ evmcs->guest_ss_base = vmcs12->guest_ss_base;
+ evmcs->guest_ds_base = vmcs12->guest_ds_base;
+ evmcs->guest_fs_base = vmcs12->guest_fs_base;
+ evmcs->guest_gs_base = vmcs12->guest_gs_base;
+ evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base;
+ evmcs->guest_tr_base = vmcs12->guest_tr_base;
+ evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base;
+ evmcs->guest_idtr_base = vmcs12->guest_idtr_base;
+
+ evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat;
+ evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer;
+
+ evmcs->guest_pdptr0 = vmcs12->guest_pdptr0;
+ evmcs->guest_pdptr1 = vmcs12->guest_pdptr1;
+ evmcs->guest_pdptr2 = vmcs12->guest_pdptr2;
+ evmcs->guest_pdptr3 = vmcs12->guest_pdptr3;
+
+ evmcs->guest_pending_dbg_exceptions =
+ vmcs12->guest_pending_dbg_exceptions;
+ evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp;
+ evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip;
+
+ evmcs->guest_activity_state = vmcs12->guest_activity_state;
+ evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs;
+
+ evmcs->guest_cr0 = vmcs12->guest_cr0;
+ evmcs->guest_cr3 = vmcs12->guest_cr3;
+ evmcs->guest_cr4 = vmcs12->guest_cr4;
+ evmcs->guest_dr7 = vmcs12->guest_dr7;
+
+ evmcs->guest_physical_address = vmcs12->guest_physical_address;
+
+ evmcs->vm_instruction_error = vmcs12->vm_instruction_error;
+ evmcs->vm_exit_reason = vmcs12->vm_exit_reason;
+ evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info;
+ evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code;
+ evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field;
+ evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code;
+ evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len;
+ evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info;
+
+ evmcs->exit_qualification = vmcs12->exit_qualification;
+
+ evmcs->guest_linear_address = vmcs12->guest_linear_address;
+ evmcs->guest_rsp = vmcs12->guest_rsp;
+ evmcs->guest_rflags = vmcs12->guest_rflags;
+
+ evmcs->guest_interruptibility_info =
+ vmcs12->guest_interruptibility_info;
+ evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control;
+ evmcs->vm_entry_controls = vmcs12->vm_entry_controls;
+ evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field;
+ evmcs->vm_entry_exception_error_code =
+ vmcs12->vm_entry_exception_error_code;
+ evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;
+
+ evmcs->guest_rip = vmcs12->guest_rip;
+
+ evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;
+
+ return;
+#else /* CONFIG_KVM_HYPERV */
+ KVM_BUG_ON(1, vmx->vcpu.kvm);
+#endif /* CONFIG_KVM_HYPERV */
+}
+
+/*
+ * This is the equivalent of the nested hypervisor executing the vmptrld
+ * instruction.
+ */
+static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
+ struct kvm_vcpu *vcpu, bool from_launch)
+{
+#ifdef CONFIG_KVM_HYPERV
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ bool evmcs_gpa_changed = false;
+ u64 evmcs_gpa;
+
+ if (likely(!guest_cpu_cap_has_evmcs(vcpu)))
+ return EVMPTRLD_DISABLED;
+
+ evmcs_gpa = nested_get_evmptr(vcpu);
+ if (!evmptr_is_valid(evmcs_gpa)) {
+ nested_release_evmcs(vcpu);
+ return EVMPTRLD_DISABLED;
+ }
+
+ if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
+ vmx->nested.current_vmptr = INVALID_GPA;
+
+ nested_release_evmcs(vcpu);
+
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
+ &vmx->nested.hv_evmcs_map))
+ return EVMPTRLD_ERROR;
+
+ vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;
+
+ /*
+		 * Currently, KVM only supports eVMCS version 1
+		 * (== KVM_EVMCS_VERSION), so the guest is expected to set the
+		 * first u32 field of the eVMCS, which specifies the eVMCS
+		 * VersionNumber, to this value.
+		 *
+		 * The guest learns the eVMCS versions supported by the host by
+		 * examining CPUID.0x4000000A.EAX[0:15]. The host userspace VMM
+		 * is expected to set this CPUID leaf according to the value
+		 * returned in vmcs_version from nested_enable_evmcs().
+		 *
+		 * However, it turns out that Microsoft Hyper-V fails to comply
+		 * with its own invented interface: when Hyper-V uses eVMCS, it
+		 * simply sets the first u32 field of the eVMCS to the
+		 * revision_id specified in MSR_IA32_VMX_BASIC, instead of the
+		 * eVMCS version number, which is one of the supported versions
+		 * specified in CPUID.0x4000000A.EAX[0:15].
+		 *
+		 * To work around this Hyper-V bug, accept either a supported
+		 * eVMCS version or the VMCS12 revision_id as valid values for
+		 * the first u32 field of the eVMCS.
+ */
+ if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
+ (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
+ nested_release_evmcs(vcpu);
+ return EVMPTRLD_VMFAIL;
+ }
+
+ vmx->nested.hv_evmcs_vmptr = evmcs_gpa;
+
+ evmcs_gpa_changed = true;
+ /*
+ * Unlike normal vmcs12, enlightened vmcs12 is not fully
+ * reloaded from guest's memory (read only fields, fields not
+ * present in struct hv_enlightened_vmcs, ...). Make sure there
+ * are no leftovers.
+ */
+ if (from_launch) {
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ memset(vmcs12, 0, sizeof(*vmcs12));
+ vmcs12->hdr.revision_id = VMCS12_REVISION;
+ }
+
+ }
+
+ /*
+ * Clean fields data can't be used on VMLAUNCH and when we switch
+ * between different L2 guests as KVM keeps a single VMCS12 per L1.
+ */
+ if (from_launch || evmcs_gpa_changed) {
+ vmx->nested.hv_evmcs->hv_clean_fields &=
+ ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
+
+ vmx->nested.force_msr_bitmap_recalc = true;
+ }
+
+ return EVMPTRLD_SUCCEEDED;
+#else
+ return EVMPTRLD_DISABLED;
+#endif
+}
+
+void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (nested_vmx_is_evmptr12_valid(vmx))
+ copy_vmcs12_to_enlightened(vmx);
+ else
+ copy_vmcs12_to_shadow(vmx);
+
+ vmx->nested.need_vmcs12_to_shadow_sync = false;
+}
+
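+/*
+ * hrtimer callback for the emulated VMX-preemption timer: record that the
+ * timer expired and kick the vCPU so the expiration is processed.
+ */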
+static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
+{
+ struct vcpu_vmx *vmx =
+ container_of(timer, struct vcpu_vmx, nested.preemption_timer);
+
+ vmx->nested.preemption_timer_expired = true;
+ kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
+ kvm_vcpu_kick(&vmx->vcpu);
+
+ return HRTIMER_NORESTART;
+}
+
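+/*
+ * Compute the remaining preemption timer value in units of the L1-scaled TSC
+ * shifted by the emulated timer rate, latching the absolute deadline the
+ * first time this is called for the current nested entry.
+ */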
+static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >>
+ VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
+
+ if (!vmx->nested.has_preemption_timer_deadline) {
+ vmx->nested.preemption_timer_deadline =
+ vmcs12->vmx_preemption_timer_value + l1_scaled_tsc;
+ vmx->nested.has_preemption_timer_deadline = true;
+ }
+ return vmx->nested.preemption_timer_deadline - l1_scaled_tsc;
+}
+
+static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu,
+ u64 preemption_timeout)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * A timer value of zero is architecturally guaranteed to cause
+ * a VMExit prior to executing any instructions in the guest.
+ */
+ if (preemption_timeout == 0) {
+ vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
+ return;
+ }
+
+ if (vcpu->arch.virtual_tsc_khz == 0)
+ return;
+
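+	/*
+	 * Convert the timer value from preemption-timer units to nanoseconds:
+	 * each unit is 2^VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE TSC cycles,
+	 * and virtual_tsc_khz gives the guest TSC frequency in kHz.
+	 */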
+ preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
+ preemption_timeout *= 1000000;
+ do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
+ hrtimer_start(&vmx->nested.preemption_timer,
+ ktime_add_ns(ktime_get(), preemption_timeout),
+ HRTIMER_MODE_ABS_PINNED);
+}
+
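+/*
+ * Compute the EFER value L2 will run with: use vmcs12's GUEST_IA32_EFER when
+ * a pending VM-Enter loads EFER, otherwise derive LMA/LME from vmcs12's
+ * "IA-32e mode guest" entry control on top of the current vCPU EFER.
+ */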
+static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
+{
+ if (vmx->nested.nested_run_pending &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
+ return vmcs12->guest_ia32_efer;
+ else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
+ return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME);
+ else
+ return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME);
+}
+
+static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
+{
+ struct kvm *kvm = vmx->vcpu.kvm;
+
+ /*
+ * If vmcs02 hasn't been initialized, set the constant vmcs02 state
+ * according to L0's settings (vmcs12 is irrelevant here). Host
+ * fields that come from L0 and are not constant, e.g. HOST_CR3,
+ * will be set as needed prior to VMLAUNCH/VMRESUME.
+ */
+ if (vmx->nested.vmcs02_initialized)
+ return;
+ vmx->nested.vmcs02_initialized = true;
+
+ if (vmx->ve_info)
+ vmcs_write64(VE_INFORMATION_ADDRESS, __pa(vmx->ve_info));
+
+ /* All VMFUNCs are currently emulated through L0 vmexits. */
+ if (cpu_has_vmx_vmfunc())
+ vmcs_write64(VM_FUNCTION_CONTROL, 0);
+
+ if (cpu_has_vmx_posted_intr())
+ vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
+
+ if (cpu_has_vmx_msr_bitmap())
+ vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
+
+ /*
+ * PML is emulated for L2, but never enabled in hardware as the MMU
+ * handles A/D emulation. Disabling PML for L2 also avoids having to
+ * deal with filtering out L2 GPAs from the buffer.
+ */
+ if (enable_pml) {
+ vmcs_write64(PML_ADDRESS, 0);
+ vmcs_write16(GUEST_PML_INDEX, -1);
+ }
+
+ if (cpu_has_vmx_encls_vmexit())
+ vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA);
+
+ if (kvm_notify_vmexit_enabled(kvm))
+ vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window);
+
+ /*
+ * Set the MSR load/store lists to match L0's settings. Only the
+ * addresses are constant (for vmcs02), the counts can change based
+ * on L2's behavior, e.g. switching to/from long mode.
+ */
+ vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val));
+ vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
+ vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
+
+ vmx_set_constant_host_state(vmx);
+}
+
+static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
+ struct vmcs12 *vmcs12)
+{
+ prepare_vmcs02_constant_state(vmx);
+
+ vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
+
+ /*
+ * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the
+ * same VPID as the host. Emulate this behavior by using vpid01 for L2
+ * if VPID is disabled in vmcs12. Note, if VPID is disabled, VM-Enter
+ * and VM-Exit are architecturally required to flush VPID=0, but *only*
+ * VPID=0. I.e. using vpid02 would be ok (so long as KVM emulates the
+ * required flushes), but doing so would cause KVM to over-flush. E.g.
+ * if L1 runs L2 X with VPID12=1, then runs L2 Y with VPID12 disabled,
+ * and then runs L2 X again, then KVM can and should retain TLB entries
+ * for VPID12=1.
+ */
+ if (enable_vpid) {
+ if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
+ vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
+ else
+ vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
+ }
+}
+
+static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01,
+ struct vmcs12 *vmcs12)
+{
+ u32 exec_control;
+ u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
+
+ if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx))
+ prepare_vmcs02_early_rare(vmx, vmcs12);
+
+ /*
+ * PIN CONTROLS
+ */
+ exec_control = __pin_controls_get(vmcs01);
+ exec_control |= (vmcs12->pin_based_vm_exec_control &
+ ~PIN_BASED_VMX_PREEMPTION_TIMER);
+
+ /* Posted interrupts setting is only taken from vmcs12. */
+ vmx->nested.pi_pending = false;
+ if (nested_cpu_has_posted_intr(vmcs12)) {
+ vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
+ } else {
+ vmx->nested.posted_intr_nv = -1;
+ exec_control &= ~PIN_BASED_POSTED_INTR;
+ }
+ pin_controls_set(vmx, exec_control);
+
+ /*
+ * EXEC CONTROLS
+ */
+ exec_control = __exec_controls_get(vmcs01); /* L0's desires */
+ exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING;
+ exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING;
+ exec_control &= ~CPU_BASED_TPR_SHADOW;
+ exec_control |= vmcs12->cpu_based_vm_exec_control;
+
+ vmx->nested.l1_tpr_threshold = -1;
+ if (exec_control & CPU_BASED_TPR_SHADOW)
+ vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
+#ifdef CONFIG_X86_64
+ else
+ exec_control |= CPU_BASED_CR8_LOAD_EXITING |
+ CPU_BASED_CR8_STORE_EXITING;
+#endif
+
+ /*
+ * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
+ * for I/O port accesses.
+ */
+ exec_control |= CPU_BASED_UNCOND_IO_EXITING;
+ exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
+
+ /*
+ * This bit will be computed in nested_get_vmcs12_pages, because
+ * we do not have access to L1's MSR bitmap yet. For now, keep
+ * the same bit as before, hoping to avoid multiple VMWRITEs that
+ * only set/clear this bit.
+ */
+ exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
+ exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS;
+
+ exec_controls_set(vmx, exec_control);
+
+ /*
+ * SECONDARY EXEC CONTROLS
+ */
+ if (cpu_has_secondary_exec_ctrls()) {
+ exec_control = __secondary_exec_controls_get(vmcs01);
+
+ /* Take the following fields only from vmcs12 */
+ exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+ SECONDARY_EXEC_ENABLE_INVPCID |
+ SECONDARY_EXEC_ENABLE_RDTSCP |
+ SECONDARY_EXEC_ENABLE_XSAVES |
+ SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
+ SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_ENABLE_VMFUNC |
+ SECONDARY_EXEC_DESC);
+
+ if (nested_cpu_has(vmcs12,
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
+ exec_control |= vmcs12->secondary_vm_exec_control;
+
+ /* PML is emulated and never enabled in hardware for L2. */
+ exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
+
+ /* VMCS shadowing for L2 is emulated for now */
+ exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
+
+ /*
+ * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4()
+ * will not have to rewrite the controls just for this bit.
+ */
+ if (vmx_umip_emulated() && (vmcs12->guest_cr4 & X86_CR4_UMIP))
+ exec_control |= SECONDARY_EXEC_DESC;
+
+ if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
+ vmcs_write16(GUEST_INTR_STATUS,
+ vmcs12->guest_intr_status);
+
+ if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
+ exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
+
+ if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
+ vmx_write_encls_bitmap(&vmx->vcpu, vmcs12);
+
+ secondary_exec_controls_set(vmx, exec_control);
+ }
+
+ /*
+ * ENTRY CONTROLS
+ *
+ * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
+ * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
+ * on the related bits (if supported by the CPU) in the hope that
+ * we can avoid VMWrites during vmx_set_efer().
+ *
+ * Similarly, take vmcs01's PERF_GLOBAL_CTRL in the hope that if KVM is
+ * loading PERF_GLOBAL_CTRL via the VMCS for L1, then KVM will want to
+ * do the same for L2.
+ */
+ exec_control = __vm_entry_controls_get(vmcs01);
+ exec_control |= (vmcs12->vm_entry_controls &
+ ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL);
+ exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER);
+ if (cpu_has_load_ia32_efer()) {
+ if (guest_efer & EFER_LMA)
+ exec_control |= VM_ENTRY_IA32E_MODE;
+ if (guest_efer != kvm_host.efer)
+ exec_control |= VM_ENTRY_LOAD_IA32_EFER;
+ }
+ vm_entry_controls_set(vmx, exec_control);
+
+ /*
+ * EXIT CONTROLS
+ *
+ * L2->L1 exit controls are emulated - the hardware exit is to L0 so
+ * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
+ * bits may be modified by vmx_set_efer() in prepare_vmcs02().
+ */
+ exec_control = __vm_exit_controls_get(vmcs01);
+ if (cpu_has_load_ia32_efer() && guest_efer != kvm_host.efer)
+ exec_control |= VM_EXIT_LOAD_IA32_EFER;
+ else
+ exec_control &= ~VM_EXIT_LOAD_IA32_EFER;
+ vm_exit_controls_set(vmx, exec_control);
+
+ /*
+ * Interrupt/Exception Fields
+ */
+ if (vmx->nested.nested_run_pending) {
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+ vmcs12->vm_entry_intr_info_field);
+ vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
+ vmcs12->vm_entry_exception_error_code);
+ vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+ vmcs12->vm_entry_instruction_len);
+ vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
+ vmcs12->guest_interruptibility_info);
+ vmx->loaded_vmcs->nmi_known_unmasked =
+ !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
+ } else {
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
+ }
+}
+
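+/*
+ * Read/write the guest CET state (S_CET, SSP, and the SSP table address) from
+ * or to the current VMCS, guarded by the vCPU's IBT/SHSTK capabilities.
+ */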
+static void vmcs_read_cet_state(struct kvm_vcpu *vcpu, u64 *s_cet,
+ u64 *ssp, u64 *ssp_tbl)
+{
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) ||
+ guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK))
+ *s_cet = vmcs_readl(GUEST_S_CET);
+
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) {
+ *ssp = vmcs_readl(GUEST_SSP);
+ *ssp_tbl = vmcs_readl(GUEST_INTR_SSP_TABLE);
+ }
+}
+
+static void vmcs_write_cet_state(struct kvm_vcpu *vcpu, u64 s_cet,
+ u64 ssp, u64 ssp_tbl)
+{
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) ||
+ guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK))
+ vmcs_writel(GUEST_S_CET, s_cet);
+
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) {
+ vmcs_writel(GUEST_SSP, ssp);
+ vmcs_writel(GUEST_INTR_SSP_TABLE, ssp_tbl);
+ }
+}
+
+static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
+{
+ struct hv_enlightened_vmcs *hv_evmcs = nested_vmx_evmcs(vmx);
+
+ if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
+
+ vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
+ vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
+ vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
+ vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
+ vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
+ vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
+ vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
+ vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
+ vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
+ vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
+ vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
+ vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
+ vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
+ vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
+ vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
+ vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
+ vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
+ vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
+ vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
+ vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
+ vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
+ vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
+ vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
+ vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
+ vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
+ vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
+ vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
+ vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
+ vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
+ vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
+ vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
+ vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
+ vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
+ vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
+ vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
+ vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
+
+ vmx_segment_cache_clear(vmx);
+ }
+
+ if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) {
+ vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
+ vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
+ vmcs12->guest_pending_dbg_exceptions);
+ vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
+ vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
+
+ /*
+		 * L1 may access L2's PDPTRs, so save them to construct
+		 * vmcs12.
+ */
+ if (enable_ept) {
+ vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
+ vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
+ vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
+ vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
+ }
+
+ if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
+ vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
+ }
+
+ if (nested_cpu_has_xsaves(vmcs12))
+ vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
+
+ /*
+ * Whether page-faults are trapped is determined by a combination of
+ * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0
+ * doesn't care about page faults then we should set all of these to
+	 * L1's desires. However, if L0 does care about (some) page faults, it
+	 * is not easy (if at all possible) to merge L0's and L1's desires, so
+	 * we simply ask to exit on each and every L2 page fault. This is done by
+ * setting MASK=MATCH=0 and (see below) EB.PF=1.
+ * Note that below we don't need special code to set EB.PF beyond the
+ * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
+ * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
+ * !enable_ept, EB.PF is 1, so the "or" will always be 1.
+ */
+ if (vmx_need_pf_intercept(&vmx->vcpu)) {
+ /*
+ * TODO: if both L0 and L1 need the same MASK and MATCH,
+ * go ahead and use it?
+ */
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
+ } else {
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match);
+ }
+
+ if (cpu_has_vmx_apicv()) {
+ vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
+ vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
+ vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
+ vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
+ }
+
+ /*
+ * Make sure the msr_autostore list is up to date before we set the
+ * count in the vmcs02.
+ */
+ prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC);
+
+ vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr);
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
+
+ if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)
+ vmcs_write_cet_state(&vmx->vcpu, vmcs12->guest_s_cet,
+ vmcs12->guest_ssp, vmcs12->guest_ssp_tbl);
+
+ set_cr4_guest_host_mask(vmx);
+}
+
+/*
+ * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
+ * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
+ * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
+ * guest in a way that will both be appropriate to L1's requests, and our
+ * needs. In addition to modifying the active vmcs (which is vmcs02), this
+ * function also has additional necessary side-effects, like setting various
+ * vcpu->arch fields.
+ * Returns 0 on success, -EINVAL on failure. Invalid state exit qualification code
+ * is assigned to entry_failure_code on failure.
+ */
+static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+ bool from_vmentry,
+ enum vm_entry_failure_code *entry_failure_code)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
+ bool load_guest_pdptrs_vmcs12 = false;
+
+ if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) {
+ prepare_vmcs02_rare(vmx, vmcs12);
+ vmx->nested.dirty_vmcs12 = false;
+
+ load_guest_pdptrs_vmcs12 = !nested_vmx_is_evmptr12_valid(vmx) ||
+ !(evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
+ }
+
+ if (vmx->nested.nested_run_pending &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
+ kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
+ vmx_guest_debugctl_write(vcpu, vmcs12->guest_ia32_debugctl &
+ vmx_get_supported_debugctl(vcpu, false));
+ } else {
+ kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
+ vmx_guest_debugctl_write(vcpu, vmx->nested.pre_vmenter_debugctl);
+ }
+
+ if (!vmx->nested.nested_run_pending ||
+ !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE))
+ vmcs_write_cet_state(vcpu, vmx->nested.pre_vmenter_s_cet,
+ vmx->nested.pre_vmenter_ssp,
+ vmx->nested.pre_vmenter_ssp_tbl);
+
+ if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
+ !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
+ vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs);
+ vmx_set_rflags(vcpu, vmcs12->guest_rflags);
+
+ /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
+ * bitwise-or of what L1 wants to trap for L2, and what we want to
+ * trap. Note that CR0.TS also needs updating - we do this later.
+ */
+ vmx_update_exception_bitmap(vcpu);
+ vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
+ vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
+
+ if (vmx->nested.nested_run_pending &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
+ vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
+ vcpu->arch.pat = vmcs12->guest_ia32_pat;
+ } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
+ vmcs_write64(GUEST_IA32_PAT, vcpu->arch.pat);
+ }
+
+ vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
+ vcpu->arch.l1_tsc_offset,
+ vmx_get_l2_tsc_offset(vcpu),
+ vmx_get_l2_tsc_multiplier(vcpu));
+
+ vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
+ vcpu->arch.l1_tsc_scaling_ratio,
+ vmx_get_l2_tsc_multiplier(vcpu));
+
+ vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
+ if (kvm_caps.has_tsc_control)
+ vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
+
+ nested_vmx_transition_tlb_flush(vcpu, vmcs12, true);
+
+ if (nested_cpu_has_ept(vmcs12))
+ nested_ept_init_mmu_context(vcpu);
+
+ /*
+ * Override the CR0/CR4 read shadows after setting the effective guest
+ * CR0/CR4. The common helpers also set the shadows, but they don't
+ * account for vmcs12's cr0/4_guest_host_mask.
+ */
+ vmx_set_cr0(vcpu, vmcs12->guest_cr0);
+ vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
+
+ vmx_set_cr4(vcpu, vmcs12->guest_cr4);
+ vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
+
+ vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
+ /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
+ vmx_set_efer(vcpu, vcpu->arch.efer);
+
+ /*
+ * Guest state is invalid and unrestricted guest is disabled,
+ * which means L1 attempted VMEntry to L2 with invalid state.
+ * Fail the VMEntry.
+ *
+	 * However, when force loading the guest state (SMM exit or
+	 * loading nested state after migration), it is possible to
+	 * have invalid guest state now, which will be fixed later by
+	 * restoring L2 register state.
+ */
+ if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) {
+ *entry_failure_code = ENTRY_FAIL_DEFAULT;
+ return -EINVAL;
+ }
+
+	/* Load guest CR3, backed by either nested EPT or KVM's shadow page tables. */
+ if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
+ from_vmentry, entry_failure_code))
+ return -EINVAL;
+
+ /*
+ * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12
+ * on nested VM-Exit, which can occur without actually running L2 and
+ * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with
+ * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the
+ * transition to HLT instead of running L2.
+ */
+ if (enable_ept)
+ vmcs_writel(GUEST_CR3, vmcs12->guest_cr3);
+
+ /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
+ if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
+ is_pae_paging(vcpu)) {
+ vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
+ vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
+ vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
+ vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
+ }
+
+ if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
+ kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) &&
+ WARN_ON_ONCE(__kvm_emulate_msr_write(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
+ vmcs12->guest_ia32_perf_global_ctrl))) {
+ *entry_failure_code = ENTRY_FAIL_DEFAULT;
+ return -EINVAL;
+ }
+
+ kvm_rsp_write(vcpu, vmcs12->guest_rsp);
+ kvm_rip_write(vcpu, vmcs12->guest_rip);
+
+ /*
+ * It was observed that genuine Hyper-V running in L1 doesn't reset
+ * 'hv_clean_fields' by itself, it only sets the corresponding dirty
+ * bits when it changes a field in eVMCS. Mark all fields as clean
+ * here.
+ */
+ if (nested_vmx_is_evmptr12_valid(vmx))
+ evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
+
+ return 0;
+}
+
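+/*
+ * Consistency checks for vmcs12's NMI controls: virtual NMIs require NMI
+ * exiting, and NMI-window exiting requires virtual NMIs.
+ */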
+static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
+{
+ if (CC(!nested_cpu_has_nmi_exiting(vmcs12) &&
+ nested_cpu_has_virtual_nmis(vmcs12)))
+ return -EINVAL;
+
+ if (CC(!nested_cpu_has_virtual_nmis(vmcs12) &&
+ nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING)))
+ return -EINVAL;
+
+ return 0;
+}
+
+static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /* Check for memory type validity */
+ switch (new_eptp & VMX_EPTP_MT_MASK) {
+ case VMX_EPTP_MT_UC:
+ if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)))
+ return false;
+ break;
+ case VMX_EPTP_MT_WB:
+ if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)))
+ return false;
+ break;
+ default:
+ return false;
+ }
+
+ /* Page-walk levels validity. */
+ switch (new_eptp & VMX_EPTP_PWL_MASK) {
+ case VMX_EPTP_PWL_5:
+ if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT)))
+ return false;
+ break;
+ case VMX_EPTP_PWL_4:
+ if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT)))
+ return false;
+ break;
+ default:
+ return false;
+ }
+
+ /* Reserved bits should not be set */
+ if (CC(!kvm_vcpu_is_legal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f)))
+ return false;
+
+ /* AD, if set, should be supported */
+ if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) {
+ if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT)))
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Checks related to VM-Execution Control Fields
+ */
+static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control,
+ vmx->nested.msrs.pinbased_ctls_low,
+ vmx->nested.msrs.pinbased_ctls_high)) ||
+ CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
+ vmx->nested.msrs.procbased_ctls_low,
+ vmx->nested.msrs.procbased_ctls_high)))
+ return -EINVAL;
+
+ if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
+ CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control,
+ vmx->nested.msrs.secondary_ctls_low,
+ vmx->nested.msrs.secondary_ctls_high)))
+ return -EINVAL;
+
+ if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) ||
+ nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) ||
+ nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) ||
+ nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) ||
+ nested_vmx_check_apic_access_controls(vcpu, vmcs12) ||
+ nested_vmx_check_apicv_controls(vcpu, vmcs12) ||
+ nested_vmx_check_nmi_controls(vmcs12) ||
+ nested_vmx_check_pml_controls(vcpu, vmcs12) ||
+ nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) ||
+ nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) ||
+ nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) ||
+ CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id))
+ return -EINVAL;
+
+ if (!nested_cpu_has_preemption_timer(vmcs12) &&
+ nested_cpu_has_save_preemption_timer(vmcs12))
+ return -EINVAL;
+
+ if (nested_cpu_has_ept(vmcs12) &&
+ CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer)))
+ return -EINVAL;
+
+ if (nested_cpu_has_vmfunc(vmcs12)) {
+ if (CC(vmcs12->vm_function_control &
+ ~vmx->nested.msrs.vmfunc_controls))
+ return -EINVAL;
+
+ if (nested_cpu_has_eptp_switching(vmcs12)) {
+ if (CC(!nested_cpu_has_ept(vmcs12)) ||
+ CC(!page_address_valid(vcpu, vmcs12->eptp_list_address)))
+ return -EINVAL;
+ }
+ }
+
+ if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING) &&
+ CC(!vmcs12->tsc_multiplier))
+ return -EINVAL;
+
+ return 0;
+}
+
+/*
+ * Checks related to VM-Exit Control Fields
+ */
+static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (CC(!vmx_control_verify(vmcs12->vm_exit_controls,
+ vmx->nested.msrs.exit_ctls_low,
+ vmx->nested.msrs.exit_ctls_high)) ||
+ CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12)))
+ return -EINVAL;
+
+ return 0;
+}
+
+/*
+ * Checks related to VM-Entry Control Fields
+ */
+static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (CC(!vmx_control_verify(vmcs12->vm_entry_controls,
+ vmx->nested.msrs.entry_ctls_low,
+ vmx->nested.msrs.entry_ctls_high)))
+ return -EINVAL;
+
+ /*
+ * From the Intel SDM, volume 3:
+ * Fields relevant to VM-entry event injection must be set properly.
+ * These fields are the VM-entry interruption-information field, the
+ * VM-entry exception error code, and the VM-entry instruction length.
+ */
+ if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) {
+ u32 intr_info = vmcs12->vm_entry_intr_info_field;
+ u8 vector = intr_info & INTR_INFO_VECTOR_MASK;
+ u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK;
+ bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK;
+ bool urg = nested_cpu_has2(vmcs12,
+ SECONDARY_EXEC_UNRESTRICTED_GUEST);
+ bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE;
+
+ /* VM-entry interruption-info field: interruption type */
+ if (CC(intr_type == INTR_TYPE_RESERVED) ||
+ CC(intr_type == INTR_TYPE_OTHER_EVENT &&
+ !nested_cpu_supports_monitor_trap_flag(vcpu)))
+ return -EINVAL;
+
+ /* VM-entry interruption-info field: vector */
+ if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) ||
+ CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) ||
+ CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0))
+ return -EINVAL;
+
+ /*
+ * Cannot deliver error code in real mode or if the interrupt
+ * type is not hardware exception. For other cases, do the
+ * consistency check only if the vCPU doesn't enumerate
+ * VMX_BASIC_NO_HW_ERROR_CODE_CC.
+ */
+ if (!prot_mode || intr_type != INTR_TYPE_HARD_EXCEPTION) {
+ if (CC(has_error_code))
+ return -EINVAL;
+ } else if (!nested_cpu_has_no_hw_errcode_cc(vcpu)) {
+ if (CC(has_error_code != x86_exception_has_error_code(vector)))
+ return -EINVAL;
+ }
+
+ /* VM-entry exception error code */
+ if (CC(has_error_code &&
+ vmcs12->vm_entry_exception_error_code & GENMASK(31, 16)))
+ return -EINVAL;
+
+ /* VM-entry interruption-info field: reserved bits */
+ if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK))
+ return -EINVAL;
+
+ /* VM-entry instruction length */
+ switch (intr_type) {
+ case INTR_TYPE_SOFT_EXCEPTION:
+ case INTR_TYPE_SOFT_INTR:
+ case INTR_TYPE_PRIV_SW_EXCEPTION:
+ if (CC(vmcs12->vm_entry_instruction_len > X86_MAX_INSTRUCTION_LENGTH) ||
+ CC(vmcs12->vm_entry_instruction_len == 0 &&
+ CC(!nested_cpu_has_zero_length_injection(vcpu))))
+ return -EINVAL;
+ }
+ }
+
+ if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (nested_check_vm_execution_controls(vcpu, vmcs12) ||
+ nested_check_vm_exit_controls(vcpu, vmcs12) ||
+ nested_check_vm_entry_controls(vcpu, vmcs12))
+ return -EINVAL;
+
+#ifdef CONFIG_KVM_HYPERV
+ if (guest_cpu_cap_has_evmcs(vcpu))
+ return nested_evmcs_check_controls(vmcs12);
+#endif
+
+ return 0;
+}
+
+static int nested_vmx_check_controls_late(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ void *vapic = to_vmx(vcpu)->nested.virtual_apic_map.hva;
+ u32 vtpr = vapic ? (*(u32 *)(vapic + APIC_TASKPRI)) >> 4 : 0;
+
+ /*
+ * Don't bother with the consistency checks if KVM isn't configured to
+ * WARN on missed consistency checks, as KVM needs to rely on hardware
+ * to fully detect an illegal vTPR vs. TPR Threshold combination due to
+ * the vTPR being writable by L1 at all times (it's an in-memory value,
+ * not a VMCS field). I.e. even if the check passes now, it might fail
+ * at the actual VM-Enter.
+ *
+ * Keying off the module param also allows treating an invalid vAPIC
+ * mapping as a consistency check failure without increasing the risk
+ * of breaking a "real" VM.
+ */
+ if (!warn_on_missed_cc)
+ return 0;
+
+ if ((exec_controls_get(to_vmx(vcpu)) & CPU_BASED_TPR_SHADOW) &&
+ nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW) &&
+ !nested_cpu_has_vid(vmcs12) &&
+ !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
+ (CC(!vapic) ||
+ CC((vmcs12->tpr_threshold & GENMASK(3, 0)) > (vtpr & GENMASK(3, 0)))))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+#ifdef CONFIG_X86_64
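+ /*
+ * The "host address-space size" VM-exit control must match L1's current
+ * mode, i.e. EFER.LMA, at the time of the nested VM-Enter.
+ */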
+ if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) !=
+ !!(vcpu->arch.efer & EFER_LMA)))
+ return -EINVAL;
+#endif
+ return 0;
+}
+
+static bool is_l1_noncanonical_address_on_vmexit(u64 la, struct vmcs12 *vmcs12)
+{
+ /*
+ * Check that the given linear address is canonical after a VM exit
+ * from L2, based on HOST_CR4.LA57 value that will be loaded for L1.
+ */
+ u8 l1_address_bits_on_exit = (vmcs12->host_cr4 & X86_CR4_LA57) ? 57 : 48;
+
+ return !__is_canonical_address(la, l1_address_bits_on_exit);
+}
+
+static int nested_vmx_check_cet_state_common(struct kvm_vcpu *vcpu, u64 s_cet,
+ u64 ssp, u64 ssp_tbl)
+{
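+ /*
+ * Checks common to the guest and host CET state: S_CET must hold a
+ * valid value, SSP must be 4-byte aligned, and the SSP table address
+ * must be canonical.
+ */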
+ if (CC(!kvm_is_valid_u_s_cet(vcpu, s_cet)) || CC(!IS_ALIGNED(ssp, 4)) ||
+ CC(is_noncanonical_msr_address(ssp_tbl, vcpu)))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ bool ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE);
+
+ if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) ||
+ CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) ||
+ CC(!kvm_vcpu_is_legal_cr3(vcpu, vmcs12->host_cr3)))
+ return -EINVAL;
+
+ if (CC(vmcs12->host_cr4 & X86_CR4_CET && !(vmcs12->host_cr0 & X86_CR0_WP)))
+ return -EINVAL;
+
+ if (CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
+ CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_eip, vcpu)))
+ return -EINVAL;
+
+ if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) &&
+ CC(!kvm_pat_valid(vmcs12->host_ia32_pat)))
+ return -EINVAL;
+
+ if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
+ CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
+ vmcs12->host_ia32_perf_global_ctrl)))
+ return -EINVAL;
+
+ if (ia32e) {
+ if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE)))
+ return -EINVAL;
+ } else {
+ if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) ||
+ CC(vmcs12->host_cr4 & X86_CR4_PCIDE) ||
+ CC((vmcs12->host_rip) >> 32))
+ return -EINVAL;
+ }
+
+ if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
+ CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
+ CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
+ CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
+ CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
+ CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
+ CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
+ CC(vmcs12->host_cs_selector == 0) ||
+ CC(vmcs12->host_tr_selector == 0) ||
+ CC(vmcs12->host_ss_selector == 0 && !ia32e))
+ return -EINVAL;
+
+ if (CC(is_noncanonical_base_address(vmcs12->host_fs_base, vcpu)) ||
+ CC(is_noncanonical_base_address(vmcs12->host_gs_base, vcpu)) ||
+ CC(is_noncanonical_base_address(vmcs12->host_gdtr_base, vcpu)) ||
+ CC(is_noncanonical_base_address(vmcs12->host_idtr_base, vcpu)) ||
+ CC(is_noncanonical_base_address(vmcs12->host_tr_base, vcpu)) ||
+ CC(is_l1_noncanonical_address_on_vmexit(vmcs12->host_rip, vmcs12)))
+ return -EINVAL;
+
+ /*
+ * If the load IA32_EFER VM-exit control is 1, bits reserved in the
+ * IA32_EFER MSR must be 0 in the field for that register. In addition,
+ * the values of the LMA and LME bits in the field must each be that of
+ * the host address-space size VM-exit control.
+ */
+ if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
+ if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) ||
+ CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) ||
+ CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)))
+ return -EINVAL;
+ }
+
+ if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_CET_STATE) {
+ if (nested_vmx_check_cet_state_common(vcpu, vmcs12->host_s_cet,
+ vmcs12->host_ssp,
+ vmcs12->host_ssp_tbl))
+ return -EINVAL;
+
+ /*
+ * IA32_S_CET and SSP must be canonical if the host will
+ * enter 64-bit mode after VM-exit; otherwise, their upper
+ * 32 bits must be all 0s.
+ */
+ if (ia32e) {
+ if (CC(is_noncanonical_msr_address(vmcs12->host_s_cet, vcpu)) ||
+ CC(is_noncanonical_msr_address(vmcs12->host_ssp, vcpu)))
+ return -EINVAL;
+ } else {
+ if (CC(vmcs12->host_s_cet >> 32) || CC(vmcs12->host_ssp >> 32))
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;
+ struct vmcs_hdr hdr;
+
+ if (vmcs12->vmcs_link_pointer == INVALID_GPA)
+ return 0;
+
+ if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)))
+ return -EINVAL;
+
+ if (ghc->gpa != vmcs12->vmcs_link_pointer &&
+ CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
+ vmcs12->vmcs_link_pointer, VMCS12_SIZE)))
+ return -EINVAL;
+
+ if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr,
+ offsetof(struct vmcs12, hdr),
+ sizeof(hdr))))
+ return -EINVAL;
+
+ if (CC(hdr.revision_id != VMCS12_REVISION) ||
+ CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)))
+ return -EINVAL;
+
+ return 0;
+}
+
+/*
+ * Checks related to Guest Non-register State
+ */
+static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12)
+{
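+ /*
+ * L2 can be entered only in the Active, HLT, or Wait-for-SIPI activity
+ * states; all other activity states are rejected.
+ */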
+ if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
+ vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT &&
+ vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12,
+ enum vm_entry_failure_code *entry_failure_code)
+{
+ bool ia32e = !!(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE);
+
+ *entry_failure_code = ENTRY_FAIL_DEFAULT;
+
+ if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) ||
+ CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)))
+ return -EINVAL;
+
+ if (CC(vmcs12->guest_cr4 & X86_CR4_CET && !(vmcs12->guest_cr0 & X86_CR0_WP)))
+ return -EINVAL;
+
+ if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) &&
+ (CC(!kvm_dr7_valid(vmcs12->guest_dr7)) ||
+ CC(!vmx_is_valid_debugctl(vcpu, vmcs12->guest_ia32_debugctl, false))))
+ return -EINVAL;
+
+ if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
+ CC(!kvm_pat_valid(vmcs12->guest_ia32_pat)))
+ return -EINVAL;
+
+ if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
+ *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR;
+ return -EINVAL;
+ }
+
+ if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
+ CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
+ vmcs12->guest_ia32_perf_global_ctrl)))
+ return -EINVAL;
+
+ if (CC((vmcs12->guest_cr0 & (X86_CR0_PG | X86_CR0_PE)) == X86_CR0_PG))
+ return -EINVAL;
+
+ if (CC(ia32e && !(vmcs12->guest_cr4 & X86_CR4_PAE)) ||
+ CC(ia32e && !(vmcs12->guest_cr0 & X86_CR0_PG)))
+ return -EINVAL;
+
+ /*
+ * If the load IA32_EFER VM-entry control is 1, the following checks
+ * are performed on the field for the IA32_EFER MSR:
+ * - Bits reserved in the IA32_EFER MSR must be 0.
+ * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
+ * the IA-32e mode guest VM-exit control. It must also be identical
+ * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
+ * CR0.PG) is 1.
+ */
+ if (to_vmx(vcpu)->nested.nested_run_pending &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
+ if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) ||
+ CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) ||
+ CC(((vmcs12->guest_cr0 & X86_CR0_PG) &&
+ ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))))
+ return -EINVAL;
+ }
+
+ if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
+ (CC(is_noncanonical_msr_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) ||
+ CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))))
+ return -EINVAL;
+
+ if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE) {
+ if (nested_vmx_check_cet_state_common(vcpu, vmcs12->guest_s_cet,
+ vmcs12->guest_ssp,
+ vmcs12->guest_ssp_tbl))
+ return -EINVAL;
+
+ /*
+ * Guest SSP must have 63:N bits identical, rather than
+ * be canonical (i.e., 63:N-1 bits identical), where N is
+ * the CPU's maximum linear-address width. Similar to
+ * is_noncanonical_msr_address(), use the host's
+ * linear-address width.
+ */
+ if (CC(!__is_canonical_address(vmcs12->guest_ssp, max_host_virt_addr_bits() + 1)))
+ return -EINVAL;
+ }
+
+ if (nested_check_guest_non_reg_state(vmcs12))
+ return -EINVAL;
+
+ return 0;
+}
+
+#ifdef CONFIG_KVM_HYPERV
+static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * hv_evmcs may end up not being mapped after migration (when
+ * L2 was running); map it here to make sure vmcs12 changes are
+ * properly reflected.
+ */
+ if (guest_cpu_cap_has_evmcs(vcpu) &&
+ vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) {
+ enum nested_evmptrld_status evmptrld_status =
+ nested_vmx_handle_enlightened_vmptrld(vcpu, false);
+
+ if (evmptrld_status == EVMPTRLD_VMFAIL ||
+ evmptrld_status == EVMPTRLD_ERROR)
+ return false;
+
+ /*
+ * Post-migration, VMCS12 always provides the most up-to-date
+ * information; copy it to the eVMCS upon entry.
+ */
+ vmx->nested.need_vmcs12_to_shadow_sync = true;
+ }
+
+ return true;
+}
+#endif
+
+static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct kvm_host_map *map;
+
+ if (!vcpu->arch.pdptrs_from_userspace &&
+ !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
+ /*
+ * Reload the guest's PDPTRs since after a migration
+ * the guest CR3 might be restored prior to setting the nested
+ * state, which can lead to loading the wrong PDPTRs.
+ */
+ if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3)))
+ return false;
+ }
+
+ if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
+ map = &vmx->nested.apic_access_page_map;
+
+ if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) {
+ vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn));
+ } else {
+ pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n",
+ __func__);
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror =
+ KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ return false;
+ }
+ }
+
+ if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
+ map = &vmx->nested.virtual_apic_map;
+
+ if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
+ } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
+ nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) &&
+ !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
+ /*
+ * The processor will never use the TPR shadow, simply
+ * clear the bit from the execution control. Such a
+ * configuration is useless, but it happens in tests.
+ * For any other configuration, failing the vm entry is
+ * _not_ what the processor does but it's basically the
+ * only possibility we have.
+ */
+ exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW);
+ } else {
+ /*
+ * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to
+ * force VM-Entry to fail.
+ */
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA);
+ }
+ }
+
+ if (nested_cpu_has_posted_intr(vmcs12)) {
+ map = &vmx->nested.pi_desc_map;
+
+ if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
+ vmx->nested.pi_desc =
+ (struct pi_desc *)(((void *)map->hva) +
+ offset_in_page(vmcs12->posted_intr_desc_addr));
+ vmcs_write64(POSTED_INTR_DESC_ADDR,
+ pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr));
+ } else {
+ /*
+ * Defer the KVM_INTERNAL_EXIT until KVM tries to
+ * access the contents of the VMCS12 posted interrupt
+ * descriptor. (Note that KVM may do this when it
+ * should not, per the architectural specification.)
+ */
+ vmx->nested.pi_desc = NULL;
+ pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR);
+ }
+ }
+ if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
+ exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
+ else
+ exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
+
+ return true;
+}
+
+static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_KVM_HYPERV
+ /*
+ * Note: nested_get_evmcs_page() also updates 'vp_assist_page' copy
+ * in 'struct kvm_vcpu_hv' in case eVMCS is in use; this is mandatory
+ * to make nested_evmcs_l2_tlb_flush_enabled() work correctly
+ * post-migration.
+ */
+ if (!nested_get_evmcs_page(vcpu)) {
+ pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
+ __func__);
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror =
+ KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+
+ return false;
+ }
+#endif
+
+ if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu))
+ return false;
+
+ return true;
+}
+
+static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+ struct vmcs12 *vmcs12;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ gpa_t dst;
+
+ if (WARN_ON_ONCE(!is_guest_mode(vcpu)))
+ return 0;
+
+ if (WARN_ON_ONCE(vmx->nested.pml_full))
+ return 1;
+
+ /*
+ * Check if PML is enabled for the nested guest. Whether eptp bit 6 is
+ * set is already checked as part of A/D emulation.
+ */
+ vmcs12 = get_vmcs12(vcpu);
+ if (!nested_cpu_has_pml(vmcs12))
+ return 0;
+
+ if (vmcs12->guest_pml_index >= PML_LOG_NR_ENTRIES) {
+ vmx->nested.pml_full = true;
+ return 1;
+ }
+
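+ /* Emulate hardware PML: log the 4KiB-aligned GPA, then decrement the index. */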
+ gpa &= ~0xFFFull;
+ dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index;
+
+ if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
+ offset_in_page(dst), sizeof(gpa)))
+ return 0;
+
+ vmcs12->guest_pml_index--;
+
+ return 0;
+}
+
+/*
+ * Intel's VMX Instruction Reference specifies a common set of prerequisites
+ * for running VMX instructions (except VMXON, whose prerequisites are
+ * slightly different). It also specifies what exception to inject when a
+ * prerequisite is not met.
+ * Note that many of these exceptions have priority over VM exits, so they
+ * don't have to be checked again here.
+ */
+static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
+{
+ if (!to_vmx(vcpu)->nested.vmxon) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 0;
+ }
+
+ if (vmx_get_cpl(vcpu)) {
+ kvm_inject_gp(vcpu, 0);
+ return 0;
+ }
+
+ return 1;
+}
+
+static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12);
+
+/*
+ * If from_vmentry is false, this is being called from state restore (either RSM
+ * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume.
+ *
+ * Returns:
+ * NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode
+ * NVMX_VMENTRY_VMFAIL: Consistency check VMFail
+ * NVMX_VMENTRY_VMEXIT: Consistency check VMExit
+ * NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error
+ */
+enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
+ bool from_vmentry)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ enum vm_entry_failure_code entry_failure_code;
+ union vmx_exit_reason exit_reason = {
+ .basic = EXIT_REASON_INVALID_STATE,
+ .failed_vmentry = 1,
+ };
+ u32 failed_index;
+
+ trace_kvm_nested_vmenter(kvm_rip_read(vcpu),
+ vmx->nested.current_vmptr,
+ vmcs12->guest_rip,
+ vmcs12->guest_intr_status,
+ vmcs12->vm_entry_intr_info_field,
+ vmcs12->secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT,
+ vmcs12->ept_pointer,
+ vmcs12->guest_cr3,
+ KVM_ISA_VMX);
+
+ kvm_service_local_tlb_flush_requests(vcpu);
+
+ if (!vmx->nested.nested_run_pending ||
+ !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
+ vmx->nested.pre_vmenter_debugctl = vmx_guest_debugctl_read();
+ if (kvm_mpx_supported() &&
+ (!vmx->nested.nested_run_pending ||
+ !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
+ vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
+
+ if (!vmx->nested.nested_run_pending ||
+ !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE))
+ vmcs_read_cet_state(vcpu, &vmx->nested.pre_vmenter_s_cet,
+ &vmx->nested.pre_vmenter_ssp,
+ &vmx->nested.pre_vmenter_ssp_tbl);
+
+ /*
+ * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled. In the
+ * event of a "late" VM-Fail, i.e. a VM-Fail detected by hardware but
+ * not KVM, KVM must unwind its software model to the pre-VM-Entry host
+ * state. When EPT is disabled, GUEST_CR3 holds KVM's shadow CR3, not
+ * L1's "real" CR3, which causes nested_vmx_restore_host_state() to
+ * corrupt vcpu->arch.cr3. Stuffing vmcs01.GUEST_CR3 results in the
+ * unwind naturally setting arch.cr3 to the correct value. Smashing
+ * vmcs01.GUEST_CR3 is safe because nested VM-Exits, and the unwind,
+ * reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is guaranteed to be
+ * overwritten with a shadow CR3 prior to re-entering L1.
+ */
+ if (!enable_ept)
+ vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
+
+ vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
+
+ prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12);
+
+ if (from_vmentry) {
+ if (unlikely(!nested_get_vmcs12_pages(vcpu))) {
+ vmx_switch_vmcs(vcpu, &vmx->vmcs01);
+ return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
+ }
+
+ if (nested_vmx_check_controls_late(vcpu, vmcs12)) {
+ vmx_switch_vmcs(vcpu, &vmx->vmcs01);
+ return NVMX_VMENTRY_VMFAIL;
+ }
+
+ if (nested_vmx_check_guest_state(vcpu, vmcs12,
+ &entry_failure_code)) {
+ exit_reason.basic = EXIT_REASON_INVALID_STATE;
+ vmcs12->exit_qualification = entry_failure_code;
+ goto vmentry_fail_vmexit;
+ }
+ }
+
+ enter_guest_mode(vcpu);
+
+ if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) {
+ exit_reason.basic = EXIT_REASON_INVALID_STATE;
+ vmcs12->exit_qualification = entry_failure_code;
+ goto vmentry_fail_vmexit_guest_mode;
+ }
+
+ if (from_vmentry) {
+ failed_index = nested_vmx_load_msr(vcpu,
+ vmcs12->vm_entry_msr_load_addr,
+ vmcs12->vm_entry_msr_load_count);
+ if (failed_index) {
+ exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL;
+ vmcs12->exit_qualification = failed_index;
+ goto vmentry_fail_vmexit_guest_mode;
+ }
+ } else {
+ /*
+ * The MMU is not initialized to point at the right entities yet and
+ * "get pages" would need to read data from the guest (i.e. we will
+ * need to perform gpa to hpa translation). Request a call
+ * to nested_get_vmcs12_pages before the next VM-entry. The MSRs
+ * have already been set at vmentry time and should not be reset.
+ */
+ kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
+ }
+
+ /*
+ * Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI
+ * when it executed VMLAUNCH/VMRESUME, as entering non-root mode can
+ * effectively unblock various events, e.g. INIT/SIPI cause VM-Exit
+ * unconditionally. Take care to pull data from vmcs01 as appropriate,
+ * e.g. when checking for interrupt windows, as vmcs02 is now loaded.
+ */
+ if ((__exec_controls_get(&vmx->vmcs01) & (CPU_BASED_INTR_WINDOW_EXITING |
+ CPU_BASED_NMI_WINDOW_EXITING)) ||
+ kvm_apic_has_pending_init_or_sipi(vcpu) ||
+ kvm_apic_has_interrupt(vcpu))
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ /*
+ * Do not start the preemption timer hrtimer until after we know
+ * we are successful, so that only nested_vmx_vmexit needs to cancel
+ * the timer.
+ */
+ vmx->nested.preemption_timer_expired = false;
+ if (nested_cpu_has_preemption_timer(vmcs12)) {
+ u64 timer_value = vmx_calc_preemption_timer_value(vcpu);
+ vmx_start_preemption_timer(vcpu, timer_value);
+ }
+
+ /*
+ * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
+ * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
+ * returned as far as L1 is concerned. It will only return (and set
+ * the success flag) when L2 exits (see nested_vmx_vmexit()).
+ */
+ return NVMX_VMENTRY_SUCCESS;
+
+ /*
+ * A failed consistency check that leads to a VMExit during L1's
+ * VMEnter to L2 is a variation of a normal VMexit, as explained in
+ * 26.7 "VM-entry failures during or after loading guest state".
+ */
+vmentry_fail_vmexit_guest_mode:
+ if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
+ vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
+ leave_guest_mode(vcpu);
+
+vmentry_fail_vmexit:
+ vmx_switch_vmcs(vcpu, &vmx->vmcs01);
+
+ if (!from_vmentry)
+ return NVMX_VMENTRY_VMEXIT;
+
+ load_vmcs12_host_state(vcpu, vmcs12);
+ vmcs12->vm_exit_reason = exit_reason.full;
+ if (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx))
+ vmx->nested.need_vmcs12_to_shadow_sync = true;
+ return NVMX_VMENTRY_VMEXIT;
+}
+
+/*
+ * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
+ * for running an L2 nested guest.
+ */
+static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
+{
+ struct vmcs12 *vmcs12;
+ enum nvmx_vmentry_status status;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
+ enum nested_evmptrld_status evmptrld_status;
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch);
+ if (evmptrld_status == EVMPTRLD_ERROR) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ kvm_pmu_branch_retired(vcpu);
+
+ if (CC(evmptrld_status == EVMPTRLD_VMFAIL))
+ return nested_vmx_failInvalid(vcpu);
+
+ if (CC(!nested_vmx_is_evmptr12_valid(vmx) &&
+ vmx->nested.current_vmptr == INVALID_GPA))
+ return nested_vmx_failInvalid(vcpu);
+
+ vmcs12 = get_vmcs12(vcpu);
+
+ /*
+ * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact
+ * that there *is* a valid VMCS pointer, RFLAGS.CF is set
+ * rather than RFLAGS.ZF, and no error number is stored to the
+ * VM-instruction error field.
+ */
+ if (CC(vmcs12->hdr.shadow_vmcs))
+ return nested_vmx_failInvalid(vcpu);
+
+ if (nested_vmx_is_evmptr12_valid(vmx)) {
+ struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
+
+ copy_enlightened_to_vmcs12(vmx, evmcs->hv_clean_fields);
+ /* Enlightened VMCS doesn't have launch state */
+ vmcs12->launch_state = !launch;
+ } else if (enable_shadow_vmcs) {
+ copy_shadow_to_vmcs12(vmx);
+ }
+
+ /*
+ * The nested entry process starts with enforcing various prerequisites
+ * on vmcs12 as required by the Intel SDM, and acting appropriately when
+ * they fail: As the SDM explains, some conditions should cause the
+ * instruction to fail, while others will cause the instruction to seem
+ * to succeed, but return an EXIT_REASON_INVALID_STATE.
+ * To speed up the normal (success) code path, we should avoid checking
+ * for misconfigurations which will be caught anyway by the processor
+ * when using the merged vmcs02.
+ */
+ if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS))
+ return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
+
+ if (CC(vmcs12->launch_state == launch))
+ return nested_vmx_fail(vcpu,
+ launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
+ : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
+
+ if (nested_vmx_check_controls(vcpu, vmcs12))
+ return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+
+ if (nested_vmx_check_address_space_size(vcpu, vmcs12))
+ return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
+
+ if (nested_vmx_check_host_state(vcpu, vmcs12))
+ return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
+
+ /*
+ * We're finally done with prerequisite checking, and can start with
+ * the nested entry.
+ */
+ vmx->nested.nested_run_pending = 1;
+ vmx->nested.has_preemption_timer_deadline = false;
+ status = nested_vmx_enter_non_root_mode(vcpu, true);
+ if (unlikely(status != NVMX_VMENTRY_SUCCESS))
+ goto vmentry_failed;
+
+ /* Hide L1D cache contents from the nested guest. */
+ kvm_request_l1tf_flush_l1d();
+
+ /*
+ * Must happen outside of nested_vmx_enter_non_root_mode() as it will
+ * also be used as part of restoring nVMX state for
+ * snapshot restore (migration).
+ *
+ * In this flow, it is assumed that vmcs12 cache was
+ * transferred as part of captured nVMX state and should
+ * therefore not be read from guest memory (which may not
+ * exist on destination host yet).
+ */
+ nested_cache_shadow_vmcs12(vcpu, vmcs12);
+
+ switch (vmcs12->guest_activity_state) {
+ case GUEST_ACTIVITY_HLT:
+ /*
+ * If we're entering a halted L2 vcpu and the L2 vcpu won't be
+ * awakened by event injection or by an NMI-window VM-exit or
+ * by an interrupt-window VM-exit, halt the vcpu.
+ */
+ if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
+ !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) &&
+ !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) &&
+ (vmcs12->guest_rflags & X86_EFLAGS_IF))) {
+ vmx->nested.nested_run_pending = 0;
+ return kvm_emulate_halt_noskip(vcpu);
+ }
+ break;
+ case GUEST_ACTIVITY_WAIT_SIPI:
+ vmx->nested.nested_run_pending = 0;
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED);
+ break;
+ default:
+ break;
+ }
+
+ return 1;
+
+vmentry_failed:
+ vmx->nested.nested_run_pending = 0;
+ if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR)
+ return 0;
+ if (status == NVMX_VMENTRY_VMEXIT)
+ return 1;
+ WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL);
+ return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+}
+
+/*
+ * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
+ * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
+ * This function returns the new value we should put in vmcs12.guest_cr0.
+ * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
+ * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
+ * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
+ * didn't trap the bit, because if L1 did, so would L0).
+ * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have
+ * been modified by L2, and L1 knows it. So just leave the old value of
+ * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
+ * isn't relevant, because if L0 traps this bit it can set it to anything.
+ * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
+ * changed these bits, and therefore they need to be updated, but L0
+ * didn't necessarily allow them to be changed in GUEST_CR0 - and rather
+ * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
+ */
+static inline unsigned long
+vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+ return
+ /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
+ /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
+ /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
+ vcpu->arch.cr0_guest_owned_bits));
+}
+
+static inline unsigned long
+vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+ return
+ /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
+ /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
+ /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
+ vcpu->arch.cr4_guest_owned_bits));
+}
+
+static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12,
+ u32 vm_exit_reason, u32 exit_intr_info)
+{
+ u32 idt_vectoring;
+ unsigned int nr;
+
+ /*
+ * Per the SDM, VM-Exits due to double and triple faults are never
+ * considered to occur during event delivery, even if the double/triple
+ * fault is the result of an escalating vectoring issue.
+ *
+ * Note, the SDM qualifies the double fault behavior with "The original
+ * event results in a double-fault exception". It's unclear why the
+ * qualification exists since exits due to double fault can occur only
+ * while vectoring a different exception (injected events are never
+ * subject to interception), i.e. there's _always_ an original event.
+ *
+ * The SDM also uses NMI as a confusing example for the "original event
+ * causes the VM exit directly" clause. NMI isn't special in any way,
+ * the same rule applies to all events that cause an exit directly.
+ * NMI is an odd choice for the example because NMIs can only occur on
+ * instruction boundaries, i.e. they _can't_ occur during vectoring.
+ */
+ if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT ||
+ ((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI &&
+ is_double_fault(exit_intr_info))) {
+ vmcs12->idt_vectoring_info_field = 0;
+ } else if (vcpu->arch.exception.injected) {
+ nr = vcpu->arch.exception.vector;
+ idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
+
+ if (kvm_exception_is_soft(nr)) {
+ vmcs12->vm_exit_instruction_len =
+ vcpu->arch.event_exit_inst_len;
+ idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
+ } else
+ idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;
+
+ if (vcpu->arch.exception.has_error_code) {
+ idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
+ vmcs12->idt_vectoring_error_code =
+ vcpu->arch.exception.error_code;
+ }
+
+ vmcs12->idt_vectoring_info_field = idt_vectoring;
+ } else if (vcpu->arch.nmi_injected) {
+ vmcs12->idt_vectoring_info_field =
+ INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
+ } else if (vcpu->arch.interrupt.injected) {
+ nr = vcpu->arch.interrupt.nr;
+ idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
+
+ if (vcpu->arch.interrupt.soft) {
+ idt_vectoring |= INTR_TYPE_SOFT_INTR;
+ vmcs12->vm_entry_instruction_len =
+ vcpu->arch.event_exit_inst_len;
+ } else
+ idt_vectoring |= INTR_TYPE_EXT_INTR;
+
+ vmcs12->idt_vectoring_info_field = idt_vectoring;
+ } else {
+ vmcs12->idt_vectoring_info_field = 0;
+ }
+}
+
+void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ gfn_t gfn;
+
+ /*
+ * Don't need to mark the APIC access page dirty; it is never
+ * written to by the CPU during APIC virtualization.
+ */
+
+ if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
+ gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
+ kvm_vcpu_mark_page_dirty(vcpu, gfn);
+ }
+
+ if (nested_cpu_has_posted_intr(vmcs12)) {
+ gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
+ kvm_vcpu_mark_page_dirty(vcpu, gfn);
+ }
+}
+
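+/*
+ * Emulate posted-interrupt processing for L2: if the posted-interrupt
+ * descriptor's ON bit is set, move pending vectors from its PIR into the
+ * virtual-APIC page's IRR and bump RVI in GUEST_INTR_STATUS so that the
+ * highest priority pending interrupt is recognized for delivery to L2.
+ */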
+static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int max_irr;
+ void *vapic_page;
+ u16 status;
+
+ if (!vmx->nested.pi_pending)
+ return 0;
+
+ if (!vmx->nested.pi_desc)
+ goto mmio_needed;
+
+ vmx->nested.pi_pending = false;
+
+ if (!pi_test_and_clear_on(vmx->nested.pi_desc))
+ return 0;
+
+ max_irr = pi_find_highest_vector(vmx->nested.pi_desc);
+ if (max_irr > 0) {
+ vapic_page = vmx->nested.virtual_apic_map.hva;
+ if (!vapic_page)
+ goto mmio_needed;
+
+ __kvm_apic_update_irr(vmx->nested.pi_desc->pir,
+ vapic_page, &max_irr);
+ status = vmcs_read16(GUEST_INTR_STATUS);
+ if ((u8)max_irr > ((u8)status & 0xff)) {
+ status &= ~0xff;
+ status |= (u8)max_irr;
+ vmcs_write16(GUEST_INTR_STATUS, status);
+ }
+ }
+
+ nested_mark_vmcs12_pages_dirty(vcpu);
+ return 0;
+
+mmio_needed:
+ kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL);
+ return -ENXIO;
+}
+
+static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu)
+{
+ struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;
+ u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ unsigned long exit_qual;
+
+ if (ex->has_payload) {
+ exit_qual = ex->payload;
+ } else if (ex->vector == PF_VECTOR) {
+ exit_qual = vcpu->arch.cr2;
+ } else if (ex->vector == DB_VECTOR) {
+ exit_qual = vcpu->arch.dr6;
+ exit_qual &= ~DR6_BT;
+ exit_qual ^= DR6_ACTIVE_LOW;
+ } else {
+ exit_qual = 0;
+ }
+
+ /*
+ * Unlike AMD's Paged Real Mode, which reports an error code on #PF
+ * VM-Exits even if the CPU is in Real Mode, Intel VMX never sets the
+ * "has error code" flags on VM-Exit if the CPU is in Real Mode.
+ */
+ if (ex->has_error_code && is_protmode(vcpu)) {
+ /*
+ * Intel CPUs do not generate error codes with bits 31:16 set,
+ * and more importantly VMX disallows setting bits 31:16 in the
+ * injected error code for VM-Entry. Drop the bits to mimic
+ * hardware and avoid inducing failure on nested VM-Entry if L1
+ * chooses to inject the exception back to L2. AMD CPUs _do_
+ * generate "full" 32-bit error codes, so KVM allows userspace
+ * to inject exception error codes with bits 31:16 set.
+ */
+ vmcs12->vm_exit_intr_error_code = (u16)ex->error_code;
+ intr_info |= INTR_INFO_DELIVER_CODE_MASK;
+ }
+
+ if (kvm_exception_is_soft(ex->vector))
+ intr_info |= INTR_TYPE_SOFT_EXCEPTION;
+ else
+ intr_info |= INTR_TYPE_HARD_EXCEPTION;
+
+ if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
+ vmx_get_nmi_mask(vcpu))
+ intr_info |= INTR_INFO_UNBLOCK_NMI;
+
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
+}
+
+/*
+ * Returns true if a debug trap is (likely) pending delivery. Infer the class
+ * of a #DB (trap-like vs. fault-like) from the exception payload (to-be-DR6).
+ * Using the payload is flawed because code breakpoints (fault-like) and data
+ * breakpoints (trap-like) set the same bits in DR6 (breakpoint detected), i.e.
+ * this will return false positives if a to-be-injected code breakpoint #DB is
+ * pending (from KVM's perspective, but not "pending" across an instruction
+ * boundary). ICEBP, a.k.a. INT1, is also not reflected here even though it
+ * too is trap-like.
+ *
+ * KVM "works" despite these flaws as ICEBP isn't currently supported by the
+ * emulator, Monitor Trap Flag is not marked pending on intercepted #DBs (the
+ * #DB has already happened), and MTF isn't marked pending on code breakpoints
+ * from the emulator (because such #DBs are fault-like and thus don't trigger
+ * actions that fire on instruction retire).
+ */
+static unsigned long vmx_get_pending_dbg_trap(struct kvm_queued_exception *ex)
+{
+ if (!ex->pending || ex->vector != DB_VECTOR)
+ return 0;
+
+ /* General Detect #DBs are always fault-like. */
+ return ex->payload & ~DR6_BD;
+}
+
+/*
+ * Returns true if there's a pending #DB exception that is lower priority than
+ * a pending Monitor Trap Flag VM-Exit. TSS T-flag #DBs are not emulated by
+ * KVM, but could theoretically be injected by userspace. Note, this code is
+ * imperfect, see above.
+ */
+static bool vmx_is_low_priority_db_trap(struct kvm_queued_exception *ex)
+{
+ return vmx_get_pending_dbg_trap(ex) & ~DR6_BT;
+}
+
+/*
+ * Certain VM-exits set the 'pending debug exceptions' field to indicate a
+ * recognized #DB (data or single-step) that has yet to be delivered. Since KVM
+ * represents these debug traps with a payload that is said to be compatible
+ * with the 'pending debug exceptions' field, write the payload to the VMCS
+ * field if a VM-exit is delivered before the debug trap.
+ */
+static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu)
+{
+ unsigned long pending_dbg;
+
+ pending_dbg = vmx_get_pending_dbg_trap(&vcpu->arch.exception);
+ if (pending_dbg)
+ vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, pending_dbg);
+}
+
+static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
+{
+ return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
+ to_vmx(vcpu)->nested.preemption_timer_expired;
+}
+
+static bool vmx_has_nested_events(struct kvm_vcpu *vcpu, bool for_injection)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ void *vapic = vmx->nested.virtual_apic_map.hva;
+ int max_irr, vppr;
+
+ if (nested_vmx_preemption_timer_pending(vcpu) ||
+ vmx->nested.mtf_pending)
+ return true;
+
+ /*
+ * Virtual Interrupt Delivery doesn't require manual injection. Either
+ * the interrupt is already in GUEST_RVI and will be recognized by CPU
+ * at VM-Entry, or there is a KVM_REQ_EVENT pending and KVM will move
+ * the interrupt from the PIR to RVI prior to entering the guest.
+ */
+ if (for_injection)
+ return false;
+
+ if (!nested_cpu_has_vid(get_vmcs12(vcpu)) ||
+ __vmx_interrupt_blocked(vcpu))
+ return false;
+
+ if (!vapic)
+ return false;
+
+ vppr = *((u32 *)(vapic + APIC_PROCPRI));
+
+ max_irr = vmx_get_rvi();
+ if ((max_irr & 0xf0) > (vppr & 0xf0))
+ return true;
+
+ if (vmx->nested.pi_pending && vmx->nested.pi_desc &&
+ pi_test_on(vmx->nested.pi_desc)) {
+ max_irr = pi_find_highest_vector(vmx->nested.pi_desc);
+ if (max_irr > 0 && (max_irr & 0xf0) > (vppr & 0xf0))
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Per the Intel SDM's table "Priority Among Concurrent Events", with minor
+ * edits to fill in missing examples, e.g. #DB due to split-lock accesses,
+ * and less minor edits to splice in the priority of VMX Non-Root specific
+ * events, e.g. MTF and NMI/INTR-window exiting.
+ *
+ * 1 Hardware Reset and Machine Checks
+ * - RESET
+ * - Machine Check
+ *
+ * 2 Trap on Task Switch
+ * - T flag in TSS is set (on task switch)
+ *
+ * 3 External Hardware Interventions
+ * - FLUSH
+ * - STOPCLK
+ * - SMI
+ * - INIT
+ *
+ * 3.5 Monitor Trap Flag (MTF) VM-exit[1]
+ *
+ * 4 Traps on Previous Instruction
+ * - Breakpoints
+ * - Trap-class Debug Exceptions (#DB due to TF flag set, data/I-O
+ * breakpoint, or #DB due to a split-lock access)
+ *
+ * 4.3 VMX-preemption timer expired VM-exit
+ *
+ * 4.6 NMI-window exiting VM-exit[2]
+ *
+ * 5 Nonmaskable Interrupts (NMI)
+ *
+ * 5.5 Interrupt-window exiting VM-exit and Virtual-interrupt delivery
+ *
+ * 6 Maskable Hardware Interrupts
+ *
+ * 7 Code Breakpoint Fault
+ *
+ * 8 Faults from Fetching Next Instruction
+ * - Code-Segment Limit Violation
+ * - Code Page Fault
+ * - Control protection exception (missing ENDBRANCH at target of indirect
+ * call or jump)
+ *
+ * 9 Faults from Decoding Next Instruction
+ * - Instruction length > 15 bytes
+ * - Invalid Opcode
+ * - Coprocessor Not Available
+ *
+ *10 Faults on Executing Instruction
+ * - Overflow
+ * - Bound error
+ * - Invalid TSS
+ * - Segment Not Present
+ * - Stack fault
+ * - General Protection
+ * - Data Page Fault
+ * - Alignment Check
+ * - x86 FPU Floating-point exception
+ * - SIMD floating-point exception
+ * - Virtualization exception
+ * - Control protection exception
+ *
+ * [1] Per the "Monitor Trap Flag" section: System-management interrupts (SMIs),
+ * INIT signals, and higher priority events take priority over MTF VM exits.
+ * MTF VM exits take priority over debug-trap exceptions and lower priority
+ * events.
+ *
+ * [2] Debug-trap exceptions and higher priority events take priority over VM exits
+ * caused by the VMX-preemption timer. VM exits caused by the VMX-preemption
+ * timer take priority over VM exits caused by the "NMI-window exiting"
+ * VM-execution control and lower priority events.
+ *
+ * [3] Debug-trap exceptions and higher priority events take priority over VM exits
+ * caused by "NMI-window exiting". VM exits caused by this control take
+ * priority over non-maskable interrupts (NMIs) and lower priority events.
+ *
+ * [4] Virtual-interrupt delivery has the same priority as that of VM exits due to
+ * the 1-setting of the "interrupt-window exiting" VM-execution control. Thus,
+ * non-maskable interrupts (NMIs) and higher priority events take priority over
+ * delivery of a virtual interrupt; delivery of a virtual interrupt takes
+ * priority over external interrupts and lower priority events.
+ */
+static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ /*
+ * Only a pending nested run blocks a pending exception. If there is a
+ * previously injected event, the pending exception occurred while said
+ * event was being delivered and thus needs to be handled.
+ */
+ bool block_nested_exceptions = vmx->nested.nested_run_pending;
+ /*
+ * Events that don't require injection, i.e. that are virtualized by
+ * hardware, aren't blocked by a pending VM-Enter as KVM doesn't need
+ * to regain control in order to deliver the event, and hardware will
+ * handle event ordering, e.g. with respect to injected exceptions.
+ *
+ * But, new events (not exceptions) are only recognized at instruction
+ * boundaries. If an event needs reinjection, then KVM is handling a
+ * VM-Exit that occurred _during_ instruction execution; new events,
+ * irrespective of whether or not they're injected, are blocked until
+ * the instruction completes.
+ */
+ bool block_non_injected_events = kvm_event_needs_reinjection(vcpu);
+ /*
+ * Injected events are blocked by nested VM-Enter, as KVM is responsible
+ * for managing priority between concurrent events, i.e. KVM needs to
+ * wait until after VM-Enter completes to deliver injected events.
+ */
+ bool block_nested_events = block_nested_exceptions ||
+ block_non_injected_events;
+
+ if (lapic_in_kernel(vcpu) &&
+ test_bit(KVM_APIC_INIT, &apic->pending_events)) {
+ if (block_nested_events)
+ return -EBUSY;
+ nested_vmx_update_pending_dbg(vcpu);
+ clear_bit(KVM_APIC_INIT, &apic->pending_events);
+ if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED)
+ nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0);
+
+ /* MTF is discarded if the vCPU is in WFS. */
+ vmx->nested.mtf_pending = false;
+ return 0;
+ }
+
+ if (lapic_in_kernel(vcpu) &&
+ test_bit(KVM_APIC_SIPI, &apic->pending_events)) {
+ if (block_nested_events)
+ return -EBUSY;
+
+ clear_bit(KVM_APIC_SIPI, &apic->pending_events);
+ if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
+ nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0,
+ apic->sipi_vector & 0xFFUL);
+ return 0;
+ }
+ /* Fallthrough, the SIPI is completely ignored. */
+ }
+
+ /*
+ * Process exceptions that are higher priority than Monitor Trap Flag:
+ * fault-like exceptions, TSS T flag #DB (not emulated by KVM, but
+ * could theoretically come in from userspace), and ICEBP (INT1).
+ *
+ * TODO: SMIs have higher priority than MTF and trap-like #DBs (except
+ * for TSS T flag #DBs). KVM also doesn't save/restore pending MTF
+ * across SMI/RSM as it should; that needs to be addressed in order to
+ * prioritize SMI over MTF and trap-like #DBs.
+ */
+ if (vcpu->arch.exception_vmexit.pending &&
+ !vmx_is_low_priority_db_trap(&vcpu->arch.exception_vmexit)) {
+ if (block_nested_exceptions)
+ return -EBUSY;
+
+ nested_vmx_inject_exception_vmexit(vcpu);
+ return 0;
+ }
+
+ if (vcpu->arch.exception.pending &&
+ !vmx_is_low_priority_db_trap(&vcpu->arch.exception)) {
+ if (block_nested_exceptions)
+ return -EBUSY;
+ goto no_vmexit;
+ }
+
+ if (vmx->nested.mtf_pending) {
+ if (block_nested_events)
+ return -EBUSY;
+ nested_vmx_update_pending_dbg(vcpu);
+ nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0);
+ return 0;
+ }
+
+ if (vcpu->arch.exception_vmexit.pending) {
+ if (block_nested_exceptions)
+ return -EBUSY;
+
+ nested_vmx_inject_exception_vmexit(vcpu);
+ return 0;
+ }
+
+ if (vcpu->arch.exception.pending) {
+ if (block_nested_exceptions)
+ return -EBUSY;
+ goto no_vmexit;
+ }
+
+ if (nested_vmx_preemption_timer_pending(vcpu)) {
+ if (block_nested_events)
+ return -EBUSY;
+ nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
+ return 0;
+ }
+
+ if (vcpu->arch.smi_pending && !is_smm(vcpu)) {
+ if (block_nested_events)
+ return -EBUSY;
+ goto no_vmexit;
+ }
+
+ if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) {
+ if (block_nested_events)
+ return -EBUSY;
+ if (!nested_exit_on_nmi(vcpu))
+ goto no_vmexit;
+
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
+ NMI_VECTOR | INTR_TYPE_NMI_INTR |
+ INTR_INFO_VALID_MASK, 0);
+ /*
+ * The NMI-triggered VM exit counts as injection:
+ * clear this one and block further NMIs.
+ */
+ vcpu->arch.nmi_pending = 0;
+ vmx_set_nmi_mask(vcpu, true);
+ return 0;
+ }
+
+ if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) {
+ int irq;
+
+ if (!nested_exit_on_intr(vcpu)) {
+ if (block_nested_events)
+ return -EBUSY;
+
+ goto no_vmexit;
+ }
+
+ if (!nested_exit_intr_ack_set(vcpu)) {
+ if (block_nested_events)
+ return -EBUSY;
+
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
+ return 0;
+ }
+
+ irq = kvm_cpu_get_extint(vcpu);
+ if (irq != -1) {
+ if (block_nested_events)
+ return -EBUSY;
+
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
+ INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0);
+ return 0;
+ }
+
+ irq = kvm_apic_has_interrupt(vcpu);
+ if (WARN_ON_ONCE(irq < 0))
+ goto no_vmexit;
+
+ /*
+ * If the IRQ is L2's PI notification vector, process posted
+ * interrupts for L2 instead of injecting VM-Exit, as the
+ * detection/morphing architecturally occurs when the IRQ is
+ * delivered to the CPU. Note, only interrupts that are routed
+ * through the local APIC trigger posted interrupt processing,
+ * and enabling posted interrupts requires ACK-on-exit.
+ */
+ if (irq == vmx->nested.posted_intr_nv) {
+ /*
+ * Nested posted interrupts are delivered via RVI, i.e.
+ * aren't injected by KVM, and so can be queued even if
+ * manual event injection is disallowed.
+ */
+ if (block_non_injected_events)
+ return -EBUSY;
+
+ vmx->nested.pi_pending = true;
+ kvm_apic_clear_irr(vcpu, irq);
+ goto no_vmexit;
+ }
+
+ if (block_nested_events)
+ return -EBUSY;
+
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
+ INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0);
+
+ /*
+ * ACK the interrupt _after_ emulating VM-Exit, as the IRQ must
+ * be marked as in-service in vmcs01.GUEST_INTERRUPT_STATUS.SVI
+ * if APICv is active.
+ */
+ kvm_apic_ack_interrupt(vcpu, irq);
+ return 0;
+ }
+
+no_vmexit:
+ return vmx_complete_nested_posted_interrupt(vcpu);
+}
+
+static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
+{
+ ktime_t remaining =
+ hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
+ u64 value;
+
+ if (ktime_to_ns(remaining) <= 0)
+ return 0;
+
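+ /*
+ * Convert the remaining time to TSC cycles (ns * kHz / 10^6), then to
+ * preemption timer units, which tick once every
+ * 2^VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE TSC cycles.
+ */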
+ value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
+ do_div(value, 1000000);
+ return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
+}
+
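+/*
+ * Guest-state fields that are synced from vmcs02 to vmcs12 only on demand,
+ * via sync_vmcs02_to_vmcs12_rare(), to avoid VMREADs for fields L1 rarely
+ * accesses.
+ */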
+static bool is_vmcs12_ext_field(unsigned long field)
+{
+ switch (field) {
+ case GUEST_ES_SELECTOR:
+ case GUEST_CS_SELECTOR:
+ case GUEST_SS_SELECTOR:
+ case GUEST_DS_SELECTOR:
+ case GUEST_FS_SELECTOR:
+ case GUEST_GS_SELECTOR:
+ case GUEST_LDTR_SELECTOR:
+ case GUEST_TR_SELECTOR:
+ case GUEST_ES_LIMIT:
+ case GUEST_CS_LIMIT:
+ case GUEST_SS_LIMIT:
+ case GUEST_DS_LIMIT:
+ case GUEST_FS_LIMIT:
+ case GUEST_GS_LIMIT:
+ case GUEST_LDTR_LIMIT:
+ case GUEST_TR_LIMIT:
+ case GUEST_GDTR_LIMIT:
+ case GUEST_IDTR_LIMIT:
+ case GUEST_ES_AR_BYTES:
+ case GUEST_DS_AR_BYTES:
+ case GUEST_FS_AR_BYTES:
+ case GUEST_GS_AR_BYTES:
+ case GUEST_LDTR_AR_BYTES:
+ case GUEST_TR_AR_BYTES:
+ case GUEST_ES_BASE:
+ case GUEST_CS_BASE:
+ case GUEST_SS_BASE:
+ case GUEST_DS_BASE:
+ case GUEST_FS_BASE:
+ case GUEST_GS_BASE:
+ case GUEST_LDTR_BASE:
+ case GUEST_TR_BASE:
+ case GUEST_GDTR_BASE:
+ case GUEST_IDTR_BASE:
+ case GUEST_PENDING_DBG_EXCEPTIONS:
+ case GUEST_BNDCFGS:
+ return true;
+ default:
+ break;
+ }
+
+ return false;
+}
+
+static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
+ vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
+ vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
+ vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
+ vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
+ vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
+ vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
+ vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
+ vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
+ vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
+ vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
+ vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
+ vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
+ vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
+ vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
+ vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
+ vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
+ vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
+ vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
+ vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
+ vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
+ vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
+ vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
+ vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
+ vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
+ vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
+ vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
+ vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
+ vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
+ vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
+ vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
+ vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
+ vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
+ vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
+ vmcs12->guest_pending_dbg_exceptions =
+ vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
+
+ vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
+}
+
+static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int cpu;
+
+ if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare)
+ return;
+
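+ /*
+ * Temporarily load vmcs02 so that the rarely-synced guest-state fields
+ * can be read via VMREAD, then switch back to vmcs01.
+ */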
+ WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01);
+
+ cpu = get_cpu();
+ vmx->loaded_vmcs = &vmx->nested.vmcs02;
+ vmx_vcpu_load_vmcs(vcpu, cpu);
+
+ sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
+
+ vmx->loaded_vmcs = &vmx->vmcs01;
+ vmx_vcpu_load_vmcs(vcpu, cpu);
+ put_cpu();
+}
+
+/*
+ * Update the guest state fields of vmcs12 to reflect changes that
+ * occurred while L2 was running. (The "IA-32e mode guest" bit of the
+ * VM-entry controls is also updated, since this is really a guest
+ * state bit.)
+ */
+static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (nested_vmx_is_evmptr12_valid(vmx))
+ sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
+
+ vmx->nested.need_sync_vmcs02_to_vmcs12_rare =
+ !nested_vmx_is_evmptr12_valid(vmx);
+
+ vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
+ vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
+
+ vmcs12->guest_rsp = kvm_rsp_read(vcpu);
+ vmcs12->guest_rip = kvm_rip_read(vcpu);
+ vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
+
+ vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
+ vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
+
+ vmcs12->guest_interruptibility_info =
+ vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+
+ if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
+ vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
+ else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
+ vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI;
+ else
+ vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
+
+ if (nested_cpu_has_preemption_timer(vmcs12) &&
+ vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER &&
+ !vmx->nested.nested_run_pending)
+ vmcs12->vmx_preemption_timer_value =
+ vmx_get_preemption_timer_value(vcpu);
+
+ /*
+ * In some cases (usually, nested EPT), L2 is allowed to change its
+ * own CR3 without exiting. If it has changed it, we must keep it.
+ * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
+ * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
+ *
+ * Additionally, restore L2's PDPTR to vmcs12.
+ */
+ if (enable_ept) {
+ vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3);
+ if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
+ vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
+ vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
+ vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
+ vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
+ }
+ }
+
+ vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
+
+ if (nested_cpu_has_vid(vmcs12))
+ vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
+
+ vmcs12->vm_entry_controls =
+ (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
+ (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
+
+ /*
+ * Note! Save DR7, but intentionally don't grab DEBUGCTL from vmcs02.
+ * Writes to DEBUGCTL that aren't intercepted by L1 are immediately
+ * propagated to vmcs12 (see vmx_set_msr()), as the value loaded into
+ * vmcs02 doesn't strictly track vmcs12.
+ */
+ if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
+ vmcs12->guest_dr7 = vcpu->arch.dr7;
+
+ if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
+ vmcs12->guest_ia32_efer = vcpu->arch.efer;
+
+ vmcs_read_cet_state(&vmx->vcpu, &vmcs12->guest_s_cet,
+ &vmcs12->guest_ssp,
+ &vmcs12->guest_ssp_tbl);
+}
+
+/*
+ * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
+ * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
+ * and this function updates it to reflect the changes to the guest state while
+ * L2 was running (and perhaps made some exits which were handled directly by L0
+ * without going back to L1), and to reflect the exit reason.
+ * Note that we do not have to copy here all VMCS fields, just those that
+ * could have changed by the L2 guest or the exit - i.e., the guest-state and
+ * exit-information fields only. Other fields are modified by L1 with VMWRITE,
+ * which already writes to vmcs12 directly.
+ */
+static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+ u32 vm_exit_reason, u32 exit_intr_info,
+ unsigned long exit_qualification, u32 exit_insn_len)
+{
+ /* update exit information fields: */
+ vmcs12->vm_exit_reason = vm_exit_reason;
+ if (vmx_get_exit_reason(vcpu).enclave_mode)
+ vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE;
+ vmcs12->exit_qualification = exit_qualification;
+
+ /*
+ * On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched
+ * and only EXIT_REASON and EXIT_QUALIFICATION are updated, all other
+ * exit info fields are unmodified.
+ */
+ if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
+ vmcs12->launch_state = 1;
+
+		/*
+		 * vm_entry_intr_info_field is cleared on exit. Emulate this
+		 * instead of reading the real value.
+		 */
+ vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
+
+ /*
+		 * Transfer the event that L0 or L1 may have wanted to inject into
+ * L2 to IDT_VECTORING_INFO_FIELD.
+ */
+ vmcs12_save_pending_event(vcpu, vmcs12,
+ vm_exit_reason, exit_intr_info);
+
+ vmcs12->vm_exit_intr_info = exit_intr_info;
+ vmcs12->vm_exit_instruction_len = exit_insn_len;
+ vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+
+ /*
+ * According to spec, there's no need to store the guest's
+ * MSRs if the exit is due to a VM-entry failure that occurs
+ * during or after loading the guest state. Since this exit
+ * does not fall in that category, we need to save the MSRs.
+ */
+ if (nested_vmx_store_msr(vcpu,
+ vmcs12->vm_exit_msr_store_addr,
+ vmcs12->vm_exit_msr_store_count))
+ nested_vmx_abort(vcpu,
+ VMX_ABORT_SAVE_GUEST_MSR_FAIL);
+ }
+}
+
+/*
+ * A part of what we need to do when the nested L2 guest exits and we want to
+ * run its L1 parent, is to reset L1's guest state to the host state specified
+ * in vmcs12.
+ * This function is to be called not only on normal nested exit, but also on
+ * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
+ * Failures During or After Loading Guest State").
+ * This function should be called when the active VMCS is L1's (vmcs01).
+ */
+static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ enum vm_entry_failure_code ignored;
+ struct kvm_segment seg;
+
+ if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
+ vcpu->arch.efer = vmcs12->host_ia32_efer;
+ else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
+ vcpu->arch.efer |= (EFER_LMA | EFER_LME);
+ else
+ vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
+ vmx_set_efer(vcpu, vcpu->arch.efer);
+
+ kvm_rsp_write(vcpu, vmcs12->host_rsp);
+ kvm_rip_write(vcpu, vmcs12->host_rip);
+ vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
+ vmx_set_interrupt_shadow(vcpu, 0);
+
+ /*
+ * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
+ * actually changed, because vmx_set_cr0 refers to efer set above.
+ *
+ * CR0_GUEST_HOST_MASK is already set in the original vmcs01
+	 * (KVM doesn't change it).
+ */
+ vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
+ vmx_set_cr0(vcpu, vmcs12->host_cr0);
+
+ /* Same as above - no reason to call set_cr4_guest_host_mask(). */
+ vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
+ vmx_set_cr4(vcpu, vmcs12->host_cr4);
+
+ nested_ept_uninit_mmu_context(vcpu);
+
+ /*
+ * Only PDPTE load can fail as the value of cr3 was checked on entry and
+ * couldn't have changed.
+ */
+ if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored))
+ nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
+
+ nested_vmx_transition_tlb_flush(vcpu, vmcs12, false);
+
+ vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
+ vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
+ vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
+ vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
+ vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
+ vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
+ vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
+
+ /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */
+ if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
+ vmcs_write64(GUEST_BNDCFGS, 0);
+
+ /*
+	 * Load CET state from host state if VM_EXIT_LOAD_CET_STATE is set;
+ * otherwise CET state should be retained across VM-exit, i.e.,
+ * guest values should be propagated from vmcs12 to vmcs01.
+ */
+ if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_CET_STATE)
+ vmcs_write_cet_state(vcpu, vmcs12->host_s_cet, vmcs12->host_ssp,
+ vmcs12->host_ssp_tbl);
+ else
+ vmcs_write_cet_state(vcpu, vmcs12->guest_s_cet, vmcs12->guest_ssp,
+ vmcs12->guest_ssp_tbl);
+
+ if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
+ vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
+ vcpu->arch.pat = vmcs12->host_ia32_pat;
+ }
+ if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
+ kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)))
+ WARN_ON_ONCE(__kvm_emulate_msr_write(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
+ vmcs12->host_ia32_perf_global_ctrl));
+
+	/*
+	 * Set L1 segment info according to Intel SDM 27.5.2, "Loading Host
+	 * Segment and Descriptor-Table Registers".
+	 */
+ seg = (struct kvm_segment) {
+ .base = 0,
+ .limit = 0xFFFFFFFF,
+ .selector = vmcs12->host_cs_selector,
+ .type = 11,
+ .present = 1,
+ .s = 1,
+ .g = 1
+ };
+ if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
+ seg.l = 1;
+ else
+ seg.db = 1;
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
+ seg = (struct kvm_segment) {
+ .base = 0,
+ .limit = 0xFFFFFFFF,
+ .type = 3,
+ .present = 1,
+ .s = 1,
+ .db = 1,
+ .g = 1
+ };
+ seg.selector = vmcs12->host_ds_selector;
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
+ seg.selector = vmcs12->host_es_selector;
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
+ seg.selector = vmcs12->host_ss_selector;
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
+ seg.selector = vmcs12->host_fs_selector;
+ seg.base = vmcs12->host_fs_base;
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
+ seg.selector = vmcs12->host_gs_selector;
+ seg.base = vmcs12->host_gs_base;
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
+ seg = (struct kvm_segment) {
+ .base = vmcs12->host_tr_base,
+ .limit = 0x67,
+ .selector = vmcs12->host_tr_selector,
+ .type = 11,
+ .present = 1
+ };
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
+
+ memset(&seg, 0, sizeof(seg));
+ seg.unusable = 1;
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR);
+
+ kvm_set_dr(vcpu, 7, 0x400);
+ vmx_guest_debugctl_write(vcpu, 0);
+
+ if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
+ vmcs12->vm_exit_msr_load_count))
+ nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
+
+ to_vt(vcpu)->emulation_required = vmx_emulation_required(vcpu);
+}
+
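+/*
+ * Recover the EFER value the guest runs with under vmcs01.  If vmcs01
+ * doesn't load EFER on VM-Entry but the CPU supports the load-EFER control,
+ * the guest shares the host's EFER; otherwise the value comes from the MSR
+ * autoload list or the user-return MSR slot, falling back to the host value.
+ */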
+static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
+{
+ struct vmx_uret_msr *efer_msr;
+ unsigned int i;
+
+ if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
+ return vmcs_read64(GUEST_IA32_EFER);
+
+ if (cpu_has_load_ia32_efer())
+ return kvm_host.efer;
+
+ for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
+ if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
+ return vmx->msr_autoload.guest.val[i].value;
+ }
+
+ efer_msr = vmx_find_uret_msr(vmx, MSR_EFER);
+ if (efer_msr)
+ return efer_msr->data;
+
+ return kvm_host.efer;
+}
+
+static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmx_msr_entry g, h;
+ gpa_t gpa;
+ u32 i, j;
+
+ vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT);
+
+ if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
+ /*
+ * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set
+ * as vmcs01.GUEST_DR7 contains a userspace defined value
+ * and vcpu->arch.dr7 is not squirreled away before the
+ * nested VMENTER (not worth adding a variable in nested_vmx).
+ */
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
+ kvm_set_dr(vcpu, 7, DR7_FIXED_1);
+ else
+ WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
+ }
+
+ /* Reload DEBUGCTL to ensure vmcs01 has a fresh FREEZE_IN_SMM value. */
+ vmx_reload_guest_debugctl(vcpu);
+
+ /*
+ * Note that calling vmx_set_{efer,cr0,cr4} is important as they
+ * handle a variety of side effects to KVM's software model.
+ */
+ vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
+
+ vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
+ vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));
+
+ vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
+ vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
+
+ nested_ept_uninit_mmu_context(vcpu);
+ vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+ kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
+
+ /*
+ * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
+ * from vmcs01 (if necessary). The PDPTRs are not loaded on
+	 * VMFail; like everything else, we just need to ensure our
+ * software model is up-to-date.
+ */
+ if (enable_ept && is_pae_paging(vcpu))
+ ept_save_pdptrs(vcpu);
+
+ kvm_mmu_reset_context(vcpu);
+
+ /*
+ * This nasty bit of open coding is a compromise between blindly
+ * loading L1's MSRs using the exit load lists (incorrect emulation
+ * of VMFail), leaving the nested VM's MSRs in the software model
+ * (incorrect behavior) and snapshotting the modified MSRs (too
+ * expensive since the lists are unbound by hardware). For each
+ * MSR that was (prematurely) loaded from the nested VMEntry load
+ * list, reload it from the exit load list if it exists and differs
+ * from the guest value. The intent is to stuff host state as
+ * silently as possible, not to fully process the exit load list.
+ */
+ for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) {
+ gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g));
+ if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) {
+ pr_debug_ratelimited(
+ "%s read MSR index failed (%u, 0x%08llx)\n",
+ __func__, i, gpa);
+ goto vmabort;
+ }
+
+ for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) {
+ gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h));
+ if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) {
+ pr_debug_ratelimited(
+ "%s read MSR failed (%u, 0x%08llx)\n",
+ __func__, j, gpa);
+ goto vmabort;
+ }
+ if (h.index != g.index)
+ continue;
+ if (h.value == g.value)
+ break;
+
+ if (nested_vmx_load_msr_check(vcpu, &h)) {
+ pr_debug_ratelimited(
+ "%s check failed (%u, 0x%x, 0x%x)\n",
+ __func__, j, h.index, h.reserved);
+ goto vmabort;
+ }
+
+ if (kvm_emulate_msr_write(vcpu, h.index, h.value)) {
+ pr_debug_ratelimited(
+ "%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
+ __func__, j, h.index, h.value);
+ goto vmabort;
+ }
+ }
+ }
+
+ return;
+
+vmabort:
+ nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
+}
+
+/*
+ * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
+ * and modify vmcs12 to make it see what it would expect to see there if
+ * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
+ */
+void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
+ u32 exit_intr_info, unsigned long exit_qualification,
+ u32 exit_insn_len)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ /* Pending MTF traps are discarded on VM-Exit. */
+ vmx->nested.mtf_pending = false;
+
+ /* trying to cancel vmlaunch/vmresume is a bug */
+ WARN_ON_ONCE(vmx->nested.nested_run_pending);
+
+#ifdef CONFIG_KVM_HYPERV
+ if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
+ /*
+ * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map
+ * Enlightened VMCS after migration and we still need to
+ * do that when something is forcing L2->L1 exit prior to
+ * the first L2 run.
+ */
+ (void)nested_get_evmcs_page(vcpu);
+ }
+#endif
+
+ /* Service pending TLB flush requests for L2 before switching to L1. */
+ kvm_service_local_tlb_flush_requests(vcpu);
+
+ /*
+ * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between
+ * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are
+ * up-to-date before switching to L1.
+ */
+ if (enable_ept && is_pae_paging(vcpu))
+ vmx_ept_load_pdptrs(vcpu);
+
+ leave_guest_mode(vcpu);
+
+ if (nested_cpu_has_preemption_timer(vmcs12))
+ hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
+
+ if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) {
+ vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset;
+ if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
+ vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio;
+ }
+
+ if (likely(!vmx->fail)) {
+ sync_vmcs02_to_vmcs12(vcpu, vmcs12);
+
+ if (vm_exit_reason != -1)
+ prepare_vmcs12(vcpu, vmcs12, vm_exit_reason,
+ exit_intr_info, exit_qualification,
+ exit_insn_len);
+
+ /*
+ * Must happen outside of sync_vmcs02_to_vmcs12() as it will
+ * also be used to capture vmcs12 cache as part of
+ * capturing nVMX state for snapshot (migration).
+ *
+ * Otherwise, this flush will dirty guest memory at a
+ * point it is already assumed by user-space to be
+ * immutable.
+ */
+ nested_flush_cached_shadow_vmcs12(vcpu, vmcs12);
+ } else {
+ /*
+ * The only expected VM-instruction error is "VM entry with
+ * invalid control field(s)." Anything else indicates a
+ * problem with L0.
+ */
+ WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
+ VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+
+ /* VM-Fail at VM-Entry means KVM missed a consistency check. */
+ WARN_ON_ONCE(warn_on_missed_cc);
+ }
+
+ /*
+ * Drop events/exceptions that were queued for re-injection to L2
+ * (picked up via vmx_complete_interrupts()), as well as exceptions
+ * that were pending for L2. Note, this must NOT be hoisted above
+ * prepare_vmcs12(), events/exceptions queued for re-injection need to
+ * be captured in vmcs12 (see vmcs12_save_pending_event()).
+ */
+ vcpu->arch.nmi_injected = false;
+ kvm_clear_exception_queue(vcpu);
+ kvm_clear_interrupt_queue(vcpu);
+
+ vmx_switch_vmcs(vcpu, &vmx->vmcs01);
+
+ kvm_nested_vmexit_handle_ibrs(vcpu);
+
+ /* Update any VMCS fields that might have changed while L2 ran */
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
+ vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
+ if (kvm_caps.has_tsc_control)
+ vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
+
+ if (vmx->nested.l1_tpr_threshold != -1)
+ vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold);
+
+ if (vmx->nested.change_vmcs01_virtual_apic_mode) {
+ vmx->nested.change_vmcs01_virtual_apic_mode = false;
+ vmx_set_virtual_apic_mode(vcpu);
+ }
+
+ if (vmx->nested.update_vmcs01_cpu_dirty_logging) {
+ vmx->nested.update_vmcs01_cpu_dirty_logging = false;
+ vmx_update_cpu_dirty_logging(vcpu);
+ }
+
+ nested_put_vmcs12_pages(vcpu);
+
+ if (vmx->nested.reload_vmcs01_apic_access_page) {
+ vmx->nested.reload_vmcs01_apic_access_page = false;
+ kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
+ }
+
+ if (vmx->nested.update_vmcs01_apicv_status) {
+ vmx->nested.update_vmcs01_apicv_status = false;
+ kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+ }
+
+ if (vmx->nested.update_vmcs01_hwapic_isr) {
+ vmx->nested.update_vmcs01_hwapic_isr = false;
+ kvm_apic_update_hwapic_isr(vcpu);
+ }
+
+ if ((vm_exit_reason != -1) &&
+ (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx)))
+ vmx->nested.need_vmcs12_to_shadow_sync = true;
+
+ /* in case we halted in L2 */
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
+
+ if (likely(!vmx->fail)) {
+ if (vm_exit_reason != -1)
+ trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
+ vmcs12->exit_qualification,
+ vmcs12->idt_vectoring_info_field,
+ vmcs12->vm_exit_intr_info,
+ vmcs12->vm_exit_intr_error_code,
+ KVM_ISA_VMX);
+
+ load_vmcs12_host_state(vcpu, vmcs12);
+
+ /*
+ * Process events if an injectable IRQ or NMI is pending, even
+ * if the event is blocked (RFLAGS.IF is cleared on VM-Exit).
+ * If an event became pending while L2 was active, KVM needs to
+ * either inject the event or request an IRQ/NMI window. SMIs
+ * don't need to be processed as SMM is mutually exclusive with
+ * non-root mode. INIT/SIPI don't need to be checked as INIT
+ * is blocked post-VMXON, and SIPIs are ignored.
+ */
+ if (kvm_cpu_has_injectable_intr(vcpu) || vcpu->arch.nmi_pending)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ return;
+ }
+
+ /*
+ * After an early L2 VM-entry failure, we're now back
+ * in L1 which thinks it just finished a VMLAUNCH or
+ * VMRESUME instruction, so we need to set the failure
+ * flag and the VM-instruction error field of the VMCS
+ * accordingly, and skip the emulated instruction.
+ */
+ (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+
+ /*
+ * Restore L1's host state to KVM's software model. We're here
+ * because a consistency check was caught by hardware, which
+ * means some amount of guest state has been propagated to KVM's
+ * model and needs to be unwound to the host's state.
+ */
+ nested_vmx_restore_host_state(vcpu);
+
+ vmx->fail = 0;
+}
+
+static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu)
+{
+ kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
+}
+
+/*
+ * Decode the memory-address operand of a vmx instruction, as recorded on an
+ * exit caused by such an instruction (run by a guest hypervisor).
+ * On success, returns 0. When the operand is invalid, returns 1 and throws
+ * #UD, #GP, or #SS.
+ */
+int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
+ u32 vmx_instruction_info, bool wr, int len, gva_t *ret)
+{
+ gva_t off;
+ bool exn;
+ struct kvm_segment s;
+
+ /*
+ * According to Vol. 3B, "Information for VM Exits Due to Instruction
+ * Execution", on an exit, vmx_instruction_info holds most of the
+ * addressing components of the operand. Only the displacement part
+ * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
+ * For how an actual address is calculated from all these components,
+ * refer to Vol. 1, "Operand Addressing".
+ */
+ int scaling = vmx_instruction_info & 3;
+ int addr_size = (vmx_instruction_info >> 7) & 7;
+ bool is_reg = vmx_instruction_info & (1u << 10);
+ int seg_reg = (vmx_instruction_info >> 15) & 7;
+ int index_reg = (vmx_instruction_info >> 18) & 0xf;
+ bool index_is_valid = !(vmx_instruction_info & (1u << 22));
+ int base_reg = (vmx_instruction_info >> 23) & 0xf;
+ bool base_is_valid = !(vmx_instruction_info & (1u << 27));
+
+ if (is_reg) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ /* Addr = segment_base + offset */
+ /* offset = base + [index * scale] + displacement */
+ off = exit_qualification; /* holds the displacement */
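+	/* Address-size encoding: 0 = 16-bit, 1 = 32-bit, 2 = 64-bit. */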
+ if (addr_size == 1)
+ off = (gva_t)sign_extend64(off, 31);
+ else if (addr_size == 0)
+ off = (gva_t)sign_extend64(off, 15);
+ if (base_is_valid)
+ off += kvm_register_read(vcpu, base_reg);
+ if (index_is_valid)
+ off += kvm_register_read(vcpu, index_reg) << scaling;
+ vmx_get_segment(vcpu, &s, seg_reg);
+
+ /*
+ * The effective address, i.e. @off, of a memory operand is truncated
+ * based on the address size of the instruction. Note that this is
+ * the *effective address*, i.e. the address prior to accounting for
+ * the segment's base.
+ */
+ if (addr_size == 1) /* 32 bit */
+ off &= 0xffffffff;
+ else if (addr_size == 0) /* 16 bit */
+ off &= 0xffff;
+
+ /* Checks for #GP/#SS exceptions. */
+ exn = false;
+ if (is_long_mode(vcpu)) {
+ /*
+ * The virtual/linear address is never truncated in 64-bit
+ * mode, e.g. a 32-bit address size can yield a 64-bit virtual
+ * address when using FS/GS with a non-zero base.
+ */
+ if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS)
+ *ret = s.base + off;
+ else
+ *ret = off;
+
+ *ret = vmx_get_untagged_addr(vcpu, *ret, 0);
+ /* Long mode: #GP(0)/#SS(0) if the memory address is in a
+ * non-canonical form. This is the only check on the memory
+ * destination for long mode!
+ */
+ exn = is_noncanonical_address(*ret, vcpu, 0);
+ } else {
+ /*
+ * When not in long mode, the virtual/linear address is
+ * unconditionally truncated to 32 bits regardless of the
+ * address size.
+ */
+ *ret = (s.base + off) & 0xffffffff;
+
+ /* Protected mode: apply checks for segment validity in the
+ * following order:
+ * - segment type check (#GP(0) may be thrown)
+ * - usability check (#GP(0)/#SS(0))
+ * - limit check (#GP(0)/#SS(0))
+ */
+ if (wr)
+ /* #GP(0) if the destination operand is located in a
+ * read-only data segment or any code segment.
+ */
+ exn = ((s.type & 0xa) == 0 || (s.type & 8));
+ else
+ /* #GP(0) if the source operand is located in an
+ * execute-only code segment
+ */
+ exn = ((s.type & 0xa) == 8);
+ if (exn) {
+ kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+ return 1;
+ }
+ /* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
+ */
+ exn = (s.unusable != 0);
+
+ /*
+ * Protected mode: #GP(0)/#SS(0) if the memory operand is
+ * outside the segment limit. All CPUs that support VMX ignore
+ * limit checks for flat segments, i.e. segments with base==0,
+ * limit==0xffffffff and of type expand-up data or code.
+ */
+ if (!(s.base == 0 && s.limit == 0xffffffff &&
+ ((s.type & 8) || !(s.type & 4))))
+ exn = exn || ((u64)off + len - 1 > s.limit);
+ }
+ if (exn) {
+ kvm_queue_exception_e(vcpu,
+ seg_reg == VCPU_SREG_SS ?
+ SS_VECTOR : GP_VECTOR,
+ 0);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer,
+ int *ret)
+{
+ gva_t gva;
+ struct x86_exception e;
+ int r;
+
+ if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
+ vmcs_read32(VMX_INSTRUCTION_INFO), false,
+ sizeof(*vmpointer), &gva)) {
+ *ret = 1;
+ return -EINVAL;
+ }
+
+ r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e);
+ if (r != X86EMUL_CONTINUE) {
+ *ret = kvm_handle_memory_failure(vcpu, r, &e);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * Allocate a shadow VMCS and associate it with the currently loaded
+ * VMCS, unless such a shadow VMCS already exists. The newly allocated
+ * VMCS is also VMCLEARed, so that it is ready for use.
+ */
+static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;
+
+ /*
+ * KVM allocates a shadow VMCS only when L1 executes VMXON and frees it
+ * when L1 executes VMXOFF or the vCPU is forced out of nested
+ * operation. VMXON faults if the CPU is already post-VMXON, so it
+ * should be impossible to already have an allocated shadow VMCS. KVM
+ * doesn't support virtualization of VMCS shadowing, so vmcs01 should
+ * always be the loaded VMCS.
+ */
+ if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs))
+ return loaded_vmcs->shadow_vmcs;
+
+ loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
+ if (loaded_vmcs->shadow_vmcs)
+ vmcs_clear(loaded_vmcs->shadow_vmcs);
+
+ return loaded_vmcs->shadow_vmcs;
+}
+
+static int enter_vmx_operation(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int r;
+
+ r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
+ if (r < 0)
+ goto out_vmcs02;
+
+ vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
+ if (!vmx->nested.cached_vmcs12)
+ goto out_cached_vmcs12;
+
+ vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA;
+ vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
+ if (!vmx->nested.cached_shadow_vmcs12)
+ goto out_cached_shadow_vmcs12;
+
+ if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu))
+ goto out_shadow_vmcs;
+
+ hrtimer_setup(&vmx->nested.preemption_timer, vmx_preemption_timer_fn, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS_PINNED);
+
+ vmx->nested.vpid02 = allocate_vpid();
+
+ vmx->nested.vmcs02_initialized = false;
+ vmx->nested.vmxon = true;
+
+ if (vmx_pt_mode_is_host_guest()) {
+ vmx->pt_desc.guest.ctl = 0;
+ pt_update_intercept_for_msr(vcpu);
+ }
+
+ return 0;
+
+out_shadow_vmcs:
+ kfree(vmx->nested.cached_shadow_vmcs12);
+
+out_cached_shadow_vmcs12:
+ kfree(vmx->nested.cached_vmcs12);
+
+out_cached_vmcs12:
+ free_loaded_vmcs(&vmx->nested.vmcs02);
+
+out_vmcs02:
+ return -ENOMEM;
+}
+
+/* Emulate the VMXON instruction. */
+static int handle_vmxon(struct kvm_vcpu *vcpu)
+{
+ int ret;
+ gpa_t vmptr;
+ uint32_t revision;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED
+ | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
+
+ /*
+	 * Manually check CR4.VMXE; KVM must force CR4.VMXE=1 to enter
+ * the guest and so cannot rely on hardware to perform the check,
+ * which has higher priority than VM-Exit (see Intel SDM's pseudocode
+ * for VMXON).
+ *
+ * Rely on hardware for the other pre-VM-Exit checks, CR0.PE=1, !VM86
+ * and !COMPATIBILITY modes. For an unrestricted guest, KVM doesn't
+ * force any of the relevant guest state. For a restricted guest, KVM
+ * does force CR0.PE=1, but only to also force VM86 in order to emulate
+ * Real Mode, and so there's no need to check CR0.PE manually.
+ */
+ if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_VMXE)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ /*
+ * The CPL is checked for "not in VMX operation" and for "in VMX root",
+ * and has higher priority than the VM-Fail due to being post-VMXON,
+ * i.e. VMXON #GPs outside of VMX non-root if CPL!=0. In VMX non-root,
+ * VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits
+ * from L2 to L1, i.e. there's no need to check for the vCPU being in
+ * VMX non-root.
+ *
+ * Forwarding the VM-Exit unconditionally, i.e. without performing the
+ * #UD checks (see above), is functionally ok because KVM doesn't allow
+	 * L1 to run L2 without CR4.VMXE=1, and because KVM never modifies L2's
+ * CR0 or CR4, i.e. it's L2's responsibility to emulate #UDs that are
+ * missed by hardware due to shadowing CR0 and/or CR4.
+ */
+ if (vmx_get_cpl(vcpu)) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ if (vmx->nested.vmxon)
+ return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
+
+ /*
+ * Invalid CR0/CR4 generates #GP. These checks are performed if and
+ * only if the vCPU isn't already in VMX operation, i.e. effectively
+ * have lower priority than the VM-Fail above.
+ */
+ if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) ||
+ !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
+ != VMXON_NEEDED_FEATURES) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret))
+ return ret;
+
+ /*
+ * SDM 3: 24.11.5
+ * The first 4 bytes of VMXON region contain the supported
+ * VMCS revision identifier
+ *
+	 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case,
+	 * which replaces physical address width with 32.
+ */
+ if (!page_address_valid(vcpu, vmptr))
+ return nested_vmx_failInvalid(vcpu);
+
+ if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) ||
+ revision != VMCS12_REVISION)
+ return nested_vmx_failInvalid(vcpu);
+
+ vmx->nested.vmxon_ptr = vmptr;
+ ret = enter_vmx_operation(vcpu);
+ if (ret)
+ return ret;
+
+ return nested_vmx_succeed(vcpu);
+}
+
+static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (vmx->nested.current_vmptr == INVALID_GPA)
+ return;
+
+ copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
+
+ if (enable_shadow_vmcs) {
+		/*
+		 * Copy to memory all shadowed fields in case they were
+		 * modified.
+		 */
+ copy_shadow_to_vmcs12(vmx);
+ vmx_disable_shadow_vmcs(vmx);
+ }
+ vmx->nested.posted_intr_nv = -1;
+
+ /* Flush VMCS12 to guest memory */
+ kvm_vcpu_write_guest_page(vcpu,
+ vmx->nested.current_vmptr >> PAGE_SHIFT,
+ vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
+
+ kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
+
+ vmx->nested.current_vmptr = INVALID_GPA;
+}
+
+/* Emulate the VMXOFF instruction */
+static int handle_vmxoff(struct kvm_vcpu *vcpu)
+{
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ free_nested(vcpu);
+
+ if (kvm_apic_has_pending_init_or_sipi(vcpu))
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ return nested_vmx_succeed(vcpu);
+}
+
+/* Emulate the VMCLEAR instruction */
+static int handle_vmclear(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 zero = 0;
+ gpa_t vmptr;
+ int r;
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ if (nested_vmx_get_vmptr(vcpu, &vmptr, &r))
+ return r;
+
+ if (!page_address_valid(vcpu, vmptr))
+ return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
+
+ if (vmptr == vmx->nested.vmxon_ptr)
+ return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER);
+
+ if (likely(!nested_evmcs_handle_vmclear(vcpu, vmptr))) {
+ if (vmptr == vmx->nested.current_vmptr)
+ nested_release_vmcs12(vcpu);
+
+ /*
+ * Silently ignore memory errors on VMCLEAR, Intel's pseudocode
+		 * for VMCLEAR includes an "ensure that data for VMCS referenced
+ * by the operand is in memory" clause that guards writes to
+ * memory, i.e. doing nothing for I/O is architecturally valid.
+ *
+ * FIXME: Suppress failures if and only if no memslot is found,
+ * i.e. exit to userspace if __copy_to_user() fails.
+ */
+ (void)kvm_vcpu_write_guest(vcpu,
+ vmptr + offsetof(struct vmcs12,
+ launch_state),
+ &zero, sizeof(zero));
+ }
+
+ return nested_vmx_succeed(vcpu);
+}
+
+/* Emulate the VMLAUNCH instruction */
+static int handle_vmlaunch(struct kvm_vcpu *vcpu)
+{
+ return nested_vmx_run(vcpu, true);
+}
+
+/* Emulate the VMRESUME instruction */
+static int handle_vmresume(struct kvm_vcpu *vcpu)
+{
+ return nested_vmx_run(vcpu, false);
+}
+
+static int handle_vmread(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
+ : get_vmcs12(vcpu);
+ unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
+ u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct x86_exception e;
+ unsigned long field;
+ u64 value;
+ gva_t gva = 0;
+ short offset;
+ int len, r;
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ /* Decode instruction info and find the field to read */
+ field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
+
+ if (!nested_vmx_is_evmptr12_valid(vmx)) {
+ /*
+ * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
+ * any VMREAD sets the ALU flags for VMfailInvalid.
+ */
+ if (vmx->nested.current_vmptr == INVALID_GPA ||
+ (is_guest_mode(vcpu) &&
+ get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
+ return nested_vmx_failInvalid(vcpu);
+
+ offset = get_vmcs12_field_offset(field);
+ if (offset < 0)
+ return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+
+ if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
+ copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
+
+ /* Read the field, zero-extended to a u64 value */
+ value = vmcs12_read_any(vmcs12, field, offset);
+ } else {
+ /*
+ * Hyper-V TLFS (as of 6.0b) explicitly states, that while an
+ * enlightened VMCS is active VMREAD/VMWRITE instructions are
+ * unsupported. Unfortunately, certain versions of Windows 11
+ * don't comply with this requirement which is not enforced in
+ * genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a
+ * workaround, as misbehaving guests will panic on VM-Fail.
+ * Note, enlightened VMCS is incompatible with shadow VMCS so
+ * all VMREADs from L2 should go to L1.
+ */
+ if (WARN_ON_ONCE(is_guest_mode(vcpu)))
+ return nested_vmx_failInvalid(vcpu);
+
+ offset = evmcs_field_offset(field, NULL);
+ if (offset < 0)
+ return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+
+ /* Read the field, zero-extended to a u64 value */
+ value = evmcs_read_any(nested_vmx_evmcs(vmx), field, offset);
+ }
+
+ /*
+ * Now copy part of this value to register or memory, as requested.
+ * Note that the number of bits actually copied is 32 or 64 depending
+ * on the guest's mode (32 or 64 bit), not on the given field's length.
+ */
+ if (instr_info & BIT(10)) {
+ kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value);
+ } else {
+ len = is_64_bit_mode(vcpu) ? 8 : 4;
+ if (get_vmx_mem_address(vcpu, exit_qualification,
+ instr_info, true, len, &gva))
+ return 1;
+ /* _system ok, nested_vmx_check_permission has verified cpl=0 */
+ r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e);
+ if (r != X86EMUL_CONTINUE)
+ return kvm_handle_memory_failure(vcpu, r, &e);
+ }
+
+ return nested_vmx_succeed(vcpu);
+}
+
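+/*
+ * Helpers generated from vmcs_shadow_fields.h: report whether @field is in
+ * the shadowed read/write or read-only list, i.e. whether L1 can access it
+ * via the shadow VMCS without a VM-Exit.
+ */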
+static bool is_shadow_field_rw(unsigned long field)
+{
+ switch (field) {
+#define SHADOW_FIELD_RW(x, y) case x:
+#include "vmcs_shadow_fields.h"
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
+
+static bool is_shadow_field_ro(unsigned long field)
+{
+ switch (field) {
+#define SHADOW_FIELD_RO(x, y) case x:
+#include "vmcs_shadow_fields.h"
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
+
+static int handle_vmwrite(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
+ : get_vmcs12(vcpu);
+ unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
+ u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct x86_exception e;
+ unsigned long field;
+ short offset;
+ gva_t gva;
+ int len, r;
+
+ /*
+ * The value to write might be 32 or 64 bits, depending on L1's long
+ * mode, and eventually we need to write that into a field of several
+ * possible lengths. The code below first zero-extends the value to 64
+ * bit (value), and then copies only the appropriate number of
+ * bits into the vmcs12 field.
+ */
+ u64 value = 0;
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ /*
+ * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
+ * any VMWRITE sets the ALU flags for VMfailInvalid.
+ */
+ if (vmx->nested.current_vmptr == INVALID_GPA ||
+ (is_guest_mode(vcpu) &&
+ get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
+ return nested_vmx_failInvalid(vcpu);
+
+ if (instr_info & BIT(10))
+ value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf));
+ else {
+ len = is_64_bit_mode(vcpu) ? 8 : 4;
+ if (get_vmx_mem_address(vcpu, exit_qualification,
+ instr_info, false, len, &gva))
+ return 1;
+ r = kvm_read_guest_virt(vcpu, gva, &value, len, &e);
+ if (r != X86EMUL_CONTINUE)
+ return kvm_handle_memory_failure(vcpu, r, &e);
+ }
+
+ field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
+
+ offset = get_vmcs12_field_offset(field);
+ if (offset < 0)
+ return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+
+ /*
+ * If the vCPU supports "VMWRITE to any supported field in the
+ * VMCS," then the "read-only" fields are actually read/write.
+ */
+ if (vmcs_field_readonly(field) &&
+ !nested_cpu_has_vmwrite_any_field(vcpu))
+ return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
+
+ /*
+ * Ensure vmcs12 is up-to-date before any VMWRITE that dirties
+ * vmcs12, else we may crush a field or consume a stale value.
+ */
+ if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field))
+ copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
+
+ /*
+ * Some Intel CPUs intentionally drop the reserved bits of the AR byte
+ * fields on VMWRITE. Emulate this behavior to ensure consistent KVM
+ * behavior regardless of the underlying hardware, e.g. if an AR_BYTE
+ * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD
+ * from L1 will return a different value than VMREAD from L2 (L1 sees
+ * the stripped down value, L2 sees the full value as stored by KVM).
+ */
+ if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES)
+ value &= 0x1f0ff;
+
+ vmcs12_write_any(vmcs12, field, offset, value);
+
+ /*
+ * Do not track vmcs12 dirty-state if in guest-mode as we actually
+ * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated
+ * by L1 without a vmexit are always updated in the vmcs02, i.e. don't
+ * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path.
+ */
+ if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) {
+ /*
+ * L1 can read these fields without exiting, ensure the
+ * shadow VMCS is up-to-date.
+ */
+ if (enable_shadow_vmcs && is_shadow_field_ro(field)) {
+ preempt_disable();
+ vmcs_load(vmx->vmcs01.shadow_vmcs);
+
+ __vmcs_writel(field, value);
+
+ vmcs_clear(vmx->vmcs01.shadow_vmcs);
+ vmcs_load(vmx->loaded_vmcs->vmcs);
+ preempt_enable();
+ }
+ vmx->nested.dirty_vmcs12 = true;
+ }
+
+ return nested_vmx_succeed(vcpu);
+}
+
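+/*
+ * Make @vmptr the current vmcs12.  With shadow VMCS enabled, also point the
+ * hardware VMCS link at vmcs01's shadow VMCS so that non-intercepted
+ * VMREAD/VMWRITE from L1 operate on it.
+ */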
+static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
+{
+ vmx->nested.current_vmptr = vmptr;
+ if (enable_shadow_vmcs) {
+ secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
+ vmcs_write64(VMCS_LINK_POINTER,
+ __pa(vmx->vmcs01.shadow_vmcs));
+ vmx->nested.need_vmcs12_to_shadow_sync = true;
+ }
+ vmx->nested.dirty_vmcs12 = true;
+ vmx->nested.force_msr_bitmap_recalc = true;
+}
+
+/* Emulate the VMPTRLD instruction */
+static int handle_vmptrld(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ gpa_t vmptr;
+ int r;
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ if (nested_vmx_get_vmptr(vcpu, &vmptr, &r))
+ return r;
+
+ if (!page_address_valid(vcpu, vmptr))
+ return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
+
+ if (vmptr == vmx->nested.vmxon_ptr)
+ return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER);
+
+ /* Forbid normal VMPTRLD if Enlightened version was used */
+ if (nested_vmx_is_evmptr12_valid(vmx))
+ return 1;
+
+ if (vmx->nested.current_vmptr != vmptr) {
+ struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache;
+ struct vmcs_hdr hdr;
+
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) {
+ /*
+ * Reads from an unbacked page return all 1s,
+ * which means that the 32 bits located at the
+ * given physical address won't match the required
+ * VMCS12_REVISION identifier.
+ */
+ return nested_vmx_fail(vcpu,
+ VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
+ }
+
+ if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr,
+ offsetof(struct vmcs12, hdr),
+ sizeof(hdr))) {
+ return nested_vmx_fail(vcpu,
+ VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
+ }
+
+ if (hdr.revision_id != VMCS12_REVISION ||
+ (hdr.shadow_vmcs &&
+ !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
+ return nested_vmx_fail(vcpu,
+ VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
+ }
+
+ nested_release_vmcs12(vcpu);
+
+ /*
+ * Load VMCS12 from guest memory since it is not already
+ * cached.
+ */
+ if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12,
+ VMCS12_SIZE)) {
+ return nested_vmx_fail(vcpu,
+ VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
+ }
+
+ set_current_vmptr(vmx, vmptr);
+ }
+
+ return nested_vmx_succeed(vcpu);
+}
+
+/* Emulate the VMPTRST instruction */
+static int handle_vmptrst(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qual = vmx_get_exit_qual(vcpu);
+ u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr;
+ struct x86_exception e;
+ gva_t gva;
+ int r;
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ if (unlikely(nested_vmx_is_evmptr12_valid(to_vmx(vcpu))))
+ return 1;
+
+ if (get_vmx_mem_address(vcpu, exit_qual, instr_info,
+ true, sizeof(gpa_t), &gva))
+ return 1;
+ /* *_system ok, nested_vmx_check_permission has verified cpl=0 */
+ r = kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
+ sizeof(gpa_t), &e);
+ if (r != X86EMUL_CONTINUE)
+ return kvm_handle_memory_failure(vcpu, r, &e);
+
+ return nested_vmx_succeed(vcpu);
+}
+
+/* Emulate the INVEPT instruction */
+static int handle_invept(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 vmx_instruction_info, types;
+ unsigned long type, roots_to_free;
+ struct kvm_mmu *mmu;
+ gva_t gva;
+ struct x86_exception e;
+ struct {
+ u64 eptp, gpa;
+ } operand;
+ int i, r, gpr_index;
+
+ if (!(vmx->nested.msrs.secondary_ctls_high &
+ SECONDARY_EXEC_ENABLE_EPT) ||
+ !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
+ type = kvm_register_read(vcpu, gpr_index);
+
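+	/*
+	 * Bitmask of INVEPT types advertised to L1: bit 1 = single-context,
+	 * bit 2 = all-context (global).
+	 */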
+ types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
+
+ if (type >= 32 || !(types & (1 << type)))
+ return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+
+	/*
+	 * According to the Intel VMX instruction reference, the memory
+	 * operand is read even if it isn't needed (e.g., for type==global).
+	 */
+ if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
+ vmx_instruction_info, false, sizeof(operand), &gva))
+ return 1;
+ r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
+ if (r != X86EMUL_CONTINUE)
+ return kvm_handle_memory_failure(vcpu, r, &e);
+
+ /*
+ * Nested EPT roots are always held through guest_mmu,
+ * not root_mmu.
+ */
+ mmu = &vcpu->arch.guest_mmu;
+
+ switch (type) {
+ case VMX_EPT_EXTENT_CONTEXT:
+ if (!nested_vmx_check_eptp(vcpu, operand.eptp))
+ return nested_vmx_fail(vcpu,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+
+ roots_to_free = 0;
+ if (nested_ept_root_matches(mmu->root.hpa, mmu->root.pgd,
+ operand.eptp))
+ roots_to_free |= KVM_MMU_ROOT_CURRENT;
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+ if (nested_ept_root_matches(mmu->prev_roots[i].hpa,
+ mmu->prev_roots[i].pgd,
+ operand.eptp))
+ roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+ }
+ break;
+ case VMX_EPT_EXTENT_GLOBAL:
+ roots_to_free = KVM_MMU_ROOTS_ALL;
+ break;
+ default:
+ BUG();
+ break;
+ }
+
+ if (roots_to_free)
+ kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free);
+
+ return nested_vmx_succeed(vcpu);
+}
+
+static int handle_invvpid(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 vmx_instruction_info;
+ unsigned long type, types;
+ gva_t gva;
+ struct x86_exception e;
+ struct {
+ u64 vpid;
+ u64 gla;
+ } operand;
+ u16 vpid02;
+ int r, gpr_index;
+
+ if (!(vmx->nested.msrs.secondary_ctls_high &
+ SECONDARY_EXEC_ENABLE_VPID) ||
+ !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
+ type = kvm_register_read(vcpu, gpr_index);
+
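+	/*
+	 * Bitmask of INVVPID types advertised to L1: bit 0 = individual-address,
+	 * bit 1 = single-context, bit 2 = all-context, bit 3 =
+	 * single-context-retaining-globals.
+	 */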
+ types = (vmx->nested.msrs.vpid_caps &
+ VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
+
+ if (type >= 32 || !(types & (1 << type)))
+ return nested_vmx_fail(vcpu,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+
+	/*
+	 * According to the Intel VMX instruction reference, the memory
+	 * operand is read even if it isn't needed (e.g., for type==global).
+	 */
+ if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
+ vmx_instruction_info, false, sizeof(operand), &gva))
+ return 1;
+ r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
+ if (r != X86EMUL_CONTINUE)
+ return kvm_handle_memory_failure(vcpu, r, &e);
+
+ if (operand.vpid >> 16)
+ return nested_vmx_fail(vcpu,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+
+ /*
+ * Always flush the effective vpid02, i.e. never flush the current VPID
+ * and never explicitly flush vpid01. INVVPID targets a VPID, not a
+ * VMCS, and so whether or not the current vmcs12 has VPID enabled is
+ * irrelevant (and there may not be a loaded vmcs12).
+ */
+ vpid02 = nested_get_vpid02(vcpu);
+ switch (type) {
+ case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
+ /*
+ * LAM doesn't apply to addresses that are inputs to TLB
+ * invalidation.
+ */
+ if (!operand.vpid ||
+ is_noncanonical_invlpg_address(operand.gla, vcpu))
+ return nested_vmx_fail(vcpu,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ vpid_sync_vcpu_addr(vpid02, operand.gla);
+ break;
+ case VMX_VPID_EXTENT_SINGLE_CONTEXT:
+ case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
+ if (!operand.vpid)
+ return nested_vmx_fail(vcpu,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ vpid_sync_context(vpid02);
+ break;
+ case VMX_VPID_EXTENT_ALL_CONTEXT:
+ vpid_sync_context(vpid02);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ /*
+ * Sync the shadow page tables if EPT is disabled, L1 is invalidating
+ * linear mappings for L2 (tagged with L2's VPID). Free all guest
+ * roots as VPIDs are not tracked in the MMU role.
+ *
+ * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share
+ * an MMU when EPT is disabled.
+ *
+	 * TODO: sync only the affected SPTEs for INDIVIDUAL_ADDR.
+ */
+ if (!enable_ept)
+ kvm_mmu_free_guest_mode_roots(vcpu->kvm, &vcpu->arch.root_mmu);
+
+ return nested_vmx_succeed(vcpu);
+}
+
+static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ u32 index = kvm_rcx_read(vcpu);
+ u64 new_eptp;
+
+ if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12)))
+ return 1;
+ if (index >= VMFUNC_EPTP_ENTRIES)
+ return 1;
+
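+	/* Each entry in vmcs12's EPTP list is an 8-byte EPT pointer. */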
+ if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
+ &new_eptp, index * 8, 8))
+ return 1;
+
+ /*
+ * If the (L2) guest does a vmfunc to the currently
+ * active ept pointer, we don't have to do anything else
+ */
+ if (vmcs12->ept_pointer != new_eptp) {
+ if (!nested_vmx_check_eptp(vcpu, new_eptp))
+ return 1;
+
+ vmcs12->ept_pointer = new_eptp;
+ nested_ept_new_eptp(vcpu);
+
+ if (!nested_cpu_has_vpid(vmcs12))
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+ }
+
+ return 0;
+}
+
+static int handle_vmfunc(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmcs12 *vmcs12;
+ u32 function = kvm_rax_read(vcpu);
+
+ /*
+ * VMFUNC should never execute cleanly while L1 is active; KVM supports
+ * VMFUNC for nested VMs, but not for L1.
+ */
+ if (WARN_ON_ONCE(!is_guest_mode(vcpu))) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ vmcs12 = get_vmcs12(vcpu);
+
+ /*
+ * #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC
+ * is enabled in vmcs02 if and only if it's enabled in vmcs12.
+ */
+ if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ if (!(vmcs12->vm_function_control & BIT_ULL(function)))
+ goto fail;
+
+ switch (function) {
+ case 0:
+ if (nested_vmx_eptp_switching(vcpu, vmcs12))
+ goto fail;
+ break;
+ default:
+ goto fail;
+ }
+ return kvm_skip_emulated_instruction(vcpu);
+
+fail:
+ /*
+ * This is effectively a reflected VM-Exit, as opposed to a synthesized
+ * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode
+ * EXIT_REASON_VMFUNC as the exit reason.
+ */
+ nested_vmx_vmexit(vcpu, vmx->vt.exit_reason.full,
+ vmx_get_intr_info(vcpu),
+ vmx_get_exit_qual(vcpu));
+ return 1;
+}
+
+/*
+ * Return true if an IO instruction with the specified port and size should cause
+ * a VM-exit into L1.
+ */
+bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
+ int size)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ gpa_t bitmap, last_bitmap;
+ u8 b;
+
+ last_bitmap = INVALID_GPA;
+ b = -1;
+
+ while (size > 0) {
+ if (port < 0x8000)
+ bitmap = vmcs12->io_bitmap_a;
+ else if (port < 0x10000)
+ bitmap = vmcs12->io_bitmap_b;
+ else
+ return true;
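+		/* Each I/O bitmap covers 0x8000 ports, one bit per port. */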
+ bitmap += (port & 0x7fff) / 8;
+
+ if (last_bitmap != bitmap)
+ if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
+ return true;
+ if (b & (1 << (port & 7)))
+ return true;
+
+ port++;
+ size--;
+ last_bitmap = bitmap;
+ }
+
+ return false;
+}
+
+static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ unsigned long exit_qualification;
+ unsigned short port;
+ int size;
+
+ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
+ return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
+
+ exit_qualification = vmx_get_exit_qual(vcpu);
+
+ port = exit_qualification >> 16;
+ size = (exit_qualification & 7) + 1;
+
+ return nested_vmx_check_io_bitmaps(vcpu, port, size);
+}
+
+/*
+ * Return true if we should exit from L2 to L1 to handle an MSR access,
+ * rather than handle it ourselves in L0. I.e., check whether L1 expressed
+ * disinterest in the current event (read or write a specific MSR) by using an
+ * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
+ */
+static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12,
+ union vmx_exit_reason exit_reason)
+{
+ u32 msr_index;
+ gpa_t bitmap;
+
+ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
+ return true;
+
+ if (exit_reason.basic == EXIT_REASON_MSR_READ_IMM ||
+ exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM)
+ msr_index = vmx_get_exit_qual(vcpu);
+ else
+ msr_index = kvm_rcx_read(vcpu);
+
+ /*
+ * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
+ * for the four combinations of read/write and low/high MSR numbers.
+ * First we need to figure out which of the four to use:
+ */
+ bitmap = vmcs12->msr_bitmap;
+ if (exit_reason.basic == EXIT_REASON_MSR_WRITE ||
+ exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM)
+ bitmap += 2048;
+ if (msr_index >= 0xc0000000) {
+ msr_index -= 0xc0000000;
+ bitmap += 1024;
+ }
+
+ /* Then read the msr_index'th bit from this bitmap: */
+ if (msr_index < 1024*8) {
+ unsigned char b;
+ if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
+ return true;
+ return 1 & (b >> (msr_index & 7));
+ } else
+ return true; /* let L1 handle the wrong parameter */
+}
+
+/*
+ * Return true if we should exit from L2 to L1 to handle a CR access exit,
+ * rather than handle it ourselves in L0. I.e., check if L1 wanted to
+ * intercept (via guest_host_mask etc.) the current event.
+ */
+static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
+ int cr = exit_qualification & 15;
+ int reg;
+ unsigned long val;
+
+ switch ((exit_qualification >> 4) & 3) {
+ case 0: /* mov to cr */
+ reg = (exit_qualification >> 8) & 15;
+ val = kvm_register_read(vcpu, reg);
+ switch (cr) {
+ case 0:
+ if (vmcs12->cr0_guest_host_mask &
+ (val ^ vmcs12->cr0_read_shadow))
+ return true;
+ break;
+ case 3:
+ if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
+ return true;
+ break;
+ case 4:
+ if (vmcs12->cr4_guest_host_mask &
+ (vmcs12->cr4_read_shadow ^ val))
+ return true;
+ break;
+ case 8:
+ if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
+ return true;
+ break;
+ }
+ break;
+ case 2: /* clts */
+ if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
+ (vmcs12->cr0_read_shadow & X86_CR0_TS))
+ return true;
+ break;
+ case 1: /* mov from cr */
+ switch (cr) {
+ case 3:
+ if (vmcs12->cpu_based_vm_exec_control &
+ CPU_BASED_CR3_STORE_EXITING)
+ return true;
+ break;
+ case 8:
+ if (vmcs12->cpu_based_vm_exec_control &
+ CPU_BASED_CR8_STORE_EXITING)
+ return true;
+ break;
+ }
+ break;
+ case 3: /* lmsw */
+ /*
+ * lmsw can change bits 1..3 of cr0, and only set bit 0 of
+ * cr0. Other attempted changes are ignored, with no exit.
+ */
+ val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
+ if (vmcs12->cr0_guest_host_mask & 0xe &
+ (val ^ vmcs12->cr0_read_shadow))
+ return true;
+ if ((vmcs12->cr0_guest_host_mask & 0x1) &&
+ !(vmcs12->cr0_read_shadow & 0x1) &&
+ (val & 0x1))
+ return true;
+ break;
+ }
+ return false;
+}
+
+static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ u32 encls_leaf;
+
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX) ||
+ !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING))
+ return false;
+
+ encls_leaf = kvm_rax_read(vcpu);
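+	/* ENCLS leaves 63 and above all map to bit 63 of the bitmap. */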
+ if (encls_leaf > 62)
+ encls_leaf = 63;
+ return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf);
+}
+
+static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12, gpa_t bitmap)
+{
+ u32 vmx_instruction_info;
+ unsigned long field;
+ u8 b;
+
+ if (!nested_cpu_has_shadow_vmcs(vmcs12))
+ return true;
+
+ /* Decode instruction info and find the field to access */
+ vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
+
+ /* Out-of-range fields always cause a VM exit from L2 to L1 */
+ if (field >> 15)
+ return true;
+
+ if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1))
+ return true;
+
+ return 1 & (b >> (field & 7));
+}
+
+static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12)
+{
+ u32 entry_intr_info = vmcs12->vm_entry_intr_info_field;
+
+ if (nested_cpu_has_mtf(vmcs12))
+ return true;
+
+ /*
+ * An MTF VM-exit may be injected into the guest by setting the
+ * interruption-type to 7 (other event) and the vector field to 0. Such
+ * is the case regardless of the 'monitor trap flag' VM-execution
+ * control.
+ */
+ return entry_intr_info == (INTR_INFO_VALID_MASK
+ | INTR_TYPE_OTHER_EVENT);
+}
+
+/*
+ * Return true if L0 wants to handle an exit from L2 regardless of whether or not
+ * L1 wants the exit. Only call this when in is_guest_mode (L2).
+ */
+static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
+ union vmx_exit_reason exit_reason)
+{
+ u32 intr_info;
+
+ switch ((u16)exit_reason.basic) {
+ case EXIT_REASON_EXCEPTION_NMI:
+ intr_info = vmx_get_intr_info(vcpu);
+ if (is_nmi(intr_info))
+ return true;
+ else if (is_page_fault(intr_info))
+ return vcpu->arch.apf.host_apf_flags ||
+ vmx_need_pf_intercept(vcpu);
+ else if (is_debug(intr_info) &&
+ vcpu->guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
+ return true;
+ else if (is_breakpoint(intr_info) &&
+ vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
+ return true;
+ else if (is_alignment_check(intr_info) &&
+ !vmx_guest_inject_ac(vcpu))
+ return true;
+ else if (is_ve_fault(intr_info))
+ return true;
+ return false;
+ case EXIT_REASON_EXTERNAL_INTERRUPT:
+ return true;
+ case EXIT_REASON_MCE_DURING_VMENTRY:
+ return true;
+ case EXIT_REASON_EPT_VIOLATION:
+ /*
+ * L0 always deals with the EPT violation. If nested EPT is
+ * used, and the nested mmu code discovers that the address is
+ * missing in the guest EPT table (EPT12), the EPT violation
+ * will be injected with nested_ept_inject_page_fault()
+ */
+ return true;
+ case EXIT_REASON_EPT_MISCONFIG:
+ /*
+		 * L2 never directly uses L1's EPT, but rather L0's own EPT
+		 * table (shadow on EPT) or a merged EPT table that L0 built
+		 * (EPT on EPT). So any problems with the structure of the
+		 * table are L0's fault.
+ */
+ return true;
+ case EXIT_REASON_PREEMPTION_TIMER:
+ return true;
+ case EXIT_REASON_PML_FULL:
+ /*
+ * PML is emulated for an L1 VMM and should never be enabled in
+ * vmcs02, always "handle" PML_FULL by exiting to userspace.
+ */
+ return true;
+ case EXIT_REASON_VMFUNC:
+ /* VM functions are emulated through L2->L0 vmexits. */
+ return true;
+ case EXIT_REASON_BUS_LOCK:
+ /*
+ * At present, bus lock VM exit is never exposed to L1.
+ * Handle L2's bus locks in L0 directly.
+ */
+ return true;
+#ifdef CONFIG_KVM_HYPERV
+ case EXIT_REASON_VMCALL:
+ /* Hyper-V L2 TLB flush hypercall is handled by L0 */
+ return guest_hv_cpuid_has_l2_tlb_flush(vcpu) &&
+ nested_evmcs_l2_tlb_flush_enabled(vcpu) &&
+ kvm_hv_is_tlb_flush_hcall(vcpu);
+#endif
+ default:
+ break;
+ }
+ return false;
+}
+
+/*
+ * Return true if L1 wants to intercept an exit from L2. Only call this when in
+ * is_guest_mode (L2).
+ */
+static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
+ union vmx_exit_reason exit_reason)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ u32 intr_info;
+
+ switch ((u16)exit_reason.basic) {
+ case EXIT_REASON_EXCEPTION_NMI:
+ intr_info = vmx_get_intr_info(vcpu);
+ if (is_nmi(intr_info))
+ return true;
+ else if (is_page_fault(intr_info))
+ return true;
+ return vmcs12->exception_bitmap &
+ (1u << (intr_info & INTR_INFO_VECTOR_MASK));
+ case EXIT_REASON_EXTERNAL_INTERRUPT:
+ return nested_exit_on_intr(vcpu);
+ case EXIT_REASON_TRIPLE_FAULT:
+ return true;
+ case EXIT_REASON_INTERRUPT_WINDOW:
+ return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING);
+ case EXIT_REASON_NMI_WINDOW:
+ return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING);
+ case EXIT_REASON_TASK_SWITCH:
+ return true;
+ case EXIT_REASON_CPUID:
+ return true;
+ case EXIT_REASON_HLT:
+ return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
+ case EXIT_REASON_INVD:
+ return true;
+ case EXIT_REASON_INVLPG:
+ return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
+ case EXIT_REASON_RDPMC:
+ return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
+ case EXIT_REASON_RDRAND:
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
+ case EXIT_REASON_RDSEED:
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
+ case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
+ return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
+ case EXIT_REASON_VMREAD:
+ return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
+ vmcs12->vmread_bitmap);
+ case EXIT_REASON_VMWRITE:
+ return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
+ vmcs12->vmwrite_bitmap);
+ case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
+ case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
+ case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME:
+ case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
+ case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
+ /*
+ * VMX instructions trap unconditionally. This allows L1 to
+ * emulate them for its L2 guest, i.e., allows 3-level nesting!
+ */
+ return true;
+ case EXIT_REASON_CR_ACCESS:
+ return nested_vmx_exit_handled_cr(vcpu, vmcs12);
+ case EXIT_REASON_DR_ACCESS:
+ return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
+ case EXIT_REASON_IO_INSTRUCTION:
+ return nested_vmx_exit_handled_io(vcpu, vmcs12);
+ case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
+ case EXIT_REASON_MSR_READ:
+ case EXIT_REASON_MSR_WRITE:
+ case EXIT_REASON_MSR_READ_IMM:
+ case EXIT_REASON_MSR_WRITE_IMM:
+ return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
+ case EXIT_REASON_INVALID_STATE:
+ return true;
+ case EXIT_REASON_MWAIT_INSTRUCTION:
+ return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
+ case EXIT_REASON_MONITOR_TRAP_FLAG:
+ return nested_vmx_exit_handled_mtf(vmcs12);
+ case EXIT_REASON_MONITOR_INSTRUCTION:
+ return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
+ case EXIT_REASON_PAUSE_INSTRUCTION:
+ return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
+ nested_cpu_has2(vmcs12,
+ SECONDARY_EXEC_PAUSE_LOOP_EXITING);
+ case EXIT_REASON_MCE_DURING_VMENTRY:
+ return true;
+ case EXIT_REASON_TPR_BELOW_THRESHOLD:
+ return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
+ case EXIT_REASON_APIC_ACCESS:
+ case EXIT_REASON_APIC_WRITE:
+ case EXIT_REASON_EOI_INDUCED:
+ /*
+ * The controls for "virtualize APIC accesses," "APIC-
+ * register virtualization," and "virtual-interrupt
+ * delivery" only come from vmcs12.
+ */
+ return true;
+ case EXIT_REASON_INVPCID:
+ return
+ nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
+ nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
+ case EXIT_REASON_WBINVD:
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
+ case EXIT_REASON_XSETBV:
+ return true;
+ case EXIT_REASON_XSAVES:
+ case EXIT_REASON_XRSTORS:
+ /*
+ * Always forward XSAVES/XRSTORS to L1 as KVM doesn't utilize
+ * XSS-bitmap, and always loads vmcs02 with vmcs12's XSS-bitmap
+ * verbatim, i.e. any exit is due to L1's bitmap. WARN if
+ * XSAVES isn't enabled, as the CPU is supposed to inject #UD
+ * in that case, before consulting the XSS-bitmap.
+ */
+ WARN_ON_ONCE(!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES));
+ return true;
+ case EXIT_REASON_UMWAIT:
+ case EXIT_REASON_TPAUSE:
+ return nested_cpu_has2(vmcs12,
+ SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
+ case EXIT_REASON_ENCLS:
+ return nested_vmx_exit_handled_encls(vcpu, vmcs12);
+ case EXIT_REASON_NOTIFY:
+ /* Notify VM exit is not exposed to L1 */
+ return false;
+ case EXIT_REASON_SEAMCALL:
+ case EXIT_REASON_TDCALL:
+ /*
+ * SEAMCALL and TDCALL unconditionally VM-Exit, but aren't
+ * virtualized by KVM for L1 hypervisors, i.e. L1 should
+ * never want or expect such an exit.
+ */
+ return false;
+ default:
+ return true;
+ }
+}
+
+/*
+ * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was
+ * reflected into L1.
+ */
+bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ union vmx_exit_reason exit_reason = vmx->vt.exit_reason;
+ unsigned long exit_qual;
+ u32 exit_intr_info;
+
+ WARN_ON_ONCE(vmx->nested.nested_run_pending);
+
+ /*
+ * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM
+ * has already loaded L2's state.
+ */
+ if (unlikely(vmx->fail)) {
+ trace_kvm_nested_vmenter_failed(
+ "hardware VM-instruction error: ",
+ vmcs_read32(VM_INSTRUCTION_ERROR));
+ exit_intr_info = 0;
+ exit_qual = 0;
+ goto reflect_vmexit;
+ }
+
+ trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX);
+
+ /* If L0 (KVM) wants the exit, it trumps L1's desires. */
+ if (nested_vmx_l0_wants_exit(vcpu, exit_reason))
+ return false;
+
+ /* If L1 doesn't want the exit, handle it in L0. */
+ if (!nested_vmx_l1_wants_exit(vcpu, exit_reason))
+ return false;
+
+ /*
+ * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For
+ * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would
+ * need to be synthesized by querying the in-kernel LAPIC, but external
+ * interrupts are never reflected to L1 so it's a non-issue.
+ */
+ exit_intr_info = vmx_get_intr_info(vcpu);
+ if (is_exception_with_error_code(exit_intr_info)) {
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ vmcs12->vm_exit_intr_error_code =
+ vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+ }
+ exit_qual = vmx_get_exit_qual(vcpu);
+
+reflect_vmexit:
+ nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual);
+ return true;
+}
+
+static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
+ struct kvm_nested_state __user *user_kvm_nested_state,
+ u32 user_data_size)
+{
+ struct vcpu_vmx *vmx;
+ struct vmcs12 *vmcs12;
+ struct kvm_nested_state kvm_state = {
+ .flags = 0,
+ .format = KVM_STATE_NESTED_FORMAT_VMX,
+ .size = sizeof(kvm_state),
+ .hdr.vmx.flags = 0,
+ .hdr.vmx.vmxon_pa = INVALID_GPA,
+ .hdr.vmx.vmcs12_pa = INVALID_GPA,
+ .hdr.vmx.preemption_timer_deadline = 0,
+ };
+ struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
+ &user_kvm_nested_state->data.vmx[0];
+
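+	/* A NULL vCPU is used only to query the required size of the nested state. */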
+ if (!vcpu)
+ return kvm_state.size + sizeof(*user_vmx_nested_state);
+
+ vmx = to_vmx(vcpu);
+ vmcs12 = get_vmcs12(vcpu);
+
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) &&
+ (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
+ kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
+ kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr;
+
+ if (vmx_has_valid_vmcs12(vcpu)) {
+ kvm_state.size += sizeof(user_vmx_nested_state->vmcs12);
+
+ /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */
+ if (nested_vmx_is_evmptr12_set(vmx))
+ kvm_state.flags |= KVM_STATE_NESTED_EVMCS;
+
+ if (is_guest_mode(vcpu) &&
+ nested_cpu_has_shadow_vmcs(vmcs12) &&
+ vmcs12->vmcs_link_pointer != INVALID_GPA)
+ kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12);
+ }
+
+ if (vmx->nested.smm.vmxon)
+ kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;
+
+ if (vmx->nested.smm.guest_mode)
+ kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;
+
+ if (is_guest_mode(vcpu)) {
+ kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
+
+ if (vmx->nested.nested_run_pending)
+ kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
+
+ if (vmx->nested.mtf_pending)
+ kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING;
+
+ if (nested_cpu_has_preemption_timer(vmcs12) &&
+ vmx->nested.has_preemption_timer_deadline) {
+ kvm_state.hdr.vmx.flags |=
+ KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE;
+ kvm_state.hdr.vmx.preemption_timer_deadline =
+ vmx->nested.preemption_timer_deadline;
+ }
+ }
+ }
+
+ if (user_data_size < kvm_state.size)
+ goto out;
+
+ if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
+ return -EFAULT;
+
+ if (!vmx_has_valid_vmcs12(vcpu))
+ goto out;
+
+ /*
+ * When running L2, the authoritative vmcs12 state is in the
+ * vmcs02. When running L1, the authoritative vmcs12 state is
+ * in the shadow or enlightened vmcs linked to vmcs01, unless
+ * need_vmcs12_to_shadow_sync is set, in which case, the authoritative
+ * vmcs12 state is in the vmcs12 already.
+ */
+ if (is_guest_mode(vcpu)) {
+ sync_vmcs02_to_vmcs12(vcpu, vmcs12);
+ sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
+ } else {
+ copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
+ if (!vmx->nested.need_vmcs12_to_shadow_sync) {
+ if (nested_vmx_is_evmptr12_valid(vmx))
+ /*
+ * The L1 hypervisor is not obliged to keep the eVMCS
+ * clean-fields data up-to-date while not in guest
+ * mode; 'hv_clean_fields' is only guaranteed to be
+ * accurate at VM-entry, so ignore it here and do a
+ * full copy.
+ */
+ copy_enlightened_to_vmcs12(vmx, 0);
+ else if (enable_shadow_vmcs)
+ copy_shadow_to_vmcs12(vmx);
+ }
+ }
+
+ BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE);
+ BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE);
+
+ /*
+ * Copy over the full allocated size of vmcs12 rather than just the size
+ * of the struct.
+ */
+ if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE))
+ return -EFAULT;
+
+ if (nested_cpu_has_shadow_vmcs(vmcs12) &&
+ vmcs12->vmcs_link_pointer != INVALID_GPA) {
+ if (copy_to_user(user_vmx_nested_state->shadow_vmcs12,
+ get_shadow_vmcs12(vcpu), VMCS12_SIZE))
+ return -EFAULT;
+ }
+out:
+ return kvm_state.size;
+}
+
+void vmx_leave_nested(struct kvm_vcpu *vcpu)
+{
+ if (is_guest_mode(vcpu)) {
+ to_vmx(vcpu)->nested.nested_run_pending = 0;
+ nested_vmx_vmexit(vcpu, -1, 0, 0);
+ }
+ free_nested(vcpu);
+}
+
+static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
+ struct kvm_nested_state __user *user_kvm_nested_state,
+ struct kvm_nested_state *kvm_state)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmcs12 *vmcs12;
+ enum vm_entry_failure_code ignored;
+ struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
+ &user_kvm_nested_state->data.vmx[0];
+ int ret;
+
+ if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX)
+ return -EINVAL;
+
+ if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) {
+ if (kvm_state->hdr.vmx.smm.flags)
+ return -EINVAL;
+
+ if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)
+ return -EINVAL;
+
+ /*
+ * KVM_STATE_NESTED_EVMCS used to signal that KVM should
+ * enable the eVMCS capability on the vCPU. The code has
+ * since been changed such that the flag signals that
+ * vmcs12 should be copied into the eVMCS in guest memory.
+ *
+ * To preserve backwards compatibility, allow userspace to
+ * set this flag even when there is no VMXON region.
+ */
+ if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS)
+ return -EINVAL;
+ } else {
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
+ return -EINVAL;
+
+ if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa))
+ return -EINVAL;
+ }
+
+ if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
+ (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
+ return -EINVAL;
+
+ if (kvm_state->hdr.vmx.smm.flags &
+ ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
+ return -EINVAL;
+
+ if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE)
+ return -EINVAL;
+
+ /*
+ * SMM temporarily disables VMX, so we cannot be in guest mode,
+ * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags
+ * must be zero.
+ */
+ if (is_smm(vcpu) ?
+ (kvm_state->flags &
+ (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING))
+ : kvm_state->hdr.vmx.smm.flags)
+ return -EINVAL;
+
+ if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
+ !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
+ return -EINVAL;
+
+ if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) &&
+ (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) ||
+ !vmx->nested.enlightened_vmcs_enabled))
+ return -EINVAL;
+
+ vmx_leave_nested(vcpu);
+
+ if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA)
+ return 0;
+
+ vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa;
+ ret = enter_vmx_operation(vcpu);
+ if (ret)
+ return ret;
+
+ /* Empty 'VMXON' state is permitted if no VMCS is loaded */
+ if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) {
+ /* See vmx_has_valid_vmcs12. */
+ if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) ||
+ (kvm_state->flags & KVM_STATE_NESTED_EVMCS) ||
+ (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA))
+ return -EINVAL;
+ else
+ return 0;
+ }
+
+ if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) {
+ if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa ||
+ !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa))
+ return -EINVAL;
+
+ set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa);
+#ifdef CONFIG_KVM_HYPERV
+ } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
+ /*
+ * nested_vmx_handle_enlightened_vmptrld() cannot be called
+ * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be
+ * restored yet. EVMCS will be mapped from
+ * nested_get_vmcs12_pages().
+ */
+ vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING;
+ kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
+#endif
+ } else {
+ return -EINVAL;
+ }
+
+ if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
+ vmx->nested.smm.vmxon = true;
+ vmx->nested.vmxon = false;
+
+ if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
+ vmx->nested.smm.guest_mode = true;
+ }
+
+ vmcs12 = get_vmcs12(vcpu);
+ if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12)))
+ return -EFAULT;
+
+ if (vmcs12->hdr.revision_id != VMCS12_REVISION)
+ return -EINVAL;
+
+ if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
+ return 0;
+
+ vmx->nested.nested_run_pending =
+ !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
+
+ vmx->nested.mtf_pending =
+ !!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING);
+
+ ret = -EINVAL;
+ if (nested_cpu_has_shadow_vmcs(vmcs12) &&
+ vmcs12->vmcs_link_pointer != INVALID_GPA) {
+ struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);
+
+ if (kvm_state->size <
+ sizeof(*kvm_state) +
+ sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12))
+ goto error_guest_mode;
+
+ if (copy_from_user(shadow_vmcs12,
+ user_vmx_nested_state->shadow_vmcs12,
+ sizeof(*shadow_vmcs12))) {
+ ret = -EFAULT;
+ goto error_guest_mode;
+ }
+
+ if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
+ !shadow_vmcs12->hdr.shadow_vmcs)
+ goto error_guest_mode;
+ }
+
+ vmx->nested.has_preemption_timer_deadline = false;
+ if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) {
+ vmx->nested.has_preemption_timer_deadline = true;
+ vmx->nested.preemption_timer_deadline =
+ kvm_state->hdr.vmx.preemption_timer_deadline;
+ }
+
+ if (nested_vmx_check_controls(vcpu, vmcs12) ||
+ nested_vmx_check_host_state(vcpu, vmcs12) ||
+ nested_vmx_check_guest_state(vcpu, vmcs12, &ignored))
+ goto error_guest_mode;
+
+ vmx->nested.dirty_vmcs12 = true;
+ vmx->nested.force_msr_bitmap_recalc = true;
+ ret = nested_vmx_enter_non_root_mode(vcpu, false);
+ if (ret)
+ goto error_guest_mode;
+
+ if (vmx->nested.mtf_pending)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ return 0;
+
+error_guest_mode:
+ vmx->nested.nested_run_pending = 0;
+ return ret;
+}
+
+void nested_vmx_set_vmcs_shadowing_bitmap(void)
+{
+ if (enable_shadow_vmcs) {
+ vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
+ vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
+ }
+}
+
+/*
+ * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6. Undo
+ * that madness to get the encoding for comparison.
+ */
+#define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10)))
+
+static u64 nested_vmx_calc_vmcs_enum_msr(void)
+{
+ /*
+ * Note, these are the so-called "index" of the VMCS field encoding, not
+ * the index into vmcs12.
+ */
+ unsigned int max_idx, idx;
+ int i;
+
+ /*
+ * For better or worse, KVM allows VMREAD/VMWRITE to all fields in
+ * vmcs12, regardless of whether or not the associated feature is
+ * exposed to L1. Simply find the field with the highest index.
+ */
+ max_idx = 0;
+ for (i = 0; i < nr_vmcs12_fields; i++) {
+ /* The vmcs12 table is very, very sparsely populated. */
+ if (!vmcs12_field_offsets[i])
+ continue;
+
+ idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i));
+ if (idx > max_idx)
+ max_idx = idx;
+ }
+
+ return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT;
+}
+
+static void nested_vmx_setup_pinbased_ctls(struct vmcs_config *vmcs_conf,
+ struct nested_vmx_msrs *msrs)
+{
+ msrs->pinbased_ctls_low =
+ PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+
+ msrs->pinbased_ctls_high = vmcs_conf->pin_based_exec_ctrl;
+ msrs->pinbased_ctls_high &=
+ PIN_BASED_EXT_INTR_MASK |
+ PIN_BASED_NMI_EXITING |
+ PIN_BASED_VIRTUAL_NMIS |
+ (enable_apicv ? PIN_BASED_POSTED_INTR : 0);
+ msrs->pinbased_ctls_high |=
+ PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
+ PIN_BASED_VMX_PREEMPTION_TIMER;
+}
+
+static void nested_vmx_setup_exit_ctls(struct vmcs_config *vmcs_conf,
+ struct nested_vmx_msrs *msrs)
+{
+ msrs->exit_ctls_low =
+ VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
+
+ msrs->exit_ctls_high = vmcs_conf->vmexit_ctrl;
+ msrs->exit_ctls_high &=
+#ifdef CONFIG_X86_64
+ VM_EXIT_HOST_ADDR_SPACE_SIZE |
+#endif
+ VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
+ VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_CET_STATE;
+ msrs->exit_ctls_high |=
+ VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
+ VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
+ VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT |
+ VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
+
+ if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) &&
+ !kvm_cpu_cap_has(X86_FEATURE_IBT))
+ msrs->exit_ctls_high &= ~VM_EXIT_LOAD_CET_STATE;
+
+ /* We support free control of debug control saving. */
+ msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
+}
+
+static void nested_vmx_setup_entry_ctls(struct vmcs_config *vmcs_conf,
+ struct nested_vmx_msrs *msrs)
+{
+ msrs->entry_ctls_low =
+ VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
+
+ msrs->entry_ctls_high = vmcs_conf->vmentry_ctrl;
+ msrs->entry_ctls_high &=
+#ifdef CONFIG_X86_64
+ VM_ENTRY_IA32E_MODE |
+#endif
+ VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS |
+ VM_ENTRY_LOAD_CET_STATE;
+ msrs->entry_ctls_high |=
+ (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER |
+ VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL);
+
+ if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) &&
+ !kvm_cpu_cap_has(X86_FEATURE_IBT))
+ msrs->entry_ctls_high &= ~VM_ENTRY_LOAD_CET_STATE;
+
+ /* We support free control of debug control loading. */
+ msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
+}
+
+static void nested_vmx_setup_cpubased_ctls(struct vmcs_config *vmcs_conf,
+ struct nested_vmx_msrs *msrs)
+{
+ msrs->procbased_ctls_low =
+ CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+
+ msrs->procbased_ctls_high = vmcs_conf->cpu_based_exec_ctrl;
+ msrs->procbased_ctls_high &=
+ CPU_BASED_INTR_WINDOW_EXITING |
+ CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING |
+ CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
+ CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
+ CPU_BASED_CR3_STORE_EXITING |
+#ifdef CONFIG_X86_64
+ CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
+#endif
+ CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
+ CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
+ CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
+ CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
+ CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+ /*
+ * We can allow some features even when not supported by the
+ * hardware. For example, L1 can specify an MSR bitmap - and we
+ * can use it to avoid exits to L1 - even when L0 runs L2
+ * without MSR bitmaps.
+ */
+ msrs->procbased_ctls_high |=
+ CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
+ CPU_BASED_USE_MSR_BITMAPS;
+
+ /* We support free control of CR3 access interception. */
+ msrs->procbased_ctls_low &=
+ ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
+}
+
+static void nested_vmx_setup_secondary_ctls(u32 ept_caps,
+ struct vmcs_config *vmcs_conf,
+ struct nested_vmx_msrs *msrs)
+{
+ msrs->secondary_ctls_low = 0;
+
+ msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl;
+ msrs->secondary_ctls_high &=
+ SECONDARY_EXEC_DESC |
+ SECONDARY_EXEC_ENABLE_RDTSCP |
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+ SECONDARY_EXEC_WBINVD_EXITING |
+ SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
+ SECONDARY_EXEC_RDRAND_EXITING |
+ SECONDARY_EXEC_ENABLE_INVPCID |
+ SECONDARY_EXEC_ENABLE_VMFUNC |
+ SECONDARY_EXEC_RDSEED_EXITING |
+ SECONDARY_EXEC_ENABLE_XSAVES |
+ SECONDARY_EXEC_TSC_SCALING |
+ SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
+
+ /*
+ * We can emulate "VMCS shadowing," even if the hardware
+ * doesn't support it.
+ */
+ msrs->secondary_ctls_high |=
+ SECONDARY_EXEC_SHADOW_VMCS;
+
+ if (enable_ept) {
+ /* nested EPT: emulate EPT also to L1 */
+ msrs->secondary_ctls_high |=
+ SECONDARY_EXEC_ENABLE_EPT;
+ msrs->ept_caps =
+ VMX_EPT_PAGE_WALK_4_BIT |
+ VMX_EPT_PAGE_WALK_5_BIT |
+ VMX_EPTP_WB_BIT |
+ VMX_EPT_INVEPT_BIT |
+ VMX_EPT_EXECUTE_ONLY_BIT;
+
+ msrs->ept_caps &= ept_caps;
+ msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
+ VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
+ VMX_EPT_1GB_PAGE_BIT;
+ if (enable_ept_ad_bits) {
+ msrs->secondary_ctls_high |=
+ SECONDARY_EXEC_ENABLE_PML;
+ msrs->ept_caps |= VMX_EPT_AD_BIT;
+ }
+
+ /*
+ * Advertise EPTP switching irrespective of hardware support;
+ * KVM emulates it in software so long as VMFUNC is supported.
+ */
+ if (cpu_has_vmx_vmfunc())
+ msrs->vmfunc_controls = VMX_VMFUNC_EPTP_SWITCHING;
+ }
+
+ /*
+ * Old versions of KVM use the single-context version without
+ * checking for support, so declare that it is supported even
+ * though it is treated as global context. The alternative is
+ * not failing the single-context invvpid, and it is worse.
+ */
+ if (enable_vpid) {
+ msrs->secondary_ctls_high |=
+ SECONDARY_EXEC_ENABLE_VPID;
+ msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
+ VMX_VPID_EXTENT_SUPPORTED_MASK;
+ }
+
+ if (enable_unrestricted_guest)
+ msrs->secondary_ctls_high |=
+ SECONDARY_EXEC_UNRESTRICTED_GUEST;
+
+ if (flexpriority_enabled)
+ msrs->secondary_ctls_high |=
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+
+ if (enable_sgx)
+ msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING;
+}
+
+static void nested_vmx_setup_misc_data(struct vmcs_config *vmcs_conf,
+ struct nested_vmx_msrs *msrs)
+{
+ msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA;
+ msrs->misc_low |=
+ VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
+ VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
+ VMX_MISC_ACTIVITY_HLT |
+ VMX_MISC_ACTIVITY_WAIT_SIPI;
+ msrs->misc_high = 0;
+}
+
+static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs)
+{
+ /*
+ * This MSR reports some information about VMX support. We
+ * should return information about the VMX we emulate for the
+ * guest, and the VMCS structure we give it - not about the
+ * VMX support of the underlying hardware.
+ */
+ msrs->basic = vmx_basic_encode_vmcs_info(VMCS12_REVISION, VMCS12_SIZE,
+ X86_MEMTYPE_WB);
+
+ msrs->basic |= VMX_BASIC_TRUE_CTLS;
+ if (cpu_has_vmx_basic_inout())
+ msrs->basic |= VMX_BASIC_INOUT;
+ if (cpu_has_vmx_basic_no_hw_errcode_cc())
+ msrs->basic |= VMX_BASIC_NO_HW_ERROR_CODE_CC;
+}
+
+static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs)
+{
+ /*
+ * These MSRs specify bits which the guest must keep fixed on
+ * while L1 is in VMXON mode (in L1's root mode, or running an L2).
+ * We picked the standard core2 setting.
+ */
+#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
+#define VMXON_CR4_ALWAYSON X86_CR4_VMXE
+ msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
+ msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;
+
+ /* These MSRs specify bits which the guest must keep fixed off. */
+ rdmsrq(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
+ rdmsrq(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);
+
+ if (vmx_umip_emulated())
+ msrs->cr4_fixed1 |= X86_CR4_UMIP;
+}
+
+/*
+ * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
+ * returned for the various VMX controls MSRs when nested VMX is enabled.
+ * The same values should also be used to verify that vmcs12 control fields are
+ * valid during nested entry from L1 to L2.
+ * Each of these control msrs has a low and high 32-bit half: A low bit is on
+ * if the corresponding bit in the (32-bit) control field *must* be on, and a
+ * bit in the high half is on if the corresponding bit in the control field
+ * may be on. See also vmx_control_verify().
+ */
+void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
+{
+ struct nested_vmx_msrs *msrs = &vmcs_conf->nested;
+
+ /*
+ * Note that as a general rule, the high half of the MSRs (bits in
+ * the control fields which may be 1) should be initialized by the
+ * intersection of the underlying hardware's MSR (i.e., features which
+ * can be supported) and the list of features we want to expose -
+ * because they are known to be properly supported in our code.
+ * Also, usually, the low half of the MSRs (bits which must be 1) can
+ * be set to 0, meaning that L1 may turn off any of these bits. The
+ * reason is that if one of these bits is necessary, it will appear
+ * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
+ * fields of vmcs01 and vmcs02, will turn these bits off - and
+ * nested_vmx_l1_wants_exit() will not pass related exits to L1.
+ * These rules have exceptions below.
+ */
+ nested_vmx_setup_pinbased_ctls(vmcs_conf, msrs);
+
+ nested_vmx_setup_exit_ctls(vmcs_conf, msrs);
+
+ nested_vmx_setup_entry_ctls(vmcs_conf, msrs);
+
+ nested_vmx_setup_cpubased_ctls(vmcs_conf, msrs);
+
+ nested_vmx_setup_secondary_ctls(ept_caps, vmcs_conf, msrs);
+
+ nested_vmx_setup_misc_data(vmcs_conf, msrs);
+
+ nested_vmx_setup_basic(msrs);
+
+ nested_vmx_setup_cr_fixed(msrs);
+
+ msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr();
+}
+
+void nested_vmx_hardware_unsetup(void)
+{
+ int i;
+
+ if (enable_shadow_vmcs) {
+ for (i = 0; i < VMX_BITMAP_NR; i++)
+ free_page((unsigned long)vmx_bitmap[i]);
+ }
+}
+
+__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
+{
+ int i;
+
+ if (!cpu_has_vmx_shadow_vmcs())
+ enable_shadow_vmcs = 0;
+ if (enable_shadow_vmcs) {
+ for (i = 0; i < VMX_BITMAP_NR; i++) {
+ /*
+ * The vmx_bitmap is not tied to a VM and so should
+ * not be charged to a memcg.
+ */
+ vmx_bitmap[i] = (unsigned long *)
+ __get_free_page(GFP_KERNEL);
+ if (!vmx_bitmap[i]) {
+ nested_vmx_hardware_unsetup();
+ return -ENOMEM;
+ }
+ }
+
+ init_vmcs_shadow_fields();
+ }
+
+ exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear;
+ exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch;
+ exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld;
+ exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst;
+ exit_handlers[EXIT_REASON_VMREAD] = handle_vmread;
+ exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume;
+ exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite;
+ exit_handlers[EXIT_REASON_VMOFF] = handle_vmxoff;
+ exit_handlers[EXIT_REASON_VMON] = handle_vmxon;
+ exit_handlers[EXIT_REASON_INVEPT] = handle_invept;
+ exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid;
+ exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc;
+
+ return 0;
+}
+
+struct kvm_x86_nested_ops vmx_nested_ops = {
+ .leave_nested = vmx_leave_nested,
+ .is_exception_vmexit = nested_vmx_is_exception_vmexit,
+ .check_events = vmx_check_nested_events,
+ .has_events = vmx_has_nested_events,
+ .triple_fault = nested_vmx_triple_fault,
+ .get_state = vmx_get_nested_state,
+ .set_state = vmx_set_nested_state,
+ .get_nested_state_pages = vmx_get_nested_state_pages,
+ .write_log_dirty = nested_vmx_write_pml_buffer,
+#ifdef CONFIG_KVM_HYPERV
+ .enable_evmcs = nested_enable_evmcs,
+ .get_evmcs_version = nested_get_evmcs_version,
+ .hv_inject_synthetic_vmexit_post_tlb_flush = vmx_hv_inject_synthetic_vmexit_post_tlb_flush,
+#endif
+};
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
new file mode 100644
index 000000000000..983484d42ebf
--- /dev/null
+++ b/arch/x86/kvm/vmx/nested.h
@@ -0,0 +1,323 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMX_NESTED_H
+#define __KVM_X86_VMX_NESTED_H
+
+#include "kvm_cache_regs.h"
+#include "hyperv.h"
+#include "vmcs12.h"
+#include "vmx.h"
+
+/*
+ * Status returned by nested_vmx_enter_non_root_mode():
+ */
+enum nvmx_vmentry_status {
+ NVMX_VMENTRY_SUCCESS, /* Entered VMX non-root mode */
+ NVMX_VMENTRY_VMFAIL, /* Consistency check VMFail */
+ NVMX_VMENTRY_VMEXIT, /* Consistency check VMExit */
+ NVMX_VMENTRY_KVM_INTERNAL_ERROR,/* KVM internal error */
+};
+
+void vmx_leave_nested(struct kvm_vcpu *vcpu);
+void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps);
+void nested_vmx_hardware_unsetup(void);
+__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *));
+void nested_vmx_set_vmcs_shadowing_bitmap(void);
+void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu);
+enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
+ bool from_vmentry);
+bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu);
+void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
+ u32 exit_intr_info, unsigned long exit_qualification,
+ u32 exit_insn_len);
+
+static inline void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
+ u32 exit_intr_info,
+ unsigned long exit_qualification)
+{
+ u32 exit_insn_len;
+
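+	/*
+	 * The VM-exit instruction length is not defined for synthetic exits
+	 * (-1), failed VM-Entry, or hardware VM-instruction errors; report a
+	 * length of zero in those cases.
+	 */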
+ if (to_vmx(vcpu)->fail || vm_exit_reason == -1 ||
+ (vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
+ exit_insn_len = 0;
+ else
+ exit_insn_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+
+ __nested_vmx_vmexit(vcpu, vm_exit_reason, exit_intr_info,
+ exit_qualification, exit_insn_len);
+}
+
+void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu);
+int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
+int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata);
+int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
+ u32 vmx_instruction_info, bool wr, int len, gva_t *ret);
+void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu);
+bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
+ int size);
+
+static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
+{
+ lockdep_assert_once(lockdep_is_held(&vcpu->mutex) ||
+ !refcount_read(&vcpu->kvm->users_count));
+
+ return to_vmx(vcpu)->nested.cached_vmcs12;
+}
+
+static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu)
+{
+ lockdep_assert_once(lockdep_is_held(&vcpu->mutex) ||
+ !refcount_read(&vcpu->kvm->users_count));
+
+ return to_vmx(vcpu)->nested.cached_shadow_vmcs12;
+}
+
+/*
+ * Note: the same condition is checked against the state provided by userspace
+ * in vmx_set_nested_state; if it is satisfied, the nested state must include
+ * the VMCS12.
+ */
+static inline int vmx_has_valid_vmcs12(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */
+ return vmx->nested.current_vmptr != -1ull ||
+ nested_vmx_is_evmptr12_set(vmx);
+}
+
+static inline u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid;
+}
+
+static inline unsigned long nested_ept_get_eptp(struct kvm_vcpu *vcpu)
+{
+ /* return the page table to be shadowed - in our case, EPT12 */
+ return get_vmcs12(vcpu)->ept_pointer;
+}
+
+static inline bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu)
+{
+ return nested_ept_get_eptp(vcpu) & VMX_EPTP_AD_ENABLE_BIT;
+}
+
+/*
+ * Return the cr0/4 value that a nested guest would read. This is a combination
+ * of L1's "real" cr0 used to run the guest (guest_cr0), and the bits shadowed
+ * by the L1 hypervisor (cr0_read_shadow). KVM must emulate CPU behavior as
+ * the value+mask loaded into vmcs02 may not match the vmcs12 fields.
+ */
+static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
+{
+ return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) |
+ (fields->cr0_read_shadow & fields->cr0_guest_host_mask);
+}
+static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
+{
+ return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
+ (fields->cr4_read_shadow & fields->cr4_guest_host_mask);
+}
+
+static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu)
+{
+ return vmx_misc_cr3_count(to_vmx(vcpu)->nested.msrs.misc_low);
+}
+
+/*
+ * Do the virtual VMX capability MSRs specify that L1 can use VMWRITE
+ * to modify any valid field of the VMCS, or are the VM-exit
+ * information fields read-only?
+ */
+static inline bool nested_cpu_has_vmwrite_any_field(struct kvm_vcpu *vcpu)
+{
+ return to_vmx(vcpu)->nested.msrs.misc_low &
+ VMX_MISC_VMWRITE_SHADOW_RO_FIELDS;
+}
+
+static inline bool nested_cpu_has_zero_length_injection(struct kvm_vcpu *vcpu)
+{
+ return to_vmx(vcpu)->nested.msrs.misc_low & VMX_MISC_ZERO_LEN_INS;
+}
+
+static inline bool nested_cpu_supports_monitor_trap_flag(struct kvm_vcpu *vcpu)
+{
+ return to_vmx(vcpu)->nested.msrs.procbased_ctls_high &
+ CPU_BASED_MONITOR_TRAP_FLAG;
+}
+
+static inline bool nested_cpu_has_vmx_shadow_vmcs(struct kvm_vcpu *vcpu)
+{
+ return to_vmx(vcpu)->nested.msrs.secondary_ctls_high &
+ SECONDARY_EXEC_SHADOW_VMCS;
+}
+
+static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
+{
+ return vmcs12->cpu_based_vm_exec_control & bit;
+}
+
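+/*
+ * Secondary execution controls only take effect if the "activate secondary
+ * controls" primary execution control is set in vmcs12.
+ */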
+static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
+{
+ return (vmcs12->cpu_based_vm_exec_control &
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
+ (vmcs12->secondary_vm_exec_control & bit);
+}
+
+static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12)
+{
+ return vmcs12->pin_based_vm_exec_control &
+ PIN_BASED_VMX_PREEMPTION_TIMER;
+}
+
+static inline bool nested_cpu_has_nmi_exiting(struct vmcs12 *vmcs12)
+{
+ return vmcs12->pin_based_vm_exec_control & PIN_BASED_NMI_EXITING;
+}
+
+static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
+{
+ return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
+}
+
+static inline int nested_cpu_has_mtf(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG);
+}
+
+static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
+}
+
+static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES);
+}
+
+static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML);
+}
+
+static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
+}
+
+static inline bool nested_cpu_has_vpid(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VPID);
+}
+
+static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
+}
+
+static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+}
+
+static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
+{
+ return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
+}
+
+static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC);
+}
+
+static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has_vmfunc(vmcs12) &&
+ (vmcs12->vm_function_control &
+ VMX_VMFUNC_EPTP_SWITCHING);
+}
+
+static inline bool nested_cpu_has_shadow_vmcs(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS);
+}
+
+static inline bool nested_cpu_has_save_preemption_timer(struct vmcs12 *vmcs12)
+{
+ return vmcs12->vm_exit_controls &
+ VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
+}
+
+static inline bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
+{
+ return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu));
+}
+
+/*
+ * In nested virtualization, check if L1 asked to exit on external interrupts.
+ * For most existing hypervisors, this will always return true.
+ */
+static inline bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
+{
+ return get_vmcs12(vcpu)->pin_based_vm_exec_control &
+ PIN_BASED_EXT_INTR_MASK;
+}
+
+static inline bool nested_cpu_has_encls_exit(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING);
+}
+
+/*
+ * if fixed0[i] == 1: val[i] must be 1
+ * if fixed1[i] == 0: val[i] must be 0
+ */
+static inline bool fixed_bits_valid(u64 val, u64 fixed0, u64 fixed1)
+{
+ return ((val & fixed1) | fixed0) == val;
+}
+
+static inline bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0;
+ u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1;
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ if (to_vmx(vcpu)->nested.msrs.secondary_ctls_high &
+ SECONDARY_EXEC_UNRESTRICTED_GUEST &&
+ nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
+ fixed0 &= ~(X86_CR0_PE | X86_CR0_PG);
+
+ return fixed_bits_valid(val, fixed0, fixed1);
+}
+
+static inline bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0;
+ u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1;
+
+ return fixed_bits_valid(val, fixed0, fixed1);
+}
+
+static inline bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr4_fixed0;
+ u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr4_fixed1;
+
+ return fixed_bits_valid(val, fixed0, fixed1) &&
+ __kvm_is_valid_cr4(vcpu, val);
+}
+
+static inline bool nested_cpu_has_no_hw_errcode_cc(struct kvm_vcpu *vcpu)
+{
+ return to_vmx(vcpu)->nested.msrs.basic & VMX_BASIC_NO_HW_ERROR_CODE_CC;
+}
+
+/* No difference in the restrictions on guest and host CR4 in VMX operation. */
+#define nested_guest_cr4_valid nested_cr4_valid
+#define nested_host_cr4_valid nested_cr4_valid
+
+extern struct kvm_x86_nested_ops vmx_nested_ops;
+
+#endif /* __KVM_X86_VMX_NESTED_H */
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
new file mode 100644
index 000000000000..de1d9785c01f
--- /dev/null
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -0,0 +1,784 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KVM PMU support for Intel CPUs
+ *
+ * Copyright 2011 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Avi Kivity <avi@redhat.com>
+ * Gleb Natapov <gleb@redhat.com>
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/kvm_host.h>
+#include <linux/perf_event.h>
+#include <asm/msr.h>
+#include <asm/perf_event.h>
+#include "x86.h"
+#include "cpuid.h"
+#include "lapic.h"
+#include "nested.h"
+#include "pmu.h"
+#include "tdx.h"
+
+/*
+ * Perf's "BASE" is wildly misleading, architectural PMUs use bits 31:16 of ECX
+ * to encode the "type" of counter to read, i.e. this is not a "base". And to
+ * further confuse things, non-architectural PMUs use bit 31 as a flag for
+ * "fast" reads, whereas the "type" is an explicit value.
+ */
+#define INTEL_RDPMC_GP 0
+#define INTEL_RDPMC_FIXED INTEL_PMC_FIXED_RDPMC_BASE
+
+#define INTEL_RDPMC_TYPE_MASK GENMASK(31, 16)
+#define INTEL_RDPMC_INDEX_MASK GENMASK(15, 0)
+
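+/*
+ * The full-width MSR_IA32_PMCx MSRs alias the legacy MSR_IA32_PERFCTRx MSRs;
+ * their indices differ in exactly one bit, which is used below to tell the
+ * two ranges apart.
+ */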
+#define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0)
+
+static struct lbr_desc *vcpu_to_lbr_desc(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return NULL;
+
+ return &to_vmx(vcpu)->lbr_desc;
+}
+
+static struct x86_pmu_lbr *vcpu_to_lbr_records(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return NULL;
+
+ return &to_vmx(vcpu)->lbr_desc.records;
+}
+
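+/*
+ * TDX vCPUs have no struct vcpu_vmx; poison to_vmx() beyond this point so
+ * that all LBR accesses go through the NULL-checking helpers above.
+ */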
+#pragma GCC poison to_vmx
+
+static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
+{
+ struct kvm_pmc *pmc;
+ u64 old_fixed_ctr_ctrl = pmu->fixed_ctr_ctrl;
+ int i;
+
+ pmu->fixed_ctr_ctrl = data;
+ for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
+ u8 new_ctrl = fixed_ctrl_field(data, i);
+ u8 old_ctrl = fixed_ctrl_field(old_fixed_ctr_ctrl, i);
+
+ if (old_ctrl == new_ctrl)
+ continue;
+
+ pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i);
+
+ __set_bit(KVM_FIXED_PMC_BASE_IDX + i, pmu->pmc_in_use);
+ kvm_pmu_request_counter_reprogram(pmc);
+ }
+}
+
+static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu,
+ unsigned int idx, u64 *mask)
+{
+ unsigned int type = idx & INTEL_RDPMC_TYPE_MASK;
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *counters;
+ unsigned int num_counters;
+ u64 bitmask;
+
+ /*
+ * The encoding of ECX for RDPMC is different for architectural versus
+ * non-architectural PMUs (PMUs with version '0'). For architectural
+ * PMUs, bits 31:16 specify the PMC type and bits 15:0 specify the PMC
+ * index. For non-architectural PMUs, bit 31 is a "fast" flag, and
+ * bits 30:0 specify the PMC index.
+ *
+ * Yell and reject attempts to read PMCs for a non-architectural PMU,
+ * as KVM doesn't support such PMUs.
+ */
+ if (WARN_ON_ONCE(!pmu->version))
+ return NULL;
+
+ /*
+ * General Purpose (GP) PMCs are supported on all PMUs, and fixed PMCs
+ * are supported on all architectural PMUs, i.e. on all virtual PMUs
+ * supported by KVM. Note, KVM only emulates fixed PMCs for PMU v2+,
+ * but the type itself is still valid, i.e. let RDPMC fail due to
+ * accessing a non-existent counter. Reject attempts to read all other
+ * types, which are unknown/unsupported.
+ */
+ switch (type) {
+ case INTEL_RDPMC_FIXED:
+ counters = pmu->fixed_counters;
+ num_counters = pmu->nr_arch_fixed_counters;
+ bitmask = pmu->counter_bitmask[KVM_PMC_FIXED];
+ break;
+ case INTEL_RDPMC_GP:
+ counters = pmu->gp_counters;
+ num_counters = pmu->nr_arch_gp_counters;
+ bitmask = pmu->counter_bitmask[KVM_PMC_GP];
+ break;
+ default:
+ return NULL;
+ }
+
+ idx &= INTEL_RDPMC_INDEX_MASK;
+ if (idx >= num_counters)
+ return NULL;
+
+ *mask &= bitmask;
+ return &counters[array_index_nospec(idx, num_counters)];
+}
+
+static inline u64 vcpu_get_perf_capabilities(struct kvm_vcpu *vcpu)
+{
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM))
+ return 0;
+
+ return vcpu->arch.perf_capabilities;
+}
+
+static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu)
+{
+ return (vcpu_get_perf_capabilities(vcpu) & PERF_CAP_FW_WRITES) != 0;
+}
+
+static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr)
+{
+ if (!fw_writes_is_enabled(pmu_to_vcpu(pmu)))
+ return NULL;
+
+ return get_gp_pmc(pmu, msr, MSR_IA32_PMC0);
+}
+
+static bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return false;
+
+ return cpuid_model_is_consistent(vcpu);
+}
+
+bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return false;
+
+ return !!vcpu_to_lbr_records(vcpu)->nr;
+}
+
+static bool intel_pmu_is_valid_lbr_msr(struct kvm_vcpu *vcpu, u32 index)
+{
+ struct x86_pmu_lbr *records = vcpu_to_lbr_records(vcpu);
+ bool ret = false;
+
+ if (!intel_pmu_lbr_is_enabled(vcpu))
+ return ret;
+
+ ret = (index == MSR_LBR_SELECT) || (index == MSR_LBR_TOS) ||
+ (index >= records->from && index < records->from + records->nr) ||
+ (index >= records->to && index < records->to + records->nr);
+
+ if (!ret && records->info)
+ ret = (index >= records->info && index < records->info + records->nr);
+
+ return ret;
+}
+
+static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ u64 perf_capabilities;
+ int ret;
+
+ switch (msr) {
+ case MSR_CORE_PERF_FIXED_CTR_CTRL:
+ return kvm_pmu_has_perf_global_ctrl(pmu);
+ case MSR_IA32_PEBS_ENABLE:
+ ret = vcpu_get_perf_capabilities(vcpu) & PERF_CAP_PEBS_FORMAT;
+ break;
+ case MSR_IA32_DS_AREA:
+ ret = guest_cpu_cap_has(vcpu, X86_FEATURE_DS);
+ break;
+ case MSR_PEBS_DATA_CFG:
+ perf_capabilities = vcpu_get_perf_capabilities(vcpu);
+ ret = (perf_capabilities & PERF_CAP_PEBS_BASELINE) &&
+ ((perf_capabilities & PERF_CAP_PEBS_FORMAT) > 3);
+ break;
+ default:
+ ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) ||
+ get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) ||
+ get_fixed_pmc(pmu, msr) || get_fw_gp_pmc(pmu, msr) ||
+ intel_pmu_is_valid_lbr_msr(vcpu, msr);
+ break;
+ }
+
+ return ret;
+}
+
+static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+
+ pmc = get_fixed_pmc(pmu, msr);
+ pmc = pmc ? pmc : get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0);
+ pmc = pmc ? pmc : get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0);
+
+ return pmc;
+}
+
+static inline void intel_pmu_release_guest_lbr_event(struct kvm_vcpu *vcpu)
+{
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+
+ if (!lbr_desc)
+ return;
+
+ if (lbr_desc->event) {
+ perf_event_release_kernel(lbr_desc->event);
+ lbr_desc->event = NULL;
+ vcpu_to_pmu(vcpu)->event_count--;
+ }
+}
+
+int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu)
+{
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct perf_event *event;
+
+ /*
+ * The perf_event_attr is constructed in the minimum efficient way:
+ * - set 'pinned = true' to make it task pinned so that if another
+ * cpu pinned event reclaims LBR, the event->oncpu will be set to -1;
+ * - set '.exclude_host = true' to record guest branch behavior;
+ *
+ * - set '.config = INTEL_FIXED_VLBR_EVENT' to indicate that host perf
+ * should schedule the event without a real HW counter but with a fake one;
+ * check is_guest_lbr_event() and __intel_get_event_constraints();
+ *
+ * - set 'sample_type = PERF_SAMPLE_BRANCH_STACK' and
+ * 'branch_sample_type = PERF_SAMPLE_BRANCH_CALL_STACK |
+ * PERF_SAMPLE_BRANCH_USER' to configure it as an LBR callstack
+ * event, which helps KVM save/restore guest LBR records across
+ * host context switches and avoids quite a lot of overhead;
+ * check branch_user_callstack() and intel_pmu_lbr_sched_task();
+ */
+ struct perf_event_attr attr = {
+ .type = PERF_TYPE_RAW,
+ .size = sizeof(attr),
+ .config = INTEL_FIXED_VLBR_EVENT,
+ .sample_type = PERF_SAMPLE_BRANCH_STACK,
+ .pinned = true,
+ .exclude_host = true,
+ .branch_sample_type = PERF_SAMPLE_BRANCH_CALL_STACK |
+ PERF_SAMPLE_BRANCH_USER,
+ };
+
+ if (WARN_ON_ONCE(!lbr_desc))
+ return 0;
+
+ if (unlikely(lbr_desc->event)) {
+ __set_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use);
+ return 0;
+ }
+
+ event = perf_event_create_kernel_counter(&attr, -1,
+ current, NULL, NULL);
+ if (IS_ERR(event)) {
+ pr_debug_ratelimited("%s: failed %ld\n",
+ __func__, PTR_ERR(event));
+ return PTR_ERR(event);
+ }
+ lbr_desc->event = event;
+ pmu->event_count++;
+ __set_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use);
+ return 0;
+}
+
+/*
+ * It's safe to access LBR MSRs from the guest when they have not
+ * been passed through, since the host will restore or reset the
+ * LBR MSR records when the guest LBR event is scheduled in.
+ */
+static bool intel_pmu_handle_lbr_msrs_access(struct kvm_vcpu *vcpu,
+ struct msr_data *msr_info, bool read)
+{
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+ u32 index = msr_info->index;
+
+ if (!intel_pmu_is_valid_lbr_msr(vcpu, index))
+ return false;
+
+ if (!lbr_desc->event && intel_pmu_create_guest_lbr_event(vcpu) < 0)
+ goto dummy;
+
+ /*
+ * Disable IRQs to ensure the LBR feature doesn't get reclaimed by the
+ * host at the time the value is read from the MSR; this prevents the
+ * host LBR value from being leaked to the guest. If LBR has been
+ * reclaimed, return 0 on guest reads.
+ */
+ local_irq_disable();
+ if (lbr_desc->event->state == PERF_EVENT_STATE_ACTIVE) {
+ if (read)
+ rdmsrq(index, msr_info->data);
+ else
+ wrmsrq(index, msr_info->data);
+ __set_bit(INTEL_PMC_IDX_FIXED_VLBR, vcpu_to_pmu(vcpu)->pmc_in_use);
+ local_irq_enable();
+ return true;
+ }
+ clear_bit(INTEL_PMC_IDX_FIXED_VLBR, vcpu_to_pmu(vcpu)->pmc_in_use);
+ local_irq_enable();
+
+dummy:
+ if (read)
+ msr_info->data = 0;
+ return true;
+}
+
+static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+ u32 msr = msr_info->index;
+
+ switch (msr) {
+ case MSR_CORE_PERF_FIXED_CTR_CTRL:
+ msr_info->data = pmu->fixed_ctr_ctrl;
+ break;
+ case MSR_IA32_PEBS_ENABLE:
+ msr_info->data = pmu->pebs_enable;
+ break;
+ case MSR_IA32_DS_AREA:
+ msr_info->data = pmu->ds_area;
+ break;
+ case MSR_PEBS_DATA_CFG:
+ msr_info->data = pmu->pebs_data_cfg;
+ break;
+ default:
+ if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
+ (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
+ u64 val = pmc_read_counter(pmc);
+ msr_info->data =
+ val & pmu->counter_bitmask[KVM_PMC_GP];
+ break;
+ } else if ((pmc = get_fixed_pmc(pmu, msr))) {
+ u64 val = pmc_read_counter(pmc);
+ msr_info->data =
+ val & pmu->counter_bitmask[KVM_PMC_FIXED];
+ break;
+ } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
+ msr_info->data = pmc->eventsel;
+ break;
+ } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, true)) {
+ break;
+ }
+ return 1;
+ }
+
+ return 0;
+}
+
+static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+ u32 msr = msr_info->index;
+ u64 data = msr_info->data;
+ u64 reserved_bits, diff;
+
+ switch (msr) {
+ case MSR_CORE_PERF_FIXED_CTR_CTRL:
+ if (data & pmu->fixed_ctr_ctrl_rsvd)
+ return 1;
+
+ if (pmu->fixed_ctr_ctrl != data)
+ reprogram_fixed_counters(pmu, data);
+ break;
+ case MSR_IA32_PEBS_ENABLE:
+ if (data & pmu->pebs_enable_rsvd)
+ return 1;
+
+ if (pmu->pebs_enable != data) {
+ diff = pmu->pebs_enable ^ data;
+ pmu->pebs_enable = data;
+ reprogram_counters(pmu, diff);
+ }
+ break;
+ case MSR_IA32_DS_AREA:
+ if (is_noncanonical_msr_address(data, vcpu))
+ return 1;
+
+ pmu->ds_area = data;
+ break;
+ case MSR_PEBS_DATA_CFG:
+ if (data & pmu->pebs_data_cfg_rsvd)
+ return 1;
+
+ pmu->pebs_data_cfg = data;
+ break;
+ default:
+ if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
+ (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
+ if ((msr & MSR_PMC_FULL_WIDTH_BIT) &&
+ (data & ~pmu->counter_bitmask[KVM_PMC_GP]))
+ return 1;
+
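+		/* Writes via the legacy PERFCTRx MSRs are sign-extended from bit 31. */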
+ if (!msr_info->host_initiated &&
+ !(msr & MSR_PMC_FULL_WIDTH_BIT))
+ data = (s64)(s32)data;
+ pmc_write_counter(pmc, data);
+ break;
+ } else if ((pmc = get_fixed_pmc(pmu, msr))) {
+ pmc_write_counter(pmc, data);
+ break;
+ } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
+ reserved_bits = pmu->reserved_bits;
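+			/* HSW_IN_TX_CHECKPOINTED is only valid on the third GP counter (PMC2). */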
+ if ((pmc->idx == 2) &&
+ (pmu->raw_event_mask & HSW_IN_TX_CHECKPOINTED))
+ reserved_bits ^= HSW_IN_TX_CHECKPOINTED;
+ if (data & reserved_bits)
+ return 1;
+
+ if (data != pmc->eventsel) {
+ pmc->eventsel = data;
+ kvm_pmu_request_counter_reprogram(pmc);
+ }
+ break;
+ } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false)) {
+ break;
+ }
+ /* Not a known PMU MSR. */
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Map fixed counter events to architectural general purpose event encodings.
+ * Perf doesn't provide APIs to allow KVM to directly program a fixed counter,
+ * and so KVM instead programs the architectural event to effectively request
+ * the fixed counter. Perf isn't guaranteed to use a fixed counter and may
+ * instead program the encoding into a general purpose counter, e.g. if a
+ * different perf_event is already utilizing the requested counter, but the end
+ * result is the same (ignoring the fact that using a general purpose counter
+ * will likely exacerbate counter contention).
+ *
+ * Forcibly inlined to allow asserting on @index at build time, and there should
+ * never be more than one user.
+ */
+static __always_inline u64 intel_get_fixed_pmc_eventsel(unsigned int index)
+{
+ const enum perf_hw_id fixed_pmc_perf_ids[] = {
+ [0] = PERF_COUNT_HW_INSTRUCTIONS,
+ [1] = PERF_COUNT_HW_CPU_CYCLES,
+ [2] = PERF_COUNT_HW_REF_CPU_CYCLES,
+ };
+ u64 eventsel;
+
+ BUILD_BUG_ON(ARRAY_SIZE(fixed_pmc_perf_ids) != KVM_MAX_NR_INTEL_FIXED_COUNTERS);
+ BUILD_BUG_ON(index >= KVM_MAX_NR_INTEL_FIXED_COUNTERS);
+
+ /*
+ * Yell if perf reports support for a fixed counter but perf doesn't
+ * have a known encoding for the associated general purpose event.
+ */
+ eventsel = perf_get_hw_event_config(fixed_pmc_perf_ids[index]);
+ WARN_ON_ONCE(!eventsel && index < kvm_pmu_cap.num_counters_fixed);
+ return eventsel;
+}
+
+static void intel_pmu_enable_fixed_counter_bits(struct kvm_pmu *pmu, u64 bits)
+{
+ int i;
+
+ for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
+ pmu->fixed_ctr_ctrl_rsvd &= ~intel_fixed_bits_by_idx(i, bits);
+}
+
+static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+ struct kvm_cpuid_entry2 *entry;
+ union cpuid10_eax eax;
+ union cpuid10_edx edx;
+ u64 perf_capabilities;
+ u64 counter_rsvd;
+
+ if (!lbr_desc)
+ return;
+
+ memset(&lbr_desc->records, 0, sizeof(lbr_desc->records));
+
+ /*
+ * Setting passthrough of LBR MSRs is done only in the VM-Entry loop,
+ * and PMU refresh is disallowed after the vCPU has run, i.e. this code
+ * should never be reached while KVM is passing through MSRs.
+ */
+ if (KVM_BUG_ON(lbr_desc->msr_passthrough, vcpu->kvm))
+ return;
+
+ entry = kvm_find_cpuid_entry(vcpu, 0xa);
+ if (!entry)
+ return;
+
+ eax.full = entry->eax;
+ edx.full = entry->edx;
+
+ pmu->version = eax.split.version_id;
+ if (!pmu->version)
+ return;
+
+ pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters,
+ kvm_pmu_cap.num_counters_gp);
+ eax.split.bit_width = min_t(int, eax.split.bit_width,
+ kvm_pmu_cap.bit_width_gp);
+ pmu->counter_bitmask[KVM_PMC_GP] = BIT_ULL(eax.split.bit_width) - 1;
+ eax.split.mask_length = min_t(int, eax.split.mask_length,
+ kvm_pmu_cap.events_mask_len);
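+	/* CPUID.0xA.EBX has a bit set for each *unavailable* architectural event. */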
+ pmu->available_event_types = ~entry->ebx & (BIT_ULL(eax.split.mask_length) - 1);
+
+ entry = kvm_find_cpuid_entry_index(vcpu, 7, 0);
+ if (entry &&
+ (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) &&
+ (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) {
+ pmu->reserved_bits ^= HSW_IN_TX;
+ pmu->raw_event_mask |= (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED);
+ }
+
+ perf_capabilities = vcpu_get_perf_capabilities(vcpu);
+ if (intel_pmu_lbr_is_compatible(vcpu) &&
+ (perf_capabilities & PERF_CAP_LBR_FMT))
+ memcpy(&lbr_desc->records, &vmx_lbr_caps, sizeof(vmx_lbr_caps));
+ else
+ lbr_desc->records.nr = 0;
+
+ if (lbr_desc->records.nr)
+ bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_IDX_FIXED_VLBR, 1);
+
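+	/* Fixed counters and global ctrl/status MSRs exist only for PMU v2+. */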
+ if (pmu->version == 1)
+ return;
+
+ pmu->nr_arch_fixed_counters = min_t(int, edx.split.num_counters_fixed,
+ kvm_pmu_cap.num_counters_fixed);
+ edx.split.bit_width_fixed = min_t(int, edx.split.bit_width_fixed,
+ kvm_pmu_cap.bit_width_fixed);
+ pmu->counter_bitmask[KVM_PMC_FIXED] = BIT_ULL(edx.split.bit_width_fixed) - 1;
+
+ intel_pmu_enable_fixed_counter_bits(pmu, INTEL_FIXED_0_KERNEL |
+ INTEL_FIXED_0_USER |
+ INTEL_FIXED_0_ENABLE_PMI);
+
+ counter_rsvd = ~((BIT_ULL(pmu->nr_arch_gp_counters) - 1) |
+ ((BIT_ULL(pmu->nr_arch_fixed_counters) - 1) << KVM_FIXED_PMC_BASE_IDX));
+ pmu->global_ctrl_rsvd = counter_rsvd;
+
+ /*
+ * GLOBAL_STATUS and GLOBAL_OVF_CONTROL (a.k.a. GLOBAL_STATUS_RESET)
+ * share reserved bit definitions. The kernel just happens to use
+ * OVF_CTRL for the names.
+ */
+ pmu->global_status_rsvd = pmu->global_ctrl_rsvd
+ & ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF |
+ MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD);
+ if (vmx_pt_mode_is_host_guest())
+ pmu->global_status_rsvd &=
+ ~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI;
+
+ if (perf_capabilities & PERF_CAP_PEBS_FORMAT) {
+ if (perf_capabilities & PERF_CAP_PEBS_BASELINE) {
+ pmu->pebs_enable_rsvd = counter_rsvd;
+ pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE;
+ pmu->pebs_data_cfg_rsvd = ~0xff00000full;
+ intel_pmu_enable_fixed_counter_bits(pmu, ICL_FIXED_0_ADAPTIVE);
+ } else {
+ pmu->pebs_enable_rsvd = ~(BIT_ULL(pmu->nr_arch_gp_counters) - 1);
+ }
+ }
+}
+
+static void intel_pmu_init(struct kvm_vcpu *vcpu)
+{
+ int i;
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+
+ if (!lbr_desc)
+ return;
+
+ for (i = 0; i < KVM_MAX_NR_INTEL_GP_COUNTERS; i++) {
+ pmu->gp_counters[i].type = KVM_PMC_GP;
+ pmu->gp_counters[i].vcpu = vcpu;
+ pmu->gp_counters[i].idx = i;
+ pmu->gp_counters[i].current_config = 0;
+ }
+
+ for (i = 0; i < KVM_MAX_NR_INTEL_FIXED_COUNTERS; i++) {
+ pmu->fixed_counters[i].type = KVM_PMC_FIXED;
+ pmu->fixed_counters[i].vcpu = vcpu;
+ pmu->fixed_counters[i].idx = i + KVM_FIXED_PMC_BASE_IDX;
+ pmu->fixed_counters[i].current_config = 0;
+ pmu->fixed_counters[i].eventsel = intel_get_fixed_pmc_eventsel(i);
+ }
+
+ lbr_desc->records.nr = 0;
+ lbr_desc->event = NULL;
+ lbr_desc->msr_passthrough = false;
+}
+
+static void intel_pmu_reset(struct kvm_vcpu *vcpu)
+{
+ intel_pmu_release_guest_lbr_event(vcpu);
+}
+
+/*
+ * Emulate LBR_On_PMI behavior for 1 < pmu.version < 4.
+ *
+ * If Freeze_LBR_On_PMI = 1, the LBRs are frozen on PMI; KVM emulates this
+ * by clearing the LBR bit (bit 0) in IA32_DEBUGCTL.
+ *
+ * The guest needs to re-enable LBR to resume recording branches.
+ */
+static void intel_pmu_legacy_freezing_lbrs_on_pmi(struct kvm_vcpu *vcpu)
+{
+ u64 data = vmx_guest_debugctl_read();
+
+ if (data & DEBUGCTLMSR_FREEZE_LBRS_ON_PMI) {
+ data &= ~DEBUGCTLMSR_LBR;
+ vmx_guest_debugctl_write(vcpu, data);
+ }
+}
+
+static void intel_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
+{
+ u8 version = vcpu_to_pmu(vcpu)->version;
+
+ if (!intel_pmu_lbr_is_enabled(vcpu))
+ return;
+
+ if (version > 1 && version < 4)
+ intel_pmu_legacy_freezing_lbrs_on_pmi(vcpu);
+}
+
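+/*
+ * Toggle interception of all LBR-related MSRs: the per-entry FROM/TO (and,
+ * if present, INFO) MSRs, plus LBR_SELECT and LBR_TOS.
+ */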
+static void vmx_update_intercept_for_lbr_msrs(struct kvm_vcpu *vcpu, bool set)
+{
+ struct x86_pmu_lbr *lbr = vcpu_to_lbr_records(vcpu);
+ int i;
+
+ for (i = 0; i < lbr->nr; i++) {
+ vmx_set_intercept_for_msr(vcpu, lbr->from + i, MSR_TYPE_RW, set);
+ vmx_set_intercept_for_msr(vcpu, lbr->to + i, MSR_TYPE_RW, set);
+ if (lbr->info)
+ vmx_set_intercept_for_msr(vcpu, lbr->info + i, MSR_TYPE_RW, set);
+ }
+
+ vmx_set_intercept_for_msr(vcpu, MSR_LBR_SELECT, MSR_TYPE_RW, set);
+ vmx_set_intercept_for_msr(vcpu, MSR_LBR_TOS, MSR_TYPE_RW, set);
+}
+
+static inline void vmx_disable_lbr_msrs_passthrough(struct kvm_vcpu *vcpu)
+{
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+
+ if (!lbr_desc->msr_passthrough)
+ return;
+
+ vmx_update_intercept_for_lbr_msrs(vcpu, true);
+ lbr_desc->msr_passthrough = false;
+}
+
+static inline void vmx_enable_lbr_msrs_passthrough(struct kvm_vcpu *vcpu)
+{
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+
+ if (lbr_desc->msr_passthrough)
+ return;
+
+ vmx_update_intercept_for_lbr_msrs(vcpu, false);
+ lbr_desc->msr_passthrough = true;
+}
+
+/*
+ * Higher priority host perf events (e.g. CPU pinned) could reclaim the
+ * PMU resources (e.g. LBR) that were assigned to the guest. This is
+ * usually done via IPI calls (see perf_install_in_context() for details).
+ *
+ * Before entering non-root mode (with IRQs disabled here), double check
+ * that the PMU features exposed to the guest have not been reclaimed by
+ * higher priority host events. If they have, disallow the vCPU's access
+ * to the reclaimed features.
+ */
+void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+
+ if (WARN_ON_ONCE(!lbr_desc))
+ return;
+
+ if (!lbr_desc->event) {
+ vmx_disable_lbr_msrs_passthrough(vcpu);
+ if (vmx_guest_debugctl_read() & DEBUGCTLMSR_LBR)
+ goto warn;
+ if (test_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use))
+ goto warn;
+ return;
+ }
+
+ if (lbr_desc->event->state < PERF_EVENT_STATE_ACTIVE) {
+ vmx_disable_lbr_msrs_passthrough(vcpu);
+ __clear_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use);
+ goto warn;
+ } else
+ vmx_enable_lbr_msrs_passthrough(vcpu);
+
+ return;
+
+warn:
+	pr_warn_ratelimited("vcpu-%d: failed to pass through LBR MSRs.\n", vcpu->vcpu_id);
+}
+
+static void intel_pmu_cleanup(struct kvm_vcpu *vcpu)
+{
+ if (!(vmx_guest_debugctl_read() & DEBUGCTLMSR_LBR))
+ intel_pmu_release_guest_lbr_event(vcpu);
+}
+
+void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu)
+{
+ struct kvm_pmc *pmc = NULL;
+ int bit, hw_idx;
+
+ kvm_for_each_pmc(pmu, pmc, bit, (unsigned long *)&pmu->global_ctrl) {
+ if (!pmc_is_locally_enabled(pmc) ||
+ !pmc_is_globally_enabled(pmc) || !pmc->perf_event)
+ continue;
+
+ /*
+ * A negative index indicates the event isn't mapped to a
+ * physical counter in the host, e.g. due to contention.
+ */
+ hw_idx = pmc->perf_event->hw.idx;
+ if (hw_idx != pmc->idx && hw_idx > -1)
+ pmu->host_cross_mapped_mask |= BIT_ULL(hw_idx);
+ }
+}
+
+struct kvm_pmu_ops intel_pmu_ops __initdata = {
+ .rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc,
+ .msr_idx_to_pmc = intel_msr_idx_to_pmc,
+ .is_valid_msr = intel_is_valid_msr,
+ .get_msr = intel_pmu_get_msr,
+ .set_msr = intel_pmu_set_msr,
+ .refresh = intel_pmu_refresh,
+ .init = intel_pmu_init,
+ .reset = intel_pmu_reset,
+ .deliver_pmi = intel_pmu_deliver_pmi,
+ .cleanup = intel_pmu_cleanup,
+ .EVENTSEL_EVENT = ARCH_PERFMON_EVENTSEL_EVENT,
+ .MAX_NR_GP_COUNTERS = KVM_MAX_NR_INTEL_GP_COUNTERS,
+ .MIN_NR_GP_COUNTERS = 1,
+};
diff --git a/arch/x86/kvm/vmx/pmu_intel.h b/arch/x86/kvm/vmx/pmu_intel.h
new file mode 100644
index 000000000000..5620d0882cdc
--- /dev/null
+++ b/arch/x86/kvm/vmx/pmu_intel.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMX_PMU_INTEL_H
+#define __KVM_X86_VMX_PMU_INTEL_H
+
+#include <linux/kvm_host.h>
+
+bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu);
+int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu);
+
+struct lbr_desc {
+ /* Basic info about guest LBR records. */
+ struct x86_pmu_lbr records;
+
+ /*
+ * Emulate LBR feature via passthrough LBR registers when the
+ * per-vcpu guest LBR event is scheduled on the current pcpu.
+ *
+ * The records may be inaccurate if the host reclaims the LBR.
+ */
+ struct perf_event *event;
+
+ /* True if LBRs are marked as not intercepted in the MSR bitmap */
+ bool msr_passthrough;
+};
+
+extern struct x86_pmu_lbr vmx_lbr_caps;
+
+#endif /* __KVM_X86_VMX_PMU_INTEL_H */
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
new file mode 100644
index 000000000000..4a6d9a17da23
--- /dev/null
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -0,0 +1,319 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_host.h>
+#include <linux/kvm_irqfd.h>
+
+#include <asm/irq_remapping.h>
+#include <asm/cpu.h>
+
+#include "lapic.h"
+#include "irq.h"
+#include "posted_intr.h"
+#include "trace.h"
+#include "vmx.h"
+#include "tdx.h"
+
+/*
+ * Maintain a per-CPU list of vCPUs that need to be awakened by wakeup_handler()
+ * when a WAKEUP_VECTOR interrupt is posted. vCPUs are added to the list when
+ * the vCPU is scheduled out and is blocking (e.g. in HLT) with IRQs enabled.
+ * The vCPU's posted interrupt descriptor is updated at the same time to set its
+ * notification vector to WAKEUP_VECTOR, so that posted interrupts from devices
+ * wake the target vCPU. vCPUs are removed from the list and the notification
+ * vector is reset when the vCPU is scheduled in.
+ */
+static DEFINE_PER_CPU(struct list_head, wakeup_vcpus_on_cpu);
+/*
+ * Protect the per-CPU list with a per-CPU spinlock to handle task migration.
+ * When a blocking vCPU is awakened _and_ migrated to a different pCPU, the
+ * ->sched_in() path will need to take the vCPU off the list of the _previous_
+ * CPU. IRQs must be disabled when taking this lock, otherwise deadlock will
+ * occur if a wakeup IRQ arrives and attempts to acquire the lock.
+ */
+static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock);
+
+#define PI_LOCK_SCHED_OUT SINGLE_DEPTH_NESTING
+
+static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
+{
+ return &(to_vt(vcpu)->pi_desc);
+}
+
+static int pi_try_set_control(struct pi_desc *pi_desc, u64 *pold, u64 new)
+{
+ /*
+ * PID.ON can be set at any time by a different vCPU or by hardware,
+ * e.g. a device. PID.control must be written atomically, and the
+	 * update must be retried with a fresh snapshot if an ON change causes
+ * the cmpxchg to fail.
+ */
+ if (!try_cmpxchg64(&pi_desc->control, pold, new))
+ return -EBUSY;
+
+ return 0;
+}
+
+void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ struct vcpu_vt *vt = to_vt(vcpu);
+ struct pi_desc old, new;
+ unsigned long flags;
+ unsigned int dest;
+
+ /*
+ * To simplify hot-plug and dynamic toggling of APICv, keep PI.NDST and
+ * PI.SN up-to-date even if there is no assigned device or if APICv is
+	 * deactivated due to a dynamic inhibit bit, e.g. for Hyper-V's SynIC.
+ */
+ if (!enable_apicv || !lapic_in_kernel(vcpu))
+ return;
+
+ /*
+ * If the vCPU wasn't on the wakeup list and wasn't migrated, then the
+ * full update can be skipped as neither the vector nor the destination
+ * needs to be changed. Clear SN even if there is no assigned device,
+ * again for simplicity.
+ */
+ if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) {
+ if (pi_test_and_clear_sn(pi_desc))
+ goto after_clear_sn;
+ return;
+ }
+
+ local_irq_save(flags);
+
+ /*
+ * If the vCPU was waiting for wakeup, remove the vCPU from the wakeup
+ * list of the _previous_ pCPU, which will not be the same as the
+ * current pCPU if the task was migrated.
+ */
+ if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) {
+ raw_spinlock_t *spinlock = &per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu);
+
+ /*
+ * In addition to taking the wakeup lock for the regular/IRQ
+ * context, tell lockdep it is being taken for the "sched out"
+	 * context as well. vCPU loads happen in task context, and
+ * this is taking the lock of the *previous* CPU, i.e. can race
+ * with both the scheduler and the wakeup handler.
+ */
+ raw_spin_lock(spinlock);
+ spin_acquire(&spinlock->dep_map, PI_LOCK_SCHED_OUT, 0, _RET_IP_);
+ list_del(&vt->pi_wakeup_list);
+ spin_release(&spinlock->dep_map, _RET_IP_);
+ raw_spin_unlock(spinlock);
+ }
+
+ dest = cpu_physical_id(cpu);
+ if (!x2apic_mode)
+ dest = (dest << 8) & 0xFF00;
+
+ old.control = READ_ONCE(pi_desc->control);
+ do {
+ new.control = old.control;
+
+ /*
+ * Clear SN (as above) and refresh the destination APIC ID to
+ * handle task migration (@cpu != vcpu->cpu).
+ */
+ new.ndst = dest;
+ __pi_clear_sn(&new);
+
+ /*
+ * Restore the notification vector; in the blocking case, the
+ * descriptor was modified on "put" to use the wakeup vector.
+ */
+ new.nv = POSTED_INTR_VECTOR;
+ } while (pi_try_set_control(pi_desc, &old.control, new.control));
+
+ local_irq_restore(flags);
+
+after_clear_sn:
+
+ /*
+ * Clear SN before reading the bitmap. The VT-d firmware
+ * writes the bitmap and reads SN atomically (5.2.3 in the
+ * spec), so it doesn't really have a memory barrier that
+ * pairs with this, but we cannot do that and we need one.
+ */
+ smp_mb__after_atomic();
+
+ if (!pi_is_pir_empty(pi_desc))
+ pi_set_on(pi_desc);
+}
+
+static bool vmx_can_use_vtd_pi(struct kvm *kvm)
+{
+ /*
+ * Note, reading the number of possible bypass IRQs can race with a
+ * bypass IRQ being attached to the VM. vmx_pi_start_bypass() ensures
+	 * blocking vCPUs will see an elevated count or get KVM_REQ_UNBLOCK.
+ */
+ return irqchip_in_kernel(kvm) && kvm_arch_has_irq_bypass() &&
+ READ_ONCE(kvm->arch.nr_possible_bypass_irqs);
+}
+
+/*
+ * Put the vCPU on this pCPU's list of vCPUs that need to be awakened and set
+ * WAKEUP as the notification vector in the PI descriptor.
+ */
+static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ struct vcpu_vt *vt = to_vt(vcpu);
+ struct pi_desc old, new;
+
+ lockdep_assert_irqs_disabled();
+
+ /*
+	 * Acquire the wakeup lock using the "sched out" context to work around
+ * a lockdep false positive. When this is called, schedule() holds
+ * various per-CPU scheduler locks. When the wakeup handler runs, it
+ * holds this CPU's wakeup lock while calling try_to_wake_up(), which
+ * can eventually take the aforementioned scheduler locks, which causes
+ * lockdep to assume there is deadlock.
+ *
+ * Deadlock can't actually occur because IRQs are disabled for the
+ * entirety of the sched_out critical section, i.e. the wakeup handler
+ * can't run while the scheduler locks are held.
+ */
+ raw_spin_lock_nested(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu),
+ PI_LOCK_SCHED_OUT);
+ list_add_tail(&vt->pi_wakeup_list,
+ &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu));
+ raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
+
+ WARN(pi_test_sn(pi_desc), "PI descriptor SN field set before blocking");
+
+ old.control = READ_ONCE(pi_desc->control);
+ do {
+ /* set 'NV' to 'wakeup vector' */
+ new.control = old.control;
+ new.nv = POSTED_INTR_WAKEUP_VECTOR;
+ } while (pi_try_set_control(pi_desc, &old.control, new.control));
+
+ /*
+ * Send a wakeup IPI to this CPU if an interrupt may have been posted
+ * before the notification vector was updated, in which case the IRQ
+ * will arrive on the non-wakeup vector. An IPI is needed as calling
+ * try_to_wake_up() from ->sched_out() isn't allowed (IRQs are not
+ * enabled until it is safe to call try_to_wake_up() on the task being
+ * scheduled out).
+ */
+ if (pi_test_on(&new))
+ __apic_send_IPI_self(POSTED_INTR_WAKEUP_VECTOR);
+}
+
+static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu)
+{
+ /*
+ * The default posted interrupt vector does nothing when
+ * invoked outside guest mode. Return whether a blocked vCPU
+ * can be the target of posted interrupts, as is the case when
+ * using either IPI virtualization or VT-d PI, so that the
+ * notification vector is switched to the one that calls
+ * back to the pi_wakeup_handler() function.
+ */
+ return (vmx_can_use_ipiv(vcpu) && !is_td_vcpu(vcpu)) ||
+ vmx_can_use_vtd_pi(vcpu->kvm);
+}
+
+void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+ if (!vmx_needs_pi_wakeup(vcpu))
+ return;
+
+ /*
+ * If the vCPU is blocking with IRQs enabled and ISN'T being preempted,
+ * enable the wakeup handler so that notification IRQ wakes the vCPU as
+ * expected. There is no need to enable the wakeup handler if the vCPU
+ * is preempted between setting its wait state and manually scheduling
+ * out, as the task is still runnable, i.e. doesn't need a wake event
+ * from KVM to be scheduled in.
+ *
+ * If the wakeup handler isn't being enabled, Suppress Notifications as
+ * the cost of propagating PIR.IRR to PID.ON is negligible compared to
+ * the cost of a spurious IRQ, and vCPU put/load is a slow path.
+ */
+ if (!vcpu->preempted && kvm_vcpu_is_blocking(vcpu) &&
+ ((is_td_vcpu(vcpu) && tdx_interrupt_allowed(vcpu)) ||
+ (!is_td_vcpu(vcpu) && !vmx_interrupt_blocked(vcpu))))
+ pi_enable_wakeup_handler(vcpu);
+ else
+ pi_set_sn(pi_desc);
+}
+
+/*
+ * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
+ */
+void pi_wakeup_handler(void)
+{
+ int cpu = smp_processor_id();
+ struct list_head *wakeup_list = &per_cpu(wakeup_vcpus_on_cpu, cpu);
+ raw_spinlock_t *spinlock = &per_cpu(wakeup_vcpus_on_cpu_lock, cpu);
+ struct vcpu_vt *vt;
+
+ raw_spin_lock(spinlock);
+ list_for_each_entry(vt, wakeup_list, pi_wakeup_list) {
+
+ if (pi_test_on(&vt->pi_desc))
+ kvm_vcpu_wake_up(vt_to_vcpu(vt));
+ }
+ raw_spin_unlock(spinlock);
+}
+
+void __init pi_init_cpu(int cpu)
+{
+ INIT_LIST_HEAD(&per_cpu(wakeup_vcpus_on_cpu, cpu));
+ raw_spin_lock_init(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
+}
+
+void pi_apicv_pre_state_restore(struct kvm_vcpu *vcpu)
+{
+ struct pi_desc *pi = vcpu_to_pi_desc(vcpu);
+
+ pi_clear_on(pi);
+ memset(pi->pir, 0, sizeof(pi->pir));
+}
+
+bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+ return pi_test_on(pi_desc) ||
+ (pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc));
+}
+
+
+/*
+ * Kick all vCPUs when the first possible bypass IRQ is attached to a VM, as
+ * blocking vCPUs may be scheduled out without reconfiguring PID.NV to the wakeup
+ * vector, i.e. if the bypass IRQ came along after vmx_vcpu_pi_put().
+ */
+void vmx_pi_start_bypass(struct kvm *kvm)
+{
+ if (WARN_ON_ONCE(!vmx_can_use_vtd_pi(kvm)))
+ return;
+
+ kvm_make_all_cpus_request(kvm, KVM_REQ_UNBLOCK);
+}
+
+int vmx_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+ unsigned int host_irq, uint32_t guest_irq,
+ struct kvm_vcpu *vcpu, u32 vector)
+{
+ if (vcpu) {
+ struct intel_iommu_pi_data pi_data = {
+ .pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)),
+ .vector = vector,
+ };
+
+ return irq_set_vcpu_affinity(host_irq, &pi_data);
+ } else {
+ return irq_set_vcpu_affinity(host_irq, NULL);
+ }
+}
diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h
new file mode 100644
index 000000000000..a4af39948cf0
--- /dev/null
+++ b/arch/x86/kvm/vmx/posted_intr.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMX_POSTED_INTR_H
+#define __KVM_X86_VMX_POSTED_INTR_H
+
+#include <linux/bitmap.h>
+#include <linux/find.h>
+#include <linux/kvm_host.h>
+
+#include <asm/posted_intr.h>
+
+void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu);
+void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu);
+void pi_wakeup_handler(void);
+void __init pi_init_cpu(int cpu);
+void pi_apicv_pre_state_restore(struct kvm_vcpu *vcpu);
+bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);
+int vmx_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+ unsigned int host_irq, uint32_t guest_irq,
+ struct kvm_vcpu *vcpu, u32 vector);
+void vmx_pi_start_bypass(struct kvm *kvm);
+
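+/*
+ * Find the highest pending vector in the 256-bit Posted Interrupt Request
+ * bitmap, or return -1 if no interrupt is posted.
+ */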
+static inline int pi_find_highest_vector(struct pi_desc *pi_desc)
+{
+ int vec;
+
+ vec = find_last_bit(pi_desc->pir, 256);
+ return vec < 256 ? vec : -1;
+}
+
+#endif /* __KVM_X86_VMX_POSTED_INTR_H */
diff --git a/arch/x86/kvm/vmx/run_flags.h b/arch/x86/kvm/vmx/run_flags.h
new file mode 100644
index 000000000000..6a87a12135fb
--- /dev/null
+++ b/arch/x86/kvm/vmx/run_flags.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMX_RUN_FLAGS_H
+#define __KVM_X86_VMX_RUN_FLAGS_H
+
+#define VMX_RUN_VMRESUME BIT(0)
+#define VMX_RUN_SAVE_SPEC_CTRL BIT(1)
+#define VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO BIT(2)
+
+#endif /* __KVM_X86_VMX_RUN_FLAGS_H */
diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c
new file mode 100644
index 000000000000..df1d0cf76947
--- /dev/null
+++ b/arch/x86/kvm/vmx/sgx.c
@@ -0,0 +1,510 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2021 Intel Corporation. */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <asm/msr.h>
+#include <asm/sgx.h>
+
+#include "x86.h"
+#include "kvm_cache_regs.h"
+#include "nested.h"
+#include "sgx.h"
+#include "vmx.h"
+
+bool __read_mostly enable_sgx = 1;
+module_param_named(sgx, enable_sgx, bool, 0444);
+
+/* Initial value of guest's virtual SGX_LEPUBKEYHASHn MSRs */
+static u64 sgx_pubkey_hash[4] __ro_after_init;
+
+/*
+ * ENCLS's memory operands use a fixed segment (DS) and a fixed
+ * address size based on the mode. Related prefixes are ignored.
+ */
+static int sgx_get_encls_gva(struct kvm_vcpu *vcpu, unsigned long offset,
+ int size, int alignment, gva_t *gva)
+{
+ struct kvm_segment s;
+ bool fault;
+
+ /* Skip vmcs.GUEST_DS retrieval for 64-bit mode to avoid VMREADs. */
+ *gva = offset;
+ if (!is_64_bit_mode(vcpu)) {
+ vmx_get_segment(vcpu, &s, VCPU_SREG_DS);
+ *gva += s.base;
+ }
+
+ if (!IS_ALIGNED(*gva, alignment)) {
+ fault = true;
+ } else if (likely(is_64_bit_mode(vcpu))) {
+ *gva = vmx_get_untagged_addr(vcpu, *gva, 0);
+ fault = is_noncanonical_address(*gva, vcpu, 0);
+ } else {
+ *gva &= 0xffffffff;
+ fault = (s.unusable) ||
+ (s.type != 2 && s.type != 3) ||
+ (*gva > s.limit) ||
+ ((s.base != 0 || s.limit != 0xffffffff) &&
+ (((u64)*gva + size - 1) > s.limit + 1));
+ }
+ if (fault)
+ kvm_inject_gp(vcpu, 0);
+ return fault ? -EINVAL : 0;
+}
+
+static void sgx_handle_emulation_failure(struct kvm_vcpu *vcpu, u64 addr,
+ unsigned int size)
+{
+ uint64_t data[2] = { addr, size };
+
+ __kvm_prepare_emulation_failure_exit(vcpu, data, ARRAY_SIZE(data));
+}
+
+static int sgx_read_hva(struct kvm_vcpu *vcpu, unsigned long hva, void *data,
+ unsigned int size)
+{
+ if (__copy_from_user(data, (void __user *)hva, size)) {
+ sgx_handle_emulation_failure(vcpu, hva, size);
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
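+/*
+ * Translate an ENCLS operand GVA to a GPA via the guest page tables. On
+ * failure, inject the resulting #PF and return -EFAULT so that the caller
+ * resumes the guest.
+ */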
+static int sgx_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t gva, bool write,
+ gpa_t *gpa)
+{
+ struct x86_exception ex;
+
+ if (write)
+ *gpa = kvm_mmu_gva_to_gpa_write(vcpu, gva, &ex);
+ else
+ *gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, &ex);
+
+ if (*gpa == INVALID_GPA) {
+ kvm_inject_emulated_page_fault(vcpu, &ex);
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+static int sgx_gpa_to_hva(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long *hva)
+{
+ *hva = kvm_vcpu_gfn_to_hva(vcpu, PFN_DOWN(gpa));
+ if (kvm_is_error_hva(*hva)) {
+ sgx_handle_emulation_failure(vcpu, gpa, 1);
+ return -EFAULT;
+ }
+
+ *hva |= gpa & ~PAGE_MASK;
+
+ return 0;
+}
+
+static int sgx_inject_fault(struct kvm_vcpu *vcpu, gva_t gva, int trapnr)
+{
+ struct x86_exception ex;
+
+ /*
+ * A non-EPCM #PF indicates a bad userspace HVA. This *should* check
+ * for PFEC.SGX and not assume any #PF on SGX2 originated in the EPC,
+ * but the error code isn't (yet) plumbed through the ENCLS helpers.
+ */
+ if (trapnr == PF_VECTOR && !boot_cpu_has(X86_FEATURE_SGX2)) {
+ kvm_prepare_emulation_failure_exit(vcpu);
+ return 0;
+ }
+
+ /*
+ * If the guest thinks it's running on SGX2 hardware, inject an SGX
+ * #PF if the fault matches an EPCM fault signature (#GP on SGX1,
+ * #PF on SGX2). The assumption is that EPCM faults are much more
+ * likely than a bad userspace address.
+ */
+ if ((trapnr == PF_VECTOR || !boot_cpu_has(X86_FEATURE_SGX2)) &&
+ guest_cpu_cap_has(vcpu, X86_FEATURE_SGX2)) {
+ memset(&ex, 0, sizeof(ex));
+ ex.vector = PF_VECTOR;
+ ex.error_code = PFERR_PRESENT_MASK | PFERR_WRITE_MASK |
+ PFERR_SGX_MASK;
+ ex.address = gva;
+ ex.error_code_valid = true;
+ ex.nested_page_fault = false;
+ kvm_inject_emulated_page_fault(vcpu, &ex);
+ } else {
+ kvm_inject_gp(vcpu, 0);
+ }
+ return 1;
+}
+
+static int __handle_encls_ecreate(struct kvm_vcpu *vcpu,
+ struct sgx_pageinfo *pageinfo,
+ unsigned long secs_hva,
+ gva_t secs_gva)
+{
+ struct sgx_secs *contents = (struct sgx_secs *)pageinfo->contents;
+ struct kvm_cpuid_entry2 *sgx_12_0, *sgx_12_1;
+ u64 attributes, xfrm, size;
+ u32 miscselect;
+ u8 max_size_log2;
+ int trapnr, ret;
+
+ sgx_12_0 = kvm_find_cpuid_entry_index(vcpu, 0x12, 0);
+ sgx_12_1 = kvm_find_cpuid_entry_index(vcpu, 0x12, 1);
+ if (!sgx_12_0 || !sgx_12_1) {
+ kvm_prepare_emulation_failure_exit(vcpu);
+ return 0;
+ }
+
+ miscselect = contents->miscselect;
+ attributes = contents->attributes;
+ xfrm = contents->xfrm;
+ size = contents->size;
+
+ /* Enforce restriction of access to the PROVISIONKEY. */
+ if (!vcpu->kvm->arch.sgx_provisioning_allowed &&
+ (attributes & SGX_ATTR_PROVISIONKEY)) {
+ if (sgx_12_1->eax & SGX_ATTR_PROVISIONKEY)
+ pr_warn_once("SGX PROVISIONKEY advertised but not allowed\n");
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ /*
+ * Enforce CPUID restrictions on MISCSELECT, ATTRIBUTES and XFRM. Note
+ * that the allowed XFRM (XFeature Request Mask) isn't strictly bound
+ * by the supported XCR0. FP+SSE *must* be set in XFRM, even if XSAVE
+ * is unsupported, i.e. even if XCR0 itself is completely unsupported.
+ */
+ if ((u32)miscselect & ~sgx_12_0->ebx ||
+ (u32)attributes & ~sgx_12_1->eax ||
+ (u32)(attributes >> 32) & ~sgx_12_1->ebx ||
+ (u32)xfrm & ~sgx_12_1->ecx ||
+ (u32)(xfrm >> 32) & ~sgx_12_1->edx ||
+ xfrm & ~(vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE) ||
+ (xfrm & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ /* Enforce CPUID restriction on max enclave size. */
+ max_size_log2 = (attributes & SGX_ATTR_MODE64BIT) ? sgx_12_0->edx >> 8 :
+ sgx_12_0->edx;
+ if (size >= BIT_ULL(max_size_log2)) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ /*
+ * sgx_virt_ecreate() returns:
+ * 1) 0: ECREATE was successful
+ * 2) -EFAULT: ECREATE was run but faulted, and trapnr was set to the
+ * exception number.
+ * 3) -EINVAL: access_ok() on @secs_hva failed. This should never
+ * happen as KVM checks host addresses at memslot creation.
+ * sgx_virt_ecreate() has already warned in this case.
+ */
+ ret = sgx_virt_ecreate(pageinfo, (void __user *)secs_hva, &trapnr);
+ if (!ret)
+ return kvm_skip_emulated_instruction(vcpu);
+ if (ret == -EFAULT)
+ return sgx_inject_fault(vcpu, secs_gva, trapnr);
+
+ return ret;
+}
+
+static int handle_encls_ecreate(struct kvm_vcpu *vcpu)
+{
+ gva_t pageinfo_gva, secs_gva;
+ gva_t metadata_gva, contents_gva;
+ gpa_t metadata_gpa, contents_gpa, secs_gpa;
+ unsigned long metadata_hva, contents_hva, secs_hva;
+ struct sgx_pageinfo pageinfo;
+ struct sgx_secs *contents;
+ struct x86_exception ex;
+ int r;
+
+ if (sgx_get_encls_gva(vcpu, kvm_rbx_read(vcpu), 32, 32, &pageinfo_gva) ||
+ sgx_get_encls_gva(vcpu, kvm_rcx_read(vcpu), 4096, 4096, &secs_gva))
+ return 1;
+
+ /*
+	 * Copy the PAGEINFO to local memory; its pointers need to be
+ * translated, i.e. we need to do a deep copy/translate.
+ */
+ r = kvm_read_guest_virt(vcpu, pageinfo_gva, &pageinfo,
+ sizeof(pageinfo), &ex);
+ if (r == X86EMUL_PROPAGATE_FAULT) {
+ kvm_inject_emulated_page_fault(vcpu, &ex);
+ return 1;
+ } else if (r != X86EMUL_CONTINUE) {
+ sgx_handle_emulation_failure(vcpu, pageinfo_gva,
+ sizeof(pageinfo));
+ return 0;
+ }
+
+ if (sgx_get_encls_gva(vcpu, pageinfo.metadata, 64, 64, &metadata_gva) ||
+ sgx_get_encls_gva(vcpu, pageinfo.contents, 4096, 4096,
+ &contents_gva))
+ return 1;
+
+ /*
+ * Translate the SECINFO, SOURCE and SECS pointers from GVA to GPA.
+ * Resume the guest on failure to inject a #PF.
+ */
+ if (sgx_gva_to_gpa(vcpu, metadata_gva, false, &metadata_gpa) ||
+ sgx_gva_to_gpa(vcpu, contents_gva, false, &contents_gpa) ||
+ sgx_gva_to_gpa(vcpu, secs_gva, true, &secs_gpa))
+ return 1;
+
+ /*
+ * ...and then to HVA. The order of accesses isn't architectural, i.e.
+ * KVM doesn't have to fully process one address at a time. Exit to
+ * userspace if a GPA is invalid.
+ */
+ if (sgx_gpa_to_hva(vcpu, metadata_gpa, &metadata_hva) ||
+ sgx_gpa_to_hva(vcpu, contents_gpa, &contents_hva) ||
+ sgx_gpa_to_hva(vcpu, secs_gpa, &secs_hva))
+ return 0;
+
+ /*
+ * Copy contents into kernel memory to prevent TOCTOU attack. E.g. the
+ * guest could do ECREATE w/ SECS.SGX_ATTR_PROVISIONKEY=0, and
+ * simultaneously set SGX_ATTR_PROVISIONKEY to bypass the check to
+ * enforce restriction of access to the PROVISIONKEY.
+ */
+ contents = (struct sgx_secs *)__get_free_page(GFP_KERNEL);
+ if (!contents)
+ return -ENOMEM;
+
+ /* Exit to userspace if copying from a host userspace address fails. */
+ if (sgx_read_hva(vcpu, contents_hva, (void *)contents, PAGE_SIZE)) {
+ free_page((unsigned long)contents);
+ return 0;
+ }
+
+ pageinfo.metadata = metadata_hva;
+ pageinfo.contents = (u64)contents;
+
+ r = __handle_encls_ecreate(vcpu, &pageinfo, secs_hva, secs_gva);
+
+ free_page((unsigned long)contents);
+
+ return r;
+}
+
+static int handle_encls_einit(struct kvm_vcpu *vcpu)
+{
+ unsigned long sig_hva, secs_hva, token_hva, rflags;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ gva_t sig_gva, secs_gva, token_gva;
+ gpa_t sig_gpa, secs_gpa, token_gpa;
+ int ret, trapnr;
+
+ if (sgx_get_encls_gva(vcpu, kvm_rbx_read(vcpu), 1808, 4096, &sig_gva) ||
+ sgx_get_encls_gva(vcpu, kvm_rcx_read(vcpu), 4096, 4096, &secs_gva) ||
+ sgx_get_encls_gva(vcpu, kvm_rdx_read(vcpu), 304, 512, &token_gva))
+ return 1;
+
+ /*
+ * Translate the SIGSTRUCT, SECS and TOKEN pointers from GVA to GPA.
+ * Resume the guest on failure to inject a #PF.
+ */
+ if (sgx_gva_to_gpa(vcpu, sig_gva, false, &sig_gpa) ||
+ sgx_gva_to_gpa(vcpu, secs_gva, true, &secs_gpa) ||
+ sgx_gva_to_gpa(vcpu, token_gva, false, &token_gpa))
+ return 1;
+
+ /*
+ * ...and then to HVA. The order of accesses isn't architectural, i.e.
+ * KVM doesn't have to fully process one address at a time. Exit to
+ * userspace if a GPA is invalid. Note, all structures are aligned and
+ * cannot split pages.
+ */
+ if (sgx_gpa_to_hva(vcpu, sig_gpa, &sig_hva) ||
+ sgx_gpa_to_hva(vcpu, secs_gpa, &secs_hva) ||
+ sgx_gpa_to_hva(vcpu, token_gpa, &token_hva))
+ return 0;
+
+ ret = sgx_virt_einit((void __user *)sig_hva, (void __user *)token_hva,
+ (void __user *)secs_hva,
+ vmx->msr_ia32_sgxlepubkeyhash, &trapnr);
+
+ if (ret == -EFAULT)
+ return sgx_inject_fault(vcpu, secs_gva, trapnr);
+
+ /*
+ * sgx_virt_einit() returns -EINVAL when access_ok() fails on @sig_hva,
+ * @token_hva or @secs_hva. This should never happen as KVM checks host
+ * addresses at memslot creation. sgx_virt_einit() has already warned
+ * in this case, so just return.
+ */
+ if (ret < 0)
+ return ret;
+
+ rflags = vmx_get_rflags(vcpu) & ~(X86_EFLAGS_CF | X86_EFLAGS_PF |
+ X86_EFLAGS_AF | X86_EFLAGS_SF |
+ X86_EFLAGS_OF);
+ if (ret)
+ rflags |= X86_EFLAGS_ZF;
+ else
+ rflags &= ~X86_EFLAGS_ZF;
+ vmx_set_rflags(vcpu, rflags);
+
+ kvm_rax_write(vcpu, ret);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static inline bool encls_leaf_enabled_in_guest(struct kvm_vcpu *vcpu, u32 leaf)
+{
+ /*
+ * ENCLS generates a #UD if SGX1 isn't supported, i.e. this point will
+ * be reached if and only if the SGX1 leafs are enabled.
+ */
+ if (leaf >= ECREATE && leaf <= ETRACK)
+ return true;
+
+ if (leaf >= EAUG && leaf <= EMODT)
+ return guest_cpu_cap_has(vcpu, X86_FEATURE_SGX2);
+
+ return false;
+}
+
+static inline bool sgx_enabled_in_guest_bios(struct kvm_vcpu *vcpu)
+{
+ const u64 bits = FEAT_CTL_SGX_ENABLED | FEAT_CTL_LOCKED;
+
+ return (to_vmx(vcpu)->msr_ia32_feature_control & bits) == bits;
+}
+
+int handle_encls(struct kvm_vcpu *vcpu)
+{
+ u32 leaf = (u32)kvm_rax_read(vcpu);
+
+ if (!enable_sgx || !guest_cpu_cap_has(vcpu, X86_FEATURE_SGX) ||
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_SGX1)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ } else if (!encls_leaf_enabled_in_guest(vcpu, leaf) ||
+ !sgx_enabled_in_guest_bios(vcpu) || !is_paging(vcpu)) {
+ kvm_inject_gp(vcpu, 0);
+ } else {
+ if (leaf == ECREATE)
+ return handle_encls_ecreate(vcpu);
+ if (leaf == EINIT)
+ return handle_encls_einit(vcpu);
+ WARN_ONCE(1, "unexpected exit on ENCLS[%u]", leaf);
+ vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
+ vcpu->run->hw.hardware_exit_reason = EXIT_REASON_ENCLS;
+ return 0;
+ }
+ return 1;
+}
+
+void setup_default_sgx_lepubkeyhash(void)
+{
+ /*
+ * Use Intel's default value for Skylake hardware if Launch Control is
+ * not supported, i.e. Intel's hash is hardcoded into silicon, or if
+ * Launch Control is supported and enabled, i.e. mimic the reset value
+ * and let the guest write the MSRs at will. If Launch Control is
+ * supported but disabled, then use the current MSR values as the hash
+ * MSRs exist but are read-only (locked and not writable).
+ */
+ if (!enable_sgx || boot_cpu_has(X86_FEATURE_SGX_LC) ||
+ rdmsrq_safe(MSR_IA32_SGXLEPUBKEYHASH0, &sgx_pubkey_hash[0])) {
+ sgx_pubkey_hash[0] = 0xa6053e051270b7acULL;
+ sgx_pubkey_hash[1] = 0x6cfbe8ba8b3b413dULL;
+ sgx_pubkey_hash[2] = 0xc4916d99f2b3735dULL;
+ sgx_pubkey_hash[3] = 0xd4f8c05909f9bb3bULL;
+ } else {
+ /* MSR_IA32_SGXLEPUBKEYHASH0 is read above */
+ rdmsrq(MSR_IA32_SGXLEPUBKEYHASH1, sgx_pubkey_hash[1]);
+ rdmsrq(MSR_IA32_SGXLEPUBKEYHASH2, sgx_pubkey_hash[2]);
+ rdmsrq(MSR_IA32_SGXLEPUBKEYHASH3, sgx_pubkey_hash[3]);
+ }
+}
+
+void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ memcpy(vmx->msr_ia32_sgxlepubkeyhash, sgx_pubkey_hash,
+ sizeof(sgx_pubkey_hash));
+}
+
+/*
+ * ECREATE must be intercepted to enforce MISCSELECT, ATTRIBUTES and XFRM
+ * restrictions if the guest's allowed-1 settings diverge from hardware.
+ */
+static bool sgx_intercept_encls_ecreate(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *guest_cpuid;
+ u32 eax, ebx, ecx, edx;
+
+ if (!vcpu->kvm->arch.sgx_provisioning_allowed)
+ return true;
+
+ guest_cpuid = kvm_find_cpuid_entry_index(vcpu, 0x12, 0);
+ if (!guest_cpuid)
+ return true;
+
+ cpuid_count(0x12, 0, &eax, &ebx, &ecx, &edx);
+ if (guest_cpuid->ebx != ebx || guest_cpuid->edx != edx)
+ return true;
+
+ guest_cpuid = kvm_find_cpuid_entry_index(vcpu, 0x12, 1);
+ if (!guest_cpuid)
+ return true;
+
+ cpuid_count(0x12, 1, &eax, &ebx, &ecx, &edx);
+ if (guest_cpuid->eax != eax || guest_cpuid->ebx != ebx ||
+ guest_cpuid->ecx != ecx || guest_cpuid->edx != edx)
+ return true;
+
+ return false;
+}
+
+void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+ /*
+ * There is no software enable bit for SGX that is virtualized by
+ * hardware, e.g. there's no CR4.SGXE, so when SGX is disabled in the
+ * guest (either by the host or by the guest's BIOS) but enabled in the
+ * host, trap all ENCLS leafs and inject #UD/#GP as needed to emulate
+ * the expected system behavior for ENCLS.
+ */
+ u64 bitmap = -1ull;
+
+ /* Nothing to do if hardware doesn't support SGX */
+ if (!cpu_has_vmx_encls_vmexit())
+ return;
+
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX) &&
+ sgx_enabled_in_guest_bios(vcpu)) {
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX1)) {
+ bitmap &= ~GENMASK_ULL(ETRACK, ECREATE);
+ if (sgx_intercept_encls_ecreate(vcpu))
+ bitmap |= (1 << ECREATE);
+ }
+
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX2))
+ bitmap &= ~GENMASK_ULL(EMODT, EAUG);
+
+ /*
+ * Trap and execute EINIT if launch control is enabled in the
+ * host using the guest's values for launch control MSRs, even
+ * if the guest's values are fixed to hardware default values.
+ * The MSRs are not loaded/saved on VM-Enter/VM-Exit as writing
+ * the MSRs is extraordinarily expensive.
+ */
+ if (boot_cpu_has(X86_FEATURE_SGX_LC))
+ bitmap |= (1 << EINIT);
+
+ if (!vmcs12 && is_guest_mode(vcpu))
+ vmcs12 = get_vmcs12(vcpu);
+ if (vmcs12 && nested_cpu_has_encls_exit(vmcs12))
+ bitmap |= vmcs12->encls_exiting_bitmap;
+ }
+ vmcs_write64(ENCLS_EXITING_BITMAP, bitmap);
+}
diff --git a/arch/x86/kvm/vmx/sgx.h b/arch/x86/kvm/vmx/sgx.h
new file mode 100644
index 000000000000..a400888b376d
--- /dev/null
+++ b/arch/x86/kvm/vmx/sgx.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_SGX_H
+#define __KVM_X86_SGX_H
+
+#include <linux/kvm_host.h>
+
+#include "capabilities.h"
+#include "vmx_ops.h"
+
+#ifdef CONFIG_X86_SGX_KVM
+extern bool __read_mostly enable_sgx;
+
+int handle_encls(struct kvm_vcpu *vcpu);
+
+void setup_default_sgx_lepubkeyhash(void);
+void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu);
+
+void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12);
+#else
+#define enable_sgx 0
+
+static inline void setup_default_sgx_lepubkeyhash(void) { }
+static inline void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu) { }
+
+static inline void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ /* Nothing to do if hardware doesn't support SGX */
+ if (cpu_has_vmx_encls_vmexit())
+ vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
+}
+#endif
+
+#endif /* __KVM_X86_SGX_H */
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
new file mode 100644
index 000000000000..2d7a4d52ccfb
--- /dev/null
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -0,0 +1,3613 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/cleanup.h>
+#include <linux/cpu.h>
+#include <asm/cpufeature.h>
+#include <asm/fpu/xcr.h>
+#include <linux/misc_cgroup.h>
+#include <linux/mmu_context.h>
+#include <asm/tdx.h>
+#include "capabilities.h"
+#include "mmu.h"
+#include "x86_ops.h"
+#include "lapic.h"
+#include "tdx.h"
+#include "vmx.h"
+#include "mmu/spte.h"
+#include "common.h"
+#include "posted_intr.h"
+#include "irq.h"
+#include <trace/events/kvm.h>
+#include "trace.h"
+
+#pragma GCC poison to_vmx
+
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
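+/*
+ * Assert that a SEAMCALL succeeded. On an unexpected error, mark the VM as
+ * bugged (if a VM is provided) and log the SEAMCALL name, the error code,
+ * and any named operands (rate limited).
+ */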
+#define __TDX_BUG_ON(__err, __f, __kvm, __fmt, __args...) \
+({ \
+ struct kvm *_kvm = (__kvm); \
+ bool __ret = !!(__err); \
+ \
+ if (WARN_ON_ONCE(__ret && (!_kvm || !_kvm->vm_bugged))) { \
+ if (_kvm) \
+ kvm_vm_bugged(_kvm); \
+ pr_err_ratelimited("SEAMCALL " __f " failed: 0x%llx" __fmt "\n",\
+ __err, __args); \
+ } \
+ unlikely(__ret); \
+})
+
+#define TDX_BUG_ON(__err, __fn, __kvm) \
+ __TDX_BUG_ON(__err, #__fn, __kvm, "%s", "")
+
+#define TDX_BUG_ON_1(__err, __fn, a1, __kvm) \
+ __TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx", a1)
+
+#define TDX_BUG_ON_2(__err, __fn, a1, a2, __kvm) \
+ __TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx, " #a2 " 0x%llx", a1, a2)
+
+#define TDX_BUG_ON_3(__err, __fn, a1, a2, a3, __kvm) \
+	__TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx, " #a2 " 0x%llx, " #a3 " 0x%llx", \
+ a1, a2, a3)
+
+
+bool enable_tdx __ro_after_init;
+module_param_named(tdx, enable_tdx, bool, 0444);
+
+#define TDX_SHARED_BIT_PWL_5 gpa_to_gfn(BIT_ULL(51))
+#define TDX_SHARED_BIT_PWL_4 gpa_to_gfn(BIT_ULL(47))
+
+static enum cpuhp_state tdx_cpuhp_state;
+
+static const struct tdx_sys_info *tdx_sysinfo;
+
+void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err)
+{
+ KVM_BUG_ON(1, tdx->vcpu.kvm);
+ pr_err("TDH_VP_RD[%s.0x%x] failed 0x%llx\n", uclass, field, err);
+}
+
+void tdh_vp_wr_failed(struct vcpu_tdx *tdx, char *uclass, char *op, u32 field,
+ u64 val, u64 err)
+{
+ KVM_BUG_ON(1, tdx->vcpu.kvm);
+ pr_err("TDH_VP_WR[%s.0x%x]%s0x%llx failed: 0x%llx\n", uclass, field, op, val, err);
+}
+
+#define KVM_SUPPORTED_TD_ATTRS (TDX_TD_ATTR_SEPT_VE_DISABLE)
+
+static __always_inline struct kvm_tdx *to_kvm_tdx(struct kvm *kvm)
+{
+ return container_of(kvm, struct kvm_tdx, kvm);
+}
+
+static __always_inline struct vcpu_tdx *to_tdx(struct kvm_vcpu *vcpu)
+{
+ return container_of(vcpu, struct vcpu_tdx, vcpu);
+}
+
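+/*
+ * Compute the TD attributes KVM can offer: all attributes_fixed1 bits must
+ * be supported by KVM (else return 0), and the result is limited to the
+ * attributes_fixed0 mask.
+ */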
+static u64 tdx_get_supported_attrs(const struct tdx_sys_info_td_conf *td_conf)
+{
+ u64 val = KVM_SUPPORTED_TD_ATTRS;
+
+ if ((val & td_conf->attributes_fixed1) != td_conf->attributes_fixed1)
+ return 0;
+
+ val &= td_conf->attributes_fixed0;
+
+ return val;
+}
+
+static u64 tdx_get_supported_xfam(const struct tdx_sys_info_td_conf *td_conf)
+{
+ u64 val = kvm_caps.supported_xcr0 | kvm_caps.supported_xss;
+
+ if ((val & td_conf->xfam_fixed1) != td_conf->xfam_fixed1)
+ return 0;
+
+ val &= td_conf->xfam_fixed0;
+
+ return val;
+}
+
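+/* Guest physical address width lives in CPUID.0x80000008:EAX[23:16]. */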
+static int tdx_get_guest_phys_addr_bits(const u32 eax)
+{
+ return (eax & GENMASK(23, 16)) >> 16;
+}
+
+static u32 tdx_set_guest_phys_addr_bits(const u32 eax, int addr_bits)
+{
+ return (eax & ~GENMASK(23, 16)) | (addr_bits & 0xff) << 16;
+}
+
+#define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM))
+
+static bool has_tsx(const struct kvm_cpuid_entry2 *entry)
+{
+ return entry->function == 7 && entry->index == 0 &&
+ (entry->ebx & TDX_FEATURE_TSX);
+}
+
+static void clear_tsx(struct kvm_cpuid_entry2 *entry)
+{
+ entry->ebx &= ~TDX_FEATURE_TSX;
+}
+
+static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry)
+{
+ return entry->function == 7 && entry->index == 0 &&
+ (entry->ecx & __feature_bit(X86_FEATURE_WAITPKG));
+}
+
+static void clear_waitpkg(struct kvm_cpuid_entry2 *entry)
+{
+ entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG);
+}
+
+static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry)
+{
+ if (has_tsx(entry))
+ clear_tsx(entry);
+
+ if (has_waitpkg(entry))
+ clear_waitpkg(entry);
+}
+
+static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry)
+{
+ return has_tsx(entry) || has_waitpkg(entry);
+}
+
+#define KVM_TDX_CPUID_NO_SUBLEAF ((__u32)-1)
+
+static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char idx)
+{
+ const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
+
+ entry->function = (u32)td_conf->cpuid_config_leaves[idx];
+ entry->index = td_conf->cpuid_config_leaves[idx] >> 32;
+ entry->eax = (u32)td_conf->cpuid_config_values[idx][0];
+ entry->ebx = td_conf->cpuid_config_values[idx][0] >> 32;
+ entry->ecx = (u32)td_conf->cpuid_config_values[idx][1];
+ entry->edx = td_conf->cpuid_config_values[idx][1] >> 32;
+
+ if (entry->index == KVM_TDX_CPUID_NO_SUBLEAF)
+ entry->index = 0;
+
+ /*
+ * The TDX module doesn't allow configuring the guest phys addr bits
+ * (EAX[23:16]). However, KVM uses it as an interface to the userspace
+ * to configure the GPAW. Report these bits as configurable.
+ */
+ if (entry->function == 0x80000008)
+ entry->eax = tdx_set_guest_phys_addr_bits(entry->eax, 0xff);
+
+ tdx_clear_unsupported_cpuid(entry);
+}
+
+#define TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT BIT(1)
+
+static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf,
+ struct kvm_tdx_capabilities *caps)
+{
+ int i;
+
+ caps->supported_attrs = tdx_get_supported_attrs(td_conf);
+ if (!caps->supported_attrs)
+ return -EIO;
+
+ caps->supported_xfam = tdx_get_supported_xfam(td_conf);
+ if (!caps->supported_xfam)
+ return -EIO;
+
+ caps->cpuid.nent = td_conf->num_cpuid_config;
+
+ caps->user_tdvmcallinfo_1_r11 =
+ TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT;
+
+ for (i = 0; i < td_conf->num_cpuid_config; i++)
+ td_init_cpuid_entry2(&caps->cpuid.entries[i], i);
+
+ return 0;
+}
+
+/*
+ * Some SEAMCALLs acquire the TDX module globally, and can fail with
+ * TDX_OPERAND_BUSY. Use a global mutex to serialize these SEAMCALLs.
+ */
+static DEFINE_MUTEX(tdx_lock);
+
+static atomic_t nr_configured_hkid;
+
+static bool tdx_operand_busy(u64 err)
+{
+ return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY;
+}
+
+
+/*
+ * A per-CPU list of TD vCPUs associated with a given CPU.
+ * Protected by interrupt mask. Only manipulated by the CPU owning this per-CPU
+ * list.
+ * - When a vCPU is loaded onto a CPU, it is removed from the per-CPU list of
+ * the old CPU during the IPI callback running on the old CPU, and then added
+ * to the per-CPU list of the new CPU.
+ * - When a TD is tearing down, all vCPUs are disassociated from their current
+ * running CPUs and removed from the per-CPU list during the IPI callback
+ * running on those CPUs.
+ * - When a CPU is brought down, traverse the per-CPU list to disassociate all
+ * associated TD vCPUs and remove them from the per-CPU list.
+ */
+static DEFINE_PER_CPU(struct list_head, associated_tdvcpus);
+
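+/*
+ * TDVMCALL register conventions as used below: R10 holds the exit type on
+ * input and the return code on output; R11 holds the leaf number on input
+ * and the return value on output.
+ */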
+static __always_inline unsigned long tdvmcall_exit_type(struct kvm_vcpu *vcpu)
+{
+ return to_tdx(vcpu)->vp_enter_args.r10;
+}
+
+static __always_inline unsigned long tdvmcall_leaf(struct kvm_vcpu *vcpu)
+{
+ return to_tdx(vcpu)->vp_enter_args.r11;
+}
+
+static __always_inline void tdvmcall_set_return_code(struct kvm_vcpu *vcpu,
+ long val)
+{
+ to_tdx(vcpu)->vp_enter_args.r10 = val;
+}
+
+static __always_inline void tdvmcall_set_return_val(struct kvm_vcpu *vcpu,
+ unsigned long val)
+{
+ to_tdx(vcpu)->vp_enter_args.r11 = val;
+}
+
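+/* Free the TD's private host KeyID (HKID) and drop the misc cgroup charge. */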
+static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx)
+{
+ tdx_guest_keyid_free(kvm_tdx->hkid);
+ kvm_tdx->hkid = -1;
+ atomic_dec(&nr_configured_hkid);
+ misc_cg_uncharge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
+ put_misc_cg(kvm_tdx->misc_cg);
+ kvm_tdx->misc_cg = NULL;
+}
+
+static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx)
+{
+ return kvm_tdx->hkid > 0;
+}
+
+static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
+{
+ lockdep_assert_irqs_disabled();
+
+ list_del(&to_tdx(vcpu)->cpu_list);
+
+ /*
+ * Ensure tdx->cpu_list is updated before setting vcpu->cpu to -1,
+ * otherwise, a different CPU can see vcpu->cpu = -1 and add the vCPU
+ * to its list before it's deleted from this CPU's list.
+ */
+ smp_wmb();
+
+ vcpu->cpu = -1;
+}
+
+/*
+ * Execute a SEAMCALL related to removing/blocking S-EPT entries, with a single
+ * retry (if necessary) after forcing vCPUs to exit and wait for the operation
+ * to complete. All flows that remove/block S-EPT entries run with mmu_lock
+ * held for write, i.e. are mutually exclusive with each other, but they aren't
+ * mutually exclusive with running vCPUs, and so can fail with "operand busy"
+ * if a vCPU acquires a relevant lock in the TDX-Module, e.g. when doing TDCALL.
+ *
+ * Note, the retry is guaranteed to succeed, absent KVM and/or TDX-Module bugs.
+ */
+#define tdh_do_no_vcpus(tdh_func, kvm, args...) \
+({ \
+ struct kvm_tdx *__kvm_tdx = to_kvm_tdx(kvm); \
+ u64 __err; \
+ \
+ lockdep_assert_held_write(&kvm->mmu_lock); \
+ \
+ __err = tdh_func(args); \
+ if (unlikely(tdx_operand_busy(__err))) { \
+ WRITE_ONCE(__kvm_tdx->wait_for_sept_zap, true); \
+ kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); \
+ \
+ __err = tdh_func(args); \
+ \
+ WRITE_ONCE(__kvm_tdx->wait_for_sept_zap, false); \
+ } \
+ __err; \
+})
+
+/* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */
+static int __tdx_reclaim_page(struct page *page)
+{
+ u64 err, rcx, rdx, r8;
+
+ err = tdh_phymem_page_reclaim(page, &rcx, &rdx, &r8);
+
+ /*
+ * No need to check for TDX_OPERAND_BUSY; all TD pages are freed
+ * before the HKID is released and control pages have also been
+ * released at this point, so there is no possibility of contention.
+ */
+ if (TDX_BUG_ON_3(err, TDH_PHYMEM_PAGE_RECLAIM, rcx, rdx, r8, NULL))
+ return -EIO;
+
+ return 0;
+}
+
+static int tdx_reclaim_page(struct page *page)
+{
+ int r;
+
+ r = __tdx_reclaim_page(page);
+ if (!r)
+ tdx_quirk_reset_page(page);
+ return r;
+}
+
+
+/*
+ * Reclaim the TD control page(s) which are crypto-protected by TDX guest's
+ * private KeyID. Assume the cache associated with the TDX private KeyID has
+ * been flushed.
+ */
+static void tdx_reclaim_control_page(struct page *ctrl_page)
+{
+ /*
+ * Leak the page if the kernel failed to reclaim the page.
+ * The kernel cannot use it safely anymore.
+ */
+ if (tdx_reclaim_page(ctrl_page))
+ return;
+
+ __free_page(ctrl_page);
+}
+
+struct tdx_flush_vp_arg {
+ struct kvm_vcpu *vcpu;
+ u64 err;
+};
+
+static void tdx_flush_vp(void *_arg)
+{
+ struct tdx_flush_vp_arg *arg = _arg;
+ struct kvm_vcpu *vcpu = arg->vcpu;
+ u64 err;
+
+ arg->err = 0;
+ lockdep_assert_irqs_disabled();
+
+ /* Task migration can race with CPU offlining. */
+ if (unlikely(vcpu->cpu != raw_smp_processor_id()))
+ return;
+
+ /*
+ * No need to do TDH_VP_FLUSH if the vCPU hasn't been initialized. The
+ * list tracking still needs to be updated so that it's correct if/when
+ * the vCPU does get initialized.
+ */
+ if (to_tdx(vcpu)->state != VCPU_TD_STATE_UNINITIALIZED) {
+ /*
+ * No need to retry. TDX Resources needed for TDH.VP.FLUSH are:
+ * TDVPR as exclusive, TDR as shared, and TDCS as shared. This
+		 * vp flush function is called when destroying the vCPU/TD or
+		 * during vCPU migration. No other thread uses TDVPR in those cases.
+ */
+ err = tdh_vp_flush(&to_tdx(vcpu)->vp);
+ if (unlikely(err && err != TDX_VCPU_NOT_ASSOCIATED)) {
+ /*
+			 * This function is called in IPI context. Do not use
+			 * printk, so as to avoid taking the console semaphore.
+			 * The caller prints out the error message instead.
+ */
+ if (err)
+ arg->err = err;
+ }
+ }
+
+ tdx_disassociate_vp(vcpu);
+}
+
+static void tdx_flush_vp_on_cpu(struct kvm_vcpu *vcpu)
+{
+ struct tdx_flush_vp_arg arg = {
+ .vcpu = vcpu,
+ };
+ int cpu = vcpu->cpu;
+
+ if (unlikely(cpu == -1))
+ return;
+
+ smp_call_function_single(cpu, tdx_flush_vp, &arg, 1);
+
+ TDX_BUG_ON(arg.err, TDH_VP_FLUSH, vcpu->kvm);
+}
+
+void tdx_disable_virtualization_cpu(void)
+{
+ int cpu = raw_smp_processor_id();
+ struct list_head *tdvcpus = &per_cpu(associated_tdvcpus, cpu);
+ struct tdx_flush_vp_arg arg;
+ struct vcpu_tdx *tdx, *tmp;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ /* Safe variant needed as tdx_disassociate_vp() deletes the entry. */
+ list_for_each_entry_safe(tdx, tmp, tdvcpus, cpu_list) {
+ arg.vcpu = &tdx->vcpu;
+ tdx_flush_vp(&arg);
+ }
+ local_irq_restore(flags);
+
+ /*
+ * Flush cache now if kexec is possible: this is necessary to avoid
+ * having dirty private memory cachelines when the new kernel boots,
+ * but WBINVD is a relatively expensive operation and doing it during
+ * kexec can exacerbate races in native_stop_other_cpus(). Do it
+ * now, since this is a safe moment and there is going to be no more
+ * TDX activity on this CPU from this point on.
+ */
+ tdx_cpu_flush_cache_for_kexec();
+}
+
+#define TDX_SEAMCALL_RETRIES 10000
+
+static void smp_func_do_phymem_cache_wb(void *unused)
+{
+ u64 err = 0;
+ bool resume;
+ int i;
+
+ /*
+ * TDH.PHYMEM.CACHE.WB flushes caches associated with any TDX private
+ * KeyID on the package or core. The TDX module may not finish the
+	 * cache flush but return TDX_INTERRUPTED_RESUMABLE instead. The
+ * kernel should retry it until it returns success w/o rescheduling.
+ */
+ for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) {
+ resume = !!err;
+ err = tdh_phymem_cache_wb(resume);
+ switch (err) {
+ case TDX_INTERRUPTED_RESUMABLE:
+ continue;
+ case TDX_NO_HKID_READY_TO_WBCACHE:
+ err = TDX_SUCCESS; /* Already done by other thread */
+ fallthrough;
+ default:
+ goto out;
+ }
+ }
+
+out:
+ TDX_BUG_ON(err, TDH_PHYMEM_CACHE_WB, NULL);
+}
+
+void tdx_mmu_release_hkid(struct kvm *kvm)
+{
+ bool packages_allocated, targets_allocated;
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ cpumask_var_t packages, targets;
+ struct kvm_vcpu *vcpu;
+ unsigned long j;
+ int i;
+ u64 err;
+
+ if (!is_hkid_assigned(kvm_tdx))
+ return;
+
+ packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
+ targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL);
+ cpus_read_lock();
+
+ kvm_for_each_vcpu(j, vcpu, kvm)
+ tdx_flush_vp_on_cpu(vcpu);
+
+ /*
+ * TDH.PHYMEM.CACHE.WB tries to acquire the TDX module global lock
+ * and can fail with TDX_OPERAND_BUSY when it fails to get the lock.
+	 * Multiple TDX guests can be destroyed simultaneously. Take the
+	 * mutex to prevent it from returning an error.
+ */
+ mutex_lock(&tdx_lock);
+
+ /*
+ * Releasing HKID is in vm_destroy().
+	 * After flushing the vCPUs above, there should be no more vCPU
+ * associations, as all vCPU fds have been released at this stage.
+ */
+ err = tdh_mng_vpflushdone(&kvm_tdx->td);
+ if (err == TDX_FLUSHVP_NOT_DONE)
+ goto out;
+ if (TDX_BUG_ON(err, TDH_MNG_VPFLUSHDONE, kvm)) {
+ pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n",
+ kvm_tdx->hkid);
+ goto out;
+ }
+
+ for_each_online_cpu(i) {
+ if (packages_allocated &&
+ cpumask_test_and_set_cpu(topology_physical_package_id(i),
+ packages))
+ continue;
+ if (targets_allocated)
+ cpumask_set_cpu(i, targets);
+ }
+ if (targets_allocated)
+ on_each_cpu_mask(targets, smp_func_do_phymem_cache_wb, NULL, true);
+ else
+ on_each_cpu(smp_func_do_phymem_cache_wb, NULL, true);
+ /*
+ * In the case of error in smp_func_do_phymem_cache_wb(), the following
+ * tdh_mng_key_freeid() will fail.
+ */
+ err = tdh_mng_key_freeid(&kvm_tdx->td);
+ if (TDX_BUG_ON(err, TDH_MNG_KEY_FREEID, kvm)) {
+ pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n",
+ kvm_tdx->hkid);
+ } else {
+ tdx_hkid_free(kvm_tdx);
+ }
+
+out:
+ mutex_unlock(&tdx_lock);
+ cpus_read_unlock();
+ free_cpumask_var(targets);
+ free_cpumask_var(packages);
+}
+
+static void tdx_reclaim_td_control_pages(struct kvm *kvm)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ u64 err;
+ int i;
+
+ /*
+	 * tdx_mmu_release_hkid() failed to reclaim the HKID. Something went
+	 * badly wrong with the TDX module. Give up on freeing the TD pages.
+	 * The function already warned, so don't warn again.
+ */
+ if (is_hkid_assigned(kvm_tdx))
+ return;
+
+ if (kvm_tdx->td.tdcs_pages) {
+ for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
+ if (!kvm_tdx->td.tdcs_pages[i])
+ continue;
+
+ tdx_reclaim_control_page(kvm_tdx->td.tdcs_pages[i]);
+ }
+ kfree(kvm_tdx->td.tdcs_pages);
+ kvm_tdx->td.tdcs_pages = NULL;
+ }
+
+ if (!kvm_tdx->td.tdr_page)
+ return;
+
+ if (__tdx_reclaim_page(kvm_tdx->td.tdr_page))
+ return;
+
+ /*
+ * Use a SEAMCALL to ask the TDX module to flush the cache based on the
+	 * KeyID. The TDX module may access the TDR while operating on the TD
+	 * (especially when it is reclaiming the TDCS).
+ */
+ err = tdh_phymem_page_wbinvd_tdr(&kvm_tdx->td);
+ if (TDX_BUG_ON(err, TDH_PHYMEM_PAGE_WBINVD, kvm))
+ return;
+
+ tdx_quirk_reset_page(kvm_tdx->td.tdr_page);
+
+ __free_page(kvm_tdx->td.tdr_page);
+ kvm_tdx->td.tdr_page = NULL;
+}
+
+void tdx_vm_destroy(struct kvm *kvm)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+
+ tdx_reclaim_td_control_pages(kvm);
+
+ kvm_tdx->state = TD_STATE_UNINITIALIZED;
+}
+
+static int tdx_do_tdh_mng_key_config(void *param)
+{
+ struct kvm_tdx *kvm_tdx = param;
+ u64 err;
+
+ /* TDX_RND_NO_ENTROPY related retries are handled by sc_retry() */
+ err = tdh_mng_key_config(&kvm_tdx->td);
+ if (TDX_BUG_ON(err, TDH_MNG_KEY_CONFIG, &kvm_tdx->kvm))
+ return -EIO;
+
+ return 0;
+}
+
+int tdx_vm_init(struct kvm *kvm)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+
+ kvm->arch.has_protected_state = true;
+ /*
+ * TDX Module doesn't allow the hypervisor to modify the EOI-bitmap,
+ * i.e. all EOIs are accelerated and never trigger exits.
+ */
+ kvm->arch.has_protected_eoi = true;
+ kvm->arch.has_private_mem = true;
+ kvm->arch.disabled_quirks |= KVM_X86_QUIRK_IGNORE_GUEST_PAT;
+
+ /*
+	 * Because the guest TD is protected, the VMM can't parse instructions
+	 * in the TD. Instead, the guest uses the MMIO hypercall. For unmodified
+	 * device drivers, a #VE needs to be injected for MMIO, and the #VE
+	 * handler in the TD converts the MMIO instruction into an MMIO hypercall.
+ *
+ * SPTE value for MMIO needs to be setup so that #VE is injected into
+ * TD instead of triggering EPT MISCONFIG.
+ * - RWX=0 so that EPT violation is triggered.
+ * - suppress #VE bit is cleared to inject #VE.
+ */
+ kvm_mmu_set_mmio_spte_value(kvm, 0);
+
+ /*
+ * TDX has its own limit of maximum vCPUs it can support for all
+ * TDX guests in addition to KVM_MAX_VCPUS. TDX module reports
+ * such limit via the MAX_VCPU_PER_TD global metadata. In
+ * practice, it reflects the number of logical CPUs that ALL
+ * platforms that the TDX module supports can possibly have.
+ *
+ * Limit TDX guest's maximum vCPUs to the number of logical CPUs
+ * the platform has. Simply forwarding the MAX_VCPU_PER_TD to
+ * userspace would result in an unpredictable ABI.
+ */
+ kvm->max_vcpus = min_t(int, kvm->max_vcpus, num_present_cpus());
+
+ kvm_tdx->state = TD_STATE_UNINITIALIZED;
+
+ return 0;
+}
+
+int tdx_vcpu_create(struct kvm_vcpu *vcpu)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+
+ if (kvm_tdx->state != TD_STATE_INITIALIZED)
+ return -EIO;
+
+ /*
+ * TDX module mandates APICv, which requires an in-kernel local APIC.
+ * Disallow an in-kernel I/O APIC, because level-triggered interrupts
+ * and thus the I/O APIC as a whole can't be faithfully emulated in KVM.
+ */
+ if (!irqchip_split(vcpu->kvm))
+ return -EINVAL;
+
+ fpstate_set_confidential(&vcpu->arch.guest_fpu);
+ vcpu->arch.apic->guest_apic_protected = true;
+ INIT_LIST_HEAD(&tdx->vt.pi_wakeup_list);
+
+ vcpu->arch.efer = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX;
+
+ vcpu->arch.switch_db_regs = KVM_DEBUGREG_AUTO_SWITCH;
+ vcpu->arch.cr0_guest_owned_bits = -1ul;
+ vcpu->arch.cr4_guest_owned_bits = -1ul;
+
+ /* KVM can't change TSC offset/multiplier as TDX module manages them. */
+ vcpu->arch.guest_tsc_protected = true;
+ vcpu->arch.tsc_offset = kvm_tdx->tsc_offset;
+ vcpu->arch.l1_tsc_offset = vcpu->arch.tsc_offset;
+ vcpu->arch.tsc_scaling_ratio = kvm_tdx->tsc_multiplier;
+ vcpu->arch.l1_tsc_scaling_ratio = kvm_tdx->tsc_multiplier;
+
+ vcpu->arch.guest_state_protected =
+ !(to_kvm_tdx(vcpu->kvm)->attributes & TDX_TD_ATTR_DEBUG);
+
+ if ((kvm_tdx->xfam & XFEATURE_MASK_XTILE) == XFEATURE_MASK_XTILE)
+ vcpu->arch.xfd_no_write_intercept = true;
+
+ tdx->vt.pi_desc.nv = POSTED_INTR_VECTOR;
+ __pi_set_sn(&tdx->vt.pi_desc);
+
+ tdx->state = VCPU_TD_STATE_UNINITIALIZED;
+
+ return 0;
+}
+
+void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+
+ vmx_vcpu_pi_load(vcpu, cpu);
+ if (vcpu->cpu == cpu || !is_hkid_assigned(to_kvm_tdx(vcpu->kvm)))
+ return;
+
+ tdx_flush_vp_on_cpu(vcpu);
+
+ KVM_BUG_ON(cpu != raw_smp_processor_id(), vcpu->kvm);
+ local_irq_disable();
+ /*
+ * Pairs with the smp_wmb() in tdx_disassociate_vp() to ensure
+ * vcpu->cpu is read before tdx->cpu_list.
+ */
+ smp_rmb();
+
+ list_add(&tdx->cpu_list, &per_cpu(associated_tdvcpus, cpu));
+ local_irq_enable();
+}
+
+bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu)
+{
+ /*
+	 * KVM can't get the interrupt status of a TDX guest, so it assumes
+	 * interrupts are always allowed unless the guest calls TDVMCALL with
+	 * HLT, which passes the interrupt-blocked flag.
+ */
+ return vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
+ !to_tdx(vcpu)->vp_enter_args.r12;
+}
+
+static bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu)
+{
+ u64 vcpu_state_details;
+
+ if (pi_has_pending_interrupt(vcpu))
+ return true;
+
+ /*
+ * Only check RVI pending for HALTED case with IRQ enabled.
+ * For non-HLT cases, KVM doesn't care about STI/SS shadows. And if the
+ * interrupt was pending before TD exit, then it _must_ be blocked,
+ * otherwise the interrupt would have been serviced at the instruction
+ * boundary.
+ */
+ if (vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
+ to_tdx(vcpu)->vp_enter_args.r12)
+ return false;
+
+ vcpu_state_details =
+ td_state_non_arch_read64(to_tdx(vcpu), TD_VCPU_STATE_DETAILS_NON_ARCH);
+
+ return tdx_vcpu_state_details_intr_pending(vcpu_state_details);
+}
+
+struct tdx_uret_msr {
+ u32 msr;
+ unsigned int slot;
+ u64 defval;
+};
+
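+/*
+ * User-return MSRs clobbered by the TDX module on a successful VP.ENTER.
+ * 'defval' is the value the TDX module is expected to write on TD-Exit; see
+ * tdx_prepare_switch_to_guest() for how these are handled via the
+ * user-return MSR framework.
+ */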
+static struct tdx_uret_msr tdx_uret_msrs[] = {
+ {.msr = MSR_SYSCALL_MASK, .defval = 0x20200 },
+ {.msr = MSR_STAR,},
+ {.msr = MSR_LSTAR,},
+ {.msr = MSR_TSC_AUX,},
+};
+
+void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vt *vt = to_vt(vcpu);
+ int i;
+
+ if (vt->guest_state_loaded)
+ return;
+
+ if (likely(is_64bit_mm(current->mm)))
+ vt->msr_host_kernel_gs_base = current->thread.gsbase;
+ else
+ vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
+
+ vt->guest_state_loaded = true;
+
+ /*
+ * Explicitly set user-return MSRs that are clobbered by the TDX-Module
+ * if VP.ENTER succeeds, i.e. on TD-Exit, with the values that would be
+ * written by the TDX-Module. Don't rely on the TDX-Module to actually
+ * clobber the MSRs, as the contract is poorly defined and not upheld.
+ * E.g. the TDX-Module will synthesize an EPT Violation without doing
+ * VM-Enter if it suspects a zero-step attack, and never "restore" VMM
+ * state.
+ */
+ for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++)
+ kvm_set_user_return_msr(tdx_uret_msrs[i].slot,
+ tdx_uret_msrs[i].defval, -1ull);
+}
+
+static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vt *vt = to_vt(vcpu);
+
+ if (!vt->guest_state_loaded)
+ return;
+
+ ++vcpu->stat.host_state_reload;
+ wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base);
+
+ vt->guest_state_loaded = false;
+}
+
+void tdx_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ vmx_vcpu_pi_put(vcpu);
+ tdx_prepare_switch_to_host(vcpu);
+}
+
+/*
+ * Life cycles for a TD and a vCPU:
+ * 1. KVM_CREATE_VM ioctl.
+ * TD state is TD_STATE_UNINITIALIZED.
+ * hkid is not assigned at this stage.
+ * 2. KVM_TDX_INIT_VM ioctl.
+ * TD transitions to TD_STATE_INITIALIZED.
+ * hkid is assigned after this stage.
+ * 3. KVM_CREATE_VCPU ioctl. (only when TD is TD_STATE_INITIALIZED).
+ * 3.1 tdx_vcpu_create() transitions vCPU state to VCPU_TD_STATE_UNINITIALIZED.
+ * 3.2 vcpu_load() and vcpu_put() in kvm_arch_vcpu_create().
+ * 3.3 (conditional) if any error encountered after kvm_arch_vcpu_create()
+ * kvm_arch_vcpu_destroy() --> tdx_vcpu_free().
+ * 4. KVM_TDX_INIT_VCPU ioctl.
+ * tdx_vcpu_init() transitions vCPU state to VCPU_TD_STATE_INITIALIZED.
+ * vCPU control structures are allocated at this stage.
+ * 5. kvm_destroy_vm().
+ * 5.1 tdx_mmu_release_hkid(): (1) tdh_vp_flush(), disassociates all vCPUs.
+ * (2) puts hkid to !assigned state.
+ * 5.2 kvm_destroy_vcpus() --> tdx_vcpu_free():
+ * transitions vCPU to VCPU_TD_STATE_UNINITIALIZED state.
+ * 5.3 tdx_vm_destroy()
+ * transitions TD to TD_STATE_UNINITIALIZED state.
+ *
+ * tdx_vcpu_free() can be invoked only at 3.3 or 5.2.
+ * - If at 3.3, hkid is still assigned, but the vCPU must be in
+ * VCPU_TD_STATE_UNINITIALIZED state.
+ * - If at 5.2, hkid must be !assigned and all vCPUs must be in
+ * VCPU_TD_STATE_INITIALIZED state and have been dissociated.
+ */
+void tdx_vcpu_free(struct kvm_vcpu *vcpu)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ int i;
+
+ if (vcpu->cpu != -1) {
+ KVM_BUG_ON(tdx->state == VCPU_TD_STATE_INITIALIZED, vcpu->kvm);
+ tdx_flush_vp_on_cpu(vcpu);
+ return;
+ }
+
+ /*
+ * It is not possible to reclaim pages while hkid is assigned. It might
+ * be assigned if the TD VM is being destroyed but freeing hkid failed,
+ * in which case the pages are leaked.
+ */
+ if (is_hkid_assigned(kvm_tdx))
+ return;
+
+ if (tdx->vp.tdcx_pages) {
+ for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
+ if (tdx->vp.tdcx_pages[i])
+ tdx_reclaim_control_page(tdx->vp.tdcx_pages[i]);
+ }
+ kfree(tdx->vp.tdcx_pages);
+ tdx->vp.tdcx_pages = NULL;
+ }
+ if (tdx->vp.tdvpr_page) {
+ tdx_reclaim_control_page(tdx->vp.tdvpr_page);
+ tdx->vp.tdvpr_page = NULL;
+ tdx->vp.tdvpr_pa = 0;
+ }
+
+ tdx->state = VCPU_TD_STATE_UNINITIALIZED;
+}
+
+int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu)
+{
+ if (unlikely(to_tdx(vcpu)->state != VCPU_TD_STATE_INITIALIZED ||
+ to_kvm_tdx(vcpu->kvm)->state != TD_STATE_RUNNABLE))
+ return -EINVAL;
+
+ return 1;
+}
+
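+/*
+ * Map a TDVMCALL leaf to a synthetic VMX exit reason so the call can be
+ * dispatched through the common exit handling in tdx_handle_exit().  Leaves
+ * whose numbers match a VMX exit reason (CPUID, HLT, I/O, MSR accesses) are
+ * returned as-is; the MMIO leaf, which aliases EXIT_REASON_EPT_VIOLATION, is
+ * reported as EXIT_REASON_EPT_MISCONFIG so it is handled as MMIO emulation.
+ */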
+static __always_inline u32 tdcall_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
+{
+ switch (tdvmcall_leaf(vcpu)) {
+ case EXIT_REASON_CPUID:
+ case EXIT_REASON_HLT:
+ case EXIT_REASON_IO_INSTRUCTION:
+ case EXIT_REASON_MSR_READ:
+ case EXIT_REASON_MSR_WRITE:
+ return tdvmcall_leaf(vcpu);
+ case EXIT_REASON_EPT_VIOLATION:
+ return EXIT_REASON_EPT_MISCONFIG;
+ default:
+ break;
+ }
+
+ return EXIT_REASON_TDCALL;
+}
+
+static __always_inline u32 tdx_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ u32 exit_reason;
+
+ switch (tdx->vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) {
+ case TDX_SUCCESS:
+ case TDX_NON_RECOVERABLE_VCPU:
+ case TDX_NON_RECOVERABLE_TD:
+ case TDX_NON_RECOVERABLE_TD_NON_ACCESSIBLE:
+ case TDX_NON_RECOVERABLE_TD_WRONG_APIC_MODE:
+ break;
+ default:
+ return -1u;
+ }
+
+ exit_reason = tdx->vp_enter_ret;
+
+ switch (exit_reason) {
+ case EXIT_REASON_TDCALL:
+ if (tdvmcall_exit_type(vcpu))
+ return EXIT_REASON_VMCALL;
+
+ return tdcall_to_vmx_exit_reason(vcpu);
+ case EXIT_REASON_EPT_MISCONFIG:
+ /*
+ * Defer KVM_BUG_ON() until tdx_handle_exit() because this is in
+ * non-instrumentable code with interrupts disabled.
+ */
+ return -1u;
+ default:
+ break;
+ }
+
+ return exit_reason;
+}
+
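+/*
+ * Enter the TD via TDH.VP.ENTER and capture the exit information the TDX
+ * module returns in the VP.ENTER output registers (exit qualification,
+ * extended exit qualification, exit GPA and interrupt info).
+ */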
+static noinstr void tdx_vcpu_enter_exit(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ struct vcpu_vt *vt = to_vt(vcpu);
+
+ guest_state_enter_irqoff();
+
+ tdx->vp_enter_ret = tdh_vp_enter(&tdx->vp, &tdx->vp_enter_args);
+
+ vt->exit_reason.full = tdx_to_vmx_exit_reason(vcpu);
+
+ vt->exit_qualification = tdx->vp_enter_args.rcx;
+ tdx->ext_exit_qualification = tdx->vp_enter_args.rdx;
+ tdx->exit_gpa = tdx->vp_enter_args.r8;
+ vt->exit_intr_info = tdx->vp_enter_args.r9;
+
+ vmx_handle_nmi(vcpu);
+
+ guest_state_exit_irqoff();
+}
+
+static bool tdx_failed_vmentry(struct kvm_vcpu *vcpu)
+{
+ return vmx_get_exit_reason(vcpu).failed_vmentry &&
+ vmx_get_exit_reason(vcpu).full != -1u;
+}
+
+static fastpath_t tdx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+{
+ u64 vp_enter_ret = to_tdx(vcpu)->vp_enter_ret;
+
+ /*
+ * TDX_OPERAND_BUSY could be returned for SEPT due to 0-step mitigation
+ * or for TD EPOCH due to contention with TDH.MEM.TRACK on TDH.VP.ENTER.
+ *
+ * When KVM requests KVM_REQ_OUTSIDE_GUEST_MODE, which has both
+	 * KVM_REQUEST_WAIT and KVM_REQUEST_NO_ACTION set, it requires target
+	 * vCPUs to leave the fastpath so that interrupts can be enabled and the
+	 * IPIs can be delivered. Return EXIT_FASTPATH_EXIT_HANDLED instead of
+	 * EXIT_FASTPATH_REENTER_GUEST to exit the fastpath, otherwise the
+	 * requester may be blocked endlessly.
+ */
+ if (unlikely(tdx_operand_busy(vp_enter_ret)))
+ return EXIT_FASTPATH_EXIT_HANDLED;
+
+ return EXIT_FASTPATH_NONE;
+}
+
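+/*
+ * Registers that remain marked as available after a TD-Exit.  Everything
+ * else (e.g. RIP and RSP) is part of the TD's protected state and can't be
+ * read by KVM, so it is dropped from vcpu->arch.regs_avail after VP.ENTER.
+ */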
+#define TDX_REGS_AVAIL_SET (BIT_ULL(VCPU_EXREG_EXIT_INFO_1) | \
+ BIT_ULL(VCPU_EXREG_EXIT_INFO_2) | \
+ BIT_ULL(VCPU_REGS_RAX) | \
+ BIT_ULL(VCPU_REGS_RBX) | \
+ BIT_ULL(VCPU_REGS_RCX) | \
+ BIT_ULL(VCPU_REGS_RDX) | \
+ BIT_ULL(VCPU_REGS_RBP) | \
+ BIT_ULL(VCPU_REGS_RSI) | \
+ BIT_ULL(VCPU_REGS_RDI) | \
+ BIT_ULL(VCPU_REGS_R8) | \
+ BIT_ULL(VCPU_REGS_R9) | \
+ BIT_ULL(VCPU_REGS_R10) | \
+ BIT_ULL(VCPU_REGS_R11) | \
+ BIT_ULL(VCPU_REGS_R12) | \
+ BIT_ULL(VCPU_REGS_R13) | \
+ BIT_ULL(VCPU_REGS_R14) | \
+ BIT_ULL(VCPU_REGS_R15))
+
+static void tdx_load_host_xsave_state(struct kvm_vcpu *vcpu)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+
+ /*
+	 * All TDX hosts support PKRU, but even if they didn't,
+ * vcpu->arch.host_pkru would be 0 and the wrpkru would be
+ * skipped.
+ */
+ if (vcpu->arch.host_pkru != 0)
+ wrpkru(vcpu->arch.host_pkru);
+
+ if (kvm_host.xcr0 != (kvm_tdx->xfam & kvm_caps.supported_xcr0))
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);
+
+ /*
+	 * Likewise, even if a TDX host didn't support XSS, both arms of
+	 * the comparison would be 0 and the wrmsrl would be skipped.
+ */
+ if (kvm_host.xss != (kvm_tdx->xfam & kvm_caps.supported_xss))
+ wrmsrl(MSR_IA32_XSS, kvm_host.xss);
+}
+
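+/*
+ * DEBUGCTL bits that don't need to be restored after a TD-Exit; the host
+ * value is only written back if it has bits set outside of this mask.
+ */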
+#define TDX_DEBUGCTL_PRESERVED (DEBUGCTLMSR_BTF | \
+ DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI | \
+ DEBUGCTLMSR_FREEZE_IN_SMM)
+
+fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ struct vcpu_vt *vt = to_vt(vcpu);
+
+ /*
+ * WARN if KVM wants to force an immediate exit, as the TDX module does
+ * not guarantee entry into the guest, i.e. it's possible for KVM to
+ * _think_ it completed entry to the guest and forced an immediate exit
+	 * without actually having done so. Luckily, KVM never needs to force
+	 * an immediate exit for TDX (KVM can't do direct event injection), so
+	 * just WARN and continue on.
+ */
+ WARN_ON_ONCE(run_flags);
+
+ /*
+ * Wait until retry of SEPT-zap-related SEAMCALL completes before
+ * allowing vCPU entry to avoid contention with tdh_vp_enter() and
+ * TDCALLs.
+ */
+ if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap)))
+ return EXIT_FASTPATH_EXIT_HANDLED;
+
+ trace_kvm_entry(vcpu, run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT);
+
+ if (pi_test_on(&vt->pi_desc)) {
+ apic->send_IPI_self(POSTED_INTR_VECTOR);
+
+ if (pi_test_pir(kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVTT) &
+ APIC_VECTOR_MASK, &vt->pi_desc))
+ kvm_wait_lapic_expire(vcpu);
+ }
+
+ tdx_vcpu_enter_exit(vcpu);
+
+ if (vcpu->arch.host_debugctl & ~TDX_DEBUGCTL_PRESERVED)
+ update_debugctlmsr(vcpu->arch.host_debugctl);
+
+ tdx_load_host_xsave_state(vcpu);
+
+ vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET;
+
+ if (unlikely(tdx->vp_enter_ret == EXIT_REASON_EPT_MISCONFIG))
+ return EXIT_FASTPATH_NONE;
+
+ if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR))
+ return EXIT_FASTPATH_NONE;
+
+ trace_kvm_exit(vcpu, KVM_ISA_VMX);
+
+ if (unlikely(tdx_failed_vmentry(vcpu)))
+ return EXIT_FASTPATH_NONE;
+
+ return tdx_exit_handlers_fastpath(vcpu);
+}
+
+void tdx_inject_nmi(struct kvm_vcpu *vcpu)
+{
+ ++vcpu->stat.nmi_injections;
+ td_management_write8(to_tdx(vcpu), TD_VCPU_PEND_NMI, 1);
+ /*
+ * From KVM's perspective, NMI injection is completed right after
+ * writing to PEND_NMI. KVM doesn't care whether an NMI is injected by
+ * the TDX module or not.
+ */
+ vcpu->arch.nmi_injected = false;
+ /*
+	 * TDX doesn't allow KVM to request an NMI-window exit. If there is
+	 * still a pending vNMI, KVM is not able to inject it back-to-back with
+	 * the one pending in the TDX module. Since the previous vNMI is still
+	 * pending in the TDX module, i.e. it has not been delivered to the TDX
+	 * guest yet, it's OK to collapse the pending vNMI into the previous
+	 * one. The guest is expected to handle all the NMI sources when
+	 * handling the first vNMI.
+ */
+ vcpu->arch.nmi_pending = 0;
+}
+
+static int tdx_handle_exception_nmi(struct kvm_vcpu *vcpu)
+{
+ u32 intr_info = vmx_get_intr_info(vcpu);
+
+ /*
+ * Machine checks are handled by handle_exception_irqoff(), or by
+ * tdx_handle_exit() with TDX_NON_RECOVERABLE set if a #MC occurs on
+ * VM-Entry. NMIs are handled by tdx_vcpu_enter_exit().
+ */
+ if (is_nmi(intr_info) || is_machine_check(intr_info))
+ return 1;
+
+ vcpu->run->exit_reason = KVM_EXIT_EXCEPTION;
+ vcpu->run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
+ vcpu->run->ex.error_code = 0;
+
+ return 0;
+}
+
+static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
+{
+ tdvmcall_set_return_code(vcpu, vcpu->run->hypercall.ret);
+ return 1;
+}
+
+static int tdx_emulate_vmcall(struct kvm_vcpu *vcpu)
+{
+ kvm_rax_write(vcpu, to_tdx(vcpu)->vp_enter_args.r10);
+ kvm_rbx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r11);
+ kvm_rcx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r12);
+ kvm_rdx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r13);
+ kvm_rsi_write(vcpu, to_tdx(vcpu)->vp_enter_args.r14);
+
+ return __kvm_emulate_hypercall(vcpu, 0, complete_hypercall_exit);
+}
+
+/*
+ * Split the request into chunks and check for pending interrupts between
+ * chunks. This allows timely interrupt injection and prevents spurious guest
+ * lockup detection.
+ */
+#define TDX_MAP_GPA_MAX_LEN (2 * 1024 * 1024)
+static void __tdx_map_gpa(struct vcpu_tdx *tdx);
+
+static int tdx_complete_vmcall_map_gpa(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+
+ if (vcpu->run->hypercall.ret) {
+ tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
+ tdx->vp_enter_args.r11 = tdx->map_gpa_next;
+ return 1;
+ }
+
+ tdx->map_gpa_next += TDX_MAP_GPA_MAX_LEN;
+ if (tdx->map_gpa_next >= tdx->map_gpa_end)
+ return 1;
+
+ /*
+	 * Stop processing the remaining part if there is a pending interrupt
+	 * that could be delivered. Skip checking pending RVI for
+	 * TDVMCALL_MAP_GPA; see comments in tdx_protected_apic_has_interrupt().
+ */
+ if (kvm_vcpu_has_events(vcpu)) {
+ tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_RETRY);
+ tdx->vp_enter_args.r11 = tdx->map_gpa_next;
+ return 1;
+ }
+
+ __tdx_map_gpa(tdx);
+ return 0;
+}
+
+static void __tdx_map_gpa(struct vcpu_tdx *tdx)
+{
+ u64 gpa = tdx->map_gpa_next;
+ u64 size = tdx->map_gpa_end - tdx->map_gpa_next;
+
+ if (size > TDX_MAP_GPA_MAX_LEN)
+ size = TDX_MAP_GPA_MAX_LEN;
+
+ tdx->vcpu.run->exit_reason = KVM_EXIT_HYPERCALL;
+ tdx->vcpu.run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
+ /*
+ * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2)
+ * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that
+ * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting
+	 * vcpu->run->hypercall.ret, ensure that it is zero so as not to break QEMU.
+ */
+ tdx->vcpu.run->hypercall.ret = 0;
+ tdx->vcpu.run->hypercall.args[0] = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
+ tdx->vcpu.run->hypercall.args[1] = size / PAGE_SIZE;
+ tdx->vcpu.run->hypercall.args[2] = vt_is_tdx_private_gpa(tdx->vcpu.kvm, gpa) ?
+ KVM_MAP_GPA_RANGE_ENCRYPTED :
+ KVM_MAP_GPA_RANGE_DECRYPTED;
+ tdx->vcpu.run->hypercall.flags = KVM_EXIT_HYPERCALL_LONG_MODE;
+
+ tdx->vcpu.arch.complete_userspace_io = tdx_complete_vmcall_map_gpa;
+}
+
+static int tdx_map_gpa(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ u64 gpa = tdx->vp_enter_args.r12;
+ u64 size = tdx->vp_enter_args.r13;
+ u64 ret;
+
+ /*
+ * Converting TDVMCALL_MAP_GPA to KVM_HC_MAP_GPA_RANGE requires
+ * userspace to enable KVM_CAP_EXIT_HYPERCALL with KVM_HC_MAP_GPA_RANGE
+ * bit set. This is a base call so it should always be supported, but
+ * KVM has no way to ensure that userspace implements the GHCI correctly.
+ * So if KVM_HC_MAP_GPA_RANGE does not cause a VMEXIT, return an error
+ * to the guest.
+ */
+ if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) {
+ ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
+ goto error;
+ }
+
+ if (gpa + size <= gpa || !kvm_vcpu_is_legal_gpa(vcpu, gpa) ||
+ !kvm_vcpu_is_legal_gpa(vcpu, gpa + size - 1) ||
+ (vt_is_tdx_private_gpa(vcpu->kvm, gpa) !=
+ vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))) {
+ ret = TDVMCALL_STATUS_INVALID_OPERAND;
+ goto error;
+ }
+
+ if (!PAGE_ALIGNED(gpa) || !PAGE_ALIGNED(size)) {
+ ret = TDVMCALL_STATUS_ALIGN_ERROR;
+ goto error;
+ }
+
+ tdx->map_gpa_end = gpa + size;
+ tdx->map_gpa_next = gpa;
+
+ __tdx_map_gpa(tdx);
+ return 0;
+
+error:
+ tdvmcall_set_return_code(vcpu, ret);
+ tdx->vp_enter_args.r11 = gpa;
+ return 1;
+}
+
+static int tdx_report_fatal_error(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ u64 *regs = vcpu->run->system_event.data;
+ u64 *module_regs = &tdx->vp_enter_args.r8;
+ int index = VCPU_REGS_RAX;
+
+ vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+ vcpu->run->system_event.type = KVM_SYSTEM_EVENT_TDX_FATAL;
+ vcpu->run->system_event.ndata = 16;
+
+ /* Dump 16 general-purpose registers to userspace in ascending order. */
+ regs[index++] = tdx->vp_enter_ret;
+ regs[index++] = tdx->vp_enter_args.rcx;
+ regs[index++] = tdx->vp_enter_args.rdx;
+ regs[index++] = tdx->vp_enter_args.rbx;
+ regs[index++] = 0;
+ regs[index++] = 0;
+ regs[index++] = tdx->vp_enter_args.rsi;
+ regs[index] = tdx->vp_enter_args.rdi;
+ for (index = 0; index < 8; index++)
+ regs[VCPU_REGS_R8 + index] = module_regs[index];
+
+ return 0;
+}
+
+static int tdx_emulate_cpuid(struct kvm_vcpu *vcpu)
+{
+ u32 eax, ebx, ecx, edx;
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+
+	/* EAX and ECX for CPUID are stored in R12 and R13. */
+ eax = tdx->vp_enter_args.r12;
+ ecx = tdx->vp_enter_args.r13;
+
+ kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false);
+
+ tdx->vp_enter_args.r12 = eax;
+ tdx->vp_enter_args.r13 = ebx;
+ tdx->vp_enter_args.r14 = ecx;
+ tdx->vp_enter_args.r15 = edx;
+
+ return 1;
+}
+
+static int tdx_complete_pio_out(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.pio.count = 0;
+ return 1;
+}
+
+static int tdx_complete_pio_in(struct kvm_vcpu *vcpu)
+{
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+ unsigned long val = 0;
+ int ret;
+
+ ret = ctxt->ops->pio_in_emulated(ctxt, vcpu->arch.pio.size,
+ vcpu->arch.pio.port, &val, 1);
+
+ WARN_ON_ONCE(!ret);
+
+ tdvmcall_set_return_val(vcpu, val);
+
+ return 1;
+}
+
+static int tdx_emulate_io(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+ unsigned long val = 0;
+ unsigned int port;
+ u64 size, write;
+ int ret;
+
+ ++vcpu->stat.io_exits;
+
+ size = tdx->vp_enter_args.r12;
+ write = tdx->vp_enter_args.r13;
+ port = tdx->vp_enter_args.r14;
+
+ if ((write != 0 && write != 1) || (size != 1 && size != 2 && size != 4)) {
+ tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
+ return 1;
+ }
+
+ if (write) {
+ val = tdx->vp_enter_args.r15;
+ ret = ctxt->ops->pio_out_emulated(ctxt, size, port, &val, 1);
+ } else {
+ ret = ctxt->ops->pio_in_emulated(ctxt, size, port, &val, 1);
+ }
+
+ if (!ret)
+ vcpu->arch.complete_userspace_io = write ? tdx_complete_pio_out :
+ tdx_complete_pio_in;
+ else if (!write)
+ tdvmcall_set_return_val(vcpu, val);
+
+ return ret;
+}
+
+static int tdx_complete_mmio_read(struct kvm_vcpu *vcpu)
+{
+ unsigned long val = 0;
+ gpa_t gpa;
+ int size;
+
+ gpa = vcpu->mmio_fragments[0].gpa;
+ size = vcpu->mmio_fragments[0].len;
+
+ memcpy(&val, vcpu->run->mmio.data, size);
+ tdvmcall_set_return_val(vcpu, val);
+ trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
+ return 1;
+}
+
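+/*
+ * Handle the write via the fast MMIO bus (which ignores the written value)
+ * when possible, otherwise fall back to a normal in-kernel MMIO bus write.
+ */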
+static inline int tdx_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, int size,
+ unsigned long val)
+{
+ if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
+ trace_kvm_fast_mmio(gpa);
+ return 0;
+ }
+
+ trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, size, gpa, &val);
+ if (kvm_io_bus_write(vcpu, KVM_MMIO_BUS, gpa, size, &val))
+ return -EOPNOTSUPP;
+
+ return 0;
+}
+
+static inline int tdx_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, int size)
+{
+ unsigned long val;
+
+ if (kvm_io_bus_read(vcpu, KVM_MMIO_BUS, gpa, size, &val))
+ return -EOPNOTSUPP;
+
+ tdvmcall_set_return_val(vcpu, val);
+ trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
+ return 0;
+}
+
+static int tdx_emulate_mmio(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ int size, write, r;
+ unsigned long val;
+ gpa_t gpa;
+
+ size = tdx->vp_enter_args.r12;
+ write = tdx->vp_enter_args.r13;
+ gpa = tdx->vp_enter_args.r14;
+ val = write ? tdx->vp_enter_args.r15 : 0;
+
+ if (size != 1 && size != 2 && size != 4 && size != 8)
+ goto error;
+ if (write != 0 && write != 1)
+ goto error;
+
+ /*
+	 * TDG.VP.VMCALL<MMIO> allows only shared GPAs; it makes no sense to
+	 * do MMIO emulation for a private GPA.
+ */
+ if (vt_is_tdx_private_gpa(vcpu->kvm, gpa) ||
+ vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))
+ goto error;
+
+ gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
+
+ if (write)
+ r = tdx_mmio_write(vcpu, gpa, size, val);
+ else
+ r = tdx_mmio_read(vcpu, gpa, size);
+ if (!r)
+ /* Kernel completed device emulation. */
+ return 1;
+
+ /* Request the device emulation to userspace device model. */
+ vcpu->mmio_is_write = write;
+ if (!write)
+ vcpu->arch.complete_userspace_io = tdx_complete_mmio_read;
+
+ vcpu->run->mmio.phys_addr = gpa;
+ vcpu->run->mmio.len = size;
+ vcpu->run->mmio.is_write = write;
+ vcpu->run->exit_reason = KVM_EXIT_MMIO;
+
+ if (write) {
+ memcpy(vcpu->run->mmio.data, &val, size);
+ } else {
+ vcpu->mmio_fragments[0].gpa = gpa;
+ vcpu->mmio_fragments[0].len = size;
+ trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, size, gpa, NULL);
+ }
+ return 0;
+
+error:
+ tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
+ return 1;
+}
+
+static int tdx_complete_get_td_vm_call_info(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+
+ tdvmcall_set_return_code(vcpu, vcpu->run->tdx.get_tdvmcall_info.ret);
+
+ /*
+	 * For now, KVM does not support any TDVMCALL beyond the GHCI base API
+	 * without help from userspace, so just set the values returned from
+	 * userspace.
+ */
+ tdx->vp_enter_args.r11 = vcpu->run->tdx.get_tdvmcall_info.r11;
+ tdx->vp_enter_args.r12 = vcpu->run->tdx.get_tdvmcall_info.r12;
+ tdx->vp_enter_args.r13 = vcpu->run->tdx.get_tdvmcall_info.r13;
+ tdx->vp_enter_args.r14 = vcpu->run->tdx.get_tdvmcall_info.r14;
+
+ return 1;
+}
+
+static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+
+ switch (tdx->vp_enter_args.r12) {
+ case 0:
+ tdx->vp_enter_args.r11 = 0;
+ tdx->vp_enter_args.r12 = 0;
+ tdx->vp_enter_args.r13 = 0;
+ tdx->vp_enter_args.r14 = 0;
+ tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUCCESS);
+ return 1;
+ case 1:
+ vcpu->run->tdx.get_tdvmcall_info.leaf = tdx->vp_enter_args.r12;
+ vcpu->run->exit_reason = KVM_EXIT_TDX;
+ vcpu->run->tdx.flags = 0;
+ vcpu->run->tdx.nr = TDVMCALL_GET_TD_VM_CALL_INFO;
+ vcpu->run->tdx.get_tdvmcall_info.ret = TDVMCALL_STATUS_SUCCESS;
+ vcpu->run->tdx.get_tdvmcall_info.r11 = 0;
+ vcpu->run->tdx.get_tdvmcall_info.r12 = 0;
+ vcpu->run->tdx.get_tdvmcall_info.r13 = 0;
+ vcpu->run->tdx.get_tdvmcall_info.r14 = 0;
+ vcpu->arch.complete_userspace_io = tdx_complete_get_td_vm_call_info;
+ return 0;
+ default:
+ tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
+ return 1;
+ }
+}
+
+static int tdx_complete_simple(struct kvm_vcpu *vcpu)
+{
+ tdvmcall_set_return_code(vcpu, vcpu->run->tdx.unknown.ret);
+ return 1;
+}
+
+static int tdx_get_quote(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ u64 gpa = tdx->vp_enter_args.r12;
+ u64 size = tdx->vp_enter_args.r13;
+
+	/* The GPA of the buffer must have the shared bit set. */
+ if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
+ tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
+ return 1;
+ }
+
+ vcpu->run->exit_reason = KVM_EXIT_TDX;
+ vcpu->run->tdx.flags = 0;
+ vcpu->run->tdx.nr = TDVMCALL_GET_QUOTE;
+ vcpu->run->tdx.get_quote.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
+ vcpu->run->tdx.get_quote.gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
+ vcpu->run->tdx.get_quote.size = size;
+
+ vcpu->arch.complete_userspace_io = tdx_complete_simple;
+
+ return 0;
+}
+
+static int tdx_setup_event_notify_interrupt(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ u64 vector = tdx->vp_enter_args.r12;
+
+ if (vector < 32 || vector > 255) {
+ tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
+ return 1;
+ }
+
+ vcpu->run->exit_reason = KVM_EXIT_TDX;
+ vcpu->run->tdx.flags = 0;
+ vcpu->run->tdx.nr = TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT;
+ vcpu->run->tdx.setup_event_notify.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
+ vcpu->run->tdx.setup_event_notify.vector = vector;
+
+ vcpu->arch.complete_userspace_io = tdx_complete_simple;
+
+ return 0;
+}
+
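+/*
+ * Dispatch TDVMCALL leaves that aren't remapped to a VMX exit reason by
+ * tdcall_to_vmx_exit_reason().  Unknown leaves are failed with
+ * TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED.
+ */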
+static int handle_tdvmcall(struct kvm_vcpu *vcpu)
+{
+ switch (tdvmcall_leaf(vcpu)) {
+ case TDVMCALL_MAP_GPA:
+ return tdx_map_gpa(vcpu);
+ case TDVMCALL_REPORT_FATAL_ERROR:
+ return tdx_report_fatal_error(vcpu);
+ case TDVMCALL_GET_TD_VM_CALL_INFO:
+ return tdx_get_td_vm_call_info(vcpu);
+ case TDVMCALL_GET_QUOTE:
+ return tdx_get_quote(vcpu);
+ case TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT:
+ return tdx_setup_event_notify_interrupt(vcpu);
+ default:
+ break;
+ }
+
+ tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED);
+ return 1;
+}
+
+void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level)
+{
+ u64 shared_bit = (pgd_level == 5) ? TDX_SHARED_BIT_PWL_5 :
+ TDX_SHARED_BIT_PWL_4;
+
+ if (KVM_BUG_ON(shared_bit != kvm_gfn_direct_bits(vcpu->kvm), vcpu->kvm))
+ return;
+
+ td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa);
+}
+
+static int tdx_mem_page_add(struct kvm *kvm, gfn_t gfn, enum pg_level level,
+ kvm_pfn_t pfn)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ u64 err, entry, level_state;
+ gpa_t gpa = gfn_to_gpa(gfn);
+
+ lockdep_assert_held(&kvm->slots_lock);
+
+ if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm) ||
+ KVM_BUG_ON(!kvm_tdx->page_add_src, kvm))
+ return -EIO;
+
+ err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn),
+ kvm_tdx->page_add_src, &entry, &level_state);
+ if (unlikely(tdx_operand_busy(err)))
+ return -EBUSY;
+
+ if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_ADD, entry, level_state, kvm))
+ return -EIO;
+
+ return 0;
+}
+
+static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, kvm_pfn_t pfn)
+{
+ int tdx_level = pg_level_to_tdx_sept_level(level);
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ struct page *page = pfn_to_page(pfn);
+ gpa_t gpa = gfn_to_gpa(gfn);
+ u64 entry, level_state;
+ u64 err;
+
+ err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state);
+ if (unlikely(tdx_operand_busy(err)))
+ return -EBUSY;
+
+ if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_AUG, entry, level_state, kvm))
+ return -EIO;
+
+ return 0;
+}
+
+static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, u64 mirror_spte)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ kvm_pfn_t pfn = spte_to_pfn(mirror_spte);
+
+ /* TODO: handle large pages. */
+ if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
+ return -EIO;
+
+ WARN_ON_ONCE(!is_shadow_present_pte(mirror_spte) ||
+ (mirror_spte & VMX_EPT_RWX_MASK) != VMX_EPT_RWX_MASK);
+
+ /*
+ * Ensure pre_fault_allowed is read by kvm_arch_vcpu_pre_fault_memory()
+ * before kvm_tdx->state. Userspace must not be allowed to pre-fault
+ * arbitrary memory until the initial memory image is finalized. Pairs
+ * with the smp_wmb() in tdx_td_finalize().
+ */
+ smp_rmb();
+
+ /*
+ * If the TD isn't finalized/runnable, then userspace is initializing
+ * the VM image via KVM_TDX_INIT_MEM_REGION; ADD the page to the TD.
+ */
+ if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE))
+ return tdx_mem_page_add(kvm, gfn, level, pfn);
+
+ return tdx_mem_page_aug(kvm, gfn, level, pfn);
+}
+
+static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, void *private_spt)
+{
+ int tdx_level = pg_level_to_tdx_sept_level(level);
+ gpa_t gpa = gfn_to_gpa(gfn);
+ struct page *page = virt_to_page(private_spt);
+ u64 err, entry, level_state;
+
+ err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gpa, tdx_level, page, &entry,
+ &level_state);
+ if (unlikely(tdx_operand_busy(err)))
+ return -EBUSY;
+
+ if (TDX_BUG_ON_2(err, TDH_MEM_SEPT_ADD, entry, level_state, kvm))
+ return -EIO;
+
+ return 0;
+}
+
+/*
+ * Ensure the shared and private EPTs are flushed on all vCPUs.
+ * tdh_mem_track() is the only caller that increases the TD epoch. An increase
+ * in the TD epoch (e.g., to value "N + 1") is successful only if no vCPUs are
+ * running in guest mode with the value "N - 1".
+ *
+ * A successful execution of tdh_mem_track() ensures that vCPUs can only run in
+ * guest mode with TD epoch value "N" if no TD exit occurs after the TD epoch
+ * is increased to "N + 1".
+ *
+ * Kicking off all vCPUs after that further ensures that no vCPU can run in
+ * guest mode with TD epoch value "N", which unblocks the next tdh_mem_track()
+ * (e.g. to increase the TD epoch to "N + 2").
+ *
+ * The TDX module will flush the EPT on the next TD enter and make vCPUs run in
+ * guest mode with TD epoch value "N + 1".
+ *
+ * kvm_make_all_cpus_request() guarantees all vCPUs are out of guest mode by
+ * waiting for the empty IPI handler ack_kick().
+ *
+ * No action is required for the vCPUs being kicked off, since the kick-off
+ * certainly occurs after the TD epoch increment and before the next
+ * tdh_mem_track().
+ */
+static void tdx_track(struct kvm *kvm)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ u64 err;
+
+	/* If the TD isn't finalized, no vCPU has run yet. */
+ if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE))
+ return;
+
+ /*
+ * The full sequence of TDH.MEM.TRACK and forcing vCPUs out of guest
+ * mode must be serialized, as TDH.MEM.TRACK will fail if the previous
+ * tracking epoch hasn't completed.
+ */
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ err = tdh_do_no_vcpus(tdh_mem_track, kvm, &kvm_tdx->td);
+ TDX_BUG_ON(err, TDH_MEM_TRACK, kvm);
+
+ kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
+}
+
+static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, void *private_spt)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+
+ /*
+	 * free_external_spt() is only called after the hkid has been freed,
+	 * when the TD is being torn down.
+	 * KVM doesn't (yet) zap page table pages in the mirror page table while
+	 * the TD is active, though guest pages mapped in the mirror page table
+	 * could be zapped while the TD is active, e.g. for shared <-> private
+	 * conversion and slot move/deletion.
+ */
+ if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm))
+ return -EIO;
+
+ /*
+ * The HKID assigned to this TD was already freed and cache was
+ * already flushed. We don't have to flush again.
+ */
+ return tdx_reclaim_page(virt_to_page(private_spt));
+}
+
+static void tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, u64 mirror_spte)
+{
+ struct page *page = pfn_to_page(spte_to_pfn(mirror_spte));
+ int tdx_level = pg_level_to_tdx_sept_level(level);
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ gpa_t gpa = gfn_to_gpa(gfn);
+ u64 err, entry, level_state;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ /*
+ * HKID is released after all private pages have been removed, and set
+ * before any might be populated. Warn if zapping is attempted when
+ * there can't be anything populated in the private EPT.
+ */
+ if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm))
+ return;
+
+ /* TODO: handle large pages. */
+ if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
+ return;
+
+ err = tdh_do_no_vcpus(tdh_mem_range_block, kvm, &kvm_tdx->td, gpa,
+ tdx_level, &entry, &level_state);
+ if (TDX_BUG_ON_2(err, TDH_MEM_RANGE_BLOCK, entry, level_state, kvm))
+ return;
+
+ /*
+ * TDX requires TLB tracking before dropping private page. Do
+ * it here, although it is also done later.
+ */
+ tdx_track(kvm);
+
+ /*
+	 * When zapping a private page, the write lock is held, so there is no
+	 * race with other vCPU SEPT operations. Races with TDH.VP.ENTER (due
+	 * to 0-step mitigation) and guest TDCALLs are still possible.
+ */
+ err = tdh_do_no_vcpus(tdh_mem_page_remove, kvm, &kvm_tdx->td, gpa,
+ tdx_level, &entry, &level_state);
+ if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_REMOVE, entry, level_state, kvm))
+ return;
+
+ err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page);
+ if (TDX_BUG_ON(err, TDH_PHYMEM_PAGE_WBINVD, kvm))
+ return;
+
+ tdx_quirk_reset_page(page);
+}
+
+void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
+ int trig_mode, int vector)
+{
+ struct kvm_vcpu *vcpu = apic->vcpu;
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+
+	/* TDX supports only posted interrupts. No local APIC emulation. */
+ __vmx_deliver_posted_interrupt(vcpu, &tdx->vt.pi_desc, vector);
+
+ trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
+}
+
+static inline bool tdx_is_sept_violation_unexpected_pending(struct kvm_vcpu *vcpu)
+{
+ u64 eeq_type = to_tdx(vcpu)->ext_exit_qualification & TDX_EXT_EXIT_QUAL_TYPE_MASK;
+ u64 eq = vmx_get_exit_qual(vcpu);
+
+ if (eeq_type != TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION)
+ return false;
+
+ return !(eq & EPT_VIOLATION_PROT_MASK) && !(eq & EPT_VIOLATION_EXEC_FOR_RING3_LIN);
+}
+
+static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qual;
+ gpa_t gpa = to_tdx(vcpu)->exit_gpa;
+ bool local_retry = false;
+ int ret;
+
+ if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
+ if (tdx_is_sept_violation_unexpected_pending(vcpu)) {
+ pr_warn("Guest access before accepting 0x%llx on vCPU %d\n",
+ gpa, vcpu->vcpu_id);
+ kvm_vm_dead(vcpu->kvm);
+ return -EIO;
+ }
+ /*
+ * Always treat SEPT violations as write faults. Ignore the
+ * EXIT_QUALIFICATION reported by TDX-SEAM for SEPT violations.
+ * TD private pages are always RWX in the SEPT tables,
+ * i.e. they're always mapped writable. Just as importantly,
+ * treating SEPT violations as write faults is necessary to
+ * avoid COW allocations, which will cause TDAUGPAGE failures
+ * due to aliasing a single HPA to multiple GPAs.
+ */
+ exit_qual = EPT_VIOLATION_ACC_WRITE;
+
+ /* Only private GPA triggers zero-step mitigation */
+ local_retry = true;
+ } else {
+ exit_qual = vmx_get_exit_qual(vcpu);
+ /*
+ * EPT violation due to instruction fetch should never be
+ * triggered from shared memory in TDX guest. If such EPT
+ * violation occurs, treat it as broken hardware.
+ */
+ if (KVM_BUG_ON(exit_qual & EPT_VIOLATION_ACC_INSTR, vcpu->kvm))
+ return -EIO;
+ }
+
+ trace_kvm_page_fault(vcpu, gpa, exit_qual);
+
+ /*
+ * To minimize TDH.VP.ENTER invocations, retry locally for private GPA
+ * mapping in TDX.
+ *
+ * KVM may return RET_PF_RETRY for private GPA due to
+ * - contentions when atomically updating SPTEs of the mirror page table
+ * - in-progress GFN invalidation or memslot removal.
+ * - TDX_OPERAND_BUSY error from TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD,
+ * caused by contentions with TDH.VP.ENTER (with zero-step mitigation)
+ * or certain TDCALLs.
+ *
+ * If TDH.VP.ENTER is invoked more times than the threshold set by the
+ * TDX module before KVM resolves the private GPA mapping, the TDX
+ * module will activate zero-step mitigation during TDH.VP.ENTER. This
+ * process acquires an SEPT tree lock in the TDX module, leading to
+ * further contentions with TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD
+ * operations on other vCPUs.
+ *
+ * Breaking out of local retries for kvm_vcpu_has_events() is for
+ * interrupt injection. kvm_vcpu_has_events() should not see pending
+ * events for TDX. Since KVM can't determine if IRQs (or NMIs) are
+	 * blocked by TDs, false positives are inevitable, i.e., KVM may re-enter
+ * the guest even if the IRQ/NMI can't be delivered.
+ *
+ * Note: even without breaking out of local retries, zero-step
+ * mitigation may still occur due to
+	 *   - invocation of TDH.VP.ENTER after KVM_EXIT_MEMORY_FAULT,
+ * - a single RIP causing EPT violations for more GFNs than the
+ * threshold count.
+ * This is safe, as triggering zero-step mitigation only introduces
+ * contentions to page installation SEAMCALLs on other vCPUs, which will
+ * handle retries locally in their EPT violation handlers.
+ */
+ while (1) {
+ struct kvm_memory_slot *slot;
+
+ ret = __vmx_handle_ept_violation(vcpu, gpa, exit_qual);
+
+ if (ret != RET_PF_RETRY || !local_retry)
+ break;
+
+ if (kvm_vcpu_has_events(vcpu) || signal_pending(current))
+ break;
+
+ if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) {
+ ret = -EIO;
+ break;
+ }
+
+ /*
+ * Bail if the memslot is invalid, i.e. is being deleted, as
+ * faulting in will never succeed and this task needs to drop
+ * SRCU in order to let memslot deletion complete.
+ */
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gpa_to_gfn(gpa));
+ if (slot && slot->flags & KVM_MEMSLOT_INVALID)
+ break;
+
+ cond_resched();
+ }
+ return ret;
+}
+
+int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
+{
+ if (err) {
+ tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
+ return 1;
+ }
+
+ if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MSR_READ)
+ tdvmcall_set_return_val(vcpu, kvm_read_edx_eax(vcpu));
+
+ return 1;
+}
+
+int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ u64 vp_enter_ret = tdx->vp_enter_ret;
+ union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);
+
+ if (fastpath != EXIT_FASTPATH_NONE)
+ return 1;
+
+ if (unlikely(vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) {
+ KVM_BUG_ON(1, vcpu->kvm);
+ return -EIO;
+ }
+
+ /*
+ * Handle TDX SW errors, including TDX_SEAMCALL_UD, TDX_SEAMCALL_GP and
+ * TDX_SEAMCALL_VMFAILINVALID.
+ */
+ if (unlikely((vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) {
+ KVM_BUG_ON(!kvm_rebooting, vcpu->kvm);
+ goto unhandled_exit;
+ }
+
+ if (unlikely(tdx_failed_vmentry(vcpu))) {
+ /*
+ * If the guest state is protected, that means off-TD debug is
+		 * not enabled, so TDX_NON_RECOVERABLE must be set.
+ */
+ WARN_ON_ONCE(vcpu->arch.guest_state_protected &&
+ !(vp_enter_ret & TDX_NON_RECOVERABLE));
+ vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+ vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason.full;
+ vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
+ return 0;
+ }
+
+ if (unlikely(vp_enter_ret & (TDX_ERROR | TDX_NON_RECOVERABLE)) &&
+ exit_reason.basic != EXIT_REASON_TRIPLE_FAULT) {
+ kvm_pr_unimpl("TD vp_enter_ret 0x%llx\n", vp_enter_ret);
+ goto unhandled_exit;
+ }
+
+ WARN_ON_ONCE(exit_reason.basic != EXIT_REASON_TRIPLE_FAULT &&
+ (vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) != TDX_SUCCESS);
+
+ switch (exit_reason.basic) {
+ case EXIT_REASON_TRIPLE_FAULT:
+ vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
+ vcpu->mmio_needed = 0;
+ return 0;
+ case EXIT_REASON_EXCEPTION_NMI:
+ return tdx_handle_exception_nmi(vcpu);
+ case EXIT_REASON_EXTERNAL_INTERRUPT:
+ ++vcpu->stat.irq_exits;
+ return 1;
+ case EXIT_REASON_CPUID:
+ return tdx_emulate_cpuid(vcpu);
+ case EXIT_REASON_HLT:
+ return kvm_emulate_halt_noskip(vcpu);
+ case EXIT_REASON_TDCALL:
+ return handle_tdvmcall(vcpu);
+ case EXIT_REASON_VMCALL:
+ return tdx_emulate_vmcall(vcpu);
+ case EXIT_REASON_IO_INSTRUCTION:
+ return tdx_emulate_io(vcpu);
+ case EXIT_REASON_MSR_READ:
+ kvm_rcx_write(vcpu, tdx->vp_enter_args.r12);
+ return kvm_emulate_rdmsr(vcpu);
+ case EXIT_REASON_MSR_WRITE:
+ kvm_rcx_write(vcpu, tdx->vp_enter_args.r12);
+ kvm_rax_write(vcpu, tdx->vp_enter_args.r13 & -1u);
+ kvm_rdx_write(vcpu, tdx->vp_enter_args.r13 >> 32);
+ return kvm_emulate_wrmsr(vcpu);
+ case EXIT_REASON_EPT_MISCONFIG:
+ return tdx_emulate_mmio(vcpu);
+ case EXIT_REASON_EPT_VIOLATION:
+ return tdx_handle_ept_violation(vcpu);
+ case EXIT_REASON_OTHER_SMI:
+ /*
+		 * Unlike VMX, an SMI in SEAM non-root mode (i.e. while a
+		 * TD guest vCPU is running) causes a VM exit to the TDX module,
+		 * followed by a SEAMRET to KVM. Once it exits to KVM, the SMI
+		 * is delivered and handled by the kernel handler right away.
+		 *
+		 * The Other SMI exit can also be caused by a SEAM non-root
+		 * machine check delivered via a Machine Check System Management
+		 * Interrupt (MSMI), but it has already been handled by the
+		 * kernel machine check handler, i.e., the memory page has been
+		 * marked as poisoned and won't be freed to the free list
+		 * when the TDX guest is terminated (the TDX module marks the
+		 * guest as dead and prevents it from running further when a
+		 * machine check happens in SEAM non-root mode).
+		 *
+		 * - An MSMI will not reach here; it's handled as the
+		 * non-recoverable case above.
+		 * - If it's not an MSMI, nothing needs to be done here.
+ */
+ return 1;
+ default:
+ break;
+ }
+
+unhandled_exit:
+ kvm_prepare_unexpected_reason_exit(vcpu, vp_enter_ret);
+ return 0;
+}
+
+void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
+ u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+
+ *reason = tdx->vt.exit_reason.full;
+ if (*reason != -1u) {
+ *info1 = vmx_get_exit_qual(vcpu);
+ *info2 = tdx->ext_exit_qualification;
+ *intr_info = vmx_get_intr_info(vcpu);
+ } else {
+ *info1 = 0;
+ *info2 = 0;
+ *intr_info = 0;
+ }
+
+ *error_code = 0;
+}
+
+bool tdx_has_emulated_msr(u32 index)
+{
+ switch (index) {
+ case MSR_IA32_UCODE_REV:
+ case MSR_IA32_ARCH_CAPABILITIES:
+ case MSR_IA32_POWER_CTL:
+ case MSR_IA32_CR_PAT:
+ case MSR_MTRRcap:
+ case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000:
+ case MSR_MTRRdefType:
+ case MSR_IA32_TSC_DEADLINE:
+ case MSR_IA32_MISC_ENABLE:
+ case MSR_PLATFORM_INFO:
+ case MSR_MISC_FEATURES_ENABLES:
+ case MSR_IA32_APICBASE:
+ case MSR_EFER:
+ case MSR_IA32_FEAT_CTL:
+ case MSR_IA32_MCG_CAP:
+ case MSR_IA32_MCG_STATUS:
+ case MSR_IA32_MCG_CTL:
+ case MSR_IA32_MCG_EXT_CTL:
+ case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
+ case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
+ /* MSR_IA32_MCx_{CTL, STATUS, ADDR, MISC, CTL2} */
+ case MSR_KVM_POLL_CONTROL:
+ return true;
+ case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
+ /*
+ * x2APIC registers that are virtualized by the CPU can't be
+ * emulated, KVM doesn't have access to the virtual APIC page.
+ */
+ switch (index) {
+ case X2APIC_MSR(APIC_TASKPRI):
+ case X2APIC_MSR(APIC_PROCPRI):
+ case X2APIC_MSR(APIC_EOI):
+ case X2APIC_MSR(APIC_ISR) ... X2APIC_MSR(APIC_ISR + APIC_ISR_NR):
+ case X2APIC_MSR(APIC_TMR) ... X2APIC_MSR(APIC_TMR + APIC_ISR_NR):
+ case X2APIC_MSR(APIC_IRR) ... X2APIC_MSR(APIC_IRR + APIC_ISR_NR):
+ return false;
+ default:
+ return true;
+ }
+ default:
+ return false;
+ }
+}
+
+static bool tdx_is_read_only_msr(u32 index)
+{
+ return index == MSR_IA32_APICBASE || index == MSR_EFER ||
+ index == MSR_IA32_FEAT_CTL;
+}
+
+int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
+{
+ switch (msr->index) {
+ case MSR_IA32_FEAT_CTL:
+ /*
+		 * MCE and MCA are advertised via CPUID. The guest kernel can
+		 * check whether LMCE is enabled.
+ */
+ msr->data = FEAT_CTL_LOCKED;
+ if (vcpu->arch.mcg_cap & MCG_LMCE_P)
+ msr->data |= FEAT_CTL_LMCE_ENABLED;
+ return 0;
+ case MSR_IA32_MCG_EXT_CTL:
+ if (!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P))
+ return 1;
+ msr->data = vcpu->arch.mcg_ext_ctl;
+ return 0;
+ default:
+ if (!tdx_has_emulated_msr(msr->index))
+ return 1;
+
+ return kvm_get_msr_common(vcpu, msr);
+ }
+}
+
+int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
+{
+ switch (msr->index) {
+ case MSR_IA32_MCG_EXT_CTL:
+ if ((!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) ||
+ (msr->data & ~MCG_EXT_CTL_LMCE_EN))
+ return 1;
+ vcpu->arch.mcg_ext_ctl = msr->data;
+ return 0;
+ default:
+ if (tdx_is_read_only_msr(msr->index))
+ return 1;
+
+ if (!tdx_has_emulated_msr(msr->index))
+ return 1;
+
+ return kvm_set_msr_common(vcpu, msr);
+ }
+}
+
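+/*
+ * Handle KVM_TDX_CAPABILITIES: report the TDX module's supported CPUID
+ * configuration to userspace.  Fails with -E2BIG if the caller's buffer has
+ * fewer CPUID entries than the TDX module reports.
+ */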
+static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
+{
+ const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
+ struct kvm_tdx_capabilities __user *user_caps;
+ struct kvm_tdx_capabilities *caps = NULL;
+ u32 nr_user_entries;
+ int ret = 0;
+
+ /* flags is reserved for future use */
+ if (cmd->flags)
+ return -EINVAL;
+
+ user_caps = u64_to_user_ptr(cmd->data);
+ if (get_user(nr_user_entries, &user_caps->cpuid.nent))
+ return -EFAULT;
+
+ if (nr_user_entries < td_conf->num_cpuid_config)
+ return -E2BIG;
+
+ caps = kzalloc(struct_size(caps, cpuid.entries,
+ td_conf->num_cpuid_config), GFP_KERNEL);
+ if (!caps)
+ return -ENOMEM;
+
+ ret = init_kvm_tdx_caps(td_conf, caps);
+ if (ret)
+ goto out;
+
+ if (copy_to_user(user_caps, caps, struct_size(caps, cpuid.entries,
+ caps->cpuid.nent))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+out:
+ /* kfree() accepts NULL. */
+ kfree(caps);
+ return ret;
+}
+
+/*
+ * KVM reports the guest physical address width in CPUID.0x80000008.EAX[23:16],
+ * which is similar to TDX's GPAW. Use this field as the interface for
+ * userspace to configure the GPAW and EPT level for TDs.
+ *
+ * Only values 48 and 52 are supported. Value 52 means GPAW-52 and EPT level
+ * 5; value 48 means GPAW-48 and EPT level 4. GPAW-48 is always supported,
+ * while value 52 is only supported when the platform supports 5-level EPT.
+ */
+static int setup_tdparams_eptp_controls(struct kvm_cpuid2 *cpuid,
+ struct td_params *td_params)
+{
+ const struct kvm_cpuid_entry2 *entry;
+ int guest_pa;
+
+ entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0x80000008, 0);
+ if (!entry)
+ return -EINVAL;
+
+ guest_pa = tdx_get_guest_phys_addr_bits(entry->eax);
+
+ if (guest_pa != 48 && guest_pa != 52)
+ return -EINVAL;
+
+ if (guest_pa == 52 && !cpu_has_vmx_ept_5levels())
+ return -EINVAL;
+
+ td_params->eptp_controls = VMX_EPTP_MT_WB;
+ if (guest_pa == 52) {
+ td_params->eptp_controls |= VMX_EPTP_PWL_5;
+ td_params->config_flags |= TDX_CONFIG_FLAGS_MAX_GPAW;
+ } else {
+ td_params->eptp_controls |= VMX_EPTP_PWL_4;
+ }
+
+ return 0;
+}
+
+static int setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid,
+ struct td_params *td_params)
+{
+ const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
+ const struct kvm_cpuid_entry2 *entry;
+ struct tdx_cpuid_value *value;
+ int i, copy_cnt = 0;
+
+ /*
+	 * td_params.cpuid_values: The number and order of cpuid_value entries
+	 * must match those of struct tdsysinfo.{num_cpuid_config, cpuid_configs}.
+ * It's assumed that td_params was zeroed.
+ */
+ for (i = 0; i < td_conf->num_cpuid_config; i++) {
+ struct kvm_cpuid_entry2 tmp;
+
+ td_init_cpuid_entry2(&tmp, i);
+
+ entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent,
+ tmp.function, tmp.index);
+ if (!entry)
+ continue;
+
+ if (tdx_unsupported_cpuid(entry))
+ return -EINVAL;
+
+ copy_cnt++;
+
+ value = &td_params->cpuid_values[i];
+ value->eax = entry->eax;
+ value->ebx = entry->ebx;
+ value->ecx = entry->ecx;
+ value->edx = entry->edx;
+
+ /*
+ * TDX module does not accept nonzero bits 16..23 for the
+ * CPUID[0x80000008].EAX, see setup_tdparams_eptp_controls().
+ */
+ if (tmp.function == 0x80000008)
+ value->eax = tdx_set_guest_phys_addr_bits(value->eax, 0);
+ }
+
+ /*
+ * Rely on the TDX module to reject invalid configuration, but it can't
+	 * check leaves that don't have a proper slot in td_params->cpuid_values
+	 * to stick them in. So fail if there were entries that didn't get copied
+	 * to td_params.
+ */
+ if (copy_cnt != cpuid->nent)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int setup_tdparams(struct kvm *kvm, struct td_params *td_params,
+ struct kvm_tdx_init_vm *init_vm)
+{
+ const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
+ struct kvm_cpuid2 *cpuid = &init_vm->cpuid;
+ int ret;
+
+ if (kvm->created_vcpus)
+ return -EBUSY;
+
+ if (init_vm->attributes & ~tdx_get_supported_attrs(td_conf))
+ return -EINVAL;
+
+ if (init_vm->xfam & ~tdx_get_supported_xfam(td_conf))
+ return -EINVAL;
+
+ td_params->max_vcpus = kvm->max_vcpus;
+ td_params->attributes = init_vm->attributes | td_conf->attributes_fixed1;
+ td_params->xfam = init_vm->xfam | td_conf->xfam_fixed1;
+
+ td_params->config_flags = TDX_CONFIG_FLAGS_NO_RBP_MOD;
+ td_params->tsc_frequency = TDX_TSC_KHZ_TO_25MHZ(kvm->arch.default_tsc_khz);
+
+ ret = setup_tdparams_eptp_controls(cpuid, td_params);
+ if (ret)
+ return ret;
+
+ ret = setup_tdparams_cpuids(cpuid, td_params);
+ if (ret)
+ return ret;
+
+#define MEMCPY_SAME_SIZE(dst, src) \
+ do { \
+ BUILD_BUG_ON(sizeof(dst) != sizeof(src)); \
+ memcpy((dst), (src), sizeof(dst)); \
+ } while (0)
+
+ MEMCPY_SAME_SIZE(td_params->mrconfigid, init_vm->mrconfigid);
+ MEMCPY_SAME_SIZE(td_params->mrowner, init_vm->mrowner);
+ MEMCPY_SAME_SIZE(td_params->mrownerconfig, init_vm->mrownerconfig);
+
+ return 0;
+}
+
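+/*
+ * Create the TD control structures: allocate a host key ID and the TDR/TDCS
+ * pages, create the TD via TDH.MNG.CREATE, program the key on one CPU of
+ * every package via TDH.MNG.KEY.CONFIG, add the TDCS pages with
+ * TDH.MNG.ADDCX, and finally initialize the TD with TDH.MNG.INIT.
+ */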
+static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params,
+ u64 *seamcall_err)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ cpumask_var_t packages;
+ struct page **tdcs_pages = NULL;
+ struct page *tdr_page;
+ int ret, i;
+ u64 err, rcx;
+
+ *seamcall_err = 0;
+ ret = tdx_guest_keyid_alloc();
+ if (ret < 0)
+ return ret;
+ kvm_tdx->hkid = ret;
+ kvm_tdx->misc_cg = get_current_misc_cg();
+ ret = misc_cg_try_charge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
+ if (ret)
+ goto free_hkid;
+
+ ret = -ENOMEM;
+
+ atomic_inc(&nr_configured_hkid);
+
+ tdr_page = alloc_page(GFP_KERNEL);
+ if (!tdr_page)
+ goto free_hkid;
+
+ kvm_tdx->td.tdcs_nr_pages = tdx_sysinfo->td_ctrl.tdcs_base_size / PAGE_SIZE;
+ /* TDVPS = TDVPR(4K page) + TDCX(multiple 4K pages), -1 for TDVPR. */
+ kvm_tdx->td.tdcx_nr_pages = tdx_sysinfo->td_ctrl.tdvps_base_size / PAGE_SIZE - 1;
+ tdcs_pages = kcalloc(kvm_tdx->td.tdcs_nr_pages, sizeof(*kvm_tdx->td.tdcs_pages),
+ GFP_KERNEL);
+ if (!tdcs_pages)
+ goto free_tdr;
+
+ for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
+ tdcs_pages[i] = alloc_page(GFP_KERNEL);
+ if (!tdcs_pages[i])
+ goto free_tdcs;
+ }
+
+ if (!zalloc_cpumask_var(&packages, GFP_KERNEL))
+ goto free_tdcs;
+
+ cpus_read_lock();
+
+ /*
+	 * At least one CPU of each package must be online in order to
+	 * program the host key ID on all packages. Check it.
+ */
+ for_each_present_cpu(i)
+ cpumask_set_cpu(topology_physical_package_id(i), packages);
+ for_each_online_cpu(i)
+ cpumask_clear_cpu(topology_physical_package_id(i), packages);
+ if (!cpumask_empty(packages)) {
+ ret = -EIO;
+ /*
+		 * Because it's hard for a human operator to figure out the
+		 * reason, print a warning.
+ */
+#define MSG_ALLPKG "All packages need to have online CPU to create TD. Online CPU and retry.\n"
+ pr_warn_ratelimited(MSG_ALLPKG);
+ goto free_packages;
+ }
+
+ /*
+	 * TDH.MNG.CREATE tries to grab the TDX module globally and fails
+	 * with TDX_OPERAND_BUSY when it can't. Take the global lock to
+	 * prevent such failures.
+ */
+ mutex_lock(&tdx_lock);
+ kvm_tdx->td.tdr_page = tdr_page;
+ err = tdh_mng_create(&kvm_tdx->td, kvm_tdx->hkid);
+ mutex_unlock(&tdx_lock);
+
+ if (err == TDX_RND_NO_ENTROPY) {
+ ret = -EAGAIN;
+ goto free_packages;
+ }
+
+ if (TDX_BUG_ON(err, TDH_MNG_CREATE, kvm)) {
+ ret = -EIO;
+ goto free_packages;
+ }
+
+ for_each_online_cpu(i) {
+ int pkg = topology_physical_package_id(i);
+
+ if (cpumask_test_and_set_cpu(pkg, packages))
+ continue;
+
+ /*
+ * Program the memory controller in the package with an
+		 * encryption key associated with the TDX private host key id
+		 * assigned to this TDR. Concurrent operations on the same memory
+		 * controller result in TDX_OPERAND_BUSY. No locking is needed
+ * beyond the cpus_read_lock() above as it serializes against
+ * hotplug and the first online CPU of the package is always
+ * used. We never have two CPUs in the same socket trying to
+ * program the key.
+ */
+ ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config,
+ kvm_tdx, true);
+ if (ret)
+ break;
+ }
+ cpus_read_unlock();
+ free_cpumask_var(packages);
+ if (ret) {
+ i = 0;
+ goto teardown;
+ }
+
+ kvm_tdx->td.tdcs_pages = tdcs_pages;
+ for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
+ err = tdh_mng_addcx(&kvm_tdx->td, tdcs_pages[i]);
+ if (err == TDX_RND_NO_ENTROPY) {
+ /* Here it's hard to allow userspace to retry. */
+ ret = -EAGAIN;
+ goto teardown;
+ }
+ if (TDX_BUG_ON(err, TDH_MNG_ADDCX, kvm)) {
+ ret = -EIO;
+ goto teardown;
+ }
+ }
+
+ err = tdh_mng_init(&kvm_tdx->td, __pa(td_params), &rcx);
+ if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_INVALID) {
+ /*
+		 * Because the operands come from the user, don't warn.
+		 * Return a hint to the user because it's sometimes hard for the
+		 * user to figure out which operand is invalid. The SEAMCALL
+		 * status code encodes which operand caused the invalid operand
+		 * error.
+ */
+ *seamcall_err = err;
+ ret = -EINVAL;
+ goto teardown;
+ } else if (TDX_BUG_ON_1(err, TDH_MNG_INIT, rcx, kvm)) {
+ ret = -EIO;
+ goto teardown;
+ }
+
+ return 0;
+
+ /*
+ * The sequence for freeing resources from a partially initialized TD
+ * varies based on where in the initialization flow failure occurred.
+ * Simply use the full teardown and destroy, which naturally play nice
+ * with partial initialization.
+ */
+teardown:
+ /* Only free pages not yet added, so start at 'i' */
+ for (; i < kvm_tdx->td.tdcs_nr_pages; i++) {
+ if (tdcs_pages[i]) {
+ __free_page(tdcs_pages[i]);
+ tdcs_pages[i] = NULL;
+ }
+ }
+ if (!kvm_tdx->td.tdcs_pages)
+ kfree(tdcs_pages);
+
+ tdx_mmu_release_hkid(kvm);
+ tdx_reclaim_td_control_pages(kvm);
+
+ return ret;
+
+free_packages:
+ cpus_read_unlock();
+ free_cpumask_var(packages);
+
+free_tdcs:
+ for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
+ if (tdcs_pages[i])
+ __free_page(tdcs_pages[i]);
+ }
+ kfree(tdcs_pages);
+ kvm_tdx->td.tdcs_pages = NULL;
+
+free_tdr:
+ if (tdr_page)
+ __free_page(tdr_page);
+ kvm_tdx->td.tdr_page = NULL;
+
+free_hkid:
+ tdx_hkid_free(kvm_tdx);
+
+ return ret;
+}
+
+static u64 tdx_td_metadata_field_read(struct kvm_tdx *tdx, u64 field_id,
+ u64 *data)
+{
+ u64 err;
+
+ err = tdh_mng_rd(&tdx->td, field_id, data);
+
+ return err;
+}
+
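+/*
+ * Leaf/sub-leaf bits that cannot be encoded in the CPUID_VALUES metadata
+ * field ID and therefore cannot be read from the TDX module; see the field
+ * ID layout described in tdx_read_cpuid().
+ */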
+#define TDX_MD_UNREADABLE_LEAF_MASK GENMASK(30, 7)
+#define TDX_MD_UNREADABLE_SUBLEAF_MASK GENMASK(31, 7)
+
+static int tdx_read_cpuid(struct kvm_vcpu *vcpu, u32 leaf, u32 sub_leaf,
+ bool sub_leaf_set, int *entry_index,
+ struct kvm_cpuid_entry2 *out)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+ u64 field_id = TD_MD_FIELD_ID_CPUID_VALUES;
+ u64 ebx_eax, edx_ecx;
+ u64 err = 0;
+
+ if (sub_leaf > 0b1111111)
+ return -EINVAL;
+
+ if (*entry_index >= KVM_MAX_CPUID_ENTRIES)
+ return -EINVAL;
+
+ if (leaf & TDX_MD_UNREADABLE_LEAF_MASK ||
+ sub_leaf & TDX_MD_UNREADABLE_SUBLEAF_MASK)
+ return -EINVAL;
+
+ /*
+	 * bit 23:17, RESERVED: reserved, must be 0;
+ * bit 16, LEAF_31: leaf number bit 31;
+ * bit 15:9, LEAF_6_0: leaf number bits 6:0, leaf bits 30:7 are
+ * implicitly 0;
+ * bit 8, SUBLEAF_NA: sub-leaf not applicable flag;
+ * bit 7:1, SUBLEAF_6_0: sub-leaf number bits 6:0. If SUBLEAF_NA is 1,
+ * the SUBLEAF_6_0 is all-1.
+ * sub-leaf bits 31:7 are implicitly 0;
+ * bit 0, ELEMENT_I: Element index within field;
+ */
+ field_id |= ((leaf & 0x80000000) ? 1 : 0) << 16;
+ field_id |= (leaf & 0x7f) << 9;
+ if (sub_leaf_set)
+ field_id |= (sub_leaf & 0x7f) << 1;
+ else
+ field_id |= 0x1fe;
+
+ err = tdx_td_metadata_field_read(kvm_tdx, field_id, &ebx_eax);
+ if (err) //TODO check for specific errors
+ goto err_out;
+
+ out->eax = (u32) ebx_eax;
+ out->ebx = (u32) (ebx_eax >> 32);
+
+ field_id++;
+ err = tdx_td_metadata_field_read(kvm_tdx, field_id, &edx_ecx);
+ /*
+	 * It's unexpected for reading edx_ecx to fail when reading ebx_eax
+	 * succeeded.
+ */
+ if (WARN_ON_ONCE(err))
+ goto err_out;
+
+ out->ecx = (u32) edx_ecx;
+ out->edx = (u32) (edx_ecx >> 32);
+
+ out->function = leaf;
+ out->index = sub_leaf;
+ out->flags |= sub_leaf_set ? KVM_CPUID_FLAG_SIGNIFCANT_INDEX : 0;
+
+ /*
+ * Work around missing support on old TDX modules, fetch
+ * guest maxpa from gfn_direct_bits.
+ */
+ if (leaf == 0x80000008) {
+ gpa_t gpa_bits = gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
+ unsigned int g_maxpa = __ffs(gpa_bits) + 1;
+
+ out->eax = tdx_set_guest_phys_addr_bits(out->eax, g_maxpa);
+ }
+
+ (*entry_index)++;
+
+ return 0;
+
+err_out:
+ out->eax = 0;
+ out->ebx = 0;
+ out->ecx = 0;
+ out->edx = 0;
+
+ return -EIO;
+}
+
+typedef void *tdx_vm_state_guard_t;
+
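+/*
+ * Serialize VM-scoped TDX ioctls against vCPU creation and vCPU ioctls by
+ * taking kvm->lock, every vcpu->mutex and kvm->slots_lock.  Fails with
+ * -EBUSY if a vCPU is still being created.
+ */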
+static tdx_vm_state_guard_t tdx_acquire_vm_state_locks(struct kvm *kvm)
+{
+ int r;
+
+ mutex_lock(&kvm->lock);
+
+ if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus)) {
+ r = -EBUSY;
+ goto out_err;
+ }
+
+ r = kvm_lock_all_vcpus(kvm);
+ if (r)
+ goto out_err;
+
+ /*
+ * Note the unintuitive ordering! vcpu->mutex must be taken outside
+ * kvm->slots_lock!
+ */
+ mutex_lock(&kvm->slots_lock);
+ return kvm;
+
+out_err:
+ mutex_unlock(&kvm->lock);
+ return ERR_PTR(r);
+}
+
+static void tdx_release_vm_state_locks(struct kvm *kvm)
+{
+ mutex_unlock(&kvm->slots_lock);
+ kvm_unlock_all_vcpus(kvm);
+ mutex_unlock(&kvm->lock);
+}
+
+DEFINE_CLASS(tdx_vm_state_guard, tdx_vm_state_guard_t,
+ if (!IS_ERR(_T)) tdx_release_vm_state_locks(_T),
+ tdx_acquire_vm_state_locks(kvm), struct kvm *kvm);
+
+static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
+{
+ struct kvm_tdx_init_vm __user *user_data = u64_to_user_ptr(cmd->data);
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ struct kvm_tdx_init_vm *init_vm;
+ struct td_params *td_params = NULL;
+ u32 nr_user_entries;
+ int ret;
+
+ BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid));
+ BUILD_BUG_ON(sizeof(struct td_params) != 1024);
+
+ if (kvm_tdx->state != TD_STATE_UNINITIALIZED)
+ return -EINVAL;
+
+ if (cmd->flags)
+ return -EINVAL;
+
+ if (get_user(nr_user_entries, &user_data->cpuid.nent))
+ return -EFAULT;
+
+ if (nr_user_entries > KVM_MAX_CPUID_ENTRIES)
+ return -E2BIG;
+
+ init_vm = memdup_user(user_data,
+ struct_size(user_data, cpuid.entries, nr_user_entries));
+ if (IS_ERR(init_vm))
+ return PTR_ERR(init_vm);
+
+ if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (init_vm->cpuid.padding) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ td_params = kzalloc(sizeof(struct td_params), GFP_KERNEL);
+ if (!td_params) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = setup_tdparams(kvm, td_params, init_vm);
+ if (ret)
+ goto out;
+
+ ret = __tdx_td_init(kvm, td_params, &cmd->hw_error);
+ if (ret)
+ goto out;
+
+ kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET);
+ kvm_tdx->tsc_multiplier = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_MULTIPLIER);
+ kvm_tdx->attributes = td_params->attributes;
+ kvm_tdx->xfam = td_params->xfam;
+
+ if (td_params->config_flags & TDX_CONFIG_FLAGS_MAX_GPAW)
+ kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_5;
+ else
+ kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_4;
+
+ kvm_tdx->state = TD_STATE_INITIALIZED;
+out:
+ /* kfree() accepts NULL. */
+ kfree(init_vm);
+ kfree(td_params);
+
+ return ret;
+}
+
+void tdx_flush_tlb_current(struct kvm_vcpu *vcpu)
+{
+ /*
+	 * flush_tlb_current() is invoked the first time a vCPU runs or when
+	 * the root of the shared EPT is invalidated.
+	 * KVM only needs to flush the shared EPT because the TDX module
+	 * handles TLB invalidation for the private EPT in tdh_vp_enter().
+ *
+ * A single context invalidation for shared EPT can be performed here.
+ * However, this single context invalidation requires the private EPTP
+ * rather than the shared EPTP to flush shared EPT, as shared EPT uses
+ * private EPTP as its ASID for TLB invalidation.
+ *
+ * To avoid reading back private EPTP, perform a global invalidation for
+ * shared EPT instead to keep this function simple.
+ */
+ ept_sync_global();
+}
+
+void tdx_flush_tlb_all(struct kvm_vcpu *vcpu)
+{
+ /*
+ * TDX has called tdx_track() in tdx_sept_remove_private_spte() to
+ * ensure that private EPT will be flushed on the next TD enter. No need
+ * to call tdx_track() here again even when this callback is a result of
+ * zapping private EPT.
+ *
+ * Due to the lack of the context to determine which EPT has been
+ * affected by zapping, invoke invept() directly here for both shared
+ * EPT and private EPT for simplicity, though it's not necessary for
+ * private EPT.
+ */
+ ept_sync_global();
+}
+
+static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+
+ if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
+ return -EINVAL;
+
+ cmd->hw_error = tdh_mr_finalize(&kvm_tdx->td);
+ if (tdx_operand_busy(cmd->hw_error))
+ return -EBUSY;
+ if (TDX_BUG_ON(cmd->hw_error, TDH_MR_FINALIZE, kvm))
+ return -EIO;
+
+ kvm_tdx->state = TD_STATE_RUNNABLE;
+ /* TD_STATE_RUNNABLE must be set before 'pre_fault_allowed' */
+ smp_wmb();
+ kvm->arch.pre_fault_allowed = true;
+ return 0;
+}
+
+static int tdx_get_cmd(void __user *argp, struct kvm_tdx_cmd *cmd)
+{
+ if (copy_from_user(cmd, argp, sizeof(*cmd)))
+ return -EFAULT;
+
+ /*
+ * Userspace should never set hw_error. KVM writes hw_error to report
+ * hardware-defined error back to userspace.
+ */
+ if (cmd->hw_error)
+ return -EINVAL;
+
+ return 0;
+}
+
+int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
+{
+ struct kvm_tdx_cmd tdx_cmd;
+ int r;
+
+ r = tdx_get_cmd(argp, &tdx_cmd);
+ if (r)
+ return r;
+
+ if (tdx_cmd.id == KVM_TDX_CAPABILITIES)
+ return tdx_get_capabilities(&tdx_cmd);
+
+ CLASS(tdx_vm_state_guard, guard)(kvm);
+ if (IS_ERR(guard))
+ return PTR_ERR(guard);
+
+ switch (tdx_cmd.id) {
+ case KVM_TDX_INIT_VM:
+ r = tdx_td_init(kvm, &tdx_cmd);
+ break;
+ case KVM_TDX_FINALIZE_VM:
+ r = tdx_td_finalize(kvm, &tdx_cmd);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (copy_to_user(argp, &tdx_cmd, sizeof(struct kvm_tdx_cmd)))
+ return -EFAULT;
+
+ return r;
+}
+
+/* The VMM can pass 64 bits of auxiliary data to the vCPU via RCX for the guest BIOS. */
+static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ struct page *page;
+ int ret, i;
+ u64 err;
+
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+ tdx->vp.tdvpr_page = page;
+
+ /*
+ * page_to_phys() does not work in 'noinstr' code, like guest
+ * entry via tdh_vp_enter(). Precalculate and store it instead
+ * of doing it at runtime later.
+ */
+ tdx->vp.tdvpr_pa = page_to_phys(tdx->vp.tdvpr_page);
+
+ tdx->vp.tdcx_pages = kcalloc(kvm_tdx->td.tdcx_nr_pages, sizeof(*tdx->vp.tdcx_pages),
+ GFP_KERNEL);
+ if (!tdx->vp.tdcx_pages) {
+ ret = -ENOMEM;
+ goto free_tdvpr;
+ }
+
+ for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
+ page = alloc_page(GFP_KERNEL);
+ if (!page) {
+ ret = -ENOMEM;
+ goto free_tdcx;
+ }
+ tdx->vp.tdcx_pages[i] = page;
+ }
+
+ err = tdh_vp_create(&kvm_tdx->td, &tdx->vp);
+ if (TDX_BUG_ON(err, TDH_VP_CREATE, vcpu->kvm)) {
+ ret = -EIO;
+ goto free_tdcx;
+ }
+
+ for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
+ err = tdh_vp_addcx(&tdx->vp, tdx->vp.tdcx_pages[i]);
+ if (TDX_BUG_ON(err, TDH_VP_ADDCX, vcpu->kvm)) {
+ /*
+ * Pages already added are reclaimed by the vcpu_free
+ * method, but the rest are freed here.
+ */
+ for (; i < kvm_tdx->td.tdcx_nr_pages; i++) {
+ __free_page(tdx->vp.tdcx_pages[i]);
+ tdx->vp.tdcx_pages[i] = NULL;
+ }
+ return -EIO;
+ }
+ }
+
+ /*
+ * tdh_vp_init() can take an exclusive lock of the TDR resource inside
+ * the TDX-Module. The TDR resource is also taken as shared in several
+ * no-fail MMU paths, which could return TDX_OPERAND_BUSY on contention
+ * (TDX-Module locks are try-lock implementations with no slow path).
+ * Take mmu_lock for write to reflect the nature of the lock taken by
+ * the TDX-Module, and to ensure the no-fail MMU paths succeed, e.g. if
+ * a concurrent PUNCH_HOLE on guest_memfd triggers removal of SPTEs.
+ */
+ scoped_guard(write_lock, &vcpu->kvm->mmu_lock) {
+ err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id);
+ if (TDX_BUG_ON(err, TDH_VP_INIT, vcpu->kvm))
+ return -EIO;
+ }
+
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+
+ return 0;
+
+free_tdcx:
+ for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
+ if (tdx->vp.tdcx_pages[i])
+ __free_page(tdx->vp.tdcx_pages[i]);
+ tdx->vp.tdcx_pages[i] = NULL;
+ }
+ kfree(tdx->vp.tdcx_pages);
+ tdx->vp.tdcx_pages = NULL;
+
+free_tdvpr:
+ if (tdx->vp.tdvpr_page)
+ __free_page(tdx->vp.tdvpr_page);
+ tdx->vp.tdvpr_page = NULL;
+ tdx->vp.tdvpr_pa = 0;
+
+ return ret;
+}
+
+/* May read multiple sub-leaves. Advances *entry_index by the number of entries written. */
+static int tdx_vcpu_get_cpuid_leaf(struct kvm_vcpu *vcpu, u32 leaf, int *entry_index,
+ struct kvm_cpuid_entry2 *output_e)
+{
+ int sub_leaf = 0;
+ int ret;
+
+ /* First try without a subleaf */
+ ret = tdx_read_cpuid(vcpu, leaf, 0, false, entry_index, output_e);
+
+ /* If success, or invalid leaf, just give up */
+ if (ret != -EIO)
+ return ret;
+
+ /*
+	 * If the attempt without a sub-leaf failed, try reading sub-leaves until
+	 * a failure occurs. The metadata field ID only encodes sub-leaf bits 6:0.
+ */
+ while (1) {
+ /* Keep reading subleafs until there is a failure. */
+ if (tdx_read_cpuid(vcpu, leaf, sub_leaf, true, entry_index, output_e))
+ return !sub_leaf;
+
+ sub_leaf++;
+ output_e++;
+ }
+
+ return 0;
+}
+
+static int tdx_vcpu_get_cpuid(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
+{
+ struct kvm_cpuid2 __user *output;
+ struct kvm_cpuid2 *td_cpuid;
+ int r = 0, i = 0, leaf;
+ u32 level;
+
+ output = u64_to_user_ptr(cmd->data);
+ td_cpuid = kzalloc(sizeof(*td_cpuid) +
+ sizeof(output->entries[0]) * KVM_MAX_CPUID_ENTRIES,
+ GFP_KERNEL);
+ if (!td_cpuid)
+ return -ENOMEM;
+
+ if (copy_from_user(td_cpuid, output, sizeof(*output))) {
+ r = -EFAULT;
+ goto out;
+ }
+
+ /* Read max CPUID for normal range */
+ if (tdx_vcpu_get_cpuid_leaf(vcpu, 0, &i, &td_cpuid->entries[i])) {
+ r = -EIO;
+ goto out;
+ }
+ level = td_cpuid->entries[0].eax;
+
+ for (leaf = 1; leaf <= level; leaf++)
+ tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]);
+
+ /* Read max CPUID for extended range */
+ if (tdx_vcpu_get_cpuid_leaf(vcpu, 0x80000000, &i, &td_cpuid->entries[i])) {
+ r = -EIO;
+ goto out;
+ }
+ level = td_cpuid->entries[i - 1].eax;
+
+ for (leaf = 0x80000001; leaf <= level; leaf++)
+ tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]);
+
+ if (td_cpuid->nent < i)
+ r = -E2BIG;
+ td_cpuid->nent = i;
+
+ if (copy_to_user(output, td_cpuid, sizeof(*output))) {
+ r = -EFAULT;
+ goto out;
+ }
+
+ if (r == -E2BIG)
+ goto out;
+
+ if (copy_to_user(output->entries, td_cpuid->entries,
+ td_cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
+ r = -EFAULT;
+
+out:
+ kfree(td_cpuid);
+
+ return r;
+}
+
+static int tdx_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
+{
+ u64 apic_base;
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ int ret;
+
+ if (cmd->flags)
+ return -EINVAL;
+
+ if (tdx->state != VCPU_TD_STATE_UNINITIALIZED)
+ return -EINVAL;
+
+ /*
+	 * TDX requires x2APIC; userspace is responsible for configuring the
+	 * guest CPUID accordingly.
+ */
+ apic_base = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC |
+ (kvm_vcpu_is_reset_bsp(vcpu) ? MSR_IA32_APICBASE_BSP : 0);
+ if (kvm_apic_set_base(vcpu, apic_base, true))
+ return -EINVAL;
+
+ ret = tdx_td_vcpu_init(vcpu, (u64)cmd->data);
+ if (ret)
+ return ret;
+
+ td_vmcs_write16(tdx, POSTED_INTR_NV, POSTED_INTR_VECTOR);
+ td_vmcs_write64(tdx, POSTED_INTR_DESC_ADDR, __pa(&tdx->vt.pi_desc));
+ td_vmcs_setbit32(tdx, PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_POSTED_INTR);
+
+ tdx->state = VCPU_TD_STATE_INITIALIZED;
+
+ return 0;
+}
+
+void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
+{
+ /*
+ * Yell on INIT, as TDX doesn't support INIT, i.e. KVM should drop all
+ * INIT events.
+ *
+ * Defer initializing vCPU for RESET state until KVM_TDX_INIT_VCPU, as
+ * userspace needs to define the vCPU model before KVM can initialize
+ * vCPU state, e.g. to enable x2APIC.
+ */
+ WARN_ON_ONCE(init_event);
+}
+
+struct tdx_gmem_post_populate_arg {
+ struct kvm_vcpu *vcpu;
+ __u32 flags;
+};
+
+static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
+ void __user *src, int order, void *_arg)
+{
+ struct tdx_gmem_post_populate_arg *arg = _arg;
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ u64 err, entry, level_state;
+ gpa_t gpa = gfn_to_gpa(gfn);
+ struct page *src_page;
+ int ret, i;
+
+ if (KVM_BUG_ON(kvm_tdx->page_add_src, kvm))
+ return -EIO;
+
+ /*
+ * Get the source page if it has been faulted in. Return failure if the
+ * source page has been swapped out or unmapped in primary memory.
+ */
+ ret = get_user_pages_fast((unsigned long)src, 1, 0, &src_page);
+ if (ret < 0)
+ return ret;
+ if (ret != 1)
+ return -ENOMEM;
+
+ kvm_tdx->page_add_src = src_page;
+ ret = kvm_tdp_mmu_map_private_pfn(arg->vcpu, gfn, pfn);
+ kvm_tdx->page_add_src = NULL;
+
+ put_page(src_page);
+
+ if (ret || !(arg->flags & KVM_TDX_MEASURE_MEMORY_REGION))
+ return ret;
+
+ /*
+ * Note, MR.EXTEND can fail if the S-EPT mapping is somehow removed
+ * between mapping the pfn and now, but slots_lock prevents memslot
+ * updates, filemap_invalidate_lock() prevents guest_memfd updates,
+ * mmu_notifier events can't reach S-EPT entries, and KVM's internal
+ * zapping flows are mutually exclusive with S-EPT mappings.
+ */
+ for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) {
+ err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry, &level_state);
+ if (TDX_BUG_ON_2(err, TDH_MR_EXTEND, entry, level_state, kvm))
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ struct kvm_tdx_init_mem_region region;
+ struct tdx_gmem_post_populate_arg arg;
+ long gmem_ret;
+ int ret;
+
+ if (tdx->state != VCPU_TD_STATE_INITIALIZED)
+ return -EINVAL;
+
+ /* Once TD is finalized, the initial guest memory is fixed. */
+ if (kvm_tdx->state == TD_STATE_RUNNABLE)
+ return -EINVAL;
+
+ if (cmd->flags & ~KVM_TDX_MEASURE_MEMORY_REGION)
+ return -EINVAL;
+
+ if (copy_from_user(&region, u64_to_user_ptr(cmd->data), sizeof(region)))
+ return -EFAULT;
+
+ if (!PAGE_ALIGNED(region.source_addr) || !PAGE_ALIGNED(region.gpa) ||
+ !region.nr_pages ||
+ region.gpa + (region.nr_pages << PAGE_SHIFT) <= region.gpa ||
+ !vt_is_tdx_private_gpa(kvm, region.gpa) ||
+ !vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1))
+ return -EINVAL;
+
+ ret = 0;
+ while (region.nr_pages) {
+ if (signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+
+ arg = (struct tdx_gmem_post_populate_arg) {
+ .vcpu = vcpu,
+ .flags = cmd->flags,
+ };
+ gmem_ret = kvm_gmem_populate(kvm, gpa_to_gfn(region.gpa),
+ u64_to_user_ptr(region.source_addr),
+ 1, tdx_gmem_post_populate, &arg);
+ if (gmem_ret < 0) {
+ ret = gmem_ret;
+ break;
+ }
+
+ if (gmem_ret != 1) {
+ ret = -EIO;
+ break;
+ }
+
+ region.source_addr += PAGE_SIZE;
+ region.gpa += PAGE_SIZE;
+ region.nr_pages--;
+
+ cond_resched();
+ }
+
+ if (copy_to_user(u64_to_user_ptr(cmd->data), &region, sizeof(region)))
+ ret = -EFAULT;
+ return ret;
+}
+
+int tdx_vcpu_unlocked_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ struct kvm_tdx_cmd cmd;
+ int r;
+
+ r = tdx_get_cmd(argp, &cmd);
+ if (r)
+ return r;
+
+ CLASS(tdx_vm_state_guard, guard)(kvm);
+ if (IS_ERR(guard))
+ return PTR_ERR(guard);
+
+ if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
+ return -EINVAL;
+
+ vcpu_load(vcpu);
+
+ switch (cmd.id) {
+ case KVM_TDX_INIT_MEM_REGION:
+ r = tdx_vcpu_init_mem_region(vcpu, &cmd);
+ break;
+ case KVM_TDX_INIT_VCPU:
+ r = tdx_vcpu_init(vcpu, &cmd);
+ break;
+ default:
+ r = -ENOIOCTLCMD;
+ break;
+ }
+
+ vcpu_put(vcpu);
+
+ return r;
+}
+
+int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+ struct kvm_tdx_cmd cmd;
+ int ret;
+
+ if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
+ return -EINVAL;
+
+ ret = tdx_get_cmd(argp, &cmd);
+ if (ret)
+ return ret;
+
+ switch (cmd.id) {
+ case KVM_TDX_GET_CPUID:
+ ret = tdx_vcpu_get_cpuid(vcpu, &cmd);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+int tdx_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private)
+{
+ if (!is_private)
+ return 0;
+
+ return PG_LEVEL_4K;
+}
+
+static int tdx_online_cpu(unsigned int cpu)
+{
+ unsigned long flags;
+ int r;
+
+ /* Sanity check CPU is already in post-VMXON */
+ WARN_ON_ONCE(!(cr4_read_shadow() & X86_CR4_VMXE));
+
+ local_irq_save(flags);
+ r = tdx_cpu_enable();
+ local_irq_restore(flags);
+
+ return r;
+}
+
+static int tdx_offline_cpu(unsigned int cpu)
+{
+ int i;
+
+ /* No TD is running. Allow any cpu to be offline. */
+ if (!atomic_read(&nr_configured_hkid))
+ return 0;
+
+ /*
+	 * Reclaiming a TDX HKID (i.e. when deleting a guest TD) requires calling
+	 * TDH.PHYMEM.PAGE.WBINVD on all packages to program all memory controllers
+	 * with PCONFIG. If there are active TDX HKIDs, refuse to offline the last
+	 * online CPU of a package.
+ */
+ for_each_online_cpu(i) {
+ /*
+		 * Another CPU on the same package is still online;
+		 * allow this one to be offlined.
+ */
+ if (i != cpu && topology_physical_package_id(i) ==
+ topology_physical_package_id(cpu))
+ return 0;
+ }
+
+ /*
+	 * This is the last online CPU of this package. Don't offline it.
+	 *
+	 * Because the reason is hard for a human operator to deduce, print a
+	 * warning.
+ */
+#define MSG_ALLPKG_ONLINE \
+ "TDX requires all packages to have an online CPU. Delete all TDs in order to offline all CPUs of a package.\n"
+ pr_warn_ratelimited(MSG_ALLPKG_ONLINE);
+ return -EBUSY;
+}
+
+static void __do_tdx_cleanup(void)
+{
+ /*
+	 * Once the TDX module is initialized, it cannot be disabled and
+	 * re-initialized again without a runtime update (which isn't
+	 * supported by the kernel). Only the cpuhp state needs to be
+	 * removed here. The TDX host core code tracks TDX status and
+	 * can handle the 'multiple enabling' scenario.
+ */
+ WARN_ON_ONCE(!tdx_cpuhp_state);
+ cpuhp_remove_state_nocalls_cpuslocked(tdx_cpuhp_state);
+ tdx_cpuhp_state = 0;
+}
+
+static void __tdx_cleanup(void)
+{
+ cpus_read_lock();
+ __do_tdx_cleanup();
+ cpus_read_unlock();
+}
+
+static int __init __do_tdx_bringup(void)
+{
+ int r;
+
+ /*
+ * TDX-specific cpuhp callback to call tdx_cpu_enable() on all
+ * online CPUs before calling tdx_enable(), and on any new
+ * going-online CPU to make sure it is ready for TDX guest.
+ */
+ r = cpuhp_setup_state_cpuslocked(CPUHP_AP_ONLINE_DYN,
+ "kvm/cpu/tdx:online",
+ tdx_online_cpu, tdx_offline_cpu);
+ if (r < 0)
+ return r;
+
+ tdx_cpuhp_state = r;
+
+ r = tdx_enable();
+ if (r)
+ __do_tdx_cleanup();
+
+ return r;
+}
+
+static int __init __tdx_bringup(void)
+{
+ const struct tdx_sys_info_td_conf *td_conf;
+ int r, i;
+
+ for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) {
+ /*
+ * Check if MSRs (tdx_uret_msrs) can be saved/restored
+ * before returning to user space.
+ */
+ tdx_uret_msrs[i].slot = kvm_find_user_return_msr(tdx_uret_msrs[i].msr);
+ if (tdx_uret_msrs[i].slot == -1) {
+ /* If any MSR isn't supported, it is a KVM bug */
+ pr_err("MSR %x isn't included by kvm_find_user_return_msr\n",
+ tdx_uret_msrs[i].msr);
+ return -EIO;
+ }
+ }
+
+ /*
+ * Enabling TDX requires enabling hardware virtualization first,
+	 * as making SEAMCALLs requires the CPU to be in post-VMXON state.
+ */
+ r = kvm_enable_virtualization();
+ if (r)
+ return r;
+
+ cpus_read_lock();
+ r = __do_tdx_bringup();
+ cpus_read_unlock();
+
+ if (r)
+ goto tdx_bringup_err;
+
+ r = -EINVAL;
+ /* Get TDX global information for later use */
+ tdx_sysinfo = tdx_get_sysinfo();
+ if (WARN_ON_ONCE(!tdx_sysinfo))
+ goto get_sysinfo_err;
+
+ /* Check TDX module and KVM capabilities */
+ if (!tdx_get_supported_attrs(&tdx_sysinfo->td_conf) ||
+ !tdx_get_supported_xfam(&tdx_sysinfo->td_conf))
+ goto get_sysinfo_err;
+
+ if (!(tdx_sysinfo->features.tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM))
+ goto get_sysinfo_err;
+
+ /*
+ * TDX has its own limit of maximum vCPUs it can support for all
+ * TDX guests in addition to KVM_MAX_VCPUS. Userspace needs to
+	 * query a TDX guest's maximum vCPUs by checking the KVM_CAP_MAX_VCPUS
+	 * extension on a per-VM basis.
+ *
+	 * The TDX module reports this limit via the MAX_VCPU_PER_TD global
+	 * metadata field. Different modules may report different values.
+	 * Some old modules may not support this metadata at all (in which
+	 * case the limit is U16_MAX).
+ *
+ * In practice, the reported value reflects the maximum logical
+ * CPUs that ALL the platforms that the module supports can
+ * possibly have.
+ *
+ * Simply forwarding the MAX_VCPU_PER_TD to userspace could
+	 * result in an unpredictable ABI. KVM instead always advertises
+ * the number of logical CPUs the platform has as the maximum
+ * vCPUs for TDX guests.
+ *
+ * Make sure MAX_VCPU_PER_TD reported by TDX module is not
+ * smaller than the number of logical CPUs, otherwise KVM will
+ * report an unsupported value to userspace.
+ *
+ * Note, a platform with TDX enabled in the BIOS cannot support
+	 * physical CPU hotplug, and TDX requires that the BIOS has marked
+	 * all logical CPUs in the MADT table as enabled. Just use
+ * num_present_cpus() for the number of logical CPUs.
+ */
+ td_conf = &tdx_sysinfo->td_conf;
+ if (td_conf->max_vcpus_per_td < num_present_cpus()) {
+ pr_err("Disable TDX: MAX_VCPU_PER_TD (%u) smaller than number of logical CPUs (%u).\n",
+ td_conf->max_vcpus_per_td, num_present_cpus());
+ goto get_sysinfo_err;
+ }
+
+ if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids()))
+ goto get_sysinfo_err;
+
+ /*
+ * Leave hardware virtualization enabled after TDX is enabled
+ * successfully. TDX CPU hotplug depends on this.
+ */
+ return 0;
+
+get_sysinfo_err:
+ __tdx_cleanup();
+tdx_bringup_err:
+ kvm_disable_virtualization();
+ return r;
+}
+
+void tdx_cleanup(void)
+{
+ if (enable_tdx) {
+ misc_cg_set_capacity(MISC_CG_RES_TDX, 0);
+ __tdx_cleanup();
+ kvm_disable_virtualization();
+ }
+}
+
+int __init tdx_bringup(void)
+{
+ int r, i;
+
+ /* tdx_disable_virtualization_cpu() uses associated_tdvcpus. */
+ for_each_possible_cpu(i)
+ INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, i));
+
+ if (!enable_tdx)
+ return 0;
+
+ if (!enable_ept) {
+ pr_err("EPT is required for TDX\n");
+ goto success_disable_tdx;
+ }
+
+ if (!tdp_mmu_enabled || !enable_mmio_caching || !enable_ept_ad_bits) {
+		pr_err("TDP MMU, MMIO caching and EPT A/D bits are required for TDX\n");
+ goto success_disable_tdx;
+ }
+
+ if (!enable_apicv) {
+ pr_err("APICv is required for TDX\n");
+ goto success_disable_tdx;
+ }
+
+ if (!cpu_feature_enabled(X86_FEATURE_OSXSAVE)) {
+ pr_err("tdx: OSXSAVE is required for TDX\n");
+ goto success_disable_tdx;
+ }
+
+ if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) {
+ pr_err("tdx: MOVDIR64B is required for TDX\n");
+ goto success_disable_tdx;
+ }
+
+ if (!cpu_feature_enabled(X86_FEATURE_SELFSNOOP)) {
+ pr_err("Self-snoop is required for TDX\n");
+ goto success_disable_tdx;
+ }
+
+ if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) {
+ pr_err("tdx: no TDX private KeyIDs available\n");
+ goto success_disable_tdx;
+ }
+
+ if (!enable_virt_at_load) {
+		pr_err("tdx: TDX requires kvm.enable_virt_at_load=1\n");
+ goto success_disable_tdx;
+ }
+
+ /*
+	 * Ideally KVM should probe whether the TDX module has been loaded
+	 * first and then try to bring it up. But TDX needs to use a SEAMCALL
+	 * to probe whether the module is loaded (there is no CPUID or MSR
+	 * for that), and making a SEAMCALL requires enabling virtualization
+	 * first, just like the remaining steps of bringing up the TDX module.
+ *
+ * So, for simplicity do everything in __tdx_bringup(); the first
+ * SEAMCALL will return -ENODEV when the module is not loaded. The
+ * only complication is having to make sure that initialization
+ * SEAMCALLs don't return TDX_SEAMCALL_VMFAILINVALID in other
+ * cases.
+ */
+ r = __tdx_bringup();
+ if (r) {
+ /*
+		 * Disable TDX but don't fail loading the KVM module if the TDX
+		 * module could not be loaded. There is no need to print a message
+		 * saying "module is not loaded" because it was printed when the first
+ * SEAMCALL failed. Don't bother unwinding the S-EPT hooks or
+ * vm_size, as kvm_x86_ops have already been finalized (and are
+ * intentionally not exported). The S-EPT code is unreachable,
+ * and allocating a few more bytes per VM in a should-be-rare
+ * failure scenario is a non-issue.
+ */
+ if (r == -ENODEV)
+ goto success_disable_tdx;
+
+ enable_tdx = 0;
+ }
+
+ return r;
+
+success_disable_tdx:
+ enable_tdx = 0;
+ return 0;
+}
+
+void __init tdx_hardware_setup(void)
+{
+ KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_tdx);
+
+ /*
+ * Note, if the TDX module can't be loaded, KVM TDX support will be
+ * disabled but KVM will continue loading (see tdx_bringup()).
+ */
+ vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size, sizeof(struct kvm_tdx));
+
+ vt_x86_ops.link_external_spt = tdx_sept_link_private_spt;
+ vt_x86_ops.set_external_spte = tdx_sept_set_private_spte;
+ vt_x86_ops.free_external_spt = tdx_sept_free_private_spt;
+ vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte;
+ vt_x86_ops.protected_apic_has_interrupt = tdx_protected_apic_has_interrupt;
+}
diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
new file mode 100644
index 000000000000..45b5183ccb36
--- /dev/null
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -0,0 +1,208 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMX_TDX_H
+#define __KVM_X86_VMX_TDX_H
+
+#include "tdx_arch.h"
+#include "tdx_errno.h"
+
+#ifdef CONFIG_KVM_INTEL_TDX
+#include "common.h"
+
+void tdx_hardware_setup(void);
+int tdx_bringup(void);
+void tdx_cleanup(void);
+
+extern bool enable_tdx;
+
+/* TDX module hardware states. These follow the TDX module OP_STATEs. */
+enum kvm_tdx_state {
+ TD_STATE_UNINITIALIZED = 0,
+ TD_STATE_INITIALIZED,
+ TD_STATE_RUNNABLE,
+};
+
+struct kvm_tdx {
+ struct kvm kvm;
+
+ struct misc_cg *misc_cg;
+ int hkid;
+ enum kvm_tdx_state state;
+
+ u64 attributes;
+ u64 xfam;
+
+ u64 tsc_offset;
+ u64 tsc_multiplier;
+
+ struct tdx_td td;
+
+ /*
+ * Scratch pointer used to pass the source page to tdx_mem_page_add().
+ * Protected by slots_lock, and non-NULL only when mapping a private
+ * pfn via tdx_gmem_post_populate().
+ */
+ struct page *page_add_src;
+
+ /*
+	 * Prevent vCPUs from entering the TD to ensure that SEPT-zap-related
+	 * SEAMCALLs do not contend with tdh_vp_enter() and TDCALLs.
+	 * Set/cleared under kvm->mmu_lock.
+ */
+ bool wait_for_sept_zap;
+};
+
+/* TDX module vCPU states */
+enum vcpu_tdx_state {
+ VCPU_TD_STATE_UNINITIALIZED = 0,
+ VCPU_TD_STATE_INITIALIZED,
+};
+
+struct vcpu_tdx {
+ struct kvm_vcpu vcpu;
+ struct vcpu_vt vt;
+ u64 ext_exit_qualification;
+ gpa_t exit_gpa;
+ struct tdx_module_args vp_enter_args;
+
+ struct tdx_vp vp;
+
+ struct list_head cpu_list;
+
+ u64 vp_enter_ret;
+
+ enum vcpu_tdx_state state;
+
+ u64 map_gpa_next;
+ u64 map_gpa_end;
+};
+
+void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err);
+void tdh_vp_wr_failed(struct vcpu_tdx *tdx, char *uclass, char *op, u32 field,
+ u64 val, u64 err);
+
+static __always_inline u64 td_tdcs_exec_read64(struct kvm_tdx *kvm_tdx, u32 field)
+{
+ u64 err, data;
+
+ err = tdh_mng_rd(&kvm_tdx->td, TDCS_EXEC(field), &data);
+ if (unlikely(err)) {
+ pr_err("TDH_MNG_RD[EXEC.0x%x] failed: 0x%llx\n", field, err);
+ return 0;
+ }
+ return data;
+}
+
+static __always_inline void tdvps_vmcs_check(u32 field, u8 bits)
+{
+#define VMCS_ENC_ACCESS_TYPE_MASK 0x1UL
+#define VMCS_ENC_ACCESS_TYPE_FULL 0x0UL
+#define VMCS_ENC_ACCESS_TYPE_HIGH 0x1UL
+#define VMCS_ENC_ACCESS_TYPE(field) ((field) & VMCS_ENC_ACCESS_TYPE_MASK)
+
+ /* TDX is 64bit only. HIGH field isn't supported. */
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) &&
+ VMCS_ENC_ACCESS_TYPE(field) == VMCS_ENC_ACCESS_TYPE_HIGH,
+ "Read/Write to TD VMCS *_HIGH fields not supported");
+
+ BUILD_BUG_ON(bits != 16 && bits != 32 && bits != 64);
+
+#define VMCS_ENC_WIDTH_MASK GENMASK(14, 13)
+#define VMCS_ENC_WIDTH_16BIT (0UL << 13)
+#define VMCS_ENC_WIDTH_64BIT (1UL << 13)
+#define VMCS_ENC_WIDTH_32BIT (2UL << 13)
+#define VMCS_ENC_WIDTH_NATURAL (3UL << 13)
+#define VMCS_ENC_WIDTH(field) ((field) & VMCS_ENC_WIDTH_MASK)
+
+ /* TDX is 64bit only. i.e. natural width = 64bit. */
+ BUILD_BUG_ON_MSG(bits != 64 && __builtin_constant_p(field) &&
+ (VMCS_ENC_WIDTH(field) == VMCS_ENC_WIDTH_64BIT ||
+ VMCS_ENC_WIDTH(field) == VMCS_ENC_WIDTH_NATURAL),
+ "Invalid TD VMCS access for 64-bit field");
+ BUILD_BUG_ON_MSG(bits != 32 && __builtin_constant_p(field) &&
+ VMCS_ENC_WIDTH(field) == VMCS_ENC_WIDTH_32BIT,
+ "Invalid TD VMCS access for 32-bit field");
+ BUILD_BUG_ON_MSG(bits != 16 && __builtin_constant_p(field) &&
+ VMCS_ENC_WIDTH(field) == VMCS_ENC_WIDTH_16BIT,
+ "Invalid TD VMCS access for 16-bit field");
+}
+
+static __always_inline void tdvps_management_check(u64 field, u8 bits) {}
+static __always_inline void tdvps_state_non_arch_check(u64 field, u8 bits) {}
+
+#define TDX_BUILD_TDVPS_ACCESSORS(bits, uclass, lclass) \
+static __always_inline u##bits td_##lclass##_read##bits(struct vcpu_tdx *tdx, \
+ u32 field) \
+{ \
+ u64 err, data; \
+ \
+ tdvps_##lclass##_check(field, bits); \
+ err = tdh_vp_rd(&tdx->vp, TDVPS_##uclass(field), &data); \
+ if (unlikely(err)) { \
+ tdh_vp_rd_failed(tdx, #uclass, field, err); \
+ return 0; \
+ } \
+ return (u##bits)data; \
+} \
+static __always_inline void td_##lclass##_write##bits(struct vcpu_tdx *tdx, \
+ u32 field, u##bits val) \
+{ \
+ u64 err; \
+ \
+ tdvps_##lclass##_check(field, bits); \
+ err = tdh_vp_wr(&tdx->vp, TDVPS_##uclass(field), val, \
+ GENMASK_ULL(bits - 1, 0)); \
+ if (unlikely(err)) \
+ tdh_vp_wr_failed(tdx, #uclass, " = ", field, (u64)val, err); \
+} \
+static __always_inline void td_##lclass##_setbit##bits(struct vcpu_tdx *tdx, \
+ u32 field, u64 bit) \
+{ \
+ u64 err; \
+ \
+ tdvps_##lclass##_check(field, bits); \
+ err = tdh_vp_wr(&tdx->vp, TDVPS_##uclass(field), bit, bit); \
+ if (unlikely(err)) \
+ tdh_vp_wr_failed(tdx, #uclass, " |= ", field, bit, err); \
+} \
+static __always_inline void td_##lclass##_clearbit##bits(struct vcpu_tdx *tdx, \
+ u32 field, u64 bit) \
+{ \
+ u64 err; \
+ \
+ tdvps_##lclass##_check(field, bits); \
+ err = tdh_vp_wr(&tdx->vp, TDVPS_##uclass(field), 0, bit); \
+ if (unlikely(err)) \
+ tdh_vp_wr_failed(tdx, #uclass, " &= ~", field, bit, err);\
+}
+
+bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu);
+int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err);
+
+TDX_BUILD_TDVPS_ACCESSORS(16, VMCS, vmcs);
+TDX_BUILD_TDVPS_ACCESSORS(32, VMCS, vmcs);
+TDX_BUILD_TDVPS_ACCESSORS(64, VMCS, vmcs);
+
+TDX_BUILD_TDVPS_ACCESSORS(8, MANAGEMENT, management);
+TDX_BUILD_TDVPS_ACCESSORS(64, STATE_NON_ARCH, state_non_arch);
+
+#else
+static inline int tdx_bringup(void) { return 0; }
+static inline void tdx_cleanup(void) {}
+
+#define enable_tdx 0
+
+struct kvm_tdx {
+ struct kvm kvm;
+};
+
+struct vcpu_tdx {
+ struct kvm_vcpu vcpu;
+};
+
+static inline bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu) { return false; }
+static inline int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) { return 0; }
+
+#endif
+
+#endif
diff --git a/arch/x86/kvm/vmx/tdx_arch.h b/arch/x86/kvm/vmx/tdx_arch.h
new file mode 100644
index 000000000000..a30e880849e3
--- /dev/null
+++ b/arch/x86/kvm/vmx/tdx_arch.h
@@ -0,0 +1,167 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* architectural constants/data definitions for TDX SEAMCALLs */
+
+#ifndef __KVM_X86_TDX_ARCH_H
+#define __KVM_X86_TDX_ARCH_H
+
+#include <linux/types.h>
+
+/* TDX control structure (TDR/TDCS/TDVPS) field access codes */
+#define TDX_NON_ARCH BIT_ULL(63)
+#define TDX_CLASS_SHIFT 56
+#define TDX_FIELD_MASK GENMASK_ULL(31, 0)
+
+#define __BUILD_TDX_FIELD(non_arch, class, field) \
+ (((non_arch) ? TDX_NON_ARCH : 0) | \
+ ((u64)(class) << TDX_CLASS_SHIFT) | \
+ ((u64)(field) & TDX_FIELD_MASK))
+
+#define BUILD_TDX_FIELD(class, field) \
+ __BUILD_TDX_FIELD(false, (class), (field))
+
+#define BUILD_TDX_FIELD_NON_ARCH(class, field) \
+ __BUILD_TDX_FIELD(true, (class), (field))
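
Expanded by hand, the helpers above simply combine a class code and a 32-bit field number, with bit 63 flagging non-architectural fields. A minimal standalone sketch mirroring the macros above (illustrative only):

#include <stdint.h>

static uint64_t build_tdx_field(int non_arch, uint64_t class, uint64_t field)
{
	return (non_arch ? (1ULL << 63) : 0) |	/* TDX_NON_ARCH */
	       (class << 56) |			/* TDX_CLASS_SHIFT */
	       (field & 0xffffffffULL);		/* TDX_FIELD_MASK */
}

/*
 * E.g. TDCS_EXEC(TD_TDCS_EXEC_TSC_OFFSET) is class 17, field 10:
 * build_tdx_field(0, 17, 10) == 0x110000000000000aULL.
 */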
+
+/* Class code for TD */
+#define TD_CLASS_EXECUTION_CONTROLS 17ULL
+
+/* Class code for TDVPS */
+#define TDVPS_CLASS_VMCS 0ULL
+#define TDVPS_CLASS_GUEST_GPR 16ULL
+#define TDVPS_CLASS_OTHER_GUEST 17ULL
+#define TDVPS_CLASS_MANAGEMENT 32ULL
+
+enum tdx_tdcs_execution_control {
+ TD_TDCS_EXEC_TSC_OFFSET = 10,
+ TD_TDCS_EXEC_TSC_MULTIPLIER = 11,
+};
+
+enum tdx_vcpu_guest_other_state {
+ TD_VCPU_STATE_DETAILS_NON_ARCH = 0x100,
+};
+
+#define TDX_VCPU_STATE_DETAILS_INTR_PENDING BIT_ULL(0)
+
+static inline bool tdx_vcpu_state_details_intr_pending(u64 vcpu_state_details)
+{
+ return !!(vcpu_state_details & TDX_VCPU_STATE_DETAILS_INTR_PENDING);
+}
+
+/* @field is any of enum tdx_tdcs_execution_control */
+#define TDCS_EXEC(field) BUILD_TDX_FIELD(TD_CLASS_EXECUTION_CONTROLS, (field))
+
+/* @field is the VMCS field encoding */
+#define TDVPS_VMCS(field) BUILD_TDX_FIELD(TDVPS_CLASS_VMCS, (field))
+
+/* @field is any of enum tdx_guest_other_state */
+#define TDVPS_STATE(field) BUILD_TDX_FIELD(TDVPS_CLASS_OTHER_GUEST, (field))
+#define TDVPS_STATE_NON_ARCH(field) BUILD_TDX_FIELD_NON_ARCH(TDVPS_CLASS_OTHER_GUEST, (field))
+
+/* Management class fields */
+enum tdx_vcpu_guest_management {
+ TD_VCPU_PEND_NMI = 11,
+};
+
+/* @field is any of enum tdx_vcpu_guest_management */
+#define TDVPS_MANAGEMENT(field) BUILD_TDX_FIELD(TDVPS_CLASS_MANAGEMENT, (field))
+
+#define TDX_EXTENDMR_CHUNKSIZE 256
+
+struct tdx_cpuid_value {
+ u32 eax;
+ u32 ebx;
+ u32 ecx;
+ u32 edx;
+} __packed;
+
+#define TDX_TD_ATTR_DEBUG BIT_ULL(0)
+#define TDX_TD_ATTR_SEPT_VE_DISABLE BIT_ULL(28)
+#define TDX_TD_ATTR_PKS BIT_ULL(30)
+#define TDX_TD_ATTR_KL BIT_ULL(31)
+#define TDX_TD_ATTR_PERFMON BIT_ULL(63)
+
+#define TDX_EXT_EXIT_QUAL_TYPE_MASK GENMASK(3, 0)
+#define TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION 6
+/*
+ * TD_PARAMS is provided as an input to TDH_MNG_INIT, the size of which is 1024B.
+ */
+struct td_params {
+ u64 attributes;
+ u64 xfam;
+ u16 max_vcpus;
+ u8 reserved0[6];
+
+ u64 eptp_controls;
+ u64 config_flags;
+ u16 tsc_frequency;
+ u8 reserved1[38];
+
+ u64 mrconfigid[6];
+ u64 mrowner[6];
+ u64 mrownerconfig[6];
+ u64 reserved2[4];
+
+ union {
+ DECLARE_FLEX_ARRAY(struct tdx_cpuid_value, cpuid_values);
+ u8 reserved3[768];
+ };
+} __packed __aligned(1024);
+
+/*
+ * Guest uses MAX_PA for GPAW when set.
+ * 0: GPA.SHARED bit is GPA[47]
+ * 1: GPA.SHARED bit is GPA[51]
+ */
+#define TDX_CONFIG_FLAGS_MAX_GPAW BIT_ULL(0)
+
+/*
+ * TDH.VP.ENTER, TDG.VP.VMCALL preserves RBP
+ * 0: RBP can be used for TDG.VP.VMCALL input. RBP is clobbered.
+ * 1: RBP can't be used for TDG.VP.VMCALL input. RBP is preserved.
+ */
+#define TDX_CONFIG_FLAGS_NO_RBP_MOD BIT_ULL(2)
+
+/*
+ * TDX requires the frequency to be defined in units of 25MHz, which is the
+ * frequency of the core crystal clock on TDX-capable platforms, i.e. the TDX
+ * module can only program frequencies that are multiples of 25MHz. The
+ * frequency must be between 100MHz and 10GHz (inclusive).
+ */
+#define TDX_TSC_KHZ_TO_25MHZ(tsc_in_khz) ((tsc_in_khz) / (25 * 1000))
+#define TDX_TSC_25MHZ_TO_KHZ(tsc_in_25mhz) ((tsc_in_25mhz) * (25 * 1000))
+#define TDX_MIN_TSC_FREQUENCY_KHZ (100 * 1000)
+#define TDX_MAX_TSC_FREQUENCY_KHZ (10 * 1000 * 1000)
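
A quick worked example of the conversion above (standalone sketch): a host TSC frequency of 2,500,000 kHz (2.5 GHz) is 100 units of 25 MHz, and converting back recovers the same kHz value; frequencies that are not multiples of 25 MHz lose the remainder.

#include <stdio.h>

#define TSC_KHZ_TO_25MHZ(khz)	((khz) / (25 * 1000))
#define TSC_25MHZ_TO_KHZ(units)	((units) * (25 * 1000))

int main(void)
{
	unsigned long tsc_khz = 2500000;			/* 2.5 GHz host TSC */
	unsigned long units = TSC_KHZ_TO_25MHZ(tsc_khz);	/* 100 */

	printf("%lu units -> %lu kHz\n", units, TSC_25MHZ_TO_KHZ(units));
	return 0;
}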
+
+/* Additional Secure EPT entry information */
+#define TDX_SEPT_LEVEL_MASK GENMASK_ULL(2, 0)
+#define TDX_SEPT_STATE_MASK GENMASK_ULL(15, 8)
+#define TDX_SEPT_STATE_SHIFT 8
+
+enum tdx_sept_entry_state {
+ TDX_SEPT_FREE = 0,
+ TDX_SEPT_BLOCKED = 1,
+ TDX_SEPT_PENDING = 2,
+ TDX_SEPT_PENDING_BLOCKED = 3,
+ TDX_SEPT_PRESENT = 4,
+};
+
+static inline u8 tdx_get_sept_level(u64 sept_entry_info)
+{
+ return sept_entry_info & TDX_SEPT_LEVEL_MASK;
+}
+
+static inline u8 tdx_get_sept_state(u64 sept_entry_info)
+{
+ return (sept_entry_info & TDX_SEPT_STATE_MASK) >> TDX_SEPT_STATE_SHIFT;
+}
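
A minimal standalone sketch of decoding the Secure EPT entry info per the masks above (the example value is hypothetical):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t info = 0x0400;			/* hypothetical: state 4 (PRESENT), level 0 (4K) */
	uint64_t level = info & 0x7;			/* TDX_SEPT_LEVEL_MASK */
	uint64_t state = (info & 0xff00) >> 8;		/* TDX_SEPT_STATE_MASK/SHIFT */

	printf("level=%llu state=%llu\n",
	       (unsigned long long)level, (unsigned long long)state);
	return 0;
}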
+
+#define MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM BIT_ULL(20)
+
+/*
+ * TD scope metadata field ID.
+ */
+#define TD_MD_FIELD_ID_CPUID_VALUES 0x9410000300000000ULL
+
+#endif /* __KVM_X86_TDX_ARCH_H */
diff --git a/arch/x86/kvm/vmx/tdx_errno.h b/arch/x86/kvm/vmx/tdx_errno.h
new file mode 100644
index 000000000000..6ff4672c4181
--- /dev/null
+++ b/arch/x86/kvm/vmx/tdx_errno.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* architectural status code for SEAMCALL */
+
+#ifndef __KVM_X86_TDX_ERRNO_H
+#define __KVM_X86_TDX_ERRNO_H
+
+#define TDX_SEAMCALL_STATUS_MASK 0xFFFFFFFF00000000ULL
+
+/*
+ * TDX SEAMCALL Status Codes (returned in RAX)
+ */
+#define TDX_NON_RECOVERABLE_VCPU 0x4000000100000000ULL
+#define TDX_NON_RECOVERABLE_TD 0x4000000200000000ULL
+#define TDX_NON_RECOVERABLE_TD_NON_ACCESSIBLE 0x6000000500000000ULL
+#define TDX_NON_RECOVERABLE_TD_WRONG_APIC_MODE 0x6000000700000000ULL
+#define TDX_INTERRUPTED_RESUMABLE 0x8000000300000000ULL
+#define TDX_OPERAND_INVALID 0xC000010000000000ULL
+#define TDX_OPERAND_BUSY 0x8000020000000000ULL
+#define TDX_PREVIOUS_TLB_EPOCH_BUSY 0x8000020100000000ULL
+#define TDX_PAGE_METADATA_INCORRECT 0xC000030000000000ULL
+#define TDX_VCPU_NOT_ASSOCIATED 0x8000070200000000ULL
+#define TDX_KEY_GENERATION_FAILED 0x8000080000000000ULL
+#define TDX_KEY_STATE_INCORRECT 0xC000081100000000ULL
+#define TDX_KEY_CONFIGURED 0x0000081500000000ULL
+#define TDX_NO_HKID_READY_TO_WBCACHE 0x0000082100000000ULL
+#define TDX_FLUSHVP_NOT_DONE 0x8000082400000000ULL
+#define TDX_EPT_WALK_FAILED 0xC0000B0000000000ULL
+#define TDX_EPT_ENTRY_STATE_INCORRECT 0xC0000B0D00000000ULL
+#define TDX_METADATA_FIELD_NOT_READABLE 0xC0000C0200000000ULL
+
+/*
+ * TDX module operand ID, appears in 31:0 part of error code as
+ * detail information
+ */
+#define TDX_OPERAND_ID_RCX 0x01
+#define TDX_OPERAND_ID_TDR 0x80
+#define TDX_OPERAND_ID_SEPT 0x92
+#define TDX_OPERAND_ID_TD_EPOCH 0xa9
+
+#endif /* __KVM_X86_TDX_ERRNO_H */
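
The status code proper lives in the upper 32 bits of the SEAMCALL return value, with the operand ID in the lower bits; this is why callers such as tdx_td_finalize() compare against TDX_OPERAND_BUSY only after masking. A minimal standalone sketch of the split (the helper name is illustrative):

#include <stdbool.h>
#include <stdint.h>

#define SEAMCALL_STATUS_MASK	0xFFFFFFFF00000000ULL
#define OPERAND_BUSY		0x8000020000000000ULL

static bool err_is_operand_busy(uint64_t err)
{
	return (err & SEAMCALL_STATUS_MASK) == OPERAND_BUSY;
}

/*
 * E.g. 0x8000020000000092ULL is OPERAND_BUSY with operand ID 0x92 (SEPT):
 * err_is_operand_busy() returns true, and the low 32 bits identify the
 * contended operand.
 */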
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
new file mode 100644
index 000000000000..b25625314658
--- /dev/null
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -0,0 +1,198 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMX_VMCS_H
+#define __KVM_X86_VMX_VMCS_H
+
+#include <linux/ktime.h>
+#include <linux/list.h>
+#include <linux/nospec.h>
+
+#include <asm/kvm.h>
+#include <asm/vmx.h>
+
+#include "capabilities.h"
+
+#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
+
+struct vmcs_hdr {
+ u32 revision_id:31;
+ u32 shadow_vmcs:1;
+};
+
+struct vmcs {
+ struct vmcs_hdr hdr;
+ u32 abort;
+ char data[];
+};
+
+DECLARE_PER_CPU(struct vmcs *, current_vmcs);
+
+/*
+ * vmcs_host_state tracks registers that are loaded from the VMCS on VMEXIT
+ * and whose values change infrequently, but are not constant. I.e. this is
+ * used as a write-through cache of the corresponding VMCS fields.
+ */
+struct vmcs_host_state {
+ unsigned long cr3; /* May not match real cr3 */
+ unsigned long cr4; /* May not match real cr4 */
+ unsigned long gs_base;
+ unsigned long fs_base;
+ unsigned long rsp;
+
+ u16 fs_sel, gs_sel, ldt_sel;
+#ifdef CONFIG_X86_64
+ u16 ds_sel, es_sel;
+#endif
+};
+
+struct vmcs_controls_shadow {
+ u32 vm_entry;
+ u32 vm_exit;
+ u32 pin;
+ u32 exec;
+ u32 secondary_exec;
+ u64 tertiary_exec;
+};
+
+/*
+ * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
+ * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
+ * loaded on this CPU (so we can clear them if the CPU goes down).
+ */
+struct loaded_vmcs {
+ struct vmcs *vmcs;
+ struct vmcs *shadow_vmcs;
+ int cpu;
+ bool launched;
+ bool nmi_known_unmasked;
+ bool hv_timer_soft_disabled;
+ /* Support for vnmi-less CPUs */
+ int soft_vnmi_blocked;
+ ktime_t entry_time;
+ s64 vnmi_blocked_time;
+ unsigned long *msr_bitmap;
+ struct list_head loaded_vmcss_on_cpu_link;
+ struct vmcs_host_state host_state;
+ struct vmcs_controls_shadow controls_shadow;
+};
+
+static __always_inline bool is_intr_type(u32 intr_info, u32 type)
+{
+ const u32 mask = INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK;
+
+ return (intr_info & mask) == (INTR_INFO_VALID_MASK | type);
+}
+
+static inline bool is_intr_type_n(u32 intr_info, u32 type, u8 vector)
+{
+ const u32 mask = INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK |
+ INTR_INFO_VECTOR_MASK;
+
+ return (intr_info & mask) == (INTR_INFO_VALID_MASK | type | vector);
+}
+
+static inline bool is_exception_n(u32 intr_info, u8 vector)
+{
+ return is_intr_type_n(intr_info, INTR_TYPE_HARD_EXCEPTION, vector);
+}
+
+static inline bool is_debug(u32 intr_info)
+{
+ return is_exception_n(intr_info, DB_VECTOR);
+}
+
+static inline bool is_breakpoint(u32 intr_info)
+{
+ return is_exception_n(intr_info, BP_VECTOR);
+}
+
+static inline bool is_double_fault(u32 intr_info)
+{
+ return is_exception_n(intr_info, DF_VECTOR);
+}
+
+static inline bool is_page_fault(u32 intr_info)
+{
+ return is_exception_n(intr_info, PF_VECTOR);
+}
+
+static inline bool is_invalid_opcode(u32 intr_info)
+{
+ return is_exception_n(intr_info, UD_VECTOR);
+}
+
+static inline bool is_gp_fault(u32 intr_info)
+{
+ return is_exception_n(intr_info, GP_VECTOR);
+}
+
+static inline bool is_alignment_check(u32 intr_info)
+{
+ return is_exception_n(intr_info, AC_VECTOR);
+}
+
+static inline bool is_machine_check(u32 intr_info)
+{
+ return is_exception_n(intr_info, MC_VECTOR);
+}
+
+static inline bool is_nm_fault(u32 intr_info)
+{
+ return is_exception_n(intr_info, NM_VECTOR);
+}
+
+static inline bool is_ve_fault(u32 intr_info)
+{
+ return is_exception_n(intr_info, VE_VECTOR);
+}
+
+/* Undocumented: icebp/int1 */
+static inline bool is_icebp(u32 intr_info)
+{
+ return is_intr_type(intr_info, INTR_TYPE_PRIV_SW_EXCEPTION);
+}
+
+static __always_inline bool is_nmi(u32 intr_info)
+{
+ return is_intr_type(intr_info, INTR_TYPE_NMI_INTR);
+}
+
+static inline bool is_external_intr(u32 intr_info)
+{
+ return is_intr_type(intr_info, INTR_TYPE_EXT_INTR);
+}
+
+static inline bool is_exception_with_error_code(u32 intr_info)
+{
+ const u32 mask = INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK;
+
+ return (intr_info & mask) == mask;
+}
+
+enum vmcs_field_width {
+ VMCS_FIELD_WIDTH_U16 = 0,
+ VMCS_FIELD_WIDTH_U64 = 1,
+ VMCS_FIELD_WIDTH_U32 = 2,
+ VMCS_FIELD_WIDTH_NATURAL_WIDTH = 3
+};
+
+static inline int vmcs_field_width(unsigned long field)
+{
+ if (0x1 & field) /* the *_HIGH fields are all 32 bit */
+ return VMCS_FIELD_WIDTH_U32;
+ return (field >> 13) & 0x3;
+}
+
+static inline int vmcs_field_readonly(unsigned long field)
+{
+ return (((field >> 10) & 0x3) == 1);
+}
+
+#define VMCS_FIELD_INDEX_SHIFT (1)
+#define VMCS_FIELD_INDEX_MASK GENMASK(9, 1)
+
+static inline unsigned int vmcs_field_index(unsigned long field)
+{
+ return (field & VMCS_FIELD_INDEX_MASK) >> VMCS_FIELD_INDEX_SHIFT;
+}
+
+#endif /* __KVM_X86_VMX_VMCS_H */
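
As a standalone sketch of the decode helpers above, using the architectural encoding 0x4402 of VM_EXIT_REASON (a 32-bit, read-only field):

#include <stdio.h>

static int field_width(unsigned long field)
{
	if (field & 0x1)		/* *_HIGH fields are all 32 bit */
		return 2;		/* VMCS_FIELD_WIDTH_U32 */
	return (field >> 13) & 0x3;
}

static int field_readonly(unsigned long field)
{
	return ((field >> 10) & 0x3) == 1;
}

static unsigned int field_index(unsigned long field)
{
	return (field & 0x3fe) >> 1;	/* GENMASK(9, 1) */
}

int main(void)
{
	unsigned long vm_exit_reason = 0x4402;

	/* Prints width=2 (u32), ro=1, index=1. */
	printf("width=%d ro=%d index=%u\n", field_width(vm_exit_reason),
	       field_readonly(vm_exit_reason), field_index(vm_exit_reason));
	return 0;
}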
diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c
new file mode 100644
index 000000000000..4233b5ca9461
--- /dev/null
+++ b/arch/x86/kvm/vmx/vmcs12.c
@@ -0,0 +1,161 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include "vmcs12.h"
+
+#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
+#define FIELD(number, name) [ROL16(number, 6)] = VMCS12_OFFSET(name)
+#define FIELD64(number, name) \
+ FIELD(number, name), \
+ [ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32)
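
The offset table below is indexed by the field encoding rotated left by 6 bits, which is how the sparse architectural encodings are mapped onto array slots (the FIELD64 variant adds a second slot for the *_HIGH alias). A standalone sketch of the index computation, using POSTED_INTR_NV's architectural encoding 0x0002:

#include <stdint.h>
#include <stdio.h>

#define ROL16(val, n) ((uint16_t)(((uint16_t)(val) << (n)) | ((uint16_t)(val) >> (16 - (n)))))

int main(void)
{
	uint16_t idx = ROL16(0x0002, 6);	/* POSTED_INTR_NV */

	/* posted_intr_nv therefore lands at vmcs12_field_offsets[128]. */
	printf("%u\n", idx);
	return 0;
}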
+
+const unsigned short vmcs12_field_offsets[] = {
+ FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
+ FIELD(POSTED_INTR_NV, posted_intr_nv),
+ FIELD(GUEST_ES_SELECTOR, guest_es_selector),
+ FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
+ FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
+ FIELD(GUEST_DS_SELECTOR, guest_ds_selector),
+ FIELD(GUEST_FS_SELECTOR, guest_fs_selector),
+ FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
+ FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
+ FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
+ FIELD(GUEST_INTR_STATUS, guest_intr_status),
+ FIELD(GUEST_PML_INDEX, guest_pml_index),
+ FIELD(HOST_ES_SELECTOR, host_es_selector),
+ FIELD(HOST_CS_SELECTOR, host_cs_selector),
+ FIELD(HOST_SS_SELECTOR, host_ss_selector),
+ FIELD(HOST_DS_SELECTOR, host_ds_selector),
+ FIELD(HOST_FS_SELECTOR, host_fs_selector),
+ FIELD(HOST_GS_SELECTOR, host_gs_selector),
+ FIELD(HOST_TR_SELECTOR, host_tr_selector),
+ FIELD64(IO_BITMAP_A, io_bitmap_a),
+ FIELD64(IO_BITMAP_B, io_bitmap_b),
+ FIELD64(MSR_BITMAP, msr_bitmap),
+ FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr),
+ FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr),
+ FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr),
+ FIELD64(PML_ADDRESS, pml_address),
+ FIELD64(TSC_OFFSET, tsc_offset),
+ FIELD64(TSC_MULTIPLIER, tsc_multiplier),
+ FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
+ FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
+ FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
+ FIELD64(VM_FUNCTION_CONTROL, vm_function_control),
+ FIELD64(EPT_POINTER, ept_pointer),
+ FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
+ FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
+ FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
+ FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
+ FIELD64(EPTP_LIST_ADDRESS, eptp_list_address),
+ FIELD64(VMREAD_BITMAP, vmread_bitmap),
+ FIELD64(VMWRITE_BITMAP, vmwrite_bitmap),
+ FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
+ FIELD64(ENCLS_EXITING_BITMAP, encls_exiting_bitmap),
+ FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
+ FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
+ FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
+ FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
+ FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
+ FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl),
+ FIELD64(GUEST_PDPTR0, guest_pdptr0),
+ FIELD64(GUEST_PDPTR1, guest_pdptr1),
+ FIELD64(GUEST_PDPTR2, guest_pdptr2),
+ FIELD64(GUEST_PDPTR3, guest_pdptr3),
+ FIELD64(GUEST_BNDCFGS, guest_bndcfgs),
+ FIELD64(HOST_IA32_PAT, host_ia32_pat),
+ FIELD64(HOST_IA32_EFER, host_ia32_efer),
+ FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
+ FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
+ FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
+ FIELD(EXCEPTION_BITMAP, exception_bitmap),
+ FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask),
+ FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match),
+ FIELD(CR3_TARGET_COUNT, cr3_target_count),
+ FIELD(VM_EXIT_CONTROLS, vm_exit_controls),
+ FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count),
+ FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count),
+ FIELD(VM_ENTRY_CONTROLS, vm_entry_controls),
+ FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count),
+ FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field),
+ FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code),
+ FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len),
+ FIELD(TPR_THRESHOLD, tpr_threshold),
+ FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control),
+ FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error),
+ FIELD(VM_EXIT_REASON, vm_exit_reason),
+ FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info),
+ FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code),
+ FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field),
+ FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code),
+ FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len),
+ FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info),
+ FIELD(GUEST_ES_LIMIT, guest_es_limit),
+ FIELD(GUEST_CS_LIMIT, guest_cs_limit),
+ FIELD(GUEST_SS_LIMIT, guest_ss_limit),
+ FIELD(GUEST_DS_LIMIT, guest_ds_limit),
+ FIELD(GUEST_FS_LIMIT, guest_fs_limit),
+ FIELD(GUEST_GS_LIMIT, guest_gs_limit),
+ FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit),
+ FIELD(GUEST_TR_LIMIT, guest_tr_limit),
+ FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit),
+ FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit),
+ FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes),
+ FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes),
+ FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes),
+ FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes),
+ FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes),
+ FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes),
+ FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes),
+ FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes),
+ FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info),
+ FIELD(GUEST_ACTIVITY_STATE, guest_activity_state),
+ FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs),
+ FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs),
+ FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value),
+ FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask),
+ FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
+ FIELD(CR0_READ_SHADOW, cr0_read_shadow),
+ FIELD(CR4_READ_SHADOW, cr4_read_shadow),
+ FIELD(EXIT_QUALIFICATION, exit_qualification),
+ FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address),
+ FIELD(GUEST_CR0, guest_cr0),
+ FIELD(GUEST_CR3, guest_cr3),
+ FIELD(GUEST_CR4, guest_cr4),
+ FIELD(GUEST_ES_BASE, guest_es_base),
+ FIELD(GUEST_CS_BASE, guest_cs_base),
+ FIELD(GUEST_SS_BASE, guest_ss_base),
+ FIELD(GUEST_DS_BASE, guest_ds_base),
+ FIELD(GUEST_FS_BASE, guest_fs_base),
+ FIELD(GUEST_GS_BASE, guest_gs_base),
+ FIELD(GUEST_LDTR_BASE, guest_ldtr_base),
+ FIELD(GUEST_TR_BASE, guest_tr_base),
+ FIELD(GUEST_GDTR_BASE, guest_gdtr_base),
+ FIELD(GUEST_IDTR_BASE, guest_idtr_base),
+ FIELD(GUEST_DR7, guest_dr7),
+ FIELD(GUEST_RSP, guest_rsp),
+ FIELD(GUEST_RIP, guest_rip),
+ FIELD(GUEST_RFLAGS, guest_rflags),
+ FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions),
+ FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp),
+ FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip),
+ FIELD(GUEST_S_CET, guest_s_cet),
+ FIELD(GUEST_SSP, guest_ssp),
+ FIELD(GUEST_INTR_SSP_TABLE, guest_ssp_tbl),
+ FIELD(HOST_CR0, host_cr0),
+ FIELD(HOST_CR3, host_cr3),
+ FIELD(HOST_CR4, host_cr4),
+ FIELD(HOST_FS_BASE, host_fs_base),
+ FIELD(HOST_GS_BASE, host_gs_base),
+ FIELD(HOST_TR_BASE, host_tr_base),
+ FIELD(HOST_GDTR_BASE, host_gdtr_base),
+ FIELD(HOST_IDTR_BASE, host_idtr_base),
+ FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp),
+ FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip),
+ FIELD(HOST_RSP, host_rsp),
+ FIELD(HOST_RIP, host_rip),
+ FIELD(HOST_S_CET, host_s_cet),
+ FIELD(HOST_SSP, host_ssp),
+ FIELD(HOST_INTR_SSP_TABLE, host_ssp_tbl),
+};
+const unsigned int nr_vmcs12_fields = ARRAY_SIZE(vmcs12_field_offsets);
diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h
new file mode 100644
index 000000000000..4ad6b16525b9
--- /dev/null
+++ b/arch/x86/kvm/vmx/vmcs12.h
@@ -0,0 +1,443 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMX_VMCS12_H
+#define __KVM_X86_VMX_VMCS12_H
+
+#include <linux/build_bug.h>
+
+#include "vmcs.h"
+
+/*
+ * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a
+ * single nested guest (L2), hence the name vmcs12. Any VMX implementation has
+ * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is
+ * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
+ * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
+ * More than one of these structures may exist, if L1 runs multiple L2 guests.
+ * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the
+ * underlying hardware which will be used to run L2.
+ * This structure is packed to ensure that its layout is identical across
+ * machines (necessary for live migration).
+ *
+ * IMPORTANT: Changing the layout of existing fields in this structure
+ * will break save/restore compatibility with older kvm releases. When
+ * adding new fields, either use space in the reserved padding* arrays
+ * or add the new fields to the end of the structure.
+ */
+typedef u64 natural_width;
+struct __packed vmcs12 {
+ /* According to the Intel spec, a VMCS region must start with the
+ * following two fields. Then follow implementation-specific data.
+ */
+ struct vmcs_hdr hdr;
+ u32 abort;
+
+ u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
+ u32 padding[7]; /* room for future expansion */
+
+ u64 io_bitmap_a;
+ u64 io_bitmap_b;
+ u64 msr_bitmap;
+ u64 vm_exit_msr_store_addr;
+ u64 vm_exit_msr_load_addr;
+ u64 vm_entry_msr_load_addr;
+ u64 tsc_offset;
+ u64 virtual_apic_page_addr;
+ u64 apic_access_addr;
+ u64 posted_intr_desc_addr;
+ u64 ept_pointer;
+ u64 eoi_exit_bitmap0;
+ u64 eoi_exit_bitmap1;
+ u64 eoi_exit_bitmap2;
+ u64 eoi_exit_bitmap3;
+ u64 xss_exit_bitmap;
+ u64 guest_physical_address;
+ u64 vmcs_link_pointer;
+ u64 guest_ia32_debugctl;
+ u64 guest_ia32_pat;
+ u64 guest_ia32_efer;
+ u64 guest_ia32_perf_global_ctrl;
+ u64 guest_pdptr0;
+ u64 guest_pdptr1;
+ u64 guest_pdptr2;
+ u64 guest_pdptr3;
+ u64 guest_bndcfgs;
+ u64 host_ia32_pat;
+ u64 host_ia32_efer;
+ u64 host_ia32_perf_global_ctrl;
+ u64 vmread_bitmap;
+ u64 vmwrite_bitmap;
+ u64 vm_function_control;
+ u64 eptp_list_address;
+ u64 pml_address;
+ u64 encls_exiting_bitmap;
+ u64 tsc_multiplier;
+ u64 padding64[1]; /* room for future expansion */
+ /*
+ * To allow migration of L1 (complete with its L2 guests) between
+ * machines of different natural widths (32 or 64 bit), we cannot have
+ * unsigned long fields with no explicit size. We use u64 (aliased
+ * natural_width) instead. Luckily, x86 is little-endian.
+ */
+ natural_width cr0_guest_host_mask;
+ natural_width cr4_guest_host_mask;
+ natural_width cr0_read_shadow;
+ natural_width cr4_read_shadow;
+ natural_width dead_space[4]; /* Last remnants of cr3_target_value[0-3]. */
+ natural_width exit_qualification;
+ natural_width guest_linear_address;
+ natural_width guest_cr0;
+ natural_width guest_cr3;
+ natural_width guest_cr4;
+ natural_width guest_es_base;
+ natural_width guest_cs_base;
+ natural_width guest_ss_base;
+ natural_width guest_ds_base;
+ natural_width guest_fs_base;
+ natural_width guest_gs_base;
+ natural_width guest_ldtr_base;
+ natural_width guest_tr_base;
+ natural_width guest_gdtr_base;
+ natural_width guest_idtr_base;
+ natural_width guest_dr7;
+ natural_width guest_rsp;
+ natural_width guest_rip;
+ natural_width guest_rflags;
+ natural_width guest_pending_dbg_exceptions;
+ natural_width guest_sysenter_esp;
+ natural_width guest_sysenter_eip;
+ natural_width host_cr0;
+ natural_width host_cr3;
+ natural_width host_cr4;
+ natural_width host_fs_base;
+ natural_width host_gs_base;
+ natural_width host_tr_base;
+ natural_width host_gdtr_base;
+ natural_width host_idtr_base;
+ natural_width host_ia32_sysenter_esp;
+ natural_width host_ia32_sysenter_eip;
+ natural_width host_rsp;
+ natural_width host_rip;
+ natural_width host_s_cet;
+ natural_width host_ssp;
+ natural_width host_ssp_tbl;
+ natural_width guest_s_cet;
+ natural_width guest_ssp;
+ natural_width guest_ssp_tbl;
+ natural_width paddingl[2]; /* room for future expansion */
+ u32 pin_based_vm_exec_control;
+ u32 cpu_based_vm_exec_control;
+ u32 exception_bitmap;
+ u32 page_fault_error_code_mask;
+ u32 page_fault_error_code_match;
+ u32 cr3_target_count;
+ u32 vm_exit_controls;
+ u32 vm_exit_msr_store_count;
+ u32 vm_exit_msr_load_count;
+ u32 vm_entry_controls;
+ u32 vm_entry_msr_load_count;
+ u32 vm_entry_intr_info_field;
+ u32 vm_entry_exception_error_code;
+ u32 vm_entry_instruction_len;
+ u32 tpr_threshold;
+ u32 secondary_vm_exec_control;
+ u32 vm_instruction_error;
+ u32 vm_exit_reason;
+ u32 vm_exit_intr_info;
+ u32 vm_exit_intr_error_code;
+ u32 idt_vectoring_info_field;
+ u32 idt_vectoring_error_code;
+ u32 vm_exit_instruction_len;
+ u32 vmx_instruction_info;
+ u32 guest_es_limit;
+ u32 guest_cs_limit;
+ u32 guest_ss_limit;
+ u32 guest_ds_limit;
+ u32 guest_fs_limit;
+ u32 guest_gs_limit;
+ u32 guest_ldtr_limit;
+ u32 guest_tr_limit;
+ u32 guest_gdtr_limit;
+ u32 guest_idtr_limit;
+ u32 guest_es_ar_bytes;
+ u32 guest_cs_ar_bytes;
+ u32 guest_ss_ar_bytes;
+ u32 guest_ds_ar_bytes;
+ u32 guest_fs_ar_bytes;
+ u32 guest_gs_ar_bytes;
+ u32 guest_ldtr_ar_bytes;
+ u32 guest_tr_ar_bytes;
+ u32 guest_interruptibility_info;
+ u32 guest_activity_state;
+ u32 guest_sysenter_cs;
+ u32 host_ia32_sysenter_cs;
+ u32 vmx_preemption_timer_value;
+ u32 padding32[7]; /* room for future expansion */
+ u16 virtual_processor_id;
+ u16 posted_intr_nv;
+ u16 guest_es_selector;
+ u16 guest_cs_selector;
+ u16 guest_ss_selector;
+ u16 guest_ds_selector;
+ u16 guest_fs_selector;
+ u16 guest_gs_selector;
+ u16 guest_ldtr_selector;
+ u16 guest_tr_selector;
+ u16 guest_intr_status;
+ u16 host_es_selector;
+ u16 host_cs_selector;
+ u16 host_ss_selector;
+ u16 host_ds_selector;
+ u16 host_fs_selector;
+ u16 host_gs_selector;
+ u16 host_tr_selector;
+ u16 guest_pml_index;
+};
+
+/*
+ * VMCS12_REVISION is KVM's arbitrary ID for the layout of struct vmcs12. KVM
+ * enumerates this value to L1 via MSR_IA32_VMX_BASIC, and checks the revision
+ * ID during nested VMPTRLD to verify that L1 is loading a VMCS that adheres
+ * to KVM's virtual CPU definition.
+ *
+ * DO NOT change this value, as it will break save/restore compatibility with
+ * older KVM releases.
+ */
+#define VMCS12_REVISION 0x11e57ed0
+
+/*
+ * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region
+ * and any VMCS region. Although only sizeof(struct vmcs12) bytes are used by
+ * the current implementation, 4K is reserved to avoid future complications and
+ * to preserve userspace ABI.
+ */
+#define VMCS12_SIZE KVM_STATE_NESTED_VMX_VMCS_SIZE
+
+/*
+ * For save/restore compatibility, the vmcs12 field offsets must not change,
+ * although appending fields and/or filling gaps is obviously allowed.
+ */
+#define CHECK_OFFSET(field, loc) \
+ ASSERT_STRUCT_OFFSET(struct vmcs12, field, loc)
+
+static inline void vmx_check_vmcs12_offsets(void)
+{
+ CHECK_OFFSET(hdr, 0);
+ CHECK_OFFSET(abort, 4);
+ CHECK_OFFSET(launch_state, 8);
+ CHECK_OFFSET(io_bitmap_a, 40);
+ CHECK_OFFSET(io_bitmap_b, 48);
+ CHECK_OFFSET(msr_bitmap, 56);
+ CHECK_OFFSET(vm_exit_msr_store_addr, 64);
+ CHECK_OFFSET(vm_exit_msr_load_addr, 72);
+ CHECK_OFFSET(vm_entry_msr_load_addr, 80);
+ CHECK_OFFSET(tsc_offset, 88);
+ CHECK_OFFSET(virtual_apic_page_addr, 96);
+ CHECK_OFFSET(apic_access_addr, 104);
+ CHECK_OFFSET(posted_intr_desc_addr, 112);
+ CHECK_OFFSET(ept_pointer, 120);
+ CHECK_OFFSET(eoi_exit_bitmap0, 128);
+ CHECK_OFFSET(eoi_exit_bitmap1, 136);
+ CHECK_OFFSET(eoi_exit_bitmap2, 144);
+ CHECK_OFFSET(eoi_exit_bitmap3, 152);
+ CHECK_OFFSET(xss_exit_bitmap, 160);
+ CHECK_OFFSET(guest_physical_address, 168);
+ CHECK_OFFSET(vmcs_link_pointer, 176);
+ CHECK_OFFSET(guest_ia32_debugctl, 184);
+ CHECK_OFFSET(guest_ia32_pat, 192);
+ CHECK_OFFSET(guest_ia32_efer, 200);
+ CHECK_OFFSET(guest_ia32_perf_global_ctrl, 208);
+ CHECK_OFFSET(guest_pdptr0, 216);
+ CHECK_OFFSET(guest_pdptr1, 224);
+ CHECK_OFFSET(guest_pdptr2, 232);
+ CHECK_OFFSET(guest_pdptr3, 240);
+ CHECK_OFFSET(guest_bndcfgs, 248);
+ CHECK_OFFSET(host_ia32_pat, 256);
+ CHECK_OFFSET(host_ia32_efer, 264);
+ CHECK_OFFSET(host_ia32_perf_global_ctrl, 272);
+ CHECK_OFFSET(vmread_bitmap, 280);
+ CHECK_OFFSET(vmwrite_bitmap, 288);
+ CHECK_OFFSET(vm_function_control, 296);
+ CHECK_OFFSET(eptp_list_address, 304);
+ CHECK_OFFSET(pml_address, 312);
+ CHECK_OFFSET(encls_exiting_bitmap, 320);
+ CHECK_OFFSET(tsc_multiplier, 328);
+ CHECK_OFFSET(cr0_guest_host_mask, 344);
+ CHECK_OFFSET(cr4_guest_host_mask, 352);
+ CHECK_OFFSET(cr0_read_shadow, 360);
+ CHECK_OFFSET(cr4_read_shadow, 368);
+ CHECK_OFFSET(dead_space, 376);
+ CHECK_OFFSET(exit_qualification, 408);
+ CHECK_OFFSET(guest_linear_address, 416);
+ CHECK_OFFSET(guest_cr0, 424);
+ CHECK_OFFSET(guest_cr3, 432);
+ CHECK_OFFSET(guest_cr4, 440);
+ CHECK_OFFSET(guest_es_base, 448);
+ CHECK_OFFSET(guest_cs_base, 456);
+ CHECK_OFFSET(guest_ss_base, 464);
+ CHECK_OFFSET(guest_ds_base, 472);
+ CHECK_OFFSET(guest_fs_base, 480);
+ CHECK_OFFSET(guest_gs_base, 488);
+ CHECK_OFFSET(guest_ldtr_base, 496);
+ CHECK_OFFSET(guest_tr_base, 504);
+ CHECK_OFFSET(guest_gdtr_base, 512);
+ CHECK_OFFSET(guest_idtr_base, 520);
+ CHECK_OFFSET(guest_dr7, 528);
+ CHECK_OFFSET(guest_rsp, 536);
+ CHECK_OFFSET(guest_rip, 544);
+ CHECK_OFFSET(guest_rflags, 552);
+ CHECK_OFFSET(guest_pending_dbg_exceptions, 560);
+ CHECK_OFFSET(guest_sysenter_esp, 568);
+ CHECK_OFFSET(guest_sysenter_eip, 576);
+ CHECK_OFFSET(host_cr0, 584);
+ CHECK_OFFSET(host_cr3, 592);
+ CHECK_OFFSET(host_cr4, 600);
+ CHECK_OFFSET(host_fs_base, 608);
+ CHECK_OFFSET(host_gs_base, 616);
+ CHECK_OFFSET(host_tr_base, 624);
+ CHECK_OFFSET(host_gdtr_base, 632);
+ CHECK_OFFSET(host_idtr_base, 640);
+ CHECK_OFFSET(host_ia32_sysenter_esp, 648);
+ CHECK_OFFSET(host_ia32_sysenter_eip, 656);
+ CHECK_OFFSET(host_rsp, 664);
+ CHECK_OFFSET(host_rip, 672);
+ CHECK_OFFSET(host_s_cet, 680);
+ CHECK_OFFSET(host_ssp, 688);
+ CHECK_OFFSET(host_ssp_tbl, 696);
+ CHECK_OFFSET(guest_s_cet, 704);
+ CHECK_OFFSET(guest_ssp, 712);
+ CHECK_OFFSET(guest_ssp_tbl, 720);
+ CHECK_OFFSET(pin_based_vm_exec_control, 744);
+ CHECK_OFFSET(cpu_based_vm_exec_control, 748);
+ CHECK_OFFSET(exception_bitmap, 752);
+ CHECK_OFFSET(page_fault_error_code_mask, 756);
+ CHECK_OFFSET(page_fault_error_code_match, 760);
+ CHECK_OFFSET(cr3_target_count, 764);
+ CHECK_OFFSET(vm_exit_controls, 768);
+ CHECK_OFFSET(vm_exit_msr_store_count, 772);
+ CHECK_OFFSET(vm_exit_msr_load_count, 776);
+ CHECK_OFFSET(vm_entry_controls, 780);
+ CHECK_OFFSET(vm_entry_msr_load_count, 784);
+ CHECK_OFFSET(vm_entry_intr_info_field, 788);
+ CHECK_OFFSET(vm_entry_exception_error_code, 792);
+ CHECK_OFFSET(vm_entry_instruction_len, 796);
+ CHECK_OFFSET(tpr_threshold, 800);
+ CHECK_OFFSET(secondary_vm_exec_control, 804);
+ CHECK_OFFSET(vm_instruction_error, 808);
+ CHECK_OFFSET(vm_exit_reason, 812);
+ CHECK_OFFSET(vm_exit_intr_info, 816);
+ CHECK_OFFSET(vm_exit_intr_error_code, 820);
+ CHECK_OFFSET(idt_vectoring_info_field, 824);
+ CHECK_OFFSET(idt_vectoring_error_code, 828);
+ CHECK_OFFSET(vm_exit_instruction_len, 832);
+ CHECK_OFFSET(vmx_instruction_info, 836);
+ CHECK_OFFSET(guest_es_limit, 840);
+ CHECK_OFFSET(guest_cs_limit, 844);
+ CHECK_OFFSET(guest_ss_limit, 848);
+ CHECK_OFFSET(guest_ds_limit, 852);
+ CHECK_OFFSET(guest_fs_limit, 856);
+ CHECK_OFFSET(guest_gs_limit, 860);
+ CHECK_OFFSET(guest_ldtr_limit, 864);
+ CHECK_OFFSET(guest_tr_limit, 868);
+ CHECK_OFFSET(guest_gdtr_limit, 872);
+ CHECK_OFFSET(guest_idtr_limit, 876);
+ CHECK_OFFSET(guest_es_ar_bytes, 880);
+ CHECK_OFFSET(guest_cs_ar_bytes, 884);
+ CHECK_OFFSET(guest_ss_ar_bytes, 888);
+ CHECK_OFFSET(guest_ds_ar_bytes, 892);
+ CHECK_OFFSET(guest_fs_ar_bytes, 896);
+ CHECK_OFFSET(guest_gs_ar_bytes, 900);
+ CHECK_OFFSET(guest_ldtr_ar_bytes, 904);
+ CHECK_OFFSET(guest_tr_ar_bytes, 908);
+ CHECK_OFFSET(guest_interruptibility_info, 912);
+ CHECK_OFFSET(guest_activity_state, 916);
+ CHECK_OFFSET(guest_sysenter_cs, 920);
+ CHECK_OFFSET(host_ia32_sysenter_cs, 924);
+ CHECK_OFFSET(vmx_preemption_timer_value, 928);
+ CHECK_OFFSET(virtual_processor_id, 960);
+ CHECK_OFFSET(posted_intr_nv, 962);
+ CHECK_OFFSET(guest_es_selector, 964);
+ CHECK_OFFSET(guest_cs_selector, 966);
+ CHECK_OFFSET(guest_ss_selector, 968);
+ CHECK_OFFSET(guest_ds_selector, 970);
+ CHECK_OFFSET(guest_fs_selector, 972);
+ CHECK_OFFSET(guest_gs_selector, 974);
+ CHECK_OFFSET(guest_ldtr_selector, 976);
+ CHECK_OFFSET(guest_tr_selector, 978);
+ CHECK_OFFSET(guest_intr_status, 980);
+ CHECK_OFFSET(host_es_selector, 982);
+ CHECK_OFFSET(host_cs_selector, 984);
+ CHECK_OFFSET(host_ss_selector, 986);
+ CHECK_OFFSET(host_ds_selector, 988);
+ CHECK_OFFSET(host_fs_selector, 990);
+ CHECK_OFFSET(host_gs_selector, 992);
+ CHECK_OFFSET(host_tr_selector, 994);
+ CHECK_OFFSET(guest_pml_index, 996);
+}
+
+extern const unsigned short vmcs12_field_offsets[];
+extern const unsigned int nr_vmcs12_fields;
+
+static inline short get_vmcs12_field_offset(unsigned long field)
+{
+ unsigned short offset;
+ unsigned int index;
+
+ if (field >> 15)
+ return -ENOENT;
+
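+ /*
+  * Rotate the encoding so its high-order bits (the field's width and type)
+  * become the low bits of the index into vmcs12_field_offsets[].
+  */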
+ index = ROL16(field, 6);
+ if (index >= nr_vmcs12_fields)
+ return -ENOENT;
+
+ index = array_index_nospec(index, nr_vmcs12_fields);
+ offset = vmcs12_field_offsets[index];
+ if (offset == 0)
+ return -ENOENT;
+ return offset;
+}
+
+static inline u64 vmcs12_read_any(struct vmcs12 *vmcs12, unsigned long field,
+ u16 offset)
+{
+ char *p = (char *)vmcs12 + offset;
+
+ switch (vmcs_field_width(field)) {
+ case VMCS_FIELD_WIDTH_NATURAL_WIDTH:
+ return *((natural_width *)p);
+ case VMCS_FIELD_WIDTH_U16:
+ return *((u16 *)p);
+ case VMCS_FIELD_WIDTH_U32:
+ return *((u32 *)p);
+ case VMCS_FIELD_WIDTH_U64:
+ return *((u64 *)p);
+ default:
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+}
+
+static inline void vmcs12_write_any(struct vmcs12 *vmcs12, unsigned long field,
+ u16 offset, u64 field_value)
+{
+ char *p = (char *)vmcs12 + offset;
+
+ switch (vmcs_field_width(field)) {
+ case VMCS_FIELD_WIDTH_U16:
+ *(u16 *)p = field_value;
+ break;
+ case VMCS_FIELD_WIDTH_U32:
+ *(u32 *)p = field_value;
+ break;
+ case VMCS_FIELD_WIDTH_U64:
+ *(u64 *)p = field_value;
+ break;
+ case VMCS_FIELD_WIDTH_NATURAL_WIDTH:
+ *(natural_width *)p = field_value;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+}
+
+#endif /* __KVM_X86_VMX_VMCS12_H */
diff --git a/arch/x86/kvm/vmx/vmcs_shadow_fields.h b/arch/x86/kvm/vmx/vmcs_shadow_fields.h
new file mode 100644
index 000000000000..cad128d1657b
--- /dev/null
+++ b/arch/x86/kvm/vmx/vmcs_shadow_fields.h
@@ -0,0 +1,79 @@
+#if !defined(SHADOW_FIELD_RO) && !defined(SHADOW_FIELD_RW)
+BUILD_BUG_ON(1)
+#endif
+
+#ifndef SHADOW_FIELD_RO
+#define SHADOW_FIELD_RO(x, y)
+#endif
+#ifndef SHADOW_FIELD_RW
+#define SHADOW_FIELD_RW(x, y)
+#endif
+
+/*
+ * We do NOT shadow fields that are modified when L0
+ * traps and emulates any vmx instruction (e.g. VMPTRLD,
+ * VMXON...) executed by L1.
+ * For example, VM_INSTRUCTION_ERROR is read
+ * by L1 if a vmx instruction fails (part of the error path).
+ * Note that the code assumes this logic. If for some reason we start
+ * shadowing these fields then we need to force a shadow sync when L0
+ * emulates vmx instructions (e.g. force a sync if VM_INSTRUCTION_ERROR
+ * is modified by nested_vmx_failValid).
+ *
+ * When adding or removing fields here, note that shadowed
+ * fields must always be synced by prepare_vmcs02, not just
+ * prepare_vmcs02_rare.
+ */
+
+/*
+ * Keeping the fields ordered by size is an attempt at improving
+ * branch prediction in vmcs12_read_any and vmcs12_write_any.
+ */
+
+/* 16-bits */
+SHADOW_FIELD_RW(GUEST_INTR_STATUS, guest_intr_status)
+SHADOW_FIELD_RW(GUEST_PML_INDEX, guest_pml_index)
+SHADOW_FIELD_RW(HOST_FS_SELECTOR, host_fs_selector)
+SHADOW_FIELD_RW(HOST_GS_SELECTOR, host_gs_selector)
+
+/* 32-bits */
+SHADOW_FIELD_RO(VM_EXIT_REASON, vm_exit_reason)
+SHADOW_FIELD_RO(VM_EXIT_INTR_INFO, vm_exit_intr_info)
+SHADOW_FIELD_RO(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len)
+SHADOW_FIELD_RO(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field)
+SHADOW_FIELD_RO(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code)
+SHADOW_FIELD_RO(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code)
+SHADOW_FIELD_RO(GUEST_CS_AR_BYTES, guest_cs_ar_bytes)
+SHADOW_FIELD_RO(GUEST_SS_AR_BYTES, guest_ss_ar_bytes)
+SHADOW_FIELD_RW(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control)
+SHADOW_FIELD_RW(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control)
+SHADOW_FIELD_RW(EXCEPTION_BITMAP, exception_bitmap)
+SHADOW_FIELD_RW(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code)
+SHADOW_FIELD_RW(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field)
+SHADOW_FIELD_RW(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len)
+SHADOW_FIELD_RW(TPR_THRESHOLD, tpr_threshold)
+SHADOW_FIELD_RW(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info)
+SHADOW_FIELD_RW(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value)
+
+/* Natural width */
+SHADOW_FIELD_RO(EXIT_QUALIFICATION, exit_qualification)
+SHADOW_FIELD_RO(GUEST_LINEAR_ADDRESS, guest_linear_address)
+SHADOW_FIELD_RW(GUEST_RIP, guest_rip)
+SHADOW_FIELD_RW(GUEST_RSP, guest_rsp)
+SHADOW_FIELD_RW(GUEST_CR0, guest_cr0)
+SHADOW_FIELD_RW(GUEST_CR3, guest_cr3)
+SHADOW_FIELD_RW(GUEST_CR4, guest_cr4)
+SHADOW_FIELD_RW(GUEST_RFLAGS, guest_rflags)
+SHADOW_FIELD_RW(CR0_GUEST_HOST_MASK, cr0_guest_host_mask)
+SHADOW_FIELD_RW(CR0_READ_SHADOW, cr0_read_shadow)
+SHADOW_FIELD_RW(CR4_READ_SHADOW, cr4_read_shadow)
+SHADOW_FIELD_RW(HOST_FS_BASE, host_fs_base)
+SHADOW_FIELD_RW(HOST_GS_BASE, host_gs_base)
+
+/* 64-bit */
+SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS, guest_physical_address)
+SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS_HIGH, guest_physical_address)
+
+#undef SHADOW_FIELD_RO
+#undef SHADOW_FIELD_RW
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
new file mode 100644
index 000000000000..4426d34811fc
--- /dev/null
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -0,0 +1,387 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <asm/asm.h>
+#include <asm/bitsperlong.h>
+#include <asm/kvm_vcpu_regs.h>
+#include <asm/nospec-branch.h>
+#include <asm/percpu.h>
+#include <asm/segment.h>
+#include "kvm-asm-offsets.h"
+#include "run_flags.h"
+
+#define WORD_SIZE (BITS_PER_LONG / 8)
+
+#define VCPU_RAX __VCPU_REGS_RAX * WORD_SIZE
+#define VCPU_RCX __VCPU_REGS_RCX * WORD_SIZE
+#define VCPU_RDX __VCPU_REGS_RDX * WORD_SIZE
+#define VCPU_RBX __VCPU_REGS_RBX * WORD_SIZE
+/* Intentionally omit RSP as it's context switched by hardware */
+#define VCPU_RBP __VCPU_REGS_RBP * WORD_SIZE
+#define VCPU_RSI __VCPU_REGS_RSI * WORD_SIZE
+#define VCPU_RDI __VCPU_REGS_RDI * WORD_SIZE
+
+#ifdef CONFIG_X86_64
+#define VCPU_R8 __VCPU_REGS_R8 * WORD_SIZE
+#define VCPU_R9 __VCPU_REGS_R9 * WORD_SIZE
+#define VCPU_R10 __VCPU_REGS_R10 * WORD_SIZE
+#define VCPU_R11 __VCPU_REGS_R11 * WORD_SIZE
+#define VCPU_R12 __VCPU_REGS_R12 * WORD_SIZE
+#define VCPU_R13 __VCPU_REGS_R13 * WORD_SIZE
+#define VCPU_R14 __VCPU_REGS_R14 * WORD_SIZE
+#define VCPU_R15 __VCPU_REGS_R15 * WORD_SIZE
+#endif
+
+.macro VMX_DO_EVENT_IRQOFF call_insn call_target
+ /*
+ * Unconditionally create a stack frame, getting the correct RSP on the
+ * stack (for x86-64) would take two instructions anyways, and RBP can
+ * be used to restore RSP to make objtool happy (see below).
+ */
+ push %_ASM_BP
+ mov %_ASM_SP, %_ASM_BP
+
+#ifdef CONFIG_X86_64
+ /*
+ * Align RSP to a 16-byte boundary (to emulate CPU behavior) before
+ * creating the synthetic interrupt stack frame for the IRQ/NMI.
+ */
+ and $-16, %rsp
+ push $__KERNEL_DS
+ push %rbp
+#endif
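+ /*
+  * Together with the CALL below, this builds the frame an IRQ/NMI handler
+  * expects: (SS and RSP on 64-bit,) RFLAGS, CS and RIP, so it can IRET back.
+  */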
+ pushf
+ push $__KERNEL_CS
+ \call_insn \call_target
+
+ /*
+ * "Restore" RSP from RBP, even though IRET has already unwound RSP to
+ * the correct value. objtool doesn't know the callee will IRET and,
+ * without the explicit restore, thinks the stack is getting walloped.
+ * Using an unwind hint is problematic due to x86-64's dynamic alignment.
+ */
+ leave
+ RET
+.endm
+
+.section .noinstr.text, "ax"
+
+/**
+ * __vmx_vcpu_run - Run a vCPU via a transition to VMX guest mode
+ * @vmx: struct vcpu_vmx *
+ * @regs: unsigned long * (to guest registers)
+ * @flags: VMX_RUN_VMRESUME: use VMRESUME instead of VMLAUNCH
+ * VMX_RUN_SAVE_SPEC_CTRL: save guest SPEC_CTRL into vmx->spec_ctrl
+ * VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO: vCPU can access host MMIO
+ *
+ * Returns:
+ * 0 on VM-Exit, 1 on VM-Fail
+ */
+SYM_FUNC_START(__vmx_vcpu_run)
+ push %_ASM_BP
+ mov %_ASM_SP, %_ASM_BP
+#ifdef CONFIG_X86_64
+ push %r15
+ push %r14
+ push %r13
+ push %r12
+#else
+ push %edi
+ push %esi
+#endif
+ push %_ASM_BX
+
+ /* Save @vmx for SPEC_CTRL handling */
+ push %_ASM_ARG1
+
+ /* Save @flags (used for VMLAUNCH vs. VMRESUME and mitigations). */
+ push %_ASM_ARG3
+
+ /*
+ * Save @regs, _ASM_ARG2 may be modified by vmx_update_host_rsp() and
+ * @regs is needed after VM-Exit to save the guest's register values.
+ */
+ push %_ASM_ARG2
+
+ lea (%_ASM_SP), %_ASM_ARG2
+ call vmx_update_host_rsp
+
+ ALTERNATIVE "jmp .Lspec_ctrl_done", "", X86_FEATURE_MSR_SPEC_CTRL
+
+ /*
+ * SPEC_CTRL handling: if the guest's SPEC_CTRL value differs from the
+ * host's, write the MSR.
+ *
+ * IMPORTANT: To avoid RSB underflow attacks and any other nastiness,
+ * there must not be any returns or indirect branches between this code
+ * and vmentry.
+ */
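+ /*
+  * The top of the stack holds @regs, @flags and @vmx (in that order, from
+  * the pushes above), so @vmx lives at 2*WORD_SIZE off of RSP.
+  */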
+ mov 2*WORD_SIZE(%_ASM_SP), %_ASM_DI
+#ifdef CONFIG_X86_64
+ mov VMX_spec_ctrl(%rdi), %rdx
+ cmp PER_CPU_VAR(x86_spec_ctrl_current), %rdx
+ je .Lspec_ctrl_done
+ movl %edx, %eax
+ shr $32, %rdx
+#else
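+ /* 32-bit: compare both 32-bit halves of the guest and host SPEC_CTRL values. */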
+ mov VMX_spec_ctrl(%edi), %eax
+ mov PER_CPU_VAR(x86_spec_ctrl_current), %ecx
+ xor %eax, %ecx
+ mov VMX_spec_ctrl + 4(%edi), %edx
+ mov PER_CPU_VAR(x86_spec_ctrl_current + 4), %edi
+ xor %edx, %edi
+ or %edi, %ecx
+ je .Lspec_ctrl_done
+#endif
+ mov $MSR_IA32_SPEC_CTRL, %ecx
+ wrmsr
+
+.Lspec_ctrl_done:
+
+ /*
+ * Since vmentry is serializing on affected CPUs, there's no need for
+ * an LFENCE to stop speculation from skipping the wrmsr.
+ */
+
+ /* Load @regs to RAX. */
+ mov (%_ASM_SP), %_ASM_AX
+
+ /* Load guest registers. Don't clobber flags. */
+ mov VCPU_RCX(%_ASM_AX), %_ASM_CX
+ mov VCPU_RDX(%_ASM_AX), %_ASM_DX
+ mov VCPU_RBX(%_ASM_AX), %_ASM_BX
+ mov VCPU_RBP(%_ASM_AX), %_ASM_BP
+ mov VCPU_RSI(%_ASM_AX), %_ASM_SI
+ mov VCPU_RDI(%_ASM_AX), %_ASM_DI
+#ifdef CONFIG_X86_64
+ mov VCPU_R8 (%_ASM_AX), %r8
+ mov VCPU_R9 (%_ASM_AX), %r9
+ mov VCPU_R10(%_ASM_AX), %r10
+ mov VCPU_R11(%_ASM_AX), %r11
+ mov VCPU_R12(%_ASM_AX), %r12
+ mov VCPU_R13(%_ASM_AX), %r13
+ mov VCPU_R14(%_ASM_AX), %r14
+ mov VCPU_R15(%_ASM_AX), %r15
+#endif
+ /* Load guest RAX. This kills the @regs pointer! */
+ mov VCPU_RAX(%_ASM_AX), %_ASM_AX
+
+ /*
+ * Note, ALTERNATIVE_2 works in reverse order. If CLEAR_CPU_BUF_VM is
+ * enabled, do VERW unconditionally. If CPU_BUF_VM_MMIO is enabled,
+ * check @flags to see if the vCPU has access to host MMIO, and if so,
+ * do VERW. Else, do nothing (no mitigations needed/enabled).
+ */
+ ALTERNATIVE_2 "", \
+ __stringify(testl $VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO, WORD_SIZE(%_ASM_SP); \
+ jz .Lskip_mmio_verw; \
+ VERW; \
+ .Lskip_mmio_verw:), \
+ X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO, \
+ __stringify(VERW), X86_FEATURE_CLEAR_CPU_BUF_VM
+
+ /* Check @flags to see if VMLAUNCH or VMRESUME is needed. */
+ testl $VMX_RUN_VMRESUME, WORD_SIZE(%_ASM_SP)
+ jz .Lvmlaunch
+
+ /*
+ * After a successful VMRESUME/VMLAUNCH, control flow "magically"
+ * resumes below at 'vmx_vmexit' due to the VMCS HOST_RIP setting.
+ * So this isn't a typical function and objtool needs to be told to
+ * save the unwind state here and restore it below.
+ */
+ UNWIND_HINT_SAVE
+
+/*
+ * If VMRESUME/VMLAUNCH and corresponding vmexit succeed, execution resumes at
+ * the 'vmx_vmexit' label below.
+ */
+.Lvmresume:
+ vmresume
+ jmp .Lvmfail
+
+.Lvmlaunch:
+ vmlaunch
+ jmp .Lvmfail
+
+ _ASM_EXTABLE(.Lvmresume, .Lfixup)
+ _ASM_EXTABLE(.Lvmlaunch, .Lfixup)
+
+SYM_INNER_LABEL_ALIGN(vmx_vmexit, SYM_L_GLOBAL)
+
+ /* Restore unwind state from before the VMRESUME/VMLAUNCH. */
+ UNWIND_HINT_RESTORE
+ ENDBR
+
+ /* Temporarily save guest's RAX. */
+ push %_ASM_AX
+
+ /* Reload @regs to RAX. */
+ mov WORD_SIZE(%_ASM_SP), %_ASM_AX
+
+ /* Save all guest registers, including RAX from the stack */
+ pop VCPU_RAX(%_ASM_AX)
+ mov %_ASM_CX, VCPU_RCX(%_ASM_AX)
+ mov %_ASM_DX, VCPU_RDX(%_ASM_AX)
+ mov %_ASM_BX, VCPU_RBX(%_ASM_AX)
+ mov %_ASM_BP, VCPU_RBP(%_ASM_AX)
+ mov %_ASM_SI, VCPU_RSI(%_ASM_AX)
+ mov %_ASM_DI, VCPU_RDI(%_ASM_AX)
+#ifdef CONFIG_X86_64
+ mov %r8, VCPU_R8 (%_ASM_AX)
+ mov %r9, VCPU_R9 (%_ASM_AX)
+ mov %r10, VCPU_R10(%_ASM_AX)
+ mov %r11, VCPU_R11(%_ASM_AX)
+ mov %r12, VCPU_R12(%_ASM_AX)
+ mov %r13, VCPU_R13(%_ASM_AX)
+ mov %r14, VCPU_R14(%_ASM_AX)
+ mov %r15, VCPU_R15(%_ASM_AX)
+#endif
+
+ /* Clear return value to indicate VM-Exit (as opposed to VM-Fail). */
+ xor %ebx, %ebx
+
+.Lclear_regs:
+ /* Discard @regs. The register is irrelevant, it just can't be RBX. */
+ pop %_ASM_AX
+
+ /*
+ * Clear all general purpose registers except RSP and RBX to prevent
+ * speculative use of the guest's values, even those that are reloaded
+ * via the stack. In theory, an L1 cache miss when restoring registers
+ * could lead to speculative execution with the guest's values.
+ * Zeroing XORs are dirt cheap, i.e. the extra paranoia is essentially
+ * free. RSP and RBX are exempt as RSP is restored by hardware during
+ * VM-Exit and RBX is explicitly loaded with 0 or 1 to hold the return
+ * value.
+ */
+ xor %eax, %eax
+ xor %ecx, %ecx
+ xor %edx, %edx
+ xor %ebp, %ebp
+ xor %esi, %esi
+ xor %edi, %edi
+#ifdef CONFIG_X86_64
+ xor %r8d, %r8d
+ xor %r9d, %r9d
+ xor %r10d, %r10d
+ xor %r11d, %r11d
+ xor %r12d, %r12d
+ xor %r13d, %r13d
+ xor %r14d, %r14d
+ xor %r15d, %r15d
+#endif
+
+ /*
+ * IMPORTANT: RSB filling and SPEC_CTRL handling must be done before
+ * the first unbalanced RET after vmexit!
+ *
+ * For retpoline or IBRS, RSB filling is needed to prevent poisoned RSB
+ * entries and (in some cases) RSB underflow.
+ *
+ * eIBRS has its own protection against poisoned RSB, so it doesn't
+ * need the RSB filling sequence. But it does need to be enabled, and a
+ * single call to retire, before the first unbalanced RET.
+ */
+
+ FILL_RETURN_BUFFER %_ASM_CX, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT,\
+ X86_FEATURE_RSB_VMEXIT_LITE
+
+ pop %_ASM_ARG2 /* @flags */
+ pop %_ASM_ARG1 /* @vmx */
+
+ call vmx_spec_ctrl_restore_host
+
+ CLEAR_BRANCH_HISTORY_VMEXIT
+
+ /* Put return value in AX */
+ mov %_ASM_BX, %_ASM_AX
+
+ pop %_ASM_BX
+#ifdef CONFIG_X86_64
+ pop %r12
+ pop %r13
+ pop %r14
+ pop %r15
+#else
+ pop %esi
+ pop %edi
+#endif
+ pop %_ASM_BP
+ RET
+
+.Lfixup:
+ cmpb $0, _ASM_RIP(kvm_rebooting)
+ jne .Lvmfail
+ ud2
+.Lvmfail:
+ /* VM-Fail: set return value to 1 */
+ mov $1, %_ASM_BX
+ jmp .Lclear_regs
+
+SYM_FUNC_END(__vmx_vcpu_run)
+
+SYM_FUNC_START(vmx_do_nmi_irqoff)
+ VMX_DO_EVENT_IRQOFF call asm_exc_nmi_kvm_vmx
+SYM_FUNC_END(vmx_do_nmi_irqoff)
+
+#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+
+/**
+ * vmread_error_trampoline - Trampoline from inline asm to vmread_error()
+ * @field: VMCS field encoding that failed
+ * @fault: %true if the VMREAD faulted, %false if it failed
+ *
+ * Save and restore volatile registers across a call to vmread_error(). Note,
+ * all parameters are passed on the stack.
+ */
+SYM_FUNC_START(vmread_error_trampoline)
+ push %_ASM_BP
+ mov %_ASM_SP, %_ASM_BP
+
+ push %_ASM_AX
+ push %_ASM_CX
+ push %_ASM_DX
+#ifdef CONFIG_X86_64
+ push %rdi
+ push %rsi
+ push %r8
+ push %r9
+ push %r10
+ push %r11
+#endif
+
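+ /*
+  * Both parameters were passed on the stack by the caller: relative to the
+  * frame pointer, @field sits at 2*WORD_SIZE and @fault at 3*WORD_SIZE,
+  * just above the saved RBP and the return address.
+  */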
+ /* Load @field and @fault to arg1 and arg2 respectively. */
+ mov 3*WORD_SIZE(%_ASM_BP), %_ASM_ARG2
+ mov 2*WORD_SIZE(%_ASM_BP), %_ASM_ARG1
+
+ call vmread_error_trampoline2
+
+ /* Zero out @fault, which will be popped into the result register. */
+ _ASM_MOV $0, 3*WORD_SIZE(%_ASM_BP)
+
+#ifdef CONFIG_X86_64
+ pop %r11
+ pop %r10
+ pop %r9
+ pop %r8
+ pop %rsi
+ pop %rdi
+#endif
+ pop %_ASM_DX
+ pop %_ASM_CX
+ pop %_ASM_AX
+ pop %_ASM_BP
+
+ RET
+SYM_FUNC_END(vmread_error_trampoline)
+#endif
+
+.section .text, "ax"
+
+#ifndef CONFIG_X86_FRED
+
+SYM_FUNC_START(vmx_do_interrupt_irqoff)
+ VMX_DO_EVENT_IRQOFF CALL_NOSPEC _ASM_ARG1
+SYM_FUNC_END(vmx_do_interrupt_irqoff)
+
+#endif
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
new file mode 100644
index 000000000000..4cbe8c84b636
--- /dev/null
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -0,0 +1,8774 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Avi Kivity <avi@qumranet.com>
+ * Yaniv Kamay <yaniv@qumranet.com>
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/highmem.h>
+#include <linux/hrtimer.h>
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/mod_devicetable.h>
+#include <linux/mm.h>
+#include <linux/objtool.h>
+#include <linux/sched.h>
+#include <linux/sched/smt.h>
+#include <linux/slab.h>
+#include <linux/tboot.h>
+#include <linux/trace_events.h>
+
+#include <asm/apic.h>
+#include <asm/asm.h>
+#include <asm/cpu.h>
+#include <asm/cpu_device_id.h>
+#include <asm/debugreg.h>
+#include <asm/desc.h>
+#include <asm/fpu/api.h>
+#include <asm/fpu/xstate.h>
+#include <asm/fred.h>
+#include <asm/idtentry.h>
+#include <asm/io.h>
+#include <asm/irq_remapping.h>
+#include <asm/reboot.h>
+#include <asm/perf_event.h>
+#include <asm/mmu_context.h>
+#include <asm/mshyperv.h>
+#include <asm/msr.h>
+#include <asm/mwait.h>
+#include <asm/spec-ctrl.h>
+#include <asm/vmx.h>
+
+#include <trace/events/ipi.h>
+
+#include "capabilities.h"
+#include "common.h"
+#include "cpuid.h"
+#include "hyperv.h"
+#include "kvm_onhyperv.h"
+#include "irq.h"
+#include "kvm_cache_regs.h"
+#include "lapic.h"
+#include "mmu.h"
+#include "nested.h"
+#include "pmu.h"
+#include "sgx.h"
+#include "trace.h"
+#include "vmcs.h"
+#include "vmcs12.h"
+#include "vmx.h"
+#include "x86.h"
+#include "x86_ops.h"
+#include "smm.h"
+#include "vmx_onhyperv.h"
+#include "posted_intr.h"
+
+#include "mmu/spte.h"
+
+MODULE_AUTHOR("Qumranet");
+MODULE_DESCRIPTION("KVM support for VMX (Intel VT-x) extensions");
+MODULE_LICENSE("GPL");
+
+#ifdef MODULE
+static const struct x86_cpu_id vmx_cpu_id[] = {
+ X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL),
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
+#endif
+
+bool __read_mostly enable_vpid = 1;
+module_param_named(vpid, enable_vpid, bool, 0444);
+
+static bool __read_mostly enable_vnmi = 1;
+module_param_named(vnmi, enable_vnmi, bool, 0444);
+
+bool __read_mostly flexpriority_enabled = 1;
+module_param_named(flexpriority, flexpriority_enabled, bool, 0444);
+
+bool __read_mostly enable_ept = 1;
+module_param_named(ept, enable_ept, bool, 0444);
+
+bool __read_mostly enable_unrestricted_guest = 1;
+module_param_named(unrestricted_guest,
+ enable_unrestricted_guest, bool, 0444);
+
+bool __read_mostly enable_ept_ad_bits = 1;
+module_param_named(eptad, enable_ept_ad_bits, bool, 0444);
+
+static bool __read_mostly emulate_invalid_guest_state = true;
+module_param(emulate_invalid_guest_state, bool, 0444);
+
+static bool __read_mostly fasteoi = 1;
+module_param(fasteoi, bool, 0444);
+
+module_param(enable_apicv, bool, 0444);
+module_param(enable_ipiv, bool, 0444);
+
+module_param(enable_device_posted_irqs, bool, 0444);
+
+/*
+ * If nested=1, nested virtualization is supported, i.e., guests may use
+ * VMX and act as hypervisors for their own guests. If nested=0, guests may not
+ * use VMX instructions.
+ */
+static bool __read_mostly nested = 1;
+module_param(nested, bool, 0444);
+
+bool __read_mostly enable_pml = 1;
+module_param_named(pml, enable_pml, bool, 0444);
+
+static bool __read_mostly error_on_inconsistent_vmcs_config = true;
+module_param(error_on_inconsistent_vmcs_config, bool, 0444);
+
+static bool __read_mostly dump_invalid_vmcs = 0;
+module_param(dump_invalid_vmcs, bool, 0644);
+
+#define MSR_BITMAP_MODE_X2APIC 1
+#define MSR_BITMAP_MODE_X2APIC_APICV 2
+
+#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL
+
+/* Guest_tsc -> host_tsc conversion requires 64-bit division. */
+static int __read_mostly cpu_preemption_timer_multi;
+static bool __read_mostly enable_preemption_timer = 1;
+#ifdef CONFIG_X86_64
+module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
+#endif
+
+extern bool __read_mostly allow_smaller_maxphyaddr;
+module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
+
+#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
+#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
+#define KVM_VM_CR0_ALWAYS_ON \
+ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
+
+#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
+#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
+#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
+
+#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
+
+#define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
+ RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
+ RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
+ RTIT_STATUS_BYTECNT))
+
+/*
+ * These two parameters are used to configure the controls for Pause-Loop Exiting:
+ * ple_gap: upper bound on the amount of time between two successive
+ * executions of PAUSE in a loop. Also indicates whether PLE is enabled.
+ * According to testing, this time is usually smaller than 128 cycles.
+ * ple_window: upper bound on the amount of time a guest is allowed to execute
+ * in a PAUSE loop. Tests indicate that most spinlocks are held for
+ * less than 2^12 cycles.
+ * Time is measured based on a counter that runs at the same rate as the TSC,
+ * refer to SDM volume 3b, sections 21.6.13 & 22.1.3.
+ */
+static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
+module_param(ple_gap, uint, 0444);
+
+static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
+module_param(ple_window, uint, 0444);
+
+/* Default doubles per-vcpu window every exit. */
+static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
+module_param(ple_window_grow, uint, 0444);
+
+/* Default resets per-vcpu window every exit to ple_window. */
+static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
+module_param(ple_window_shrink, uint, 0444);
+
+/* Default is to compute the maximum so we can never overflow. */
+static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
+module_param(ple_window_max, uint, 0444);
+
+/* Default is SYSTEM mode, 1 for host-guest mode (which is BROKEN) */
+int __read_mostly pt_mode = PT_MODE_SYSTEM;
+#ifdef CONFIG_BROKEN
+module_param(pt_mode, int, S_IRUGO);
+#endif
+
+struct x86_pmu_lbr __ro_after_init vmx_lbr_caps;
+
+#ifdef CONFIG_CPU_MITIGATIONS
+static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
+static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
+static DEFINE_MUTEX(vmx_l1d_flush_mutex);
+
+/* Storage for pre module init parameter parsing */
+static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
+
+static const struct {
+ const char *option;
+ bool for_parse;
+} vmentry_l1d_param[] = {
+ [VMENTER_L1D_FLUSH_AUTO] = {"auto", true},
+ [VMENTER_L1D_FLUSH_NEVER] = {"never", true},
+ [VMENTER_L1D_FLUSH_COND] = {"cond", true},
+ [VMENTER_L1D_FLUSH_ALWAYS] = {"always", true},
+ [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
+ [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
+};
+
+#define L1D_CACHE_ORDER 4
+static void *vmx_l1d_flush_pages;
+
+static int __vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
+{
+ struct page *page;
+ unsigned int i;
+
+ if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
+ return 0;
+ }
+
+ if (!enable_ept) {
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
+ return 0;
+ }
+
+ if (kvm_host.arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
+ return 0;
+ }
+
+ /* If set to auto use the default l1tf mitigation method */
+ if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
+ switch (l1tf_mitigation) {
+ case L1TF_MITIGATION_OFF:
+ l1tf = VMENTER_L1D_FLUSH_NEVER;
+ break;
+ case L1TF_MITIGATION_AUTO:
+ case L1TF_MITIGATION_FLUSH_NOWARN:
+ case L1TF_MITIGATION_FLUSH:
+ case L1TF_MITIGATION_FLUSH_NOSMT:
+ l1tf = VMENTER_L1D_FLUSH_COND;
+ break;
+ case L1TF_MITIGATION_FULL:
+ case L1TF_MITIGATION_FULL_FORCE:
+ l1tf = VMENTER_L1D_FLUSH_ALWAYS;
+ break;
+ }
+ } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
+ l1tf = VMENTER_L1D_FLUSH_ALWAYS;
+ }
+
+ if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
+ !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
+ /*
+ * This allocation for vmx_l1d_flush_pages is not tied to a VM
+ * lifetime and so should not be charged to a memcg.
+ */
+ page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
+ if (!page)
+ return -ENOMEM;
+ vmx_l1d_flush_pages = page_address(page);
+
+ /*
+ * Initialize each page with a different pattern in
+ * order to protect against KSM in the nested
+ * virtualization case.
+ */
+ for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
+ memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
+ PAGE_SIZE);
+ }
+ }
+
+ l1tf_vmx_mitigation = l1tf;
+
+ if (l1tf != VMENTER_L1D_FLUSH_NEVER)
+ static_branch_enable(&vmx_l1d_should_flush);
+ else
+ static_branch_disable(&vmx_l1d_should_flush);
+
+ if (l1tf == VMENTER_L1D_FLUSH_COND)
+ static_branch_enable(&vmx_l1d_flush_cond);
+ else
+ static_branch_disable(&vmx_l1d_flush_cond);
+ return 0;
+}
+
+static int vmx_setup_l1d_flush(void)
+{
+ /*
+ * Hand in the mitigation parameter value that was stored by the pre-
+ * module-init parser. If no parameter was given, it will contain
+ * 'auto', which will be turned into the default 'cond' mitigation mode.
+ */
+ return __vmx_setup_l1d_flush(vmentry_l1d_flush_param);
+}
+
+static void vmx_cleanup_l1d_flush(void)
+{
+ if (vmx_l1d_flush_pages) {
+ free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
+ vmx_l1d_flush_pages = NULL;
+ }
+ /* Restore state so sysfs ignores VMX */
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
+}
+
+static int vmentry_l1d_flush_parse(const char *s)
+{
+ unsigned int i;
+
+ if (s) {
+ for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
+ if (vmentry_l1d_param[i].for_parse &&
+ sysfs_streq(s, vmentry_l1d_param[i].option))
+ return i;
+ }
+ }
+ return -EINVAL;
+}
+
+static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
+{
+ int l1tf, ret;
+
+ l1tf = vmentry_l1d_flush_parse(s);
+ if (l1tf < 0)
+ return l1tf;
+
+ if (!boot_cpu_has(X86_BUG_L1TF))
+ return 0;
+
+ /*
+ * Has vmx_init() run already? If not then this is the pre init
+ * parameter parsing. In that case just store the value and let
+ * vmx_init() do the proper setup after enable_ept has been
+ * established.
+ */
+ if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
+ vmentry_l1d_flush_param = l1tf;
+ return 0;
+ }
+
+ mutex_lock(&vmx_l1d_flush_mutex);
+ ret = __vmx_setup_l1d_flush(l1tf);
+ mutex_unlock(&vmx_l1d_flush_mutex);
+ return ret;
+}
+
+static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
+{
+ if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
+ return sysfs_emit(s, "???\n");
+
+ return sysfs_emit(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
+}
+
+/*
+ * Software based L1D cache flush which is used when microcode providing
+ * the cache control MSR is not loaded.
+ *
+ * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to
+ * flush it is required to read in 64 KiB because the replacement algorithm
+ * is not exactly LRU. This could be sized at runtime via topology
+ * information but as all relevant affected CPUs have 32KiB L1D cache size
+ * there is no point in doing so.
+ */
+static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
+{
+ int size = PAGE_SIZE << L1D_CACHE_ORDER;
+
+ if (!static_branch_unlikely(&vmx_l1d_should_flush))
+ return;
+
+ /*
+ * This code is only executed when the flush mode is 'cond' or
+ * 'always'
+ */
+ if (static_branch_likely(&vmx_l1d_flush_cond)) {
+ /*
+ * Clear the per-cpu flush bit, it gets set again if the vCPU
+ * is reloaded, i.e. if the vCPU is scheduled out or if KVM
+ * exits to userspace, or if KVM reaches one of the unsafe
+ * VMEXIT handlers, e.g. if KVM calls into the emulator,
+ * or from the interrupt handlers.
+ */
+ if (!kvm_get_cpu_l1tf_flush_l1d())
+ return;
+ kvm_clear_cpu_l1tf_flush_l1d();
+ }
+
+ vcpu->stat.l1d_flush++;
+
+ if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
+ native_wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+ return;
+ }
+
+ asm volatile(
+ /* First ensure the pages are in the TLB */
+ "xorl %%eax, %%eax\n"
+ ".Lpopulate_tlb:\n\t"
+ "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
+ "addl $4096, %%eax\n\t"
+ "cmpl %%eax, %[size]\n\t"
+ "jne .Lpopulate_tlb\n\t"
+ "xorl %%eax, %%eax\n\t"
+ "cpuid\n\t"
+ /* Now fill the cache */
+ "xorl %%eax, %%eax\n"
+ ".Lfill_cache:\n"
+ "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
+ "addl $64, %%eax\n\t"
+ "cmpl %%eax, %[size]\n\t"
+ "jne .Lfill_cache\n\t"
+ "lfence\n"
+ :: [flush_pages] "r" (vmx_l1d_flush_pages),
+ [size] "r" (size)
+ : "eax", "ebx", "ecx", "edx");
+}
+
+#else /* CONFIG_CPU_MITIGATIONS */
+static int vmx_setup_l1d_flush(void)
+{
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NEVER;
+ return 0;
+}
+static void vmx_cleanup_l1d_flush(void)
+{
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
+}
+static __always_inline void vmx_l1d_flush(struct kvm_vcpu *vcpu)
+{
+
+}
+static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
+{
+ pr_warn_once("Kernel compiled without mitigations, ignoring vmentry_l1d_flush\n");
+ return 0;
+}
+static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
+{
+ return sysfs_emit(s, "never\n");
+}
+#endif
+
+static const struct kernel_param_ops vmentry_l1d_flush_ops = {
+ .set = vmentry_l1d_flush_set,
+ .get = vmentry_l1d_flush_get,
+};
+module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
+
+static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
+{
+ u64 msr;
+
+ if (!vmx->disable_fb_clear)
+ return;
+
+ msr = native_rdmsrq(MSR_IA32_MCU_OPT_CTRL);
+ msr |= FB_CLEAR_DIS;
+ native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, msr);
+ /* Cache the MSR value to avoid re-reading it later */
+ vmx->msr_ia32_mcu_opt_ctrl = msr;
+}
+
+static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
+{
+ if (!vmx->disable_fb_clear)
+ return;
+
+ vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS;
+ native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
+}
+
+static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
+{
+ /*
+ * Disable VERW's behavior of clearing CPU buffers for the guest if the
+ * CPU isn't affected by MDS/TAA, and the host hasn't forcefully enabled
+ * the mitigation. Disabling the clearing behavior provides a
+ * performance boost for guests that aren't aware that manually clearing
+ * CPU buffers is unnecessary, at the cost of MSR accesses on VM-Entry
+ * and VM-Exit.
+ */
+ vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) &&
+ (kvm_host.arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
+ !boot_cpu_has_bug(X86_BUG_MDS) &&
+ !boot_cpu_has_bug(X86_BUG_TAA);
+
+ /*
+ * If the guest will not execute VERW, there is no need to set FB_CLEAR_DIS
+ * at VM-Entry. Skip the MSR read/write when the guest has no use case to
+ * execute VERW.
+ */
+ if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) ||
+ ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO)))
+ vmx->disable_fb_clear = false;
+}
+
+static u32 vmx_segment_access_rights(struct kvm_segment *var);
+
+void vmx_vmexit(void);
+
+#define vmx_insn_failed(fmt...) \
+do { \
+ WARN_ONCE(1, fmt); \
+ pr_warn_ratelimited(fmt); \
+} while (0)
+
+noinline void vmread_error(unsigned long field)
+{
+ vmx_insn_failed("vmread failed: field=%lx\n", field);
+}
+
+#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+noinstr void vmread_error_trampoline2(unsigned long field, bool fault)
+{
+ if (fault) {
+ kvm_spurious_fault();
+ } else {
+ instrumentation_begin();
+ vmread_error(field);
+ instrumentation_end();
+ }
+}
+#endif
+
+noinline void vmwrite_error(unsigned long field, unsigned long value)
+{
+ vmx_insn_failed("vmwrite failed: field=%lx val=%lx err=%u\n",
+ field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
+}
+
+noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
+{
+ vmx_insn_failed("vmclear failed: %p/%llx err=%u\n",
+ vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
+}
+
+noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
+{
+ vmx_insn_failed("vmptrld failed: %p/%llx err=%u\n",
+ vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
+}
+
+noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
+{
+ vmx_insn_failed("invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
+ ext, vpid, gva);
+}
+
+noinline void invept_error(unsigned long ext, u64 eptp)
+{
+ vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx\n", ext, eptp);
+}
+
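+/* Per-CPU VMXON region, and the VMCS currently active (VMPTRLDed) on each CPU. */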
+static DEFINE_PER_CPU(struct vmcs *, vmxarea);
+DEFINE_PER_CPU(struct vmcs *, current_vmcs);
+/*
+ * We maintain a per-CPU linked-list of VMCSs loaded on that CPU. This is needed
+ * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
+ */
+static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
+
+static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
+static DEFINE_SPINLOCK(vmx_vpid_lock);
+
+struct vmcs_config vmcs_config __ro_after_init;
+struct vmx_capability vmx_capability __ro_after_init;
+
+#define VMX_SEGMENT_FIELD(seg) \
+ [VCPU_SREG_##seg] = { \
+ .selector = GUEST_##seg##_SELECTOR, \
+ .base = GUEST_##seg##_BASE, \
+ .limit = GUEST_##seg##_LIMIT, \
+ .ar_bytes = GUEST_##seg##_AR_BYTES, \
+ }
+
+static const struct kvm_vmx_segment_field {
+ unsigned selector;
+ unsigned base;
+ unsigned limit;
+ unsigned ar_bytes;
+} kvm_vmx_segment_fields[] = {
+ VMX_SEGMENT_FIELD(CS),
+ VMX_SEGMENT_FIELD(DS),
+ VMX_SEGMENT_FIELD(ES),
+ VMX_SEGMENT_FIELD(FS),
+ VMX_SEGMENT_FIELD(GS),
+ VMX_SEGMENT_FIELD(SS),
+ VMX_SEGMENT_FIELD(TR),
+ VMX_SEGMENT_FIELD(LDTR),
+};
+
+
+static unsigned long host_idt_base;
+
+#if IS_ENABLED(CONFIG_HYPERV)
+static bool __read_mostly enlightened_vmcs = true;
+module_param(enlightened_vmcs, bool, 0444);
+
+static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu)
+{
+ struct hv_enlightened_vmcs *evmcs;
+ hpa_t partition_assist_page = hv_get_partition_assist_page(vcpu);
+
+ if (partition_assist_page == INVALID_PAGE)
+ return -ENOMEM;
+
+ evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs;
+
+ evmcs->partition_assist_page = partition_assist_page;
+ evmcs->hv_vm_id = (unsigned long)vcpu->kvm;
+ evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;
+
+ return 0;
+}
+
+static __init void hv_init_evmcs(void)
+{
+ int cpu;
+
+ if (!enlightened_vmcs)
+ return;
+
+ /*
+ * Enlightened VMCS usage must be recommended by the hypervisor, and the
+ * host needs to support eVMCS v1 or above.
+ */
+ if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
+ (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
+ KVM_EVMCS_VERSION) {
+
+ /* Check that we have assist pages on all online CPUs */
+ for_each_online_cpu(cpu) {
+ if (!hv_get_vp_assist_page(cpu)) {
+ enlightened_vmcs = false;
+ break;
+ }
+ }
+
+ if (enlightened_vmcs) {
+ pr_info("Using Hyper-V Enlightened VMCS\n");
+ static_branch_enable(&__kvm_is_using_evmcs);
+ }
+
+ if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
+ vt_x86_ops.enable_l2_tlb_flush
+ = hv_enable_l2_tlb_flush;
+ } else {
+ enlightened_vmcs = false;
+ }
+}
+
+static void hv_reset_evmcs(void)
+{
+ struct hv_vp_assist_page *vp_ap;
+
+ if (!kvm_is_using_evmcs())
+ return;
+
+ /*
+ * KVM should enable eVMCS if and only if all CPUs have a VP assist
+ * page, and should reject CPU onlining if eVMCS is enabled but the CPU
+ * doesn't have a VP assist page allocated.
+ */
+ vp_ap = hv_get_vp_assist_page(smp_processor_id());
+ if (WARN_ON_ONCE(!vp_ap))
+ return;
+
+ /*
+ * Reset everything to support using non-enlightened VMCS access later
+ * (e.g. when we reload the module with enlightened_vmcs=0)
+ */
+ vp_ap->nested_control.features.directhypercall = 0;
+ vp_ap->current_nested_vmcs = 0;
+ vp_ap->enlighten_vmentry = 0;
+}
+
+#else /* IS_ENABLED(CONFIG_HYPERV) */
+static void hv_init_evmcs(void) {}
+static void hv_reset_evmcs(void) {}
+#endif /* IS_ENABLED(CONFIG_HYPERV) */
+
+/*
+ * Comment format: document - errata name - stepping - processor name.
+ * Taken from
+ * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
+ */
+static u32 vmx_preemption_cpu_tfms[] = {
+/* 323344.pdf - BA86 - D0 - Xeon 7500 Series */
+0x000206E6,
+/* 323056.pdf - AAX65 - C2 - Xeon L3406 */
+/* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
+/* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
+0x00020652,
+/* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
+0x00020655,
+/* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */
+/* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */
+/*
+ * 320767.pdf - AAP86 - B1 -
+ * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
+ */
+0x000106E5,
+/* 321333.pdf - AAM126 - C0 - Xeon 3500 */
+0x000106A0,
+/* 321333.pdf - AAM126 - C1 - Xeon 3500 */
+0x000106A1,
+/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
+0x000106A4,
+ /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
+ /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
+ /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
+0x000106A5,
+ /* Xeon E3-1220 V2 */
+0x000306A8,
+};
+
+static inline bool cpu_has_broken_vmx_preemption_timer(void)
+{
+ u32 eax = cpuid_eax(0x00000001), i;
+
+ /* Clear the reserved bits (15:14 and 31:28), leaving the family/model/stepping signature */
+ eax &= ~(0x3U << 14 | 0xfU << 28);
+ for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
+ if (eax == vmx_preemption_cpu_tfms[i])
+ return true;
+
+ return false;
+}
+
+static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
+{
+ return flexpriority_enabled && lapic_in_kernel(vcpu);
+}
+
+struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
+{
+ int i;
+
+ i = kvm_find_user_return_msr(msr);
+ if (i >= 0)
+ return &vmx->guest_uret_msrs[i];
+ return NULL;
+}
+
+static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
+ struct vmx_uret_msr *msr, u64 data)
+{
+ unsigned int slot = msr - vmx->guest_uret_msrs;
+ int ret = 0;
+
+ if (msr->load_into_hardware) {
+ preempt_disable();
+ ret = kvm_set_user_return_msr(slot, data, msr->mask);
+ preempt_enable();
+ }
+ if (!ret)
+ msr->data = data;
+ return ret;
+}
+
+/*
+ * Disable VMX and clear CR4.VMXE (even if VMXOFF faults)
+ *
+ * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to
+ * atomically track post-VMXON state, e.g. this may be called in NMI context.
+ * Eat all faults, as all other VMXOFF faults are mode related, i.e.
+ * faults are guaranteed to be due to the !post-VMXON check unless the CPU is
+ * magically in RM, VM86, compat mode, or at CPL>0.
+ */
+static int kvm_cpu_vmxoff(void)
+{
+ asm goto("1: vmxoff\n\t"
+ _ASM_EXTABLE(1b, %l[fault])
+ ::: "cc", "memory" : fault);
+
+ cr4_clear_bits(X86_CR4_VMXE);
+ return 0;
+
+fault:
+ cr4_clear_bits(X86_CR4_VMXE);
+ return -EIO;
+}
+
+void vmx_emergency_disable_virtualization_cpu(void)
+{
+ int cpu = raw_smp_processor_id();
+ struct loaded_vmcs *v;
+
+ kvm_rebooting = true;
+
+ /*
+ * Note, CR4.VMXE can be _cleared_ in NMI context, but it can only be
+ * set in task context. If this races with VMX being disabled by an NMI,
+ * VMCLEAR and VMXOFF may #UD, but KVM will eat those faults because
+ * kvm_rebooting is set.
+ */
+ if (!(__read_cr4() & X86_CR4_VMXE))
+ return;
+
+ list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
+ loaded_vmcss_on_cpu_link) {
+ vmcs_clear(v->vmcs);
+ if (v->shadow_vmcs)
+ vmcs_clear(v->shadow_vmcs);
+ }
+
+ kvm_cpu_vmxoff();
+}
+
+static void __loaded_vmcs_clear(void *arg)
+{
+ struct loaded_vmcs *loaded_vmcs = arg;
+ int cpu = raw_smp_processor_id();
+
+ if (loaded_vmcs->cpu != cpu)
+ return; /* vcpu migration can race with cpu offline */
+ if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
+ per_cpu(current_vmcs, cpu) = NULL;
+
+ vmcs_clear(loaded_vmcs->vmcs);
+ if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
+ vmcs_clear(loaded_vmcs->shadow_vmcs);
+
+ list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
+
+ /*
+ * Ensure all writes to loaded_vmcs, including deleting it from its
+ * current percpu list, complete before setting loaded_vmcs->cpu to
+ * -1, otherwise a different cpu can see loaded_vmcs->cpu == -1 first
+ * and add loaded_vmcs to its percpu list before it's deleted from this
+ * cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
+ */
+ smp_wmb();
+
+ loaded_vmcs->cpu = -1;
+ loaded_vmcs->launched = 0;
+}
+
+static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
+{
+ int cpu = loaded_vmcs->cpu;
+
+ if (cpu != -1)
+ smp_call_function_single(cpu,
+ __loaded_vmcs_clear, loaded_vmcs, 1);
+}
+
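+/*
+ * Test whether @field of segment @seg is already cached and mark it as cached
+ * either way; callers only (re)read the field from the VMCS on a miss.
+ */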
+static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
+ unsigned field)
+{
+ bool ret;
+ u32 mask = 1 << (seg * SEG_FIELD_NR + field);
+
+ if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) {
+ kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS);
+ vmx->segment_cache.bitmask = 0;
+ }
+ ret = vmx->segment_cache.bitmask & mask;
+ vmx->segment_cache.bitmask |= mask;
+ return ret;
+}
+
+static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
+{
+ u16 *p = &vmx->segment_cache.seg[seg].selector;
+
+ if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
+ *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
+ return *p;
+}
+
+static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
+{
+ ulong *p = &vmx->segment_cache.seg[seg].base;
+
+ if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
+ *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
+ return *p;
+}
+
+static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
+{
+ u32 *p = &vmx->segment_cache.seg[seg].limit;
+
+ if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
+ *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
+ return *p;
+}
+
+static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
+{
+ u32 *p = &vmx->segment_cache.seg[seg].ar;
+
+ if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
+ *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
+ return *p;
+}
+
+void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
+{
+ u32 eb;
+
+ eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
+ (1u << DB_VECTOR) | (1u << AC_VECTOR);
+ /*
+ * #VE isn't used for VMX. To test against unexpected changes
+ * related to #VE for VMX, intercept unexpected #VE and warn on it.
+ */
+ if (IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE))
+ eb |= 1u << VE_VECTOR;
+ /*
+ * Guest access to VMware backdoor ports could legitimately
+ * trigger #GP because of TSS I/O permission bitmap.
+ * We intercept those #GP and allow access to them anyway
+ * as VMware does.
+ */
+ if (enable_vmware_backdoor)
+ eb |= (1u << GP_VECTOR);
+ if ((vcpu->guest_debug &
+ (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
+ (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
+ eb |= 1u << BP_VECTOR;
+ if (to_vmx(vcpu)->rmode.vm86_active)
+ eb = ~0;
+ if (!vmx_need_pf_intercept(vcpu))
+ eb &= ~(1u << PF_VECTOR);
+
+ /* When we are running a nested L2 guest and L1 specified for it a
+ * certain exception bitmap, we must trap the same exceptions and pass
+ * them to L1. When running L2, we will only handle the exceptions
+ * specified above if L1 did not want them.
+ */
+ if (is_guest_mode(vcpu))
+ eb |= get_vmcs12(vcpu)->exception_bitmap;
+ else {
+ int mask = 0, match = 0;
+
+ if (enable_ept && (eb & (1u << PF_VECTOR))) {
+ /*
+ * If EPT is enabled, #PF is currently only intercepted
+ * if MAXPHYADDR is smaller on the guest than on the
+ * host. In that case we only care about present,
+ * non-reserved faults. For vmcs02, however, PFEC_MASK
+ * and PFEC_MATCH are set in prepare_vmcs02_rare.
+ */
+ mask = PFERR_PRESENT_MASK | PFERR_RSVD_MASK;
+ match = PFERR_PRESENT_MASK;
+ }
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match);
+ }
+
+ /*
+ * Disabling xfd interception indicates that dynamic xfeatures
+ * might be used in the guest. Always trap #NM in this case
+ * to save guest xfd_err timely.
+ */
+ if (vcpu->arch.xfd_no_write_intercept)
+ eb |= (1u << NM_VECTOR);
+
+ vmcs_write32(EXCEPTION_BITMAP, eb);
+}
+
+/*
+ * Check if MSR is intercepted for currently loaded MSR bitmap.
+ */
+static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr)
+{
+ if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS))
+ return true;
+
+ return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, msr);
+}
+
+unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx)
+{
+ unsigned int flags = 0;
+
+ if (vmx->loaded_vmcs->launched)
+ flags |= VMX_RUN_VMRESUME;
+
+ /*
+ * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free
+ * to change it directly without causing a vmexit. In that case read
+ * it after vmexit and store it in vmx->spec_ctrl.
+ */
+ if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))
+ flags |= VMX_RUN_SAVE_SPEC_CTRL;
+
+ if (cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO) &&
+ kvm_vcpu_can_access_host_mmio(&vmx->vcpu))
+ flags |= VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO;
+
+ return flags;
+}
+
+static __always_inline void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
+ unsigned long entry, unsigned long exit)
+{
+ vm_entry_controls_clearbit(vmx, entry);
+ vm_exit_controls_clearbit(vmx, exit);
+}
+
+int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr)
+{
+ unsigned int i;
+
+ for (i = 0; i < m->nr; ++i) {
+ if (m->val[i].index == msr)
+ return i;
+ }
+ return -ENOENT;
+}
+
+static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
+{
+ int i;
+ struct msr_autoload *m = &vmx->msr_autoload;
+
+ switch (msr) {
+ case MSR_EFER:
+ if (cpu_has_load_ia32_efer()) {
+ clear_atomic_switch_msr_special(vmx,
+ VM_ENTRY_LOAD_IA32_EFER,
+ VM_EXIT_LOAD_IA32_EFER);
+ return;
+ }
+ break;
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ if (cpu_has_load_perf_global_ctrl()) {
+ clear_atomic_switch_msr_special(vmx,
+ VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
+ VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
+ return;
+ }
+ break;
+ }
+ i = vmx_find_loadstore_msr_slot(&m->guest, msr);
+ if (i < 0)
+ goto skip_guest;
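+ /* Remove the entry by overwriting it with the last entry in the list. */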
+ --m->guest.nr;
+ m->guest.val[i] = m->guest.val[m->guest.nr];
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
+
+skip_guest:
+ i = vmx_find_loadstore_msr_slot(&m->host, msr);
+ if (i < 0)
+ return;
+
+ --m->host.nr;
+ m->host.val[i] = m->host.val[m->host.nr];
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
+}
+
+static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
+ unsigned long entry, unsigned long exit,
+ unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
+ u64 guest_val, u64 host_val)
+{
+ vmcs_write64(guest_val_vmcs, guest_val);
+ if (host_val_vmcs != HOST_IA32_EFER)
+ vmcs_write64(host_val_vmcs, host_val);
+ vm_entry_controls_setbit(vmx, entry);
+ vm_exit_controls_setbit(vmx, exit);
+}
+
+static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
+ u64 guest_val, u64 host_val, bool entry_only)
+{
+ int i, j = 0;
+ struct msr_autoload *m = &vmx->msr_autoload;
+
+ switch (msr) {
+ case MSR_EFER:
+ if (cpu_has_load_ia32_efer()) {
+ add_atomic_switch_msr_special(vmx,
+ VM_ENTRY_LOAD_IA32_EFER,
+ VM_EXIT_LOAD_IA32_EFER,
+ GUEST_IA32_EFER,
+ HOST_IA32_EFER,
+ guest_val, host_val);
+ return;
+ }
+ break;
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ if (cpu_has_load_perf_global_ctrl()) {
+ add_atomic_switch_msr_special(vmx,
+ VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
+ VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
+ GUEST_IA32_PERF_GLOBAL_CTRL,
+ HOST_IA32_PERF_GLOBAL_CTRL,
+ guest_val, host_val);
+ return;
+ }
+ break;
+ case MSR_IA32_PEBS_ENABLE:
+ /* PEBS needs a quiescent period after being disabled (to write
+ * a record). Disabling PEBS through VMX MSR swapping doesn't
+ * provide that period, so a CPU could write host's record into
+ * guest's memory.
+ */
+ wrmsrq(MSR_IA32_PEBS_ENABLE, 0);
+ }
+
+ i = vmx_find_loadstore_msr_slot(&m->guest, msr);
+ if (!entry_only)
+ j = vmx_find_loadstore_msr_slot(&m->host, msr);
+
+ if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) ||
+ (j < 0 && m->host.nr == MAX_NR_LOADSTORE_MSRS)) {
+ printk_once(KERN_WARNING "Not enough msr switch entries. "
+ "Can't add msr %x\n", msr);
+ return;
+ }
+ if (i < 0) {
+ i = m->guest.nr++;
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
+ }
+ m->guest.val[i].index = msr;
+ m->guest.val[i].value = guest_val;
+
+ if (entry_only)
+ return;
+
+ if (j < 0) {
+ j = m->host.nr++;
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
+ }
+ m->host.val[j].index = msr;
+ m->host.val[j].value = host_val;
+}
+
+static bool update_transition_efer(struct vcpu_vmx *vmx)
+{
+ u64 guest_efer = vmx->vcpu.arch.efer;
+ u64 ignore_bits = 0;
+ int i;
+
+ /* Shadow paging assumes NX to be available. */
+ if (!enable_ept)
+ guest_efer |= EFER_NX;
+
+ /*
+ * LMA and LME handled by hardware; SCE meaningless outside long mode.
+ */
+ ignore_bits |= EFER_SCE;
+#ifdef CONFIG_X86_64
+ ignore_bits |= EFER_LMA | EFER_LME;
+ /* SCE is meaningful only in long mode on Intel */
+ if (guest_efer & EFER_LMA)
+ ignore_bits &= ~(u64)EFER_SCE;
+#endif
+
+ /*
+ * On EPT, we can't emulate NX, so we must switch EFER atomically.
+ * On CPUs that support "load IA32_EFER", always switch EFER
+ * atomically, since it's faster than switching it manually.
+ */
+ if (cpu_has_load_ia32_efer() ||
+ (enable_ept && ((vmx->vcpu.arch.efer ^ kvm_host.efer) & EFER_NX))) {
+ if (!(guest_efer & EFER_LMA))
+ guest_efer &= ~EFER_LME;
+ if (guest_efer != kvm_host.efer)
+ add_atomic_switch_msr(vmx, MSR_EFER,
+ guest_efer, kvm_host.efer, false);
+ else
+ clear_atomic_switch_msr(vmx, MSR_EFER);
+ return false;
+ }
+
+ i = kvm_find_user_return_msr(MSR_EFER);
+ if (i < 0)
+ return false;
+
+ clear_atomic_switch_msr(vmx, MSR_EFER);
+
+ guest_efer &= ~ignore_bits;
+ guest_efer |= kvm_host.efer & ignore_bits;
+
+ vmx->guest_uret_msrs[i].data = guest_efer;
+ vmx->guest_uret_msrs[i].mask = ~ignore_bits;
+
+ return true;
+}
+
+#ifdef CONFIG_X86_32
+/*
+ * On 32-bit kernels, VM exits still load the FS and GS bases from the
+ * VMCS rather than the segment table. KVM uses this helper to figure
+ * out the current bases to poke them into the VMCS before entry.
+ */
+static unsigned long segment_base(u16 selector)
+{
+ struct desc_struct *table;
+ unsigned long v;
+
+ if (!(selector & ~SEGMENT_RPL_MASK))
+ return 0;
+
+ table = get_current_gdt_ro();
+
+ if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
+ u16 ldt_selector = kvm_read_ldt();
+
+ if (!(ldt_selector & ~SEGMENT_RPL_MASK))
+ return 0;
+
+ table = (struct desc_struct *)segment_base(ldt_selector);
+ }
+ v = get_desc_base(&table[selector >> 3]);
+ return v;
+}
+#endif
+
+static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
+{
+ return vmx_pt_mode_is_host_guest() &&
+ !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
+}
+
+static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base)
+{
+ /* The base must be 128-byte aligned and a legal physical address. */
+ return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128);
+}
+
+static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
+{
+ u32 i;
+
+ wrmsrq(MSR_IA32_RTIT_STATUS, ctx->status);
+ wrmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
+ wrmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
+ wrmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
+ for (i = 0; i < addr_range; i++) {
+ wrmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
+ wrmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
+ }
+}
+
+static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
+{
+ u32 i;
+
+ rdmsrq(MSR_IA32_RTIT_STATUS, ctx->status);
+ rdmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
+ rdmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
+ rdmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
+ for (i = 0; i < addr_range; i++) {
+ rdmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
+ rdmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
+ }
+}
+
+static void pt_guest_enter(struct vcpu_vmx *vmx)
+{
+ if (vmx_pt_mode_is_system())
+ return;
+
+ /*
+ * GUEST_IA32_RTIT_CTL is already set in the VMCS.
+ * Save host state before VM entry.
+ */
+ rdmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
+ if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
+ wrmsrq(MSR_IA32_RTIT_CTL, 0);
+ pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
+ pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
+ }
+}
+
+static void pt_guest_exit(struct vcpu_vmx *vmx)
+{
+ if (vmx_pt_mode_is_system())
+ return;
+
+ if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
+ pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
+ pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
+ }
+
+ /*
+ * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest,
+ * i.e. RTIT_CTL is always cleared on VM-Exit. Restore it if necessary.
+ */
+ if (vmx->pt_desc.host.ctl)
+ wrmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
+}
+
+void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
+ unsigned long fs_base, unsigned long gs_base)
+{
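+	/*
+	 * VMX requires host FS/GS selectors to have RPL == 0 and TI == 0;
+	 * if the current selector has any of those bits set, write 0 to the
+	 * VMCS and rely on vmx_prepare_switch_to_host() to reload it.
+	 */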
+ if (unlikely(fs_sel != host->fs_sel)) {
+ if (!(fs_sel & 7))
+ vmcs_write16(HOST_FS_SELECTOR, fs_sel);
+ else
+ vmcs_write16(HOST_FS_SELECTOR, 0);
+ host->fs_sel = fs_sel;
+ }
+ if (unlikely(gs_sel != host->gs_sel)) {
+ if (!(gs_sel & 7))
+ vmcs_write16(HOST_GS_SELECTOR, gs_sel);
+ else
+ vmcs_write16(HOST_GS_SELECTOR, 0);
+ host->gs_sel = gs_sel;
+ }
+ if (unlikely(fs_base != host->fs_base)) {
+ vmcs_writel(HOST_FS_BASE, fs_base);
+ host->fs_base = fs_base;
+ }
+ if (unlikely(gs_base != host->gs_base)) {
+ vmcs_writel(HOST_GS_BASE, gs_base);
+ host->gs_base = gs_base;
+ }
+}
+
+void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vcpu_vt *vt = to_vt(vcpu);
+ struct vmcs_host_state *host_state;
+#ifdef CONFIG_X86_64
+ int cpu = raw_smp_processor_id();
+#endif
+ unsigned long fs_base, gs_base;
+ u16 fs_sel, gs_sel;
+ int i;
+
+ /*
+ * Note that guest MSRs to be saved/restored can also be changed
+	 * when guest state is loaded. This happens when the guest transitions
+	 * to/from long mode by setting MSR_EFER.LMA.
+ */
+ if (!vmx->guest_uret_msrs_loaded) {
+ vmx->guest_uret_msrs_loaded = true;
+ for (i = 0; i < kvm_nr_uret_msrs; ++i) {
+ if (!vmx->guest_uret_msrs[i].load_into_hardware)
+ continue;
+
+ kvm_set_user_return_msr(i,
+ vmx->guest_uret_msrs[i].data,
+ vmx->guest_uret_msrs[i].mask);
+ }
+ }
+
+ if (vmx->nested.need_vmcs12_to_shadow_sync)
+ nested_sync_vmcs12_to_shadow(vcpu);
+
+ if (vt->guest_state_loaded)
+ return;
+
+ host_state = &vmx->loaded_vmcs->host_state;
+
+ /*
+ * Set host fs and gs selectors. Unfortunately, 22.2.3 does not
+ * allow segment selectors with cpl > 0 or ti == 1.
+ */
+ host_state->ldt_sel = kvm_read_ldt();
+
+#ifdef CONFIG_X86_64
+ savesegment(ds, host_state->ds_sel);
+ savesegment(es, host_state->es_sel);
+
+ gs_base = cpu_kernelmode_gs_base(cpu);
+ if (likely(is_64bit_mm(current->mm))) {
+ current_save_fsgs();
+ fs_sel = current->thread.fsindex;
+ gs_sel = current->thread.gsindex;
+ fs_base = current->thread.fsbase;
+ vt->msr_host_kernel_gs_base = current->thread.gsbase;
+ } else {
+ savesegment(fs, fs_sel);
+ savesegment(gs, gs_sel);
+ fs_base = read_msr(MSR_FS_BASE);
+ vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
+ }
+
+ wrmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+#else
+ savesegment(fs, fs_sel);
+ savesegment(gs, gs_sel);
+ fs_base = segment_base(fs_sel);
+ gs_base = segment_base(gs_sel);
+#endif
+
+ vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
+ vt->guest_state_loaded = true;
+}
+
+static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
+{
+ struct vmcs_host_state *host_state;
+
+ if (!vmx->vt.guest_state_loaded)
+ return;
+
+ host_state = &vmx->loaded_vmcs->host_state;
+
+ ++vmx->vcpu.stat.host_state_reload;
+
+#ifdef CONFIG_X86_64
+ rdmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+#endif
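+	/*
+	 * VM-Exit leaves LDTR null and loads the GS selector KVM put in the
+	 * VMCS (0 if the saved selector had RPL/TI bits set), so reload the
+	 * LDT and GS manually in those cases.
+	 */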
+ if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
+ kvm_load_ldt(host_state->ldt_sel);
+#ifdef CONFIG_X86_64
+ load_gs_index(host_state->gs_sel);
+#else
+ loadsegment(gs, host_state->gs_sel);
+#endif
+ }
+ if (host_state->fs_sel & 7)
+ loadsegment(fs, host_state->fs_sel);
+#ifdef CONFIG_X86_64
+ if (unlikely(host_state->ds_sel | host_state->es_sel)) {
+ loadsegment(ds, host_state->ds_sel);
+ loadsegment(es, host_state->es_sel);
+ }
+#endif
+ invalidate_tss_limit();
+#ifdef CONFIG_X86_64
+ wrmsrq(MSR_KERNEL_GS_BASE, vmx->vt.msr_host_kernel_gs_base);
+#endif
+ load_fixmap_gdt(raw_smp_processor_id());
+ vmx->vt.guest_state_loaded = false;
+ vmx->guest_uret_msrs_loaded = false;
+}
+
+#ifdef CONFIG_X86_64
+static u64 vmx_read_guest_host_msr(struct vcpu_vmx *vmx, u32 msr, u64 *cache)
+{
+ preempt_disable();
+ if (vmx->vt.guest_state_loaded)
+ *cache = read_msr(msr);
+ preempt_enable();
+ return *cache;
+}
+
+static void vmx_write_guest_host_msr(struct vcpu_vmx *vmx, u32 msr, u64 data,
+ u64 *cache)
+{
+ preempt_disable();
+ if (vmx->vt.guest_state_loaded)
+ wrmsrns(msr, data);
+ preempt_enable();
+ *cache = data;
+}
+
+static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
+{
+ return vmx_read_guest_host_msr(vmx, MSR_KERNEL_GS_BASE,
+ &vmx->msr_guest_kernel_gs_base);
+}
+
+static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
+{
+ vmx_write_guest_host_msr(vmx, MSR_KERNEL_GS_BASE, data,
+ &vmx->msr_guest_kernel_gs_base);
+}
+#endif
+
+static void grow_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned int old = vmx->ple_window;
+
+ vmx->ple_window = __grow_ple_window(old, ple_window,
+ ple_window_grow,
+ ple_window_max);
+
+ if (vmx->ple_window != old) {
+ vmx->ple_window_dirty = true;
+ trace_kvm_ple_window_update(vcpu->vcpu_id,
+ vmx->ple_window, old);
+ }
+}
+
+static void shrink_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned int old = vmx->ple_window;
+
+ vmx->ple_window = __shrink_ple_window(old, ple_window,
+ ple_window_shrink,
+ ple_window);
+
+ if (vmx->ple_window != old) {
+ vmx->ple_window_dirty = true;
+ trace_kvm_ple_window_update(vcpu->vcpu_id,
+ vmx->ple_window, old);
+ }
+}
+
+void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
+ struct vmcs *prev;
+
+ if (!already_loaded) {
+ loaded_vmcs_clear(vmx->loaded_vmcs);
+ local_irq_disable();
+
+ /*
+ * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
+ * this cpu's percpu list, otherwise it may not yet be deleted
+ * from its previous cpu's percpu list. Pairs with the
+	 * smp_wmb() in __loaded_vmcs_clear().
+ */
+ smp_rmb();
+
+ list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
+ &per_cpu(loaded_vmcss_on_cpu, cpu));
+ local_irq_enable();
+ }
+
+ prev = per_cpu(current_vmcs, cpu);
+ if (prev != vmx->loaded_vmcs->vmcs) {
+ per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
+ vmcs_load(vmx->loaded_vmcs->vmcs);
+ }
+
+ if (!already_loaded) {
+ void *gdt = get_current_gdt_ro();
+
+ /*
+ * Flush all EPTP/VPID contexts, the new pCPU may have stale
+ * TLB entries from its previous association with the vCPU.
+ */
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+
+ /*
+ * Linux uses per-cpu TSS and GDT, so set these when switching
+ * processors. See 22.2.4.
+ */
+ vmcs_writel(HOST_TR_BASE,
+ (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
+ vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
+
+ if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) {
+ /* 22.2.3 */
+ vmcs_writel(HOST_IA32_SYSENTER_ESP,
+ (unsigned long)(cpu_entry_stack(cpu) + 1));
+ }
+
+ vmx->loaded_vmcs->cpu = cpu;
+ }
+}
+
+/*
+ * Switches to the specified vcpu, until a matching vcpu_put(); assumes the
+ * vcpu mutex is already taken.
+ */
+void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
+ shrink_ple_window(vcpu);
+
+ vmx_vcpu_load_vmcs(vcpu, cpu);
+
+ vmx_vcpu_pi_load(vcpu, cpu);
+}
+
+void vmx_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ vmx_vcpu_pi_put(vcpu);
+
+ vmx_prepare_switch_to_host(to_vmx(vcpu));
+}
+
+bool vmx_emulation_required(struct kvm_vcpu *vcpu)
+{
+ return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu);
+}
+
+unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long rflags, save_rflags;
+
+ if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) {
+ kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
+ rflags = vmcs_readl(GUEST_RFLAGS);
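+		/*
+		 * In VM86 real-mode emulation the hardware RFLAGS has IOPL
+		 * and VM forced; the guest-visible values of those bits are
+		 * kept in rmode.save_rflags.
+		 */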
+ if (vmx->rmode.vm86_active) {
+ rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
+ save_rflags = vmx->rmode.save_rflags;
+ rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
+ }
+ vmx->rflags = rflags;
+ }
+ return vmx->rflags;
+}
+
+void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long old_rflags;
+
+ /*
+ * Unlike CR0 and CR4, RFLAGS handling requires checking if the vCPU
+ * is an unrestricted guest in order to mark L2 as needing emulation
+ * if L1 runs L2 as a restricted guest.
+ */
+ if (is_unrestricted_guest(vcpu)) {
+ kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
+ vmx->rflags = rflags;
+ vmcs_writel(GUEST_RFLAGS, rflags);
+ return;
+ }
+
+ old_rflags = vmx_get_rflags(vcpu);
+ vmx->rflags = rflags;
+ if (vmx->rmode.vm86_active) {
+ vmx->rmode.save_rflags = rflags;
+ rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
+ }
+ vmcs_writel(GUEST_RFLAGS, rflags);
+
+ if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM)
+ vmx->vt.emulation_required = vmx_emulation_required(vcpu);
+}
+
+bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
+{
+ return vmx_get_rflags(vcpu) & X86_EFLAGS_IF;
+}
+
+u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
+{
+ u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+ int ret = 0;
+
+ if (interruptibility & GUEST_INTR_STATE_STI)
+ ret |= KVM_X86_SHADOW_INT_STI;
+ if (interruptibility & GUEST_INTR_STATE_MOV_SS)
+ ret |= KVM_X86_SHADOW_INT_MOV_SS;
+
+ return ret;
+}
+
+void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+{
+ u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+ u32 interruptibility = interruptibility_old;
+
+ interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
+
+ if (mask & KVM_X86_SHADOW_INT_MOV_SS)
+ interruptibility |= GUEST_INTR_STATE_MOV_SS;
+ else if (mask & KVM_X86_SHADOW_INT_STI)
+ interruptibility |= GUEST_INTR_STATE_STI;
+
+	if (interruptibility != interruptibility_old)
+ vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
+}
+
+static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long value;
+
+ /*
+ * Any MSR write that attempts to change bits marked reserved will
+	 * cause a #GP fault.
+ */
+ if (data & vmx->pt_desc.ctl_bitmask)
+ return 1;
+
+ /*
+ * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will
+ * result in a #GP unless the same write also clears TraceEn.
+ */
+ if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
+ (data & RTIT_CTL_TRACEEN) &&
+ data != vmx->pt_desc.guest.ctl)
+ return 1;
+
+ /*
+	 * A WRMSR to IA32_RTIT_CTL that sets TraceEn but clears ToPA
+	 * and FabricEn causes a #GP if
+ * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0
+ */
+ if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&
+ !(data & RTIT_CTL_FABRIC_EN) &&
+ !intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_single_range_output))
+ return 1;
+
+ /*
+	 * Check the MTCFreq, CycThresh and PSBFreq encodings: any MSR write
+	 * that uses an encoding marked reserved will cause a #GP fault.
+ */
+ value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods);
+ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) &&
+ !test_bit((data & RTIT_CTL_MTC_RANGE) >>
+ RTIT_CTL_MTC_RANGE_OFFSET, &value))
+ return 1;
+ value = intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_cycle_thresholds);
+ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
+ !test_bit((data & RTIT_CTL_CYC_THRESH) >>
+ RTIT_CTL_CYC_THRESH_OFFSET, &value))
+ return 1;
+ value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods);
+ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
+ !test_bit((data & RTIT_CTL_PSB_FREQ) >>
+ RTIT_CTL_PSB_FREQ_OFFSET, &value))
+ return 1;
+
+ /*
+	 * An ADDRx_CFG encoding that is reserved (i.e. greater than 2) or
+	 * that targets an unsupported address range causes a #GP fault.
+ */
+ value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
+ if ((value && (vmx->pt_desc.num_address_ranges < 1)) || (value > 2))
+ return 1;
+ value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
+ if ((value && (vmx->pt_desc.num_address_ranges < 2)) || (value > 2))
+ return 1;
+ value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
+ if ((value && (vmx->pt_desc.num_address_ranges < 3)) || (value > 2))
+ return 1;
+ value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
+ if ((value && (vmx->pt_desc.num_address_ranges < 4)) || (value > 2))
+ return 1;
+
+ return 0;
+}
+
+int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len)
+{
+ /*
+ * Emulation of instructions in SGX enclaves is impossible as RIP does
+ * not point at the failing instruction, and even if it did, the code
+ * stream is inaccessible. Inject #UD instead of exiting to userspace
+ * so that guest userspace can't DoS the guest simply by triggering
+ * emulation (enclaves are CPL3 only).
+ */
+ if (vmx_get_exit_reason(vcpu).enclave_mode) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+
+ /* Check that emulation is possible during event vectoring */
+ if ((to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
+ !kvm_can_emulate_event_vectoring(emul_type))
+ return X86EMUL_UNHANDLEABLE_VECTORING;
+
+ return X86EMUL_CONTINUE;
+}
+
+static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
+{
+ union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);
+ unsigned long rip, orig_rip;
+ u32 instr_len;
+
+ /*
+ * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
+ * undefined behavior: Intel's SDM doesn't mandate the VMCS field be
+ * set when EPT misconfig occurs. In practice, real hardware updates
+ * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors
+ * (namely Hyper-V) don't set it due to it being undefined behavior,
+ * i.e. we end up advancing IP with some random value.
+ */
+ if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
+ exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
+ instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+
+ /*
+ * Emulating an enclave's instructions isn't supported as KVM
+ * cannot access the enclave's memory or its true RIP, e.g. the
+ * vmcs.GUEST_RIP points at the exit point of the enclave, not
+ * the RIP that actually triggered the VM-Exit. But, because
+ * most instructions that cause VM-Exit will #UD in an enclave,
+ * most instruction-based VM-Exits simply do not occur.
+ *
+ * There are a few exceptions, notably the debug instructions
+ * INT1ICEBRK and INT3, as they are allowed in debug enclaves
+ * and generate #DB/#BP as expected, which KVM might intercept.
+ * But again, the CPU does the dirty work and saves an instr
+ * length of zero so VMMs don't shoot themselves in the foot.
+ * WARN if KVM tries to skip a non-zero length instruction on
+ * a VM-Exit from an enclave.
+ */
+ if (!instr_len)
+ goto rip_updated;
+
+ WARN_ONCE(exit_reason.enclave_mode,
+ "skipping instruction after SGX enclave VM-Exit");
+
+ orig_rip = kvm_rip_read(vcpu);
+ rip = orig_rip + instr_len;
+#ifdef CONFIG_X86_64
+ /*
+ * We need to mask out the high 32 bits of RIP if not in 64-bit
+ * mode, but just finding out that we are in 64-bit mode is
+ * quite expensive. Only do it if there was a carry.
+ */
+ if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu))
+ rip = (u32)rip;
+#endif
+ kvm_rip_write(vcpu, rip);
+ } else {
+ if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
+ return 0;
+ }
+
+rip_updated:
+ /* skipping an emulated instruction also counts */
+ vmx_set_interrupt_shadow(vcpu, 0);
+
+ return 1;
+}
+
+/*
+ * Recognizes a pending MTF VM-exit and records the nested state for later
+ * delivery.
+ */
+void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (!is_guest_mode(vcpu))
+ return;
+
+ /*
+ * Per the SDM, MTF takes priority over debug-trap exceptions besides
+ * TSS T-bit traps and ICEBP (INT1). KVM doesn't emulate T-bit traps
+ * or ICEBP (in the emulator proper), and skipping of ICEBP after an
+ * intercepted #DB deliberately avoids single-step #DB and MTF updates
+ * as ICEBP is higher priority than both. As instruction emulation is
+ * completed at this point (i.e. KVM is at the instruction boundary),
+ * any #DB exception pending delivery must be a debug-trap of lower
+ * priority than MTF. Record the pending MTF state to be delivered in
+ * vmx_check_nested_events().
+ */
+ if (nested_cpu_has_mtf(vmcs12) &&
+ (!vcpu->arch.exception.pending ||
+ vcpu->arch.exception.vector == DB_VECTOR) &&
+ (!vcpu->arch.exception_vmexit.pending ||
+ vcpu->arch.exception_vmexit.vector == DB_VECTOR)) {
+ vmx->nested.mtf_pending = true;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ } else {
+ vmx->nested.mtf_pending = false;
+ }
+}
+
+int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
+{
+ vmx_update_emulated_instruction(vcpu);
+ return skip_emulated_instruction(vcpu);
+}
+
+static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Ensure that we clear the HLT state in the VMCS. We don't need to
+ * explicitly skip the instruction because if the HLT state is set,
+ * then the instruction is already executing and RIP has already been
+ * advanced.
+ */
+ if (kvm_hlt_in_guest(vcpu->kvm) &&
+ vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
+ vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
+}
+
+void vmx_inject_exception(struct kvm_vcpu *vcpu)
+{
+ struct kvm_queued_exception *ex = &vcpu->arch.exception;
+ u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ kvm_deliver_exception_payload(vcpu, ex);
+
+ if (ex->has_error_code) {
+ /*
+ * Despite the error code being architecturally defined as 32
+ * bits, and the VMCS field being 32 bits, Intel CPUs and thus
+		 * VMX don't actually support setting bits 31:16. Hardware
+ * will (should) never provide a bogus error code, but AMD CPUs
+ * do generate error codes with bits 31:16 set, and so KVM's
+ * ABI lets userspace shove in arbitrary 32-bit values. Drop
+		 * the upper bits to avoid VM-Fail; losing information that
+ * doesn't really exist is preferable to killing the VM.
+ */
+ vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)ex->error_code);
+ intr_info |= INTR_INFO_DELIVER_CODE_MASK;
+ }
+
+ if (vmx->rmode.vm86_active) {
+ int inc_eip = 0;
+ if (kvm_exception_is_soft(ex->vector))
+ inc_eip = vcpu->arch.event_exit_inst_len;
+ kvm_inject_realmode_interrupt(vcpu, ex->vector, inc_eip);
+ return;
+ }
+
+ WARN_ON_ONCE(vmx->vt.emulation_required);
+
+ if (kvm_exception_is_soft(ex->vector)) {
+ vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+ vmx->vcpu.arch.event_exit_inst_len);
+ intr_info |= INTR_TYPE_SOFT_EXCEPTION;
+ } else
+ intr_info |= INTR_TYPE_HARD_EXCEPTION;
+
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
+
+ vmx_clear_hlt(vcpu);
+}
+
+static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr,
+ bool load_into_hardware)
+{
+ struct vmx_uret_msr *uret_msr;
+
+ uret_msr = vmx_find_uret_msr(vmx, msr);
+ if (!uret_msr)
+ return;
+
+ uret_msr->load_into_hardware = load_into_hardware;
+}
+
+/*
+ * Configure the user return MSRs to automatically save, load, and restore MSRs
+ * that need to be shoved into hardware when running the guest. Note, omitting
+ * an MSR here does _NOT_ mean it's not emulated, only that it will not be
+ * loaded into hardware when running the guest.
+ */
+static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx)
+{
+#ifdef CONFIG_X86_64
+ bool load_syscall_msrs;
+
+ /*
+ * The SYSCALL MSRs are only needed on long mode guests, and only
+ * when EFER.SCE is set.
+ */
+ load_syscall_msrs = is_long_mode(&vmx->vcpu) &&
+ (vmx->vcpu.arch.efer & EFER_SCE);
+
+ vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs);
+ vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs);
+ vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs);
+#endif
+ vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx));
+
+ vmx_setup_uret_msr(vmx, MSR_TSC_AUX,
+ guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDTSCP) ||
+ guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDPID));
+
+ /*
+ * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new
+ * kernel and old userspace. If those guests run on a tsx=off host, do
+ * allow guests to use TSX_CTRL, but don't change the value in hardware
+ * so that TSX remains always disabled.
+ */
+ vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM));
+
+ /*
+	 * The set of MSRs to load may have changed; reload MSRs before the
+ * next VM-Enter.
+ */
+ vmx->guest_uret_msrs_loaded = false;
+}
+
+u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING))
+ return vmcs12->tsc_offset;
+
+ return 0;
+}
+
+u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING) &&
+ nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
+ return vmcs12->tsc_multiplier;
+
+ return kvm_caps.default_tsc_scaling_ratio;
+}
+
+void vmx_write_tsc_offset(struct kvm_vcpu *vcpu)
+{
+ vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
+}
+
+void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu)
+{
+ vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
+}
+
+/*
+ * Userspace is allowed to set any supported IA32_FEATURE_CONTROL regardless of
+ * guest CPUID. Note, KVM allows userspace to set "VMX in SMX" to maintain
+ * backwards compatibility even though KVM doesn't support emulating SMX. And
+ * if userspace sets "VMX in SMX", the guest must also be allowed to set it,
+ * e.g. if the MSR is left unlocked and the guest does a RMW operation.
+ */
+#define KVM_SUPPORTED_FEATURE_CONTROL (FEAT_CTL_LOCKED | \
+ FEAT_CTL_VMX_ENABLED_INSIDE_SMX | \
+ FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | \
+ FEAT_CTL_SGX_LC_ENABLED | \
+ FEAT_CTL_SGX_ENABLED | \
+ FEAT_CTL_LMCE_ENABLED)
+
+static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx,
+ struct msr_data *msr)
+{
+ uint64_t valid_bits;
+
+ /*
+ * Ensure KVM_SUPPORTED_FEATURE_CONTROL is updated when new bits are
+ * exposed to the guest.
+ */
+ WARN_ON_ONCE(vmx->msr_ia32_feature_control_valid_bits &
+ ~KVM_SUPPORTED_FEATURE_CONTROL);
+
+ if (!msr->host_initiated &&
+ (vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED))
+ return false;
+
+ if (msr->host_initiated)
+ valid_bits = KVM_SUPPORTED_FEATURE_CONTROL;
+ else
+ valid_bits = vmx->msr_ia32_feature_control_valid_bits;
+
+ return !(msr->data & ~valid_bits);
+}
+
+int vmx_get_feature_msr(u32 msr, u64 *data)
+{
+ switch (msr) {
+ case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
+ if (!nested)
+ return 1;
+ return vmx_get_vmx_msr(&vmcs_config.nested, msr, data);
+ default:
+ return KVM_MSR_RET_UNSUPPORTED;
+ }
+}
+
+/*
+ * Reads an MSR value (of 'msr_info->index') into 'msr_info->data'.
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+ */
+int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmx_uret_msr *msr;
+ u32 index;
+
+ switch (msr_info->index) {
+#ifdef CONFIG_X86_64
+ case MSR_FS_BASE:
+ msr_info->data = vmcs_readl(GUEST_FS_BASE);
+ break;
+ case MSR_GS_BASE:
+ msr_info->data = vmcs_readl(GUEST_GS_BASE);
+ break;
+ case MSR_KERNEL_GS_BASE:
+ msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
+ break;
+#endif
+ case MSR_EFER:
+ return kvm_get_msr_common(vcpu, msr_info);
+ case MSR_IA32_TSX_CTRL:
+ if (!msr_info->host_initiated &&
+ !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
+ return 1;
+ goto find_uret_msr;
+ case MSR_IA32_UMWAIT_CONTROL:
+ if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
+ return 1;
+
+ msr_info->data = vmx->msr_ia32_umwait_control;
+ break;
+ case MSR_IA32_SPEC_CTRL:
+ if (!msr_info->host_initiated &&
+ !guest_has_spec_ctrl_msr(vcpu))
+ return 1;
+
+ msr_info->data = to_vmx(vcpu)->spec_ctrl;
+ break;
+ case MSR_IA32_SYSENTER_CS:
+ msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
+ break;
+ case MSR_IA32_SYSENTER_EIP:
+ msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
+ break;
+ case MSR_IA32_SYSENTER_ESP:
+ msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
+ break;
+ case MSR_IA32_BNDCFGS:
+ if (!kvm_mpx_supported() ||
+ (!msr_info->host_initiated &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX)))
+ return 1;
+ msr_info->data = vmcs_read64(GUEST_BNDCFGS);
+ break;
+ case MSR_IA32_MCG_EXT_CTL:
+ if (!msr_info->host_initiated &&
+ !(vmx->msr_ia32_feature_control &
+ FEAT_CTL_LMCE_ENABLED))
+ return 1;
+ msr_info->data = vcpu->arch.mcg_ext_ctl;
+ break;
+ case MSR_IA32_FEAT_CTL:
+ msr_info->data = vmx->msr_ia32_feature_control;
+ break;
+ case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
+ if (!msr_info->host_initiated &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC))
+ return 1;
+ msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash
+ [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
+ break;
+ case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
+ return 1;
+ if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
+ &msr_info->data))
+ return 1;
+#ifdef CONFIG_KVM_HYPERV
+ /*
+ * Enlightened VMCS v1 doesn't have certain VMCS fields but
+ * instead of just ignoring the features, different Hyper-V
+		 * versions either try to use them and fail or do some
+ * sanity checking and refuse to boot. Filter all unsupported
+ * features out.
+ */
+ if (!msr_info->host_initiated && guest_cpu_cap_has_evmcs(vcpu))
+ nested_evmcs_filter_control_msr(vcpu, msr_info->index,
+ &msr_info->data);
+#endif
+ break;
+ case MSR_IA32_RTIT_CTL:
+ if (!vmx_pt_mode_is_host_guest())
+ return 1;
+ msr_info->data = vmx->pt_desc.guest.ctl;
+ break;
+ case MSR_IA32_RTIT_STATUS:
+ if (!vmx_pt_mode_is_host_guest())
+ return 1;
+ msr_info->data = vmx->pt_desc.guest.status;
+ break;
+ case MSR_IA32_RTIT_CR3_MATCH:
+ if (!vmx_pt_mode_is_host_guest() ||
+ !intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_cr3_filtering))
+ return 1;
+ msr_info->data = vmx->pt_desc.guest.cr3_match;
+ break;
+ case MSR_IA32_RTIT_OUTPUT_BASE:
+ if (!vmx_pt_mode_is_host_guest() ||
+ (!intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_topa_output) &&
+ !intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_single_range_output)))
+ return 1;
+ msr_info->data = vmx->pt_desc.guest.output_base;
+ break;
+ case MSR_IA32_RTIT_OUTPUT_MASK:
+ if (!vmx_pt_mode_is_host_guest() ||
+ (!intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_topa_output) &&
+ !intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_single_range_output)))
+ return 1;
+ msr_info->data = vmx->pt_desc.guest.output_mask;
+ break;
+ case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
+ index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
+ if (!vmx_pt_mode_is_host_guest() ||
+ (index >= 2 * vmx->pt_desc.num_address_ranges))
+ return 1;
+ if (index % 2)
+ msr_info->data = vmx->pt_desc.guest.addr_b[index / 2];
+ else
+ msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
+ break;
+ case MSR_IA32_S_CET:
+ msr_info->data = vmcs_readl(GUEST_S_CET);
+ break;
+ case MSR_KVM_INTERNAL_GUEST_SSP:
+ msr_info->data = vmcs_readl(GUEST_SSP);
+ break;
+ case MSR_IA32_INT_SSP_TAB:
+ msr_info->data = vmcs_readl(GUEST_INTR_SSP_TABLE);
+ break;
+ case MSR_IA32_DEBUGCTLMSR:
+ msr_info->data = vmx_guest_debugctl_read();
+ break;
+ default:
+ find_uret_msr:
+ msr = vmx_find_uret_msr(vmx, msr_info->index);
+ if (msr) {
+ msr_info->data = msr->data;
+ break;
+ }
+ return kvm_get_msr_common(vcpu, msr_info);
+ }
+
+ return 0;
+}
+
+static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
+ u64 data)
+{
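+	/* SYSENTER MSR addresses are 32 bits wide if the vCPU can't do long mode. */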
+#ifdef CONFIG_X86_64
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
+ return (u32)data;
+#endif
+ return (unsigned long)data;
+}
+
+u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
+{
+ u64 debugctl = 0;
+
+ if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) &&
+ (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)))
+ debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT;
+
+ if ((kvm_caps.supported_perf_cap & PERF_CAP_LBR_FMT) &&
+ (host_initiated || intel_pmu_lbr_is_enabled(vcpu)))
+ debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
+
+ if (boot_cpu_has(X86_FEATURE_RTM) &&
+ (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_RTM)))
+ debugctl |= DEBUGCTLMSR_RTM_DEBUG;
+
+ return debugctl;
+}
+
+bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated)
+{
+ u64 invalid;
+
+ invalid = data & ~vmx_get_supported_debugctl(vcpu, host_initiated);
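+	/*
+	 * Setting unsupported BTF or LBR bits isn't fatal: warn once and
+	 * strip them from the invalid set so the write is still accepted
+	 * (the offending bits are dropped by the caller).
+	 */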
+ if (invalid & (DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR)) {
+ kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data);
+ invalid &= ~(DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR);
+ }
+ return !invalid;
+}
+
+/*
+ * Writes an MSR value into the appropriate "register".
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+ */
+int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmx_uret_msr *msr;
+ int ret = 0;
+ u32 msr_index = msr_info->index;
+ u64 data = msr_info->data;
+ u32 index;
+
+ switch (msr_index) {
+ case MSR_EFER:
+ ret = kvm_set_msr_common(vcpu, msr_info);
+ break;
+#ifdef CONFIG_X86_64
+ case MSR_FS_BASE:
+ vmx_segment_cache_clear(vmx);
+ vmcs_writel(GUEST_FS_BASE, data);
+ break;
+ case MSR_GS_BASE:
+ vmx_segment_cache_clear(vmx);
+ vmcs_writel(GUEST_GS_BASE, data);
+ break;
+ case MSR_KERNEL_GS_BASE:
+ vmx_write_guest_kernel_gs_base(vmx, data);
+ break;
+ case MSR_IA32_XFD:
+ ret = kvm_set_msr_common(vcpu, msr_info);
+ /*
+ * Always intercepting WRMSR could incur non-negligible
+		 * overhead given that XFD may be changed frequently on
+		 * guest context switches. Disable write interception
+ * upon the first write with a non-zero value (indicating
+ * potential usage on dynamic xfeatures). Also update
+ * exception bitmap to trap #NM for proper virtualization
+ * of guest xfd_err.
+ */
+ if (!ret && data) {
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD,
+ MSR_TYPE_RW);
+ vcpu->arch.xfd_no_write_intercept = true;
+ vmx_update_exception_bitmap(vcpu);
+ }
+ break;
+#endif
+ case MSR_IA32_SYSENTER_CS:
+ if (is_guest_mode(vcpu))
+ get_vmcs12(vcpu)->guest_sysenter_cs = data;
+ vmcs_write32(GUEST_SYSENTER_CS, data);
+ break;
+ case MSR_IA32_SYSENTER_EIP:
+ if (is_guest_mode(vcpu)) {
+ data = nested_vmx_truncate_sysenter_addr(vcpu, data);
+ get_vmcs12(vcpu)->guest_sysenter_eip = data;
+ }
+ vmcs_writel(GUEST_SYSENTER_EIP, data);
+ break;
+ case MSR_IA32_SYSENTER_ESP:
+ if (is_guest_mode(vcpu)) {
+ data = nested_vmx_truncate_sysenter_addr(vcpu, data);
+ get_vmcs12(vcpu)->guest_sysenter_esp = data;
+ }
+ vmcs_writel(GUEST_SYSENTER_ESP, data);
+ break;
+ case MSR_IA32_DEBUGCTLMSR:
+ if (!vmx_is_valid_debugctl(vcpu, data, msr_info->host_initiated))
+ return 1;
+
+ data &= vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
+
+ if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
+ VM_EXIT_SAVE_DEBUG_CONTROLS)
+ get_vmcs12(vcpu)->guest_ia32_debugctl = data;
+
+ vmx_guest_debugctl_write(vcpu, data);
+
+ if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
+ (data & DEBUGCTLMSR_LBR))
+ intel_pmu_create_guest_lbr_event(vcpu);
+ return 0;
+ case MSR_IA32_BNDCFGS:
+ if (!kvm_mpx_supported() ||
+ (!msr_info->host_initiated &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX)))
+ return 1;
+ if (is_noncanonical_msr_address(data & PAGE_MASK, vcpu) ||
+ (data & MSR_IA32_BNDCFGS_RSVD))
+ return 1;
+
+ if (is_guest_mode(vcpu) &&
+ ((vmx->nested.msrs.entry_ctls_high & VM_ENTRY_LOAD_BNDCFGS) ||
+ (vmx->nested.msrs.exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS)))
+ get_vmcs12(vcpu)->guest_bndcfgs = data;
+
+ vmcs_write64(GUEST_BNDCFGS, data);
+ break;
+ case MSR_IA32_UMWAIT_CONTROL:
+ if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
+ return 1;
+
+		/* Reserved bit 1 and the upper bits [63:32] must be zero. */
+ if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32)))
+ return 1;
+
+ vmx->msr_ia32_umwait_control = data;
+ break;
+ case MSR_IA32_SPEC_CTRL:
+ if (!msr_info->host_initiated &&
+ !guest_has_spec_ctrl_msr(vcpu))
+ return 1;
+
+ if (kvm_spec_ctrl_test_value(data))
+ return 1;
+
+ vmx->spec_ctrl = data;
+ if (!data)
+ break;
+
+ /*
+ * For non-nested:
+ * When it's written (to non-zero) for the first time, pass
+ * it through.
+ *
+ * For nested:
+ * The handling of the MSR bitmap for L2 guests is done in
+ * nested_vmx_prepare_msr_bitmap. We should not touch the
+ * vmcs02.msr_bitmap here since it gets completely overwritten
+ * in the merging. We update the vmcs01 here for L1 as well
+ * since it will end up touching the MSR anyway now.
+ */
+ vmx_disable_intercept_for_msr(vcpu,
+ MSR_IA32_SPEC_CTRL,
+ MSR_TYPE_RW);
+ break;
+ case MSR_IA32_TSX_CTRL:
+ if (!msr_info->host_initiated &&
+ !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
+ return 1;
+ if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR))
+ return 1;
+ goto find_uret_msr;
+ case MSR_IA32_CR_PAT:
+ ret = kvm_set_msr_common(vcpu, msr_info);
+ if (ret)
+ break;
+
+ if (is_guest_mode(vcpu) &&
+ get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
+ get_vmcs12(vcpu)->guest_ia32_pat = data;
+
+ if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+ vmcs_write64(GUEST_IA32_PAT, data);
+ break;
+ case MSR_IA32_MCG_EXT_CTL:
+ if ((!msr_info->host_initiated &&
+ !(to_vmx(vcpu)->msr_ia32_feature_control &
+ FEAT_CTL_LMCE_ENABLED)) ||
+ (data & ~MCG_EXT_CTL_LMCE_EN))
+ return 1;
+ vcpu->arch.mcg_ext_ctl = data;
+ break;
+ case MSR_IA32_FEAT_CTL:
+ if (!is_vmx_feature_control_msr_valid(vmx, msr_info))
+ return 1;
+
+ vmx->msr_ia32_feature_control = data;
+ if (msr_info->host_initiated && data == 0)
+ vmx_leave_nested(vcpu);
+
+ /* SGX may be enabled/disabled by guest's firmware */
+ vmx_write_encls_bitmap(vcpu, NULL);
+ break;
+ case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
+ /*
+ * On real hardware, the LE hash MSRs are writable before
+ * the firmware sets bit 0 in MSR 0x7a ("activating" SGX),
+ * at which point SGX related bits in IA32_FEATURE_CONTROL
+ * become writable.
+ *
+ * KVM does not emulate SGX activation for simplicity, so
+ * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL
+ * is unlocked. This is technically not architectural
+ * behavior, but it's close enough.
+ */
+ if (!msr_info->host_initiated &&
+ (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC) ||
+ ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) &&
+ !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED))))
+ return 1;
+ vmx->msr_ia32_sgxlepubkeyhash
+ [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data;
+ break;
+ case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
+ if (!msr_info->host_initiated)
+ return 1; /* they are read-only */
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
+ return 1;
+ return vmx_set_vmx_msr(vcpu, msr_index, data);
+ case MSR_IA32_RTIT_CTL:
+ if (!vmx_pt_mode_is_host_guest() ||
+ vmx_rtit_ctl_check(vcpu, data) ||
+ vmx->nested.vmxon)
+ return 1;
+ vmcs_write64(GUEST_IA32_RTIT_CTL, data);
+ vmx->pt_desc.guest.ctl = data;
+ pt_update_intercept_for_msr(vcpu);
+ break;
+ case MSR_IA32_RTIT_STATUS:
+ if (!pt_can_write_msr(vmx))
+ return 1;
+ if (data & MSR_IA32_RTIT_STATUS_MASK)
+ return 1;
+ vmx->pt_desc.guest.status = data;
+ break;
+ case MSR_IA32_RTIT_CR3_MATCH:
+ if (!pt_can_write_msr(vmx))
+ return 1;
+ if (!intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_cr3_filtering))
+ return 1;
+ vmx->pt_desc.guest.cr3_match = data;
+ break;
+ case MSR_IA32_RTIT_OUTPUT_BASE:
+ if (!pt_can_write_msr(vmx))
+ return 1;
+ if (!intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_topa_output) &&
+ !intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_single_range_output))
+ return 1;
+ if (!pt_output_base_valid(vcpu, data))
+ return 1;
+ vmx->pt_desc.guest.output_base = data;
+ break;
+ case MSR_IA32_RTIT_OUTPUT_MASK:
+ if (!pt_can_write_msr(vmx))
+ return 1;
+ if (!intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_topa_output) &&
+ !intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_single_range_output))
+ return 1;
+ vmx->pt_desc.guest.output_mask = data;
+ break;
+ case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
+ if (!pt_can_write_msr(vmx))
+ return 1;
+ index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
+ if (index >= 2 * vmx->pt_desc.num_address_ranges)
+ return 1;
+ if (is_noncanonical_msr_address(data, vcpu))
+ return 1;
+ if (index % 2)
+ vmx->pt_desc.guest.addr_b[index / 2] = data;
+ else
+ vmx->pt_desc.guest.addr_a[index / 2] = data;
+ break;
+ case MSR_IA32_S_CET:
+ vmcs_writel(GUEST_S_CET, data);
+ break;
+ case MSR_KVM_INTERNAL_GUEST_SSP:
+ vmcs_writel(GUEST_SSP, data);
+ break;
+ case MSR_IA32_INT_SSP_TAB:
+ vmcs_writel(GUEST_INTR_SSP_TABLE, data);
+ break;
+ case MSR_IA32_PERF_CAPABILITIES:
+ if (data & PERF_CAP_LBR_FMT) {
+ if ((data & PERF_CAP_LBR_FMT) !=
+ (kvm_caps.supported_perf_cap & PERF_CAP_LBR_FMT))
+ return 1;
+ if (!cpuid_model_is_consistent(vcpu))
+ return 1;
+ }
+ if (data & PERF_CAP_PEBS_FORMAT) {
+ if ((data & PERF_CAP_PEBS_MASK) !=
+ (kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK))
+ return 1;
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DS))
+ return 1;
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DTES64))
+ return 1;
+ if (!cpuid_model_is_consistent(vcpu))
+ return 1;
+ }
+ ret = kvm_set_msr_common(vcpu, msr_info);
+ break;
+
+ default:
+ find_uret_msr:
+ msr = vmx_find_uret_msr(vmx, msr_index);
+ if (msr)
+ ret = vmx_set_guest_uret_msr(vmx, msr, data);
+ else
+ ret = kvm_set_msr_common(vcpu, msr_info);
+ }
+
+ /* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */
+ if (msr_index == MSR_IA32_ARCH_CAPABILITIES)
+ vmx_update_fb_clear_dis(vcpu, vmx);
+
+ return ret;
+}
+
+void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
+{
+ unsigned long guest_owned_bits;
+
+ kvm_register_mark_available(vcpu, reg);
+
+ switch (reg) {
+ case VCPU_REGS_RSP:
+ vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
+ break;
+ case VCPU_REGS_RIP:
+ vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
+ break;
+ case VCPU_EXREG_PDPTR:
+ if (enable_ept)
+ ept_save_pdptrs(vcpu);
+ break;
+ case VCPU_EXREG_CR0:
+ guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
+
+ vcpu->arch.cr0 &= ~guest_owned_bits;
+ vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits;
+ break;
+ case VCPU_EXREG_CR3:
+ /*
+		 * When intercepting CR3 loads, e.g. for shadow paging, KVM's
+ * CR3 is loaded into hardware, not the guest's CR3.
+ */
+ if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING))
+ vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+ break;
+ case VCPU_EXREG_CR4:
+ guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
+
+ vcpu->arch.cr4 &= ~guest_owned_bits;
+ vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits;
+ break;
+ default:
+ KVM_BUG_ON(1, vcpu->kvm);
+ break;
+ }
+}
+
+/*
+ * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID
+ * directly instead of going through cpu_has(), to ensure KVM is trapping
+ * ENCLS whenever it's supported in hardware. It does not matter whether
+ * the host OS supports or has enabled SGX.
+ */
+static bool cpu_has_sgx(void)
+{
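+	/* CPUID leaf 0x12 is the SGX leaf; EAX bit 0 reports SGX1 support. */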
+ return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0));
+}
+
+static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result)
+{
+ u32 vmx_msr_low, vmx_msr_high;
+ u32 ctl = ctl_min | ctl_opt;
+
+ rdmsr(msr, vmx_msr_low, vmx_msr_high);
+
+ ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
+ ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */
+
+	/* Ensure the minimum (required) set of control bits is supported. */
+ if (ctl_min & ~ctl)
+ return -EIO;
+
+ *result = ctl;
+ return 0;
+}
+
+static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
+{
+ u64 allowed;
+
+ rdmsrq(msr, allowed);
+
+ return ctl_opt & allowed;
+}
+
+#define vmx_check_entry_exit_pairs(pairs, entry_controls, exit_controls) \
+({ \
+ int i, r = 0; \
+ \
+ BUILD_BUG_ON(sizeof(pairs[0].entry_control) != sizeof(entry_controls)); \
+ BUILD_BUG_ON(sizeof(pairs[0].exit_control) != sizeof(exit_controls)); \
+ \
+ for (i = 0; i < ARRAY_SIZE(pairs); i++) { \
+ typeof(entry_controls) n_ctrl = pairs[i].entry_control; \
+ typeof(exit_controls) x_ctrl = pairs[i].exit_control; \
+ \
+ if (!(entry_controls & n_ctrl) == !(exit_controls & x_ctrl)) \
+ continue; \
+ \
+ pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, " \
+ "entry = %llx (%llx), exit = %llx (%llx)\n", \
+ (u64)(entry_controls & n_ctrl), (u64)n_ctrl, \
+ (u64)(exit_controls & x_ctrl), (u64)x_ctrl); \
+ \
+ if (error_on_inconsistent_vmcs_config) \
+ r = -EIO; \
+ \
+ entry_controls &= ~n_ctrl; \
+ exit_controls &= ~x_ctrl; \
+ } \
+ r; \
+})
+
+static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
+ struct vmx_capability *vmx_cap)
+{
+ u32 _pin_based_exec_control = 0;
+ u32 _cpu_based_exec_control = 0;
+ u32 _cpu_based_2nd_exec_control = 0;
+ u64 _cpu_based_3rd_exec_control = 0;
+ u32 _vmexit_control = 0;
+ u32 _vmentry_control = 0;
+ u64 basic_msr;
+ u64 misc_msr;
+
+ /*
+ * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory.
+ * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always
+ * intercepts writes to PAT and EFER, i.e. never enables those controls.
+ */
+ struct {
+ u32 entry_control;
+ u32 exit_control;
+ } const vmcs_entry_exit_pairs[] = {
+ { VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL },
+ { VM_ENTRY_LOAD_IA32_PAT, VM_EXIT_LOAD_IA32_PAT },
+ { VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER },
+ { VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS },
+ { VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL },
+ { VM_ENTRY_LOAD_CET_STATE, VM_EXIT_LOAD_CET_STATE },
+ };
+
+ memset(vmcs_conf, 0, sizeof(*vmcs_conf));
+
+ if (adjust_vmx_controls(KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL,
+ KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL,
+ MSR_IA32_VMX_PROCBASED_CTLS,
+ &_cpu_based_exec_control))
+ return -EIO;
+ if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
+ if (adjust_vmx_controls(KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL,
+ KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL,
+ MSR_IA32_VMX_PROCBASED_CTLS2,
+ &_cpu_based_2nd_exec_control))
+ return -EIO;
+ }
+ if (!IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE))
+ _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
+
+#ifndef CONFIG_X86_64
+ if (!(_cpu_based_2nd_exec_control &
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+ _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
+#endif
+
+ if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
+ _cpu_based_2nd_exec_control &= ~(
+ SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+
+ rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
+ &vmx_cap->ept, &vmx_cap->vpid);
+
+ if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
+ vmx_cap->ept) {
+ pr_warn_once("EPT CAP should not exist if not support "
+ "1-setting enable EPT VM-execution control\n");
+
+ if (error_on_inconsistent_vmcs_config)
+ return -EIO;
+
+ vmx_cap->ept = 0;
+ _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
+ }
+ if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
+ vmx_cap->vpid) {
+ pr_warn_once("VPID CAP should not exist if not support "
+ "1-setting enable VPID VM-execution control\n");
+
+ if (error_on_inconsistent_vmcs_config)
+ return -EIO;
+
+ vmx_cap->vpid = 0;
+ }
+
+ if (!cpu_has_sgx())
+ _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_ENCLS_EXITING;
+
+ if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS)
+ _cpu_based_3rd_exec_control =
+ adjust_vmx_controls64(KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL,
+ MSR_IA32_VMX_PROCBASED_CTLS3);
+
+ if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_EXIT_CONTROLS,
+ KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS,
+ MSR_IA32_VMX_EXIT_CTLS,
+ &_vmexit_control))
+ return -EIO;
+
+ if (adjust_vmx_controls(KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL,
+ KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL,
+ MSR_IA32_VMX_PINBASED_CTLS,
+ &_pin_based_exec_control))
+ return -EIO;
+
+ if (cpu_has_broken_vmx_preemption_timer())
+ _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+ if (!(_cpu_based_2nd_exec_control &
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
+ _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
+
+ if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS,
+ KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS,
+ MSR_IA32_VMX_ENTRY_CTLS,
+ &_vmentry_control))
+ return -EIO;
+
+ if (vmx_check_entry_exit_pairs(vmcs_entry_exit_pairs,
+ _vmentry_control, _vmexit_control))
+ return -EIO;
+
+ /*
+	 * Some CPUs support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
+	 * can't be used due to an erratum where VM Exit may incorrectly clear
+	 * IA32_PERF_GLOBAL_CTRL[34:32]. Work around the erratum by using the
+ * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
+ */
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_NEHALEM_EP: /* AAK155 */
+ case INTEL_NEHALEM: /* AAP115 */
+ case INTEL_WESTMERE: /* AAT100 */
+ case INTEL_WESTMERE_EP: /* BC86,AAY89,BD102 */
+ case INTEL_NEHALEM_EX: /* BA97 */
+ _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+ _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
+ pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
+ "does not work properly. Using workaround\n");
+ break;
+ default:
+ break;
+ }
+
+ rdmsrq(MSR_IA32_VMX_BASIC, basic_msr);
+
+ /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
+ if (vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE)
+ return -EIO;
+
+#ifdef CONFIG_X86_64
+ /*
+ * KVM expects to be able to shove all legal physical addresses into
+ * VMCS fields for 64-bit kernels, and per the SDM, "This bit is always
+ * 0 for processors that support Intel 64 architecture".
+ */
+ if (basic_msr & VMX_BASIC_32BIT_PHYS_ADDR_ONLY)
+ return -EIO;
+#endif
+
+ /* Require Write-Back (WB) memory type for VMCS accesses. */
+ if (vmx_basic_vmcs_mem_type(basic_msr) != X86_MEMTYPE_WB)
+ return -EIO;
+
+ rdmsrq(MSR_IA32_VMX_MISC, misc_msr);
+
+ vmcs_conf->basic = basic_msr;
+ vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
+ vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
+ vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
+ vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control;
+ vmcs_conf->vmexit_ctrl = _vmexit_control;
+ vmcs_conf->vmentry_ctrl = _vmentry_control;
+ vmcs_conf->misc = misc_msr;
+
+#if IS_ENABLED(CONFIG_HYPERV)
+ if (enlightened_vmcs)
+ evmcs_sanitize_exec_ctrls(vmcs_conf);
+#endif
+
+ return 0;
+}
+
+static bool __kvm_is_vmx_supported(void)
+{
+ int cpu = smp_processor_id();
+
+ if (!(cpuid_ecx(1) & feature_bit(VMX))) {
+ pr_err("VMX not supported by CPU %d\n", cpu);
+ return false;
+ }
+
+ if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
+ !this_cpu_has(X86_FEATURE_VMX)) {
+ pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu);
+ return false;
+ }
+
+ return true;
+}
+
+static bool kvm_is_vmx_supported(void)
+{
+ bool supported;
+
+ migrate_disable();
+ supported = __kvm_is_vmx_supported();
+ migrate_enable();
+
+ return supported;
+}
+
+int vmx_check_processor_compat(void)
+{
+ int cpu = raw_smp_processor_id();
+ struct vmcs_config vmcs_conf;
+ struct vmx_capability vmx_cap;
+
+ if (!__kvm_is_vmx_supported())
+ return -EIO;
+
+ if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) {
+ pr_err("Failed to setup VMCS config on CPU %d\n", cpu);
+ return -EIO;
+ }
+ if (nested)
+ nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept);
+ if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) {
+ pr_err("Inconsistent VMCS config on CPU %d\n", cpu);
+ return -EIO;
+ }
+ return 0;
+}
+
+static int kvm_cpu_vmxon(u64 vmxon_pointer)
+{
+ u64 msr;
+
+ cr4_set_bits(X86_CR4_VMXE);
+
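+	/*
+	 * VMXON can fault, e.g. if VMX has been disabled by the BIOS; the
+	 * extable entry routes the fault to the label below so CR4.VMXE can
+	 * be backed out and the error reported.
+	 */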
+ asm goto("1: vmxon %[vmxon_pointer]\n\t"
+ _ASM_EXTABLE(1b, %l[fault])
+ : : [vmxon_pointer] "m"(vmxon_pointer)
+ : : fault);
+ return 0;
+
+fault:
+ WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
+ rdmsrq_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
+ cr4_clear_bits(X86_CR4_VMXE);
+
+ return -EFAULT;
+}
+
+int vmx_enable_virtualization_cpu(void)
+{
+ int cpu = raw_smp_processor_id();
+ u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
+ int r;
+
+ if (cr4_read_shadow() & X86_CR4_VMXE)
+ return -EBUSY;
+
+ /*
+ * This can happen if we hot-added a CPU but failed to allocate
+	 * the VP assist page for it.
+ */
+ if (kvm_is_using_evmcs() && !hv_get_vp_assist_page(cpu))
+ return -EFAULT;
+
+ intel_pt_handle_vmx(1);
+
+ r = kvm_cpu_vmxon(phys_addr);
+ if (r) {
+ intel_pt_handle_vmx(0);
+ return r;
+ }
+
+ return 0;
+}
+
+static void vmclear_local_loaded_vmcss(void)
+{
+ int cpu = raw_smp_processor_id();
+ struct loaded_vmcs *v, *n;
+
+ list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
+ loaded_vmcss_on_cpu_link)
+ __loaded_vmcs_clear(v);
+}
+
+void vmx_disable_virtualization_cpu(void)
+{
+ vmclear_local_loaded_vmcss();
+
+ if (kvm_cpu_vmxoff())
+ kvm_spurious_fault();
+
+ hv_reset_evmcs();
+
+ intel_pt_handle_vmx(0);
+}
+
+struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
+{
+ int node = cpu_to_node(cpu);
+ struct page *pages;
+ struct vmcs *vmcs;
+
+ pages = __alloc_pages_node(node, flags, 0);
+ if (!pages)
+ return NULL;
+ vmcs = page_address(pages);
+ memset(vmcs, 0, vmx_basic_vmcs_size(vmcs_config.basic));
+
+ /* KVM supports Enlightened VMCS v1 only */
+ if (kvm_is_using_evmcs())
+ vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
+ else
+ vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic);
+
+ if (shadow)
+ vmcs->hdr.shadow_vmcs = 1;
+ return vmcs;
+}
+
+void free_vmcs(struct vmcs *vmcs)
+{
+ free_page((unsigned long)vmcs);
+}
+
+/*
+ * Free a VMCS, but before that, VMCLEAR it on the CPU where it was last loaded.
+ */
+void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
+{
+ if (!loaded_vmcs->vmcs)
+ return;
+ loaded_vmcs_clear(loaded_vmcs);
+ free_vmcs(loaded_vmcs->vmcs);
+ loaded_vmcs->vmcs = NULL;
+ if (loaded_vmcs->msr_bitmap)
+ free_page((unsigned long)loaded_vmcs->msr_bitmap);
+ WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
+}
+
+int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
+{
+ loaded_vmcs->vmcs = alloc_vmcs(false);
+ if (!loaded_vmcs->vmcs)
+ return -ENOMEM;
+
+ vmcs_clear(loaded_vmcs->vmcs);
+
+ loaded_vmcs->shadow_vmcs = NULL;
+ loaded_vmcs->hv_timer_soft_disabled = false;
+ loaded_vmcs->cpu = -1;
+ loaded_vmcs->launched = 0;
+
+ if (cpu_has_vmx_msr_bitmap()) {
+ loaded_vmcs->msr_bitmap = (unsigned long *)
+ __get_free_page(GFP_KERNEL_ACCOUNT);
+ if (!loaded_vmcs->msr_bitmap)
+ goto out_vmcs;
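+		/* Start with every MSR access intercepted; passthrough is opened up later. */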
+ memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
+ }
+
+ memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
+ memset(&loaded_vmcs->controls_shadow, 0,
+ sizeof(struct vmcs_controls_shadow));
+
+ return 0;
+
+out_vmcs:
+ free_loaded_vmcs(loaded_vmcs);
+ return -ENOMEM;
+}
+
+static void free_kvm_area(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ free_vmcs(per_cpu(vmxarea, cpu));
+ per_cpu(vmxarea, cpu) = NULL;
+ }
+}
+
+static __init int alloc_kvm_area(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct vmcs *vmcs;
+
+ vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL);
+ if (!vmcs) {
+ free_kvm_area();
+ return -ENOMEM;
+ }
+
+ /*
+ * When eVMCS is enabled, alloc_vmcs_cpu() sets
+ * vmcs->revision_id to KVM_EVMCS_VERSION instead of
+ * revision_id reported by MSR_IA32_VMX_BASIC.
+ *
+		 * However, even though it is not explicitly documented in
+		 * the TLFS, the VMXON region passed to VMXON must still be
+		 * tagged with the revision_id reported by the physical CPU.
+ */
+ if (kvm_is_using_evmcs())
+ vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic);
+
+ per_cpu(vmxarea, cpu) = vmcs;
+ }
+ return 0;
+}
+
+static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
+ struct kvm_segment *save)
+{
+ if (!emulate_invalid_guest_state) {
+ /*
+ * CS and SS RPL should be equal during guest entry according
+ * to VMX spec, but in reality it is not always so. Since vcpu
+ * is in the middle of the transition from real mode to
+		 * protected mode, it is safe to assume that RPL 0 is a good
+ * default value.
+ */
+ if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
+ save->selector &= ~SEGMENT_RPL_MASK;
+ save->dpl = save->selector & SEGMENT_RPL_MASK;
+ save->s = 1;
+ }
+ __vmx_set_segment(vcpu, save, seg);
+}
+
+static void enter_pmode(struct kvm_vcpu *vcpu)
+{
+ unsigned long flags;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+	 * Update the real mode segment cache. It may be out of date if a
+	 * segment register was written while the vCPU was in guest mode.
+ */
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
+
+ vmx->rmode.vm86_active = 0;
+
+ __vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
+
+ flags = vmcs_readl(GUEST_RFLAGS);
+ flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
+ flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
+ vmcs_writel(GUEST_RFLAGS, flags);
+
+ vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
+ (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
+
+ vmx_update_exception_bitmap(vcpu);
+
+ fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
+ fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
+ fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
+ fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
+ fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
+ fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
+}
+
+static void fix_rmode_seg(int seg, struct kvm_segment *save)
+{
+ const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+ struct kvm_segment var = *save;
+
+ var.dpl = 0x3;
+ if (seg == VCPU_SREG_CS)
+ var.type = 0x3;
+
+ if (!emulate_invalid_guest_state) {
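+		/*
+		 * Build a real-mode style segment: selector derived from the
+		 * base (base >> 4), 16-byte aligned base and a 64KiB limit.
+		 */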
+ var.selector = var.base >> 4;
+ var.base = var.base & 0xffff0;
+ var.limit = 0xffff;
+ var.g = 0;
+ var.db = 0;
+ var.present = 1;
+ var.s = 1;
+ var.l = 0;
+ var.unusable = 0;
+ var.type = 0x3;
+ var.avl = 0;
+ if (save->base & 0xf)
+ pr_warn_once("segment base is not paragraph aligned "
+ "when entering protected mode (seg=%d)", seg);
+ }
+
+ vmcs_write16(sf->selector, var.selector);
+ vmcs_writel(sf->base, var.base);
+ vmcs_write32(sf->limit, var.limit);
+ vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
+}
+
+static void enter_rmode(struct kvm_vcpu *vcpu)
+{
+ unsigned long flags;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
+
+ /*
+ * KVM should never use VM86 to virtualize Real Mode when L2 is active,
+ * as using VM86 is unnecessary if unrestricted guest is enabled, and
+ * if unrestricted guest is disabled, VM-Enter (from L1) with CR0.PG=0
+ * should VM-Fail and KVM should reject userspace attempts to stuff
+ * CR0.PG=0 when L2 is active.
+ */
+ WARN_ON_ONCE(is_guest_mode(vcpu));
+
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
+
+ vmx->rmode.vm86_active = 1;
+
+ vmx_segment_cache_clear(vmx);
+
+ vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
+ vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
+ vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
+
+ flags = vmcs_readl(GUEST_RFLAGS);
+ vmx->rmode.save_rflags = flags;
+
+ flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
+
+ vmcs_writel(GUEST_RFLAGS, flags);
+ vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
+ vmx_update_exception_bitmap(vcpu);
+
+ fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
+ fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
+ fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
+ fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
+ fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
+ fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
+}
+
+int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /* Nothing to do if hardware doesn't support EFER. */
+ if (!vmx_find_uret_msr(vmx, MSR_EFER))
+ return 0;
+
+ vcpu->arch.efer = efer;
+#ifdef CONFIG_X86_64
+ if (efer & EFER_LMA)
+ vm_entry_controls_setbit(vmx, VM_ENTRY_IA32E_MODE);
+ else
+ vm_entry_controls_clearbit(vmx, VM_ENTRY_IA32E_MODE);
+#else
+ if (KVM_BUG_ON(efer & EFER_LMA, vcpu->kvm))
+ return 1;
+#endif
+
+ vmx_setup_uret_msrs(vmx);
+ return 0;
+}
+
+#ifdef CONFIG_X86_64
+
+static void enter_lmode(struct kvm_vcpu *vcpu)
+{
+ u32 guest_tr_ar;
+
+ vmx_segment_cache_clear(to_vmx(vcpu));
+
+ guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
+ if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
+		pr_debug_ratelimited("%s: tss fixup for long mode.\n", __func__);
+ vmcs_write32(GUEST_TR_AR_BYTES,
+ (guest_tr_ar & ~VMX_AR_TYPE_MASK)
+ | VMX_AR_TYPE_BUSY_64_TSS);
+ }
+ vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
+}
+
+static void exit_lmode(struct kvm_vcpu *vcpu)
+{
+ vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
+}
+
+#endif
+
+void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * INVEPT must be issued when EPT is enabled, irrespective of VPID, as
+ * the CPU is not required to invalidate guest-physical mappings on
+ * VM-Entry, even if VPID is disabled. Guest-physical mappings are
+ * associated with the root EPT structure and not any particular VPID
+ * (INVVPID also isn't required to invalidate guest-physical mappings).
+ */
+ if (enable_ept) {
+ ept_sync_global();
+ } else if (enable_vpid) {
+ if (cpu_has_vmx_invvpid_global()) {
+ vpid_sync_vcpu_global();
+ } else {
+ vpid_sync_vcpu_single(vmx->vpid);
+ vpid_sync_vcpu_single(vmx->nested.vpid02);
+ }
+ }
+}
+
+static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
+{
+ if (is_guest_mode(vcpu) && nested_cpu_has_vpid(get_vmcs12(vcpu)))
+ return nested_get_vpid02(vcpu);
+ return to_vmx(vcpu)->vpid;
+}
+
+static u64 construct_eptp(hpa_t root_hpa)
+{
+ u64 eptp = root_hpa | VMX_EPTP_MT_WB;
+ struct kvm_mmu_page *root;
+
+ if (kvm_mmu_is_dummy_root(root_hpa))
+ return eptp | VMX_EPTP_PWL_4;
+
+ /*
+ * EPT roots should always have an associated MMU page. Return a "bad"
+	 * EPTP to induce VM-Fail instead of continuing on in an unknown state.
+ */
+ root = root_to_sp(root_hpa);
+ if (WARN_ON_ONCE(!root))
+ return INVALID_PAGE;
+
+ eptp |= (root->role.level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
+
+ if (enable_ept_ad_bits && !root->role.ad_disabled)
+ eptp |= VMX_EPTP_AD_ENABLE_BIT;
+
+ return eptp;
+}
+
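+/*
+ * Worked example, illustrative only: assuming the usual encodings of the EPTP
+ * fields (VMX_EPTP_MT_WB == 6, VMX_EPTP_PWL_4 == 3 << 3, and
+ * VMX_EPTP_AD_ENABLE_BIT == 1 << 6), a 4-level root at HPA 0x1234000 with A/D
+ * bits enabled yields construct_eptp() == 0x1234000 | 0x6 | 0x18 | 0x40 ==
+ * 0x123405e, i.e. the root HPA with the walk length, memory type and A/D
+ * enable packed into the low bits.
+ */
+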
+static void vmx_flush_tlb_ept_root(hpa_t root_hpa)
+{
+ u64 eptp = construct_eptp(root_hpa);
+
+ if (VALID_PAGE(eptp))
+ ept_sync_context(eptp);
+ else
+ ept_sync_global();
+}
+
+void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ u64 root_hpa = mmu->root.hpa;
+
+ /* No flush required if the current context is invalid. */
+ if (!VALID_PAGE(root_hpa))
+ return;
+
+ if (enable_ept)
+ vmx_flush_tlb_ept_root(root_hpa);
+ else
+ vpid_sync_context(vmx_get_current_vpid(vcpu));
+}
+
+void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
+{
+ /*
+ * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in
+ * vmx_flush_tlb_guest() for an explanation of why this is ok.
+ */
+ vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr);
+}
+
+void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
+{
+ /*
+ * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a
+ * vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit are
+ * required to flush GVA->{G,H}PA mappings from the TLB if vpid is
+ * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed),
+ * i.e. no explicit INVVPID is necessary.
+ */
+ vpid_sync_context(vmx_get_current_vpid(vcpu));
+}
+
+void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+
+ if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR))
+ return;
+
+ if (is_pae_paging(vcpu)) {
+ vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
+ vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
+ vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
+ vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
+ }
+}
+
+void ept_save_pdptrs(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+
+ if (WARN_ON_ONCE(!is_pae_paging(vcpu)))
+ return;
+
+ mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
+ mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
+ mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
+ mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
+
+ kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR);
+}
+
+#define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \
+ CPU_BASED_CR3_STORE_EXITING)
+
+bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+ if (is_guest_mode(vcpu))
+ return nested_guest_cr0_valid(vcpu, cr0);
+
+ if (to_vmx(vcpu)->nested.vmxon)
+ return nested_host_cr0_valid(vcpu, cr0);
+
+ return true;
+}
+
+void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long hw_cr0, old_cr0_pg;
+ u32 tmp;
+
+ old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG);
+
+ hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
+ if (enable_unrestricted_guest)
+ hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
+ else {
+ hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
+ if (!enable_ept)
+ hw_cr0 |= X86_CR0_WP;
+
+ if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
+ enter_pmode(vcpu);
+
+ if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
+ enter_rmode(vcpu);
+ }
+
+ vmcs_writel(CR0_READ_SHADOW, cr0);
+ vmcs_writel(GUEST_CR0, hw_cr0);
+ vcpu->arch.cr0 = cr0;
+ kvm_register_mark_available(vcpu, VCPU_EXREG_CR0);
+
+#ifdef CONFIG_X86_64
+ if (vcpu->arch.efer & EFER_LME) {
+ if (!old_cr0_pg && (cr0 & X86_CR0_PG))
+ enter_lmode(vcpu);
+ else if (old_cr0_pg && !(cr0 & X86_CR0_PG))
+ exit_lmode(vcpu);
+ }
+#endif
+
+ if (enable_ept && !enable_unrestricted_guest) {
+ /*
+ * Ensure KVM has an up-to-date snapshot of the guest's CR3. If
+ * the below code _enables_ CR3 exiting, vmx_cache_reg() will
+ * (correctly) stop reading vmcs.GUEST_CR3 because it thinks
+ * KVM's CR3 is installed.
+ */
+ if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
+ vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
+
+ /*
+ * When running with EPT but not unrestricted guest, KVM must
+ * intercept CR3 accesses when paging is _disabled_. This is
+ * necessary because restricted guests can't actually run with
+ * paging disabled, and so KVM stuffs its own CR3 in order to
+		 * run the guest with identity mapped page tables.
+ *
+ * Do _NOT_ check the old CR0.PG, e.g. to optimize away the
+ * update, it may be stale with respect to CR3 interception,
+ * e.g. after nested VM-Enter.
+ *
+ * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or
+ * stores to forward them to L1, even if KVM does not need to
+ * intercept them to preserve its identity mapped page tables.
+ */
+ if (!(cr0 & X86_CR0_PG)) {
+ exec_controls_setbit(vmx, CR3_EXITING_BITS);
+ } else if (!is_guest_mode(vcpu)) {
+ exec_controls_clearbit(vmx, CR3_EXITING_BITS);
+ } else {
+ tmp = exec_controls_get(vmx);
+ tmp &= ~CR3_EXITING_BITS;
+ tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS;
+ exec_controls_set(vmx, tmp);
+ }
+
+ /* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */
+ if ((old_cr0_pg ^ cr0) & X86_CR0_PG)
+ vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
+
+ /*
+ * When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but
+ * GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG.
+ */
+ if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG))
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
+ }
+
+ /* depends on vcpu->arch.cr0 to be set to a new value */
+ vmx->vt.emulation_required = vmx_emulation_required(vcpu);
+}
+
+static int vmx_get_max_ept_level(void)
+{
+ if (cpu_has_vmx_ept_5levels())
+ return 5;
+ return 4;
+}
+
+void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
+{
+ struct kvm *kvm = vcpu->kvm;
+ bool update_guest_cr3 = true;
+ unsigned long guest_cr3;
+
+ if (enable_ept) {
+ KVM_MMU_WARN_ON(root_to_sp(root_hpa) &&
+ root_level != root_to_sp(root_hpa)->role.level);
+ vmcs_write64(EPT_POINTER, construct_eptp(root_hpa));
+
+ hv_track_root_tdp(vcpu, root_hpa);
+
+ if (!enable_unrestricted_guest && !is_paging(vcpu))
+ guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
+ else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3))
+ guest_cr3 = vcpu->arch.cr3;
+ else /* vmcs.GUEST_CR3 is already up-to-date. */
+ update_guest_cr3 = false;
+ vmx_ept_load_pdptrs(vcpu);
+ } else {
+ guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu) |
+ kvm_get_active_cr3_lam_bits(vcpu);
+ }
+
+ if (update_guest_cr3)
+ vmcs_writel(GUEST_CR3, guest_cr3);
+}
+
+bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ /*
+ * We operate under the default treatment of SMM, so VMX cannot be
+ * enabled under SMM. Note, whether or not VMXE is allowed at all,
+ * i.e. is a reserved bit, is handled by common x86 code.
+ */
+ if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu))
+ return false;
+
+ if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
+ return false;
+
+ return true;
+}
+
+void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ unsigned long old_cr4 = kvm_read_cr4(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long hw_cr4;
+
+ /*
+ * Pass through host's Machine Check Enable value to hw_cr4, which
+ * is in force while we are in guest mode. Do not let guests control
+ * this bit, even if host CR4.MCE == 0.
+ */
+ hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
+ if (enable_unrestricted_guest)
+ hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
+ else if (vmx->rmode.vm86_active)
+ hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
+ else
+ hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
+
+ if (vmx_umip_emulated()) {
+ if (cr4 & X86_CR4_UMIP) {
+ secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC);
+ hw_cr4 &= ~X86_CR4_UMIP;
+ } else if (!is_guest_mode(vcpu) ||
+ !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) {
+ secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC);
+ }
+ }
+
+ vcpu->arch.cr4 = cr4;
+ kvm_register_mark_available(vcpu, VCPU_EXREG_CR4);
+
+ if (!enable_unrestricted_guest) {
+ if (enable_ept) {
+ if (!is_paging(vcpu)) {
+ hw_cr4 &= ~X86_CR4_PAE;
+ hw_cr4 |= X86_CR4_PSE;
+ } else if (!(cr4 & X86_CR4_PAE)) {
+ hw_cr4 &= ~X86_CR4_PAE;
+ }
+ }
+
+ /*
+		 * SMEP/SMAP/PKU are disabled if the CPU is in non-paging mode
+		 * in hardware. To emulate this behavior, SMEP/SMAP/PKU need to
+		 * be manually disabled when the guest switches to non-paging
+		 * mode.
+ *
+ * If !enable_unrestricted_guest, the CPU is always running
+ * with CR0.PG=1 and CR4 needs to be modified.
+ * If enable_unrestricted_guest, the CPU automatically
+ * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
+ */
+ if (!is_paging(vcpu))
+ hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
+ }
+
+ vmcs_writel(CR4_READ_SHADOW, cr4);
+ vmcs_writel(GUEST_CR4, hw_cr4);
+
+ if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
+ vcpu->arch.cpuid_dynamic_bits_dirty = true;
+}
+
+void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 ar;
+
+ if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
+ *var = vmx->rmode.segs[seg];
+ if (seg == VCPU_SREG_TR
+ || var->selector == vmx_read_guest_seg_selector(vmx, seg))
+ return;
+ var->base = vmx_read_guest_seg_base(vmx, seg);
+ var->selector = vmx_read_guest_seg_selector(vmx, seg);
+ return;
+ }
+ var->base = vmx_read_guest_seg_base(vmx, seg);
+ var->limit = vmx_read_guest_seg_limit(vmx, seg);
+ var->selector = vmx_read_guest_seg_selector(vmx, seg);
+ ar = vmx_read_guest_seg_ar(vmx, seg);
+ var->unusable = (ar >> 16) & 1;
+ var->type = ar & 15;
+ var->s = (ar >> 4) & 1;
+ var->dpl = (ar >> 5) & 3;
+ /*
+	 * Some userspaces do not preserve the unusable property. Since a
+	 * usable segment has to be present according to the VMX spec, we can
+	 * use the present property to work around the userspace bug by making
+	 * an unusable segment always nonpresent. vmx_segment_access_rights()
+	 * already marks a nonpresent segment as unusable.
+ */
+ var->present = !var->unusable;
+ var->avl = (ar >> 12) & 1;
+ var->l = (ar >> 13) & 1;
+ var->db = (ar >> 14) & 1;
+ var->g = (ar >> 15) & 1;
+}
+
+u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
+{
+ struct kvm_segment s;
+
+ if (to_vmx(vcpu)->rmode.vm86_active) {
+ vmx_get_segment(vcpu, &s, seg);
+ return s.base;
+ }
+ return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
+}
+
+static int __vmx_get_cpl(struct kvm_vcpu *vcpu, bool no_cache)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int ar;
+
+ if (unlikely(vmx->rmode.vm86_active))
+ return 0;
+
+ if (no_cache)
+ ar = vmcs_read32(GUEST_SS_AR_BYTES);
+ else
+ ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
+ return VMX_AR_DPL(ar);
+}
+
+int vmx_get_cpl(struct kvm_vcpu *vcpu)
+{
+ return __vmx_get_cpl(vcpu, false);
+}
+
+int vmx_get_cpl_no_cache(struct kvm_vcpu *vcpu)
+{
+ return __vmx_get_cpl(vcpu, true);
+}
+
+static u32 vmx_segment_access_rights(struct kvm_segment *var)
+{
+ u32 ar;
+
+ ar = var->type & 15;
+ ar |= (var->s & 1) << 4;
+ ar |= (var->dpl & 3) << 5;
+ ar |= (var->present & 1) << 7;
+ ar |= (var->avl & 1) << 12;
+ ar |= (var->l & 1) << 13;
+ ar |= (var->db & 1) << 14;
+ ar |= (var->g & 1) << 15;
+ ar |= (var->unusable || !var->present) << 16;
+
+ return ar;
+}
+
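+/*
+ * Worked example, illustrative only: a flat 32-bit code segment with
+ * type = 0xb (execute/read, accessed), s = 1, dpl = 0, present = 1, db = 1
+ * and g = 1 packs to 0xb | (1 << 4) | (1 << 7) | (1 << 14) | (1 << 15) ==
+ * 0xc09b, the familiar access-rights value for such a segment; an unusable
+ * or non-present segment additionally sets bit 16.
+ */
+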
+void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+
+ vmx_segment_cache_clear(vmx);
+
+ if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
+ vmx->rmode.segs[seg] = *var;
+ if (seg == VCPU_SREG_TR)
+ vmcs_write16(sf->selector, var->selector);
+ else if (var->s)
+ fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
+ return;
+ }
+
+ vmcs_writel(sf->base, var->base);
+ vmcs_write32(sf->limit, var->limit);
+ vmcs_write16(sf->selector, var->selector);
+
+ /*
+ * Fix the "Accessed" bit in AR field of segment registers for older
+ * qemu binaries.
+ * IA32 arch specifies that at the time of processor reset the
+ * "Accessed" bit in the AR field of segment registers is 1. And qemu
+ * is setting it to 0 in the userland code. This causes invalid guest
+ * state vmexit when "unrestricted guest" mode is turned on.
+ * Fix for this setup issue in cpu_reset is being pushed in the qemu
+ * tree. Newer qemu binaries with that qemu fix would not need this
+ * kvm hack.
+ */
+ if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR))
+ var->type |= 0x1; /* Accessed */
+
+ vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
+}
+
+void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
+{
+ __vmx_set_segment(vcpu, var, seg);
+
+ to_vmx(vcpu)->vt.emulation_required = vmx_emulation_required(vcpu);
+}
+
+void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
+{
+ u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
+
+ *db = (ar >> 14) & 1;
+ *l = (ar >> 13) & 1;
+}
+
+void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
+ dt->address = vmcs_readl(GUEST_IDTR_BASE);
+}
+
+void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
+ vmcs_writel(GUEST_IDTR_BASE, dt->address);
+}
+
+void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
+ dt->address = vmcs_readl(GUEST_GDTR_BASE);
+}
+
+void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
+ vmcs_writel(GUEST_GDTR_BASE, dt->address);
+}
+
+static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
+{
+ struct kvm_segment var;
+ u32 ar;
+
+ vmx_get_segment(vcpu, &var, seg);
+ var.dpl = 0x3;
+ if (seg == VCPU_SREG_CS)
+ var.type = 0x3;
+ ar = vmx_segment_access_rights(&var);
+
+ if (var.base != (var.selector << 4))
+ return false;
+ if (var.limit != 0xffff)
+ return false;
+ if (ar != 0xf3)
+ return false;
+
+ return true;
+}
+
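+/*
+ * Illustrative note, not part of the original change: the 0xf3 constant
+ * checked above is simply vmx_segment_access_rights() applied to a valid
+ * real-mode segment, i.e. type = 3, s = 1, dpl = 3, present = 1 and all
+ * other bits clear: 0x3 | (1 << 4) | (3 << 5) | (1 << 7) == 0xf3.
+ */
+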
+static bool code_segment_valid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment cs;
+ unsigned int cs_rpl;
+
+ vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+ cs_rpl = cs.selector & SEGMENT_RPL_MASK;
+
+ if (cs.unusable)
+ return false;
+ if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK))
+ return false;
+ if (!cs.s)
+ return false;
+ if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) {
+ if (cs.dpl > cs_rpl)
+ return false;
+ } else {
+ if (cs.dpl != cs_rpl)
+ return false;
+ }
+ if (!cs.present)
+ return false;
+
+	/*
+	 * TODO: Add a Reserved field check; this will require a new member
+	 * in the kvm_segment_field structure.
+	 */
+ return true;
+}
+
+static bool stack_segment_valid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment ss;
+ unsigned int ss_rpl;
+
+ vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
+ ss_rpl = ss.selector & SEGMENT_RPL_MASK;
+
+ if (ss.unusable)
+ return true;
+ if (ss.type != 3 && ss.type != 7)
+ return false;
+ if (!ss.s)
+ return false;
+ if (ss.dpl != ss_rpl) /* DPL != RPL */
+ return false;
+ if (!ss.present)
+ return false;
+
+ return true;
+}
+
+static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
+{
+ struct kvm_segment var;
+ unsigned int rpl;
+
+ vmx_get_segment(vcpu, &var, seg);
+ rpl = var.selector & SEGMENT_RPL_MASK;
+
+ if (var.unusable)
+ return true;
+ if (!var.s)
+ return false;
+ if (!var.present)
+ return false;
+ if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) {
+ if (var.dpl < rpl) /* DPL < RPL */
+ return false;
+ }
+
+	/*
+	 * TODO: Add other members to kvm_segment_field to allow checking
+	 * for other access rights flags.
+	 */
+ return true;
+}
+
+static bool tr_valid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment tr;
+
+ vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
+
+ if (tr.unusable)
+ return false;
+ if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */
+ return false;
+ if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
+ return false;
+ if (!tr.present)
+ return false;
+
+ return true;
+}
+
+static bool ldtr_valid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment ldtr;
+
+ vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
+
+ if (ldtr.unusable)
+ return true;
+ if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */
+ return false;
+ if (ldtr.type != 2)
+ return false;
+ if (!ldtr.present)
+ return false;
+
+ return true;
+}
+
+static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment cs, ss;
+
+ vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+ vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
+
+ return ((cs.selector & SEGMENT_RPL_MASK) ==
+ (ss.selector & SEGMENT_RPL_MASK));
+}
+
+/*
+ * Check if guest state is valid. Returns true if valid, false if
+ * not.
+ * We assume that registers are always usable.
+ */
+bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu)
+{
+ /* real mode guest state checks */
+ if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
+ return false;
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
+ return false;
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
+ return false;
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
+ return false;
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
+ return false;
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
+ return false;
+ } else {
+ /* protected mode guest state checks */
+ if (!cs_ss_rpl_check(vcpu))
+ return false;
+ if (!code_segment_valid(vcpu))
+ return false;
+ if (!stack_segment_valid(vcpu))
+ return false;
+ if (!data_segment_valid(vcpu, VCPU_SREG_DS))
+ return false;
+ if (!data_segment_valid(vcpu, VCPU_SREG_ES))
+ return false;
+ if (!data_segment_valid(vcpu, VCPU_SREG_FS))
+ return false;
+ if (!data_segment_valid(vcpu, VCPU_SREG_GS))
+ return false;
+ if (!tr_valid(vcpu))
+ return false;
+ if (!ldtr_valid(vcpu))
+ return false;
+ }
+ /* TODO:
+ * - Add checks on RIP
+ * - Add checks on RFLAGS
+ */
+
+ return true;
+}
+
+static int init_rmode_tss(struct kvm *kvm, void __user *ua)
+{
+ const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
+ u16 data;
+ int i;
+
+ for (i = 0; i < 3; i++) {
+ if (__copy_to_user(ua + PAGE_SIZE * i, zero_page, PAGE_SIZE))
+ return -EFAULT;
+ }
+
+ data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
+ if (__copy_to_user(ua + TSS_IOPB_BASE_OFFSET, &data, sizeof(u16)))
+ return -EFAULT;
+
+ data = ~0;
+ if (__copy_to_user(ua + RMODE_TSS_SIZE - 1, &data, sizeof(u8)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int init_rmode_identity_map(struct kvm *kvm)
+{
+ struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
+ int i, r = 0;
+ void __user *uaddr;
+ u32 tmp;
+
+ /* Protect kvm_vmx->ept_identity_pagetable_done. */
+ mutex_lock(&kvm->slots_lock);
+
+ if (likely(kvm_vmx->ept_identity_pagetable_done))
+ goto out;
+
+ if (!kvm_vmx->ept_identity_map_addr)
+ kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
+
+ uaddr = __x86_set_memory_region(kvm,
+ IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
+ kvm_vmx->ept_identity_map_addr,
+ PAGE_SIZE);
+ if (IS_ERR(uaddr)) {
+ r = PTR_ERR(uaddr);
+ goto out;
+ }
+
+ /* Set up identity-mapping pagetable for EPT in real mode */
+ for (i = 0; i < (PAGE_SIZE / sizeof(tmp)); i++) {
+ tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
+ _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
+ if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) {
+ r = -EFAULT;
+ goto out;
+ }
+ }
+ kvm_vmx->ept_identity_pagetable_done = true;
+
+out:
+ mutex_unlock(&kvm->slots_lock);
+ return r;
+}
+
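+/*
+ * Worked example, illustrative only: each entry written above is a 4 MiB PSE
+ * page-directory entry mapping guest physical i << 22 to itself, with flags
+ * _PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY |
+ * _PAGE_PSE == 0xe7 under the standard x86 encodings.  Entry 1, for
+ * instance, is 0x004000e7, and the 1024 entries in the page identity-map the
+ * first 4 GiB.
+ */
+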
+static void seg_setup(int seg)
+{
+ const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+ unsigned int ar;
+
+ vmcs_write16(sf->selector, 0);
+ vmcs_writel(sf->base, 0);
+ vmcs_write32(sf->limit, 0xffff);
+ ar = 0x93;
+ if (seg == VCPU_SREG_CS)
+ ar |= 0x08; /* code segment */
+
+ vmcs_write32(sf->ar_bytes, ar);
+}
+
+int allocate_vpid(void)
+{
+ int vpid;
+
+ if (!enable_vpid)
+ return 0;
+ spin_lock(&vmx_vpid_lock);
+ vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
+ if (vpid < VMX_NR_VPIDS)
+ __set_bit(vpid, vmx_vpid_bitmap);
+ else
+ vpid = 0;
+ spin_unlock(&vmx_vpid_lock);
+ return vpid;
+}
+
+void free_vpid(int vpid)
+{
+ if (!enable_vpid || vpid == 0)
+ return;
+ spin_lock(&vmx_vpid_lock);
+ __clear_bit(vpid, vmx_vpid_bitmap);
+ spin_unlock(&vmx_vpid_lock);
+}
+
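+/*
+ * Usage sketch, illustrative only: callers are expected to pair the two
+ * helpers above and to treat a return value of 0 as "no VPID" rather than as
+ * an error, since 0 is reserved for the host and is also what allocate_vpid()
+ * returns when VPID is disabled or the pool is exhausted:
+ *
+ *	vmx->vpid = allocate_vpid();
+ *	...
+ *	free_vpid(vmx->vpid);	(safe no-op when vpid == 0)
+ */
+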
+static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
+{
+ /*
+ * When KVM is a nested hypervisor on top of Hyper-V and uses
+ * 'Enlightened MSR Bitmap' feature L0 needs to know that MSR
+ * bitmap has changed.
+ */
+ if (kvm_is_using_evmcs()) {
+ struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
+
+ if (evmcs->hv_enlightenments_control.msr_bitmap)
+ evmcs->hv_clean_fields &=
+ ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
+ }
+
+ vmx->nested.force_msr_bitmap_recalc = true;
+}
+
+void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
+
+ if (!cpu_has_vmx_msr_bitmap())
+ return;
+
+ vmx_msr_bitmap_l01_changed(vmx);
+
+ if (type & MSR_TYPE_R) {
+ if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
+ vmx_clear_msr_bitmap_read(msr_bitmap, msr);
+ else
+ vmx_set_msr_bitmap_read(msr_bitmap, msr);
+ }
+
+ if (type & MSR_TYPE_W) {
+ if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
+ vmx_clear_msr_bitmap_write(msr_bitmap, msr);
+ else
+ vmx_set_msr_bitmap_write(msr_bitmap, msr);
+ }
+}
+
+static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
+{
+ /*
+ * x2APIC indices for 64-bit accesses into the RDMSR and WRMSR halves
+ * of the MSR bitmap. KVM emulates APIC registers up through 0x3f0,
+ * i.e. MSR 0x83f, and so only needs to dynamically manipulate 64 bits.
+ */
+ const int read_idx = APIC_BASE_MSR / BITS_PER_LONG_LONG;
+ const int write_idx = read_idx + (0x800 / sizeof(u64));
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u64 *msr_bitmap = (u64 *)vmx->vmcs01.msr_bitmap;
+ u8 mode;
+
+ if (!cpu_has_vmx_msr_bitmap() || WARN_ON_ONCE(!lapic_in_kernel(vcpu)))
+ return;
+
+ if (cpu_has_secondary_exec_ctrls() &&
+ (secondary_exec_controls_get(vmx) &
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
+ mode = MSR_BITMAP_MODE_X2APIC;
+ if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
+ mode |= MSR_BITMAP_MODE_X2APIC_APICV;
+ } else {
+ mode = 0;
+ }
+
+ if (mode == vmx->x2apic_msr_bitmap_mode)
+ return;
+
+ vmx->x2apic_msr_bitmap_mode = mode;
+
+ /*
+ * Reset the bitmap for MSRs 0x800 - 0x83f. Leave AMD's uber-extended
+	 * registers (0x840 and above) intercepted; KVM doesn't support them.
+ * Intercept all writes by default and poke holes as needed. Pass
+ * through reads for all valid registers by default in x2APIC+APICv
+ * mode, only the current timer count needs on-demand emulation by KVM.
+ */
+ if (mode & MSR_BITMAP_MODE_X2APIC_APICV)
+ msr_bitmap[read_idx] = ~kvm_lapic_readable_reg_mask(vcpu->arch.apic);
+ else
+ msr_bitmap[read_idx] = ~0ull;
+ msr_bitmap[write_idx] = ~0ull;
+
+ /*
+ * TPR reads and writes can be virtualized even if virtual interrupt
+ * delivery is not in use.
+ */
+ vmx_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW,
+ !(mode & MSR_BITMAP_MODE_X2APIC));
+
+ if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
+ vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
+ vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
+ if (enable_ipiv)
+ vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW);
+ }
+}
+
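+/*
+ * Worked example, illustrative only: with APIC_BASE_MSR == 0x800, the
+ * read_idx above is 0x800 / 64 == 32, i.e. the u64 in the read-low half of
+ * the 4 KiB bitmap that covers MSRs 0x800 - 0x83f, and write_idx is
+ * 32 + 0x800 / 8 == 288, the matching u64 in the write-low half, which is
+ * why a single u64 store per half is enough to reset all x2APIC intercepts.
+ */
+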
+void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
+ u32 i;
+
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag);
+ for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) {
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
+ }
+}
+
+static void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
+{
+ bool intercept;
+
+ if (!cpu_has_vmx_msr_bitmap())
+ return;
+
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
+#ifdef CONFIG_X86_64
+ vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+#endif
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
+ if (kvm_cstate_in_guest(vcpu->kvm)) {
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
+ }
+ if (kvm_aperfmperf_in_guest(vcpu->kvm)) {
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R);
+ }
+
+ /* PT MSRs can be passed through iff PT is exposed to the guest. */
+ if (vmx_pt_mode_is_host_guest())
+ pt_update_intercept_for_msr(vcpu);
+
+ if (vcpu->arch.xfd_no_write_intercept)
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, MSR_TYPE_RW);
+
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW,
+ !to_vmx(vcpu)->spec_ctrl);
+
+ if (kvm_cpu_cap_has(X86_FEATURE_XFD))
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD));
+
+ if (cpu_feature_enabled(X86_FEATURE_IBPB))
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
+ !guest_has_pred_cmd_msr(vcpu));
+
+ if (cpu_feature_enabled(X86_FEATURE_FLUSH_L1D))
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));
+
+ if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) {
+ intercept = !guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK);
+
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL0_SSP, MSR_TYPE_RW, intercept);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL1_SSP, MSR_TYPE_RW, intercept);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL2_SSP, MSR_TYPE_RW, intercept);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL3_SSP, MSR_TYPE_RW, intercept);
+ }
+
+ if (kvm_cpu_cap_has(X86_FEATURE_SHSTK) || kvm_cpu_cap_has(X86_FEATURE_IBT)) {
+ intercept = !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK);
+
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_U_CET, MSR_TYPE_RW, intercept);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_S_CET, MSR_TYPE_RW, intercept);
+ }
+
+ /*
+ * x2APIC and LBR MSR intercepts are modified on-demand and cannot be
+ * filtered by userspace.
+ */
+}
+
+void vmx_recalc_intercepts(struct kvm_vcpu *vcpu)
+{
+ vmx_recalc_msr_intercepts(vcpu);
+}
+
+static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
+ int vector)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * DO NOT query the vCPU's vmcs12, as vmcs12 is dynamically allocated
+ * and freed, and must not be accessed outside of vcpu->mutex. The
+	 * vCPU's cached PI NV is valid if and only if posted interrupts are
+	 * enabled in its vmcs12, i.e. checking the vector also checks that
+ * L1 has enabled posted interrupts for L2.
+ */
+ if (is_guest_mode(vcpu) &&
+ vector == vmx->nested.posted_intr_nv) {
+ /*
+ * If a posted intr is not recognized by hardware,
+ * we will accomplish it in the next vmentry.
+ */
+ vmx->nested.pi_pending = true;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ /*
+ * This pairs with the smp_mb_*() after setting vcpu->mode in
+ * vcpu_enter_guest() to guarantee the vCPU sees the event
+ * request if triggering a posted interrupt "fails" because
+ * vcpu->mode != IN_GUEST_MODE. The extra barrier is needed as
+		 * the smp_wmb() in kvm_make_request() only ensures everything
+ * done before making the request is visible when the request
+ * is visible, it doesn't ensure ordering between the store to
+ * vcpu->requests and the load from vcpu->mode.
+ */
+ smp_mb__after_atomic();
+
+ /* the PIR and ON have been set by L1. */
+ kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR);
+ return 0;
+ }
+ return -1;
+}
+/*
+ * Send an interrupt to the vCPU via a posted interrupt.
+ * 1. If the target vCPU is running (non-root mode), send a posted interrupt
+ * notification and hardware will sync the PIR to the vIRR atomically.
+ * 2. If the target vCPU isn't running (root mode), kick it to pick up the
+ * interrupt from the PIR on the next VM-Entry.
+ */
+static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
+{
+ struct vcpu_vt *vt = to_vt(vcpu);
+ int r;
+
+ r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
+ if (!r)
+ return 0;
+
+ /* Note, this is called iff the local APIC is in-kernel. */
+ if (!vcpu->arch.apic->apicv_active)
+ return -1;
+
+ __vmx_deliver_posted_interrupt(vcpu, &vt->pi_desc, vector);
+ return 0;
+}
+
+void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
+ int trig_mode, int vector)
+{
+ struct kvm_vcpu *vcpu = apic->vcpu;
+
+ if (vmx_deliver_posted_interrupt(vcpu, vector)) {
+ kvm_lapic_set_irr(vector, apic);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_vcpu_kick(vcpu);
+ } else {
+ trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode,
+ trig_mode, vector);
+ }
+}
+
+/*
+ * Set up the vmcs's constant host-state fields, i.e., host-state fields that
+ * will not change in the lifetime of the guest.
+ * Note that host-state that does change is set elsewhere. E.g., host-state
+ * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
+ */
+void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
+{
+ u32 low32, high32;
+ unsigned long tmpl;
+ unsigned long cr0, cr3, cr4;
+
+ cr0 = read_cr0();
+ WARN_ON(cr0 & X86_CR0_TS);
+ vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */
+
+ /*
+ * Save the most likely value for this task's CR3 in the VMCS.
+ * We can't use __get_current_cr3_fast() because we're not atomic.
+ */
+ cr3 = __read_cr3();
+ vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */
+ vmx->loaded_vmcs->host_state.cr3 = cr3;
+
+ /* Save the most likely value for this task's CR4 in the VMCS. */
+ cr4 = cr4_read_shadow();
+ vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */
+ vmx->loaded_vmcs->host_state.cr4 = cr4;
+
+ vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
+#ifdef CONFIG_X86_64
+ /*
+ * Load null selectors, so we can avoid reloading them in
+ * vmx_prepare_switch_to_host(), in case userspace uses
+ * the null selectors too (the expected case).
+ */
+ vmcs_write16(HOST_DS_SELECTOR, 0);
+ vmcs_write16(HOST_ES_SELECTOR, 0);
+#else
+ vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
+ vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
+#endif
+ vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
+ vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
+
+ vmcs_writel(HOST_IDTR_BASE, host_idt_base); /* 22.2.4 */
+
+ vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */
+
+ rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
+ vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
+
+ /*
+ * SYSENTER is used for 32-bit system calls on either 32-bit or
+	 * 64-bit kernels. It is always zero if neither is allowed, otherwise
+ * vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may
+ * have already done so!).
+ */
+ if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32))
+ vmcs_writel(HOST_IA32_SYSENTER_ESP, 0);
+
+ rdmsrq(MSR_IA32_SYSENTER_EIP, tmpl);
+ vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */
+
+ if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
+ rdmsr(MSR_IA32_CR_PAT, low32, high32);
+ vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
+ }
+
+ if (cpu_has_load_ia32_efer())
+ vmcs_write64(HOST_IA32_EFER, kvm_host.efer);
+
+ /*
+ * Supervisor shadow stack is not enabled on host side, i.e.,
+	 * host IA32_S_CET.SHSTK_EN bit is guaranteed to be 0 now. Per the SDM
+	 * description (RDSSP instruction), SSP is not readable in CPL0,
+ * so resetting the two registers to 0s at VM-Exit does no harm
+ * to kernel execution. When execution flow exits to userspace,
+ * SSP is reloaded from IA32_PL3_SSP. Check SDM Vol.2A/B Chapter
+ * 3 and 4 for details.
+ */
+ if (cpu_has_load_cet_ctrl()) {
+ vmcs_writel(HOST_S_CET, kvm_host.s_cet);
+ vmcs_writel(HOST_SSP, 0);
+ vmcs_writel(HOST_INTR_SSP_TABLE, 0);
+ }
+}
+
+void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
+{
+ struct kvm_vcpu *vcpu = &vmx->vcpu;
+
+ vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS &
+ ~vcpu->arch.cr4_guest_rsvd_bits;
+ if (!enable_ept) {
+ vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_TLBFLUSH_BITS;
+ vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PDPTR_BITS;
+ }
+ if (is_guest_mode(&vmx->vcpu))
+ vcpu->arch.cr4_guest_owned_bits &=
+ ~get_vmcs12(vcpu)->cr4_guest_host_mask;
+ vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits);
+}
+
+static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
+{
+ u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
+
+ if (!kvm_vcpu_apicv_active(&vmx->vcpu))
+ pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
+
+ if (!enable_vnmi)
+ pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;
+
+ if (!enable_preemption_timer)
+ pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+
+ return pin_based_exec_ctrl;
+}
+
+static u32 vmx_get_initial_vmentry_ctrl(void)
+{
+ u32 vmentry_ctrl = vmcs_config.vmentry_ctrl;
+
+ if (vmx_pt_mode_is_system())
+ vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP |
+ VM_ENTRY_LOAD_IA32_RTIT_CTL);
+ /*
+ * IA32e mode, and loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically.
+ */
+ vmentry_ctrl &= ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
+ VM_ENTRY_LOAD_IA32_EFER |
+ VM_ENTRY_IA32E_MODE);
+
+ return vmentry_ctrl;
+}
+
+static u32 vmx_get_initial_vmexit_ctrl(void)
+{
+ u32 vmexit_ctrl = vmcs_config.vmexit_ctrl;
+
+ /*
+ * Not used by KVM and never set in vmcs01 or vmcs02, but emulated for
+ * nested virtualization and thus allowed to be set in vmcs12.
+ */
+ vmexit_ctrl &= ~(VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER |
+ VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
+
+ if (vmx_pt_mode_is_system())
+ vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP |
+ VM_EXIT_CLEAR_IA32_RTIT_CTL);
+ /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
+ return vmexit_ctrl &
+ ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER);
+}
+
+void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (is_guest_mode(vcpu)) {
+ vmx->nested.update_vmcs01_apicv_status = true;
+ return;
+ }
+
+ pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
+
+ secondary_exec_controls_changebit(vmx,
+ SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY,
+ kvm_vcpu_apicv_active(vcpu));
+ if (enable_ipiv)
+ tertiary_exec_controls_changebit(vmx, TERTIARY_EXEC_IPI_VIRT,
+ kvm_vcpu_apicv_active(vcpu));
+
+ vmx_update_msr_bitmap_x2apic(vcpu);
+}
+
+static u32 vmx_exec_control(struct vcpu_vmx *vmx)
+{
+ u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
+
+ /*
+ * Not used by KVM, but fully supported for nesting, i.e. are allowed in
+ * vmcs12 and propagated to vmcs02 when set in vmcs12.
+ */
+ exec_control &= ~(CPU_BASED_RDTSC_EXITING |
+ CPU_BASED_USE_IO_BITMAPS |
+ CPU_BASED_MONITOR_TRAP_FLAG |
+ CPU_BASED_PAUSE_EXITING);
+
+ /* INTR_WINDOW_EXITING and NMI_WINDOW_EXITING are toggled dynamically */
+ exec_control &= ~(CPU_BASED_INTR_WINDOW_EXITING |
+ CPU_BASED_NMI_WINDOW_EXITING);
+
+ if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
+ exec_control &= ~CPU_BASED_MOV_DR_EXITING;
+
+ if (!cpu_need_tpr_shadow(&vmx->vcpu))
+ exec_control &= ~CPU_BASED_TPR_SHADOW;
+
+#ifdef CONFIG_X86_64
+ if (exec_control & CPU_BASED_TPR_SHADOW)
+ exec_control &= ~(CPU_BASED_CR8_LOAD_EXITING |
+ CPU_BASED_CR8_STORE_EXITING);
+ else
+ exec_control |= CPU_BASED_CR8_STORE_EXITING |
+ CPU_BASED_CR8_LOAD_EXITING;
+#endif
+	/* No need to intercept CR3 access or INVLPG when using EPT. */
+ if (enable_ept)
+ exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
+ CPU_BASED_CR3_STORE_EXITING |
+ CPU_BASED_INVLPG_EXITING);
+ if (kvm_mwait_in_guest(vmx->vcpu.kvm))
+ exec_control &= ~(CPU_BASED_MWAIT_EXITING |
+ CPU_BASED_MONITOR_EXITING);
+ if (kvm_hlt_in_guest(vmx->vcpu.kvm))
+ exec_control &= ~CPU_BASED_HLT_EXITING;
+ return exec_control;
+}
+
+static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx)
+{
+ u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl;
+
+ /*
+ * IPI virtualization relies on APICv. Disable IPI virtualization if
+ * APICv is inhibited.
+ */
+ if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu))
+ exec_control &= ~TERTIARY_EXEC_IPI_VIRT;
+
+ return exec_control;
+}
+
+/*
+ * Adjust a single secondary execution control bit to intercept/allow an
+ * instruction in the guest. This is usually done based on whether or not a
+ * feature has been exposed to the guest in order to correctly emulate faults.
+ */
+static inline void
+vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
+ u32 control, bool enabled, bool exiting)
+{
+ /*
+ * If the control is for an opt-in feature, clear the control if the
+ * feature is not exposed to the guest, i.e. not enabled. If the
+ * control is opt-out, i.e. an exiting control, clear the control if
+ * the feature _is_ exposed to the guest, i.e. exiting/interception is
+ * disabled for the associated instruction. Note, the caller is
+	 * responsible for presetting exec_control to set all supported bits.
+ */
+ if (enabled == exiting)
+ *exec_control &= ~control;
+
+ /*
+ * Update the nested MSR settings so that a nested VMM can/can't set
+ * controls for features that are/aren't exposed to the guest.
+ */
+ if (nested &&
+ kvm_check_has_quirk(vmx->vcpu.kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) {
+ /*
+		 * All features that can be added to or removed from VMX MSRs must
+ * be supported in the first place for nested virtualization.
+ */
+ if (WARN_ON_ONCE(!(vmcs_config.nested.secondary_ctls_high & control)))
+ enabled = false;
+
+ if (enabled)
+ vmx->nested.msrs.secondary_ctls_high |= control;
+ else
+ vmx->nested.msrs.secondary_ctls_high &= ~control;
+ }
+}
+
+/*
+ * Wrapper macro for the common case of adjusting a secondary execution control
+ * based on a single guest CPUID bit, with a dedicated feature bit. This also
+ * verifies that the control is actually supported by KVM and hardware.
+ */
+#define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \
+({ \
+ struct kvm_vcpu *__vcpu = &(vmx)->vcpu; \
+ bool __enabled; \
+ \
+ if (cpu_has_vmx_##name()) { \
+ __enabled = guest_cpu_cap_has(__vcpu, X86_FEATURE_##feat_name); \
+ vmx_adjust_secondary_exec_control(vmx, exec_control, SECONDARY_EXEC_##ctrl_name,\
+ __enabled, exiting); \
+ } \
+})
+
+/* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */
+#define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \
+ vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false)
+
+#define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \
+ vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true)
+
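+/*
+ * Expansion sketch, illustrative only: vmx_adjust_sec_exec_feature(vmx,
+ * &exec_control, xsaves, XSAVES), as used below, expands to
+ * vmx_adjust_sec_exec_control(vmx, &exec_control, xsaves, XSAVES,
+ * ENABLE_XSAVES, false), i.e. if cpu_has_vmx_xsaves() it passes
+ * SECONDARY_EXEC_ENABLE_XSAVES with enabled = guest_cpu_cap_has(vcpu,
+ * X86_FEATURE_XSAVES) and exiting = false to the helper above.
+ */
+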
+static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
+{
+ struct kvm_vcpu *vcpu = &vmx->vcpu;
+
+ u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
+
+ if (vmx_pt_mode_is_system())
+ exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX);
+ if (!cpu_need_virtualize_apic_accesses(vcpu))
+ exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ if (vmx->vpid == 0)
+ exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
+ if (!enable_ept) {
+ exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
+ exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
+ enable_unrestricted_guest = 0;
+ }
+ if (!enable_unrestricted_guest)
+ exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
+ if (kvm_pause_in_guest(vmx->vcpu.kvm))
+ exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
+ if (!kvm_vcpu_apicv_active(vcpu))
+ exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+ exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+
+ /*
+ * KVM doesn't support VMFUNC for L1, but the control is set in KVM's
+ * base configuration as KVM emulates VMFUNC[EPTP_SWITCHING] for L2.
+ */
+ exec_control &= ~SECONDARY_EXEC_ENABLE_VMFUNC;
+
+	/*
+	 * SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
+	 * in vmx_set_cr4.
+	 */
+	exec_control &= ~SECONDARY_EXEC_DESC;
+
+	/*
+	 * SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
+	 * (handle_vmptrld).  We can NOT enable shadow_vmcs here because we
+	 * don't yet have a current VMCS12.
+	 */
+ exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
+
+ /*
+	 * PML is enabled/disabled when dirty logging of memslots changes, but
+ * it needs to be set here when dirty logging is already active, e.g.
+ * if this vCPU was created after dirty logging was enabled.
+ */
+ if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
+ exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
+
+ vmx_adjust_sec_exec_feature(vmx, &exec_control, xsaves, XSAVES);
+
+ /*
+ * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either
+ * feature is exposed to the guest. This creates a virtualization hole
+ * if both are supported in hardware but only one is exposed to the
+ * guest, but letting the guest execute RDTSCP or RDPID when either one
+ * is advertised is preferable to emulating the advertised instruction
+ * in KVM on #UD, and obviously better than incorrectly injecting #UD.
+ */
+ if (cpu_has_vmx_rdtscp()) {
+ bool rdpid_or_rdtscp_enabled =
+ guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) ||
+ guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID);
+
+ vmx_adjust_secondary_exec_control(vmx, &exec_control,
+ SECONDARY_EXEC_ENABLE_RDTSCP,
+ rdpid_or_rdtscp_enabled, false);
+ }
+
+ vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);
+
+ vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
+ vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED);
+
+ vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG,
+ ENABLE_USR_WAIT_PAUSE, false);
+
+ if (!vcpu->kvm->arch.bus_lock_detection_enabled)
+ exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION;
+
+ if (!kvm_notify_vmexit_enabled(vcpu->kvm))
+ exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING;
+
+ return exec_control;
+}
+
+static inline int vmx_get_pid_table_order(struct kvm *kvm)
+{
+ return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table));
+}
+
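+/*
+ * Worked example, illustrative only: assuming 8-byte PID-table entries, a
+ * hypothetical kvm->arch.max_vcpu_ids of 1024 gives an 8 KiB table, so
+ * vmx_get_pid_table_order() returns 1 and the allocation below spans two
+ * contiguous pages.
+ */
+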
+static int vmx_alloc_ipiv_pid_table(struct kvm *kvm)
+{
+ struct page *pages;
+ struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
+
+ if (!irqchip_in_kernel(kvm) || !enable_ipiv)
+ return 0;
+
+ if (kvm_vmx->pid_table)
+ return 0;
+
+ pages = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO,
+ vmx_get_pid_table_order(kvm));
+ if (!pages)
+ return -ENOMEM;
+
+ kvm_vmx->pid_table = (void *)page_address(pages);
+ return 0;
+}
+
+int vmx_vcpu_precreate(struct kvm *kvm)
+{
+ return vmx_alloc_ipiv_pid_table(kvm);
+}
+
+#define VMX_XSS_EXIT_BITMAP 0
+
+static void init_vmcs(struct vcpu_vmx *vmx)
+{
+ struct kvm *kvm = vmx->vcpu.kvm;
+ struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
+
+ if (nested)
+ nested_vmx_set_vmcs_shadowing_bitmap();
+
+ if (cpu_has_vmx_msr_bitmap())
+ vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
+
+ vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); /* 22.3.1.5 */
+
+ /* Control */
+ pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
+
+ exec_controls_set(vmx, vmx_exec_control(vmx));
+
+ if (cpu_has_secondary_exec_ctrls()) {
+ secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
+ if (vmx->ve_info)
+ vmcs_write64(VE_INFORMATION_ADDRESS,
+ __pa(vmx->ve_info));
+ }
+
+ if (cpu_has_tertiary_exec_ctrls())
+ tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx));
+
+ if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) {
+ vmcs_write64(EOI_EXIT_BITMAP0, 0);
+ vmcs_write64(EOI_EXIT_BITMAP1, 0);
+ vmcs_write64(EOI_EXIT_BITMAP2, 0);
+ vmcs_write64(EOI_EXIT_BITMAP3, 0);
+
+ vmcs_write16(GUEST_INTR_STATUS, 0);
+
+ vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
+ vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->vt.pi_desc)));
+ }
+
+ if (vmx_can_use_ipiv(&vmx->vcpu)) {
+ vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table));
+ vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids - 1);
+ }
+
+ if (!kvm_pause_in_guest(kvm)) {
+ vmcs_write32(PLE_GAP, ple_gap);
+ vmx->ple_window = ple_window;
+ vmx->ple_window_dirty = true;
+ }
+
+ if (kvm_notify_vmexit_enabled(kvm))
+ vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window);
+
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
+ vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
+
+ vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */
+ vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */
+ vmx_set_constant_host_state(vmx);
+ vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
+ vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
+
+ if (cpu_has_vmx_vmfunc())
+ vmcs_write64(VM_FUNCTION_CONTROL, 0);
+
+ vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
+ vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
+ vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
+
+ if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+ vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
+
+ vm_exit_controls_set(vmx, vmx_get_initial_vmexit_ctrl());
+
+ /* 22.2.1, 20.8.1 */
+ vm_entry_controls_set(vmx, vmx_get_initial_vmentry_ctrl());
+
+ vmx->vcpu.arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
+ vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits);
+
+ set_cr4_guest_host_mask(vmx);
+
+ if (vmx->vpid != 0)
+ vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
+
+ if (cpu_has_vmx_xsaves())
+ vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
+
+ if (enable_pml) {
+ vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
+ vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX);
+ }
+
+ vmx_write_encls_bitmap(&vmx->vcpu, NULL);
+
+ if (vmx_pt_mode_is_host_guest()) {
+ memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
+		/* Bits[6:0] are forced to 1; writes to them are ignored. */
+ vmx->pt_desc.guest.output_mask = 0x7F;
+ vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
+ }
+
+ vmcs_write32(GUEST_SYSENTER_CS, 0);
+ vmcs_writel(GUEST_SYSENTER_ESP, 0);
+ vmcs_writel(GUEST_SYSENTER_EIP, 0);
+
+ vmx_guest_debugctl_write(&vmx->vcpu, 0);
+
+ if (cpu_has_vmx_tpr_shadow()) {
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
+ if (cpu_need_tpr_shadow(&vmx->vcpu))
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
+ __pa(vmx->vcpu.arch.apic->regs));
+ vmcs_write32(TPR_THRESHOLD, 0);
+ }
+
+ vmx_setup_uret_msrs(vmx);
+}
+
+static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ init_vmcs(vmx);
+
+ if (nested &&
+ kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS))
+ memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs));
+
+ vcpu_setup_sgx_lepubkeyhash(vcpu);
+
+ vmx->nested.posted_intr_nv = -1;
+ vmx->nested.vmxon_ptr = INVALID_GPA;
+ vmx->nested.current_vmptr = INVALID_GPA;
+
+#ifdef CONFIG_KVM_HYPERV
+ vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
+#endif
+
+ if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS))
+ vcpu->arch.microcode_version = 0x100000000ULL;
+ vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;
+
+ /*
+ * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
+ * or POSTED_INTR_WAKEUP_VECTOR.
+ */
+ vmx->vt.pi_desc.nv = POSTED_INTR_VECTOR;
+ __pi_set_sn(&vmx->vt.pi_desc);
+}
+
+void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (!init_event)
+ __vmx_vcpu_reset(vcpu);
+
+ vmx->rmode.vm86_active = 0;
+ vmx->spec_ctrl = 0;
+
+ vmx->msr_ia32_umwait_control = 0;
+
+ vmx->hv_deadline_tsc = -1;
+ kvm_set_cr8(vcpu, 0);
+
+ seg_setup(VCPU_SREG_CS);
+ vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
+ vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);
+
+ seg_setup(VCPU_SREG_DS);
+ seg_setup(VCPU_SREG_ES);
+ seg_setup(VCPU_SREG_FS);
+ seg_setup(VCPU_SREG_GS);
+ seg_setup(VCPU_SREG_SS);
+
+ vmcs_write16(GUEST_TR_SELECTOR, 0);
+ vmcs_writel(GUEST_TR_BASE, 0);
+ vmcs_write32(GUEST_TR_LIMIT, 0xffff);
+ vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
+
+ vmcs_write16(GUEST_LDTR_SELECTOR, 0);
+ vmcs_writel(GUEST_LDTR_BASE, 0);
+ vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
+ vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
+
+ vmcs_writel(GUEST_GDTR_BASE, 0);
+ vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
+
+ vmcs_writel(GUEST_IDTR_BASE, 0);
+ vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
+
+ vmx_segment_cache_clear(vmx);
+ kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS);
+
+ vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
+ vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
+ vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
+ if (kvm_mpx_supported())
+ vmcs_write64(GUEST_BNDCFGS, 0);
+
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
+
+ if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) {
+ vmcs_writel(GUEST_SSP, 0);
+ vmcs_writel(GUEST_INTR_SSP_TABLE, 0);
+ }
+ if (kvm_cpu_cap_has(X86_FEATURE_IBT) ||
+ kvm_cpu_cap_has(X86_FEATURE_SHSTK))
+ vmcs_writel(GUEST_S_CET, 0);
+
+ kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
+
+ vpid_sync_context(vmx->vpid);
+
+ vmx_update_fb_clear_dis(vcpu, vmx);
+}
+
+void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
+{
+ exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
+}
+
+void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
+{
+ if (!enable_vnmi ||
+ vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
+ vmx_enable_irq_window(vcpu);
+ return;
+ }
+
+ exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
+}
+
+void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ uint32_t intr;
+ int irq = vcpu->arch.interrupt.nr;
+
+ trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected);
+
+ ++vcpu->stat.irq_injections;
+ if (vmx->rmode.vm86_active) {
+ int inc_eip = 0;
+ if (vcpu->arch.interrupt.soft)
+ inc_eip = vcpu->arch.event_exit_inst_len;
+ kvm_inject_realmode_interrupt(vcpu, irq, inc_eip);
+ return;
+ }
+ intr = irq | INTR_INFO_VALID_MASK;
+ if (vcpu->arch.interrupt.soft) {
+ intr |= INTR_TYPE_SOFT_INTR;
+ vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+ vmx->vcpu.arch.event_exit_inst_len);
+ } else
+ intr |= INTR_TYPE_EXT_INTR;
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
+
+ vmx_clear_hlt(vcpu);
+}
+
+void vmx_inject_nmi(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (!enable_vnmi) {
+ /*
+ * Tracking the NMI-blocked state in software is built upon
+ * finding the next open IRQ window. This, in turn, depends on
+ * well-behaving guests: They have to keep IRQs disabled at
+ * least as long as the NMI handler runs. Otherwise we may
+ * cause NMI nesting, maybe breaking the guest. But as this is
+ * highly unlikely, we can live with the residual risk.
+ */
+ vmx->loaded_vmcs->soft_vnmi_blocked = 1;
+ vmx->loaded_vmcs->vnmi_blocked_time = 0;
+ }
+
+ ++vcpu->stat.nmi_injections;
+ vmx->loaded_vmcs->nmi_known_unmasked = false;
+
+ if (vmx->rmode.vm86_active) {
+ kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0);
+ return;
+ }
+
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+ INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
+
+ vmx_clear_hlt(vcpu);
+}
+
+bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ bool masked;
+
+ if (!enable_vnmi)
+ return vmx->loaded_vmcs->soft_vnmi_blocked;
+ if (vmx->loaded_vmcs->nmi_known_unmasked)
+ return false;
+ masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
+ vmx->loaded_vmcs->nmi_known_unmasked = !masked;
+ return masked;
+}
+
+void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (!enable_vnmi) {
+ if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
+ vmx->loaded_vmcs->soft_vnmi_blocked = masked;
+ vmx->loaded_vmcs->vnmi_blocked_time = 0;
+ }
+ } else {
+ vmx->loaded_vmcs->nmi_known_unmasked = !masked;
+ if (masked)
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+ else
+ vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+ }
+}
+
+bool vmx_nmi_blocked(struct kvm_vcpu *vcpu)
+{
+ if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
+ return false;
+
+ if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
+ return true;
+
+ return (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+ (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI |
+ GUEST_INTR_STATE_NMI));
+}
+
+int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+{
+ if (to_vmx(vcpu)->nested.nested_run_pending)
+ return -EBUSY;
+
+ /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */
+ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
+ return -EBUSY;
+
+ return !vmx_nmi_blocked(vcpu);
+}
+
+bool __vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
+{
+ return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) ||
+ (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+ (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
+}
+
+bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
+{
+ if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
+ return false;
+
+ return __vmx_interrupt_blocked(vcpu);
+}
+
+int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+{
+ if (to_vmx(vcpu)->nested.nested_run_pending)
+ return -EBUSY;
+
+ /*
+ * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
+ * e.g. if the IRQ arrived asynchronously after checking nested events.
+ */
+ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
+ return -EBUSY;
+
+ return !vmx_interrupt_blocked(vcpu);
+}
+
+int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
+{
+ void __user *ret;
+
+ if (enable_unrestricted_guest)
+ return 0;
+
+ mutex_lock(&kvm->slots_lock);
+ ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
+ PAGE_SIZE * 3);
+ mutex_unlock(&kvm->slots_lock);
+
+ if (IS_ERR(ret))
+ return PTR_ERR(ret);
+
+ to_kvm_vmx(kvm)->tss_addr = addr;
+
+ return init_rmode_tss(kvm, ret);
+}
+
+int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
+{
+ to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
+ return 0;
+}
+
+static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
+{
+ switch (vec) {
+ case BP_VECTOR:
+ /*
+ * Update instruction length as we may reinject the exception
+ * from user space while in guest debugging mode.
+ */
+ to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
+ vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
+ return false;
+ fallthrough;
+ case DB_VECTOR:
+ return !(vcpu->guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP));
+ case DE_VECTOR:
+ case OF_VECTOR:
+ case BR_VECTOR:
+ case UD_VECTOR:
+ case DF_VECTOR:
+ case SS_VECTOR:
+ case GP_VECTOR:
+ case MF_VECTOR:
+ return true;
+ }
+ return false;
+}
+
+static int handle_rmode_exception(struct kvm_vcpu *vcpu,
+ int vec, u32 err_code)
+{
+	/*
+	 * Instructions with an address size override prefix (opcode 0x67)
+	 * cause a #SS fault with error code 0 in VM86 mode.
+	 */
+ if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
+ if (kvm_emulate_instruction(vcpu, 0)) {
+ if (vcpu->arch.halt_request) {
+ vcpu->arch.halt_request = 0;
+ return kvm_emulate_halt_noskip(vcpu);
+ }
+ return 1;
+ }
+ return 0;
+ }
+
+ /*
+ * Forward all other exceptions that are valid in real mode.
+ * FIXME: Breaks guest debugging in real mode, needs to be fixed with
+ * the required debugging infrastructure rework.
+ */
+ kvm_queue_exception(vcpu, vec);
+ return 1;
+}
+
+static int handle_machine_check(struct kvm_vcpu *vcpu)
+{
+ /* handled by vmx_vcpu_run() */
+ return 1;
+}
+
+/*
+ * If the host has split lock detection disabled, then #AC is
+ * unconditionally injected into the guest, which is the pre split lock
+ * detection behaviour.
+ *
+ * If the host has split lock detection enabled then #AC is
+ * only injected into the guest when:
+ * - Guest CPL == 3 (user mode)
+ * - Guest has #AC detection enabled in CR0
+ * - Guest EFLAGS has AC bit set
+ */
+bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu)
+{
+ if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
+ return true;
+
+ return vmx_get_cpl(vcpu) == 3 && kvm_is_cr0_bit_set(vcpu, X86_CR0_AM) &&
+ (kvm_get_rflags(vcpu) & X86_EFLAGS_AC);
+}
+
+static bool is_xfd_nm_fault(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.guest_fpu.fpstate->xfd &&
+ !kvm_is_cr0_bit_set(vcpu, X86_CR0_TS);
+}
+
+static int handle_exception_nmi(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct kvm_run *kvm_run = vcpu->run;
+ u32 intr_info, ex_no, error_code;
+ unsigned long cr2, dr6;
+ u32 vect_info;
+
+ vect_info = vmx->idt_vectoring_info;
+ intr_info = vmx_get_intr_info(vcpu);
+
+ /*
+ * Machine checks are handled by handle_exception_irqoff(), or by
+ * vmx_vcpu_run() if a #MC occurs on VM-Entry. NMIs are handled by
+ * vmx_vcpu_enter_exit().
+ */
+ if (is_machine_check(intr_info) || is_nmi(intr_info))
+ return 1;
+
+	/*
+	 * Queue the exception here instead of in handle_nm_fault_irqoff().
+	 * This ensures the nested_vmx check is not skipped, so the VM-Exit
+	 * can be reflected to L1 (when it intercepts #NM) before reaching
+	 * this point.
+	 */
+ if (is_nm_fault(intr_info)) {
+ kvm_queue_exception_p(vcpu, NM_VECTOR,
+ is_xfd_nm_fault(vcpu) ? vcpu->arch.guest_fpu.xfd_err : 0);
+ return 1;
+ }
+
+ if (is_invalid_opcode(intr_info))
+ return handle_ud(vcpu);
+
+ if (WARN_ON_ONCE(is_ve_fault(intr_info))) {
+ struct vmx_ve_information *ve_info = vmx->ve_info;
+
+ WARN_ONCE(ve_info->exit_reason != EXIT_REASON_EPT_VIOLATION,
+ "Unexpected #VE on VM-Exit reason 0x%x", ve_info->exit_reason);
+ dump_vmcs(vcpu);
+ kvm_mmu_print_sptes(vcpu, ve_info->guest_physical_address, "#VE");
+ return 1;
+ }
+
+ error_code = 0;
+ if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
+ error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+
+ if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
+ WARN_ON_ONCE(!enable_vmware_backdoor);
+
+ /*
+ * VMware backdoor emulation on #GP interception only handles
+ * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero
+ * error code on #GP.
+ */
+ if (error_code) {
+ kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
+ return 1;
+ }
+ return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
+ }
+
+	/*
+	 * A #PF with PFEC.RSVD = 1 indicates the guest is accessing MMIO;
+	 * it is better to report an internal error.
+	 * See the comments in vmx_handle_exit.
+	 */
+ if ((vect_info & VECTORING_INFO_VALID_MASK) &&
+ !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
+ vcpu->run->internal.ndata = 4;
+ vcpu->run->internal.data[0] = vect_info;
+ vcpu->run->internal.data[1] = intr_info;
+ vcpu->run->internal.data[2] = error_code;
+ vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu;
+ return 0;
+ }
+
+ if (is_page_fault(intr_info)) {
+ cr2 = vmx_get_exit_qual(vcpu);
+ if (enable_ept && !vcpu->arch.apf.host_apf_flags) {
+ /*
+ * EPT will cause page fault only if we need to
+ * detect illegal GPAs.
+ */
+ WARN_ON_ONCE(!allow_smaller_maxphyaddr);
+ kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code);
+ return 1;
+ } else
+ return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
+ }
+
+ ex_no = intr_info & INTR_INFO_VECTOR_MASK;
+
+ if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
+ return handle_rmode_exception(vcpu, ex_no, error_code);
+
+ switch (ex_no) {
+ case DB_VECTOR:
+ dr6 = vmx_get_exit_qual(vcpu);
+ if (!(vcpu->guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
+ /*
+ * If the #DB was due to ICEBP, a.k.a. INT1, skip the
+ * instruction. ICEBP generates a trap-like #DB, but
+ * despite its interception control being tied to #DB,
+ * is an instruction intercept, i.e. the VM-Exit occurs
+ * on the ICEBP itself. Use the inner "skip" helper to
+ * avoid single-step #DB and MTF updates, as ICEBP is
+ * higher priority. Note, skipping ICEBP still clears
+ * STI and MOVSS blocking.
+ *
+ * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS
+ * if single-step is enabled in RFLAGS and STI or MOVSS
+ * blocking is active, as the CPU doesn't set the bit
+ * on VM-Exit due to #DB interception. VM-Entry has a
+ * consistency check that a single-step #DB is pending
+ * in this scenario as the previous instruction cannot
+ * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV
+ * don't modify RFLAGS), therefore the one instruction
+ * delay when activating single-step breakpoints must
+ * have already expired. Note, the CPU sets/clears BS
+ * as appropriate for all other VM-Exits types.
+ */
+ if (is_icebp(intr_info))
+ WARN_ON(!skip_emulated_instruction(vcpu));
+ else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) &&
+ (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+ (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)))
+ vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
+ vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS);
+
+ kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
+ return 1;
+ }
+ kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW;
+ kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
+ fallthrough;
+ case BP_VECTOR:
+ /*
+ * Update instruction length as we may reinject #BP from
+ * user space while in guest debugging mode. Reading it for
+ * #DB as well causes no harm, it is not used in that case.
+ */
+ vmx->vcpu.arch.event_exit_inst_len =
+ vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
+ kvm_run->debug.arch.exception = ex_no;
+ break;
+ case AC_VECTOR:
+ if (vmx_guest_inject_ac(vcpu)) {
+ kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
+ return 1;
+ }
+
+ /*
+ * Handle split lock. Depending on detection mode this will
+ * either warn and disable split lock detection for this
+ * task or force SIGBUS on it.
+ */
+ if (handle_guest_split_lock(kvm_rip_read(vcpu)))
+ return 1;
+ fallthrough;
+ default:
+ kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
+ kvm_run->ex.exception = ex_no;
+ kvm_run->ex.error_code = error_code;
+ break;
+ }
+ return 0;
+}
+
+static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu)
+{
+ ++vcpu->stat.irq_exits;
+ return 1;
+}
+
+static int handle_triple_fault(struct kvm_vcpu *vcpu)
+{
+ vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
+ vcpu->mmio_needed = 0;
+ return 0;
+}
+
+static int handle_io(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification;
+ int size, in, string;
+ unsigned port;
+
+ exit_qualification = vmx_get_exit_qual(vcpu);
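+	/*
+	 * I/O exit qualification: bits 2:0 hold the access size minus one,
+	 * bit 3 the direction (1 = IN), bit 4 the string-instruction flag,
+	 * and bits 31:16 the port number.
+	 */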
+ string = (exit_qualification & 16) != 0;
+
+ ++vcpu->stat.io_exits;
+
+ if (string)
+ return kvm_emulate_instruction(vcpu, 0);
+
+ port = exit_qualification >> 16;
+ size = (exit_qualification & 7) + 1;
+ in = (exit_qualification & 8) != 0;
+
+ return kvm_fast_pio(vcpu, size, port, in);
+}
+
+void vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
+{
+ /*
+ * Patch in the VMCALL instruction:
+ */
+ hypercall[0] = 0x0f;
+ hypercall[1] = 0x01;
+ hypercall[2] = 0xc1;
+}
+
+/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
+static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ if (is_guest_mode(vcpu)) {
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ unsigned long orig_val = val;
+
+ /*
+ * We get here when L2 changed cr0 in a way that did not change
+ * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
+ * but did change L0 shadowed bits. So we first calculate the
+ * effective cr0 value that L1 would like to write into the
+ * hardware. It consists of the L2-owned bits from the new
+ * value combined with the L1-owned bits from L1's guest_cr0.
+ */
+ val = (val & ~vmcs12->cr0_guest_host_mask) |
+ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
+
+ if (kvm_set_cr0(vcpu, val))
+ return 1;
+ vmcs_writel(CR0_READ_SHADOW, orig_val);
+ return 0;
+ } else {
+ return kvm_set_cr0(vcpu, val);
+ }
+}
+
+static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ if (is_guest_mode(vcpu)) {
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ unsigned long orig_val = val;
+
+ /* analogously to handle_set_cr0 */
+ val = (val & ~vmcs12->cr4_guest_host_mask) |
+ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
+ if (kvm_set_cr4(vcpu, val))
+ return 1;
+ vmcs_writel(CR4_READ_SHADOW, orig_val);
+ return 0;
+ } else
+ return kvm_set_cr4(vcpu, val);
+}
+
+static int handle_desc(struct kvm_vcpu *vcpu)
+{
+ /*
+ * UMIP emulation relies on intercepting writes to CR4.UMIP, i.e. this
+ * and other code needs to be updated if UMIP can be guest owned.
+ */
+ BUILD_BUG_ON(KVM_POSSIBLE_CR4_GUEST_BITS & X86_CR4_UMIP);
+
+ WARN_ON_ONCE(!kvm_is_cr4_bit_set(vcpu, X86_CR4_UMIP));
+ return kvm_emulate_instruction(vcpu, 0);
+}
+
+static int handle_cr(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification, val;
+ int cr;
+ int reg;
+ int err;
+ int ret;
+
+ exit_qualification = vmx_get_exit_qual(vcpu);
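+	/*
+	 * CR-access exit qualification: bits 3:0 hold the CR number, bits 5:4
+	 * the access type (MOV to/from CR, CLTS, or LMSW), bits 11:8 the GPR,
+	 * and bits 31:16 the LMSW source data.
+	 */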
+ cr = exit_qualification & 15;
+ reg = (exit_qualification >> 8) & 15;
+ switch ((exit_qualification >> 4) & 3) {
+ case 0: /* mov to cr */
+ val = kvm_register_read(vcpu, reg);
+ trace_kvm_cr_write(cr, val);
+ switch (cr) {
+ case 0:
+ err = handle_set_cr0(vcpu, val);
+ return kvm_complete_insn_gp(vcpu, err);
+ case 3:
+ WARN_ON_ONCE(enable_unrestricted_guest);
+
+ err = kvm_set_cr3(vcpu, val);
+ return kvm_complete_insn_gp(vcpu, err);
+ case 4:
+ err = handle_set_cr4(vcpu, val);
+ return kvm_complete_insn_gp(vcpu, err);
+ case 8: {
+ u8 cr8_prev = kvm_get_cr8(vcpu);
+ u8 cr8 = (u8)val;
+ err = kvm_set_cr8(vcpu, cr8);
+ ret = kvm_complete_insn_gp(vcpu, err);
+ if (lapic_in_kernel(vcpu))
+ return ret;
+ if (cr8_prev <= cr8)
+ return ret;
+ /*
+ * TODO: we might be squashing a
+ * KVM_GUESTDBG_SINGLESTEP-triggered
+ * KVM_EXIT_DEBUG here.
+ */
+ vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
+ return 0;
+ }
+ }
+ break;
+ case 2: /* clts */
+ KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS");
+ return -EIO;
+ case 1: /*mov from cr*/
+ switch (cr) {
+ case 3:
+ WARN_ON_ONCE(enable_unrestricted_guest);
+
+ val = kvm_read_cr3(vcpu);
+ kvm_register_write(vcpu, reg, val);
+ trace_kvm_cr_read(cr, val);
+ return kvm_skip_emulated_instruction(vcpu);
+ case 8:
+ val = kvm_get_cr8(vcpu);
+ kvm_register_write(vcpu, reg, val);
+ trace_kvm_cr_read(cr, val);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+ break;
+ case 3: /* lmsw */
+ val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
+ trace_kvm_cr_write(0, (kvm_read_cr0_bits(vcpu, ~0xful) | val));
+ kvm_lmsw(vcpu, val);
+
+ return kvm_skip_emulated_instruction(vcpu);
+ default:
+ break;
+ }
+ vcpu->run->exit_reason = 0;
+ vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
+ (int)(exit_qualification >> 4) & 3, cr);
+ return 0;
+}
+
+static int handle_dr(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification;
+ int dr, dr7, reg;
+ int err = 1;
+
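+	/*
+	 * DR-access exit qualification: bits 2:0 hold the DR number, bit 4
+	 * the direction (1 = MOV from DR), and bits 11:8 the GPR.
+	 */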
+ exit_qualification = vmx_get_exit_qual(vcpu);
+ dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
+
+ /* First, if DR does not exist, trigger UD */
+ if (!kvm_require_dr(vcpu, dr))
+ return 1;
+
+ if (vmx_get_cpl(vcpu) > 0)
+ goto out;
+
+ dr7 = vmcs_readl(GUEST_DR7);
+ if (dr7 & DR7_GD) {
+ /*
+ * As the vm-exit takes precedence over the debug trap, we
+ * need to emulate the latter, either for the host or the
+ * guest debugging itself.
+ */
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
+ vcpu->run->debug.arch.dr6 = DR6_BD | DR6_ACTIVE_LOW;
+ vcpu->run->debug.arch.dr7 = dr7;
+ vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
+ vcpu->run->debug.arch.exception = DB_VECTOR;
+ vcpu->run->exit_reason = KVM_EXIT_DEBUG;
+ return 0;
+ } else {
+ kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BD);
+ return 1;
+ }
+ }
+
+ if (vcpu->guest_debug == 0) {
+ exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
+
+ /*
+ * No more DR vmexits; force a reload of the debug registers
+ * and reenter on this instruction. The next vmexit will
+ * retrieve the full state of the debug registers.
+ */
+ vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
+ return 1;
+ }
+
+ reg = DEBUG_REG_ACCESS_REG(exit_qualification);
+ if (exit_qualification & TYPE_MOV_FROM_DR) {
+ kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr));
+ err = 0;
+ } else {
+ err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg));
+ }
+
+out:
+ return kvm_complete_insn_gp(vcpu, err);
+}
+
+void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
+{
+ get_debugreg(vcpu->arch.db[0], 0);
+ get_debugreg(vcpu->arch.db[1], 1);
+ get_debugreg(vcpu->arch.db[2], 2);
+ get_debugreg(vcpu->arch.db[3], 3);
+ get_debugreg(vcpu->arch.dr6, 6);
+ vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
+
+ vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
+ exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
+
+	/*
+	 * exc_debug expects dr6 to be cleared after it runs; avoid letting it
+	 * see a stale dr6 from the guest.
+	 */
+ set_debugreg(DR6_RESERVED, 6);
+}
+
+void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ vmcs_writel(GUEST_DR7, val);
+}
+
+static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
+{
+ kvm_apic_update_ppr(vcpu);
+ return 1;
+}
+
+static int handle_interrupt_window(struct kvm_vcpu *vcpu)
+{
+ exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
+
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ ++vcpu->stat.irq_window_exits;
+ return 1;
+}
+
+static int handle_invlpg(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
+
+ kvm_mmu_invlpg(vcpu, exit_qualification);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int handle_apic_access(struct kvm_vcpu *vcpu)
+{
+ if (likely(fasteoi)) {
+ unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
+ int access_type, offset;
+
+ access_type = exit_qualification & APIC_ACCESS_TYPE;
+ offset = exit_qualification & APIC_ACCESS_OFFSET;
+		/*
+		 * A sane guest uses MOV to write the EOI register and the
+		 * written value is ignored, so short-circuit here and avoid
+		 * heavy instruction emulation.
+		 */
+ if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
+ (offset == APIC_EOI)) {
+ kvm_lapic_set_eoi(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+ }
+ return kvm_emulate_instruction(vcpu, 0);
+}
+
+static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
+ int vector = exit_qualification & 0xff;
+
+ /* EOI-induced VM exit is trap-like and thus no need to adjust IP */
+ kvm_apic_set_eoi_accelerated(vcpu, vector);
+ return 1;
+}
+
+static int handle_apic_write(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
+
+ /*
+ * APIC-write VM-Exit is trap-like, KVM doesn't need to advance RIP and
+ * hardware has done any necessary aliasing, offset adjustments, etc...
+ * for the access. I.e. the correct value has already been written to
+ * the vAPIC page for the correct 16-byte chunk. KVM needs only to
+ * retrieve the register value and emulate the access.
+ */
+ u32 offset = exit_qualification & 0xff0;
+
+ kvm_apic_write_nodecode(vcpu, offset);
+ return 1;
+}
+
+static int handle_task_switch(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long exit_qualification;
+ bool has_error_code = false;
+ u32 error_code = 0;
+ u16 tss_selector;
+ int reason, type, idt_v, idt_index;
+
+ idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
+ idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
+ type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
+
+ exit_qualification = vmx_get_exit_qual(vcpu);
+
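+	/*
+	 * Task-switch exit qualification: bits 15:0 hold the new TSS selector
+	 * and bits 31:30 the source of the task switch (CALL, IRET, JMP, or
+	 * a task gate in the IDT).
+	 */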
+ reason = (u32)exit_qualification >> 30;
+ if (reason == TASK_SWITCH_GATE && idt_v) {
+ switch (type) {
+ case INTR_TYPE_NMI_INTR:
+ vcpu->arch.nmi_injected = false;
+ vmx_set_nmi_mask(vcpu, true);
+ break;
+ case INTR_TYPE_EXT_INTR:
+ case INTR_TYPE_SOFT_INTR:
+ kvm_clear_interrupt_queue(vcpu);
+ break;
+ case INTR_TYPE_HARD_EXCEPTION:
+ if (vmx->idt_vectoring_info &
+ VECTORING_INFO_DELIVER_CODE_MASK) {
+ has_error_code = true;
+ error_code =
+ vmcs_read32(IDT_VECTORING_ERROR_CODE);
+ }
+ fallthrough;
+ case INTR_TYPE_SOFT_EXCEPTION:
+ kvm_clear_exception_queue(vcpu);
+ break;
+ default:
+ break;
+ }
+ }
+ tss_selector = exit_qualification;
+
+ if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
+ type != INTR_TYPE_EXT_INTR &&
+ type != INTR_TYPE_NMI_INTR))
+ WARN_ON(!skip_emulated_instruction(vcpu));
+
+ /*
+ * TODO: What about debug traps on tss switch?
+ * Are we supposed to inject them and update dr6?
+ */
+ return kvm_task_switch(vcpu, tss_selector,
+ type == INTR_TYPE_SOFT_INTR ? idt_index : -1,
+ reason, has_error_code, error_code);
+}
+
+static int handle_ept_violation(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
+ gpa_t gpa;
+
+	/*
+	 * If the EPT violation happened while executing IRET from an NMI,
+	 * the "blocked by NMI" bit has to be set before the next VM entry.
+	 * There are errata that may cause this bit to not be set:
+	 * AAK134, BY25.
+	 */
+ if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
+ enable_vnmi &&
+ (exit_qualification & INTR_INFO_UNBLOCK_NMI))
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
+
+ gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+ trace_kvm_page_fault(vcpu, gpa, exit_qualification);
+
+ /*
+ * Check that the GPA doesn't exceed physical memory limits, as that is
+ * a guest page fault. We have to emulate the instruction here, because
+ * if the illegal address is that of a paging structure, then
+ * EPT_VIOLATION_ACC_WRITE bit is set. Alternatively, if supported we
+ * would also use advanced VM-exit information for EPT violations to
+ * reconstruct the page fault error code.
+ */
+ if (unlikely(allow_smaller_maxphyaddr && !kvm_vcpu_is_legal_gpa(vcpu, gpa)))
+ return kvm_emulate_instruction(vcpu, 0);
+
+ return __vmx_handle_ept_violation(vcpu, gpa, exit_qualification);
+}
+
+static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
+{
+ gpa_t gpa;
+
+ if (vmx_check_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0))
+ return 1;
+
+ /*
+ * A nested guest cannot optimize MMIO vmexits, because we have an
+ * nGPA here instead of the required GPA.
+ */
+ gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+ if (!is_guest_mode(vcpu) &&
+ !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
+ trace_kvm_fast_mmio(gpa);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
+}
+
+static int handle_nmi_window(struct kvm_vcpu *vcpu)
+{
+ if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm))
+ return -EIO;
+
+ exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
+ ++vcpu->stat.nmi_window_exits;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ return 1;
+}
+
+/*
+ * Returns true if emulation is required (due to the vCPU having invalid state
+ * with unrestricted guest mode disabled) and KVM can't faithfully emulate the
+ * current vCPU state.
+ */
+static bool vmx_unhandleable_emulation_required(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (!vmx->vt.emulation_required)
+ return false;
+
+ /*
+ * It is architecturally impossible for emulation to be required when a
+ * nested VM-Enter is pending completion, as VM-Enter will VM-Fail if
+ * guest state is invalid and unrestricted guest is disabled, i.e. KVM
+	 * should synthesize VM-Fail instead of emulating L2 code. This path is
+ * only reachable if userspace modifies L2 guest state after KVM has
+ * performed the nested VM-Enter consistency checks.
+ */
+ if (vmx->nested.nested_run_pending)
+ return true;
+
+ /*
+ * KVM only supports emulating exceptions if the vCPU is in Real Mode.
+ * If emulation is required, KVM can't perform a successful VM-Enter to
+ * inject the exception.
+ */
+ return !vmx->rmode.vm86_active &&
+ (kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected);
+}
+
+static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ bool intr_window_requested;
+ unsigned count = 130;
+
+ intr_window_requested = exec_controls_get(vmx) &
+ CPU_BASED_INTR_WINDOW_EXITING;
+
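+	/*
+	 * Bound the amount of emulation performed per exit; the loop also
+	 * bails out early if an event, signal, or other work becomes pending.
+	 */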
+ while (vmx->vt.emulation_required && count-- != 0) {
+ if (intr_window_requested && !vmx_interrupt_blocked(vcpu))
+ return handle_interrupt_window(&vmx->vcpu);
+
+ if (kvm_test_request(KVM_REQ_EVENT, vcpu))
+ return 1;
+
+ /*
+ * Ensure that any updates to kvm->buses[] observed by the
+ * previous instruction (emulated or otherwise) are also
+ * visible to the instruction KVM is about to emulate.
+ */
+ smp_rmb();
+
+ if (!kvm_emulate_instruction(vcpu, 0))
+ return 0;
+
+ if (vmx_unhandleable_emulation_required(vcpu)) {
+ kvm_prepare_emulation_failure_exit(vcpu);
+ return 0;
+ }
+
+ if (vcpu->arch.halt_request) {
+ vcpu->arch.halt_request = 0;
+ return kvm_emulate_halt_noskip(vcpu);
+ }
+
+		/*
+		 * Note, return 1 and not 0; vcpu_run() will invoke
+		 * xfer_to_guest_mode(), which will create a proper return
+		 * code.
+		 */
+ if (__xfer_to_guest_mode_work_pending())
+ return 1;
+ }
+
+ return 1;
+}
+
+int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
+{
+ if (vmx_unhandleable_emulation_required(vcpu)) {
+ kvm_prepare_emulation_failure_exit(vcpu);
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * Indicate that the vCPU is busy-waiting on a spinlock. KVM does not enable
+ * plain PAUSE exiting, so this handler is only reached on CPUs with
+ * PAUSE-Loop Exiting.
+ */
+static int handle_pause(struct kvm_vcpu *vcpu)
+{
+ if (!kvm_pause_in_guest(vcpu->kvm))
+ grow_ple_window(vcpu);
+
+ /*
+	 * Intel SDM Vol 3, ch. 25.1.3 says: the "PAUSE-loop exiting"
+	 * VM-execution control is ignored if CPL > 0. OTOH, KVM never
+	 * sets PAUSE_EXITING and only sets PLE if supported, so the
+	 * vCPU must be at CPL=0 if it gets a PAUSE exit.
+ */
+ kvm_vcpu_on_spin(vcpu, true);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int handle_monitor_trap(struct kvm_vcpu *vcpu)
+{
+ return 1;
+}
+
+static int handle_invpcid(struct kvm_vcpu *vcpu)
+{
+ u32 vmx_instruction_info;
+ unsigned long type;
+ gva_t gva;
+ struct {
+ u64 pcid;
+ u64 gla;
+ } operand;
+ int gpr_index;
+
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
+ type = kvm_register_read(vcpu, gpr_index);
+
+	/*
+	 * According to the Intel instruction reference, the memory operand
+	 * is read even if it isn't needed (e.g., for type==all).
+	 */
+ if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
+ vmx_instruction_info, false,
+ sizeof(operand), &gva))
+ return 1;
+
+ return kvm_handle_invpcid(vcpu, type, gva);
+}
+
+static int handle_pml_full(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification;
+
+ trace_kvm_pml_full(vcpu->vcpu_id);
+
+ exit_qualification = vmx_get_exit_qual(vcpu);
+
+	/*
+	 * If the PML-buffer-full exit happened while executing IRET from an
+	 * NMI, the "blocked by NMI" bit has to be set before the next VM
+	 * entry.
+	 */
+ if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
+ enable_vnmi &&
+ (exit_qualification & INTR_INFO_UNBLOCK_NMI))
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+
+	/*
+	 * The PML buffer was already flushed at the beginning of the VM-Exit;
+	 * nothing to do here, and there's no userspace involvement needed for
+	 * PML.
+	 */
+ return 1;
+}
+
+static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu,
+ bool force_immediate_exit)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * In the *extremely* unlikely scenario that this is a spurious VM-Exit
+ * due to the timer expiring while it was "soft" disabled, just eat the
+ * exit and re-enter the guest.
+ */
+ if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled))
+ return EXIT_FASTPATH_REENTER_GUEST;
+
+ /*
+ * If the timer expired because KVM used it to force an immediate exit,
+ * then mission accomplished.
+ */
+ if (force_immediate_exit)
+ return EXIT_FASTPATH_EXIT_HANDLED;
+
+ /*
+ * If L2 is active, go down the slow path as emulating the guest timer
+ * expiration likely requires synthesizing a nested VM-Exit.
+ */
+ if (is_guest_mode(vcpu))
+ return EXIT_FASTPATH_NONE;
+
+ kvm_lapic_expired_hv_timer(vcpu);
+ return EXIT_FASTPATH_REENTER_GUEST;
+}
+
+static int handle_preemption_timer(struct kvm_vcpu *vcpu)
+{
+ /*
+ * This non-fastpath handler is reached if and only if the preemption
+ * timer was being used to emulate a guest timer while L2 is active.
+ * All other scenarios are supposed to be handled in the fastpath.
+ */
+ WARN_ON_ONCE(!is_guest_mode(vcpu));
+ kvm_lapic_expired_hv_timer(vcpu);
+ return 1;
+}
+
+/*
+ * When nested=0, all VMX instruction VM Exits filter here. The handlers
+ * are overwritten by nested_vmx_hardware_setup() when nested=1.
+ */
+static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
+{
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+}
+
+static int handle_tdx_instruction(struct kvm_vcpu *vcpu)
+{
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+}
+
+#ifndef CONFIG_X86_SGX_KVM
+static int handle_encls(struct kvm_vcpu *vcpu)
+{
+ /*
+ * SGX virtualization is disabled. There is no software enable bit for
+ * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent
+ * the guest from executing ENCLS (when SGX is supported by hardware).
+ */
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+}
+#endif /* CONFIG_X86_SGX_KVM */
+
+static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Hardware may or may not set the BUS_LOCK_DETECTED flag on BUS_LOCK
+ * VM-Exits. Unconditionally set the flag here and leave the handling to
+ * vmx_handle_exit().
+ */
+ to_vt(vcpu)->exit_reason.bus_lock_detected = true;
+ return 1;
+}
+
+static int handle_notify(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qual = vmx_get_exit_qual(vcpu);
+ bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID;
+
+ ++vcpu->stat.notify_window_exits;
+
+	/*
+	 * If the Notify VM exit happened while executing IRET from an NMI,
+	 * the "blocked by NMI" bit has to be set before the next VM entry.
+	 */
+ if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI))
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+
+ if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER ||
+ context_invalid) {
+ vcpu->run->exit_reason = KVM_EXIT_NOTIFY;
+ vcpu->run->notify.flags = context_invalid ?
+ KVM_NOTIFY_CONTEXT_INVALID : 0;
+ return 0;
+ }
+
+ return 1;
+}
+
+static int vmx_get_msr_imm_reg(struct kvm_vcpu *vcpu)
+{
+ return vmx_get_instr_info_reg(vmcs_read32(VMX_INSTRUCTION_INFO));
+}
+
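+/*
+ * For the immediate forms of RDMSR/WRMSR, the exit qualification holds the
+ * MSR index and the VM-exit instruction info identifies the GPR operand.
+ */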
+static int handle_rdmsr_imm(struct kvm_vcpu *vcpu)
+{
+ return kvm_emulate_rdmsr_imm(vcpu, vmx_get_exit_qual(vcpu),
+ vmx_get_msr_imm_reg(vcpu));
+}
+
+static int handle_wrmsr_imm(struct kvm_vcpu *vcpu)
+{
+ return kvm_emulate_wrmsr_imm(vcpu, vmx_get_exit_qual(vcpu),
+ vmx_get_msr_imm_reg(vcpu));
+}
+
+/*
+ * The exit handlers return 1 if the exit was handled fully and guest execution
+ * may resume. Otherwise they set the kvm_run parameter to indicate what needs
+ * to be done to userspace and return 0.
+ */
+static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
+ [EXIT_REASON_EXCEPTION_NMI] = handle_exception_nmi,
+ [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
+ [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
+ [EXIT_REASON_NMI_WINDOW] = handle_nmi_window,
+ [EXIT_REASON_IO_INSTRUCTION] = handle_io,
+ [EXIT_REASON_CR_ACCESS] = handle_cr,
+ [EXIT_REASON_DR_ACCESS] = handle_dr,
+ [EXIT_REASON_CPUID] = kvm_emulate_cpuid,
+ [EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr,
+ [EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr,
+ [EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window,
+ [EXIT_REASON_HLT] = kvm_emulate_halt,
+ [EXIT_REASON_INVD] = kvm_emulate_invd,
+ [EXIT_REASON_INVLPG] = handle_invlpg,
+ [EXIT_REASON_RDPMC] = kvm_emulate_rdpmc,
+ [EXIT_REASON_VMCALL] = kvm_emulate_hypercall,
+ [EXIT_REASON_VMCLEAR] = handle_vmx_instruction,
+ [EXIT_REASON_VMLAUNCH] = handle_vmx_instruction,
+ [EXIT_REASON_VMPTRLD] = handle_vmx_instruction,
+ [EXIT_REASON_VMPTRST] = handle_vmx_instruction,
+ [EXIT_REASON_VMREAD] = handle_vmx_instruction,
+ [EXIT_REASON_VMRESUME] = handle_vmx_instruction,
+ [EXIT_REASON_VMWRITE] = handle_vmx_instruction,
+ [EXIT_REASON_VMOFF] = handle_vmx_instruction,
+ [EXIT_REASON_VMON] = handle_vmx_instruction,
+ [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
+ [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
+ [EXIT_REASON_APIC_WRITE] = handle_apic_write,
+ [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced,
+ [EXIT_REASON_WBINVD] = kvm_emulate_wbinvd,
+ [EXIT_REASON_XSETBV] = kvm_emulate_xsetbv,
+ [EXIT_REASON_TASK_SWITCH] = handle_task_switch,
+ [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
+ [EXIT_REASON_GDTR_IDTR] = handle_desc,
+ [EXIT_REASON_LDTR_TR] = handle_desc,
+ [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
+ [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
+ [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
+ [EXIT_REASON_MWAIT_INSTRUCTION] = kvm_emulate_mwait,
+ [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap,
+ [EXIT_REASON_MONITOR_INSTRUCTION] = kvm_emulate_monitor,
+ [EXIT_REASON_INVEPT] = handle_vmx_instruction,
+ [EXIT_REASON_INVVPID] = handle_vmx_instruction,
+ [EXIT_REASON_RDRAND] = kvm_handle_invalid_op,
+ [EXIT_REASON_RDSEED] = kvm_handle_invalid_op,
+ [EXIT_REASON_PML_FULL] = handle_pml_full,
+ [EXIT_REASON_INVPCID] = handle_invpcid,
+ [EXIT_REASON_VMFUNC] = handle_vmx_instruction,
+ [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer,
+ [EXIT_REASON_ENCLS] = handle_encls,
+ [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit,
+ [EXIT_REASON_NOTIFY] = handle_notify,
+ [EXIT_REASON_SEAMCALL] = handle_tdx_instruction,
+ [EXIT_REASON_TDCALL] = handle_tdx_instruction,
+ [EXIT_REASON_MSR_READ_IMM] = handle_rdmsr_imm,
+ [EXIT_REASON_MSR_WRITE_IMM] = handle_wrmsr_imm,
+};
+
+static const int kvm_vmx_max_exit_handlers =
+ ARRAY_SIZE(kvm_vmx_exit_handlers);
+
+void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
+ u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ *reason = vmx->vt.exit_reason.full;
+ *info1 = vmx_get_exit_qual(vcpu);
+ if (!(vmx->vt.exit_reason.failed_vmentry)) {
+ *info2 = vmx->idt_vectoring_info;
+ *intr_info = vmx_get_intr_info(vcpu);
+ if (is_exception_with_error_code(*intr_info))
+ *error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+ else
+ *error_code = 0;
+ } else {
+ *info2 = 0;
+ *intr_info = 0;
+ *error_code = 0;
+ }
+}
+
+void vmx_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code)
+{
+ *intr_info = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
+ if (is_exception_with_error_code(*intr_info))
+ *error_code = vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
+ else
+ *error_code = 0;
+}
+
+static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
+{
+ if (vmx->pml_pg) {
+ __free_page(vmx->pml_pg);
+ vmx->pml_pg = NULL;
+ }
+}
+
+static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u16 pml_idx, pml_tail_index;
+ u64 *pml_buf;
+ int i;
+
+ pml_idx = vmcs_read16(GUEST_PML_INDEX);
+
+ /* Do nothing if PML buffer is empty */
+ if (pml_idx == PML_HEAD_INDEX)
+ return;
+	/*
+	 * The PML index always points to the next available PML buffer entry
+	 * unless the PML log has just overflowed.
+	 */
+ pml_tail_index = (pml_idx >= PML_LOG_NR_ENTRIES) ? 0 : pml_idx + 1;
+
+ /*
+ * PML log is written backwards: the CPU first writes the entry 511
+ * then the entry 510, and so on.
+ *
+ * Read the entries in the same order they were written, to ensure that
+ * the dirty ring is filled in the same order the CPU wrote them.
+ */
+ pml_buf = page_address(vmx->pml_pg);
+
+ for (i = PML_HEAD_INDEX; i >= pml_tail_index; i--) {
+ u64 gpa;
+
+ gpa = pml_buf[i];
+ WARN_ON(gpa & (PAGE_SIZE - 1));
+ kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
+ }
+
+ /* reset PML index */
+ vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX);
+}
+
+static void vmx_dump_sel(char *name, uint32_t sel)
+{
+ pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
+ name, vmcs_read16(sel),
+ vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
+ vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
+ vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
+}
+
+static void vmx_dump_dtsel(char *name, uint32_t limit)
+{
+ pr_err("%s limit=0x%08x, base=0x%016lx\n",
+ name, vmcs_read32(limit),
+ vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
+}
+
+static void vmx_dump_msrs(char *name, struct vmx_msrs *m)
+{
+ unsigned int i;
+ struct vmx_msr_entry *e;
+
+ pr_err("MSR %s:\n", name);
+ for (i = 0, e = m->val; i < m->nr; ++i, ++e)
+ pr_err(" %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value);
+}
+
+void dump_vmcs(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 vmentry_ctl, vmexit_ctl;
+ u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
+ u64 tertiary_exec_control;
+ unsigned long cr4;
+ int efer_slot;
+
+ if (!dump_invalid_vmcs) {
+ pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");
+ return;
+ }
+
+ vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
+ vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
+ cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+ pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
+ cr4 = vmcs_readl(GUEST_CR4);
+
+ if (cpu_has_secondary_exec_ctrls())
+ secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+ else
+ secondary_exec_control = 0;
+
+ if (cpu_has_tertiary_exec_ctrls())
+ tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL);
+ else
+ tertiary_exec_control = 0;
+
+ pr_err("VMCS %p, last attempted VM-entry on CPU %d\n",
+ vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu);
+ pr_err("*** Guest State ***\n");
+ pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
+ vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
+ vmcs_readl(CR0_GUEST_HOST_MASK));
+ pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
+ cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
+ pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
+ if (cpu_has_vmx_ept()) {
+ pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n",
+ vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
+ pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n",
+ vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3));
+ }
+ pr_err("RSP = 0x%016lx RIP = 0x%016lx\n",
+ vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
+ pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n",
+ vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
+ pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
+ vmcs_readl(GUEST_SYSENTER_ESP),
+ vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
+ vmx_dump_sel("CS: ", GUEST_CS_SELECTOR);
+ vmx_dump_sel("DS: ", GUEST_DS_SELECTOR);
+ vmx_dump_sel("SS: ", GUEST_SS_SELECTOR);
+ vmx_dump_sel("ES: ", GUEST_ES_SELECTOR);
+ vmx_dump_sel("FS: ", GUEST_FS_SELECTOR);
+ vmx_dump_sel("GS: ", GUEST_GS_SELECTOR);
+ vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
+ vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
+ vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
+ vmx_dump_sel("TR: ", GUEST_TR_SELECTOR);
+ efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER);
+ if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER)
+ pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER));
+ else if (efer_slot >= 0)
+ pr_err("EFER= 0x%016llx (autoload)\n",
+ vmx->msr_autoload.guest.val[efer_slot].value);
+ else if (vmentry_ctl & VM_ENTRY_IA32E_MODE)
+ pr_err("EFER= 0x%016llx (effective)\n",
+ vcpu->arch.efer | (EFER_LMA | EFER_LME));
+ else
+ pr_err("EFER= 0x%016llx (effective)\n",
+ vcpu->arch.efer & ~(EFER_LMA | EFER_LME));
+ if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT)
+ pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT));
+ pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n",
+ vmcs_read64(GUEST_IA32_DEBUGCTL),
+ vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
+ if (cpu_has_load_perf_global_ctrl() &&
+ vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
+ pr_err("PerfGlobCtl = 0x%016llx\n",
+ vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
+ if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
+ pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
+ pr_err("Interruptibility = %08x ActivityState = %08x\n",
+ vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
+ vmcs_read32(GUEST_ACTIVITY_STATE));
+ if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
+ pr_err("InterruptStatus = %04x\n",
+ vmcs_read16(GUEST_INTR_STATUS));
+ if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0)
+ vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest);
+ if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0)
+ vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest);
+
+ if (vmentry_ctl & VM_ENTRY_LOAD_CET_STATE)
+ pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n",
+ vmcs_readl(GUEST_S_CET), vmcs_readl(GUEST_SSP),
+ vmcs_readl(GUEST_INTR_SSP_TABLE));
+ pr_err("*** Host State ***\n");
+ pr_err("RIP = 0x%016lx RSP = 0x%016lx\n",
+ vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
+ pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
+ vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
+ vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
+ vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
+ vmcs_read16(HOST_TR_SELECTOR));
+ pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
+ vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
+ vmcs_readl(HOST_TR_BASE));
+ pr_err("GDTBase=%016lx IDTBase=%016lx\n",
+ vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
+ pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
+ vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
+ vmcs_readl(HOST_CR4));
+ pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
+ vmcs_readl(HOST_IA32_SYSENTER_ESP),
+ vmcs_read32(HOST_IA32_SYSENTER_CS),
+ vmcs_readl(HOST_IA32_SYSENTER_EIP));
+ if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER)
+ pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER));
+ if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT)
+ pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT));
+ if (cpu_has_load_perf_global_ctrl() &&
+ vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
+ pr_err("PerfGlobCtl = 0x%016llx\n",
+ vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
+ if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0)
+ vmx_dump_msrs("host autoload", &vmx->msr_autoload.host);
+ if (vmexit_ctl & VM_EXIT_LOAD_CET_STATE)
+ pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n",
+ vmcs_readl(HOST_S_CET), vmcs_readl(HOST_SSP),
+ vmcs_readl(HOST_INTR_SSP_TABLE));
+
+ pr_err("*** Control State ***\n");
+ pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n",
+ cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control);
+ pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n",
+ pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl);
+ pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
+ vmcs_read32(EXCEPTION_BITMAP),
+ vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
+ vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
+ pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
+ vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
+ vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
+ vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
+ pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
+ vmcs_read32(VM_EXIT_INTR_INFO),
+ vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
+ vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
+ pr_err(" reason=%08x qualification=%016lx\n",
+ vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
+ pr_err("IDTVectoring: info=%08x errcode=%08x\n",
+ vmcs_read32(IDT_VECTORING_INFO_FIELD),
+ vmcs_read32(IDT_VECTORING_ERROR_CODE));
+ pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET));
+ if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
+ pr_err("TSC Multiplier = 0x%016llx\n",
+ vmcs_read64(TSC_MULTIPLIER));
+ if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) {
+ if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
+ u16 status = vmcs_read16(GUEST_INTR_STATUS);
+ pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff);
+ }
+ pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
+ if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
+ pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR));
+ pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR));
+ }
+ if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
+ pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
+ if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
+ pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
+ if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
+ pr_err("PLE Gap=%08x Window=%08x\n",
+ vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
+ if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
+ pr_err("Virtual processor ID = 0x%04x\n",
+ vmcs_read16(VIRTUAL_PROCESSOR_ID));
+ if (secondary_exec_control & SECONDARY_EXEC_EPT_VIOLATION_VE) {
+ struct vmx_ve_information *ve_info = vmx->ve_info;
+ u64 ve_info_pa = vmcs_read64(VE_INFORMATION_ADDRESS);
+
+ /*
+ * If KVM is dumping the VMCS, then something has gone wrong
+		 * already. Dereferencing an address from the VMCS, which could
+ * very well be corrupted, is a terrible idea. The virtual
+ * address is known so use it.
+ */
+ pr_err("VE info address = 0x%016llx%s\n", ve_info_pa,
+ ve_info_pa == __pa(ve_info) ? "" : "(corrupted!)");
+ pr_err("ve_info: 0x%08x 0x%08x 0x%016llx 0x%016llx 0x%016llx 0x%04x\n",
+ ve_info->exit_reason, ve_info->delivery,
+ ve_info->exit_qualification,
+ ve_info->guest_linear_address,
+ ve_info->guest_physical_address, ve_info->eptp_index);
+ }
+}
+
+/*
+ * The guest has exited. See if we can fix it or if we need userspace
+ * assistance.
+ */
+static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);
+ u32 vectoring_info = vmx->idt_vectoring_info;
+ u16 exit_handler_index;
+
+	/*
+	 * Flush the logged GPAs from the PML buffer; this keeps dirty_bitmap
+	 * more up to date. Another benefit is that kvm_vm_ioctl_get_dirty_log
+	 * only needs to kick all vCPUs out of guest mode before querying
+	 * dirty_bitmap, as once a vCPU is in root mode its PML buffer must
+	 * have been flushed already. Note, PML is never enabled in hardware
+	 * while running L2.
+	 */
+ if (enable_pml && !is_guest_mode(vcpu))
+ vmx_flush_pml_buffer(vcpu);
+
+ /*
+ * KVM should never reach this point with a pending nested VM-Enter.
+ * More specifically, short-circuiting VM-Entry to emulate L2 due to
+ * invalid guest state should never happen as that means KVM knowingly
+ * allowed a nested VM-Enter with an invalid vmcs12. More below.
+ */
+ if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm))
+ return -EIO;
+
+ if (is_guest_mode(vcpu)) {
+ /*
+ * PML is never enabled when running L2, bail immediately if a
+ * PML full exit occurs as something is horribly wrong.
+ */
+ if (exit_reason.basic == EXIT_REASON_PML_FULL)
+ goto unexpected_vmexit;
+
+ /*
+ * The host physical addresses of some pages of guest memory
+ * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
+ * Page). The CPU may write to these pages via their host
+ * physical address while L2 is running, bypassing any
+ * address-translation-based dirty tracking (e.g. EPT write
+ * protection).
+ *
+ * Mark them dirty on every exit from L2 to prevent them from
+ * getting out of sync with dirty tracking.
+ */
+ nested_mark_vmcs12_pages_dirty(vcpu);
+
+ /*
+ * Synthesize a triple fault if L2 state is invalid. In normal
+ * operation, nested VM-Enter rejects any attempt to enter L2
+ * with invalid state. However, those checks are skipped if
+ * state is being stuffed via RSM or KVM_SET_NESTED_STATE. If
+ * L2 state is invalid, it means either L1 modified SMRAM state
+ * or userspace provided bad state. Synthesize TRIPLE_FAULT as
+ * doing so is architecturally allowed in the RSM case, and is
+ * the least awful solution for the userspace case without
+ * risking false positives.
+ */
+ if (vmx->vt.emulation_required) {
+ nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
+ return 1;
+ }
+
+ if (nested_vmx_reflect_vmexit(vcpu))
+ return 1;
+ }
+
+ /* If guest state is invalid, start emulating. L2 is handled above. */
+ if (vmx->vt.emulation_required)
+ return handle_invalid_guest_state(vcpu);
+
+ if (exit_reason.failed_vmentry) {
+ dump_vmcs(vcpu);
+ vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+ vcpu->run->fail_entry.hardware_entry_failure_reason
+ = exit_reason.full;
+ vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
+ return 0;
+ }
+
+ if (unlikely(vmx->fail)) {
+ dump_vmcs(vcpu);
+ vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+ vcpu->run->fail_entry.hardware_entry_failure_reason
+ = vmcs_read32(VM_INSTRUCTION_ERROR);
+ vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
+ return 0;
+ }
+
+ if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
+ (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI &&
+ exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
+ exit_reason.basic != EXIT_REASON_PML_FULL &&
+ exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
+ exit_reason.basic != EXIT_REASON_TASK_SWITCH &&
+ exit_reason.basic != EXIT_REASON_NOTIFY &&
+ exit_reason.basic != EXIT_REASON_EPT_MISCONFIG)) {
+ kvm_prepare_event_vectoring_exit(vcpu, INVALID_GPA);
+ return 0;
+ }
+
+ if (unlikely(!enable_vnmi &&
+ vmx->loaded_vmcs->soft_vnmi_blocked)) {
+ if (!vmx_interrupt_blocked(vcpu)) {
+ vmx->loaded_vmcs->soft_vnmi_blocked = 0;
+ } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
+ vcpu->arch.nmi_pending) {
+			/*
+			 * The CPU doesn't help us find the end of an
+			 * NMI-blocked window if the guest runs with IRQs
+			 * disabled, so pull the trigger after 1 s of futile
+			 * waiting, but inform the user about it.
+			 */
+ printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
+ "state on VCPU %d after 1 s timeout\n",
+ __func__, vcpu->vcpu_id);
+ vmx->loaded_vmcs->soft_vnmi_blocked = 0;
+ }
+ }
+
+ if (exit_fastpath != EXIT_FASTPATH_NONE)
+ return 1;
+
+ if (exit_reason.basic >= kvm_vmx_max_exit_handlers)
+ goto unexpected_vmexit;
+#ifdef CONFIG_MITIGATION_RETPOLINE
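+	/*
+	 * Dispatch the hottest exit reasons directly to avoid the cost of a
+	 * retpolined indirect call through the exit handler table.
+	 */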
+ if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
+ return kvm_emulate_wrmsr(vcpu);
+ else if (exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM)
+ return handle_wrmsr_imm(vcpu);
+ else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER)
+ return handle_preemption_timer(vcpu);
+ else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW)
+ return handle_interrupt_window(vcpu);
+ else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
+ return handle_external_interrupt(vcpu);
+ else if (exit_reason.basic == EXIT_REASON_HLT)
+ return kvm_emulate_halt(vcpu);
+ else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG)
+ return handle_ept_misconfig(vcpu);
+#endif
+
+ exit_handler_index = array_index_nospec((u16)exit_reason.basic,
+ kvm_vmx_max_exit_handlers);
+ if (!kvm_vmx_exit_handlers[exit_handler_index])
+ goto unexpected_vmexit;
+
+ return kvm_vmx_exit_handlers[exit_handler_index](vcpu);
+
+unexpected_vmexit:
+ dump_vmcs(vcpu);
+ kvm_prepare_unexpected_reason_exit(vcpu, exit_reason.full);
+ return 0;
+}
+
+int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
+{
+ int ret = __vmx_handle_exit(vcpu, exit_fastpath);
+
+	/*
+	 * Exit to user space when a bus lock is detected, to inform userspace
+	 * that a bus lock occurred in the guest.
+	 */
+ if (vmx_get_exit_reason(vcpu).bus_lock_detected) {
+ if (ret > 0)
+ vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
+
+ vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
+ return 0;
+ }
+ return ret;
+}
+
+void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ int tpr_threshold;
+
+ if (is_guest_mode(vcpu) &&
+ nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
+ return;
+
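+	/*
+	 * No TPR-below-threshold exit is needed if no IRQ is pending or the
+	 * pending IRQ is already deliverable (TPR below its priority);
+	 * otherwise arm the threshold so an exit fires once the TPR drops
+	 * below the pending IRQ's priority.
+	 */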
+ tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr;
+ if (is_guest_mode(vcpu))
+ to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold;
+ else
+ vmcs_write32(TPR_THRESHOLD, tpr_threshold);
+}
+
+void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 sec_exec_control;
+
+ if (!lapic_in_kernel(vcpu))
+ return;
+
+ if (!flexpriority_enabled &&
+ !cpu_has_vmx_virtualize_x2apic_mode())
+ return;
+
+ /* Postpone execution until vmcs01 is the current VMCS. */
+ if (is_guest_mode(vcpu)) {
+ vmx->nested.change_vmcs01_virtual_apic_mode = true;
+ return;
+ }
+
+ sec_exec_control = secondary_exec_controls_get(vmx);
+ sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
+
+ switch (kvm_get_apic_mode(vcpu)) {
+ case LAPIC_MODE_INVALID:
+ WARN_ONCE(true, "Invalid local APIC state");
+ break;
+ case LAPIC_MODE_DISABLED:
+ break;
+ case LAPIC_MODE_XAPIC:
+ if (flexpriority_enabled) {
+ sec_exec_control |=
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
+
+			/*
+			 * Flush the TLB; reloading the APIC access page will
+			 * only do so if its physical address has changed, but
+			 * the guest may have inserted a non-APIC mapping into
+			 * the TLB while the APIC access page was disabled.
+			 */
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+ }
+ break;
+ case LAPIC_MODE_X2APIC:
+ if (cpu_has_vmx_virtualize_x2apic_mode())
+ sec_exec_control |=
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+ break;
+ }
+ secondary_exec_controls_set(vmx, sec_exec_control);
+
+ vmx_update_msr_bitmap_x2apic(vcpu);
+}
+
+void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
+{
+ const gfn_t gfn = APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT;
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_memslots *slots = kvm_memslots(kvm);
+ struct kvm_memory_slot *slot;
+ struct page *refcounted_page;
+ unsigned long mmu_seq;
+ kvm_pfn_t pfn;
+ bool writable;
+
+ /* Defer reload until vmcs01 is the current VMCS. */
+ if (is_guest_mode(vcpu)) {
+ to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true;
+ return;
+ }
+
+ if (!(secondary_exec_controls_get(to_vmx(vcpu)) &
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+ return;
+
+ /*
+ * Explicitly grab the memslot using KVM's internal slot ID to ensure
+ * KVM doesn't unintentionally grab a userspace memslot. It _should_
+ * be impossible for userspace to create a memslot for the APIC when
+ * APICv is enabled, but paranoia won't hurt in this case.
+ */
+ slot = id_to_memslot(slots, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT);
+ if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
+ return;
+
+ /*
+ * Ensure that the mmu_notifier sequence count is read before KVM
+ * retrieves the pfn from the primary MMU. Note, the memslot is
+ * protected by SRCU, not the mmu_notifier. Pairs with the smp_wmb()
+ * in kvm_mmu_invalidate_end().
+ */
+ mmu_seq = kvm->mmu_invalidate_seq;
+ smp_rmb();
+
+ /*
+ * No need to retry if the memslot does not exist or is invalid. KVM
+ * controls the APIC-access page memslot, and only deletes the memslot
+ * if APICv is permanently inhibited, i.e. the memslot won't reappear.
+ */
+ pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, &writable, &refcounted_page);
+ if (is_error_noslot_pfn(pfn))
+ return;
+
+ read_lock(&vcpu->kvm->mmu_lock);
+ if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn))
+ kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
+ else
+ vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn));
+
+	/*
+	 * Do not pin the APIC access page in memory so that it can be freely
+	 * migrated; the MMU notifier will call us again if it is migrated or
+	 * swapped out. KVM backs the memslot with anonymous memory, so the
+	 * pfn should always point at a refcounted page (if the pfn is valid).
+	 */
+ if (!WARN_ON_ONCE(!refcounted_page))
+ kvm_release_page_clean(refcounted_page);
+
+ /*
+ * No need for a manual TLB flush at this point, KVM has already done a
+ * flush if there were SPTEs pointing at the previous page.
+ */
+ read_unlock(&vcpu->kvm->mmu_lock);
+}
+
+void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
+{
+ u16 status;
+ u8 old;
+
+	/*
+	 * If L2 is active, defer the SVI update until vmcs01 is loaded, as SVI
+	 * is relevant if and only if Virtual Interrupt Delivery is enabled in
+	 * vmcs12, and if VID is enabled then L2 EOIs affect L2's vAPIC, not
+	 * L1's vAPIC. KVM must update vmcs01 on the next nested VM-Exit,
+	 * otherwise L1 will run with a stale SVI.
+	 */
+ if (is_guest_mode(vcpu)) {
+ /*
+ * KVM is supposed to forward intercepted L2 EOIs to L1 if VID
+ * is enabled in vmcs12; as above, the EOIs affect L2's vAPIC.
+ * Note, userspace can stuff state while L2 is active; assert
+ * that VID is disabled if and only if the vCPU is in KVM_RUN
+ * to avoid false positives if userspace is setting APIC state.
+ */
+ WARN_ON_ONCE(vcpu->wants_to_run &&
+ nested_cpu_has_vid(get_vmcs12(vcpu)));
+ to_vmx(vcpu)->nested.update_vmcs01_hwapic_isr = true;
+ return;
+ }
+
+ if (max_isr == -1)
+ max_isr = 0;
+
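+	/* The low byte of GUEST_INTR_STATUS is RVI, the high byte is SVI. */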
+ status = vmcs_read16(GUEST_INTR_STATUS);
+ old = status >> 8;
+ if (max_isr != old) {
+ status &= 0xff;
+ status |= max_isr << 8;
+ vmcs_write16(GUEST_INTR_STATUS, status);
+ }
+}
+
+static void vmx_set_rvi(int vector)
+{
+ u16 status;
+ u8 old;
+
+ if (vector == -1)
+ vector = 0;
+
+ status = vmcs_read16(GUEST_INTR_STATUS);
+ old = (u8)status & 0xff;
+ if ((u8)vector != old) {
+ status &= ~0xff;
+ status |= (u8)vector;
+ vmcs_write16(GUEST_INTR_STATUS, status);
+ }
+}
+
+int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vt *vt = to_vt(vcpu);
+ int max_irr;
+ bool got_posted_interrupt;
+
+ if (KVM_BUG_ON(!enable_apicv, vcpu->kvm))
+ return -EIO;
+
+ if (pi_test_on(&vt->pi_desc)) {
+ pi_clear_on(&vt->pi_desc);
+ /*
+ * IOMMU can write to PID.ON, so the barrier matters even on UP.
+ * But on x86 this is just a compiler barrier anyway.
+ */
+ smp_mb__after_atomic();
+ got_posted_interrupt =
+ kvm_apic_update_irr(vcpu, vt->pi_desc.pir, &max_irr);
+ } else {
+ max_irr = kvm_lapic_find_highest_irr(vcpu);
+ got_posted_interrupt = false;
+ }
+
+ /*
+ * Newly recognized interrupts are injected via either virtual interrupt
+ * delivery (RVI) or KVM_REQ_EVENT. Virtual interrupt delivery is
+ * disabled in two cases:
+ *
+ * 1) If L2 is running and the vCPU has a new pending interrupt. If L1
+ * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a
+ * VM-Exit to L1. If L1 doesn't want to exit, the interrupt is injected
+ * into L2, but KVM doesn't use virtual interrupt delivery to inject
+ * interrupts into L2, and so KVM_REQ_EVENT is again needed.
+ *
+ * 2) If APICv is disabled for this vCPU, assigned devices may still
+ * attempt to post interrupts. The posted interrupt vector will cause
+ * a VM-Exit and the subsequent entry will call sync_pir_to_irr.
+ */
+ if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu))
+ vmx_set_rvi(max_irr);
+ else if (got_posted_interrupt)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ return max_irr;
+}
+
+void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+{
+ if (!kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
+ vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
+ vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
+ vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
+}
+
+void vmx_do_interrupt_irqoff(unsigned long entry);
+void vmx_do_nmi_irqoff(void);
+
+static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
+{
+ /*
+	 * Save xfd_err to guest_fpu before interrupts are enabled, so the
+	 * MSR value is not clobbered by host activity before the guest
+	 * has a chance to consume it.
+ *
+ * Update the guest's XFD_ERR if and only if XFD is enabled, as the #NM
+ * interception may have been caused by L1 interception. Per the SDM,
+ * XFD_ERR is not modified for non-XFD #NM, i.e. if CR0.TS=1.
+ *
+ * Note, XFD_ERR is updated _before_ the #NM interception check, i.e.
+ * unlike CR2 and DR6, the value is not a payload that is attached to
+ * the #NM exception.
+ */
+ if (is_xfd_nm_fault(vcpu))
+ rdmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
+}
+
+static void handle_exception_irqoff(struct kvm_vcpu *vcpu, u32 intr_info)
+{
+ /* if exit due to PF check for async PF */
+ if (is_page_fault(intr_info))
+ vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
+ /* if exit due to NM, handle before interrupts are enabled */
+ else if (is_nm_fault(intr_info))
+ handle_nm_fault_irqoff(vcpu);
+ /* Handle machine checks before interrupts are enabled */
+ else if (is_machine_check(intr_info))
+ kvm_machine_check();
+}
+
+static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu,
+ u32 intr_info)
+{
+ unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
+
+ if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
+ "unexpected VM-Exit interrupt info: 0x%x", intr_info))
+ return;
+
+ /*
+ * Invoke the kernel's IRQ handler for the vector. Use the FRED path
+ * when it's available even if FRED isn't fully enabled, e.g. even if
+ * FRED isn't supported in hardware, in order to avoid the indirect
+ * CALL in the non-FRED path.
+ */
+ kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
+ if (IS_ENABLED(CONFIG_X86_FRED))
+ fred_entry_from_kvm(EVENT_TYPE_EXTINT, vector);
+ else
+ vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base + vector));
+ kvm_after_interrupt(vcpu);
+
+ vcpu->arch.at_instruction_boundary = true;
+}
+
+void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
+{
+ if (to_vt(vcpu)->emulation_required)
+ return;
+
+ switch (vmx_get_exit_reason(vcpu).basic) {
+ case EXIT_REASON_EXTERNAL_INTERRUPT:
+ handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu));
+ break;
+ case EXIT_REASON_EXCEPTION_NMI:
+ handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu));
+ break;
+ case EXIT_REASON_MCE_DURING_VMENTRY:
+ kvm_machine_check();
+ break;
+ default:
+ break;
+ }
+}
+
+/*
+ * The kvm parameter can be NULL (module initialization, or invocation before
+ * VM creation). Be sure to check the kvm parameter before using it.
+ */
+bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
+{
+ switch (index) {
+ case MSR_IA32_SMBASE:
+ if (!IS_ENABLED(CONFIG_KVM_SMM))
+ return false;
+ /*
+ * We cannot do SMM unless we can run the guest in big
+ * real mode.
+ */
+ return enable_unrestricted_guest || emulate_invalid_guest_state;
+ case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
+ return nested;
+ case MSR_AMD64_VIRT_SPEC_CTRL:
+ case MSR_AMD64_TSC_RATIO:
+ /* This is AMD only. */
+ return false;
+ default:
+ return true;
+ }
+}
+
+static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
+{
+ u32 exit_intr_info;
+ bool unblock_nmi;
+ u8 vector;
+ bool idtv_info_valid;
+
+ idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
+
+ if (enable_vnmi) {
+ if (vmx->loaded_vmcs->nmi_known_unmasked)
+ return;
+
+ exit_intr_info = vmx_get_intr_info(&vmx->vcpu);
+ unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
+ vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
+ /*
+ * SDM 3: 27.7.1.2 (September 2008)
+ * Re-set bit "block by NMI" before VM entry if vmexit caused by
+ * a guest IRET fault.
+ * SDM 3: 23.2.2 (September 2008)
+ * Bit 12 is undefined in any of the following cases:
+ * If the VM exit sets the valid bit in the IDT-vectoring
+ * information field.
+ * If the VM exit is due to a double fault.
+ */
+ if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
+ vector != DF_VECTOR && !idtv_info_valid)
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+ else
+ vmx->loaded_vmcs->nmi_known_unmasked =
+ !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
+ & GUEST_INTR_STATE_NMI);
+ } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
+ vmx->loaded_vmcs->vnmi_blocked_time +=
+ ktime_to_ns(ktime_sub(ktime_get(),
+ vmx->loaded_vmcs->entry_time));
+}
+
+static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
+ u32 idt_vectoring_info,
+ int instr_len_field,
+ int error_code_field)
+{
+ u8 vector;
+ int type;
+ bool idtv_info_valid;
+
+ idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
+
+ vcpu->arch.nmi_injected = false;
+ kvm_clear_exception_queue(vcpu);
+ kvm_clear_interrupt_queue(vcpu);
+
+ if (!idtv_info_valid)
+ return;
+
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
+ type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
+
+ switch (type) {
+ case INTR_TYPE_NMI_INTR:
+ vcpu->arch.nmi_injected = true;
+ /*
+ * SDM 3: 27.7.1.2 (September 2008)
+ * Clear bit "block by NMI" before VM entry if a NMI
+ * delivery faulted.
+ */
+ vmx_set_nmi_mask(vcpu, false);
+ break;
+ case INTR_TYPE_SOFT_EXCEPTION:
+ vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
+ fallthrough;
+ case INTR_TYPE_HARD_EXCEPTION: {
+ u32 error_code = 0;
+
+ if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK)
+ error_code = vmcs_read32(error_code_field);
+
+ kvm_requeue_exception(vcpu, vector,
+ idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK,
+ error_code);
+ break;
+ }
+ case INTR_TYPE_SOFT_INTR:
+ vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
+ fallthrough;
+ case INTR_TYPE_EXT_INTR:
+ kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
+ break;
+ default:
+ break;
+ }
+}
+
+static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
+{
+ __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
+ VM_EXIT_INSTRUCTION_LEN,
+ IDT_VECTORING_ERROR_CODE);
+}
+
+void vmx_cancel_injection(struct kvm_vcpu *vcpu)
+{
+ __vmx_complete_interrupts(vcpu,
+ vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
+ VM_ENTRY_INSTRUCTION_LEN,
+ VM_ENTRY_EXCEPTION_ERROR_CODE);
+
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
+}
+
+static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
+{
+ int i, nr_msrs;
+ struct perf_guest_switch_msr *msrs;
+ struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu);
+
+ pmu->host_cross_mapped_mask = 0;
+ if (pmu->pebs_enable & pmu->global_ctrl)
+ intel_pmu_cross_mapped_check(pmu);
+
+ /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */
+ msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu);
+ if (!msrs)
+ return;
+
+ for (i = 0; i < nr_msrs; i++)
+ if (msrs[i].host == msrs[i].guest)
+ clear_atomic_switch_msr(vmx, msrs[i].msr);
+ else
+ add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
+ msrs[i].host, false);
+}
+
+static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u64 tscl;
+ u32 delta_tsc;
+
+ if (force_immediate_exit) {
+ vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
+ vmx->loaded_vmcs->hv_timer_soft_disabled = false;
+ } else if (vmx->hv_deadline_tsc != -1) {
+ tscl = rdtsc();
+ if (vmx->hv_deadline_tsc > tscl)
+ /* set_hv_timer ensures the delta fits in 32-bits */
+ delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
+ cpu_preemption_timer_multi);
+ else
+ delta_tsc = 0;
+
+ vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
+ vmx->loaded_vmcs->hv_timer_soft_disabled = false;
+ } else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) {
+ vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1);
+ vmx->loaded_vmcs->hv_timer_soft_disabled = true;
+ }
+}
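+
+/*
+ * For illustration (hypothetical numbers): the preemption timer counts down
+ * once every 2^cpu_preemption_timer_multi TSC cycles.  With a deadline that
+ * is 10,000,000 TSC cycles away and a rate shift of 5, the value programmed
+ * into VMX_PREEMPTION_TIMER_VALUE above is 10,000,000 >> 5 = 312,500.
+ */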
+
+void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
+{
+ if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
+ vmx->loaded_vmcs->host_state.rsp = host_rsp;
+ vmcs_writel(HOST_RSP, host_rsp);
+ }
+}
+
+void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
+ unsigned int flags)
+{
+ u64 hostval = this_cpu_read(x86_spec_ctrl_current);
+
+ if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL))
+ return;
+
+ if (flags & VMX_RUN_SAVE_SPEC_CTRL)
+ vmx->spec_ctrl = native_rdmsrq(MSR_IA32_SPEC_CTRL);
+
+ /*
+ * If the guest/host SPEC_CTRL values differ, restore the host value.
+ *
+ * For legacy IBRS, the IBRS bit always needs to be written after
+ * transitioning from a less privileged predictor mode, regardless of
+ * whether the guest/host values differ.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) ||
+ vmx->spec_ctrl != hostval)
+ native_wrmsrq(MSR_IA32_SPEC_CTRL, hostval);
+
+ barrier_nospec();
+}
+
+static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu,
+ bool force_immediate_exit)
+{
+ /*
+	 * If L2 is active, some VMX preemption timer exits can be handled in
+	 * the fastpath; all other exits must use the slow path.
+ */
+ if (is_guest_mode(vcpu) &&
+ vmx_get_exit_reason(vcpu).basic != EXIT_REASON_PREEMPTION_TIMER)
+ return EXIT_FASTPATH_NONE;
+
+ switch (vmx_get_exit_reason(vcpu).basic) {
+ case EXIT_REASON_MSR_WRITE:
+ return handle_fastpath_wrmsr(vcpu);
+ case EXIT_REASON_MSR_WRITE_IMM:
+ return handle_fastpath_wrmsr_imm(vcpu, vmx_get_exit_qual(vcpu),
+ vmx_get_msr_imm_reg(vcpu));
+ case EXIT_REASON_PREEMPTION_TIMER:
+ return handle_fastpath_preemption_timer(vcpu, force_immediate_exit);
+ case EXIT_REASON_HLT:
+ return handle_fastpath_hlt(vcpu);
+ case EXIT_REASON_INVD:
+ return handle_fastpath_invd(vcpu);
+ default:
+ return EXIT_FASTPATH_NONE;
+ }
+}
+
+noinstr void vmx_handle_nmi(struct kvm_vcpu *vcpu)
+{
+ if ((u16)vmx_get_exit_reason(vcpu).basic != EXIT_REASON_EXCEPTION_NMI ||
+ !is_nmi(vmx_get_intr_info(vcpu)))
+ return;
+
+ kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
+ if (cpu_feature_enabled(X86_FEATURE_FRED))
+ fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR);
+ else
+ vmx_do_nmi_irqoff();
+ kvm_after_interrupt(vcpu);
+}
+
+static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
+ unsigned int flags)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ guest_state_enter_irqoff();
+
+ vmx_l1d_flush(vcpu);
+
+ vmx_disable_fb_clear(vmx);
+
+ if (vcpu->arch.cr2 != native_read_cr2())
+ native_write_cr2(vcpu->arch.cr2);
+
+ vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
+ flags);
+
+ vcpu->arch.cr2 = native_read_cr2();
+ vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET;
+
+ vmx->idt_vectoring_info = 0;
+
+ vmx_enable_fb_clear(vmx);
+
+ if (unlikely(vmx->fail)) {
+ vmx->vt.exit_reason.full = 0xdead;
+ goto out;
+ }
+
+ vmx->vt.exit_reason.full = vmcs_read32(VM_EXIT_REASON);
+ if (likely(!vmx_get_exit_reason(vcpu).failed_vmentry))
+ vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+
+ vmx_handle_nmi(vcpu);
+
+out:
+ guest_state_exit_irqoff();
+}
+
+fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
+{
+ bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long cr3, cr4;
+
+ /* Record the guest's net vcpu time for enforced NMI injections. */
+ if (unlikely(!enable_vnmi &&
+ vmx->loaded_vmcs->soft_vnmi_blocked))
+ vmx->loaded_vmcs->entry_time = ktime_get();
+
+ /*
+	 * Don't enter VMX if guest state is invalid; let the exit handler
+	 * start emulation until we arrive back at a valid state. Synthesize a
+ * consistency check VM-Exit due to invalid guest state and bail.
+ */
+ if (unlikely(vmx->vt.emulation_required)) {
+ vmx->fail = 0;
+
+ vmx->vt.exit_reason.full = EXIT_REASON_INVALID_STATE;
+ vmx->vt.exit_reason.failed_vmentry = 1;
+ kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1);
+ vmx->vt.exit_qualification = ENTRY_FAIL_DEFAULT;
+ kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2);
+ vmx->vt.exit_intr_info = 0;
+ return EXIT_FASTPATH_NONE;
+ }
+
+ trace_kvm_entry(vcpu, force_immediate_exit);
+
+ if (vmx->ple_window_dirty) {
+ vmx->ple_window_dirty = false;
+ vmcs_write32(PLE_WINDOW, vmx->ple_window);
+ }
+
+ /*
+ * We did this in prepare_switch_to_guest, because it needs to
+ * be within srcu_read_lock.
+ */
+ WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync);
+
+ if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP))
+ vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
+ if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP))
+ vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+ vcpu->arch.regs_dirty = 0;
+
+ if (run_flags & KVM_RUN_LOAD_GUEST_DR6)
+ set_debugreg(vcpu->arch.dr6, 6);
+
+ if (run_flags & KVM_RUN_LOAD_DEBUGCTL)
+ vmx_reload_guest_debugctl(vcpu);
+
+ /*
+ * Refresh vmcs.HOST_CR3 if necessary. This must be done immediately
+ * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
+ * it switches back to the current->mm, which can occur in KVM context
+ * when switching to a temporary mm to patch kernel code, e.g. if KVM
+ * toggles a static key while handling a VM-Exit.
+ */
+ cr3 = __get_current_cr3_fast();
+ if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
+ vmcs_writel(HOST_CR3, cr3);
+ vmx->loaded_vmcs->host_state.cr3 = cr3;
+ }
+
+ cr4 = cr4_read_shadow();
+ if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
+ vmcs_writel(HOST_CR4, cr4);
+ vmx->loaded_vmcs->host_state.cr4 = cr4;
+ }
+
+ /* When single-stepping over STI and MOV SS, we must clear the
+ * corresponding interruptibility bits in the guest state. Otherwise
+ * vmentry fails as it then expects bit 14 (BS) in pending debug
+ * exceptions being set, but that's not correct for the guest debugging
+ * case. */
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+ vmx_set_interrupt_shadow(vcpu, 0);
+
+ pt_guest_enter(vmx);
+
+ atomic_switch_perf_msrs(vmx);
+ if (intel_pmu_lbr_is_enabled(vcpu))
+ vmx_passthrough_lbr_msrs(vcpu);
+
+ if (enable_preemption_timer)
+ vmx_update_hv_timer(vcpu, force_immediate_exit);
+ else if (force_immediate_exit)
+ smp_send_reschedule(vcpu->cpu);
+
+ kvm_wait_lapic_expire(vcpu);
+
+ /* The actual VMENTER/EXIT is in the .noinstr.text section. */
+ vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_run_flags(vmx));
+
+ /* All fields are clean at this point */
+ if (kvm_is_using_evmcs()) {
+ current_evmcs->hv_clean_fields |=
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
+
+ current_evmcs->hv_vp_id = kvm_hv_get_vpindex(vcpu);
+ }
+
+ /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
+ if (vcpu->arch.host_debugctl)
+ update_debugctlmsr(vcpu->arch.host_debugctl);
+
+#ifndef CONFIG_X86_64
+ /*
+ * The sysexit path does not restore ds/es, so we must set them to
+ * a reasonable value ourselves.
+ *
+ * We can't defer this to vmx_prepare_switch_to_host() since that
+ * function may be executed in interrupt context, which saves and
+	 * restores segments around it, nullifying its effect.
+ */
+ loadsegment(ds, __USER_DS);
+ loadsegment(es, __USER_DS);
+#endif
+
+ pt_guest_exit(vmx);
+
+ if (is_guest_mode(vcpu)) {
+ /*
+		 * Track VMLAUNCH/VMRESUME that have made it past guest state
+ * checking.
+ */
+ if (vmx->nested.nested_run_pending &&
+ !vmx_get_exit_reason(vcpu).failed_vmentry)
+ ++vcpu->stat.nested_run;
+
+ vmx->nested.nested_run_pending = 0;
+ }
+
+ if (unlikely(vmx->fail))
+ return EXIT_FASTPATH_NONE;
+
+ trace_kvm_exit(vcpu, KVM_ISA_VMX);
+
+ if (unlikely(vmx_get_exit_reason(vcpu).failed_vmentry))
+ return EXIT_FASTPATH_NONE;
+
+ vmx->loaded_vmcs->launched = 1;
+
+ vmx_recover_nmi_blocking(vmx);
+ vmx_complete_interrupts(vmx);
+
+ return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit);
+}
+
+void vmx_vcpu_free(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (enable_pml)
+ vmx_destroy_pml_buffer(vmx);
+ free_vpid(vmx->vpid);
+ nested_vmx_free_vcpu(vcpu);
+ free_loaded_vmcs(vmx->loaded_vmcs);
+ free_page((unsigned long)vmx->ve_info);
+}
+
+int vmx_vcpu_create(struct kvm_vcpu *vcpu)
+{
+ struct vmx_uret_msr *tsx_ctrl;
+ struct vcpu_vmx *vmx;
+ int i, err;
+
+ BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
+ vmx = to_vmx(vcpu);
+
+ INIT_LIST_HEAD(&vmx->vt.pi_wakeup_list);
+
+ err = -ENOMEM;
+
+ vmx->vpid = allocate_vpid();
+
+ /*
+	 * If PML is turned on, failure to enable PML just results in failure
+	 * to create the vCPU, so the PML logic can be kept simple (there is
+	 * no need to handle cases such as enabling PML on only some vCPUs of
+	 * a guest).
+ */
+ if (enable_pml) {
+ vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!vmx->pml_pg)
+ goto free_vpid;
+ }
+
+ for (i = 0; i < kvm_nr_uret_msrs; ++i)
+ vmx->guest_uret_msrs[i].mask = -1ull;
+ if (boot_cpu_has(X86_FEATURE_RTM)) {
+ /*
+ * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception.
+ * Keep the host value unchanged to avoid changing CPUID bits
+ * under the host kernel's feet.
+ */
+ tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
+ if (tsx_ctrl)
+ tsx_ctrl->mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
+ }
+
+ err = alloc_loaded_vmcs(&vmx->vmcs01);
+ if (err < 0)
+ goto free_pml;
+
+ /*
+ * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a
+ * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the
+ * feature only for vmcs01, KVM currently isn't equipped to realize any
+ * performance benefits from enabling it for vmcs02.
+ */
+ if (kvm_is_using_evmcs() &&
+ (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
+ struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
+
+ evmcs->hv_enlightenments_control.msr_bitmap = 1;
+ }
+
+ vmx->loaded_vmcs = &vmx->vmcs01;
+
+ if (cpu_need_virtualize_apic_accesses(vcpu)) {
+ err = kvm_alloc_apic_access_page(vcpu->kvm);
+ if (err)
+ goto free_vmcs;
+ }
+
+ if (enable_ept && !enable_unrestricted_guest) {
+ err = init_rmode_identity_map(vcpu->kvm);
+ if (err)
+ goto free_vmcs;
+ }
+
+ err = -ENOMEM;
+ if (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_EPT_VIOLATION_VE) {
+ struct page *page;
+
+ BUILD_BUG_ON(sizeof(*vmx->ve_info) > PAGE_SIZE);
+
+ /* ve_info must be page aligned. */
+ page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!page)
+ goto free_vmcs;
+
+ vmx->ve_info = page_to_virt(page);
+ }
+
+ if (vmx_can_use_ipiv(vcpu))
+ WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id],
+ __pa(&vmx->vt.pi_desc) | PID_TABLE_ENTRY_VALID);
+
+ return 0;
+
+free_vmcs:
+ free_loaded_vmcs(vmx->loaded_vmcs);
+free_pml:
+ vmx_destroy_pml_buffer(vmx);
+free_vpid:
+ free_vpid(vmx->vpid);
+ return err;
+}
+
+#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
+#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
+
+int vmx_vm_init(struct kvm *kvm)
+{
+ if (!ple_gap)
+ kvm_disable_exits(kvm, KVM_X86_DISABLE_EXITS_PAUSE);
+
+ if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
+ switch (l1tf_mitigation) {
+ case L1TF_MITIGATION_OFF:
+ case L1TF_MITIGATION_FLUSH_NOWARN:
+ /* 'I explicitly don't care' is set */
+ break;
+ case L1TF_MITIGATION_AUTO:
+ case L1TF_MITIGATION_FLUSH:
+ case L1TF_MITIGATION_FLUSH_NOSMT:
+ case L1TF_MITIGATION_FULL:
+ /*
+ * Warn upon starting the first VM in a potentially
+ * insecure environment.
+ */
+ if (sched_smt_active())
+ pr_warn_once(L1TF_MSG_SMT);
+ if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
+ pr_warn_once(L1TF_MSG_L1D);
+ break;
+ case L1TF_MITIGATION_FULL_FORCE:
+ /* Flush is enforced */
+ break;
+ }
+ }
+
+ if (enable_pml)
+ kvm->arch.cpu_dirty_log_size = PML_LOG_NR_ENTRIES;
+ return 0;
+}
+
+static inline bool vmx_ignore_guest_pat(struct kvm *kvm)
+{
+ /*
+	 * Non-coherent DMA devices need the guest to flush CPU caches
+	 * properly. In that case it is not possible to map all guest RAM
+	 * as WB, so always trust guest PAT.
+ */
+ return !kvm_arch_has_noncoherent_dma(kvm) &&
+ kvm_check_has_quirk(kvm, KVM_X86_QUIRK_IGNORE_GUEST_PAT);
+}
+
+u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
+{
+ /*
+ * Force UC for host MMIO regions, as allowing the guest to access MMIO
+ * with cacheable accesses will result in Machine Checks.
+ */
+ if (is_mmio)
+ return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
+
+ /* Force WB if ignoring guest PAT */
+ if (vmx_ignore_guest_pat(vcpu->kvm))
+ return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
+
+ return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT);
+}
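+
+/*
+ * For illustration (assuming MTRR_TYPE_WRBACK=6, VMX_EPT_MT_EPTE_SHIFT=3 and
+ * VMX_EPT_IPAT_BIT=BIT(6)): host MMIO yields memtype UC in EPTE bits 5:3
+ * (value 0), ignoring guest PAT yields (6 << 3) | BIT(6) = 0x70, and honoring
+ * guest PAT yields 0x30, i.e. WB combined with the guest's PAT per the EPT
+ * memtype rules.
+ */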
+
+static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl)
+{
+ /*
+ * These bits in the secondary execution controls field
+	 * are dynamic; the others are mostly based on the hypervisor
+ * architecture and the guest's CPUID. Do not touch the
+ * dynamic bits.
+ */
+ u32 mask =
+ SECONDARY_EXEC_SHADOW_VMCS |
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+ SECONDARY_EXEC_DESC;
+
+ u32 cur_ctl = secondary_exec_controls_get(vmx);
+
+ secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask));
+}
+
+/*
+ * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
+ * (indicating "allowed-1") if they are supported in the guest's CPUID.
+ */
+static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct kvm_cpuid_entry2 *entry;
+
+ vmx->nested.msrs.cr0_fixed1 = 0xffffffff;
+ vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE;
+
+#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \
+ if (entry && (entry->_reg & (_cpuid_mask))) \
+ vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \
+} while (0)
+
+ entry = kvm_find_cpuid_entry(vcpu, 0x1);
+ cr4_fixed1_update(X86_CR4_VME, edx, feature_bit(VME));
+ cr4_fixed1_update(X86_CR4_PVI, edx, feature_bit(VME));
+ cr4_fixed1_update(X86_CR4_TSD, edx, feature_bit(TSC));
+ cr4_fixed1_update(X86_CR4_DE, edx, feature_bit(DE));
+ cr4_fixed1_update(X86_CR4_PSE, edx, feature_bit(PSE));
+ cr4_fixed1_update(X86_CR4_PAE, edx, feature_bit(PAE));
+ cr4_fixed1_update(X86_CR4_MCE, edx, feature_bit(MCE));
+ cr4_fixed1_update(X86_CR4_PGE, edx, feature_bit(PGE));
+ cr4_fixed1_update(X86_CR4_OSFXSR, edx, feature_bit(FXSR));
+ cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM));
+ cr4_fixed1_update(X86_CR4_VMXE, ecx, feature_bit(VMX));
+ cr4_fixed1_update(X86_CR4_SMXE, ecx, feature_bit(SMX));
+ cr4_fixed1_update(X86_CR4_PCIDE, ecx, feature_bit(PCID));
+ cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, feature_bit(XSAVE));
+
+ entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 0);
+ cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, feature_bit(FSGSBASE));
+ cr4_fixed1_update(X86_CR4_SMEP, ebx, feature_bit(SMEP));
+ cr4_fixed1_update(X86_CR4_SMAP, ebx, feature_bit(SMAP));
+ cr4_fixed1_update(X86_CR4_PKE, ecx, feature_bit(PKU));
+ cr4_fixed1_update(X86_CR4_UMIP, ecx, feature_bit(UMIP));
+ cr4_fixed1_update(X86_CR4_LA57, ecx, feature_bit(LA57));
+ cr4_fixed1_update(X86_CR4_CET, ecx, feature_bit(SHSTK));
+ cr4_fixed1_update(X86_CR4_CET, edx, feature_bit(IBT));
+
+ entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 1);
+ cr4_fixed1_update(X86_CR4_LAM_SUP, eax, feature_bit(LAM));
+
+#undef cr4_fixed1_update
+}
+
+static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct kvm_cpuid_entry2 *best = NULL;
+ int i;
+
+ for (i = 0; i < PT_CPUID_LEAVES; i++) {
+ best = kvm_find_cpuid_entry_index(vcpu, 0x14, i);
+ if (!best)
+ return;
+ vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax;
+ vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx;
+ vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx;
+ vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx;
+ }
+
+ /* Get the number of configurable Address Ranges for filtering */
+ vmx->pt_desc.num_address_ranges = intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_num_address_ranges);
+
+ /* Initialize and clear the no dependency bits */
+ vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS |
+ RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC |
+ RTIT_CTL_BRANCH_EN);
+
+ /*
+	 * If CPUID.(EAX=14H,ECX=0):EBX[0]=1, CR3Filter can be set; otherwise
+	 * setting it will inject a #GP.
+ */
+ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering))
+ vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN;
+
+ /*
+ * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and
+ * PSBFreq can be set
+ */
+ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc))
+ vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC |
+ RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ);
+
+ /*
+ * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn and MTCFreq can be set
+ */
+ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc))
+ vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN |
+ RTIT_CTL_MTC_RANGE);
+
+ /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */
+ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite))
+ vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW |
+ RTIT_CTL_PTW_EN);
+
+ /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */
+ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace))
+ vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN;
+
+ /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */
+ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output))
+ vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA;
+
+ /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */
+ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys))
+ vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;
+
+ /* unmask address range configure area */
+ for (i = 0; i < vmx->pt_desc.num_address_ranges; i++)
+ vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
+}
+
+void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * XSAVES is effectively enabled if and only if XSAVE is also exposed
+ * to the guest. XSAVES depends on CR4.OSXSAVE, and CR4.OSXSAVE can be
+ * set if and only if XSAVE is supported.
+ */
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE))
+ guest_cpu_cap_clear(vcpu, X86_FEATURE_XSAVES);
+
+ vmx_setup_uret_msrs(vmx);
+
+ if (cpu_has_secondary_exec_ctrls())
+ vmcs_set_secondary_exec_control(vmx,
+ vmx_secondary_exec_control(vmx));
+
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
+ vmx->msr_ia32_feature_control_valid_bits |=
+ FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
+ FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
+ else
+ vmx->msr_ia32_feature_control_valid_bits &=
+ ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
+ FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);
+
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
+ nested_vmx_cr_fixed1_bits_update(vcpu);
+
+ if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
+ guest_cpu_cap_has(vcpu, X86_FEATURE_INTEL_PT))
+ update_intel_pt_cfg(vcpu);
+
+ if (boot_cpu_has(X86_FEATURE_RTM)) {
+ struct vmx_uret_msr *msr;
+ msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
+ if (msr) {
+ bool enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_RTM);
+ vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
+ }
+ }
+
+ set_cr4_guest_host_mask(vmx);
+
+ vmx_write_encls_bitmap(vcpu, NULL);
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX))
+ vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED;
+ else
+ vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED;
+
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC))
+ vmx->msr_ia32_feature_control_valid_bits |=
+ FEAT_CTL_SGX_LC_ENABLED;
+ else
+ vmx->msr_ia32_feature_control_valid_bits &=
+ ~FEAT_CTL_SGX_LC_ENABLED;
+
+ /* Refresh #PF interception to account for MAXPHYADDR changes. */
+ vmx_update_exception_bitmap(vcpu);
+}
+
+static __init u64 vmx_get_perf_capabilities(void)
+{
+ u64 perf_cap = PERF_CAP_FW_WRITES;
+ u64 host_perf_cap = 0;
+
+ if (!enable_pmu)
+ return 0;
+
+ if (boot_cpu_has(X86_FEATURE_PDCM))
+ rdmsrq(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);
+
+ if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) {
+ x86_perf_get_lbr(&vmx_lbr_caps);
+
+ /*
+ * KVM requires LBR callstack support, as the overhead due to
+ * context switching LBRs without said support is too high.
+ * See intel_pmu_create_guest_lbr_event() for more info.
+ */
+ if (!vmx_lbr_caps.has_callstack)
+ memset(&vmx_lbr_caps, 0, sizeof(vmx_lbr_caps));
+ else if (vmx_lbr_caps.nr)
+ perf_cap |= host_perf_cap & PERF_CAP_LBR_FMT;
+ }
+
+ if (vmx_pebs_supported()) {
+ perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK;
+
+ /*
+ * Disallow adaptive PEBS as it is functionally broken, can be
+ * used by the guest to read *host* LBRs, and can be used to
+ * bypass userspace event filters. To correctly and safely
+ * support adaptive PEBS, KVM needs to:
+ *
+ * 1. Account for the ADAPTIVE flag when (re)programming fixed
+ * counters.
+ *
+ * 2. Gain support from perf (or take direct control of counter
+ * programming) to support events without adaptive PEBS
+ * enabled for the hardware counter.
+ *
+ * 3. Ensure LBR MSRs cannot hold host data on VM-Entry with
+ * adaptive PEBS enabled and MSR_PEBS_DATA_CFG.LBRS=1.
+ *
+ * 4. Document which PMU events are effectively exposed to the
+ * guest via adaptive PEBS, and make adaptive PEBS mutually
+ * exclusive with KVM_SET_PMU_EVENT_FILTER if necessary.
+ */
+ perf_cap &= ~PERF_CAP_PEBS_BASELINE;
+ }
+
+ return perf_cap;
+}
+
+static __init void vmx_set_cpu_caps(void)
+{
+ kvm_set_cpu_caps();
+
+ /* CPUID 0x1 */
+ if (nested)
+ kvm_cpu_cap_set(X86_FEATURE_VMX);
+
+ /* CPUID 0x7 */
+ if (kvm_mpx_supported())
+ kvm_cpu_cap_check_and_set(X86_FEATURE_MPX);
+ if (!cpu_has_vmx_invpcid())
+ kvm_cpu_cap_clear(X86_FEATURE_INVPCID);
+ if (vmx_pt_mode_is_host_guest())
+ kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
+ if (vmx_pebs_supported()) {
+ kvm_cpu_cap_check_and_set(X86_FEATURE_DS);
+ kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64);
+ }
+
+ if (!enable_pmu)
+ kvm_cpu_cap_clear(X86_FEATURE_PDCM);
+ kvm_caps.supported_perf_cap = vmx_get_perf_capabilities();
+
+ if (!enable_sgx) {
+ kvm_cpu_cap_clear(X86_FEATURE_SGX);
+ kvm_cpu_cap_clear(X86_FEATURE_SGX_LC);
+ kvm_cpu_cap_clear(X86_FEATURE_SGX1);
+ kvm_cpu_cap_clear(X86_FEATURE_SGX2);
+ kvm_cpu_cap_clear(X86_FEATURE_SGX_EDECCSSA);
+ }
+
+ if (vmx_umip_emulated())
+ kvm_cpu_cap_set(X86_FEATURE_UMIP);
+
+ /* CPUID 0xD.1 */
+ if (!cpu_has_vmx_xsaves())
+ kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
+
+ /* CPUID 0x80000001 and 0x7 (RDPID) */
+ if (!cpu_has_vmx_rdtscp()) {
+ kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
+ kvm_cpu_cap_clear(X86_FEATURE_RDPID);
+ }
+
+ if (cpu_has_vmx_waitpkg())
+ kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
+
+ /*
+	 * Disable CET if unrestricted_guest is unsupported, as KVM doesn't
+	 * enforce CET HW behaviors in the emulator. On platforms with
+	 * VMX_BASIC[bit56] == 0, injecting #CP at VM-Entry with an error
+	 * code fails, so disable CET in this case too.
+ */
+ if (!cpu_has_load_cet_ctrl() || !enable_unrestricted_guest ||
+ !cpu_has_vmx_basic_no_hw_errcode_cc()) {
+ kvm_cpu_cap_clear(X86_FEATURE_SHSTK);
+ kvm_cpu_cap_clear(X86_FEATURE_IBT);
+ }
+}
+
+static bool vmx_is_io_intercepted(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info,
+ unsigned long *exit_qualification)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ unsigned short port;
+ int size;
+ bool imm;
+
+ /*
+ * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
+ * VM-exits depend on the 'unconditional IO exiting' VM-execution
+ * control.
+ *
+ * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
+ */
+ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
+ return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
+
+ if (info->intercept == x86_intercept_in ||
+ info->intercept == x86_intercept_ins) {
+ port = info->src_val;
+ size = info->dst_bytes;
+ imm = info->src_type == OP_IMM;
+ } else {
+ port = info->dst_val;
+ size = info->src_bytes;
+ imm = info->dst_type == OP_IMM;
+ }
+
+ *exit_qualification = ((unsigned long)port << 16) | (size - 1);
+
+ if (info->intercept == x86_intercept_ins ||
+ info->intercept == x86_intercept_outs)
+ *exit_qualification |= BIT(4);
+
+ if (info->rep_prefix)
+ *exit_qualification |= BIT(5);
+
+ if (imm)
+ *exit_qualification |= BIT(6);
+
+ return nested_vmx_check_io_bitmaps(vcpu, port, size);
+}
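+
+/*
+ * For illustration (hypothetical instruction): a one-byte OUT to immediate
+ * port 0x71 encodes as (port << 16) | (size - 1) | BIT(6), i.e.
+ * (0x71 << 16) | 0 | 0x40 = 0x710040, whereas a "rep outsb" to the same port
+ * in DX sets BIT(4) (string) and BIT(5) (REP) instead of BIT(6).
+ */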
+
+int vmx_check_intercept(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage,
+ struct x86_exception *exception)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ unsigned long exit_qualification = 0;
+ u32 vm_exit_reason;
+ u64 exit_insn_len;
+
+ switch (info->intercept) {
+ case x86_intercept_rdpid:
+ /*
+ * RDPID causes #UD if not enabled through secondary execution
+ * controls (ENABLE_RDTSCP). Note, the implicit MSR access to
+ * TSC_AUX is NOT subject to interception, i.e. checking only
+ * the dedicated execution control is architecturally correct.
+ */
+ if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
+ exception->vector = UD_VECTOR;
+ exception->error_code_valid = false;
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+ return X86EMUL_CONTINUE;
+
+ case x86_intercept_in:
+ case x86_intercept_ins:
+ case x86_intercept_out:
+ case x86_intercept_outs:
+ if (!vmx_is_io_intercepted(vcpu, info, &exit_qualification))
+ return X86EMUL_CONTINUE;
+
+ vm_exit_reason = EXIT_REASON_IO_INSTRUCTION;
+ break;
+
+ case x86_intercept_lgdt:
+ case x86_intercept_lidt:
+ case x86_intercept_lldt:
+ case x86_intercept_ltr:
+ case x86_intercept_sgdt:
+ case x86_intercept_sidt:
+ case x86_intercept_sldt:
+ case x86_intercept_str:
+ if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC))
+ return X86EMUL_CONTINUE;
+
+ if (info->intercept == x86_intercept_lldt ||
+ info->intercept == x86_intercept_ltr ||
+ info->intercept == x86_intercept_sldt ||
+ info->intercept == x86_intercept_str)
+ vm_exit_reason = EXIT_REASON_LDTR_TR;
+ else
+ vm_exit_reason = EXIT_REASON_GDTR_IDTR;
+ /*
+ * FIXME: Decode the ModR/M to generate the correct exit
+ * qualification for memory operands.
+ */
+ break;
+
+ case x86_intercept_hlt:
+ if (!nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING))
+ return X86EMUL_CONTINUE;
+
+ vm_exit_reason = EXIT_REASON_HLT;
+ break;
+
+ case x86_intercept_pause:
+ /*
+ * PAUSE is a single-byte NOP with a REPE prefix, i.e. collides
+ * with vanilla NOPs in the emulator. Apply the interception
+ * check only to actual PAUSE instructions. Don't check
+ * PAUSE-loop-exiting, software can't expect a given PAUSE to
+ * exit, i.e. KVM is within its rights to allow L2 to execute
+ * the PAUSE.
+ */
+ if ((info->rep_prefix != REPE_PREFIX) ||
+ !nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING))
+ return X86EMUL_CONTINUE;
+
+ vm_exit_reason = EXIT_REASON_PAUSE_INSTRUCTION;
+ break;
+
+ /* TODO: check more intercepts... */
+ default:
+ return X86EMUL_UNHANDLEABLE;
+ }
+
+ exit_insn_len = abs_diff((s64)info->next_rip, (s64)info->rip);
+ if (!exit_insn_len || exit_insn_len > X86_MAX_INSTRUCTION_LENGTH)
+ return X86EMUL_UNHANDLEABLE;
+
+ __nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification,
+ exit_insn_len);
+ return X86EMUL_INTERCEPTED;
+}
+
+#ifdef CONFIG_X86_64
+/* (a << shift) / divisor; return 1 on overflow, otherwise 0 */
+static inline int u64_shl_div_u64(u64 a, unsigned int shift,
+ u64 divisor, u64 *result)
+{
+ u64 low = a << shift, high = a >> (64 - shift);
+
+ /* To avoid the overflow on divq */
+ if (high >= divisor)
+ return 1;
+
+	/* Low holds the result, high holds the remainder, which is discarded */
+ asm("divq %2\n\t" : "=a" (low), "=d" (high) :
+ "rm" (divisor), "0" (low), "1" (high));
+ *result = low;
+
+ return 0;
+}
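+
+/*
+ * For illustration (hypothetical values): u64_shl_div_u64(5, 4, 3, &r) forms
+ * the 128-bit value 5 << 4 = 80 in high:low, divides by 3 and stores r = 26,
+ * returning 0.  u64_shl_div_u64(1ULL << 63, 2, 2, &r) returns 1 because the
+ * quotient (2^64) would not fit in 64 bits (high = 2 >= divisor).
+ */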
+
+int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
+ bool *expired)
+{
+ struct vcpu_vmx *vmx;
+ u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
+ struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;
+
+ vmx = to_vmx(vcpu);
+ tscl = rdtsc();
+ guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
+ delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
+ lapic_timer_advance_cycles = nsec_to_cycles(vcpu,
+ ktimer->timer_advance_ns);
+
+ if (delta_tsc > lapic_timer_advance_cycles)
+ delta_tsc -= lapic_timer_advance_cycles;
+ else
+ delta_tsc = 0;
+
+ /* Convert to host delta tsc if tsc scaling is enabled */
+ if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio &&
+ delta_tsc && u64_shl_div_u64(delta_tsc,
+ kvm_caps.tsc_scaling_ratio_frac_bits,
+ vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc))
+ return -ERANGE;
+
+ /*
+	 * If the delta tsc can't fit in 32 bits after the shift by
+	 * cpu_preemption_timer_multi, we can't use the preemption timer.
+	 * It's possible that it fits on later vmentries, but checking
+	 * on every vmentry is costly, so we just use an hrtimer.
+ */
+ if (delta_tsc >> (cpu_preemption_timer_multi + 32))
+ return -ERANGE;
+
+ vmx->hv_deadline_tsc = tscl + delta_tsc;
+ *expired = !delta_tsc;
+ return 0;
+}
+
+void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
+{
+ to_vmx(vcpu)->hv_deadline_tsc = -1;
+}
+#endif
+
+void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (WARN_ON_ONCE(!enable_pml))
+ return;
+
+ if (is_guest_mode(vcpu)) {
+ vmx->nested.update_vmcs01_cpu_dirty_logging = true;
+ return;
+ }
+
+ /*
+	 * Note, nr_memslots_dirty_logging can be changed concurrently with this
+ * code, but in that case another update request will be made and so
+ * the guest will never run with a stale PML value.
+ */
+ if (atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
+ secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML);
+ else
+ secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
+}
+
+void vmx_setup_mce(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.mcg_cap & MCG_LMCE_P)
+ to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
+ FEAT_CTL_LMCE_ENABLED;
+ else
+ to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
+ ~FEAT_CTL_LMCE_ENABLED;
+}
+
+#ifdef CONFIG_KVM_SMM
+int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+{
+ /* we need a nested vmexit to enter SMM, postpone if run is pending */
+ if (to_vmx(vcpu)->nested.nested_run_pending)
+ return -EBUSY;
+ return !is_smm(vcpu);
+}
+
+int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * TODO: Implement custom flows for forcing the vCPU out/in of L2 on
+	 * SMI and RSM. Using the common VM-Exit + VM-Enter routines is wrong:
+ * SMI and RSM only modify state that is saved and restored via SMRAM.
+ * E.g. most MSRs are left untouched, but many are modified by VM-Exit
+ * and VM-Enter, and thus L2's values may be corrupted on SMI+RSM.
+ */
+ vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
+ if (vmx->nested.smm.guest_mode)
+ nested_vmx_vmexit(vcpu, -1, 0, 0);
+
+ vmx->nested.smm.vmxon = vmx->nested.vmxon;
+ vmx->nested.vmxon = false;
+ vmx_clear_hlt(vcpu);
+ return 0;
+}
+
+int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int ret;
+
+ if (vmx->nested.smm.vmxon) {
+ vmx->nested.vmxon = true;
+ vmx->nested.smm.vmxon = false;
+ }
+
+ if (vmx->nested.smm.guest_mode) {
+ ret = nested_vmx_enter_non_root_mode(vcpu, false);
+ if (ret)
+ return ret;
+
+ vmx->nested.nested_run_pending = 1;
+ vmx->nested.smm.guest_mode = false;
+ }
+ return 0;
+}
+
+void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
+{
+ /* RSM will cause a vmexit anyway. */
+}
+#endif
+
+bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
+{
+ return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu);
+}
+
+void vmx_migrate_timers(struct kvm_vcpu *vcpu)
+{
+ if (is_guest_mode(vcpu)) {
+ struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer;
+
+ if (hrtimer_try_to_cancel(timer) == 1)
+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
+ }
+}
+
+void vmx_hardware_unsetup(void)
+{
+ kvm_set_posted_intr_wakeup_handler(NULL);
+
+ if (nested)
+ nested_vmx_hardware_unsetup();
+
+ free_kvm_area();
+}
+
+void vmx_vm_destroy(struct kvm *kvm)
+{
+ struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
+
+ free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm));
+}
+
+/*
+ * Note, the SDM states that the linear address is masked *after* the modified
+ * canonicality check, whereas KVM masks (untags) the address and then performs
+ * a "normal" canonicality check. Functionally, the two methods are identical,
+ * and when the masking occurs relative to the canonicality check isn't visible
+ * to software, i.e. KVM's behavior doesn't violate the SDM.
+ */
+gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags)
+{
+ int lam_bit;
+ unsigned long cr3_bits;
+
+ if (flags & (X86EMUL_F_FETCH | X86EMUL_F_IMPLICIT | X86EMUL_F_INVLPG))
+ return gva;
+
+ if (!is_64_bit_mode(vcpu))
+ return gva;
+
+ /*
+ * Bit 63 determines if the address should be treated as user address
+ * or a supervisor address.
+ */
+ if (!(gva & BIT_ULL(63))) {
+ cr3_bits = kvm_get_active_cr3_lam_bits(vcpu);
+ if (!(cr3_bits & (X86_CR3_LAM_U57 | X86_CR3_LAM_U48)))
+ return gva;
+
+ /* LAM_U48 is ignored if LAM_U57 is set. */
+ lam_bit = cr3_bits & X86_CR3_LAM_U57 ? 56 : 47;
+ } else {
+ if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_LAM_SUP))
+ return gva;
+
+ lam_bit = kvm_is_cr4_bit_set(vcpu, X86_CR4_LA57) ? 56 : 47;
+ }
+
+ /*
+ * Untag the address by sign-extending the lam_bit, but NOT to bit 63.
+ * Bit 63 is retained from the raw virtual address so that untagging
+ * doesn't change a user access to a supervisor access, and vice versa.
+ */
+ return (sign_extend64(gva, lam_bit) & ~BIT_ULL(63)) | (gva & BIT_ULL(63));
+}
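+
+/*
+ * For illustration (hypothetical address): with CR3.LAM_U57 set, a user
+ * pointer tagged with metadata in bits 62:57, e.g. 0x7e00000000001000,
+ * untags to sign_extend64(gva, 56) & ~BIT_ULL(63) = 0x1000; bit 63 (0 for a
+ * user address) is carried over unchanged so the access stays a user access.
+ */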
+
+static unsigned int vmx_handle_intel_pt_intr(void)
+{
+ struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
+
+ /* '0' on failure so that the !PT case can use a RET0 static call. */
+ if (!vcpu || !kvm_handling_nmi_from_guest(vcpu))
+ return 0;
+
+ kvm_make_request(KVM_REQ_PMI, vcpu);
+ __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
+ (unsigned long *)&vcpu->arch.pmu.global_status);
+ return 1;
+}
+
+static __init void vmx_setup_user_return_msrs(void)
+{
+ /*
+ * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
+ * will emulate SYSCALL in legacy mode if the vendor string in guest
+ * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
+ * support this emulation, MSR_STAR is included in the list for i386,
+ * but is never loaded into hardware. MSR_CSTAR is also never loaded
+ * into hardware and is here purely for emulation purposes.
+ */
+ const u32 vmx_uret_msrs_list[] = {
+ #ifdef CONFIG_X86_64
+ MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
+ #endif
+ MSR_EFER, MSR_TSC_AUX, MSR_STAR,
+ MSR_IA32_TSX_CTRL,
+ };
+ int i;
+
+ BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
+
+ for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
+ kvm_add_user_return_msr(vmx_uret_msrs_list[i]);
+}
+
+static void __init vmx_setup_me_spte_mask(void)
+{
+ u64 me_mask = 0;
+
+ /*
+	 * On pre-MKTME systems, boot_cpu_data.x86_phys_bits equals
+	 * kvm_host.maxphyaddr. On MKTME and/or TDX capable systems,
+	 * boot_cpu_data.x86_phys_bits holds the actual physical address
+	 * width w/o the KeyID bits, and kvm_host.maxphyaddr equals the
+	 * MAXPHYADDR reported by CPUID. The bits in between are the KeyID bits.
+ */
+ if (boot_cpu_data.x86_phys_bits != kvm_host.maxphyaddr)
+ me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits,
+ kvm_host.maxphyaddr - 1);
+
+ /*
+	 * Unlike SME, the host kernel doesn't support setting up any
+	 * MKTME KeyID on Intel platforms, so no memory encryption
+	 * bits should be included in the SPTEs.
+ */
+ kvm_mmu_set_me_spte_mask(0, me_mask);
+}
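+
+/*
+ * For illustration (hypothetical topology): with boot_cpu_data.x86_phys_bits
+ * = 46 and kvm_host.maxphyaddr = 52, the KeyID bits are bits 51:46 and
+ * me_mask = rsvd_bits(46, 51) = 0x000fc00000000000, which ends up in the
+ * MMU's shadow_zero_check via kvm_mmu_set_me_spte_mask() above.
+ */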
+
+__init int vmx_hardware_setup(void)
+{
+ unsigned long host_bndcfgs;
+ struct desc_ptr dt;
+ int r;
+
+ store_idt(&dt);
+ host_idt_base = dt.address;
+
+ vmx_setup_user_return_msrs();
+
+ if (boot_cpu_has(X86_FEATURE_NX))
+ kvm_enable_efer_bits(EFER_NX);
+
+ if (boot_cpu_has(X86_FEATURE_MPX)) {
+ rdmsrq(MSR_IA32_BNDCFGS, host_bndcfgs);
+ WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost");
+ }
+
+ if (!cpu_has_vmx_mpx())
+ kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
+ XFEATURE_MASK_BNDCSR);
+
+ if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
+ !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
+ enable_vpid = 0;
+
+ if (!cpu_has_vmx_ept() ||
+ !cpu_has_vmx_ept_4levels() ||
+ !cpu_has_vmx_ept_mt_wb() ||
+ !cpu_has_vmx_invept_global())
+ enable_ept = 0;
+
+ /* NX support is required for shadow paging. */
+ if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) {
+ pr_err_ratelimited("NX (Execute Disable) not supported\n");
+ return -EOPNOTSUPP;
+ }
+
+ /*
+ * Shadow paging doesn't have a (further) performance penalty
+ * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it
+ * by default
+ */
+ if (!enable_ept)
+ allow_smaller_maxphyaddr = true;
+
+ if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
+ enable_ept_ad_bits = 0;
+
+ if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
+ enable_unrestricted_guest = 0;
+
+ if (!cpu_has_vmx_flexpriority())
+ flexpriority_enabled = 0;
+
+ if (!cpu_has_virtual_nmis())
+ enable_vnmi = 0;
+
+#ifdef CONFIG_X86_SGX_KVM
+ if (!cpu_has_vmx_encls_vmexit())
+ enable_sgx = false;
+#endif
+
+ /*
+ * set_apic_access_page_addr() is used to reload apic access
+ * page upon invalidation. No need to do anything if not
+ * using the APIC_ACCESS_ADDR VMCS field.
+ */
+ if (!flexpriority_enabled)
+ vt_x86_ops.set_apic_access_page_addr = NULL;
+
+ if (!cpu_has_vmx_tpr_shadow())
+ vt_x86_ops.update_cr8_intercept = NULL;
+
+#if IS_ENABLED(CONFIG_HYPERV)
+ if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
+ && enable_ept) {
+ vt_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs;
+ vt_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range;
+ }
+#endif
+
+ if (!cpu_has_vmx_ple()) {
+ ple_gap = 0;
+ ple_window = 0;
+ ple_window_grow = 0;
+ ple_window_max = 0;
+ ple_window_shrink = 0;
+ }
+
+ if (!cpu_has_vmx_apicv())
+ enable_apicv = 0;
+ if (!enable_apicv)
+ vt_x86_ops.sync_pir_to_irr = NULL;
+
+ if (!enable_apicv || !cpu_has_vmx_ipiv())
+ enable_ipiv = false;
+
+ if (cpu_has_vmx_tsc_scaling())
+ kvm_caps.has_tsc_control = true;
+
+ kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
+ kvm_caps.tsc_scaling_ratio_frac_bits = 48;
+ kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection();
+ kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit();
+
+ set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
+
+ if (enable_ept)
+ kvm_mmu_set_ept_masks(enable_ept_ad_bits,
+ cpu_has_vmx_ept_execute_only());
+ else
+ vt_x86_ops.get_mt_mask = NULL;
+
+ /*
+ * Setup shadow_me_value/shadow_me_mask to include MKTME KeyID
+ * bits to shadow_zero_check.
+ */
+ vmx_setup_me_spte_mask();
+
+ kvm_configure_mmu(enable_ept, 0, vmx_get_max_ept_level(),
+ ept_caps_to_lpage_level(vmx_capability.ept));
+
+ /*
+ * Only enable PML when hardware supports PML feature, and both EPT
+ * and EPT A/D bit features are enabled -- PML depends on them to work.
+ */
+ if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
+ enable_pml = 0;
+
+ if (!cpu_has_vmx_preemption_timer())
+ enable_preemption_timer = false;
+
+ if (enable_preemption_timer) {
+ u64 use_timer_freq = 5000ULL * 1000 * 1000;
+
+ cpu_preemption_timer_multi =
+ vmx_misc_preemption_timer_rate(vmcs_config.misc);
+
+ if (tsc_khz)
+ use_timer_freq = (u64)tsc_khz * 1000;
+ use_timer_freq >>= cpu_preemption_timer_multi;
+
+ /*
+ * KVM "disables" the preemption timer by setting it to its max
+ * value. Don't use the timer if it might cause spurious exits
+ * at a rate faster than 0.1 Hz (of uninterrupted guest time).
+ */
+ if (use_timer_freq > 0xffffffffu / 10)
+ enable_preemption_timer = false;
+ }
+
+ if (!enable_preemption_timer) {
+ vt_x86_ops.set_hv_timer = NULL;
+ vt_x86_ops.cancel_hv_timer = NULL;
+ }
+
+ kvm_caps.supported_mce_cap |= MCG_LMCE_P;
+ kvm_caps.supported_mce_cap |= MCG_CMCI_P;
+
+ if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
+ return -EINVAL;
+ if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt())
+ pt_mode = PT_MODE_SYSTEM;
+ if (pt_mode == PT_MODE_HOST_GUEST)
+ vt_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr;
+ else
+ vt_init_ops.handle_intel_pt_intr = NULL;
+
+ setup_default_sgx_lepubkeyhash();
+
+ vmx_set_cpu_caps();
+
+ /*
+ * Configure nested capabilities after core CPU capabilities so that
+ * nested support can be conditional on base support, e.g. so that KVM
+ * can hide/show features based on kvm_cpu_cap_has().
+ */
+ if (nested) {
+ nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept);
+
+ r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
+ if (r)
+ return r;
+ }
+
+ r = alloc_kvm_area();
+ if (r && nested)
+ nested_vmx_hardware_unsetup();
+
+ kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
+
+ /*
+ * On Intel CPUs that lack self-snoop feature, letting the guest control
+ * memory types may result in unexpected behavior. So always ignore guest
+ * PAT on those CPUs and map VM as writeback, not allowing userspace to
+ * disable the quirk.
+ *
+ * On certain Intel CPUs (e.g. SPR, ICX), though self-snoop feature is
+ * supported, UC is slow enough to cause issues with some older guests (e.g.
+ * an old version of bochs driver uses ioremap() instead of ioremap_wc() to
+ * map the video RAM, causing wayland desktop to fail to get started
+ * correctly). To avoid breaking those older guests that rely on KVM to force
+ * memory type to WB, provide KVM_X86_QUIRK_IGNORE_GUEST_PAT to preserve the
+ * safer (for performance) default behavior.
+ *
+ * On top of this, non-coherent DMA devices need the guest to flush CPU
+ * caches properly. This also requires honoring guest PAT, and is forced
+ * independent of the quirk in vmx_ignore_guest_pat().
+ */
+ if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
+ kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT;
+
+ kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT;
+
+ return r;
+}
+
+void vmx_exit(void)
+{
+ allow_smaller_maxphyaddr = false;
+
+ vmx_cleanup_l1d_flush();
+
+ kvm_x86_vendor_exit();
+}
+
+int __init vmx_init(void)
+{
+ int r, cpu;
+
+ KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_vmx);
+
+ if (!kvm_is_vmx_supported())
+ return -EOPNOTSUPP;
+
+ /*
+ * Note, VMCS and eVMCS configuration only touch VMX knobs/variables,
+ * i.e. there's nothing to unwind if a later step fails.
+ */
+ hv_init_evmcs();
+
+ /*
+ * Parse the VMCS config and VMX capabilities before anything else, so
+ * that the information is available to all setup flows.
+ */
+ if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
+ return -EIO;
+
+ r = kvm_x86_vendor_init(&vt_init_ops);
+ if (r)
+ return r;
+
+ /* Must be called after common x86 init so enable_ept is setup. */
+ r = vmx_setup_l1d_flush();
+ if (r)
+ goto err_l1d_flush;
+
+ for_each_possible_cpu(cpu) {
+ INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
+
+ pi_init_cpu(cpu);
+ }
+
+ vmx_check_vmcs12_offsets();
+
+ return 0;
+
+err_l1d_flush:
+ kvm_x86_vendor_exit();
+ return r;
+}
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
new file mode 100644
index 000000000000..bc3ed3145d7e
--- /dev/null
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -0,0 +1,743 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMX_H
+#define __KVM_X86_VMX_H
+
+#include <linux/kvm_host.h>
+
+#include <asm/kvm.h>
+#include <asm/intel_pt.h>
+#include <asm/perf_event.h>
+#include <asm/posted_intr.h>
+
+#include "capabilities.h"
+#include "../kvm_cache_regs.h"
+#include "pmu_intel.h"
+#include "vmcs.h"
+#include "vmx_ops.h"
+#include "../cpuid.h"
+#include "run_flags.h"
+#include "../mmu.h"
+#include "common.h"
+
+#ifdef CONFIG_X86_64
+#define MAX_NR_USER_RETURN_MSRS 7
+#else
+#define MAX_NR_USER_RETURN_MSRS 4
+#endif
+
+#define MAX_NR_LOADSTORE_MSRS 8
+
+struct vmx_msrs {
+ unsigned int nr;
+ struct vmx_msr_entry val[MAX_NR_LOADSTORE_MSRS];
+};
+
+struct vmx_uret_msr {
+ bool load_into_hardware;
+ u64 data;
+ u64 mask;
+};
+
+enum segment_cache_field {
+ SEG_FIELD_SEL = 0,
+ SEG_FIELD_BASE = 1,
+ SEG_FIELD_LIMIT = 2,
+ SEG_FIELD_AR = 3,
+
+ SEG_FIELD_NR = 4
+};
+
+#define RTIT_ADDR_RANGE 4
+
+struct pt_ctx {
+ u64 ctl;
+ u64 status;
+ u64 output_base;
+ u64 output_mask;
+ u64 cr3_match;
+ u64 addr_a[RTIT_ADDR_RANGE];
+ u64 addr_b[RTIT_ADDR_RANGE];
+};
+
+struct pt_desc {
+ u64 ctl_bitmask;
+ u32 num_address_ranges;
+ u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
+ struct pt_ctx host;
+ struct pt_ctx guest;
+};
+
+/*
+ * The nested_vmx structure is part of vcpu_vmx, and holds information we need
+ * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
+ */
+struct nested_vmx {
+ /* Has the level1 guest done vmxon? */
+ bool vmxon;
+ gpa_t vmxon_ptr;
+ bool pml_full;
+
+ /* The guest-physical address of the current VMCS L1 keeps for L2 */
+ gpa_t current_vmptr;
+ /*
+ * Cache of the guest's VMCS, existing outside of guest memory.
+ * Loaded from guest memory during VMPTRLD. Flushed to guest
+ * memory during VMCLEAR and VMPTRLD.
+ */
+ struct vmcs12 *cached_vmcs12;
+ /*
+ * Cache of the guest's shadow VMCS, existing outside of guest
+ * memory. Loaded from guest memory during VM entry. Flushed
+ * to guest memory during VM exit.
+ */
+ struct vmcs12 *cached_shadow_vmcs12;
+
+ /*
+ * GPA to HVA cache for accessing vmcs12->vmcs_link_pointer
+ */
+ struct gfn_to_hva_cache shadow_vmcs12_cache;
+
+ /*
+ * GPA to HVA cache for VMCS12
+ */
+ struct gfn_to_hva_cache vmcs12_cache;
+
+ /*
+ * Indicates if the shadow vmcs or enlightened vmcs must be updated
+ * with the data held by struct vmcs12.
+ */
+ bool need_vmcs12_to_shadow_sync;
+ bool dirty_vmcs12;
+
+ /*
+ * Indicates whether MSR bitmap for L2 needs to be rebuilt due to
+ * changes in MSR bitmap for L1 or switching to a different L2. Note,
+ * this flag can only be used reliably in conjunction with a paravirt L1
+ * which informs L0 whether any changes to MSR bitmap for L2 were done
+ * on its side.
+ */
+ bool force_msr_bitmap_recalc;
+
+ /*
+ * Indicates lazily loaded guest state has not yet been decached from
+ * vmcs02.
+ */
+ bool need_sync_vmcs02_to_vmcs12_rare;
+
+ /*
+ * vmcs02 has been initialized, i.e. state that is constant for
+ * vmcs02 has been written to the backing VMCS. Initialization
+ * is delayed until L1 actually attempts to run a nested VM.
+ */
+ bool vmcs02_initialized;
+
+ bool change_vmcs01_virtual_apic_mode;
+ bool reload_vmcs01_apic_access_page;
+ bool update_vmcs01_cpu_dirty_logging;
+ bool update_vmcs01_apicv_status;
+ bool update_vmcs01_hwapic_isr;
+
+ /*
+ * Enlightened VMCS has been enabled. It does not mean that L1 has to
+ * use it. However, VMX features available to L1 will be limited based
+ * on what the enlightened VMCS supports.
+ */
+ bool enlightened_vmcs_enabled;
+
+ /* L2 must run next, and mustn't decide to exit to L1. */
+ bool nested_run_pending;
+
+ /* Pending MTF VM-exit into L1. */
+ bool mtf_pending;
+
+ struct loaded_vmcs vmcs02;
+
+ /*
+ * Guest pages referred to in the vmcs02 with host-physical
+ * pointers, so we must keep them pinned while L2 runs.
+ */
+ struct kvm_host_map apic_access_page_map;
+ struct kvm_host_map virtual_apic_map;
+ struct kvm_host_map pi_desc_map;
+
+ struct pi_desc *pi_desc;
+ bool pi_pending;
+ u16 posted_intr_nv;
+
+ struct hrtimer preemption_timer;
+ u64 preemption_timer_deadline;
+ bool has_preemption_timer_deadline;
+ bool preemption_timer_expired;
+
+ /*
+ * Used to snapshot MSRs that are conditionally loaded on VM-Enter in
+ * order to propagate the guest's pre-VM-Enter value into vmcs02. For
+ * emulation of VMLAUNCH/VMRESUME, the snapshot will be of L1's value.
+ * For KVM_SET_NESTED_STATE, the snapshot is of L2's value, _if_
+ * userspace restores MSRs before nested state. If userspace restores
+ * MSRs after nested state, the snapshot holds garbage, but KVM can't
+ * detect that, and the garbage value in vmcs02 will be overwritten by
+ * MSR restoration in any case.
+ */
+ u64 pre_vmenter_debugctl;
+ u64 pre_vmenter_bndcfgs;
+ u64 pre_vmenter_s_cet;
+ u64 pre_vmenter_ssp;
+ u64 pre_vmenter_ssp_tbl;
+
+ /* to migrate it to L1 if L2 writes to L1's CR8 directly */
+ int l1_tpr_threshold;
+
+ u16 vpid02;
+ u16 last_vpid;
+
+ struct nested_vmx_msrs msrs;
+
+ /* SMM related state */
+ struct {
+ /* in VMX operation on SMM entry? */
+ bool vmxon;
+ /* in guest mode on SMM entry? */
+ bool guest_mode;
+ } smm;
+
+#ifdef CONFIG_KVM_HYPERV
+ gpa_t hv_evmcs_vmptr;
+ struct kvm_host_map hv_evmcs_map;
+ struct hv_enlightened_vmcs *hv_evmcs;
+#endif
+};
+
+struct vcpu_vmx {
+ struct kvm_vcpu vcpu;
+ struct vcpu_vt vt;
+ u8 fail;
+ u8 x2apic_msr_bitmap_mode;
+
+ u32 idt_vectoring_info;
+ ulong rflags;
+
+ /*
+ * User return MSRs are always emulated when enabled in the guest, but
+	 * only loaded into hardware when necessary, e.g. SYSCALL #UDs outside
+	 * of 64-bit mode or if EFER.SCE=0, and so the SYSCALL MSRs don't need
+	 * to be loaded into hardware in those cases.
+ */
+ struct vmx_uret_msr guest_uret_msrs[MAX_NR_USER_RETURN_MSRS];
+ bool guest_uret_msrs_loaded;
+#ifdef CONFIG_X86_64
+ u64 msr_guest_kernel_gs_base;
+#endif
+
+ u64 spec_ctrl;
+ u32 msr_ia32_umwait_control;
+
+ /*
+ * loaded_vmcs points to the VMCS currently used in this vcpu. For a
+ * non-nested (L1) guest, it always points to vmcs01. For a nested
+ * guest (L2), it points to a different VMCS.
+ */
+ struct loaded_vmcs vmcs01;
+ struct loaded_vmcs *loaded_vmcs;
+
+ struct msr_autoload {
+ struct vmx_msrs guest;
+ struct vmx_msrs host;
+ } msr_autoload;
+
+ struct msr_autostore {
+ struct vmx_msrs guest;
+ } msr_autostore;
+
+ struct {
+ int vm86_active;
+ ulong save_rflags;
+ struct kvm_segment segs[8];
+ } rmode;
+ struct {
+ u32 bitmask; /* 4 bits per segment (1 bit per field) */
+ struct kvm_save_segment {
+ u16 selector;
+ unsigned long base;
+ u32 limit;
+ u32 ar;
+ } seg[8];
+ } segment_cache;
+ int vpid;
+
+ /* Support for a guest hypervisor (nested VMX) */
+ struct nested_vmx nested;
+
+ /* Dynamic PLE window. */
+ unsigned int ple_window;
+ bool ple_window_dirty;
+
+ /* Support for PML */
+#define PML_LOG_NR_ENTRIES 512
+ /* PML is written backwards: this is the first entry written by the CPU */
+#define PML_HEAD_INDEX (PML_LOG_NR_ENTRIES-1)
+
+ struct page *pml_pg;
+
+ /* apic deadline value in host tsc */
+ u64 hv_deadline_tsc;
+
+ /*
+ * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
+ * msr_ia32_feature_control. FEAT_CTL_LOCKED is always included
+ * in msr_ia32_feature_control_valid_bits.
+ */
+ u64 msr_ia32_feature_control;
+ u64 msr_ia32_feature_control_valid_bits;
+ /* SGX Launch Control public key hash */
+ u64 msr_ia32_sgxlepubkeyhash[4];
+ u64 msr_ia32_mcu_opt_ctrl;
+ bool disable_fb_clear;
+
+ struct pt_desc pt_desc;
+ struct lbr_desc lbr_desc;
+
+ /* ve_info must be page aligned. */
+ struct vmx_ve_information *ve_info;
+};
+
+struct kvm_vmx {
+ struct kvm kvm;
+
+ unsigned int tss_addr;
+ bool ept_identity_pagetable_done;
+ gpa_t ept_identity_map_addr;
+ /* Posted Interrupt Descriptor (PID) table for IPI virtualization */
+ u64 *pid_table;
+};
+
+static __always_inline struct vcpu_vt *to_vt(struct kvm_vcpu *vcpu)
+{
+ return &(container_of(vcpu, struct vcpu_vmx, vcpu)->vt);
+}
+
+static __always_inline struct kvm_vcpu *vt_to_vcpu(struct vcpu_vt *vt)
+{
+ return &(container_of(vt, struct vcpu_vmx, vt)->vcpu);
+}
+
+static __always_inline union vmx_exit_reason vmx_get_exit_reason(struct kvm_vcpu *vcpu)
+{
+ return to_vt(vcpu)->exit_reason;
+}
+
+static __always_inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vt *vt = to_vt(vcpu);
+
+ if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1) &&
+ !WARN_ON_ONCE(is_td_vcpu(vcpu)))
+ vt->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+ return vt->exit_qualification;
+}
+
+static __always_inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vt *vt = to_vt(vcpu);
+
+ if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2) &&
+ !WARN_ON_ONCE(is_td_vcpu(vcpu)))
+ vt->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+ return vt->exit_intr_info;
+}
+
+void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu);
+int allocate_vpid(void);
+void free_vpid(int vpid);
+void vmx_set_constant_host_state(struct vcpu_vmx *vmx);
+void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
+void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
+ unsigned long fs_base, unsigned long gs_base);
+int vmx_get_cpl(struct kvm_vcpu *vcpu);
+int vmx_get_cpl_no_cache(struct kvm_vcpu *vcpu);
+bool vmx_emulation_required(struct kvm_vcpu *vcpu);
+unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu);
+void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
+u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu);
+void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask);
+int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer);
+void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
+void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+void set_cr4_guest_host_mask(struct vcpu_vmx *vmx);
+void ept_save_pdptrs(struct kvm_vcpu *vcpu);
+void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
+void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
+
+bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu);
+void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu);
+bool vmx_nmi_blocked(struct kvm_vcpu *vcpu);
+bool __vmx_interrupt_blocked(struct kvm_vcpu *vcpu);
+bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu);
+bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
+void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
+void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
+struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr);
+void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu);
+void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp);
+void vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, unsigned int flags);
+unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx);
+bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs,
+ unsigned int flags);
+int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr);
+void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu);
+
+void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set);
+
+static inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu,
+ u32 msr, int type)
+{
+ vmx_set_intercept_for_msr(vcpu, msr, type, false);
+}
+
+static inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu,
+ u32 msr, int type)
+{
+ vmx_set_intercept_for_msr(vcpu, msr, type, true);
+}
+
+u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu);
+u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu);
+
+gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags);
+
+void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
+
+u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated);
+bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated);
+
+#define VMX_HOST_OWNED_DEBUGCTL_BITS (DEBUGCTLMSR_FREEZE_IN_SMM)
+
+static inline void vmx_guest_debugctl_write(struct kvm_vcpu *vcpu, u64 val)
+{
+ WARN_ON_ONCE(val & VMX_HOST_OWNED_DEBUGCTL_BITS);
+
+ val |= vcpu->arch.host_debugctl & VMX_HOST_OWNED_DEBUGCTL_BITS;
+ vmcs_write64(GUEST_IA32_DEBUGCTL, val);
+}
+
+static inline u64 vmx_guest_debugctl_read(void)
+{
+ return vmcs_read64(GUEST_IA32_DEBUGCTL) & ~VMX_HOST_OWNED_DEBUGCTL_BITS;
+}
+
+static inline void vmx_reload_guest_debugctl(struct kvm_vcpu *vcpu)
+{
+ u64 val = vmcs_read64(GUEST_IA32_DEBUGCTL);
+
+ if (!((val ^ vcpu->arch.host_debugctl) & VMX_HOST_OWNED_DEBUGCTL_BITS))
+ return;
+
+ vmx_guest_debugctl_write(vcpu, val & ~VMX_HOST_OWNED_DEBUGCTL_BITS);
+}
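
(Editor's note) The three DEBUGCTL helpers above enforce a simple invariant: bits in VMX_HOST_OWNED_DEBUGCTL_BITS always track the host's value and are never visible to the guest. Below is a minimal, self-contained sketch of the same merge/strip idea; the constant and the stand-in variables are illustrative only, not the kernel's definitions.

#include <stdint.h>
#include <stdio.h>

#define HOST_OWNED_BITS (1ull << 14)    /* stand-in for DEBUGCTLMSR_FREEZE_IN_SMM */

static uint64_t vmcs_debugctl;          /* stand-in for the GUEST_IA32_DEBUGCTL field */

static void guest_debugctl_write(uint64_t host_debugctl, uint64_t guest_val)
{
        /* The guest-requested value never carries host-owned bits ... */
        vmcs_debugctl = guest_val | (host_debugctl & HOST_OWNED_BITS);
}

static uint64_t guest_debugctl_read(void)
{
        /* ... and reads strip them again, so the guest never observes them. */
        return vmcs_debugctl & ~HOST_OWNED_BITS;
}

int main(void)
{
        guest_debugctl_write(HOST_OWNED_BITS, 0x1 /* guest enables LBRs */);
        printf("vmcs=%#llx, guest view=%#llx\n",
               (unsigned long long)vmcs_debugctl,
               (unsigned long long)guest_debugctl_read());
        return 0;
}
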
+
+/*
+ * Note, early Intel manuals have the write-low and read-high bitmap offsets
+ * the wrong way round. The bitmaps control MSRs 0x00000000-0x00001fff and
+ * 0xc0000000-0xc0001fff. The former (low) uses bytes 0-0x3ff for reads and
+ * 0x800-0xbff for writes. The latter (high) uses 0x400-0x7ff for reads and
+ * 0xc00-0xfff for writes. MSRs not covered by either of the ranges always
+ * VM-Exit.
+ */
+#define __BUILD_VMX_MSR_BITMAP_HELPER(rtype, action, bitop, access, base) \
+static inline rtype vmx_##action##_msr_bitmap_##access(unsigned long *bitmap, \
+ u32 msr) \
+{ \
+ int f = sizeof(unsigned long); \
+ \
+ if (msr <= 0x1fff) \
+ return bitop##_bit(msr, bitmap + base / f); \
+ else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) \
+ return bitop##_bit(msr & 0x1fff, bitmap + (base + 0x400) / f); \
+ return (rtype)true; \
+}
+#define BUILD_VMX_MSR_BITMAP_HELPERS(ret_type, action, bitop) \
+ __BUILD_VMX_MSR_BITMAP_HELPER(ret_type, action, bitop, read, 0x0) \
+ __BUILD_VMX_MSR_BITMAP_HELPER(ret_type, action, bitop, write, 0x800)
+
+BUILD_VMX_MSR_BITMAP_HELPERS(bool, test, test)
+BUILD_VMX_MSR_BITMAP_HELPERS(void, clear, __clear)
+BUILD_VMX_MSR_BITMAP_HELPERS(void, set, __set)
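
(Editor's note) The generated helpers follow the layout spelled out in the comment above. As a sanity check, here is a small self-contained sketch, not kernel code, that computes the same byte offsets for a low-range and a high-range MSR; the bitmap contents are made up for the example.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static bool test_msr_bit(const uint8_t *bitmap, uint32_t msr, unsigned read0_write800)
{
        uint32_t slot;

        if (msr <= 0x1fff)
                slot = read0_write800 + (msr / 8);                    /* low range  */
        else if (msr >= 0xc0000000 && msr <= 0xc0001fff)
                slot = read0_write800 + 0x400 + ((msr & 0x1fff) / 8); /* high range */
        else
                return true;                                          /* always exits */

        return bitmap[slot] & (1u << (msr & 7));
}

int main(void)
{
        uint8_t bitmap[4096];

        memset(bitmap, 0xff, sizeof(bitmap));   /* intercept everything */
        /* Read bit for MSR 0xc0000082 (LSTAR) lives at byte 0x400 + 0x10. */
        printf("LSTAR read intercepted: %d\n", test_msr_bit(bitmap, 0xc0000082, 0x0));
        printf("MSR 0x10 write intercepted: %d\n", test_msr_bit(bitmap, 0x10, 0x800));
        return 0;
}
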
+
+static inline u8 vmx_get_rvi(void)
+{
+ return vmcs_read16(GUEST_INTR_STATUS) & 0xff;
+}
+
+#define __KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS \
+ (VM_ENTRY_LOAD_DEBUG_CONTROLS)
+#ifdef CONFIG_X86_64
+ #define KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS \
+ (__KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS | \
+ VM_ENTRY_IA32E_MODE)
+#else
+ #define KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS \
+ __KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS
+#endif
+#define KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS \
+ (VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | \
+ VM_ENTRY_LOAD_IA32_PAT | \
+ VM_ENTRY_LOAD_IA32_EFER | \
+ VM_ENTRY_LOAD_BNDCFGS | \
+ VM_ENTRY_PT_CONCEAL_PIP | \
+ VM_ENTRY_LOAD_IA32_RTIT_CTL | \
+ VM_ENTRY_LOAD_CET_STATE)
+
+#define __KVM_REQUIRED_VMX_VM_EXIT_CONTROLS \
+ (VM_EXIT_SAVE_DEBUG_CONTROLS | \
+ VM_EXIT_ACK_INTR_ON_EXIT)
+#ifdef CONFIG_X86_64
+ #define KVM_REQUIRED_VMX_VM_EXIT_CONTROLS \
+ (__KVM_REQUIRED_VMX_VM_EXIT_CONTROLS | \
+ VM_EXIT_HOST_ADDR_SPACE_SIZE)
+#else
+ #define KVM_REQUIRED_VMX_VM_EXIT_CONTROLS \
+ __KVM_REQUIRED_VMX_VM_EXIT_CONTROLS
+#endif
+#define KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS \
+ (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \
+ VM_EXIT_SAVE_IA32_PAT | \
+ VM_EXIT_LOAD_IA32_PAT | \
+ VM_EXIT_SAVE_IA32_EFER | \
+ VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | \
+ VM_EXIT_LOAD_IA32_EFER | \
+ VM_EXIT_CLEAR_BNDCFGS | \
+ VM_EXIT_PT_CONCEAL_PIP | \
+ VM_EXIT_CLEAR_IA32_RTIT_CTL | \
+ VM_EXIT_LOAD_CET_STATE)
+
+#define KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL \
+ (PIN_BASED_EXT_INTR_MASK | \
+ PIN_BASED_NMI_EXITING)
+#define KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL \
+ (PIN_BASED_VIRTUAL_NMIS | \
+ PIN_BASED_POSTED_INTR | \
+ PIN_BASED_VMX_PREEMPTION_TIMER)
+
+#define __KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL \
+ (CPU_BASED_HLT_EXITING | \
+ CPU_BASED_CR3_LOAD_EXITING | \
+ CPU_BASED_CR3_STORE_EXITING | \
+ CPU_BASED_UNCOND_IO_EXITING | \
+ CPU_BASED_MOV_DR_EXITING | \
+ CPU_BASED_USE_TSC_OFFSETTING | \
+ CPU_BASED_MWAIT_EXITING | \
+ CPU_BASED_MONITOR_EXITING | \
+ CPU_BASED_INVLPG_EXITING | \
+ CPU_BASED_RDPMC_EXITING | \
+ CPU_BASED_INTR_WINDOW_EXITING)
+
+#ifdef CONFIG_X86_64
+ #define KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL \
+ (__KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL | \
+ CPU_BASED_CR8_LOAD_EXITING | \
+ CPU_BASED_CR8_STORE_EXITING)
+#else
+ #define KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL \
+ __KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL
+#endif
+
+#define KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL \
+ (CPU_BASED_RDTSC_EXITING | \
+ CPU_BASED_TPR_SHADOW | \
+ CPU_BASED_USE_IO_BITMAPS | \
+ CPU_BASED_MONITOR_TRAP_FLAG | \
+ CPU_BASED_USE_MSR_BITMAPS | \
+ CPU_BASED_NMI_WINDOW_EXITING | \
+ CPU_BASED_PAUSE_EXITING | \
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS | \
+ CPU_BASED_ACTIVATE_TERTIARY_CONTROLS)
+
+#define KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL 0
+#define KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL \
+ (SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | \
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | \
+ SECONDARY_EXEC_WBINVD_EXITING | \
+ SECONDARY_EXEC_ENABLE_VPID | \
+ SECONDARY_EXEC_ENABLE_EPT | \
+ SECONDARY_EXEC_UNRESTRICTED_GUEST | \
+ SECONDARY_EXEC_PAUSE_LOOP_EXITING | \
+ SECONDARY_EXEC_DESC | \
+ SECONDARY_EXEC_ENABLE_RDTSCP | \
+ SECONDARY_EXEC_ENABLE_INVPCID | \
+ SECONDARY_EXEC_APIC_REGISTER_VIRT | \
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \
+ SECONDARY_EXEC_SHADOW_VMCS | \
+ SECONDARY_EXEC_ENABLE_XSAVES | \
+ SECONDARY_EXEC_RDSEED_EXITING | \
+ SECONDARY_EXEC_RDRAND_EXITING | \
+ SECONDARY_EXEC_ENABLE_PML | \
+ SECONDARY_EXEC_TSC_SCALING | \
+ SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | \
+ SECONDARY_EXEC_PT_USE_GPA | \
+ SECONDARY_EXEC_PT_CONCEAL_VMX | \
+ SECONDARY_EXEC_ENABLE_VMFUNC | \
+ SECONDARY_EXEC_BUS_LOCK_DETECTION | \
+ SECONDARY_EXEC_NOTIFY_VM_EXITING | \
+ SECONDARY_EXEC_ENCLS_EXITING | \
+ SECONDARY_EXEC_EPT_VIOLATION_VE)
+
+#define KVM_REQUIRED_VMX_TERTIARY_VM_EXEC_CONTROL 0
+#define KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL \
+ (TERTIARY_EXEC_IPI_VIRT)
+
+#define BUILD_CONTROLS_SHADOW(lname, uname, bits) \
+static inline void lname##_controls_set(struct vcpu_vmx *vmx, u##bits val) \
+{ \
+ if (vmx->loaded_vmcs->controls_shadow.lname != val) { \
+ vmcs_write##bits(uname, val); \
+ vmx->loaded_vmcs->controls_shadow.lname = val; \
+ } \
+} \
+static inline u##bits __##lname##_controls_get(struct loaded_vmcs *vmcs) \
+{ \
+ return vmcs->controls_shadow.lname; \
+} \
+static inline u##bits lname##_controls_get(struct vcpu_vmx *vmx) \
+{ \
+ return __##lname##_controls_get(vmx->loaded_vmcs); \
+} \
+static __always_inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u##bits val) \
+{ \
+ BUILD_BUG_ON(!(val & (KVM_REQUIRED_VMX_##uname | KVM_OPTIONAL_VMX_##uname))); \
+ lname##_controls_set(vmx, lname##_controls_get(vmx) | val); \
+} \
+static __always_inline void lname##_controls_clearbit(struct vcpu_vmx *vmx, u##bits val) \
+{ \
+ BUILD_BUG_ON(!(val & (KVM_REQUIRED_VMX_##uname | KVM_OPTIONAL_VMX_##uname))); \
+ lname##_controls_set(vmx, lname##_controls_get(vmx) & ~val); \
+} \
+static __always_inline void lname##_controls_changebit(struct vcpu_vmx *vmx, u##bits val, \
+ bool set) \
+{ \
+ if (set) \
+ lname##_controls_setbit(vmx, val); \
+ else \
+ lname##_controls_clearbit(vmx, val); \
+}
+BUILD_CONTROLS_SHADOW(vm_entry, VM_ENTRY_CONTROLS, 32)
+BUILD_CONTROLS_SHADOW(vm_exit, VM_EXIT_CONTROLS, 32)
+BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL, 32)
+BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL, 32)
+BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL, 32)
+BUILD_CONTROLS_SHADOW(tertiary_exec, TERTIARY_VM_EXEC_CONTROL, 64)
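
(Editor's note) BUILD_CONTROLS_SHADOW() stamps out, per control field, a set/get pair plus setbit/clearbit/changebit wrappers that go through a software shadow in loaded_vmcs, so a VMWRITE is only issued when the value actually changes, and the BUILD_BUG_ON() rejects bits KVM never uses. A self-contained sketch of the shadow-write pattern, with illustrative names rather than the generated code, is:

#include <stdint.h>
#include <stdio.h>

static uint32_t hw_field;       /* stands in for the real VMCS control field */
static uint32_t shadow;         /* stands in for controls_shadow.<lname>     */
static unsigned vmwrites;       /* count how many "VMWRITEs" were issued     */

static void pin_controls_set(uint32_t val)
{
        if (shadow != val) {    /* only touch hardware when the value changes */
                hw_field = val;
                vmwrites++;
                shadow = val;
        }
}

static void pin_controls_setbit(uint32_t bit)   { pin_controls_set(shadow | bit);  }
static void pin_controls_clearbit(uint32_t bit) { pin_controls_set(shadow & ~bit); }

int main(void)
{
        pin_controls_setbit(1u << 0);
        pin_controls_setbit(1u << 0);   /* no-op: the shadow already has the bit */
        pin_controls_clearbit(1u << 0);
        printf("vmwrites issued: %u\n", vmwrites);      /* 2 */
        return 0;
}
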
+
+/*
+ * VMX_REGS_LAZY_LOAD_SET - The set of registers that will be updated in the
+ * cache on demand. Other registers not listed here are synced to
+ * the cache immediately after VM-Exit.
+ */
+#define VMX_REGS_LAZY_LOAD_SET ((1 << VCPU_REGS_RIP) | \
+ (1 << VCPU_REGS_RSP) | \
+ (1 << VCPU_EXREG_RFLAGS) | \
+ (1 << VCPU_EXREG_PDPTR) | \
+ (1 << VCPU_EXREG_SEGMENTS) | \
+ (1 << VCPU_EXREG_CR0) | \
+ (1 << VCPU_EXREG_CR3) | \
+ (1 << VCPU_EXREG_CR4) | \
+ (1 << VCPU_EXREG_EXIT_INFO_1) | \
+ (1 << VCPU_EXREG_EXIT_INFO_2))
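
(Editor's note) Registers in VMX_REGS_LAZY_LOAD_SET are only read out of the VMCS on first use after a VM-Exit; vmx_get_exit_qual() and vmx_get_intr_info() above are the canonical users of that pattern via kvm_register_test_and_mark_available(). A minimal stand-alone sketch of the same test-and-mark caching, with all names hypothetical:

#include <stdio.h>

static unsigned long avail_mask;        /* which cached registers are valid */
static unsigned long cached_exit_qual;
static unsigned vmreads;

static unsigned long fake_vmread(void) { vmreads++; return 0x42; }

static unsigned long get_exit_qual(void)
{
        const int reg = 0;              /* stands in for VCPU_EXREG_EXIT_INFO_1 */

        if (!(avail_mask & (1ul << reg))) {
                cached_exit_qual = fake_vmread();
                avail_mask |= 1ul << reg;
        }
        return cached_exit_qual;
}

int main(void)
{
        get_exit_qual();
        get_exit_qual();                /* served from the cache               */
        avail_mask = 0;                 /* a new VM-Exit invalidates the cache */
        get_exit_qual();
        printf("VMREADs issued: %u\n", vmreads);        /* 2 */
        return 0;
}
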
+
+static inline unsigned long vmx_l1_guest_owned_cr0_bits(void)
+{
+ unsigned long bits = KVM_POSSIBLE_CR0_GUEST_BITS;
+
+ /*
+ * CR0.WP needs to be intercepted when KVM is shadowing legacy paging
+ * in order to construct shadow PTEs with the correct protections.
+ * Note! CR0.WP technically can be passed through to the guest if
+ * paging is disabled, but checking CR0.PG would generate a cyclical
+ * dependency of sorts due to forcing the caller to ensure CR0 holds
+ * the correct value prior to determining which CR0 bits can be owned
+ * by L1. Keep it simple and limit the optimization to EPT.
+ */
+ if (!enable_ept)
+ bits &= ~X86_CR0_WP;
+ return bits;
+}
+
+static __always_inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm)
+{
+ return container_of(kvm, struct kvm_vmx, kvm);
+}
+
+static __always_inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
+{
+ return container_of(vcpu, struct vcpu_vmx, vcpu);
+}
+
+void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu);
+int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu);
+void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu);
+
+struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags);
+void free_vmcs(struct vmcs *vmcs);
+int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs);
+void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs);
+
+static inline struct vmcs *alloc_vmcs(bool shadow)
+{
+ return alloc_vmcs_cpu(shadow, raw_smp_processor_id(),
+ GFP_KERNEL_ACCOUNT);
+}
+
+static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx)
+{
+ return secondary_exec_controls_get(vmx) &
+ SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
+}
+
+static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu)
+{
+ if (!enable_ept)
+ return true;
+
+ return allow_smaller_maxphyaddr &&
+ cpuid_maxphyaddr(vcpu) < kvm_host.maxphyaddr;
+}
+
+static inline bool is_unrestricted_guest(struct kvm_vcpu *vcpu)
+{
+ return enable_unrestricted_guest && (!is_guest_mode(vcpu) ||
+ (secondary_exec_controls_get(to_vmx(vcpu)) &
+ SECONDARY_EXEC_UNRESTRICTED_GUEST));
+}
+
+bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu);
+static inline bool vmx_guest_state_valid(struct kvm_vcpu *vcpu)
+{
+ return is_unrestricted_guest(vcpu) || __vmx_guest_state_valid(vcpu);
+}
+
+void dump_vmcs(struct kvm_vcpu *vcpu);
+
+static inline int vmx_get_instr_info_reg(u32 vmx_instr_info)
+{
+ return (vmx_instr_info >> 3) & 0xf;
+}
+
+static inline int vmx_get_instr_info_reg2(u32 vmx_instr_info)
+{
+ return (vmx_instr_info >> 28) & 0xf;
+}
+
+static inline bool vmx_can_use_ipiv(struct kvm_vcpu *vcpu)
+{
+ return lapic_in_kernel(vcpu) && enable_ipiv;
+}
+
+static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
+{
+ vmx->segment_cache.bitmask = 0;
+}
+
+int vmx_init(void);
+void vmx_exit(void);
+
+#endif /* __KVM_X86_VMX_H */
diff --git a/arch/x86/kvm/vmx/vmx_onhyperv.c b/arch/x86/kvm/vmx/vmx_onhyperv.c
new file mode 100644
index 000000000000..b9a8b91166d0
--- /dev/null
+++ b/arch/x86/kvm/vmx/vmx_onhyperv.c
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "capabilities.h"
+#include "vmx_onhyperv.h"
+
+DEFINE_STATIC_KEY_FALSE(__kvm_is_using_evmcs);
+
+/*
+ * KVM on Hyper-V always uses the latest known eVMCSv1 revision; the assumption
+ * is that if a feature has corresponding fields described in eVMCS and is
+ * exposed in the VMX feature MSRs, KVM is free to use it. Warn if KVM
+ * encounters a feature that has no corresponding eVMCS field, as this likely
+ * means that KVM needs to be updated.
+ */
+#define evmcs_check_vmcs_conf(field, ctrl) \
+ do { \
+ typeof(vmcs_conf->field) unsupported; \
+ \
+ unsupported = vmcs_conf->field & ~EVMCS1_SUPPORTED_ ## ctrl; \
+ if (unsupported) { \
+ pr_warn_once(#field " unsupported with eVMCS: 0x%llx\n",\
+ (u64)unsupported); \
+ vmcs_conf->field &= EVMCS1_SUPPORTED_ ## ctrl; \
+ } \
+ } \
+ while (0)
+
+void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
+{
+ evmcs_check_vmcs_conf(cpu_based_exec_ctrl, EXEC_CTRL);
+ evmcs_check_vmcs_conf(pin_based_exec_ctrl, PINCTRL);
+ evmcs_check_vmcs_conf(cpu_based_2nd_exec_ctrl, 2NDEXEC);
+ evmcs_check_vmcs_conf(cpu_based_3rd_exec_ctrl, 3RDEXEC);
+ evmcs_check_vmcs_conf(vmentry_ctrl, VMENTRY_CTRL);
+ evmcs_check_vmcs_conf(vmexit_ctrl, VMEXIT_CTRL);
+}
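
(Editor's note) Each evmcs_check_vmcs_conf() invocation masks off control bits that have no eVMCS1 field and warns once about what was dropped. Roughly, the PINCTRL case boils down to the self-contained sketch below; the EVMCS1_SUPPORTED_PINCTRL value here is made up for illustration.

#include <stdint.h>
#include <stdio.h>

#define EVMCS1_SUPPORTED_PINCTRL 0x000000ffu    /* hypothetical mask, for illustration */

static void sanitize_pinctrl(uint32_t *pin_based_exec_ctrl)
{
        uint32_t unsupported = *pin_based_exec_ctrl & ~EVMCS1_SUPPORTED_PINCTRL;

        if (unsupported) {
                fprintf(stderr, "pin_based_exec_ctrl unsupported with eVMCS: %#x\n",
                        unsupported);
                *pin_based_exec_ctrl &= EVMCS1_SUPPORTED_PINCTRL;
        }
}

int main(void)
{
        uint32_t ctrl = 0x0000010f;     /* one bit outside the supported mask */

        sanitize_pinctrl(&ctrl);
        printf("sanitized: %#x\n", ctrl);
        return 0;
}
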
diff --git a/arch/x86/kvm/vmx/vmx_onhyperv.h b/arch/x86/kvm/vmx/vmx_onhyperv.h
new file mode 100644
index 000000000000..cdf8cbb69209
--- /dev/null
+++ b/arch/x86/kvm/vmx/vmx_onhyperv.h
@@ -0,0 +1,133 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef __ARCH_X86_KVM_VMX_ONHYPERV_H__
+#define __ARCH_X86_KVM_VMX_ONHYPERV_H__
+
+#include <hyperv/hvhdk.h>
+#include <asm/mshyperv.h>
+
+#include <linux/jump_label.h>
+
+#include "capabilities.h"
+#include "hyperv_evmcs.h"
+#include "vmcs12.h"
+
+#define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs))
+
+#if IS_ENABLED(CONFIG_HYPERV)
+
+DECLARE_STATIC_KEY_FALSE(__kvm_is_using_evmcs);
+
+static __always_inline bool kvm_is_using_evmcs(void)
+{
+ return static_branch_unlikely(&__kvm_is_using_evmcs);
+}
+
+static __always_inline int get_evmcs_offset(unsigned long field,
+ u16 *clean_field)
+{
+ int offset = evmcs_field_offset(field, clean_field);
+
+ WARN_ONCE(offset < 0, "accessing unsupported EVMCS field %lx\n", field);
+ return offset;
+}
+
+static __always_inline void evmcs_write64(unsigned long field, u64 value)
+{
+ u16 clean_field;
+ int offset = get_evmcs_offset(field, &clean_field);
+
+ if (offset < 0)
+ return;
+
+ *(u64 *)((char *)current_evmcs + offset) = value;
+
+ current_evmcs->hv_clean_fields &= ~clean_field;
+}
+
+static __always_inline void evmcs_write32(unsigned long field, u32 value)
+{
+ u16 clean_field;
+ int offset = get_evmcs_offset(field, &clean_field);
+
+ if (offset < 0)
+ return;
+
+ *(u32 *)((char *)current_evmcs + offset) = value;
+ current_evmcs->hv_clean_fields &= ~clean_field;
+}
+
+static __always_inline void evmcs_write16(unsigned long field, u16 value)
+{
+ u16 clean_field;
+ int offset = get_evmcs_offset(field, &clean_field);
+
+ if (offset < 0)
+ return;
+
+ *(u16 *)((char *)current_evmcs + offset) = value;
+ current_evmcs->hv_clean_fields &= ~clean_field;
+}
+
+static __always_inline u64 evmcs_read64(unsigned long field)
+{
+ int offset = get_evmcs_offset(field, NULL);
+
+ if (offset < 0)
+ return 0;
+
+ return *(u64 *)((char *)current_evmcs + offset);
+}
+
+static __always_inline u32 evmcs_read32(unsigned long field)
+{
+ int offset = get_evmcs_offset(field, NULL);
+
+ if (offset < 0)
+ return 0;
+
+ return *(u32 *)((char *)current_evmcs + offset);
+}
+
+static __always_inline u16 evmcs_read16(unsigned long field)
+{
+ int offset = get_evmcs_offset(field, NULL);
+
+ if (offset < 0)
+ return 0;
+
+ return *(u16 *)((char *)current_evmcs + offset);
+}
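
(Editor's note) Note how every write helper above clears a bit in hv_clean_fields while the read helpers leave it alone: a clear bit tells Hyper-V that the corresponding field group changed since the last VM-Enter and must be reloaded. A tiny self-contained sketch of that dirty-tracking idea; the structure and the group bit are hypothetical.

#include <stdint.h>
#include <stdio.h>

struct fake_evmcs {
        uint64_t guest_rip;
        uint16_t clean_fields;          /* bit set => group unchanged since last run */
};

#define CLEAN_GUEST_BASIC (1u << 3)     /* hypothetical clean-field group bit */

static void evmcs_write_rip(struct fake_evmcs *e, uint64_t val)
{
        e->guest_rip = val;
        e->clean_fields &= ~CLEAN_GUEST_BASIC;  /* tell the hypervisor to reload it */
}

int main(void)
{
        struct fake_evmcs e = { .clean_fields = 0xffff };

        evmcs_write_rip(&e, 0xfffff000);
        printf("clean_fields now %#x\n", e.clean_fields);
        return 0;
}
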
+
+static inline void evmcs_load(u64 phys_addr)
+{
+ struct hv_vp_assist_page *vp_ap =
+ hv_get_vp_assist_page(smp_processor_id());
+
+ /*
+	 * When enabling eVMCS, KVM verifies that every CPU has a valid VP assist
+	 * page (via hv_get_vp_assist_page()) and aborts enabling the feature
+	 * otherwise. The CPU onlining path is also checked in vmx_hardware_enable().
+ */
+ if (KVM_BUG_ON(!vp_ap, kvm_get_running_vcpu()->kvm))
+ return;
+
+ if (current_evmcs->hv_enlightenments_control.nested_flush_hypercall)
+ vp_ap->nested_control.features.directhypercall = 1;
+ vp_ap->current_nested_vmcs = phys_addr;
+ vp_ap->enlighten_vmentry = 1;
+}
+
+void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf);
+#else /* !IS_ENABLED(CONFIG_HYPERV) */
+static __always_inline bool kvm_is_using_evmcs(void) { return false; }
+static __always_inline void evmcs_write64(unsigned long field, u64 value) {}
+static __always_inline void evmcs_write32(unsigned long field, u32 value) {}
+static __always_inline void evmcs_write16(unsigned long field, u16 value) {}
+static __always_inline u64 evmcs_read64(unsigned long field) { return 0; }
+static __always_inline u32 evmcs_read32(unsigned long field) { return 0; }
+static __always_inline u16 evmcs_read16(unsigned long field) { return 0; }
+static inline void evmcs_load(u64 phys_addr) {}
+#endif /* IS_ENABLED(CONFIG_HYPERV) */
+
+#endif /* __ARCH_X86_KVM_VMX_ONHYPERV_H__ */
diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h
new file mode 100644
index 000000000000..96677576c836
--- /dev/null
+++ b/arch/x86/kvm/vmx/vmx_ops.h
@@ -0,0 +1,371 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMX_INSN_H
+#define __KVM_X86_VMX_INSN_H
+
+#include <linux/nospec.h>
+
+#include <asm/vmx.h>
+
+#include "vmx_onhyperv.h"
+#include "vmcs.h"
+#include "../x86.h"
+
+void vmread_error(unsigned long field);
+void vmwrite_error(unsigned long field, unsigned long value);
+void vmclear_error(struct vmcs *vmcs, u64 phys_addr);
+void vmptrld_error(struct vmcs *vmcs, u64 phys_addr);
+void invvpid_error(unsigned long ext, u16 vpid, gva_t gva);
+void invept_error(unsigned long ext, u64 eptp);
+
+#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+/*
+ * The VMREAD error trampoline _always_ uses the stack to pass parameters, even
+ * for 64-bit targets. Preserving all registers allows the VMREAD inline asm
+ * blob to avoid clobbering GPRs, which in turn allows the compiler to better
+ * optimize sequences of VMREADs.
+ *
+ * Declare the trampoline as an opaque label as it's not safe to call from C
+ * code; there is no way to tell the compiler to pass params on the stack for
+ * 64-bit targets.
+ *
+ * void vmread_error_trampoline(unsigned long field, bool fault);
+ */
+extern unsigned long vmread_error_trampoline;
+
+/*
+ * The second VMREAD error trampoline, called from the assembly trampoline,
+ * exists primarily to enable instrumentation for the VM-Fail path.
+ */
+void vmread_error_trampoline2(unsigned long field, bool fault);
+
+#endif
+
+static __always_inline void vmcs_check16(unsigned long field)
+{
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
+ "16-bit accessor invalid for 64-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
+ "16-bit accessor invalid for 64-bit high field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
+ "16-bit accessor invalid for 32-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
+ "16-bit accessor invalid for natural width field");
+}
+
+static __always_inline void vmcs_check32(unsigned long field)
+{
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
+ "32-bit accessor invalid for 16-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
+ "32-bit accessor invalid for 64-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
+ "32-bit accessor invalid for 64-bit high field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
+ "32-bit accessor invalid for natural width field");
+}
+
+static __always_inline void vmcs_check64(unsigned long field)
+{
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
+ "64-bit accessor invalid for 16-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
+ "64-bit accessor invalid for 64-bit high field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
+ "64-bit accessor invalid for 32-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
+ "64-bit accessor invalid for natural width field");
+}
+
+static __always_inline void vmcs_checkl(unsigned long field)
+{
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
+ "Natural width accessor invalid for 16-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
+ "Natural width accessor invalid for 64-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
+ "Natural width accessor invalid for 64-bit high field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
+ "Natural width accessor invalid for 32-bit field");
+}
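
(Editor's note) The BUILD_BUG_ON() checks above key off the standard VMCS field encoding: bits 14:13 of the encoding give the field width (0 = 16-bit, 1 = 64-bit, 2 = 32-bit, 3 = natural width) and bit 0 selects the high half of a 64-bit field, which is why masking with 0x6000/0x6001 is enough to reject a mismatched accessor for compile-time-constant fields. A small decoder for those bits:

#include <stdio.h>

static const char *vmcs_field_width(unsigned long field)
{
        switch (field & 0x6000) {
        case 0x0000: return "16-bit";
        case 0x2000: return (field & 1) ? "64-bit (high half)" : "64-bit";
        case 0x4000: return "32-bit";
        default:     return "natural width";
        }
}

int main(void)
{
        printf("0x0800 -> %s\n", vmcs_field_width(0x0800));  /* GUEST_ES_SELECTOR  */
        printf("0x2800 -> %s\n", vmcs_field_width(0x2800));  /* VMCS_LINK_POINTER  */
        printf("0x4402 -> %s\n", vmcs_field_width(0x4402));  /* VM_EXIT_REASON     */
        printf("0x6400 -> %s\n", vmcs_field_width(0x6400));  /* EXIT_QUALIFICATION */
        return 0;
}
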
+
+static __always_inline unsigned long __vmcs_readl(unsigned long field)
+{
+ unsigned long value;
+
+#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+
+ asm_goto_output("1: vmread %[field], %[output]\n\t"
+ "jna %l[do_fail]\n\t"
+
+ _ASM_EXTABLE(1b, %l[do_exception])
+
+ : [output] "=r" (value)
+ : [field] "r" (field)
+ : "cc"
+ : do_fail, do_exception);
+
+ return value;
+
+do_fail:
+ instrumentation_begin();
+ vmread_error(field);
+ instrumentation_end();
+ return 0;
+
+do_exception:
+ kvm_spurious_fault();
+ return 0;
+
+#else /* !CONFIG_CC_HAS_ASM_GOTO_OUTPUT */
+
+ asm volatile("1: vmread %[field], %[output]\n\t"
+ ".byte 0x3e\n\t" /* branch taken hint */
+ "ja 3f\n\t"
+
+ /*
+ * VMREAD failed. Push '0' for @fault, push the failing
+ * @field, and bounce through the trampoline to preserve
+ * volatile registers.
+ */
+ "xorl %k[output], %k[output]\n\t"
+ "2:\n\t"
+ "push %[output]\n\t"
+ "push %[field]\n\t"
+ "call vmread_error_trampoline\n\t"
+
+ /*
+ * Unwind the stack. Note, the trampoline zeros out the
+ * memory for @fault so that the result is '0' on error.
+ */
+ "pop %[field]\n\t"
+ "pop %[output]\n\t"
+ "3:\n\t"
+
+ /* VMREAD faulted. As above, except push '1' for @fault. */
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_ONE_REG, %[output])
+
+ : ASM_CALL_CONSTRAINT, [output] "=&r" (value)
+ : [field] "r" (field)
+ : "cc");
+ return value;
+
+#endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */
+}
+
+static __always_inline u16 vmcs_read16(unsigned long field)
+{
+ vmcs_check16(field);
+ if (kvm_is_using_evmcs())
+ return evmcs_read16(field);
+ return __vmcs_readl(field);
+}
+
+static __always_inline u32 vmcs_read32(unsigned long field)
+{
+ vmcs_check32(field);
+ if (kvm_is_using_evmcs())
+ return evmcs_read32(field);
+ return __vmcs_readl(field);
+}
+
+static __always_inline u64 vmcs_read64(unsigned long field)
+{
+ vmcs_check64(field);
+ if (kvm_is_using_evmcs())
+ return evmcs_read64(field);
+#ifdef CONFIG_X86_64
+ return __vmcs_readl(field);
+#else
+ return __vmcs_readl(field) | ((u64)__vmcs_readl(field+1) << 32);
+#endif
+}
+
+static __always_inline unsigned long vmcs_readl(unsigned long field)
+{
+ vmcs_checkl(field);
+ if (kvm_is_using_evmcs())
+ return evmcs_read64(field);
+ return __vmcs_readl(field);
+}
+
+#define vmx_asm1(insn, op1, error_args...) \
+do { \
+ asm goto("1: " __stringify(insn) " %0\n\t" \
+ ".byte 0x2e\n\t" /* branch not taken hint */ \
+ "jna %l[error]\n\t" \
+ _ASM_EXTABLE(1b, %l[fault]) \
+ : : op1 : "cc" : error, fault); \
+ return; \
+error: \
+ instrumentation_begin(); \
+ insn##_error(error_args); \
+ instrumentation_end(); \
+ return; \
+fault: \
+ kvm_spurious_fault(); \
+} while (0)
+
+#define vmx_asm2(insn, op1, op2, error_args...) \
+do { \
+ asm goto("1: " __stringify(insn) " %1, %0\n\t" \
+ ".byte 0x2e\n\t" /* branch not taken hint */ \
+ "jna %l[error]\n\t" \
+ _ASM_EXTABLE(1b, %l[fault]) \
+ : : op1, op2 : "cc" : error, fault); \
+ return; \
+error: \
+ instrumentation_begin(); \
+ insn##_error(error_args); \
+ instrumentation_end(); \
+ return; \
+fault: \
+ kvm_spurious_fault(); \
+} while (0)
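
(Editor's note) Both macros handle the three possible outcomes of a VMX instruction: success falls straight through, a VM-fail (flags set) jumps to the error label and reports via insn##_error(), and a fault (e.g. #UD because VMX was disabled underneath KVM) lands on the fault label via the exception table and triggers kvm_spurious_fault(). A self-contained analogue of that control flow, with the outcome simulated rather than taken from hardware flags:

#include <stdio.h>

enum outcome { OK, VMFAIL, FAULT };

static void vmclear_error_report(unsigned long long pa) { printf("vmclear failed: %#llx\n", pa); }
static void spurious_fault(void)                        { printf("spurious fault\n"); }

static void do_vmclear(unsigned long long pa, enum outcome simulated)
{
        switch (simulated) {
        case OK:
                return;                         /* success: fall straight through    */
        case VMFAIL:
                vmclear_error_report(pa);       /* CF/ZF set: report and keep going  */
                return;
        case FAULT:
                spurious_fault();               /* exception: VMX is gone under us   */
                return;
        }
}

int main(void)
{
        do_vmclear(0x1000, OK);
        do_vmclear(0x1000, VMFAIL);
        do_vmclear(0x1000, FAULT);
        return 0;
}
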
+
+static __always_inline void __vmcs_writel(unsigned long field, unsigned long value)
+{
+ vmx_asm2(vmwrite, "r"(field), "rm"(value), field, value);
+}
+
+static __always_inline void vmcs_write16(unsigned long field, u16 value)
+{
+ vmcs_check16(field);
+ if (kvm_is_using_evmcs())
+ return evmcs_write16(field, value);
+
+ __vmcs_writel(field, value);
+}
+
+static __always_inline void vmcs_write32(unsigned long field, u32 value)
+{
+ vmcs_check32(field);
+ if (kvm_is_using_evmcs())
+ return evmcs_write32(field, value);
+
+ __vmcs_writel(field, value);
+}
+
+static __always_inline void vmcs_write64(unsigned long field, u64 value)
+{
+ vmcs_check64(field);
+ if (kvm_is_using_evmcs())
+ return evmcs_write64(field, value);
+
+ __vmcs_writel(field, value);
+#ifndef CONFIG_X86_64
+ __vmcs_writel(field+1, value >> 32);
+#endif
+}
+
+static __always_inline void vmcs_writel(unsigned long field, unsigned long value)
+{
+ vmcs_checkl(field);
+ if (kvm_is_using_evmcs())
+ return evmcs_write64(field, value);
+
+ __vmcs_writel(field, value);
+}
+
+static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask)
+{
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
+ "vmcs_clear_bits does not support 64-bit fields");
+ if (kvm_is_using_evmcs())
+ return evmcs_write32(field, evmcs_read32(field) & ~mask);
+
+ __vmcs_writel(field, __vmcs_readl(field) & ~mask);
+}
+
+static __always_inline void vmcs_set_bits(unsigned long field, u32 mask)
+{
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
+ "vmcs_set_bits does not support 64-bit fields");
+ if (kvm_is_using_evmcs())
+ return evmcs_write32(field, evmcs_read32(field) | mask);
+
+ __vmcs_writel(field, __vmcs_readl(field) | mask);
+}
+
+static inline void vmcs_clear(struct vmcs *vmcs)
+{
+ u64 phys_addr = __pa(vmcs);
+
+ vmx_asm1(vmclear, "m"(phys_addr), vmcs, phys_addr);
+}
+
+static inline void vmcs_load(struct vmcs *vmcs)
+{
+ u64 phys_addr = __pa(vmcs);
+
+ if (kvm_is_using_evmcs())
+ return evmcs_load(phys_addr);
+
+ vmx_asm1(vmptrld, "m"(phys_addr), vmcs, phys_addr);
+}
+
+static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva)
+{
+ struct {
+ u64 vpid : 16;
+ u64 rsvd : 48;
+ u64 gva;
+ } operand = { vpid, 0, gva };
+
+ vmx_asm2(invvpid, "r"(ext), "m"(operand), ext, vpid, gva);
+}
+
+static inline void __invept(unsigned long ext, u64 eptp)
+{
+ struct {
+ u64 eptp;
+ u64 reserved_0;
+ } operand = { eptp, 0 };
+ vmx_asm2(invept, "r"(ext), "m"(operand), ext, eptp);
+}
+
+static inline void vpid_sync_vcpu_single(int vpid)
+{
+ if (vpid == 0)
+ return;
+
+ __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
+}
+
+static inline void vpid_sync_vcpu_global(void)
+{
+ __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
+}
+
+static inline void vpid_sync_context(int vpid)
+{
+ if (cpu_has_vmx_invvpid_single())
+ vpid_sync_vcpu_single(vpid);
+ else if (vpid != 0)
+ vpid_sync_vcpu_global();
+}
+
+static inline void vpid_sync_vcpu_addr(int vpid, gva_t addr)
+{
+ if (vpid == 0)
+ return;
+
+ if (cpu_has_vmx_invvpid_individual_addr())
+ __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr);
+ else
+ vpid_sync_context(vpid);
+}
+
+static inline void ept_sync_global(void)
+{
+ __invept(VMX_EPT_EXTENT_GLOBAL, 0);
+}
+
+static inline void ept_sync_context(u64 eptp)
+{
+ if (cpu_has_vmx_invept_context())
+ __invept(VMX_EPT_EXTENT_CONTEXT, eptp);
+ else
+ ept_sync_global();
+}
+
+#endif /* __KVM_X86_VMX_INSN_H */
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
new file mode 100644
index 000000000000..d09abeac2b56
--- /dev/null
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -0,0 +1,159 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMX_X86_OPS_H
+#define __KVM_X86_VMX_X86_OPS_H
+
+#include <linux/kvm_host.h>
+
+#include "x86.h"
+
+__init int vmx_hardware_setup(void);
+
+extern struct kvm_x86_ops vt_x86_ops __initdata;
+extern struct kvm_x86_init_ops vt_init_ops __initdata;
+
+void vmx_hardware_unsetup(void);
+int vmx_check_processor_compat(void);
+int vmx_enable_virtualization_cpu(void);
+void vmx_disable_virtualization_cpu(void);
+void vmx_emergency_disable_virtualization_cpu(void);
+int vmx_vm_init(struct kvm *kvm);
+void vmx_vm_destroy(struct kvm *kvm);
+int vmx_vcpu_precreate(struct kvm *kvm);
+int vmx_vcpu_create(struct kvm_vcpu *vcpu);
+int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu);
+fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags);
+void vmx_vcpu_free(struct kvm_vcpu *vcpu);
+void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
+void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+void vmx_vcpu_put(struct kvm_vcpu *vcpu);
+int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath);
+void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu);
+int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu);
+void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu);
+int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+#ifdef CONFIG_KVM_SMM
+int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection);
+int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram);
+int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram);
+void vmx_enable_smi_window(struct kvm_vcpu *vcpu);
+#endif
+int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len);
+int vmx_check_intercept(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage,
+ struct x86_exception *exception);
+bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu);
+void vmx_migrate_timers(struct kvm_vcpu *vcpu);
+void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
+void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr);
+int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu);
+void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
+ int trig_mode, int vector);
+void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu);
+bool vmx_has_emulated_msr(struct kvm *kvm, u32 index);
+void vmx_recalc_intercepts(struct kvm_vcpu *vcpu);
+void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
+void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu);
+int vmx_get_feature_msr(u32 msr, u64 *data);
+int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+#define vmx_complete_emulated_msr kvm_complete_insn_gp
+u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg);
+void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
+void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
+int vmx_get_cpl(struct kvm_vcpu *vcpu);
+void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
+bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
+void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
+void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
+void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer);
+void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val);
+void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu);
+void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg);
+unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu);
+void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
+bool vmx_get_if_flag(struct kvm_vcpu *vcpu);
+void vmx_flush_tlb_all(struct kvm_vcpu *vcpu);
+void vmx_flush_tlb_current(struct kvm_vcpu *vcpu);
+void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr);
+void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu);
+void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask);
+u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu);
+void vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall);
+void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected);
+void vmx_inject_nmi(struct kvm_vcpu *vcpu);
+void vmx_inject_exception(struct kvm_vcpu *vcpu);
+void vmx_cancel_injection(struct kvm_vcpu *vcpu);
+int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection);
+int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection);
+bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
+void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
+void vmx_enable_nmi_window(struct kvm_vcpu *vcpu);
+void vmx_enable_irq_window(struct kvm_vcpu *vcpu);
+void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr);
+void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu);
+void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
+void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
+int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
+int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr);
+u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
+
+void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
+ u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code);
+void vmx_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code);
+
+u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu);
+u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu);
+void vmx_write_tsc_offset(struct kvm_vcpu *vcpu);
+void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu);
+void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
+#ifdef CONFIG_X86_64
+int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
+ bool *expired);
+void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu);
+#endif
+void vmx_setup_mce(struct kvm_vcpu *vcpu);
+
+#ifdef CONFIG_KVM_INTEL_TDX
+void tdx_disable_virtualization_cpu(void);
+int tdx_vm_init(struct kvm *kvm);
+void tdx_mmu_release_hkid(struct kvm *kvm);
+void tdx_vm_destroy(struct kvm *kvm);
+int tdx_vm_ioctl(struct kvm *kvm, void __user *argp);
+
+int tdx_vcpu_create(struct kvm_vcpu *vcpu);
+void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
+void tdx_vcpu_free(struct kvm_vcpu *vcpu);
+void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu);
+fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags);
+void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
+void tdx_vcpu_put(struct kvm_vcpu *vcpu);
+int tdx_handle_exit(struct kvm_vcpu *vcpu,
+ enum exit_fastpath_completion fastpath);
+
+void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
+ int trig_mode, int vector);
+void tdx_inject_nmi(struct kvm_vcpu *vcpu);
+void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
+ u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code);
+bool tdx_has_emulated_msr(u32 index);
+int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
+int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
+
+int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp);
+int tdx_vcpu_unlocked_ioctl(struct kvm_vcpu *vcpu, void __user *argp);
+
+void tdx_flush_tlb_current(struct kvm_vcpu *vcpu);
+void tdx_flush_tlb_all(struct kvm_vcpu *vcpu);
+void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
+int tdx_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private);
+#endif
+
+#endif /* __KVM_X86_VMX_X86_OPS_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3d4529011828..0c6d899d53dd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Kernel-based Virtual Machine driver for Linux
*
@@ -6,975 +7,4639 @@
* Copyright (C) 2006 Qumranet, Inc.
* Copyright (C) 2008 Qumranet, Inc.
* Copyright IBM Corporation, 2008
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
*
* Authors:
* Avi Kivity <avi@qumranet.com>
* Yaniv Kamay <yaniv@qumranet.com>
* Amit Shah <amit.shah@qumranet.com>
* Ben-Ami Yassour <benami@il.ibm.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include "irq.h"
+#include "ioapic.h"
#include "mmu.h"
#include "i8254.h"
#include "tss.h"
#include "kvm_cache_regs.h"
+#include "kvm_emulate.h"
+#include "mmu/page_track.h"
#include "x86.h"
+#include "cpuid.h"
+#include "pmu.h"
+#include "hyperv.h"
+#include "lapic.h"
+#include "xen.h"
+#include "smm.h"
#include <linux/clocksource.h>
#include <linux/interrupt.h>
#include <linux/kvm.h>
#include <linux/fs.h>
#include <linux/vmalloc.h>
-#include <linux/module.h>
+#include <linux/export.h>
+#include <linux/moduleparam.h>
#include <linux/mman.h>
#include <linux/highmem.h>
#include <linux/iommu.h>
-#include <linux/intel-iommu.h>
#include <linux/cpufreq.h>
-
-#include <asm/uaccess.h>
+#include <linux/user-return-notifier.h>
+#include <linux/srcu.h>
+#include <linux/slab.h>
+#include <linux/perf_event.h>
+#include <linux/uaccess.h>
+#include <linux/hash.h>
+#include <linux/pci.h>
+#include <linux/timekeeper_internal.h>
+#include <linux/pvclock_gtod.h>
+#include <linux/kvm_irqfd.h>
+#include <linux/irqbypass.h>
+#include <linux/sched/stat.h>
+#include <linux/sched/isolation.h>
+#include <linux/mem_encrypt.h>
+#include <linux/suspend.h>
+#include <linux/smp.h>
+
+#include <trace/events/ipi.h>
+#include <trace/events/kvm.h>
+
+#include <asm/debugreg.h>
#include <asm/msr.h>
#include <asm/desc.h>
-#include <asm/mtrr.h>
+#include <asm/mce.h>
+#include <asm/pkru.h>
+#include <linux/kernel_stat.h>
+#include <asm/fpu/api.h>
+#include <asm/fpu/xcr.h>
+#include <asm/fpu/xstate.h>
+#include <asm/pvclock.h>
+#include <asm/div64.h>
+#include <asm/irq_remapping.h>
+#include <asm/mshyperv.h>
+#include <asm/hypervisor.h>
+#include <asm/tlbflush.h>
+#include <asm/intel_pt.h>
+#include <asm/emulate_prefix.h>
+#include <asm/sgx.h>
+#include <clocksource/hyperv_timer.h>
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
#define MAX_IO_MSRS 256
-#define CR0_RESERVED_BITS \
- (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
- | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
- | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
-#define CR4_RESERVED_BITS \
- (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
- | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
- | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
- | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
-
-#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
+
+/*
+ * Note, kvm_caps fields should *never* have default values; all fields must be
+ * recomputed from scratch during vendor module load, e.g. to account for a
+ * vendor module being reloaded with different module parameters.
+ */
+struct kvm_caps kvm_caps __read_mostly;
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_caps);
+
+struct kvm_host_values kvm_host __read_mostly;
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_host);
+
+#define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e))
+
+#define emul_to_vcpu(ctxt) \
+ ((struct kvm_vcpu *)(ctxt)->vcpu)
+
/* EFER defaults:
* - enable syscall per default because its emulated by KVM
* - enable LME and LMA per default on 64 bit KVM
*/
#ifdef CONFIG_X86_64
-static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;
+static
+u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
#else
-static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
+static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
#endif
-#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
-#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
-
-static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
- struct kvm_cpuid_entry2 __user *entries);
-struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
- u32 function, u32 index);
-
-struct kvm_x86_ops *kvm_x86_ops;
-EXPORT_SYMBOL_GPL(kvm_x86_ops);
-
-struct kvm_stats_debugfs_item debugfs_entries[] = {
- { "pf_fixed", VCPU_STAT(pf_fixed) },
- { "pf_guest", VCPU_STAT(pf_guest) },
- { "tlb_flush", VCPU_STAT(tlb_flush) },
- { "invlpg", VCPU_STAT(invlpg) },
- { "exits", VCPU_STAT(exits) },
- { "io_exits", VCPU_STAT(io_exits) },
- { "mmio_exits", VCPU_STAT(mmio_exits) },
- { "signal_exits", VCPU_STAT(signal_exits) },
- { "irq_window", VCPU_STAT(irq_window_exits) },
- { "nmi_window", VCPU_STAT(nmi_window_exits) },
- { "halt_exits", VCPU_STAT(halt_exits) },
- { "halt_wakeup", VCPU_STAT(halt_wakeup) },
- { "hypercalls", VCPU_STAT(hypercalls) },
- { "request_irq", VCPU_STAT(request_irq_exits) },
- { "irq_exits", VCPU_STAT(irq_exits) },
- { "host_state_reload", VCPU_STAT(host_state_reload) },
- { "efer_reload", VCPU_STAT(efer_reload) },
- { "fpu_reload", VCPU_STAT(fpu_reload) },
- { "insn_emulation", VCPU_STAT(insn_emulation) },
- { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
- { "irq_injections", VCPU_STAT(irq_injections) },
- { "nmi_injections", VCPU_STAT(nmi_injections) },
- { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
- { "mmu_pte_write", VM_STAT(mmu_pte_write) },
- { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
- { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
- { "mmu_flooded", VM_STAT(mmu_flooded) },
- { "mmu_recycled", VM_STAT(mmu_recycled) },
- { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
- { "mmu_unsync", VM_STAT(mmu_unsync) },
- { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
- { "largepages", VM_STAT(lpages) },
- { NULL }
+#define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE)
+
+#define KVM_CAP_PMU_VALID_MASK KVM_PMU_CAP_DISABLE
+
+#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
+ KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
+
+static void update_cr8_intercept(struct kvm_vcpu *vcpu);
+static void process_nmi(struct kvm_vcpu *vcpu);
+static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
+static void store_regs(struct kvm_vcpu *vcpu);
+static int sync_regs(struct kvm_vcpu *vcpu);
+static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu);
+
+static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
+static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
+
+static DEFINE_MUTEX(vendor_module_lock);
+static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
+static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
+
+struct kvm_x86_ops kvm_x86_ops __read_mostly;
+
+#define KVM_X86_OP(func) \
+ DEFINE_STATIC_CALL_NULL(kvm_x86_##func, \
+ *(((struct kvm_x86_ops *)0)->func));
+#define KVM_X86_OP_OPTIONAL KVM_X86_OP
+#define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP
+#include <asm/kvm-x86-ops.h>
+EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits);
+EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg);
+
+static bool __read_mostly ignore_msrs = 0;
+module_param(ignore_msrs, bool, 0644);
+
+bool __read_mostly report_ignored_msrs = true;
+module_param(report_ignored_msrs, bool, 0644);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(report_ignored_msrs);
+
+unsigned int min_timer_period_us = 200;
+module_param(min_timer_period_us, uint, 0644);
+
+/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
+static u32 __read_mostly tsc_tolerance_ppm = 250;
+module_param(tsc_tolerance_ppm, uint, 0644);
+
+bool __read_mostly enable_vmware_backdoor = false;
+module_param(enable_vmware_backdoor, bool, 0444);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_vmware_backdoor);
+
+/*
+ * Flags to manipulate forced emulation behavior (any non-zero value will
+ * enable forced emulation).
+ */
+#define KVM_FEP_CLEAR_RFLAGS_RF BIT(1)
+static int __read_mostly force_emulation_prefix;
+module_param(force_emulation_prefix, int, 0644);
+
+int __read_mostly pi_inject_timer = -1;
+module_param(pi_inject_timer, bint, 0644);
+
+/* Enable/disable PMU virtualization */
+bool __read_mostly enable_pmu = true;
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_pmu);
+module_param(enable_pmu, bool, 0444);
+
+bool __read_mostly eager_page_split = true;
+module_param(eager_page_split, bool, 0644);
+
+/* Enable/disable SMT_RSB bug mitigation */
+static bool __read_mostly mitigate_smt_rsb;
+module_param(mitigate_smt_rsb, bool, 0444);
+
+/*
+ * Restoring the host value for MSRs that are only consumed when running in
+ * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
+ * returns to userspace, i.e. the kernel can run with the guest's value.
+ */
+#define KVM_MAX_NR_USER_RETURN_MSRS 16
+
+struct kvm_user_return_msrs {
+ struct user_return_notifier urn;
+ bool registered;
+ struct kvm_user_return_msr_values {
+ u64 host;
+ u64 curr;
+ } values[KVM_MAX_NR_USER_RETURN_MSRS];
};
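
(Editor's note) The struct above backs the user-return MSR machinery: guest values of MSRs such as the SYSCALL MSRs and TSC_AUX stay in hardware while the kernel keeps running, and a user-return notifier puts the host values back only when the CPU actually returns to userspace, skipping redundant WRMSRs along the way. A stand-alone sketch of that deferral idea follows; the values and names are hypothetical, and the real code registers a per-CPU notifier through the kernel's user-return-notifier API.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct uret_msr { uint64_t host, curr; };

static struct uret_msr star = { .host = 0x23001000000000ull }; /* made-up host value */
static uint64_t hw_star;                                       /* stands in for the MSR */
static bool notifier_armed;
static unsigned wrmsrs;

static void wrmsr_star(uint64_t v) { hw_star = v; wrmsrs++; }

static void set_guest_value(uint64_t guest)
{
        if (star.curr != guest) {       /* skip redundant WRMSRs */
                wrmsr_star(guest);
                star.curr = guest;
        }
        notifier_armed = true;          /* restore lazily, on return to userspace */
}

static void on_return_to_userspace(void)
{
        if (!notifier_armed)
                return;
        if (star.curr != star.host) {
                wrmsr_star(star.host);
                star.curr = star.host;
        }
        notifier_armed = false;
}

int main(void)
{
        star.curr = star.host;
        set_guest_value(0x1);           /* vcpu_run loads the guest value   */
        set_guest_value(0x1);           /* second entry: no extra WRMSR     */
        on_return_to_userspace();       /* host value restored exactly once */
        printf("wrmsrs=%u hw=%#llx\n", wrmsrs, (unsigned long long)hw_star);
        return 0;
}
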
-unsigned long segment_base(u16 selector)
-{
- struct descriptor_table gdt;
- struct desc_struct *d;
- unsigned long table_base;
- unsigned long v;
+u32 __read_mostly kvm_nr_uret_msrs;
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_nr_uret_msrs);
+static u32 __read_mostly kvm_uret_msrs_list[KVM_MAX_NR_USER_RETURN_MSRS];
+static DEFINE_PER_CPU(struct kvm_user_return_msrs, user_return_msrs);
- if (selector == 0)
- return 0;
+#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
+ | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
+ | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
+ | XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE)
+
+#define XFEATURE_MASK_CET_ALL (XFEATURE_MASK_CET_USER | XFEATURE_MASK_CET_KERNEL)
+/*
+ * Note, KVM supports exposing PT to the guest, but does not support context
+ * switching PT via XSTATE (KVM's PT virtualization relies on perf; swapping
+ * PT via guest XSTATE would clobber perf state), i.e. KVM doesn't support
+ * IA32_XSS[bit 8] (guests can/must use RDMSR/WRMSR to save/restore PT MSRs).
+ */
+#define KVM_SUPPORTED_XSS (XFEATURE_MASK_CET_ALL)
+
+bool __read_mostly allow_smaller_maxphyaddr = 0;
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(allow_smaller_maxphyaddr);
+
+bool __read_mostly enable_apicv = true;
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_apicv);
+
+bool __read_mostly enable_ipiv = true;
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_ipiv);
+
+bool __read_mostly enable_device_posted_irqs = true;
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_device_posted_irqs);
+
+const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
+ KVM_GENERIC_VM_STATS(),
+ STATS_DESC_COUNTER(VM, mmu_shadow_zapped),
+ STATS_DESC_COUNTER(VM, mmu_pte_write),
+ STATS_DESC_COUNTER(VM, mmu_pde_zapped),
+ STATS_DESC_COUNTER(VM, mmu_flooded),
+ STATS_DESC_COUNTER(VM, mmu_recycled),
+ STATS_DESC_COUNTER(VM, mmu_cache_miss),
+ STATS_DESC_ICOUNTER(VM, mmu_unsync),
+ STATS_DESC_ICOUNTER(VM, pages_4k),
+ STATS_DESC_ICOUNTER(VM, pages_2m),
+ STATS_DESC_ICOUNTER(VM, pages_1g),
+ STATS_DESC_ICOUNTER(VM, nx_lpage_splits),
+ STATS_DESC_PCOUNTER(VM, max_mmu_rmap_size),
+ STATS_DESC_PCOUNTER(VM, max_mmu_page_hash_collisions)
+};
- asm("sgdt %0" : "=m"(gdt));
- table_base = gdt.base;
+const struct kvm_stats_header kvm_vm_stats_header = {
+ .name_size = KVM_STATS_NAME_SIZE,
+ .num_desc = ARRAY_SIZE(kvm_vm_stats_desc),
+ .id_offset = sizeof(struct kvm_stats_header),
+ .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+ .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+ sizeof(kvm_vm_stats_desc),
+};
- if (selector & 4) { /* from ldt */
- u16 ldt_selector;
+const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
+ KVM_GENERIC_VCPU_STATS(),
+ STATS_DESC_COUNTER(VCPU, pf_taken),
+ STATS_DESC_COUNTER(VCPU, pf_fixed),
+ STATS_DESC_COUNTER(VCPU, pf_emulate),
+ STATS_DESC_COUNTER(VCPU, pf_spurious),
+ STATS_DESC_COUNTER(VCPU, pf_fast),
+ STATS_DESC_COUNTER(VCPU, pf_mmio_spte_created),
+ STATS_DESC_COUNTER(VCPU, pf_guest),
+ STATS_DESC_COUNTER(VCPU, tlb_flush),
+ STATS_DESC_COUNTER(VCPU, invlpg),
+ STATS_DESC_COUNTER(VCPU, exits),
+ STATS_DESC_COUNTER(VCPU, io_exits),
+ STATS_DESC_COUNTER(VCPU, mmio_exits),
+ STATS_DESC_COUNTER(VCPU, signal_exits),
+ STATS_DESC_COUNTER(VCPU, irq_window_exits),
+ STATS_DESC_COUNTER(VCPU, nmi_window_exits),
+ STATS_DESC_COUNTER(VCPU, l1d_flush),
+ STATS_DESC_COUNTER(VCPU, halt_exits),
+ STATS_DESC_COUNTER(VCPU, request_irq_exits),
+ STATS_DESC_COUNTER(VCPU, irq_exits),
+ STATS_DESC_COUNTER(VCPU, host_state_reload),
+ STATS_DESC_COUNTER(VCPU, fpu_reload),
+ STATS_DESC_COUNTER(VCPU, insn_emulation),
+ STATS_DESC_COUNTER(VCPU, insn_emulation_fail),
+ STATS_DESC_COUNTER(VCPU, hypercalls),
+ STATS_DESC_COUNTER(VCPU, irq_injections),
+ STATS_DESC_COUNTER(VCPU, nmi_injections),
+ STATS_DESC_COUNTER(VCPU, req_event),
+ STATS_DESC_COUNTER(VCPU, nested_run),
+ STATS_DESC_COUNTER(VCPU, directed_yield_attempted),
+ STATS_DESC_COUNTER(VCPU, directed_yield_successful),
+ STATS_DESC_COUNTER(VCPU, preemption_reported),
+ STATS_DESC_COUNTER(VCPU, preemption_other),
+ STATS_DESC_IBOOLEAN(VCPU, guest_mode),
+ STATS_DESC_COUNTER(VCPU, notify_window_exits),
+};
- asm("sldt %0" : "=g"(ldt_selector));
- table_base = segment_base(ldt_selector);
- }
- d = (struct desc_struct *)(table_base + (selector & ~7));
- v = d->base0 | ((unsigned long)d->base1 << 16) |
- ((unsigned long)d->base2 << 24);
+const struct kvm_stats_header kvm_vcpu_stats_header = {
+ .name_size = KVM_STATS_NAME_SIZE,
+ .num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc),
+ .id_offset = sizeof(struct kvm_stats_header),
+ .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+ .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+ sizeof(kvm_vcpu_stats_desc),
+};
+
+static struct kmem_cache *x86_emulator_cache;
+
+/*
+ * The three MSR lists (msrs_to_save, emulated_msrs, msr_based_features) track
+ * the set of MSRs that KVM exposes to userspace through KVM_GET_MSRS,
+ * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. msrs_to_save holds MSRs that
+ * require host support, i.e. should be probed via RDMSR. emulated_msrs holds
+ * MSRs that KVM emulates without strictly requiring host support.
+ * msr_based_features holds MSRs that enumerate features, i.e. are effectively
+ * CPUID leafs. Note, msr_based_features isn't mutually exclusive with
+ * msrs_to_save and emulated_msrs.
+ */
+
+static const u32 msrs_to_save_base[] = {
+ MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
+ MSR_STAR,
#ifdef CONFIG_X86_64
- if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
- v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
+ MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
- return v;
+ MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
+ MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
+ MSR_IA32_SPEC_CTRL, MSR_IA32_TSX_CTRL,
+ MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
+ MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
+ MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
+ MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
+ MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
+ MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
+ MSR_IA32_UMWAIT_CONTROL,
+
+ MSR_IA32_XFD, MSR_IA32_XFD_ERR, MSR_IA32_XSS,
+
+ MSR_IA32_U_CET, MSR_IA32_S_CET,
+ MSR_IA32_PL0_SSP, MSR_IA32_PL1_SSP, MSR_IA32_PL2_SSP,
+ MSR_IA32_PL3_SSP, MSR_IA32_INT_SSP_TAB,
+};
+
+static const u32 msrs_to_save_pmu[] = {
+ MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
+ MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
+ MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
+ MSR_CORE_PERF_GLOBAL_CTRL,
+ MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG,
+
+ /* This part of MSRs should match KVM_MAX_NR_INTEL_GP_COUNTERS. */
+ MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
+ MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
+ MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
+ MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7,
+ MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
+ MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
+ MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
+ MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7,
+
+ MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
+ MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3,
+
+ /* This part of MSRs should match KVM_MAX_NR_AMD_GP_COUNTERS. */
+ MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2,
+ MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5,
+ MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2,
+ MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5,
+
+ MSR_AMD64_PERF_CNTR_GLOBAL_CTL,
+ MSR_AMD64_PERF_CNTR_GLOBAL_STATUS,
+ MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR,
+ MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET,
+};
+
+static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) +
+ ARRAY_SIZE(msrs_to_save_pmu)];
+static unsigned num_msrs_to_save;
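
As an aside, a minimal sketch of the probe-and-filter pass the comment above implies: entries from msrs_to_save_base are only copied into msrs_to_save if the host actually implements them. Illustrative only (the real filtering also checks per-feature capabilities); rdmsrq_safe() is the same probe helper used elsewhere in this patch, and the example_* name is a placeholder.

/* Illustrative sketch, not part of this patch. */
static void example_filter_msrs_to_save(void)
{
	u64 dummy;
	unsigned int i;

	num_msrs_to_save = 0;
	for (i = 0; i < ARRAY_SIZE(msrs_to_save_base); i++) {
		/* Skip MSRs that fault on RDMSR, i.e. lack host support. */
		if (rdmsrq_safe(msrs_to_save_base[i], &dummy))
			continue;
		msrs_to_save[num_msrs_to_save++] = msrs_to_save_base[i];
	}
}
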
+
+static const u32 emulated_msrs_all[] = {
+ MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
+ MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
+
+#ifdef CONFIG_KVM_HYPERV
+ HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
+ HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
+ HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
+ HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
+ HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
+ HV_X64_MSR_RESET,
+ HV_X64_MSR_VP_INDEX,
+ HV_X64_MSR_VP_RUNTIME,
+ HV_X64_MSR_SCONTROL,
+ HV_X64_MSR_STIMER0_CONFIG,
+ HV_X64_MSR_VP_ASSIST_PAGE,
+ HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
+ HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_TSC_INVARIANT_CONTROL,
+ HV_X64_MSR_SYNDBG_OPTIONS,
+ HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS,
+ HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER,
+ HV_X64_MSR_SYNDBG_PENDING_BUFFER,
+#endif
+
+ MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
+ MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,
+
+ MSR_IA32_TSC_ADJUST,
+ MSR_IA32_TSC_DEADLINE,
+ MSR_IA32_ARCH_CAPABILITIES,
+ MSR_IA32_PERF_CAPABILITIES,
+ MSR_IA32_MISC_ENABLE,
+ MSR_IA32_MCG_STATUS,
+ MSR_IA32_MCG_CTL,
+ MSR_IA32_MCG_EXT_CTL,
+ MSR_IA32_SMBASE,
+ MSR_SMI_COUNT,
+ MSR_PLATFORM_INFO,
+ MSR_MISC_FEATURES_ENABLES,
+ MSR_AMD64_VIRT_SPEC_CTRL,
+ MSR_AMD64_TSC_RATIO,
+ MSR_IA32_POWER_CTL,
+ MSR_IA32_UCODE_REV,
+
+ /*
+ * KVM always supports the "true" VMX control MSRs, even if the host
+ * does not. The VMX MSRs as a whole are considered "emulated" as KVM
+ * doesn't strictly require them to exist in the host (ignoring that
+ * KVM would refuse to load in the first place if the core set of MSRs
+ * aren't supported).
+ */
+ MSR_IA32_VMX_BASIC,
+ MSR_IA32_VMX_TRUE_PINBASED_CTLS,
+ MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
+ MSR_IA32_VMX_TRUE_EXIT_CTLS,
+ MSR_IA32_VMX_TRUE_ENTRY_CTLS,
+ MSR_IA32_VMX_MISC,
+ MSR_IA32_VMX_CR0_FIXED0,
+ MSR_IA32_VMX_CR4_FIXED0,
+ MSR_IA32_VMX_VMCS_ENUM,
+ MSR_IA32_VMX_PROCBASED_CTLS2,
+ MSR_IA32_VMX_EPT_VPID_CAP,
+ MSR_IA32_VMX_VMFUNC,
+
+ MSR_K7_HWCR,
+ MSR_KVM_POLL_CONTROL,
+};
+
+static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
+static unsigned num_emulated_msrs;
+
+/*
+ * List of MSRs that control the existence of MSR-based features, i.e. MSRs
+ * that are effectively CPUID leafs. VMX MSRs are also included in the set of
+ * feature MSRs, but are handled separately to allow expedited lookups.
+ */
+static const u32 msr_based_features_all_except_vmx[] = {
+ MSR_AMD64_DE_CFG,
+ MSR_IA32_UCODE_REV,
+ MSR_IA32_ARCH_CAPABILITIES,
+ MSR_IA32_PERF_CAPABILITIES,
+ MSR_PLATFORM_INFO,
+};
+
+static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all_except_vmx) +
+ (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)];
+static unsigned int num_msr_based_features;
+
+/*
+ * All feature MSRs except uCode revID, which tracks the currently loaded uCode
+ * patch, are immutable once the vCPU model is defined.
+ */
+static bool kvm_is_immutable_feature_msr(u32 msr)
+{
+ int i;
+
+ if (msr >= KVM_FIRST_EMULATED_VMX_MSR && msr <= KVM_LAST_EMULATED_VMX_MSR)
+ return true;
+
+ for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) {
+ if (msr == msr_based_features_all_except_vmx[i])
+ return msr != MSR_IA32_UCODE_REV;
+ }
+
+ return false;
}
-EXPORT_SYMBOL_GPL(segment_base);
-u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
+static bool kvm_is_advertised_msr(u32 msr_index)
{
- if (irqchip_in_kernel(vcpu->kvm))
- return vcpu->arch.apic_base;
- else
- return vcpu->arch.apic_base;
+ unsigned int i;
+
+ for (i = 0; i < num_msrs_to_save; i++) {
+ if (msrs_to_save[i] == msr_index)
+ return true;
+ }
+
+ for (i = 0; i < num_emulated_msrs; i++) {
+ if (emulated_msrs[i] == msr_index)
+ return true;
+ }
+
+ return false;
}
-EXPORT_SYMBOL_GPL(kvm_get_apic_base);
-void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
+typedef int (*msr_access_t)(struct kvm_vcpu *vcpu, u32 index, u64 *data,
+ bool host_initiated);
+
+static __always_inline int kvm_do_msr_access(struct kvm_vcpu *vcpu, u32 msr,
+ u64 *data, bool host_initiated,
+ enum kvm_msr_access rw,
+ msr_access_t msr_access_fn)
{
- /* TODO: reserve bits check */
- if (irqchip_in_kernel(vcpu->kvm))
- kvm_lapic_set_base(vcpu, data);
- else
- vcpu->arch.apic_base = data;
+ const char *op = rw == MSR_TYPE_W ? "wrmsr" : "rdmsr";
+ int ret;
+
+ BUILD_BUG_ON(rw != MSR_TYPE_R && rw != MSR_TYPE_W);
+
+ /*
+ * Zero the data on read failures to avoid leaking stack data to the
+ * guest and/or userspace, e.g. if the failure is ignored below.
+ */
+ ret = msr_access_fn(vcpu, msr, data, host_initiated);
+ if (ret && rw == MSR_TYPE_R)
+ *data = 0;
+
+ if (ret != KVM_MSR_RET_UNSUPPORTED)
+ return ret;
+
+ /*
+ * Userspace is allowed to read MSRs, and write '0' to MSRs, that KVM
+ * advertises to userspace, even if an MSR isn't fully supported.
+ * Simply check that @data is '0', which covers both the write '0' case
+ * and all reads (in which case @data is zeroed on failure; see above).
+ */
+ if (host_initiated && !*data && kvm_is_advertised_msr(msr))
+ return 0;
+
+ if (!ignore_msrs) {
+ kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n",
+ op, msr, *data);
+ return ret;
+ }
+
+ if (report_ignored_msrs)
+ kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n", op, msr, *data);
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(kvm_set_apic_base);
-void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
+static struct kmem_cache *kvm_alloc_emulator_cache(void)
{
- WARN_ON(vcpu->arch.exception.pending);
- vcpu->arch.exception.pending = true;
- vcpu->arch.exception.has_error_code = false;
- vcpu->arch.exception.nr = nr;
+ unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src);
+ unsigned int size = sizeof(struct x86_emulate_ctxt);
+
+ return kmem_cache_create_usercopy("x86_emulator", size,
+ __alignof__(struct x86_emulate_ctxt),
+ SLAB_ACCOUNT, useroffset,
+ size - useroffset, NULL);
}
-EXPORT_SYMBOL_GPL(kvm_queue_exception);
-void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
- u32 error_code)
+static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
+
+static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
{
- ++vcpu->stat.pf_guest;
+ int i;
+ for (i = 0; i < ASYNC_PF_PER_VCPU; i++)
+ vcpu->arch.apf.gfns[i] = ~0;
+}
- if (vcpu->arch.exception.pending) {
- if (vcpu->arch.exception.nr == PF_VECTOR) {
- printk(KERN_DEBUG "kvm: inject_page_fault:"
- " double fault 0x%lx\n", addr);
- vcpu->arch.exception.nr = DF_VECTOR;
- vcpu->arch.exception.error_code = 0;
- } else if (vcpu->arch.exception.nr == DF_VECTOR) {
- /* triple fault -> shutdown */
- set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+static void kvm_destroy_user_return_msrs(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ WARN_ON_ONCE(per_cpu(user_return_msrs, cpu).registered);
+
+ kvm_nr_uret_msrs = 0;
+}
+
+static void kvm_on_user_return(struct user_return_notifier *urn)
+{
+ unsigned slot;
+ struct kvm_user_return_msrs *msrs
+ = container_of(urn, struct kvm_user_return_msrs, urn);
+ struct kvm_user_return_msr_values *values;
+
+ msrs->registered = false;
+ user_return_notifier_unregister(urn);
+
+ for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) {
+ values = &msrs->values[slot];
+ if (values->host != values->curr) {
+ wrmsrq(kvm_uret_msrs_list[slot], values->host);
+ values->curr = values->host;
}
+ }
+}
+
+static int kvm_probe_user_return_msr(u32 msr)
+{
+ u64 val;
+ int ret;
+
+ preempt_disable();
+ ret = rdmsrq_safe(msr, &val);
+ if (ret)
+ goto out;
+ ret = wrmsrq_safe(msr, val);
+out:
+ preempt_enable();
+ return ret;
+}
+
+int kvm_add_user_return_msr(u32 msr)
+{
+ BUG_ON(kvm_nr_uret_msrs >= KVM_MAX_NR_USER_RETURN_MSRS);
+
+ if (kvm_probe_user_return_msr(msr))
+ return -1;
+
+ kvm_uret_msrs_list[kvm_nr_uret_msrs] = msr;
+ return kvm_nr_uret_msrs++;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_add_user_return_msr);
+
+int kvm_find_user_return_msr(u32 msr)
+{
+ int i;
+
+ for (i = 0; i < kvm_nr_uret_msrs; ++i) {
+ if (kvm_uret_msrs_list[i] == msr)
+ return i;
+ }
+ return -1;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_find_user_return_msr);
+
+static void kvm_user_return_msr_cpu_online(void)
+{
+ struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs);
+ u64 value;
+ int i;
+
+ for (i = 0; i < kvm_nr_uret_msrs; ++i) {
+ rdmsrq_safe(kvm_uret_msrs_list[i], &value);
+ msrs->values[i].host = value;
+ msrs->values[i].curr = value;
+ }
+}
+
+static void kvm_user_return_register_notifier(struct kvm_user_return_msrs *msrs)
+{
+ if (!msrs->registered) {
+ msrs->urn.on_user_return = kvm_on_user_return;
+ user_return_notifier_register(&msrs->urn);
+ msrs->registered = true;
+ }
+}
+
+int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
+{
+ struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs);
+ int err;
+
+ value = (value & mask) | (msrs->values[slot].host & ~mask);
+ if (value == msrs->values[slot].curr)
+ return 0;
+ err = wrmsrq_safe(kvm_uret_msrs_list[slot], value);
+ if (err)
+ return 1;
+
+ msrs->values[slot].curr = value;
+ kvm_user_return_register_notifier(msrs);
+ return 0;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_user_return_msr);
+
+u64 kvm_get_user_return_msr(unsigned int slot)
+{
+ return this_cpu_ptr(&user_return_msrs)->values[slot].curr;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_user_return_msr);
+
+static void drop_user_return_notifiers(void)
+{
+ struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs);
+
+ if (msrs->registered)
+ kvm_on_user_return(&msrs->urn);
+}
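
For context, a hedged sketch of how a vendor module would typically consume the user-return MSR machinery above; the example_* names and the choice of MSR_TSC_AUX are placeholders, not part of this patch.

/* Illustrative usage only: register the MSR once, then lazily load the guest
 * value before entering the guest; kvm_on_user_return() restores the host
 * value the next time the CPU returns to userspace. */
static int example_tsc_aux_slot;

static void example_hardware_setup(void)
{
	example_tsc_aux_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
}

static void example_prepare_switch_to_guest(u64 guest_tsc_aux)
{
	/* An all-ones mask writes the full guest value into the slot. */
	kvm_set_user_return_msr(example_tsc_aux_slot, guest_tsc_aux, -1ull);
}
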
+
+/*
+ * Handle a fault on a hardware virtualization (VMX or SVM) instruction.
+ *
+ * Hardware virtualization extension instructions may fault if a reboot turns
+ * off virtualization while processes are running. Usually after catching the
+ * fault we just panic; during reboot instead the instruction is ignored.
+ */
+noinstr void kvm_spurious_fault(void)
+{
+ /* Fault while not rebooting. We want the trace. */
+ BUG_ON(!kvm_rebooting);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_spurious_fault);
+
+#define EXCPT_BENIGN 0
+#define EXCPT_CONTRIBUTORY 1
+#define EXCPT_PF 2
+
+static int exception_class(int vector)
+{
+ switch (vector) {
+ case PF_VECTOR:
+ return EXCPT_PF;
+ case DE_VECTOR:
+ case TS_VECTOR:
+ case NP_VECTOR:
+ case SS_VECTOR:
+ case GP_VECTOR:
+ return EXCPT_CONTRIBUTORY;
+ default:
+ break;
+ }
+ return EXCPT_BENIGN;
+}
+
+#define EXCPT_FAULT 0
+#define EXCPT_TRAP 1
+#define EXCPT_ABORT 2
+#define EXCPT_INTERRUPT 3
+#define EXCPT_DB 4
+
+static int exception_type(int vector)
+{
+ unsigned int mask;
+
+ if (WARN_ON(vector > 31 || vector == NMI_VECTOR))
+ return EXCPT_INTERRUPT;
+
+ mask = 1 << vector;
+
+ /*
+ * #DBs can be trap-like or fault-like, the caller must check other CPU
+ * state, e.g. DR6, to determine whether a #DB is a trap or fault.
+ */
+ if (mask & (1 << DB_VECTOR))
+ return EXCPT_DB;
+
+ if (mask & ((1 << BP_VECTOR) | (1 << OF_VECTOR)))
+ return EXCPT_TRAP;
+
+ if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR)))
+ return EXCPT_ABORT;
+
+ /* Reserved exceptions will result in fault */
+ return EXCPT_FAULT;
+}
+
+void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu,
+ struct kvm_queued_exception *ex)
+{
+ if (!ex->has_payload)
return;
+
+ switch (ex->vector) {
+ case DB_VECTOR:
+ /*
+ * "Certain debug exceptions may clear bit 0-3. The
+ * remaining contents of the DR6 register are never
+ * cleared by the processor".
+ */
+ vcpu->arch.dr6 &= ~DR_TRAP_BITS;
+ /*
+ * In order to reflect the #DB exception payload in guest
+ * dr6, three components need to be considered: active low
+ * bits, FIXED_1 bits, and active high bits (e.g. DR6_BD,
+ * DR6_BS and DR6_BT).
+ * DR6_ACTIVE_LOW contains the FIXED_1 and active low bits.
+ * In the target guest dr6:
+ * FIXED_1 bits should always be set.
+ * Active low bits should be cleared if set to '1' in the payload.
+ * Active high bits should be set if set to '1' in the payload.
+ *
+ * Note, the payload is compatible with the pending debug
+ * exceptions/exit qualification under VMX, in that active_low
+ * bits are active high in the payload, so they need to be
+ * flipped for DR6.
+ */
+ vcpu->arch.dr6 |= DR6_ACTIVE_LOW;
+ vcpu->arch.dr6 |= ex->payload;
+ vcpu->arch.dr6 ^= ex->payload & DR6_ACTIVE_LOW;
+
+ /*
+ * The #DB payload is defined as compatible with the 'pending
+ * debug exceptions' field under VMX, not DR6. While bit 12 is
+ * defined in the 'pending debug exceptions' field (enabled
+ * breakpoint), it is reserved and must be zero in DR6.
+ */
+ vcpu->arch.dr6 &= ~BIT(12);
+ break;
+ case PF_VECTOR:
+ vcpu->arch.cr2 = ex->payload;
+ break;
+ }
+
+ ex->has_payload = false;
+ ex->payload = 0;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_deliver_exception_payload);
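
A worked example of the DR6 merge above (illustrative, assuming DR6_ACTIVE_LOW == 0xffff0ff0 and a DR6 with no other stale guest bits): take a payload of DR6_BS | DR6_RTM, i.e. a single-step #DB reported inside an RTM region.

/*
 *   dr6  = DR6_ACTIVE_LOW;            // 0xffff0ff0 (FIXED_1 + active-low bits set)
 *   dr6 |= payload;                   // 0xffff4ff0 (BS set; RTM bit was already set)
 *   dr6 ^= payload & DR6_ACTIVE_LOW;  // 0xfffe4ff0 (RTM, active-low, flipped to 0)
 *
 * i.e. the active-high BS bit ends up set and the active-low RTM bit ends up
 * cleared, which is the architectural DR6 value for that payload.
 */
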
+
+static void kvm_queue_exception_vmexit(struct kvm_vcpu *vcpu, unsigned int vector,
+ bool has_error_code, u32 error_code,
+ bool has_payload, unsigned long payload)
+{
+ struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;
+
+ ex->vector = vector;
+ ex->injected = false;
+ ex->pending = true;
+ ex->has_error_code = has_error_code;
+ ex->error_code = error_code;
+ ex->has_payload = has_payload;
+ ex->payload = payload;
+}
+
+static void kvm_multiple_exception(struct kvm_vcpu *vcpu, unsigned int nr,
+ bool has_error, u32 error_code,
+ bool has_payload, unsigned long payload)
+{
+ u32 prev_nr;
+ int class1, class2;
+
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ /*
+ * If the exception is destined for L2, morph it to a VM-Exit if L1
+ * wants to intercept the exception.
+ */
+ if (is_guest_mode(vcpu) &&
+ kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, nr, error_code)) {
+ kvm_queue_exception_vmexit(vcpu, nr, has_error, error_code,
+ has_payload, payload);
+ return;
+ }
+
+ if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
+ queue:
+ vcpu->arch.exception.pending = true;
+ vcpu->arch.exception.injected = false;
+
+ vcpu->arch.exception.has_error_code = has_error;
+ vcpu->arch.exception.vector = nr;
+ vcpu->arch.exception.error_code = error_code;
+ vcpu->arch.exception.has_payload = has_payload;
+ vcpu->arch.exception.payload = payload;
+ if (!is_guest_mode(vcpu))
+ kvm_deliver_exception_payload(vcpu,
+ &vcpu->arch.exception);
+ return;
+ }
+
+ /* Evaluate how the new exception combines with the one already queued. */
+ prev_nr = vcpu->arch.exception.vector;
+ if (prev_nr == DF_VECTOR) {
+ /* triple fault -> shutdown */
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ return;
+ }
+ class1 = exception_class(prev_nr);
+ class2 = exception_class(nr);
+ if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) ||
+ (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
+ /*
+ * Synthesize #DF. Clear the previously injected or pending
+ * exception so as not to incorrectly trigger shutdown.
+ */
+ vcpu->arch.exception.injected = false;
+ vcpu->arch.exception.pending = false;
+
+ kvm_queue_exception_e(vcpu, DF_VECTOR, 0);
+ } else {
+ /*
+  * Replace the previous exception with the new one in the hope that
+  * instruction re-execution will regenerate the lost exception.
+  */
+ goto queue;
+ }
+}
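
To make the class rules above concrete, a few illustrative combinations of an already-queued exception and a newly raised one (not part of the patch, just the decision table the code implements):

/*
 *   pending #DE (contributory) + new #GP (contributory) -> synthesize #DF
 *   pending #PF (EXCPT_PF)     + new #GP (contributory) -> synthesize #DF
 *   pending #GP (contributory) + new #UD (benign)       -> replace and re-queue;
 *                                                          re-execution regenerates the lost #GP
 *   pending #DF                + anything                -> triple fault (shutdown)
 */
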
+
+void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
+{
+ kvm_multiple_exception(vcpu, nr, false, 0, false, 0);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_queue_exception);
+
+
+void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr,
+ unsigned long payload)
+{
+ kvm_multiple_exception(vcpu, nr, false, 0, true, payload);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_queue_exception_p);
+
+static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr,
+ u32 error_code, unsigned long payload)
+{
+ kvm_multiple_exception(vcpu, nr, true, error_code, true, payload);
+}
+
+void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned int nr,
+ bool has_error_code, u32 error_code)
+{
+
+ /*
+ * On VM-Entry, an exception can be pending if and only if event
+ * injection was blocked by nested_run_pending. In that case, however,
+ * vcpu_enter_guest() requests an immediate exit, and the guest
+ * shouldn't proceed far enough to need reinjection.
+ */
+ WARN_ON_ONCE(kvm_is_exception_pending(vcpu));
+
+ /*
+ * Do not check for interception when injecting an event for L2, as the
+ * exception was checked for intercept when it was originally queued, and
+ * re-checking is incorrect if _L1_ injected the exception, in which
+ * case it's exempt from interception.
+ */
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ vcpu->arch.exception.injected = true;
+ vcpu->arch.exception.has_error_code = has_error_code;
+ vcpu->arch.exception.vector = nr;
+ vcpu->arch.exception.error_code = error_code;
+ vcpu->arch.exception.has_payload = false;
+ vcpu->arch.exception.payload = 0;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_requeue_exception);
+
+int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
+{
+ if (err)
+ kvm_inject_gp(vcpu, 0);
+ else
+ return kvm_skip_emulated_instruction(vcpu);
+
+ return 1;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_complete_insn_gp);
+
+static int complete_emulated_insn_gp(struct kvm_vcpu *vcpu, int err)
+{
+ if (err) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
}
- vcpu->arch.cr2 = addr;
- kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
+
+ return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE | EMULTYPE_SKIP |
+ EMULTYPE_COMPLETE_USER_EXIT);
}
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
+{
+ ++vcpu->stat.pf_guest;
+
+ /*
+ * Async #PF in L2 is always forwarded to L1 as a VM-Exit regardless of
+ * whether or not L1 wants to intercept "regular" #PF.
+ */
+ if (is_guest_mode(vcpu) && fault->async_page_fault)
+ kvm_queue_exception_vmexit(vcpu, PF_VECTOR,
+ true, fault->error_code,
+ true, fault->address);
+ else
+ kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
+ fault->address);
+}
+
+void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault)
+{
+ struct kvm_mmu *fault_mmu;
+ WARN_ON_ONCE(fault->vector != PF_VECTOR);
+
+ fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu :
+ vcpu->arch.walk_mmu;
+
+ /*
+ * Invalidate the TLB entry for the faulting address, if it exists,
+ * else the access will fault indefinitely (and to emulate hardware).
+ */
+ if ((fault->error_code & PFERR_PRESENT_MASK) &&
+ !(fault->error_code & PFERR_RSVD_MASK))
+ kvm_mmu_invalidate_addr(vcpu, fault_mmu, fault->address,
+ KVM_MMU_ROOT_CURRENT);
+
+ fault_mmu->inject_page_fault(vcpu, fault);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_inject_emulated_page_fault);
+
void kvm_inject_nmi(struct kvm_vcpu *vcpu)
{
- vcpu->arch.nmi_pending = 1;
+ atomic_inc(&vcpu->arch.nmi_queued);
+ kvm_make_request(KVM_REQ_NMI, vcpu);
}
-EXPORT_SYMBOL_GPL(kvm_inject_nmi);
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
- WARN_ON(vcpu->arch.exception.pending);
- vcpu->arch.exception.pending = true;
- vcpu->arch.exception.has_error_code = true;
- vcpu->arch.exception.nr = nr;
- vcpu->arch.exception.error_code = error_code;
+ kvm_multiple_exception(vcpu, nr, true, error_code, false, 0);
}
-EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_queue_exception_e);
-static void __queue_exception(struct kvm_vcpu *vcpu)
+/*
+ * Check whether cpl <= required_cpl; if so, return true. Otherwise queue
+ * a #GP and return false.
+ */
+bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
+{
+ if (kvm_x86_call(get_cpl)(vcpu) <= required_cpl)
+ return true;
+ kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+ return false;
+}
+
+bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
{
- kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
- vcpu->arch.exception.has_error_code,
- vcpu->arch.exception.error_code);
+ if ((dr != 4 && dr != 5) || !kvm_is_cr4_bit_set(vcpu, X86_CR4_DE))
+ return true;
+
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return false;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_require_dr);
+
+static bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu)
+{
+ u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;
+
+ return (vcpu->arch.apf.msr_en_val & mask) == mask;
+}
+
+static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2);
}
/*
- * Load the pae pdptrs. Return true is they are all valid.
+ * Load the pae pdptrs. Return 1 if they are all valid, 0 otherwise.
*/
int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
- unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
+ gpa_t real_gpa;
int i;
int ret;
- u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
+ u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
+
+ /*
+ * If the MMU is nested, CR3 holds an L2 GPA and needs to be translated
+ * to an L1 GPA.
+ */
+ real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(pdpt_gfn),
+ PFERR_USER_MASK | PFERR_WRITE_MASK, NULL);
+ if (real_gpa == INVALID_GPA)
+ return 0;
+
+ /* Note the offset, PDPTRs are 32 byte aligned when using PAE paging. */
+ ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(real_gpa), pdpte,
+ cr3 & GENMASK(11, 5), sizeof(pdpte));
+ if (ret < 0)
+ return 0;
- ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
- offset * sizeof(u64), sizeof(pdpte));
- if (ret < 0) {
- ret = 0;
- goto out;
- }
for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
- if (is_present_pte(pdpte[i]) &&
- (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
- ret = 0;
- goto out;
+ if ((pdpte[i] & PT_PRESENT_MASK) &&
+ (pdpte[i] & pdptr_rsvd_bits(vcpu))) {
+ return 0;
}
}
- ret = 1;
- memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
-out:
+ /*
+ * Marking VCPU_EXREG_PDPTR dirty doesn't work for !tdp_enabled.
+ * Shadow page roots need to be reconstructed instead.
+ */
+ if (!tdp_enabled && memcmp(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)))
+ kvm_mmu_free_roots(vcpu->kvm, mmu, KVM_MMU_ROOT_CURRENT);
- return ret;
+ memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
+ kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
+ vcpu->arch.pdptrs_from_userspace = false;
+
+ return 1;
}
-EXPORT_SYMBOL_GPL(load_pdptrs);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(load_pdptrs);
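
As a concrete illustration of the offset math above (ignoring the nested-MMU translation step): under PAE paging the four 8-byte PDPTEs sit at a 32-byte-aligned offset inside the CR3 page. The value below is made up.

/*
 *   cr3 = 0x12345ee3
 *   pdpt_gfn             = cr3 >> PAGE_SHIFT  = 0x12345
 *   cr3 & GENMASK(11, 5)                      = 0xee0
 * so the PDPTEs are read from guest physical 0x12345ee0 .. 0x12345eff.
 */
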
-static bool pdptrs_changed(struct kvm_vcpu *vcpu)
+static bool kvm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
- u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
- bool changed = true;
- int r;
+#ifdef CONFIG_X86_64
+ if (cr0 & 0xffffffff00000000UL)
+ return false;
+#endif
- if (is_long_mode(vcpu) || !is_pae(vcpu))
+ if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
return false;
- r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
- if (r < 0)
- goto out;
- changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
-out:
+ if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
+ return false;
- return changed;
+ return kvm_x86_call(is_valid_cr0)(vcpu, cr0);
}
-void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0)
{
- if (cr0 & CR0_RESERVED_BITS) {
- printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
- cr0, vcpu->arch.cr0);
- kvm_inject_gp(vcpu, 0);
- return;
- }
+ /*
+ * CR0.WP is incorporated into the MMU role, but only for non-nested,
+ * indirect shadow MMUs. If paging is disabled, no updates are needed
+ * as there are no permission bits to emulate. If TDP is enabled, the
+ * MMU's metadata needs to be updated, e.g. so that emulating guest
+ * translations does the right thing, but there's no need to unload the
+ * root as CR0.WP doesn't affect SPTEs.
+ */
+ if ((cr0 ^ old_cr0) == X86_CR0_WP) {
+ if (!(cr0 & X86_CR0_PG))
+ return;
- if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
- printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
- kvm_inject_gp(vcpu, 0);
- return;
+ if (tdp_enabled) {
+ kvm_init_mmu(vcpu);
+ return;
+ }
}
- if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
- printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
- "and a clear PE flag\n");
- kvm_inject_gp(vcpu, 0);
- return;
+ if ((cr0 ^ old_cr0) & X86_CR0_PG) {
+ /*
+ * Clearing CR0.PG is defined to flush the TLB from the guest's
+ * perspective.
+ */
+ if (!(cr0 & X86_CR0_PG))
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+ /*
+ * Check for async #PF completion events when enabling paging,
+ * as the vCPU may have previously encountered async #PFs (it's
+ * entirely legal for the guest to toggle paging on/off without
+ * waiting for the async #PF queue to drain).
+ */
+ else if (kvm_pv_async_pf_enabled(vcpu))
+ kvm_make_request(KVM_REQ_APF_READY, vcpu);
}
- if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
-#ifdef CONFIG_X86_64
- if ((vcpu->arch.shadow_efer & EFER_LME)) {
- int cs_db, cs_l;
+ if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS)
+ kvm_mmu_reset_context(vcpu);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_post_set_cr0);
- if (!is_pae(vcpu)) {
- printk(KERN_DEBUG "set_cr0: #GP, start paging "
- "in long mode while PAE is disabled\n");
- kvm_inject_gp(vcpu, 0);
- return;
- }
- kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
- if (cs_l) {
- printk(KERN_DEBUG "set_cr0: #GP, start paging "
- "in long mode while CS.L == 1\n");
- kvm_inject_gp(vcpu, 0);
- return;
+int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+ unsigned long old_cr0 = kvm_read_cr0(vcpu);
- }
- } else
-#endif
- if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
- printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
- "reserved bits\n");
- kvm_inject_gp(vcpu, 0);
- return;
- }
+ if (!kvm_is_valid_cr0(vcpu, cr0))
+ return 1;
+
+ cr0 |= X86_CR0_ET;
+
+ /* Writes to CR0 reserved bits are ignored, even on Intel. */
+ cr0 &= ~CR0_RESERVED_BITS;
+#ifdef CONFIG_X86_64
+ if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) &&
+ (cr0 & X86_CR0_PG)) {
+ int cs_db, cs_l;
+
+ if (!is_pae(vcpu))
+ return 1;
+ kvm_x86_call(get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
+ if (cs_l)
+ return 1;
}
+#endif
+ if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) &&
+ is_pae(vcpu) && ((cr0 ^ old_cr0) & X86_CR0_PDPTR_BITS) &&
+ !load_pdptrs(vcpu, kvm_read_cr3(vcpu)))
+ return 1;
- kvm_x86_ops->set_cr0(vcpu, cr0);
- vcpu->arch.cr0 = cr0;
+ if (!(cr0 & X86_CR0_PG) &&
+ (is_64_bit_mode(vcpu) || kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)))
+ return 1;
- kvm_mmu_reset_context(vcpu);
- return;
+ if (!(cr0 & X86_CR0_WP) && kvm_is_cr4_bit_set(vcpu, X86_CR4_CET))
+ return 1;
+
+ kvm_x86_call(set_cr0)(vcpu, cr0);
+
+ kvm_post_set_cr0(vcpu, old_cr0, cr0);
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(kvm_set_cr0);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cr0);
void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
- kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
- KVMTRACE_1D(LMSW, vcpu,
- (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)),
- handler);
+ (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
}
-EXPORT_SYMBOL_GPL(kvm_lmsw);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lmsw);
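
A quick worked example of the masking above, with made-up values: LMSW rewrites only CR0[3:0], and PE can be set but never cleared.

/*
 *   old CR0 = 0x8005003b (PG|AM|WP|NE|ET|TS|MP|PE), msw = 0x0002 (MP)
 *   kvm_read_cr0_bits(vcpu, ~0x0eul)  -> 0x80050031  (MP/EM/TS dropped, PE kept)
 *   ... | (msw & 0x0f)                -> 0x80050033  (MP re-set from msw, TS now clear)
 */
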
-void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+static void kvm_load_xfeatures(struct kvm_vcpu *vcpu, bool load_guest)
{
- unsigned long old_cr4 = vcpu->arch.cr4;
- unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
+ if (vcpu->arch.guest_state_protected)
+ return;
- if (cr4 & CR4_RESERVED_BITS) {
- printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
- kvm_inject_gp(vcpu, 0);
+ if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE))
return;
- }
- if (is_long_mode(vcpu)) {
- if (!(cr4 & X86_CR4_PAE)) {
- printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
- "in long mode\n");
- kvm_inject_gp(vcpu, 0);
- return;
- }
- } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
- && ((cr4 ^ old_cr4) & pdptr_bits)
- && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
- printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
- kvm_inject_gp(vcpu, 0);
+ if (vcpu->arch.xcr0 != kvm_host.xcr0)
+ xsetbv(XCR_XFEATURE_ENABLED_MASK,
+ load_guest ? vcpu->arch.xcr0 : kvm_host.xcr0);
+
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) &&
+ vcpu->arch.ia32_xss != kvm_host.xss)
+ wrmsrq(MSR_IA32_XSS, load_guest ? vcpu->arch.ia32_xss : kvm_host.xss);
+}
+
+static void kvm_load_guest_pkru(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.guest_state_protected)
return;
- }
- if (cr4 & X86_CR4_VMXE) {
- printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
- kvm_inject_gp(vcpu, 0);
+ if (cpu_feature_enabled(X86_FEATURE_PKU) &&
+ vcpu->arch.pkru != vcpu->arch.host_pkru &&
+ ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
+ kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE)))
+ wrpkru(vcpu->arch.pkru);
+}
+
+static void kvm_load_host_pkru(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.guest_state_protected)
return;
+
+ if (cpu_feature_enabled(X86_FEATURE_PKU) &&
+ ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
+ kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) {
+ vcpu->arch.pkru = rdpkru();
+ if (vcpu->arch.pkru != vcpu->arch.host_pkru)
+ wrpkru(vcpu->arch.host_pkru);
}
- kvm_x86_ops->set_cr4(vcpu, cr4);
- vcpu->arch.cr4 = cr4;
- vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
- kvm_mmu_reset_context(vcpu);
}
-EXPORT_SYMBOL_GPL(kvm_set_cr4);
-void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+#ifdef CONFIG_X86_64
+static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu)
{
- if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
- kvm_mmu_sync_roots(vcpu);
- kvm_mmu_flush_tlb(vcpu);
- return;
+ return vcpu->arch.guest_supported_xcr0 & XFEATURE_MASK_USER_DYNAMIC;
+}
+#endif
+
+int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
+{
+ u64 xcr0 = xcr;
+ u64 old_xcr0 = vcpu->arch.xcr0;
+ u64 valid_bits;
+
+ /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */
+ if (index != XCR_XFEATURE_ENABLED_MASK)
+ return 1;
+ if (!(xcr0 & XFEATURE_MASK_FP))
+ return 1;
+ if ((xcr0 & XFEATURE_MASK_YMM) && !(xcr0 & XFEATURE_MASK_SSE))
+ return 1;
+
+ /*
+ * Do not allow the guest to set bits that we do not support
+ * saving. However, xcr0 bit 0 is always set, even if the
+ * emulated CPU does not support XSAVE (see kvm_vcpu_reset()).
+ */
+ valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
+ if (xcr0 & ~valid_bits)
+ return 1;
+
+ if ((!(xcr0 & XFEATURE_MASK_BNDREGS)) !=
+ (!(xcr0 & XFEATURE_MASK_BNDCSR)))
+ return 1;
+
+ if (xcr0 & XFEATURE_MASK_AVX512) {
+ if (!(xcr0 & XFEATURE_MASK_YMM))
+ return 1;
+ if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
+ return 1;
}
- if (is_long_mode(vcpu)) {
- if (cr3 & CR3_L_MODE_RESERVED_BITS) {
- printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
- kvm_inject_gp(vcpu, 0);
- return;
- }
- } else {
- if (is_pae(vcpu)) {
- if (cr3 & CR3_PAE_RESERVED_BITS) {
- printk(KERN_DEBUG
- "set_cr3: #GP, reserved bits\n");
- kvm_inject_gp(vcpu, 0);
- return;
- }
- if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
- printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
- "reserved bits\n");
- kvm_inject_gp(vcpu, 0);
- return;
- }
- }
- /*
- * We don't check reserved bits in nonpae mode, because
- * this isn't enforced, and VMware depends on this.
- */
+ if ((xcr0 & XFEATURE_MASK_XTILE) &&
+ ((xcr0 & XFEATURE_MASK_XTILE) != XFEATURE_MASK_XTILE))
+ return 1;
+
+ vcpu->arch.xcr0 = xcr0;
+
+ if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
+ vcpu->arch.cpuid_dynamic_bits_dirty = true;
+ return 0;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_set_xcr);
+
+int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu)
+{
+ /* Note, #UD due to CR4.OSXSAVE=0 has priority over the intercept. */
+ if (kvm_x86_call(get_cpl)(vcpu) != 0 ||
+ __kvm_set_xcr(vcpu, kvm_rcx_read(vcpu), kvm_read_edx_eax(vcpu))) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
}
+ return kvm_skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_xsetbv);
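
For reference, a few concrete XCR0 values and how the checks in __kvm_set_xcr() treat them, assuming guest_supported_xcr0 covers the bits involved (illustrative only):

/*
 *   XCR0 = 0                        -> reject: FP (bit 0) must always be set
 *   XCR0 = FP | YMM                 -> reject: YMM requires SSE
 *   XCR0 = FP | SSE | YMM           -> accept: AVX state enabled
 *   XCR0 = FP | SSE | YMM | OPMASK  -> reject: the AVX-512 bits are all-or-nothing
 *   XCR0 = FP | SSE | BNDREGS       -> reject: BNDREGS and BNDCSR must match
 */
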
+
+static bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ return __kvm_is_valid_cr4(vcpu, cr4) &&
+ kvm_x86_call(is_valid_cr4)(vcpu, cr4);
+}
+
+void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4)
+{
+ if ((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS)
+ kvm_mmu_reset_context(vcpu);
+
+ /*
+ * If CR4.PCIDE is changed 0 -> 1, there is no need to flush the TLB
+ * according to the SDM; however, stale prev_roots could be reused
+ * incorrectly in the future after a MOV to CR3 with NOFLUSH=1, so we
+ * free them all. This is *not* a superset of KVM_REQ_TLB_FLUSH_GUEST
+ * or KVM_REQ_TLB_FLUSH_CURRENT, because the hardware TLB is not flushed,
+ * so fall through.
+ */
+ if (!tdp_enabled &&
+ (cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE))
+ kvm_mmu_unload(vcpu);
+
/*
- * Does the new cr3 value map to physical memory? (Note, we
- * catch an invalid cr3 even in real-mode, because it would
- * cause trouble later on when we turn on paging anyway.)
+ * The TLB has to be flushed for all PCIDs if any of the following
+ * (architecturally required) changes happen:
+ * - CR4.PCIDE is changed from 1 to 0
+ * - CR4.PGE is toggled
*
- * A real CPU would silently accept an invalid cr3 and would
- * attempt to use it - with largely undefined (and often hard
- * to debug) behavior on the guest side.
+ * This is a superset of KVM_REQ_TLB_FLUSH_CURRENT.
*/
- if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
- kvm_inject_gp(vcpu, 0);
- else {
- vcpu->arch.cr3 = cr3;
- vcpu->arch.mmu.new_cr3(vcpu);
+ if (((cr4 ^ old_cr4) & X86_CR4_PGE) ||
+ (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+
+ /*
+ * The TLB has to be flushed for the current PCID if any of the
+ * following (architecturally required) changes happen:
+ * - CR4.SMEP is changed from 0 to 1
+ * - CR4.PAE is toggled
+ */
+ else if (((cr4 ^ old_cr4) & X86_CR4_PAE) ||
+ ((cr4 & X86_CR4_SMEP) && !(old_cr4 & X86_CR4_SMEP)))
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_post_set_cr4);
+
+int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ unsigned long old_cr4 = kvm_read_cr4(vcpu);
+
+ if (!kvm_is_valid_cr4(vcpu, cr4))
+ return 1;
+
+ if (is_long_mode(vcpu)) {
+ if (!(cr4 & X86_CR4_PAE))
+ return 1;
+ if ((cr4 ^ old_cr4) & X86_CR4_LA57)
+ return 1;
+ } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
+ && ((cr4 ^ old_cr4) & X86_CR4_PDPTR_BITS)
+ && !load_pdptrs(vcpu, kvm_read_cr3(vcpu)))
+ return 1;
+
+ if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
+ /* PCID cannot be enabled when cr3[11:0] != 000H or EFER.LMA = 0 */
+ if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
+ return 1;
}
+
+ if ((cr4 & X86_CR4_CET) && !kvm_is_cr0_bit_set(vcpu, X86_CR0_WP))
+ return 1;
+
+ kvm_x86_call(set_cr4)(vcpu, cr4);
+
+ kvm_post_set_cr4(vcpu, old_cr4, cr4);
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(kvm_set_cr3);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cr4);
-void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
+static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid)
{
- if (cr8 & CR8_RESERVED_BITS) {
- printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
- kvm_inject_gp(vcpu, 0);
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ unsigned long roots_to_free = 0;
+ int i;
+
+ /*
+ * MOV CR3 and INVPCID are usually not intercepted when using TDP, but
+ * this is reachable when running EPT=1 and unrestricted_guest=0, and
+ * also via the emulator. KVM's TDP page tables are not in the scope of
+ * the invalidation, but the guest's TLB entries need to be flushed as
+ * the CPU may have cached entries in its TLB for the target PCID.
+ */
+ if (unlikely(tdp_enabled)) {
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
return;
}
- if (irqchip_in_kernel(vcpu->kvm))
+
+ /*
+ * If neither the current CR3 nor any of the prev_roots use the given
+ * PCID, then nothing needs to be done here because a resync will
+ * happen anyway before switching to any other CR3.
+ */
+ if (kvm_get_active_pcid(vcpu) == pcid) {
+ kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+ }
+
+ /*
+ * If PCID is disabled, there is no need to free prev_roots even if the
+ * PCIDs for them are also 0, because MOV to CR3 always flushes the TLB
+ * with PCIDE=0.
+ */
+ if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE))
+ return;
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if (kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd) == pcid)
+ roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+
+ kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free);
+}
+
+int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+{
+ bool skip_tlb_flush = false;
+ unsigned long pcid = 0;
+#ifdef CONFIG_X86_64
+ if (kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)) {
+ skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
+ cr3 &= ~X86_CR3_PCID_NOFLUSH;
+ pcid = cr3 & X86_CR3_PCID_MASK;
+ }
+#endif
+
+ /* PDPTRs are always reloaded for PAE paging. */
+ if (cr3 == kvm_read_cr3(vcpu) && !is_pae_paging(vcpu))
+ goto handle_tlb_flush;
+
+ /*
+ * Do not condition the GPA check on long mode, this helper is used to
+ * stuff CR3, e.g. for RSM emulation, and there is no guarantee that
+ * the current vCPU mode is accurate.
+ */
+ if (!kvm_vcpu_is_legal_cr3(vcpu, cr3))
+ return 1;
+
+ if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, cr3))
+ return 1;
+
+ if (cr3 != kvm_read_cr3(vcpu))
+ kvm_mmu_new_pgd(vcpu, cr3);
+
+ vcpu->arch.cr3 = cr3;
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
+ /* Do not call post_set_cr3, we do not get here for confidential guests. */
+
+handle_tlb_flush:
+ /*
+ * A load of CR3 that flushes the TLB flushes only the current PCID,
+ * even if PCID is disabled, in which case PCID=0 is flushed. It's a
+ * moot point in the end because _disabling_ PCID will flush all PCIDs,
+ * and it's impossible to use a non-zero PCID when PCID is disabled,
+ * i.e. only PCID=0 can be relevant.
+ */
+ if (!skip_tlb_flush)
+ kvm_invalidate_pcid(vcpu, pcid);
+
+ return 0;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cr3);
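
As a small illustration of the CR3 decomposition at the top of kvm_set_cr3() (64-bit guest with CR4.PCIDE=1; the value is made up):

/*
 *   cr3 = 0x8000000123456005
 *     bit 63 (X86_CR3_PCID_NOFLUSH) set -> skip_tlb_flush = true, bit stripped
 *     pcid = cr3 & X86_CR3_PCID_MASK    -> 0x005
 *     page-table base                   -> 0x123456000
 * With CR4.PCIDE = 0 the PCID field must be zero and every CR3 load flushes PCID 0.
 */
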
+
+int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
+{
+ if (cr8 & CR8_RESERVED_BITS)
+ return 1;
+ if (lapic_in_kernel(vcpu))
kvm_lapic_set_tpr(vcpu, cr8);
else
vcpu->arch.cr8 = cr8;
+ return 0;
}
-EXPORT_SYMBOL_GPL(kvm_set_cr8);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cr8);
unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
{
- if (irqchip_in_kernel(vcpu->kvm))
+ if (lapic_in_kernel(vcpu))
return kvm_lapic_get_cr8(vcpu);
else
return vcpu->arch.cr8;
}
-EXPORT_SYMBOL_GPL(kvm_get_cr8);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_cr8);
-static inline u32 bit(int bitno)
+static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
{
- return 1 << (bitno & 31);
+ int i;
+
+ if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
+ for (i = 0; i < KVM_NR_DB_REGS; i++)
+ vcpu->arch.eff_db[i] = vcpu->arch.db[i];
+ }
}
-/*
- * List of msr numbers which we expose to userspace through KVM_GET_MSRS
- * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
- *
- * This list is modified at module load time to reflect the
- * capabilities of the host cpu.
- */
-static u32 msrs_to_save[] = {
- MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
- MSR_K6_STAR,
-#ifdef CONFIG_X86_64
- MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
-#endif
- MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
- MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
-};
+void kvm_update_dr7(struct kvm_vcpu *vcpu)
+{
+ unsigned long dr7;
-static unsigned num_msrs_to_save;
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
+ dr7 = vcpu->arch.guest_debug_dr7;
+ else
+ dr7 = vcpu->arch.dr7;
+ kvm_x86_call(set_dr7)(vcpu, dr7);
+ vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
+ if (dr7 & DR7_BP_EN_MASK)
+ vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_update_dr7);
-static u32 emulated_msrs[] = {
- MSR_IA32_MISC_ENABLE,
-};
+static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
+{
+ u64 fixed = DR6_FIXED_1;
+
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_RTM))
+ fixed |= DR6_RTM;
-static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
+ fixed |= DR6_BUS_LOCK;
+ return fixed;
+}
+
+int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
{
- if (efer & efer_reserved_bits) {
- printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
- efer);
- kvm_inject_gp(vcpu, 0);
- return;
+ size_t size = ARRAY_SIZE(vcpu->arch.db);
+
+ switch (dr) {
+ case 0 ... 3:
+ vcpu->arch.db[array_index_nospec(dr, size)] = val;
+ if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
+ vcpu->arch.eff_db[dr] = val;
+ break;
+ case 4:
+ case 6:
+ if (!kvm_dr6_valid(val))
+ return 1; /* #GP */
+ vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
+ break;
+ case 5:
+ default: /* 7 */
+ if (!kvm_dr7_valid(val))
+ return 1; /* #GP */
+ vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
+ kvm_update_dr7(vcpu);
+ break;
}
- if (is_paging(vcpu)
- && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
- printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
+ return 0;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_dr);
+
+unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr)
+{
+ size_t size = ARRAY_SIZE(vcpu->arch.db);
+
+ switch (dr) {
+ case 0 ... 3:
+ return vcpu->arch.db[array_index_nospec(dr, size)];
+ case 4:
+ case 6:
+ return vcpu->arch.dr6;
+ case 5:
+ default: /* 7 */
+ return vcpu->arch.dr7;
+ }
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_dr);
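
The switch statements above silently alias DR4/DR5 onto DR6/DR7; combined with kvm_require_dr() earlier in this file, the overall behaviour for a debug-register access is roughly as follows (illustrative summary, not part of the patch):

/*
 *   access DR4/DR5, CR4.DE = 0  -> treated exactly like DR6/DR7 (legacy aliasing)
 *   access DR4/DR5, CR4.DE = 1  -> kvm_require_dr() queues #UD, access rejected
 *   write an invalid DR6/DR7    -> kvm_set_dr() returns 1 and the caller injects #GP
 */
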
+
+int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu)
+{
+ u32 pmc = kvm_rcx_read(vcpu);
+ u64 data;
+
+ if (kvm_pmu_rdpmc(vcpu, pmc, &data)) {
kvm_inject_gp(vcpu, 0);
- return;
+ return 1;
}
- if (efer & EFER_FFXSR) {
- struct kvm_cpuid_entry2 *feat;
+ kvm_rax_write(vcpu, (u32)data);
+ kvm_rdx_write(vcpu, data >> 32);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_rdpmc);
- feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
- if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
- printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n");
- kvm_inject_gp(vcpu, 0);
- return;
- }
+/*
+ * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM
+ * does not yet virtualize. These include:
+ * 10 - MISC_PACKAGE_CTRLS
+ * 11 - ENERGY_FILTERING_CTL
+ * 12 - DOITM
+ * 18 - FB_CLEAR_CTRL
+ * 21 - XAPIC_DISABLE_STATUS
+ * 23 - OVERCLOCKING_STATUS
+ */
+
+#define KVM_SUPPORTED_ARCH_CAP \
+ (ARCH_CAP_RDCL_NO | ARCH_CAP_IBRS_ALL | ARCH_CAP_RSBA | \
+ ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \
+ ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \
+ ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \
+ ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO | \
+ ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO | ARCH_CAP_ITS_NO)
+
+static u64 kvm_get_arch_capabilities(void)
+{
+ u64 data = kvm_host.arch_capabilities & KVM_SUPPORTED_ARCH_CAP;
+
+ /*
+ * If nx_huge_pages is enabled, KVM's shadow paging will ensure that
+ * the nested hypervisor runs with NX huge pages. If it is not,
+ * L1 is anyway vulnerable to ITLB_MULTIHIT exploits from other
+ * L1 guests, so it need not worry about its own (L2) guests.
+ */
+ data |= ARCH_CAP_PSCHANGE_MC_NO;
+
+ /*
+ * If we're doing cache flushes (either "always" or "cond")
+ * we will do one whenever the guest does a vmlaunch/vmresume.
+ * If an outer hypervisor is doing the cache flush for us
+ * (ARCH_CAP_SKIP_VMENTRY_L1DFLUSH), we can safely pass that
+ * capability to the guest too, and if EPT is disabled we're not
+ * vulnerable. Overall, only VMENTER_L1D_FLUSH_NEVER will
+ * require a nested hypervisor to do a flush of its own.
+ */
+ if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER)
+ data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH;
+
+ if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
+ data |= ARCH_CAP_RDCL_NO;
+ if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
+ data |= ARCH_CAP_SSB_NO;
+ if (!boot_cpu_has_bug(X86_BUG_MDS))
+ data |= ARCH_CAP_MDS_NO;
+ if (!boot_cpu_has_bug(X86_BUG_RFDS))
+ data |= ARCH_CAP_RFDS_NO;
+ if (!boot_cpu_has_bug(X86_BUG_ITS))
+ data |= ARCH_CAP_ITS_NO;
+
+ if (!boot_cpu_has(X86_FEATURE_RTM)) {
+ /*
+ * If RTM=0 because the kernel has disabled TSX, the host might
+ * have TAA_NO or TSX_CTRL. Clear TAA_NO (the guest sees RTM=0
+ * and therefore knows that there cannot be TAA) but keep
+ * TSX_CTRL: some buggy userspaces leave it set on tsx=on hosts,
+ * and we want to allow migrating those guests to tsx=off hosts.
+ */
+ data &= ~ARCH_CAP_TAA_NO;
+ } else if (!boot_cpu_has_bug(X86_BUG_TAA)) {
+ data |= ARCH_CAP_TAA_NO;
+ } else {
+ /*
+ * Nothing to do here; we emulate TSX_CTRL if present on the
+ * host so the guest can choose between disabling TSX or
+ * using VERW to clear CPU buffers.
+ */
}
- if (efer & EFER_SVME) {
- struct kvm_cpuid_entry2 *feat;
+ if (!boot_cpu_has_bug(X86_BUG_GDS) || gds_ucode_mitigated())
+ data |= ARCH_CAP_GDS_NO;
- feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
- if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
- printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n");
- kvm_inject_gp(vcpu, 0);
- return;
- }
+ return data;
+}
+
+static int kvm_get_feature_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
+ bool host_initiated)
+{
+ WARN_ON_ONCE(!host_initiated);
+
+ switch (index) {
+ case MSR_IA32_ARCH_CAPABILITIES:
+ *data = kvm_get_arch_capabilities();
+ break;
+ case MSR_IA32_PERF_CAPABILITIES:
+ *data = kvm_caps.supported_perf_cap;
+ break;
+ case MSR_PLATFORM_INFO:
+ *data = MSR_PLATFORM_INFO_CPUID_FAULT;
+ break;
+ case MSR_IA32_UCODE_REV:
+ rdmsrq_safe(index, data);
+ break;
+ default:
+ return kvm_x86_call(get_feature_msr)(index, data);
}
+ return 0;
+}
- kvm_x86_ops->set_efer(vcpu, efer);
+static int do_get_feature_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
+{
+ return kvm_do_msr_access(vcpu, index, data, true, MSR_TYPE_R,
+ kvm_get_feature_msr);
+}
+
+static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+ if (efer & EFER_AUTOIBRS && !guest_cpu_cap_has(vcpu, X86_FEATURE_AUTOIBRS))
+ return false;
+
+ if (efer & EFER_FFXSR && !guest_cpu_cap_has(vcpu, X86_FEATURE_FXSR_OPT))
+ return false;
+
+ if (efer & EFER_SVME && !guest_cpu_cap_has(vcpu, X86_FEATURE_SVM))
+ return false;
+
+ if (efer & (EFER_LME | EFER_LMA) &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
+ return false;
+
+ if (efer & EFER_NX && !guest_cpu_cap_has(vcpu, X86_FEATURE_NX))
+ return false;
+
+ return true;
+}
+bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+ if (efer & efer_reserved_bits)
+ return false;
+
+ return __kvm_valid_efer(vcpu, efer);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_valid_efer);
+
+static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ u64 old_efer = vcpu->arch.efer;
+ u64 efer = msr_info->data;
+ int r;
+
+ if (efer & efer_reserved_bits)
+ return 1;
+
+ if (!msr_info->host_initiated) {
+ if (!__kvm_valid_efer(vcpu, efer))
+ return 1;
+
+ if (is_paging(vcpu) &&
+ (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
+ return 1;
+ }
efer &= ~EFER_LMA;
- efer |= vcpu->arch.shadow_efer & EFER_LMA;
+ efer |= vcpu->arch.efer & EFER_LMA;
- vcpu->arch.shadow_efer = efer;
+ r = kvm_x86_call(set_efer)(vcpu, efer);
+ if (r) {
+ WARN_ON(r > 0);
+ return r;
+ }
- vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
- kvm_mmu_reset_context(vcpu);
+ if ((efer ^ old_efer) & KVM_MMU_EFER_ROLE_BITS)
+ kvm_mmu_reset_context(vcpu);
+
+ if (!static_cpu_has(X86_FEATURE_XSAVES) &&
+ (efer & EFER_SVME))
+ kvm_hv_xsaves_xsavec_maybe_warn(vcpu);
+
+ return 0;
}
void kvm_enable_efer_bits(u64 mask)
{
efer_reserved_bits &= ~mask;
}
-EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_enable_efer_bits);
+
+bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type)
+{
+ struct kvm_x86_msr_filter *msr_filter;
+ struct msr_bitmap_range *ranges;
+ struct kvm *kvm = vcpu->kvm;
+ bool allowed;
+ int idx;
+ u32 i;
+
+ /* x2APIC MSRs do not support filtering. */
+ if (index >= 0x800 && index <= 0x8ff)
+ return true;
+
+ idx = srcu_read_lock(&kvm->srcu);
+
+ msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu);
+ if (!msr_filter) {
+ allowed = true;
+ goto out;
+ }
+
+ allowed = msr_filter->default_allow;
+ ranges = msr_filter->ranges;
+
+ for (i = 0; i < msr_filter->count; i++) {
+ u32 start = ranges[i].base;
+ u32 end = start + ranges[i].nmsrs;
+ u32 flags = ranges[i].flags;
+ unsigned long *bitmap = ranges[i].bitmap;
+
+ if ((index >= start) && (index < end) && (flags & type)) {
+ allowed = test_bit(index - start, bitmap);
+ break;
+ }
+ }
+
+out:
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ return allowed;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_msr_allowed);
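
To make the range walk above concrete, one hypothetical filter configuration and how an index is tested against it (field names match the loop above; the values are made up):

/*
 *   default_allow = false, one range:
 *     base   = MSR_ARCH_PERFMON_EVENTSEL0 (0x186)
 *     nmsrs  = 8                      -> covers 0x186..0x18d
 *     flags  = KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE
 *     bitmap = 0xff                   -> every MSR in the range permitted
 *   A read of 0x189 hits the range and test_bit(0x189 - 0x186, bitmap) is set,
 *   so it is allowed; a read of 0x190 misses every range and falls back to
 *   default_allow, i.e. it is denied.
 */
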
+
+/*
+ * Write @data into the MSR specified by @index. Select MSR specific fault
+ * checks are bypassed if @host_initiated is %true.
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+ */
+static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
+ bool host_initiated)
+{
+ struct msr_data msr;
+
+ switch (index) {
+ case MSR_FS_BASE:
+ case MSR_GS_BASE:
+ case MSR_KERNEL_GS_BASE:
+ case MSR_CSTAR:
+ case MSR_LSTAR:
+ if (is_noncanonical_msr_address(data, vcpu))
+ return 1;
+ break;
+ case MSR_IA32_SYSENTER_EIP:
+ case MSR_IA32_SYSENTER_ESP:
+ /*
+ * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if
+ * non-canonical address is written on Intel but not on
+ * AMD (which ignores the top 32-bits, because it does
+ * not implement 64-bit SYSENTER).
+ *
+ * 64-bit code should hence be able to write a non-canonical
+ * value on AMD. Making the address canonical ensures that
+ * vmentry does not fail on Intel after writing a non-canonical
+ * value, and that something deterministic happens if the guest
+ * invokes 64-bit SYSENTER.
+ */
+ data = __canonical_address(data, max_host_virt_addr_bits());
+ break;
+ case MSR_TSC_AUX:
+ if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
+ return 1;
+
+ if (!host_initiated &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID))
+ return 1;
+
+ /*
+ * Per Intel's SDM, bits 63:32 are reserved, but AMD's APM has
+ * incomplete and conflicting architectural behavior. Current
+ * AMD CPUs completely ignore bits 63:32, i.e. they aren't
+ * reserved and always read as zeros. Enforce Intel's reserved
+ * bits check if the guest CPU is Intel compatible, otherwise
+ * clear the bits. This ensures cross-vendor migration will
+ * provide consistent behavior for the guest.
+ */
+ if (guest_cpuid_is_intel_compatible(vcpu) && (data >> 32) != 0)
+ return 1;
+
+ data = (u32)data;
+ break;
+ case MSR_IA32_U_CET:
+ case MSR_IA32_S_CET:
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT))
+ return KVM_MSR_RET_UNSUPPORTED;
+ if (!kvm_is_valid_u_s_cet(vcpu, data))
+ return 1;
+ break;
+ case MSR_KVM_INTERNAL_GUEST_SSP:
+ if (!host_initiated)
+ return 1;
+ fallthrough;
+ /*
+ * Note that the MSR emulation here is flawed when a vCPU
+ * doesn't support the Intel 64 architecture. The expected
+ * architectural behavior in this case is that the upper 32
+ * bits do not exist and should always read '0'. However,
+ * because the actual hardware on which the virtual CPU is
+ * running does support Intel 64, XRSTORS/XSAVES in the
+ * guest could observe behavior that violates the
+ * architecture. Intercepting XRSTORS/XSAVES for this
+ * special case isn't deemed worthwhile.
+ */
+ case MSR_IA32_PL0_SSP ... MSR_IA32_INT_SSP_TAB:
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK))
+ return KVM_MSR_RET_UNSUPPORTED;
+ /*
+ * MSR_IA32_INT_SSP_TAB is not present on processors that do
+ * not support Intel 64 architecture.
+ */
+ if (index == MSR_IA32_INT_SSP_TAB && !guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
+ return KVM_MSR_RET_UNSUPPORTED;
+ if (is_noncanonical_msr_address(data, vcpu))
+ return 1;
+ /* All SSP MSRs except MSR_IA32_INT_SSP_TAB must be 4-byte aligned */
+ if (index != MSR_IA32_INT_SSP_TAB && !IS_ALIGNED(data, 4))
+ return 1;
+ break;
+ }
+
+ msr.data = data;
+ msr.index = index;
+ msr.host_initiated = host_initiated;
+
+ return kvm_x86_call(set_msr)(vcpu, &msr);
+}
+static int _kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
+ bool host_initiated)
+{
+ return __kvm_set_msr(vcpu, index, *data, host_initiated);
+}
+
+static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
+ u32 index, u64 data, bool host_initiated)
+{
+ return kvm_do_msr_access(vcpu, index, &data, host_initiated, MSR_TYPE_W,
+ _kvm_set_msr);
+}
/*
- * Writes msr value into into the appropriate "register".
+ * Read the MSR specified by @index into @data. Select MSR specific fault
+ * checks are bypassed if @host_initiated is %true.
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
-int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+static int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
+ bool host_initiated)
+{
+ struct msr_data msr;
+ int ret;
+
+ switch (index) {
+ case MSR_TSC_AUX:
+ if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
+ return 1;
+
+ if (!host_initiated &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID))
+ return 1;
+ break;
+ case MSR_IA32_U_CET:
+ case MSR_IA32_S_CET:
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT))
+ return KVM_MSR_RET_UNSUPPORTED;
+ break;
+ case MSR_KVM_INTERNAL_GUEST_SSP:
+ if (!host_initiated)
+ return 1;
+ fallthrough;
+ case MSR_IA32_PL0_SSP ... MSR_IA32_INT_SSP_TAB:
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK))
+ return KVM_MSR_RET_UNSUPPORTED;
+ break;
+ }
+
+ msr.index = index;
+ msr.host_initiated = host_initiated;
+
+ ret = kvm_x86_call(get_msr)(vcpu, &msr);
+ if (!ret)
+ *data = msr.data;
+ return ret;
+}
+
+int kvm_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data)
+{
+ return __kvm_set_msr(vcpu, index, data, true);
+}
+
+int kvm_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data)
+{
+ return __kvm_get_msr(vcpu, index, data, true);
+}
+
+static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu,
+ u32 index, u64 *data, bool host_initiated)
+{
+ return kvm_do_msr_access(vcpu, index, data, host_initiated, MSR_TYPE_R,
+ __kvm_get_msr);
+}
+
+int __kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data)
+{
+ return kvm_get_msr_ignored_check(vcpu, index, data, false);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_emulate_msr_read);
+
+int __kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data)
+{
+ return kvm_set_msr_ignored_check(vcpu, index, data, false);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_emulate_msr_write);
+
+int kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data)
+{
+ if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
+ return KVM_MSR_RET_FILTERED;
+
+ return __kvm_emulate_msr_read(vcpu, index, data);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_msr_read);
+
+int kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data)
+{
+ if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE))
+ return KVM_MSR_RET_FILTERED;
+
+ return __kvm_emulate_msr_write(vcpu, index, data);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_msr_write);
+
+
+static void complete_userspace_rdmsr(struct kvm_vcpu *vcpu)
+{
+ if (!vcpu->run->msr.error) {
+ kvm_rax_write(vcpu, (u32)vcpu->run->msr.data);
+ kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32);
+ }
+}
+
+static int complete_emulated_msr_access(struct kvm_vcpu *vcpu)
+{
+ return complete_emulated_insn_gp(vcpu, vcpu->run->msr.error);
+}
+
+static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
+{
+ complete_userspace_rdmsr(vcpu);
+ return complete_emulated_msr_access(vcpu);
+}
+
+static int complete_fast_msr_access(struct kvm_vcpu *vcpu)
+{
+ return kvm_x86_call(complete_emulated_msr)(vcpu, vcpu->run->msr.error);
+}
+
+static int complete_fast_rdmsr(struct kvm_vcpu *vcpu)
+{
+ complete_userspace_rdmsr(vcpu);
+ return complete_fast_msr_access(vcpu);
+}
+
+static int complete_fast_rdmsr_imm(struct kvm_vcpu *vcpu)
+{
+ if (!vcpu->run->msr.error)
+ kvm_register_write(vcpu, vcpu->arch.cui_rdmsr_imm_reg,
+ vcpu->run->msr.data);
+
+ return complete_fast_msr_access(vcpu);
+}
+
+static u64 kvm_msr_reason(int r)
+{
+ switch (r) {
+ case KVM_MSR_RET_UNSUPPORTED:
+ return KVM_MSR_EXIT_REASON_UNKNOWN;
+ case KVM_MSR_RET_FILTERED:
+ return KVM_MSR_EXIT_REASON_FILTER;
+ default:
+ return KVM_MSR_EXIT_REASON_INVAL;
+ }
+}
+
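+/*
+ * Returns 1 if the failed MSR access should be completed in userspace, in
+ * which case vcpu->run describes the exit and @completion runs when the
+ * vCPU re-enters the kernel, or 0 if userspace did not ask to be notified
+ * about this class of MSR failure.
+ */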
+static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index,
+ u32 exit_reason, u64 data,
+ int (*completion)(struct kvm_vcpu *vcpu),
+ int r)
+{
+ u64 msr_reason = kvm_msr_reason(r);
+
+ /* Check if the user wanted to know about this MSR fault */
+ if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason))
+ return 0;
+
+ vcpu->run->exit_reason = exit_reason;
+ vcpu->run->msr.error = 0;
+ memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad));
+ vcpu->run->msr.reason = msr_reason;
+ vcpu->run->msr.index = index;
+ vcpu->run->msr.data = data;
+ vcpu->arch.complete_userspace_io = completion;
+
+ return 1;
+}
+
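+/*
+ * Common RDMSR emulation for both encodings: @reg < 0 selects the legacy
+ * form, which returns the value split across EDX:EAX, while @reg >= 0
+ * selects the immediate form, which writes a single full-width register.
+ */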
+static int __kvm_emulate_rdmsr(struct kvm_vcpu *vcpu, u32 msr, int reg,
+ int (*complete_rdmsr)(struct kvm_vcpu *))
+{
+ u64 data;
+ int r;
+
+ r = kvm_emulate_msr_read(vcpu, msr, &data);
+
+ if (!r) {
+ trace_kvm_msr_read(msr, data);
+
+ if (reg < 0) {
+ kvm_rax_write(vcpu, data & -1u);
+ kvm_rdx_write(vcpu, (data >> 32) & -1u);
+ } else {
+ kvm_register_write(vcpu, reg, data);
+ }
+ } else {
+ /* MSR read failed? See if we should ask user space */
+ if (kvm_msr_user_space(vcpu, msr, KVM_EXIT_X86_RDMSR, 0,
+ complete_rdmsr, r))
+ return 0;
+ trace_kvm_msr_read_ex(msr);
+ }
+
+ return kvm_x86_call(complete_emulated_msr)(vcpu, r);
+}
+
+int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
+{
+ return __kvm_emulate_rdmsr(vcpu, kvm_rcx_read(vcpu), -1,
+ complete_fast_rdmsr);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_rdmsr);
+
+int kvm_emulate_rdmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg)
+{
+ vcpu->arch.cui_rdmsr_imm_reg = reg;
+
+ return __kvm_emulate_rdmsr(vcpu, msr, reg, complete_fast_rdmsr_imm);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_rdmsr_imm);
+
+static int __kvm_emulate_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
- return kvm_x86_ops->set_msr(vcpu, msr_index, data);
+ int r;
+
+ r = kvm_emulate_msr_write(vcpu, msr, data);
+ if (!r) {
+ trace_kvm_msr_write(msr, data);
+ } else {
+ /* MSR write failed? See if we should ask user space */
+ if (kvm_msr_user_space(vcpu, msr, KVM_EXIT_X86_WRMSR, data,
+ complete_fast_msr_access, r))
+ return 0;
+ /* Signal all other negative errors to userspace */
+ if (r < 0)
+ return r;
+ trace_kvm_msr_write_ex(msr, data);
+ }
+
+ return kvm_x86_call(complete_emulated_msr)(vcpu, r);
+}
+
+int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
+{
+ return __kvm_emulate_wrmsr(vcpu, kvm_rcx_read(vcpu),
+ kvm_read_edx_eax(vcpu));
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_wrmsr);
+
+int kvm_emulate_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg)
+{
+ return __kvm_emulate_wrmsr(vcpu, msr, kvm_register_read(vcpu, reg));
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_wrmsr_imm);
+
+int kvm_emulate_as_nop(struct kvm_vcpu *vcpu)
+{
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+int kvm_emulate_invd(struct kvm_vcpu *vcpu)
+{
+ /* Treat an INVD instruction as a NOP and just skip it. */
+ return kvm_emulate_as_nop(vcpu);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_invd);
+
+fastpath_t handle_fastpath_invd(struct kvm_vcpu *vcpu)
+{
+ if (!kvm_emulate_invd(vcpu))
+ return EXIT_FASTPATH_EXIT_USERSPACE;
+
+ return EXIT_FASTPATH_REENTER_GUEST;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(handle_fastpath_invd);
+
+int kvm_handle_invalid_op(struct kvm_vcpu *vcpu)
+{
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_handle_invalid_op);
+
+static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn)
+{
+ bool enabled;
+
+ if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS))
+ goto emulate_as_nop;
+
+ if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT))
+ enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_MWAIT);
+ else
+ enabled = vcpu->arch.ia32_misc_enable_msr & MSR_IA32_MISC_ENABLE_MWAIT;
+
+ if (!enabled)
+ return kvm_handle_invalid_op(vcpu);
+
+emulate_as_nop:
+ pr_warn_once("%s instruction emulated as NOP!\n", insn);
+ return kvm_emulate_as_nop(vcpu);
+}
+
+int kvm_emulate_mwait(struct kvm_vcpu *vcpu)
+{
+ return kvm_emulate_monitor_mwait(vcpu, "MWAIT");
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_mwait);
+
+int kvm_emulate_monitor(struct kvm_vcpu *vcpu)
+{
+ return kvm_emulate_monitor_mwait(vcpu, "MONITOR");
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_monitor);
+
+static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
+{
+ xfer_to_guest_mode_prepare();
+
+ return READ_ONCE(vcpu->mode) == EXITING_GUEST_MODE ||
+ kvm_request_pending(vcpu) || xfer_to_guest_mode_work_pending();
+}
+
+static fastpath_t __handle_fastpath_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ switch (msr) {
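+ /* APIC_BASE_MSR + (APIC_ICR >> 4) is the x2APIC ICR, i.e. MSR 0x830. */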
+ case APIC_BASE_MSR + (APIC_ICR >> 4):
+ if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic) ||
+ kvm_x2apic_icr_write_fast(vcpu->arch.apic, data))
+ return EXIT_FASTPATH_NONE;
+ break;
+ case MSR_IA32_TSC_DEADLINE:
+ kvm_set_lapic_tscdeadline_msr(vcpu, data);
+ break;
+ default:
+ return EXIT_FASTPATH_NONE;
+ }
+
+ trace_kvm_msr_write(msr, data);
+
+ if (!kvm_skip_emulated_instruction(vcpu))
+ return EXIT_FASTPATH_EXIT_USERSPACE;
+
+ return EXIT_FASTPATH_REENTER_GUEST;
+}
+
+fastpath_t handle_fastpath_wrmsr(struct kvm_vcpu *vcpu)
+{
+ return __handle_fastpath_wrmsr(vcpu, kvm_rcx_read(vcpu),
+ kvm_read_edx_eax(vcpu));
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(handle_fastpath_wrmsr);
+
+fastpath_t handle_fastpath_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg)
+{
+ return __handle_fastpath_wrmsr(vcpu, msr, kvm_register_read(vcpu, reg));
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(handle_fastpath_wrmsr_imm);
/*
* Adapt set_msr() to msr_io()'s calling convention
*/
+static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
+{
+ return kvm_get_msr_ignored_check(vcpu, index, data, true);
+}
+
static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
- return kvm_set_msr(vcpu, index, *data);
+ u64 val;
+
+ /*
+ * Disallow writes to immutable feature MSRs after KVM_RUN. KVM does
+ * not support modifying the guest vCPU model on the fly, e.g. changing
+ * the nVMX capabilities while L2 is running is nonsensical. Allow
+ * writes of the same value, e.g. to allow userspace to blindly stuff
+ * all MSRs when emulating RESET.
+ */
+ if (kvm_vcpu_has_run(vcpu) && kvm_is_immutable_feature_msr(index) &&
+ (do_get_msr(vcpu, index, &val) || *data != val))
+ return -EINVAL;
+
+ return kvm_set_msr_ignored_check(vcpu, index, *data, true);
+}
+
+#ifdef CONFIG_X86_64
+struct pvclock_clock {
+ int vclock_mode;
+ u64 cycle_last;
+ u64 mask;
+ u32 mult;
+ u32 shift;
+ u64 base_cycles;
+ u64 offset;
+};
+
+struct pvclock_gtod_data {
+ seqcount_t seq;
+
+ struct pvclock_clock clock; /* extract of a clocksource struct */
+ struct pvclock_clock raw_clock; /* extract of a clocksource struct */
+
+ ktime_t offs_boot;
+ u64 wall_time_sec;
+};
+
+static struct pvclock_gtod_data pvclock_gtod_data;
+
+static void update_pvclock_gtod(struct timekeeper *tk)
+{
+ struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
+
+ write_seqcount_begin(&vdata->seq);
+
+ /* copy pvclock gtod data */
+ vdata->clock.vclock_mode = tk->tkr_mono.clock->vdso_clock_mode;
+ vdata->clock.cycle_last = tk->tkr_mono.cycle_last;
+ vdata->clock.mask = tk->tkr_mono.mask;
+ vdata->clock.mult = tk->tkr_mono.mult;
+ vdata->clock.shift = tk->tkr_mono.shift;
+ vdata->clock.base_cycles = tk->tkr_mono.xtime_nsec;
+ vdata->clock.offset = tk->tkr_mono.base;
+
+ vdata->raw_clock.vclock_mode = tk->tkr_raw.clock->vdso_clock_mode;
+ vdata->raw_clock.cycle_last = tk->tkr_raw.cycle_last;
+ vdata->raw_clock.mask = tk->tkr_raw.mask;
+ vdata->raw_clock.mult = tk->tkr_raw.mult;
+ vdata->raw_clock.shift = tk->tkr_raw.shift;
+ vdata->raw_clock.base_cycles = tk->tkr_raw.xtime_nsec;
+ vdata->raw_clock.offset = tk->tkr_raw.base;
+
+ vdata->wall_time_sec = tk->xtime_sec;
+
+ vdata->offs_boot = tk->offs_boot;
+
+ write_seqcount_end(&vdata->seq);
}
-static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
+static s64 get_kvmclock_base_ns(void)
{
- static int version;
+ /* Count up from boot time, but with the frequency of the raw clock. */
+ return ktime_to_ns(ktime_add(ktime_get_raw(), pvclock_gtod_data.offs_boot));
+}
+#else
+static s64 get_kvmclock_base_ns(void)
+{
+ /* Master clock not used, so we can just use CLOCK_BOOTTIME. */
+ return ktime_get_boottime_ns();
+}
+#endif
+
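+/*
+ * Per the pvclock wall clock ABI, the version field is odd while an update
+ * is in progress and even once the contents are consistent; guests retry
+ * the read if they observe an odd or changed version.
+ */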
+static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs)
+{
+ int version;
+ int r;
struct pvclock_wall_clock wc;
- struct timespec now, sys, boot;
+ u32 wc_sec_hi;
+ u64 wall_nsec;
if (!wall_clock)
return;
- version++;
+ r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
+ if (r)
+ return;
- kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
+ if (version & 1)
+ ++version; /* first time write, random junk */
- /*
- * The guest calculates current wall clock time by adding
- * system time (updated by kvm_write_guest_time below) to the
- * wall clock specified here. guest system time equals host
- * system time for us, thus we must fill in host boot time here.
- */
- now = current_kernel_time();
- ktime_get_ts(&sys);
- boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));
+ ++version;
- wc.sec = boot.tv_sec;
- wc.nsec = boot.tv_nsec;
+ if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version)))
+ return;
+
+ wall_nsec = kvm_get_wall_clock_epoch(kvm);
+
+ wc.nsec = do_div(wall_nsec, NSEC_PER_SEC);
+ wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */
wc.version = version;
kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
+ if (sec_hi_ofs) {
+ wc_sec_hi = wall_nsec >> 32;
+ kvm_write_guest(kvm, wall_clock + sec_hi_ofs,
+ &wc_sec_hi, sizeof(wc_sec_hi));
+ }
+
version++;
kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
}
-static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
+static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,
+ bool old_msr, bool host_initiated)
{
- uint32_t quotient, remainder;
+ struct kvm_arch *ka = &vcpu->kvm->arch;
- /* Don't try to replace with do_div(), this one calculates
- * "(dividend << 32) / divisor" */
- __asm__ ( "divl %4"
- : "=a" (quotient), "=d" (remainder)
- : "0" (0), "1" (dividend), "r" (divisor) );
- return quotient;
+ if (vcpu->vcpu_id == 0 && !host_initiated) {
+ if (ka->boot_vcpu_runs_old_kvmclock != old_msr)
+ kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+
+ ka->boot_vcpu_runs_old_kvmclock = old_msr;
+ }
+
+ vcpu->arch.time = system_time;
+ kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
+
+ /* we verify if the enable bit is set... */
+ if (system_time & 1)
+ kvm_gpc_activate(&vcpu->arch.pv_time, system_time & ~1ULL,
+ sizeof(struct pvclock_vcpu_time_info));
+ else
+ kvm_gpc_deactivate(&vcpu->arch.pv_time);
+
+ return;
+}
+
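+/* Returns (dividend << 32) / divisor, i.e. a 0.32 fixed-point quotient. */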
+static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
+{
+ do_shl32_div32(dividend, divisor);
+ return dividend;
}
-static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
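+/*
+ * Compute a (shift, multiplier) pair such that, following the pvclock
+ * scaling rules, a delta of base_hz ticks converts to scaled_hz units as
+ * ((delta << shift) * multiplier) >> 32, with a negative shift meaning a
+ * right shift. E.g. converting a 2 GHz TSC (base_hz) to nanoseconds
+ * (scaled_hz = NSEC_PER_SEC) yields shift = 0 and multiplier = 1 << 31,
+ * i.e. half a nanosecond per cycle.
+ */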
+static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
+ s8 *pshift, u32 *pmultiplier)
{
- uint64_t nsecs = 1000000000LL;
+ uint64_t scaled64;
int32_t shift = 0;
uint64_t tps64;
uint32_t tps32;
- tps64 = tsc_khz * 1000LL;
- while (tps64 > nsecs*2) {
+ tps64 = base_hz;
+ scaled64 = scaled_hz;
+ while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
tps64 >>= 1;
shift--;
}
tps32 = (uint32_t)tps64;
- while (tps32 <= (uint32_t)nsecs) {
- tps32 <<= 1;
+ while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
+ if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
+ scaled64 >>= 1;
+ else
+ tps32 <<= 1;
shift++;
}
- hv_clock->tsc_shift = shift;
- hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
-
- pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
- __func__, tsc_khz, hv_clock->tsc_shift,
- hv_clock->tsc_to_system_mul);
+ *pshift = shift;
+ *pmultiplier = div_frac(scaled64, tps32);
}
+#ifdef CONFIG_X86_64
+static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
+#endif
+
static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
+static unsigned long max_tsc_khz;
-static void kvm_write_guest_time(struct kvm_vcpu *v)
+static u32 adjust_tsc_khz(u32 khz, s32 ppm)
{
- struct timespec ts;
- unsigned long flags;
- struct kvm_vcpu_arch *vcpu = &v->arch;
- void *shared_kaddr;
- unsigned long this_tsc_khz;
+ u64 v = (u64)khz * (1000000 + ppm);
+ do_div(v, 1000000);
+ return v;
+}
+
+static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier);
+
+static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
+{
+ u64 ratio;
+
+ /* Guest TSC same frequency as host TSC? */
+ if (!scale) {
+ kvm_vcpu_write_tsc_multiplier(vcpu, kvm_caps.default_tsc_scaling_ratio);
+ return 0;
+ }
+
+ /* TSC scaling supported? */
+ if (!kvm_caps.has_tsc_control) {
+ if (user_tsc_khz > tsc_khz) {
+ vcpu->arch.tsc_catchup = 1;
+ vcpu->arch.tsc_always_catchup = 1;
+ return 0;
+ } else {
+ pr_warn_ratelimited("user requested TSC rate below hardware speed\n");
+ return -1;
+ }
+ }
+
+ /* TSC scaling required - calculate ratio */
+ ratio = mul_u64_u32_div(1ULL << kvm_caps.tsc_scaling_ratio_frac_bits,
+ user_tsc_khz, tsc_khz);
+
+ if (ratio == 0 || ratio >= kvm_caps.max_tsc_scaling_ratio) {
+ pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
+ user_tsc_khz);
+ return -1;
+ }
+
+ kvm_vcpu_write_tsc_multiplier(vcpu, ratio);
+ return 0;
+}
+
+static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
+{
+ u32 thresh_lo, thresh_hi;
+ int use_scaling = 0;
+
+ /* tsc_khz can be zero if TSC calibration fails */
+ if (user_tsc_khz == 0) {
+ /* set tsc_scaling_ratio to a safe value */
+ kvm_vcpu_write_tsc_multiplier(vcpu, kvm_caps.default_tsc_scaling_ratio);
+ return -1;
+ }
+
+ /* Compute a scale to convert nanoseconds in TSC cycles */
+ kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
+ &vcpu->arch.virtual_tsc_shift,
+ &vcpu->arch.virtual_tsc_mult);
+ vcpu->arch.virtual_tsc_khz = user_tsc_khz;
+
+ /*
+ * Compute the variation in TSC rate which is acceptable
+ * within the range of tolerance and decide whether the
+ * rate being applied is within those bounds of the hardware
+ * rate. If so, no scaling or compensation need be done.
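+ *
+ * E.g. with a 250 ppm tolerance, a 2 GHz host accepts requested rates
+ * between 1999500 kHz and 2000500 kHz without resorting to scaling.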
+ */
+ thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
+ thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
+ if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
+ pr_debug("requested TSC rate %u falls outside tolerance [%u,%u]\n",
+ user_tsc_khz, thresh_lo, thresh_hi);
+ use_scaling = 1;
+ }
+ return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
+}
+
+static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
+{
+ u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
+ vcpu->arch.virtual_tsc_mult,
+ vcpu->arch.virtual_tsc_shift);
+ tsc += vcpu->arch.this_tsc_write;
+ return tsc;
+}
+
+#ifdef CONFIG_X86_64
+static inline bool gtod_is_based_on_tsc(int mode)
+{
+ return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK;
+}
+#endif
+
+static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu, bool new_generation)
+{
+#ifdef CONFIG_X86_64
+ struct kvm_arch *ka = &vcpu->kvm->arch;
+ struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+
+ /*
+ * To use the masterclock, the host clocksource must be based on TSC
+ * and all vCPUs must have matching TSCs. Note, the count for matching
+ * vCPUs doesn't include the reference vCPU, hence "+1".
+ */
+ bool use_master_clock = (ka->nr_vcpus_matched_tsc + 1 ==
+ atomic_read(&vcpu->kvm->online_vcpus)) &&
+ gtod_is_based_on_tsc(gtod->clock.vclock_mode);
+
+ /*
+ * Request a masterclock update if the masterclock needs to be toggled
+ * on/off, or when starting a new generation and the masterclock is
+ * enabled (compute_guest_tsc() requires the masterclock snapshot to be
+ * taken _after_ the new generation is created).
+ */
+ if ((ka->use_master_clock && new_generation) ||
+ (ka->use_master_clock != use_master_clock))
+ kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+
+ trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
+ atomic_read(&vcpu->kvm->online_vcpus),
+ ka->use_master_clock, gtod->clock.vclock_mode);
+#endif
+}
+
+/*
+ * Multiply tsc by a fixed point number represented by ratio.
+ *
+ * The most significant 64-N bits (mult) of ratio represent the
+ * integral part of the fixed point number; the remaining N bits
+ * (frac) represent the fractional part, ie. ratio represents a fixed
+ * point number (mult + frac * 2^(-N)).
+ *
+ * N equals kvm_caps.tsc_scaling_ratio_frac_bits.
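+ *
+ * E.g. with N = 48, a guest running at 1.5 times the host TSC rate uses
+ * ratio = 3ULL << 47, and __scale_tsc(ratio, 1000) returns 1500.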
+ */
+static inline u64 __scale_tsc(u64 ratio, u64 tsc)
+{
+ return mul_u64_u64_shr(tsc, ratio, kvm_caps.tsc_scaling_ratio_frac_bits);
+}
+
+u64 kvm_scale_tsc(u64 tsc, u64 ratio)
+{
+ u64 _tsc = tsc;
+
+ if (ratio != kvm_caps.default_tsc_scaling_ratio)
+ _tsc = __scale_tsc(ratio, tsc);
+
+ return _tsc;
+}
+
+static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
+{
+ u64 tsc;
+
+ tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio);
+
+ return target_tsc - tsc;
+}
+
+u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
+{
+ return vcpu->arch.l1_tsc_offset +
+ kvm_scale_tsc(host_tsc, vcpu->arch.l1_tsc_scaling_ratio);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_read_l1_tsc);
+
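+/*
+ * With nested TSC scaling, the hardware-visible L2 TSC is
+ * L1_TSC * l2_multiplier + l2_offset (in fixed point), and L1_TSC is itself
+ * host_TSC * l1_multiplier + l1_offset. Merging the two levels therefore
+ * requires scaling l1_offset by the L2 multiplier before adding l2_offset.
+ */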
+u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier)
+{
+ u64 nested_offset;
- if ((!vcpu->time_page))
+ if (l2_multiplier == kvm_caps.default_tsc_scaling_ratio)
+ nested_offset = l1_offset;
+ else
+ nested_offset = mul_s64_u64_shr((s64) l1_offset, l2_multiplier,
+ kvm_caps.tsc_scaling_ratio_frac_bits);
+
+ nested_offset += l2_offset;
+ return nested_offset;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_calc_nested_tsc_offset);
+
+u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier)
+{
+ if (l2_multiplier != kvm_caps.default_tsc_scaling_ratio)
+ return mul_u64_u64_shr(l1_multiplier, l2_multiplier,
+ kvm_caps.tsc_scaling_ratio_frac_bits);
+
+ return l1_multiplier;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_calc_nested_tsc_multiplier);
+
+static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset)
+{
+ if (vcpu->arch.guest_tsc_protected)
+ return;
+
+ trace_kvm_write_tsc_offset(vcpu->vcpu_id,
+ vcpu->arch.l1_tsc_offset,
+ l1_offset);
+
+ vcpu->arch.l1_tsc_offset = l1_offset;
+
+ /*
+ * If we are here because L1 chose not to trap WRMSR to TSC then
+ * according to the spec this should set L1's TSC (as opposed to
+ * setting L1's offset for L2).
+ */
+ if (is_guest_mode(vcpu))
+ vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
+ l1_offset,
+ kvm_x86_call(get_l2_tsc_offset)(vcpu),
+ kvm_x86_call(get_l2_tsc_multiplier)(vcpu));
+ else
+ vcpu->arch.tsc_offset = l1_offset;
+
+ kvm_x86_call(write_tsc_offset)(vcpu);
+}
+
+static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier)
+{
+ vcpu->arch.l1_tsc_scaling_ratio = l1_multiplier;
+
+ /* Userspace is changing the multiplier while L2 is active */
+ if (is_guest_mode(vcpu))
+ vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
+ l1_multiplier,
+ kvm_x86_call(get_l2_tsc_multiplier)(vcpu));
+ else
+ vcpu->arch.tsc_scaling_ratio = l1_multiplier;
+
+ if (kvm_caps.has_tsc_control)
+ kvm_x86_call(write_tsc_multiplier)(vcpu);
+}
+
+static inline bool kvm_check_tsc_unstable(void)
+{
+#ifdef CONFIG_X86_64
+ /*
+ * TSC is marked unstable when we're running on Hyper-V, but the
+ * 'TSC page' clocksource is still good.
+ */
+ if (pvclock_gtod_data.clock.vclock_mode == VDSO_CLOCKMODE_HVCLOCK)
+ return false;
+#endif
+ return check_tsc_unstable();
+}
+
+/*
+ * Infers attempts to synchronize the guest's tsc from host writes. Sets the
+ * offset for the vcpu and tracks the TSC matching generation that the vcpu
+ * participates in.
+ */
+static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc,
+ u64 ns, bool matched, bool user_set_tsc)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ lockdep_assert_held(&kvm->arch.tsc_write_lock);
+
+ if (vcpu->arch.guest_tsc_protected)
return;
- this_tsc_khz = get_cpu_var(cpu_tsc_khz);
- if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) {
- kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
- vcpu->hv_clock_tsc_khz = this_tsc_khz;
+ if (user_set_tsc)
+ vcpu->kvm->arch.user_set_tsc = true;
+
+ /*
+ * We also track the most recent recorded KHZ, write and time to
+ * allow the matching interval to be extended at each write.
+ */
+ kvm->arch.last_tsc_nsec = ns;
+ kvm->arch.last_tsc_write = tsc;
+ kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
+ kvm->arch.last_tsc_offset = offset;
+
+ vcpu->arch.last_guest_tsc = tsc;
+
+ kvm_vcpu_write_tsc_offset(vcpu, offset);
+
+ if (!matched) {
+ /*
+ * We split periods of matched TSC writes into generations.
+ * For each generation, we track the original measured
+ * nanosecond time, offset, and write, so if TSCs are in
+ * sync, we can match exact offset, and if not, we can match
+ * exact software computation in compute_guest_tsc()
+ *
+ * These values are tracked in kvm->arch.cur_xxx variables.
+ */
+ kvm->arch.cur_tsc_generation++;
+ kvm->arch.cur_tsc_nsec = ns;
+ kvm->arch.cur_tsc_write = tsc;
+ kvm->arch.cur_tsc_offset = offset;
+ kvm->arch.nr_vcpus_matched_tsc = 0;
+ } else if (vcpu->arch.this_tsc_generation != kvm->arch.cur_tsc_generation) {
+ kvm->arch.nr_vcpus_matched_tsc++;
+ }
+
+ /* Keep track of which generation this VCPU has synchronized to */
+ vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
+ vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
+ vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
+
+ kvm_track_tsc_matching(vcpu, !matched);
+}
+
+static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value)
+{
+ u64 data = user_value ? *user_value : 0;
+ struct kvm *kvm = vcpu->kvm;
+ u64 offset, ns, elapsed;
+ unsigned long flags;
+ bool matched = false;
+ bool synchronizing = false;
+
+ raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
+ offset = kvm_compute_l1_tsc_offset(vcpu, data);
+ ns = get_kvmclock_base_ns();
+ elapsed = ns - kvm->arch.last_tsc_nsec;
+
+ if (vcpu->arch.virtual_tsc_khz) {
+ if (data == 0) {
+ /*
+ * Force synchronization when creating a vCPU, or when
+ * userspace explicitly writes a zero value.
+ */
+ synchronizing = true;
+ } else if (kvm->arch.user_set_tsc) {
+ u64 tsc_exp = kvm->arch.last_tsc_write +
+ nsec_to_cycles(vcpu, elapsed);
+ u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
+ /*
+ * Here lies UAPI baggage: when a user-initiated TSC write has
+ * a small delta (1 second) of virtual cycle time against the
+ * previously set vCPU, we assume that they were intended to be
+ * in sync and the delta was only due to the racy nature of the
+ * legacy API.
+ *
+ * This trick falls down when restoring a guest which genuinely
+ * has been running for less time than the 1 second of imprecision
+ * which we allow for in the legacy API. In this case, the first
+ * value written by userspace (on any vCPU) should not be subject
+ * to this 'correction' to make it sync up with values that only
+ * come from the kernel's default vCPU creation. Make the 1-second
+ * slop hack only trigger if the user_set_tsc flag is already set.
+ */
+ synchronizing = data < tsc_exp + tsc_hz &&
+ data + tsc_hz > tsc_exp;
+ }
+ }
+
+ /*
+ * For a reliable TSC, we can match TSC offsets, and for an unstable
+ * TSC, we add elapsed time in this computation. We could let the
+ * compensation code attempt to catch up if we fall behind, but
+ * it's better to try to match offsets from the beginning.
+ */
+ if (synchronizing &&
+ vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
+ if (!kvm_check_tsc_unstable()) {
+ offset = kvm->arch.cur_tsc_offset;
+ } else {
+ u64 delta = nsec_to_cycles(vcpu, elapsed);
+ data += delta;
+ offset = kvm_compute_l1_tsc_offset(vcpu, data);
+ }
+ matched = true;
+ }
+
+ __kvm_synchronize_tsc(vcpu, offset, data, ns, matched, !!user_value);
+ raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
+}
+
+static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
+ s64 adjustment)
+{
+ u64 tsc_offset = vcpu->arch.l1_tsc_offset;
+ kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment);
+}
+
+static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
+{
+ if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio)
+ WARN_ON(adjustment < 0);
+ adjustment = kvm_scale_tsc((u64) adjustment,
+ vcpu->arch.l1_tsc_scaling_ratio);
+ adjust_tsc_offset_guest(vcpu, adjustment);
+}
+
+#ifdef CONFIG_X86_64
+
+static u64 read_tsc(void)
+{
+ u64 ret = (u64)rdtsc_ordered();
+ u64 last = pvclock_gtod_data.clock.cycle_last;
+
+ if (likely(ret >= last))
+ return ret;
+
+ /*
+ * GCC likes to generate cmov here, but this branch is extremely
+ * predictable (it's just a function of time and the likely is
+ * very likely) and there's a data dependence, so force GCC
+ * to generate a branch instead. I don't barrier() because
+ * we don't actually need a barrier, and if this function
+ * ever gets inlined it will generate worse code.
+ */
+ asm volatile ("");
+ return last;
+}
+
+static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp,
+ int *mode)
+{
+ u64 tsc_pg_val;
+ long v;
+
+ switch (clock->vclock_mode) {
+ case VDSO_CLOCKMODE_HVCLOCK:
+ if (hv_read_tsc_page_tsc(hv_get_tsc_page(),
+ tsc_timestamp, &tsc_pg_val)) {
+ /* TSC page valid */
+ *mode = VDSO_CLOCKMODE_HVCLOCK;
+ v = (tsc_pg_val - clock->cycle_last) &
+ clock->mask;
+ } else {
+ /* TSC page invalid */
+ *mode = VDSO_CLOCKMODE_NONE;
+ }
+ break;
+ case VDSO_CLOCKMODE_TSC:
+ *mode = VDSO_CLOCKMODE_TSC;
+ *tsc_timestamp = read_tsc();
+ v = (*tsc_timestamp - clock->cycle_last) &
+ clock->mask;
+ break;
+ default:
+ *mode = VDSO_CLOCKMODE_NONE;
+ }
+
+ if (*mode == VDSO_CLOCKMODE_NONE)
+ *tsc_timestamp = v = 0;
+
+ return v * clock->mult;
+}
+
+/*
+ * As with get_kvmclock_base_ns(), this counts from boot time, at the
+ * frequency of CLOCK_MONOTONIC_RAW (hence adding gtod->offs_boot).
+ */
+static int do_kvmclock_base(s64 *t, u64 *tsc_timestamp)
+{
+ struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+ unsigned long seq;
+ int mode;
+ u64 ns;
+
+ do {
+ seq = read_seqcount_begin(&gtod->seq);
+ ns = gtod->raw_clock.base_cycles;
+ ns += vgettsc(&gtod->raw_clock, tsc_timestamp, &mode);
+ ns >>= gtod->raw_clock.shift;
+ ns += ktime_to_ns(ktime_add(gtod->raw_clock.offset, gtod->offs_boot));
+ } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
+ *t = ns;
+
+ return mode;
+}
+
+/*
+ * This calculates CLOCK_MONOTONIC at the time of the TSC snapshot, with
+ * no boot time offset.
+ */
+static int do_monotonic(s64 *t, u64 *tsc_timestamp)
+{
+ struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+ unsigned long seq;
+ int mode;
+ u64 ns;
+
+ do {
+ seq = read_seqcount_begin(&gtod->seq);
+ ns = gtod->clock.base_cycles;
+ ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
+ ns >>= gtod->clock.shift;
+ ns += ktime_to_ns(gtod->clock.offset);
+ } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
+ *t = ns;
+
+ return mode;
+}
+
+static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
+{
+ struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+ unsigned long seq;
+ int mode;
+ u64 ns;
+
+ do {
+ seq = read_seqcount_begin(&gtod->seq);
+ ts->tv_sec = gtod->wall_time_sec;
+ ns = gtod->clock.base_cycles;
+ ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
+ ns >>= gtod->clock.shift;
+ } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
+
+ ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
+ ts->tv_nsec = ns;
+
+ return mode;
+}
+
+/*
+ * Calculates the kvmclock_base_ns (CLOCK_MONOTONIC_RAW + boot time) and
+ * reports the TSC value from which it did so. Returns true if host is
+ * using TSC based clocksource.
+ */
+static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
+{
+ /* checked again under seqlock below */
+ if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
+ return false;
+
+ return gtod_is_based_on_tsc(do_kvmclock_base(kernel_ns,
+ tsc_timestamp));
+}
+
+/*
+ * Calculates CLOCK_MONOTONIC and reports the TSC value from which it did
+ * so. Returns true if host is using TSC based clocksource.
+ */
+bool kvm_get_monotonic_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
+{
+ /* checked again under seqlock below */
+ if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
+ return false;
+
+ return gtod_is_based_on_tsc(do_monotonic(kernel_ns,
+ tsc_timestamp));
+}
+
+/*
+ * Calculates CLOCK_REALTIME and reports the TSC value from which it did
+ * so. Returns true if host is using TSC based clocksource.
+ *
+ * DO NOT USE this for anything related to migration. You want CLOCK_TAI
+ * for that.
+ */
+static bool kvm_get_walltime_and_clockread(struct timespec64 *ts,
+ u64 *tsc_timestamp)
+{
+ /* checked again under seqlock below */
+ if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
+ return false;
+
+ return gtod_is_based_on_tsc(do_realtime(ts, tsc_timestamp));
+}
+#endif
+
+/*
+ *
+ * Assuming a stable TSC across physical CPUs, and a stable TSC
+ * across virtual CPUs, the following condition is possible.
+ * Each numbered line represents an event visible to both
+ * CPUs at the next numbered event.
+ *
+ * "timespecX" represents host monotonic time. "tscX" represents
+ * RDTSC value.
+ *
+ * VCPU0 on CPU0 | VCPU1 on CPU1
+ *
+ * 1. read timespec0,tsc0
+ * 2. | timespec1 = timespec0 + N
+ * | tsc1 = tsc0 + M
+ * 3. transition to guest | transition to guest
+ * 4. ret0 = timespec0 + (rdtsc - tsc0) |
+ * 5. | ret1 = timespec1 + (rdtsc - tsc1)
+ * | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
+ *
+ * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
+ *
+ * - ret0 < ret1
+ * - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
+ * ...
+ * - 0 < N - M => M < N
+ *
+ * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
+ * always the case (the difference between two distinct xtime instances
+ * might be smaller than the difference between corresponding TSC reads,
+ * when updating guest vcpus pvclock areas).
+ *
+ * To avoid that problem, do not allow visibility of distinct
+ * system_timestamp/tsc_timestamp values simultaneously: use a master
+ * copy of host monotonic time values. Update that master copy
+ * in lockstep.
+ *
+ * Rely on synchronization of host TSCs and guest TSCs for monotonicity.
+ *
+ */
+
+static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
+{
+#ifdef CONFIG_X86_64
+ struct kvm_arch *ka = &kvm->arch;
+ int vclock_mode;
+ bool host_tsc_clocksource, vcpus_matched;
+
+ lockdep_assert_held(&kvm->arch.tsc_write_lock);
+ vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
+ atomic_read(&kvm->online_vcpus));
+
+ /*
+ * If the host uses TSC clock, then passthrough TSC as stable
+ * to the guest.
+ */
+ host_tsc_clocksource = kvm_get_time_and_clockread(
+ &ka->master_kernel_ns,
+ &ka->master_cycle_now);
+
+ ka->use_master_clock = host_tsc_clocksource && vcpus_matched
+ && !ka->backwards_tsc_observed
+ && !ka->boot_vcpu_runs_old_kvmclock;
+
+ if (ka->use_master_clock)
+ atomic_set(&kvm_guest_has_master_clock, 1);
+
+ vclock_mode = pvclock_gtod_data.clock.vclock_mode;
+ trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
+ vcpus_matched);
+#endif
+}
+
+static void kvm_make_mclock_inprogress_request(struct kvm *kvm)
+{
+ kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
+}
+
+static void __kvm_start_pvclock_update(struct kvm *kvm)
+{
+ raw_spin_lock_irq(&kvm->arch.tsc_write_lock);
+ write_seqcount_begin(&kvm->arch.pvclock_sc);
+}
+
+static void kvm_start_pvclock_update(struct kvm *kvm)
+{
+ kvm_make_mclock_inprogress_request(kvm);
+
+ /* no guest entries from this point */
+ __kvm_start_pvclock_update(kvm);
+}
+
+static void kvm_end_pvclock_update(struct kvm *kvm)
+{
+ struct kvm_arch *ka = &kvm->arch;
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+
+ write_seqcount_end(&ka->pvclock_sc);
+ raw_spin_unlock_irq(&ka->tsc_write_lock);
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+
+ /* guest entries allowed */
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
+}
+
+static void kvm_update_masterclock(struct kvm *kvm)
+{
+ kvm_hv_request_tsc_page_update(kvm);
+ kvm_start_pvclock_update(kvm);
+ pvclock_update_vm_gtod_copy(kvm);
+ kvm_end_pvclock_update(kvm);
+}
+
+/*
+ * Use the kernel's tsc_khz directly if the TSC is constant, otherwise use KVM's
+ * per-CPU value (which may be zero if a CPU is going offline). Note, tsc_khz
+ * can change during boot even if the TSC is constant, as it's possible for KVM
+ * to be loaded before TSC calibration completes. Ideally, KVM would get a
+ * notification when calibration completes, but practically speaking calibration
+ * will complete before userspace is alive enough to create VMs.
+ */
+static unsigned long get_cpu_tsc_khz(void)
+{
+ if (static_cpu_has(X86_FEATURE_CONSTANT_TSC))
+ return tsc_khz;
+ else
+ return __this_cpu_read(cpu_tsc_khz);
+}
+
+/* Called within read_seqcount_begin/retry for kvm->pvclock_sc. */
+static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
+{
+ struct kvm_arch *ka = &kvm->arch;
+ struct pvclock_vcpu_time_info hv_clock;
+
+ /* both __this_cpu_read() and rdtsc() should be on the same cpu */
+ get_cpu();
+
+ data->flags = 0;
+ if (ka->use_master_clock &&
+ (static_cpu_has(X86_FEATURE_CONSTANT_TSC) || __this_cpu_read(cpu_tsc_khz))) {
+#ifdef CONFIG_X86_64
+ struct timespec64 ts;
+
+ if (kvm_get_walltime_and_clockread(&ts, &data->host_tsc)) {
+ data->realtime = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
+ data->flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
+ } else
+#endif
+ data->host_tsc = rdtsc();
+
+ data->flags |= KVM_CLOCK_TSC_STABLE;
+ hv_clock.tsc_timestamp = ka->master_cycle_now;
+ hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
+ kvm_get_time_scale(NSEC_PER_SEC, get_cpu_tsc_khz() * 1000LL,
+ &hv_clock.tsc_shift,
+ &hv_clock.tsc_to_system_mul);
+ data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc);
+ } else {
+ data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset;
}
- put_cpu_var(cpu_tsc_khz);
+
+ put_cpu();
+}
+
+static void get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
+{
+ struct kvm_arch *ka = &kvm->arch;
+ unsigned seq;
+
+ do {
+ seq = read_seqcount_begin(&ka->pvclock_sc);
+ __get_kvmclock(kvm, data);
+ } while (read_seqcount_retry(&ka->pvclock_sc, seq));
+}
+
+u64 get_kvmclock_ns(struct kvm *kvm)
+{
+ struct kvm_clock_data data;
+
+ get_kvmclock(kvm, &data);
+ return data.clock;
+}
+
+static void kvm_setup_guest_pvclock(struct pvclock_vcpu_time_info *ref_hv_clock,
+ struct kvm_vcpu *vcpu,
+ struct gfn_to_pfn_cache *gpc,
+ unsigned int offset)
+{
+ struct pvclock_vcpu_time_info *guest_hv_clock;
+ struct pvclock_vcpu_time_info hv_clock;
+ unsigned long flags;
+
+ memcpy(&hv_clock, ref_hv_clock, sizeof(hv_clock));
+
+ read_lock_irqsave(&gpc->lock, flags);
+ while (!kvm_gpc_check(gpc, offset + sizeof(*guest_hv_clock))) {
+ read_unlock_irqrestore(&gpc->lock, flags);
+
+ if (kvm_gpc_refresh(gpc, offset + sizeof(*guest_hv_clock)))
+ return;
+
+ read_lock_irqsave(&gpc->lock, flags);
+ }
+
+ guest_hv_clock = (void *)(gpc->khva + offset);
+
+ /*
+ * This VCPU is paused, but it's legal for a guest to read another
+ * VCPU's kvmclock, so we really have to follow the specification where
+ * it says that version is odd if data is being modified, and even after
+ * it is consistent.
+ */
+
+ guest_hv_clock->version = hv_clock.version = (guest_hv_clock->version + 1) | 1;
+ smp_wmb();
+
+ /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
+ hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);
+
+ memcpy(guest_hv_clock, &hv_clock, sizeof(*guest_hv_clock));
+
+ smp_wmb();
+
+ guest_hv_clock->version = ++hv_clock.version;
+
+ kvm_gpc_mark_dirty_in_slot(gpc);
+ read_unlock_irqrestore(&gpc->lock, flags);
+
+ trace_kvm_pvclock_update(vcpu->vcpu_id, &hv_clock);
+}
+
+int kvm_guest_time_update(struct kvm_vcpu *v)
+{
+ struct pvclock_vcpu_time_info hv_clock = {};
+ unsigned long flags, tgt_tsc_khz;
+ unsigned seq;
+ struct kvm_vcpu_arch *vcpu = &v->arch;
+ struct kvm_arch *ka = &v->kvm->arch;
+ s64 kernel_ns;
+ u64 tsc_timestamp, host_tsc;
+ bool use_master_clock;
+
+ kernel_ns = 0;
+ host_tsc = 0;
+
+ /*
+ * If the host uses TSC clock, then passthrough TSC as stable
+ * to the guest.
+ */
+ do {
+ seq = read_seqcount_begin(&ka->pvclock_sc);
+ use_master_clock = ka->use_master_clock;
+ if (use_master_clock) {
+ host_tsc = ka->master_cycle_now;
+ kernel_ns = ka->master_kernel_ns;
+ }
+ } while (read_seqcount_retry(&ka->pvclock_sc, seq));
/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);
- kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
- &vcpu->hv_clock.tsc_timestamp);
- ktime_get_ts(&ts);
+ tgt_tsc_khz = get_cpu_tsc_khz();
+ if (unlikely(tgt_tsc_khz == 0)) {
+ local_irq_restore(flags);
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
+ return 1;
+ }
+ if (!use_master_clock) {
+ host_tsc = rdtsc();
+ kernel_ns = get_kvmclock_base_ns();
+ }
+
+ tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
+
+ /*
+ * We may have to catch up the TSC to match elapsed wall clock
+ * time for two reasons, even if kvmclock is used.
+ * 1) CPU could have been running below the maximum TSC rate
+ * 2) Broken TSC compensation resets the base at each VCPU
+ * entry to avoid unknown leaps of TSC even when running
+ * again on the same CPU. This may cause apparent elapsed
+ * time to disappear, and the guest to stand still or run
+ * very slowly.
+ */
+ if (vcpu->tsc_catchup) {
+ u64 tsc = compute_guest_tsc(v, kernel_ns);
+ if (tsc > tsc_timestamp) {
+ adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
+ tsc_timestamp = tsc;
+ }
+ }
+
local_irq_restore(flags);
/* With all the info we got, fill in the values */
- vcpu->hv_clock.system_time = ts.tv_nsec +
- (NSEC_PER_SEC * (u64)ts.tv_sec);
+ if (kvm_caps.has_tsc_control) {
+ tgt_tsc_khz = kvm_scale_tsc(tgt_tsc_khz,
+ v->arch.l1_tsc_scaling_ratio);
+ tgt_tsc_khz = tgt_tsc_khz ? : 1;
+ }
+
+ if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
+ kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
+ &vcpu->pvclock_tsc_shift,
+ &vcpu->pvclock_tsc_mul);
+ vcpu->hw_tsc_khz = tgt_tsc_khz;
+ }
+
+ hv_clock.tsc_shift = vcpu->pvclock_tsc_shift;
+ hv_clock.tsc_to_system_mul = vcpu->pvclock_tsc_mul;
+ hv_clock.tsc_timestamp = tsc_timestamp;
+ hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
+ vcpu->last_guest_tsc = tsc_timestamp;
+
+ /* If the host uses TSC clocksource, then it is stable */
+ hv_clock.flags = 0;
+ if (use_master_clock)
+ hv_clock.flags |= PVCLOCK_TSC_STABLE_BIT;
+
+ if (vcpu->pv_time.active) {
+ /*
+ * GUEST_STOPPED is only supported by kvmclock, and KVM's
+ * historic behavior is to only process the request if kvmclock
+ * is active/enabled.
+ */
+ if (vcpu->pvclock_set_guest_stopped_request) {
+ hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
+ vcpu->pvclock_set_guest_stopped_request = false;
+ }
+ kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->pv_time, 0);
+
+ hv_clock.flags &= ~PVCLOCK_GUEST_STOPPED;
+ }
+
+ kvm_hv_setup_tsc_page(v->kvm, &hv_clock);
+
+#ifdef CONFIG_KVM_XEN
/*
- * The interface expects us to write an even number signaling that the
- * update is finished. Since the guest won't see the intermediate
- * state, we just increase by 2 at the end.
+ * For Xen guests we may need to override PVCLOCK_TSC_STABLE_BIT as unless
+ * explicitly told to use TSC as its clocksource Xen will not set this bit.
+ * This default behaviour led to bugs in some guest kernels which cause
+ * problems if they observe PVCLOCK_TSC_STABLE_BIT in the pvclock flags.
+ *
+ * Note! Clear TSC_STABLE only for Xen clocks, i.e. the order matters!
*/
- vcpu->hv_clock.version += 2;
+ if (ka->xen.hvm_config.flags & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE)
+ hv_clock.flags &= ~PVCLOCK_TSC_STABLE_BIT;
+
+ if (vcpu->xen.vcpu_info_cache.active)
+ kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->xen.vcpu_info_cache,
+ offsetof(struct compat_vcpu_info, time));
+ if (vcpu->xen.vcpu_time_info_cache.active)
+ kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->xen.vcpu_time_info_cache, 0);
+#endif
+ return 0;
+}
+
+/*
+ * The pvclock_wall_clock ABI tells the guest the wall clock time at
+ * which it started (i.e. its epoch, when its kvmclock was zero).
+ *
+ * In fact those clocks are subtly different; wall clock frequency is
+ * adjusted by NTP and has leap seconds, while the kvmclock is a
+ * simple function of the TSC without any such adjustment.
+ *
+ * Perhaps the ABI should have exposed CLOCK_TAI and a ratio between
+ * that and kvmclock, but even that would be subject to change over
+ * time.
+ *
+ * Attempt to calculate the epoch at a given moment using the *same*
+ * TSC reading via kvm_get_walltime_and_clockread() to obtain both
+ * wallclock and kvmclock times, and subtracting one from the other.
+ *
+ * Fall back to using their values at slightly different moments by
+ * calling ktime_get_real_ns() and get_kvmclock_ns() separately.
+ */
+uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm)
+{
+#ifdef CONFIG_X86_64
+ struct pvclock_vcpu_time_info hv_clock;
+ struct kvm_arch *ka = &kvm->arch;
+ unsigned long seq, local_tsc_khz;
+ struct timespec64 ts;
+ uint64_t host_tsc;
- shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
+ do {
+ seq = read_seqcount_begin(&ka->pvclock_sc);
- memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
- sizeof(vcpu->hv_clock));
+ local_tsc_khz = 0;
+ if (!ka->use_master_clock)
+ break;
+
+ /*
+ * The TSC read and the call to get_cpu_tsc_khz() must happen
+ * on the same CPU.
+ */
+ get_cpu();
+
+ local_tsc_khz = get_cpu_tsc_khz();
+
+ if (local_tsc_khz &&
+ !kvm_get_walltime_and_clockread(&ts, &host_tsc))
+ local_tsc_khz = 0; /* Fall back to old method */
- kunmap_atomic(shared_kaddr, KM_USER0);
+ put_cpu();
- mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
+ /*
+ * These values must be snapshotted within the seqcount loop.
+ * After that, it's just mathematics which can happen on any
+ * CPU at any time.
+ */
+ hv_clock.tsc_timestamp = ka->master_cycle_now;
+ hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
+
+ } while (read_seqcount_retry(&ka->pvclock_sc, seq));
+
+ /*
+ * If the conditions were right, and obtaining the wallclock+TSC was
+ * successful, calculate the KVM clock at the corresponding time and
+ * subtract one from the other to get the guest's epoch in nanoseconds
+ * since 1970-01-01.
+ */
+ if (local_tsc_khz) {
+ kvm_get_time_scale(NSEC_PER_SEC, local_tsc_khz * NSEC_PER_USEC,
+ &hv_clock.tsc_shift,
+ &hv_clock.tsc_to_system_mul);
+ return ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec -
+ __pvclock_read_cycles(&hv_clock, host_tsc);
+ }
+#endif
+ return ktime_get_real_ns() - get_kvmclock_ns(kvm);
}
-static int kvm_request_guest_time_update(struct kvm_vcpu *v)
+/*
+ * kvmclock updates which are isolated to a given vcpu, such as
+ * vcpu->cpu migration, should not allow system_timestamp from
+ * the rest of the vcpus to remain static.
+ *
+ * So in those cases, request a kvmclock update for all vcpus.
+ * The worst case for a remote vcpu to update its kvmclock
+ * is then bounded by maximum nohz sleep latency.
+ */
+static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
{
- struct kvm_vcpu_arch *vcpu = &v->arch;
+ unsigned long i;
+ struct kvm_vcpu *vcpu;
+ struct kvm *kvm = v->kvm;
- if (!vcpu->time_page)
- return 0;
- set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests);
- return 1;
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ kvm_vcpu_kick(vcpu);
+ }
}
-static bool msr_mtrr_valid(unsigned msr)
+/* These helpers are safe iff @msr is known to be an MCx bank MSR. */
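+/*
+ * Each bank is a group of four consecutive MSRs (MCi_CTL, MCi_STATUS,
+ * MCi_ADDR, MCi_MISC), so the low two bits of the index identify the
+ * register within the bank.
+ */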
+static bool is_mci_control_msr(u32 msr)
+{
+ return (msr & 3) == 0;
+}
+
+static bool is_mci_status_msr(u32 msr)
{
+ return (msr & 3) == 1;
+}
+
+/*
+ * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP.
+ */
+static bool can_set_mci_status(struct kvm_vcpu *vcpu)
+{
+ /* McStatusWrEn enabled? */
+ if (guest_cpuid_is_amd_compatible(vcpu))
+ return !!(vcpu->arch.msr_hwcr & BIT_ULL(18));
+
+ return false;
+}
+
+static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ u64 mcg_cap = vcpu->arch.mcg_cap;
+ unsigned bank_num = mcg_cap & 0xff;
+ u32 msr = msr_info->index;
+ u64 data = msr_info->data;
+ u32 offset, last_msr;
+
switch (msr) {
- case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
- case MSR_MTRRfix64K_00000:
- case MSR_MTRRfix16K_80000:
- case MSR_MTRRfix16K_A0000:
- case MSR_MTRRfix4K_C0000:
- case MSR_MTRRfix4K_C8000:
- case MSR_MTRRfix4K_D0000:
- case MSR_MTRRfix4K_D8000:
- case MSR_MTRRfix4K_E0000:
- case MSR_MTRRfix4K_E8000:
- case MSR_MTRRfix4K_F0000:
- case MSR_MTRRfix4K_F8000:
- case MSR_MTRRdefType:
- case MSR_IA32_CR_PAT:
- return true;
- case 0x2f8:
- return true;
+ case MSR_IA32_MCG_STATUS:
+ vcpu->arch.mcg_status = data;
+ break;
+ case MSR_IA32_MCG_CTL:
+ if (!(mcg_cap & MCG_CTL_P) &&
+ (data || !msr_info->host_initiated))
+ return 1;
+ if (data != 0 && data != ~(u64)0)
+ return 1;
+ vcpu->arch.mcg_ctl = data;
+ break;
+ case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
+ last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1;
+ if (msr > last_msr)
+ return 1;
+
+ if (!(mcg_cap & MCG_CMCI_P) && (data || !msr_info->host_initiated))
+ return 1;
+ /* An attempt to write a 1 to a reserved bit raises #GP */
+ if (data & ~(MCI_CTL2_CMCI_EN | MCI_CTL2_CMCI_THRESHOLD_MASK))
+ return 1;
+ offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2,
+ last_msr + 1 - MSR_IA32_MC0_CTL2);
+ vcpu->arch.mci_ctl2_banks[offset] = data;
+ break;
+ case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
+ last_msr = MSR_IA32_MCx_CTL(bank_num) - 1;
+ if (msr > last_msr)
+ return 1;
+
+ /*
+ * Only 0 or all 1s can be written to IA32_MCi_CTL, all other
+ * values are architecturally undefined. But, some Linux
+ * kernels clear bit 10 in bank 4 to work around a BIOS/GART TLB
+ * issue on AMD K8s, allow bit 10 to be clear when setting all
+ * other bits in order to avoid an uncaught #GP in the guest.
+ *
+ * UNIXWARE clears bit 0 of MC1_CTL to ignore correctable,
+ * single-bit ECC data errors.
+ */
+ if (is_mci_control_msr(msr) &&
+ data != 0 && (data | (1 << 10) | 1) != ~(u64)0)
+ return 1;
+
+ /*
+ * All CPUs allow writing 0 to MCi_STATUS MSRs to clear the MSR.
+ * AMD-based CPUs allow non-zero values, but if and only if
+ * HWCR[McStatusWrEn] is set.
+ */
+ if (!msr_info->host_initiated && is_mci_status_msr(msr) &&
+ data != 0 && !can_set_mci_status(vcpu))
+ return 1;
+
+ offset = array_index_nospec(msr - MSR_IA32_MC0_CTL,
+ last_msr + 1 - MSR_IA32_MC0_CTL);
+ vcpu->arch.mce_banks[offset] = data;
+ break;
+ default:
+ return 1;
}
- return false;
+ return 0;
+}
+
+static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
+{
+ gpa_t gpa = data & ~0x3f;
+
+ /* Bits 4:5 are reserved, should be zero */
+ if (data & 0x30)
+ return 1;
+
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_VMEXIT) &&
+ (data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT))
+ return 1;
+
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT) &&
+ (data & KVM_ASYNC_PF_DELIVERY_AS_INT))
+ return 1;
+
+ if (!lapic_in_kernel(vcpu))
+ return data ? 1 : 0;
+
+ vcpu->arch.apf.msr_en_val = data;
+
+ if (!kvm_pv_async_pf_enabled(vcpu)) {
+ kvm_clear_async_pf_completion_queue(vcpu);
+ kvm_async_pf_hash_reset(vcpu);
+ return 0;
+ }
+
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
+ sizeof(u64)))
+ return 1;
+
+ vcpu->arch.apf.send_always = (data & KVM_ASYNC_PF_SEND_ALWAYS);
+ vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
+
+ kvm_async_pf_wakeup_all(vcpu);
+
+ return 0;
}
-static bool valid_pat_type(unsigned t)
+static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data)
{
- return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */
+ /* Bits 8-63 are reserved */
+ if (data >> 8)
+ return 1;
+
+ if (!lapic_in_kernel(vcpu))
+ return 1;
+
+ vcpu->arch.apf.msr_int_val = data;
+
+ vcpu->arch.apf.vec = data & KVM_ASYNC_PF_VEC_MASK;
+
+ return 0;
}
-static bool valid_mtrr_type(unsigned t)
+static void kvmclock_reset(struct kvm_vcpu *vcpu)
{
- return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
+ kvm_gpc_deactivate(&vcpu->arch.pv_time);
+ vcpu->arch.time = 0;
}
-static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
{
- int i;
+ ++vcpu->stat.tlb_flush;
+ kvm_x86_call(flush_tlb_all)(vcpu);
- if (!msr_mtrr_valid(msr))
- return false;
+ /* Flushing all ASIDs flushes the current ASID... */
+ kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+}
- if (msr == MSR_IA32_CR_PAT) {
- for (i = 0; i < 8; i++)
- if (!valid_pat_type((data >> (i * 8)) & 0xff))
- return false;
- return true;
- } else if (msr == MSR_MTRRdefType) {
- if (data & ~0xcff)
- return false;
- return valid_mtrr_type(data & 0xff);
- } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
- for (i = 0; i < 8 ; i++)
- if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
- return false;
- return true;
+static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
+{
+ ++vcpu->stat.tlb_flush;
+
+ if (!tdp_enabled) {
+ /*
+ * A TLB flush on behalf of the guest is equivalent to
+ * INVPCID(all), toggling CR4.PGE, etc., which requires
+ * a forced sync of the shadow page tables. Ensure all the
+ * roots are synced and the guest TLB in hardware is clean.
+ */
+ kvm_mmu_sync_roots(vcpu);
+ kvm_mmu_sync_prev_roots(vcpu);
}
- /* variable MTRRs */
- return valid_mtrr_type(data & 0xff);
+ kvm_x86_call(flush_tlb_guest)(vcpu);
+
+ /*
+ * Flushing all "guest" TLB is always a superset of Hyper-V's fine
+ * grained flushing.
+ */
+ kvm_hv_vcpu_purge_flush_tlb(vcpu);
}
-static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+
+static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu)
{
- u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
+ ++vcpu->stat.tlb_flush;
+ kvm_x86_call(flush_tlb_current)(vcpu);
+}
- if (!mtrr_valid(vcpu, msr, data))
- return 1;
+/*
+ * Service "local" TLB flush requests, which are specific to the current MMU
+ * context. In addition to the generic event handling in vcpu_enter_guest(),
+ * TLB flushes that are targeted at an MMU context also need to be serviced
+ * prior to nested VM-Enter/VM-Exit.
+ */
+void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu)
+{
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
+ kvm_vcpu_flush_tlb_current(vcpu);
- if (msr == MSR_MTRRdefType) {
- vcpu->arch.mtrr_state.def_type = data;
- vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
- } else if (msr == MSR_MTRRfix64K_00000)
- p[0] = data;
- else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
- p[1 + msr - MSR_MTRRfix16K_80000] = data;
- else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
- p[3 + msr - MSR_MTRRfix4K_C0000] = data;
- else if (msr == MSR_IA32_CR_PAT)
- vcpu->arch.pat = data;
- else { /* Variable MTRRs */
- int idx, is_mtrr_mask;
- u64 *pt;
-
- idx = (msr - 0x200) / 2;
- is_mtrr_mask = msr - 0x200 - 2 * idx;
- if (!is_mtrr_mask)
- pt =
- (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
- else
- pt =
- (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
- *pt = data;
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
+ kvm_vcpu_flush_tlb_guest(vcpu);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_service_local_tlb_flush_requests);
+
+static void record_steal_time(struct kvm_vcpu *vcpu)
+{
+ struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
+ struct kvm_steal_time __user *st;
+ struct kvm_memslots *slots;
+ gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
+ u64 steal;
+ u32 version;
+
+ if (kvm_xen_msr_enabled(vcpu->kvm)) {
+ kvm_xen_runstate_set_running(vcpu);
+ return;
}
- kvm_mmu_reset_context(vcpu);
- return 0;
+ if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+ return;
+
+ if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm))
+ return;
+
+ slots = kvm_memslots(vcpu->kvm);
+
+ if (unlikely(slots->generation != ghc->generation ||
+ gpa != ghc->gpa ||
+ kvm_is_error_hva(ghc->hva) || !ghc->memslot)) {
+ /* We rely on the fact that it fits in a single page. */
+ BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS);
+
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gpa, sizeof(*st)) ||
+ kvm_is_error_hva(ghc->hva) || !ghc->memslot)
+ return;
+ }
+
+ st = (struct kvm_steal_time __user *)ghc->hva;
+ /*
+ * Doing a TLB flush here, on the guest's behalf, can avoid
+ * expensive IPIs.
+ */
+ if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
+ u8 st_preempted = 0;
+ int err = -EFAULT;
+
+ if (!user_access_begin(st, sizeof(*st)))
+ return;
+
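+ /*
+ * Atomically swap st->preempted with 0, capturing the old value for
+ * the PV TLB-flush check below; a fault on the user access leaves
+ * err set and skips the xor that would clear it.
+ */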
+ asm volatile("1: xchgb %0, %2\n"
+ "xor %1, %1\n"
+ "2:\n"
+ _ASM_EXTABLE_UA(1b, 2b)
+ : "+q" (st_preempted),
+ "+&r" (err),
+ "+m" (st->preempted));
+ if (err)
+ goto out;
+
+ user_access_end();
+
+ vcpu->arch.st.preempted = 0;
+
+ trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
+ st_preempted & KVM_VCPU_FLUSH_TLB);
+ if (st_preempted & KVM_VCPU_FLUSH_TLB)
+ kvm_vcpu_flush_tlb_guest(vcpu);
+
+ if (!user_access_begin(st, sizeof(*st)))
+ goto dirty;
+ } else {
+ if (!user_access_begin(st, sizeof(*st)))
+ return;
+
+ unsafe_put_user(0, &st->preempted, out);
+ vcpu->arch.st.preempted = 0;
+ }
+
+ unsafe_get_user(version, &st->version, out);
+ if (version & 1)
+ version += 1; /* first time write, random junk */
+
+ version += 1;
+ unsafe_put_user(version, &st->version, out);
+
+ smp_wmb();
+
+ unsafe_get_user(steal, &st->steal, out);
+ steal += current->sched_info.run_delay -
+ vcpu->arch.st.last_steal;
+ vcpu->arch.st.last_steal = current->sched_info.run_delay;
+ unsafe_put_user(steal, &st->steal, out);
+
+ version += 1;
+ unsafe_put_user(version, &st->version, out);
+
+ out:
+ user_access_end();
+ dirty:
+ mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
}
-int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+/*
+ * Returns true if the MSR in question is managed via XSTATE, i.e. is context
+ * switched with the rest of guest FPU state.
+ *
+ * Note, S_CET is _not_ saved/restored via XSAVES/XRSTORS.
+ */
+static bool is_xstate_managed_msr(struct kvm_vcpu *vcpu, u32 msr)
{
+ if (!vcpu)
+ return false;
+
switch (msr) {
- case MSR_EFER:
- set_efer(vcpu, data);
+ case MSR_IA32_U_CET:
+ return guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) ||
+ guest_cpu_cap_has(vcpu, X86_FEATURE_IBT);
+ case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP:
+ return guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK);
+ default:
+ return false;
+ }
+}
+
+/*
+ * Lock (and if necessary, re-load) the guest FPU, i.e. XSTATE, and access an
+ * MSR that is managed via XSTATE. Note, the caller is responsible for doing
+ * the initial FPU load, this helper only ensures that guest state is resident
+ * in hardware (the kernel can load its FPU state in IRQ context).
+ *
+ * Note, loading guest values for U_CET and PL[0-3]_SSP while executing in the
+ * kernel is safe, as U_CET is specific to userspace, and PL[0-3]_SSP are only
+ * consumed when transitioning to lower privilege levels, i.e. are effectively
+ * only consumed by userspace as well.
+ */
+static __always_inline void kvm_access_xstate_msr(struct kvm_vcpu *vcpu,
+ struct msr_data *msr_info,
+ int access)
+{
+ BUILD_BUG_ON(access != MSR_TYPE_R && access != MSR_TYPE_W);
+
+ KVM_BUG_ON(!is_xstate_managed_msr(vcpu, msr_info->index), vcpu->kvm);
+ KVM_BUG_ON(!vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm);
+
+ kvm_fpu_get();
+ if (access == MSR_TYPE_R)
+ rdmsrq(msr_info->index, msr_info->data);
+ else
+ wrmsrq(msr_info->index, msr_info->data);
+ kvm_fpu_put();
+}
+
+static void kvm_set_xstate_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ kvm_access_xstate_msr(vcpu, msr_info, MSR_TYPE_W);
+}
+
+static void kvm_get_xstate_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ kvm_access_xstate_msr(vcpu, msr_info, MSR_TYPE_R);
+}
+
+int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ u32 msr = msr_info->index;
+ u64 data = msr_info->data;
+
+ /*
+ * Do not allow host-initiated writes to trigger the Xen hypercall
+ * page setup; it could incur locking paths which are not expected
+ * if userspace sets the MSR in an unusual location.
+ */
+ if (kvm_xen_is_hypercall_page_msr(vcpu->kvm, msr) &&
+ !msr_info->host_initiated)
+ return kvm_xen_write_hypercall_page(vcpu, data);
+
+ switch (msr) {
+ case MSR_AMD64_NB_CFG:
+ case MSR_IA32_UCODE_WRITE:
+ case MSR_VM_HSAVE_PA:
+ case MSR_AMD64_PATCH_LOADER:
+ case MSR_AMD64_BU_CFG2:
+ case MSR_AMD64_DC_CFG:
+ case MSR_AMD64_TW_CFG:
+ case MSR_F15H_EX_CFG:
break;
- case MSR_IA32_MC0_STATUS:
- pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
- __func__, data);
+
+ case MSR_IA32_UCODE_REV:
+ if (msr_info->host_initiated)
+ vcpu->arch.microcode_version = data;
break;
- case MSR_IA32_MCG_STATUS:
- pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
- __func__, data);
+ case MSR_IA32_ARCH_CAPABILITIES:
+ if (!msr_info->host_initiated ||
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
+ return KVM_MSR_RET_UNSUPPORTED;
+ vcpu->arch.arch_capabilities = data;
break;
- case MSR_IA32_MCG_CTL:
- pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
- __func__, data);
+ case MSR_IA32_PERF_CAPABILITIES:
+ if (!msr_info->host_initiated ||
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM))
+ return KVM_MSR_RET_UNSUPPORTED;
+
+ if (data & ~kvm_caps.supported_perf_cap)
+ return 1;
+
+ /*
+ * Note, this is not just a performance optimization! KVM
+ * disallows changing feature MSRs after the vCPU has run; PMU
+ * refresh will bug the VM if called after the vCPU has run.
+ */
+ if (vcpu->arch.perf_capabilities == data)
+ break;
+
+ vcpu->arch.perf_capabilities = data;
+ kvm_pmu_refresh(vcpu);
break;
- case MSR_IA32_DEBUGCTLMSR:
- if (!data) {
- /* We support the non-activated case already */
+ case MSR_IA32_PRED_CMD: {
+ u64 reserved_bits = ~(PRED_CMD_IBPB | PRED_CMD_SBPB);
+
+ if (!msr_info->host_initiated) {
+ if (!guest_has_pred_cmd_msr(vcpu))
+ return 1;
+
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_AMD_IBPB))
+ reserved_bits |= PRED_CMD_IBPB;
+
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SBPB))
+ reserved_bits |= PRED_CMD_SBPB;
+ }
+
+ if (!boot_cpu_has(X86_FEATURE_IBPB))
+ reserved_bits |= PRED_CMD_IBPB;
+
+ if (!boot_cpu_has(X86_FEATURE_SBPB))
+ reserved_bits |= PRED_CMD_SBPB;
+
+ if (data & reserved_bits)
+ return 1;
+
+ if (!data)
break;
- } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
- /* Values other than LBR and BTF are vendor-specific,
- thus reserved and should throw a #GP */
+
+ wrmsrq(MSR_IA32_PRED_CMD, data);
+ break;
+ }
+ case MSR_IA32_FLUSH_CMD:
+ if (!msr_info->host_initiated &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D))
+ return 1;
+
+ if (!boot_cpu_has(X86_FEATURE_FLUSH_L1D) || (data & ~L1D_FLUSH))
+ return 1;
+ if (!data)
+ break;
+
+ wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+ break;
+ case MSR_EFER:
+ return set_efer(vcpu, msr_info);
+ case MSR_K7_HWCR:
+ data &= ~(u64)0x40; /* ignore flush filter disable */
+ data &= ~(u64)0x100; /* ignore ignne emulation enable */
+ data &= ~(u64)0x8; /* ignore TLB cache disable */
+
+ /*
+ * Allow McStatusWrEn and TscFreqSel. (Linux guests from v3.2
+ * through at least v6.6 whine if TscFreqSel is clear,
+ * depending on F/M/S.)
+ */
+ if (data & ~(BIT_ULL(18) | BIT_ULL(24))) {
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
return 1;
}
- pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
- __func__, data);
+ vcpu->arch.msr_hwcr = data;
break;
- case MSR_IA32_UCODE_REV:
- case MSR_IA32_UCODE_WRITE:
- case MSR_VM_HSAVE_PA:
+ case MSR_FAM10H_MMIO_CONF_BASE:
+ if (data != 0) {
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
+ return 1;
+ }
break;
- case 0x200 ... 0x2ff:
- return set_msr_mtrr(vcpu, msr, data);
+ case MSR_IA32_CR_PAT:
+ if (!kvm_pat_valid(data))
+ return 1;
+
+ vcpu->arch.pat = data;
+ break;
+ case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000:
+ case MSR_MTRRdefType:
+ return kvm_mtrr_set_msr(vcpu, msr, data);
case MSR_IA32_APICBASE:
- kvm_set_apic_base(vcpu, data);
+ return kvm_apic_set_base(vcpu, data, msr_info->host_initiated);
+ case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
+ return kvm_x2apic_msr_write(vcpu, msr, data);
+ case MSR_IA32_TSC_DEADLINE:
+ kvm_set_lapic_tscdeadline_msr(vcpu, data);
break;
- case MSR_IA32_MISC_ENABLE:
- vcpu->arch.ia32_misc_enable_msr = data;
+ case MSR_IA32_TSC_ADJUST:
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_TSC_ADJUST)) {
+ if (!msr_info->host_initiated) {
+ s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
+ adjust_tsc_offset_guest(vcpu, adj);
+ /* Before returning to the guest, tsc_timestamp must be adjusted
+ * as well, otherwise the guest's per-CPU pvclock time could jump.
+ */
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ }
+ vcpu->arch.ia32_tsc_adjust_msr = data;
+ }
+ break;
+ case MSR_IA32_MISC_ENABLE: {
+ u64 old_val = vcpu->arch.ia32_misc_enable_msr;
+
+ if (!msr_info->host_initiated) {
+ /* RO bits */
+ if ((old_val ^ data) & MSR_IA32_MISC_ENABLE_PMU_RO_MASK)
+ return 1;
+
+ /* R bits, i.e. writes are ignored, but don't fault. */
+ data = data & ~MSR_IA32_MISC_ENABLE_EMON;
+ data |= old_val & MSR_IA32_MISC_ENABLE_EMON;
+ }
+
+ if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) &&
+ ((old_val ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) {
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XMM3))
+ return 1;
+ vcpu->arch.ia32_misc_enable_msr = data;
+ vcpu->arch.cpuid_dynamic_bits_dirty = true;
+ } else {
+ vcpu->arch.ia32_misc_enable_msr = data;
+ }
+ break;
+ }
+ case MSR_IA32_SMBASE:
+ if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated)
+ return 1;
+ vcpu->arch.smbase = data;
+ break;
+ case MSR_IA32_POWER_CTL:
+ vcpu->arch.msr_ia32_power_ctl = data;
+ break;
+ case MSR_IA32_TSC:
+ if (msr_info->host_initiated) {
+ kvm_synchronize_tsc(vcpu, &data);
+ } else if (!vcpu->arch.guest_tsc_protected) {
+ u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
+ adjust_tsc_offset_guest(vcpu, adj);
+ vcpu->arch.ia32_tsc_adjust_msr += adj;
+ }
+ break;
+ case MSR_IA32_XSS:
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
+ return KVM_MSR_RET_UNSUPPORTED;
+
+ if (data & ~vcpu->arch.guest_supported_xss)
+ return 1;
+ if (vcpu->arch.ia32_xss == data)
+ break;
+ vcpu->arch.ia32_xss = data;
+ vcpu->arch.cpuid_dynamic_bits_dirty = true;
+ break;
+ case MSR_SMI_COUNT:
+ if (!msr_info->host_initiated)
+ return 1;
+ vcpu->arch.smi_count = data;
+ break;
+ case MSR_KVM_WALL_CLOCK_NEW:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
+ return 1;
+
+ vcpu->kvm->arch.wall_clock = data;
+ kvm_write_wall_clock(vcpu->kvm, data, 0);
break;
case MSR_KVM_WALL_CLOCK:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
+ return 1;
+
vcpu->kvm->arch.wall_clock = data;
- kvm_write_wall_clock(vcpu->kvm, data);
+ kvm_write_wall_clock(vcpu->kvm, data, 0);
+ break;
+ case MSR_KVM_SYSTEM_TIME_NEW:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
+ return 1;
+
+ kvm_write_system_time(vcpu, data, false, msr_info->host_initiated);
break;
- case MSR_KVM_SYSTEM_TIME: {
- if (vcpu->arch.time_page) {
- kvm_release_page_dirty(vcpu->arch.time_page);
- vcpu->arch.time_page = NULL;
+ case MSR_KVM_SYSTEM_TIME:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
+ return 1;
+
+ kvm_write_system_time(vcpu, data, true, msr_info->host_initiated);
+ break;
+ case MSR_KVM_ASYNC_PF_EN:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
+ return 1;
+
+ if (kvm_pv_enable_async_pf(vcpu, data))
+ return 1;
+ break;
+ case MSR_KVM_ASYNC_PF_INT:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
+ return 1;
+
+ if (kvm_pv_enable_async_pf_int(vcpu, data))
+ return 1;
+ break;
+ case MSR_KVM_ASYNC_PF_ACK:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
+ return 1;
+ if (data & 0x1) {
+ /*
+ * Pairs with the smp_mb__after_atomic() in
+ * kvm_arch_async_page_present_queued().
+ */
+ smp_store_mb(vcpu->arch.apf.pageready_pending, false);
+
+ kvm_check_async_pf_completion(vcpu);
}
+ break;
+ case MSR_KVM_STEAL_TIME:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
+ return 1;
+
+ if (unlikely(!sched_info_on()))
+ return 1;
+
+ if (data & KVM_STEAL_RESERVED_MASK)
+ return 1;
- vcpu->arch.time = data;
+ vcpu->arch.st.msr_val = data;
- /* we verify if the enable bit is set... */
- if (!(data & 1))
+ if (!(data & KVM_MSR_ENABLED))
break;
- /* ...but clean it before doing the actual write */
- vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
+ kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
- vcpu->arch.time_page =
- gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
+ break;
+ case MSR_KVM_PV_EOI_EN:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
+ return 1;
- if (is_error_page(vcpu->arch.time_page)) {
- kvm_release_page_clean(vcpu->arch.time_page);
- vcpu->arch.time_page = NULL;
- }
+ if (kvm_lapic_set_pv_eoi(vcpu, data, sizeof(u8)))
+ return 1;
+ break;
- kvm_request_guest_time_update(vcpu);
+ case MSR_KVM_POLL_CONTROL:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
+ return 1;
+
+ /* only enable bit supported */
+ if (data & (-1ULL << 1))
+ return 1;
+
+ vcpu->arch.msr_kvm_poll_control = data;
break;
- }
- default:
- pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
- return 1;
- }
- return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_set_msr_common);
+ case MSR_IA32_MCG_CTL:
+ case MSR_IA32_MCG_STATUS:
+ case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
+ case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
+ return set_msr_mce(vcpu, msr_info);
+
+ case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
+ case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
+ case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
+ case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
+ if (kvm_pmu_is_valid_msr(vcpu, msr))
+ return kvm_pmu_set_msr(vcpu, msr_info);
+
+ if (data)
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
+ break;
+ case MSR_K7_CLK_CTL:
+ /*
+ * Ignore all writes to this no longer documented MSR.
+ * Writes are only relevant for old K7 processors,
+ * all pre-dating SVM, but writing it is a workaround
+ * recommended by AMD for these chips. It is possible to specify the
+ * affected processor models on the command line, hence
+ * the need to ignore the workaround.
+ */
+ break;
+#ifdef CONFIG_KVM_HYPERV
+ case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
+ case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
+ case HV_X64_MSR_SYNDBG_OPTIONS:
+ case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+ case HV_X64_MSR_CRASH_CTL:
+ case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
+ case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_STATUS:
+ case HV_X64_MSR_TSC_INVARIANT_CONTROL:
+ return kvm_hv_set_msr_common(vcpu, msr, data,
+ msr_info->host_initiated);
+#endif
+ case MSR_IA32_BBL_CR_CTL3:
+ /* Drop writes to this legacy MSR -- see rdmsr
+ * counterpart for further detail.
+ */
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
+ break;
+ case MSR_AMD64_OSVW_ID_LENGTH:
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW))
+ return 1;
+ vcpu->arch.osvw.length = data;
+ break;
+ case MSR_AMD64_OSVW_STATUS:
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW))
+ return 1;
+ vcpu->arch.osvw.status = data;
+ break;
+ case MSR_PLATFORM_INFO:
+ if (!msr_info->host_initiated)
+ return 1;
+ vcpu->arch.msr_platform_info = data;
+ break;
+ case MSR_MISC_FEATURES_ENABLES:
+ if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT ||
+ (data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT &&
+ !supports_cpuid_fault(vcpu)))
+ return 1;
+ vcpu->arch.msr_misc_features_enables = data;
+ break;
+#ifdef CONFIG_X86_64
+ case MSR_IA32_XFD:
+ if (!msr_info->host_initiated &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD))
+ return 1;
-/*
- * Reads an msr value (of 'msr_index') into 'pdata'.
- * Returns 0 on success, non-0 otherwise.
- * Assumes vcpu_load() was already called.
- */
-int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
-{
- return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
-}
+ if (data & ~kvm_guest_supported_xfd(vcpu))
+ return 1;
-static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
-{
- u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
+ fpu_update_guest_xfd(&vcpu->arch.guest_fpu, data);
+ break;
+ case MSR_IA32_XFD_ERR:
+ if (!msr_info->host_initiated &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD))
+ return 1;
- if (!msr_mtrr_valid(msr))
- return 1;
+ if (data & ~kvm_guest_supported_xfd(vcpu))
+ return 1;
- if (msr == MSR_MTRRdefType)
- *pdata = vcpu->arch.mtrr_state.def_type +
- (vcpu->arch.mtrr_state.enabled << 10);
- else if (msr == MSR_MTRRfix64K_00000)
- *pdata = p[0];
- else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
- *pdata = p[1 + msr - MSR_MTRRfix16K_80000];
- else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
- *pdata = p[3 + msr - MSR_MTRRfix4K_C0000];
- else if (msr == MSR_IA32_CR_PAT)
- *pdata = vcpu->arch.pat;
- else { /* Variable MTRRs */
- int idx, is_mtrr_mask;
- u64 *pt;
-
- idx = (msr - 0x200) / 2;
- is_mtrr_mask = msr - 0x200 - 2 * idx;
- if (!is_mtrr_mask)
- pt =
- (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
- else
- pt =
- (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
- *pdata = *pt;
- }
+ vcpu->arch.guest_fpu.xfd_err = data;
+ break;
+#endif
+ case MSR_IA32_U_CET:
+ case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP:
+ kvm_set_xstate_msr(vcpu, msr_info);
+ break;
+ default:
+ if (kvm_pmu_is_valid_msr(vcpu, msr))
+ return kvm_pmu_set_msr(vcpu, msr_info);
+ return KVM_MSR_RET_UNSUPPORTED;
+ }
return 0;
}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_msr_common);
-int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
{
u64 data;
+ u64 mcg_cap = vcpu->arch.mcg_cap;
+ unsigned bank_num = mcg_cap & 0xff;
+ u32 offset, last_msr;
switch (msr) {
- case 0xc0010010: /* SYSCFG */
- case 0xc0010015: /* HWCR */
- case MSR_IA32_PLATFORM_ID:
case MSR_IA32_P5_MC_ADDR:
case MSR_IA32_P5_MC_TYPE:
- case MSR_IA32_MC0_CTL:
- case MSR_IA32_MCG_STATUS:
+ data = 0;
+ break;
case MSR_IA32_MCG_CAP:
+ data = vcpu->arch.mcg_cap;
+ break;
case MSR_IA32_MCG_CTL:
- case MSR_IA32_MC0_MISC:
- case MSR_IA32_MC0_MISC+4:
- case MSR_IA32_MC0_MISC+8:
- case MSR_IA32_MC0_MISC+12:
- case MSR_IA32_MC0_MISC+16:
- case MSR_IA32_MC0_MISC+20:
- case MSR_IA32_UCODE_REV:
+ if (!(mcg_cap & MCG_CTL_P) && !host)
+ return 1;
+ data = vcpu->arch.mcg_ctl;
+ break;
+ case MSR_IA32_MCG_STATUS:
+ data = vcpu->arch.mcg_status;
+ break;
+ case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
+ last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1;
+ if (msr > last_msr)
+ return 1;
+
+ if (!(mcg_cap & MCG_CMCI_P) && !host)
+ return 1;
+ offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2,
+ last_msr + 1 - MSR_IA32_MC0_CTL2);
+ data = vcpu->arch.mci_ctl2_banks[offset];
+ break;
+ case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
+ last_msr = MSR_IA32_MCx_CTL(bank_num) - 1;
+ if (msr > last_msr)
+ return 1;
+
+ offset = array_index_nospec(msr - MSR_IA32_MC0_CTL,
+ last_msr + 1 - MSR_IA32_MC0_CTL);
+ data = vcpu->arch.mce_banks[offset];
+ break;
+ default:
+ return 1;
+ }
+ *pdata = data;
+ return 0;
+}
+
+int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ switch (msr_info->index) {
+ case MSR_IA32_PLATFORM_ID:
case MSR_IA32_EBL_CR_POWERON:
- case MSR_IA32_DEBUGCTLMSR:
case MSR_IA32_LASTBRANCHFROMIP:
case MSR_IA32_LASTBRANCHTOIP:
case MSR_IA32_LASTINTFROMIP:
case MSR_IA32_LASTINTTOIP:
+ case MSR_AMD64_SYSCFG:
+ case MSR_K8_TSEG_ADDR:
+ case MSR_K8_TSEG_MASK:
case MSR_VM_HSAVE_PA:
- case MSR_P6_EVNTSEL0:
- case MSR_P6_EVNTSEL1:
- case MSR_K7_EVNTSEL0:
- data = 0;
+ case MSR_K8_INT_PENDING_MSG:
+ case MSR_AMD64_NB_CFG:
+ case MSR_FAM10H_MMIO_CONF_BASE:
+ case MSR_AMD64_BU_CFG2:
+ case MSR_IA32_PERF_CTL:
+ case MSR_AMD64_DC_CFG:
+ case MSR_AMD64_TW_CFG:
+ case MSR_F15H_EX_CFG:
+ /*
+ * Intel Sandy Bridge CPUs must support the RAPL (running average power
+ * limit) MSRs. Just return 0, as we do not want to expose the host
+ * data here. Do not conditionalize this on CPUID, as KVM does not do
+ * so for existing CPU-specific MSRs.
+ */
+ case MSR_RAPL_POWER_UNIT:
+ case MSR_PP0_ENERGY_STATUS: /* Power plane 0 (core) */
+ case MSR_PP1_ENERGY_STATUS: /* Power plane 1 (graphics uncore) */
+ case MSR_PKG_ENERGY_STATUS: /* Total package */
+ case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */
+ msr_info->data = 0;
break;
- case MSR_MTRRcap:
- data = 0x500 | KVM_NR_VAR_MTRR;
+ case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
+ case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
+ case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
+ case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
+ if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
+ return kvm_pmu_get_msr(vcpu, msr_info);
+ msr_info->data = 0;
+ break;
+ case MSR_IA32_UCODE_REV:
+ msr_info->data = vcpu->arch.microcode_version;
+ break;
+ case MSR_IA32_ARCH_CAPABILITIES:
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
+ return KVM_MSR_RET_UNSUPPORTED;
+ msr_info->data = vcpu->arch.arch_capabilities;
+ break;
+ case MSR_IA32_PERF_CAPABILITIES:
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM))
+ return KVM_MSR_RET_UNSUPPORTED;
+ msr_info->data = vcpu->arch.perf_capabilities;
+ break;
+ case MSR_IA32_POWER_CTL:
+ msr_info->data = vcpu->arch.msr_ia32_power_ctl;
+ break;
+ case MSR_IA32_TSC: {
+ /*
+ * Intel SDM states that MSR_IA32_TSC read adds the TSC offset
+ * even when not intercepted. The AMD manual doesn't explicitly
+ * state this, but the hardware appears to behave the same.
+ *
+ * On userspace reads and writes, however, we unconditionally
+ * return L1's TSC value to ensure backwards-compatible
+ * behavior for migration.
+ */
+ u64 offset, ratio;
+
+ if (msr_info->host_initiated) {
+ offset = vcpu->arch.l1_tsc_offset;
+ ratio = vcpu->arch.l1_tsc_scaling_ratio;
+ } else {
+ offset = vcpu->arch.tsc_offset;
+ ratio = vcpu->arch.tsc_scaling_ratio;
+ }
+
+ msr_info->data = kvm_scale_tsc(rdtsc(), ratio) + offset;
break;
- case 0x200 ... 0x2ff:
- return get_msr_mtrr(vcpu, msr, pdata);
+ }
+ case MSR_IA32_CR_PAT:
+ msr_info->data = vcpu->arch.pat;
+ break;
+ case MSR_MTRRcap:
+ case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000:
+ case MSR_MTRRdefType:
+ return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
case 0xcd: /* fsb frequency */
- data = 3;
+ msr_info->data = 3;
+ break;
+ /*
+ * MSR_EBC_FREQUENCY_ID
+ * Conservative value valid for even the basic CPU models.
+ * Models 0,1: 000 in bits 23:21 indicating a bus speed of
+ * 100MHz, model 2 000 in bits 18:16 indicating 100MHz,
+ * and 266MHz for model 3, or 4. Set Core Clock
+ * Frequency to System Bus Frequency Ratio to 1 (bits
+ * 31:24) even though these are only valid for CPU
+ * models > 2, however guests may end up dividing or
+ * multiplying by zero otherwise.
+ */
+ case MSR_EBC_FREQUENCY_ID:
+ msr_info->data = 1 << 24;
break;
case MSR_IA32_APICBASE:
- data = kvm_get_apic_base(vcpu);
+ msr_info->data = vcpu->arch.apic_base;
+ break;
+ case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
+ return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
+ case MSR_IA32_TSC_DEADLINE:
+ msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
+ break;
+ case MSR_IA32_TSC_ADJUST:
+ msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
break;
case MSR_IA32_MISC_ENABLE:
- data = vcpu->arch.ia32_misc_enable_msr;
+ msr_info->data = vcpu->arch.ia32_misc_enable_msr;
+ break;
+ case MSR_IA32_SMBASE:
+ if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated)
+ return 1;
+ msr_info->data = vcpu->arch.smbase;
+ break;
+ case MSR_SMI_COUNT:
+ msr_info->data = vcpu->arch.smi_count;
break;
case MSR_IA32_PERF_STATUS:
/* TSC increment by tick */
- data = 1000ULL;
+ msr_info->data = 1000ULL;
/* CPU multiplier */
- data |= (((uint64_t)4ULL) << 40);
+ msr_info->data |= (((uint64_t)4ULL) << 40);
break;
case MSR_EFER:
- data = vcpu->arch.shadow_efer;
+ msr_info->data = vcpu->arch.efer;
break;
case MSR_KVM_WALL_CLOCK:
- data = vcpu->kvm->arch.wall_clock;
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
+ return 1;
+
+ msr_info->data = vcpu->kvm->arch.wall_clock;
+ break;
+ case MSR_KVM_WALL_CLOCK_NEW:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
+ return 1;
+
+ msr_info->data = vcpu->kvm->arch.wall_clock;
break;
case MSR_KVM_SYSTEM_TIME:
- data = vcpu->arch.time;
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
+ return 1;
+
+ msr_info->data = vcpu->arch.time;
+ break;
+ case MSR_KVM_SYSTEM_TIME_NEW:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
+ return 1;
+
+ msr_info->data = vcpu->arch.time;
+ break;
+ case MSR_KVM_ASYNC_PF_EN:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
+ return 1;
+
+ msr_info->data = vcpu->arch.apf.msr_en_val;
+ break;
+ case MSR_KVM_ASYNC_PF_INT:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
+ return 1;
+
+ msr_info->data = vcpu->arch.apf.msr_int_val;
+ break;
+ case MSR_KVM_ASYNC_PF_ACK:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
+ return 1;
+
+ msr_info->data = 0;
+ break;
+ case MSR_KVM_STEAL_TIME:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
+ return 1;
+
+ msr_info->data = vcpu->arch.st.msr_val;
+ break;
+ case MSR_KVM_PV_EOI_EN:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
+ return 1;
+
+ msr_info->data = vcpu->arch.pv_eoi.msr_val;
+ break;
+ case MSR_KVM_POLL_CONTROL:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
+ return 1;
+
+ msr_info->data = vcpu->arch.msr_kvm_poll_control;
+ break;
+ case MSR_IA32_P5_MC_ADDR:
+ case MSR_IA32_P5_MC_TYPE:
+ case MSR_IA32_MCG_CAP:
+ case MSR_IA32_MCG_CTL:
+ case MSR_IA32_MCG_STATUS:
+ case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
+ case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
+ return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
+ msr_info->host_initiated);
+ case MSR_IA32_XSS:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
+ return 1;
+ msr_info->data = vcpu->arch.ia32_xss;
+ break;
+ case MSR_K7_CLK_CTL:
+ /*
+ * Provide the expected ramp-up count for K7. All other
+ * fields are set to zero, indicating minimum divisors for
+ * every field.
+ *
+ * This prevents guest kernels on AMD host with CPU
+ * type 6, model 8 and higher from exploding due to
+ * the rdmsr failing.
+ */
+ msr_info->data = 0x20000000;
+ break;
+#ifdef CONFIG_KVM_HYPERV
+ case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
+ case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
+ case HV_X64_MSR_SYNDBG_OPTIONS:
+ case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+ case HV_X64_MSR_CRASH_CTL:
+ case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
+ case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_STATUS:
+ case HV_X64_MSR_TSC_INVARIANT_CONTROL:
+ return kvm_hv_get_msr_common(vcpu,
+ msr_info->index, &msr_info->data,
+ msr_info->host_initiated);
+#endif
+ case MSR_IA32_BBL_CR_CTL3:
+ /* This legacy MSR exists but isn't fully documented in current
+ * silicon. It is however accessed by winxp in very narrow
+ * scenarios where it sets bit #19, itself documented as
+ * a "reserved" bit. Best effort attempt to source coherent
+ * read data here should the balance of the register be
+ * interpreted by the guest:
+ *
+ * L2 cache control register 3: 64GB range, 256KB size,
+ * enabled, latency 0x1, configured
+ */
+ msr_info->data = 0xbe702111;
+ break;
+ case MSR_AMD64_OSVW_ID_LENGTH:
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW))
+ return 1;
+ msr_info->data = vcpu->arch.osvw.length;
+ break;
+ case MSR_AMD64_OSVW_STATUS:
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW))
+ return 1;
+ msr_info->data = vcpu->arch.osvw.status;
+ break;
+ case MSR_PLATFORM_INFO:
+ if (!msr_info->host_initiated &&
+ !vcpu->kvm->arch.guest_can_read_msr_platform_info)
+ return 1;
+ msr_info->data = vcpu->arch.msr_platform_info;
+ break;
+ case MSR_MISC_FEATURES_ENABLES:
+ msr_info->data = vcpu->arch.msr_misc_features_enables;
+ break;
+ case MSR_K7_HWCR:
+ msr_info->data = vcpu->arch.msr_hwcr;
+ break;
+#ifdef CONFIG_X86_64
+ case MSR_IA32_XFD:
+ if (!msr_info->host_initiated &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD))
+ return 1;
+
+ msr_info->data = vcpu->arch.guest_fpu.fpstate->xfd;
+ break;
+ case MSR_IA32_XFD_ERR:
+ if (!msr_info->host_initiated &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD))
+ return 1;
+
+ msr_info->data = vcpu->arch.guest_fpu.xfd_err;
+ break;
+#endif
+ case MSR_IA32_U_CET:
+ case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP:
+ kvm_get_xstate_msr(vcpu, msr_info);
break;
default:
- pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
- return 1;
+ if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
+ return kvm_pmu_get_msr(vcpu, msr_info);
+
+ return KVM_MSR_RET_UNSUPPORTED;
}
- *pdata = data;
return 0;
}
-EXPORT_SYMBOL_GPL(kvm_get_msr_common);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_msr_common);
/*
* Read or write a bunch of msrs. All parameters are kernel addresses.
@@ -986,17 +4651,25 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
int (*do_msr)(struct kvm_vcpu *vcpu,
unsigned index, u64 *data))
{
+ bool fpu_loaded = false;
int i;
- vcpu_load(vcpu);
-
- down_read(&vcpu->kvm->slots_lock);
- for (i = 0; i < msrs->nmsrs; ++i)
+ for (i = 0; i < msrs->nmsrs; ++i) {
+ /*
+ * If userspace is accessing one or more XSTATE-managed MSRs,
+ * temporarily load the guest's FPU state so that the guest's
+ * MSR value(s) is resident in hardware and thus can be accessed
+ * via RDMSR/WRMSR.
+ */
+ if (!fpu_loaded && is_xstate_managed_msr(vcpu, entries[i].index)) {
+ kvm_load_guest_fpu(vcpu);
+ fpu_loaded = true;
+ }
if (do_msr(vcpu, entries[i].index, &entries[i].data))
break;
- up_read(&vcpu->kvm->slots_lock);
-
- vcpu_put(vcpu);
+ }
+ if (fpu_loaded)
+ kvm_put_guest_fpu(vcpu);
return i;
}
@@ -1013,87 +4686,342 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
{
struct kvm_msrs msrs;
struct kvm_msr_entry *entries;
- int r, n;
unsigned size;
+ int r;
r = -EFAULT;
- if (copy_from_user(&msrs, user_msrs, sizeof msrs))
+ if (copy_from_user(&msrs, user_msrs, sizeof(msrs)))
goto out;
r = -E2BIG;
if (msrs.nmsrs >= MAX_IO_MSRS)
goto out;
- r = -ENOMEM;
size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
- entries = vmalloc(size);
- if (!entries)
+ entries = memdup_user(user_msrs->entries, size);
+ if (IS_ERR(entries)) {
+ r = PTR_ERR(entries);
goto out;
+ }
- r = -EFAULT;
- if (copy_from_user(entries, user_msrs->entries, size))
- goto out_free;
+ r = __msr_io(vcpu, &msrs, entries, do_msr);
- r = n = __msr_io(vcpu, &msrs, entries, do_msr);
- if (r < 0)
- goto out_free;
-
- r = -EFAULT;
if (writeback && copy_to_user(user_msrs->entries, entries, size))
- goto out_free;
-
- r = n;
+ r = -EFAULT;
-out_free:
- vfree(entries);
+ kfree(entries);
out:
return r;
}
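
For reference, a minimal userspace sketch of the KVM_GET_MSRS ioctl this path serves; vcpu_fd is assumed to be an open vCPU file descriptor, and the ioctl returns the number of entries processed. A sketch under those assumptions, not the authoritative VMM code:

#include <stdint.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static uint64_t get_guest_efer(int vcpu_fd)
{
	struct kvm_msrs *req;
	uint64_t val = 0;

	/* one header plus one variable-length entry */
	req = calloc(1, sizeof(*req) + sizeof(struct kvm_msr_entry));
	req->nmsrs = 1;
	req->entries[0].index = 0xc0000080;	/* MSR_EFER */

	if (ioctl(vcpu_fd, KVM_GET_MSRS, req) == 1)
		val = req->entries[0].data;

	free(req);
	return val;
}
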
-int kvm_dev_ioctl_check_extension(long ext)
+static inline bool kvm_can_mwait_in_guest(void)
{
+ return boot_cpu_has(X86_FEATURE_MWAIT) &&
+ !boot_cpu_has_bug(X86_BUG_MONITOR) &&
+ boot_cpu_has(X86_FEATURE_ARAT);
+}
+
+static u64 kvm_get_allowed_disable_exits(void)
+{
+ u64 r = KVM_X86_DISABLE_EXITS_PAUSE;
+
+ if (boot_cpu_has(X86_FEATURE_APERFMPERF))
+ r |= KVM_X86_DISABLE_EXITS_APERFMPERF;
+
+ if (!mitigate_smt_rsb) {
+ r |= KVM_X86_DISABLE_EXITS_HLT |
+ KVM_X86_DISABLE_EXITS_CSTATE;
+
+ if (kvm_can_mwait_in_guest())
+ r |= KVM_X86_DISABLE_EXITS_MWAIT;
+ }
+ return r;
+}
+
+#ifdef CONFIG_KVM_HYPERV
+static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid2 __user *cpuid_arg)
+{
+ struct kvm_cpuid2 cpuid;
int r;
+ r = -EFAULT;
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
+ return r;
+
+ r = kvm_get_hv_cpuid(vcpu, &cpuid, cpuid_arg->entries);
+ if (r)
+ return r;
+
+ r = -EFAULT;
+ if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
+ return r;
+
+ return 0;
+}
+#endif
+
+static bool kvm_is_vm_type_supported(unsigned long type)
+{
+ return type < 32 && (kvm_caps.supported_vm_types & BIT(type));
+}
+
+static inline u64 kvm_sync_valid_fields(struct kvm *kvm)
+{
+ return kvm && kvm->arch.has_protected_state ? 0 : KVM_SYNC_X86_VALID_FIELDS;
+}
+
+int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
+{
+ int r = 0;
+
switch (ext) {
case KVM_CAP_IRQCHIP:
case KVM_CAP_HLT:
case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
case KVM_CAP_SET_TSS_ADDR:
case KVM_CAP_EXT_CPUID:
+ case KVM_CAP_EXT_EMUL_CPUID:
case KVM_CAP_CLOCKSOURCE:
+#ifdef CONFIG_KVM_IOAPIC
case KVM_CAP_PIT:
+ case KVM_CAP_PIT2:
+ case KVM_CAP_PIT_STATE2:
+ case KVM_CAP_REINJECT_CONTROL:
+#endif
case KVM_CAP_NOP_IO_DELAY:
case KVM_CAP_MP_STATE:
case KVM_CAP_SYNC_MMU:
- case KVM_CAP_REINJECT_CONTROL:
+ case KVM_CAP_USER_NMI:
case KVM_CAP_IRQ_INJECT_STATUS:
- case KVM_CAP_ASSIGN_DEV_IRQ:
+ case KVM_CAP_IOEVENTFD:
+ case KVM_CAP_IOEVENTFD_NO_LENGTH:
+
+ case KVM_CAP_SET_IDENTITY_MAP_ADDR:
+ case KVM_CAP_VCPU_EVENTS:
+#ifdef CONFIG_KVM_HYPERV
+ case KVM_CAP_HYPERV:
+ case KVM_CAP_HYPERV_VAPIC:
+ case KVM_CAP_HYPERV_SPIN:
+ case KVM_CAP_HYPERV_TIME:
+ case KVM_CAP_HYPERV_SYNIC:
+ case KVM_CAP_HYPERV_SYNIC2:
+ case KVM_CAP_HYPERV_VP_INDEX:
+ case KVM_CAP_HYPERV_EVENTFD:
+ case KVM_CAP_HYPERV_TLBFLUSH:
+ case KVM_CAP_HYPERV_SEND_IPI:
+ case KVM_CAP_HYPERV_CPUID:
+ case KVM_CAP_HYPERV_ENFORCE_CPUID:
+ case KVM_CAP_SYS_HYPERV_CPUID:
+#endif
+ case KVM_CAP_PCI_SEGMENT:
+ case KVM_CAP_DEBUGREGS:
+ case KVM_CAP_X86_ROBUST_SINGLESTEP:
+ case KVM_CAP_XSAVE:
+ case KVM_CAP_ASYNC_PF:
+ case KVM_CAP_ASYNC_PF_INT:
+ case KVM_CAP_GET_TSC_KHZ:
+ case KVM_CAP_KVMCLOCK_CTRL:
+ case KVM_CAP_IOAPIC_POLARITY_IGNORED:
+ case KVM_CAP_TSC_DEADLINE_TIMER:
+ case KVM_CAP_DISABLE_QUIRKS:
+ case KVM_CAP_SET_BOOT_CPU_ID:
+ case KVM_CAP_SPLIT_IRQCHIP:
+ case KVM_CAP_IMMEDIATE_EXIT:
+ case KVM_CAP_PMU_EVENT_FILTER:
+ case KVM_CAP_PMU_EVENT_MASKED_EVENTS:
+ case KVM_CAP_GET_MSR_FEATURES:
+ case KVM_CAP_MSR_PLATFORM_INFO:
+ case KVM_CAP_EXCEPTION_PAYLOAD:
+ case KVM_CAP_X86_TRIPLE_FAULT_EVENT:
+ case KVM_CAP_SET_GUEST_DEBUG:
+ case KVM_CAP_LAST_CPU:
+ case KVM_CAP_X86_USER_SPACE_MSR:
+ case KVM_CAP_X86_MSR_FILTER:
+ case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
+#ifdef CONFIG_X86_SGX_KVM
+ case KVM_CAP_SGX_ATTRIBUTE:
+#endif
+ case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
+ case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM:
+ case KVM_CAP_SREGS2:
+ case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
+ case KVM_CAP_VCPU_ATTRIBUTES:
+ case KVM_CAP_SYS_ATTRIBUTES:
+ case KVM_CAP_VAPIC:
+ case KVM_CAP_ENABLE_CAP:
+ case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES:
+ case KVM_CAP_IRQFD_RESAMPLE:
+ case KVM_CAP_MEMORY_FAULT_INFO:
+ case KVM_CAP_X86_GUEST_MODE:
+ case KVM_CAP_ONE_REG:
r = 1;
break;
- case KVM_CAP_COALESCED_MMIO:
- r = KVM_COALESCED_MMIO_PAGE_OFFSET;
+ case KVM_CAP_PRE_FAULT_MEMORY:
+ r = tdp_enabled;
break;
- case KVM_CAP_VAPIC:
- r = !kvm_x86_ops->cpu_has_accelerated_tpr();
+ case KVM_CAP_X86_APIC_BUS_CYCLES_NS:
+ r = APIC_BUS_CYCLE_NS_DEFAULT;
+ break;
+ case KVM_CAP_EXIT_HYPERCALL:
+ r = KVM_EXIT_HYPERCALL_VALID_MASK;
+ break;
+ case KVM_CAP_SET_GUEST_DEBUG2:
+ return KVM_GUESTDBG_VALID_MASK;
+#ifdef CONFIG_KVM_XEN
+ case KVM_CAP_XEN_HVM:
+ r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
+ KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
+ KVM_XEN_HVM_CONFIG_SHARED_INFO |
+ KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL |
+ KVM_XEN_HVM_CONFIG_EVTCHN_SEND |
+ KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE |
+ KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA;
+ if (sched_info_on())
+ r |= KVM_XEN_HVM_CONFIG_RUNSTATE |
+ KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG;
+ break;
+#endif
+ case KVM_CAP_SYNC_REGS:
+ r = kvm_sync_valid_fields(kvm);
+ break;
+ case KVM_CAP_ADJUST_CLOCK:
+ r = KVM_CLOCK_VALID_FLAGS;
+ break;
+ case KVM_CAP_X86_DISABLE_EXITS:
+ r = kvm_get_allowed_disable_exits();
+ break;
+ case KVM_CAP_X86_SMM:
+ if (!IS_ENABLED(CONFIG_KVM_SMM))
+ break;
+
+ /* SMBASE is usually relocated above 1M on modern chipsets,
+ * and SMM handlers might indeed rely on 4G segment limits,
+ * so do not report SMM to be available if real mode is
+ * emulated via vm86 mode. Still, do not go to great lengths
+ * to avoid userspace's usage of the feature, because it is a
+ * fringe case that is not enabled except via specific settings
+ * of the module parameters.
+ */
+ r = kvm_x86_call(has_emulated_msr)(kvm, MSR_IA32_SMBASE);
break;
case KVM_CAP_NR_VCPUS:
+ r = min_t(unsigned int, num_online_cpus(), KVM_MAX_VCPUS);
+ break;
+ case KVM_CAP_MAX_VCPUS:
r = KVM_MAX_VCPUS;
+ if (kvm)
+ r = kvm->max_vcpus;
+ break;
+ case KVM_CAP_MAX_VCPU_ID:
+ r = KVM_MAX_VCPU_IDS;
+ break;
+ case KVM_CAP_PV_MMU: /* obsolete */
+ r = 0;
+ break;
+ case KVM_CAP_MCE:
+ r = KVM_MAX_MCE_BANKS;
+ break;
+ case KVM_CAP_XCRS:
+ r = boot_cpu_has(X86_FEATURE_XSAVE);
+ break;
+ case KVM_CAP_TSC_CONTROL:
+ case KVM_CAP_VM_TSC_CONTROL:
+ r = kvm_caps.has_tsc_control;
+ break;
+ case KVM_CAP_X2APIC_API:
+ r = KVM_X2APIC_API_VALID_FLAGS;
+ break;
+ case KVM_CAP_NESTED_STATE:
+ r = kvm_x86_ops.nested_ops->get_state ?
+ kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0;
break;
- case KVM_CAP_NR_MEMSLOTS:
- r = KVM_MEMORY_SLOTS;
+#ifdef CONFIG_KVM_HYPERV
+ case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
+ r = kvm_x86_ops.enable_l2_tlb_flush != NULL;
break;
- case KVM_CAP_PV_MMU:
- r = !tdp_enabled;
+ case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
+ r = kvm_x86_ops.nested_ops->enable_evmcs != NULL;
break;
- case KVM_CAP_IOMMU:
- r = iommu_found();
+#endif
+ case KVM_CAP_SMALLER_MAXPHYADDR:
+ r = (int) allow_smaller_maxphyaddr;
+ break;
+ case KVM_CAP_STEAL_TIME:
+ r = sched_info_on();
+ break;
+ case KVM_CAP_X86_BUS_LOCK_EXIT:
+ if (kvm_caps.has_bus_lock_exit)
+ r = KVM_BUS_LOCK_DETECTION_OFF |
+ KVM_BUS_LOCK_DETECTION_EXIT;
+ else
+ r = 0;
+ break;
+ case KVM_CAP_XSAVE2: {
+ r = xstate_required_size(kvm_get_filtered_xcr0(), false);
+ if (r < sizeof(struct kvm_xsave))
+ r = sizeof(struct kvm_xsave);
+ break;
+ }
+ case KVM_CAP_PMU_CAPABILITY:
+ r = enable_pmu ? KVM_CAP_PMU_VALID_MASK : 0;
+ break;
+ case KVM_CAP_DISABLE_QUIRKS2:
+ r = kvm_caps.supported_quirks;
+ break;
+ case KVM_CAP_X86_NOTIFY_VMEXIT:
+ r = kvm_caps.has_notify_vmexit;
+ break;
+ case KVM_CAP_VM_TYPES:
+ r = kvm_caps.supported_vm_types;
+ break;
+ case KVM_CAP_READONLY_MEM:
+ r = kvm ? kvm_arch_has_readonly_mem(kvm) : 1;
break;
default:
- r = 0;
break;
}
return r;
+}
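
Userspace queries these values with KVM_CHECK_EXTENSION, either on /dev/kvm or on a VM fd (the latter reaches this function with a non-NULL kvm). A minimal sketch, assuming kvm_fd and vm_fd are already open descriptors:

	int recommended_vcpus = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_NR_VCPUS);
	int max_vcpus         = ioctl(vm_fd,  KVM_CHECK_EXTENSION, KVM_CAP_MAX_VCPUS);
	int sync_regs_fields  = ioctl(vm_fd,  KVM_CHECK_EXTENSION, KVM_CAP_SYNC_REGS);
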
+static int __kvm_x86_dev_get_attr(struct kvm_device_attr *attr, u64 *val)
+{
+ if (attr->group) {
+ if (kvm_x86_ops.dev_get_attr)
+ return kvm_x86_call(dev_get_attr)(attr->group, attr->attr, val);
+ return -ENXIO;
+ }
+
+ switch (attr->attr) {
+ case KVM_X86_XCOMP_GUEST_SUPP:
+ *val = kvm_caps.supported_xcr0;
+ return 0;
+ default:
+ return -ENXIO;
+ }
+}
+
+static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr)
+{
+ u64 __user *uaddr = u64_to_user_ptr(attr->addr);
+ int r;
+ u64 val;
+
+ r = __kvm_x86_dev_get_attr(attr, &val);
+ if (r < 0)
+ return r;
+
+ if (put_user(val, uaddr))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int kvm_x86_dev_has_attr(struct kvm_device_attr *attr)
+{
+ u64 val;
+
+ return __kvm_x86_dev_get_attr(attr, &val);
}
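
The system-scoped attribute handled here (group 0, KVM_X86_XCOMP_GUEST_SUPP) is read from userspace via KVM_GET_DEVICE_ATTR on /dev/kvm. A hedged sketch, assuming kvm_fd is the open /dev/kvm descriptor:

	uint64_t xcomp_supp = 0;
	struct kvm_device_attr attr = {
		.group = 0,
		.attr  = KVM_X86_XCOMP_GUEST_SUPP,
		.addr  = (uint64_t)(uintptr_t)&xcomp_supp,
	};

	if (ioctl(kvm_fd, KVM_GET_DEVICE_ATTR, &attr) == 0) {
		/* xcomp_supp now holds the supported guest XCR0 mask */
	}
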
long kvm_arch_dev_ioctl(struct file *filp,
@@ -1109,11 +5037,11 @@ long kvm_arch_dev_ioctl(struct file *filp,
unsigned n;
r = -EFAULT;
- if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
+ if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
goto out;
n = msr_list.nmsrs;
- msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
- if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
+ msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs;
+ if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
goto out;
r = -E2BIG;
if (n < msr_list.nmsrs)
@@ -1124,382 +5052,1112 @@ long kvm_arch_dev_ioctl(struct file *filp,
goto out;
if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
&emulated_msrs,
- ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
+ num_emulated_msrs * sizeof(u32)))
goto out;
r = 0;
break;
}
- case KVM_GET_SUPPORTED_CPUID: {
+ case KVM_GET_SUPPORTED_CPUID:
+ case KVM_GET_EMULATED_CPUID: {
struct kvm_cpuid2 __user *cpuid_arg = argp;
struct kvm_cpuid2 cpuid;
r = -EFAULT;
- if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
goto out;
- r = kvm_dev_ioctl_get_supported_cpuid(&cpuid,
- cpuid_arg->entries);
+
+ r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries,
+ ioctl);
if (r)
goto out;
r = -EFAULT;
- if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
+ if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
goto out;
r = 0;
break;
}
+ case KVM_X86_GET_MCE_CAP_SUPPORTED:
+ r = -EFAULT;
+ if (copy_to_user(argp, &kvm_caps.supported_mce_cap,
+ sizeof(kvm_caps.supported_mce_cap)))
+ goto out;
+ r = 0;
+ break;
+ case KVM_GET_MSR_FEATURE_INDEX_LIST: {
+ struct kvm_msr_list __user *user_msr_list = argp;
+ struct kvm_msr_list msr_list;
+ unsigned int n;
+
+ r = -EFAULT;
+ if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
+ goto out;
+ n = msr_list.nmsrs;
+ msr_list.nmsrs = num_msr_based_features;
+ if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
+ goto out;
+ r = -E2BIG;
+ if (n < msr_list.nmsrs)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(user_msr_list->indices, &msr_based_features,
+ num_msr_based_features * sizeof(u32)))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_GET_MSRS:
+ r = msr_io(NULL, argp, do_get_feature_msr, 1);
+ break;
+#ifdef CONFIG_KVM_HYPERV
+ case KVM_GET_SUPPORTED_HV_CPUID:
+ r = kvm_ioctl_get_supported_hv_cpuid(NULL, argp);
+ break;
+#endif
+ case KVM_GET_DEVICE_ATTR: {
+ struct kvm_device_attr attr;
+ r = -EFAULT;
+ if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
+ break;
+ r = kvm_x86_dev_get_attr(&attr);
+ break;
+ }
+ case KVM_HAS_DEVICE_ATTR: {
+ struct kvm_device_attr attr;
+ r = -EFAULT;
+ if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
+ break;
+ r = kvm_x86_dev_has_attr(&attr);
+ break;
+ }
default:
r = -EINVAL;
+ break;
}
out:
return r;
}
-void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
{
- kvm_x86_ops->vcpu_load(vcpu, cpu);
- kvm_request_guest_time_update(vcpu);
+ return kvm_arch_has_noncoherent_dma(vcpu->kvm);
}
-void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
+static DEFINE_PER_CPU(struct kvm_vcpu *, last_vcpu);
+
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
- kvm_x86_ops->vcpu_put(vcpu);
- kvm_put_guest_fpu(vcpu);
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+ kvm_request_l1tf_flush_l1d();
+
+ if (vcpu->scheduled_out && pmu->version && pmu->event_count) {
+ pmu->need_cleanup = true;
+ kvm_make_request(KVM_REQ_PMU, vcpu);
+ }
+
+ /* Handle the possibility that the guest executes WBINVD */
+ if (need_emulate_wbinvd(vcpu)) {
+ if (kvm_x86_call(has_wbinvd_exit)())
+ cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
+ else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
+ wbinvd_on_cpu(vcpu->cpu);
+ }
+
+ kvm_x86_call(vcpu_load)(vcpu, cpu);
+
+ if (vcpu != per_cpu(last_vcpu, cpu)) {
+ /*
+ * Flush the branch predictor when switching vCPUs on the same
+ * physical CPU, as each vCPU needs its own branch prediction
+ * domain. No IBPB is needed when switching between L1 and L2
+ * on the same vCPU unless IBRS is advertised to the vCPU; that
+ * is handled on the nested VM-Exit path.
+ */
+ if (static_branch_likely(&switch_vcpu_ibpb))
+ indirect_branch_prediction_barrier();
+ per_cpu(last_vcpu, cpu) = vcpu;
+ }
+
+ /* Save host pkru register if supported */
+ vcpu->arch.host_pkru = read_pkru();
+
+ /* Apply any externally detected TSC adjustments (due to suspend) */
+ if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
+ adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
+ vcpu->arch.tsc_offset_adjustment = 0;
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ }
+
+ if (unlikely(vcpu->cpu != cpu) || kvm_check_tsc_unstable()) {
+ s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
+ rdtsc() - vcpu->arch.last_host_tsc;
+ if (tsc_delta < 0)
+ mark_tsc_unstable("KVM discovered backwards TSC");
+
+ if (kvm_check_tsc_unstable()) {
+ u64 offset = kvm_compute_l1_tsc_offset(vcpu,
+ vcpu->arch.last_guest_tsc);
+ kvm_vcpu_write_tsc_offset(vcpu, offset);
+ if (!vcpu->arch.guest_tsc_protected)
+ vcpu->arch.tsc_catchup = 1;
+ }
+
+ if (kvm_lapic_hv_timer_in_use(vcpu))
+ kvm_lapic_restart_hv_timer(vcpu);
+
+ /*
+ * On a host with synchronized TSC, there is no need to update
+ * kvmclock on vcpu->cpu migration
+ */
+ if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
+ kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
+ if (vcpu->cpu != cpu)
+ kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu);
+ vcpu->cpu = cpu;
+ }
+
+ kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
}
-static int is_efer_nx(void)
+static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
{
- unsigned long long efer = 0;
+ struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
+ struct kvm_steal_time __user *st;
+ struct kvm_memslots *slots;
+ static const u8 preempted = KVM_VCPU_PREEMPTED;
+ gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
+
+ /*
+ * The vCPU can be marked preempted if and only if the VM-Exit was on
+ * an instruction boundary and will not trigger guest emulation of any
+ * kind (see vcpu_run). Vendor specific code controls (conservatively)
+ * when this is true, for example allowing the vCPU to be marked
+ * preempted if and only if the VM-Exit was due to a host interrupt.
+ */
+ if (!vcpu->arch.at_instruction_boundary) {
+ vcpu->stat.preemption_other++;
+ return;
+ }
+
+ vcpu->stat.preemption_reported++;
+ if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+ return;
+
+ if (vcpu->arch.st.preempted)
+ return;
+
+ /* This happens on process exit */
+ if (unlikely(current->mm != vcpu->kvm->mm))
+ return;
+
+ slots = kvm_memslots(vcpu->kvm);
- rdmsrl_safe(MSR_EFER, &efer);
- return efer & EFER_NX;
+ if (unlikely(slots->generation != ghc->generation ||
+ gpa != ghc->gpa ||
+ kvm_is_error_hva(ghc->hva) || !ghc->memslot))
+ return;
+
+ st = (struct kvm_steal_time __user *)ghc->hva;
+ BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted));
+
+ if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted)))
+ vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
+
+ mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
}
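
On the guest side, the preempted byte written here is what backs vcpu_is_preempted()-style checks used by paravirt spinlocks; an illustrative sketch only, not the actual guest implementation:

static bool guest_vcpu_is_preempted(struct kvm_steal_time *st)
{
	return !!(READ_ONCE(st->preempted) & KVM_VCPU_PREEMPTED);
}
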
-static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
- int i;
- struct kvm_cpuid_entry2 *e, *entry;
+ int idx;
- entry = NULL;
- for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
- e = &vcpu->arch.cpuid_entries[i];
- if (e->function == 0x80000001) {
- entry = e;
- break;
- }
- }
- if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
- entry->edx &= ~(1 << 20);
- printk(KERN_INFO "kvm: guest NX capability removed\n");
+ if (vcpu->preempted) {
+ /*
+ * Assume protected guests are in-kernel. Inefficient yielding
+ * due to false positives is preferable to never yielding due
+ * to false negatives.
+ */
+ vcpu->arch.preempted_in_kernel = vcpu->arch.guest_state_protected ||
+ !kvm_x86_call(get_cpl_no_cache)(vcpu);
+
+ /*
+ * Take the srcu lock as memslots will be accessed to check the gfn
+ * cache generation against the memslots generation.
+ */
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ if (kvm_xen_msr_enabled(vcpu->kvm))
+ kvm_xen_runstate_set_preempted(vcpu);
+ else
+ kvm_steal_time_set_preempted(vcpu);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
}
+
+ kvm_x86_call(vcpu_put)(vcpu);
+ vcpu->arch.last_host_tsc = rdtsc();
}
-/* when an old userspace process fills a new kernel module */
-static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
- struct kvm_cpuid *cpuid,
- struct kvm_cpuid_entry __user *entries)
+static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
+ struct kvm_lapic_state *s)
{
- int r, i;
- struct kvm_cpuid_entry *cpuid_entries;
+ if (vcpu->arch.apic->guest_apic_protected)
+ return -EINVAL;
- r = -E2BIG;
- if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
- goto out;
- r = -ENOMEM;
- cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
- if (!cpuid_entries)
- goto out;
- r = -EFAULT;
- if (copy_from_user(cpuid_entries, entries,
- cpuid->nent * sizeof(struct kvm_cpuid_entry)))
- goto out_free;
- for (i = 0; i < cpuid->nent; i++) {
- vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
- vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
- vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
- vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
- vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
- vcpu->arch.cpuid_entries[i].index = 0;
- vcpu->arch.cpuid_entries[i].flags = 0;
- vcpu->arch.cpuid_entries[i].padding[0] = 0;
- vcpu->arch.cpuid_entries[i].padding[1] = 0;
- vcpu->arch.cpuid_entries[i].padding[2] = 0;
- }
- vcpu->arch.cpuid_nent = cpuid->nent;
- cpuid_fix_nx_cap(vcpu);
- r = 0;
+ kvm_x86_call(sync_pir_to_irr)(vcpu);
-out_free:
- vfree(cpuid_entries);
-out:
- return r;
+ return kvm_apic_get_state(vcpu, s);
}
-static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
- struct kvm_cpuid2 *cpuid,
- struct kvm_cpuid_entry2 __user *entries)
+static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
+ struct kvm_lapic_state *s)
{
int r;
- r = -E2BIG;
- if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
- goto out;
- r = -EFAULT;
- if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
- cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
- goto out;
- vcpu->arch.cpuid_nent = cpuid->nent;
+ if (vcpu->arch.apic->guest_apic_protected)
+ return -EINVAL;
+
+ r = kvm_apic_set_state(vcpu, s);
+ if (r)
+ return r;
+ update_cr8_intercept(vcpu);
+
return 0;
+}
-out:
- return r;
+static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu)
+{
+ /*
+ * We can accept userspace's request for interrupt injection
+ * as long as we have a place to store the interrupt number.
+ * The actual injection will happen when the CPU is able to
+ * deliver the interrupt.
+ */
+ if (kvm_cpu_has_extint(vcpu))
+ return false;
+
+ /* Acknowledging ExtINT does not happen if LINT0 is masked. */
+ return (!lapic_in_kernel(vcpu) ||
+ kvm_apic_accept_pic_intr(vcpu));
}
-static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
- struct kvm_cpuid2 *cpuid,
- struct kvm_cpuid_entry2 __user *entries)
+static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Do not cause an interrupt window exit if an exception
+ * is pending or an event needs reinjection; userspace
+ * might want to inject the interrupt manually using KVM_SET_REGS
+ * or KVM_SET_SREGS. For that to work, we must be at an
+ * instruction boundary and with no events half-injected.
+ */
+ return (kvm_arch_interrupt_allowed(vcpu) &&
+ kvm_cpu_accept_dm_intr(vcpu) &&
+ !kvm_event_needs_reinjection(vcpu) &&
+ !kvm_is_exception_pending(vcpu));
+}
+
+static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
+ struct kvm_interrupt *irq)
+{
+ if (irq->irq >= KVM_NR_INTERRUPTS)
+ return -EINVAL;
+
+ if (!irqchip_in_kernel(vcpu->kvm)) {
+ kvm_queue_interrupt(vcpu, irq->irq, false);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ return 0;
+ }
+
+ /*
+ * With in-kernel LAPIC, we only use this to inject EXTINT, so
+ * fail for in-kernel 8259.
+ */
+ if (pic_in_kernel(vcpu->kvm))
+ return -ENXIO;
+
+ if (vcpu->arch.pending_external_vector != -1)
+ return -EEXIST;
+
+ vcpu->arch.pending_external_vector = irq->irq;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ return 0;
+}
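
This handler backs the KVM_INTERRUPT vCPU ioctl. A minimal userspace sketch, assuming vcpu_fd is open and the irqchip is in userspace (or split) as required above; the vector is an example value:

	struct kvm_interrupt irq = { .irq = 4 };	/* vector chosen by the VMM */

	if (ioctl(vcpu_fd, KVM_INTERRUPT, &irq) < 0)
		perror("KVM_INTERRUPT");
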
+
+static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
+{
+ kvm_inject_nmi(vcpu);
+
+ return 0;
+}
+
+static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
+ struct kvm_tpr_access_ctl *tac)
+{
+ if (tac->flags)
+ return -EINVAL;
+ vcpu->arch.tpr_access_reporting = !!tac->enabled;
+ return 0;
+}
+
+static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
+ u64 mcg_cap)
{
int r;
+ unsigned bank_num = mcg_cap & 0xff, bank;
- r = -E2BIG;
- if (cpuid->nent < vcpu->arch.cpuid_nent)
+ r = -EINVAL;
+ if (!bank_num || bank_num > KVM_MAX_MCE_BANKS)
goto out;
- r = -EFAULT;
- if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
- vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
+ if (mcg_cap & ~(kvm_caps.supported_mce_cap | 0xff | 0xff0000))
goto out;
- return 0;
+ r = 0;
+ vcpu->arch.mcg_cap = mcg_cap;
+ /* Init IA32_MCG_CTL to all 1s */
+ if (mcg_cap & MCG_CTL_P)
+ vcpu->arch.mcg_ctl = ~(u64)0;
+ /* Init IA32_MCi_CTL to all 1s, IA32_MCi_CTL2 to all 0s */
+ for (bank = 0; bank < bank_num; bank++) {
+ vcpu->arch.mce_banks[bank*4] = ~(u64)0;
+ if (mcg_cap & MCG_CMCI_P)
+ vcpu->arch.mci_ctl2_banks[bank] = 0;
+ }
+ kvm_apic_after_set_mcg_cap(vcpu);
+
+ kvm_x86_call(setup_mce)(vcpu);
out:
- cpuid->nent = vcpu->arch.cpuid_nent;
return r;
}
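
Userspace typically pairs this with KVM_X86_GET_MCE_CAP_SUPPORTED (handled earlier in this patch) to pick a bank count within what KVM supports. A sketch, assuming kvm_fd and vcpu_fd are open and the bank count of 10 is only an example:

	uint64_t mcg_cap;

	if (ioctl(kvm_fd, KVM_X86_GET_MCE_CAP_SUPPORTED, &mcg_cap) == 0) {
		mcg_cap &= ~0xffULL;	/* keep the supported feature bits */
		mcg_cap |= 10;		/* request 10 banks (<= KVM_MAX_MCE_BANKS) */
		ioctl(vcpu_fd, KVM_X86_SETUP_MCE, &mcg_cap);
	}
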
-static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
- u32 index)
+/*
+ * Validate this is an UCNA (uncorrectable no action) error by checking the
+ * MCG_STATUS and MCi_STATUS registers:
+ * - none of the bits for Machine Check Exceptions are set
+ * - both the VAL (valid) and UC (uncorrectable) bits are set
+ * MCI_STATUS_PCC - Processor Context Corrupted
+ * MCI_STATUS_S - Signaled as a Machine Check Exception
+ * MCI_STATUS_AR - Software recoverable Action Required
+ */
+static bool is_ucna(struct kvm_x86_mce *mce)
{
- entry->function = function;
- entry->index = index;
- cpuid_count(entry->function, entry->index,
- &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
- entry->flags = 0;
+ return !mce->mcg_status &&
+ !(mce->status & (MCI_STATUS_PCC | MCI_STATUS_S | MCI_STATUS_AR)) &&
+ (mce->status & MCI_STATUS_VAL) &&
+ (mce->status & MCI_STATUS_UC);
}
-#define F(x) bit(X86_FEATURE_##x)
+static int kvm_vcpu_x86_set_ucna(struct kvm_vcpu *vcpu, struct kvm_x86_mce *mce, u64* banks)
+{
+ u64 mcg_cap = vcpu->arch.mcg_cap;
+
+ banks[1] = mce->status;
+ banks[2] = mce->addr;
+ banks[3] = mce->misc;
+ vcpu->arch.mcg_status = mce->mcg_status;
-static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
- u32 index, int *nent, int maxnent)
+ if (!(mcg_cap & MCG_CMCI_P) ||
+ !(vcpu->arch.mci_ctl2_banks[mce->bank] & MCI_CTL2_CMCI_EN))
+ return 0;
+
+ if (lapic_in_kernel(vcpu))
+ kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTCMCI);
+
+ return 0;
+}
+
+static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
+ struct kvm_x86_mce *mce)
{
- unsigned f_nx = is_efer_nx() ? F(NX) : 0;
-#ifdef CONFIG_X86_64
- unsigned f_lm = F(LM);
-#else
- unsigned f_lm = 0;
-#endif
+ u64 mcg_cap = vcpu->arch.mcg_cap;
+ unsigned bank_num = mcg_cap & 0xff;
+ u64 *banks = vcpu->arch.mce_banks;
- /* cpuid 1.edx */
- const u32 kvm_supported_word0_x86_features =
- F(FPU) | F(VME) | F(DE) | F(PSE) |
- F(TSC) | F(MSR) | F(PAE) | F(MCE) |
- F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
- F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
- F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) |
- 0 /* Reserved, DS, ACPI */ | F(MMX) |
- F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
- 0 /* HTT, TM, Reserved, PBE */;
- /* cpuid 0x80000001.edx */
- const u32 kvm_supported_word1_x86_features =
- F(FPU) | F(VME) | F(DE) | F(PSE) |
- F(TSC) | F(MSR) | F(PAE) | F(MCE) |
- F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
- F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
- F(PAT) | F(PSE36) | 0 /* Reserved */ |
- f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
- F(FXSR) | F(FXSR_OPT) | 0 /* GBPAGES */ | 0 /* RDTSCP */ |
- 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
- /* cpuid 1.ecx */
- const u32 kvm_supported_word4_x86_features =
- F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ |
- 0 /* DS-CPL, VMX, SMX, EST */ |
- 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
- 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
- 0 /* Reserved, DCA */ | F(XMM4_1) |
- F(XMM4_2) | 0 /* x2APIC */ | F(MOVBE) | F(POPCNT) |
- 0 /* Reserved, XSAVE, OSXSAVE */;
- /* cpuid 0x80000001.ecx */
- const u32 kvm_supported_word6_x86_features =
- F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
- F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
- F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) |
- 0 /* SKINIT */ | 0 /* WDT */;
-
- /* all calls to cpuid_count() should be made on the same cpu */
- get_cpu();
- do_cpuid_1_ent(entry, function, index);
- ++*nent;
+ if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
+ return -EINVAL;
- switch (function) {
- case 0:
- entry->eax = min(entry->eax, (u32)0xb);
- break;
- case 1:
- entry->edx &= kvm_supported_word0_x86_features;
- entry->ecx &= kvm_supported_word4_x86_features;
- break;
- /* function 2 entries are STATEFUL. That is, repeated cpuid commands
- * may return different values. This forces us to get_cpu() before
- * issuing the first command, and also to emulate this annoying behavior
- * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
- case 2: {
- int t, times = entry->eax & 0xff;
-
- entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
- entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
- for (t = 1; t < times && *nent < maxnent; ++t) {
- do_cpuid_1_ent(&entry[t], function, 0);
- entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
- ++*nent;
+ banks += array_index_nospec(4 * mce->bank, 4 * bank_num);
+
+ if (is_ucna(mce))
+ return kvm_vcpu_x86_set_ucna(vcpu, mce, banks);
+
+ /*
+ * if IA32_MCG_CTL is not all 1s, the uncorrected error
+ * reporting is disabled
+ */
+ if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
+ vcpu->arch.mcg_ctl != ~(u64)0)
+ return 0;
+ /*
+ * if IA32_MCi_CTL is not all 1s, the uncorrected error
+ * reporting is disabled for the bank
+ */
+ if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
+ return 0;
+ if (mce->status & MCI_STATUS_UC) {
+ if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
+ !kvm_is_cr4_bit_set(vcpu, X86_CR4_MCE)) {
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ return 0;
}
- break;
+ if (banks[1] & MCI_STATUS_VAL)
+ mce->status |= MCI_STATUS_OVER;
+ banks[2] = mce->addr;
+ banks[3] = mce->misc;
+ vcpu->arch.mcg_status = mce->mcg_status;
+ banks[1] = mce->status;
+ kvm_queue_exception(vcpu, MC_VECTOR);
+ } else if (!(banks[1] & MCI_STATUS_VAL)
+ || !(banks[1] & MCI_STATUS_UC)) {
+ if (banks[1] & MCI_STATUS_VAL)
+ mce->status |= MCI_STATUS_OVER;
+ banks[2] = mce->addr;
+ banks[3] = mce->misc;
+ banks[1] = mce->status;
+ } else
+ banks[1] |= MCI_STATUS_OVER;
+ return 0;
+}
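
For illustration, how userspace might feed a UCNA through KVM_X86_SET_MCE so that it takes the is_ucna() path above; the status bits are the architectural MCi_STATUS VAL and UC bits, and the bank and address are example values only:

	struct kvm_x86_mce mce = {
		.bank       = 1,
		.status     = (1ULL << 63) | (1ULL << 61),	/* MCI_STATUS_VAL | MCI_STATUS_UC */
		.addr       = 0x12340000,			/* example address only */
		.mcg_status = 0,				/* left clear, as is_ucna() requires */
	};

	ioctl(vcpu_fd, KVM_X86_SET_MCE, &mce);
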
+
+static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
+ struct kvm_vcpu_events *events)
+{
+ struct kvm_queued_exception *ex;
+
+ process_nmi(vcpu);
+
+#ifdef CONFIG_KVM_SMM
+ if (kvm_check_request(KVM_REQ_SMI, vcpu))
+ process_smi(vcpu);
+#endif
+
+ /*
+ * KVM's ABI only allows for one exception to be migrated. Luckily,
+ * the only time there can be two queued exceptions is if there's a
+ * non-exiting _injected_ exception, and a pending exiting exception.
+ * In that case, ignore the VM-Exiting exception as it's an extension
+ * of the injected exception.
+ */
+ if (vcpu->arch.exception_vmexit.pending &&
+ !vcpu->arch.exception.pending &&
+ !vcpu->arch.exception.injected)
+ ex = &vcpu->arch.exception_vmexit;
+ else
+ ex = &vcpu->arch.exception;
+
+ /*
+ * In guest mode, payload delivery should be deferred if the exception
+ * will be intercepted by L1, e.g. KVM should not modify CR2 if L1
+ * intercepts #PF, ditto for DR6 and #DBs. If the per-VM capability,
+ * KVM_CAP_EXCEPTION_PAYLOAD, is not set, userspace may or may not
+ * propagate the payload and so it cannot be safely deferred. Deliver
+ * the payload if the capability hasn't been requested.
+ */
+ if (!vcpu->kvm->arch.exception_payload_enabled &&
+ ex->pending && ex->has_payload)
+ kvm_deliver_exception_payload(vcpu, ex);
+
+ memset(events, 0, sizeof(*events));
+
+ /*
+ * The API doesn't provide the instruction length for software
+ * exceptions, so don't report them. As long as the guest RIP
+ * isn't advanced, we should expect to encounter the exception
+ * again.
+ */
+ if (!kvm_exception_is_soft(ex->vector)) {
+ events->exception.injected = ex->injected;
+ events->exception.pending = ex->pending;
+ /*
+ * For ABI compatibility, deliberately conflate
+ * pending and injected exceptions when
+ * KVM_CAP_EXCEPTION_PAYLOAD isn't enabled.
+ */
+ if (!vcpu->kvm->arch.exception_payload_enabled)
+ events->exception.injected |= ex->pending;
+ }
+ events->exception.nr = ex->vector;
+ events->exception.has_error_code = ex->has_error_code;
+ events->exception.error_code = ex->error_code;
+ events->exception_has_payload = ex->has_payload;
+ events->exception_payload = ex->payload;
+
+ events->interrupt.injected =
+ vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
+ events->interrupt.nr = vcpu->arch.interrupt.nr;
+ events->interrupt.shadow = kvm_x86_call(get_interrupt_shadow)(vcpu);
+
+ events->nmi.injected = vcpu->arch.nmi_injected;
+ events->nmi.pending = kvm_get_nr_pending_nmis(vcpu);
+ events->nmi.masked = kvm_x86_call(get_nmi_mask)(vcpu);
+
+ /* events->sipi_vector is never valid when reporting to user space */
+
+#ifdef CONFIG_KVM_SMM
+ events->smi.smm = is_smm(vcpu);
+ events->smi.pending = vcpu->arch.smi_pending;
+ events->smi.smm_inside_nmi =
+ !!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK);
+#endif
+ events->smi.latched_init = kvm_lapic_latched_init(vcpu);
+
+ events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
+ | KVM_VCPUEVENT_VALID_SHADOW
+ | KVM_VCPUEVENT_VALID_SMM);
+ if (vcpu->kvm->arch.exception_payload_enabled)
+ events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
+ if (vcpu->kvm->arch.triple_fault_event) {
+ events->triple_fault.pending = kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ events->flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT;
+ }
+}
+
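For orientation, the snapshot assembled here is what userspace retrieves with KVM_GET_VCPU_EVENTS. The following is a hedged sketch of a caller, not code from the patch; vcpu_fd is an assumed open vCPU descriptor.

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>
#include <err.h>

/* Illustrative only: dump a vCPU's pending/injected event state. */
static void dump_vcpu_events(int vcpu_fd)
{
        struct kvm_vcpu_events events;

        if (ioctl(vcpu_fd, KVM_GET_VCPU_EVENTS, &events) < 0)
                err(1, "KVM_GET_VCPU_EVENTS");

        printf("exception: nr=%u pending=%u injected=%u\n",
               events.exception.nr, events.exception.pending,
               events.exception.injected);
        printf("nmi: pending=%u masked=%u  smm=%u\n",
               events.nmi.pending, events.nmi.masked, events.smi.smm);
}
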
+static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
+ struct kvm_vcpu_events *events)
+{
+ if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
+ | KVM_VCPUEVENT_VALID_SIPI_VECTOR
+ | KVM_VCPUEVENT_VALID_SHADOW
+ | KVM_VCPUEVENT_VALID_SMM
+ | KVM_VCPUEVENT_VALID_PAYLOAD
+ | KVM_VCPUEVENT_VALID_TRIPLE_FAULT))
+ return -EINVAL;
+
+ if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
+ if (!vcpu->kvm->arch.exception_payload_enabled)
+ return -EINVAL;
+ if (events->exception.pending)
+ events->exception.injected = 0;
+ else
+ events->exception_has_payload = 0;
+ } else {
+ events->exception.pending = 0;
+ events->exception_has_payload = 0;
}
- /* function 4 and 0xb have additional index. */
- case 4: {
- int i, cache_type;
-
- entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
- /* read more entries until cache_type is zero */
- for (i = 1; *nent < maxnent; ++i) {
- cache_type = entry[i - 1].eax & 0x1f;
- if (!cache_type)
- break;
- do_cpuid_1_ent(&entry[i], function, i);
- entry[i].flags |=
- KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
- ++*nent;
+
+ if ((events->exception.injected || events->exception.pending) &&
+ (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR))
+ return -EINVAL;
+
+ process_nmi(vcpu);
+
+ /*
+ * Flag that userspace is stuffing an exception; the next KVM_RUN will
+ * morph the exception to a VM-Exit if appropriate. Do this only for
+ * pending exceptions, as already-injected exceptions are not subject to
+ * interception. Note, userspace that conflates pending and injected
+ * is hosed, and will incorrectly convert an injected exception into a
+ * pending exception, which in turn may cause a spurious VM-Exit.
+ */
+ vcpu->arch.exception_from_userspace = events->exception.pending;
+
+ vcpu->arch.exception_vmexit.pending = false;
+
+ vcpu->arch.exception.injected = events->exception.injected;
+ vcpu->arch.exception.pending = events->exception.pending;
+ vcpu->arch.exception.vector = events->exception.nr;
+ vcpu->arch.exception.has_error_code = events->exception.has_error_code;
+ vcpu->arch.exception.error_code = events->exception.error_code;
+ vcpu->arch.exception.has_payload = events->exception_has_payload;
+ vcpu->arch.exception.payload = events->exception_payload;
+
+ vcpu->arch.interrupt.injected = events->interrupt.injected;
+ vcpu->arch.interrupt.nr = events->interrupt.nr;
+ vcpu->arch.interrupt.soft = events->interrupt.soft;
+ if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
+ kvm_x86_call(set_interrupt_shadow)(vcpu,
+ events->interrupt.shadow);
+
+ vcpu->arch.nmi_injected = events->nmi.injected;
+ if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) {
+ vcpu->arch.nmi_pending = 0;
+ atomic_set(&vcpu->arch.nmi_queued, events->nmi.pending);
+ if (events->nmi.pending)
+ kvm_make_request(KVM_REQ_NMI, vcpu);
+ }
+ kvm_x86_call(set_nmi_mask)(vcpu, events->nmi.masked);
+
+ if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
+ lapic_in_kernel(vcpu))
+ vcpu->arch.apic->sipi_vector = events->sipi_vector;
+
+ if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
+#ifdef CONFIG_KVM_SMM
+ if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
+ kvm_leave_nested(vcpu);
+ kvm_smm_changed(vcpu, events->smi.smm);
+ }
+
+ vcpu->arch.smi_pending = events->smi.pending;
+
+ if (events->smi.smm) {
+ if (events->smi.smm_inside_nmi)
+ vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
+ else
+ vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
+ }
+
+#else
+ if (events->smi.smm || events->smi.pending ||
+ events->smi.smm_inside_nmi)
+ return -EINVAL;
+#endif
+
+ if (lapic_in_kernel(vcpu)) {
+ if (events->smi.latched_init)
+ set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+ else
+ clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
}
- break;
}
- case 0xb: {
- int i, level_type;
- entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
- /* read more entries until level_type is zero */
- for (i = 1; *nent < maxnent; ++i) {
- level_type = entry[i - 1].ecx & 0xff00;
- if (!level_type)
- break;
- do_cpuid_1_ent(&entry[i], function, i);
- entry[i].flags |=
- KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
- ++*nent;
+ if (events->flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT) {
+ if (!vcpu->kvm->arch.triple_fault_event)
+ return -EINVAL;
+ if (events->triple_fault.pending)
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ else
+ kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ }
+
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ return 0;
+}
+
+static int kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
+ struct kvm_debugregs *dbgregs)
+{
+ unsigned int i;
+
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ return -EINVAL;
+
+ memset(dbgregs, 0, sizeof(*dbgregs));
+
+ BUILD_BUG_ON(ARRAY_SIZE(vcpu->arch.db) != ARRAY_SIZE(dbgregs->db));
+ for (i = 0; i < ARRAY_SIZE(vcpu->arch.db); i++)
+ dbgregs->db[i] = vcpu->arch.db[i];
+
+ dbgregs->dr6 = vcpu->arch.dr6;
+ dbgregs->dr7 = vcpu->arch.dr7;
+ return 0;
+}
+
+static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
+ struct kvm_debugregs *dbgregs)
+{
+ unsigned int i;
+
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ return -EINVAL;
+
+ if (dbgregs->flags)
+ return -EINVAL;
+
+ if (!kvm_dr6_valid(dbgregs->dr6))
+ return -EINVAL;
+ if (!kvm_dr7_valid(dbgregs->dr7))
+ return -EINVAL;
+
+ for (i = 0; i < ARRAY_SIZE(vcpu->arch.db); i++)
+ vcpu->arch.db[i] = dbgregs->db[i];
+
+ kvm_update_dr0123(vcpu);
+ vcpu->arch.dr6 = dbgregs->dr6;
+ vcpu->arch.dr7 = dbgregs->dr7;
+ kvm_update_dr7(vcpu);
+
+ return 0;
+}
+
+
+static int kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
+ u8 *state, unsigned int size)
+{
+ /*
+ * Only copy state for features that are enabled for the guest. The
+ * state itself isn't problematic, but setting bits in the header for
+ * features that are supported in *this* host but not exposed to the
+ * guest can result in KVM_SET_XSAVE failing when live migrating to a
+ * host that is otherwise compatible but lacks the features that were
+ * never exposed to the guest.
+ *
+ * FP+SSE can always be saved/restored via KVM_{G,S}ET_XSAVE, even if
+ * XSAVE/XCR0 are not exposed to the guest, and even if XSAVE isn't
+ * supported by the host.
+ */
+ u64 supported_xcr0 = vcpu->arch.guest_supported_xcr0 |
+ XFEATURE_MASK_FPSSE;
+
+ if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
+ return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
+
+ fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, state, size,
+ supported_xcr0, vcpu->arch.pkru);
+ return 0;
+}
+
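A companion sketch of the userspace side, which sizes its buffer from KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) before issuing KVM_GET_XSAVE2. vm_fd and vcpu_fd are assumed descriptors; the snippet is illustrative rather than taken from this series.

#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>
#include <err.h>

/* Illustrative only: fetch guest XSAVE state using the dynamically sized API. */
static struct kvm_xsave *get_xsave2(int vm_fd, int vcpu_fd)
{
        struct kvm_xsave *xsave;
        int size;

        /* KVM_CAP_XSAVE2 reports the buffer size needed for this VM. */
        size = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_XSAVE2);
        if (size < (int)sizeof(struct kvm_xsave))
                size = sizeof(struct kvm_xsave);

        xsave = calloc(1, size);
        if (!xsave)
                err(1, "calloc");

        if (ioctl(vcpu_fd, KVM_GET_XSAVE2, xsave) < 0)
                err(1, "KVM_GET_XSAVE2");

        return xsave;
}
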
+static int kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
+ struct kvm_xsave *guest_xsave)
+{
+ return kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region,
+ sizeof(guest_xsave->region));
+}
+
+static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
+ struct kvm_xsave *guest_xsave)
+{
+ if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
+ return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
+
+ return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu,
+ guest_xsave->region,
+ kvm_caps.supported_xcr0,
+ &vcpu->arch.pkru);
+}
+
+static int kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
+ struct kvm_xcrs *guest_xcrs)
+{
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ return -EINVAL;
+
+ if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
+ guest_xcrs->nr_xcrs = 0;
+ return 0;
+ }
+
+ guest_xcrs->nr_xcrs = 1;
+ guest_xcrs->flags = 0;
+ guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
+ guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
+ return 0;
+}
+
+static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
+ struct kvm_xcrs *guest_xcrs)
+{
+ int i, r = 0;
+
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ return -EINVAL;
+
+ if (!boot_cpu_has(X86_FEATURE_XSAVE))
+ return -EINVAL;
+
+ if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
+ return -EINVAL;
+
+ for (i = 0; i < guest_xcrs->nr_xcrs; i++)
+ /* Only support XCR0 currently */
+ if (guest_xcrs->xcrs[i].xcr == XCR_XFEATURE_ENABLED_MASK) {
+ r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
+ guest_xcrs->xcrs[i].value);
+ break;
}
+ if (r)
+ r = -EINVAL;
+ return r;
+}
+
+/*
+ * kvm_set_guest_paused() indicates to the guest kernel that it has been
+ * stopped by the hypervisor. This function will be called from the host only.
+ * EINVAL is returned when the host attempts to set the flag for a guest that
+ * does not support pv clocks.
+ */
+static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
+{
+ if (!vcpu->arch.pv_time.active)
+ return -EINVAL;
+ vcpu->arch.pvclock_set_guest_stopped_request = true;
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ return 0;
+}
+
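The request set here originates from the KVM_KVMCLOCK_CTRL ioctl, which a VMM typically issues right after resuming a vCPU it had stopped so the guest's soft-lockup watchdog is not spooked. A hedged sketch, with vcpu_fd assumed:

#include <sys/ioctl.h>
#include <linux/kvm.h>
#include <errno.h>
#include <err.h>

/* Illustrative only: tell the guest it was paused by the host. */
static void mark_guest_paused(int vcpu_fd)
{
        if (ioctl(vcpu_fd, KVM_KVMCLOCK_CTRL, 0) < 0) {
                /* EINVAL simply means the guest isn't using kvmclock. */
                if (errno != EINVAL)
                        err(1, "KVM_KVMCLOCK_CTRL");
        }
}
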
+static int kvm_arch_tsc_has_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr)
+{
+ int r;
+
+ switch (attr->attr) {
+ case KVM_VCPU_TSC_OFFSET:
+ r = 0;
break;
+ default:
+ r = -ENXIO;
}
- case 0x80000000:
- entry->eax = min(entry->eax, 0x8000001a);
- break;
- case 0x80000001:
- entry->edx &= kvm_supported_word1_x86_features;
- entry->ecx &= kvm_supported_word6_x86_features;
+
+ return r;
+}
+
+static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr)
+{
+ u64 __user *uaddr = u64_to_user_ptr(attr->addr);
+ int r;
+
+ switch (attr->attr) {
+ case KVM_VCPU_TSC_OFFSET:
+ r = -EFAULT;
+ if (put_user(vcpu->arch.l1_tsc_offset, uaddr))
+ break;
+ r = 0;
break;
+ default:
+ r = -ENXIO;
}
- put_cpu();
+
+ return r;
}
-#undef F
+static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr)
+{
+ u64 __user *uaddr = u64_to_user_ptr(attr->addr);
+ struct kvm *kvm = vcpu->kvm;
+ int r;
+
+ switch (attr->attr) {
+ case KVM_VCPU_TSC_OFFSET: {
+ u64 offset, tsc, ns;
+ unsigned long flags;
+ bool matched;
+
+ r = -EFAULT;
+ if (get_user(offset, uaddr))
+ break;
-static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
- struct kvm_cpuid_entry2 __user *entries)
+ raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
+
+ matched = (vcpu->arch.virtual_tsc_khz &&
+ kvm->arch.last_tsc_khz == vcpu->arch.virtual_tsc_khz &&
+ kvm->arch.last_tsc_offset == offset);
+
+ tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset;
+ ns = get_kvmclock_base_ns();
+
+ __kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched, true);
+ raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
+
+ r = 0;
+ break;
+ }
+ default:
+ r = -ENXIO;
+ }
+
+ return r;
+}
+
+static int kvm_vcpu_ioctl_device_attr(struct kvm_vcpu *vcpu,
+ unsigned int ioctl,
+ void __user *argp)
{
- struct kvm_cpuid_entry2 *cpuid_entries;
- int limit, nent = 0, r = -E2BIG;
- u32 func;
+ struct kvm_device_attr attr;
+ int r;
- if (cpuid->nent < 1)
- goto out;
- r = -ENOMEM;
- cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
- if (!cpuid_entries)
- goto out;
+ if (copy_from_user(&attr, argp, sizeof(attr)))
+ return -EFAULT;
- do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
- limit = cpuid_entries[0].eax;
- for (func = 1; func <= limit && nent < cpuid->nent; ++func)
- do_cpuid_ent(&cpuid_entries[nent], func, 0,
- &nent, cpuid->nent);
- r = -E2BIG;
- if (nent >= cpuid->nent)
- goto out_free;
+ if (attr.group != KVM_VCPU_TSC_CTRL)
+ return -ENXIO;
- do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
- limit = cpuid_entries[nent - 1].eax;
- for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
- do_cpuid_ent(&cpuid_entries[nent], func, 0,
- &nent, cpuid->nent);
- r = -EFAULT;
- if (copy_to_user(entries, cpuid_entries,
- nent * sizeof(struct kvm_cpuid_entry2)))
- goto out_free;
- cpuid->nent = nent;
- r = 0;
+ switch (ioctl) {
+ case KVM_HAS_DEVICE_ATTR:
+ r = kvm_arch_tsc_has_attr(vcpu, &attr);
+ break;
+ case KVM_GET_DEVICE_ATTR:
+ r = kvm_arch_tsc_get_attr(vcpu, &attr);
+ break;
+ case KVM_SET_DEVICE_ATTR:
+ r = kvm_arch_tsc_set_attr(vcpu, &attr);
+ break;
+ }
-out_free:
- vfree(cpuid_entries);
-out:
return r;
}
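To show how the KVM_VCPU_TSC_CTRL group above is reached, here is an illustrative userspace sketch that copies the L1 TSC offset from one vCPU to another via the generic device-attribute ioctls; the descriptors are assumptions for the example and the code is not part of the patch.

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>
#include <err.h>

/* Illustrative only: copy the L1 TSC offset from one vCPU fd to another. */
static void copy_tsc_offset(int src_vcpu_fd, int dst_vcpu_fd)
{
        uint64_t offset;
        struct kvm_device_attr attr = {
                .group = KVM_VCPU_TSC_CTRL,
                .attr  = KVM_VCPU_TSC_OFFSET,
                .addr  = (uint64_t)(uintptr_t)&offset,
        };

        if (ioctl(src_vcpu_fd, KVM_GET_DEVICE_ATTR, &attr) < 0)
                err(1, "KVM_GET_DEVICE_ATTR");
        if (ioctl(dst_vcpu_fd, KVM_SET_DEVICE_ATTR, &attr) < 0)
                err(1, "KVM_SET_DEVICE_ATTR");
}
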
-static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
- struct kvm_lapic_state *s)
+static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
+ struct kvm_enable_cap *cap)
{
- vcpu_load(vcpu);
- memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
- vcpu_put(vcpu);
+ if (cap->flags)
+ return -EINVAL;
- return 0;
+ switch (cap->cap) {
+#ifdef CONFIG_KVM_HYPERV
+ case KVM_CAP_HYPERV_SYNIC2:
+ if (cap->args[0])
+ return -EINVAL;
+ fallthrough;
+
+ case KVM_CAP_HYPERV_SYNIC:
+ if (!irqchip_in_kernel(vcpu->kvm))
+ return -EINVAL;
+ return kvm_hv_activate_synic(vcpu, cap->cap ==
+ KVM_CAP_HYPERV_SYNIC2);
+ case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
+ {
+ int r;
+ uint16_t vmcs_version;
+ void __user *user_ptr;
+
+ if (!kvm_x86_ops.nested_ops->enable_evmcs)
+ return -ENOTTY;
+ r = kvm_x86_ops.nested_ops->enable_evmcs(vcpu, &vmcs_version);
+ if (!r) {
+ user_ptr = (void __user *)(uintptr_t)cap->args[0];
+ if (copy_to_user(user_ptr, &vmcs_version,
+ sizeof(vmcs_version)))
+ r = -EFAULT;
+ }
+ return r;
+ }
+ case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
+ if (!kvm_x86_ops.enable_l2_tlb_flush)
+ return -ENOTTY;
+
+ return kvm_x86_call(enable_l2_tlb_flush)(vcpu);
+
+ case KVM_CAP_HYPERV_ENFORCE_CPUID:
+ return kvm_hv_set_enforce_cpuid(vcpu, cap->args[0]);
+#endif
+
+ case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
+ vcpu->arch.pv_cpuid.enforce = cap->args[0];
+ return 0;
+ default:
+ return -EINVAL;
+ }
}
-static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
- struct kvm_lapic_state *s)
+struct kvm_x86_reg_id {
+ __u32 index;
+ __u8 type;
+ __u8 rsvd1;
+ __u8 rsvd2:4;
+ __u8 size:4;
+ __u8 x86;
+};
+
+static int kvm_translate_kvm_reg(struct kvm_vcpu *vcpu,
+ struct kvm_x86_reg_id *reg)
{
- vcpu_load(vcpu);
- memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
- kvm_apic_post_state_restore(vcpu);
- vcpu_put(vcpu);
+ switch (reg->index) {
+ case KVM_REG_GUEST_SSP:
+ /*
+ * FIXME: If host-initiated accesses are ever exempted from
+ * ignore_msrs (in kvm_do_msr_access()), drop this manual check
+ * and rely on KVM's standard checks to reject accesses to regs
+ * that don't exist.
+ */
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK))
+ return -EINVAL;
+ reg->type = KVM_X86_REG_TYPE_MSR;
+ reg->index = MSR_KVM_INTERNAL_GUEST_SSP;
+ break;
+ default:
+ return -EINVAL;
+ }
return 0;
}
-static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
- struct kvm_interrupt *irq)
+static int kvm_get_one_msr(struct kvm_vcpu *vcpu, u32 msr, u64 __user *user_val)
{
- if (irq->irq < 0 || irq->irq >= 256)
- return -EINVAL;
- if (irqchip_in_kernel(vcpu->kvm))
- return -ENXIO;
- vcpu_load(vcpu);
+ u64 val;
- kvm_queue_interrupt(vcpu, irq->irq, false);
+ if (do_get_msr(vcpu, msr, &val))
+ return -EINVAL;
- vcpu_put(vcpu);
+ if (put_user(val, user_val))
+ return -EFAULT;
return 0;
}
-static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
+static int kvm_set_one_msr(struct kvm_vcpu *vcpu, u32 msr, u64 __user *user_val)
{
- vcpu_load(vcpu);
- kvm_inject_nmi(vcpu);
- vcpu_put(vcpu);
+ u64 val;
+
+ if (get_user(val, user_val))
+ return -EFAULT;
+
+ if (do_set_msr(vcpu, msr, &val))
+ return -EINVAL;
return 0;
}
-static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
- struct kvm_tpr_access_ctl *tac)
+static int kvm_get_set_one_reg(struct kvm_vcpu *vcpu, unsigned int ioctl,
+ void __user *argp)
{
- if (tac->flags)
+ struct kvm_one_reg one_reg;
+ struct kvm_x86_reg_id *reg;
+ u64 __user *user_val;
+ bool load_fpu;
+ int r;
+
+ if (copy_from_user(&one_reg, argp, sizeof(one_reg)))
+ return -EFAULT;
+
+ if ((one_reg.id & KVM_REG_ARCH_MASK) != KVM_REG_X86)
return -EINVAL;
- vcpu->arch.tpr_access_reporting = !!tac->enabled;
+
+ reg = (struct kvm_x86_reg_id *)&one_reg.id;
+ if (reg->rsvd1 || reg->rsvd2)
+ return -EINVAL;
+
+ if (reg->type == KVM_X86_REG_TYPE_KVM) {
+ r = kvm_translate_kvm_reg(vcpu, reg);
+ if (r)
+ return r;
+ }
+
+ if (reg->type != KVM_X86_REG_TYPE_MSR)
+ return -EINVAL;
+
+ if ((one_reg.id & KVM_REG_SIZE_MASK) != KVM_REG_SIZE_U64)
+ return -EINVAL;
+
+ guard(srcu)(&vcpu->kvm->srcu);
+
+ load_fpu = is_xstate_managed_msr(vcpu, reg->index);
+ if (load_fpu)
+ kvm_load_guest_fpu(vcpu);
+
+ user_val = u64_to_user_ptr(one_reg.addr);
+ if (ioctl == KVM_GET_ONE_REG)
+ r = kvm_get_one_msr(vcpu, reg->index, user_val);
+ else
+ r = kvm_set_one_msr(vcpu, reg->index, user_val);
+
+ if (load_fpu)
+ kvm_put_guest_fpu(vcpu);
+ return r;
+}
+
+static int kvm_get_reg_list(struct kvm_vcpu *vcpu,
+ struct kvm_reg_list __user *user_list)
+{
+ u64 nr_regs = guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) ? 1 : 0;
+ u64 user_nr_regs;
+
+ if (get_user(user_nr_regs, &user_list->n))
+ return -EFAULT;
+
+ if (put_user(nr_regs, &user_list->n))
+ return -EFAULT;
+
+ if (user_nr_regs < nr_regs)
+ return -E2BIG;
+
+ if (nr_regs &&
+ put_user(KVM_X86_REG_KVM(KVM_REG_GUEST_SSP), &user_list->reg[0]))
+ return -EFAULT;
+
return 0;
}
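As a rough sketch of the uABI this code implements, userspace would read the guest shadow-stack pointer as a 64-bit register through KVM_GET_ONE_REG. The KVM_X86_REG_KVM() and KVM_REG_GUEST_SSP names below are the definitions this patch relies on; vcpu_fd is assumed, and the snippet is illustrative only.

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>
#include <err.h>

/* Illustrative only: read the guest shadow-stack pointer (requires SHSTK). */
static uint64_t get_guest_ssp(int vcpu_fd)
{
        uint64_t ssp;
        struct kvm_one_reg reg = {
                .id   = KVM_X86_REG_KVM(KVM_REG_GUEST_SSP),
                .addr = (uint64_t)(uintptr_t)&ssp,
        };

        if (ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
                err(1, "KVM_GET_ONE_REG");

        return ssp;
}
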
@@ -1509,55 +6167,64 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
struct kvm_vcpu *vcpu = filp->private_data;
void __user *argp = (void __user *)arg;
int r;
- struct kvm_lapic_state *lapic = NULL;
+ union {
+ struct kvm_sregs2 *sregs2;
+ struct kvm_lapic_state *lapic;
+ struct kvm_xsave *xsave;
+ struct kvm_xcrs *xcrs;
+ void *buffer;
+ } u;
+
+ vcpu_load(vcpu);
+ u.buffer = NULL;
switch (ioctl) {
case KVM_GET_LAPIC: {
- lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
+ r = -EINVAL;
+ if (!lapic_in_kernel(vcpu))
+ goto out;
+ u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
r = -ENOMEM;
- if (!lapic)
+ if (!u.lapic)
goto out;
- r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic);
+ r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
if (r)
goto out;
r = -EFAULT;
- if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state)))
+ if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
goto out;
r = 0;
break;
}
case KVM_SET_LAPIC: {
- lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
- r = -ENOMEM;
- if (!lapic)
- goto out;
- r = -EFAULT;
- if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state)))
- goto out;
- r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic);
- if (r)
+ r = -EINVAL;
+ if (!lapic_in_kernel(vcpu))
goto out;
- r = 0;
+ u.lapic = memdup_user(argp, sizeof(*u.lapic));
+ if (IS_ERR(u.lapic)) {
+ r = PTR_ERR(u.lapic);
+ goto out_nofree;
+ }
+
+ r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
break;
}
case KVM_INTERRUPT: {
struct kvm_interrupt irq;
r = -EFAULT;
- if (copy_from_user(&irq, argp, sizeof irq))
+ if (copy_from_user(&irq, argp, sizeof(irq)))
goto out;
r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
- if (r)
- goto out;
- r = 0;
break;
}
case KVM_NMI: {
r = kvm_vcpu_ioctl_nmi(vcpu);
- if (r)
- goto out;
- r = 0;
+ break;
+ }
+ case KVM_SMI: {
+ r = kvm_inject_smi(vcpu);
break;
}
case KVM_SET_CPUID: {
@@ -1565,11 +6232,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
struct kvm_cpuid cpuid;
r = -EFAULT;
- if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
goto out;
r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
- if (r)
- goto out;
break;
}
case KVM_SET_CPUID2: {
@@ -1577,12 +6242,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
struct kvm_cpuid2 cpuid;
r = -EFAULT;
- if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
goto out;
r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
cpuid_arg->entries);
- if (r)
- goto out;
break;
}
case KVM_GET_CPUID2: {
@@ -1590,177 +6253,698 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
struct kvm_cpuid2 cpuid;
r = -EFAULT;
- if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
goto out;
r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
cpuid_arg->entries);
if (r)
goto out;
r = -EFAULT;
- if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
+ if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
goto out;
r = 0;
break;
}
- case KVM_GET_MSRS:
- r = msr_io(vcpu, argp, kvm_get_msr, 1);
+ case KVM_GET_MSRS: {
+ int idx = srcu_read_lock(&vcpu->kvm->srcu);
+ r = msr_io(vcpu, argp, do_get_msr, 1);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
break;
- case KVM_SET_MSRS:
+ }
+ case KVM_SET_MSRS: {
+ int idx = srcu_read_lock(&vcpu->kvm->srcu);
r = msr_io(vcpu, argp, do_set_msr, 0);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ break;
+ }
+ case KVM_GET_ONE_REG:
+ case KVM_SET_ONE_REG:
+ r = kvm_get_set_one_reg(vcpu, ioctl, argp);
+ break;
+ case KVM_GET_REG_LIST:
+ r = kvm_get_reg_list(vcpu, argp);
break;
case KVM_TPR_ACCESS_REPORTING: {
struct kvm_tpr_access_ctl tac;
r = -EFAULT;
- if (copy_from_user(&tac, argp, sizeof tac))
+ if (copy_from_user(&tac, argp, sizeof(tac)))
goto out;
r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
if (r)
goto out;
r = -EFAULT;
- if (copy_to_user(argp, &tac, sizeof tac))
+ if (copy_to_user(argp, &tac, sizeof(tac)))
goto out;
r = 0;
break;
};
case KVM_SET_VAPIC_ADDR: {
struct kvm_vapic_addr va;
+ int idx;
r = -EINVAL;
- if (!irqchip_in_kernel(vcpu->kvm))
+ if (!lapic_in_kernel(vcpu))
+ goto out;
+ r = -EFAULT;
+ if (copy_from_user(&va, argp, sizeof(va)))
+ goto out;
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ break;
+ }
+ case KVM_X86_SETUP_MCE: {
+ u64 mcg_cap;
+
+ r = -EFAULT;
+ if (copy_from_user(&mcg_cap, argp, sizeof(mcg_cap)))
goto out;
+ r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
+ break;
+ }
+ case KVM_X86_SET_MCE: {
+ struct kvm_x86_mce mce;
+
r = -EFAULT;
- if (copy_from_user(&va, argp, sizeof va))
+ if (copy_from_user(&mce, argp, sizeof(mce)))
goto out;
+ r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
+ break;
+ }
+ case KVM_GET_VCPU_EVENTS: {
+ struct kvm_vcpu_events events;
+
+ kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events);
+
+ r = -EFAULT;
+ if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
+ break;
r = 0;
- kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
break;
}
+ case KVM_SET_VCPU_EVENTS: {
+ struct kvm_vcpu_events events;
+
+ r = -EFAULT;
+ if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
+ break;
+
+ kvm_vcpu_srcu_read_lock(vcpu);
+ r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
+ kvm_vcpu_srcu_read_unlock(vcpu);
+ break;
+ }
+ case KVM_GET_DEBUGREGS: {
+ struct kvm_debugregs dbgregs;
+
+ r = kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
+ if (r < 0)
+ break;
+
+ r = -EFAULT;
+ if (copy_to_user(argp, &dbgregs,
+ sizeof(struct kvm_debugregs)))
+ break;
+ r = 0;
+ break;
+ }
+ case KVM_SET_DEBUGREGS: {
+ struct kvm_debugregs dbgregs;
+
+ r = -EFAULT;
+ if (copy_from_user(&dbgregs, argp,
+ sizeof(struct kvm_debugregs)))
+ break;
+
+ r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
+ break;
+ }
+ case KVM_GET_XSAVE: {
+ r = -EINVAL;
+ if (vcpu->arch.guest_fpu.uabi_size > sizeof(struct kvm_xsave))
+ break;
+
+ u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
+ r = -ENOMEM;
+ if (!u.xsave)
+ break;
+
+ r = kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
+ if (r < 0)
+ break;
+
+ r = -EFAULT;
+ if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
+ break;
+ r = 0;
+ break;
+ }
+ case KVM_SET_XSAVE: {
+ int size = vcpu->arch.guest_fpu.uabi_size;
+
+ u.xsave = memdup_user(argp, size);
+ if (IS_ERR(u.xsave)) {
+ r = PTR_ERR(u.xsave);
+ goto out_nofree;
+ }
+
+ r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
+ break;
+ }
+
+ case KVM_GET_XSAVE2: {
+ int size = vcpu->arch.guest_fpu.uabi_size;
+
+ u.xsave = kzalloc(size, GFP_KERNEL);
+ r = -ENOMEM;
+ if (!u.xsave)
+ break;
+
+ r = kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size);
+ if (r < 0)
+ break;
+
+ r = -EFAULT;
+ if (copy_to_user(argp, u.xsave, size))
+ break;
+
+ r = 0;
+ break;
+ }
+
+ case KVM_GET_XCRS: {
+ u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
+ r = -ENOMEM;
+ if (!u.xcrs)
+ break;
+
+ r = kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
+ if (r < 0)
+ break;
+
+ r = -EFAULT;
+ if (copy_to_user(argp, u.xcrs,
+ sizeof(struct kvm_xcrs)))
+ break;
+ r = 0;
+ break;
+ }
+ case KVM_SET_XCRS: {
+ u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
+ if (IS_ERR(u.xcrs)) {
+ r = PTR_ERR(u.xcrs);
+ goto out_nofree;
+ }
+
+ r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
+ break;
+ }
+ case KVM_SET_TSC_KHZ: {
+ u32 user_tsc_khz;
+
+ r = -EINVAL;
+
+ if (vcpu->arch.guest_tsc_protected)
+ goto out;
+
+ user_tsc_khz = (u32)arg;
+
+ if (kvm_caps.has_tsc_control &&
+ user_tsc_khz >= kvm_caps.max_guest_tsc_khz)
+ goto out;
+
+ if (user_tsc_khz == 0)
+ user_tsc_khz = tsc_khz;
+
+ if (!kvm_set_tsc_khz(vcpu, user_tsc_khz))
+ r = 0;
+
+ goto out;
+ }
+ case KVM_GET_TSC_KHZ: {
+ r = vcpu->arch.virtual_tsc_khz;
+ goto out;
+ }
+ case KVM_KVMCLOCK_CTRL: {
+ r = kvm_set_guest_paused(vcpu);
+ goto out;
+ }
+ case KVM_ENABLE_CAP: {
+ struct kvm_enable_cap cap;
+
+ r = -EFAULT;
+ if (copy_from_user(&cap, argp, sizeof(cap)))
+ goto out;
+ r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
+ break;
+ }
+ case KVM_GET_NESTED_STATE: {
+ struct kvm_nested_state __user *user_kvm_nested_state = argp;
+ u32 user_data_size;
+
+ r = -EINVAL;
+ if (!kvm_x86_ops.nested_ops->get_state)
+ break;
+
+ BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size));
+ r = -EFAULT;
+ if (get_user(user_data_size, &user_kvm_nested_state->size))
+ break;
+
+ r = kvm_x86_ops.nested_ops->get_state(vcpu, user_kvm_nested_state,
+ user_data_size);
+ if (r < 0)
+ break;
+
+ if (r > user_data_size) {
+ if (put_user(r, &user_kvm_nested_state->size))
+ r = -EFAULT;
+ else
+ r = -E2BIG;
+ break;
+ }
+
+ r = 0;
+ break;
+ }
+ case KVM_SET_NESTED_STATE: {
+ struct kvm_nested_state __user *user_kvm_nested_state = argp;
+ struct kvm_nested_state kvm_state;
+ int idx;
+
+ r = -EINVAL;
+ if (!kvm_x86_ops.nested_ops->set_state)
+ break;
+
+ r = -EFAULT;
+ if (copy_from_user(&kvm_state, user_kvm_nested_state, sizeof(kvm_state)))
+ break;
+
+ r = -EINVAL;
+ if (kvm_state.size < sizeof(kvm_state))
+ break;
+
+ if (kvm_state.flags &
+ ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE
+ | KVM_STATE_NESTED_EVMCS | KVM_STATE_NESTED_MTF_PENDING
+ | KVM_STATE_NESTED_GIF_SET))
+ break;
+
+ /* nested_run_pending implies guest_mode. */
+ if ((kvm_state.flags & KVM_STATE_NESTED_RUN_PENDING)
+ && !(kvm_state.flags & KVM_STATE_NESTED_GUEST_MODE))
+ break;
+
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ r = kvm_x86_ops.nested_ops->set_state(vcpu, user_kvm_nested_state, &kvm_state);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ break;
+ }
+#ifdef CONFIG_KVM_HYPERV
+ case KVM_GET_SUPPORTED_HV_CPUID:
+ r = kvm_ioctl_get_supported_hv_cpuid(vcpu, argp);
+ break;
+#endif
+#ifdef CONFIG_KVM_XEN
+ case KVM_XEN_VCPU_GET_ATTR: {
+ struct kvm_xen_vcpu_attr xva;
+
+ r = -EFAULT;
+ if (copy_from_user(&xva, argp, sizeof(xva)))
+ goto out;
+ r = kvm_xen_vcpu_get_attr(vcpu, &xva);
+ if (!r && copy_to_user(argp, &xva, sizeof(xva)))
+ r = -EFAULT;
+ break;
+ }
+ case KVM_XEN_VCPU_SET_ATTR: {
+ struct kvm_xen_vcpu_attr xva;
+
+ r = -EFAULT;
+ if (copy_from_user(&xva, argp, sizeof(xva)))
+ goto out;
+ r = kvm_xen_vcpu_set_attr(vcpu, &xva);
+ break;
+ }
+#endif
+ case KVM_GET_SREGS2: {
+ r = -EINVAL;
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ goto out;
+
+ u.sregs2 = kzalloc(sizeof(struct kvm_sregs2), GFP_KERNEL);
+ r = -ENOMEM;
+ if (!u.sregs2)
+ goto out;
+ __get_sregs2(vcpu, u.sregs2);
+ r = -EFAULT;
+ if (copy_to_user(argp, u.sregs2, sizeof(struct kvm_sregs2)))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_SET_SREGS2: {
+ r = -EINVAL;
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ goto out;
+
+ u.sregs2 = memdup_user(argp, sizeof(struct kvm_sregs2));
+ if (IS_ERR(u.sregs2)) {
+ r = PTR_ERR(u.sregs2);
+ u.sregs2 = NULL;
+ goto out;
+ }
+ r = __set_sregs2(vcpu, u.sregs2);
+ break;
+ }
+ case KVM_HAS_DEVICE_ATTR:
+ case KVM_GET_DEVICE_ATTR:
+ case KVM_SET_DEVICE_ATTR:
+ r = kvm_vcpu_ioctl_device_attr(vcpu, ioctl, argp);
+ break;
+ case KVM_MEMORY_ENCRYPT_OP:
+ r = -ENOTTY;
+ if (!kvm_x86_ops.vcpu_mem_enc_ioctl)
+ goto out;
+ r = kvm_x86_ops.vcpu_mem_enc_ioctl(vcpu, argp);
+ break;
default:
r = -EINVAL;
}
out:
- kfree(lapic);
+ kfree(u.buffer);
+out_nofree:
+ vcpu_put(vcpu);
return r;
}
+vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
+{
+ return VM_FAULT_SIGBUS;
+}
+
static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
{
int ret;
if (addr > (unsigned int)(-3 * PAGE_SIZE))
- return -1;
- ret = kvm_x86_ops->set_tss_addr(kvm, addr);
+ return -EINVAL;
+ ret = kvm_x86_call(set_tss_addr)(kvm, addr);
return ret;
}
+static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
+ u64 ident_addr)
+{
+ return kvm_x86_call(set_identity_map_addr)(kvm, ident_addr);
+}
+
static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
- u32 kvm_nr_mmu_pages)
+ unsigned long kvm_nr_mmu_pages)
{
if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
return -EINVAL;
- down_write(&kvm->slots_lock);
- spin_lock(&kvm->mmu_lock);
+ mutex_lock(&kvm->slots_lock);
kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
- spin_unlock(&kvm->mmu_lock);
- up_write(&kvm->slots_lock);
+ mutex_unlock(&kvm->slots_lock);
return 0;
}
-static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
+void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
- return kvm->arch.n_alloc_mmu_pages;
+
+ /*
+ * Flush all CPUs' dirty log buffers to the dirty_bitmap. Called
+ * before reporting dirty_bitmap to userspace. KVM flushes the buffers
+ * on all VM-Exits, thus we only need to kick running vCPUs to force a
+ * VM-Exit.
+ */
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+
+ if (!kvm->arch.cpu_dirty_log_size)
+ return;
+
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_vcpu_kick(vcpu);
}
-gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
+int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
+ struct kvm_enable_cap *cap)
{
- int i;
- struct kvm_mem_alias *alias;
+ int r;
+
+ if (cap->flags)
+ return -EINVAL;
- for (i = 0; i < kvm->arch.naliases; ++i) {
- alias = &kvm->arch.aliases[i];
- if (gfn >= alias->base_gfn
- && gfn < alias->base_gfn + alias->npages)
- return alias->target_gfn + gfn - alias->base_gfn;
+ switch (cap->cap) {
+ case KVM_CAP_DISABLE_QUIRKS2:
+ r = -EINVAL;
+ if (cap->args[0] & ~kvm_caps.supported_quirks)
+ break;
+ fallthrough;
+ case KVM_CAP_DISABLE_QUIRKS:
+ kvm->arch.disabled_quirks |= cap->args[0] & kvm_caps.supported_quirks;
+ r = 0;
+ break;
+ case KVM_CAP_SPLIT_IRQCHIP: {
+ mutex_lock(&kvm->lock);
+ r = -EINVAL;
+ if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS)
+ goto split_irqchip_unlock;
+ r = -EEXIST;
+ if (irqchip_in_kernel(kvm))
+ goto split_irqchip_unlock;
+ if (kvm->created_vcpus)
+ goto split_irqchip_unlock;
+ /* Pairs with irqchip_in_kernel. */
+ smp_wmb();
+ kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT;
+ kvm->arch.nr_reserved_ioapic_pins = cap->args[0];
+ kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_ABSENT);
+ r = 0;
+split_irqchip_unlock:
+ mutex_unlock(&kvm->lock);
+ break;
}
- return gfn;
-}
+ case KVM_CAP_X2APIC_API:
+ r = -EINVAL;
+ if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS)
+ break;
-/*
- * Set a new alias region. Aliases map a portion of physical memory into
- * another portion. This is useful for memory windows, for example the PC
- * VGA region.
- */
-static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
- struct kvm_memory_alias *alias)
-{
- int r, n;
- struct kvm_mem_alias *p;
+ if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS)
+ kvm->arch.x2apic_format = true;
+ if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
+ kvm->arch.x2apic_broadcast_quirk_disabled = true;
- r = -EINVAL;
- /* General sanity checks */
- if (alias->memory_size & (PAGE_SIZE - 1))
- goto out;
- if (alias->guest_phys_addr & (PAGE_SIZE - 1))
- goto out;
- if (alias->slot >= KVM_ALIAS_SLOTS)
- goto out;
- if (alias->guest_phys_addr + alias->memory_size
- < alias->guest_phys_addr)
- goto out;
- if (alias->target_phys_addr + alias->memory_size
- < alias->target_phys_addr)
- goto out;
+ r = 0;
+ break;
+ case KVM_CAP_X86_DISABLE_EXITS:
+ r = -EINVAL;
+ if (cap->args[0] & ~kvm_get_allowed_disable_exits())
+ break;
+
+ mutex_lock(&kvm->lock);
+ if (kvm->created_vcpus)
+ goto disable_exits_unlock;
- down_write(&kvm->slots_lock);
- spin_lock(&kvm->mmu_lock);
+#define SMT_RSB_MSG "This processor is affected by the Cross-Thread Return Predictions vulnerability. " \
+ "KVM_CAP_X86_DISABLE_EXITS should only be used with SMT disabled or trusted guests."
- p = &kvm->arch.aliases[alias->slot];
- p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
- p->npages = alias->memory_size >> PAGE_SHIFT;
- p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
+ if (!mitigate_smt_rsb && boot_cpu_has_bug(X86_BUG_SMT_RSB) &&
+ cpu_smt_possible() &&
+ (cap->args[0] & ~(KVM_X86_DISABLE_EXITS_PAUSE |
+ KVM_X86_DISABLE_EXITS_APERFMPERF)))
+ pr_warn_once(SMT_RSB_MSG);
- for (n = KVM_ALIAS_SLOTS; n > 0; --n)
- if (kvm->arch.aliases[n - 1].npages)
+ kvm_disable_exits(kvm, cap->args[0]);
+ r = 0;
+disable_exits_unlock:
+ mutex_unlock(&kvm->lock);
+ break;
+ case KVM_CAP_MSR_PLATFORM_INFO:
+ kvm->arch.guest_can_read_msr_platform_info = cap->args[0];
+ r = 0;
+ break;
+ case KVM_CAP_EXCEPTION_PAYLOAD:
+ kvm->arch.exception_payload_enabled = cap->args[0];
+ r = 0;
+ break;
+ case KVM_CAP_X86_TRIPLE_FAULT_EVENT:
+ kvm->arch.triple_fault_event = cap->args[0];
+ r = 0;
+ break;
+ case KVM_CAP_X86_USER_SPACE_MSR:
+ r = -EINVAL;
+ if (cap->args[0] & ~KVM_MSR_EXIT_REASON_VALID_MASK)
+ break;
+ kvm->arch.user_space_msr_mask = cap->args[0];
+ r = 0;
+ break;
+ case KVM_CAP_X86_BUS_LOCK_EXIT:
+ r = -EINVAL;
+ if (cap->args[0] & ~KVM_BUS_LOCK_DETECTION_VALID_MODE)
break;
- kvm->arch.naliases = n;
- spin_unlock(&kvm->mmu_lock);
- kvm_mmu_zap_all(kvm);
+ if ((cap->args[0] & KVM_BUS_LOCK_DETECTION_OFF) &&
+ (cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT))
+ break;
- up_write(&kvm->slots_lock);
+ if (kvm_caps.has_bus_lock_exit &&
+ cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT)
+ kvm->arch.bus_lock_detection_enabled = true;
+ r = 0;
+ break;
+#ifdef CONFIG_X86_SGX_KVM
+ case KVM_CAP_SGX_ATTRIBUTE: {
+ unsigned long allowed_attributes = 0;
- return 0;
+ r = sgx_set_attribute(&allowed_attributes, cap->args[0]);
+ if (r)
+ break;
-out:
- return r;
-}
+ /* KVM only supports the PROVISIONKEY privileged attribute. */
+ if ((allowed_attributes & SGX_ATTR_PROVISIONKEY) &&
+ !(allowed_attributes & ~SGX_ATTR_PROVISIONKEY))
+ kvm->arch.sgx_provisioning_allowed = true;
+ else
+ r = -EINVAL;
+ break;
+ }
+#endif
+ case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
+ r = -EINVAL;
+ if (!kvm_x86_ops.vm_copy_enc_context_from)
+ break;
-static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
-{
- int r;
+ r = kvm_x86_call(vm_copy_enc_context_from)(kvm, cap->args[0]);
+ break;
+ case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM:
+ r = -EINVAL;
+ if (!kvm_x86_ops.vm_move_enc_context_from)
+ break;
- r = 0;
- switch (chip->chip_id) {
- case KVM_IRQCHIP_PIC_MASTER:
- memcpy(&chip->chip.pic,
- &pic_irqchip(kvm)->pics[0],
- sizeof(struct kvm_pic_state));
- break;
- case KVM_IRQCHIP_PIC_SLAVE:
- memcpy(&chip->chip.pic,
- &pic_irqchip(kvm)->pics[1],
- sizeof(struct kvm_pic_state));
- break;
- case KVM_IRQCHIP_IOAPIC:
- memcpy(&chip->chip.ioapic,
- ioapic_irqchip(kvm),
- sizeof(struct kvm_ioapic_state));
+ r = kvm_x86_call(vm_move_enc_context_from)(kvm, cap->args[0]);
+ break;
+ case KVM_CAP_EXIT_HYPERCALL:
+ if (cap->args[0] & ~KVM_EXIT_HYPERCALL_VALID_MASK) {
+ r = -EINVAL;
+ break;
+ }
+ kvm->arch.hypercall_exit_enabled = cap->args[0];
+ r = 0;
+ break;
+ case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
+ r = -EINVAL;
+ if (cap->args[0] & ~1)
+ break;
+ kvm->arch.exit_on_emulation_error = cap->args[0];
+ r = 0;
+ break;
+ case KVM_CAP_PMU_CAPABILITY:
+ r = -EINVAL;
+ if (!enable_pmu || (cap->args[0] & ~KVM_CAP_PMU_VALID_MASK))
+ break;
+
+ mutex_lock(&kvm->lock);
+ if (!kvm->created_vcpus) {
+ kvm->arch.enable_pmu = !(cap->args[0] & KVM_PMU_CAP_DISABLE);
+ r = 0;
+ }
+ mutex_unlock(&kvm->lock);
+ break;
+ case KVM_CAP_MAX_VCPU_ID:
+ r = -EINVAL;
+ if (cap->args[0] > KVM_MAX_VCPU_IDS)
+ break;
+
+ mutex_lock(&kvm->lock);
+ if (kvm->arch.bsp_vcpu_id > cap->args[0]) {
+ ;
+ } else if (kvm->arch.max_vcpu_ids == cap->args[0]) {
+ r = 0;
+ } else if (!kvm->arch.max_vcpu_ids) {
+ kvm->arch.max_vcpu_ids = cap->args[0];
+ r = 0;
+ }
+ mutex_unlock(&kvm->lock);
+ break;
+ case KVM_CAP_X86_NOTIFY_VMEXIT:
+ r = -EINVAL;
+ if ((u32)cap->args[0] & ~KVM_X86_NOTIFY_VMEXIT_VALID_BITS)
+ break;
+ if (!kvm_caps.has_notify_vmexit)
+ break;
+ if (!((u32)cap->args[0] & KVM_X86_NOTIFY_VMEXIT_ENABLED))
+ break;
+ mutex_lock(&kvm->lock);
+ if (!kvm->created_vcpus) {
+ kvm->arch.notify_window = cap->args[0] >> 32;
+ kvm->arch.notify_vmexit_flags = (u32)cap->args[0];
+ r = 0;
+ }
+ mutex_unlock(&kvm->lock);
break;
+ case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES:
+ r = -EINVAL;
+
+ /*
+ * Since the risk of disabling NX hugepages is a guest crashing
+ * the system, ensure the userspace process has permission to
+ * reboot the system.
+ *
+ * Note that unlike the reboot() syscall, the process must have
+ * this capability in the root namespace because exposing
+ * /dev/kvm into a container does not limit the scope of the
+ * iTLB multihit bug to that container. In other words,
+ * this must use capable(), not ns_capable().
+ */
+ if (!capable(CAP_SYS_BOOT)) {
+ r = -EPERM;
+ break;
+ }
+
+ if (cap->args[0])
+ break;
+
+ mutex_lock(&kvm->lock);
+ if (!kvm->created_vcpus) {
+ kvm->arch.disable_nx_huge_pages = true;
+ r = 0;
+ }
+ mutex_unlock(&kvm->lock);
+ break;
+ case KVM_CAP_X86_APIC_BUS_CYCLES_NS: {
+ u64 bus_cycle_ns = cap->args[0];
+ u64 unused;
+
+ /*
+ * Guard against overflow in tmict_to_ns(). 128 is the highest
+ * divide value that can be programmed in APIC_TDCR.
+ */
+ r = -EINVAL;
+ if (!bus_cycle_ns ||
+ check_mul_overflow((u64)U32_MAX * 128, bus_cycle_ns, &unused))
+ break;
+
+ r = 0;
+ mutex_lock(&kvm->lock);
+ if (!irqchip_in_kernel(kvm))
+ r = -ENXIO;
+ else if (kvm->created_vcpus)
+ r = -EINVAL;
+ else
+ kvm->arch.apic_bus_cycle_ns = bus_cycle_ns;
+ mutex_unlock(&kvm->lock);
+ break;
+ }
default:
r = -EINVAL;
break;
@@ -1768,249 +6952,416 @@ static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
return r;
}
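Each of these per-VM knobs is flipped from userspace with KVM_ENABLE_CAP on the VM descriptor. For instance, a hedged sketch enabling the triple-fault reporting handled above (vm_fd is assumed; not part of the patch):

#include <sys/ioctl.h>
#include <linux/kvm.h>
#include <err.h>

/* Illustrative only: ask KVM to report guest triple faults to userspace. */
static void enable_triple_fault_events(int vm_fd)
{
        struct kvm_enable_cap cap = {
                .cap  = KVM_CAP_X86_TRIPLE_FAULT_EVENT,
                .args = { 1 },
        };

        if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap) < 0)
                err(1, "KVM_ENABLE_CAP(TRIPLE_FAULT_EVENT)");
}
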
-static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
+static struct kvm_x86_msr_filter *kvm_alloc_msr_filter(bool default_allow)
{
+ struct kvm_x86_msr_filter *msr_filter;
+
+ msr_filter = kzalloc(sizeof(*msr_filter), GFP_KERNEL_ACCOUNT);
+ if (!msr_filter)
+ return NULL;
+
+ msr_filter->default_allow = default_allow;
+ return msr_filter;
+}
+
+static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter)
+{
+ u32 i;
+
+ if (!msr_filter)
+ return;
+
+ for (i = 0; i < msr_filter->count; i++)
+ kfree(msr_filter->ranges[i].bitmap);
+
+ kfree(msr_filter);
+}
+
+static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
+ struct kvm_msr_filter_range *user_range)
+{
+ unsigned long *bitmap;
+ size_t bitmap_size;
+
+ if (!user_range->nmsrs)
+ return 0;
+
+ if (user_range->flags & ~KVM_MSR_FILTER_RANGE_VALID_MASK)
+ return -EINVAL;
+
+ if (!user_range->flags)
+ return -EINVAL;
+
+ bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long);
+ if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE)
+ return -EINVAL;
+
+ bitmap = memdup_user((__user u8*)user_range->bitmap, bitmap_size);
+ if (IS_ERR(bitmap))
+ return PTR_ERR(bitmap);
+
+ msr_filter->ranges[msr_filter->count] = (struct msr_bitmap_range) {
+ .flags = user_range->flags,
+ .base = user_range->base,
+ .nmsrs = user_range->nmsrs,
+ .bitmap = bitmap,
+ };
+
+ msr_filter->count++;
+ return 0;
+}
+
+static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm,
+ struct kvm_msr_filter *filter)
+{
+ struct kvm_x86_msr_filter *new_filter, *old_filter;
+ bool default_allow;
+ bool empty = true;
int r;
+ u32 i;
- r = 0;
- switch (chip->chip_id) {
- case KVM_IRQCHIP_PIC_MASTER:
- memcpy(&pic_irqchip(kvm)->pics[0],
- &chip->chip.pic,
- sizeof(struct kvm_pic_state));
- break;
- case KVM_IRQCHIP_PIC_SLAVE:
- memcpy(&pic_irqchip(kvm)->pics[1],
- &chip->chip.pic,
- sizeof(struct kvm_pic_state));
- break;
- case KVM_IRQCHIP_IOAPIC:
- memcpy(ioapic_irqchip(kvm),
- &chip->chip.ioapic,
- sizeof(struct kvm_ioapic_state));
- break;
- default:
- r = -EINVAL;
+ if (filter->flags & ~KVM_MSR_FILTER_VALID_MASK)
+ return -EINVAL;
+
+ for (i = 0; i < ARRAY_SIZE(filter->ranges); i++)
+ empty &= !filter->ranges[i].nmsrs;
+
+ default_allow = !(filter->flags & KVM_MSR_FILTER_DEFAULT_DENY);
+ if (empty && !default_allow)
+ return -EINVAL;
+
+ new_filter = kvm_alloc_msr_filter(default_allow);
+ if (!new_filter)
+ return -ENOMEM;
+
+ for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) {
+ r = kvm_add_msr_filter(new_filter, &filter->ranges[i]);
+ if (r) {
+ kvm_free_msr_filter(new_filter);
+ return r;
+ }
+ }
+
+ mutex_lock(&kvm->lock);
+ old_filter = rcu_replace_pointer(kvm->arch.msr_filter, new_filter,
+ mutex_is_locked(&kvm->lock));
+ mutex_unlock(&kvm->lock);
+ synchronize_srcu(&kvm->srcu);
+
+ kvm_free_msr_filter(old_filter);
+
+ /*
+ * Recalc MSR intercepts as userspace may want to intercept accesses to
+ * MSRs that KVM would otherwise pass through to the guest.
+ */
+ kvm_make_all_cpus_request(kvm, KVM_REQ_RECALC_INTERCEPTS);
+
+ return 0;
+}
+
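To make the range/bitmap layout concrete, an illustrative userspace sketch that installs a deny-by-default filter allowing reads of a single MSR; vm_fd and the chosen MSR (IA32_TSC, index 0x10) are assumptions for the example, not something this patch prescribes.

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>
#include <err.h>

/* Illustrative only: deny all MSRs except reads of IA32_TSC (index 0x10). */
static void install_msr_filter(int vm_fd)
{
        uint8_t bitmap[1] = { 1 << 0 };         /* bit N covers MSR base + N */
        struct kvm_msr_filter filter = {
                .flags = KVM_MSR_FILTER_DEFAULT_DENY,
        };

        filter.ranges[0] = (struct kvm_msr_filter_range) {
                .flags  = KVM_MSR_FILTER_READ,
                .base   = 0x10,                 /* IA32_TSC */
                .nmsrs  = 1,
                .bitmap = bitmap,
        };

        if (ioctl(vm_fd, KVM_X86_SET_MSR_FILTER, &filter) < 0)
                err(1, "KVM_X86_SET_MSR_FILTER");
}
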
+#ifdef CONFIG_KVM_COMPAT
+/* for KVM_X86_SET_MSR_FILTER */
+struct kvm_msr_filter_range_compat {
+ __u32 flags;
+ __u32 nmsrs;
+ __u32 base;
+ __u32 bitmap;
+};
+
+struct kvm_msr_filter_compat {
+ __u32 flags;
+ struct kvm_msr_filter_range_compat ranges[KVM_MSR_FILTER_MAX_RANGES];
+};
+
+#define KVM_X86_SET_MSR_FILTER_COMPAT _IOW(KVMIO, 0xc6, struct kvm_msr_filter_compat)
+
+long kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
+ unsigned long arg)
+{
+ void __user *argp = (void __user *)arg;
+ struct kvm *kvm = filp->private_data;
+ long r = -ENOTTY;
+
+ switch (ioctl) {
+ case KVM_X86_SET_MSR_FILTER_COMPAT: {
+ struct kvm_msr_filter __user *user_msr_filter = argp;
+ struct kvm_msr_filter_compat filter_compat;
+ struct kvm_msr_filter filter;
+ int i;
+
+ if (copy_from_user(&filter_compat, user_msr_filter,
+ sizeof(filter_compat)))
+ return -EFAULT;
+
+ filter.flags = filter_compat.flags;
+ for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) {
+ struct kvm_msr_filter_range_compat *cr;
+
+ cr = &filter_compat.ranges[i];
+ filter.ranges[i] = (struct kvm_msr_filter_range) {
+ .flags = cr->flags,
+ .nmsrs = cr->nmsrs,
+ .base = cr->base,
+ .bitmap = (__u8 *)(ulong)cr->bitmap,
+ };
+ }
+
+ r = kvm_vm_ioctl_set_msr_filter(kvm, &filter);
break;
}
- kvm_pic_update_irq(pic_irqchip(kvm));
+ }
+
return r;
}
+#endif
-static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
+#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
+static int kvm_arch_suspend_notifier(struct kvm *kvm)
{
- int r = 0;
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
- memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
- return r;
+ /*
+ * Ignore the return, marking the guest paused only "fails" if the vCPU
+ * isn't using kvmclock; continuing on is correct and desirable.
+ */
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ (void)kvm_set_guest_paused(vcpu);
+
+ return NOTIFY_DONE;
}
-static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
+int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state)
{
- int r = 0;
+ switch (state) {
+ case PM_HIBERNATION_PREPARE:
+ case PM_SUSPEND_PREPARE:
+ return kvm_arch_suspend_notifier(kvm);
+ }
- memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
- kvm_pit_load_count(kvm, 0, ps->channels[0].count);
- return r;
+ return NOTIFY_DONE;
}
+#endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
-static int kvm_vm_ioctl_reinject(struct kvm *kvm,
- struct kvm_reinject_control *control)
+static int kvm_vm_ioctl_get_clock(struct kvm *kvm, void __user *argp)
{
- if (!kvm->arch.vpit)
- return -ENXIO;
- kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject;
+ struct kvm_clock_data data = { 0 };
+
+ get_kvmclock(kvm, &data);
+ if (copy_to_user(argp, &data, sizeof(data)))
+ return -EFAULT;
+
return 0;
}
-/*
- * Get (and clear) the dirty memory log for a memory slot.
- */
-int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
- struct kvm_dirty_log *log)
+static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp)
{
- int r;
- int n;
- struct kvm_memory_slot *memslot;
- int is_dirty = 0;
+ struct kvm_arch *ka = &kvm->arch;
+ struct kvm_clock_data data;
+ u64 now_raw_ns;
- down_write(&kvm->slots_lock);
+ if (copy_from_user(&data, argp, sizeof(data)))
+ return -EFAULT;
- r = kvm_get_dirty_log(kvm, log, &is_dirty);
- if (r)
- goto out;
+ /*
+ * Only KVM_CLOCK_REALTIME is used, but allow passing the
+ * result of KVM_GET_CLOCK back to KVM_SET_CLOCK.
+ */
+ if (data.flags & ~KVM_CLOCK_VALID_FLAGS)
+ return -EINVAL;
+
+ kvm_hv_request_tsc_page_update(kvm);
+ kvm_start_pvclock_update(kvm);
+ pvclock_update_vm_gtod_copy(kvm);
- /* If nothing is dirty, don't bother messing with page tables. */
- if (is_dirty) {
- spin_lock(&kvm->mmu_lock);
- kvm_mmu_slot_remove_write_access(kvm, log->slot);
- spin_unlock(&kvm->mmu_lock);
- kvm_flush_remote_tlbs(kvm);
- memslot = &kvm->memslots[log->slot];
- n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
- memset(memslot->dirty_bitmap, 0, n);
+ /*
+ * This pairs with kvm_guest_time_update(): when the masterclock is
+ * in use, master_kernel_ns + kvmclock_offset is used to set the
+ * unsigned 'system_time'. Using get_kvmclock_ns() here (which runs
+ * slightly ahead) would risk underflowing the unsigned 'system_time'
+ * when 'data.clock' is very small.
+ */
+ if (data.flags & KVM_CLOCK_REALTIME) {
+ u64 now_real_ns = ktime_get_real_ns();
+
+ /*
+ * Avoid stepping the kvmclock backwards.
+ */
+ if (now_real_ns > data.realtime)
+ data.clock += now_real_ns - data.realtime;
}
- r = 0;
-out:
- up_write(&kvm->slots_lock);
- return r;
+
+ if (ka->use_master_clock)
+ now_raw_ns = ka->master_kernel_ns;
+ else
+ now_raw_ns = get_kvmclock_base_ns();
+ ka->kvmclock_offset = data.clock - now_raw_ns;
+ kvm_end_pvclock_update(kvm);
+ return 0;
}
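For completeness, the realtime adjustment above is what lets a VMM hand kvmclock across a migration without stepping it backwards; a hedged sketch of the corresponding save/restore pair, with src_vm_fd and dst_vm_fd as assumed VM descriptors:

#include <sys/ioctl.h>
#include <linux/kvm.h>
#include <err.h>

/*
 * Illustrative only: transfer kvmclock, letting KVM account via
 * KVM_CLOCK_REALTIME for wall-clock time that elapsed between the calls.
 */
static void migrate_kvmclock(int src_vm_fd, int dst_vm_fd)
{
        struct kvm_clock_data data;

        if (ioctl(src_vm_fd, KVM_GET_CLOCK, &data) < 0)
                err(1, "KVM_GET_CLOCK");

        /* Passing the KVM_GET_CLOCK result straight back is supported. */
        if (ioctl(dst_vm_fd, KVM_SET_CLOCK, &data) < 0)
                err(1, "KVM_SET_CLOCK");
}
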
-long kvm_arch_vm_ioctl(struct file *filp,
- unsigned int ioctl, unsigned long arg)
+long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl,
+ unsigned long arg)
+{
+ struct kvm_vcpu *vcpu = filp->private_data;
+ void __user *argp = (void __user *)arg;
+
+ if (ioctl == KVM_MEMORY_ENCRYPT_OP &&
+ kvm_x86_ops.vcpu_mem_enc_unlocked_ioctl)
+ return kvm_x86_call(vcpu_mem_enc_unlocked_ioctl)(vcpu, argp);
+
+ return -ENOIOCTLCMD;
+}
+
+int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
{
struct kvm *kvm = filp->private_data;
void __user *argp = (void __user *)arg;
- int r = -EINVAL;
+ int r = -ENOTTY;
+
+#ifdef CONFIG_KVM_IOAPIC
/*
* This union makes it completely explicit to gcc-3.x
- * that these two variables' stack usage should be
+ * that these three variables' stack usage should be
* combined, not added together.
*/
union {
struct kvm_pit_state ps;
- struct kvm_memory_alias alias;
+ struct kvm_pit_state2 ps2;
+ struct kvm_pit_config pit_config;
} u;
+#endif
switch (ioctl) {
case KVM_SET_TSS_ADDR:
r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
- if (r < 0)
- goto out;
break;
- case KVM_SET_MEMORY_REGION: {
- struct kvm_memory_region kvm_mem;
- struct kvm_userspace_memory_region kvm_userspace_mem;
+ case KVM_SET_IDENTITY_MAP_ADDR: {
+ u64 ident_addr;
+ mutex_lock(&kvm->lock);
+ r = -EINVAL;
+ if (kvm->created_vcpus)
+ goto set_identity_unlock;
r = -EFAULT;
- if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
- goto out;
- kvm_userspace_mem.slot = kvm_mem.slot;
- kvm_userspace_mem.flags = kvm_mem.flags;
- kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
- kvm_userspace_mem.memory_size = kvm_mem.memory_size;
- r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
- if (r)
- goto out;
+ if (copy_from_user(&ident_addr, argp, sizeof(ident_addr)))
+ goto set_identity_unlock;
+ r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
+set_identity_unlock:
+ mutex_unlock(&kvm->lock);
break;
}
case KVM_SET_NR_MMU_PAGES:
r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
- if (r)
- goto out;
break;
- case KVM_GET_NR_MMU_PAGES:
- r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
- break;
- case KVM_SET_MEMORY_ALIAS:
- r = -EFAULT;
- if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))
- goto out;
- r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);
+#ifdef CONFIG_KVM_IOAPIC
+ case KVM_CREATE_IRQCHIP: {
+ mutex_lock(&kvm->lock);
+
+ r = -EEXIST;
+ if (irqchip_in_kernel(kvm))
+ goto create_irqchip_unlock;
+
+ /*
+ * Disallow an in-kernel I/O APIC if the VM has protected EOIs,
+ * i.e. if KVM can't intercept EOIs and thus can't properly
+ * emulate level-triggered interrupts.
+ */
+ r = -ENOTTY;
+ if (kvm->arch.has_protected_eoi)
+ goto create_irqchip_unlock;
+
+ r = -EINVAL;
+ if (kvm->created_vcpus)
+ goto create_irqchip_unlock;
+
+ r = kvm_pic_init(kvm);
if (r)
- goto out;
- break;
- case KVM_CREATE_IRQCHIP:
- r = -ENOMEM;
- kvm->arch.vpic = kvm_create_pic(kvm);
- if (kvm->arch.vpic) {
- r = kvm_ioapic_init(kvm);
- if (r) {
- kfree(kvm->arch.vpic);
- kvm->arch.vpic = NULL;
- goto out;
- }
- } else
- goto out;
- r = kvm_setup_default_irq_routing(kvm);
+ goto create_irqchip_unlock;
+
+ r = kvm_ioapic_init(kvm);
if (r) {
- kfree(kvm->arch.vpic);
- kfree(kvm->arch.vioapic);
- goto out;
+ kvm_pic_destroy(kvm);
+ goto create_irqchip_unlock;
}
+
+ r = kvm_setup_default_ioapic_and_pic_routing(kvm);
+ if (r) {
+ kvm_ioapic_destroy(kvm);
+ kvm_pic_destroy(kvm);
+ goto create_irqchip_unlock;
+ }
+ /* Write kvm->irq_routing before enabling irqchip_in_kernel. */
+ smp_wmb();
+ kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL;
+ kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_ABSENT);
+ create_irqchip_unlock:
+ mutex_unlock(&kvm->lock);
break;
+ }
case KVM_CREATE_PIT:
+ u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
+ goto create_pit;
+ case KVM_CREATE_PIT2:
+ r = -EFAULT;
+ if (copy_from_user(&u.pit_config, argp,
+ sizeof(struct kvm_pit_config)))
+ goto out;
+ create_pit:
mutex_lock(&kvm->lock);
r = -EEXIST;
if (kvm->arch.vpit)
goto create_pit_unlock;
+ r = -ENOENT;
+ if (!pic_in_kernel(kvm))
+ goto create_pit_unlock;
r = -ENOMEM;
- kvm->arch.vpit = kvm_create_pit(kvm);
+ kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
if (kvm->arch.vpit)
r = 0;
create_pit_unlock:
mutex_unlock(&kvm->lock);
break;
- case KVM_IRQ_LINE_STATUS:
- case KVM_IRQ_LINE: {
- struct kvm_irq_level irq_event;
-
- r = -EFAULT;
- if (copy_from_user(&irq_event, argp, sizeof irq_event))
- goto out;
- if (irqchip_in_kernel(kvm)) {
- __s32 status;
- mutex_lock(&kvm->lock);
- status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
- irq_event.irq, irq_event.level);
- mutex_unlock(&kvm->lock);
- if (ioctl == KVM_IRQ_LINE_STATUS) {
- irq_event.status = status;
- if (copy_to_user(argp, &irq_event,
- sizeof irq_event))
- goto out;
- }
- r = 0;
- }
- break;
- }
case KVM_GET_IRQCHIP: {
/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
- struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
+ struct kvm_irqchip *chip;
- r = -ENOMEM;
- if (!chip)
+ chip = memdup_user(argp, sizeof(*chip));
+ if (IS_ERR(chip)) {
+ r = PTR_ERR(chip);
goto out;
- r = -EFAULT;
- if (copy_from_user(chip, argp, sizeof *chip))
- goto get_irqchip_out;
+ }
+
r = -ENXIO;
- if (!irqchip_in_kernel(kvm))
+ if (!irqchip_full(kvm))
goto get_irqchip_out;
r = kvm_vm_ioctl_get_irqchip(kvm, chip);
if (r)
goto get_irqchip_out;
r = -EFAULT;
- if (copy_to_user(argp, chip, sizeof *chip))
+ if (copy_to_user(argp, chip, sizeof(*chip)))
goto get_irqchip_out;
r = 0;
get_irqchip_out:
kfree(chip);
- if (r)
- goto out;
break;
}
case KVM_SET_IRQCHIP: {
/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
- struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
+ struct kvm_irqchip *chip;
- r = -ENOMEM;
- if (!chip)
+ chip = memdup_user(argp, sizeof(*chip));
+ if (IS_ERR(chip)) {
+ r = PTR_ERR(chip);
goto out;
- r = -EFAULT;
- if (copy_from_user(chip, argp, sizeof *chip))
- goto set_irqchip_out;
+ }
+
r = -ENXIO;
- if (!irqchip_in_kernel(kvm))
+ if (!irqchip_full(kvm))
goto set_irqchip_out;
r = kvm_vm_ioctl_set_irqchip(kvm, chip);
- if (r)
- goto set_irqchip_out;
- r = 0;
set_irqchip_out:
kfree(chip);
- if (r)
- goto out;
break;
}
case KVM_GET_PIT: {
@@ -2031,100 +7382,474 @@ long kvm_arch_vm_ioctl(struct file *filp,
}
case KVM_SET_PIT: {
r = -EFAULT;
- if (copy_from_user(&u.ps, argp, sizeof u.ps))
+ if (copy_from_user(&u.ps, argp, sizeof(u.ps)))
goto out;
+ mutex_lock(&kvm->lock);
r = -ENXIO;
if (!kvm->arch.vpit)
- goto out;
+ goto set_pit_out;
r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
+set_pit_out:
+ mutex_unlock(&kvm->lock);
+ break;
+ }
+ case KVM_GET_PIT2: {
+ r = -ENXIO;
+ if (!kvm->arch.vpit)
+ goto out;
+ r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2);
if (r)
goto out;
+ r = -EFAULT;
+ if (copy_to_user(argp, &u.ps2, sizeof(u.ps2)))
+ goto out;
r = 0;
break;
}
+ case KVM_SET_PIT2: {
+ r = -EFAULT;
+ if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
+ goto out;
+ mutex_lock(&kvm->lock);
+ r = -ENXIO;
+ if (!kvm->arch.vpit)
+ goto set_pit2_out;
+ r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
+set_pit2_out:
+ mutex_unlock(&kvm->lock);
+ break;
+ }
case KVM_REINJECT_CONTROL: {
struct kvm_reinject_control control;
r = -EFAULT;
if (copy_from_user(&control, argp, sizeof(control)))
goto out;
- r = kvm_vm_ioctl_reinject(kvm, &control);
- if (r)
+ r = -ENXIO;
+ if (!kvm->arch.vpit)
goto out;
+ r = kvm_vm_ioctl_reinject(kvm, &control);
+ break;
+ }
+#endif
+ case KVM_SET_BOOT_CPU_ID:
r = 0;
+ mutex_lock(&kvm->lock);
+ if (kvm->created_vcpus)
+ r = -EBUSY;
+ else if (arg > KVM_MAX_VCPU_IDS ||
+ (kvm->arch.max_vcpu_ids && arg > kvm->arch.max_vcpu_ids))
+ r = -EINVAL;
+ else
+ kvm->arch.bsp_vcpu_id = arg;
+ mutex_unlock(&kvm->lock);
+ break;
+#ifdef CONFIG_KVM_XEN
+ case KVM_XEN_HVM_CONFIG: {
+ struct kvm_xen_hvm_config xhc;
+ r = -EFAULT;
+ if (copy_from_user(&xhc, argp, sizeof(xhc)))
+ goto out;
+ r = kvm_xen_hvm_config(kvm, &xhc);
+ break;
+ }
+ case KVM_XEN_HVM_GET_ATTR: {
+ struct kvm_xen_hvm_attr xha;
+
+ r = -EFAULT;
+ if (copy_from_user(&xha, argp, sizeof(xha)))
+ goto out;
+ r = kvm_xen_hvm_get_attr(kvm, &xha);
+ if (!r && copy_to_user(argp, &xha, sizeof(xha)))
+ r = -EFAULT;
+ break;
+ }
+ case KVM_XEN_HVM_SET_ATTR: {
+ struct kvm_xen_hvm_attr xha;
+
+ r = -EFAULT;
+ if (copy_from_user(&xha, argp, sizeof(xha)))
+ goto out;
+ r = kvm_xen_hvm_set_attr(kvm, &xha);
+ break;
+ }
+ case KVM_XEN_HVM_EVTCHN_SEND: {
+ struct kvm_irq_routing_xen_evtchn uxe;
+
+ r = -EFAULT;
+ if (copy_from_user(&uxe, argp, sizeof(uxe)))
+ goto out;
+ r = kvm_xen_hvm_evtchn_send(kvm, &uxe);
+ break;
+ }
+#endif
+ case KVM_SET_CLOCK:
+ r = kvm_vm_ioctl_set_clock(kvm, argp);
+ break;
+ case KVM_GET_CLOCK:
+ r = kvm_vm_ioctl_get_clock(kvm, argp);
+ break;
+ case KVM_SET_TSC_KHZ: {
+ u32 user_tsc_khz;
+
+ r = -EINVAL;
+ user_tsc_khz = (u32)arg;
+
+ if (kvm_caps.has_tsc_control &&
+ user_tsc_khz >= kvm_caps.max_guest_tsc_khz)
+ goto out;
+
+ if (user_tsc_khz == 0)
+ user_tsc_khz = tsc_khz;
+
+ mutex_lock(&kvm->lock);
+ if (!kvm->created_vcpus) {
+ WRITE_ONCE(kvm->arch.default_tsc_khz, user_tsc_khz);
+ r = 0;
+ }
+ mutex_unlock(&kvm->lock);
+ goto out;
+ }
+ case KVM_GET_TSC_KHZ: {
+ r = READ_ONCE(kvm->arch.default_tsc_khz);
+ goto out;
+ }
+ case KVM_MEMORY_ENCRYPT_OP:
+ r = -ENOTTY;
+ if (!kvm_x86_ops.mem_enc_ioctl)
+ goto out;
+
+ r = kvm_x86_call(mem_enc_ioctl)(kvm, argp);
+ break;
+ case KVM_MEMORY_ENCRYPT_REG_REGION: {
+ struct kvm_enc_region region;
+
+ r = -EFAULT;
+ if (copy_from_user(&region, argp, sizeof(region)))
+ goto out;
+
+ r = -ENOTTY;
+ if (!kvm_x86_ops.mem_enc_register_region)
+ goto out;
+
+ r = kvm_x86_call(mem_enc_register_region)(kvm, &region);
+ break;
+ }
+ case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
+ struct kvm_enc_region region;
+
+ r = -EFAULT;
+ if (copy_from_user(&region, argp, sizeof(region)))
+ goto out;
+
+ r = -ENOTTY;
+ if (!kvm_x86_ops.mem_enc_unregister_region)
+ goto out;
+
+ r = kvm_x86_call(mem_enc_unregister_region)(kvm, &region);
+ break;
+ }
+#ifdef CONFIG_KVM_HYPERV
+ case KVM_HYPERV_EVENTFD: {
+ struct kvm_hyperv_eventfd hvevfd;
+
+ r = -EFAULT;
+ if (copy_from_user(&hvevfd, argp, sizeof(hvevfd)))
+ goto out;
+ r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd);
+ break;
+ }
+#endif
+ case KVM_SET_PMU_EVENT_FILTER:
+ r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp);
+ break;
+ case KVM_X86_SET_MSR_FILTER: {
+ struct kvm_msr_filter __user *user_msr_filter = argp;
+ struct kvm_msr_filter filter;
+
+ if (copy_from_user(&filter, user_msr_filter, sizeof(filter)))
+ return -EFAULT;
+
+ r = kvm_vm_ioctl_set_msr_filter(kvm, &filter);
break;
}
default:
- ;
+ r = -ENOTTY;
}
out:
return r;
}
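For illustration, the KVM_SET_TSC_KHZ / KVM_GET_TSC_KHZ cases handled above pass the frequency directly in the ioctl argument and only take effect before any vCPU has been created. A minimal userspace sketch, assuming a VM fd obtained from KVM_CREATE_VM (error handling trimmed):

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int set_default_tsc_khz(int vm_fd, unsigned int khz)
{
	/* khz == 0 asks for the host TSC frequency, per the handler above */
	if (ioctl(vm_fd, KVM_SET_TSC_KHZ, khz) < 0) {
		perror("KVM_SET_TSC_KHZ");
		return -1;
	}
	printf("default guest TSC: %d kHz\n", ioctl(vm_fd, KVM_GET_TSC_KHZ, 0));
	return 0;
}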
-static void kvm_init_msr_list(void)
+static void kvm_probe_feature_msr(u32 msr_index)
+{
+ u64 data;
+
+ if (kvm_get_feature_msr(NULL, msr_index, &data, true))
+ return;
+
+ msr_based_features[num_msr_based_features++] = msr_index;
+}
+
+static void kvm_probe_msr_to_save(u32 msr_index)
{
u32 dummy[2];
- unsigned i, j;
- for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
- if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
- continue;
- if (j < i)
- msrs_to_save[j] = msrs_to_save[i];
- j++;
+ if (rdmsr_safe(msr_index, &dummy[0], &dummy[1]))
+ return;
+
+ /*
+ * Even MSRs that are valid in the host may not be exposed to guests in
+ * some cases.
+ */
+ switch (msr_index) {
+ case MSR_IA32_BNDCFGS:
+ if (!kvm_mpx_supported())
+ return;
+ break;
+ case MSR_TSC_AUX:
+ if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) &&
+ !kvm_cpu_cap_has(X86_FEATURE_RDPID))
+ return;
+ break;
+ case MSR_IA32_UMWAIT_CONTROL:
+ if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG))
+ return;
+ break;
+ case MSR_IA32_RTIT_CTL:
+ case MSR_IA32_RTIT_STATUS:
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT))
+ return;
+ break;
+ case MSR_IA32_RTIT_CR3_MATCH:
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
+ !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering))
+ return;
+ break;
+ case MSR_IA32_RTIT_OUTPUT_BASE:
+ case MSR_IA32_RTIT_OUTPUT_MASK:
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
+ (!intel_pt_validate_hw_cap(PT_CAP_topa_output) &&
+ !intel_pt_validate_hw_cap(PT_CAP_single_range_output)))
+ return;
+ break;
+ case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
+ (msr_index - MSR_IA32_RTIT_ADDR0_A >=
+ intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2))
+ return;
+ break;
+ case MSR_ARCH_PERFMON_PERFCTR0 ...
+ MSR_ARCH_PERFMON_PERFCTR0 + KVM_MAX_NR_GP_COUNTERS - 1:
+ if (msr_index - MSR_ARCH_PERFMON_PERFCTR0 >=
+ kvm_pmu_cap.num_counters_gp)
+ return;
+ break;
+ case MSR_ARCH_PERFMON_EVENTSEL0 ...
+ MSR_ARCH_PERFMON_EVENTSEL0 + KVM_MAX_NR_GP_COUNTERS - 1:
+ if (msr_index - MSR_ARCH_PERFMON_EVENTSEL0 >=
+ kvm_pmu_cap.num_counters_gp)
+ return;
+ break;
+ case MSR_ARCH_PERFMON_FIXED_CTR0 ...
+ MSR_ARCH_PERFMON_FIXED_CTR0 + KVM_MAX_NR_FIXED_COUNTERS - 1:
+ if (msr_index - MSR_ARCH_PERFMON_FIXED_CTR0 >=
+ kvm_pmu_cap.num_counters_fixed)
+ return;
+ break;
+ case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET:
+ if (!kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2))
+ return;
+ break;
+ case MSR_IA32_XFD:
+ case MSR_IA32_XFD_ERR:
+ if (!kvm_cpu_cap_has(X86_FEATURE_XFD))
+ return;
+ break;
+ case MSR_IA32_TSX_CTRL:
+ if (!(kvm_get_arch_capabilities() & ARCH_CAP_TSX_CTRL_MSR))
+ return;
+ break;
+ case MSR_IA32_XSS:
+ if (!kvm_caps.supported_xss)
+ return;
+ break;
+ case MSR_IA32_U_CET:
+ case MSR_IA32_S_CET:
+ if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) &&
+ !kvm_cpu_cap_has(X86_FEATURE_IBT))
+ return;
+ break;
+ case MSR_IA32_INT_SSP_TAB:
+ if (!kvm_cpu_cap_has(X86_FEATURE_LM))
+ return;
+ fallthrough;
+ case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP:
+ if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK))
+ return;
+ break;
+ default:
+ break;
}
- num_msrs_to_save = j;
+
+ msrs_to_save[num_msrs_to_save++] = msr_index;
}
-/*
- * Only apic need an MMIO device hook, so shortcut now..
- */
-static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
- gpa_t addr, int len,
- int is_write)
+static void kvm_init_msr_lists(void)
{
- struct kvm_io_device *dev;
+ unsigned i;
+
+ BUILD_BUG_ON_MSG(KVM_MAX_NR_FIXED_COUNTERS != 3,
+ "Please update the fixed PMCs in msrs_to_save_pmu[]");
+
+ num_msrs_to_save = 0;
+ num_emulated_msrs = 0;
+ num_msr_based_features = 0;
- if (vcpu->arch.apic) {
- dev = &vcpu->arch.apic->dev;
- if (dev->in_range(dev, addr, len, is_write))
- return dev;
+ for (i = 0; i < ARRAY_SIZE(msrs_to_save_base); i++)
+ kvm_probe_msr_to_save(msrs_to_save_base[i]);
+
+ if (enable_pmu) {
+ for (i = 0; i < ARRAY_SIZE(msrs_to_save_pmu); i++)
+ kvm_probe_msr_to_save(msrs_to_save_pmu[i]);
}
- return NULL;
+
+ for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) {
+ if (!kvm_x86_call(has_emulated_msr)(NULL,
+ emulated_msrs_all[i]))
+ continue;
+
+ emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i];
+ }
+
+ for (i = KVM_FIRST_EMULATED_VMX_MSR; i <= KVM_LAST_EMULATED_VMX_MSR; i++)
+ kvm_probe_feature_msr(i);
+
+ for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++)
+ kvm_probe_feature_msr(msr_based_features_all_except_vmx[i]);
+}
+
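The msrs_to_save / emulated_msrs lists populated above are what userspace retrieves through KVM_GET_MSR_INDEX_LIST on the /dev/kvm fd (feature MSRs are exposed via KVM_GET_MSR_FEATURE_INDEX_LIST). A hedged sketch of the usual two-call pattern:

#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static struct kvm_msr_list *get_msr_index_list(int kvm_fd)
{
	struct kvm_msr_list probe = { .nmsrs = 0 }, *list;

	/* The first call fails with E2BIG but fills in the required count. */
	ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &probe);

	list = malloc(sizeof(*list) + probe.nmsrs * sizeof(__u32));
	if (!list)
		return NULL;
	list->nmsrs = probe.nmsrs;
	if (ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list) < 0) {
		free(list);
		return NULL;
	}
	return list;
}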
+static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
+ const void *v)
+{
+ int handled = 0;
+ int n;
+
+ do {
+ n = min(len, 8);
+ if (!(lapic_in_kernel(vcpu) &&
+ !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
+ && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v))
+ break;
+ handled += n;
+ addr += n;
+ len -= n;
+ v += n;
+ } while (len);
+
+ return handled;
+}
+
+static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
+{
+ int handled = 0;
+ int n;
+
+ do {
+ n = min(len, 8);
+ if (!(lapic_in_kernel(vcpu) &&
+ !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
+ addr, n, v))
+ && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
+ break;
+ trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, v);
+ handled += n;
+ addr += n;
+ len -= n;
+ v += n;
+ } while (len);
+
+ return handled;
+}
+
+void kvm_set_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ kvm_x86_call(set_segment)(vcpu, var, seg);
+}
+
+void kvm_get_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ kvm_x86_call(get_segment)(vcpu, var, seg);
+}
+
+gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u64 access,
+ struct x86_exception *exception)
+{
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ gpa_t t_gpa;
+
+ BUG_ON(!mmu_is_nested(vcpu));
+
+ /* NPT walks are always user-walks */
+ access |= PFERR_USER_MASK;
+ t_gpa = mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception);
+
+ return t_gpa;
}
+gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception)
+{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+
+ u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_gva_to_gpa_read);
-static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
- gpa_t addr, int len,
- int is_write)
+gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception)
{
- struct kvm_io_device *dev;
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
- dev = vcpu_find_pervcpu_dev(vcpu, addr, len, is_write);
- if (dev == NULL)
- dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len,
- is_write);
- return dev;
+ u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ access |= PFERR_WRITE_MASK;
+ return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_gva_to_gpa_write);
-static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
- struct kvm_vcpu *vcpu)
+/* used to access any guest's mapped memory without checking CPL */
+gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception)
{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+
+ return mmu->gva_to_gpa(vcpu, mmu, gva, 0, exception);
+}
+
+static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
+ struct kvm_vcpu *vcpu, u64 access,
+ struct x86_exception *exception)
+{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
void *data = val;
int r = X86EMUL_CONTINUE;
while (bytes) {
- gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+ gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception);
unsigned offset = addr & (PAGE_SIZE-1);
unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
int ret;
- if (gpa == UNMAPPED_GVA) {
- r = X86EMUL_PROPAGATE_FAULT;
- goto out;
- }
- ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
+ if (gpa == INVALID_GPA)
+ return X86EMUL_PROPAGATE_FAULT;
+ ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, data,
+ offset, toread);
if (ret < 0) {
- r = X86EMUL_UNHANDLEABLE;
+ r = X86EMUL_IO_NEEDED;
goto out;
}
@@ -2136,25 +7861,86 @@ out:
return r;
}
-static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
- struct kvm_vcpu *vcpu)
+/* used for instruction fetching */
+static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
+ gva_t addr, void *val, unsigned int bytes,
+ struct x86_exception *exception)
{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+ u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ unsigned offset;
+ int ret;
+
+ /* Inline kvm_read_guest_virt_helper for speed. */
+ gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access|PFERR_FETCH_MASK,
+ exception);
+ if (unlikely(gpa == INVALID_GPA))
+ return X86EMUL_PROPAGATE_FAULT;
+
+ offset = addr & (PAGE_SIZE-1);
+ if (WARN_ON(offset + bytes > PAGE_SIZE))
+ bytes = (unsigned)PAGE_SIZE - offset;
+ ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, val,
+ offset, bytes);
+ if (unlikely(ret < 0))
+ return X86EMUL_IO_NEEDED;
+
+ return X86EMUL_CONTINUE;
+}
+
+int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
+ gva_t addr, void *val, unsigned int bytes,
+ struct x86_exception *exception)
+{
+ u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
+
+ /*
+ * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
+ * is returned, but our callers are not ready for that and they blindly
+ * call kvm_inject_page_fault. Ensure that they at least do not leak
+ * uninitialized kernel stack memory into cr2 and error code.
+ */
+ memset(exception, 0, sizeof(*exception));
+ return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
+ exception);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_read_guest_virt);
+
+static int emulator_read_std(struct x86_emulate_ctxt *ctxt,
+ gva_t addr, void *val, unsigned int bytes,
+ struct x86_exception *exception, bool system)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ u64 access = 0;
+
+ if (system)
+ access |= PFERR_IMPLICIT_ACCESS;
+ else if (kvm_x86_call(get_cpl)(vcpu) == 3)
+ access |= PFERR_USER_MASK;
+
+ return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception);
+}
+
+static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
+ struct kvm_vcpu *vcpu, u64 access,
+ struct x86_exception *exception)
+{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
void *data = val;
int r = X86EMUL_CONTINUE;
while (bytes) {
- gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+ gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception);
unsigned offset = addr & (PAGE_SIZE-1);
unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
int ret;
- if (gpa == UNMAPPED_GVA) {
- r = X86EMUL_PROPAGATE_FAULT;
- goto out;
- }
- ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
+ if (gpa == INVALID_GPA)
+ return X86EMUL_PROPAGATE_FAULT;
+ ret = kvm_vcpu_write_guest(vcpu, gpa, data, towrite);
if (ret < 0) {
- r = X86EMUL_UNHANDLEABLE;
+ r = X86EMUL_IO_NEEDED;
goto out;
}
@@ -2166,614 +7952,1871 @@ out:
return r;
}
+static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val,
+ unsigned int bytes, struct x86_exception *exception,
+ bool system)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ u64 access = PFERR_WRITE_MASK;
-static int emulator_read_emulated(unsigned long addr,
- void *val,
- unsigned int bytes,
- struct kvm_vcpu *vcpu)
+ if (system)
+ access |= PFERR_IMPLICIT_ACCESS;
+ else if (kvm_x86_call(get_cpl)(vcpu) == 3)
+ access |= PFERR_USER_MASK;
+
+ return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
+ access, exception);
+}
+
+int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
+ unsigned int bytes, struct x86_exception *exception)
{
- struct kvm_io_device *mmio_dev;
- gpa_t gpa;
+ /* kvm_write_guest_virt_system can pull in tons of pages. */
+ kvm_request_l1tf_flush_l1d();
- if (vcpu->mmio_read_completed) {
- memcpy(val, vcpu->mmio_data, bytes);
- vcpu->mmio_read_completed = 0;
- return X86EMUL_CONTINUE;
+ return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
+ PFERR_WRITE_MASK, exception);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_write_guest_virt_system);
+
+static int kvm_check_emulate_insn(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len)
+{
+ return kvm_x86_call(check_emulate_instruction)(vcpu, emul_type,
+ insn, insn_len);
+}
+
+int handle_ud(struct kvm_vcpu *vcpu)
+{
+ static const char kvm_emulate_prefix[] = { __KVM_EMULATE_PREFIX };
+ int fep_flags = READ_ONCE(force_emulation_prefix);
+ int emul_type = EMULTYPE_TRAP_UD;
+ char sig[5]; /* ud2; .ascii "kvm" */
+ struct x86_exception e;
+ int r;
+
+ r = kvm_check_emulate_insn(vcpu, emul_type, NULL, 0);
+ if (r != X86EMUL_CONTINUE)
+ return 1;
+
+ if (fep_flags &&
+ kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu),
+ sig, sizeof(sig), &e) == 0 &&
+ memcmp(sig, kvm_emulate_prefix, sizeof(sig)) == 0) {
+ if (fep_flags & KVM_FEP_CLEAR_RFLAGS_RF)
+ kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) & ~X86_EFLAGS_RF);
+ kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig));
+ emul_type = EMULTYPE_TRAP_UD_FORCED;
}
- gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+ return kvm_emulate_instruction(vcpu, emul_type);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(handle_ud);
+static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
+ gpa_t gpa, bool write)
+{
/* For APIC access vmexit */
if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
- goto mmio;
+ return 1;
- if (kvm_read_guest_virt(addr, val, bytes, vcpu)
- == X86EMUL_CONTINUE)
- return X86EMUL_CONTINUE;
- if (gpa == UNMAPPED_GVA)
- return X86EMUL_PROPAGATE_FAULT;
+ if (vcpu_match_mmio_gpa(vcpu, gpa)) {
+ trace_vcpu_match_mmio(gva, gpa, write, true);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
+ gpa_t *gpa, struct x86_exception *exception,
+ bool write)
+{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+ u64 access = ((kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0)
+ | (write ? PFERR_WRITE_MASK : 0);
-mmio:
/*
- * Is this MMIO handled locally?
+	 * Currently PKRU is only applied to EPT-enabled guests, so
+	 * there is no pkey in the EPT page table for an L1 guest, nor
+	 * in the EPT shadow page table for an L2 guest.
*/
- mutex_lock(&vcpu->kvm->lock);
- mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0);
- if (mmio_dev) {
- kvm_iodevice_read(mmio_dev, gpa, bytes, val);
- mutex_unlock(&vcpu->kvm->lock);
- return X86EMUL_CONTINUE;
+ if (vcpu_match_mmio_gva(vcpu, gva) && (!is_paging(vcpu) ||
+ !permission_fault(vcpu, vcpu->arch.walk_mmu,
+ vcpu->arch.mmio_access, 0, access))) {
+ *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
+ (gva & (PAGE_SIZE - 1));
+ trace_vcpu_match_mmio(gva, *gpa, write, false);
+ return 1;
}
- mutex_unlock(&vcpu->kvm->lock);
- vcpu->mmio_needed = 1;
- vcpu->mmio_phys_addr = gpa;
- vcpu->mmio_size = bytes;
- vcpu->mmio_is_write = 0;
+ *gpa = mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
- return X86EMUL_UNHANDLEABLE;
+ if (*gpa == INVALID_GPA)
+ return -1;
+
+ return vcpu_is_mmio_gpa(vcpu, gva, *gpa, write);
}
int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
- const void *val, int bytes)
+ const void *val, int bytes)
{
int ret;
- ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
+ ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes);
if (ret < 0)
return 0;
- kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
+ kvm_page_track_write(vcpu, gpa, val, bytes);
return 1;
}
-static int emulator_write_emulated_onepage(unsigned long addr,
- const void *val,
- unsigned int bytes,
- struct kvm_vcpu *vcpu)
+struct read_write_emulator_ops {
+ int (*read_write_prepare)(struct kvm_vcpu *vcpu, void *val,
+ int bytes);
+ int (*read_write_emulate)(struct kvm_vcpu *vcpu, gpa_t gpa,
+ void *val, int bytes);
+ int (*read_write_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
+ int bytes, void *val);
+ int (*read_write_exit_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
+ void *val, int bytes);
+ bool write;
+};
+
+static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
{
- struct kvm_io_device *mmio_dev;
- gpa_t gpa;
+ if (vcpu->mmio_read_completed) {
+ trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
+ vcpu->mmio_fragments[0].gpa, val);
+ vcpu->mmio_read_completed = 0;
+ return 1;
+ }
- gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+ return 0;
+}
- if (gpa == UNMAPPED_GVA) {
- kvm_inject_page_fault(vcpu, addr, 2);
- return X86EMUL_PROPAGATE_FAULT;
- }
+static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
+ void *val, int bytes)
+{
+ return !kvm_vcpu_read_guest(vcpu, gpa, val, bytes);
+}
- /* For APIC access vmexit */
- if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
- goto mmio;
+static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
+ void *val, int bytes)
+{
+ return emulator_write_phys(vcpu, gpa, val, bytes);
+}
+
+static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val)
+{
+ trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, val);
+ return vcpu_mmio_write(vcpu, gpa, bytes, val);
+}
+
+static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
+ void *val, int bytes)
+{
+ trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, NULL);
+ return X86EMUL_IO_NEEDED;
+}
+
+static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
+ void *val, int bytes)
+{
+ struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0];
- if (emulator_write_phys(vcpu, gpa, val, bytes))
+ memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
+ return X86EMUL_CONTINUE;
+}
+
+static const struct read_write_emulator_ops read_emultor = {
+ .read_write_prepare = read_prepare,
+ .read_write_emulate = read_emulate,
+ .read_write_mmio = vcpu_mmio_read,
+ .read_write_exit_mmio = read_exit_mmio,
+};
+
+static const struct read_write_emulator_ops write_emultor = {
+ .read_write_emulate = write_emulate,
+ .read_write_mmio = write_mmio,
+ .read_write_exit_mmio = write_exit_mmio,
+ .write = true,
+};
+
+static int emulator_read_write_onepage(unsigned long addr, void *val,
+ unsigned int bytes,
+ struct x86_exception *exception,
+ struct kvm_vcpu *vcpu,
+ const struct read_write_emulator_ops *ops)
+{
+ gpa_t gpa;
+ int handled, ret;
+ bool write = ops->write;
+ struct kvm_mmio_fragment *frag;
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+
+ /*
+ * If the exit was due to a NPF we may already have a GPA.
+ * If the GPA is present, use it to avoid the GVA to GPA table walk.
+	 * Note, this cannot be used on string operations, since a string
+	 * operation using rep will only have the initial GPA from the NPF
+	 * that occurred.
+ */
+ if (ctxt->gpa_available && emulator_can_use_gpa(ctxt) &&
+ (addr & ~PAGE_MASK) == (ctxt->gpa_val & ~PAGE_MASK)) {
+ gpa = ctxt->gpa_val;
+ ret = vcpu_is_mmio_gpa(vcpu, addr, gpa, write);
+ } else {
+ ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
+ if (ret < 0)
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+
+ if (!ret && ops->read_write_emulate(vcpu, gpa, val, bytes))
return X86EMUL_CONTINUE;
-mmio:
/*
* Is this MMIO handled locally?
*/
- mutex_lock(&vcpu->kvm->lock);
- mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1);
- if (mmio_dev) {
- kvm_iodevice_write(mmio_dev, gpa, bytes, val);
- mutex_unlock(&vcpu->kvm->lock);
+ handled = ops->read_write_mmio(vcpu, gpa, bytes, val);
+ if (handled == bytes)
return X86EMUL_CONTINUE;
- }
- mutex_unlock(&vcpu->kvm->lock);
- vcpu->mmio_needed = 1;
- vcpu->mmio_phys_addr = gpa;
- vcpu->mmio_size = bytes;
- vcpu->mmio_is_write = 1;
- memcpy(vcpu->mmio_data, val, bytes);
+ gpa += handled;
+ bytes -= handled;
+ val += handled;
+ WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS);
+ frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++];
+ frag->gpa = gpa;
+ frag->data = val;
+ frag->len = bytes;
return X86EMUL_CONTINUE;
}
-int emulator_write_emulated(unsigned long addr,
- const void *val,
- unsigned int bytes,
- struct kvm_vcpu *vcpu)
+static int emulator_read_write(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr,
+ void *val, unsigned int bytes,
+ struct x86_exception *exception,
+ const struct read_write_emulator_ops *ops)
{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ gpa_t gpa;
+ int rc;
+
+ if (ops->read_write_prepare &&
+ ops->read_write_prepare(vcpu, val, bytes))
+ return X86EMUL_CONTINUE;
+
+ vcpu->mmio_nr_fragments = 0;
+
/* Crossing a page boundary? */
if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
- int rc, now;
+ int now;
now = -addr & ~PAGE_MASK;
- rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
+ rc = emulator_read_write_onepage(addr, val, now, exception,
+ vcpu, ops);
+
if (rc != X86EMUL_CONTINUE)
return rc;
addr += now;
+ if (ctxt->mode != X86EMUL_MODE_PROT64)
+ addr = (u32)addr;
val += now;
bytes -= now;
}
- return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
+
+ rc = emulator_read_write_onepage(addr, val, bytes, exception,
+ vcpu, ops);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ if (!vcpu->mmio_nr_fragments)
+ return X86EMUL_CONTINUE;
+
+ gpa = vcpu->mmio_fragments[0].gpa;
+
+ vcpu->mmio_needed = 1;
+ vcpu->mmio_cur_fragment = 0;
+
+ vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len);
+ vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
+ vcpu->run->exit_reason = KVM_EXIT_MMIO;
+ vcpu->run->mmio.phys_addr = gpa;
+
+ return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
}
-EXPORT_SYMBOL_GPL(emulator_write_emulated);
-static int emulator_cmpxchg_emulated(unsigned long addr,
+static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr,
+ void *val,
+ unsigned int bytes,
+ struct x86_exception *exception)
+{
+ return emulator_read_write(ctxt, addr, val, bytes,
+ exception, &read_emultor);
+}
+
+static int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr,
+ const void *val,
+ unsigned int bytes,
+ struct x86_exception *exception)
+{
+ return emulator_read_write(ctxt, addr, (void *)val, bytes,
+ exception, &write_emultor);
+}
+
+#define emulator_try_cmpxchg_user(t, ptr, old, new) \
+ (__try_cmpxchg_user((t __user *)(ptr), (t *)(old), *(t *)(new), efault ## t))
+
+static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr,
const void *old,
const void *new,
unsigned int bytes,
- struct kvm_vcpu *vcpu)
+ struct x86_exception *exception)
{
- static int reported;
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ u64 page_line_mask;
+ unsigned long hva;
+ gpa_t gpa;
+ int r;
- if (!reported) {
- reported = 1;
- printk(KERN_WARNING "kvm: emulating exchange as write\n");
- }
-#ifndef CONFIG_X86_64
	/* a guest's cmpxchg8b has to be emulated atomically */
- if (bytes == 8) {
- gpa_t gpa;
- struct page *page;
- char *kaddr;
- u64 val;
+ if (bytes > 8 || (bytes & (bytes - 1)))
+ goto emul_write;
- gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
- if (gpa == UNMAPPED_GVA ||
- (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
- goto emul_write;
+ if (gpa == INVALID_GPA ||
+ (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+ goto emul_write;
- if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
- goto emul_write;
+ /*
+ * Emulate the atomic as a straight write to avoid #AC if SLD is
+ * enabled in the host and the access splits a cache line.
+ */
+ if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
+ page_line_mask = ~(cache_line_size() - 1);
+ else
+ page_line_mask = PAGE_MASK;
- val = *(u64 *)new;
+ if (((gpa + bytes - 1) & page_line_mask) != (gpa & page_line_mask))
+ goto emul_write;
- page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+ hva = kvm_vcpu_gfn_to_hva(vcpu, gpa_to_gfn(gpa));
+ if (kvm_is_error_hva(hva))
+ goto emul_write;
- kaddr = kmap_atomic(page, KM_USER0);
- set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
- kunmap_atomic(kaddr, KM_USER0);
- kvm_release_page_dirty(page);
+ hva += offset_in_page(gpa);
+
+ switch (bytes) {
+ case 1:
+ r = emulator_try_cmpxchg_user(u8, hva, old, new);
+ break;
+ case 2:
+ r = emulator_try_cmpxchg_user(u16, hva, old, new);
+ break;
+ case 4:
+ r = emulator_try_cmpxchg_user(u32, hva, old, new);
+ break;
+ case 8:
+ r = emulator_try_cmpxchg_user(u64, hva, old, new);
+ break;
+ default:
+ BUG();
}
+
+ if (r < 0)
+ return X86EMUL_UNHANDLEABLE;
+
+ /*
+ * Mark the page dirty _before_ checking whether or not the CMPXCHG was
+ * successful, as the old value is written back on failure. Note, for
+ * live migration, this is unnecessarily conservative as CMPXCHG writes
+ * back the original value and the access is atomic, but KVM's ABI is
+ * that all writes are dirty logged, regardless of the value written.
+ */
+ kvm_vcpu_mark_page_dirty(vcpu, gpa_to_gfn(gpa));
+
+ if (r)
+ return X86EMUL_CMPXCHG_FAILED;
+
+ kvm_page_track_write(vcpu, gpa, new, bytes);
+
+ return X86EMUL_CONTINUE;
+
emul_write:
-#endif
+ pr_warn_once("emulating exchange as write\n");
+
+ return emulator_write_emulated(ctxt, addr, new, bytes, exception);
+}
+
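The size dispatch above performs the compare-and-swap directly on the guest page's host mapping via __try_cmpxchg_user(). As a rough userspace analogue of the semantics being emulated (illustration only, using compiler builtins rather than the kernel helper):

#include <stdbool.h>
#include <stdint.h>

static bool cmpxchg_u64(uint64_t *ptr, uint64_t *old, uint64_t new)
{
	/*
	 * On failure the current value is written back into *old, mirroring
	 * how CMPXCHG leaves the observed memory value in RAX.
	 */
	return __atomic_compare_exchange_n(ptr, old, new, false,
					   __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}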
+static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
+ unsigned short port, void *data,
+ unsigned int count, bool in)
+{
+ unsigned i;
+ int r;
- return emulator_write_emulated(addr, new, bytes, vcpu);
+ WARN_ON_ONCE(vcpu->arch.pio.count);
+ for (i = 0; i < count; i++) {
+ if (in)
+ r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, port, size, data);
+ else
+ r = kvm_io_bus_write(vcpu, KVM_PIO_BUS, port, size, data);
+
+ if (r) {
+ if (i == 0)
+ goto userspace_io;
+
+ /*
+ * Userspace must have unregistered the device while PIO
+ * was running. Drop writes / read as 0.
+ */
+ if (in)
+ memset(data, 0, size * (count - i));
+ break;
+ }
+
+ data += size;
+ }
+ return 1;
+
+userspace_io:
+ vcpu->arch.pio.port = port;
+ vcpu->arch.pio.in = in;
+ vcpu->arch.pio.count = count;
+ vcpu->arch.pio.size = size;
+
+ if (in)
+ memset(vcpu->arch.pio_data, 0, size * count);
+ else
+ memcpy(vcpu->arch.pio_data, data, size * count);
+
+ vcpu->run->exit_reason = KVM_EXIT_IO;
+ vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
+ vcpu->run->io.size = size;
+ vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
+ vcpu->run->io.count = count;
+ vcpu->run->io.port = port;
+ return 0;
+}
+
+static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
+ unsigned short port, void *val, unsigned int count)
+{
+ int r = emulator_pio_in_out(vcpu, size, port, val, count, true);
+ if (r)
+ trace_kvm_pio(KVM_PIO_IN, port, size, count, val);
+
+ return r;
+}
+
+static void complete_emulator_pio_in(struct kvm_vcpu *vcpu, void *val)
+{
+ int size = vcpu->arch.pio.size;
+ unsigned int count = vcpu->arch.pio.count;
+ memcpy(val, vcpu->arch.pio_data, size * count);
+ trace_kvm_pio(KVM_PIO_IN, vcpu->arch.pio.port, size, count, vcpu->arch.pio_data);
+ vcpu->arch.pio.count = 0;
+}
+
+static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
+ int size, unsigned short port, void *val,
+ unsigned int count)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ if (vcpu->arch.pio.count) {
+ /*
+ * Complete a previous iteration that required userspace I/O.
+ * Note, @count isn't guaranteed to match pio.count as userspace
+ * can modify ECX before rerunning the vCPU. Ignore any such
+ * shenanigans as KVM doesn't support modifying the rep count,
+ * and the emulator ensures @count doesn't overflow the buffer.
+ */
+ complete_emulator_pio_in(vcpu, val);
+ return 1;
+ }
+
+ return emulator_pio_in(vcpu, size, port, val, count);
+}
+
+static int emulator_pio_out(struct kvm_vcpu *vcpu, int size,
+ unsigned short port, const void *val,
+ unsigned int count)
+{
+ trace_kvm_pio(KVM_PIO_OUT, port, size, count, val);
+ return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
+}
+
+static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
+ int size, unsigned short port,
+ const void *val, unsigned int count)
+{
+ return emulator_pio_out(emul_to_vcpu(ctxt), size, port, val, count);
}
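When emulator_pio_in_out() above cannot complete the access in the kernel, it fills vcpu->run with a KVM_EXIT_IO exit and stashes the data at io.data_offset. A hedged sketch of the userspace side, where handle_in()/handle_out() stand in for the VMM's port handlers:

#include <stdint.h>
#include <linux/kvm.h>

extern void handle_out(uint16_t port, const void *data, int size);
extern void handle_in(uint16_t port, void *data, int size);

static void handle_io_exit(struct kvm_run *run)
{
	uint8_t *data = (uint8_t *)run + run->io.data_offset;
	uint32_t i;

	/* The data area holds io.count consecutive items of io.size bytes. */
	for (i = 0; i < run->io.count; i++, data += run->io.size) {
		if (run->io.direction == KVM_EXIT_IO_OUT)
			handle_out(run->io.port, data, run->io.size);
		else
			handle_in(run->io.port, data, run->io.size);
	}
}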
static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
- return kvm_x86_ops->get_segment_base(vcpu, seg);
+ return kvm_x86_call(get_segment_base)(vcpu, seg);
}
-int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
+static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
{
- kvm_mmu_invlpg(vcpu, address);
- return X86EMUL_CONTINUE;
+ kvm_mmu_invlpg(emul_to_vcpu(ctxt), address);
}
-int emulate_clts(struct kvm_vcpu *vcpu)
+static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
{
- KVMTRACE_0D(CLTS, vcpu, handler);
- kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
+ if (!need_emulate_wbinvd(vcpu))
+ return X86EMUL_CONTINUE;
+
+ if (kvm_x86_call(has_wbinvd_exit)()) {
+ int cpu = get_cpu();
+
+ cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
+ wbinvd_on_cpus_mask(vcpu->arch.wbinvd_dirty_mask);
+ put_cpu();
+ cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
+ } else
+ wbinvd();
return X86EMUL_CONTINUE;
}
-int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
+int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu *vcpu = ctxt->vcpu;
+ kvm_emulate_wbinvd_noskip(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_wbinvd);
- switch (dr) {
- case 0 ... 3:
- *dest = kvm_x86_ops->get_dr(vcpu, dr);
- return X86EMUL_CONTINUE;
+
+
+static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
+{
+ kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt));
+}
+
+static unsigned long emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr)
+{
+ return kvm_get_dr(emul_to_vcpu(ctxt), dr);
+}
+
+static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
+ unsigned long value)
+{
+
+ return kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
+}
+
+static u64 mk_cr_64(u64 curr_cr, u32 new_val)
+{
+ return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
+}
+
+static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ unsigned long value;
+
+ switch (cr) {
+ case 0:
+ value = kvm_read_cr0(vcpu);
+ break;
+ case 2:
+ value = vcpu->arch.cr2;
+ break;
+ case 3:
+ value = kvm_read_cr3(vcpu);
+ break;
+ case 4:
+ value = kvm_read_cr4(vcpu);
+ break;
+ case 8:
+ value = kvm_get_cr8(vcpu);
+ break;
default:
- pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr);
+ kvm_err("%s: unexpected cr %u\n", __func__, cr);
+ return 0;
+ }
+
+ return value;
+}
+
+static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ int res = 0;
+
+ switch (cr) {
+ case 0:
+ res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
+ break;
+ case 2:
+ vcpu->arch.cr2 = val;
+ break;
+ case 3:
+ res = kvm_set_cr3(vcpu, val);
+ break;
+ case 4:
+ res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
+ break;
+ case 8:
+ res = kvm_set_cr8(vcpu, val);
+ break;
+ default:
+ kvm_err("%s: unexpected cr %u\n", __func__, cr);
+ res = -1;
+ }
+
+ return res;
+}
+
+static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
+{
+ return kvm_x86_call(get_cpl)(emul_to_vcpu(ctxt));
+}
+
+static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
+{
+ kvm_x86_call(get_gdt)(emul_to_vcpu(ctxt), dt);
+}
+
+static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
+{
+ kvm_x86_call(get_idt)(emul_to_vcpu(ctxt), dt);
+}
+
+static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
+{
+ kvm_x86_call(set_gdt)(emul_to_vcpu(ctxt), dt);
+}
+
+static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
+{
+ kvm_x86_call(set_idt)(emul_to_vcpu(ctxt), dt);
+}
+
+static unsigned long emulator_get_cached_segment_base(
+ struct x86_emulate_ctxt *ctxt, int seg)
+{
+ return get_segment_base(emul_to_vcpu(ctxt), seg);
+}
+
+static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector,
+ struct desc_struct *desc, u32 *base3,
+ int seg)
+{
+ struct kvm_segment var;
+
+ kvm_get_segment(emul_to_vcpu(ctxt), &var, seg);
+ *selector = var.selector;
+
+ if (var.unusable) {
+ memset(desc, 0, sizeof(*desc));
+ if (base3)
+ *base3 = 0;
+ return false;
+ }
+
+ if (var.g)
+ var.limit >>= 12;
+ set_desc_limit(desc, var.limit);
+ set_desc_base(desc, (unsigned long)var.base);
+#ifdef CONFIG_X86_64
+ if (base3)
+ *base3 = var.base >> 32;
+#endif
+ desc->type = var.type;
+ desc->s = var.s;
+ desc->dpl = var.dpl;
+ desc->p = var.present;
+ desc->avl = var.avl;
+ desc->l = var.l;
+ desc->d = var.db;
+ desc->g = var.g;
+
+ return true;
+}
+
+static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
+ struct desc_struct *desc, u32 base3,
+ int seg)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ struct kvm_segment var;
+
+ var.selector = selector;
+ var.base = get_desc_base(desc);
+#ifdef CONFIG_X86_64
+ var.base |= ((u64)base3) << 32;
+#endif
+ var.limit = get_desc_limit(desc);
+ if (desc->g)
+ var.limit = (var.limit << 12) | 0xfff;
+ var.type = desc->type;
+ var.dpl = desc->dpl;
+ var.db = desc->d;
+ var.s = desc->s;
+ var.l = desc->l;
+ var.g = desc->g;
+ var.avl = desc->avl;
+ var.present = desc->p;
+ var.unusable = !var.present;
+ var.padding = 0;
+
+ kvm_set_segment(vcpu, &var, seg);
+ return;
+}
+
+static int emulator_get_msr_with_filter(struct x86_emulate_ctxt *ctxt,
+ u32 msr_index, u64 *pdata)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ int r;
+
+ r = kvm_emulate_msr_read(vcpu, msr_index, pdata);
+ if (r < 0)
return X86EMUL_UNHANDLEABLE;
+
+ if (r) {
+ if (kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_RDMSR, 0,
+ complete_emulated_rdmsr, r))
+ return X86EMUL_IO_NEEDED;
+
+ trace_kvm_msr_read_ex(msr_index);
+ return X86EMUL_PROPAGATE_FAULT;
}
+
+ trace_kvm_msr_read(msr_index, *pdata);
+ return X86EMUL_CONTINUE;
}
-int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
+static int emulator_set_msr_with_filter(struct x86_emulate_ctxt *ctxt,
+ u32 msr_index, u64 data)
{
- unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
- int exception;
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ int r;
- kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
- if (exception) {
- /* FIXME: better handling */
+ r = kvm_emulate_msr_write(vcpu, msr_index, data);
+ if (r < 0)
return X86EMUL_UNHANDLEABLE;
+
+ if (r) {
+ if (kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_WRMSR, data,
+ complete_emulated_msr_access, r))
+ return X86EMUL_IO_NEEDED;
+
+ trace_kvm_msr_write_ex(msr_index, data);
+ return X86EMUL_PROPAGATE_FAULT;
}
+
+ trace_kvm_msr_write(msr_index, data);
return X86EMUL_CONTINUE;
}
-void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
+static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
+ u32 msr_index, u64 *pdata)
{
- u8 opcodes[4];
- unsigned long rip = kvm_rip_read(vcpu);
- unsigned long rip_linear;
+ /*
+ * Treat emulator accesses to the current shadow stack pointer as host-
+	 * initiated, as they aren't true MSR accesses (SSP is "just a reg"),
+ * and this API is used only for implicit accesses, i.e. not RDMSR, and
+ * so the index is fully KVM-controlled.
+ */
+ if (unlikely(msr_index == MSR_KVM_INTERNAL_GUEST_SSP))
+ return kvm_msr_read(emul_to_vcpu(ctxt), msr_index, pdata);
- if (!printk_ratelimit())
- return;
+ return __kvm_emulate_msr_read(emul_to_vcpu(ctxt), msr_index, pdata);
+}
- rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
+static int emulator_check_rdpmc_early(struct x86_emulate_ctxt *ctxt, u32 pmc)
+{
+ return kvm_pmu_check_rdpmc_early(emul_to_vcpu(ctxt), pmc);
+}
- kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu);
+static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
+ u32 pmc, u64 *pdata)
+{
+ return kvm_pmu_rdpmc(emul_to_vcpu(ctxt), pmc, pdata);
+}
+
+static void emulator_halt(struct x86_emulate_ctxt *ctxt)
+{
+ emul_to_vcpu(ctxt)->arch.halt_request = 1;
+}
+
+static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage)
+{
+ return kvm_x86_call(check_intercept)(emul_to_vcpu(ctxt), info, stage,
+ &ctxt->exception);
+}
+
+static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
+ u32 *eax, u32 *ebx, u32 *ecx, u32 *edx,
+ bool exact_only)
+{
+ return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, exact_only);
+}
+
+static bool emulator_guest_has_movbe(struct x86_emulate_ctxt *ctxt)
+{
+ return guest_cpu_cap_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE);
+}
+
+static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt)
+{
+ return guest_cpu_cap_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR);
+}
+
+static bool emulator_guest_has_rdpid(struct x86_emulate_ctxt *ctxt)
+{
+ return guest_cpu_cap_has(emul_to_vcpu(ctxt), X86_FEATURE_RDPID);
+}
+
+static bool emulator_guest_cpuid_is_intel_compatible(struct x86_emulate_ctxt *ctxt)
+{
+ return guest_cpuid_is_intel_compatible(emul_to_vcpu(ctxt));
+}
+
+static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
+{
+ return kvm_register_read_raw(emul_to_vcpu(ctxt), reg);
+}
+
+static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val)
+{
+ kvm_register_write_raw(emul_to_vcpu(ctxt), reg, val);
+}
+
+static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
+{
+ kvm_x86_call(set_nmi_mask)(emul_to_vcpu(ctxt), masked);
+}
+
+static bool emulator_is_smm(struct x86_emulate_ctxt *ctxt)
+{
+ return is_smm(emul_to_vcpu(ctxt));
+}
+
+#ifndef CONFIG_KVM_SMM
+static int emulator_leave_smm(struct x86_emulate_ctxt *ctxt)
+{
+ WARN_ON_ONCE(1);
+ return X86EMUL_UNHANDLEABLE;
+}
+#endif
+
+static void emulator_triple_fault(struct x86_emulate_ctxt *ctxt)
+{
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, emul_to_vcpu(ctxt));
+}
- printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
- context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
+static int emulator_get_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 *xcr)
+{
+ if (index != XCR_XFEATURE_ENABLED_MASK)
+ return 1;
+ *xcr = emul_to_vcpu(ctxt)->arch.xcr0;
+ return 0;
+}
+
+static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr)
+{
+ return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr);
}
-EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
-static struct x86_emulate_ops emulate_ops = {
- .read_std = kvm_read_guest_virt,
+static void emulator_vm_bugged(struct x86_emulate_ctxt *ctxt)
+{
+ struct kvm *kvm = emul_to_vcpu(ctxt)->kvm;
+
+ if (!kvm->vm_bugged)
+ kvm_vm_bugged(kvm);
+}
+
+static gva_t emulator_get_untagged_addr(struct x86_emulate_ctxt *ctxt,
+ gva_t addr, unsigned int flags)
+{
+ if (!kvm_x86_ops.get_untagged_addr)
+ return addr;
+
+ return kvm_x86_call(get_untagged_addr)(emul_to_vcpu(ctxt),
+ addr, flags);
+}
+
+static bool emulator_is_canonical_addr(struct x86_emulate_ctxt *ctxt,
+ gva_t addr, unsigned int flags)
+{
+ return !is_noncanonical_address(addr, emul_to_vcpu(ctxt), flags);
+}
+
+static const struct x86_emulate_ops emulate_ops = {
+ .vm_bugged = emulator_vm_bugged,
+ .read_gpr = emulator_read_gpr,
+ .write_gpr = emulator_write_gpr,
+ .read_std = emulator_read_std,
+ .write_std = emulator_write_std,
+ .fetch = kvm_fetch_guest_virt,
.read_emulated = emulator_read_emulated,
.write_emulated = emulator_write_emulated,
.cmpxchg_emulated = emulator_cmpxchg_emulated,
+ .invlpg = emulator_invlpg,
+ .pio_in_emulated = emulator_pio_in_emulated,
+ .pio_out_emulated = emulator_pio_out_emulated,
+ .get_segment = emulator_get_segment,
+ .set_segment = emulator_set_segment,
+ .get_cached_segment_base = emulator_get_cached_segment_base,
+ .get_gdt = emulator_get_gdt,
+ .get_idt = emulator_get_idt,
+ .set_gdt = emulator_set_gdt,
+ .set_idt = emulator_set_idt,
+ .get_cr = emulator_get_cr,
+ .set_cr = emulator_set_cr,
+ .cpl = emulator_get_cpl,
+ .get_dr = emulator_get_dr,
+ .set_dr = emulator_set_dr,
+ .set_msr_with_filter = emulator_set_msr_with_filter,
+ .get_msr_with_filter = emulator_get_msr_with_filter,
+ .get_msr = emulator_get_msr,
+ .check_rdpmc_early = emulator_check_rdpmc_early,
+ .read_pmc = emulator_read_pmc,
+ .halt = emulator_halt,
+ .wbinvd = emulator_wbinvd,
+ .fix_hypercall = emulator_fix_hypercall,
+ .intercept = emulator_intercept,
+ .get_cpuid = emulator_get_cpuid,
+ .guest_has_movbe = emulator_guest_has_movbe,
+ .guest_has_fxsr = emulator_guest_has_fxsr,
+ .guest_has_rdpid = emulator_guest_has_rdpid,
+ .guest_cpuid_is_intel_compatible = emulator_guest_cpuid_is_intel_compatible,
+ .set_nmi_mask = emulator_set_nmi_mask,
+ .is_smm = emulator_is_smm,
+ .leave_smm = emulator_leave_smm,
+ .triple_fault = emulator_triple_fault,
+ .get_xcr = emulator_get_xcr,
+ .set_xcr = emulator_set_xcr,
+ .get_untagged_addr = emulator_get_untagged_addr,
+ .is_canonical_addr = emulator_is_canonical_addr,
};
-static void cache_all_regs(struct kvm_vcpu *vcpu)
+static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
{
- kvm_register_read(vcpu, VCPU_REGS_RAX);
- kvm_register_read(vcpu, VCPU_REGS_RSP);
- kvm_register_read(vcpu, VCPU_REGS_RIP);
- vcpu->arch.regs_dirty = ~0;
+ u32 int_shadow = kvm_x86_call(get_interrupt_shadow)(vcpu);
+ /*
+	 * An sti; sti; sequence only disables interrupts for the first
+	 * instruction. So, if the last instruction, be it emulated or
+	 * not, left the system with the INT_STI flag enabled, it
+	 * means that the last instruction was an sti. We should not
+	 * leave the flag on in this case. The same goes for mov ss.
+ */
+ if (int_shadow & mask)
+ mask = 0;
+ if (unlikely(int_shadow || mask)) {
+ kvm_x86_call(set_interrupt_shadow)(vcpu, mask);
+ if (!mask)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ }
}
-int emulate_instruction(struct kvm_vcpu *vcpu,
- struct kvm_run *run,
- unsigned long cr2,
- u16 error_code,
- int emulation_type)
+static void inject_emulated_exception(struct kvm_vcpu *vcpu)
{
- int r, shadow_mask;
- struct decode_cache *c;
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+
+ if (ctxt->exception.vector == PF_VECTOR)
+ kvm_inject_emulated_page_fault(vcpu, &ctxt->exception);
+ else if (ctxt->exception.error_code_valid)
+ kvm_queue_exception_e(vcpu, ctxt->exception.vector,
+ ctxt->exception.error_code);
+ else
+ kvm_queue_exception(vcpu, ctxt->exception.vector);
+}
+
+static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu)
+{
+ struct x86_emulate_ctxt *ctxt;
+
+ ctxt = kmem_cache_zalloc(x86_emulator_cache, GFP_KERNEL_ACCOUNT);
+ if (!ctxt) {
+ pr_err("failed to allocate vcpu's emulator\n");
+ return NULL;
+ }
+
+ ctxt->vcpu = vcpu;
+ ctxt->ops = &emulate_ops;
+ vcpu->arch.emulate_ctxt = ctxt;
+
+ return ctxt;
+}
+
+static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
+{
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+ int cs_db, cs_l;
+
+ kvm_x86_call(get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
+
+ ctxt->gpa_available = false;
+ ctxt->eflags = kvm_get_rflags(vcpu);
+ ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
+
+ ctxt->eip = kvm_rip_read(vcpu);
+ ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
+ (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 :
+ (cs_l && is_long_mode(vcpu)) ? X86EMUL_MODE_PROT64 :
+ cs_db ? X86EMUL_MODE_PROT32 :
+ X86EMUL_MODE_PROT16;
+ ctxt->interruptibility = 0;
+ ctxt->have_exception = false;
+ ctxt->exception.vector = -1;
+ ctxt->perm_ok = false;
+
+ init_decode_cache(ctxt);
+ vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
+}
+
+void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
+{
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+ int ret;
+
+ init_emulate_ctxt(vcpu);
+
+ ctxt->op_bytes = 2;
+ ctxt->ad_bytes = 2;
+ ctxt->_eip = ctxt->eip + inc_eip;
+ ret = emulate_int_real(ctxt, irq);
+
+ if (ret != X86EMUL_CONTINUE) {
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ } else {
+ ctxt->eip = ctxt->_eip;
+ kvm_rip_write(vcpu, ctxt->eip);
+ kvm_set_rflags(vcpu, ctxt->eflags);
+ }
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_inject_realmode_interrupt);
+
+static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data,
+ u8 ndata, u8 *insn_bytes, u8 insn_size)
+{
+ struct kvm_run *run = vcpu->run;
+ u64 info[5];
+ u8 info_start;
- kvm_clear_exception_queue(vcpu);
- vcpu->arch.mmio_fault_cr2 = cr2;
/*
- * TODO: fix x86_emulate.c to use guest_read/write_register
- * instead of direct ->regs accesses, can save hundred cycles
- * on Intel for instructions that don't read/change RSP, for
- * for example.
+ * Zero the whole array used to retrieve the exit info, as casting to
+ * u32 for select entries will leave some chunks uninitialized.
*/
- cache_all_regs(vcpu);
+ memset(&info, 0, sizeof(info));
- vcpu->mmio_is_write = 0;
- vcpu->arch.pio.string = 0;
+ kvm_x86_call(get_exit_info)(vcpu, (u32 *)&info[0], &info[1], &info[2],
+ (u32 *)&info[3], (u32 *)&info[4]);
- if (!(emulation_type & EMULTYPE_NO_DECODE)) {
- int cs_db, cs_l;
- kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
-
- vcpu->arch.emulate_ctxt.vcpu = vcpu;
- vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
- vcpu->arch.emulate_ctxt.mode =
- (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
- ? X86EMUL_MODE_REAL : cs_l
- ? X86EMUL_MODE_PROT64 : cs_db
- ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
-
- r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
-
- /* Reject the instructions other than VMCALL/VMMCALL when
- * try to emulate invalid opcode */
- c = &vcpu->arch.emulate_ctxt.decode;
- if ((emulation_type & EMULTYPE_TRAP_UD) &&
- (!(c->twobyte && c->b == 0x01 &&
- (c->modrm_reg == 0 || c->modrm_reg == 3) &&
- c->modrm_mod == 3 && c->modrm_rm == 1)))
- return EMULATE_FAIL;
-
- ++vcpu->stat.insn_emulation;
- if (r) {
- ++vcpu->stat.insn_emulation_fail;
- if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
- return EMULATE_DONE;
- return EMULATE_FAIL;
- }
- }
+ run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ run->emulation_failure.suberror = KVM_INTERNAL_ERROR_EMULATION;
- if (emulation_type & EMULTYPE_SKIP) {
- kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip);
- return EMULATE_DONE;
+ /*
+ * There's currently space for 13 entries, but 5 are used for the exit
+ * reason and info. Restrict to 4 to reduce the maintenance burden
+ * when expanding kvm_run.emulation_failure in the future.
+ */
+ if (WARN_ON_ONCE(ndata > 4))
+ ndata = 4;
+
+ /* Always include the flags as a 'data' entry. */
+ info_start = 1;
+ run->emulation_failure.flags = 0;
+
+ if (insn_size) {
+ BUILD_BUG_ON((sizeof(run->emulation_failure.insn_size) +
+ sizeof(run->emulation_failure.insn_bytes) != 16));
+ info_start += 2;
+ run->emulation_failure.flags |=
+ KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES;
+ run->emulation_failure.insn_size = insn_size;
+ memset(run->emulation_failure.insn_bytes, 0x90,
+ sizeof(run->emulation_failure.insn_bytes));
+ memcpy(run->emulation_failure.insn_bytes, insn_bytes, insn_size);
}
- r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
- shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
+ memcpy(&run->internal.data[info_start], info, sizeof(info));
+ memcpy(&run->internal.data[info_start + ARRAY_SIZE(info)], data,
+ ndata * sizeof(data[0]));
- if (r == 0)
- kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
+ run->emulation_failure.ndata = info_start + ARRAY_SIZE(info) + ndata;
+}
- if (vcpu->arch.pio.string)
- return EMULATE_DO_MMIO;
+static void prepare_emulation_ctxt_failure_exit(struct kvm_vcpu *vcpu)
+{
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
- if ((r || vcpu->mmio_is_write) && run) {
- run->exit_reason = KVM_EXIT_MMIO;
- run->mmio.phys_addr = vcpu->mmio_phys_addr;
- memcpy(run->mmio.data, vcpu->mmio_data, 8);
- run->mmio.len = vcpu->mmio_size;
- run->mmio.is_write = vcpu->mmio_is_write;
+ prepare_emulation_failure_exit(vcpu, NULL, 0, ctxt->fetch.data,
+ ctxt->fetch.end - ctxt->fetch.data);
+}
+
+void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data,
+ u8 ndata)
+{
+ prepare_emulation_failure_exit(vcpu, data, ndata, NULL, 0);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_prepare_emulation_failure_exit);
+
+void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu)
+{
+ __kvm_prepare_emulation_failure_exit(vcpu, NULL, 0);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_prepare_emulation_failure_exit);
+
+void kvm_prepare_event_vectoring_exit(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+ u32 reason, intr_info, error_code;
+ struct kvm_run *run = vcpu->run;
+ u64 info1, info2;
+ int ndata = 0;
+
+ kvm_x86_call(get_exit_info)(vcpu, &reason, &info1, &info2,
+ &intr_info, &error_code);
+
+ run->internal.data[ndata++] = info2;
+ run->internal.data[ndata++] = reason;
+ run->internal.data[ndata++] = info1;
+ run->internal.data[ndata++] = gpa;
+ run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu;
+
+ run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
+ run->internal.ndata = ndata;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_prepare_event_vectoring_exit);
+
+void kvm_prepare_unexpected_reason_exit(struct kvm_vcpu *vcpu, u64 exit_reason)
+{
+ vcpu_unimpl(vcpu, "unexpected exit reason 0x%llx\n", exit_reason);
+
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
+ vcpu->run->internal.ndata = 2;
+ vcpu->run->internal.data[0] = exit_reason;
+ vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_prepare_unexpected_reason_exit);
+
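All of these helpers report failure to userspace as KVM_EXIT_INTERNAL_ERROR with a suberror and a handful of data words. A minimal sketch of how a VMM might dump such an exit, with field names per struct kvm_run:

#include <stdio.h>
#include <linux/kvm.h>

static void dump_internal_error(const struct kvm_run *run)
{
	__u32 i;

	fprintf(stderr, "internal error: suberror %u, ndata %u\n",
		run->internal.suberror, run->internal.ndata);
	for (i = 0; i < run->internal.ndata; i++)
		fprintf(stderr, "  data[%u] = 0x%llx\n", i,
			(unsigned long long)run->internal.data[i]);
}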
+static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ ++vcpu->stat.insn_emulation_fail;
+ trace_kvm_emulate_insn_failed(vcpu);
+
+ if (emulation_type & EMULTYPE_VMWARE_GP) {
+ kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+ return 1;
}
- if (r) {
- if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
- return EMULATE_DONE;
- if (!vcpu->mmio_needed) {
- kvm_report_emulation_failure(vcpu, "mmio");
- return EMULATE_FAIL;
- }
- return EMULATE_DO_MMIO;
+ if (kvm->arch.exit_on_emulation_error ||
+ (emulation_type & EMULTYPE_SKIP)) {
+ prepare_emulation_ctxt_failure_exit(vcpu);
+ return 0;
}
- kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+ kvm_queue_exception(vcpu, UD_VECTOR);
- if (vcpu->mmio_is_write) {
- vcpu->mmio_needed = 0;
- return EMULATE_DO_MMIO;
+ if (!is_guest_mode(vcpu) && kvm_x86_call(get_cpl)(vcpu) == 0) {
+ prepare_emulation_ctxt_failure_exit(vcpu);
+ return 0;
}
- return EMULATE_DONE;
+ return 1;
}
-EXPORT_SYMBOL_GPL(emulate_instruction);
-static int pio_copy_data(struct kvm_vcpu *vcpu)
+static bool kvm_unprotect_and_retry_on_failure(struct kvm_vcpu *vcpu,
+ gpa_t cr2_or_gpa,
+ int emulation_type)
{
- void *p = vcpu->arch.pio_data;
- gva_t q = vcpu->arch.pio.guest_gva;
- unsigned bytes;
- int ret;
+ if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
+ return false;
- bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
- if (vcpu->arch.pio.in)
- ret = kvm_write_guest_virt(q, p, bytes, vcpu);
- else
- ret = kvm_read_guest_virt(q, p, bytes, vcpu);
- return ret;
+ /*
+ * If the failed instruction faulted on an access to page tables that
+ * are used to translate any part of the instruction, KVM can't resolve
+ * the issue by unprotecting the gfn, as zapping the shadow page will
+ * result in the instruction taking a !PRESENT page fault and thus put
+ * the vCPU into an infinite loop of page faults. E.g. KVM will create
+ * a SPTE and write-protect the gfn to resolve the !PRESENT fault, and
+ * then zap the SPTE to unprotect the gfn, and then do it all over
+ * again. Report the error to userspace.
+ */
+ if (emulation_type & EMULTYPE_WRITE_PF_TO_SP)
+ return false;
+
+ /*
+ * If emulation may have been triggered by a write to a shadowed page
+ * table, unprotect the gfn (zap any relevant SPTEs) and re-enter the
+ * guest to let the CPU re-execute the instruction in the hope that the
+ * CPU can cleanly execute the instruction that KVM failed to emulate.
+ */
+ __kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa, true);
+
+ /*
+ * Retry even if _this_ vCPU didn't unprotect the gfn, as it's possible
+ * all SPTEs were already zapped by a different task. The alternative
+ * is to report the error to userspace and likely terminate the guest,
+ * and the last_retry_{eip,addr} checks will prevent retrying the page
+ * fault indefinitely, i.e. there's nothing to lose by retrying.
+ */
+ return true;
+}
+
+static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
+static int complete_emulated_pio(struct kvm_vcpu *vcpu);
+
+static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
+ unsigned long *db)
+{
+ u32 dr6 = 0;
+ int i;
+ u32 enable, rwlen;
+
+ enable = dr7;
+ rwlen = dr7 >> 16;
+ for (i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4)
+ if ((enable & 3) && (rwlen & 15) == type && db[i] == addr)
+ dr6 |= (1 << i);
+ return dr6;
}
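The loop above walks DR7 two enable bits and one four-bit type/length nibble per breakpoint slot (enables in bits 0-7, type/length in bits 16-31). For illustration, a hypothetical helper that encodes a DR7 value the decoder would match for slot n:

#include <stdint.h>

static uint32_t dr7_bp(unsigned int n, unsigned int type_len)
{
	/*
	 * Local-enable bit for slot n plus its R/W+LEN nibble; type_len == 0
	 * is an execute breakpoint, which is the type kvm_vcpu_check_hw_bp()
	 * is asked to match for code #DBs.
	 */
	return (1u << (2 * n)) | ((type_len & 0xfu) << (16 + 4 * n));
}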
-int complete_pio(struct kvm_vcpu *vcpu)
+static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu)
{
- struct kvm_pio_request *io = &vcpu->arch.pio;
- long delta;
+ struct kvm_run *kvm_run = vcpu->run;
+
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
+ kvm_run->debug.arch.dr6 = DR6_BS | DR6_ACTIVE_LOW;
+ kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
+ kvm_run->debug.arch.exception = DB_VECTOR;
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ return 0;
+ }
+ kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS);
+ return 1;
+}
+
+int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
+{
+ unsigned long rflags = kvm_x86_call(get_rflags)(vcpu);
int r;
- unsigned long val;
- if (!io->string) {
- if (io->in) {
- val = kvm_register_read(vcpu, VCPU_REGS_RAX);
- memcpy(&val, vcpu->arch.pio_data, io->size);
- kvm_register_write(vcpu, VCPU_REGS_RAX, val);
- }
- } else {
- if (io->in) {
- r = pio_copy_data(vcpu);
- if (r)
- return r;
- }
+ r = kvm_x86_call(skip_emulated_instruction)(vcpu);
+ if (unlikely(!r))
+ return 0;
- delta = 1;
- if (io->rep) {
- delta *= io->cur_count;
- /*
- * The size of the register should really depend on
- * current address size.
- */
- val = kvm_register_read(vcpu, VCPU_REGS_RCX);
- val -= delta;
- kvm_register_write(vcpu, VCPU_REGS_RCX, val);
- }
- if (io->down)
- delta = -delta;
- delta *= io->size;
- if (io->in) {
- val = kvm_register_read(vcpu, VCPU_REGS_RDI);
- val += delta;
- kvm_register_write(vcpu, VCPU_REGS_RDI, val);
- } else {
- val = kvm_register_read(vcpu, VCPU_REGS_RSI);
- val += delta;
- kvm_register_write(vcpu, VCPU_REGS_RSI, val);
+ kvm_pmu_instruction_retired(vcpu);
+
+ /*
+ * rflags is the old, "raw" value of the flags. The new value has
+ * not been saved yet.
+ *
+ * This is correct even for TF set by the guest, because "the
+ * processor will not generate this exception after the instruction
+ * that sets the TF flag".
+ */
+ if (unlikely(rflags & X86_EFLAGS_TF))
+ r = kvm_vcpu_do_singlestep(vcpu);
+ return r;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_skip_emulated_instruction);
+
+static bool kvm_is_code_breakpoint_inhibited(struct kvm_vcpu *vcpu)
+{
+ if (kvm_get_rflags(vcpu) & X86_EFLAGS_RF)
+ return true;
+
+ /*
+ * Intel compatible CPUs inhibit code #DBs when MOV/POP SS blocking is
+ * active, but AMD compatible CPUs do not.
+ */
+ if (!guest_cpuid_is_intel_compatible(vcpu))
+ return false;
+
+ return kvm_x86_call(get_interrupt_shadow)(vcpu) & KVM_X86_SHADOW_INT_MOV_SS;
+}
+
+static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu,
+ int emulation_type, int *r)
+{
+ WARN_ON_ONCE(emulation_type & EMULTYPE_NO_DECODE);
+
+ /*
+ * Do not check for code breakpoints if hardware has already done the
+ * checks, as inferred from the emulation type. On NO_DECODE and SKIP,
+ * the instruction has passed all exception checks, and all intercepted
+ * exceptions that trigger emulation have lower priority than code
+ * breakpoints, i.e. the fact that the intercepted exception occurred
+ * means any code breakpoints have already been serviced.
+ *
+ * Note, KVM needs to check for code #DBs on EMULTYPE_TRAP_UD_FORCED as
+ * hardware has checked the RIP of the magic prefix, but not the RIP of
+ * the instruction being emulated. The intent of forced emulation is
+ * to behave as if KVM intercepted the instruction without an exception
+ * and without a prefix.
+ */
+ if (emulation_type & (EMULTYPE_NO_DECODE | EMULTYPE_SKIP |
+ EMULTYPE_TRAP_UD | EMULTYPE_VMWARE_GP | EMULTYPE_PF))
+ return false;
+
+ if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
+ (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
+ struct kvm_run *kvm_run = vcpu->run;
+ unsigned long eip = kvm_get_linear_rip(vcpu);
+ u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
+ vcpu->arch.guest_debug_dr7,
+ vcpu->arch.eff_db);
+
+ if (dr6 != 0) {
+ kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW;
+ kvm_run->debug.arch.pc = eip;
+ kvm_run->debug.arch.exception = DB_VECTOR;
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ *r = 0;
+ return true;
}
}
- io->count -= io->cur_count;
- io->cur_count = 0;
+ if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) &&
+ !kvm_is_code_breakpoint_inhibited(vcpu)) {
+ unsigned long eip = kvm_get_linear_rip(vcpu);
+ u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
+ vcpu->arch.dr7,
+ vcpu->arch.db);
- return 0;
+ if (dr6 != 0) {
+ kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
+ *r = 1;
+ return true;
+ }
+ }
+
+ return false;
}
-static void kernel_pio(struct kvm_io_device *pio_dev,
- struct kvm_vcpu *vcpu,
- void *pd)
+static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
{
- /* TODO: String I/O for in kernel device */
+ switch (ctxt->opcode_len) {
+ case 1:
+ switch (ctxt->b) {
+ case 0xe4: /* IN */
+ case 0xe5:
+ case 0xec:
+ case 0xed:
+ case 0xe6: /* OUT */
+ case 0xe7:
+ case 0xee:
+ case 0xef:
+ case 0x6c: /* INS */
+ case 0x6d:
+ case 0x6e: /* OUTS */
+ case 0x6f:
+ return true;
+ }
+ break;
+ case 2:
+ switch (ctxt->b) {
+ case 0x33: /* RDPMC */
+ return true;
+ }
+ break;
+ }
- mutex_lock(&vcpu->kvm->lock);
- if (vcpu->arch.pio.in)
- kvm_iodevice_read(pio_dev, vcpu->arch.pio.port,
- vcpu->arch.pio.size,
- pd);
- else
- kvm_iodevice_write(pio_dev, vcpu->arch.pio.port,
- vcpu->arch.pio.size,
- pd);
- mutex_unlock(&vcpu->kvm->lock);
+ return false;
}
-static void pio_string_write(struct kvm_io_device *pio_dev,
- struct kvm_vcpu *vcpu)
+static bool is_soft_int_instruction(struct x86_emulate_ctxt *ctxt,
+ int emulation_type)
{
- struct kvm_pio_request *io = &vcpu->arch.pio;
- void *pd = vcpu->arch.pio_data;
- int i;
+ u8 vector = EMULTYPE_GET_SOFT_INT_VECTOR(emulation_type);
- mutex_lock(&vcpu->kvm->lock);
- for (i = 0; i < io->cur_count; i++) {
- kvm_iodevice_write(pio_dev, io->port,
- io->size,
- pd);
- pd += io->size;
+ switch (ctxt->b) {
+ case 0xcc:
+ return vector == BP_VECTOR;
+ case 0xcd:
+ return vector == ctxt->src.val;
+ case 0xce:
+ return vector == OF_VECTOR;
+ default:
+ return false;
}
- mutex_unlock(&vcpu->kvm->lock);
}
-static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
- gpa_t addr, int len,
- int is_write)
+/*
+ * Decode an instruction for emulation. The caller is responsible for handling
+ * code breakpoints. Note, manually detecting code breakpoints is unnecessary
+ * (and wrong) when emulating on an intercepted fault-like exception[*], as
+ * code breakpoints have higher priority and thus have already been checked by
+ * hardware.
+ *
+ * [*] Except #MC, which is higher priority, but KVM should never emulate in
+ * response to a machine check.
+ */
+int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
+ void *insn, int insn_len)
{
- return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr, len, is_write);
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+ int r;
+
+ init_emulate_ctxt(vcpu);
+
+ r = x86_decode_insn(ctxt, insn, insn_len, emulation_type);
+
+ trace_kvm_emulate_insn_start(vcpu);
+ ++vcpu->stat.insn_emulation;
+
+ return r;
}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(x86_decode_emulated_instruction);
-int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
- int size, unsigned port)
+int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ int emulation_type, void *insn, int insn_len)
{
- struct kvm_io_device *pio_dev;
- unsigned long val;
+ int r;
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+ bool writeback = true;
- vcpu->run->exit_reason = KVM_EXIT_IO;
- vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
- vcpu->run->io.size = vcpu->arch.pio.size = size;
- vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
- vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
- vcpu->run->io.port = vcpu->arch.pio.port = port;
- vcpu->arch.pio.in = in;
- vcpu->arch.pio.string = 0;
- vcpu->arch.pio.down = 0;
- vcpu->arch.pio.rep = 0;
+ if ((emulation_type & EMULTYPE_ALLOW_RETRY_PF) &&
+ (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
+ WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF))))
+ emulation_type &= ~EMULTYPE_ALLOW_RETRY_PF;
- if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
- KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
- handler);
- else
- KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
- handler);
+ r = kvm_check_emulate_insn(vcpu, emulation_type, insn, insn_len);
+ if (r != X86EMUL_CONTINUE) {
+ if (r == X86EMUL_RETRY_INSTR || r == X86EMUL_PROPAGATE_FAULT)
+ return 1;
+
+ if (kvm_unprotect_and_retry_on_failure(vcpu, cr2_or_gpa,
+ emulation_type))
+ return 1;
+
+ if (r == X86EMUL_UNHANDLEABLE_VECTORING) {
+ kvm_prepare_event_vectoring_exit(vcpu, cr2_or_gpa);
+ return 0;
+ }
+
+ WARN_ON_ONCE(r != X86EMUL_UNHANDLEABLE);
+ return handle_emulation_failure(vcpu, emulation_type);
+ }
+
+ kvm_request_l1tf_flush_l1d();
+
+ if (!(emulation_type & EMULTYPE_NO_DECODE)) {
+ kvm_clear_exception_queue(vcpu);
- val = kvm_register_read(vcpu, VCPU_REGS_RAX);
- memcpy(vcpu->arch.pio_data, &val, 4);
+ /*
+ * Return immediately if RIP hits a code breakpoint, as such #DBs
+ * are fault-like and are higher priority than any faults on
+ * the code fetch itself.
+ */
+ if (kvm_vcpu_check_code_breakpoint(vcpu, emulation_type, &r))
+ return r;
- pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in);
- if (pio_dev) {
- kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
- complete_pio(vcpu);
+ r = x86_decode_emulated_instruction(vcpu, emulation_type,
+ insn, insn_len);
+ if (r != EMULATION_OK) {
+ if ((emulation_type & EMULTYPE_TRAP_UD) ||
+ (emulation_type & EMULTYPE_TRAP_UD_FORCED)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+ if (kvm_unprotect_and_retry_on_failure(vcpu, cr2_or_gpa,
+ emulation_type))
+ return 1;
+
+ if (ctxt->have_exception &&
+ !(emulation_type & EMULTYPE_SKIP)) {
+ /*
+ * #UD should result in just EMULATION_FAILED, and trap-like
+ * exceptions should not be encountered during decode.
+ */
+ WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR ||
+ exception_type(ctxt->exception.vector) == EXCPT_TRAP);
+ inject_emulated_exception(vcpu);
+ return 1;
+ }
+ return handle_emulation_failure(vcpu, emulation_type);
+ }
+ }
+
+ if ((emulation_type & EMULTYPE_VMWARE_GP) &&
+ !is_vmware_backdoor_opcode(ctxt)) {
+ kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
return 1;
}
- return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_pio);
-int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
- int size, unsigned long count, int down,
- gva_t address, int rep, unsigned port)
-{
- unsigned now, in_page;
- int ret = 0;
- struct kvm_io_device *pio_dev;
+ /*
+ * EMULTYPE_SKIP without EMULTYPE_COMPLETE_USER_EXIT is intended for
+ * use *only* by vendor callbacks for kvm_skip_emulated_instruction().
+ * The caller is responsible for updating interruptibility state and
+ * injecting single-step #DBs.
+ */
+ if (emulation_type & EMULTYPE_SKIP) {
+ if (emulation_type & EMULTYPE_SKIP_SOFT_INT &&
+ !is_soft_int_instruction(ctxt, emulation_type))
+ return 0;
- vcpu->run->exit_reason = KVM_EXIT_IO;
- vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
- vcpu->run->io.size = vcpu->arch.pio.size = size;
- vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
- vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
- vcpu->run->io.port = vcpu->arch.pio.port = port;
- vcpu->arch.pio.in = in;
- vcpu->arch.pio.string = 1;
- vcpu->arch.pio.down = down;
- vcpu->arch.pio.rep = rep;
+ if (ctxt->mode != X86EMUL_MODE_PROT64)
+ ctxt->eip = (u32)ctxt->_eip;
+ else
+ ctxt->eip = ctxt->_eip;
- if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
- KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
- handler);
- else
- KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
- handler);
+ if (emulation_type & EMULTYPE_COMPLETE_USER_EXIT) {
+ r = 1;
+ goto writeback;
+ }
- if (!count) {
- kvm_x86_ops->skip_emulated_instruction(vcpu);
+ kvm_rip_write(vcpu, ctxt->eip);
+ if (ctxt->eflags & X86_EFLAGS_RF)
+ kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
return 1;
}
- if (!down)
- in_page = PAGE_SIZE - offset_in_page(address);
- else
- in_page = offset_in_page(address) + size;
- now = min(count, (unsigned long)in_page / size);
- if (!now)
- now = 1;
- if (down) {
- /*
- * String I/O in reverse. Yuck. Kill the guest, fix later.
- */
- pr_unimpl(vcpu, "guest string pio down\n");
- kvm_inject_gp(vcpu, 0);
+ /*
+ * If emulation was caused by a write-protection #PF on a non-page_table
+ * writing instruction, try to unprotect the gfn, i.e. zap shadow pages,
+ * and retry the instruction, as the vCPU is likely no longer using the
+ * gfn as a page table.
+ */
+ if ((emulation_type & EMULTYPE_ALLOW_RETRY_PF) &&
+ !x86_page_table_writing_insn(ctxt) &&
+ kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa))
return 1;
+
+ /* This is needed for the VMware backdoor interface to work since it
+ changes register values during the I/O operation. */
+ if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
+ vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
+ emulator_invalidate_register_cache(ctxt);
+ }
+
+restart:
+ if (emulation_type & EMULTYPE_PF) {
+ /* Save the faulting GPA (cr2) in the address field */
+ ctxt->exception.address = cr2_or_gpa;
+
+ /* With shadow page tables, cr2 contains a GVA or nGPA. */
+ if (vcpu->arch.mmu->root_role.direct) {
+ ctxt->gpa_available = true;
+ ctxt->gpa_val = cr2_or_gpa;
+ }
+ } else {
+ /* Sanitize the address out of an abundance of paranoia. */
+ ctxt->exception.address = 0;
}
- vcpu->run->io.count = now;
- vcpu->arch.pio.cur_count = now;
- if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
- kvm_x86_ops->skip_emulated_instruction(vcpu);
+ /*
+ * Check L1's instruction intercepts when emulating instructions for
+ * L2, unless KVM is re-emulating a previously decoded instruction,
+ * e.g. to complete userspace I/O, in which case KVM has already
+ * checked the intercepts.
+ */
+ r = x86_emulate_insn(ctxt, is_guest_mode(vcpu) &&
+ !(emulation_type & EMULTYPE_NO_DECODE));
- vcpu->arch.pio.guest_gva = address;
+ if (r == EMULATION_INTERCEPTED)
+ return 1;
- pio_dev = vcpu_find_pio_dev(vcpu, port,
- vcpu->arch.pio.cur_count,
- !vcpu->arch.pio.in);
- if (!vcpu->arch.pio.in) {
- /* string PIO write */
- ret = pio_copy_data(vcpu);
- if (ret == X86EMUL_PROPAGATE_FAULT) {
- kvm_inject_gp(vcpu, 0);
+ if (r == EMULATION_FAILED) {
+ if (kvm_unprotect_and_retry_on_failure(vcpu, cr2_or_gpa,
+ emulation_type))
return 1;
+
+ return handle_emulation_failure(vcpu, emulation_type);
+ }
+
+ if (ctxt->have_exception) {
+ WARN_ON_ONCE(vcpu->mmio_needed && !vcpu->mmio_is_write);
+ vcpu->mmio_needed = false;
+ r = 1;
+ inject_emulated_exception(vcpu);
+ } else if (vcpu->arch.pio.count) {
+ if (!vcpu->arch.pio.in) {
+ /* FIXME: return into emulator if single-stepping. */
+ vcpu->arch.pio.count = 0;
+ } else {
+ writeback = false;
+ vcpu->arch.complete_userspace_io = complete_emulated_pio;
}
- if (ret == 0 && pio_dev) {
- pio_string_write(pio_dev, vcpu);
- complete_pio(vcpu);
- if (vcpu->arch.pio.count == 0)
- ret = 1;
+ r = 0;
+ } else if (vcpu->mmio_needed) {
+ ++vcpu->stat.mmio_exits;
+
+ if (!vcpu->mmio_is_write)
+ writeback = false;
+ r = 0;
+ vcpu->arch.complete_userspace_io = complete_emulated_mmio;
+ } else if (vcpu->arch.complete_userspace_io) {
+ writeback = false;
+ r = 0;
+ } else if (r == EMULATION_RESTART)
+ goto restart;
+ else
+ r = 1;
+
+writeback:
+ if (writeback) {
+ unsigned long rflags = kvm_x86_call(get_rflags)(vcpu);
+ toggle_interruptibility(vcpu, ctxt->interruptibility);
+ vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
+
+ /*
+ * Note, EXCPT_DB is assumed to be fault-like as the emulator
+ * only supports code breakpoints and general detect #DB, both
+ * of which are fault-like.
+ */
+ if (!ctxt->have_exception ||
+ exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
+ kvm_pmu_instruction_retired(vcpu);
+ if (ctxt->is_branch)
+ kvm_pmu_branch_retired(vcpu);
+ kvm_rip_write(vcpu, ctxt->eip);
+ if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
+ r = kvm_vcpu_do_singlestep(vcpu);
+ kvm_x86_call(update_emulated_instruction)(vcpu);
+ __kvm_set_rflags(vcpu, ctxt->eflags);
}
- } else if (pio_dev)
- pr_unimpl(vcpu, "no string pio read support yet, "
- "port %x size %d count %ld\n",
- port, size, count);
- return ret;
+ /*
+ * For STI, interrupts are shadowed, so KVM_REQ_EVENT will
+ * do nothing, and it will be requested again as soon as
+ * the shadow expires. But we still need to check here,
+ * because POPF has no interrupt shadow.
+ */
+ if (unlikely((ctxt->eflags & ~rflags) & X86_EFLAGS_IF))
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ } else
+ vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
+
+ return r;
}
-EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
-static void bounce_off(void *info)
+int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type)
{
- /* nothing */
+ return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0);
}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_instruction);
-static unsigned int ref_freq;
-static unsigned long tsc_khz_ref;
+int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
+ void *insn, int insn_len)
+{
+ return x86_emulate_instruction(vcpu, 0, 0, insn, insn_len);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_instruction_from_buffer);
-static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
- void *data)
+static int complete_fast_pio_out_port_0x7e(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.pio.count = 0;
+ return 1;
+}
+
+static int complete_fast_pio_out(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.pio.count = 0;
+
+ if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip)))
+ return 1;
+
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
+ unsigned short port)
+{
+ unsigned long val = kvm_rax_read(vcpu);
+ int ret = emulator_pio_out(vcpu, size, port, &val, 1);
+
+ if (ret)
+ return ret;
+
+ /*
+ * Work around userspace that relies on the old KVM behavior of %rip being
+ * incremented prior to exiting to userspace to handle "OUT 0x7e".
+ */
+ if (port == 0x7e &&
+ kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_OUT_7E_INC_RIP)) {
+ vcpu->arch.complete_userspace_io =
+ complete_fast_pio_out_port_0x7e;
+ kvm_skip_emulated_instruction(vcpu);
+ } else {
+ vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu);
+ vcpu->arch.complete_userspace_io = complete_fast_pio_out;
+ }
+ return 0;
+}
+
+static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
+{
+ unsigned long val;
+
+ /* We should only ever be called with arch.pio.count equal to 1 */
+ BUG_ON(vcpu->arch.pio.count != 1);
+
+ if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip))) {
+ vcpu->arch.pio.count = 0;
+ return 1;
+ }
+
+ /* For size less than 4 we merge, else we zero extend */
+ val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0;
+
+ complete_emulator_pio_in(vcpu, &val);
+ kvm_rax_write(vcpu, val);
+
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
+ unsigned short port)
+{
+ unsigned long val;
+ int ret;
+
+ /* For size less than 4 we merge, else we zero extend */
+ val = (size < 4) ? kvm_rax_read(vcpu) : 0;
+
+ ret = emulator_pio_in(vcpu, size, port, &val, 1);
+ if (ret) {
+ kvm_rax_write(vcpu, val);
+ return ret;
+ }
+
+ vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu);
+ vcpu->arch.complete_userspace_io = complete_fast_pio_in;
+
+ return 0;
+}
+
+int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in)
+{
+ int ret;
+
+ if (in)
+ ret = kvm_fast_pio_in(vcpu, size, port);
+ else
+ ret = kvm_fast_pio_out(vcpu, size, port);
+ return ret && kvm_skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_fast_pio);
+
+static int kvmclock_cpu_down_prep(unsigned int cpu)
+{
+ __this_cpu_write(cpu_tsc_khz, 0);
+ return 0;
+}
+
+static void tsc_khz_changed(void *data)
{
struct cpufreq_freqs *freq = data;
+ unsigned long khz;
+
+ WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_CONSTANT_TSC));
+
+ if (data)
+ khz = freq->new;
+ else
+ khz = cpufreq_quick_get(raw_smp_processor_id());
+ if (!khz)
+ khz = tsc_khz;
+ __this_cpu_write(cpu_tsc_khz, khz);
+}
+
+#ifdef CONFIG_X86_64
+static void kvm_hyperv_tsc_notifier(void)
+{
+ struct kvm *kvm;
+ int cpu;
+
+ mutex_lock(&kvm_lock);
+ list_for_each_entry(kvm, &vm_list, vm_list)
+ kvm_make_mclock_inprogress_request(kvm);
+
+ /* no guest entries from this point */
+ hyperv_stop_tsc_emulation();
+
+ /* TSC frequency always matches when on Hyper-V */
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+ for_each_present_cpu(cpu)
+ per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
+ }
+ kvm_caps.max_guest_tsc_khz = tsc_khz;
+
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ __kvm_start_pvclock_update(kvm);
+ pvclock_update_vm_gtod_copy(kvm);
+ kvm_end_pvclock_update(kvm);
+ }
+
+ mutex_unlock(&kvm_lock);
+}
+#endif
+
+static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu)
+{
struct kvm *kvm;
struct kvm_vcpu *vcpu;
- int i, send_ipi = 0;
+ int send_ipi = 0;
+ unsigned long i;
- if (!ref_freq)
- ref_freq = freq->old;
+ /*
+ * We allow guests to temporarily run on slowing clocks,
+ * provided we notify them after, or to run on accelerating
+ * clocks, provided we notify them before. Thus time never
+ * goes backwards.
+ *
+ * However, we have a problem. We can't atomically update
+ * the frequency of a given CPU from this function; it is
+ * merely a notifier, which can be called from any CPU.
+ * Changing the TSC frequency at arbitrary points in time
+ * requires a recomputation of local variables related to
+ * the TSC for each VCPU. We must flag these local variables
+ * to be updated and be sure the update takes place with the
+ * new frequency before any guests proceed.
+ *
+ * Unfortunately, the combination of hotplug CPU and frequency
+ * change creates an intractable locking scenario; the order
+ * of when these callouts happen is undefined with respect to
+ * CPU hotplug, and they can race with each other. As such,
+ * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is
+ * undefined; you can actually have a CPU frequency change take
+ * place in between the computation of X and the setting of the
+ * variable. To protect against this problem, all updates of
+ * the per_cpu tsc_khz variable are done in an interrupt
+ * protected IPI, and all callers wishing to update the value
+ * must wait for a synchronous IPI to complete (which is trivial
+ * if the caller is on the CPU already). This establishes the
+ * necessary total order on variable updates.
+ *
+ * Note that because a guest time update may take place
+ * anytime after the setting of the VCPU's request bit, the
+ * correct TSC value must be set before the request. However,
+ * to ensure the update actually makes it to any guest which
+ * starts running in hardware virtualization between the set
+ * and the acquisition of the spinlock, we must also ping the
+ * CPU after setting the request bit.
+ *
+ */
- if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
- return 0;
- if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
- return 0;
- per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
+ smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
- spin_lock(&kvm_lock);
+ mutex_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
- for (i = 0; i < KVM_MAX_VCPUS; ++i) {
- vcpu = kvm->vcpus[i];
- if (!vcpu)
- continue;
- if (vcpu->cpu != freq->cpu)
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (vcpu->cpu != cpu)
continue;
- if (!kvm_request_guest_time_update(vcpu))
- continue;
- if (vcpu->cpu != smp_processor_id())
- send_ipi++;
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ if (vcpu->cpu != raw_smp_processor_id())
+ send_ipi = 1;
}
}
- spin_unlock(&kvm_lock);
+ mutex_unlock(&kvm_lock);
if (freq->old < freq->new && send_ipi) {
/*
@@ -2788,109 +9831,527 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
* guest context is entered kvmclock will be updated,
* so the guest will not see stale values.
*/
- smp_call_function_single(freq->cpu, bounce_off, NULL, 1);
+ smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
}
+}
+
+static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct cpufreq_freqs *freq = data;
+ int cpu;
+
+ if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
+ return 0;
+ if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
+ return 0;
+
+ for_each_cpu(cpu, freq->policy->cpus)
+ __kvmclock_cpufreq_notifier(freq, cpu);
+
return 0;
}
static struct notifier_block kvmclock_cpufreq_notifier_block = {
- .notifier_call = kvmclock_cpufreq_notifier
+ .notifier_call = kvmclock_cpufreq_notifier
};
-int kvm_arch_init(void *opaque)
+static int kvmclock_cpu_online(unsigned int cpu)
+{
+ tsc_khz_changed(NULL);
+ return 0;
+}
+
+static void kvm_timer_init(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+ max_tsc_khz = tsc_khz;
+
+ if (IS_ENABLED(CONFIG_CPU_FREQ)) {
+ struct cpufreq_policy *policy;
+ int cpu;
+
+ cpu = get_cpu();
+ policy = cpufreq_cpu_get(cpu);
+ if (policy) {
+ if (policy->cpuinfo.max_freq)
+ max_tsc_khz = policy->cpuinfo.max_freq;
+ cpufreq_cpu_put(policy);
+ }
+ put_cpu();
+ }
+ cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
+ CPUFREQ_TRANSITION_NOTIFIER);
+
+ cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online",
+ kvmclock_cpu_online, kvmclock_cpu_down_prep);
+ }
+}
+
+#ifdef CONFIG_X86_64
+static void pvclock_gtod_update_fn(struct work_struct *work)
+{
+ struct kvm *kvm;
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+
+ mutex_lock(&kvm_lock);
+ list_for_each_entry(kvm, &vm_list, vm_list)
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+ atomic_set(&kvm_guest_has_master_clock, 0);
+ mutex_unlock(&kvm_lock);
+}
+
+static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
+
+/*
+ * Indirection to move queue_work() out of the tk_core.seq write held
+ * region to prevent possible deadlocks against time accessors which
+ * are invoked with work related locks held.
+ */
+static void pvclock_irq_work_fn(struct irq_work *w)
+{
+ queue_work(system_long_wq, &pvclock_gtod_work);
+}
+
+static DEFINE_IRQ_WORK(pvclock_irq_work, pvclock_irq_work_fn);
+
+/*
+ * Notification about pvclock gtod data update.
+ */
+static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
+ void *priv)
{
+ struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+ struct timekeeper *tk = priv;
+
+ update_pvclock_gtod(tk);
+
+ /*
+ * Disable master clock if host does not trust, or does not use,
+ * TSC based clocksource. Delegate queue_work() to irq_work as
+ * this is invoked with tk_core.seq write held.
+ */
+ if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
+ atomic_read(&kvm_guest_has_master_clock) != 0)
+ irq_work_queue(&pvclock_irq_work);
+ return 0;
+}
+
+static struct notifier_block pvclock_gtod_notifier = {
+ .notifier_call = pvclock_gtod_notify,
+};
+#endif
+
+static inline void kvm_ops_update(struct kvm_x86_init_ops *ops)
+{
+ memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
+
+#define __KVM_X86_OP(func) \
+ static_call_update(kvm_x86_##func, kvm_x86_ops.func);
+#define KVM_X86_OP(func) \
+ WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func)
+#define KVM_X86_OP_OPTIONAL __KVM_X86_OP
+#define KVM_X86_OP_OPTIONAL_RET0(func) \
+ static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \
+ (void *)__static_call_return0);
+#include <asm/kvm-x86-ops.h>
+#undef __KVM_X86_OP
+
+ kvm_pmu_ops_update(ops->pmu_ops);
+}
+
+static int kvm_x86_check_processor_compatibility(void)
+{
+ int cpu = smp_processor_id();
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ /*
+ * Compatibility checks are done when loading KVM and when enabling
+ * hardware, e.g. during CPU hotplug, to ensure all online CPUs are
+ * compatible, i.e. KVM should never perform a compatibility check on
+ * an offline CPU.
+ */
+ WARN_ON(!cpu_online(cpu));
+
+ if (__cr4_reserved_bits(cpu_has, c) !=
+ __cr4_reserved_bits(cpu_has, &boot_cpu_data))
+ return -EIO;
+
+ return kvm_x86_call(check_processor_compatibility)();
+}
+
+static void kvm_x86_check_cpu_compat(void *ret)
+{
+ *(int *)ret = kvm_x86_check_processor_compatibility();
+}
+
+int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
+{
+ u64 host_pat;
int r, cpu;
- struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
- if (kvm_x86_ops) {
- printk(KERN_ERR "kvm: already loaded the other module\n");
- r = -EEXIST;
- goto out;
+ guard(mutex)(&vendor_module_lock);
+
+ if (kvm_x86_ops.enable_virtualization_cpu) {
+ pr_err("already loaded vendor module '%s'\n", kvm_x86_ops.name);
+ return -EEXIST;
}
- if (!ops->cpu_has_kvm_support()) {
- printk(KERN_ERR "kvm: no hardware support\n");
- r = -EOPNOTSUPP;
- goto out;
+ /*
+ * KVM explicitly assumes that the guest has an FPU and
+ * FXSAVE/FXRSTOR. For example, the KVM_GET_FPU explicitly casts the
+ * vCPU's FPU state as a fxregs_state struct.
+ */
+ if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) {
+ pr_err("inadequate fpu\n");
+ return -EOPNOTSUPP;
}
- if (ops->disabled_by_bios()) {
- printk(KERN_ERR "kvm: disabled by bios\n");
- r = -EOPNOTSUPP;
- goto out;
+
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && !boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+ pr_err("RT requires X86_FEATURE_CONSTANT_TSC\n");
+ return -EOPNOTSUPP;
+ }
+
+ /*
+ * KVM assumes that PAT entry '0' encodes WB memtype and simply zeroes
+ * the PAT bits in SPTEs. Bail if PAT[0] is programmed to something
+ * other than WB. Note, EPT doesn't utilize the PAT, but don't bother
+ * with an exception. PAT[0] is set to WB on RESET and also by the
+ * kernel, i.e. failure indicates a kernel bug or broken firmware.
+ */
+ if (rdmsrq_safe(MSR_IA32_CR_PAT, &host_pat) ||
+ (host_pat & GENMASK(2, 0)) != 6) {
+ pr_err("host PAT[0] is not WB\n");
+ return -EIO;
+ }
+
+ if (boot_cpu_has(X86_FEATURE_SHSTK) || boot_cpu_has(X86_FEATURE_IBT)) {
+ rdmsrq(MSR_IA32_S_CET, kvm_host.s_cet);
+ /*
+ * Linux doesn't yet support supervisor shadow stacks (SSS), so
+ * KVM doesn't save/restore the associated MSRs, i.e. KVM may
+ * clobber the host values. Yell and refuse to load if SSS is
+ * unexpectedly enabled, e.g. to avoid crashing the host.
+ */
+ if (WARN_ON_ONCE(kvm_host.s_cet & CET_SHSTK_EN))
+ return -EIO;
}
- r = kvm_mmu_module_init();
+ memset(&kvm_caps, 0, sizeof(kvm_caps));
+
+ x86_emulator_cache = kvm_alloc_emulator_cache();
+ if (!x86_emulator_cache) {
+ pr_err("failed to allocate cache for x86 emulator\n");
+ return -ENOMEM;
+ }
+
+ r = kvm_mmu_vendor_module_init();
if (r)
- goto out;
+ goto out_free_x86_emulator_cache;
- kvm_init_msr_list();
+ kvm_caps.supported_vm_types = BIT(KVM_X86_DEFAULT_VM);
+ kvm_caps.supported_mce_cap = MCG_CTL_P | MCG_SER_P;
- kvm_x86_ops = ops;
- kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
- kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
- kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
- PT_DIRTY_MASK, PT64_NX_MASK, 0);
+ if (boot_cpu_has(X86_FEATURE_XSAVE)) {
+ kvm_host.xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+ kvm_caps.supported_xcr0 = kvm_host.xcr0 & KVM_SUPPORTED_XCR0;
+ }
- for_each_possible_cpu(cpu)
- per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
- if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
- tsc_khz_ref = tsc_khz;
- cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
- CPUFREQ_TRANSITION_NOTIFIER);
+ if (boot_cpu_has(X86_FEATURE_XSAVES)) {
+ rdmsrq(MSR_IA32_XSS, kvm_host.xss);
+ kvm_caps.supported_xss = kvm_host.xss & KVM_SUPPORTED_XSS;
+ }
+
+ kvm_caps.supported_quirks = KVM_X86_VALID_QUIRKS;
+ kvm_caps.inapplicable_quirks = KVM_X86_CONDITIONAL_QUIRKS;
+
+ rdmsrq_safe(MSR_EFER, &kvm_host.efer);
+
+ kvm_init_pmu_capability(ops->pmu_ops);
+
+ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
+ rdmsrq(MSR_IA32_ARCH_CAPABILITIES, kvm_host.arch_capabilities);
+
+ WARN_ON_ONCE(kvm_nr_uret_msrs);
+
+ r = ops->hardware_setup();
+ if (r != 0)
+ goto out_mmu_exit;
+
+ enable_device_posted_irqs &= enable_apicv &&
+ irq_remapping_cap(IRQ_POSTING_CAP);
+
+ kvm_ops_update(ops);
+
+ for_each_online_cpu(cpu) {
+ smp_call_function_single(cpu, kvm_x86_check_cpu_compat, &r, 1);
+ if (r < 0)
+ goto out_unwind_ops;
+ }
+
+ /*
+ * Point of no return! DO NOT add error paths below this point unless
+ * absolutely necessary, as most operations from this point forward
+ * require unwinding.
+ */
+ kvm_timer_init();
+
+ if (pi_inject_timer == -1)
+ pi_inject_timer = housekeeping_enabled(HK_TYPE_TIMER);
+#ifdef CONFIG_X86_64
+ pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
+
+ if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
+ set_hv_tscchange_cb(kvm_hyperv_tsc_notifier);
+#endif
+
+ kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
+
+ if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled)
+ kvm_caps.supported_vm_types |= BIT(KVM_X86_SW_PROTECTED_VM);
+
+ /* KVM always ignores guest PAT for shadow paging. */
+ if (!tdp_enabled)
+ kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT;
+
+ if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
+ kvm_caps.supported_xss = 0;
+
+ if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) &&
+ !kvm_cpu_cap_has(X86_FEATURE_IBT))
+ kvm_caps.supported_xss &= ~XFEATURE_MASK_CET_ALL;
+
+ if ((kvm_caps.supported_xss & XFEATURE_MASK_CET_ALL) != XFEATURE_MASK_CET_ALL) {
+ kvm_cpu_cap_clear(X86_FEATURE_SHSTK);
+ kvm_cpu_cap_clear(X86_FEATURE_IBT);
+ kvm_caps.supported_xss &= ~XFEATURE_MASK_CET_ALL;
}
+ if (kvm_caps.has_tsc_control) {
+ /*
+ * Make sure the user can only configure tsc_khz values that
+ * fit into a signed integer.
+ * A min value is not calculated because it will always
+ * be 1 on all machines.
+ */
+ u64 max = min(0x7fffffffULL,
+ __scale_tsc(kvm_caps.max_tsc_scaling_ratio, tsc_khz));
+ kvm_caps.max_guest_tsc_khz = max;
+ }
+ kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits;
+ kvm_init_msr_lists();
return 0;
-out:
+out_unwind_ops:
+ kvm_x86_ops.enable_virtualization_cpu = NULL;
+ kvm_x86_call(hardware_unsetup)();
+out_mmu_exit:
+ kvm_destroy_user_return_msrs();
+ kvm_mmu_vendor_module_exit();
+out_free_x86_emulator_cache:
+ kmem_cache_destroy(x86_emulator_cache);
return r;
}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_x86_vendor_init);
-void kvm_arch_exit(void)
+void kvm_x86_vendor_exit(void)
{
- if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+ kvm_unregister_perf_callbacks();
+
+#ifdef CONFIG_X86_64
+ if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
+ clear_hv_tscchange_cb();
+#endif
+ kvm_lapic_exit();
+
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
- kvm_x86_ops = NULL;
- kvm_mmu_module_exit();
+ cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
+ }
+#ifdef CONFIG_X86_64
+ pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
+ irq_work_sync(&pvclock_irq_work);
+ cancel_work_sync(&pvclock_gtod_work);
+#endif
+ kvm_x86_call(hardware_unsetup)();
+ kvm_destroy_user_return_msrs();
+ kvm_mmu_vendor_module_exit();
+ kmem_cache_destroy(x86_emulator_cache);
+#ifdef CONFIG_KVM_XEN
+ static_key_deferred_flush(&kvm_xen_enabled);
+ WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key));
+#endif
+ mutex_lock(&vendor_module_lock);
+ kvm_x86_ops.enable_virtualization_cpu = NULL;
+ mutex_unlock(&vendor_module_lock);
}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_x86_vendor_exit);
-int kvm_emulate_halt(struct kvm_vcpu *vcpu)
+#ifdef CONFIG_X86_64
+static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
+ unsigned long clock_type)
{
- ++vcpu->stat.halt_exits;
- KVMTRACE_0D(HLT, vcpu, handler);
- if (irqchip_in_kernel(vcpu->kvm)) {
- vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
- return 1;
- } else {
- vcpu->run->exit_reason = KVM_EXIT_HLT;
- return 0;
- }
+ struct kvm_clock_pairing clock_pairing;
+ struct timespec64 ts;
+ u64 cycle;
+ int ret;
+
+ if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK)
+ return -KVM_EOPNOTSUPP;
+
+ /*
+ * When the TSC is in permanent catchup mode, guests won't be able to use
+ * the pvclock_read_retry loop to get a consistent view of pvclock.
+ */
+ if (vcpu->arch.tsc_always_catchup)
+ return -KVM_EOPNOTSUPP;
+
+ if (!kvm_get_walltime_and_clockread(&ts, &cycle))
+ return -KVM_EOPNOTSUPP;
+
+ clock_pairing.sec = ts.tv_sec;
+ clock_pairing.nsec = ts.tv_nsec;
+ clock_pairing.tsc = kvm_read_l1_tsc(vcpu, cycle);
+ clock_pairing.flags = 0;
+ memset(&clock_pairing.pad, 0, sizeof(clock_pairing.pad));
+
+ ret = 0;
+ if (kvm_write_guest(vcpu->kvm, paddr, &clock_pairing,
+ sizeof(struct kvm_clock_pairing)))
+ ret = -KVM_EFAULT;
+
+ return ret;
+}
+#endif
+
+/*
+ * kvm_pv_kick_cpu_op: Kick a vcpu.
+ *
+ * @apicid - apicid of vcpu to be kicked.
+ */
+static void kvm_pv_kick_cpu_op(struct kvm *kvm, int apicid)
+{
+ /*
+ * All other fields are unused for APIC_DM_REMRD, but may be consumed by
+ * common code, e.g. for tracing. Defer initialization to the compiler.
+ */
+ struct kvm_lapic_irq lapic_irq = {
+ .delivery_mode = APIC_DM_REMRD,
+ .dest_mode = APIC_DEST_PHYSICAL,
+ .shorthand = APIC_DEST_NOSHORT,
+ .dest_id = apicid,
+ };
+
+ kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
}
-EXPORT_SYMBOL_GPL(kvm_emulate_halt);
-static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
- unsigned long a1)
+bool kvm_apicv_activated(struct kvm *kvm)
{
- if (is_long_mode(vcpu))
- return a0;
+ return (READ_ONCE(kvm->arch.apicv_inhibit_reasons) == 0);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apicv_activated);
+
+bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu)
+{
+ ulong vm_reasons = READ_ONCE(vcpu->kvm->arch.apicv_inhibit_reasons);
+ ulong vcpu_reasons =
+ kvm_x86_call(vcpu_get_apicv_inhibit_reasons)(vcpu);
+
+ return (vm_reasons | vcpu_reasons) == 0;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_apicv_activated);
+
+static void set_or_clear_apicv_inhibit(unsigned long *inhibits,
+ enum kvm_apicv_inhibit reason, bool set)
+{
+ const struct trace_print_flags apicv_inhibits[] = { APICV_INHIBIT_REASONS };
+
+ BUILD_BUG_ON(ARRAY_SIZE(apicv_inhibits) != NR_APICV_INHIBIT_REASONS);
+
+ if (set)
+ __set_bit(reason, inhibits);
else
- return a0 | ((gpa_t)a1 << 32);
+ __clear_bit(reason, inhibits);
+
+ trace_kvm_apicv_inhibit_changed(reason, set, *inhibits);
}
-int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
+static void kvm_apicv_init(struct kvm *kvm)
+{
+ enum kvm_apicv_inhibit reason = enable_apicv ? APICV_INHIBIT_REASON_ABSENT :
+ APICV_INHIBIT_REASON_DISABLED;
+
+ set_or_clear_apicv_inhibit(&kvm->arch.apicv_inhibit_reasons, reason, true);
+
+ init_rwsem(&kvm->arch.apicv_update_lock);
+}
+
+static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id)
+{
+ struct kvm_vcpu *target = NULL;
+ struct kvm_apic_map *map;
+
+ vcpu->stat.directed_yield_attempted++;
+
+ if (single_task_running())
+ goto no_yield;
+
+ rcu_read_lock();
+ map = rcu_dereference(vcpu->kvm->arch.apic_map);
+
+ if (likely(map) && dest_id <= map->max_apic_id) {
+ dest_id = array_index_nospec(dest_id, map->max_apic_id + 1);
+ if (map->phys_map[dest_id])
+ target = map->phys_map[dest_id]->vcpu;
+ }
+
+ rcu_read_unlock();
+
+ if (!target || !READ_ONCE(target->ready))
+ goto no_yield;
+
+ /* Ignore requests to yield to self */
+ if (vcpu == target)
+ goto no_yield;
+
+ if (kvm_vcpu_yield_to(target) <= 0)
+ goto no_yield;
+
+ vcpu->stat.directed_yield_successful++;
+
+no_yield:
+ return;
+}
+
+static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
{
- unsigned long nr, a0, a1, a2, a3, ret;
- int r = 1;
+ u64 ret = vcpu->run->hypercall.ret;
- nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
- a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
- a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
- a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
- a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
+ if (!is_64_bit_hypercall(vcpu))
+ ret = (u32)ret;
+ kvm_rax_write(vcpu, ret);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, int cpl,
+ int (*complete_hypercall)(struct kvm_vcpu *))
+{
+ unsigned long ret;
+ unsigned long nr = kvm_rax_read(vcpu);
+ unsigned long a0 = kvm_rbx_read(vcpu);
+ unsigned long a1 = kvm_rcx_read(vcpu);
+ unsigned long a2 = kvm_rdx_read(vcpu);
+ unsigned long a3 = kvm_rsi_read(vcpu);
+ int op_64_bit = is_64_bit_hypercall(vcpu);
- KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler);
+ ++vcpu->stat.hypercalls;
+
+ trace_kvm_hypercall(nr, a0, a1, a2, a3);
- if (!is_long_mode(vcpu)) {
+ if (!op_64_bit) {
nr &= 0xFFFFFFFF;
a0 &= 0xFFFFFFFF;
a1 &= 0xFFFFFFFF;
@@ -2898,696 +10359,1734 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
a3 &= 0xFFFFFFFF;
}
+ if (cpl) {
+ ret = -KVM_EPERM;
+ goto out;
+ }
+
+ ret = -KVM_ENOSYS;
+
switch (nr) {
case KVM_HC_VAPIC_POLL_IRQ:
ret = 0;
break;
- case KVM_HC_MMU_OP:
- r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret);
+ case KVM_HC_KICK_CPU:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT))
+ break;
+
+ kvm_pv_kick_cpu_op(vcpu->kvm, a1);
+ kvm_sched_yield(vcpu, a1);
+ ret = 0;
+ break;
+#ifdef CONFIG_X86_64
+ case KVM_HC_CLOCK_PAIRING:
+ ret = kvm_pv_clock_pairing(vcpu, a0, a1);
+ break;
+#endif
+ case KVM_HC_SEND_IPI:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SEND_IPI))
+ break;
+
+ ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
break;
+ case KVM_HC_SCHED_YIELD:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD))
+ break;
+
+ kvm_sched_yield(vcpu, a0);
+ ret = 0;
+ break;
+ case KVM_HC_MAP_GPA_RANGE: {
+ u64 gpa = a0, npages = a1, attrs = a2;
+
+ ret = -KVM_ENOSYS;
+ if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE))
+ break;
+
+ if (!PAGE_ALIGNED(gpa) || !npages ||
+ gpa_to_gfn(gpa) + npages <= gpa_to_gfn(gpa)) {
+ ret = -KVM_EINVAL;
+ break;
+ }
+
+ vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
+ vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
+ /*
+ * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2)
+ * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that
+ * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting
+ * vcpu->run->hypercall.ret, ensure that it is zero so as not to break QEMU.
+ */
+ vcpu->run->hypercall.ret = 0;
+ vcpu->run->hypercall.args[0] = gpa;
+ vcpu->run->hypercall.args[1] = npages;
+ vcpu->run->hypercall.args[2] = attrs;
+ vcpu->run->hypercall.flags = 0;
+ if (op_64_bit)
+ vcpu->run->hypercall.flags |= KVM_EXIT_HYPERCALL_LONG_MODE;
+
+ WARN_ON_ONCE(vcpu->run->hypercall.flags & KVM_EXIT_HYPERCALL_MBZ);
+ vcpu->arch.complete_userspace_io = complete_hypercall;
+ return 0;
+ }
default:
ret = -KVM_ENOSYS;
break;
}
- kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
- ++vcpu->stat.hypercalls;
- return r;
+
+out:
+ vcpu->run->hypercall.ret = ret;
+ return 1;
}
-EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(____kvm_emulate_hypercall);
-int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
+int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
{
+ if (kvm_xen_hypercall_enabled(vcpu->kvm))
+ return kvm_xen_hypercall(vcpu);
+
+ if (kvm_hv_hypercall_enabled(vcpu))
+ return kvm_hv_hypercall(vcpu);
+
+ return __kvm_emulate_hypercall(vcpu, kvm_x86_call(get_cpl)(vcpu),
+ complete_hypercall_exit);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_hypercall);
+
+static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
char instruction[3];
- int ret = 0;
unsigned long rip = kvm_rip_read(vcpu);
-
/*
- * Blow out the MMU to ensure that no other VCPU has an active mapping
- * to ensure that the updated hypercall appears atomically across all
- * VCPUs.
+ * If the quirk is disabled, synthesize a #UD and let the guest pick up
+ * the pieces.
*/
- kvm_mmu_zap_all(vcpu->kvm);
+ if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_FIX_HYPERCALL_INSN)) {
+ ctxt->exception.error_code_valid = false;
+ ctxt->exception.vector = UD_VECTOR;
+ ctxt->have_exception = true;
+ return X86EMUL_PROPAGATE_FAULT;
+ }
- kvm_x86_ops->patch_hypercall(vcpu, instruction);
- if (emulator_write_emulated(rip, instruction, 3, vcpu)
- != X86EMUL_CONTINUE)
- ret = -EFAULT;
+ kvm_x86_call(patch_hypercall)(vcpu, instruction);
- return ret;
+ return emulator_write_emulated(ctxt, rip, instruction, 3,
+ &ctxt->exception);
}
-static u64 mk_cr_64(u64 curr_cr, u32 new_val)
+static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
{
- return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
+ return vcpu->run->request_interrupt_window &&
+ likely(!pic_in_kernel(vcpu->kvm));
}
-void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
+/* Called within kvm->srcu read side. */
+static void post_kvm_run_save(struct kvm_vcpu *vcpu)
{
- struct descriptor_table dt = { limit, base };
+ struct kvm_run *kvm_run = vcpu->run;
- kvm_x86_ops->set_gdt(vcpu, &dt);
-}
+ kvm_run->if_flag = kvm_x86_call(get_if_flag)(vcpu);
+ kvm_run->cr8 = kvm_get_cr8(vcpu);
+ kvm_run->apic_base = vcpu->arch.apic_base;
-void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
-{
- struct descriptor_table dt = { limit, base };
+ kvm_run->ready_for_interrupt_injection =
+ pic_in_kernel(vcpu->kvm) ||
+ kvm_vcpu_ready_for_interrupt_injection(vcpu);
- kvm_x86_ops->set_idt(vcpu, &dt);
+ if (is_smm(vcpu))
+ kvm_run->flags |= KVM_RUN_X86_SMM;
+ if (is_guest_mode(vcpu))
+ kvm_run->flags |= KVM_RUN_X86_GUEST_MODE;
}
-void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
- unsigned long *rflags)
+static void update_cr8_intercept(struct kvm_vcpu *vcpu)
{
- kvm_lmsw(vcpu, msw);
- *rflags = kvm_x86_ops->get_rflags(vcpu);
+ int max_irr, tpr;
+
+ if (!kvm_x86_ops.update_cr8_intercept)
+ return;
+
+ if (!lapic_in_kernel(vcpu))
+ return;
+
+ if (vcpu->arch.apic->apicv_active)
+ return;
+
+ if (!vcpu->arch.apic->vapic_addr)
+ max_irr = kvm_lapic_find_highest_irr(vcpu);
+ else
+ max_irr = -1;
+
+ if (max_irr != -1)
+ max_irr >>= 4;
+
+ tpr = kvm_lapic_get_cr8(vcpu);
+
+ kvm_x86_call(update_cr8_intercept)(vcpu, tpr, max_irr);
}
-unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
-{
- unsigned long value;
- kvm_x86_ops->decache_cr4_guest_bits(vcpu);
- switch (cr) {
- case 0:
- value = vcpu->arch.cr0;
- break;
- case 2:
- value = vcpu->arch.cr2;
- break;
- case 3:
- value = vcpu->arch.cr3;
- break;
- case 4:
- value = vcpu->arch.cr4;
- break;
- case 8:
- value = kvm_get_cr8(vcpu);
- break;
- default:
- vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
- return 0;
+int kvm_check_nested_events(struct kvm_vcpu *vcpu)
+{
+ if (kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
+ kvm_x86_ops.nested_ops->triple_fault(vcpu);
+ return 1;
}
- KVMTRACE_3D(CR_READ, vcpu, (u32)cr, (u32)value,
- (u32)((u64)value >> 32), handler);
- return value;
+ return kvm_x86_ops.nested_ops->check_events(vcpu);
}
-void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
- unsigned long *rflags)
+static void kvm_inject_exception(struct kvm_vcpu *vcpu)
{
- KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)val,
- (u32)((u64)val >> 32), handler);
+ /*
+ * Suppress the error code if the vCPU is in Real Mode, as Real Mode
+ * exceptions don't report error codes. The presence of an error code
+ * is carried with the exception and only stripped when the exception
+ * is injected, as intercepted #PF VM-Exits for AMD's Paged Real Mode do
+ * report an error code despite the CPU being in Real Mode.
+ */
+ vcpu->arch.exception.has_error_code &= is_protmode(vcpu);
- switch (cr) {
- case 0:
- kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
- *rflags = kvm_x86_ops->get_rflags(vcpu);
- break;
- case 2:
- vcpu->arch.cr2 = val;
- break;
- case 3:
- kvm_set_cr3(vcpu, val);
- break;
- case 4:
- kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
- break;
- case 8:
- kvm_set_cr8(vcpu, val & 0xfUL);
- break;
- default:
- vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
- }
+ trace_kvm_inj_exception(vcpu->arch.exception.vector,
+ vcpu->arch.exception.has_error_code,
+ vcpu->arch.exception.error_code,
+ vcpu->arch.exception.injected);
+
+ kvm_x86_call(inject_exception)(vcpu);
}
-static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
+/*
+ * Check for any event (interrupt or exception) that is ready to be injected,
+ * and if there is at least one event, inject the event with the highest
+ * priority. This handles both "pending" events, i.e. events that have never
+ * been injected into the guest, and "injected" events, i.e. events that were
+ * injected as part of a previous VM-Enter, but weren't successfully delivered
+ * and need to be re-injected.
+ *
+ * Note, this is not guaranteed to be invoked on a guest instruction boundary,
+ * i.e. doesn't guarantee that there's an event window in the guest. KVM must
+ * be able to inject exceptions in the "middle" of an instruction, and so must
+ * also be able to re-inject NMIs and IRQs in the middle of an instruction.
+ * I.e. for exceptions and re-injected events, NOT invoking this on instruction
+ * boundaries is necessary and correct.
+ *
+ * For simplicity, KVM uses a single path to inject all events (except events
+ * that are injected directly from L1 to L2) and doesn't explicitly track
+ * instruction boundaries for asynchronous events. However, because VM-Exits
+ * that can occur during instruction execution typically result in KVM skipping
+ * the instruction or injecting an exception, e.g. instruction and exception
+ * intercepts, and because pending exceptions have higher priority than pending
+ * interrupts, KVM still honors instruction boundaries in most scenarios.
+ *
+ * But, if a VM-Exit occurs during instruction execution, and KVM does NOT skip
+ * the instruction or inject an exception, then KVM can incorrectly inject a new
+ * asynchronous event if the event became pending after the CPU fetched the
+ * instruction (in the guest). E.g. if a page fault (#PF, #NPF, EPT violation)
+ * occurs and is resolved by KVM, a coincident NMI, SMI, IRQ, etc... can be
+ * injected on the restarted instruction instead of being deferred until the
+ * instruction completes.
+ *
+ * In practice, this virtualization hole is unlikely to be observed by the
+ * guest, and even less likely to cause functional problems. To detect the
+ * hole, the guest would have to trigger an event on a side effect of an early
+ * phase of instruction execution, e.g. on the instruction fetch from memory.
+ * And for it to be a functional problem, the guest would need to depend on the
+ * ordering between that side effect, the instruction completing, _and_ the
+ * delivery of the asynchronous event.
+ */
+static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
+ bool *req_immediate_exit)
{
- struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
- int j, nent = vcpu->arch.cpuid_nent;
+ bool can_inject;
+ int r;
+
+ /*
+ * Process nested events first, as nested VM-Exit supersedes event
+ * re-injection. If there's an event queued for re-injection, it will
+ * be saved into the appropriate vmc{b,s}12 fields on nested VM-Exit.
+ */
+ if (is_guest_mode(vcpu))
+ r = kvm_check_nested_events(vcpu);
+ else
+ r = 0;
+
+ /*
+ * Re-inject exceptions and events *especially* if immediate entry+exit
+ * to/from L2 is needed, as any event that has already been injected
+ * into L2 needs to complete its lifecycle before injecting a new event.
+ *
+ * Don't re-inject an NMI or interrupt if there is a pending exception.
+ * This collision arises if an exception occurred while vectoring the
+ * injected event, KVM intercepted said exception, and KVM ultimately
+ * determined the fault belongs to the guest and queues the exception
+ * for injection back into the guest.
+ *
+ * "Injected" interrupts can also collide with pending exceptions if
+ * userspace ignores the "ready for injection" flag and blindly queues
+ * an interrupt. In that case, prioritizing the exception is correct,
+ * as the exception "occurred" before the exit to userspace. Trap-like
+ * exceptions, e.g. most #DBs, have higher priority than interrupts.
+ * And while fault-like exceptions, e.g. #GP and #PF, are the lowest
+ * priority, they're only generated (pended) during instruction
+ * execution, and interrupts are recognized at instruction boundaries.
+ * Thus a pending fault-like exception means the fault occurred on the
+ * *previous* instruction and must be serviced prior to recognizing any
+ * new events in order to fully complete the previous instruction.
+ */
+ if (vcpu->arch.exception.injected)
+ kvm_inject_exception(vcpu);
+ else if (kvm_is_exception_pending(vcpu))
+ ; /* see above */
+ else if (vcpu->arch.nmi_injected)
+ kvm_x86_call(inject_nmi)(vcpu);
+ else if (vcpu->arch.interrupt.injected)
+ kvm_x86_call(inject_irq)(vcpu, true);
- e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
- /* when no next entry is found, the current entry[i] is reselected */
- for (j = i + 1; ; j = (j + 1) % nent) {
- struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
- if (ej->function == e->function) {
- ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
- return j;
+ /*
+ * Exceptions that morph to VM-Exits are handled above, and pending
+ * exceptions on top of injected exceptions that do not VM-Exit should
+ * either morph to #DF or, sadly, override the injected exception.
+ */
+ WARN_ON_ONCE(vcpu->arch.exception.injected &&
+ vcpu->arch.exception.pending);
+
+ /*
+ * Bail if immediate entry+exit to/from the guest is needed to complete
+ * nested VM-Enter or event re-injection so that a different pending
+ * event can be serviced (or if KVM needs to exit to userspace).
+ *
+ * Otherwise, continue processing events even if VM-Exit occurred. The
+ * VM-Exit will have cleared exceptions that were meant for L2, but
+ * there may now be events that can be injected into L1.
+ */
+ if (r < 0)
+ goto out;
+
+ /*
+ * A pending exception VM-Exit should either result in nested VM-Exit
+ * or force an immediate re-entry and exit to/from L2, and exception
+ * VM-Exits cannot be injected (flag should _never_ be set).
+ */
+ WARN_ON_ONCE(vcpu->arch.exception_vmexit.injected ||
+ vcpu->arch.exception_vmexit.pending);
+
+ /*
+ * New events, other than exceptions, cannot be injected if KVM needs
+ * to re-inject a previous event. See above comments on re-injecting
+ * for why pending exceptions get priority.
+ */
+ can_inject = !kvm_event_needs_reinjection(vcpu);
+
+ if (vcpu->arch.exception.pending) {
+ /*
+ * Fault-class exceptions, except #DBs, set RF=1 in the RFLAGS
+ * value pushed on the stack. Trap-like exceptions and all #DBs
+ * leave RF as-is (KVM follows Intel's behavior in this regard;
+ * AMD states that code breakpoint #DBs explicitly clear RF=0).
+ *
+ * Note, most versions of Intel's SDM and AMD's APM incorrectly
+ * describe the behavior of General Detect #DBs, which are
+ * fault-like. They do _not_ set RF, a la code breakpoints.
+ */
+ if (exception_type(vcpu->arch.exception.vector) == EXCPT_FAULT)
+ __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
+ X86_EFLAGS_RF);
+
+ if (vcpu->arch.exception.vector == DB_VECTOR) {
+ kvm_deliver_exception_payload(vcpu, &vcpu->arch.exception);
+ if (vcpu->arch.dr7 & DR7_GD) {
+ vcpu->arch.dr7 &= ~DR7_GD;
+ kvm_update_dr7(vcpu);
+ }
}
+
+ kvm_inject_exception(vcpu);
+
+ vcpu->arch.exception.pending = false;
+ vcpu->arch.exception.injected = true;
+
+ can_inject = false;
}
- return 0; /* silence gcc, even though control never reaches here */
-}
-/* find an entry with matching function, matching index (if needed), and that
- * should be read next (if it's stateful) */
-static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
- u32 function, u32 index)
-{
- if (e->function != function)
- return 0;
- if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
- return 0;
- if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
- !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
+ /* Don't inject interrupts if the user asked to avoid doing so */
+ if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ)
return 0;
- return 1;
-}
-struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
- u32 function, u32 index)
-{
- int i;
- struct kvm_cpuid_entry2 *best = NULL;
+ /*
+ * Finally, inject interrupt events. If an event cannot be injected
+ * due to architectural conditions (e.g. IF=0), a window-open exit
+ * will re-request KVM_REQ_EVENT. Sometimes, however, an event is pending
+ * and can architecturally be injected, but we cannot do it right now:
+ * an interrupt could have arrived just now and we have to inject it
+ * as a vmexit, or there could already be an event in the queue, which is
+ * indicated by can_inject. In that case we request an immediate exit
+ * in order to make progress and get back here for another iteration.
+ * The kvm_x86_ops hooks communicate this by returning -EBUSY.
+ */
+#ifdef CONFIG_KVM_SMM
+ if (vcpu->arch.smi_pending) {
+ r = can_inject ? kvm_x86_call(smi_allowed)(vcpu, true) :
+ -EBUSY;
+ if (r < 0)
+ goto out;
+ if (r) {
+ vcpu->arch.smi_pending = false;
+ ++vcpu->arch.smi_count;
+ enter_smm(vcpu);
+ can_inject = false;
+ } else
+ kvm_x86_call(enable_smi_window)(vcpu);
+ }
+#endif
+
+ if (vcpu->arch.nmi_pending) {
+ r = can_inject ? kvm_x86_call(nmi_allowed)(vcpu, true) :
+ -EBUSY;
+ if (r < 0)
+ goto out;
+ if (r) {
+ --vcpu->arch.nmi_pending;
+ vcpu->arch.nmi_injected = true;
+ kvm_x86_call(inject_nmi)(vcpu);
+ can_inject = false;
+ WARN_ON(kvm_x86_call(nmi_allowed)(vcpu, true) < 0);
+ }
+ if (vcpu->arch.nmi_pending)
+ kvm_x86_call(enable_nmi_window)(vcpu);
+ }
- for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
- struct kvm_cpuid_entry2 *e;
+ if (kvm_cpu_has_injectable_intr(vcpu)) {
+ r = can_inject ? kvm_x86_call(interrupt_allowed)(vcpu, true) :
+ -EBUSY;
+ if (r < 0)
+ goto out;
+ if (r) {
+ int irq = kvm_cpu_get_interrupt(vcpu);
- e = &vcpu->arch.cpuid_entries[i];
- if (is_matching_cpuid_entry(e, function, index)) {
- if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
- move_to_next_stateful_cpuid_entry(vcpu, i);
- best = e;
- break;
+ if (!WARN_ON_ONCE(irq == -1)) {
+ kvm_queue_interrupt(vcpu, irq, false);
+ kvm_x86_call(inject_irq)(vcpu, false);
+ WARN_ON(kvm_x86_call(interrupt_allowed)(vcpu, true) < 0);
+ }
}
- /*
- * Both basic or both extended?
- */
- if (((e->function ^ function) & 0x80000000) == 0)
- if (!best || e->function > best->function)
- best = e;
+ if (kvm_cpu_has_injectable_intr(vcpu))
+ kvm_x86_call(enable_irq_window)(vcpu);
+ }
+
+ if (is_guest_mode(vcpu) &&
+ kvm_x86_ops.nested_ops->has_events &&
+ kvm_x86_ops.nested_ops->has_events(vcpu, true))
+ *req_immediate_exit = true;
+
+ /*
+ * KVM must never queue a new exception while injecting an event; KVM
+ * is done emulating and should only propagate the to-be-injected event
+ * to the VMCS/VMCB. Queueing a new exception can put the vCPU into an
+ * infinite loop as KVM will bail from VM-Enter to inject the pending
+ * exception and start the cycle all over.
+ *
+ * Exempt triple faults as they have special handling and won't put the
+ * vCPU into an infinite loop. Triple fault can be queued when running
+ * VMX without unrestricted guest, as that requires KVM to emulate Real
+ * Mode events (see kvm_inject_realmode_interrupt()).
+ */
+ WARN_ON_ONCE(vcpu->arch.exception.pending ||
+ vcpu->arch.exception_vmexit.pending);
+ return 0;
+
+out:
+ if (r == -EBUSY) {
+ *req_immediate_exit = true;
+ r = 0;
}
- return best;
+ return r;
}
-int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
+static void process_nmi(struct kvm_vcpu *vcpu)
{
- struct kvm_cpuid_entry2 *best;
+ unsigned int limit;
+
+ /*
+ * x86 is limited to one NMI pending, but because KVM can't react to
+ * incoming NMIs as quickly as bare metal, e.g. if the vCPU is
+ * scheduled out, KVM needs to play nice with two queued NMIs showing
+ * up at the same time. To handle this scenario, allow two NMIs to be
+ * (temporarily) pending so long as NMIs are not blocked and KVM is not
+ * waiting for a previous NMI injection to complete (which effectively
+ * blocks NMIs). KVM will immediately inject one of the two NMIs, and
+ * will request an NMI window to handle the second NMI.
+ */
+ if (kvm_x86_call(get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected)
+ limit = 1;
+ else
+ limit = 2;
+
+ /*
+ * Adjust the limit to account for pending virtual NMIs, which aren't
+ * tracked in vcpu->arch.nmi_pending.
+ */
+ if (kvm_x86_call(is_vnmi_pending)(vcpu))
+ limit--;
+
+ vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
+ vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
+
+ if (vcpu->arch.nmi_pending &&
+ (kvm_x86_call(set_vnmi_pending)(vcpu)))
+ vcpu->arch.nmi_pending--;
- best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
- if (best)
- return best->eax & 0xff;
- return 36;
+ if (vcpu->arch.nmi_pending)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
}
-void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
+/* Return total number of NMIs pending injection to the VM */
+int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu)
{
- u32 function, index;
- struct kvm_cpuid_entry2 *best;
-
- function = kvm_register_read(vcpu, VCPU_REGS_RAX);
- index = kvm_register_read(vcpu, VCPU_REGS_RCX);
- kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
- kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
- kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
- kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
- best = kvm_find_cpuid_entry(vcpu, function, index);
- if (best) {
- kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
- kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
- kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
- kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
- }
- kvm_x86_ops->skip_emulated_instruction(vcpu);
- KVMTRACE_5D(CPUID, vcpu, function,
- (u32)kvm_register_read(vcpu, VCPU_REGS_RAX),
- (u32)kvm_register_read(vcpu, VCPU_REGS_RBX),
- (u32)kvm_register_read(vcpu, VCPU_REGS_RCX),
- (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler);
+ return vcpu->arch.nmi_pending +
+ kvm_x86_call(is_vnmi_pending)(vcpu);
}
-EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
-/*
- * Check if userspace requested an interrupt window, and that the
- * interrupt window is open.
- *
- * No need to exit to userspace if we already have an interrupt queued.
- */
-static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
- struct kvm_run *kvm_run)
+void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
+ unsigned long *vcpu_bitmap)
{
- return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
- kvm_run->request_interrupt_window &&
- kvm_arch_interrupt_allowed(vcpu));
+ kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC, vcpu_bitmap);
}
-static void post_kvm_run_save(struct kvm_vcpu *vcpu,
- struct kvm_run *kvm_run)
+void kvm_make_scan_ioapic_request(struct kvm *kvm)
{
- kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
- kvm_run->cr8 = kvm_get_cr8(vcpu);
- kvm_run->apic_base = kvm_get_apic_base(vcpu);
- if (irqchip_in_kernel(vcpu->kvm))
- kvm_run->ready_for_interrupt_injection = 1;
- else
- kvm_run->ready_for_interrupt_injection =
- kvm_arch_interrupt_allowed(vcpu) &&
- !kvm_cpu_has_interrupt(vcpu) &&
- !kvm_event_needs_reinjection(vcpu);
+ kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
}
-static void vapic_enter(struct kvm_vcpu *vcpu)
+void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- struct page *page;
+ bool activate;
- if (!apic || !apic->vapic_addr)
+ if (!lapic_in_kernel(vcpu))
return;
- page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
+ down_read(&vcpu->kvm->arch.apicv_update_lock);
+ preempt_disable();
+
+	/* Do not activate APICv when the APIC is disabled */
+ activate = kvm_vcpu_apicv_activated(vcpu) &&
+ (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED);
- vcpu->arch.apic->vapic_page = page;
+ if (apic->apicv_active == activate)
+ goto out;
+
+ apic->apicv_active = activate;
+ kvm_apic_update_apicv(vcpu);
+ kvm_x86_call(refresh_apicv_exec_ctrl)(vcpu);
+
+ /*
+ * When APICv gets disabled, we may still have injected interrupts
+ * pending. At the same time, KVM_REQ_EVENT may not be set as APICv was
+ * still active when the interrupt got accepted. Make sure
+ * kvm_check_and_inject_events() is called to check for that.
+ */
+ if (!apic->apicv_active)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+out:
+ preempt_enable();
+ up_read(&vcpu->kvm->arch.apicv_update_lock);
}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_vcpu_update_apicv);
-static void vapic_exit(struct kvm_vcpu *vcpu)
+static void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
{
- struct kvm_lapic *apic = vcpu->arch.apic;
+ if (!lapic_in_kernel(vcpu))
+ return;
- if (!apic || !apic->vapic_addr)
+ /*
+ * Due to sharing page tables across vCPUs, the xAPIC memslot must be
+	 * deleted if any vCPU has xAPIC virtualization and x2APIC enabled, but
+	 * hardware doesn't support x2APIC virtualization. E.g. some AMD
+ * CPUs support AVIC but not x2APIC. KVM still allows enabling AVIC in
+ * this case so that KVM can use the AVIC doorbell to inject interrupts
+ * to running vCPUs, but KVM must not create SPTEs for the APIC base as
+ * the vCPU would incorrectly be able to access the vAPIC page via MMIO
+ * despite being in x2APIC mode. For simplicity, inhibiting the APIC
+ * access page is sticky.
+ */
+ if (apic_x2apic_mode(vcpu->arch.apic) &&
+ kvm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization)
+ kvm_inhibit_apic_access_page(vcpu);
+
+ __kvm_vcpu_update_apicv(vcpu);
+}
+
+void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
+ enum kvm_apicv_inhibit reason, bool set)
+{
+ unsigned long old, new;
+
+ lockdep_assert_held_write(&kvm->arch.apicv_update_lock);
+
+ if (!(kvm_x86_ops.required_apicv_inhibits & BIT(reason)))
return;
- down_read(&vcpu->kvm->slots_lock);
- kvm_release_page_dirty(apic->vapic_page);
- mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
- up_read(&vcpu->kvm->slots_lock);
+ old = new = kvm->arch.apicv_inhibit_reasons;
+
+ set_or_clear_apicv_inhibit(&new, reason, set);
+
+ if (!!old != !!new) {
+ /*
+ * Kick all vCPUs before setting apicv_inhibit_reasons to avoid
+ * false positives in the sanity check WARN in vcpu_enter_guest().
+ * This task will wait for all vCPUs to ack the kick IRQ before
+ * updating apicv_inhibit_reasons, and all other vCPUs will
+ * block on acquiring apicv_update_lock so that vCPUs can't
+ * redo vcpu_enter_guest() without seeing the new inhibit state.
+ *
+ * Note, holding apicv_update_lock and taking it in the read
+ * side (handling the request) also prevents other vCPUs from
+ * servicing the request with a stale apicv_inhibit_reasons.
+ */
+ kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE);
+ kvm->arch.apicv_inhibit_reasons = new;
+ if (new) {
+ unsigned long gfn = gpa_to_gfn(APIC_DEFAULT_PHYS_BASE);
+ int idx = srcu_read_lock(&kvm->srcu);
+
+ kvm_zap_gfn_range(kvm, gfn, gfn+1);
+ srcu_read_unlock(&kvm->srcu, idx);
+ }
+ } else {
+ kvm->arch.apicv_inhibit_reasons = new;
+ }
}
-static void update_cr8_intercept(struct kvm_vcpu *vcpu)
+void kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
+ enum kvm_apicv_inhibit reason, bool set)
{
- int max_irr, tpr;
+ if (!enable_apicv)
+ return;
- if (!kvm_x86_ops->update_cr8_intercept)
+ down_write(&kvm->arch.apicv_update_lock);
+ __kvm_set_or_clear_apicv_inhibit(kvm, reason, set);
+ up_write(&kvm->arch.apicv_update_lock);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_or_clear_apicv_inhibit);
+
+static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
+{
+ if (!kvm_apic_present(vcpu))
return;
- if (!vcpu->arch.apic->vapic_addr)
- max_irr = kvm_lapic_find_highest_irr(vcpu);
- else
- max_irr = -1;
+ bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256);
+ vcpu->arch.highest_stale_pending_ioapic_eoi = -1;
- if (max_irr != -1)
- max_irr >>= 4;
+ kvm_x86_call(sync_pir_to_irr)(vcpu);
- tpr = kvm_lapic_get_cr8(vcpu);
+ if (irqchip_split(vcpu->kvm))
+ kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
+#ifdef CONFIG_KVM_IOAPIC
+ else if (ioapic_in_kernel(vcpu->kvm))
+ kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
+#endif
- kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
+ if (is_guest_mode(vcpu))
+ vcpu->arch.load_eoi_exitmap_pending = true;
+ else
+ kvm_make_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu);
}
-static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
{
- if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
- kvm_x86_ops->set_interrupt_shadow(vcpu, 0);
+ if (!kvm_apic_hw_enabled(vcpu->arch.apic))
+ return;
+
+#ifdef CONFIG_KVM_HYPERV
+ if (to_hv_vcpu(vcpu)) {
+ u64 eoi_exit_bitmap[4];
- /* try to reinject previous events if any */
- if (vcpu->arch.nmi_injected) {
- kvm_x86_ops->set_nmi(vcpu);
+ bitmap_or((ulong *)eoi_exit_bitmap,
+ vcpu->arch.ioapic_handled_vectors,
+ to_hv_synic(vcpu)->vec_bitmap, 256);
+ kvm_x86_call(load_eoi_exitmap)(vcpu, eoi_exit_bitmap);
return;
}
+#endif
+ kvm_x86_call(load_eoi_exitmap)(
+ vcpu, (u64 *)vcpu->arch.ioapic_handled_vectors);
+}
- if (vcpu->arch.interrupt.pending) {
- kvm_x86_ops->set_irq(vcpu);
+void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
+{
+ kvm_x86_call(guest_memory_reclaimed)(kvm);
+}
+
+static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
+{
+ if (!lapic_in_kernel(vcpu))
return;
- }
- /* try to inject new event if pending */
- if (vcpu->arch.nmi_pending) {
- if (kvm_x86_ops->nmi_allowed(vcpu)) {
- vcpu->arch.nmi_pending = false;
- vcpu->arch.nmi_injected = true;
- kvm_x86_ops->set_nmi(vcpu);
- }
- } else if (kvm_cpu_has_interrupt(vcpu)) {
- if (kvm_x86_ops->interrupt_allowed(vcpu)) {
- kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
- false);
- kvm_x86_ops->set_irq(vcpu);
- }
- }
+ kvm_x86_call(set_apic_access_page_addr)(vcpu);
}
-static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+/*
+ * Called within kvm->srcu read side.
+ * Returns 1 to let vcpu_run() continue the guest execution loop without
+ * exiting to userspace. Otherwise, the value will be returned to
+ * userspace.
+ */
+static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
{
int r;
- bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
- kvm_run->request_interrupt_window;
+ bool req_int_win =
+ dm_request_for_irq_injection(vcpu) &&
+ kvm_cpu_accept_dm_intr(vcpu);
+ fastpath_t exit_fastpath;
+ u64 run_flags, debug_ctl;
- if (vcpu->requests)
- if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
- kvm_mmu_unload(vcpu);
+ bool req_immediate_exit = false;
- r = kvm_mmu_reload(vcpu);
- if (unlikely(r))
- goto out;
+ if (kvm_request_pending(vcpu)) {
+ if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) {
+ r = -EIO;
+ goto out;
+ }
+
+ if (kvm_dirty_ring_check_request(vcpu)) {
+ r = 0;
+ goto out;
+ }
- if (vcpu->requests) {
- if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
+ if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
+ if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) {
+ r = 0;
+ goto out;
+ }
+ }
+ if (kvm_check_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
+ kvm_mmu_free_obsolete_roots(vcpu);
+ if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
__kvm_migrate_timers(vcpu);
- if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests))
- kvm_write_guest_time(vcpu);
- if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
+ if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
+ kvm_update_masterclock(vcpu->kvm);
+ if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu))
+ kvm_gen_kvmclock_update(vcpu);
+ if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
+ r = kvm_guest_time_update(vcpu);
+ if (unlikely(r))
+ goto out;
+ }
+ if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
kvm_mmu_sync_roots(vcpu);
- if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
- kvm_x86_ops->tlb_flush(vcpu);
- if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
- &vcpu->requests)) {
- kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS;
+ if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu))
+ kvm_mmu_load_pgd(vcpu);
+
+ /*
+ * Note, the order matters here, as flushing "all" TLB entries
+ * also flushes the "current" TLB entries, i.e. servicing the
+ * flush "all" will clear any request to flush "current".
+ */
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
+ kvm_vcpu_flush_tlb_all(vcpu);
+
+ kvm_service_local_tlb_flush_requests(vcpu);
+
+ /*
+ * Fall back to a "full" guest flush if Hyper-V's precise
+ * flushing fails. Note, Hyper-V's flushing is per-vCPU, but
+ * the flushes are considered "remote" and not "local" because
+ * the requests can be initiated from other vCPUs.
+ */
+#ifdef CONFIG_KVM_HYPERV
+ if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu) &&
+ kvm_hv_vcpu_flush_tlb(vcpu))
+ kvm_vcpu_flush_tlb_guest(vcpu);
+#endif
+
+ if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
r = 0;
goto out;
}
- if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
- kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
+ if (kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
+ if (is_guest_mode(vcpu))
+ kvm_x86_ops.nested_ops->triple_fault(vcpu);
+
+ if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
+ vcpu->mmio_needed = 0;
+ r = 0;
+ goto out;
+ }
+ }
+ if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
+ /* Page is swapped out. Do synthetic halt */
+ vcpu->arch.apf.halted = true;
+ r = 1;
+ goto out;
+ }
+ if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
+ record_steal_time(vcpu);
+ if (kvm_check_request(KVM_REQ_PMU, vcpu))
+ kvm_pmu_handle_event(vcpu);
+ if (kvm_check_request(KVM_REQ_PMI, vcpu))
+ kvm_pmu_deliver_pmi(vcpu);
+#ifdef CONFIG_KVM_SMM
+ if (kvm_check_request(KVM_REQ_SMI, vcpu))
+ process_smi(vcpu);
+#endif
+ if (kvm_check_request(KVM_REQ_NMI, vcpu))
+ process_nmi(vcpu);
+ if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) {
+ BUG_ON(vcpu->arch.pending_ioapic_eoi > 255);
+ if (test_bit(vcpu->arch.pending_ioapic_eoi,
+ vcpu->arch.ioapic_handled_vectors)) {
+ vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI;
+ vcpu->run->eoi.vector =
+ vcpu->arch.pending_ioapic_eoi;
+ r = 0;
+ goto out;
+ }
+ }
+ if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
+ vcpu_scan_ioapic(vcpu);
+ if (kvm_check_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu))
+ vcpu_load_eoi_exitmap(vcpu);
+ if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
+ kvm_vcpu_reload_apic_access_page(vcpu);
+#ifdef CONFIG_KVM_HYPERV
+ if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+ vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH;
+ vcpu->run->system_event.ndata = 0;
+ r = 0;
+ goto out;
+ }
+ if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+ vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET;
+ vcpu->run->system_event.ndata = 0;
+ r = 0;
+ goto out;
+ }
+ if (kvm_check_request(KVM_REQ_HV_EXIT, vcpu)) {
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ vcpu->run->exit_reason = KVM_EXIT_HYPERV;
+ vcpu->run->hyperv = hv_vcpu->exit;
r = 0;
goto out;
}
+
+ /*
+ * KVM_REQ_HV_STIMER has to be processed after
+ * KVM_REQ_CLOCK_UPDATE, because Hyper-V SynIC timers
+ * depend on the guest clock being up-to-date
+ */
+ if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu))
+ kvm_hv_process_stimers(vcpu);
+#endif
+ if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu))
+ kvm_vcpu_update_apicv(vcpu);
+ if (kvm_check_request(KVM_REQ_APF_READY, vcpu))
+ kvm_check_async_pf_completion(vcpu);
+
+ if (kvm_check_request(KVM_REQ_RECALC_INTERCEPTS, vcpu))
+ kvm_x86_call(recalc_intercepts)(vcpu);
+
+ if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu))
+ kvm_x86_call(update_cpu_dirty_logging)(vcpu);
+
+ if (kvm_check_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu)) {
+ kvm_vcpu_reset(vcpu, true);
+ if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) {
+ r = 1;
+ goto out;
+ }
+ }
+ }
+
+ if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win ||
+ kvm_xen_has_interrupt(vcpu)) {
+ ++vcpu->stat.req_event;
+ r = kvm_apic_accept_events(vcpu);
+ if (r < 0) {
+ r = 0;
+ goto out;
+ }
+ if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
+ r = 1;
+ goto out;
+ }
+
+ r = kvm_check_and_inject_events(vcpu, &req_immediate_exit);
+ if (r < 0) {
+ r = 0;
+ goto out;
+ }
+ if (req_int_win)
+ kvm_x86_call(enable_irq_window)(vcpu);
+
+ if (kvm_lapic_enabled(vcpu)) {
+ update_cr8_intercept(vcpu);
+ kvm_lapic_sync_to_vapic(vcpu);
+ }
+ }
+
+ r = kvm_mmu_reload(vcpu);
+ if (unlikely(r)) {
+ goto cancel_injection;
}
preempt_disable();
- kvm_x86_ops->prepare_guest_switch(vcpu);
- kvm_load_guest_fpu(vcpu);
+ kvm_x86_call(prepare_switch_to_guest)(vcpu);
+ /*
+ * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt
+	 * IPIs are then delayed until after guest entry, which ensures that they
+ * result in virtual interrupt delivery.
+ */
local_irq_disable();
- clear_bit(KVM_REQ_KICK, &vcpu->requests);
- smp_mb__after_clear_bit();
+ /* Store vcpu->apicv_active before vcpu->mode. */
+ smp_store_release(&vcpu->mode, IN_GUEST_MODE);
+
+ kvm_vcpu_srcu_read_unlock(vcpu);
+
+ /*
+ * 1) We should set ->mode before checking ->requests. Please see
+ * the comment in kvm_vcpu_exiting_guest_mode().
+ *
+ * 2) For APICv, we should set ->mode before checking PID.ON. This
+ * pairs with the memory barrier implicit in pi_test_and_set_on
+ * (see vmx_deliver_posted_interrupt).
+ *
+ * 3) This also orders the write to mode from any reads to the page
+ * tables done while the VCPU is running. Please see the comment
+ * in kvm_flush_remote_tlbs.
+ */
+ smp_mb__after_srcu_read_unlock();
- if (vcpu->requests || need_resched() || signal_pending(current)) {
+ /*
+ * Process pending posted interrupts to handle the case where the
+ * notification IRQ arrived in the host, or was never sent (because the
+ * target vCPU wasn't running). Do this regardless of the vCPU's APICv
+	 * status, as KVM doesn't update assigned devices when APICv is inhibited,
+ * i.e. they can post interrupts even if APICv is temporarily disabled.
+ */
+ if (kvm_lapic_enabled(vcpu))
+ kvm_x86_call(sync_pir_to_irr)(vcpu);
+
+ if (kvm_vcpu_exit_request(vcpu)) {
+ vcpu->mode = OUTSIDE_GUEST_MODE;
+ smp_wmb();
local_irq_enable();
preempt_enable();
+ kvm_vcpu_srcu_read_lock(vcpu);
r = 1;
- goto out;
+ goto cancel_injection;
}
- if (vcpu->arch.exception.pending)
- __queue_exception(vcpu);
- else
- inject_pending_irq(vcpu, kvm_run);
-
- /* enable NMI/IRQ window open exits if needed */
- if (vcpu->arch.nmi_pending)
- kvm_x86_ops->enable_nmi_window(vcpu);
- else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
- kvm_x86_ops->enable_irq_window(vcpu);
-
- if (kvm_lapic_enabled(vcpu)) {
- update_cr8_intercept(vcpu);
- kvm_lapic_sync_to_vapic(vcpu);
+ run_flags = 0;
+ if (req_immediate_exit) {
+ run_flags |= KVM_RUN_FORCE_IMMEDIATE_EXIT;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
}
- up_read(&vcpu->kvm->slots_lock);
+ fpregs_assert_state_consistent();
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+ switch_fpu_return();
- kvm_guest_enter();
+ if (vcpu->arch.guest_fpu.xfd_err)
+ wrmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
- get_debugreg(vcpu->arch.host_dr6, 6);
- get_debugreg(vcpu->arch.host_dr7, 7);
- if (unlikely(vcpu->arch.switch_db_regs)) {
- get_debugreg(vcpu->arch.host_db[0], 0);
- get_debugreg(vcpu->arch.host_db[1], 1);
- get_debugreg(vcpu->arch.host_db[2], 2);
- get_debugreg(vcpu->arch.host_db[3], 3);
+ kvm_load_xfeatures(vcpu, true);
- set_debugreg(0, 7);
+ if (unlikely(vcpu->arch.switch_db_regs &&
+ !(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH))) {
+ set_debugreg(DR7_FIXED_1, 7);
set_debugreg(vcpu->arch.eff_db[0], 0);
set_debugreg(vcpu->arch.eff_db[1], 1);
set_debugreg(vcpu->arch.eff_db[2], 2);
set_debugreg(vcpu->arch.eff_db[3], 3);
+		/* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in the guest. */
+ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
+ run_flags |= KVM_RUN_LOAD_GUEST_DR6;
+ } else if (unlikely(hw_breakpoint_active())) {
+ set_debugreg(DR7_FIXED_1, 7);
+ }
+
+ /*
+ * Refresh the host DEBUGCTL snapshot after disabling IRQs, as DEBUGCTL
+ * can be modified in IRQ context, e.g. via SMP function calls. Inform
+ * vendor code if any host-owned bits were changed, e.g. so that the
+ * value loaded into hardware while running the guest can be updated.
+ */
+ debug_ctl = get_debugctlmsr();
+ if ((debug_ctl ^ vcpu->arch.host_debugctl) & kvm_x86_ops.HOST_OWNED_DEBUGCTL &&
+ !vcpu->arch.guest_state_protected)
+ run_flags |= KVM_RUN_LOAD_DEBUGCTL;
+ vcpu->arch.host_debugctl = debug_ctl;
+
+ guest_timing_enter_irqoff();
+
+ /*
+ * Swap PKRU with hardware breakpoints disabled to minimize the number
+ * of flows where non-KVM code can run with guest state loaded.
+ */
+ kvm_load_guest_pkru(vcpu);
+
+ for (;;) {
+ /*
+ * Assert that vCPU vs. VM APICv state is consistent. An APICv
+ * update must kick and wait for all vCPUs before toggling the
+ * per-VM state, and responding vCPUs must wait for the update
+ * to complete before servicing KVM_REQ_APICV_UPDATE.
+ */
+ WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) &&
+ (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED));
+
+ exit_fastpath = kvm_x86_call(vcpu_run)(vcpu, run_flags);
+ if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
+ break;
+
+ if (kvm_lapic_enabled(vcpu))
+ kvm_x86_call(sync_pir_to_irr)(vcpu);
+
+ if (unlikely(kvm_vcpu_exit_request(vcpu))) {
+ exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;
+ break;
+ }
+
+ run_flags = 0;
+
+ /* Note, VM-Exits that go down the "slow" path are accounted below. */
+ ++vcpu->stat.exits;
}
- KVMTRACE_0D(VMENTRY, vcpu, entryexit);
- kvm_x86_ops->run(vcpu, kvm_run);
+ kvm_load_host_pkru(vcpu);
- if (unlikely(vcpu->arch.switch_db_regs)) {
- set_debugreg(0, 7);
- set_debugreg(vcpu->arch.host_db[0], 0);
- set_debugreg(vcpu->arch.host_db[1], 1);
- set_debugreg(vcpu->arch.host_db[2], 2);
- set_debugreg(vcpu->arch.host_db[3], 3);
+ /*
+	 * Sync the dirty debug registers here, before restoring the host's
+	 * debug registers and before handling the vmexit, so that a DR access
+	 * vmexit can (a) read the correct value of the debug registers and
+	 * (b) set KVM_DEBUGREG_WONT_EXIT again.
+ */
+ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
+ WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
+ WARN_ON(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH);
+ kvm_x86_call(sync_dirty_debug_regs)(vcpu);
+ kvm_update_dr0123(vcpu);
+ kvm_update_dr7(vcpu);
}
- set_debugreg(vcpu->arch.host_dr6, 6);
- set_debugreg(vcpu->arch.host_dr7, 7);
- set_bit(KVM_REQ_KICK, &vcpu->requests);
- local_irq_enable();
+ /*
+ * If the guest has used debug registers, at least dr7
+ * will be disabled while returning to the host.
+ * If we don't have active breakpoints in the host, we don't
+ * care about the messed up debug address registers. But if
+ * we have some of them active, restore the old state.
+ */
+ if (hw_breakpoint_active())
+ hw_breakpoint_restore();
- ++vcpu->stat.exits;
+ vcpu->arch.last_vmentry_cpu = vcpu->cpu;
+ vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+
+ vcpu->mode = OUTSIDE_GUEST_MODE;
+ smp_wmb();
+
+ kvm_load_xfeatures(vcpu, false);
/*
- * We must have an instruction between local_irq_enable() and
- * kvm_guest_exit(), so the timer interrupt isn't delayed by
- * the interrupt shadow. The stat.exits increment will do nicely.
- * But we need to prevent reordering, hence this barrier():
+ * Sync xfd before calling handle_exit_irqoff() which may
+ * rely on the fact that guest_fpu::xfd is up-to-date (e.g.
+ * in #NM irqoff handler).
*/
- barrier();
+ if (vcpu->arch.xfd_no_write_intercept)
+ fpu_sync_guest_vmexit_xfd_state();
+
+ kvm_x86_call(handle_exit_irqoff)(vcpu);
+
+ if (vcpu->arch.guest_fpu.xfd_err)
+ wrmsrq(MSR_IA32_XFD_ERR, 0);
- kvm_guest_exit();
+ /*
+ * Mark this CPU as needing a branch predictor flush before running
+ * userspace. Must be done before enabling preemption to ensure it gets
+ * set for the CPU that actually ran the guest, and not the CPU that it
+ * may migrate to.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_IBPB_EXIT_TO_USER))
+ this_cpu_write(x86_ibpb_exit_to_user, true);
+ /*
+ * Consume any pending interrupts, including the possible source of
+ * VM-Exit on SVM and any ticks that occur between VM-Exit and now.
+ * An instruction is required after local_irq_enable() to fully unblock
+	 * interrupts on processors that implement an interrupt shadow; the
+	 * stat.exits increment will do nicely.
+ */
+ kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
+ local_irq_enable();
+ ++vcpu->stat.exits;
+ local_irq_disable();
+ kvm_after_interrupt(vcpu);
+
+ /*
+ * Wait until after servicing IRQs to account guest time so that any
+ * ticks that occurred while running the guest are properly accounted
+ * to the guest. Waiting until IRQs are enabled degrades the accuracy
+ * of accounting via context tracking, but the loss of accuracy is
+ * acceptable for all known use cases.
+ */
+ guest_timing_exit_irqoff();
+
+ local_irq_enable();
preempt_enable();
- down_read(&vcpu->kvm->slots_lock);
+ kvm_vcpu_srcu_read_lock(vcpu);
+
+ /*
+	 * Call this to ensure WC buffers in the guest are evicted after each
+	 * VM-Exit, so that the evicted WC writes can be snooped across all CPUs.
+ */
+ smp_mb__after_srcu_read_lock();
/*
* Profile KVM exit RIPs:
*/
- if (unlikely(prof_on == KVM_PROFILING)) {
+ if (unlikely(prof_on == KVM_PROFILING &&
+ !vcpu->arch.guest_state_protected)) {
unsigned long rip = kvm_rip_read(vcpu);
profile_hit(KVM_PROFILING, (void *)rip);
}
+ if (unlikely(vcpu->arch.tsc_always_catchup))
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+
+ if (vcpu->arch.apic_attention)
+ kvm_lapic_sync_from_vapic(vcpu);
+
+ if (unlikely(exit_fastpath == EXIT_FASTPATH_EXIT_USERSPACE))
+ return 0;
- kvm_lapic_sync_from_vapic(vcpu);
+ r = kvm_x86_call(handle_exit)(vcpu, exit_fastpath);
+ return r;
- r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
+cancel_injection:
+ if (req_immediate_exit)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_x86_call(cancel_injection)(vcpu);
+ if (unlikely(vcpu->arch.apic_attention))
+ kvm_lapic_sync_from_vapic(vcpu);
out:
return r;
}
+static bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
+{
+ return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
+ !vcpu->arch.apf.halted);
+}
-static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
{
- int r;
+ if (!list_empty_careful(&vcpu->async_pf.done))
+ return true;
- if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
- pr_debug("vcpu %d received sipi with vector # %x\n",
- vcpu->vcpu_id, vcpu->arch.sipi_vector);
- kvm_lapic_reset(vcpu);
- r = kvm_arch_vcpu_reset(vcpu);
- if (r)
- return r;
- vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
- }
+ if (kvm_apic_has_pending_init_or_sipi(vcpu) &&
+ kvm_apic_init_sipi_allowed(vcpu))
+ return true;
+
+ if (kvm_is_exception_pending(vcpu))
+ return true;
+
+ if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
+ (vcpu->arch.nmi_pending &&
+ kvm_x86_call(nmi_allowed)(vcpu, false)))
+ return true;
+
+#ifdef CONFIG_KVM_SMM
+ if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
+ (vcpu->arch.smi_pending &&
+ kvm_x86_call(smi_allowed)(vcpu, false)))
+ return true;
+#endif
+
+ if (kvm_test_request(KVM_REQ_PMI, vcpu))
+ return true;
+
+ if (kvm_test_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu))
+ return true;
+
+ if (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu))
+ return true;
+
+ if (kvm_hv_has_stimer_pending(vcpu))
+ return true;
+
+ if (is_guest_mode(vcpu) &&
+ kvm_x86_ops.nested_ops->has_events &&
+ kvm_x86_ops.nested_ops->has_events(vcpu, false))
+ return true;
- down_read(&vcpu->kvm->slots_lock);
- vapic_enter(vcpu);
+ if (kvm_xen_has_pending_events(vcpu))
+ return true;
+
+ return false;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_has_events);
- r = 1;
- while (r > 0) {
- if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
- r = vcpu_enter_guest(vcpu, kvm_run);
- else {
- up_read(&vcpu->kvm->slots_lock);
+int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
+{
+ return kvm_vcpu_running(vcpu) || vcpu->arch.pv.pv_unhalted ||
+ kvm_vcpu_has_events(vcpu);
+}
+
+/* Called within kvm->srcu read side. */
+static inline int vcpu_block(struct kvm_vcpu *vcpu)
+{
+ bool hv_timer;
+
+ if (!kvm_arch_vcpu_runnable(vcpu)) {
+ /*
+ * Switch to the software timer before halt-polling/blocking as
+ * the guest's timer may be a break event for the vCPU, and the
+ * hypervisor timer runs only when the CPU is in guest mode.
+ * Switch before halt-polling so that KVM recognizes an expired
+ * timer before blocking.
+ */
+ hv_timer = kvm_lapic_hv_timer_in_use(vcpu);
+ if (hv_timer)
+ kvm_lapic_switch_to_sw_timer(vcpu);
+
+ kvm_vcpu_srcu_read_unlock(vcpu);
+ if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
+ kvm_vcpu_halt(vcpu);
+ else
kvm_vcpu_block(vcpu);
- down_read(&vcpu->kvm->slots_lock);
- if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
- {
- switch(vcpu->arch.mp_state) {
- case KVM_MP_STATE_HALTED:
- vcpu->arch.mp_state =
- KVM_MP_STATE_RUNNABLE;
- case KVM_MP_STATE_RUNNABLE:
- break;
- case KVM_MP_STATE_SIPI_RECEIVED:
- default:
- r = -EINTR;
- break;
- }
- }
+ kvm_vcpu_srcu_read_lock(vcpu);
+
+ if (hv_timer)
+ kvm_lapic_switch_to_hv_timer(vcpu);
+
+ /*
+ * If the vCPU is not runnable, a signal or another host event
+ * of some kind is pending; service it without changing the
+ * vCPU's activity state.
+ */
+ if (!kvm_arch_vcpu_runnable(vcpu))
+ return 1;
+ }
+
+ /*
+ * Evaluate nested events before exiting the halted state. This allows
+ * the halt state to be recorded properly in the VMCS12's activity
+ * state field (AMD does not have a similar field and a VM-Exit always
+ * causes a spurious wakeup from HLT).
+ */
+ if (is_guest_mode(vcpu)) {
+ int r = kvm_check_nested_events(vcpu);
+
+ WARN_ON_ONCE(r == -EBUSY);
+ if (r < 0)
+ return 0;
+ }
+
+ if (kvm_apic_accept_events(vcpu) < 0)
+ return 0;
+ switch(vcpu->arch.mp_state) {
+ case KVM_MP_STATE_HALTED:
+ case KVM_MP_STATE_AP_RESET_HOLD:
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
+ fallthrough;
+ case KVM_MP_STATE_RUNNABLE:
+ vcpu->arch.apf.halted = false;
+ break;
+ case KVM_MP_STATE_INIT_RECEIVED:
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+ return 1;
+}
+
+/* Called within kvm->srcu read side. */
+static int vcpu_run(struct kvm_vcpu *vcpu)
+{
+ int r;
+
+ vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
+
+ for (;;) {
+ /*
+ * If another guest vCPU requests a PV TLB flush in the middle
+ * of instruction emulation, the rest of the emulation could
+ * use a stale page translation. Assume that any code after
+ * this point can start executing an instruction.
+ */
+ vcpu->arch.at_instruction_boundary = false;
+ if (kvm_vcpu_running(vcpu)) {
+ r = vcpu_enter_guest(vcpu);
+ } else {
+ r = vcpu_block(vcpu);
}
if (r <= 0)
break;
- clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
+ kvm_clear_request(KVM_REQ_UNBLOCK, vcpu);
+ if (kvm_xen_has_pending_events(vcpu))
+ kvm_xen_inject_pending_events(vcpu);
+
if (kvm_cpu_has_pending_timer(vcpu))
kvm_inject_pending_timer_irqs(vcpu);
- if (dm_request_for_irq_injection(vcpu, kvm_run)) {
- r = -EINTR;
- kvm_run->exit_reason = KVM_EXIT_INTR;
+ if (dm_request_for_irq_injection(vcpu) &&
+ kvm_vcpu_ready_for_interrupt_injection(vcpu)) {
+ r = 0;
+ vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
++vcpu->stat.request_irq_exits;
+ break;
}
- if (signal_pending(current)) {
- r = -EINTR;
- kvm_run->exit_reason = KVM_EXIT_INTR;
- ++vcpu->stat.signal_exits;
- }
- if (need_resched()) {
- up_read(&vcpu->kvm->slots_lock);
- kvm_resched(vcpu);
- down_read(&vcpu->kvm->slots_lock);
+
+ if (__xfer_to_guest_mode_work_pending()) {
+ kvm_vcpu_srcu_read_unlock(vcpu);
+ r = kvm_xfer_to_guest_mode_handle_work(vcpu);
+ kvm_vcpu_srcu_read_lock(vcpu);
+ if (r)
+ return r;
}
}
- up_read(&vcpu->kvm->slots_lock);
- post_kvm_run_save(vcpu, kvm_run);
+ return r;
+}
- vapic_exit(vcpu);
+static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason)
+{
+ /*
+	 * The vCPU has halted, e.g. executed HLT. Update the run state if the
+	 * local APIC is in-kernel; the run loop will detect the non-runnable
+	 * state and halt the vCPU. Exit to userspace if the local APIC is
+ * managed by userspace, in which case userspace is responsible for
+ * handling wake events.
+ */
+ ++vcpu->stat.halt_exits;
+ if (lapic_in_kernel(vcpu)) {
+ if (kvm_vcpu_has_events(vcpu) || vcpu->arch.pv.pv_unhalted)
+ state = KVM_MP_STATE_RUNNABLE;
+ kvm_set_mp_state(vcpu, state);
+ return 1;
+ } else {
+ vcpu->run->exit_reason = reason;
+ return 0;
+ }
+}
- return r;
+int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu)
+{
+ return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_halt_noskip);
+
+int kvm_emulate_halt(struct kvm_vcpu *vcpu)
+{
+ int ret = kvm_skip_emulated_instruction(vcpu);
+ /*
+ * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered
+ * KVM_EXIT_DEBUG here.
+ */
+ return kvm_emulate_halt_noskip(vcpu) && ret;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_halt);
+
+fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu)
+{
+ if (!kvm_emulate_halt(vcpu))
+ return EXIT_FASTPATH_EXIT_USERSPACE;
+
+ if (kvm_vcpu_running(vcpu))
+ return EXIT_FASTPATH_REENTER_GUEST;
+
+ return EXIT_FASTPATH_EXIT_HANDLED;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(handle_fastpath_hlt);
+
+int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu)
+{
+ int ret = kvm_skip_emulated_instruction(vcpu);
+
+ return __kvm_emulate_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD,
+ KVM_EXIT_AP_RESET_HOLD) && ret;
}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_ap_reset_hold);
-int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
{
+ return kvm_vcpu_apicv_active(vcpu) &&
+ kvm_x86_call(dy_apicv_has_pending_interrupt)(vcpu);
+}
+
+bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.preempted_in_kernel;
+}
+
+bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
+{
+ if (READ_ONCE(vcpu->arch.pv.pv_unhalted))
+ return true;
+
+ if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
+#ifdef CONFIG_KVM_SMM
+ kvm_test_request(KVM_REQ_SMI, vcpu) ||
+#endif
+ kvm_test_request(KVM_REQ_EVENT, vcpu))
+ return true;
+
+ return kvm_arch_dy_has_pending_interrupt(vcpu);
+}
+
+static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
+{
+ return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
+}
+
+static int complete_emulated_pio(struct kvm_vcpu *vcpu)
+{
+ BUG_ON(!vcpu->arch.pio.count);
+
+ return complete_emulated_io(vcpu);
+}
+
+/*
+ * Implements the following, as a state machine:
+ *
+ * read:
+ * for each fragment
+ * for each mmio piece in the fragment
+ * write gpa, len
+ * exit
+ * copy data
+ * execute insn
+ *
+ * write:
+ * for each fragment
+ * for each mmio piece in the fragment
+ * write gpa, len
+ * copy data
+ * exit
+ */
+static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *run = vcpu->run;
+ struct kvm_mmio_fragment *frag;
+ unsigned len;
+
+ BUG_ON(!vcpu->mmio_needed);
+
+ /* Complete previous fragment */
+ frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
+ len = min(8u, frag->len);
+ if (!vcpu->mmio_is_write)
+ memcpy(frag->data, run->mmio.data, len);
+
+ if (frag->len <= 8) {
+ /* Switch to the next fragment. */
+ frag++;
+ vcpu->mmio_cur_fragment++;
+ } else {
+ /* Go forward to the next mmio piece. */
+ frag->data += len;
+ frag->gpa += len;
+ frag->len -= len;
+ }
+
+ if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) {
+ vcpu->mmio_needed = 0;
+
+ /* FIXME: return into emulator if single-stepping. */
+ if (vcpu->mmio_is_write)
+ return 1;
+ vcpu->mmio_read_completed = 1;
+ return complete_emulated_io(vcpu);
+ }
+
+ run->exit_reason = KVM_EXIT_MMIO;
+ run->mmio.phys_addr = frag->gpa;
+ if (vcpu->mmio_is_write)
+ memcpy(run->mmio.data, frag->data, min(8u, frag->len));
+ run->mmio.len = min(8u, frag->len);
+ run->mmio.is_write = vcpu->mmio_is_write;
+ vcpu->arch.complete_userspace_io = complete_emulated_mmio;
+ return 0;
+}
+
+/* Swap (qemu) user FPU context for the guest FPU context. */
+static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
+{
+ if (KVM_BUG_ON(vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm))
+ return;
+
+ /* Exclude PKRU, it's restored separately immediately after VM-Exit. */
+ fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true);
+ trace_kvm_fpu(1);
+}
+
+/* When vcpu_run ends, restore user space FPU context. */
+static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
+{
+ if (KVM_BUG_ON(!vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm))
+ return;
+
+ fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false);
+ ++vcpu->stat.fpu_reload;
+ trace_kvm_fpu(0);
+}
+
+static int kvm_x86_vcpu_pre_run(struct kvm_vcpu *vcpu)
+{
+ /*
+ * SIPI_RECEIVED is obsolete; KVM leaves the vCPU in Wait-For-SIPI and
+ * tracks the pending SIPI separately. SIPI_RECEIVED is still accepted
+	 * by KVM_SET_MP_STATE for backwards compatibility, but should be
+ * converted to INIT_RECEIVED.
+ */
+ if (WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED))
+ return -EINVAL;
+
+ /*
+ * Disallow running the vCPU if userspace forced it into an impossible
+ * MP_STATE, e.g. if the vCPU is in WFS but SIPI is blocked.
+ */
+ if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED &&
+ !kvm_apic_init_sipi_allowed(vcpu))
+ return -EINVAL;
+
+ return kvm_x86_call(vcpu_pre_run)(vcpu);
+}
+
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
+{
+ struct kvm_queued_exception *ex = &vcpu->arch.exception;
+ struct kvm_run *kvm_run = vcpu->run;
+ u64 sync_valid_fields;
int r;
- sigset_t sigsaved;
- vcpu_load(vcpu);
+ r = kvm_mmu_post_init_vm(vcpu->kvm);
+ if (r)
+ return r;
- if (vcpu->sigset_active)
- sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
+ vcpu_load(vcpu);
+ kvm_sigset_activate(vcpu);
+ kvm_run->flags = 0;
+ kvm_load_guest_fpu(vcpu);
+ kvm_vcpu_srcu_read_lock(vcpu);
if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
+ if (!vcpu->wants_to_run) {
+ r = -EINTR;
+ goto out;
+ }
+
+ /*
+ * Don't bother switching APIC timer emulation from the
+		 * hypervisor timer to the software timer, as the only way for the
+ * APIC timer to be active is if userspace stuffed vCPU state,
+ * i.e. put the vCPU into a nonsensical state. Only an INIT
+ * will transition the vCPU out of UNINITIALIZED (without more
+ * state stuffing from userspace), which will reset the local
+ * APIC and thus cancel the timer or drop the IRQ (if the timer
+ * already expired).
+ */
+ kvm_vcpu_srcu_read_unlock(vcpu);
kvm_vcpu_block(vcpu);
- clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
+ kvm_vcpu_srcu_read_lock(vcpu);
+
+ if (kvm_apic_accept_events(vcpu) < 0) {
+ r = 0;
+ goto out;
+ }
r = -EAGAIN;
+ if (signal_pending(current)) {
+ r = -EINTR;
+ kvm_run->exit_reason = KVM_EXIT_INTR;
+ ++vcpu->stat.signal_exits;
+ }
goto out;
}
- /* re-sync apic's tpr */
- if (!irqchip_in_kernel(vcpu->kvm))
- kvm_set_cr8(vcpu, kvm_run->cr8);
+ sync_valid_fields = kvm_sync_valid_fields(vcpu->kvm);
+ if ((kvm_run->kvm_valid_regs & ~sync_valid_fields) ||
+ (kvm_run->kvm_dirty_regs & ~sync_valid_fields)) {
+ r = -EINVAL;
+ goto out;
+ }
- if (vcpu->arch.pio.cur_count) {
- r = complete_pio(vcpu);
- if (r)
+ if (kvm_run->kvm_dirty_regs) {
+ r = sync_regs(vcpu);
+ if (r != 0)
goto out;
}
-#if CONFIG_HAS_IOMEM
- if (vcpu->mmio_needed) {
- memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
- vcpu->mmio_read_completed = 1;
- vcpu->mmio_needed = 0;
- down_read(&vcpu->kvm->slots_lock);
- r = emulate_instruction(vcpu, kvm_run,
- vcpu->arch.mmio_fault_cr2, 0,
- EMULTYPE_NO_DECODE);
- up_read(&vcpu->kvm->slots_lock);
- if (r == EMULATE_DO_MMIO) {
- /*
- * Read-modify-write. Back to userspace.
- */
- r = 0;
+ /* re-sync apic's tpr */
+ if (!lapic_in_kernel(vcpu)) {
+ if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
+ r = -EINVAL;
goto out;
}
}
-#endif
- if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
- kvm_register_write(vcpu, VCPU_REGS_RAX,
- kvm_run->hypercall.ret);
- r = __vcpu_run(vcpu, kvm_run);
+ /*
+ * If userspace set a pending exception and L2 is active, convert it to
+ * a pending VM-Exit if L1 wants to intercept the exception.
+ */
+ if (vcpu->arch.exception_from_userspace && is_guest_mode(vcpu) &&
+ kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, ex->vector,
+ ex->error_code)) {
+ kvm_queue_exception_vmexit(vcpu, ex->vector,
+ ex->has_error_code, ex->error_code,
+ ex->has_payload, ex->payload);
+ ex->injected = false;
+ ex->pending = false;
+ }
+ vcpu->arch.exception_from_userspace = false;
+
+ if (unlikely(vcpu->arch.complete_userspace_io)) {
+ int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
+ vcpu->arch.complete_userspace_io = NULL;
+ r = cui(vcpu);
+ if (r <= 0)
+ goto out;
+ } else {
+ WARN_ON_ONCE(vcpu->arch.pio.count);
+ WARN_ON_ONCE(vcpu->mmio_needed);
+ }
+
+ if (!vcpu->wants_to_run) {
+ r = -EINTR;
+ goto out;
+ }
+
+ r = kvm_x86_vcpu_pre_run(vcpu);
+ if (r <= 0)
+ goto out;
+
+ r = vcpu_run(vcpu);
out:
- if (vcpu->sigset_active)
- sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+ kvm_put_guest_fpu(vcpu);
+ if (kvm_run->kvm_valid_regs && likely(!vcpu->arch.guest_state_protected))
+ store_regs(vcpu);
+ post_kvm_run_save(vcpu);
+ kvm_vcpu_srcu_read_unlock(vcpu);
+ kvm_sigset_deactivate(vcpu);
vcpu_put(vcpu);
return r;
}
-int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
- vcpu_load(vcpu);
-
- regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
- regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
- regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
- regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
- regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
- regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
- regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
- regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
+ if (vcpu->arch.emulate_regs_need_sync_to_vcpu) {
+ /*
+ * We are here if userspace calls get_regs() in the middle of
+ * instruction emulation. Registers state needs to be copied
+ * back from emulation context to vcpu. Userspace shouldn't do
+ * that usually, but some bad designed PV devices (vmware
+ * backdoor interface) need this to work
+ */
+ emulator_writeback_register_cache(vcpu->arch.emulate_ctxt);
+ vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
+ }
+ regs->rax = kvm_rax_read(vcpu);
+ regs->rbx = kvm_rbx_read(vcpu);
+ regs->rcx = kvm_rcx_read(vcpu);
+ regs->rdx = kvm_rdx_read(vcpu);
+ regs->rsi = kvm_rsi_read(vcpu);
+ regs->rdi = kvm_rdi_read(vcpu);
+ regs->rsp = kvm_rsp_read(vcpu);
+ regs->rbp = kvm_rbp_read(vcpu);
#ifdef CONFIG_X86_64
- regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
- regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
- regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
- regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
- regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
- regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
- regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
- regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
+ regs->r8 = kvm_r8_read(vcpu);
+ regs->r9 = kvm_r9_read(vcpu);
+ regs->r10 = kvm_r10_read(vcpu);
+ regs->r11 = kvm_r11_read(vcpu);
+ regs->r12 = kvm_r12_read(vcpu);
+ regs->r13 = kvm_r13_read(vcpu);
+ regs->r14 = kvm_r14_read(vcpu);
+ regs->r15 = kvm_r15_read(vcpu);
#endif
regs->rip = kvm_rip_read(vcpu);
- regs->rflags = kvm_x86_ops->get_rflags(vcpu);
+ regs->rflags = kvm_get_rflags(vcpu);
+}
- /*
- * Don't leak debug flags in case they were set for guest debugging
- */
- if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
- regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
+int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ return -EINVAL;
+ vcpu_load(vcpu);
+ __get_regs(vcpu, regs);
vcpu_put(vcpu);
-
return 0;
}
-int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
- vcpu_load(vcpu);
+ vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
+ vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
- kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
- kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
- kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
- kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
- kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
- kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
- kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
- kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
+ kvm_rax_write(vcpu, regs->rax);
+ kvm_rbx_write(vcpu, regs->rbx);
+ kvm_rcx_write(vcpu, regs->rcx);
+ kvm_rdx_write(vcpu, regs->rdx);
+ kvm_rsi_write(vcpu, regs->rsi);
+ kvm_rdi_write(vcpu, regs->rdi);
+ kvm_rsp_write(vcpu, regs->rsp);
+ kvm_rbp_write(vcpu, regs->rbp);
#ifdef CONFIG_X86_64
- kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
- kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
- kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
- kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
- kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
- kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
- kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
- kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
-
+ kvm_r8_write(vcpu, regs->r8);
+ kvm_r9_write(vcpu, regs->r9);
+ kvm_r10_write(vcpu, regs->r10);
+ kvm_r11_write(vcpu, regs->r11);
+ kvm_r12_write(vcpu, regs->r12);
+ kvm_r13_write(vcpu, regs->r13);
+ kvm_r14_write(vcpu, regs->r14);
+ kvm_r15_write(vcpu, regs->r15);
#endif
kvm_rip_write(vcpu, regs->rip);
- kvm_x86_ops->set_rflags(vcpu, regs->rflags);
-
+ kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED);
vcpu->arch.exception.pending = false;
+ vcpu->arch.exception_vmexit.pending = false;
- vcpu_put(vcpu);
-
- return 0;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
}
-void kvm_get_segment(struct kvm_vcpu *vcpu,
- struct kvm_segment *var, int seg)
-{
- kvm_x86_ops->get_segment(vcpu, var, seg);
-}
-
-void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
+int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
- struct kvm_segment cs;
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ return -EINVAL;
- kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
- *db = cs.db;
- *l = cs.l;
+ vcpu_load(vcpu);
+ __set_regs(vcpu, regs);
+ vcpu_put(vcpu);
+ return 0;
}
-EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
-int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
- struct kvm_sregs *sregs)
+static void __get_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
{
- struct descriptor_table dt;
+ struct desc_ptr dt;
- vcpu_load(vcpu);
+ if (vcpu->arch.guest_state_protected)
+ goto skip_protected_regs;
kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
@@ -3599,1021 +12098,2308 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
- kvm_x86_ops->get_idt(vcpu, &dt);
- sregs->idt.limit = dt.limit;
- sregs->idt.base = dt.base;
- kvm_x86_ops->get_gdt(vcpu, &dt);
- sregs->gdt.limit = dt.limit;
- sregs->gdt.base = dt.base;
+ kvm_x86_call(get_idt)(vcpu, &dt);
+ sregs->idt.limit = dt.size;
+ sregs->idt.base = dt.address;
+ kvm_x86_call(get_gdt)(vcpu, &dt);
+ sregs->gdt.limit = dt.size;
+ sregs->gdt.base = dt.address;
- kvm_x86_ops->decache_cr4_guest_bits(vcpu);
- sregs->cr0 = vcpu->arch.cr0;
sregs->cr2 = vcpu->arch.cr2;
- sregs->cr3 = vcpu->arch.cr3;
- sregs->cr4 = vcpu->arch.cr4;
+ sregs->cr3 = kvm_read_cr3(vcpu);
+
+skip_protected_regs:
+ sregs->cr0 = kvm_read_cr0(vcpu);
+ sregs->cr4 = kvm_read_cr4(vcpu);
sregs->cr8 = kvm_get_cr8(vcpu);
- sregs->efer = vcpu->arch.shadow_efer;
- sregs->apic_base = kvm_get_apic_base(vcpu);
+ sregs->efer = vcpu->arch.efer;
+ sregs->apic_base = vcpu->arch.apic_base;
+}
- memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
+static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+ __get_sregs_common(vcpu, sregs);
- if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft)
+ if (vcpu->arch.guest_state_protected)
+ return;
+
+ if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft)
set_bit(vcpu->arch.interrupt.nr,
(unsigned long *)sregs->interrupt_bitmap);
+}
- vcpu_put(vcpu);
+static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
+{
+ int i;
+
+ __get_sregs_common(vcpu, (struct kvm_sregs *)sregs2);
+
+ if (vcpu->arch.guest_state_protected)
+ return;
+ if (is_pae_paging(vcpu)) {
+ for (i = 0 ; i < 4 ; i++)
+ sregs2->pdptrs[i] = kvm_pdptr_read(vcpu, i);
+ sregs2->flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID;
+ }
+}
+
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+ struct kvm_sregs *sregs)
+{
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ return -EINVAL;
+
+ vcpu_load(vcpu);
+ __get_sregs(vcpu, sregs);
+ vcpu_put(vcpu);
return 0;
}
int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state)
{
+ int r;
+
vcpu_load(vcpu);
- mp_state->mp_state = vcpu->arch.mp_state;
+ kvm_vcpu_srcu_read_lock(vcpu);
+
+ r = kvm_apic_accept_events(vcpu);
+ if (r < 0)
+ goto out;
+ r = 0;
+
+ if ((vcpu->arch.mp_state == KVM_MP_STATE_HALTED ||
+ vcpu->arch.mp_state == KVM_MP_STATE_AP_RESET_HOLD) &&
+ vcpu->arch.pv.pv_unhalted)
+ mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
+ else
+ mp_state->mp_state = vcpu->arch.mp_state;
+
+out:
+ kvm_vcpu_srcu_read_unlock(vcpu);
vcpu_put(vcpu);
- return 0;
+ return r;
}
int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state)
{
+ int ret = -EINVAL;
+
vcpu_load(vcpu);
- vcpu->arch.mp_state = mp_state->mp_state;
+
+ switch (mp_state->mp_state) {
+ case KVM_MP_STATE_UNINITIALIZED:
+ case KVM_MP_STATE_HALTED:
+ case KVM_MP_STATE_AP_RESET_HOLD:
+ case KVM_MP_STATE_INIT_RECEIVED:
+ case KVM_MP_STATE_SIPI_RECEIVED:
+ if (!lapic_in_kernel(vcpu))
+ goto out;
+ break;
+
+ case KVM_MP_STATE_RUNNABLE:
+ break;
+
+ default:
+ goto out;
+ }
+
+ /*
+ * SIPI_RECEIVED is obsolete and no longer used internally; KVM instead
+	 * leaves the vCPU in INIT_RECEIVED (Wait-For-SIPI) and pends the SIPI.
+ * Translate SIPI_RECEIVED as appropriate for backwards compatibility.
+ */
+ if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
+ mp_state->mp_state = KVM_MP_STATE_INIT_RECEIVED;
+ set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
+ }
+
+ kvm_set_mp_state(vcpu, mp_state->mp_state);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ ret = 0;
+out:
vcpu_put(vcpu);
+ return ret;
+}
+
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
+ int reason, bool has_error_code, u32 error_code)
+{
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+ int ret;
+
+ if (kvm_is_cr4_bit_set(vcpu, X86_CR4_CET)) {
+ u64 u_cet, s_cet;
+
+ /*
+ * Check both User and Supervisor on task switches as inter-
+ * privilege level task switches are impacted by CET at both
+ * the current privilege level and the new privilege level, and
+ * that information is not known at this time. The expectation
+ * is that the guest won't require emulation of task switches
+ * while using IBT or Shadow Stacks.
+ */
+ if (__kvm_emulate_msr_read(vcpu, MSR_IA32_U_CET, &u_cet) ||
+ __kvm_emulate_msr_read(vcpu, MSR_IA32_S_CET, &s_cet))
+ goto unhandled_task_switch;
+
+ if ((u_cet | s_cet) & (CET_ENDBR_EN | CET_SHSTK_EN))
+ goto unhandled_task_switch;
+ }
+
+ init_emulate_ctxt(vcpu);
+
+ ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
+ has_error_code, error_code);
+
+ /*
+	 * Report an error to userspace if MMIO is needed, as KVM doesn't support
+ * MMIO during a task switch (or any other complex operation).
+ */
+ if (ret || vcpu->mmio_needed)
+ goto unhandled_task_switch;
+
+ kvm_rip_write(vcpu, ctxt->eip);
+ kvm_set_rflags(vcpu, ctxt->eflags);
+ return 1;
+
+unhandled_task_switch:
+ vcpu->mmio_needed = false;
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
return 0;
}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_task_switch);
-static void kvm_set_segment(struct kvm_vcpu *vcpu,
- struct kvm_segment *var, int seg)
+static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
{
- kvm_x86_ops->set_segment(vcpu, var, seg);
+ if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) {
+ /*
+ * When EFER.LME and CR0.PG are set, the processor is in
+ * 64-bit mode (though maybe in a 32-bit code segment).
+ * CR4.PAE and EFER.LMA must be set.
+ */
+ if (!(sregs->cr4 & X86_CR4_PAE) || !(sregs->efer & EFER_LMA))
+ return false;
+ if (!kvm_vcpu_is_legal_cr3(vcpu, sregs->cr3))
+ return false;
+ } else {
+ /*
+ * Not in 64-bit mode: EFER.LMA is clear and the code
+ * segment cannot be 64-bit.
+ */
+ if (sregs->efer & EFER_LMA || sregs->cs.l)
+ return false;
+ }
+
+ return kvm_is_valid_cr4(vcpu, sregs->cr4) &&
+ kvm_is_valid_cr0(vcpu, sregs->cr0);
}
-static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
- struct kvm_segment *kvm_desct)
+static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
+ int *mmu_reset_needed, bool update_pdptrs)
{
- kvm_desct->base = seg_desc->base0;
- kvm_desct->base |= seg_desc->base1 << 16;
- kvm_desct->base |= seg_desc->base2 << 24;
- kvm_desct->limit = seg_desc->limit0;
- kvm_desct->limit |= seg_desc->limit << 16;
- if (seg_desc->g) {
- kvm_desct->limit <<= 12;
- kvm_desct->limit |= 0xfff;
+ int idx;
+ struct desc_ptr dt;
+
+ if (!kvm_is_valid_sregs(vcpu, sregs))
+ return -EINVAL;
+
+ if (kvm_apic_set_base(vcpu, sregs->apic_base, true))
+ return -EINVAL;
+
+ if (vcpu->arch.guest_state_protected)
+ return 0;
+
+ dt.size = sregs->idt.limit;
+ dt.address = sregs->idt.base;
+ kvm_x86_call(set_idt)(vcpu, &dt);
+ dt.size = sregs->gdt.limit;
+ dt.address = sregs->gdt.base;
+ kvm_x86_call(set_gdt)(vcpu, &dt);
+
+ vcpu->arch.cr2 = sregs->cr2;
+ *mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
+ vcpu->arch.cr3 = sregs->cr3;
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
+ kvm_x86_call(post_set_cr3)(vcpu, sregs->cr3);
+
+ kvm_set_cr8(vcpu, sregs->cr8);
+
+ *mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
+ kvm_x86_call(set_efer)(vcpu, sregs->efer);
+
+ *mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
+ kvm_x86_call(set_cr0)(vcpu, sregs->cr0);
+
+ *mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
+ kvm_x86_call(set_cr4)(vcpu, sregs->cr4);
+
+ if (update_pdptrs) {
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ if (is_pae_paging(vcpu)) {
+ load_pdptrs(vcpu, kvm_read_cr3(vcpu));
+ *mmu_reset_needed = 1;
+ }
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
}
- kvm_desct->selector = selector;
- kvm_desct->type = seg_desc->type;
- kvm_desct->present = seg_desc->p;
- kvm_desct->dpl = seg_desc->dpl;
- kvm_desct->db = seg_desc->d;
- kvm_desct->s = seg_desc->s;
- kvm_desct->l = seg_desc->l;
- kvm_desct->g = seg_desc->g;
- kvm_desct->avl = seg_desc->avl;
- if (!selector)
- kvm_desct->unusable = 1;
- else
- kvm_desct->unusable = 0;
- kvm_desct->padding = 0;
+
+ kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
+ kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
+ kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
+ kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
+ kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
+ kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
+
+ kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
+ kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
+
+ update_cr8_intercept(vcpu);
+
+ /* Older userspace won't unhalt the vcpu on reset. */
+ if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
+ sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
+ !is_protmode(vcpu))
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
+
+ return 0;
}
-static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
- u16 selector,
- struct descriptor_table *dtable)
+static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
{
- if (selector & 1 << 2) {
- struct kvm_segment kvm_seg;
+ int pending_vec, max_bits;
+ int mmu_reset_needed = 0;
+ int ret = __set_sregs_common(vcpu, sregs, &mmu_reset_needed, true);
- kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
+ if (ret)
+ return ret;
- if (kvm_seg.unusable)
- dtable->limit = 0;
- else
- dtable->limit = kvm_seg.limit;
- dtable->base = kvm_seg.base;
+ if (mmu_reset_needed) {
+ kvm_mmu_reset_context(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
}
- else
- kvm_x86_ops->get_gdt(vcpu, dtable);
+
+ max_bits = KVM_NR_INTERRUPTS;
+ pending_vec = find_first_bit(
+ (const unsigned long *)sregs->interrupt_bitmap, max_bits);
+
+ if (pending_vec < max_bits) {
+ kvm_queue_interrupt(vcpu, pending_vec, false);
+ pr_debug("Set back pending irq %d\n", pending_vec);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ }
+ return 0;
}
-/* allowed just for 8 bytes segments */
-static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
- struct desc_struct *seg_desc)
+static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
{
- gpa_t gpa;
- struct descriptor_table dtable;
- u16 index = selector >> 3;
+ int mmu_reset_needed = 0;
+ bool valid_pdptrs = sregs2->flags & KVM_SREGS2_FLAGS_PDPTRS_VALID;
+ bool pae = (sregs2->cr0 & X86_CR0_PG) && (sregs2->cr4 & X86_CR4_PAE) &&
+ !(sregs2->efer & EFER_LMA);
+ int i, ret;
- get_segment_descriptor_dtable(vcpu, selector, &dtable);
+ if (sregs2->flags & ~KVM_SREGS2_FLAGS_PDPTRS_VALID)
+ return -EINVAL;
- if (dtable.limit < index * 8 + 7) {
- kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
- return 1;
+ if (valid_pdptrs && (!pae || vcpu->arch.guest_state_protected))
+ return -EINVAL;
+
+ ret = __set_sregs_common(vcpu, (struct kvm_sregs *)sregs2,
+ &mmu_reset_needed, !valid_pdptrs);
+ if (ret)
+ return ret;
+
+ if (valid_pdptrs) {
+ for (i = 0; i < 4 ; i++)
+ kvm_pdptr_write(vcpu, i, sregs2->pdptrs[i]);
+
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
+ mmu_reset_needed = 1;
+ vcpu->arch.pdptrs_from_userspace = true;
+ }
+ if (mmu_reset_needed) {
+ kvm_mmu_reset_context(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
}
- gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
- gpa += index * 8;
- return kvm_read_guest(vcpu->kvm, gpa, seg_desc, 8);
+ return 0;
}
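
A minimal sketch (assuming KVM_CAP_SREGS2 is available and vcpu_fd is a valid vCPU fd) of the userspace side of __set_sregs2() above: restoring PAE PDPTEs explicitly via KVM_SET_SREGS2 so that KVM does not re-read them from guest memory. This is illustrative, not part of the patch.

/*
 * Sketch: push saved PDPTEs back into the vCPU. KVM rejects the flag with
 * -EINVAL unless the vCPU is in PAE paging without EFER.LMA (see above).
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int restore_pae_pdptrs(int vcpu_fd, const __u64 pdptrs[4])
{
	struct kvm_sregs2 sregs2;
	int i;

	if (ioctl(vcpu_fd, KVM_GET_SREGS2, &sregs2) < 0)
		return -1;

	for (i = 0; i < 4; i++)
		sregs2.pdptrs[i] = pdptrs[i];
	sregs2.flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID;

	return ioctl(vcpu_fd, KVM_SET_SREGS2, &sregs2);
}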
-/* allowed just for 8 bytes segments */
-static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
- struct desc_struct *seg_desc)
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+ struct kvm_sregs *sregs)
{
- gpa_t gpa;
- struct descriptor_table dtable;
- u16 index = selector >> 3;
+ int ret;
- get_segment_descriptor_dtable(vcpu, selector, &dtable);
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ return -EINVAL;
- if (dtable.limit < index * 8 + 7)
- return 1;
- gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
- gpa += index * 8;
- return kvm_write_guest(vcpu->kvm, gpa, seg_desc, 8);
+ vcpu_load(vcpu);
+ ret = __set_sregs(vcpu, sregs);
+ vcpu_put(vcpu);
+ return ret;
}
-static u32 get_tss_base_addr(struct kvm_vcpu *vcpu,
- struct desc_struct *seg_desc)
+static void kvm_arch_vcpu_guestdbg_update_apicv_inhibit(struct kvm *kvm)
{
- u32 base_addr;
+ bool set = false;
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+
+ if (!enable_apicv)
+ return;
- base_addr = seg_desc->base0;
- base_addr |= (seg_desc->base1 << 16);
- base_addr |= (seg_desc->base2 << 24);
+ down_write(&kvm->arch.apicv_update_lock);
- return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr);
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ) {
+ set = true;
+ break;
+ }
+ }
+ __kvm_set_or_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_BLOCKIRQ, set);
+ up_write(&kvm->arch.apicv_update_lock);
}
-static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
+int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
+ struct kvm_guest_debug *dbg)
{
- struct kvm_segment kvm_seg;
+ unsigned long rflags;
+ int i, r;
+
+ if (vcpu->arch.guest_state_protected)
+ return -EINVAL;
+
+ vcpu_load(vcpu);
+
+ if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
+ r = -EBUSY;
+ if (kvm_is_exception_pending(vcpu))
+ goto out;
+ if (dbg->control & KVM_GUESTDBG_INJECT_DB)
+ kvm_queue_exception(vcpu, DB_VECTOR);
+ else
+ kvm_queue_exception(vcpu, BP_VECTOR);
+ }
+
+ /*
+	 * Read rflags while any potentially injected trace flags are still
+	 * being filtered out.
+ */
+ rflags = kvm_get_rflags(vcpu);
+
+ vcpu->guest_debug = dbg->control;
+ if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
+ vcpu->guest_debug = 0;
+
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
+ for (i = 0; i < KVM_NR_DB_REGS; ++i)
+ vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
+ vcpu->arch.guest_debug_dr7 = dbg->arch.debugreg[7];
+ } else {
+ for (i = 0; i < KVM_NR_DB_REGS; i++)
+ vcpu->arch.eff_db[i] = vcpu->arch.db[i];
+ }
+ kvm_update_dr7(vcpu);
- kvm_get_segment(vcpu, &kvm_seg, seg);
- return kvm_seg.selector;
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+ vcpu->arch.singlestep_rip = kvm_get_linear_rip(vcpu);
+
+ /*
+ * Trigger an rflags update that will inject or remove the trace
+ * flags.
+ */
+ kvm_set_rflags(vcpu, rflags);
+
+ kvm_x86_call(update_exception_bitmap)(vcpu);
+
+ kvm_arch_vcpu_guestdbg_update_apicv_inhibit(vcpu->kvm);
+
+ r = 0;
+
+out:
+ vcpu_put(vcpu);
+ return r;
}
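
For context, a minimal userspace sketch of driving the ioctl handled above: enabling single-step via KVM_SET_GUEST_DEBUG. The TF bit itself is managed by KVM through the kvm_get_rflags()/kvm_set_rflags() sequence shown; userspace only sets the control flags. Assumes a valid vCPU fd.

/* Illustrative only; not part of this patch. */
#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

static int enable_single_step(int vcpu_fd)
{
	struct kvm_guest_debug dbg;

	memset(&dbg, 0, sizeof(dbg));
	dbg.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;

	/* Subsequent KVM_RUN calls exit with KVM_EXIT_DEBUG after each insn. */
	return ioctl(vcpu_fd, KVM_SET_GUEST_DEBUG, &dbg);
}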
-static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
- u16 selector,
- struct kvm_segment *kvm_seg)
+/*
+ * Translate a guest virtual address to a guest physical address.
+ */
+int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
+ struct kvm_translation *tr)
{
- struct desc_struct seg_desc;
+ unsigned long vaddr = tr->linear_address;
+ gpa_t gpa;
+ int idx;
- if (load_guest_segment_descriptor(vcpu, selector, &seg_desc))
- return 1;
- seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg);
+ vcpu_load(vcpu);
+
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ tr->physical_address = gpa;
+ tr->valid = gpa != INVALID_GPA;
+ tr->writeable = 1;
+ tr->usermode = 0;
+
+ vcpu_put(vcpu);
return 0;
}
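
A short sketch of the userspace side of the KVM_TRANSLATE ioctl serviced above; the walk uses the vCPU's current paging mode. Assumes a valid vCPU fd, illustrative only.

#include <linux/kvm.h>
#include <stdio.h>
#include <sys/ioctl.h>

static int translate_gva(int vcpu_fd, __u64 gva)
{
	struct kvm_translation tr = { .linear_address = gva };

	if (ioctl(vcpu_fd, KVM_TRANSLATE, &tr) < 0)
		return -1;

	if (tr.valid)
		printf("gva 0x%llx -> gpa 0x%llx\n",
		       (unsigned long long)gva,
		       (unsigned long long)tr.physical_address);
	else
		printf("gva 0x%llx is not mapped\n", (unsigned long long)gva);
	return 0;
}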
-static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
-{
- struct kvm_segment segvar = {
- .base = selector << 4,
- .limit = 0xffff,
- .selector = selector,
- .type = 3,
- .present = 1,
- .dpl = 3,
- .db = 0,
- .s = 1,
- .l = 0,
- .g = 0,
- .avl = 0,
- .unusable = 0,
- };
- kvm_x86_ops->set_segment(vcpu, &segvar, seg);
+int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+ struct fxregs_state *fxsave;
+
+ if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
+ return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
+
+ vcpu_load(vcpu);
+
+ fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave;
+ memcpy(fpu->fpr, fxsave->st_space, 128);
+ fpu->fcw = fxsave->cwd;
+ fpu->fsw = fxsave->swd;
+ fpu->ftwx = fxsave->twd;
+ fpu->last_opcode = fxsave->fop;
+ fpu->last_ip = fxsave->rip;
+ fpu->last_dp = fxsave->rdp;
+ memcpy(fpu->xmm, fxsave->xmm_space, sizeof(fxsave->xmm_space));
+
+ vcpu_put(vcpu);
return 0;
}
-int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
- int type_bits, int seg)
+int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
- struct kvm_segment kvm_seg;
+ struct fxregs_state *fxsave;
- if (!(vcpu->arch.cr0 & X86_CR0_PE))
- return kvm_load_realmode_segment(vcpu, selector, seg);
- if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
- return 1;
- kvm_seg.type |= type_bits;
+ if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
+ return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
+
+ vcpu_load(vcpu);
+
+ fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave;
- if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS &&
- seg != VCPU_SREG_LDTR)
- if (!kvm_seg.s)
- kvm_seg.unusable = 1;
+ memcpy(fxsave->st_space, fpu->fpr, 128);
+ fxsave->cwd = fpu->fcw;
+ fxsave->swd = fpu->fsw;
+ fxsave->twd = fpu->ftwx;
+ fxsave->fop = fpu->last_opcode;
+ fxsave->rip = fpu->last_ip;
+ fxsave->rdp = fpu->last_dp;
+ memcpy(fxsave->xmm_space, fpu->xmm, sizeof(fxsave->xmm_space));
- kvm_set_segment(vcpu, &kvm_seg, seg);
+ vcpu_put(vcpu);
return 0;
}
-static void save_state_to_tss32(struct kvm_vcpu *vcpu,
- struct tss_segment_32 *tss)
-{
- tss->cr3 = vcpu->arch.cr3;
- tss->eip = kvm_rip_read(vcpu);
- tss->eflags = kvm_x86_ops->get_rflags(vcpu);
- tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
- tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
- tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
- tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
- tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
- tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
- tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
- tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
- tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
- tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
- tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
- tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
- tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
- tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
- tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
-}
-
-static int load_state_from_tss32(struct kvm_vcpu *vcpu,
- struct tss_segment_32 *tss)
-{
- kvm_set_cr3(vcpu, tss->cr3);
-
- kvm_rip_write(vcpu, tss->eip);
- kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2);
-
- kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
- kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
- kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
- kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
- kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
- kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
- kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
- kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
-
- if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR))
- return 1;
+static void store_regs(struct kvm_vcpu *vcpu)
+{
+ BUILD_BUG_ON(sizeof(struct kvm_sync_regs) > SYNC_REGS_SIZE_BYTES);
- if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
- return 1;
+ if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_REGS)
+ __get_regs(vcpu, &vcpu->run->s.regs.regs);
- if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
- return 1;
+ if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_SREGS)
+ __get_sregs(vcpu, &vcpu->run->s.regs.sregs);
- if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
- return 1;
+ if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_EVENTS)
+ kvm_vcpu_ioctl_x86_get_vcpu_events(
+ vcpu, &vcpu->run->s.regs.events);
+}
- if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
- return 1;
+static int sync_regs(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) {
+ __set_regs(vcpu, &vcpu->run->s.regs.regs);
+ vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS;
+ }
- if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS))
- return 1;
+ if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_SREGS) {
+ struct kvm_sregs sregs = vcpu->run->s.regs.sregs;
- if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS))
- return 1;
- return 0;
-}
+ if (__set_sregs(vcpu, &sregs))
+ return -EINVAL;
-static void save_state_to_tss16(struct kvm_vcpu *vcpu,
- struct tss_segment_16 *tss)
-{
- tss->ip = kvm_rip_read(vcpu);
- tss->flag = kvm_x86_ops->get_rflags(vcpu);
- tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
- tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
- tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
- tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
- tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
- tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
- tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
- tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
-
- tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
- tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
- tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
- tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
- tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
- tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
-}
-
-static int load_state_from_tss16(struct kvm_vcpu *vcpu,
- struct tss_segment_16 *tss)
-{
- kvm_rip_write(vcpu, tss->ip);
- kvm_x86_ops->set_rflags(vcpu, tss->flag | 2);
- kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
- kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
- kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
- kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
- kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
- kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
- kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
- kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
-
- if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR))
- return 1;
+ vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_SREGS;
+ }
- if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
- return 1;
+ if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_EVENTS) {
+ struct kvm_vcpu_events events = vcpu->run->s.regs.events;
- if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
- return 1;
+ if (kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events))
+ return -EINVAL;
- if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
- return 1;
+ vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_EVENTS;
+ }
- if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
- return 1;
return 0;
}
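
As a rough illustration of the synced-register fast path implemented by store_regs()/sync_regs() above: with KVM_CAP_SYNC_REGS, userspace reads and modifies GPRs through the shared kvm_run page instead of separate KVM_GET_REGS/KVM_SET_REGS ioctls. A minimal sketch, assuming KVM_CAP_SYNC_REGS is present and run points at the mmap'ed kvm_run structure of a valid vCPU.

#include <linux/kvm.h>
#include <sys/ioctl.h>

static int bump_rax_and_run(int vcpu_fd, struct kvm_run *run)
{
	/* Ask KVM to copy the GPRs into s.regs.regs on the next exit... */
	run->kvm_valid_regs = KVM_SYNC_X86_REGS;

	if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
		return -1;

	/* ...modify in place, and mark them dirty for the next entry. */
	run->s.regs.regs.rax += 1;
	run->kvm_dirty_regs = KVM_SYNC_X86_REGS;

	return ioctl(vcpu_fd, KVM_RUN, 0);
}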
-static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
- u16 old_tss_sel, u32 old_tss_base,
- struct desc_struct *nseg_desc)
+int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
{
- struct tss_segment_16 tss_segment_16;
- int ret = 0;
+ if (kvm_check_tsc_unstable() && kvm->created_vcpus)
+ pr_warn_once("SMP vm created on host with unstable TSC; "
+ "guest TSC will not be reliable\n");
- if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
- sizeof tss_segment_16))
- goto out;
+ if (!kvm->arch.max_vcpu_ids)
+ kvm->arch.max_vcpu_ids = KVM_MAX_VCPU_IDS;
- save_state_to_tss16(vcpu, &tss_segment_16);
+ if (id >= kvm->arch.max_vcpu_ids)
+ return -EINVAL;
- if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
- sizeof tss_segment_16))
- goto out;
+ return kvm_x86_call(vcpu_precreate)(kvm);
+}
- if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
- &tss_segment_16, sizeof tss_segment_16))
- goto out;
+int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
+{
+ struct page *page;
+ int r;
- if (old_tss_sel != 0xffff) {
- tss_segment_16.prev_task_link = old_tss_sel;
+ vcpu->arch.last_vmentry_cpu = -1;
+ vcpu->arch.regs_avail = ~0;
+ vcpu->arch.regs_dirty = ~0;
- if (kvm_write_guest(vcpu->kvm,
- get_tss_base_addr(vcpu, nseg_desc),
- &tss_segment_16.prev_task_link,
- sizeof tss_segment_16.prev_task_link))
- goto out;
+ kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm);
+
+ if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
+ else
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_UNINITIALIZED);
+
+ r = kvm_mmu_create(vcpu);
+ if (r < 0)
+ return r;
+
+ r = kvm_create_lapic(vcpu);
+ if (r < 0)
+ goto fail_mmu_destroy;
+
+ r = -ENOMEM;
+
+ page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!page)
+ goto fail_free_lapic;
+ vcpu->arch.pio_data = page_address(page);
+
+ vcpu->arch.mce_banks = kcalloc(KVM_MAX_MCE_BANKS * 4, sizeof(u64),
+ GFP_KERNEL_ACCOUNT);
+ vcpu->arch.mci_ctl2_banks = kcalloc(KVM_MAX_MCE_BANKS, sizeof(u64),
+ GFP_KERNEL_ACCOUNT);
+ if (!vcpu->arch.mce_banks || !vcpu->arch.mci_ctl2_banks)
+ goto fail_free_mce_banks;
+ vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
+
+ if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask,
+ GFP_KERNEL_ACCOUNT))
+ goto fail_free_mce_banks;
+
+ if (!alloc_emulate_ctxt(vcpu))
+ goto free_wbinvd_dirty_mask;
+
+ if (!fpu_alloc_guest_fpstate(&vcpu->arch.guest_fpu)) {
+ pr_err("failed to allocate vcpu's fpu\n");
+ goto free_emulate_ctxt;
}
- if (load_state_from_tss16(vcpu, &tss_segment_16))
- goto out;
+ kvm_async_pf_hash_reset(vcpu);
- ret = 1;
-out:
- return ret;
+ if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) {
+ vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
+ vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
+ vcpu->arch.perf_capabilities = kvm_caps.supported_perf_cap;
+ }
+ kvm_pmu_init(vcpu);
+
+ vcpu->arch.pending_external_vector = -1;
+ vcpu->arch.preempted_in_kernel = false;
+
+#if IS_ENABLED(CONFIG_HYPERV)
+ vcpu->arch.hv_root_tdp = INVALID_PAGE;
+#endif
+
+ r = kvm_x86_call(vcpu_create)(vcpu);
+ if (r)
+ goto free_guest_fpu;
+
+ kvm_xen_init_vcpu(vcpu);
+ vcpu_load(vcpu);
+ kvm_vcpu_after_set_cpuid(vcpu);
+ kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);
+ kvm_vcpu_reset(vcpu, false);
+ kvm_init_mmu(vcpu);
+ vcpu_put(vcpu);
+ return 0;
+
+free_guest_fpu:
+ fpu_free_guest_fpstate(&vcpu->arch.guest_fpu);
+free_emulate_ctxt:
+ kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
+free_wbinvd_dirty_mask:
+ free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
+fail_free_mce_banks:
+ kfree(vcpu->arch.mce_banks);
+ kfree(vcpu->arch.mci_ctl2_banks);
+ free_page((unsigned long)vcpu->arch.pio_data);
+fail_free_lapic:
+ kvm_free_lapic(vcpu);
+fail_mmu_destroy:
+ kvm_mmu_destroy(vcpu);
+ return r;
}
-static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
- u16 old_tss_sel, u32 old_tss_base,
- struct desc_struct *nseg_desc)
+void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
{
- struct tss_segment_32 tss_segment_32;
- int ret = 0;
+ if (mutex_lock_killable(&vcpu->mutex))
+ return;
+ vcpu_load(vcpu);
+ kvm_synchronize_tsc(vcpu, NULL);
+ vcpu_put(vcpu);
- if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
- sizeof tss_segment_32))
- goto out;
+ /* poll control enabled by default */
+ vcpu->arch.msr_kvm_poll_control = 1;
+
+ mutex_unlock(&vcpu->mutex);
+}
- save_state_to_tss32(vcpu, &tss_segment_32);
+void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
+{
+ int idx, cpu;
- if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
- sizeof tss_segment_32))
- goto out;
+ kvm_clear_async_pf_completion_queue(vcpu);
+ kvm_mmu_unload(vcpu);
- if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
- &tss_segment_32, sizeof tss_segment_32))
- goto out;
+ kvmclock_reset(vcpu);
- if (old_tss_sel != 0xffff) {
- tss_segment_32.prev_task_link = old_tss_sel;
+ for_each_possible_cpu(cpu)
+ cmpxchg(per_cpu_ptr(&last_vcpu, cpu), vcpu, NULL);
- if (kvm_write_guest(vcpu->kvm,
- get_tss_base_addr(vcpu, nseg_desc),
- &tss_segment_32.prev_task_link,
- sizeof tss_segment_32.prev_task_link))
- goto out;
- }
+ kvm_x86_call(vcpu_free)(vcpu);
- if (load_state_from_tss32(vcpu, &tss_segment_32))
- goto out;
+ kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
+ free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
+ fpu_free_guest_fpstate(&vcpu->arch.guest_fpu);
- ret = 1;
-out:
- return ret;
+ kvm_xen_destroy_vcpu(vcpu);
+ kvm_hv_vcpu_uninit(vcpu);
+ kvm_pmu_destroy(vcpu);
+ kfree(vcpu->arch.mce_banks);
+ kfree(vcpu->arch.mci_ctl2_banks);
+ kvm_free_lapic(vcpu);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ kvm_mmu_destroy(vcpu);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ free_page((unsigned long)vcpu->arch.pio_data);
+ kvfree(vcpu->arch.cpuid_entries);
}
-int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
+static void kvm_xstate_reset(struct kvm_vcpu *vcpu, bool init_event)
{
- struct kvm_segment tr_seg;
- struct desc_struct cseg_desc;
- struct desc_struct nseg_desc;
- int ret = 0;
- u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
- u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
+ struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate;
+ u64 xfeatures_mask;
+ bool fpu_in_use;
+ int i;
- old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base);
+ /*
+	 * Guest FPU state is zero-allocated and so doesn't need to be manually
+ * cleared on RESET, i.e. during vCPU creation.
+ */
+ if (!init_event || !fpstate)
+ return;
- /* FIXME: Handle errors. Failure to read either TSS or their
- * descriptors should generate a pagefault.
+ /*
+	 * On INIT, only select XSTATE components are zeroed; most components
+	 * are unchanged. Currently, the only components that are zeroed and
+	 * supported by KVM are MPX- and CET-related.
*/
- if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
- goto out;
+ xfeatures_mask = (kvm_caps.supported_xcr0 | kvm_caps.supported_xss) &
+ (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR |
+ XFEATURE_MASK_CET_ALL);
+ if (!xfeatures_mask)
+ return;
- if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
- goto out;
+ BUILD_BUG_ON(sizeof(xfeatures_mask) * BITS_PER_BYTE <= XFEATURE_MAX);
- if (reason != TASK_SWITCH_IRET) {
- int cpl;
+ /*
+ * Unload guest FPU state (if necessary) before zeroing XSTATE fields
+	 * as the kernel can only modify the state when it's resident in memory,
+ * i.e. when it's not loaded into hardware.
+ *
+	 * WARN if the vCPU's desire to run, i.e. whether or not it's in KVM_RUN,
+ * doesn't match the loaded/in-use state of the FPU, as KVM_RUN is the
+ * only path that can trigger INIT emulation _and_ loads FPU state, and
+ * KVM_RUN should _always_ load FPU state.
+ */
+ WARN_ON_ONCE(vcpu->wants_to_run != fpstate->in_use);
+ fpu_in_use = fpstate->in_use;
+ if (fpu_in_use)
+ kvm_put_guest_fpu(vcpu);
+ for_each_set_bit(i, (unsigned long *)&xfeatures_mask, XFEATURE_MAX)
+ fpstate_clear_xstate_component(fpstate, i);
+ if (fpu_in_use)
+ kvm_load_guest_fpu(vcpu);
+}
- cpl = kvm_x86_ops->get_cpl(vcpu);
- if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) {
- kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
- return 1;
- }
- }
+void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
+{
+ struct kvm_cpuid_entry2 *cpuid_0x1;
+ unsigned long old_cr0 = kvm_read_cr0(vcpu);
+ unsigned long new_cr0;
- if (!nseg_desc.p || (nseg_desc.limit0 | nseg_desc.limit << 16) < 0x67) {
- kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
- return 1;
- }
+ /*
+ * Several of the "set" flows, e.g. ->set_cr0(), read other registers
+ * to handle side effects. RESET emulation hits those flows and relies
+ * on emulated/virtualized registers, including those that are loaded
+ * into hardware, to be zeroed at vCPU creation. Use CRs as a sentinel
+ * to detect improper or missing initialization.
+ */
+ WARN_ON_ONCE(!init_event &&
+ (old_cr0 || kvm_read_cr3(vcpu) || kvm_read_cr4(vcpu)));
- if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
- cseg_desc.type &= ~(1 << 1); //clear the B flag
- save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
- }
+ /*
+ * SVM doesn't unconditionally VM-Exit on INIT and SHUTDOWN, thus it's
+ * possible to INIT the vCPU while L2 is active. Force the vCPU back
+ * into L1 as EFER.SVME is cleared on INIT (along with all other EFER
+ * bits), i.e. virtualization is disabled.
+ */
+ if (is_guest_mode(vcpu))
+ kvm_leave_nested(vcpu);
+
+ kvm_lapic_reset(vcpu, init_event);
+
+ WARN_ON_ONCE(is_guest_mode(vcpu) || is_smm(vcpu));
+ vcpu->arch.hflags = 0;
- if (reason == TASK_SWITCH_IRET) {
- u32 eflags = kvm_x86_ops->get_rflags(vcpu);
- kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
+ vcpu->arch.smi_pending = 0;
+ vcpu->arch.smi_count = 0;
+ atomic_set(&vcpu->arch.nmi_queued, 0);
+ vcpu->arch.nmi_pending = 0;
+ vcpu->arch.nmi_injected = false;
+ kvm_clear_interrupt_queue(vcpu);
+ kvm_clear_exception_queue(vcpu);
+
+ memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
+ kvm_update_dr0123(vcpu);
+ vcpu->arch.dr6 = DR6_ACTIVE_LOW;
+ vcpu->arch.dr7 = DR7_FIXED_1;
+ kvm_update_dr7(vcpu);
+
+ vcpu->arch.cr2 = 0;
+
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ vcpu->arch.apf.msr_en_val = 0;
+ vcpu->arch.apf.msr_int_val = 0;
+ vcpu->arch.st.msr_val = 0;
+
+ kvmclock_reset(vcpu);
+
+ kvm_clear_async_pf_completion_queue(vcpu);
+ kvm_async_pf_hash_reset(vcpu);
+ vcpu->arch.apf.halted = false;
+
+ kvm_xstate_reset(vcpu, init_event);
+
+ if (!init_event) {
+ vcpu->arch.smbase = 0x30000;
+
+ vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
+
+ vcpu->arch.msr_misc_features_enables = 0;
+ vcpu->arch.ia32_misc_enable_msr = MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
+ MSR_IA32_MISC_ENABLE_BTS_UNAVAIL;
+
+ __kvm_set_xcr(vcpu, 0, XFEATURE_MASK_FP);
+ kvm_msr_write(vcpu, MSR_IA32_XSS, 0);
}
- /* set back link to prev task only if NT bit is set in eflags
- note that old_tss_sel is not used afetr this point */
- if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
- old_tss_sel = 0xffff;
+ /* All GPRs except RDX (handled below) are zeroed on RESET/INIT. */
+ memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
+ kvm_register_mark_dirty(vcpu, VCPU_REGS_RSP);
+
+ /*
+ * Fall back to KVM's default Family/Model/Stepping of 0x600 (P6/Athlon)
+ * if no CPUID match is found. Note, it's impossible to get a match at
+ * RESET since KVM emulates RESET before exposing the vCPU to userspace,
+ * i.e. it's impossible for kvm_find_cpuid_entry() to find a valid entry
+ * on RESET. But, go through the motions in case that's ever remedied.
+ */
+ cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1);
+ kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600);
+
+ kvm_x86_call(vcpu_reset)(vcpu, init_event);
- /* set back link to prev task only if NT bit is set in eflags
- note that old_tss_sel is not used afetr this point */
- if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
- old_tss_sel = 0xffff;
+ kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
+ kvm_rip_write(vcpu, 0xfff0);
- if (nseg_desc.type & 8)
- ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
- old_tss_base, &nseg_desc);
+ vcpu->arch.cr3 = 0;
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
+
+ /*
+ * CR0.CD/NW are set on RESET, preserved on INIT. Note, some versions
+ * of Intel's SDM list CD/NW as being set on INIT, but they contradict
+ * (or qualify) that with a footnote stating that CD/NW are preserved.
+ */
+ new_cr0 = X86_CR0_ET;
+ if (init_event)
+ new_cr0 |= (old_cr0 & (X86_CR0_NW | X86_CR0_CD));
else
- ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel,
- old_tss_base, &nseg_desc);
+ new_cr0 |= X86_CR0_NW | X86_CR0_CD;
- if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
- u32 eflags = kvm_x86_ops->get_rflags(vcpu);
- kvm_x86_ops->set_rflags(vcpu, eflags | X86_EFLAGS_NT);
- }
+ kvm_x86_call(set_cr0)(vcpu, new_cr0);
+ kvm_x86_call(set_cr4)(vcpu, 0);
+ kvm_x86_call(set_efer)(vcpu, 0);
+ kvm_x86_call(update_exception_bitmap)(vcpu);
- if (reason != TASK_SWITCH_IRET) {
- nseg_desc.type |= (1 << 1);
- save_guest_segment_descriptor(vcpu, tss_selector,
- &nseg_desc);
+ /*
+ * On the standard CR0/CR4/EFER modification paths, there are several
+ * complex conditions determining whether the MMU has to be reset and/or
+ * which PCIDs have to be flushed. However, CR0.WP and the paging-related
+ * bits in CR4 and EFER are irrelevant if CR0.PG was '0'; and a reset+flush
+ * is needed anyway if CR0.PG was '1' (which can only happen for INIT, as
+ * CR0 will be '0' prior to RESET). So we only need to check CR0.PG here.
+ */
+ if (old_cr0 & X86_CR0_PG) {
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+ kvm_mmu_reset_context(vcpu);
}
- kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS);
- seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
- tr_seg.type = 11;
- kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
-out:
- return ret;
+ /*
+ * Intel's SDM states that all TLB entries are flushed on INIT. AMD's
+ * APM states the TLBs are untouched by INIT, but it also states that
+ * the TLBs are flushed on "External initialization of the processor."
+ * Flush the guest TLB regardless of vendor, there is no meaningful
+ * benefit in relying on the guest to flush the TLB immediately after
+ * INIT. A spurious TLB flush is benign and likely negligible from a
+ * performance perspective.
+ */
+ if (init_event)
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
}
-EXPORT_SYMBOL_GPL(kvm_task_switch);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_reset);
-int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
- struct kvm_sregs *sregs)
+void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
{
- int mmu_reset_needed = 0;
- int pending_vec, max_bits;
- struct descriptor_table dt;
+ struct kvm_segment cs;
- vcpu_load(vcpu);
+ kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
+ cs.selector = vector << 8;
+ cs.base = vector << 12;
+ kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
+ kvm_rip_write(vcpu, 0);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_deliver_sipi_vector);
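
A standalone arithmetic sketch (not kernel code, example vector made up) of the two real-mode entry points set up above: the RESET vector programmed by kvm_vcpu_reset() and the SIPI start address programmed by kvm_vcpu_deliver_sipi_vector().

#include <stdio.h>

int main(void)
{
	unsigned int sipi_vector = 0x9a;	/* example from an INIT-SIPI-SIPI sequence */

	/* RESET: CS.base = 0xffff0000, RIP = 0xfff0 -> first fetch at 4 GiB - 16. */
	printf("reset vector: 0x%x\n", 0xffff0000u + 0xfff0u);

	/* SIPI: CS.selector = vector << 8, CS.base = vector << 12, RIP = 0. */
	printf("SIPI %#x: CS.sel=%#x, start=%#x\n",
	       sipi_vector, sipi_vector << 8, sipi_vector << 12);
	return 0;
}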
- dt.limit = sregs->idt.limit;
- dt.base = sregs->idt.base;
- kvm_x86_ops->set_idt(vcpu, &dt);
- dt.limit = sregs->gdt.limit;
- dt.base = sregs->gdt.base;
- kvm_x86_ops->set_gdt(vcpu, &dt);
+void kvm_arch_enable_virtualization(void)
+{
+ cpu_emergency_register_virt_callback(kvm_x86_ops.emergency_disable_virtualization_cpu);
+}
- vcpu->arch.cr2 = sregs->cr2;
- mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
+void kvm_arch_disable_virtualization(void)
+{
+ cpu_emergency_unregister_virt_callback(kvm_x86_ops.emergency_disable_virtualization_cpu);
+}
- down_read(&vcpu->kvm->slots_lock);
- if (gfn_to_memslot(vcpu->kvm, sregs->cr3 >> PAGE_SHIFT))
- vcpu->arch.cr3 = sregs->cr3;
- else
- set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
- up_read(&vcpu->kvm->slots_lock);
+int kvm_arch_enable_virtualization_cpu(void)
+{
+ struct kvm *kvm;
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+ int ret;
+ u64 local_tsc;
+ u64 max_tsc = 0;
+ bool stable, backwards_tsc = false;
- kvm_set_cr8(vcpu, sregs->cr8);
+ kvm_user_return_msr_cpu_online();
- mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
- kvm_x86_ops->set_efer(vcpu, sregs->efer);
- kvm_set_apic_base(vcpu, sregs->apic_base);
+ ret = kvm_x86_check_processor_compatibility();
+ if (ret)
+ return ret;
- kvm_x86_ops->decache_cr4_guest_bits(vcpu);
+ ret = kvm_x86_call(enable_virtualization_cpu)();
+ if (ret != 0)
+ return ret;
- mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
- kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
- vcpu->arch.cr0 = sregs->cr0;
+ local_tsc = rdtsc();
+ stable = !kvm_check_tsc_unstable();
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!stable && vcpu->cpu == smp_processor_id())
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ if (stable && vcpu->arch.last_host_tsc > local_tsc) {
+ backwards_tsc = true;
+ if (vcpu->arch.last_host_tsc > max_tsc)
+ max_tsc = vcpu->arch.last_host_tsc;
+ }
+ }
+ }
- mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
- kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
- if (!is_long_mode(vcpu) && is_pae(vcpu))
- load_pdptrs(vcpu, vcpu->arch.cr3);
+ /*
+ * Sometimes, even reliable TSCs go backwards. This happens on
+ * platforms that reset TSC during suspend or hibernate actions, but
+ * maintain synchronization. We must compensate. Fortunately, we can
+ * detect that condition here, which happens early in CPU bringup,
+ * before any KVM threads can be running. Unfortunately, we can't
+ * bring the TSCs fully up to date with real time, as we aren't yet far
+ * enough into CPU bringup that we know how much real time has actually
+	 * elapsed; our helper function, ktime_get_boottime_ns(), will be using boot
+ * variables that haven't been updated yet.
+ *
+ * So we simply find the maximum observed TSC above, then record the
+ * adjustment to TSC in each VCPU. When the VCPU later gets loaded,
+ * the adjustment will be applied. Note that we accumulate
+ * adjustments, in case multiple suspend cycles happen before some VCPU
+ * gets a chance to run again. In the event that no KVM threads get a
+ * chance to run, we will miss the entire elapsed period, as we'll have
+ * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may
+	 * lose cycle time. This isn't too big a deal, since the loss will be
+ * uniform across all VCPUs (not to mention the scenario is extremely
+ * unlikely). It is possible that a second hibernate recovery happens
+ * much faster than a first, causing the observed TSC here to be
+ * smaller; this would require additional padding adjustment, which is
+ * why we set last_host_tsc to the local tsc observed here.
+ *
+ * N.B. - this code below runs only on platforms with reliable TSC,
+ * as that is the only way backwards_tsc is set above. Also note
+ * that this runs for ALL vcpus, which is not a bug; all VCPUs should
+ * have the same delta_cyc adjustment applied if backwards_tsc
+ * is detected. Note further, this adjustment is only done once,
+ * as we reset last_host_tsc on all VCPUs to stop this from being
+ * called multiple times (one for each physical CPU bringup).
+ *
+ * Platforms with unreliable TSCs don't have to deal with this, they
+ * will be compensated by the logic in vcpu_load, which sets the TSC to
+ * catchup mode. This will catchup all VCPUs to real time, but cannot
+ * guarantee that they stay in perfect synchronization.
+ */
+ if (backwards_tsc) {
+ u64 delta_cyc = max_tsc - local_tsc;
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ kvm->arch.backwards_tsc_observed = true;
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ vcpu->arch.tsc_offset_adjustment += delta_cyc;
+ vcpu->arch.last_host_tsc = local_tsc;
+ kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+ }
- if (mmu_reset_needed)
- kvm_mmu_reset_context(vcpu);
+ /*
+			 * We have to disable TSC offset matching; if you were
+			 * booting a VM while issuing an S4 host suspend, you
+			 * may have a problem. Solving this issue is
+ * left as an exercise to the reader.
+ */
+ kvm->arch.last_tsc_nsec = 0;
+ kvm->arch.last_tsc_write = 0;
+ }
- max_bits = (sizeof sregs->interrupt_bitmap) << 3;
- pending_vec = find_first_bit(
- (const unsigned long *)sregs->interrupt_bitmap, max_bits);
- if (pending_vec < max_bits) {
- kvm_queue_interrupt(vcpu, pending_vec, false);
- pr_debug("Set back pending irq %d\n", pending_vec);
- if (irqchip_in_kernel(vcpu->kvm))
- kvm_pic_clear_isr_ack(vcpu->kvm);
}
+ return 0;
+}
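
A toy re-derivation (made-up numbers, not kernel code) of the backwards-TSC compensation described in the comment above: the largest TSC any vCPU has already observed becomes the baseline, and every vCPU accumulates the same delta into its offset adjustment.

#include <stdio.h>

int main(void)
{
	/* Made-up numbers: the new CPU's TSC vs. what each vCPU last saw. */
	unsigned long long local_tsc = 1000000ULL;
	unsigned long long last_host_tsc[3] = { 950000ULL, 1200000ULL, 1100000ULL };
	unsigned long long max_tsc = 0, delta_cyc = 0;
	unsigned long long tsc_offset_adjustment[3] = { 0, 0, 0 };
	int i;

	/* Pass 1: find the largest TSC any vCPU has already observed. */
	for (i = 0; i < 3; i++)
		if (last_host_tsc[i] > local_tsc && last_host_tsc[i] > max_tsc)
			max_tsc = last_host_tsc[i];

	/* Pass 2: every vCPU accumulates the same compensation. */
	if (max_tsc) {
		delta_cyc = max_tsc - local_tsc;
		for (i = 0; i < 3; i++)
			tsc_offset_adjustment[i] += delta_cyc;
	}

	printf("delta_cyc = %llu cycles\n", delta_cyc);	/* 200000 here */
	return 0;
}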
- kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
- kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
- kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
- kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
- kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
- kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
+void kvm_arch_disable_virtualization_cpu(void)
+{
+ kvm_x86_call(disable_virtualization_cpu)();
- kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
- kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
+ /*
+ * Leave the user-return notifiers as-is when disabling virtualization
+ * for reboot, i.e. when disabling via IPI function call, and instead
+ * pin kvm.ko (if it's a module) to defend against use-after-free (in
+ * the *very* unlikely scenario module unload is racing with reboot).
+ * On a forced reboot, tasks aren't frozen before shutdown, and so KVM
+ * could be actively modifying user-return MSR state when the IPI to
+ * disable virtualization arrives. Handle the extreme edge case here
+ * instead of trying to account for it in the normal flows.
+ */
+ if (in_task() || WARN_ON_ONCE(!kvm_rebooting))
+ drop_user_return_notifiers();
+ else
+ __module_get(THIS_MODULE);
+}
- /* Older userspace won't unhalt the vcpu on reset. */
- if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 &&
- sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
- !(vcpu->arch.cr0 & X86_CR0_PE))
- vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
+{
+ return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_is_reset_bsp);
- vcpu_put(vcpu);
+bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
+{
+ return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
+}
+
+void kvm_arch_free_vm(struct kvm *kvm)
+{
+#if IS_ENABLED(CONFIG_HYPERV)
+ kfree(kvm->arch.hv_pa_pg);
+#endif
+ __kvm_arch_free_vm(kvm);
+}
+
+
+int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
+{
+ int ret;
+ unsigned long flags;
+
+ if (!kvm_is_vm_type_supported(type))
+ return -EINVAL;
+
+ kvm->arch.vm_type = type;
+ kvm->arch.has_private_mem =
+ (type == KVM_X86_SW_PROTECTED_VM);
+ /* Decided by the vendor code for other VM types. */
+ kvm->arch.pre_fault_allowed =
+ type == KVM_X86_DEFAULT_VM || type == KVM_X86_SW_PROTECTED_VM;
+ kvm->arch.disabled_quirks = kvm_caps.inapplicable_quirks & kvm_caps.supported_quirks;
+
+ ret = kvm_page_track_init(kvm);
+ if (ret)
+ goto out;
+
+ ret = kvm_mmu_init_vm(kvm);
+ if (ret)
+ goto out_cleanup_page_track;
+
+ ret = kvm_x86_call(vm_init)(kvm);
+ if (ret)
+ goto out_uninit_mmu;
+
+ atomic_set(&kvm->arch.noncoherent_dma_count, 0);
+
+ raw_spin_lock_init(&kvm->arch.tsc_write_lock);
+ mutex_init(&kvm->arch.apic_map_lock);
+ seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock);
+ kvm->arch.kvmclock_offset = -get_kvmclock_base_ns();
+
+ raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
+ pvclock_update_vm_gtod_copy(kvm);
+ raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
+ kvm->arch.default_tsc_khz = max_tsc_khz ? : tsc_khz;
+ kvm->arch.apic_bus_cycle_ns = APIC_BUS_CYCLE_NS_DEFAULT;
+ kvm->arch.guest_can_read_msr_platform_info = true;
+ kvm->arch.enable_pmu = enable_pmu;
+
+#if IS_ENABLED(CONFIG_HYPERV)
+ spin_lock_init(&kvm->arch.hv_root_tdp_lock);
+ kvm->arch.hv_root_tdp = INVALID_PAGE;
+#endif
+
+ kvm_apicv_init(kvm);
+ kvm_hv_init_vm(kvm);
+ kvm_xen_init_vm(kvm);
+
+ if (ignore_msrs && !report_ignored_msrs) {
+ pr_warn_once("Running KVM with ignore_msrs=1 and report_ignored_msrs=0 is not a\n"
+			     "supported configuration. Lying to the guest about the existence of MSRs\n"
+ "may cause the guest operating system to hang or produce errors. If a guest\n"
+ "does not run without ignore_msrs=1, please report it to kvm@vger.kernel.org.\n");
+ }
+
+ once_init(&kvm->arch.nx_once);
return 0;
+
+out_uninit_mmu:
+ kvm_mmu_uninit_vm(kvm);
+out_cleanup_page_track:
+ kvm_page_track_cleanup(kvm);
+out:
+ return ret;
}
-int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
- struct kvm_guest_debug *dbg)
+/**
+ * __x86_set_memory_region: Setup KVM internal memory slot
+ *
+ * @kvm: the kvm pointer to the VM.
+ * @id: the slot ID to setup.
+ * @gpa: the GPA to install the slot (unused when @size == 0).
+ * @size: the size of the slot. Set to zero to uninstall a slot.
+ *
+ * This function helps to setup a KVM internal memory slot. Specify
+ * @size > 0 to install a new slot, while @size == 0 to uninstall a
+ * slot. The return code can be one of the following:
+ *
+ * HVA: on success (uninstall will return a bogus HVA)
+ * -errno: on error
+ *
+ * The caller should always use IS_ERR() to check the return value
+ * before use. Note, the KVM internal memory slots are guaranteed to
+ * remain valid and unchanged until the VM is destroyed, i.e., the
+ * GPA->HVA translation will not change. However, the HVA is a user
+ * address, i.e. its accessibility is not guaranteed, and must be
+ * accessed via __copy_{to,from}_user().
+ */
+void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
+ u32 size)
{
int i, r;
+ unsigned long hva, old_npages;
+ struct kvm_memslots *slots = kvm_memslots(kvm);
+ struct kvm_memory_slot *slot;
- vcpu_load(vcpu);
+ lockdep_assert_held(&kvm->slots_lock);
- if ((dbg->control & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) ==
- (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) {
- for (i = 0; i < KVM_NR_DB_REGS; ++i)
- vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
- vcpu->arch.switch_db_regs =
- (dbg->arch.debugreg[7] & DR7_BP_EN_MASK);
+ if (WARN_ON(id >= KVM_MEM_SLOTS_NUM))
+ return ERR_PTR_USR(-EINVAL);
+
+ slot = id_to_memslot(slots, id);
+ if (size) {
+ if (slot && slot->npages)
+ return ERR_PTR_USR(-EEXIST);
+
+ /*
+ * MAP_SHARED to prevent internal slot pages from being moved
+ * by fork()/COW.
+ */
+ hva = vm_mmap(NULL, 0, size, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_ANONYMOUS, 0);
+ if (IS_ERR_VALUE(hva))
+ return (void __user *)hva;
} else {
- for (i = 0; i < KVM_NR_DB_REGS; i++)
- vcpu->arch.eff_db[i] = vcpu->arch.db[i];
- vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
+ if (!slot || !slot->npages)
+ return NULL;
+
+ old_npages = slot->npages;
+ hva = slot->userspace_addr;
}
- r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
+ for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
+ struct kvm_userspace_memory_region2 m;
- if (dbg->control & KVM_GUESTDBG_INJECT_DB)
- kvm_queue_exception(vcpu, DB_VECTOR);
- else if (dbg->control & KVM_GUESTDBG_INJECT_BP)
- kvm_queue_exception(vcpu, BP_VECTOR);
+ m.slot = id | (i << 16);
+ m.flags = 0;
+ m.guest_phys_addr = gpa;
+ m.userspace_addr = hva;
+ m.memory_size = size;
+ r = kvm_set_internal_memslot(kvm, &m);
+ if (r < 0)
+ return ERR_PTR_USR(r);
+ }
- vcpu_put(vcpu);
+ if (!size)
+ vm_munmap(hva, old_npages * PAGE_SIZE);
- return r;
+ return (void __user *)hva;
}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(__x86_set_memory_region);
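
The docstring above insists that callers check the returned HVA with IS_ERR(). Below is a simplified, standalone userspace re-implementation of the error-pointer convention involved (not the kernel macros themselves), showing why the check is needed: failures are encoded in the top 4095 values of the pointer range.

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO	4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* Toy stand-in for a function that returns either an address or an error. */
static void *set_region(unsigned long size)
{
	if (!size)
		return ERR_PTR(-EINVAL);
	return (void *)0x7f0000000000UL;	/* pretend this is the mapped HVA */
}

int main(void)
{
	void *hva = set_region(0);

	if (IS_ERR(hva))
		printf("set_region failed: %ld\n", PTR_ERR(hva));
	else
		printf("hva = %p\n", hva);
	return 0;
}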
-/*
- * fxsave fpu state. Taken from x86_64/processor.h. To be killed when
- * we have asm/x86/processor.h
- */
-struct fxsave {
- u16 cwd;
- u16 swd;
- u16 twd;
- u16 fop;
- u64 rip;
- u64 rdp;
- u32 mxcsr;
- u32 mxcsr_mask;
- u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
-#ifdef CONFIG_X86_64
- u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
-#else
- u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
+void kvm_arch_pre_destroy_vm(struct kvm *kvm)
+{
+ /*
+ * Stop all background workers and kthreads before destroying vCPUs, as
+ * iterating over vCPUs in a different task while vCPUs are being freed
+ * is unsafe, i.e. will lead to use-after-free. The PIT also needs to
+ * be stopped before IRQ routing is freed.
+ */
+#ifdef CONFIG_KVM_IOAPIC
+ kvm_free_pit(kvm);
#endif
-};
-/*
- * Translate a guest virtual address to a guest physical address.
- */
-int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
- struct kvm_translation *tr)
+ kvm_mmu_pre_destroy_vm(kvm);
+ static_call_cond(kvm_x86_vm_pre_destroy)(kvm);
+}
+
+void kvm_arch_destroy_vm(struct kvm *kvm)
{
- unsigned long vaddr = tr->linear_address;
- gpa_t gpa;
+ if (current->mm == kvm->mm) {
+ /*
+ * Free memory regions allocated on behalf of userspace,
+ * unless the memory map has changed due to process exit
+ * or fd copying.
+ */
+ mutex_lock(&kvm->slots_lock);
+ __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
+ 0, 0);
+ __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
+ 0, 0);
+ __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
+ mutex_unlock(&kvm->slots_lock);
+ }
+ kvm_destroy_vcpus(kvm);
+ kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
+#ifdef CONFIG_KVM_IOAPIC
+ kvm_pic_destroy(kvm);
+ kvm_ioapic_destroy(kvm);
+#endif
+ kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
+ kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
+ kvm_mmu_uninit_vm(kvm);
+ kvm_page_track_cleanup(kvm);
+ kvm_xen_destroy_vm(kvm);
+ kvm_hv_destroy_vm(kvm);
+ kvm_x86_call(vm_destroy)(kvm);
+}
- vcpu_load(vcpu);
- down_read(&vcpu->kvm->slots_lock);
- gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
- up_read(&vcpu->kvm->slots_lock);
- tr->physical_address = gpa;
- tr->valid = gpa != UNMAPPED_GVA;
- tr->writeable = 1;
- tr->usermode = 0;
- vcpu_put(vcpu);
+static void memslot_rmap_free(struct kvm_memory_slot *slot)
+{
+ int i;
- return 0;
+ for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+ vfree(slot->arch.rmap[i]);
+ slot->arch.rmap[i] = NULL;
+ }
}
-int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
- struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
+ int i;
- vcpu_load(vcpu);
+ memslot_rmap_free(slot);
- memcpy(fpu->fpr, fxsave->st_space, 128);
- fpu->fcw = fxsave->cwd;
- fpu->fsw = fxsave->swd;
- fpu->ftwx = fxsave->twd;
- fpu->last_opcode = fxsave->fop;
- fpu->last_ip = fxsave->rip;
- fpu->last_dp = fxsave->rdp;
- memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
+ for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
+ vfree(slot->arch.lpage_info[i - 1]);
+ slot->arch.lpage_info[i - 1] = NULL;
+ }
- vcpu_put(vcpu);
+ kvm_page_track_free_memslot(slot);
+}
+
+int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages)
+{
+ const int sz = sizeof(*slot->arch.rmap[0]);
+ int i;
+
+ for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+ int level = i + 1;
+ int lpages = __kvm_mmu_slot_lpages(slot, npages, level);
+
+ if (slot->arch.rmap[i])
+ continue;
+
+ slot->arch.rmap[i] = __vcalloc(lpages, sz, GFP_KERNEL_ACCOUNT);
+ if (!slot->arch.rmap[i]) {
+ memslot_rmap_free(slot);
+ return -ENOMEM;
+ }
+ }
return 0;
}
-int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+static int kvm_alloc_memslot_metadata(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
{
- struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
+ unsigned long npages = slot->npages;
+ int i, r;
- vcpu_load(vcpu);
+ /*
+ * Clear out the previous array pointers for the KVM_MR_MOVE case. The
+ * old arrays will be freed by kvm_set_memory_region() if installing
+ * the new memslot is successful.
+ */
+ memset(&slot->arch, 0, sizeof(slot->arch));
- memcpy(fxsave->st_space, fpu->fpr, 128);
- fxsave->cwd = fpu->fcw;
- fxsave->swd = fpu->fsw;
- fxsave->twd = fpu->ftwx;
- fxsave->fop = fpu->last_opcode;
- fxsave->rip = fpu->last_ip;
- fxsave->rdp = fpu->last_dp;
- memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
+ if (kvm_memslots_have_rmaps(kvm)) {
+ r = memslot_rmap_alloc(slot, npages);
+ if (r)
+ return r;
+ }
- vcpu_put(vcpu);
+ for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
+ struct kvm_lpage_info *linfo;
+ unsigned long ugfn;
+ int lpages;
+ int level = i + 1;
+
+ lpages = __kvm_mmu_slot_lpages(slot, npages, level);
+
+ linfo = __vcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
+ if (!linfo)
+ goto out_free;
+
+ slot->arch.lpage_info[i - 1] = linfo;
+
+ if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
+ linfo[0].disallow_lpage = 1;
+ if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
+ linfo[lpages - 1].disallow_lpage = 1;
+ ugfn = slot->userspace_addr >> PAGE_SHIFT;
+ /*
+ * If the gfn and userspace address are not aligned wrt each
+ * other, disable large page support for this slot.
+ */
+ if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1)) {
+ unsigned long j;
+
+ for (j = 0; j < lpages; ++j)
+ linfo[j].disallow_lpage = 1;
+ }
+ }
+
+#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+ kvm_mmu_init_memslot_memory_attributes(kvm, slot);
+#endif
+
+ if (kvm_page_track_create_memslot(kvm, slot, npages))
+ goto out_free;
return 0;
+
+out_free:
+ memslot_rmap_free(slot);
+
+ for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
+ vfree(slot->arch.lpage_info[i - 1]);
+ slot->arch.lpage_info[i - 1] = NULL;
+ }
+ return -ENOMEM;
}
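
A standalone sketch of the alignment test used above when building lpage_info: a slot can only be mapped with huge pages if the guest frame number and the host userspace frame number are congruent modulo the huge page size. The 512 pages-per-2MiB figure is the x86 4KiB-base case; the addresses are made up.

#include <stdio.h>

int main(void)
{
	unsigned long pages_per_hpage = 512;		/* 2MiB / 4KiB */
	unsigned long base_gfn = 0x100000;		/* slot starts at GPA 4GiB */
	unsigned long hva = 0x7f1234601000UL;		/* userspace_addr of the slot */
	unsigned long ugfn = hva >> 12;

	if ((base_gfn ^ ugfn) & (pages_per_hpage - 1))
		printf("misaligned: 2MiB mappings disallowed for this slot\n");
	else
		printf("aligned: 2MiB mappings allowed\n");
	return 0;
}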
-void fx_init(struct kvm_vcpu *vcpu)
+void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
{
- unsigned after_mxcsr_mask;
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
/*
- * Touch the fpu the first time in non atomic context as if
- * this is the first fpu instruction the exception handler
- * will fire before the instruction returns and it'll have to
- * allocate ram with GFP_KERNEL.
+ * memslots->generation has been incremented.
+ * mmio generation may have reached its maximum value.
*/
- if (!used_math())
- kvm_fx_save(&vcpu->arch.host_fx_image);
+ kvm_mmu_invalidate_mmio_sptes(kvm, gen);
- /* Initialize guest FPU by resetting ours and saving into guest's */
- preempt_disable();
- kvm_fx_save(&vcpu->arch.host_fx_image);
- kvm_fx_finit();
- kvm_fx_save(&vcpu->arch.guest_fx_image);
- kvm_fx_restore(&vcpu->arch.host_fx_image);
- preempt_enable();
+ /* Force re-initialization of steal_time cache */
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_vcpu_kick(vcpu);
+}
+
+int kvm_arch_prepare_memory_region(struct kvm *kvm,
+ const struct kvm_memory_slot *old,
+ struct kvm_memory_slot *new,
+ enum kvm_mr_change change)
+{
+ /*
+ * KVM doesn't support moving memslots when there are external page
+ * trackers attached to the VM, i.e. if KVMGT is in use.
+ */
+ if (change == KVM_MR_MOVE && kvm_page_track_has_external_user(kvm))
+ return -EINVAL;
+
+ if (change == KVM_MR_CREATE || change == KVM_MR_MOVE) {
+ if ((new->base_gfn + new->npages - 1) > kvm_mmu_max_gfn())
+ return -EINVAL;
+
+ if (kvm_is_gfn_alias(kvm, new->base_gfn + new->npages - 1))
+ return -EINVAL;
- vcpu->arch.cr0 |= X86_CR0_ET;
- after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
- vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
- memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
- 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
+ return kvm_alloc_memslot_metadata(kvm, new);
+ }
+
+ if (change == KVM_MR_FLAGS_ONLY)
+ memcpy(&new->arch, &old->arch, sizeof(old->arch));
+ else if (WARN_ON_ONCE(change != KVM_MR_DELETE))
+ return -EIO;
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(fx_init);
-void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
+
+static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable)
{
- if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
+ int nr_slots;
+
+ if (!kvm->arch.cpu_dirty_log_size)
return;
- vcpu->guest_fpu_loaded = 1;
- kvm_fx_save(&vcpu->arch.host_fx_image);
- kvm_fx_restore(&vcpu->arch.guest_fx_image);
+ nr_slots = atomic_read(&kvm->nr_memslots_dirty_logging);
+ if ((enable && nr_slots == 1) || !nr_slots)
+ kvm_make_all_cpus_request(kvm, KVM_REQ_UPDATE_CPU_DIRTY_LOGGING);
}
-EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
-void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
+static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
+ struct kvm_memory_slot *old,
+ const struct kvm_memory_slot *new,
+ enum kvm_mr_change change)
{
- if (!vcpu->guest_fpu_loaded)
+ u32 old_flags = old ? old->flags : 0;
+ u32 new_flags = new ? new->flags : 0;
+ bool log_dirty_pages = new_flags & KVM_MEM_LOG_DIRTY_PAGES;
+
+ /*
+ * Update CPU dirty logging if dirty logging is being toggled. This
+ * applies to all operations.
+ */
+ if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES)
+ kvm_mmu_update_cpu_dirty_logging(kvm, log_dirty_pages);
+
+ /*
+ * Nothing more to do for RO slots (which can't be dirtied and can't be
+ * made writable) or CREATE/MOVE/DELETE of a slot.
+ *
+ * For a memslot with dirty logging disabled:
+ * CREATE: No dirty mappings will already exist.
+ * MOVE/DELETE: The old mappings will already have been cleaned up by
+ * kvm_arch_flush_shadow_memslot()
+ *
+ * For a memslot with dirty logging enabled:
+ * CREATE: No shadow pages exist, thus nothing to write-protect
+ * and no dirty bits to clear.
+ * MOVE/DELETE: The old mappings will already have been cleaned up by
+ * kvm_arch_flush_shadow_memslot().
+ */
+ if ((change != KVM_MR_FLAGS_ONLY) || (new_flags & KVM_MEM_READONLY))
return;
- vcpu->guest_fpu_loaded = 0;
- kvm_fx_save(&vcpu->arch.guest_fx_image);
- kvm_fx_restore(&vcpu->arch.host_fx_image);
- ++vcpu->stat.fpu_reload;
+ /*
+ * READONLY and non-flags changes were filtered out above, and the only
+ * other flag is LOG_DIRTY_PAGES, i.e. something is wrong if dirty
+ * logging isn't being toggled on or off.
+ */
+ if (WARN_ON_ONCE(!((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES)))
+ return;
+
+ if (!log_dirty_pages) {
+ /*
+ * Recover huge page mappings in the slot now that dirty logging
+ * is disabled, i.e. now that KVM does not have to track guest
+ * writes at 4KiB granularity.
+ *
+ * Dirty logging might be disabled by userspace if an ongoing VM
+ * live migration is cancelled and the VM must continue running
+ * on the source.
+ */
+ kvm_mmu_recover_huge_pages(kvm, new);
+ } else {
+ /*
+ * Initially-all-set does not require write protecting any page,
+ * because they're all assumed to be dirty.
+ */
+ if (kvm_dirty_log_manual_protect_and_init_set(kvm))
+ return;
+
+ if (READ_ONCE(eager_page_split))
+ kvm_mmu_slot_try_split_huge_pages(kvm, new, PG_LEVEL_4K);
+
+ if (kvm->arch.cpu_dirty_log_size) {
+ kvm_mmu_slot_leaf_clear_dirty(kvm, new);
+ kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_2M);
+ } else {
+ kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_4K);
+ }
+
+ /*
+ * Unconditionally flush the TLBs after enabling dirty logging.
+ * A flush is almost always going to be necessary (see below),
+ * and unconditionally flushing allows the helpers to omit
+ * the subtly complex checks when removing write access.
+ *
+ * Do the flush outside of mmu_lock to reduce the amount of
+ * time mmu_lock is held. Flushing after dropping mmu_lock is
+ * safe as KVM only needs to guarantee the slot is fully
+ * write-protected before returning to userspace, i.e. before
+ * userspace can consume the dirty status.
+ *
+ * Flushing outside of mmu_lock requires KVM to be careful when
+ * making decisions based on writable status of an SPTE, e.g. a
+ * !writable SPTE doesn't guarantee a CPU can't perform writes.
+ *
+ * Specifically, KVM also write-protects guest page tables to
+ * monitor changes when using shadow paging, and must guarantee
+ * no CPUs can write to those page before mmu_lock is dropped.
+ * Because CPUs may have stale TLB entries at this point, a
+ * !writable SPTE doesn't guarantee CPUs can't perform writes.
+ *
+ * KVM also allows making SPTES writable outside of mmu_lock,
+ * e.g. to allow dirty logging without taking mmu_lock.
+ *
+ * To handle these scenarios, KVM uses a separate software-only
+ * bit (MMU-writable) to track if a SPTE is !writable due to
+ * a guest page table being write-protected (KVM clears the
+ * MMU-writable flag when write-protecting for shadow paging).
+ *
+ * The use of MMU-writable is also the primary motivation for
+ * the unconditional flush. Because KVM must guarantee that a
+ * CPU doesn't contain stale, writable TLB entries for a
+ * !MMU-writable SPTE, KVM must flush if it encounters any
+ * MMU-writable SPTE regardless of whether the actual hardware
+ * writable bit was set. I.e. KVM is almost guaranteed to need
+ * to flush, while unconditionally flushing allows the "remove
+ * write access" helpers to ignore MMU-writable entirely.
+ *
+ * See is_writable_pte() for more details (the case involving
+ * access-tracked SPTEs is particularly relevant).
+ */
+ kvm_flush_remote_tlbs_memslot(kvm, new);
+ }
}
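
For orientation, a minimal userspace sketch of what drives the KVM_MR_FLAGS_ONLY path above: toggling KVM_MEM_LOG_DIRTY_PAGES on an existing slot and then harvesting the bitmap. Assumes vm_fd, the slot layout and the bitmap sizing (one bit per 4KiB page) are set up elsewhere; illustrative only.

#include <linux/kvm.h>
#include <sys/ioctl.h>

static int enable_and_fetch_dirty_log(int vm_fd,
				      struct kvm_userspace_memory_region *region,
				      void *bitmap)
{
	struct kvm_dirty_log log = { .slot = region->slot };

	/* FLAGS_ONLY change: same gpa/size/hva, only the flag differs. */
	region->flags |= KVM_MEM_LOG_DIRTY_PAGES;
	if (ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, region) < 0)
		return -1;

	/* ... run the guest for a while ... */

	log.dirty_bitmap = bitmap;
	return ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
}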
-EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
-void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
+void kvm_arch_commit_memory_region(struct kvm *kvm,
+ struct kvm_memory_slot *old,
+ const struct kvm_memory_slot *new,
+ enum kvm_mr_change change)
{
- if (vcpu->arch.time_page) {
- kvm_release_page_dirty(vcpu->arch.time_page);
- vcpu->arch.time_page = NULL;
+ if (change == KVM_MR_DELETE)
+ kvm_page_track_delete_slot(kvm, old);
+
+ if (!kvm->arch.n_requested_mmu_pages &&
+ (change == KVM_MR_CREATE || change == KVM_MR_DELETE)) {
+ unsigned long nr_mmu_pages;
+
+ nr_mmu_pages = kvm->nr_memslot_pages / KVM_MEMSLOT_PAGES_TO_MMU_PAGES_RATIO;
+ nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);
+ kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
}
- kvm_x86_ops->vcpu_free(vcpu);
+ kvm_mmu_slot_apply_flags(kvm, old, new, change);
+
+ /* Free the arrays associated with the old memslot. */
+ if (change == KVM_MR_MOVE)
+ kvm_arch_free_memslot(kvm, old);
}
-struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
- unsigned int id)
+bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
{
- return kvm_x86_ops->vcpu_create(kvm, id);
+ WARN_ON_ONCE(!kvm_arch_pmi_in_guest(vcpu));
+
+ if (vcpu->arch.guest_state_protected)
+ return true;
+
+ return kvm_x86_call(get_cpl)(vcpu) == 0;
}
-int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
+unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu)
{
- int r;
+ WARN_ON_ONCE(!kvm_arch_pmi_in_guest(vcpu));
- /* We do fxsave: this must be aligned. */
- BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
+ if (vcpu->arch.guest_state_protected)
+ return 0;
- vcpu->arch.mtrr_state.have_fixed = 1;
- vcpu_load(vcpu);
- r = kvm_arch_vcpu_reset(vcpu);
- if (r == 0)
- r = kvm_mmu_setup(vcpu);
- vcpu_put(vcpu);
- if (r < 0)
- goto free_vcpu;
+ return kvm_rip_read(vcpu);
+}
- return 0;
-free_vcpu:
- kvm_x86_ops->vcpu_free(vcpu);
- return r;
+int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
+{
+ return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
}
-void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
+int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
{
- vcpu_load(vcpu);
- kvm_mmu_unload(vcpu);
- vcpu_put(vcpu);
+ return kvm_x86_call(interrupt_allowed)(vcpu, false);
+}
- kvm_x86_ops->vcpu_free(vcpu);
+unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
+{
+	/* Can't read the RIP when guest state is protected; just return 0 */
+ if (vcpu->arch.guest_state_protected)
+ return 0;
+
+ if (is_64_bit_mode(vcpu))
+ return kvm_rip_read(vcpu);
+ return (u32)(get_segment_base(vcpu, VCPU_SREG_CS) +
+ kvm_rip_read(vcpu));
}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_linear_rip);
-int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
+bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
{
- vcpu->arch.nmi_pending = false;
- vcpu->arch.nmi_injected = false;
+ return kvm_get_linear_rip(vcpu) == linear_rip;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_is_linear_rip);
- vcpu->arch.switch_db_regs = 0;
- memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
- vcpu->arch.dr6 = DR6_FIXED_1;
- vcpu->arch.dr7 = DR7_FIXED_1;
+unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
+{
+ unsigned long rflags;
- return kvm_x86_ops->vcpu_reset(vcpu);
+ rflags = kvm_x86_call(get_rflags)(vcpu);
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+ rflags &= ~X86_EFLAGS_TF;
+ return rflags;
}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_rflags);
-void kvm_arch_hardware_enable(void *garbage)
+static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
- kvm_x86_ops->hardware_enable(garbage);
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
+ kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
+ rflags |= X86_EFLAGS_TF;
+ kvm_x86_call(set_rflags)(vcpu, rflags);
}
-void kvm_arch_hardware_disable(void *garbage)
+void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
- kvm_x86_ops->hardware_disable(garbage);
+ __kvm_set_rflags(vcpu, rflags);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_rflags);
-int kvm_arch_hardware_setup(void)
+static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
{
- return kvm_x86_ops->hardware_setup();
+ BUILD_BUG_ON(!is_power_of_2(ASYNC_PF_PER_VCPU));
+
+ return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
}
-void kvm_arch_hardware_unsetup(void)
+static inline u32 kvm_async_pf_next_probe(u32 key)
{
- kvm_x86_ops->hardware_unsetup();
+ return (key + 1) & (ASYNC_PF_PER_VCPU - 1);
}
-void kvm_arch_check_processor_compat(void *rtn)
+static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
{
- kvm_x86_ops->check_processor_compatibility(rtn);
+ u32 key = kvm_async_pf_hash_fn(gfn);
+
+ while (vcpu->arch.apf.gfns[key] != ~0)
+ key = kvm_async_pf_next_probe(key);
+
+ vcpu->arch.apf.gfns[key] = gfn;
}
-int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
+static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
{
- struct page *page;
- struct kvm *kvm;
- int r;
+ int i;
+ u32 key = kvm_async_pf_hash_fn(gfn);
- BUG_ON(vcpu->kvm == NULL);
- kvm = vcpu->kvm;
+ for (i = 0; i < ASYNC_PF_PER_VCPU &&
+ (vcpu->arch.apf.gfns[key] != gfn &&
+ vcpu->arch.apf.gfns[key] != ~0); i++)
+ key = kvm_async_pf_next_probe(key);
- vcpu->arch.mmu.root_hpa = INVALID_PAGE;
- if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
- vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
- else
- vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
+ return key;
+}
- page = alloc_page(GFP_KERNEL | __GFP_ZERO);
- if (!page) {
- r = -ENOMEM;
- goto fail;
+bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
+}
+
+static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ u32 i, j, k;
+
+ i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
+
+ if (WARN_ON_ONCE(vcpu->arch.apf.gfns[i] != gfn))
+ return;
+
+ while (true) {
+ vcpu->arch.apf.gfns[i] = ~0;
+ do {
+ j = kvm_async_pf_next_probe(j);
+ if (vcpu->arch.apf.gfns[j] == ~0)
+ return;
+ k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
+ /*
+ * k lies cyclically in ]i,j]
+ * | i.k.j |
+ * |....j i.k.| or |.k..j i...|
+ */
+ } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
+ vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
+ i = j;
}
- vcpu->arch.pio_data = page_address(page);
+}
- r = kvm_mmu_create(vcpu);
- if (r < 0)
- goto fail_free_pio_data;
+static inline int apf_put_user_notpresent(struct kvm_vcpu *vcpu)
+{
+ u32 reason = KVM_PV_REASON_PAGE_NOT_PRESENT;
- if (irqchip_in_kernel(kvm)) {
- r = kvm_create_lapic(vcpu);
- if (r < 0)
- goto fail_mmu_destroy;
+ return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &reason,
+ sizeof(reason));
+}
+
+static inline int apf_put_user_ready(struct kvm_vcpu *vcpu, u32 token)
+{
+ unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token);
+
+ return kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
+ &token, offset, sizeof(token));
+}
+
+static inline bool apf_pageready_slot_free(struct kvm_vcpu *vcpu)
+{
+ unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token);
+ u32 val;
+
+ if (kvm_read_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
+ &val, offset, sizeof(val)))
+ return false;
+
+ return !val;
+}
+
+static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
+{
+
+ if (!kvm_pv_async_pf_enabled(vcpu))
+ return false;
+
+ if (!vcpu->arch.apf.send_always &&
+ (vcpu->arch.guest_state_protected || !kvm_x86_call(get_cpl)(vcpu)))
+ return false;
+
+ if (is_guest_mode(vcpu)) {
+ /*
+ * L1 needs to opt into the special #PF vmexits that are
+ * used to deliver async page faults.
+ */
+ return vcpu->arch.apf.delivery_as_pf_vmexit;
+ } else {
+ /*
+ * Play it safe in case the guest temporarily disables paging.
+ * The real mode IDT in particular is unlikely to have a #PF
+ * exception setup.
+ */
+ return is_paging(vcpu);
}
+}
- return 0;
+bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
+{
+ if (unlikely(!lapic_in_kernel(vcpu) ||
+ kvm_event_needs_reinjection(vcpu) ||
+ kvm_is_exception_pending(vcpu)))
+ return false;
-fail_mmu_destroy:
- kvm_mmu_destroy(vcpu);
-fail_free_pio_data:
- free_page((unsigned long)vcpu->arch.pio_data);
-fail:
- return r;
+ if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu))
+ return false;
+
+ /*
+ * If interrupts are off we cannot even use an artificial
+ * halt state.
+ */
+ return kvm_arch_interrupt_allowed(vcpu);
}
-void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
+bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work)
{
- kvm_free_lapic(vcpu);
- down_read(&vcpu->kvm->slots_lock);
- kvm_mmu_destroy(vcpu);
- up_read(&vcpu->kvm->slots_lock);
- free_page((unsigned long)vcpu->arch.pio_data);
+ struct x86_exception fault;
+
+ trace_kvm_async_pf_not_present(work->arch.token, work->cr2_or_gpa);
+ kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
+
+ if (kvm_can_deliver_async_pf(vcpu) &&
+ !apf_put_user_notpresent(vcpu)) {
+ fault.vector = PF_VECTOR;
+ fault.error_code_valid = true;
+ fault.error_code = 0;
+ fault.nested_page_fault = false;
+ fault.address = work->arch.token;
+ fault.async_page_fault = true;
+ kvm_inject_page_fault(vcpu, &fault);
+ return true;
+ } else {
+ /*
+ * It is not possible to deliver a paravirtualized asynchronous
+ * page fault, but putting the guest in an artificial halt state
+ * can be beneficial nevertheless: if an interrupt arrives, we
+	 * can deliver it promptly and perhaps the guest will schedule
+ * another process. When the instruction that triggered a page
+ * fault is retried, hopefully the page will be ready in the host.
+ */
+ kvm_make_request(KVM_REQ_APF_HALT, vcpu);
+ return false;
+ }
}
-struct kvm *kvm_arch_create_vm(void)
+void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work)
{
- struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
+ struct kvm_lapic_irq irq = {
+ .delivery_mode = APIC_DM_FIXED,
+ .vector = vcpu->arch.apf.vec
+ };
- if (!kvm)
- return ERR_PTR(-ENOMEM);
+ if (work->wakeup_all)
+ work->arch.token = ~0; /* broadcast wakeup */
+ else
+ kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
+ trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa);
- INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
- INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
+ if ((work->wakeup_all || work->notpresent_injected) &&
+ kvm_pv_async_pf_enabled(vcpu) &&
+ !apf_put_user_ready(vcpu, work->arch.token)) {
+ WRITE_ONCE(vcpu->arch.apf.pageready_pending, true);
+ kvm_apic_set_irq(vcpu, &irq, NULL);
+ }
- /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
- set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
+ vcpu->arch.apf.halted = false;
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
+}
+
+void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu)
+{
+ kvm_make_request(KVM_REQ_APF_READY, vcpu);
- rdtscll(kvm->arch.vm_init_tsc);
+ /* Pairs with smp_store_mb() in kvm_set_msr_common(). */
+ smp_mb__after_atomic();
- return kvm;
+ if (!READ_ONCE(vcpu->arch.apf.pageready_pending))
+ kvm_vcpu_kick(vcpu);
}
-static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
+bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
{
- vcpu_load(vcpu);
- kvm_mmu_unload(vcpu);
- vcpu_put(vcpu);
+ if (!kvm_pv_async_pf_enabled(vcpu))
+ return true;
+ else
+ return kvm_lapic_enabled(vcpu) && apf_pageready_slot_free(vcpu);
}
-static void kvm_free_vcpus(struct kvm *kvm)
+static void kvm_noncoherent_dma_assignment_start_or_stop(struct kvm *kvm)
{
- unsigned int i;
+ /*
+ * Non-coherent DMA assignment and de-assignment may affect whether or
+ * not KVM honors guest PAT, and thus may cause changes in EPT SPTEs
+ * due to toggling the "ignore PAT" bit. Zap all SPTEs when the first
+	 * (or last) non-coherent device is (un)registered so that new SPTEs
+ * with the correct "ignore guest PAT" setting are created.
+ *
+ * If KVM always honors guest PAT, however, there is nothing to do.
+ */
+ if (kvm_check_has_quirk(kvm, KVM_X86_QUIRK_IGNORE_GUEST_PAT))
+ kvm_zap_gfn_range(kvm, gpa_to_gfn(0), gpa_to_gfn(~0ULL));
+}
+
+void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
+{
+ if (atomic_inc_return(&kvm->arch.noncoherent_dma_count) == 1)
+ kvm_noncoherent_dma_assignment_start_or_stop(kvm);
+}
+
+void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm)
+{
+ if (!atomic_dec_return(&kvm->arch.noncoherent_dma_count))
+ kvm_noncoherent_dma_assignment_start_or_stop(kvm);
+}
+
+bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
+{
+ return atomic_read(&kvm->arch.noncoherent_dma_count);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_arch_has_noncoherent_dma);
+bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
+{
+ return (vcpu->arch.msr_kvm_poll_control & 1) == 0;
+}
+
+#ifdef CONFIG_KVM_GUEST_MEMFD
+/*
+ * KVM doesn't yet support initializing guest_memfd memory as shared for VMs
+ * with private memory (the private vs. shared tracking needs to be moved into
+ * guest_memfd).
+ */
+bool kvm_arch_supports_gmem_init_shared(struct kvm *kvm)
+{
+ return !kvm_arch_has_private_mem(kvm);
+}
+
+#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
+int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order)
+{
+ return kvm_x86_call(gmem_prepare)(kvm, pfn, gfn, max_order);
+}
+#endif
+
+#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
+void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end)
+{
+ kvm_x86_call(gmem_invalidate)(start, end);
+}
+#endif
+#endif
+
+int kvm_spec_ctrl_test_value(u64 value)
+{
/*
- * Unpin any mmu pages first.
+	 * Test that setting IA32_SPEC_CTRL to the given value is allowed
+	 * by the host processor.
*/
- for (i = 0; i < KVM_MAX_VCPUS; ++i)
- if (kvm->vcpus[i])
- kvm_unload_vcpu_mmu(kvm->vcpus[i]);
- for (i = 0; i < KVM_MAX_VCPUS; ++i) {
- if (kvm->vcpus[i]) {
- kvm_arch_vcpu_free(kvm->vcpus[i]);
- kvm->vcpus[i] = NULL;
- }
- }
+ u64 saved_value;
+ unsigned long flags;
+ int ret = 0;
+
+ local_irq_save(flags);
+
+ if (rdmsrq_safe(MSR_IA32_SPEC_CTRL, &saved_value))
+ ret = 1;
+ else if (wrmsrq_safe(MSR_IA32_SPEC_CTRL, value))
+ ret = 1;
+ else
+ wrmsrq(MSR_IA32_SPEC_CTRL, saved_value);
+
+ local_irq_restore(flags);
+
+ return ret;
}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_spec_ctrl_test_value);
-void kvm_arch_sync_events(struct kvm *kvm)
+void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code)
{
- kvm_free_all_assigned_devices(kvm);
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+ struct x86_exception fault;
+ u64 access = error_code &
+ (PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK);
+
+ if (!(error_code & PFERR_PRESENT_MASK) ||
+ mmu->gva_to_gpa(vcpu, mmu, gva, access, &fault) != INVALID_GPA) {
+ /*
+ * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page
+ * tables probably do not match the TLB. Just proceed
+ * with the error code that the processor gave.
+ */
+ fault.vector = PF_VECTOR;
+ fault.error_code_valid = true;
+ fault.error_code = error_code;
+ fault.nested_page_fault = false;
+ fault.address = gva;
+ fault.async_page_fault = false;
+ }
+ vcpu->arch.walk_mmu->inject_page_fault(vcpu, &fault);
}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_fixup_and_inject_pf_error);
-void kvm_arch_destroy_vm(struct kvm *kvm)
+/*
+ * Handles kvm_read/write_guest_virt*() result and either injects #PF or returns
+ * KVM_EXIT_INTERNAL_ERROR for cases not currently handled by KVM. Return value
+ * indicates whether exit to userspace is needed.
+ */
+int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
+ struct x86_exception *e)
{
- kvm_iommu_unmap_guest(kvm);
- kvm_free_pit(kvm);
- kfree(kvm->arch.vpic);
- kfree(kvm->arch.vioapic);
- kvm_free_vcpus(kvm);
- kvm_free_physmem(kvm);
- if (kvm->arch.apic_access_page)
- put_page(kvm->arch.apic_access_page);
- if (kvm->arch.ept_identity_pagetable)
- put_page(kvm->arch.ept_identity_pagetable);
- kfree(kvm);
-}
-
-int kvm_arch_set_memory_region(struct kvm *kvm,
- struct kvm_userspace_memory_region *mem,
- struct kvm_memory_slot old,
- int user_alloc)
-{
- int npages = mem->memory_size >> PAGE_SHIFT;
- struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
-
- /*To keep backward compatibility with older userspace,
- *x86 needs to hanlde !user_alloc case.
- */
- if (!user_alloc) {
- if (npages && !old.rmap) {
- unsigned long userspace_addr;
-
- down_write(&current->mm->mmap_sem);
- userspace_addr = do_mmap(NULL, 0,
- npages * PAGE_SIZE,
- PROT_READ | PROT_WRITE,
- MAP_PRIVATE | MAP_ANONYMOUS,
- 0);
- up_write(&current->mm->mmap_sem);
-
- if (IS_ERR((void *)userspace_addr))
- return PTR_ERR((void *)userspace_addr);
-
- /* set userspace_addr atomically for kvm_hva_to_rmapp */
- spin_lock(&kvm->mmu_lock);
- memslot->userspace_addr = userspace_addr;
- spin_unlock(&kvm->mmu_lock);
- } else {
- if (!old.user_alloc && old.rmap) {
- int ret;
-
- down_write(&current->mm->mmap_sem);
- ret = do_munmap(current->mm, old.userspace_addr,
- old.npages * PAGE_SIZE);
- up_write(&current->mm->mmap_sem);
- if (ret < 0)
- printk(KERN_WARNING
- "kvm_vm_ioctl_set_memory_region: "
- "failed to munmap memory\n");
- }
+ if (r == X86EMUL_PROPAGATE_FAULT) {
+ if (KVM_BUG_ON(!e, vcpu->kvm))
+ return -EIO;
+
+ kvm_inject_emulated_page_fault(vcpu, e);
+ return 1;
+ }
+
+ /*
+	 * In case kvm_read/write_guest_virt*() failed with X86EMUL_IO_NEEDED
+	 * while handling a VMX instruction, KVM could have handled the request
+	 * correctly by exiting to userspace and performing I/O, but there
+	 * doesn't seem to be a real use case behind such requests; just return
+	 * KVM_EXIT_INTERNAL_ERROR for now.
+ */
+ kvm_prepare_emulation_failure_exit(vcpu);
+
+ return 0;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_handle_memory_failure);
+
+int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
+{
+ bool pcid_enabled;
+ struct x86_exception e;
+ struct {
+ u64 pcid;
+ u64 gla;
+ } operand;
+ int r;
+
+ r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
+ if (r != X86EMUL_CONTINUE)
+ return kvm_handle_memory_failure(vcpu, r, &e);
+
+ if (operand.pcid >> 12 != 0) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ pcid_enabled = kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE);
+
+ switch (type) {
+ case INVPCID_TYPE_INDIV_ADDR:
+ /*
+ * LAM doesn't apply to addresses that are inputs to TLB
+ * invalidation.
+ */
+ if ((!pcid_enabled && (operand.pcid != 0)) ||
+ is_noncanonical_invlpg_address(operand.gla, vcpu)) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
}
+ kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
+ return kvm_skip_emulated_instruction(vcpu);
+
+ case INVPCID_TYPE_SINGLE_CTXT:
+ if (!pcid_enabled && (operand.pcid != 0)) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ kvm_invalidate_pcid(vcpu, operand.pcid);
+ return kvm_skip_emulated_instruction(vcpu);
+
+ case INVPCID_TYPE_ALL_NON_GLOBAL:
+ /*
+ * Currently, KVM doesn't mark global entries in the shadow
+ * page tables, so a non-global flush just degenerates to a
+ * global flush. If needed, we could optimize this later by
+ * keeping track of global entries in shadow page tables.
+ */
+
+ fallthrough;
+ case INVPCID_TYPE_ALL_INCL_GLOBAL:
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+
+ default:
+ kvm_inject_gp(vcpu, 0);
+ return 1;
}
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_handle_invpcid);
- spin_lock(&kvm->mmu_lock);
- if (!kvm->arch.n_requested_mmu_pages) {
- unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
- kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
+static int complete_sev_es_emulated_mmio(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *run = vcpu->run;
+ struct kvm_mmio_fragment *frag;
+ unsigned int len;
+
+ BUG_ON(!vcpu->mmio_needed);
+
+ /* Complete previous fragment */
+ frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
+ len = min(8u, frag->len);
+ if (!vcpu->mmio_is_write)
+ memcpy(frag->data, run->mmio.data, len);
+
+ if (frag->len <= 8) {
+ /* Switch to the next fragment. */
+ frag++;
+ vcpu->mmio_cur_fragment++;
+ } else {
+ /* Go forward to the next mmio piece. */
+ frag->data += len;
+ frag->gpa += len;
+ frag->len -= len;
+ }
+
+ if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) {
+ vcpu->mmio_needed = 0;
+
+ // VMG change, at this point, we're always done
+ // RIP has already been advanced
+ return 1;
}
- kvm_mmu_slot_remove_write_access(kvm, mem->slot);
- spin_unlock(&kvm->mmu_lock);
- kvm_flush_remote_tlbs(kvm);
+ // More MMIO is needed
+ run->mmio.phys_addr = frag->gpa;
+ run->mmio.len = min(8u, frag->len);
+ run->mmio.is_write = vcpu->mmio_is_write;
+ if (run->mmio.is_write)
+ memcpy(run->mmio.data, frag->data, min(8u, frag->len));
+ run->exit_reason = KVM_EXIT_MMIO;
+
+ vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;
return 0;
}
-void kvm_arch_flush_shadow(struct kvm *kvm)
+int kvm_sev_es_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes,
+ void *data)
{
- kvm_mmu_zap_all(kvm);
- kvm_reload_remote_mmus(kvm);
+ int handled;
+ struct kvm_mmio_fragment *frag;
+
+ if (!data)
+ return -EINVAL;
+
+ handled = write_emultor.read_write_mmio(vcpu, gpa, bytes, data);
+ if (handled == bytes)
+ return 1;
+
+ bytes -= handled;
+ gpa += handled;
+ data += handled;
+
+	/* TODO: Check whether the number of frags needs to be incremented. */
+ frag = vcpu->mmio_fragments;
+ vcpu->mmio_nr_fragments = 1;
+ frag->len = bytes;
+ frag->gpa = gpa;
+ frag->data = data;
+
+ vcpu->mmio_needed = 1;
+ vcpu->mmio_cur_fragment = 0;
+
+ vcpu->run->mmio.phys_addr = gpa;
+ vcpu->run->mmio.len = min(8u, frag->len);
+ vcpu->run->mmio.is_write = 1;
+ memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
+ vcpu->run->exit_reason = KVM_EXIT_MMIO;
+
+ vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;
+
+ return 0;
}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_sev_es_mmio_write);
-int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
+int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes,
+ void *data)
+{
+ int handled;
+ struct kvm_mmio_fragment *frag;
+
+ if (!data)
+ return -EINVAL;
+
+ handled = read_emultor.read_write_mmio(vcpu, gpa, bytes, data);
+ if (handled == bytes)
+ return 1;
+
+ bytes -= handled;
+ gpa += handled;
+ data += handled;
+
+	/* TODO: Check whether the number of frags needs to be incremented. */
+ frag = vcpu->mmio_fragments;
+ vcpu->mmio_nr_fragments = 1;
+ frag->len = bytes;
+ frag->gpa = gpa;
+ frag->data = data;
+
+ vcpu->mmio_needed = 1;
+ vcpu->mmio_cur_fragment = 0;
+
+ vcpu->run->mmio.phys_addr = gpa;
+ vcpu->run->mmio.len = min(8u, frag->len);
+ vcpu->run->mmio.is_write = 0;
+ vcpu->run->exit_reason = KVM_EXIT_MMIO;
+
+ vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;
+
+ return 0;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_sev_es_mmio_read);
+
+static void advance_sev_es_emulated_pio(struct kvm_vcpu *vcpu, unsigned count, int size)
+{
+ vcpu->arch.sev_pio_count -= count;
+ vcpu->arch.sev_pio_data += count * size;
+}
+
+static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size,
+ unsigned int port);
+
+static int complete_sev_es_emulated_outs(struct kvm_vcpu *vcpu)
+{
+ int size = vcpu->arch.pio.size;
+ int port = vcpu->arch.pio.port;
+
+ vcpu->arch.pio.count = 0;
+ if (vcpu->arch.sev_pio_count)
+ return kvm_sev_es_outs(vcpu, size, port);
+ return 1;
+}
+
+static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size,
+ unsigned int port)
{
- return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
- || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
- || vcpu->arch.nmi_pending;
+ for (;;) {
+ unsigned int count =
+ min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count);
+ int ret = emulator_pio_out(vcpu, size, port, vcpu->arch.sev_pio_data, count);
+
+ /* memcpy done already by emulator_pio_out. */
+ advance_sev_es_emulated_pio(vcpu, count, size);
+ if (!ret)
+ break;
+
+ /* Emulation done by the kernel. */
+ if (!vcpu->arch.sev_pio_count)
+ return 1;
+ }
+
+ vcpu->arch.complete_userspace_io = complete_sev_es_emulated_outs;
+ return 0;
}
-void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
+static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size,
+ unsigned int port);
+
+static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu)
{
- int me;
- int cpu = vcpu->cpu;
+ unsigned count = vcpu->arch.pio.count;
+ int size = vcpu->arch.pio.size;
+ int port = vcpu->arch.pio.port;
+
+ complete_emulator_pio_in(vcpu, vcpu->arch.sev_pio_data);
+ advance_sev_es_emulated_pio(vcpu, count, size);
+ if (vcpu->arch.sev_pio_count)
+ return kvm_sev_es_ins(vcpu, size, port);
+ return 1;
+}
- if (waitqueue_active(&vcpu->wq)) {
- wake_up_interruptible(&vcpu->wq);
- ++vcpu->stat.halt_wakeup;
+static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size,
+ unsigned int port)
+{
+ for (;;) {
+ unsigned int count =
+ min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count);
+ if (!emulator_pio_in(vcpu, size, port, vcpu->arch.sev_pio_data, count))
+ break;
+
+ /* Emulation done by the kernel. */
+ advance_sev_es_emulated_pio(vcpu, count, size);
+ if (!vcpu->arch.sev_pio_count)
+ return 1;
}
- me = get_cpu();
- if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
- if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
- smp_send_reschedule(cpu);
- put_cpu();
+ vcpu->arch.complete_userspace_io = complete_sev_es_emulated_ins;
+ return 0;
}
-int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
+int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size,
+ unsigned int port, void *data, unsigned int count,
+ int in)
+{
+ vcpu->arch.sev_pio_data = data;
+ vcpu->arch.sev_pio_count = count;
+ return in ? kvm_sev_es_ins(vcpu, size, port)
+ : kvm_sev_es_outs(vcpu, size, port);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_sev_es_string_io);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_entry);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_mmio);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter_failed);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window_update);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_kick_vcpu_slowpath);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_doorbell);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_accept_irq);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_rmp_fault);
+
+static int __init kvm_x86_init(void)
+{
+ kvm_init_xstate_sizes();
+
+ kvm_mmu_x86_module_init();
+ mitigate_smt_rsb &= boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible();
+ return 0;
+}
+module_init(kvm_x86_init);
+
+static void __exit kvm_x86_exit(void)
{
- return kvm_x86_ops->interrupt_allowed(vcpu);
+ WARN_ON_ONCE(static_branch_unlikely(&kvm_has_noapic_vcpu));
}
+module_exit(kvm_x86_exit);
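The kvm_add_async_pf_gfn()/kvm_del_async_pf_gfn() hunk above is a small open-addressed hash table: linear probing on insert and lookup, plus backward-shift deletion (the "k lies cyclically in ]i,j]" comment) so no tombstones are needed. The user-space sketch below is not part of this patch; the table size, the function names and the toy hash are assumptions made only to illustrate the same probing and deletion scheme.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define NSLOTS	64u			/* must be a power of two (assumption) */
#define EMPTY	(~0ull)

static uint64_t slots[NSLOTS];

static uint32_t hash_fn(uint64_t gfn)
{
	/* toy multiplicative hash; the kernel code uses hash_32() */
	return (uint32_t)((gfn * 0x9E3779B97F4A7C15ull) >> 58) & (NSLOTS - 1);
}

static uint32_t next_probe(uint32_t key)
{
	return (key + 1) & (NSLOTS - 1);
}

static void add_gfn(uint64_t gfn)
{
	uint32_t key = hash_fn(gfn);

	while (slots[key] != EMPTY)
		key = next_probe(key);
	slots[key] = gfn;
}

static uint32_t find_slot(uint64_t gfn)
{
	uint32_t key = hash_fn(gfn);
	unsigned int n;

	for (n = 0; n < NSLOTS && slots[key] != gfn && slots[key] != EMPTY; n++)
		key = next_probe(key);
	return key;
}

static void del_gfn(uint64_t gfn)
{
	uint32_t i, j, k;

	i = j = find_slot(gfn);
	if (slots[i] != gfn)
		return;

	while (1) {
		slots[i] = EMPTY;
		do {
			j = next_probe(j);
			if (slots[j] == EMPTY)
				return;
			k = hash_fn(slots[j]);
			/*
			 * Skip entries whose home slot k lies cyclically in
			 * ]i, j]; those stay reachable and need not move.
			 */
		} while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
		/* slots[j] would become unreachable; shift it into slot i. */
		slots[i] = slots[j];
		i = j;
	}
}

int main(void)
{
	unsigned int n;

	for (n = 0; n < NSLOTS; n++)
		slots[n] = EMPTY;

	add_gfn(0x1000); add_gfn(0x2000); add_gfn(0x3000);
	del_gfn(0x2000);

	/* Deletion must not break lookups of the remaining entries. */
	assert(slots[find_slot(0x1000)] == 0x1000);
	assert(slots[find_slot(0x3000)] == 0x3000);
	assert(slots[find_slot(0x2000)] != 0x2000);
	printf("backward-shift deletion kept the table consistent\n");
	return 0;
}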
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 4c8e10af78e8..fdab0ad49098 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -1,29 +1,219 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef ARCH_X86_KVM_X86_H
#define ARCH_X86_KVM_X86_H
#include <linux/kvm_host.h>
+#include <asm/fpu/xstate.h>
+#include <asm/mce.h>
+#include <asm/pvclock.h>
+#include "kvm_cache_regs.h"
+#include "kvm_emulate.h"
+#include "cpuid.h"
+
+#define KVM_MAX_MCE_BANKS 32
+
+struct kvm_caps {
+ /* control of guest tsc rate supported? */
+ bool has_tsc_control;
+ /* maximum supported tsc_khz for guests */
+ u32 max_guest_tsc_khz;
+ /* number of bits of the fractional part of the TSC scaling ratio */
+ u8 tsc_scaling_ratio_frac_bits;
+ /* maximum allowed value of TSC scaling ratio */
+ u64 max_tsc_scaling_ratio;
+ /* 1ull << kvm_caps.tsc_scaling_ratio_frac_bits */
+ u64 default_tsc_scaling_ratio;
+ /* bus lock detection supported? */
+ bool has_bus_lock_exit;
+ /* notify VM exit supported? */
+ bool has_notify_vmexit;
+ /* bit mask of VM types */
+ u32 supported_vm_types;
+
+ u64 supported_mce_cap;
+ u64 supported_xcr0;
+ u64 supported_xss;
+ u64 supported_perf_cap;
+
+ u64 supported_quirks;
+ u64 inapplicable_quirks;
+};
+
+struct kvm_host_values {
+ /*
+ * The host's raw MAXPHYADDR, i.e. the number of non-reserved physical
+ * address bits irrespective of features that repurpose legal bits,
+ * e.g. MKTME.
+ */
+ u8 maxphyaddr;
+
+ u64 efer;
+ u64 xcr0;
+ u64 xss;
+ u64 s_cet;
+ u64 arch_capabilities;
+};
+
+void kvm_spurious_fault(void);
+
+#define SIZE_OF_MEMSLOTS_HASHTABLE \
+ (sizeof(((struct kvm_memslots *)0)->id_hash) * 2 * KVM_MAX_NR_ADDRESS_SPACES)
+
+/* Sanity check the size of the memslot hash tables. */
+static_assert(SIZE_OF_MEMSLOTS_HASHTABLE ==
+ (1024 * (1 + IS_ENABLED(CONFIG_X86_64)) * (1 + IS_ENABLED(CONFIG_KVM_SMM))));
+
+/*
+ * Assert that "struct kvm_{svm,vmx,tdx}" is an order-0 or order-1 allocation.
+ * Spilling over to an order-2 allocation isn't fundamentally problematic, but
+ * isn't expected to happen in the foreseeable future (O(years)). Assert that
+ * the size is an order-0 allocation when ignoring the memslot hash tables, to
+ * help detect and debug unexpected size increases.
+ */
+#define KVM_SANITY_CHECK_VM_STRUCT_SIZE(x) \
+do { \
+ BUILD_BUG_ON(get_order(sizeof(struct x) - SIZE_OF_MEMSLOTS_HASHTABLE) && \
+ !IS_ENABLED(CONFIG_DEBUG_KERNEL) && !IS_ENABLED(CONFIG_KASAN)); \
+ BUILD_BUG_ON(get_order(sizeof(struct x)) > 1 && \
+ !IS_ENABLED(CONFIG_DEBUG_KERNEL) && !IS_ENABLED(CONFIG_KASAN)); \
+} while (0)
+
+#define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check) \
+({ \
+ bool failed = (consistency_check); \
+ if (failed) \
+ trace_kvm_nested_vmenter_failed(#consistency_check, 0); \
+ failed; \
+})
+
+/*
+ * The first...last VMX feature MSRs that are emulated by KVM. This may or may
+ * not cover all known VMX MSRs, as KVM doesn't emulate an MSR until there's an
+ * associated feature that KVM supports for nested virtualization.
+ */
+#define KVM_FIRST_EMULATED_VMX_MSR MSR_IA32_VMX_BASIC
+#define KVM_LAST_EMULATED_VMX_MSR MSR_IA32_VMX_VMFUNC
+
+#define KVM_DEFAULT_PLE_GAP 128
+#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
+#define KVM_DEFAULT_PLE_WINDOW_GROW 2
+#define KVM_DEFAULT_PLE_WINDOW_SHRINK 0
+#define KVM_VMX_DEFAULT_PLE_WINDOW_MAX UINT_MAX
+#define KVM_SVM_DEFAULT_PLE_WINDOW_MAX USHRT_MAX
+#define KVM_SVM_DEFAULT_PLE_WINDOW 3000
+
+/*
+ * KVM's internal, non-ABI indices for synthetic MSRs. The values themselves
+ * are arbitrary and have no meaning, the only requirement is that they don't
+ * conflict with "real" MSRs that KVM supports. Use values at the upper end
+ * of KVM's reserved paravirtual MSR range to minimize churn, i.e. these values
+ * will be usable until KVM exhausts its supply of paravirtual MSR indices.
+ */
+
+#define MSR_KVM_INTERNAL_GUEST_SSP 0x4b564dff
+
+static inline unsigned int __grow_ple_window(unsigned int val,
+ unsigned int base, unsigned int modifier, unsigned int max)
+{
+ u64 ret = val;
+
+ if (modifier < 1)
+ return base;
+
+ if (modifier < base)
+ ret *= modifier;
+ else
+ ret += modifier;
+
+ return min(ret, (u64)max);
+}
+
+static inline unsigned int __shrink_ple_window(unsigned int val,
+ unsigned int base, unsigned int modifier, unsigned int min)
+{
+ if (modifier < 1)
+ return base;
+
+ if (modifier < base)
+ val /= modifier;
+ else
+ val -= modifier;
+
+ return max(val, min);
+}
+
+#define MSR_IA32_CR_PAT_DEFAULT \
+ PAT_VALUE(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC)
+
+void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu);
+int kvm_check_nested_events(struct kvm_vcpu *vcpu);
+
+/* Forcibly leave nested mode in cases like a vCPU reset */
+static inline void kvm_leave_nested(struct kvm_vcpu *vcpu)
+{
+ kvm_x86_ops.nested_ops->leave_nested(vcpu);
+}
+
+/*
+ * If IBRS is advertised to the vCPU, KVM must flush the indirect branch
+ * predictors when transitioning from L2 to L1, as L1 expects hardware (KVM in
+ * this case) to provide separate predictor modes. Bare metal isolates the host
+ * from the guest, but doesn't isolate different guests from one another (in
+ * this case L1 and L2). The exception is if bare metal supports same mode IBRS,
+ * which offers protection within the same mode, and hence protects L1 from L2.
+ */
+static inline void kvm_nested_vmexit_handle_ibrs(struct kvm_vcpu *vcpu)
+{
+ if (cpu_feature_enabled(X86_FEATURE_AMD_IBRS_SAME_MODE))
+ return;
+
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL) ||
+ guest_cpu_cap_has(vcpu, X86_FEATURE_AMD_IBRS))
+ indirect_branch_prediction_barrier();
+}
+
+static inline bool kvm_vcpu_has_run(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.last_vmentry_cpu != -1;
+}
+
+static inline void kvm_set_mp_state(struct kvm_vcpu *vcpu, int mp_state)
+{
+ vcpu->arch.mp_state = mp_state;
+ if (mp_state == KVM_MP_STATE_RUNNABLE)
+ vcpu->arch.pv.pv_unhalted = false;
+}
+
+static inline bool kvm_is_exception_pending(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.exception.pending ||
+ vcpu->arch.exception_vmexit.pending ||
+ kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+}
static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
{
vcpu->arch.exception.pending = false;
+ vcpu->arch.exception.injected = false;
+ vcpu->arch.exception_vmexit.pending = false;
}
static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector,
bool soft)
{
- vcpu->arch.interrupt.pending = true;
+ vcpu->arch.interrupt.injected = true;
vcpu->arch.interrupt.soft = soft;
vcpu->arch.interrupt.nr = vector;
}
static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu)
{
- vcpu->arch.interrupt.pending = false;
+ vcpu->arch.interrupt.injected = false;
}
static inline bool kvm_event_needs_reinjection(struct kvm_vcpu *vcpu)
{
- return vcpu->arch.exception.pending || vcpu->arch.interrupt.pending ||
+ return vcpu->arch.exception.injected || vcpu->arch.interrupt.injected ||
vcpu->arch.nmi_injected;
}
@@ -31,4 +221,532 @@ static inline bool kvm_exception_is_soft(unsigned int nr)
{
return (nr == BP_VECTOR) || (nr == OF_VECTOR);
}
+
+static inline bool is_protmode(struct kvm_vcpu *vcpu)
+{
+ return kvm_is_cr0_bit_set(vcpu, X86_CR0_PE);
+}
+
+static inline bool is_long_mode(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_X86_64
+ return !!(vcpu->arch.efer & EFER_LMA);
+#else
+ return false;
+#endif
+}
+
+static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu)
+{
+ int cs_db, cs_l;
+
+ WARN_ON_ONCE(vcpu->arch.guest_state_protected);
+
+ if (!is_long_mode(vcpu))
+ return false;
+ kvm_x86_call(get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
+ return cs_l;
+}
+
+static inline bool is_64_bit_hypercall(struct kvm_vcpu *vcpu)
+{
+ /*
+ * If running with protected guest state, the CS register is not
+	 * accessible. The hypercall register values must have been provided
+	 * in 64-bit mode, so assume the guest is in 64-bit mode.
+ */
+ return vcpu->arch.guest_state_protected || is_64_bit_mode(vcpu);
+}
+
+static inline bool x86_exception_has_error_code(unsigned int vector)
+{
+ static u32 exception_has_error_code = BIT(DF_VECTOR) | BIT(TS_VECTOR) |
+ BIT(NP_VECTOR) | BIT(SS_VECTOR) | BIT(GP_VECTOR) |
+ BIT(PF_VECTOR) | BIT(AC_VECTOR);
+
+ return (1U << vector) & exception_has_error_code;
+}
+
+static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
+}
+
+static inline bool is_pae(struct kvm_vcpu *vcpu)
+{
+ return kvm_is_cr4_bit_set(vcpu, X86_CR4_PAE);
+}
+
+static inline bool is_pse(struct kvm_vcpu *vcpu)
+{
+ return kvm_is_cr4_bit_set(vcpu, X86_CR4_PSE);
+}
+
+static inline bool is_paging(struct kvm_vcpu *vcpu)
+{
+ return likely(kvm_is_cr0_bit_set(vcpu, X86_CR0_PG));
+}
+
+static inline bool is_pae_paging(struct kvm_vcpu *vcpu)
+{
+ return !is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu);
+}
+
+static inline u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu)
+{
+ return kvm_is_cr4_bit_set(vcpu, X86_CR4_LA57) ? 57 : 48;
+}
+
+static inline u8 max_host_virt_addr_bits(void)
+{
+ return kvm_cpu_cap_has(X86_FEATURE_LA57) ? 57 : 48;
+}
+
+/*
+ * x86 MSRs which contain linear addresses, x86 hidden segment bases, and
+ * IDT/GDT bases have static canonicality checks, the size of which depends
+ * only on the CPU's support for 5-level paging, rather than on the state of
+ * CR4.LA57. This applies to both WRMSR and to other instructions that set
+ * their values, e.g. SGDT.
+ *
+ * KVM passes through most of these MSRs and also doesn't intercept the
+ * instructions that set the hidden segment bases.
+ *
+ * Because of this, to be consistent with hardware, even if the guest doesn't
+ * have LA57 enabled in its CPUID, perform canonicality checks based on *host*
+ * support for 5 level paging.
+ *
+ * Finally, instructions that are related to MMU invalidation of a given
+ * linear address also have a similar static canonicality check on the
+ * address. This allows, for example, invalidating 5-level addresses of a
+ * guest from a host that uses 4-level paging.
+ */
+static inline bool is_noncanonical_address(u64 la, struct kvm_vcpu *vcpu,
+ unsigned int flags)
+{
+ if (flags & (X86EMUL_F_INVLPG | X86EMUL_F_MSR | X86EMUL_F_DT_LOAD))
+ return !__is_canonical_address(la, max_host_virt_addr_bits());
+ else
+ return !__is_canonical_address(la, vcpu_virt_addr_bits(vcpu));
+}
+
+static inline bool is_noncanonical_msr_address(u64 la, struct kvm_vcpu *vcpu)
+{
+ return is_noncanonical_address(la, vcpu, X86EMUL_F_MSR);
+}
+
+static inline bool is_noncanonical_base_address(u64 la, struct kvm_vcpu *vcpu)
+{
+ return is_noncanonical_address(la, vcpu, X86EMUL_F_DT_LOAD);
+}
+
+static inline bool is_noncanonical_invlpg_address(u64 la, struct kvm_vcpu *vcpu)
+{
+ return is_noncanonical_address(la, vcpu, X86EMUL_F_INVLPG);
+}
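A note on the canonicality helpers above: a canonical-address test reduces to a sign-extension check over the supported number of virtual-address bits, which is why an address can be canonical under 5-level paging (57 bits) yet non-canonical under 4-level paging (48 bits), and why the MSR/descriptor-table/INVLPG variants check against host LA57 support while the default check uses the vCPU's CR4.LA57. The stand-alone sketch below is not part of this patch; the helper name and the example address are made up, and the formulation is believed to match the kernel's __is_canonical_address().

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool is_canonical(uint64_t va, unsigned int bits)
{
	unsigned int shift = 64 - bits;

	/* Canonical <=> bits [63:bits-1] are all copies of bit (bits - 1). */
	return (uint64_t)((int64_t)(va << shift) >> shift) == va;
}

int main(void)
{
	uint64_t va = 0xff00000000000000ull;	/* example address only */

	printf("57-bit canonical: %d\n", is_canonical(va, 57));	/* prints 1 */
	printf("48-bit canonical: %d\n", is_canonical(va, 48));	/* prints 0 */
	return 0;
}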
+
+static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
+ gva_t gva, gfn_t gfn, unsigned access)
+{
+ u64 gen = kvm_memslots(vcpu->kvm)->generation;
+
+ if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
+ return;
+
+ /*
+ * If this is a shadow nested page table, the "GVA" is
+ * actually a nGPA.
+ */
+ vcpu->arch.mmio_gva = mmu_is_nested(vcpu) ? 0 : gva & PAGE_MASK;
+ vcpu->arch.mmio_access = access;
+ vcpu->arch.mmio_gfn = gfn;
+ vcpu->arch.mmio_gen = gen;
+}
+
+static inline bool vcpu_match_mmio_gen(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.mmio_gen == kvm_memslots(vcpu->kvm)->generation;
+}
+
+/*
+ * Clear the mmio cache info for the given gva. If gva is MMIO_GVA_ANY, we
+ * clear all mmio cache info.
+ */
+#define MMIO_GVA_ANY (~(gva_t)0)
+
+static inline void vcpu_clear_mmio_info(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ if (gva != MMIO_GVA_ANY && vcpu->arch.mmio_gva != (gva & PAGE_MASK))
+ return;
+
+ vcpu->arch.mmio_gva = 0;
+}
+
+static inline bool vcpu_match_mmio_gva(struct kvm_vcpu *vcpu, unsigned long gva)
+{
+ if (vcpu_match_mmio_gen(vcpu) && vcpu->arch.mmio_gva &&
+ vcpu->arch.mmio_gva == (gva & PAGE_MASK))
+ return true;
+
+ return false;
+}
+
+static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+ if (vcpu_match_mmio_gen(vcpu) && vcpu->arch.mmio_gfn &&
+ vcpu->arch.mmio_gfn == gpa >> PAGE_SHIFT)
+ return true;
+
+ return false;
+}
+
+static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, int reg)
+{
+ unsigned long val = kvm_register_read_raw(vcpu, reg);
+
+ return is_64_bit_mode(vcpu) ? val : (u32)val;
+}
+
+static inline void kvm_register_write(struct kvm_vcpu *vcpu,
+ int reg, unsigned long val)
+{
+ if (!is_64_bit_mode(vcpu))
+ val = (u32)val;
+ return kvm_register_write_raw(vcpu, reg, val);
+}
+
+static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
+{
+ return !(kvm->arch.disabled_quirks & quirk);
+}
+
+static __always_inline void kvm_request_l1tf_flush_l1d(void)
+{
+#if IS_ENABLED(CONFIG_CPU_MITIGATIONS) && IS_ENABLED(CONFIG_KVM_INTEL)
+ /*
+ * Use a raw write to set the per-CPU flag, as KVM will ensure a flush
+	 * even if preemption is currently enabled. If the current vCPU task
+ * is migrated to a different CPU (or userspace runs the vCPU on a
+ * different task) before the next VM-Entry, then kvm_arch_vcpu_load()
+ * will request a flush on the new CPU.
+ */
+ raw_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 1);
+#endif
+}
+
+void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
+
+u64 get_kvmclock_ns(struct kvm *kvm);
+uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm);
+bool kvm_get_monotonic_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp);
+int kvm_guest_time_update(struct kvm_vcpu *v);
+
+int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
+ gva_t addr, void *val, unsigned int bytes,
+ struct x86_exception *exception);
+
+int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu,
+ gva_t addr, void *val, unsigned int bytes,
+ struct x86_exception *exception);
+
+int handle_ud(struct kvm_vcpu *vcpu);
+
+void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu,
+ struct kvm_queued_exception *ex);
+
+int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
+void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code);
+int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
+ void *insn, int insn_len);
+int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ int emulation_type, void *insn, int insn_len);
+fastpath_t handle_fastpath_wrmsr(struct kvm_vcpu *vcpu);
+fastpath_t handle_fastpath_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg);
+fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu);
+fastpath_t handle_fastpath_invd(struct kvm_vcpu *vcpu);
+
+extern struct kvm_caps kvm_caps;
+extern struct kvm_host_values kvm_host;
+
+extern bool enable_pmu;
+
+/*
+ * Get a filtered version of KVM's supported XCR0 that strips out dynamic
+ * features for which the current process doesn't (yet) have permission to use.
+ * This is intended to be used only when enumerating support to userspace,
+ * e.g. in KVM_GET_SUPPORTED_CPUID and KVM_CAP_XSAVE2, it does NOT need to be
+ * used to check/restrict guest behavior as KVM rejects KVM_SET_CPUID{2} if
+ * userspace attempts to enable unpermitted features.
+ */
+static inline u64 kvm_get_filtered_xcr0(void)
+{
+ u64 permitted_xcr0 = kvm_caps.supported_xcr0;
+
+ BUILD_BUG_ON(XFEATURE_MASK_USER_DYNAMIC != XFEATURE_MASK_XTILE_DATA);
+
+ if (permitted_xcr0 & XFEATURE_MASK_USER_DYNAMIC) {
+ permitted_xcr0 &= xstate_get_guest_group_perm();
+
+ /*
+ * Treat XTILE_CFG as unsupported if the current process isn't
+ * allowed to use XTILE_DATA, as attempting to set XTILE_CFG in
+ * XCR0 without setting XTILE_DATA is architecturally illegal.
+ */
+ if (!(permitted_xcr0 & XFEATURE_MASK_XTILE_DATA))
+ permitted_xcr0 &= ~XFEATURE_MASK_XTILE_CFG;
+ }
+ return permitted_xcr0;
+}
+
+static inline bool kvm_mpx_supported(void)
+{
+ return (kvm_caps.supported_xcr0 & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR))
+ == (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
+}
+
+extern unsigned int min_timer_period_us;
+
+extern bool enable_vmware_backdoor;
+
+extern int pi_inject_timer;
+
+extern bool report_ignored_msrs;
+
+extern bool eager_page_split;
+
+static inline void kvm_pr_unimpl_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ if (report_ignored_msrs)
+ vcpu_unimpl(vcpu, "Unhandled WRMSR(0x%x) = 0x%llx\n", msr, data);
+}
+
+static inline void kvm_pr_unimpl_rdmsr(struct kvm_vcpu *vcpu, u32 msr)
+{
+ if (report_ignored_msrs)
+ vcpu_unimpl(vcpu, "Unhandled RDMSR(0x%x)\n", msr);
+}
+
+static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
+{
+ return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
+ vcpu->arch.virtual_tsc_shift);
+}
+
+/* Same "calling convention" as do_div:
+ * - divide (n << 32) by base
+ * - put result in n
+ * - return remainder
+ */
+#define do_shl32_div32(n, base) \
+ ({ \
+ u32 __quot, __rem; \
+ asm("divl %2" : "=a" (__quot), "=d" (__rem) \
+ : "rm" (base), "0" (0), "1" ((u32) n)); \
+ n = __quot; \
+ __rem; \
+ })
+
+static inline void kvm_disable_exits(struct kvm *kvm, u64 mask)
+{
+ kvm->arch.disabled_exits |= mask;
+}
+
+static inline bool kvm_mwait_in_guest(struct kvm *kvm)
+{
+ return kvm->arch.disabled_exits & KVM_X86_DISABLE_EXITS_MWAIT;
+}
+
+static inline bool kvm_hlt_in_guest(struct kvm *kvm)
+{
+ return kvm->arch.disabled_exits & KVM_X86_DISABLE_EXITS_HLT;
+}
+
+static inline bool kvm_pause_in_guest(struct kvm *kvm)
+{
+ return kvm->arch.disabled_exits & KVM_X86_DISABLE_EXITS_PAUSE;
+}
+
+static inline bool kvm_cstate_in_guest(struct kvm *kvm)
+{
+ return kvm->arch.disabled_exits & KVM_X86_DISABLE_EXITS_CSTATE;
+}
+
+static inline bool kvm_aperfmperf_in_guest(struct kvm *kvm)
+{
+ return kvm->arch.disabled_exits & KVM_X86_DISABLE_EXITS_APERFMPERF;
+}
+
+static inline bool kvm_notify_vmexit_enabled(struct kvm *kvm)
+{
+ return kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_ENABLED;
+}
+
+static __always_inline void kvm_before_interrupt(struct kvm_vcpu *vcpu,
+ enum kvm_intr_type intr)
+{
+ WRITE_ONCE(vcpu->arch.handling_intr_from_guest, (u8)intr);
+}
+
+static __always_inline void kvm_after_interrupt(struct kvm_vcpu *vcpu)
+{
+ WRITE_ONCE(vcpu->arch.handling_intr_from_guest, 0);
+}
+
+static inline bool kvm_handling_nmi_from_guest(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.handling_intr_from_guest == KVM_HANDLING_NMI;
+}
+
+static inline bool kvm_pat_valid(u64 data)
+{
+ if (data & 0xF8F8F8F8F8F8F8F8ull)
+ return false;
+ /* 0, 1, 4, 5, 6, 7 are valid values. */
+ return (data | ((data & 0x0202020202020202ull) << 1)) == data;
+}
+
+static inline bool kvm_dr7_valid(u64 data)
+{
+ /* Bits [63:32] are reserved */
+ return !(data >> 32);
+}
+static inline bool kvm_dr6_valid(u64 data)
+{
+ /* Bits [63:32] are reserved */
+ return !(data >> 32);
+}
+
+/*
+ * Trigger machine check on the host. We assume all the MSRs are already set up
+ * by the CPU and that we still run on the same CPU as the MCE occurred on.
+ * We pass a fake environment to the machine check handler because we want
+ * the guest to be always treated like user space, no matter what context
+ * it used internally.
+ */
+static inline void kvm_machine_check(void)
+{
+#if defined(CONFIG_X86_MCE)
+ struct pt_regs regs = {
+ .cs = 3, /* Fake ring 3 no matter what the guest ran on */
+ .flags = X86_EFLAGS_IF,
+ };
+
+ do_machine_check(&regs);
+#endif
+}
+
+int kvm_spec_ctrl_test_value(u64 value);
+int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
+ struct x86_exception *e);
+int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva);
+bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type);
+
+enum kvm_msr_access {
+ MSR_TYPE_R = BIT(0),
+ MSR_TYPE_W = BIT(1),
+ MSR_TYPE_RW = MSR_TYPE_R | MSR_TYPE_W,
+};
+
+/*
+ * Internal error codes that are used to indicate that MSR emulation encountered
+ * an error that should result in #GP in the guest, unless userspace handles it.
+ * Note, '1', '0', and negative numbers are off limits, as they are used by KVM
+ * as part of KVM's lightly documented internal KVM_RUN return codes.
+ *
+ * UNSUPPORTED - The MSR isn't supported, either because it is completely
+ * unknown to KVM, or because the MSR should not exist according
+ * to the vCPU model.
+ *
+ * FILTERED - Access to the MSR is denied by a userspace MSR filter.
+ */
+#define KVM_MSR_RET_UNSUPPORTED 2
+#define KVM_MSR_RET_FILTERED 3
+
+static inline bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ return !(cr4 & vcpu->arch.cr4_guest_rsvd_bits);
+}
+
+#define __cr4_reserved_bits(__cpu_has, __c) \
+({ \
+ u64 __reserved_bits = CR4_RESERVED_BITS; \
+ \
+ if (!__cpu_has(__c, X86_FEATURE_XSAVE)) \
+ __reserved_bits |= X86_CR4_OSXSAVE; \
+ if (!__cpu_has(__c, X86_FEATURE_SMEP)) \
+ __reserved_bits |= X86_CR4_SMEP; \
+ if (!__cpu_has(__c, X86_FEATURE_SMAP)) \
+ __reserved_bits |= X86_CR4_SMAP; \
+ if (!__cpu_has(__c, X86_FEATURE_FSGSBASE)) \
+ __reserved_bits |= X86_CR4_FSGSBASE; \
+ if (!__cpu_has(__c, X86_FEATURE_PKU)) \
+ __reserved_bits |= X86_CR4_PKE; \
+ if (!__cpu_has(__c, X86_FEATURE_LA57)) \
+ __reserved_bits |= X86_CR4_LA57; \
+ if (!__cpu_has(__c, X86_FEATURE_UMIP)) \
+ __reserved_bits |= X86_CR4_UMIP; \
+ if (!__cpu_has(__c, X86_FEATURE_VMX)) \
+ __reserved_bits |= X86_CR4_VMXE; \
+ if (!__cpu_has(__c, X86_FEATURE_PCID)) \
+ __reserved_bits |= X86_CR4_PCIDE; \
+ if (!__cpu_has(__c, X86_FEATURE_LAM)) \
+ __reserved_bits |= X86_CR4_LAM_SUP; \
+ if (!__cpu_has(__c, X86_FEATURE_SHSTK) && \
+ !__cpu_has(__c, X86_FEATURE_IBT)) \
+ __reserved_bits |= X86_CR4_CET; \
+ __reserved_bits; \
+})
+
+int kvm_sev_es_mmio_write(struct kvm_vcpu *vcpu, gpa_t src, unsigned int bytes,
+ void *dst);
+int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t src, unsigned int bytes,
+ void *dst);
+int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size,
+ unsigned int port, void *data, unsigned int count,
+ int in);
+
+static inline bool user_exit_on_hypercall(struct kvm *kvm, unsigned long hc_nr)
+{
+ return kvm->arch.hypercall_exit_enabled & BIT(hc_nr);
+}
+
+int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, int cpl,
+ int (*complete_hypercall)(struct kvm_vcpu *));
+
+#define __kvm_emulate_hypercall(_vcpu, cpl, complete_hypercall) \
+({ \
+ int __ret; \
+ __ret = ____kvm_emulate_hypercall(_vcpu, cpl, complete_hypercall); \
+ \
+ if (__ret > 0) \
+ __ret = complete_hypercall(_vcpu); \
+ __ret; \
+})
+
+int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
+
+#define CET_US_RESERVED_BITS GENMASK(9, 6)
+#define CET_US_SHSTK_MASK_BITS GENMASK(1, 0)
+#define CET_US_IBT_MASK_BITS (GENMASK_ULL(5, 2) | GENMASK_ULL(63, 10))
+#define CET_US_LEGACY_BITMAP_BASE(data) ((data) >> 12)
+
+static inline bool kvm_is_valid_u_s_cet(struct kvm_vcpu *vcpu, u64 data)
+{
+ if (data & CET_US_RESERVED_BITS)
+ return false;
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) &&
+ (data & CET_US_SHSTK_MASK_BITS))
+ return false;
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) &&
+ (data & CET_US_IBT_MASK_BITS))
+ return false;
+ if (!IS_ALIGNED(CET_US_LEGACY_BITMAP_BASE(data), 4))
+ return false;
+ /* IBT can be suppressed iff the TRACKER isn't WAIT_ENDBR. */
+ if ((data & CET_SUPPRESS) && (data & CET_WAIT_ENDBR))
+ return false;
+
+ return true;
+}
#endif
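A side note on kvm_pat_valid() above: the 0xF8F8F8F8F8F8F8F8 mask rejects any PAT entry of 8 or larger, and the second expression rejects entries 2 and 3 (bit 1 set without bit 2), leaving exactly the architecturally valid memory types 0, 1, 4, 5, 6 and 7. The stand-alone harness below is not part of this patch; it only cross-checks that bit trick against a naive per-byte reference, using a sample value laid out like MSR_IA32_CR_PAT_DEFAULT from this header.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool pat_valid_trick(uint64_t data)
{
	if (data & 0xF8F8F8F8F8F8F8F8ull)		/* any entry >= 8 */
		return false;
	/* entries 2 and 3 have bit 1 set without bit 2 */
	return (data | ((data & 0x0202020202020202ull) << 1)) == data;
}

static bool pat_valid_reference(uint64_t data)
{
	int i;

	for (i = 0; i < 8; i++) {
		uint8_t entry = (data >> (i * 8)) & 0xff;

		if (entry > 7 || entry == 2 || entry == 3)
			return false;
	}
	return true;
}

int main(void)
{
	uint64_t b;

	/* Exhaustively checking one byte suffices; entries are independent. */
	for (b = 0; b < 256; b++)
		assert(pat_valid_trick(b) == pat_valid_reference(b));

	/* WB, WT, UC-, UC repeated, as in the default PAT layout. */
	assert(pat_valid_trick(0x0007040600070406ull));
	printf("bit trick matches the per-byte reference\n");
	return 0;
}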
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
deleted file mode 100644
index 616de4628d60..000000000000
--- a/arch/x86/kvm/x86_emulate.c
+++ /dev/null
@@ -1,2141 +0,0 @@
-/******************************************************************************
- * x86_emulate.c
- *
- * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
- *
- * Copyright (c) 2005 Keir Fraser
- *
- * Linux coding style, mod r/m decoder, segment base fixes, real-mode
- * privileged instructions:
- *
- * Copyright (C) 2006 Qumranet
- *
- * Avi Kivity <avi@qumranet.com>
- * Yaniv Kamay <yaniv@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
- * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
- */
-
-#ifndef __KERNEL__
-#include <stdio.h>
-#include <stdint.h>
-#include <public/xen.h>
-#define DPRINTF(_f, _a ...) printf(_f , ## _a)
-#else
-#include <linux/kvm_host.h>
-#include "kvm_cache_regs.h"
-#define DPRINTF(x...) do {} while (0)
-#endif
-#include <linux/module.h>
-#include <asm/kvm_x86_emulate.h>
-
-/*
- * Opcode effective-address decode tables.
- * Note that we only emulate instructions that have at least one memory
- * operand (excluding implicit stack references). We assume that stack
- * references and instruction fetches will never occur in special memory
- * areas that require emulation. So, for example, 'mov <imm>,<reg>' need
- * not be handled.
- */
-
-/* Operand sizes: 8-bit operands or specified/overridden size. */
-#define ByteOp (1<<0) /* 8-bit operands. */
-/* Destination operand type. */
-#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */
-#define DstReg (2<<1) /* Register operand. */
-#define DstMem (3<<1) /* Memory operand. */
-#define DstAcc (4<<1) /* Destination Accumulator */
-#define DstMask (7<<1)
-/* Source operand type. */
-#define SrcNone (0<<4) /* No source operand. */
-#define SrcImplicit (0<<4) /* Source operand is implicit in the opcode. */
-#define SrcReg (1<<4) /* Register operand. */
-#define SrcMem (2<<4) /* Memory operand. */
-#define SrcMem16 (3<<4) /* Memory operand (16-bit). */
-#define SrcMem32 (4<<4) /* Memory operand (32-bit). */
-#define SrcImm (5<<4) /* Immediate operand. */
-#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */
-#define SrcOne (7<<4) /* Implied '1' */
-#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */
-#define SrcMask (0xf<<4)
-/* Generic ModRM decode. */
-#define ModRM (1<<8)
-/* Destination is only written; never read. */
-#define Mov (1<<9)
-#define BitOp (1<<10)
-#define MemAbs (1<<11) /* Memory operand is absolute displacement */
-#define String (1<<12) /* String instruction (rep capable) */
-#define Stack (1<<13) /* Stack instruction (push/pop) */
-#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
-#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
-#define GroupMask 0xff /* Group number stored in bits 0:7 */
-/* Source 2 operand type */
-#define Src2None (0<<29)
-#define Src2CL (1<<29)
-#define Src2ImmByte (2<<29)
-#define Src2One (3<<29)
-#define Src2Imm16 (4<<29)
-#define Src2Mask (7<<29)
-
-enum {
- Group1_80, Group1_81, Group1_82, Group1_83,
- Group1A, Group3_Byte, Group3, Group4, Group5, Group7,
-};
-
-static u32 opcode_table[256] = {
- /* 0x00 - 0x07 */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0,
- /* 0x08 - 0x0F */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- 0, 0, 0, 0,
- /* 0x10 - 0x17 */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- 0, 0, 0, 0,
- /* 0x18 - 0x1F */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- 0, 0, 0, 0,
- /* 0x20 - 0x27 */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
- /* 0x28 - 0x2F */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- 0, 0, 0, 0,
- /* 0x30 - 0x37 */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- 0, 0, 0, 0,
- /* 0x38 - 0x3F */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
- 0, 0,
- /* 0x40 - 0x47 */
- DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
- /* 0x48 - 0x4F */
- DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
- /* 0x50 - 0x57 */
- SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
- SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
- /* 0x58 - 0x5F */
- DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
- DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
- /* 0x60 - 0x67 */
- 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
- 0, 0, 0, 0,
- /* 0x68 - 0x6F */
- SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0,
- SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */
- SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */
- /* 0x70 - 0x77 */
- SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
- SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
- /* 0x78 - 0x7F */
- SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
- SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
- /* 0x80 - 0x87 */
- Group | Group1_80, Group | Group1_81,
- Group | Group1_82, Group | Group1_83,
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- /* 0x88 - 0x8F */
- ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
- ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstMem | SrcReg | ModRM | Mov, ModRM | DstReg,
- DstReg | SrcMem | ModRM | Mov, Group | Group1A,
- /* 0x90 - 0x97 */
- DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
- /* 0x98 - 0x9F */
- 0, 0, SrcImm | Src2Imm16, 0,
- ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
- /* 0xA0 - 0xA7 */
- ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
- ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
- ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
- ByteOp | ImplicitOps | String, ImplicitOps | String,
- /* 0xA8 - 0xAF */
- 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
- ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
- ByteOp | ImplicitOps | String, ImplicitOps | String,
- /* 0xB0 - 0xB7 */
- ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
- ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
- ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
- ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
- /* 0xB8 - 0xBF */
- DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
- DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
- DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
- DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
- /* 0xC0 - 0xC7 */
- ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
- 0, ImplicitOps | Stack, 0, 0,
- ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
- /* 0xC8 - 0xCF */
- 0, 0, 0, ImplicitOps | Stack,
- ImplicitOps, SrcImmByte, ImplicitOps, ImplicitOps,
- /* 0xD0 - 0xD7 */
- ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
- ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
- 0, 0, 0, 0,
- /* 0xD8 - 0xDF */
- 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xE0 - 0xE7 */
- 0, 0, 0, 0,
- ByteOp | SrcImmUByte, SrcImmUByte,
- ByteOp | SrcImmUByte, SrcImmUByte,
- /* 0xE8 - 0xEF */
- SrcImm | Stack, SrcImm | ImplicitOps,
- SrcImm | Src2Imm16, SrcImmByte | ImplicitOps,
- SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
- SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
- /* 0xF0 - 0xF7 */
- 0, 0, 0, 0,
- ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3,
- /* 0xF8 - 0xFF */
- ImplicitOps, 0, ImplicitOps, ImplicitOps,
- ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
-};
-
-static u32 twobyte_table[256] = {
- /* 0x00 - 0x0F */
- 0, Group | GroupDual | Group7, 0, 0, 0, 0, ImplicitOps, 0,
- ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
- /* 0x10 - 0x1F */
- 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
- /* 0x20 - 0x2F */
- ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x30 - 0x3F */
- ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x40 - 0x47 */
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- /* 0x48 - 0x4F */
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- /* 0x50 - 0x5F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x60 - 0x6F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x70 - 0x7F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x80 - 0x8F */
- SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
- SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
- /* 0x90 - 0x9F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xA0 - 0xA7 */
- 0, 0, 0, DstMem | SrcReg | ModRM | BitOp,
- DstMem | SrcReg | Src2ImmByte | ModRM,
- DstMem | SrcReg | Src2CL | ModRM, 0, 0,
- /* 0xA8 - 0xAF */
- 0, 0, 0, DstMem | SrcReg | ModRM | BitOp,
- DstMem | SrcReg | Src2ImmByte | ModRM,
- DstMem | SrcReg | Src2CL | ModRM,
- ModRM, 0,
- /* 0xB0 - 0xB7 */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
- DstMem | SrcReg | ModRM | BitOp,
- 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem16 | ModRM | Mov,
- /* 0xB8 - 0xBF */
- 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp,
- 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem16 | ModRM | Mov,
- /* 0xC0 - 0xCF */
- 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM,
- 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xD0 - 0xDF */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xE0 - 0xEF */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xF0 - 0xFF */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-};
-
-static u32 group_table[] = {
- [Group1_80*8] =
- ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
- ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
- ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
- ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
- [Group1_81*8] =
- DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
- DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
- DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
- DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
- [Group1_82*8] =
- ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
- ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
- ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
- ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
- [Group1_83*8] =
- DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM,
- DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM,
- DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM,
- DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM,
- [Group1A*8] =
- DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
- [Group3_Byte*8] =
- ByteOp | SrcImm | DstMem | ModRM, 0,
- ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
- 0, 0, 0, 0,
- [Group3*8] =
- DstMem | SrcImm | ModRM, 0,
- DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
- 0, 0, 0, 0,
- [Group4*8] =
- ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
- 0, 0, 0, 0, 0, 0,
- [Group5*8] =
- DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
- SrcMem | ModRM | Stack, 0,
- SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0,
- [Group7*8] =
- 0, 0, ModRM | SrcMem, ModRM | SrcMem,
- SrcNone | ModRM | DstMem | Mov, 0,
- SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp,
-};
-
-static u32 group2_table[] = {
- [Group7*8] =
- SrcNone | ModRM, 0, 0, SrcNone | ModRM,
- SrcNone | ModRM | DstMem | Mov, 0,
- SrcMem16 | ModRM | Mov, 0,
-};
-
-/* EFLAGS bit definitions. */
-#define EFLG_OF (1<<11)
-#define EFLG_DF (1<<10)
-#define EFLG_SF (1<<7)
-#define EFLG_ZF (1<<6)
-#define EFLG_AF (1<<4)
-#define EFLG_PF (1<<2)
-#define EFLG_CF (1<<0)
-
-/*
- * Instruction emulation:
- * Most instructions are emulated directly via a fragment of inline assembly
- * code. This allows us to save/restore EFLAGS and thus very easily pick up
- * any modified flags.
- */
-
-#if defined(CONFIG_X86_64)
-#define _LO32 "k" /* force 32-bit operand */
-#define _STK "%%rsp" /* stack pointer */
-#elif defined(__i386__)
-#define _LO32 "" /* force 32-bit operand */
-#define _STK "%%esp" /* stack pointer */
-#endif
-
-/*
- * These EFLAGS bits are restored from saved value during emulation, and
- * any changes are written back to the saved value after emulation.
- */
-#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF)
-
-/* Before executing instruction: restore necessary bits in EFLAGS. */
-#define _PRE_EFLAGS(_sav, _msk, _tmp) \
- /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \
- "movl %"_sav",%"_LO32 _tmp"; " \
- "push %"_tmp"; " \
- "push %"_tmp"; " \
- "movl %"_msk",%"_LO32 _tmp"; " \
- "andl %"_LO32 _tmp",("_STK"); " \
- "pushf; " \
- "notl %"_LO32 _tmp"; " \
- "andl %"_LO32 _tmp",("_STK"); " \
- "andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); " \
- "pop %"_tmp"; " \
- "orl %"_LO32 _tmp",("_STK"); " \
- "popf; " \
- "pop %"_sav"; "
-
-/* After executing instruction: write-back necessary bits in EFLAGS. */
-#define _POST_EFLAGS(_sav, _msk, _tmp) \
- /* _sav |= EFLAGS & _msk; */ \
- "pushf; " \
- "pop %"_tmp"; " \
- "andl %"_msk",%"_LO32 _tmp"; " \
- "orl %"_LO32 _tmp",%"_sav"; "
-
-#ifdef CONFIG_X86_64
-#define ON64(x) x
-#else
-#define ON64(x)
-#endif
-
-#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix) \
- do { \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0", "4", "2") \
- _op _suffix " %"_x"3,%1; " \
- _POST_EFLAGS("0", "4", "2") \
- : "=m" (_eflags), "=m" ((_dst).val), \
- "=&r" (_tmp) \
- : _y ((_src).val), "i" (EFLAGS_MASK)); \
- } while (0)
-
-
-/* Raw emulation: instruction has two explicit operands. */
-#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \
- do { \
- unsigned long _tmp; \
- \
- switch ((_dst).bytes) { \
- case 2: \
- ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w"); \
- break; \
- case 4: \
- ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l"); \
- break; \
- case 8: \
- ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q")); \
- break; \
- } \
- } while (0)
-
-#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
- do { \
- unsigned long _tmp; \
- switch ((_dst).bytes) { \
- case 1: \
- ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b"); \
- break; \
- default: \
- __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
- _wx, _wy, _lx, _ly, _qx, _qy); \
- break; \
- } \
- } while (0)
-
-/* Source operand is byte-sized and may be restricted to just %cl. */
-#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \
- __emulate_2op(_op, _src, _dst, _eflags, \
- "b", "c", "b", "c", "b", "c", "b", "c")
-
-/* Source operand is byte, word, long or quad sized. */
-#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \
- __emulate_2op(_op, _src, _dst, _eflags, \
- "b", "q", "w", "r", _LO32, "r", "", "r")
-
-/* Source operand is word, long or quad sized. */
-#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \
- __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
- "w", "r", _LO32, "r", "", "r")
-
-/* Instruction has three operands and one operand is stored in ECX register */
-#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \
- do { \
- unsigned long _tmp; \
- _type _clv = (_cl).val; \
- _type _srcv = (_src).val; \
- _type _dstv = (_dst).val; \
- \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0", "5", "2") \
- _op _suffix " %4,%1 \n" \
- _POST_EFLAGS("0", "5", "2") \
- : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \
- : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \
- ); \
- \
- (_cl).val = (unsigned long) _clv; \
- (_src).val = (unsigned long) _srcv; \
- (_dst).val = (unsigned long) _dstv; \
- } while (0)
-
-#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \
- do { \
- switch ((_dst).bytes) { \
- case 2: \
- __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
- "w", unsigned short); \
- break; \
- case 4: \
- __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
- "l", unsigned int); \
- break; \
- case 8: \
- ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
- "q", unsigned long)); \
- break; \
- } \
- } while (0)
-
-#define __emulate_1op(_op, _dst, _eflags, _suffix) \
- do { \
- unsigned long _tmp; \
- \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0", "3", "2") \
- _op _suffix " %1; " \
- _POST_EFLAGS("0", "3", "2") \
- : "=m" (_eflags), "+m" ((_dst).val), \
- "=&r" (_tmp) \
- : "i" (EFLAGS_MASK)); \
- } while (0)
-
-/* Instruction has only one explicit operand (no source operand). */
-#define emulate_1op(_op, _dst, _eflags) \
- do { \
- switch ((_dst).bytes) { \
- case 1: __emulate_1op(_op, _dst, _eflags, "b"); break; \
- case 2: __emulate_1op(_op, _dst, _eflags, "w"); break; \
- case 4: __emulate_1op(_op, _dst, _eflags, "l"); break; \
- case 8: ON64(__emulate_1op(_op, _dst, _eflags, "q")); break; \
- } \
- } while (0)
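/*
 * A minimal, standalone sketch of the pattern the macros above implement:
 * run the arithmetic in inline assembly, then capture the resulting
 * arithmetic flags and fold them into a saved "guest" EFLAGS value.
 * Restoring the guest flags *before* the operation (needed for adc/sbb)
 * is omitted for brevity; emulate_add_u32 is an illustrative name and is
 * not kernel code or part of this diff.
 */
#include <stdint.h>
#include <stdio.h>

#define ARITH_FLAGS 0x8d5UL	/* OF|SF|ZF|AF|PF|CF, same bits as EFLAGS_MASK */

static uint32_t emulate_add_u32(uint32_t src, uint32_t *dst, unsigned long *eflags)
{
	unsigned long flags;

	__asm__ __volatile__(
		"addl %2, %1\n\t"	/* perform the operation            */
		"pushf\n\t"
		"pop %0\n\t"		/* capture the resulting host flags */
		: "=&r" (flags), "+r" (*dst)
		: "r" (src)
		: "cc");

	*eflags = (*eflags & ~ARITH_FLAGS) | (flags & ARITH_FLAGS);
	return *dst;
}

int main(void)
{
	uint32_t dst = 0xffffffffu;
	unsigned long eflags = 0;

	emulate_add_u32(1, &dst, &eflags);		/* 0xffffffff + 1 wraps to 0 */
	printf("dst=%u eflags=%#lx\n", dst, eflags);	/* expect CF and ZF set      */
	return 0;
}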
-
-/* Fetch next part of the instruction being emulated. */
-#define insn_fetch(_type, _size, _eip) \
-({ unsigned long _x; \
- rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \
- if (rc != 0) \
- goto done; \
- (_eip) += (_size); \
- (_type)_x; \
-})
-
-static inline unsigned long ad_mask(struct decode_cache *c)
-{
- return (1UL << (c->ad_bytes << 3)) - 1;
-}
-
-/* Access/update address held in a register, based on addressing mode. */
-static inline unsigned long
-address_mask(struct decode_cache *c, unsigned long reg)
-{
- if (c->ad_bytes == sizeof(unsigned long))
- return reg;
- else
- return reg & ad_mask(c);
-}
-
-static inline unsigned long
-register_address(struct decode_cache *c, unsigned long base, unsigned long reg)
-{
- return base + address_mask(c, reg);
-}
-
-static inline void
-register_address_increment(struct decode_cache *c, unsigned long *reg, int inc)
-{
- if (c->ad_bytes == sizeof(unsigned long))
- *reg += inc;
- else
- *reg = (*reg & ~ad_mask(c)) | ((*reg + inc) & ad_mask(c));
-}
-
-static inline void jmp_rel(struct decode_cache *c, int rel)
-{
- register_address_increment(c, &c->eip, rel);
-}
-
-static void set_seg_override(struct decode_cache *c, int seg)
-{
- c->has_seg_override = true;
- c->seg_override = seg;
-}
-
-static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
-{
- if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
- return 0;
-
- return kvm_x86_ops->get_segment_base(ctxt->vcpu, seg);
-}
-
-static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt,
- struct decode_cache *c)
-{
- if (!c->has_seg_override)
- return 0;
-
- return seg_base(ctxt, c->seg_override);
-}
-
-static unsigned long es_base(struct x86_emulate_ctxt *ctxt)
-{
- return seg_base(ctxt, VCPU_SREG_ES);
-}
-
-static unsigned long ss_base(struct x86_emulate_ctxt *ctxt)
-{
- return seg_base(ctxt, VCPU_SREG_SS);
-}
-
-static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- unsigned long linear, u8 *dest)
-{
- struct fetch_cache *fc = &ctxt->decode.fetch;
- int rc;
- int size;
-
- if (linear < fc->start || linear >= fc->end) {
- size = min(15UL, PAGE_SIZE - offset_in_page(linear));
- rc = ops->read_std(linear, fc->data, size, ctxt->vcpu);
- if (rc)
- return rc;
- fc->start = linear;
- fc->end = linear + size;
- }
- *dest = fc->data[linear - fc->start];
- return 0;
-}
-
-static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- unsigned long eip, void *dest, unsigned size)
-{
- int rc = 0;
-
- eip += ctxt->cs_base;
- while (size--) {
- rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
- if (rc)
- return rc;
- }
- return 0;
-}
-
-/*
- * Given the 'reg' portion of a ModRM byte, and a register block, return a
- * pointer into the block that addresses the relevant register.
- * @highbyte_regs specifies whether to decode AH, CH, DH, BH.
- */
-static void *decode_register(u8 modrm_reg, unsigned long *regs,
- int highbyte_regs)
-{
- void *p;
-
- p = &regs[modrm_reg];
- if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
- p = (unsigned char *)&regs[modrm_reg & 3] + 1;
- return p;
-}
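/*
 * A standalone sketch of the high-byte aliasing described above: with
 * high-byte registers enabled, encodings 4..7 address byte 1 of the
 * first four GPRs (AH, CH, DH, BH). reg_ptr() below mirrors the rule
 * for illustration only; it is not the kernel helper and not part of
 * this diff.
 */
#include <stdint.h>
#include <stdio.h>

static void *reg_ptr(unsigned int modrm_reg, unsigned long *regs, int highbyte_regs)
{
	if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
		return (unsigned char *)&regs[modrm_reg & 3] + 1;
	return &regs[modrm_reg];
}

int main(void)
{
	unsigned long regs[16] = { 0x1234 };	/* regs[0] stands in for RAX  */

	*(uint8_t *)reg_ptr(4, regs, 1) = 0x56;	/* encoding 4 selects AH      */
	printf("RAX = %#lx\n", regs[0]);	/* 0x5634 on little-endian x86 */
	return 0;
}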
-
-static int read_descriptor(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- void *ptr,
- u16 *size, unsigned long *address, int op_bytes)
-{
- int rc;
-
- if (op_bytes == 2)
- op_bytes = 3;
- *address = 0;
- rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
- ctxt->vcpu);
- if (rc)
- return rc;
- rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
- ctxt->vcpu);
- return rc;
-}
-
-static int test_cc(unsigned int condition, unsigned int flags)
-{
- int rc = 0;
-
- switch ((condition & 15) >> 1) {
- case 0: /* o */
- rc |= (flags & EFLG_OF);
- break;
- case 1: /* b/c/nae */
- rc |= (flags & EFLG_CF);
- break;
- case 2: /* z/e */
- rc |= (flags & EFLG_ZF);
- break;
- case 3: /* be/na */
- rc |= (flags & (EFLG_CF|EFLG_ZF));
- break;
- case 4: /* s */
- rc |= (flags & EFLG_SF);
- break;
- case 5: /* p/pe */
- rc |= (flags & EFLG_PF);
- break;
- case 7: /* le/ng */
- rc |= (flags & EFLG_ZF);
- /* fall through */
- case 6: /* l/nge */
- rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF));
- break;
- }
-
- /* Odd condition identifiers (lsb == 1) have inverted sense. */
- return (!!rc ^ (condition & 1));
-}
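/*
 * Worked example of the condition-code mapping above, as a standalone
 * sketch (only the ZF pair used by jz/jnz is reproduced; jcc_taken is an
 * illustrative name, not kernel code): opcode 0x74 (jz) has condition
 * nibble 4, pair index 2 selects ZF, and the clear low bit keeps the
 * sense uninverted, while 0x75 (jnz) inverts it.
 */
#include <stdio.h>

#define EFLG_ZF (1u << 6)

static int jcc_taken(unsigned char opcode, unsigned int flags)
{
	unsigned int cond = opcode & 0x0f;
	int rc = 0;

	if (((cond & 15) >> 1) == 2)	/* z/e pair */
		rc = !!(flags & EFLG_ZF);

	return rc ^ (cond & 1);		/* odd codes invert the sense */
}

int main(void)
{
	printf("jz  with ZF set: %d\n", jcc_taken(0x74, EFLG_ZF));	/* 1: taken     */
	printf("jnz with ZF set: %d\n", jcc_taken(0x75, EFLG_ZF));	/* 0: not taken */
	return 0;
}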
-
-static void decode_register_operand(struct operand *op,
- struct decode_cache *c,
- int inhibit_bytereg)
-{
- unsigned reg = c->modrm_reg;
- int highbyte_regs = c->rex_prefix == 0;
-
- if (!(c->d & ModRM))
- reg = (c->b & 7) | ((c->rex_prefix & 1) << 3);
- op->type = OP_REG;
- if ((c->d & ByteOp) && !inhibit_bytereg) {
- op->ptr = decode_register(reg, c->regs, highbyte_regs);
- op->val = *(u8 *)op->ptr;
- op->bytes = 1;
- } else {
- op->ptr = decode_register(reg, c->regs, 0);
- op->bytes = c->op_bytes;
- switch (op->bytes) {
- case 2:
- op->val = *(u16 *)op->ptr;
- break;
- case 4:
- op->val = *(u32 *)op->ptr;
- break;
- case 8:
- op->val = *(u64 *) op->ptr;
- break;
- }
- }
- op->orig_val = op->val;
-}
-
-static int decode_modrm(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
-{
- struct decode_cache *c = &ctxt->decode;
- u8 sib;
- int index_reg = 0, base_reg = 0, scale;
- int rc = 0;
-
- if (c->rex_prefix) {
- c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */
- index_reg = (c->rex_prefix & 2) << 2; /* REX.X */
- c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REX.B */
- }
-
- c->modrm = insn_fetch(u8, 1, c->eip);
- c->modrm_mod |= (c->modrm & 0xc0) >> 6;
- c->modrm_reg |= (c->modrm & 0x38) >> 3;
- c->modrm_rm |= (c->modrm & 0x07);
- c->modrm_ea = 0;
- c->use_modrm_ea = 1;
-
- if (c->modrm_mod == 3) {
- c->modrm_ptr = decode_register(c->modrm_rm,
- c->regs, c->d & ByteOp);
- c->modrm_val = *(unsigned long *)c->modrm_ptr;
- return rc;
- }
-
- if (c->ad_bytes == 2) {
- unsigned bx = c->regs[VCPU_REGS_RBX];
- unsigned bp = c->regs[VCPU_REGS_RBP];
- unsigned si = c->regs[VCPU_REGS_RSI];
- unsigned di = c->regs[VCPU_REGS_RDI];
-
- /* 16-bit ModR/M decode. */
- switch (c->modrm_mod) {
- case 0:
- if (c->modrm_rm == 6)
- c->modrm_ea += insn_fetch(u16, 2, c->eip);
- break;
- case 1:
- c->modrm_ea += insn_fetch(s8, 1, c->eip);
- break;
- case 2:
- c->modrm_ea += insn_fetch(u16, 2, c->eip);
- break;
- }
- switch (c->modrm_rm) {
- case 0:
- c->modrm_ea += bx + si;
- break;
- case 1:
- c->modrm_ea += bx + di;
- break;
- case 2:
- c->modrm_ea += bp + si;
- break;
- case 3:
- c->modrm_ea += bp + di;
- break;
- case 4:
- c->modrm_ea += si;
- break;
- case 5:
- c->modrm_ea += di;
- break;
- case 6:
- if (c->modrm_mod != 0)
- c->modrm_ea += bp;
- break;
- case 7:
- c->modrm_ea += bx;
- break;
- }
- if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
- (c->modrm_rm == 6 && c->modrm_mod != 0))
- if (!c->has_seg_override)
- set_seg_override(c, VCPU_SREG_SS);
- c->modrm_ea = (u16)c->modrm_ea;
- } else {
- /* 32/64-bit ModR/M decode. */
- if ((c->modrm_rm & 7) == 4) {
- sib = insn_fetch(u8, 1, c->eip);
- index_reg |= (sib >> 3) & 7;
- base_reg |= sib & 7;
- scale = sib >> 6;
-
- if ((base_reg & 7) == 5 && c->modrm_mod == 0)
- c->modrm_ea += insn_fetch(s32, 4, c->eip);
- else
- c->modrm_ea += c->regs[base_reg];
- if (index_reg != 4)
- c->modrm_ea += c->regs[index_reg] << scale;
- } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) {
- if (ctxt->mode == X86EMUL_MODE_PROT64)
- c->rip_relative = 1;
- } else
- c->modrm_ea += c->regs[c->modrm_rm];
- switch (c->modrm_mod) {
- case 0:
- if (c->modrm_rm == 5)
- c->modrm_ea += insn_fetch(s32, 4, c->eip);
- break;
- case 1:
- c->modrm_ea += insn_fetch(s8, 1, c->eip);
- break;
- case 2:
- c->modrm_ea += insn_fetch(s32, 4, c->eip);
- break;
- }
- }
-done:
- return rc;
-}
-
-static int decode_abs(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
-{
- struct decode_cache *c = &ctxt->decode;
- int rc = 0;
-
- switch (c->ad_bytes) {
- case 2:
- c->modrm_ea = insn_fetch(u16, 2, c->eip);
- break;
- case 4:
- c->modrm_ea = insn_fetch(u32, 4, c->eip);
- break;
- case 8:
- c->modrm_ea = insn_fetch(u64, 8, c->eip);
- break;
- }
-done:
- return rc;
-}
-
-int
-x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
-{
- struct decode_cache *c = &ctxt->decode;
- int rc = 0;
- int mode = ctxt->mode;
- int def_op_bytes, def_ad_bytes, group;
-
- /* Shadow copy of register state. Committed on successful emulation. */
-
- memset(c, 0, sizeof(struct decode_cache));
- c->eip = kvm_rip_read(ctxt->vcpu);
- ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
- memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
-
- switch (mode) {
- case X86EMUL_MODE_REAL:
- case X86EMUL_MODE_PROT16:
- def_op_bytes = def_ad_bytes = 2;
- break;
- case X86EMUL_MODE_PROT32:
- def_op_bytes = def_ad_bytes = 4;
- break;
-#ifdef CONFIG_X86_64
- case X86EMUL_MODE_PROT64:
- def_op_bytes = 4;
- def_ad_bytes = 8;
- break;
-#endif
- default:
- return -1;
- }
-
- c->op_bytes = def_op_bytes;
- c->ad_bytes = def_ad_bytes;
-
- /* Legacy prefixes. */
- for (;;) {
- switch (c->b = insn_fetch(u8, 1, c->eip)) {
- case 0x66: /* operand-size override */
- /* switch between 2/4 bytes */
- c->op_bytes = def_op_bytes ^ 6;
- break;
- case 0x67: /* address-size override */
- if (mode == X86EMUL_MODE_PROT64)
- /* switch between 4/8 bytes */
- c->ad_bytes = def_ad_bytes ^ 12;
- else
- /* switch between 2/4 bytes */
- c->ad_bytes = def_ad_bytes ^ 6;
- break;
- case 0x26: /* ES override */
- case 0x2e: /* CS override */
- case 0x36: /* SS override */
- case 0x3e: /* DS override */
- set_seg_override(c, (c->b >> 3) & 3);
- break;
- case 0x64: /* FS override */
- case 0x65: /* GS override */
- set_seg_override(c, c->b & 7);
- break;
- case 0x40 ... 0x4f: /* REX */
- if (mode != X86EMUL_MODE_PROT64)
- goto done_prefixes;
- c->rex_prefix = c->b;
- continue;
- case 0xf0: /* LOCK */
- c->lock_prefix = 1;
- break;
- case 0xf2: /* REPNE/REPNZ */
- c->rep_prefix = REPNE_PREFIX;
- break;
- case 0xf3: /* REP/REPE/REPZ */
- c->rep_prefix = REPE_PREFIX;
- break;
- default:
- goto done_prefixes;
- }
-
- /* Any legacy prefix after a REX prefix nullifies its effect. */
-
- c->rex_prefix = 0;
- }
-
-done_prefixes:
-
- /* REX prefix. */
- if (c->rex_prefix)
- if (c->rex_prefix & 8)
- c->op_bytes = 8; /* REX.W */
-
- /* Opcode byte(s). */
- c->d = opcode_table[c->b];
- if (c->d == 0) {
- /* Two-byte opcode? */
- if (c->b == 0x0f) {
- c->twobyte = 1;
- c->b = insn_fetch(u8, 1, c->eip);
- c->d = twobyte_table[c->b];
- }
- }
-
- if (c->d & Group) {
- group = c->d & GroupMask;
- c->modrm = insn_fetch(u8, 1, c->eip);
- --c->eip;
-
- group = (group << 3) + ((c->modrm >> 3) & 7);
- if ((c->d & GroupDual) && (c->modrm >> 6) == 3)
- c->d = group2_table[group];
- else
- c->d = group_table[group];
- }
-
- /* Unrecognised? */
- if (c->d == 0) {
- DPRINTF("Cannot emulate %02x\n", c->b);
- return -1;
- }
-
- if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
- c->op_bytes = 8;
-
- /* ModRM and SIB bytes. */
- if (c->d & ModRM)
- rc = decode_modrm(ctxt, ops);
- else if (c->d & MemAbs)
- rc = decode_abs(ctxt, ops);
- if (rc)
- goto done;
-
- if (!c->has_seg_override)
- set_seg_override(c, VCPU_SREG_DS);
-
- if (!(!c->twobyte && c->b == 0x8d))
- c->modrm_ea += seg_override_base(ctxt, c);
-
- if (c->ad_bytes != 8)
- c->modrm_ea = (u32)c->modrm_ea;
- /*
- * Decode and fetch the source operand: register, memory
- * or immediate.
- */
- switch (c->d & SrcMask) {
- case SrcNone:
- break;
- case SrcReg:
- decode_register_operand(&c->src, c, 0);
- break;
- case SrcMem16:
- c->src.bytes = 2;
- goto srcmem_common;
- case SrcMem32:
- c->src.bytes = 4;
- goto srcmem_common;
- case SrcMem:
- c->src.bytes = (c->d & ByteOp) ? 1 :
- c->op_bytes;
- /* Don't fetch the address for invlpg: it could be unmapped. */
- if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7)
- break;
- srcmem_common:
- /*
- * For instructions with a ModR/M byte, switch to register
- * access if Mod = 3.
- */
- if ((c->d & ModRM) && c->modrm_mod == 3) {
- c->src.type = OP_REG;
- c->src.val = c->modrm_val;
- c->src.ptr = c->modrm_ptr;
- break;
- }
- c->src.type = OP_MEM;
- break;
- case SrcImm:
- c->src.type = OP_IMM;
- c->src.ptr = (unsigned long *)c->eip;
- c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- if (c->src.bytes == 8)
- c->src.bytes = 4;
- /* NB. Immediates are sign-extended as necessary. */
- switch (c->src.bytes) {
- case 1:
- c->src.val = insn_fetch(s8, 1, c->eip);
- break;
- case 2:
- c->src.val = insn_fetch(s16, 2, c->eip);
- break;
- case 4:
- c->src.val = insn_fetch(s32, 4, c->eip);
- break;
- }
- break;
- case SrcImmByte:
- case SrcImmUByte:
- c->src.type = OP_IMM;
- c->src.ptr = (unsigned long *)c->eip;
- c->src.bytes = 1;
- if ((c->d & SrcMask) == SrcImmByte)
- c->src.val = insn_fetch(s8, 1, c->eip);
- else
- c->src.val = insn_fetch(u8, 1, c->eip);
- break;
- case SrcOne:
- c->src.bytes = 1;
- c->src.val = 1;
- break;
- }
-
- /*
- * Decode and fetch the second source operand: register, memory
- * or immediate.
- */
- switch (c->d & Src2Mask) {
- case Src2None:
- break;
- case Src2CL:
- c->src2.bytes = 1;
- c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8;
- break;
- case Src2ImmByte:
- c->src2.type = OP_IMM;
- c->src2.ptr = (unsigned long *)c->eip;
- c->src2.bytes = 1;
- c->src2.val = insn_fetch(u8, 1, c->eip);
- break;
- case Src2Imm16:
- c->src2.type = OP_IMM;
- c->src2.ptr = (unsigned long *)c->eip;
- c->src2.bytes = 2;
- c->src2.val = insn_fetch(u16, 2, c->eip);
- break;
- case Src2One:
- c->src2.bytes = 1;
- c->src2.val = 1;
- break;
- }
-
- /* Decode and fetch the destination operand: register or memory. */
- switch (c->d & DstMask) {
- case ImplicitOps:
- /* Special instructions do their own operand decoding. */
- return 0;
- case DstReg:
- decode_register_operand(&c->dst, c,
- c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
- break;
- case DstMem:
- if ((c->d & ModRM) && c->modrm_mod == 3) {
- c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->dst.type = OP_REG;
- c->dst.val = c->dst.orig_val = c->modrm_val;
- c->dst.ptr = c->modrm_ptr;
- break;
- }
- c->dst.type = OP_MEM;
- break;
- case DstAcc:
- c->dst.type = OP_REG;
- c->dst.bytes = c->op_bytes;
- c->dst.ptr = &c->regs[VCPU_REGS_RAX];
- switch (c->op_bytes) {
- case 1:
- c->dst.val = *(u8 *)c->dst.ptr;
- break;
- case 2:
- c->dst.val = *(u16 *)c->dst.ptr;
- break;
- case 4:
- c->dst.val = *(u32 *)c->dst.ptr;
- break;
- }
- c->dst.orig_val = c->dst.val;
- break;
- }
-
- if (c->rip_relative)
- c->modrm_ea += c->eip;
-
-done:
- return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
-}
-
-static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
-{
- struct decode_cache *c = &ctxt->decode;
-
- c->dst.type = OP_MEM;
- c->dst.bytes = c->op_bytes;
- c->dst.val = c->src.val;
- register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
- c->dst.ptr = (void *) register_address(c, ss_base(ctxt),
- c->regs[VCPU_REGS_RSP]);
-}
-
-static int emulate_pop(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- void *dest, int len)
-{
- struct decode_cache *c = &ctxt->decode;
- int rc;
-
- rc = ops->read_emulated(register_address(c, ss_base(ctxt),
- c->regs[VCPU_REGS_RSP]),
- dest, len, ctxt->vcpu);
- if (rc != 0)
- return rc;
-
- register_address_increment(c, &c->regs[VCPU_REGS_RSP], len);
- return rc;
-}
-
-static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
-{
- struct decode_cache *c = &ctxt->decode;
- int rc;
-
- rc = emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes);
- if (rc != 0)
- return rc;
- return 0;
-}
-
-static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt)
-{
- struct decode_cache *c = &ctxt->decode;
- switch (c->modrm_reg) {
- case 0: /* rol */
- emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags);
- break;
- case 1: /* ror */
- emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags);
- break;
- case 2: /* rcl */
- emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags);
- break;
- case 3: /* rcr */
- emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags);
- break;
- case 4: /* sal/shl */
- case 6: /* sal/shl */
- emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags);
- break;
- case 5: /* shr */
- emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags);
- break;
- case 7: /* sar */
- emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags);
- break;
- }
-}
-
-static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
-{
- struct decode_cache *c = &ctxt->decode;
- int rc = 0;
-
- switch (c->modrm_reg) {
- case 0 ... 1: /* test */
- emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
- break;
- case 2: /* not */
- c->dst.val = ~c->dst.val;
- break;
- case 3: /* neg */
- emulate_1op("neg", c->dst, ctxt->eflags);
- break;
- default:
- DPRINTF("Cannot emulate %02x\n", c->b);
- rc = X86EMUL_UNHANDLEABLE;
- break;
- }
- return rc;
-}
-
-static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
-{
- struct decode_cache *c = &ctxt->decode;
-
- switch (c->modrm_reg) {
- case 0: /* inc */
- emulate_1op("inc", c->dst, ctxt->eflags);
- break;
- case 1: /* dec */
- emulate_1op("dec", c->dst, ctxt->eflags);
- break;
- case 2: /* call near abs */ {
- long int old_eip;
- old_eip = c->eip;
- c->eip = c->src.val;
- c->src.val = old_eip;
- emulate_push(ctxt);
- break;
- }
- case 4: /* jmp abs */
- c->eip = c->src.val;
- break;
- case 6: /* push */
- emulate_push(ctxt);
- break;
- }
- return 0;
-}
-
-static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- unsigned long memop)
-{
- struct decode_cache *c = &ctxt->decode;
- u64 old, new;
- int rc;
-
- rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu);
- if (rc != 0)
- return rc;
-
- if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
- ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) {
-
- c->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
- c->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
- ctxt->eflags &= ~EFLG_ZF;
-
- } else {
- new = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
- (u32) c->regs[VCPU_REGS_RBX];
-
- rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu);
- if (rc != 0)
- return rc;
- ctxt->eflags |= EFLG_ZF;
- }
- return 0;
-}
-
-static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
-{
- struct decode_cache *c = &ctxt->decode;
- int rc;
- unsigned long cs;
-
- rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes);
- if (rc)
- return rc;
- if (c->op_bytes == 4)
- c->eip = (u32)c->eip;
- rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
- if (rc)
- return rc;
- rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, 1, VCPU_SREG_CS);
- return rc;
-}
-
-static inline int writeback(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
-{
- int rc;
- struct decode_cache *c = &ctxt->decode;
-
- switch (c->dst.type) {
- case OP_REG:
- /* The 4-byte case *is* correct:
- * in 64-bit mode we zero-extend.
- */
- switch (c->dst.bytes) {
- case 1:
- *(u8 *)c->dst.ptr = (u8)c->dst.val;
- break;
- case 2:
- *(u16 *)c->dst.ptr = (u16)c->dst.val;
- break;
- case 4:
- *c->dst.ptr = (u32)c->dst.val;
- break; /* 64b: zero-ext */
- case 8:
- *c->dst.ptr = c->dst.val;
- break;
- }
- break;
- case OP_MEM:
- if (c->lock_prefix)
- rc = ops->cmpxchg_emulated(
- (unsigned long)c->dst.ptr,
- &c->dst.orig_val,
- &c->dst.val,
- c->dst.bytes,
- ctxt->vcpu);
- else
- rc = ops->write_emulated(
- (unsigned long)c->dst.ptr,
- &c->dst.val,
- c->dst.bytes,
- ctxt->vcpu);
- if (rc != 0)
- return rc;
- break;
- case OP_NONE:
- /* no writeback */
- break;
- default:
- break;
- }
- return 0;
-}
-
-static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
-{
- u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask);
- /*
- * An sti; sti sequence only disables interrupts for the first
- * instruction. So, if the last instruction, be it emulated or
- * not, left the system with the INT_STI flag enabled, it
- * means that the last instruction was an sti. We should not
- * leave the flag on in this case. The same goes for mov ss.
- */
- if (!(int_shadow & mask))
- ctxt->interruptibility = mask;
-}
-
-int
-x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
-{
- unsigned long memop = 0;
- u64 msr_data;
- unsigned long saved_eip = 0;
- struct decode_cache *c = &ctxt->decode;
- unsigned int port;
- int io_dir_in;
- int rc = 0;
-
- ctxt->interruptibility = 0;
-
- /* Shadow copy of register state. Committed on successful emulation.
- * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
- * modify them.
- */
-
- memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
- saved_eip = c->eip;
-
- if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs))
- memop = c->modrm_ea;
-
- if (c->rep_prefix && (c->d & String)) {
- /* All REP prefixes have the same first termination condition */
- if (c->regs[VCPU_REGS_RCX] == 0) {
- kvm_rip_write(ctxt->vcpu, c->eip);
- goto done;
- }
- /* The second termination condition only applies for REPE
- * and REPNE. Test if the repeat string operation prefix is
- * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the
- * corresponding termination condition according to:
- * - if REPE/REPZ and ZF = 0 then done
- * - if REPNE/REPNZ and ZF = 1 then done
- */
- if ((c->b == 0xa6) || (c->b == 0xa7) ||
- (c->b == 0xae) || (c->b == 0xaf)) {
- if ((c->rep_prefix == REPE_PREFIX) &&
- ((ctxt->eflags & EFLG_ZF) == 0)) {
- kvm_rip_write(ctxt->vcpu, c->eip);
- goto done;
- }
- if ((c->rep_prefix == REPNE_PREFIX) &&
- ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) {
- kvm_rip_write(ctxt->vcpu, c->eip);
- goto done;
- }
- }
- c->regs[VCPU_REGS_RCX]--;
- c->eip = kvm_rip_read(ctxt->vcpu);
- }
-
- if (c->src.type == OP_MEM) {
- c->src.ptr = (unsigned long *)memop;
- c->src.val = 0;
- rc = ops->read_emulated((unsigned long)c->src.ptr,
- &c->src.val,
- c->src.bytes,
- ctxt->vcpu);
- if (rc != 0)
- goto done;
- c->src.orig_val = c->src.val;
- }
-
- if ((c->d & DstMask) == ImplicitOps)
- goto special_insn;
-
-
- if (c->dst.type == OP_MEM) {
- c->dst.ptr = (unsigned long *)memop;
- c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->dst.val = 0;
- if (c->d & BitOp) {
- unsigned long mask = ~(c->dst.bytes * 8 - 1);
-
- c->dst.ptr = (void *)c->dst.ptr +
- (c->src.val & mask) / 8;
- }
- if (!(c->d & Mov) &&
- /* optimisation - avoid slow emulated read */
- ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
- &c->dst.val,
- c->dst.bytes, ctxt->vcpu)) != 0))
- goto done;
- }
- c->dst.orig_val = c->dst.val;
-
-special_insn:
-
- if (c->twobyte)
- goto twobyte_insn;
-
- switch (c->b) {
- case 0x00 ... 0x05:
- add: /* add */
- emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
- break;
- case 0x08 ... 0x0d:
- or: /* or */
- emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
- break;
- case 0x10 ... 0x15:
- adc: /* adc */
- emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
- break;
- case 0x18 ... 0x1d:
- sbb: /* sbb */
- emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
- break;
- case 0x20 ... 0x25:
- and: /* and */
- emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
- break;
- case 0x28 ... 0x2d:
- sub: /* sub */
- emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
- break;
- case 0x30 ... 0x35:
- xor: /* xor */
- emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags);
- break;
- case 0x38 ... 0x3d:
- cmp: /* cmp */
- emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
- break;
- case 0x40 ... 0x47: /* inc r16/r32 */
- emulate_1op("inc", c->dst, ctxt->eflags);
- break;
- case 0x48 ... 0x4f: /* dec r16/r32 */
- emulate_1op("dec", c->dst, ctxt->eflags);
- break;
- case 0x50 ... 0x57: /* push reg */
- emulate_push(ctxt);
- break;
- case 0x58 ... 0x5f: /* pop reg */
- pop_instruction:
- rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes);
- if (rc != 0)
- goto done;
- break;
- case 0x63: /* movsxd */
- if (ctxt->mode != X86EMUL_MODE_PROT64)
- goto cannot_emulate;
- c->dst.val = (s32) c->src.val;
- break;
- case 0x68: /* push imm */
- case 0x6a: /* push imm8 */
- emulate_push(ctxt);
- break;
- case 0x6c: /* insb */
- case 0x6d: /* insw/insd */
- if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
- 1,
- (c->d & ByteOp) ? 1 : c->op_bytes,
- c->rep_prefix ?
- address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
- (ctxt->eflags & EFLG_DF),
- register_address(c, es_base(ctxt),
- c->regs[VCPU_REGS_RDI]),
- c->rep_prefix,
- c->regs[VCPU_REGS_RDX]) == 0) {
- c->eip = saved_eip;
- return -1;
- }
- return 0;
- case 0x6e: /* outsb */
- case 0x6f: /* outsw/outsd */
- if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
- 0,
- (c->d & ByteOp) ? 1 : c->op_bytes,
- c->rep_prefix ?
- address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
- (ctxt->eflags & EFLG_DF),
- register_address(c,
- seg_override_base(ctxt, c),
- c->regs[VCPU_REGS_RSI]),
- c->rep_prefix,
- c->regs[VCPU_REGS_RDX]) == 0) {
- c->eip = saved_eip;
- return -1;
- }
- return 0;
- case 0x70 ... 0x7f: /* jcc (short) */
- if (test_cc(c->b, ctxt->eflags))
- jmp_rel(c, c->src.val);
- break;
- case 0x80 ... 0x83: /* Grp1 */
- switch (c->modrm_reg) {
- case 0:
- goto add;
- case 1:
- goto or;
- case 2:
- goto adc;
- case 3:
- goto sbb;
- case 4:
- goto and;
- case 5:
- goto sub;
- case 6:
- goto xor;
- case 7:
- goto cmp;
- }
- break;
- case 0x84 ... 0x85:
- emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
- break;
- case 0x86 ... 0x87: /* xchg */
- xchg:
- /* Write back the register source. */
- switch (c->dst.bytes) {
- case 1:
- *(u8 *) c->src.ptr = (u8) c->dst.val;
- break;
- case 2:
- *(u16 *) c->src.ptr = (u16) c->dst.val;
- break;
- case 4:
- *c->src.ptr = (u32) c->dst.val;
- break; /* 64b reg: zero-extend */
- case 8:
- *c->src.ptr = c->dst.val;
- break;
- }
- /*
- * Write back the memory destination with implicit LOCK
- * prefix.
- */
- c->dst.val = c->src.val;
- c->lock_prefix = 1;
- break;
- case 0x88 ... 0x8b: /* mov */
- goto mov;
- case 0x8c: { /* mov r/m, sreg */
- struct kvm_segment segreg;
-
- if (c->modrm_reg <= 5)
- kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg);
- else {
- printk(KERN_INFO "0x8c: Invalid segreg in modrm byte 0x%02x\n",
- c->modrm);
- goto cannot_emulate;
- }
- c->dst.val = segreg.selector;
- break;
- }
- case 0x8d: /* lea r16/r32, m */
- c->dst.val = c->modrm_ea;
- break;
- case 0x8e: { /* mov seg, r/m16 */
- uint16_t sel;
- int type_bits;
- int err;
-
- sel = c->src.val;
- if (c->modrm_reg == VCPU_SREG_SS)
- toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS);
-
- if (c->modrm_reg <= 5) {
- type_bits = (c->modrm_reg == 1) ? 9 : 1;
- err = kvm_load_segment_descriptor(ctxt->vcpu, sel,
- type_bits, c->modrm_reg);
- } else {
- printk(KERN_INFO "Invalid segreg in modrm byte 0x%02x\n",
- c->modrm);
- goto cannot_emulate;
- }
-
- if (err < 0)
- goto cannot_emulate;
-
- c->dst.type = OP_NONE; /* Disable writeback. */
- break;
- }
- case 0x8f: /* pop (sole member of Grp1a) */
- rc = emulate_grp1a(ctxt, ops);
- if (rc != 0)
- goto done;
- break;
- case 0x90: /* nop / xchg r8,rax */
- if (!(c->rex_prefix & 1)) { /* nop */
- c->dst.type = OP_NONE;
- break;
- }
- case 0x91 ... 0x97: /* xchg reg,rax */
- c->src.type = c->dst.type = OP_REG;
- c->src.bytes = c->dst.bytes = c->op_bytes;
- c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX];
- c->src.val = *(c->src.ptr);
- goto xchg;
- case 0x9c: /* pushf */
- c->src.val = (unsigned long) ctxt->eflags;
- emulate_push(ctxt);
- break;
- case 0x9d: /* popf */
- c->dst.type = OP_REG;
- c->dst.ptr = (unsigned long *) &ctxt->eflags;
- c->dst.bytes = c->op_bytes;
- goto pop_instruction;
- case 0xa0 ... 0xa1: /* mov */
- c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
- c->dst.val = c->src.val;
- break;
- case 0xa2 ... 0xa3: /* mov */
- c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX];
- break;
- case 0xa4 ... 0xa5: /* movs */
- c->dst.type = OP_MEM;
- c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->dst.ptr = (unsigned long *)register_address(c,
- es_base(ctxt),
- c->regs[VCPU_REGS_RDI]);
- if ((rc = ops->read_emulated(register_address(c,
- seg_override_base(ctxt, c),
- c->regs[VCPU_REGS_RSI]),
- &c->dst.val,
- c->dst.bytes, ctxt->vcpu)) != 0)
- goto done;
- register_address_increment(c, &c->regs[VCPU_REGS_RSI],
- (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
- : c->dst.bytes);
- register_address_increment(c, &c->regs[VCPU_REGS_RDI],
- (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
- : c->dst.bytes);
- break;
- case 0xa6 ... 0xa7: /* cmps */
- c->src.type = OP_NONE; /* Disable writeback. */
- c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->src.ptr = (unsigned long *)register_address(c,
- seg_override_base(ctxt, c),
- c->regs[VCPU_REGS_RSI]);
- if ((rc = ops->read_emulated((unsigned long)c->src.ptr,
- &c->src.val,
- c->src.bytes,
- ctxt->vcpu)) != 0)
- goto done;
-
- c->dst.type = OP_NONE; /* Disable writeback. */
- c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->dst.ptr = (unsigned long *)register_address(c,
- es_base(ctxt),
- c->regs[VCPU_REGS_RDI]);
- if ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
- &c->dst.val,
- c->dst.bytes,
- ctxt->vcpu)) != 0)
- goto done;
-
- DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
-
- emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
-
- register_address_increment(c, &c->regs[VCPU_REGS_RSI],
- (ctxt->eflags & EFLG_DF) ? -c->src.bytes
- : c->src.bytes);
- register_address_increment(c, &c->regs[VCPU_REGS_RDI],
- (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
- : c->dst.bytes);
-
- break;
- case 0xaa ... 0xab: /* stos */
- c->dst.type = OP_MEM;
- c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->dst.ptr = (unsigned long *)register_address(c,
- es_base(ctxt),
- c->regs[VCPU_REGS_RDI]);
- c->dst.val = c->regs[VCPU_REGS_RAX];
- register_address_increment(c, &c->regs[VCPU_REGS_RDI],
- (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
- : c->dst.bytes);
- break;
- case 0xac ... 0xad: /* lods */
- c->dst.type = OP_REG;
- c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
- if ((rc = ops->read_emulated(register_address(c,
- seg_override_base(ctxt, c),
- c->regs[VCPU_REGS_RSI]),
- &c->dst.val,
- c->dst.bytes,
- ctxt->vcpu)) != 0)
- goto done;
- register_address_increment(c, &c->regs[VCPU_REGS_RSI],
- (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
- : c->dst.bytes);
- break;
- case 0xae ... 0xaf: /* scas */
- DPRINTF("Urk! I don't handle SCAS.\n");
- goto cannot_emulate;
- case 0xb0 ... 0xbf: /* mov r, imm */
- goto mov;
- case 0xc0 ... 0xc1:
- emulate_grp2(ctxt);
- break;
- case 0xc3: /* ret */
- c->dst.type = OP_REG;
- c->dst.ptr = &c->eip;
- c->dst.bytes = c->op_bytes;
- goto pop_instruction;
- case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */
- mov:
- c->dst.val = c->src.val;
- break;
- case 0xcb: /* ret far */
- rc = emulate_ret_far(ctxt, ops);
- if (rc)
- goto done;
- break;
- case 0xd0 ... 0xd1: /* Grp2 */
- c->src.val = 1;
- emulate_grp2(ctxt);
- break;
- case 0xd2 ... 0xd3: /* Grp2 */
- c->src.val = c->regs[VCPU_REGS_RCX];
- emulate_grp2(ctxt);
- break;
- case 0xe4: /* inb */
- case 0xe5: /* in */
- port = c->src.val;
- io_dir_in = 1;
- goto do_io;
- case 0xe6: /* outb */
- case 0xe7: /* out */
- port = c->src.val;
- io_dir_in = 0;
- goto do_io;
- case 0xe8: /* call (near) */ {
- long int rel = c->src.val;
- c->src.val = (unsigned long) c->eip;
- jmp_rel(c, rel);
- emulate_push(ctxt);
- break;
- }
- case 0xe9: /* jmp rel */
- goto jmp;
- case 0xea: /* jmp far */
- if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, 9,
- VCPU_SREG_CS) < 0) {
- DPRINTF("jmp far: Failed to load CS descriptor\n");
- goto cannot_emulate;
- }
-
- c->eip = c->src.val;
- break;
- case 0xeb:
- jmp: /* jmp rel short */
- jmp_rel(c, c->src.val);
- c->dst.type = OP_NONE; /* Disable writeback. */
- break;
- case 0xec: /* in al,dx */
- case 0xed: /* in (e/r)ax,dx */
- port = c->regs[VCPU_REGS_RDX];
- io_dir_in = 1;
- goto do_io;
- case 0xee: /* out al,dx */
- case 0xef: /* out (e/r)ax,dx */
- port = c->regs[VCPU_REGS_RDX];
- io_dir_in = 0;
- do_io: if (kvm_emulate_pio(ctxt->vcpu, NULL, io_dir_in,
- (c->d & ByteOp) ? 1 : c->op_bytes,
- port) != 0) {
- c->eip = saved_eip;
- goto cannot_emulate;
- }
- break;
- case 0xf4: /* hlt */
- ctxt->vcpu->arch.halt_request = 1;
- break;
- case 0xf5: /* cmc */
- /* complement carry flag from eflags reg */
- ctxt->eflags ^= EFLG_CF;
- c->dst.type = OP_NONE; /* Disable writeback. */
- break;
- case 0xf6 ... 0xf7: /* Grp3 */
- rc = emulate_grp3(ctxt, ops);
- if (rc != 0)
- goto done;
- break;
- case 0xf8: /* clc */
- ctxt->eflags &= ~EFLG_CF;
- c->dst.type = OP_NONE; /* Disable writeback. */
- break;
- case 0xfa: /* cli */
- ctxt->eflags &= ~X86_EFLAGS_IF;
- c->dst.type = OP_NONE; /* Disable writeback. */
- break;
- case 0xfb: /* sti */
- toggle_interruptibility(ctxt, X86_SHADOW_INT_STI);
- ctxt->eflags |= X86_EFLAGS_IF;
- c->dst.type = OP_NONE; /* Disable writeback. */
- break;
- case 0xfc: /* cld */
- ctxt->eflags &= ~EFLG_DF;
- c->dst.type = OP_NONE; /* Disable writeback. */
- break;
- case 0xfd: /* std */
- ctxt->eflags |= EFLG_DF;
- c->dst.type = OP_NONE; /* Disable writeback. */
- break;
- case 0xfe ... 0xff: /* Grp4/Grp5 */
- rc = emulate_grp45(ctxt, ops);
- if (rc != 0)
- goto done;
- break;
- }
-
-writeback:
- rc = writeback(ctxt, ops);
- if (rc != 0)
- goto done;
-
- /* Commit shadow register state. */
- memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
- kvm_rip_write(ctxt->vcpu, c->eip);
-
-done:
- if (rc == X86EMUL_UNHANDLEABLE) {
- c->eip = saved_eip;
- return -1;
- }
- return 0;
-
-twobyte_insn:
- switch (c->b) {
- case 0x01: /* lgdt, lidt, lmsw */
- switch (c->modrm_reg) {
- u16 size;
- unsigned long address;
-
- case 0: /* vmcall */
- if (c->modrm_mod != 3 || c->modrm_rm != 1)
- goto cannot_emulate;
-
- rc = kvm_fix_hypercall(ctxt->vcpu);
- if (rc)
- goto done;
-
- /* Let the processor re-execute the fixed hypercall */
- c->eip = kvm_rip_read(ctxt->vcpu);
- /* Disable writeback. */
- c->dst.type = OP_NONE;
- break;
- case 2: /* lgdt */
- rc = read_descriptor(ctxt, ops, c->src.ptr,
- &size, &address, c->op_bytes);
- if (rc)
- goto done;
- realmode_lgdt(ctxt->vcpu, size, address);
- /* Disable writeback. */
- c->dst.type = OP_NONE;
- break;
- case 3: /* lidt/vmmcall */
- if (c->modrm_mod == 3) {
- switch (c->modrm_rm) {
- case 1:
- rc = kvm_fix_hypercall(ctxt->vcpu);
- if (rc)
- goto done;
- break;
- default:
- goto cannot_emulate;
- }
- } else {
- rc = read_descriptor(ctxt, ops, c->src.ptr,
- &size, &address,
- c->op_bytes);
- if (rc)
- goto done;
- realmode_lidt(ctxt->vcpu, size, address);
- }
- /* Disable writeback. */
- c->dst.type = OP_NONE;
- break;
- case 4: /* smsw */
- c->dst.bytes = 2;
- c->dst.val = realmode_get_cr(ctxt->vcpu, 0);
- break;
- case 6: /* lmsw */
- realmode_lmsw(ctxt->vcpu, (u16)c->src.val,
- &ctxt->eflags);
- c->dst.type = OP_NONE;
- break;
- case 7: /* invlpg */
- emulate_invlpg(ctxt->vcpu, memop);
- /* Disable writeback. */
- c->dst.type = OP_NONE;
- break;
- default:
- goto cannot_emulate;
- }
- break;
- case 0x06:
- emulate_clts(ctxt->vcpu);
- c->dst.type = OP_NONE;
- break;
- case 0x08: /* invd */
- case 0x09: /* wbinvd */
- case 0x0d: /* GrpP (prefetch) */
- case 0x18: /* Grp16 (prefetch/nop) */
- c->dst.type = OP_NONE;
- break;
- case 0x20: /* mov cr, reg */
- if (c->modrm_mod != 3)
- goto cannot_emulate;
- c->regs[c->modrm_rm] =
- realmode_get_cr(ctxt->vcpu, c->modrm_reg);
- c->dst.type = OP_NONE; /* no writeback */
- break;
- case 0x21: /* mov from dr to reg */
- if (c->modrm_mod != 3)
- goto cannot_emulate;
- rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]);
- if (rc)
- goto cannot_emulate;
- c->dst.type = OP_NONE; /* no writeback */
- break;
- case 0x22: /* mov reg, cr */
- if (c->modrm_mod != 3)
- goto cannot_emulate;
- realmode_set_cr(ctxt->vcpu,
- c->modrm_reg, c->modrm_val, &ctxt->eflags);
- c->dst.type = OP_NONE;
- break;
- case 0x23: /* mov from reg to dr */
- if (c->modrm_mod != 3)
- goto cannot_emulate;
- rc = emulator_set_dr(ctxt, c->modrm_reg,
- c->regs[c->modrm_rm]);
- if (rc)
- goto cannot_emulate;
- c->dst.type = OP_NONE; /* no writeback */
- break;
- case 0x30:
- /* wrmsr */
- msr_data = (u32)c->regs[VCPU_REGS_RAX]
- | ((u64)c->regs[VCPU_REGS_RDX] << 32);
- rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data);
- if (rc) {
- kvm_inject_gp(ctxt->vcpu, 0);
- c->eip = kvm_rip_read(ctxt->vcpu);
- }
- rc = X86EMUL_CONTINUE;
- c->dst.type = OP_NONE;
- break;
- case 0x32:
- /* rdmsr */
- rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data);
- if (rc) {
- kvm_inject_gp(ctxt->vcpu, 0);
- c->eip = kvm_rip_read(ctxt->vcpu);
- } else {
- c->regs[VCPU_REGS_RAX] = (u32)msr_data;
- c->regs[VCPU_REGS_RDX] = msr_data >> 32;
- }
- rc = X86EMUL_CONTINUE;
- c->dst.type = OP_NONE;
- break;
- case 0x40 ... 0x4f: /* cmov */
- c->dst.val = c->dst.orig_val = c->src.val;
- if (!test_cc(c->b, ctxt->eflags))
- c->dst.type = OP_NONE; /* no writeback */
- break;
- case 0x80 ... 0x8f: /* jnz rel, etc. */
- if (test_cc(c->b, ctxt->eflags))
- jmp_rel(c, c->src.val);
- c->dst.type = OP_NONE;
- break;
- case 0xa3:
- bt: /* bt */
- c->dst.type = OP_NONE;
- /* only subword offset */
- c->src.val &= (c->dst.bytes << 3) - 1;
- emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags);
- break;
- case 0xa4: /* shld imm8, r, r/m */
- case 0xa5: /* shld cl, r, r/m */
- emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
- break;
- case 0xab:
- bts: /* bts */
- /* only subword offset */
- c->src.val &= (c->dst.bytes << 3) - 1;
- emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
- break;
- case 0xac: /* shrd imm8, r, r/m */
- case 0xad: /* shrd cl, r, r/m */
- emulate_2op_cl("shrd", c->src2, c->src, c->dst, ctxt->eflags);
- break;
- case 0xae: /* clflush */
- break;
- case 0xb0 ... 0xb1: /* cmpxchg */
- /*
- * Save real source value, then compare EAX against
- * destination.
- */
- c->src.orig_val = c->src.val;
- c->src.val = c->regs[VCPU_REGS_RAX];
- emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
- if (ctxt->eflags & EFLG_ZF) {
- /* Success: write back to memory. */
- c->dst.val = c->src.orig_val;
- } else {
- /* Failure: write the value we saw to EAX. */
- c->dst.type = OP_REG;
- c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
- }
- break;
- case 0xb3:
- btr: /* btr */
- /* only subword offset */
- c->src.val &= (c->dst.bytes << 3) - 1;
- emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags);
- break;
- case 0xb6 ... 0xb7: /* movzx */
- c->dst.bytes = c->op_bytes;
- c->dst.val = (c->d & ByteOp) ? (u8) c->src.val
- : (u16) c->src.val;
- break;
- case 0xba: /* Grp8 */
- switch (c->modrm_reg & 3) {
- case 0:
- goto bt;
- case 1:
- goto bts;
- case 2:
- goto btr;
- case 3:
- goto btc;
- }
- break;
- case 0xbb:
- btc: /* btc */
- /* only subword offset */
- c->src.val &= (c->dst.bytes << 3) - 1;
- emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags);
- break;
- case 0xbe ... 0xbf: /* movsx */
- c->dst.bytes = c->op_bytes;
- c->dst.val = (c->d & ByteOp) ? (s8) c->src.val :
- (s16) c->src.val;
- break;
- case 0xc3: /* movnti */
- c->dst.bytes = c->op_bytes;
- c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val :
- (u64) c->src.val;
- break;
- case 0xc7: /* Grp9 (cmpxchg8b) */
- rc = emulate_grp9(ctxt, ops, memop);
- if (rc != 0)
- goto done;
- c->dst.type = OP_NONE;
- break;
- }
- goto writeback;
-
-cannot_emulate:
- DPRINTF("Cannot emulate %02x\n", c->b);
- c->eip = saved_eip;
- return -1;
-}
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
new file mode 100644
index 000000000000..d6b2a665b499
--- /dev/null
+++ b/arch/x86/kvm/xen.c
@@ -0,0 +1,2349 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright © 2019 Oracle and/or its affiliates. All rights reserved.
+ * Copyright © 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * KVM Xen emulation
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include "x86.h"
+#include "xen.h"
+#include "hyperv.h"
+#include "irq.h"
+
+#include <linux/eventfd.h>
+#include <linux/kvm_host.h>
+#include <linux/sched/stat.h>
+
+#include <trace/events/kvm.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/vcpu.h>
+#include <xen/interface/version.h>
+#include <xen/interface/event_channel.h>
+#include <xen/interface/sched.h>
+
+#include <asm/xen/cpuid.h>
+#include <asm/pvclock.h>
+
+#include "cpuid.h"
+#include "trace.h"
+
+static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm);
+static int kvm_xen_setattr_evtchn(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
+static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r);
+
+DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ);
+
+static int kvm_xen_shared_info_init(struct kvm *kvm)
+{
+ struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
+ struct pvclock_wall_clock *wc;
+ u32 *wc_sec_hi;
+ u32 wc_version;
+ u64 wall_nsec;
+ int ret = 0;
+ int idx = srcu_read_lock(&kvm->srcu);
+
+ read_lock_irq(&gpc->lock);
+ while (!kvm_gpc_check(gpc, PAGE_SIZE)) {
+ read_unlock_irq(&gpc->lock);
+
+ ret = kvm_gpc_refresh(gpc, PAGE_SIZE);
+ if (ret)
+ goto out;
+
+ read_lock_irq(&gpc->lock);
+ }
+
+ /*
+ * This code mirrors kvm_write_wall_clock() except that it writes
+ * directly through the pfn cache and doesn't mark the page dirty.
+ */
+ wall_nsec = kvm_get_wall_clock_epoch(kvm);
+
+ /* Paranoia checks on the 32-bit struct layout */
+ BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900);
+ BUILD_BUG_ON(offsetof(struct compat_shared_info, arch.wc_sec_hi) != 0x924);
+ BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
+
+#ifdef CONFIG_X86_64
+ /* Paranoia checks on the 64-bit struct layout */
+ BUILD_BUG_ON(offsetof(struct shared_info, wc) != 0xc00);
+ BUILD_BUG_ON(offsetof(struct shared_info, wc_sec_hi) != 0xc0c);
+
+ if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
+ struct shared_info *shinfo = gpc->khva;
+
+ wc_sec_hi = &shinfo->wc_sec_hi;
+ wc = &shinfo->wc;
+ } else
+#endif
+ {
+ struct compat_shared_info *shinfo = gpc->khva;
+
+ wc_sec_hi = &shinfo->arch.wc_sec_hi;
+ wc = &shinfo->wc;
+ }
+
+ /* Increment and ensure an odd value */
+ wc_version = wc->version = (wc->version + 1) | 1;
+ smp_wmb();
+
+ wc->nsec = do_div(wall_nsec, NSEC_PER_SEC);
+ wc->sec = (u32)wall_nsec;
+ *wc_sec_hi = wall_nsec >> 32;
+ smp_wmb();
+
+ wc->version = wc_version + 1;
+ read_unlock_irq(&gpc->lock);
+
+ kvm_make_all_cpus_request(kvm, KVM_REQ_MASTERCLOCK_UPDATE);
+
+out:
+ srcu_read_unlock(&kvm->srcu, idx);
+ return ret;
+}
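/*
 * The odd/even wc->version dance above is the usual pvclock publication
 * protocol: the writer bumps the version to an odd value, updates the
 * fields, then bumps it again to an even value. A guest-side reader
 * retries while the version is odd or changes mid-read. The sketch below
 * is illustrative only (struct and function names are made up, not part
 * of this diff); a plain compiler barrier stands in for the read barrier,
 * which suffices on x86 where loads are not reordered with loads.
 */
#include <stdint.h>
#include <stdio.h>

struct wc_snapshot {
	uint32_t version;
	uint32_t sec;
	uint32_t nsec;
};

static void read_wall_clock(const volatile struct wc_snapshot *wc,
			    uint32_t *sec, uint32_t *nsec)
{
	uint32_t v;

	do {
		v = wc->version;
		__asm__ __volatile__("" ::: "memory");
		*sec  = wc->sec;
		*nsec = wc->nsec;
		__asm__ __volatile__("" ::: "memory");
	} while ((v & 1) || v != wc->version);
}

int main(void)
{
	struct wc_snapshot wc = { .version = 2, .sec = 1700000000u, .nsec = 0 };
	uint32_t sec, nsec;

	read_wall_clock(&wc, &sec, &nsec);	/* even, stable version: no retry */
	printf("wall clock: %u.%09u\n", sec, nsec);
	return 0;
}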
+
+void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu)
+{
+ if (atomic_read(&vcpu->arch.xen.timer_pending) > 0) {
+ struct kvm_xen_evtchn e;
+
+ e.vcpu_id = vcpu->vcpu_id;
+ e.vcpu_idx = vcpu->vcpu_idx;
+ e.port = vcpu->arch.xen.timer_virq;
+ e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
+
+ kvm_xen_set_evtchn(&e, vcpu->kvm);
+
+ vcpu->arch.xen.timer_expires = 0;
+ atomic_set(&vcpu->arch.xen.timer_pending, 0);
+ }
+}
+
+static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
+{
+ struct kvm_vcpu *vcpu = container_of(timer, struct kvm_vcpu,
+ arch.xen.timer);
+ struct kvm_xen_evtchn e;
+ int rc;
+
+ if (atomic_read(&vcpu->arch.xen.timer_pending))
+ return HRTIMER_NORESTART;
+
+ e.vcpu_id = vcpu->vcpu_id;
+ e.vcpu_idx = vcpu->vcpu_idx;
+ e.port = vcpu->arch.xen.timer_virq;
+ e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
+
+ rc = kvm_xen_set_evtchn_fast(&e, vcpu->kvm);
+ if (rc != -EWOULDBLOCK) {
+ vcpu->arch.xen.timer_expires = 0;
+ return HRTIMER_NORESTART;
+ }
+
+ atomic_inc(&vcpu->arch.xen.timer_pending);
+ kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
+ kvm_vcpu_kick(vcpu);
+
+ return HRTIMER_NORESTART;
+}
+
+static int xen_get_guest_pvclock(struct kvm_vcpu *vcpu,
+ struct pvclock_vcpu_time_info *hv_clock,
+ struct gfn_to_pfn_cache *gpc,
+ unsigned int offset)
+{
+ unsigned long flags;
+ int r;
+
+ read_lock_irqsave(&gpc->lock, flags);
+ while (!kvm_gpc_check(gpc, offset + sizeof(*hv_clock))) {
+ read_unlock_irqrestore(&gpc->lock, flags);
+
+ r = kvm_gpc_refresh(gpc, offset + sizeof(*hv_clock));
+ if (r)
+ return r;
+
+ read_lock_irqsave(&gpc->lock, flags);
+ }
+
+ memcpy(hv_clock, gpc->khva + offset, sizeof(*hv_clock));
+ read_unlock_irqrestore(&gpc->lock, flags);
+
+ /*
+ * Sanity check TSC shift+multiplier to verify the guest's view of time
+ * is more or less consistent.
+ */
+ if (hv_clock->tsc_shift != vcpu->arch.pvclock_tsc_shift ||
+ hv_clock->tsc_to_system_mul != vcpu->arch.pvclock_tsc_mul)
+ return -EINVAL;
+
+ return 0;
+}
+
+static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs,
+ bool linux_wa)
+{
+ struct kvm_vcpu_xen *xen = &vcpu->arch.xen;
+ int64_t kernel_now, delta;
+ uint64_t guest_now;
+ int r = -EOPNOTSUPP;
+
+ /*
+ * The guest provides the requested timeout in absolute nanoseconds
+ * of the KVM clock — as *it* sees it, based on the scaled TSC and
+ * the pvclock information provided by KVM.
+ *
+ * The kernel doesn't support hrtimers based on CLOCK_MONOTONIC_RAW
+ * so use CLOCK_MONOTONIC. In the timescales covered by timers, the
+ * difference won't matter much as there is no cumulative effect.
+ *
+ * Calculate the time for some arbitrary point in time around "now"
+ * in terms of both kvmclock and CLOCK_MONOTONIC. Calculate the
+ * delta between the kvmclock "now" value and the guest's requested
+ * timeout, apply the "Linux workaround" described below, and add
+ * the resulting delta to the CLOCK_MONOTONIC "now" value, to get
+ * the absolute CLOCK_MONOTONIC time at which the timer should
+ * fire.
+ */
+ do {
+ struct pvclock_vcpu_time_info hv_clock;
+ uint64_t host_tsc, guest_tsc;
+
+ if (!static_cpu_has(X86_FEATURE_CONSTANT_TSC) ||
+ !vcpu->kvm->arch.use_master_clock)
+ break;
+
+ /*
+ * If both Xen PV clocks are active, arbitrarily try to use the
+ * compat clock first, but also try to use the non-compat clock
+ * if the compat clock is unusable. The two PV clocks hold the
+ * same information, but it's possible one (or both) is stale
+ * and/or currently unreachable.
+ */
+ if (xen->vcpu_info_cache.active)
+ r = xen_get_guest_pvclock(vcpu, &hv_clock, &xen->vcpu_info_cache,
+ offsetof(struct compat_vcpu_info, time));
+ if (r && xen->vcpu_time_info_cache.active)
+ r = xen_get_guest_pvclock(vcpu, &hv_clock, &xen->vcpu_time_info_cache, 0);
+ if (r)
+ break;
+
+ if (!IS_ENABLED(CONFIG_64BIT) ||
+ !kvm_get_monotonic_and_clockread(&kernel_now, &host_tsc)) {
+ /*
+ * Don't fall back to get_kvmclock_ns() because it's
+ * broken; it has a systemic error in its results
+ * because it scales directly from host TSC to
+ * nanoseconds, and doesn't scale first to guest TSC
+ * and *then* to nanoseconds as the guest does.
+ *
+ * There is a small error introduced here because time
+ * continues to elapse between the ktime_get() and the
+ * subsequent rdtsc(). But not the systemic drift due
+ * to get_kvmclock_ns().
+ */
+ kernel_now = ktime_get(); /* This is CLOCK_MONOTONIC */
+ host_tsc = rdtsc();
+ }
+
+ /* Calculate the guest kvmclock as the guest would do it. */
+ guest_tsc = kvm_read_l1_tsc(vcpu, host_tsc);
+ guest_now = __pvclock_read_cycles(&hv_clock, guest_tsc);
+ } while (0);
+
+ if (r) {
+ /*
+ * Without CONSTANT_TSC, get_kvmclock_ns() is the only option.
+ *
+ * Also if the guest PV clock hasn't been set up yet, as is
+ * likely to be the case during migration when the vCPU has
+ * not been run yet. It would be possible to calculate the
+ * scaling factors properly in that case but there's not much
+ * point in doing so. The get_kvmclock_ns() drift accumulates
+ * over time, so it's OK to use it at startup. Besides, on
+ * migration there's going to be a little bit of skew in the
+ * precise moment at which timers fire anyway. Often they'll
+ * be in the "past" by the time the VM is running again after
+ * migration.
+ */
+ guest_now = get_kvmclock_ns(vcpu->kvm);
+ kernel_now = ktime_get();
+ }
+
+ delta = guest_abs - guest_now;
+
+ /*
+ * Xen has a 'Linux workaround' in do_set_timer_op() which checks for
+ * negative absolute timeout values (caused by integer overflow), and
+ * for values about 13 days in the future (2^50ns) which would be
+ * caused by jiffies overflow. For those cases, Xen sets the timeout
+ * 100ms in the future (not *too* soon, since if a guest really did
+ * set a long timeout on purpose we don't want to keep churning CPU
+ * time by waking it up). Emulate Xen's workaround when starting the
+ * timer in response to __HYPERVISOR_set_timer_op.
+ */
+ if (linux_wa &&
+ unlikely((int64_t)guest_abs < 0 ||
+ (delta > 0 && (uint32_t) (delta >> 50) != 0))) {
+ delta = 100 * NSEC_PER_MSEC;
+ guest_abs = guest_now + delta;
+ }
+
+ /*
+ * Avoid races with the old timer firing. Checking timer_expires
+ * to avoid calling hrtimer_cancel() can only yield false positives,
+ * so that is fine.
+ */
+ if (vcpu->arch.xen.timer_expires)
+ hrtimer_cancel(&vcpu->arch.xen.timer);
+
+ atomic_set(&vcpu->arch.xen.timer_pending, 0);
+ vcpu->arch.xen.timer_expires = guest_abs;
+
+ if (delta <= 0)
+ xen_timer_callback(&vcpu->arch.xen.timer);
+ else
+ hrtimer_start(&vcpu->arch.xen.timer,
+ ktime_add_ns(kernel_now, delta),
+ HRTIMER_MODE_ABS_HARD);
+}
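/*
 * The conversion performed above, in isolation: the guest supplies an
 * absolute deadline on its own kvmclock timeline, so the delta between
 * that deadline and the kvmclock "now" is applied to the CLOCK_MONOTONIC
 * "now" to obtain the hrtimer expiry. The helper below is an illustrative
 * restatement of that arithmetic, not kernel code, and it omits the
 * "Linux workaround" clamp described in the comment above.
 */
#include <stdint.h>
#include <stdio.h>

static int64_t monotonic_expiry_ns(uint64_t guest_abs_ns,	/* guest's requested deadline (kvmclock)   */
				   uint64_t guest_now_ns,	/* kvmclock "now" as the guest computes it */
				   int64_t kernel_now_ns)	/* CLOCK_MONOTONIC "now"                   */
{
	int64_t delta = (int64_t)(guest_abs_ns - guest_now_ns);

	return kernel_now_ns + delta;
}

int main(void)
{
	/* A deadline 5ms ahead on the guest clock lands 5ms ahead on CLOCK_MONOTONIC. */
	printf("%lld\n", (long long)monotonic_expiry_ns(1005000000ull, 1000000000ull, 42));
	return 0;
}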
+
+static void kvm_xen_stop_timer(struct kvm_vcpu *vcpu)
+{
+ hrtimer_cancel(&vcpu->arch.xen.timer);
+ vcpu->arch.xen.timer_expires = 0;
+ atomic_set(&vcpu->arch.xen.timer_pending, 0);
+}
+
+static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
+{
+ struct kvm_vcpu_xen *vx = &v->arch.xen;
+ struct gfn_to_pfn_cache *gpc1 = &vx->runstate_cache;
+ struct gfn_to_pfn_cache *gpc2 = &vx->runstate2_cache;
+ size_t user_len, user_len1, user_len2;
+ struct vcpu_runstate_info rs;
+ unsigned long flags;
+ size_t times_ofs;
+ uint8_t *update_bit = NULL;
+ uint64_t entry_time;
+ uint64_t *rs_times;
+ int *rs_state;
+
+ /*
+ * The only difference between 32-bit and 64-bit versions of the
+ * runstate struct is the alignment of uint64_t in 32-bit, which
+ * means that the 64-bit version has an additional 4 bytes of
+	 * padding after the first field 'state'. Be really paranoid about
+	 * that, and about matching it against the internal data structures
+	 * that we memcpy into it...
+ */
+ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != 0);
+ BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != 0);
+ BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c);
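+	/*
+	 * 0x2c = 4 (state) + 8 (state_entry_time) + 4 * 8 (time[4]), with no
+	 * padding in the compat (32-bit alignment) layout.
+	 */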
+#ifdef CONFIG_X86_64
+ /*
+ * The 64-bit structure has 4 bytes of padding before 'state_entry_time'
+ * so each subsequent field is shifted by 4, and it's 4 bytes longer.
+ */
+ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
+ offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4);
+ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) !=
+ offsetof(struct compat_vcpu_runstate_info, time) + 4);
+ BUILD_BUG_ON(sizeof(struct vcpu_runstate_info) != 0x2c + 4);
+#endif
+ /*
+ * The state field is in the same place at the start of both structs,
+ * and is the same size (int) as vx->current_runstate.
+ */
+ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) !=
+ offsetof(struct compat_vcpu_runstate_info, state));
+ BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state) !=
+ sizeof(vx->current_runstate));
+ BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) !=
+ sizeof(vx->current_runstate));
+
+ /*
+ * The state_entry_time field is 64 bits in both versions, and the
+ * XEN_RUNSTATE_UPDATE flag is in the top bit, which given that x86
+ * is little-endian means that it's in the last *byte* of the word.
+ * That detail is important later.
+ */
+ BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) !=
+ sizeof(uint64_t));
+ BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) !=
+ sizeof(uint64_t));
+ BUILD_BUG_ON((XEN_RUNSTATE_UPDATE >> 56) != 0x80);
+
+ /*
+ * The time array is four 64-bit quantities in both versions, matching
+ * the vx->runstate_times and immediately following state_entry_time.
+ */
+ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
+ offsetof(struct vcpu_runstate_info, time) - sizeof(uint64_t));
+ BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state_entry_time) !=
+ offsetof(struct compat_vcpu_runstate_info, time) - sizeof(uint64_t));
+ BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
+ sizeof_field(struct compat_vcpu_runstate_info, time));
+ BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
+ sizeof(vx->runstate_times));
+
+ if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
+ user_len = sizeof(struct vcpu_runstate_info);
+ times_ofs = offsetof(struct vcpu_runstate_info,
+ state_entry_time);
+ } else {
+ user_len = sizeof(struct compat_vcpu_runstate_info);
+ times_ofs = offsetof(struct compat_vcpu_runstate_info,
+ state_entry_time);
+ }
+
+ /*
+ * There are basically no alignment constraints. The guest can set it
+ * up so it crosses from one page to the next, and at arbitrary byte
+ * alignment (and the 32-bit ABI doesn't align the 64-bit integers
+ * anyway, even if the overall struct had been 64-bit aligned).
+ */
+ if ((gpc1->gpa & ~PAGE_MASK) + user_len >= PAGE_SIZE) {
+ user_len1 = PAGE_SIZE - (gpc1->gpa & ~PAGE_MASK);
+ user_len2 = user_len - user_len1;
+ } else {
+ user_len1 = user_len;
+ user_len2 = 0;
+ }
+ BUG_ON(user_len1 + user_len2 != user_len);
+
+ retry:
+ /*
+ * Attempt to obtain the GPC lock on *both* (if there are two)
+ * gfn_to_pfn caches that cover the region.
+ */
+ if (atomic) {
+ local_irq_save(flags);
+ if (!read_trylock(&gpc1->lock)) {
+ local_irq_restore(flags);
+ return;
+ }
+ } else {
+ read_lock_irqsave(&gpc1->lock, flags);
+ }
+ while (!kvm_gpc_check(gpc1, user_len1)) {
+ read_unlock_irqrestore(&gpc1->lock, flags);
+
+ /* When invoked from kvm_sched_out() we cannot sleep */
+ if (atomic)
+ return;
+
+ if (kvm_gpc_refresh(gpc1, user_len1))
+ return;
+
+ read_lock_irqsave(&gpc1->lock, flags);
+ }
+
+ if (likely(!user_len2)) {
+ /*
+ * Set up three pointers directly to the runstate_info
+ * struct in the guest (via the GPC).
+ *
+ * • @rs_state → state field
+ * • @rs_times → state_entry_time field.
+ * • @update_bit → last byte of state_entry_time, which
+ * contains the XEN_RUNSTATE_UPDATE bit.
+ */
+ rs_state = gpc1->khva;
+ rs_times = gpc1->khva + times_ofs;
+ if (v->kvm->arch.xen.runstate_update_flag)
+ update_bit = ((void *)(&rs_times[1])) - 1;
+ } else {
+ /*
+ * The guest's runstate_info is split across two pages and we
+ * need to hold and validate both GPCs simultaneously. We can
+ * declare a lock ordering GPC1 > GPC2 because nothing else
+ * takes them more than one at a time. Set a subclass on the
+ * gpc1 lock to make lockdep shut up about it.
+ */
+ lock_set_subclass(&gpc1->lock.dep_map, 1, _THIS_IP_);
+ if (atomic) {
+ if (!read_trylock(&gpc2->lock)) {
+ read_unlock_irqrestore(&gpc1->lock, flags);
+ return;
+ }
+ } else {
+ read_lock(&gpc2->lock);
+ }
+
+ if (!kvm_gpc_check(gpc2, user_len2)) {
+ read_unlock(&gpc2->lock);
+ read_unlock_irqrestore(&gpc1->lock, flags);
+
+ /* When invoked from kvm_sched_out() we cannot sleep */
+ if (atomic)
+ return;
+
+ /*
+			 * Use kvm_gpc_activate() here: if the runstate area
+			 * was configured in 32-bit mode and only extends onto
+			 * the second page now that the guest has switched to
+			 * 64-bit mode, the second GPC won't have been set up.
+ */
+ if (kvm_gpc_activate(gpc2, gpc1->gpa + user_len1,
+ user_len2))
+ return;
+
+ /*
+ * We dropped the lock on GPC1 so we have to go all the
+ * way back and revalidate that too.
+ */
+ goto retry;
+ }
+
+ /*
+ * In this case, the runstate_info struct will be assembled on
+ * the kernel stack (compat or not as appropriate) and will
+ * be copied to GPC1/GPC2 with a dual memcpy. Set up the three
+ * rs pointers accordingly.
+ */
+ rs_times = &rs.state_entry_time;
+
+ /*
+ * The rs_state pointer points to the start of what we'll
+ * copy to the guest, which in the case of a compat guest
+ * is the 32-bit field that the compiler thinks is padding.
+ */
+ rs_state = ((void *)rs_times) - times_ofs;
+
+ /*
+ * The update_bit is still directly in the guest memory,
+ * via one GPC or the other.
+ */
+ if (v->kvm->arch.xen.runstate_update_flag) {
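+			/*
+			 * The last byte of state_entry_time sits at offset
+			 * (times_ofs + 7) from the start of the struct; if
+			 * that still lies within the first page it is reached
+			 * via gpc1, otherwise via gpc2 at the same offset
+			 * minus user_len1.
+			 */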
+ if (user_len1 >= times_ofs + sizeof(uint64_t))
+ update_bit = gpc1->khva + times_ofs +
+ sizeof(uint64_t) - 1;
+ else
+ update_bit = gpc2->khva + times_ofs +
+ sizeof(uint64_t) - 1 - user_len1;
+ }
+
+#ifdef CONFIG_X86_64
+ /*
+ * Don't leak kernel memory through the padding in the 64-bit
+ * version of the struct.
+ */
+ memset(&rs, 0, offsetof(struct vcpu_runstate_info, state_entry_time));
+#endif
+ }
+
+ /*
+ * First, set the XEN_RUNSTATE_UPDATE bit in the top bit of the
+ * state_entry_time field, directly in the guest. We need to set
+ * that (and write-barrier) before writing to the rest of the
+ * structure, and clear it last. Just as Xen does, we address the
+ * single *byte* in which it resides because it might be in a
+ * different cache line to the rest of the 64-bit word, due to
+ * the (lack of) alignment constraints.
+ */
+ entry_time = vx->runstate_entry_time;
+ if (update_bit) {
+ entry_time |= XEN_RUNSTATE_UPDATE;
+ *update_bit = (vx->runstate_entry_time | XEN_RUNSTATE_UPDATE) >> 56;
+ smp_wmb();
+ }
+
+ /*
+ * Now assemble the actual structure, either on our kernel stack
+ * or directly in the guest according to how the rs_state and
+ * rs_times pointers were set up above.
+ */
+ *rs_state = vx->current_runstate;
+ rs_times[0] = entry_time;
+ memcpy(rs_times + 1, vx->runstate_times, sizeof(vx->runstate_times));
+
+ /* For the split case, we have to then copy it to the guest. */
+ if (user_len2) {
+ memcpy(gpc1->khva, rs_state, user_len1);
+ memcpy(gpc2->khva, ((void *)rs_state) + user_len1, user_len2);
+ }
+ smp_wmb();
+
+ /* Finally, clear the XEN_RUNSTATE_UPDATE bit. */
+ if (update_bit) {
+ entry_time &= ~XEN_RUNSTATE_UPDATE;
+ *update_bit = entry_time >> 56;
+ smp_wmb();
+ }
+
+ if (user_len2) {
+ kvm_gpc_mark_dirty_in_slot(gpc2);
+ read_unlock(&gpc2->lock);
+ }
+
+ kvm_gpc_mark_dirty_in_slot(gpc1);
+ read_unlock_irqrestore(&gpc1->lock, flags);
+}
+
+void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
+{
+ struct kvm_vcpu_xen *vx = &v->arch.xen;
+ u64 now = get_kvmclock_ns(v->kvm);
+ u64 delta_ns = now - vx->runstate_entry_time;
+ u64 run_delay = current->sched_info.run_delay;
+
+ if (unlikely(!vx->runstate_entry_time))
+ vx->current_runstate = RUNSTATE_offline;
+
+ /*
+ * Time waiting for the scheduler isn't "stolen" if the
+ * vCPU wasn't running anyway.
+ */
+ if (vx->current_runstate == RUNSTATE_running) {
+ u64 steal_ns = run_delay - vx->last_steal;
+
+ delta_ns -= steal_ns;
+
+ vx->runstate_times[RUNSTATE_runnable] += steal_ns;
+ }
+ vx->last_steal = run_delay;
+
+ vx->runstate_times[vx->current_runstate] += delta_ns;
+ vx->current_runstate = state;
+ vx->runstate_entry_time = now;
+
+ if (vx->runstate_cache.active)
+ kvm_xen_update_runstate_guest(v, state == RUNSTATE_runnable);
+}
+
+void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
+{
+ struct kvm_lapic_irq irq = { };
+
+ irq.dest_id = v->vcpu_id;
+ irq.vector = v->arch.xen.upcall_vector;
+ irq.dest_mode = APIC_DEST_PHYSICAL;
+ irq.shorthand = APIC_DEST_NOSHORT;
+ irq.delivery_mode = APIC_DM_FIXED;
+ irq.level = 1;
+
+ kvm_irq_delivery_to_apic(v->kvm, NULL, &irq, NULL);
+}
+
+/*
+ * On event channel delivery, the vcpu_info may not have been accessible.
+ * In that case, there are bits in vcpu->arch.xen.evtchn_pending_sel which
+ * need to be marked into the vcpu_info (and evtchn_upcall_pending set).
+ * Do so now that we can sleep in the context of the vCPU to bring the
+ * page in, and refresh the pfn cache for it.
+ */
+void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
+{
+ unsigned long evtchn_pending_sel = READ_ONCE(v->arch.xen.evtchn_pending_sel);
+ struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache;
+ unsigned long flags;
+
+ if (!evtchn_pending_sel)
+ return;
+
+ /*
+ * Yes, this is an open-coded loop. But that's just what put_user()
+ * does anyway. Page it in and retry the instruction. We're just a
+ * little more honest about it.
+ */
+ read_lock_irqsave(&gpc->lock, flags);
+ while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
+ read_unlock_irqrestore(&gpc->lock, flags);
+
+ if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info)))
+ return;
+
+ read_lock_irqsave(&gpc->lock, flags);
+ }
+
+ /* Now gpc->khva is a valid kernel address for the vcpu_info */
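+	/*
+	 * Atomically OR the in-kernel shadow of evtchn_pending_sel into the
+	 * guest's copy, then clear from the shadow exactly the bits that
+	 * were just transferred.
+	 */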
+ if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
+ struct vcpu_info *vi = gpc->khva;
+
+ asm volatile(LOCK_PREFIX "orq %0, %1\n"
+ "notq %0\n"
+ LOCK_PREFIX "andq %0, %2\n"
+ : "=r" (evtchn_pending_sel),
+ "+m" (vi->evtchn_pending_sel),
+ "+m" (v->arch.xen.evtchn_pending_sel)
+ : "0" (evtchn_pending_sel));
+ WRITE_ONCE(vi->evtchn_upcall_pending, 1);
+ } else {
+ u32 evtchn_pending_sel32 = evtchn_pending_sel;
+ struct compat_vcpu_info *vi = gpc->khva;
+
+ asm volatile(LOCK_PREFIX "orl %0, %1\n"
+ "notl %0\n"
+ LOCK_PREFIX "andl %0, %2\n"
+ : "=r" (evtchn_pending_sel32),
+ "+m" (vi->evtchn_pending_sel),
+ "+m" (v->arch.xen.evtchn_pending_sel)
+ : "0" (evtchn_pending_sel32));
+ WRITE_ONCE(vi->evtchn_upcall_pending, 1);
+ }
+
+ kvm_gpc_mark_dirty_in_slot(gpc);
+ read_unlock_irqrestore(&gpc->lock, flags);
+
+ /* For the per-vCPU lapic vector, deliver it as MSI. */
+ if (v->arch.xen.upcall_vector)
+ kvm_xen_inject_vcpu_vector(v);
+}
+
+int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
+{
+ struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache;
+ unsigned long flags;
+ u8 rc = 0;
+
+ /*
+ * If the global upcall vector (HVMIRQ_callback_vector) is set and
+ * the vCPU's evtchn_upcall_pending flag is set, the IRQ is pending.
+ */
+
+ /* No need for compat handling here */
+ BUILD_BUG_ON(offsetof(struct vcpu_info, evtchn_upcall_pending) !=
+ offsetof(struct compat_vcpu_info, evtchn_upcall_pending));
+ BUILD_BUG_ON(sizeof(rc) !=
+ sizeof_field(struct vcpu_info, evtchn_upcall_pending));
+ BUILD_BUG_ON(sizeof(rc) !=
+ sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending));
+
+ read_lock_irqsave(&gpc->lock, flags);
+ while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
+ read_unlock_irqrestore(&gpc->lock, flags);
+
+ /*
+ * This function gets called from kvm_vcpu_block() after setting the
+ * task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately
+ * from a HLT. So we really mustn't sleep. If the page ended up absent
+ * at that point, just return 1 in order to trigger an immediate wake,
+ * and we'll end up getting called again from a context where we *can*
+ * fault in the page and wait for it.
+ */
+ if (in_atomic() || !task_is_running(current))
+ return 1;
+
+ if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info))) {
+ /*
+ * If this failed, userspace has screwed up the
+ * vcpu_info mapping. No interrupts for you.
+ */
+ return 0;
+ }
+ read_lock_irqsave(&gpc->lock, flags);
+ }
+
+ rc = ((struct vcpu_info *)gpc->khva)->evtchn_upcall_pending;
+ read_unlock_irqrestore(&gpc->lock, flags);
+ return rc;
+}
+
+int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
+{
+ int r = -ENOENT;
+
+ switch (data->type) {
+ case KVM_XEN_ATTR_TYPE_LONG_MODE:
+ if (!IS_ENABLED(CONFIG_64BIT) && data->u.long_mode) {
+ r = -EINVAL;
+ } else {
+ mutex_lock(&kvm->arch.xen.xen_lock);
+ kvm->arch.xen.long_mode = !!data->u.long_mode;
+
+ /*
+ * Re-initialize shared_info to put the wallclock in the
+ * correct place. Whilst it's not necessary to do this
+ * unless the mode is actually changed, it does no harm
+ * to make the call anyway.
+ */
+ r = kvm->arch.xen.shinfo_cache.active ?
+ kvm_xen_shared_info_init(kvm) : 0;
+ mutex_unlock(&kvm->arch.xen.xen_lock);
+ }
+ break;
+
+ case KVM_XEN_ATTR_TYPE_SHARED_INFO:
+ case KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA: {
+ int idx;
+
+ mutex_lock(&kvm->arch.xen.xen_lock);
+
+ idx = srcu_read_lock(&kvm->srcu);
+
+ if (data->type == KVM_XEN_ATTR_TYPE_SHARED_INFO) {
+ gfn_t gfn = data->u.shared_info.gfn;
+
+ if (gfn == KVM_XEN_INVALID_GFN) {
+ kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache);
+ r = 0;
+ } else {
+ r = kvm_gpc_activate(&kvm->arch.xen.shinfo_cache,
+ gfn_to_gpa(gfn), PAGE_SIZE);
+ }
+ } else {
+			void __user *hva = u64_to_user_ptr(data->u.shared_info.hva);
+
+ if (!PAGE_ALIGNED(hva)) {
+ r = -EINVAL;
+ } else if (!hva) {
+ kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache);
+ r = 0;
+ } else {
+ r = kvm_gpc_activate_hva(&kvm->arch.xen.shinfo_cache,
+ (unsigned long)hva, PAGE_SIZE);
+ }
+ }
+
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ if (!r && kvm->arch.xen.shinfo_cache.active)
+ r = kvm_xen_shared_info_init(kvm);
+
+ mutex_unlock(&kvm->arch.xen.xen_lock);
+ break;
+ }
+ case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
+ if (data->u.vector && data->u.vector < 0x10)
+ r = -EINVAL;
+ else {
+ mutex_lock(&kvm->arch.xen.xen_lock);
+ kvm->arch.xen.upcall_vector = data->u.vector;
+ mutex_unlock(&kvm->arch.xen.xen_lock);
+ r = 0;
+ }
+ break;
+
+ case KVM_XEN_ATTR_TYPE_EVTCHN:
+ r = kvm_xen_setattr_evtchn(kvm, data);
+ break;
+
+ case KVM_XEN_ATTR_TYPE_XEN_VERSION:
+ mutex_lock(&kvm->arch.xen.xen_lock);
+ kvm->arch.xen.xen_version = data->u.xen_version;
+ mutex_unlock(&kvm->arch.xen.xen_lock);
+ r = 0;
+ break;
+
+ case KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG:
+ if (!sched_info_on()) {
+ r = -EOPNOTSUPP;
+ break;
+ }
+ mutex_lock(&kvm->arch.xen.xen_lock);
+ kvm->arch.xen.runstate_update_flag = !!data->u.runstate_update_flag;
+ mutex_unlock(&kvm->arch.xen.xen_lock);
+ r = 0;
+ break;
+
+ default:
+ break;
+ }
+
+ return r;
+}
+
+int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
+{
+ int r = -ENOENT;
+
+ mutex_lock(&kvm->arch.xen.xen_lock);
+
+ switch (data->type) {
+ case KVM_XEN_ATTR_TYPE_LONG_MODE:
+ data->u.long_mode = kvm->arch.xen.long_mode;
+ r = 0;
+ break;
+
+ case KVM_XEN_ATTR_TYPE_SHARED_INFO:
+ if (kvm_gpc_is_gpa_active(&kvm->arch.xen.shinfo_cache))
+ data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa);
+ else
+ data->u.shared_info.gfn = KVM_XEN_INVALID_GFN;
+ r = 0;
+ break;
+
+ case KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA:
+ if (kvm_gpc_is_hva_active(&kvm->arch.xen.shinfo_cache))
+ data->u.shared_info.hva = kvm->arch.xen.shinfo_cache.uhva;
+ else
+ data->u.shared_info.hva = 0;
+ r = 0;
+ break;
+
+ case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
+ data->u.vector = kvm->arch.xen.upcall_vector;
+ r = 0;
+ break;
+
+ case KVM_XEN_ATTR_TYPE_XEN_VERSION:
+ data->u.xen_version = kvm->arch.xen.xen_version;
+ r = 0;
+ break;
+
+ case KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG:
+ if (!sched_info_on()) {
+ r = -EOPNOTSUPP;
+ break;
+ }
+ data->u.runstate_update_flag = kvm->arch.xen.runstate_update_flag;
+ r = 0;
+ break;
+
+ default:
+ break;
+ }
+
+ mutex_unlock(&kvm->arch.xen.xen_lock);
+ return r;
+}
+
+int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
+{
+ int idx, r = -ENOENT;
+
+ mutex_lock(&vcpu->kvm->arch.xen.xen_lock);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+ switch (data->type) {
+ case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
+ case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA:
+ /* No compat necessary here. */
+ BUILD_BUG_ON(sizeof(struct vcpu_info) !=
+ sizeof(struct compat_vcpu_info));
+ BUILD_BUG_ON(offsetof(struct vcpu_info, time) !=
+ offsetof(struct compat_vcpu_info, time));
+
+ if (data->type == KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO) {
+ if (data->u.gpa == KVM_XEN_INVALID_GPA) {
+ kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
+ r = 0;
+ break;
+ }
+
+ r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_info_cache,
+ data->u.gpa, sizeof(struct vcpu_info));
+ } else {
+ if (data->u.hva == 0) {
+ kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
+ r = 0;
+ break;
+ }
+
+ r = kvm_gpc_activate_hva(&vcpu->arch.xen.vcpu_info_cache,
+ data->u.hva, sizeof(struct vcpu_info));
+ }
+
+ if (!r)
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
+ if (data->u.gpa == KVM_XEN_INVALID_GPA) {
+ kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_time_info_cache);
+ r = 0;
+ break;
+ }
+
+ r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_time_info_cache,
+ data->u.gpa,
+ sizeof(struct pvclock_vcpu_time_info));
+ if (!r)
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR: {
+ size_t sz, sz1, sz2;
+
+ if (!sched_info_on()) {
+ r = -EOPNOTSUPP;
+ break;
+ }
+ if (data->u.gpa == KVM_XEN_INVALID_GPA) {
+ r = 0;
+ deactivate_out:
+ kvm_gpc_deactivate(&vcpu->arch.xen.runstate_cache);
+ kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache);
+ break;
+ }
+
+ /*
+ * If the guest switches to 64-bit mode after setting the runstate
+ * address, that's actually OK. kvm_xen_update_runstate_guest()
+ * will cope.
+ */
+ if (IS_ENABLED(CONFIG_64BIT) && vcpu->kvm->arch.xen.long_mode)
+ sz = sizeof(struct vcpu_runstate_info);
+ else
+ sz = sizeof(struct compat_vcpu_runstate_info);
+
+ /* How much fits in the (first) page? */
+ sz1 = PAGE_SIZE - (data->u.gpa & ~PAGE_MASK);
+ r = kvm_gpc_activate(&vcpu->arch.xen.runstate_cache,
+ data->u.gpa, sz1);
+ if (r)
+ goto deactivate_out;
+
+ /* Either map the second page, or deactivate the second GPC */
+ if (sz1 >= sz) {
+ kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache);
+ } else {
+ sz2 = sz - sz1;
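+			/*
+			 * sz1 extends to the end of the first page, so the
+			 * remainder must start exactly on a page boundary.
+			 */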
+ BUG_ON((data->u.gpa + sz1) & ~PAGE_MASK);
+ r = kvm_gpc_activate(&vcpu->arch.xen.runstate2_cache,
+ data->u.gpa + sz1, sz2);
+ if (r)
+ goto deactivate_out;
+ }
+
+ kvm_xen_update_runstate_guest(vcpu, false);
+ break;
+ }
+ case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
+ if (!sched_info_on()) {
+ r = -EOPNOTSUPP;
+ break;
+ }
+ if (data->u.runstate.state > RUNSTATE_offline) {
+ r = -EINVAL;
+ break;
+ }
+
+ kvm_xen_update_runstate(vcpu, data->u.runstate.state);
+ r = 0;
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA:
+ if (!sched_info_on()) {
+ r = -EOPNOTSUPP;
+ break;
+ }
+ if (data->u.runstate.state > RUNSTATE_offline) {
+ r = -EINVAL;
+ break;
+ }
+ if (data->u.runstate.state_entry_time !=
+ (data->u.runstate.time_running +
+ data->u.runstate.time_runnable +
+ data->u.runstate.time_blocked +
+ data->u.runstate.time_offline)) {
+ r = -EINVAL;
+ break;
+ }
+ if (get_kvmclock_ns(vcpu->kvm) <
+ data->u.runstate.state_entry_time) {
+ r = -EINVAL;
+ break;
+ }
+
+ vcpu->arch.xen.current_runstate = data->u.runstate.state;
+ vcpu->arch.xen.runstate_entry_time =
+ data->u.runstate.state_entry_time;
+ vcpu->arch.xen.runstate_times[RUNSTATE_running] =
+ data->u.runstate.time_running;
+ vcpu->arch.xen.runstate_times[RUNSTATE_runnable] =
+ data->u.runstate.time_runnable;
+ vcpu->arch.xen.runstate_times[RUNSTATE_blocked] =
+ data->u.runstate.time_blocked;
+ vcpu->arch.xen.runstate_times[RUNSTATE_offline] =
+ data->u.runstate.time_offline;
+ vcpu->arch.xen.last_steal = current->sched_info.run_delay;
+ r = 0;
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST:
+ if (!sched_info_on()) {
+ r = -EOPNOTSUPP;
+ break;
+ }
+ if (data->u.runstate.state > RUNSTATE_offline &&
+ data->u.runstate.state != (u64)-1) {
+ r = -EINVAL;
+ break;
+ }
+ /* The adjustment must add up */
+ if (data->u.runstate.state_entry_time !=
+ (data->u.runstate.time_running +
+ data->u.runstate.time_runnable +
+ data->u.runstate.time_blocked +
+ data->u.runstate.time_offline)) {
+ r = -EINVAL;
+ break;
+ }
+
+ if (get_kvmclock_ns(vcpu->kvm) <
+ (vcpu->arch.xen.runstate_entry_time +
+ data->u.runstate.state_entry_time)) {
+ r = -EINVAL;
+ break;
+ }
+
+ vcpu->arch.xen.runstate_entry_time +=
+ data->u.runstate.state_entry_time;
+ vcpu->arch.xen.runstate_times[RUNSTATE_running] +=
+ data->u.runstate.time_running;
+ vcpu->arch.xen.runstate_times[RUNSTATE_runnable] +=
+ data->u.runstate.time_runnable;
+ vcpu->arch.xen.runstate_times[RUNSTATE_blocked] +=
+ data->u.runstate.time_blocked;
+ vcpu->arch.xen.runstate_times[RUNSTATE_offline] +=
+ data->u.runstate.time_offline;
+
+ if (data->u.runstate.state <= RUNSTATE_offline)
+ kvm_xen_update_runstate(vcpu, data->u.runstate.state);
+ else if (vcpu->arch.xen.runstate_cache.active)
+ kvm_xen_update_runstate_guest(vcpu, false);
+ r = 0;
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID:
+ if (data->u.vcpu_id >= KVM_MAX_VCPUS)
+ r = -EINVAL;
+ else {
+ vcpu->arch.xen.vcpu_id = data->u.vcpu_id;
+ r = 0;
+ }
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_TIMER:
+ if (data->u.timer.port &&
+ data->u.timer.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) {
+ r = -EINVAL;
+ break;
+ }
+
+ /* Stop the timer (if it's running) before changing the vector */
+ kvm_xen_stop_timer(vcpu);
+ vcpu->arch.xen.timer_virq = data->u.timer.port;
+
+ /* Start the timer if the new value has a valid vector+expiry. */
+ if (data->u.timer.port && data->u.timer.expires_ns)
+ kvm_xen_start_timer(vcpu, data->u.timer.expires_ns, false);
+
+ r = 0;
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR:
+ if (data->u.vector && data->u.vector < 0x10)
+ r = -EINVAL;
+ else {
+ vcpu->arch.xen.upcall_vector = data->u.vector;
+ r = 0;
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ mutex_unlock(&vcpu->kvm->arch.xen.xen_lock);
+ return r;
+}
+
+int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
+{
+ int r = -ENOENT;
+
+ mutex_lock(&vcpu->kvm->arch.xen.xen_lock);
+
+ switch (data->type) {
+ case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
+ if (kvm_gpc_is_gpa_active(&vcpu->arch.xen.vcpu_info_cache))
+ data->u.gpa = vcpu->arch.xen.vcpu_info_cache.gpa;
+ else
+ data->u.gpa = KVM_XEN_INVALID_GPA;
+ r = 0;
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA:
+ if (kvm_gpc_is_hva_active(&vcpu->arch.xen.vcpu_info_cache))
+ data->u.hva = vcpu->arch.xen.vcpu_info_cache.uhva;
+ else
+ data->u.hva = 0;
+ r = 0;
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
+ if (vcpu->arch.xen.vcpu_time_info_cache.active)
+ data->u.gpa = vcpu->arch.xen.vcpu_time_info_cache.gpa;
+ else
+ data->u.gpa = KVM_XEN_INVALID_GPA;
+ r = 0;
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR:
+ if (!sched_info_on()) {
+ r = -EOPNOTSUPP;
+ break;
+ }
+ if (vcpu->arch.xen.runstate_cache.active) {
+ data->u.gpa = vcpu->arch.xen.runstate_cache.gpa;
+ r = 0;
+ }
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
+ if (!sched_info_on()) {
+ r = -EOPNOTSUPP;
+ break;
+ }
+ data->u.runstate.state = vcpu->arch.xen.current_runstate;
+ r = 0;
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA:
+ if (!sched_info_on()) {
+ r = -EOPNOTSUPP;
+ break;
+ }
+ data->u.runstate.state = vcpu->arch.xen.current_runstate;
+ data->u.runstate.state_entry_time =
+ vcpu->arch.xen.runstate_entry_time;
+ data->u.runstate.time_running =
+ vcpu->arch.xen.runstate_times[RUNSTATE_running];
+ data->u.runstate.time_runnable =
+ vcpu->arch.xen.runstate_times[RUNSTATE_runnable];
+ data->u.runstate.time_blocked =
+ vcpu->arch.xen.runstate_times[RUNSTATE_blocked];
+ data->u.runstate.time_offline =
+ vcpu->arch.xen.runstate_times[RUNSTATE_offline];
+ r = 0;
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST:
+ r = -EINVAL;
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID:
+ data->u.vcpu_id = vcpu->arch.xen.vcpu_id;
+ r = 0;
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_TIMER:
+ /*
+ * Ensure a consistent snapshot of state is captured, with a
+ * timer either being pending, or the event channel delivered
+		 * to the corresponding bit in the shared_info, not still
+		 * lurking in the timer_pending flag for deferred delivery.
+ * Purely as an optimisation, if the timer_expires field is
+ * zero, that means the timer isn't active (or even in the
+ * timer_pending flag) and there is no need to cancel it.
+ */
+ if (vcpu->arch.xen.timer_expires) {
+ hrtimer_cancel(&vcpu->arch.xen.timer);
+ kvm_xen_inject_timer_irqs(vcpu);
+ }
+
+ data->u.timer.port = vcpu->arch.xen.timer_virq;
+ data->u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
+ data->u.timer.expires_ns = vcpu->arch.xen.timer_expires;
+
+ /*
+ * The hrtimer may trigger and raise the IRQ immediately,
+ * while the returned state causes it to be set up and
+ * raised again on the destination system after migration.
+ * That's fine, as the guest won't even have had a chance
+ * to run and handle the interrupt. Asserting an already
+ * pending event channel is idempotent.
+ */
+ if (vcpu->arch.xen.timer_expires)
+ hrtimer_start_expires(&vcpu->arch.xen.timer,
+ HRTIMER_MODE_ABS_HARD);
+
+ r = 0;
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR:
+ data->u.vector = vcpu->arch.xen.upcall_vector;
+ r = 0;
+ break;
+
+ default:
+ break;
+ }
+
+ mutex_unlock(&vcpu->kvm->arch.xen.xen_lock);
+ return r;
+}
+
+int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
+{
+ struct kvm *kvm = vcpu->kvm;
+ u32 page_num = data & ~PAGE_MASK;
+ u64 page_addr = data & PAGE_MASK;
+ bool lm = is_long_mode(vcpu);
+ int r = 0;
+
+ mutex_lock(&kvm->arch.xen.xen_lock);
+ if (kvm->arch.xen.long_mode != lm) {
+ kvm->arch.xen.long_mode = lm;
+
+ /*
+ * Re-initialize shared_info to put the wallclock in the
+ * correct place.
+ */
+ if (kvm->arch.xen.shinfo_cache.active &&
+ kvm_xen_shared_info_init(kvm))
+ r = 1;
+ }
+ mutex_unlock(&kvm->arch.xen.xen_lock);
+
+ if (r)
+ return r;
+
+ /*
+ * If Xen hypercall intercept is enabled, fill the hypercall
+ * page with VMCALL/VMMCALL instructions since that's what
+ * we catch. Else the VMM has provided the hypercall pages
+ * with instructions of its own choosing, so use those.
+ */
+ if (kvm_xen_hypercall_enabled(kvm)) {
+ u8 instructions[32];
+ int i;
+
+ if (page_num)
+ return 1;
+
+ /* mov imm32, %eax */
+ instructions[0] = 0xb8;
+
+ /* vmcall / vmmcall */
+ kvm_x86_call(patch_hypercall)(vcpu, instructions + 5);
+
+ /* ret */
+ instructions[8] = 0xc3;
+
+ /* int3 to pad */
+ memset(instructions + 9, 0xcc, sizeof(instructions) - 9);
+
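+		/*
+		 * Fill the page with one 32-byte stanza per hypercall; the
+		 * loop below patches the hypercall number into the imm32 of
+		 * the initial 'mov' for each stanza.
+		 */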
+ for (i = 0; i < PAGE_SIZE / sizeof(instructions); i++) {
+ *(u32 *)&instructions[1] = i;
+ if (kvm_vcpu_write_guest(vcpu,
+ page_addr + (i * sizeof(instructions)),
+ instructions, sizeof(instructions)))
+ return 1;
+ }
+ } else {
+ /*
+ * Note, truncation is a non-issue as 'lm' is guaranteed to be
+ * false for a 32-bit kernel, i.e. when hva_t is only 4 bytes.
+ */
+ hva_t blob_addr = lm ? kvm->arch.xen.hvm_config.blob_addr_64
+ : kvm->arch.xen.hvm_config.blob_addr_32;
+ u8 blob_size = lm ? kvm->arch.xen.hvm_config.blob_size_64
+ : kvm->arch.xen.hvm_config.blob_size_32;
+ u8 *page;
+ int ret;
+
+ if (page_num >= blob_size)
+ return 1;
+
+ blob_addr += page_num * PAGE_SIZE;
+
+ page = memdup_user((u8 __user *)blob_addr, PAGE_SIZE);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+
+ ret = kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE);
+ kfree(page);
+ if (ret)
+ return 1;
+ }
+ return 0;
+}
+
+int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
+{
+ /* Only some feature flags need to be *enabled* by userspace */
+ u32 permitted_flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
+ KVM_XEN_HVM_CONFIG_EVTCHN_SEND |
+ KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
+ u32 old_flags;
+
+ if (xhc->flags & ~permitted_flags)
+ return -EINVAL;
+
+ /*
+ * With hypercall interception the kernel generates its own
+ * hypercall page so it must not be provided.
+ */
+ if ((xhc->flags & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL) &&
+ (xhc->blob_addr_32 || xhc->blob_addr_64 ||
+ xhc->blob_size_32 || xhc->blob_size_64))
+ return -EINVAL;
+
+ /*
+ * Restrict the MSR to the range that is unofficially reserved for
+ * synthetic, virtualization-defined MSRs, e.g. to prevent confusing
+ * KVM by colliding with a real MSR that requires special handling.
+ */
+ if (xhc->msr &&
+ (xhc->msr < KVM_XEN_MSR_MIN_INDEX || xhc->msr > KVM_XEN_MSR_MAX_INDEX))
+ return -EINVAL;
+
+ mutex_lock(&kvm->arch.xen.xen_lock);
+
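+	/*
+	 * The kvm_xen_enabled static key gates the Xen checks in hot paths;
+	 * count it up when a hypercall MSR is first configured for this VM
+	 * and back down when it is cleared.
+	 */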
+ if (xhc->msr && !kvm->arch.xen.hvm_config.msr)
+ static_branch_inc(&kvm_xen_enabled.key);
+ else if (!xhc->msr && kvm->arch.xen.hvm_config.msr)
+ static_branch_slow_dec_deferred(&kvm_xen_enabled);
+
+ old_flags = kvm->arch.xen.hvm_config.flags;
+ memcpy(&kvm->arch.xen.hvm_config, xhc, sizeof(*xhc));
+
+ mutex_unlock(&kvm->arch.xen.xen_lock);
+
+ if ((old_flags ^ xhc->flags) & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE)
+ kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
+
+ return 0;
+}
+
+static int kvm_xen_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
+{
+ kvm_rax_write(vcpu, result);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int kvm_xen_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *run = vcpu->run;
+
+ if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.xen.hypercall_rip)))
+ return 1;
+
+ return kvm_xen_hypercall_set_result(vcpu, run->xen.u.hcall.result);
+}
+
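+/*
+ * The 2-level event channel ABI provides 64 * 64 = 4096 ports for 64-bit
+ * guests but only 32 * 32 = 1024 for 32-bit (compat) guests.
+ */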
+static inline int max_evtchn_port(struct kvm *kvm)
+{
+ if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode)
+ return EVTCHN_2L_NR_CHANNELS;
+ else
+ return COMPAT_EVTCHN_2L_NR_CHANNELS;
+}
+
+static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports,
+ evtchn_port_t *ports)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
+ unsigned long *pending_bits;
+ unsigned long flags;
+ bool ret = true;
+ int idx, i;
+
+ idx = srcu_read_lock(&kvm->srcu);
+ read_lock_irqsave(&gpc->lock, flags);
+ if (!kvm_gpc_check(gpc, PAGE_SIZE))
+ goto out_rcu;
+
+ ret = false;
+ if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
+ struct shared_info *shinfo = gpc->khva;
+ pending_bits = (unsigned long *)&shinfo->evtchn_pending;
+ } else {
+ struct compat_shared_info *shinfo = gpc->khva;
+ pending_bits = (unsigned long *)&shinfo->evtchn_pending;
+ }
+
+ for (i = 0; i < nr_ports; i++) {
+ if (test_bit(ports[i], pending_bits)) {
+ ret = true;
+ break;
+ }
+ }
+
+ out_rcu:
+ read_unlock_irqrestore(&gpc->lock, flags);
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ return ret;
+}
+
+static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode,
+ u64 param, u64 *r)
+{
+ struct sched_poll sched_poll;
+ evtchn_port_t port, *ports;
+ struct x86_exception e;
+ int i;
+
+ if (!lapic_in_kernel(vcpu) ||
+ !(vcpu->kvm->arch.xen.hvm_config.flags & KVM_XEN_HVM_CONFIG_EVTCHN_SEND))
+ return false;
+
+ if (IS_ENABLED(CONFIG_64BIT) && !longmode) {
+ struct compat_sched_poll sp32;
+
+ /* Sanity check that the compat struct definition is correct */
+ BUILD_BUG_ON(sizeof(sp32) != 16);
+
+ if (kvm_read_guest_virt(vcpu, param, &sp32, sizeof(sp32), &e)) {
+ *r = -EFAULT;
+ return true;
+ }
+
+ /*
+ * This is a 32-bit pointer to an array of evtchn_port_t which
+ * are uint32_t, so once it's converted no further compat
+ * handling is needed.
+ */
+ sched_poll.ports = (void *)(unsigned long)(sp32.ports);
+ sched_poll.nr_ports = sp32.nr_ports;
+ sched_poll.timeout = sp32.timeout;
+ } else {
+ if (kvm_read_guest_virt(vcpu, param, &sched_poll,
+ sizeof(sched_poll), &e)) {
+ *r = -EFAULT;
+ return true;
+ }
+ }
+
+ if (unlikely(sched_poll.nr_ports > 1)) {
+		/* Xen (unofficially) limits the number of ports polled per call to 128 */
+ if (sched_poll.nr_ports > 128) {
+ *r = -EINVAL;
+ return true;
+ }
+
+ ports = kmalloc_array(sched_poll.nr_ports,
+ sizeof(*ports), GFP_KERNEL);
+ if (!ports) {
+ *r = -ENOMEM;
+ return true;
+ }
+	} else {
+		ports = &port;
+	}
+
+ if (kvm_read_guest_virt(vcpu, (gva_t)sched_poll.ports, ports,
+ sched_poll.nr_ports * sizeof(*ports), &e)) {
+ *r = -EFAULT;
+ goto out;
+ }
+
+ for (i = 0; i < sched_poll.nr_ports; i++) {
+ if (ports[i] >= max_evtchn_port(vcpu->kvm)) {
+ *r = -EINVAL;
+ goto out;
+ }
+ }
+
+ if (sched_poll.nr_ports == 1)
+ vcpu->arch.xen.poll_evtchn = port;
+ else
+ vcpu->arch.xen.poll_evtchn = -1;
+
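+	/*
+	 * Mark this vCPU as polling so that kvm_xen_check_poller() will wake
+	 * it if one of the polled event channels is signalled.
+	 */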
+ set_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask);
+
+ if (!wait_pending_event(vcpu, sched_poll.nr_ports, ports)) {
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_HALTED);
+
+ if (sched_poll.timeout)
+ mod_timer(&vcpu->arch.xen.poll_timer,
+ jiffies + nsecs_to_jiffies(sched_poll.timeout));
+
+ kvm_vcpu_halt(vcpu);
+
+ if (sched_poll.timeout)
+ timer_delete(&vcpu->arch.xen.poll_timer);
+
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
+ }
+
+ vcpu->arch.xen.poll_evtchn = 0;
+ *r = 0;
+out:
+ /* Really, this is only needed in case of timeout */
+ clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask);
+
+ if (unlikely(sched_poll.nr_ports > 1))
+ kfree(ports);
+ return true;
+}
+
+static void cancel_evtchn_poll(struct timer_list *t)
+{
+ struct kvm_vcpu *vcpu = timer_container_of(vcpu, t,
+ arch.xen.poll_timer);
+
+ kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
+ kvm_vcpu_kick(vcpu);
+}
+
+static bool kvm_xen_hcall_sched_op(struct kvm_vcpu *vcpu, bool longmode,
+ int cmd, u64 param, u64 *r)
+{
+ switch (cmd) {
+ case SCHEDOP_poll:
+ if (kvm_xen_schedop_poll(vcpu, longmode, param, r))
+ return true;
+ fallthrough;
+ case SCHEDOP_yield:
+ kvm_vcpu_on_spin(vcpu, true);
+ *r = 0;
+ return true;
+ default:
+ break;
+ }
+
+ return false;
+}
+
+struct compat_vcpu_set_singleshot_timer {
+ uint64_t timeout_abs_ns;
+ uint32_t flags;
+} __attribute__((packed));
+
+static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd,
+ int vcpu_id, u64 param, u64 *r)
+{
+ struct vcpu_set_singleshot_timer oneshot;
+ struct x86_exception e;
+
+ if (!kvm_xen_timer_enabled(vcpu))
+ return false;
+
+ switch (cmd) {
+ case VCPUOP_set_singleshot_timer:
+ if (vcpu->arch.xen.vcpu_id != vcpu_id) {
+ *r = -EINVAL;
+ return true;
+ }
+
+ /*
+ * The only difference for 32-bit compat is the 4 bytes of
+ * padding after the interesting part of the structure. So
+ * for a faithful emulation of Xen we have to *try* to copy
+ * the padding and return -EFAULT if we can't. Otherwise we
+ * might as well just have copied the 12-byte 32-bit struct.
+ */
+ BUILD_BUG_ON(offsetof(struct compat_vcpu_set_singleshot_timer, timeout_abs_ns) !=
+ offsetof(struct vcpu_set_singleshot_timer, timeout_abs_ns));
+ BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, timeout_abs_ns) !=
+ sizeof_field(struct vcpu_set_singleshot_timer, timeout_abs_ns));
+ BUILD_BUG_ON(offsetof(struct compat_vcpu_set_singleshot_timer, flags) !=
+ offsetof(struct vcpu_set_singleshot_timer, flags));
+ BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, flags) !=
+ sizeof_field(struct vcpu_set_singleshot_timer, flags));
+
+ if (kvm_read_guest_virt(vcpu, param, &oneshot, longmode ? sizeof(oneshot) :
+ sizeof(struct compat_vcpu_set_singleshot_timer), &e)) {
+ *r = -EFAULT;
+ return true;
+ }
+
+ kvm_xen_start_timer(vcpu, oneshot.timeout_abs_ns, false);
+ *r = 0;
+ return true;
+
+ case VCPUOP_stop_singleshot_timer:
+ if (vcpu->arch.xen.vcpu_id != vcpu_id) {
+ *r = -EINVAL;
+ return true;
+ }
+ kvm_xen_stop_timer(vcpu);
+ *r = 0;
+ return true;
+ }
+
+ return false;
+}
+
+static bool kvm_xen_hcall_set_timer_op(struct kvm_vcpu *vcpu, uint64_t timeout,
+ u64 *r)
+{
+ if (!kvm_xen_timer_enabled(vcpu))
+ return false;
+
+ if (timeout)
+ kvm_xen_start_timer(vcpu, timeout, true);
+ else
+ kvm_xen_stop_timer(vcpu);
+
+ *r = 0;
+ return true;
+}
+
+int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
+{
+ bool longmode;
+ u64 input, params[6], r = -ENOSYS;
+ bool handled = false;
+ u8 cpl;
+
+ input = (u64)kvm_register_read(vcpu, VCPU_REGS_RAX);
+
+ /* Hyper-V hypercalls get bit 31 set in EAX */
+ if ((input & 0x80000000) &&
+ kvm_hv_hypercall_enabled(vcpu))
+ return kvm_hv_hypercall(vcpu);
+
+ longmode = is_64_bit_hypercall(vcpu);
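+	/*
+	 * Per the Xen ABI, 32-bit guests pass hypercall arguments in
+	 * %ebx..%ebp while 64-bit guests use %rdi, %rsi, %rdx, %r10, %r8, %r9.
+	 */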
+ if (!longmode) {
+ params[0] = (u32)kvm_rbx_read(vcpu);
+ params[1] = (u32)kvm_rcx_read(vcpu);
+ params[2] = (u32)kvm_rdx_read(vcpu);
+ params[3] = (u32)kvm_rsi_read(vcpu);
+ params[4] = (u32)kvm_rdi_read(vcpu);
+ params[5] = (u32)kvm_rbp_read(vcpu);
+ }
+#ifdef CONFIG_X86_64
+ else {
+ params[0] = (u64)kvm_rdi_read(vcpu);
+ params[1] = (u64)kvm_rsi_read(vcpu);
+ params[2] = (u64)kvm_rdx_read(vcpu);
+ params[3] = (u64)kvm_r10_read(vcpu);
+ params[4] = (u64)kvm_r8_read(vcpu);
+ params[5] = (u64)kvm_r9_read(vcpu);
+ }
+#endif
+ cpl = kvm_x86_call(get_cpl)(vcpu);
+ trace_kvm_xen_hypercall(cpl, input, params[0], params[1], params[2],
+ params[3], params[4], params[5]);
+
+ /*
+ * Only allow hypercall acceleration for CPL0. The rare hypercalls that
+ * are permitted in guest userspace can be handled by the VMM.
+ */
+ if (unlikely(cpl > 0))
+ goto handle_in_userspace;
+
+ switch (input) {
+ case __HYPERVISOR_xen_version:
+ if (params[0] == XENVER_version && vcpu->kvm->arch.xen.xen_version) {
+ r = vcpu->kvm->arch.xen.xen_version;
+ handled = true;
+ }
+ break;
+ case __HYPERVISOR_event_channel_op:
+ if (params[0] == EVTCHNOP_send)
+ handled = kvm_xen_hcall_evtchn_send(vcpu, params[1], &r);
+ break;
+ case __HYPERVISOR_sched_op:
+ handled = kvm_xen_hcall_sched_op(vcpu, longmode, params[0],
+ params[1], &r);
+ break;
+ case __HYPERVISOR_vcpu_op:
+ handled = kvm_xen_hcall_vcpu_op(vcpu, longmode, params[0], params[1],
+ params[2], &r);
+ break;
+ case __HYPERVISOR_set_timer_op: {
+ u64 timeout = params[0];
+ /* In 32-bit mode, the 64-bit timeout is in two 32-bit params. */
+ if (!longmode)
+ timeout |= params[1] << 32;
+ handled = kvm_xen_hcall_set_timer_op(vcpu, timeout, &r);
+ break;
+ }
+ default:
+ break;
+ }
+
+ if (handled)
+ return kvm_xen_hypercall_set_result(vcpu, r);
+
+handle_in_userspace:
+ vcpu->run->exit_reason = KVM_EXIT_XEN;
+ vcpu->run->xen.type = KVM_EXIT_XEN_HCALL;
+ vcpu->run->xen.u.hcall.longmode = longmode;
+ vcpu->run->xen.u.hcall.cpl = cpl;
+ vcpu->run->xen.u.hcall.input = input;
+ vcpu->run->xen.u.hcall.params[0] = params[0];
+ vcpu->run->xen.u.hcall.params[1] = params[1];
+ vcpu->run->xen.u.hcall.params[2] = params[2];
+ vcpu->run->xen.u.hcall.params[3] = params[3];
+ vcpu->run->xen.u.hcall.params[4] = params[4];
+ vcpu->run->xen.u.hcall.params[5] = params[5];
+ vcpu->arch.xen.hypercall_rip = kvm_get_linear_rip(vcpu);
+ vcpu->arch.complete_userspace_io =
+ kvm_xen_hypercall_complete_userspace;
+
+ return 0;
+}
+
+static void kvm_xen_check_poller(struct kvm_vcpu *vcpu, int port)
+{
+ int poll_evtchn = vcpu->arch.xen.poll_evtchn;
+
+ if ((poll_evtchn == port || poll_evtchn == -1) &&
+ test_and_clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask)) {
+ kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
+ kvm_vcpu_kick(vcpu);
+ }
+}
+
+/*
+ * The return value from this function is propagated to kvm_set_irq() API,
+ * so it returns:
+ * < 0 Interrupt was ignored (masked or not delivered for other reasons)
+ * = 0 Interrupt was coalesced (previous irq is still pending)
+ * > 0 Number of CPUs interrupt was delivered to
+ *
+ * It is also called directly from kvm_arch_set_irq_inatomic(), where the
+ * only check on its return value is a comparison with -EWOULDBLOCK.
+ */
+int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)
+{
+ struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
+ struct kvm_vcpu *vcpu;
+ unsigned long *pending_bits, *mask_bits;
+ unsigned long flags;
+ int port_word_bit;
+ bool kick_vcpu = false;
+ int vcpu_idx, idx, rc;
+
+ vcpu_idx = READ_ONCE(xe->vcpu_idx);
+ if (vcpu_idx >= 0)
+ vcpu = kvm_get_vcpu(kvm, vcpu_idx);
+ else {
+ vcpu = kvm_get_vcpu_by_id(kvm, xe->vcpu_id);
+ if (!vcpu)
+ return -EINVAL;
+ WRITE_ONCE(xe->vcpu_idx, vcpu->vcpu_idx);
+ }
+
+ if (xe->port >= max_evtchn_port(kvm))
+ return -EINVAL;
+
+ rc = -EWOULDBLOCK;
+
+ idx = srcu_read_lock(&kvm->srcu);
+
+ read_lock_irqsave(&gpc->lock, flags);
+ if (!kvm_gpc_check(gpc, PAGE_SIZE))
+ goto out_rcu;
+
+ if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
+ struct shared_info *shinfo = gpc->khva;
+ pending_bits = (unsigned long *)&shinfo->evtchn_pending;
+ mask_bits = (unsigned long *)&shinfo->evtchn_mask;
+ port_word_bit = xe->port / 64;
+ } else {
+ struct compat_shared_info *shinfo = gpc->khva;
+ pending_bits = (unsigned long *)&shinfo->evtchn_pending;
+ mask_bits = (unsigned long *)&shinfo->evtchn_mask;
+ port_word_bit = xe->port / 32;
+ }
+
+ /*
+ * If this port wasn't already set, and if it isn't masked, then
+ * we try to set the corresponding bit in the in-kernel shadow of
+ * evtchn_pending_sel for the target vCPU. And if *that* wasn't
+ * already set, then we kick the vCPU in question to write to the
+ * *real* evtchn_pending_sel in its own guest vcpu_info struct.
+ */
+ if (test_and_set_bit(xe->port, pending_bits)) {
+ rc = 0; /* It was already raised */
+ } else if (test_bit(xe->port, mask_bits)) {
+ rc = -ENOTCONN; /* Masked */
+ kvm_xen_check_poller(vcpu, xe->port);
+ } else {
+ rc = 1; /* Delivered to the bitmap in shared_info. */
+ /* Now switch to the vCPU's vcpu_info to set the index and pending_sel */
+ read_unlock_irqrestore(&gpc->lock, flags);
+ gpc = &vcpu->arch.xen.vcpu_info_cache;
+
+ read_lock_irqsave(&gpc->lock, flags);
+ if (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
+ /*
+ * Could not access the vcpu_info. Set the bit in-kernel
+ * and prod the vCPU to deliver it for itself.
+ */
+ if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel))
+ kick_vcpu = true;
+ goto out_rcu;
+ }
+
+ if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
+ struct vcpu_info *vcpu_info = gpc->khva;
+ if (!test_and_set_bit(port_word_bit, &vcpu_info->evtchn_pending_sel)) {
+ WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1);
+ kick_vcpu = true;
+ }
+ } else {
+ struct compat_vcpu_info *vcpu_info = gpc->khva;
+ if (!test_and_set_bit(port_word_bit,
+ (unsigned long *)&vcpu_info->evtchn_pending_sel)) {
+ WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1);
+ kick_vcpu = true;
+ }
+ }
+
+ /* For the per-vCPU lapic vector, deliver it as MSI. */
+ if (kick_vcpu && vcpu->arch.xen.upcall_vector) {
+ kvm_xen_inject_vcpu_vector(vcpu);
+ kick_vcpu = false;
+ }
+ }
+
+ out_rcu:
+ read_unlock_irqrestore(&gpc->lock, flags);
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ if (kick_vcpu) {
+ kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
+ kvm_vcpu_kick(vcpu);
+ }
+
+ return rc;
+}
+
+static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm)
+{
+ bool mm_borrowed = false;
+ int rc;
+
+ rc = kvm_xen_set_evtchn_fast(xe, kvm);
+ if (rc != -EWOULDBLOCK)
+ return rc;
+
+ if (current->mm != kvm->mm) {
+ /*
+ * If not on a thread which already belongs to this KVM,
+ * we'd better be in the irqfd workqueue.
+ */
+ if (WARN_ON_ONCE(current->mm))
+ return -EINVAL;
+
+ kthread_use_mm(kvm->mm);
+ mm_borrowed = true;
+ }
+
+ /*
+ * It is theoretically possible for the page to be unmapped
+ * and the MMU notifier to invalidate the shared_info before
+ * we even get to use it. In that case, this looks like an
+ * infinite loop. It was tempting to do it via the userspace
+ * HVA instead... but that just *hides* the fact that it's
+ * an infinite loop, because if a fault occurs and it waits
+ * for the page to come back, it can *still* immediately
+ * fault and have to wait again, repeatedly.
+ *
+ * Conversely, the page could also have been reinstated by
+	 * another thread before we even get here, so check again
+ * check again *first* before remapping it.
+ */
+ do {
+ struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
+ int idx;
+
+ rc = kvm_xen_set_evtchn_fast(xe, kvm);
+ if (rc != -EWOULDBLOCK)
+ break;
+
+ idx = srcu_read_lock(&kvm->srcu);
+ rc = kvm_gpc_refresh(gpc, PAGE_SIZE);
+ srcu_read_unlock(&kvm->srcu, idx);
+	} while (!rc);
+
+ if (mm_borrowed)
+ kthread_unuse_mm(kvm->mm);
+
+ return rc;
+}
+
+/* This is the version called from kvm_set_irq() as the .set function */
+static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status)
+{
+ if (!level)
+ return -EINVAL;
+
+ return kvm_xen_set_evtchn(&e->xen_evtchn, kvm);
+}
+
+/*
+ * Set up an event channel interrupt from the KVM IRQ routing table.
+ * Used for e.g. PIRQ from passed through physical devices.
+ */
+int kvm_xen_setup_evtchn(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e,
+ const struct kvm_irq_routing_entry *ue)
+{
+ struct kvm_vcpu *vcpu;
+
+ /*
+ * Don't check for the port being within range of max_evtchn_port().
+	 * Userspace can configure whatever targets it likes; events just won't
+ * be delivered if/while the target is invalid, just like userspace can
+ * configure MSIs which target non-existent APICs.
+ *
+	 * This allows the IRQ routing table to be restored on Live Migration
+	 * and Live Update *independently* of other things like creating vCPUs,
+	 * without imposing an ordering dependency on userspace. In this
+ * particular case, the problematic ordering would be with setting the
+ * Xen 'long mode' flag, which changes max_evtchn_port() to allow 4096
+ * instead of 1024 event channels.
+ */
+
+ /* We only support 2 level event channels for now */
+ if (ue->u.xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
+ return -EINVAL;
+
+ /*
+ * Xen gives us interesting mappings from vCPU index to APIC ID,
+ * which means kvm_get_vcpu_by_id() has to iterate over all vCPUs
+ * to find it. Do that once at setup time, instead of every time.
+ * But beware that on live update / live migration, the routing
+ * table might be reinstated before the vCPU threads have finished
+ * recreating their vCPUs.
+ */
+ vcpu = kvm_get_vcpu_by_id(kvm, ue->u.xen_evtchn.vcpu);
+ if (vcpu)
+ e->xen_evtchn.vcpu_idx = vcpu->vcpu_idx;
+ else
+ e->xen_evtchn.vcpu_idx = -1;
+
+ e->xen_evtchn.port = ue->u.xen_evtchn.port;
+ e->xen_evtchn.vcpu_id = ue->u.xen_evtchn.vcpu;
+ e->xen_evtchn.priority = ue->u.xen_evtchn.priority;
+ e->set = evtchn_set_fn;
+
+ return 0;
+}
+
+/*
+ * Explicit event sending from userspace with KVM_XEN_HVM_EVTCHN_SEND ioctl.
+ */
+int kvm_xen_hvm_evtchn_send(struct kvm *kvm, struct kvm_irq_routing_xen_evtchn *uxe)
+{
+ struct kvm_xen_evtchn e;
+ int ret;
+
+ if (!uxe->port || uxe->port >= max_evtchn_port(kvm))
+ return -EINVAL;
+
+ /* We only support 2 level event channels for now */
+ if (uxe->priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
+ return -EINVAL;
+
+ e.port = uxe->port;
+ e.vcpu_id = uxe->vcpu;
+ e.vcpu_idx = -1;
+ e.priority = uxe->priority;
+
+ ret = kvm_xen_set_evtchn(&e, kvm);
+
+ /*
+ * None of that 'return 1 if it actually got delivered' nonsense.
+ * We don't care if it was masked (-ENOTCONN) either.
+ */
+ if (ret > 0 || ret == -ENOTCONN)
+ ret = 0;
+
+ return ret;
+}
+
+/*
+ * Support for *outbound* event channel events via the EVTCHNOP_send hypercall.
+ */
+struct evtchnfd {
+ u32 send_port;
+ u32 type;
+ union {
+ struct kvm_xen_evtchn port;
+ struct {
+ u32 port; /* zero */
+ struct eventfd_ctx *ctx;
+ } eventfd;
+ } deliver;
+};
+
+/*
+ * Update target vCPU or priority for a registered sending channel.
+ */
+static int kvm_xen_eventfd_update(struct kvm *kvm,
+ struct kvm_xen_hvm_attr *data)
+{
+ u32 port = data->u.evtchn.send_port;
+ struct evtchnfd *evtchnfd;
+ int ret;
+
+ /* Protect writes to evtchnfd as well as the idr lookup. */
+ mutex_lock(&kvm->arch.xen.xen_lock);
+ evtchnfd = idr_find(&kvm->arch.xen.evtchn_ports, port);
+
+ ret = -ENOENT;
+ if (!evtchnfd)
+ goto out_unlock;
+
+ /* For an UPDATE, nothing may change except the priority/vcpu */
+ ret = -EINVAL;
+ if (evtchnfd->type != data->u.evtchn.type)
+ goto out_unlock;
+
+ /*
+ * Port cannot change, and if it's zero that was an eventfd
+ * which can't be changed either.
+ */
+ if (!evtchnfd->deliver.port.port ||
+ evtchnfd->deliver.port.port != data->u.evtchn.deliver.port.port)
+ goto out_unlock;
+
+ /* We only support 2 level event channels for now */
+ if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
+ goto out_unlock;
+
+ evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority;
+ if (evtchnfd->deliver.port.vcpu_id != data->u.evtchn.deliver.port.vcpu) {
+ evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu;
+ evtchnfd->deliver.port.vcpu_idx = -1;
+ }
+ ret = 0;
+out_unlock:
+ mutex_unlock(&kvm->arch.xen.xen_lock);
+ return ret;
+}
+
+/*
+ * Configure the target (eventfd or local port delivery) for sending on
+ * a given event channel.
+ */
+static int kvm_xen_eventfd_assign(struct kvm *kvm,
+ struct kvm_xen_hvm_attr *data)
+{
+ u32 port = data->u.evtchn.send_port;
+ struct eventfd_ctx *eventfd = NULL;
+ struct evtchnfd *evtchnfd;
+ int ret = -EINVAL;
+
+ evtchnfd = kzalloc(sizeof(struct evtchnfd), GFP_KERNEL);
+ if (!evtchnfd)
+ return -ENOMEM;
+
+	switch (data->u.evtchn.type) {
+ case EVTCHNSTAT_ipi:
+ /* IPI must map back to the same port# */
+ if (data->u.evtchn.deliver.port.port != data->u.evtchn.send_port)
+ goto out_noeventfd; /* -EINVAL */
+ break;
+
+ case EVTCHNSTAT_interdomain:
+ if (data->u.evtchn.deliver.port.port) {
+ if (data->u.evtchn.deliver.port.port >= max_evtchn_port(kvm))
+ goto out_noeventfd; /* -EINVAL */
+ } else {
+ eventfd = eventfd_ctx_fdget(data->u.evtchn.deliver.eventfd.fd);
+ if (IS_ERR(eventfd)) {
+ ret = PTR_ERR(eventfd);
+ goto out_noeventfd;
+ }
+ }
+ break;
+
+ case EVTCHNSTAT_virq:
+ case EVTCHNSTAT_closed:
+ case EVTCHNSTAT_unbound:
+ case EVTCHNSTAT_pirq:
+ default: /* Unknown event channel type */
+ goto out; /* -EINVAL */
+ }
+
+ evtchnfd->send_port = data->u.evtchn.send_port;
+ evtchnfd->type = data->u.evtchn.type;
+ if (eventfd) {
+ evtchnfd->deliver.eventfd.ctx = eventfd;
+ } else {
+ /* We only support 2 level event channels for now */
+ if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
+ goto out; /* -EINVAL; */
+
+ evtchnfd->deliver.port.port = data->u.evtchn.deliver.port.port;
+ evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu;
+ evtchnfd->deliver.port.vcpu_idx = -1;
+ evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority;
+ }
+
+ mutex_lock(&kvm->arch.xen.xen_lock);
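+	/*
+	 * Allocate exactly the requested port number; -ENOSPC from idr_alloc()
+	 * therefore means the port is already in use, which is reported to
+	 * userspace as -EEXIST below.
+	 */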
+ ret = idr_alloc(&kvm->arch.xen.evtchn_ports, evtchnfd, port, port + 1,
+ GFP_KERNEL);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
+ if (ret >= 0)
+ return 0;
+
+ if (ret == -ENOSPC)
+ ret = -EEXIST;
+out:
+ if (eventfd)
+ eventfd_ctx_put(eventfd);
+out_noeventfd:
+ kfree(evtchnfd);
+ return ret;
+}
+
+static int kvm_xen_eventfd_deassign(struct kvm *kvm, u32 port)
+{
+ struct evtchnfd *evtchnfd;
+
+ mutex_lock(&kvm->arch.xen.xen_lock);
+ evtchnfd = idr_remove(&kvm->arch.xen.evtchn_ports, port);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
+
+ if (!evtchnfd)
+ return -ENOENT;
+
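+	/*
+	 * Wait for any in-flight readers of the evtchnfd (via kvm->srcu, see
+	 * kvm_xen_hcall_evtchn_send()) before freeing it.
+	 */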
+ synchronize_srcu(&kvm->srcu);
+ if (!evtchnfd->deliver.port.port)
+ eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx);
+ kfree(evtchnfd);
+ return 0;
+}
+
+static int kvm_xen_eventfd_reset(struct kvm *kvm)
+{
+ struct evtchnfd *evtchnfd, **all_evtchnfds;
+ int i;
+ int n = 0;
+
+ mutex_lock(&kvm->arch.xen.xen_lock);
+
+ /*
+ * Because synchronize_srcu() cannot be called inside the
+ * critical section, first collect all the evtchnfd objects
+ * in an array as they are removed from evtchn_ports.
+ */
+ idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i)
+ n++;
+
+ all_evtchnfds = kmalloc_array(n, sizeof(struct evtchnfd *), GFP_KERNEL);
+ if (!all_evtchnfds) {
+ mutex_unlock(&kvm->arch.xen.xen_lock);
+ return -ENOMEM;
+ }
+
+ n = 0;
+ idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) {
+ all_evtchnfds[n++] = evtchnfd;
+ idr_remove(&kvm->arch.xen.evtchn_ports, evtchnfd->send_port);
+ }
+ mutex_unlock(&kvm->arch.xen.xen_lock);
+
+ synchronize_srcu(&kvm->srcu);
+
+ while (n--) {
+ evtchnfd = all_evtchnfds[n];
+ if (!evtchnfd->deliver.port.port)
+ eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx);
+ kfree(evtchnfd);
+ }
+ kfree(all_evtchnfds);
+
+ return 0;
+}
+
+static int kvm_xen_setattr_evtchn(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
+{
+ u32 port = data->u.evtchn.send_port;
+
+ if (data->u.evtchn.flags == KVM_XEN_EVTCHN_RESET)
+ return kvm_xen_eventfd_reset(kvm);
+
+ if (!port || port >= max_evtchn_port(kvm))
+ return -EINVAL;
+
+ if (data->u.evtchn.flags == KVM_XEN_EVTCHN_DEASSIGN)
+ return kvm_xen_eventfd_deassign(kvm, port);
+ if (data->u.evtchn.flags == KVM_XEN_EVTCHN_UPDATE)
+ return kvm_xen_eventfd_update(kvm, data);
+ if (data->u.evtchn.flags)
+ return -EINVAL;
+
+ return kvm_xen_eventfd_assign(kvm, data);
+}
+
+static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r)
+{
+ struct evtchnfd *evtchnfd;
+ struct evtchn_send send;
+ struct x86_exception e;
+
+ /* Sanity check: this structure is the same for 32-bit and 64-bit */
+ BUILD_BUG_ON(sizeof(send) != 4);
+ if (kvm_read_guest_virt(vcpu, param, &send, sizeof(send), &e)) {
+ *r = -EFAULT;
+ return true;
+ }
+
+ /*
+ * evtchnfd is protected by kvm->srcu; the idr lookup instead
+ * is protected by RCU.
+ */
+ rcu_read_lock();
+ evtchnfd = idr_find(&vcpu->kvm->arch.xen.evtchn_ports, send.port);
+ rcu_read_unlock();
+ if (!evtchnfd)
+ return false;
+
+ if (evtchnfd->deliver.port.port) {
+ int ret = kvm_xen_set_evtchn(&evtchnfd->deliver.port, vcpu->kvm);
+ if (ret < 0 && ret != -ENOTCONN)
+ return false;
+ } else {
+ eventfd_signal(evtchnfd->deliver.eventfd.ctx);
+ }
+
+ *r = 0;
+ return true;
+}
+
+void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.xen.vcpu_id = vcpu->vcpu_idx;
+ vcpu->arch.xen.poll_evtchn = 0;
+
+ timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0);
+ hrtimer_setup(&vcpu->arch.xen.timer, xen_timer_callback, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS_HARD);
+
+ kvm_gpc_init(&vcpu->arch.xen.runstate_cache, vcpu->kvm);
+ kvm_gpc_init(&vcpu->arch.xen.runstate2_cache, vcpu->kvm);
+ kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache, vcpu->kvm);
+ kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache, vcpu->kvm);
+}
+
+void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
+{
+ if (kvm_xen_timer_enabled(vcpu))
+ kvm_xen_stop_timer(vcpu);
+
+ kvm_gpc_deactivate(&vcpu->arch.xen.runstate_cache);
+ kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache);
+ kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
+ kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_time_info_cache);
+
+ timer_delete_sync(&vcpu->arch.xen.poll_timer);
+}
+
+void kvm_xen_init_vm(struct kvm *kvm)
+{
+ mutex_init(&kvm->arch.xen.xen_lock);
+ idr_init(&kvm->arch.xen.evtchn_ports);
+ kvm_gpc_init(&kvm->arch.xen.shinfo_cache, kvm);
+}
+
+void kvm_xen_destroy_vm(struct kvm *kvm)
+{
+ struct evtchnfd *evtchnfd;
+ int i;
+
+ kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache);
+
+ idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) {
+ if (!evtchnfd->deliver.port.port)
+ eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx);
+ kfree(evtchnfd);
+ }
+ idr_destroy(&kvm->arch.xen.evtchn_ports);
+
+ if (kvm->arch.xen.hvm_config.msr)
+ static_branch_slow_dec_deferred(&kvm_xen_enabled);
+}
diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h
new file mode 100644
index 000000000000..59e6128a7bd3
--- /dev/null
+++ b/arch/x86/kvm/xen.h
@@ -0,0 +1,264 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright © 2019 Oracle and/or its affiliates. All rights reserved.
+ * Copyright © 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * KVM Xen emulation
+ */
+
+#ifndef __ARCH_X86_KVM_XEN_H__
+#define __ARCH_X86_KVM_XEN_H__
+
+#include <asm/xen/cpuid.h>
+#include <asm/xen/hypervisor.h>
+
+#ifdef CONFIG_KVM_XEN
+#include <linux/jump_label_ratelimit.h>
+
+extern struct static_key_false_deferred kvm_xen_enabled;
+
+int __kvm_xen_has_interrupt(struct kvm_vcpu *vcpu);
+void kvm_xen_inject_pending_events(struct kvm_vcpu *vcpu);
+void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *vcpu);
+int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data);
+int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data);
+int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
+int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
+int kvm_xen_hvm_evtchn_send(struct kvm *kvm, struct kvm_irq_routing_xen_evtchn *evt);
+int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data);
+int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc);
+void kvm_xen_init_vm(struct kvm *kvm);
+void kvm_xen_destroy_vm(struct kvm *kvm);
+void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu);
+void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu);
+int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe,
+ struct kvm *kvm);
+int kvm_xen_setup_evtchn(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e,
+ const struct kvm_irq_routing_entry *ue);
+
+static inline void kvm_xen_sw_enable_lapic(struct kvm_vcpu *vcpu)
+{
+ /*
+ * The local APIC is being enabled. If the per-vCPU upcall vector is
+ * set and the vCPU's evtchn_upcall_pending flag is set, inject the
+ * interrupt.
+ */
+ if (static_branch_unlikely(&kvm_xen_enabled.key) &&
+ vcpu->arch.xen.vcpu_info_cache.active &&
+ vcpu->arch.xen.upcall_vector && __kvm_xen_has_interrupt(vcpu))
+ kvm_xen_inject_vcpu_vector(vcpu);
+}
+
+static inline bool kvm_xen_is_tsc_leaf(struct kvm_vcpu *vcpu, u32 function)
+{
+ return static_branch_unlikely(&kvm_xen_enabled.key) &&
+ vcpu->arch.xen.cpuid.base &&
+ function <= vcpu->arch.xen.cpuid.limit &&
+ function == (vcpu->arch.xen.cpuid.base | XEN_CPUID_LEAF(3));
+}
+
+static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
+{
+ return static_branch_unlikely(&kvm_xen_enabled.key) &&
+ kvm->arch.xen.hvm_config.msr;
+}
+
+static inline bool kvm_xen_is_hypercall_page_msr(struct kvm *kvm, u32 msr)
+{
+ if (!static_branch_unlikely(&kvm_xen_enabled.key))
+ return false;
+
+ return msr && msr == kvm->arch.xen.hvm_config.msr;
+}
+
+static inline bool kvm_xen_hypercall_enabled(struct kvm *kvm)
+{
+ return static_branch_unlikely(&kvm_xen_enabled.key) &&
+ (kvm->arch.xen.hvm_config.flags &
+ KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL);
+}
+
+static inline int kvm_xen_has_interrupt(struct kvm_vcpu *vcpu)
+{
+ if (static_branch_unlikely(&kvm_xen_enabled.key) &&
+ vcpu->arch.xen.vcpu_info_cache.active &&
+ vcpu->kvm->arch.xen.upcall_vector)
+ return __kvm_xen_has_interrupt(vcpu);
+
+ return 0;
+}
+
+static inline bool kvm_xen_has_pending_events(struct kvm_vcpu *vcpu)
+{
+ return static_branch_unlikely(&kvm_xen_enabled.key) &&
+ vcpu->arch.xen.evtchn_pending_sel;
+}
+
+static inline bool kvm_xen_timer_enabled(struct kvm_vcpu *vcpu)
+{
+ return !!vcpu->arch.xen.timer_virq;
+}
+
+static inline int kvm_xen_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+ if (kvm_xen_hypercall_enabled(vcpu->kvm) && kvm_xen_timer_enabled(vcpu))
+ return atomic_read(&vcpu->arch.xen.timer_pending);
+
+ return 0;
+}
+
+void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu);
+#else
+static inline int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
+{
+ return 1;
+}
+
+static inline void kvm_xen_init_vm(struct kvm *kvm)
+{
+}
+
+static inline void kvm_xen_destroy_vm(struct kvm *kvm)
+{
+}
+
+static inline void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline void kvm_xen_sw_enable_lapic(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
+{
+ return false;
+}
+
+static inline bool kvm_xen_is_hypercall_page_msr(struct kvm *kvm, u32 msr)
+{
+ return false;
+}
+
+static inline bool kvm_xen_hypercall_enabled(struct kvm *kvm)
+{
+ return false;
+}
+
+static inline int kvm_xen_has_interrupt(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
+static inline void kvm_xen_inject_pending_events(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline bool kvm_xen_has_pending_events(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
+
+static inline int kvm_xen_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
+static inline void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline bool kvm_xen_timer_enabled(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
+
+static inline bool kvm_xen_is_tsc_leaf(struct kvm_vcpu *vcpu, u32 function)
+{
+ return false;
+}
+#endif
+
+int kvm_xen_hypercall(struct kvm_vcpu *vcpu);
+
+#include <asm/pvclock-abi.h>
+#include <asm/xen/interface.h>
+#include <xen/interface/vcpu.h>
+
+void kvm_xen_update_runstate(struct kvm_vcpu *vcpu, int state);
+
+static inline void kvm_xen_runstate_set_running(struct kvm_vcpu *vcpu)
+{
+ kvm_xen_update_runstate(vcpu, RUNSTATE_running);
+}
+
+static inline void kvm_xen_runstate_set_preempted(struct kvm_vcpu *vcpu)
+{
+ /*
+ * If the vCPU wasn't preempted but took a normal exit for
+ * some reason (hypercalls, I/O, etc.), that is accounted as
+ * still RUNSTATE_running, as the VMM is still operating on
+ * behalf of the vCPU. Only if the VMM does actually block
+ * does it need to enter RUNSTATE_blocked.
+ */
+ if (WARN_ON_ONCE(!vcpu->preempted))
+ return;
+
+ kvm_xen_update_runstate(vcpu, RUNSTATE_runnable);
+}
+
+/* 32-bit compatibility definitions, also used natively in 32-bit build */
+struct compat_arch_vcpu_info {
+ unsigned int cr2;
+ unsigned int pad[5];
+};
+
+struct compat_vcpu_info {
+ uint8_t evtchn_upcall_pending;
+ uint8_t evtchn_upcall_mask;
+ uint16_t pad;
+ uint32_t evtchn_pending_sel;
+ struct compat_arch_vcpu_info arch;
+ struct pvclock_vcpu_time_info time;
+}; /* 64 bytes (x86) */
+
+struct compat_arch_shared_info {
+ unsigned int max_pfn;
+ unsigned int pfn_to_mfn_frame_list_list;
+ unsigned int nmi_reason;
+ unsigned int p2m_cr3;
+ unsigned int p2m_vaddr;
+ unsigned int p2m_generation;
+ uint32_t wc_sec_hi;
+};
+
+struct compat_shared_info {
+ struct compat_vcpu_info vcpu_info[MAX_VIRT_CPUS];
+ uint32_t evtchn_pending[32];
+ uint32_t evtchn_mask[32];
+ struct pvclock_wall_clock wc;
+ struct compat_arch_shared_info arch;
+};
+
+#define COMPAT_EVTCHN_2L_NR_CHANNELS (8 * \
+ sizeof_field(struct compat_shared_info, \
+ evtchn_pending))
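Spelling out the arithmetic: evtchn_pending in struct compat_shared_info above is uint32_t[32], i.e. 128 bytes, so this macro comes to 8 * 128 = 1024 — the 2-level event-channel limit a 32-bit guest sees. A hypothetical compile-time check would pin that down:

/* Hypothetical sanity check: 32 words of 32 bits each = 1024 channel bits. */
static_assert(COMPAT_EVTCHN_2L_NR_CHANNELS == 1024,
	      "2-level event channel limit for 32-bit guests");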
+struct compat_vcpu_runstate_info {
+ int state;
+ uint64_t state_entry_time;
+ uint64_t time[4];
+} __attribute__((packed));
+
+struct compat_sched_poll {
+ /* This is actually a guest virtual address which points to ports. */
+ uint32_t ports;
+ unsigned int nr_ports;
+ uint64_t timeout;
+};
+
+#endif /* __ARCH_X86_KVM_XEN_H__ */
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
deleted file mode 100644
index 38718041efc3..000000000000
--- a/arch/x86/lguest/Kconfig
+++ /dev/null
@@ -1,14 +0,0 @@
-config LGUEST_GUEST
- bool "Lguest guest support"
- select PARAVIRT
- depends on X86_32
- select VIRTIO
- select VIRTIO_RING
- select VIRTIO_CONSOLE
- help
- Lguest is a tiny in-kernel hypervisor. Selecting this will
- allow your kernel to boot under lguest. This option will increase
- your kernel size by about 6k. If in doubt, say N.
-
- If you say Y here, make sure you say Y (or M) to the virtio block
- and net drivers which lguest needs.
diff --git a/arch/x86/lguest/Makefile b/arch/x86/lguest/Makefile
deleted file mode 100644
index 94e0e54056a9..000000000000
--- a/arch/x86/lguest/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-obj-y := i386_head.o boot.o
-CFLAGS_boot.o := $(call cc-option, -fno-stack-protector)
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
deleted file mode 100644
index d677fa9ca650..000000000000
--- a/arch/x86/lguest/boot.c
+++ /dev/null
@@ -1,1429 +0,0 @@
-/*P:010
- * A hypervisor allows multiple Operating Systems to run on a single machine.
- * To quote David Wheeler: "Any problem in computer science can be solved with
- * another layer of indirection."
- *
- * We keep things simple in two ways. First, we start with a normal Linux
- * kernel and insert a module (lg.ko) which allows us to run other Linux
- * kernels the same way we'd run processes. We call the first kernel the Host,
- * and the others the Guests. The program which sets up and configures Guests
- * (such as the example in Documentation/lguest/lguest.c) is called the
- * Launcher.
- *
- * Secondly, we only run specially modified Guests, not normal kernels: setting
- * CONFIG_LGUEST_GUEST to "y" compiles this file into the kernel so it knows
- * how to be a Guest at boot time. This means that you can use the same kernel
- * you boot normally (ie. as a Host) as a Guest.
- *
- * These Guests know that they cannot do privileged operations, such as disable
- * interrupts, and that they have to ask the Host to do such things explicitly.
- * This file consists of all the replacements for such low-level native
- * hardware operations: these special Guest versions call the Host.
- *
- * So how does the kernel know it's a Guest? We'll see that later, but let's
- * just say that we end up here where we replace the native functions in the
- * various "paravirt" structures with our Guest versions, then boot like normal.
-:*/
-
-/*
- * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-#include <linux/kernel.h>
-#include <linux/start_kernel.h>
-#include <linux/string.h>
-#include <linux/console.h>
-#include <linux/screen_info.h>
-#include <linux/irq.h>
-#include <linux/interrupt.h>
-#include <linux/clocksource.h>
-#include <linux/clockchips.h>
-#include <linux/lguest.h>
-#include <linux/lguest_launcher.h>
-#include <linux/virtio_console.h>
-#include <linux/pm.h>
-#include <asm/apic.h>
-#include <asm/lguest.h>
-#include <asm/paravirt.h>
-#include <asm/param.h>
-#include <asm/page.h>
-#include <asm/pgtable.h>
-#include <asm/desc.h>
-#include <asm/setup.h>
-#include <asm/e820.h>
-#include <asm/mce.h>
-#include <asm/io.h>
-#include <asm/i387.h>
-#include <asm/stackprotector.h>
-#include <asm/reboot.h> /* for struct machine_ops */
-
-/*G:010 Welcome to the Guest!
- *
- * The Guest in our tale is a simple creature: identical to the Host but
- * behaving in simplified but equivalent ways. In particular, the Guest is the
- * same kernel as the Host (or at least, built from the same source code).
-:*/
-
-struct lguest_data lguest_data = {
- .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF },
- .noirq_start = (u32)lguest_noirq_start,
- .noirq_end = (u32)lguest_noirq_end,
- .kernel_address = PAGE_OFFSET,
- .blocked_interrupts = { 1 }, /* Block timer interrupts */
- .syscall_vec = SYSCALL_VECTOR,
-};
-
-/*G:037
- * async_hcall() is pretty simple: I'm quite proud of it really. We have a
- * ring buffer of stored hypercalls which the Host will run through next time we
- * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall
- * arguments, and a "hcall_status" word which is 0 if the call is ready to go,
- * and 255 once the Host has finished with it.
- *
- * If we come around to a slot which hasn't been finished, then the table is
- * full and we just make the hypercall directly. This has the nice side
- * effect of causing the Host to run all the stored calls in the ring buffer
- * which empties it for next time!
- */
-static void async_hcall(unsigned long call, unsigned long arg1,
- unsigned long arg2, unsigned long arg3,
- unsigned long arg4)
-{
- /* Note: This code assumes we're uniprocessor. */
- static unsigned int next_call;
- unsigned long flags;
-
- /*
- * Disable interrupts if not already disabled: we don't want an
- * interrupt handler making a hypercall while we're already doing
- * one!
- */
- local_irq_save(flags);
- if (lguest_data.hcall_status[next_call] != 0xFF) {
- /* Table full, so do normal hcall which will flush table. */
- kvm_hypercall4(call, arg1, arg2, arg3, arg4);
- } else {
- lguest_data.hcalls[next_call].arg0 = call;
- lguest_data.hcalls[next_call].arg1 = arg1;
- lguest_data.hcalls[next_call].arg2 = arg2;
- lguest_data.hcalls[next_call].arg3 = arg3;
- lguest_data.hcalls[next_call].arg4 = arg4;
- /* Arguments must all be written before we mark it to go */
- wmb();
- lguest_data.hcall_status[next_call] = 0;
- if (++next_call == LHCALL_RING_SIZE)
- next_call = 0;
- }
- local_irq_restore(flags);
-}
-
-/*G:035
- * Notice the lazy_hcall() above, rather than hcall(). This is our first real
- * optimization trick!
- *
- * When lazy_mode is set, it means we're allowed to defer all hypercalls and do
- * them as a batch when lazy_mode is eventually turned off. Because hypercalls
- * are reasonably expensive, batching them up makes sense. For example, a
- * large munmap might update dozens of page table entries: that code calls
- * paravirt_enter_lazy_mmu(), does the dozen updates, then calls
- * lguest_leave_lazy_mode().
- *
- * So, when we're in lazy mode, we call async_hcall() to store the call for
- * future processing:
- */
-static void lazy_hcall1(unsigned long call,
- unsigned long arg1)
-{
- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
- kvm_hypercall1(call, arg1);
- else
- async_hcall(call, arg1, 0, 0, 0);
-}
-
-/* You can imagine what lazy_hcall2, 3 and 4 look like. :*/
-static void lazy_hcall2(unsigned long call,
- unsigned long arg1,
- unsigned long arg2)
-{
- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
- kvm_hypercall2(call, arg1, arg2);
- else
- async_hcall(call, arg1, arg2, 0, 0);
-}
-
-static void lazy_hcall3(unsigned long call,
- unsigned long arg1,
- unsigned long arg2,
- unsigned long arg3)
-{
- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
- kvm_hypercall3(call, arg1, arg2, arg3);
- else
- async_hcall(call, arg1, arg2, arg3, 0);
-}
-
-#ifdef CONFIG_X86_PAE
-static void lazy_hcall4(unsigned long call,
- unsigned long arg1,
- unsigned long arg2,
- unsigned long arg3,
- unsigned long arg4)
-{
- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
- kvm_hypercall4(call, arg1, arg2, arg3, arg4);
- else
- async_hcall(call, arg1, arg2, arg3, arg4);
-}
-#endif
-
-/*G:036
- * When lazy mode is turned off reset the per-cpu lazy mode variable and then
- * issue the do-nothing hypercall to flush any stored calls.
-:*/
-static void lguest_leave_lazy_mmu_mode(void)
-{
- kvm_hypercall0(LHCALL_FLUSH_ASYNC);
- paravirt_leave_lazy_mmu();
-}
-
-static void lguest_end_context_switch(struct task_struct *next)
-{
- kvm_hypercall0(LHCALL_FLUSH_ASYNC);
- paravirt_end_context_switch(next);
-}
-
-/*G:032
- * After that diversion we return to our first native-instruction
- * replacements: four functions for interrupt control.
- *
- * The simplest way of implementing these would be to have "turn interrupts
- * off" and "turn interrupts on" hypercalls. Unfortunately, this is too slow:
- * these are by far the most commonly called functions of those we override.
- *
- * So instead we keep an "irq_enabled" field inside our "struct lguest_data",
- * which the Guest can update with a single instruction. The Host knows to
- * check there before it tries to deliver an interrupt.
- */
-
-/*
- * save_flags() is expected to return the processor state (ie. "flags"). The
- * flags word contains all kind of stuff, but in practice Linux only cares
- * about the interrupt flag. Our "save_flags()" just returns that.
- */
-static unsigned long save_fl(void)
-{
- return lguest_data.irq_enabled;
-}
-
-/* Interrupts go off... */
-static void irq_disable(void)
-{
- lguest_data.irq_enabled = 0;
-}
-
-/*
- * Let's pause a moment. Remember how I said these are called so often?
- * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to
- * break some rules. In particular, these functions are assumed to save their
- * own registers if they need to: normal C functions assume they can trash the
- * eax register. To use normal C functions, we use
- * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the
- * C function, then restores it.
- */
-PV_CALLEE_SAVE_REGS_THUNK(save_fl);
-PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
-/*:*/
-
-/* These are in i386_head.S */
-extern void lg_irq_enable(void);
-extern void lg_restore_fl(unsigned long flags);
-
-/*M:003
- * We could be more efficient in our checking of outstanding interrupts, rather
- * than using a branch. One way would be to put the "irq_enabled" field in a
- * page by itself, and have the Host write-protect it when an interrupt comes
- * in when irqs are disabled. There will then be a page fault as soon as
- * interrupts are re-enabled.
- *
- * A better method is to implement soft interrupt disable generally for x86:
- * instead of disabling interrupts, we set a flag. If an interrupt does come
- * in, we then disable them for real. This is uncommon, so we could simply use
- * a hypercall for interrupt control and not worry about efficiency.
-:*/
-
-/*G:034
- * The Interrupt Descriptor Table (IDT).
- *
- * The IDT tells the processor what to do when an interrupt comes in. Each
- * entry in the table is a 64-bit descriptor: this holds the privilege level,
- * address of the handler, and... well, who cares? The Guest just asks the
- * Host to make the change anyway, because the Host controls the real IDT.
- */
-static void lguest_write_idt_entry(gate_desc *dt,
- int entrynum, const gate_desc *g)
-{
- /*
- * The gate_desc structure is 8 bytes long: we hand it to the Host in
- * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors
- * around like this; typesafety wasn't a big concern in Linux's early
- * years.
- */
- u32 *desc = (u32 *)g;
- /* Keep the local copy up to date. */
- native_write_idt_entry(dt, entrynum, g);
- /* Tell Host about this new entry. */
- kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]);
-}
-
-/*
- * Changing to a different IDT is very rare: we keep the IDT up-to-date every
- * time it is written, so we can simply loop through all entries and tell the
- * Host about them.
- */
-static void lguest_load_idt(const struct desc_ptr *desc)
-{
- unsigned int i;
- struct desc_struct *idt = (void *)desc->address;
-
- for (i = 0; i < (desc->size+1)/8; i++)
- kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b);
-}
-
-/*
- * The Global Descriptor Table.
- *
- * The Intel architecture defines another table, called the Global Descriptor
- * Table (GDT). You tell the CPU where it is (and its size) using the "lgdt"
- * instruction, and then several other instructions refer to entries in the
- * table. There are three entries which the Switcher needs, so the Host simply
- * controls the entire thing and the Guest asks it to make changes using the
- * LOAD_GDT hypercall.
- *
- * This is exactly like the IDT code.
- */
-static void lguest_load_gdt(const struct desc_ptr *desc)
-{
- unsigned int i;
- struct desc_struct *gdt = (void *)desc->address;
-
- for (i = 0; i < (desc->size+1)/8; i++)
- kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b);
-}
-
-/*
- * For a single GDT entry which changes, we do the lazy thing: alter our GDT,
- * then tell the Host to reload the entire thing. This operation is so rare
- * that this naive implementation is reasonable.
- */
-static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum,
- const void *desc, int type)
-{
- native_write_gdt_entry(dt, entrynum, desc, type);
- /* Tell Host about this new entry. */
- kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, entrynum,
- dt[entrynum].a, dt[entrynum].b);
-}
-
-/*
- * OK, I lied. There are three "thread local storage" GDT entries which change
- * on every context switch (these three entries are how glibc implements
- * __thread variables). So we have a hypercall specifically for this case.
- */
-static void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
-{
- /*
- * There's one problem which normal hardware doesn't have: the Host
- * can't handle us removing entries we're currently using. So we clear
- * the GS register here: if it's needed it'll be reloaded anyway.
- */
- lazy_load_gs(0);
- lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu);
-}
-
-/*G:038
- * That's enough excitement for now, back to ploughing through each of the
- * different pv_ops structures (we're about 1/3 of the way through).
- *
- * This is the Local Descriptor Table, another weird Intel thingy. Linux only
- * uses this for some strange applications like Wine. We don't do anything
- * here, so they'll get an informative and friendly Segmentation Fault.
- */
-static void lguest_set_ldt(const void *addr, unsigned entries)
-{
-}
-
-/*
- * This loads a GDT entry into the "Task Register": that entry points to a
- * structure called the Task State Segment. Some comments scattered through the
- * kernel code indicate that this was used for task switching in ages past, along
- * with blood sacrifice and astrology.
- *
- * Now there's nothing interesting in here that we don't get told elsewhere.
- * But the native version uses the "ltr" instruction, which makes the Host
- * complain to the Guest about a Segmentation Fault and it'll oops. So we
- * override the native version with a do-nothing version.
- */
-static void lguest_load_tr_desc(void)
-{
-}
-
-/*
- * The "cpuid" instruction is a way of querying both the CPU identity
- * (manufacturer, model, etc) and its features. It was introduced before the
- * Pentium in 1993 and keeps getting extended by Intel, AMD and others.
- * As you might imagine, after a decade and a half of this treatment, it is now a
- * giant ball of hair. Its entry in the current Intel manual runs to 28 pages.
- *
- * This instruction even has its own Wikipedia entry. The Wikipedia entry
- * has been translated into 5 languages. I am not making this up!
- *
- * We could get funky here and identify ourselves as "GenuineLguest", but
- * instead we just use the real "cpuid" instruction. Then I pretty much turned
- * off feature bits until the Guest booted. (Don't say that: you'll damage
- * lguest sales!) Shut up, inner voice! (Hey, just pointing out that this is
- * hardly future proof.) Noone's listening! They don't like you anyway,
- * parenthetic weirdo!
- *
- * Replacing the cpuid so we can turn features off is great for the kernel, but
- * anyone (including userspace) can just use the raw "cpuid" instruction and
- * the Host won't even notice since it isn't privileged. So we try not to get
- * too worked up about it.
- */
-static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
- unsigned int *cx, unsigned int *dx)
-{
- int function = *ax;
-
- native_cpuid(ax, bx, cx, dx);
- switch (function) {
- /*
- * CPUID 0 gives the highest legal CPUID number (and the ID string).
- * We futureproof our code a little by sticking to known CPUID values.
- */
- case 0:
- if (*ax > 5)
- *ax = 5;
- break;
-
- /*
- * CPUID 1 is a basic feature request.
- *
- * CX: we only allow kernel to see SSE3, CMPXCHG16B and SSSE3
- * DX: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU and PAE.
- */
- case 1:
- *cx &= 0x00002201;
- *dx &= 0x07808151;
- /*
- * The Host can do a nice optimization if it knows that the
- * kernel mappings (addresses above 0xC0000000 or whatever
- * PAGE_OFFSET is set to) haven't changed. But Linux calls
- * flush_tlb_user() for both user and kernel mappings unless
- * the Page Global Enable (PGE) feature bit is set.
- */
- *dx |= 0x00002000;
- /*
- * We also lie, and say we're family id 5. 6 or greater
- * leads to a rdmsr in early_init_intel which we can't handle.
- * Family ID is returned as bits 8-12 in ax.
- */
- *ax &= 0xFFFFF0FF;
- *ax |= 0x00000500;
- break;
- /*
- * 0x80000000 returns the highest Extended Function, so we futureproof
- * like we do above by limiting it to known fields.
- */
- case 0x80000000:
- if (*ax > 0x80000008)
- *ax = 0x80000008;
- break;
-
- /*
- * PAE systems can mark pages as non-executable. Linux calls this the
- * NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced
- * Virus Protection). We just turn it off here, since we don't
- * support it.
- */
- case 0x80000001:
- *dx &= ~(1 << 20);
- break;
- }
-}
-
-/*
- * Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4.
- * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother
- * it. The Host needs to know when the Guest wants to change them, so we have
- * a whole series of functions like read_cr0() and write_cr0().
- *
- * We start with cr0. cr0 allows you to turn on and off all kinds of basic
- * features, but Linux only really cares about one: the horrifically-named Task
- * Switched (TS) bit at bit 3 (ie. 8)
- *
- * What does the TS bit do? Well, it causes the CPU to trap (interrupt 7) if
- * the floating point unit is used. Which allows us to restore FPU state
- * lazily after a task switch, and Linux uses that gratefully, but wouldn't a
- * name like "FPUTRAP bit" be a little less cryptic?
- *
- * We store cr0 locally because the Host never changes it. The Guest sometimes
- * wants to read it and we'd prefer not to bother the Host unnecessarily.
- */
-static unsigned long current_cr0;
-static void lguest_write_cr0(unsigned long val)
-{
- lazy_hcall1(LHCALL_TS, val & X86_CR0_TS);
- current_cr0 = val;
-}
-
-static unsigned long lguest_read_cr0(void)
-{
- return current_cr0;
-}
-
-/*
- * Intel provided a special instruction to clear the TS bit for people too cool
- * to use write_cr0() to do it. This "clts" instruction is faster, because all
- * the vowels have been optimized out.
- */
-static void lguest_clts(void)
-{
- lazy_hcall1(LHCALL_TS, 0);
- current_cr0 &= ~X86_CR0_TS;
-}
-
-/*
- * cr2 is the virtual address of the last page fault, which the Guest only ever
- * reads. The Host kindly writes this into our "struct lguest_data", so we
- * just read it out of there.
- */
-static unsigned long lguest_read_cr2(void)
-{
- return lguest_data.cr2;
-}
-
-/* See lguest_set_pte() below. */
-static bool cr3_changed = false;
-
-/*
- * cr3 is the current toplevel pagetable page: the principle is the same as
- * cr0. Keep a local copy, and tell the Host when it changes. The only
- * difference is that our local copy is in lguest_data because the Host needs
- * to set it upon our initial hypercall.
- */
-static void lguest_write_cr3(unsigned long cr3)
-{
- lguest_data.pgdir = cr3;
- lazy_hcall1(LHCALL_NEW_PGTABLE, cr3);
- cr3_changed = true;
-}
-
-static unsigned long lguest_read_cr3(void)
-{
- return lguest_data.pgdir;
-}
-
-/* cr4 is used to enable and disable PGE, but we don't care. */
-static unsigned long lguest_read_cr4(void)
-{
- return 0;
-}
-
-static void lguest_write_cr4(unsigned long val)
-{
-}
-
-/*
- * Page Table Handling.
- *
- * Now would be a good time to take a rest and grab a coffee or similarly
- * relaxing stimulant. The easy parts are behind us, and the trek gradually
- * winds uphill from here.
- *
- * Quick refresher: memory is divided into "pages" of 4096 bytes each. The CPU
- * maps virtual addresses to physical addresses using "page tables". We could
- * use one huge index of 1 million entries: each address is 4 bytes, so that's
- * 1024 pages just to hold the page tables. But since most virtual addresses
- * are unused, we use a two level index which saves space. The cr3 register
- * contains the physical address of the top level "page directory" page, which
- * contains physical addresses of up to 1024 second-level pages. Each of these
- * second level pages contains up to 1024 physical addresses of actual pages,
- * or Page Table Entries (PTEs).
- *
- * Here's a diagram, where arrows indicate physical addresses:
- *
- * cr3 ---> +---------+
- * | --------->+---------+
- * | | | PADDR1 |
- * Mid-level | | PADDR2 |
- * (PMD) page | | |
- * | | Lower-level |
- * | | (PTE) page |
- * | | | |
- * .... ....
- *
- * So to convert a virtual address to a physical address, we look up the top
- * level, which points us to the second level, which gives us the physical
- * address of that page. If the top level entry was not present, or the second
- * level entry was not present, then the virtual address is invalid (we
- * say "the page was not mapped").
- *
- * Put another way, a 32-bit virtual address is divided up like so:
- *
- * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
- * |<---- 10 bits ---->|<---- 10 bits ---->|<------ 12 bits ------>|
- * Index into top Index into second Offset within page
- * page directory page pagetable page
- *
- * Now, unfortunately, this isn't the whole story: Intel added Physical Address
- * Extension (PAE) to allow 32 bit systems to use 64GB of memory (ie. 36 bits).
- * These are held in 64-bit page table entries, so we can now only fit 512
- * entries in a page, and the neat three-level tree breaks down.
- *
- * The result is a four level page table:
- *
- * cr3 --> [ 4 Upper ]
- * [ Level ]
- * [ Entries ]
- * [(PUD Page)]---> +---------+
- * | --------->+---------+
- * | | | PADDR1 |
- * Mid-level | | PADDR2 |
- * (PMD) page | | |
- * | | Lower-level |
- * | | (PTE) page |
- * | | | |
- * .... ....
- *
- *
- * And the virtual address is decoded as:
- *
- * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
- * |<-2->|<--- 9 bits ---->|<---- 9 bits --->|<------ 12 bits ------>|
- * Index into Index into mid Index into lower Offset within page
- * top entries directory page pagetable page
- *
- * It's too hard to switch between these two formats at runtime, so Linux only
- * supports one or the other depending on whether CONFIG_X86_PAE is set. Many
- * distributions turn it on, and not just for people with silly amounts of
- * memory: the larger PTE entries allow room for the NX bit, which lets the
- * kernel disable execution of pages and increase security.
- *
- * This was a problem for lguest, which couldn't run on these distributions;
- * then Matias Zabaljauregui figured it all out and implemented it, and only a
- * handful of puppies were crushed in the process!
- *
- * Back to our point: the kernel spends a lot of time changing both the
- * top-level page directory and lower-level pagetable pages. The Guest doesn't
- * know physical addresses, so while it maintains these page tables exactly
- * like normal, it also needs to keep the Host informed whenever it makes a
- * change: the Host will create the real page tables based on the Guests'.
- */
-
-/*
- * The Guest calls this after it has set a second-level entry (pte), ie. to map
- * a page into a process' address space. We tell the Host the toplevel and
- * address this corresponds to. The Guest uses one pagetable per process, so
- * we need to tell the Host which one we're changing (mm->pgd).
- */
-static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep)
-{
-#ifdef CONFIG_X86_PAE
-	/* PAE needs to hand over a 64-bit page table entry, so it uses two args. */
- lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr,
- ptep->pte_low, ptep->pte_high);
-#else
- lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low);
-#endif
-}
-
-/* This is the "set and update" combo-meal-deal version. */
-static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pteval)
-{
- native_set_pte(ptep, pteval);
- lguest_pte_update(mm, addr, ptep);
-}
-
-/*
- * The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd
- * to set a middle-level entry when PAE is activated.
- *
- * Again, we set the entry then tell the Host which page we changed,
- * and the index of the entry we changed.
- */
-#ifdef CONFIG_X86_PAE
-static void lguest_set_pud(pud_t *pudp, pud_t pudval)
-{
- native_set_pud(pudp, pudval);
-
- /* 32 bytes aligned pdpt address and the index. */
- lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0,
- (__pa(pudp) & 0x1F) / sizeof(pud_t));
-}
-
-static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
-{
- native_set_pmd(pmdp, pmdval);
- lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK,
- (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
-}
-#else
-
-/* The Guest calls lguest_set_pmd to set a top-level entry when !PAE. */
-static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
-{
- native_set_pmd(pmdp, pmdval);
- lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK,
- (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
-}
-#endif
-
-/*
- * There are a couple of legacy places where the kernel sets a PTE, but we
- * don't know the top level any more. This is useless for us, since we don't
- * know which pagetable is changing or what address, so we just tell the Host
- * to forget all of them. Fortunately, this is very rare.
- *
- * ... except in early boot when the kernel sets up the initial pagetables,
- * which makes booting astonishingly slow: 1.83 seconds! So we don't even tell
- * the Host anything changed until we've done the first page table switch,
- * which brings boot back to 0.25 seconds.
- */
-static void lguest_set_pte(pte_t *ptep, pte_t pteval)
-{
- native_set_pte(ptep, pteval);
- if (cr3_changed)
- lazy_hcall1(LHCALL_FLUSH_TLB, 1);
-}
-
-#ifdef CONFIG_X86_PAE
-/*
- * With 64-bit PTE values, we need to be careful setting them: if we set 32
- * bits at a time, the hardware could see a weird half-set entry. These
- * versions ensure we update all 64 bits at once.
- */
-static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte)
-{
- native_set_pte_atomic(ptep, pte);
- if (cr3_changed)
- lazy_hcall1(LHCALL_FLUSH_TLB, 1);
-}
-
-static void lguest_pte_clear(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep)
-{
- native_pte_clear(mm, addr, ptep);
- lguest_pte_update(mm, addr, ptep);
-}
-
-static void lguest_pmd_clear(pmd_t *pmdp)
-{
- lguest_set_pmd(pmdp, __pmd(0));
-}
-#endif
-
-/*
- * Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
- * native page table operations. On native hardware you can set a new page
- * table entry whenever you want, but if you want to remove one you have to do
- * a TLB flush (a TLB is a little cache of page table entries kept by the CPU).
- *
- * So the lguest_set_pte_at() and lguest_set_pmd() functions above are only
- * called when a valid entry is written, not when it's removed (ie. marked not
- * present). Instead, this is where we come when the Guest wants to remove a
- * page table entry: we tell the Host to set that entry to 0 (ie. the present
- * bit is zero).
- */
-static void lguest_flush_tlb_single(unsigned long addr)
-{
- /* Simply set it to zero: if it was not, it will fault back in. */
- lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0);
-}
-
-/*
- * This is what happens after the Guest has removed a large number of entries.
- * This tells the Host that any of the page table entries for userspace might
- * have changed, ie. virtual addresses below PAGE_OFFSET.
- */
-static void lguest_flush_tlb_user(void)
-{
- lazy_hcall1(LHCALL_FLUSH_TLB, 0);
-}
-
-/*
- * This is called when the kernel page tables have changed. That's not very
- * common (unless the Guest is using highmem, which makes the Guest extremely
- * slow), so it's worth separating this from the user flushing above.
- */
-static void lguest_flush_tlb_kernel(void)
-{
- lazy_hcall1(LHCALL_FLUSH_TLB, 1);
-}
-
-/*
- * The Unadvanced Programmable Interrupt Controller.
- *
- * This is an attempt to implement the simplest possible interrupt controller.
- * I spent some time looking through routines like set_irq_chip_and_handler,
- * set_irq_chip_and_handler_name, set_irq_chip_data and set_phasers_to_stun and
- * I *think* this is as simple as it gets.
- *
- * We can tell the Host what interrupts we want blocked ready for using the
- * lguest_data.interrupts bitmap, so disabling (aka "masking") them is as
- * simple as setting a bit. We don't actually "ack" interrupts as such, we
- * just mask and unmask them. I wonder if we should be cleverer?
- */
-static void disable_lguest_irq(unsigned int irq)
-{
- set_bit(irq, lguest_data.blocked_interrupts);
-}
-
-static void enable_lguest_irq(unsigned int irq)
-{
- clear_bit(irq, lguest_data.blocked_interrupts);
-}
-
-/* This structure describes the lguest IRQ controller. */
-static struct irq_chip lguest_irq_controller = {
- .name = "lguest",
- .mask = disable_lguest_irq,
- .mask_ack = disable_lguest_irq,
- .unmask = enable_lguest_irq,
-};
-
-/*
- * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware
- * interrupt (except 128, which is used for system calls), and then tells the
- * Linux infrastructure that each interrupt is controlled by our level-based
- * lguest interrupt controller.
- */
-static void __init lguest_init_IRQ(void)
-{
- unsigned int i;
-
- for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
- /* Some systems map "vectors" to interrupts weirdly. Not us! */
- __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR;
- if (i != SYSCALL_VECTOR)
- set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
- }
-
- /*
- * This call is required to set up for 4k stacks, where we have
- * separate stacks for hard and soft interrupts.
- */
- irq_ctx_init(smp_processor_id());
-}
-
-/*
- * With CONFIG_SPARSE_IRQ, interrupt descriptors are allocated as-needed, so
- * rather than set them in lguest_init_IRQ we are called here every time an
- * lguest device needs an interrupt.
- *
- * FIXME: irq_to_desc_alloc_node() can fail due to lack of memory, we should
- * pass that up!
- */
-void lguest_setup_irq(unsigned int irq)
-{
- irq_to_desc_alloc_node(irq, 0);
- set_irq_chip_and_handler_name(irq, &lguest_irq_controller,
- handle_level_irq, "level");
-}
-
-/*
- * Time.
- *
- * It would be far better for everyone if the Guest had its own clock, but
- * until then the Host gives us the time on every interrupt.
- */
-static unsigned long lguest_get_wallclock(void)
-{
- return lguest_data.time.tv_sec;
-}
-
-/*
- * The TSC is an Intel thing called the Time Stamp Counter. The Host tells us
- * what speed it runs at, or 0 if it's unusable as a reliable clock source.
- * This matches what we want here: if we return 0 from this function, the x86
- * TSC clock will give up and not register itself.
- */
-static unsigned long lguest_tsc_khz(void)
-{
- return lguest_data.tsc_khz;
-}
-
-/*
- * If we can't use the TSC, the kernel falls back to our lower-priority
- * "lguest_clock", where we read the time value given to us by the Host.
- */
-static cycle_t lguest_clock_read(struct clocksource *cs)
-{
- unsigned long sec, nsec;
-
- /*
- * Since the time is in two parts (seconds and nanoseconds), we risk
- * reading it just as it's changing from 99 & 0.999999999 to 100 and 0,
- * and getting 99 and 0. As Linux tends to come apart under the stress
- * of time travel, we must be careful:
- */
- do {
- /* First we read the seconds part. */
- sec = lguest_data.time.tv_sec;
- /*
- * This read memory barrier tells the compiler and the CPU that
- * this can't be reordered: we have to complete the above
- * before going on.
- */
- rmb();
- /* Now we read the nanoseconds part. */
- nsec = lguest_data.time.tv_nsec;
- /* Make sure we've done that. */
- rmb();
- /* Now if the seconds part has changed, try again. */
- } while (unlikely(lguest_data.time.tv_sec != sec));
-
- /* Our lguest clock is in real nanoseconds. */
- return sec*1000000000ULL + nsec;
-}
-
-/* This is the fallback clocksource: lower priority than the TSC clocksource. */
-static struct clocksource lguest_clock = {
- .name = "lguest",
- .rating = 200,
- .read = lguest_clock_read,
- .mask = CLOCKSOURCE_MASK(64),
- .mult = 1 << 22,
- .shift = 22,
- .flags = CLOCK_SOURCE_IS_CONTINUOUS,
-};
-
-/*
- * We also need a "struct clock_event_device": Linux asks us to set it to go
- * off some time in the future. Actually, James Morris figured all this out, I
- * just applied the patch.
- */
-static int lguest_clockevent_set_next_event(unsigned long delta,
- struct clock_event_device *evt)
-{
- /* FIXME: I don't think this can ever happen, but James tells me he had
- * to put this code in. Maybe we should remove it now. Anyone? */
- if (delta < LG_CLOCK_MIN_DELTA) {
- if (printk_ratelimit())
- printk(KERN_DEBUG "%s: small delta %lu ns\n",
- __func__, delta);
- return -ETIME;
- }
-
- /* Please wake us this far in the future. */
- kvm_hypercall1(LHCALL_SET_CLOCKEVENT, delta);
- return 0;
-}
-
-static void lguest_clockevent_set_mode(enum clock_event_mode mode,
- struct clock_event_device *evt)
-{
- switch (mode) {
- case CLOCK_EVT_MODE_UNUSED:
- case CLOCK_EVT_MODE_SHUTDOWN:
- /* A 0 argument shuts the clock down. */
- kvm_hypercall0(LHCALL_SET_CLOCKEVENT);
- break;
- case CLOCK_EVT_MODE_ONESHOT:
- /* This is what we expect. */
- break;
- case CLOCK_EVT_MODE_PERIODIC:
- BUG();
- case CLOCK_EVT_MODE_RESUME:
- break;
- }
-}
-
-/* This describes our primitive timer chip. */
-static struct clock_event_device lguest_clockevent = {
- .name = "lguest",
- .features = CLOCK_EVT_FEAT_ONESHOT,
- .set_next_event = lguest_clockevent_set_next_event,
- .set_mode = lguest_clockevent_set_mode,
- .rating = INT_MAX,
- .mult = 1,
- .shift = 0,
- .min_delta_ns = LG_CLOCK_MIN_DELTA,
- .max_delta_ns = LG_CLOCK_MAX_DELTA,
-};
-
-/*
- * This is the Guest timer interrupt handler (hardware interrupt 0). We just
- * call the clockevent infrastructure and it does whatever needs doing.
- */
-static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
-{
- unsigned long flags;
-
- /* Don't interrupt us while this is running. */
- local_irq_save(flags);
- lguest_clockevent.event_handler(&lguest_clockevent);
- local_irq_restore(flags);
-}
-
-/*
- * At some point in the boot process, we get asked to set up our timing
- * infrastructure. The kernel doesn't expect timer interrupts before this, but
- * we cleverly initialized the "blocked_interrupts" field of "struct
- * lguest_data" so that timer interrupts were blocked until now.
- */
-static void lguest_time_init(void)
-{
- /* Set up the timer interrupt (0) to go to our simple timer routine */
- set_irq_handler(0, lguest_time_irq);
-
- clocksource_register(&lguest_clock);
-
- /* We can't set cpumask in the initializer: damn C limitations! Set it
- * here and register our timer device. */
- lguest_clockevent.cpumask = cpumask_of(0);
- clockevents_register_device(&lguest_clockevent);
-
- /* Finally, we unblock the timer interrupt. */
- enable_lguest_irq(0);
-}
-
-/*
- * Miscellaneous bits and pieces.
- *
- * Here is an oddball collection of functions which the Guest needs for things
- * to work. They're pretty simple.
- */
-
-/*
- * The Guest needs to tell the Host what stack it expects traps to use. For
- * native hardware, this is part of the Task State Segment mentioned above in
- * lguest_load_tr_desc(), but to help hypervisors there's this special call.
- *
- * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data
- * segment), the privilege level (we're privilege level 1, the Host is 0 and
- * will not tolerate us trying to use that), the stack pointer, and the number
- * of pages in the stack.
- */
-static void lguest_load_sp0(struct tss_struct *tss,
- struct thread_struct *thread)
-{
- lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0,
- THREAD_SIZE / PAGE_SIZE);
-}
-
-/* Let's just say, I wouldn't do debugging under a Guest. */
-static void lguest_set_debugreg(int regno, unsigned long value)
-{
- /* FIXME: Implement */
-}
-
-/*
- * There are times when the kernel wants to make sure that no memory writes are
- * caught in the cache (that they've all reached real hardware devices). This
- * doesn't matter for the Guest which has virtual hardware.
- *
- * On the Pentium 4 and above, cpuid() indicates that the Cache Line Flush
- * (clflush) instruction is available and the kernel uses that. Otherwise, it
- * uses the older "Write Back and Invalidate Cache" (wbinvd) instruction.
- * Unlike clflush, wbinvd can only be run at privilege level 0. So we can
- * ignore clflush, but replace wbinvd.
- */
-static void lguest_wbinvd(void)
-{
-}
-
-/*
- * If the Guest expects to have an Advanced Programmable Interrupt Controller,
- * we play dumb by ignoring writes and returning 0 for reads. So it's no
- * longer Programmable nor Controlling anything, and I don't think 8 lines of
- * code qualifies for Advanced. It will also never interrupt anything. It
- * does, however, allow us to get through the Linux boot code.
- */
-#ifdef CONFIG_X86_LOCAL_APIC
-static void lguest_apic_write(u32 reg, u32 v)
-{
-}
-
-static u32 lguest_apic_read(u32 reg)
-{
- return 0;
-}
-
-static u64 lguest_apic_icr_read(void)
-{
- return 0;
-}
-
-static void lguest_apic_icr_write(u32 low, u32 id)
-{
- /* Warn to see if there's any stray references */
- WARN_ON(1);
-}
-
-static void lguest_apic_wait_icr_idle(void)
-{
- return;
-}
-
-static u32 lguest_apic_safe_wait_icr_idle(void)
-{
- return 0;
-}
-
-static void set_lguest_basic_apic_ops(void)
-{
- apic->read = lguest_apic_read;
- apic->write = lguest_apic_write;
- apic->icr_read = lguest_apic_icr_read;
- apic->icr_write = lguest_apic_icr_write;
- apic->wait_icr_idle = lguest_apic_wait_icr_idle;
- apic->safe_wait_icr_idle = lguest_apic_safe_wait_icr_idle;
-};
-#endif
-
-/* STOP! Until an interrupt comes in. */
-static void lguest_safe_halt(void)
-{
- kvm_hypercall0(LHCALL_HALT);
-}
-
-/*
- * The SHUTDOWN hypercall takes a string to describe what's happening, and
- * an argument which says whether this to restart (reboot) the Guest or not.
- *
- * Note that the Host always prefers that the Guest speak in physical addresses
- * rather than virtual addresses, so we use __pa() here.
- */
-static void lguest_power_off(void)
-{
- kvm_hypercall2(LHCALL_SHUTDOWN, __pa("Power down"),
- LGUEST_SHUTDOWN_POWEROFF);
-}
-
-/*
- * Panicing.
- *
- * Don't. But if you did, this is what happens.
- */
-static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p)
-{
- kvm_hypercall2(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF);
- /* The hcall won't return, but to keep gcc happy, we're "done". */
- return NOTIFY_DONE;
-}
-
-static struct notifier_block paniced = {
- .notifier_call = lguest_panic
-};
-
-/* Setting up memory is fairly easy. */
-static __init char *lguest_memory_setup(void)
-{
- /* We do this here and not earlier because lockcheck used to barf if we
- * did it before start_kernel(). I think we fixed that, so it'd be
- * nice to move it back to lguest_init. Patch welcome... */
- atomic_notifier_chain_register(&panic_notifier_list, &paniced);
-
- /*
- * The Linux bootloader header contains an "e820" memory map: the
- * Launcher populated the first entry with our memory limit.
- */
- e820_add_region(boot_params.e820_map[0].addr,
- boot_params.e820_map[0].size,
- boot_params.e820_map[0].type);
-
- /* This string is for the boot messages. */
- return "LGUEST";
-}
-
-/*
- * We will eventually use the virtio console device to produce console output,
- * but before that is set up we use LHCALL_NOTIFY on normal memory to produce
- * console output.
- */
-static __init int early_put_chars(u32 vtermno, const char *buf, int count)
-{
- char scratch[17];
- unsigned int len = count;
-
- /* We use a nul-terminated string, so we make a copy. Icky, huh? */
- if (len > sizeof(scratch) - 1)
- len = sizeof(scratch) - 1;
- scratch[len] = '\0';
- memcpy(scratch, buf, len);
- kvm_hypercall1(LHCALL_NOTIFY, __pa(scratch));
-
- /* This routine returns the number of bytes actually written. */
- return len;
-}
-
-/*
- * Rebooting also tells the Host we're finished, but the RESTART flag tells the
- * Launcher to reboot us.
- */
-static void lguest_restart(char *reason)
-{
- kvm_hypercall2(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART);
-}
-
-/*G:050
- * Patching (Powerfully Placating Performance Pedants)
- *
- * We have already seen that pv_ops structures let us replace simple native
- * instructions with calls to the appropriate back end all throughout the
- * kernel. This allows the same kernel to run as a Guest and as a native
- * kernel, but it's slow because of all the indirect branches.
- *
- * Remember that David Wheeler quote about "Any problem in computer science can
- * be solved with another layer of indirection"? The rest of that quote is
- * "... But that usually will create another problem." This is the first of
- * those problems.
- *
- * Our current solution is to allow the paravirt back end to optionally patch
- * over the indirect calls to replace them with something more efficient. We
- * patch two of the simplest of the most commonly called functions: disable
- * interrupts and save interrupts. We usually have 6 or 10 bytes to patch
- * into: the Guest versions of these operations are small enough that we can
- * fit comfortably.
- *
- * First we need assembly templates of each of the patchable Guest operations,
- * and these are in i386_head.S.
- */
-
-/*G:060 We construct a table from the assembler templates: */
-static const struct lguest_insns
-{
- const char *start, *end;
-} lguest_insns[] = {
- [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli },
- [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf },
-};
-
-/*
- * Now our patch routine is fairly simple (based on the native one in
- * paravirt.c). If we have a replacement, we copy it in and return how much of
- * the available space we used.
- */
-static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
- unsigned long addr, unsigned len)
-{
- unsigned int insn_len;
-
- /* Don't do anything special if we don't have a replacement */
- if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start)
- return paravirt_patch_default(type, clobber, ibuf, addr, len);
-
- insn_len = lguest_insns[type].end - lguest_insns[type].start;
-
- /* Similarly if it can't fit (doesn't happen, but let's be thorough). */
- if (len < insn_len)
- return paravirt_patch_default(type, clobber, ibuf, addr, len);
-
- /* Copy in our instructions. */
- memcpy(ibuf, lguest_insns[type].start, insn_len);
- return insn_len;
-}
-
-/*G:029
- * Once we get to lguest_init(), we know we're a Guest. The various
- * pv_ops structures in the kernel provide points for (almost) every routine we
- * have to override to avoid privileged instructions.
- */
-__init void lguest_init(void)
-{
- /* We're under lguest. */
- pv_info.name = "lguest";
- /* Paravirt is enabled. */
- pv_info.paravirt_enabled = 1;
- /* We're running at privilege level 1, not 0 as normal. */
- pv_info.kernel_rpl = 1;
- /* Everyone except Xen runs with this set. */
- pv_info.shared_kernel_pmd = 1;
-
- /*
- * We set up all the lguest overrides for sensitive operations. These
- * are detailed with the operations themselves.
- */
-
- /* Interrupt-related operations */
- pv_irq_ops.init_IRQ = lguest_init_IRQ;
- pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl);
- pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl);
- pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable);
- pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable);
- pv_irq_ops.safe_halt = lguest_safe_halt;
-
- /* Setup operations */
- pv_init_ops.memory_setup = lguest_memory_setup;
- pv_init_ops.patch = lguest_patch;
-
- /* Intercepts of various CPU instructions */
- pv_cpu_ops.load_gdt = lguest_load_gdt;
- pv_cpu_ops.cpuid = lguest_cpuid;
- pv_cpu_ops.load_idt = lguest_load_idt;
- pv_cpu_ops.iret = lguest_iret;
- pv_cpu_ops.load_sp0 = lguest_load_sp0;
- pv_cpu_ops.load_tr_desc = lguest_load_tr_desc;
- pv_cpu_ops.set_ldt = lguest_set_ldt;
- pv_cpu_ops.load_tls = lguest_load_tls;
- pv_cpu_ops.set_debugreg = lguest_set_debugreg;
- pv_cpu_ops.clts = lguest_clts;
- pv_cpu_ops.read_cr0 = lguest_read_cr0;
- pv_cpu_ops.write_cr0 = lguest_write_cr0;
- pv_cpu_ops.read_cr4 = lguest_read_cr4;
- pv_cpu_ops.write_cr4 = lguest_write_cr4;
- pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry;
- pv_cpu_ops.write_idt_entry = lguest_write_idt_entry;
- pv_cpu_ops.wbinvd = lguest_wbinvd;
- pv_cpu_ops.start_context_switch = paravirt_start_context_switch;
- pv_cpu_ops.end_context_switch = lguest_end_context_switch;
-
- /* Pagetable management */
- pv_mmu_ops.write_cr3 = lguest_write_cr3;
- pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user;
- pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single;
- pv_mmu_ops.flush_tlb_kernel = lguest_flush_tlb_kernel;
- pv_mmu_ops.set_pte = lguest_set_pte;
- pv_mmu_ops.set_pte_at = lguest_set_pte_at;
- pv_mmu_ops.set_pmd = lguest_set_pmd;
-#ifdef CONFIG_X86_PAE
- pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic;
- pv_mmu_ops.pte_clear = lguest_pte_clear;
- pv_mmu_ops.pmd_clear = lguest_pmd_clear;
- pv_mmu_ops.set_pud = lguest_set_pud;
-#endif
- pv_mmu_ops.read_cr2 = lguest_read_cr2;
- pv_mmu_ops.read_cr3 = lguest_read_cr3;
- pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
- pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode;
- pv_mmu_ops.pte_update = lguest_pte_update;
- pv_mmu_ops.pte_update_defer = lguest_pte_update;
-
-#ifdef CONFIG_X86_LOCAL_APIC
- /* APIC read/write intercepts */
- set_lguest_basic_apic_ops();
-#endif
-
- /* Time operations */
- pv_time_ops.get_wallclock = lguest_get_wallclock;
- pv_time_ops.time_init = lguest_time_init;
- pv_time_ops.get_tsc_khz = lguest_tsc_khz;
-
- /*
- * Now is a good time to look at the implementations of these functions
- * before returning to the rest of lguest_init().
- */
-
- /*G:070
- * Now we've seen all the paravirt_ops, we return to
- * lguest_init() where the rest of the fairly chaotic boot setup
- * occurs.
- */
-
- /*
- * The stack protector is a weird thing where gcc places a canary
- * value on the stack and then checks it on return. This file is
- * compiled with -fno-stack-protector, so we got this far without
- * problems. The value of the canary is kept at offset 20 from the
- * %gs register, so we need to set that up before calling C functions
- * in other files.
- */
- setup_stack_canary_segment(0);
-
- /*
- * We could just call load_stack_canary_segment(), but we might as well
- * call switch_to_new_gdt() which loads the whole table and sets up the
- * per-cpu segment descriptor register %fs as well.
- */
- switch_to_new_gdt(0);
-
- /* We actually boot with all memory mapped, but let's say 128MB. */
- max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
-
- /*
- * The Host<->Guest Switcher lives at the top of our address space, and
- * the Host told us how big it is when we made LGUEST_INIT hypercall:
- * it put the answer in lguest_data.reserve_mem
- */
- reserve_top_address(lguest_data.reserve_mem);
-
- /*
- * If we don't initialize the lock dependency checker now, it crashes
- * paravirt_disable_iospace.
- */
- lockdep_init();
-
- /*
- * The IDE code spends about 3 seconds probing for disks: if we reserve
- * all the I/O ports up front it can't get them and so doesn't probe.
- * Other device drivers are similar (but less severe). This cuts the
- * kernel boot time on my machine from 4.1 seconds to 0.45 seconds.
- */
- paravirt_disable_iospace();
-
- /*
- * This is messy CPU setup stuff which the native boot code does before
- * start_kernel, so we have to do, too:
- */
- cpu_detect(&new_cpu_data);
- /* head.S usually sets up the first capability word, so do it here. */
- new_cpu_data.x86_capability[0] = cpuid_edx(1);
-
- /* Math is always hard! */
- new_cpu_data.hard_math = 1;
-
- /* We don't have features. We have puppies! Puppies! */
-#ifdef CONFIG_X86_MCE
- mce_disabled = 1;
-#endif
-#ifdef CONFIG_ACPI
- acpi_disabled = 1;
- acpi_ht = 0;
-#endif
-
- /*
- * We set the preferred console to "hvc". This is the "hypervisor
- * virtual console" driver written by the PowerPC people, which we also
- * adapted for lguest's use.
- */
- add_preferred_console("hvc", 0, NULL);
-
- /* Register our very early console. */
- virtio_cons_early_init(early_put_chars);
-
- /*
- * Last of all, we set the power management poweroff hook to point to
- * the Guest routine to power off, and the reboot hook to our restart
- * routine.
- */
- pm_power_off = lguest_power_off;
- machine_ops.restart = lguest_restart;
-
- /*
- * Now we're set up, call i386_start_kernel() in head32.c and we proceed
- * to boot as normal. It never returns.
- */
- i386_start_kernel();
-}
-/*
- * This marks the end of stage II of our journey, The Guest.
- *
- * It is now time for us to explore the layer of virtual drivers and complete
- * our understanding of the Guest in "make Drivers".
- */
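The removed lguest_init() above follows the standard paravirt pattern: before generic boot continues, the guest overwrites selected entries in the pv_*_ops function-pointer tables with its own hypercall-backed handlers. A minimal, self-contained sketch of that pattern (hypothetical handlers and a trimmed-down ops structure, not the kernel's actual pv_ops layout):

  /* Illustrative only: a guest swapping in its own MMU callbacks. */
  struct mmu_ops {
          void (*flush_tlb_user)(void);
          void (*set_pte)(unsigned long *ptep, unsigned long pteval);
  };

  static void native_flush_tlb_user(void) { /* e.g. reload %cr3 */ }
  static void native_set_pte(unsigned long *ptep, unsigned long pteval) { *ptep = pteval; }

  static struct mmu_ops mmu_ops = {
          .flush_tlb_user = native_flush_tlb_user,
          .set_pte        = native_set_pte,
  };

  static void guest_flush_tlb_user(void) { /* would issue a hypercall instead */ }
  static void guest_set_pte(unsigned long *ptep, unsigned long pteval)
  {
          *ptep = pteval;         /* keep the shadow copy, then notify the host */
  }

  static void guest_init_sketch(void)
  {
          mmu_ops.flush_tlb_user = guest_flush_tlb_user;
          mmu_ops.set_pte        = guest_set_pte;
  }

Callers always go through the table, so once the pointers are switched every later flush or PTE update takes the guest path.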
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
deleted file mode 100644
index 27eac0faee48..000000000000
--- a/arch/x86/lguest/i386_head.S
+++ /dev/null
@@ -1,191 +0,0 @@
-#include <linux/linkage.h>
-#include <linux/lguest.h>
-#include <asm/lguest_hcall.h>
-#include <asm/asm-offsets.h>
-#include <asm/thread_info.h>
-#include <asm/processor-flags.h>
-
-/*G:020
- * Our story starts with the kernel booting into startup_32 in
- * arch/x86/kernel/head_32.S. It expects a boot header, which is created by
- * the bootloader (the Launcher in our case).
- *
- * The startup_32 function does very little: it clears the uninitialized global
- * C variables which we expect to be zero (ie. BSS) and then copies the boot
- * header and kernel command line somewhere safe. Finally it checks the
- * 'hardware_subarch' field. This was introduced in 2.6.24 for lguest and Xen:
- * if it's set to '1' (lguest's assigned number), then it calls us here.
- *
- * WARNING: be very careful here! We're running at addresses equal to physical
- * addresses (around 0), not above PAGE_OFFSET as most code expects
- * (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any
- * data without remembering to subtract __PAGE_OFFSET!
- *
- * The .section line puts this code in .init.text so it will be discarded after
- * boot.
- */
-.section .init.text, "ax", @progbits
-ENTRY(lguest_entry)
- /*
- * We make the "initialization" hypercall now to tell the Host about
- * us, and also find out where it put our page tables.
- */
- movl $LHCALL_LGUEST_INIT, %eax
- movl $lguest_data - __PAGE_OFFSET, %ebx
- .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
-
- /* Set up the initial stack so we can run C code. */
- movl $(init_thread_union+THREAD_SIZE),%esp
-
- /* Jumps are relative: we're running __PAGE_OFFSET too low. */
- jmp lguest_init+__PAGE_OFFSET
-
-/*G:055
- * We create a macro which puts the assembler code between lgstart_ and lgend_
- * markers. These templates are put in the .text section: they can't be
- * discarded after boot as we may need to patch modules, too.
- */
-.text
-#define LGUEST_PATCH(name, insns...) \
- lgstart_##name: insns; lgend_##name:; \
- .globl lgstart_##name; .globl lgend_##name
-
-LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
-LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
-
-/*G:033
- * But using those wrappers is inefficient (we'll see why that doesn't matter
- * for save_fl and irq_disable later). If we write our routines carefully in
- * assembler, we can avoid clobbering any registers and avoid jumping through
- * the wrapper functions.
- *
- * I skipped over our first piece of assembler, but this one is worth studying
- * in a bit more detail so I'll describe in easy stages. First, the routine to
- * enable interrupts:
- */
-ENTRY(lg_irq_enable)
- /*
- * The reverse of irq_disable, this sets lguest_data.irq_enabled to
- * X86_EFLAGS_IF (ie. "Interrupts enabled").
- */
- movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled
- /*
- * But now we need to check if the Host wants to know: there might have
- * been interrupts waiting to be delivered, in which case it will have
- * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we
- * jump to send_interrupts, otherwise we're done.
- */
- testl $0, lguest_data+LGUEST_DATA_irq_pending
- jnz send_interrupts
- /*
- * One cool thing about x86 is that you can do many things without using
- * a register. In this case, the normal path hasn't needed to save or
- * restore any registers at all!
- */
- ret
-send_interrupts:
- /*
- * OK, now we need a register: eax is used for the hypercall number,
- * which is LHCALL_SEND_INTERRUPTS.
- *
- * We used not to bother with this pending detection at all, which was
- * much simpler. Sooner or later the Host would realize it had to
- * send us an interrupt. But that turns out to make performance 7
- * times worse on a simple tcp benchmark. So now we do this the hard
- * way.
- */
- pushl %eax
- movl $LHCALL_SEND_INTERRUPTS, %eax
- /*
- * This is a vmcall instruction (same thing that KVM uses). Older
- * assembler versions might not know the "vmcall" instruction, so we
- * create one manually here.
- */
- .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
- /* Put eax back the way we found it. */
- popl %eax
- ret
-
-/*
- * Finally, the "popf" or "restore flags" routine. The %eax register holds the
- * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're
- * enabling interrupts again, if it's 0 we're leaving them off.
- */
-ENTRY(lg_restore_fl)
- /* This is just "lguest_data.irq_enabled = flags;" */
- movl %eax, lguest_data+LGUEST_DATA_irq_enabled
- /*
- * Now, if the %eax value has enabled interrupts and
- * lguest_data.irq_pending is set, we want to tell the Host so it can
- * deliver any outstanding interrupts. Fortunately, both values will
- * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl"
- * instruction will AND them together for us. If both are set, we
- * jump to send_interrupts.
- */
- testl lguest_data+LGUEST_DATA_irq_pending, %eax
- jnz send_interrupts
- /* Again, the normal path has used no extra registers. Clever, huh? */
- ret
-/*:*/
-
-/* These demarcate the EIP range where the Host should never deliver interrupts. */
-.global lguest_noirq_start
-.global lguest_noirq_end
-
-/*M:004
- * When the Host reflects a trap or injects an interrupt into the Guest, it
- * sets the eflags interrupt bit on the stack based on lguest_data.irq_enabled,
- * so the Guest iret logic does the right thing when restoring it. However,
- * when the Host sets the Guest up for direct traps, such as system calls, the
- * processor is the one to push eflags onto the stack, and the interrupt bit
- * will be 1 (in reality, interrupts are always enabled in the Guest).
- *
- * This turns out to be harmless: the only trap which should happen under Linux
- * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc
- * regions), which has to be reflected through the Host anyway. If another
- * trap *does* go off when interrupts are disabled, the Guest will panic, and
- * we'll never get to this iret!
-:*/
-
-/*G:045
- * There is one final paravirt_op that the Guest implements, and glancing at it
- * you can see why I left it to last. It's *cool*! It's in *assembler*!
- *
- * The "iret" instruction is used to return from an interrupt or trap. The
- * stack looks like this:
- * old address
- * old code segment & privilege level
- * old processor flags ("eflags")
- *
- * The "iret" instruction pops those values off the stack and restores them all
- * at once. The only problem is that eflags includes the Interrupt Flag which
- * the Guest can't change: the CPU will simply ignore it when we do an "iret".
- * So we have to copy eflags from the stack to lguest_data.irq_enabled before
- * we do the "iret".
- *
- * There are two problems with this: firstly, we need to use a register to do
- * the copy and secondly, the whole thing needs to be atomic. The first
- * problem is easy to solve: push %eax on the stack so we can use it, and then
- * restore it at the end just before the real "iret".
- *
- * The second is harder: copying eflags to lguest_data.irq_enabled will turn
- * interrupts on before we're finished, so we could be interrupted before we
- * return to userspace or wherever. Our solution to this is to surround the
- * code with lguest_noirq_start: and lguest_noirq_end: labels. We tell the
- * Host that it is *never* to interrupt us there, even if interrupts seem to be
- * enabled.
- */
-ENTRY(lguest_iret)
- pushl %eax
- movl 12(%esp), %eax
-lguest_noirq_start:
- /*
- * Note the %ss: segment prefix here. Normal data accesses use the
- * "ds" segment, but that will have already been restored for whatever
- * we're returning to (such as userspace): we can't trust it. The %ss:
- * prefix makes sure we use the stack segment, which is still valid.
- */
- movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled
- popl %eax
- iret
-lguest_noirq_end:
diff --git a/arch/x86/lib/.gitignore b/arch/x86/lib/.gitignore
new file mode 100644
index 000000000000..ec2131c9fd20
--- /dev/null
+++ b/arch/x86/lib/.gitignore
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+# This now-removed directory used to contain generated files.
+/crypto/
+
+inat-tables.c
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 07c31899c9c2..2dba7f83ef97 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -1,25 +1,64 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for x86 specific library files.
#
-obj-$(CONFIG_SMP) := msr.o
+# Produces uninteresting flaky coverage.
+KCOV_INSTRUMENT_delay.o := n
-lib-y := delay.o
-lib-y += thunk_$(BITS).o
-lib-y += usercopy_$(BITS).o getuser.o putuser.o
+# KCSAN uses udelay for introducing watchpoint delay; avoid recursion.
+KCSAN_SANITIZE_delay.o := n
+ifdef CONFIG_KCSAN
+# In case KCSAN+lockdep+ftrace are enabled, disable ftrace for delay.o to avoid
+# lockdep -> [other libs] -> KCSAN -> udelay -> ftrace -> lockdep recursion.
+CFLAGS_REMOVE_delay.o = $(CC_FLAGS_FTRACE)
+endif
+
+inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk
+inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt
+quiet_cmd_inat_tables = GEN $@
+ cmd_inat_tables = $(AWK) -f $(inat_tables_script) $(inat_tables_maps) > $@
+
+$(obj)/inat-tables.c: $(inat_tables_script) $(inat_tables_maps)
+ $(call cmd,inat_tables)
+
+$(obj)/inat.o: $(obj)/inat-tables.c
+
+clean-files := inat-tables.c
+
+obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o
+
+lib-y := delay.o misc.o cmdline.o cpu.o
+lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o
lib-y += memcpy_$(BITS).o
+lib-y += pc-conf-reg.o
+lib-$(CONFIG_ARCH_HAS_COPY_MC) += copy_mc.o copy_mc_64.o
+lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o insn-eval.o
+lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
+lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
+lib-$(CONFIG_MITIGATION_RETPOLINE) += retpoline.o
+
+obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
+obj-y += iomem.o
ifeq ($(CONFIG_X86_32),y)
obj-y += atomic64_32.o
+ lib-y += atomic64_cx8_32.o
lib-y += checksum_32.o
lib-y += strstr_32.o
- lib-y += semaphore_32.o string_32.o
-
- lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
+ lib-y += string_32.o
+ lib-y += memmove_32.o
+ lib-y += cmpxchg8b_emu.o
+ifneq ($(CONFIG_X86_CX8),y)
+ lib-y += atomic64_386_32.o
+endif
else
- obj-y += io_64.o iomap_copy_64.o
+ifneq ($(CONFIG_GENERIC_CSUM),y)
lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
- lib-y += thunk_64.o clear_page_64.o copy_page_64.o
+endif
+ lib-y += clear_page_64.o copy_page_64.o
lib-y += memmove_64.o memset_64.o
- lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o
+ lib-y += copy_user_64.o copy_user_uncached_64.o
+ lib-y += cmpxchg16b_emu.o
+ lib-y += bhi.o
endif
diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c
index 824fa0be55a3..a0b4a350daa7 100644
--- a/arch/x86/lib/atomic64_32.c
+++ b/arch/x86/lib/atomic64_32.c
@@ -1,230 +1,4 @@
-#include <linux/compiler.h>
-#include <linux/module.h>
-#include <linux/types.h>
+#define ATOMIC64_EXPORT EXPORT_SYMBOL
-#include <asm/processor.h>
-#include <asm/cmpxchg.h>
-#include <asm/atomic.h>
-
-static noinline u64 cmpxchg8b(u64 *ptr, u64 old, u64 new)
-{
- u32 low = new;
- u32 high = new >> 32;
-
- asm volatile(
- LOCK_PREFIX "cmpxchg8b %1\n"
- : "+A" (old), "+m" (*ptr)
- : "b" (low), "c" (high)
- );
- return old;
-}
-
-u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val)
-{
- return cmpxchg8b(&ptr->counter, old_val, new_val);
-}
-EXPORT_SYMBOL(atomic64_cmpxchg);
-
-/**
- * atomic64_xchg - xchg atomic64 variable
- * @ptr: pointer to type atomic64_t
- * @new_val: value to assign
- *
- * Atomically xchgs the value of @ptr to @new_val and returns
- * the old value.
- */
-u64 atomic64_xchg(atomic64_t *ptr, u64 new_val)
-{
- /*
- * Try first with a (possibly incorrect) assumption about
- * what we have there. We'll do two loops most likely,
- * but we'll get an ownership MESI transaction straight away
- * instead of a read transaction followed by a
- * flush-for-ownership transaction:
- */
- u64 old_val, real_val = 0;
-
- do {
- old_val = real_val;
-
- real_val = atomic64_cmpxchg(ptr, old_val, new_val);
-
- } while (real_val != old_val);
-
- return old_val;
-}
-EXPORT_SYMBOL(atomic64_xchg);
-
-/**
- * atomic64_set - set atomic64 variable
- * @ptr: pointer to type atomic64_t
- * @new_val: value to assign
- *
- * Atomically sets the value of @ptr to @new_val.
- */
-void atomic64_set(atomic64_t *ptr, u64 new_val)
-{
- atomic64_xchg(ptr, new_val);
-}
-EXPORT_SYMBOL(atomic64_set);
-
-/**
-EXPORT_SYMBOL(atomic64_read);
- * atomic64_add_return - add and return
- * @delta: integer value to add
- * @ptr: pointer to type atomic64_t
- *
- * Atomically adds @delta to @ptr and returns @delta + *@ptr
- */
-noinline u64 atomic64_add_return(u64 delta, atomic64_t *ptr)
-{
- /*
- * Try first with a (possibly incorrect) assumption about
- * what we have there. We'll do two loops most likely,
- * but we'll get an ownership MESI transaction straight away
- * instead of a read transaction followed by a
- * flush-for-ownership transaction:
- */
- u64 old_val, new_val, real_val = 0;
-
- do {
- old_val = real_val;
- new_val = old_val + delta;
-
- real_val = atomic64_cmpxchg(ptr, old_val, new_val);
-
- } while (real_val != old_val);
-
- return new_val;
-}
-EXPORT_SYMBOL(atomic64_add_return);
-
-u64 atomic64_sub_return(u64 delta, atomic64_t *ptr)
-{
- return atomic64_add_return(-delta, ptr);
-}
-EXPORT_SYMBOL(atomic64_sub_return);
-
-u64 atomic64_inc_return(atomic64_t *ptr)
-{
- return atomic64_add_return(1, ptr);
-}
-EXPORT_SYMBOL(atomic64_inc_return);
-
-u64 atomic64_dec_return(atomic64_t *ptr)
-{
- return atomic64_sub_return(1, ptr);
-}
-EXPORT_SYMBOL(atomic64_dec_return);
-
-/**
- * atomic64_add - add integer to atomic64 variable
- * @delta: integer value to add
- * @ptr: pointer to type atomic64_t
- *
- * Atomically adds @delta to @ptr.
- */
-void atomic64_add(u64 delta, atomic64_t *ptr)
-{
- atomic64_add_return(delta, ptr);
-}
-EXPORT_SYMBOL(atomic64_add);
-
-/**
- * atomic64_sub - subtract the atomic64 variable
- * @delta: integer value to subtract
- * @ptr: pointer to type atomic64_t
- *
- * Atomically subtracts @delta from @ptr.
- */
-void atomic64_sub(u64 delta, atomic64_t *ptr)
-{
- atomic64_add(-delta, ptr);
-}
-EXPORT_SYMBOL(atomic64_sub);
-
-/**
- * atomic64_sub_and_test - subtract value from variable and test result
- * @delta: integer value to subtract
- * @ptr: pointer to type atomic64_t
- *
- * Atomically subtracts @delta from @ptr and returns
- * true if the result is zero, or false for all
- * other cases.
- */
-int atomic64_sub_and_test(u64 delta, atomic64_t *ptr)
-{
- u64 new_val = atomic64_sub_return(delta, ptr);
-
- return new_val == 0;
-}
-EXPORT_SYMBOL(atomic64_sub_and_test);
-
-/**
- * atomic64_inc - increment atomic64 variable
- * @ptr: pointer to type atomic64_t
- *
- * Atomically increments @ptr by 1.
- */
-void atomic64_inc(atomic64_t *ptr)
-{
- atomic64_add(1, ptr);
-}
-EXPORT_SYMBOL(atomic64_inc);
-
-/**
- * atomic64_dec - decrement atomic64 variable
- * @ptr: pointer to type atomic64_t
- *
- * Atomically decrements @ptr by 1.
- */
-void atomic64_dec(atomic64_t *ptr)
-{
- atomic64_sub(1, ptr);
-}
-EXPORT_SYMBOL(atomic64_dec);
-
-/**
- * atomic64_dec_and_test - decrement and test
- * @ptr: pointer to type atomic64_t
- *
- * Atomically decrements @ptr by 1 and
- * returns true if the result is 0, or false for all other
- * cases.
- */
-int atomic64_dec_and_test(atomic64_t *ptr)
-{
- return atomic64_sub_and_test(1, ptr);
-}
-EXPORT_SYMBOL(atomic64_dec_and_test);
-
-/**
- * atomic64_inc_and_test - increment and test
- * @ptr: pointer to type atomic64_t
- *
- * Atomically increments @ptr by 1
- * and returns true if the result is zero, or false for all
- * other cases.
- */
-int atomic64_inc_and_test(atomic64_t *ptr)
-{
- return atomic64_sub_and_test(-1, ptr);
-}
-EXPORT_SYMBOL(atomic64_inc_and_test);
-
-/**
- * atomic64_add_negative - add and test if negative
- * @delta: integer value to add
- * @ptr: pointer to type atomic64_t
- *
- * Atomically adds @delta to @ptr and returns true
- * if the result is negative, or false when
- * result is greater than or equal to zero.
- */
-int atomic64_add_negative(u64 delta, atomic64_t *ptr)
-{
- s64 new_val = atomic64_add_return(delta, ptr);
-
- return new_val < 0;
-}
-EXPORT_SYMBOL(atomic64_add_negative);
+#include <linux/export.h>
+#include <linux/atomic.h>
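The removed atomic64_xchg()/atomic64_add_return() above share one retry loop built on cmpxchg8b, seeded with a guessed old value so that even the very first access is a request-for-ownership rather than a plain read. A condensed C sketch of that loop (cmpxchg64() is an assumed helper standing in for the cmpxchg8b wrapper, not a real kernel symbol):

  #include <stdint.h>

  /* Assumed primitive: if *p == old, store new; always return the value
   * that was actually observed in *p (what cmpxchg8b reports in edx:eax). */
  extern uint64_t cmpxchg64(uint64_t *p, uint64_t old, uint64_t new);

  static uint64_t add_return64(uint64_t *p, uint64_t delta)
  {
          uint64_t seen = 0, old, new;    /* deliberately guessed start value */

          do {
                  old  = seen;
                  new  = old + delta;
                  seen = cmpxchg64(p, old, new);  /* retries are rare */
          } while (seen != old);

          return new;
  }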
diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S
new file mode 100644
index 000000000000..e768815e58ae
--- /dev/null
+++ b/arch/x86/lib/atomic64_386_32.S
@@ -0,0 +1,195 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * atomic64_t for 386/486
+ *
+ * Copyright © 2010 Luca Barbieri
+ */
+
+#include <linux/linkage.h>
+#include <asm/alternative.h>
+
+/* if you want SMP support, implement these with real spinlocks */
+.macro IRQ_SAVE reg
+ pushfl
+ cli
+.endm
+
+.macro IRQ_RESTORE reg
+ popfl
+.endm
+
+#define BEGIN_IRQ_SAVE(op) \
+.macro endp; \
+SYM_FUNC_END(atomic64_##op##_386); \
+.purgem endp; \
+.endm; \
+SYM_FUNC_START(atomic64_##op##_386); \
+ IRQ_SAVE v;
+
+#define ENDP endp
+
+#define RET_IRQ_RESTORE \
+ IRQ_RESTORE v; \
+ RET
+
+#define v %ecx
+BEGIN_IRQ_SAVE(read)
+ movl (v), %eax
+ movl 4(v), %edx
+ RET_IRQ_RESTORE
+ENDP
+#undef v
+
+#define v %esi
+BEGIN_IRQ_SAVE(set)
+ movl %ebx, (v)
+ movl %ecx, 4(v)
+ RET_IRQ_RESTORE
+ENDP
+#undef v
+
+#define v %esi
+BEGIN_IRQ_SAVE(xchg)
+ movl (v), %eax
+ movl 4(v), %edx
+ movl %ebx, (v)
+ movl %ecx, 4(v)
+ RET_IRQ_RESTORE
+ENDP
+#undef v
+
+#define v %ecx
+BEGIN_IRQ_SAVE(add)
+ addl %eax, (v)
+ adcl %edx, 4(v)
+ RET_IRQ_RESTORE
+ENDP
+#undef v
+
+#define v %ecx
+BEGIN_IRQ_SAVE(add_return)
+ addl (v), %eax
+ adcl 4(v), %edx
+ movl %eax, (v)
+ movl %edx, 4(v)
+ RET_IRQ_RESTORE
+ENDP
+#undef v
+
+#define v %ecx
+BEGIN_IRQ_SAVE(sub)
+ subl %eax, (v)
+ sbbl %edx, 4(v)
+ RET_IRQ_RESTORE
+ENDP
+#undef v
+
+#define v %ecx
+BEGIN_IRQ_SAVE(sub_return)
+ negl %edx
+ negl %eax
+ sbbl $0, %edx
+ addl (v), %eax
+ adcl 4(v), %edx
+ movl %eax, (v)
+ movl %edx, 4(v)
+ RET_IRQ_RESTORE
+ENDP
+#undef v
+
+#define v %esi
+BEGIN_IRQ_SAVE(inc)
+ addl $1, (v)
+ adcl $0, 4(v)
+ RET_IRQ_RESTORE
+ENDP
+#undef v
+
+#define v %esi
+BEGIN_IRQ_SAVE(inc_return)
+ movl (v), %eax
+ movl 4(v), %edx
+ addl $1, %eax
+ adcl $0, %edx
+ movl %eax, (v)
+ movl %edx, 4(v)
+ RET_IRQ_RESTORE
+ENDP
+#undef v
+
+#define v %esi
+BEGIN_IRQ_SAVE(dec)
+ subl $1, (v)
+ sbbl $0, 4(v)
+ RET_IRQ_RESTORE
+ENDP
+#undef v
+
+#define v %esi
+BEGIN_IRQ_SAVE(dec_return)
+ movl (v), %eax
+ movl 4(v), %edx
+ subl $1, %eax
+ sbbl $0, %edx
+ movl %eax, (v)
+ movl %edx, 4(v)
+ RET_IRQ_RESTORE
+ENDP
+#undef v
+
+#define v %esi
+BEGIN_IRQ_SAVE(add_unless)
+ addl %eax, %ecx
+ adcl %edx, %edi
+ addl (v), %eax
+ adcl 4(v), %edx
+ cmpl %eax, %ecx
+ je 3f
+1:
+ movl %eax, (v)
+ movl %edx, 4(v)
+ movl $1, %eax
+2:
+ RET_IRQ_RESTORE
+3:
+ cmpl %edx, %edi
+ jne 1b
+ xorl %eax, %eax
+ jmp 2b
+ENDP
+#undef v
+
+#define v %esi
+BEGIN_IRQ_SAVE(inc_not_zero)
+ movl (v), %eax
+ movl 4(v), %edx
+ testl %eax, %eax
+ je 3f
+1:
+ addl $1, %eax
+ adcl $0, %edx
+ movl %eax, (v)
+ movl %edx, 4(v)
+ movl $1, %eax
+2:
+ RET_IRQ_RESTORE
+3:
+ testl %edx, %edx
+ jne 1b
+ jmp 2b
+ENDP
+#undef v
+
+#define v %esi
+BEGIN_IRQ_SAVE(dec_if_positive)
+ movl (v), %eax
+ movl 4(v), %edx
+ subl $1, %eax
+ sbbl $0, %edx
+ js 1f
+ movl %eax, (v)
+ movl %edx, 4(v)
+1:
+ RET_IRQ_RESTORE
+ENDP
+#undef v
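Since a 386/486 has no cmpxchg8b, every routine above simply performs the 64-bit read-modify-write with interrupts disabled, which is all the IRQ_SAVE/IRQ_RESTORE macros do. The C equivalent of one of them, as a sketch (kernel context assumed; correct only on uniprocessor, exactly like the assembly):

  #include <linux/irqflags.h>

  static unsigned long long atomic64_add_return_up(unsigned long long delta,
                                                   unsigned long long *v)
  {
          unsigned long flags;
          unsigned long long ret;

          local_irq_save(flags);          /* pushfl; cli */
          ret = *v + delta;               /* plain 64-bit RMW, UP-safe only */
          *v  = ret;
          local_irq_restore(flags);       /* popfl */
          return ret;
  }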
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S
new file mode 100644
index 000000000000..b2eff07d65e4
--- /dev/null
+++ b/arch/x86/lib/atomic64_cx8_32.S
@@ -0,0 +1,185 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * atomic64_t for 586+
+ *
+ * Copyright © 2010 Luca Barbieri
+ */
+
+#include <linux/linkage.h>
+#include <asm/alternative.h>
+
+.macro read64 reg
+ movl %ebx, %eax
+ movl %ecx, %edx
+/* we need LOCK_PREFIX since otherwise cmpxchg8b always does the write */
+ LOCK_PREFIX
+ cmpxchg8b (\reg)
+.endm
+
+.macro read64_nonatomic reg
+ movl (\reg), %eax
+ movl 4(\reg), %edx
+.endm
+
+SYM_FUNC_START(atomic64_read_cx8)
+ read64 %ecx
+ RET
+SYM_FUNC_END(atomic64_read_cx8)
+
+SYM_FUNC_START(atomic64_set_cx8)
+1:
+/* we don't need LOCK_PREFIX since aligned 64-bit writes
+ * are atomic on 586 and newer */
+ cmpxchg8b (%esi)
+ jne 1b
+
+ RET
+SYM_FUNC_END(atomic64_set_cx8)
+
+SYM_FUNC_START(atomic64_xchg_cx8)
+1:
+ LOCK_PREFIX
+ cmpxchg8b (%esi)
+ jne 1b
+
+ RET
+SYM_FUNC_END(atomic64_xchg_cx8)
+
+.macro addsub_return func ins insc
+SYM_FUNC_START(atomic64_\func\()_return_cx8)
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ movl %eax, %esi
+ movl %edx, %edi
+ movl %ecx, %ebp
+
+ read64_nonatomic %ecx
+1:
+ movl %eax, %ebx
+ movl %edx, %ecx
+ \ins\()l %esi, %ebx
+ \insc\()l %edi, %ecx
+ LOCK_PREFIX
+ cmpxchg8b (%ebp)
+ jne 1b
+
+10:
+ movl %ebx, %eax
+ movl %ecx, %edx
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ RET
+SYM_FUNC_END(atomic64_\func\()_return_cx8)
+.endm
+
+addsub_return add add adc
+addsub_return sub sub sbb
+
+.macro incdec_return func ins insc
+SYM_FUNC_START(atomic64_\func\()_return_cx8)
+ pushl %ebx
+
+ read64_nonatomic %esi
+1:
+ movl %eax, %ebx
+ movl %edx, %ecx
+ \ins\()l $1, %ebx
+ \insc\()l $0, %ecx
+ LOCK_PREFIX
+ cmpxchg8b (%esi)
+ jne 1b
+
+10:
+ movl %ebx, %eax
+ movl %ecx, %edx
+ popl %ebx
+ RET
+SYM_FUNC_END(atomic64_\func\()_return_cx8)
+.endm
+
+incdec_return inc add adc
+incdec_return dec sub sbb
+
+SYM_FUNC_START(atomic64_dec_if_positive_cx8)
+ pushl %ebx
+
+ read64 %esi
+1:
+ movl %eax, %ebx
+ movl %edx, %ecx
+ subl $1, %ebx
+ sbb $0, %ecx
+ js 2f
+ LOCK_PREFIX
+ cmpxchg8b (%esi)
+ jne 1b
+
+2:
+ movl %ebx, %eax
+ movl %ecx, %edx
+ popl %ebx
+ RET
+SYM_FUNC_END(atomic64_dec_if_positive_cx8)
+
+SYM_FUNC_START(atomic64_add_unless_cx8)
+ pushl %ebp
+ pushl %ebx
+/* these just push these two parameters on the stack */
+ pushl %edi
+ pushl %ecx
+
+ movl %eax, %ebp
+ movl %edx, %edi
+
+ read64 %esi
+1:
+ cmpl %eax, 0(%esp)
+ je 4f
+2:
+ movl %eax, %ebx
+ movl %edx, %ecx
+ addl %ebp, %ebx
+ adcl %edi, %ecx
+ LOCK_PREFIX
+ cmpxchg8b (%esi)
+ jne 1b
+
+ movl $1, %eax
+3:
+ addl $8, %esp
+ popl %ebx
+ popl %ebp
+ RET
+4:
+ cmpl %edx, 4(%esp)
+ jne 2b
+ xorl %eax, %eax
+ jmp 3b
+SYM_FUNC_END(atomic64_add_unless_cx8)
+
+SYM_FUNC_START(atomic64_inc_not_zero_cx8)
+ pushl %ebx
+
+ read64 %esi
+1:
+ movl %eax, %ecx
+ orl %edx, %ecx
+ jz 3f
+ movl %eax, %ebx
+ xorl %ecx, %ecx
+ addl $1, %ebx
+ adcl %edx, %ecx
+ LOCK_PREFIX
+ cmpxchg8b (%esi)
+ jne 1b
+
+ movl $1, %eax
+3:
+ popl %ebx
+ RET
+SYM_FUNC_END(atomic64_inc_not_zero_cx8)
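The read64 macro above relies on a property of (locked) cmpxchg8b: even when the comparison fails and nothing is stored, the current 64-bit value is returned in edx:eax, so it doubles as an atomic 64-bit load on 32-bit CPUs. Roughly, in C (cmpxchg64() is again an assumed helper, not a real kernel symbol):

  #include <stdint.h>

  extern uint64_t cmpxchg64(uint64_t *p, uint64_t old, uint64_t new); /* assumed */

  static uint64_t read64_atomic(uint64_t *p)
  {
          /* If *p != 0 the exchange fails but still reports the value seen;
           * if *p == 0 it merely rewrites 0.  Either way the load is atomic. */
          return cmpxchg64(p, 0, 0);
  }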
diff --git a/arch/x86/lib/bhi.S b/arch/x86/lib/bhi.S
new file mode 100644
index 000000000000..aad1e5839202
--- /dev/null
+++ b/arch/x86/lib/bhi.S
@@ -0,0 +1,147 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/linkage.h>
+#include <asm/unwind_hints.h>
+#include <asm/nospec-branch.h>
+
+/*
+ * Notably, the FineIBT preamble calling these will have ZF set and eax zero.
+ *
+ * The very last element is in fact larger than 32 bytes, but since it's the
+ * last element, this does not matter.
+ *
+ * There are 2 #UD sites, located between 0,1-2,3 and 4,5-6,7 such that they
+ * can be reached using Jcc.d8; these elements (1 and 5) have sufficiently
+ * big alignment holes for this to not stagger the array.
+ */
+
+.pushsection .noinstr.text, "ax"
+
+ .align 32
+SYM_CODE_START(__bhi_args)
+
+#ifdef CONFIG_FINEIBT_BHI
+
+ .align 32
+SYM_INNER_LABEL(__bhi_args_0, SYM_L_LOCAL)
+ ANNOTATE_NOENDBR
+ UNWIND_HINT_FUNC
+ jne .Lud_1
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 32
+SYM_INNER_LABEL(__bhi_args_1, SYM_L_LOCAL)
+ ANNOTATE_NOENDBR
+ UNWIND_HINT_FUNC
+ jne .Lud_1
+ cmovne %rax, %rdi
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 8
+ ANNOTATE_REACHABLE
+.Lud_1: ud2
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 32
+SYM_INNER_LABEL(__bhi_args_2, SYM_L_LOCAL)
+ ANNOTATE_NOENDBR
+ UNWIND_HINT_FUNC
+ jne .Lud_1
+ cmovne %rax, %rdi
+ cmovne %rax, %rsi
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 32
+SYM_INNER_LABEL(__bhi_args_3, SYM_L_LOCAL)
+ ANNOTATE_NOENDBR
+ UNWIND_HINT_FUNC
+ jne .Lud_1
+ cmovne %rax, %rdi
+ cmovne %rax, %rsi
+ cmovne %rax, %rdx
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 32
+SYM_INNER_LABEL(__bhi_args_4, SYM_L_LOCAL)
+ ANNOTATE_NOENDBR
+ UNWIND_HINT_FUNC
+ jne .Lud_2
+ cmovne %rax, %rdi
+ cmovne %rax, %rsi
+ cmovne %rax, %rdx
+ cmovne %rax, %rcx
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 32
+SYM_INNER_LABEL(__bhi_args_5, SYM_L_LOCAL)
+ ANNOTATE_NOENDBR
+ UNWIND_HINT_FUNC
+ jne .Lud_2
+ cmovne %rax, %rdi
+ cmovne %rax, %rsi
+ cmovne %rax, %rdx
+ cmovne %rax, %rcx
+ cmovne %rax, %r8
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 8
+ ANNOTATE_REACHABLE
+.Lud_2: ud2
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 32
+SYM_INNER_LABEL(__bhi_args_6, SYM_L_LOCAL)
+ ANNOTATE_NOENDBR
+ UNWIND_HINT_FUNC
+ jne .Lud_2
+ cmovne %rax, %rdi
+ cmovne %rax, %rsi
+ cmovne %rax, %rdx
+ cmovne %rax, %rcx
+ cmovne %rax, %r8
+ cmovne %rax, %r9
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+ .align 32
+SYM_INNER_LABEL(__bhi_args_7, SYM_L_LOCAL)
+ ANNOTATE_NOENDBR
+ UNWIND_HINT_FUNC
+ jne .Lud_2
+ cmovne %rax, %rdi
+ cmovne %rax, %rsi
+ cmovne %rax, %rdx
+ cmovne %rax, %rcx
+ cmovne %rax, %r8
+ cmovne %rax, %r9
+ cmovne %rax, %rsp
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+
+#endif /* CONFIG_FINEIBT_BHI */
+
+ .align 32
+SYM_INNER_LABEL(__bhi_args_end, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
+ nop /* Work around toolchain+objtool quirk */
+SYM_CODE_END(__bhi_args)
+
+.popsection
diff --git a/arch/x86/lib/cache-smp.c b/arch/x86/lib/cache-smp.c
new file mode 100644
index 000000000000..824664c0ecbd
--- /dev/null
+++ b/arch/x86/lib/cache-smp.c
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <asm/paravirt.h>
+#include <linux/smp.h>
+#include <linux/export.h>
+#include <linux/kvm_types.h>
+
+static void __wbinvd(void *dummy)
+{
+ wbinvd();
+}
+
+void wbinvd_on_cpu(int cpu)
+{
+ smp_call_function_single(cpu, __wbinvd, NULL, 1);
+}
+EXPORT_SYMBOL_FOR_KVM(wbinvd_on_cpu);
+
+void wbinvd_on_all_cpus(void)
+{
+ on_each_cpu(__wbinvd, NULL, 1);
+}
+EXPORT_SYMBOL(wbinvd_on_all_cpus);
+
+void wbinvd_on_cpus_mask(struct cpumask *cpus)
+{
+ on_each_cpu_mask(cpus, __wbinvd, NULL, 1);
+}
+EXPORT_SYMBOL_FOR_KVM(wbinvd_on_cpus_mask);
+
+static void __wbnoinvd(void *dummy)
+{
+ wbnoinvd();
+}
+
+void wbnoinvd_on_all_cpus(void)
+{
+ on_each_cpu(__wbnoinvd, NULL, 1);
+}
+EXPORT_SYMBOL_FOR_KVM(wbnoinvd_on_all_cpus);
+
+void wbnoinvd_on_cpus_mask(struct cpumask *cpus)
+{
+ on_each_cpu_mask(cpus, __wbnoinvd, NULL, 1);
+}
+EXPORT_SYMBOL_FOR_KVM(wbnoinvd_on_cpus_mask);
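All of the helpers in cache-smp.c are the same two-part pattern: a trivial callback that runs the privileged instruction, plus an IPI broadcast that invokes it on the desired CPUs. A hypothetical driver-side use of the same pattern (my_flush() and my_flush_everywhere() are made up, not kernel APIs):

  #include <linux/smp.h>

  static void my_flush(void *unused)
  {
          /* runs on each targeted CPU, in IPI context */
  }

  static void my_flush_everywhere(void)
  {
          on_each_cpu(my_flush, NULL, 1);   /* 1 == wait for all CPUs to finish */
  }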
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
index adbccd0bbb78..68f7fa3e1322 100644
--- a/arch/x86/lib/checksum_32.S
+++ b/arch/x86/lib/checksum_32.S
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -18,17 +19,14 @@
* handling.
* Andi Kleen, add zeroing on error
* converted to pure assembler
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
+#include <linux/export.h>
#include <linux/linkage.h>
-#include <asm/dwarf2.h>
#include <asm/errno.h>
-
+#include <asm/asm.h>
+#include <asm/nospec-branch.h>
+
/*
* computes a partial checksum, e.g. for TCP/UDP fragments
*/
@@ -48,21 +46,16 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
* Fortunately, it is easy to convert 2-byte alignment to 4-byte
* alignment for the unrolled loop.
*/
-ENTRY(csum_partial)
- CFI_STARTPROC
+SYM_FUNC_START(csum_partial)
pushl %esi
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET esi, 0
pushl %ebx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ebx, 0
movl 20(%esp),%eax # Function arg: unsigned int sum
movl 16(%esp),%ecx # Function arg: int len
movl 12(%esp),%esi # Function arg: unsigned char *buff
testl $3, %esi # Check alignment.
jz 2f # Jump if alignment is ok.
testl $1, %esi # Check alignment.
- jz 10f # Jump if alignment is boundary of 2bytes.
+ jz 10f # Jump if alignment is boundary of 2 bytes.
# buf is odd
dec %ecx
@@ -128,32 +121,22 @@ ENTRY(csum_partial)
6: addl %ecx,%eax
adcl $0, %eax
7:
- testl $1, 12(%esp)
+ testb $1, 12(%esp)
jz 8f
roll $8, %eax
8:
popl %ebx
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE ebx
popl %esi
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE esi
- ret
- CFI_ENDPROC
-ENDPROC(csum_partial)
+ RET
+SYM_FUNC_END(csum_partial)
#else
/* Version for PentiumII/PPro */
-ENTRY(csum_partial)
- CFI_STARTPROC
+SYM_FUNC_START(csum_partial)
pushl %esi
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET esi, 0
pushl %ebx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ebx, 0
movl 20(%esp),%eax # Function arg: unsigned int sum
movl 16(%esp),%ecx # Function arg: int len
movl 12(%esp),%esi # Function arg: const unsigned char *buf
@@ -170,7 +153,7 @@ ENTRY(csum_partial)
negl %ebx
lea 45f(%ebx,%ebx,2), %ebx
testl %esi, %esi
- jmp *%ebx
+ JMP_NOSPEC ebx
# Handle 2-byte-aligned regions
20: addw (%esi), %ax
@@ -256,82 +239,55 @@ ENTRY(csum_partial)
addl %ebx,%eax
adcl $0,%eax
80:
- testl $1, 12(%esp)
+ testb $1, 12(%esp)
jz 90f
roll $8, %eax
90:
popl %ebx
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE ebx
popl %esi
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE esi
- ret
- CFI_ENDPROC
-ENDPROC(csum_partial)
+ RET
+SYM_FUNC_END(csum_partial)
#endif
+EXPORT_SYMBOL(csum_partial)
/*
unsigned int csum_partial_copy_generic (const char *src, char *dst,
- int len, int sum, int *src_err_ptr, int *dst_err_ptr)
+ int len)
*/
/*
* Copy from ds while checksumming, otherwise like csum_partial
- *
- * The macros SRC and DST specify the type of access for the instruction.
- * thus we can call a custom exception handler for all access types.
- *
- * FIXME: could someone double-check whether I haven't mixed up some SRC and
- * DST definitions? It's damn hard to trigger all cases. I hope I got
- * them all but there's no guarantee.
*/
-#define SRC(y...) \
- 9999: y; \
- .section __ex_table, "a"; \
- .long 9999b, 6001f ; \
- .previous
-
-#define DST(y...) \
- 9999: y; \
- .section __ex_table, "a"; \
- .long 9999b, 6002f ; \
- .previous
+#define EXC(y...) \
+ 9999: y; \
+ _ASM_EXTABLE_TYPE(9999b, 7f, EX_TYPE_UACCESS | EX_FLAG_CLEAR_AX)
#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
#define ARGBASE 16
#define FP 12
-ENTRY(csum_partial_copy_generic)
- CFI_STARTPROC
+SYM_FUNC_START(csum_partial_copy_generic)
subl $4,%esp
- CFI_ADJUST_CFA_OFFSET 4
pushl %edi
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET edi, 0
pushl %esi
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET esi, 0
pushl %ebx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ebx, 0
- movl ARGBASE+16(%esp),%eax # sum
movl ARGBASE+12(%esp),%ecx # len
movl ARGBASE+4(%esp),%esi # src
movl ARGBASE+8(%esp),%edi # dst
+ movl $-1, %eax # sum
testl $2, %edi # Check alignment.
jz 2f # Jump if alignment is ok.
subl $2, %ecx # Alignment uses up two bytes.
jae 1f # Jump if we had at least two bytes.
addl $2, %ecx # ecx was < 2. Deal with it.
jmp 4f
-SRC(1: movw (%esi), %bx )
+EXC(1: movw (%esi), %bx )
addl $2, %esi
-DST( movw %bx, (%edi) )
+EXC( movw %bx, (%edi) )
addl $2, %edi
addw %bx, %ax
adcl $0, %eax
@@ -339,34 +295,34 @@ DST( movw %bx, (%edi) )
movl %ecx, FP(%esp)
shrl $5, %ecx
jz 2f
- testl %esi, %esi
-SRC(1: movl (%esi), %ebx )
-SRC( movl 4(%esi), %edx )
+ testl %esi, %esi # what's wrong with clc?
+EXC(1: movl (%esi), %ebx )
+EXC( movl 4(%esi), %edx )
adcl %ebx, %eax
-DST( movl %ebx, (%edi) )
+EXC( movl %ebx, (%edi) )
adcl %edx, %eax
-DST( movl %edx, 4(%edi) )
+EXC( movl %edx, 4(%edi) )
-SRC( movl 8(%esi), %ebx )
-SRC( movl 12(%esi), %edx )
+EXC( movl 8(%esi), %ebx )
+EXC( movl 12(%esi), %edx )
adcl %ebx, %eax
-DST( movl %ebx, 8(%edi) )
+EXC( movl %ebx, 8(%edi) )
adcl %edx, %eax
-DST( movl %edx, 12(%edi) )
+EXC( movl %edx, 12(%edi) )
-SRC( movl 16(%esi), %ebx )
-SRC( movl 20(%esi), %edx )
+EXC( movl 16(%esi), %ebx )
+EXC( movl 20(%esi), %edx )
adcl %ebx, %eax
-DST( movl %ebx, 16(%edi) )
+EXC( movl %ebx, 16(%edi) )
adcl %edx, %eax
-DST( movl %edx, 20(%edi) )
+EXC( movl %edx, 20(%edi) )
-SRC( movl 24(%esi), %ebx )
-SRC( movl 28(%esi), %edx )
+EXC( movl 24(%esi), %ebx )
+EXC( movl 28(%esi), %edx )
adcl %ebx, %eax
-DST( movl %ebx, 24(%edi) )
+EXC( movl %ebx, 24(%edi) )
adcl %edx, %eax
-DST( movl %edx, 28(%edi) )
+EXC( movl %edx, 28(%edi) )
lea 32(%esi), %esi
lea 32(%edi), %edi
@@ -378,9 +334,9 @@ DST( movl %edx, 28(%edi) )
andl $0x1c, %edx
je 4f
shrl $2, %edx # This clears CF
-SRC(3: movl (%esi), %ebx )
+EXC(3: movl (%esi), %ebx )
adcl %ebx, %eax
-DST( movl %ebx, (%edi) )
+EXC( movl %ebx, (%edi) )
lea 4(%esi), %esi
lea 4(%edi), %edi
dec %edx
@@ -390,88 +346,49 @@ DST( movl %ebx, (%edi) )
jz 7f
cmpl $2, %ecx
jb 5f
-SRC( movw (%esi), %cx )
+EXC( movw (%esi), %cx )
leal 2(%esi), %esi
-DST( movw %cx, (%edi) )
+EXC( movw %cx, (%edi) )
leal 2(%edi), %edi
je 6f
shll $16,%ecx
-SRC(5: movb (%esi), %cl )
-DST( movb %cl, (%edi) )
+EXC(5: movb (%esi), %cl )
+EXC( movb %cl, (%edi) )
6: addl %ecx, %eax
adcl $0, %eax
7:
-5000:
-
-# Exception handler:
-.section .fixup, "ax"
-
-6001:
- movl ARGBASE+20(%esp), %ebx # src_err_ptr
- movl $-EFAULT, (%ebx)
-
- # zero the complete destination - computing the rest
- # is too much work
- movl ARGBASE+8(%esp), %edi # dst
- movl ARGBASE+12(%esp), %ecx # len
- xorl %eax,%eax
- rep ; stosb
-
- jmp 5000b
-
-6002:
- movl ARGBASE+24(%esp), %ebx # dst_err_ptr
- movl $-EFAULT,(%ebx)
- jmp 5000b
-
-.previous
popl %ebx
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE ebx
popl %esi
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE esi
popl %edi
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE edi
popl %ecx # equivalent to addl $4,%esp
- CFI_ADJUST_CFA_OFFSET -4
- ret
- CFI_ENDPROC
-ENDPROC(csum_partial_copy_generic)
+ RET
+SYM_FUNC_END(csum_partial_copy_generic)
#else
/* Version for PentiumII/PPro */
#define ROUND1(x) \
- SRC(movl x(%esi), %ebx ) ; \
+ EXC(movl x(%esi), %ebx ) ; \
addl %ebx, %eax ; \
- DST(movl %ebx, x(%edi) ) ;
+ EXC(movl %ebx, x(%edi) ) ;
#define ROUND(x) \
- SRC(movl x(%esi), %ebx ) ; \
+ EXC(movl x(%esi), %ebx ) ; \
adcl %ebx, %eax ; \
- DST(movl %ebx, x(%edi) ) ;
+ EXC(movl %ebx, x(%edi) ) ;
#define ARGBASE 12
-ENTRY(csum_partial_copy_generic)
- CFI_STARTPROC
+SYM_FUNC_START(csum_partial_copy_generic)
pushl %ebx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ebx, 0
pushl %edi
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET edi, 0
pushl %esi
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET esi, 0
movl ARGBASE+4(%esp),%esi #src
movl ARGBASE+8(%esp),%edi #dst
movl ARGBASE+12(%esp),%ecx #len
- movl ARGBASE+16(%esp),%eax #sum
+ movl $-1, %eax #sum
# movl %ecx, %edx
movl %ecx, %ebx
movl %esi, %edx
@@ -484,10 +401,10 @@ ENTRY(csum_partial_copy_generic)
andl $-32,%edx
lea 3f(%ebx,%ebx), %ebx
testl %esi, %esi
- jmp *%ebx
+ JMP_NOSPEC ebx
1: addl $64,%esi
addl $64,%edi
- SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl)
+ EXC(movb -32(%edx),%bl) ; EXC(movb (%edx),%bl)
ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52)
ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36)
ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20)
@@ -501,46 +418,27 @@ ENTRY(csum_partial_copy_generic)
jz 7f
cmpl $2, %edx
jb 5f
-SRC( movw (%esi), %dx )
+EXC( movw (%esi), %dx )
leal 2(%esi), %esi
-DST( movw %dx, (%edi) )
+EXC( movw %dx, (%edi) )
leal 2(%edi), %edi
je 6f
shll $16,%edx
5:
-SRC( movb (%esi), %dl )
-DST( movb %dl, (%edi) )
+EXC( movb (%esi), %dl )
+EXC( movb %dl, (%edi) )
6: addl %edx, %eax
adcl $0, %eax
7:
-.section .fixup, "ax"
-6001: movl ARGBASE+20(%esp), %ebx # src_err_ptr
- movl $-EFAULT, (%ebx)
- # zero the complete destination (computing the rest is too much work)
- movl ARGBASE+8(%esp),%edi # dst
- movl ARGBASE+12(%esp),%ecx # len
- xorl %eax,%eax
- rep; stosb
- jmp 7b
-6002: movl ARGBASE+24(%esp), %ebx # dst_err_ptr
- movl $-EFAULT, (%ebx)
- jmp 7b
-.previous
popl %esi
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE esi
popl %edi
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE edi
popl %ebx
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE ebx
- ret
- CFI_ENDPROC
-ENDPROC(csum_partial_copy_generic)
+ RET
+SYM_FUNC_END(csum_partial_copy_generic)
#undef ROUND
#undef ROUND1
#endif
+EXPORT_SYMBOL(csum_partial_copy_generic)
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index ebeafcce04a9..a508e4a8c66a 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -1,21 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#include <linux/export.h>
#include <linux/linkage.h>
-#include <asm/dwarf2.h>
+#include <linux/cfi_types.h>
+#include <linux/objtool.h>
+#include <asm/asm.h>
/*
- * Zero a page.
- * rdi page
- */
-ENTRY(clear_page_c)
- CFI_STARTPROC
+ * Most CPUs support enhanced REP MOVSB/STOSB instructions. It is
+ * recommended to use this when possible and we do use them by default.
+ * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
+ * Otherwise, use original.
+ */
+
+/*
+ * Zero a page.
+ * %rdi - page
+ */
+SYM_TYPED_FUNC_START(clear_page_rep)
movl $4096/8,%ecx
xorl %eax,%eax
rep stosq
- ret
- CFI_ENDPROC
-ENDPROC(clear_page_c)
+ RET
+SYM_FUNC_END(clear_page_rep)
+EXPORT_SYMBOL_GPL(clear_page_rep)
-ENTRY(clear_page)
- CFI_STARTPROC
+SYM_TYPED_FUNC_START(clear_page_orig)
xorl %eax,%eax
movl $4096/64,%ecx
.p2align 4
@@ -33,26 +42,103 @@ ENTRY(clear_page)
leaq 64(%rdi),%rdi
jnz .Lloop
nop
- ret
- CFI_ENDPROC
-.Lclear_page_end:
-ENDPROC(clear_page)
-
- /* Some CPUs run faster using the string instructions.
- It is also a lot simpler. Use this when possible */
-
-#include <asm/cpufeature.h>
-
- .section .altinstr_replacement,"ax"
-1: .byte 0xeb /* jmp <disp8> */
- .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */
-2:
- .previous
- .section .altinstructions,"a"
- .align 8
- .quad clear_page
- .quad 1b
- .byte X86_FEATURE_REP_GOOD
- .byte .Lclear_page_end - clear_page
- .byte 2b - 1b
- .previous
+ RET
+SYM_FUNC_END(clear_page_orig)
+EXPORT_SYMBOL_GPL(clear_page_orig)
+
+SYM_TYPED_FUNC_START(clear_page_erms)
+ movl $4096,%ecx
+ xorl %eax,%eax
+ rep stosb
+ RET
+SYM_FUNC_END(clear_page_erms)
+EXPORT_SYMBOL_GPL(clear_page_erms)
+
+/*
+ * Default clear user-space.
+ * Input:
+ * rdi destination
+ * rcx count
+ * rax is zero
+ *
+ * Output:
+ * rcx: uncleared bytes or 0 if successful.
+ */
+SYM_FUNC_START(rep_stos_alternative)
+ ANNOTATE_NOENDBR
+ cmpq $64,%rcx
+ jae .Lunrolled
+
+ cmp $8,%ecx
+ jae .Lword
+
+ testl %ecx,%ecx
+ je .Lexit
+
+.Lclear_user_tail:
+0: movb %al,(%rdi)
+ inc %rdi
+ dec %rcx
+ jnz .Lclear_user_tail
+.Lexit:
+ RET
+
+ _ASM_EXTABLE_UA( 0b, .Lexit)
+
+.Lword:
+1: movq %rax,(%rdi)
+ addq $8,%rdi
+ sub $8,%ecx
+ je .Lexit
+ cmp $8,%ecx
+ jae .Lword
+ jmp .Lclear_user_tail
+
+ .p2align 4
+.Lunrolled:
+10: movq %rax,(%rdi)
+11: movq %rax,8(%rdi)
+12: movq %rax,16(%rdi)
+13: movq %rax,24(%rdi)
+14: movq %rax,32(%rdi)
+15: movq %rax,40(%rdi)
+16: movq %rax,48(%rdi)
+17: movq %rax,56(%rdi)
+ addq $64,%rdi
+ subq $64,%rcx
+ cmpq $64,%rcx
+ jae .Lunrolled
+ cmpl $8,%ecx
+ jae .Lword
+ testl %ecx,%ecx
+ jne .Lclear_user_tail
+ RET
+
+ /*
+ * If we take an exception on any of the
+ * word stores, we know that %rcx isn't zero,
+ * so we can just go to the tail clearing to
+ * get the exact count.
+ *
+ * The unrolled case might end up clearing
+ * some bytes twice. Don't care.
+ *
+ * We could use the value in %rdi to avoid
+ * a second fault on the exact count case,
+ * but do we really care? No.
+ *
+ * Finally, we could try to align %rdi at the
+ * top of the unrolling. But unaligned stores
+ * just aren't that common or expensive.
+ */
+ _ASM_EXTABLE_UA( 1b, .Lclear_user_tail)
+ _ASM_EXTABLE_UA(10b, .Lclear_user_tail)
+ _ASM_EXTABLE_UA(11b, .Lclear_user_tail)
+ _ASM_EXTABLE_UA(12b, .Lclear_user_tail)
+ _ASM_EXTABLE_UA(13b, .Lclear_user_tail)
+ _ASM_EXTABLE_UA(14b, .Lclear_user_tail)
+ _ASM_EXTABLE_UA(15b, .Lclear_user_tail)
+ _ASM_EXTABLE_UA(16b, .Lclear_user_tail)
+ _ASM_EXTABLE_UA(17b, .Lclear_user_tail)
+SYM_FUNC_END(rep_stos_alternative)
+EXPORT_SYMBOL(rep_stos_alternative)
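The structure of rep_stos_alternative() above (64-byte unrolled stores, then 8-byte words, then a byte tail, with every fault routed to the byte tail so that %rcx ends up holding exactly the number of uncleared bytes) corresponds to roughly the following C. This is a sketch only; the real fault handling goes through the exception-table entries listed above:

  static unsigned long clear_bytes_sketch(unsigned char *dst, unsigned long count)
  {
          while (count >= 64) {                   /* .Lunrolled */
                  for (int i = 0; i < 8; i++)
                          ((unsigned long *)dst)[i] = 0;
                  dst   += 64;
                  count -= 64;
          }
          while (count >= 8) {                    /* .Lword */
                  *(unsigned long *)dst = 0;
                  dst   += 8;
                  count -= 8;
          }
          while (count) {                         /* .Lclear_user_tail */
                  *dst++ = 0;
                  count--;
          }
          return count;                           /* 0 == everything cleared */
  }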
diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c
new file mode 100644
index 000000000000..c65cd5550454
--- /dev/null
+++ b/arch/x86/lib/cmdline.c
@@ -0,0 +1,235 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * Misc librarized functions for cmdline poking.
+ */
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+
+#include <asm/setup.h>
+#include <asm/cmdline.h>
+#include <asm/bug.h>
+
+static inline int myisspace(u8 c)
+{
+ return c <= ' '; /* Close enough approximation */
+}
+
+/*
+ * Find a boolean option (like quiet,noapic,nosmp....)
+ *
+ * @cmdline: the cmdline string
+ * @max_cmdline_size: the maximum size of cmdline
+ * @option: option string to look for
+ *
+ * Returns the position of that @option (starts counting with 1)
+ * or 0 if it is not found.  @option will only be found if it appears
+ * as an entire word in @cmdline. For instance, if @option="car"
+ * then a cmdline which contains "cart" will not match.
+ */
+static int
+__cmdline_find_option_bool(const char *cmdline, int max_cmdline_size,
+ const char *option)
+{
+ char c;
+ int pos = 0, wstart = 0;
+ const char *opptr = NULL;
+ enum {
+ st_wordstart = 0, /* Start of word/after whitespace */
+ st_wordcmp, /* Comparing this word */
+ st_wordskip, /* Miscompare, skip */
+ } state = st_wordstart;
+
+ if (!cmdline)
+ return -1; /* No command line */
+
+ /*
+ * This 'pos' check ensures we do not overrun
+ * a non-NULL-terminated 'cmdline'
+ */
+ while (pos < max_cmdline_size) {
+ c = *(char *)cmdline++;
+ pos++;
+
+ switch (state) {
+ case st_wordstart:
+ if (!c)
+ return 0;
+ else if (myisspace(c))
+ break;
+
+ state = st_wordcmp;
+ opptr = option;
+ wstart = pos;
+ fallthrough;
+
+ case st_wordcmp:
+ if (!*opptr) {
+ /*
+ * We matched all the way to the end of the
+ * option we were looking for. If the
+ * command-line has a space _or_ ends, then
+ * we matched!
+ */
+ if (!c || myisspace(c))
+ return wstart;
+ /*
+ * We hit the end of the option, but _not_
+ * the end of a word on the cmdline. Not
+ * a match.
+ */
+ } else if (!c) {
+ /*
+ * Hit the NULL terminator on the end of
+ * cmdline.
+ */
+ return 0;
+ } else if (c == *opptr++) {
+ /*
+ * We are currently matching, so continue
+ * to the next character on the cmdline.
+ */
+ break;
+ }
+ state = st_wordskip;
+ fallthrough;
+
+ case st_wordskip:
+ if (!c)
+ return 0;
+ else if (myisspace(c))
+ state = st_wordstart;
+ break;
+ }
+ }
+
+ return 0; /* Buffer overrun */
+}
+
+/*
+ * Find a non-boolean option (i.e. option=argument). In accordance with
+ * standard Linux practice, if this option is repeated, this returns the
+ * last instance on the command line.
+ *
+ * @cmdline: the cmdline string
+ * @max_cmdline_size: the maximum size of cmdline
+ * @option: option string to look for
+ * @buffer: memory buffer to return the option argument
+ * @bufsize: size of the supplied memory buffer
+ *
+ * Returns the length of the argument (regardless of if it was
+ * truncated to fit in the buffer), or -1 on not found.
+ */
+static int
+__cmdline_find_option(const char *cmdline, int max_cmdline_size,
+ const char *option, char *buffer, int bufsize)
+{
+ char c;
+ int pos = 0, len = -1;
+ const char *opptr = NULL;
+ char *bufptr = buffer;
+ enum {
+ st_wordstart = 0, /* Start of word/after whitespace */
+ st_wordcmp, /* Comparing this word */
+ st_wordskip, /* Miscompare, skip */
+ st_bufcpy, /* Copying this to buffer */
+ } state = st_wordstart;
+
+ if (!cmdline)
+ return -1; /* No command line */
+
+ /*
+ * This 'pos' check ensures we do not overrun
+ * a non-NULL-terminated 'cmdline'
+ */
+ while (pos++ < max_cmdline_size) {
+ c = *(char *)cmdline++;
+ if (!c)
+ break;
+
+ switch (state) {
+ case st_wordstart:
+ if (myisspace(c))
+ break;
+
+ state = st_wordcmp;
+ opptr = option;
+ fallthrough;
+
+ case st_wordcmp:
+ if ((c == '=') && !*opptr) {
+ /*
+ * We matched all the way to the end of the
+ * option we were looking for, prepare to
+ * copy the argument.
+ */
+ len = 0;
+ bufptr = buffer;
+ state = st_bufcpy;
+ break;
+ } else if (c == *opptr++) {
+ /*
+ * We are currently matching, so continue
+ * to the next character on the cmdline.
+ */
+ break;
+ }
+ state = st_wordskip;
+ fallthrough;
+
+ case st_wordskip:
+ if (myisspace(c))
+ state = st_wordstart;
+ break;
+
+ case st_bufcpy:
+ if (myisspace(c)) {
+ state = st_wordstart;
+ } else {
+ /*
+ * Increment len, but don't overrun the
+ * supplied buffer and leave room for the
+ * NULL terminator.
+ */
+ if (++len < bufsize)
+ *bufptr++ = c;
+ }
+ break;
+ }
+ }
+
+ if (bufsize)
+ *bufptr = '\0';
+
+ return len;
+}
+
+int cmdline_find_option_bool(const char *cmdline, const char *option)
+{
+ int ret;
+
+ ret = __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option);
+ if (ret > 0)
+ return ret;
+
+ if (IS_ENABLED(CONFIG_CMDLINE_BOOL) && !builtin_cmdline_added)
+ return __cmdline_find_option_bool(builtin_cmdline, COMMAND_LINE_SIZE, option);
+
+ return ret;
+}
+
+int cmdline_find_option(const char *cmdline, const char *option, char *buffer,
+ int bufsize)
+{
+ int ret;
+
+ ret = __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option, buffer, bufsize);
+ if (ret > 0)
+ return ret;
+
+ if (IS_ENABLED(CONFIG_CMDLINE_BOOL) && !builtin_cmdline_added)
+ return __cmdline_find_option(builtin_cmdline, COMMAND_LINE_SIZE, option, buffer, bufsize);
+
+ return ret;
+}
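Both exported helpers exist for very early boot code that must inspect the raw command line before the normal parameter parsing has run. A usage sketch (the option names are just examples):

  #include <asm/cmdline.h>

  static void early_param_peek(const char *cmdline)
  {
          char arg[32];

          /* whole-word boolean option, e.g. "... nokaslr ..." */
          if (cmdline_find_option_bool(cmdline, "nokaslr"))
                  ;       /* disable randomization */

          /* option with an argument, e.g. "console=ttyS0,115200" */
          if (cmdline_find_option(cmdline, "console", arg, sizeof(arg)) > 0)
                  ;       /* arg now holds "ttyS0,115200" */
  }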
diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S
new file mode 100644
index 000000000000..4fb44894ad87
--- /dev/null
+++ b/arch/x86/lib/cmpxchg16b_emu.S
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#include <linux/linkage.h>
+#include <asm/percpu.h>
+#include <asm/processor-flags.h>
+
+.text
+
+/*
+ * Emulate 'cmpxchg16b %gs:(%rsi)'
+ *
+ * Inputs:
+ * %rsi : memory location to compare
+ * %rax : low 64 bits of old value
+ * %rdx : high 64 bits of old value
+ * %rbx : low 64 bits of new value
+ * %rcx : high 64 bits of new value
+ *
+ * Notably this is not LOCK prefixed and is not safe against NMIs
+ */
+SYM_FUNC_START(this_cpu_cmpxchg16b_emu)
+
+ pushfq
+ cli
+
+ /* if (*ptr == old) */
+ cmpq __percpu (%rsi), %rax
+ jne .Lnot_same
+ cmpq __percpu 8(%rsi), %rdx
+ jne .Lnot_same
+
+ /* *ptr = new */
+ movq %rbx, __percpu (%rsi)
+ movq %rcx, __percpu 8(%rsi)
+
+ /* set ZF in EFLAGS to indicate success */
+ orl $X86_EFLAGS_ZF, (%rsp)
+
+ popfq
+ RET
+
+.Lnot_same:
+ /* *ptr != old */
+
+ /* old = *ptr */
+ movq __percpu (%rsi), %rax
+ movq __percpu 8(%rsi), %rdx
+
+ /* clear ZF in EFLAGS to indicate failure */
+ andl $(~X86_EFLAGS_ZF), (%rsp)
+
+ popfq
+ RET
+
+SYM_FUNC_END(this_cpu_cmpxchg16b_emu)
diff --git a/arch/x86/lib/cmpxchg8b_emu.S b/arch/x86/lib/cmpxchg8b_emu.S
new file mode 100644
index 000000000000..d4bb24347ff8
--- /dev/null
+++ b/arch/x86/lib/cmpxchg8b_emu.S
@@ -0,0 +1,97 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#include <linux/export.h>
+#include <linux/linkage.h>
+#include <asm/percpu.h>
+#include <asm/processor-flags.h>
+
+.text
+
+#ifndef CONFIG_X86_CX8
+
+/*
+ * Emulate 'cmpxchg8b (%esi)' on UP
+ *
+ * Inputs:
+ * %esi : memory location to compare
+ * %eax : low 32 bits of old value
+ * %edx : high 32 bits of old value
+ * %ebx : low 32 bits of new value
+ * %ecx : high 32 bits of new value
+ */
+SYM_FUNC_START(cmpxchg8b_emu)
+
+ pushfl
+ cli
+
+ cmpl (%esi), %eax
+ jne .Lnot_same
+ cmpl 4(%esi), %edx
+ jne .Lnot_same
+
+ movl %ebx, (%esi)
+ movl %ecx, 4(%esi)
+
+ orl $X86_EFLAGS_ZF, (%esp)
+
+ popfl
+ RET
+
+.Lnot_same:
+ movl (%esi), %eax
+ movl 4(%esi), %edx
+
+ andl $(~X86_EFLAGS_ZF), (%esp)
+
+ popfl
+ RET
+
+SYM_FUNC_END(cmpxchg8b_emu)
+EXPORT_SYMBOL(cmpxchg8b_emu)
+
+#endif
+
+#ifndef CONFIG_UML
+
+/*
+ * Emulate 'cmpxchg8b %fs:(%rsi)'
+ *
+ * Inputs:
+ * %esi : memory location to compare
+ * %eax : low 32 bits of old value
+ * %edx : high 32 bits of old value
+ * %ebx : low 32 bits of new value
+ * %ecx : high 32 bits of new value
+ *
+ * Notably this is not LOCK prefixed and is not safe against NMIs
+ */
+SYM_FUNC_START(this_cpu_cmpxchg8b_emu)
+
+ pushfl
+ cli
+
+ cmpl __percpu (%esi), %eax
+ jne .Lnot_same2
+ cmpl __percpu 4(%esi), %edx
+ jne .Lnot_same2
+
+ movl %ebx, __percpu (%esi)
+ movl %ecx, __percpu 4(%esi)
+
+ orl $X86_EFLAGS_ZF, (%esp)
+
+ popfl
+ RET
+
+.Lnot_same2:
+ movl __percpu (%esi), %eax
+ movl __percpu 4(%esi), %edx
+
+ andl $(~X86_EFLAGS_ZF), (%esp)
+
+ popfl
+ RET
+
+SYM_FUNC_END(this_cpu_cmpxchg8b_emu)
+
+#endif
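Both emulations above make the compare-and-swap appear atomic on a uniprocessor by running it with interrupts disabled and then reporting success or failure through ZF, just as the real instruction would. The cmpxchg8b_emu() logic expressed in C, for illustration only (UP-only, kernel context assumed):

  #include <linux/irqflags.h>
  #include <linux/types.h>

  /* Returns true (ZF set) when *p matched *old and was replaced by new;
   * on failure *old is updated with the value actually seen, like edx:eax. */
  static bool cmpxchg8b_emu_sketch(u64 *p, u64 *old, u64 new)
  {
          unsigned long flags;
          bool ok;

          local_irq_save(flags);          /* pushfl; cli */
          ok = (*p == *old);
          if (ok)
                  *p = new;               /* movl %ebx,(%esi); movl %ecx,4(%esi) */
          else
                  *old = *p;
          local_irq_restore(flags);       /* popfl */
          return ok;
  }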
diff --git a/arch/x86/lib/copy_mc.c b/arch/x86/lib/copy_mc.c
new file mode 100644
index 000000000000..97e88e58567b
--- /dev/null
+++ b/arch/x86/lib/copy_mc.c
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2016-2020 Intel Corporation. All rights reserved. */
+
+#include <linux/jump_label.h>
+#include <linux/uaccess.h>
+#include <linux/export.h>
+#include <linux/instrumented.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+#include <asm/mce.h>
+
+#ifdef CONFIG_X86_MCE
+static DEFINE_STATIC_KEY_FALSE(copy_mc_fragile_key);
+
+void enable_copy_mc_fragile(void)
+{
+ static_branch_inc(&copy_mc_fragile_key);
+}
+#define copy_mc_fragile_enabled (static_branch_unlikely(&copy_mc_fragile_key))
+
+/*
+ * Similar to copy_user_handle_tail, probe for the write fault point, or
+ * source exception point.
+ */
+__visible notrace unsigned long
+copy_mc_fragile_handle_tail(char *to, char *from, unsigned len)
+{
+ for (; len; --len, to++, from++)
+ if (copy_mc_fragile(to, from, 1))
+ break;
+ return len;
+}
+#else
+/*
+ * No point in doing careful copying, or consulting a static key when
+ * there is no #MC handler in the CONFIG_X86_MCE=n case.
+ */
+void enable_copy_mc_fragile(void)
+{
+}
+#define copy_mc_fragile_enabled (0)
+#endif
+
+unsigned long copy_mc_enhanced_fast_string(void *dst, const void *src, unsigned len);
+
+/**
+ * copy_mc_to_kernel - memory copy that handles source exceptions
+ *
+ * @dst: destination address
+ * @src: source address
+ * @len: number of bytes to copy
+ *
+ * Call into the 'fragile' version on systems that benefit from avoiding
+ * corner case poison consumption scenarios. For example, accessing
+ * poison across 2 cachelines with a single instruction. Almost all
+ * other use cases can use copy_mc_enhanced_fast_string() for a fast
+ * recoverable copy, or fall back to plain memcpy.
+ *
+ * Return 0 for success, or number of bytes not copied if there was an
+ * exception.
+ */
+unsigned long __must_check copy_mc_to_kernel(void *dst, const void *src, unsigned len)
+{
+ unsigned long ret;
+
+ if (copy_mc_fragile_enabled) {
+ instrument_memcpy_before(dst, src, len);
+ ret = copy_mc_fragile(dst, src, len);
+ instrument_memcpy_after(dst, src, len, ret);
+ return ret;
+ }
+ if (static_cpu_has(X86_FEATURE_ERMS)) {
+ instrument_memcpy_before(dst, src, len);
+ ret = copy_mc_enhanced_fast_string(dst, src, len);
+ instrument_memcpy_after(dst, src, len, ret);
+ return ret;
+ }
+ memcpy(dst, src, len);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(copy_mc_to_kernel);
+
+unsigned long __must_check copy_mc_to_user(void __user *dst, const void *src, unsigned len)
+{
+ unsigned long ret;
+
+ if (copy_mc_fragile_enabled) {
+ instrument_copy_to_user(dst, src, len);
+ __uaccess_begin();
+ ret = copy_mc_fragile((__force void *)dst, src, len);
+ __uaccess_end();
+ return ret;
+ }
+
+ if (static_cpu_has(X86_FEATURE_ERMS)) {
+ instrument_copy_to_user(dst, src, len);
+ __uaccess_begin();
+ ret = copy_mc_enhanced_fast_string((__force void *)dst, src, len);
+ __uaccess_end();
+ return ret;
+ }
+
+ return copy_user_generic((__force void *)dst, src, len);
+	return copy_user_generic((__force void *)dst, src, len);
+}
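copy_mc_to_kernel() behaves like memcpy() except that consuming poison (or faulting) on the source aborts the copy, and the return value is the number of bytes that were not copied. A caller sketch (the function and buffer names are made up):

  #include <linux/errno.h>
  #include <linux/uaccess.h>

  static int read_possibly_poisoned(void *dst, const void *pmem_src, unsigned int len)
  {
          unsigned long rem = copy_mc_to_kernel(dst, pmem_src, len);

          return rem ? -EIO : 0;  /* rem bytes were left uncopied */
  }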
diff --git a/arch/x86/lib/copy_mc_64.S b/arch/x86/lib/copy_mc_64.S
new file mode 100644
index 000000000000..c859a8a09860
--- /dev/null
+++ b/arch/x86/lib/copy_mc_64.S
@@ -0,0 +1,149 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright(c) 2016-2020 Intel Corporation. All rights reserved. */
+
+#include <linux/linkage.h>
+#include <asm/asm.h>
+
+#ifndef CONFIG_UML
+
+#ifdef CONFIG_X86_MCE
+
+/*
+ * copy_mc_fragile - copy memory with indication if an exception / fault happened
+ *
+ * The 'fragile' version is opted into by platform quirks and takes
+ * pains to avoid unrecoverable corner cases like 'fast-string'
+ * instruction sequences, and consuming poison across a cacheline
+ * boundary. The non-fragile version is equivalent to memcpy()
+ * regardless of CPU machine-check-recovery capability.
+ */
+SYM_FUNC_START(copy_mc_fragile)
+ cmpl $8, %edx
+ /* Less than 8 bytes? Go to byte copy loop */
+ jb .L_no_whole_words
+
+ /* Check for bad alignment of source */
+ testl $7, %esi
+ /* Already aligned */
+ jz .L_8byte_aligned
+
+ /* Copy one byte at a time until source is 8-byte aligned */
+ movl %esi, %ecx
+ andl $7, %ecx
+ subl $8, %ecx
+ negl %ecx
+ subl %ecx, %edx
+.L_read_leading_bytes:
+ movb (%rsi), %al
+.L_write_leading_bytes:
+ movb %al, (%rdi)
+ incq %rsi
+ incq %rdi
+ decl %ecx
+ jnz .L_read_leading_bytes
+
+.L_8byte_aligned:
+ movl %edx, %ecx
+ andl $7, %edx
+ shrl $3, %ecx
+ jz .L_no_whole_words
+
+.L_read_words:
+ movq (%rsi), %r8
+.L_write_words:
+ movq %r8, (%rdi)
+ addq $8, %rsi
+ addq $8, %rdi
+ decl %ecx
+ jnz .L_read_words
+
+ /* Any trailing bytes? */
+.L_no_whole_words:
+ andl %edx, %edx
+ jz .L_done_memcpy_trap
+
+ /* Copy trailing bytes */
+ movl %edx, %ecx
+.L_read_trailing_bytes:
+ movb (%rsi), %al
+.L_write_trailing_bytes:
+ movb %al, (%rdi)
+ incq %rsi
+ incq %rdi
+ decl %ecx
+ jnz .L_read_trailing_bytes
+
+ /* Copy successful. Return zero */
+.L_done_memcpy_trap:
+ xorl %eax, %eax
+.L_done:
+ RET
+
+ /*
+ * Return number of bytes not copied for any failure. Note that
+ * there is no "tail" handling since the source buffer is 8-byte
+ * aligned and poison is cacheline aligned.
+ */
+.E_read_words:
+ shll $3, %ecx
+.E_leading_bytes:
+ addl %edx, %ecx
+.E_trailing_bytes:
+ mov %ecx, %eax
+ jmp .L_done
+
+ /*
+ * For write fault handling, given the destination is unaligned,
+ * we handle faults on multi-byte writes with a byte-by-byte
+ * copy up to the write-protected page.
+ */
+.E_write_words:
+ shll $3, %ecx
+ addl %edx, %ecx
+ movl %ecx, %edx
+ jmp copy_mc_fragile_handle_tail
+
+ _ASM_EXTABLE_TYPE(.L_read_leading_bytes, .E_leading_bytes, EX_TYPE_DEFAULT_MCE_SAFE)
+ _ASM_EXTABLE_TYPE(.L_read_words, .E_read_words, EX_TYPE_DEFAULT_MCE_SAFE)
+ _ASM_EXTABLE_TYPE(.L_read_trailing_bytes, .E_trailing_bytes, EX_TYPE_DEFAULT_MCE_SAFE)
+ _ASM_EXTABLE(.L_write_leading_bytes, .E_leading_bytes)
+ _ASM_EXTABLE(.L_write_words, .E_write_words)
+ _ASM_EXTABLE(.L_write_trailing_bytes, .E_trailing_bytes)
+
+SYM_FUNC_END(copy_mc_fragile)
+#endif /* CONFIG_X86_MCE */
+
+/*
+ * copy_mc_enhanced_fast_string - memory copy with exception handling
+ *
+ * Fast string copy + fault / exception handling. If the CPU does
+ * support machine check exception recovery, but does not support
+ * recovering from fast-string exceptions then this CPU needs to be
+ * added to the copy_mc_fragile_key set of quirks. Otherwise, absent any
+ * machine check recovery support this version should be no slower than
+ * standard memcpy.
+ */
+SYM_FUNC_START(copy_mc_enhanced_fast_string)
+ movq %rdi, %rax
+ movq %rdx, %rcx
+.L_copy:
+ rep movsb
+ /* Copy successful. Return zero */
+ xorl %eax, %eax
+ RET
+
+.E_copy:
+ /*
+ * On fault %rcx is updated such that the copy instruction could
+ * optionally be restarted at the fault position, i.e. it
+ * contains 'bytes remaining'. A non-zero return indicates error
+ * to copy_mc_generic() users, or indicate short transfers to
+ * user-copy routines.
+ */
+ movq %rcx, %rax
+ RET
+
+ _ASM_EXTABLE_TYPE(.L_copy, .E_copy, EX_TYPE_DEFAULT_MCE_SAFE)
+
+SYM_FUNC_END(copy_mc_enhanced_fast_string)
+#endif /* !CONFIG_UML */
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 727a5d46d2fc..d8e87fedc20d 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -1,119 +1,90 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
+#include <linux/export.h>
#include <linux/linkage.h>
-#include <asm/dwarf2.h>
-
+#include <linux/cfi_types.h>
+#include <asm/cpufeatures.h>
+#include <asm/alternative.h>
+
+/*
+ * Some CPUs run faster using the string copy instructions (sane microcode).
+ * It is also a lot simpler. Use this when possible. But, don't use streaming
+ * copy unless the CPU indicates X86_FEATURE_REP_GOOD. Could vary the
+ * prefetch distance based on SMP/UP.
+ */
ALIGN
-copy_page_c:
- CFI_STARTPROC
- movl $4096/8,%ecx
- rep movsq
- ret
- CFI_ENDPROC
-ENDPROC(copy_page_c)
-
-/* Don't use streaming store because it's better when the target
- ends up in cache. */
-
-/* Could vary the prefetch distance based on SMP/UP */
-
-ENTRY(copy_page)
- CFI_STARTPROC
- subq $3*8,%rsp
- CFI_ADJUST_CFA_OFFSET 3*8
- movq %rbx,(%rsp)
- CFI_REL_OFFSET rbx, 0
- movq %r12,1*8(%rsp)
- CFI_REL_OFFSET r12, 1*8
- movq %r13,2*8(%rsp)
- CFI_REL_OFFSET r13, 2*8
-
- movl $(4096/64)-5,%ecx
+SYM_TYPED_FUNC_START(copy_page)
+ ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD
+ movl $4096/8, %ecx
+ rep movsq
+ RET
+SYM_FUNC_END(copy_page)
+EXPORT_SYMBOL(copy_page)
+
+SYM_FUNC_START_LOCAL(copy_page_regs)
+ subq $2*8, %rsp
+ movq %rbx, (%rsp)
+ movq %r12, 1*8(%rsp)
+
+ movl $(4096/64)-5, %ecx
.p2align 4
.Loop64:
- dec %rcx
-
- movq (%rsi), %rax
- movq 8 (%rsi), %rbx
- movq 16 (%rsi), %rdx
- movq 24 (%rsi), %r8
- movq 32 (%rsi), %r9
- movq 40 (%rsi), %r10
- movq 48 (%rsi), %r11
- movq 56 (%rsi), %r12
+ dec %rcx
+ movq 0x8*0(%rsi), %rax
+ movq 0x8*1(%rsi), %rbx
+ movq 0x8*2(%rsi), %rdx
+ movq 0x8*3(%rsi), %r8
+ movq 0x8*4(%rsi), %r9
+ movq 0x8*5(%rsi), %r10
+ movq 0x8*6(%rsi), %r11
+ movq 0x8*7(%rsi), %r12
prefetcht0 5*64(%rsi)
- movq %rax, (%rdi)
- movq %rbx, 8 (%rdi)
- movq %rdx, 16 (%rdi)
- movq %r8, 24 (%rdi)
- movq %r9, 32 (%rdi)
- movq %r10, 40 (%rdi)
- movq %r11, 48 (%rdi)
- movq %r12, 56 (%rdi)
+ movq %rax, 0x8*0(%rdi)
+ movq %rbx, 0x8*1(%rdi)
+ movq %rdx, 0x8*2(%rdi)
+ movq %r8, 0x8*3(%rdi)
+ movq %r9, 0x8*4(%rdi)
+ movq %r10, 0x8*5(%rdi)
+ movq %r11, 0x8*6(%rdi)
+ movq %r12, 0x8*7(%rdi)
- leaq 64 (%rsi), %rsi
- leaq 64 (%rdi), %rdi
+ leaq 64 (%rsi), %rsi
+ leaq 64 (%rdi), %rdi
- jnz .Loop64
+ jnz .Loop64
- movl $5,%ecx
+ movl $5, %ecx
.p2align 4
.Loop2:
- decl %ecx
-
- movq (%rsi), %rax
- movq 8 (%rsi), %rbx
- movq 16 (%rsi), %rdx
- movq 24 (%rsi), %r8
- movq 32 (%rsi), %r9
- movq 40 (%rsi), %r10
- movq 48 (%rsi), %r11
- movq 56 (%rsi), %r12
-
- movq %rax, (%rdi)
- movq %rbx, 8 (%rdi)
- movq %rdx, 16 (%rdi)
- movq %r8, 24 (%rdi)
- movq %r9, 32 (%rdi)
- movq %r10, 40 (%rdi)
- movq %r11, 48 (%rdi)
- movq %r12, 56 (%rdi)
-
- leaq 64(%rdi),%rdi
- leaq 64(%rsi),%rsi
-
+ decl %ecx
+
+ movq 0x8*0(%rsi), %rax
+ movq 0x8*1(%rsi), %rbx
+ movq 0x8*2(%rsi), %rdx
+ movq 0x8*3(%rsi), %r8
+ movq 0x8*4(%rsi), %r9
+ movq 0x8*5(%rsi), %r10
+ movq 0x8*6(%rsi), %r11
+ movq 0x8*7(%rsi), %r12
+
+ movq %rax, 0x8*0(%rdi)
+ movq %rbx, 0x8*1(%rdi)
+ movq %rdx, 0x8*2(%rdi)
+ movq %r8, 0x8*3(%rdi)
+ movq %r9, 0x8*4(%rdi)
+ movq %r10, 0x8*5(%rdi)
+ movq %r11, 0x8*6(%rdi)
+ movq %r12, 0x8*7(%rdi)
+
+ leaq 64(%rdi), %rdi
+ leaq 64(%rsi), %rsi
jnz .Loop2
- movq (%rsp),%rbx
- CFI_RESTORE rbx
- movq 1*8(%rsp),%r12
- CFI_RESTORE r12
- movq 2*8(%rsp),%r13
- CFI_RESTORE r13
- addq $3*8,%rsp
- CFI_ADJUST_CFA_OFFSET -3*8
- ret
-.Lcopy_page_end:
- CFI_ENDPROC
-ENDPROC(copy_page)
-
- /* Some CPUs run faster using the string copy instructions.
- It is also a lot simpler. Use this when possible */
-
-#include <asm/cpufeature.h>
-
- .section .altinstr_replacement,"ax"
-1: .byte 0xeb /* jmp <disp8> */
- .byte (copy_page_c - copy_page) - (2f - 1b) /* offset */
-2:
- .previous
- .section .altinstructions,"a"
- .align 8
- .quad copy_page
- .quad 1b
- .byte X86_FEATURE_REP_GOOD
- .byte .Lcopy_page_end - copy_page
- .byte 2b - 1b
- .previous
+ movq (%rsp), %rbx
+ movq 1*8(%rsp), %r12
+ addq $2*8, %rsp
+ RET
+SYM_FUNC_END(copy_page_regs)
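
The loop counts in copy_page_regs encode the prefetch policy: 4096/64 - 5 = 59 blocks are copied while prefetcht0 pulls in the cache line five iterations ahead, and the last 5 blocks run in a second loop without the prefetch so nothing past the end of the source page is ever touched. Roughly, in C (a sketch that uses memcpy() and __builtin_prefetch() as stand-ins for the eight-quadword register copy and prefetcht0):

	#include <string.h>

	static void copy_page_regs_model(char *to, const char *from)
	{
		int i;

		/* 59 blocks: prefetch five cache lines (5*64 bytes) ahead of the reads */
		for (i = 0; i < 4096 / 64 - 5; i++) {
			__builtin_prefetch(from + 64 * (i + 5));
			memcpy(to + 64 * i, from + 64 * i, 64);
		}
		/* last 5 blocks: no prefetch */
		for (; i < 4096 / 64; i++)
			memcpy(to + 64 * i, from + 64 * i, 64);
	}
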
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 6ba0f7bb85ea..06296eb69fd4 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -1,269 +1,112 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com>
* Copyright 2002 Andi Kleen, SuSE Labs.
- * Subject to the GNU Public License v2.
*
* Functions to copy from and to user space.
*/
+#include <linux/export.h>
#include <linux/linkage.h>
-#include <asm/dwarf2.h>
-
-#define FIX_ALIGNMENT 1
-
-#include <asm/current.h>
-#include <asm/asm-offsets.h>
-#include <asm/thread_info.h>
-#include <asm/cpufeature.h>
-
- .macro ALTERNATIVE_JUMP feature,orig,alt
-0:
- .byte 0xe9 /* 32bit jump */
- .long \orig-1f /* by default jump to orig */
-1:
- .section .altinstr_replacement,"ax"
-2: .byte 0xe9 /* near jump with 32bit immediate */
- .long \alt-1b /* offset */ /* or alternatively to alt */
- .previous
- .section .altinstructions,"a"
- .align 8
- .quad 0b
- .quad 2b
- .byte \feature /* when feature is set */
- .byte 5
- .byte 5
- .previous
- .endm
-
- .macro ALIGN_DESTINATION
-#ifdef FIX_ALIGNMENT
- /* check for bad alignment of destination */
- movl %edi,%ecx
- andl $7,%ecx
- jz 102f /* already aligned */
- subl $8,%ecx
- negl %ecx
- subl %ecx,%edx
-100: movb (%rsi),%al
-101: movb %al,(%rdi)
- incq %rsi
- incq %rdi
- decl %ecx
- jnz 100b
-102:
- .section .fixup,"ax"
-103: addl %ecx,%edx /* ecx is zerorest also */
- jmp copy_user_handle_tail
- .previous
-
- .section __ex_table,"a"
- .align 8
- .quad 100b,103b
- .quad 101b,103b
- .previous
-#endif
- .endm
-
-/* Standard copy_to_user with segment limit checking */
-ENTRY(copy_to_user)
- CFI_STARTPROC
- GET_THREAD_INFO(%rax)
- movq %rdi,%rcx
- addq %rdx,%rcx
- jc bad_to_user
- cmpq TI_addr_limit(%rax),%rcx
- jae bad_to_user
- ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
- CFI_ENDPROC
-ENDPROC(copy_to_user)
-
-/* Standard copy_from_user with segment limit checking */
-ENTRY(copy_from_user)
- CFI_STARTPROC
- GET_THREAD_INFO(%rax)
- movq %rsi,%rcx
- addq %rdx,%rcx
- jc bad_from_user
- cmpq TI_addr_limit(%rax),%rcx
- jae bad_from_user
- ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
- CFI_ENDPROC
-ENDPROC(copy_from_user)
-
-ENTRY(copy_user_generic)
- CFI_STARTPROC
- ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
- CFI_ENDPROC
-ENDPROC(copy_user_generic)
-
-ENTRY(__copy_from_user_inatomic)
- CFI_STARTPROC
- ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
- CFI_ENDPROC
-ENDPROC(__copy_from_user_inatomic)
-
- .section .fixup,"ax"
- /* must zero dest */
-ENTRY(bad_from_user)
-bad_from_user:
- CFI_STARTPROC
- movl %edx,%ecx
- xorl %eax,%eax
- rep
- stosb
-bad_to_user:
- movl %edx,%eax
- ret
- CFI_ENDPROC
-ENDPROC(bad_from_user)
- .previous
+#include <linux/cfi_types.h>
+#include <linux/objtool.h>
+#include <asm/cpufeatures.h>
+#include <asm/alternative.h>
+#include <asm/asm.h>
/*
- * copy_user_generic_unrolled - memory copy with exception handling.
- * This version is for CPUs like P4 that don't have efficient micro
- * code for rep movsq
+ * rep_movs_alternative - memory copy with exception handling.
+ * This version is for CPUs that don't have FSRM (Fast Short Rep Movs)
*
* Input:
* rdi destination
* rsi source
- * rdx count
+ * rcx count
*
* Output:
- * eax uncopied bytes or 0 if successfull.
- */
-ENTRY(copy_user_generic_unrolled)
- CFI_STARTPROC
- cmpl $8,%edx
- jb 20f /* less then 8 bytes, go to byte copy loop */
- ALIGN_DESTINATION
- movl %edx,%ecx
- andl $63,%edx
- shrl $6,%ecx
- jz 17f
-1: movq (%rsi),%r8
-2: movq 1*8(%rsi),%r9
-3: movq 2*8(%rsi),%r10
-4: movq 3*8(%rsi),%r11
-5: movq %r8,(%rdi)
-6: movq %r9,1*8(%rdi)
-7: movq %r10,2*8(%rdi)
-8: movq %r11,3*8(%rdi)
-9: movq 4*8(%rsi),%r8
-10: movq 5*8(%rsi),%r9
-11: movq 6*8(%rsi),%r10
-12: movq 7*8(%rsi),%r11
-13: movq %r8,4*8(%rdi)
-14: movq %r9,5*8(%rdi)
-15: movq %r10,6*8(%rdi)
-16: movq %r11,7*8(%rdi)
- leaq 64(%rsi),%rsi
- leaq 64(%rdi),%rdi
- decl %ecx
- jnz 1b
-17: movl %edx,%ecx
- andl $7,%edx
- shrl $3,%ecx
- jz 20f
-18: movq (%rsi),%r8
-19: movq %r8,(%rdi)
- leaq 8(%rsi),%rsi
- leaq 8(%rdi),%rdi
- decl %ecx
- jnz 18b
-20: andl %edx,%edx
- jz 23f
- movl %edx,%ecx
-21: movb (%rsi),%al
-22: movb %al,(%rdi)
- incq %rsi
- incq %rdi
- decl %ecx
- jnz 21b
-23: xor %eax,%eax
- ret
-
- .section .fixup,"ax"
-30: shll $6,%ecx
- addl %ecx,%edx
- jmp 60f
-40: lea (%rdx,%rcx,8),%rdx
- jmp 60f
-50: movl %ecx,%edx
-60: jmp copy_user_handle_tail /* ecx is zerorest also */
- .previous
-
- .section __ex_table,"a"
- .align 8
- .quad 1b,30b
- .quad 2b,30b
- .quad 3b,30b
- .quad 4b,30b
- .quad 5b,30b
- .quad 6b,30b
- .quad 7b,30b
- .quad 8b,30b
- .quad 9b,30b
- .quad 10b,30b
- .quad 11b,30b
- .quad 12b,30b
- .quad 13b,30b
- .quad 14b,30b
- .quad 15b,30b
- .quad 16b,30b
- .quad 18b,40b
- .quad 19b,40b
- .quad 21b,50b
- .quad 22b,50b
- .previous
- CFI_ENDPROC
-ENDPROC(copy_user_generic_unrolled)
-
-/* Some CPUs run faster using the string copy instructions.
- * This is also a lot simpler. Use them when possible.
+ * rcx uncopied bytes or 0 if successful.
*
- * Only 4GB of copy is supported. This shouldn't be a problem
- * because the kernel normally only writes from/to page sized chunks
- * even if user space passed a longer buffer.
- * And more would be dangerous because both Intel and AMD have
- * errata with rep movsq > 4GB. If someone feels the need to fix
- * this please consider this.
- *
- * Input:
- * rdi destination
- * rsi source
- * rdx count
- *
- * Output:
- * eax uncopied bytes or 0 if successful.
+ * NOTE! The calling convention is very intentionally the same as
+ * for 'rep movs', so that we can rewrite the function call with
+ * just a plain 'rep movs' on machines that have FSRM. But to make
+ * it simpler for us, we can clobber rsi/rdi and rax freely.
*/
-ENTRY(copy_user_generic_string)
- CFI_STARTPROC
- andl %edx,%edx
- jz 4f
- cmpl $8,%edx
- jb 2f /* less than 8 bytes, go to byte copy loop */
- ALIGN_DESTINATION
- movl %edx,%ecx
- shrl $3,%ecx
- andl $7,%edx
-1: rep
- movsq
-2: movl %edx,%ecx
-3: rep
- movsb
-4: xorl %eax,%eax
- ret
-
- .section .fixup,"ax"
-11: lea (%rdx,%rcx,8),%rcx
-12: movl %ecx,%edx /* ecx is zerorest also */
- jmp copy_user_handle_tail
- .previous
-
- .section __ex_table,"a"
- .align 8
- .quad 1b,11b
- .quad 3b,12b
- .previous
- CFI_ENDPROC
-ENDPROC(copy_user_generic_string)
+SYM_FUNC_START(rep_movs_alternative)
+ ANNOTATE_NOENDBR
+ cmpq $64,%rcx
+ jae .Llarge
+
+ cmp $8,%ecx
+ jae .Lword
+
+ testl %ecx,%ecx
+ je .Lexit
+
+.Lcopy_user_tail:
+0: movb (%rsi),%al
+1: movb %al,(%rdi)
+ inc %rdi
+ inc %rsi
+ dec %rcx
+ jne .Lcopy_user_tail
+.Lexit:
+ RET
+
+ _ASM_EXTABLE_UA( 0b, .Lexit)
+ _ASM_EXTABLE_UA( 1b, .Lexit)
+
+ .p2align 4
+.Lword:
+2: movq (%rsi),%rax
+3: movq %rax,(%rdi)
+ addq $8,%rsi
+ addq $8,%rdi
+ sub $8,%ecx
+ je .Lexit
+ cmp $8,%ecx
+ jae .Lword
+ jmp .Lcopy_user_tail
+
+ _ASM_EXTABLE_UA( 2b, .Lcopy_user_tail)
+ _ASM_EXTABLE_UA( 3b, .Lcopy_user_tail)
+
+.Llarge:
+0: ALTERNATIVE "jmp .Llarge_movsq", "rep movsb", X86_FEATURE_ERMS
+1: RET
+
+ _ASM_EXTABLE_UA( 0b, 1b)
+
+.Llarge_movsq:
+ /* Do the first possibly unaligned word */
+0: movq (%rsi),%rax
+1: movq %rax,(%rdi)
+
+ _ASM_EXTABLE_UA( 0b, .Lcopy_user_tail)
+ _ASM_EXTABLE_UA( 1b, .Lcopy_user_tail)
+
+ /* What would be the offset to the aligned destination? */
+ leaq 8(%rdi),%rax
+ andq $-8,%rax
+ subq %rdi,%rax
+
+ /* .. and update pointers and count to match */
+ addq %rax,%rdi
+ addq %rax,%rsi
+ subq %rax,%rcx
+
+ /* make %rcx contain the number of words, %rax the remainder */
+ movq %rcx,%rax
+ shrq $3,%rcx
+ andl $7,%eax
+0: rep movsq
+ movl %eax,%ecx
+ testl %ecx,%ecx
+ jne .Lcopy_user_tail
+ RET
+
+1: leaq (%rax,%rcx,8),%rcx
+ jmp .Lcopy_user_tail
+
+ _ASM_EXTABLE_UA( 0b, 1b)
+SYM_FUNC_END(rep_movs_alternative)
+EXPORT_SYMBOL(rep_movs_alternative)
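
Taken together, rep_movs_alternative chooses its strategy purely by size: under 8 bytes it byte-copies, 8 to 63 bytes it moves a quadword at a time, and from 64 bytes up it either issues rep movsb directly (ERMS) or does one unaligned quadword, aligns the destination, and lets rep movsq handle the bulk. A rough userspace model of that policy (a sketch only; the real code must additionally report the uncopied byte count when a fault hits):

	#include <stddef.h>
	#include <string.h>

	static void rep_movs_model(unsigned char *d, const unsigned char *s, size_t len)
	{
		if (len >= 64) {		/* .Llarge: rep movsb or aligned rep movsq */
			memcpy(d, s, len);
			return;
		}
		for (; len >= 8; d += 8, s += 8, len -= 8)	/* .Lword */
			memcpy(d, s, 8);
		for (; len; len--)				/* .Lcopy_user_tail */
			*d++ = *s++;
	}
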
diff --git a/arch/x86/lib/copy_user_nocache_64.S b/arch/x86/lib/copy_user_nocache_64.S
deleted file mode 100644
index cb0c112386fb..000000000000
--- a/arch/x86/lib/copy_user_nocache_64.S
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com>
- * Copyright 2002 Andi Kleen, SuSE Labs.
- * Subject to the GNU Public License v2.
- *
- * Functions to copy from and to user space.
- */
-
-#include <linux/linkage.h>
-#include <asm/dwarf2.h>
-
-#define FIX_ALIGNMENT 1
-
-#include <asm/current.h>
-#include <asm/asm-offsets.h>
-#include <asm/thread_info.h>
-
- .macro ALIGN_DESTINATION
-#ifdef FIX_ALIGNMENT
- /* check for bad alignment of destination */
- movl %edi,%ecx
- andl $7,%ecx
- jz 102f /* already aligned */
- subl $8,%ecx
- negl %ecx
- subl %ecx,%edx
-100: movb (%rsi),%al
-101: movb %al,(%rdi)
- incq %rsi
- incq %rdi
- decl %ecx
- jnz 100b
-102:
- .section .fixup,"ax"
-103: addl %ecx,%edx /* ecx is zerorest also */
- jmp copy_user_handle_tail
- .previous
-
- .section __ex_table,"a"
- .align 8
- .quad 100b,103b
- .quad 101b,103b
- .previous
-#endif
- .endm
-
-/*
- * copy_user_nocache - Uncached memory copy with exception handling
- * This will force destination/source out of cache for more performance.
- */
-ENTRY(__copy_user_nocache)
- CFI_STARTPROC
- cmpl $8,%edx
- jb 20f /* less then 8 bytes, go to byte copy loop */
- ALIGN_DESTINATION
- movl %edx,%ecx
- andl $63,%edx
- shrl $6,%ecx
- jz 17f
-1: movq (%rsi),%r8
-2: movq 1*8(%rsi),%r9
-3: movq 2*8(%rsi),%r10
-4: movq 3*8(%rsi),%r11
-5: movnti %r8,(%rdi)
-6: movnti %r9,1*8(%rdi)
-7: movnti %r10,2*8(%rdi)
-8: movnti %r11,3*8(%rdi)
-9: movq 4*8(%rsi),%r8
-10: movq 5*8(%rsi),%r9
-11: movq 6*8(%rsi),%r10
-12: movq 7*8(%rsi),%r11
-13: movnti %r8,4*8(%rdi)
-14: movnti %r9,5*8(%rdi)
-15: movnti %r10,6*8(%rdi)
-16: movnti %r11,7*8(%rdi)
- leaq 64(%rsi),%rsi
- leaq 64(%rdi),%rdi
- decl %ecx
- jnz 1b
-17: movl %edx,%ecx
- andl $7,%edx
- shrl $3,%ecx
- jz 20f
-18: movq (%rsi),%r8
-19: movnti %r8,(%rdi)
- leaq 8(%rsi),%rsi
- leaq 8(%rdi),%rdi
- decl %ecx
- jnz 18b
-20: andl %edx,%edx
- jz 23f
- movl %edx,%ecx
-21: movb (%rsi),%al
-22: movb %al,(%rdi)
- incq %rsi
- incq %rdi
- decl %ecx
- jnz 21b
-23: xorl %eax,%eax
- sfence
- ret
-
- .section .fixup,"ax"
-30: shll $6,%ecx
- addl %ecx,%edx
- jmp 60f
-40: lea (%rdx,%rcx,8),%rdx
- jmp 60f
-50: movl %ecx,%edx
-60: sfence
- jmp copy_user_handle_tail
- .previous
-
- .section __ex_table,"a"
- .quad 1b,30b
- .quad 2b,30b
- .quad 3b,30b
- .quad 4b,30b
- .quad 5b,30b
- .quad 6b,30b
- .quad 7b,30b
- .quad 8b,30b
- .quad 9b,30b
- .quad 10b,30b
- .quad 11b,30b
- .quad 12b,30b
- .quad 13b,30b
- .quad 14b,30b
- .quad 15b,30b
- .quad 16b,30b
- .quad 18b,40b
- .quad 19b,40b
- .quad 21b,50b
- .quad 22b,50b
- .previous
- CFI_ENDPROC
-ENDPROC(__copy_user_nocache)
diff --git a/arch/x86/lib/copy_user_uncached_64.S b/arch/x86/lib/copy_user_uncached_64.S
new file mode 100644
index 000000000000..18350b343c2a
--- /dev/null
+++ b/arch/x86/lib/copy_user_uncached_64.S
@@ -0,0 +1,244 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Linus Torvalds <torvalds@linux-foundation.org>
+ */
+
+#include <linux/export.h>
+#include <linux/linkage.h>
+#include <linux/objtool.h>
+#include <asm/asm.h>
+
+/*
+ * copy_user_nocache - Uncached memory copy with exception handling
+ *
+ * This copies from user space into kernel space, but the kernel
+ * space accesses can take a machine check exception, so they too
+ * need exception handling.
+ *
+ * Note: only 32-bit and 64-bit stores have non-temporal versions,
+ * and we only use aligned versions. Any unaligned parts at the
+ * start or end of the copy will be done using normal cached stores.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * edx count
+ *
+ * Output:
+ * rax uncopied bytes or 0 if successful.
+ */
+SYM_FUNC_START(__copy_user_nocache)
+ ANNOTATE_NOENDBR
+	/* If destination is not 8-byte aligned, we'll have to align it */
+ testb $7,%dil
+ jne .Lalign
+
+.Lis_aligned:
+ cmp $64,%edx
+ jb .Lquadwords
+
+ .p2align 4,0x90
+.Lunrolled:
+10: movq (%rsi),%r8
+11: movq 8(%rsi),%r9
+12: movq 16(%rsi),%r10
+13: movq 24(%rsi),%r11
+20: movnti %r8,(%rdi)
+21: movnti %r9,8(%rdi)
+22: movnti %r10,16(%rdi)
+23: movnti %r11,24(%rdi)
+30: movq 32(%rsi),%r8
+31: movq 40(%rsi),%r9
+32: movq 48(%rsi),%r10
+33: movq 56(%rsi),%r11
+40: movnti %r8,32(%rdi)
+41: movnti %r9,40(%rdi)
+42: movnti %r10,48(%rdi)
+43: movnti %r11,56(%rdi)
+
+ addq $64,%rsi
+ addq $64,%rdi
+ sub $64,%edx
+ cmp $64,%edx
+ jae .Lunrolled
+
+/*
+ * First set of user mode loads have been done
+ * without any stores, so if they fail, we can
+ * just try the non-unrolled loop.
+ */
+_ASM_EXTABLE_UA(10b, .Lquadwords)
+_ASM_EXTABLE_UA(11b, .Lquadwords)
+_ASM_EXTABLE_UA(12b, .Lquadwords)
+_ASM_EXTABLE_UA(13b, .Lquadwords)
+
+/*
+ * The second set of user mode loads have been
+ * done with 32 bytes stored to the destination,
+ * so we need to take that into account before
+ * falling back to the non-unrolled loop.
+ */
+_ASM_EXTABLE_UA(30b, .Lfixup32)
+_ASM_EXTABLE_UA(31b, .Lfixup32)
+_ASM_EXTABLE_UA(32b, .Lfixup32)
+_ASM_EXTABLE_UA(33b, .Lfixup32)
+
+/*
+ * An exception on a write means that we're
+ * done, but we need to update the count
+ * depending on where in the unrolled loop
+ * we were.
+ */
+_ASM_EXTABLE_UA(20b, .Ldone0)
+_ASM_EXTABLE_UA(21b, .Ldone8)
+_ASM_EXTABLE_UA(22b, .Ldone16)
+_ASM_EXTABLE_UA(23b, .Ldone24)
+_ASM_EXTABLE_UA(40b, .Ldone32)
+_ASM_EXTABLE_UA(41b, .Ldone40)
+_ASM_EXTABLE_UA(42b, .Ldone48)
+_ASM_EXTABLE_UA(43b, .Ldone56)
+
+.Lquadwords:
+ cmp $8,%edx
+ jb .Llong
+50: movq (%rsi),%rax
+51: movnti %rax,(%rdi)
+ addq $8,%rsi
+ addq $8,%rdi
+ sub $8,%edx
+ jmp .Lquadwords
+
+/*
+ * If we fail on the last full quadword, we will
+ * not try to do any byte-wise cached accesses.
+ * We will try to do one more 4-byte uncached
+ * one, though.
+ */
+_ASM_EXTABLE_UA(50b, .Llast4)
+_ASM_EXTABLE_UA(51b, .Ldone0)
+
+.Llong:
+ test $4,%dl
+ je .Lword
+60: movl (%rsi),%eax
+61: movnti %eax,(%rdi)
+ addq $4,%rsi
+ addq $4,%rdi
+ sub $4,%edx
+.Lword:
+ sfence
+ test $2,%dl
+ je .Lbyte
+70: movw (%rsi),%ax
+71: movw %ax,(%rdi)
+ addq $2,%rsi
+ addq $2,%rdi
+ sub $2,%edx
+.Lbyte:
+ test $1,%dl
+ je .Ldone
+80: movb (%rsi),%al
+81: movb %al,(%rdi)
+ dec %edx
+.Ldone:
+ mov %edx,%eax
+ RET
+
+/*
+ * If we fail on the last four bytes, we won't
+ * bother with any fixups. It's dead, Jim. Note
+ * that there's no need for 'sfence' for any
+ * of this, since the exception will have been
+ * serializing.
+ */
+_ASM_EXTABLE_UA(60b, .Ldone)
+_ASM_EXTABLE_UA(61b, .Ldone)
+_ASM_EXTABLE_UA(70b, .Ldone)
+_ASM_EXTABLE_UA(71b, .Ldone)
+_ASM_EXTABLE_UA(80b, .Ldone)
+_ASM_EXTABLE_UA(81b, .Ldone)
+
+/*
+ * This is the "head needs aligning" case when
+ * the destination isn't 8-byte aligned. The
+ * 4-byte case can be done uncached, but any
+ * smaller alignment is done with regular stores.
+ */
+.Lalign:
+ test $1,%dil
+ je .Lalign_word
+ test %edx,%edx
+ je .Ldone
+90: movb (%rsi),%al
+91: movb %al,(%rdi)
+ inc %rsi
+ inc %rdi
+ dec %edx
+.Lalign_word:
+ test $2,%dil
+ je .Lalign_long
+ cmp $2,%edx
+ jb .Lbyte
+92: movw (%rsi),%ax
+93: movw %ax,(%rdi)
+ addq $2,%rsi
+ addq $2,%rdi
+ sub $2,%edx
+.Lalign_long:
+ test $4,%dil
+ je .Lis_aligned
+ cmp $4,%edx
+ jb .Lword
+94: movl (%rsi),%eax
+95: movnti %eax,(%rdi)
+ addq $4,%rsi
+ addq $4,%rdi
+ sub $4,%edx
+ jmp .Lis_aligned
+
+/*
+ * If we fail on the initial alignment accesses,
+ * we're all done. Again, no point in trying to
+ * do byte-by-byte probing if the 4-byte load
+ * fails - we're not doing any uncached accesses
+ * any more.
+ */
+_ASM_EXTABLE_UA(90b, .Ldone)
+_ASM_EXTABLE_UA(91b, .Ldone)
+_ASM_EXTABLE_UA(92b, .Ldone)
+_ASM_EXTABLE_UA(93b, .Ldone)
+_ASM_EXTABLE_UA(94b, .Ldone)
+_ASM_EXTABLE_UA(95b, .Ldone)
+
+/*
+ * Exception table fixups for faults in the middle
+ */
+.Ldone56: sub $8,%edx
+.Ldone48: sub $8,%edx
+.Ldone40: sub $8,%edx
+.Ldone32: sub $8,%edx
+.Ldone24: sub $8,%edx
+.Ldone16: sub $8,%edx
+.Ldone8: sub $8,%edx
+.Ldone0:
+ mov %edx,%eax
+ RET
+
+.Lfixup32:
+ addq $32,%rsi
+ addq $32,%rdi
+ sub $32,%edx
+ jmp .Lquadwords
+
+.Llast4:
+52: movl (%rsi),%eax
+53: movnti %eax,(%rdi)
+ sfence
+ sub $4,%edx
+ mov %edx,%eax
+ RET
+_ASM_EXTABLE_UA(52b, .Ldone0)
+_ASM_EXTABLE_UA(53b, .Ldone0)
+
+SYM_FUNC_END(__copy_user_nocache)
+EXPORT_SYMBOL(__copy_user_nocache)
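
The .Ldone56 ... .Ldone0 chain is a fall-through fixup: a fault on the non-temporal store at byte offset N inside the current 64-byte block enters the chain N/8 "sub $8,%edx" instructions above .Ldone0, crediting the N bytes of the block that did reach memory before the uncopied count is returned. As arithmetic (a sketch, with a made-up helper name):

	static unsigned int nocache_store_fixup(unsigned int remaining, unsigned int fault_off)
	{
		/* e.g. a fault on the store at offset 24 returns remaining - 24 */
		return remaining - fault_off;
	}
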
diff --git a/arch/x86/lib/cpu.c b/arch/x86/lib/cpu.c
new file mode 100644
index 000000000000..7ad68917a51e
--- /dev/null
+++ b/arch/x86/lib/cpu.c
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/types.h>
+#include <linux/export.h>
+#include <asm/cpu.h>
+
+unsigned int x86_family(unsigned int sig)
+{
+ unsigned int x86;
+
+ x86 = (sig >> 8) & 0xf;
+
+ if (x86 == 0xf)
+ x86 += (sig >> 20) & 0xff;
+
+ return x86;
+}
+EXPORT_SYMBOL_GPL(x86_family);
+
+unsigned int x86_model(unsigned int sig)
+{
+ unsigned int fam, model;
+
+ fam = x86_family(sig);
+
+ model = (sig >> 4) & 0xf;
+
+ if (fam >= 0x6)
+ model += ((sig >> 16) & 0xf) << 4;
+
+ return model;
+}
+EXPORT_SYMBOL_GPL(x86_model);
+
+unsigned int x86_stepping(unsigned int sig)
+{
+ return sig & 0xf;
+}
+EXPORT_SYMBOL_GPL(x86_stepping);
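
These helpers decode a raw CPUID signature (leaf 1, EAX), folding in the extended family and model fields where the encoding calls for them. A usage sketch with an illustrative signature of 0x00a20f12, where the base family of 0xf pulls in both extended fields (assuming pr_info() is available in this context):

	static void report_sig(unsigned int sig)
	{
		pr_info("family 0x%x model 0x%x stepping 0x%x\n",
			x86_family(sig), x86_model(sig), x86_stepping(sig));
	}

	/* report_sig(0x00a20f12) prints: family 0x19 model 0x21 stepping 0x2 */
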
diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
index f0dba36578ea..d9e16a2cf285 100644
--- a/arch/x86/lib/csum-copy_64.S
+++ b/arch/x86/lib/csum-copy_64.S
@@ -1,249 +1,256 @@
/*
- * Copyright 2002,2003 Andi Kleen, SuSE Labs.
- *
+ * Copyright 2002, 2003 Andi Kleen, SuSE Labs.
+ *
* This file is subject to the terms and conditions of the GNU General Public
* License. See the file COPYING in the main directory of this archive
* for more details. No warranty for anything given at all.
*/
#include <linux/linkage.h>
-#include <asm/dwarf2.h>
#include <asm/errno.h>
+#include <asm/asm.h>
/*
* Checksum copy with exception handling.
- * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the
+ * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the
* destination is zeroed.
- *
+ *
* Input
* rdi source
* rsi destination
* edx len (32bit)
- * ecx sum (32bit)
- * r8 src_err_ptr (int)
- * r9 dst_err_ptr (int)
*
* Output
* eax 64bit sum. undefined in case of exception.
- *
- * Wrappers need to take care of valid exception sum and zeroing.
+ *
+ * Wrappers need to take care of valid exception sum and zeroing.
* They also should align source or destination to 8 bytes.
*/
.macro source
10:
- .section __ex_table,"a"
- .align 8
- .quad 10b,.Lbad_source
- .previous
+ _ASM_EXTABLE_UA(10b, .Lfault)
.endm
-
+
.macro dest
20:
- .section __ex_table,"a"
- .align 8
- .quad 20b,.Lbad_dest
- .previous
+ _ASM_EXTABLE_UA(20b, .Lfault)
.endm
-
- .macro ignore L=.Lignore
-30:
- .section __ex_table,"a"
- .align 8
- .quad 30b,\L
- .previous
- .endm
-
-
-ENTRY(csum_partial_copy_generic)
- CFI_STARTPROC
- cmpl $3*64,%edx
- jle .Lignore
-
-.Lignore:
- subq $7*8,%rsp
- CFI_ADJUST_CFA_OFFSET 7*8
- movq %rbx,2*8(%rsp)
- CFI_REL_OFFSET rbx, 2*8
- movq %r12,3*8(%rsp)
- CFI_REL_OFFSET r12, 3*8
- movq %r14,4*8(%rsp)
- CFI_REL_OFFSET r14, 4*8
- movq %r13,5*8(%rsp)
- CFI_REL_OFFSET r13, 5*8
- movq %rbp,6*8(%rsp)
- CFI_REL_OFFSET rbp, 6*8
-
- movq %r8,(%rsp)
- movq %r9,1*8(%rsp)
-
- movl %ecx,%eax
- movl %edx,%ecx
-
- xorl %r9d,%r9d
- movq %rcx,%r12
-
- shrq $6,%r12
- jz .Lhandle_tail /* < 64 */
+
+SYM_FUNC_START(csum_partial_copy_generic)
+ subq $5*8, %rsp
+ movq %rbx, 0*8(%rsp)
+ movq %r12, 1*8(%rsp)
+ movq %r14, 2*8(%rsp)
+ movq %r13, 3*8(%rsp)
+ movq %r15, 4*8(%rsp)
+
+ movl $-1, %eax
+ xorl %r9d, %r9d
+ movl %edx, %ecx
+ cmpl $8, %ecx
+ jb .Lshort
+
+ testb $7, %sil
+ jne .Lunaligned
+.Laligned:
+ movl %ecx, %r12d
+
+ shrq $6, %r12
+ jz .Lhandle_tail /* < 64 */
clc
-
+
/* main loop. clear in 64 byte blocks */
/* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */
/* r11: temp3, rdx: temp4, r12 loopcnt */
- /* r10: temp5, rbp: temp6, r14 temp7, r13 temp8 */
+ /* r10: temp5, r15: temp6, r14 temp7, r13 temp8 */
.p2align 4
.Lloop:
source
- movq (%rdi),%rbx
+ movq (%rdi), %rbx
source
- movq 8(%rdi),%r8
+ movq 8(%rdi), %r8
source
- movq 16(%rdi),%r11
+ movq 16(%rdi), %r11
source
- movq 24(%rdi),%rdx
+ movq 24(%rdi), %rdx
source
- movq 32(%rdi),%r10
+ movq 32(%rdi), %r10
source
- movq 40(%rdi),%rbp
+ movq 40(%rdi), %r15
source
- movq 48(%rdi),%r14
+ movq 48(%rdi), %r14
source
- movq 56(%rdi),%r13
-
- ignore 2f
+ movq 56(%rdi), %r13
+
+30:
+ /*
+ * No _ASM_EXTABLE_UA; this is used for intentional prefetch on a
+ * potentially unmapped kernel address.
+ */
+ _ASM_EXTABLE(30b, 2f)
prefetcht0 5*64(%rdi)
-2:
- adcq %rbx,%rax
- adcq %r8,%rax
- adcq %r11,%rax
- adcq %rdx,%rax
- adcq %r10,%rax
- adcq %rbp,%rax
- adcq %r14,%rax
- adcq %r13,%rax
+2:
+ adcq %rbx, %rax
+ adcq %r8, %rax
+ adcq %r11, %rax
+ adcq %rdx, %rax
+ adcq %r10, %rax
+ adcq %r15, %rax
+ adcq %r14, %rax
+ adcq %r13, %rax
decl %r12d
-
+
dest
- movq %rbx,(%rsi)
+ movq %rbx, (%rsi)
dest
- movq %r8,8(%rsi)
+ movq %r8, 8(%rsi)
dest
- movq %r11,16(%rsi)
+ movq %r11, 16(%rsi)
dest
- movq %rdx,24(%rsi)
+ movq %rdx, 24(%rsi)
dest
- movq %r10,32(%rsi)
+ movq %r10, 32(%rsi)
dest
- movq %rbp,40(%rsi)
+ movq %r15, 40(%rsi)
dest
- movq %r14,48(%rsi)
+ movq %r14, 48(%rsi)
dest
- movq %r13,56(%rsi)
-
-3:
-
- leaq 64(%rdi),%rdi
- leaq 64(%rsi),%rsi
+ movq %r13, 56(%rsi)
- jnz .Lloop
+ leaq 64(%rdi), %rdi
+ leaq 64(%rsi), %rsi
- adcq %r9,%rax
+ jnz .Lloop
- /* do last upto 56 bytes */
+ adcq %r9, %rax
+
+ /* do last up to 56 bytes */
.Lhandle_tail:
- /* ecx: count */
- movl %ecx,%r10d
- andl $63,%ecx
- shrl $3,%ecx
- jz .Lfold
+ /* ecx: count, rcx.63: the end result needs to be rol8 */
+ movq %rcx, %r10
+ andl $63, %ecx
+ shrl $3, %ecx
+ jz .Lfold
clc
.p2align 4
-.Lloop_8:
+.Lloop_8:
source
- movq (%rdi),%rbx
- adcq %rbx,%rax
+ movq (%rdi), %rbx
+ adcq %rbx, %rax
decl %ecx
dest
- movq %rbx,(%rsi)
- leaq 8(%rsi),%rsi /* preserve carry */
- leaq 8(%rdi),%rdi
+ movq %rbx, (%rsi)
+ leaq 8(%rsi), %rsi /* preserve carry */
+ leaq 8(%rdi), %rdi
jnz .Lloop_8
- adcq %r9,%rax /* add in carry */
+ adcq %r9, %rax /* add in carry */
.Lfold:
/* reduce checksum to 32bits */
- movl %eax,%ebx
- shrq $32,%rax
- addl %ebx,%eax
- adcl %r9d,%eax
+ movl %eax, %ebx
+ shrq $32, %rax
+ addl %ebx, %eax
+ adcl %r9d, %eax
- /* do last upto 6 bytes */
+ /* do last up to 6 bytes */
.Lhandle_7:
- movl %r10d,%ecx
- andl $7,%ecx
- shrl $1,%ecx
+ movl %r10d, %ecx
+ andl $7, %ecx
+.L1: /* .Lshort rejoins the common path here */
+ shrl $1, %ecx
jz .Lhandle_1
- movl $2,%edx
- xorl %ebx,%ebx
- clc
+ movl $2, %edx
+ xorl %ebx, %ebx
+ clc
.p2align 4
-.Lloop_1:
+.Lloop_1:
source
- movw (%rdi),%bx
- adcl %ebx,%eax
+ movw (%rdi), %bx
+ adcl %ebx, %eax
decl %ecx
dest
- movw %bx,(%rsi)
- leaq 2(%rdi),%rdi
- leaq 2(%rsi),%rsi
+ movw %bx, (%rsi)
+ leaq 2(%rdi), %rdi
+ leaq 2(%rsi), %rsi
jnz .Lloop_1
- adcl %r9d,%eax /* add in carry */
-
+ adcl %r9d, %eax /* add in carry */
+
/* handle last odd byte */
.Lhandle_1:
- testl $1,%r10d
+ testb $1, %r10b
jz .Lende
- xorl %ebx,%ebx
+ xorl %ebx, %ebx
source
- movb (%rdi),%bl
+ movb (%rdi), %bl
dest
- movb %bl,(%rsi)
- addl %ebx,%eax
- adcl %r9d,%eax /* carry */
-
- CFI_REMEMBER_STATE
+ movb %bl, (%rsi)
+ addl %ebx, %eax
+ adcl %r9d, %eax /* carry */
+
.Lende:
- movq 2*8(%rsp),%rbx
- CFI_RESTORE rbx
- movq 3*8(%rsp),%r12
- CFI_RESTORE r12
- movq 4*8(%rsp),%r14
- CFI_RESTORE r14
- movq 5*8(%rsp),%r13
- CFI_RESTORE r13
- movq 6*8(%rsp),%rbp
- CFI_RESTORE rbp
- addq $7*8,%rsp
- CFI_ADJUST_CFA_OFFSET -7*8
- ret
- CFI_RESTORE_STATE
-
- /* Exception handlers. Very simple, zeroing is done in the wrappers */
-.Lbad_source:
- movq (%rsp),%rax
- testq %rax,%rax
- jz .Lende
- movl $-EFAULT,(%rax)
- jmp .Lende
-
-.Lbad_dest:
- movq 8(%rsp),%rax
- testq %rax,%rax
- jz .Lende
- movl $-EFAULT,(%rax)
- jmp .Lende
- CFI_ENDPROC
-ENDPROC(csum_partial_copy_generic)
+ testq %r10, %r10
+ js .Lwas_odd
+.Lout:
+ movq 0*8(%rsp), %rbx
+ movq 1*8(%rsp), %r12
+ movq 2*8(%rsp), %r14
+ movq 3*8(%rsp), %r13
+ movq 4*8(%rsp), %r15
+ addq $5*8, %rsp
+ RET
+.Lshort:
+ movl %ecx, %r10d
+ jmp .L1
+.Lunaligned:
+ xorl %ebx, %ebx
+ testb $1, %sil
+ jne .Lodd
+1: testb $2, %sil
+ je 2f
+ source
+ movw (%rdi), %bx
+ dest
+ movw %bx, (%rsi)
+ leaq 2(%rdi), %rdi
+ subq $2, %rcx
+ leaq 2(%rsi), %rsi
+ addq %rbx, %rax
+2: testb $4, %sil
+ je .Laligned
+ source
+ movl (%rdi), %ebx
+ dest
+ movl %ebx, (%rsi)
+ leaq 4(%rdi), %rdi
+ subq $4, %rcx
+ leaq 4(%rsi), %rsi
+ addq %rbx, %rax
+ jmp .Laligned
+
+.Lodd:
+ source
+ movb (%rdi), %bl
+ dest
+ movb %bl, (%rsi)
+ leaq 1(%rdi), %rdi
+ leaq 1(%rsi), %rsi
+ /* decrement, set MSB */
+ leaq -1(%rcx, %rcx), %rcx
+ rorq $1, %rcx
+ shll $8, %ebx
+ addq %rbx, %rax
+ jmp 1b
+
+.Lwas_odd:
+ roll $8, %eax
+ jmp .Lout
+
+ /* Exception: just return 0 */
+.Lfault:
+ xorl %eax, %eax
+ jmp .Lout
+SYM_FUNC_END(csum_partial_copy_generic)
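
The odd-source handling leans on a property of the 16-bit ones'-complement sum: starting one byte off merely byte-swaps the final result, which the roll $8 in .Lwas_odd undoes. The flag for that case is smuggled into the length register: leaq -1(%rcx,%rcx),%rcx followed by rorq $1,%rcx computes 2*len - 1 and rotates it right, leaving len - 1 in the low bits and a 1 in bit 63, so a plain sign test (js .Lwas_odd) recovers it at the end. A small C model of the encoding (a sketch):

	#include <stdint.h>

	static uint64_t mark_odd_len(uint64_t len)
	{
		uint64_t v = 2 * len - 1;	/* leaq -1(%rcx,%rcx), %rcx */

		return (v >> 1) | (v << 63);	/* rorq $1: (len - 1) | 1ULL << 63 */
	}
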
diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index bf51144d97e1..c9dae65ac01b 100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -1,140 +1,116 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* arch/x86_64/lib/csum-partial.c
*
* This file contains network checksum routines that are better done
* in an architecture-specific manner due to speed.
*/
-
+
#include <linux/compiler.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <asm/checksum.h>
+#include <asm/word-at-a-time.h>
+
+static inline __wsum csum_finalize_sum(u64 temp64)
+{
+ return (__force __wsum)((temp64 + ror64(temp64, 32)) >> 32);
+}
-static inline unsigned short from32to16(unsigned a)
+static inline unsigned long update_csum_40b(unsigned long sum, const unsigned long m[5])
{
- unsigned short b = a >> 16;
- asm("addw %w2,%w0\n\t"
- "adcw $0,%w0\n"
- : "=r" (b)
- : "0" (b), "r" (a));
- return b;
+ asm("addq %1,%0\n\t"
+ "adcq %2,%0\n\t"
+ "adcq %3,%0\n\t"
+ "adcq %4,%0\n\t"
+ "adcq %5,%0\n\t"
+ "adcq $0,%0"
+ :"+r" (sum)
+ :"m" (m[0]), "m" (m[1]), "m" (m[2]),
+ "m" (m[3]), "m" (m[4]));
+ return sum;
}
/*
- * Do a 64-bit checksum on an arbitrary memory area.
+ * Do a checksum on an arbitrary memory area.
* Returns a 32bit checksum.
*
* This isn't as time critical as it used to be because many NICs
* do hardware checksumming these days.
- *
- * Things tried and found to not make it faster:
- * Manual Prefetching
- * Unrolling to an 128 bytes inner loop.
- * Using interleaving with more registers to break the carry chains.
+ *
+ * Still, with CHECKSUM_COMPLETE this is called to compute
+ * checksums on IPv6 headers (40 bytes) and other small parts.
+ * It's best to have buff aligned on a 64-bit boundary
*/
-static unsigned do_csum(const unsigned char *buff, unsigned len)
+__wsum csum_partial(const void *buff, int len, __wsum sum)
{
- unsigned odd, count;
- unsigned long result = 0;
+ u64 temp64 = (__force u64)sum;
- if (unlikely(len == 0))
- return result;
- odd = 1 & (unsigned long) buff;
- if (unlikely(odd)) {
- result = *buff << 8;
- len--;
- buff++;
- }
- count = len >> 1; /* nr of 16-bit words.. */
- if (count) {
- if (2 & (unsigned long) buff) {
- result += *(unsigned short *)buff;
- count--;
- len -= 2;
- buff += 2;
- }
- count >>= 1; /* nr of 32-bit words.. */
- if (count) {
- unsigned long zero;
- unsigned count64;
- if (4 & (unsigned long) buff) {
- result += *(unsigned int *) buff;
- count--;
- len -= 4;
- buff += 4;
- }
- count >>= 1; /* nr of 64-bit words.. */
+ /* Do two 40-byte chunks in parallel to get better ILP */
+ if (likely(len >= 80)) {
+ u64 temp64_2 = 0;
+ do {
+ temp64 = update_csum_40b(temp64, buff);
+ temp64_2 = update_csum_40b(temp64_2, buff + 40);
+ buff += 80;
+ len -= 80;
+ } while (len >= 80);
- /* main loop using 64byte blocks */
- zero = 0;
- count64 = count >> 3;
- while (count64) {
- asm("addq 0*8(%[src]),%[res]\n\t"
- "adcq 1*8(%[src]),%[res]\n\t"
- "adcq 2*8(%[src]),%[res]\n\t"
- "adcq 3*8(%[src]),%[res]\n\t"
- "adcq 4*8(%[src]),%[res]\n\t"
- "adcq 5*8(%[src]),%[res]\n\t"
- "adcq 6*8(%[src]),%[res]\n\t"
- "adcq 7*8(%[src]),%[res]\n\t"
- "adcq %[zero],%[res]"
- : [res] "=r" (result)
- : [src] "r" (buff), [zero] "r" (zero),
- "[res]" (result));
- buff += 64;
- count64--;
- }
+ asm("addq %1,%0\n\t"
+ "adcq $0,%0"
+ :"+r" (temp64): "r" (temp64_2));
+ }
- /* last upto 7 8byte blocks */
- count %= 8;
- while (count) {
- asm("addq %1,%0\n\t"
- "adcq %2,%0\n"
- : "=r" (result)
- : "m" (*(unsigned long *)buff),
- "r" (zero), "0" (result));
- --count;
- buff += 8;
- }
- result = add32_with_carry(result>>32,
- result&0xffffffff);
+ /*
+ * len == 40 is the hot case due to IPv6 headers, so return
+ * early for that exact case without checking the tail bytes.
+ */
+ if (len >= 40) {
+ temp64 = update_csum_40b(temp64, buff);
+ len -= 40;
+ if (!len)
+ return csum_finalize_sum(temp64);
+ buff += 40;
+ }
- if (len & 4) {
- result += *(unsigned int *) buff;
- buff += 4;
- }
- }
- if (len & 2) {
- result += *(unsigned short *) buff;
- buff += 2;
- }
+ if (len & 32) {
+ asm("addq 0*8(%[src]),%[res]\n\t"
+ "adcq 1*8(%[src]),%[res]\n\t"
+ "adcq 2*8(%[src]),%[res]\n\t"
+ "adcq 3*8(%[src]),%[res]\n\t"
+ "adcq $0,%[res]"
+ : [res] "+r"(temp64)
+ : [src] "r"(buff), "m"(*(const char(*)[32])buff));
+ buff += 32;
}
- if (len & 1)
- result += *buff;
- result = add32_with_carry(result>>32, result & 0xffffffff);
- if (unlikely(odd)) {
- result = from32to16(result);
- result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
+ if (len & 16) {
+ asm("addq 0*8(%[src]),%[res]\n\t"
+ "adcq 1*8(%[src]),%[res]\n\t"
+ "adcq $0,%[res]"
+ : [res] "+r"(temp64)
+ : [src] "r"(buff), "m"(*(const char(*)[16])buff));
+ buff += 16;
}
- return result;
-}
+ if (len & 8) {
+ asm("addq 0*8(%[src]),%[res]\n\t"
+ "adcq $0,%[res]"
+ : [res] "+r"(temp64)
+ : [src] "r"(buff), "m"(*(const char(*)[8])buff));
+ buff += 8;
+ }
+ if (len & 7) {
+ unsigned int shift = (-len << 3) & 63;
+ unsigned long trail;
-/*
- * computes the checksum of a memory block at buff, length len,
- * and adds in "sum" (32-bit)
- *
- * returns a 32-bit number suitable for feeding into itself
- * or csum_tcpudp_magic
- *
- * this function must be called with even lengths, except
- * for the last fragment, which may be odd
- *
- * it's best to have buff aligned on a 64-bit boundary
- */
-__wsum csum_partial(const void *buff, int len, __wsum sum)
-{
- return (__force __wsum)add32_with_carry(do_csum(buff, len),
- (__force u32)sum);
+ trail = (load_unaligned_zeropad(buff) << shift) >> shift;
+
+ asm("addq %[trail],%[res]\n\t"
+ "adcq $0,%[res]"
+ : [res] "+r"(temp64)
+ : [trail] "r"(trail));
+ }
+ return csum_finalize_sum(temp64);
}
+EXPORT_SYMBOL(csum_partial);
/*
* this routine is used for miscellaneous IP-like checksums, mainly
@@ -142,7 +118,6 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
*/
__sum16 ip_compute_csum(const void *buff, int len)
{
- return csum_fold(csum_partial(buff,len,0));
+ return csum_fold(csum_partial(buff, len, 0));
}
EXPORT_SYMBOL(ip_compute_csum);
-
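
The final len & 7 handling masks a whole 64-bit load down to the valid trailing bytes: shift = (-len << 3) & 63 is the number of bits to discard, so with three bytes left the shift is 40 and the double shift keeps only the low 24 bits of the word returned by load_unaligned_zeropad(). The same arithmetic in standalone form (a sketch, written with unsigned arithmetic to keep it well defined):

	static unsigned long tail_bytes(unsigned long word, unsigned int len)
	{
		unsigned int shift = (0u - len) * 8 % 64;	/* == (-len << 3) & 63 */

		return (word << shift) >> shift;	/* len & 7 == 3: keep low 24 bits */
	}
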
diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c
index 459b58a8a15c..f4df4d241526 100644
--- a/arch/x86/lib/csum-wrappers_64.c
+++ b/arch/x86/lib/csum-wrappers_64.c
@@ -1,134 +1,76 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright 2002, 2003 Andi Kleen, SuSE Labs.
- * Subject to the GNU Public License v.2
*
* Wrappers of assembly checksum functions for x86-64.
*/
#include <asm/checksum.h>
-#include <linux/module.h>
+#include <linux/export.h>
+#include <linux/uaccess.h>
+#include <asm/smap.h>
/**
- * csum_partial_copy_from_user - Copy and checksum from user space.
+ * csum_and_copy_from_user - Copy and checksum from user space.
* @src: source address (user space)
* @dst: destination address
* @len: number of bytes to be copied.
- * @isum: initial sum that is added into the result (32bit unfolded)
- * @errp: set to -EFAULT for an bad source address.
*
  * Returns a 32bit unfolded checksum of the buffer.
* src and dst are best aligned to 64bits.
*/
__wsum
-csum_partial_copy_from_user(const void __user *src, void *dst,
- int len, __wsum isum, int *errp)
+csum_and_copy_from_user(const void __user *src, void *dst, int len)
{
- might_sleep();
- *errp = 0;
-
- if (!likely(access_ok(VERIFY_READ, src, len)))
- goto out_err;
-
- /*
- * Why 6, not 7? To handle odd addresses aligned we
- * would need to do considerable complications to fix the
- * checksum which is defined as an 16bit accumulator. The
- * fix alignment code is primarily for performance
- * compatibility with 32bit and that will handle odd
- * addresses slowly too.
- */
- if (unlikely((unsigned long)src & 6)) {
- while (((unsigned long)src & 6) && len >= 2) {
- __u16 val16;
-
- *errp = __get_user(val16, (const __u16 __user *)src);
- if (*errp)
- return isum;
-
- *(__u16 *)dst = val16;
- isum = (__force __wsum)add32_with_carry(
- (__force unsigned)isum, val16);
- src += 2;
- dst += 2;
- len -= 2;
- }
- }
- isum = csum_partial_copy_generic((__force const void *)src,
- dst, len, isum, errp, NULL);
- if (unlikely(*errp))
- goto out_err;
-
- return isum;
-
-out_err:
- *errp = -EFAULT;
- memset(dst, 0, len);
+ __wsum sum;
- return isum;
+ might_sleep();
+ if (!user_access_begin(src, len))
+ return 0;
+ sum = csum_partial_copy_generic((__force const void *)src, dst, len);
+ user_access_end();
+ return sum;
}
-EXPORT_SYMBOL(csum_partial_copy_from_user);
/**
- * csum_partial_copy_to_user - Copy and checksum to user space.
+ * csum_and_copy_to_user - Copy and checksum to user space.
* @src: source address
* @dst: destination address (user space)
* @len: number of bytes to be copied.
- * @isum: initial sum that is added into the result (32bit unfolded)
- * @errp: set to -EFAULT for an bad destination address.
*
  * Returns a 32bit unfolded checksum of the buffer.
* src and dst are best aligned to 64bits.
*/
__wsum
-csum_partial_copy_to_user(const void *src, void __user *dst,
- int len, __wsum isum, int *errp)
+csum_and_copy_to_user(const void *src, void __user *dst, int len)
{
- might_sleep();
+ __wsum sum;
- if (unlikely(!access_ok(VERIFY_WRITE, dst, len))) {
- *errp = -EFAULT;
+ might_sleep();
+ if (!user_access_begin(dst, len))
return 0;
- }
-
- if (unlikely((unsigned long)dst & 6)) {
- while (((unsigned long)dst & 6) && len >= 2) {
- __u16 val16 = *(__u16 *)src;
-
- isum = (__force __wsum)add32_with_carry(
- (__force unsigned)isum, val16);
- *errp = __put_user(val16, (__u16 __user *)dst);
- if (*errp)
- return isum;
- src += 2;
- dst += 2;
- len -= 2;
- }
- }
-
- *errp = 0;
- return csum_partial_copy_generic(src, (void __force *)dst,
- len, isum, NULL, errp);
+ sum = csum_partial_copy_generic(src, (void __force *)dst, len);
+ user_access_end();
+ return sum;
}
-EXPORT_SYMBOL(csum_partial_copy_to_user);
/**
* csum_partial_copy_nocheck - Copy and checksum.
* @src: source address
* @dst: destination address
* @len: number of bytes to be copied.
- * @isum: initial sum that is added into the result (32bit unfolded)
*
  * Returns a 32bit unfolded checksum of the buffer.
*/
__wsum
-csum_partial_copy_nocheck(const void *src, void *dst, int len, __wsum sum)
+csum_partial_copy_nocheck(const void *src, void *dst, int len)
{
- return csum_partial_copy_generic(src, dst, len, sum, NULL, NULL);
+ return csum_partial_copy_generic(src, dst, len);
}
EXPORT_SYMBOL(csum_partial_copy_nocheck);
__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
const struct in6_addr *daddr,
- __u32 len, unsigned short proto, __wsum sum)
+ __u32 len, __u8 proto, __wsum sum)
{
__u64 rest, sum64;
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index ff485d361182..eb2d2e1cbddd 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Precise Delay Loops for i386
*
@@ -11,24 +12,35 @@
* we have to worry about.
*/
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/sched.h>
#include <linux/timex.h>
#include <linux/preempt.h>
#include <linux/delay.h>
-#include <linux/init.h>
#include <asm/processor.h>
#include <asm/delay.h>
#include <asm/timer.h>
+#include <asm/mwait.h>
#ifdef CONFIG_SMP
# include <asm/smp.h>
#endif
+static void delay_loop(u64 __loops);
+
+/*
+ * Calibration and selection of the delay mechanism happens only once
+ * during boot.
+ */
+static void (*delay_fn)(u64) __ro_after_init = delay_loop;
+static void (*delay_halt_fn)(u64 start, u64 cycles) __ro_after_init;
+
/* simple loop based delay: */
-static void delay_loop(unsigned long loops)
+static void delay_loop(u64 __loops)
{
+ unsigned long loops = (unsigned long)__loops;
+
asm volatile(
" test %0,%0 \n"
" jz 3f \n"
@@ -42,30 +54,28 @@ static void delay_loop(unsigned long loops)
" jnz 2b \n"
"3: dec %0 \n"
- : /* we don't need output */
- :"a" (loops)
+ : "+a" (loops)
+ :
);
}
/* TSC based delay: */
-static void delay_tsc(unsigned long loops)
+static void delay_tsc(u64 cycles)
{
- unsigned long bclock, now;
+ u64 bclock, now;
int cpu;
preempt_disable();
cpu = smp_processor_id();
- rdtsc_barrier();
- rdtscl(bclock);
+ bclock = rdtsc_ordered();
for (;;) {
- rdtsc_barrier();
- rdtscl(now);
- if ((now - bclock) >= loops)
+ now = rdtsc_ordered();
+ if ((now - bclock) >= cycles)
break;
/* Allow RT tasks to run */
preempt_enable();
- rep_nop();
+ native_pause();
preempt_disable();
/*
@@ -78,30 +88,111 @@ static void delay_tsc(unsigned long loops)
* counter for this CPU.
*/
if (unlikely(cpu != smp_processor_id())) {
- loops -= (now - bclock);
+ cycles -= (now - bclock);
cpu = smp_processor_id();
- rdtsc_barrier();
- rdtscl(bclock);
+ bclock = rdtsc_ordered();
}
}
preempt_enable();
}
/*
- * Since we calibrate only once at boot, this
- * function should be set once at boot and not changed
+ * On Intel the TPAUSE instruction waits until any of:
+ * 1) the TSC counter exceeds the value provided in EDX:EAX
+ * 2) global timeout in IA32_UMWAIT_CONTROL is exceeded
+ * 3) an external interrupt occurs
+ */
+static void delay_halt_tpause(u64 start, u64 cycles)
+{
+ u64 until = start + cycles;
+ u32 eax, edx;
+
+ eax = lower_32_bits(until);
+ edx = upper_32_bits(until);
+
+ /*
+ * Hard code the deeper (C0.2) sleep state because exit latency is
+ * small compared to the "microseconds" that usleep() will delay.
+ */
+ __tpause(TPAUSE_C02_STATE, edx, eax);
+}
+
+/*
+ * On some AMD platforms, MWAITX has a configurable 32-bit timer, that
+ * counts with TSC frequency. The input value is the number of TSC cycles
+ * to wait. MWAITX will also exit when the timer expires.
+ */
+static void delay_halt_mwaitx(u64 unused, u64 cycles)
+{
+ u64 delay;
+
+ delay = min_t(u64, MWAITX_MAX_WAIT_CYCLES, cycles);
+ /*
+ * Use cpu_tss_rw as a cacheline-aligned, seldom accessed per-cpu
+ * variable as the monitor target.
+ */
+ __monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0);
+
+ /*
+ * AMD, like Intel, supports the EAX hint and EAX=0xf means, do not
+ * enter any deep C-state and we use it here in delay() to minimize
+ * wakeup latency.
+ */
+ __mwaitx(MWAITX_DISABLE_CSTATES, delay, MWAITX_ECX_TIMER_ENABLE);
+}
+
+/*
+ * Call a vendor specific function to delay for a given amount of time. Because
+ * these functions may return earlier than requested, check for actual elapsed
+ * time and call again until done.
*/
-static void (*delay_fn)(unsigned long) = delay_loop;
+static void delay_halt(u64 __cycles)
+{
+ u64 start, end, cycles = __cycles;
+
+ /*
+ * Timer value of 0 causes MWAITX to wait indefinitely, unless there
+ * is a store on the memory monitored by MONITORX.
+ */
+ if (!cycles)
+ return;
+
+ start = rdtsc_ordered();
+
+ for (;;) {
+ delay_halt_fn(start, cycles);
+ end = rdtsc_ordered();
+
+ if (cycles <= end - start)
+ break;
+
+ cycles -= end - start;
+ start = end;
+ }
+}
+
+void __init use_tsc_delay(void)
+{
+ if (delay_fn == delay_loop)
+ delay_fn = delay_tsc;
+}
+
+void __init use_tpause_delay(void)
+{
+ delay_halt_fn = delay_halt_tpause;
+ delay_fn = delay_halt;
+}
-void use_tsc_delay(void)
+void use_mwaitx_delay(void)
{
- delay_fn = delay_tsc;
+ delay_halt_fn = delay_halt_mwaitx;
+ delay_fn = delay_halt;
}
-int __devinit read_current_timer(unsigned long *timer_val)
+int read_current_timer(unsigned long *timer_val)
{
if (delay_fn == delay_tsc) {
- rdtscll(*timer_val);
+ *timer_val = rdtsc();
return 0;
}
return -1;
@@ -113,15 +204,15 @@ void __delay(unsigned long loops)
}
EXPORT_SYMBOL(__delay);
-inline void __const_udelay(unsigned long xloops)
+noinline void __const_udelay(unsigned long xloops)
{
+ unsigned long lpj = this_cpu_read(cpu_info.loops_per_jiffy) ? : loops_per_jiffy;
int d0;
xloops *= 4;
asm("mull %%edx"
:"=d" (xloops), "=&a" (d0)
- :"1" (xloops), "0"
- (cpu_data(raw_smp_processor_id()).loops_per_jiffy * (HZ/4)));
+ :"1" (xloops), "0" (lpj * (HZ / 4)));
__delay(++xloops);
}
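
The mull in __const_udelay() is a 32x32->64 fixed-point multiply: xloops arrives as a fraction of a second scaled by 2^32 (udelay() pre-multiplies its microsecond argument by roughly 2^32/10^6), so multiplying by the delay-loop rate in iterations per second and keeping the high 32 bits of the product gives the loop count; the *4 on one operand and HZ/4 on the other simply keep lpj * HZ within 32 bits. Approximately, in C (a sketch of the scaling only; the real code also rounds up by one loop):

	#include <stdint.h>

	static uint32_t const_udelay_loops(uint32_t xloops, uint32_t lpj, uint32_t hz)
	{
		/* high 32 bits of (xloops * 4) * (lpj * hz/4), i.e. roughly (xloops * lpj * hz) >> 32 */
		return ((uint64_t)(xloops * 4) * (lpj * (hz / 4))) >> 32;
	}
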
diff --git a/arch/x86/lib/error-inject.c b/arch/x86/lib/error-inject.c
new file mode 100644
index 000000000000..512a2538596f
--- /dev/null
+++ b/arch/x86/lib/error-inject.c
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/linkage.h>
+#include <linux/error-injection.h>
+#include <linux/kprobes.h>
+#include <linux/objtool.h>
+
+asmlinkage void just_return_func(void);
+
+asm(
+ ".text\n"
+ ".type just_return_func, @function\n"
+ ".globl just_return_func\n"
+ ASM_FUNC_ALIGN
+ "just_return_func:\n"
+ ANNOTATE_NOENDBR "\n"
+ ASM_RET
+ ".size just_return_func, .-just_return_func\n"
+);
+
+void override_function_with_return(struct pt_regs *regs)
+{
+ regs->ip = (unsigned long)&just_return_func;
+}
+NOKPROBE_SYMBOL(override_function_with_return);
diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
index 51f1504cddd9..9d5654b8a72a 100644
--- a/arch/x86/lib/getuser.S
+++ b/arch/x86/lib/getuser.S
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* __get_user functions.
*
@@ -15,90 +16,157 @@
* __get_user_X
*
* Inputs: %[r|e]ax contains the address.
- * The register is modified, but all changes are undone
- * before returning because the C code doesn't know about it.
*
* Outputs: %[r|e]ax is error code (0 or -EFAULT)
* %[r|e]dx contains zero-extended value
+ * %ecx contains the high half for 32-bit __get_user_8
*
*
* These functions should not modify any other registers,
* as they get called from within inline assembly.
*/
+#include <linux/export.h>
#include <linux/linkage.h>
-#include <asm/dwarf2.h>
+#include <linux/objtool.h>
#include <asm/page_types.h>
#include <asm/errno.h>
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/asm.h>
+#include <asm/smap.h>
+#include <asm/runtime-const.h>
+
+#define ASM_BARRIER_NOSPEC ALTERNATIVE "", "lfence", X86_FEATURE_LFENCE_RDTSC
+
+.macro check_range size:req
+.if IS_ENABLED(CONFIG_X86_64)
+ RUNTIME_CONST_PTR USER_PTR_MAX, rdx
+ cmp %rdx, %rax
+ cmova %rdx, %rax
+.else
+ cmp $TASK_SIZE_MAX-\size+1, %eax
+ jae .Lbad_get_user
+ sbb %edx, %edx /* array_index_mask_nospec() */
+ and %edx, %eax
+.endif
+.endm
+
+.macro UACCESS op src dst
+1: \op \src,\dst
+ _ASM_EXTABLE_UA(1b, __get_user_handle_exception)
+.endm
+
.text
-ENTRY(__get_user_1)
- CFI_STARTPROC
- GET_THREAD_INFO(%_ASM_DX)
- cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
- jae bad_get_user
-1: movzb (%_ASM_AX),%edx
+SYM_FUNC_START(__get_user_1)
+ ANNOTATE_NOENDBR
+ check_range size=1
+ ASM_STAC
+ UACCESS movzbl (%_ASM_AX),%edx
xor %eax,%eax
- ret
- CFI_ENDPROC
-ENDPROC(__get_user_1)
-
-ENTRY(__get_user_2)
- CFI_STARTPROC
- add $1,%_ASM_AX
- jc bad_get_user
- GET_THREAD_INFO(%_ASM_DX)
- cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
- jae bad_get_user
-2: movzwl -1(%_ASM_AX),%edx
+ ASM_CLAC
+ RET
+SYM_FUNC_END(__get_user_1)
+EXPORT_SYMBOL(__get_user_1)
+
+SYM_FUNC_START(__get_user_2)
+ ANNOTATE_NOENDBR
+ check_range size=2
+ ASM_STAC
+ UACCESS movzwl (%_ASM_AX),%edx
xor %eax,%eax
- ret
- CFI_ENDPROC
-ENDPROC(__get_user_2)
-
-ENTRY(__get_user_4)
- CFI_STARTPROC
- add $3,%_ASM_AX
- jc bad_get_user
- GET_THREAD_INFO(%_ASM_DX)
- cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
- jae bad_get_user
-3: mov -3(%_ASM_AX),%edx
+ ASM_CLAC
+ RET
+SYM_FUNC_END(__get_user_2)
+EXPORT_SYMBOL(__get_user_2)
+
+SYM_FUNC_START(__get_user_4)
+ ANNOTATE_NOENDBR
+ check_range size=4
+ ASM_STAC
+ UACCESS movl (%_ASM_AX),%edx
xor %eax,%eax
- ret
- CFI_ENDPROC
-ENDPROC(__get_user_4)
+ ASM_CLAC
+ RET
+SYM_FUNC_END(__get_user_4)
+EXPORT_SYMBOL(__get_user_4)
+SYM_FUNC_START(__get_user_8)
+ ANNOTATE_NOENDBR
+#ifndef CONFIG_X86_64
+ xor %ecx,%ecx
+#endif
+ check_range size=8
+ ASM_STAC
#ifdef CONFIG_X86_64
-ENTRY(__get_user_8)
- CFI_STARTPROC
- add $7,%_ASM_AX
- jc bad_get_user
- GET_THREAD_INFO(%_ASM_DX)
- cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
- jae bad_get_user
-4: movq -7(%_ASM_AX),%_ASM_DX
+ UACCESS movq (%_ASM_AX),%rdx
+#else
+ UACCESS movl (%_ASM_AX),%edx
+ UACCESS movl 4(%_ASM_AX),%ecx
+#endif
+ xor %eax,%eax
+ ASM_CLAC
+ RET
+SYM_FUNC_END(__get_user_8)
+EXPORT_SYMBOL(__get_user_8)
+
+/* .. and the same for __get_user, just without the range checks */
+SYM_FUNC_START(__get_user_nocheck_1)
+ ANNOTATE_NOENDBR
+ ASM_STAC
+ ASM_BARRIER_NOSPEC
+ UACCESS movzbl (%_ASM_AX),%edx
xor %eax,%eax
- ret
- CFI_ENDPROC
-ENDPROC(__get_user_8)
+ ASM_CLAC
+ RET
+SYM_FUNC_END(__get_user_nocheck_1)
+EXPORT_SYMBOL(__get_user_nocheck_1)
+
+SYM_FUNC_START(__get_user_nocheck_2)
+ ANNOTATE_NOENDBR
+ ASM_STAC
+ ASM_BARRIER_NOSPEC
+ UACCESS movzwl (%_ASM_AX),%edx
+ xor %eax,%eax
+ ASM_CLAC
+ RET
+SYM_FUNC_END(__get_user_nocheck_2)
+EXPORT_SYMBOL(__get_user_nocheck_2)
+
+SYM_FUNC_START(__get_user_nocheck_4)
+ ANNOTATE_NOENDBR
+ ASM_STAC
+ ASM_BARRIER_NOSPEC
+ UACCESS movl (%_ASM_AX),%edx
+ xor %eax,%eax
+ ASM_CLAC
+ RET
+SYM_FUNC_END(__get_user_nocheck_4)
+EXPORT_SYMBOL(__get_user_nocheck_4)
+
+SYM_FUNC_START(__get_user_nocheck_8)
+ ANNOTATE_NOENDBR
+ ASM_STAC
+ ASM_BARRIER_NOSPEC
+#ifdef CONFIG_X86_64
+ UACCESS movq (%_ASM_AX),%rdx
+#else
+ xor %ecx,%ecx
+ UACCESS movl (%_ASM_AX),%edx
+ UACCESS movl 4(%_ASM_AX),%ecx
#endif
+ xor %eax,%eax
+ ASM_CLAC
+ RET
+SYM_FUNC_END(__get_user_nocheck_8)
+EXPORT_SYMBOL(__get_user_nocheck_8)
+
-bad_get_user:
- CFI_STARTPROC
+SYM_CODE_START_LOCAL(__get_user_handle_exception)
+ ASM_CLAC
+.Lbad_get_user:
xor %edx,%edx
mov $(-EFAULT),%_ASM_AX
- ret
- CFI_ENDPROC
-END(bad_get_user)
-
-.section __ex_table,"a"
- _ASM_PTR 1b,bad_get_user
- _ASM_PTR 2b,bad_get_user
- _ASM_PTR 3b,bad_get_user
-#ifdef CONFIG_X86_64
- _ASM_PTR 4b,bad_get_user
-#endif
+ RET
+SYM_CODE_END(__get_user_handle_exception)
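
check_range is the speculation-safe address check: on 64-bit, the cmp/cmova pair clamps any too-high pointer to USER_PTR_MAX without taking a branch, so a kernel address can never be dereferenced even along a mispredicted path; on 32-bit, the cmp/sbb/and sequence is the array_index_mask_nospec() pattern, which forces an out-of-range pointer to NULL. The 32-bit half expressed in C (a sketch):

	static inline unsigned long mask_user_addr(unsigned long addr, unsigned long limit)
	{
		/* cmp leaves CF set only when addr < limit; sbb turns that into a mask */
		unsigned long mask = (addr < limit) ? ~0UL : 0UL;

		return addr & mask;	/* and %edx,%eax */
	}
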
diff --git a/arch/x86/lib/hweight.S b/arch/x86/lib/hweight.S
new file mode 100644
index 000000000000..edbeb3ecad38
--- /dev/null
+++ b/arch/x86/lib/hweight.S
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/export.h>
+#include <linux/linkage.h>
+#include <linux/objtool.h>
+
+#include <asm/asm.h>
+
+/*
+ * unsigned int __sw_hweight32(unsigned int w)
+ * %rdi: w
+ */
+SYM_FUNC_START(__sw_hweight32)
+ ANNOTATE_NOENDBR
+
+#ifdef CONFIG_X86_64
+ movl %edi, %eax # w
+#endif
+ __ASM_SIZE(push,) %__ASM_REG(dx)
+ movl %eax, %edx # w -> t
+ shrl %edx # t >>= 1
+ andl $0x55555555, %edx # t &= 0x55555555
+ subl %edx, %eax # w -= t
+
+ movl %eax, %edx # w -> t
+ shrl $2, %eax # w_tmp >>= 2
+ andl $0x33333333, %edx # t &= 0x33333333
+ andl $0x33333333, %eax # w_tmp &= 0x33333333
+ addl %edx, %eax # w = w_tmp + t
+
+ movl %eax, %edx # w -> t
+ shrl $4, %edx # t >>= 4
+ addl %edx, %eax # w_tmp += t
+ andl $0x0f0f0f0f, %eax # w_tmp &= 0x0f0f0f0f
+ imull $0x01010101, %eax, %eax # w_tmp *= 0x01010101
+ shrl $24, %eax # w = w_tmp >> 24
+ __ASM_SIZE(pop,) %__ASM_REG(dx)
+ RET
+SYM_FUNC_END(__sw_hweight32)
+EXPORT_SYMBOL(__sw_hweight32)
+
+/*
+ * No 32-bit variant, because it's implemented as an inline wrapper
+ * on top of __arch_hweight32():
+ */
+#ifdef CONFIG_X86_64
+SYM_FUNC_START(__sw_hweight64)
+ ANNOTATE_NOENDBR
+ pushq %rdi
+ pushq %rdx
+
+ movq %rdi, %rdx # w -> t
+ movabsq $0x5555555555555555, %rax
+ shrq %rdx # t >>= 1
+ andq %rdx, %rax # t &= 0x5555555555555555
+ movabsq $0x3333333333333333, %rdx
+ subq %rax, %rdi # w -= t
+
+ movq %rdi, %rax # w -> t
+ shrq $2, %rdi # w_tmp >>= 2
+ andq %rdx, %rax # t &= 0x3333333333333333
+ andq %rdi, %rdx # w_tmp &= 0x3333333333333333
+ addq %rdx, %rax # w = w_tmp + t
+
+ movq %rax, %rdx # w -> t
+ shrq $4, %rdx # t >>= 4
+ addq %rdx, %rax # w_tmp += t
+ movabsq $0x0f0f0f0f0f0f0f0f, %rdx
+ andq %rdx, %rax # w_tmp &= 0x0f0f0f0f0f0f0f0f
+ movabsq $0x0101010101010101, %rdx
+ imulq %rdx, %rax # w_tmp *= 0x0101010101010101
+ shrq $56, %rax # w = w_tmp >> 56
+
+ popq %rdx
+ popq %rdi
+ RET
+SYM_FUNC_END(__sw_hweight64)
+EXPORT_SYMBOL(__sw_hweight64)
+#endif
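
These are the classic SWAR population-count routines, used as the software fallback when the popcnt instruction is not available. The 32-bit version corresponds step for step to the following C (a restatement of the same algorithm, not kernel code):

	static unsigned int sw_hweight32_model(unsigned int w)
	{
		unsigned int t;

		t = w - ((w >> 1) & 0x55555555);		/* per-pair bit counts */
		t = (t & 0x33333333) + ((t >> 2) & 0x33333333);	/* per-nibble counts */
		t = (t + (t >> 4)) & 0x0f0f0f0f;		/* per-byte counts */
		return (t * 0x01010101) >> 24;			/* sum all the bytes */
	}
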
diff --git a/arch/x86/lib/inat.c b/arch/x86/lib/inat.c
new file mode 100644
index 000000000000..a5cafd402cfd
--- /dev/null
+++ b/arch/x86/lib/inat.c
@@ -0,0 +1,96 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * x86 instruction attribute tables
+ *
+ * Written by Masami Hiramatsu <mhiramat@redhat.com>
+ */
+#include <asm/insn.h> /* __ignore_sync_check__ */
+
+/* Attribute tables are generated from opcode map */
+#include "inat-tables.c"
+
+/* Attribute search APIs */
+insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode)
+{
+ return inat_primary_table[opcode];
+}
+
+int inat_get_last_prefix_id(insn_byte_t last_pfx)
+{
+ insn_attr_t lpfx_attr;
+
+ lpfx_attr = inat_get_opcode_attribute(last_pfx);
+ return inat_last_prefix_id(lpfx_attr);
+}
+
+insn_attr_t inat_get_escape_attribute(insn_byte_t opcode, int lpfx_id,
+ insn_attr_t esc_attr)
+{
+ const insn_attr_t *table;
+ int n;
+
+ n = inat_escape_id(esc_attr);
+
+ table = inat_escape_tables[n][0];
+ if (!table)
+ return 0;
+ if (inat_has_variant(table[opcode]) && lpfx_id) {
+ table = inat_escape_tables[n][lpfx_id];
+ if (!table)
+ return 0;
+ }
+ return table[opcode];
+}
+
+insn_attr_t inat_get_group_attribute(insn_byte_t modrm, int lpfx_id,
+ insn_attr_t grp_attr)
+{
+ const insn_attr_t *table;
+ int n;
+
+ n = inat_group_id(grp_attr);
+
+ table = inat_group_tables[n][0];
+ if (!table)
+ return inat_group_common_attribute(grp_attr);
+ if (inat_has_variant(table[X86_MODRM_REG(modrm)]) && lpfx_id) {
+ table = inat_group_tables[n][lpfx_id];
+ if (!table)
+ return inat_group_common_attribute(grp_attr);
+ }
+ return table[X86_MODRM_REG(modrm)] |
+ inat_group_common_attribute(grp_attr);
+}
+
+insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m,
+ insn_byte_t vex_p)
+{
+ const insn_attr_t *table;
+ if (vex_m > X86_VEX_M_MAX || vex_p > INAT_LSTPFX_MAX)
+ return 0;
+ /* At first, this checks the master table */
+ table = inat_avx_tables[vex_m][0];
+ if (!table)
+ return 0;
+ if (!inat_is_group(table[opcode]) && vex_p) {
+ /* If this is not a group, get attribute directly */
+ table = inat_avx_tables[vex_m][vex_p];
+ if (!table)
+ return 0;
+ }
+ return table[opcode];
+}
+
+insn_attr_t inat_get_xop_attribute(insn_byte_t opcode, insn_byte_t map_select)
+{
+ const insn_attr_t *table;
+
+ if (map_select < X86_XOP_M_MIN || map_select > X86_XOP_M_MAX)
+ return 0;
+ map_select -= X86_XOP_M_MIN;
+ /* At first, this checks the master table */
+ table = inat_xop_tables[map_select];
+ if (!table)
+ return 0;
+ return table[opcode];
+}
diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c
new file mode 100644
index 000000000000..e03eeec55cfe
--- /dev/null
+++ b/arch/x86/lib/insn-eval.c
@@ -0,0 +1,1821 @@
+/*
+ * Utility functions for x86 operand and address decoding
+ *
+ * Copyright (C) Intel Corporation 2017
+ */
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/ratelimit.h>
+#include <linux/mmu_context.h>
+#include <asm/desc_defs.h>
+#include <asm/desc.h>
+#include <asm/inat.h>
+#include <asm/insn.h>
+#include <asm/insn-eval.h>
+#include <asm/ldt.h>
+#include <asm/msr.h>
+#include <asm/vm86.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt) "insn: " fmt
+
+enum reg_type {
+ REG_TYPE_RM = 0,
+ REG_TYPE_REG,
+ REG_TYPE_INDEX,
+ REG_TYPE_BASE,
+};
+
+/**
+ * is_string_insn() - Determine if instruction is a string instruction
+ * @insn: Instruction containing the opcode to inspect
+ *
+ * Returns:
+ *
+ * true if the instruction, determined by the opcode, is any of the
+ * string instructions as defined in the Intel Software Development manual.
+ * False otherwise.
+ */
+static bool is_string_insn(struct insn *insn)
+{
+ /* All string instructions have a 1-byte opcode. */
+ if (insn->opcode.nbytes != 1)
+ return false;
+
+ switch (insn->opcode.bytes[0]) {
+ case 0x6c ... 0x6f: /* INS, OUTS */
+ case 0xa4 ... 0xa7: /* MOVS, CMPS */
+ case 0xaa ... 0xaf: /* STOS, LODS, SCAS */
+ return true;
+ default:
+ return false;
+ }
+}
+
+/**
+ * insn_has_rep_prefix() - Determine if instruction has a REP prefix
+ * @insn: Instruction containing the prefix to inspect
+ *
+ * Returns:
+ *
+ * true if the instruction has a REP prefix, false if not.
+ */
+bool insn_has_rep_prefix(struct insn *insn)
+{
+ insn_byte_t p;
+
+ insn_get_prefixes(insn);
+
+ for_each_insn_prefix(insn, p) {
+ if (p == 0xf2 || p == 0xf3)
+ return true;
+ }
+
+ return false;
+}
+
+/**
+ * get_seg_reg_override_idx() - obtain segment register override index
+ * @insn: Valid instruction with segment override prefixes
+ *
+ * Inspect the instruction prefixes in @insn and find segment overrides, if any.
+ *
+ * Returns:
+ *
+ * A constant identifying the segment register to use, among CS, SS, DS,
+ * ES, FS, or GS. INAT_SEG_REG_DEFAULT is returned if no segment override
+ * prefixes were found.
+ *
+ * -EINVAL in case of error.
+ */
+static int get_seg_reg_override_idx(struct insn *insn)
+{
+ int idx = INAT_SEG_REG_DEFAULT;
+ int num_overrides = 0;
+ insn_byte_t p;
+
+ insn_get_prefixes(insn);
+
+ /* Look for any segment override prefixes. */
+ for_each_insn_prefix(insn, p) {
+ insn_attr_t attr;
+
+ attr = inat_get_opcode_attribute(p);
+ switch (attr) {
+ case INAT_MAKE_PREFIX(INAT_PFX_CS):
+ idx = INAT_SEG_REG_CS;
+ num_overrides++;
+ break;
+ case INAT_MAKE_PREFIX(INAT_PFX_SS):
+ idx = INAT_SEG_REG_SS;
+ num_overrides++;
+ break;
+ case INAT_MAKE_PREFIX(INAT_PFX_DS):
+ idx = INAT_SEG_REG_DS;
+ num_overrides++;
+ break;
+ case INAT_MAKE_PREFIX(INAT_PFX_ES):
+ idx = INAT_SEG_REG_ES;
+ num_overrides++;
+ break;
+ case INAT_MAKE_PREFIX(INAT_PFX_FS):
+ idx = INAT_SEG_REG_FS;
+ num_overrides++;
+ break;
+ case INAT_MAKE_PREFIX(INAT_PFX_GS):
+ idx = INAT_SEG_REG_GS;
+ num_overrides++;
+ break;
+ /* No default action needed. */
+ }
+ }
+
+ /* More than one segment override prefix leads to undefined behavior. */
+ if (num_overrides > 1)
+ return -EINVAL;
+
+ return idx;
+}
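As a rough, standalone illustration of the bytes classified above: the segment override prefixes are fixed one-byte values defined in the Intel Software Development Manual. The helper below is hypothetical and bypasses the kernel's inat tables entirely; it only shows the raw byte-to-segment mapping.

	#include <stdio.h>

	/* Hypothetical, user-space sketch: raw segment override prefix bytes. */
	static const char *seg_override_name(unsigned char prefix)
	{
		switch (prefix) {
		case 0x2e: return "CS";
		case 0x36: return "SS";
		case 0x3e: return "DS";
		case 0x26: return "ES";
		case 0x64: return "FS";
		case 0x65: return "GS";
		default:   return "none";
		}
	}

	int main(void)
	{
		printf("0x65 -> %s\n", seg_override_name(0x65));	/* GS */
		return 0;
	}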
+
+/**
+ * check_seg_overrides() - check if segment override prefixes are allowed
+ * @insn: Valid instruction with segment override prefixes
+ * @regoff: Operand offset, in pt_regs, for which the check is performed
+ *
+ * For a particular register used in register-indirect addressing, determine if
+ * segment override prefixes can be used. Specifically, no overrides are allowed
+ * for rDI if used with a string instruction.
+ *
+ * Returns:
+ *
+ * True if segment override prefixes can be used with the register indicated
+ * in @regoff. False otherwise.
+ */
+static bool check_seg_overrides(struct insn *insn, int regoff)
+{
+ if (regoff == offsetof(struct pt_regs, di) && is_string_insn(insn))
+ return false;
+
+ return true;
+}
+
+/**
+ * resolve_default_seg() - resolve default segment register index for an operand
+ * @insn: Instruction with opcode and address size. Must be valid.
+ * @regs: Register values as seen when entering kernel mode
+ * @off: Operand offset, in pt_regs, for which resolution is needed
+ *
+ * Resolve the default segment register index associated with the instruction
+ * operand register indicated by @off. Such index is resolved based on defaults
+ * described in the Intel Software Development Manual.
+ *
+ * Returns:
+ *
+ * If in protected mode, a constant identifying the segment register to use,
+ * among CS, SS, ES or DS. If in long mode, INAT_SEG_REG_IGNORE.
+ *
+ * -EINVAL in case of error.
+ */
+static int resolve_default_seg(struct insn *insn, struct pt_regs *regs, int off)
+{
+ if (any_64bit_mode(regs))
+ return INAT_SEG_REG_IGNORE;
+ /*
+ * Resolve the default segment register as described in Section 3.7.4
+ * of the Intel Software Development Manual Vol. 1:
+ *
+ * + DS for all references involving r[ABCD]X, and rSI.
+ * + If used in a string instruction, ES for rDI. Otherwise, DS.
+ * + AX, CX and DX are not valid register operands in 16-bit address
+ * encodings but are valid for 32-bit and 64-bit encodings.
+ * + -EDOM is reserved to identify cases in which no register
+ * is used (i.e., displacement-only addressing). Use DS.
+ * + SS for rSP or rBP.
+ * + CS for rIP.
+ */
+
+ switch (off) {
+ case offsetof(struct pt_regs, ax):
+ case offsetof(struct pt_regs, cx):
+ case offsetof(struct pt_regs, dx):
+ /* Need insn to verify address size. */
+ if (insn->addr_bytes == 2)
+ return -EINVAL;
+
+ fallthrough;
+
+ case -EDOM:
+ case offsetof(struct pt_regs, bx):
+ case offsetof(struct pt_regs, si):
+ return INAT_SEG_REG_DS;
+
+ case offsetof(struct pt_regs, di):
+ if (is_string_insn(insn))
+ return INAT_SEG_REG_ES;
+ return INAT_SEG_REG_DS;
+
+ case offsetof(struct pt_regs, bp):
+ case offsetof(struct pt_regs, sp):
+ return INAT_SEG_REG_SS;
+
+ case offsetof(struct pt_regs, ip):
+ return INAT_SEG_REG_CS;
+
+ default:
+ return -EINVAL;
+ }
+}
+
+/**
+ * resolve_seg_reg() - obtain segment register index
+ * @insn: Instruction with operands
+ * @regs: Register values as seen when entering kernel mode
+ * @regoff: Operand offset, in pt_regs, used to determine segment register
+ *
+ * Determine the segment register associated with the operands and, if
+ * applicable, prefixes and the instruction pointed by @insn.
+ *
+ * The segment register associated to an operand used in register-indirect
+ * addressing depends on:
+ *
+ * a) Whether running in long mode (in such a case segments are ignored, except
+ * if FS or GS are used).
+ *
+ * b) Whether segment override prefixes can be used. Certain instructions and
+ * registers do not allow override prefixes.
+ *
+ * c) Whether segment override prefixes are found in the instruction prefixes.
+ *
+ * d) If there are no segment override prefixes, or they cannot be used, the
+ * default segment register associated with the operand register is used.
+ *
+ * The function first checks whether segment override prefixes can be used with
+ * the operand indicated by @regoff. If allowed, obtain the overridden segment
+ * register index. Lastly, if no prefixes were found or they cannot be used, resolve
+ * the segment register index to use based on the defaults described in the
+ * Intel documentation. In long mode, all segment register indexes will be
+ * ignored, except if overrides were found for FS or GS. All these operations
+ * are done using helper functions.
+ *
+ * The operand register, @regoff, is represented as the offset from the base of
+ * pt_regs.
+ *
+ * As stated, the main use of this function is to determine the segment register
+ * index based on the instruction, its operands and prefixes. Hence, @insn
+ * must be valid. However, if @regoff indicates rIP, there is no need to inspect
+ * @insn at all, as CS is always used in that case. This case is checked
+ * before proceeding further.
+ *
+ * Please note that this function does not return the value in the segment
+ * register (i.e., the segment selector) but our defined index. The segment
+ * selector needs to be obtained using get_segment_selector() and passing the
+ * segment register index resolved by this function.
+ *
+ * Returns:
+ *
+ * An index identifying the segment register to use, among CS, SS, DS,
+ * ES, FS, or GS. INAT_SEG_REG_IGNORE is returned if running in long mode.
+ *
+ * -EINVAL in case of error.
+ */
+static int resolve_seg_reg(struct insn *insn, struct pt_regs *regs, int regoff)
+{
+ int idx;
+
+ /*
+ * In the unlikely event of having to resolve the segment register
+ * index for rIP, do it first. Segment override prefixes should not
+ * be used. Hence, it is not necessary to inspect the instruction,
+ * which may be invalid at this point.
+ */
+ if (regoff == offsetof(struct pt_regs, ip)) {
+ if (any_64bit_mode(regs))
+ return INAT_SEG_REG_IGNORE;
+ else
+ return INAT_SEG_REG_CS;
+ }
+
+ if (!insn)
+ return -EINVAL;
+
+ if (!check_seg_overrides(insn, regoff))
+ return resolve_default_seg(insn, regs, regoff);
+
+ idx = get_seg_reg_override_idx(insn);
+ if (idx < 0)
+ return idx;
+
+ if (idx == INAT_SEG_REG_DEFAULT)
+ return resolve_default_seg(insn, regs, regoff);
+
+ /*
+ * In long mode, segment override prefixes are ignored, except for
+ * overrides for FS and GS.
+ */
+ if (any_64bit_mode(regs)) {
+ if (idx != INAT_SEG_REG_FS &&
+ idx != INAT_SEG_REG_GS)
+ idx = INAT_SEG_REG_IGNORE;
+ }
+
+ return idx;
+}
+
+/**
+ * get_segment_selector() - obtain segment selector
+ * @regs: Register values as seen when entering kernel mode
+ * @seg_reg_idx: Segment register index to use
+ *
+ * Obtain the segment selector from any of the CS, SS, DS, ES, FS, GS segment
+ * registers. In CONFIG_X86_32, the segment is obtained from either pt_regs or
+ * kernel_vm86_regs as applicable. In CONFIG_X86_64, CS and SS are obtained
+ * from pt_regs. DS, ES, FS and GS are obtained by reading the actual CPU
+ * registers. This is done only for completeness, as in CONFIG_X86_64 segment
+ * registers are ignored.
+ *
+ * Returns:
+ *
+ * Value of the segment selector, including null when running in
+ * long mode.
+ *
+ * -EINVAL on error.
+ */
+static short get_segment_selector(struct pt_regs *regs, int seg_reg_idx)
+{
+ unsigned short sel;
+
+#ifdef CONFIG_X86_64
+ switch (seg_reg_idx) {
+ case INAT_SEG_REG_IGNORE:
+ return 0;
+ case INAT_SEG_REG_CS:
+ return (unsigned short)(regs->cs & 0xffff);
+ case INAT_SEG_REG_SS:
+ return (unsigned short)(regs->ss & 0xffff);
+ case INAT_SEG_REG_DS:
+ savesegment(ds, sel);
+ return sel;
+ case INAT_SEG_REG_ES:
+ savesegment(es, sel);
+ return sel;
+ case INAT_SEG_REG_FS:
+ savesegment(fs, sel);
+ return sel;
+ case INAT_SEG_REG_GS:
+ savesegment(gs, sel);
+ return sel;
+ default:
+ return -EINVAL;
+ }
+#else /* CONFIG_X86_32 */
+ struct kernel_vm86_regs *vm86regs = (struct kernel_vm86_regs *)regs;
+
+ if (v8086_mode(regs)) {
+ switch (seg_reg_idx) {
+ case INAT_SEG_REG_CS:
+ return (unsigned short)(regs->cs & 0xffff);
+ case INAT_SEG_REG_SS:
+ return (unsigned short)(regs->ss & 0xffff);
+ case INAT_SEG_REG_DS:
+ return vm86regs->ds;
+ case INAT_SEG_REG_ES:
+ return vm86regs->es;
+ case INAT_SEG_REG_FS:
+ return vm86regs->fs;
+ case INAT_SEG_REG_GS:
+ return vm86regs->gs;
+ case INAT_SEG_REG_IGNORE:
+ default:
+ return -EINVAL;
+ }
+ }
+
+ switch (seg_reg_idx) {
+ case INAT_SEG_REG_CS:
+ return (unsigned short)(regs->cs & 0xffff);
+ case INAT_SEG_REG_SS:
+ return (unsigned short)(regs->ss & 0xffff);
+ case INAT_SEG_REG_DS:
+ return (unsigned short)(regs->ds & 0xffff);
+ case INAT_SEG_REG_ES:
+ return (unsigned short)(regs->es & 0xffff);
+ case INAT_SEG_REG_FS:
+ return (unsigned short)(regs->fs & 0xffff);
+ case INAT_SEG_REG_GS:
+ savesegment(gs, sel);
+ return sel;
+ case INAT_SEG_REG_IGNORE:
+ default:
+ return -EINVAL;
+ }
+#endif /* CONFIG_X86_64 */
+}
+
+static const int pt_regoff[] = {
+ offsetof(struct pt_regs, ax),
+ offsetof(struct pt_regs, cx),
+ offsetof(struct pt_regs, dx),
+ offsetof(struct pt_regs, bx),
+ offsetof(struct pt_regs, sp),
+ offsetof(struct pt_regs, bp),
+ offsetof(struct pt_regs, si),
+ offsetof(struct pt_regs, di),
+#ifdef CONFIG_X86_64
+ offsetof(struct pt_regs, r8),
+ offsetof(struct pt_regs, r9),
+ offsetof(struct pt_regs, r10),
+ offsetof(struct pt_regs, r11),
+ offsetof(struct pt_regs, r12),
+ offsetof(struct pt_regs, r13),
+ offsetof(struct pt_regs, r14),
+ offsetof(struct pt_regs, r15),
+#else
+ offsetof(struct pt_regs, ds),
+ offsetof(struct pt_regs, es),
+ offsetof(struct pt_regs, fs),
+ offsetof(struct pt_regs, gs),
+#endif
+};
+
+int pt_regs_offset(struct pt_regs *regs, int regno)
+{
+ if ((unsigned)regno < ARRAY_SIZE(pt_regoff))
+ return pt_regoff[regno];
+ return -EDOM;
+}
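A minimal usage sketch of the two helpers above, assuming a valid struct pt_regs *regs is in scope; register number 6 corresponds to rSI in pt_regoff[], and regs_get_register() is the generic accessor from <asm/ptrace.h>.

	int off = pt_regs_offset(regs, 6);	/* 6 == rSI per pt_regoff[] */

	if (off >= 0)
		pr_info("rSI at trap time: %#lx\n", regs_get_register(regs, off));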
+
+static int get_regno(struct insn *insn, enum reg_type type)
+{
+ int nr_registers = ARRAY_SIZE(pt_regoff);
+ int regno = 0;
+
+ /*
+ * Don't possibly decode a 32-bit instruction as
+ * reading a 64-bit-only register.
+ */
+ if (IS_ENABLED(CONFIG_X86_64) && !insn->x86_64)
+ nr_registers -= 8;
+
+ switch (type) {
+ case REG_TYPE_RM:
+ regno = X86_MODRM_RM(insn->modrm.value);
+
+ /*
+ * ModRM.mod == 0 and ModRM.rm == 5 means a 32-bit displacement
+ * follows the ModRM byte.
+ */
+ if (!X86_MODRM_MOD(insn->modrm.value) && regno == 5)
+ return -EDOM;
+
+ if (X86_REX_B(insn->rex_prefix.value))
+ regno += 8;
+ break;
+
+ case REG_TYPE_REG:
+ regno = X86_MODRM_REG(insn->modrm.value);
+
+ if (X86_REX_R(insn->rex_prefix.value))
+ regno += 8;
+ break;
+
+ case REG_TYPE_INDEX:
+ regno = X86_SIB_INDEX(insn->sib.value);
+ if (X86_REX_X(insn->rex_prefix.value))
+ regno += 8;
+
+ /*
+ * If ModRM.mod != 3 and SIB.index = 4, the scale*index
+ * portion of the address computation is null. This is
+ * true only if REX.X is 0. If REX.X is 1, the index is
+ * r12 and is used in the address computation.
+ */
+ if (X86_MODRM_MOD(insn->modrm.value) != 3 && regno == 4)
+ return -EDOM;
+ break;
+
+ case REG_TYPE_BASE:
+ regno = X86_SIB_BASE(insn->sib.value);
+ /*
+ * If ModRM.mod is 0 and SIB.base == 5, the base of the
+ * register-indirect addressing is 0. In this case, a
+ * 32-bit displacement follows the SIB byte.
+ */
+ if (!X86_MODRM_MOD(insn->modrm.value) && regno == 5)
+ return -EDOM;
+
+ if (X86_REX_B(insn->rex_prefix.value))
+ regno += 8;
+ break;
+
+ default:
+ pr_err_ratelimited("invalid register type: %d\n", type);
+ return -EINVAL;
+ }
+
+ if (regno >= nr_registers) {
+ WARN_ONCE(1, "decoded an instruction with an invalid register");
+ return -EINVAL;
+ }
+ return regno;
+}
+
+static int get_reg_offset(struct insn *insn, struct pt_regs *regs,
+ enum reg_type type)
+{
+ int regno = get_regno(insn, type);
+
+ if (regno < 0)
+ return regno;
+
+ return pt_regs_offset(regs, regno);
+}
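To make the ModRM/REX arithmetic concrete, here is a standalone sketch decoding one sample byte pair. The masks mirror, but are not, the kernel's X86_MODRM_*()/X86_REX_B() helpers, and the example encoding is chosen for illustration only.

	#include <stdio.h>

	#define MODRM_MOD(b)	(((b) & 0xc0) >> 6)
	#define MODRM_REG(b)	(((b) & 0x38) >> 3)
	#define MODRM_RM(b)	((b) & 0x07)
	#define REX_B(rex)	((rex) & 0x01)

	int main(void)
	{
		unsigned char rex = 0x41, modrm = 0xd1;	/* roughly "41 31 d1": xor %edx,%r9d */
		int regno = MODRM_RM(modrm);		/* rm = 1 */

		if (REX_B(rex))
			regno += 8;			/* REX.B selects r8..r15 -> r9 */

		printf("mod=%d reg=%d regno=%d\n",
		       MODRM_MOD(modrm), MODRM_REG(modrm), regno);	/* mod=3 reg=2 regno=9 */
		return 0;
	}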
+
+/**
+ * get_reg_offset_16() - Obtain offset of register indicated by instruction
+ * @insn: Instruction containing ModRM byte
+ * @regs: Register values as seen when entering kernel mode
+ * @offs1: Offset of the first operand register
+ * @offs2: Offset of the second operand register, if applicable
+ *
+ * Obtain the offset, in pt_regs, of the registers indicated by the ModRM byte
+ * in @insn. This function is to be used with 16-bit address encodings.
+ * @offs1 and @offs2 will be written with the offsets of the two registers
+ * indicated by the instruction. In cases where any of the registers is not
+ * referenced by the instruction, the value will be set to -EDOM.
+ *
+ * Returns:
+ *
+ * 0 on success, -EINVAL on error.
+ */
+static int get_reg_offset_16(struct insn *insn, struct pt_regs *regs,
+ int *offs1, int *offs2)
+{
+ /*
+ * 16-bit addressing can use one or two registers. Specifics of
+ * encodings are given in Table 2-1. "16-Bit Addressing Forms with the
+ * ModR/M Byte" of the Intel Software Development Manual.
+ */
+ static const int regoff1[] = {
+ offsetof(struct pt_regs, bx),
+ offsetof(struct pt_regs, bx),
+ offsetof(struct pt_regs, bp),
+ offsetof(struct pt_regs, bp),
+ offsetof(struct pt_regs, si),
+ offsetof(struct pt_regs, di),
+ offsetof(struct pt_regs, bp),
+ offsetof(struct pt_regs, bx),
+ };
+
+ static const int regoff2[] = {
+ offsetof(struct pt_regs, si),
+ offsetof(struct pt_regs, di),
+ offsetof(struct pt_regs, si),
+ offsetof(struct pt_regs, di),
+ -EDOM,
+ -EDOM,
+ -EDOM,
+ -EDOM,
+ };
+
+ if (!offs1 || !offs2)
+ return -EINVAL;
+
+ /* Operand is a register, use the generic function. */
+ if (X86_MODRM_MOD(insn->modrm.value) == 3) {
+ *offs1 = insn_get_modrm_rm_off(insn, regs);
+ *offs2 = -EDOM;
+ return 0;
+ }
+
+ *offs1 = regoff1[X86_MODRM_RM(insn->modrm.value)];
+ *offs2 = regoff2[X86_MODRM_RM(insn->modrm.value)];
+
+ /*
+ * If ModRM.mod is 0 and ModRM.rm is 110b, then we use displacement-
+ * only addressing. This means that no registers are involved in
+ * computing the effective address. Thus, ensure that the first
+ * register offset is invalid. The second register offset is already
+ * invalid under the aforementioned conditions.
+ */
+ if ((X86_MODRM_MOD(insn->modrm.value) == 0) &&
+ (X86_MODRM_RM(insn->modrm.value) == 6))
+ *offs1 = -EDOM;
+
+ return 0;
+}
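The regoff1[]/regoff2[] tables above encode Table 2-1 of the Intel Software Development Manual; the standalone sketch below spells out the same ModRM.rm-to-register-pair mapping with a hypothetical ModRM byte.

	#include <stdio.h>

	static const char *pair[8] = {
		"BX+SI", "BX+DI", "BP+SI", "BP+DI",
		"SI", "DI", "BP (disp16 only if mod=00)", "BX",
	};

	int main(void)
	{
		unsigned char modrm = 0x42;	/* mod=01, rm=010 -> [BP+SI]+disp8 */

		printf("rm=%d -> %s\n", modrm & 0x07, pair[modrm & 0x07]);
		return 0;
	}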
+
+/**
+ * get_desc() - Obtain contents of a segment descriptor
+ * @out: Segment descriptor contents on success
+ * @sel: Segment selector
+ *
+ * Given a segment selector, copy the contents of the corresponding segment
+ * descriptor into @out. Both global and local descriptor tables are supported.
+ *
+ * Returns:
+ *
+ * True on success, false on failure.
+ */
+static bool get_desc(struct desc_struct *out, unsigned short sel)
+{
+ struct desc_ptr gdt_desc = {0, 0};
+ unsigned long desc_base;
+
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT) {
+ bool success = false;
+ struct ldt_struct *ldt;
+
+ /* Bits [15:3] contain the index of the desired entry. */
+ sel >>= 3;
+
+ /*
+ * If we're not in a valid context with a real (not just lazy)
+ * user mm, then don't even try.
+ */
+ if (!nmi_uaccess_okay())
+ return false;
+
+ mutex_lock(&current->mm->context.lock);
+ ldt = current->mm->context.ldt;
+ if (ldt && sel < ldt->nr_entries) {
+ *out = ldt->entries[sel];
+ success = true;
+ }
+
+ mutex_unlock(&current->mm->context.lock);
+
+ return success;
+ }
+#endif
+ native_store_gdt(&gdt_desc);
+
+ /*
+ * Segment descriptors have a size of 8 bytes. Thus, the index is
+ * multiplied by 8 to obtain the memory offset of the desired descriptor
+ * from the base of the GDT. As bits [15:3] of the segment selector
+ * contain the index, it can be regarded as multiplied by 8 already.
+ * All that remains is to clear bits [2:0].
+ */
+ desc_base = sel & ~(SEGMENT_RPL_MASK | SEGMENT_TI_MASK);
+
+ if (desc_base > gdt_desc.size)
+ return false;
+
+ *out = *(struct desc_struct *)(gdt_desc.address + desc_base);
+ return true;
+}
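A worked example of the selector arithmetic in get_desc(), as a standalone sketch with a hypothetical selector value: bits [1:0] are the RPL, bit 2 the table indicator, and bits [15:3] the index, so clearing the low three bits yields index * 8 directly.

	#include <stdio.h>

	int main(void)
	{
		unsigned short sel = 0x002b;		/* index 5, TI=0 (GDT), RPL=3 */
		unsigned int offset = sel & ~0x7u;	/* byte offset of the 8-byte descriptor */

		printf("index=%d offset=%#x\n", sel >> 3, offset);	/* index=5 offset=0x28 */
		return 0;
	}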
+
+/**
+ * insn_get_seg_base() - Obtain base address of segment descriptor.
+ * @regs: Register values as seen when entering kernel mode
+ * @seg_reg_idx: Index of the segment register pointing to seg descriptor
+ *
+ * Obtain the base address of the segment as indicated by the segment descriptor
+ * pointed to by the segment selector. The segment selector is obtained from the
+ * input segment register index @seg_reg_idx.
+ *
+ * Returns:
+ *
+ * In protected mode, base address of the segment. Zero in long mode,
+ * except when FS or GS are used. In virtual-8086 mode, the segment
+ * selector shifted 4 bits to the left.
+ *
+ * -1L in case of error.
+ */
+unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx)
+{
+ struct desc_struct desc;
+ short sel;
+
+ sel = get_segment_selector(regs, seg_reg_idx);
+ if (sel < 0)
+ return -1L;
+
+ if (v8086_mode(regs))
+ /*
+ * Base is simply the segment selector shifted 4
+ * bits to the left.
+ */
+ return (unsigned long)(sel << 4);
+
+ if (any_64bit_mode(regs)) {
+ /*
+ * Only FS or GS will have a base address, the rest of
+ * the segments' bases are forced to 0.
+ */
+ unsigned long base;
+
+ if (seg_reg_idx == INAT_SEG_REG_FS) {
+ rdmsrq(MSR_FS_BASE, base);
+ } else if (seg_reg_idx == INAT_SEG_REG_GS) {
+ /*
+ * swapgs was called at the kernel entry point. Thus,
+ * MSR_KERNEL_GS_BASE will have the user-space GS base.
+ */
+ if (user_mode(regs))
+ rdmsrq(MSR_KERNEL_GS_BASE, base);
+ else
+ rdmsrq(MSR_GS_BASE, base);
+ } else {
+ base = 0;
+ }
+ return base;
+ }
+
+ /* In protected mode the segment selector cannot be null. */
+ if (!sel)
+ return -1L;
+
+ if (!get_desc(&desc, sel))
+ return -1L;
+
+ return get_desc_base(&desc);
+}
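The virtual-8086 branch computes the base as selector * 16 (a left shift by four); a standalone sketch with a hypothetical selector:

	#include <stdio.h>

	int main(void)
	{
		unsigned short sel = 0xb800;			/* classic VGA text-mode segment */
		unsigned long base = (unsigned long)sel << 4;

		printf("base=%#lx\n", base);			/* 0xb8000 */
		return 0;
	}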
+
+/**
+ * get_seg_limit() - Obtain the limit of a segment descriptor
+ * @regs: Register values as seen when entering kernel mode
+ * @seg_reg_idx: Index of the segment register pointing to seg descriptor
+ *
+ * Obtain the limit of the segment as indicated by the segment descriptor
+ * pointed to by the segment selector. The segment selector is obtained from the
+ * input segment register index @seg_reg_idx.
+ *
+ * Returns:
+ *
+ * In protected mode, the limit of the segment descriptor in bytes.
+ * In long mode and virtual-8086 mode, segment limits are not enforced. Thus,
+ * limit is returned as -1L to imply a limit-less segment.
+ *
+ * Zero is returned on error.
+ */
+static unsigned long get_seg_limit(struct pt_regs *regs, int seg_reg_idx)
+{
+ struct desc_struct desc;
+ unsigned long limit;
+ short sel;
+
+ sel = get_segment_selector(regs, seg_reg_idx);
+ if (sel < 0)
+ return 0;
+
+ if (any_64bit_mode(regs) || v8086_mode(regs))
+ return -1L;
+
+ if (!sel)
+ return 0;
+
+ if (!get_desc(&desc, sel))
+ return 0;
+
+ /*
+ * If the granularity bit is set, the limit is given in multiples
+ * of 4096. This also means that the 12 least significant bits are
+ * not tested when checking the segment limits. In practice,
+ * this means that the segment ends in (limit << 12) + 0xfff.
+ */
+ limit = get_desc_limit(&desc);
+ if (desc.g)
+ limit = (limit << 12) + 0xfff;
+
+ return limit;
+}
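A worked example of the granularity scaling, as a standalone sketch with a hypothetical descriptor: with G=1 the 20-bit limit field counts 4 KiB units, so the byte limit becomes (limit << 12) + 0xfff.

	#include <stdio.h>

	int main(void)
	{
		unsigned long limit = 0xfffff;		/* 20-bit limit field, all ones */
		int g = 1;				/* granularity bit set */

		if (g)
			limit = (limit << 12) + 0xfff;

		printf("limit=%#lx\n", limit);		/* 0xffffffff: a flat 4 GiB segment */
		return 0;
	}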
+
+/**
+ * insn_get_code_seg_params() - Obtain code segment parameters
+ * @regs: Structure with register values as seen when entering kernel mode
+ *
+ * Obtain the default address and operand sizes of the code segment. These are
+ * obtained from the selector contained in the CS register in @regs. In
+ * protected mode, the default sizes are determined by inspecting the L and D
+ * bits of the segment descriptor. In virtual-8086 mode, the default is always
+ * two bytes for both address and operand sizes.
+ *
+ * Returns:
+ *
+ * An int containing ORed-in default parameters on success.
+ *
+ * -EINVAL on error.
+ */
+int insn_get_code_seg_params(struct pt_regs *regs)
+{
+ struct desc_struct desc;
+ short sel;
+
+ if (v8086_mode(regs))
+ /* Address and operand size are both 16-bit. */
+ return INSN_CODE_SEG_PARAMS(2, 2);
+
+ sel = get_segment_selector(regs, INAT_SEG_REG_CS);
+ if (sel < 0)
+ return sel;
+
+ if (!get_desc(&desc, sel))
+ return -EINVAL;
+
+ /*
+ * The most significant bit of the Type field of the segment descriptor
+ * determines whether a segment contains data or code. If this is a data
+ * segment, return error.
+ */
+ if (!(desc.type & BIT(3)))
+ return -EINVAL;
+
+ switch ((desc.l << 1) | desc.d) {
+ case 0: /*
+ * Legacy mode. CS.L=0, CS.D=0. Address and operand size are
+ * both 16-bit.
+ */
+ return INSN_CODE_SEG_PARAMS(2, 2);
+ case 1: /*
+ * Legacy mode. CS.L=0, CS.D=1. Address and operand size are
+ * both 32-bit.
+ */
+ return INSN_CODE_SEG_PARAMS(4, 4);
+ case 2: /*
+ * IA-32e 64-bit mode. CS.L=1, CS.D=0. Address size is 64-bit;
+ * operand size is 32-bit.
+ */
+ return INSN_CODE_SEG_PARAMS(4, 8);
+ case 3: /* Invalid setting. CS.L=1, CS.D=1 */
+ fallthrough;
+ default:
+ return -EINVAL;
+ }
+}
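A minimal usage sketch, assuming a valid struct pt_regs *regs and the INSN_CODE_SEG_OPND_SZ()/INSN_CODE_SEG_ADDR_SZ() accessor macros from <asm/insn-eval.h> that unpack the returned value:

	int seg_defs = insn_get_code_seg_params(regs);

	if (seg_defs >= 0)
		pr_info("operand size: %d, address size: %d\n",
			INSN_CODE_SEG_OPND_SZ(seg_defs),
			INSN_CODE_SEG_ADDR_SZ(seg_defs));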
+
+/**
+ * insn_get_modrm_rm_off() - Obtain register in r/m part of the ModRM byte
+ * @insn: Instruction containing the ModRM byte
+ * @regs: Register values as seen when entering kernel mode
+ *
+ * Returns:
+ *
+ * The register indicated by the r/m part of the ModRM byte. The
+ * register is obtained as an offset from the base of pt_regs. In specific
+ * cases, the returned value can be -EDOM to indicate that the particular value
+ * of ModRM does not refer to a register and shall be ignored.
+ */
+int insn_get_modrm_rm_off(struct insn *insn, struct pt_regs *regs)
+{
+ return get_reg_offset(insn, regs, REG_TYPE_RM);
+}
+
+/**
+ * insn_get_modrm_reg_off() - Obtain register in reg part of the ModRM byte
+ * @insn: Instruction containing the ModRM byte
+ * @regs: Register values as seen when entering kernel mode
+ *
+ * Returns:
+ *
+ * The register indicated by the reg part of the ModRM byte. The
+ * register is obtained as an offset from the base of pt_regs.
+ */
+int insn_get_modrm_reg_off(struct insn *insn, struct pt_regs *regs)
+{
+ return get_reg_offset(insn, regs, REG_TYPE_REG);
+}
+
+/**
+ * insn_get_modrm_reg_ptr() - Obtain register pointer based on ModRM byte
+ * @insn: Instruction containing the ModRM byte
+ * @regs: Register values as seen when entering kernel mode
+ *
+ * Returns:
+ *
+ * The register indicated by the reg part of the ModRM byte.
+ * The register is obtained as a pointer within pt_regs.
+ */
+unsigned long *insn_get_modrm_reg_ptr(struct insn *insn, struct pt_regs *regs)
+{
+ int offset;
+
+ offset = insn_get_modrm_reg_off(insn, regs);
+ if (offset < 0)
+ return NULL;
+ return (void *)regs + offset;
+}
+
+/**
+ * get_seg_base_limit() - obtain base address and limit of a segment
+ * @insn: Instruction. Must be valid.
+ * @regs: Register values as seen when entering kernel mode
+ * @regoff: Operand offset, in pt_regs, used to resolve segment descriptor
+ * @base: Obtained segment base
+ * @limit: Obtained segment limit
+ *
+ * Obtain the base address and limit of the segment associated with the operand
+ * @regoff and, if present and allowed, the override prefixes in @insn. This function is
+ * different from insn_get_seg_base() as the latter does not resolve the segment
+ * associated with the instruction operand. If a limit is not needed (e.g.,
+ * when running in long mode), @limit can be NULL.
+ *
+ * Returns:
+ *
+ * 0 on success. @base and @limit will contain the base address and the limit of the
+ * resolved segment, respectively.
+ *
+ * -EINVAL on error.
+ */
+static int get_seg_base_limit(struct insn *insn, struct pt_regs *regs,
+ int regoff, unsigned long *base,
+ unsigned long *limit)
+{
+ int seg_reg_idx;
+
+ if (!base)
+ return -EINVAL;
+
+ seg_reg_idx = resolve_seg_reg(insn, regs, regoff);
+ if (seg_reg_idx < 0)
+ return seg_reg_idx;
+
+ *base = insn_get_seg_base(regs, seg_reg_idx);
+ if (*base == -1L)
+ return -EINVAL;
+
+ if (!limit)
+ return 0;
+
+ *limit = get_seg_limit(regs, seg_reg_idx);
+ if (!(*limit))
+ return -EINVAL;
+
+ return 0;
+}
+
+/**
+ * get_eff_addr_reg() - Obtain effective address from register operand
+ * @insn: Instruction. Must be valid.
+ * @regs: Register values as seen when entering kernel mode
+ * @regoff: Obtained operand offset, in pt_regs, with the effective address
+ * @eff_addr: Obtained effective address
+ *
+ * Obtain the effective address stored in the register operand as indicated by
+ * the ModRM byte. This function is to be used only with register addressing
+ * (i.e., ModRM.mod is 3). The effective address is saved in @eff_addr. The
+ * register operand, as an offset from the base of pt_regs, is saved in @regoff;
+ * such offset can then be used to resolve the segment associated with the
+ * operand. This function can be used with any of the supported address sizes
+ * in x86.
+ *
+ * Returns:
+ *
+ * 0 on success. @eff_addr will have the effective address stored in the
+ * operand indicated by ModRM. @regoff will have such operand as an offset from
+ * the base of pt_regs.
+ *
+ * -EINVAL on error.
+ */
+static int get_eff_addr_reg(struct insn *insn, struct pt_regs *regs,
+ int *regoff, long *eff_addr)
+{
+ int ret;
+
+ ret = insn_get_modrm(insn);
+ if (ret)
+ return ret;
+
+ if (X86_MODRM_MOD(insn->modrm.value) != 3)
+ return -EINVAL;
+
+ *regoff = get_reg_offset(insn, regs, REG_TYPE_RM);
+ if (*regoff < 0)
+ return -EINVAL;
+
+ /* Ignore bytes that are outside the address size. */
+ if (insn->addr_bytes == 2)
+ *eff_addr = regs_get_register(regs, *regoff) & 0xffff;
+ else if (insn->addr_bytes == 4)
+ *eff_addr = regs_get_register(regs, *regoff) & 0xffffffff;
+ else /* 64-bit address */
+ *eff_addr = regs_get_register(regs, *regoff);
+
+ return 0;
+}
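A standalone sketch of the address-size truncation above, using a hypothetical register value: only the low 2, 4 or 8 bytes participate in the effective address.

	#include <stdio.h>

	int main(void)
	{
		unsigned long long reg = 0x1122334455667788ULL;

		printf("16-bit: %#llx\n", reg & 0xffff);	/* 0x7788 */
		printf("32-bit: %#llx\n", reg & 0xffffffff);	/* 0x55667788 */
		printf("64-bit: %#llx\n", reg);			/* full value */
		return 0;
	}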
+
+/**
+ * get_eff_addr_modrm() - Obtain referenced effective address via ModRM
+ * @insn: Instruction. Must be valid.
+ * @regs: Register values as seen when entering kernel mode
+ * @regoff: Obtained operand offset, in pt_regs, associated with segment
+ * @eff_addr: Obtained effective address
+ *
+ * Obtain the effective address referenced by the ModRM byte of @insn. After
+ * identifying the registers involved in the register-indirect memory reference,
+ * their values are obtained from @regs. The computed address is
+ * stored in @eff_addr. Also, the register operand that indicates the associated
+ * segment is stored in @regoff; this parameter can later be used to determine
+ * such segment.
+ *
+ * Returns:
+ *
+ * 0 on success. @eff_addr will have the referenced effective address. @regoff
+ * will have a register, as an offset from the base of pt_regs, that can be used
+ * to resolve the associated segment.
+ *
+ * -EINVAL on error.
+ */
+static int get_eff_addr_modrm(struct insn *insn, struct pt_regs *regs,
+ int *regoff, long *eff_addr)
+{
+ long tmp;
+ int ret;
+
+ if (insn->addr_bytes != 8 && insn->addr_bytes != 4)
+ return -EINVAL;
+
+ ret = insn_get_modrm(insn);
+ if (ret)
+ return ret;
+
+ if (X86_MODRM_MOD(insn->modrm.value) >